path: root/lib/Target
author    Dimitry Andric <dim@FreeBSD.org>  2018-07-28 10:51:19 +0000
committer Dimitry Andric <dim@FreeBSD.org>  2018-07-28 10:51:19 +0000
commit    eb11fae6d08f479c0799db45860a98af528fa6e7 (patch)
tree      44d492a50c8c1a7eb8e2d17ea3360ec4d066f042 /lib/Target
parent    b8a2042aa938069e862750553db0e4d82d25822c (diff)
download  src-eb11fae6d08f479c0799db45860a98af528fa6e7.tar.gz
          src-eb11fae6d08f479c0799db45860a98af528fa6e7.zip
Vendor import of llvm trunk r338150 (branch: vendor/llvm/llvm-trunk-r338150)
Notes:
    svn path=/vendor/llvm/dist/; revision=336809
    svn path=/vendor/llvm/llvm-trunk-r338150/; revision=336814; tag=vendor/llvm/llvm-trunk-r338150
Diffstat (limited to 'lib/Target')
-rw-r--r--lib/Target/AArch64/AArch64.td86
-rw-r--r--lib/Target/AArch64/AArch64A53Fix835769.cpp25
-rw-r--r--lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp69
-rw-r--r--lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp6
-rw-r--r--lib/Target/AArch64/AArch64AsmPrinter.cpp38
-rw-r--r--lib/Target/AArch64/AArch64CallLowering.cpp21
-rw-r--r--lib/Target/AArch64/AArch64CallingConvention.td19
-rw-r--r--lib/Target/AArch64/AArch64CollectLOH.cpp40
-rw-r--r--lib/Target/AArch64/AArch64CondBrTuning.cpp29
-rw-r--r--lib/Target/AArch64/AArch64ConditionOptimizer.cpp30
-rw-r--r--lib/Target/AArch64/AArch64ConditionalCompares.cpp93
-rw-r--r--lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp25
-rw-r--r--lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp245
-rw-r--r--lib/Target/AArch64/AArch64FalkorHWPFFix.cpp35
-rw-r--r--lib/Target/AArch64/AArch64FastISel.cpp42
-rw-r--r--lib/Target/AArch64/AArch64FrameLowering.cpp315
-rw-r--r--lib/Target/AArch64/AArch64FrameLowering.h2
-rw-r--r--lib/Target/AArch64/AArch64ISelDAGToDAG.cpp162
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.cpp1555
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.h68
-rw-r--r--lib/Target/AArch64/AArch64InstrAtomics.td7
-rw-r--r--lib/Target/AArch64/AArch64InstrFormats.td813
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.cpp745
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.h186
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.td742
-rw-r--r--lib/Target/AArch64/AArch64InstructionSelector.cpp298
-rw-r--r--lib/Target/AArch64/AArch64LegalizerInfo.cpp635
-rw-r--r--lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp222
-rw-r--r--lib/Target/AArch64/AArch64MCInstLower.cpp15
-rw-r--r--lib/Target/AArch64/AArch64MachineFunctionInfo.h31
-rw-r--r--lib/Target/AArch64/AArch64MacroFusion.cpp357
-rw-r--r--lib/Target/AArch64/AArch64PBQPRegAlloc.cpp24
-rw-r--r--lib/Target/AArch64/AArch64PromoteConstant.cpp64
-rw-r--r--lib/Target/AArch64/AArch64RedundantCopyElimination.cpp75
-rw-r--r--lib/Target/AArch64/AArch64RegisterInfo.cpp59
-rw-r--r--lib/Target/AArch64/AArch64RegisterInfo.h9
-rw-r--r--lib/Target/AArch64/AArch64RegisterInfo.td364
-rw-r--r--lib/Target/AArch64/AArch64SIMDInstrOpt.cpp2
-rw-r--r--lib/Target/AArch64/AArch64SVEInstrInfo.td963
-rw-r--r--lib/Target/AArch64/AArch64SchedA53.td12
-rw-r--r--lib/Target/AArch64/AArch64SchedExynosM1.td (renamed from lib/Target/AArch64/AArch64SchedM1.td)50
-rw-r--r--lib/Target/AArch64/AArch64SchedExynosM3.td860
-rw-r--r--lib/Target/AArch64/AArch64SchedFalkor.td3
-rw-r--r--lib/Target/AArch64/AArch64SchedFalkorDetails.td96
-rw-r--r--lib/Target/AArch64/AArch64SchedKryo.td3
-rw-r--r--lib/Target/AArch64/AArch64SchedThunderX.td3
-rw-r--r--lib/Target/AArch64/AArch64SchedThunderX2T99.td72
-rw-r--r--lib/Target/AArch64/AArch64StorePairSuppress.cpp14
-rw-r--r--lib/Target/AArch64/AArch64Subtarget.cpp28
-rw-r--r--lib/Target/AArch64/AArch64Subtarget.h33
-rw-r--r--lib/Target/AArch64/AArch64SystemOperands.td278
-rw-r--r--lib/Target/AArch64/AArch64TargetMachine.cpp24
-rw-r--r--lib/Target/AArch64/AArch64TargetObjectFile.h2
-rw-r--r--lib/Target/AArch64/AArch64TargetTransformInfo.cpp190
-rw-r--r--lib/Target/AArch64/AArch64TargetTransformInfo.h5
-rw-r--r--lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp2120
-rw-r--r--lib/Target/AArch64/CMakeLists.txt20
-rw-r--r--lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp189
-rw-r--r--lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp4
-rw-r--r--lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp232
-rw-r--r--lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h30
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h64
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp84
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp15
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp12
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h2
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp15
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h8
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp40
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp2
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h3
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp44
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h19
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp63
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp29
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp9
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h2
-rw-r--r--lib/Target/AArch64/SVEInstrFormats.td4018
-rw-r--r--lib/Target/AArch64/Utils/AArch64BaseInfo.cpp29
-rw-r--r--lib/Target/AArch64/Utils/AArch64BaseInfo.h37
-rw-r--r--lib/Target/AMDGPU/AMDGPU.h27
-rw-r--r--lib/Target/AMDGPU/AMDGPU.td275
-rw-r--r--lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp9
-rw-r--r--lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp108
-rw-r--r--lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp2
-rw-r--r--lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp3
-rw-r--r--lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h5
-rw-r--r--lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp580
-rw-r--r--lib/Target/AMDGPU/AMDGPUAsmPrinter.h100
-rw-r--r--lib/Target/AMDGPU/AMDGPUCallLowering.cpp122
-rw-r--r--lib/Target/AMDGPU/AMDGPUCallLowering.h5
-rw-r--r--lib/Target/AMDGPU/AMDGPUCallingConv.td40
-rw-r--r--lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp404
-rw-r--r--lib/Target/AMDGPU/AMDGPUFeatures.td60
-rw-r--r--lib/Target/AMDGPU/AMDGPUFrameLowering.h4
-rw-r--r--lib/Target/AMDGPU/AMDGPUGISel.td138
-rw-r--r--lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def76
-rw-r--r--lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp (renamed from lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.cpp)163
-rw-r--r--lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h (renamed from lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.h)17
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp236
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.cpp549
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.h50
-rw-r--r--lib/Target/AMDGPU/AMDGPUInline.cpp6
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstrInfo.cpp97
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstrInfo.h48
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstrInfo.td19
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp315
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstructionSelector.h52
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstructions.td174
-rw-r--r--lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp8
-rw-r--r--lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h4
-rw-r--r--lib/Target/AMDGPU/AMDGPUIntrinsics.td3
-rw-r--r--lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp164
-rw-r--r--lib/Target/AMDGPU/AMDGPULegalizerInfo.h5
-rw-r--r--lib/Target/AMDGPU/AMDGPULibCalls.cpp83
-rw-r--r--lib/Target/AMDGPU/AMDGPULibFunc.h2
-rw-r--r--lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp3
-rw-r--r--lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp264
-rw-r--r--lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp270
-rw-r--r--lib/Target/AMDGPU/AMDGPUMCInstLower.cpp107
-rw-r--r--lib/Target/AMDGPU/AMDGPUMCInstLower.h46
-rw-r--r--lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp395
-rw-r--r--lib/Target/AMDGPU/AMDGPUMachineFunction.cpp24
-rw-r--r--lib/Target/AMDGPU/AMDGPUMachineFunction.h50
-rw-r--r--lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp2
-rw-r--r--lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h12
-rw-r--r--lib/Target/AMDGPU/AMDGPUMacroFusion.cpp3
-rw-r--r--lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp81
-rw-r--r--lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp397
-rw-r--r--lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h55
-rw-r--r--lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp102
-rw-r--r--lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp352
-rw-r--r--lib/Target/AMDGPU/AMDGPURegisterBankInfo.h19
-rw-r--r--lib/Target/AMDGPU/AMDGPURegisterBanks.td2
-rw-r--r--lib/Target/AMDGPU/AMDGPURegisterInfo.cpp16
-rw-r--r--lib/Target/AMDGPU/AMDGPURegisterInfo.h10
-rw-r--r--lib/Target/AMDGPU/AMDGPURegisterInfo.td1
-rw-r--r--lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp11
-rw-r--r--lib/Target/AMDGPU/AMDGPUSearchableTables.td77
-rw-r--r--lib/Target/AMDGPU/AMDGPUSubtarget.cpp220
-rw-r--r--lib/Target/AMDGPU/AMDGPUSubtarget.h794
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetMachine.cpp102
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetMachine.h26
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetObjectFile.h2
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp291
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h82
-rw-r--r--lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp44
-rw-r--r--lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp8
-rw-r--r--lib/Target/AMDGPU/AMDILCFGStructurizer.cpp338
-rw-r--r--lib/Target/AMDGPU/AMDKernelCodeT.h12
-rw-r--r--lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp846
-rw-r--r--lib/Target/AMDGPU/BUFInstructions.td528
-rw-r--r--lib/Target/AMDGPU/CMakeLists.txt46
-rw-r--r--lib/Target/AMDGPU/DSInstructions.td48
-rw-r--r--lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp107
-rw-r--r--lib/Target/AMDGPU/EvergreenInstructions.td3
-rw-r--r--lib/Target/AMDGPU/FLATInstructions.td28
-rw-r--r--lib/Target/AMDGPU/GCNHazardRecognizer.cpp11
-rw-r--r--lib/Target/AMDGPU/GCNHazardRecognizer.h4
-rw-r--r--lib/Target/AMDGPU/GCNILPSched.cpp27
-rw-r--r--lib/Target/AMDGPU/GCNIterativeScheduler.cpp114
-rw-r--r--lib/Target/AMDGPU/GCNMinRegStrategy.cpp51
-rw-r--r--lib/Target/AMDGPU/GCNProcessors.td20
-rw-r--r--lib/Target/AMDGPU/GCNRegPressure.cpp32
-rw-r--r--lib/Target/AMDGPU/GCNRegPressure.h11
-rw-r--r--lib/Target/AMDGPU/GCNSchedStrategy.cpp122
-rw-r--r--lib/Target/AMDGPU/GCNSchedStrategy.h6
-rw-r--r--lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp155
-rw-r--r--lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h15
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp32
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp12
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp37
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h9
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp2
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h2
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp36
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h29
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp288
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h34
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt2
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp43
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp27
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp34
-rw-r--r--lib/Target/AMDGPU/MIMGInstructions.td1028
-rw-r--r--lib/Target/AMDGPU/Processors.td12
-rw-r--r--lib/Target/AMDGPU/R600.td54
-rw-r--r--lib/Target/AMDGPU/R600AsmPrinter.cpp133
-rw-r--r--lib/Target/AMDGPU/R600AsmPrinter.h46
-rw-r--r--lib/Target/AMDGPU/R600ClauseMergePass.cpp33
-rw-r--r--lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp134
-rw-r--r--lib/Target/AMDGPU/R600Defines.h4
-rw-r--r--lib/Target/AMDGPU/R600EmitClauseMarkers.cpp49
-rw-r--r--lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp64
-rw-r--r--lib/Target/AMDGPU/R600ISelLowering.cpp377
-rw-r--r--lib/Target/AMDGPU/R600ISelLowering.h5
-rw-r--r--lib/Target/AMDGPU/R600InstrFormats.td6
-rw-r--r--lib/Target/AMDGPU/R600InstrInfo.cpp443
-rw-r--r--lib/Target/AMDGPU/R600InstrInfo.h34
-rw-r--r--lib/Target/AMDGPU/R600Instructions.td108
-rw-r--r--lib/Target/AMDGPU/R600Intrinsics.td67
-rw-r--r--lib/Target/AMDGPU/R600MachineScheduler.cpp106
-rw-r--r--lib/Target/AMDGPU/R600MachineScheduler.h2
-rw-r--r--lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp (renamed from lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp)14
-rw-r--r--lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp31
-rw-r--r--lib/Target/AMDGPU/R600Packetizer.cpp52
-rw-r--r--lib/Target/AMDGPU/R600Processors.td56
-rw-r--r--lib/Target/AMDGPU/R600RegisterInfo.cpp65
-rw-r--r--lib/Target/AMDGPU/R600RegisterInfo.h15
-rw-r--r--lib/Target/AMDGPU/R600RegisterInfo.td2
-rw-r--r--lib/Target/AMDGPU/SIAnnotateControlFlow.cpp38
-rw-r--r--lib/Target/AMDGPU/SIDebuggerInsertNops.cpp9
-rw-r--r--lib/Target/AMDGPU/SIDefines.h54
-rw-r--r--lib/Target/AMDGPU/SIFixSGPRCopies.cpp114
-rw-r--r--lib/Target/AMDGPU/SIFixVGPRCopies.cpp7
-rw-r--r--lib/Target/AMDGPU/SIFixWWMLiveness.cpp5
-rw-r--r--lib/Target/AMDGPU/SIFoldOperands.cpp72
-rw-r--r--lib/Target/AMDGPU/SIFormMemoryClauses.cpp398
-rw-r--r--lib/Target/AMDGPU/SIFrameLowering.cpp142
-rw-r--r--lib/Target/AMDGPU/SIFrameLowering.h12
-rw-r--r--lib/Target/AMDGPU/SIISelLowering.cpp2200
-rw-r--r--lib/Target/AMDGPU/SIISelLowering.h38
-rw-r--r--lib/Target/AMDGPU/SIInsertSkips.cpp59
-rw-r--r--lib/Target/AMDGPU/SIInsertWaitcnts.cpp499
-rw-r--r--lib/Target/AMDGPU/SIInsertWaits.cpp703
-rw-r--r--lib/Target/AMDGPU/SIInstrFormats.td29
-rw-r--r--lib/Target/AMDGPU/SIInstrInfo.cpp370
-rw-r--r--lib/Target/AMDGPU/SIInstrInfo.h80
-rw-r--r--lib/Target/AMDGPU/SIInstrInfo.td333
-rw-r--r--lib/Target/AMDGPU/SIInstructions.td169
-rw-r--r--lib/Target/AMDGPU/SILoadStoreOptimizer.cpp95
-rw-r--r--lib/Target/AMDGPU/SILowerControlFlow.cpp51
-rw-r--r--lib/Target/AMDGPU/SILowerI1Copies.cpp7
-rw-r--r--lib/Target/AMDGPU/SIMachineFunctionInfo.cpp90
-rw-r--r--lib/Target/AMDGPU/SIMachineFunctionInfo.h144
-rw-r--r--lib/Target/AMDGPU/SIMachineScheduler.cpp162
-rw-r--r--lib/Target/AMDGPU/SIMachineScheduler.h2
-rw-r--r--lib/Target/AMDGPU/SIMemoryLegalizer.cpp1025
-rw-r--r--lib/Target/AMDGPU/SIOptimizeExecMasking.cpp28
-rw-r--r--lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp14
-rw-r--r--lib/Target/AMDGPU/SIPeepholeSDWA.cpp164
-rw-r--r--lib/Target/AMDGPU/SIProgramInfo.h77
-rw-r--r--lib/Target/AMDGPU/SIRegisterInfo.cpp125
-rw-r--r--lib/Target/AMDGPU/SIRegisterInfo.h19
-rw-r--r--lib/Target/AMDGPU/SIRegisterInfo.td28
-rw-r--r--lib/Target/AMDGPU/SISchedule.td2
-rw-r--r--lib/Target/AMDGPU/SIShrinkInstructions.cpp37
-rw-r--r--lib/Target/AMDGPU/SIWholeQuadMode.cpp19
-rw-r--r--lib/Target/AMDGPU/SMInstructions.td271
-rw-r--r--lib/Target/AMDGPU/SOPInstructions.td99
-rw-r--r--lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp6
-rw-r--r--lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp10
-rw-r--r--lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp249
-rw-r--r--lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h111
-rw-r--r--lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp75
-rw-r--r--lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h24
-rw-r--r--lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h1
-rw-r--r--lib/Target/AMDGPU/Utils/CMakeLists.txt1
-rw-r--r--lib/Target/AMDGPU/VOP1Instructions.td74
-rw-r--r--lib/Target/AMDGPU/VOP2Instructions.td80
-rw-r--r--lib/Target/AMDGPU/VOP3Instructions.td57
-rw-r--r--lib/Target/AMDGPU/VOP3PInstructions.td173
-rw-r--r--lib/Target/AMDGPU/VOPCInstructions.td5
-rw-r--r--lib/Target/AMDGPU/VOPInstructions.td64
-rw-r--r--lib/Target/ARC/ARCAsmPrinter.cpp2
-rw-r--r--lib/Target/ARC/ARCBranchFinalize.cpp16
-rw-r--r--lib/Target/ARC/ARCFrameLowering.cpp62
-rw-r--r--lib/Target/ARC/ARCISelLowering.cpp6
-rw-r--r--lib/Target/ARC/ARCISelLowering.h1
-rw-r--r--lib/Target/ARC/ARCInstrFormats.td2
-rw-r--r--lib/Target/ARC/ARCInstrInfo.cpp14
-rw-r--r--lib/Target/ARC/ARCInstrInfo.td58
-rw-r--r--lib/Target/ARC/ARCMCInstLower.cpp2
-rw-r--r--lib/Target/ARC/ARCMCInstLower.h2
-rw-r--r--lib/Target/ARC/ARCMachineFunctionInfo.h5
-rw-r--r--lib/Target/ARC/ARCRegisterInfo.cpp32
-rw-r--r--lib/Target/ARC/CMakeLists.txt13
-rw-r--r--lib/Target/ARC/Disassembler/ARCDisassembler.cpp28
-rw-r--r--lib/Target/ARC/InstPrinter/ARCInstPrinter.cpp7
-rw-r--r--lib/Target/ARC/InstPrinter/ARCInstPrinter.h2
-rw-r--r--lib/Target/ARC/MCTargetDesc/ARCInfo.h2
-rw-r--r--lib/Target/ARM/A15SDOptimizer.cpp22
-rw-r--r--lib/Target/ARM/ARM.h7
-rw-r--r--lib/Target/ARM/ARM.td65
-rw-r--r--lib/Target/ARM/ARMAsmPrinter.cpp50
-rw-r--r--lib/Target/ARM/ARMAsmPrinter.h4
-rw-r--r--lib/Target/ARM/ARMBaseInstrInfo.cpp259
-rw-r--r--lib/Target/ARM/ARMBaseInstrInfo.h5
-rw-r--r--lib/Target/ARM/ARMBaseRegisterInfo.cpp8
-rw-r--r--lib/Target/ARM/ARMBaseRegisterInfo.h3
-rw-r--r--lib/Target/ARM/ARMCallLowering.cpp11
-rw-r--r--lib/Target/ARM/ARMCallingConv.h3
-rw-r--r--lib/Target/ARM/ARMCallingConv.td19
-rw-r--r--lib/Target/ARM/ARMCodeGenPrepare.cpp750
-rw-r--r--lib/Target/ARM/ARMComputeBlockSize.cpp1
-rw-r--r--lib/Target/ARM/ARMConstantIslandPass.cpp134
-rw-r--r--lib/Target/ARM/ARMConstantPoolValue.cpp1
-rw-r--r--lib/Target/ARM/ARMExpandPseudoInsts.cpp252
-rw-r--r--lib/Target/ARM/ARMFastISel.cpp11
-rw-r--r--lib/Target/ARM/ARMFrameLowering.cpp116
-rw-r--r--lib/Target/ARM/ARMFrameLowering.h2
-rw-r--r--lib/Target/ARM/ARMHazardRecognizer.cpp4
-rw-r--r--lib/Target/ARM/ARMISelDAGToDAG.cpp420
-rw-r--r--lib/Target/ARM/ARMISelLowering.cpp871
-rw-r--r--lib/Target/ARM/ARMISelLowering.h20
-rw-r--r--lib/Target/ARM/ARMInstrFormats.td9
-rw-r--r--lib/Target/ARM/ARMInstrInfo.cpp28
-rw-r--r--lib/Target/ARM/ARMInstrInfo.h7
-rw-r--r--lib/Target/ARM/ARMInstrInfo.td62
-rw-r--r--lib/Target/ARM/ARMInstrNEON.td320
-rw-r--r--lib/Target/ARM/ARMInstrThumb.td19
-rw-r--r--lib/Target/ARM/ARMInstrThumb2.td48
-rw-r--r--lib/Target/ARM/ARMInstrVFP.td235
-rw-r--r--lib/Target/ARM/ARMInstructionSelector.cpp128
-rw-r--r--lib/Target/ARM/ARMLegalizerInfo.cpp184
-rw-r--r--lib/Target/ARM/ARMLoadStoreOptimizer.cpp36
-rw-r--r--lib/Target/ARM/ARMMacroFusion.cpp65
-rw-r--r--lib/Target/ARM/ARMParallelDSP.cpp672
-rw-r--r--lib/Target/ARM/ARMRegisterBankInfo.cpp68
-rw-r--r--lib/Target/ARM/ARMRegisterBanks.td2
-rw-r--r--lib/Target/ARM/ARMRegisterInfo.td12
-rw-r--r--lib/Target/ARM/ARMScheduleA57.td41
-rw-r--r--lib/Target/ARM/ARMScheduleA9.td16
-rw-r--r--lib/Target/ARM/ARMScheduleR52.td45
-rw-r--r--lib/Target/ARM/ARMScheduleSwift.td37
-rw-r--r--lib/Target/ARM/ARMSubtarget.cpp2
-rw-r--r--lib/Target/ARM/ARMSubtarget.h41
-rw-r--r--lib/Target/ARM/ARMTargetMachine.cpp49
-rw-r--r--lib/Target/ARM/ARMTargetMachine.h10
-rw-r--r--lib/Target/ARM/ARMTargetObjectFile.cpp3
-rw-r--r--lib/Target/ARM/ARMTargetObjectFile.h5
-rw-r--r--lib/Target/ARM/ARMTargetTransformInfo.cpp30
-rw-r--r--lib/Target/ARM/AsmParser/ARMAsmParser.cpp532
-rw-r--r--lib/Target/ARM/CMakeLists.txt21
-rw-r--r--lib/Target/ARM/Disassembler/ARMDisassembler.cpp30
-rw-r--r--lib/Target/ARM/Disassembler/LLVMBuild.txt2
-rw-r--r--lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp16
-rw-r--r--lib/Target/ARM/InstPrinter/ARMInstPrinter.h2
-rw-r--r--lib/Target/ARM/LLVMBuild.txt2
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp133
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h33
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h11
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h12
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h10
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h18
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp8
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp19
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp2
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp50
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h41
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp10
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp7
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp15
-rw-r--r--lib/Target/ARM/MCTargetDesc/CMakeLists.txt1
-rw-r--r--lib/Target/ARM/MLxExpansionPass.cpp22
-rw-r--r--lib/Target/ARM/README.txt2
-rw-r--r--lib/Target/ARM/Thumb1FrameLowering.cpp19
-rw-r--r--lib/Target/ARM/Thumb1InstrInfo.cpp17
-rw-r--r--lib/Target/ARM/Thumb1InstrInfo.h1
-rw-r--r--lib/Target/ARM/Thumb2ITBlockPass.cpp4
-rw-r--r--lib/Target/ARM/Thumb2InstrInfo.cpp21
-rw-r--r--lib/Target/ARM/Thumb2SizeReduction.cpp14
-rw-r--r--lib/Target/ARM/ThumbRegisterInfo.cpp22
-rw-r--r--lib/Target/AVR/AVR.h2
-rw-r--r--lib/Target/AVR/AVRISelDAGToDAG.cpp5
-rw-r--r--lib/Target/AVR/AVRISelLowering.cpp13
-rw-r--r--lib/Target/AVR/AVRInstrInfo.cpp4
-rw-r--r--lib/Target/AVR/AVRInstrInfo.td18
-rw-r--r--lib/Target/AVR/AVRRegisterInfo.h5
-rw-r--r--lib/Target/AVR/AVRTargetMachine.cpp2
-rw-r--r--lib/Target/AVR/AsmParser/AVRAsmParser.cpp4
-rw-r--r--lib/Target/AVR/CMakeLists.txt8
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp23
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h16
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp6
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp6
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h13
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp6
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h10
-rw-r--r--lib/Target/BPF/AsmParser/BPFAsmParser.cpp8
-rw-r--r--lib/Target/BPF/BPF.h5
-rw-r--r--lib/Target/BPF/BPF.td6
-rw-r--r--lib/Target/BPF/BPFCallingConv.td20
-rw-r--r--lib/Target/BPF/BPFISelDAGToDAG.cpp92
-rw-r--r--lib/Target/BPF/BPFISelLowering.cpp257
-rw-r--r--lib/Target/BPF/BPFISelLowering.h18
-rw-r--r--lib/Target/BPF/BPFInstrInfo.cpp93
-rw-r--r--lib/Target/BPF/BPFInstrInfo.h5
-rw-r--r--lib/Target/BPF/BPFInstrInfo.td174
-rw-r--r--lib/Target/BPF/BPFMIPeephole.cpp284
-rw-r--r--lib/Target/BPF/BPFRegisterInfo.cpp4
-rw-r--r--lib/Target/BPF/BPFRegisterInfo.h2
-rw-r--r--lib/Target/BPF/BPFSelectionDAGInfo.cpp43
-rw-r--r--lib/Target/BPF/BPFSelectionDAGInfo.h36
-rw-r--r--lib/Target/BPF/BPFSubtarget.cpp3
-rw-r--r--lib/Target/BPF/BPFSubtarget.h13
-rw-r--r--lib/Target/BPF/BPFTargetMachine.cpp31
-rw-r--r--lib/Target/BPF/CMakeLists.txt15
-rw-r--r--lib/Target/BPF/Disassembler/BPFDisassembler.cpp45
-rw-r--r--lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp1
-rw-r--r--lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp67
-rw-r--r--lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp8
-rw-r--r--lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h4
-rw-r--r--lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp45
-rw-r--r--lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp4
-rw-r--r--lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h14
-rw-r--r--lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp83
-rw-r--r--lib/Target/Hexagon/BitTracker.cpp24
-rw-r--r--lib/Target/Hexagon/BitTracker.h12
-rw-r--r--lib/Target/Hexagon/CMakeLists.txt7
-rw-r--r--lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp60
-rw-r--r--lib/Target/Hexagon/Hexagon.h2
-rw-r--r--lib/Target/Hexagon/Hexagon.td70
-rw-r--r--lib/Target/Hexagon/HexagonAsmPrinter.cpp97
-rwxr-xr-xlib/Target/Hexagon/HexagonAsmPrinter.h16
-rw-r--r--lib/Target/Hexagon/HexagonBitSimplify.cpp23
-rw-r--r--lib/Target/Hexagon/HexagonBitTracker.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonBlockRanges.cpp16
-rw-r--r--lib/Target/Hexagon/HexagonBranchRelaxation.cpp19
-rw-r--r--lib/Target/Hexagon/HexagonCallingConv.td134
-rw-r--r--lib/Target/Hexagon/HexagonCommonGEP.cpp89
-rw-r--r--lib/Target/Hexagon/HexagonConstExtenders.cpp190
-rw-r--r--lib/Target/Hexagon/HexagonConstPropagation.cpp80
-rw-r--r--lib/Target/Hexagon/HexagonCopyToCombine.cpp14
-rw-r--r--lib/Target/Hexagon/HexagonDepArch.td12
-rw-r--r--lib/Target/Hexagon/HexagonDepIICScalar.td1209
-rw-r--r--lib/Target/Hexagon/HexagonDepInstrInfo.td292
-rw-r--r--lib/Target/Hexagon/HexagonDepMappings.td1
-rw-r--r--lib/Target/Hexagon/HexagonEarlyIfConv.cpp111
-rw-r--r--lib/Target/Hexagon/HexagonExpandCondsets.cpp68
-rw-r--r--lib/Target/Hexagon/HexagonFixupHwLoops.cpp20
-rw-r--r--lib/Target/Hexagon/HexagonFrameLowering.cpp66
-rw-r--r--lib/Target/Hexagon/HexagonGatherPacketize.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonGenInsert.cpp25
-rw-r--r--lib/Target/Hexagon/HexagonGenMux.cpp14
-rw-r--r--lib/Target/Hexagon/HexagonGenPredicate.cpp18
-rw-r--r--lib/Target/Hexagon/HexagonHardwareLoops.cpp74
-rw-r--r--lib/Target/Hexagon/HexagonHazardRecognizer.cpp54
-rw-r--r--lib/Target/Hexagon/HexagonHazardRecognizer.h17
-rw-r--r--lib/Target/Hexagon/HexagonISelDAGToDAG.cpp508
-rw-r--r--lib/Target/Hexagon/HexagonISelDAGToDAG.h12
-rw-r--r--lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp272
-rw-r--r--lib/Target/Hexagon/HexagonISelLowering.cpp1937
-rw-r--r--lib/Target/Hexagon/HexagonISelLowering.h104
-rw-r--r--lib/Target/Hexagon/HexagonISelLoweringHVX.cpp1333
-rw-r--r--lib/Target/Hexagon/HexagonInstrFormatsV60.td2
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfo.cpp387
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfo.h26
-rw-r--r--lib/Target/Hexagon/HexagonIntrinsics.td11
-rw-r--r--lib/Target/Hexagon/HexagonIntrinsicsV5.td2
-rw-r--r--lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp141
-rw-r--r--lib/Target/Hexagon/HexagonMachineScheduler.cpp469
-rw-r--r--lib/Target/Hexagon/HexagonMachineScheduler.h52
-rw-r--r--lib/Target/Hexagon/HexagonMapAsm2IntrinV65.gen.td154
-rw-r--r--lib/Target/Hexagon/HexagonNewValueJump.cpp52
-rw-r--r--lib/Target/Hexagon/HexagonOptAddrMode.cpp254
-rw-r--r--lib/Target/Hexagon/HexagonPatterns.td799
-rw-r--r--lib/Target/Hexagon/HexagonPatternsHVX.td497
-rw-r--r--lib/Target/Hexagon/HexagonPseudo.td96
-rw-r--r--lib/Target/Hexagon/HexagonRegisterInfo.cpp65
-rw-r--r--lib/Target/Hexagon/HexagonRegisterInfo.h10
-rw-r--r--lib/Target/Hexagon/HexagonRegisterInfo.td143
-rw-r--r--lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp15
-rw-r--r--lib/Target/Hexagon/HexagonSplitDouble.cpp120
-rw-r--r--lib/Target/Hexagon/HexagonStoreWidening.cpp22
-rw-r--r--lib/Target/Hexagon/HexagonSubtarget.cpp139
-rw-r--r--lib/Target/Hexagon/HexagonSubtarget.h64
-rw-r--r--lib/Target/Hexagon/HexagonTargetMachine.cpp23
-rw-r--r--lib/Target/Hexagon/HexagonTargetObjectFile.cpp26
-rw-r--r--lib/Target/Hexagon/HexagonTargetTransformInfo.cpp252
-rw-r--r--lib/Target/Hexagon/HexagonTargetTransformInfo.h82
-rw-r--r--lib/Target/Hexagon/HexagonVExtract.cpp166
-rw-r--r--lib/Target/Hexagon/HexagonVLIWPacketizer.cpp85
-rw-r--r--lib/Target/Hexagon/HexagonVLIWPacketizer.h2
-rw-r--r--lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp40
-rw-r--r--lib/Target/Hexagon/HexagonVectorPrint.cpp17
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp79
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h2
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp8
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp19
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp1133
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h45
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp28
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp38
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp31
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h6
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp33
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h6
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp13
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp13
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h9
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp6
-rw-r--r--lib/Target/Hexagon/RDFCopy.cpp5
-rw-r--r--lib/Target/Hexagon/RDFDeadCode.cpp2
-rw-r--r--lib/Target/Hexagon/RDFGraph.cpp4
-rw-r--r--lib/Target/Hexagon/RDFLiveness.cpp10
-rw-r--r--lib/Target/Hexagon/RDFLiveness.h4
-rw-r--r--lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp2
-rw-r--r--lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp8
-rw-r--r--lib/Target/Lanai/CMakeLists.txt7
-rw-r--r--lib/Target/Lanai/LanaiDelaySlotFiller.cpp2
-rw-r--r--lib/Target/Lanai/LanaiISelDAGToDAG.cpp7
-rw-r--r--lib/Target/Lanai/LanaiISelLowering.cpp22
-rw-r--r--lib/Target/Lanai/LanaiISelLowering.h1
-rw-r--r--lib/Target/Lanai/LanaiInstrFormats.td2
-rw-r--r--lib/Target/Lanai/LanaiInstrInfo.cpp6
-rw-r--r--lib/Target/Lanai/LanaiInstrInfo.h1
-rw-r--r--lib/Target/Lanai/LanaiInstrInfo.td4
-rw-r--r--lib/Target/Lanai/LanaiMemAluCombiner.cpp2
-rw-r--r--lib/Target/Lanai/LanaiTargetObjectFile.cpp12
-rw-r--r--lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp31
-rw-r--r--lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp7
-rw-r--r--lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp1
-rw-r--r--lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp6
-rw-r--r--lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h9
-rw-r--r--lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp1
-rw-r--r--lib/Target/MSP430/CMakeLists.txt9
-rw-r--r--lib/Target/MSP430/MSP430BranchSelector.cpp12
-rw-r--r--lib/Target/MSP430/MSP430ISelDAGToDAG.cpp12
-rw-r--r--lib/Target/MSP430/MSP430InstrInfo.cpp4
-rw-r--r--lib/Target/Mips/AsmParser/MipsAsmParser.cpp814
-rw-r--r--lib/Target/Mips/CMakeLists.txt32
-rw-r--r--lib/Target/Mips/Disassembler/MipsDisassembler.cpp115
-rw-r--r--lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp5
-rw-r--r--lib/Target/Mips/InstPrinter/MipsInstPrinter.h1
-rw-r--r--lib/Target/Mips/LLVMBuild.txt1
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h6
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp2
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp128
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h19
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp61
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp16
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h10
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h10
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp14
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp37
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h4
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h2
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp11
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h10
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp12
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp66
-rw-r--r--lib/Target/Mips/MicroMips32r6InstrFormats.td164
-rw-r--r--lib/Target/Mips/MicroMips32r6InstrInfo.td392
-rw-r--r--lib/Target/Mips/MicroMipsDSPInstrFormats.td8
-rw-r--r--lib/Target/Mips/MicroMipsDSPInstrInfo.td13
-rw-r--r--lib/Target/Mips/MicroMipsInstrFPU.td350
-rw-r--r--lib/Target/Mips/MicroMipsInstrFormats.td75
-rw-r--r--lib/Target/Mips/MicroMipsInstrInfo.td947
-rw-r--r--lib/Target/Mips/MicroMipsSizeReduction.cpp297
-rw-r--r--lib/Target/Mips/Mips.h19
-rw-r--r--lib/Target/Mips/Mips.td19
-rw-r--r--lib/Target/Mips/Mips16FrameLowering.cpp8
-rw-r--r--lib/Target/Mips/Mips16HardFloat.cpp6
-rw-r--r--lib/Target/Mips/Mips16ISelDAGToDAG.cpp35
-rw-r--r--lib/Target/Mips/Mips16InstrInfo.cpp11
-rw-r--r--lib/Target/Mips/Mips16InstrInfo.h3
-rw-r--r--lib/Target/Mips/Mips16InstrInfo.td25
-rw-r--r--lib/Target/Mips/Mips16RegisterInfo.cpp4
-rw-r--r--lib/Target/Mips/Mips32r6InstrFormats.td30
-rw-r--r--lib/Target/Mips/Mips32r6InstrInfo.td101
-rw-r--r--lib/Target/Mips/Mips64InstrInfo.td458
-rw-r--r--lib/Target/Mips/Mips64r6InstrInfo.td57
-rw-r--r--lib/Target/Mips/MipsAsmPrinter.cpp31
-rw-r--r--lib/Target/Mips/MipsBranchExpansion.cpp (renamed from lib/Target/Mips/MipsLongBranch.cpp)483
-rw-r--r--lib/Target/Mips/MipsCallLowering.cpp441
-rw-r--r--lib/Target/Mips/MipsCallLowering.h86
-rw-r--r--lib/Target/Mips/MipsCondMov.td287
-rw-r--r--lib/Target/Mips/MipsConstantIslandPass.cpp99
-rw-r--r--lib/Target/Mips/MipsDSPInstrFormats.td14
-rw-r--r--lib/Target/Mips/MipsDSPInstrInfo.td6
-rw-r--r--lib/Target/Mips/MipsDelaySlotFiller.cpp76
-rw-r--r--lib/Target/Mips/MipsEVAInstrFormats.td2
-rw-r--r--lib/Target/Mips/MipsEVAInstrInfo.td79
-rw-r--r--lib/Target/Mips/MipsExpandPseudo.cpp702
-rw-r--r--lib/Target/Mips/MipsFastISel.cpp85
-rw-r--r--lib/Target/Mips/MipsFrameLowering.h4
-rw-r--r--lib/Target/Mips/MipsHazardSchedule.cpp163
-rw-r--r--lib/Target/Mips/MipsISelDAGToDAG.cpp13
-rw-r--r--lib/Target/Mips/MipsISelDAGToDAG.h28
-rw-r--r--lib/Target/Mips/MipsISelLowering.cpp730
-rw-r--r--lib/Target/Mips/MipsISelLowering.h38
-rw-r--r--lib/Target/Mips/MipsInstrFPU.td323
-rw-r--r--lib/Target/Mips/MipsInstrFormats.td35
-rw-r--r--lib/Target/Mips/MipsInstrInfo.cpp186
-rw-r--r--lib/Target/Mips/MipsInstrInfo.h4
-rw-r--r--lib/Target/Mips/MipsInstrInfo.td1101
-rw-r--r--lib/Target/Mips/MipsInstructionSelector.cpp184
-rw-r--r--lib/Target/Mips/MipsLegalizerInfo.cpp41
-rw-r--r--lib/Target/Mips/MipsLegalizerInfo.h29
-rw-r--r--lib/Target/Mips/MipsMCInstLower.cpp81
-rw-r--r--lib/Target/Mips/MipsMCInstLower.h4
-rw-r--r--lib/Target/Mips/MipsMSAInstrFormats.td5
-rw-r--r--lib/Target/Mips/MipsMSAInstrInfo.td136
-rw-r--r--lib/Target/Mips/MipsMTInstrFormats.td3
-rw-r--r--lib/Target/Mips/MipsMachineFunction.cpp38
-rw-r--r--lib/Target/Mips/MipsModuleISelDAGToDAG.cpp4
-rw-r--r--lib/Target/Mips/MipsOptimizePICCall.cpp12
-rw-r--r--lib/Target/Mips/MipsOs16.cpp16
-rw-r--r--lib/Target/Mips/MipsRegisterBankInfo.cpp100
-rw-r--r--lib/Target/Mips/MipsRegisterBankInfo.h43
-rw-r--r--lib/Target/Mips/MipsRegisterBanks.td (renamed from lib/Target/Hexagon/HexagonDepDecoders.h)8
-rw-r--r--lib/Target/Mips/MipsRegisterInfo.cpp16
-rw-r--r--lib/Target/Mips/MipsRegisterInfo.h4
-rw-r--r--lib/Target/Mips/MipsSEFrameLowering.cpp17
-rw-r--r--lib/Target/Mips/MipsSEFrameLowering.h1
-rw-r--r--lib/Target/Mips/MipsSEISelDAGToDAG.cpp8
-rw-r--r--lib/Target/Mips/MipsSEISelDAGToDAG.h28
-rw-r--r--lib/Target/Mips/MipsSEISelLowering.cpp174
-rw-r--r--lib/Target/Mips/MipsSEISelLowering.h34
-rw-r--r--lib/Target/Mips/MipsSEInstrInfo.cpp148
-rw-r--r--lib/Target/Mips/MipsSEInstrInfo.h9
-rw-r--r--lib/Target/Mips/MipsSERegisterInfo.cpp7
-rw-r--r--lib/Target/Mips/MipsSchedule.td48
-rw-r--r--lib/Target/Mips/MipsScheduleGeneric.td9
-rw-r--r--lib/Target/Mips/MipsScheduleP5600.td178
-rw-r--r--lib/Target/Mips/MipsSubtarget.cpp102
-rw-r--r--lib/Target/Mips/MipsSubtarget.h51
-rw-r--r--lib/Target/Mips/MipsTargetMachine.cpp69
-rw-r--r--lib/Target/Mips/MipsTargetMachine.h2
-rw-r--r--lib/Target/Mips/MipsTargetObjectFile.cpp7
-rw-r--r--lib/Target/Mips/MipsTargetStreamer.h24
-rw-r--r--lib/Target/NVPTX/CMakeLists.txt8
-rw-r--r--lib/Target/NVPTX/MCTargetDesc/CMakeLists.txt1
-rw-r--r--lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp26
-rw-r--r--lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h11
-rw-r--r--lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp12
-rw-r--r--lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp94
-rw-r--r--lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h46
-rw-r--r--lib/Target/NVPTX/NVPTX.td16
-rw-r--r--lib/Target/NVPTX/NVPTXAsmPrinter.cpp275
-rw-r--r--lib/Target/NVPTX/NVPTXAsmPrinter.h52
-rw-r--r--lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp4
-rw-r--r--lib/Target/NVPTX/NVPTXFrameLowering.cpp8
-rw-r--r--lib/Target/NVPTX/NVPTXFrameLowering.h2
-rw-r--r--lib/Target/NVPTX/NVPTXGenericToNVVM.cpp60
-rw-r--r--lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp711
-rw-r--r--lib/Target/NVPTX/NVPTXISelDAGToDAG.h5
-rw-r--r--lib/Target/NVPTX/NVPTXISelLowering.cpp234
-rw-r--r--lib/Target/NVPTX/NVPTXInstrInfo.cpp45
-rw-r--r--lib/Target/NVPTX/NVPTXInstrInfo.h8
-rw-r--r--lib/Target/NVPTX/NVPTXInstrInfo.td32
-rw-r--r--lib/Target/NVPTX/NVPTXIntrinsics.td788
-rw-r--r--lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp12
-rw-r--r--lib/Target/NVPTX/NVPTXSection.h45
-rw-r--r--lib/Target/NVPTX/NVPTXSubtarget.h28
-rw-r--r--lib/Target/NVPTX/NVPTXTargetMachine.cpp34
-rw-r--r--lib/Target/NVPTX/NVPTXTargetMachine.h3
-rw-r--r--lib/Target/NVPTX/NVPTXTargetObjectFile.h61
-rw-r--r--lib/Target/NVPTX/NVPTXTargetTransformInfo.h20
-rw-r--r--lib/Target/NVPTX/NVVMReflect.cpp2
-rw-r--r--lib/Target/Nios2/CMakeLists.txt9
-rw-r--r--lib/Target/Nios2/MCTargetDesc/Nios2AsmBackend.cpp15
-rw-r--r--lib/Target/Nios2/MCTargetDesc/Nios2AsmBackend.h8
-rw-r--r--lib/Target/Nios2/MCTargetDesc/Nios2ELFObjectWriter.cpp7
-rw-r--r--lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.h10
-rw-r--r--lib/Target/Nios2/MCTargetDesc/Nios2TargetStreamer.cpp2
-rw-r--r--lib/Target/Nios2/Nios2ISelDAGToDAG.cpp5
-rw-r--r--lib/Target/Nios2/Nios2ISelLowering.cpp29
-rw-r--r--lib/Target/Nios2/Nios2InstrFormats.td66
-rw-r--r--lib/Target/Nios2/Nios2InstrInfo.cpp11
-rw-r--r--lib/Target/Nios2/Nios2InstrInfo.h4
-rw-r--r--lib/Target/Nios2/Nios2InstrInfo.td39
-rw-r--r--lib/Target/Nios2/Nios2TargetObjectFile.h2
-rw-r--r--lib/Target/Nios2/Nios2TargetStreamer.h2
-rw-r--r--lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp30
-rw-r--r--lib/Target/PowerPC/CMakeLists.txt13
-rw-r--r--lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp73
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp52
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp30
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp15
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp12
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h2
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h19
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp10
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h3
-rw-r--r--lib/Target/PowerPC/P9InstrResources.td962
-rw-r--r--lib/Target/PowerPC/PPC.td40
-rw-r--r--lib/Target/PowerPC/PPCAsmPrinter.cpp127
-rw-r--r--lib/Target/PowerPC/PPCBranchCoalescing.cpp121
-rw-r--r--lib/Target/PowerPC/PPCCTRLoops.cpp86
-rw-r--r--lib/Target/PowerPC/PPCCallingConv.td114
-rw-r--r--lib/Target/PowerPC/PPCEarlyReturn.cpp2
-rw-r--r--lib/Target/PowerPC/PPCExpandISEL.cpp48
-rw-r--r--lib/Target/PowerPC/PPCFastISel.cpp148
-rw-r--r--lib/Target/PowerPC/PPCFrameLowering.cpp54
-rw-r--r--lib/Target/PowerPC/PPCFrameLowering.h4
-rw-r--r--lib/Target/PowerPC/PPCHazardRecognizers.cpp12
-rw-r--r--lib/Target/PowerPC/PPCISelDAGToDAG.cpp558
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.cpp667
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.h37
-rw-r--r--lib/Target/PowerPC/PPCInstr64Bit.td298
-rw-r--r--lib/Target/PowerPC/PPCInstrAltivec.td56
-rw-r--r--lib/Target/PowerPC/PPCInstrFormats.td72
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.cpp746
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.h38
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.td398
-rw-r--r--lib/Target/PowerPC/PPCInstrQPX.td38
-rw-r--r--lib/Target/PowerPC/PPCInstrSPE.td1209
-rw-r--r--lib/Target/PowerPC/PPCInstrVSX.td809
-rw-r--r--lib/Target/PowerPC/PPCLoopPreIncPrep.cpp27
-rw-r--r--lib/Target/PowerPC/PPCMCInstLower.cpp14
-rw-r--r--lib/Target/PowerPC/PPCMIPeephole.cpp166
-rw-r--r--lib/Target/PowerPC/PPCMachineBasicBlockUtils.h198
-rw-r--r--lib/Target/PowerPC/PPCMachineFunctionInfo.h15
-rw-r--r--lib/Target/PowerPC/PPCPreEmitPeephole.cpp10
-rw-r--r--lib/Target/PowerPC/PPCReduceCRLogicals.cpp217
-rw-r--r--lib/Target/PowerPC/PPCRegisterInfo.cpp54
-rw-r--r--lib/Target/PowerPC/PPCRegisterInfo.h2
-rw-r--r--lib/Target/PowerPC/PPCRegisterInfo.td37
-rw-r--r--lib/Target/PowerPC/PPCSchedule.td3
-rw-r--r--lib/Target/PowerPC/PPCScheduleE500.td274
-rw-r--r--lib/Target/PowerPC/PPCScheduleE500mc.td440
-rw-r--r--lib/Target/PowerPC/PPCScheduleP9.td251
-rw-r--r--lib/Target/PowerPC/PPCSubtarget.cpp47
-rw-r--r--lib/Target/PowerPC/PPCSubtarget.h5
-rw-r--r--lib/Target/PowerPC/PPCTLSDynamicCall.cpp4
-rw-r--r--lib/Target/PowerPC/PPCTargetMachine.cpp12
-rw-r--r--lib/Target/PowerPC/PPCTargetObjectFile.h4
-rw-r--r--lib/Target/PowerPC/PPCTargetTransformInfo.cpp13
-rw-r--r--lib/Target/PowerPC/PPCTargetTransformInfo.h2
-rw-r--r--lib/Target/PowerPC/PPCVSXFMAMutate.cpp12
-rw-r--r--lib/Target/PowerPC/PPCVSXSwapRemoval.cpp144
-rw-r--r--lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp318
-rw-r--r--lib/Target/RISCV/CMakeLists.txt13
-rw-r--r--lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp18
-rw-r--r--lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp20
-rw-r--r--lib/Target/RISCV/InstPrinter/RISCVInstPrinter.h19
-rw-r--r--lib/Target/RISCV/MCTargetDesc/CMakeLists.txt2
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp220
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp42
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp42
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h27
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h12
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp2
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp87
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp28
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h3
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp22
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h10
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp32
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h37
-rw-r--r--lib/Target/RISCV/RISCV.h4
-rw-r--r--lib/Target/RISCV/RISCV.td10
-rw-r--r--lib/Target/RISCV/RISCVAsmPrinter.cpp66
-rw-r--r--lib/Target/RISCV/RISCVCallingConv.td37
-rw-r--r--lib/Target/RISCV/RISCVFrameLowering.cpp178
-rw-r--r--lib/Target/RISCV/RISCVFrameLowering.h8
-rw-r--r--lib/Target/RISCV/RISCVISelDAGToDAG.cpp162
-rw-r--r--lib/Target/RISCV/RISCVISelLowering.cpp882
-rw-r--r--lib/Target/RISCV/RISCVISelLowering.h42
-rw-r--r--lib/Target/RISCV/RISCVInstrFormats.td4
-rw-r--r--lib/Target/RISCV/RISCVInstrInfo.cpp392
-rw-r--r--lib/Target/RISCV/RISCVInstrInfo.h38
-rw-r--r--lib/Target/RISCV/RISCVInstrInfo.td195
-rw-r--r--lib/Target/RISCV/RISCVInstrInfoA.td20
-rw-r--r--lib/Target/RISCV/RISCVInstrInfoC.td384
-rw-r--r--lib/Target/RISCV/RISCVInstrInfoD.td115
-rw-r--r--lib/Target/RISCV/RISCVInstrInfoF.td93
-rw-r--r--lib/Target/RISCV/RISCVInstrInfoM.td15
-rw-r--r--lib/Target/RISCV/RISCVMCInstLower.cpp11
-rw-r--r--lib/Target/RISCV/RISCVMachineFunctionInfo.h55
-rw-r--r--lib/Target/RISCV/RISCVMergeBaseOffset.cpp286
-rw-r--r--lib/Target/RISCV/RISCVRegisterInfo.cpp51
-rw-r--r--lib/Target/RISCV/RISCVRegisterInfo.h14
-rw-r--r--lib/Target/RISCV/RISCVRegisterInfo.td24
-rw-r--r--lib/Target/RISCV/RISCVSubtarget.h2
-rw-r--r--lib/Target/RISCV/RISCVTargetMachine.cpp17
-rw-r--r--lib/Target/RISCV/RISCVTargetObjectFile.cpp19
-rw-r--r--lib/Target/RISCV/RISCVTargetObjectFile.h25
-rw-r--r--lib/Target/Sparc/AsmParser/SparcAsmParser.cpp60
-rw-r--r--lib/Target/Sparc/CMakeLists.txt19
-rw-r--r--lib/Target/Sparc/DelaySlotFiller.cpp4
-rwxr-xr-xlib/Target/Sparc/LeonFeatures.td8
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp40
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp10
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h6
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp11
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp21
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h2
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h11
-rw-r--r--lib/Target/Sparc/Sparc.td3
-rw-r--r--lib/Target/Sparc/SparcFrameLowering.cpp29
-rw-r--r--lib/Target/Sparc/SparcISelDAGToDAG.cpp8
-rw-r--r--lib/Target/Sparc/SparcISelLowering.cpp48
-rw-r--r--lib/Target/Sparc/SparcInstrAliases.td13
-rw-r--r--lib/Target/Sparc/SparcInstrInfo.cpp2
-rw-r--r--lib/Target/Sparc/SparcInstrInfo.td8
-rw-r--r--lib/Target/Sparc/SparcRegisterInfo.h2
-rw-r--r--lib/Target/Sparc/SparcSubtarget.cpp1
-rw-r--r--lib/Target/Sparc/SparcSubtarget.h2
-rw-r--r--lib/Target/SystemZ/CMakeLists.txt5
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp30
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp7
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h7
-rw-r--r--lib/Target/SystemZ/SystemZ.h16
-rw-r--r--lib/Target/SystemZ/SystemZ.td1
-rw-r--r--lib/Target/SystemZ/SystemZAsmPrinter.cpp129
-rw-r--r--lib/Target/SystemZ/SystemZAsmPrinter.h17
-rw-r--r--lib/Target/SystemZ/SystemZCallingConv.td9
-rw-r--r--lib/Target/SystemZ/SystemZElimCompare.cpp164
-rw-r--r--lib/Target/SystemZ/SystemZExpandPseudo.cpp6
-rw-r--r--lib/Target/SystemZ/SystemZFeatures.td1
-rw-r--r--lib/Target/SystemZ/SystemZFrameLowering.cpp92
-rw-r--r--lib/Target/SystemZ/SystemZFrameLowering.h5
-rw-r--r--lib/Target/SystemZ/SystemZHazardRecognizer.cpp144
-rw-r--r--lib/Target/SystemZ/SystemZHazardRecognizer.h13
-rw-r--r--lib/Target/SystemZ/SystemZISelDAGToDAG.cpp437
-rw-r--r--lib/Target/SystemZ/SystemZISelLowering.cpp1149
-rw-r--r--lib/Target/SystemZ/SystemZISelLowering.h48
-rw-r--r--lib/Target/SystemZ/SystemZInstrFP.td6
-rw-r--r--lib/Target/SystemZ/SystemZInstrFormats.td207
-rw-r--r--lib/Target/SystemZ/SystemZInstrInfo.cpp51
-rw-r--r--lib/Target/SystemZ/SystemZInstrInfo.td190
-rw-r--r--lib/Target/SystemZ/SystemZLongBranch.cpp4
-rw-r--r--lib/Target/SystemZ/SystemZMachineScheduler.cpp53
-rw-r--r--lib/Target/SystemZ/SystemZMachineScheduler.h11
-rw-r--r--lib/Target/SystemZ/SystemZOperands.td54
-rw-r--r--lib/Target/SystemZ/SystemZOperators.td268
-rw-r--r--lib/Target/SystemZ/SystemZRegisterInfo.cpp16
-rw-r--r--lib/Target/SystemZ/SystemZRegisterInfo.h8
-rw-r--r--lib/Target/SystemZ/SystemZRegisterInfo.td8
-rw-r--r--lib/Target/SystemZ/SystemZSchedule.td92
-rw-r--r--lib/Target/SystemZ/SystemZScheduleZ13.td1698
-rw-r--r--lib/Target/SystemZ/SystemZScheduleZ14.td1847
-rw-r--r--lib/Target/SystemZ/SystemZScheduleZ196.td1284
-rw-r--r--lib/Target/SystemZ/SystemZScheduleZEC12.td1322
-rw-r--r--lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp35
-rw-r--r--lib/Target/SystemZ/SystemZTargetMachine.cpp2
-rw-r--r--lib/Target/SystemZ/SystemZTargetTransformInfo.cpp2
-rw-r--r--lib/Target/TargetLoweringObjectFile.cpp52
-rw-r--r--lib/Target/TargetMachine.cpp47
-rw-r--r--lib/Target/TargetMachineC.cpp24
-rw-r--r--lib/Target/WebAssembly/AsmParser/CMakeLists.txt3
-rw-r--r--lib/Target/WebAssembly/AsmParser/LLVMBuild.txt23
-rw-r--r--lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp561
-rw-r--r--lib/Target/WebAssembly/CMakeLists.txt7
-rw-r--r--lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp128
-rw-r--r--lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp31
-rw-r--r--lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h4
-rw-r--r--lib/Target/WebAssembly/LLVMBuild.txt3
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/CMakeLists.txt1
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp113
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp68
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp39
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h9
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp14
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp13
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h168
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp105
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h31
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp39
-rw-r--r--lib/Target/WebAssembly/README.txt54
-rw-r--r--lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp2
-rw-r--r--lib/Target/WebAssembly/WebAssembly.h30
-rw-r--r--lib/Target/WebAssembly/WebAssembly.td25
-rw-r--r--lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp144
-rw-r--r--lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp7
-rw-r--r--lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp65
-rw-r--r--lib/Target/WebAssembly/WebAssemblyAsmPrinter.h1
-rw-r--r--lib/Target/WebAssembly/WebAssemblyCFGSort.cpp13
-rw-r--r--lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp46
-rw-r--r--lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp15
-rw-r--r--lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp197
-rw-r--r--lib/Target/WebAssembly/WebAssemblyExceptionInfo.h170
-rw-r--r--lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp40
-rw-r--r--lib/Target/WebAssembly/WebAssemblyFastISel.cpp42
-rw-r--r--lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp5
-rw-r--r--lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp20
-rw-r--r--lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp49
-rw-r--r--lib/Target/WebAssembly/WebAssemblyFrameLowering.h2
-rw-r--r--lib/Target/WebAssembly/WebAssemblyISD.def2
-rw-r--r--lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp18
-rw-r--r--lib/Target/WebAssembly/WebAssemblyISelLowering.cpp39
-rw-r--r--lib/Target/WebAssembly/WebAssemblyISelLowering.h6
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrAtomics.td561
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrCall.td120
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrControl.td175
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrConv.td315
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td31
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrFloat.td16
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrFormats.td197
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp7
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrInfo.h2
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrInfo.td126
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrInteger.td28
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrMemory.td433
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrSIMD.td2
-rw-r--r--lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp383
-rw-r--r--lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp11
-rw-r--r--lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp57
-rw-r--r--lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp5
-rw-r--r--lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp87
-rw-r--r--lib/Target/WebAssembly/WebAssemblyMCInstLower.h2
-rw-r--r--lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp2
-rw-r--r--lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h28
-rw-r--r--lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp13
-rw-r--r--lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp6
-rw-r--r--lib/Target/WebAssembly/WebAssemblyPeephole.cpp26
-rw-r--r--lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp7
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegColoring.cpp43
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp22
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegStackify.cpp45
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp2
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegisterInfo.h4
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegisterInfo.td16
-rw-r--r--lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp8
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp1409
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h2
-rw-r--r--lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp2
-rw-r--r--lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h2
-rw-r--r--lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp57
-rw-r--r--lib/Target/WebAssembly/WebAssemblyStoreResults.cpp13
-rw-r--r--lib/Target/WebAssembly/WebAssemblySubtarget.cpp8
-rw-r--r--lib/Target/WebAssembly/WebAssemblySubtarget.h6
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp81
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetMachine.h2
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp9
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h8
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp2
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h2
-rw-r--r--lib/Target/WebAssembly/WebAssemblyUtilities.cpp159
-rw-r--r--lib/Target/WebAssembly/WebAssemblyUtilities.h47
-rw-r--r--lib/Target/WebAssembly/known_gcc_test_failures.txt54
-rw-r--r--lib/Target/X86/AsmParser/X86AsmParser.cpp486
-rw-r--r--lib/Target/X86/AsmParser/X86Operand.h88
-rw-r--r--lib/Target/X86/CMakeLists.txt25
-rw-r--r--lib/Target/X86/Disassembler/X86Disassembler.cpp42
-rw-r--r--lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp97
-rw-r--r--lib/Target/X86/Disassembler/X86DisassemblerDecoder.h29
-rw-r--r--lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h466
-rw-r--r--lib/Target/X86/InstPrinter/CMakeLists.txt1
-rw-r--r--lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp134
-rw-r--r--lib/Target/X86/InstPrinter/X86ATTInstPrinter.h17
-rw-r--r--lib/Target/X86/InstPrinter/X86InstComments.cpp684
-rw-r--r--lib/Target/X86/InstPrinter/X86InstComments.h10
-rw-r--r--lib/Target/X86/InstPrinter/X86InstPrinterCommon.cpp142
-rw-r--r--lib/Target/X86/InstPrinter/X86InstPrinterCommon.h38
-rw-r--r--lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp131
-rw-r--r--lib/Target/X86/InstPrinter/X86IntelInstPrinter.h15
-rw-r--r--lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp210
-rw-r--r--lib/Target/X86/MCTargetDesc/X86BaseInfo.h110
-rw-r--r--lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp11
-rw-r--r--lib/Target/X86/MCTargetDesc/X86FixupKinds.h1
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp72
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCExpr.h75
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp304
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h30
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp11
-rw-r--r--lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp8
-rw-r--r--lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp10
-rw-r--r--lib/Target/X86/README-MMX.txt71
-rw-r--r--lib/Target/X86/README-SSE.txt9
-rw-r--r--lib/Target/X86/README-UNIMPLEMENTED.txt14
-rw-r--r--lib/Target/X86/README.txt24
-rw-r--r--lib/Target/X86/ShadowCallStack.cpp326
-rw-r--r--lib/Target/X86/Utils/X86ShuffleDecode.cpp217
-rw-r--r--lib/Target/X86/Utils/X86ShuffleDecode.h74
-rw-r--r--lib/Target/X86/X86.h21
-rw-r--r--lib/Target/X86/X86.td249
-rw-r--r--lib/Target/X86/X86AsmPrinter.cpp159
-rw-r--r--lib/Target/X86/X86AsmPrinter.h6
-rw-r--r--lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp732
-rw-r--r--lib/Target/X86/X86CallFrameOptimization.cpp2
-rw-r--r--lib/Target/X86/X86CallLowering.cpp45
-rw-r--r--lib/Target/X86/X86CallingConv.td40
-rw-r--r--lib/Target/X86/X86CmovConversion.cpp20
-rw-r--r--lib/Target/X86/X86DomainReassignment.cpp87
-rwxr-xr-xlib/Target/X86/X86EvexToVex.cpp117
-rw-r--r--lib/Target/X86/X86ExpandPseudo.cpp109
-rw-r--r--lib/Target/X86/X86FastISel.cpp136
-rw-r--r--lib/Target/X86/X86FixupBWInsts.cpp164
-rw-r--r--lib/Target/X86/X86FixupLEAs.cpp77
-rw-r--r--lib/Target/X86/X86FlagsCopyLowering.cpp1052
-rw-r--r--lib/Target/X86/X86FloatingPoint.cpp75
-rw-r--r--lib/Target/X86/X86FrameLowering.cpp239
-rw-r--r--lib/Target/X86/X86FrameLowering.h6
-rw-r--r--lib/Target/X86/X86ISelDAGToDAG.cpp975
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp8626
-rw-r--r--lib/Target/X86/X86ISelLowering.h137
-rw-r--r--lib/Target/X86/X86IndirectBranchTracking.cpp121
-rw-r--r--lib/Target/X86/X86Instr3DNow.td141
-rw-r--r--lib/Target/X86/X86InstrAVX512.td7618
-rw-r--r--lib/Target/X86/X86InstrArithmetic.td547
-rw-r--r--lib/Target/X86/X86InstrCMovSetCC.td68
-rw-r--r--lib/Target/X86/X86InstrCompiler.td484
-rw-r--r--lib/Target/X86/X86InstrControl.td287
-rw-r--r--lib/Target/X86/X86InstrExtension.td100
-rw-r--r--lib/Target/X86/X86InstrFMA.td398
-rw-r--r--lib/Target/X86/X86InstrFMA3Info.cpp406
-rw-r--r--lib/Target/X86/X86InstrFMA3Info.h302
-rw-r--r--lib/Target/X86/X86InstrFPStack.td258
-rw-r--r--lib/Target/X86/X86InstrFoldTables.cpp5412
-rw-r--r--lib/Target/X86/X86InstrFoldTables.h85
-rw-r--r--lib/Target/X86/X86InstrFormats.td533
-rw-r--r--lib/Target/X86/X86InstrFragmentsSIMD.td111
-rw-r--r--lib/Target/X86/X86InstrInfo.cpp5129
-rw-r--r--lib/Target/X86/X86InstrInfo.h117
-rw-r--r--lib/Target/X86/X86InstrInfo.td1474
-rw-r--r--lib/Target/X86/X86InstrMMX.td534
-rw-r--r--lib/Target/X86/X86InstrMPX.td78
-rw-r--r--lib/Target/X86/X86InstrSGX.td6
-rw-r--r--lib/Target/X86/X86InstrSSE.td5244
-rw-r--r--lib/Target/X86/X86InstrSVM.td36
-rw-r--r--lib/Target/X86/X86InstrShiftRotate.td589
-rw-r--r--lib/Target/X86/X86InstrSystem.td603
-rw-r--r--lib/Target/X86/X86InstrVMX.td44
-rw-r--r--lib/Target/X86/X86InstrVecCompiler.td283
-rw-r--r--lib/Target/X86/X86InstrXOP.td309
-rw-r--r--lib/Target/X86/X86InstructionSelector.cpp336
-rw-r--r--lib/Target/X86/X86InterleavedAccess.cpp22
-rw-r--r--lib/Target/X86/X86IntrinsicsInfo.h837
-rw-r--r--lib/Target/X86/X86LegalizerInfo.cpp52
-rw-r--r--lib/Target/X86/X86MCInstLower.cpp732
-rw-r--r--lib/Target/X86/X86MachineFunctionInfo.h4
-rw-r--r--lib/Target/X86/X86MacroFusion.cpp3
-rw-r--r--lib/Target/X86/X86OptimizeLEAs.cpp31
-rw-r--r--lib/Target/X86/X86PadShortFunction.cpp26
-rw-r--r--lib/Target/X86/X86PfmCounters.td77
-rw-r--r--lib/Target/X86/X86RegisterBankInfo.cpp36
-rw-r--r--lib/Target/X86/X86RegisterInfo.cpp9
-rw-r--r--lib/Target/X86/X86RegisterInfo.td161
-rw-r--r--lib/Target/X86/X86RetpolineThunks.cpp274
-rwxr-xr-xlib/Target/X86/X86SchedBroadwell.td3501
-rw-r--r--lib/Target/X86/X86SchedHaswell.td3828
-rw-r--r--lib/Target/X86/X86SchedPredicates.td49
-rw-r--r--lib/Target/X86/X86SchedSandyBridge.td2748
-rw-r--r--lib/Target/X86/X86SchedSkylakeClient.td3625
-rwxr-xr-xlib/Target/X86/X86SchedSkylakeServer.td6250
-rw-r--r--lib/Target/X86/X86Schedule.td1066
-rw-r--r--lib/Target/X86/X86ScheduleAtom.td1419
-rw-r--r--lib/Target/X86/X86ScheduleBtVer2.td1057
-rw-r--r--lib/Target/X86/X86ScheduleSLM.td461
-rw-r--r--lib/Target/X86/X86ScheduleZnver1.td1222
-rw-r--r--lib/Target/X86/X86SpeculativeLoadHardening.cpp2247
-rw-r--r--lib/Target/X86/X86Subtarget.cpp163
-rw-r--r--lib/Target/X86/X86Subtarget.h311
-rw-r--r--lib/Target/X86/X86TargetMachine.cpp117
-rw-r--r--lib/Target/X86/X86TargetObjectFile.cpp97
-rw-r--r--lib/Target/X86/X86TargetObjectFile.h23
-rw-r--r--lib/Target/X86/X86TargetTransformInfo.cpp313
-rw-r--r--lib/Target/X86/X86TargetTransformInfo.h1
-rw-r--r--lib/Target/X86/X86VZeroUpper.cpp8
-rw-r--r--lib/Target/X86/X86WinAllocaExpander.cpp26
-rw-r--r--lib/Target/X86/X86WinEHState.cpp12
-rw-r--r--lib/Target/XCore/CMakeLists.txt11
-rw-r--r--lib/Target/XCore/Disassembler/XCoreDisassembler.cpp4
-rw-r--r--lib/Target/XCore/InstPrinter/XCoreInstPrinter.h2
-rw-r--r--lib/Target/XCore/XCoreAsmPrinter.cpp2
-rw-r--r--lib/Target/XCore/XCoreFrameLowering.cpp6
-rw-r--r--lib/Target/XCore/XCoreISelLowering.cpp4
-rw-r--r--lib/Target/XCore/XCoreInstrInfo.cpp6
-rw-r--r--lib/Target/XCore/XCoreLowerThreadLocal.cpp4
-rw-r--r--lib/Target/XCore/XCoreMCInstLower.cpp2
-rw-r--r--lib/Target/XCore/XCoreMCInstLower.h2
-rw-r--r--lib/Target/XCore/XCoreRegisterInfo.cpp18
-rw-r--r--lib/Target/XCore/XCoreRegisterInfo.h2
1056 files changed, 110976 insertions, 73165 deletions
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index 75fb937de9bf..a69d38144c78 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -26,8 +26,32 @@ def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true",
def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
"Enable Advanced SIMD instructions", [FeatureFPARMv8]>;
+def FeatureSM4 : SubtargetFeature<
+ "sm4", "HasSM4", "true",
+ "Enable SM3 and SM4 support", [FeatureNEON]>;
+
+def FeatureSHA2 : SubtargetFeature<
+ "sha2", "HasSHA2", "true",
+ "Enable SHA1 and SHA256 support", [FeatureNEON]>;
+
+def FeatureSHA3 : SubtargetFeature<
+ "sha3", "HasSHA3", "true",
+ "Enable SHA512 and SHA3 support", [FeatureNEON, FeatureSHA2]>;
+
+def FeatureAES : SubtargetFeature<
+ "aes", "HasAES", "true",
+ "Enable AES support", [FeatureNEON]>;
+
+// Crypto has been split up and any combination is now valid (see the
+// crypto definitions above). Also, crypto is now context-sensitive:
+// it has a different meaning for e.g. Armv8.4 than it has for Armv8.2.
+// Therefore, we rely on Clang, the user-interfacing tool, to pass on the
+// appropriate crypto options. But here in the backend, crypto has very little
+// meaning anymore. We kept the Crypto definition here for backward
+// compatibility; it now implies features SHA2 and AES, which was the
+// "traditional" meaning of Crypto.
def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
- "Enable cryptographic instructions", [FeatureNEON]>;
+ "Enable cryptographic instructions", [FeatureNEON, FeatureSHA2, FeatureAES]>;
def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
"Enable ARMv8 CRC-32 checksum instructions">;
@@ -76,6 +100,10 @@ def FeatureReserveX18 : SubtargetFeature<"reserve-x18", "ReserveX18", "true",
"Reserve X18, making it unavailable "
"as a GPR">;
+def FeatureReserveX20 : SubtargetFeature<"reserve-x20", "ReserveX20", "true",
+ "Reserve X20, making it unavailable "
+ "as a GPR">;
+
def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
"Use alias analysis during codegen">;
@@ -91,6 +119,11 @@ def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move",
"CustomAsCheapAsMove", "true",
"Use custom code for TargetInstrInfo::isAsCheapAsAMove()">;
+def FeatureExynosCheapAsMoveHandling : SubtargetFeature<"exynos-cheap-as-move",
+ "ExynosAsCheapAsMove", "true",
+ "Use Exynos specific code in TargetInstrInfo::isAsCheapAsAMove()",
+ [FeatureCustomCheapAsMoveHandling]>;
+
def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
"UsePostRAScheduler", "true", "Schedule again after register allocation">;
@@ -115,10 +148,18 @@ def FeatureArithmeticCbzFusion : SubtargetFeature<
"arith-cbz-fusion", "HasArithmeticCbzFusion", "true",
"CPU fuses arithmetic + cbz/cbnz operations">;
+def FeatureFuseAddress : SubtargetFeature<
+ "fuse-address", "HasFuseAddress", "true",
+ "CPU fuses address generation and memory operations">;
+
def FeatureFuseAES : SubtargetFeature<
"fuse-aes", "HasFuseAES", "true",
"CPU fuses AES crypto operations">;
+def FeatureFuseCCSelect : SubtargetFeature<
+ "fuse-csel", "HasFuseCCSelect", "true",
+ "CPU fuses conditional select operations">;
+
def FeatureFuseLiterals : SubtargetFeature<
"fuse-literals", "HasFuseLiterals", "true",
"CPU fuses literal generation operations">;
@@ -149,6 +190,12 @@ def FeatureLSLFast : SubtargetFeature<
"lsl-fast", "HasLSLFast", "true",
"CPU has a fastpath logical shift of up to 3 places">;
+def FeatureAggressiveFMA :
+ SubtargetFeature<"aggressive-fma",
+ "HasAggressiveFMA",
+ "true",
+ "Enable Aggressive FMA for floating-point.">;
+
//===----------------------------------------------------------------------===//
// Architectures.
//
@@ -162,6 +209,9 @@ def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
def HasV8_3aOps : SubtargetFeature<"v8.3a", "HasV8_3aOps", "true",
"Support ARM v8.3a instructions", [HasV8_2aOps, FeatureRCPC]>;
+def HasV8_4aOps : SubtargetFeature<"v8.4a", "HasV8_4aOps", "true",
+ "Support ARM v8.4a instructions", [HasV8_3aOps, FeatureDotProd]>;
+
//===----------------------------------------------------------------------===//
// Register File Description
//===----------------------------------------------------------------------===//
@@ -193,7 +243,8 @@ include "AArch64SchedA57.td"
include "AArch64SchedCyclone.td"
include "AArch64SchedFalkor.td"
include "AArch64SchedKryo.td"
-include "AArch64SchedM1.td"
+include "AArch64SchedExynosM1.td"
+include "AArch64SchedExynosM3.td"
include "AArch64SchedThunderX.td"
include "AArch64SchedThunderX2T99.td"
@@ -294,7 +345,6 @@ def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
FeatureFuseAES,
FeatureNEON,
FeaturePerfMon,
- FeatureSlowMisaligned128Store,
FeatureZCRegMove,
FeatureZCZeroing,
FeatureZCZeroingFPWorkaround
@@ -305,7 +355,7 @@ def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
[FeatureSlowPaired128,
FeatureCRC,
FeatureCrypto,
- FeatureCustomCheapAsMoveHandling,
+ FeatureExynosCheapAsMoveHandling,
FeatureFPARMv8,
FeatureFuseAES,
FeatureNEON,
@@ -316,11 +366,11 @@ def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
FeatureZCZeroing]>;
def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1",
- "Samsung Exynos-M2/M3 processors",
+ "Samsung Exynos-M2 processors",
[FeatureSlowPaired128,
FeatureCRC,
FeatureCrypto,
- FeatureCustomCheapAsMoveHandling,
+ FeatureExynosCheapAsMoveHandling,
FeatureFPARMv8,
FeatureFuseAES,
FeatureNEON,
@@ -329,6 +379,23 @@ def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1",
FeatureSlowMisaligned128Store,
FeatureZCZeroing]>;
+def ProcExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
+ "Samsung Exynos-M3 processors",
+ [FeatureCRC,
+ FeatureCrypto,
+ FeatureExynosCheapAsMoveHandling,
+ FeatureFPARMv8,
+ FeatureFuseAddress,
+ FeatureFuseAES,
+ FeatureFuseCCSelect,
+ FeatureFuseLiterals,
+ FeatureLSLFast,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive,
+ FeatureZCZeroing]>;
+
def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
"Qualcomm Kryo processors", [
FeatureCRC,
@@ -376,6 +443,7 @@ def ProcSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
def ProcThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily",
"ThunderX2T99",
"Cavium ThunderX2 processors", [
+ FeatureAggressiveFMA,
FeatureCRC,
FeatureCrypto,
FeatureFPARMv8,
@@ -449,7 +517,8 @@ def : ProcessorModel<"cortex-a75", CortexA57Model, [ProcA75]>;
def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
def : ProcessorModel<"exynos-m1", ExynosM1Model, [ProcExynosM1]>;
def : ProcessorModel<"exynos-m2", ExynosM1Model, [ProcExynosM2]>;
-def : ProcessorModel<"exynos-m3", ExynosM1Model, [ProcExynosM2]>;
+def : ProcessorModel<"exynos-m3", ExynosM3Model, [ProcExynosM3]>;
+def : ProcessorModel<"exynos-m4", ExynosM3Model, [ProcExynosM3]>;
def : ProcessorModel<"falkor", FalkorModel, [ProcFalkor]>;
def : ProcessorModel<"saphira", FalkorModel, [ProcSaphira]>;
def : ProcessorModel<"kryo", KryoModel, [ProcKryo]>;
@@ -469,12 +538,14 @@ def GenericAsmParserVariant : AsmParserVariant {
int Variant = 0;
string Name = "generic";
string BreakCharacters = ".";
+ string TokenizingCharacters = "[]*!/";
}
def AppleAsmParserVariant : AsmParserVariant {
int Variant = 1;
string Name = "apple-neon";
string BreakCharacters = ".";
+ string TokenizingCharacters = "[]*!/";
}
//===----------------------------------------------------------------------===//
@@ -504,4 +575,5 @@ def AArch64 : Target {
let InstructionSet = AArch64InstrInfo;
let AssemblyParserVariants = [GenericAsmParserVariant, AppleAsmParserVariant];
let AssemblyWriters = [GenericAsmWriter, AppleAsmWriter];
+ let AllowRegisterRenaming = 1;
}
diff --git a/lib/Target/AArch64/AArch64A53Fix835769.cpp b/lib/Target/AArch64/AArch64A53Fix835769.cpp
index 7de5d0ef66b1..30232afaf024 100644
--- a/lib/Target/AArch64/AArch64A53Fix835769.cpp
+++ b/lib/Target/AArch64/AArch64A53Fix835769.cpp
@@ -116,7 +116,7 @@ INITIALIZE_PASS(AArch64A53Fix835769, "aarch64-fix-cortex-a53-835769-pass",
bool
AArch64A53Fix835769::runOnMachineFunction(MachineFunction &F) {
- DEBUG(dbgs() << "***** AArch64A53Fix835769 *****\n");
+ LLVM_DEBUG(dbgs() << "***** AArch64A53Fix835769 *****\n");
bool Changed = false;
TII = F.getSubtarget().getInstrInfo();
@@ -190,7 +190,8 @@ static void insertNopBeforeInstruction(MachineBasicBlock &MBB, MachineInstr* MI,
bool
AArch64A53Fix835769::runOnBasicBlock(MachineBasicBlock &MBB) {
bool Changed = false;
- DEBUG(dbgs() << "Running on MBB: " << MBB << " - scanning instructions...\n");
+ LLVM_DEBUG(dbgs() << "Running on MBB: " << MBB
+ << " - scanning instructions...\n");
// First, scan the basic block, looking for a sequence of 2 instructions
// that match the conditions under which the erratum may trigger.
@@ -206,17 +207,17 @@ AArch64A53Fix835769::runOnBasicBlock(MachineBasicBlock &MBB) {
for (auto &MI : MBB) {
MachineInstr *CurrInstr = &MI;
- DEBUG(dbgs() << " Examining: " << MI);
+ LLVM_DEBUG(dbgs() << " Examining: " << MI);
if (PrevInstr) {
- DEBUG(dbgs() << " PrevInstr: " << *PrevInstr
- << " CurrInstr: " << *CurrInstr
- << " isFirstInstructionInSequence(PrevInstr): "
- << isFirstInstructionInSequence(PrevInstr) << "\n"
- << " isSecondInstructionInSequence(CurrInstr): "
- << isSecondInstructionInSequence(CurrInstr) << "\n");
+ LLVM_DEBUG(dbgs() << " PrevInstr: " << *PrevInstr
+ << " CurrInstr: " << *CurrInstr
+ << " isFirstInstructionInSequence(PrevInstr): "
+ << isFirstInstructionInSequence(PrevInstr) << "\n"
+ << " isSecondInstructionInSequence(CurrInstr): "
+ << isSecondInstructionInSequence(CurrInstr) << "\n");
if (isFirstInstructionInSequence(PrevInstr) &&
isSecondInstructionInSequence(CurrInstr)) {
- DEBUG(dbgs() << " ** pattern found at Idx " << Idx << "!\n");
+ LLVM_DEBUG(dbgs() << " ** pattern found at Idx " << Idx << "!\n");
Sequences.push_back(CurrInstr);
}
}
@@ -225,8 +226,8 @@ AArch64A53Fix835769::runOnBasicBlock(MachineBasicBlock &MBB) {
++Idx;
}
- DEBUG(dbgs() << "Scan complete, " << Sequences.size()
- << " occurrences of pattern found.\n");
+ LLVM_DEBUG(dbgs() << "Scan complete, " << Sequences.size()
+ << " occurrences of pattern found.\n");
// Then update the basic block, inserting nops between the detected sequences.
for (auto &MI : Sequences) {
diff --git a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
index 38a7e331bb97..a95476b91187 100644
--- a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
+++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
@@ -315,7 +315,7 @@ bool AArch64A57FPLoadBalancing::runOnMachineFunction(MachineFunction &F) {
return false;
bool Changed = false;
- DEBUG(dbgs() << "***** AArch64A57FPLoadBalancing *****\n");
+ LLVM_DEBUG(dbgs() << "***** AArch64A57FPLoadBalancing *****\n");
MRI = &F.getRegInfo();
TRI = F.getRegInfo().getTargetRegisterInfo();
@@ -330,7 +330,8 @@ bool AArch64A57FPLoadBalancing::runOnMachineFunction(MachineFunction &F) {
bool AArch64A57FPLoadBalancing::runOnBasicBlock(MachineBasicBlock &MBB) {
bool Changed = false;
- DEBUG(dbgs() << "Running on MBB: " << MBB << " - scanning instructions...\n");
+ LLVM_DEBUG(dbgs() << "Running on MBB: " << MBB
+ << " - scanning instructions...\n");
// First, scan the basic block producing a set of chains.
@@ -343,7 +344,8 @@ bool AArch64A57FPLoadBalancing::runOnBasicBlock(MachineBasicBlock &MBB) {
for (auto &MI : MBB)
scanInstruction(&MI, Idx++, ActiveChains, AllChains);
- DEBUG(dbgs() << "Scan complete, "<< AllChains.size() << " chains created.\n");
+ LLVM_DEBUG(dbgs() << "Scan complete, " << AllChains.size()
+ << " chains created.\n");
// Group the chains into disjoint sets based on their liveness range. This is
// a poor-man's version of graph coloring. Ideally we'd create an interference
@@ -360,7 +362,7 @@ bool AArch64A57FPLoadBalancing::runOnBasicBlock(MachineBasicBlock &MBB) {
for (auto &J : AllChains)
if (I != J && I->rangeOverlapsWith(*J))
EC.unionSets(I.get(), J.get());
- DEBUG(dbgs() << "Created " << EC.getNumClasses() << " disjoint sets.\n");
+ LLVM_DEBUG(dbgs() << "Created " << EC.getNumClasses() << " disjoint sets.\n");
// Now we assume that every member of an equivalence class interferes
// with every other member of that class, and with no members of other classes.
@@ -375,9 +377,9 @@ bool AArch64A57FPLoadBalancing::runOnBasicBlock(MachineBasicBlock &MBB) {
// Now we have a set of sets, order them by start address so
// we can iterate over them sequentially.
- std::sort(V.begin(), V.end(),
- [](const std::vector<Chain*> &A,
- const std::vector<Chain*> &B) {
+ llvm::sort(V.begin(), V.end(),
+ [](const std::vector<Chain*> &A,
+ const std::vector<Chain*> &B) {
return A.front()->startsBefore(B.front());
});
@@ -440,7 +442,7 @@ bool AArch64A57FPLoadBalancing::colorChainSet(std::vector<Chain*> GV,
MachineBasicBlock &MBB,
int &Parity) {
bool Changed = false;
- DEBUG(dbgs() << "colorChainSet(): #sets=" << GV.size() << "\n");
+ LLVM_DEBUG(dbgs() << "colorChainSet(): #sets=" << GV.size() << "\n");
// Sort by descending size order so that we allocate the most important
// sets first.
@@ -451,7 +453,7 @@ bool AArch64A57FPLoadBalancing::colorChainSet(std::vector<Chain*> GV,
// change them to!
// Final tie-break with instruction order so pass output is stable (i.e. not
// dependent on malloc'd pointer values).
- std::sort(GV.begin(), GV.end(), [](const Chain *G1, const Chain *G2) {
+ llvm::sort(GV.begin(), GV.end(), [](const Chain *G1, const Chain *G2) {
if (G1->size() != G2->size())
return G1->size() > G2->size();
if (G1->requiresFixup() != G2->requiresFixup())
@@ -470,16 +472,18 @@ bool AArch64A57FPLoadBalancing::colorChainSet(std::vector<Chain*> GV,
// But if we really don't care, use the chain's preferred color.
C = G->getPreferredColor();
- DEBUG(dbgs() << " - Parity=" << Parity << ", Color="
- << ColorNames[(int)C] << "\n");
+ LLVM_DEBUG(dbgs() << " - Parity=" << Parity
+ << ", Color=" << ColorNames[(int)C] << "\n");
// If we'll need a fixup FMOV, don't bother. Testing has shown that this
// happens infrequently and when it does it has at least a 50% chance of
// slowing code down instead of speeding it up.
if (G->requiresFixup() && C != G->getPreferredColor()) {
C = G->getPreferredColor();
- DEBUG(dbgs() << " - " << G->str() << " - not worthwhile changing; "
- "color remains " << ColorNames[(int)C] << "\n");
+ LLVM_DEBUG(dbgs() << " - " << G->str()
+ << " - not worthwhile changing; "
+ "color remains "
+ << ColorNames[(int)C] << "\n");
}
Changed |= colorChain(G, C, MBB);
@@ -528,17 +532,17 @@ int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C,
bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C,
MachineBasicBlock &MBB) {
bool Changed = false;
- DEBUG(dbgs() << " - colorChain(" << G->str() << ", "
- << ColorNames[(int)C] << ")\n");
+ LLVM_DEBUG(dbgs() << " - colorChain(" << G->str() << ", "
+ << ColorNames[(int)C] << ")\n");
// Try and obtain a free register of the right class. Without a register
// to play with we cannot continue.
int Reg = scavengeRegister(G, C, MBB);
if (Reg == -1) {
- DEBUG(dbgs() << "Scavenging (thus coloring) failed!\n");
+ LLVM_DEBUG(dbgs() << "Scavenging (thus coloring) failed!\n");
return false;
}
- DEBUG(dbgs() << " - Scavenged register: " << printReg(Reg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << " - Scavenged register: " << printReg(Reg, TRI) << "\n");
std::map<unsigned, unsigned> Substs;
for (MachineInstr &I : *G) {
@@ -586,11 +590,11 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C,
assert(Substs.size() == 0 && "No substitutions should be left active!");
if (G->getKill()) {
- DEBUG(dbgs() << " - Kill instruction seen.\n");
+ LLVM_DEBUG(dbgs() << " - Kill instruction seen.\n");
} else {
// We didn't have a kill instruction, but we didn't seem to need to change
// the destination register anyway.
- DEBUG(dbgs() << " - Destination register not changed.\n");
+ LLVM_DEBUG(dbgs() << " - Destination register not changed.\n");
}
return Changed;
}
@@ -611,8 +615,8 @@ void AArch64A57FPLoadBalancing::scanInstruction(
// unit.
unsigned DestReg = MI->getOperand(0).getReg();
- DEBUG(dbgs() << "New chain started for register " << printReg(DestReg, TRI)
- << " at " << *MI);
+ LLVM_DEBUG(dbgs() << "New chain started for register "
+ << printReg(DestReg, TRI) << " at " << *MI);
auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg));
ActiveChains[DestReg] = G.get();
@@ -631,8 +635,8 @@ void AArch64A57FPLoadBalancing::scanInstruction(
maybeKillChain(MI->getOperand(0), Idx, ActiveChains);
if (ActiveChains.find(AccumReg) != ActiveChains.end()) {
- DEBUG(dbgs() << "Chain found for accumulator register "
- << printReg(AccumReg, TRI) << " in MI " << *MI);
+ LLVM_DEBUG(dbgs() << "Chain found for accumulator register "
+ << printReg(AccumReg, TRI) << " in MI " << *MI);
// For simplicity we only chain together sequences of MULs/MLAs where the
// accumulator register is killed on each instruction. This means we don't
@@ -641,7 +645,7 @@ void AArch64A57FPLoadBalancing::scanInstruction(
// FIXME: We could extend to handle the non-kill cases for more coverage.
if (MI->getOperand(3).isKill()) {
// Add to chain.
- DEBUG(dbgs() << "Instruction was successfully added to chain.\n");
+ LLVM_DEBUG(dbgs() << "Instruction was successfully added to chain.\n");
ActiveChains[AccumReg]->add(MI, Idx, getColor(DestReg));
// Handle cases where the destination is not the same as the accumulator.
if (DestReg != AccumReg) {
@@ -651,13 +655,14 @@ void AArch64A57FPLoadBalancing::scanInstruction(
return;
}
- DEBUG(dbgs() << "Cannot add to chain because accumulator operand wasn't "
- << "marked <kill>!\n");
+ LLVM_DEBUG(
+ dbgs() << "Cannot add to chain because accumulator operand wasn't "
+ << "marked <kill>!\n");
maybeKillChain(MI->getOperand(3), Idx, ActiveChains);
}
- DEBUG(dbgs() << "Creating new chain for dest register "
- << printReg(DestReg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "Creating new chain for dest register "
+ << printReg(DestReg, TRI) << "\n");
auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg));
ActiveChains[DestReg] = G.get();
AllChains.push_back(std::move(G));
@@ -685,8 +690,8 @@ maybeKillChain(MachineOperand &MO, unsigned Idx,
// If this is a KILL of a current chain, record it.
if (MO.isKill() && ActiveChains.find(MO.getReg()) != ActiveChains.end()) {
- DEBUG(dbgs() << "Kill seen for chain " << printReg(MO.getReg(), TRI)
- << "\n");
+ LLVM_DEBUG(dbgs() << "Kill seen for chain " << printReg(MO.getReg(), TRI)
+ << "\n");
ActiveChains[MO.getReg()]->setKill(MI, Idx, /*Immutable=*/MO.isTied());
}
ActiveChains.erase(MO.getReg());
@@ -696,8 +701,8 @@ maybeKillChain(MachineOperand &MO, unsigned Idx,
for (auto I = ActiveChains.begin(), E = ActiveChains.end();
I != E;) {
if (MO.clobbersPhysReg(I->first)) {
- DEBUG(dbgs() << "Kill (regmask) seen for chain "
- << printReg(I->first, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "Kill (regmask) seen for chain "
+ << printReg(I->first, TRI) << "\n");
I->second->setKill(MI, Idx, /*Immutable=*/true);
ActiveChains.erase(I++);
} else
diff --git a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
index 338daecb49e5..22b0c1e3b471 100644
--- a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
+++ b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
@@ -277,7 +277,7 @@ static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr &MI,
MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
TII->get(AArch64::COPY), Dst)
.addReg(Src, getKillRegState(IsKill));
- DEBUG(dbgs() << " adding copy: " << *MIB);
+ LLVM_DEBUG(dbgs() << " adding copy: " << *MIB);
++NumCopiesInserted;
return MIB;
}
@@ -286,7 +286,7 @@ static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr &MI,
// to its equivalent AdvSIMD scalar instruction. Update inputs and outputs
// to be the correct register class, minimizing cross-class copies.
void AArch64AdvSIMDScalar::transformInstruction(MachineInstr &MI) {
- DEBUG(dbgs() << "Scalar transform: " << MI);
+ LLVM_DEBUG(dbgs() << "Scalar transform: " << MI);
MachineBasicBlock *MBB = MI.getParent();
unsigned OldOpc = MI.getOpcode();
@@ -391,7 +391,7 @@ bool AArch64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) {
// runOnMachineFunction - Pass entry point from PassManager.
bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) {
bool Changed = false;
- DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n");
+ LLVM_DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n");
if (skipFunction(mf.getFunction()))
return false;
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 2ff2ee347f56..52819dedc23d 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -71,7 +71,7 @@ public:
StringRef getPassName() const override { return "AArch64 Assembly Printer"; }
- /// \brief Wrapper for MCInstLowering.lowerOperand() for the
+ /// Wrapper for MCInstLowering.lowerOperand() for the
/// tblgen'erated pseudo lowering.
bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const {
return MCInstLowering.lowerOperand(MO, MCOp);
@@ -88,7 +88,7 @@ public:
void EmitSled(const MachineInstr &MI, SledKind Kind);
- /// \brief tblgen'erated driver function for lowering simple MI->MC
+ /// tblgen'erated driver function for lowering simple MI->MC
/// pseudo instructions.
bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
const MachineInstr *MI);
@@ -131,7 +131,7 @@ private:
AArch64FunctionInfo *AArch64FI = nullptr;
- /// \brief Emit the LOHs contained in AArch64FI.
+ /// Emit the LOHs contained in AArch64FI.
void EmitLOHs();
/// Emit instruction to set float register to zero.
@@ -210,29 +210,6 @@ void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
SM.serializeToStackMapSection();
}
-
- if (TT.isOSBinFormatCOFF()) {
- const auto &TLOF =
- static_cast<const TargetLoweringObjectFileCOFF &>(getObjFileLowering());
-
- std::string Flags;
- raw_string_ostream OS(Flags);
-
- for (const auto &Function : M)
- TLOF.emitLinkerFlagsForGlobal(OS, &Function);
- for (const auto &Global : M.globals())
- TLOF.emitLinkerFlagsForGlobal(OS, &Global);
- for (const auto &Alias : M.aliases())
- TLOF.emitLinkerFlagsForGlobal(OS, &Alias);
-
- OS.flush();
-
- // Output collected flags
- if (!Flags.empty()) {
- OutStreamer->SwitchSection(TLOF.getDrectveSection());
- OutStreamer->EmitBytes(Flags);
- }
- }
}
void AArch64AsmPrinter::EmitLOHs() {
@@ -265,9 +242,7 @@ MCSymbol *AArch64AsmPrinter::GetCPISymbol(unsigned CPID) const {
Twine(getDataLayout().getLinkerPrivateGlobalPrefix()) + "CPI" +
Twine(getFunctionNumber()) + "_" + Twine(CPID));
- return OutContext.getOrCreateSymbol(
- Twine(getDataLayout().getPrivateGlobalPrefix()) + "CPI" +
- Twine(getFunctionNumber()) + "_" + Twine(CPID));
+ return AsmPrinter::GetCPISymbol(CPID);
}
void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum,
@@ -299,6 +274,11 @@ void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum,
printOffset(MO.getOffset(), O);
break;
}
+ case MachineOperand::MO_BlockAddress: {
+ MCSymbol *Sym = GetBlockAddressSymbol(MO.getBlockAddress());
+ Sym->print(O, MAI);
+ break;
+ }
}
}
diff --git a/lib/Target/AArch64/AArch64CallLowering.cpp b/lib/Target/AArch64/AArch64CallLowering.cpp
index 08152c0d83d9..26d532555e78 100644
--- a/lib/Target/AArch64/AArch64CallLowering.cpp
+++ b/lib/Target/AArch64/AArch64CallLowering.cpp
@@ -31,7 +31,6 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
@@ -40,6 +39,7 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
+#include "llvm/Support/MachineValueType.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -155,6 +155,12 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
MachinePointerInfo &MPO, CCValAssign &VA) override {
+ if (VA.getLocInfo() == CCValAssign::LocInfo::AExt) {
+ Size = VA.getLocVT().getSizeInBits() / 8;
+ ValVReg = MIRBuilder.buildAnyExt(LLT::scalar(Size * 8), ValVReg)
+ ->getOperand(0)
+ .getReg();
+ }
auto MMO = MIRBuilder.getMF().getMachineMemOperand(
MPO, MachineMemOperand::MOStore, Size, 0);
MIRBuilder.buildStore(ValVReg, Addr, *MMO);
@@ -187,6 +193,9 @@ void AArch64CallLowering::splitToValueTypes(
const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
LLVMContext &Ctx = OrigArg.Ty->getContext();
+ if (OrigArg.Ty->isVoidTy())
+ return;
+
SmallVector<EVT, 4> SplitVTs;
SmallVector<uint64_t, 4> Offsets;
ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0);
@@ -226,9 +235,14 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
assert(((Val && VReg) || (!Val && !VReg)) && "Return value without a vreg");
bool Success = true;
if (VReg) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // We zero-extend i1s to i8.
+ if (MRI.getType(VReg).getSizeInBits() == 1)
+ VReg = MIRBuilder.buildZExt(LLT::scalar(8), VReg)->getOperand(0).getReg();
+
const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(F.getCallingConv());
- MachineRegisterInfo &MRI = MF.getRegInfo();
auto &DL = F.getParent()->getDataLayout();
ArgInfo OrigArg{VReg, Val->getType()};
@@ -369,8 +383,7 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
if (Callee.isReg())
MIB->getOperand(0).setReg(constrainOperandRegClass(
MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(),
- *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(),
- Callee.getReg(), 0));
+ *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Callee, 0));
// Finally we can copy the returned value back into its virtual-register. In
// symmetry with the arguments, the physical register must be an
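A stand-alone sketch (not LLVM code) of what the new AExt handling in assignValueToAddress does for an outgoing stack argument: the value is widened to the size of its assigned location before the store, so the full slot is written. Types and sizes below are illustrative assumptions:

// Illustrative only: any-extend a narrow value to its 4-byte outgoing slot
// before storing it, mirroring the AExt case added above.
#include <cstdint>
#include <cstring>
#include <iostream>

int main() {
  uint8_t Narrow = 0x7F;     // the original 8-bit argument value
  uint32_t Widened = Narrow; // any-extend; upper bits are don't-care, zero here
  unsigned char StackSlot[4];
  std::memcpy(StackSlot, &Widened, sizeof(Widened)); // write the whole slot
  std::cout << "stored " << sizeof(Widened) << " bytes, low byte = 0x"
            << std::hex << unsigned(StackSlot[0]) << '\n';
}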
diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td
index 93a68449de8d..30492003df14 100644
--- a/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/lib/Target/AArch64/AArch64CallingConvention.td
@@ -345,3 +345,22 @@ def CSR_AArch64_NoRegs : CalleeSavedRegs<(add)>;
def CSR_AArch64_RT_MostRegs : CalleeSavedRegs<(add CSR_AArch64_AAPCS,
(sequence "X%u", 9, 15))>;
+def CSR_AArch64_StackProbe_Windows
+ : CalleeSavedRegs<(add (sequence "X%u", 0, 15),
+ (sequence "X%u", 18, 28), FP, SP,
+ (sequence "Q%u", 0, 31))>;
+
+// Variants of the standard calling conventions for shadow call stack.
+// These all preserve x18 in addition to any other registers.
+def CSR_AArch64_NoRegs_SCS
+ : CalleeSavedRegs<(add CSR_AArch64_NoRegs, X18)>;
+def CSR_AArch64_AllRegs_SCS
+ : CalleeSavedRegs<(add CSR_AArch64_AllRegs, X18)>;
+def CSR_AArch64_CXX_TLS_Darwin_SCS
+ : CalleeSavedRegs<(add CSR_AArch64_CXX_TLS_Darwin, X18)>;
+def CSR_AArch64_AAPCS_SwiftError_SCS
+ : CalleeSavedRegs<(add CSR_AArch64_AAPCS_SwiftError, X18)>;
+def CSR_AArch64_RT_MostRegs_SCS
+ : CalleeSavedRegs<(add CSR_AArch64_RT_MostRegs, X18)>;
+def CSR_AArch64_AAPCS_SCS
+ : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X18)>;
diff --git a/lib/Target/AArch64/AArch64CollectLOH.cpp b/lib/Target/AArch64/AArch64CollectLOH.cpp
index 0a9167edcdb3..720323f81d29 100644
--- a/lib/Target/AArch64/AArch64CollectLOH.cpp
+++ b/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -380,8 +380,8 @@ static bool handleMiddleInst(const MachineInstr &MI, LOHInfo &DefInfo,
static void handleADRP(const MachineInstr &MI, AArch64FunctionInfo &AFI,
LOHInfo &Info) {
if (Info.LastADRP != nullptr) {
- DEBUG(dbgs() << "Adding MCLOH_AdrpAdrp:\n" << '\t' << MI << '\t'
- << *Info.LastADRP);
+ LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpAdrp:\n"
+ << '\t' << MI << '\t' << *Info.LastADRP);
AFI.addLOHDirective(MCLOH_AdrpAdrp, {&MI, Info.LastADRP});
++NumADRPSimpleCandidate;
}
@@ -390,48 +390,52 @@ static void handleADRP(const MachineInstr &MI, AArch64FunctionInfo &AFI,
if (Info.IsCandidate) {
switch (Info.Type) {
case MCLOH_AdrpAdd:
- DEBUG(dbgs() << "Adding MCLOH_AdrpAdd:\n" << '\t' << MI << '\t'
- << *Info.MI0);
+ LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpAdd:\n"
+ << '\t' << MI << '\t' << *Info.MI0);
AFI.addLOHDirective(MCLOH_AdrpAdd, {&MI, Info.MI0});
++NumADRSimpleCandidate;
break;
case MCLOH_AdrpLdr:
if (supportLoadFromLiteral(*Info.MI0)) {
- DEBUG(dbgs() << "Adding MCLOH_AdrpLdr:\n" << '\t' << MI << '\t'
- << *Info.MI0);
+ LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpLdr:\n"
+ << '\t' << MI << '\t' << *Info.MI0);
AFI.addLOHDirective(MCLOH_AdrpLdr, {&MI, Info.MI0});
++NumADRPToLDR;
}
break;
case MCLOH_AdrpAddLdr:
- DEBUG(dbgs() << "Adding MCLOH_AdrpAddLdr:\n" << '\t' << MI << '\t'
- << *Info.MI1 << '\t' << *Info.MI0);
+ LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpAddLdr:\n"
+ << '\t' << MI << '\t' << *Info.MI1 << '\t'
+ << *Info.MI0);
AFI.addLOHDirective(MCLOH_AdrpAddLdr, {&MI, Info.MI1, Info.MI0});
++NumADDToLDR;
break;
case MCLOH_AdrpAddStr:
if (Info.MI1 != nullptr) {
- DEBUG(dbgs() << "Adding MCLOH_AdrpAddStr:\n" << '\t' << MI << '\t'
- << *Info.MI1 << '\t' << *Info.MI0);
+ LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpAddStr:\n"
+ << '\t' << MI << '\t' << *Info.MI1 << '\t'
+ << *Info.MI0);
AFI.addLOHDirective(MCLOH_AdrpAddStr, {&MI, Info.MI1, Info.MI0});
++NumADDToSTR;
}
break;
case MCLOH_AdrpLdrGotLdr:
- DEBUG(dbgs() << "Adding MCLOH_AdrpLdrGotLdr:\n" << '\t' << MI << '\t'
- << *Info.MI1 << '\t' << *Info.MI0);
+ LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpLdrGotLdr:\n"
+ << '\t' << MI << '\t' << *Info.MI1 << '\t'
+ << *Info.MI0);
AFI.addLOHDirective(MCLOH_AdrpLdrGotLdr, {&MI, Info.MI1, Info.MI0});
++NumLDRToLDR;
break;
case MCLOH_AdrpLdrGotStr:
- DEBUG(dbgs() << "Adding MCLOH_AdrpLdrGotStr:\n" << '\t' << MI << '\t'
- << *Info.MI1 << '\t' << *Info.MI0);
+ LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpLdrGotStr:\n"
+ << '\t' << MI << '\t' << *Info.MI1 << '\t'
+ << *Info.MI0);
AFI.addLOHDirective(MCLOH_AdrpLdrGotStr, {&MI, Info.MI1, Info.MI0});
++NumLDRToSTR;
break;
case MCLOH_AdrpLdrGot:
- DEBUG(dbgs() << "Adding MCLOH_AdrpLdrGot:\n" << '\t' << MI << '\t'
- << *Info.MI0);
+ LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpLdrGot:\n"
+ << '\t' << MI << '\t' << *Info.MI0);
AFI.addLOHDirective(MCLOH_AdrpLdrGot, {&MI, Info.MI0});
break;
case MCLOH_AdrpAdrp:
@@ -485,8 +489,8 @@ bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
- DEBUG(dbgs() << "********** AArch64 Collect LOH **********\n"
- << "Looking in function " << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** AArch64 Collect LOH **********\n"
+ << "Looking in function " << MF.getName() << '\n');
LOHInfo LOHInfos[N_GPR_REGS];
AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
diff --git a/lib/Target/AArch64/AArch64CondBrTuning.cpp b/lib/Target/AArch64/AArch64CondBrTuning.cpp
index 30cefbad884c..5ae787409ae8 100644
--- a/lib/Target/AArch64/AArch64CondBrTuning.cpp
+++ b/lib/Target/AArch64/AArch64CondBrTuning.cpp
@@ -201,10 +201,10 @@ bool AArch64CondBrTuning::tryToTuneBranch(MachineInstr &MI,
I->readsRegister(AArch64::NZCV, TRI))
return false;
}
- DEBUG(dbgs() << " Replacing instructions:\n ");
- DEBUG(DefMI.print(dbgs()));
- DEBUG(dbgs() << " ");
- DEBUG(MI.print(dbgs()));
+ LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
+ LLVM_DEBUG(DefMI.print(dbgs()));
+ LLVM_DEBUG(dbgs() << " ");
+ LLVM_DEBUG(MI.print(dbgs()));
NewCmp = convertToFlagSetting(DefMI, IsFlagSetting);
NewBr = convertToCondBr(MI);
@@ -260,10 +260,10 @@ bool AArch64CondBrTuning::tryToTuneBranch(MachineInstr &MI,
I->readsRegister(AArch64::NZCV, TRI))
return false;
}
- DEBUG(dbgs() << " Replacing instructions:\n ");
- DEBUG(DefMI.print(dbgs()));
- DEBUG(dbgs() << " ");
- DEBUG(MI.print(dbgs()));
+ LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
+ LLVM_DEBUG(DefMI.print(dbgs()));
+ LLVM_DEBUG(dbgs() << " ");
+ LLVM_DEBUG(MI.print(dbgs()));
NewCmp = convertToFlagSetting(DefMI, IsFlagSetting);
NewBr = convertToCondBr(MI);
@@ -275,10 +275,10 @@ bool AArch64CondBrTuning::tryToTuneBranch(MachineInstr &MI,
(void)NewCmp; (void)NewBr;
assert(NewCmp && NewBr && "Expected new instructions.");
- DEBUG(dbgs() << " with instruction:\n ");
- DEBUG(NewCmp->print(dbgs()));
- DEBUG(dbgs() << " ");
- DEBUG(NewBr->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " with instruction:\n ");
+ LLVM_DEBUG(NewCmp->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " ");
+ LLVM_DEBUG(NewBr->print(dbgs()));
// If this was a flag setting version of the instruction, we use the original
// instruction by just clearing the dead mark on the implicit-def of NZCV.
@@ -293,8 +293,9 @@ bool AArch64CondBrTuning::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
- DEBUG(dbgs() << "********** AArch64 Conditional Branch Tuning **********\n"
- << "********** Function: " << MF.getName() << '\n');
+ LLVM_DEBUG(
+ dbgs() << "********** AArch64 Conditional Branch Tuning **********\n"
+ << "********** Function: " << MF.getName() << '\n');
TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
TRI = MF.getSubtarget().getRegisterInfo();
diff --git a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
index d14bde33d94e..5064762b9f77 100644
--- a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
@@ -173,13 +173,14 @@ MachineInstr *AArch64ConditionOptimizer::findSuitableCompare(
case AArch64::ADDSXri: {
unsigned ShiftAmt = AArch64_AM::getShiftValue(I->getOperand(3).getImm());
if (!I->getOperand(2).isImm()) {
- DEBUG(dbgs() << "Immediate of cmp is symbolic, " << *I << '\n');
+ LLVM_DEBUG(dbgs() << "Immediate of cmp is symbolic, " << *I << '\n');
return nullptr;
} else if (I->getOperand(2).getImm() << ShiftAmt >= 0xfff) {
- DEBUG(dbgs() << "Immediate of cmp may be out of range, " << *I << '\n');
+ LLVM_DEBUG(dbgs() << "Immediate of cmp may be out of range, " << *I
+ << '\n');
return nullptr;
} else if (!MRI->use_empty(I->getOperand(0).getReg())) {
- DEBUG(dbgs() << "Destination of cmp is not dead, " << *I << '\n');
+ LLVM_DEBUG(dbgs() << "Destination of cmp is not dead, " << *I << '\n');
return nullptr;
}
return &*I;
@@ -207,7 +208,8 @@ MachineInstr *AArch64ConditionOptimizer::findSuitableCompare(
return nullptr;
}
}
- DEBUG(dbgs() << "Flags not defined in " << printMBBReference(*MBB) << '\n');
+ LLVM_DEBUG(dbgs() << "Flags not defined in " << printMBBReference(*MBB)
+ << '\n');
return nullptr;
}
@@ -325,8 +327,8 @@ bool AArch64ConditionOptimizer::adjustTo(MachineInstr *CmpMI,
}
bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n"
- << "********** Function: " << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n"
+ << "********** Function: " << MF.getName() << '\n');
if (skipFunction(MF.getFunction()))
return false;
@@ -384,15 +386,15 @@ bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) {
const int HeadImm = (int)HeadCmpMI->getOperand(2).getImm();
const int TrueImm = (int)TrueCmpMI->getOperand(2).getImm();
- DEBUG(dbgs() << "Head branch:\n");
- DEBUG(dbgs() << "\tcondition: "
- << AArch64CC::getCondCodeName(HeadCmp) << '\n');
- DEBUG(dbgs() << "\timmediate: " << HeadImm << '\n');
+ LLVM_DEBUG(dbgs() << "Head branch:\n");
+ LLVM_DEBUG(dbgs() << "\tcondition: " << AArch64CC::getCondCodeName(HeadCmp)
+ << '\n');
+ LLVM_DEBUG(dbgs() << "\timmediate: " << HeadImm << '\n');
- DEBUG(dbgs() << "True branch:\n");
- DEBUG(dbgs() << "\tcondition: "
- << AArch64CC::getCondCodeName(TrueCmp) << '\n');
- DEBUG(dbgs() << "\timmediate: " << TrueImm << '\n');
+ LLVM_DEBUG(dbgs() << "True branch:\n");
+ LLVM_DEBUG(dbgs() << "\tcondition: " << AArch64CC::getCondCodeName(TrueCmp)
+ << '\n');
+ LLVM_DEBUG(dbgs() << "\timmediate: " << TrueImm << '\n');
if (((HeadCmp == AArch64CC::GT && TrueCmp == AArch64CC::LT) ||
(HeadCmp == AArch64CC::LT && TrueCmp == AArch64CC::GT)) &&
diff --git a/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index b0bda7c43c15..8176b6fb269d 100644
--- a/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -311,7 +311,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
return &*I;
}
++NumCmpTermRejs;
- DEBUG(dbgs() << "Flags not used by terminator: " << *I);
+ LLVM_DEBUG(dbgs() << "Flags not used by terminator: " << *I);
return nullptr;
}
@@ -329,7 +329,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
// Check that the immediate operand is within range, ccmp wants a uimm5.
// Rd = SUBSri Rn, imm, shift
if (I->getOperand(3).getImm() || !isUInt<5>(I->getOperand(2).getImm())) {
- DEBUG(dbgs() << "Immediate out of range for ccmp: " << *I);
+ LLVM_DEBUG(dbgs() << "Immediate out of range for ccmp: " << *I);
++NumImmRangeRejs;
return nullptr;
}
@@ -340,7 +340,8 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
case AArch64::ADDSXrr:
if (isDeadDef(I->getOperand(0).getReg()))
return &*I;
- DEBUG(dbgs() << "Can't convert compare with live destination: " << *I);
+ LLVM_DEBUG(dbgs() << "Can't convert compare with live destination: "
+ << *I);
++NumLiveDstRejs;
return nullptr;
case AArch64::FCMPSrr:
@@ -358,18 +359,19 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
// The ccmp doesn't produce exactly the same flags as the original
// compare, so reject the transform if there are uses of the flags
// besides the terminators.
- DEBUG(dbgs() << "Can't create ccmp with multiple uses: " << *I);
+ LLVM_DEBUG(dbgs() << "Can't create ccmp with multiple uses: " << *I);
++NumMultNZCVUses;
return nullptr;
}
if (PRI.Defined || PRI.Clobbered) {
- DEBUG(dbgs() << "Not convertible compare: " << *I);
+ LLVM_DEBUG(dbgs() << "Not convertible compare: " << *I);
++NumUnknNZCVDefs;
return nullptr;
}
}
- DEBUG(dbgs() << "Flags not defined in " << printMBBReference(*MBB) << '\n');
+ LLVM_DEBUG(dbgs() << "Flags not defined in " << printMBBReference(*MBB)
+ << '\n');
return nullptr;
}
@@ -383,7 +385,7 @@ bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB,
// Reject any live-in physregs. It's probably NZCV/EFLAGS, and very hard to
// get right.
if (!MBB->livein_empty()) {
- DEBUG(dbgs() << printMBBReference(*MBB) << " has live-ins.\n");
+ LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << " has live-ins.\n");
return false;
}
@@ -392,18 +394,18 @@ bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB,
// Check all instructions, except the terminators. It is assumed that
// terminators never have side effects or define any used register values.
for (auto &I : make_range(MBB->begin(), MBB->getFirstTerminator())) {
- if (I.isDebugValue())
+ if (I.isDebugInstr())
continue;
if (++InstrCount > BlockInstrLimit && !Stress) {
- DEBUG(dbgs() << printMBBReference(*MBB) << " has more than "
- << BlockInstrLimit << " instructions.\n");
+ LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << " has more than "
+ << BlockInstrLimit << " instructions.\n");
return false;
}
// There shouldn't normally be any phis in a single-predecessor block.
if (I.isPHI()) {
- DEBUG(dbgs() << "Can't hoist: " << I);
+ LLVM_DEBUG(dbgs() << "Can't hoist: " << I);
return false;
}
@@ -411,20 +413,20 @@ bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB,
// speculate GOT or constant pool loads that are guaranteed not to trap,
// but we don't support that for now.
if (I.mayLoad()) {
- DEBUG(dbgs() << "Won't speculate load: " << I);
+ LLVM_DEBUG(dbgs() << "Won't speculate load: " << I);
return false;
}
// We never speculate stores, so an AA pointer isn't necessary.
bool DontMoveAcrossStore = true;
if (!I.isSafeToMove(nullptr, DontMoveAcrossStore)) {
- DEBUG(dbgs() << "Can't speculate: " << I);
+ LLVM_DEBUG(dbgs() << "Can't speculate: " << I);
return false;
}
// Only CmpMI is allowed to clobber the flags.
if (&I != CmpMI && I.modifiesRegister(AArch64::NZCV, TRI)) {
- DEBUG(dbgs() << "Clobbers flags: " << I);
+ LLVM_DEBUG(dbgs() << "Clobbers flags: " << I);
return false;
}
}
@@ -458,9 +460,9 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
return false;
// The CFG topology checks out.
- DEBUG(dbgs() << "\nTriangle: " << printMBBReference(*Head) << " -> "
- << printMBBReference(*CmpBB) << " -> "
- << printMBBReference(*Tail) << '\n');
+ LLVM_DEBUG(dbgs() << "\nTriangle: " << printMBBReference(*Head) << " -> "
+ << printMBBReference(*CmpBB) << " -> "
+ << printMBBReference(*Tail) << '\n');
++NumConsidered;
// Tail is allowed to have many predecessors, but we can't handle PHIs yet.
@@ -470,13 +472,13 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
// always be safe to sink the ccmp down to immediately before the CmpBB
// terminators.
if (!trivialTailPHIs()) {
- DEBUG(dbgs() << "Can't handle phis in Tail.\n");
+ LLVM_DEBUG(dbgs() << "Can't handle phis in Tail.\n");
++NumPhiRejs;
return false;
}
if (!Tail->livein_empty()) {
- DEBUG(dbgs() << "Can't handle live-in physregs in Tail.\n");
+ LLVM_DEBUG(dbgs() << "Can't handle live-in physregs in Tail.\n");
++NumPhysRejs;
return false;
}
@@ -484,13 +486,13 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
// CmpBB should never have PHIs since Head is its only predecessor.
// FIXME: Clean them up if it happens.
if (!CmpBB->empty() && CmpBB->front().isPHI()) {
- DEBUG(dbgs() << "Can't handle phis in CmpBB.\n");
+ LLVM_DEBUG(dbgs() << "Can't handle phis in CmpBB.\n");
++NumPhi2Rejs;
return false;
}
if (!CmpBB->livein_empty()) {
- DEBUG(dbgs() << "Can't handle live-in physregs in CmpBB.\n");
+ LLVM_DEBUG(dbgs() << "Can't handle live-in physregs in CmpBB.\n");
++NumPhysRejs;
return false;
}
@@ -499,7 +501,7 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
HeadCond.clear();
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
if (TII->analyzeBranch(*Head, TBB, FBB, HeadCond)) {
- DEBUG(dbgs() << "Head branch not analyzable.\n");
+ LLVM_DEBUG(dbgs() << "Head branch not analyzable.\n");
++NumHeadBranchRejs;
return false;
}
@@ -507,13 +509,14 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
// This is weird, probably some sort of degenerate CFG, or an edge to a
// landing pad.
if (!TBB || HeadCond.empty()) {
- DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch in Head.\n");
+ LLVM_DEBUG(
+ dbgs() << "AnalyzeBranch didn't find conditional branch in Head.\n");
++NumHeadBranchRejs;
return false;
}
if (!parseCond(HeadCond, HeadCmpBBCC)) {
- DEBUG(dbgs() << "Unsupported branch type on Head\n");
+ LLVM_DEBUG(dbgs() << "Unsupported branch type on Head\n");
++NumHeadBranchRejs;
return false;
}
@@ -527,19 +530,20 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
CmpBBCond.clear();
TBB = FBB = nullptr;
if (TII->analyzeBranch(*CmpBB, TBB, FBB, CmpBBCond)) {
- DEBUG(dbgs() << "CmpBB branch not analyzable.\n");
+ LLVM_DEBUG(dbgs() << "CmpBB branch not analyzable.\n");
++NumCmpBranchRejs;
return false;
}
if (!TBB || CmpBBCond.empty()) {
- DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch in CmpBB.\n");
+ LLVM_DEBUG(
+ dbgs() << "AnalyzeBranch didn't find conditional branch in CmpBB.\n");
++NumCmpBranchRejs;
return false;
}
if (!parseCond(CmpBBCond, CmpBBTailCC)) {
- DEBUG(dbgs() << "Unsupported branch type on CmpBB\n");
+ LLVM_DEBUG(dbgs() << "Unsupported branch type on CmpBB\n");
++NumCmpBranchRejs;
return false;
}
@@ -547,9 +551,10 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
if (TBB != Tail)
CmpBBTailCC = AArch64CC::getInvertedCondCode(CmpBBTailCC);
- DEBUG(dbgs() << "Head->CmpBB on " << AArch64CC::getCondCodeName(HeadCmpBBCC)
- << ", CmpBB->Tail on " << AArch64CC::getCondCodeName(CmpBBTailCC)
- << '\n');
+ LLVM_DEBUG(dbgs() << "Head->CmpBB on "
+ << AArch64CC::getCondCodeName(HeadCmpBBCC)
+ << ", CmpBB->Tail on "
+ << AArch64CC::getCondCodeName(CmpBBTailCC) << '\n');
CmpMI = findConvertibleCompare(CmpBB);
if (!CmpMI)
@@ -563,9 +568,9 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
}
void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
- DEBUG(dbgs() << "Merging " << printMBBReference(*CmpBB) << " into "
- << printMBBReference(*Head) << ":\n"
- << *CmpBB);
+ LLVM_DEBUG(dbgs() << "Merging " << printMBBReference(*CmpBB) << " into "
+ << printMBBReference(*Head) << ":\n"
+ << *CmpBB);
// All CmpBB instructions are moved into Head, and CmpBB is deleted.
// Update the CFG first.
@@ -710,7 +715,7 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
RemovedBlocks.push_back(CmpBB);
CmpBB->eraseFromParent();
- DEBUG(dbgs() << "Result:\n" << *Head);
+ LLVM_DEBUG(dbgs() << "Result:\n" << *Head);
++NumConverted;
}
@@ -860,13 +865,13 @@ bool AArch64ConditionalCompares::shouldConvert() {
// If code size is the main concern
if (MinSize) {
int CodeSizeDelta = CmpConv.expectedCodeSizeDelta();
- DEBUG(dbgs() << "Code size delta: " << CodeSizeDelta << '\n');
+ LLVM_DEBUG(dbgs() << "Code size delta: " << CodeSizeDelta << '\n');
// If we are minimizing the code size, do the conversion whatever
// the cost is.
if (CodeSizeDelta < 0)
return true;
if (CodeSizeDelta > 0) {
- DEBUG(dbgs() << "Code size is increasing, give up on this one.\n");
+ LLVM_DEBUG(dbgs() << "Code size is increasing, give up on this one.\n");
return false;
}
// CodeSizeDelta == 0, continue with the regular heuristics
@@ -885,24 +890,24 @@ bool AArch64ConditionalCompares::shouldConvert() {
Trace.getInstrCycles(*CmpConv.Head->getFirstTerminator()).Depth;
unsigned CmpBBDepth =
Trace.getInstrCycles(*CmpConv.CmpBB->getFirstTerminator()).Depth;
- DEBUG(dbgs() << "Head depth: " << HeadDepth
- << "\nCmpBB depth: " << CmpBBDepth << '\n');
+ LLVM_DEBUG(dbgs() << "Head depth: " << HeadDepth
+ << "\nCmpBB depth: " << CmpBBDepth << '\n');
if (CmpBBDepth > HeadDepth + DelayLimit) {
- DEBUG(dbgs() << "Branch delay would be larger than " << DelayLimit
- << " cycles.\n");
+ LLVM_DEBUG(dbgs() << "Branch delay would be larger than " << DelayLimit
+ << " cycles.\n");
return false;
}
// Check the resource depth at the bottom of CmpBB - these instructions will
// be speculated.
unsigned ResDepth = Trace.getResourceDepth(true);
- DEBUG(dbgs() << "Resources: " << ResDepth << '\n');
+ LLVM_DEBUG(dbgs() << "Resources: " << ResDepth << '\n');
// Heuristic: The speculatively executed instructions must all be able to
// merge into the Head block. The Head critical path should dominate the
// resource cost of the speculated instructions.
if (ResDepth > HeadDepth) {
- DEBUG(dbgs() << "Too many instructions to speculate.\n");
+ LLVM_DEBUG(dbgs() << "Too many instructions to speculate.\n");
return false;
}
return true;
@@ -922,8 +927,8 @@ bool AArch64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) {
}
bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n"
- << "********** Function: " << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n"
+ << "********** Function: " << MF.getName() << '\n');
if (skipFunction(MF.getFunction()))
return false;
diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
index 8e7e740da6f6..2ba10d25e939 100644
--- a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
+++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
@@ -136,18 +136,21 @@ void AArch64DeadRegisterDefinitions::processMachineBasicBlock(
// We need to skip this instruction because while it appears to have a
// dead def it uses a frame index which might expand into a multi
// instruction sequence during EPI.
- DEBUG(dbgs() << " Ignoring, operand is frame index\n");
+ LLVM_DEBUG(dbgs() << " Ignoring, operand is frame index\n");
continue;
}
if (MI.definesRegister(AArch64::XZR) || MI.definesRegister(AArch64::WZR)) {
// It is not allowed to write to the same register (not even the zero
// register) twice in a single instruction.
- DEBUG(dbgs() << " Ignoring, XZR or WZR already used by the instruction\n");
+ LLVM_DEBUG(
+ dbgs()
+ << " Ignoring, XZR or WZR already used by the instruction\n");
continue;
}
if (shouldSkip(MI, MF)) {
- DEBUG(dbgs() << " Ignoring, Atomic instruction with acquire semantics using WZR/XZR\n");
+ LLVM_DEBUG(dbgs() << " Ignoring, Atomic instruction with acquire "
+ "semantics using WZR/XZR\n");
continue;
}
@@ -163,30 +166,30 @@ void AArch64DeadRegisterDefinitions::processMachineBasicBlock(
(!MO.isDead() && !MRI->use_nodbg_empty(Reg)))
continue;
assert(!MO.isImplicit() && "Unexpected implicit def!");
- DEBUG(dbgs() << " Dead def operand #" << I << " in:\n ";
- MI.print(dbgs()));
+ LLVM_DEBUG(dbgs() << " Dead def operand #" << I << " in:\n ";
+ MI.print(dbgs()));
// Be careful not to change the register if it's a tied operand.
if (MI.isRegTiedToUseOperand(I)) {
- DEBUG(dbgs() << " Ignoring, def is tied operand.\n");
+ LLVM_DEBUG(dbgs() << " Ignoring, def is tied operand.\n");
continue;
}
const TargetRegisterClass *RC = TII->getRegClass(Desc, I, TRI, MF);
unsigned NewReg;
if (RC == nullptr) {
- DEBUG(dbgs() << " Ignoring, register is not a GPR.\n");
+ LLVM_DEBUG(dbgs() << " Ignoring, register is not a GPR.\n");
continue;
} else if (RC->contains(AArch64::WZR))
NewReg = AArch64::WZR;
else if (RC->contains(AArch64::XZR))
NewReg = AArch64::XZR;
else {
- DEBUG(dbgs() << " Ignoring, register is not a GPR.\n");
+ LLVM_DEBUG(dbgs() << " Ignoring, register is not a GPR.\n");
continue;
}
- DEBUG(dbgs() << " Replacing with zero register. New:\n ");
+ LLVM_DEBUG(dbgs() << " Replacing with zero register. New:\n ");
MO.setReg(NewReg);
MO.setIsDead();
- DEBUG(MI.print(dbgs()));
+ LLVM_DEBUG(MI.print(dbgs()));
++NumDeadDefsReplaced;
Changed = true;
// Only replace one dead register, see check for zero register above.
@@ -204,7 +207,7 @@ bool AArch64DeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) {
TRI = MF.getSubtarget().getRegisterInfo();
TII = MF.getSubtarget().getInstrInfo();
MRI = &MF.getRegInfo();
- DEBUG(dbgs() << "***** AArch64DeadRegisterDefinitions *****\n");
+ LLVM_DEBUG(dbgs() << "***** AArch64DeadRegisterDefinitions *****\n");
Changed = false;
for (auto &MBB : MF)
processMachineBasicBlock(MBB);
diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index c3842785f2be..9226a9dd879b 100644
--- a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -66,6 +66,11 @@ private:
MachineBasicBlock::iterator &NextMBBI);
bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
unsigned BitSize);
+ bool expandMOVImmSimple(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned BitSize,
+ unsigned OneChunks,
+ unsigned ZeroChunks);
bool expandCMP_SWAP(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
unsigned LdarOp, unsigned StlrOp, unsigned CmpOp,
@@ -83,7 +88,7 @@ char AArch64ExpandPseudo::ID = 0;
INITIALIZE_PASS(AArch64ExpandPseudo, "aarch64-expand-pseudo",
AARCH64_EXPAND_PSEUDO_NAME, false, false)
-/// \brief Transfer implicit operands on the pseudo instruction to the
+/// Transfer implicit operands on the pseudo instruction to the
/// instructions created from the expansion.
static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI,
MachineInstrBuilder &DefMI) {
@@ -99,7 +104,7 @@ static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI,
}
}
-/// \brief Helper function which extracts the specified 16-bit chunk from a
+/// Helper function which extracts the specified 16-bit chunk from a
/// 64-bit value.
static uint64_t getChunk(uint64_t Imm, unsigned ChunkIdx) {
assert(ChunkIdx < 4 && "Out of range chunk index specified!");
@@ -107,58 +112,7 @@ static uint64_t getChunk(uint64_t Imm, unsigned ChunkIdx) {
return (Imm >> (ChunkIdx * 16)) & 0xFFFF;
}
-/// \brief Helper function which replicates a 16-bit chunk within a 64-bit
-/// value. Indices correspond to element numbers in a v4i16.
-static uint64_t replicateChunk(uint64_t Imm, unsigned FromIdx, unsigned ToIdx) {
- assert((FromIdx < 4) && (ToIdx < 4) && "Out of range chunk index specified!");
- const unsigned ShiftAmt = ToIdx * 16;
-
- // Replicate the source chunk to the destination position.
- const uint64_t Chunk = getChunk(Imm, FromIdx) << ShiftAmt;
- // Clear the destination chunk.
- Imm &= ~(0xFFFFLL << ShiftAmt);
- // Insert the replicated chunk.
- return Imm | Chunk;
-}
-
-/// \brief Helper function which tries to materialize a 64-bit value with an
-/// ORR + MOVK instruction sequence.
-static bool tryOrrMovk(uint64_t UImm, uint64_t OrrImm, MachineInstr &MI,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- const AArch64InstrInfo *TII, unsigned ChunkIdx) {
- assert(ChunkIdx < 4 && "Out of range chunk index specified!");
- const unsigned ShiftAmt = ChunkIdx * 16;
-
- uint64_t Encoding;
- if (AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding)) {
- // Create the ORR-immediate instruction.
- MachineInstrBuilder MIB =
- BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
- .add(MI.getOperand(0))
- .addReg(AArch64::XZR)
- .addImm(Encoding);
-
- // Create the MOVK instruction.
- const unsigned Imm16 = getChunk(UImm, ChunkIdx);
- const unsigned DstReg = MI.getOperand(0).getReg();
- const bool DstIsDead = MI.getOperand(0).isDead();
- MachineInstrBuilder MIB1 =
- BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
- .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstReg)
- .addImm(Imm16)
- .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt));
-
- transferImpOps(MI, MIB, MIB1);
- MI.eraseFromParent();
- return true;
- }
-
- return false;
-}
-
-/// \brief Check whether the given 16-bit chunk replicated to full 64-bit width
+/// Check whether the given 16-bit chunk replicated to full 64-bit width
/// can be materialized with an ORR instruction.
static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) {
Chunk = (Chunk << 48) | (Chunk << 32) | (Chunk << 16) | Chunk;
@@ -166,7 +120,7 @@ static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) {
return AArch64_AM::processLogicalImmediate(Chunk, 64, Encoding);
}
-/// \brief Check for identical 16-bit chunks within the constant and if so
+/// Check for identical 16-bit chunks within the constant and if so
/// materialize them with a single ORR instruction. The remaining one or two
/// 16-bit chunks will be materialized with MOVK instructions.
///
@@ -260,7 +214,7 @@ static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI,
return false;
}
-/// \brief Check whether this chunk matches the pattern '1...0...'. This pattern
+/// Check whether this chunk matches the pattern '1...0...'. This pattern
/// starts a contiguous sequence of ones if we look at the bits from the LSB
/// towards the MSB.
static bool isStartChunk(uint64_t Chunk) {
@@ -270,7 +224,7 @@ static bool isStartChunk(uint64_t Chunk) {
return isMask_64(~Chunk);
}
-/// \brief Check whether this chunk matches the pattern '0...1...' This pattern
+/// Check whether this chunk matches the pattern '0...1...' This pattern
/// ends a contiguous sequence of ones if we look at the bits from the LSB
/// towards the MSB.
static bool isEndChunk(uint64_t Chunk) {
@@ -280,7 +234,7 @@ static bool isEndChunk(uint64_t Chunk) {
return isMask_64(Chunk);
}
-/// \brief Clear or set all bits in the chunk at the given index.
+/// Clear or set all bits in the chunk at the given index.
static uint64_t updateImm(uint64_t Imm, unsigned Idx, bool Clear) {
const uint64_t Mask = 0xFFFF;
@@ -294,7 +248,7 @@ static uint64_t updateImm(uint64_t Imm, unsigned Idx, bool Clear) {
return Imm;
}
-/// \brief Check whether the constant contains a sequence of contiguous ones,
+/// Check whether the constant contains a sequence of contiguous ones,
/// which might be interrupted by one or two chunks. If so, materialize the
/// sequence of contiguous ones with an ORR instruction.
/// Materialize the chunks which are either interrupting the sequence or outside
@@ -423,7 +377,7 @@ static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI,
return true;
}
-/// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more
+/// Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more
/// real move-immediate instructions to synthesize the immediate.
bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
@@ -440,7 +394,22 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
return true;
}
- // Try a MOVI instruction (aka ORR-immediate with the zero register).
+ // Scan the immediate and count the number of 16-bit chunks which are either
+ // all ones or all zeros.
+ unsigned OneChunks = 0;
+ unsigned ZeroChunks = 0;
+ for (unsigned Shift = 0; Shift < BitSize; Shift += 16) {
+ const unsigned Chunk = (Imm >> Shift) & Mask;
+ if (Chunk == Mask)
+ OneChunks++;
+ else if (Chunk == 0)
+ ZeroChunks++;
+ }
+
+ // FIXME: Prefer MOVZ/MOVN over ORR because of the rules for the "mov"
+ // alias.
+
+ // Try a single ORR.
uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
uint64_t Encoding;
if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
@@ -455,74 +424,69 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
return true;
}
- // Scan the immediate and count the number of 16-bit chunks which are either
- // all ones or all zeros.
- unsigned OneChunks = 0;
- unsigned ZeroChunks = 0;
+ // Two instruction sequences.
+ //
+ // Prefer MOVZ/MOVN followed by MOVK; it's more readable, and possibly the
+ // fastest sequence with fast literal generation.
+ if (OneChunks >= (BitSize / 16) - 2 || ZeroChunks >= (BitSize / 16) - 2)
+ return expandMOVImmSimple(MBB, MBBI, BitSize, OneChunks, ZeroChunks);
+
+  assert(BitSize == 64 && "All 32-bit immediates can be expanded with a "
+                          "MOVZ/MOVK pair");
+
+ // Try other two-instruction sequences.
+
+ // 64-bit ORR followed by MOVK.
+ // We try to construct the ORR immediate in three different ways: either we
+ // zero out the chunk which will be replaced, we fill the chunk which will
+ // be replaced with ones, or we take the bit pattern from the other half of
+ // the 64-bit immediate. This is comprehensive because of the way ORR
+ // immediates are constructed.
for (unsigned Shift = 0; Shift < BitSize; Shift += 16) {
- const unsigned Chunk = (Imm >> Shift) & Mask;
- if (Chunk == Mask)
- OneChunks++;
- else if (Chunk == 0)
- ZeroChunks++;
- }
+ uint64_t ShiftedMask = (0xFFFFULL << Shift);
+ uint64_t ZeroChunk = UImm & ~ShiftedMask;
+ uint64_t OneChunk = UImm | ShiftedMask;
+ uint64_t RotatedImm = (UImm << 32) | (UImm >> 32);
+ uint64_t ReplicateChunk = ZeroChunk | (RotatedImm & ShiftedMask);
+ if (AArch64_AM::processLogicalImmediate(ZeroChunk, BitSize, Encoding) ||
+ AArch64_AM::processLogicalImmediate(OneChunk, BitSize, Encoding) ||
+ AArch64_AM::processLogicalImmediate(ReplicateChunk,
+ BitSize, Encoding)) {
+ // Create the ORR-immediate instruction.
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
+ .add(MI.getOperand(0))
+ .addReg(AArch64::XZR)
+ .addImm(Encoding);
+
+ // Create the MOVK instruction.
+ const unsigned Imm16 = getChunk(UImm, Shift / 16);
+ const unsigned DstReg = MI.getOperand(0).getReg();
+ const bool DstIsDead = MI.getOperand(0).isDead();
+ MachineInstrBuilder MIB1 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg)
+ .addImm(Imm16)
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift));
- // Since we can't materialize the constant with a single ORR instruction,
- // let's see whether we can materialize 3/4 of the constant with an ORR
- // instruction and use an additional MOVK instruction to materialize the
- // remaining 1/4.
- //
- // We are looking for constants with a pattern like: |A|X|B|X| or |X|A|X|B|.
- //
- // E.g. assuming |A|X|A|X| is a pattern which can be materialized with ORR,
- // we would create the following instruction sequence:
- //
- // ORR x0, xzr, |A|X|A|X|
- // MOVK x0, |B|, LSL #16
- //
- // Only look at 64-bit constants which can't be materialized with a single
- // instruction e.g. which have less than either three all zero or all one
- // chunks.
- //
- // Ignore 32-bit constants here, they always can be materialized with a
- // MOVZ/MOVN + MOVK pair. Since the 32-bit constant can't be materialized
- // with a single ORR, the best sequence we can achieve is a ORR + MOVK pair.
- // Thus we fall back to the default code below which in the best case creates
- // a single MOVZ/MOVN instruction (in case one chunk is all zero or all one).
- //
- if (BitSize == 64 && OneChunks < 3 && ZeroChunks < 3) {
- // If we interpret the 64-bit constant as a v4i16, are elements 0 and 2
- // identical?
- if (getChunk(UImm, 0) == getChunk(UImm, 2)) {
- // See if we can come up with a constant which can be materialized with
- // ORR-immediate by replicating element 3 into element 1.
- uint64_t OrrImm = replicateChunk(UImm, 3, 1);
- if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 1))
- return true;
-
- // See if we can come up with a constant which can be materialized with
- // ORR-immediate by replicating element 1 into element 3.
- OrrImm = replicateChunk(UImm, 1, 3);
- if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 3))
- return true;
-
- // If we interpret the 64-bit constant as a v4i16, are elements 1 and 3
- // identical?
- } else if (getChunk(UImm, 1) == getChunk(UImm, 3)) {
- // See if we can come up with a constant which can be materialized with
- // ORR-immediate by replicating element 2 into element 0.
- uint64_t OrrImm = replicateChunk(UImm, 2, 0);
- if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 0))
- return true;
-
- // See if we can come up with a constant which can be materialized with
- // ORR-immediate by replicating element 1 into element 3.
- OrrImm = replicateChunk(UImm, 0, 2);
- if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 2))
- return true;
+ transferImpOps(MI, MIB, MIB1);
+ MI.eraseFromParent();
+ return true;
}
}
+ // FIXME: Add more two-instruction sequences.
+
+ // Three instruction sequences.
+ //
+ // Prefer MOVZ/MOVN followed by two MOVK; it's more readable, and possibly
+ // the fastest sequence with fast literal generation. (If neither MOVK is
+ // part of a fast literal generation pair, it could be slower than the
+ // four-instruction sequence, but we won't worry about that for now.)
+ if (OneChunks || ZeroChunks)
+ return expandMOVImmSimple(MBB, MBBI, BitSize, OneChunks, ZeroChunks);
+
// Check for identical 16-bit chunks within the constant and if so materialize
// them with a single ORR instruction. The remaining one or two 16-bit chunks
// will be materialized with MOVK instructions.
@@ -537,6 +501,23 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
if (BitSize == 64 && trySequenceOfOnes(UImm, MI, MBB, MBBI, TII))
return true;
+ // We found no possible two or three instruction sequence; use the general
+ // four-instruction sequence.
+ return expandMOVImmSimple(MBB, MBBI, BitSize, OneChunks, ZeroChunks);
+}
+
+/// Expand a MOVi32imm or MOVi64imm pseudo instruction to a
+/// MOVZ or MOVN of width BitSize followed by up to 3 MOVK instructions.
+bool AArch64ExpandPseudo::expandMOVImmSimple(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned BitSize,
+ unsigned OneChunks,
+ unsigned ZeroChunks) {
+ MachineInstr &MI = *MBBI;
+ unsigned DstReg = MI.getOperand(0).getReg();
+ uint64_t Imm = MI.getOperand(1).getImm();
+ const unsigned Mask = 0xFFFF;
+
// Use a MOVZ or MOVN instruction to set the high bits, followed by one or
// more MOVK instructions to insert additional 16-bit portions into the
// lower bits.
@@ -778,7 +759,7 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128(
return true;
}
-/// \brief If MBBI references a pseudo instruction that should be expanded here,
+/// If MBBI references a pseudo instruction that should be expanded here,
/// do the expansion and return true. Otherwise return false.
bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
@@ -911,6 +892,16 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
MI.eraseFromParent();
return true;
}
+ case AArch64::ADDlowTLS:
+ // Produce a plain ADD
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
+ .addImm(0);
+ MI.eraseFromParent();
+ return true;
+
case AArch64::MOVbaseTLS: {
unsigned DstReg = MI.getOperand(0).getReg();
auto SysReg = AArch64SysReg::TPIDR_EL0;
@@ -980,7 +971,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
return false;
}
-/// \brief Iterate over the instructions in basic block MBB and expand any
+/// Iterate over the instructions in basic block MBB and expand any
/// pseudo instructions. Return true if anything was modified.
bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
bool Modified = false;
@@ -1004,7 +995,7 @@ bool AArch64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
return Modified;
}
-/// \brief Returns an instance of the pseudo instruction expansion pass.
+/// Returns an instance of the pseudo instruction expansion pass.
FunctionPass *llvm::createAArch64ExpandPseudoPass() {
return new AArch64ExpandPseudo();
}
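
The reworked expandMOVImm above counts the all-zero and all-one 16-bit chunks up front and works through progressively longer sequences: a single ORR, a MOVZ/MOVN plus MOVKs when enough chunks are trivial, and a 64-bit ORR followed by one MOVK when zeroing, filling, or replicating the chunk to be overwritten yields a valid logical immediate, before falling back to expandMOVImmSimple. A small standalone sketch of the chunk bookkeeping with one worked case (plain C++ outside LLVM's MachineInstr machinery; the constant and helper names are purely illustrative):

    #include <cstdint>
    #include <cstdio>

    // Count how many 16-bit chunks of a 64-bit immediate are all-ones or
    // all-zeros; these counts decide how many MOVKs a MOVZ/MOVN-based
    // expansion would need.
    static void countChunks(uint64_t Imm, unsigned &Ones, unsigned &Zeros) {
      Ones = Zeros = 0;
      for (unsigned Shift = 0; Shift < 64; Shift += 16) {
        uint64_t Chunk = (Imm >> Shift) & 0xFFFF;
        if (Chunk == 0xFFFF)
          ++Ones;
        else if (Chunk == 0)
          ++Zeros;
      }
    }

    // One of the three ORR candidates tried per chunk: replace the chunk that
    // the MOVK will overwrite with the chunk from the other 32-bit half (the
    // "ReplicateChunk" case above).
    static uint64_t replicateCandidate(uint64_t Imm, unsigned Shift) {
      uint64_t ShiftedMask = 0xFFFFULL << Shift;
      uint64_t Rotated = (Imm << 32) | (Imm >> 32);
      return (Imm & ~ShiftedMask) | (Rotated & ShiftedMask);
    }

    int main() {
      // 0x00FF00FF00FF12AB has no trivial chunks, so MOVZ would need three
      // MOVKs.  Replicating the low chunk from the rotated value gives
      // 0x00FF00FF00FF00FF, a valid repeating logical immediate, so two
      // instructions suffice:
      //   ORR  xd, xzr, #0x00FF00FF00FF00FF
      //   MOVK xd, #0x12AB
      uint64_t Imm = 0x00FF00FF00FF12ABULL;
      unsigned Ones, Zeros;
      countChunks(Imm, Ones, Zeros);
      std::printf("ones=%u zeros=%u orr=%#llx movk=%#llx\n", Ones, Zeros,
                  (unsigned long long)replicateCandidate(Imm, 0),
                  (unsigned long long)(Imm & 0xFFFF));
      return 0;
    }
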
diff --git a/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
index d1ddb2e3ef70..bc9a5ca97fea 100644
--- a/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
+++ b/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
@@ -10,7 +10,7 @@
/// that may inhibit the HW prefetching. This is done in two steps. Before
/// ISel, we mark strided loads (i.e. those that will likely benefit from
/// prefetching) with metadata. Then, after opcodes have been finalized, we
-/// insert MOVs and re-write loads to prevent unintnentional tag collisions.
+/// insert MOVs and re-write loads to prevent unintentional tag collisions.
// ===---------------------------------------------------------------------===//
#include "AArch64.h"
@@ -46,6 +46,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <iterator>
@@ -59,7 +60,9 @@ STATISTIC(NumStridedLoadsMarked, "Number of strided loads marked");
STATISTIC(NumCollisionsAvoided,
"Number of HW prefetch tag collisions avoided");
STATISTIC(NumCollisionsNotAvoided,
- "Number of HW prefetch tag collisions not avoided due to lack of regsiters");
+ "Number of HW prefetch tag collisions not avoided due to lack of registers");
+DEBUG_COUNTER(FixCounter, "falkor-hwpf",
+ "Controls which tag collisions are avoided");
namespace {
@@ -166,7 +169,7 @@ bool FalkorMarkStridedAccesses::runOnLoop(Loop &L) {
LoadI->setMetadata(FALKOR_STRIDED_ACCESS_MD,
MDNode::get(LoadI->getContext(), {}));
++NumStridedLoadsMarked;
- DEBUG(dbgs() << "Load: " << I << " marked as strided\n");
+ LLVM_DEBUG(dbgs() << "Load: " << I << " marked as strided\n");
MadeChange = true;
}
}
@@ -187,6 +190,7 @@ public:
bool runOnMachineFunction(MachineFunction &Fn) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
AU.addRequired<MachineLoopInfo>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -727,7 +731,22 @@ void FalkorHWPFFix::runOnLoop(MachineLoop &L, MachineFunction &Fn) {
continue;
bool Fixed = false;
- DEBUG(dbgs() << "Attempting to fix tag collision: " << MI);
+ LLVM_DEBUG(dbgs() << "Attempting to fix tag collision: " << MI);
+
+ if (!DebugCounter::shouldExecute(FixCounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping fix due to debug counter:\n " << MI);
+ continue;
+ }
+
+ // Add the non-base registers of MI as live so we don't use them as
+ // scratch registers.
+ for (unsigned OpI = 0, OpE = MI.getNumOperands(); OpI < OpE; ++OpI) {
+ if (OpI == static_cast<unsigned>(LdI.BaseRegIdx))
+ continue;
+ MachineOperand &MO = MI.getOperand(OpI);
+ if (MO.isReg() && MO.readsReg())
+ LR.addReg(MO.getReg());
+ }
for (unsigned ScratchReg : AArch64::GPR64RegClass) {
if (!LR.available(ScratchReg) || MRI.isReserved(ScratchReg))
@@ -740,8 +759,8 @@ void FalkorHWPFFix::runOnLoop(MachineLoop &L, MachineFunction &Fn) {
if (TagMap.count(NewTag))
continue;
- DEBUG(dbgs() << "Changing base reg to: " << printReg(ScratchReg, TRI)
- << '\n');
+ LLVM_DEBUG(dbgs() << "Changing base reg to: "
+ << printReg(ScratchReg, TRI) << '\n');
// Rewrite:
// Xd = LOAD Xb, off
@@ -759,8 +778,8 @@ void FalkorHWPFFix::runOnLoop(MachineLoop &L, MachineFunction &Fn) {
// If the load does a pre/post increment, then insert a MOV after as
// well to update the real base register.
if (LdI.IsPrePost) {
- DEBUG(dbgs() << "Doing post MOV of incremented reg: "
- << printReg(ScratchReg, TRI) << '\n');
+ LLVM_DEBUG(dbgs() << "Doing post MOV of incremented reg: "
+ << printReg(ScratchReg, TRI) << '\n');
MI.getOperand(0).setReg(
ScratchReg); // Change tied operand pre/post update dest.
BuildMI(*MBB, std::next(MachineBasicBlock::iterator(MI)), DL,
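
The FalkorHWPFFix changes above gate each rewrite behind a new falkor-hwpf debug counter and, before scanning GPR64 for a scratch register, mark every register the load reads (other than its base) as live so the rewrite cannot reuse one of them. A condensed standalone sketch of that selection constraint (containers and names are simplified stand-ins for LiveRegUnits and the operand walk, not the pass's code):

    #include <optional>
    #include <set>
    #include <vector>

    // Pick a scratch register to rebase the load onto: it must not already be
    // live at the load and must not be read by the load itself.
    static std::optional<unsigned>
    pickScratch(const std::vector<unsigned> &AllocOrder,
                std::set<unsigned> Live, // taken by value: extended locally
                const std::vector<unsigned> &ReadByLoad) {
      for (unsigned R : ReadByLoad) // mirrors the new "add as live" loop
        Live.insert(R);
      for (unsigned R : AllocOrder)
        if (!Live.count(R))
          return R;
      return std::nullopt; // counted as a collision not avoided
    }
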
diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
index 022200986d2b..43a3ae77a170 100644
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -35,7 +35,6 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
@@ -66,6 +65,7 @@
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include <algorithm>
#include <cassert>
@@ -307,7 +307,7 @@ public:
#include "AArch64GenCallingConv.inc"
-/// \brief Check if the sign-/zero-extend will be a noop.
+/// Check if the sign-/zero-extend will be a noop.
static bool isIntExtFree(const Instruction *I) {
assert((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
"Unexpected integer extend instruction.");
@@ -326,7 +326,7 @@ static bool isIntExtFree(const Instruction *I) {
return false;
}
-/// \brief Determine the implicit scale factor that is applied by a memory
+/// Determine the implicit scale factor that is applied by a memory
/// operation for a given value type.
static unsigned getImplicitScaleFactor(MVT VT) {
switch (VT.SimpleTy) {
@@ -476,26 +476,27 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) {
// ADRP + LDRX
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
ADRPReg)
- .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGE);
+ .addGlobalAddress(GV, 0, AArch64II::MO_PAGE | OpFlags);
ResultReg = createResultReg(&AArch64::GPR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::LDRXui),
ResultReg)
- .addReg(ADRPReg)
- .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF |
- AArch64II::MO_NC);
+ .addReg(ADRPReg)
+ .addGlobalAddress(GV, 0,
+ AArch64II::MO_PAGEOFF | AArch64II::MO_NC | OpFlags);
} else {
// ADRP + ADDX
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
ADRPReg)
- .addGlobalAddress(GV, 0, AArch64II::MO_PAGE);
+ .addGlobalAddress(GV, 0, AArch64II::MO_PAGE | OpFlags);
ResultReg = createResultReg(&AArch64::GPR64spRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri),
ResultReg)
- .addReg(ADRPReg)
- .addGlobalAddress(GV, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
- .addImm(0);
+ .addReg(ADRPReg)
+ .addGlobalAddress(GV, 0,
+ AArch64II::MO_PAGEOFF | AArch64II::MO_NC | OpFlags)
+ .addImm(0);
}
return ResultReg;
}
@@ -534,7 +535,7 @@ unsigned AArch64FastISel::fastMaterializeFloatZero(const ConstantFP* CFP) {
return fastEmitInst_r(Opc, TLI.getRegClassFor(VT), ZReg, /*IsKill=*/true);
}
-/// \brief Check if the multiply is by a power-of-2 constant.
+/// Check if the multiply is by a power-of-2 constant.
static bool isMulPowOf2(const Value *I) {
if (const auto *MI = dyn_cast<MulOperator>(I)) {
if (const auto *C = dyn_cast<ConstantInt>(MI->getOperand(0)))
@@ -963,7 +964,7 @@ bool AArch64FastISel::isTypeLegal(Type *Ty, MVT &VT) {
return TLI.isTypeLegal(VT);
}
-/// \brief Determine if the value type is supported by FastISel.
+/// Determine if the value type is supported by FastISel.
///
/// FastISel for AArch64 can handle more value types than are legal. This adds
/// simple value type such as i1, i8, and i16.
@@ -1523,7 +1524,7 @@ unsigned AArch64FastISel::emitAdd(MVT RetVT, const Value *LHS, const Value *RHS,
IsZExt);
}
-/// \brief This method is a wrapper to simplify add emission.
+/// This method is a wrapper to simplify add emission.
///
/// First try to emit an add with an immediate operand using emitAddSub_ri. If
/// that fails, then try to materialize the immediate into a register and use
@@ -2253,7 +2254,7 @@ static AArch64CC::CondCode getCompareCC(CmpInst::Predicate Pred) {
}
}
-/// \brief Try to emit a combined compare-and-branch instruction.
+/// Try to emit a combined compare-and-branch instruction.
bool AArch64FastISel::emitCompareAndBranch(const BranchInst *BI) {
assert(isa<CmpInst>(BI->getCondition()) && "Expected cmp instruction");
const CmpInst *CI = cast<CmpInst>(BI->getCondition());
@@ -2606,7 +2607,7 @@ bool AArch64FastISel::selectCmp(const Instruction *I) {
return true;
}
-/// \brief Optimize selects of i1 if one of the operands has a 'true' or 'false'
+/// Optimize selects of i1 if one of the operands has a 'true' or 'false'
/// value.
bool AArch64FastISel::optimizeSelect(const SelectInst *SI) {
if (!SI->getType()->isIntegerTy(1))
@@ -3321,7 +3322,7 @@ bool AArch64FastISel::tryEmitSmallMemCpy(Address Dest, Address Src,
return true;
}
-/// \brief Check if it is possible to fold the condition from the XALU intrinsic
+/// Check if it is possible to fold the condition from the XALU intrinsic
/// into the user. The condition code will only be updated on success.
bool AArch64FastISel::foldXALUIntrinsic(AArch64CC::CondCode &CC,
const Instruction *I,
@@ -3456,7 +3457,8 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
// Small memcpy's are common enough that we want to do them without a call
// if possible.
uint64_t Len = cast<ConstantInt>(MTI->getLength())->getZExtValue();
- unsigned Alignment = MTI->getAlignment();
+ unsigned Alignment = MinAlign(MTI->getDestAlignment(),
+ MTI->getSourceAlignment());
if (isMemCpySmall(Len, Alignment)) {
Address Dest, Src;
if (!computeAddress(MTI->getRawDest(), Dest) ||
@@ -3476,7 +3478,7 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
return false;
const char *IntrMemName = isa<MemCpyInst>(II) ? "memcpy" : "memmove";
- return lowerCallTo(II, IntrMemName, II->getNumArgOperands() - 2);
+ return lowerCallTo(II, IntrMemName, II->getNumArgOperands() - 1);
}
case Intrinsic::memset: {
const MemSetInst *MSI = cast<MemSetInst>(II);
@@ -3492,7 +3494,7 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
// address spaces.
return false;
- return lowerCallTo(II, "memset", II->getNumArgOperands() - 2);
+ return lowerCallTo(II, "memset", II->getNumArgOperands() - 1);
}
case Intrinsic::sin:
case Intrinsic::cos:
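
In the FastISel memcpy/memmove/memset handling above, the IR intrinsics no longer carry a separate alignment operand (alignment now comes from the per-pointer accessors), so the inline expansion takes the minimum of the destination and source alignments, and the libcall path drops only the trailing volatile flag. A trivial sketch of the conservative alignment choice (illustrative; LLVM's MinAlign computes the same result for power-of-two alignments):

    // The inline copy can only be widened to the smaller of the two
    // guarantees, e.g. dest aligned to 8 and source aligned to 2 still has to
    // be expanded with 2-byte-aligned accesses.
    static unsigned conservativeCopyAlign(unsigned DestAlign, unsigned SrcAlign) {
      return DestAlign < SrcAlign ? DestAlign : SrcAlign;
    }
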
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index d66f7b59a4b5..6dc5d19862a9 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -140,8 +140,19 @@ static cl::opt<bool> EnableRedZone("aarch64-redzone",
cl::desc("enable use of redzone on AArch64"),
cl::init(false), cl::Hidden);
+static cl::opt<bool>
+ ReverseCSRRestoreSeq("reverse-csr-restore-seq",
+ cl::desc("reverse the CSR restore sequence"),
+ cl::init(false), cl::Hidden);
+
STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
+/// This is the biggest offset to the stack pointer we can encode in aarch64
+/// instructions (without using a separate calculation and a temp register).
+/// Note that the exception here are vector stores/loads which cannot encode any
+/// displacements (see estimateRSStackSizeLimit(), isAArch64FrameOffsetLegal()).
+static const unsigned DefaultSafeSPDisplacement = 255;
+
/// Look at each instruction that references stack frames and return the stack
/// size limit beyond which some of these instructions will require a scratch
/// register during their expansion later.
@@ -151,7 +162,7 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
// realistically that's not a big deal at this stage of the game.
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
- if (MI.isDebugValue() || MI.isPseudo() ||
+ if (MI.isDebugInstr() || MI.isPseudo() ||
MI.getOpcode() == AArch64::ADDXri ||
MI.getOpcode() == AArch64::ADDSXri)
continue;
@@ -167,7 +178,7 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
}
}
}
- return 255;
+ return DefaultSafeSPDisplacement;
}
bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
@@ -191,11 +202,25 @@ bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
// Retain behavior of always omitting the FP for leaf functions when possible.
- return (MFI.hasCalls() &&
- MF.getTarget().Options.DisableFramePointerElim(MF)) ||
- MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
- MFI.hasStackMap() || MFI.hasPatchPoint() ||
- RegInfo->needsStackRealignment(MF);
+ if (MFI.hasCalls() && MF.getTarget().Options.DisableFramePointerElim(MF))
+ return true;
+ if (MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
+ MFI.hasStackMap() || MFI.hasPatchPoint() ||
+ RegInfo->needsStackRealignment(MF))
+ return true;
+ // With large callframes around we may need to use FP to access the scavenging
+ // emergency spillslot.
+ //
+ // Unfortunately some calls to hasFP() like machine verifier ->
+ // getReservedReg() -> hasFP in the middle of global isel are too early
+ // to know the max call frame size. Hopefully conservatively returning "true"
+ // in those cases is fine.
+ // DefaultSafeSPDisplacement is fine as we only emergency spill GP regs.
+ if (!MFI.isMaxCallFrameSizeComputed() ||
+ MFI.getMaxCallFrameSize() > DefaultSafeSPDisplacement)
+ return true;
+
+ return false;
}
/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
@@ -349,7 +374,8 @@ static bool windowsRequiresStackProbe(MachineFunction &MF,
F.getFnAttribute("stack-probe-size")
.getValueAsString()
.getAsInteger(0, StackProbeSize);
- return StackSizeInBytes >= StackProbeSize;
+ return (StackSizeInBytes >= StackProbeSize) &&
+ !F.hasFnAttribute("no-stack-arg-probe");
}
bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
@@ -388,6 +414,14 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc) {
+ // Ignore instructions that do not operate on SP, i.e. shadow call stack
+ // instructions.
+ while (MBBI->getOpcode() == AArch64::STRXpost ||
+ MBBI->getOpcode() == AArch64::LDRXpre) {
+ assert(MBBI->getOperand(0).getReg() != AArch64::SP);
+ ++MBBI;
+ }
+
unsigned NewOpc;
bool NewIsUnscaled = false;
switch (MBBI->getOpcode()) {
@@ -455,6 +489,14 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
unsigned LocalStackSize) {
unsigned Opc = MI.getOpcode();
+
+ // Ignore instructions that do not operate on SP, i.e. shadow call stack
+ // instructions.
+ if (Opc == AArch64::STRXpost || Opc == AArch64::LDRXpre) {
+ assert(MI.getOperand(0).getReg() != AArch64::SP);
+ return;
+ }
+
(void)Opc;
assert((Opc == AArch64::STPXi || Opc == AArch64::STPDi ||
Opc == AArch64::STRXui || Opc == AArch64::STRDui ||
@@ -472,6 +514,38 @@ static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / 8);
}
+static void adaptForLdStOpt(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator FirstSPPopI,
+ MachineBasicBlock::iterator LastPopI) {
+ // Sometimes (when we restore in the same order as we save), we can end up
+ // with code like this:
+ //
+ // ldp x26, x25, [sp]
+ // ldp x24, x23, [sp, #16]
+ // ldp x22, x21, [sp, #32]
+ // ldp x20, x19, [sp, #48]
+ // add sp, sp, #64
+ //
+ // In this case, it is always better to put the first ldp at the end, so
+ // that the load-store optimizer can run and merge the ldp and the add into
+ // a post-index ldp.
+ // If we managed to grab the first pop instruction, move it to the end.
+ if (ReverseCSRRestoreSeq)
+ MBB.splice(FirstSPPopI, &MBB, LastPopI);
+ // We should end up with something like this now:
+ //
+ // ldp x24, x23, [sp, #16]
+ // ldp x22, x21, [sp, #32]
+ // ldp x20, x19, [sp, #48]
+ // ldp x26, x25, [sp]
+ // add sp, sp, #64
+ //
+ // and the load-store optimizer can merge the last two instructions into:
+ //
+ // ldp x26, x25, [sp], #64
+ //
+}
+
void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator MBBI = MBB.begin();
@@ -485,6 +559,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
bool needsFrameMoves = MMI.hasDebugInfo() || F.needsUnwindTableEntry();
bool HasFP = hasFP(MF);
+ // At this point, we're going to decide whether or not the function uses a
+ // redzone. In most cases, the function doesn't have a redzone so let's
+ // assume that's false and set it to true in the case that there's a redzone.
+ AFI->setHasRedZone(false);
+
// Debug location must be unknown since the first debug location is used
// to determine the end of the prologue.
DebugLoc DL;
@@ -505,9 +584,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
return;
// REDZONE: If the stack size is less than 128 bytes, we don't need
// to actually allocate.
- if (canUseRedZone(MF))
+ if (canUseRedZone(MF)) {
+ AFI->setHasRedZone(true);
++NumRedZoneFunctions;
- else {
+ } else {
emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
MachineInstr::FrameSetup);
@@ -823,14 +903,32 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
+ uint64_t AfterCSRPopSize = ArgumentPopSize;
auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
-
- if (!CombineSPBump && PrologueSaveSize != 0)
- convertCalleeSaveRestoreToSPPrePostIncDec(
- MBB, std::prev(MBB.getFirstTerminator()), DL, TII, PrologueSaveSize);
+ // Assume we can't combine the last pop with the sp restore.
+
+ if (!CombineSPBump && PrologueSaveSize != 0) {
+ MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
+ // Converting the last ldp to a post-index ldp is valid only if the last
+ // ldp's offset is 0.
+ const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1);
+ // If the offset is 0, convert it to a post-index ldp.
+ if (OffsetOp.getImm() == 0) {
+ convertCalleeSaveRestoreToSPPrePostIncDec(MBB, Pop, DL, TII,
+ PrologueSaveSize);
+ } else {
+ // If not, make sure to emit an add after the last ldp.
+ // We're doing this by transferring the size to be restored from the
+ // adjustment *before* the CSR pops to the adjustment *after* the CSR
+ // pops.
+ AfterCSRPopSize += PrologueSaveSize;
+ }
+ }
// Move past the restores of the callee-saved registers.
+ // If we plan on combining the sp bump of the local stack size and the callee
+ // save stack size, we might need to adjust the CSR save and restore offsets.
MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
MachineBasicBlock::iterator Begin = MBB.begin();
while (LastPopI != Begin) {
@@ -845,7 +943,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// If there is a single SP update, insert it before the ret and we're done.
if (CombineSPBump) {
emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
- NumBytes + ArgumentPopSize, TII,
+ NumBytes + AfterCSRPopSize, TII,
MachineInstr::FrameDestroy);
return;
}
@@ -857,19 +955,27 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
bool RedZone = canUseRedZone(MF);
// If this was a redzone leaf function, we don't need to restore the
// stack pointer (but we may need to pop stack args for fastcc).
- if (RedZone && ArgumentPopSize == 0)
+ if (RedZone && AfterCSRPopSize == 0)
return;
bool NoCalleeSaveRestore = PrologueSaveSize == 0;
int StackRestoreBytes = RedZone ? 0 : NumBytes;
if (NoCalleeSaveRestore)
- StackRestoreBytes += ArgumentPopSize;
- emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
- StackRestoreBytes, TII, MachineInstr::FrameDestroy);
+ StackRestoreBytes += AfterCSRPopSize;
+
// If we were able to combine the local stack pop with the argument pop,
// then we're done.
- if (NoCalleeSaveRestore || ArgumentPopSize == 0)
+ bool Done = NoCalleeSaveRestore || AfterCSRPopSize == 0;
+
+ // If we're done after this, make sure to help the load store optimizer.
+ if (Done)
+ adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI);
+
+ emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
+ StackRestoreBytes, TII, MachineInstr::FrameDestroy);
+ if (Done)
return;
+
NumBytes = 0;
}
@@ -888,9 +994,24 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// This must be placed after the callee-save restore code because that code
// assumes the SP is at the same location as it was after the callee-save save
// code in the prologue.
- if (ArgumentPopSize)
- emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
- ArgumentPopSize, TII, MachineInstr::FrameDestroy);
+ if (AfterCSRPopSize) {
+ // Find an insertion point for the first ldp so that it goes before the
+ // shadow call stack epilog instruction. This ensures that the restore of
+ // lr from x18 is placed after the restore from sp.
+ auto FirstSPPopI = MBB.getFirstTerminator();
+ while (FirstSPPopI != Begin) {
+ auto Prev = std::prev(FirstSPPopI);
+ if (Prev->getOpcode() != AArch64::LDRXpre ||
+ Prev->getOperand(0).getReg() == AArch64::SP)
+ break;
+ FirstSPPopI = Prev;
+ }
+
+ adaptForLdStOpt(MBB, FirstSPPopI, LastPopI);
+
+ emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP,
+ AfterCSRPopSize, TII, MachineInstr::FrameDestroy);
+ }
}
/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
@@ -917,6 +1038,8 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
int FPOffset = MFI.getObjectOffset(FI) + FixedObject + 16;
int Offset = MFI.getObjectOffset(FI) + MFI.getStackSize();
bool isFixed = MFI.isFixedObjectIndex(FI);
+ bool isCSR = !isFixed && MFI.getObjectOffset(FI) >=
+ -((int)AFI->getCalleeSavedStackSize());
// Use frame pointer to reference fixed objects. Use it for locals if
// there are VLAs or a dynamically realigned SP (and thus the SP isn't
@@ -930,26 +1053,48 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
// Argument access should always use the FP.
if (isFixed) {
UseFP = hasFP(MF);
- } else if (hasFP(MF) && !RegInfo->hasBasePointer(MF) &&
- !RegInfo->needsStackRealignment(MF)) {
- // Use SP or FP, whichever gives us the best chance of the offset
- // being in range for direct access. If the FPOffset is positive,
- // that'll always be best, as the SP will be even further away.
+ } else if (isCSR && RegInfo->needsStackRealignment(MF)) {
+ // References to the CSR area must use FP if we're re-aligning the stack
+ // since the dynamically-sized alignment padding is between the SP/BP and
+ // the CSR area.
+ assert(hasFP(MF) && "Re-aligned stack must have frame pointer");
+ UseFP = true;
+ } else if (hasFP(MF) && !RegInfo->needsStackRealignment(MF)) {
// If the FPOffset is negative, we have to keep in mind that the
// available offset range for negative offsets is smaller than for
- // positive ones. If we have variable sized objects, we're stuck with
- // using the FP regardless, though, as the SP offset is unknown
- // and we don't have a base pointer available. If an offset is
+ // positive ones. If an offset is
// available via the FP and the SP, use whichever is closest.
- if (PreferFP || MFI.hasVarSizedObjects() || FPOffset >= 0 ||
- (FPOffset >= -256 && Offset > -FPOffset))
+ bool FPOffsetFits = FPOffset >= -256;
+ PreferFP |= Offset > -FPOffset;
+
+ if (MFI.hasVarSizedObjects()) {
+ // If we have variable sized objects, we can use either FP or BP, as the
+ // SP offset is unknown. We can use the base pointer if we have one and
+ // FP is not preferred. If not, we're stuck with using FP.
+ bool CanUseBP = RegInfo->hasBasePointer(MF);
+ if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best.
+ UseFP = PreferFP;
+ else if (!CanUseBP) // Can't use BP. Forced to use FP.
+ UseFP = true;
+ // else we can use BP and FP, but the offset from FP won't fit.
+ // That will make us scavenge registers which we can probably avoid by
+ // using BP. If it won't fit for BP either, we'll scavenge anyway.
+ } else if (FPOffset >= 0) {
+ // Use SP or FP, whichever gives us the best chance of the offset
+ // being in range for direct access. If the FPOffset is positive,
+ // that'll always be best, as the SP will be even further away.
UseFP = true;
+ } else {
+ // We have the choice between FP and (SP or BP).
+ if (FPOffsetFits && PreferFP) // If FP is the best fit, use it.
+ UseFP = true;
+ }
}
}
- assert((isFixed || !RegInfo->needsStackRealignment(MF) || !UseFP) &&
+ assert(((isFixed || isCSR) || !RegInfo->needsStackRealignment(MF) || !UseFP) &&
"In the presence of dynamic stack pointer realignment, "
- "non-argument objects cannot be accessed through the frame pointer");
+ "non-argument/CSR objects cannot be accessed through the frame pointer");
if (UseFP) {
FrameReg = RegInfo->getFrameRegister(MF);
@@ -960,6 +1105,8 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
if (RegInfo->hasBasePointer(MF))
FrameReg = RegInfo->getBaseRegister();
else {
+ assert(!MFI.hasVarSizedObjects() &&
+ "Can't use SP when we have var sized objects.");
FrameReg = AArch64::SP;
// If we're using the red zone for this function, the SP won't actually
// be adjusted, so the offsets will be negative. They're also all
@@ -1007,7 +1154,8 @@ struct RegPairInfo {
static void computeCalleeSaveRegisterPairs(
MachineFunction &MF, const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs) {
+ const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
+ bool &NeedShadowCallStackProlog) {
if (CSI.empty())
return;
@@ -1041,6 +1189,15 @@ static void computeCalleeSaveRegisterPairs(
RPI.Reg2 = NextReg;
}
+ // If either of the registers to be saved is the lr register, it means that
+ // we also need to save lr in the shadow call stack.
+ if ((RPI.Reg1 == AArch64::LR || RPI.Reg2 == AArch64::LR) &&
+ MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)) {
+ if (!MF.getSubtarget<AArch64Subtarget>().isX18Reserved())
+ report_fatal_error("Must reserve x18 to use shadow call stack");
+ NeedShadowCallStackProlog = true;
+ }
+
// GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
// list to come in sorted by frame index so that we can issue the store
// pair instructions directly. Assert if we see anything otherwise.
@@ -1091,9 +1248,24 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
DebugLoc DL;
SmallVector<RegPairInfo, 8> RegPairs;
- computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs);
+ bool NeedShadowCallStackProlog = false;
+ computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
+ NeedShadowCallStackProlog);
const MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (NeedShadowCallStackProlog) {
+ // Shadow call stack prolog: str x30, [x18], #8
+ BuildMI(MBB, MI, DL, TII.get(AArch64::STRXpost))
+ .addReg(AArch64::X18, RegState::Define)
+ .addReg(AArch64::LR)
+ .addReg(AArch64::X18)
+ .addImm(8)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // This instruction also makes x18 live-in to the entry block.
+ MBB.addLiveIn(AArch64::X18);
+ }
+
for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE;
++RPII) {
RegPairInfo RPI = *RPII;
@@ -1115,13 +1287,11 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui;
else
StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui;
- DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
- if (RPI.isPaired())
- dbgs() << ", " << printReg(Reg2, TRI);
- dbgs() << ") -> fi#(" << RPI.FrameIdx;
- if (RPI.isPaired())
- dbgs() << ", " << RPI.FrameIdx+1;
- dbgs() << ")\n");
+ LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
+ if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
+ dbgs() << ") -> fi#(" << RPI.FrameIdx;
+ if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
+ dbgs() << ")\n");
MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
if (!MRI.isReserved(Reg1))
@@ -1157,11 +1327,11 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
if (MI != MBB.end())
DL = MI->getDebugLoc();
- computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs);
+ bool NeedShadowCallStackProlog = false;
+ computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
+ NeedShadowCallStackProlog);
- for (auto RPII = RegPairs.begin(), RPIE = RegPairs.end(); RPII != RPIE;
- ++RPII) {
- RegPairInfo RPI = *RPII;
+ auto EmitMI = [&](const RegPairInfo &RPI) {
unsigned Reg1 = RPI.Reg1;
unsigned Reg2 = RPI.Reg2;
@@ -1178,13 +1348,11 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui;
else
LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui;
- DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
- if (RPI.isPaired())
- dbgs() << ", " << printReg(Reg2, TRI);
- dbgs() << ") -> fi#(" << RPI.FrameIdx;
- if (RPI.isPaired())
- dbgs() << ", " << RPI.FrameIdx+1;
- dbgs() << ")\n");
+ LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
+ if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
+ dbgs() << ") -> fi#(" << RPI.FrameIdx;
+ if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
+ dbgs() << ")\n");
MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc));
if (RPI.isPaired()) {
@@ -1200,7 +1368,25 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx),
MachineMemOperand::MOLoad, 8, 8));
+ };
+
+ if (ReverseCSRRestoreSeq)
+ for (const RegPairInfo &RPI : reverse(RegPairs))
+ EmitMI(RPI);
+ else
+ for (const RegPairInfo &RPI : RegPairs)
+ EmitMI(RPI);
+
+ if (NeedShadowCallStackProlog) {
+ // Shadow call stack epilog: ldr x30, [x18, #-8]!
+ BuildMI(MBB, MI, DL, TII.get(AArch64::LDRXpre))
+ .addReg(AArch64::X18, RegState::Define)
+ .addReg(AArch64::LR, RegState::Define)
+ .addReg(AArch64::X18)
+ .addImm(-8)
+ .setMIFlag(MachineInstr::FrameDestroy);
}
+
return true;
}
@@ -1275,10 +1461,11 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
}
}
- DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:";
- for (unsigned Reg : SavedRegs.set_bits())
- dbgs() << ' ' << printReg(Reg, RegInfo);
- dbgs() << "\n";);
+ LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:";
+            for (unsigned Reg : SavedRegs.set_bits())
+              dbgs() << ' ' << printReg(Reg, RegInfo);
+            dbgs() << "\n";);
// If any callee-saved registers are used, the frame cannot be eliminated.
unsigned NumRegsSpilled = SavedRegs.count();
@@ -1287,7 +1474,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
// The CSR spill slots have not been allocated yet, so estimateStackSize
// won't include them.
unsigned CFSize = MFI.estimateStackSize(MF) + 8 * NumRegsSpilled;
- DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n");
+ LLVM_DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n");
unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);
bool BigStack = (CFSize > EstimatedStackSizeLimit);
if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
@@ -1301,8 +1488,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
// here.
if (BigStack) {
if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) {
- DEBUG(dbgs() << "Spilling " << printReg(UnspilledCSGPR, RegInfo)
- << " to get a scratch register.\n");
+ LLVM_DEBUG(dbgs() << "Spilling " << printReg(UnspilledCSGPR, RegInfo)
+ << " to get a scratch register.\n");
SavedRegs.set(UnspilledCSGPR);
// MachO's compact unwind format relies on all registers being stored in
// pairs, so if we need to spill one extra for BigStack, then we need to
@@ -1322,8 +1509,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
unsigned Align = TRI->getSpillAlignment(RC);
int FI = MFI.CreateStackObject(Size, Align, false);
RS->addScavengingFrameIndex(FI);
- DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
- << " as the emergency spill slot.\n");
+ LLVM_DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
+ << " as the emergency spill slot.\n");
}
}
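
Among the frame-lowering changes above, the resolveFrameIndexReference hunk replaces the single FP-vs-SP heuristic with an explicit choice between FP, BP and SP. A condensed restatement of that decision (inputs are flattened into plain values and the names are illustrative; this is a sketch of the hunk's logic, not the function itself):

    #include <cstdio>

    enum class Base { FP, BP, SP };

    // Which register should a frame-index reference be based on?
    static Base chooseBase(bool IsFixed, bool IsCSR, bool HasFP,
                           bool NeedsRealign, bool HasVarSized, bool HasBP,
                           int FPOffset, int SPOffset, bool PreferFP) {
      if (IsFixed) // incoming arguments: FP if we have one
        return HasFP ? Base::FP : (HasBP ? Base::BP : Base::SP);
      if (IsCSR && NeedsRealign) // realignment padding sits between SP/BP and
        return Base::FP;         // the CSR area, so only FP reaches it
      if (HasFP && !NeedsRealign) {
        bool FPOffsetFits = FPOffset >= -256; // unscaled negative-offset reach
        PreferFP |= SPOffset > -FPOffset;     // FP is closer than SP
        if (HasVarSized) {                    // SP offset unknown: FP or BP
          if (FPOffsetFits && HasBP)
            return PreferFP ? Base::FP : Base::BP;
          return HasBP ? Base::BP : Base::FP;
        }
        if (FPOffset >= 0 || (FPOffsetFits && PreferFP))
          return Base::FP;
      }
      return HasBP ? Base::BP : Base::SP;
    }

    int main() {
      // A local 40 bytes above SP and 128 bytes below FP in a frame with
      // var-sized objects and a base pointer: both FP and BP reach it, FP is
      // not closer, so BP is chosen.
      Base B = chooseBase(false, false, true, false, true, true, -128, 40, false);
      std::printf("%s\n", B == Base::FP ? "FP" : B == Base::BP ? "BP" : "SP");
      return 0;
    }
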
diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h
index 55a256867fab..104e52b5f1f3 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/lib/Target/AArch64/AArch64FrameLowering.h
@@ -53,7 +53,7 @@ public:
std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo *TRI) const override;
- /// \brief Can this function use the red zone for local allocations.
+ /// Can this function use the red zone for local allocations.
bool canUseRedZone(const MachineFunction &MF) const;
bool hasFP(const MachineFunction &MF) const override;
diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 0b10246b0cc8..c1a9ee333b62 100644
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -168,6 +168,7 @@ public:
bool tryBitfieldExtractOpFromSExt(SDNode *N);
bool tryBitfieldInsertOp(SDNode *N);
bool tryBitfieldInsertInZeroOp(SDNode *N);
+ bool tryShiftAmountMod(SDNode *N);
bool tryReadRegister(SDNode *N);
bool tryWriteRegister(SDNode *N);
@@ -336,7 +337,7 @@ static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) {
}
}
-/// \brief Determine whether it is worth it to fold SHL into the addressing
+/// Determine whether it is worth it to fold SHL into the addressing
/// mode.
static bool isWorthFoldingSHL(SDValue V) {
assert(V.getOpcode() == ISD::SHL && "invalid opcode");
@@ -360,7 +361,7 @@ static bool isWorthFoldingSHL(SDValue V) {
return true;
}
-/// \brief Determine whether it is worth to fold V into an extended register.
+/// Determine whether it is worth to fold V into an extended register.
bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
// Trivial if we are optimizing for code size or if there is only
// one use of the value.
@@ -743,14 +744,16 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
if (!GAN)
return true;
- const GlobalValue *GV = GAN->getGlobal();
- unsigned Alignment = GV->getAlignment();
- Type *Ty = GV->getValueType();
- if (Alignment == 0 && Ty->isSized())
- Alignment = DL.getABITypeAlignment(Ty);
+ if (GAN->getOffset() % Size == 0) {
+ const GlobalValue *GV = GAN->getGlobal();
+ unsigned Alignment = GV->getAlignment();
+ Type *Ty = GV->getValueType();
+ if (Alignment == 0 && Ty->isSized())
+ Alignment = DL.getABITypeAlignment(Ty);
- if (Alignment >= Size)
- return true;
+ if (Alignment >= Size)
+ return true;
+ }
}
if (CurDAG->isBaseWithConstantOffset(N)) {
@@ -824,7 +827,7 @@ static SDValue Widen(SelectionDAG *CurDAG, SDValue N) {
return SDValue(Node, 0);
}
-/// \brief Check if the given SHL node (\p N), can be used to form an
+/// Check if the given SHL node (\p N), can be used to form an
/// extended register for an addressing mode.
bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size,
bool WantExtend, SDValue &Offset,
@@ -1512,7 +1515,7 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
// Because of simplify-demanded-bits in DAGCombine, the mask may have been
// simplified. Try to undo that
- AndImm |= (1 << NumberOfIgnoredLowBits) - 1;
+ AndImm |= maskTrailingOnes<uint64_t>(NumberOfIgnoredLowBits);
// The immediate is a mask of the low bits iff imm & (imm+1) == 0
if (AndImm & (AndImm + 1))
@@ -1551,8 +1554,9 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
// Bail out on large immediates. This happens when no proper
// combining/constant folding was performed.
if (!BiggerPattern && (SrlImm <= 0 || SrlImm >= VT.getSizeInBits())) {
- DEBUG((dbgs() << N
- << ": Found large shift immediate, this should not happen\n"));
+ LLVM_DEBUG(
+ (dbgs() << N
+ << ": Found large shift immediate, this should not happen\n"));
return false;
}
@@ -1681,7 +1685,7 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
// later find more redundancy.
Opd0 = N->getOperand(0).getOperand(0);
TruncBits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits();
- VT = Opd0->getValueType(0);
+ VT = Opd0.getValueType();
assert(VT == MVT::i64 && "the promoted type should be i64");
} else if (BiggerPattern) {
// Let's pretend a 0 shift left has been performed.
@@ -1694,8 +1698,9 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
// Missing combines/constant folding may have left us with strange
// constants.
if (ShlImm >= VT.getSizeInBits()) {
- DEBUG((dbgs() << N
- << ": Found large shift immediate, this should not happen\n"));
+ LLVM_DEBUG(
+ (dbgs() << N
+ << ": Found large shift immediate, this should not happen\n"));
return false;
}
@@ -2301,7 +2306,7 @@ static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits,
continue;
// Check the second part of the pattern
- EVT VT = OrOpd1->getValueType(0);
+ EVT VT = OrOpd1Val.getValueType();
assert((VT == MVT::i32 || VT == MVT::i64) && "unexpected OR operand");
// Compute the Known Zero for the candidate of the first operand.
@@ -2437,6 +2442,111 @@ bool AArch64DAGToDAGISel::tryBitfieldInsertInZeroOp(SDNode *N) {
return true;
}
+/// tryShiftAmountMod - Take advantage of built-in mod of shift amount in
+/// variable shift/rotate instructions.
+bool AArch64DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
+ EVT VT = N->getValueType(0);
+
+ unsigned Opc;
+ switch (N->getOpcode()) {
+ case ISD::ROTR:
+ Opc = (VT == MVT::i32) ? AArch64::RORVWr : AArch64::RORVXr;
+ break;
+ case ISD::SHL:
+ Opc = (VT == MVT::i32) ? AArch64::LSLVWr : AArch64::LSLVXr;
+ break;
+ case ISD::SRL:
+ Opc = (VT == MVT::i32) ? AArch64::LSRVWr : AArch64::LSRVXr;
+ break;
+ case ISD::SRA:
+ Opc = (VT == MVT::i32) ? AArch64::ASRVWr : AArch64::ASRVXr;
+ break;
+ default:
+ return false;
+ }
+
+ uint64_t Size;
+ uint64_t Bits;
+ if (VT == MVT::i32) {
+ Bits = 5;
+ Size = 32;
+ } else if (VT == MVT::i64) {
+ Bits = 6;
+ Size = 64;
+ } else
+ return false;
+
+ SDValue ShiftAmt = N->getOperand(1);
+ SDLoc DL(N);
+ SDValue NewShiftAmt;
+
+ // Skip over an extend of the shift amount.
+ if (ShiftAmt->getOpcode() == ISD::ZERO_EXTEND ||
+ ShiftAmt->getOpcode() == ISD::ANY_EXTEND)
+ ShiftAmt = ShiftAmt->getOperand(0);
+
+ if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) {
+ SDValue Add0 = ShiftAmt->getOperand(0);
+ SDValue Add1 = ShiftAmt->getOperand(1);
+ uint64_t Add0Imm;
+ uint64_t Add1Imm;
+ // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X
+ // to avoid the ADD/SUB.
+ if (isIntImmediate(Add1, Add1Imm) && (Add1Imm % Size == 0))
+ NewShiftAmt = Add0;
+ // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to
+ // generate a NEG instead of a SUB of a constant.
+ else if (ShiftAmt->getOpcode() == ISD::SUB &&
+ isIntImmediate(Add0, Add0Imm) && Add0Imm != 0 &&
+ (Add0Imm % Size == 0)) {
+ unsigned NegOpc;
+ unsigned ZeroReg;
+ EVT SubVT = ShiftAmt->getValueType(0);
+ if (SubVT == MVT::i32) {
+ NegOpc = AArch64::SUBWrr;
+ ZeroReg = AArch64::WZR;
+ } else {
+ assert(SubVT == MVT::i64);
+ NegOpc = AArch64::SUBXrr;
+ ZeroReg = AArch64::XZR;
+ }
+ SDValue Zero =
+ CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, ZeroReg, SubVT);
+ MachineSDNode *Neg =
+ CurDAG->getMachineNode(NegOpc, DL, SubVT, Zero, Add1);
+ NewShiftAmt = SDValue(Neg, 0);
+ } else
+ return false;
+ } else {
+ // If the shift amount is masked with an AND, check that the mask covers the
+ // bits that are implicitly ANDed off by the above opcodes and if so, skip
+ // the AND.
+ uint64_t MaskImm;
+ if (!isOpcWithIntImmediate(ShiftAmt.getNode(), ISD::AND, MaskImm))
+ return false;
+
+ if (countTrailingOnes(MaskImm) < Bits)
+ return false;
+
+ NewShiftAmt = ShiftAmt->getOperand(0);
+ }
+
+ // Narrow/widen the shift amount to match the size of the shift operation.
+ if (VT == MVT::i32)
+ NewShiftAmt = narrowIfNeeded(CurDAG, NewShiftAmt);
+ else if (VT == MVT::i64 && NewShiftAmt->getValueType(0) == MVT::i32) {
+ SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, DL, MVT::i32);
+ MachineSDNode *Ext = CurDAG->getMachineNode(
+ AArch64::SUBREG_TO_REG, DL, VT,
+ CurDAG->getTargetConstant(0, DL, MVT::i64), NewShiftAmt, SubReg);
+ NewShiftAmt = SDValue(Ext, 0);
+ }
+
+ SDValue Ops[] = {N->getOperand(0), NewShiftAmt};
+ CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+ return true;
+}
+
bool
AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos,
unsigned RegWidth) {
@@ -2653,14 +2763,9 @@ bool AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
}
void AArch64DAGToDAGISel::Select(SDNode *Node) {
- // Dump information about the Node being selected
- DEBUG(errs() << "Selecting: ");
- DEBUG(Node->dump(CurDAG));
- DEBUG(errs() << "\n");
-
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
- DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+ LLVM_DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
Node->setNodeId(-1);
return;
}
@@ -2708,6 +2813,11 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
return;
if (tryBitfieldInsertInZeroOp(Node))
return;
+ LLVM_FALLTHROUGH;
+ case ISD::ROTR:
+ case ISD::SHL:
+ if (tryShiftAmountMod(Node))
+ return;
break;
case ISD::SIGN_EXTEND:
@@ -2757,9 +2867,9 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
}
SDValue Extract = CurDAG->getTargetExtractSubreg(SubReg, SDLoc(Node), VT,
Node->getOperand(0));
- DEBUG(dbgs() << "ISEL: Custom selection!\n=> ");
- DEBUG(Extract->dumpr(CurDAG));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "ISEL: Custom selection!\n=> ");
+ LLVM_DEBUG(Extract->dumpr(CurDAG));
+ LLVM_DEBUG(dbgs() << "\n");
ReplaceNode(Node, Extract.getNode());
return;
}
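
The tryShiftAmountMod hook added above relies on the variable shift/rotate instructions (LSLV, LSRV, ASRV, RORV) reading only the low 6 bits (5 bits for 32-bit operations) of the amount register, so an explicit mask that covers those bits, an add or subtract of a multiple of the width, or a "width - x" amount can be dropped or turned into a NEG. A tiny standalone check of the identities being exploited (illustrative, not the selector code):

    #include <cassert>
    #include <cstdint>

    // Model of the hardware behaviour: the shift amount is taken modulo 64.
    static uint64_t shl64(uint64_t X, unsigned Amt) { return X << (Amt & 63); }

    int main() {
      uint64_t X = 0x1234;
      unsigned Y = 7;
      assert(shl64(X, Y & 63) == shl64(X, Y));      // the AND can be skipped
      assert(shl64(X, Y + 64) == shl64(X, Y));      // +/- multiples of 64 drop out
      assert(shl64(X, 64 - Y) == shl64(X, 0u - Y)); // 64 - Y becomes a NEG of Y
      return 0;
    }
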
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 6f7b2b6fd5b5..0c72f2ebee18 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -38,7 +38,6 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -70,6 +69,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -198,6 +198,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FREM, MVT::f64, Expand);
setOperationAction(ISD::FREM, MVT::f80, Expand);
+ setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
+
// Custom lowering hooks are needed for XOR
// to fold it into CSINC/CSINV.
setOperationAction(ISD::XOR, MVT::i32, Custom);
@@ -253,7 +255,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
// Variable-sized objects.
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
- setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+
+ if (Subtarget->isTargetWindows())
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
+ else
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
// Constant pool entries
setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
@@ -463,7 +469,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+ setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
+
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
// Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
// This requires the Performance Monitors extension.
@@ -567,9 +579,19 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
- MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
- MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
- MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;
+ setTargetDAGCombine(ISD::GlobalAddress);
+
+ // In case of strict alignment, avoid an excessive number of byte wide stores.
+ MaxStoresPerMemsetOptSize = 8;
+ MaxStoresPerMemset = Subtarget->requiresStrictAlign()
+ ? MaxStoresPerMemsetOptSize : 32;
+
+ MaxGluedStoresPerMemcpy = 4;
+ MaxStoresPerMemcpyOptSize = 4;
+ MaxStoresPerMemcpy = Subtarget->requiresStrictAlign()
+ ? MaxStoresPerMemcpyOptSize : 16;
+
+ MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4;
setStackPointerRegisterToSaveRestore(AArch64::SP);
@@ -632,16 +654,16 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
// AArch64 doesn't have a direct vector ->f32 conversion instructions for
// elements smaller than i32, so promote the input to i32 first.
- setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote);
- setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote);
- setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote);
- setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote);
+ setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
+ setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
+ setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
+ setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
// i8 and i16 vector elements also need promotion to i32 for v8i8 or v8i16
// -> v8f16 conversions.
- setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Promote);
- setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Promote);
- setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
- setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Promote);
+ setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
+ setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
+ setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
+ setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
// Similarly, there is no direct i32 -> f64 vector conversion instruction.
setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
@@ -691,9 +713,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
for (MVT VT : MVT::vector_valuetypes()) {
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
- setOperationAction(ISD::MULHS, VT, Expand);
+ if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
+ setOperationAction(ISD::MULHS, VT, Custom);
+ setOperationAction(ISD::MULHU, VT, Custom);
+ } else {
+ setOperationAction(ISD::MULHS, VT, Expand);
+ setOperationAction(ISD::MULHU, VT, Expand);
+ }
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
- setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
setOperationAction(ISD::BSWAP, VT, Expand);
@@ -715,24 +742,20 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FTRUNC, Ty, Legal);
setOperationAction(ISD::FROUND, Ty, Legal);
}
+
+ setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
}
PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
}
void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
- if (VT == MVT::v2f32 || VT == MVT::v4f16) {
- setOperationAction(ISD::LOAD, VT, Promote);
- AddPromotedToType(ISD::LOAD, VT, MVT::v2i32);
-
- setOperationAction(ISD::STORE, VT, Promote);
- AddPromotedToType(ISD::STORE, VT, MVT::v2i32);
- } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) {
- setOperationAction(ISD::LOAD, VT, Promote);
- AddPromotedToType(ISD::LOAD, VT, MVT::v2i64);
+ assert(VT.isVector() && "VT should be a vector type");
- setOperationAction(ISD::STORE, VT, Promote);
- AddPromotedToType(ISD::STORE, VT, MVT::v2i64);
+ if (VT.isFloatingPoint()) {
+ MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
+ setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
+ setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
}
// Mark vector float intrinsics as expand.
@@ -1431,7 +1454,8 @@ static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
static bool isLegalArithImmed(uint64_t C) {
// Matches AArch64DAGToDAGISel::SelectArithImmed().
bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
- DEBUG(dbgs() << "Is imm " << C << " legal: " << (IsLegal ? "yes\n" : "no\n"));
+ LLVM_DEBUG(dbgs() << "Is imm " << C
+ << " legal: " << (IsLegal ? "yes\n" : "no\n"));
return IsLegal;
}
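For reference, the predicate above accepts a 12-bit unsigned immediate, optionally shifted left by 12 bits (the AArch64 ADD/SUB immediate encoding). A minimal standalone sketch of the same check, for illustration only:

    #include <cassert>
    #include <cstdint>

    // Sketch of the arithmetic-immediate check above: a value is encodable if it
    // fits in the low 12 bits, or in bits [23:12] with the low 12 bits clear.
    static bool isLegalArithImmedSketch(uint64_t C) {
      return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
    }

    int main() {
      assert(isLegalArithImmedSketch(0xFFF));      // fits in 12 bits
      assert(isLegalArithImmedSketch(0xABC000));   // 12 bits shifted left by 12
      assert(!isLegalArithImmedSketch(0x1001));    // low bits set, needs > 12 bits
      assert(!isLegalArithImmedSketch(0x1000000)); // needs more than 24 bits
      return 0;
    }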
@@ -2474,6 +2498,26 @@ static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
return false;
}
+SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
+ SelectionDAG &DAG) const {
+ // The rounding mode is in bits 23:22 of the FPCR.
+ // The AArch64 rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
+ // The formula we use to implement this is ((FPCR + (1 << 22)) >> 22) & 3,
+ // so that the shift and the mask get folded into a single bitfield extract.
+ SDLoc dl(Op);
+
+ SDValue FPCR_64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i64,
+ DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl,
+ MVT::i64));
+ SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
+ SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
+ DAG.getConstant(1U << 22, dl, MVT::i32));
+ SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
+ DAG.getConstant(22, dl, MVT::i32));
+ return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
+ DAG.getConstant(3, dl, MVT::i32));
+}
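For illustration, a standalone sketch of the bit arithmetic above, assuming the rounding mode sits in bits 23:22 as the comment states:

    #include <cassert>
    #include <cstdint>

    // Map the two FPCR rounding-mode bits (23:22) to the FLT_ROUNDS encoding by
    // adding 1 at bit 22 before extracting, so the 3 -> 0 wrap falls out of the
    // modular addition.
    static uint32_t fltRounds(uint32_t fpcr) {
      return ((fpcr + (1u << 22)) >> 22) & 3;
    }

    int main() {
      assert(fltRounds(0u << 22) == 1); // RN -> to nearest
      assert(fltRounds(1u << 22) == 2); // RP -> towards +inf
      assert(fltRounds(2u << 22) == 3); // RM -> towards -inf
      assert(fltRounds(3u << 22) == 0); // RZ -> towards zero
      return 0;
    }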
+
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
// Multiplications are only custom-lowered for 128-bit vectors so that
// VMULL can be detected. Otherwise v2i64 multiplications are not legal.
@@ -2543,6 +2587,66 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
}
+// Lower vector multiply high (ISD::MULHS and ISD::MULHU).
+static SDValue LowerMULH(SDValue Op, SelectionDAG &DAG) {
+ // Multiplications are only custom-lowered for 128-bit vectors so that
+ // {S,U}MULL{2} can be detected. Otherwise v2i64 multiplications are not
+ // legal.
+ EVT VT = Op.getValueType();
+ assert(VT.is128BitVector() && VT.isInteger() &&
+ "unexpected type for custom-lowering ISD::MULH{U,S}");
+
+ SDValue V0 = Op.getOperand(0);
+ SDValue V1 = Op.getOperand(1);
+
+ SDLoc DL(Op);
+
+ EVT ExtractVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
+
+ // We turn (V0 mulhs/mulhu V1) to:
+ //
+ // (uzp2 (smull (extract_subvector (ExtractVT V128:V0, (i64 0)),
+ // (extract_subvector (ExtractVT V128:V1, (i64 0))))),
+ // (smull (extract_subvector (ExtractVT V128:V0, (i64 VMull2Idx)),
+ // (extract_subvector (ExtractVT V128:V1, (i64 VMull2Idx))))))
+ //
+ // Where ExtractVT is a subvector type with half the number of elements, and
+ // VMull2Idx is the index of the middle element (the start of the high half).
+ //
+ // The vector high-part extract and multiply will be matched against
+ // {S,U}MULL{v16i8_v8i16,v8i16_v4i32,v4i32_v2i64}, which in turn will
+ // issue a {s,u}mull2 instruction.
+ //
+ // This basically multiplies the low subvector with '{s,u}mull', the high
+ // subvector with '{s,u}mull2', and then shuffles the high halves of both
+ // results into the final vector.
+ unsigned Mull2VectorIdx = VT.getVectorNumElements() / 2;
+ SDValue VMullIdx = DAG.getConstant(0, DL, MVT::i64);
+ SDValue VMull2Idx = DAG.getConstant(Mull2VectorIdx, DL, MVT::i64);
+
+ SDValue VMullV0 =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V0, VMullIdx);
+ SDValue VMullV1 =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V1, VMullIdx);
+
+ SDValue VMull2V0 =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V0, VMull2Idx);
+ SDValue VMull2V1 =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V1, VMull2Idx);
+
+ unsigned MullOpc = Op.getOpcode() == ISD::MULHS ? AArch64ISD::SMULL
+ : AArch64ISD::UMULL;
+
+ EVT MullVT = ExtractVT.widenIntegerVectorElementType(*DAG.getContext());
+ SDValue Mull = DAG.getNode(MullOpc, DL, MullVT, VMullV0, VMullV1);
+ SDValue Mull2 = DAG.getNode(MullOpc, DL, MullVT, VMull2V0, VMull2V1);
+
+ Mull = DAG.getNode(ISD::BITCAST, DL, VT, Mull);
+ Mull2 = DAG.getNode(ISD::BITCAST, DL, VT, Mull2);
+
+ return DAG.getNode(AArch64ISD::UZP2, DL, VT, Mull, Mull2);
+}
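As a per-lane illustration of why this works, assuming i16 lanes: the widening multiplies ({s,u}mull / {s,u}mull2) produce 32-bit products, and uzp2 keeps their high 16-bit halves, which is exactly mulhs/mulhu:

    #include <cassert>
    #include <cstdint>

    // Scalar model of one lane of ISD::MULHS on v8i16: widen, multiply, and keep
    // the high 16 bits, i.e. the half that uzp2 selects from the 32-bit products.
    static int16_t mulhsLane(int16_t a, int16_t b) {
      int32_t wide = int32_t(a) * int32_t(b); // what smull/smull2 compute
      return int16_t(wide >> 16);             // what uzp2 extracts
    }

    int main() {
      assert(mulhsLane(0x4000, 0x4000) == 0x1000);
      assert(mulhsLane(0x7FFF, 0x7FFF) == 0x3FFE);
      assert(mulhsLane(-1, 1) == -1); // high half of -1 is all ones
      return 0;
    }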
+
SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
@@ -2571,10 +2675,72 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
}
+// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
+static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
+ EVT VT, EVT MemVT,
+ SelectionDAG &DAG) {
+ assert(VT.isVector() && "VT should be a vector type");
+ assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
+
+ SDValue Value = ST->getValue();
+
+ // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and
+ // extracts the word lane which represents the v4i8 subvector. This optimizes
+ // the store to:
+ //
+ // xtn v0.8b, v0.8h
+ // str s0, [x0]
+
+ SDValue Undef = DAG.getUNDEF(MVT::i16);
+ SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
+ {Undef, Undef, Undef, Undef});
+
+ SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
+ Value, UndefVec);
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
+
+ Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
+ SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
+ Trunc, DAG.getConstant(0, DL, MVT::i64));
+
+ return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
+ ST->getBasePtr(), ST->getMemOperand());
+}
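A scalar model of the store sketched in the comment above (assuming little-endian lane order): the four 16-bit lanes are narrowed to bytes, as xtn does, and the resulting 32 bits are written with a single store:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Model of the v4i16 -> v4i8 truncating store: narrow each lane to a byte
    // (xtn v0.8b, v0.8h), then write all four bytes at once (str s0, [x0]).
    static void truncStoreV4i8(const uint16_t lanes[4], uint8_t *dst) {
      uint32_t packed = 0;
      for (int i = 0; i < 4; ++i)
        packed |= uint32_t(uint8_t(lanes[i])) << (8 * i); // keep the low byte
      std::memcpy(dst, &packed, 4);
    }

    int main() {
      uint16_t v[4] = {0x1234, 0x00FF, 0xAB00, 0x0001};
      uint8_t out[4];
      truncStoreV4i8(v, out);
      assert(out[0] == 0x34 && out[1] == 0xFF && out[2] == 0x00 && out[3] == 0x01);
      return 0;
    }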
+
+// Custom lowering for stores, vector or scalar, normal or truncating.
+// Currently only the truncating store from v4i16 to v4i8 is custom-lowered.
+SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc Dl(Op);
+ StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
+ assert (StoreNode && "Can only custom lower store nodes");
+
+ SDValue Value = StoreNode->getValue();
+
+ EVT VT = Value.getValueType();
+ EVT MemVT = StoreNode->getMemoryVT();
+
+ assert (VT.isVector() && "Can only custom lower vector store types");
+
+ unsigned AS = StoreNode->getAddressSpace();
+ unsigned Align = StoreNode->getAlignment();
+ if (Align < MemVT.getStoreSize() &&
+ !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) {
+ return scalarizeVectorStore(StoreNode, DAG);
+ }
+
+ if (StoreNode->isTruncatingStore()) {
+ return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
+ }
+
+ return SDValue();
+}
+
SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
- DEBUG(dbgs() << "Custom lowering: ");
- DEBUG(Op.dump());
+ LLVM_DEBUG(dbgs() << "Custom lowering: ");
+ LLVM_DEBUG(Op.dump());
switch (Op.getOpcode()) {
default:
@@ -2673,10 +2839,17 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerFP_TO_INT(Op, DAG);
case ISD::FSINCOS:
return LowerFSINCOS(Op, DAG);
+ case ISD::FLT_ROUNDS_:
+ return LowerFLT_ROUNDS_(Op, DAG);
case ISD::MUL:
return LowerMUL(Op, DAG);
+ case ISD::MULHS:
+ case ISD::MULHU:
+ return LowerMULH(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN:
return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::STORE:
+ return LowerSTORE(Op, DAG);
case ISD::VECREDUCE_ADD:
case ISD::VECREDUCE_SMAX:
case ISD::VECREDUCE_SMIN:
@@ -2685,6 +2858,12 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::VECREDUCE_FMAX:
case ISD::VECREDUCE_FMIN:
return LowerVECREDUCE(Op, DAG);
+ case ISD::ATOMIC_LOAD_SUB:
+ return LowerATOMIC_LOAD_SUB(Op, DAG);
+ case ISD::ATOMIC_LOAD_AND:
+ return LowerATOMIC_LOAD_AND(Op, DAG);
+ case ISD::DYNAMIC_STACKALLOC:
+ return LowerDYNAMIC_STACKALLOC(Op, DAG);
}
}
@@ -3667,7 +3846,8 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
SelectionDAG &DAG,
unsigned Flag) const {
- return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty, 0, Flag);
+ return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
+ N->getOffset(), Flag);
}
SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
@@ -3693,7 +3873,7 @@ SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
template <class NodeTy>
SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
unsigned Flags) const {
- DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
+ LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
SDLoc DL(N);
EVT Ty = getPointerTy(DAG.getDataLayout());
SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
@@ -3706,7 +3886,7 @@ SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
template <class NodeTy>
SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
unsigned Flags) const {
- DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
+ LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
SDLoc DL(N);
EVT Ty = getPointerTy(DAG.getDataLayout());
const unsigned char MO_NC = AArch64II::MO_NC;
@@ -3722,7 +3902,7 @@ SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
template <class NodeTy>
SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
unsigned Flags) const {
- DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
+ LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
SDLoc DL(N);
EVT Ty = getPointerTy(DAG.getDataLayout());
SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
@@ -3742,8 +3922,9 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
unsigned char OpFlags =
Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
- assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
- "unexpected offset in global node");
+ if (OpFlags != AArch64II::MO_NO_FLAG)
+ assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
+ "unexpected offset in global node");
// This also catches the large code model case for Darwin.
if ((OpFlags & AArch64II::MO_GOT) != 0) {
@@ -3764,7 +3945,7 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
return Result;
}
-/// \brief Convert a TLS address reference into the correct sequence of loads
+/// Convert a TLS address reference into the correct sequence of loads
/// and calls to compute the variable's address (for Darwin, currently) and
/// return an SDValue containing the final node.
@@ -3968,16 +4149,77 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
}
+SDValue
+AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
+
+ SDValue Chain = DAG.getEntryNode();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ SDLoc DL(Op);
+
+ SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
+
+ // Load the ThreadLocalStoragePointer from the TEB
+ // A pointer to the TLS array is located at offset 0x58 from the TEB.
+ SDValue TLSArray =
+ DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
+ TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
+ Chain = TLSArray.getValue(1);
+
+ // Load the TLS index from the C runtime.
+ // This does the same as getAddr(), but without having a GlobalAddressSDNode.
+ // This also does the same as LOADgot, but using a generic i32 load,
+ // while LOADgot only loads i64.
+ SDValue TLSIndexHi =
+ DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
+ SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
+ "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+ SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
+ SDValue TLSIndex =
+ DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
+ TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
+ Chain = TLSIndex.getValue(1);
+
+ // The pointer to the thread's TLS data area is stored in the TLSArray at the
+ // offset of the TLS index scaled by 8 (one pointer-sized slot per module).
+ TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
+ SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
+ DAG.getConstant(3, DL, PtrVT));
+ SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
+ DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
+ MachinePointerInfo());
+ Chain = TLS.getValue(1);
+
+ const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+ const GlobalValue *GV = GA->getGlobal();
+ SDValue TGAHi = DAG.getTargetGlobalAddress(
+ GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
+ SDValue TGALo = DAG.getTargetGlobalAddress(
+ GV, DL, PtrVT, 0,
+ AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+
+ // Add the offset from the start of the .tls section (section base).
+ SDValue Addr =
+ SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
+ DAG.getTargetConstant(0, DL, MVT::i32)),
+ 0);
+ Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
+ return Addr;
+}
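A hypothetical C-level model of the address computation built above; the 0x58 TEB offset, the _tls_index lookup and the scale-by-8 indexing come from the comments, while the names below are illustrative only:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Sketch: compute the address of a TLS variable the way the DAG above does.
    // 'teb' stands for the thread environment block normally held in x18, and
    // 'tlsIndex' stands in for the CRT's _tls_index.
    static char *windowsTlsAddress(char *teb, uint32_t tlsIndex,
                                   uint64_t offsetInTlsSection) {
      char **tlsArray;
      std::memcpy(&tlsArray, teb + 0x58, sizeof(tlsArray)); // ThreadLocalStoragePointer
      char *tlsBase = tlsArray[tlsIndex];                   // one 8-byte slot per module
      return tlsBase + offsetInTlsSection;                  // offset from the .tls base
    }

    int main() {
      alignas(8) char teb[0x60] = {};
      char tlsData[16] = {};
      char *slots[1] = {tlsData};
      char **slotsPtr = slots;
      std::memcpy(teb + 0x58, &slotsPtr, sizeof(slotsPtr));
      assert(windowsTlsAddress(teb, 0, 4) == tlsData + 4);
      return 0;
    }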
+
SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
- if (DAG.getTarget().Options.EmulatedTLS)
+ if (DAG.getTarget().useEmulatedTLS())
return LowerToTLSEmulatedModel(GA, DAG);
if (Subtarget->isTargetDarwin())
return LowerDarwinGlobalTLSAddress(Op, DAG);
if (Subtarget->isTargetELF())
return LowerELFGlobalTLSAddress(Op, DAG);
+ if (Subtarget->isTargetWindows())
+ return LowerWindowsGlobalTLSAddress(Op, DAG);
llvm_unreachable("Unexpected platform trying to use TLS");
}
@@ -4778,9 +5020,13 @@ unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT,
.Case("sp", AArch64::SP)
.Case("x18", AArch64::X18)
.Case("w18", AArch64::W18)
+ .Case("x20", AArch64::X20)
+ .Case("w20", AArch64::W20)
.Default(0);
- if ((Reg == AArch64::X18 || Reg == AArch64::W18) &&
- !Subtarget->isX18Reserved())
+ if (((Reg == AArch64::X18 || Reg == AArch64::W18) &&
+ !Subtarget->isX18Reserved()) ||
+ ((Reg == AArch64::X20 || Reg == AArch64::W20) &&
+ !Subtarget->isX20Reserved()))
Reg = 0;
if (Reg)
return Reg;
@@ -4920,18 +5166,18 @@ SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
bool AArch64TargetLowering::isOffsetFoldingLegal(
const GlobalAddressSDNode *GA) const {
- DEBUG(dbgs() << "Skipping offset folding global address: ");
- DEBUG(GA->dump());
- DEBUG(dbgs() << "AArch64 doesn't support folding offsets into global "
- "addresses\n");
+ // Offsets are folded in the DAG combine rather than here so that we can
+ // intelligently choose an offset based on the uses.
return false;
}
bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
// We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases.
// FIXME: We should be able to handle f128 as well with a clever lowering.
- if (Imm.isPosZero() && (VT == MVT::f16 || VT == MVT::f64 || VT == MVT::f32)) {
- DEBUG(dbgs() << "Legal fp imm: materialize 0 using the zero register\n");
+ if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32 ||
+ (VT == MVT::f16 && Subtarget->hasFullFP16()))) {
+ LLVM_DEBUG(
+ dbgs() << "Legal fp imm: materialize 0 using the zero register\n");
return true;
}
@@ -4952,14 +5198,17 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
}
if (IsLegal) {
- DEBUG(dbgs() << "Legal " << FPType << " imm value: " << ImmStrVal << "\n");
+ LLVM_DEBUG(dbgs() << "Legal " << FPType << " imm value: " << ImmStrVal
+ << "\n");
return true;
}
if (!FPType.empty())
- DEBUG(dbgs() << "Illegal " << FPType << " imm value: " << ImmStrVal << "\n");
+ LLVM_DEBUG(dbgs() << "Illegal " << FPType << " imm value: " << ImmStrVal
+ << "\n");
else
- DEBUG(dbgs() << "Illegal fp imm " << ImmStrVal << ": unsupported fp type\n");
+ LLVM_DEBUG(dbgs() << "Illegal fp imm " << ImmStrVal
+ << ": unsupported fp type\n");
return false;
}
@@ -5003,7 +5252,7 @@ SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
EVT VT = Operand.getValueType();
SDNodeFlags Flags;
- Flags.setUnsafeAlgebra(true);
+ Flags.setAllowReassociation(true);
// Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
// AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
@@ -5013,7 +5262,6 @@ SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
}
-
if (!Reciprocal) {
EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
VT);
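For illustration, a scalar sketch of the refinement step named in the comment above (E * 0.5 * (3 - X * E^2), with FRSQRTS supplying the 0.5 * (3 - M * N) part), assuming a rough initial estimate in place of FRSQRTE:

    #include <cassert>
    #include <cmath>

    // One Newton step for 1/sqrt(x): e' = e * 0.5 * (3 - x * e * e).
    static float refineRsqrt(float x, float e) {
      float step = 0.5f * (3.0f - x * e * e); // frsqrts(x, e*e) in the code above
      return e * step;
    }

    int main() {
      float x = 2.0f;
      float e = 0.7f; // crude stand-in for the frsqrte estimate
      for (int i = 0; i < 3; ++i)
        e = refineRsqrt(x, e);
      assert(std::fabs(e - 1.0f / std::sqrt(x)) < 1e-6f);
      return 0;
    }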
@@ -5043,7 +5291,7 @@ SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
EVT VT = Operand.getValueType();
SDNodeFlags Flags;
- Flags.setUnsafeAlgebra(true);
+ Flags.setAllowReassociation(true);
// Newton reciprocal iteration: E * (2 - X * E)
// AArch64 reciprocal iteration instruction: (2 - M * N)
@@ -5066,7 +5314,7 @@ SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
// Table of Constraints
// TODO: This is the current set of constraints supported by ARM for the
-// compiler, not all of them may make sense, e.g. S may be difficult to support.
+// compiler; not all of them may make sense.
//
// r - A general register
// w - An FP/SIMD register of some size in the range v0-v31
@@ -5126,6 +5374,8 @@ AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
// currently handle addresses it is the same as 'r'.
case 'Q':
return C_Memory;
+ case 'S': // A symbolic address
+ return C_Other;
}
}
return TargetLowering::getConstraintType(Constraint);
@@ -5250,6 +5500,23 @@ void AArch64TargetLowering::LowerAsmOperandForConstraint(
Result = DAG.getRegister(AArch64::WZR, MVT::i32);
break;
}
+ case 'S': {
+ // An absolute symbolic address or label reference.
+ if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
+ Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
+ GA->getValueType(0));
+ } else if (const BlockAddressSDNode *BA =
+ dyn_cast<BlockAddressSDNode>(Op)) {
+ Result =
+ DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0));
+ } else if (const ExternalSymbolSDNode *ES =
+ dyn_cast<ExternalSymbolSDNode>(Op)) {
+ Result =
+ DAG.getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0));
+ } else
+ return;
+ break;
+ }
case 'I':
case 'J':
@@ -5399,7 +5666,7 @@ static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
- DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
+ LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
SDLoc dl(Op);
EVT VT = Op.getValueType();
unsigned NumElts = VT.getVectorNumElements();
@@ -5435,10 +5702,11 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
continue;
else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(V.getOperand(1))) {
- DEBUG(dbgs() << "Reshuffle failed: "
- "a shuffle can only come from building a vector from "
- "various elements of other vectors, provided their "
- "indices are constant\n");
+ LLVM_DEBUG(
+ dbgs() << "Reshuffle failed: "
+ "a shuffle can only come from building a vector from "
+ "various elements of other vectors, provided their "
+ "indices are constant\n");
return SDValue();
}
@@ -5455,8 +5723,9 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
}
if (Sources.size() > 2) {
- DEBUG(dbgs() << "Reshuffle failed: currently only do something sane when at "
- "most two source vectors are involved\n");
+ LLVM_DEBUG(
+ dbgs() << "Reshuffle failed: currently only do something sane when at "
+ "most two source vectors are involved\n");
return SDValue();
}
@@ -5502,7 +5771,8 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits());
if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
- DEBUG(dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
+ LLVM_DEBUG(
+ dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
return SDValue();
}
@@ -5548,10 +5818,9 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
}
// Final sanity check before we try to actually produce a shuffle.
- DEBUG(
- for (auto Src : Sources)
- assert(Src.ShuffleVec.getValueType() == ShuffleVT);
- );
+ LLVM_DEBUG(for (auto Src
+ : Sources)
+ assert(Src.ShuffleVec.getValueType() == ShuffleVT););
// The stars all align, our next step is to produce the mask for the shuffle.
SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
@@ -5584,7 +5853,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
// Final check before we try to produce nonsense...
if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
- DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
+ LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
return SDValue();
}
@@ -5596,12 +5865,8 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
ShuffleOps[1], Mask);
SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
- DEBUG(
- dbgs() << "Reshuffle, creating node: ";
- Shuffle.dump();
- dbgs() << "Reshuffle, creating node: ";
- V.dump();
- );
+ LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
+ dbgs() << "Reshuffle, creating node: "; V.dump(););
return V;
}
@@ -6236,96 +6501,235 @@ static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
return false;
}
-SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
- SelectionDAG &DAG) const {
- BuildVectorSDNode *BVN =
- dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
- SDValue LHS = Op.getOperand(0);
- SDLoc dl(Op);
- EVT VT = Op.getValueType();
+// Try 64-bit splatted SIMD immediate.
+static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
+ const APInt &Bits) {
+ if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
+ uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
+ EVT VT = Op.getValueType();
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
- if (!BVN)
- return Op;
+ if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
+ Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
- APInt CnstBits(VT.getSizeInBits(), 0);
- APInt UndefBits(VT.getSizeInBits(), 0);
- if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
- // We only have BIC vector immediate instruction, which is and-not.
- CnstBits = ~CnstBits;
-
- // We make use of a little bit of goto ickiness in order to avoid having to
- // duplicate the immediate matching logic for the undef toggled case.
- bool SecondTry = false;
- AttemptModImm:
-
- if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
- CnstBits = CnstBits.zextOrTrunc(64);
- uint64_t CnstVal = CnstBits.getZExtValue();
-
- if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
+ SDLoc dl(Op);
+ SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
+ DAG.getConstant(Value, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+ }
- if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(8, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
+ return SDValue();
+}
- if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(16, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
+// Try 32-bit splatted SIMD immediate.
+static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
+ const APInt &Bits,
+ const SDValue *LHS = nullptr) {
+ if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
+ uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
+ EVT VT = Op.getValueType();
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ bool isAdvSIMDModImm = false;
+ uint64_t Shift;
+
+ if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
+ Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
+ Shift = 0;
+ }
+ else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
+ Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
+ Shift = 8;
+ }
+ else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
+ Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
+ Shift = 16;
+ }
+ else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
+ Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
+ Shift = 24;
+ }
- if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(24, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
+ if (isAdvSIMDModImm) {
+ SDLoc dl(Op);
+ SDValue Mov;
- if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
- SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
+ if (LHS)
+ Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
+ DAG.getConstant(Value, dl, MVT::i32),
+ DAG.getConstant(Shift, dl, MVT::i32));
+ else
+ Mov = DAG.getNode(NewOp, dl, MovTy,
+ DAG.getConstant(Value, dl, MVT::i32),
+ DAG.getConstant(Shift, dl, MVT::i32));
- if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
- SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(8, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+ }
+
+ return SDValue();
+}
+
+// Try 16-bit splatted SIMD immediate.
+static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
+ const APInt &Bits,
+ const SDValue *LHS = nullptr) {
+ if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
+ uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
+ EVT VT = Op.getValueType();
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+ bool isAdvSIMDModImm = false;
+ uint64_t Shift;
+
+ if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
+ Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
+ Shift = 0;
+ }
+ else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
+ Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
+ Shift = 8;
+ }
+
+ if (isAdvSIMDModImm) {
+ SDLoc dl(Op);
+ SDValue Mov;
+
+ if (LHS)
+ Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
+ DAG.getConstant(Value, dl, MVT::i32),
+ DAG.getConstant(Shift, dl, MVT::i32));
+ else
+ Mov = DAG.getNode(NewOp, dl, MovTy,
+ DAG.getConstant(Value, dl, MVT::i32),
+ DAG.getConstant(Shift, dl, MVT::i32));
+
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+ }
+
+ return SDValue();
+}
+
+// Try 32-bit splatted SIMD immediate with shifted ones.
+static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
+ SelectionDAG &DAG, const APInt &Bits) {
+ if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
+ uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
+ EVT VT = Op.getValueType();
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ bool isAdvSIMDModImm = false;
+ uint64_t Shift;
+
+ if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
+ Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
+ Shift = 264;
+ }
+ else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
+ Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
+ Shift = 272;
+ }
+
+ if (isAdvSIMDModImm) {
+ SDLoc dl(Op);
+ SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
+ DAG.getConstant(Value, dl, MVT::i32),
+ DAG.getConstant(Shift, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+ }
+
+ return SDValue();
+}
+
+// Try 8-bit splatted SIMD immediate.
+static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
+ const APInt &Bits) {
+ if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
+ uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
+ EVT VT = Op.getValueType();
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
+
+ if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
+ Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
+
+ SDLoc dl(Op);
+ SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
+ DAG.getConstant(Value, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+ }
+
+ return SDValue();
+}
+
+// Try FP splatted SIMD immediate.
+static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
+ const APInt &Bits) {
+ if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
+ uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
+ EVT VT = Op.getValueType();
+ bool isWide = (VT.getSizeInBits() == 128);
+ MVT MovTy;
+ bool isAdvSIMDModImm = false;
+
+ if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
+ Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
+ MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
+ }
+ else if (isWide &&
+ (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
+ Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
+ MovTy = MVT::v2f64;
}
- if (SecondTry)
- goto FailedModImm;
- SecondTry = true;
- CnstBits = ~UndefBits;
- goto AttemptModImm;
+ if (isAdvSIMDModImm) {
+ SDLoc dl(Op);
+ SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
+ DAG.getConstant(Value, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
}
-// We can always fall back to a non-immediate AND.
-FailedModImm:
+ return SDValue();
+}
+
+SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue LHS = Op.getOperand(0);
+ EVT VT = Op.getValueType();
+
+ BuildVectorSDNode *BVN =
+ dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
+ if (!BVN) {
+ // AND commutes, so try swapping the operands.
+ LHS = Op.getOperand(1);
+ BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
+ }
+ if (!BVN)
+ return Op;
+
+ APInt DefBits(VT.getSizeInBits(), 0);
+ APInt UndefBits(VT.getSizeInBits(), 0);
+ if (resolveBuildVector(BVN, DefBits, UndefBits)) {
+ SDValue NewOp;
+
+ // We only have BIC vector immediate instruction, which is and-not.
+ DefBits = ~DefBits;
+ if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, Op, DAG,
+ DefBits, &LHS)) ||
+ (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, Op, DAG,
+ DefBits, &LHS)))
+ return NewOp;
+
+ UndefBits = ~UndefBits;
+ if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, Op, DAG,
+ UndefBits, &LHS)) ||
+ (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, Op, DAG,
+ UndefBits, &LHS)))
+ return NewOp;
+ }
+
+ // We can always fall back to a non-immediate AND.
return Op;
}
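A one-line reminder of the identity the complemented constant relies on: since only BIC (and-not) accepts a vector immediate, AND by C is matched as BIC by ~C. A per-lane sketch:

    #include <cassert>
    #include <cstdint>

    // AArch64 has no vector AND-with-immediate; BIC computes x & ~imm, so an AND
    // by constant C is expressed as a BIC by ~C.
    static uint32_t bic(uint32_t x, uint32_t imm) { return x & ~imm; }

    int main() {
      uint32_t x = 0xDEADBEEF, c = 0xFF00FF00;
      assert((x & c) == bic(x, ~c));
      return 0;
    }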
@@ -6419,10 +6823,10 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
DAG.getConstant(Intrin, DL, MVT::i32), X, Y,
Shift.getOperand(1));
- DEBUG(dbgs() << "aarch64-lower: transformed: \n");
- DEBUG(N->dump(&DAG));
- DEBUG(dbgs() << "into: \n");
- DEBUG(ResultSLI->dump(&DAG));
+ LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
+ LLVM_DEBUG(N->dump(&DAG));
+ LLVM_DEBUG(dbgs() << "into: \n");
+ LLVM_DEBUG(ResultSLI->dump(&DAG));
++NumShiftInserts;
return ResultSLI;
@@ -6436,96 +6840,38 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
return Res;
}
- BuildVectorSDNode *BVN =
- dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
- SDValue LHS = Op.getOperand(1);
- SDLoc dl(Op);
EVT VT = Op.getValueType();
- // OR commutes, so try swapping the operands.
+ SDValue LHS = Op.getOperand(0);
+ BuildVectorSDNode *BVN =
+ dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
if (!BVN) {
- LHS = Op.getOperand(0);
- BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
+ // OR commutes, so try swapping the operands.
+ LHS = Op.getOperand(1);
+ BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
}
if (!BVN)
return Op;
- APInt CnstBits(VT.getSizeInBits(), 0);
+ APInt DefBits(VT.getSizeInBits(), 0);
APInt UndefBits(VT.getSizeInBits(), 0);
- if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
- // We make use of a little bit of goto ickiness in order to avoid having to
- // duplicate the immediate matching logic for the undef toggled case.
- bool SecondTry = false;
- AttemptModImm:
-
- if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
- CnstBits = CnstBits.zextOrTrunc(64);
- uint64_t CnstVal = CnstBits.getZExtValue();
-
- if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(8, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(16, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(24, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
- SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
+ if (resolveBuildVector(BVN, DefBits, UndefBits)) {
+ SDValue NewOp;
- if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
- SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(8, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
- }
+ if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
+ DefBits, &LHS)) ||
+ (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
+ DefBits, &LHS)))
+ return NewOp;
- if (SecondTry)
- goto FailedModImm;
- SecondTry = true;
- CnstBits = UndefBits;
- goto AttemptModImm;
+ if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
+ UndefBits, &LHS)) ||
+ (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
+ UndefBits, &LHS)))
+ return NewOp;
}
-// We can always fall back to a non-immediate OR.
-FailedModImm:
+ // We can always fall back to a non-immediate OR.
return Op;
}
@@ -6553,226 +6899,71 @@ static SDValue NormalizeBuildVector(SDValue Op,
return DAG.getBuildVector(VT, dl, Ops);
}
-SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
- SelectionDAG &DAG) const {
- SDLoc dl(Op);
+static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
- Op = NormalizeBuildVector(Op, DAG);
- BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
- APInt CnstBits(VT.getSizeInBits(), 0);
+ APInt DefBits(VT.getSizeInBits(), 0);
APInt UndefBits(VT.getSizeInBits(), 0);
- if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
- // We make use of a little bit of goto ickiness in order to avoid having to
- // duplicate the immediate matching logic for the undef toggled case.
- bool SecondTry = false;
- AttemptModImm:
-
- if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
- CnstBits = CnstBits.zextOrTrunc(64);
- uint64_t CnstVal = CnstBits.getZExtValue();
-
- // Certain magic vector constants (used to express things like NOT
- // and NEG) are passed through unmodified. This allows codegen patterns
- // for these operations to match. Special-purpose patterns will lower
- // these immediates to MOVIs if it proves necessary.
- if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL))
- return Op;
-
- // The many faces of MOVI...
- if (AArch64_AM::isAdvSIMDModImmType10(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType10(CnstVal);
- if (VT.getSizeInBits() == 128) {
- SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64,
- DAG.getConstant(CnstVal, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- // Support the V64 version via subregister insertion.
- SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64,
- DAG.getConstant(CnstVal, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(8, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(16, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(24, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
- SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
- SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(8, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(264, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(272, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType9(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
- SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- // The few faces of FMOV...
- if (AArch64_AM::isAdvSIMDModImmType11(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType11(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32;
- SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) &&
- VT.getSizeInBits() == 128) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal);
- SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64,
- DAG.getConstant(CnstVal, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- // The many faces of MVNI...
- CnstVal = ~CnstVal;
- if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(8, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(16, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(24, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
- SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
+ BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
+ if (resolveBuildVector(BVN, DefBits, UndefBits)) {
+ SDValue NewOp;
+ if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
+ return NewOp;
+
+ DefBits = ~DefBits;
+ if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
+ return NewOp;
+
+ DefBits = UndefBits;
+ if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
+ return NewOp;
+
+ DefBits = ~UndefBits;
+ if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
+ return NewOp;
+ }
- if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
- SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(8, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
+ return SDValue();
+}
- if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(264, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
+SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
- if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(272, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ // Try to build a simple constant vector.
+ Op = NormalizeBuildVector(Op, DAG);
+ if (VT.isInteger()) {
+ // Certain vector constants, used to express things like logical NOT and
+ // arithmetic NEG, are passed through unmodified. This allows special
+ // patterns for these operations to match, which will lower these constants
+ // to MOVIs if that proves necessary.
+ BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
+ if (BVN->isConstant())
+ if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
+ unsigned BitSize = VT.getVectorElementType().getSizeInBits();
+ APInt Val(BitSize,
+ Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
+ if (Val.isNullValue() || Val.isAllOnesValue())
+ return Op;
}
- }
-
- if (SecondTry)
- goto FailedModImm;
- SecondTry = true;
- CnstBits = UndefBits;
- goto AttemptModImm;
}
-FailedModImm:
+
+ if (SDValue V = ConstantBuildVector(Op, DAG))
+ return V;
// Scan through the operands to find some interesting properties we can
// exploit:
@@ -6785,16 +6976,21 @@ FailedModImm:
// select the values we'll be overwriting for the non-constant
// lanes such that we can directly materialize the vector
// some other way (MOVI, e.g.), we can be sneaky.
+ // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
+ SDLoc dl(Op);
unsigned NumElts = VT.getVectorNumElements();
bool isOnlyLowElement = true;
bool usesOnlyOneValue = true;
bool usesOnlyOneConstantValue = true;
bool isConstant = true;
+ bool AllLanesExtractElt = true;
unsigned NumConstantLanes = 0;
SDValue Value;
SDValue ConstantValue;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
+ if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ AllLanesExtractElt = false;
if (V.isUndef())
continue;
if (i > 0)
@@ -6817,23 +7013,86 @@ FailedModImm:
}
if (!Value.getNode()) {
- DEBUG(dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
+ LLVM_DEBUG(
+ dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
return DAG.getUNDEF(VT);
}
if (isOnlyLowElement) {
- DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
- "SCALAR_TO_VECTOR node\n");
+ LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
+ "SCALAR_TO_VECTOR node\n");
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
}
+ if (AllLanesExtractElt) {
+ SDNode *Vector = nullptr;
+ bool Even = false;
+ bool Odd = false;
+ // Check whether the extract elements match the Even pattern <0,2,4,...> or
+ // the Odd pattern <1,3,5,...>.
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue V = Op.getOperand(i);
+ const SDNode *N = V.getNode();
+ if (!isa<ConstantSDNode>(N->getOperand(1)))
+ break;
+ SDValue N0 = N->getOperand(0);
+
+ // All elements are extracted from the same vector.
+ if (!Vector) {
+ Vector = N0.getNode();
+ // Check that the type of EXTRACT_VECTOR_ELT matches the type of
+ // BUILD_VECTOR.
+ if (VT.getVectorElementType() !=
+ N0.getValueType().getVectorElementType())
+ break;
+ } else if (Vector != N0.getNode()) {
+ Odd = false;
+ Even = false;
+ break;
+ }
+
+ // Extracted values are either at Even indices <0,2,4,...> or at Odd
+ // indices <1,3,5,...>.
+ uint64_t Val = N->getConstantOperandVal(1);
+ if (Val == 2 * i) {
+ Even = true;
+ continue;
+ }
+ if (Val - 1 == 2 * i) {
+ Odd = true;
+ continue;
+ }
+
+ // Something does not match: abort.
+ Odd = false;
+ Even = false;
+ break;
+ }
+ if (Even || Odd) {
+ SDValue LHS =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
+ DAG.getConstant(0, dl, MVT::i64));
+ SDValue RHS =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
+ DAG.getConstant(NumElts, dl, MVT::i64));
+
+ if (Even && !Odd)
+ return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
+ RHS);
+ if (Odd && !Even)
+ return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
+ RHS);
+ }
+ }
+
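The even/odd index pattern recognised above is plain de-interleaving; a small sketch of what uzp1/uzp2 produce from the source vector, for illustration:

    #include <cassert>
    #include <vector>

    // A BUILD_VECTOR of extract_vector_elt at indices 0,2,4,... of one source is
    // uzp1 of that source's halves; indices 1,3,5,... give uzp2.
    static std::vector<int> uzp(const std::vector<int> &src, bool odd) {
      std::vector<int> out;
      for (size_t i = odd ? 1 : 0; i < src.size(); i += 2)
        out.push_back(src[i]);
      return out;
    }

    int main() {
      std::vector<int> src = {10, 11, 12, 13, 14, 15, 16, 17};
      assert(uzp(src, /*odd=*/false) == (std::vector<int>{10, 12, 14, 16})); // uzp1
      assert(uzp(src, /*odd=*/true) == (std::vector<int>{11, 13, 15, 17}));  // uzp2
      return 0;
    }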
// Use DUP for non-constant splats. For f32 constant splats, reduce to
// i32 and try again.
if (usesOnlyOneValue) {
if (!isConstant) {
if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Value.getValueType() != VT) {
- DEBUG(dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
+ LLVM_DEBUG(
+ dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
}
@@ -6842,8 +7101,9 @@ FailedModImm:
SDValue Lane = Value.getOperand(1);
Value = Value.getOperand(0);
if (Value.getValueSizeInBits() == 64) {
- DEBUG(dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
- "widening it\n");
+ LLVM_DEBUG(
+ dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
+ "widening it\n");
Value = WidenVector(Value, DAG);
}
@@ -6856,17 +7116,16 @@ FailedModImm:
EVT EltTy = VT.getVectorElementType();
assert ((EltTy == MVT::f16 || EltTy == MVT::f32 || EltTy == MVT::f64) &&
"Unsupported floating-point vector type");
- DEBUG(dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
- "BITCASTS, and try again\n");
+ LLVM_DEBUG(
+ dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
+ "BITCASTS, and try again\n");
MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
for (unsigned i = 0; i < NumElts; ++i)
Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
- DEBUG(
- dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
- Val.dump();
- );
+ LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
+ Val.dump(););
Val = LowerBUILD_VECTOR(Val, DAG);
if (Val.getNode())
return DAG.getNode(ISD::BITCAST, dl, VT, Val);
@@ -6878,24 +7137,32 @@ FailedModImm:
// is better than the default, which will perform a separate initialization
// for each lane.
if (NumConstantLanes > 0 && usesOnlyOneConstantValue) {
- SDValue Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
+ // Firstly, try to materialize the splat constant.
+ SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue),
+ Val = ConstantBuildVector(Vec, DAG);
+ if (!Val) {
+ // Otherwise, materialize the constant and splat it.
+ Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
+ DAG.ReplaceAllUsesWith(Vec.getNode(), &Val);
+ }
+
// Now insert the non-constant lanes.
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
- if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V)) {
+ if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V))
// Note that type legalization likely mucked about with the VT of the
// source operand, so we may have to convert it here before inserting.
Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
- }
}
return Val;
}
// This will generate a load from the constant pool.
if (isConstant) {
- DEBUG(dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
- "expansion\n");
+ LLVM_DEBUG(
+ dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
+ "expansion\n");
return SDValue();
}
@@ -6912,8 +7179,9 @@ FailedModImm:
// shuffle is valid for the target) and materialization element by element
// on the stack followed by a load for everything else.
if (!isConstant && !usesOnlyOneValue) {
- DEBUG(dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
- "of INSERT_VECTOR_ELT\n");
+ LLVM_DEBUG(
+ dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
+ "of INSERT_VECTOR_ELT\n");
SDValue Vec = DAG.getUNDEF(VT);
SDValue Op0 = Op.getOperand(0);
@@ -6930,14 +7198,12 @@ FailedModImm:
// extended (i32) and it is safe to cast them to the vector type by ignoring
// the upper bits of the lowest lane (e.g. v8i8, v4i16).
if (!Op0.isUndef()) {
- DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
+ LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
++i;
}
- DEBUG(
- if (i < NumElts)
- dbgs() << "Creating nodes for the other vector elements:\n";
- );
+ LLVM_DEBUG(if (i < NumElts) dbgs()
+ << "Creating nodes for the other vector elements:\n";);
for (; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
if (V.isUndef())
@@ -6948,8 +7214,9 @@ FailedModImm:
return Vec;
}
- DEBUG(dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
- "better alternative\n");
+ LLVM_DEBUG(
+ dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
+ "better alternative\n");
return SDValue();
}
@@ -7290,8 +7557,21 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
}
- if (LHS.getValueType().getVectorElementType() == MVT::f16)
- return SDValue();
+ const bool FullFP16 =
+ static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
+
+ // Make v4f16 (only) fcmp operations utilise vector instructions
+ // v8f16 support will be a little more complicated.
+ if (LHS.getValueType().getVectorElementType() == MVT::f16) {
+ if (!FullFP16 && LHS.getValueType().getVectorNumElements() == 4) {
+ LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
+ RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
+ SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
+ DAG.ReplaceAllUsesWith(Op, NewSetcc);
+ CmpVT = MVT::v4i32;
+ } else
+ return SDValue();
+ }
assert(LHS.getValueType().getVectorElementType() == MVT::f32 ||
LHS.getValueType().getVectorElementType() == MVT::f64);
@@ -7366,6 +7646,111 @@ SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
}
}
+SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op,
+ SelectionDAG &DAG) const {
+ auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
+ if (!Subtarget.hasLSE())
+ return SDValue();
+
+ // LSE has an atomic load-add instruction, but not a load-sub.
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+ SDValue RHS = Op.getOperand(2);
+ AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
+ RHS = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), RHS);
+ return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, AN->getMemoryVT(),
+ Op.getOperand(0), Op.getOperand(1), RHS,
+ AN->getMemOperand());
+}
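A minimal standalone sketch of the identity this lowering relies on, assuming two's-complement wraparound of unsigned 64-bit arithmetic; the values are hypothetical and the snippet is illustrative only:

  #include <atomic>
  #include <cassert>
  #include <cstdint>

  int main() {
    // Atomically subtracting V is the same as atomically adding 0 - V, which
    // is what lets the LSE load-add (LDADD) family implement a load-sub.
    std::atomic<uint64_t> A{100}, B{100};
    uint64_t V = 7;
    uint64_t OldA = A.fetch_sub(V);
    uint64_t OldB = B.fetch_add(0 - V);
    assert(OldA == OldB);
    assert(A.load() == B.load());
    return 0;
  }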
+
+SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
+ SelectionDAG &DAG) const {
+ auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
+ if (!Subtarget.hasLSE())
+ return SDValue();
+
+ // LSE has an atomic load-clear instruction, but not a load-and.
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+ SDValue RHS = Op.getOperand(2);
+ AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
+ RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
+ return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
+ Op.getOperand(0), Op.getOperand(1), RHS,
+ AN->getMemOperand());
+}
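Likewise, a minimal sketch of the identity behind the load-and lowering, with hypothetical values: load-clear computes old & ~operand, so handing it the complement of the AND mask (the XOR with -1 above) reproduces the AND result.

  #include <cassert>
  #include <cstdint>

  int main() {
    uint64_t Old = 0xF0F0F0F0F0F0F0F0ull;
    uint64_t Mask = 0x00FF00FF00FF00FFull;
    uint64_t Operand = ~Mask;  // what the lowering hands to load-clear
    assert((Old & ~Operand) == (Old & Mask));
    return 0;
  }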
+
+SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
+ SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Callee = DAG.getTargetExternalSymbol("__chkstk", PtrVT, 0);
+
+ const uint32_t *Mask =
+ Subtarget->getRegisterInfo()->getWindowsStackProbePreservedMask();
+
+ Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
+ DAG.getConstant(4, dl, MVT::i64));
+ Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
+ Chain =
+ DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
+ Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
+ DAG.getRegisterMask(Mask), Chain.getValue(1));
+ // To match the actual intent better, we should read the output from X15 here
+ // again (instead of potentially spilling it to the stack), but rereading Size
+ // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
+ // here.
+
+ Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
+ DAG.getConstant(4, dl, MVT::i64));
+ return Chain;
+}
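A minimal sketch of the size scaling around the __chkstk call, assuming the incoming allocation size is already a multiple of the 16-byte stack alignment (dynamic alloca sizes are normally rounded up to the stack alignment before reaching this point), so shifting right by 4 before the call and left by 4 afterwards is lossless; the numbers are hypothetical:

  #include <cassert>
  #include <cstdint>

  int main() {
    uint64_t SizeInBytes = 4096;        // hypothetical, 16-byte aligned
    uint64_t Units = SizeInBytes >> 4;  // value handed to __chkstk in X15
    assert((Units << 4) == SizeInBytes);
    return 0;
  }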
+
+SDValue
+AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Subtarget->isTargetWindows() &&
+ "Only Windows alloca probing supported");
+ SDLoc dl(Op);
+ // Get the inputs.
+ SDNode *Node = Op.getNode();
+ SDValue Chain = Op.getOperand(0);
+ SDValue Size = Op.getOperand(1);
+ unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ EVT VT = Node->getValueType(0);
+
+ if (DAG.getMachineFunction().getFunction().hasFnAttribute(
+ "no-stack-arg-probe")) {
+ SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
+ Chain = SP.getValue(1);
+ SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
+ if (Align)
+ SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+ DAG.getConstant(-(uint64_t)Align, dl, VT));
+ Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
+ SDValue Ops[2] = {SP, Chain};
+ return DAG.getMergeValues(Ops, dl);
+ }
+
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
+
+ Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG);
+
+ SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
+ Chain = SP.getValue(1);
+ SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
+ if (Align)
+ SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+ DAG.getConstant(-(uint64_t)Align, dl, VT));
+ Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
+
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
+ DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
+
+ SDValue Ops[2] = {SP, Chain};
+ return DAG.getMergeValues(Ops, dl);
+}
+
/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
/// specified in the intrinsic calls.
@@ -7471,6 +7856,33 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return false;
}
+bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
+ ISD::LoadExtType ExtTy,
+ EVT NewVT) const {
+ // If we're reducing the load width in order to avoid having to use an extra
+ // instruction to do the extension, then it's probably a good idea.
+ if (ExtTy != ISD::NON_EXTLOAD)
+ return true;
+ // Don't reduce load width if it would prevent us from combining a shift into
+ // the offset.
+ MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
+ assert(Mem);
+ const SDValue &Base = Mem->getBasePtr();
+ if (Base.getOpcode() == ISD::ADD &&
+ Base.getOperand(1).getOpcode() == ISD::SHL &&
+ Base.getOperand(1).hasOneUse() &&
+ Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
+ // The shift can be combined if it matches the size of the value being
+ // loaded (and so reducing the width would make it not match).
+ uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
+ uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
+ if (ShiftAmount == Log2_32(LoadBytes))
+ return false;
+ }
+ // We have no reason to disallow reducing the load width, so allow it.
+ return true;
+}
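A minimal standalone sketch of the condition tested above, with hypothetical numbers: the shift folds into the scaled register addressing mode (for example ldr x0, [x1, x2, lsl #3]) only when the shift amount equals log2 of the access size, which is exactly the fold that narrowing the load would break.

  #include <cassert>
  #include <cstdint>

  // True when a left shift of the index by ShiftAmount can fold into the
  // addressing mode of a load of LoadBytes bytes.
  static bool shiftFoldsIntoAddressing(uint64_t ShiftAmount, uint64_t LoadBytes) {
    uint64_t Log2 = 0;
    while ((1ull << Log2) < LoadBytes)
      ++Log2;
    return ShiftAmount == Log2;
  }

  int main() {
    assert(shiftFoldsIntoAddressing(3, 8));   // 8-byte load: lsl #3 folds
    assert(!shiftFoldsIntoAddressing(3, 4));  // narrowed to 4 bytes: fold is lost
    return 0;
  }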
+
// Truncations from 64-bit GPR to 32-bit GPR is free.
bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
@@ -7646,7 +8058,7 @@ bool AArch64TargetLowering::isLegalInterleavedAccessType(
return VecSize == 64 || VecSize % 128 == 0;
}
-/// \brief Lower an interleaved load into a ldN intrinsic.
+/// Lower an interleaved load into a ldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
@@ -7758,7 +8170,7 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
return true;
}
-/// \brief Lower an interleaved store into a stN intrinsic.
+/// Lower an interleaved store into a stN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
@@ -7816,8 +8228,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
// vectors to integer vectors.
if (EltTy->isPointerTy()) {
Type *IntTy = DL.getIntPtrType(EltTy);
- unsigned NumOpElts =
- dyn_cast<VectorType>(Op0->getType())->getVectorNumElements();
+ unsigned NumOpElts = Op0->getType()->getVectorNumElements();
// Convert to the corresponding integer vector.
Type *IntVecTy = VectorType::get(IntTy, NumOpElts);
@@ -7932,15 +8343,16 @@ EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
// 12-bit optionally shifted immediates are legal for adds.
bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
if (Immed == std::numeric_limits<int64_t>::min()) {
- DEBUG(dbgs() << "Illegal add imm " << Immed << ": avoid UB for INT64_MIN\n");
+ LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
+ << ": avoid UB for INT64_MIN\n");
return false;
}
// Same encoding for add/sub, just flip the sign.
Immed = std::abs(Immed);
bool IsLegal = ((Immed >> 12) == 0 ||
((Immed & 0xfff) == 0 && Immed >> 24 == 0));
- DEBUG(dbgs() << "Is " << Immed << " legal add imm: " <<
- (IsLegal ? "yes" : "no") << "\n");
+ LLVM_DEBUG(dbgs() << "Is " << Immed
+ << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
return IsLegal;
}
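A minimal standalone restatement of the rule checked above, with hypothetical test values: an add/sub immediate must fit in 12 bits, either unshifted or shifted left by 12.

  #include <cassert>
  #include <cstdint>
  #include <cstdlib>
  #include <limits>

  static bool isLegalAddImm(int64_t Immed) {
    if (Immed == std::numeric_limits<int64_t>::min())
      return false;                     // std::abs(INT64_MIN) would overflow
    Immed = std::abs(Immed);
    return (Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && (Immed >> 24) == 0);
  }

  int main() {
    assert(isLegalAddImm(0xfff));       // plain 12-bit immediate
    assert(isLegalAddImm(0xfff000));    // 12-bit immediate, LSL #12
    assert(!isLegalAddImm(0x1001));     // needs bits in both halves
    return 0;
  }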
@@ -8001,6 +8413,11 @@ bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
}
+bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
+ // Consider splitting large offset of struct or array.
+ return true;
+}
+
int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS) const {
@@ -8085,6 +8502,14 @@ bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
return Shift < 3;
}
+bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
+ unsigned Index) const {
+ if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
+ return false;
+
+ return (Index == 0 || Index == ResVT.getVectorNumElements());
+}
+
/// Turn vector tests of the signbit in the form of:
/// xor (sra X, elt_size(X)-1), -1
/// into:
@@ -8727,10 +9152,12 @@ static SDValue performBitcastCombine(SDNode *N,
// If the source type has twice the number of elements as our destination
// type, we know this is an extract of the high or low half of the vector.
EVT SVT = Source->getValueType(0);
- if (SVT.getVectorNumElements() != VT.getVectorNumElements() * 2)
+ if (!SVT.isVector() ||
+ SVT.getVectorNumElements() != VT.getVectorNumElements() * 2)
return SDValue();
- DEBUG(dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n");
+ LLVM_DEBUG(
+ dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n");
// Create the simplified form to just extract the low or high half of the
// vector directly rather than bothering with the bitcasts.
@@ -8818,7 +9245,8 @@ static SDValue performConcatVectorsCombine(SDNode *N,
if (!RHSTy.isVector())
return SDValue();
- DEBUG(dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
+ LLVM_DEBUG(
+ dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
RHSTy.getVectorNumElements() * 2);
@@ -8831,7 +9259,7 @@ static SDValue performConcatVectorsCombine(SDNode *N,
static SDValue tryCombineFixedPointConvert(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
- // Wait 'til after everything is legalized to try this. That way we have
+ // Wait until after everything is legalized to try this. That way we have
// legal vector types and such.
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -8933,26 +9361,26 @@ static bool isEssentiallyExtractSubvector(SDValue N) {
N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR;
}
-/// \brief Helper structure to keep track of ISD::SET_CC operands.
+/// Helper structure to keep track of ISD::SET_CC operands.
struct GenericSetCCInfo {
const SDValue *Opnd0;
const SDValue *Opnd1;
ISD::CondCode CC;
};
-/// \brief Helper structure to keep track of a SET_CC lowered into AArch64 code.
+/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
struct AArch64SetCCInfo {
const SDValue *Cmp;
AArch64CC::CondCode CC;
};
-/// \brief Helper structure to keep track of SetCC information.
+/// Helper structure to keep track of SetCC information.
union SetCCInfo {
GenericSetCCInfo Generic;
AArch64SetCCInfo AArch64;
};
-/// \brief Helper structure to be able to read SetCC information. If set to
+/// Helper structure to be able to read SetCC information. If the IsAArch64
/// field is set to true, Info is an AArch64SetCCInfo; otherwise Info is a
/// GenericSetCCInfo.
struct SetCCInfoAndKind {
@@ -8960,7 +9388,7 @@ struct SetCCInfoAndKind {
bool IsAArch64;
};
-/// \brief Check whether or not \p Op is a SET_CC operation, either a generic or
+/// Check whether or not \p Op is a SET_CC operation, either a generic or an
/// AArch64 lowered one.
/// \p SetCCInfo is filled accordingly.
@@ -9637,6 +10065,15 @@ static SDValue performPostLD1Combine(SDNode *N,
if (LD->getOpcode() != ISD::LOAD)
return SDValue();
+ // The vector lane must be a constant in the LD1LANE opcode.
+ SDValue Lane;
+ if (IsLaneOp) {
+ Lane = N->getOperand(2);
+ auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
+ if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
+ return SDValue();
+ }
+
LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
EVT MemVT = LoadSDN->getMemoryVT();
// Check if memory operand is the same type as the vector element.
@@ -9693,7 +10130,7 @@ static SDValue performPostLD1Combine(SDNode *N,
Ops.push_back(LD->getOperand(0)); // Chain
if (IsLaneOp) {
Ops.push_back(Vector); // The vector to be inserted
- Ops.push_back(N->getOperand(2)); // The lane to be inserted in the vector
+ Ops.push_back(Lane); // The lane to be inserted in the vector
}
Ops.push_back(Addr);
Ops.push_back(Inc);
@@ -10393,12 +10830,65 @@ static SDValue performNVCASTCombine(SDNode *N) {
return SDValue();
}
+// If all users of the globaladdr are of the form (globaladdr + constant), find
+// the smallest constant, fold it into the globaladdr's offset and rewrite the
+// globaladdr as (globaladdr + constant) - constant.
+static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget,
+ const TargetMachine &TM) {
+ auto *GN = dyn_cast<GlobalAddressSDNode>(N);
+ if (!GN || Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
+ AArch64II::MO_NO_FLAG)
+ return SDValue();
+
+ uint64_t MinOffset = -1ull;
+ for (SDNode *N : GN->uses()) {
+ if (N->getOpcode() != ISD::ADD)
+ return SDValue();
+ auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
+ if (!C)
+ C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!C)
+ return SDValue();
+ MinOffset = std::min(MinOffset, C->getZExtValue());
+ }
+ uint64_t Offset = MinOffset + GN->getOffset();
+
+ // Require that the new offset is larger than the existing one. Otherwise, we
+ // can end up oscillating between two possible DAGs, for example,
+ // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
+ if (Offset <= uint64_t(GN->getOffset()))
+ return SDValue();
+
+ // Check whether folding this offset is legal. It must not go out of bounds of
+ // the referenced object to avoid violating the code model, and must be
+ // smaller than 2^21 because this is the largest offset expressible in all
+ // object formats.
+ //
+ // This check also prevents us from folding negative offsets, which will end
+ // up being treated in the same way as large positive ones. They could also
+ // cause code model violations, and aren't really common enough to matter.
+ if (Offset >= (1 << 21))
+ return SDValue();
+
+ const GlobalValue *GV = GN->getGlobal();
+ Type *T = GV->getValueType();
+ if (!T->isSized() ||
+ Offset > GV->getParent()->getDataLayout().getTypeAllocSize(T))
+ return SDValue();
+
+ SDLoc DL(GN);
+ SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
+ return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
+ DAG.getConstant(MinOffset, DL, MVT::i64));
+}
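A minimal sketch of the algebra the combine depends on, using hypothetical addresses: if every user adds some constant C to the global's address G, folding the smallest constant M into the global and rewriting the base as (G + M) - M leaves every user's final address unchanged, while the user whose constant is M can use the folded address directly.

  #include <cassert>
  #include <cstdint>

  int main() {
    const uint64_t G = 0x100000;            // hypothetical global address
    const uint64_t UserOffsets[] = {16, 24, 40};
    const uint64_t M = 16;                  // smallest offset among the users
    const uint64_t NewBase = (G + M) - M;   // rewritten globaladdr expression
    for (uint64_t C : UserOffsets)
      assert(NewBase + C == G + C);         // every use still computes G + C
    return 0;
  }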
+
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
switch (N->getOpcode()) {
default:
- DEBUG(dbgs() << "Custom combining: skipping\n");
+ LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
break;
case ISD::ADD:
case ISD::SUB:
@@ -10480,6 +10970,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
default:
break;
}
+ case ISD::GlobalAddress:
+ return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
}
return SDValue();
}
@@ -10640,11 +11132,79 @@ static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
return std::make_pair(Lo, Hi);
}
+// Create an even/odd pair of X registers holding integer value V.
+static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
+ SDLoc dl(V.getNode());
+ SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i64);
+ SDValue VHi = DAG.getAnyExtOrTrunc(
+ DAG.getNode(ISD::SRL, dl, MVT::i128, V, DAG.getConstant(64, dl, MVT::i64)),
+ dl, MVT::i64);
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap (VLo, VHi);
+ SDValue RegClass =
+ DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
+ SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
+ SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
+ const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
+ return SDValue(
+ DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
+}
+
static void ReplaceCMP_SWAP_128Results(SDNode *N,
- SmallVectorImpl<SDValue> & Results,
- SelectionDAG &DAG) {
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
assert(N->getValueType(0) == MVT::i128 &&
"AtomicCmpSwap on types less than 128 should be legal");
+
+ if (Subtarget->hasLSE()) {
+ // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
+ // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
+ SDValue Ops[] = {
+ createGPRPairNode(DAG, N->getOperand(2)), // Compare value
+ createGPRPairNode(DAG, N->getOperand(3)), // Store value
+ N->getOperand(1), // Ptr
+ N->getOperand(0), // Chain in
+ };
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1);
+ MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
+
+ unsigned Opcode;
+ switch (MemOp[0]->getOrdering()) {
+ case AtomicOrdering::Monotonic:
+ Opcode = AArch64::CASPX;
+ break;
+ case AtomicOrdering::Acquire:
+ Opcode = AArch64::CASPAX;
+ break;
+ case AtomicOrdering::Release:
+ Opcode = AArch64::CASPLX;
+ break;
+ case AtomicOrdering::AcquireRelease:
+ case AtomicOrdering::SequentiallyConsistent:
+ Opcode = AArch64::CASPALX;
+ break;
+ default:
+ llvm_unreachable("Unexpected ordering!");
+ }
+
+ MachineSDNode *CmpSwap = DAG.getMachineNode(
+ Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
+ CmpSwap->setMemRefs(MemOp, MemOp + 1);
+
+ unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(SubReg1, SubReg2);
+ Results.push_back(DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
+ SDValue(CmpSwap, 0)));
+ Results.push_back(DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
+ SDValue(CmpSwap, 0)));
+ Results.push_back(SDValue(CmpSwap, 1)); // Chain out
+ return;
+ }
+
auto Desired = splitInt128(N->getOperand(2), DAG);
auto New = splitInt128(N->getOperand(3), DAG);
SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
@@ -10703,7 +11263,7 @@ void AArch64TargetLowering::ReplaceNodeResults(
// Let normal code take care of it by not adding anything to Results.
return;
case ISD::ATOMIC_CMP_SWAP:
- ReplaceCMP_SWAP_128Results(N, Results, DAG);
+ ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
return;
}
}
@@ -10967,6 +11527,10 @@ bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
return OptSize && !VT.isVector();
}
+bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
+ return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
+}
+
unsigned
AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
@@ -10974,3 +11538,8 @@ AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
}
+
+void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
+ MF.getFrameInfo().computeMaxCallFrameSize(MF);
+ TargetLoweringBase::finalizeLowering(MF);
+}
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index 8d78b5b6b5b4..592845640a44 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -309,6 +309,9 @@ public:
MachineFunction &MF,
unsigned Intrinsic) const override;
+ bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
+ EVT NewVT) const override;
+
bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
bool isTruncateFree(EVT VT1, EVT VT2) const override;
@@ -332,6 +335,8 @@ public:
bool isLegalAddImmediate(int64_t) const override;
bool isLegalICmpImmediate(int64_t) const override;
+ bool shouldConsiderGEPOffsetSplit() const override;
+
EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
MachineFunction &MF) const override;
@@ -342,7 +347,7 @@ public:
unsigned AS,
Instruction *I = nullptr) const override;
- /// \brief Return the cost of the scaling factor used in the addressing
+ /// Return the cost of the scaling factor used in the addressing
/// mode represented by AM for this target, for a load/store
/// of the specified type.
/// If the AM is supported, the return value must be >= 0.
@@ -357,14 +362,19 @@ public:
const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
- /// \brief Returns false if N is a bit extraction pattern of (X >> C) & Mask.
+ /// Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool isDesirableToCommuteWithShift(const SDNode *N) const override;
- /// \brief Returns true if it is beneficial to convert a load of a constant
+ /// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
+ /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
+ /// with this index.
+ bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
+ unsigned Index) const override;
+
Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
AtomicOrdering Ord) const override;
Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
@@ -433,9 +443,35 @@ public:
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
- bool hasAndNotCompare(SDValue) const override {
- // 'bics'
- return true;
+ bool hasAndNotCompare(SDValue V) const override {
+ // We can use bics for any scalar.
+ return V.getValueType().isScalarInteger();
+ }
+
+ bool hasAndNot(SDValue Y) const override {
+ EVT VT = Y.getValueType();
+
+ if (!VT.isVector())
+ return hasAndNotCompare(Y);
+
+ return VT.getSizeInBits() >= 64; // vector 'bic'
+ }
+
+ bool shouldTransformSignedTruncationCheck(EVT XVT,
+ unsigned KeptBits) const override {
+ // For vectors, we don't have a preference.
+ if (XVT.isVector())
+ return false;
+
+ auto VTIsOk = [](EVT VT) -> bool {
+ return VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 ||
+ VT == MVT::i64;
+ };
+
+ // We are ok with KeptBitsVT being byte/word/dword, which is what SXT supports.
+ // XVT will be larger than KeptBitsVT.
+ MVT KeptBitsVT = MVT::getIntegerVT(KeptBits);
+ return VTIsOk(XVT) && VTIsOk(KeptBitsVT);
}
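A minimal sketch of the kind of check this hook governs, under the usual reading of the signed-truncation idiom: a value fits in KeptBits signed bits exactly when sign-extending its low KeptBits reproduces the value, and for 8/16/32 kept bits that maps onto a single sxtb/sxth/sxtw plus a compare.

  #include <cassert>
  #include <cstdint>

  // Does X fit in 8 signed bits? Truncate, sign-extend back, and compare.
  static bool fitsInSigned8(int32_t X) {
    return static_cast<int32_t>(static_cast<int8_t>(X)) == X;
  }

  int main() {
    assert(fitsInSigned8(127));
    assert(fitsInSigned8(-128));
    assert(!fitsInSigned8(128));
    return 0;
  }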
bool hasBitPreservingFPLogic(EVT VT) const override {
@@ -456,6 +492,9 @@ public:
return true;
}
+ /// Enable aggressive FMA fusion on targets that want it.
+ bool enableAggressiveFMAFusion(EVT VT) const override;
+
/// Returns the size of the platform's va_list object.
unsigned getVaListSizeInBits(const DataLayout &DL) const override;
@@ -476,12 +515,12 @@ public:
CallingConv::ID CallConv,
bool isVarArg) const override;
private:
- bool isExtFreeImpl(const Instruction *Ext) const override;
-
/// Keep a pointer to the AArch64Subtarget around so that we can
/// make the right decision when generating code for different targets.
const AArch64Subtarget *Subtarget;
+ bool isExtFreeImpl(const Instruction *Ext) const override;
+
void addTypeForNEON(MVT VT, MVT PromotedBitwiseVT);
void addDRTypeForNEON(MVT VT);
void addQRTypeForNEON(MVT VT);
@@ -502,6 +541,8 @@ private:
SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
SDValue ThisVal) const;
+ SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
bool isEligibleForTailCallOptimization(
@@ -545,12 +586,14 @@ private:
SDValue getAddrLarge(NodeTy *N, SelectionDAG &DAG, unsigned Flags = 0) const;
template <class NodeTy>
SDValue getAddr(NodeTy *N, SelectionDAG &DAG, unsigned Flags = 0) const;
+ SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerELFTLSDescCallSeq(SDValue SymAddr, const SDLoc &DL,
SelectionDAG &DAG) const;
+ SDValue LowerWindowsGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
@@ -569,6 +612,7 @@ private:
SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
@@ -592,6 +636,12 @@ private:
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain,
+ SDValue &Size,
+ SelectionDAG &DAG) const;
SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
std::vector<SDNode *> *Created) const override;
@@ -647,6 +697,8 @@ private:
SelectionDAG &DAG) const override;
bool shouldNormalizeToSelectSequence(LLVMContext &, EVT) const override;
+
+ void finalizeLowering(MachineFunction &MF) const override;
};
namespace AArch64 {
diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td
index 153bcf75cbcd..35cd7735ceb7 100644
--- a/lib/Target/AArch64/AArch64InstrAtomics.td
+++ b/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -409,13 +409,18 @@ let Predicates = [HasLSE] in {
defm : LDOPregister_patterns<"LDADD", "atomic_load_add">;
defm : LDOPregister_patterns<"LDSET", "atomic_load_or">;
defm : LDOPregister_patterns<"LDEOR", "atomic_load_xor">;
+ defm : LDOPregister_patterns<"LDCLR", "atomic_load_clr">;
defm : LDOPregister_patterns<"LDSMAX", "atomic_load_max">;
defm : LDOPregister_patterns<"LDSMIN", "atomic_load_min">;
defm : LDOPregister_patterns<"LDUMAX", "atomic_load_umax">;
defm : LDOPregister_patterns<"LDUMIN", "atomic_load_umin">;
defm : LDOPregister_patterns<"SWP", "atomic_swap">;
+ defm : CASregister_patterns<"CAS", "atomic_cmp_swap">;
+
+ // These two patterns are only needed for GlobalISel; SelectionDAG ISel
+ // converts an atomic load-sub into a sub plus an atomic load-add, and
+ // likewise converts load-and into load-clr.
defm : LDOPregister_patterns_mod<"LDADD", "atomic_load_sub", "SUB">;
defm : LDOPregister_patterns_mod<"LDCLR", "atomic_load_and", "ORN">;
- defm : CASregister_patterns<"CAS", "atomic_cmp_swap">;
}
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index 80c5092a4eed..1060c64f7b5d 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -167,7 +167,7 @@ def ExtendOperandLSL64 : AsmOperandClass {
// 8-bit floating-point immediate encodings.
def FPImmOperand : AsmOperandClass {
let Name = "FPImm";
- let ParserMethod = "tryParseFPImm";
+ let ParserMethod = "tryParseFPImm<true>";
let DiagnosticType = "InvalidFPImm";
}
@@ -179,20 +179,40 @@ def CondCode : AsmOperandClass {
// A 32-bit register parsed as 64-bit
def GPR32as64Operand : AsmOperandClass {
let Name = "GPR32as64";
+ let ParserMethod =
+ "tryParseGPROperand<false, RegConstraintEqualityTy::EqualsSubReg>";
}
def GPR32as64 : RegisterOperand<GPR32> {
let ParserMatchClass = GPR32as64Operand;
}
+// A 64-bit register parsed as 32-bit
+def GPR64as32Operand : AsmOperandClass {
+ let Name = "GPR64as32";
+ let ParserMethod =
+ "tryParseGPROperand<false, RegConstraintEqualityTy::EqualsSuperReg>";
+}
+def GPR64as32 : RegisterOperand<GPR64, "printGPR64as32"> {
+ let ParserMatchClass = GPR64as32Operand;
+}
+
// 8-bit immediate for AdvSIMD where 64-bit values of the form:
// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
// are encoded as the eight bit value 'abcdefgh'.
def SIMDImmType10Operand : AsmOperandClass { let Name = "SIMDImmType10"; }
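A minimal standalone sketch of the expansion described in the comment above, with hypothetical test values: each bit of the 8-bit immediate selects whether the corresponding byte of the 64-bit value is 0x00 or 0xff.

  #include <cassert>
  #include <cstdint>

  static uint64_t expandImmType10(uint8_t Imm) {
    uint64_t Result = 0;
    for (unsigned Byte = 0; Byte < 8; ++Byte)
      if (Imm & (1u << Byte))
        Result |= UINT64_C(0xff) << (8 * Byte);
    return Result;
  }

  int main() {
    assert(expandImmType10(0x00) == UINT64_C(0x0000000000000000));
    assert(expandImmType10(0xff) == UINT64_C(0xffffffffffffffff));
    assert(expandImmType10(0x81) == UINT64_C(0xff000000000000ff));
    return 0;
  }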
-// Authenticated loads for v8.3 can have scaled 10-bit immediate offsets.
-def SImm10s8Operand : AsmOperandClass {
- let Name = "SImm10s8";
- let DiagnosticType = "InvalidMemoryIndexedSImm10";
+class UImmScaledMemoryIndexed<int Width, int Scale> : AsmOperandClass {
+ let Name = "UImm" # Width # "s" # Scale;
+ let DiagnosticType = "InvalidMemoryIndexed" # Scale # "UImm" # Width;
+ let RenderMethod = "addImmScaledOperands<" # Scale # ">";
+ let PredicateMethod = "isUImmScaled<" # Width # ", " # Scale # ">";
+}
+
+class SImmScaledMemoryIndexed<int Width, int Scale> : AsmOperandClass {
+ let Name = "SImm" # Width # "s" # Scale;
+ let DiagnosticType = "InvalidMemoryIndexed" # Scale # "SImm" # Width;
+ let RenderMethod = "addImmScaledOperands<" # Scale # ">";
+ let PredicateMethod = "isSImmScaled<" # Width # ", " # Scale # ">";
}
//===----------------------------------------------------------------------===//
@@ -221,31 +241,66 @@ def adrlabel : Operand<i64> {
let ParserMatchClass = AdrOperand;
}
+class SImmOperand<int width> : AsmOperandClass {
+ let Name = "SImm" # width;
+ let DiagnosticType = "InvalidMemoryIndexedSImm" # width;
+ let RenderMethod = "addImmOperands";
+ let PredicateMethod = "isSImm<" # width # ">";
+}
+
+// Authenticated loads for v8.3 can have scaled 10-bit immediate offsets.
+def SImm10s8Operand : SImmScaledMemoryIndexed<10, 8>;
def simm10Scaled : Operand<i64> {
let ParserMatchClass = SImm10s8Operand;
let DecoderMethod = "DecodeSImm<10>";
let PrintMethod = "printImmScale<8>";
}
-// simm9 predicate - True if the immediate is in the range [-256, 255].
-def SImm9Operand : AsmOperandClass {
- let Name = "SImm9";
- let DiagnosticType = "InvalidMemoryIndexedSImm9";
+// uimm6 predicate - True if the immediate is in the range [0, 63].
+def UImm6Operand : AsmOperandClass {
+ let Name = "UImm6";
+ let DiagnosticType = "InvalidImm0_63";
+}
+
+def uimm6 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= 0 && Imm < 64; }]> {
+ let ParserMatchClass = UImm6Operand;
}
+
+def SImm9Operand : SImmOperand<9>;
def simm9 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= -256 && Imm < 256; }]> {
let ParserMatchClass = SImm9Operand;
+ let DecoderMethod = "DecodeSImm<9>";
+}
+
+def SImm8Operand : SImmOperand<8>;
+def simm8 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= -128 && Imm < 127; }]> {
+ let ParserMatchClass = SImm8Operand;
+ let DecoderMethod = "DecodeSImm<8>";
+}
+
+def SImm6Operand : SImmOperand<6>;
+def simm6_32b : Operand<i32>, ImmLeaf<i32, [{ return Imm >= -32 && Imm < 32; }]> {
+ let ParserMatchClass = SImm6Operand;
+ let DecoderMethod = "DecodeSImm<6>";
+}
+
+def SImm5Operand : SImmOperand<5>;
+def simm5_64b : Operand<i64>, ImmLeaf<i64, [{ return Imm >= -16 && Imm < 16; }]> {
+ let ParserMatchClass = SImm5Operand;
+ let DecoderMethod = "DecodeSImm<5>";
+}
+
+def simm5_32b : Operand<i32>, ImmLeaf<i32, [{ return Imm >= -16 && Imm < 16; }]> {
+ let ParserMatchClass = SImm5Operand;
+ let DecoderMethod = "DecodeSImm<5>";
}
// simm7sN predicate - True if the immediate is a multiple of N in the range
// [-64 * N, 63 * N].
-class SImm7Scaled<int Scale> : AsmOperandClass {
- let Name = "SImm7s" # Scale;
- let DiagnosticType = "InvalidMemoryIndexed" # Scale # "SImm7";
-}
-def SImm7s4Operand : SImm7Scaled<4>;
-def SImm7s8Operand : SImm7Scaled<8>;
-def SImm7s16Operand : SImm7Scaled<16>;
+def SImm7s4Operand : SImmScaledMemoryIndexed<7, 4>;
+def SImm7s8Operand : SImmScaledMemoryIndexed<7, 8>;
+def SImm7s16Operand : SImmScaledMemoryIndexed<7, 16>;
def simm7s4 : Operand<i32> {
let ParserMatchClass = SImm7s4Operand;
@@ -268,9 +323,107 @@ def am_indexed7s32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S32", []>;
def am_indexed7s64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S64", []>;
def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>;
+// uimm5sN predicate - True if the immediate is a multiple of N in the range
+// [0 * N, 31 * N].
+def UImm5s2Operand : UImmScaledMemoryIndexed<5, 2>;
+def UImm5s4Operand : UImmScaledMemoryIndexed<5, 4>;
+def UImm5s8Operand : UImmScaledMemoryIndexed<5, 8>;
+
+def uimm5s2 : Operand<i64>, ImmLeaf<i64,
+ [{ return Imm >= 0 && Imm < (32*2) && ((Imm % 2) == 0); }]> {
+ let ParserMatchClass = UImm5s2Operand;
+ let PrintMethod = "printImmScale<2>";
+}
+def uimm5s4 : Operand<i64>, ImmLeaf<i64,
+ [{ return Imm >= 0 && Imm < (32*4) && ((Imm % 4) == 0); }]> {
+ let ParserMatchClass = UImm5s4Operand;
+ let PrintMethod = "printImmScale<4>";
+}
+def uimm5s8 : Operand<i64>, ImmLeaf<i64,
+ [{ return Imm >= 0 && Imm < (32*8) && ((Imm % 8) == 0); }]> {
+ let ParserMatchClass = UImm5s8Operand;
+ let PrintMethod = "printImmScale<8>";
+}
+
+// uimm6sN predicate - True if the immediate is a multiple of N in the range
+// [0 * N, 63 * N].
+def UImm6s1Operand : UImmScaledMemoryIndexed<6, 1>;
+def UImm6s2Operand : UImmScaledMemoryIndexed<6, 2>;
+def UImm6s4Operand : UImmScaledMemoryIndexed<6, 4>;
+def UImm6s8Operand : UImmScaledMemoryIndexed<6, 8>;
+
+def uimm6s1 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= 0 && Imm < 64; }]> {
+ let ParserMatchClass = UImm6s1Operand;
+}
+def uimm6s2 : Operand<i64>, ImmLeaf<i64,
+[{ return Imm >= 0 && Imm < (64*2) && ((Imm % 2) == 0); }]> {
+ let PrintMethod = "printImmScale<2>";
+ let ParserMatchClass = UImm6s2Operand;
+}
+def uimm6s4 : Operand<i64>, ImmLeaf<i64,
+[{ return Imm >= 0 && Imm < (64*4) && ((Imm % 4) == 0); }]> {
+ let PrintMethod = "printImmScale<4>";
+ let ParserMatchClass = UImm6s4Operand;
+}
+def uimm6s8 : Operand<i64>, ImmLeaf<i64,
+[{ return Imm >= 0 && Imm < (64*8) && ((Imm % 8) == 0); }]> {
+ let PrintMethod = "printImmScale<8>";
+ let ParserMatchClass = UImm6s8Operand;
+}
+
+// simm6sN predicate - True if the immediate is a multiple of N in the range
+// [-32 * N, 31 * N].
+def SImm6s1Operand : SImmScaledMemoryIndexed<6, 1>;
+def simm6s1 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= -32 && Imm < 32; }]> {
+ let ParserMatchClass = SImm6s1Operand;
+ let DecoderMethod = "DecodeSImm<6>";
+}
+
+// simm4sN predicate - True if the immediate is a multiple of N in the range
+// [-8 * N, 7 * N].
+def SImm4s1Operand : SImmScaledMemoryIndexed<4, 1>;
+def SImm4s2Operand : SImmScaledMemoryIndexed<4, 2>;
+def SImm4s3Operand : SImmScaledMemoryIndexed<4, 3>;
+def SImm4s4Operand : SImmScaledMemoryIndexed<4, 4>;
+def SImm4s16Operand : SImmScaledMemoryIndexed<4, 16>;
+
+def simm4s1 : Operand<i64>, ImmLeaf<i64,
+[{ return Imm >=-8 && Imm <= 7; }]> {
+ let ParserMatchClass = SImm4s1Operand;
+ let DecoderMethod = "DecodeSImm<4>";
+}
+
+def simm4s2 : Operand<i64>, ImmLeaf<i64,
+[{ return Imm >=-16 && Imm <= 14 && (Imm % 2) == 0x0; }]> {
+ let PrintMethod = "printImmScale<2>";
+ let ParserMatchClass = SImm4s2Operand;
+ let DecoderMethod = "DecodeSImm<4>";
+}
+
+def simm4s3 : Operand<i64>, ImmLeaf<i64,
+[{ return Imm >=-24 && Imm <= 21 && (Imm % 3) == 0x0; }]> {
+ let PrintMethod = "printImmScale<3>";
+ let ParserMatchClass = SImm4s3Operand;
+ let DecoderMethod = "DecodeSImm<4>";
+}
+
+def simm4s4 : Operand<i64>, ImmLeaf<i64,
+[{ return Imm >=-32 && Imm <= 28 && (Imm % 4) == 0x0; }]> {
+ let PrintMethod = "printImmScale<4>";
+ let ParserMatchClass = SImm4s4Operand;
+ let DecoderMethod = "DecodeSImm<4>";
+}
+def simm4s16 : Operand<i64>, ImmLeaf<i64,
+[{ return Imm >=-128 && Imm <= 112 && (Imm % 16) == 0x0; }]> {
+ let PrintMethod = "printImmScale<16>";
+ let ParserMatchClass = SImm4s16Operand;
+ let DecoderMethod = "DecodeSImm<4>";
+}
+
class AsmImmRange<int Low, int High> : AsmOperandClass {
let Name = "Imm" # Low # "_" # High;
let DiagnosticType = "InvalidImm" # Low # "_" # High;
+ let RenderMethod = "addImmOperands";
let PredicateMethod = "isImmInRange<" # Low # "," # High # ">";
}
@@ -489,27 +642,35 @@ def logical_imm64_XFORM : SDNodeXForm<imm, [{
let DiagnosticType = "LogicalSecondSource" in {
def LogicalImm32Operand : AsmOperandClass {
let Name = "LogicalImm32";
+ let PredicateMethod = "isLogicalImm<int32_t>";
+ let RenderMethod = "addLogicalImmOperands<int32_t>";
}
def LogicalImm64Operand : AsmOperandClass {
let Name = "LogicalImm64";
+ let PredicateMethod = "isLogicalImm<int64_t>";
+ let RenderMethod = "addLogicalImmOperands<int64_t>";
}
def LogicalImm32NotOperand : AsmOperandClass {
let Name = "LogicalImm32Not";
+ let PredicateMethod = "isLogicalImm<int32_t>";
+ let RenderMethod = "addLogicalImmNotOperands<int32_t>";
}
def LogicalImm64NotOperand : AsmOperandClass {
let Name = "LogicalImm64Not";
+ let PredicateMethod = "isLogicalImm<int64_t>";
+ let RenderMethod = "addLogicalImmNotOperands<int64_t>";
}
}
def logical_imm32 : Operand<i32>, IntImmLeaf<i32, [{
return AArch64_AM::isLogicalImmediate(Imm.getZExtValue(), 32);
}], logical_imm32_XFORM> {
- let PrintMethod = "printLogicalImm32";
+ let PrintMethod = "printLogicalImm<int32_t>";
let ParserMatchClass = LogicalImm32Operand;
}
def logical_imm64 : Operand<i64>, IntImmLeaf<i64, [{
return AArch64_AM::isLogicalImmediate(Imm.getZExtValue(), 64);
}], logical_imm64_XFORM> {
- let PrintMethod = "printLogicalImm64";
+ let PrintMethod = "printLogicalImm<int64_t>";
let ParserMatchClass = LogicalImm64Operand;
}
def logical_imm32_not : Operand<i32> {
@@ -672,11 +833,13 @@ def move_vec_shift : Operand<i32> {
let DiagnosticType = "AddSubSecondSource" in {
def AddSubImmOperand : AsmOperandClass {
let Name = "AddSubImm";
- let ParserMethod = "tryParseAddSubImm";
+ let ParserMethod = "tryParseImmWithOptionalShift";
+ let RenderMethod = "addImmWithOptionalShiftOperands<12>";
}
def AddSubImmNegOperand : AsmOperandClass {
let Name = "AddSubImmNeg";
- let ParserMethod = "tryParseAddSubImm";
+ let ParserMethod = "tryParseImmWithOptionalShift";
+ let RenderMethod = "addImmNegWithOptionalShiftOperands<12>";
}
}
// An ADD/SUB immediate shifter operand:
@@ -797,52 +960,48 @@ def fpimm0 : FPImmLeaf<fAny, [{
}]>;
// Vector lane operands
-class AsmVectorIndex<string Suffix> : AsmOperandClass {
- let Name = "VectorIndex" # Suffix;
- let DiagnosticType = "InvalidIndex" # Suffix;
-}
-def VectorIndex1Operand : AsmVectorIndex<"1">;
-def VectorIndexBOperand : AsmVectorIndex<"B">;
-def VectorIndexHOperand : AsmVectorIndex<"H">;
-def VectorIndexSOperand : AsmVectorIndex<"S">;
-def VectorIndexDOperand : AsmVectorIndex<"D">;
-
-def VectorIndex1 : Operand<i64>, ImmLeaf<i64, [{
- return ((uint64_t)Imm) == 1;
-}]> {
- let ParserMatchClass = VectorIndex1Operand;
- let PrintMethod = "printVectorIndex";
- let MIOperandInfo = (ops i64imm);
-}
-def VectorIndexB : Operand<i64>, ImmLeaf<i64, [{
- return ((uint64_t)Imm) < 16;
-}]> {
- let ParserMatchClass = VectorIndexBOperand;
- let PrintMethod = "printVectorIndex";
- let MIOperandInfo = (ops i64imm);
-}
-def VectorIndexH : Operand<i64>, ImmLeaf<i64, [{
- return ((uint64_t)Imm) < 8;
-}]> {
- let ParserMatchClass = VectorIndexHOperand;
- let PrintMethod = "printVectorIndex";
- let MIOperandInfo = (ops i64imm);
+class AsmVectorIndex<int Min, int Max, string NamePrefix=""> : AsmOperandClass {
+ let Name = NamePrefix # "IndexRange" # Min # "_" # Max;
+ let DiagnosticType = "Invalid" # Name;
+ let PredicateMethod = "isVectorIndex<" # Min # ", " # Max # ">";
+ let RenderMethod = "addVectorIndexOperands";
}
-def VectorIndexS : Operand<i64>, ImmLeaf<i64, [{
- return ((uint64_t)Imm) < 4;
-}]> {
- let ParserMatchClass = VectorIndexSOperand;
- let PrintMethod = "printVectorIndex";
- let MIOperandInfo = (ops i64imm);
-}
-def VectorIndexD : Operand<i64>, ImmLeaf<i64, [{
- return ((uint64_t)Imm) < 2;
-}]> {
- let ParserMatchClass = VectorIndexDOperand;
+
+class AsmVectorIndexOpnd<AsmOperandClass mc, code pred>
+ : Operand<i64>, ImmLeaf<i64, pred> {
+ let ParserMatchClass = mc;
let PrintMethod = "printVectorIndex";
- let MIOperandInfo = (ops i64imm);
}
+def VectorIndex1Operand : AsmVectorIndex<1, 1>;
+def VectorIndexBOperand : AsmVectorIndex<0, 15>;
+def VectorIndexHOperand : AsmVectorIndex<0, 7>;
+def VectorIndexSOperand : AsmVectorIndex<0, 3>;
+def VectorIndexDOperand : AsmVectorIndex<0, 1>;
+
+def VectorIndex1 : AsmVectorIndexOpnd<VectorIndex1Operand, [{ return ((uint64_t)Imm) == 1; }]>;
+def VectorIndexB : AsmVectorIndexOpnd<VectorIndexBOperand, [{ return ((uint64_t)Imm) < 16; }]>;
+def VectorIndexH : AsmVectorIndexOpnd<VectorIndexHOperand, [{ return ((uint64_t)Imm) < 8; }]>;
+def VectorIndexS : AsmVectorIndexOpnd<VectorIndexSOperand, [{ return ((uint64_t)Imm) < 4; }]>;
+def VectorIndexD : AsmVectorIndexOpnd<VectorIndexDOperand, [{ return ((uint64_t)Imm) < 2; }]>;
+
+def SVEVectorIndexExtDupBOperand : AsmVectorIndex<0, 63, "SVE">;
+def SVEVectorIndexExtDupHOperand : AsmVectorIndex<0, 31, "SVE">;
+def SVEVectorIndexExtDupSOperand : AsmVectorIndex<0, 15, "SVE">;
+def SVEVectorIndexExtDupDOperand : AsmVectorIndex<0, 7, "SVE">;
+def SVEVectorIndexExtDupQOperand : AsmVectorIndex<0, 3, "SVE">;
+
+def sve_elm_idx_extdup_b
+ : AsmVectorIndexOpnd<SVEVectorIndexExtDupBOperand, [{ return ((uint64_t)Imm) < 64; }]>;
+def sve_elm_idx_extdup_h
+ : AsmVectorIndexOpnd<SVEVectorIndexExtDupHOperand, [{ return ((uint64_t)Imm) < 32; }]>;
+def sve_elm_idx_extdup_s
+ : AsmVectorIndexOpnd<SVEVectorIndexExtDupSOperand, [{ return ((uint64_t)Imm) < 16; }]>;
+def sve_elm_idx_extdup_d
+ : AsmVectorIndexOpnd<SVEVectorIndexExtDupDOperand, [{ return ((uint64_t)Imm) < 8; }]>;
+def sve_elm_idx_extdup_q
+ : AsmVectorIndexOpnd<SVEVectorIndexExtDupQOperand, [{ return ((uint64_t)Imm) < 4; }]>;
+
// 8-bit immediate for AdvSIMD where 64-bit values of the form:
// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
// are encoded as the eight bit value 'abcdefgh'.
@@ -1224,6 +1383,7 @@ def am_brcond : Operand<OtherVT> {
let DecoderMethod = "DecodePCRelLabel19";
let PrintMethod = "printAlignedLabel";
let ParserMatchClass = PCRelLabel19Operand;
+ let OperandType = "OPERAND_PCREL";
}
class BranchCond : I<(outs), (ins ccode:$cond, am_brcond:$target),
@@ -1279,18 +1439,20 @@ def am_tbrcond : Operand<OtherVT> {
let EncoderMethod = "getTestBranchTargetOpValue";
let PrintMethod = "printAlignedLabel";
let ParserMatchClass = BranchTarget14Operand;
+ let OperandType = "OPERAND_PCREL";
}
// AsmOperand classes to emit (or not) special diagnostics
def TBZImm0_31Operand : AsmOperandClass {
let Name = "TBZImm0_31";
let PredicateMethod = "isImmInRange<0,31>";
- let RenderMethod = "addImm0_31Operands";
+ let RenderMethod = "addImmOperands";
}
def TBZImm32_63Operand : AsmOperandClass {
let Name = "Imm32_63";
let PredicateMethod = "isImmInRange<32,63>";
let DiagnosticType = "InvalidImm0_63";
+ let RenderMethod = "addImmOperands";
}
class tbz_imm0_31<AsmOperandClass matcher> : Operand<i64>, ImmLeaf<i64, [{
@@ -1355,11 +1517,13 @@ def am_b_target : Operand<OtherVT> {
let EncoderMethod = "getBranchTargetOpValue";
let PrintMethod = "printAlignedLabel";
let ParserMatchClass = BranchTarget26Operand;
+ let OperandType = "OPERAND_PCREL";
}
def am_bl_target : Operand<i64> {
let EncoderMethod = "getBranchTargetOpValue";
let PrintMethod = "printAlignedLabel";
let ParserMatchClass = BranchTarget26Operand;
+ let OperandType = "OPERAND_PCREL";
}
class BImm<bit op, dag iops, string asm, list<dag> pattern>
@@ -1458,6 +1622,30 @@ class SignAuthTwoOperand<bits<4> opc, string asm,
let Inst{4-0} = Rd;
}
+// Base class for the Armv8.4-A 8 and 16-bit flag manipulation instructions
+class BaseFlagManipulation<bit sf, bit sz, dag iops, string asm, string ops>
+ : I<(outs), iops, asm, ops, "", []>,
+ Sched<[WriteI, ReadI, ReadI]> {
+ let Uses = [NZCV];
+ bits<5> Rn;
+ let Inst{31} = sf;
+ let Inst{30-15} = 0b0111010000000000;
+ let Inst{14} = sz;
+ let Inst{13-10} = 0b0010;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = 0b01101;
+}
+
+class FlagRotate<dag iops, string asm, string ops>
+ : BaseFlagManipulation<0b1, 0b0, iops, asm, ops> {
+ bits<6> imm;
+ bits<4> mask;
+ let Inst{20-15} = imm;
+ let Inst{13-10} = 0b0001;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = mask;
+}
+
//---
// Basic two-operand data processing instructions.
//---
@@ -2579,7 +2767,7 @@ class BaseLoadStoreUI<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
let DecoderMethod = "DecodeUnsignedLdStInstruction";
}
-multiclass LoadUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+multiclass LoadUI<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
Operand indextype, string asm, list<dag> pattern> {
let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
def ui : BaseLoadStoreUI<sz, V, opc, (outs regtype:$Rt),
@@ -2591,7 +2779,7 @@ multiclass LoadUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
(!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>;
}
-multiclass StoreUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+multiclass StoreUI<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
Operand indextype, string asm, list<dag> pattern> {
let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
def ui : BaseLoadStoreUI<sz, V, opc, (outs),
@@ -2647,10 +2835,11 @@ def am_ldrlit : Operand<iPTR> {
let DecoderMethod = "DecodePCRelLabel19";
let PrintMethod = "printAlignedLabel";
let ParserMatchClass = PCRelLabel19Operand;
+ let OperandType = "OPERAND_PCREL";
}
let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
-class LoadLiteral<bits<2> opc, bit V, RegisterClass regtype, string asm>
+class LoadLiteral<bits<2> opc, bit V, RegisterOperand regtype, string asm>
: I<(outs regtype:$Rt), (ins am_ldrlit:$label),
asm, "\t$Rt, $label", "", []>,
Sched<[WriteLD]> {
@@ -2761,7 +2950,7 @@ def ro64 : ROAddrMode<ro_Windexed64, ro_Xindexed64, ro_Wextend64, ro_Xextend64>;
def ro128 : ROAddrMode<ro_Windexed128, ro_Xindexed128, ro_Wextend128,
ro_Xextend128>;
-class LoadStore8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+class LoadStore8RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, dag ins, dag outs, list<dag> pat>
: I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
bits<5> Rt;
@@ -2783,11 +2972,11 @@ class LoadStore8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
let Inst{4-0} = Rt;
}
-class ROInstAlias<string asm, RegisterClass regtype, Instruction INST>
+class ROInstAlias<string asm, RegisterOperand regtype, Instruction INST>
: InstAlias<asm # "\t$Rt, [$Rn, $Rm]",
(INST regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, 0, 0)>;
-multiclass Load8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+multiclass Load8RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator loadop> {
let AddedComplexity = 10 in
def roW : LoadStore8RO<sz, V, opc, regtype, asm,
@@ -2814,7 +3003,7 @@ multiclass Load8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
-multiclass Store8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+multiclass Store8RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator storeop> {
let AddedComplexity = 10 in
def roW : LoadStore8RO<sz, V, opc, regtype, asm, (outs),
@@ -2839,7 +3028,7 @@ multiclass Store8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
-class LoadStore16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+class LoadStore16RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, dag ins, dag outs, list<dag> pat>
: I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
bits<5> Rt;
@@ -2861,7 +3050,7 @@ class LoadStore16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
let Inst{4-0} = Rt;
}
-multiclass Load16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+multiclass Load16RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator loadop> {
let AddedComplexity = 10 in
def roW : LoadStore16RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
@@ -2886,7 +3075,7 @@ multiclass Load16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
-multiclass Store16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+multiclass Store16RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator storeop> {
let AddedComplexity = 10 in
def roW : LoadStore16RO<sz, V, opc, regtype, asm, (outs),
@@ -2911,7 +3100,7 @@ multiclass Store16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
-class LoadStore32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+class LoadStore32RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, dag ins, dag outs, list<dag> pat>
: I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
bits<5> Rt;
@@ -2933,7 +3122,7 @@ class LoadStore32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
let Inst{4-0} = Rt;
}
-multiclass Load32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+multiclass Load32RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator loadop> {
let AddedComplexity = 10 in
def roW : LoadStore32RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
@@ -2958,7 +3147,7 @@ multiclass Load32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
-multiclass Store32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+multiclass Store32RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator storeop> {
let AddedComplexity = 10 in
def roW : LoadStore32RO<sz, V, opc, regtype, asm, (outs),
@@ -2983,7 +3172,7 @@ multiclass Store32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
-class LoadStore64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+class LoadStore64RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, dag ins, dag outs, list<dag> pat>
: I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
bits<5> Rt;
@@ -3005,7 +3194,7 @@ class LoadStore64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
let Inst{4-0} = Rt;
}
-multiclass Load64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+multiclass Load64RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator loadop> {
let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
def roW : LoadStore64RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
@@ -3030,7 +3219,7 @@ multiclass Load64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
-multiclass Store64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+multiclass Store64RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator storeop> {
let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
def roW : LoadStore64RO<sz, V, opc, regtype, asm, (outs),
@@ -3055,7 +3244,7 @@ multiclass Store64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
-class LoadStore128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+class LoadStore128RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, dag ins, dag outs, list<dag> pat>
: I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
bits<5> Rt;
@@ -3077,7 +3266,7 @@ class LoadStore128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
let Inst{4-0} = Rt;
}
-multiclass Load128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+multiclass Load128RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator loadop> {
let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
def roW : LoadStore128RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
@@ -3102,7 +3291,7 @@ multiclass Load128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
-multiclass Store128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+multiclass Store128RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator storeop> {
let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
def roW : LoadStore128RO<sz, V, opc, regtype, asm, (outs),
@@ -3216,7 +3405,33 @@ class BaseLoadStoreUnscale<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
let DecoderMethod = "DecodeSignedLdStInstruction";
}
-multiclass LoadUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+// Armv8.4 LDAPR & STLR with Immediate Offset instruction
+multiclass BaseLoadUnscaleV84<string asm, bits<2> sz, bits<2> opc,
+ RegisterOperand regtype > {
+ def i : BaseLoadStoreUnscale<sz, 0, opc, (outs regtype:$Rt),
+ (ins GPR64sp:$Rn, simm9:$offset), asm, []>,
+ Sched<[WriteST]> {
+ let Inst{29} = 0;
+ let Inst{24} = 1;
+ }
+ def : InstAlias<asm # "\t$Rt, [$Rn]",
+ (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+multiclass BaseStoreUnscaleV84<string asm, bits<2> sz, bits<2> opc,
+ RegisterOperand regtype > {
+ def i : BaseLoadStoreUnscale<sz, 0, opc, (outs),
+ (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
+ asm, []>,
+ Sched<[WriteST]> {
+ let Inst{29} = 0;
+ let Inst{24} = 1;
+ }
+ def : InstAlias<asm # "\t$Rt, [$Rn]",
+ (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+multiclass LoadUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, list<dag> pattern> {
let AddedComplexity = 1 in // try this before LoadUI
def i : BaseLoadStoreUnscale<sz, V, opc, (outs regtype:$Rt),
@@ -3227,7 +3442,7 @@ multiclass LoadUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
(!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
}
-multiclass StoreUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+multiclass StoreUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, list<dag> pattern> {
let AddedComplexity = 1 in // try this before StoreUI
def i : BaseLoadStoreUnscale<sz, V, opc, (outs),
@@ -3324,7 +3539,7 @@ class BaseLoadStorePreIdx<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
let hasSideEffects = 0 in {
let mayStore = 0, mayLoad = 1 in
-class LoadPreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+class LoadPreIdx<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm>
: BaseLoadStorePreIdx<sz, V, opc,
(outs GPR64sp:$wback, regtype:$Rt),
@@ -3333,7 +3548,7 @@ class LoadPreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
Sched<[WriteLD, WriteAdr]>;
let mayStore = 1, mayLoad = 0 in
-class StorePreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+class StorePreIdx<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, SDPatternOperator storeop, ValueType Ty>
: BaseLoadStorePreIdx<sz, V, opc,
(outs GPR64sp:$wback),
@@ -3370,16 +3585,16 @@ class BaseLoadStorePostIdx<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
let hasSideEffects = 0 in {
let mayStore = 0, mayLoad = 1 in
-class LoadPostIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+class LoadPostIdx<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm>
: BaseLoadStorePostIdx<sz, V, opc,
(outs GPR64sp:$wback, regtype:$Rt),
(ins GPR64sp:$Rn, simm9:$offset),
asm, "$Rn = $wback,@earlyclobber $wback", []>,
- Sched<[WriteLD, WriteI]>;
+ Sched<[WriteLD, WriteAdr]>;
let mayStore = 1, mayLoad = 0 in
-class StorePostIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+class StorePostIdx<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, SDPatternOperator storeop, ValueType Ty>
: BaseLoadStorePostIdx<sz, V, opc,
(outs GPR64sp:$wback),
@@ -3387,7 +3602,7 @@ class StorePostIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
asm, "$Rn = $wback,@earlyclobber $wback",
[(set GPR64sp:$wback,
(storeop (Ty regtype:$Rt), GPR64sp:$Rn, simm9:$offset))]>,
- Sched<[WriteAdr, WriteST, ReadAdrBase]>;
+ Sched<[WriteAdr, WriteST]>;
} // hasSideEffects = 0
@@ -3417,7 +3632,7 @@ class BaseLoadStorePairOffset<bits<2> opc, bit V, bit L, dag oops, dag iops,
let DecoderMethod = "DecodePairLdStInstruction";
}
-multiclass LoadPairOffset<bits<2> opc, bit V, RegisterClass regtype,
+multiclass LoadPairOffset<bits<2> opc, bit V, RegisterOperand regtype,
Operand indextype, string asm> {
let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in
def i : BaseLoadStorePairOffset<opc, V, 1,
@@ -3431,7 +3646,7 @@ multiclass LoadPairOffset<bits<2> opc, bit V, RegisterClass regtype,
}
-multiclass StorePairOffset<bits<2> opc, bit V, RegisterClass regtype,
+multiclass StorePairOffset<bits<2> opc, bit V, RegisterOperand regtype,
Operand indextype, string asm> {
let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in
def i : BaseLoadStorePairOffset<opc, V, 0, (outs),
@@ -3468,7 +3683,7 @@ class BaseLoadStorePairPreIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,
let hasSideEffects = 0 in {
let mayStore = 0, mayLoad = 1 in
-class LoadPairPreIdx<bits<2> opc, bit V, RegisterClass regtype,
+class LoadPairPreIdx<bits<2> opc, bit V, RegisterOperand regtype,
Operand indextype, string asm>
: BaseLoadStorePairPreIdx<opc, V, 1,
(outs GPR64sp:$wback, regtype:$Rt, regtype:$Rt2),
@@ -3476,7 +3691,7 @@ class LoadPairPreIdx<bits<2> opc, bit V, RegisterClass regtype,
Sched<[WriteLD, WriteLDHi, WriteAdr]>;
let mayStore = 1, mayLoad = 0 in
-class StorePairPreIdx<bits<2> opc, bit V, RegisterClass regtype,
+class StorePairPreIdx<bits<2> opc, bit V, RegisterOperand regtype,
Operand indextype, string asm>
: BaseLoadStorePairPreIdx<opc, V, 0, (outs GPR64sp:$wback),
(ins regtype:$Rt, regtype:$Rt2,
@@ -3509,7 +3724,7 @@ class BaseLoadStorePairPostIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,
let hasSideEffects = 0 in {
let mayStore = 0, mayLoad = 1 in
-class LoadPairPostIdx<bits<2> opc, bit V, RegisterClass regtype,
+class LoadPairPostIdx<bits<2> opc, bit V, RegisterOperand regtype,
Operand idxtype, string asm>
: BaseLoadStorePairPostIdx<opc, V, 1,
(outs GPR64sp:$wback, regtype:$Rt, regtype:$Rt2),
@@ -3517,7 +3732,7 @@ class LoadPairPostIdx<bits<2> opc, bit V, RegisterClass regtype,
Sched<[WriteLD, WriteLDHi, WriteAdr]>;
let mayStore = 1, mayLoad = 0 in
-class StorePairPostIdx<bits<2> opc, bit V, RegisterClass regtype,
+class StorePairPostIdx<bits<2> opc, bit V, RegisterOperand regtype,
Operand idxtype, string asm>
: BaseLoadStorePairPostIdx<opc, V, 0, (outs GPR64sp:$wback),
(ins regtype:$Rt, regtype:$Rt2,
@@ -4559,11 +4774,24 @@ class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<3> size, bits<5> opcode,
}
class BaseSIMDThreeSameVectorDot<bit Q, bit U, string asm, string kind1,
- string kind2> :
- BaseSIMDThreeSameVector<Q, U, 0b100, 0b10010, V128, asm, kind1, [] > {
+ string kind2, RegisterOperand RegType,
+ ValueType AccumType, ValueType InputType,
+ SDPatternOperator OpNode> :
+ BaseSIMDThreeSameVectorTied<Q, U, 0b100, 0b10010, RegType, asm, kind1,
+ [(set (AccumType RegType:$dst),
+ (OpNode (AccumType RegType:$Rd),
+ (InputType RegType:$Rn),
+ (InputType RegType:$Rm)))]> {
let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}");
}
+multiclass SIMDThreeSameVectorDot<bit U, string asm, SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDThreeSameVectorDot<0, U, asm, ".2s", ".8b", V64,
+ v2i32, v8i8, OpNode>;
+ def v16i8 : BaseSIMDThreeSameVectorDot<1, U, asm, ".4s", ".16b", V128,
+ v4i32, v16i8, OpNode>;
+}
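A usage sketch for this multiclass (assumed, not shown in this hunk): the ARMv8.2 dot-product instructions would be declared roughly as

// Illustrative instantiation, gated on the dot-product feature.
let Predicates = [HasDotProd] in {
  defm SDOT : SIMDThreeSameVectorDot<0, "sdot", int_aarch64_neon_sdot>;
  defm UDOT : SIMDThreeSameVectorDot<1, "udot", int_aarch64_neon_udot>;
}

so the tied accumulate pattern above is emitted for both the 64-bit and 128-bit vector forms.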
+
// All operand sizes distinguished in the encoding.
multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
@@ -5492,7 +5720,7 @@ multiclass SIMDDifferentThreeVectorBD<bit U, bits<4> opc, string asm,
def v16i8 : BaseSIMDDifferentThreeVector<U, 0b001, opc,
V128, V128, V128,
asm#"2", ".8h", ".16b", ".16b", []>;
- let Predicates = [HasCrypto] in {
+ let Predicates = [HasAES] in {
def v1i64 : BaseSIMDDifferentThreeVector<U, 0b110, opc,
V128, V64, V64,
asm, ".1q", ".1d", ".1d", []>;
@@ -5911,10 +6139,10 @@ multiclass SIMDThreeScalarHS<bit U, bits<5> opc, string asm,
multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def v1i32: BaseSIMDThreeScalarTied<U, 0b10, R, opc, (outs FPR32:$dst),
- (ins FPR32:$Rd, FPR32:$Rn, FPR32:$Rm),
+ (ins FPR32:$Rd, FPR32:$Rn, FPR32:$Rm),
asm, []>;
def v1i16: BaseSIMDThreeScalarTied<U, 0b01, R, opc, (outs FPR16:$dst),
- (ins FPR16:$Rd, FPR16:$Rn, FPR16:$Rm),
+ (ins FPR16:$Rd, FPR16:$Rn, FPR16:$Rm),
asm, []>;
}
@@ -6993,14 +7221,31 @@ class BaseSIMDIndexedTied<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc,
// ARMv8.2 Index Dot product instructions
class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, string asm, string dst_kind,
- string lhs_kind, string rhs_kind> :
- BaseSIMDIndexedTied<Q, U, 0b0, 0b10, 0b1110, V128, V128, V128, VectorIndexS,
- asm, "", dst_kind, lhs_kind, rhs_kind, []> {
+ string lhs_kind, string rhs_kind,
+ RegisterOperand RegType,
+ ValueType AccumType, ValueType InputType,
+ SDPatternOperator OpNode> :
+ BaseSIMDIndexedTied<Q, U, 0b0, 0b10, 0b1110, RegType, RegType, V128,
+ VectorIndexS, asm, "", dst_kind, lhs_kind, rhs_kind,
+ [(set (AccumType RegType:$dst),
+ (AccumType (OpNode (AccumType RegType:$Rd),
+ (InputType RegType:$Rn),
+ (InputType (bitconvert (AccumType
+ (AArch64duplane32 (v4i32 V128:$Rm),
+ VectorIndexS:$idx)))))))]> {
bits<2> idx;
let Inst{21} = idx{0}; // L
let Inst{11} = idx{1}; // H
}
+multiclass SIMDThreeSameVectorDotIndex<bit U, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDThreeSameVectorDotIndex<0, U, asm, ".2s", ".8b", ".4b", V64,
+ v2i32, v8i8, OpNode>;
+ def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, asm, ".4s", ".16b", ".4b", V128,
+ v4i32, v16i8, OpNode>;
+}
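Similarly, a hedged sketch of how the indexed (by-element) variant is expected to be used; the names are assumptions:

// Illustrative instantiation of the by-element dot product.
let Predicates = [HasDotProd] in {
  defm SDOTlane : SIMDThreeSameVectorDotIndex<0, "sdot", int_aarch64_neon_sdot>;
  defm UDOTlane : SIMDThreeSameVectorDotIndex<1, "udot", int_aarch64_neon_udot>;
}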
+
multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode> {
let Predicates = [HasNEON, HasFullFP16] in {
@@ -7765,7 +8010,6 @@ multiclass SIMDFPScalarRShift<bit U, bits<5> opc, string asm> {
FPR32, FPR32, vecshiftR32, asm, []> {
let Inst{20-16} = imm{4-0};
}
-
def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
FPR64, FPR64, vecshiftR64, asm, []> {
let Inst{21-16} = imm{5-0};
@@ -8468,14 +8712,14 @@ class BaseSIMDLdStPost<bit Q, bit L, bits<4> opcode, bits<2> size,
// The immediate form of AdvSIMD post-indexed addressing is encoded with
// register post-index addressing from the zero register.
-multiclass SIMDLdStAliases<string asm, string layout, string Count,
+multiclass SIMDLdStAliases<string BaseName, string asm, string layout, string Count,
int Offset, int Size> {
// E.g. "ld1 { v0.8b, v1.8b }, [x1], #16"
// "ld1\t$Vt, [$Rn], #16"
// may get mapped to
// (LD1Twov8b_POST VecListTwo8b:$Vt, GPR64sp:$Rn, XZR)
def : InstAlias<asm # "\t$Vt, [$Rn], #" # Offset,
- (!cast<Instruction>(NAME # Count # "v" # layout # "_POST")
+ (!cast<Instruction>(BaseName # Count # "v" # layout # "_POST")
GPR64sp:$Rn,
!cast<RegisterOperand>("VecList" # Count # layout):$Vt,
XZR), 1>;
@@ -8485,7 +8729,7 @@ multiclass SIMDLdStAliases<string asm, string layout, string Count,
// may get mapped to
// (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, XZR)
def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], #" # Offset,
- (!cast<Instruction>(NAME # Count # "v" # layout # "_POST")
+ (!cast<Instruction>(BaseName # Count # "v" # layout # "_POST")
GPR64sp:$Rn,
!cast<RegisterOperand>("VecList" # Count # Size):$Vt,
XZR), 0>;
@@ -8495,7 +8739,7 @@ multiclass SIMDLdStAliases<string asm, string layout, string Count,
// may get mapped to
// (LD1Twov8b VecListTwo64:$Vt, GPR64sp:$Rn)
def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn]",
- (!cast<Instruction>(NAME # Count # "v" # layout)
+ (!cast<Instruction>(BaseName # Count # "v" # layout)
!cast<RegisterOperand>("VecList" # Count # Size):$Vt,
GPR64sp:$Rn), 0>;
@@ -8504,14 +8748,14 @@ multiclass SIMDLdStAliases<string asm, string layout, string Count,
// may get mapped to
// (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, GPR64pi8:$Xm)
def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], $Xm",
- (!cast<Instruction>(NAME # Count # "v" # layout # "_POST")
+ (!cast<Instruction>(BaseName # Count # "v" # layout # "_POST")
GPR64sp:$Rn,
!cast<RegisterOperand>("VecList" # Count # Size):$Vt,
!cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>;
}
-multiclass BaseSIMDLdN<string Count, string asm, string veclist, int Offset128,
- int Offset64, bits<4> opcode> {
+multiclass BaseSIMDLdN<string BaseName, string Count, string asm, string veclist,
+ int Offset128, int Offset64, bits<4> opcode> {
let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
def v16b: BaseSIMDLdSt<1, 1, opcode, 0b00, asm,
(outs !cast<RegisterOperand>(veclist # "16b"):$Vt),
@@ -8573,18 +8817,18 @@ multiclass BaseSIMDLdN<string Count, string asm, string veclist, int Offset128,
!cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
}
- defm : SIMDLdStAliases<asm, "16b", Count, Offset128, 128>;
- defm : SIMDLdStAliases<asm, "8h", Count, Offset128, 128>;
- defm : SIMDLdStAliases<asm, "4s", Count, Offset128, 128>;
- defm : SIMDLdStAliases<asm, "2d", Count, Offset128, 128>;
- defm : SIMDLdStAliases<asm, "8b", Count, Offset64, 64>;
- defm : SIMDLdStAliases<asm, "4h", Count, Offset64, 64>;
- defm : SIMDLdStAliases<asm, "2s", Count, Offset64, 64>;
+ defm : SIMDLdStAliases<BaseName, asm, "16b", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<BaseName, asm, "8h", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<BaseName, asm, "4s", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<BaseName, asm, "2d", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<BaseName, asm, "8b", Count, Offset64, 64>;
+ defm : SIMDLdStAliases<BaseName, asm, "4h", Count, Offset64, 64>;
+ defm : SIMDLdStAliases<BaseName, asm, "2s", Count, Offset64, 64>;
}
// Only ld1/st1 has a v1d version.
-multiclass BaseSIMDStN<string Count, string asm, string veclist, int Offset128,
- int Offset64, bits<4> opcode> {
+multiclass BaseSIMDStN<string BaseName, string Count, string asm, string veclist,
+ int Offset128, int Offset64, bits<4> opcode> {
let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in {
def v16b : BaseSIMDLdSt<1, 0, opcode, 0b00, asm, (outs),
(ins !cast<RegisterOperand>(veclist # "16b"):$Vt,
@@ -8645,18 +8889,18 @@ multiclass BaseSIMDStN<string Count, string asm, string veclist, int Offset128,
!cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
}
- defm : SIMDLdStAliases<asm, "16b", Count, Offset128, 128>;
- defm : SIMDLdStAliases<asm, "8h", Count, Offset128, 128>;
- defm : SIMDLdStAliases<asm, "4s", Count, Offset128, 128>;
- defm : SIMDLdStAliases<asm, "2d", Count, Offset128, 128>;
- defm : SIMDLdStAliases<asm, "8b", Count, Offset64, 64>;
- defm : SIMDLdStAliases<asm, "4h", Count, Offset64, 64>;
- defm : SIMDLdStAliases<asm, "2s", Count, Offset64, 64>;
+ defm : SIMDLdStAliases<BaseName, asm, "16b", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<BaseName, asm, "8h", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<BaseName, asm, "4s", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<BaseName, asm, "2d", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<BaseName, asm, "8b", Count, Offset64, 64>;
+ defm : SIMDLdStAliases<BaseName, asm, "4h", Count, Offset64, 64>;
+ defm : SIMDLdStAliases<BaseName, asm, "2s", Count, Offset64, 64>;
}
-multiclass BaseSIMDLd1<string Count, string asm, string veclist,
+multiclass BaseSIMDLd1<string BaseName, string Count, string asm, string veclist,
int Offset128, int Offset64, bits<4> opcode>
- : BaseSIMDLdN<Count, asm, veclist, Offset128, Offset64, opcode> {
+ : BaseSIMDLdN<BaseName, Count, asm, veclist, Offset128, Offset64, opcode> {
// LD1 instructions have extra "1d" variants.
let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
@@ -8671,12 +8915,12 @@ multiclass BaseSIMDLd1<string Count, string asm, string veclist,
!cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
}
- defm : SIMDLdStAliases<asm, "1d", Count, Offset64, 64>;
+ defm : SIMDLdStAliases<BaseName, asm, "1d", Count, Offset64, 64>;
}
-multiclass BaseSIMDSt1<string Count, string asm, string veclist,
+multiclass BaseSIMDSt1<string BaseName, string Count, string asm, string veclist,
int Offset128, int Offset64, bits<4> opcode>
- : BaseSIMDStN<Count, asm, veclist, Offset128, Offset64, opcode> {
+ : BaseSIMDStN<BaseName, Count, asm, veclist, Offset128, Offset64, opcode> {
// ST1 instructions have extra "1d" variants.
let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in {
@@ -8691,45 +8935,45 @@ multiclass BaseSIMDSt1<string Count, string asm, string veclist,
!cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
}
- defm : SIMDLdStAliases<asm, "1d", Count, Offset64, 64>;
+ defm : SIMDLdStAliases<BaseName, asm, "1d", Count, Offset64, 64>;
}
multiclass SIMDLd1Multiple<string asm> {
- defm One : BaseSIMDLd1<"One", asm, "VecListOne", 16, 8, 0b0111>;
- defm Two : BaseSIMDLd1<"Two", asm, "VecListTwo", 32, 16, 0b1010>;
- defm Three : BaseSIMDLd1<"Three", asm, "VecListThree", 48, 24, 0b0110>;
- defm Four : BaseSIMDLd1<"Four", asm, "VecListFour", 64, 32, 0b0010>;
+ defm One : BaseSIMDLd1<NAME, "One", asm, "VecListOne", 16, 8, 0b0111>;
+ defm Two : BaseSIMDLd1<NAME, "Two", asm, "VecListTwo", 32, 16, 0b1010>;
+ defm Three : BaseSIMDLd1<NAME, "Three", asm, "VecListThree", 48, 24, 0b0110>;
+ defm Four : BaseSIMDLd1<NAME, "Four", asm, "VecListFour", 64, 32, 0b0010>;
}
multiclass SIMDSt1Multiple<string asm> {
- defm One : BaseSIMDSt1<"One", asm, "VecListOne", 16, 8, 0b0111>;
- defm Two : BaseSIMDSt1<"Two", asm, "VecListTwo", 32, 16, 0b1010>;
- defm Three : BaseSIMDSt1<"Three", asm, "VecListThree", 48, 24, 0b0110>;
- defm Four : BaseSIMDSt1<"Four", asm, "VecListFour", 64, 32, 0b0010>;
+ defm One : BaseSIMDSt1<NAME, "One", asm, "VecListOne", 16, 8, 0b0111>;
+ defm Two : BaseSIMDSt1<NAME, "Two", asm, "VecListTwo", 32, 16, 0b1010>;
+ defm Three : BaseSIMDSt1<NAME, "Three", asm, "VecListThree", 48, 24, 0b0110>;
+ defm Four : BaseSIMDSt1<NAME, "Four", asm, "VecListFour", 64, 32, 0b0010>;
}
multiclass SIMDLd2Multiple<string asm> {
- defm Two : BaseSIMDLdN<"Two", asm, "VecListTwo", 32, 16, 0b1000>;
+ defm Two : BaseSIMDLdN<NAME, "Two", asm, "VecListTwo", 32, 16, 0b1000>;
}
multiclass SIMDSt2Multiple<string asm> {
- defm Two : BaseSIMDStN<"Two", asm, "VecListTwo", 32, 16, 0b1000>;
+ defm Two : BaseSIMDStN<NAME, "Two", asm, "VecListTwo", 32, 16, 0b1000>;
}
multiclass SIMDLd3Multiple<string asm> {
- defm Three : BaseSIMDLdN<"Three", asm, "VecListThree", 48, 24, 0b0100>;
+ defm Three : BaseSIMDLdN<NAME, "Three", asm, "VecListThree", 48, 24, 0b0100>;
}
multiclass SIMDSt3Multiple<string asm> {
- defm Three : BaseSIMDStN<"Three", asm, "VecListThree", 48, 24, 0b0100>;
+ defm Three : BaseSIMDStN<NAME, "Three", asm, "VecListThree", 48, 24, 0b0100>;
}
multiclass SIMDLd4Multiple<string asm> {
- defm Four : BaseSIMDLdN<"Four", asm, "VecListFour", 64, 32, 0b0000>;
+ defm Four : BaseSIMDLdN<NAME, "Four", asm, "VecListFour", 64, 32, 0b0000>;
}
multiclass SIMDSt4Multiple<string asm> {
- defm Four : BaseSIMDStN<"Four", asm, "VecListFour", 64, 32, 0b0000>;
+ defm Four : BaseSIMDStN<NAME, "Four", asm, "VecListFour", 64, 32, 0b0000>;
}
//---
@@ -8769,7 +9013,7 @@ class BaseSIMDLdStSingleTied<bit L, bit R, bits<3> opcode,
let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDLdR<bit Q, bit R, bits<3> opcode, bit S, bits<2> size, string asm,
- Operand listtype>
+ DAGOperand listtype>
: BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn]", "",
(outs listtype:$Vt), (ins GPR64sp:$Rn),
[]> {
@@ -8781,7 +9025,7 @@ class BaseSIMDLdR<bit Q, bit R, bits<3> opcode, bit S, bits<2> size, string asm,
}
let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDLdRPost<bit Q, bit R, bits<3> opcode, bit S, bits<2> size,
- string asm, Operand listtype, Operand GPR64pi>
+ string asm, DAGOperand listtype, DAGOperand GPR64pi>
: BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn], $Xm",
"$Rn = $wback",
(outs GPR64sp:$wback, listtype:$Vt),
@@ -8794,14 +9038,14 @@ class BaseSIMDLdRPost<bit Q, bit R, bits<3> opcode, bit S, bits<2> size,
let Inst{11-10} = size;
}
-multiclass SIMDLdrAliases<string asm, string layout, string Count,
+multiclass SIMDLdrAliases<string BaseName, string asm, string layout, string Count,
int Offset, int Size> {
// E.g. "ld1r { v0.8b }, [x1], #1"
// "ld1r.8b\t$Vt, [$Rn], #1"
// may get mapped to
// (LD1Rv8b_POST VecListOne8b:$Vt, GPR64sp:$Rn, XZR)
def : InstAlias<asm # "\t$Vt, [$Rn], #" # Offset,
- (!cast<Instruction>(NAME # "v" # layout # "_POST")
+ (!cast<Instruction>(BaseName # "v" # layout # "_POST")
GPR64sp:$Rn,
!cast<RegisterOperand>("VecList" # Count # layout):$Vt,
XZR), 1>;
@@ -8811,7 +9055,7 @@ multiclass SIMDLdrAliases<string asm, string layout, string Count,
// may get mapped to
// (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, XZR)
def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], #" # Offset,
- (!cast<Instruction>(NAME # "v" # layout # "_POST")
+ (!cast<Instruction>(BaseName # "v" # layout # "_POST")
GPR64sp:$Rn,
!cast<RegisterOperand>("VecList" # Count # Size):$Vt,
XZR), 0>;
@@ -8821,7 +9065,7 @@ multiclass SIMDLdrAliases<string asm, string layout, string Count,
// may get mapped to
// (LD1Rv8b VecListOne64:$Vt, GPR64sp:$Rn)
def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn]",
- (!cast<Instruction>(NAME # "v" # layout)
+ (!cast<Instruction>(BaseName # "v" # layout)
!cast<RegisterOperand>("VecList" # Count # Size):$Vt,
GPR64sp:$Rn), 0>;
@@ -8830,7 +9074,7 @@ multiclass SIMDLdrAliases<string asm, string layout, string Count,
// may get mapped to
// (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, GPR64pi1:$Xm)
def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], $Xm",
- (!cast<Instruction>(NAME # "v" # layout # "_POST")
+ (!cast<Instruction>(BaseName # "v" # layout # "_POST")
GPR64sp:$Rn,
!cast<RegisterOperand>("VecList" # Count # Size):$Vt,
!cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>;
@@ -8839,55 +9083,55 @@ multiclass SIMDLdrAliases<string asm, string layout, string Count,
multiclass SIMDLdR<bit R, bits<3> opcode, bit S, string asm, string Count,
int Offset1, int Offset2, int Offset4, int Offset8> {
def v8b : BaseSIMDLdR<0, R, opcode, S, 0b00, asm,
- !cast<Operand>("VecList" # Count # "8b")>;
+ !cast<DAGOperand>("VecList" # Count # "8b")>;
def v16b: BaseSIMDLdR<1, R, opcode, S, 0b00, asm,
- !cast<Operand>("VecList" # Count #"16b")>;
+ !cast<DAGOperand>("VecList" # Count #"16b")>;
def v4h : BaseSIMDLdR<0, R, opcode, S, 0b01, asm,
- !cast<Operand>("VecList" # Count #"4h")>;
+ !cast<DAGOperand>("VecList" # Count #"4h")>;
def v8h : BaseSIMDLdR<1, R, opcode, S, 0b01, asm,
- !cast<Operand>("VecList" # Count #"8h")>;
+ !cast<DAGOperand>("VecList" # Count #"8h")>;
def v2s : BaseSIMDLdR<0, R, opcode, S, 0b10, asm,
- !cast<Operand>("VecList" # Count #"2s")>;
+ !cast<DAGOperand>("VecList" # Count #"2s")>;
def v4s : BaseSIMDLdR<1, R, opcode, S, 0b10, asm,
- !cast<Operand>("VecList" # Count #"4s")>;
+ !cast<DAGOperand>("VecList" # Count #"4s")>;
def v1d : BaseSIMDLdR<0, R, opcode, S, 0b11, asm,
- !cast<Operand>("VecList" # Count #"1d")>;
+ !cast<DAGOperand>("VecList" # Count #"1d")>;
def v2d : BaseSIMDLdR<1, R, opcode, S, 0b11, asm,
- !cast<Operand>("VecList" # Count #"2d")>;
+ !cast<DAGOperand>("VecList" # Count #"2d")>;
def v8b_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b00, asm,
- !cast<Operand>("VecList" # Count # "8b"),
- !cast<Operand>("GPR64pi" # Offset1)>;
+ !cast<DAGOperand>("VecList" # Count # "8b"),
+ !cast<DAGOperand>("GPR64pi" # Offset1)>;
def v16b_POST: BaseSIMDLdRPost<1, R, opcode, S, 0b00, asm,
- !cast<Operand>("VecList" # Count # "16b"),
- !cast<Operand>("GPR64pi" # Offset1)>;
+ !cast<DAGOperand>("VecList" # Count # "16b"),
+ !cast<DAGOperand>("GPR64pi" # Offset1)>;
def v4h_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b01, asm,
- !cast<Operand>("VecList" # Count # "4h"),
- !cast<Operand>("GPR64pi" # Offset2)>;
+ !cast<DAGOperand>("VecList" # Count # "4h"),
+ !cast<DAGOperand>("GPR64pi" # Offset2)>;
def v8h_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b01, asm,
- !cast<Operand>("VecList" # Count # "8h"),
- !cast<Operand>("GPR64pi" # Offset2)>;
+ !cast<DAGOperand>("VecList" # Count # "8h"),
+ !cast<DAGOperand>("GPR64pi" # Offset2)>;
def v2s_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b10, asm,
- !cast<Operand>("VecList" # Count # "2s"),
- !cast<Operand>("GPR64pi" # Offset4)>;
+ !cast<DAGOperand>("VecList" # Count # "2s"),
+ !cast<DAGOperand>("GPR64pi" # Offset4)>;
def v4s_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b10, asm,
- !cast<Operand>("VecList" # Count # "4s"),
- !cast<Operand>("GPR64pi" # Offset4)>;
+ !cast<DAGOperand>("VecList" # Count # "4s"),
+ !cast<DAGOperand>("GPR64pi" # Offset4)>;
def v1d_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b11, asm,
- !cast<Operand>("VecList" # Count # "1d"),
- !cast<Operand>("GPR64pi" # Offset8)>;
+ !cast<DAGOperand>("VecList" # Count # "1d"),
+ !cast<DAGOperand>("GPR64pi" # Offset8)>;
def v2d_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b11, asm,
- !cast<Operand>("VecList" # Count # "2d"),
- !cast<Operand>("GPR64pi" # Offset8)>;
+ !cast<DAGOperand>("VecList" # Count # "2d"),
+ !cast<DAGOperand>("GPR64pi" # Offset8)>;
- defm : SIMDLdrAliases<asm, "8b", Count, Offset1, 64>;
- defm : SIMDLdrAliases<asm, "16b", Count, Offset1, 128>;
- defm : SIMDLdrAliases<asm, "4h", Count, Offset2, 64>;
- defm : SIMDLdrAliases<asm, "8h", Count, Offset2, 128>;
- defm : SIMDLdrAliases<asm, "2s", Count, Offset4, 64>;
- defm : SIMDLdrAliases<asm, "4s", Count, Offset4, 128>;
- defm : SIMDLdrAliases<asm, "1d", Count, Offset8, 64>;
- defm : SIMDLdrAliases<asm, "2d", Count, Offset8, 128>;
+ defm : SIMDLdrAliases<NAME, asm, "8b", Count, Offset1, 64>;
+ defm : SIMDLdrAliases<NAME, asm, "16b", Count, Offset1, 128>;
+ defm : SIMDLdrAliases<NAME, asm, "4h", Count, Offset2, 64>;
+ defm : SIMDLdrAliases<NAME, asm, "8h", Count, Offset2, 128>;
+ defm : SIMDLdrAliases<NAME, asm, "2s", Count, Offset4, 64>;
+ defm : SIMDLdrAliases<NAME, asm, "4s", Count, Offset4, 128>;
+ defm : SIMDLdrAliases<NAME, asm, "1d", Count, Offset8, 64>;
+ defm : SIMDLdrAliases<NAME, asm, "2d", Count, Offset8, 128>;
}
class SIMDLdStSingleB<bit L, bit R, bits<3> opcode, string asm,
@@ -9245,31 +9489,31 @@ multiclass SIMDLdStSingleAliases<string asm, string layout, string Type,
}
multiclass SIMDLdSt1SingleAliases<string asm> {
- defm : SIMDLdStSingleAliases<asm, "b", "i8", "One", 1, VectorIndexB>;
- defm : SIMDLdStSingleAliases<asm, "h", "i16", "One", 2, VectorIndexH>;
- defm : SIMDLdStSingleAliases<asm, "s", "i32", "One", 4, VectorIndexS>;
- defm : SIMDLdStSingleAliases<asm, "d", "i64", "One", 8, VectorIndexD>;
+ defm "" : SIMDLdStSingleAliases<asm, "b", "i8", "One", 1, VectorIndexB>;
+ defm "" : SIMDLdStSingleAliases<asm, "h", "i16", "One", 2, VectorIndexH>;
+ defm "" : SIMDLdStSingleAliases<asm, "s", "i32", "One", 4, VectorIndexS>;
+ defm "" : SIMDLdStSingleAliases<asm, "d", "i64", "One", 8, VectorIndexD>;
}
multiclass SIMDLdSt2SingleAliases<string asm> {
- defm : SIMDLdStSingleAliases<asm, "b", "i8", "Two", 2, VectorIndexB>;
- defm : SIMDLdStSingleAliases<asm, "h", "i16", "Two", 4, VectorIndexH>;
- defm : SIMDLdStSingleAliases<asm, "s", "i32", "Two", 8, VectorIndexS>;
- defm : SIMDLdStSingleAliases<asm, "d", "i64", "Two", 16, VectorIndexD>;
+ defm "" : SIMDLdStSingleAliases<asm, "b", "i8", "Two", 2, VectorIndexB>;
+ defm "" : SIMDLdStSingleAliases<asm, "h", "i16", "Two", 4, VectorIndexH>;
+ defm "" : SIMDLdStSingleAliases<asm, "s", "i32", "Two", 8, VectorIndexS>;
+ defm "" : SIMDLdStSingleAliases<asm, "d", "i64", "Two", 16, VectorIndexD>;
}
multiclass SIMDLdSt3SingleAliases<string asm> {
- defm : SIMDLdStSingleAliases<asm, "b", "i8", "Three", 3, VectorIndexB>;
- defm : SIMDLdStSingleAliases<asm, "h", "i16", "Three", 6, VectorIndexH>;
- defm : SIMDLdStSingleAliases<asm, "s", "i32", "Three", 12, VectorIndexS>;
- defm : SIMDLdStSingleAliases<asm, "d", "i64", "Three", 24, VectorIndexD>;
+ defm "" : SIMDLdStSingleAliases<asm, "b", "i8", "Three", 3, VectorIndexB>;
+ defm "" : SIMDLdStSingleAliases<asm, "h", "i16", "Three", 6, VectorIndexH>;
+ defm "" : SIMDLdStSingleAliases<asm, "s", "i32", "Three", 12, VectorIndexS>;
+ defm "" : SIMDLdStSingleAliases<asm, "d", "i64", "Three", 24, VectorIndexD>;
}
multiclass SIMDLdSt4SingleAliases<string asm> {
- defm : SIMDLdStSingleAliases<asm, "b", "i8", "Four", 4, VectorIndexB>;
- defm : SIMDLdStSingleAliases<asm, "h", "i16", "Four", 8, VectorIndexH>;
- defm : SIMDLdStSingleAliases<asm, "s", "i32", "Four", 16, VectorIndexS>;
- defm : SIMDLdStSingleAliases<asm, "d", "i64", "Four", 32, VectorIndexD>;
+ defm "" : SIMDLdStSingleAliases<asm, "b", "i8", "Four", 4, VectorIndexB>;
+ defm "" : SIMDLdStSingleAliases<asm, "h", "i16", "Four", 8, VectorIndexH>;
+ defm "" : SIMDLdStSingleAliases<asm, "s", "i32", "Four", 16, VectorIndexS>;
+ defm "" : SIMDLdStSingleAliases<asm, "d", "i64", "Four", 32, VectorIndexD>;
}
} // end of 'let Predicates = [HasNEON]'
@@ -9280,9 +9524,9 @@ multiclass SIMDLdSt4SingleAliases<string asm> {
let Predicates = [HasNEON, HasRDM] in {
class BaseSIMDThreeSameVectorTiedR0<bit Q, bit U, bits<2> size, bits<5> opcode,
- RegisterOperand regtype, string asm,
+ RegisterOperand regtype, string asm,
string kind, list<dag> pattern>
- : BaseSIMDThreeSameVectorTied<Q, U, {size,0}, opcode, regtype, asm, kind,
+ : BaseSIMDThreeSameVectorTied<Q, U, {size,0}, opcode, regtype, asm, kind,
pattern> {
}
multiclass SIMDThreeSameVectorSQRDMLxHTiedHS<bit U, bits<5> opc, string asm,
@@ -9291,7 +9535,7 @@ multiclass SIMDThreeSameVectorSQRDMLxHTiedHS<bit U, bits<5> opc, string asm,
[(set (v4i16 V64:$dst),
(Accum (v4i16 V64:$Rd),
(v4i16 (int_aarch64_neon_sqrdmulh (v4i16 V64:$Rn),
- (v4i16 V64:$Rm)))))]>;
+ (v4i16 V64:$Rm)))))]>;
def v8i16 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b01, opc, V128, asm, ".8h",
[(set (v8i16 V128:$dst),
(Accum (v8i16 V128:$Rd),
@@ -9355,28 +9599,28 @@ multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm,
let Inst{21} = idx{0};
}
- // FIXME: it would be nice to use the scalar (v1i32) instruction here, but
+ // FIXME: it would be nice to use the scalar (v1i32) instruction here, but
// an intermediate EXTRACT_SUBREG would be untyped.
- // FIXME: direct EXTRACT_SUBREG from v2i32 to i32 is illegal, that's why we
+ // FIXME: direct EXTRACT_SUBREG from v2i32 to i32 is illegal, that's why we
// got it lowered here as (i32 vector_extract (v4i32 insert_subvector(..)))
def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
- (i32 (vector_extract
+ (i32 (vector_extract
(v4i32 (insert_subvector
- (undef),
- (v2i32 (int_aarch64_neon_sqrdmulh
+ (undef),
+ (v2i32 (int_aarch64_neon_sqrdmulh
(v2i32 V64:$Rn),
- (v2i32 (AArch64duplane32
+ (v2i32 (AArch64duplane32
(v4i32 V128:$Rm),
VectorIndexS:$idx)))),
(i32 0))),
(i64 0))))),
(EXTRACT_SUBREG
(v2i32 (!cast<Instruction>(NAME # v2i32_indexed)
- (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
- FPR32Op:$Rd,
- ssub)),
+ (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
+ FPR32Op:$Rd,
+ ssub)),
V64:$Rn,
- V128:$Rm,
+ V128:$Rm,
VectorIndexS:$idx)),
ssub)>;
@@ -9397,26 +9641,26 @@ multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm,
// FIXME: it would be nice to use the scalar (v1i32) instruction here, but
// an intermediate EXTRACT_SUBREG would be untyped.
def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
- (i32 (vector_extract
- (v4i32 (int_aarch64_neon_sqrdmulh
+ (i32 (vector_extract
+ (v4i32 (int_aarch64_neon_sqrdmulh
(v4i32 V128:$Rn),
- (v4i32 (AArch64duplane32
+ (v4i32 (AArch64duplane32
(v4i32 V128:$Rm),
VectorIndexS:$idx)))),
(i64 0))))),
(EXTRACT_SUBREG
(v4i32 (!cast<Instruction>(NAME # v4i32_indexed)
- (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
- FPR32Op:$Rd,
- ssub)),
+ (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
+ FPR32Op:$Rd,
+ ssub)),
V128:$Rn,
- V128:$Rm,
+ V128:$Rm,
VectorIndexS:$idx)),
ssub)>;
def i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc,
FPR16Op, FPR16Op, V128_lo,
- VectorIndexH, asm, ".h", "", "", ".h",
+ VectorIndexH, asm, ".h", "", "", ".h",
[]> {
bits<3> idx;
let Inst{11} = idx{2};
@@ -9676,7 +9920,6 @@ multiclass SIMDIndexedTiedComplexHSD<bit U, bit opc1, bit opc2, Operand rottype,
// Crypto extensions
//----------------------------------------------------------------------------
-let Predicates = [HasCrypto] in {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class AESBase<bits<4> opc, string asm, dag outs, dag ins, string cstr,
list<dag> pat>
@@ -9766,7 +10009,103 @@ class SHATiedInstVV<bits<4> opc, string asm, Intrinsic OpNode>
class SHAInstSS<bits<4> opc, string asm, Intrinsic OpNode>
: SHA2OpInst<opc, asm, "", "", (outs FPR32:$Rd), (ins FPR32:$Rn),
[(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn)))]>;
-} // end of 'let Predicates = [HasCrypto]'
+
+// Armv8.2-A Crypto extensions
+class BaseCryptoV82<dag oops, dag iops, string asm, string asmops, string cst,
+ list<dag> pattern>
+ : I <oops, iops, asm, asmops, cst, pattern>, Sched<[WriteV]> {
+ bits<5> Vd;
+ bits<5> Vn;
+ let Inst{31-25} = 0b1100111;
+ let Inst{9-5} = Vn;
+ let Inst{4-0} = Vd;
+}
+
+class CryptoRRTied<bits<1>op0, bits<2>op1, string asm, string asmops>
+ : BaseCryptoV82<(outs V128:$Vd), (ins V128:$Vn, V128:$Vm), asm, asmops,
+ "$Vm = $Vd", []> {
+ let Inst{31-25} = 0b1100111;
+ let Inst{24-21} = 0b0110;
+ let Inst{20-15} = 0b000001;
+ let Inst{14} = op0;
+ let Inst{13-12} = 0b00;
+ let Inst{11-10} = op1;
+}
+class CryptoRRTied_2D<bits<1>op0, bits<2>op1, string asm>
+ : CryptoRRTied<op0, op1, asm, "{\t$Vd.2d, $Vn.2d}">;
+class CryptoRRTied_4S<bits<1>op0, bits<2>op1, string asm>
+ : CryptoRRTied<op0, op1, asm, "{\t$Vd.4s, $Vn.4s}">;
+
+class CryptoRRR<bits<1> op0, bits<2>op1, dag oops, dag iops, string asm,
+ string asmops, string cst>
+ : BaseCryptoV82<oops, iops, asm , asmops, cst, []> {
+ bits<5> Vm;
+ let Inst{24-21} = 0b0011;
+ let Inst{20-16} = Vm;
+ let Inst{15} = 0b1;
+ let Inst{14} = op0;
+ let Inst{13-12} = 0b00;
+ let Inst{11-10} = op1;
+}
+class CryptoRRR_2D<bits<1> op0, bits<2>op1, string asm>
+ : CryptoRRR<op0, op1, (outs V128:$Vd), (ins V128:$Vn, V128:$Vm), asm,
+ "{\t$Vd.2d, $Vn.2d, $Vm.2d}", "">;
+class CryptoRRRTied_2D<bits<1> op0, bits<2>op1, string asm>
+ : CryptoRRR<op0, op1, (outs V128:$Vdst), (ins V128:$Vd, V128:$Vn, V128:$Vm), asm,
+ "{\t$Vd.2d, $Vn.2d, $Vm.2d}", "$Vd = $Vdst">;
+class CryptoRRR_4S<bits<1> op0, bits<2>op1, string asm>
+ : CryptoRRR<op0, op1, (outs V128:$Vd), (ins V128:$Vn, V128:$Vm), asm,
+ "{\t$Vd.4s, $Vn.4s, $Vm.4s}", "">;
+class CryptoRRRTied_4S<bits<1> op0, bits<2>op1, string asm>
+ : CryptoRRR<op0, op1, (outs V128:$Vdst), (ins V128:$Vd, V128:$Vn, V128:$Vm), asm,
+ "{\t$Vd.4s, $Vn.4s, $Vm.4s}", "$Vd = $Vdst">;
+class CryptoRRRTied<bits<1> op0, bits<2>op1, string asm>
+ : CryptoRRR<op0, op1, (outs FPR128:$Vdst), (ins FPR128:$Vd, FPR128:$Vn, V128:$Vm),
+ asm, "{\t$Vd, $Vn, $Vm.2d}", "$Vd = $Vdst">;
+
+class CryptoRRRR<bits<2>op0, string asm, string asmops>
+ : BaseCryptoV82<(outs V128:$Vd), (ins V128:$Vn, V128:$Vm, V128:$Va), asm,
+ asmops, "", []> {
+ bits<5> Vm;
+ bits<5> Va;
+ let Inst{24-23} = 0b00;
+ let Inst{22-21} = op0;
+ let Inst{20-16} = Vm;
+ let Inst{15} = 0b0;
+ let Inst{14-10} = Va;
+}
+class CryptoRRRR_16B<bits<2>op0, string asm>
+ : CryptoRRRR<op0, asm, "{\t$Vd.16b, $Vn.16b, $Vm.16b, $Va.16b}"> {
+}
+class CryptoRRRR_4S<bits<2>op0, string asm>
+ : CryptoRRRR<op0, asm, "{\t$Vd.4s, $Vn.4s, $Vm.4s, $Va.4s}"> {
+}
+
+class CryptoRRRi6<string asm>
+ : BaseCryptoV82<(outs V128:$Vd), (ins V128:$Vn, V128:$Vm, uimm6:$imm), asm,
+ "{\t$Vd.2d, $Vn.2d, $Vm.2d, $imm}", "", []> {
+ bits<6> imm;
+ bits<5> Vm;
+ let Inst{24-21} = 0b0100;
+ let Inst{20-16} = Vm;
+ let Inst{15-10} = imm;
+ let Inst{9-5} = Vn;
+ let Inst{4-0} = Vd;
+}
+
+class CryptoRRRi2Tied<bits<1>op0, bits<2>op1, string asm>
+ : BaseCryptoV82<(outs V128:$Vdst),
+ (ins V128:$Vd, V128:$Vn, V128:$Vm, VectorIndexS:$imm),
+ asm, "{\t$Vd.4s, $Vn.4s, $Vm.s$imm}", "$Vd = $Vdst", []> {
+ bits<2> imm;
+ bits<5> Vm;
+ let Inst{24-21} = 0b0010;
+ let Inst{20-16} = Vm;
+ let Inst{15} = 0b1;
+ let Inst{14} = op0;
+ let Inst{13-12} = imm;
+ let Inst{11-10} = op1;
+}
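To show how these Armv8.2 Crypto format classes are consumed, a sketch under assumed names (the concrete mnemonics, encodings and feature predicates are illustrative, not taken from this patch):

// Illustrative only -- the actual instruction defs are expected elsewhere,
// guarded by the SHA-3 / SM4 subtarget features.
let Predicates = [HasSHA3] in
  def SHA512H : CryptoRRRTied<0b0, 0b00, "sha512h">;
let Predicates = [HasSM4] in
  def SM3TT1A : CryptoRRRi2Tied<0b0, 0b00, "sm3tt1a">;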
//----------------------------------------------------------------------------
// v8.1 atomic instructions extension:
@@ -9910,7 +10249,7 @@ class BaseLDOPregister<string op, string order, string size, RegisterClass RC>
let Predicates = [HasLSE];
}
-multiclass LDOPregister<bits<3> opc, string op, bits<1> Acq, bits<1> Rel,
+multiclass LDOPregister<bits<3> opc, string op, bits<1> Acq, bits<1> Rel,
string order> {
let Sz = 0b00, Acq = Acq, Rel = Rel, opc = opc in
def B : BaseLDOPregister<op, order, "b", GPR32>;
@@ -9927,15 +10266,15 @@ multiclass LDOPregister<bits<3> opc, string op, bits<1> Acq, bits<1> Rel,
let Predicates = [HasLSE] in
multiclass LDOPregister_patterns_ord_dag<string inst, string suffix, string op,
string size, dag SrcRHS, dag DstRHS> {
- def : Pat<(!cast<SDNode>(op#"_"#size#"_monotonic") GPR64sp:$Rn, SrcRHS),
+ def : Pat<(!cast<PatFrag>(op#"_"#size#"_monotonic") GPR64sp:$Rn, SrcRHS),
(!cast<Instruction>(inst # suffix) DstRHS, GPR64sp:$Rn)>;
- def : Pat<(!cast<SDNode>(op#"_"#size#"_acquire") GPR64sp:$Rn, SrcRHS),
+ def : Pat<(!cast<PatFrag>(op#"_"#size#"_acquire") GPR64sp:$Rn, SrcRHS),
(!cast<Instruction>(inst # "A" # suffix) DstRHS, GPR64sp:$Rn)>;
- def : Pat<(!cast<SDNode>(op#"_"#size#"_release") GPR64sp:$Rn, SrcRHS),
+ def : Pat<(!cast<PatFrag>(op#"_"#size#"_release") GPR64sp:$Rn, SrcRHS),
(!cast<Instruction>(inst # "L" # suffix) DstRHS, GPR64sp:$Rn)>;
- def : Pat<(!cast<SDNode>(op#"_"#size#"_acq_rel") GPR64sp:$Rn, SrcRHS),
+ def : Pat<(!cast<PatFrag>(op#"_"#size#"_acq_rel") GPR64sp:$Rn, SrcRHS),
(!cast<Instruction>(inst # "AL" # suffix) DstRHS, GPR64sp:$Rn)>;
- def : Pat<(!cast<SDNode>(op#"_"#size#"_seq_cst") GPR64sp:$Rn, SrcRHS),
+ def : Pat<(!cast<PatFrag>(op#"_"#size#"_seq_cst") GPR64sp:$Rn, SrcRHS),
(!cast<Instruction>(inst # "AL" # suffix) DstRHS, GPR64sp:$Rn)>;
}
@@ -9974,15 +10313,15 @@ multiclass LDOPregister_patterns_mod<string inst, string op, string mod> {
let Predicates = [HasLSE] in
multiclass CASregister_patterns_ord_dag<string inst, string suffix, string op,
string size, dag OLD, dag NEW> {
- def : Pat<(!cast<SDNode>(op#"_"#size#"_monotonic") GPR64sp:$Rn, OLD, NEW),
+ def : Pat<(!cast<PatFrag>(op#"_"#size#"_monotonic") GPR64sp:$Rn, OLD, NEW),
(!cast<Instruction>(inst # suffix) OLD, NEW, GPR64sp:$Rn)>;
- def : Pat<(!cast<SDNode>(op#"_"#size#"_acquire") GPR64sp:$Rn, OLD, NEW),
+ def : Pat<(!cast<PatFrag>(op#"_"#size#"_acquire") GPR64sp:$Rn, OLD, NEW),
(!cast<Instruction>(inst # "A" # suffix) OLD, NEW, GPR64sp:$Rn)>;
- def : Pat<(!cast<SDNode>(op#"_"#size#"_release") GPR64sp:$Rn, OLD, NEW),
+ def : Pat<(!cast<PatFrag>(op#"_"#size#"_release") GPR64sp:$Rn, OLD, NEW),
(!cast<Instruction>(inst # "L" # suffix) OLD, NEW, GPR64sp:$Rn)>;
- def : Pat<(!cast<SDNode>(op#"_"#size#"_acq_rel") GPR64sp:$Rn, OLD, NEW),
+ def : Pat<(!cast<PatFrag>(op#"_"#size#"_acq_rel") GPR64sp:$Rn, OLD, NEW),
(!cast<Instruction>(inst # "AL" # suffix) OLD, NEW, GPR64sp:$Rn)>;
- def : Pat<(!cast<SDNode>(op#"_"#size#"_seq_cst") GPR64sp:$Rn, OLD, NEW),
+ def : Pat<(!cast<PatFrag>(op#"_"#size#"_seq_cst") GPR64sp:$Rn, OLD, NEW),
(!cast<Instruction>(inst # "AL" # suffix) OLD, NEW, GPR64sp:$Rn)>;
}
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index 40836b00b9e6..230480cf1cea 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -19,7 +19,6 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -675,9 +674,13 @@ static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
if (!Subtarget.hasCustomCheapAsMoveHandling())
return MI.isAsCheapAsAMove();
- if (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 &&
- isExynosShiftLeftFast(MI))
- return true;
+
+ if (Subtarget.hasExynosCheapAsMoveHandling()) {
+ if (isExynosResetFast(MI) || isExynosShiftLeftFast(MI))
+ return true;
+ else
+ return MI.isAsCheapAsAMove();
+ }
switch (MI.getOpcode()) {
default:
@@ -736,6 +739,77 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
llvm_unreachable("Unknown opcode to check as cheap as a move!");
}
+bool AArch64InstrInfo::isExynosResetFast(const MachineInstr &MI) const {
+ unsigned Reg, Imm, Shift;
+
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+
+ // MOV Rd, SP
+ case AArch64::ADDWri:
+ case AArch64::ADDXri:
+ if (!MI.getOperand(1).isReg() || !MI.getOperand(2).isImm())
+ return false;
+
+ Reg = MI.getOperand(1).getReg();
+ Imm = MI.getOperand(2).getImm();
+ return ((Reg == AArch64::WSP || Reg == AArch64::SP) && Imm == 0);
+
+ // Literal
+ case AArch64::ADR:
+ case AArch64::ADRP:
+ return true;
+
+ // MOVI Vd, #0
+ case AArch64::MOVID:
+ case AArch64::MOVIv8b_ns:
+ case AArch64::MOVIv2d_ns:
+ case AArch64::MOVIv16b_ns:
+ Imm = MI.getOperand(1).getImm();
+ return (Imm == 0);
+
+ // MOVI Vd, #0
+ case AArch64::MOVIv2i32:
+ case AArch64::MOVIv4i16:
+ case AArch64::MOVIv4i32:
+ case AArch64::MOVIv8i16:
+ Imm = MI.getOperand(1).getImm();
+ Shift = MI.getOperand(2).getImm();
+ return (Imm == 0 && Shift == 0);
+
+ // MOV Rd, Imm
+ case AArch64::MOVNWi:
+ case AArch64::MOVNXi:
+
+ // MOV Rd, Imm
+ case AArch64::MOVZWi:
+ case AArch64::MOVZXi:
+ return true;
+
+ // MOV Rd, Imm
+ case AArch64::ORRWri:
+ case AArch64::ORRXri:
+ if (!MI.getOperand(1).isReg())
+ return false;
+
+ Reg = MI.getOperand(1).getReg();
+ Imm = MI.getOperand(2).getImm();
+ return ((Reg == AArch64::WZR || Reg == AArch64::XZR) && Imm == 0);
+
+ // MOV Rd, Rm
+ case AArch64::ORRWrs:
+ case AArch64::ORRXrs:
+ if (!MI.getOperand(1).isReg())
+ return false;
+
+ Reg = MI.getOperand(1).getReg();
+ Imm = MI.getOperand(3).getImm();
+ Shift = AArch64_AM::getShiftValue(Imm);
+ return ((Reg == AArch64::WZR || Reg == AArch64::XZR) && Shift == 0);
+ }
+}
+
bool AArch64InstrInfo::isExynosShiftLeftFast(const MachineInstr &MI) const {
unsigned Imm, Shift;
AArch64_AM::ShiftExtendType Ext;
@@ -1135,7 +1209,7 @@ static bool UpdateOperandRegClass(MachineInstr &Instr) {
return true;
}
-/// \brief Return the opcode that does not set flags when possible - otherwise
+/// Return the opcode that does not set flags when possible - otherwise
/// return the original opcode. The caller is responsible to do the actual
/// substitution and legality checking.
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
@@ -1574,7 +1648,7 @@ bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
}
/// Return true if this instruction has a non-zero immediate
-bool AArch64InstrInfo::hasShiftedReg(const MachineInstr &MI) const {
+bool AArch64InstrInfo::hasShiftedReg(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
break;
@@ -1612,7 +1686,7 @@ bool AArch64InstrInfo::hasShiftedReg(const MachineInstr &MI) const {
}
/// Return true if this instruction has a non-zero immediate
-bool AArch64InstrInfo::hasExtendedReg(const MachineInstr &MI) const {
+bool AArch64InstrInfo::hasExtendedReg(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
break;
@@ -1640,7 +1714,7 @@ bool AArch64InstrInfo::hasExtendedReg(const MachineInstr &MI) const {
// Return true if this instruction simply sets its single destination register
// to zero. This is equivalent to a register rename of the zero-register.
-bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) const {
+bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
break;
@@ -1664,7 +1738,7 @@ bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) const {
// Return true if this instruction simply renames a general register without
// modifying bits.
-bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) const {
+bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
break;
@@ -1694,7 +1768,7 @@ bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) const {
// Return true if this instruction simply renames a general register without
// modifying bits.
-bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) const {
+bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
break;
@@ -1763,7 +1837,7 @@ unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
/// Return true if this load/store scales or extends its register offset.
/// This refers to scaling a dynamic index as opposed to scaled immediates.
/// MI should be a memory op that allows scaled addressing.
-bool AArch64InstrInfo::isScaledAddr(const MachineInstr &MI) const {
+bool AArch64InstrInfo::isScaledAddr(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
break;
@@ -1822,27 +1896,27 @@ bool AArch64InstrInfo::isScaledAddr(const MachineInstr &MI) const {
}
/// Check all MachineMemOperands for a hint to suppress pairing.
-bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) const {
+bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
return MMO->getFlags() & MOSuppressPair;
});
}
/// Set a flag on the first MachineMemOperand to suppress pairing.
-void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) const {
+void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
if (MI.memoperands_empty())
return;
(*MI.memoperands_begin())->setFlags(MOSuppressPair);
}
/// Check all MachineMemOperands for a hint that the load/store is strided.
-bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) const {
+bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
return MMO->getFlags() & MOStridedAccess;
});
}
-bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) const {
+bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) {
switch (Opc) {
default:
return false;
@@ -1867,8 +1941,124 @@ bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) const {
}
}
-bool AArch64InstrInfo::isUnscaledLdSt(MachineInstr &MI) const {
- return isUnscaledLdSt(MI.getOpcode());
+bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ // Scaled instructions.
+ case AArch64::STRSui:
+ case AArch64::STRDui:
+ case AArch64::STRQui:
+ case AArch64::STRXui:
+ case AArch64::STRWui:
+ case AArch64::LDRSui:
+ case AArch64::LDRDui:
+ case AArch64::LDRQui:
+ case AArch64::LDRXui:
+ case AArch64::LDRWui:
+ case AArch64::LDRSWui:
+ // Unscaled instructions.
+ case AArch64::STURSi:
+ case AArch64::STURDi:
+ case AArch64::STURQi:
+ case AArch64::STURWi:
+ case AArch64::STURXi:
+ case AArch64::LDURSi:
+ case AArch64::LDURDi:
+ case AArch64::LDURQi:
+ case AArch64::LDURWi:
+ case AArch64::LDURXi:
+ case AArch64::LDURSWi:
+ return true;
+ }
+}
+
+unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc,
+ bool &Is64Bit) {
+ switch (Opc) {
+ default:
+ llvm_unreachable("Opcode has no flag setting equivalent!");
+ // 32-bit cases:
+ case AArch64::ADDWri:
+ Is64Bit = false;
+ return AArch64::ADDSWri;
+ case AArch64::ADDWrr:
+ Is64Bit = false;
+ return AArch64::ADDSWrr;
+ case AArch64::ADDWrs:
+ Is64Bit = false;
+ return AArch64::ADDSWrs;
+ case AArch64::ADDWrx:
+ Is64Bit = false;
+ return AArch64::ADDSWrx;
+ case AArch64::ANDWri:
+ Is64Bit = false;
+ return AArch64::ANDSWri;
+ case AArch64::ANDWrr:
+ Is64Bit = false;
+ return AArch64::ANDSWrr;
+ case AArch64::ANDWrs:
+ Is64Bit = false;
+ return AArch64::ANDSWrs;
+ case AArch64::BICWrr:
+ Is64Bit = false;
+ return AArch64::BICSWrr;
+ case AArch64::BICWrs:
+ Is64Bit = false;
+ return AArch64::BICSWrs;
+ case AArch64::SUBWri:
+ Is64Bit = false;
+ return AArch64::SUBSWri;
+ case AArch64::SUBWrr:
+ Is64Bit = false;
+ return AArch64::SUBSWrr;
+ case AArch64::SUBWrs:
+ Is64Bit = false;
+ return AArch64::SUBSWrs;
+ case AArch64::SUBWrx:
+ Is64Bit = false;
+ return AArch64::SUBSWrx;
+ // 64-bit cases:
+ case AArch64::ADDXri:
+ Is64Bit = true;
+ return AArch64::ADDSXri;
+ case AArch64::ADDXrr:
+ Is64Bit = true;
+ return AArch64::ADDSXrr;
+ case AArch64::ADDXrs:
+ Is64Bit = true;
+ return AArch64::ADDSXrs;
+ case AArch64::ADDXrx:
+ Is64Bit = true;
+ return AArch64::ADDSXrx;
+ case AArch64::ANDXri:
+ Is64Bit = true;
+ return AArch64::ANDSXri;
+ case AArch64::ANDXrr:
+ Is64Bit = true;
+ return AArch64::ANDSXrr;
+ case AArch64::ANDXrs:
+ Is64Bit = true;
+ return AArch64::ANDSXrs;
+ case AArch64::BICXrr:
+ Is64Bit = true;
+ return AArch64::BICSXrr;
+ case AArch64::BICXrs:
+ Is64Bit = true;
+ return AArch64::BICSXrs;
+ case AArch64::SUBXri:
+ Is64Bit = true;
+ return AArch64::SUBSXri;
+ case AArch64::SUBXrr:
+ Is64Bit = true;
+ return AArch64::SUBSXrr;
+ case AArch64::SUBXrs:
+ Is64Bit = true;
+ return AArch64::SUBSXrs;
+ case AArch64::SUBXrx:
+ Is64Bit = true;
+ return AArch64::SUBSXrx;
+ }
}
// Is this a candidate for ld/st merging or pairing? For example, we don't
@@ -2592,6 +2782,16 @@ void AArch64InstrInfo::storeRegToStackSlot(
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Twov1d;
Offset = false;
+ } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, MBBI, DL, get(AArch64::STPXi))
+ .addReg(TRI->getSubReg(SrcReg, AArch64::sube64),
+ getKillRegState(isKill))
+ .addReg(TRI->getSubReg(SrcReg, AArch64::subo64),
+ getKillRegState(isKill))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO);
+ return;
}
break;
case 24:
@@ -2690,6 +2890,16 @@ void AArch64InstrInfo::loadRegFromStackSlot(
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Twov1d;
Offset = false;
+ } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, MBBI, DL, get(AArch64::LDPXi))
+ .addReg(TRI->getSubReg(DestReg, AArch64::sube64),
+ getDefRegState(true))
+ .addReg(TRI->getSubReg(DestReg, AArch64::subo64),
+ getDefRegState(true))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO);
+ return;
}
break;
case 24:
@@ -4432,7 +4642,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
DelInstrs.push_back(&Root);
}
-/// \brief Replace csincr-branch sequence by simple conditional branch
+/// Replace csincr-branch sequence by simple conditional branch
///
/// Examples:
/// 1. \code
@@ -4690,213 +4900,377 @@ AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
/// * Frame construction overhead: 1 (RET)
/// * Requires stack fixups? No
///
+ /// \p MachineOutlinerThunk implies that the function is being created from
+ /// a sequence of instructions ending in a call. The outlined function is
+ /// called with a BL instruction, and the outlined function tail-calls the
+ /// original call destination.
+ ///
+ /// That is,
+ ///
+ /// I1 OUTLINED_FUNCTION:
+ /// I2 --> BL OUTLINED_FUNCTION I1
+ /// BL f I2
+ /// B f
+ /// * Call construction overhead: 1 (BL)
+ /// * Frame construction overhead: 0
+ /// * Requires stack fixups? No
+ ///
enum MachineOutlinerClass {
MachineOutlinerDefault, /// Emit a save, restore, call, and return.
MachineOutlinerTailCall, /// Only emit a branch.
- MachineOutlinerNoLRSave /// Emit a call and return.
+ MachineOutlinerNoLRSave, /// Emit a call and return.
+ MachineOutlinerThunk, /// Emit a call and tail-call.
};
-bool AArch64InstrInfo::canOutlineWithoutLRSave(
- MachineBasicBlock::iterator &CallInsertionPt) const {
- // Was LR saved in the function containing this basic block?
- MachineBasicBlock &MBB = *(CallInsertionPt->getParent());
- LiveRegUnits LRU(getRegisterInfo());
- LRU.addLiveOuts(MBB);
-
- // Get liveness information from the end of the block to the end of the
- // prospective outlined region.
- std::for_each(MBB.rbegin(),
- (MachineBasicBlock::reverse_iterator)CallInsertionPt,
- [&LRU](MachineInstr &MI) { LRU.stepBackward(MI); });
-
- // If the link register is available at this point, then we can safely outline
- // the region without saving/restoring LR. Otherwise, we must emit a save and
- // restore.
- return LRU.available(AArch64::LR);
-}
+enum MachineOutlinerMBBFlags {
+ LRUnavailableSomewhere = 0x2,
+ HasCalls = 0x4
+};
-AArch64GenInstrInfo::MachineOutlinerInfo
-AArch64InstrInfo::getOutlininingCandidateInfo(
- std::vector<
- std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
- &RepeatedSequenceLocs) const {
+outliner::OutlinedFunction
+AArch64InstrInfo::getOutliningCandidateInfo(
+ std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
+ unsigned SequenceSize = std::accumulate(
+ RepeatedSequenceLocs[0].front(),
+ std::next(RepeatedSequenceLocs[0].back()),
+ 0, [this](unsigned Sum, const MachineInstr &MI) {
+ return Sum + getInstSizeInBytes(MI);
+ });
+
+ // Compute liveness information for each candidate.
+ const TargetRegisterInfo &TRI = getRegisterInfo();
+ std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
+ [&TRI](outliner::Candidate &C) { C.initLRU(TRI); });
+
+ // According to the AArch64 Procedure Call Standard, the following are
+ // undefined on entry/exit from a function call:
+ //
+ // * Registers x16, x17, (and thus w16, w17)
+ // * Condition codes (and thus the NZCV register)
+ //
+ // Because of this, we can't outline any sequence of instructions where one
+ // of these registers is live into/across it. Thus, we need to delete those
+ // candidates.
+ auto CantGuaranteeValueAcrossCall = [](outliner::Candidate &C) {
+ LiveRegUnits LRU = C.LRU;
+ return (!LRU.available(AArch64::W16) || !LRU.available(AArch64::W17) ||
+ !LRU.available(AArch64::NZCV));
+ };
+
+ // Erase every candidate that violates the restrictions above. (It could be
+ // true that we have viable candidates, so it's not worth bailing out in
+ // the case that, say, 1 out of 20 candidates violates the restrictions.)
+ RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
+ RepeatedSequenceLocs.end(),
+ CantGuaranteeValueAcrossCall),
+ RepeatedSequenceLocs.end());
+
+ // If the sequence is empty, we're done.
+ if (RepeatedSequenceLocs.empty())
+ return outliner::OutlinedFunction();
+
+ // At this point, we have only "safe" candidates to outline. Figure out
+ // frame + call instruction information.
+
+ unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode();
+
+ // Helper lambda which sets call information for every candidate.
+ auto SetCandidateCallInfo =
+ [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
+ for (outliner::Candidate &C : RepeatedSequenceLocs)
+ C.setCallInfo(CallID, NumBytesForCall);
+ };
- unsigned CallID = MachineOutlinerDefault;
unsigned FrameID = MachineOutlinerDefault;
- unsigned NumInstrsForCall = 3;
- unsigned NumInstrsToCreateFrame = 1;
-
- auto DoesntNeedLRSave =
- [this](std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>
- &I) { return canOutlineWithoutLRSave(I.second); };
+ unsigned NumBytesToCreateFrame = 4;
// If the last instruction in any candidate is a terminator, then we should
// tail call all of the candidates.
- if (RepeatedSequenceLocs[0].second->isTerminator()) {
- CallID = MachineOutlinerTailCall;
+ if (RepeatedSequenceLocs[0].back()->isTerminator()) {
FrameID = MachineOutlinerTailCall;
- NumInstrsForCall = 1;
- NumInstrsToCreateFrame = 0;
+ NumBytesToCreateFrame = 0;
+ SetCandidateCallInfo(MachineOutlinerTailCall, 4);
+ }
+
+ else if (LastInstrOpcode == AArch64::BL || LastInstrOpcode == AArch64::BLR) {
+ // FIXME: Do we need to check if the code after this uses the value of LR?
+ FrameID = MachineOutlinerThunk;
+ NumBytesToCreateFrame = 0;
+ SetCandidateCallInfo(MachineOutlinerThunk, 4);
+ }
+
+ // Make sure that LR isn't live on entry to this candidate. The only
+ // instructions that use LR that could possibly appear in a repeated sequence
+ // are calls. Therefore, we only have to check and see if LR is dead on entry
+ // to (or exit from) some candidate.
+ else if (std::all_of(RepeatedSequenceLocs.begin(),
+ RepeatedSequenceLocs.end(),
+ [](outliner::Candidate &C) {
+ return C.LRU.available(AArch64::LR);
+ })) {
+ FrameID = MachineOutlinerNoLRSave;
+ NumBytesToCreateFrame = 4;
+ SetCandidateCallInfo(MachineOutlinerNoLRSave, 4);
}
- else if (std::all_of(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
- DoesntNeedLRSave)) {
- CallID = MachineOutlinerNoLRSave;
- FrameID = MachineOutlinerNoLRSave;
- NumInstrsForCall = 1;
- NumInstrsToCreateFrame = 1;
+ // LR is live, so we need to save it to the stack.
+ else {
+ FrameID = MachineOutlinerDefault;
+ NumBytesToCreateFrame = 4;
+ SetCandidateCallInfo(MachineOutlinerDefault, 12);
}
// Check if the range contains a call. These require a save + restore of the
// link register.
- if (std::any_of(RepeatedSequenceLocs[0].first, RepeatedSequenceLocs[0].second,
+ if (std::any_of(RepeatedSequenceLocs[0].front(),
+ RepeatedSequenceLocs[0].back(),
[](const MachineInstr &MI) { return MI.isCall(); }))
- NumInstrsToCreateFrame += 2; // Save + restore the link register.
+ NumBytesToCreateFrame += 8; // Save + restore the link register.
// Handle the last instruction separately. If this is a tail call, then the
// last instruction is a call. We don't want to save + restore in this case.
// However, it could be possible that the last instruction is a call without
// it being valid to tail call this sequence. We should consider this as well.
- else if (RepeatedSequenceLocs[0].second->isCall() &&
- FrameID != MachineOutlinerTailCall)
- NumInstrsToCreateFrame += 2;
+ else if (FrameID != MachineOutlinerThunk &&
+ FrameID != MachineOutlinerTailCall &&
+ RepeatedSequenceLocs[0].back()->isCall())
+ NumBytesToCreateFrame += 8;
- return MachineOutlinerInfo(NumInstrsForCall, NumInstrsToCreateFrame, CallID,
- FrameID);
+ return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
+ NumBytesToCreateFrame, FrameID);
}
bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
const Function &F = MF.getFunction();
- // If F uses a redzone, then don't outline from it because it might mess up
- // the stack.
- if (!F.hasFnAttribute(Attribute::NoRedZone))
+ // Can F be deduplicated by the linker? If it can, don't outline from it.
+ if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
return false;
- // If anyone is using the address of this function, don't outline from it.
- if (F.hasAddressTaken())
+ // Don't outline from functions with section markings; the program could
+ // expect that all the code is in the named section.
+ // FIXME: Allow outlining from multiple functions with the same section
+ // marking.
+ if (F.hasSection())
return false;
- // Can F be deduplicated by the linker? If it can, don't outline from it.
- if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
+ // Outlining from functions with redzones is unsafe since the outliner may
+ // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
+ // outline from it.
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ if (!AFI || AFI->hasRedZone().getValueOr(true))
return false;
+ // It's safe to outline from MF.
return true;
}
-AArch64GenInstrInfo::MachineOutlinerInstrType
-AArch64InstrInfo::getOutliningType(MachineInstr &MI) const {
+unsigned
+AArch64InstrInfo::getMachineOutlinerMBBFlags(MachineBasicBlock &MBB) const {
+ unsigned Flags = 0x0;
+ // Check if there's a call inside this MachineBasicBlock. If there is, then
+ // set a flag.
+ if (std::any_of(MBB.begin(), MBB.end(),
+ [](MachineInstr &MI) { return MI.isCall(); }))
+ Flags |= MachineOutlinerMBBFlags::HasCalls;
+
+ // Check if LR is available through all of the MBB. If it's not, then set
+ // a flag.
+ assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
+ "Suitable Machine Function for outlining must track liveness");
+ LiveRegUnits LRU(getRegisterInfo());
+ LRU.addLiveOuts(MBB);
+
+ std::for_each(MBB.rbegin(),
+ MBB.rend(),
+ [&LRU](MachineInstr &MI) { LRU.accumulate(MI); });
- MachineFunction *MF = MI.getParent()->getParent();
+ if (!LRU.available(AArch64::LR))
+ Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
+
+ return Flags;
+}
+
+outliner::InstrType
+AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
+ unsigned Flags) const {
+ MachineInstr &MI = *MIT;
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineFunction *MF = MBB->getParent();
AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
// Don't outline LOHs.
if (FuncInfo->getLOHRelated().count(&MI))
- return MachineOutlinerInstrType::Illegal;
+ return outliner::InstrType::Illegal;
// Don't allow debug values to impact outlining type.
- if (MI.isDebugValue() || MI.isIndirectDebugValue())
- return MachineOutlinerInstrType::Invisible;
-
+ if (MI.isDebugInstr() || MI.isIndirectDebugValue())
+ return outliner::InstrType::Invisible;
+
+ // At this point, KILL instructions don't really tell us much so we can go
+ // ahead and skip over them.
+ if (MI.isKill())
+ return outliner::InstrType::Invisible;
+
// Is this a terminator for a basic block?
if (MI.isTerminator()) {
// Is this the end of a function?
if (MI.getParent()->succ_empty())
- return MachineOutlinerInstrType::Legal;
-
+ return outliner::InstrType::Legal;
+
// It's not, so don't outline it.
- return MachineOutlinerInstrType::Illegal;
+ return outliner::InstrType::Illegal;
}
- // Outline calls without stack parameters or aggregate parameters.
+ // Make sure none of the operands are un-outlinable.
+ for (const MachineOperand &MOP : MI.operands()) {
+ if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
+ MOP.isTargetIndex())
+ return outliner::InstrType::Illegal;
+
+ // If it uses LR or W30 explicitly, then don't touch it.
+ if (MOP.isReg() && !MOP.isImplicit() &&
+ (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
+ return outliner::InstrType::Illegal;
+ }
+
+ // Special cases for instructions that can always be outlined, but will fail
+  // the later tests. e.g., ADRPs, which are PC-relative, use LR, but can always
+ // be outlined because they don't require a *specific* value to be in LR.
+ if (MI.getOpcode() == AArch64::ADRP)
+ return outliner::InstrType::Legal;
+
+ // If MI is a call we might be able to outline it. We don't want to outline
+ // any calls that rely on the position of items on the stack. When we outline
+ // something containing a call, we have to emit a save and restore of LR in
+ // the outlined function. Currently, this always happens by saving LR to the
+ // stack. Thus, if we outline, say, half the parameters for a function call
+ // plus the call, then we'll break the callee's expectations for the layout
+ // of the stack.
+ //
+ // FIXME: Allow calls to functions which construct a stack frame, as long
+ // as they don't access arguments on the stack.
+ // FIXME: Figure out some way to analyze functions defined in other modules.
+ // We should be able to compute the memory usage based on the IR calling
+ // convention, even if we can't see the definition.
if (MI.isCall()) {
- const Module *M = MF->getFunction().getParent();
- assert(M && "No module?");
-
// Get the function associated with the call. Look at each operand and find
// the one that represents the callee and get its name.
- Function *Callee = nullptr;
+ const Function *Callee = nullptr;
for (const MachineOperand &MOP : MI.operands()) {
- if (MOP.isSymbol()) {
- Callee = M->getFunction(MOP.getSymbolName());
- break;
- }
-
- else if (MOP.isGlobal()) {
- Callee = M->getFunction(MOP.getGlobal()->getGlobalIdentifier());
+ if (MOP.isGlobal()) {
+ Callee = dyn_cast<Function>(MOP.getGlobal());
break;
}
}
- // Only handle functions that we have information about.
+ // Never outline calls to mcount. There isn't any rule that would require
+ // this, but the Linux kernel's "ftrace" feature depends on it.
+ if (Callee && Callee->getName() == "\01_mcount")
+ return outliner::InstrType::Illegal;
+
+ // If we don't know anything about the callee, assume it depends on the
+ // stack layout of the caller. In that case, it's only legal to outline
+ // as a tail-call. Whitelist the call instructions we know about so we
+ // don't get unexpected results with call pseudo-instructions.
+ auto UnknownCallOutlineType = outliner::InstrType::Illegal;
+ if (MI.getOpcode() == AArch64::BLR || MI.getOpcode() == AArch64::BL)
+ UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
+
if (!Callee)
- return MachineOutlinerInstrType::Illegal;
+ return UnknownCallOutlineType;
// We have a function we have information about. Check if it's something we
// can safely outline.
-
- // If the callee is vararg, it passes parameters on the stack. Don't touch
- // it.
- // FIXME: Functions like printf are very common and we should be able to
- // outline them.
- if (Callee->isVarArg())
- return MachineOutlinerInstrType::Illegal;
-
- // Check if any of the arguments are a pointer to a struct. We don't want
- // to outline these since they might be loaded in two instructions.
- for (Argument &Arg : Callee->args()) {
- if (Arg.getType()->isPointerTy() &&
- Arg.getType()->getPointerElementType()->isAggregateType())
- return MachineOutlinerInstrType::Illegal;
- }
-
- // If the thing we're calling doesn't access memory at all, then we're good
- // to go.
- if (Callee->doesNotAccessMemory())
- return MachineOutlinerInstrType::Legal;
-
- // It accesses memory. Get the machine function for the callee to see if
- // it's safe to outline.
MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);
// We don't know what's going on with the callee at all. Don't touch it.
if (!CalleeMF)
- return MachineOutlinerInstrType::Illegal;
+ return UnknownCallOutlineType;
- // Does it pass anything on the stack? If it does, don't outline it.
- if (CalleeMF->getInfo<AArch64FunctionInfo>()->getBytesInStackArgArea() != 0)
- return MachineOutlinerInstrType::Illegal;
+ // Check if we know anything about the callee saves on the function. If we
+ // don't, then don't touch it, since that implies that we haven't
+ // computed anything about its stack frame yet.
+ MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
+ if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
+ MFI.getNumObjects() > 0)
+ return UnknownCallOutlineType;
- // It doesn't, so it's safe to outline and we're done.
- return MachineOutlinerInstrType::Legal;
+ // At this point, we can say that CalleeMF ought to not pass anything on the
+ // stack. Therefore, we can outline it.
+ return outliner::InstrType::Legal;
}
// Don't outline positions.
if (MI.isPosition())
- return MachineOutlinerInstrType::Illegal;
+ return outliner::InstrType::Illegal;
// Don't touch the link register or W30.
if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
- return MachineOutlinerInstrType::Illegal;
-
- // Make sure none of the operands are un-outlinable.
- for (const MachineOperand &MOP : MI.operands()) {
- if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
- MOP.isTargetIndex())
- return MachineOutlinerInstrType::Illegal;
-
- // Don't outline anything that uses the link register.
- if (MOP.isReg() && getRegisterInfo().regsOverlap(MOP.getReg(), AArch64::LR))
- return MachineOutlinerInstrType::Illegal;
- }
+ return outliner::InstrType::Illegal;
// Does this use the stack?
if (MI.modifiesRegister(AArch64::SP, &RI) ||
MI.readsRegister(AArch64::SP, &RI)) {
-
+ // True if there is no chance that any outlined candidate from this range
+ // could require stack fixups. That is, both
+ // * LR is available in the range (No save/restore around call)
+ // * The range doesn't include calls (No save/restore in outlined frame)
+ // are true.
+ // FIXME: This is very restrictive; the flags check the whole block,
+ // not just the bit we will try to outline.
+ bool MightNeedStackFixUp =
+ (Flags & (MachineOutlinerMBBFlags::LRUnavailableSomewhere |
+ MachineOutlinerMBBFlags::HasCalls));
+
+ // If this instruction is in a range where it *never* needs to be fixed
+ // up, then we can *always* outline it. This is true even if it's not
+ // possible to fix that instruction up.
+ //
+ // Why? Consider two equivalent instructions I1, I2 where both I1 and I2
+ // use SP. Suppose that I1 sits within a range that definitely doesn't
+ // need stack fixups, while I2 sits in a range that does.
+ //
+ // First, I1 can be outlined as long as we *never* fix up the stack in
+ // any sequence containing it. I1 is already a safe instruction in the
+ // original program, so as long as we don't modify it we're good to go.
+ // So this leaves us with showing that outlining I2 won't break our
+ // program.
+ //
+ // Suppose I1 and I2 belong to equivalent candidate sequences. When we
+ // look at I2, we need to see if it can be fixed up. Suppose I2, (and
+  // thus I1) cannot be fixed up. Then I2 will be assigned a unique
+ // integer label; thus, I2 cannot belong to any candidate sequence (a
+ // contradiction). Suppose I2 can be fixed up. Then I1 can be fixed up
+ // as well, so we're good. Thus, I1 is always safe to outline.
+ //
+ // This gives us two things: first off, it buys us some more instructions
+ // for our search space by deeming stack instructions illegal only when
+ // they can't be fixed up AND we might have to fix them up. Second off,
+ // This allows us to catch tricky instructions like, say,
+ // %xi = ADDXri %sp, n, 0. We can't safely outline these since they might
+ // be paired with later SUBXris, which might *not* end up being outlined.
+  // If we mess with the stack to save something, and an ADDXri then messes with
+  // it *after* that, we aren't going to restore the right thing from
+  // the stack if we don't also outline the corresponding SUBXri. ADDXris and
+ // SUBXris are extremely common in prologue/epilogue code, so supporting
+ // them in the outliner can be a pretty big win!
+ if (!MightNeedStackFixUp)
+ return outliner::InstrType::Legal;
+
+ // Any modification of SP will break our code to save/restore LR.
+ // FIXME: We could handle some instructions which add a constant offset to
+ // SP, with a bit more work.
+ if (MI.modifiesRegister(AArch64::SP, &RI))
+ return outliner::InstrType::Illegal;
+
+ // At this point, we have a stack instruction that we might need to fix
+ // up. We'll handle it if it's a load or store.
if (MI.mayLoadOrStore()) {
unsigned Base; // Filled with the base register of MI.
int64_t Offset; // Filled with the offset of MI.
@@ -4905,7 +5279,7 @@ AArch64InstrInfo::getOutliningType(MachineInstr &MI) const {
// Does it allow us to offset the base register and is the base SP?
if (!getMemOpBaseRegImmOfsWidth(MI, Base, Offset, DummyWidth, &RI) ||
Base != AArch64::SP)
- return MachineOutlinerInstrType::Illegal;
+ return outliner::InstrType::Illegal;
// Find the minimum/maximum offset for this instruction and check if
// fixing it up would be in range.
@@ -4918,17 +5292,19 @@ AArch64InstrInfo::getOutliningType(MachineInstr &MI) const {
// to a MIR test, it really ought to be checked.
Offset += 16; // Update the offset to what it would be if we outlined.
if (Offset < MinOffset * Scale || Offset > MaxOffset * Scale)
- return MachineOutlinerInstrType::Illegal;
+ return outliner::InstrType::Illegal;
// It's in range, so we can outline it.
- return MachineOutlinerInstrType::Legal;
+ return outliner::InstrType::Legal;
}
+ // FIXME: Add handling for instructions like "add x0, sp, #8".
+
// We can't fix it up, so don't outline it.
- return MachineOutlinerInstrType::Illegal;
+ return outliner::InstrType::Illegal;
}
- return MachineOutlinerInstrType::Legal;
+ return outliner::InstrType::Legal;
}
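// A hedged sketch of the SP-offset legality test above, in isolation. Scale,
// MinOffset and MaxOffset are the per-opcode properties queried from
// getMemOpInfo; the +16 models the slot created by spilling LR below SP in
// the outlined frame.
static bool sketchOffsetStillEncodable(int64_t Offset, int64_t Scale,
                                       int64_t MinOffset, int64_t MaxOffset) {
  Offset += 16; // what the offset becomes once LR has been spilled
  return Offset >= MinOffset * Scale && Offset <= MaxOffset * Scale;
}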
void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
@@ -4959,15 +5335,36 @@ void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
}
}
-void AArch64InstrInfo::insertOutlinerEpilogue(
+void AArch64InstrInfo::buildOutlinedFrame(
MachineBasicBlock &MBB, MachineFunction &MF,
- const MachineOutlinerInfo &MInfo) const {
+ const outliner::OutlinedFunction &OF) const {
+ // For thunk outlining, rewrite the last instruction from a call to a
+ // tail-call.
+ if (OF.FrameConstructionID == MachineOutlinerThunk) {
+ MachineInstr *Call = &*--MBB.instr_end();
+ unsigned TailOpcode;
+ if (Call->getOpcode() == AArch64::BL) {
+ TailOpcode = AArch64::TCRETURNdi;
+ } else {
+ assert(Call->getOpcode() == AArch64::BLR);
+ TailOpcode = AArch64::TCRETURNri;
+ }
+ MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
+ .add(Call->getOperand(0))
+ .addImm(0);
+ MBB.insert(MBB.end(), TC);
+ Call->eraseFromParent();
+ }
// Is there a call in the outlined range?
- if (std::any_of(MBB.instr_begin(), MBB.instr_end(),
- [](MachineInstr &MI) { return MI.isCall(); })) {
+ auto IsNonTailCall = [](MachineInstr &MI) {
+ return MI.isCall() && !MI.isReturn();
+ };
+ if (std::any_of(MBB.instr_begin(), MBB.instr_end(), IsNonTailCall)) {
// Fix up the instructions in the range, since we're going to modify the
// stack.
+ assert(OF.FrameConstructionID != MachineOutlinerDefault &&
+ "Can only fix up stack references once");
fixupPostOutline(MBB);
// LR has to be a live in so that we can save it.
@@ -4976,7 +5373,8 @@ void AArch64InstrInfo::insertOutlinerEpilogue(
MachineBasicBlock::iterator It = MBB.begin();
MachineBasicBlock::iterator Et = MBB.end();
- if (MInfo.FrameConstructionID == MachineOutlinerTailCall)
+ if (OF.FrameConstructionID == MachineOutlinerTailCall ||
+ OF.FrameConstructionID == MachineOutlinerThunk)
Et = std::prev(MBB.end());
// Insert a save before the outlined region
@@ -4987,6 +5385,25 @@ void AArch64InstrInfo::insertOutlinerEpilogue(
.addImm(-16);
It = MBB.insert(It, STRXpre);
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ const MCRegisterInfo *MRI = STI.getRegisterInfo();
+ unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);
+
+ // Add a CFI saying the stack was moved 16 B down.
+ int64_t StackPosEntry =
+ MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 16));
+ BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
+ .addCFIIndex(StackPosEntry)
+ .setMIFlags(MachineInstr::FrameSetup);
+
+ // Add a CFI saying that the LR that we want to find is now 16 B higher than
+ // before.
+ int64_t LRPosEntry =
+ MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, 16));
+ BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
+ .addCFIIndex(LRPosEntry)
+ .setMIFlags(MachineInstr::FrameSetup);
+
// Insert a restore before the terminator for the function.
MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
.addReg(AArch64::SP, RegState::Define)
@@ -4997,7 +5414,8 @@ void AArch64InstrInfo::insertOutlinerEpilogue(
}
// If this is a tail call outlined function, then there's already a return.
- if (MInfo.FrameConstructionID == MachineOutlinerTailCall)
+ if (OF.FrameConstructionID == MachineOutlinerTailCall ||
+ OF.FrameConstructionID == MachineOutlinerThunk)
return;
// It's not a tail call, so we have to insert the return ourselves.
@@ -5006,7 +5424,7 @@ void AArch64InstrInfo::insertOutlinerEpilogue(
MBB.insert(MBB.end(), ret);
// Did we have to modify the stack by saving the link register?
- if (MInfo.FrameConstructionID == MachineOutlinerNoLRSave)
+ if (OF.FrameConstructionID == MachineOutlinerNoLRSave)
return;
// We modified the stack.
@@ -5014,30 +5432,31 @@ void AArch64InstrInfo::insertOutlinerEpilogue(
fixupPostOutline(MBB);
}
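// A hedged illustration (not part of the patch) of the frame built above when
// the outlined body contains a non-tail call; OUTLINED_BODY stands for the
// copied instructions:
//   str x30, [sp, #-16]!    // STRXpre: spill LR, SP -= 16
//   (CFI: def_cfa_offset 16, plus an offset record for LR, as emitted above)
//   ... OUTLINED_BODY ...   // SP-relative offsets already fixed up
//   ldr x30, [sp], #16      // LDRXpost: reload LR, SP += 16
//   ret                     // omitted for tail-call and thunk frames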
-void AArch64InstrInfo::insertOutlinerPrologue(
- MachineBasicBlock &MBB, MachineFunction &MF,
- const MachineOutlinerInfo &MInfo) const {}
-
MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
- MachineFunction &MF, const MachineOutlinerInfo &MInfo) const {
+ MachineFunction &MF, const outliner::Candidate &C) const {
// Are we tail calling?
- if (MInfo.CallConstructionID == MachineOutlinerTailCall) {
+ if (C.CallConstructionID == MachineOutlinerTailCall) {
// If yes, then we can just branch to the label.
- It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::B))
- .addGlobalAddress(M.getNamedValue(MF.getName())));
+ It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
+ .addGlobalAddress(M.getNamedValue(MF.getName()))
+ .addImm(0));
return It;
}
// Are we saving the link register?
- if (MInfo.CallConstructionID == MachineOutlinerNoLRSave) {
+ if (C.CallConstructionID == MachineOutlinerNoLRSave ||
+ C.CallConstructionID == MachineOutlinerThunk) {
// No, so just insert the call.
It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
.addGlobalAddress(M.getNamedValue(MF.getName())));
return It;
}
+ // We want to return the spot where we inserted the call.
+ MachineBasicBlock::iterator CallPt;
+
// We have a default call. Save the link register.
MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
.addReg(AArch64::SP, RegState::Define)
@@ -5050,7 +5469,7 @@ MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
// Insert the call.
It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
.addGlobalAddress(M.getNamedValue(MF.getName())));
-
+ CallPt = It;
It++;
// Restore the link register.
@@ -5061,5 +5480,5 @@ MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
.addImm(16);
It = MBB.insert(It, LDRXpost);
- return It;
+ return CallPt;
}
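// A hedged illustration of what the three call-construction strategies above
// leave at each original call site; OUTLINED_FUNCTION stands for the module
// value looked up by name:
//   Tail call:            b   OUTLINED_FUNCTION   // via TCRETURNdi
//   No-LR-save / thunk:   bl  OUTLINED_FUNCTION
//   Default:              str x30, [sp, #-16]!
//                         bl  OUTLINED_FUNCTION   // CallPt returned to caller
//                         ldr x30, [sp], #16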
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index 2f10bef1e474..0e5953f6216d 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -64,165 +64,51 @@ public:
/// Returns true if there is a shiftable register and the shift value
/// is non-zero.
- bool hasShiftedReg(const MachineInstr &MI) const;
+ static bool hasShiftedReg(const MachineInstr &MI);
/// Returns true if there is an extendable register and the extending
/// value is non-zero.
- bool hasExtendedReg(const MachineInstr &MI) const;
+ static bool hasExtendedReg(const MachineInstr &MI);
- /// \brief Does this instruction set its full destination register to zero?
- bool isGPRZero(const MachineInstr &MI) const;
+ /// Does this instruction set its full destination register to zero?
+ static bool isGPRZero(const MachineInstr &MI);
- /// \brief Does this instruction rename a GPR without modifying bits?
- bool isGPRCopy(const MachineInstr &MI) const;
+ /// Does this instruction rename a GPR without modifying bits?
+ static bool isGPRCopy(const MachineInstr &MI);
- /// \brief Does this instruction rename an FPR without modifying bits?
- bool isFPRCopy(const MachineInstr &MI) const;
+ /// Does this instruction rename an FPR without modifying bits?
+ static bool isFPRCopy(const MachineInstr &MI);
/// Return true if this load/store scales or extends its register offset.
/// This refers to scaling a dynamic index as opposed to scaled immediates.
/// MI should be a memory op that allows scaled addressing.
- bool isScaledAddr(const MachineInstr &MI) const;
+ static bool isScaledAddr(const MachineInstr &MI);
/// Return true if pairing the given load or store is hinted to be
/// unprofitable.
- bool isLdStPairSuppressed(const MachineInstr &MI) const;
+ static bool isLdStPairSuppressed(const MachineInstr &MI);
/// Return true if the given load or store is a strided memory access.
- bool isStridedAccess(const MachineInstr &MI) const;
+ static bool isStridedAccess(const MachineInstr &MI);
/// Return true if this is an unscaled load/store.
- bool isUnscaledLdSt(unsigned Opc) const;
-
- /// Return true if this is an unscaled load/store.
- bool isUnscaledLdSt(MachineInstr &MI) const;
-
- static bool isPairableLdStInst(const MachineInstr &MI) {
- switch (MI.getOpcode()) {
- default:
- return false;
- // Scaled instructions.
- case AArch64::STRSui:
- case AArch64::STRDui:
- case AArch64::STRQui:
- case AArch64::STRXui:
- case AArch64::STRWui:
- case AArch64::LDRSui:
- case AArch64::LDRDui:
- case AArch64::LDRQui:
- case AArch64::LDRXui:
- case AArch64::LDRWui:
- case AArch64::LDRSWui:
- // Unscaled instructions.
- case AArch64::STURSi:
- case AArch64::STURDi:
- case AArch64::STURQi:
- case AArch64::STURWi:
- case AArch64::STURXi:
- case AArch64::LDURSi:
- case AArch64::LDURDi:
- case AArch64::LDURQi:
- case AArch64::LDURWi:
- case AArch64::LDURXi:
- case AArch64::LDURSWi:
- return true;
- }
+ static bool isUnscaledLdSt(unsigned Opc);
+ static bool isUnscaledLdSt(MachineInstr &MI) {
+ return isUnscaledLdSt(MI.getOpcode());
}
- /// \brief Return the opcode that set flags when possible. The caller is
+  /// Return true if the given load or store may be paired with another.
+ static bool isPairableLdStInst(const MachineInstr &MI);
+
+  /// Return the opcode that sets flags when possible. The caller is
/// responsible for ensuring the opc has a flag setting equivalent.
- static unsigned convertToFlagSettingOpc(unsigned Opc, bool &Is64Bit) {
- switch (Opc) {
- default:
- llvm_unreachable("Opcode has no flag setting equivalent!");
- // 32-bit cases:
- case AArch64::ADDWri:
- Is64Bit = false;
- return AArch64::ADDSWri;
- case AArch64::ADDWrr:
- Is64Bit = false;
- return AArch64::ADDSWrr;
- case AArch64::ADDWrs:
- Is64Bit = false;
- return AArch64::ADDSWrs;
- case AArch64::ADDWrx:
- Is64Bit = false;
- return AArch64::ADDSWrx;
- case AArch64::ANDWri:
- Is64Bit = false;
- return AArch64::ANDSWri;
- case AArch64::ANDWrr:
- Is64Bit = false;
- return AArch64::ANDSWrr;
- case AArch64::ANDWrs:
- Is64Bit = false;
- return AArch64::ANDSWrs;
- case AArch64::BICWrr:
- Is64Bit = false;
- return AArch64::BICSWrr;
- case AArch64::BICWrs:
- Is64Bit = false;
- return AArch64::BICSWrs;
- case AArch64::SUBWri:
- Is64Bit = false;
- return AArch64::SUBSWri;
- case AArch64::SUBWrr:
- Is64Bit = false;
- return AArch64::SUBSWrr;
- case AArch64::SUBWrs:
- Is64Bit = false;
- return AArch64::SUBSWrs;
- case AArch64::SUBWrx:
- Is64Bit = false;
- return AArch64::SUBSWrx;
- // 64-bit cases:
- case AArch64::ADDXri:
- Is64Bit = true;
- return AArch64::ADDSXri;
- case AArch64::ADDXrr:
- Is64Bit = true;
- return AArch64::ADDSXrr;
- case AArch64::ADDXrs:
- Is64Bit = true;
- return AArch64::ADDSXrs;
- case AArch64::ADDXrx:
- Is64Bit = true;
- return AArch64::ADDSXrx;
- case AArch64::ANDXri:
- Is64Bit = true;
- return AArch64::ANDSXri;
- case AArch64::ANDXrr:
- Is64Bit = true;
- return AArch64::ANDSXrr;
- case AArch64::ANDXrs:
- Is64Bit = true;
- return AArch64::ANDSXrs;
- case AArch64::BICXrr:
- Is64Bit = true;
- return AArch64::BICSXrr;
- case AArch64::BICXrs:
- Is64Bit = true;
- return AArch64::BICSXrs;
- case AArch64::SUBXri:
- Is64Bit = true;
- return AArch64::SUBSXri;
- case AArch64::SUBXrr:
- Is64Bit = true;
- return AArch64::SUBSXrr;
- case AArch64::SUBXrs:
- Is64Bit = true;
- return AArch64::SUBSXrs;
- case AArch64::SUBXrx:
- Is64Bit = true;
- return AArch64::SUBSXrx;
- }
- }
+ static unsigned convertToFlagSettingOpc(unsigned Opc, bool &Is64Bit);
/// Return true if this is a load/store that can be potentially paired/merged.
bool isCandidateToMergeOrPair(MachineInstr &MI) const;
/// Hint that pairing the given load or store is unprofitable.
- void suppressLdStPair(MachineInstr &MI) const;
+ static void suppressLdStPair(MachineInstr &MI);
bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
int64_t &Offset,
@@ -235,7 +121,7 @@ public:
/// Return the immediate offset of the base register in a load/store \p LdSt.
MachineOperand &getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const;
- /// \brief Returns true if opcode \p Opc is a memory operation. If it is, set
+ /// Returns true if opcode \p Opc is a memory operation. If it is, set
/// \p Scale, \p Width, \p MinOffset, and \p MaxOffset accordingly.
///
/// For unscaled instructions, \p Scale is set to 1.
@@ -350,24 +236,22 @@ public:
ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
getSerializableMachineMemOperandTargetFlags() const override;
- bool
- canOutlineWithoutLRSave(MachineBasicBlock::iterator &CallInsertionPt) const;
bool isFunctionSafeToOutlineFrom(MachineFunction &MF,
bool OutlineFromLinkOnceODRs) const override;
- MachineOutlinerInfo getOutlininingCandidateInfo(
- std::vector<
- std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
- &RepeatedSequenceLocs) const override;
- AArch64GenInstrInfo::MachineOutlinerInstrType
- getOutliningType(MachineInstr &MI) const override;
- void insertOutlinerEpilogue(MachineBasicBlock &MBB, MachineFunction &MF,
- const MachineOutlinerInfo &MInfo) const override;
- void insertOutlinerPrologue(MachineBasicBlock &MBB, MachineFunction &MF,
- const MachineOutlinerInfo &MInfo) const override;
+ outliner::OutlinedFunction getOutliningCandidateInfo(
+ std::vector<outliner::Candidate> &RepeatedSequenceLocs) const override;
+ outliner::InstrType
+ getOutliningType(MachineBasicBlock::iterator &MIT, unsigned Flags) const override;
+ unsigned getMachineOutlinerMBBFlags(MachineBasicBlock &MBB) const override;
+ void buildOutlinedFrame(MachineBasicBlock &MBB, MachineFunction &MF,
+ const outliner::OutlinedFunction &OF) const override;
MachineBasicBlock::iterator
insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
MachineBasicBlock::iterator &It, MachineFunction &MF,
- const MachineOutlinerInfo &MInfo) const override;
+ const outliner::Candidate &C) const override;
+  /// Returns true if the instruction sets a register to an immediate value
+  /// that can be executed more efficiently.
+ bool isExynosResetFast(const MachineInstr &MI) const;
/// Returns true if the instruction has a shift left that can be executed
/// more efficiently.
bool isExynosShiftLeftFast(const MachineInstr &MI) const;
@@ -376,7 +260,7 @@ public:
bool isFalkorShiftExtFast(const MachineInstr &MI) const;
private:
- /// \brief Sets the offsets on outlined instructions in \p MBB which use SP
+ /// Sets the offsets on outlined instructions in \p MBB which use SP
/// so that they will be valid post-outlining.
///
/// \param MBB A \p MachineBasicBlock in an outlined function.
@@ -406,14 +290,14 @@ bool rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
unsigned FrameReg, int &Offset,
const AArch64InstrInfo *TII);
-/// \brief Use to report the frame offset status in isAArch64FrameOffsetLegal.
+/// Used to report the frame offset status in isAArch64FrameOffsetLegal.
enum AArch64FrameOffsetStatus {
AArch64FrameOffsetCannotUpdate = 0x0, ///< Offset cannot apply.
AArch64FrameOffsetIsLegal = 0x1, ///< Offset is legal.
AArch64FrameOffsetCanUpdate = 0x2 ///< Offset can apply, at least partly.
};
-/// \brief Check if the @p Offset is a valid frame offset for @p MI.
+/// Check if the @p Offset is a valid frame offset for @p MI.
/// The returned value reports the validity of the frame offset for @p MI.
/// It uses the values defined by AArch64FrameOffsetStatus for that.
/// If result == AArch64FrameOffsetCannotUpdate, @p MI cannot be updated to
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 79826ca2ed8d..d6b8bb5d89c7 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -20,12 +20,22 @@ def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">,
AssemblerPredicate<"HasV8_2aOps", "armv8.2a">;
def HasV8_3a : Predicate<"Subtarget->hasV8_3aOps()">,
AssemblerPredicate<"HasV8_3aOps", "armv8.3a">;
+def HasV8_4a : Predicate<"Subtarget->hasV8_4aOps()">,
+ AssemblerPredicate<"HasV8_4aOps", "armv8.4a">;
def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">,
AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">;
def HasNEON : Predicate<"Subtarget->hasNEON()">,
AssemblerPredicate<"FeatureNEON", "neon">;
def HasCrypto : Predicate<"Subtarget->hasCrypto()">,
AssemblerPredicate<"FeatureCrypto", "crypto">;
+def HasSM4 : Predicate<"Subtarget->hasSM4()">,
+ AssemblerPredicate<"FeatureSM4", "sm4">;
+def HasSHA3 : Predicate<"Subtarget->hasSHA3()">,
+ AssemblerPredicate<"FeatureSHA3", "sha3">;
+def HasSHA2 : Predicate<"Subtarget->hasSHA2()">,
+ AssemblerPredicate<"FeatureSHA2", "sha2">;
+def HasAES : Predicate<"Subtarget->hasAES()">,
+ AssemblerPredicate<"FeatureAES", "aes">;
def HasDotProd : Predicate<"Subtarget->hasDotProd()">,
AssemblerPredicate<"FeatureDotProd", "dotprod">;
def HasCRC : Predicate<"Subtarget->hasCRC()">,
@@ -396,6 +406,15 @@ def MOVaddrEXT
[(set GPR64:$dst, (AArch64addlow (AArch64adrp texternalsym:$hi),
texternalsym:$low))]>,
Sched<[WriteAdrAdr]>;
+// Normally AArch64addlow either gets folded into a following ldr/str,
+// or together with an adrp into MOVaddr above. For cases with TLS, it
+// might appear without either of them, so allow lowering it into a plain
+// add.
+def ADDlowTLS
+ : Pseudo<(outs GPR64:$dst), (ins GPR64:$src, i64imm:$low),
+ [(set GPR64:$dst, (AArch64addlow GPR64:$src,
+ tglobaltlsaddr:$low))]>,
+ Sched<[WriteAdr]>;
} // isReMaterializable, isCodeGenOnly
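// A hedged illustration (not part of the patch): AArch64addlow models the
// low-12-bit half of address materialization and normally pairs with an ADRP,
// e.g. for a global `var`:
//   adrp x0, var
//   add  x0, x0, :lo12:var
// ADDlowTLS covers the TLS case where that pairing does not happen, so the
// node must still be selectable as a plain add.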
@@ -420,6 +439,7 @@ def : InstAlias<"wfi", (HINT 0b011)>;
def : InstAlias<"sev", (HINT 0b100)>;
def : InstAlias<"sevl", (HINT 0b101)>;
def : InstAlias<"esb", (HINT 0b10000)>, Requires<[HasRAS]>;
+def : InstAlias<"csdb", (HINT 20)>;
// v8.2a Statistical Profiling extension
def : InstAlias<"psb $op", (HINT psbhint_op:$op)>, Requires<[HasSPE]>;
@@ -439,20 +459,46 @@ def DSB : CRmSystemI<barrier_op, 0b100, "dsb",
def ISB : CRmSystemI<barrier_op, 0b110, "isb",
[(int_aarch64_isb (i32 imm32_0_15:$CRm))]>;
+
+def TSB : CRmSystemI<barrier_op, 0b010, "tsb", []> {
+ let CRm = 0b0010;
+ let Inst{12} = 0;
+ let Predicates = [HasV8_4a];
+}
}
// ARMv8.2 Dot Product
let Predicates = [HasDotProd] in {
-def UDOT2S : BaseSIMDThreeSameVectorDot<0, 1, "udot", ".2s", ".8b">;
-def SDOT2S : BaseSIMDThreeSameVectorDot<0, 0, "sdot", ".2s", ".8b">;
-def UDOT4S : BaseSIMDThreeSameVectorDot<1, 1, "udot", ".4s", ".16b">;
-def SDOT4S : BaseSIMDThreeSameVectorDot<1, 0, "sdot", ".4s", ".16b">;
-def UDOTIDX2S : BaseSIMDThreeSameVectorDotIndex<0, 1, "udot", ".2s", ".8b", ".4b">;
-def SDOTIDX2S : BaseSIMDThreeSameVectorDotIndex<0, 0, "sdot", ".2s", ".8b", ".4b">;
-def UDOTIDX4S : BaseSIMDThreeSameVectorDotIndex<1, 1, "udot", ".4s", ".16b", ".4b">;
-def SDOTIDX4S : BaseSIMDThreeSameVectorDotIndex<1, 0, "sdot", ".4s", ".16b", ".4b">;
+defm SDOT : SIMDThreeSameVectorDot<0, "sdot", int_aarch64_neon_sdot>;
+defm UDOT : SIMDThreeSameVectorDot<1, "udot", int_aarch64_neon_udot>;
+defm SDOTlane : SIMDThreeSameVectorDotIndex<0, "sdot", int_aarch64_neon_sdot>;
+defm UDOTlane : SIMDThreeSameVectorDotIndex<1, "udot", int_aarch64_neon_udot>;
}
+// Armv8.2-A Crypto extensions
+let Predicates = [HasSHA3] in {
+def SHA512H : CryptoRRRTied<0b0, 0b00, "sha512h">;
+def SHA512H2 : CryptoRRRTied<0b0, 0b01, "sha512h2">;
+def SHA512SU0 : CryptoRRTied_2D<0b0, 0b00, "sha512su0">;
+def SHA512SU1 : CryptoRRRTied_2D<0b0, 0b10, "sha512su1">;
+def RAX1 : CryptoRRR_2D<0b0,0b11, "rax1">;
+def EOR3 : CryptoRRRR_16B<0b00, "eor3">;
+def BCAX : CryptoRRRR_16B<0b01, "bcax">;
+def XAR : CryptoRRRi6<"xar">;
+} // HasSHA3
+
+let Predicates = [HasSM4] in {
+def SM3TT1A : CryptoRRRi2Tied<0b0, 0b00, "sm3tt1a">;
+def SM3TT1B : CryptoRRRi2Tied<0b0, 0b01, "sm3tt1b">;
+def SM3TT2A : CryptoRRRi2Tied<0b0, 0b10, "sm3tt2a">;
+def SM3TT2B : CryptoRRRi2Tied<0b0, 0b11, "sm3tt2b">;
+def SM3SS1 : CryptoRRRR_4S<0b10, "sm3ss1">;
+def SM3PARTW1 : CryptoRRRTied_4S<0b1, 0b00, "sm3partw1">;
+def SM3PARTW2 : CryptoRRRTied_4S<0b1, 0b01, "sm3partw2">;
+def SM4ENCKEY : CryptoRRR_4S<0b1, 0b10, "sm4ekey">;
+def SM4E : CryptoRRTied_4S<0b0, 0b01, "sm4e">;
+} // HasSM4
+
let Predicates = [HasRCPC] in {
// v8.3 Release Consistent Processor Consistent support, optional in v8.2.
def LDAPRB : RCPCLoad<0b00, "ldaprb", GPR32>;
@@ -470,31 +516,34 @@ defm FCADD : SIMDThreeSameVectorComplexHSD<1, 0b111, complexrotateopodd,
defm FCMLA : SIMDIndexedTiedComplexHSD<1, 0, 1, complexrotateop, "fcmla",
null_frag>;
-let Predicates = [HasV8_3a] in {
- // v8.3a Pointer Authentication
- let Uses = [LR], Defs = [LR] in {
- def PACIAZ : SystemNoOperands<0b000, "paciaz">;
- def PACIBZ : SystemNoOperands<0b010, "pacibz">;
- def AUTIAZ : SystemNoOperands<0b100, "autiaz">;
- def AUTIBZ : SystemNoOperands<0b110, "autibz">;
- }
- let Uses = [LR, SP], Defs = [LR] in {
- def PACIASP : SystemNoOperands<0b001, "paciasp">;
- def PACIBSP : SystemNoOperands<0b011, "pacibsp">;
- def AUTIASP : SystemNoOperands<0b101, "autiasp">;
- def AUTIBSP : SystemNoOperands<0b111, "autibsp">;
- }
- let Uses = [X16, X17], Defs = [X17], CRm = 0b0001 in {
- def PACIA1716 : SystemNoOperands<0b000, "pacia1716">;
- def PACIB1716 : SystemNoOperands<0b010, "pacib1716">;
- def AUTIA1716 : SystemNoOperands<0b100, "autia1716">;
- def AUTIB1716 : SystemNoOperands<0b110, "autib1716">;
- }
+// v8.3a Pointer Authentication
+// These instructions inhabit part of the hint space and so can be used for
+// armv8 targets.
+let Uses = [LR], Defs = [LR] in {
+ def PACIAZ : SystemNoOperands<0b000, "paciaz">;
+ def PACIBZ : SystemNoOperands<0b010, "pacibz">;
+ def AUTIAZ : SystemNoOperands<0b100, "autiaz">;
+ def AUTIBZ : SystemNoOperands<0b110, "autibz">;
+}
+let Uses = [LR, SP], Defs = [LR] in {
+ def PACIASP : SystemNoOperands<0b001, "paciasp">;
+ def PACIBSP : SystemNoOperands<0b011, "pacibsp">;
+ def AUTIASP : SystemNoOperands<0b101, "autiasp">;
+ def AUTIBSP : SystemNoOperands<0b111, "autibsp">;
+}
+let Uses = [X16, X17], Defs = [X17], CRm = 0b0001 in {
+ def PACIA1716 : SystemNoOperands<0b000, "pacia1716">;
+ def PACIB1716 : SystemNoOperands<0b010, "pacib1716">;
+ def AUTIA1716 : SystemNoOperands<0b100, "autia1716">;
+ def AUTIB1716 : SystemNoOperands<0b110, "autib1716">;
+}
- let Uses = [LR], Defs = [LR], CRm = 0b0000 in {
- def XPACLRI : SystemNoOperands<0b111, "xpaclri">;
- }
+let Uses = [LR], Defs = [LR], CRm = 0b0000 in {
+ def XPACLRI : SystemNoOperands<0b111, "xpaclri">;
+}
+// These pointer authentication instructions require armv8.3a.
+let Predicates = [HasV8_3a] in {
multiclass SignAuth<bits<3> prefix, bits<3> prefix_z, string asm> {
def IA : SignAuthOneData<prefix, 0b00, !strconcat(asm, "ia")>;
def IB : SignAuthOneData<prefix, 0b01, !strconcat(asm, "ib")>;
@@ -524,7 +573,7 @@ let Predicates = [HasV8_3a] in {
def BLRAAZ : AuthOneOperand<0b001, 0, "blraaz">;
def BLRABZ : AuthOneOperand<0b001, 1, "blrabz">;
- let isReturn = 1 in {
+ let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
def RETAA : AuthReturn<0b010, 0, "retaa">;
def RETAB : AuthReturn<0b010, 1, "retab">;
def ERETAA : AuthReturn<0b100, 0, "eretaa">;
@@ -541,7 +590,18 @@ let Predicates = [HasV8_3a] in {
let Inst{31} = 0;
}
-} // HasV8_3A
+} // HasV8_3a
+
+// v8.4 Flag manipulation instructions
+let Predicates = [HasV8_4a] in {
+def CFINV : SimpleSystemI<0, (ins), "cfinv", "">, Sched<[WriteSys]> {
+ let Inst{20-5} = 0b0000001000000000;
+}
+def SETF8 : BaseFlagManipulation<0, 0, (ins GPR32:$Rn), "setf8", "{\t$Rn}">;
+def SETF16 : BaseFlagManipulation<0, 1, (ins GPR32:$Rn), "setf16", "{\t$Rn}">;
+def RMIF : FlagRotate<(ins GPR64:$Rn, uimm6:$imm, imm0_15:$mask), "rmif",
+ "{\t$Rn, $imm, $mask}">;
+} // HasV8_4a
def : InstAlias<"clrex", (CLREX 0xf)>;
def : InstAlias<"isb", (ISB 0xf)>;
@@ -560,6 +620,9 @@ def MOVbaseTLS : Pseudo<(outs GPR64:$dst), (ins),
let Predicates = [HasPerfMon] in
def : Pat<(readcyclecounter), (MRS 0xdce8)>;
+// FPCR register
+def : Pat<(i64 (int_aarch64_get_fpcr)), (MRS 0xda20)>;
+
// Generic system instructions
def SYSxt : SystemXtI<0, "sys">;
def SYSLxt : SystemLXtI<1, "sysl">;
@@ -678,6 +741,9 @@ def trunc_imm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i32);
}]>;
+def gi_trunc_imm : GICustomOperandRenderer<"renderTruncImm">,
+ GISDNodeXFormEquiv<trunc_imm>;
+
def : Pat<(i64 i64imm_32bit:$src),
(SUBREG_TO_REG (i64 0), (MOVi32imm (trunc_imm imm:$src)), sub_32)>;
@@ -1327,6 +1393,7 @@ def ADRP : ADRI<1, "adrp", adrplabel,
// page address of a constant pool entry, block address
def : Pat<(AArch64adrp tconstpool:$cp), (ADRP tconstpool:$cp)>;
def : Pat<(AArch64adrp tblockaddress:$cp), (ADRP tblockaddress:$cp)>;
+def : Pat<(AArch64adrp texternalsym:$sym), (ADRP texternalsym:$sym)>;
//===----------------------------------------------------------------------===//
// Unconditional branch (register) instructions.
@@ -1410,7 +1477,9 @@ def : Pat<(AArch64call texternalsym:$func), (BL texternalsym:$func)>;
//===----------------------------------------------------------------------===//
// Exception generation instructions.
//===----------------------------------------------------------------------===//
+let isTrap = 1 in {
def BRK : ExceptionGeneration<0b001, 0b00, "brk">;
+}
def DCPS1 : ExceptionGeneration<0b101, 0b01, "dcps1">;
def DCPS2 : ExceptionGeneration<0b101, 0b10, "dcps2">;
def DCPS3 : ExceptionGeneration<0b101, 0b11, "dcps3">;
@@ -1429,39 +1498,39 @@ def : InstAlias<"dcps3", (DCPS3 0)>;
//===----------------------------------------------------------------------===//
// Pair (indexed, offset)
-defm LDPW : LoadPairOffset<0b00, 0, GPR32, simm7s4, "ldp">;
-defm LDPX : LoadPairOffset<0b10, 0, GPR64, simm7s8, "ldp">;
-defm LDPS : LoadPairOffset<0b00, 1, FPR32, simm7s4, "ldp">;
-defm LDPD : LoadPairOffset<0b01, 1, FPR64, simm7s8, "ldp">;
-defm LDPQ : LoadPairOffset<0b10, 1, FPR128, simm7s16, "ldp">;
+defm LDPW : LoadPairOffset<0b00, 0, GPR32z, simm7s4, "ldp">;
+defm LDPX : LoadPairOffset<0b10, 0, GPR64z, simm7s8, "ldp">;
+defm LDPS : LoadPairOffset<0b00, 1, FPR32Op, simm7s4, "ldp">;
+defm LDPD : LoadPairOffset<0b01, 1, FPR64Op, simm7s8, "ldp">;
+defm LDPQ : LoadPairOffset<0b10, 1, FPR128Op, simm7s16, "ldp">;
-defm LDPSW : LoadPairOffset<0b01, 0, GPR64, simm7s4, "ldpsw">;
+defm LDPSW : LoadPairOffset<0b01, 0, GPR64z, simm7s4, "ldpsw">;
// Pair (pre-indexed)
-def LDPWpre : LoadPairPreIdx<0b00, 0, GPR32, simm7s4, "ldp">;
-def LDPXpre : LoadPairPreIdx<0b10, 0, GPR64, simm7s8, "ldp">;
-def LDPSpre : LoadPairPreIdx<0b00, 1, FPR32, simm7s4, "ldp">;
-def LDPDpre : LoadPairPreIdx<0b01, 1, FPR64, simm7s8, "ldp">;
-def LDPQpre : LoadPairPreIdx<0b10, 1, FPR128, simm7s16, "ldp">;
+def LDPWpre : LoadPairPreIdx<0b00, 0, GPR32z, simm7s4, "ldp">;
+def LDPXpre : LoadPairPreIdx<0b10, 0, GPR64z, simm7s8, "ldp">;
+def LDPSpre : LoadPairPreIdx<0b00, 1, FPR32Op, simm7s4, "ldp">;
+def LDPDpre : LoadPairPreIdx<0b01, 1, FPR64Op, simm7s8, "ldp">;
+def LDPQpre : LoadPairPreIdx<0b10, 1, FPR128Op, simm7s16, "ldp">;
-def LDPSWpre : LoadPairPreIdx<0b01, 0, GPR64, simm7s4, "ldpsw">;
+def LDPSWpre : LoadPairPreIdx<0b01, 0, GPR64z, simm7s4, "ldpsw">;
// Pair (post-indexed)
-def LDPWpost : LoadPairPostIdx<0b00, 0, GPR32, simm7s4, "ldp">;
-def LDPXpost : LoadPairPostIdx<0b10, 0, GPR64, simm7s8, "ldp">;
-def LDPSpost : LoadPairPostIdx<0b00, 1, FPR32, simm7s4, "ldp">;
-def LDPDpost : LoadPairPostIdx<0b01, 1, FPR64, simm7s8, "ldp">;
-def LDPQpost : LoadPairPostIdx<0b10, 1, FPR128, simm7s16, "ldp">;
+def LDPWpost : LoadPairPostIdx<0b00, 0, GPR32z, simm7s4, "ldp">;
+def LDPXpost : LoadPairPostIdx<0b10, 0, GPR64z, simm7s8, "ldp">;
+def LDPSpost : LoadPairPostIdx<0b00, 1, FPR32Op, simm7s4, "ldp">;
+def LDPDpost : LoadPairPostIdx<0b01, 1, FPR64Op, simm7s8, "ldp">;
+def LDPQpost : LoadPairPostIdx<0b10, 1, FPR128Op, simm7s16, "ldp">;
-def LDPSWpost : LoadPairPostIdx<0b01, 0, GPR64, simm7s4, "ldpsw">;
+def LDPSWpost : LoadPairPostIdx<0b01, 0, GPR64z, simm7s4, "ldpsw">;
// Pair (no allocate)
-defm LDNPW : LoadPairNoAlloc<0b00, 0, GPR32, simm7s4, "ldnp">;
-defm LDNPX : LoadPairNoAlloc<0b10, 0, GPR64, simm7s8, "ldnp">;
-defm LDNPS : LoadPairNoAlloc<0b00, 1, FPR32, simm7s4, "ldnp">;
-defm LDNPD : LoadPairNoAlloc<0b01, 1, FPR64, simm7s8, "ldnp">;
-defm LDNPQ : LoadPairNoAlloc<0b10, 1, FPR128, simm7s16, "ldnp">;
+defm LDNPW : LoadPairNoAlloc<0b00, 0, GPR32z, simm7s4, "ldnp">;
+defm LDNPX : LoadPairNoAlloc<0b10, 0, GPR64z, simm7s8, "ldnp">;
+defm LDNPS : LoadPairNoAlloc<0b00, 1, FPR32Op, simm7s4, "ldnp">;
+defm LDNPD : LoadPairNoAlloc<0b01, 1, FPR64Op, simm7s8, "ldnp">;
+defm LDNPQ : LoadPairNoAlloc<0b10, 1, FPR128Op, simm7s16, "ldnp">;
//---
// (register offset)
@@ -1474,11 +1543,11 @@ defm LDRW : Load32RO<0b10, 0, 0b01, GPR32, "ldr", i32, load>;
defm LDRX : Load64RO<0b11, 0, 0b01, GPR64, "ldr", i64, load>;
// Floating-point
-defm LDRB : Load8RO<0b00, 1, 0b01, FPR8, "ldr", untyped, load>;
-defm LDRH : Load16RO<0b01, 1, 0b01, FPR16, "ldr", f16, load>;
-defm LDRS : Load32RO<0b10, 1, 0b01, FPR32, "ldr", f32, load>;
-defm LDRD : Load64RO<0b11, 1, 0b01, FPR64, "ldr", f64, load>;
-defm LDRQ : Load128RO<0b00, 1, 0b11, FPR128, "ldr", f128, load>;
+defm LDRB : Load8RO<0b00, 1, 0b01, FPR8Op, "ldr", untyped, load>;
+defm LDRH : Load16RO<0b01, 1, 0b01, FPR16Op, "ldr", f16, load>;
+defm LDRS : Load32RO<0b10, 1, 0b01, FPR32Op, "ldr", f32, load>;
+defm LDRD : Load64RO<0b11, 1, 0b01, FPR64Op, "ldr", f64, load>;
+defm LDRQ : Load128RO<0b00, 1, 0b11, FPR128Op, "ldr", f128, load>;
// Load sign-extended half-word
defm LDRSHW : Load16RO<0b01, 0, 0b11, GPR32, "ldrsh", i32, sextloadi16>;
@@ -1640,26 +1709,26 @@ let AddedComplexity = 10 in {
//---
// (unsigned immediate)
//---
-defm LDRX : LoadUI<0b11, 0, 0b01, GPR64, uimm12s8, "ldr",
- [(set GPR64:$Rt,
+defm LDRX : LoadUI<0b11, 0, 0b01, GPR64z, uimm12s8, "ldr",
+ [(set GPR64z:$Rt,
(load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>;
-defm LDRW : LoadUI<0b10, 0, 0b01, GPR32, uimm12s4, "ldr",
- [(set GPR32:$Rt,
+defm LDRW : LoadUI<0b10, 0, 0b01, GPR32z, uimm12s4, "ldr",
+ [(set GPR32z:$Rt,
(load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>;
-defm LDRB : LoadUI<0b00, 1, 0b01, FPR8, uimm12s1, "ldr",
- [(set FPR8:$Rt,
+defm LDRB : LoadUI<0b00, 1, 0b01, FPR8Op, uimm12s1, "ldr",
+ [(set FPR8Op:$Rt,
(load (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)))]>;
-defm LDRH : LoadUI<0b01, 1, 0b01, FPR16, uimm12s2, "ldr",
- [(set (f16 FPR16:$Rt),
+defm LDRH : LoadUI<0b01, 1, 0b01, FPR16Op, uimm12s2, "ldr",
+ [(set (f16 FPR16Op:$Rt),
(load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)))]>;
-defm LDRS : LoadUI<0b10, 1, 0b01, FPR32, uimm12s4, "ldr",
- [(set (f32 FPR32:$Rt),
+defm LDRS : LoadUI<0b10, 1, 0b01, FPR32Op, uimm12s4, "ldr",
+ [(set (f32 FPR32Op:$Rt),
(load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>;
-defm LDRD : LoadUI<0b11, 1, 0b01, FPR64, uimm12s8, "ldr",
- [(set (f64 FPR64:$Rt),
+defm LDRD : LoadUI<0b11, 1, 0b01, FPR64Op, uimm12s8, "ldr",
+ [(set (f64 FPR64Op:$Rt),
(load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>;
-defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128, uimm12s16, "ldr",
- [(set (f128 FPR128:$Rt),
+defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128Op, uimm12s16, "ldr",
+ [(set (f128 FPR128Op:$Rt),
(load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)))]>;
// For regular load, we do not have any alignment requirement.
@@ -1814,14 +1883,14 @@ def : InstAlias<"prfm $Rt, [$Rn]", (PRFMui prfop:$Rt, GPR64sp:$Rn, 0)>;
//---
// (literal)
-def LDRWl : LoadLiteral<0b00, 0, GPR32, "ldr">;
-def LDRXl : LoadLiteral<0b01, 0, GPR64, "ldr">;
-def LDRSl : LoadLiteral<0b00, 1, FPR32, "ldr">;
-def LDRDl : LoadLiteral<0b01, 1, FPR64, "ldr">;
-def LDRQl : LoadLiteral<0b10, 1, FPR128, "ldr">;
+def LDRWl : LoadLiteral<0b00, 0, GPR32z, "ldr">;
+def LDRXl : LoadLiteral<0b01, 0, GPR64z, "ldr">;
+def LDRSl : LoadLiteral<0b00, 1, FPR32Op, "ldr">;
+def LDRDl : LoadLiteral<0b01, 1, FPR64Op, "ldr">;
+def LDRQl : LoadLiteral<0b10, 1, FPR128Op, "ldr">;
// load sign-extended word
-def LDRSWl : LoadLiteral<0b10, 0, GPR64, "ldrsw">;
+def LDRSWl : LoadLiteral<0b10, 0, GPR64z, "ldrsw">;
// prefetch
def PRFMl : PrefetchLiteral<0b11, 0, "prfm", []>;
@@ -1829,26 +1898,26 @@ def PRFMl : PrefetchLiteral<0b11, 0, "prfm", []>;
//---
// (unscaled immediate)
-defm LDURX : LoadUnscaled<0b11, 0, 0b01, GPR64, "ldur",
- [(set GPR64:$Rt,
+defm LDURX : LoadUnscaled<0b11, 0, 0b01, GPR64z, "ldur",
+ [(set GPR64z:$Rt,
(load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>;
-defm LDURW : LoadUnscaled<0b10, 0, 0b01, GPR32, "ldur",
- [(set GPR32:$Rt,
+defm LDURW : LoadUnscaled<0b10, 0, 0b01, GPR32z, "ldur",
+ [(set GPR32z:$Rt,
(load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
-defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8, "ldur",
- [(set FPR8:$Rt,
+defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8Op, "ldur",
+ [(set FPR8Op:$Rt,
(load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
-defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16, "ldur",
- [(set FPR16:$Rt,
+defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16Op, "ldur",
+ [(set FPR16Op:$Rt,
(load (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
-defm LDURS : LoadUnscaled<0b10, 1, 0b01, FPR32, "ldur",
- [(set (f32 FPR32:$Rt),
+defm LDURS : LoadUnscaled<0b10, 1, 0b01, FPR32Op, "ldur",
+ [(set (f32 FPR32Op:$Rt),
(load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
-defm LDURD : LoadUnscaled<0b11, 1, 0b01, FPR64, "ldur",
- [(set (f64 FPR64:$Rt),
+defm LDURD : LoadUnscaled<0b11, 1, 0b01, FPR64Op, "ldur",
+ [(set (f64 FPR64Op:$Rt),
(load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>;
-defm LDURQ : LoadUnscaled<0b00, 1, 0b11, FPR128, "ldur",
- [(set (f128 FPR128:$Rt),
+defm LDURQ : LoadUnscaled<0b00, 1, 0b11, FPR128Op, "ldur",
+ [(set (f128 FPR128Op:$Rt),
(load (am_unscaled128 GPR64sp:$Rn, simm9:$offset)))]>;
defm LDURHH
@@ -1968,15 +2037,15 @@ def : InstAlias<"ldr $Rt, [$Rn, $offset]",
def : InstAlias<"ldr $Rt, [$Rn, $offset]",
(LDURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
def : InstAlias<"ldr $Rt, [$Rn, $offset]",
- (LDURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+ (LDURBi FPR8Op:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
def : InstAlias<"ldr $Rt, [$Rn, $offset]",
- (LDURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+ (LDURHi FPR16Op:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
def : InstAlias<"ldr $Rt, [$Rn, $offset]",
- (LDURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+ (LDURSi FPR32Op:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
def : InstAlias<"ldr $Rt, [$Rn, $offset]",
- (LDURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+ (LDURDi FPR64Op:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
def : InstAlias<"ldr $Rt, [$Rn, $offset]",
- (LDURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>;
+ (LDURQi FPR128Op:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>;
// zextload -> i64
def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
@@ -2052,53 +2121,53 @@ defm LDTRSW : LoadUnprivileged<0b10, 0, 0b10, GPR64, "ldtrsw">;
//---
// (immediate pre-indexed)
-def LDRWpre : LoadPreIdx<0b10, 0, 0b01, GPR32, "ldr">;
-def LDRXpre : LoadPreIdx<0b11, 0, 0b01, GPR64, "ldr">;
-def LDRBpre : LoadPreIdx<0b00, 1, 0b01, FPR8, "ldr">;
-def LDRHpre : LoadPreIdx<0b01, 1, 0b01, FPR16, "ldr">;
-def LDRSpre : LoadPreIdx<0b10, 1, 0b01, FPR32, "ldr">;
-def LDRDpre : LoadPreIdx<0b11, 1, 0b01, FPR64, "ldr">;
-def LDRQpre : LoadPreIdx<0b00, 1, 0b11, FPR128, "ldr">;
+def LDRWpre : LoadPreIdx<0b10, 0, 0b01, GPR32z, "ldr">;
+def LDRXpre : LoadPreIdx<0b11, 0, 0b01, GPR64z, "ldr">;
+def LDRBpre : LoadPreIdx<0b00, 1, 0b01, FPR8Op, "ldr">;
+def LDRHpre : LoadPreIdx<0b01, 1, 0b01, FPR16Op, "ldr">;
+def LDRSpre : LoadPreIdx<0b10, 1, 0b01, FPR32Op, "ldr">;
+def LDRDpre : LoadPreIdx<0b11, 1, 0b01, FPR64Op, "ldr">;
+def LDRQpre : LoadPreIdx<0b00, 1, 0b11, FPR128Op, "ldr">;
// load sign-extended half-word
-def LDRSHWpre : LoadPreIdx<0b01, 0, 0b11, GPR32, "ldrsh">;
-def LDRSHXpre : LoadPreIdx<0b01, 0, 0b10, GPR64, "ldrsh">;
+def LDRSHWpre : LoadPreIdx<0b01, 0, 0b11, GPR32z, "ldrsh">;
+def LDRSHXpre : LoadPreIdx<0b01, 0, 0b10, GPR64z, "ldrsh">;
// load sign-extended byte
-def LDRSBWpre : LoadPreIdx<0b00, 0, 0b11, GPR32, "ldrsb">;
-def LDRSBXpre : LoadPreIdx<0b00, 0, 0b10, GPR64, "ldrsb">;
+def LDRSBWpre : LoadPreIdx<0b00, 0, 0b11, GPR32z, "ldrsb">;
+def LDRSBXpre : LoadPreIdx<0b00, 0, 0b10, GPR64z, "ldrsb">;
// load zero-extended byte
-def LDRBBpre : LoadPreIdx<0b00, 0, 0b01, GPR32, "ldrb">;
-def LDRHHpre : LoadPreIdx<0b01, 0, 0b01, GPR32, "ldrh">;
+def LDRBBpre : LoadPreIdx<0b00, 0, 0b01, GPR32z, "ldrb">;
+def LDRHHpre : LoadPreIdx<0b01, 0, 0b01, GPR32z, "ldrh">;
// load sign-extended word
-def LDRSWpre : LoadPreIdx<0b10, 0, 0b10, GPR64, "ldrsw">;
+def LDRSWpre : LoadPreIdx<0b10, 0, 0b10, GPR64z, "ldrsw">;
//---
// (immediate post-indexed)
-def LDRWpost : LoadPostIdx<0b10, 0, 0b01, GPR32, "ldr">;
-def LDRXpost : LoadPostIdx<0b11, 0, 0b01, GPR64, "ldr">;
-def LDRBpost : LoadPostIdx<0b00, 1, 0b01, FPR8, "ldr">;
-def LDRHpost : LoadPostIdx<0b01, 1, 0b01, FPR16, "ldr">;
-def LDRSpost : LoadPostIdx<0b10, 1, 0b01, FPR32, "ldr">;
-def LDRDpost : LoadPostIdx<0b11, 1, 0b01, FPR64, "ldr">;
-def LDRQpost : LoadPostIdx<0b00, 1, 0b11, FPR128, "ldr">;
+def LDRWpost : LoadPostIdx<0b10, 0, 0b01, GPR32z, "ldr">;
+def LDRXpost : LoadPostIdx<0b11, 0, 0b01, GPR64z, "ldr">;
+def LDRBpost : LoadPostIdx<0b00, 1, 0b01, FPR8Op, "ldr">;
+def LDRHpost : LoadPostIdx<0b01, 1, 0b01, FPR16Op, "ldr">;
+def LDRSpost : LoadPostIdx<0b10, 1, 0b01, FPR32Op, "ldr">;
+def LDRDpost : LoadPostIdx<0b11, 1, 0b01, FPR64Op, "ldr">;
+def LDRQpost : LoadPostIdx<0b00, 1, 0b11, FPR128Op, "ldr">;
// load sign-extended half-word
-def LDRSHWpost : LoadPostIdx<0b01, 0, 0b11, GPR32, "ldrsh">;
-def LDRSHXpost : LoadPostIdx<0b01, 0, 0b10, GPR64, "ldrsh">;
+def LDRSHWpost : LoadPostIdx<0b01, 0, 0b11, GPR32z, "ldrsh">;
+def LDRSHXpost : LoadPostIdx<0b01, 0, 0b10, GPR64z, "ldrsh">;
// load sign-extended byte
-def LDRSBWpost : LoadPostIdx<0b00, 0, 0b11, GPR32, "ldrsb">;
-def LDRSBXpost : LoadPostIdx<0b00, 0, 0b10, GPR64, "ldrsb">;
+def LDRSBWpost : LoadPostIdx<0b00, 0, 0b11, GPR32z, "ldrsb">;
+def LDRSBXpost : LoadPostIdx<0b00, 0, 0b10, GPR64z, "ldrsb">;
// load zero-extended byte
-def LDRBBpost : LoadPostIdx<0b00, 0, 0b01, GPR32, "ldrb">;
-def LDRHHpost : LoadPostIdx<0b01, 0, 0b01, GPR32, "ldrh">;
+def LDRBBpost : LoadPostIdx<0b00, 0, 0b01, GPR32z, "ldrb">;
+def LDRHHpost : LoadPostIdx<0b01, 0, 0b01, GPR32z, "ldrh">;
// load sign-extended word
-def LDRSWpost : LoadPostIdx<0b10, 0, 0b10, GPR64, "ldrsw">;
+def LDRSWpost : LoadPostIdx<0b10, 0, 0b10, GPR64z, "ldrsw">;
//===----------------------------------------------------------------------===//
// Store instructions.
@@ -2106,32 +2175,32 @@ def LDRSWpost : LoadPostIdx<0b10, 0, 0b10, GPR64, "ldrsw">;
// Pair (indexed, offset)
// FIXME: Use dedicated range-checked addressing mode operand here.
-defm STPW : StorePairOffset<0b00, 0, GPR32, simm7s4, "stp">;
-defm STPX : StorePairOffset<0b10, 0, GPR64, simm7s8, "stp">;
-defm STPS : StorePairOffset<0b00, 1, FPR32, simm7s4, "stp">;
-defm STPD : StorePairOffset<0b01, 1, FPR64, simm7s8, "stp">;
-defm STPQ : StorePairOffset<0b10, 1, FPR128, simm7s16, "stp">;
+defm STPW : StorePairOffset<0b00, 0, GPR32z, simm7s4, "stp">;
+defm STPX : StorePairOffset<0b10, 0, GPR64z, simm7s8, "stp">;
+defm STPS : StorePairOffset<0b00, 1, FPR32Op, simm7s4, "stp">;
+defm STPD : StorePairOffset<0b01, 1, FPR64Op, simm7s8, "stp">;
+defm STPQ : StorePairOffset<0b10, 1, FPR128Op, simm7s16, "stp">;
// Pair (pre-indexed)
-def STPWpre : StorePairPreIdx<0b00, 0, GPR32, simm7s4, "stp">;
-def STPXpre : StorePairPreIdx<0b10, 0, GPR64, simm7s8, "stp">;
-def STPSpre : StorePairPreIdx<0b00, 1, FPR32, simm7s4, "stp">;
-def STPDpre : StorePairPreIdx<0b01, 1, FPR64, simm7s8, "stp">;
-def STPQpre : StorePairPreIdx<0b10, 1, FPR128, simm7s16, "stp">;
+def STPWpre : StorePairPreIdx<0b00, 0, GPR32z, simm7s4, "stp">;
+def STPXpre : StorePairPreIdx<0b10, 0, GPR64z, simm7s8, "stp">;
+def STPSpre : StorePairPreIdx<0b00, 1, FPR32Op, simm7s4, "stp">;
+def STPDpre : StorePairPreIdx<0b01, 1, FPR64Op, simm7s8, "stp">;
+def STPQpre : StorePairPreIdx<0b10, 1, FPR128Op, simm7s16, "stp">;
// Pair (pre-indexed)
-def STPWpost : StorePairPostIdx<0b00, 0, GPR32, simm7s4, "stp">;
-def STPXpost : StorePairPostIdx<0b10, 0, GPR64, simm7s8, "stp">;
-def STPSpost : StorePairPostIdx<0b00, 1, FPR32, simm7s4, "stp">;
-def STPDpost : StorePairPostIdx<0b01, 1, FPR64, simm7s8, "stp">;
-def STPQpost : StorePairPostIdx<0b10, 1, FPR128, simm7s16, "stp">;
+def STPWpost : StorePairPostIdx<0b00, 0, GPR32z, simm7s4, "stp">;
+def STPXpost : StorePairPostIdx<0b10, 0, GPR64z, simm7s8, "stp">;
+def STPSpost : StorePairPostIdx<0b00, 1, FPR32Op, simm7s4, "stp">;
+def STPDpost : StorePairPostIdx<0b01, 1, FPR64Op, simm7s8, "stp">;
+def STPQpost : StorePairPostIdx<0b10, 1, FPR128Op, simm7s16, "stp">;
// Pair (no allocate)
-defm STNPW : StorePairNoAlloc<0b00, 0, GPR32, simm7s4, "stnp">;
-defm STNPX : StorePairNoAlloc<0b10, 0, GPR64, simm7s8, "stnp">;
-defm STNPS : StorePairNoAlloc<0b00, 1, FPR32, simm7s4, "stnp">;
-defm STNPD : StorePairNoAlloc<0b01, 1, FPR64, simm7s8, "stnp">;
-defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128, simm7s16, "stnp">;
+defm STNPW : StorePairNoAlloc<0b00, 0, GPR32z, simm7s4, "stnp">;
+defm STNPX : StorePairNoAlloc<0b10, 0, GPR64z, simm7s8, "stnp">;
+defm STNPS : StorePairNoAlloc<0b00, 1, FPR32Op, simm7s4, "stnp">;
+defm STNPD : StorePairNoAlloc<0b01, 1, FPR64Op, simm7s8, "stnp">;
+defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128Op, simm7s16, "stnp">;
//---
// (Register offset)
@@ -2144,11 +2213,11 @@ defm STRX : Store64RO<0b11, 0, 0b00, GPR64, "str", i64, store>;
// Floating-point
-defm STRB : Store8RO< 0b00, 1, 0b00, FPR8, "str", untyped, store>;
-defm STRH : Store16RO<0b01, 1, 0b00, FPR16, "str", f16, store>;
-defm STRS : Store32RO<0b10, 1, 0b00, FPR32, "str", f32, store>;
-defm STRD : Store64RO<0b11, 1, 0b00, FPR64, "str", f64, store>;
-defm STRQ : Store128RO<0b00, 1, 0b10, FPR128, "str", f128, store>;
+defm STRB : Store8RO< 0b00, 1, 0b00, FPR8Op, "str", untyped, store>;
+defm STRH : Store16RO<0b01, 1, 0b00, FPR16Op, "str", f16, store>;
+defm STRS : Store32RO<0b10, 1, 0b00, FPR32Op, "str", f32, store>;
+defm STRD : Store64RO<0b11, 1, 0b00, FPR64Op, "str", f64, store>;
+defm STRQ : Store128RO<0b00, 1, 0b10, FPR128Op, "str", f128, store>;
let Predicates = [UseSTRQro], AddedComplexity = 10 in {
def : Pat<(store (f128 FPR128:$Rt),
@@ -2239,12 +2308,11 @@ multiclass VecROStoreLane0Pat<ROAddrMode ro, SDPatternOperator storeop,
let AddedComplexity = 19 in {
defm : VecROStoreLane0Pat<ro16, truncstorei16, v8i16, i32, hsub, STRHroW, STRHroX>;
- defm : VecROStoreLane0Pat<ro16, store , v8i16, i16, hsub, STRHroW, STRHroX>;
- defm : VecROStoreLane0Pat<ro32, truncstorei32, v4i32, i32, ssub, STRSroW, STRSroX>;
- defm : VecROStoreLane0Pat<ro32, store , v4i32, i32, ssub, STRSroW, STRSroX>;
- defm : VecROStoreLane0Pat<ro32, store , v4f32, f32, ssub, STRSroW, STRSroX>;
- defm : VecROStoreLane0Pat<ro64, store , v2i64, i64, dsub, STRDroW, STRDroX>;
- defm : VecROStoreLane0Pat<ro64, store , v2f64, f64, dsub, STRDroW, STRDroX>;
+ defm : VecROStoreLane0Pat<ro16, store, v8f16, f16, hsub, STRHroW, STRHroX>;
+ defm : VecROStoreLane0Pat<ro32, store, v4i32, i32, ssub, STRSroW, STRSroX>;
+ defm : VecROStoreLane0Pat<ro32, store, v4f32, f32, ssub, STRSroW, STRSroX>;
+ defm : VecROStoreLane0Pat<ro64, store, v2i64, i64, dsub, STRDroW, STRDroX>;
+ defm : VecROStoreLane0Pat<ro64, store, v2f64, f64, dsub, STRDroW, STRDroX>;
}
//---
@@ -2255,19 +2323,19 @@ defm STRX : StoreUIz<0b11, 0, 0b00, GPR64z, uimm12s8, "str",
defm STRW : StoreUIz<0b10, 0, 0b00, GPR32z, uimm12s4, "str",
[(store GPR32z:$Rt,
(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>;
-defm STRB : StoreUI<0b00, 1, 0b00, FPR8, uimm12s1, "str",
- [(store FPR8:$Rt,
+defm STRB : StoreUI<0b00, 1, 0b00, FPR8Op, uimm12s1, "str",
+ [(store FPR8Op:$Rt,
(am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>;
-defm STRH : StoreUI<0b01, 1, 0b00, FPR16, uimm12s2, "str",
- [(store (f16 FPR16:$Rt),
+defm STRH : StoreUI<0b01, 1, 0b00, FPR16Op, uimm12s2, "str",
+ [(store (f16 FPR16Op:$Rt),
(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))]>;
-defm STRS : StoreUI<0b10, 1, 0b00, FPR32, uimm12s4, "str",
- [(store (f32 FPR32:$Rt),
+defm STRS : StoreUI<0b10, 1, 0b00, FPR32Op, uimm12s4, "str",
+ [(store (f32 FPR32Op:$Rt),
(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>;
-defm STRD : StoreUI<0b11, 1, 0b00, FPR64, uimm12s8, "str",
- [(store (f64 FPR64:$Rt),
+defm STRD : StoreUI<0b11, 1, 0b00, FPR64Op, uimm12s8, "str",
+ [(store (f64 FPR64Op:$Rt),
(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>;
-defm STRQ : StoreUI<0b00, 1, 0b10, FPR128, uimm12s16, "str", []>;
+defm STRQ : StoreUI<0b00, 1, 0b10, FPR128Op, uimm12s16, "str", []>;
defm STRHH : StoreUIz<0b01, 0, 0b00, GPR32z, uimm12s2, "strh",
[(truncstorei16 GPR32z:$Rt,
@@ -2278,8 +2346,16 @@ defm STRBB : StoreUIz<0b00, 0, 0b00, GPR32z, uimm12s1, "strb",
(am_indexed8 GPR64sp:$Rn,
uimm12s1:$offset))]>;
-// Match all store 64 bits width whose type is compatible with FPR64
let AddedComplexity = 10 in {
+
+// Match all 64-bit-wide stores whose type is compatible with FPR64
+def : Pat<(store (v1i64 FPR64:$Rt),
+ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+ (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat<(store (v1f64 FPR64:$Rt),
+ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+ (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+
let Predicates = [IsLE] in {
// We must use ST1 to store vectors in big-endian.
def : Pat<(store (v2f32 FPR64:$Rt),
@@ -2298,14 +2374,12 @@ let Predicates = [IsLE] in {
(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
(STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
}
-def : Pat<(store (v1f64 FPR64:$Rt),
- (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
- (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
-def : Pat<(store (v1i64 FPR64:$Rt),
- (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
- (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
// Match all store 128 bits width whose type is compatible with FPR128
+def : Pat<(store (f128 FPR128:$Rt),
+ (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+ (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+
let Predicates = [IsLE] in {
// We must use ST1 to store vectors in big-endian.
def : Pat<(store (v4f32 FPR128:$Rt),
@@ -2330,9 +2404,6 @@ let Predicates = [IsLE] in {
(am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
(STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
}
-def : Pat<(store (f128 FPR128:$Rt),
- (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
- (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
// truncstore i64
def : Pat<(truncstorei32 GPR64:$Rt,
@@ -2346,37 +2417,81 @@ def : Pat<(truncstorei8 GPR64:$Rt, (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)),
} // AddedComplexity = 10
+// Match stores from lane 0 to the appropriate subreg's store.
+multiclass VecStoreLane0Pat<Operand UIAddrMode, SDPatternOperator storeop,
+ ValueType VTy, ValueType STy,
+ SubRegIndex SubRegIdx, Operand IndexType,
+ Instruction STR> {
+ def : Pat<(storeop (STy (vector_extract (VTy VecListOne128:$Vt), 0)),
+ (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
+ (STR (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx),
+ GPR64sp:$Rn, IndexType:$offset)>;
+}
+
+let AddedComplexity = 19 in {
+ defm : VecStoreLane0Pat<am_indexed16, truncstorei16, v8i16, i32, hsub, uimm12s2, STRHui>;
+ defm : VecStoreLane0Pat<am_indexed16, store, v8f16, f16, hsub, uimm12s2, STRHui>;
+ defm : VecStoreLane0Pat<am_indexed32, store, v4i32, i32, ssub, uimm12s4, STRSui>;
+ defm : VecStoreLane0Pat<am_indexed32, store, v4f32, f32, ssub, uimm12s4, STRSui>;
+ defm : VecStoreLane0Pat<am_indexed64, store, v2i64, i64, dsub, uimm12s8, STRDui>;
+ defm : VecStoreLane0Pat<am_indexed64, store, v2f64, f64, dsub, uimm12s8, STRDui>;
+}
+
//---
// (unscaled immediate)
-defm STURX : StoreUnscaled<0b11, 0, 0b00, GPR64, "stur",
- [(store GPR64:$Rt,
+defm STURX : StoreUnscaled<0b11, 0, 0b00, GPR64z, "stur",
+ [(store GPR64z:$Rt,
(am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>;
-defm STURW : StoreUnscaled<0b10, 0, 0b00, GPR32, "stur",
- [(store GPR32:$Rt,
+defm STURW : StoreUnscaled<0b10, 0, 0b00, GPR32z, "stur",
+ [(store GPR32z:$Rt,
(am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>;
-defm STURB : StoreUnscaled<0b00, 1, 0b00, FPR8, "stur",
- [(store FPR8:$Rt,
+defm STURB : StoreUnscaled<0b00, 1, 0b00, FPR8Op, "stur",
+ [(store FPR8Op:$Rt,
(am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>;
-defm STURH : StoreUnscaled<0b01, 1, 0b00, FPR16, "stur",
- [(store (f16 FPR16:$Rt),
+defm STURH : StoreUnscaled<0b01, 1, 0b00, FPR16Op, "stur",
+ [(store (f16 FPR16Op:$Rt),
(am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>;
-defm STURS : StoreUnscaled<0b10, 1, 0b00, FPR32, "stur",
- [(store (f32 FPR32:$Rt),
+defm STURS : StoreUnscaled<0b10, 1, 0b00, FPR32Op, "stur",
+ [(store (f32 FPR32Op:$Rt),
(am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>;
-defm STURD : StoreUnscaled<0b11, 1, 0b00, FPR64, "stur",
- [(store (f64 FPR64:$Rt),
+defm STURD : StoreUnscaled<0b11, 1, 0b00, FPR64Op, "stur",
+ [(store (f64 FPR64Op:$Rt),
(am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>;
-defm STURQ : StoreUnscaled<0b00, 1, 0b10, FPR128, "stur",
- [(store (f128 FPR128:$Rt),
+defm STURQ : StoreUnscaled<0b00, 1, 0b10, FPR128Op, "stur",
+ [(store (f128 FPR128Op:$Rt),
(am_unscaled128 GPR64sp:$Rn, simm9:$offset))]>;
-defm STURHH : StoreUnscaled<0b01, 0, 0b00, GPR32, "sturh",
- [(truncstorei16 GPR32:$Rt,
+defm STURHH : StoreUnscaled<0b01, 0, 0b00, GPR32z, "sturh",
+ [(truncstorei16 GPR32z:$Rt,
(am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>;
-defm STURBB : StoreUnscaled<0b00, 0, 0b00, GPR32, "sturb",
- [(truncstorei8 GPR32:$Rt,
+defm STURBB : StoreUnscaled<0b00, 0, 0b00, GPR32z, "sturb",
+ [(truncstorei8 GPR32z:$Rt,
(am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>;
+// Armv8.4 LDAPR & STLR with Immediate Offset instructions
+let Predicates = [HasV8_4a] in {
+defm STLURB : BaseStoreUnscaleV84<"stlurb", 0b00, 0b00, GPR32>;
+defm STLURH : BaseStoreUnscaleV84<"stlurh", 0b01, 0b00, GPR32>;
+defm STLURW : BaseStoreUnscaleV84<"stlur", 0b10, 0b00, GPR32>;
+defm STLURX : BaseStoreUnscaleV84<"stlur", 0b11, 0b00, GPR64>;
+defm LDAPURB : BaseLoadUnscaleV84<"ldapurb", 0b00, 0b01, GPR32>;
+defm LDAPURSBW : BaseLoadUnscaleV84<"ldapursb", 0b00, 0b11, GPR32>;
+defm LDAPURSBX : BaseLoadUnscaleV84<"ldapursb", 0b00, 0b10, GPR64>;
+defm LDAPURH : BaseLoadUnscaleV84<"ldapurh", 0b01, 0b01, GPR32>;
+defm LDAPURSHW : BaseLoadUnscaleV84<"ldapursh", 0b01, 0b11, GPR32>;
+defm LDAPURSHX : BaseLoadUnscaleV84<"ldapursh", 0b01, 0b10, GPR64>;
+defm LDAPUR : BaseLoadUnscaleV84<"ldapur", 0b10, 0b01, GPR32>;
+defm LDAPURSW : BaseLoadUnscaleV84<"ldapursw", 0b10, 0b10, GPR64>;
+defm LDAPURX : BaseLoadUnscaleV84<"ldapur", 0b11, 0b01, GPR64>;
+}
+
// Match all store 64 bits width whose type is compatible with FPR64
+def : Pat<(store (v1f64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+ (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(store (v1i64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+ (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+
+let AddedComplexity = 10 in {
+
let Predicates = [IsLE] in {
// We must use ST1 to store vectors in big-endian.
def : Pat<(store (v2f32 FPR64:$Rt),
@@ -2395,12 +2510,11 @@ let Predicates = [IsLE] in {
(am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
(STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
}
-def : Pat<(store (v1f64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
- (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
-def : Pat<(store (v1i64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
- (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
// Match all store 128 bits width whose type is compatible with FPR128
+def : Pat<(store (f128 FPR128:$Rt), (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+ (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+
let Predicates = [IsLE] in {
// We must use ST1 to store vectors in big-endian.
def : Pat<(store (v4f32 FPR128:$Rt),
@@ -2429,6 +2543,8 @@ let Predicates = [IsLE] in {
(STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
}
+} // AddedComplexity = 10
+
// unscaled i64 truncating stores
def : Pat<(truncstorei32 GPR64:$Rt, (am_unscaled32 GPR64sp:$Rn, simm9:$offset)),
(STURWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
@@ -2437,6 +2553,22 @@ def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
(STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
+// Match stores from lane 0 to the appropriate subreg's store.
+multiclass VecStoreULane0Pat<SDPatternOperator StoreOp,
+ ValueType VTy, ValueType STy,
+ SubRegIndex SubRegIdx, Instruction STR> {
+ defm : VecStoreLane0Pat<am_unscaled128, StoreOp, VTy, STy, SubRegIdx, simm9, STR>;
+}
+
+let AddedComplexity = 19 in {
+ defm : VecStoreULane0Pat<truncstorei16, v8i16, i32, hsub, STURHi>;
+ defm : VecStoreULane0Pat<store, v8f16, f16, hsub, STURHi>;
+ defm : VecStoreULane0Pat<store, v4i32, i32, ssub, STURSi>;
+ defm : VecStoreULane0Pat<store, v4f32, f32, ssub, STURSi>;
+ defm : VecStoreULane0Pat<store, v2i64, i64, dsub, STURDi>;
+ defm : VecStoreULane0Pat<store, v2f64, f64, dsub, STURDi>;
+}
+
//---
// STR mnemonics fall back to STUR for negative or unaligned offsets.
def : InstAlias<"str $Rt, [$Rn, $offset]",
@@ -2444,15 +2576,15 @@ def : InstAlias<"str $Rt, [$Rn, $offset]",
def : InstAlias<"str $Rt, [$Rn, $offset]",
(STURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
def : InstAlias<"str $Rt, [$Rn, $offset]",
- (STURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+ (STURBi FPR8Op:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
def : InstAlias<"str $Rt, [$Rn, $offset]",
- (STURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+ (STURHi FPR16Op:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
def : InstAlias<"str $Rt, [$Rn, $offset]",
- (STURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+ (STURSi FPR32Op:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
def : InstAlias<"str $Rt, [$Rn, $offset]",
- (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+ (STURDi FPR64Op:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
def : InstAlias<"str $Rt, [$Rn, $offset]",
- (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>;
+ (STURQi FPR128Op:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>;
def : InstAlias<"strb $Rt, [$Rn, $offset]",
(STURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
@@ -2469,16 +2601,16 @@ defm STTRB : StoreUnprivileged<0b00, 0, 0b00, GPR32, "sttrb">;
//---
// (immediate pre-indexed)
-def STRWpre : StorePreIdx<0b10, 0, 0b00, GPR32, "str", pre_store, i32>;
-def STRXpre : StorePreIdx<0b11, 0, 0b00, GPR64, "str", pre_store, i64>;
-def STRBpre : StorePreIdx<0b00, 1, 0b00, FPR8, "str", pre_store, untyped>;
-def STRHpre : StorePreIdx<0b01, 1, 0b00, FPR16, "str", pre_store, f16>;
-def STRSpre : StorePreIdx<0b10, 1, 0b00, FPR32, "str", pre_store, f32>;
-def STRDpre : StorePreIdx<0b11, 1, 0b00, FPR64, "str", pre_store, f64>;
-def STRQpre : StorePreIdx<0b00, 1, 0b10, FPR128, "str", pre_store, f128>;
+def STRWpre : StorePreIdx<0b10, 0, 0b00, GPR32z, "str", pre_store, i32>;
+def STRXpre : StorePreIdx<0b11, 0, 0b00, GPR64z, "str", pre_store, i64>;
+def STRBpre : StorePreIdx<0b00, 1, 0b00, FPR8Op, "str", pre_store, untyped>;
+def STRHpre : StorePreIdx<0b01, 1, 0b00, FPR16Op, "str", pre_store, f16>;
+def STRSpre : StorePreIdx<0b10, 1, 0b00, FPR32Op, "str", pre_store, f32>;
+def STRDpre : StorePreIdx<0b11, 1, 0b00, FPR64Op, "str", pre_store, f64>;
+def STRQpre : StorePreIdx<0b00, 1, 0b10, FPR128Op, "str", pre_store, f128>;
-def STRBBpre : StorePreIdx<0b00, 0, 0b00, GPR32, "strb", pre_truncsti8, i32>;
-def STRHHpre : StorePreIdx<0b01, 0, 0b00, GPR32, "strh", pre_truncsti16, i32>;
+def STRBBpre : StorePreIdx<0b00, 0, 0b00, GPR32z, "strb", pre_truncsti8, i32>;
+def STRHHpre : StorePreIdx<0b01, 0, 0b00, GPR32z, "strh", pre_truncsti16, i32>;
// truncstore i64
def : Pat<(pre_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
@@ -2523,16 +2655,16 @@ def : Pat<(pre_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
//---
// (immediate post-indexed)
-def STRWpost : StorePostIdx<0b10, 0, 0b00, GPR32, "str", post_store, i32>;
-def STRXpost : StorePostIdx<0b11, 0, 0b00, GPR64, "str", post_store, i64>;
-def STRBpost : StorePostIdx<0b00, 1, 0b00, FPR8, "str", post_store, untyped>;
-def STRHpost : StorePostIdx<0b01, 1, 0b00, FPR16, "str", post_store, f16>;
-def STRSpost : StorePostIdx<0b10, 1, 0b00, FPR32, "str", post_store, f32>;
-def STRDpost : StorePostIdx<0b11, 1, 0b00, FPR64, "str", post_store, f64>;
-def STRQpost : StorePostIdx<0b00, 1, 0b10, FPR128, "str", post_store, f128>;
+def STRWpost : StorePostIdx<0b10, 0, 0b00, GPR32z, "str", post_store, i32>;
+def STRXpost : StorePostIdx<0b11, 0, 0b00, GPR64z, "str", post_store, i64>;
+def STRBpost : StorePostIdx<0b00, 1, 0b00, FPR8Op, "str", post_store, untyped>;
+def STRHpost : StorePostIdx<0b01, 1, 0b00, FPR16Op, "str", post_store, f16>;
+def STRSpost : StorePostIdx<0b10, 1, 0b00, FPR32Op, "str", post_store, f32>;
+def STRDpost : StorePostIdx<0b11, 1, 0b00, FPR64Op, "str", post_store, f64>;
+def STRQpost : StorePostIdx<0b00, 1, 0b10, FPR128Op, "str", post_store, f128>;
-def STRBBpost : StorePostIdx<0b00, 0, 0b00, GPR32, "strb", post_truncsti8, i32>;
-def STRHHpost : StorePostIdx<0b01, 0, 0b00, GPR32, "strh", post_truncsti16, i32>;
+def STRBBpost : StorePostIdx<0b00, 0, 0b00, GPR32z, "strb", post_truncsti8, i32>;
+def STRHHpost : StorePostIdx<0b01, 0, 0b00, GPR32z, "strh", post_truncsti16, i32>;
// truncstore i64
def : Pat<(post_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
@@ -2713,7 +2845,7 @@ defm FMOV : UnscaledConversion<"fmov">;
// Add pseudo ops for FMOV 0 so we can mark them as isReMaterializable
let isReMaterializable = 1, isCodeGenOnly = 1, isAsCheapAsAMove = 1 in {
def FMOVH0 : Pseudo<(outs FPR16:$Rd), (ins), [(set f16:$Rd, (fpimm0))]>,
- Sched<[WriteF]>;
+ Sched<[WriteF]>, Requires<[HasFullFP16]>;
def FMOVS0 : Pseudo<(outs FPR32:$Rd), (ins), [(set f32:$Rd, (fpimm0))]>,
Sched<[WriteF]>;
def FMOVD0 : Pseudo<(outs FPR64:$Rd), (ins), [(set f64:$Rd, (fpimm0))]>,
@@ -3073,6 +3205,14 @@ defm CMHI : SIMDThreeSameVector<1, 0b00110, "cmhi", AArch64cmhi>;
defm CMHS : SIMDThreeSameVector<1, 0b00111, "cmhs", AArch64cmhs>;
defm CMTST : SIMDThreeSameVector<0, 0b10001, "cmtst", AArch64cmtst>;
defm FABD : SIMDThreeSameVectorFP<1,1,0b010,"fabd", int_aarch64_neon_fabd>;
+let Predicates = [HasNEON] in {
+foreach VT = [ v2f32, v4f32, v2f64 ] in
+def : Pat<(fabs (fsub VT:$Rn, VT:$Rm)), (!cast<Instruction>("FABD"#VT) VT:$Rn, VT:$Rm)>;
+}
+let Predicates = [HasNEON, HasFullFP16] in {
+foreach VT = [ v4f16, v8f16 ] in
+def : Pat<(fabs (fsub VT:$Rn, VT:$Rm)), (!cast<Instruction>("FABD"#VT) VT:$Rn, VT:$Rm)>;
+}
defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b101,"facge",int_aarch64_neon_facge>;
defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b101,"facgt",int_aarch64_neon_facgt>;
defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_addp>;
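
The FABD patterns added just above fold an explicit absolute-difference expression into a single instruction. As a rough standalone sketch (not LLVM code; the helper name is made up), this is the scalar computation those patterns cover:

// Standalone sketch (not LLVM code): the expression the new FABD patterns
// match. fabs(a - b) is the absolute difference, which FABD does in one
// instruction instead of an FSUB followed by an FABS.
#include <cmath>
#include <cstdio>

static float fabd_scalar(float a, float b) {
  return std::fabs(a - b); // candidate for a single FABD Sd, Sn, Sm
}

int main() {
  std::printf("%f\n", fabd_scalar(1.5f, 4.25f)); // prints 2.750000
  return 0;
}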
@@ -3382,6 +3522,11 @@ defm CMTST : SIMDThreeScalarD<0, 0b10001, "cmtst", AArch64cmtst>;
defm FABD : SIMDFPThreeScalar<1, 1, 0b010, "fabd", int_aarch64_sisd_fabd>;
def : Pat<(v1f64 (int_aarch64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
(FABD64 FPR64:$Rn, FPR64:$Rm)>;
+let Predicates = [HasFullFP16] in {
+def : Pat<(fabs (fsub f16:$Rn, f16:$Rm)), (FABD16 f16:$Rn, f16:$Rm)>;
+}
+def : Pat<(fabs (fsub f32:$Rn, f32:$Rm)), (FABD32 f32:$Rn, f32:$Rm)>;
+def : Pat<(fabs (fsub f64:$Rn, f64:$Rm)), (FABD64 f64:$Rn, f64:$Rm)>;
defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b101, "facge",
int_aarch64_neon_facge>;
defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt",
@@ -3526,6 +3671,8 @@ def : Pat<(v1i64 (int_aarch64_neon_fcvtps (v1f64 FPR64:$Rn))),
def : Pat<(v1i64 (int_aarch64_neon_fcvtpu (v1f64 FPR64:$Rn))),
(FCVTPUv1i64 FPR64:$Rn)>;
+def : Pat<(f16 (int_aarch64_neon_frecpe (f16 FPR16:$Rn))),
+ (FRECPEv1f16 FPR16:$Rn)>;
def : Pat<(f32 (int_aarch64_neon_frecpe (f32 FPR32:$Rn))),
(FRECPEv1i32 FPR32:$Rn)>;
def : Pat<(f64 (int_aarch64_neon_frecpe (f64 FPR64:$Rn))),
@@ -3557,11 +3704,15 @@ def : Pat<(f64 (AArch64frecps (f64 FPR64:$Rn), (f64 FPR64:$Rm))),
def : Pat<(v2f64 (AArch64frecps (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
(FRECPSv2f64 FPR128:$Rn, FPR128:$Rm)>;
+def : Pat<(f16 (int_aarch64_neon_frecpx (f16 FPR16:$Rn))),
+ (FRECPXv1f16 FPR16:$Rn)>;
def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))),
(FRECPXv1i32 FPR32:$Rn)>;
def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))),
(FRECPXv1i64 FPR64:$Rn)>;
+def : Pat<(f16 (int_aarch64_neon_frsqrte (f16 FPR16:$Rn))),
+ (FRSQRTEv1f16 FPR16:$Rn)>;
def : Pat<(f32 (int_aarch64_neon_frsqrte (f32 FPR32:$Rn))),
(FRSQRTEv1i32 FPR32:$Rn)>;
def : Pat<(f64 (int_aarch64_neon_frsqrte (f64 FPR64:$Rn))),
@@ -3744,6 +3895,25 @@ defm : Neon_mul_widen_patterns<AArch64smull, SMULLv8i8_v8i16,
defm : Neon_mul_widen_patterns<AArch64umull, UMULLv8i8_v8i16,
UMULLv4i16_v4i32, UMULLv2i32_v2i64>;
+// Patterns for smull2/umull2.
+multiclass Neon_mul_high_patterns<SDPatternOperator opnode,
+ Instruction INST8B, Instruction INST4H, Instruction INST2S> {
+ def : Pat<(v8i16 (opnode (extract_high_v16i8 V128:$Rn),
+ (extract_high_v16i8 V128:$Rm))),
+ (INST8B V128:$Rn, V128:$Rm)>;
+ def : Pat<(v4i32 (opnode (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16 V128:$Rm))),
+ (INST4H V128:$Rn, V128:$Rm)>;
+ def : Pat<(v2i64 (opnode (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32 V128:$Rm))),
+ (INST2S V128:$Rn, V128:$Rm)>;
+}
+
+defm : Neon_mul_high_patterns<AArch64smull, SMULLv16i8_v8i16,
+ SMULLv8i16_v4i32, SMULLv4i32_v2i64>;
+defm : Neon_mul_high_patterns<AArch64umull, UMULLv16i8_v8i16,
+ UMULLv8i16_v4i32, UMULLv4i32_v2i64>;
+
// Additional patterns for SMLAL/SMLSL and UMLAL/UMLSL
multiclass Neon_mulacc_widen_patterns<SDPatternOperator opnode,
Instruction INST8B, Instruction INST4H, Instruction INST2S> {
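
The Neon_mul_high_patterns multiclass above selects SMULL2/UMULL2 when both multiplicands are the high halves of 128-bit vectors. A minimal standalone sketch of the arithmetic for the v16i8-to-v8i16 case follows (not LLVM code; the function name is made up for illustration):

// Standalone sketch (not LLVM code) of what SMULL2.8H computes: multiply
// the high eight lanes of two 16-byte vectors, widening each product to
// 16 bits.
#include <cstdint>
#include <cstdio>

static void smull2_8h(const int8_t a[16], const int8_t b[16], int16_t r[8]) {
  for (int i = 0; i < 8; ++i)
    r[i] = static_cast<int16_t>(a[8 + i]) * static_cast<int16_t>(b[8 + i]);
}

int main() {
  int8_t a[16] = {0}, b[16] = {0};
  a[8] = -3; b[8] = 100;      // lane 0 of the high half
  int16_t r[8];
  smull2_8h(a, b, r);
  std::printf("%d\n", r[0]);  // prints -300
  return 0;
}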
@@ -4103,12 +4273,18 @@ def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)),
(SUBREG_TO_REG (i32 0),
(f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
+def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))),
+ (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))),
+ (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+
def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))),
(v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
(i32 FPR32:$Rn), ssub))>;
def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))),
(v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
(i32 FPR32:$Rn), ssub))>;
+
def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))),
(v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
(i64 FPR64:$Rn), dsub))>;
@@ -4122,6 +4298,7 @@ def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))),
(INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))),
(INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
+
def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))),
(INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>;
@@ -4592,10 +4769,8 @@ def : Pat<(v4i32 immAllOnesV), (MOVIv2d_ns (i32 255))>;
def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>;
def : Pat<(v16i8 immAllOnesV), (MOVIv2d_ns (i32 255))>;
-def : Pat<(v2f64 (AArch64dup (f64 fpimm0))), (MOVIv2d_ns (i32 0))>;
-def : Pat<(v4f32 (AArch64dup (f32 fpimm0))), (MOVIv2d_ns (i32 0))>;
-
// EDIT per word & halfword: 2s, 4h, 4s, & 8h
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
defm MOVI : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">;
def : InstAlias<"movi $Vd.4h, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
@@ -4617,6 +4792,7 @@ def : Pat<(v4i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
def : Pat<(v8i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
(MOVIv8i16 imm0_255:$imm8, imm:$shift)>;
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
// EDIT per word: 2s & 4s with MSL shifter
def MOVIv2s_msl : SIMDModifiedImmMoveMSL<0, 0, {1,1,0,?}, V64, "movi", ".2s",
[(set (v2i32 V64:$Rd),
@@ -4629,13 +4805,16 @@ def MOVIv4s_msl : SIMDModifiedImmMoveMSL<1, 0, {1,1,0,?}, V128, "movi", ".4s",
def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1110, V64, imm0_255,
"movi", ".8b",
[(set (v8i8 V64:$Rd), (AArch64movi imm0_255:$imm8))]>;
+
def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1110, V128, imm0_255,
"movi", ".16b",
[(set (v16i8 V128:$Rd), (AArch64movi imm0_255:$imm8))]>;
+}
// AdvSIMD MVNI
// EDIT per word & halfword: 2s, 4h, 4s, & 8h
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
defm MVNI : SIMDModifiedImmVectorShift<1, 0b10, 0b00, "mvni">;
def : InstAlias<"mvni $Vd.4h, $imm", (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
@@ -4658,12 +4837,14 @@ def : Pat<(v8i16 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
(MVNIv8i16 imm0_255:$imm8, imm:$shift)>;
// EDIT per word: 2s & 4s with MSL shifter
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
def MVNIv2s_msl : SIMDModifiedImmMoveMSL<0, 1, {1,1,0,?}, V64, "mvni", ".2s",
[(set (v2i32 V64:$Rd),
(AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s",
[(set (v4i32 V128:$Rd),
(AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
+}
//----------------------------------------------------------------------------
// AdvSIMD indexed element
@@ -4850,20 +5031,55 @@ def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxs (v1f64 FPR64:$Rn),
def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxu (v1f64 FPR64:$Rn),
vecshiftR64:$imm)),
(FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>;
-def : Pat<(int_aarch64_neon_vcvtfxs2fp FPR32:$Rn, vecshiftR32:$imm),
- (SCVTFs FPR32:$Rn, vecshiftR32:$imm)>;
def : Pat<(int_aarch64_neon_vcvtfxu2fp FPR32:$Rn, vecshiftR32:$imm),
(UCVTFs FPR32:$Rn, vecshiftR32:$imm)>;
-def : Pat<(f64 (int_aarch64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR64:$imm)),
- (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
def : Pat<(f64 (int_aarch64_neon_vcvtfxu2fp (i64 FPR64:$Rn), vecshiftR64:$imm)),
(UCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
def : Pat<(v1f64 (int_aarch64_neon_vcvtfxs2fp (v1i64 FPR64:$Rn),
vecshiftR64:$imm)),
(SCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(f64 (int_aarch64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR64:$imm)),
+ (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
def : Pat<(v1f64 (int_aarch64_neon_vcvtfxu2fp (v1i64 FPR64:$Rn),
vecshiftR64:$imm)),
(UCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(int_aarch64_neon_vcvtfxs2fp FPR32:$Rn, vecshiftR32:$imm),
+ (SCVTFs FPR32:$Rn, vecshiftR32:$imm)>;
+
+// Patterns for FP16 intrinsics - require a reg copy to/from, as i16s are not supported.
+
+def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i32 (sext_inreg FPR32:$Rn, i16)), vecshiftR16:$imm)),
+ (SCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>;
+def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i32 FPR32:$Rn), vecshiftR16:$imm)),
+ (SCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>;
+def : Pat<(f16 (int_aarch64_neon_vcvtfxu2fp
+ (and FPR32:$Rn, (i32 65535)),
+ vecshiftR16:$imm)),
+ (UCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>;
+def : Pat<(f16 (int_aarch64_neon_vcvtfxu2fp FPR32:$Rn, vecshiftR16:$imm)),
+ (UCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>;
+def : Pat<(f16 (int_aarch64_neon_vcvtfxu2fp (i64 FPR64:$Rn), vecshiftR16:$imm)),
+ (UCVTFh (EXTRACT_SUBREG FPR64:$Rn, hsub), vecshiftR16:$imm)>;
+def : Pat<(i32 (int_aarch64_neon_vcvtfp2fxs (f16 FPR16:$Rn), vecshiftR32:$imm)),
+ (i32 (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)),
+ (FCVTZSh FPR16:$Rn, vecshiftR32:$imm),
+ hsub))>;
+def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxs (f16 FPR16:$Rn), vecshiftR64:$imm)),
+ (i64 (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)),
+ (FCVTZSh FPR16:$Rn, vecshiftR64:$imm),
+ hsub))>;
+def : Pat<(i32 (int_aarch64_neon_vcvtfp2fxu (f16 FPR16:$Rn), vecshiftR32:$imm)),
+ (i32 (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)),
+ (FCVTZUh FPR16:$Rn, vecshiftR32:$imm),
+ hsub))>;
+def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxu (f16 FPR16:$Rn), vecshiftR64:$imm)),
+ (i64 (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)),
+ (FCVTZUh FPR16:$Rn, vecshiftR64:$imm),
+ hsub))>;
defm SHL : SIMDScalarLShiftD< 0, 0b01010, "shl", AArch64vshl>;
defm SLI : SIMDScalarLShiftDTied<1, 0b01010, "sli">;
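
The patterns just above route the half-precision fixed-point conversion intrinsics onto the FP16 SCVTF/UCVTF/FCVTZS/FCVTZU variants via the h sub-register. A standalone sketch of the fixed-point semantics assumed here (not LLVM code; helper names are made up): converting with #fbits fractional bits divides by 2^fbits, and the reverse multiplies by 2^fbits and truncates toward zero.

// Standalone sketch (not LLVM code) of fixed-point <-> FP conversion with
// an fbits immediate: scvtf divides by 2^fbits, fcvtzs multiplies by
// 2^fbits and truncates toward zero.
#include <cmath>
#include <cstdint>
#include <cstdio>

static float scvtf_fixed(int32_t v, unsigned fbits) {
  return static_cast<float>(v) / static_cast<float>(1u << fbits);
}

static int32_t fcvtzs_fixed(float f, unsigned fbits) {
  return static_cast<int32_t>(std::trunc(f * static_cast<float>(1u << fbits)));
}

int main() {
  std::printf("%f\n", scvtf_fixed(3, 1));     // prints 1.500000
  std::printf("%d\n", fcvtzs_fixed(1.5f, 1)); // prints 3
  return 0;
}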
@@ -5425,10 +5641,12 @@ defm ST4 : SIMDLdSt4SingleAliases<"st4">;
// Crypto extensions
//----------------------------------------------------------------------------
+let Predicates = [HasAES] in {
def AESErr : AESTiedInst<0b0100, "aese", int_aarch64_crypto_aese>;
def AESDrr : AESTiedInst<0b0101, "aesd", int_aarch64_crypto_aesd>;
def AESMCrr : AESInst< 0b0110, "aesmc", int_aarch64_crypto_aesmc>;
def AESIMCrr : AESInst< 0b0111, "aesimc", int_aarch64_crypto_aesimc>;
+}
// Pseudo instructions for AESMCrr/AESIMCrr with a register constraint required
// for AES fusion on some CPUs.
@@ -5455,6 +5673,7 @@ def : Pat<(v16i8 (int_aarch64_crypto_aesimc
(v16i8 V128:$src2)))))>,
Requires<[HasFuseAES]>;
+let Predicates = [HasSHA2] in {
def SHA1Crrr : SHATiedInstQSV<0b000, "sha1c", int_aarch64_crypto_sha1c>;
def SHA1Prrr : SHATiedInstQSV<0b001, "sha1p", int_aarch64_crypto_sha1p>;
def SHA1Mrrr : SHATiedInstQSV<0b010, "sha1m", int_aarch64_crypto_sha1m>;
@@ -5466,6 +5685,7 @@ def SHA256SU1rrr :SHATiedInstVVV<0b110, "sha256su1",int_aarch64_crypto_sha256su1
def SHA1Hrr : SHAInstSS< 0b0000, "sha1h", int_aarch64_crypto_sha1h>;
def SHA1SU1rr : SHATiedInstVV<0b0001, "sha1su1", int_aarch64_crypto_sha1su1>;
def SHA256SU0rr : SHATiedInstVV<0b0010, "sha256su0",int_aarch64_crypto_sha256su0>;
+}
//----------------------------------------------------------------------------
// Compiler-pseudos
@@ -5614,6 +5834,7 @@ def : Pat<(v8i8 (AArch64NvCast (v8i8 FPR64:$src))), (v8i8 FPR64:$src)>;
def : Pat<(v4i16 (AArch64NvCast (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4f16 (AArch64NvCast (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>;
def : Pat<(v2i32 (AArch64NvCast (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2f32 (AArch64NvCast (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>;
def : Pat<(v1i64 (AArch64NvCast (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>;
def : Pat<(v8i8 (AArch64NvCast (f64 FPR64:$src))), (v8i8 FPR64:$src)>;
@@ -5785,7 +6006,7 @@ def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))),
def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))),
(v2i32 (REV64v2i32 FPR64:$src))>;
def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))),
- (v2i32 (REV64v4i16 FPR64:$src))>;
+ (v2i32 (REV32v4i16 FPR64:$src))>;
}
def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>;
@@ -5794,7 +6015,6 @@ def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 FPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), (v4i16 FPR64:$src)>;
}
@@ -5807,18 +6027,16 @@ def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))),
(v4i16 (REV16v8i8 FPR64:$src))>;
def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))),
(v4i16 (REV64v4i16 FPR64:$src))>;
-def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))),
- (v4i16 (REV32v4i16 FPR64:$src))>;
def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))),
(v4i16 (REV32v4i16 FPR64:$src))>;
def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))),
(v4i16 (REV64v4i16 FPR64:$src))>;
}
+def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))), (v4i16 FPR64:$src)>;
let Predicates = [IsLE] in {
def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))), (v4f16 FPR64:$src)>;
def : Pat<(v4f16 (bitconvert (v2i32 FPR64:$src))), (v4f16 FPR64:$src)>;
-def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>;
def : Pat<(v4f16 (bitconvert (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>;
def : Pat<(v4f16 (bitconvert (f64 FPR64:$src))), (v4f16 FPR64:$src)>;
def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))), (v4f16 FPR64:$src)>;
@@ -5828,20 +6046,17 @@ let Predicates = [IsBE] in {
def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))),
(v4f16 (REV64v4i16 FPR64:$src))>;
def : Pat<(v4f16 (bitconvert (v2i32 FPR64:$src))),
- (v4f16 (REV64v4i16 FPR64:$src))>;
-def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))),
- (v4f16 (REV64v4i16 FPR64:$src))>;
+ (v4f16 (REV32v4i16 FPR64:$src))>;
def : Pat<(v4f16 (bitconvert (v8i8 FPR64:$src))),
(v4f16 (REV16v8i8 FPR64:$src))>;
def : Pat<(v4f16 (bitconvert (f64 FPR64:$src))),
(v4f16 (REV64v4i16 FPR64:$src))>;
def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))),
- (v4f16 (REV64v4i16 FPR64:$src))>;
+ (v4f16 (REV32v4i16 FPR64:$src))>;
def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))),
(v4f16 (REV64v4i16 FPR64:$src))>;
}
-
-
+def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>;
let Predicates = [IsLE] in {
def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>;
@@ -5933,7 +6148,7 @@ def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))),
def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))),
(v2f32 (REV64v2i32 FPR64:$src))>;
def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))),
- (v2f32 (REV64v4i16 FPR64:$src))>;
+ (v2f32 (REV32v4i16 FPR64:$src))>;
}
def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;
@@ -6076,7 +6291,6 @@ def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>;
def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>;
def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>;
def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))), (v8i16 FPR128:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))),
@@ -6093,15 +6307,13 @@ def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))),
(v8i16 (REV64v8i16 FPR128:$src))>;
def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))),
(v8i16 (REV32v8i16 FPR128:$src))>;
-def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))),
- (v8i16 (REV32v8i16 FPR128:$src))>;
}
+def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))), (v8i16 FPR128:$src)>;
let Predicates = [IsLE] in {
def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))), (v8f16 FPR128:$src)>;
def : Pat<(v8f16 (bitconvert (v2i64 FPR128:$src))), (v8f16 FPR128:$src)>;
def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>;
-def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>;
def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>;
def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>;
def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>;
@@ -6115,8 +6327,6 @@ def : Pat<(v8f16 (bitconvert (v2i64 FPR128:$src))),
(v8f16 (REV64v8i16 FPR128:$src))>;
def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))),
(v8f16 (REV32v8i16 FPR128:$src))>;
-def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))),
- (v8f16 (REV64v8i16 FPR128:$src))>;
def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))),
(v8f16 (REV16v16i8 FPR128:$src))>;
def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))),
@@ -6124,6 +6334,7 @@ def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))),
def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))),
(v8f16 (REV32v8i16 FPR128:$src))>;
}
+def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>;
let Predicates = [IsLE] in {
def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>;
@@ -6179,20 +6390,25 @@ def : Pat<(v1i64 (extract_subvector (v2i64 FPR128:$Rn), (i64 1))),
// A 64-bit subvector insert to the first 128-bit vector position
// is a subregister copy that needs no instruction.
-def : Pat<(insert_subvector undef, (v1i64 FPR64:$src), (i32 0)),
- (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
-def : Pat<(insert_subvector undef, (v1f64 FPR64:$src), (i32 0)),
- (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
-def : Pat<(insert_subvector undef, (v2i32 FPR64:$src), (i32 0)),
- (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
-def : Pat<(insert_subvector undef, (v2f32 FPR64:$src), (i32 0)),
- (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
-def : Pat<(insert_subvector undef, (v4i16 FPR64:$src), (i32 0)),
- (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
-def : Pat<(insert_subvector undef, (v4f16 FPR64:$src), (i32 0)),
- (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
-def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (i32 0)),
- (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+multiclass InsertSubvectorUndef<ValueType Ty> {
+ def : Pat<(insert_subvector undef, (v1i64 FPR64:$src), (Ty 0)),
+ (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+ def : Pat<(insert_subvector undef, (v1f64 FPR64:$src), (Ty 0)),
+ (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+ def : Pat<(insert_subvector undef, (v2i32 FPR64:$src), (Ty 0)),
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+ def : Pat<(insert_subvector undef, (v2f32 FPR64:$src), (Ty 0)),
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+ def : Pat<(insert_subvector undef, (v4i16 FPR64:$src), (Ty 0)),
+ (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+ def : Pat<(insert_subvector undef, (v4f16 FPR64:$src), (Ty 0)),
+ (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+ def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (Ty 0)),
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+}
+
+defm : InsertSubvectorUndef<i32>;
+defm : InsertSubvectorUndef<i64>;
// Use pair-wise add instructions when summing up the lanes for v2f64, v2i64
// or v2f32.
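
The lane-0 store multiclasses added earlier in this file (VecROStoreLane0Pat, VecStoreLane0Pat, VecStoreULane0Pat) all rest on the same observation: storing element 0 of a vector writes the same bytes as storing the scalar held in the low sub-register, so the scalar STRH/STRS/STRD forms can be reused on an EXTRACT_SUBREG. A standalone sketch of that equivalence (not LLVM code):

// Standalone sketch (not LLVM code): storing lane 0 of a v4f32 and storing
// the scalar in its s sub-register produce identical memory contents.
#include <cstdint>
#include <cstring>
#include <cstdio>

int main() {
  float v[4] = {3.25f, 1.0f, 2.0f, 4.0f}; // stand-in for a v4f32 register
  uint8_t viaLane[4], viaScalar[4];

  std::memcpy(viaLane, &v[0], 4);   // like "st1 {v0.s}[0], [x0]"
  float s = v[0];                   // value in the s sub-register
  std::memcpy(viaScalar, &s, 4);    // like "str s0, [x0]"

  std::printf("%d\n", std::memcmp(viaLane, viaScalar, 4) == 0); // prints 1
  return 0;
}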
diff --git a/lib/Target/AArch64/AArch64InstructionSelector.cpp b/lib/Target/AArch64/AArch64InstructionSelector.cpp
index c2d3ae31c624..4d7ca2349ed1 100644
--- a/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -92,6 +92,8 @@ private:
return selectAddrModeIndexed(Root, Width / 8);
}
+ void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI) const;
+
const AArch64TargetMachine &TM;
const AArch64Subtarget &STI;
const AArch64InstrInfo &TII;
@@ -133,16 +135,21 @@ AArch64InstructionSelector::AArch64InstructionSelector(
// for each class in the bank.
static const TargetRegisterClass *
getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
- const RegisterBankInfo &RBI) {
+ const RegisterBankInfo &RBI,
+ bool GetAllRegSet = false) {
if (RB.getID() == AArch64::GPRRegBankID) {
if (Ty.getSizeInBits() <= 32)
- return &AArch64::GPR32RegClass;
+ return GetAllRegSet ? &AArch64::GPR32allRegClass
+ : &AArch64::GPR32RegClass;
if (Ty.getSizeInBits() == 64)
- return &AArch64::GPR64RegClass;
+ return GetAllRegSet ? &AArch64::GPR64allRegClass
+ : &AArch64::GPR64RegClass;
return nullptr;
}
if (RB.getID() == AArch64::FPRRegBankID) {
+ if (Ty.getSizeInBits() <= 16)
+ return &AArch64::FPR16RegClass;
if (Ty.getSizeInBits() == 32)
return &AArch64::FPR32RegClass;
if (Ty.getSizeInBits() == 64)
@@ -167,7 +174,7 @@ static bool unsupportedBinOp(const MachineInstr &I,
const AArch64RegisterInfo &TRI) {
LLT Ty = MRI.getType(I.getOperand(0).getReg());
if (!Ty.isValid()) {
- DEBUG(dbgs() << "Generic binop register should be typed\n");
+ LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n");
return true;
}
@@ -175,7 +182,7 @@ static bool unsupportedBinOp(const MachineInstr &I,
for (auto &MO : I.operands()) {
// FIXME: Support non-register operands.
if (!MO.isReg()) {
- DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n");
+ LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n");
return true;
}
@@ -184,18 +191,18 @@ static bool unsupportedBinOp(const MachineInstr &I,
// bank out of the minimal class for the register.
// Either way, this needs to be documented (and possibly verified).
if (!TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
- DEBUG(dbgs() << "Generic inst has physical register operand\n");
+ LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n");
return true;
}
const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI);
if (!OpBank) {
- DEBUG(dbgs() << "Generic register has no bank or class\n");
+ LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n");
return true;
}
if (PrevOpBank && OpBank != PrevOpBank) {
- DEBUG(dbgs() << "Generic inst operands have different banks\n");
+ LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n");
return true;
}
PrevOpBank = OpBank;
@@ -310,19 +317,46 @@ static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
return GenericOpc;
}
+static bool selectFP16CopyFromGPR32(MachineInstr &I, const TargetInstrInfo &TII,
+ MachineRegisterInfo &MRI, unsigned SrcReg) {
+ // Copies from gpr32 to fpr16 need to use a sub-register copy.
+ unsigned CopyReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::COPY))
+ .addDef(CopyReg)
+ .addUse(SrcReg);
+ unsigned SubRegCopy = MRI.createVirtualRegister(&AArch64::FPR16RegClass);
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::COPY))
+ .addDef(SubRegCopy)
+ .addUse(CopyReg, 0, AArch64::hsub);
+
+ MachineOperand &RegOp = I.getOperand(1);
+ RegOp.setReg(SubRegCopy);
+ return true;
+}
+
static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
const RegisterBankInfo &RBI) {
unsigned DstReg = I.getOperand(0).getReg();
+ unsigned SrcReg = I.getOperand(1).getReg();
+
if (TargetRegisterInfo::isPhysicalRegister(DstReg)) {
+ if (TRI.getRegClass(AArch64::FPR16RegClassID)->contains(DstReg) &&
+ !TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
+ const RegisterBank &RegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
+ const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(
+ MRI.getType(SrcReg), RegBank, RBI, /* GetAllRegSet */ true);
+ if (SrcRC == &AArch64::GPR32allRegClass)
+ return selectFP16CopyFromGPR32(I, TII, MRI, SrcReg);
+ }
assert(I.isCopy() && "Generic operators do not allow physical registers");
return true;
}
const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI);
const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
- unsigned SrcReg = I.getOperand(1).getReg();
+ (void)DstSize;
const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
(void)SrcSize;
assert((!TargetRegisterInfo::isPhysicalRegister(SrcReg) || I.isCopy()) &&
@@ -340,34 +374,46 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
"Copy with different width?!");
assert((DstSize <= 64 || RegBank.getID() == AArch64::FPRRegBankID) &&
"GPRs cannot get more than 64-bit width values");
- const TargetRegisterClass *RC = nullptr;
-
- if (RegBank.getID() == AArch64::FPRRegBankID) {
- if (DstSize <= 16)
- RC = &AArch64::FPR16RegClass;
- else if (DstSize <= 32)
- RC = &AArch64::FPR32RegClass;
- else if (DstSize <= 64)
- RC = &AArch64::FPR64RegClass;
- else if (DstSize <= 128)
- RC = &AArch64::FPR128RegClass;
- else {
- DEBUG(dbgs() << "Unexpected bitcast size " << DstSize << '\n');
- return false;
+
+ const TargetRegisterClass *RC = getRegClassForTypeOnBank(
+ MRI.getType(DstReg), RegBank, RBI, /* GetAllRegSet */ true);
+ if (!RC) {
+ LLVM_DEBUG(dbgs() << "Unexpected bitcast size " << DstSize << '\n');
+ return false;
+ }
+
+ if (!TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
+ const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(SrcReg);
+ const TargetRegisterClass *SrcRC =
+ RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
+ const RegisterBank *RB = nullptr;
+ if (!SrcRC) {
+ RB = RegClassOrBank.get<const RegisterBank *>();
+ SrcRC = getRegClassForTypeOnBank(MRI.getType(SrcReg), *RB, RBI, true);
+ }
+ // Copies from fpr16 to gpr32 need to use SUBREG_TO_REG.
+ if (RC == &AArch64::GPR32allRegClass && SrcRC == &AArch64::FPR16RegClass) {
+ unsigned PromoteReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
+ BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(AArch64::SUBREG_TO_REG))
+ .addDef(PromoteReg)
+ .addImm(0)
+ .addUse(SrcReg)
+ .addImm(AArch64::hsub);
+ MachineOperand &RegOp = I.getOperand(1);
+ RegOp.setReg(PromoteReg);
+ } else if (RC == &AArch64::FPR16RegClass &&
+ SrcRC == &AArch64::GPR32allRegClass) {
+ selectFP16CopyFromGPR32(I, TII, MRI, SrcReg);
}
- } else {
- assert(RegBank.getID() == AArch64::GPRRegBankID &&
- "Bitcast for the flags?");
- RC =
- DstSize <= 32 ? &AArch64::GPR32allRegClass : &AArch64::GPR64allRegClass;
}
// No need to constrain SrcReg. It will get constrained when
// we hit another of its use or its defs.
// Copies do not have constraints.
if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) {
- DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
- << " operand\n");
+ LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+ << " operand\n");
return false;
}
I.setDesc(TII.get(AArch64::COPY));
@@ -568,11 +614,11 @@ bool AArch64InstructionSelector::selectCompareBranch(
else
return false;
- auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(CBOpc))
- .addUse(LHS)
- .addMBB(DestMBB);
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(CBOpc))
+ .addUse(LHS)
+ .addMBB(DestMBB)
+ .constrainAllUses(TII, TRI, RBI);
- constrainSelectedInstRegOperands(*MIB.getInstr(), TII, TRI, RBI);
I.eraseFromParent();
return true;
}
@@ -640,13 +686,13 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
DefRC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
if (!DefRC) {
if (!DefTy.isValid()) {
- DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
+ LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
return false;
}
const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
DefRC = getRegClassForTypeOnBank(DefTy, RB, RBI);
if (!DefRC) {
- DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
+ LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
return false;
}
}
@@ -664,7 +710,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
if (I.getNumOperands() != I.getNumExplicitOperands()) {
- DEBUG(dbgs() << "Generic instruction has unexpected implicit operands\n");
+ LLVM_DEBUG(
+ dbgs() << "Generic instruction has unexpected implicit operands\n");
return false;
}
@@ -680,8 +727,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
// We shouldn't need this on AArch64, but it would be implemented as an
// EXTRACT_SUBREG followed by a TBNZW because TBNZX has no encoding if the
// bit being tested is < 32.
- DEBUG(dbgs() << "G_BRCOND has type: " << Ty
- << ", expected at most 32-bits");
+ LLVM_DEBUG(dbgs() << "G_BRCOND has type: " << Ty
+ << ", expected at most 32-bits");
return false;
}
@@ -721,15 +768,16 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
// FIXME: Redundant check, but even less readable when factored out.
if (isFP) {
if (Ty != s32 && Ty != s64) {
- DEBUG(dbgs() << "Unable to materialize FP " << Ty
- << " constant, expected: " << s32 << " or " << s64
- << '\n');
+ LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
+ << " constant, expected: " << s32 << " or " << s64
+ << '\n');
return false;
}
if (RB.getID() != AArch64::FPRRegBankID) {
- DEBUG(dbgs() << "Unable to materialize FP " << Ty
- << " constant on bank: " << RB << ", expected: FPR\n");
+ LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
+ << " constant on bank: " << RB
+ << ", expected: FPR\n");
return false;
}
@@ -740,15 +788,16 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
} else {
// s32 and s64 are covered by tablegen.
if (Ty != p0) {
- DEBUG(dbgs() << "Unable to materialize integer " << Ty
- << " constant, expected: " << s32 << ", " << s64 << ", or "
- << p0 << '\n');
+ LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
+ << " constant, expected: " << s32 << ", " << s64
+ << ", or " << p0 << '\n');
return false;
}
if (RB.getID() != AArch64::GPRRegBankID) {
- DEBUG(dbgs() << "Unable to materialize integer " << Ty
- << " constant on bank: " << RB << ", expected: GPR\n");
+ LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
+ << " constant on bank: " << RB
+ << ", expected: GPR\n");
return false;
}
}
@@ -774,7 +823,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
.addUse(DefGPRReg);
if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) {
- DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n");
+ LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n");
return false;
}
@@ -795,15 +844,24 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
}
case TargetOpcode::G_EXTRACT: {
LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
+ LLT DstTy = MRI.getType(I.getOperand(0).getReg());
+ (void)DstTy;
+ unsigned SrcSize = SrcTy.getSizeInBits();
// Larger extracts are vectors, same-size extracts should be something else
// by now (either split up or simplified to a COPY).
if (SrcTy.getSizeInBits() > 64 || Ty.getSizeInBits() > 32)
return false;
- I.setDesc(TII.get(AArch64::UBFMXri));
+ I.setDesc(TII.get(SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri));
MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() +
Ty.getSizeInBits() - 1);
+ if (SrcSize < 64) {
+ assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 &&
+ "unexpected G_EXTRACT types");
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ }
+
unsigned DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
BuildMI(MBB, std::next(I.getIterator()), I.getDebugLoc(),
TII.get(AArch64::COPY))
@@ -818,17 +876,25 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
case TargetOpcode::G_INSERT: {
LLT SrcTy = MRI.getType(I.getOperand(2).getReg());
+ LLT DstTy = MRI.getType(I.getOperand(0).getReg());
+ unsigned DstSize = DstTy.getSizeInBits();
// Larger inserts are vectors, same-size ones should be something else by
// now (split up or turned into COPYs).
if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32)
return false;
- I.setDesc(TII.get(AArch64::BFMXri));
+ I.setDesc(TII.get(DstSize == 64 ? AArch64::BFMXri : AArch64::BFMWri));
unsigned LSB = I.getOperand(3).getImm();
unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits();
- I.getOperand(3).setImm((64 - LSB) % 64);
+ I.getOperand(3).setImm((DstSize - LSB) % DstSize);
MachineInstrBuilder(MF, I).addImm(Width - 1);
+ if (DstSize < 64) {
+ assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 &&
+ "unexpected G_INSERT types");
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ }
+
unsigned SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
BuildMI(MBB, I.getIterator(), I.getDebugLoc(),
TII.get(AArch64::SUBREG_TO_REG))
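
The G_EXTRACT/G_INSERT selection above now chooses the W form (UBFMWri/BFMWri) for 32-bit destinations and derives the immediates from the offset and width: for the extract, the second immediate is offset + width - 1; for the insert, the first immediate becomes (DstSize - LSB) % DstSize. A standalone sketch of the extract arithmetic this relies on (not LLVM code; the helper name is made up), assuming UBFM with immr = lsb and imms = lsb + width - 1 moves bits [lsb+width-1 : lsb] into the low bits:

// Standalone sketch (not LLVM code): an unsigned bitfield extract of
// `width` bits starting at `lsb`, i.e. UBFX Wd, Wn, #lsb, #width, which is
// UBFMWri with immr = lsb and imms = lsb + width - 1.
#include <cstdint>
#include <cstdio>

static uint32_t ubfx32(uint32_t src, unsigned lsb, unsigned width) {
  uint32_t mask = (width >= 32) ? ~0u : ((1u << width) - 1);
  return (src >> lsb) & mask;
}

int main() {
  // Extract a 16-bit field starting at bit 8 of 0x00ABCD12 -> 0xABCD.
  std::printf("0x%X\n", ubfx32(0x00ABCD12u, 8, 16));
  return 0;
}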
@@ -845,8 +911,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
case TargetOpcode::G_FRAME_INDEX: {
// allocas and G_FRAME_INDEX are only supported in addrspace(0).
if (Ty != LLT::pointer(0, 64)) {
- DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty
- << ", expected: " << LLT::pointer(0, 64) << '\n');
+ LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty
+ << ", expected: " << LLT::pointer(0, 64) << '\n');
return false;
}
I.setDesc(TII.get(AArch64::ADDXri));
@@ -868,6 +934,40 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
if (OpFlags & AArch64II::MO_GOT) {
I.setDesc(TII.get(AArch64::LOADgot));
I.getOperand(1).setTargetFlags(OpFlags);
+ } else if (TM.getCodeModel() == CodeModel::Large) {
+ // Materialize the global using movz/movk instructions.
+ unsigned MovZDstReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ auto InsertPt = std::next(I.getIterator());
+ auto MovZ =
+ BuildMI(MBB, InsertPt, I.getDebugLoc(), TII.get(AArch64::MOVZXi))
+ .addDef(MovZDstReg);
+ MovZ->addOperand(MF, I.getOperand(1));
+ MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 |
+ AArch64II::MO_NC);
+ MovZ->addOperand(MF, MachineOperand::CreateImm(0));
+ constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI);
+
+ auto BuildMovK = [&](unsigned SrcReg, unsigned char Flags,
+ unsigned Offset, unsigned ForceDstReg) {
+ unsigned DstReg =
+ ForceDstReg ? ForceDstReg
+ : MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ auto MovI = BuildMI(MBB, InsertPt, MovZ->getDebugLoc(),
+ TII.get(AArch64::MOVKXi))
+ .addDef(DstReg)
+ .addReg(SrcReg);
+ MovI->addOperand(MF, MachineOperand::CreateGA(
+ GV, MovZ->getOperand(1).getOffset(), Flags));
+ MovI->addOperand(MF, MachineOperand::CreateImm(Offset));
+ constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI);
+ return DstReg;
+ };
+ unsigned DstReg = BuildMovK(MovZ->getOperand(0).getReg(),
+ AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0);
+ DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0);
+ BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg());
+ I.eraseFromParent();
+ return true;
} else {
I.setDesc(TII.get(AArch64::MOVaddr));
I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE);
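
For the large code model, the G_GLOBAL_VALUE handling above materializes the address with one MOVZ and three MOVK instructions, one 16-bit group per instruction (the MO_G0..MO_G3 flags, with shifts 0, 16, 32 and 48). A standalone sketch of that decomposition (not LLVM code; the function name is made up):

// Standalone sketch (not LLVM code) of the movz/movk materialization:
// each instruction supplies one 16-bit chunk of the 64-bit address.
#include <cstdint>
#include <cstdio>

static uint64_t materialize(uint64_t addr) {
  uint64_t x = (addr >> 0) & 0xFFFF;       // movz x0, #g0
  x |= ((addr >> 16) & 0xFFFF) << 16;      // movk x0, #g1, lsl #16
  x |= ((addr >> 32) & 0xFFFF) << 32;      // movk x0, #g2, lsl #32
  x |= ((addr >> 48) & 0xFFFF) << 48;      // movk x0, #g3, lsl #48
  return x;
}

int main() {
  uint64_t addr = 0x123456789ABCDEF0ull;
  std::printf("%d\n", materialize(addr) == addr); // prints 1
  return 0;
}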
@@ -880,20 +980,26 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
case TargetOpcode::G_LOAD:
case TargetOpcode::G_STORE: {
- LLT MemTy = Ty;
LLT PtrTy = MRI.getType(I.getOperand(1).getReg());
if (PtrTy != LLT::pointer(0, 64)) {
- DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy
- << ", expected: " << LLT::pointer(0, 64) << '\n');
+ LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy
+ << ", expected: " << LLT::pointer(0, 64) << '\n');
return false;
}
auto &MemOp = **I.memoperands_begin();
if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) {
- DEBUG(dbgs() << "Atomic load/store not supported yet\n");
+ LLVM_DEBUG(dbgs() << "Atomic load/store not supported yet\n");
return false;
}
+ unsigned MemSizeInBits = MemOp.getSize() * 8;
+
+ // FIXME: PR36018: Volatile loads in some cases are incorrectly selected by
+ // folding with an extend. Until we have a G_SEXTLOAD solution bail out if
+ // we hit one.
+ if (Opcode == TargetOpcode::G_LOAD && MemOp.isVolatile())
+ return false;
const unsigned PtrReg = I.getOperand(1).getReg();
#ifndef NDEBUG
@@ -909,7 +1015,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
const unsigned NewOpc =
- selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemTy.getSizeInBits());
+ selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits);
if (NewOpc == I.getOpcode())
return false;
@@ -922,7 +1028,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
if (PtrMI->getOpcode() == TargetOpcode::G_GEP) {
if (auto COff = getConstantVRegVal(PtrMI->getOperand(2).getReg(), MRI)) {
int64_t Imm = *COff;
- const unsigned Size = MemTy.getSizeInBits() / 8;
+ const unsigned Size = MemSizeInBits / 8;
const unsigned Scale = Log2_32(Size);
if ((Imm & (Size - 1)) == 0 && Imm >= 0 && Imm < (0x1000 << Scale)) {
unsigned Ptr2Reg = PtrMI->getOperand(1).getReg();
@@ -963,13 +1069,13 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
if (RB.getID() != AArch64::GPRRegBankID) {
- DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n");
+ LLVM_DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n");
return false;
}
if (Ty != LLT::scalar(64)) {
- DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty
- << ", expected: " << LLT::scalar(64) << '\n');
+ LLVM_DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty
+ << ", expected: " << LLT::scalar(64) << '\n');
return false;
}
@@ -1035,7 +1141,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
if (DstRB.getID() != SrcRB.getID()) {
- DEBUG(dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n");
+ LLVM_DEBUG(
+ dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n");
return false;
}
@@ -1052,7 +1159,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
- DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n");
+ LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n");
return false;
}
@@ -1066,7 +1173,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
SrcRC == &AArch64::GPR64RegClass) {
I.getOperand(1).setSubReg(AArch64::sub_32);
} else {
- DEBUG(dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n");
+ LLVM_DEBUG(
+ dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n");
return false;
}
@@ -1089,26 +1197,28 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI);
if (RBDst.getID() != AArch64::GPRRegBankID) {
- DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst << ", expected: GPR\n");
+ LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst
+ << ", expected: GPR\n");
return false;
}
const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI);
if (RBSrc.getID() != AArch64::GPRRegBankID) {
- DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc << ", expected: GPR\n");
+ LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc
+ << ", expected: GPR\n");
return false;
}
const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
if (DstSize == 0) {
- DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n");
+ LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n");
return false;
}
if (DstSize != 64 && DstSize > 32) {
- DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize
- << ", expected: 32 or 64\n");
+ LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize
+ << ", expected: 32 or 64\n");
return false;
}
// At this point G_ANYEXT is just like a plain COPY, but we need
@@ -1136,8 +1246,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
if (RB.getID() != AArch64::GPRRegBankID) {
- DEBUG(dbgs() << TII.getName(I.getOpcode()) << " on bank: " << RB
- << ", expected: GPR\n");
+ LLVM_DEBUG(dbgs() << TII.getName(I.getOpcode()) << " on bank: " << RB
+ << ", expected: GPR\n");
return false;
}
@@ -1145,8 +1255,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
if (DstTy == LLT::scalar(64)) {
// FIXME: Can we avoid manually doing this?
if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, MRI)) {
- DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
- << " operand\n");
+ LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
+ << " operand\n");
return false;
}
@@ -1214,8 +1324,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
case TargetOpcode::G_SELECT: {
if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(1)) {
- DEBUG(dbgs() << "G_SELECT cond has type: " << Ty
- << ", expected: " << LLT::scalar(1) << '\n');
+ LLVM_DEBUG(dbgs() << "G_SELECT cond has type: " << Ty
+ << ", expected: " << LLT::scalar(1) << '\n');
return false;
}
@@ -1253,8 +1363,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
}
case TargetOpcode::G_ICMP: {
if (Ty != LLT::scalar(32)) {
- DEBUG(dbgs() << "G_ICMP result has type: " << Ty
- << ", expected: " << LLT::scalar(32) << '\n');
+ LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty
+ << ", expected: " << LLT::scalar(32) << '\n');
return false;
}
@@ -1300,8 +1410,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
case TargetOpcode::G_FCMP: {
if (Ty != LLT::scalar(32)) {
- DEBUG(dbgs() << "G_FCMP result has type: " << Ty
- << ", expected: " << LLT::scalar(32) << '\n');
+ LLVM_DEBUG(dbgs() << "G_FCMP result has type: " << Ty
+ << ", expected: " << LLT::scalar(32) << '\n');
return false;
}
@@ -1363,8 +1473,23 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
case TargetOpcode::G_VASTART:
return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI)
: selectVaStartAAPCS(I, MF, MRI);
+ case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
+ if (!I.getOperand(0).isIntrinsicID())
+ return false;
+ if (I.getOperand(0).getIntrinsicID() != Intrinsic::trap)
+ return false;
+ BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::BRK))
+ .addImm(1);
+ I.eraseFromParent();
+ return true;
case TargetOpcode::G_IMPLICIT_DEF:
I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
+ const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
+ const unsigned DstReg = I.getOperand(0).getReg();
+ const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
+ const TargetRegisterClass *DstRC =
+ getRegClassForTypeOnBank(DstTy, DstRB, RBI);
+ RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
return true;
}
@@ -1522,6 +1647,15 @@ AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
}};
}
+void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
+ const MachineInstr &MI) const {
+ const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
+ Optional<int64_t> CstVal = getConstantVRegVal(MI.getOperand(0).getReg(), MRI);
+ assert(CstVal && "Expected constant value");
+ MIB.addImm(CstVal.getValue());
+}
+
namespace llvm {
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &TM,
diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
index 05df51202229..9b8c0a34efba 100644
--- a/lib/Target/AArch64/AArch64LegalizerInfo.cpp
+++ b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
@@ -23,110 +23,8 @@
#include "llvm/IR/Type.h"
using namespace llvm;
-
-/// FIXME: The following static functions are SizeChangeStrategy functions
-/// that are meant to temporarily mimic the behaviour of the old legalization
-/// based on doubling/halving non-legal types as closely as possible. This is
-/// not entirly possible as only legalizing the types that are exactly a power
-/// of 2 times the size of the legal types would require specifying all those
-/// sizes explicitly.
-/// In practice, not specifying those isn't a problem, and the below functions
-/// should disappear quickly as we add support for legalizing non-power-of-2
-/// sized types further.
-static void
-addAndInterleaveWithUnsupported(LegalizerInfo::SizeAndActionsVec &result,
- const LegalizerInfo::SizeAndActionsVec &v) {
- for (unsigned i = 0; i < v.size(); ++i) {
- result.push_back(v[i]);
- if (i + 1 < v[i].first && i + 1 < v.size() &&
- v[i + 1].first != v[i].first + 1)
- result.push_back({v[i].first + 1, LegalizerInfo::Unsupported});
- }
-}
-
-static LegalizerInfo::SizeAndActionsVec
-widen_1_narrow_128_ToLargest(const LegalizerInfo::SizeAndActionsVec &v) {
- assert(v.size() >= 1);
- assert(v[0].first > 2);
- LegalizerInfo::SizeAndActionsVec result = {{1, LegalizerInfo::WidenScalar},
- {2, LegalizerInfo::Unsupported}};
- addAndInterleaveWithUnsupported(result, v);
- auto Largest = result.back().first;
- assert(Largest + 1 < 128);
- result.push_back({Largest + 1, LegalizerInfo::Unsupported});
- result.push_back({128, LegalizerInfo::NarrowScalar});
- result.push_back({129, LegalizerInfo::Unsupported});
- return result;
-}
-
-static LegalizerInfo::SizeAndActionsVec
-widen_16(const LegalizerInfo::SizeAndActionsVec &v) {
- assert(v.size() >= 1);
- assert(v[0].first > 17);
- LegalizerInfo::SizeAndActionsVec result = {{1, LegalizerInfo::Unsupported},
- {16, LegalizerInfo::WidenScalar},
- {17, LegalizerInfo::Unsupported}};
- addAndInterleaveWithUnsupported(result, v);
- auto Largest = result.back().first;
- result.push_back({Largest + 1, LegalizerInfo::Unsupported});
- return result;
-}
-
-static LegalizerInfo::SizeAndActionsVec
-widen_1_8(const LegalizerInfo::SizeAndActionsVec &v) {
- assert(v.size() >= 1);
- assert(v[0].first > 9);
- LegalizerInfo::SizeAndActionsVec result = {
- {1, LegalizerInfo::WidenScalar}, {2, LegalizerInfo::Unsupported},
- {8, LegalizerInfo::WidenScalar}, {9, LegalizerInfo::Unsupported}};
- addAndInterleaveWithUnsupported(result, v);
- auto Largest = result.back().first;
- result.push_back({Largest + 1, LegalizerInfo::Unsupported});
- return result;
-}
-
-static LegalizerInfo::SizeAndActionsVec
-widen_1_8_16(const LegalizerInfo::SizeAndActionsVec &v) {
- assert(v.size() >= 1);
- assert(v[0].first > 17);
- LegalizerInfo::SizeAndActionsVec result = {
- {1, LegalizerInfo::WidenScalar}, {2, LegalizerInfo::Unsupported},
- {8, LegalizerInfo::WidenScalar}, {9, LegalizerInfo::Unsupported},
- {16, LegalizerInfo::WidenScalar}, {17, LegalizerInfo::Unsupported}};
- addAndInterleaveWithUnsupported(result, v);
- auto Largest = result.back().first;
- result.push_back({Largest + 1, LegalizerInfo::Unsupported});
- return result;
-}
-
-static LegalizerInfo::SizeAndActionsVec
-widen_1_8_16_narrowToLargest(const LegalizerInfo::SizeAndActionsVec &v) {
- assert(v.size() >= 1);
- assert(v[0].first > 17);
- LegalizerInfo::SizeAndActionsVec result = {
- {1, LegalizerInfo::WidenScalar}, {2, LegalizerInfo::Unsupported},
- {8, LegalizerInfo::WidenScalar}, {9, LegalizerInfo::Unsupported},
- {16, LegalizerInfo::WidenScalar}, {17, LegalizerInfo::Unsupported}};
- addAndInterleaveWithUnsupported(result, v);
- auto Largest = result.back().first;
- result.push_back({Largest + 1, LegalizerInfo::NarrowScalar});
- return result;
-}
-
-static LegalizerInfo::SizeAndActionsVec
-widen_1_8_16_32(const LegalizerInfo::SizeAndActionsVec &v) {
- assert(v.size() >= 1);
- assert(v[0].first > 33);
- LegalizerInfo::SizeAndActionsVec result = {
- {1, LegalizerInfo::WidenScalar}, {2, LegalizerInfo::Unsupported},
- {8, LegalizerInfo::WidenScalar}, {9, LegalizerInfo::Unsupported},
- {16, LegalizerInfo::WidenScalar}, {17, LegalizerInfo::Unsupported},
- {32, LegalizerInfo::WidenScalar}, {33, LegalizerInfo::Unsupported}};
- addAndInterleaveWithUnsupported(result, v);
- auto Largest = result.back().first;
- result.push_back({Largest + 1, LegalizerInfo::Unsupported});
- return result;
-}
+using namespace LegalizeActions;
+using namespace LegalityPredicates;
AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
using namespace TargetOpcode;
@@ -137,255 +35,356 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
const LLT s32 = LLT::scalar(32);
const LLT s64 = LLT::scalar(64);
const LLT s128 = LLT::scalar(128);
+ const LLT s256 = LLT::scalar(256);
+ const LLT s512 = LLT::scalar(512);
+ const LLT v16s8 = LLT::vector(16, 8);
+ const LLT v8s8 = LLT::vector(8, 8);
+ const LLT v4s8 = LLT::vector(4, 8);
+ const LLT v8s16 = LLT::vector(8, 16);
+ const LLT v4s16 = LLT::vector(4, 16);
+ const LLT v2s16 = LLT::vector(2, 16);
const LLT v2s32 = LLT::vector(2, 32);
const LLT v4s32 = LLT::vector(4, 32);
const LLT v2s64 = LLT::vector(2, 64);
- for (auto Ty : {p0, s1, s8, s16, s32, s64})
- setAction({G_IMPLICIT_DEF, Ty}, Legal);
-
- for (auto Ty : {s16, s32, s64, p0})
- setAction({G_PHI, Ty}, Legal);
-
- setLegalizeScalarToDifferentSizeStrategy(G_PHI, 0, widen_1_8);
-
- for (auto Ty : { s32, s64 })
- setAction({G_BSWAP, Ty}, Legal);
-
- for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR, G_SHL}) {
- // These operations naturally get the right answer when used on
- // GPR32, even if the actual type is narrower.
- for (auto Ty : {s32, s64, v2s32, v4s32, v2s64})
- setAction({BinOp, Ty}, Legal);
-
- if (BinOp != G_ADD)
- setLegalizeScalarToDifferentSizeStrategy(BinOp, 0,
- widen_1_8_16_narrowToLargest);
- }
-
- setAction({G_GEP, p0}, Legal);
- setAction({G_GEP, 1, s64}, Legal);
-
- setLegalizeScalarToDifferentSizeStrategy(G_GEP, 1, widen_1_8_16_32);
-
- setAction({G_PTR_MASK, p0}, Legal);
-
- for (unsigned BinOp : {G_LSHR, G_ASHR, G_SDIV, G_UDIV}) {
- for (auto Ty : {s32, s64})
- setAction({BinOp, Ty}, Legal);
-
- setLegalizeScalarToDifferentSizeStrategy(BinOp, 0, widen_1_8_16);
- }
-
- for (unsigned BinOp : {G_SREM, G_UREM})
- for (auto Ty : { s1, s8, s16, s32, s64 })
- setAction({BinOp, Ty}, Lower);
-
- for (unsigned Op : {G_SMULO, G_UMULO}) {
- setAction({Op, 0, s64}, Lower);
- setAction({Op, 1, s1}, Legal);
- }
-
- for (unsigned Op : {G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_SMULH, G_UMULH}) {
- for (auto Ty : { s32, s64 })
- setAction({Op, Ty}, Legal);
-
- setAction({Op, 1, s1}, Legal);
- }
-
- for (unsigned BinOp : {G_FADD, G_FSUB, G_FMA, G_FMUL, G_FDIV})
- for (auto Ty : {s32, s64})
- setAction({BinOp, Ty}, Legal);
-
- for (unsigned BinOp : {G_FREM, G_FPOW}) {
- setAction({BinOp, s32}, Libcall);
- setAction({BinOp, s64}, Libcall);
- }
-
- for (auto Ty : {s32, s64, p0}) {
- setAction({G_INSERT, Ty}, Legal);
- setAction({G_INSERT, 1, Ty}, Legal);
- }
- setLegalizeScalarToDifferentSizeStrategy(G_INSERT, 0,
- widen_1_8_16_narrowToLargest);
- for (auto Ty : {s1, s8, s16}) {
- setAction({G_INSERT, 1, Ty}, Legal);
- // FIXME: Can't widen the sources because that violates the constraints on
- // G_INSERT (It seems entirely reasonable that inputs shouldn't overlap).
- }
-
- for (auto Ty : {s1, s8, s16, s32, s64, p0})
- setAction({G_EXTRACT, Ty}, Legal);
-
- for (auto Ty : {s32, s64})
- setAction({G_EXTRACT, 1, Ty}, Legal);
-
- for (unsigned MemOp : {G_LOAD, G_STORE}) {
- for (auto Ty : {s8, s16, s32, s64, p0, v2s32})
- setAction({MemOp, Ty}, Legal);
-
- setLegalizeScalarToDifferentSizeStrategy(MemOp, 0,
- widen_1_narrow_128_ToLargest);
-
- // And everything's fine in addrspace 0.
- setAction({MemOp, 1, p0}, Legal);
- }
+ getActionDefinitionsBuilder(G_IMPLICIT_DEF)
+ .legalFor({p0, s1, s8, s16, s32, s64})
+ .clampScalar(0, s1, s64)
+ .widenScalarToNextPow2(0, 8);
+
+ getActionDefinitionsBuilder(G_PHI)
+ .legalFor({p0, s16, s32, s64})
+ .clampScalar(0, s16, s64)
+ .widenScalarToNextPow2(0);
+
+ getActionDefinitionsBuilder(G_BSWAP)
+ .legalFor({s32, s64})
+ .clampScalar(0, s16, s64)
+ .widenScalarToNextPow2(0);
+
+ getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR, G_SHL})
+ .legalFor({s32, s64, v2s32, v4s32, v2s64})
+ .clampScalar(0, s32, s64)
+ .widenScalarToNextPow2(0)
+ .clampNumElements(0, v2s32, v4s32)
+ .clampNumElements(0, v2s64, v2s64)
+ .moreElementsToNextPow2(0);
+
+ getActionDefinitionsBuilder(G_GEP)
+ .legalFor({{p0, s64}})
+ .clampScalar(1, s64, s64);
+
+ getActionDefinitionsBuilder(G_PTR_MASK).legalFor({p0});
+
+ getActionDefinitionsBuilder({G_LSHR, G_ASHR, G_SDIV, G_UDIV})
+ .legalFor({s32, s64})
+ .clampScalar(0, s32, s64)
+ .widenScalarToNextPow2(0);
+
+ getActionDefinitionsBuilder({G_SREM, G_UREM})
+ .lowerFor({s1, s8, s16, s32, s64});
+
+ getActionDefinitionsBuilder({G_SMULO, G_UMULO})
+ .lowerFor({{s64, s1}});
+
+ getActionDefinitionsBuilder({G_SMULH, G_UMULH}).legalFor({s32, s64});
+
+ getActionDefinitionsBuilder({G_UADDE, G_USUBE, G_SADDO, G_SSUBO})
+ .legalFor({{s32, s1}, {s64, s1}});
+
+ getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMA, G_FMUL, G_FDIV})
+ .legalFor({s32, s64});
+
+ getActionDefinitionsBuilder({G_FREM, G_FPOW}).libcallFor({s32, s64});
+
+ getActionDefinitionsBuilder(G_INSERT)
+ .unsupportedIf([=](const LegalityQuery &Query) {
+ return Query.Types[0].getSizeInBits() <= Query.Types[1].getSizeInBits();
+ })
+ .legalIf([=](const LegalityQuery &Query) {
+ const LLT &Ty0 = Query.Types[0];
+ const LLT &Ty1 = Query.Types[1];
+ if (Ty0 != s32 && Ty0 != s64 && Ty0 != p0)
+ return false;
+ return isPowerOf2_32(Ty1.getSizeInBits()) &&
+ (Ty1.getSizeInBits() == 1 || Ty1.getSizeInBits() >= 8);
+ })
+ .clampScalar(0, s32, s64)
+ .widenScalarToNextPow2(0)
+ .maxScalarIf(typeInSet(0, {s32}), 1, s16)
+ .maxScalarIf(typeInSet(0, {s64}), 1, s32)
+ .widenScalarToNextPow2(1);
+
+ getActionDefinitionsBuilder(G_EXTRACT)
+ .unsupportedIf([=](const LegalityQuery &Query) {
+ return Query.Types[0].getSizeInBits() >= Query.Types[1].getSizeInBits();
+ })
+ .legalIf([=](const LegalityQuery &Query) {
+ const LLT &Ty0 = Query.Types[0];
+ const LLT &Ty1 = Query.Types[1];
+ if (Ty1 != s32 && Ty1 != s64)
+ return false;
+ if (Ty1 == p0)
+ return true;
+ return isPowerOf2_32(Ty0.getSizeInBits()) &&
+ (Ty0.getSizeInBits() == 1 || Ty0.getSizeInBits() >= 8);
+ })
+ .clampScalar(1, s32, s64)
+ .widenScalarToNextPow2(1)
+ .maxScalarIf(typeInSet(1, {s32}), 0, s16)
+ .maxScalarIf(typeInSet(1, {s64}), 0, s32)
+ .widenScalarToNextPow2(0);
+
+ getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
+ .legalForTypesWithMemSize({{s32, p0, 8},
+ {s32, p0, 16},
+ {s32, p0, 32},
+ {s64, p0, 64},
+ {p0, p0, 64},
+ {v2s32, p0, 64}})
+ .clampScalar(0, s32, s64)
+ .widenScalarToNextPow2(0)
+ // TODO: We could support sum-of-pow2's but the lowering code doesn't know
+ // how to do that yet.
+ .unsupportedIfMemSizeNotPow2()
+ // Lower anything left over into G_*EXT and G_LOAD
+ .lower();
+
+ getActionDefinitionsBuilder(G_LOAD)
+ .legalForTypesWithMemSize({{s8, p0, 8},
+ {s16, p0, 16},
+ {s32, p0, 32},
+ {s64, p0, 64},
+ {p0, p0, 64},
+ {v2s32, p0, 64}})
+ // These extends are also legal
+ .legalForTypesWithMemSize({{s32, p0, 8},
+ {s32, p0, 16}})
+ .clampScalar(0, s8, s64)
+ .widenScalarToNextPow2(0)
+ // TODO: We could support sum-of-pow2's but the lowering code doesn't know
+ // how to do that yet.
+ .unsupportedIfMemSizeNotPow2()
+ // Lower any any-extending loads left into G_ANYEXT and G_LOAD
+ .lowerIf([=](const LegalityQuery &Query) {
+ return Query.Types[0].getSizeInBits() != Query.MMODescrs[0].Size * 8;
+ })
+ .clampNumElements(0, v2s32, v2s32);
+
+ getActionDefinitionsBuilder(G_STORE)
+ .legalForTypesWithMemSize({{s8, p0, 8},
+ {s16, p0, 16},
+ {s32, p0, 32},
+ {s64, p0, 64},
+ {p0, p0, 64},
+ {v2s32, p0, 64}})
+ .clampScalar(0, s8, s64)
+ .widenScalarToNextPow2(0)
+ // TODO: We could support sum-of-pow2's but the lowering code doesn't know
+ // how to do that yet.
+ .unsupportedIfMemSizeNotPow2()
+ .lowerIf([=](const LegalityQuery &Query) {
+ return Query.Types[0].isScalar() &&
+ Query.Types[0].getSizeInBits() != Query.MMODescrs[0].Size * 8;
+ })
+ .clampNumElements(0, v2s32, v2s32);
// Constants
- for (auto Ty : {s32, s64}) {
- setAction({TargetOpcode::G_CONSTANT, Ty}, Legal);
- setAction({TargetOpcode::G_FCONSTANT, Ty}, Legal);
- }
-
- setAction({G_CONSTANT, p0}, Legal);
-
- setLegalizeScalarToDifferentSizeStrategy(G_CONSTANT, 0, widen_1_8_16);
- setLegalizeScalarToDifferentSizeStrategy(G_FCONSTANT, 0, widen_16);
-
- setAction({G_ICMP, 1, s32}, Legal);
- setAction({G_ICMP, 1, s64}, Legal);
- setAction({G_ICMP, 1, p0}, Legal);
-
- setLegalizeScalarToDifferentSizeStrategy(G_ICMP, 0, widen_1_8_16);
- setLegalizeScalarToDifferentSizeStrategy(G_FCMP, 0, widen_1_8_16);
- setLegalizeScalarToDifferentSizeStrategy(G_ICMP, 1, widen_1_8_16);
-
- setAction({G_ICMP, s32}, Legal);
- setAction({G_FCMP, s32}, Legal);
- setAction({G_FCMP, 1, s32}, Legal);
- setAction({G_FCMP, 1, s64}, Legal);
+ getActionDefinitionsBuilder(G_CONSTANT)
+ .legalFor({p0, s32, s64})
+ .clampScalar(0, s32, s64)
+ .widenScalarToNextPow2(0);
+ getActionDefinitionsBuilder(G_FCONSTANT)
+ .legalFor({s32, s64})
+ .clampScalar(0, s32, s64);
+
+ getActionDefinitionsBuilder(G_ICMP)
+ .legalFor({{s32, s32}, {s32, s64}, {s32, p0}})
+ .clampScalar(0, s32, s32)
+ .clampScalar(1, s32, s64)
+ .widenScalarToNextPow2(1);
+
+ getActionDefinitionsBuilder(G_FCMP)
+ .legalFor({{s32, s32}, {s32, s64}})
+ .clampScalar(0, s32, s32)
+ .clampScalar(1, s32, s64)
+ .widenScalarToNextPow2(1);
// Extensions
- for (auto Ty : { s1, s8, s16, s32, s64 }) {
- setAction({G_ZEXT, Ty}, Legal);
- setAction({G_SEXT, Ty}, Legal);
- setAction({G_ANYEXT, Ty}, Legal);
- }
+ getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
+ .legalForCartesianProduct({s8, s16, s32, s64}, {s1, s8, s16, s32});
// FP conversions
- for (auto Ty : { s16, s32 }) {
- setAction({G_FPTRUNC, Ty}, Legal);
- setAction({G_FPEXT, 1, Ty}, Legal);
- }
-
- for (auto Ty : { s32, s64 }) {
- setAction({G_FPTRUNC, 1, Ty}, Legal);
- setAction({G_FPEXT, Ty}, Legal);
- }
+ getActionDefinitionsBuilder(G_FPTRUNC).legalFor(
+ {{s16, s32}, {s16, s64}, {s32, s64}});
+ getActionDefinitionsBuilder(G_FPEXT).legalFor(
+ {{s32, s16}, {s64, s16}, {s64, s32}});
// Conversions
- for (auto Ty : { s32, s64 }) {
- setAction({G_FPTOSI, 0, Ty}, Legal);
- setAction({G_FPTOUI, 0, Ty}, Legal);
- setAction({G_SITOFP, 1, Ty}, Legal);
- setAction({G_UITOFP, 1, Ty}, Legal);
- }
- setLegalizeScalarToDifferentSizeStrategy(G_FPTOSI, 0, widen_1_8_16);
- setLegalizeScalarToDifferentSizeStrategy(G_FPTOUI, 0, widen_1_8_16);
- setLegalizeScalarToDifferentSizeStrategy(G_SITOFP, 1, widen_1_8_16);
- setLegalizeScalarToDifferentSizeStrategy(G_UITOFP, 1, widen_1_8_16);
-
- for (auto Ty : { s32, s64 }) {
- setAction({G_FPTOSI, 1, Ty}, Legal);
- setAction({G_FPTOUI, 1, Ty}, Legal);
- setAction({G_SITOFP, 0, Ty}, Legal);
- setAction({G_UITOFP, 0, Ty}, Legal);
- }
+ getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
+ .legalForCartesianProduct({s32, s64})
+ .clampScalar(0, s32, s64)
+ .widenScalarToNextPow2(0)
+ .clampScalar(1, s32, s64)
+ .widenScalarToNextPow2(1);
+
+ getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
+ .legalForCartesianProduct({s32, s64})
+ .clampScalar(1, s32, s64)
+ .widenScalarToNextPow2(1)
+ .clampScalar(0, s32, s64)
+ .widenScalarToNextPow2(0);
// Control-flow
- for (auto Ty : {s1, s8, s16, s32})
- setAction({G_BRCOND, Ty}, Legal);
- setAction({G_BRINDIRECT, p0}, Legal);
+ getActionDefinitionsBuilder(G_BRCOND).legalFor({s1, s8, s16, s32});
+ getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0});
// Select
- setLegalizeScalarToDifferentSizeStrategy(G_SELECT, 0, widen_1_8_16);
-
- for (auto Ty : {s32, s64, p0})
- setAction({G_SELECT, Ty}, Legal);
-
- setAction({G_SELECT, 1, s1}, Legal);
+ getActionDefinitionsBuilder(G_SELECT)
+ .legalFor({{s32, s1}, {s64, s1}, {p0, s1}})
+ .clampScalar(0, s32, s64)
+ .widenScalarToNextPow2(0);
// Pointer-handling
- setAction({G_FRAME_INDEX, p0}, Legal);
- setAction({G_GLOBAL_VALUE, p0}, Legal);
-
- for (auto Ty : {s1, s8, s16, s32, s64})
- setAction({G_PTRTOINT, 0, Ty}, Legal);
+ getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});
+ getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});
- setAction({G_PTRTOINT, 1, p0}, Legal);
+ getActionDefinitionsBuilder(G_PTRTOINT)
+ .legalForCartesianProduct({s1, s8, s16, s32, s64}, {p0})
+ .maxScalar(0, s64)
+ .widenScalarToNextPow2(0, /*Min*/ 8);
- setAction({G_INTTOPTR, 0, p0}, Legal);
- setAction({G_INTTOPTR, 1, s64}, Legal);
+ getActionDefinitionsBuilder(G_INTTOPTR)
+ .unsupportedIf([&](const LegalityQuery &Query) {
+ return Query.Types[0].getSizeInBits() != Query.Types[1].getSizeInBits();
+ })
+ .legalFor({{p0, s64}});
// Casts for 32 and 64-bit width type are just copies.
// Same for 128-bit width type, except they are on the FPR bank.
- for (auto Ty : {s1, s8, s16, s32, s64, s128}) {
- setAction({G_BITCAST, 0, Ty}, Legal);
- setAction({G_BITCAST, 1, Ty}, Legal);
- }
-
- // For the sake of copying bits around, the type does not really
- // matter as long as it fits a register.
- for (int EltSize = 8; EltSize <= 64; EltSize *= 2) {
- setAction({G_BITCAST, 0, LLT::vector(128/EltSize, EltSize)}, Legal);
- setAction({G_BITCAST, 1, LLT::vector(128/EltSize, EltSize)}, Legal);
- if (EltSize >= 64)
- continue;
-
- setAction({G_BITCAST, 0, LLT::vector(64/EltSize, EltSize)}, Legal);
- setAction({G_BITCAST, 1, LLT::vector(64/EltSize, EltSize)}, Legal);
- if (EltSize >= 32)
- continue;
-
- setAction({G_BITCAST, 0, LLT::vector(32/EltSize, EltSize)}, Legal);
- setAction({G_BITCAST, 1, LLT::vector(32/EltSize, EltSize)}, Legal);
- }
+ getActionDefinitionsBuilder(G_BITCAST)
+ // FIXME: This is wrong since G_BITCAST is not allowed to change the
+ // number of bits but it's what the previous code described and fixing
+ // it breaks tests.
+ .legalForCartesianProduct({s1, s8, s16, s32, s64, s128, v16s8, v8s8, v4s8,
+ v8s16, v4s16, v2s16, v4s32, v2s32, v2s64});
- setAction({G_VASTART, p0}, Legal);
+ getActionDefinitionsBuilder(G_VASTART).legalFor({p0});
// va_list must be a pointer, but most sized types are pretty easy to handle
// as the destination.
- setAction({G_VAARG, 1, p0}, Legal);
-
- for (auto Ty : {s8, s16, s32, s64, p0})
- setAction({G_VAARG, Ty}, Custom);
+ getActionDefinitionsBuilder(G_VAARG)
+ .customForCartesianProduct({s8, s16, s32, s64, p0}, {p0})
+ .clampScalar(0, s8, s64)
+ .widenScalarToNextPow2(0, /*Min*/ 8);
if (ST.hasLSE()) {
- for (auto Ty : {s8, s16, s32, s64}) {
- setAction({G_ATOMIC_CMPXCHG_WITH_SUCCESS, Ty}, Lower);
- setAction({G_ATOMIC_CMPXCHG, Ty}, Legal);
- }
- setAction({G_ATOMIC_CMPXCHG, 1, p0}, Legal);
-
- for (unsigned Op :
- {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND,
- G_ATOMICRMW_OR, G_ATOMICRMW_XOR, G_ATOMICRMW_MIN, G_ATOMICRMW_MAX,
- G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX}) {
- for (auto Ty : {s8, s16, s32, s64}) {
- setAction({Op, Ty}, Legal);
- }
- setAction({Op, 1, p0}, Legal);
- }
+ getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
+ .lowerIf(all(
+ typeInSet(0, {s8, s16, s32, s64}), typeIs(1, s1), typeIs(2, p0),
+ atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Monotonic)));
+
+ getActionDefinitionsBuilder(
+ {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND,
+ G_ATOMICRMW_OR, G_ATOMICRMW_XOR, G_ATOMICRMW_MIN, G_ATOMICRMW_MAX,
+ G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX, G_ATOMIC_CMPXCHG})
+ .legalIf(all(
+ typeInSet(0, {s8, s16, s32, s64}), typeIs(1, p0),
+ atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Monotonic)));
}
// Merge/Unmerge
- for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES})
- for (int Sz : {8, 16, 32, 64, 128, 192, 256, 384, 512}) {
- LLT ScalarTy = LLT::scalar(Sz);
- setAction({Op, ScalarTy}, Legal);
- setAction({Op, 1, ScalarTy}, Legal);
- if (Sz < 32)
- continue;
- for (int EltSize = 8; EltSize <= 64; EltSize *= 2) {
- if (EltSize >= Sz)
- continue;
- LLT VecTy = LLT::vector(Sz / EltSize, EltSize);
- setAction({Op, VecTy}, Legal);
- setAction({Op, 1, VecTy}, Legal);
+ for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
+ unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
+ unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
+
+ auto notValidElt = [](const LegalityQuery &Query, unsigned TypeIdx) {
+ const LLT &Ty = Query.Types[TypeIdx];
+ if (Ty.isVector()) {
+ const LLT &EltTy = Ty.getElementType();
+ if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
+ return true;
+ if (!isPowerOf2_32(EltTy.getSizeInBits()))
+ return true;
}
- }
+ return false;
+ };
+ auto scalarize =
+ [](const LegalityQuery &Query, unsigned TypeIdx) {
+ const LLT &Ty = Query.Types[TypeIdx];
+ return std::make_pair(TypeIdx, Ty.getElementType());
+ };
+
+ // FIXME: This rule is horrible, but specifies the same as what we had
+ // before with the particularly strange definitions removed (e.g.
+ // s8 = G_MERGE_VALUES s32, s32).
+ // Part of the complexity comes from these ops being extremely flexible. For
+ // example, you can build/decompose vectors with it, concatenate vectors,
+ // etc. and in addition to this you can also bitcast with it at the same
+ // time. We've been considering breaking it up into multiple ops to make it
+ // more manageable throughout the backend.
+ getActionDefinitionsBuilder(Op)
+ // Break up vectors with weird elements into scalars
+ .fewerElementsIf(
+ [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
+ [=](const LegalityQuery &Query) { return scalarize(Query, 0); })
+ .fewerElementsIf(
+ [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
+ [=](const LegalityQuery &Query) { return scalarize(Query, 1); })
+ // Clamp the big scalar to s8-s512 and make it either a power of 2, 192,
+ // or 384.
+ .clampScalar(BigTyIdx, s8, s512)
+ .widenScalarIf(
+ [=](const LegalityQuery &Query) {
+ const LLT &Ty = Query.Types[BigTyIdx];
+ return !isPowerOf2_32(Ty.getSizeInBits()) &&
+ Ty.getSizeInBits() % 64 != 0;
+ },
+ [=](const LegalityQuery &Query) {
+ // Pick the next power of 2, or a multiple of 64 over 128.
+ // Whichever is smaller.
+ const LLT &Ty = Query.Types[BigTyIdx];
+ unsigned NewSizeInBits = 1
+ << Log2_32_Ceil(Ty.getSizeInBits() + 1);
+ if (NewSizeInBits >= 256) {
+ unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
+ if (RoundedTo < NewSizeInBits)
+ NewSizeInBits = RoundedTo;
+ }
+ return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
+ })
+ // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
+ // worth considering the multiples of 64 since 2*192 and 2*384 are not
+ // valid.
+ .clampScalar(LitTyIdx, s8, s256)
+ .widenScalarToNextPow2(LitTyIdx, /*Min*/ 8)
+ // So at this point, we have s8, s16, s32, s64, s128, s192, s256, s384,
+ // s512, <X x s8>, <X x s16>, <X x s32>, or <X x s64>.
+ // At this point it's simple enough to accept the legal types.
+ .legalIf([=](const LegalityQuery &Query) {
+ const LLT &BigTy = Query.Types[BigTyIdx];
+ const LLT &LitTy = Query.Types[LitTyIdx];
+ if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
+ return false;
+ if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
+ return false;
+ return BigTy.getSizeInBits() % LitTy.getSizeInBits() == 0;
+ })
+ // Any vectors left are the wrong size. Scalarize them.
+ .fewerElementsIf([](const LegalityQuery &Query) { return true; },
+ [](const LegalityQuery &Query) {
+ return std::make_pair(
+ 0, Query.Types[0].getElementType());
+ })
+ .fewerElementsIf([](const LegalityQuery &Query) { return true; },
+ [](const LegalityQuery &Query) {
+ return std::make_pair(
+ 1, Query.Types[1].getElementType());
+ });
+ }
computeTables();
+ verify(*ST.getInstrInfo());
}
bool AArch64LegalizerInfo::legalizeCustom(MachineInstr &MI,
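The rewrite above replaces the per-size setAction() tables and the SizeChangeStrategy helpers with the rule-based builder: rules attached to an opcode are tried in order, and the first one that applies decides the action, so the clamp and widen combinators take over the role of the old widen_* strategies. A minimal sketch of how one such chain reads, using only combinators that appear in the patch (the s32/s64/v2s32/... LLT locals are assumed to be in scope as in the constructor above):

    // Sketch only: a rule chain for a single opcode, read top to bottom.
    getActionDefinitionsBuilder(G_ADD)
        .legalFor({s32, s64, v2s32, v4s32, v2s64}) // stop here if already legal
        .clampScalar(0, s32, s64)                  // s1/s8/s16 results widen to s32
        .widenScalarToNextPow2(0)                  // e.g. s48 becomes s64
        .clampNumElements(0, v2s32, v4s32)         // keep 32-bit vectors at 2-4 lanes
        .moreElementsToNextPow2(0);                // pad odd element counts upward
    // A G_ADD on s8, for example, fails legalFor, is widened to s32 by
    // clampScalar, and the widened instruction is then legal on the next query.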
diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 8a29456430b9..4a19ecd69103 100644
--- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -98,8 +98,8 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
const TargetRegisterInfo *TRI;
const AArch64Subtarget *Subtarget;
- // Track which registers have been modified and used.
- BitVector ModifiedRegs, UsedRegs;
+ // Track which register units have been modified and used.
+ LiveRegUnits ModifiedRegUnits, UsedRegUnits;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AAResultsWrapperPass>();
@@ -702,16 +702,17 @@ AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
.addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
.add(BaseRegOp)
.addImm(OffsetImm)
- .setMemRefs(I->mergeMemRefsWith(*MergeMI));
+ .setMemRefs(I->mergeMemRefsWith(*MergeMI))
+ .setMIFlags(I->mergeFlagsWith(*MergeMI));
(void)MIB;
- DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n ");
- DEBUG(I->print(dbgs()));
- DEBUG(dbgs() << " ");
- DEBUG(MergeMI->print(dbgs()));
- DEBUG(dbgs() << " with instruction:\n ");
- DEBUG(((MachineInstr *)MIB)->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n ");
+ LLVM_DEBUG(I->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " ");
+ LLVM_DEBUG(MergeMI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " with instruction:\n ");
+ LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
// Erase the old instructions.
I->eraseFromParent();
@@ -818,15 +819,17 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
.add(RegOp1)
.add(BaseRegOp)
.addImm(OffsetImm)
- .setMemRefs(I->mergeMemRefsWith(*Paired));
+ .setMemRefs(I->mergeMemRefsWith(*Paired))
+ .setMIFlags(I->mergeFlagsWith(*Paired));
(void)MIB;
- DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n ");
- DEBUG(I->print(dbgs()));
- DEBUG(dbgs() << " ");
- DEBUG(Paired->print(dbgs()));
- DEBUG(dbgs() << " with instruction:\n ");
+ LLVM_DEBUG(
+ dbgs() << "Creating pair load/store. Replacing instructions:\n ");
+ LLVM_DEBUG(I->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " ");
+ LLVM_DEBUG(Paired->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " with instruction:\n ");
if (SExtIdx != -1) {
// Generate the sign extension for the proper result of the ldp.
// I.e., with X1, that would be:
@@ -840,8 +843,8 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
// Update the result of LDP to use the W instead of the X variant.
DstMO.setReg(DstRegW);
- DEBUG(((MachineInstr *)MIB)->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
// Make the machine verifier happy by providing a definition for
// the X register.
// Insert this definition right after the generated LDP, i.e., before
@@ -858,12 +861,12 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
.addImm(0)
.addImm(31);
(void)MIBSXTW;
- DEBUG(dbgs() << " Extend operand:\n ");
- DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " Extend operand:\n ");
+ LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
} else {
- DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+ LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
}
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "\n");
// Erase the old instructions.
I->eraseFromParent();
@@ -901,9 +904,9 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
break;
}
}
- DEBUG(dbgs() << "Remove load instruction:\n ");
- DEBUG(LoadI->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Remove load instruction:\n ");
+ LLVM_DEBUG(LoadI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
LoadI->eraseFromParent();
return NextI;
}
@@ -913,7 +916,8 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
.addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
.add(StMO)
- .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
+ .setMIFlags(LoadI->getFlags());
} else {
// FIXME: Currently we disable this transformation in big-endian targets as
// performance and correctness are verified only in little-endian.
@@ -954,7 +958,8 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
DestReg)
.add(StMO)
- .addImm(AndMaskEncoded);
+ .addImm(AndMaskEncoded)
+ .setMIFlags(LoadI->getFlags());
} else {
BitExtMI =
BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
@@ -962,7 +967,8 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
DestReg)
.add(StMO)
.addImm(Immr)
- .addImm(Imms);
+ .addImm(Imms)
+ .setMIFlags(LoadI->getFlags());
}
}
@@ -974,48 +980,21 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
break;
}
- DEBUG(dbgs() << "Promoting load by replacing :\n ");
- DEBUG(StoreI->print(dbgs()));
- DEBUG(dbgs() << " ");
- DEBUG(LoadI->print(dbgs()));
- DEBUG(dbgs() << " with instructions:\n ");
- DEBUG(StoreI->print(dbgs()));
- DEBUG(dbgs() << " ");
- DEBUG((BitExtMI)->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n ");
+ LLVM_DEBUG(StoreI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " ");
+ LLVM_DEBUG(LoadI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " with instructions:\n ");
+ LLVM_DEBUG(StoreI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " ");
+ LLVM_DEBUG((BitExtMI)->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
// Erase the old instructions.
LoadI->eraseFromParent();
return NextI;
}
-/// trackRegDefsUses - Remember what registers the specified instruction uses
-/// and modifies.
-static void trackRegDefsUses(const MachineInstr &MI, BitVector &ModifiedRegs,
- BitVector &UsedRegs,
- const TargetRegisterInfo *TRI) {
- for (const MachineOperand &MO : MI.operands()) {
- if (MO.isRegMask())
- ModifiedRegs.setBitsNotInMask(MO.getRegMask());
-
- if (!MO.isReg())
- continue;
- unsigned Reg = MO.getReg();
- if (!Reg)
- continue;
- if (MO.isDef()) {
- // WZR/XZR are not modified even when used as a destination register.
- if (Reg != AArch64::WZR && Reg != AArch64::XZR)
- for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
- ModifiedRegs.set(*AI);
- } else {
- assert(MO.isUse() && "Reg operand not a def and not a use?!?");
- for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
- UsedRegs.set(*AI);
- }
- }
-}
-
static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
// Convert the byte-offset used by unscaled into an "element" offset used
// by the scaled pair load/store instructions.
@@ -1073,10 +1052,10 @@ bool AArch64LoadStoreOpt::findMatchingStore(
if (MBBI == B)
return false;
- // Track which registers have been modified and used between the first insn
- // and the second insn.
- ModifiedRegs.reset();
- UsedRegs.reset();
+ // Track which register units have been modified and used between the first
+ // insn and the second insn.
+ ModifiedRegUnits.clear();
+ UsedRegUnits.clear();
unsigned Count = 0;
do {
@@ -1095,7 +1074,7 @@ bool AArch64LoadStoreOpt::findMatchingStore(
if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
BaseReg == getLdStBaseOp(MI).getReg() &&
isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
- !ModifiedRegs[getLdStRegOp(MI).getReg()]) {
+ ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) {
StoreI = MBBI;
return true;
}
@@ -1103,12 +1082,12 @@ bool AArch64LoadStoreOpt::findMatchingStore(
if (MI.isCall())
return false;
- // Update modified / uses register lists.
- trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ // Update modified / uses register units.
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
// Otherwise, if the base register is modified, we have no match, so
// return early.
- if (ModifiedRegs[BaseReg])
+ if (!ModifiedRegUnits.available(BaseReg))
return false;
// If we encounter a store aliased with the load, return early.
@@ -1186,10 +1165,10 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
- // Track which registers have been modified and used between the first insn
- // (inclusive) and the second insn.
- ModifiedRegs.reset();
- UsedRegs.reset();
+ // Track which register units have been modified and used between the first
+ // insn (inclusive) and the second insn.
+ ModifiedRegUnits.clear();
+ UsedRegUnits.clear();
// Remember any instructions that read/write memory between FirstMI and MI.
SmallVector<MachineInstr *, 4> MemInsns;
@@ -1224,7 +1203,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// If the unscaled offset isn't a multiple of the MemSize, we can't
// pair the operations together: bail and keep looking.
if (MIOffset % MemSize) {
- trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
+ UsedRegUnits, TRI);
MemInsns.push_back(&MI);
continue;
}
@@ -1244,7 +1224,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// the stored value is the same (i.e., WZR).
if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
(IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
- trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
+ UsedRegUnits, TRI);
MemInsns.push_back(&MI);
continue;
}
@@ -1254,7 +1235,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// immediate offset of merging these instructions is out of range for
// a pairwise instruction, bail and keep looking.
if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
- trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
+ UsedRegUnits, TRI);
MemInsns.push_back(&MI);
continue;
}
@@ -1262,7 +1244,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// can't express the offset of the unscaled input, bail and keep
// looking.
if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
- trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
+ UsedRegUnits, TRI);
MemInsns.push_back(&MI);
continue;
}
@@ -1271,7 +1254,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// and keep looking. A load-pair instruction with both destination
// registers the same is UNPREDICTABLE and will result in an exception.
if (MayLoad && Reg == getLdStRegOp(MI).getReg()) {
- trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
+ TRI);
MemInsns.push_back(&MI);
continue;
}
@@ -1280,8 +1264,9 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// the two instructions and none of the instructions between the second
// and first alias with the second, we can combine the second into the
// first.
- if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&
- !(MI.mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
+ if (ModifiedRegUnits.available(getLdStRegOp(MI).getReg()) &&
+ !(MI.mayLoad() &&
+ !UsedRegUnits.available(getLdStRegOp(MI).getReg())) &&
!mayAlias(MI, MemInsns, AA)) {
Flags.setMergeForward(false);
return MBBI;
@@ -1291,8 +1276,9 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// between the two instructions and none of the instructions between the
// first and the second alias with the first, we can combine the first
// into the second.
- if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] &&
- !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) &&
+ if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg()) &&
+ !(MayLoad &&
+ !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg())) &&
!mayAlias(FirstMI, MemInsns, AA)) {
Flags.setMergeForward(true);
return MBBI;
@@ -1307,12 +1293,12 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
if (MI.isCall())
return E;
- // Update modified / uses register lists.
- trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ // Update modified / uses register units.
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
// Otherwise, if the base register is modified, we have no match, so
// return early.
- if (ModifiedRegs[BaseReg])
+ if (!ModifiedRegUnits.available(BaseReg))
return E;
// Update list of instructions that read/write memory.
@@ -1352,7 +1338,8 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
.add(getLdStRegOp(*I))
.add(getLdStBaseOp(*I))
.addImm(Value)
- .setMemRefs(I->memoperands_begin(), I->memoperands_end());
+ .setMemRefs(I->memoperands_begin(), I->memoperands_end())
+ .setMIFlags(I->mergeFlagsWith(*Update));
} else {
// Paired instruction.
int Scale = getMemScale(*I);
@@ -1362,24 +1349,25 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
.add(getLdStRegOp(*I, 1))
.add(getLdStBaseOp(*I))
.addImm(Value / Scale)
- .setMemRefs(I->memoperands_begin(), I->memoperands_end());
+ .setMemRefs(I->memoperands_begin(), I->memoperands_end())
+ .setMIFlags(I->mergeFlagsWith(*Update));
}
(void)MIB;
if (IsPreIdx) {
++NumPreFolded;
- DEBUG(dbgs() << "Creating pre-indexed load/store.");
+ LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
} else {
++NumPostFolded;
- DEBUG(dbgs() << "Creating post-indexed load/store.");
+ LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
}
- DEBUG(dbgs() << " Replacing instructions:\n ");
- DEBUG(I->print(dbgs()));
- DEBUG(dbgs() << " ");
- DEBUG(Update->print(dbgs()));
- DEBUG(dbgs() << " with instruction:\n ");
- DEBUG(((MachineInstr *)MIB)->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
+ LLVM_DEBUG(I->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " ");
+ LLVM_DEBUG(Update->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " with instruction:\n ");
+ LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
// Erase the old instructions for the block.
I->eraseFromParent();
@@ -1466,10 +1454,10 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
return E;
}
- // Track which registers have been modified and used between the first insn
- // (inclusive) and the second insn.
- ModifiedRegs.reset();
- UsedRegs.reset();
+ // Track which register units have been modified and used between the first
+ // insn (inclusive) and the second insn.
+ ModifiedRegUnits.clear();
+ UsedRegUnits.clear();
++MBBI;
for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
MachineInstr &MI = *MBBI;
@@ -1484,11 +1472,12 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
return MBBI;
// Update the status of what the instruction clobbered and used.
- trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
// Otherwise, if the base register is used or modified, we have no match, so
// return early.
- if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
+ if (!ModifiedRegUnits.available(BaseReg) ||
+ !UsedRegUnits.available(BaseReg))
return E;
}
return E;
@@ -1517,10 +1506,10 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
return E;
}
- // Track which registers have been modified and used between the first insn
- // (inclusive) and the second insn.
- ModifiedRegs.reset();
- UsedRegs.reset();
+ // Track which register units have been modified and used between the first
+ // insn (inclusive) and the second insn.
+ ModifiedRegUnits.clear();
+ UsedRegUnits.clear();
unsigned Count = 0;
do {
--MBBI;
@@ -1536,11 +1525,12 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
return MBBI;
// Update the status of what the instruction clobbered and used.
- trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
// Otherwise, if the base register is used or modified, we have no match, so
// return early.
- if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
+ if (!ModifiedRegUnits.available(BaseReg) ||
+ !UsedRegUnits.available(BaseReg))
return E;
} while (MBBI != B && Count < Limit);
return E;
@@ -1767,11 +1757,11 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
TRI = Subtarget->getRegisterInfo();
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- // Resize the modified and used register bitfield trackers. We do this once
- // per function and then clear the bitfield each time we optimize a load or
- // store.
- ModifiedRegs.resize(TRI->getNumRegs());
- UsedRegs.resize(TRI->getNumRegs());
+ // Resize the modified and used register unit trackers. We do this once
+ // per function and then clear the register units each time we optimize a load
+ // or store.
+ ModifiedRegUnits.init(*TRI);
+ UsedRegUnits.init(*TRI);
bool Modified = false;
bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
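The pass now tracks register units instead of whole registers, so aliasing between the W and X views of a GPR (and the B/H/S/D/Q views of an FPR) falls out of the representation rather than needing MCRegAliasIterator walks. A compressed sketch of the scan idiom used after this change, built only from calls that appear in the patch; First, Second, BaseReg and TRI stand for the locals of the real search loops:

    LiveRegUnits ModifiedRegUnits, UsedRegUnits;
    ModifiedRegUnits.init(*TRI);   // sized once per function in runOnMachineFunction
    UsedRegUnits.init(*TRI);

    ModifiedRegUnits.clear();      // reset before each forward/backward search
    UsedRegUnits.clear();
    for (MachineInstr &MI : make_range(First, Second)) {
      // Fold MI's defs into ModifiedRegUnits and its uses into UsedRegUnits.
      LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
      // available(Reg) is the unit-based replacement for !ModifiedRegs[Reg]:
      // true only if no unit of Reg has been touched so far.
      if (!ModifiedRegUnits.available(BaseReg))
        break;                     // base register clobbered, no match possible
    }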
diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp
index 65dae03a24db..6c0263585933 100644
--- a/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -18,13 +18,13 @@
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -173,11 +173,20 @@ MCOperand AArch64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO,
MCOperand AArch64MCInstLower::lowerSymbolOperandCOFF(const MachineOperand &MO,
MCSymbol *Sym) const {
- MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
- const MCExpr *Expr = MCSymbolRefExpr::create(Sym, RefKind, Ctx);
+ AArch64MCExpr::VariantKind RefKind = AArch64MCExpr::VK_NONE;
+ if (MO.getTargetFlags() & AArch64II::MO_TLS) {
+ if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGEOFF)
+ RefKind = AArch64MCExpr::VK_SECREL_LO12;
+ else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) ==
+ AArch64II::MO_HI12)
+ RefKind = AArch64MCExpr::VK_SECREL_HI12;
+ }
+ const MCExpr *Expr =
+ MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Ctx);
if (!MO.isJTI() && MO.getOffset())
Expr = MCBinaryExpr::createAdd(
Expr, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
+ Expr = AArch64MCExpr::create(Expr, RefKind, Ctx);
return MCOperand::createExpr(Expr);
}
diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 9f354c009461..798340f8fed8 100644
--- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -15,6 +15,7 @@
#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -48,33 +49,33 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
/// determineCalleeSaves().
bool HasStackFrame = false;
- /// \brief Amount of stack frame size, not including callee-saved registers.
+ /// Amount of stack frame size, not including callee-saved registers.
unsigned LocalStackSize;
- /// \brief Amount of stack frame size used for saving callee-saved registers.
+ /// Amount of stack frame size used for saving callee-saved registers.
unsigned CalleeSavedStackSize;
- /// \brief Number of TLS accesses using the special (combinable)
+ /// Number of TLS accesses using the special (combinable)
/// _TLS_MODULE_BASE_ symbol.
unsigned NumLocalDynamicTLSAccesses = 0;
- /// \brief FrameIndex for start of varargs area for arguments passed on the
+ /// FrameIndex for start of varargs area for arguments passed on the
/// stack.
int VarArgsStackIndex = 0;
- /// \brief FrameIndex for start of varargs area for arguments passed in
+ /// FrameIndex for start of varargs area for arguments passed in
/// general purpose registers.
int VarArgsGPRIndex = 0;
- /// \brief Size of the varargs area for arguments passed in general purpose
+ /// Size of the varargs area for arguments passed in general purpose
/// registers.
unsigned VarArgsGPRSize = 0;
- /// \brief FrameIndex for start of varargs area for arguments passed in
+ /// FrameIndex for start of varargs area for arguments passed in
/// floating-point registers.
int VarArgsFPRIndex = 0;
- /// \brief Size of the varargs area for arguments passed in floating-point
+ /// Size of the varargs area for arguments passed in floating-point
/// registers.
unsigned VarArgsFPRSize = 0;
@@ -90,11 +91,22 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
/// other stack allocations.
bool CalleeSaveStackHasFreeSpace = false;
+ /// Has a value when it is known whether or not the function uses a
+ /// redzone, and no value otherwise.
+ /// Initialized during frame lowering, unless the function has the noredzone
+ /// attribute, in which case it is set to false at construction.
+ Optional<bool> HasRedZone;
+
public:
AArch64FunctionInfo() = default;
explicit AArch64FunctionInfo(MachineFunction &MF) {
(void)MF;
+
+ // If we already know that the function doesn't have a redzone, set
+ // HasRedZone here.
+ if (MF.getFunction().hasFnAttribute(Attribute::NoRedZone))
+ HasRedZone = false;
}
unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; }
@@ -132,6 +144,9 @@ public:
return NumLocalDynamicTLSAccesses;
}
+ Optional<bool> hasRedZone() const { return HasRedZone; }
+ void setHasRedZone(bool s) { HasRedZone = s; }
+
int getVarArgsStackIndex() const { return VarArgsStackIndex; }
void setVarArgsStackIndex(int Index) { VarArgsStackIndex = Index; }
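HasRedZone is deliberately tri-state: unset until frame lowering has decided, then cached so later queries agree with the noredzone fast path set in the constructor. A small sketch of the intended read/write protocol, assuming a caller with a MachineFunction in hand; the canUseRedZone() call is assumed to be the existing AArch64FrameLowering helper and is shown only for illustration:

    AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
    if (AFI->hasRedZone().hasValue()) {
      // Already decided, either by the noredzone attribute or by frame lowering.
      bool UsesRedZone = AFI->hasRedZone().getValue();
      (void)UsesRedZone;
    } else {
      // Not decided yet; frame lowering computes the answer once and records it:
      // AFI->setHasRedZone(canUseRedZone(MF));
    }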
diff --git a/lib/Target/AArch64/AArch64MacroFusion.cpp b/lib/Target/AArch64/AArch64MacroFusion.cpp
index 6930c816b5ae..bc0168e783be 100644
--- a/lib/Target/AArch64/AArch64MacroFusion.cpp
+++ b/lib/Target/AArch64/AArch64MacroFusion.cpp
@@ -20,135 +20,262 @@ using namespace llvm;
namespace {
-/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused
-/// together. Given SecondMI, when FirstMI is unspecified, then check if
-/// SecondMI may be part of a fused pair at all.
-static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
- const TargetSubtargetInfo &TSI,
- const MachineInstr *FirstMI,
- const MachineInstr &SecondMI) {
- const AArch64InstrInfo &II = static_cast<const AArch64InstrInfo&>(TII);
- const AArch64Subtarget &ST = static_cast<const AArch64Subtarget&>(TSI);
+// Fuse CMN, CMP, TST followed by Bcc.
+static bool isArithmeticBccPair(const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ if (SecondMI.getOpcode() == AArch64::Bcc) {
+ // Assume the 1st instr to be a wildcard if it is unspecified.
+ if (!FirstMI)
+ return true;
- // Assume wildcards for unspecified instrs.
+ switch (FirstMI->getOpcode()) {
+ case AArch64::ADDSWri:
+ case AArch64::ADDSWrr:
+ case AArch64::ADDSXri:
+ case AArch64::ADDSXrr:
+ case AArch64::ANDSWri:
+ case AArch64::ANDSWrr:
+ case AArch64::ANDSXri:
+ case AArch64::ANDSXrr:
+ case AArch64::SUBSWri:
+ case AArch64::SUBSWrr:
+ case AArch64::SUBSXri:
+ case AArch64::SUBSXrr:
+ case AArch64::BICSWrr:
+ case AArch64::BICSXrr:
+ return true;
+ case AArch64::ADDSWrs:
+ case AArch64::ADDSXrs:
+ case AArch64::ANDSWrs:
+ case AArch64::ANDSXrs:
+ case AArch64::SUBSWrs:
+ case AArch64::SUBSXrs:
+ case AArch64::BICSWrs:
+ case AArch64::BICSXrs:
+ // Shift value can be 0 making these behave like the "rr" variant...
+ return (!AArch64InstrInfo::hasShiftedReg(*FirstMI));
+ }
+ }
+ return false;
+}
+
+// Fuse ALU operations followed by CBZ/CBNZ.
+static bool isArithmeticCbzPair(const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ unsigned SecondOpcode = SecondMI.getOpcode();
+
+ if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
+ SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) {
+ // Assume the 1st instr to be a wildcard if it is unspecified.
+ if (!FirstMI)
+ return true;
+
+ switch (FirstMI->getOpcode()) {
+ case AArch64::ADDWri:
+ case AArch64::ADDWrr:
+ case AArch64::ADDXri:
+ case AArch64::ADDXrr:
+ case AArch64::ANDWri:
+ case AArch64::ANDWrr:
+ case AArch64::ANDXri:
+ case AArch64::ANDXrr:
+ case AArch64::EORWri:
+ case AArch64::EORWrr:
+ case AArch64::EORXri:
+ case AArch64::EORXrr:
+ case AArch64::ORRWri:
+ case AArch64::ORRWrr:
+ case AArch64::ORRXri:
+ case AArch64::ORRXrr:
+ case AArch64::SUBWri:
+ case AArch64::SUBWrr:
+ case AArch64::SUBXri:
+ case AArch64::SUBXrr:
+ return true;
+ case AArch64::ADDWrs:
+ case AArch64::ADDXrs:
+ case AArch64::ANDWrs:
+ case AArch64::ANDXrs:
+ case AArch64::SUBWrs:
+ case AArch64::SUBXrs:
+ case AArch64::BICWrs:
+ case AArch64::BICXrs:
+ // Shift value can be 0 making these behave like the "rr" variant...
+ return (!AArch64InstrInfo::hasShiftedReg(*FirstMI));
+ }
+ }
+ return false;
+}
+
+// Fuse AES crypto encoding or decoding.
+static bool isAESPair(const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ // Assume the 1st instr to be a wildcard if it is unspecified.
unsigned FirstOpcode =
FirstMI ? FirstMI->getOpcode()
: static_cast<unsigned>(AArch64::INSTRUCTION_LIST_END);
unsigned SecondOpcode = SecondMI.getOpcode();
- if (ST.hasArithmeticBccFusion())
- // Fuse CMN, CMP, TST followed by Bcc.
- if (SecondOpcode == AArch64::Bcc)
- switch (FirstOpcode) {
- default:
- return false;
- case AArch64::ADDSWri:
- case AArch64::ADDSWrr:
- case AArch64::ADDSXri:
- case AArch64::ADDSXrr:
- case AArch64::ANDSWri:
- case AArch64::ANDSWrr:
- case AArch64::ANDSXri:
- case AArch64::ANDSXrr:
- case AArch64::SUBSWri:
- case AArch64::SUBSWrr:
- case AArch64::SUBSXri:
- case AArch64::SUBSXrr:
- case AArch64::BICSWrr:
- case AArch64::BICSXrr:
- return true;
- case AArch64::ADDSWrs:
- case AArch64::ADDSXrs:
- case AArch64::ANDSWrs:
- case AArch64::ANDSXrs:
+ // AES encode.
+ if ((FirstOpcode == AArch64::INSTRUCTION_LIST_END ||
+ FirstOpcode == AArch64::AESErr) &&
+ (SecondOpcode == AArch64::AESMCrr ||
+ SecondOpcode == AArch64::AESMCrrTied))
+ return true;
+ // AES decode.
+ else if ((FirstOpcode == AArch64::INSTRUCTION_LIST_END ||
+ FirstOpcode == AArch64::AESDrr) &&
+ (SecondOpcode == AArch64::AESIMCrr ||
+ SecondOpcode == AArch64::AESIMCrrTied))
+ return true;
+
+ return false;
+}
+
+// Fuse literal generation.
+static bool isLiteralsPair(const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ // Assume the 1st instr to be a wildcard if it is unspecified.
+ unsigned FirstOpcode =
+ FirstMI ? FirstMI->getOpcode()
+ : static_cast<unsigned>(AArch64::INSTRUCTION_LIST_END);
+ unsigned SecondOpcode = SecondMI.getOpcode();
+
+ // PC relative address.
+ if ((FirstOpcode == AArch64::INSTRUCTION_LIST_END ||
+ FirstOpcode == AArch64::ADRP) &&
+ SecondOpcode == AArch64::ADDXri)
+ return true;
+ // 32 bit immediate.
+ else if ((FirstOpcode == AArch64::INSTRUCTION_LIST_END ||
+ FirstOpcode == AArch64::MOVZWi) &&
+ (SecondOpcode == AArch64::MOVKWi &&
+ SecondMI.getOperand(3).getImm() == 16))
+ return true;
+ // Lower half of 64 bit immediate.
+ else if ((FirstOpcode == AArch64::INSTRUCTION_LIST_END ||
+ FirstOpcode == AArch64::MOVZXi) &&
+ (SecondOpcode == AArch64::MOVKXi &&
+ SecondMI.getOperand(3).getImm() == 16))
+ return true;
+ // Upper half of 64 bit immediate.
+ else if ((FirstOpcode == AArch64::INSTRUCTION_LIST_END ||
+ (FirstOpcode == AArch64::MOVKXi &&
+ FirstMI->getOperand(3).getImm() == 32)) &&
+ (SecondOpcode == AArch64::MOVKXi &&
+ SecondMI.getOperand(3).getImm() == 48))
+ return true;
+
+ return false;
+}
+
+// Fuse address generation and loads or stores.
+static bool isAddressLdStPair(const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ unsigned SecondOpcode = SecondMI.getOpcode();
+
+ switch (SecondOpcode) {
+ case AArch64::STRBBui:
+ case AArch64::STRBui:
+ case AArch64::STRDui:
+ case AArch64::STRHHui:
+ case AArch64::STRHui:
+ case AArch64::STRQui:
+ case AArch64::STRSui:
+ case AArch64::STRWui:
+ case AArch64::STRXui:
+ case AArch64::LDRBBui:
+ case AArch64::LDRBui:
+ case AArch64::LDRDui:
+ case AArch64::LDRHHui:
+ case AArch64::LDRHui:
+ case AArch64::LDRQui:
+ case AArch64::LDRSui:
+ case AArch64::LDRWui:
+ case AArch64::LDRXui:
+ case AArch64::LDRSBWui:
+ case AArch64::LDRSBXui:
+ case AArch64::LDRSHWui:
+ case AArch64::LDRSHXui:
+ case AArch64::LDRSWui:
+ // Assume the 1st instr to be a wildcard if it is unspecified.
+ if (!FirstMI)
+ return true;
+
+ switch (FirstMI->getOpcode()) {
+ case AArch64::ADR:
+ return (SecondMI.getOperand(2).getImm() == 0);
+ case AArch64::ADRP:
+ return true;
+ }
+ }
+ return false;
+}
+
+// Fuse compare and conditional select.
+static bool isCCSelectPair(const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ unsigned SecondOpcode = SecondMI.getOpcode();
+
+ // 32 bits
+ if (SecondOpcode == AArch64::CSELWr) {
+ // Assume the 1st instr to be a wildcard if it is unspecified.
+ if (!FirstMI)
+ return true;
+
+ if (FirstMI->definesRegister(AArch64::WZR))
+ switch (FirstMI->getOpcode()) {
case AArch64::SUBSWrs:
- case AArch64::SUBSXrs:
- case AArch64::BICSWrs:
- case AArch64::BICSXrs:
- // Shift value can be 0 making these behave like the "rr" variant...
- return !II.hasShiftedReg(*FirstMI);
- case AArch64::INSTRUCTION_LIST_END:
+ return (!AArch64InstrInfo::hasShiftedReg(*FirstMI));
+ case AArch64::SUBSWrx:
+ return (!AArch64InstrInfo::hasExtendedReg(*FirstMI));
+ case AArch64::SUBSWrr:
+ case AArch64::SUBSWri:
return true;
}
+ }
+ // 64 bits
+ else if (SecondOpcode == AArch64::CSELXr) {
+ // Assume the 1st instr to be a wildcard if it is unspecified.
+ if (!FirstMI)
+ return true;
- if (ST.hasArithmeticCbzFusion())
- // Fuse ALU operations followed by CBZ/CBNZ.
- if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
- SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX)
- switch (FirstOpcode) {
- default:
- return false;
- case AArch64::ADDWri:
- case AArch64::ADDWrr:
- case AArch64::ADDXri:
- case AArch64::ADDXrr:
- case AArch64::ANDWri:
- case AArch64::ANDWrr:
- case AArch64::ANDXri:
- case AArch64::ANDXrr:
- case AArch64::EORWri:
- case AArch64::EORWrr:
- case AArch64::EORXri:
- case AArch64::EORXrr:
- case AArch64::ORRWri:
- case AArch64::ORRWrr:
- case AArch64::ORRXri:
- case AArch64::ORRXrr:
- case AArch64::SUBWri:
- case AArch64::SUBWrr:
- case AArch64::SUBXri:
- case AArch64::SUBXrr:
- return true;
- case AArch64::ADDWrs:
- case AArch64::ADDXrs:
- case AArch64::ANDWrs:
- case AArch64::ANDXrs:
- case AArch64::SUBWrs:
- case AArch64::SUBXrs:
- case AArch64::BICWrs:
- case AArch64::BICXrs:
- // Shift value can be 0 making these behave like the "rr" variant...
- return !II.hasShiftedReg(*FirstMI);
- case AArch64::INSTRUCTION_LIST_END:
+ if (FirstMI->definesRegister(AArch64::XZR))
+ switch (FirstMI->getOpcode()) {
+ case AArch64::SUBSXrs:
+ return (!AArch64InstrInfo::hasShiftedReg(*FirstMI));
+ case AArch64::SUBSXrx:
+ case AArch64::SUBSXrx64:
+ return (!AArch64InstrInfo::hasExtendedReg(*FirstMI));
+ case AArch64::SUBSXrr:
+ case AArch64::SUBSXri:
return true;
}
+ }
+ return false;
+}
- if (ST.hasFuseAES())
- // Fuse AES crypto operations.
- switch(SecondOpcode) {
- // AES encode.
- case AArch64::AESMCrr:
- case AArch64::AESMCrrTied:
- return FirstOpcode == AArch64::AESErr ||
- FirstOpcode == AArch64::INSTRUCTION_LIST_END;
- // AES decode.
- case AArch64::AESIMCrr:
- case AArch64::AESIMCrrTied:
- return FirstOpcode == AArch64::AESDrr ||
- FirstOpcode == AArch64::INSTRUCTION_LIST_END;
- }
+/// Check if the instr pair, FirstMI and SecondMI, should be fused
+/// together. Given SecondMI, when FirstMI is unspecified, then check if
+/// SecondMI may be part of a fused pair at all.
+static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
+ const TargetSubtargetInfo &TSI,
+ const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ const AArch64Subtarget &ST = static_cast<const AArch64Subtarget&>(TSI);
- if (ST.hasFuseLiterals())
- // Fuse literal generation operations.
- switch (SecondOpcode) {
- // PC relative address.
- case AArch64::ADDXri:
- return FirstOpcode == AArch64::ADRP ||
- FirstOpcode == AArch64::INSTRUCTION_LIST_END;
- // 32 bit immediate.
- case AArch64::MOVKWi:
- return (FirstOpcode == AArch64::MOVZWi &&
- SecondMI.getOperand(3).getImm() == 16) ||
- FirstOpcode == AArch64::INSTRUCTION_LIST_END;
- // Lower and upper half of 64 bit immediate.
- case AArch64::MOVKXi:
- return FirstOpcode == AArch64::INSTRUCTION_LIST_END ||
- (FirstOpcode == AArch64::MOVZXi &&
- SecondMI.getOperand(3).getImm() == 16) ||
- (FirstOpcode == AArch64::MOVKXi &&
- FirstMI->getOperand(3).getImm() == 32 &&
- SecondMI.getOperand(3).getImm() == 48);
- }
+ if (ST.hasArithmeticBccFusion() && isArithmeticBccPair(FirstMI, SecondMI))
+ return true;
+ if (ST.hasArithmeticCbzFusion() && isArithmeticCbzPair(FirstMI, SecondMI))
+ return true;
+ if (ST.hasFuseAES() && isAESPair(FirstMI, SecondMI))
+ return true;
+ if (ST.hasFuseLiterals() && isLiteralsPair(FirstMI, SecondMI))
+ return true;
+ if (ST.hasFuseAddress() && isAddressLdStPair(FirstMI, SecondMI))
+ return true;
+ if (ST.hasFuseCCSelect() && isCCSelectPair(FirstMI, SecondMI))
+ return true;
return false;
}
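The refactor above turns each fusion rule into a small predicate gated by one subtarget feature, so teaching the scheduler about a new pair no longer means growing a single switch. A sketch of what adding one more rule would look like under this structure; the feature hook hasFuseFoo() and the MOVZXi/ADDXrr pairing are invented for illustration and are not part of the patch:

    // Hypothetical rule: fuse a 64-bit immediate materialization with the add
    // that consumes it. The shape mirrors the is*Pair() helpers above.
    static bool isFooPair(const MachineInstr *FirstMI,
                          const MachineInstr &SecondMI) {
      if (SecondMI.getOpcode() != AArch64::ADDXrr)
        return false;
      // Assume the 1st instr to be a wildcard if it is unspecified.
      return !FirstMI || FirstMI->getOpcode() == AArch64::MOVZXi;
    }

    // ...plus one more gated check at the end of shouldScheduleAdjacent():
    //   if (ST.hasFuseFoo() && isFooPair(FirstMI, SecondMI))
    //     return true;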
diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
index ee6703aed1e2..ccf646575296 100644
--- a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
+++ b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
@@ -164,10 +164,10 @@ bool A57ChainingConstraint::addIntraChainConstraint(PBQPRAGraph &G, unsigned Rd,
LiveIntervals &LIs = G.getMetadata().LIS;
if (TRI->isPhysicalRegister(Rd) || TRI->isPhysicalRegister(Ra)) {
- DEBUG(dbgs() << "Rd is a physical reg:" << TRI->isPhysicalRegister(Rd)
- << '\n');
- DEBUG(dbgs() << "Ra is a physical reg:" << TRI->isPhysicalRegister(Ra)
- << '\n');
+ LLVM_DEBUG(dbgs() << "Rd is a physical reg:" << TRI->isPhysicalRegister(Rd)
+ << '\n');
+ LLVM_DEBUG(dbgs() << "Ra is a physical reg:" << TRI->isPhysicalRegister(Ra)
+ << '\n');
return false;
}
@@ -247,14 +247,14 @@ void A57ChainingConstraint::addInterChainConstraint(PBQPRAGraph &G, unsigned Rd,
// Do some Chain management
if (Chains.count(Ra)) {
if (Rd != Ra) {
- DEBUG(dbgs() << "Moving acc chain from " << printReg(Ra, TRI) << " to "
- << printReg(Rd, TRI) << '\n';);
+ LLVM_DEBUG(dbgs() << "Moving acc chain from " << printReg(Ra, TRI)
+ << " to " << printReg(Rd, TRI) << '\n';);
Chains.remove(Ra);
Chains.insert(Rd);
}
} else {
- DEBUG(dbgs() << "Creating new acc chain for " << printReg(Rd, TRI)
- << '\n';);
+ LLVM_DEBUG(dbgs() << "Creating new acc chain for " << printReg(Rd, TRI)
+ << '\n';);
Chains.insert(Rd);
}
@@ -279,7 +279,7 @@ void A57ChainingConstraint::addInterChainConstraint(PBQPRAGraph &G, unsigned Rd,
assert(edge != G.invalidEdgeId() &&
"PBQP error ! The edge should exist !");
- DEBUG(dbgs() << "Refining constraint !\n";);
+ LLVM_DEBUG(dbgs() << "Refining constraint !\n";);
if (G.getEdgeNode1Id(edge) == node2) {
std::swap(node1, node2);
@@ -329,7 +329,7 @@ void A57ChainingConstraint::apply(PBQPRAGraph &G) {
LiveIntervals &LIs = G.getMetadata().LIS;
TRI = MF.getSubtarget().getRegisterInfo();
- DEBUG(MF.dump());
+ LLVM_DEBUG(MF.dump());
for (const auto &MBB: MF) {
Chains.clear(); // FIXME: really needed? Couldn't this work at the MF level?
@@ -340,8 +340,8 @@ void A57ChainingConstraint::apply(PBQPRAGraph &G) {
for (auto r : Chains) {
SmallVector<unsigned, 8> toDel;
if(regJustKilledBefore(LIs, r, MI)) {
- DEBUG(dbgs() << "Killing chain " << printReg(r, TRI) << " at ";
- MI.print(dbgs()););
+ LLVM_DEBUG(dbgs() << "Killing chain " << printReg(r, TRI) << " at ";
+ MI.print(dbgs()););
toDel.push_back(r);
}
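The DEBUG to LLVM_DEBUG changes in this file (and in the files below) switch to the namespaced debug macro from llvm/Support/Debug.h; the output is still compiled out of release builds and, in asserts builds, only printed when the pass's DEBUG_TYPE is enabled via -debug or -debug-only=<type>. A minimal sketch of the idiom, where "aarch64-example" is a made-up DEBUG_TYPE rather than a real pass name:

  #include "llvm/Support/Debug.h"
  #include "llvm/Support/raw_ostream.h"

  #define DEBUG_TYPE "aarch64-example"

  static void reportReg(unsigned Reg) {
    // Printed only in +Asserts builds with -debug or -debug-only=aarch64-example.
    LLVM_DEBUG(llvm::dbgs() << "considering register " << Reg << '\n');
  }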
diff --git a/lib/Target/AArch64/AArch64PromoteConstant.cpp b/lib/Target/AArch64/AArch64PromoteConstant.cpp
index a8dc6e74ef6a..01d8a35bbc23 100644
--- a/lib/Target/AArch64/AArch64PromoteConstant.cpp
+++ b/lib/Target/AArch64/AArch64PromoteConstant.cpp
@@ -119,7 +119,7 @@ public:
/// Iterate over the functions and promote the interesting constants into
/// global variables with module scope.
bool runOnModule(Module &M) override {
- DEBUG(dbgs() << getPassName() << '\n');
+ LLVM_DEBUG(dbgs() << getPassName() << '\n');
if (skipModule(M))
return false;
bool Changed = false;
@@ -380,9 +380,9 @@ bool AArch64PromoteConstant::isDominated(Instruction *NewPt, Instruction *User,
(IPI.first->getParent() != NewPt->getParent() &&
DT.dominates(IPI.first->getParent(), NewPt->getParent()))) {
// No need to insert this point. Just record the dominated use.
- DEBUG(dbgs() << "Insertion point dominated by:\n");
- DEBUG(IPI.first->print(dbgs()));
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "Insertion point dominated by:\n");
+ LLVM_DEBUG(IPI.first->print(dbgs()));
+ LLVM_DEBUG(dbgs() << '\n');
IPI.second.emplace_back(User, OpNo);
return true;
}
@@ -408,9 +408,9 @@ bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, Instruction *User,
// Instructions are in the same block.
// By construction, NewPt is dominating the other.
// Indeed, isDominated returned false with the exact same arguments.
- DEBUG(dbgs() << "Merge insertion point with:\n");
- DEBUG(IPI->first->print(dbgs()));
- DEBUG(dbgs() << "\nat considered insertion point.\n");
+ LLVM_DEBUG(dbgs() << "Merge insertion point with:\n");
+ LLVM_DEBUG(IPI->first->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\nat considered insertion point.\n");
appendAndTransferDominatedUses(NewPt, User, OpNo, IPI, InsertPts);
return true;
}
@@ -430,11 +430,11 @@ bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, Instruction *User,
}
// else, CommonDominator is the block of NewBB, hence NewBB is the last
// possible insertion point in that block.
- DEBUG(dbgs() << "Merge insertion point with:\n");
- DEBUG(IPI->first->print(dbgs()));
- DEBUG(dbgs() << '\n');
- DEBUG(NewPt->print(dbgs()));
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "Merge insertion point with:\n");
+ LLVM_DEBUG(IPI->first->print(dbgs()));
+ LLVM_DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(NewPt->print(dbgs()));
+ LLVM_DEBUG(dbgs() << '\n');
appendAndTransferDominatedUses(NewPt, User, OpNo, IPI, InsertPts);
return true;
}
@@ -443,15 +443,15 @@ bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, Instruction *User,
void AArch64PromoteConstant::computeInsertionPoint(
Instruction *User, unsigned OpNo, InsertionPoints &InsertPts) {
- DEBUG(dbgs() << "Considered use, opidx " << OpNo << ":\n");
- DEBUG(User->print(dbgs()));
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "Considered use, opidx " << OpNo << ":\n");
+ LLVM_DEBUG(User->print(dbgs()));
+ LLVM_DEBUG(dbgs() << '\n');
Instruction *InsertionPoint = findInsertionPoint(*User, OpNo);
- DEBUG(dbgs() << "Considered insertion point:\n");
- DEBUG(InsertionPoint->print(dbgs()));
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "Considered insertion point:\n");
+ LLVM_DEBUG(InsertionPoint->print(dbgs()));
+ LLVM_DEBUG(dbgs() << '\n');
if (isDominated(InsertionPoint, User, OpNo, InsertPts))
return;
@@ -460,7 +460,7 @@ void AArch64PromoteConstant::computeInsertionPoint(
if (tryAndMerge(InsertionPoint, User, OpNo, InsertPts))
return;
- DEBUG(dbgs() << "Keep considered insertion point\n");
+ LLVM_DEBUG(dbgs() << "Keep considered insertion point\n");
// It is definitely useful on its own
InsertPts[InsertionPoint].emplace_back(User, OpNo);
@@ -476,9 +476,9 @@ static void ensurePromotedGV(Function &F, Constant &C,
*F.getParent(), C.getType(), true, GlobalValue::InternalLinkage, nullptr,
"_PromotedConst", nullptr, GlobalVariable::NotThreadLocal);
PC.GV->setInitializer(&C);
- DEBUG(dbgs() << "Global replacement: ");
- DEBUG(PC.GV->print(dbgs()));
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "Global replacement: ");
+ LLVM_DEBUG(PC.GV->print(dbgs()));
+ LLVM_DEBUG(dbgs() << '\n');
++NumPromoted;
}
@@ -495,10 +495,10 @@ void AArch64PromoteConstant::insertDefinitions(Function &F,
// Create the load of the global variable.
IRBuilder<> Builder(IPI.first);
LoadInst *LoadedCst = Builder.CreateLoad(&PromotedGV);
- DEBUG(dbgs() << "**********\n");
- DEBUG(dbgs() << "New def: ");
- DEBUG(LoadedCst->print(dbgs()));
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "**********\n");
+ LLVM_DEBUG(dbgs() << "New def: ");
+ LLVM_DEBUG(LoadedCst->print(dbgs()));
+ LLVM_DEBUG(dbgs() << '\n');
// Update the dominated uses.
for (auto Use : IPI.second) {
@@ -507,11 +507,11 @@ void AArch64PromoteConstant::insertDefinitions(Function &F,
findInsertionPoint(*Use.first, Use.second)) &&
"Inserted definition does not dominate all its uses!");
#endif
- DEBUG({
- dbgs() << "Use to update " << Use.second << ":";
- Use.first->print(dbgs());
- dbgs() << '\n';
- });
+ LLVM_DEBUG({
+ dbgs() << "Use to update " << Use.second << ":";
+ Use.first->print(dbgs());
+ dbgs() << '\n';
+ });
Use.first->setOperand(Use.second, LoadedCst);
++NumPromotedUses;
}
@@ -523,7 +523,7 @@ void AArch64PromoteConstant::promoteConstants(
PromotionCacheTy &PromotionCache) {
// Promote the constants.
for (auto U = Updates.begin(), E = Updates.end(); U != E;) {
- DEBUG(dbgs() << "** Compute insertion points **\n");
+ LLVM_DEBUG(dbgs() << "** Compute insertion points **\n");
auto First = U;
Constant *C = First->C;
InsertionPoints InsertPts;
diff --git a/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp b/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
index e5822b114324..fcb0b36a9f6d 100644
--- a/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
+++ b/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
@@ -55,6 +55,7 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
@@ -72,10 +73,10 @@ class AArch64RedundantCopyElimination : public MachineFunctionPass {
// DomBBClobberedRegs is used when computing known values in the dominating
// BB.
- BitVector DomBBClobberedRegs;
+ LiveRegUnits DomBBClobberedRegs, DomBBUsedRegs;
// OptBBClobberedRegs is used when optimizing away redundant copies/moves.
- BitVector OptBBClobberedRegs;
+ LiveRegUnits OptBBClobberedRegs, OptBBUsedRegs;
public:
static char ID;
@@ -109,28 +110,6 @@ char AArch64RedundantCopyElimination::ID = 0;
INITIALIZE_PASS(AArch64RedundantCopyElimination, "aarch64-copyelim",
"AArch64 redundant copy elimination pass", false, false)
-/// Remember what registers the specified instruction modifies.
-static void trackRegDefs(const MachineInstr &MI, BitVector &ClobberedRegs,
- const TargetRegisterInfo *TRI) {
- for (const MachineOperand &MO : MI.operands()) {
- if (MO.isRegMask()) {
- ClobberedRegs.setBitsNotInMask(MO.getRegMask());
- continue;
- }
-
- if (!MO.isReg())
- continue;
- unsigned Reg = MO.getReg();
- if (!Reg)
- continue;
- if (!MO.isDef())
- continue;
-
- for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
- ClobberedRegs.set(*AI);
- }
-}
-
/// It's possible to determine the value of a register based on a dominating
/// condition. To do so, this function checks to see if the basic block \p MBB
/// is the target of a conditional branch \p CondBr with an equality comparison.
@@ -182,7 +161,8 @@ bool AArch64RedundantCopyElimination::knownRegValInBlock(
// Registers clobbered in PredMBB between CondBr instruction and current
// instruction being checked in loop.
- DomBBClobberedRegs.reset();
+ DomBBClobberedRegs.clear();
+ DomBBUsedRegs.clear();
// Find compare instruction that sets NZCV used by CondBr.
MachineBasicBlock::reverse_iterator RIt = CondBr.getReverseIterator();
@@ -212,7 +192,7 @@ bool AArch64RedundantCopyElimination::knownRegValInBlock(
// register of the compare is not modified (including a self-clobbering
// compare) between the compare and conditional branch we know the value
// of the 1st source operand.
- if (PredI.getOperand(2).isImm() && !DomBBClobberedRegs[SrcReg] &&
+ if (PredI.getOperand(2).isImm() && DomBBClobberedRegs.available(SrcReg) &&
SrcReg != DstReg) {
// We've found the instruction that sets NZCV.
int32_t KnownImm = PredI.getOperand(2).getImm();
@@ -232,7 +212,7 @@ bool AArch64RedundantCopyElimination::knownRegValInBlock(
// The destination register must not be modified between the NZCV setting
// instruction and the conditional branch.
- if (DomBBClobberedRegs[DstReg])
+ if (!DomBBClobberedRegs.available(DstReg))
return Res;
FirstUse = PredI;
@@ -276,7 +256,7 @@ bool AArch64RedundantCopyElimination::knownRegValInBlock(
// The destination register of the NZCV setting instruction must not be
// modified before the conditional branch.
- if (DomBBClobberedRegs[DstReg])
+ if (!DomBBClobberedRegs.available(DstReg))
return false;
// We've found the instruction that sets NZCV whose DstReg == 0.
@@ -290,8 +270,9 @@ bool AArch64RedundantCopyElimination::knownRegValInBlock(
if (PredI.definesRegister(AArch64::NZCV))
return false;
- // Track clobbered registers.
- trackRegDefs(PredI, DomBBClobberedRegs, TRI);
+ // Track clobbered and used registers.
+ LiveRegUnits::accumulateUsedDefed(PredI, DomBBClobberedRegs, DomBBUsedRegs,
+ TRI);
}
return false;
}
@@ -330,8 +311,9 @@ bool AArch64RedundantCopyElimination::optimizeBlock(MachineBasicBlock *MBB) {
if (!knownRegValInBlock(*Itr, MBB, KnownRegs, FirstUse))
continue;
- // Reset the clobber list.
- OptBBClobberedRegs.reset();
+ // Reset the clobbered and used register units.
+ OptBBClobberedRegs.clear();
+ OptBBUsedRegs.clear();
// Look backward in PredMBB for COPYs from the known reg to find other
// registers that are known to be a constant value.
@@ -343,11 +325,12 @@ bool AArch64RedundantCopyElimination::optimizeBlock(MachineBasicBlock *MBB) {
MCPhysReg CopyDstReg = PredI->getOperand(0).getReg();
MCPhysReg CopySrcReg = PredI->getOperand(1).getReg();
for (auto &KnownReg : KnownRegs) {
- if (OptBBClobberedRegs[KnownReg.Reg])
+ if (!OptBBClobberedRegs.available(KnownReg.Reg))
continue;
// If we have X = COPY Y, and Y is known to be zero, then now X is
// known to be zero.
- if (CopySrcReg == KnownReg.Reg && !OptBBClobberedRegs[CopyDstReg]) {
+ if (CopySrcReg == KnownReg.Reg &&
+ OptBBClobberedRegs.available(CopyDstReg)) {
KnownRegs.push_back(RegImm(CopyDstReg, KnownReg.Imm));
if (SeenFirstUse)
FirstUse = PredI;
@@ -355,7 +338,8 @@ bool AArch64RedundantCopyElimination::optimizeBlock(MachineBasicBlock *MBB) {
}
// If we have X = COPY Y, and X is known to be zero, then now Y is
// known to be zero.
- if (CopyDstReg == KnownReg.Reg && !OptBBClobberedRegs[CopySrcReg]) {
+ if (CopyDstReg == KnownReg.Reg &&
+ OptBBClobberedRegs.available(CopySrcReg)) {
KnownRegs.push_back(RegImm(CopySrcReg, KnownReg.Imm));
if (SeenFirstUse)
FirstUse = PredI;
@@ -368,10 +352,11 @@ bool AArch64RedundantCopyElimination::optimizeBlock(MachineBasicBlock *MBB) {
if (PredI == PredMBB->begin())
break;
- trackRegDefs(*PredI, OptBBClobberedRegs, TRI);
+ LiveRegUnits::accumulateUsedDefed(*PredI, OptBBClobberedRegs,
+ OptBBUsedRegs, TRI);
// Stop if all of the known-zero regs have been clobbered.
if (all_of(KnownRegs, [&](RegImm KnownReg) {
- return OptBBClobberedRegs[KnownReg.Reg];
+ return !OptBBClobberedRegs.available(KnownReg.Reg);
}))
break;
}
@@ -427,9 +412,9 @@ bool AArch64RedundantCopyElimination::optimizeBlock(MachineBasicBlock *MBB) {
}
if (IsCopy)
- DEBUG(dbgs() << "Remove redundant Copy : " << *MI);
+ LLVM_DEBUG(dbgs() << "Remove redundant Copy : " << *MI);
else
- DEBUG(dbgs() << "Remove redundant Move : " << *MI);
+ LLVM_DEBUG(dbgs() << "Remove redundant Move : " << *MI);
MI->eraseFromParent();
Changed = true;
@@ -473,8 +458,8 @@ bool AArch64RedundantCopyElimination::optimizeBlock(MachineBasicBlock *MBB) {
// Clear kills in the range where changes were made. This is conservative,
// but should be okay since kill markers are being phased out.
- DEBUG(dbgs() << "Clearing kill flags.\n\tFirstUse: " << *FirstUse
- << "\tLastChange: " << *LastChange);
+ LLVM_DEBUG(dbgs() << "Clearing kill flags.\n\tFirstUse: " << *FirstUse
+ << "\tLastChange: " << *LastChange);
for (MachineInstr &MMI : make_range(FirstUse, PredMBB->end()))
MMI.clearKillInfo();
for (MachineInstr &MMI : make_range(MBB->begin(), LastChange))
@@ -490,10 +475,12 @@ bool AArch64RedundantCopyElimination::runOnMachineFunction(
TRI = MF.getSubtarget().getRegisterInfo();
MRI = &MF.getRegInfo();
- // Resize the clobber register bitfield trackers. We do this once per
+ // Resize the clobbered and used register unit trackers. We do this once per
// function.
- DomBBClobberedRegs.resize(TRI->getNumRegs());
- OptBBClobberedRegs.resize(TRI->getNumRegs());
+ DomBBClobberedRegs.init(*TRI);
+ DomBBUsedRegs.init(*TRI);
+ OptBBClobberedRegs.init(*TRI);
+ OptBBUsedRegs.init(*TRI);
bool Changed = false;
for (MachineBasicBlock &MBB : MF)
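Replacing the BitVector trackers and the hand-rolled trackRegDefs() with llvm::LiveRegUnits moves clobber tracking to register units, so sub- and super-register aliases are handled without the explicit MCRegAliasIterator loop. A rough sketch of the pattern the pass now follows, where MBB, TRI and SomeReg stand in for state owned by the surrounding pass:

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/CodeGen/LiveRegUnits.h"

  LiveRegUnits Clobbered, Used;
  Clobbered.init(*TRI);            // sized once per function
  Used.init(*TRI);

  Clobbered.clear();               // reset for each backward walk
  Used.clear();
  for (MachineInstr &MI : llvm::reverse(MBB)) {
    if (!Clobbered.available(SomeReg))
      break;                       // SomeReg (or an aliasing unit) was clobbered
    LiveRegUnits::accumulateUsedDefed(MI, Clobbered, Used, TRI);
  }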
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 88dd297e0079..a7c2c1b8125b 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -72,24 +72,41 @@ const MCPhysReg *AArch64RegisterInfo::getCalleeSavedRegsViaCopy(
return nullptr;
}
+const TargetRegisterClass *
+AArch64RegisterInfo::getSubClassWithSubReg(const TargetRegisterClass *RC,
+ unsigned Idx) const {
+ // edge case for GPR/FPR register classes
+ if (RC == &AArch64::GPR32allRegClass && Idx == AArch64::hsub)
+ return &AArch64::FPR32RegClass;
+ else if (RC == &AArch64::GPR64allRegClass && Idx == AArch64::hsub)
+ return &AArch64::FPR64RegClass;
+
+ // Forward to TableGen's default version.
+ return AArch64GenRegisterInfo::getSubClassWithSubReg(RC, Idx);
+}
+
const uint32_t *
AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
+ bool SCS = MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack);
if (CC == CallingConv::GHC)
// This is academic because all GHC calls are (supposed to be) tail calls
- return CSR_AArch64_NoRegs_RegMask;
+ return SCS ? CSR_AArch64_NoRegs_SCS_RegMask : CSR_AArch64_NoRegs_RegMask;
if (CC == CallingConv::AnyReg)
- return CSR_AArch64_AllRegs_RegMask;
+ return SCS ? CSR_AArch64_AllRegs_SCS_RegMask : CSR_AArch64_AllRegs_RegMask;
if (CC == CallingConv::CXX_FAST_TLS)
- return CSR_AArch64_CXX_TLS_Darwin_RegMask;
+ return SCS ? CSR_AArch64_CXX_TLS_Darwin_SCS_RegMask
+ : CSR_AArch64_CXX_TLS_Darwin_RegMask;
if (MF.getSubtarget<AArch64Subtarget>().getTargetLowering()
->supportSwiftError() &&
MF.getFunction().getAttributes().hasAttrSomewhere(Attribute::SwiftError))
- return CSR_AArch64_AAPCS_SwiftError_RegMask;
+ return SCS ? CSR_AArch64_AAPCS_SwiftError_SCS_RegMask
+ : CSR_AArch64_AAPCS_SwiftError_RegMask;
if (CC == CallingConv::PreserveMost)
- return CSR_AArch64_RT_MostRegs_RegMask;
+ return SCS ? CSR_AArch64_RT_MostRegs_SCS_RegMask
+ : CSR_AArch64_RT_MostRegs_RegMask;
else
- return CSR_AArch64_AAPCS_RegMask;
+ return SCS ? CSR_AArch64_AAPCS_SCS_RegMask : CSR_AArch64_AAPCS_RegMask;
}
const uint32_t *AArch64RegisterInfo::getTLSCallPreservedMask() const {
@@ -114,6 +131,10 @@ AArch64RegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF,
return CSR_AArch64_AAPCS_ThisReturn_RegMask;
}
+const uint32_t *AArch64RegisterInfo::getWindowsStackProbePreservedMask() const {
+ return CSR_AArch64_StackProbe_Windows_RegMask;
+}
+
BitVector
AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
const AArch64FrameLowering *TFI = getFrameLowering(MF);
@@ -129,6 +150,9 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
if (MF.getSubtarget<AArch64Subtarget>().isX18Reserved())
markSuperRegs(Reserved, AArch64::W18); // Platform register
+ if (MF.getSubtarget<AArch64Subtarget>().isX20Reserved())
+ markSuperRegs(Reserved, AArch64::W20); // Platform register
+
if (hasBasePointer(MF))
markSuperRegs(Reserved, AArch64::W19);
@@ -151,12 +175,15 @@ bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF,
case AArch64::X18:
case AArch64::W18:
return MF.getSubtarget<AArch64Subtarget>().isX18Reserved();
+ case AArch64::X19:
+ case AArch64::W19:
+ return hasBasePointer(MF);
+ case AArch64::X20:
+ case AArch64::W20:
+ return MF.getSubtarget<AArch64Subtarget>().isX20Reserved();
case AArch64::FP:
case AArch64::W29:
return TFI->hasFP(MF) || TT.isOSDarwin();
- case AArch64::W19:
- case AArch64::X19:
- return hasBasePointer(MF);
}
return false;
@@ -225,11 +252,13 @@ bool AArch64RegisterInfo::requiresVirtualBaseRegisters(
bool
AArch64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const {
- const MachineFrameInfo &MFI = MF.getFrameInfo();
- // AArch64FrameLowering::resolveFrameIndexReference() can always fall back
- // to the stack pointer, so only put the emergency spill slot next to the
- // FP when there's no better way to access it (SP or base pointer).
- return MFI.hasVarSizedObjects() && !hasBasePointer(MF);
+ // This function indicates whether the emergency spill slot should be placed
+ // close to the beginning of the stack frame (closer to FP) or the end
+ // (closer to SP).
+ //
+ // The beginning works most reliably if we have a frame pointer.
+ const AArch64FrameLowering &TFI = *getFrameLowering(MF);
+ return TFI.hasFP(MF);
}
bool AArch64RegisterInfo::requiresFrameIndexScavenging(
@@ -422,6 +451,8 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
- (TFI->hasFP(MF) || TT.isOSDarwin()) // FP
- MF.getSubtarget<AArch64Subtarget>()
.isX18Reserved() // X18 reserved as platform register
+ - MF.getSubtarget<AArch64Subtarget>()
+ .isX20Reserved() // X20 reserved as platform register
- hasBasePointer(MF); // X19
case AArch64::FPR8RegClassID:
case AArch64::FPR16RegClassID:
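Two notable pieces in the AArch64RegisterInfo.cpp changes above: getCallPreservedMask() now selects an *_SCS_* variant of each regmask when the calling function carries the ShadowCallStack attribute (those variants, defined in the calling-convention TableGen, additionally mark x18, which holds the shadow call stack pointer, as preserved), and x20 joins x18 as an optionally reserved platform register. A minimal sketch of the attribute-driven selection, where F is assumed to be the llvm::Function being compiled:

  #include "llvm/IR/Attributes.h"
  #include "llvm/IR/Function.h"

  // Pick the shadow-call-stack flavour of the default callee-saved regmask.
  bool UseSCS = F.hasFnAttribute(llvm::Attribute::ShadowCallStack);
  const uint32_t *Mask =
      UseSCS ? CSR_AArch64_AAPCS_SCS_RegMask : CSR_AArch64_AAPCS_RegMask;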
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.h b/lib/Target/AArch64/AArch64RegisterInfo.h
index 8ce893516fe2..57000d37090d 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -46,6 +46,10 @@ public:
return 5;
}
+ const TargetRegisterClass *
+ getSubClassWithSubReg(const TargetRegisterClass *RC,
+ unsigned Idx) const override;
+
// Calls involved in thread-local variable lookup save more registers than
// normal calls, so they need a different mask to represent this.
const uint32_t *getTLSCallPreservedMask() const;
@@ -61,6 +65,9 @@ public:
const uint32_t *getThisReturnPreservedMask(const MachineFunction &MF,
CallingConv::ID) const;
+ /// Stack probing calls preserve different CSRs to the normal CC.
+ const uint32_t *getWindowsStackProbePreservedMask() const;
+
BitVector getReservedRegs(const MachineFunction &MF) const override;
bool isConstantPhysReg(unsigned PhysReg) const override;
const TargetRegisterClass *
@@ -69,6 +76,8 @@ public:
const TargetRegisterClass *
getCrossCopyRegClass(const TargetRegisterClass *RC) const override;
+ bool enableMultipleCopyHints() const override { return true; }
+
bool requiresRegisterScavenging(const MachineFunction &MF) const override;
bool useFPForScavengingIndex(const MachineFunction &MF) const override;
bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td
index 39e3e33b0d27..7a653e117fd1 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -131,6 +131,9 @@ def XZR : AArch64Reg<31, "xzr", [WZR]>, DwarfRegAlias<WSP>;
// Condition code register.
def NZCV : AArch64Reg<0, "nzcv">;
+// First fault status register
+def FFR : AArch64Reg<0, "ffr">, DwarfRegNum<[47]>;
+
// GPR register classes with the intersections of GPR32/GPR32sp and
// GPR64/GPR64sp for use by the coalescer.
def GPR32common : RegisterClass<"AArch64", [i32], 32, (sequence "W%u", 0, 30)> {
@@ -168,6 +171,7 @@ def GPR64sponly : RegisterClass<"AArch64", [i64], 64, (add SP)>;
def GPR64spPlus0Operand : AsmOperandClass {
let Name = "GPR64sp0";
let RenderMethod = "addRegOperands";
+ let PredicateMethod = "isGPR64<AArch64::GPR64spRegClassID>";
let ParserMethod = "tryParseGPR64sp0Operand";
}
@@ -489,25 +493,25 @@ def V128_lo : RegisterOperand<FPR128_lo, "printVRegOperand"> {
let ParserMatchClass = VectorRegLoAsmOperand;
}
-class TypedVecListAsmOperand<int count, int regsize, int lanes, string kind>
+class TypedVecListAsmOperand<int count, string vecty, int lanes, int eltsize>
: AsmOperandClass {
- let Name = "TypedVectorList" # count # "_" # lanes # kind;
+ let Name = "TypedVectorList" # count # "_" # lanes # eltsize;
let PredicateMethod
- = "isTypedVectorList<" # count # ", " # lanes # ", '" # kind # "'>";
- let RenderMethod = "addVectorList" # regsize # "Operands<" # count # ">";
+ = "isTypedVectorList<RegKind::NeonVector, " # count # ", " # lanes # ", " # eltsize # ">";
+ let RenderMethod = "addVectorListOperands<" # vecty # ", " # count # ">";
}
-class TypedVecListRegOperand<RegisterClass Reg, int lanes, string kind>
+class TypedVecListRegOperand<RegisterClass Reg, int lanes, string eltsize>
: RegisterOperand<Reg, "printTypedVectorList<" # lanes # ", '"
- # kind # "'>">;
+ # eltsize # "'>">;
multiclass VectorList<int count, RegisterClass Reg64, RegisterClass Reg128> {
// With implicit types (probably on instruction instead). E.g. { v0, v1 }
def _64AsmOperand : AsmOperandClass {
let Name = NAME # "64";
- let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">";
- let RenderMethod = "addVectorList64Operands<" # count # ">";
+ let PredicateMethod = "isImplicitlyTypedVectorList<RegKind::NeonVector, " # count # ">";
+ let RenderMethod = "addVectorListOperands<AArch64Operand::VecListIdx_DReg, " # count # ">";
}
def "64" : RegisterOperand<Reg64, "printImplicitlyTypedVectorList"> {
@@ -516,8 +520,8 @@ multiclass VectorList<int count, RegisterClass Reg64, RegisterClass Reg128> {
def _128AsmOperand : AsmOperandClass {
let Name = NAME # "128";
- let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">";
- let RenderMethod = "addVectorList128Operands<" # count # ">";
+ let PredicateMethod = "isImplicitlyTypedVectorList<RegKind::NeonVector, " # count # ">";
+ let RenderMethod = "addVectorListOperands<AArch64Operand::VecListIdx_QReg, " # count # ">";
}
def "128" : RegisterOperand<Reg128, "printImplicitlyTypedVectorList"> {
@@ -527,25 +531,25 @@ multiclass VectorList<int count, RegisterClass Reg64, RegisterClass Reg128> {
// 64-bit register lists with explicit type.
// { v0.8b, v1.8b }
- def _8bAsmOperand : TypedVecListAsmOperand<count, 64, 8, "b">;
+ def _8bAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_DReg", 8, 8>;
def "8b" : TypedVecListRegOperand<Reg64, 8, "b"> {
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_8bAsmOperand");
}
// { v0.4h, v1.4h }
- def _4hAsmOperand : TypedVecListAsmOperand<count, 64, 4, "h">;
+ def _4hAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_DReg", 4, 16>;
def "4h" : TypedVecListRegOperand<Reg64, 4, "h"> {
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_4hAsmOperand");
}
// { v0.2s, v1.2s }
- def _2sAsmOperand : TypedVecListAsmOperand<count, 64, 2, "s">;
+ def _2sAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_DReg", 2, 32>;
def "2s" : TypedVecListRegOperand<Reg64, 2, "s"> {
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_2sAsmOperand");
}
// { v0.1d, v1.1d }
- def _1dAsmOperand : TypedVecListAsmOperand<count, 64, 1, "d">;
+ def _1dAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_DReg", 1, 64>;
def "1d" : TypedVecListRegOperand<Reg64, 1, "d"> {
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_1dAsmOperand");
}
@@ -553,49 +557,49 @@ multiclass VectorList<int count, RegisterClass Reg64, RegisterClass Reg128> {
// 128-bit register lists with explicit type
// { v0.16b, v1.16b }
- def _16bAsmOperand : TypedVecListAsmOperand<count, 128, 16, "b">;
+ def _16bAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_QReg", 16, 8>;
def "16b" : TypedVecListRegOperand<Reg128, 16, "b"> {
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_16bAsmOperand");
}
// { v0.8h, v1.8h }
- def _8hAsmOperand : TypedVecListAsmOperand<count, 128, 8, "h">;
+ def _8hAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_QReg", 8, 16>;
def "8h" : TypedVecListRegOperand<Reg128, 8, "h"> {
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_8hAsmOperand");
}
// { v0.4s, v1.4s }
- def _4sAsmOperand : TypedVecListAsmOperand<count, 128, 4, "s">;
+ def _4sAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_QReg", 4, 32>;
def "4s" : TypedVecListRegOperand<Reg128, 4, "s"> {
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_4sAsmOperand");
}
// { v0.2d, v1.2d }
- def _2dAsmOperand : TypedVecListAsmOperand<count, 128, 2, "d">;
+ def _2dAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_QReg", 2, 64>;
def "2d" : TypedVecListRegOperand<Reg128, 2, "d"> {
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_2dAsmOperand");
}
// { v0.b, v1.b }
- def _bAsmOperand : TypedVecListAsmOperand<count, 128, 0, "b">;
+ def _bAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_QReg", 0, 8>;
def "b" : TypedVecListRegOperand<Reg128, 0, "b"> {
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_bAsmOperand");
}
// { v0.h, v1.h }
- def _hAsmOperand : TypedVecListAsmOperand<count, 128, 0, "h">;
+ def _hAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_QReg", 0, 16>;
def "h" : TypedVecListRegOperand<Reg128, 0, "h"> {
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_hAsmOperand");
}
// { v0.s, v1.s }
- def _sAsmOperand : TypedVecListAsmOperand<count, 128, 0, "s">;
+ def _sAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_QReg", 0, 32>;
def "s" : TypedVecListRegOperand<Reg128, 0, "s"> {
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_sAsmOperand");
}
// { v0.d, v1.d }
- def _dAsmOperand : TypedVecListAsmOperand<count, 128, 0, "d">;
+ def _dAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_QReg", 0, 64>;
def "d" : TypedVecListRegOperand<Reg128, 0, "d"> {
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_dAsmOperand");
}
@@ -608,13 +612,32 @@ defm VecListTwo : VectorList<2, DD, QQ>;
defm VecListThree : VectorList<3, DDD, QQQ>;
defm VecListFour : VectorList<4, DDDD, QQQQ>;
+class FPRAsmOperand<string RC> : AsmOperandClass {
+ let Name = "FPRAsmOperand" # RC;
+ let PredicateMethod = "isGPR64<AArch64::" # RC # "RegClassID>";
+ let RenderMethod = "addRegOperands";
+}
// Register operand versions of the scalar FP registers.
-def FPR16Op : RegisterOperand<FPR16, "printOperand">;
-def FPR32Op : RegisterOperand<FPR32, "printOperand">;
-def FPR64Op : RegisterOperand<FPR64, "printOperand">;
-def FPR128Op : RegisterOperand<FPR128, "printOperand">;
+def FPR8Op : RegisterOperand<FPR8, "printOperand"> {
+ let ParserMatchClass = FPRAsmOperand<"FPR8">;
+}
+
+def FPR16Op : RegisterOperand<FPR16, "printOperand"> {
+ let ParserMatchClass = FPRAsmOperand<"FPR16">;
+}
+def FPR32Op : RegisterOperand<FPR32, "printOperand"> {
+ let ParserMatchClass = FPRAsmOperand<"FPR32">;
+}
+
+def FPR64Op : RegisterOperand<FPR64, "printOperand"> {
+ let ParserMatchClass = FPRAsmOperand<"FPR64">;
+}
+
+def FPR128Op : RegisterOperand<FPR128, "printOperand"> {
+ let ParserMatchClass = FPRAsmOperand<"FPR128">;
+}
//===----------------------------------------------------------------------===//
// ARMv8.1a atomic CASP register operands
@@ -756,27 +779,31 @@ class ZPRRegOp <string Suffix, AsmOperandClass C,
//******************************************************************************
-// SVE predicate register class.
-def PPR : RegisterClass<"AArch64",
- [nxv16i1, nxv8i1, nxv4i1, nxv2i1],
- 16, (sequence "P%u", 0, 15)> {
+// SVE predicate register classes.
+class PPRClass<int lastreg> : RegisterClass<
+ "AArch64",
+ [ nxv16i1, nxv8i1, nxv4i1, nxv2i1 ], 16,
+ (sequence "P%u", 0, lastreg)> {
let Size = 16;
}
-class PPRAsmOperand <string name, int Width>: AsmOperandClass {
+def PPR : PPRClass<15>;
+def PPR_3b : PPRClass<7>; // Restricted 3 bit SVE predicate register class.
+
+class PPRAsmOperand <string name, string RegClass, int Width>: AsmOperandClass {
let Name = "SVE" # name # "Reg";
- let PredicateMethod = "isSVEVectorRegOfWidth<"
- # Width # ", AArch64::PPRRegClassID>";
+ let PredicateMethod = "isSVEPredicateVectorRegOfWidth<"
+ # Width # ", " # "AArch64::" # RegClass # "RegClassID>";
let DiagnosticType = "InvalidSVE" # name # "Reg";
let RenderMethod = "addRegOperands";
let ParserMethod = "tryParseSVEPredicateVector";
}
-def PPRAsmOpAny : PPRAsmOperand<"PredicateAny", -1>;
-def PPRAsmOp8 : PPRAsmOperand<"PredicateB", 8>;
-def PPRAsmOp16 : PPRAsmOperand<"PredicateH", 16>;
-def PPRAsmOp32 : PPRAsmOperand<"PredicateS", 32>;
-def PPRAsmOp64 : PPRAsmOperand<"PredicateD", 64>;
+def PPRAsmOpAny : PPRAsmOperand<"PredicateAny", "PPR", 0>;
+def PPRAsmOp8 : PPRAsmOperand<"PredicateB", "PPR", 8>;
+def PPRAsmOp16 : PPRAsmOperand<"PredicateH", "PPR", 16>;
+def PPRAsmOp32 : PPRAsmOperand<"PredicateS", "PPR", 32>;
+def PPRAsmOp64 : PPRAsmOperand<"PredicateD", "PPR", 64>;
def PPRAny : PPRRegOp<"", PPRAsmOpAny, PPR>;
def PPR8 : PPRRegOp<"b", PPRAsmOp8, PPR>;
@@ -784,6 +811,18 @@ def PPR16 : PPRRegOp<"h", PPRAsmOp16, PPR>;
def PPR32 : PPRRegOp<"s", PPRAsmOp32, PPR>;
def PPR64 : PPRRegOp<"d", PPRAsmOp64, PPR>;
+def PPRAsmOp3bAny : PPRAsmOperand<"Predicate3bAny", "PPR_3b", 0>;
+def PPRAsmOp3b8 : PPRAsmOperand<"Predicate3bB", "PPR_3b", 8>;
+def PPRAsmOp3b16 : PPRAsmOperand<"Predicate3bH", "PPR_3b", 16>;
+def PPRAsmOp3b32 : PPRAsmOperand<"Predicate3bS", "PPR_3b", 32>;
+def PPRAsmOp3b64 : PPRAsmOperand<"Predicate3bD", "PPR_3b", 64>;
+
+def PPR3bAny : PPRRegOp<"", PPRAsmOp3bAny, PPR_3b>;
+def PPR3b8 : PPRRegOp<"b", PPRAsmOp3b8, PPR_3b>;
+def PPR3b16 : PPRRegOp<"h", PPRAsmOp3b16, PPR_3b>;
+def PPR3b32 : PPRRegOp<"s", PPRAsmOp3b32, PPR_3b>;
+def PPR3b64 : PPRRegOp<"d", PPRAsmOp3b64, PPR_3b>;
+
//******************************************************************************
// SVE vector register class
@@ -796,16 +835,39 @@ def ZPR : RegisterClass<"AArch64",
let Size = 128;
}
-class ZPRAsmOperand <string name, int Width>: AsmOperandClass {
+// SVE restricted 4 bit scalable vector register class
+def ZPR_4b : RegisterClass<"AArch64",
+ [nxv16i8, nxv8i16, nxv4i32, nxv2i64,
+ nxv2f16, nxv4f16, nxv8f16,
+ nxv1f32, nxv2f32, nxv4f32,
+ nxv1f64, nxv2f64],
+ 128, (sequence "Z%u", 0, 15)> {
+ let Size = 128;
+}
+
+// SVE restricted 3 bit scalable vector register class
+def ZPR_3b : RegisterClass<"AArch64",
+ [nxv16i8, nxv8i16, nxv4i32, nxv2i64,
+ nxv2f16, nxv4f16, nxv8f16,
+ nxv1f32, nxv2f32, nxv4f32,
+ nxv1f64, nxv2f64],
+ 128, (sequence "Z%u", 0, 7)> {
+ let Size = 128;
+}
+
+class ZPRAsmOperand<string name, int Width, string RegClassSuffix = "">
+ : AsmOperandClass {
let Name = "SVE" # name # "Reg";
- let PredicateMethod = "isSVEVectorRegOfWidth<"
- # Width # ", AArch64::ZPRRegClassID>";
+ let PredicateMethod = "isSVEDataVectorRegOfWidth<"
+ # Width # ", AArch64::ZPR"
+ # RegClassSuffix # "RegClassID>";
let RenderMethod = "addRegOperands";
- let ParserMethod = "tryParseSVEDataVector<"
- # !if(!eq(Width, -1), "false", "true") # ">";
+ let DiagnosticType = "InvalidZPR" # RegClassSuffix # Width;
+ let ParserMethod = "tryParseSVEDataVector<false, "
+ # !if(!eq(Width, 0), "false", "true") # ">";
}
-def ZPRAsmOpAny : ZPRAsmOperand<"VectorAny", -1>;
+def ZPRAsmOpAny : ZPRAsmOperand<"VectorAny", 0>;
def ZPRAsmOp8 : ZPRAsmOperand<"VectorB", 8>;
def ZPRAsmOp16 : ZPRAsmOperand<"VectorH", 16>;
def ZPRAsmOp32 : ZPRAsmOperand<"VectorS", 32>;
@@ -818,3 +880,217 @@ def ZPR16 : ZPRRegOp<"h", ZPRAsmOp16, ZPR>;
def ZPR32 : ZPRRegOp<"s", ZPRAsmOp32, ZPR>;
def ZPR64 : ZPRRegOp<"d", ZPRAsmOp64, ZPR>;
def ZPR128 : ZPRRegOp<"q", ZPRAsmOp128, ZPR>;
+
+def ZPRAsmOp3b8 : ZPRAsmOperand<"Vector3bB", 8, "_3b">;
+def ZPRAsmOp3b16 : ZPRAsmOperand<"Vector3bH", 16, "_3b">;
+def ZPRAsmOp3b32 : ZPRAsmOperand<"Vector3bS", 32, "_3b">;
+
+def ZPR3b8 : ZPRRegOp<"b", ZPRAsmOp3b8, ZPR_3b>;
+def ZPR3b16 : ZPRRegOp<"h", ZPRAsmOp3b16, ZPR_3b>;
+def ZPR3b32 : ZPRRegOp<"s", ZPRAsmOp3b32, ZPR_3b>;
+
+def ZPRAsmOp4b16 : ZPRAsmOperand<"Vector4bH", 16, "_4b">;
+def ZPRAsmOp4b32 : ZPRAsmOperand<"Vector4bS", 32, "_4b">;
+def ZPRAsmOp4b64 : ZPRAsmOperand<"Vector4bD", 64, "_4b">;
+
+def ZPR4b16 : ZPRRegOp<"h", ZPRAsmOp4b16, ZPR_4b>;
+def ZPR4b32 : ZPRRegOp<"s", ZPRAsmOp4b32, ZPR_4b>;
+def ZPR4b64 : ZPRRegOp<"d", ZPRAsmOp4b64, ZPR_4b>;
+
+class FPRasZPR<int Width> : AsmOperandClass{
+ let Name = "FPR" # Width # "asZPR";
+ let PredicateMethod = "isFPRasZPR<AArch64::FPR" # Width # "RegClassID>";
+ let RenderMethod = "addFPRasZPRRegOperands<" # Width # ">";
+}
+
+class FPRasZPROperand<int Width> : RegisterOperand<ZPR> {
+ let ParserMatchClass = FPRasZPR<Width>;
+ let PrintMethod = "printZPRasFPR<" # Width # ">";
+}
+
+def FPR8asZPR : FPRasZPROperand<8>;
+def FPR16asZPR : FPRasZPROperand<16>;
+def FPR32asZPR : FPRasZPROperand<32>;
+def FPR64asZPR : FPRasZPROperand<64>;
+def FPR128asZPR : FPRasZPROperand<128>;
+
+let Namespace = "AArch64" in {
+ def zsub0 : SubRegIndex<128, -1>;
+ def zsub1 : SubRegIndex<128, -1>;
+ def zsub2 : SubRegIndex<128, -1>;
+ def zsub3 : SubRegIndex<128, -1>;
+}
+
+// Pairs, triples, and quads of SVE vector registers.
+def ZSeqPairs : RegisterTuples<[zsub0, zsub1], [(rotl ZPR, 0), (rotl ZPR, 1)]>;
+def ZSeqTriples : RegisterTuples<[zsub0, zsub1, zsub2], [(rotl ZPR, 0), (rotl ZPR, 1), (rotl ZPR, 2)]>;
+def ZSeqQuads : RegisterTuples<[zsub0, zsub1, zsub2, zsub3], [(rotl ZPR, 0), (rotl ZPR, 1), (rotl ZPR, 2), (rotl ZPR, 3)]>;
+
+def ZPR2 : RegisterClass<"AArch64", [untyped], 128, (add ZSeqPairs)> {
+ let Size = 256;
+}
+def ZPR3 : RegisterClass<"AArch64", [untyped], 128, (add ZSeqTriples)> {
+ let Size = 384;
+}
+def ZPR4 : RegisterClass<"AArch64", [untyped], 128, (add ZSeqQuads)> {
+ let Size = 512;
+}
+
+class ZPRVectorList<int ElementWidth, int NumRegs> : AsmOperandClass {
+ let Name = "SVEVectorList" # NumRegs # ElementWidth;
+ let ParserMethod = "tryParseVectorList<RegKind::SVEDataVector>";
+ let PredicateMethod =
+ "isTypedVectorList<RegKind::SVEDataVector, " #NumRegs #", 0, " #ElementWidth #">";
+ let RenderMethod = "addVectorListOperands<AArch64Operand::VecListIdx_ZReg, " # NumRegs # ">";
+}
+
+def Z_b : RegisterOperand<ZPR, "printTypedVectorList<0,'b'>"> {
+ let ParserMatchClass = ZPRVectorList<8, 1>;
+}
+
+def Z_h : RegisterOperand<ZPR, "printTypedVectorList<0,'h'>"> {
+ let ParserMatchClass = ZPRVectorList<16, 1>;
+}
+
+def Z_s : RegisterOperand<ZPR, "printTypedVectorList<0,'s'>"> {
+ let ParserMatchClass = ZPRVectorList<32, 1>;
+}
+
+def Z_d : RegisterOperand<ZPR, "printTypedVectorList<0,'d'>"> {
+ let ParserMatchClass = ZPRVectorList<64, 1>;
+}
+
+def ZZ_b : RegisterOperand<ZPR2, "printTypedVectorList<0,'b'>"> {
+ let ParserMatchClass = ZPRVectorList<8, 2>;
+}
+
+def ZZ_h : RegisterOperand<ZPR2, "printTypedVectorList<0,'h'>"> {
+ let ParserMatchClass = ZPRVectorList<16, 2>;
+}
+
+def ZZ_s : RegisterOperand<ZPR2, "printTypedVectorList<0,'s'>"> {
+ let ParserMatchClass = ZPRVectorList<32, 2>;
+}
+
+def ZZ_d : RegisterOperand<ZPR2, "printTypedVectorList<0,'d'>"> {
+ let ParserMatchClass = ZPRVectorList<64, 2>;
+}
+
+def ZZZ_b : RegisterOperand<ZPR3, "printTypedVectorList<0,'b'>"> {
+ let ParserMatchClass = ZPRVectorList<8, 3>;
+}
+
+def ZZZ_h : RegisterOperand<ZPR3, "printTypedVectorList<0,'h'>"> {
+ let ParserMatchClass = ZPRVectorList<16, 3>;
+}
+
+def ZZZ_s : RegisterOperand<ZPR3, "printTypedVectorList<0,'s'>"> {
+ let ParserMatchClass = ZPRVectorList<32, 3>;
+}
+
+def ZZZ_d : RegisterOperand<ZPR3, "printTypedVectorList<0,'d'>"> {
+ let ParserMatchClass = ZPRVectorList<64, 3>;
+}
+
+def ZZZZ_b : RegisterOperand<ZPR4, "printTypedVectorList<0,'b'>"> {
+ let ParserMatchClass = ZPRVectorList<8, 4>;
+}
+
+def ZZZZ_h : RegisterOperand<ZPR4, "printTypedVectorList<0,'h'>"> {
+ let ParserMatchClass = ZPRVectorList<16, 4>;
+}
+
+def ZZZZ_s : RegisterOperand<ZPR4, "printTypedVectorList<0,'s'>"> {
+ let ParserMatchClass = ZPRVectorList<32, 4>;
+}
+
+def ZZZZ_d : RegisterOperand<ZPR4, "printTypedVectorList<0,'d'>"> {
+ let ParserMatchClass = ZPRVectorList<64, 4>;
+}
+
+class ZPRExtendAsmOperand<string ShiftExtend, int RegWidth, int Scale,
+ bit ScaleAlwaysSame = 0b0> : AsmOperandClass {
+ let Name = "ZPRExtend" # ShiftExtend # RegWidth # Scale
+ # !if(ScaleAlwaysSame, "Only", "");
+
+ let PredicateMethod = "isSVEDataVectorRegWithShiftExtend<"
+ # RegWidth # ", AArch64::ZPRRegClassID, "
+ # "AArch64_AM::" # ShiftExtend # ", "
+ # Scale # ", "
+ # !if(ScaleAlwaysSame, "true", "false")
+ # ">";
+ let DiagnosticType = "InvalidZPR" # RegWidth # ShiftExtend # Scale;
+ let RenderMethod = "addRegOperands";
+ let ParserMethod = "tryParseSVEDataVector<true, true>";
+}
+
+class ZPRExtendRegisterOperand<bit SignExtend, bit IsLSL, string Repr,
+ int RegWidth, int Scale, string Suffix = "">
+ : RegisterOperand<ZPR> {
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("ZPR" # RegWidth # "AsmOpndExt" # Repr # Scale # Suffix);
+ let PrintMethod = "printRegWithShiftExtend<"
+ # !if(SignExtend, "true", "false") # ", "
+ # Scale # ", "
+ # !if(IsLSL, "'x'", "'w'") # ", "
+ # !if(!eq(RegWidth, 32), "'s'", "'d'") # ">";
+}
+
+foreach RegWidth = [32, 64] in {
+ // UXTW(8|16|32|64)
+ def ZPR#RegWidth#AsmOpndExtUXTW8Only : ZPRExtendAsmOperand<"UXTW", RegWidth, 8, 0b1>;
+ def ZPR#RegWidth#AsmOpndExtUXTW8 : ZPRExtendAsmOperand<"UXTW", RegWidth, 8>;
+ def ZPR#RegWidth#AsmOpndExtUXTW16 : ZPRExtendAsmOperand<"UXTW", RegWidth, 16>;
+ def ZPR#RegWidth#AsmOpndExtUXTW32 : ZPRExtendAsmOperand<"UXTW", RegWidth, 32>;
+ def ZPR#RegWidth#AsmOpndExtUXTW64 : ZPRExtendAsmOperand<"UXTW", RegWidth, 64>;
+
+ def ZPR#RegWidth#ExtUXTW8Only : ZPRExtendRegisterOperand<0b0, 0b0, "UXTW", RegWidth, 8, "Only">;
+ def ZPR#RegWidth#ExtUXTW8 : ZPRExtendRegisterOperand<0b0, 0b0, "UXTW", RegWidth, 8>;
+ def ZPR#RegWidth#ExtUXTW16 : ZPRExtendRegisterOperand<0b0, 0b0, "UXTW", RegWidth, 16>;
+ def ZPR#RegWidth#ExtUXTW32 : ZPRExtendRegisterOperand<0b0, 0b0, "UXTW", RegWidth, 32>;
+ def ZPR#RegWidth#ExtUXTW64 : ZPRExtendRegisterOperand<0b0, 0b0, "UXTW", RegWidth, 64>;
+
+ // SXTW(8|16|32|64)
+ def ZPR#RegWidth#AsmOpndExtSXTW8Only : ZPRExtendAsmOperand<"SXTW", RegWidth, 8, 0b1>;
+ def ZPR#RegWidth#AsmOpndExtSXTW8 : ZPRExtendAsmOperand<"SXTW", RegWidth, 8>;
+ def ZPR#RegWidth#AsmOpndExtSXTW16 : ZPRExtendAsmOperand<"SXTW", RegWidth, 16>;
+ def ZPR#RegWidth#AsmOpndExtSXTW32 : ZPRExtendAsmOperand<"SXTW", RegWidth, 32>;
+ def ZPR#RegWidth#AsmOpndExtSXTW64 : ZPRExtendAsmOperand<"SXTW", RegWidth, 64>;
+
+ def ZPR#RegWidth#ExtSXTW8Only : ZPRExtendRegisterOperand<0b1, 0b0, "SXTW", RegWidth, 8, "Only">;
+ def ZPR#RegWidth#ExtSXTW8 : ZPRExtendRegisterOperand<0b1, 0b0, "SXTW", RegWidth, 8>;
+ def ZPR#RegWidth#ExtSXTW16 : ZPRExtendRegisterOperand<0b1, 0b0, "SXTW", RegWidth, 16>;
+ def ZPR#RegWidth#ExtSXTW32 : ZPRExtendRegisterOperand<0b1, 0b0, "SXTW", RegWidth, 32>;
+ def ZPR#RegWidth#ExtSXTW64 : ZPRExtendRegisterOperand<0b1, 0b0, "SXTW", RegWidth, 64>;
+
+ // LSL(8|16|32|64)
+ def ZPR#RegWidth#AsmOpndExtLSL8 : ZPRExtendAsmOperand<"LSL", RegWidth, 8>;
+ def ZPR#RegWidth#AsmOpndExtLSL16 : ZPRExtendAsmOperand<"LSL", RegWidth, 16>;
+ def ZPR#RegWidth#AsmOpndExtLSL32 : ZPRExtendAsmOperand<"LSL", RegWidth, 32>;
+ def ZPR#RegWidth#AsmOpndExtLSL64 : ZPRExtendAsmOperand<"LSL", RegWidth, 64>;
+ def ZPR#RegWidth#ExtLSL8 : ZPRExtendRegisterOperand<0b0, 0b1, "LSL", RegWidth, 8>;
+ def ZPR#RegWidth#ExtLSL16 : ZPRExtendRegisterOperand<0b0, 0b1, "LSL", RegWidth, 16>;
+ def ZPR#RegWidth#ExtLSL32 : ZPRExtendRegisterOperand<0b0, 0b1, "LSL", RegWidth, 32>;
+ def ZPR#RegWidth#ExtLSL64 : ZPRExtendRegisterOperand<0b0, 0b1, "LSL", RegWidth, 64>;
+}
+
+class GPR64ShiftExtendAsmOperand <string AsmOperandName, int Scale, string RegClass> : AsmOperandClass {
+ let Name = AsmOperandName # Scale;
+ let PredicateMethod = "isGPR64WithShiftExtend<AArch64::"#RegClass#"RegClassID, " # Scale # ">";
+ let DiagnosticType = "Invalid" # AsmOperandName # Scale;
+ let RenderMethod = "addRegOperands";
+ let ParserMethod = "tryParseGPROperand<true>";
+}
+
+class GPR64ExtendRegisterOperand<string Name, int Scale, RegisterClass RegClass> : RegisterOperand<RegClass>{
+ let ParserMatchClass = !cast<AsmOperandClass>(Name);
+ let PrintMethod = "printRegWithShiftExtend<false, " # Scale # ", 'x', 0>";
+}
+
+foreach Scale = [8, 16, 32, 64] in {
+ def GPR64shiftedAsmOpnd # Scale : GPR64ShiftExtendAsmOperand<"GPR64shifted", Scale, "GPR64">;
+ def GPR64shifted # Scale : GPR64ExtendRegisterOperand<"GPR64shiftedAsmOpnd" # Scale, Scale, GPR64>;
+
+ def GPR64NoXZRshiftedAsmOpnd # Scale : GPR64ShiftExtendAsmOperand<"GPR64NoXZRshifted", Scale, "GPR64common">;
+ def GPR64NoXZRshifted # Scale : GPR64ExtendRegisterOperand<"GPR64NoXZRshiftedAsmOpnd" # Scale, Scale, GPR64common>;
+}
diff --git a/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp b/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
index e1851875abc5..af555f6d2266 100644
--- a/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
+++ b/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
@@ -700,7 +700,7 @@ bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) {
static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
if (!AAII)
return false;
- SchedModel.init(ST.getSchedModel(), &ST, AAII);
+ SchedModel.init(&ST);
if (!SchedModel.hasInstrSchedModel())
return false;
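The one-line change above tracks the updated TargetSchedModel::init() interface, which now takes only the subtarget and derives the MCSchedModel and TargetInstrInfo from it. A minimal sketch of the new call pattern, where ST is assumed to be a valid const TargetSubtargetInfo pointer provided by the surrounding pass:

  #include "llvm/CodeGen/TargetSchedule.h"

  TargetSchedModel SchedModel;
  SchedModel.init(ST);   // subtarget pointer only; no separate MCSchedModel/TII
  if (SchedModel.hasInstrSchedModel()) {
    // Detailed per-instruction queries, e.g. SchedModel.computeInstrLatency(&MI),
    // are only meaningful when the subtarget provides an instruction-level model.
  }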
diff --git a/lib/Target/AArch64/AArch64SVEInstrInfo.td b/lib/Target/AArch64/AArch64SVEInstrInfo.td
index bcd7b60875a2..16e6ddda6398 100644
--- a/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -12,12 +12,975 @@
//===----------------------------------------------------------------------===//
let Predicates = [HasSVE] in {
+
+ def RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr">;
+ def RDFFRS_PPz : sve_int_rdffr_pred<0b1, "rdffrs">;
+ def RDFFR_P : sve_int_rdffr_unpred<"rdffr">;
+ def SETFFR : sve_int_setffr<"setffr">;
+ def WRFFR : sve_int_wrffr<"wrffr">;
+
defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add">;
defm SUB_ZZZ : sve_int_bin_cons_arit_0<0b001, "sub">;
+ defm SQADD_ZZZ : sve_int_bin_cons_arit_0<0b100, "sqadd">;
+ defm UQADD_ZZZ : sve_int_bin_cons_arit_0<0b101, "uqadd">;
+ defm SQSUB_ZZZ : sve_int_bin_cons_arit_0<0b110, "sqsub">;
+ defm UQSUB_ZZZ : sve_int_bin_cons_arit_0<0b111, "uqsub">;
+
+ def AND_ZZZ : sve_int_bin_cons_log<0b00, "and">;
+ def ORR_ZZZ : sve_int_bin_cons_log<0b01, "orr">;
+ def EOR_ZZZ : sve_int_bin_cons_log<0b10, "eor">;
+ def BIC_ZZZ : sve_int_bin_cons_log<0b11, "bic">;
+
+ defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add">;
+ defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub">;
+ defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr">;
+
+ defm ORR_ZPmZ : sve_int_bin_pred_log<0b000, "orr">;
+ defm EOR_ZPmZ : sve_int_bin_pred_log<0b001, "eor">;
+ defm AND_ZPmZ : sve_int_bin_pred_log<0b010, "and">;
+ defm BIC_ZPmZ : sve_int_bin_pred_log<0b011, "bic">;
+
+ defm ADD_ZI : sve_int_arith_imm0<0b000, "add">;
+ defm SUB_ZI : sve_int_arith_imm0<0b001, "sub">;
+ defm SUBR_ZI : sve_int_arith_imm0<0b011, "subr">;
+ defm SQADD_ZI : sve_int_arith_imm0<0b100, "sqadd">;
+ defm UQADD_ZI : sve_int_arith_imm0<0b101, "uqadd">;
+ defm SQSUB_ZI : sve_int_arith_imm0<0b110, "sqsub">;
+ defm UQSUB_ZI : sve_int_arith_imm0<0b111, "uqsub">;
+
+ defm MAD_ZPmZZ : sve_int_mladdsub_vvv_pred<0b0, "mad">;
+ defm MSB_ZPmZZ : sve_int_mladdsub_vvv_pred<0b1, "msb">;
+ defm MLA_ZPmZZ : sve_int_mlas_vvv_pred<0b0, "mla">;
+ defm MLS_ZPmZZ : sve_int_mlas_vvv_pred<0b1, "mls">;
+
+ // SVE predicated integer reductions.
+ defm SADDV_VPZ : sve_int_reduce_0_saddv<0b000, "saddv">;
+ defm UADDV_VPZ : sve_int_reduce_0_uaddv<0b001, "uaddv">;
+ defm SMAXV_VPZ : sve_int_reduce_1<0b000, "smaxv">;
+ defm UMAXV_VPZ : sve_int_reduce_1<0b001, "umaxv">;
+ defm SMINV_VPZ : sve_int_reduce_1<0b010, "sminv">;
+ defm UMINV_VPZ : sve_int_reduce_1<0b011, "uminv">;
+ defm ORV_VPZ : sve_int_reduce_2<0b000, "orv">;
+ defm EORV_VPZ : sve_int_reduce_2<0b001, "eorv">;
+ defm ANDV_VPZ : sve_int_reduce_2<0b010, "andv">;
+
+ defm ORR_ZI : sve_int_log_imm<0b00, "orr", "orn">;
+ defm EOR_ZI : sve_int_log_imm<0b01, "eor", "eon">;
+ defm AND_ZI : sve_int_log_imm<0b10, "and", "bic">;
+
+ defm SMAX_ZI : sve_int_arith_imm1<0b00, "smax", simm8>;
+ defm SMIN_ZI : sve_int_arith_imm1<0b10, "smin", simm8>;
+ defm UMAX_ZI : sve_int_arith_imm1<0b01, "umax", imm0_255>;
+ defm UMIN_ZI : sve_int_arith_imm1<0b11, "umin", imm0_255>;
+
+ defm MUL_ZI : sve_int_arith_imm2<"mul">;
+ defm MUL_ZPmZ : sve_int_bin_pred_arit_2<0b000, "mul">;
+ defm SMULH_ZPmZ : sve_int_bin_pred_arit_2<0b010, "smulh">;
+ defm UMULH_ZPmZ : sve_int_bin_pred_arit_2<0b011, "umulh">;
+
+ defm SDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b100, "sdiv">;
+ defm UDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b101, "udiv">;
+ defm SDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b110, "sdivr">;
+ defm UDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b111, "udivr">;
+
+ defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot">;
+ defm UDOT_ZZZ : sve_intx_dot<0b1, "udot">;
+
+ defm SDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b0, "sdot">;
+ defm UDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b1, "udot">;
+
+ defm SXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b000, "sxtb">;
+ defm UXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b001, "uxtb">;
+ defm SXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b010, "sxth">;
+ defm UXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b011, "uxth">;
+ defm SXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b100, "sxtw">;
+ defm UXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b101, "uxtw">;
+ defm ABS_ZPmZ : sve_int_un_pred_arit_0< 0b110, "abs">;
+ defm NEG_ZPmZ : sve_int_un_pred_arit_0< 0b111, "neg">;
+
+ defm CLS_ZPmZ : sve_int_un_pred_arit_1< 0b000, "cls">;
+ defm CLZ_ZPmZ : sve_int_un_pred_arit_1< 0b001, "clz">;
+ defm CNT_ZPmZ : sve_int_un_pred_arit_1< 0b010, "cnt">;
+ defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot">;
+ defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not">;
+ defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs">;
+ defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg">;
+
+ defm SMAX_ZPmZ : sve_int_bin_pred_arit_1<0b000, "smax">;
+ defm UMAX_ZPmZ : sve_int_bin_pred_arit_1<0b001, "umax">;
+ defm SMIN_ZPmZ : sve_int_bin_pred_arit_1<0b010, "smin">;
+ defm UMIN_ZPmZ : sve_int_bin_pred_arit_1<0b011, "umin">;
+ defm SABD_ZPmZ : sve_int_bin_pred_arit_1<0b100, "sabd">;
+ defm UABD_ZPmZ : sve_int_bin_pred_arit_1<0b101, "uabd">;
+
+ defm FRECPE_ZZ : sve_fp_2op_u_zd<0b110, "frecpe">;
+ defm FRSQRTE_ZZ : sve_fp_2op_u_zd<0b111, "frsqrte">;
+
+ defm FADD_ZPmI : sve_fp_2op_i_p_zds<0b000, "fadd", sve_fpimm_half_one>;
+ defm FSUB_ZPmI : sve_fp_2op_i_p_zds<0b001, "fsub", sve_fpimm_half_one>;
+ defm FMUL_ZPmI : sve_fp_2op_i_p_zds<0b010, "fmul", sve_fpimm_half_two>;
+ defm FSUBR_ZPmI : sve_fp_2op_i_p_zds<0b011, "fsubr", sve_fpimm_half_one>;
+ defm FMAXNM_ZPmI : sve_fp_2op_i_p_zds<0b100, "fmaxnm", sve_fpimm_zero_one>;
+ defm FMINNM_ZPmI : sve_fp_2op_i_p_zds<0b101, "fminnm", sve_fpimm_zero_one>;
+ defm FMAX_ZPmI : sve_fp_2op_i_p_zds<0b110, "fmax", sve_fpimm_zero_one>;
+ defm FMIN_ZPmI : sve_fp_2op_i_p_zds<0b111, "fmin", sve_fpimm_zero_one>;
+
+ defm FADD_ZPmZ : sve_fp_2op_p_zds<0b0000, "fadd">;
+ defm FSUB_ZPmZ : sve_fp_2op_p_zds<0b0001, "fsub">;
+ defm FMUL_ZPmZ : sve_fp_2op_p_zds<0b0010, "fmul">;
+ defm FSUBR_ZPmZ : sve_fp_2op_p_zds<0b0011, "fsubr">;
+ defm FMAXNM_ZPmZ : sve_fp_2op_p_zds<0b0100, "fmaxnm">;
+ defm FMINNM_ZPmZ : sve_fp_2op_p_zds<0b0101, "fminnm">;
+ defm FMAX_ZPmZ : sve_fp_2op_p_zds<0b0110, "fmax">;
+ defm FMIN_ZPmZ : sve_fp_2op_p_zds<0b0111, "fmin">;
+ defm FABD_ZPmZ : sve_fp_2op_p_zds<0b1000, "fabd">;
+ defm FSCALE_ZPmZ : sve_fp_2op_p_zds<0b1001, "fscale">;
+ defm FMULX_ZPmZ : sve_fp_2op_p_zds<0b1010, "fmulx">;
+ defm FDIVR_ZPmZ : sve_fp_2op_p_zds<0b1100, "fdivr">;
+ defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv">;
+
+ defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd">;
+ defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub">;
+ defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul">;
+ defm FTSMUL_ZZZ : sve_fp_3op_u_zd<0b011, "ftsmul">;
+ defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps">;
+ defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts">;
+
+ defm FTSSEL_ZZZ : sve_int_bin_cons_misc_0_b<"ftssel">;
+
+ defm FCADD_ZPmZ : sve_fp_fcadd<"fcadd">;
+ defm FCMLA_ZPmZZ : sve_fp_fcmla<"fcmla">;
+
+ defm FMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b00, "fmla">;
+ defm FMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b01, "fmls">;
+ defm FNMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b10, "fnmla">;
+ defm FNMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b11, "fnmls">;
+
+ defm FMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b00, "fmad">;
+ defm FMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b01, "fmsb">;
+ defm FNMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b10, "fnmad">;
+ defm FNMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b11, "fnmsb">;
+
+ defm FTMAD_ZZI : sve_fp_ftmad<"ftmad">;
+
+ defm FMLA_ZZZI : sve_fp_fma_by_indexed_elem<0b0, "fmla">;
+ defm FMLS_ZZZI : sve_fp_fma_by_indexed_elem<0b1, "fmls">;
+
+ defm FCMLA_ZZZI : sve_fp_fcmla_by_indexed_elem<"fcmla">;
+ defm FMUL_ZZZI : sve_fp_fmul_by_indexed_elem<"fmul">;
+
+ // SVE floating point reductions.
+ defm FADDA_VPZ : sve_fp_2op_p_vd<0b000, "fadda">;
+ defm FADDV_VPZ : sve_fp_fast_red<0b000, "faddv">;
+ defm FMAXNMV_VPZ : sve_fp_fast_red<0b100, "fmaxnmv">;
+ defm FMINNMV_VPZ : sve_fp_fast_red<0b101, "fminnmv">;
+ defm FMAXV_VPZ : sve_fp_fast_red<0b110, "fmaxv">;
+ defm FMINV_VPZ : sve_fp_fast_red<0b111, "fminv">;
+
+ // Splat immediate (unpredicated)
+ defm DUP_ZI : sve_int_dup_imm<"dup">;
+ defm FDUP_ZI : sve_int_dup_fpimm<"fdup">;
+ defm DUPM_ZI : sve_int_dup_mask_imm<"dupm">;
+
+ // Splat immediate (predicated)
+ defm CPY_ZPmI : sve_int_dup_imm_pred_merge<"cpy">;
+ defm CPY_ZPzI : sve_int_dup_imm_pred_zero<"cpy">;
+ defm FCPY_ZPmI : sve_int_dup_fpimm_pred<"fcpy">;
+
+ // Splat scalar register (unpredicated, GPR or vector + element index)
+ defm DUP_ZR : sve_int_perm_dup_r<"dup">;
+ defm DUP_ZZI : sve_int_perm_dup_i<"dup">;
+
+ // Splat scalar register (predicated)
+ defm CPY_ZPmR : sve_int_perm_cpy_r<"cpy">;
+ defm CPY_ZPmV : sve_int_perm_cpy_v<"cpy">;
+
+ // Select elements from either vector (predicated)
+ defm SEL_ZPZZ : sve_int_sel_vvv<"sel">;
+
+ defm SPLICE_ZPZ : sve_int_perm_splice<"splice">;
+ defm COMPACT_ZPZ : sve_int_perm_compact<"compact">;
+ defm INSR_ZR : sve_int_perm_insrs<"insr">;
+ defm INSR_ZV : sve_int_perm_insrv<"insr">;
+ def EXT_ZZI : sve_int_perm_extract_i<"ext">;
+
+ defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit">;
+ defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb">;
+ defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh">;
+ defm REVW_ZPmZ : sve_int_perm_rev_revw<"revw">;
+
+ defm REV_PP : sve_int_perm_reverse_p<"rev">;
+ defm REV_ZZ : sve_int_perm_reverse_z<"rev">;
+
+ defm SUNPKLO_ZZ : sve_int_perm_unpk<0b00, "sunpklo">;
+ defm SUNPKHI_ZZ : sve_int_perm_unpk<0b01, "sunpkhi">;
+ defm UUNPKLO_ZZ : sve_int_perm_unpk<0b10, "uunpklo">;
+ defm UUNPKHI_ZZ : sve_int_perm_unpk<0b11, "uunpkhi">;
+
+ def PUNPKLO_PP : sve_int_perm_punpk<0b0, "punpklo">;
+ def PUNPKHI_PP : sve_int_perm_punpk<0b1, "punpkhi">;
+
+ def FEXPA_ZZ_H : sve_int_bin_cons_misc_0_c<0b01000000, "fexpa", ZPR16>;
+ def FEXPA_ZZ_S : sve_int_bin_cons_misc_0_c<0b10000000, "fexpa", ZPR32>;
+ def FEXPA_ZZ_D : sve_int_bin_cons_misc_0_c<0b11000000, "fexpa", ZPR64>;
+
+ def AND_PPzPP : sve_int_pred_log<0b0000, "and">;
+ def BIC_PPzPP : sve_int_pred_log<0b0001, "bic">;
+ def EOR_PPzPP : sve_int_pred_log<0b0010, "eor">;
+ def SEL_PPPP : sve_int_pred_log<0b0011, "sel">;
+ def ANDS_PPzPP : sve_int_pred_log<0b0100, "ands">;
+ def BICS_PPzPP : sve_int_pred_log<0b0101, "bics">;
+ def EORS_PPzPP : sve_int_pred_log<0b0110, "eors">;
+ def ORR_PPzPP : sve_int_pred_log<0b1000, "orr">;
+ def ORN_PPzPP : sve_int_pred_log<0b1001, "orn">;
+ def NOR_PPzPP : sve_int_pred_log<0b1010, "nor">;
+ def NAND_PPzPP : sve_int_pred_log<0b1011, "nand">;
+ def ORRS_PPzPP : sve_int_pred_log<0b1100, "orrs">;
+ def ORNS_PPzPP : sve_int_pred_log<0b1101, "orns">;
+ def NORS_PPzPP : sve_int_pred_log<0b1110, "nors">;
+ def NANDS_PPzPP : sve_int_pred_log<0b1111, "nands">;
+
+ defm CLASTA_RPZ : sve_int_perm_clast_rz<0, "clasta">;
+ defm CLASTB_RPZ : sve_int_perm_clast_rz<1, "clastb">;
+ defm CLASTA_VPZ : sve_int_perm_clast_vz<0, "clasta">;
+ defm CLASTB_VPZ : sve_int_perm_clast_vz<1, "clastb">;
+ defm CLASTA_ZPZ : sve_int_perm_clast_zz<0, "clasta">;
+ defm CLASTB_ZPZ : sve_int_perm_clast_zz<1, "clastb">;
+
+ defm LASTA_RPZ : sve_int_perm_last_r<0, "lasta">;
+ defm LASTB_RPZ : sve_int_perm_last_r<1, "lastb">;
+ defm LASTA_VPZ : sve_int_perm_last_v<0, "lasta">;
+ defm LASTB_VPZ : sve_int_perm_last_v<1, "lastb">;
+
+ // contiguous load with reg+immediate
+ defm LD1B_IMM : sve_mem_cld_si<0b0000, "ld1b", Z_b, ZPR8>;
+ defm LD1B_H_IMM : sve_mem_cld_si<0b0001, "ld1b", Z_h, ZPR16>;
+ defm LD1B_S_IMM : sve_mem_cld_si<0b0010, "ld1b", Z_s, ZPR32>;
+ defm LD1B_D_IMM : sve_mem_cld_si<0b0011, "ld1b", Z_d, ZPR64>;
+ defm LD1SW_D_IMM : sve_mem_cld_si<0b0100, "ld1sw", Z_d, ZPR64>;
+ defm LD1H_IMM : sve_mem_cld_si<0b0101, "ld1h", Z_h, ZPR16>;
+ defm LD1H_S_IMM : sve_mem_cld_si<0b0110, "ld1h", Z_s, ZPR32>;
+ defm LD1H_D_IMM : sve_mem_cld_si<0b0111, "ld1h", Z_d, ZPR64>;
+ defm LD1SH_D_IMM : sve_mem_cld_si<0b1000, "ld1sh", Z_d, ZPR64>;
+ defm LD1SH_S_IMM : sve_mem_cld_si<0b1001, "ld1sh", Z_s, ZPR32>;
+ defm LD1W_IMM : sve_mem_cld_si<0b1010, "ld1w", Z_s, ZPR32>;
+ defm LD1W_D_IMM : sve_mem_cld_si<0b1011, "ld1w", Z_d, ZPR64>;
+ defm LD1SB_D_IMM : sve_mem_cld_si<0b1100, "ld1sb", Z_d, ZPR64>;
+ defm LD1SB_S_IMM : sve_mem_cld_si<0b1101, "ld1sb", Z_s, ZPR32>;
+ defm LD1SB_H_IMM : sve_mem_cld_si<0b1110, "ld1sb", Z_h, ZPR16>;
+ defm LD1D_IMM : sve_mem_cld_si<0b1111, "ld1d", Z_d, ZPR64>;
+
+ // LD1R loads (splat scalar to vector)
+ defm LD1RB_IMM : sve_mem_ld_dup<0b00, 0b00, "ld1rb", Z_b, ZPR8, uimm6s1>;
+ defm LD1RB_H_IMM : sve_mem_ld_dup<0b00, 0b01, "ld1rb", Z_h, ZPR16, uimm6s1>;
+ defm LD1RB_S_IMM : sve_mem_ld_dup<0b00, 0b10, "ld1rb", Z_s, ZPR32, uimm6s1>;
+ defm LD1RB_D_IMM : sve_mem_ld_dup<0b00, 0b11, "ld1rb", Z_d, ZPR64, uimm6s1>;
+ defm LD1RSW_IMM : sve_mem_ld_dup<0b01, 0b00, "ld1rsw", Z_d, ZPR64, uimm6s4>;
+ defm LD1RH_IMM : sve_mem_ld_dup<0b01, 0b01, "ld1rh", Z_h, ZPR16, uimm6s2>;
+ defm LD1RH_S_IMM : sve_mem_ld_dup<0b01, 0b10, "ld1rh", Z_s, ZPR32, uimm6s2>;
+ defm LD1RH_D_IMM : sve_mem_ld_dup<0b01, 0b11, "ld1rh", Z_d, ZPR64, uimm6s2>;
+ defm LD1RSH_D_IMM : sve_mem_ld_dup<0b10, 0b00, "ld1rsh", Z_d, ZPR64, uimm6s2>;
+ defm LD1RSH_S_IMM : sve_mem_ld_dup<0b10, 0b01, "ld1rsh", Z_s, ZPR32, uimm6s2>;
+ defm LD1RW_IMM : sve_mem_ld_dup<0b10, 0b10, "ld1rw", Z_s, ZPR32, uimm6s4>;
+ defm LD1RW_D_IMM : sve_mem_ld_dup<0b10, 0b11, "ld1rw", Z_d, ZPR64, uimm6s4>;
+ defm LD1RSB_D_IMM : sve_mem_ld_dup<0b11, 0b00, "ld1rsb", Z_d, ZPR64, uimm6s1>;
+ defm LD1RSB_S_IMM : sve_mem_ld_dup<0b11, 0b01, "ld1rsb", Z_s, ZPR32, uimm6s1>;
+ defm LD1RSB_H_IMM : sve_mem_ld_dup<0b11, 0b10, "ld1rsb", Z_h, ZPR16, uimm6s1>;
+ defm LD1RD_IMM : sve_mem_ld_dup<0b11, 0b11, "ld1rd", Z_d, ZPR64, uimm6s8>;
+
+ // LD1RQ loads (load quadword-vector and splat to scalable vector)
+ defm LD1RQ_B_IMM : sve_mem_ldqr_si<0b00, "ld1rqb", Z_b, ZPR8>;
+ defm LD1RQ_H_IMM : sve_mem_ldqr_si<0b01, "ld1rqh", Z_h, ZPR16>;
+ defm LD1RQ_W_IMM : sve_mem_ldqr_si<0b10, "ld1rqw", Z_s, ZPR32>;
+ defm LD1RQ_D_IMM : sve_mem_ldqr_si<0b11, "ld1rqd", Z_d, ZPR64>;
+ defm LD1RQ_B : sve_mem_ldqr_ss<0b00, "ld1rqb", Z_b, ZPR8, GPR64NoXZRshifted8>;
+ defm LD1RQ_H : sve_mem_ldqr_ss<0b01, "ld1rqh", Z_h, ZPR16, GPR64NoXZRshifted16>;
+ defm LD1RQ_W : sve_mem_ldqr_ss<0b10, "ld1rqw", Z_s, ZPR32, GPR64NoXZRshifted32>;
+ defm LD1RQ_D : sve_mem_ldqr_ss<0b11, "ld1rqd", Z_d, ZPR64, GPR64NoXZRshifted64>;
+
+ // contiguous load with reg+reg addressing.
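+ // e.g. ld1h z0.h, p0/z, [x0, x1, lsl #1]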
+ defm LD1B : sve_mem_cld_ss<0b0000, "ld1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
+ defm LD1B_H : sve_mem_cld_ss<0b0001, "ld1b", Z_h, ZPR16, GPR64NoXZRshifted8>;
+ defm LD1B_S : sve_mem_cld_ss<0b0010, "ld1b", Z_s, ZPR32, GPR64NoXZRshifted8>;
+ defm LD1B_D : sve_mem_cld_ss<0b0011, "ld1b", Z_d, ZPR64, GPR64NoXZRshifted8>;
+ defm LD1SW_D : sve_mem_cld_ss<0b0100, "ld1sw", Z_d, ZPR64, GPR64NoXZRshifted32>;
+ defm LD1H : sve_mem_cld_ss<0b0101, "ld1h", Z_h, ZPR16, GPR64NoXZRshifted16>;
+ defm LD1H_S : sve_mem_cld_ss<0b0110, "ld1h", Z_s, ZPR32, GPR64NoXZRshifted16>;
+ defm LD1H_D : sve_mem_cld_ss<0b0111, "ld1h", Z_d, ZPR64, GPR64NoXZRshifted16>;
+ defm LD1SH_D : sve_mem_cld_ss<0b1000, "ld1sh", Z_d, ZPR64, GPR64NoXZRshifted16>;
+ defm LD1SH_S : sve_mem_cld_ss<0b1001, "ld1sh", Z_s, ZPR32, GPR64NoXZRshifted16>;
+ defm LD1W : sve_mem_cld_ss<0b1010, "ld1w", Z_s, ZPR32, GPR64NoXZRshifted32>;
+ defm LD1W_D : sve_mem_cld_ss<0b1011, "ld1w", Z_d, ZPR64, GPR64NoXZRshifted32>;
+ defm LD1SB_D : sve_mem_cld_ss<0b1100, "ld1sb", Z_d, ZPR64, GPR64NoXZRshifted8>;
+ defm LD1SB_S : sve_mem_cld_ss<0b1101, "ld1sb", Z_s, ZPR32, GPR64NoXZRshifted8>;
+ defm LD1SB_H : sve_mem_cld_ss<0b1110, "ld1sb", Z_h, ZPR16, GPR64NoXZRshifted8>;
+ defm LD1D : sve_mem_cld_ss<0b1111, "ld1d", Z_d, ZPR64, GPR64NoXZRshifted64>;
+
+ // non-faulting contiguous load with reg+immediate
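+ // e.g. ldnf1b z0.b, p0/z, [x0, #1, mul vl]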
+ defm LDNF1B_IMM : sve_mem_cldnf_si<0b0000, "ldnf1b", Z_b, ZPR8>;
+ defm LDNF1B_H_IMM : sve_mem_cldnf_si<0b0001, "ldnf1b", Z_h, ZPR16>;
+ defm LDNF1B_S_IMM : sve_mem_cldnf_si<0b0010, "ldnf1b", Z_s, ZPR32>;
+ defm LDNF1B_D_IMM : sve_mem_cldnf_si<0b0011, "ldnf1b", Z_d, ZPR64>;
+ defm LDNF1SW_D_IMM : sve_mem_cldnf_si<0b0100, "ldnf1sw", Z_d, ZPR64>;
+ defm LDNF1H_IMM : sve_mem_cldnf_si<0b0101, "ldnf1h", Z_h, ZPR16>;
+ defm LDNF1H_S_IMM : sve_mem_cldnf_si<0b0110, "ldnf1h", Z_s, ZPR32>;
+ defm LDNF1H_D_IMM : sve_mem_cldnf_si<0b0111, "ldnf1h", Z_d, ZPR64>;
+ defm LDNF1SH_D_IMM : sve_mem_cldnf_si<0b1000, "ldnf1sh", Z_d, ZPR64>;
+ defm LDNF1SH_S_IMM : sve_mem_cldnf_si<0b1001, "ldnf1sh", Z_s, ZPR32>;
+ defm LDNF1W_IMM : sve_mem_cldnf_si<0b1010, "ldnf1w", Z_s, ZPR32>;
+ defm LDNF1W_D_IMM : sve_mem_cldnf_si<0b1011, "ldnf1w", Z_d, ZPR64>;
+ defm LDNF1SB_D_IMM : sve_mem_cldnf_si<0b1100, "ldnf1sb", Z_d, ZPR64>;
+ defm LDNF1SB_S_IMM : sve_mem_cldnf_si<0b1101, "ldnf1sb", Z_s, ZPR32>;
+ defm LDNF1SB_H_IMM : sve_mem_cldnf_si<0b1110, "ldnf1sb", Z_h, ZPR16>;
+ defm LDNF1D_IMM : sve_mem_cldnf_si<0b1111, "ldnf1d", Z_d, ZPR64>;
+
+ // First-faulting loads with reg+reg addressing.
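+ // e.g. ldff1h z0.h, p0/z, [x0, x1, lsl #1]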
+ defm LDFF1B : sve_mem_cldff_ss<0b0000, "ldff1b", Z_b, ZPR8, GPR64shifted8>;
+ defm LDFF1B_H : sve_mem_cldff_ss<0b0001, "ldff1b", Z_h, ZPR16, GPR64shifted8>;
+ defm LDFF1B_S : sve_mem_cldff_ss<0b0010, "ldff1b", Z_s, ZPR32, GPR64shifted8>;
+ defm LDFF1B_D : sve_mem_cldff_ss<0b0011, "ldff1b", Z_d, ZPR64, GPR64shifted8>;
+ defm LDFF1SW_D : sve_mem_cldff_ss<0b0100, "ldff1sw", Z_d, ZPR64, GPR64shifted32>;
+ defm LDFF1H : sve_mem_cldff_ss<0b0101, "ldff1h", Z_h, ZPR16, GPR64shifted16>;
+ defm LDFF1H_S : sve_mem_cldff_ss<0b0110, "ldff1h", Z_s, ZPR32, GPR64shifted16>;
+ defm LDFF1H_D : sve_mem_cldff_ss<0b0111, "ldff1h", Z_d, ZPR64, GPR64shifted16>;
+ defm LDFF1SH_D : sve_mem_cldff_ss<0b1000, "ldff1sh", Z_d, ZPR64, GPR64shifted16>;
+ defm LDFF1SH_S : sve_mem_cldff_ss<0b1001, "ldff1sh", Z_s, ZPR32, GPR64shifted16>;
+ defm LDFF1W : sve_mem_cldff_ss<0b1010, "ldff1w", Z_s, ZPR32, GPR64shifted32>;
+ defm LDFF1W_D : sve_mem_cldff_ss<0b1011, "ldff1w", Z_d, ZPR64, GPR64shifted32>;
+ defm LDFF1SB_D : sve_mem_cldff_ss<0b1100, "ldff1sb", Z_d, ZPR64, GPR64shifted8>;
+ defm LDFF1SB_S : sve_mem_cldff_ss<0b1101, "ldff1sb", Z_s, ZPR32, GPR64shifted8>;
+ defm LDFF1SB_H : sve_mem_cldff_ss<0b1110, "ldff1sb", Z_h, ZPR16, GPR64shifted8>;
+ defm LDFF1D : sve_mem_cldff_ss<0b1111, "ldff1d", Z_d, ZPR64, GPR64shifted64>;
+
+ // LD(2|3|4) structured loads with reg+immediate
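+ // e.g. ld2b { z0.b, z1.b }, p0/z, [x0, #2, mul vl]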
+ defm LD2B_IMM : sve_mem_eld_si<0b00, 0b01, ZZ_b, "ld2b", simm4s2>;
+ defm LD3B_IMM : sve_mem_eld_si<0b00, 0b10, ZZZ_b, "ld3b", simm4s3>;
+ defm LD4B_IMM : sve_mem_eld_si<0b00, 0b11, ZZZZ_b, "ld4b", simm4s4>;
+ defm LD2H_IMM : sve_mem_eld_si<0b01, 0b01, ZZ_h, "ld2h", simm4s2>;
+ defm LD3H_IMM : sve_mem_eld_si<0b01, 0b10, ZZZ_h, "ld3h", simm4s3>;
+ defm LD4H_IMM : sve_mem_eld_si<0b01, 0b11, ZZZZ_h, "ld4h", simm4s4>;
+ defm LD2W_IMM : sve_mem_eld_si<0b10, 0b01, ZZ_s, "ld2w", simm4s2>;
+ defm LD3W_IMM : sve_mem_eld_si<0b10, 0b10, ZZZ_s, "ld3w", simm4s3>;
+ defm LD4W_IMM : sve_mem_eld_si<0b10, 0b11, ZZZZ_s, "ld4w", simm4s4>;
+ defm LD2D_IMM : sve_mem_eld_si<0b11, 0b01, ZZ_d, "ld2d", simm4s2>;
+ defm LD3D_IMM : sve_mem_eld_si<0b11, 0b10, ZZZ_d, "ld3d", simm4s3>;
+ defm LD4D_IMM : sve_mem_eld_si<0b11, 0b11, ZZZZ_d, "ld4d", simm4s4>;
+
+ // LD(2|3|4) structured loads (register + register)
+ def LD2B : sve_mem_eld_ss<0b00, 0b01, ZZ_b, "ld2b", GPR64NoXZRshifted8>;
+ def LD3B : sve_mem_eld_ss<0b00, 0b10, ZZZ_b, "ld3b", GPR64NoXZRshifted8>;
+ def LD4B : sve_mem_eld_ss<0b00, 0b11, ZZZZ_b, "ld4b", GPR64NoXZRshifted8>;
+ def LD2H : sve_mem_eld_ss<0b01, 0b01, ZZ_h, "ld2h", GPR64NoXZRshifted16>;
+ def LD3H : sve_mem_eld_ss<0b01, 0b10, ZZZ_h, "ld3h", GPR64NoXZRshifted16>;
+ def LD4H : sve_mem_eld_ss<0b01, 0b11, ZZZZ_h, "ld4h", GPR64NoXZRshifted16>;
+ def LD2W : sve_mem_eld_ss<0b10, 0b01, ZZ_s, "ld2w", GPR64NoXZRshifted32>;
+ def LD3W : sve_mem_eld_ss<0b10, 0b10, ZZZ_s, "ld3w", GPR64NoXZRshifted32>;
+ def LD4W : sve_mem_eld_ss<0b10, 0b11, ZZZZ_s, "ld4w", GPR64NoXZRshifted32>;
+ def LD2D : sve_mem_eld_ss<0b11, 0b01, ZZ_d, "ld2d", GPR64NoXZRshifted64>;
+ def LD3D : sve_mem_eld_ss<0b11, 0b10, ZZZ_d, "ld3d", GPR64NoXZRshifted64>;
+ def LD4D : sve_mem_eld_ss<0b11, 0b11, ZZZZ_d, "ld4d", GPR64NoXZRshifted64>;
+
+ // Gathers using unscaled 32-bit offsets, e.g.
+ // ld1h z0.s, p0/z, [x0, z0.s, uxtw]
+ defm GLD1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0000, "ld1sb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
+ defm GLDFF1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0001, "ldff1sb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
+ defm GLD1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0010, "ld1b", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
+ defm GLDFF1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0011, "ldff1b", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
+ defm GLD1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0100, "ld1sh", ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
+ defm GLDFF1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0101, "ldff1sh", ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
+ defm GLD1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0110, "ld1h", ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
+ defm GLDFF1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0111, "ldff1h", ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
+ defm GLD1W : sve_mem_32b_gld_vs_32_unscaled<0b1010, "ld1w", ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
+ defm GLDFF1W : sve_mem_32b_gld_vs_32_unscaled<0b1011, "ldff1w", ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
+
+ // Gathers using scaled 32-bit offsets, e.g.
+ // ld1h z0.s, p0/z, [x0, z0.s, uxtw #1]
+ defm GLD1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0100, "ld1sh", ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
+ defm GLDFF1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0101, "ldff1sh", ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
+ defm GLD1H_S : sve_mem_32b_gld_sv_32_scaled<0b0110, "ld1h", ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
+ defm GLDFF1H_S : sve_mem_32b_gld_sv_32_scaled<0b0111, "ldff1h", ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
+ defm GLD1W : sve_mem_32b_gld_sv_32_scaled<0b1010, "ld1w", ZPR32ExtSXTW32, ZPR32ExtUXTW32>;
+ defm GLDFF1W : sve_mem_32b_gld_sv_32_scaled<0b1011, "ldff1w", ZPR32ExtSXTW32, ZPR32ExtUXTW32>;
+
+ // Gathers using 32-bit pointers with offset, e.g.
+ // ld1h z0.s, p0/z, [z0.s, #16]
+ defm GLD1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0000, "ld1sb", imm0_31>;
+ defm GLDFF1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0001, "ldff1sb", imm0_31>;
+ defm GLD1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0010, "ld1b", imm0_31>;
+ defm GLDFF1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0011, "ldff1b", imm0_31>;
+ defm GLD1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0100, "ld1sh", uimm5s2>;
+ defm GLDFF1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0101, "ldff1sh", uimm5s2>;
+ defm GLD1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0110, "ld1h", uimm5s2>;
+ defm GLDFF1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0111, "ldff1h", uimm5s2>;
+ defm GLD1W : sve_mem_32b_gld_vi_32_ptrs<0b1010, "ld1w", uimm5s4>;
+ defm GLDFF1W : sve_mem_32b_gld_vi_32_ptrs<0b1011, "ldff1w", uimm5s4>;
+
+ // Gathers using 64-bit pointers with offset, e.g.
+ // ld1h z0.d, p0/z, [z0.d, #16]
+ defm GLD1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0000, "ld1sb", imm0_31>;
+ defm GLDFF1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0001, "ldff1sb", imm0_31>;
+ defm GLD1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0010, "ld1b", imm0_31>;
+ defm GLDFF1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0011, "ldff1b", imm0_31>;
+ defm GLD1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0100, "ld1sh", uimm5s2>;
+ defm GLDFF1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0101, "ldff1sh", uimm5s2>;
+ defm GLD1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0110, "ld1h", uimm5s2>;
+ defm GLDFF1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0111, "ldff1h", uimm5s2>;
+ defm GLD1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1000, "ld1sw", uimm5s4>;
+ defm GLDFF1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1001, "ldff1sw", uimm5s4>;
+ defm GLD1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1010, "ld1w", uimm5s4>;
+ defm GLDFF1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1011, "ldff1w", uimm5s4>;
+ defm GLD1D : sve_mem_64b_gld_vi_64_ptrs<0b1110, "ld1d", uimm5s8>;
+ defm GLDFF1D : sve_mem_64b_gld_vi_64_ptrs<0b1111, "ldff1d", uimm5s8>;
+
+ // Gathers using unscaled 64-bit offsets, e.g.
+ // ld1h z0.d, p0/z, [x0, z0.d]
+ defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb">;
+ defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb">;
+ defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b">;
+ defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b">;
+ defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh">;
+ defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh">;
+ defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h">;
+ defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h">;
+ defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw">;
+ defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw">;
+ defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w">;
+ defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w">;
+ defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d">;
+ defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d">;
+
+ // Gathers using scaled 64-bit offsets, e.g.
+ // ld1h z0.d, p0/z, [x0, z0.d, lsl #1]
+ defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", ZPR64ExtLSL16>;
+ defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", ZPR64ExtLSL16>;
+ defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", ZPR64ExtLSL16>;
+ defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", ZPR64ExtLSL16>;
+ defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", ZPR64ExtLSL32>;
+ defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", ZPR64ExtLSL32>;
+ defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", ZPR64ExtLSL32>;
+ defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", ZPR64ExtLSL32>;
+ defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", ZPR64ExtLSL64>;
+ defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", ZPR64ExtLSL64>;
+
+ // Gathers using unscaled 32-bit offsets unpacked in 64-bit elements, e.g.
+ // ld1h z0.d, p0/z, [x0, z0.d, uxtw]
+ defm GLD1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0000, "ld1sb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
+ defm GLDFF1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0001, "ldff1sb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
+ defm GLD1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0010, "ld1b", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
+ defm GLDFF1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0011, "ldff1b", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
+ defm GLD1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0100, "ld1sh", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
+ defm GLDFF1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0101, "ldff1sh", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
+ defm GLD1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0110, "ld1h", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
+ defm GLDFF1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0111, "ldff1h", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
+ defm GLD1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1000, "ld1sw", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
+ defm GLDFF1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1001, "ldff1sw", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
+ defm GLD1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1010, "ld1w", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
+ defm GLDFF1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1011, "ldff1w", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
+ defm GLD1D : sve_mem_64b_gld_vs_32_unscaled<0b1110, "ld1d", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
+ defm GLDFF1D : sve_mem_64b_gld_vs_32_unscaled<0b1111, "ldff1d", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
+
+ // Gathers using scaled 32-bit offsets unpacked in 64-bit elements, e.g.
+ // ld1h z0.d, p0/z, [x0, z0.d, uxtw #1]
+ defm GLD1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0100, "ld1sh", ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
+ defm GLDFF1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0101, "ldff1sh", ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
+ defm GLD1H_D : sve_mem_64b_gld_sv_32_scaled<0b0110, "ld1h", ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
+ defm GLDFF1H_D : sve_mem_64b_gld_sv_32_scaled<0b0111, "ldff1h", ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
+ defm GLD1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1000, "ld1sw", ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
+ defm GLDFF1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1001, "ldff1sw", ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
+ defm GLD1W_D : sve_mem_64b_gld_sv_32_scaled<0b1010, "ld1w", ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
+ defm GLDFF1W_D : sve_mem_64b_gld_sv_32_scaled<0b1011, "ldff1w", ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
+ defm GLD1D : sve_mem_64b_gld_sv_32_scaled<0b1110, "ld1d", ZPR64ExtSXTW64, ZPR64ExtUXTW64>;
+ defm GLDFF1D : sve_mem_64b_gld_sv_32_scaled<0b1111, "ldff1d", ZPR64ExtSXTW64, ZPR64ExtUXTW64>;
+
+ // Non-temporal contiguous loads (register + immediate)
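+ // e.g. ldnt1b z0.b, p0/z, [x0, #1, mul vl]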
+ defm LDNT1B_ZRI : sve_mem_cldnt_si<0b00, "ldnt1b", Z_b, ZPR8>;
+ defm LDNT1H_ZRI : sve_mem_cldnt_si<0b01, "ldnt1h", Z_h, ZPR16>;
+ defm LDNT1W_ZRI : sve_mem_cldnt_si<0b10, "ldnt1w", Z_s, ZPR32>;
+ defm LDNT1D_ZRI : sve_mem_cldnt_si<0b11, "ldnt1d", Z_d, ZPR64>;
+
+ // Non-temporal contiguous loads (register + register)
+ defm LDNT1B_ZRR : sve_mem_cldnt_ss<0b00, "ldnt1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
+ defm LDNT1H_ZRR : sve_mem_cldnt_ss<0b01, "ldnt1h", Z_h, ZPR16, GPR64NoXZRshifted16>;
+ defm LDNT1W_ZRR : sve_mem_cldnt_ss<0b10, "ldnt1w", Z_s, ZPR32, GPR64NoXZRshifted32>;
+ defm LDNT1D_ZRR : sve_mem_cldnt_ss<0b11, "ldnt1d", Z_d, ZPR64, GPR64NoXZRshifted64>;
+
+ // contiguous store with immediates
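+ // e.g. st1b z0.b, p0, [x0, #1, mul vl]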
+ defm ST1B_IMM : sve_mem_cst_si<0b00, 0b00, "st1b", Z_b, ZPR8>;
+ defm ST1B_H_IMM : sve_mem_cst_si<0b00, 0b01, "st1b", Z_h, ZPR16>;
+ defm ST1B_S_IMM : sve_mem_cst_si<0b00, 0b10, "st1b", Z_s, ZPR32>;
+ defm ST1B_D_IMM : sve_mem_cst_si<0b00, 0b11, "st1b", Z_d, ZPR64>;
+ defm ST1H_IMM : sve_mem_cst_si<0b01, 0b01, "st1h", Z_h, ZPR16>;
+ defm ST1H_S_IMM : sve_mem_cst_si<0b01, 0b10, "st1h", Z_s, ZPR32>;
+ defm ST1H_D_IMM : sve_mem_cst_si<0b01, 0b11, "st1h", Z_d, ZPR64>;
+ defm ST1W_IMM : sve_mem_cst_si<0b10, 0b10, "st1w", Z_s, ZPR32>;
+ defm ST1W_D_IMM : sve_mem_cst_si<0b10, 0b11, "st1w", Z_d, ZPR64>;
+ defm ST1D_IMM : sve_mem_cst_si<0b11, 0b11, "st1d", Z_d, ZPR64>;
+
+ // contiguous store with reg+reg addressing.
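+ // e.g. st1w z0.s, p0, [x0, x1, lsl #2]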
+ defm ST1B : sve_mem_cst_ss<0b0000, "st1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
+ defm ST1B_H : sve_mem_cst_ss<0b0001, "st1b", Z_h, ZPR16, GPR64NoXZRshifted8>;
+ defm ST1B_S : sve_mem_cst_ss<0b0010, "st1b", Z_s, ZPR32, GPR64NoXZRshifted8>;
+ defm ST1B_D : sve_mem_cst_ss<0b0011, "st1b", Z_d, ZPR64, GPR64NoXZRshifted8>;
+ defm ST1H : sve_mem_cst_ss<0b0101, "st1h", Z_h, ZPR16, GPR64NoXZRshifted16>;
+ defm ST1H_S : sve_mem_cst_ss<0b0110, "st1h", Z_s, ZPR32, GPR64NoXZRshifted16>;
+ defm ST1H_D : sve_mem_cst_ss<0b0111, "st1h", Z_d, ZPR64, GPR64NoXZRshifted16>;
+ defm ST1W : sve_mem_cst_ss<0b1010, "st1w", Z_s, ZPR32, GPR64NoXZRshifted32>;
+ defm ST1W_D : sve_mem_cst_ss<0b1011, "st1w", Z_d, ZPR64, GPR64NoXZRshifted32>;
+ defm ST1D : sve_mem_cst_ss<0b1111, "st1d", Z_d, ZPR64, GPR64NoXZRshifted64>;
+
+ // Scatters using unscaled 32-bit offsets, e.g.
+ // st1h z0.s, p0, [x0, z0.s, uxtw]
+ // and unpacked:
+ // st1h z0.d, p0, [x0, z0.d, uxtw]
+ defm SST1B_D : sve_mem_sst_sv_32_unscaled<0b000, "st1b", Z_d, ZPR64, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
+ defm SST1B_S : sve_mem_sst_sv_32_unscaled<0b001, "st1b", Z_s, ZPR32, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
+ defm SST1H_D : sve_mem_sst_sv_32_unscaled<0b010, "st1h", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
+ defm SST1H_S : sve_mem_sst_sv_32_unscaled<0b011, "st1h", Z_s, ZPR32, ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
+ defm SST1W_D : sve_mem_sst_sv_32_unscaled<0b100, "st1w", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
+ defm SST1W : sve_mem_sst_sv_32_unscaled<0b101, "st1w", Z_s, ZPR32, ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
+ defm SST1D : sve_mem_sst_sv_32_unscaled<0b110, "st1d", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
+
+ // Scatters using scaled 32-bit offsets, e.g.
+ // st1h z0.s, p0, [x0, z0.s, uxtw #1]
+ // and unpacked:
+ // st1h z0.d, p0, [x0, z0.d, uxtw #1]
+ defm SST1H_D : sve_mem_sst_sv_32_scaled<0b010, "st1h", Z_d, ZPR64, ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
+ defm SST1H_S : sve_mem_sst_sv_32_scaled<0b011, "st1h", Z_s, ZPR32, ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
+ defm SST1W_D : sve_mem_sst_sv_32_scaled<0b100, "st1w", Z_d, ZPR64, ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
+ defm SST1W : sve_mem_sst_sv_32_scaled<0b101, "st1w", Z_s, ZPR32, ZPR32ExtSXTW32, ZPR32ExtUXTW32>;
+ defm SST1D : sve_mem_sst_sv_32_scaled<0b110, "st1d", Z_d, ZPR64, ZPR64ExtSXTW64, ZPR64ExtUXTW64>;
+
+ // Scatters using 32/64-bit pointers with offset, e.g.
+ // st1h z0.s, p0, [z0.s, #16]
+ // st1h z0.d, p0, [z0.d, #16]
+ defm SST1B_D : sve_mem_sst_vi_ptrs<0b000, "st1b", Z_d, ZPR64, imm0_31>;
+ defm SST1B_S : sve_mem_sst_vi_ptrs<0b001, "st1b", Z_s, ZPR32, imm0_31>;
+ defm SST1H_D : sve_mem_sst_vi_ptrs<0b010, "st1h", Z_d, ZPR64, uimm5s2>;
+ defm SST1H_S : sve_mem_sst_vi_ptrs<0b011, "st1h", Z_s, ZPR32, uimm5s2>;
+ defm SST1W_D : sve_mem_sst_vi_ptrs<0b100, "st1w", Z_d, ZPR64, uimm5s4>;
+ defm SST1W : sve_mem_sst_vi_ptrs<0b101, "st1w", Z_s, ZPR32, uimm5s4>;
+ defm SST1D : sve_mem_sst_vi_ptrs<0b110, "st1d", Z_d, ZPR64, uimm5s8>;
+
+ // Scatters using unscaled 64-bit offsets, e.g.
+ // st1h z0.d, p0, [x0, z0.d]
+ defm SST1B_D : sve_mem_sst_sv_64_unscaled<0b00, "st1b">;
+ defm SST1H_D : sve_mem_sst_sv_64_unscaled<0b01, "st1h">;
+ defm SST1W_D : sve_mem_sst_sv_64_unscaled<0b10, "st1w">;
+ defm SST1D : sve_mem_sst_sv_64_unscaled<0b11, "st1d">;
+
+ // Scatters using scaled 64-bit offsets, e.g.
+ // st1h z0.d, p0, [x0, z0.d, lsl #1]
+ defm SST1H_D_SCALED : sve_mem_sst_sv_64_scaled<0b01, "st1h", ZPR64ExtLSL16>;
+ defm SST1W_D_SCALED : sve_mem_sst_sv_64_scaled<0b10, "st1w", ZPR64ExtLSL32>;
+ defm SST1D_SCALED : sve_mem_sst_sv_64_scaled<0b11, "st1d", ZPR64ExtLSL64>;
+
+ // ST(2|3|4) structured stores (register + immediate)
+ defm ST2B_IMM : sve_mem_est_si<0b00, 0b01, ZZ_b, "st2b", simm4s2>;
+ defm ST3B_IMM : sve_mem_est_si<0b00, 0b10, ZZZ_b, "st3b", simm4s3>;
+ defm ST4B_IMM : sve_mem_est_si<0b00, 0b11, ZZZZ_b, "st4b", simm4s4>;
+ defm ST2H_IMM : sve_mem_est_si<0b01, 0b01, ZZ_h, "st2h", simm4s2>;
+ defm ST3H_IMM : sve_mem_est_si<0b01, 0b10, ZZZ_h, "st3h", simm4s3>;
+ defm ST4H_IMM : sve_mem_est_si<0b01, 0b11, ZZZZ_h, "st4h", simm4s4>;
+ defm ST2W_IMM : sve_mem_est_si<0b10, 0b01, ZZ_s, "st2w", simm4s2>;
+ defm ST3W_IMM : sve_mem_est_si<0b10, 0b10, ZZZ_s, "st3w", simm4s3>;
+ defm ST4W_IMM : sve_mem_est_si<0b10, 0b11, ZZZZ_s, "st4w", simm4s4>;
+ defm ST2D_IMM : sve_mem_est_si<0b11, 0b01, ZZ_d, "st2d", simm4s2>;
+ defm ST3D_IMM : sve_mem_est_si<0b11, 0b10, ZZZ_d, "st3d", simm4s3>;
+ defm ST4D_IMM : sve_mem_est_si<0b11, 0b11, ZZZZ_d, "st4d", simm4s4>;
+
+ // ST(2|3|4) structured stores (register + register)
+ def ST2B : sve_mem_est_ss<0b00, 0b01, ZZ_b, "st2b", GPR64NoXZRshifted8>;
+ def ST3B : sve_mem_est_ss<0b00, 0b10, ZZZ_b, "st3b", GPR64NoXZRshifted8>;
+ def ST4B : sve_mem_est_ss<0b00, 0b11, ZZZZ_b, "st4b", GPR64NoXZRshifted8>;
+ def ST2H : sve_mem_est_ss<0b01, 0b01, ZZ_h, "st2h", GPR64NoXZRshifted16>;
+ def ST3H : sve_mem_est_ss<0b01, 0b10, ZZZ_h, "st3h", GPR64NoXZRshifted16>;
+ def ST4H : sve_mem_est_ss<0b01, 0b11, ZZZZ_h, "st4h", GPR64NoXZRshifted16>;
+ def ST2W : sve_mem_est_ss<0b10, 0b01, ZZ_s, "st2w", GPR64NoXZRshifted32>;
+ def ST3W : sve_mem_est_ss<0b10, 0b10, ZZZ_s, "st3w", GPR64NoXZRshifted32>;
+ def ST4W : sve_mem_est_ss<0b10, 0b11, ZZZZ_s, "st4w", GPR64NoXZRshifted32>;
+ def ST2D : sve_mem_est_ss<0b11, 0b01, ZZ_d, "st2d", GPR64NoXZRshifted64>;
+ def ST3D : sve_mem_est_ss<0b11, 0b10, ZZZ_d, "st3d", GPR64NoXZRshifted64>;
+ def ST4D : sve_mem_est_ss<0b11, 0b11, ZZZZ_d, "st4d", GPR64NoXZRshifted64>;
+
+ // Non-temporal contiguous stores (register + immediate)
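+ // e.g. stnt1b z0.b, p0, [x0, #1, mul vl]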
+ defm STNT1B_ZRI : sve_mem_cstnt_si<0b00, "stnt1b", Z_b, ZPR8>;
+ defm STNT1H_ZRI : sve_mem_cstnt_si<0b01, "stnt1h", Z_h, ZPR16>;
+ defm STNT1W_ZRI : sve_mem_cstnt_si<0b10, "stnt1w", Z_s, ZPR32>;
+ defm STNT1D_ZRI : sve_mem_cstnt_si<0b11, "stnt1d", Z_d, ZPR64>;
+
+ // Non-temporal contiguous stores (register + register)
+ defm STNT1B_ZRR : sve_mem_cstnt_ss<0b00, "stnt1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
+ defm STNT1H_ZRR : sve_mem_cstnt_ss<0b01, "stnt1h", Z_h, ZPR16, GPR64NoXZRshifted16>;
+ defm STNT1W_ZRR : sve_mem_cstnt_ss<0b10, "stnt1w", Z_s, ZPR32, GPR64NoXZRshifted32>;
+ defm STNT1D_ZRR : sve_mem_cstnt_ss<0b11, "stnt1d", Z_d, ZPR64, GPR64NoXZRshifted64>;
+
+ // Fill/Spill
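+ // e.g. ldr z0, [x0, #1, mul vl]
+ //      str p0, [x0, #7, mul vl]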
+ defm LDR_ZXI : sve_mem_z_fill<"ldr">;
+ defm LDR_PXI : sve_mem_p_fill<"ldr">;
+ defm STR_ZXI : sve_mem_z_spill<"str">;
+ defm STR_PXI : sve_mem_p_spill<"str">;
+
+ // Contiguous prefetch (register + immediate)
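+ // e.g. prfb pldl1keep, p0, [x0, #1, mul vl]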
+ defm PRFB_PRI : sve_mem_prfm_si<0b00, "prfb">;
+ defm PRFH_PRI : sve_mem_prfm_si<0b01, "prfh">;
+ defm PRFW_PRI : sve_mem_prfm_si<0b10, "prfw">;
+ defm PRFD_PRI : sve_mem_prfm_si<0b11, "prfd">;
+
+ // Contiguous prefetch (register + register)
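+ // e.g. prfw pldl1keep, p0, [x0, x1, lsl #2]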
+ def PRFB_PRR : sve_mem_prfm_ss<0b001, "prfb", GPR64NoXZRshifted8>;
+ def PRFH_PRR : sve_mem_prfm_ss<0b011, "prfh", GPR64NoXZRshifted16>;
+ def PRFS_PRR : sve_mem_prfm_ss<0b101, "prfw", GPR64NoXZRshifted32>;
+ def PRFD_PRR : sve_mem_prfm_ss<0b111, "prfd", GPR64NoXZRshifted64>;
+
+ // Gather prefetch using scaled 32-bit offsets, e.g.
+ // prfh pldl1keep, p0, [x0, z0.s, uxtw #1]
+ defm PRFB_S : sve_mem_32b_prfm_sv_scaled<0b00, "prfb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
+ defm PRFH_S : sve_mem_32b_prfm_sv_scaled<0b01, "prfh", ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
+ defm PRFW_S : sve_mem_32b_prfm_sv_scaled<0b10, "prfw", ZPR32ExtSXTW32, ZPR32ExtUXTW32>;
+ defm PRFD_S : sve_mem_32b_prfm_sv_scaled<0b11, "prfd", ZPR32ExtSXTW64, ZPR32ExtUXTW64>;
+
+ // Gather prefetch using unpacked, scaled 32-bit offsets, e.g.
+ // prfh pldl1keep, p0, [x0, z0.d, uxtw #1]
+ defm PRFB_D : sve_mem_64b_prfm_sv_ext_scaled<0b00, "prfb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
+ defm PRFH_D : sve_mem_64b_prfm_sv_ext_scaled<0b01, "prfh", ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
+ defm PRFW_D : sve_mem_64b_prfm_sv_ext_scaled<0b10, "prfw", ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
+ defm PRFD_D : sve_mem_64b_prfm_sv_ext_scaled<0b11, "prfd", ZPR64ExtSXTW64, ZPR64ExtUXTW64>;
+
+ // Gather prefetch using scaled 64-bit offsets, e.g.
+ // prfh pldl1keep, p0, [x0, z0.d, lsl #1]
+ defm PRFB_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b00, "prfb", ZPR64ExtLSL8>;
+ defm PRFH_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b01, "prfh", ZPR64ExtLSL16>;
+ defm PRFW_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b10, "prfw", ZPR64ExtLSL32>;
+ defm PRFD_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b11, "prfd", ZPR64ExtLSL64>;
+
+ // Gather prefetch using 32/64-bit pointers with offset, e.g.
+ // prfh pldl1keep, p0, [z0.s, #16]
+ // prfh pldl1keep, p0, [z0.d, #16]
+ defm PRFB_S_PZI : sve_mem_32b_prfm_vi<0b00, "prfb", imm0_31>;
+ defm PRFH_S_PZI : sve_mem_32b_prfm_vi<0b01, "prfh", uimm5s2>;
+ defm PRFW_S_PZI : sve_mem_32b_prfm_vi<0b10, "prfw", uimm5s4>;
+ defm PRFD_S_PZI : sve_mem_32b_prfm_vi<0b11, "prfd", uimm5s8>;
+
+ defm PRFB_D_PZI : sve_mem_64b_prfm_vi<0b00, "prfb", imm0_31>;
+ defm PRFH_D_PZI : sve_mem_64b_prfm_vi<0b01, "prfh", uimm5s2>;
+ defm PRFW_D_PZI : sve_mem_64b_prfm_vi<0b10, "prfw", uimm5s4>;
+ defm PRFD_D_PZI : sve_mem_64b_prfm_vi<0b11, "prfd", uimm5s8>;
+
+ defm ADR_SXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_sxtw<0b00, "adr">;
+ defm ADR_UXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_uxtw<0b01, "adr">;
+ defm ADR_LSL_ZZZ_S : sve_int_bin_cons_misc_0_a_32_lsl<0b10, "adr">;
+ defm ADR_LSL_ZZZ_D : sve_int_bin_cons_misc_0_a_64_lsl<0b11, "adr">;
+
+ defm TBL_ZZZ : sve_int_perm_tbl<"tbl">;
defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1">;
defm ZIP2_ZZZ : sve_int_perm_bin_perm_zz<0b001, "zip2">;
+ defm UZP1_ZZZ : sve_int_perm_bin_perm_zz<0b010, "uzp1">;
+ defm UZP2_ZZZ : sve_int_perm_bin_perm_zz<0b011, "uzp2">;
+ defm TRN1_ZZZ : sve_int_perm_bin_perm_zz<0b100, "trn1">;
+ defm TRN2_ZZZ : sve_int_perm_bin_perm_zz<0b101, "trn2">;
defm ZIP1_PPP : sve_int_perm_bin_perm_pp<0b000, "zip1">;
defm ZIP2_PPP : sve_int_perm_bin_perm_pp<0b001, "zip2">;
+ defm UZP1_PPP : sve_int_perm_bin_perm_pp<0b010, "uzp1">;
+ defm UZP2_PPP : sve_int_perm_bin_perm_pp<0b011, "uzp2">;
+ defm TRN1_PPP : sve_int_perm_bin_perm_pp<0b100, "trn1">;
+ defm TRN2_PPP : sve_int_perm_bin_perm_pp<0b101, "trn2">;
+
+ defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs">;
+ defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi">;
+ defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge">;
+ defm CMPGT_PPzZZ : sve_int_cmp_0<0b101, "cmpgt">;
+ defm CMPEQ_PPzZZ : sve_int_cmp_0<0b110, "cmpeq">;
+ defm CMPNE_PPzZZ : sve_int_cmp_0<0b111, "cmpne">;
+
+ defm CMPEQ_WIDE_PPzZZ : sve_int_cmp_0_wide<0b010, "cmpeq">;
+ defm CMPNE_WIDE_PPzZZ : sve_int_cmp_0_wide<0b011, "cmpne">;
+ defm CMPGE_WIDE_PPzZZ : sve_int_cmp_1_wide<0b000, "cmpge">;
+ defm CMPGT_WIDE_PPzZZ : sve_int_cmp_1_wide<0b001, "cmpgt">;
+ defm CMPLT_WIDE_PPzZZ : sve_int_cmp_1_wide<0b010, "cmplt">;
+ defm CMPLE_WIDE_PPzZZ : sve_int_cmp_1_wide<0b011, "cmple">;
+ defm CMPHS_WIDE_PPzZZ : sve_int_cmp_1_wide<0b100, "cmphs">;
+ defm CMPHI_WIDE_PPzZZ : sve_int_cmp_1_wide<0b101, "cmphi">;
+ defm CMPLO_WIDE_PPzZZ : sve_int_cmp_1_wide<0b110, "cmplo">;
+ defm CMPLS_WIDE_PPzZZ : sve_int_cmp_1_wide<0b111, "cmpls">;
+
+ defm CMPGE_PPzZI : sve_int_scmp_vi<0b000, "cmpge">;
+ defm CMPGT_PPzZI : sve_int_scmp_vi<0b001, "cmpgt">;
+ defm CMPLT_PPzZI : sve_int_scmp_vi<0b010, "cmplt">;
+ defm CMPLE_PPzZI : sve_int_scmp_vi<0b011, "cmple">;
+ defm CMPEQ_PPzZI : sve_int_scmp_vi<0b100, "cmpeq">;
+ defm CMPNE_PPzZI : sve_int_scmp_vi<0b101, "cmpne">;
+ defm CMPHS_PPzZI : sve_int_ucmp_vi<0b00, "cmphs">;
+ defm CMPHI_PPzZI : sve_int_ucmp_vi<0b01, "cmphi">;
+ defm CMPLO_PPzZI : sve_int_ucmp_vi<0b10, "cmplo">;
+ defm CMPLS_PPzZI : sve_int_ucmp_vi<0b11, "cmpls">;
+
+ defm FCMGE_PPzZZ : sve_fp_3op_p_pd<0b000, "fcmge">;
+ defm FCMGT_PPzZZ : sve_fp_3op_p_pd<0b001, "fcmgt">;
+ defm FCMEQ_PPzZZ : sve_fp_3op_p_pd<0b010, "fcmeq">;
+ defm FCMNE_PPzZZ : sve_fp_3op_p_pd<0b011, "fcmne">;
+ defm FCMUO_PPzZZ : sve_fp_3op_p_pd<0b100, "fcmuo">;
+ defm FACGE_PPzZZ : sve_fp_3op_p_pd<0b101, "facge">;
+ defm FACGT_PPzZZ : sve_fp_3op_p_pd<0b111, "facgt">;
+
+ defm FCMGE_PPzZ0 : sve_fp_2op_p_pd<0b000, "fcmge">;
+ defm FCMGT_PPzZ0 : sve_fp_2op_p_pd<0b001, "fcmgt">;
+ defm FCMLT_PPzZ0 : sve_fp_2op_p_pd<0b010, "fcmlt">;
+ defm FCMLE_PPzZ0 : sve_fp_2op_p_pd<0b011, "fcmle">;
+ defm FCMEQ_PPzZ0 : sve_fp_2op_p_pd<0b100, "fcmeq">;
+ defm FCMNE_PPzZ0 : sve_fp_2op_p_pd<0b110, "fcmne">;
+
+ def RDVLI_XI : sve_int_read_vl_a<0b0, 0b11111, "rdvl">;
+ def ADDVL_XXI : sve_int_arith_vl<0b0, "addvl">;
+ def ADDPL_XXI : sve_int_arith_vl<0b1, "addpl">;
+
+ defm CNTB_XPiI : sve_int_count<0b000, "cntb">;
+ defm CNTH_XPiI : sve_int_count<0b010, "cnth">;
+ defm CNTW_XPiI : sve_int_count<0b100, "cntw">;
+ defm CNTD_XPiI : sve_int_count<0b110, "cntd">;
+ defm CNTP_XPP : sve_int_pcount_pred<0b0000, "cntp">;
+
+ defm INCB_XPiI : sve_int_pred_pattern_a<0b000, "incb">;
+ defm DECB_XPiI : sve_int_pred_pattern_a<0b001, "decb">;
+ defm INCH_XPiI : sve_int_pred_pattern_a<0b010, "inch">;
+ defm DECH_XPiI : sve_int_pred_pattern_a<0b011, "dech">;
+ defm INCW_XPiI : sve_int_pred_pattern_a<0b100, "incw">;
+ defm DECW_XPiI : sve_int_pred_pattern_a<0b101, "decw">;
+ defm INCD_XPiI : sve_int_pred_pattern_a<0b110, "incd">;
+ defm DECD_XPiI : sve_int_pred_pattern_a<0b111, "decd">;
+
+ defm SQINCB_XPiWdI : sve_int_pred_pattern_b_s32<0b00000, "sqincb">;
+ defm UQINCB_WPiI : sve_int_pred_pattern_b_u32<0b00001, "uqincb">;
+ defm SQDECB_XPiWdI : sve_int_pred_pattern_b_s32<0b00010, "sqdecb">;
+ defm UQDECB_WPiI : sve_int_pred_pattern_b_u32<0b00011, "uqdecb">;
+ defm SQINCB_XPiI : sve_int_pred_pattern_b_x64<0b00100, "sqincb">;
+ defm UQINCB_XPiI : sve_int_pred_pattern_b_x64<0b00101, "uqincb">;
+ defm SQDECB_XPiI : sve_int_pred_pattern_b_x64<0b00110, "sqdecb">;
+ defm UQDECB_XPiI : sve_int_pred_pattern_b_x64<0b00111, "uqdecb">;
+
+ defm SQINCH_XPiWdI : sve_int_pred_pattern_b_s32<0b01000, "sqinch">;
+ defm UQINCH_WPiI : sve_int_pred_pattern_b_u32<0b01001, "uqinch">;
+ defm SQDECH_XPiWdI : sve_int_pred_pattern_b_s32<0b01010, "sqdech">;
+ defm UQDECH_WPiI : sve_int_pred_pattern_b_u32<0b01011, "uqdech">;
+ defm SQINCH_XPiI : sve_int_pred_pattern_b_x64<0b01100, "sqinch">;
+ defm UQINCH_XPiI : sve_int_pred_pattern_b_x64<0b01101, "uqinch">;
+ defm SQDECH_XPiI : sve_int_pred_pattern_b_x64<0b01110, "sqdech">;
+ defm UQDECH_XPiI : sve_int_pred_pattern_b_x64<0b01111, "uqdech">;
+
+ defm SQINCW_XPiWdI : sve_int_pred_pattern_b_s32<0b10000, "sqincw">;
+ defm UQINCW_WPiI : sve_int_pred_pattern_b_u32<0b10001, "uqincw">;
+ defm SQDECW_XPiWdI : sve_int_pred_pattern_b_s32<0b10010, "sqdecw">;
+ defm UQDECW_WPiI : sve_int_pred_pattern_b_u32<0b10011, "uqdecw">;
+ defm SQINCW_XPiI : sve_int_pred_pattern_b_x64<0b10100, "sqincw">;
+ defm UQINCW_XPiI : sve_int_pred_pattern_b_x64<0b10101, "uqincw">;
+ defm SQDECW_XPiI : sve_int_pred_pattern_b_x64<0b10110, "sqdecw">;
+ defm UQDECW_XPiI : sve_int_pred_pattern_b_x64<0b10111, "uqdecw">;
+
+ defm SQINCD_XPiWdI : sve_int_pred_pattern_b_s32<0b11000, "sqincd">;
+ defm UQINCD_WPiI : sve_int_pred_pattern_b_u32<0b11001, "uqincd">;
+ defm SQDECD_XPiWdI : sve_int_pred_pattern_b_s32<0b11010, "sqdecd">;
+ defm UQDECD_WPiI : sve_int_pred_pattern_b_u32<0b11011, "uqdecd">;
+ defm SQINCD_XPiI : sve_int_pred_pattern_b_x64<0b11100, "sqincd">;
+ defm UQINCD_XPiI : sve_int_pred_pattern_b_x64<0b11101, "uqincd">;
+ defm SQDECD_XPiI : sve_int_pred_pattern_b_x64<0b11110, "sqdecd">;
+ defm UQDECD_XPiI : sve_int_pred_pattern_b_x64<0b11111, "uqdecd">;
+
+ defm SQINCH_ZPiI : sve_int_countvlv<0b01000, "sqinch", ZPR16>;
+ defm UQINCH_ZPiI : sve_int_countvlv<0b01001, "uqinch", ZPR16>;
+ defm SQDECH_ZPiI : sve_int_countvlv<0b01010, "sqdech", ZPR16>;
+ defm UQDECH_ZPiI : sve_int_countvlv<0b01011, "uqdech", ZPR16>;
+ defm INCH_ZPiI : sve_int_countvlv<0b01100, "inch", ZPR16>;
+ defm DECH_ZPiI : sve_int_countvlv<0b01101, "dech", ZPR16>;
+ defm SQINCW_ZPiI : sve_int_countvlv<0b10000, "sqincw", ZPR32>;
+ defm UQINCW_ZPiI : sve_int_countvlv<0b10001, "uqincw", ZPR32>;
+ defm SQDECW_ZPiI : sve_int_countvlv<0b10010, "sqdecw", ZPR32>;
+ defm UQDECW_ZPiI : sve_int_countvlv<0b10011, "uqdecw", ZPR32>;
+ defm INCW_ZPiI : sve_int_countvlv<0b10100, "incw", ZPR32>;
+ defm DECW_ZPiI : sve_int_countvlv<0b10101, "decw", ZPR32>;
+ defm SQINCD_ZPiI : sve_int_countvlv<0b11000, "sqincd", ZPR64>;
+ defm UQINCD_ZPiI : sve_int_countvlv<0b11001, "uqincd", ZPR64>;
+ defm SQDECD_ZPiI : sve_int_countvlv<0b11010, "sqdecd", ZPR64>;
+ defm UQDECD_ZPiI : sve_int_countvlv<0b11011, "uqdecd", ZPR64>;
+ defm INCD_ZPiI : sve_int_countvlv<0b11100, "incd", ZPR64>;
+ defm DECD_ZPiI : sve_int_countvlv<0b11101, "decd", ZPR64>;
+
+ defm SQINCP_XPWd : sve_int_count_r_s32<0b00000, "sqincp">;
+ defm SQINCP_XP : sve_int_count_r_x64<0b00010, "sqincp">;
+ defm UQINCP_WP : sve_int_count_r_u32<0b00100, "uqincp">;
+ defm UQINCP_XP : sve_int_count_r_x64<0b00110, "uqincp">;
+ defm SQDECP_XPWd : sve_int_count_r_s32<0b01000, "sqdecp">;
+ defm SQDECP_XP : sve_int_count_r_x64<0b01010, "sqdecp">;
+ defm UQDECP_WP : sve_int_count_r_u32<0b01100, "uqdecp">;
+ defm UQDECP_XP : sve_int_count_r_x64<0b01110, "uqdecp">;
+ defm INCP_XP : sve_int_count_r_x64<0b10000, "incp">;
+ defm DECP_XP : sve_int_count_r_x64<0b10100, "decp">;
+
+ defm SQINCP_ZP : sve_int_count_v<0b00000, "sqincp">;
+ defm UQINCP_ZP : sve_int_count_v<0b00100, "uqincp">;
+ defm SQDECP_ZP : sve_int_count_v<0b01000, "sqdecp">;
+ defm UQDECP_ZP : sve_int_count_v<0b01100, "uqdecp">;
+ defm INCP_ZP : sve_int_count_v<0b10000, "incp">;
+ defm DECP_ZP : sve_int_count_v<0b10100, "decp">;
+
+ defm INDEX_RR : sve_int_index_rr<"index">;
+ defm INDEX_IR : sve_int_index_ir<"index">;
+ defm INDEX_RI : sve_int_index_ri<"index">;
+ defm INDEX_II : sve_int_index_ii<"index">;
+
+ // Unpredicated shifts
+ defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr">;
+ defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr">;
+ defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl">;
+
+ defm ASR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b00, "asr">;
+ defm LSR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b01, "lsr">;
+ defm LSL_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b11, "lsl">;
+
+ // Predicated shifts
+ defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right<0b000, "asr">;
+ defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right<0b001, "lsr">;
+ defm LSL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b011, "lsl">;
+ defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b100, "asrd">;
+
+ defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr">;
+ defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr">;
+ defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl">;
+ defm ASRR_ZPmZ : sve_int_bin_pred_shift<0b100, "asrr">;
+ defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr">;
+ defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr">;
+
+ defm ASR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b000, "asr">;
+ defm LSR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b001, "lsr">;
+ defm LSL_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b011, "lsl">;
+
+ def FCVT_ZPmZ_StoH : sve_fp_2op_p_zd<0b1001000, "fcvt", ZPR32, ZPR16>;
+ def FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd<0b1001001, "fcvt", ZPR16, ZPR32>;
+ def SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110010, "scvtf", ZPR16, ZPR16>;
+ def SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010100, "scvtf", ZPR32, ZPR32>;
+ def UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010101, "ucvtf", ZPR32, ZPR32>;
+ def UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110011, "ucvtf", ZPR16, ZPR16>;
+ def FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111010, "fcvtzs", ZPR16, ZPR16>;
+ def FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011100, "fcvtzs", ZPR32, ZPR32>;
+ def FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111011, "fcvtzu", ZPR16, ZPR16>;
+ def FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011101, "fcvtzu", ZPR32, ZPR32>;
+ def FCVT_ZPmZ_DtoH : sve_fp_2op_p_zd<0b1101000, "fcvt", ZPR64, ZPR16>;
+ def FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd<0b1101001, "fcvt", ZPR16, ZPR64>;
+ def FCVT_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1101010, "fcvt", ZPR64, ZPR32>;
+ def FCVT_ZPmZ_StoD : sve_fp_2op_p_zd<0b1101011, "fcvt", ZPR32, ZPR64>;
+ def SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110000, "scvtf", ZPR32, ZPR64>;
+ def UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110001, "ucvtf", ZPR32, ZPR64>;
+ def UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110101, "ucvtf", ZPR32, ZPR16>;
+ def SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110100, "scvtf", ZPR64, ZPR32>;
+ def SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110100, "scvtf", ZPR32, ZPR16>;
+ def SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110110, "scvtf", ZPR64, ZPR16>;
+ def UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110101, "ucvtf", ZPR64, ZPR32>;
+ def UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110111, "ucvtf", ZPR64, ZPR16>;
+ def SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110110, "scvtf", ZPR64, ZPR64>;
+ def UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110111, "ucvtf", ZPR64, ZPR64>;
+ def FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111000, "fcvtzs", ZPR64, ZPR32>;
+ def FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111001, "fcvtzu", ZPR64, ZPR32>;
+ def FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111100, "fcvtzs", ZPR32, ZPR64>;
+ def FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111100, "fcvtzs", ZPR16, ZPR32>;
+ def FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111110, "fcvtzs", ZPR16, ZPR64>;
+ def FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111101, "fcvtzu", ZPR16, ZPR32>;
+ def FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111111, "fcvtzu", ZPR16, ZPR64>;
+ def FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111101, "fcvtzu", ZPR32, ZPR64>;
+ def FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111110, "fcvtzs", ZPR64, ZPR64>;
+ def FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111111, "fcvtzu", ZPR64, ZPR64>;
+
+ defm FRINTN_ZPmZ : sve_fp_2op_p_zd_HSD<0b00000, "frintn">;
+ defm FRINTP_ZPmZ : sve_fp_2op_p_zd_HSD<0b00001, "frintp">;
+ defm FRINTM_ZPmZ : sve_fp_2op_p_zd_HSD<0b00010, "frintm">;
+ defm FRINTZ_ZPmZ : sve_fp_2op_p_zd_HSD<0b00011, "frintz">;
+ defm FRINTA_ZPmZ : sve_fp_2op_p_zd_HSD<0b00100, "frinta">;
+ defm FRINTX_ZPmZ : sve_fp_2op_p_zd_HSD<0b00110, "frintx">;
+ defm FRINTI_ZPmZ : sve_fp_2op_p_zd_HSD<0b00111, "frinti">;
+ defm FRECPX_ZPmZ : sve_fp_2op_p_zd_HSD<0b01100, "frecpx">;
+ defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt">;
+
+ // InstAliases
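+ // e.g. "mov z0.d, z1.d" is parsed and printed as "orr z0.d, z1.d, z1.d".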
+ def : InstAlias<"mov $Zd, $Zn",
+ (ORR_ZZZ ZPR64:$Zd, ZPR64:$Zn, ZPR64:$Zn), 1>;
+ def : InstAlias<"mov $Pd, $Pg/m, $Pn",
+ (SEL_PPPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPR8:$Pd), 1>;
+ def : InstAlias<"mov $Pd, $Pn",
+ (ORR_PPzPP PPR8:$Pd, PPR8:$Pn, PPR8:$Pn, PPR8:$Pn), 1>;
+ def : InstAlias<"mov $Pd, $Pg/z, $Pn",
+ (AND_PPzPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPR8:$Pn), 1>;
+
+ def : InstAlias<"movs $Pd, $Pn",
+ (ORRS_PPzPP PPR8:$Pd, PPR8:$Pn, PPR8:$Pn, PPR8:$Pn), 1>;
+ def : InstAlias<"movs $Pd, $Pg/z, $Pn",
+ (ANDS_PPzPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPR8:$Pn), 1>;
+
+ def : InstAlias<"not $Pd, $Pg/z, $Pn",
+ (EOR_PPzPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPRAny:$Pg), 1>;
+
+ def : InstAlias<"nots $Pd, $Pg/z, $Pn",
+ (EORS_PPzPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPRAny:$Pg), 1>;
+
+ def : InstAlias<"cmple $Zd, $Pg/z, $Zm, $Zn",
+ (CMPGE_PPzZZ_B PPR8:$Zd, PPR3bAny:$Pg, ZPR8:$Zn, ZPR8:$Zm), 0>;
+ def : InstAlias<"cmple $Zd, $Pg/z, $Zm, $Zn",
+ (CMPGE_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
+ def : InstAlias<"cmple $Zd, $Pg/z, $Zm, $Zn",
+ (CMPGE_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
+ def : InstAlias<"cmple $Zd, $Pg/z, $Zm, $Zn",
+ (CMPGE_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
+
+ def : InstAlias<"cmplo $Zd, $Pg/z, $Zm, $Zn",
+ (CMPHI_PPzZZ_B PPR8:$Zd, PPR3bAny:$Pg, ZPR8:$Zn, ZPR8:$Zm), 0>;
+ def : InstAlias<"cmplo $Zd, $Pg/z, $Zm, $Zn",
+ (CMPHI_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
+ def : InstAlias<"cmplo $Zd, $Pg/z, $Zm, $Zn",
+ (CMPHI_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
+ def : InstAlias<"cmplo $Zd, $Pg/z, $Zm, $Zn",
+ (CMPHI_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
+
+ def : InstAlias<"cmpls $Zd, $Pg/z, $Zm, $Zn",
+ (CMPHS_PPzZZ_B PPR8:$Zd, PPR3bAny:$Pg, ZPR8:$Zn, ZPR8:$Zm), 0>;
+ def : InstAlias<"cmpls $Zd, $Pg/z, $Zm, $Zn",
+ (CMPHS_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
+ def : InstAlias<"cmpls $Zd, $Pg/z, $Zm, $Zn",
+ (CMPHS_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
+ def : InstAlias<"cmpls $Zd, $Pg/z, $Zm, $Zn",
+ (CMPHS_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
+
+ def : InstAlias<"cmplt $Zd, $Pg/z, $Zm, $Zn",
+ (CMPGT_PPzZZ_B PPR8:$Zd, PPR3bAny:$Pg, ZPR8:$Zn, ZPR8:$Zm), 0>;
+ def : InstAlias<"cmplt $Zd, $Pg/z, $Zm, $Zn",
+ (CMPGT_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
+ def : InstAlias<"cmplt $Zd, $Pg/z, $Zm, $Zn",
+ (CMPGT_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
+ def : InstAlias<"cmplt $Zd, $Pg/z, $Zm, $Zn",
+ (CMPGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
+
+ def : InstAlias<"facle $Zd, $Pg/z, $Zm, $Zn",
+ (FACGE_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
+ def : InstAlias<"facle $Zd, $Pg/z, $Zm, $Zn",
+ (FACGE_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
+ def : InstAlias<"facle $Zd, $Pg/z, $Zm, $Zn",
+ (FACGE_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
+
+ def : InstAlias<"faclt $Zd, $Pg/z, $Zm, $Zn",
+ (FACGT_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
+ def : InstAlias<"faclt $Zd, $Pg/z, $Zm, $Zn",
+ (FACGT_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
+ def : InstAlias<"faclt $Zd, $Pg/z, $Zm, $Zn",
+ (FACGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
+
+ def : InstAlias<"fcmle $Zd, $Pg/z, $Zm, $Zn",
+ (FCMGE_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
+ def : InstAlias<"fcmle $Zd, $Pg/z, $Zm, $Zn",
+ (FCMGE_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
+ def : InstAlias<"fcmle $Zd, $Pg/z, $Zm, $Zn",
+ (FCMGE_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
+
+ def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn",
+ (FCMGT_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
+ def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn",
+ (FCMGT_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
+ def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn",
+ (FCMGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
}
diff --git a/lib/Target/AArch64/AArch64SchedA53.td b/lib/Target/AArch64/AArch64SchedA53.td
index 90ebd78f4ab9..f253a4f3e25a 100644
--- a/lib/Target/AArch64/AArch64SchedA53.td
+++ b/lib/Target/AArch64/AArch64SchedA53.td
@@ -222,19 +222,19 @@ def : InstRW<[A53WriteVLD1], (instregex "LD2i(8|16|32|64)$")>;
def : InstRW<[A53WriteVLD1], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[A53WriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>;
def : InstRW<[A53WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2i(8|16|32|64)(_POST)?$")>;
-def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
-def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>;
-def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>;
def : InstRW<[A53WriteVLD2], (instregex "LD3i(8|16|32|64)$")>;
def : InstRW<[A53WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[A53WriteVLD4], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)$")>;
-def : InstRW<[A53WriteVLD3], (instregex "LD3Threev(2d)$")>;
+def : InstRW<[A53WriteVLD3], (instregex "LD3Threev2d$")>;
def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>;
def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
-def : InstRW<[A53WriteVLD3, WriteAdr], (instregex "LD3Threev(2d)_POST$")>;
+def : InstRW<[A53WriteVLD3, WriteAdr], (instregex "LD3Threev2d_POST$")>;
def : InstRW<[A53WriteVLD2], (instregex "LD4i(8|16|32|64)$")>;
def : InstRW<[A53WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
diff --git a/lib/Target/AArch64/AArch64SchedM1.td b/lib/Target/AArch64/AArch64SchedExynosM1.td
index 91b6ffcd7083..ecc68aed1550 100644
--- a/lib/Target/AArch64/AArch64SchedM1.td
+++ b/lib/Target/AArch64/AArch64SchedExynosM1.td
@@ -1,4 +1,4 @@
-//=- AArch64SchedM1.td - Samsung Exynos-M1 Scheduling Defs ---*- tablegen -*-=//
+//=- AArch64SchedExynosM1.td - Samsung Exynos M1 Sched Defs --*- tablegen -*-=//
//
// The LLVM Compiler Infrastructure
//
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This file defines the machine model for Samsung Exynos-M1 to support
+// This file defines the machine model for the Samsung Exynos M1 to support
// instruction scheduling and other instruction cost heuristics.
//
//===----------------------------------------------------------------------===//
@@ -32,6 +32,8 @@ def ExynosM1Model : SchedMachineModel {
// Define each kind of processor resource and number available on the Exynos-M1,
// which has 9 pipelines, each with its own queue with out-of-order dispatch.
+let SchedModel = ExynosM1Model in {
+
def M1UnitA : ProcResource<2>; // Simple integer
def M1UnitC : ProcResource<1>; // Simple and complex integer
def M1UnitD : ProcResource<1>; // Integer division (inside C, serialized)
@@ -54,14 +56,10 @@ let Super = M1PipeF1 in {
def M1UnitFST : ProcResource<1>; // FP store
}
-let SchedModel = ExynosM1Model in {
- def M1UnitALU : ProcResGroup<[M1UnitA,
- M1UnitC]>; // All integer
- def M1UnitNALU : ProcResGroup<[M1UnitNAL0,
- M1UnitNAL1]>; // All simple vector
-}
-
-let SchedModel = ExynosM1Model in {
+def M1UnitALU : ProcResGroup<[M1UnitA,
+ M1UnitC]>; // All integer
+def M1UnitNALU : ProcResGroup<[M1UnitNAL0,
+ M1UnitNAL1]>; // All simple vector
//===----------------------------------------------------------------------===//
// Predicates.
@@ -109,7 +107,7 @@ def M1WriteLC : SchedWriteRes<[M1UnitL,
def M1WriteLD : SchedWriteRes<[M1UnitL,
M1UnitA]> { let Latency = 6;
let NumMicroOps = 2;
- let ResourceCycles = [2]; }
+ let ResourceCycles = [2, 1]; }
def M1WriteLH : SchedWriteRes<[]> { let Latency = 5;
let NumMicroOps = 0; }
def M1WriteLX : SchedWriteVariant<[SchedVar<M1ShiftLeftFastPred, [M1WriteL5]>,
@@ -321,19 +319,19 @@ def M1WriteVLDC : SchedWriteRes<[M1UnitL,
def M1WriteVLDD : SchedWriteRes<[M1UnitL,
M1UnitNALU]> { let Latency = 7;
let NumMicroOps = 2;
- let ResourceCycles = [2]; }
+ let ResourceCycles = [2, 1]; }
def M1WriteVLDE : SchedWriteRes<[M1UnitL,
M1UnitNALU]> { let Latency = 6;
let NumMicroOps = 2; }
def M1WriteVLDF : SchedWriteRes<[M1UnitL,
M1UnitL]> { let Latency = 10;
let NumMicroOps = 2;
- let ResourceCycles = [5]; }
+ let ResourceCycles = [1, 1]; }
def M1WriteVLDG : SchedWriteRes<[M1UnitL,
M1UnitNALU,
M1UnitNALU]> { let Latency = 7;
let NumMicroOps = 3;
- let ResourceCycles = [2]; }
+ let ResourceCycles = [2, 1, 1]; }
def M1WriteVLDH : SchedWriteRes<[M1UnitL,
M1UnitNALU,
M1UnitNALU]> { let Latency = 6;
@@ -342,27 +340,27 @@ def M1WriteVLDI : SchedWriteRes<[M1UnitL,
M1UnitL,
M1UnitL]> { let Latency = 12;
let NumMicroOps = 3;
- let ResourceCycles = [6]; }
+ let ResourceCycles = [2, 2, 2]; }
def M1WriteVLDJ : SchedWriteRes<[M1UnitL,
M1UnitNALU,
M1UnitNALU,
M1UnitNALU]> { let Latency = 9;
let NumMicroOps = 4;
- let ResourceCycles = [4]; }
+ let ResourceCycles = [2, 1, 1, 1]; }
def M1WriteVLDK : SchedWriteRes<[M1UnitL,
M1UnitNALU,
M1UnitNALU,
M1UnitNALU,
M1UnitNALU]> { let Latency = 9;
let NumMicroOps = 5;
- let ResourceCycles = [4]; }
+ let ResourceCycles = [2, 1, 1, 1, 1]; }
def M1WriteVLDL : SchedWriteRes<[M1UnitL,
M1UnitNALU,
M1UnitNALU,
M1UnitL,
M1UnitNALU]> { let Latency = 7;
let NumMicroOps = 5;
- let ResourceCycles = [2]; }
+ let ResourceCycles = [1, 1, 1, 1, 1]; }
def M1WriteVLDM : SchedWriteRes<[M1UnitL,
M1UnitNALU,
M1UnitNALU,
@@ -370,13 +368,13 @@ def M1WriteVLDM : SchedWriteRes<[M1UnitL,
M1UnitNALU,
M1UnitNALU]> { let Latency = 7;
let NumMicroOps = 6;
- let ResourceCycles = [2]; }
+ let ResourceCycles = [1, 1, 1, 1, 1, 1]; }
def M1WriteVLDN : SchedWriteRes<[M1UnitL,
M1UnitL,
M1UnitL,
M1UnitL]> { let Latency = 14;
let NumMicroOps = 4;
- let ResourceCycles = [7]; }
+ let ResourceCycles = [2, 1, 2, 1]; }
def M1WriteVSTA : WriteSequence<[WriteVST], 2>;
def M1WriteVSTB : WriteSequence<[WriteVST], 3>;
def M1WriteVSTC : WriteSequence<[WriteVST], 4>;
@@ -384,14 +382,14 @@ def M1WriteVSTD : SchedWriteRes<[M1UnitS,
M1UnitFST,
M1UnitFST]> { let Latency = 7;
let NumMicroOps = 2;
- let ResourceCycles = [7]; }
+ let ResourceCycles = [7, 1, 1]; }
def M1WriteVSTE : SchedWriteRes<[M1UnitS,
M1UnitFST,
M1UnitS,
M1UnitFST,
M1UnitFST]> { let Latency = 8;
let NumMicroOps = 3;
- let ResourceCycles = [8]; }
+ let ResourceCycles = [7, 1, 1, 1, 1]; }
def M1WriteVSTF : SchedWriteRes<[M1UnitNALU,
M1UnitS,
M1UnitFST,
@@ -400,7 +398,7 @@ def M1WriteVSTF : SchedWriteRes<[M1UnitNALU,
M1UnitFST,
M1UnitFST]> { let Latency = 15;
let NumMicroOps = 5;
- let ResourceCycles = [15]; }
+ let ResourceCycles = [1, 7, 1, 7, 1, 1, 1]; }
def M1WriteVSTG : SchedWriteRes<[M1UnitNALU,
M1UnitS,
M1UnitFST,
@@ -411,14 +409,14 @@ def M1WriteVSTG : SchedWriteRes<[M1UnitNALU,
M1UnitFST,
M1UnitFST]> { let Latency = 16;
let NumMicroOps = 6;
- let ResourceCycles = [16]; }
+ let ResourceCycles = [1, 7, 1, 7, 1, 1, 1, 1, 1]; }
def M1WriteVSTH : SchedWriteRes<[M1UnitNALU,
M1UnitS,
M1UnitFST,
M1UnitFST,
M1UnitFST]> { let Latency = 14;
let NumMicroOps = 4;
- let ResourceCycles = [14]; }
+ let ResourceCycles = [1, 7, 1, 7, 1]; }
def M1WriteVSTI : SchedWriteRes<[M1UnitNALU,
M1UnitS,
M1UnitFST,
@@ -431,7 +429,7 @@ def M1WriteVSTI : SchedWriteRes<[M1UnitNALU,
M1UnitFST,
M1UnitFST]> { let Latency = 17;
let NumMicroOps = 7;
- let ResourceCycles = [17]; }
+ let ResourceCycles = [1, 7, 1, 7, 1, 1, 1, 1, 1, 1, 1]; }
// Branch instructions
def : InstRW<[M1WriteB1], (instrs Bcc)>;
diff --git a/lib/Target/AArch64/AArch64SchedExynosM3.td b/lib/Target/AArch64/AArch64SchedExynosM3.td
new file mode 100644
index 000000000000..5e5369a5a7fe
--- /dev/null
+++ b/lib/Target/AArch64/AArch64SchedExynosM3.td
@@ -0,0 +1,860 @@
+//=- AArch64SchedExynosM3.td - Samsung Exynos M3 Sched Defs --*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for the Samsung Exynos M3 to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// The Exynos-M3 is an advanced superscalar microprocessor with a 6-wide
+// in-order stage for decode and dispatch and a wider issue stage.
+// The execution units, as well as loads and stores, are out-of-order.
+
+def ExynosM3Model : SchedMachineModel {
+ let IssueWidth = 6; // Up to 6 uops per cycle.
+ let MicroOpBufferSize = 228; // ROB size.
+ let LoopMicroOpBufferSize = 40; // Based on the instruction queue size.
+ let LoadLatency = 4; // Optimistic load cases.
+ let MispredictPenalty = 16; // Minimum branch misprediction penalty.
+ let CompleteModel = 1; // Use the default model otherwise.
+
+ list<Predicate> UnsupportedFeatures = [HasSVE];
+
+ // FIXME: Remove when all errors have been fixed.
+ let FullInstRWOverlapCheck = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on the Exynos-M3,
+// which has 12 pipelines, each with its own queue with out-of-order dispatch.
+
+let SchedModel = ExynosM3Model in {
+
+def M3UnitA : ProcResource<2>; // Simple integer
+def M3UnitC : ProcResource<2>; // Simple and complex integer
+def M3UnitD : ProcResource<1>; // Integer division (inside C0, serialized)
+def M3UnitB : ProcResource<2>; // Branch
+def M3UnitL : ProcResource<2>; // Load
+def M3UnitS : ProcResource<1>; // Store
+def M3PipeF0 : ProcResource<1>; // FP #0
+let Super = M3PipeF0 in {
+ def M3UnitFMAC0 : ProcResource<1>; // FP multiplication
+ def M3UnitFADD0 : ProcResource<1>; // Simple FP
+ def M3UnitFCVT0 : ProcResource<1>; // FP conversion
+ def M3UnitFSQR : ProcResource<2>; // FP square root (serialized)
+ def M3UnitNALU0 : ProcResource<1>; // Simple vector
+ def M3UnitNMSC : ProcResource<1>; // FP and vector miscellanea
+ def M3UnitNSHT0 : ProcResource<1>; // Vector shifting
+ def M3UnitNSHF0 : ProcResource<1>; // Vector shuffling
+}
+def M3PipeF1 : ProcResource<1>; // FP #1
+let Super = M3PipeF1 in {
+ def M3UnitFMAC1 : ProcResource<1>; // FP multiplication
+ def M3UnitFADD1 : ProcResource<1>; // Simple FP
+ def M3UnitFDIV0 : ProcResource<2>; // FP division (serialized)
+ def M3UnitFCVT1 : ProcResource<1>; // FP conversion
+ def M3UnitFST0 : ProcResource<1>; // FP store
+ def M3UnitNALU1 : ProcResource<1>; // Simple vector
+ def M3UnitNCRY0 : ProcResource<1>; // Cryptographic
+ def M3UnitNMUL : ProcResource<1>; // Vector multiplication
+ def M3UnitNSHT1 : ProcResource<1>; // Vector shifting
+ def M3UnitNSHF1 : ProcResource<1>; // Vector shuffling
+}
+def M3PipeF2 : ProcResource<1>; // FP #2
+let Super = M3PipeF2 in {
+ def M3UnitFMAC2 : ProcResource<1>; // FP multiplication
+ def M3UnitFADD2 : ProcResource<1>; // Simple FP
+ def M3UnitFDIV1 : ProcResource<2>; // FP division (serialized)
+ def M3UnitFST1 : ProcResource<1>; // FP store
+ def M3UnitNALU2 : ProcResource<1>; // Simple vector
+ def M3UnitNCRY1 : ProcResource<1>; // Cryptographic
+ def M3UnitNSHT2 : ProcResource<1>; // Vector shifting
+ def M3UnitNSHF2 : ProcResource<1>; // Vector shuffling
+}
+
+
+def M3UnitALU : ProcResGroup<[M3UnitA,
+ M3UnitC]>;
+def M3UnitFMAC : ProcResGroup<[M3UnitFMAC0,
+ M3UnitFMAC1,
+ M3UnitFMAC2]>;
+def M3UnitFADD : ProcResGroup<[M3UnitFADD0,
+ M3UnitFADD1,
+ M3UnitFADD2]>;
+def M3UnitFDIV : ProcResGroup<[M3UnitFDIV0,
+ M3UnitFDIV1]>;
+def M3UnitFCVT : ProcResGroup<[M3UnitFCVT0,
+ M3UnitFCVT1]>;
+def M3UnitFST : ProcResGroup<[M3UnitFST0,
+ M3UnitFST1]>;
+def M3UnitNALU : ProcResGroup<[M3UnitNALU0,
+ M3UnitNALU1,
+ M3UnitNALU2]>;
+def M3UnitNCRY : ProcResGroup<[M3UnitNCRY0,
+ M3UnitNCRY1]>;
+def M3UnitNSHT : ProcResGroup<[M3UnitNSHT0,
+ M3UnitNSHT1,
+ M3UnitNSHT2]>;
+def M3UnitNSHF : ProcResGroup<[M3UnitNSHF0,
+ M3UnitNSHF1,
+ M3UnitNSHF2]>;
+
+//===----------------------------------------------------------------------===//
+// Predicates.
+
+def M3BranchLinkFastPred : SchedPredicate<[{MI->getOpcode() == AArch64::BLR &&
+ MI->getOperand(0).isReg() &&
+ MI->getOperand(0).getReg() != AArch64::LR}]>;
+def M3ResetFastPred : SchedPredicate<[{TII->isExynosResetFast(*MI)}]>;
+def M3RotateRightFastPred : SchedPredicate<[{(MI->getOpcode() == AArch64::EXTRWrri ||
+ MI->getOpcode() == AArch64::EXTRXrri) &&
+ MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
+ MI->getOperand(1).getReg() == MI->getOperand(2).getReg()}]>;
+def M3ShiftLeftFastPred : SchedPredicate<[{TII->isExynosShiftLeftFast(*MI)}]>;
+
+//===----------------------------------------------------------------------===//
+// Coarse scheduling model.
+
+def M3WriteZ0 : SchedWriteRes<[]> { let Latency = 0;
+ let NumMicroOps = 1; }
+
+def M3WriteA1 : SchedWriteRes<[M3UnitALU]> { let Latency = 1; }
+def M3WriteAA : SchedWriteRes<[M3UnitALU]> { let Latency = 2;
+ let ResourceCycles = [2]; }
+def M3WriteAB : SchedWriteRes<[M3UnitALU,
+ M3UnitC]> { let Latency = 1;
+ let NumMicroOps = 2; }
+def M3WriteAC : SchedWriteRes<[M3UnitALU,
+ M3UnitALU,
+ M3UnitC]> { let Latency = 2;
+ let NumMicroOps = 3; }
+def M3WriteAD : SchedWriteRes<[M3UnitALU,
+ M3UnitC]> { let Latency = 2;
+ let NumMicroOps = 2; }
+def M3WriteC1 : SchedWriteRes<[M3UnitC]> { let Latency = 1; }
+def M3WriteC2 : SchedWriteRes<[M3UnitC]> { let Latency = 2; }
+def M3WriteAX : SchedWriteVariant<[SchedVar<M3ResetFastPred, [M3WriteZ0]>,
+ SchedVar<M3ShiftLeftFastPred, [M3WriteA1]>,
+ SchedVar<NoSchedPred, [M3WriteAA]>]>;
+def M3WriteAY : SchedWriteVariant<[SchedVar<M3RotateRightFastPred, [M3WriteA1]>,
+ SchedVar<NoSchedPred, [M3WriteAA]>]>;
+
+def M3WriteB1 : SchedWriteRes<[M3UnitB]> { let Latency = 1; }
+def M3WriteBX : SchedWriteVariant<[SchedVar<M3BranchLinkFastPred, [M3WriteAB]>,
+ SchedVar<NoSchedPred, [M3WriteAC]>]>;
+
+def M3WriteL4 : SchedWriteRes<[M3UnitL]> { let Latency = 4; }
+def M3WriteL5 : SchedWriteRes<[M3UnitL]> { let Latency = 5; }
+def M3WriteLA : SchedWriteRes<[M3UnitL,
+ M3UnitL]> { let Latency = 5;
+ let NumMicroOps = 1; }
+def M3WriteLB : SchedWriteRes<[M3UnitA,
+ M3UnitL]> { let Latency = 5;
+ let NumMicroOps = 2; }
+def M3WriteLC : SchedWriteRes<[M3UnitA,
+ M3UnitL,
+ M3UnitL]> { let Latency = 5;
+ let NumMicroOps = 2; }
+def M3WriteLD : SchedWriteRes<[M3UnitA,
+ M3UnitL]> { let Latency = 4;
+ let NumMicroOps = 2; }
+def M3WriteLH : SchedWriteRes<[]> { let Latency = 5;
+ let NumMicroOps = 0; }
+
+def M3WriteLX : SchedWriteVariant<[SchedVar<M3ShiftLeftFastPred, [M3WriteL5]>,
+ SchedVar<NoSchedPred, [M3WriteLB]>]>;
+
+def M3WriteS1 : SchedWriteRes<[M3UnitS]> { let Latency = 1; }
+def M3WriteSA : SchedWriteRes<[M3UnitA,
+ M3UnitS,
+ M3UnitFST]> { let Latency = 2;
+ let NumMicroOps = 2; }
+def M3WriteSB : SchedWriteRes<[M3UnitA,
+ M3UnitS]> { let Latency = 1;
+ let NumMicroOps = 2; }
+def M3WriteSC : SchedWriteRes<[M3UnitA,
+ M3UnitS]> { let Latency = 2;
+ let NumMicroOps = 2; }
+
+def M3WriteSX : SchedWriteVariant<[SchedVar<M3ShiftLeftFastPred, [M3WriteS1]>,
+ SchedVar<NoSchedPred, [M3WriteSB]>]>;
+def M3WriteSY : SchedWriteVariant<[SchedVar<M3ShiftLeftFastPred, [M3WriteS1]>,
+ SchedVar<NoSchedPred, [M3WriteSC]>]>;
+
+def M3ReadAdrBase : SchedReadVariant<[SchedVar<ScaledIdxPred, [ReadDefault]>,
+ SchedVar<NoSchedPred, [ReadDefault]>]>;
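+// Both alternatives above resolve to ReadDefault, i.e. no extra latency is
+// currently modeled for the base register of a scaled-index address.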
+
+// Branch instructions.
+def : SchedAlias<WriteBr, M3WriteZ0>;
+def : WriteRes<WriteBrReg, [M3UnitC]> { let Latency = 1; }
+
+// Arithmetic and logical integer instructions.
+def : WriteRes<WriteI, [M3UnitALU]> { let Latency = 1; }
+def : WriteRes<WriteISReg, [M3UnitALU]> { let Latency = 1; }
+def : WriteRes<WriteIEReg, [M3UnitALU]> { let Latency = 1; }
+def : WriteRes<WriteIS, [M3UnitALU]> { let Latency = 1; }
+
+// Move instructions.
+def : WriteRes<WriteImm, [M3UnitALU]> { let Latency = 1; }
+
+// Divide and multiply instructions.
+def : WriteRes<WriteID32, [M3UnitC,
+ M3UnitD]> { let Latency = 12;
+ let ResourceCycles = [1, 12]; }
+def : WriteRes<WriteID64, [M3UnitC,
+ M3UnitD]> { let Latency = 21;
+ let ResourceCycles = [1, 21]; }
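+// The ResourceCycles above keep the divider (M3UnitD) busy for 12 or 21
+// cycles, so back-to-back divisions serialize on that unit even though the
+// issue port (M3UnitC) is released after a single cycle.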
+def : WriteRes<WriteIM32, [M3UnitC]> { let Latency = 3; }
+def : WriteRes<WriteIM64, [M3UnitC]> { let Latency = 4;
+ let ResourceCycles = [2]; }
+
+// Miscellaneous instructions.
+def : WriteRes<WriteExtr, [M3UnitALU,
+ M3UnitALU]> { let Latency = 1;
+ let NumMicroOps = 2; }
+
+// Addressing modes.
+def : WriteRes<WriteAdr, []> { let Latency = 1;
+ let NumMicroOps = 0; }
+def : SchedAlias<ReadAdrBase, M3ReadAdrBase>;
+
+// Load instructions.
+def : SchedAlias<WriteLD, M3WriteL4>;
+def : WriteRes<WriteLDHi, []> { let Latency = 4;
+ let NumMicroOps = 0; }
+def : SchedAlias<WriteLDIdx, M3WriteLX>;
+
+// Store instructions.
+def : SchedAlias<WriteST, M3WriteS1>;
+def : SchedAlias<WriteSTP, M3WriteS1>;
+def : SchedAlias<WriteSTX, M3WriteS1>;
+def : SchedAlias<WriteSTIdx, M3WriteSX>;
+
+// FP data instructions.
+def : WriteRes<WriteF, [M3UnitFADD]> { let Latency = 2; }
+def : WriteRes<WriteFCmp, [M3UnitNMSC]> { let Latency = 2; }
+def : WriteRes<WriteFDiv, [M3UnitFDIV]> { let Latency = 12;
+ let ResourceCycles = [12]; }
+def : WriteRes<WriteFMul, [M3UnitFMAC]> { let Latency = 4; }
+
+// FP miscellaneous instructions.
+// TODO: Conversion between register files is much different.
+def : WriteRes<WriteFCvt, [M3UnitFCVT]> { let Latency = 3; }
+def : WriteRes<WriteFImm, [M3UnitNALU]> { let Latency = 1; }
+def : WriteRes<WriteFCopy, [M3UnitNALU]> { let Latency = 1; }
+
+// FP load instructions.
+def : SchedAlias<WriteVLD, M3WriteL5>;
+
+// FP store instructions.
+def : WriteRes<WriteVST, [M3UnitS,
+ M3UnitFST]> { let Latency = 1;
+ let NumMicroOps = 1; }
+
+// ASIMD FP instructions.
+def : WriteRes<WriteV, [M3UnitNALU]> { let Latency = 3; }
+
+// Other miscellaneous instructions.
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint, []> { let Latency = 1; }
+def : WriteRes<WriteSys, []> { let Latency = 1; }
+
+//===----------------------------------------------------------------------===//
+// Generic fast forwarding.
+
+// TODO: Add FP register forwarding rules.
+
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+// TODO: The forwarding for 32 bits actually saves 2 cycles.
+def : ReadAdvance<ReadIMA, 3, [WriteIM32, WriteIM64]>;
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+
+//===----------------------------------------------------------------------===//
+// Finer scheduling model.
+
+def M3WriteNEONA : SchedWriteRes<[M3UnitNSHF,
+ M3UnitFADD]> { let Latency = 3;
+ let NumMicroOps = 2; }
+def M3WriteNEONB : SchedWriteRes<[M3UnitNALU,
+ M3UnitFST]> { let Latency = 10;
+ let NumMicroOps = 2; }
+def M3WriteNEOND : SchedWriteRes<[M3UnitNSHF,
+ M3UnitFST]> { let Latency = 6;
+ let NumMicroOps = 2; }
+def M3WriteNEONH : SchedWriteRes<[M3UnitNALU,
+ M3UnitS]> { let Latency = 5;
+ let NumMicroOps = 2; }
+def M3WriteNEONI : SchedWriteRes<[M3UnitNSHF,
+ M3UnitS]> { let Latency = 5;
+ let NumMicroOps = 2; }
+def M3WriteNEONV : SchedWriteRes<[M3UnitFDIV0,
+ M3UnitFDIV1]> { let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [8, 8]; }
+def M3WriteNEONW : SchedWriteRes<[M3UnitFDIV0,
+ M3UnitFDIV1]> { let Latency = 12;
+ let NumMicroOps = 2;
+ let ResourceCycles = [13, 13]; }
+def M3WriteNEONX : SchedWriteRes<[M3UnitFSQR,
+ M3UnitFSQR]> { let Latency = 18;
+ let NumMicroOps = 2;
+ let ResourceCycles = [19, 19]; }
+def M3WriteNEONY : SchedWriteRes<[M3UnitFSQR,
+ M3UnitFSQR]> { let Latency = 25;
+ let NumMicroOps = 2;
+ let ResourceCycles = [26, 26]; }
+def M3WriteNEONZ : SchedWriteRes<[M3UnitNMSC,
+ M3UnitNMSC]> { let Latency = 5;
+ let NumMicroOps = 2; }
+def M3WriteFADD2 : SchedWriteRes<[M3UnitFADD]> { let Latency = 2; }
+def M3WriteFCVT2 : SchedWriteRes<[M3UnitFCVT]> { let Latency = 2; }
+def M3WriteFCVT3 : SchedWriteRes<[M3UnitFCVT]> { let Latency = 3; }
+def M3WriteFCVT3A : SchedWriteRes<[M3UnitFCVT0]> { let Latency = 3; }
+def M3WriteFCVT4A : SchedWriteRes<[M3UnitFCVT0]> { let Latency = 4; }
+def M3WriteFCVT4 : SchedWriteRes<[M3UnitFCVT]> { let Latency = 4; }
+def M3WriteFDIV10 : SchedWriteRes<[M3UnitFDIV]> { let Latency = 7;
+ let ResourceCycles = [8]; }
+def M3WriteFDIV12 : SchedWriteRes<[M3UnitFDIV]> { let Latency = 12;
+ let ResourceCycles = [13]; }
+def M3WriteFMAC3 : SchedWriteRes<[M3UnitFMAC]> { let Latency = 3; }
+def M3WriteFMAC4 : SchedWriteRes<[M3UnitFMAC]> { let Latency = 4; }
+def M3WriteFMAC5 : SchedWriteRes<[M3UnitFMAC]> { let Latency = 5; }
+def M3WriteFSQR17 : SchedWriteRes<[M3UnitFSQR]> { let Latency = 18;
+ let ResourceCycles = [19]; }
+def M3WriteFSQR25 : SchedWriteRes<[M3UnitFSQR]> { let Latency = 25;
+ let ResourceCycles = [26]; }
+def M3WriteNALU1 : SchedWriteRes<[M3UnitNALU]> { let Latency = 1; }
+def M3WriteNCRY1A : SchedWriteRes<[M3UnitNCRY0]> { let Latency = 1; }
+def M3WriteNCRY3A : SchedWriteRes<[M3UnitNCRY0]> { let Latency = 3; }
+def M3WriteNCRY5A : SchedWriteRes<[M3UnitNCRY]> { let Latency = 5; }
+def M3WriteNMSC1 : SchedWriteRes<[M3UnitNMSC]> { let Latency = 1; }
+def M3WriteNMSC2 : SchedWriteRes<[M3UnitNMSC]> { let Latency = 2; }
+def M3WriteNMSC3 : SchedWriteRes<[M3UnitNMSC]> { let Latency = 3; }
+def M3WriteNMUL3 : SchedWriteRes<[M3UnitNMUL]> { let Latency = 3; }
+def M3WriteNSHF1 : SchedWriteRes<[M3UnitNSHF]> { let Latency = 1; }
+def M3WriteNSHF3 : SchedWriteRes<[M3UnitNSHF]> { let Latency = 3; }
+def M3WriteNSHT1 : SchedWriteRes<[M3UnitNSHT]> { let Latency = 1; }
+def M3WriteNSHT2 : SchedWriteRes<[M3UnitNSHT]> { let Latency = 2; }
+def M3WriteNSHT3 : SchedWriteRes<[M3UnitNSHT]> { let Latency = 3; }
+def M3WriteVLDA : SchedWriteRes<[M3UnitL,
+ M3UnitL]> { let Latency = 5;
+ let NumMicroOps = 2; }
+def M3WriteVLDB : SchedWriteRes<[M3UnitL,
+ M3UnitL,
+ M3UnitL]> { let Latency = 6;
+ let NumMicroOps = 3; }
+def M3WriteVLDC : SchedWriteRes<[M3UnitL,
+ M3UnitL,
+ M3UnitL,
+ M3UnitL]> { let Latency = 6;
+ let NumMicroOps = 4; }
+def M3WriteVLDD : SchedWriteRes<[M3UnitL,
+ M3UnitNALU]> { let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2, 1]; }
+def M3WriteVLDE : SchedWriteRes<[M3UnitL,
+ M3UnitNALU]> { let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2, 1]; }
+def M3WriteVLDF : SchedWriteRes<[M3UnitL,
+ M3UnitL]> { let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [5, 5]; }
+def M3WriteVLDG : SchedWriteRes<[M3UnitL,
+ M3UnitNALU,
+ M3UnitNALU]> { let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1, 1]; }
+def M3WriteVLDH : SchedWriteRes<[M3UnitL,
+ M3UnitNALU,
+ M3UnitNALU]> { let Latency = 6;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1, 1]; }
+def M3WriteVLDI : SchedWriteRes<[M3UnitL,
+ M3UnitL,
+ M3UnitL]> { let Latency = 12;
+ let NumMicroOps = 3;
+ let ResourceCycles = [6, 6, 6]; }
+def M3WriteVLDJ : SchedWriteRes<[M3UnitL,
+ M3UnitNALU,
+ M3UnitNALU,
+ M3UnitNALU]> { let Latency = 7;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2, 1, 1, 1]; }
+def M3WriteVLDK : SchedWriteRes<[M3UnitL,
+ M3UnitNALU,
+ M3UnitNALU,
+ M3UnitNALU,
+ M3UnitNALU]> { let Latency = 9;
+ let NumMicroOps = 5;
+ let ResourceCycles = [4, 1, 1, 1, 1]; }
+def M3WriteVLDL : SchedWriteRes<[M3UnitL,
+ M3UnitNALU,
+ M3UnitNALU,
+ M3UnitL,
+ M3UnitNALU]> { let Latency = 6;
+ let NumMicroOps = 5;
+ let ResourceCycles = [6, 1, 1, 6, 1]; }
+def M3WriteVLDM : SchedWriteRes<[M3UnitL,
+ M3UnitNALU,
+ M3UnitNALU,
+ M3UnitL,
+ M3UnitNALU,
+ M3UnitNALU]> { let Latency = 7;
+ let NumMicroOps = 6;
+ let ResourceCycles = [6, 1, 1, 6, 1, 1]; }
+def M3WriteVLDN : SchedWriteRes<[M3UnitL,
+ M3UnitL,
+ M3UnitL,
+ M3UnitL]> { let Latency = 14;
+ let NumMicroOps = 4;
+ let ResourceCycles = [6, 6, 6, 6]; }
+def M3WriteVSTA : WriteSequence<[WriteVST], 2>;
+def M3WriteVSTB : WriteSequence<[WriteVST], 3>;
+def M3WriteVSTC : WriteSequence<[WriteVST], 4>;
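+// The WriteSequences above simply repeat the vector-store write two, three or
+// four times, modeling the multi-register ST1 variants as that many
+// back-to-back single stores.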
+def M3WriteVSTD : SchedWriteRes<[M3UnitS,
+ M3UnitFST,
+ M3UnitS,
+ M3UnitFST]> { let Latency = 7;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1, 3, 1, 3]; }
+def M3WriteVSTE : SchedWriteRes<[M3UnitS,
+ M3UnitFST,
+ M3UnitS,
+ M3UnitFST,
+ M3UnitS,
+ M3UnitFST]> { let Latency = 8;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1, 3, 1, 3, 1, 3]; }
+def M3WriteVSTF : SchedWriteRes<[M3UnitNALU,
+ M3UnitFST,
+ M3UnitFST,
+ M3UnitS,
+ M3UnitFST,
+ M3UnitS,
+ M3UnitFST]> { let Latency = 15;
+ let NumMicroOps = 7;
+ let ResourceCycles = [1, 3, 3, 1, 3, 1, 3]; }
+def M3WriteVSTG : SchedWriteRes<[M3UnitNALU,
+ M3UnitFST,
+ M3UnitFST,
+ M3UnitS,
+ M3UnitFST,
+ M3UnitS,
+ M3UnitFST,
+ M3UnitS,
+ M3UnitFST]> { let Latency = 16;
+ let NumMicroOps = 9;
+ let ResourceCycles = [1, 3, 3, 1, 3, 1, 3, 1, 3]; }
+def M3WriteVSTH : SchedWriteRes<[M3UnitNALU,
+ M3UnitFST,
+ M3UnitFST,
+ M3UnitS,
+ M3UnitFST]> { let Latency = 14;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1, 3, 3, 1, 3]; }
+def M3WriteVSTI : SchedWriteRes<[M3UnitNALU,
+ M3UnitFST,
+ M3UnitFST,
+ M3UnitS,
+ M3UnitFST,
+ M3UnitS,
+ M3UnitFST,
+ M3UnitS,
+ M3UnitFST]> { let Latency = 17;
+ let NumMicroOps = 9;
+ let ResourceCycles = [1, 3, 3, 1, 3, 1, 3, 1, 3]; }
+
+// Special cases.
+def M3WriteAES : SchedWriteRes<[M3UnitNCRY]> { let Latency = 1; }
+def M3ReadAES : SchedReadAdvance<1, [M3WriteAES]>;
+def M3ReadFMAC : SchedReadAdvance<1, [M3WriteFMAC4,
+ M3WriteFMAC5]>;
+def M3WriteMOVI : SchedWriteVariant<[SchedVar<M3ResetFastPred, [M3WriteZ0]>,
+ SchedVar<NoSchedPred, [M3WriteNALU1]>]>;
+def M3ReadNMUL : SchedReadAdvance<1, [M3WriteNMUL3]>;
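+// The SchedReadAdvances above model forwarding: when the accumulator of an
+// FMLA/MLA (or the input of a chained AES round) is produced by one of the
+// listed writes, the consumer reads it one cycle early.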
+
+// Branch instructions.
+def : InstRW<[M3WriteB1], (instrs Bcc)>;
+def : InstRW<[M3WriteA1], (instrs BL)>;
+def : InstRW<[M3WriteBX], (instrs BLR)>;
+def : InstRW<[M3WriteC1], (instregex "^CBN?Z[WX]")>;
+def : InstRW<[M3WriteAD], (instregex "^TBN?Z[WX]")>;
+
+// Arithmetic and logical integer instructions.
+def : InstRW<[M3WriteA1], (instrs COPY)>;
+def : InstRW<[M3WriteAX], (instregex "^(ADD|SUB)S?Xrx64")>;
+def : InstRW<[M3WriteAX], (instregex "^(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)[WX]r[sx]$")>;
+def : InstRW<[M3WriteAX], (instregex "^(ADD|BIC|SUB)S[WX]r[sx]$")>;
+def : InstRW<[M3WriteAX], (instregex "^(ADD|AND|EOR|ORR|SUB)[WX]ri")>;
+
+// Move instructions.
+def : InstRW<[M3WriteZ0], (instrs ADR, ADRP)>;
+def : InstRW<[M3WriteZ0], (instregex "^MOV[NZ][WX]i")>;
+
+// Divide and multiply instructions.
+
+// Miscellaneous instructions.
+def : InstRW<[M3WriteAY], (instrs EXTRWrri, EXTRXrri)>;
+
+// Load instructions.
+def : InstRW<[M3WriteLD,
+ WriteLDHi,
+ WriteAdr], (instregex "^LDP(SW|W|X)(post|pre)")>;
+def : InstRW<[M3WriteLX,
+ ReadAdrBase], (instregex "^PRFMro[WX]")>;
+
+// Store instructions.
+
+// FP data instructions.
+def : InstRW<[M3WriteNSHF1], (instregex "^FABS[DS]r")>;
+def : InstRW<[M3WriteFADD2], (instregex "^F(ADD|SUB)[DS]rr")>;
+def : InstRW<[M3WriteFDIV10], (instrs FDIVSrr)>;
+def : InstRW<[M3WriteFDIV12], (instrs FDIVDrr)>;
+def : InstRW<[M3WriteNMSC1], (instregex "^F(MAX|MIN).+rr")>;
+def : InstRW<[M3WriteFMAC3], (instregex "^FN?MUL[DS]rr")>;
+def : InstRW<[M3WriteFMAC4,
+ M3ReadFMAC], (instregex "^FN?M(ADD|SUB)[DS]rrr")>;
+def : InstRW<[M3WriteNALU1], (instregex "^FNEG[DS]r")>;
+def : InstRW<[M3WriteFCVT3A], (instregex "^FRINT.+r")>;
+def : InstRW<[M3WriteNEONH], (instregex "^FCSEL[DS]rrr")>;
+def : InstRW<[M3WriteFSQR17], (instrs FSQRTSr)>;
+def : InstRW<[M3WriteFSQR25], (instrs FSQRTDr)>;
+
+// FP miscellaneous instructions.
+def : InstRW<[M3WriteFCVT3], (instregex "^FCVT[DHS][DHS]r")>;
+def : InstRW<[M3WriteFCVT4A], (instregex "^[SU]CVTF[SU][XW][DHS]ri")>;
+def : InstRW<[M3WriteFCVT3A], (instregex "^FCVT[AMNPZ][SU]U[XW][DHS]r")>;
+def : InstRW<[M3WriteFCVT3A], (instregex "^FCVTZ[SU][dhs]")>;
+def : InstRW<[M3WriteNALU1], (instregex "^FMOV[DS][ir]")>;
+def : InstRW<[M3WriteFCVT4], (instregex "^[FU](RECP|RSQRT)Ev1")>;
+def : InstRW<[M3WriteNMSC1], (instregex "^FRECPXv1")>;
+def : InstRW<[M3WriteFMAC4,
+ M3ReadFMAC], (instregex "^F(RECP|RSQRT)S(16|32|64)")>;
+def : InstRW<[M3WriteNALU1], (instregex "^FMOV[WX][DS]r")>;
+def : InstRW<[M3WriteNALU1], (instregex "^FMOV[DS][WX]r")>;
+def : InstRW<[M3WriteNEONI], (instregex "^FMOV(DX|XD)Highr")>;
+
+// FP load instructions.
+def : InstRW<[WriteVLD], (instregex "^LDR[DSQ]l")>;
+def : InstRW<[WriteVLD], (instregex "^LDUR[BDHSQ]i")>;
+def : InstRW<[WriteVLD,
+ WriteAdr], (instregex "^LDR[BDHSQ](post|pre)")>;
+def : InstRW<[WriteVLD], (instregex "^LDR[BDHSQ]ui")>;
+def : InstRW<[M3WriteLX,
+ ReadAdrBase], (instregex "^LDR[BDHS]ro[WX]")>;
+def : InstRW<[M3WriteLB,
+ ReadAdrBase], (instregex "^LDRQro[WX]")>;
+def : InstRW<[WriteVLD,
+ M3WriteLH], (instregex "^LDN?P[DS]i")>;
+def : InstRW<[M3WriteLA,
+ M3WriteLH], (instregex "^LDN?PQi")>;
+def : InstRW<[M3WriteLB,
+ M3WriteLH,
+ WriteAdr], (instregex "^LDP[DS](post|pre)")>;
+def : InstRW<[M3WriteLC,
+ M3WriteLH,
+ WriteAdr], (instregex "^LDPQ(post|pre)")>;
+
+// FP store instructions.
+def : InstRW<[WriteVST], (instregex "^STUR[BDHSQ]i")>;
+def : InstRW<[WriteVST,
+ WriteAdr], (instregex "^STR[BDHSQ](post|pre)")>;
+def : InstRW<[WriteVST], (instregex "^STR[BDHSQ]ui")>;
+def : InstRW<[M3WriteSY,
+ ReadAdrBase], (instregex "^STR[BDHS]ro[WX]")>;
+def : InstRW<[M3WriteSA,
+ ReadAdrBase], (instregex "^STRQro[WX]")>;
+def : InstRW<[WriteVST], (instregex "^STN?P[DSQ]i")>;
+def : InstRW<[WriteVST,
+ WriteAdr], (instregex "^STP[DS](post|pre)")>;
+def : InstRW<[M3WriteSA,
+ WriteAdr], (instregex "^STPQ(post|pre)")>;
+
+// ASIMD instructions.
+def : InstRW<[M3WriteNMSC3], (instregex "^[SU]ABAL?v")>;
+def : InstRW<[M3WriteNMSC1], (instregex "^[SU]ABDL?v")>;
+def : InstRW<[M3WriteNMSC1], (instregex "^(SQ)?(ABS|NEG)v")>;
+def : InstRW<[M3WriteNALU1], (instregex "^(ADD|NEG|SUB)v")>;
+def : InstRW<[M3WriteNMSC3], (instregex "^[SU]?ADDL?Pv")>;
+def : InstRW<[M3WriteNMSC3], (instregex "^[SU]H(ADD|SUB)v")>;
+def : InstRW<[M3WriteNMSC3], (instregex "^[SU](ADD|SUB)[LW]V?v")>;
+def : InstRW<[M3WriteNMSC3], (instregex "^R?(ADD|SUB)HN2?v")>;
+def : InstRW<[M3WriteNMSC3], (instregex "^[SU]Q(ADD|SUB)v")>;
+def : InstRW<[M3WriteNMSC3], (instregex "^(SU|US)QADDv")>;
+def : InstRW<[M3WriteNMSC3], (instregex "^[SU]RHADDv")>;
+def : InstRW<[M3WriteNMSC3], (instregex "^[SU]?ADDL?Vv")>;
+def : InstRW<[M3WriteNMSC1], (instregex "^CM(EQ|GE|GT|HI|HS|LE|LT)v")>;
+def : InstRW<[M3WriteNALU1], (instregex "^CMTSTv")>;
+def : InstRW<[M3WriteNALU1], (instregex "^(AND|BIC|EOR|MVNI|NOT|ORN|ORR)v")>;
+def : InstRW<[M3WriteNMSC1], (instregex "^[SU](MIN|MAX)v")>;
+def : InstRW<[M3WriteNMSC2], (instregex "^[SU](MIN|MAX)Pv")>;
+def : InstRW<[M3WriteNMSC3], (instregex "^[SU](MIN|MAX)Vv")>;
+def : InstRW<[M3WriteNMUL3], (instregex "^(MUL|SQR?DMULH)v")>;
+def : InstRW<[M3WriteNMUL3,
+ M3ReadNMUL], (instregex "^ML[AS]v")>;
+def : InstRW<[M3WriteNMUL3], (instregex "^[SU]ML[AS]Lv")>;
+def : InstRW<[M3WriteNMUL3], (instregex "^SQDML[AS]L")>;
+def : InstRW<[M3WriteNMUL3], (instregex "^(S|U|SQD)MULLv")>;
+def : InstRW<[M3WriteNMSC3], (instregex "^[SU]ADALPv")>;
+def : InstRW<[M3WriteNSHT3], (instregex "^[SU]R?SRAv")>;
+def : InstRW<[M3WriteNSHT1], (instregex "^SHL[dv]")>;
+def : InstRW<[M3WriteNSHT1], (instregex "^[SU]SH[LR][dv]")>;
+def : InstRW<[M3WriteNSHT1], (instregex "^S[RS]I[dv]")>;
+def : InstRW<[M3WriteNSHT2], (instregex "^[SU]?SHLLv")>;
+def : InstRW<[M3WriteNSHT3], (instregex "^(([SU]Q)?R)?SHRU?N[bhsv]")>;
+def : InstRW<[M3WriteNSHT3], (instregex "^[SU]RSH[LR][dv]")>;
+def : InstRW<[M3WriteNSHT3], (instregex "^[SU]QR?SHLU?[bdhsv]")>;
+
+// ASIMD FP instructions.
+def : InstRW<[M3WriteNSHF1], (instregex "^FABSv")>;
+def : InstRW<[M3WriteFADD2], (instregex "^F(ABD|ADD|SUB)v")>;
+def : InstRW<[M3WriteNEONA], (instregex "^FADDP")>;
+def : InstRW<[M3WriteNMSC1], (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v[^1]")>;
+def : InstRW<[M3WriteFCVT3], (instregex "^FCVT(L|N|XN)v")>;
+def : InstRW<[M3WriteFCVT2], (instregex "^FCVT[AMNPZ][SU]v")>;
+def : InstRW<[M3WriteFCVT2], (instregex "^[SU]CVTFv")>;
+def : InstRW<[M3WriteFDIV10], (instrs FDIVv2f32)>;
+def : InstRW<[M3WriteNEONV], (instrs FDIVv4f32)>;
+def : InstRW<[M3WriteNEONW], (instrs FDIVv2f64)>;
+def : InstRW<[M3WriteNMSC1], (instregex "^F(MAX|MIN)(NM)?v")>;
+def : InstRW<[M3WriteNMSC2], (instregex "^F(MAX|MIN)(NM)?Pv")>;
+def : InstRW<[M3WriteNEONZ], (instregex "^F(MAX|MIN)(NM)?Vv")>;
+def : InstRW<[M3WriteFMAC3], (instregex "^FMULX?v.[fi]")>;
+def : InstRW<[M3WriteFMAC4,
+ M3ReadFMAC], (instregex "^FML[AS]v.f")>;
+def : InstRW<[M3WriteFMAC5,
+ M3ReadFMAC], (instregex "^FML[AS]v.i")>;
+def : InstRW<[M3WriteNALU1], (instregex "^FNEGv")>;
+def : InstRW<[M3WriteFCVT3A], (instregex "^FRINT[AIMNPXZ]v")>;
+def : InstRW<[M3WriteFSQR17], (instrs FSQRTv2f32)>;
+def : InstRW<[M3WriteNEONX], (instrs FSQRTv4f32)>;
+def : InstRW<[M3WriteNEONY], (instrs FSQRTv2f64)>;
+
+// ASIMD miscellaneous instructions.
+def : InstRW<[M3WriteNALU1], (instregex "^RBITv")>;
+def : InstRW<[M3WriteNALU1], (instregex "^(BIF|BIT|BSL)v")>;
+def : InstRW<[M3WriteNEONB], (instregex "^DUPv.+gpr")>;
+def : InstRW<[M3WriteNSHF1], (instregex "^DUPv.+lane")>;
+def : InstRW<[M3WriteNSHF1], (instregex "^EXTv")>;
+def : InstRW<[M3WriteNSHF1], (instregex "^[SU]?Q?XTU?Nv")>;
+def : InstRW<[M3WriteNSHF1], (instregex "^CPY")>;
+def : InstRW<[M3WriteNSHF1], (instregex "^INSv.+lane")>;
+def : InstRW<[M3WriteMOVI], (instregex "^MOVI")>;
+def : InstRW<[M3WriteNALU1], (instregex "^FMOVv")>;
+def : InstRW<[M3WriteFCVT4], (instregex "^[FU](RECP|RSQRT)Ev[248]")>;
+def : InstRW<[M3WriteFMAC4,
+ M3ReadFMAC], (instregex "^F(RECP|RSQRT)Sv")>;
+def : InstRW<[M3WriteNSHF1], (instregex "^REV(16|32|64)v")>;
+def : InstRW<[M3WriteNSHF1], (instregex "^TB[LX]v")>;
+def : InstRW<[M3WriteNEOND], (instregex "^[SU]MOVv")>;
+def : InstRW<[M3WriteNSHF3], (instregex "^INSv.+gpr")>;
+def : InstRW<[M3WriteNSHF1], (instregex "^(TRN|UZP|ZIP)[12]v")>;
+
+// ASIMD load instructions.
+def : InstRW<[M3WriteL5], (instregex "LD1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[M3WriteL5,
+ WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d)_POST")>;
+def : InstRW<[M3WriteL5], (instregex "LD1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteL5,
+ WriteAdr], (instregex "LD1Onev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVLDA], (instregex "LD1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[M3WriteVLDA,
+ WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d)_POST")>;
+def : InstRW<[M3WriteVLDA], (instregex "LD1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteVLDA,
+ WriteAdr], (instregex "LD1Twov(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVLDB], (instregex "LD1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[M3WriteVLDB,
+ WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d)_POST")>;
+def : InstRW<[M3WriteVLDB], (instregex "LD1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteVLDB,
+ WriteAdr], (instregex "LD1Threev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVLDC], (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[M3WriteVLDC,
+ WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d)_POST")>;
+def : InstRW<[M3WriteVLDC], (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteVLDC,
+ WriteAdr], (instregex "LD1Fourv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVLDD], (instregex "LD1i(8|16|32)$")>;
+def : InstRW<[M3WriteVLDD,
+ WriteAdr], (instregex "LD1i(8|16|32)_POST")>;
+def : InstRW<[M3WriteVLDE], (instregex "LD1i(64)$")>;
+def : InstRW<[M3WriteVLDE,
+ WriteAdr], (instregex "LD1i(64)_POST")>;
+
+def : InstRW<[M3WriteL5], (instregex "LD1Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[M3WriteL5,
+ WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d)_POST")>;
+def : InstRW<[M3WriteL5], (instregex "LD1Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteL5,
+ WriteAdr], (instregex "LD1Rv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVLDF], (instregex "LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[M3WriteVLDF,
+ WriteAdr], (instregex "LD2Twov(8b|4h|2s)_POST")>;
+def : InstRW<[M3WriteVLDF], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteVLDF,
+ WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVLDG], (instregex "LD2i(8|16|32)$")>;
+def : InstRW<[M3WriteVLDG,
+ WriteAdr], (instregex "LD2i(8|16|32)_POST")>;
+def : InstRW<[M3WriteVLDH], (instregex "LD2i(64)$")>;
+def : InstRW<[M3WriteVLDH,
+ WriteAdr], (instregex "LD2i(64)_POST")>;
+
+def : InstRW<[M3WriteVLDA], (instregex "LD2Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[M3WriteVLDA,
+ WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d)_POST")>;
+def : InstRW<[M3WriteVLDA], (instregex "LD2Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteVLDA,
+ WriteAdr], (instregex "LD2Rv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVLDI], (instregex "LD3Threev(8b|4h|2s)$")>;
+def : InstRW<[M3WriteVLDI,
+ WriteAdr], (instregex "LD3Threev(8b|4h|2s)_POST")>;
+def : InstRW<[M3WriteVLDI], (instregex "LD3Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteVLDI,
+ WriteAdr], (instregex "LD3Threev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVLDJ], (instregex "LD3i(8|16|32)$")>;
+def : InstRW<[M3WriteVLDJ,
+ WriteAdr], (instregex "LD3i(8|16|32)_POST")>;
+def : InstRW<[M3WriteVLDL], (instregex "LD3i(64)$")>;
+def : InstRW<[M3WriteVLDL,
+ WriteAdr], (instregex "LD3i(64)_POST")>;
+
+def : InstRW<[M3WriteVLDB], (instregex "LD3Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[M3WriteVLDB,
+ WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d)_POST")>;
+def : InstRW<[M3WriteVLDB], (instregex "LD3Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteVLDB,
+ WriteAdr], (instregex "LD3Rv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVLDN], (instregex "LD4Fourv(8b|4h|2s)$")>;
+def : InstRW<[M3WriteVLDN,
+ WriteAdr], (instregex "LD4Fourv(8b|4h|2s)_POST")>;
+def : InstRW<[M3WriteVLDN], (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteVLDN,
+ WriteAdr], (instregex "LD4Fourv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVLDK], (instregex "LD4i(8|16|32)$")>;
+def : InstRW<[M3WriteVLDK,
+ WriteAdr], (instregex "LD4i(8|16|32)_POST")>;
+def : InstRW<[M3WriteVLDM], (instregex "LD4i(64)$")>;
+def : InstRW<[M3WriteVLDM,
+ WriteAdr], (instregex "LD4i(64)_POST")>;
+
+def : InstRW<[M3WriteVLDC], (instregex "LD4Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[M3WriteVLDC,
+ WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d)_POST")>;
+def : InstRW<[M3WriteVLDC], (instregex "LD4Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteVLDC,
+ WriteAdr], (instregex "LD4Rv(16b|8h|4s|2d)_POST")>;
+
+// ASIMD store instructions.
+def : InstRW<[WriteVST], (instregex "ST1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteVST,
+ WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVST], (instregex "ST1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVST,
+ WriteAdr], (instregex "ST1Onev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVSTA], (instregex "ST1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[M3WriteVSTA,
+ WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d)_POST")>;
+def : InstRW<[M3WriteVSTA], (instregex "ST1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteVSTA,
+ WriteAdr], (instregex "ST1Twov(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVSTB], (instregex "ST1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[M3WriteVSTB,
+ WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d)_POST")>;
+def : InstRW<[M3WriteVSTB], (instregex "ST1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteVSTB,
+ WriteAdr], (instregex "ST1Threev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVSTC], (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[M3WriteVSTC,
+ WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d)_POST")>;
+def : InstRW<[M3WriteVSTC], (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteVSTC,
+ WriteAdr], (instregex "ST1Fourv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVSTD], (instregex "ST1i(8|16|32|64)$")>;
+def : InstRW<[M3WriteVSTD,
+ WriteAdr], (instregex "ST1i(8|16|32|64)_POST")>;
+
+def : InstRW<[M3WriteVSTD], (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[M3WriteVSTD,
+ WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST")>;
+def : InstRW<[M3WriteVSTE], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteVSTE,
+ WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVSTD], (instregex "ST2i(8|16|32)$")>;
+def : InstRW<[M3WriteVSTD,
+ WriteAdr], (instregex "ST2i(8|16|32)_POST")>;
+def : InstRW<[M3WriteVSTD], (instregex "ST2i(64)$")>;
+def : InstRW<[M3WriteVSTD,
+ WriteAdr], (instregex "ST2i(64)_POST")>;
+
+def : InstRW<[M3WriteVSTF], (instregex "ST3Threev(8b|4h|2s)$")>;
+def : InstRW<[M3WriteVSTF,
+ WriteAdr], (instregex "ST3Threev(8b|4h|2s)_POST")>;
+def : InstRW<[M3WriteVSTG], (instregex "ST3Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteVSTG,
+ WriteAdr], (instregex "ST3Threev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVSTH], (instregex "ST3i(8|16|32)$")>;
+def : InstRW<[M3WriteVSTH,
+ WriteAdr], (instregex "ST3i(8|16|32)_POST")>;
+def : InstRW<[M3WriteVSTF], (instregex "ST3i(64)$")>;
+def : InstRW<[M3WriteVSTF,
+ WriteAdr], (instregex "ST3i(64)_POST")>;
+
+def : InstRW<[M3WriteVSTF], (instregex "ST4Fourv(8b|4h|2s)$")>;
+def : InstRW<[M3WriteVSTF,
+ WriteAdr], (instregex "ST4Fourv(8b|4h|2s)_POST")>;
+def : InstRW<[M3WriteVSTI], (instregex "ST4Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteVSTI,
+ WriteAdr], (instregex "ST4Fourv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVSTF], (instregex "ST4i(8|16|32|64)$")>;
+def : InstRW<[M3WriteVSTF,
+ WriteAdr], (instregex "ST4i(8|16|32|64)_POST")>;
+
+// Cryptography instructions.
+def : InstRW<[M3WriteAES], (instregex "^AES[DE]")>;
+def : InstRW<[M3WriteAES,
+ M3ReadAES], (instregex "^AESI?MC")>;
+
+def : InstRW<[M3WriteNCRY3A], (instregex "^PMULL?v")>;
+
+def : InstRW<[M3WriteNCRY1A], (instregex "^SHA1([CHMP]|SU[01])")>;
+def : InstRW<[M3WriteNCRY1A], (instregex "^SHA256SU0")>;
+def : InstRW<[M3WriteNCRY5A], (instregex "^SHA256(H2?|SU1)")>;
+
+// CRC instructions.
+def : InstRW<[M3WriteC2], (instregex "^CRC32")>;
+
+} // SchedModel = ExynosM3Model
diff --git a/lib/Target/AArch64/AArch64SchedFalkor.td b/lib/Target/AArch64/AArch64SchedFalkor.td
index 7277198b585f..84825458e47c 100644
--- a/lib/Target/AArch64/AArch64SchedFalkor.td
+++ b/lib/Target/AArch64/AArch64SchedFalkor.td
@@ -25,6 +25,9 @@ def FalkorModel : SchedMachineModel {
let CompleteModel = 1;
list<Predicate> UnsupportedFeatures = [HasSVE];
+
+ // FIXME: Remove when all errors have been fixed.
+ let FullInstRWOverlapCheck = 0;
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/lib/Target/AArch64/AArch64SchedFalkorDetails.td
index 0aeb1f3e3058..ff14e639d1a5 100644
--- a/lib/Target/AArch64/AArch64SchedFalkorDetails.td
+++ b/lib/Target/AArch64/AArch64SchedFalkorDetails.td
@@ -32,8 +32,12 @@
//===----------------------------------------------------------------------===//
// Define 0 micro-op types
-def FalkorWr_LdStInc_none_3cyc : SchedWriteRes<[]> {
- let Latency = 3;
+def FalkorWr_LdInc_none_2cyc : SchedWriteRes<[]> {
+ let Latency = 2;
+ let NumMicroOps = 0;
+}
+def FalkorWr_StInc_none_2cyc : SchedWriteRes<[]> {
+ let Latency = 2;
let NumMicroOps = 0;
}
def FalkorWr_none_3cyc : SchedWriteRes<[]> {
@@ -514,8 +518,8 @@ def FalkorReadVMA : SchedReadAdvance<3, [FalkorWr_VMUL32_1VXVY_4cyc, FalkorWr
def FalkorReadFMA32 : SchedReadAdvance<1, [FalkorWr_FMUL32_1VXVY_5cyc, FalkorWr_FMUL32_2VXVY_5cyc]>;
def FalkorReadFMA64 : SchedReadAdvance<2, [FalkorWr_FMUL64_1VXVY_6cyc, FalkorWr_FMUL64_2VXVY_6cyc]>;
-def FalkorReadIncLd : SchedReadAdvance<2, [FalkorWr_LdStInc_none_3cyc]>;
-def FalkorReadIncSt : SchedReadAdvance<1, [FalkorWr_LdStInc_none_3cyc]>;
+def FalkorReadIncLd : SchedReadAdvance<1, [FalkorWr_LdInc_none_2cyc]>;
+def FalkorReadIncSt : SchedReadAdvance<1, [FalkorWr_StInc_none_2cyc]>;
// SchedPredicates and WriteVariants for Immediate Zero and LSLFast/ASRFast
// -----------------------------------------------------------------------------
@@ -776,99 +780,99 @@ def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA],
// SIMD Load Instructions
// -----------------------------------------------------------------------------
def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
(instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))_POST$")>;
def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
(instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], (instrs LD2i64)>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
(instrs LD2i64_POST)>;
def : InstRW<[FalkorWr_1LD_1VXVY_4cyc, FalkorReadIncLd], (instregex "^LD1i(8|16|32)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1VXVY_4cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_1VXVY_4cyc, FalkorReadIncLd],
(instregex "^LD1i(8|16|32)_POST$")>;
def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD1Twov(8b|4h|2s|1d)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd],
(instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>;
def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD2Twov(8b|4h|2s)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd],
(instregex "^LD2Twov(8b|4h|2s)_POST$")>;
def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD2Rv(8b|4h|2s|1d)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd],
(instregex "^LD2Rv(8b|4h|2s|1d)_POST$")>;
def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instregex "^LD1Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
(instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>;
def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instregex "^LD2Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
(instregex "^LD2Twov(16b|8h|4s|2d)_POST$")>;
def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instregex "^LD2Rv(16b|8h|4s|2d)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
(instregex "^LD2Rv(16b|8h|4s|2d)_POST$")>;
def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instrs LD3i64)>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
(instrs LD3i64_POST)>;
def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instrs LD4i64)>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
(instrs LD4i64_POST)>;
def : InstRW<[FalkorWr_1LD_2VXVY_4cyc, FalkorReadIncLd], (instregex "^LD2i(8|16|32)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_2VXVY_4cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_2VXVY_4cyc, FalkorReadIncLd],
(instregex "^LD2i(8|16|32)_POST$")>;
def : InstRW<[FalkorWr_2LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD1Threev(8b|4h|2s|1d)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_1none_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_2LD_1none_3cyc, FalkorReadIncLd],
(instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>;
def : InstRW<[FalkorWr_2LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD3Rv(8b|4h|2s|1d)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_1none_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_2LD_1none_3cyc, FalkorReadIncLd],
(instregex "^LD3Rv(8b|4h|2s|1d)_POST$")>;
def : InstRW<[FalkorWr_3LD_3cyc, FalkorReadIncLd], (instregex "^LD1Threev(16b|8h|4s|2d)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd],
(instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>;
def : InstRW<[FalkorWr_3LD_3cyc, FalkorReadIncLd], (instrs LD3Threev2d)>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd],
(instrs LD3Threev2d_POST)>;
def : InstRW<[FalkorWr_3LD_3cyc, FalkorReadIncLd], (instregex "^LD3Rv(16b|8h|4s|2d)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd],
(instregex "^LD3Rv(16b|8h|4s|2d)_POST$")>;
def : InstRW<[FalkorWr_1LD_3VXVY_4cyc, FalkorReadIncLd], (instregex "^LD3i(8|16|32)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3VXVY_4cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_3VXVY_4cyc, FalkorReadIncLd],
(instregex "^LD3i(8|16|32)_POST$")>;
def : InstRW<[FalkorWr_2LD_2none_3cyc, FalkorReadIncLd], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2none_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_2LD_2none_3cyc, FalkorReadIncLd],
(instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>;
def : InstRW<[FalkorWr_2LD_2none_3cyc, FalkorReadIncLd], (instregex "^LD4Rv(8b|4h|2s|1d)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2none_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_2LD_2none_3cyc, FalkorReadIncLd],
(instregex "^LD4Rv(8b|4h|2s|1d)_POST$")>;
def : InstRW<[FalkorWr_4LD_3cyc, FalkorReadIncLd], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd],
(instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>;
def : InstRW<[FalkorWr_4LD_3cyc, FalkorReadIncLd], (instrs LD4Fourv2d)>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd],
(instrs LD4Fourv2d_POST)>;
def : InstRW<[FalkorWr_4LD_3cyc, FalkorReadIncLd], (instregex "^LD4Rv(16b|8h|4s|2d)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd],
(instregex "^LD4Rv(16b|8h|4s|2d)_POST$")>;
def : InstRW<[FalkorWr_1LD_4VXVY_4cyc, FalkorReadIncLd], (instregex "^LD4i(8|16|32)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_4VXVY_4cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_4VXVY_4cyc, FalkorReadIncLd],
(instregex "^LD4i(8|16|32)_POST$")>;
def : InstRW<[FalkorWr_2LD_2VXVY_1none_4cyc, FalkorReadIncLd],
(instregex "^LD3Threev(8b|4h|2s)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_1none_4cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_2LD_2VXVY_1none_4cyc, FalkorReadIncLd],
(instregex "^LD3Threev(8b|4h|2s)_POST$")>;
def : InstRW<[FalkorWr_2LD_2VXVY_2none_4cyc, FalkorReadIncLd],
(instregex "^LD4Fourv(8b|4h|2s)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_2none_4cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_2LD_2VXVY_2none_4cyc, FalkorReadIncLd],
(instregex "^LD4Fourv(8b|4h|2s)_POST$")>;
def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc, FalkorReadIncLd],
@@ -877,10 +881,10 @@ def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc, FalkorReadIncLd],
def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc, FalkorReadIncLd],
(instregex "^LD4Fourv(16b|8h|4s)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc, FalkorReadIncLd],
(instregex "^LD3Threev(16b|8h|4s)_POST$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc, FalkorReadIncLd],
(instregex "^LD4Fourv(16b|8h|4s)_POST$")>;
// Arithmetic and Logical Instructions
@@ -965,17 +969,17 @@ def : InstRW<[FalkorWr_5VXVY_7cyc], (instregex "^TBX(v8i8Four|v16i8Four)$")>;
def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
(instregex "^STR(Q|D|S|H|B)ui$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+def : InstRW<[FalkorWr_StInc_none_2cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
(instregex "^STR(Q|D|S|H|B)(post|pre)$")>;
def : InstRW<[FalkorWr_STRVro, ReadDefault, FalkorReadIncSt],
(instregex "^STR(D|S|H|B)ro(W|X)$")>;
def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
(instregex "^STPQi$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+def : InstRW<[FalkorWr_StInc_none_2cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
(instregex "^STPQ(post|pre)$")>;
def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
(instregex "^STP(D|S)(i)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+def : InstRW<[FalkorWr_StInc_none_2cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
(instregex "^STP(D|S)(post|pre)$")>;
def : InstRW<[FalkorWr_STRQro, ReadDefault, FalkorReadIncSt],
(instregex "^STRQro(W|X)$")>;
@@ -988,7 +992,7 @@ def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt]
def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
(instregex "^ST1(One(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64)|One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+def : InstRW<[FalkorWr_StInc_none_2cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
(instregex "^ST1(One(v8b|v4h|v2s|v1d)_POST|(i8|i16|i32|i64)_POST)$")>;
def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
(instregex "^ST2(Two(v8b|v4h|v2s)|(i8|i16|i32|i64))$")>;
@@ -1087,7 +1091,7 @@ def : InstRW<[FalkorWr_4VXVY_3cyc], (instrs SHA256SU1rrr)>;
// -----------------------------------------------------------------------------
def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd],
(instregex "^LDR((Q|D|S|H|B)ui|(Q|D|S)l)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
(instregex "^LDR(Q|D|S|H|B)(post|pre)$")>;
def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd],
(instregex "^LDUR(Q|D|S|H|B)i$")>;
@@ -1101,9 +1105,9 @@ def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
(instregex "LDNP(D|S)i$")>;
def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
(instregex "LDP(D|S)i$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
(instregex "LDP(D|S)(pre|post)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_2LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
(instregex "^LDPQ(pre|post)$")>;
// FP Data Processing Instructions
@@ -1165,11 +1169,11 @@ def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
(instregex "^LDNP(W|X)i$")>;
def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
(instregex "^LDP(W|X)i$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
(instregex "^LDP(W|X)(post|pre)$")>;
def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd],
(instregex "^LDR(BB|HH|W|X)ui$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
(instregex "^LDR(BB|HH|W|X)(post|pre)$")>;
def : InstRW<[FalkorWr_LDRro, FalkorReadIncLd],
(instregex "^LDR(BB|HH|W|X)ro(W|X)$")>;
@@ -1182,11 +1186,11 @@ def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd],
def : InstRW<[FalkorWr_PRFMro], (instregex "^PRFMro(W|X)$")>;
def : InstRW<[FalkorWr_1LD_4cyc, FalkorWr_none_4cyc, FalkorReadIncLd],
(instrs LDPSWi)>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_4cyc, FalkorWr_none_4cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_4cyc, FalkorWr_none_4cyc, FalkorReadIncLd],
(instregex "^LDPSW(post|pre)$")>;
def : InstRW<[FalkorWr_1LD_4cyc, FalkorReadIncLd],
(instregex "^LDRS(BW|BX|HW|HX|W)ui$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_4cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_4cyc, FalkorReadIncLd],
(instregex "^LDRS(BW|BX|HW|HX|W)(post|pre)$")>;
def : InstRW<[FalkorWr_LDRSro, FalkorReadIncLd],
(instregex "^LDRS(BW|BX|HW|HX|W)ro(W|X)$")>;
@@ -1273,11 +1277,11 @@ def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc, ReadDefault, ReadDefault, FalkorReadInc
// -----------------------------------------------------------------------------
def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
(instregex "^STP(W|X)i$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1SD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+def : InstRW<[FalkorWr_StInc_none_2cyc, FalkorWr_1SD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
(instregex "^STP(W|X)(post|pre)$")>;
def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
(instregex "^STR(BB|HH|W|X)ui$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+def : InstRW<[FalkorWr_StInc_none_2cyc, FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
(instregex "^STR(BB|HH|W|X)(post|pre)$")>;
def : InstRW<[FalkorWr_STRro, ReadDefault, FalkorReadIncSt],
(instregex "^STR(BB|HH|W|X)ro(W|X)$")>;
diff --git a/lib/Target/AArch64/AArch64SchedKryo.td b/lib/Target/AArch64/AArch64SchedKryo.td
index ce2afd499afb..68de3e077c96 100644
--- a/lib/Target/AArch64/AArch64SchedKryo.td
+++ b/lib/Target/AArch64/AArch64SchedKryo.td
@@ -29,6 +29,9 @@ def KryoModel : SchedMachineModel {
let CompleteModel = 1;
list<Predicate> UnsupportedFeatures = [HasSVE];
+
+ // FIXME: Remove when all errors have been fixed.
+ let FullInstRWOverlapCheck = 0;
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64SchedThunderX.td b/lib/Target/AArch64/AArch64SchedThunderX.td
index 585688aae279..fbbd3850d0fd 100644
--- a/lib/Target/AArch64/AArch64SchedThunderX.td
+++ b/lib/Target/AArch64/AArch64SchedThunderX.td
@@ -27,6 +27,9 @@ def ThunderXT8XModel : SchedMachineModel {
let CompleteModel = 1;
list<Predicate> UnsupportedFeatures = [HasSVE];
+
+ // FIXME: Remove when all errors have been fixed.
+ let FullInstRWOverlapCheck = 0;
}
// Modeling each pipeline with BufferSize == 0 since T8X is in-order.
diff --git a/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/lib/Target/AArch64/AArch64SchedThunderX2T99.td
index 22f272edd680..bee3392b6d3b 100644
--- a/lib/Target/AArch64/AArch64SchedThunderX2T99.td
+++ b/lib/Target/AArch64/AArch64SchedThunderX2T99.td
@@ -27,8 +27,13 @@ def ThunderX2T99Model : SchedMachineModel {
let CompleteModel = 1;
list<Predicate> UnsupportedFeatures = [HasSVE];
+
+ // FIXME: Remove when all errors have been fixed.
+ let FullInstRWOverlapCheck = 0;
}
+let SchedModel = ThunderX2T99Model in {
+
// Define the issue ports.
// Port 0: ALU, FP/SIMD.
@@ -49,8 +54,6 @@ def THX2T99P4 : ProcResource<1>;
// Port 5: Load/store.
def THX2T99P5 : ProcResource<1>;
-let SchedModel = ThunderX2T99Model in {
-
// Define groups for the functional units on each issue port. Each group
// created will be used by a WriteRes later on.
//
@@ -359,13 +362,10 @@ def : ReadAdvance<ReadID, 0>;
def : ReadAdvance<ReadExtrHi, 0>;
def : ReadAdvance<ReadAdrBase, 0>;
def : ReadAdvance<ReadVLD, 0>;
-}
//===----------------------------------------------------------------------===//
// 3. Instruction Tables.
-let SchedModel = ThunderX2T99Model in {
-
//---
// 3.1 Branch Instructions
//---
@@ -391,7 +391,7 @@ def : WriteRes<WriteBarrier, []> { let Latency = 1; }
def : WriteRes<WriteHint, []> { let Latency = 1; }
def : WriteRes<WriteAtomic, []> {
- let Unsupported = 1;
+ let Latency = 4;
let NumMicroOps = 2;
}
@@ -416,63 +416,63 @@ def : InstRW<[THX2T99Write_1Cyc_I2],
// Address generation
def : WriteRes<WriteI, [THX2T99I012]> {
let Latency = 1;
- let ResourceCycles = [1, 3];
+ let ResourceCycles = [1];
let NumMicroOps = 2;
}
def : InstRW<[WriteI],
(instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?",
"AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)",
- "ADC?(W|X)r(i|r|s|x)", "ADCS?(W|X)r(i|r|s|x)",
+ "ADC(W|X)r",
"BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)",
"EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)",
"ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)",
- "SUBS?(W|X)r(i|r|s|x)", "SBC?(W|X)r(i|r|s|x)",
- "SBCS?(W|X)r(i|r|s|x)", "CCMN?(W|X)r(i|r|s|x)",
- "CCMP?(W|X)r(i|r|s|x)", "CSEL?(W|X)r(i|r|s|x)",
- "CSINC?(W|X)r(i|r|s|x)", "CSINV?(W|X)r(i|r|s|x)",
- "CSNEG?(W|X)r(i|r|s|x)")>;
+ "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r",
+ "SBCS(W|X)r", "CCMN(W|X)(i|r)",
+ "CCMP(W|X)(i|r)", "CSEL(W|X)r",
+ "CSINC(W|X)r", "CSINV(W|X)r",
+ "CSNEG(W|X)r")>;
def : InstRW<[WriteI], (instrs COPY)>;
// ALU, extend and/or shift
def : WriteRes<WriteISReg, [THX2T99I012]> {
let Latency = 2;
- let ResourceCycles = [2, 3];
+ let ResourceCycles = [2];
let NumMicroOps = 2;
}
def : InstRW<[WriteISReg],
(instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?",
"AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)",
- "ADC?(W|X)r(i|r|s|x)", "ADCS?(W|X)r(i|r|s|x)",
+ "ADC(W|X)r",
"BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)",
"EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)",
"ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)",
- "SUBS?(W|X)r(i|r|s|x)", "SBC?(W|X)r(i|r|s|x)",
- "SBCS?(W|X)r(i|r|s|x)", "CCMN?(W|X)r(i|r|s|x)",
- "CCMP?(W|X)r(i|r|s|x)", "CSEL?(W|X)r(i|r|s|x)",
- "CSINC?(W|X)r(i|r|s|x)", "CSINV?(W|X)r(i|r|s|x)",
- "CSNEG?(W|X)r(i|r|s|x)")>;
+ "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r",
+ "SBCS(W|X)r", "CCMN(W|X)(i|r)",
+ "CCMP(W|X)(i|r)", "CSEL(W|X)r",
+ "CSINC(W|X)r", "CSINV(W|X)r",
+ "CSNEG(W|X)r")>;
def : WriteRes<WriteIEReg, [THX2T99I012]> {
let Latency = 1;
- let ResourceCycles = [1, 3];
+ let ResourceCycles = [1];
let NumMicroOps = 2;
}
def : InstRW<[WriteIEReg],
(instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?",
"AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)",
- "ADC?(W|X)r(i|r|s|x)", "ADCS?(W|X)r(i|r|s|x)",
+ "ADC(W|X)r",
"BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)",
"EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)",
"ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)",
- "SUBS?(W|X)r(i|r|s|x)", "SBC?(W|X)r(i|r|s|x)",
- "SBCS?(W|X)r(i|r|s|x)", "CCMN?(W|X)r(i|r|s|x)",
- "CCMP?(W|X)r(i|r|s|x)", "CSEL?(W|X)r(i|r|s|x)",
- "CSINC?(W|X)r(i|r|s|x)", "CSINV?(W|X)r(i|r|s|x)",
- "CSNEG?(W|X)r(i|r|s|x)")>;
+ "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r",
+ "SBCS(W|X)r", "CCMN(W|X)(i|r)",
+ "CCMP(W|X)(i|r)", "CSEL(W|X)r",
+ "CSINC(W|X)r", "CSINV(W|X)r",
+ "CSNEG(W|X)r")>;
// Move immed
def : WriteRes<WriteImm, [THX2T99I012]> {
@@ -500,14 +500,14 @@ def : WriteRes<WriteIS, [THX2T99I012]> {
// Latency range of 13-23/13-39.
def : WriteRes<WriteID32, [THX2T99I1]> {
let Latency = 39;
- let ResourceCycles = [13, 39];
+ let ResourceCycles = [39];
let NumMicroOps = 4;
}
// Divide, X-form
def : WriteRes<WriteID64, [THX2T99I1]> {
let Latency = 23;
- let ResourceCycles = [13, 23];
+ let ResourceCycles = [23];
let NumMicroOps = 4;
}
@@ -1147,7 +1147,7 @@ def : InstRW<[THX2T99XWriteFDivSP], (instrs FDIVSrr)>;
def : InstRW<[THX2T99XWriteFSqrtSP], (instrs FSQRTSr)>;
def : InstRW<[THX2T99XWriteFDivSP], (instregex "^FDIVv.*32$")>;
def : InstRW<[THX2T99XWriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
-def : InstRW<[THX2T99Write_16Cyc_F01], (instregex "^FDIVSrr", "^FSQRTSrr")>;
+def : InstRW<[THX2T99Write_16Cyc_F01], (instregex "^FDIVSrr", "^FSQRTSr")>;
// FP divide, D-form
// FP square root, D-form
@@ -1155,7 +1155,7 @@ def : InstRW<[THX2T99XWriteFDivDP], (instrs FDIVDrr)>;
def : InstRW<[THX2T99XWriteFSqrtDP], (instrs FSQRTDr)>;
def : InstRW<[THX2T99XWriteFDivDP], (instregex "^FDIVv.*64$")>;
def : InstRW<[THX2T99XWriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
-def : InstRW<[THX2T99Write_23Cyc_F01], (instregex "^FDIVDrr", "^FSQRTDrr")>;
+def : InstRW<[THX2T99Write_23Cyc_F01], (instregex "^FDIVDrr", "^FSQRTDr")>;
// FP multiply
// FP multiply accumulate
@@ -1252,17 +1252,17 @@ def : InstRW<[THX2T99Write_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>;
def : WriteRes<WriteV, [THX2T99F01]> {
let Latency = 7;
let NumMicroOps = 4;
- let ResourceCycles = [4, 23];
+ let ResourceCycles = [4];
}
// ASIMD arith, reduce, 4H/4S
// ASIMD arith, reduce, 8B/8H
// ASIMD arith, reduce, 16B
-// ASIMD logical (MOV, MVN, ORN, ORR)
+// ASIMD logical (MVN (alias for NOT), ORN, ORR)
def : InstRW<[THX2T99Write_5Cyc_F01],
- (instregex "^ANDv", "^BICv", "^EORv", "^MOVv", "^MVNv",
- "^ORRv", "^ORNv", "^NOTv")>;
+ (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>;
+
// ASIMD arith, reduce
def : InstRW<[THX2T99Write_10Cyc_F01],
(instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>;
@@ -1513,7 +1513,7 @@ def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^INSv")>;
def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^[SU]MOVv")>;
// ASIMD move, integer immed
-def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^MOVIv", "^MOVIDv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^MOVIv")>;
// ASIMD move, FP immed
def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMOVv")>;
diff --git a/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/lib/Target/AArch64/AArch64StorePairSuppress.cpp
index 571e61d7083c..fc7b5984fe3e 100644
--- a/lib/Target/AArch64/AArch64StorePairSuppress.cpp
+++ b/lib/Target/AArch64/AArch64StorePairSuppress.cpp
@@ -91,9 +91,9 @@ bool AArch64StorePairSuppress::shouldAddSTPToBlock(const MachineBasicBlock *BB)
if (SCDesc->isValid() && !SCDesc->isVariant()) {
unsigned ResLenWithSTP = BBTrace.getResourceLength(None, SCDesc);
if (ResLenWithSTP > ResLength) {
- DEBUG(dbgs() << " Suppress STP in BB: " << BB->getNumber()
- << " resources " << ResLength << " -> " << ResLenWithSTP
- << "\n");
+ LLVM_DEBUG(dbgs() << " Suppress STP in BB: " << BB->getNumber()
+ << " resources " << ResLength << " -> " << ResLenWithSTP
+ << "\n");
return false;
}
}
@@ -127,14 +127,14 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) {
TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
TRI = ST.getRegisterInfo();
MRI = &MF.getRegInfo();
- SchedModel.init(ST.getSchedModel(), &ST, TII);
+ SchedModel.init(&ST);
Traces = &getAnalysis<MachineTraceMetrics>();
MinInstr = nullptr;
- DEBUG(dbgs() << "*** " << getPassName() << ": " << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << MF.getName() << '\n');
if (!SchedModel.hasInstrSchedModel()) {
- DEBUG(dbgs() << " Skipping pass: no machine model present.\n");
+ LLVM_DEBUG(dbgs() << " Skipping pass: no machine model present.\n");
return false;
}
@@ -156,7 +156,7 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) {
if (!SuppressSTP && shouldAddSTPToBlock(MI.getParent()))
break;
// Otherwise, continue unpairing the stores in this block.
- DEBUG(dbgs() << "Unpairing store " << MI << "\n");
+ LLVM_DEBUG(dbgs() << "Unpairing store " << MI << "\n");
SuppressSTP = true;
TII->suppressLdStPair(MI);
}
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index 688bb936d0ca..04bb90d30d6d 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/TargetParser.h"
using namespace llvm;
@@ -82,6 +83,12 @@ void AArch64Subtarget::initializeProperties() {
PrefFunctionAlignment = 4;
PrefLoopAlignment = 3;
break;
+ case ExynosM3:
+ MaxInterleaveFactor = 4;
+ MaxJumpTableSize = 20;
+ PrefFunctionAlignment = 5;
+ PrefLoopAlignment = 4;
+ break;
case Falkor:
MaxInterleaveFactor = 4;
// FIXME: remove this to enable 64-bit SLP if performance looks good.
@@ -145,7 +152,7 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
const std::string &FS,
const TargetMachine &TM, bool LittleEndian)
: AArch64GenSubtargetInfo(TT, CPU, FS),
- ReserveX18(TT.isOSDarwin() || TT.isOSWindows()), IsLittle(LittleEndian),
+ ReserveX18(AArch64::isX18ReservedByDefault(TT)), IsLittle(LittleEndian),
TargetTriple(TT), FrameLowering(),
InstrInfo(initializeSubtargetDependencies(FS, CPU)), TSInfo(),
TLInfo(TM, *this) {
@@ -189,15 +196,18 @@ AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
return AArch64II::MO_GOT;
+ unsigned Flags = GV->hasDLLImportStorageClass() ? AArch64II::MO_DLLIMPORT
+ : AArch64II::MO_NO_FLAG;
+
if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
- return AArch64II::MO_GOT;
+ return AArch64II::MO_GOT | Flags;
// The small code model's direct accesses use ADRP, which cannot
// necessarily produce the value 0 (if the code is above 4GB).
if (useSmallAddressing() && GV->hasExternalWeakLinkage())
- return AArch64II::MO_GOT;
+ return AArch64II::MO_GOT | Flags;
- return AArch64II::MO_NO_FLAG;
+ return Flags;
}
unsigned char AArch64Subtarget::classifyGlobalFunctionReference(
@@ -250,3 +260,13 @@ std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
return balanceFPOps() ? llvm::make_unique<A57ChainingConstraint>() : nullptr;
}
+
+void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
+ // We usually compute max call frame size after ISel. Do the computation now
+ // if the .mir file didn't specify it. Note that this will probably give you
+ // bogus values after PEI has eliminated the call frame setup/destroy pseudo
+ // instructions; specify it explicitly if you need it to be correct.
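+ // For example, a hand-written .mir test can pin the value itself through the
+ // frame-info block (illustrative values; field names follow the MIR
+ // serialization format):
+ //   frameInfo:
+ //     maxCallFrameSize: 16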
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (!MFI.isMaxCallFrameSizeComputed())
+ MFI.computeMaxCallFrameSize(MF);
+}
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index 9245b2f396b7..5af4c0dd9c19 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -48,6 +48,7 @@ public:
CortexA75,
Cyclone,
ExynosM1,
+ ExynosM3,
Falkor,
Kryo,
Saphira,
@@ -65,6 +66,7 @@ protected:
bool HasV8_1aOps = false;
bool HasV8_2aOps = false;
bool HasV8_3aOps = false;
+ bool HasV8_4aOps = false;
bool HasFPARMv8 = false;
bool HasNEON = false;
@@ -77,9 +79,18 @@ protected:
bool HasPerfMon = false;
bool HasFullFP16 = false;
bool HasSPE = false;
+
+ // ARMv8.4 Crypto extensions
+ bool HasSM4 = true;
+ bool HasSHA3 = true;
+
+ bool HasSHA2 = true;
+ bool HasAES = true;
+
bool HasLSLFast = false;
bool HasSVE = false;
bool HasRCPC = false;
+ bool HasAggressiveFMA = false;
// HasZeroCycleRegMove - Has zero-cycle register mov instructions.
bool HasZeroCycleRegMove = false;
@@ -101,6 +112,7 @@ protected:
bool PredictableSelectIsExpensive = false;
bool BalanceFPOps = false;
bool CustomAsCheapAsMove = false;
+ bool ExynosAsCheapAsMove = false;
bool UsePostRAScheduler = false;
bool Misaligned128StoreIsSlow = false;
bool Paired128IsSlow = false;
@@ -108,7 +120,9 @@ protected:
bool UseAlternateSExtLoadCVTF32Pattern = false;
bool HasArithmeticBccFusion = false;
bool HasArithmeticCbzFusion = false;
+ bool HasFuseAddress = false;
bool HasFuseAES = false;
+ bool HasFuseCCSelect = false;
bool HasFuseLiterals = false;
bool DisableLatencySchedHeuristic = false;
bool UseRSqrt = false;
@@ -126,6 +140,9 @@ protected:
// ReserveX18 - X18 is not available as a general purpose register.
bool ReserveX18;
+ // ReserveX20 - X20 is not available as a general purpose register.
+ bool ReserveX20 = false;
+
bool IsLittle;
/// TargetTriple - What processor and OS we're targeting.
@@ -193,6 +210,7 @@ public:
bool hasV8_1aOps() const { return HasV8_1aOps; }
bool hasV8_2aOps() const { return HasV8_2aOps; }
bool hasV8_3aOps() const { return HasV8_3aOps; }
+ bool hasV8_4aOps() const { return HasV8_4aOps; }
bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; }
@@ -211,6 +229,7 @@ public:
}
bool isX18Reserved() const { return ReserveX18; }
+ bool isX20Reserved() const { return ReserveX20; }
bool hasFPARMv8() const { return HasFPARMv8; }
bool hasNEON() const { return HasNEON; }
bool hasCrypto() const { return HasCrypto; }
@@ -219,11 +238,16 @@ public:
bool hasLSE() const { return HasLSE; }
bool hasRAS() const { return HasRAS; }
bool hasRDM() const { return HasRDM; }
+ bool hasSM4() const { return HasSM4; }
+ bool hasSHA3() const { return HasSHA3; }
+ bool hasSHA2() const { return HasSHA2; }
+ bool hasAES() const { return HasAES; }
bool balanceFPOps() const { return BalanceFPOps; }
bool predictableSelectIsExpensive() const {
return PredictableSelectIsExpensive;
}
bool hasCustomCheapAsMoveHandling() const { return CustomAsCheapAsMove; }
+ bool hasExynosCheapAsMoveHandling() const { return ExynosAsCheapAsMove; }
bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; }
bool isPaired128Slow() const { return Paired128IsSlow; }
bool isSTRQroSlow() const { return STRQroIsSlow; }
@@ -232,13 +256,15 @@ public:
}
bool hasArithmeticBccFusion() const { return HasArithmeticBccFusion; }
bool hasArithmeticCbzFusion() const { return HasArithmeticCbzFusion; }
+ bool hasFuseAddress() const { return HasFuseAddress; }
bool hasFuseAES() const { return HasFuseAES; }
+ bool hasFuseCCSelect() const { return HasFuseCCSelect; }
bool hasFuseLiterals() const { return HasFuseLiterals; }
- /// \brief Return true if the CPU supports any kind of instruction fusion.
+ /// Return true if the CPU supports any kind of instruction fusion.
bool hasFusion() const {
return hasArithmeticBccFusion() || hasArithmeticCbzFusion() ||
- hasFuseAES() || hasFuseLiterals();
+ hasFuseAES() || hasFuseCCSelect() || hasFuseLiterals();
}
bool useRSqrt() const { return UseRSqrt; }
@@ -269,6 +295,7 @@ public:
bool hasLSLFast() const { return HasLSLFast; }
bool hasSVE() const { return HasSVE; }
bool hasRCPC() const { return HasRCPC; }
+ bool hasAggressiveFMA() const { return HasAggressiveFMA; }
bool isLittleEndian() const { return IsLittle; }
@@ -326,6 +353,8 @@ public:
return false;
}
}
+
+ void mirFileLoaded(MachineFunction &MF) const override;
};
} // End llvm namespace
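
A hedged sketch of how hasFusion() is typically consumed, paraphrased from the target machine's scheduler setup rather than taken from this patch: the macro-fusion DAG mutation is only registered when the subtarget reports at least one fusion capability, so adding hasFuseCCSelect() here is what lets the new fusion kind reach the scheduler.

  // Sketch: register the AArch64 macro-fusion mutation only when some fusion
  // kind (Bcc, CBZ, AES, CCSelect, or literals) is available on this CPU.
  if (ST.hasFusion())
    DAG->addMutation(createAArch64MacroFusionDAGMutation());
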
diff --git a/lib/Target/AArch64/AArch64SystemOperands.td b/lib/Target/AArch64/AArch64SystemOperands.td
index 66b7e02ceb99..8acd32533eea 100644
--- a/lib/Target/AArch64/AArch64SystemOperands.td
+++ b/lib/Target/AArch64/AArch64SystemOperands.td
@@ -143,6 +143,23 @@ class ISB<string name, bits<4> encoding> : SearchableTable{
def : ISB<"sy", 0xf>;
//===----------------------------------------------------------------------===//
+// TSB (Trace synchronization barrier) instruction options.
+//===----------------------------------------------------------------------===//
+
+class TSB<string name, bits<4> encoding> : SearchableTable{
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<4> Encoding;
+ let Encoding = encoding;
+
+ code Requires = [{ {AArch64::HasV8_4aOps} }];
+}
+
+def : TSB<"csync", 0>;
+
+//===----------------------------------------------------------------------===//
// PRFM (prefetch) instruction options.
//===----------------------------------------------------------------------===//
@@ -175,6 +192,87 @@ def : PRFM<"pstl3keep", 0x14>;
def : PRFM<"pstl3strm", 0x15>;
//===----------------------------------------------------------------------===//
+// SVE Prefetch instruction options.
+//===----------------------------------------------------------------------===//
+
+class SVEPRFM<string name, bits<4> encoding> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<4> Encoding;
+ let Encoding = encoding;
+ code Requires = [{ {} }];
+}
+
+let Requires = [{ {AArch64::FeatureSVE} }] in {
+def : SVEPRFM<"pldl1keep", 0x00>;
+def : SVEPRFM<"pldl1strm", 0x01>;
+def : SVEPRFM<"pldl2keep", 0x02>;
+def : SVEPRFM<"pldl2strm", 0x03>;
+def : SVEPRFM<"pldl3keep", 0x04>;
+def : SVEPRFM<"pldl3strm", 0x05>;
+def : SVEPRFM<"pstl1keep", 0x08>;
+def : SVEPRFM<"pstl1strm", 0x09>;
+def : SVEPRFM<"pstl2keep", 0x0a>;
+def : SVEPRFM<"pstl2strm", 0x0b>;
+def : SVEPRFM<"pstl3keep", 0x0c>;
+def : SVEPRFM<"pstl3strm", 0x0d>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Predicate patterns
+//===----------------------------------------------------------------------===//
+
+class SVEPREDPAT<string name, bits<5> encoding> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<5> Encoding;
+ let Encoding = encoding;
+}
+
+def : SVEPREDPAT<"pow2", 0x00>;
+def : SVEPREDPAT<"vl1", 0x01>;
+def : SVEPREDPAT<"vl2", 0x02>;
+def : SVEPREDPAT<"vl3", 0x03>;
+def : SVEPREDPAT<"vl4", 0x04>;
+def : SVEPREDPAT<"vl5", 0x05>;
+def : SVEPREDPAT<"vl6", 0x06>;
+def : SVEPREDPAT<"vl7", 0x07>;
+def : SVEPREDPAT<"vl8", 0x08>;
+def : SVEPREDPAT<"vl16", 0x09>;
+def : SVEPREDPAT<"vl32", 0x0a>;
+def : SVEPREDPAT<"vl64", 0x0b>;
+def : SVEPREDPAT<"vl128", 0x0c>;
+def : SVEPREDPAT<"vl256", 0x0d>;
+def : SVEPREDPAT<"mul4", 0x1d>;
+def : SVEPREDPAT<"mul3", 0x1e>;
+def : SVEPREDPAT<"all", 0x1f>;
+
+//===----------------------------------------------------------------------===//
+// Exact FP Immediates.
+//
+// These definitions are used to create a lookup table with FP Immediates that
+// is used for a few instructions that only accept a limited set of exact FP
+// immediate values.
+//===----------------------------------------------------------------------===//
+class ExactFPImm<string name, string repr, bits<4> enum > : SearchableTable {
+ let SearchableFields = ["Enum", "Repr"];
+ let EnumValueField = "Enum";
+
+ string Name = name;
+ bits<4> Enum = enum;
+ string Repr = repr;
+}
+
+def : ExactFPImm<"zero", "0.0", 0x0>;
+def : ExactFPImm<"half", "0.5", 0x1>;
+def : ExactFPImm<"one", "1.0", 0x2>;
+def : ExactFPImm<"two", "2.0", 0x3>;
+
+//===----------------------------------------------------------------------===//
// PState instruction options.
//===----------------------------------------------------------------------===//
@@ -197,7 +295,9 @@ def : PState<"PAN", 0b00100>;
// v8.2a "User Access Override" extension-specific PStates
let Requires = [{ {AArch64::HasV8_2aOps} }] in
def : PState<"UAO", 0b00011>;
-
+// v8.4a timing insensitivity of data processing instructions
+let Requires = [{ {AArch64::HasV8_4aOps} }] in
+def : PState<"DIT", 0b11010>;
//===----------------------------------------------------------------------===//
// PSB instruction options.
@@ -230,6 +330,7 @@ class TLBI<string name, bits<3> op1, bits<4> crn, bits<4> crm,
let Encoding{6-3} = crm;
let Encoding{2-0} = op2;
bit NeedsReg = needsreg;
+ code Requires = [{ {} }];
}
def : TLBI<"IPAS2E1IS", 0b100, 0b1000, 0b0000, 0b001>;
@@ -265,6 +366,59 @@ def : TLBI<"VALE3", 0b110, 0b1000, 0b0111, 0b101>;
def : TLBI<"VMALLS12E1", 0b100, 0b1000, 0b0111, 0b110, 0>;
def : TLBI<"VAALE1", 0b000, 0b1000, 0b0111, 0b111>;
+// Armv8.4-A Outer Shareable TLB Maintenance instructions:
+let Requires = [{ {AArch64::HasV8_4aOps} }] in {
+// op1 CRn CRm op2
+def : TLBI<"VMALLE1OS", 0b000, 0b1000, 0b0001, 0b000, 0>;
+def : TLBI<"VAE1OS", 0b000, 0b1000, 0b0001, 0b001>;
+def : TLBI<"ASIDE1OS", 0b000, 0b1000, 0b0001, 0b010>;
+def : TLBI<"VAAE1OS", 0b000, 0b1000, 0b0001, 0b011>;
+def : TLBI<"VALE1OS", 0b000, 0b1000, 0b0001, 0b101>;
+def : TLBI<"VAALE1OS", 0b000, 0b1000, 0b0001, 0b111>;
+def : TLBI<"IPAS2E1OS", 0b100, 0b1000, 0b0100, 0b000>;
+def : TLBI<"IPAS2LE1OS", 0b100, 0b1000, 0b0100, 0b100>;
+def : TLBI<"VAE2OS", 0b100, 0b1000, 0b0001, 0b001>;
+def : TLBI<"VALE2OS", 0b100, 0b1000, 0b0001, 0b101>;
+def : TLBI<"VMALLS12E1OS", 0b100, 0b1000, 0b0001, 0b110, 0>;
+def : TLBI<"VAE3OS", 0b110, 0b1000, 0b0001, 0b001>;
+def : TLBI<"VALE3OS", 0b110, 0b1000, 0b0001, 0b101>;
+def : TLBI<"ALLE2OS", 0b100, 0b1000, 0b0001, 0b000, 0>;
+def : TLBI<"ALLE1OS", 0b100, 0b1000, 0b0001, 0b100, 0>;
+def : TLBI<"ALLE3OS", 0b110, 0b1000, 0b0001, 0b000, 0>;
+
+// Armv8.4-A TLB Range Maintenance instructions:
+// op1 CRn CRm op2
+def : TLBI<"RVAE1", 0b000, 0b1000, 0b0110, 0b001>;
+def : TLBI<"RVAAE1", 0b000, 0b1000, 0b0110, 0b011>;
+def : TLBI<"RVALE1", 0b000, 0b1000, 0b0110, 0b101>;
+def : TLBI<"RVAALE1", 0b000, 0b1000, 0b0110, 0b111>;
+def : TLBI<"RVAE1IS", 0b000, 0b1000, 0b0010, 0b001>;
+def : TLBI<"RVAAE1IS", 0b000, 0b1000, 0b0010, 0b011>;
+def : TLBI<"RVALE1IS", 0b000, 0b1000, 0b0010, 0b101>;
+def : TLBI<"RVAALE1IS", 0b000, 0b1000, 0b0010, 0b111>;
+def : TLBI<"RVAE1OS", 0b000, 0b1000, 0b0101, 0b001>;
+def : TLBI<"RVAAE1OS", 0b000, 0b1000, 0b0101, 0b011>;
+def : TLBI<"RVALE1OS", 0b000, 0b1000, 0b0101, 0b101>;
+def : TLBI<"RVAALE1OS", 0b000, 0b1000, 0b0101, 0b111>;
+def : TLBI<"RIPAS2E1IS", 0b100, 0b1000, 0b0000, 0b010>;
+def : TLBI<"RIPAS2LE1IS", 0b100, 0b1000, 0b0000, 0b110>;
+def : TLBI<"RIPAS2E1", 0b100, 0b1000, 0b0100, 0b010>;
+def : TLBI<"RIPAS2LE1", 0b100, 0b1000, 0b0100, 0b110>;
+def : TLBI<"RIPAS2E1OS", 0b100, 0b1000, 0b0100, 0b011>;
+def : TLBI<"RIPAS2LE1OS", 0b100, 0b1000, 0b0100, 0b111>;
+def : TLBI<"RVAE2", 0b100, 0b1000, 0b0110, 0b001>;
+def : TLBI<"RVALE2", 0b100, 0b1000, 0b0110, 0b101>;
+def : TLBI<"RVAE2IS", 0b100, 0b1000, 0b0010, 0b001>;
+def : TLBI<"RVALE2IS", 0b100, 0b1000, 0b0010, 0b101>;
+def : TLBI<"RVAE2OS", 0b100, 0b1000, 0b0101, 0b001>;
+def : TLBI<"RVALE2OS", 0b100, 0b1000, 0b0101, 0b101>;
+def : TLBI<"RVAE3", 0b110, 0b1000, 0b0110, 0b001>;
+def : TLBI<"RVALE3", 0b110, 0b1000, 0b0110, 0b101>;
+def : TLBI<"RVAE3IS", 0b110, 0b1000, 0b0010, 0b001>;
+def : TLBI<"RVALE3IS", 0b110, 0b1000, 0b0010, 0b101>;
+def : TLBI<"RVAE3OS", 0b110, 0b1000, 0b0101, 0b001>;
+def : TLBI<"RVALE3OS", 0b110, 0b1000, 0b0101, 0b101>;
+}
//===----------------------------------------------------------------------===//
// MRS/MSR (system register read/write) instruction options.
@@ -420,7 +574,7 @@ def : ROSysReg<"ICC_HPPIR0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b010>;
def : ROSysReg<"ICC_RPR_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b011>;
def : ROSysReg<"ICH_VTR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b001>;
def : ROSysReg<"ICH_EISR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b011>;
-def : ROSysReg<"ICH_ELSR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b101>;
+def : ROSysReg<"ICH_ELRSR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b101>;
// v8.1a "Limited Ordering Regions" extension-specific system register
// Op0 Op1 CRn CRm Op2
@@ -1037,6 +1191,126 @@ def : RWSysReg<"APGAKeyLo_EL1", 0b11, 0b000, 0b0010, 0b0011, 0b000>;
def : RWSysReg<"APGAKeyHi_EL1", 0b11, 0b000, 0b0010, 0b0011, 0b001>;
}
+let Requires = [{ {AArch64::HasV8_4aOps} }] in {
+
+// v8.4a "Virtualization secure second stage translation" registers
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"VSTCR_EL2" , 0b11, 0b100, 0b0010, 0b0110, 0b010>;
+def : RWSysReg<"VSTTBR_EL2", 0b11, 0b100, 0b0010, 0b0110, 0b000>;
+
+// v8.4a "Virtualization timer" registers
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"CNTHVS_TVAL_EL2", 0b11, 0b100, 0b1110, 0b0100, 0b000>;
+def : RWSysReg<"CNTHVS_CVAL_EL2", 0b11, 0b100, 0b1110, 0b0100, 0b010>;
+def : RWSysReg<"CNTHVS_CTL_EL2", 0b11, 0b100, 0b1110, 0b0100, 0b001>;
+def : RWSysReg<"CNTHPS_TVAL_EL2", 0b11, 0b100, 0b1110, 0b0101, 0b000>;
+def : RWSysReg<"CNTHPS_CVAL_EL2", 0b11, 0b100, 0b1110, 0b0101, 0b010>;
+def : RWSysReg<"CNTHPS_CTL_EL2", 0b11, 0b100, 0b1110, 0b0101, 0b001>;
+
+// v8.4a "Virtualization debug state" registers
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"SDER32_EL2", 0b11, 0b100, 0b0001, 0b0011, 0b001>;
+
+// v8.4a RAS registers
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"ERXPFGCTL_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b101>;
+def : RWSysReg<"ERXPFGCDN_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b110>;
+def : RWSysReg<"ERXTS_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b111>;
+def : RWSysReg<"ERXMISC2_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b010>;
+def : RWSysReg<"ERXMISC3_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b011>;
+def : ROSysReg<"ERXPFGF_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b100>;
+
+// v8.4a MPAM registers
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"MPAM0_EL1", 0b11, 0b000, 0b1010, 0b0101, 0b001>;
+def : RWSysReg<"MPAM1_EL1", 0b11, 0b000, 0b1010, 0b0101, 0b000>;
+def : RWSysReg<"MPAM2_EL2", 0b11, 0b100, 0b1010, 0b0101, 0b000>;
+def : RWSysReg<"MPAM3_EL3", 0b11, 0b110, 0b1010, 0b0101, 0b000>;
+def : RWSysReg<"MPAM1_EL12", 0b11, 0b101, 0b1010, 0b0101, 0b000>;
+def : RWSysReg<"MPAMHCR_EL2", 0b11, 0b100, 0b1010, 0b0100, 0b000>;
+def : RWSysReg<"MPAMVPMV_EL2", 0b11, 0b100, 0b1010, 0b0100, 0b001>;
+def : RWSysReg<"MPAMVPM0_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b000>;
+def : RWSysReg<"MPAMVPM1_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b001>;
+def : RWSysReg<"MPAMVPM2_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b010>;
+def : RWSysReg<"MPAMVPM3_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b011>;
+def : RWSysReg<"MPAMVPM4_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b100>;
+def : RWSysReg<"MPAMVPM5_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b101>;
+def : RWSysReg<"MPAMVPM6_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b110>;
+def : RWSysReg<"MPAMVPM7_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b111>;
+def : ROSysReg<"MPAMIDR_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b100>;
+
+// v8.4a Activity monitor registers
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"AMCR_EL0", 0b11, 0b011, 0b1101, 0b0010, 0b000>;
+def : ROSysReg<"AMCFGR_EL0", 0b11, 0b011, 0b1101, 0b0010, 0b001>;
+def : ROSysReg<"AMCGCR_EL0", 0b11, 0b011, 0b1101, 0b0010, 0b010>;
+def : RWSysReg<"AMUSERENR_EL0", 0b11, 0b011, 0b1101, 0b0010, 0b011>;
+def : RWSysReg<"AMCNTENCLR0_EL0", 0b11, 0b011, 0b1101, 0b0010, 0b100>;
+def : RWSysReg<"AMCNTENSET0_EL0", 0b11, 0b011, 0b1101, 0b0010, 0b101>;
+def : RWSysReg<"AMEVCNTR00_EL0", 0b11, 0b011, 0b1101, 0b0100, 0b000>;
+def : RWSysReg<"AMEVCNTR01_EL0", 0b11, 0b011, 0b1101, 0b0100, 0b001>;
+def : RWSysReg<"AMEVCNTR02_EL0", 0b11, 0b011, 0b1101, 0b0100, 0b010>;
+def : RWSysReg<"AMEVCNTR03_EL0", 0b11, 0b011, 0b1101, 0b0100, 0b011>;
+def : ROSysReg<"AMEVTYPER00_EL0", 0b11, 0b011, 0b1101, 0b0110, 0b000>;
+def : ROSysReg<"AMEVTYPER01_EL0", 0b11, 0b011, 0b1101, 0b0110, 0b001>;
+def : ROSysReg<"AMEVTYPER02_EL0", 0b11, 0b011, 0b1101, 0b0110, 0b010>;
+def : ROSysReg<"AMEVTYPER03_EL0", 0b11, 0b011, 0b1101, 0b0110, 0b011>;
+def : RWSysReg<"AMCNTENCLR1_EL0", 0b11, 0b011, 0b1101, 0b0011, 0b000>;
+def : RWSysReg<"AMCNTENSET1_EL0", 0b11, 0b011, 0b1101, 0b0011, 0b001>;
+def : RWSysReg<"AMEVCNTR10_EL0", 0b11, 0b011, 0b1101, 0b1100, 0b000>;
+def : RWSysReg<"AMEVCNTR11_EL0", 0b11, 0b011, 0b1101, 0b1100, 0b001>;
+def : RWSysReg<"AMEVCNTR12_EL0", 0b11, 0b011, 0b1101, 0b1100, 0b010>;
+def : RWSysReg<"AMEVCNTR13_EL0", 0b11, 0b011, 0b1101, 0b1100, 0b011>;
+def : RWSysReg<"AMEVCNTR14_EL0", 0b11, 0b011, 0b1101, 0b1100, 0b100>;
+def : RWSysReg<"AMEVCNTR15_EL0", 0b11, 0b011, 0b1101, 0b1100, 0b101>;
+def : RWSysReg<"AMEVCNTR16_EL0", 0b11, 0b011, 0b1101, 0b1100, 0b110>;
+def : RWSysReg<"AMEVCNTR17_EL0", 0b11, 0b011, 0b1101, 0b1100, 0b111>;
+def : RWSysReg<"AMEVCNTR18_EL0", 0b11, 0b011, 0b1101, 0b1101, 0b000>;
+def : RWSysReg<"AMEVCNTR19_EL0", 0b11, 0b011, 0b1101, 0b1101, 0b001>;
+def : RWSysReg<"AMEVCNTR110_EL0", 0b11, 0b011, 0b1101, 0b1101, 0b010>;
+def : RWSysReg<"AMEVCNTR111_EL0", 0b11, 0b011, 0b1101, 0b1101, 0b011>;
+def : RWSysReg<"AMEVCNTR112_EL0", 0b11, 0b011, 0b1101, 0b1101, 0b100>;
+def : RWSysReg<"AMEVCNTR113_EL0", 0b11, 0b011, 0b1101, 0b1101, 0b101>;
+def : RWSysReg<"AMEVCNTR114_EL0", 0b11, 0b011, 0b1101, 0b1101, 0b110>;
+def : RWSysReg<"AMEVCNTR115_EL0", 0b11, 0b011, 0b1101, 0b1101, 0b111>;
+def : RWSysReg<"AMEVTYPER10_EL0", 0b11, 0b011, 0b1101, 0b1110, 0b000>;
+def : RWSysReg<"AMEVTYPER11_EL0", 0b11, 0b011, 0b1101, 0b1110, 0b001>;
+def : RWSysReg<"AMEVTYPER12_EL0", 0b11, 0b011, 0b1101, 0b1110, 0b010>;
+def : RWSysReg<"AMEVTYPER13_EL0", 0b11, 0b011, 0b1101, 0b1110, 0b011>;
+def : RWSysReg<"AMEVTYPER14_EL0", 0b11, 0b011, 0b1101, 0b1110, 0b100>;
+def : RWSysReg<"AMEVTYPER15_EL0", 0b11, 0b011, 0b1101, 0b1110, 0b101>;
+def : RWSysReg<"AMEVTYPER16_EL0", 0b11, 0b011, 0b1101, 0b1110, 0b110>;
+def : RWSysReg<"AMEVTYPER17_EL0", 0b11, 0b011, 0b1101, 0b1110, 0b111>;
+def : RWSysReg<"AMEVTYPER18_EL0", 0b11, 0b011, 0b1101, 0b1111, 0b000>;
+def : RWSysReg<"AMEVTYPER19_EL0", 0b11, 0b011, 0b1101, 0b1111, 0b001>;
+def : RWSysReg<"AMEVTYPER110_EL0", 0b11, 0b011, 0b1101, 0b1111, 0b010>;
+def : RWSysReg<"AMEVTYPER111_EL0", 0b11, 0b011, 0b1101, 0b1111, 0b011>;
+def : RWSysReg<"AMEVTYPER112_EL0", 0b11, 0b011, 0b1101, 0b1111, 0b100>;
+def : RWSysReg<"AMEVTYPER113_EL0", 0b11, 0b011, 0b1101, 0b1111, 0b101>;
+def : RWSysReg<"AMEVTYPER114_EL0", 0b11, 0b011, 0b1101, 0b1111, 0b110>;
+def : RWSysReg<"AMEVTYPER115_EL0", 0b11, 0b011, 0b1101, 0b1111, 0b111>;
+
+// v8.4a Trace Extension registers
+//
+// Please note that the 8.4 spec also defines these registers:
+// TRCIDR1, ID_DFR0_EL1, ID_AA64DFR0_EL1, MDSCR_EL1, MDCR_EL2, and MDCR_EL3,
+// but they are already defined above.
+//
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"TRFCR_EL1", 0b11, 0b000, 0b0001, 0b0010, 0b001>;
+def : RWSysReg<"TRFCR_EL2", 0b11, 0b100, 0b0001, 0b0010, 0b001>;
+def : RWSysReg<"TRFCR_EL12", 0b11, 0b101, 0b0001, 0b0010, 0b001>;
+
+// v8.4a Timing insensitivity of data processing instructions
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"DIT", 0b11, 0b011, 0b0100, 0b0010, 0b101>;
+
+// v8.4a Enhanced Support for Nested Virtualization
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"VNCR_EL2", 0b11, 0b100, 0b0010, 0b0010, 0b000>;
+
+} // HasV8_4aOps
+
// Cyclone specific system registers
// Op0 Op1 CRn CRm Op2
let Requires = [{ {AArch64::ProcCyclone} }] in
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index 0e6ad944c141..01a997e5aed7 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -27,7 +27,6 @@
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
@@ -36,6 +35,7 @@
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Scalar.h"
#include <memory>
@@ -136,7 +136,7 @@ static cl::opt<bool>
static cl::opt<int> EnableGlobalISelAtO(
"aarch64-enable-global-isel-at-O", cl::Hidden,
cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"),
- cl::init(-1));
+ cl::init(0));
static cl::opt<bool> EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix",
cl::init(true), cl::Hidden);
@@ -243,6 +243,18 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT,
getEffectiveCodeModel(TT, CM, JIT), OL),
TLOF(createTLOF(getTargetTriple())), isLittle(LittleEndian) {
initAsmInfo();
+
+ if (TT.isOSBinFormatMachO()) {
+ this->Options.TrapUnreachable = true;
+ this->Options.NoTrapAfterNoreturn = true;
+ }
+
+ // Enable GlobalISel at or below EnableGlobalISelAtO.
+ if (getOptLevel() <= EnableGlobalISelAtO)
+ setGlobalISel(true);
+
+ // AArch64 supports the MachineOutliner.
+ setMachineOutliner(true);
}
AArch64TargetMachine::~AArch64TargetMachine() = default;
@@ -340,8 +352,6 @@ public:
void addPostRegAlloc() override;
void addPreSched2() override;
void addPreEmitPass() override;
-
- bool isGlobalISelEnabled() const override;
};
} // end anonymous namespace
@@ -387,7 +397,7 @@ void AArch64PassConfig::addIRPasses() {
// Call SeparateConstOffsetFromGEP pass to extract constants within indices
// and lower a GEP with multiple indices to either arithmetic operations or
// multiple GEPs with single index.
- addPass(createSeparateConstOffsetFromGEPPass(TM, true));
+ addPass(createSeparateConstOffsetFromGEPPass(true));
// Call EarlyCSE pass to find and remove subexpressions in the lowered
// result.
addPass(createEarlyCSEPass());
@@ -455,10 +465,6 @@ bool AArch64PassConfig::addGlobalInstructionSelect() {
return false;
}
-bool AArch64PassConfig::isGlobalISelEnabled() const {
- return TM->getOptLevel() <= EnableGlobalISelAtO;
-}
-
bool AArch64PassConfig::addILPOpts() {
if (EnableCondOpt)
addPass(createAArch64ConditionOptimizerPass());
diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.h b/lib/Target/AArch64/AArch64TargetObjectFile.h
index f081d7caba67..9077eb7902fd 100644
--- a/lib/Target/AArch64/AArch64TargetObjectFile.h
+++ b/lib/Target/AArch64/AArch64TargetObjectFile.h
@@ -10,8 +10,8 @@
#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64TARGETOBJECTFILE_H
#define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETOBJECTFILE_H
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
namespace llvm {
class AArch64TargetMachine;
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 1820ad959fcb..d75fef7b0171 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -38,7 +38,7 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
return (CallerBits & CalleeBits) == CalleeBits;
}
-/// \brief Calculate the cost of materializing a 64-bit value. This helper
+/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
int AArch64TTIImpl::getIntImmCost(int64_t Val) {
@@ -54,7 +54,7 @@ int AArch64TTIImpl::getIntImmCost(int64_t Val) {
return (64 - LZ + 15) / 16;
}
-/// \brief Calculate the cost of materializing the given constant.
+/// Calculate the cost of materializing the given constant.
int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
assert(Ty->isIntegerTy());
@@ -277,7 +277,7 @@ int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
// same as the second operand. In this case, we will generate a "long"
// version of the widening instruction.
if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
- if (I->getOpcode() == Cast->getOpcode() &&
+ if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
return 0;
}
@@ -493,32 +493,70 @@ int AArch64TTIImpl::getArithmeticInstrCost(
int ISD = TLI->InstructionOpcodeToISD(Opcode);
- if (ISD == ISD::SDIV &&
- Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
- Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
- // On AArch64, scalar signed division by constants power-of-two are
- // normally expanded to the sequence ADD + CMP + SELECT + SRA.
- // The OperandValue properties many not be same as that of previous
- // operation; conservatively assume OP_None.
- Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
- Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
- Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
- Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
- return Cost;
- }
-
switch (ISD) {
default:
return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
Opd1PropInfo, Opd2PropInfo);
+ case ISD::SDIV:
+ if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
+ // On AArch64, scalar signed division by a power-of-two constant is
+ // normally expanded to the sequence ADD + CMP + SELECT + SRA.
+ // The OperandValue properties may not be the same as those of the previous
+ // operation; conservatively assume OP_None.
+ Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ return Cost;
+ }
+ LLVM_FALLTHROUGH;
+ case ISD::UDIV:
+ if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) {
+ auto VT = TLI->getValueType(DL, Ty);
+ if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
+ // Vector signed division by a constant is expanded to the
+ // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
+ // to MULHS + SUB + SRL + ADD + SRL.
+ int MulCost = getArithmeticInstrCost(Instruction::Mul, Ty, Opd1Info,
+ Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ int AddCost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info,
+ Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ int ShrCost = getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info,
+ Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
+ }
+ }
+
+ Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ Opd1PropInfo, Opd2PropInfo);
+ if (Ty->isVectorTy()) {
+ // On AArch64, vector divisions are not supported natively and are
+ // expanded into scalar divisions of each pair of elements.
+ Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, Opd1Info,
+ Opd2Info, Opd1PropInfo, Opd2PropInfo);
+ Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, Opd1Info,
+ Opd2Info, Opd1PropInfo, Opd2PropInfo);
+ // TODO: if one of the arguments is scalar, then it's not necessary to
+ // double the cost of handling the vector elements.
+ Cost += Cost;
+ }
+ return Cost;
+
case ISD::ADD:
case ISD::MUL:
case ISD::XOR:
@@ -596,14 +634,22 @@ int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
return LT.first * 2 * AmortizationCost;
}
- if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8) &&
- Ty->getVectorNumElements() < 8) {
- // We scalarize the loads/stores because there is not v.4b register and we
- // have to promote the elements to v.4h.
- unsigned NumVecElts = Ty->getVectorNumElements();
- unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
- // We generate 2 instructions per vector element.
- return NumVectorizableInstsToAmortize * NumVecElts * 2;
+ if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8)) {
+ unsigned ProfitableNumElements;
+ if (Opcode == Instruction::Store)
+ // We use a custom trunc store lowering so v.4b should be profitable.
+ ProfitableNumElements = 4;
+ else
+ // We scalarize the loads because there is no v.4b register and we
+ // have to promote the elements to v.2.
+ ProfitableNumElements = 8;
+
+ if (Ty->getVectorNumElements() < ProfitableNumElements) {
+ unsigned NumVecElts = Ty->getVectorNumElements();
+ unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
+ // We generate 2 instructions per vector element.
+ return NumVectorizableInstsToAmortize * NumVecElts * 2;
+ }
}
return LT.first;
@@ -690,14 +736,14 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
};
int StridedLoads = countStridedLoads(L, SE);
- DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
- << " strided loads\n");
+ LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
+ << " strided loads\n");
// Pick the largest power of 2 unroll count that won't result in too many
// strided loads.
if (StridedLoads) {
UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
- DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " << UP.MaxCount
- << '\n');
+ LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
+ << UP.MaxCount << '\n');
}
}
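
As a worked example of the MaxCount formula above (the strided-load budget used here is illustrative, not the option's actual default): with MaxStridedLoads = 8 and StridedLoads = 3, MaxCount = 1 << Log2_32(8 / 3) = 1 << 1 = 2, so the unroller duplicates the loop body at most twice before the prefetcher's strided-load budget would be exceeded.
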
@@ -868,3 +914,73 @@ bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
}
return false;
}
+
+int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
+ bool IsPairwiseForm) {
+
+ if (IsPairwiseForm)
+ return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm);
+
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+ MVT MTy = LT.second;
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ // Horizontal adds can use the 'addv' instruction. We model the cost of these
+ // instructions as normal vector adds. This is the only arithmetic vector
+ // reduction operation for which we have an instruction.
+ static const CostTblEntry CostTblNoPairwise[]{
+ {ISD::ADD, MVT::v8i8, 1},
+ {ISD::ADD, MVT::v16i8, 1},
+ {ISD::ADD, MVT::v4i16, 1},
+ {ISD::ADD, MVT::v8i16, 1},
+ {ISD::ADD, MVT::v4i32, 1},
+ };
+
+ if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm);
+}
+
+int AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+ Type *SubTp) {
+ if (Kind == TTI::SK_Transpose || Kind == TTI::SK_Select ||
+ Kind == TTI::SK_PermuteSingleSrc) {
+ static const CostTblEntry ShuffleTbl[] = {
+ // Transpose shuffle kinds can be performed with 'trn1/trn2' and
+ // 'zip1/zip2' instructions.
+ { TTI::SK_Transpose, MVT::v8i8, 1 },
+ { TTI::SK_Transpose, MVT::v16i8, 1 },
+ { TTI::SK_Transpose, MVT::v4i16, 1 },
+ { TTI::SK_Transpose, MVT::v8i16, 1 },
+ { TTI::SK_Transpose, MVT::v2i32, 1 },
+ { TTI::SK_Transpose, MVT::v4i32, 1 },
+ { TTI::SK_Transpose, MVT::v2i64, 1 },
+ { TTI::SK_Transpose, MVT::v2f32, 1 },
+ { TTI::SK_Transpose, MVT::v4f32, 1 },
+ { TTI::SK_Transpose, MVT::v2f64, 1 },
+ // Select shuffle kinds.
+ // TODO: handle vXi8/vXi16.
+ { TTI::SK_Select, MVT::v2i32, 1 }, // mov.
+ { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar).
+ { TTI::SK_Select, MVT::v2i64, 1 }, // mov.
+ { TTI::SK_Select, MVT::v2f32, 1 }, // mov.
+ { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar).
+ { TTI::SK_Select, MVT::v2f64, 1 }, // mov.
+ // PermuteSingleSrc shuffle kinds.
+ // TODO: handle vXi8/vXi16.
+ { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov.
+ { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case.
+ { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov.
+ { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov.
+ { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case.
+ { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov.
+ };
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+ if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+}
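
A rough worked example of how these table entries are scaled, assuming the usual type legalization on AArch64: a v8i32 add reduction legalizes by splitting into two v4i32 halves, so LT.first is 2, the v4i32 table entry costs 1, and the returned cost is 2 * 1 = 2. A v4i32 transpose shuffle legalizes in a single piece, so its cost is just the table entry, 1.
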
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 08c693ff38a8..c056a7d2428b 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -166,6 +166,11 @@ public:
bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const;
+
+ int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
+ bool IsPairwiseForm);
+
+ int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
/// @}
};
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 6e63783e5646..a51c41d70915 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -66,6 +66,12 @@ enum class RegKind {
SVEPredicateVector
};
+enum RegConstraintEqualityTy {
+ EqualsReg,
+ EqualsSuperReg,
+ EqualsSubReg
+};
+
class AArch64AsmParser : public MCTargetAsmParser {
private:
StringRef Mnemonic; ///< Instruction mnemonic.
@@ -85,19 +91,18 @@ private:
AArch64CC::CondCode parseCondCodeString(StringRef Cond);
bool parseCondCode(OperandVector &Operands, bool invertCondCode);
unsigned matchRegisterNameAlias(StringRef Name, RegKind Kind);
- int tryParseRegister();
- int tryMatchVectorRegister(StringRef &Kind, bool expected);
bool parseRegister(OperandVector &Operands);
bool parseSymbolicImmVal(const MCExpr *&ImmVal);
- bool parseVectorList(OperandVector &Operands);
+ bool parseNeonVectorList(OperandVector &Operands);
+ bool parseOptionalMulOperand(OperandVector &Operands);
bool parseOperand(OperandVector &Operands, bool isCondCode,
bool invertCondCode);
- bool showMatchError(SMLoc Loc, unsigned ErrCode, OperandVector &Operands);
+ bool showMatchError(SMLoc Loc, unsigned ErrCode, uint64_t ErrorInfo,
+ OperandVector &Operands);
bool parseDirectiveArch(SMLoc L);
bool parseDirectiveCPU(SMLoc L);
- bool parseDirectiveWord(unsigned Size, SMLoc L);
bool parseDirectiveInst(SMLoc L);
bool parseDirectiveTLSDescCall(SMLoc L);
@@ -121,25 +126,36 @@ private:
/// }
- OperandMatchResultTy tryParseSVERegister(int &Reg, StringRef &Kind,
- RegKind MatchKind);
+ OperandMatchResultTy tryParseScalarRegister(unsigned &Reg);
+ OperandMatchResultTy tryParseVectorRegister(unsigned &Reg, StringRef &Kind,
+ RegKind MatchKind);
OperandMatchResultTy tryParseOptionalShiftExtend(OperandVector &Operands);
OperandMatchResultTy tryParseBarrierOperand(OperandVector &Operands);
OperandMatchResultTy tryParseMRSSystemRegister(OperandVector &Operands);
OperandMatchResultTy tryParseSysReg(OperandVector &Operands);
OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands);
+ template <bool IsSVEPrefetch = false>
OperandMatchResultTy tryParsePrefetch(OperandVector &Operands);
OperandMatchResultTy tryParsePSBHint(OperandVector &Operands);
OperandMatchResultTy tryParseAdrpLabel(OperandVector &Operands);
OperandMatchResultTy tryParseAdrLabel(OperandVector &Operands);
+ template<bool AddFPZeroAsLiteral>
OperandMatchResultTy tryParseFPImm(OperandVector &Operands);
- OperandMatchResultTy tryParseAddSubImm(OperandVector &Operands);
+ OperandMatchResultTy tryParseImmWithOptionalShift(OperandVector &Operands);
OperandMatchResultTy tryParseGPR64sp0Operand(OperandVector &Operands);
bool tryParseNeonVectorRegister(OperandVector &Operands);
+ OperandMatchResultTy tryParseVectorIndex(OperandVector &Operands);
OperandMatchResultTy tryParseGPRSeqPair(OperandVector &Operands);
- template <bool ParseSuffix>
+ template <bool ParseShiftExtend,
+ RegConstraintEqualityTy EqTy = RegConstraintEqualityTy::EqualsReg>
+ OperandMatchResultTy tryParseGPROperand(OperandVector &Operands);
+ template <bool ParseShiftExtend, bool ParseSuffix>
OperandMatchResultTy tryParseSVEDataVector(OperandVector &Operands);
OperandMatchResultTy tryParseSVEPredicateVector(OperandVector &Operands);
+ template <RegKind VectorKind>
+ OperandMatchResultTy tryParseVectorList(OperandVector &Operands,
+ bool ExpectMatch = false);
+ OperandMatchResultTy tryParseSVEPattern(OperandVector &Operands);
public:
enum AArch64MatchResultTy {
@@ -158,10 +174,19 @@ public:
if (S.getTargetStreamer() == nullptr)
new AArch64TargetStreamer(S);
+ // Alias .hword/.word/.xword to the target-independent .2byte/.4byte/.8byte
+ // directives as they have the same form and semantics:
+ /// ::= (.hword | .word | .xword ) [ expression (, expression)* ]
+ Parser.addAliasForDirective(".hword", ".2byte");
+ Parser.addAliasForDirective(".word", ".4byte");
+ Parser.addAliasForDirective(".xword", ".8byte");
+
// Initialize the set of available features.
setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
}
+ bool regsEqual(const MCParsedAsmOperand &Op1,
+ const MCParsedAsmOperand &Op2) const override;
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) override;
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
@@ -204,18 +229,45 @@ private:
bool IsSuffix; // Is the operand actually a suffix on the mnemonic.
};
+ // Separate shift/extend operand.
+ struct ShiftExtendOp {
+ AArch64_AM::ShiftExtendType Type;
+ unsigned Amount;
+ bool HasExplicitAmount;
+ };
+
struct RegOp {
unsigned RegNum;
RegKind Kind;
-
int ElementWidth;
+
+ // The register may be allowed as a different register class,
+ // e.g. for GPR64as32 or GPR32as64.
+ RegConstraintEqualityTy EqualityTy;
+
+ // In some cases the shift/extend needs to be explicitly parsed together
+ // with the register, rather than as a separate operand. This is needed
+ // for addressing modes where the instruction as a whole dictates the
+ // scaling/extend, rather than specific bits in the instruction.
+ // By parsing them as a single operand, we avoid the need to pass an
+ // extra operand in all CodeGen patterns (because all operands need to
+ // have an associated value), and we avoid the need to update TableGen to
+ // accept operands that have no associated bits in the instruction.
+ //
+ // An added benefit of parsing them together is that the assembler
+ // can give a sensible diagnostic if the scaling is not correct.
+ //
+ // The default is 'lsl #0' (HasExplicitAmount = false) if no
+ // ShiftExtend is specified.
+ ShiftExtendOp ShiftExtend;
};
struct VectorListOp {
unsigned RegNum;
unsigned Count;
unsigned NumElements;
- unsigned ElementKind;
+ unsigned ElementWidth;
+ RegKind RegisterKind;
};
struct VectorIndexOp {
@@ -236,7 +288,8 @@ private:
};
struct FPImmOp {
- unsigned Val; // Encoded 8-bit representation.
+ uint64_t Val; // APFloat value bitcasted to uint64_t.
+ bool IsExact; // Describes whether the parsed value was exact.
};
struct BarrierOp {
@@ -269,12 +322,6 @@ private:
unsigned Val;
};
- struct ShiftExtendOp {
- AArch64_AM::ShiftExtendType Type;
- unsigned Amount;
- bool HasExplicitAmount;
- };
-
struct ExtendOp {
unsigned Val;
};
@@ -388,9 +435,14 @@ public:
return CondCode.Code;
}
- unsigned getFPImm() const {
- assert(Kind == k_FPImm && "Invalid access!");
- return FPImm.Val;
+ APFloat getFPImm() const {
+ assert (Kind == k_FPImm && "Invalid access!");
+ return APFloat(APFloat::IEEEdouble(), APInt(64, FPImm.Val, true));
+ }
+
+ bool getFPImmIsExact() const {
+ assert (Kind == k_FPImm && "Invalid access!");
+ return FPImm.IsExact;
}
unsigned getBarrier() const {
@@ -408,6 +460,11 @@ public:
return Reg.RegNum;
}
+ RegConstraintEqualityTy getRegEqualityTy() const {
+ assert(Kind == k_Register && "Invalid access!");
+ return Reg.EqualityTy;
+ }
+
unsigned getVectorListStart() const {
assert(Kind == k_VectorList && "Invalid access!");
return VectorList.RegNum;
@@ -454,66 +511,88 @@ public:
}
AArch64_AM::ShiftExtendType getShiftExtendType() const {
- assert(Kind == k_ShiftExtend && "Invalid access!");
- return ShiftExtend.Type;
+ if (Kind == k_ShiftExtend)
+ return ShiftExtend.Type;
+ if (Kind == k_Register)
+ return Reg.ShiftExtend.Type;
+ llvm_unreachable("Invalid access!");
}
unsigned getShiftExtendAmount() const {
- assert(Kind == k_ShiftExtend && "Invalid access!");
- return ShiftExtend.Amount;
+ if (Kind == k_ShiftExtend)
+ return ShiftExtend.Amount;
+ if (Kind == k_Register)
+ return Reg.ShiftExtend.Amount;
+ llvm_unreachable("Invalid access!");
}
bool hasShiftExtendAmount() const {
- assert(Kind == k_ShiftExtend && "Invalid access!");
- return ShiftExtend.HasExplicitAmount;
+ if (Kind == k_ShiftExtend)
+ return ShiftExtend.HasExplicitAmount;
+ if (Kind == k_Register)
+ return Reg.ShiftExtend.HasExplicitAmount;
+ llvm_unreachable("Invalid access!");
}
bool isImm() const override { return Kind == k_Immediate; }
bool isMem() const override { return false; }
- bool isSImm9() const {
+
+ bool isUImm6() const {
if (!isImm())
return false;
const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
if (!MCE)
return false;
int64_t Val = MCE->getValue();
- return (Val >= -256 && Val < 256);
+ return (Val >= 0 && Val < 64);
}
- bool isSImm10s8() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= -4096 && Val < 4089 && (Val & 7) == 0);
+
+ template <int Width> bool isSImm() const { return isSImmScaled<Width, 1>(); }
+
+ template <int Bits, int Scale> DiagnosticPredicate isSImmScaled() const {
+ return isImmScaled<Bits, Scale>(true);
}
- bool isSImm7s4() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= -256 && Val <= 252 && (Val & 3) == 0);
+
+ template <int Bits, int Scale> DiagnosticPredicate isUImmScaled() const {
+ return isImmScaled<Bits, Scale>(false);
}
- bool isSImm7s8() const {
+
+ template <int Bits, int Scale>
+ DiagnosticPredicate isImmScaled(bool Signed) const {
if (!isImm())
- return false;
+ return DiagnosticPredicateTy::NoMatch;
+
const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
if (!MCE)
- return false;
+ return DiagnosticPredicateTy::NoMatch;
+
+ int64_t MinVal, MaxVal;
+ if (Signed) {
+ int64_t Shift = Bits - 1;
+ MinVal = (int64_t(1) << Shift) * -Scale;
+ MaxVal = ((int64_t(1) << Shift) - 1) * Scale;
+ } else {
+ MinVal = 0;
+ MaxVal = ((int64_t(1) << Bits) - 1) * Scale;
+ }
+
int64_t Val = MCE->getValue();
- return (Val >= -512 && Val <= 504 && (Val & 7) == 0);
+ if (Val >= MinVal && Val <= MaxVal && (Val % Scale) == 0)
+ return DiagnosticPredicateTy::Match;
+
+ return DiagnosticPredicateTy::NearMatch;
}
- bool isSImm7s16() const {
+
+ DiagnosticPredicate isSVEPattern() const {
if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ return DiagnosticPredicateTy::NoMatch;
+ auto *MCE = dyn_cast<MCConstantExpr>(getImm());
if (!MCE)
- return false;
+ return DiagnosticPredicateTy::NoMatch;
int64_t Val = MCE->getValue();
- return (Val >= -1024 && Val <= 1008 && (Val & 15) == 0);
+ if (Val >= 0 && Val < 32)
+ return DiagnosticPredicateTy::Match;
+ return DiagnosticPredicateTy::NearMatch;
}
bool isSymbolicUImm12Offset(const MCExpr *Expr, unsigned Scale) const {
@@ -535,7 +614,9 @@ public:
ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 ||
ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC ||
ELFRefKind == AArch64MCExpr::VK_GOTTPREL_LO12_NC ||
- ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12) {
+ ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12 ||
+ ELFRefKind == AArch64MCExpr::VK_SECREL_LO12 ||
+ ELFRefKind == AArch64MCExpr::VK_SECREL_HI12) {
// Note that we don't range-check the addend. It's adjusted modulo page
// size when converted, so there is no "out of range" condition when using
// @pageoff.
@@ -572,48 +653,47 @@ public:
return (Val >= N && Val <= M);
}
- bool isLogicalImm32() const {
+ // NOTE: Also used for isLogicalImmNot as anything that can be represented as
+ // a logical immediate can always be represented when inverted.
+ template <typename T>
+ bool isLogicalImm() const {
if (!isImm())
return false;
const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
if (!MCE)
return false;
+
int64_t Val = MCE->getValue();
- if (Val >> 32 != 0 && Val >> 32 != ~0LL)
+ int64_t SVal = typename std::make_signed<T>::type(Val);
+ int64_t UVal = typename std::make_unsigned<T>::type(Val);
+ if (Val != SVal && Val != UVal)
return false;
- Val &= 0xFFFFFFFF;
- return AArch64_AM::isLogicalImmediate(Val, 32);
- }
- bool isLogicalImm64() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- return AArch64_AM::isLogicalImmediate(MCE->getValue(), 64);
+ return AArch64_AM::isLogicalImmediate(UVal, sizeof(T) * 8);
}
- bool isLogicalImm32Not() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = ~MCE->getValue() & 0xFFFFFFFF;
- return AArch64_AM::isLogicalImmediate(Val, 32);
- }
+ bool isShiftedImm() const { return Kind == k_ShiftedImm; }
- bool isLogicalImm64Not() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- return AArch64_AM::isLogicalImmediate(~MCE->getValue(), 64);
- }
+ /// Returns the immediate value as a pair of (imm, shift) if the immediate is
+ /// a shifted immediate by value 'Shift' or '0', or if it is an unshifted
+ /// immediate that can be shifted by 'Shift'.
+ template <unsigned Width>
+ Optional<std::pair<int64_t, unsigned> > getShiftedVal() const {
+ if (isShiftedImm() && Width == getShiftedImmShift())
+ if (auto *CE = dyn_cast<MCConstantExpr>(getShiftedImmVal()))
+ return std::make_pair(CE->getValue(), Width);
+
+ if (isImm())
+ if (auto *CE = dyn_cast<MCConstantExpr>(getImm())) {
+ int64_t Val = CE->getValue();
+ if ((Val != 0) && (uint64_t(Val >> Width) << Width) == uint64_t(Val))
+ return std::make_pair(Val >> Width, Width);
+ else
+ return std::make_pair(Val, 0u);
+ }
- bool isShiftedImm() const { return Kind == k_ShiftedImm; }
+ return {};
+ }
bool isAddSubImm() const {
if (!isShiftedImm() && !isImm())
@@ -646,12 +726,14 @@ public:
|| ELFRefKind == AArch64MCExpr::VK_TPREL_HI12
|| ELFRefKind == AArch64MCExpr::VK_TPREL_LO12
|| ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC
- || ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12;
+ || ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12
+ || ELFRefKind == AArch64MCExpr::VK_SECREL_HI12
+ || ELFRefKind == AArch64MCExpr::VK_SECREL_LO12;
}
- // If it's a constant, it should be a real immediate in range:
- if (auto *CE = dyn_cast<MCConstantExpr>(Expr))
- return CE->getValue() >= 0 && CE->getValue() <= 0xfff;
+ // If it's a constant, it should be a real immediate in range.
+ if (auto ShiftedVal = getShiftedVal<12>())
+ return ShiftedVal->first >= 0 && ShiftedVal->first <= 0xfff;
// If it's an expression, we hope for the best and let the fixup/relocation
// code deal with it.
@@ -662,20 +744,56 @@ public:
if (!isShiftedImm() && !isImm())
return false;
- const MCExpr *Expr;
+ // Otherwise it should be a real negative immediate in range.
+ if (auto ShiftedVal = getShiftedVal<12>())
+ return ShiftedVal->first < 0 && -ShiftedVal->first <= 0xfff;
- // An ADD/SUB shifter is either 'lsl #0' or 'lsl #12'.
- if (isShiftedImm()) {
- unsigned Shift = ShiftedImm.ShiftAmount;
- Expr = ShiftedImm.Val;
- if (Shift != 0 && Shift != 12)
- return false;
- } else
- Expr = getImm();
+ return false;
+ }
+
+ // Signed value in the range -128 to +127. For element widths of
+ // 16 bits or higher it may also be a signed multiple of 256 in the
+ // range -32768 to +32512.
+ // For an element width of 8 bits a range of -128 to 255 is accepted,
+ // since a copy of a byte can be either signed or unsigned.
+ template <typename T>
+ DiagnosticPredicate isSVECpyImm() const {
+ if (!isShiftedImm() && (!isImm() || !isa<MCConstantExpr>(getImm())))
+ return DiagnosticPredicateTy::NoMatch;
+
+ bool IsByte =
+ std::is_same<int8_t, typename std::make_signed<T>::type>::value;
+ if (auto ShiftedImm = getShiftedVal<8>())
+ if (!(IsByte && ShiftedImm->second) &&
+ AArch64_AM::isSVECpyImm<T>(uint64_t(ShiftedImm->first)
+ << ShiftedImm->second))
+ return DiagnosticPredicateTy::Match;
- // Otherwise it should be a real negative immediate in range:
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr);
- return CE != nullptr && CE->getValue() < 0 && -CE->getValue() <= 0xfff;
+ return DiagnosticPredicateTy::NearMatch;
+ }
+
+ // Unsigned value in the range 0 to 255. For element widths of
+ // 16 bits or higher it may also be a signed multiple of 256 in the
+ // range 0 to 65280.
+ template <typename T> DiagnosticPredicate isSVEAddSubImm() const {
+ if (!isShiftedImm() && (!isImm() || !isa<MCConstantExpr>(getImm())))
+ return DiagnosticPredicateTy::NoMatch;
+
+ bool IsByte =
+ std::is_same<int8_t, typename std::make_signed<T>::type>::value;
+ if (auto ShiftedImm = getShiftedVal<8>())
+ if (!(IsByte && ShiftedImm->second) &&
+ AArch64_AM::isSVEAddSubImm<T>(ShiftedImm->first
+ << ShiftedImm->second))
+ return DiagnosticPredicateTy::Match;
+
+ return DiagnosticPredicateTy::NearMatch;
+ }
+
+ template <typename T> DiagnosticPredicate isSVEPreferredLogicalImm() const {
+ if (isLogicalImm<T>() && !isSVECpyImm<T>())
+ return DiagnosticPredicateTy::Match;
+ return DiagnosticPredicateTy::NoMatch;
}
bool isCondCode() const { return Kind == k_CondCode; }
@@ -792,7 +910,11 @@ public:
return AArch64_AM::isMOVNMovAlias(Value, Shift, RegWidth);
}
- bool isFPImm() const { return Kind == k_FPImm; }
+ bool isFPImm() const {
+ return Kind == k_FPImm &&
+ AArch64_AM::getFP64Imm(getFPImm().bitcastToAPInt()) != -1;
+ }
+
bool isBarrier() const { return Kind == k_Barrier; }
bool isSysReg() const { return Kind == k_SysReg; }
@@ -810,6 +932,7 @@ public:
bool isSystemPStateFieldWithImm0_1() const {
if (!isSysReg()) return false;
return (SysReg.PStateField == AArch64PState::PAN ||
+ SysReg.PStateField == AArch64PState::DIT ||
SysReg.PStateField == AArch64PState::UAO);
}
@@ -819,6 +942,10 @@ public:
}
bool isReg() const override {
+ return Kind == k_Register;
+ }
+
+ bool isScalarReg() const {
return Kind == k_Register && Reg.Kind == RegKind::Scalar;
}
@@ -836,9 +963,12 @@ public:
RegKind RK;
switch (Class) {
case AArch64::ZPRRegClassID:
+ case AArch64::ZPR_3bRegClassID:
+ case AArch64::ZPR_4bRegClassID:
RK = RegKind::SVEDataVector;
break;
case AArch64::PPRRegClassID:
+ case AArch64::PPR_3bRegClassID:
RK = RegKind::SVEPredicateVector;
break;
default:
@@ -849,10 +979,56 @@ public:
AArch64MCRegisterClasses[Class].contains(getReg());
}
+ template <unsigned Class> bool isFPRasZPR() const {
+ return Kind == k_Register && Reg.Kind == RegKind::Scalar &&
+ AArch64MCRegisterClasses[Class].contains(getReg());
+ }
+
template <int ElementWidth, unsigned Class>
- bool isSVEVectorRegOfWidth() const {
- return isSVEVectorReg<Class>() &&
- (ElementWidth == -1 || Reg.ElementWidth == ElementWidth);
+ DiagnosticPredicate isSVEPredicateVectorRegOfWidth() const {
+ if (Kind != k_Register || Reg.Kind != RegKind::SVEPredicateVector)
+ return DiagnosticPredicateTy::NoMatch;
+
+ if (isSVEVectorReg<Class>() &&
+ (ElementWidth == 0 || Reg.ElementWidth == ElementWidth))
+ return DiagnosticPredicateTy::Match;
+
+ return DiagnosticPredicateTy::NearMatch;
+ }
+
+ template <int ElementWidth, unsigned Class>
+ DiagnosticPredicate isSVEDataVectorRegOfWidth() const {
+ if (Kind != k_Register || Reg.Kind != RegKind::SVEDataVector)
+ return DiagnosticPredicateTy::NoMatch;
+
+ if (isSVEVectorReg<Class>() &&
+ (ElementWidth == 0 || Reg.ElementWidth == ElementWidth))
+ return DiagnosticPredicateTy::Match;
+
+ return DiagnosticPredicateTy::NearMatch;
+ }
+
+ template <int ElementWidth, unsigned Class,
+ AArch64_AM::ShiftExtendType ShiftExtendTy, int ShiftWidth,
+ bool ShiftWidthAlwaysSame>
+ DiagnosticPredicate isSVEDataVectorRegWithShiftExtend() const {
+ auto VectorMatch = isSVEDataVectorRegOfWidth<ElementWidth, Class>();
+ if (!VectorMatch.isMatch())
+ return DiagnosticPredicateTy::NoMatch;
+
+ // Give a more specific diagnostic when the user has explicitly typed in
+ // a shift-amount that does not match what is expected, but for which
+ // there is also an unscaled addressing mode (e.g. sxtw/uxtw).
+ bool MatchShift = getShiftExtendAmount() == Log2_32(ShiftWidth / 8);
+ if (!MatchShift && (ShiftExtendTy == AArch64_AM::UXTW ||
+ ShiftExtendTy == AArch64_AM::SXTW) &&
+ !ShiftWidthAlwaysSame && hasShiftExtendAmount() && ShiftWidth == 8)
+ return DiagnosticPredicateTy::NoMatch;
+
+ if (MatchShift && ShiftExtendTy == getShiftExtendType())
+ return DiagnosticPredicateTy::Match;
+
+ return DiagnosticPredicateTy::NearMatch;
}
bool isGPR32as64() const {
@@ -860,6 +1036,11 @@ public:
AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(Reg.RegNum);
}
+ bool isGPR64as32() const {
+ return Kind == k_Register && Reg.Kind == RegKind::Scalar &&
+ AArch64MCRegisterClasses[AArch64::GPR32RegClassID].contains(Reg.RegNum);
+ }
+
bool isWSeqPair() const {
return Kind == k_Register && Reg.Kind == RegKind::Scalar &&
AArch64MCRegisterClasses[AArch64::WSeqPairsClassRegClassID].contains(
@@ -872,58 +1053,65 @@ public:
Reg.RegNum);
}
- bool isGPR64sp0() const {
- return Kind == k_Register && Reg.Kind == RegKind::Scalar &&
- AArch64MCRegisterClasses[AArch64::GPR64spRegClassID].contains(Reg.RegNum);
- }
-
template<int64_t Angle, int64_t Remainder>
- bool isComplexRotation() const {
- if (!isImm()) return false;
+ DiagnosticPredicate isComplexRotation() const {
+ if (!isImm()) return DiagnosticPredicateTy::NoMatch;
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
+ if (!CE) return DiagnosticPredicateTy::NoMatch;
uint64_t Value = CE->getValue();
- return (Value % Angle == Remainder && Value <= 270);
+ if (Value % Angle == Remainder && Value <= 270)
+ return DiagnosticPredicateTy::Match;
+ return DiagnosticPredicateTy::NearMatch;
+ }
+
+ template <unsigned RegClassID> bool isGPR64() const {
+ return Kind == k_Register && Reg.Kind == RegKind::Scalar &&
+ AArch64MCRegisterClasses[RegClassID].contains(getReg());
+ }
+
+ template <unsigned RegClassID, int ExtWidth>
+ DiagnosticPredicate isGPR64WithShiftExtend() const {
+ if (Kind != k_Register || Reg.Kind != RegKind::Scalar)
+ return DiagnosticPredicateTy::NoMatch;
+
+ if (isGPR64<RegClassID>() && getShiftExtendType() == AArch64_AM::LSL &&
+ getShiftExtendAmount() == Log2_32(ExtWidth / 8))
+ return DiagnosticPredicateTy::Match;
+ return DiagnosticPredicateTy::NearMatch;
}
/// Is this a vector list with the type implicit (presumably attached to the
/// instruction itself)?
- template <unsigned NumRegs> bool isImplicitlyTypedVectorList() const {
+ template <RegKind VectorKind, unsigned NumRegs>
+ bool isImplicitlyTypedVectorList() const {
return Kind == k_VectorList && VectorList.Count == NumRegs &&
- !VectorList.ElementKind;
+ VectorList.NumElements == 0 &&
+ VectorList.RegisterKind == VectorKind;
}
- template <unsigned NumRegs, unsigned NumElements, char ElementKind>
+ template <RegKind VectorKind, unsigned NumRegs, unsigned NumElements,
+ unsigned ElementWidth>
bool isTypedVectorList() const {
if (Kind != k_VectorList)
return false;
if (VectorList.Count != NumRegs)
return false;
- if (VectorList.ElementKind != ElementKind)
+ if (VectorList.RegisterKind != VectorKind)
+ return false;
+ if (VectorList.ElementWidth != ElementWidth)
return false;
return VectorList.NumElements == NumElements;
}
- bool isVectorIndex1() const {
- return Kind == k_VectorIndex && VectorIndex.Val == 1;
- }
-
- bool isVectorIndexB() const {
- return Kind == k_VectorIndex && VectorIndex.Val < 16;
- }
-
- bool isVectorIndexH() const {
- return Kind == k_VectorIndex && VectorIndex.Val < 8;
- }
-
- bool isVectorIndexS() const {
- return Kind == k_VectorIndex && VectorIndex.Val < 4;
- }
-
- bool isVectorIndexD() const {
- return Kind == k_VectorIndex && VectorIndex.Val < 2;
+ template <int Min, int Max>
+ DiagnosticPredicate isVectorIndex() const {
+ if (Kind != k_VectorIndex)
+ return DiagnosticPredicateTy::NoMatch;
+ if (VectorIndex.Val >= Min && VectorIndex.Val <= Max)
+ return DiagnosticPredicateTy::Match;
+ return DiagnosticPredicateTy::NearMatch;
}
bool isToken() const override { return Kind == k_Token; }
@@ -944,6 +1132,39 @@ public:
ST == AArch64_AM::ASR || ST == AArch64_AM::ROR ||
ST == AArch64_AM::MSL);
}
+
+ template <unsigned ImmEnum> DiagnosticPredicate isExactFPImm() const {
+ if (Kind != k_FPImm)
+ return DiagnosticPredicateTy::NoMatch;
+
+ if (getFPImmIsExact()) {
+ // Lookup the immediate from table of supported immediates.
+ auto *Desc = AArch64ExactFPImm::lookupExactFPImmByEnum(ImmEnum);
+ assert(Desc && "Unknown enum value");
+
+ // Calculate its FP value.
+ APFloat RealVal(APFloat::IEEEdouble());
+ if (RealVal.convertFromString(Desc->Repr, APFloat::rmTowardZero) !=
+ APFloat::opOK)
+ llvm_unreachable("FP immediate is not exact");
+
+ if (getFPImm().bitwiseIsEqual(RealVal))
+ return DiagnosticPredicateTy::Match;
+ }
+
+ return DiagnosticPredicateTy::NearMatch;
+ }
+
+ template <unsigned ImmA, unsigned ImmB>
+ DiagnosticPredicate isExactFPImm() const {
+ DiagnosticPredicate Res = DiagnosticPredicateTy::NoMatch;
+ if ((Res = isExactFPImm<ImmA>()))
+ return DiagnosticPredicateTy::Match;
+ if ((Res = isExactFPImm<ImmB>()))
+ return DiagnosticPredicateTy::Match;
+ return Res;
+ }
+
bool isExtend() const {
if (!isShiftExtend())
return false;
@@ -1076,7 +1297,7 @@ public:
// ambiguity in the matcher.
template<int Width>
bool isSImm9OffsetFB() const {
- return isSImm9() && !isUImm12Offset<Width / 8>();
+ return isSImm<9>() && !isUImm12Offset<Width / 8>();
}
bool isAdrpLabel() const {
@@ -1138,6 +1359,33 @@ public:
Inst.addOperand(MCOperand::createReg(Reg));
}
+ void addGPR64as32Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ assert(
+ AArch64MCRegisterClasses[AArch64::GPR32RegClassID].contains(getReg()));
+
+ const MCRegisterInfo *RI = Ctx.getRegisterInfo();
+ uint32_t Reg = RI->getRegClass(AArch64::GPR64RegClassID).getRegister(
+ RI->getEncodingValue(getReg()));
+
+ Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
+ template <int Width>
+ void addFPRasZPRRegOperands(MCInst &Inst, unsigned N) const {
+ unsigned Base;
+ switch (Width) {
+ case 8: Base = AArch64::B0; break;
+ case 16: Base = AArch64::H0; break;
+ case 32: Base = AArch64::S0; break;
+ case 64: Base = AArch64::D0; break;
+ case 128: Base = AArch64::Q0; break;
+ default:
+ llvm_unreachable("Unsupported width");
+ }
+ Inst.addOperand(MCOperand::createReg(AArch64::Z0 + getReg() - Base));
+ }
+
void addVectorReg64Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
assert(
@@ -1157,55 +1405,45 @@ public:
Inst.addOperand(MCOperand::createReg(getReg()));
}
- template <unsigned NumRegs>
- void addVectorList64Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- static const unsigned FirstRegs[] = { AArch64::D0,
- AArch64::D0_D1,
- AArch64::D0_D1_D2,
- AArch64::D0_D1_D2_D3 };
- unsigned FirstReg = FirstRegs[NumRegs - 1];
-
- Inst.addOperand(
- MCOperand::createReg(FirstReg + getVectorListStart() - AArch64::Q0));
- }
-
- template <unsigned NumRegs>
- void addVectorList128Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- static const unsigned FirstRegs[] = { AArch64::Q0,
- AArch64::Q0_Q1,
- AArch64::Q0_Q1_Q2,
- AArch64::Q0_Q1_Q2_Q3 };
- unsigned FirstReg = FirstRegs[NumRegs - 1];
-
- Inst.addOperand(
- MCOperand::createReg(FirstReg + getVectorListStart() - AArch64::Q0));
- }
-
- void addVectorIndex1Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- Inst.addOperand(MCOperand::createImm(getVectorIndex()));
- }
-
- void addVectorIndexBOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- Inst.addOperand(MCOperand::createImm(getVectorIndex()));
- }
+ enum VecListIndexType {
+ VecListIdx_DReg = 0,
+ VecListIdx_QReg = 1,
+ VecListIdx_ZReg = 2,
+ };
- void addVectorIndexHOperands(MCInst &Inst, unsigned N) const {
+ template <VecListIndexType RegTy, unsigned NumRegs>
+ void addVectorListOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- Inst.addOperand(MCOperand::createImm(getVectorIndex()));
- }
-
- void addVectorIndexSOperands(MCInst &Inst, unsigned N) const {
+ static const unsigned FirstRegs[][5] = {
+ /* DReg */ { AArch64::Q0,
+ AArch64::D0, AArch64::D0_D1,
+ AArch64::D0_D1_D2, AArch64::D0_D1_D2_D3 },
+ /* QReg */ { AArch64::Q0,
+ AArch64::Q0, AArch64::Q0_Q1,
+ AArch64::Q0_Q1_Q2, AArch64::Q0_Q1_Q2_Q3 },
+ /* ZReg */ { AArch64::Z0,
+ AArch64::Z0, AArch64::Z0_Z1,
+ AArch64::Z0_Z1_Z2, AArch64::Z0_Z1_Z2_Z3 }
+ };
+
+ assert((RegTy != VecListIdx_ZReg || NumRegs <= 4) &&
+ " NumRegs must be <= 4 for ZRegs");
+
+ unsigned FirstReg = FirstRegs[(unsigned)RegTy][NumRegs];
+ Inst.addOperand(MCOperand::createReg(FirstReg + getVectorListStart() -
+ FirstRegs[(unsigned)RegTy][0]));
+ }
+
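A minimal standalone sketch of how the FirstRegs table above is indexed: entry [0] of each row is the base register that the parsed list start is measured against, and entry [NumRegs] is the first tuple register for the requested list length. The register numbers below are toy values chosen only to make the offset arithmetic visible; they are not the generated AArch64:: enum constants.

#include <cassert>

int main() {
  // Toy stand-ins for AArch64::Q0, AArch64::Q3 and the D0_D1/D3_D4 tuples.
  const unsigned Q0 = 100, Q3 = 103;
  const unsigned D0_D1 = 200, D3_D4 = 203;

  // Row for VecListIdx_DReg: [0] is the base, [2] the 2-register tuple start.
  const unsigned FirstRegs_DReg[5] = {Q0, 0, D0_D1, 0, 0};

  unsigned ListStart = Q3; // what getVectorListStart() would return for {v3, v4}
  unsigned Reg = FirstRegs_DReg[2] + ListStart - FirstRegs_DReg[0];
  assert(Reg == D3_D4);    // the list is encoded as the D3_D4 tuple register
  return 0;
}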
+ void addVectorIndexOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createImm(getVectorIndex()));
}
- void addVectorIndexDOperands(MCInst &Inst, unsigned N) const {
+ template <unsigned ImmIs0, unsigned ImmIs1>
+ void addExactFPImmOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- Inst.addOperand(MCOperand::createImm(getVectorIndex()));
+ assert(bool(isExactFPImm<ImmIs0, ImmIs1>()) && "Invalid operand");
+ Inst.addOperand(MCOperand::createImm(bool(isExactFPImm<ImmIs1>())));
}
void addImmOperands(MCInst &Inst, unsigned N) const {
@@ -1216,9 +1454,13 @@ public:
addExpr(Inst, getImm());
}
- void addAddSubImmOperands(MCInst &Inst, unsigned N) const {
+ template <int Shift>
+ void addImmWithOptionalShiftOperands(MCInst &Inst, unsigned N) const {
assert(N == 2 && "Invalid number of operands!");
- if (isShiftedImm()) {
+ if (auto ShiftedVal = getShiftedVal<Shift>()) {
+ Inst.addOperand(MCOperand::createImm(ShiftedVal->first));
+ Inst.addOperand(MCOperand::createImm(ShiftedVal->second));
+ } else if (isShiftedImm()) {
addExpr(Inst, getShiftedImmVal());
Inst.addOperand(MCOperand::createImm(getShiftedImmShift()));
} else {
@@ -1227,16 +1469,14 @@ public:
}
}
- void addAddSubImmNegOperands(MCInst &Inst, unsigned N) const {
+ template <int Shift>
+ void addImmNegWithOptionalShiftOperands(MCInst &Inst, unsigned N) const {
assert(N == 2 && "Invalid number of operands!");
-
- const MCExpr *MCE = isShiftedImm() ? getShiftedImmVal() : getImm();
- const MCConstantExpr *CE = cast<MCConstantExpr>(MCE);
- int64_t Val = -CE->getValue();
- unsigned ShiftAmt = isShiftedImm() ? ShiftedImm.ShiftAmount : 0;
-
- Inst.addOperand(MCOperand::createImm(Val));
- Inst.addOperand(MCOperand::createImm(ShiftAmt));
+ if (auto ShiftedVal = getShiftedVal<Shift>()) {
+ Inst.addOperand(MCOperand::createImm(-ShiftedVal->first));
+ Inst.addOperand(MCOperand::createImm(ShiftedVal->second));
+ } else
+ llvm_unreachable("Not a shifted negative immediate");
}
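The two helpers above leave the actual value/shift split to getShiftedVal<Shift>() and only emit the resulting (value, shift) pair as operands. A rough standalone approximation of that split for the '#imm, lsl #12' ADD/SUB form follows; it is illustrative only, not the LLVM implementation, which also has to cope with operands that were already parsed as shifted immediates.

#include <cassert>
#include <cstdint>
#include <optional>
#include <utility>

// Illustrative only: split an immediate into (value, shift) so that
// value << shift reproduces the original, mirroring the '#1, lsl #12' form.
template <int Shift>
std::optional<std::pair<int64_t, unsigned>> toyGetShiftedVal(int64_t Imm) {
  if (Imm < 0)
    return std::nullopt;                                   // keep the toy simple
  if (Imm < (1 << 12))
    return std::make_pair(Imm, 0u);                        // fits unshifted
  if ((Imm & ((int64_t(1) << Shift) - 1)) == 0 && (Imm >> Shift) < (1 << 12))
    return std::make_pair(Imm >> Shift, unsigned(Shift));  // needs 'lsl #Shift'
  return std::nullopt;                                     // not representable
}

int main() {
  assert((*toyGetShiftedVal<12>(0x123000) == std::make_pair(int64_t(0x123), 12u)));
  assert((*toyGetShiftedVal<12>(0x7ff) == std::make_pair(int64_t(0x7ff), 0u)));
  assert(!toyGetShiftedVal<12>(0x123456));                 // neither form encodes it
  return 0;
}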
void addCondCodeOperands(MCInst &Inst, unsigned N) const {
@@ -1269,155 +1509,34 @@ public:
Inst.addOperand(MCOperand::createImm(MCE->getValue() / Scale));
}
- void addSImm9Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue()));
- }
-
- void addSImm10s8Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue() / 8));
- }
-
- void addSImm7s4Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue() / 4));
- }
-
- void addSImm7s8Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue() / 8));
- }
-
- void addSImm7s16Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue() / 16));
- }
-
- void addImm0_1Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue()));
- }
-
- void addImm0_7Operands(MCInst &Inst, unsigned N) const {
+ void addUImm6Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(MCE->getValue()));
}
- void addImm1_8Operands(MCInst &Inst, unsigned N) const {
+ template <int Scale>
+ void addImmScaledOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue()));
- }
-
- void addImm0_15Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue()));
- }
-
- void addImm1_16Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- assert(MCE && "Invalid constant immediate operand!");
- Inst.addOperand(MCOperand::createImm(MCE->getValue()));
- }
-
- void addImm0_31Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue()));
- }
-
- void addImm1_31Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue()));
- }
-
- void addImm1_32Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue()));
- }
-
- void addImm0_63Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue()));
- }
-
- void addImm1_63Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue()));
- }
-
- void addImm1_64Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue()));
- }
-
- void addImm0_127Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue()));
- }
-
- void addImm0_255Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue()));
- }
-
- void addImm0_65535Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue()));
- }
-
- void addImm32_63Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue()));
- }
-
- void addLogicalImm32Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- uint64_t encoding =
- AArch64_AM::encodeLogicalImmediate(MCE->getValue() & 0xFFFFFFFF, 32);
- Inst.addOperand(MCOperand::createImm(encoding));
- }
-
- void addLogicalImm64Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- uint64_t encoding = AArch64_AM::encodeLogicalImmediate(MCE->getValue(), 64);
- Inst.addOperand(MCOperand::createImm(encoding));
+ Inst.addOperand(MCOperand::createImm(MCE->getValue() / Scale));
}
- void addLogicalImm32NotOperands(MCInst &Inst, unsigned N) const {
+ template <typename T>
+ void addLogicalImmOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- int64_t Val = ~MCE->getValue() & 0xFFFFFFFF;
- uint64_t encoding = AArch64_AM::encodeLogicalImmediate(Val, 32);
+ typename std::make_unsigned<T>::type Val = MCE->getValue();
+ uint64_t encoding = AArch64_AM::encodeLogicalImmediate(Val, sizeof(T) * 8);
Inst.addOperand(MCOperand::createImm(encoding));
}
- void addLogicalImm64NotOperands(MCInst &Inst, unsigned N) const {
+ template <typename T>
+ void addLogicalImmNotOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- uint64_t encoding =
- AArch64_AM::encodeLogicalImmediate(~MCE->getValue(), 64);
+ typename std::make_unsigned<T>::type Val = ~MCE->getValue();
+ uint64_t encoding = AArch64_AM::encodeLogicalImmediate(Val, sizeof(T) * 8);
Inst.addOperand(MCOperand::createImm(encoding));
}
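The make_unsigned conversion above is what replaces the old explicit 0xFFFFFFFF mask: pushing the parsed 64-bit value through the unsigned counterpart of T wraps it to the operand width selected by sizeof(T) before it reaches encodeLogicalImmediate. A standalone sketch of just that conversion (function and variable names are illustrative):

#include <cassert>
#include <cstdint>
#include <type_traits>

template <typename T>
uint64_t truncateToOperandWidth(int64_t Parsed) {
  // Same idiom as addLogicalImmOperands: sizeof(T) picks the width,
  // the unsigned conversion wraps the value into that many bits.
  typename std::make_unsigned<T>::type Val = Parsed;
  return Val;  // zero-extended back to 64 bits so the result can be inspected
}

int main() {
  int64_t Parsed = ~int64_t(0xFF);  // 0xFFFFFFFFFFFFFF00
  assert(truncateToOperandWidth<int32_t>(Parsed) == (uint64_t(Parsed) & 0xFFFFFFFF));
  assert(truncateToOperandWidth<int64_t>(Parsed) == uint64_t(Parsed));
  return 0;
}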
@@ -1472,7 +1591,8 @@ public:
void addFPImmOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- Inst.addOperand(MCOperand::createImm(getFPImm()));
+ Inst.addOperand(MCOperand::createImm(
+ AArch64_AM::getFP64Imm(getFPImm().bitcastToAPInt())));
}
void addBarrierOperands(MCInst &Inst, unsigned N) const {
@@ -1606,35 +1726,49 @@ public:
}
static std::unique_ptr<AArch64Operand>
- CreateReg(unsigned RegNum, RegKind Kind, SMLoc S, SMLoc E, MCContext &Ctx) {
+ CreateReg(unsigned RegNum, RegKind Kind, SMLoc S, SMLoc E, MCContext &Ctx,
+ RegConstraintEqualityTy EqTy = RegConstraintEqualityTy::EqualsReg,
+ AArch64_AM::ShiftExtendType ExtTy = AArch64_AM::LSL,
+ unsigned ShiftAmount = 0,
+ unsigned HasExplicitAmount = false) {
auto Op = make_unique<AArch64Operand>(k_Register, Ctx);
Op->Reg.RegNum = RegNum;
Op->Reg.Kind = Kind;
+ Op->Reg.ElementWidth = 0;
+ Op->Reg.EqualityTy = EqTy;
+ Op->Reg.ShiftExtend.Type = ExtTy;
+ Op->Reg.ShiftExtend.Amount = ShiftAmount;
+ Op->Reg.ShiftExtend.HasExplicitAmount = HasExplicitAmount;
Op->StartLoc = S;
Op->EndLoc = E;
return Op;
}
static std::unique_ptr<AArch64Operand>
- CreateReg(unsigned RegNum, RegKind Kind, unsigned ElementWidth,
- SMLoc S, SMLoc E, MCContext &Ctx) {
- auto Op = make_unique<AArch64Operand>(k_Register, Ctx);
- Op->Reg.RegNum = RegNum;
+ CreateVectorReg(unsigned RegNum, RegKind Kind, unsigned ElementWidth,
+ SMLoc S, SMLoc E, MCContext &Ctx,
+ AArch64_AM::ShiftExtendType ExtTy = AArch64_AM::LSL,
+ unsigned ShiftAmount = 0,
+ unsigned HasExplicitAmount = false) {
+ assert((Kind == RegKind::NeonVector || Kind == RegKind::SVEDataVector ||
+ Kind == RegKind::SVEPredicateVector) &&
+ "Invalid vector kind");
+ auto Op = CreateReg(RegNum, Kind, S, E, Ctx, EqualsReg, ExtTy, ShiftAmount,
+ HasExplicitAmount);
Op->Reg.ElementWidth = ElementWidth;
- Op->Reg.Kind = Kind;
- Op->StartLoc = S;
- Op->EndLoc = E;
return Op;
}
static std::unique_ptr<AArch64Operand>
CreateVectorList(unsigned RegNum, unsigned Count, unsigned NumElements,
- char ElementKind, SMLoc S, SMLoc E, MCContext &Ctx) {
+ unsigned ElementWidth, RegKind RegisterKind, SMLoc S, SMLoc E,
+ MCContext &Ctx) {
auto Op = make_unique<AArch64Operand>(k_VectorList, Ctx);
Op->VectorList.RegNum = RegNum;
Op->VectorList.Count = Count;
Op->VectorList.NumElements = NumElements;
- Op->VectorList.ElementKind = ElementKind;
+ Op->VectorList.ElementWidth = ElementWidth;
+ Op->VectorList.RegisterKind = RegisterKind;
Op->StartLoc = S;
Op->EndLoc = E;
return Op;
@@ -1679,10 +1813,11 @@ public:
return Op;
}
- static std::unique_ptr<AArch64Operand> CreateFPImm(unsigned Val, SMLoc S,
- MCContext &Ctx) {
+ static std::unique_ptr<AArch64Operand>
+ CreateFPImm(APFloat Val, bool IsExact, SMLoc S, MCContext &Ctx) {
auto Op = make_unique<AArch64Operand>(k_FPImm, Ctx);
- Op->FPImm.Val = Val;
+ Op->FPImm.Val = Val.bitcastToAPInt().getSExtValue();
+ Op->FPImm.IsExact = IsExact;
Op->StartLoc = S;
Op->EndLoc = S;
return Op;
@@ -1770,8 +1905,10 @@ public:
void AArch64Operand::print(raw_ostream &OS) const {
switch (Kind) {
case k_FPImm:
- OS << "<fpimm " << getFPImm() << "("
- << AArch64_AM::getFPImmFloat(getFPImm()) << ") >";
+ OS << "<fpimm " << getFPImm().bitcastToAPInt().getZExtValue();
+ if (!getFPImmIsExact())
+ OS << " (inexact)";
+ OS << ">";
break;
case k_Barrier: {
StringRef Name = getBarrierName();
@@ -1794,9 +1931,6 @@ void AArch64Operand::print(raw_ostream &OS) const {
case k_CondCode:
OS << "<condcode " << getCondCode() << ">";
break;
- case k_Register:
- OS << "<register " << getReg() << ">";
- break;
case k_VectorList: {
OS << "<vectorlist ";
unsigned Reg = getVectorListStart();
@@ -1828,6 +1962,11 @@ void AArch64Operand::print(raw_ostream &OS) const {
case k_PSBHint:
OS << getPSBHintName();
break;
+ case k_Register:
+ OS << "<register " << getReg() << ">";
+ if (!getShiftExtendAmount() && !hasShiftExtendAmount())
+ break;
+ LLVM_FALLTHROUGH;
case k_ShiftExtend:
OS << "<" << AArch64_AM::getShiftExtendName(getShiftExtendType()) << " #"
<< getShiftExtendAmount();
@@ -1882,29 +2021,65 @@ static unsigned MatchNeonVectorRegName(StringRef Name) {
.Default(0);
}
-static bool isValidVectorKind(StringRef Name) {
- return StringSwitch<bool>(Name.lower())
- .Case(".8b", true)
- .Case(".16b", true)
- .Case(".4h", true)
- .Case(".8h", true)
- .Case(".2s", true)
- .Case(".4s", true)
- .Case(".1d", true)
- .Case(".2d", true)
- .Case(".1q", true)
- // Accept the width neutral ones, too, for verbose syntax. If those
- // aren't used in the right places, the token operand won't match so
- // all will work out.
- .Case(".b", true)
- .Case(".h", true)
- .Case(".s", true)
- .Case(".d", true)
- // Needed for fp16 scalar pairwise reductions
- .Case(".2h", true)
- // another special case for the ARMv8.2a dot product operand
- .Case(".4b", true)
- .Default(false);
+/// Returns an optional pair of (#elements, element-width) if Suffix
+/// is a valid vector kind. Where the number of elements in a vector
+/// or the vector width is implicit or explicitly unknown (but still a
+/// valid suffix kind), 0 is used.
+static Optional<std::pair<int, int>> parseVectorKind(StringRef Suffix,
+ RegKind VectorKind) {
+ std::pair<int, int> Res = {-1, -1};
+
+ switch (VectorKind) {
+ case RegKind::NeonVector:
+ Res =
+ StringSwitch<std::pair<int, int>>(Suffix.lower())
+ .Case("", {0, 0})
+ .Case(".1d", {1, 64})
+ .Case(".1q", {1, 128})
+ // '.2h' needed for fp16 scalar pairwise reductions
+ .Case(".2h", {2, 16})
+ .Case(".2s", {2, 32})
+ .Case(".2d", {2, 64})
+ // '.4b' is another special case for the ARMv8.2a dot product
+ // operand
+ .Case(".4b", {4, 8})
+ .Case(".4h", {4, 16})
+ .Case(".4s", {4, 32})
+ .Case(".8b", {8, 8})
+ .Case(".8h", {8, 16})
+ .Case(".16b", {16, 8})
+ // Accept the width neutral ones, too, for verbose syntax. If those
+ // aren't used in the right places, the token operand won't match so
+ // all will work out.
+ .Case(".b", {0, 8})
+ .Case(".h", {0, 16})
+ .Case(".s", {0, 32})
+ .Case(".d", {0, 64})
+ .Default({-1, -1});
+ break;
+ case RegKind::SVEPredicateVector:
+ case RegKind::SVEDataVector:
+ Res = StringSwitch<std::pair<int, int>>(Suffix.lower())
+ .Case("", {0, 0})
+ .Case(".b", {0, 8})
+ .Case(".h", {0, 16})
+ .Case(".s", {0, 32})
+ .Case(".d", {0, 64})
+ .Case(".q", {0, 128})
+ .Default({-1, -1});
+ break;
+ default:
+ llvm_unreachable("Unsupported RegKind");
+ }
+
+ if (Res == std::make_pair(-1, -1))
+ return Optional<std::pair<int, int>>();
+
+ return Optional<std::pair<int, int>>(Res);
+}
+
+static bool isValidVectorKind(StringRef Suffix, RegKind VectorKind) {
+ return parseVectorKind(Suffix, VectorKind).hasValue();
}
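A standalone sketch of how a caller unpacks the (#elements, element-width) pair that is returned for a suffix such as ".4s". It substitutes std::optional for llvm::Optional and a toy two-entry lookup for the StringSwitch above so that it compiles on its own.

#include <cassert>
#include <optional>
#include <string>
#include <tuple>
#include <utility>

// Toy stand-in for parseVectorKind(Suffix, RegKind::NeonVector).
static std::optional<std::pair<int, int>>
toyParseVectorKind(const std::string &Suffix) {
  if (Suffix == ".4s") return std::make_pair(4, 32);  // 4 elements of 32 bits
  if (Suffix == ".b")  return std::make_pair(0, 8);   // count implicit, 8-bit
  return std::nullopt;                                // unknown suffix
}

int main() {
  unsigned NumElements = 0, ElementWidth = 0;
  if (const auto &VK = toyParseVectorKind(".4s"))
    std::tie(NumElements, ElementWidth) = *VK;        // same idiom as the parser
  assert(NumElements == 4 && ElementWidth == 32);
  assert(!toyParseVectorKind(".z"));                  // invalid kind -> no value
  return 0;
}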
static unsigned matchSVEDataVectorRegName(StringRef Name) {
@@ -1965,40 +2140,12 @@ static unsigned matchSVEPredicateVectorRegName(StringRef Name) {
.Default(0);
}
-static bool isValidSVEKind(StringRef Name) {
- return StringSwitch<bool>(Name.lower())
- .Case(".b", true)
- .Case(".h", true)
- .Case(".s", true)
- .Case(".d", true)
- .Case(".q", true)
- .Default(false);
-}
-
-static void parseValidVectorKind(StringRef Name, unsigned &NumElements,
- char &ElementKind) {
- assert(isValidVectorKind(Name));
-
- ElementKind = Name.lower()[Name.size() - 1];
- NumElements = 0;
-
- if (Name.size() == 2)
- return;
-
- // Parse the lane count
- Name = Name.drop_front();
- while (isdigit(Name.front())) {
- NumElements = 10 * NumElements + (Name.front() - '0');
- Name = Name.drop_front();
- }
-}
-
bool AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
SMLoc &EndLoc) {
StartLoc = getLoc();
- RegNo = tryParseRegister();
+ auto Res = tryParseScalarRegister(RegNo);
EndLoc = SMLoc::getFromPointer(getLoc().getPointer() - 1);
- return (RegNo == (unsigned)-1);
+ return Res != MatchOperand_Success;
}
// Matches a register name or register alias previously defined by '.req'
@@ -2019,6 +2166,15 @@ unsigned AArch64AsmParser::matchRegisterNameAlias(StringRef Name,
return Kind == RegKind::Scalar ? RegNum : 0;
if (!RegNum) {
+ // Handle a few common aliases of registers.
+ if (auto RegNum = StringSwitch<unsigned>(Name.lower())
+ .Case("fp", AArch64::FP)
+ .Case("lr", AArch64::LR)
+ .Case("x31", AArch64::XZR)
+ .Case("w31", AArch64::WZR)
+ .Default(0))
+ return Kind == RegKind::Scalar ? RegNum : 0;
+
// Check for aliases registered via .req. Canonicalize to lower case.
// That's more consistent since register names are case insensitive, and
// it's how the original entry was passed in from MC/MCParser/AsmParser.
@@ -2033,65 +2189,24 @@ unsigned AArch64AsmParser::matchRegisterNameAlias(StringRef Name,
return RegNum;
}
-/// tryParseRegister - Try to parse a register name. The token must be an
+/// tryParseScalarRegister - Try to parse a register name. The token must be an
/// Identifier when called, and if it is a register name the token is eaten and
/// the register is added to the operand list.
-int AArch64AsmParser::tryParseRegister() {
+OperandMatchResultTy
+AArch64AsmParser::tryParseScalarRegister(unsigned &RegNum) {
MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
if (Tok.isNot(AsmToken::Identifier))
- return -1;
+ return MatchOperand_NoMatch;
std::string lowerCase = Tok.getString().lower();
- unsigned RegNum = matchRegisterNameAlias(lowerCase, RegKind::Scalar);
-
- // Also handle a few aliases of registers.
- if (RegNum == 0)
- RegNum = StringSwitch<unsigned>(lowerCase)
- .Case("fp", AArch64::FP)
- .Case("lr", AArch64::LR)
- .Case("x31", AArch64::XZR)
- .Case("w31", AArch64::WZR)
- .Default(0);
-
- if (RegNum == 0)
- return -1;
+ unsigned Reg = matchRegisterNameAlias(lowerCase, RegKind::Scalar);
+ if (Reg == 0)
+ return MatchOperand_NoMatch;
+ RegNum = Reg;
Parser.Lex(); // Eat identifier token.
- return RegNum;
-}
-
-/// tryMatchVectorRegister - Try to parse a vector register name with optional
-/// kind specifier. If it is a register specifier, eat the token and return it.
-int AArch64AsmParser::tryMatchVectorRegister(StringRef &Kind, bool expected) {
- MCAsmParser &Parser = getParser();
- if (Parser.getTok().isNot(AsmToken::Identifier)) {
- TokError("vector register expected");
- return -1;
- }
-
- StringRef Name = Parser.getTok().getString();
- // If there is a kind specifier, it's separated from the register name by
- // a '.'.
- size_t Start = 0, Next = Name.find('.');
- StringRef Head = Name.slice(Start, Next);
- unsigned RegNum = matchRegisterNameAlias(Head, RegKind::NeonVector);
-
- if (RegNum) {
- if (Next != StringRef::npos) {
- Kind = Name.slice(Next, StringRef::npos);
- if (!isValidVectorKind(Kind)) {
- TokError("invalid vector kind qualifier");
- return -1;
- }
- }
- Parser.Lex(); // Eat the register token.
- return RegNum;
- }
-
- if (expected)
- TokError("vector register expected");
- return -1;
+ return MatchOperand_Success;
}
/// tryParseSysCROperand - Try to parse a system instruction CR operand name.
@@ -2125,11 +2240,32 @@ AArch64AsmParser::tryParseSysCROperand(OperandVector &Operands) {
}
/// tryParsePrefetch - Try to parse a prefetch operand.
+template <bool IsSVEPrefetch>
OperandMatchResultTy
AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
const AsmToken &Tok = Parser.getTok();
+
+ auto LookupByName = [](StringRef N) {
+ if (IsSVEPrefetch) {
+ if (auto Res = AArch64SVEPRFM::lookupSVEPRFMByName(N))
+ return Optional<unsigned>(Res->Encoding);
+ } else if (auto Res = AArch64PRFM::lookupPRFMByName(N))
+ return Optional<unsigned>(Res->Encoding);
+ return Optional<unsigned>();
+ };
+
+ auto LookupByEncoding = [](unsigned E) {
+ if (IsSVEPrefetch) {
+ if (auto Res = AArch64SVEPRFM::lookupSVEPRFMByEncoding(E))
+ return Optional<StringRef>(Res->Name);
+ } else if (auto Res = AArch64PRFM::lookupPRFMByEncoding(E))
+ return Optional<StringRef>(Res->Name);
+ return Optional<StringRef>();
+ };
+ unsigned MaxVal = IsSVEPrefetch ? 15 : 31;
+
// Either an identifier for named values or a 5-bit immediate.
// Eat optional hash.
if (parseOptionalToken(AsmToken::Hash) ||
@@ -2144,31 +2280,32 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
return MatchOperand_ParseFail;
}
unsigned prfop = MCE->getValue();
- if (prfop > 31) {
- TokError("prefetch operand out of range, [0,31] expected");
+ if (prfop > MaxVal) {
+ TokError("prefetch operand out of range, [0," + utostr(MaxVal) +
+ "] expected");
return MatchOperand_ParseFail;
}
- auto PRFM = AArch64PRFM::lookupPRFMByEncoding(MCE->getValue());
+ auto PRFM = LookupByEncoding(MCE->getValue());
Operands.push_back(AArch64Operand::CreatePrefetch(
- prfop, PRFM ? PRFM->Name : "", S, getContext()));
+ prfop, PRFM.getValueOr(""), S, getContext()));
return MatchOperand_Success;
}
if (Tok.isNot(AsmToken::Identifier)) {
- TokError("pre-fetch hint expected");
+ TokError("prefetch hint expected");
return MatchOperand_ParseFail;
}
- auto PRFM = AArch64PRFM::lookupPRFMByName(Tok.getString());
+ auto PRFM = LookupByName(Tok.getString());
if (!PRFM) {
- TokError("pre-fetch hint expected");
+ TokError("prefetch hint expected");
return MatchOperand_ParseFail;
}
Parser.Lex(); // Eat identifier token.
Operands.push_back(AArch64Operand::CreatePrefetch(
- PRFM->Encoding, Tok.getString(), S, getContext()));
+ *PRFM, Tok.getString(), S, getContext()));
return MatchOperand_Success;
}
@@ -2253,17 +2390,21 @@ AArch64AsmParser::tryParseAdrLabel(OperandVector &Operands) {
SMLoc S = getLoc();
const MCExpr *Expr;
- parseOptionalToken(AsmToken::Hash);
- if (getParser().parseExpression(Expr))
- return MatchOperand_ParseFail;
+ const AsmToken &Tok = getParser().getTok();
+ if (parseOptionalToken(AsmToken::Hash) || Tok.is(AsmToken::Integer)) {
+ if (getParser().parseExpression(Expr))
+ return MatchOperand_ParseFail;
- SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
- Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext()));
+ SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+ Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext()));
- return MatchOperand_Success;
+ return MatchOperand_Success;
+ }
+ return MatchOperand_NoMatch;
}
/// tryParseFPImm - A floating point immediate expression operand.
+template<bool AddFPZeroAsLiteral>
OperandMatchResultTy
AArch64AsmParser::tryParseFPImm(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
@@ -2275,50 +2416,50 @@ AArch64AsmParser::tryParseFPImm(OperandVector &Operands) {
bool isNegative = parseOptionalToken(AsmToken::Minus);
const AsmToken &Tok = Parser.getTok();
- if (Tok.is(AsmToken::Real) || Tok.is(AsmToken::Integer)) {
- int64_t Val;
- if (Tok.is(AsmToken::Integer) && !isNegative && Tok.getString().startswith("0x")) {
- Val = Tok.getIntVal();
- if (Val > 255 || Val < 0) {
- TokError("encoded floating point value out of range");
- return MatchOperand_ParseFail;
- }
- } else {
- APFloat RealVal(APFloat::IEEEdouble(), Tok.getString());
- if (isNegative)
- RealVal.changeSign();
+ if (!Tok.is(AsmToken::Real) && !Tok.is(AsmToken::Integer)) {
+ if (!Hash)
+ return MatchOperand_NoMatch;
+ TokError("invalid floating point immediate");
+ return MatchOperand_ParseFail;
+ }
- uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
- Val = AArch64_AM::getFP64Imm(APInt(64, IntVal));
-
- // Check for out of range values. As an exception we let Zero through,
- // but as tokens instead of an FPImm so that it can be matched by the
- // appropriate alias if one exists.
- if (RealVal.isPosZero()) {
- Parser.Lex(); // Eat the token.
- Operands.push_back(AArch64Operand::CreateToken("#0", false, S, getContext()));
- Operands.push_back(AArch64Operand::CreateToken(".0", false, S, getContext()));
- return MatchOperand_Success;
- } else if (Val == -1) {
- TokError("expected compatible register or floating-point constant");
- return MatchOperand_ParseFail;
- }
+ // Parse hexadecimal representation.
+ if (Tok.is(AsmToken::Integer) && Tok.getString().startswith("0x")) {
+ if (Tok.getIntVal() > 255 || isNegative) {
+ TokError("encoded floating point value out of range");
+ return MatchOperand_ParseFail;
}
- Parser.Lex(); // Eat the token.
- Operands.push_back(AArch64Operand::CreateFPImm(Val, S, getContext()));
- return MatchOperand_Success;
+
+ APFloat F((double)AArch64_AM::getFPImmFloat(Tok.getIntVal()));
+ Operands.push_back(
+ AArch64Operand::CreateFPImm(F, true, S, getContext()));
+ } else {
+ // Parse FP representation.
+ APFloat RealVal(APFloat::IEEEdouble());
+ auto Status =
+ RealVal.convertFromString(Tok.getString(), APFloat::rmTowardZero);
+ if (isNegative)
+ RealVal.changeSign();
+
+ if (AddFPZeroAsLiteral && RealVal.isPosZero()) {
+ Operands.push_back(
+ AArch64Operand::CreateToken("#0", false, S, getContext()));
+ Operands.push_back(
+ AArch64Operand::CreateToken(".0", false, S, getContext()));
+ } else
+ Operands.push_back(AArch64Operand::CreateFPImm(
+ RealVal, Status == APFloat::opOK, S, getContext()));
}
- if (!Hash)
- return MatchOperand_NoMatch;
+ Parser.Lex(); // Eat the token.
- TokError("invalid floating point immediate");
- return MatchOperand_ParseFail;
+ return MatchOperand_Success;
}
-/// tryParseAddSubImm - Parse ADD/SUB shifted immediate operand
+/// tryParseImmWithOptionalShift - Parse immediate operand, optionally with
+/// a shift suffix, for example '#1, lsl #12'.
OperandMatchResultTy
-AArch64AsmParser::tryParseAddSubImm(OperandVector &Operands) {
+AArch64AsmParser::tryParseImmWithOptionalShift(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
@@ -2332,18 +2473,9 @@ AArch64AsmParser::tryParseAddSubImm(OperandVector &Operands) {
if (parseSymbolicImmVal(Imm))
return MatchOperand_ParseFail;
else if (Parser.getTok().isNot(AsmToken::Comma)) {
- uint64_t ShiftAmount = 0;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Imm);
- if (MCE) {
- int64_t Val = MCE->getValue();
- if (Val > 0xfff && (Val & 0xfff) == 0) {
- Imm = MCConstantExpr::create(Val >> 12, getContext());
- ShiftAmount = 12;
- }
- }
SMLoc E = Parser.getTok().getLoc();
- Operands.push_back(AArch64Operand::CreateShiftedImm(Imm, ShiftAmount, S, E,
- getContext()));
+ Operands.push_back(
+ AArch64Operand::CreateImm(Imm, S, E, getContext()));
return MatchOperand_Success;
}
@@ -2375,6 +2507,13 @@ AArch64AsmParser::tryParseAddSubImm(OperandVector &Operands) {
}
Parser.Lex(); // Eat the number
+ // Just in case the optional lsl #0 is used for immediates other than zero.
+ if (ShiftAmount == 0 && Imm != 0) {
+ SMLoc E = Parser.getTok().getLoc();
+ Operands.push_back(AArch64Operand::CreateImm(Imm, S, E, getContext()));
+ return MatchOperand_Success;
+ }
+
SMLoc E = Parser.getTok().getLoc();
Operands.push_back(AArch64Operand::CreateShiftedImm(Imm, ShiftAmount,
S, E, getContext()));
@@ -2403,6 +2542,22 @@ AArch64CC::CondCode AArch64AsmParser::parseCondCodeString(StringRef Cond) {
.Case("al", AArch64CC::AL)
.Case("nv", AArch64CC::NV)
.Default(AArch64CC::Invalid);
+
+ if (CC == AArch64CC::Invalid &&
+ getSTI().getFeatureBits()[AArch64::FeatureSVE])
+ CC = StringSwitch<AArch64CC::CondCode>(Cond.lower())
+ .Case("none", AArch64CC::EQ)
+ .Case("any", AArch64CC::NE)
+ .Case("nlast", AArch64CC::HS)
+ .Case("last", AArch64CC::LO)
+ .Case("first", AArch64CC::MI)
+ .Case("nfrst", AArch64CC::PL)
+ .Case("pmore", AArch64CC::HI)
+ .Case("plast", AArch64CC::LS)
+ .Case("tcont", AArch64CC::GE)
+ .Case("tstop", AArch64CC::LT)
+ .Default(AArch64CC::Invalid);
+
return CC;
}
@@ -2510,6 +2665,10 @@ static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) {
Str += "ARMv8.1a";
else if (FBS[AArch64::HasV8_2aOps])
Str += "ARMv8.2a";
+ else if (FBS[AArch64::HasV8_3aOps])
+ Str += "ARMv8.3a";
+ else if (FBS[AArch64::HasV8_4aOps])
+ Str += "ARMv8.4a";
else
Str += "(unknown)";
}
@@ -2620,9 +2779,11 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
+ if (Mnemonic == "tsb" && Tok.isNot(AsmToken::Identifier)) {
+ TokError("'csync' operand expected");
+ return MatchOperand_ParseFail;
// Can be either a #imm style literal or an option name
- if (parseOptionalToken(AsmToken::Hash) ||
- Tok.is(AsmToken::Integer)) {
+ } else if (parseOptionalToken(AsmToken::Hash) || Tok.is(AsmToken::Integer)) {
// Immediate operand.
const MCExpr *ImmVal;
SMLoc ExprLoc = getLoc();
@@ -2648,18 +2809,23 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
return MatchOperand_ParseFail;
}
+ auto TSB = AArch64TSB::lookupTSBByName(Tok.getString());
// The only valid named option for ISB is 'sy'
auto DB = AArch64DB::lookupDBByName(Tok.getString());
if (Mnemonic == "isb" && (!DB || DB->Encoding != AArch64DB::sy)) {
TokError("'sy' or #imm operand expected");
return MatchOperand_ParseFail;
- } else if (!DB) {
+ // The only valid named option for TSB is 'csync'
+ } else if (Mnemonic == "tsb" && (!TSB || TSB->Encoding != AArch64TSB::csync)) {
+ TokError("'csync' operand expected");
+ return MatchOperand_ParseFail;
+ } else if (!DB && !TSB) {
TokError("invalid barrier option name");
return MatchOperand_ParseFail;
}
Operands.push_back(AArch64Operand::CreateBarrier(
- DB->Encoding, Tok.getString(), getLoc(), getContext()));
+ DB ? DB->Encoding : TSB->Encoding, Tok.getString(), getLoc(), getContext()));
Parser.Lex(); // Consume the option
return MatchOperand_Success;
@@ -2703,12 +2869,20 @@ bool AArch64AsmParser::tryParseNeonVectorRegister(OperandVector &Operands) {
SMLoc S = getLoc();
// Check for a vector register specifier first.
StringRef Kind;
- int64_t Reg = tryMatchVectorRegister(Kind, false);
- if (Reg == -1)
+ unsigned Reg;
+ OperandMatchResultTy Res =
+ tryParseVectorRegister(Reg, Kind, RegKind::NeonVector);
+ if (Res != MatchOperand_Success)
+ return true;
+
+ const auto &KindRes = parseVectorKind(Kind, RegKind::NeonVector);
+ if (!KindRes)
return true;
+
+ unsigned ElementWidth = KindRes->second;
Operands.push_back(
- AArch64Operand::CreateReg(Reg, RegKind::NeonVector, S, getLoc(),
- getContext()));
+ AArch64Operand::CreateVectorReg(Reg, RegKind::NeonVector, ElementWidth,
+ S, getLoc(), getContext()));
// If there was an explicit qualifier, that goes on as a literal text
// operand.
@@ -2716,36 +2890,41 @@ bool AArch64AsmParser::tryParseNeonVectorRegister(OperandVector &Operands) {
Operands.push_back(
AArch64Operand::CreateToken(Kind, false, S, getContext()));
- // If there is an index specifier following the register, parse that too.
+ return tryParseVectorIndex(Operands) == MatchOperand_ParseFail;
+}
+
+OperandMatchResultTy
+AArch64AsmParser::tryParseVectorIndex(OperandVector &Operands) {
SMLoc SIdx = getLoc();
if (parseOptionalToken(AsmToken::LBrac)) {
const MCExpr *ImmVal;
if (getParser().parseExpression(ImmVal))
- return false;
+ return MatchOperand_NoMatch;
const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
if (!MCE) {
TokError("immediate value expected for vector index");
- return false;
+ return MatchOperand_ParseFail;
}
SMLoc E = getLoc();
if (parseToken(AsmToken::RBrac, "']' expected"))
- return false;
+ return MatchOperand_ParseFail;
Operands.push_back(AArch64Operand::CreateVectorIndex(MCE->getValue(), SIdx,
E, getContext()));
+ return MatchOperand_Success;
}
- return false;
+ return MatchOperand_NoMatch;
}
-// tryParseSVEDataVectorRegister - Try to parse a SVE vector register name with
+// tryParseVectorRegister - Try to parse a vector register name with
// optional kind specifier. If it is a register specifier, eat the token
// and return it.
OperandMatchResultTy
-AArch64AsmParser::tryParseSVERegister(int &Reg, StringRef &Kind,
- RegKind MatchKind) {
+AArch64AsmParser::tryParseVectorRegister(unsigned &Reg, StringRef &Kind,
+ RegKind MatchKind) {
MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
@@ -2762,8 +2941,8 @@ AArch64AsmParser::tryParseSVERegister(int &Reg, StringRef &Kind,
if (RegNum) {
if (Next != StringRef::npos) {
Kind = Name.slice(Next, StringRef::npos);
- if (!isValidSVEKind(Kind)) {
- TokError("invalid sve vector kind qualifier");
+ if (!isValidVectorKind(Kind, MatchKind)) {
+ TokError("invalid vector kind qualifier");
return MatchOperand_ParseFail;
}
}
@@ -2782,45 +2961,64 @@ AArch64AsmParser::tryParseSVEPredicateVector(OperandVector &Operands) {
// Check for a SVE predicate register specifier first.
const SMLoc S = getLoc();
StringRef Kind;
- int RegNum = -1;
- auto Res = tryParseSVERegister(RegNum, Kind, RegKind::SVEPredicateVector);
+ unsigned RegNum;
+ auto Res = tryParseVectorRegister(RegNum, Kind, RegKind::SVEPredicateVector);
if (Res != MatchOperand_Success)
return Res;
- unsigned ElementWidth = StringSwitch<unsigned>(Kind.lower())
- .Case("", -1)
- .Case(".b", 8)
- .Case(".h", 16)
- .Case(".s", 32)
- .Case(".d", 64)
- .Case(".q", 128)
- .Default(0);
-
- if (!ElementWidth)
+ const auto &KindRes = parseVectorKind(Kind, RegKind::SVEPredicateVector);
+ if (!KindRes)
return MatchOperand_NoMatch;
+ unsigned ElementWidth = KindRes->second;
+ Operands.push_back(AArch64Operand::CreateVectorReg(
+ RegNum, RegKind::SVEPredicateVector, ElementWidth, S,
+ getLoc(), getContext()));
+
+ // Not all predicates are followed by a '/m' or '/z'.
+ MCAsmParser &Parser = getParser();
+ if (Parser.getTok().isNot(AsmToken::Slash))
+ return MatchOperand_Success;
+
+ // But when they do, they shouldn't have an element type suffix.
+ if (!Kind.empty()) {
+ Error(S, "not expecting size suffix");
+ return MatchOperand_ParseFail;
+ }
+
+ // Add a literal slash as operand
Operands.push_back(
- AArch64Operand::CreateReg(RegNum, RegKind::SVEPredicateVector,
- ElementWidth, S, getLoc(), getContext()));
+ AArch64Operand::CreateToken("/" , false, getLoc(), getContext()));
+ Parser.Lex(); // Eat the slash.
+
+ // Zeroing or merging?
+ auto Pred = Parser.getTok().getString().lower();
+ if (Pred != "z" && Pred != "m") {
+ Error(getLoc(), "expecting 'm' or 'z' predication");
+ return MatchOperand_ParseFail;
+ }
+
+ // Add zero/merge token.
+ const char *ZM = Pred == "z" ? "z" : "m";
+ Operands.push_back(
+ AArch64Operand::CreateToken(ZM, false, getLoc(), getContext()));
+
+ Parser.Lex(); // Eat zero/merge token.
return MatchOperand_Success;
}
-/// parseRegister - Parse a non-vector register operand.
+/// parseRegister - Parse a register operand.
bool AArch64AsmParser::parseRegister(OperandVector &Operands) {
- SMLoc S = getLoc();
- // Try for a vector (neon) register.
+ // Try for a Neon vector register.
if (!tryParseNeonVectorRegister(Operands))
return false;
- // Try for a scalar register.
- int64_t Reg = tryParseRegister();
- if (Reg == -1)
- return true;
- Operands.push_back(AArch64Operand::CreateReg(Reg, RegKind::Scalar, S,
- getLoc(), getContext()));
+ // Otherwise try for a scalar register.
+ if (tryParseGPROperand<false>(Operands) == MatchOperand_Success)
+ return false;
- return false;
+ return true;
}
bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) {
@@ -2871,6 +3069,8 @@ bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) {
.Case("gottprel_g1", AArch64MCExpr::VK_GOTTPREL_G1)
.Case("gottprel_g0_nc", AArch64MCExpr::VK_GOTTPREL_G0_NC)
.Case("tlsdesc", AArch64MCExpr::VK_TLSDESC_PAGE)
+ .Case("secrel_lo12", AArch64MCExpr::VK_SECREL_LO12)
+ .Case("secrel_hi12", AArch64MCExpr::VK_SECREL_HI12)
.Default(AArch64MCExpr::VK_INVALID);
if (RefKind == AArch64MCExpr::VK_INVALID)
@@ -2891,33 +3091,74 @@ bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) {
return false;
}
-/// parseVectorList - Parse a vector list operand for AdvSIMD instructions.
-bool AArch64AsmParser::parseVectorList(OperandVector &Operands) {
+template <RegKind VectorKind>
+OperandMatchResultTy
+AArch64AsmParser::tryParseVectorList(OperandVector &Operands,
+ bool ExpectMatch) {
MCAsmParser &Parser = getParser();
- assert(Parser.getTok().is(AsmToken::LCurly) && "Token is not a Left Bracket");
+ if (!Parser.getTok().is(AsmToken::LCurly))
+ return MatchOperand_NoMatch;
+
+ // Wrapper around parse function
+ auto ParseVector = [this, &Parser](unsigned &Reg, StringRef &Kind, SMLoc Loc,
+ bool NoMatchIsError) {
+ auto RegTok = Parser.getTok();
+ auto ParseRes = tryParseVectorRegister(Reg, Kind, VectorKind);
+ if (ParseRes == MatchOperand_Success) {
+ if (parseVectorKind(Kind, VectorKind))
+ return ParseRes;
+ llvm_unreachable("Expected a valid vector kind");
+ }
+
+ if (RegTok.isNot(AsmToken::Identifier) ||
+ ParseRes == MatchOperand_ParseFail ||
+ (ParseRes == MatchOperand_NoMatch && NoMatchIsError)) {
+ Error(Loc, "vector register expected");
+ return MatchOperand_ParseFail;
+ }
+
+ return MatchOperand_NoMatch;
+ };
+
SMLoc S = getLoc();
+ auto LCurly = Parser.getTok();
Parser.Lex(); // Eat left bracket token.
+
StringRef Kind;
- int64_t FirstReg = tryMatchVectorRegister(Kind, true);
- if (FirstReg == -1)
- return true;
+ unsigned FirstReg;
+ auto ParseRes = ParseVector(FirstReg, Kind, getLoc(), ExpectMatch);
+
+ // Put back the original left bracket if there was no match, so that
+ // different types of list-operands can be matched (e.g. SVE, Neon).
+ if (ParseRes == MatchOperand_NoMatch)
+ Parser.getLexer().UnLex(LCurly);
+
+ if (ParseRes != MatchOperand_Success)
+ return ParseRes;
+
int64_t PrevReg = FirstReg;
unsigned Count = 1;
if (parseOptionalToken(AsmToken::Minus)) {
SMLoc Loc = getLoc();
StringRef NextKind;
- int64_t Reg = tryMatchVectorRegister(NextKind, true);
- if (Reg == -1)
- return true;
+
+ unsigned Reg;
+ ParseRes = ParseVector(Reg, NextKind, getLoc(), true);
+ if (ParseRes != MatchOperand_Success)
+ return ParseRes;
+
// Any Kind suffixes must match on all regs in the list.
- if (Kind != NextKind)
- return Error(Loc, "mismatched register size suffix");
+ if (Kind != NextKind) {
+ Error(Loc, "mismatched register size suffix");
+ return MatchOperand_ParseFail;
+ }
unsigned Space = (PrevReg < Reg) ? (Reg - PrevReg) : (Reg + 32 - PrevReg);
if (Space == 0 || Space > 3) {
- return Error(Loc, "invalid number of vectors");
+ Error(Loc, "invalid number of vectors");
+ return MatchOperand_ParseFail;
}
Count += Space;
@@ -2926,17 +3167,23 @@ bool AArch64AsmParser::parseVectorList(OperandVector &Operands) {
while (parseOptionalToken(AsmToken::Comma)) {
SMLoc Loc = getLoc();
StringRef NextKind;
- int64_t Reg = tryMatchVectorRegister(NextKind, true);
- if (Reg == -1)
- return true;
+ unsigned Reg;
+ ParseRes = ParseVector(Reg, NextKind, getLoc(), true);
+ if (ParseRes != MatchOperand_Success)
+ return ParseRes;
+
// Any Kind suffixes must match on all regs in the list.
- if (Kind != NextKind)
- return Error(Loc, "mismatched register size suffix");
+ if (Kind != NextKind) {
+ Error(Loc, "mismatched register size suffix");
+ return MatchOperand_ParseFail;
+ }
// Registers must be incremental (with wraparound at 31)
if (getContext().getRegisterInfo()->getEncodingValue(Reg) !=
- (getContext().getRegisterInfo()->getEncodingValue(PrevReg) + 1) % 32)
- return Error(Loc, "registers must be sequential");
+ (getContext().getRegisterInfo()->getEncodingValue(PrevReg) + 1) % 32) {
+ Error(Loc, "registers must be sequential");
+ return MatchOperand_ParseFail;
+ }
PrevReg = Reg;
++Count;
@@ -2944,83 +3191,146 @@ bool AArch64AsmParser::parseVectorList(OperandVector &Operands) {
}
if (parseToken(AsmToken::RCurly, "'}' expected"))
- return true;
+ return MatchOperand_ParseFail;
- if (Count > 4)
- return Error(S, "invalid number of vectors");
+ if (Count > 4) {
+ Error(S, "invalid number of vectors");
+ return MatchOperand_ParseFail;
+ }
unsigned NumElements = 0;
- char ElementKind = 0;
- if (!Kind.empty())
- parseValidVectorKind(Kind, NumElements, ElementKind);
+ unsigned ElementWidth = 0;
+ if (!Kind.empty()) {
+ if (const auto &VK = parseVectorKind(Kind, VectorKind))
+ std::tie(NumElements, ElementWidth) = *VK;
+ }
Operands.push_back(AArch64Operand::CreateVectorList(
- FirstReg, Count, NumElements, ElementKind, S, getLoc(), getContext()));
+ FirstReg, Count, NumElements, ElementWidth, VectorKind, S, getLoc(),
+ getContext()));
- // If there is an index specifier following the list, parse that too.
- SMLoc SIdx = getLoc();
- if (parseOptionalToken(AsmToken::LBrac)) { // Eat left bracket token.
- const MCExpr *ImmVal;
- if (getParser().parseExpression(ImmVal))
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
- if (!MCE) {
- TokError("immediate value expected for vector index");
- return false;
- }
+ return MatchOperand_Success;
+}
- SMLoc E = getLoc();
- if (parseToken(AsmToken::RBrac, "']' expected"))
- return false;
+/// parseNeonVectorList - Parse a vector list operand for AdvSIMD instructions.
+bool AArch64AsmParser::parseNeonVectorList(OperandVector &Operands) {
+ auto ParseRes = tryParseVectorList<RegKind::NeonVector>(Operands, true);
+ if (ParseRes != MatchOperand_Success)
+ return true;
- Operands.push_back(AArch64Operand::CreateVectorIndex(MCE->getValue(), SIdx,
- E, getContext()));
- }
- return false;
+ return tryParseVectorIndex(Operands) == MatchOperand_ParseFail;
}
OperandMatchResultTy
AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) {
- MCAsmParser &Parser = getParser();
- const AsmToken &Tok = Parser.getTok();
- if (!Tok.is(AsmToken::Identifier))
- return MatchOperand_NoMatch;
+ SMLoc StartLoc = getLoc();
- unsigned RegNum = matchRegisterNameAlias(Tok.getString().lower(), RegKind::Scalar);
-
- MCContext &Ctx = getContext();
- const MCRegisterInfo *RI = Ctx.getRegisterInfo();
- if (!RI->getRegClass(AArch64::GPR64spRegClassID).contains(RegNum))
- return MatchOperand_NoMatch;
-
- SMLoc S = getLoc();
- Parser.Lex(); // Eat register
+ unsigned RegNum;
+ OperandMatchResultTy Res = tryParseScalarRegister(RegNum);
+ if (Res != MatchOperand_Success)
+ return Res;
if (!parseOptionalToken(AsmToken::Comma)) {
- Operands.push_back(
- AArch64Operand::CreateReg(RegNum, RegKind::Scalar, S, getLoc(), Ctx));
+ Operands.push_back(AArch64Operand::CreateReg(
+ RegNum, RegKind::Scalar, StartLoc, getLoc(), getContext()));
return MatchOperand_Success;
}
parseOptionalToken(AsmToken::Hash);
- if (Parser.getTok().isNot(AsmToken::Integer)) {
+ if (getParser().getTok().isNot(AsmToken::Integer)) {
Error(getLoc(), "index must be absent or #0");
return MatchOperand_ParseFail;
}
const MCExpr *ImmVal;
- if (Parser.parseExpression(ImmVal) || !isa<MCConstantExpr>(ImmVal) ||
+ if (getParser().parseExpression(ImmVal) || !isa<MCConstantExpr>(ImmVal) ||
cast<MCConstantExpr>(ImmVal)->getValue() != 0) {
Error(getLoc(), "index must be absent or #0");
return MatchOperand_ParseFail;
}
- Operands.push_back(
- AArch64Operand::CreateReg(RegNum, RegKind::Scalar, S, getLoc(), Ctx));
+ Operands.push_back(AArch64Operand::CreateReg(
+ RegNum, RegKind::Scalar, StartLoc, getLoc(), getContext()));
return MatchOperand_Success;
}
+template <bool ParseShiftExtend, RegConstraintEqualityTy EqTy>
+OperandMatchResultTy
+AArch64AsmParser::tryParseGPROperand(OperandVector &Operands) {
+ SMLoc StartLoc = getLoc();
+
+ unsigned RegNum;
+ OperandMatchResultTy Res = tryParseScalarRegister(RegNum);
+ if (Res != MatchOperand_Success)
+ return Res;
+
+ // No shift/extend is the default.
+ if (!ParseShiftExtend || getParser().getTok().isNot(AsmToken::Comma)) {
+ Operands.push_back(AArch64Operand::CreateReg(
+ RegNum, RegKind::Scalar, StartLoc, getLoc(), getContext(), EqTy));
+ return MatchOperand_Success;
+ }
+
+ // Eat the comma
+ getParser().Lex();
+
+ // Match the shift
+ SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> ExtOpnd;
+ Res = tryParseOptionalShiftExtend(ExtOpnd);
+ if (Res != MatchOperand_Success)
+ return Res;
+
+ auto Ext = static_cast<AArch64Operand*>(ExtOpnd.back().get());
+ Operands.push_back(AArch64Operand::CreateReg(
+ RegNum, RegKind::Scalar, StartLoc, Ext->getEndLoc(), getContext(), EqTy,
+ Ext->getShiftExtendType(), Ext->getShiftExtendAmount(),
+ Ext->hasShiftExtendAmount()));
+
+ return MatchOperand_Success;
+}
+
+bool AArch64AsmParser::parseOptionalMulOperand(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+
+ // Some SVE instructions have a decoration after the immediate, i.e.
+ // "mul vl". We parse them here and add tokens, which must be present in the
+ // asm string in the tablegen instruction.
+ bool NextIsVL = Parser.getLexer().peekTok().getString().equals_lower("vl");
+ bool NextIsHash = Parser.getLexer().peekTok().is(AsmToken::Hash);
+ if (!Parser.getTok().getString().equals_lower("mul") ||
+ !(NextIsVL || NextIsHash))
+ return true;
+
+ Operands.push_back(
+ AArch64Operand::CreateToken("mul", false, getLoc(), getContext()));
+ Parser.Lex(); // Eat the "mul"
+
+ if (NextIsVL) {
+ Operands.push_back(
+ AArch64Operand::CreateToken("vl", false, getLoc(), getContext()));
+ Parser.Lex(); // Eat the "vl"
+ return false;
+ }
+
+ if (NextIsHash) {
+ Parser.Lex(); // Eat the #
+ SMLoc S = getLoc();
+
+ // Parse immediate operand.
+ const MCExpr *ImmVal;
+ if (!Parser.parseExpression(ImmVal))
+ if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal)) {
+ Operands.push_back(AArch64Operand::CreateImm(
+ MCConstantExpr::create(MCE->getValue(), getContext()), S, getLoc(),
+ getContext()));
+ return MatchOperand_Success;
+ }
+ }
+
+ return Error(getLoc(), "expected 'vl' or '#<imm>'");
+}
+
/// parseOperand - Parse an AArch64 instruction operand. For now this parses the
/// operand regardless of the mnemonic.
bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
@@ -3064,7 +3374,7 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
return parseOperand(Operands, false, false);
}
case AsmToken::LCurly:
- return parseVectorList(Operands);
+ return parseNeonVectorList(Operands);
case AsmToken::Identifier: {
// If we're expecting a Condition Code operand, then just parse that.
if (isCondCode)
@@ -3074,6 +3384,11 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
if (!parseRegister(Operands))
return false;
+ // See if this is a "mul vl" decoration or "mul #<int>" operand used
+ // by SVE instructions.
+ if (!parseOptionalMulOperand(Operands))
+ return false;
+
// This could be an optional "shift" or "extend" operand.
OperandMatchResultTy GotShift = tryParseOptionalShiftExtend(Operands);
// We can only continue if no tokens were eaten.
@@ -3117,7 +3432,7 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
if (Mnemonic != "fcmp" && Mnemonic != "fcmpe" && Mnemonic != "fcmeq" &&
Mnemonic != "fcmge" && Mnemonic != "fcmgt" && Mnemonic != "fcmle" &&
- Mnemonic != "fcmlt")
+ Mnemonic != "fcmlt" && Mnemonic != "fcmne")
return TokError("unexpected floating point literal");
else if (IntVal != 0 || isNegative)
return TokError("expected floating-point constant #0.0");
@@ -3148,7 +3463,7 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
return true;
if (Operands.size() < 2 ||
- !static_cast<AArch64Operand &>(*Operands[1]).isReg())
+ !static_cast<AArch64Operand &>(*Operands[1]).isScalarReg())
return Error(Loc, "Only valid when first operand is register");
bool IsXReg =
@@ -3188,6 +3503,30 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
}
}
+bool AArch64AsmParser::regsEqual(const MCParsedAsmOperand &Op1,
+ const MCParsedAsmOperand &Op2) const {
+ auto &AOp1 = static_cast<const AArch64Operand&>(Op1);
+ auto &AOp2 = static_cast<const AArch64Operand&>(Op2);
+ if (AOp1.getRegEqualityTy() == RegConstraintEqualityTy::EqualsReg &&
+ AOp2.getRegEqualityTy() == RegConstraintEqualityTy::EqualsReg)
+ return MCTargetAsmParser::regsEqual(Op1, Op2);
+
+ assert(AOp1.isScalarReg() && AOp2.isScalarReg() &&
+ "Testing equality of non-scalar registers not supported");
+
+ // Check if the registers match their sub/super register classes.
+ if (AOp1.getRegEqualityTy() == EqualsSuperReg)
+ return getXRegFromWReg(Op1.getReg()) == Op2.getReg();
+ if (AOp1.getRegEqualityTy() == EqualsSubReg)
+ return getWRegFromXReg(Op1.getReg()) == Op2.getReg();
+ if (AOp2.getRegEqualityTy() == EqualsSuperReg)
+ return getXRegFromWReg(Op2.getReg()) == Op1.getReg();
+ if (AOp2.getRegEqualityTy() == EqualsSubReg)
+ return getWRegFromXReg(Op2.getReg()) == Op1.getReg();
+
+ return false;
+}
+
/// ParseInstruction - Parse an AArch64 instruction mnemonic followed by its
/// operands.
bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info,
@@ -3446,7 +3785,39 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst,
"is also a source");
break;
}
+ case AArch64::STXRB:
+ case AArch64::STXRH:
+ case AArch64::STXRW:
+ case AArch64::STXRX:
+ case AArch64::STLXRB:
+ case AArch64::STLXRH:
+ case AArch64::STLXRW:
+ case AArch64::STLXRX: {
+ unsigned Rs = Inst.getOperand(0).getReg();
+ unsigned Rt = Inst.getOperand(1).getReg();
+ unsigned Rn = Inst.getOperand(2).getReg();
+ if (RI->isSubRegisterEq(Rt, Rs) ||
+ (RI->isSubRegisterEq(Rn, Rs) && Rn != AArch64::SP))
+ return Error(Loc[0],
+ "unpredictable STXR instruction, status is also a source");
+ break;
+ }
+ case AArch64::STXPW:
+ case AArch64::STXPX:
+ case AArch64::STLXPW:
+ case AArch64::STLXPX: {
+ unsigned Rs = Inst.getOperand(0).getReg();
+ unsigned Rt1 = Inst.getOperand(1).getReg();
+ unsigned Rt2 = Inst.getOperand(2).getReg();
+ unsigned Rn = Inst.getOperand(3).getReg();
+ if (RI->isSubRegisterEq(Rt1, Rs) || RI->isSubRegisterEq(Rt2, Rs) ||
+ (RI->isSubRegisterEq(Rn, Rs) && Rn != AArch64::SP))
+ return Error(Loc[0],
+ "unpredictable STXP instruction, status is also a source");
+ break;
}
+ }
+
// Now check immediate ranges. Separate from the above as there is overlap
// in the instructions being checked and this keeps the nested conditionals
@@ -3483,7 +3854,9 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst,
ELFRefKind == AArch64MCExpr::VK_TPREL_HI12 ||
ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 ||
ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC ||
- ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12) &&
+ ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12 ||
+ ELFRefKind == AArch64MCExpr::VK_SECREL_LO12 ||
+ ELFRefKind == AArch64MCExpr::VK_SECREL_HI12) &&
(Inst.getOpcode() == AArch64::ADDXri ||
Inst.getOpcode() == AArch64::ADDWri))
return false;
@@ -3507,8 +3880,23 @@ static std::string AArch64MnemonicSpellCheck(StringRef S, uint64_t FBS,
unsigned VariantID = 0);
bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode,
+ uint64_t ErrorInfo,
OperandVector &Operands) {
switch (ErrCode) {
+ case Match_InvalidTiedOperand: {
+ RegConstraintEqualityTy EqTy =
+ static_cast<const AArch64Operand &>(*Operands[ErrorInfo])
+ .getRegEqualityTy();
+ switch (EqTy) {
+ case RegConstraintEqualityTy::EqualsSubReg:
+ return Error(Loc, "operand must be 64-bit form of destination register");
+ case RegConstraintEqualityTy::EqualsSuperReg:
+ return Error(Loc, "operand must be 32-bit form of destination register");
+ case RegConstraintEqualityTy::EqualsReg:
+ return Error(Loc, "operand must match destination register");
+ }
+ llvm_unreachable("Unknown RegConstraintEqualityTy");
+ }
case Match_MissingFeature:
return Error(Loc,
"instruction requires a CPU feature not currently enabled");
@@ -3542,9 +3930,27 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode,
case Match_InvalidFPImm:
return Error(Loc,
"expected compatible register or floating-point constant");
+ case Match_InvalidMemoryIndexedSImm6:
+ return Error(Loc, "index must be an integer in range [-32, 31].");
+ case Match_InvalidMemoryIndexedSImm5:
+ return Error(Loc, "index must be an integer in range [-16, 15].");
+ case Match_InvalidMemoryIndexed1SImm4:
+ return Error(Loc, "index must be an integer in range [-8, 7].");
+ case Match_InvalidMemoryIndexed2SImm4:
+ return Error(Loc, "index must be a multiple of 2 in range [-16, 14].");
+ case Match_InvalidMemoryIndexed3SImm4:
+ return Error(Loc, "index must be a multiple of 3 in range [-24, 21].");
+ case Match_InvalidMemoryIndexed4SImm4:
+ return Error(Loc, "index must be a multiple of 4 in range [-32, 28].");
+ case Match_InvalidMemoryIndexed16SImm4:
+ return Error(Loc, "index must be a multiple of 16 in range [-128, 112].");
+ case Match_InvalidMemoryIndexed1SImm6:
+ return Error(Loc, "index must be an integer in range [-32, 31].");
+ case Match_InvalidMemoryIndexedSImm8:
+ return Error(Loc, "index must be an integer in range [-128, 127].");
case Match_InvalidMemoryIndexedSImm9:
return Error(Loc, "index must be an integer in range [-256, 255].");
- case Match_InvalidMemoryIndexedSImm10:
+ case Match_InvalidMemoryIndexed8SImm10:
return Error(Loc, "index must be a multiple of 8 in range [-4096, 4088].");
case Match_InvalidMemoryIndexed4SImm7:
return Error(Loc, "index must be a multiple of 4 in range [-256, 252].");
@@ -3552,6 +3958,20 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode,
return Error(Loc, "index must be a multiple of 8 in range [-512, 504].");
case Match_InvalidMemoryIndexed16SImm7:
return Error(Loc, "index must be a multiple of 16 in range [-1024, 1008].");
+ case Match_InvalidMemoryIndexed8UImm5:
+ return Error(Loc, "index must be a multiple of 8 in range [0, 248].");
+ case Match_InvalidMemoryIndexed4UImm5:
+ return Error(Loc, "index must be a multiple of 4 in range [0, 124].");
+ case Match_InvalidMemoryIndexed2UImm5:
+ return Error(Loc, "index must be a multiple of 2 in range [0, 62].");
+ case Match_InvalidMemoryIndexed8UImm6:
+ return Error(Loc, "index must be a multiple of 8 in range [0, 504].");
+ case Match_InvalidMemoryIndexed4UImm6:
+ return Error(Loc, "index must be a multiple of 4 in range [0, 252].");
+ case Match_InvalidMemoryIndexed2UImm6:
+ return Error(Loc, "index must be a multiple of 2 in range [0, 126].");
+ case Match_InvalidMemoryIndexed1UImm6:
+ return Error(Loc, "index must be in range [0, 63].");
case Match_InvalidMemoryWExtend8:
return Error(Loc,
"expected 'uxtw' or 'sxtw' with optional shift of #0");
@@ -3616,16 +4036,44 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode,
return Error(Loc, "immediate must be an integer in range [1, 32].");
case Match_InvalidImm1_64:
return Error(Loc, "immediate must be an integer in range [1, 64].");
- case Match_InvalidIndex1:
+ case Match_InvalidSVEAddSubImm8:
+ return Error(Loc, "immediate must be an integer in range [0, 255]"
+ " with a shift amount of 0");
+ case Match_InvalidSVEAddSubImm16:
+ case Match_InvalidSVEAddSubImm32:
+ case Match_InvalidSVEAddSubImm64:
+ return Error(Loc, "immediate must be an integer in range [0, 255] or a "
+ "multiple of 256 in range [256, 65280]");
+ case Match_InvalidSVECpyImm8:
+ return Error(Loc, "immediate must be an integer in range [-128, 255]"
+ " with a shift amount of 0");
+ case Match_InvalidSVECpyImm16:
+ return Error(Loc, "immediate must be an integer in range [-128, 127] or a "
+ "multiple of 256 in range [-32768, 65280]");
+ case Match_InvalidSVECpyImm32:
+ case Match_InvalidSVECpyImm64:
+ return Error(Loc, "immediate must be an integer in range [-128, 127] or a "
+ "multiple of 256 in range [-32768, 32512]");
+ case Match_InvalidIndexRange1_1:
return Error(Loc, "expected lane specifier '[1]'");
- case Match_InvalidIndexB:
+ case Match_InvalidIndexRange0_15:
return Error(Loc, "vector lane must be an integer in range [0, 15].");
- case Match_InvalidIndexH:
+ case Match_InvalidIndexRange0_7:
return Error(Loc, "vector lane must be an integer in range [0, 7].");
- case Match_InvalidIndexS:
+ case Match_InvalidIndexRange0_3:
return Error(Loc, "vector lane must be an integer in range [0, 3].");
- case Match_InvalidIndexD:
+ case Match_InvalidIndexRange0_1:
return Error(Loc, "vector lane must be an integer in range [0, 1].");
+ case Match_InvalidSVEIndexRange0_63:
+ return Error(Loc, "vector lane must be an integer in range [0, 63].");
+ case Match_InvalidSVEIndexRange0_31:
+ return Error(Loc, "vector lane must be an integer in range [0, 31].");
+ case Match_InvalidSVEIndexRange0_15:
+ return Error(Loc, "vector lane must be an integer in range [0, 15].");
+ case Match_InvalidSVEIndexRange0_7:
+ return Error(Loc, "vector lane must be an integer in range [0, 7].");
+ case Match_InvalidSVEIndexRange0_3:
+ return Error(Loc, "vector lane must be an integer in range [0, 3].");
case Match_InvalidLabel:
return Error(Loc, "expected label or encodable integer pc offset");
case Match_MRS:
@@ -3642,12 +4090,102 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode,
ComputeAvailableFeatures(STI->getFeatureBits()));
return Error(Loc, "unrecognized instruction mnemonic" + Suggestion);
}
+ case Match_InvalidGPR64shifted8:
+ return Error(Loc, "register must be x0..x30 or xzr, without shift");
+ case Match_InvalidGPR64shifted16:
+ return Error(Loc, "register must be x0..x30 or xzr, with required shift 'lsl #1'");
+ case Match_InvalidGPR64shifted32:
+ return Error(Loc, "register must be x0..x30 or xzr, with required shift 'lsl #2'");
+ case Match_InvalidGPR64shifted64:
+ return Error(Loc, "register must be x0..x30 or xzr, with required shift 'lsl #3'");
+ case Match_InvalidGPR64NoXZRshifted8:
+ return Error(Loc, "register must be x0..x30 without shift");
+ case Match_InvalidGPR64NoXZRshifted16:
+ return Error(Loc, "register must be x0..x30 with required shift 'lsl #1'");
+ case Match_InvalidGPR64NoXZRshifted32:
+ return Error(Loc, "register must be x0..x30 with required shift 'lsl #2'");
+ case Match_InvalidGPR64NoXZRshifted64:
+ return Error(Loc, "register must be x0..x30 with required shift 'lsl #3'");
+ case Match_InvalidZPR32UXTW8:
+ case Match_InvalidZPR32SXTW8:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].s, (uxtw|sxtw)'");
+ case Match_InvalidZPR32UXTW16:
+ case Match_InvalidZPR32SXTW16:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].s, (uxtw|sxtw) #1'");
+ case Match_InvalidZPR32UXTW32:
+ case Match_InvalidZPR32SXTW32:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].s, (uxtw|sxtw) #2'");
+ case Match_InvalidZPR32UXTW64:
+ case Match_InvalidZPR32SXTW64:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].s, (uxtw|sxtw) #3'");
+ case Match_InvalidZPR64UXTW8:
+ case Match_InvalidZPR64SXTW8:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].d, (uxtw|sxtw)'");
+ case Match_InvalidZPR64UXTW16:
+ case Match_InvalidZPR64SXTW16:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].d, (lsl|uxtw|sxtw) #1'");
+ case Match_InvalidZPR64UXTW32:
+ case Match_InvalidZPR64SXTW32:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].d, (lsl|uxtw|sxtw) #2'");
+ case Match_InvalidZPR64UXTW64:
+ case Match_InvalidZPR64SXTW64:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].d, (lsl|uxtw|sxtw) #3'");
+ case Match_InvalidZPR32LSL8:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].s'");
+ case Match_InvalidZPR32LSL16:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].s, lsl #1'");
+ case Match_InvalidZPR32LSL32:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].s, lsl #2'");
+ case Match_InvalidZPR32LSL64:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].s, lsl #3'");
+ case Match_InvalidZPR64LSL8:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].d'");
+ case Match_InvalidZPR64LSL16:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].d, lsl #1'");
+ case Match_InvalidZPR64LSL32:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].d, lsl #2'");
+ case Match_InvalidZPR64LSL64:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].d, lsl #3'");
+ case Match_InvalidZPR0:
+ return Error(Loc, "expected register without element width sufix");
+ case Match_InvalidZPR8:
+ case Match_InvalidZPR16:
+ case Match_InvalidZPR32:
+ case Match_InvalidZPR64:
+ case Match_InvalidZPR128:
+ return Error(Loc, "invalid element width");
+ case Match_InvalidZPR_3b8:
+ return Error(Loc, "Invalid restricted vector register, expected z0.b..z7.b");
+ case Match_InvalidZPR_3b16:
+ return Error(Loc, "Invalid restricted vector register, expected z0.h..z7.h");
+ case Match_InvalidZPR_3b32:
+ return Error(Loc, "Invalid restricted vector register, expected z0.s..z7.s");
+ case Match_InvalidZPR_4b16:
+ return Error(Loc, "Invalid restricted vector register, expected z0.h..z15.h");
+ case Match_InvalidZPR_4b32:
+ return Error(Loc, "Invalid restricted vector register, expected z0.s..z15.s");
+ case Match_InvalidZPR_4b64:
+ return Error(Loc, "Invalid restricted vector register, expected z0.d..z15.d");
+ case Match_InvalidSVEPattern:
+ return Error(Loc, "invalid predicate pattern");
case Match_InvalidSVEPredicateAnyReg:
case Match_InvalidSVEPredicateBReg:
case Match_InvalidSVEPredicateHReg:
case Match_InvalidSVEPredicateSReg:
case Match_InvalidSVEPredicateDReg:
return Error(Loc, "invalid predicate register.");
+ case Match_InvalidSVEPredicate3bAnyReg:
+ case Match_InvalidSVEPredicate3bBReg:
+ case Match_InvalidSVEPredicate3bHReg:
+ case Match_InvalidSVEPredicate3bSReg:
+ case Match_InvalidSVEPredicate3bDReg:
+ return Error(Loc, "restricted predicate has range [0, 7].");
+ case Match_InvalidSVEExactFPImmOperandHalfOne:
+ return Error(Loc, "Invalid floating point constant, expected 0.5 or 1.0.");
+ case Match_InvalidSVEExactFPImmOperandHalfTwo:
+ return Error(Loc, "Invalid floating point constant, expected 0.5 or 2.0.");
+ case Match_InvalidSVEExactFPImmOperandZeroOne:
+ return Error(Loc, "Invalid floating point constant, expected 0.0 or 1.0.");
default:
llvm_unreachable("unexpected error code!");
}
@@ -3670,7 +4208,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
if (NumOperands == 4 && Tok == "lsl") {
AArch64Operand &Op2 = static_cast<AArch64Operand &>(*Operands[2]);
AArch64Operand &Op3 = static_cast<AArch64Operand &>(*Operands[3]);
- if (Op2.isReg() && Op3.isImm()) {
+ if (Op2.isScalarReg() && Op3.isImm()) {
const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3.getImm());
if (Op3CE) {
uint64_t Op3Val = Op3CE->getValue();
@@ -3702,7 +4240,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
AArch64Operand LSBOp = static_cast<AArch64Operand &>(*Operands[2]);
AArch64Operand WidthOp = static_cast<AArch64Operand &>(*Operands[3]);
- if (Op1.isReg() && LSBOp.isImm() && WidthOp.isImm()) {
+ if (Op1.isScalarReg() && LSBOp.isImm() && WidthOp.isImm()) {
const MCConstantExpr *LSBCE = dyn_cast<MCConstantExpr>(LSBOp.getImm());
const MCConstantExpr *WidthCE = dyn_cast<MCConstantExpr>(WidthOp.getImm());
@@ -3758,7 +4296,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
AArch64Operand &Op3 = static_cast<AArch64Operand &>(*Operands[3]);
AArch64Operand &Op4 = static_cast<AArch64Operand &>(*Operands[4]);
- if (Op1.isReg() && Op3.isImm() && Op4.isImm()) {
+ if (Op1.isScalarReg() && Op3.isImm() && Op4.isImm()) {
const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3.getImm());
const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4.getImm());
@@ -3822,7 +4360,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
AArch64Operand &Op3 = static_cast<AArch64Operand &>(*Operands[3]);
AArch64Operand &Op4 = static_cast<AArch64Operand &>(*Operands[4]);
- if (Op1.isReg() && Op3.isImm() && Op4.isImm()) {
+ if (Op1.isScalarReg() && Op3.isImm() && Op4.isImm()) {
const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3.getImm());
const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4.getImm());
@@ -3901,7 +4439,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
// The source register can be Wn here, but the matcher expects a
// GPR64. Twiddle it here if necessary.
AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[2]);
- if (Op.isReg()) {
+ if (Op.isScalarReg()) {
unsigned Reg = getXRegFromWReg(Op.getReg());
Operands[2] = AArch64Operand::CreateReg(Reg, RegKind::Scalar,
Op.getStartLoc(), Op.getEndLoc(),
@@ -3911,13 +4449,13 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
// FIXME: Likewise for sxt[bh] with a Xd dst operand
else if (NumOperands == 3 && (Tok == "sxtb" || Tok == "sxth")) {
AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[1]);
- if (Op.isReg() &&
+ if (Op.isScalarReg() &&
AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
Op.getReg())) {
// The source register can be Wn here, but the matcher expects a
// GPR64. Twiddle it here if necessary.
AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[2]);
- if (Op.isReg()) {
+ if (Op.isScalarReg()) {
unsigned Reg = getXRegFromWReg(Op.getReg());
Operands[2] = AArch64Operand::CreateReg(Reg, RegKind::Scalar,
Op.getStartLoc(),
@@ -3928,13 +4466,13 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
// FIXME: Likewise for uxt[bh] with a Xd dst operand
else if (NumOperands == 3 && (Tok == "uxtb" || Tok == "uxth")) {
AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[1]);
- if (Op.isReg() &&
+ if (Op.isScalarReg() &&
AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
Op.getReg())) {
// The source register can be Wn here, but the matcher expects a
// GPR32. Twiddle it here if necessary.
AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[1]);
- if (Op.isReg()) {
+ if (Op.isScalarReg()) {
unsigned Reg = getWRegFromXReg(Op.getReg());
Operands[1] = AArch64Operand::CreateReg(Reg, RegKind::Scalar,
Op.getStartLoc(),
@@ -4001,7 +4539,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return Error(IDLoc, Msg);
}
case Match_MnemonicFail:
- return showMatchError(IDLoc, MatchResult, Operands);
+ return showMatchError(IDLoc, MatchResult, ErrorInfo, Operands);
case Match_InvalidOperand: {
SMLoc ErrorLoc = IDLoc;
@@ -4020,8 +4558,9 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
((AArch64Operand &)*Operands[ErrorInfo]).isTokenSuffix())
MatchResult = Match_InvalidSuffix;
- return showMatchError(ErrorLoc, MatchResult, Operands);
+ return showMatchError(ErrorLoc, MatchResult, ErrorInfo, Operands);
}
+ case Match_InvalidTiedOperand:
case Match_InvalidMemoryIndexed1:
case Match_InvalidMemoryIndexed2:
case Match_InvalidMemoryIndexed4:
@@ -4047,11 +4586,27 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
case Match_InvalidMemoryXExtend32:
case Match_InvalidMemoryXExtend64:
case Match_InvalidMemoryXExtend128:
+ case Match_InvalidMemoryIndexed1SImm4:
+ case Match_InvalidMemoryIndexed2SImm4:
+ case Match_InvalidMemoryIndexed3SImm4:
+ case Match_InvalidMemoryIndexed4SImm4:
+ case Match_InvalidMemoryIndexed1SImm6:
+ case Match_InvalidMemoryIndexed16SImm4:
case Match_InvalidMemoryIndexed4SImm7:
case Match_InvalidMemoryIndexed8SImm7:
case Match_InvalidMemoryIndexed16SImm7:
+ case Match_InvalidMemoryIndexed8UImm5:
+ case Match_InvalidMemoryIndexed4UImm5:
+ case Match_InvalidMemoryIndexed2UImm5:
+ case Match_InvalidMemoryIndexed1UImm6:
+ case Match_InvalidMemoryIndexed2UImm6:
+ case Match_InvalidMemoryIndexed4UImm6:
+ case Match_InvalidMemoryIndexed8UImm6:
+ case Match_InvalidMemoryIndexedSImm6:
+ case Match_InvalidMemoryIndexedSImm5:
+ case Match_InvalidMemoryIndexedSImm8:
case Match_InvalidMemoryIndexedSImm9:
- case Match_InvalidMemoryIndexedSImm10:
+ case Match_InvalidMemoryIndexed8SImm10:
case Match_InvalidImm0_1:
case Match_InvalidImm0_7:
case Match_InvalidImm0_15:
@@ -4064,19 +4619,85 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
case Match_InvalidImm1_16:
case Match_InvalidImm1_32:
case Match_InvalidImm1_64:
- case Match_InvalidIndex1:
- case Match_InvalidIndexB:
- case Match_InvalidIndexH:
- case Match_InvalidIndexS:
- case Match_InvalidIndexD:
+ case Match_InvalidSVEAddSubImm8:
+ case Match_InvalidSVEAddSubImm16:
+ case Match_InvalidSVEAddSubImm32:
+ case Match_InvalidSVEAddSubImm64:
+ case Match_InvalidSVECpyImm8:
+ case Match_InvalidSVECpyImm16:
+ case Match_InvalidSVECpyImm32:
+ case Match_InvalidSVECpyImm64:
+ case Match_InvalidIndexRange1_1:
+ case Match_InvalidIndexRange0_15:
+ case Match_InvalidIndexRange0_7:
+ case Match_InvalidIndexRange0_3:
+ case Match_InvalidIndexRange0_1:
+ case Match_InvalidSVEIndexRange0_63:
+ case Match_InvalidSVEIndexRange0_31:
+ case Match_InvalidSVEIndexRange0_15:
+ case Match_InvalidSVEIndexRange0_7:
+ case Match_InvalidSVEIndexRange0_3:
case Match_InvalidLabel:
case Match_InvalidComplexRotationEven:
case Match_InvalidComplexRotationOdd:
+ case Match_InvalidGPR64shifted8:
+ case Match_InvalidGPR64shifted16:
+ case Match_InvalidGPR64shifted32:
+ case Match_InvalidGPR64shifted64:
+ case Match_InvalidGPR64NoXZRshifted8:
+ case Match_InvalidGPR64NoXZRshifted16:
+ case Match_InvalidGPR64NoXZRshifted32:
+ case Match_InvalidGPR64NoXZRshifted64:
+ case Match_InvalidZPR32UXTW8:
+ case Match_InvalidZPR32UXTW16:
+ case Match_InvalidZPR32UXTW32:
+ case Match_InvalidZPR32UXTW64:
+ case Match_InvalidZPR32SXTW8:
+ case Match_InvalidZPR32SXTW16:
+ case Match_InvalidZPR32SXTW32:
+ case Match_InvalidZPR32SXTW64:
+ case Match_InvalidZPR64UXTW8:
+ case Match_InvalidZPR64SXTW8:
+ case Match_InvalidZPR64UXTW16:
+ case Match_InvalidZPR64SXTW16:
+ case Match_InvalidZPR64UXTW32:
+ case Match_InvalidZPR64SXTW32:
+ case Match_InvalidZPR64UXTW64:
+ case Match_InvalidZPR64SXTW64:
+ case Match_InvalidZPR32LSL8:
+ case Match_InvalidZPR32LSL16:
+ case Match_InvalidZPR32LSL32:
+ case Match_InvalidZPR32LSL64:
+ case Match_InvalidZPR64LSL8:
+ case Match_InvalidZPR64LSL16:
+ case Match_InvalidZPR64LSL32:
+ case Match_InvalidZPR64LSL64:
+ case Match_InvalidZPR0:
+ case Match_InvalidZPR8:
+ case Match_InvalidZPR16:
+ case Match_InvalidZPR32:
+ case Match_InvalidZPR64:
+ case Match_InvalidZPR128:
+ case Match_InvalidZPR_3b8:
+ case Match_InvalidZPR_3b16:
+ case Match_InvalidZPR_3b32:
+ case Match_InvalidZPR_4b16:
+ case Match_InvalidZPR_4b32:
+ case Match_InvalidZPR_4b64:
case Match_InvalidSVEPredicateAnyReg:
+ case Match_InvalidSVEPattern:
case Match_InvalidSVEPredicateBReg:
case Match_InvalidSVEPredicateHReg:
case Match_InvalidSVEPredicateSReg:
case Match_InvalidSVEPredicateDReg:
+ case Match_InvalidSVEPredicate3bAnyReg:
+ case Match_InvalidSVEPredicate3bBReg:
+ case Match_InvalidSVEPredicate3bHReg:
+ case Match_InvalidSVEPredicate3bSReg:
+ case Match_InvalidSVEPredicate3bDReg:
+ case Match_InvalidSVEExactFPImmOperandHalfOne:
+ case Match_InvalidSVEExactFPImmOperandHalfTwo:
+ case Match_InvalidSVEExactFPImmOperandZeroOne:
case Match_MSR:
case Match_MRS: {
if (ErrorInfo >= Operands.size())
@@ -4086,7 +4707,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
SMLoc ErrorLoc = ((AArch64Operand &)*Operands[ErrorInfo]).getStartLoc();
if (ErrorLoc == SMLoc())
ErrorLoc = IDLoc;
- return showMatchError(ErrorLoc, MatchResult, Operands);
+ return showMatchError(ErrorLoc, MatchResult, ErrorInfo, Operands);
}
}
@@ -4106,12 +4727,6 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) {
parseDirectiveArch(Loc);
else if (IDVal == ".cpu")
parseDirectiveCPU(Loc);
- else if (IDVal == ".hword")
- parseDirectiveWord(2, Loc);
- else if (IDVal == ".word")
- parseDirectiveWord(4, Loc);
- else if (IDVal == ".xword")
- parseDirectiveWord(8, Loc);
else if (IDVal == ".tlsdesccall")
parseDirectiveTLSDescCall(Loc);
else if (IDVal == ".ltorg" || IDVal == ".pool")
@@ -4134,7 +4749,11 @@ static const struct {
const char *Name;
const FeatureBitset Features;
} ExtensionMap[] = {
- { "crc", {AArch64::FeatureCRC} },
+ { "crc", {AArch64::FeatureCRC} },
+ { "sm4", {AArch64::FeatureSM4} },
+ { "sha3", {AArch64::FeatureSHA3} },
+ { "sha2", {AArch64::FeatureSHA2} },
+ { "aes", {AArch64::FeatureAES} },
{ "crypto", {AArch64::FeatureCrypto} },
{ "fp", {AArch64::FeatureFPARMv8} },
{ "simd", {AArch64::FeatureNEON} },
@@ -4148,6 +4767,54 @@ static const struct {
{ "profile", {} },
};
+static void ExpandCryptoAEK(AArch64::ArchKind ArchKind,
+ SmallVector<StringRef, 4> &RequestedExtensions) {
+ const bool NoCrypto =
+ (std::find(RequestedExtensions.begin(), RequestedExtensions.end(),
+ "nocrypto") != std::end(RequestedExtensions));
+ const bool Crypto =
+ (std::find(RequestedExtensions.begin(), RequestedExtensions.end(),
+ "crypto") != std::end(RequestedExtensions));
+
+ if (!NoCrypto && Crypto) {
+ switch (ArchKind) {
+ default:
+ // Map 'generic' (and others) to sha2 and aes, because
+ // that was the traditional meaning of crypto.
+ case AArch64::ArchKind::ARMV8_1A:
+ case AArch64::ArchKind::ARMV8_2A:
+ case AArch64::ArchKind::ARMV8_3A:
+ RequestedExtensions.push_back("sha2");
+ RequestedExtensions.push_back("aes");
+ break;
+ case AArch64::ArchKind::ARMV8_4A:
+ RequestedExtensions.push_back("sm4");
+ RequestedExtensions.push_back("sha3");
+ RequestedExtensions.push_back("sha2");
+ RequestedExtensions.push_back("aes");
+ break;
+ }
+ } else if (NoCrypto) {
+ switch (ArchKind) {
+ default:
+ // 'crypto' traditionally mapped to sha2 and aes on 'generic' (and others),
+ // so those are the features to disable here.
+ case AArch64::ArchKind::ARMV8_1A:
+ case AArch64::ArchKind::ARMV8_2A:
+ case AArch64::ArchKind::ARMV8_3A:
+ RequestedExtensions.push_back("nosha2");
+ RequestedExtensions.push_back("noaes");
+ break;
+ case AArch64::ArchKind::ARMV8_4A:
+ RequestedExtensions.push_back("nosm4");
+ RequestedExtensions.push_back("nosha3");
+ RequestedExtensions.push_back("nosha2");
+ RequestedExtensions.push_back("noaes");
+ break;
+ }
+ }
+}
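ExpandCryptoAEK above rewrites a requested '+crypto'/'+nocrypto' extension into the concrete features for the target architecture. A minimal sketch of the expansion, not part of the patch; the expandCrypto() helper and the two-value ArchKind enum are simplifications of the patch's ExpandCryptoAEK and AArch64::ArchKind:

#include <string>
#include <vector>

enum class ArchKind { ARMV8_2A, ARMV8_4A };

// Expand a requested "crypto" extension into concrete features.
static void expandCrypto(ArchKind AK, std::vector<std::string> &Ext) {
  if (AK == ArchKind::ARMV8_4A)
    Ext.insert(Ext.end(), {"sm4", "sha3", "sha2", "aes"});
  else
    Ext.insert(Ext.end(), {"sha2", "aes"}); // traditional meaning of crypto
}

int main() {
  std::vector<std::string> Ext; // as if parsed from ".arch armv8.4-a+crypto"
  expandCrypto(ArchKind::ARMV8_4A, Ext);
  return Ext.size() == 4 ? 0 : 1;
}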
+
/// parseDirectiveArch
/// ::= .arch token
bool AArch64AsmParser::parseDirectiveArch(SMLoc L) {
@@ -4178,6 +4845,8 @@ bool AArch64AsmParser::parseDirectiveArch(SMLoc L) {
if (!ExtensionString.empty())
ExtensionString.split(RequestedExtensions, '+');
+ ExpandCryptoAEK(ID, RequestedExtensions);
+
FeatureBitset Features = STI.getFeatureBits();
for (auto Name : RequestedExtensions) {
bool EnableFeature = true;
@@ -4237,6 +4906,8 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
STI.setDefaultFeatures(CPU, "");
CurLoc = incrementLoc(CurLoc, CPU.size());
+ ExpandCryptoAEK(llvm::AArch64::getCPUArchKind(CPU), RequestedExtensions);
+
FeatureBitset Features = STI.getFeatureBits();
for (auto Name : RequestedExtensions) {
// Advance source location past '+'.
@@ -4276,22 +4947,6 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
return false;
}
-/// parseDirectiveWord
-/// ::= .word [ expression (, expression)* ]
-bool AArch64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
- auto parseOp = [&]() -> bool {
- const MCExpr *Value;
- if (getParser().parseExpression(Value))
- return true;
- getParser().getStreamer().EmitValue(Value, Size, L);
- return false;
- };
-
- if (parseMany(parseOp))
- return true;
- return false;
-}
-
/// parseDirectiveInst
/// ::= .inst opcode [, ...]
bool AArch64AsmParser::parseDirectiveInst(SMLoc Loc) {
@@ -4402,46 +5057,50 @@ bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
MCAsmParser &Parser = getParser();
Parser.Lex(); // Eat the '.req' token.
SMLoc SRegLoc = getLoc();
- int RegNum = tryParseRegister();
RegKind RegisterKind = RegKind::Scalar;
+ unsigned RegNum;
+ OperandMatchResultTy ParseRes = tryParseScalarRegister(RegNum);
- if (RegNum == -1) {
+ if (ParseRes != MatchOperand_Success) {
StringRef Kind;
RegisterKind = RegKind::NeonVector;
- RegNum = tryMatchVectorRegister(Kind, false);
- if (!Kind.empty())
+ ParseRes = tryParseVectorRegister(RegNum, Kind, RegKind::NeonVector);
+
+ if (ParseRes == MatchOperand_ParseFail)
+ return true;
+
+ if (ParseRes == MatchOperand_Success && !Kind.empty())
return Error(SRegLoc, "vector register without type specifier expected");
}
- if (RegNum == -1) {
+ if (ParseRes != MatchOperand_Success) {
StringRef Kind;
RegisterKind = RegKind::SVEDataVector;
- OperandMatchResultTy Res =
- tryParseSVERegister(RegNum, Kind, RegKind::SVEDataVector);
+ ParseRes =
+ tryParseVectorRegister(RegNum, Kind, RegKind::SVEDataVector);
- if (Res == MatchOperand_ParseFail)
+ if (ParseRes == MatchOperand_ParseFail)
return true;
- if (Res == MatchOperand_Success && !Kind.empty())
+ if (ParseRes == MatchOperand_Success && !Kind.empty())
return Error(SRegLoc,
"sve vector register without type specifier expected");
}
- if (RegNum == -1) {
+ if (ParseRes != MatchOperand_Success) {
StringRef Kind;
RegisterKind = RegKind::SVEPredicateVector;
- OperandMatchResultTy Res =
- tryParseSVERegister(RegNum, Kind, RegKind::SVEPredicateVector);
+ ParseRes = tryParseVectorRegister(RegNum, Kind, RegKind::SVEPredicateVector);
- if (Res == MatchOperand_ParseFail)
+ if (ParseRes == MatchOperand_ParseFail)
return true;
- if (Res == MatchOperand_Success && !Kind.empty())
+ if (ParseRes == MatchOperand_Success && !Kind.empty())
return Error(SRegLoc,
"sve predicate register without type specifier expected");
}
- if (RegNum == -1)
+ if (ParseRes != MatchOperand_Success)
return Error(SRegLoc, "register name or alias expected");
// Shouldn't be anything else.
@@ -4503,7 +5162,7 @@ AArch64AsmParser::classifySymbolRef(const MCExpr *Expr,
BE->getOpcode() != MCBinaryExpr::Sub)
return false;
- // See if the addend is is a constant, otherwise there's more going
+ // See if the addend is a constant, otherwise there's more going
// on here than we can deal with.
auto AddendExpr = dyn_cast<MCConstantExpr>(BE->getRHS());
if (!AddendExpr)
@@ -4604,10 +5263,11 @@ AArch64AsmParser::tryParseGPRSeqPair(OperandVector &Operands) {
return MatchOperand_ParseFail;
}
- int FirstReg = tryParseRegister();
- if (FirstReg == -1) {
+ unsigned FirstReg;
+ OperandMatchResultTy Res = tryParseScalarRegister(FirstReg);
+ if (Res != MatchOperand_Success)
return MatchOperand_ParseFail;
- }
+
const MCRegisterClass &WRegClass =
AArch64MCRegisterClasses[AArch64::GPR32RegClassID];
const MCRegisterClass &XRegClass =
@@ -4630,19 +5290,18 @@ AArch64AsmParser::tryParseGPRSeqPair(OperandVector &Operands) {
return MatchOperand_ParseFail;
}
- SMLoc M = getLoc();
if (getParser().getTok().isNot(AsmToken::Comma)) {
- Error(M, "expected comma");
+ Error(getLoc(), "expected comma");
return MatchOperand_ParseFail;
}
// Eat the comma
getParser().Lex();
SMLoc E = getLoc();
- int SecondReg = tryParseRegister();
- if (SecondReg ==-1) {
+ unsigned SecondReg;
+ Res = tryParseScalarRegister(SecondReg);
+ if (Res != MatchOperand_Success)
return MatchOperand_ParseFail;
- }
if (RI->getEncodingValue(SecondReg) != FirstEncoding + 1 ||
(isXReg && !XRegClass.contains(SecondReg)) ||
@@ -4667,16 +5326,16 @@ AArch64AsmParser::tryParseGPRSeqPair(OperandVector &Operands) {
return MatchOperand_Success;
}
-template <bool ParseSuffix>
+template <bool ParseShiftExtend, bool ParseSuffix>
OperandMatchResultTy
AArch64AsmParser::tryParseSVEDataVector(OperandVector &Operands) {
const SMLoc S = getLoc();
// Check for a SVE vector register specifier first.
- int RegNum = -1;
+ unsigned RegNum;
StringRef Kind;
OperandMatchResultTy Res =
- tryParseSVERegister(RegNum, Kind, RegKind::SVEDataVector);
+ tryParseVectorRegister(RegNum, Kind, RegKind::SVEDataVector);
if (Res != MatchOperand_Success)
return Res;
@@ -4684,20 +5343,81 @@ AArch64AsmParser::tryParseSVEDataVector(OperandVector &Operands) {
if (ParseSuffix && Kind.empty())
return MatchOperand_NoMatch;
- unsigned ElementWidth = StringSwitch<unsigned>(Kind.lower())
- .Case("", -1)
- .Case(".b", 8)
- .Case(".h", 16)
- .Case(".s", 32)
- .Case(".d", 64)
- .Case(".q", 128)
- .Default(0);
- if (!ElementWidth)
+ const auto &KindRes = parseVectorKind(Kind, RegKind::SVEDataVector);
+ if (!KindRes)
+ return MatchOperand_NoMatch;
+
+ unsigned ElementWidth = KindRes->second;
+
+ // No shift/extend is the default.
+ if (!ParseShiftExtend || getParser().getTok().isNot(AsmToken::Comma)) {
+ Operands.push_back(AArch64Operand::CreateVectorReg(
+ RegNum, RegKind::SVEDataVector, ElementWidth, S, S, getContext()));
+
+ OperandMatchResultTy Res = tryParseVectorIndex(Operands);
+ if (Res == MatchOperand_ParseFail)
+ return MatchOperand_ParseFail;
+ return MatchOperand_Success;
+ }
+
+ // Eat the comma
+ getParser().Lex();
+
+ // Match the shift
+ SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> ExtOpnd;
+ Res = tryParseOptionalShiftExtend(ExtOpnd);
+ if (Res != MatchOperand_Success)
+ return Res;
+
+ auto Ext = static_cast<AArch64Operand *>(ExtOpnd.back().get());
+ Operands.push_back(AArch64Operand::CreateVectorReg(
+ RegNum, RegKind::SVEDataVector, ElementWidth, S, Ext->getEndLoc(),
+ getContext(), Ext->getShiftExtendType(), Ext->getShiftExtendAmount(),
+ Ext->hasShiftExtendAmount()));
+
+ return MatchOperand_Success;
+}
+
+OperandMatchResultTy
+AArch64AsmParser::tryParseSVEPattern(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+
+ SMLoc SS = getLoc();
+ const AsmToken &TokE = Parser.getTok();
+ bool IsHash = TokE.is(AsmToken::Hash);
+
+ if (!IsHash && TokE.isNot(AsmToken::Identifier))
return MatchOperand_NoMatch;
+ int64_t Pattern;
+ if (IsHash) {
+ Parser.Lex(); // Eat hash
+
+ // Parse the immediate operand.
+ const MCExpr *ImmVal;
+ SS = getLoc();
+ if (Parser.parseExpression(ImmVal))
+ return MatchOperand_ParseFail;
+
+ auto *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+ if (!MCE)
+ return MatchOperand_ParseFail;
+
+ Pattern = MCE->getValue();
+ } else {
+ // Parse the pattern
+ auto Pat = AArch64SVEPredPattern::lookupSVEPREDPATByName(TokE.getString());
+ if (!Pat)
+ return MatchOperand_NoMatch;
+
+ Parser.Lex();
+ Pattern = Pat->Encoding;
+ assert(Pattern >= 0 && Pattern < 32);
+ }
+
Operands.push_back(
- AArch64Operand::CreateReg(RegNum, RegKind::SVEDataVector, ElementWidth,
- S, S, getContext()));
+ AArch64Operand::CreateImm(MCConstantExpr::create(Pattern, getContext()),
+ SS, getLoc(), getContext()));
return MatchOperand_Success;
}
diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt
index 3d4b9dcf7e8d..d9a00512f71d 100644
--- a/lib/Target/AArch64/CMakeLists.txt
+++ b/lib/Target/AArch64/CMakeLists.txt
@@ -1,20 +1,20 @@
set(LLVM_TARGET_DEFINITIONS AArch64.td)
-tablegen(LLVM AArch64GenRegisterInfo.inc -gen-register-info)
-tablegen(LLVM AArch64GenInstrInfo.inc -gen-instr-info)
-tablegen(LLVM AArch64GenMCCodeEmitter.inc -gen-emitter)
-tablegen(LLVM AArch64GenMCPseudoLowering.inc -gen-pseudo-lowering)
+tablegen(LLVM AArch64GenAsmMatcher.inc -gen-asm-matcher)
tablegen(LLVM AArch64GenAsmWriter.inc -gen-asm-writer)
tablegen(LLVM AArch64GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1)
-tablegen(LLVM AArch64GenAsmMatcher.inc -gen-asm-matcher)
+tablegen(LLVM AArch64GenCallingConv.inc -gen-callingconv)
tablegen(LLVM AArch64GenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler)
tablegen(LLVM AArch64GenFastISel.inc -gen-fast-isel)
-tablegen(LLVM AArch64GenCallingConv.inc -gen-callingconv)
+tablegen(LLVM AArch64GenGlobalISel.inc -gen-global-isel)
+tablegen(LLVM AArch64GenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM AArch64GenMCCodeEmitter.inc -gen-emitter)
+tablegen(LLVM AArch64GenMCPseudoLowering.inc -gen-pseudo-lowering)
+tablegen(LLVM AArch64GenRegisterBank.inc -gen-register-bank)
+tablegen(LLVM AArch64GenRegisterInfo.inc -gen-register-info)
tablegen(LLVM AArch64GenSubtargetInfo.inc -gen-subtarget)
-tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler)
tablegen(LLVM AArch64GenSystemOperands.inc -gen-searchable-tables)
-tablegen(LLVM AArch64GenRegisterBank.inc -gen-register-bank)
-tablegen(LLVM AArch64GenGlobalISel.inc -gen-global-isel)
add_public_tablegen_target(AArch64CommonTableGen)
@@ -59,9 +59,9 @@ add_llvm_target(AArch64CodeGen
intrinsics_gen
)
-add_subdirectory(TargetInfo)
add_subdirectory(AsmParser)
add_subdirectory(Disassembler)
add_subdirectory(InstPrinter)
add_subdirectory(MCTargetDesc)
+add_subdirectory(TargetInfo)
add_subdirectory(Utils)
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index ae278caeda69..cef0ff346448 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -55,6 +55,9 @@ static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeFPR8RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeGPR64commonRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
@@ -87,10 +90,28 @@ static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo,
const void *Decoder);
static DecodeStatus DecodeZPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const void *Decode);
+ const void *Decoder);
+static DecodeStatus DecodeZPR_4bRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeZPR_3bRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeZPR2RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeZPR3RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeZPR4RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
static DecodeStatus DecodePPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const void *Decode);
+ const void *Decoder);
+static DecodeStatus DecodePPR_3bRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
static DecodeStatus DecodeFixedPointScaleImm32(MCInst &Inst, unsigned Imm,
uint64_t Address,
@@ -185,9 +206,18 @@ static DecodeStatus DecodeXSeqPairsClassRegisterClass(MCInst &Inst,
unsigned RegNo,
uint64_t Addr,
const void *Decoder);
+static DecodeStatus DecodeSVELogicalImmInstruction(llvm::MCInst &Inst,
+ uint32_t insn,
+ uint64_t Address,
+ const void *Decoder);
template<int Bits>
static DecodeStatus DecodeSImm(llvm::MCInst &Inst, uint64_t Imm,
uint64_t Address, const void *Decoder);
+template <int ElementWidth>
+static DecodeStatus DecodeImm8OptLsl(MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeSVEIncDecImm(MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder);
static bool Check(DecodeStatus &Out, DecodeStatus In) {
switch (In) {
@@ -386,6 +416,17 @@ static const unsigned GPR64DecoderTable[] = {
AArch64::LR, AArch64::XZR
};
+static DecodeStatus DecodeGPR64commonRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr,
+ const void *Decoder) {
+ if (RegNo > 30)
+ return Fail;
+
+ unsigned Register = GPR64DecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Register));
+ return Success;
+}
+
static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Addr,
const void *Decoder) {
@@ -464,6 +505,91 @@ static DecodeStatus DecodeZPRRegisterClass(MCInst &Inst, unsigned RegNo,
return Success;
}
+static DecodeStatus DecodeZPR_4bRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo > 15)
+ return Fail;
+ return DecodeZPRRegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static DecodeStatus DecodeZPR_3bRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo > 7)
+ return Fail;
+ return DecodeZPRRegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static const unsigned ZZDecoderTable[] = {
+ AArch64::Z0_Z1, AArch64::Z1_Z2, AArch64::Z2_Z3, AArch64::Z3_Z4,
+ AArch64::Z4_Z5, AArch64::Z5_Z6, AArch64::Z6_Z7, AArch64::Z7_Z8,
+ AArch64::Z8_Z9, AArch64::Z9_Z10, AArch64::Z10_Z11, AArch64::Z11_Z12,
+ AArch64::Z12_Z13, AArch64::Z13_Z14, AArch64::Z14_Z15, AArch64::Z15_Z16,
+ AArch64::Z16_Z17, AArch64::Z17_Z18, AArch64::Z18_Z19, AArch64::Z19_Z20,
+ AArch64::Z20_Z21, AArch64::Z21_Z22, AArch64::Z22_Z23, AArch64::Z23_Z24,
+ AArch64::Z24_Z25, AArch64::Z25_Z26, AArch64::Z26_Z27, AArch64::Z27_Z28,
+ AArch64::Z28_Z29, AArch64::Z29_Z30, AArch64::Z30_Z31, AArch64::Z31_Z0
+};
+
+static DecodeStatus DecodeZPR2RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void* Decoder) {
+ if (RegNo > 31)
+ return Fail;
+ unsigned Register = ZZDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Register));
+ return Success;
+}
+
+static const unsigned ZZZDecoderTable[] = {
+ AArch64::Z0_Z1_Z2, AArch64::Z1_Z2_Z3, AArch64::Z2_Z3_Z4,
+ AArch64::Z3_Z4_Z5, AArch64::Z4_Z5_Z6, AArch64::Z5_Z6_Z7,
+ AArch64::Z6_Z7_Z8, AArch64::Z7_Z8_Z9, AArch64::Z8_Z9_Z10,
+ AArch64::Z9_Z10_Z11, AArch64::Z10_Z11_Z12, AArch64::Z11_Z12_Z13,
+ AArch64::Z12_Z13_Z14, AArch64::Z13_Z14_Z15, AArch64::Z14_Z15_Z16,
+ AArch64::Z15_Z16_Z17, AArch64::Z16_Z17_Z18, AArch64::Z17_Z18_Z19,
+ AArch64::Z18_Z19_Z20, AArch64::Z19_Z20_Z21, AArch64::Z20_Z21_Z22,
+ AArch64::Z21_Z22_Z23, AArch64::Z22_Z23_Z24, AArch64::Z23_Z24_Z25,
+ AArch64::Z24_Z25_Z26, AArch64::Z25_Z26_Z27, AArch64::Z26_Z27_Z28,
+ AArch64::Z27_Z28_Z29, AArch64::Z28_Z29_Z30, AArch64::Z29_Z30_Z31,
+ AArch64::Z30_Z31_Z0, AArch64::Z31_Z0_Z1
+};
+
+static DecodeStatus DecodeZPR3RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void* Decoder) {
+ if (RegNo > 31)
+ return Fail;
+ unsigned Register = ZZZDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Register));
+ return Success;
+}
+
+static const unsigned ZZZZDecoderTable[] = {
+ AArch64::Z0_Z1_Z2_Z3, AArch64::Z1_Z2_Z3_Z4, AArch64::Z2_Z3_Z4_Z5,
+ AArch64::Z3_Z4_Z5_Z6, AArch64::Z4_Z5_Z6_Z7, AArch64::Z5_Z6_Z7_Z8,
+ AArch64::Z6_Z7_Z8_Z9, AArch64::Z7_Z8_Z9_Z10, AArch64::Z8_Z9_Z10_Z11,
+ AArch64::Z9_Z10_Z11_Z12, AArch64::Z10_Z11_Z12_Z13, AArch64::Z11_Z12_Z13_Z14,
+ AArch64::Z12_Z13_Z14_Z15, AArch64::Z13_Z14_Z15_Z16, AArch64::Z14_Z15_Z16_Z17,
+ AArch64::Z15_Z16_Z17_Z18, AArch64::Z16_Z17_Z18_Z19, AArch64::Z17_Z18_Z19_Z20,
+ AArch64::Z18_Z19_Z20_Z21, AArch64::Z19_Z20_Z21_Z22, AArch64::Z20_Z21_Z22_Z23,
+ AArch64::Z21_Z22_Z23_Z24, AArch64::Z22_Z23_Z24_Z25, AArch64::Z23_Z24_Z25_Z26,
+ AArch64::Z24_Z25_Z26_Z27, AArch64::Z25_Z26_Z27_Z28, AArch64::Z26_Z27_Z28_Z29,
+ AArch64::Z27_Z28_Z29_Z30, AArch64::Z28_Z29_Z30_Z31, AArch64::Z29_Z30_Z31_Z0,
+ AArch64::Z30_Z31_Z0_Z1, AArch64::Z31_Z0_Z1_Z2
+};
+
+static DecodeStatus DecodeZPR4RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void* Decoder) {
+ if (RegNo > 31)
+ return Fail;
+ unsigned Register = ZZZZDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Register));
+ return Success;
+}
+
static const unsigned PPRDecoderTable[] = {
AArch64::P0, AArch64::P1, AArch64::P2, AArch64::P3,
AArch64::P4, AArch64::P5, AArch64::P6, AArch64::P7,
@@ -481,6 +607,16 @@ static DecodeStatus DecodePPRRegisterClass(MCInst &Inst, unsigned RegNo,
return Success;
}
+static DecodeStatus DecodePPR_3bRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr,
+ const void* Decoder) {
+ if (RegNo > 7)
+ return Fail;
+
+ // Just reuse the PPR decode table
+ return DecodePPRRegisterClass(Inst, RegNo, Addr, Decoder);
+}
+
static const unsigned VectorDecoderTable[] = {
AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4,
AArch64::Q5, AArch64::Q6, AArch64::Q7, AArch64::Q8, AArch64::Q9,
@@ -1047,6 +1183,14 @@ static DecodeStatus DecodeSignedLdStInstruction(MCInst &Inst, uint32_t insn,
case AArch64::LDRHHpost:
case AArch64::STRWpost:
case AArch64::LDRWpost:
+ case AArch64::STLURBi:
+ case AArch64::STLURHi:
+ case AArch64::STLURWi:
+ case AArch64::LDAPURBi:
+ case AArch64::LDAPURSBWi:
+ case AArch64::LDAPURHi:
+ case AArch64::LDAPURSHWi:
+ case AArch64::LDAPURi:
DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
break;
case AArch64::LDURSBXi:
@@ -1069,6 +1213,11 @@ static DecodeStatus DecodeSignedLdStInstruction(MCInst &Inst, uint32_t insn,
case AArch64::STRXpost:
case AArch64::LDRSWpost:
case AArch64::LDRXpost:
+ case AArch64::LDAPURSWi:
+ case AArch64::LDAPURSHXi:
+ case AArch64::LDAPURSBXi:
+ case AArch64::STLURXi:
+ case AArch64::LDAPURXi:
DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
break;
case AArch64::LDURQi:
@@ -1636,6 +1785,23 @@ static DecodeStatus DecodeXSeqPairsClassRegisterClass(MCInst &Inst,
RegNo, Addr, Decoder);
}
+static DecodeStatus DecodeSVELogicalImmInstruction(llvm::MCInst &Inst,
+ uint32_t insn,
+ uint64_t Addr,
+ const void *Decoder) {
+ unsigned Zdn = fieldFromInstruction(insn, 0, 5);
+ unsigned imm = fieldFromInstruction(insn, 5, 13);
+ if (!AArch64_AM::isValidDecodeLogicalImmediate(imm, 64))
+ return Fail;
+
+ // The same (tied) operand is added twice to the instruction.
+ DecodeZPRRegisterClass(Inst, Zdn, Addr, Decoder);
+ if (Inst.getOpcode() != AArch64::DUPM_ZI)
+ DecodeZPRRegisterClass(Inst, Zdn, Addr, Decoder);
+ Inst.addOperand(MCOperand::createImm(imm));
+ return Success;
+}
+
template<int Bits>
static DecodeStatus DecodeSImm(llvm::MCInst &Inst, uint64_t Imm,
uint64_t Address, const void *Decoder) {
@@ -1650,3 +1816,22 @@ static DecodeStatus DecodeSImm(llvm::MCInst &Inst, uint64_t Imm,
return Success;
}
+// Decode 8-bit signed/unsigned immediate for a given element width.
+template <int ElementWidth>
+static DecodeStatus DecodeImm8OptLsl(MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder) {
+ unsigned Val = (uint8_t)Imm;
+ unsigned Shift = (Imm & 0x100) ? 8 : 0;
+ if (ElementWidth == 8 && Shift)
+ return Fail;
+ Inst.addOperand(MCOperand::createImm(Val));
+ Inst.addOperand(MCOperand::createImm(Shift));
+ return Success;
+}
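DecodeImm8OptLsl above splits a 9-bit encoded field into an 8-bit value plus an optional 'lsl #8'. A minimal standalone sketch of that split, not part of the patch:

#include <cassert>
#include <cstdint>

// Split the 9-bit encoded field: low byte is the value, bit 8 selects lsl #8.
static void decodeImm8OptLsl(unsigned Imm, unsigned &Val, unsigned &Shift) {
  Val = static_cast<uint8_t>(Imm);
  Shift = (Imm & 0x100) ? 8 : 0;
}

int main() {
  unsigned Val, Shift;
  decodeImm8OptLsl(0x101, Val, Shift);
  assert(Val == 1 && Shift == 8); // prints as "#1, lsl #8", i.e. 256 once scaled
  return 0;
}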
+
+// Decode a uimm4 immediate representing values in the range [1, 16].
+static DecodeStatus DecodeSVEIncDecImm(MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder) {
+ Inst.addOperand(MCOperand::createImm(Imm + 1));
+ return Success;
+}
diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
index 19d0ba2e1c41..6e64fc9347b9 100644
--- a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
@@ -99,8 +99,8 @@ bool AArch64ExternalSymbolizer::tryAddingSymbolicOperand(
EncodedInst |= MCRI.getEncodingValue(MI.getOperand(0).getReg()); // reg
SymbolLookUp(DisInfo, EncodedInst, &ReferenceType, Address,
&ReferenceName);
- CommentStream << format("0x%llx",
- 0xfffffffffffff000LL & (Address + Value));
+ CommentStream << format("0x%llx", (0xfffffffffffff000LL & Address) +
+ Value * 0x1000);
} else if (MI.getOpcode() == AArch64::ADDXri ||
MI.getOpcode() == AArch64::LDRXui ||
MI.getOpcode() == AArch64::LDRXl ||
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
index bdf71b095fda..26e41215afc6 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -282,6 +282,13 @@ void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
return;
}
+ // TSB is specified as a one-operand instruction, but 'csync' is not
+ // encoded, so printing treats it as a special case here:
+ if (Opcode == AArch64::TSB) {
+ O << "\ttsb\tcsync";
+ return;
+ }
+
if (!printAliasInstr(MI, STI, O))
printInstruction(MI, STI, O);
@@ -907,20 +914,13 @@ void AArch64InstPrinter::printAddSubImm(const MCInst *MI, unsigned OpNum,
}
}
-void AArch64InstPrinter::printLogicalImm32(const MCInst *MI, unsigned OpNum,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- uint64_t Val = MI->getOperand(OpNum).getImm();
- O << "#0x";
- O.write_hex(AArch64_AM::decodeLogicalImmediate(Val, 32));
-}
-
-void AArch64InstPrinter::printLogicalImm64(const MCInst *MI, unsigned OpNum,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
+template <typename T>
+void AArch64InstPrinter::printLogicalImm(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
uint64_t Val = MI->getOperand(OpNum).getImm();
O << "#0x";
- O.write_hex(AArch64_AM::decodeLogicalImmediate(Val, 64));
+ O.write_hex(AArch64_AM::decodeLogicalImmediate(Val, 8 * sizeof(T)));
}
void AArch64InstPrinter::printShifter(const MCInst *MI, unsigned OpNum,
@@ -976,12 +976,9 @@ void AArch64InstPrinter::printArithExtend(const MCInst *MI, unsigned OpNum,
O << " #" << ShiftVal;
}
-void AArch64InstPrinter::printMemExtend(const MCInst *MI, unsigned OpNum,
- raw_ostream &O, char SrcRegKind,
- unsigned Width) {
- unsigned SignExtend = MI->getOperand(OpNum).getImm();
- unsigned DoShift = MI->getOperand(OpNum + 1).getImm();
-
+static void printMemExtendImpl(bool SignExtend, bool DoShift,
+ unsigned Width, char SrcRegKind,
+ raw_ostream &O) {
// sxtw, sxtx, uxtw or lsl (== uxtx)
bool IsLSL = !SignExtend && SrcRegKind == 'x';
if (IsLSL)
@@ -993,6 +990,32 @@ void AArch64InstPrinter::printMemExtend(const MCInst *MI, unsigned OpNum,
O << " #" << Log2_32(Width / 8);
}
+void AArch64InstPrinter::printMemExtend(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O, char SrcRegKind,
+ unsigned Width) {
+ bool SignExtend = MI->getOperand(OpNum).getImm();
+ bool DoShift = MI->getOperand(OpNum + 1).getImm();
+ printMemExtendImpl(SignExtend, DoShift, Width, SrcRegKind, O);
+}
+
+template <bool SignExtend, int ExtWidth, char SrcRegKind, char Suffix>
+void AArch64InstPrinter::printRegWithShiftExtend(const MCInst *MI,
+ unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printOperand(MI, OpNum, STI, O);
+ if (Suffix == 's' || Suffix == 'd')
+ O << '.' << Suffix;
+ else
+ assert(Suffix == 0 && "Unsupported suffix size");
+
+ bool DoShift = ExtWidth != 8;
+ if (SignExtend || DoShift || SrcRegKind == 'w') {
+ O << ", ";
+ printMemExtendImpl(SignExtend, DoShift, ExtWidth, SrcRegKind, O);
+ }
+}
+
void AArch64InstPrinter::printCondCode(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -1045,15 +1068,22 @@ void AArch64InstPrinter::printAMIndexedWB(const MCInst *MI, unsigned OpNum,
O << ']';
}
+template <bool IsSVEPrefetch>
void AArch64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
unsigned prfop = MI->getOperand(OpNum).getImm();
- auto PRFM = AArch64PRFM::lookupPRFMByEncoding(prfop);
- if (PRFM)
+ if (IsSVEPrefetch) {
+ if (auto PRFM = AArch64SVEPRFM::lookupSVEPRFMByEncoding(prfop)) {
+ O << PRFM->Name;
+ return;
+ }
+ } else if (auto PRFM = AArch64PRFM::lookupPRFMByEncoding(prfop)) {
O << PRFM->Name;
- else
- O << '#' << formatImm(prfop);
+ return;
+ }
+
+ O << '#' << formatImm(prfop);
}
void AArch64InstPrinter::printPSBHintOp(const MCInst *MI, unsigned OpNum,
@@ -1118,6 +1148,41 @@ static unsigned getNextVectorRegister(unsigned Reg, unsigned Stride = 1) {
case AArch64::Q31:
Reg = AArch64::Q0;
break;
+ case AArch64::Z0: Reg = AArch64::Z1; break;
+ case AArch64::Z1: Reg = AArch64::Z2; break;
+ case AArch64::Z2: Reg = AArch64::Z3; break;
+ case AArch64::Z3: Reg = AArch64::Z4; break;
+ case AArch64::Z4: Reg = AArch64::Z5; break;
+ case AArch64::Z5: Reg = AArch64::Z6; break;
+ case AArch64::Z6: Reg = AArch64::Z7; break;
+ case AArch64::Z7: Reg = AArch64::Z8; break;
+ case AArch64::Z8: Reg = AArch64::Z9; break;
+ case AArch64::Z9: Reg = AArch64::Z10; break;
+ case AArch64::Z10: Reg = AArch64::Z11; break;
+ case AArch64::Z11: Reg = AArch64::Z12; break;
+ case AArch64::Z12: Reg = AArch64::Z13; break;
+ case AArch64::Z13: Reg = AArch64::Z14; break;
+ case AArch64::Z14: Reg = AArch64::Z15; break;
+ case AArch64::Z15: Reg = AArch64::Z16; break;
+ case AArch64::Z16: Reg = AArch64::Z17; break;
+ case AArch64::Z17: Reg = AArch64::Z18; break;
+ case AArch64::Z18: Reg = AArch64::Z19; break;
+ case AArch64::Z19: Reg = AArch64::Z20; break;
+ case AArch64::Z20: Reg = AArch64::Z21; break;
+ case AArch64::Z21: Reg = AArch64::Z22; break;
+ case AArch64::Z22: Reg = AArch64::Z23; break;
+ case AArch64::Z23: Reg = AArch64::Z24; break;
+ case AArch64::Z24: Reg = AArch64::Z25; break;
+ case AArch64::Z25: Reg = AArch64::Z26; break;
+ case AArch64::Z26: Reg = AArch64::Z27; break;
+ case AArch64::Z27: Reg = AArch64::Z28; break;
+ case AArch64::Z28: Reg = AArch64::Z29; break;
+ case AArch64::Z29: Reg = AArch64::Z30; break;
+ case AArch64::Z30: Reg = AArch64::Z31; break;
+ // Vector lists can wrap around.
+ case AArch64::Z31:
+ Reg = AArch64::Z0;
+ break;
}
}
return Reg;
@@ -1152,12 +1217,15 @@ void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum,
// list).
unsigned NumRegs = 1;
if (MRI.getRegClass(AArch64::DDRegClassID).contains(Reg) ||
+ MRI.getRegClass(AArch64::ZPR2RegClassID).contains(Reg) ||
MRI.getRegClass(AArch64::QQRegClassID).contains(Reg))
NumRegs = 2;
else if (MRI.getRegClass(AArch64::DDDRegClassID).contains(Reg) ||
+ MRI.getRegClass(AArch64::ZPR3RegClassID).contains(Reg) ||
MRI.getRegClass(AArch64::QQQRegClassID).contains(Reg))
NumRegs = 3;
else if (MRI.getRegClass(AArch64::DDDDRegClassID).contains(Reg) ||
+ MRI.getRegClass(AArch64::ZPR4RegClassID).contains(Reg) ||
MRI.getRegClass(AArch64::QQQQRegClassID).contains(Reg))
NumRegs = 4;
@@ -1166,6 +1234,8 @@ void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum,
Reg = FirstReg;
else if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::qsub0))
Reg = FirstReg;
+ else if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::zsub0))
+ Reg = FirstReg;
// If it's a D-reg, we need to promote it to the equivalent Q-reg before
// printing (otherwise getRegisterName fails).
@@ -1176,7 +1246,11 @@ void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum,
}
for (unsigned i = 0; i < NumRegs; ++i, Reg = getNextVectorRegister(Reg)) {
- O << getRegisterName(Reg, AArch64::vreg) << LayoutSuffix;
+ if (MRI.getRegClass(AArch64::ZPRRegClassID).contains(Reg))
+ O << getRegisterName(Reg) << LayoutSuffix;
+ else
+ O << getRegisterName(Reg, AArch64::vreg) << LayoutSuffix;
+
if (i + 1 != NumRegs)
O << ", ";
}
@@ -1262,6 +1336,9 @@ void AArch64InstPrinter::printBarrierOption(const MCInst *MI, unsigned OpNo,
if (Opcode == AArch64::ISB) {
auto ISB = AArch64ISB::lookupISBByEncoding(Val);
Name = ISB ? ISB->Name : "";
+ } else if (Opcode == AArch64::TSB) {
+ auto TSB = AArch64TSB::lookupTSBByEncoding(Val);
+ Name = TSB ? TSB->Name : "";
} else {
auto DB = AArch64DB::lookupDBByEncoding(Val);
Name = DB ? DB->Name : "";
@@ -1340,6 +1417,16 @@ void AArch64InstPrinter::printComplexRotationOp(const MCInst *MI, unsigned OpNo,
O << "#" << (Val * Angle) + Remainder;
}
+void AArch64InstPrinter::printSVEPattern(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ unsigned Val = MI->getOperand(OpNum).getImm();
+ if (auto Pat = AArch64SVEPredPattern::lookupSVEPREDPATByEncoding(Val))
+ O << Pat->Name;
+ else
+ O << '#' << formatImm(Val);
+}
+
template <char suffix>
void AArch64InstPrinter::printSVERegOp(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
@@ -1359,4 +1446,101 @@ void AArch64InstPrinter::printSVERegOp(const MCInst *MI, unsigned OpNum,
O << getRegisterName(Reg);
if (suffix != 0)
O << '.' << suffix;
-}
\ No newline at end of file
+}
+
+template <typename T>
+void AArch64InstPrinter::printImmSVE(T Value, raw_ostream &O) {
+ typename std::make_unsigned<T>::type HexValue = Value;
+
+ if (getPrintImmHex())
+ O << '#' << formatHex((uint64_t)HexValue);
+ else
+ O << '#' << formatDec(Value);
+
+ if (CommentStream) {
+ // Do the opposite to that used for instruction operands.
+ if (getPrintImmHex())
+ *CommentStream << '=' << formatDec(HexValue) << '\n';
+ else
+ *CommentStream << '=' << formatHex((uint64_t)Value) << '\n';
+ }
+}
+
+template <typename T>
+void AArch64InstPrinter::printImm8OptLsl(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ unsigned UnscaledVal = MI->getOperand(OpNum).getImm();
+ unsigned Shift = MI->getOperand(OpNum + 1).getImm();
+ assert(AArch64_AM::getShiftType(Shift) == AArch64_AM::LSL &&
+ "Unexepected shift type!");
+
+ // #0 lsl #8 is never pretty printed
+ if ((UnscaledVal == 0) && (AArch64_AM::getShiftValue(Shift) != 0)) {
+ O << '#' << formatImm(UnscaledVal);
+ printShifter(MI, OpNum + 1, STI, O);
+ return;
+ }
+
+ T Val;
+ if (std::is_signed<T>())
+ Val = (int8_t)UnscaledVal * (1 << AArch64_AM::getShiftValue(Shift));
+ else
+ Val = (uint8_t)UnscaledVal * (1 << AArch64_AM::getShiftValue(Shift));
+
+ printImmSVE(Val, O);
+}
+
+template <typename T>
+void AArch64InstPrinter::printSVELogicalImm(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ typedef typename std::make_signed<T>::type SignedT;
+ typedef typename std::make_unsigned<T>::type UnsignedT;
+
+ uint64_t Val = MI->getOperand(OpNum).getImm();
+ UnsignedT PrintVal = AArch64_AM::decodeLogicalImmediate(Val, 64);
+
+ // Prefer the default format for 16-bit values, hex otherwise.
+ if ((int16_t)PrintVal == (SignedT)PrintVal)
+ printImmSVE((T)PrintVal, O);
+ else if ((uint16_t)PrintVal == PrintVal)
+ printImmSVE(PrintVal, O);
+ else
+ O << '#' << formatHex((uint64_t)PrintVal);
+}
+
+template <int Width>
+void AArch64InstPrinter::printZPRasFPR(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ unsigned Base;
+ switch (Width) {
+ case 8: Base = AArch64::B0; break;
+ case 16: Base = AArch64::H0; break;
+ case 32: Base = AArch64::S0; break;
+ case 64: Base = AArch64::D0; break;
+ case 128: Base = AArch64::Q0; break;
+ default:
+ llvm_unreachable("Unsupported width");
+ }
+ unsigned Reg = MI->getOperand(OpNum).getReg();
+ O << getRegisterName(Reg - AArch64::Z0 + Base);
+}
+
+template <unsigned ImmIs0, unsigned ImmIs1>
+void AArch64InstPrinter::printExactFPImm(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ auto *Imm0Desc = AArch64ExactFPImm::lookupExactFPImmByEnum(ImmIs0);
+ auto *Imm1Desc = AArch64ExactFPImm::lookupExactFPImmByEnum(ImmIs1);
+ unsigned Val = MI->getOperand(OpNum).getImm();
+ O << "#" << (Val ? Imm1Desc->Repr : Imm0Desc->Repr);
+}
+
+void AArch64InstPrinter::printGPR64as32(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ unsigned Reg = MI->getOperand(OpNum).getReg();
+ O << getRegisterName(getWRegFromXReg(Reg));
+}
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
index 76f20f042cef..8dc9264f94a1 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
@@ -17,6 +17,7 @@
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCInstPrinter.h"
+#include "../Utils/AArch64BaseInfo.h"
namespace llvm {
@@ -56,6 +57,7 @@ protected:
raw_ostream &O);
void printImmHex(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ template <typename T> void printImmSVE(T Value, raw_ostream &O);
void printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm,
raw_ostream &O);
template <int Amount>
@@ -70,10 +72,9 @@ protected:
const MCSubtargetInfo &STI, raw_ostream &O);
void printAddSubImm(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
- void printLogicalImm32(const MCInst *MI, unsigned OpNum,
- const MCSubtargetInfo &STI, raw_ostream &O);
- void printLogicalImm64(const MCInst *MI, unsigned OpNum,
- const MCSubtargetInfo &STI, raw_ostream &O);
+ template <typename T>
+ void printLogicalImm(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printShifter(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
void printShiftedRegister(const MCInst *MI, unsigned OpNum,
@@ -90,7 +91,9 @@ protected:
const MCSubtargetInfo &STI, raw_ostream &O) {
printMemExtend(MI, OpNum, O, SrcRegKind, Width);
}
-
+ template <bool SignedExtend, int ExtWidth, char SrcRegKind, char Suffix>
+ void printRegWithShiftExtend(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printCondCode(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
void printInverseCondCode(const MCInst *MI, unsigned OpNum,
@@ -121,6 +124,7 @@ protected:
void printImmScale(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
+ template <bool IsSVEPrefetch = false>
void printPrefetchOp(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
@@ -165,9 +169,25 @@ protected:
void printGPRSeqPairsClassOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O);
+ template <typename T>
+ void printImm8OptLsl(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ template <typename T>
+ void printSVELogicalImm(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printSVEPattern(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
template <char = 0>
void printSVERegOp(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printGPR64as32(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ template <int Width>
+ void printZPRasFPR(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ template <unsigned ImmIs0, unsigned ImmIs1>
+ void printExactFPImm(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
};
class AArch64AppleInstPrinter : public AArch64InstPrinter {
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
index 3e5ef4df4706..62644ab2f457 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -213,7 +213,8 @@ static inline uint64_t ror(uint64_t elt, unsigned size) {
static inline bool processLogicalImmediate(uint64_t Imm, unsigned RegSize,
uint64_t &Encoding) {
if (Imm == 0ULL || Imm == ~0ULL ||
- (RegSize != 64 && (Imm >> RegSize != 0 || Imm == ~0U)))
+ (RegSize != 64 &&
+ (Imm >> RegSize != 0 || Imm == (~0ULL >> (64 - RegSize)))))
return false;
// First, determine the element size.
@@ -753,6 +754,67 @@ static inline uint64_t decodeAdvSIMDModImmType12(uint8_t Imm) {
return (EncVal << 32) | EncVal;
}
+/// Returns true if Imm is the concatenation of a repeating pattern of type T.
+template <typename T>
+static inline bool isSVEMaskOfIdenticalElements(int64_t Imm) {
+ union {
+ int64_t Whole;
+ T Parts[sizeof(int64_t)/sizeof(T)];
+ } Vec { Imm };
+
+ return all_of(Vec.Parts, [Vec](T Elem) { return Elem == Vec.Parts[0]; });
+}
+
+/// Returns true if Imm is valid for CPY/DUP.
+template <typename T>
+static inline bool isSVECpyImm(int64_t Imm) {
+ bool IsImm8 = int8_t(Imm) == Imm;
+ bool IsImm16 = int16_t(Imm & ~0xff) == Imm;
+
+ if (std::is_same<int8_t, typename std::make_signed<T>::type>::value)
+ return IsImm8 || uint8_t(Imm) == Imm;
+
+ if (std::is_same<int16_t, typename std::make_signed<T>::type>::value)
+ return IsImm8 || IsImm16 || uint16_t(Imm & ~0xff) == Imm;
+
+ return IsImm8 || IsImm16;
+}
+
+/// Returns true if Imm is valid for ADD/SUB.
+template <typename T>
+static inline bool isSVEAddSubImm(int64_t Imm) {
+ bool IsInt8t =
+ std::is_same<int8_t, typename std::make_signed<T>::type>::value;
+ return uint8_t(Imm) == Imm || (!IsInt8t && uint16_t(Imm & ~0xff) == Imm);
+}
+
+/// Return true if Imm is valid for DUPM and has no single CPY/DUP equivalent.
+static inline bool isSVEMoveMaskPreferredLogicalImmediate(int64_t Imm) {
+ union {
+ int64_t D;
+ int32_t S[2];
+ int16_t H[4];
+ int8_t B[8];
+ } Vec = { Imm };
+
+ if (isSVECpyImm<int64_t>(Vec.D))
+ return false;
+
+ if (isSVEMaskOfIdenticalElements<int32_t>(Imm) &&
+ isSVECpyImm<int32_t>(Vec.S[0]))
+ return false;
+
+ if (isSVEMaskOfIdenticalElements<int16_t>(Imm) &&
+ isSVECpyImm<int16_t>(Vec.H[0]))
+ return false;
+
+ if (isSVEMaskOfIdenticalElements<int8_t>(Imm) &&
+ isSVECpyImm<int8_t>(Vec.B[0]))
+ return false;
+
+ return isLogicalImmediate(Vec.D, 64);
+}
+
inline static bool isAnyMOVZMovAlias(uint64_t Value, int RegWidth) {
for (int Shift = 0; Shift <= RegWidth - 16; Shift += 16)
if ((Value & ~(0xffffULL << Shift)) == 0)
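To make the new SVE immediate helpers in this file easier to follow, here is a minimal standalone sketch (illustration only, not the LLVM implementation) of the "mask of identical elements" test that the sve_logical_imm operands and isSVEMoveMaskPreferredLogicalImmediate build on: a 64-bit immediate qualifies for element type T when it is that element repeated across the whole value.

// Standalone sketch: same idea as isSVEMaskOfIdenticalElements<T> above,
// using memcpy instead of a union to split the 64-bit immediate into
// elements of type T and checking that all elements are equal.
#include <cstdint>
#include <cstring>
#include <iostream>

template <typename T>
static bool isMaskOfIdenticalElements(int64_t Imm) {
  T Parts[sizeof(int64_t) / sizeof(T)];
  std::memcpy(Parts, &Imm, sizeof(Imm));
  for (T Elem : Parts)
    if (Elem != Parts[0])
      return false;
  return true;
}

int main() {
  // 0x00ff00ff00ff00ff repeats the 16-bit element 0x00ff...
  std::cout << isMaskOfIdenticalElements<int16_t>(0x00ff00ff00ff00ffLL) << '\n'; // 1
  // ...but 0x0000ffff0000ffff only repeats at 32-bit granularity.
  std::cout << isMaskOfIdenticalElements<int8_t>(0x0000ffff0000ffffLL) << '\n';  // 0
  return 0;
}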
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index 7b33b4b5b542..856946555198 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -33,11 +33,9 @@ class AArch64AsmBackend : public MCAsmBackend {
Triple TheTriple;
public:
- bool IsLittleEndian;
-
-public:
AArch64AsmBackend(const Target &T, const Triple &TT, bool IsLittleEndian)
- : MCAsmBackend(), TheTriple(TT), IsLittleEndian(IsLittleEndian) {}
+ : MCAsmBackend(IsLittleEndian ? support::little : support::big),
+ TheTriple(TT) {}
unsigned getNumFixupKinds() const override {
return AArch64::NumTargetFixupKinds;
@@ -75,15 +73,17 @@ public:
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsResolved) const override;
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const override;
- bool mayNeedRelaxation(const MCInst &Inst) const override;
+ bool mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const override;
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const override;
void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
MCInst &Res) const override;
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
void HandleAssemblerFlag(MCAssemblerFlag Flag) {}
@@ -97,7 +97,7 @@ public:
} // end anonymous namespace
-/// \brief The number of bytes the fixup may change.
+/// The number of bytes the fixup may change.
static unsigned getFixupKindNumBytes(unsigned Kind) {
switch (Kind) {
default:
@@ -248,7 +248,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
/// getFixupKindContainereSizeInBytes - The number of bytes of the
/// container involved in big endian, or 0 if the item is little endian.
unsigned AArch64AsmBackend::getFixupKindContainereSizeInBytes(unsigned Kind) const {
- if (IsLittleEndian)
+ if (Endian == support::little)
return 0;
switch (Kind) {
@@ -287,7 +287,8 @@ unsigned AArch64AsmBackend::getFixupKindContainereSizeInBytes(unsigned Kind) con
void AArch64AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target,
MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) const {
+ bool IsResolved,
+ const MCSubtargetInfo *STI) const {
unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
if (!Value)
return; // Doesn't change encoding.
@@ -323,7 +324,8 @@ void AArch64AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
}
}
-bool AArch64AsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
+bool AArch64AsmBackend::mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const {
return false;
}
@@ -344,16 +346,16 @@ void AArch64AsmBackend::relaxInstruction(const MCInst &Inst,
llvm_unreachable("AArch64AsmBackend::relaxInstruction() unimplemented");
}
-bool AArch64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+bool AArch64AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
// If the count is not 4-byte aligned, we must be writing data into the text
// section (otherwise we have unaligned instructions, and thus have far
// bigger problems), so just write zeros instead.
- OW->WriteZeros(Count % 4);
+ OS.write_zeros(Count % 4);
// We are properly aligned, so write NOPs as requested.
Count /= 4;
for (uint64_t i = 0; i != Count; ++i)
- OW->write32(0xd503201f);
+ support::endian::write<uint32_t>(OS, 0xd503201f, Endian);
return true;
}
@@ -381,20 +383,20 @@ namespace {
namespace CU {
-/// \brief Compact unwind encoding values.
+/// Compact unwind encoding values.
enum CompactUnwindEncodings {
- /// \brief A "frameless" leaf function, where no non-volatile registers are
+ /// A "frameless" leaf function, where no non-volatile registers are
/// saved. The return remains in LR throughout the function.
UNWIND_ARM64_MODE_FRAMELESS = 0x02000000,
- /// \brief No compact unwind encoding available. Instead the low 23-bits of
+ /// No compact unwind encoding available. Instead the low 23-bits of
/// the compact unwind encoding is the offset of the DWARF FDE in the
/// __eh_frame section. This mode is never used in object files. It is only
/// generated by the linker in final linked images, which have only DWARF info
/// for a function.
UNWIND_ARM64_MODE_DWARF = 0x03000000,
- /// \brief This is a standard arm64 prologue where FP/LR are immediately
+ /// This is a standard arm64 prologue where FP/LR are immediately
/// pushed on the stack, then SP is copied to FP. If there are any
/// non-volatile registers saved, they are copied into the stack frame in pairs
/// in a contiguous range right below the saved FP/LR pair. Any subset of the
@@ -402,7 +404,7 @@ enum CompactUnwindEncodings {
/// in register number order.
UNWIND_ARM64_MODE_FRAME = 0x04000000,
- /// \brief Frame register pair encodings.
+ /// Frame register pair encodings.
UNWIND_ARM64_FRAME_X19_X20_PAIR = 0x00000001,
UNWIND_ARM64_FRAME_X21_X22_PAIR = 0x00000002,
UNWIND_ARM64_FRAME_X23_X24_PAIR = 0x00000004,
@@ -420,7 +422,7 @@ enum CompactUnwindEncodings {
class DarwinAArch64AsmBackend : public AArch64AsmBackend {
const MCRegisterInfo &MRI;
- /// \brief Encode compact unwind stack adjustment for frameless functions.
+ /// Encode compact unwind stack adjustment for frameless functions.
/// See UNWIND_ARM64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h.
/// The stack size always needs to be 16 byte aligned.
uint32_t encodeStackAdjustment(uint32_t StackSize) const {
@@ -432,13 +434,13 @@ public:
const MCRegisterInfo &MRI)
: AArch64AsmBackend(T, TT, /*IsLittleEndian*/ true), MRI(MRI) {}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createAArch64MachObjectWriter(OS, MachO::CPU_TYPE_ARM64,
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createAArch64MachObjectWriter(MachO::CPU_TYPE_ARM64,
MachO::CPU_SUBTYPE_ARM64_ALL);
}
- /// \brief Generate the compact unwind encoding from the CFI directives.
+ /// Generate the compact unwind encoding from the CFI directives.
uint32_t generateCompactUnwindEncoding(
ArrayRef<MCCFIInstruction> Instrs) const override {
if (Instrs.empty())
@@ -457,9 +459,17 @@ public:
return CU::UNWIND_ARM64_MODE_DWARF;
case MCCFIInstruction::OpDefCfa: {
// Defines a frame pointer.
- assert(getXRegFromWReg(MRI.getLLVMRegNum(Inst.getRegister(), true)) ==
- AArch64::FP &&
- "Invalid frame pointer!");
+ unsigned XReg =
+ getXRegFromWReg(MRI.getLLVMRegNum(Inst.getRegister(), true));
+
+      // CFA registers other than FP are not supported by compact unwind.
+      // Fall back on DWARF.
+ // FIXME: When opt-remarks are supported in MC, add a remark to notify
+ // the user.
+ if (XReg != AArch64::FP)
+ return CU::UNWIND_ARM64_MODE_DWARF;
+
+ assert(XReg == AArch64::FP && "Invalid frame pointer!");
assert(i + 2 < e && "Insufficient CFI instructions to define a frame!");
const MCCFIInstruction &LRPush = Instrs[++i];
@@ -583,9 +593,9 @@ public:
: AArch64AsmBackend(T, TT, IsLittleEndian), OSABI(OSABI),
IsILP32(IsILP32) {}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createAArch64ELFObjectWriter(OS, OSABI, IsLittleEndian, IsILP32);
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createAArch64ELFObjectWriter(OSABI, IsILP32);
}
};
@@ -597,18 +607,18 @@ public:
COFFAArch64AsmBackend(const Target &T, const Triple &TheTriple)
: AArch64AsmBackend(T, TheTriple, /*IsLittleEndian*/ true) {}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createAArch64WinCOFFObjectWriter(OS);
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createAArch64WinCOFFObjectWriter();
}
};
}
MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
- const Triple &TheTriple,
- StringRef CPU,
const MCTargetOptions &Options) {
+ const Triple &TheTriple = STI.getTargetTriple();
if (TheTriple.isOSBinFormatMachO())
return new DarwinAArch64AsmBackend(T, TheTriple, MRI);
@@ -624,10 +634,10 @@ MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T,
}
MCAsmBackend *llvm::createAArch64beAsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
- const Triple &TheTriple,
- StringRef CPU,
const MCTargetOptions &Options) {
+ const Triple &TheTriple = STI.getTargetTriple();
assert(TheTriple.isOSBinFormatELF() &&
"Big endian is only supported for ELF targets!");
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
index 2d90e67960f8..a11e396217af 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -31,7 +31,7 @@ namespace {
class AArch64ELFObjectWriter : public MCELFObjectTargetWriter {
public:
- AArch64ELFObjectWriter(uint8_t OSABI, bool IsLittleEndian, bool IsILP32);
+ AArch64ELFObjectWriter(uint8_t OSABI, bool IsILP32);
~AArch64ELFObjectWriter() override = default;
@@ -43,9 +43,7 @@ protected:
} // end anonymous namespace
-AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI,
- bool IsLittleEndian,
- bool IsILP32)
+AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, bool IsILP32)
: MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_AARCH64,
/*HasRelocationAddend*/ true),
IsILP32(IsILP32) {}
@@ -429,10 +427,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
llvm_unreachable("Unimplemented fixup -> relocation");
}
-std::unique_ptr<MCObjectWriter>
-llvm::createAArch64ELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI,
- bool IsLittleEndian, bool IsILP32) {
- auto MOTW =
- llvm::make_unique<AArch64ELFObjectWriter>(OSABI, IsLittleEndian, IsILP32);
- return createELFObjectWriter(std::move(MOTW), OS, IsLittleEndian);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createAArch64ELFObjectWriter(uint8_t OSABI, bool IsILP32) {
+ return llvm::make_unique<AArch64ELFObjectWriter>(OSABI, IsILP32);
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index 8ee627d50df2..c0ef8b670286 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -27,6 +27,7 @@
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -87,9 +88,10 @@ public:
friend class AArch64TargetELFStreamer;
AArch64ELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter)
- : MCELFStreamer(Context, std::move(TAB), OS, std::move(Emitter)),
+ : MCELFStreamer(Context, std::move(TAB), std::move(OW),
+ std::move(Emitter)),
MappingSymbolCounter(0), LastEMS(EMS_None) {}
void ChangeSection(MCSection *Section, const MCExpr *Subsection) override {
@@ -209,11 +211,11 @@ MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S,
MCELFStreamer *createAArch64ELFStreamer(MCContext &Context,
std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter,
bool RelaxAll) {
- AArch64ELFStreamer *S =
- new AArch64ELFStreamer(Context, std::move(TAB), OS, std::move(Emitter));
+ AArch64ELFStreamer *S = new AArch64ELFStreamer(
+ Context, std::move(TAB), std::move(OW), std::move(Emitter));
if (RelaxAll)
S->getAssembler().setRelaxAll(true);
return S;
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
index 19b188aa1c61..d5b009ec30d1 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
@@ -20,7 +20,7 @@ namespace llvm {
MCELFStreamer *createAArch64ELFStreamer(MCContext &Context,
std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter,
bool RelaxAll);
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 12b5a27b7699..ebb49121c1bf 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -101,7 +101,7 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) {
HasIdentDirective = true;
}
-AArch64MCAsmInfoCOFF::AArch64MCAsmInfoCOFF() {
+AArch64MCAsmInfoMicrosoftCOFF::AArch64MCAsmInfoMicrosoftCOFF() {
PrivateGlobalPrefix = ".L";
PrivateLabelPrefix = ".L";
@@ -112,14 +112,23 @@ AArch64MCAsmInfoCOFF::AArch64MCAsmInfoCOFF() {
AlignmentIsInBytes = false;
SupportsDebugInformation = true;
CodePointerSize = 8;
-}
-AArch64MCAsmInfoMicrosoftCOFF::AArch64MCAsmInfoMicrosoftCOFF() {
CommentString = ";";
ExceptionsType = ExceptionHandling::WinEH;
}
AArch64MCAsmInfoGNUCOFF::AArch64MCAsmInfoGNUCOFF() {
+ PrivateGlobalPrefix = ".L";
+ PrivateLabelPrefix = ".L";
+
+ Data16bitsDirective = "\t.hword\t";
+ Data32bitsDirective = "\t.word\t";
+ Data64bitsDirective = "\t.xword\t";
+
+ AlignmentIsInBytes = false;
+ SupportsDebugInformation = true;
+ CodePointerSize = 8;
+
CommentString = "//";
ExceptionsType = ExceptionHandling::DwarfCFI;
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index afde87b40929..e8570b1c2887 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -34,15 +34,11 @@ struct AArch64MCAsmInfoELF : public MCAsmInfoELF {
explicit AArch64MCAsmInfoELF(const Triple &T);
};
-struct AArch64MCAsmInfoCOFF : public MCAsmInfoCOFF {
- explicit AArch64MCAsmInfoCOFF();
-};
-
-struct AArch64MCAsmInfoMicrosoftCOFF : public AArch64MCAsmInfoCOFF {
+struct AArch64MCAsmInfoMicrosoftCOFF : public MCAsmInfoMicrosoft {
explicit AArch64MCAsmInfoMicrosoftCOFF();
};
-struct AArch64MCAsmInfoGNUCOFF : public AArch64MCAsmInfoCOFF {
+struct AArch64MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF {
explicit AArch64MCAsmInfoGNUCOFF();
};
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
index 33698d2b8c38..41cad48f7aea 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -163,6 +163,13 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ uint32_t getImm8OptLsl(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint32_t getSVEIncDecImm(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue,
const MCSubtargetInfo &STI) const;
@@ -276,7 +283,8 @@ AArch64MCCodeEmitter::getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
if (const AArch64MCExpr *A64E = dyn_cast<AArch64MCExpr>(Expr)) {
AArch64MCExpr::VariantKind RefKind = A64E->getKind();
if (RefKind == AArch64MCExpr::VK_TPREL_HI12 ||
- RefKind == AArch64MCExpr::VK_DTPREL_HI12)
+ RefKind == AArch64MCExpr::VK_DTPREL_HI12 ||
+ RefKind == AArch64MCExpr::VK_SECREL_HI12)
ShiftVal = 12;
}
return ShiftVal == 0 ? 0 : (1 << ShiftVal);
@@ -508,6 +516,34 @@ AArch64MCCodeEmitter::getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx,
return MO.getImm() - 8;
}
+uint32_t
+AArch64MCCodeEmitter::getImm8OptLsl(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // Test shift
+ auto ShiftOpnd = MI.getOperand(OpIdx + 1).getImm();
+ assert(AArch64_AM::getShiftType(ShiftOpnd) == AArch64_AM::LSL &&
+ "Unexpected shift type for imm8_opt_lsl immediate.");
+
+ unsigned ShiftVal = AArch64_AM::getShiftValue(ShiftOpnd);
+ assert((ShiftVal == 0 || ShiftVal == 8) &&
+ "Unexpected shift value for imm8_opt_lsl immediate.");
+
+ // Test immediate
+ auto Immediate = MI.getOperand(OpIdx).getImm();
+ return (Immediate & 0xff) | (ShiftVal == 0 ? 0 : (1 << ShiftVal));
+}
+
+uint32_t
+AArch64MCCodeEmitter::getSVEIncDecImm(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value!");
+ // Normalize 1-16 range to 0-15.
+ return MO.getImm() - 1;
+}
+
/// getMoveVecShifterOpValue - Return the encoded value for the vector move
/// shifter (MSL).
uint32_t AArch64MCCodeEmitter::getMoveVecShifterOpValue(
@@ -571,7 +607,7 @@ void AArch64MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
}
uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI);
- support::endian::Writer<support::little>(OS).write<uint32_t>(Binary);
+ support::endian::write<uint32_t>(OS, Binary, support::little);
++MCNumEmitted; // Keep track of the # of mi's emitted.
}
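For readers new to the SVE encoder hooks added in this file: getImm8OptLsl packs an 8-bit immediate together with an optional "LSL #8" shift. A minimal standalone sketch of that packing (field layout inferred from the code above; hypothetical helper name, not an LLVM API) follows.

// Standalone sketch: mirrors the packing done by getImm8OptLsl above.
// The low 8 bits carry the immediate; bit 8 is set for the "LSL #8" form.
#include <cassert>
#include <cstdint>
#include <iostream>

static uint32_t encodeImm8OptLsl(int64_t Imm, unsigned ShiftVal) {
  assert(ShiftVal == 0 || ShiftVal == 8);
  return static_cast<uint32_t>(Imm & 0xff) |
         (ShiftVal == 0 ? 0u : (1u << ShiftVal));
}

int main() {
  std::cout << std::hex
            << encodeImm8OptLsl(0x2a, 0) << ' '   // 0x2a  (unshifted)
            << encodeImm8OptLsl(0x2a, 8) << '\n'; // 0x12a (LSL #8 form)
  return 0;
}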
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index f606d272bcb0..cd937935ddbf 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -70,6 +70,8 @@ StringRef AArch64MCExpr::getVariantKindName() const {
case VK_GOTTPREL_G0_NC: return ":gottprel_g0_nc:";
case VK_TLSDESC: return "";
case VK_TLSDESC_PAGE: return ":tlsdesc:";
+ case VK_SECREL_LO12: return ":secrel_lo12:";
+ case VK_SECREL_HI12: return ":secrel_hi12:";
default:
llvm_unreachable("Invalid ELF symbol kind");
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
index 3dbf0f84a665..b6bf254d3835 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
@@ -35,6 +35,7 @@ public:
VK_GOTTPREL = 0x005,
VK_TPREL = 0x006,
VK_TLSDESC = 0x007,
+ VK_SECREL = 0x008,
VK_SymLocBits = 0x00f,
// Variants specifying which part of the final address calculation is
@@ -98,6 +99,8 @@ public:
VK_TPREL_LO12_NC = VK_TPREL | VK_PAGEOFF | VK_NC,
VK_TLSDESC_LO12 = VK_TLSDESC | VK_PAGEOFF,
VK_TLSDESC_PAGE = VK_TLSDESC | VK_PAGE,
+ VK_SECREL_LO12 = VK_SECREL | VK_PAGEOFF,
+ VK_SECREL_HI12 = VK_SECREL | VK_HI12,
VK_INVALID = 0xfff
};
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index c3458d625b83..4ceda7e122f4 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -20,6 +20,7 @@
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -103,36 +104,61 @@ static MCInstPrinter *createAArch64MCInstPrinter(const Triple &T,
static MCStreamer *createELFStreamer(const Triple &T, MCContext &Ctx,
std::unique_ptr<MCAsmBackend> &&TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&Emitter,
bool RelaxAll) {
- return createAArch64ELFStreamer(Ctx, std::move(TAB), OS, std::move(Emitter),
- RelaxAll);
+ return createAArch64ELFStreamer(Ctx, std::move(TAB), std::move(OW),
+ std::move(Emitter), RelaxAll);
}
static MCStreamer *createMachOStreamer(MCContext &Ctx,
std::unique_ptr<MCAsmBackend> &&TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&Emitter,
bool RelaxAll,
bool DWARFMustBeAtTheEnd) {
- return createMachOStreamer(Ctx, std::move(TAB), OS, std::move(Emitter),
- RelaxAll, DWARFMustBeAtTheEnd,
+ return createMachOStreamer(Ctx, std::move(TAB), std::move(OW),
+ std::move(Emitter), RelaxAll, DWARFMustBeAtTheEnd,
/*LabelSections*/ true);
}
static MCStreamer *
createWinCOFFStreamer(MCContext &Ctx, std::unique_ptr<MCAsmBackend> &&TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&Emitter, bool RelaxAll,
bool IncrementalLinkerCompatible) {
- return createAArch64WinCOFFStreamer(Ctx, std::move(TAB), OS,
+ return createAArch64WinCOFFStreamer(Ctx, std::move(TAB), std::move(OW),
std::move(Emitter), RelaxAll,
IncrementalLinkerCompatible);
}
+namespace {
+
+class AArch64MCInstrAnalysis : public MCInstrAnalysis {
+public:
+ AArch64MCInstrAnalysis(const MCInstrInfo *Info) : MCInstrAnalysis(Info) {}
+
+ bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size,
+ uint64_t &Target) const override {
+ // Search for a PC-relative argument.
+ // This will handle instructions like bcc (where the first argument is the
+ // condition code) and cbz (where it is a register).
+ const auto &Desc = Info->get(Inst.getOpcode());
+ for (unsigned i = 0, e = Inst.getNumOperands(); i != e; i++) {
+ if (Desc.OpInfo[i].OperandType == MCOI::OPERAND_PCREL) {
+ int64_t Imm = Inst.getOperand(i).getImm() * 4;
+ Target = Addr + Imm;
+ return true;
+ }
+ }
+ return false;
+ }
+};
+
+} // end anonymous namespace
+
static MCInstrAnalysis *createAArch64InstrAnalysis(const MCInstrInfo *Info) {
- return new MCInstrAnalysis(Info);
+ return new AArch64MCInstrAnalysis(Info);
}
// Force static initialization.
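The new AArch64MCInstrAnalysis above resolves a branch target by scanning for a PC-relative operand and scaling it by the 4-byte instruction size. A standalone sketch of just that arithmetic (illustrative only, with a hypothetical helper name):

// Standalone sketch: the target computation used by evaluateBranch above,
// Target = Addr + Imm * 4, where Imm is the signed, word-scaled
// PC-relative operand of the branch.
#include <cstdint>
#include <iostream>

static uint64_t branchTarget(uint64_t Addr, int64_t PCRelImm) {
  return Addr + PCRelImm * 4;
}

int main() {
  // A branch at 0x1000 with an operand of -2 targets 0xff8
  // (two instructions back).
  std::cout << std::hex << branchTarget(0x1000, -2) << '\n'; // ff8
  return 0;
}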
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
index b9e1673b9317..63f50778ccdb 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
@@ -26,7 +26,7 @@ class MCContext;
class MCInstrInfo;
class MCInstPrinter;
class MCRegisterInfo;
-class MCObjectWriter;
+class MCObjectTargetWriter;
class MCStreamer;
class MCSubtargetInfo;
class MCTargetOptions;
@@ -45,24 +45,21 @@ MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
MCContext &Ctx);
MCAsmBackend *createAArch64leAsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
const MCTargetOptions &Options);
MCAsmBackend *createAArch64beAsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
const MCTargetOptions &Options);
-std::unique_ptr<MCObjectWriter>
-createAArch64ELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI,
- bool IsLittleEndian, bool IsILP32);
+std::unique_ptr<MCObjectTargetWriter>
+createAArch64ELFObjectWriter(uint8_t OSABI, bool IsILP32);
-std::unique_ptr<MCObjectWriter>
-createAArch64MachObjectWriter(raw_pwrite_stream &OS, uint32_t CPUType,
- uint32_t CPUSubtype);
+std::unique_ptr<MCObjectTargetWriter>
+createAArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype);
-std::unique_ptr<MCObjectWriter>
-createAArch64WinCOFFObjectWriter(raw_pwrite_stream &OS);
+std::unique_ptr<MCObjectTargetWriter> createAArch64WinCOFFObjectWriter();
MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S,
formatted_raw_ostream &OS,
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
index 55151c2b8d21..1021cdeeb3be 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
@@ -306,39 +306,24 @@ void AArch64MachObjectWriter::recordRelocation(
bool CanUseLocalRelocation =
canUseLocalRelocation(Section, *Symbol, Log2Size);
if (Symbol->isTemporary() && (Value || !CanUseLocalRelocation)) {
+ // Make sure that the symbol is actually in a section here. If it isn't,
+ // emit an error and exit.
+ if (!Symbol->isInSection()) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(),
+ "unsupported relocation of local symbol '" + Symbol->getName() +
+ "'. Must have non-local symbol earlier in section.");
+ return;
+ }
const MCSection &Sec = Symbol->getSection();
if (!Asm.getContext().getAsmInfo()->isSectionAtomizableBySymbols(Sec))
Symbol->setUsedInReloc();
}
const MCSymbol *Base = Asm.getAtom(*Symbol);
-
- // If the symbol is a variable and we weren't able to get a Base for it
- // (i.e., it's not in the symbol table associated with a section) resolve
- // the relocation based its expansion instead.
- if (Symbol->isVariable() && !Base) {
- // If the evaluation is an absolute value, just use that directly
- // to keep things easy.
- int64_t Res;
- if (Symbol->getVariableValue()->evaluateAsAbsolute(
- Res, Layout, Writer->getSectionAddressMap())) {
- FixedValue = Res;
- return;
- }
-
- // FIXME: Will the Target we already have ever have any data in it
- // we need to preserve and merge with the new Target? How about
- // the FixedValue?
- if (!Symbol->getVariableValue()->evaluateAsRelocatable(Target, &Layout,
- &Fixup)) {
- Asm.getContext().reportError(Fixup.getLoc(),
- "unable to resolve variable '" +
- Symbol->getName() + "'");
- return;
- }
- return recordRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
- FixedValue);
- }
+  // If the symbol is a variable, it is either in a section (in which case we
+  // have a base) or absolute (in which case it should have been expanded).
+ assert(!Symbol->isVariable() || Base);
// Relocations inside debug sections always use local relocations when
// possible. This seems to be done because the debugger doesn't fully
@@ -377,19 +362,8 @@ void AArch64MachObjectWriter::recordRelocation(
Value -= Writer->getFragmentAddress(Fragment, Layout) +
Fixup.getOffset() + (1ULL << Log2Size);
} else {
- // Resolve constant variables.
- if (Symbol->isVariable()) {
- int64_t Res;
- if (Symbol->getVariableValue()->evaluateAsAbsolute(
- Res, Layout, Writer->getSectionAddressMap())) {
- FixedValue = Res;
- return;
- }
- }
- Asm.getContext().reportError(Fixup.getLoc(),
- "unsupported relocation of variable '" +
- Symbol->getName() + "'");
- return;
+ llvm_unreachable(
+ "This constant variable should have been expanded during evaluation");
}
}
@@ -430,10 +404,7 @@ void AArch64MachObjectWriter::recordRelocation(
Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
}
-std::unique_ptr<MCObjectWriter>
-llvm::createAArch64MachObjectWriter(raw_pwrite_stream &OS, uint32_t CPUType,
- uint32_t CPUSubtype) {
- return createMachObjectWriter(
- llvm::make_unique<AArch64MachObjectWriter>(CPUType, CPUSubtype), OS,
- /*IsLittleEndian=*/true);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createAArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype) {
+ return llvm::make_unique<AArch64MachObjectWriter>(CPUType, CPUSubtype);
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
index d06c5e8862ae..7ea7d5f2a20e 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
@@ -8,6 +8,7 @@
//===---------------------------------------------------------------------===//
#include "MCTargetDesc/AArch64FixupKinds.h"
+#include "MCTargetDesc/AArch64MCExpr.h"
#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/COFF.h"
#include "llvm/MC/MCAsmBackend.h"
@@ -46,6 +47,7 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType(
bool IsCrossSection, const MCAsmBackend &MAB) const {
auto Modifier = Target.isAbsolute() ? MCSymbolRefExpr::VK_None
: Target.getSymA()->getKind();
+ const MCExpr *Expr = Fixup.getValue();
switch (static_cast<unsigned>(Fixup.getKind())) {
default: {
@@ -73,6 +75,13 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType(
return COFF::IMAGE_REL_ARM64_SECREL;
case AArch64::fixup_aarch64_add_imm12:
+ if (const AArch64MCExpr *A64E = dyn_cast<AArch64MCExpr>(Expr)) {
+ AArch64MCExpr::VariantKind RefKind = A64E->getKind();
+ if (RefKind == AArch64MCExpr::VK_SECREL_LO12)
+ return COFF::IMAGE_REL_ARM64_SECREL_LOW12A;
+ if (RefKind == AArch64MCExpr::VK_SECREL_HI12)
+ return COFF::IMAGE_REL_ARM64_SECREL_HIGH12A;
+ }
return COFF::IMAGE_REL_ARM64_PAGEOFFSET_12A;
case AArch64::fixup_aarch64_ldst_imm12_scale1:
@@ -80,11 +89,25 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType(
case AArch64::fixup_aarch64_ldst_imm12_scale4:
case AArch64::fixup_aarch64_ldst_imm12_scale8:
case AArch64::fixup_aarch64_ldst_imm12_scale16:
+ if (const AArch64MCExpr *A64E = dyn_cast<AArch64MCExpr>(Expr)) {
+ AArch64MCExpr::VariantKind RefKind = A64E->getKind();
+ if (RefKind == AArch64MCExpr::VK_SECREL_LO12)
+ return COFF::IMAGE_REL_ARM64_SECREL_LOW12L;
+ }
return COFF::IMAGE_REL_ARM64_PAGEOFFSET_12L;
+ case AArch64::fixup_aarch64_pcrel_adr_imm21:
+ return COFF::IMAGE_REL_ARM64_REL21;
+
case AArch64::fixup_aarch64_pcrel_adrp_imm21:
return COFF::IMAGE_REL_ARM64_PAGEBASE_REL21;
+ case AArch64::fixup_aarch64_pcrel_branch14:
+ return COFF::IMAGE_REL_ARM64_BRANCH14;
+
+ case AArch64::fixup_aarch64_pcrel_branch19:
+ return COFF::IMAGE_REL_ARM64_BRANCH19;
+
case AArch64::fixup_aarch64_pcrel_branch26:
case AArch64::fixup_aarch64_pcrel_call26:
return COFF::IMAGE_REL_ARM64_BRANCH26;
@@ -97,10 +120,8 @@ bool AArch64WinCOFFObjectWriter::recordRelocation(const MCFixup &Fixup) const {
namespace llvm {
-std::unique_ptr<MCObjectWriter>
-createAArch64WinCOFFObjectWriter(raw_pwrite_stream &OS) {
- auto MOTW = llvm::make_unique<AArch64WinCOFFObjectWriter>();
- return createWinCOFFObjectWriter(std::move(MOTW), OS);
+std::unique_ptr<MCObjectTargetWriter> createAArch64WinCOFFObjectWriter() {
+ return llvm::make_unique<AArch64WinCOFFObjectWriter>();
}
} // end namespace llvm
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
index c88363d2c250..9871dc553bed 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
@@ -10,6 +10,7 @@
#include "AArch64WinCOFFStreamer.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCObjectWriter.h"
using namespace llvm;
@@ -21,8 +22,8 @@ public:
AArch64WinCOFFStreamer(MCContext &C, std::unique_ptr<MCAsmBackend> AB,
std::unique_ptr<MCCodeEmitter> CE,
- raw_pwrite_stream &OS)
- : MCWinCOFFStreamer(C, std::move(AB), std::move(CE), OS) {}
+ std::unique_ptr<MCObjectWriter> OW)
+ : MCWinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW)) {}
void FinishImpl() override;
};
@@ -37,10 +38,10 @@ void AArch64WinCOFFStreamer::FinishImpl() {
namespace llvm {
MCWinCOFFStreamer *createAArch64WinCOFFStreamer(
MCContext &Context, std::unique_ptr<MCAsmBackend> MAB,
- raw_pwrite_stream &OS, std::unique_ptr<MCCodeEmitter> Emitter,
+ std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter,
bool RelaxAll, bool IncrementalLinkerCompatible) {
auto *S = new AArch64WinCOFFStreamer(Context, std::move(MAB),
- std::move(Emitter), OS);
+ std::move(Emitter), std::move(OW));
S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible);
return S;
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h
index b67a19e883e9..c05422163584 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h
@@ -35,7 +35,7 @@ namespace llvm {
MCWinCOFFStreamer *createAArch64WinCOFFStreamer(
MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS, std::unique_ptr<MCCodeEmitter> Emitter,
+ std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter,
bool RelaxAll, bool IncrementalLinkerCompatible);
} // end llvm namespace
diff --git a/lib/Target/AArch64/SVEInstrFormats.td b/lib/Target/AArch64/SVEInstrFormats.td
index 15c1275f259d..17b3f6041279 100644
--- a/lib/Target/AArch64/SVEInstrFormats.td
+++ b/lib/Target/AArch64/SVEInstrFormats.td
@@ -11,6 +11,934 @@
//
//===----------------------------------------------------------------------===//
+def SVEPatternOperand : AsmOperandClass {
+ let Name = "SVEPattern";
+ let ParserMethod = "tryParseSVEPattern";
+ let PredicateMethod = "isSVEPattern";
+ let RenderMethod = "addImmOperands";
+ let DiagnosticType = "InvalidSVEPattern";
+}
+
+def sve_pred_enum : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) < 32);
+ }]> {
+
+ let PrintMethod = "printSVEPattern";
+ let ParserMatchClass = SVEPatternOperand;
+}
+
+def SVEPrefetchOperand : AsmOperandClass {
+ let Name = "SVEPrefetch";
+ let ParserMethod = "tryParsePrefetch<true>";
+ let PredicateMethod = "isPrefetch";
+ let RenderMethod = "addPrefetchOperands";
+}
+
+def sve_prfop : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) <= 15);
+ }]> {
+ let PrintMethod = "printPrefetchOp<true>";
+ let ParserMatchClass = SVEPrefetchOperand;
+}
+
+class SVELogicalImmOperand<int Width> : AsmOperandClass {
+ let Name = "SVELogicalImm" # Width;
+ let DiagnosticType = "LogicalSecondSource";
+ let PredicateMethod = "isLogicalImm<int" # Width # "_t>";
+ let RenderMethod = "addLogicalImmOperands<int" # Width # "_t>";
+}
+
+def sve_logical_imm8 : Operand<i64> {
+ let ParserMatchClass = SVELogicalImmOperand<8>;
+ let PrintMethod = "printLogicalImm<int8_t>";
+
+ let MCOperandPredicate = [{
+ if (!MCOp.isImm())
+ return false;
+ int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
+ return AArch64_AM::isSVEMaskOfIdenticalElements<int8_t>(Val);
+ }];
+}
+
+def sve_logical_imm16 : Operand<i64> {
+ let ParserMatchClass = SVELogicalImmOperand<16>;
+ let PrintMethod = "printLogicalImm<int16_t>";
+
+ let MCOperandPredicate = [{
+ if (!MCOp.isImm())
+ return false;
+ int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
+ return AArch64_AM::isSVEMaskOfIdenticalElements<int16_t>(Val);
+ }];
+}
+
+def sve_logical_imm32 : Operand<i64> {
+ let ParserMatchClass = SVELogicalImmOperand<32>;
+ let PrintMethod = "printLogicalImm<int32_t>";
+
+ let MCOperandPredicate = [{
+ if (!MCOp.isImm())
+ return false;
+ int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
+ return AArch64_AM::isSVEMaskOfIdenticalElements<int32_t>(Val);
+ }];
+}
+
+class SVEPreferredLogicalImmOperand<int Width> : AsmOperandClass {
+ let Name = "SVEPreferredLogicalImm" # Width;
+ let PredicateMethod = "isSVEPreferredLogicalImm<int" # Width # "_t>";
+ let RenderMethod = "addLogicalImmOperands<int" # Width # "_t>";
+}
+
+def sve_preferred_logical_imm16 : Operand<i64> {
+ let ParserMatchClass = SVEPreferredLogicalImmOperand<16>;
+ let PrintMethod = "printSVELogicalImm<int16_t>";
+
+ let MCOperandPredicate = [{
+ if (!MCOp.isImm())
+ return false;
+ int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
+ return AArch64_AM::isSVEMaskOfIdenticalElements<int16_t>(Val) &&
+ AArch64_AM::isSVEMoveMaskPreferredLogicalImmediate(Val);
+ }];
+}
+
+def sve_preferred_logical_imm32 : Operand<i64> {
+ let ParserMatchClass = SVEPreferredLogicalImmOperand<32>;
+ let PrintMethod = "printSVELogicalImm<int32_t>";
+
+ let MCOperandPredicate = [{
+ if (!MCOp.isImm())
+ return false;
+ int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
+ return AArch64_AM::isSVEMaskOfIdenticalElements<int32_t>(Val) &&
+ AArch64_AM::isSVEMoveMaskPreferredLogicalImmediate(Val);
+ }];
+}
+
+def sve_preferred_logical_imm64 : Operand<i64> {
+ let ParserMatchClass = SVEPreferredLogicalImmOperand<64>;
+ let PrintMethod = "printSVELogicalImm<int64_t>";
+
+ let MCOperandPredicate = [{
+ if (!MCOp.isImm())
+ return false;
+ int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
+ return AArch64_AM::isSVEMaskOfIdenticalElements<int64_t>(Val) &&
+ AArch64_AM::isSVEMoveMaskPreferredLogicalImmediate(Val);
+ }];
+}
+
+class SVELogicalImmNotOperand<int Width> : AsmOperandClass {
+ let Name = "SVELogicalImm" # Width # "Not";
+ let DiagnosticType = "LogicalSecondSource";
+ let PredicateMethod = "isLogicalImm<int" # Width # "_t>";
+ let RenderMethod = "addLogicalImmNotOperands<int" # Width # "_t>";
+}
+
+def sve_logical_imm8_not : Operand<i64> {
+ let ParserMatchClass = SVELogicalImmNotOperand<8>;
+}
+
+def sve_logical_imm16_not : Operand<i64> {
+ let ParserMatchClass = SVELogicalImmNotOperand<16>;
+}
+
+def sve_logical_imm32_not : Operand<i64> {
+ let ParserMatchClass = SVELogicalImmNotOperand<32>;
+}
+
+class SVEShiftedImmOperand<int ElementWidth, string Infix, string Predicate>
+ : AsmOperandClass {
+ let Name = "SVE" # Infix # "Imm" # ElementWidth;
+ let DiagnosticType = "Invalid" # Name;
+ let RenderMethod = "addImmWithOptionalShiftOperands<8>";
+ let ParserMethod = "tryParseImmWithOptionalShift";
+ let PredicateMethod = Predicate;
+}
+
+def SVECpyImmOperand8 : SVEShiftedImmOperand<8, "Cpy", "isSVECpyImm<int8_t>">;
+def SVECpyImmOperand16 : SVEShiftedImmOperand<16, "Cpy", "isSVECpyImm<int16_t>">;
+def SVECpyImmOperand32 : SVEShiftedImmOperand<32, "Cpy", "isSVECpyImm<int32_t>">;
+def SVECpyImmOperand64 : SVEShiftedImmOperand<64, "Cpy", "isSVECpyImm<int64_t>">;
+
+def SVEAddSubImmOperand8 : SVEShiftedImmOperand<8, "AddSub", "isSVEAddSubImm<int8_t>">;
+def SVEAddSubImmOperand16 : SVEShiftedImmOperand<16, "AddSub", "isSVEAddSubImm<int16_t>">;
+def SVEAddSubImmOperand32 : SVEShiftedImmOperand<32, "AddSub", "isSVEAddSubImm<int32_t>">;
+def SVEAddSubImmOperand64 : SVEShiftedImmOperand<64, "AddSub", "isSVEAddSubImm<int64_t>">;
+
+class imm8_opt_lsl<int ElementWidth, string printType,
+ AsmOperandClass OpndClass, code Predicate>
+ : Operand<i32>, ImmLeaf<i32, Predicate> {
+ let EncoderMethod = "getImm8OptLsl";
+ let DecoderMethod = "DecodeImm8OptLsl<" # ElementWidth # ">";
+ let PrintMethod = "printImm8OptLsl<" # printType # ">";
+ let ParserMatchClass = OpndClass;
+ let MIOperandInfo = (ops i32imm, i32imm);
+}
+
+def cpy_imm8_opt_lsl_i8 : imm8_opt_lsl<8, "int8_t", SVECpyImmOperand8, [{
+ return AArch64_AM::isSVECpyImm<int8_t>(Imm);
+}]>;
+def cpy_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "int16_t", SVECpyImmOperand16, [{
+ return AArch64_AM::isSVECpyImm<int16_t>(Imm);
+}]>;
+def cpy_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "int32_t", SVECpyImmOperand32, [{
+ return AArch64_AM::isSVECpyImm<int32_t>(Imm);
+}]>;
+def cpy_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "int64_t", SVECpyImmOperand64, [{
+ return AArch64_AM::isSVECpyImm<int64_t>(Imm);
+}]>;
+
+def addsub_imm8_opt_lsl_i8 : imm8_opt_lsl<8, "uint8_t", SVEAddSubImmOperand8, [{
+ return AArch64_AM::isSVEAddSubImm<int8_t>(Imm);
+}]>;
+def addsub_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "uint16_t", SVEAddSubImmOperand16, [{
+ return AArch64_AM::isSVEAddSubImm<int16_t>(Imm);
+}]>;
+def addsub_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "uint32_t", SVEAddSubImmOperand32, [{
+ return AArch64_AM::isSVEAddSubImm<int32_t>(Imm);
+}]>;
+def addsub_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "uint64_t", SVEAddSubImmOperand64, [{
+ return AArch64_AM::isSVEAddSubImm<int64_t>(Imm);
+}]>;
+
+class SVEExactFPImm<string Suffix, string ValA, string ValB> : AsmOperandClass {
+ let Name = "SVEExactFPImmOperand" # Suffix;
+ let DiagnosticType = "Invalid" # Name;
+ let ParserMethod = "tryParseFPImm<false>";
+ let PredicateMethod = "isExactFPImm<" # ValA # ", " # ValB # ">";
+ let RenderMethod = "addExactFPImmOperands<" # ValA # ", " # ValB # ">";
+}
+
+class SVEExactFPImmOperand<string Suffix, string ValA, string ValB> : Operand<i32> {
+ let PrintMethod = "printExactFPImm<" # ValA # ", " # ValB # ">";
+ let ParserMatchClass = SVEExactFPImm<Suffix, ValA, ValB>;
+}
+
+def sve_fpimm_half_one
+ : SVEExactFPImmOperand<"HalfOne", "AArch64ExactFPImm::half",
+ "AArch64ExactFPImm::one">;
+def sve_fpimm_half_two
+ : SVEExactFPImmOperand<"HalfTwo", "AArch64ExactFPImm::half",
+ "AArch64ExactFPImm::two">;
+def sve_fpimm_zero_one
+ : SVEExactFPImmOperand<"ZeroOne", "AArch64ExactFPImm::zero",
+ "AArch64ExactFPImm::one">;
+
+def sve_incdec_imm : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 17);
+}]> {
+ let ParserMatchClass = Imm1_16Operand;
+ let EncoderMethod = "getSVEIncDecImm";
+ let DecoderMethod = "DecodeSVEIncDecImm";
+}
+
+//===----------------------------------------------------------------------===//
+// SVE PTrue - These are used extensively throughout the pattern matching, so
+// it's important we define them first.
+//===----------------------------------------------------------------------===//
+
+class sve_int_ptrue<bits<2> sz8_64, bits<3> opc, string asm, PPRRegOp pprty>
+: I<(outs pprty:$Pd), (ins sve_pred_enum:$pattern),
+ asm, "\t$Pd, $pattern",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pd;
+ bits<5> pattern;
+ let Inst{31-24} = 0b00100101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-19} = 0b011;
+ let Inst{18-17} = opc{2-1};
+ let Inst{16} = opc{0};
+ let Inst{15-10} = 0b111000;
+ let Inst{9-5} = pattern;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = Pd;
+
+ let Defs = !if(!eq (opc{0}, 1), [NZCV], []);
+}
+
+multiclass sve_int_ptrue<bits<3> opc, string asm> {
+ def _B : sve_int_ptrue<0b00, opc, asm, PPR8>;
+ def _H : sve_int_ptrue<0b01, opc, asm, PPR16>;
+ def _S : sve_int_ptrue<0b10, opc, asm, PPR32>;
+ def _D : sve_int_ptrue<0b11, opc, asm, PPR64>;
+
+ def : InstAlias<asm # "\t$Pd",
+ (!cast<Instruction>(NAME # _B) PPR8:$Pd, 0b11111), 1>;
+ def : InstAlias<asm # "\t$Pd",
+ (!cast<Instruction>(NAME # _H) PPR16:$Pd, 0b11111), 1>;
+ def : InstAlias<asm # "\t$Pd",
+ (!cast<Instruction>(NAME # _S) PPR32:$Pd, 0b11111), 1>;
+ def : InstAlias<asm # "\t$Pd",
+ (!cast<Instruction>(NAME # _D) PPR64:$Pd, 0b11111), 1>;
+}
+
+let Predicates = [HasSVE] in {
+ defm PTRUE : sve_int_ptrue<0b000, "ptrue">;
+ defm PTRUES : sve_int_ptrue<0b001, "ptrues">;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SVE Predicate Count Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_count_r<bits<2> sz8_64, bits<5> opc, string asm,
+ RegisterOperand dty, PPRRegOp pprty, RegisterOperand sty>
+: I<(outs dty:$Rdn), (ins pprty:$Pg, sty:$_Rdn),
+ asm, "\t$Rdn, $Pg",
+ "",
+ []>, Sched<[]> {
+ bits<5> Rdn;
+ bits<4> Pg;
+ let Inst{31-24} = 0b00100101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-19} = 0b101;
+ let Inst{18-16} = opc{4-2};
+ let Inst{15-11} = 0b10001;
+ let Inst{10-9} = opc{1-0};
+ let Inst{8-5} = Pg;
+ let Inst{4-0} = Rdn;
+
+  // Signed 32-bit forms require their GPR operand printed.
+ let AsmString = !if(!eq(opc{4,2-0}, 0b0000),
+ !strconcat(asm, "\t$Rdn, $Pg, $_Rdn"),
+ !strconcat(asm, "\t$Rdn, $Pg"));
+ let Constraints = "$Rdn = $_Rdn";
+}
+
+multiclass sve_int_count_r_s32<bits<5> opc, string asm> {
+ def _B : sve_int_count_r<0b00, opc, asm, GPR64z, PPR8, GPR64as32>;
+ def _H : sve_int_count_r<0b01, opc, asm, GPR64z, PPR16, GPR64as32>;
+ def _S : sve_int_count_r<0b10, opc, asm, GPR64z, PPR32, GPR64as32>;
+ def _D : sve_int_count_r<0b11, opc, asm, GPR64z, PPR64, GPR64as32>;
+}
+
+multiclass sve_int_count_r_u32<bits<5> opc, string asm> {
+ def _B : sve_int_count_r<0b00, opc, asm, GPR32z, PPR8, GPR32z>;
+ def _H : sve_int_count_r<0b01, opc, asm, GPR32z, PPR16, GPR32z>;
+ def _S : sve_int_count_r<0b10, opc, asm, GPR32z, PPR32, GPR32z>;
+ def _D : sve_int_count_r<0b11, opc, asm, GPR32z, PPR64, GPR32z>;
+}
+
+multiclass sve_int_count_r_x64<bits<5> opc, string asm> {
+ def _B : sve_int_count_r<0b00, opc, asm, GPR64z, PPR8, GPR64z>;
+ def _H : sve_int_count_r<0b01, opc, asm, GPR64z, PPR16, GPR64z>;
+ def _S : sve_int_count_r<0b10, opc, asm, GPR64z, PPR32, GPR64z>;
+ def _D : sve_int_count_r<0b11, opc, asm, GPR64z, PPR64, GPR64z>;
+}
+
+class sve_int_count_v<bits<2> sz8_64, bits<5> opc, string asm,
+ ZPRRegOp zprty>
+: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, PPRAny:$Pg),
+ asm, "\t$Zdn, $Pg",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pg;
+ bits<5> Zdn;
+ let Inst{31-24} = 0b00100101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-19} = 0b101;
+ let Inst{18-16} = opc{4-2};
+ let Inst{15-11} = 0b10000;
+ let Inst{10-9} = opc{1-0};
+ let Inst{8-5} = Pg;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_int_count_v<bits<5> opc, string asm> {
+ def _H : sve_int_count_v<0b01, opc, asm, ZPR16>;
+ def _S : sve_int_count_v<0b10, opc, asm, ZPR32>;
+ def _D : sve_int_count_v<0b11, opc, asm, ZPR64>;
+}
+
+class sve_int_pcount_pred<bits<2> sz8_64, bits<4> opc, string asm,
+ PPRRegOp pprty>
+: I<(outs GPR64:$Rd), (ins PPRAny:$Pg, pprty:$Pn),
+ asm, "\t$Rd, $Pg, $Pn",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pg;
+ bits<4> Pn;
+ bits<5> Rd;
+ let Inst{31-24} = 0b00100101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-19} = 0b100;
+ let Inst{18-16} = opc{3-1};
+ let Inst{15-14} = 0b10;
+ let Inst{13-10} = Pg;
+ let Inst{9} = opc{0};
+ let Inst{8-5} = Pn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass sve_int_pcount_pred<bits<4> opc, string asm> {
+ def _B : sve_int_pcount_pred<0b00, opc, asm, PPR8>;
+ def _H : sve_int_pcount_pred<0b01, opc, asm, PPR16>;
+ def _S : sve_int_pcount_pred<0b10, opc, asm, PPR32>;
+ def _D : sve_int_pcount_pred<0b11, opc, asm, PPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Element Count Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_count<bits<3> opc, string asm>
+: I<(outs GPR64:$Rd), (ins sve_pred_enum:$pattern, sve_incdec_imm:$imm4),
+ asm, "\t$Rd, $pattern, mul $imm4",
+ "",
+ []>, Sched<[]> {
+ bits<5> Rd;
+ bits<4> imm4;
+ bits<5> pattern;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = opc{2-1};
+ let Inst{21-20} = 0b10;
+ let Inst{19-16} = imm4;
+ let Inst{15-11} = 0b11100;
+ let Inst{10} = opc{0};
+ let Inst{9-5} = pattern;
+ let Inst{4-0} = Rd;
+}
+
+multiclass sve_int_count<bits<3> opc, string asm> {
+ def NAME : sve_int_count<opc, asm>;
+
+ def : InstAlias<asm # "\t$Rd, $pattern",
+ (!cast<Instruction>(NAME) GPR64:$Rd, sve_pred_enum:$pattern, 1), 1>;
+ def : InstAlias<asm # "\t$Rd",
+ (!cast<Instruction>(NAME) GPR64:$Rd, 0b11111, 1), 2>;
+}
+
+class sve_int_countvlv<bits<5> opc, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, sve_pred_enum:$pattern, sve_incdec_imm:$imm4),
+ asm, "\t$Zdn, $pattern, mul $imm4",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zdn;
+ bits<5> pattern;
+ bits<4> imm4;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = opc{4-3};
+ let Inst{21} = 0b1;
+ let Inst{20} = opc{2};
+ let Inst{19-16} = imm4;
+ let Inst{15-12} = 0b1100;
+ let Inst{11-10} = opc{1-0};
+ let Inst{9-5} = pattern;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_int_countvlv<bits<5> opc, string asm, ZPRRegOp zprty> {
+ def NAME : sve_int_countvlv<opc, asm, zprty>;
+
+ def : InstAlias<asm # "\t$Zdn, $pattern",
+ (!cast<Instruction>(NAME) zprty:$Zdn, sve_pred_enum:$pattern, 1), 1>;
+ def : InstAlias<asm # "\t$Zdn",
+ (!cast<Instruction>(NAME) zprty:$Zdn, 0b11111, 1), 2>;
+}
+
+class sve_int_pred_pattern_a<bits<3> opc, string asm>
+: I<(outs GPR64:$Rdn), (ins GPR64:$_Rdn, sve_pred_enum:$pattern, sve_incdec_imm:$imm4),
+ asm, "\t$Rdn, $pattern, mul $imm4",
+ "",
+ []>, Sched<[]> {
+ bits<5> Rdn;
+ bits<5> pattern;
+ bits<4> imm4;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = opc{2-1};
+ let Inst{21-20} = 0b11;
+ let Inst{19-16} = imm4;
+ let Inst{15-11} = 0b11100;
+ let Inst{10} = opc{0};
+ let Inst{9-5} = pattern;
+ let Inst{4-0} = Rdn;
+
+ let Constraints = "$Rdn = $_Rdn";
+}
+
+multiclass sve_int_pred_pattern_a<bits<3> opc, string asm> {
+ def NAME : sve_int_pred_pattern_a<opc, asm>;
+
+ def : InstAlias<asm # "\t$Rdn, $pattern",
+ (!cast<Instruction>(NAME) GPR64:$Rdn, sve_pred_enum:$pattern, 1), 1>;
+ def : InstAlias<asm # "\t$Rdn",
+ (!cast<Instruction>(NAME) GPR64:$Rdn, 0b11111, 1), 2>;
+}
+
+class sve_int_pred_pattern_b<bits<5> opc, string asm, RegisterOperand dt,
+ RegisterOperand st>
+: I<(outs dt:$Rdn), (ins st:$_Rdn, sve_pred_enum:$pattern, sve_incdec_imm:$imm4),
+ asm, "\t$Rdn, $pattern, mul $imm4",
+ "",
+ []>, Sched<[]> {
+ bits<5> Rdn;
+ bits<5> pattern;
+ bits<4> imm4;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = opc{4-3};
+ let Inst{21} = 0b1;
+ let Inst{20} = opc{2};
+ let Inst{19-16} = imm4;
+ let Inst{15-12} = 0b1111;
+ let Inst{11-10} = opc{1-0};
+ let Inst{9-5} = pattern;
+ let Inst{4-0} = Rdn;
+
+  // Signed 32-bit forms require their GPR operand printed.
+ let AsmString = !if(!eq(opc{2,0}, 0b00),
+ !strconcat(asm, "\t$Rdn, $_Rdn, $pattern, mul $imm4"),
+ !strconcat(asm, "\t$Rdn, $pattern, mul $imm4"));
+
+ let Constraints = "$Rdn = $_Rdn";
+}
+
+multiclass sve_int_pred_pattern_b_s32<bits<5> opc, string asm> {
+ def NAME : sve_int_pred_pattern_b<opc, asm, GPR64z, GPR64as32>;
+
+ def : InstAlias<asm # "\t$Rd, $Rn, $pattern",
+ (!cast<Instruction>(NAME) GPR64z:$Rd, GPR64as32:$Rn, sve_pred_enum:$pattern, 1), 1>;
+ def : InstAlias<asm # "\t$Rd, $Rn",
+ (!cast<Instruction>(NAME) GPR64z:$Rd, GPR64as32:$Rn, 0b11111, 1), 2>;
+}
+
+multiclass sve_int_pred_pattern_b_u32<bits<5> opc, string asm> {
+ def NAME : sve_int_pred_pattern_b<opc, asm, GPR32z, GPR32z>;
+
+ def : InstAlias<asm # "\t$Rdn, $pattern",
+ (!cast<Instruction>(NAME) GPR32z:$Rdn, sve_pred_enum:$pattern, 1), 1>;
+ def : InstAlias<asm # "\t$Rdn",
+ (!cast<Instruction>(NAME) GPR32z:$Rdn, 0b11111, 1), 2>;
+}
+
+multiclass sve_int_pred_pattern_b_x64<bits<5> opc, string asm> {
+ def NAME : sve_int_pred_pattern_b<opc, asm, GPR64z, GPR64z>;
+
+ def : InstAlias<asm # "\t$Rdn, $pattern",
+ (!cast<Instruction>(NAME) GPR64z:$Rdn, sve_pred_enum:$pattern, 1), 1>;
+ def : InstAlias<asm # "\t$Rdn",
+ (!cast<Instruction>(NAME) GPR64z:$Rdn, 0b11111, 1), 2>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SVE Permute - Cross Lane Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_perm_dup_r<bits<2> sz8_64, string asm, ZPRRegOp zprty,
+ RegisterClass srcRegType>
+: I<(outs zprty:$Zd), (ins srcRegType:$Rn),
+ asm, "\t$Zd, $Rn",
+ "",
+ []>, Sched<[]> {
+ bits<5> Rn;
+ bits<5> Zd;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-10} = 0b100000001110;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_perm_dup_r<string asm> {
+ def _B : sve_int_perm_dup_r<0b00, asm, ZPR8, GPR32sp>;
+ def _H : sve_int_perm_dup_r<0b01, asm, ZPR16, GPR32sp>;
+ def _S : sve_int_perm_dup_r<0b10, asm, ZPR32, GPR32sp>;
+ def _D : sve_int_perm_dup_r<0b11, asm, ZPR64, GPR64sp>;
+
+ def : InstAlias<"mov $Zd, $Rn",
+ (!cast<Instruction>(NAME # _B) ZPR8:$Zd, GPR32sp:$Rn), 1>;
+ def : InstAlias<"mov $Zd, $Rn",
+ (!cast<Instruction>(NAME # _H) ZPR16:$Zd, GPR32sp:$Rn), 1>;
+ def : InstAlias<"mov $Zd, $Rn",
+ (!cast<Instruction>(NAME # _S) ZPR32:$Zd, GPR32sp:$Rn), 1>;
+ def : InstAlias<"mov $Zd, $Rn",
+ (!cast<Instruction>(NAME # _D) ZPR64:$Zd, GPR64sp:$Rn), 1>;
+}
+
+class sve_int_perm_dup_i<bits<5> tsz, Operand immtype, string asm,
+ ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins zprty:$Zn, immtype:$idx),
+ asm, "\t$Zd, $Zn$idx",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ bits<7> idx;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = {?,?}; // imm3h
+ let Inst{21} = 0b1;
+ let Inst{20-16} = tsz;
+ let Inst{15-10} = 0b001000;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_perm_dup_i<string asm> {
+ def _B : sve_int_perm_dup_i<{?,?,?,?,1}, sve_elm_idx_extdup_b, asm, ZPR8> {
+ let Inst{23-22} = idx{5-4};
+ let Inst{20-17} = idx{3-0};
+ }
+ def _H : sve_int_perm_dup_i<{?,?,?,1,0}, sve_elm_idx_extdup_h, asm, ZPR16> {
+ let Inst{23-22} = idx{4-3};
+ let Inst{20-18} = idx{2-0};
+ }
+ def _S : sve_int_perm_dup_i<{?,?,1,0,0}, sve_elm_idx_extdup_s, asm, ZPR32> {
+ let Inst{23-22} = idx{3-2};
+ let Inst{20-19} = idx{1-0};
+ }
+ def _D : sve_int_perm_dup_i<{?,1,0,0,0}, sve_elm_idx_extdup_d, asm, ZPR64> {
+ let Inst{23-22} = idx{2-1};
+ let Inst{20} = idx{0};
+ }
+ def _Q : sve_int_perm_dup_i<{1,0,0,0,0}, sve_elm_idx_extdup_q, asm, ZPR128> {
+ let Inst{23-22} = idx{1-0};
+ }
+
+ def : InstAlias<"mov $Zd, $Zn$idx",
+ (!cast<Instruction>(NAME # _B) ZPR8:$Zd, ZPR8:$Zn, sve_elm_idx_extdup_b:$idx), 1>;
+ def : InstAlias<"mov $Zd, $Zn$idx",
+ (!cast<Instruction>(NAME # _H) ZPR16:$Zd, ZPR16:$Zn, sve_elm_idx_extdup_h:$idx), 1>;
+ def : InstAlias<"mov $Zd, $Zn$idx",
+ (!cast<Instruction>(NAME # _S) ZPR32:$Zd, ZPR32:$Zn, sve_elm_idx_extdup_s:$idx), 1>;
+ def : InstAlias<"mov $Zd, $Zn$idx",
+ (!cast<Instruction>(NAME # _D) ZPR64:$Zd, ZPR64:$Zn, sve_elm_idx_extdup_d:$idx), 1>;
+ def : InstAlias<"mov $Zd, $Zn$idx",
+ (!cast<Instruction>(NAME # _Q) ZPR128:$Zd, ZPR128:$Zn, sve_elm_idx_extdup_q:$idx), 1>;
+ def : InstAlias<"mov $Zd, $Bn",
+ (!cast<Instruction>(NAME # _B) ZPR8:$Zd, FPR8asZPR:$Bn, 0), 2>;
+ def : InstAlias<"mov $Zd, $Hn",
+ (!cast<Instruction>(NAME # _H) ZPR16:$Zd, FPR16asZPR:$Hn, 0), 2>;
+ def : InstAlias<"mov $Zd, $Sn",
+ (!cast<Instruction>(NAME # _S) ZPR32:$Zd, FPR32asZPR:$Sn, 0), 2>;
+ def : InstAlias<"mov $Zd, $Dn",
+ (!cast<Instruction>(NAME # _D) ZPR64:$Zd, FPR64asZPR:$Dn, 0), 2>;
+ def : InstAlias<"mov $Zd, $Qn",
+ (!cast<Instruction>(NAME # _Q) ZPR128:$Zd, FPR128asZPR:$Qn, 0), 2>;
+}
+
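+// TBL: table lookup. Each element of Zd is selected from Zn using the
+// corresponding unsigned index in Zm; out-of-range indices yield zero.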
+class sve_int_perm_tbl<bits<2> sz8_64, string asm, ZPRRegOp zprty,
+ RegisterOperand VecList>
+: I<(outs zprty:$Zd), (ins VecList:$Zn, zprty:$Zm),
+ asm, "\t$Zd, $Zn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zm;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Zm;
+ let Inst{15-10} = 0b001100;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_perm_tbl<string asm> {
+ def _B : sve_int_perm_tbl<0b00, asm, ZPR8, Z_b>;
+ def _H : sve_int_perm_tbl<0b01, asm, ZPR16, Z_h>;
+ def _S : sve_int_perm_tbl<0b10, asm, ZPR32, Z_s>;
+ def _D : sve_int_perm_tbl<0b11, asm, ZPR64, Z_d>;
+
+ def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
+ (!cast<Instruction>(NAME # _B) ZPR8:$Zd, ZPR8:$Zn, ZPR8:$Zm), 0>;
+ def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
+ (!cast<Instruction>(NAME # _H) ZPR16:$Zd, ZPR16:$Zn, ZPR16:$Zm), 0>;
+ def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
+ (!cast<Instruction>(NAME # _S) ZPR32:$Zd, ZPR32:$Zn, ZPR32:$Zm), 0>;
+ def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
+ (!cast<Instruction>(NAME # _D) ZPR64:$Zd, ZPR64:$Zn, ZPR64:$Zm), 0>;
+}
+
+class sve_int_perm_reverse_z<bits<2> sz8_64, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins zprty:$Zn),
+ asm, "\t$Zd, $Zn",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-10} = 0b111000001110;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_perm_reverse_z<string asm> {
+ def _B : sve_int_perm_reverse_z<0b00, asm, ZPR8>;
+ def _H : sve_int_perm_reverse_z<0b01, asm, ZPR16>;
+ def _S : sve_int_perm_reverse_z<0b10, asm, ZPR32>;
+ def _D : sve_int_perm_reverse_z<0b11, asm, ZPR64>;
+}
+
+class sve_int_perm_reverse_p<bits<2> sz8_64, string asm, PPRRegOp pprty>
+: I<(outs pprty:$Pd), (ins pprty:$Pn),
+ asm, "\t$Pd, $Pn",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pd;
+ bits<4> Pn;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-9} = 0b1101000100000;
+ let Inst{8-5} = Pn;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = Pd;
+}
+
+multiclass sve_int_perm_reverse_p<string asm> {
+ def _B : sve_int_perm_reverse_p<0b00, asm, PPR8>;
+ def _H : sve_int_perm_reverse_p<0b01, asm, PPR16>;
+ def _S : sve_int_perm_reverse_p<0b10, asm, PPR32>;
+ def _D : sve_int_perm_reverse_p<0b11, asm, PPR64>;
+}
+
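+// UNPK: widen the low or high half of Zn into Zd at twice the element size;
+// opc selects among the signed/unsigned and low/high-half variants
+// (SUNPKLO/SUNPKHI/UUNPKLO/UUNPKHI).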
+class sve_int_perm_unpk<bits<2> sz16_64, bits<2> opc, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2>
+: I<(outs zprty1:$Zd), (ins zprty2:$Zn),
+ asm, "\t$Zd, $Zn",
+ "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz16_64;
+ let Inst{21-18} = 0b1100;
+ let Inst{17-16} = opc;
+ let Inst{15-10} = 0b001110;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_perm_unpk<bits<2> opc, string asm> {
+ def _H : sve_int_perm_unpk<0b01, opc, asm, ZPR16, ZPR8>;
+ def _S : sve_int_perm_unpk<0b10, opc, asm, ZPR32, ZPR16>;
+ def _D : sve_int_perm_unpk<0b11, opc, asm, ZPR64, ZPR32>;
+}
+
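+// INSR: shift the elements of Zdn up by one and insert the source register
+// at element 0. The _insrs forms take a general-purpose register, the _insrv
+// forms a SIMD&FP scalar register.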
+class sve_int_perm_insrs<bits<2> sz8_64, string asm, ZPRRegOp zprty,
+ RegisterClass srcRegType>
+: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, srcRegType:$Rm),
+ asm, "\t$Zdn, $Rm",
+ "",
+ []>, Sched<[]> {
+ bits<5> Rm;
+ bits<5> Zdn;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-10} = 0b100100001110;
+ let Inst{9-5} = Rm;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_int_perm_insrs<string asm> {
+ def _B : sve_int_perm_insrs<0b00, asm, ZPR8, GPR32>;
+ def _H : sve_int_perm_insrs<0b01, asm, ZPR16, GPR32>;
+ def _S : sve_int_perm_insrs<0b10, asm, ZPR32, GPR32>;
+ def _D : sve_int_perm_insrs<0b11, asm, ZPR64, GPR64>;
+}
+
+class sve_int_perm_insrv<bits<2> sz8_64, string asm, ZPRRegOp zprty,
+ RegisterClass srcRegType>
+: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, srcRegType:$Vm),
+ asm, "\t$Zdn, $Vm",
+ "",
+ []>, Sched<[]> {
+ bits<5> Vm;
+ bits<5> Zdn;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-10} = 0b110100001110;
+ let Inst{9-5} = Vm;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_int_perm_insrv<string asm> {
+ def _B : sve_int_perm_insrv<0b00, asm, ZPR8, FPR8>;
+ def _H : sve_int_perm_insrv<0b01, asm, ZPR16, FPR16>;
+ def _S : sve_int_perm_insrv<0b10, asm, ZPR32, FPR32>;
+ def _D : sve_int_perm_insrv<0b11, asm, ZPR64, FPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Permute - Extract Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_perm_extract_i<string asm>
+: I<(outs ZPR8:$Zdn), (ins ZPR8:$_Zdn, ZPR8:$Zm, imm0_255:$imm8),
+ asm, "\t$Zdn, $_Zdn, $Zm, $imm8",
+ "", []>, Sched<[]> {
+ bits<5> Zdn;
+ bits<5> Zm;
+ bits<8> imm8;
+ let Inst{31-21} = 0b00000101001;
+ let Inst{20-16} = imm8{7-3};
+ let Inst{15-13} = 0b000;
+ let Inst{12-10} = imm8{2-0};
+ let Inst{9-5} = Zm;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Vector Select Group
+//===----------------------------------------------------------------------===//
+
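+// SEL: for each element, choose Zn where the corresponding predicate element
+// is true and Zm where it is false. When Zm is the same register as Zd this
+// degenerates into a predicated move, which the "mov" aliases below expose.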
+class sve_int_sel_vvv<bits<2> sz8_64, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins PPRAny:$Pg, zprty:$Zn, zprty:$Zm),
+ asm, "\t$Zd, $Pg, $Zn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pg;
+ bits<5> Zd;
+ bits<5> Zm;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Zm;
+ let Inst{15-14} = 0b11;
+ let Inst{13-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_sel_vvv<string asm> {
+ def _B : sve_int_sel_vvv<0b00, asm, ZPR8>;
+ def _H : sve_int_sel_vvv<0b01, asm, ZPR16>;
+ def _S : sve_int_sel_vvv<0b10, asm, ZPR32>;
+ def _D : sve_int_sel_vvv<0b11, asm, ZPR64>;
+
+ def : InstAlias<"mov $Zd, $Pg/m, $Zn",
+ (!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPRAny:$Pg, ZPR8:$Zn, ZPR8:$Zd), 1>;
+ def : InstAlias<"mov $Zd, $Pg/m, $Zn",
+ (!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, ZPR16:$Zn, ZPR16:$Zd), 1>;
+ def : InstAlias<"mov $Zd, $Pg/m, $Zn",
+ (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, ZPR32:$Zn, ZPR32:$Zd), 1>;
+ def : InstAlias<"mov $Zd, $Pg/m, $Zn",
+ (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, ZPR64:$Zn, ZPR64:$Zd), 1>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SVE Predicate Logical Operations Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_pred_log<bits<4> opc, string asm>
+: I<(outs PPR8:$Pd), (ins PPRAny:$Pg, PPR8:$Pn, PPR8:$Pm),
+ asm, "\t$Pd, $Pg/z, $Pn, $Pm",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pd;
+ bits<4> Pg;
+ bits<4> Pm;
+ bits<4> Pn;
+ let Inst{31-24} = 0b00100101;
+ let Inst{23-22} = opc{3-2};
+ let Inst{21-20} = 0b00;
+ let Inst{19-16} = Pm;
+ let Inst{15-14} = 0b01;
+ let Inst{13-10} = Pg;
+ let Inst{9} = opc{1};
+ let Inst{8-5} = Pn;
+ let Inst{4} = opc{0};
+ let Inst{3-0} = Pd;
+
+ // SEL has no predication qualifier.
+ let AsmString = !if(!eq(opc, 0b0011),
+ !strconcat(asm, "\t$Pd, $Pg, $Pn, $Pm"),
+ !strconcat(asm, "\t$Pd, $Pg/z, $Pn, $Pm"));
+
+ let Defs = !if(!eq (opc{2}, 1), [NZCV], []);
+}
+
+
+//===----------------------------------------------------------------------===//
+// SVE Logical Mask Immediate Group
+//===----------------------------------------------------------------------===//
+
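+// AND/EOR/ORR with a bitmask immediate. Only a 64-bit encoding exists; the
+// replicated immediate covers the narrower element sizes, hence the extra
+// ZPR8/ZPR16/ZPR32 assembly aliases. The "alias" mnemonic provides the
+// inverted-immediate spelling (e.g. BIC as AND with the inverted immediate).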
+class sve_int_log_imm<bits<2> opc, string asm>
+: I<(outs ZPR64:$Zdn), (ins ZPR64:$_Zdn, logical_imm64:$imms13),
+ asm, "\t$Zdn, $_Zdn, $imms13",
+ "", []>, Sched<[]> {
+ bits<5> Zdn;
+ bits<13> imms13;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = opc;
+ let Inst{21-18} = 0b0000;
+ let Inst{17-5} = imms13;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+ let DecoderMethod = "DecodeSVELogicalImmInstruction";
+}
+
+multiclass sve_int_log_imm<bits<2> opc, string asm, string alias> {
+ def NAME : sve_int_log_imm<opc, asm>;
+
+ def : InstAlias<asm # "\t$Zdn, $Zdn, $imm",
+ (!cast<Instruction>(NAME) ZPR8:$Zdn, sve_logical_imm8:$imm), 4>;
+ def : InstAlias<asm # "\t$Zdn, $Zdn, $imm",
+ (!cast<Instruction>(NAME) ZPR16:$Zdn, sve_logical_imm16:$imm), 3>;
+ def : InstAlias<asm # "\t$Zdn, $Zdn, $imm",
+ (!cast<Instruction>(NAME) ZPR32:$Zdn, sve_logical_imm32:$imm), 2>;
+
+ def : InstAlias<alias # "\t$Zdn, $Zdn, $imm",
+ (!cast<Instruction>(NAME) ZPR8:$Zdn, sve_logical_imm8_not:$imm), 0>;
+ def : InstAlias<alias # "\t$Zdn, $Zdn, $imm",
+ (!cast<Instruction>(NAME) ZPR16:$Zdn, sve_logical_imm16_not:$imm), 0>;
+ def : InstAlias<alias # "\t$Zdn, $Zdn, $imm",
+ (!cast<Instruction>(NAME) ZPR32:$Zdn, sve_logical_imm32_not:$imm), 0>;
+ def : InstAlias<alias # "\t$Zdn, $Zdn, $imm",
+ (!cast<Instruction>(NAME) ZPR64:$Zdn, logical_imm64_not:$imm), 0>;
+}
+
+class sve_int_dup_mask_imm<string asm>
+: I<(outs ZPR64:$Zd), (ins logical_imm64:$imms),
+ asm, "\t$Zd, $imms",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<13> imms;
+ let Inst{31-18} = 0b00000101110000;
+ let Inst{17-5} = imms;
+ let Inst{4-0} = Zd;
+
+ let isReMaterializable = 1;
+ let DecoderMethod = "DecodeSVELogicalImmInstruction";
+}
+
+multiclass sve_int_dup_mask_imm<string asm> {
+ def NAME : sve_int_dup_mask_imm<asm>;
+
+ def : InstAlias<"dupm $Zd, $imm",
+ (!cast<Instruction>(NAME) ZPR8:$Zd, sve_logical_imm8:$imm), 4>;
+ def : InstAlias<"dupm $Zd, $imm",
+ (!cast<Instruction>(NAME) ZPR16:$Zd, sve_logical_imm16:$imm), 3>;
+ def : InstAlias<"dupm $Zd, $imm",
+ (!cast<Instruction>(NAME) ZPR32:$Zd, sve_logical_imm32:$imm), 2>;
+
+ // All Zd.b forms have a CPY/DUP equivalent, hence no byte alias here.
+ def : InstAlias<"mov $Zd, $imm",
+ (!cast<Instruction>(NAME) ZPR16:$Zd, sve_preferred_logical_imm16:$imm), 7>;
+ def : InstAlias<"mov $Zd, $imm",
+ (!cast<Instruction>(NAME) ZPR32:$Zd, sve_preferred_logical_imm32:$imm), 6>;
+ def : InstAlias<"mov $Zd, $imm",
+ (!cast<Instruction>(NAME) ZPR64:$Zd, sve_preferred_logical_imm64:$imm), 5>;
+}
+
//===----------------------------------------------------------------------===//
// SVE Integer Arithmetic - Unpredicated Group.
//===----------------------------------------------------------------------===//
@@ -41,6 +969,408 @@ multiclass sve_int_bin_cons_arit_0<bits<3> opc, string asm> {
}
//===----------------------------------------------------------------------===//
+// SVE Floating Point Arithmetic - Predicated Group
+//===----------------------------------------------------------------------===//
+
+class sve_fp_2op_i_p_zds<bits<2> sz, bits<3> opc, string asm,
+ ZPRRegOp zprty,
+ Operand imm_ty>
+: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, imm_ty:$i1),
+ asm, "\t$Zdn, $Pg/m, $_Zdn, $i1",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zdn;
+ bit i1;
+ let Inst{31-24} = 0b01100101;
+ let Inst{23-22} = sz;
+ let Inst{21-19} = 0b011;
+ let Inst{18-16} = opc;
+ let Inst{15-13} = 0b100;
+ let Inst{12-10} = Pg;
+ let Inst{9-6} = 0b0000;
+ let Inst{5} = i1;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_fp_2op_i_p_zds<bits<3> opc, string asm, Operand imm_ty> {
+ def _H : sve_fp_2op_i_p_zds<0b01, opc, asm, ZPR16, imm_ty>;
+ def _S : sve_fp_2op_i_p_zds<0b10, opc, asm, ZPR32, imm_ty>;
+ def _D : sve_fp_2op_i_p_zds<0b11, opc, asm, ZPR64, imm_ty>;
+}
+
+class sve_fp_2op_p_zds<bits<2> sz, bits<4> opc, string asm,
+ ZPRRegOp zprty>
+: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
+ asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zdn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b01100101;
+ let Inst{23-22} = sz;
+ let Inst{21-20} = 0b00;
+ let Inst{19-16} = opc;
+ let Inst{15-13} = 0b100;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zm;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_fp_2op_p_zds<bits<4> opc, string asm> {
+ def _H : sve_fp_2op_p_zds<0b01, opc, asm, ZPR16>;
+ def _S : sve_fp_2op_p_zds<0b10, opc, asm, ZPR32>;
+ def _D : sve_fp_2op_p_zds<0b11, opc, asm, ZPR64>;
+}
+
+class sve_fp_ftmad<bits<2> sz, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, imm0_7:$imm3),
+ asm, "\t$Zdn, $_Zdn, $Zm, $imm3",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zdn;
+ bits<5> Zm;
+ bits<3> imm3;
+ let Inst{31-24} = 0b01100101;
+ let Inst{23-22} = sz;
+ let Inst{21-19} = 0b010;
+ let Inst{18-16} = imm3;
+ let Inst{15-10} = 0b100000;
+ let Inst{9-5} = Zm;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_fp_ftmad<string asm> {
+ def _H : sve_fp_ftmad<0b01, asm, ZPR16>;
+ def _S : sve_fp_ftmad<0b10, asm, ZPR32>;
+ def _D : sve_fp_ftmad<0b11, asm, ZPR64>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SVE Floating Point Arithmetic - Unpredicated Group
+//===----------------------------------------------------------------------===//
+
+class sve_fp_3op_u_zd<bits<2> sz, bits<3> opc, string asm,
+ ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
+ asm, "\t$Zd, $Zn, $Zm",
+ "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zm;
+ bits<5> Zn;
+ let Inst{31-24} = 0b01100101;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b0;
+ let Inst{20-16} = Zm;
+ let Inst{15-13} = 0b000;
+ let Inst{12-10} = opc;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_fp_3op_u_zd<bits<3> opc, string asm> {
+ def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16>;
+ def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32>;
+ def _D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Floating Point Fused Multiply-Add Group
+//===----------------------------------------------------------------------===//
+
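+// Two destructive layouts exist: the _a form accumulates into the addend
+// (FMLA-style, Zda = Zda +/- Zn * Zm) and the _b form overwrites a
+// multiplicand (FMAD-style, Zdn = Za +/- Zdn * Zm); opc selects the
+// add/subtract and negated variants.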
+class sve_fp_3op_p_zds_a<bits<2> sz, bits<2> opc, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zda), (ins PPR3bAny:$Pg, zprty:$_Zda, zprty:$Zn, zprty:$Zm),
+ asm, "\t$Zda, $Pg/m, $Zn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zda;
+ bits<5> Zm;
+ bits<5> Zn;
+ let Inst{31-24} = 0b01100101;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Zm;
+ let Inst{15} = 0b0;
+ let Inst{14-13} = opc;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+}
+
+multiclass sve_fp_3op_p_zds_a<bits<2> opc, string asm> {
+ def _H : sve_fp_3op_p_zds_a<0b01, opc, asm, ZPR16>;
+ def _S : sve_fp_3op_p_zds_a<0b10, opc, asm, ZPR32>;
+ def _D : sve_fp_3op_p_zds_a<0b11, opc, asm, ZPR64>;
+}
+
+class sve_fp_3op_p_zds_b<bits<2> sz, bits<2> opc, string asm,
+ ZPRRegOp zprty>
+: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm, zprty:$Za),
+ asm, "\t$Zdn, $Pg/m, $Zm, $Za",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Za;
+ bits<5> Zdn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b01100101;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Za;
+ let Inst{15} = 0b1;
+ let Inst{14-13} = opc;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zm;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_fp_3op_p_zds_b<bits<2> opc, string asm> {
+ def _H : sve_fp_3op_p_zds_b<0b01, opc, asm, ZPR16>;
+ def _S : sve_fp_3op_p_zds_b<0b10, opc, asm, ZPR32>;
+ def _D : sve_fp_3op_p_zds_b<0b11, opc, asm, ZPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Floating Point Multiply-Add - Indexed Group
+//===----------------------------------------------------------------------===//
+
+class sve_fp_fma_by_indexed_elem<bits<2> sz, bit opc, string asm,
+ ZPRRegOp zprty1,
+ ZPRRegOp zprty2, Operand itype>
+: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty1:$Zn, zprty2:$Zm, itype:$iop),
+ asm, "\t$Zda, $Zn, $Zm$iop", "", []>, Sched<[]> {
+ bits<5> Zda;
+ bits<5> Zn;
+ let Inst{31-24} = 0b01100100;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b1;
+ let Inst{15-11} = 0;
+ let Inst{10} = opc;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+}
+
+multiclass sve_fp_fma_by_indexed_elem<bit opc, string asm> {
+ def _H : sve_fp_fma_by_indexed_elem<{0, ?}, opc, asm, ZPR16, ZPR3b16, VectorIndexH> {
+ bits<3> Zm;
+ bits<3> iop;
+ let Inst{22} = iop{2};
+ let Inst{20-19} = iop{1-0};
+ let Inst{18-16} = Zm;
+ }
+ def _S : sve_fp_fma_by_indexed_elem<0b10, opc, asm, ZPR32, ZPR3b32, VectorIndexS> {
+ bits<3> Zm;
+ bits<2> iop;
+ let Inst{20-19} = iop;
+ let Inst{18-16} = Zm;
+ }
+ def _D : sve_fp_fma_by_indexed_elem<0b11, opc, asm, ZPR64, ZPR4b64, VectorIndexD> {
+ bits<4> Zm;
+ bit iop;
+ let Inst{20} = iop;
+ let Inst{19-16} = Zm;
+ }
+}
+
+
+//===----------------------------------------------------------------------===//
+// SVE Floating Point Multiply - Indexed Group
+//===----------------------------------------------------------------------===//
+
+class sve_fp_fmul_by_indexed_elem<bits<2> sz, string asm, ZPRRegOp zprty,
+ ZPRRegOp zprty2, Operand itype>
+: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty2:$Zm, itype:$iop),
+ asm, "\t$Zd, $Zn, $Zm$iop", "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ let Inst{31-24} = 0b01100100;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b1;
+ let Inst{15-10} = 0b001000;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_fp_fmul_by_indexed_elem<string asm> {
+ def _H : sve_fp_fmul_by_indexed_elem<{0, ?}, asm, ZPR16, ZPR3b16, VectorIndexH> {
+ bits<3> Zm;
+ bits<3> iop;
+ let Inst{22} = iop{2};
+ let Inst{20-19} = iop{1-0};
+ let Inst{18-16} = Zm;
+ }
+ def _S : sve_fp_fmul_by_indexed_elem<0b10, asm, ZPR32, ZPR3b32, VectorIndexS> {
+ bits<3> Zm;
+ bits<2> iop;
+ let Inst{20-19} = iop;
+ let Inst{18-16} = Zm;
+ }
+ def _D : sve_fp_fmul_by_indexed_elem<0b11, asm, ZPR64, ZPR4b64, VectorIndexD> {
+ bits<4> Zm;
+ bit iop;
+ let Inst{20} = iop;
+ let Inst{19-16} = Zm;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Floating Point Complex Multiply-Add Group
+//===----------------------------------------------------------------------===//
+
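+// FCMLA (vectors): predicated complex multiply-accumulate. The 2-bit
+// complexrotateop immediate selects a rotation of 0, 90, 180 or 270 degrees
+// for the complex arithmetic.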
+class sve_fp_fcmla<bits<2> sz, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zda), (ins PPR3bAny:$Pg, zprty:$_Zda, zprty:$Zn, zprty:$Zm,
+ complexrotateop:$imm),
+ asm, "\t$Zda, $Pg/m, $Zn, $Zm, $imm",
+ "", []>, Sched<[]> {
+ bits<5> Zda;
+ bits<3> Pg;
+ bits<5> Zn;
+ bits<5> Zm;
+ bits<2> imm;
+ let Inst{31-24} = 0b01100100;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0;
+ let Inst{20-16} = Zm;
+ let Inst{15} = 0;
+ let Inst{14-13} = imm;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+}
+
+multiclass sve_fp_fcmla<string asm> {
+ def _H : sve_fp_fcmla<0b01, asm, ZPR16>;
+ def _S : sve_fp_fcmla<0b10, asm, ZPR32>;
+ def _D : sve_fp_fcmla<0b11, asm, ZPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Floating Point Complex Multiply-Add - Indexed Group
+//===----------------------------------------------------------------------===//
+
+class sve_fp_fcmla_by_indexed_elem<bits<2> sz, string asm,
+ ZPRRegOp zprty,
+ ZPRRegOp zprty2, Operand itype>
+: I<(outs zprty:$Zda), (ins zprty:$_Zda, zprty:$Zn, zprty2:$Zm, itype:$iop,
+ complexrotateop:$imm),
+ asm, "\t$Zda, $Zn, $Zm$iop, $imm",
+ "", []>, Sched<[]> {
+ bits<5> Zda;
+ bits<5> Zn;
+ bits<2> imm;
+ let Inst{31-24} = 0b01100100;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b1;
+ let Inst{15-12} = 0b0001;
+ let Inst{11-10} = imm;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+}
+
+multiclass sve_fp_fcmla_by_indexed_elem<string asm> {
+ def _H : sve_fp_fcmla_by_indexed_elem<0b10, asm, ZPR16, ZPR3b16, VectorIndexS> {
+ bits<3> Zm;
+ bits<2> iop;
+ let Inst{20-19} = iop;
+ let Inst{18-16} = Zm;
+ }
+ def _S : sve_fp_fcmla_by_indexed_elem<0b11, asm, ZPR32, ZPR4b32, VectorIndexD> {
+ bits<4> Zm;
+ bits<1> iop;
+ let Inst{20} = iop;
+ let Inst{19-16} = Zm;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Floating Point Complex Addition Group
+//===----------------------------------------------------------------------===//
+
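+// FCADD: predicated complex add; complexrotateopodd restricts the rotation
+// immediate to 90 or 270 degrees.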
+class sve_fp_fcadd<bits<2> sz, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm,
+ complexrotateopodd:$imm),
+ asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm, $imm",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zdn;
+ bits<5> Zm;
+ bits<3> Pg;
+ bit imm;
+ let Inst{31-24} = 0b01100100;
+ let Inst{23-22} = sz;
+ let Inst{21-17} = 0;
+ let Inst{16} = imm;
+ let Inst{15-13} = 0b100;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zm;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_fp_fcadd<string asm> {
+ def _H : sve_fp_fcadd<0b01, asm, ZPR16>;
+ def _S : sve_fp_fcadd<0b10, asm, ZPR32>;
+ def _D : sve_fp_fcadd<0b11, asm, ZPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Stack Allocation Group
+//===----------------------------------------------------------------------===//
+
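+// ADDVL/ADDPL and RDVL: the signed imm6 is scaled by the vector length in
+// bytes (or by the predicate length, selected by the opc bit) and added to
+// Rn, or simply read into Rd by the read-VL encoding below.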
+class sve_int_arith_vl<bit opc, string asm>
+: I<(outs GPR64sp:$Rd), (ins GPR64sp:$Rn, simm6_32b:$imm6),
+ asm, "\t$Rd, $Rn, $imm6",
+ "",
+ []>, Sched<[]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<6> imm6;
+ let Inst{31-23} = 0b000001000;
+ let Inst{22} = opc;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Rn;
+ let Inst{15-11} = 0b01010;
+ let Inst{10-5} = imm6;
+ let Inst{4-0} = Rd;
+}
+
+class sve_int_read_vl_a<bit op, bits<5> opc2, string asm>
+: I<(outs GPR64:$Rd), (ins simm6_32b:$imm6),
+ asm, "\t$Rd, $imm6",
+ "",
+ []>, Sched<[]> {
+ bits<5> Rd;
+ bits<6> imm6;
+ let Inst{31-23} = 0b000001001;
+ let Inst{22} = op;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = opc2{4-0};
+ let Inst{15-11} = 0b01010;
+ let Inst{10-5} = imm6;
+ let Inst{4-0} = Rd;
+}
+
+//===----------------------------------------------------------------------===//
// SVE Permute - In Lane Group
//===----------------------------------------------------------------------===//
@@ -71,6 +1401,1442 @@ multiclass sve_int_perm_bin_perm_zz<bits<3> opc, string asm> {
}
//===----------------------------------------------------------------------===//
+// SVE Floating Point Unary Operations Group
+//===----------------------------------------------------------------------===//
+
+class sve_fp_2op_p_zd<bits<7> opc, string asm, RegisterOperand i_zprtype,
+ RegisterOperand o_zprtype>
+: I<(outs o_zprtype:$Zd), (ins i_zprtype:$_Zd, PPR3bAny:$Pg, i_zprtype:$Zn),
+ asm, "\t$Zd, $Pg/m, $Zn",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zd;
+ bits<5> Zn;
+ let Inst{31-24} = 0b01100101;
+ let Inst{23-22} = opc{6-5};
+ let Inst{21} = 0b0;
+ let Inst{20-16} = opc{4-0};
+ let Inst{15-13} = 0b101;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
+}
+
+multiclass sve_fp_2op_p_zd_HSD<bits<5> opc, string asm> {
+ def _H : sve_fp_2op_p_zd<{ 0b01, opc }, asm, ZPR16, ZPR16>;
+ def _S : sve_fp_2op_p_zd<{ 0b10, opc }, asm, ZPR32, ZPR32>;
+ def _D : sve_fp_2op_p_zd<{ 0b11, opc }, asm, ZPR64, ZPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Floating Point Unary Operations - Unpredicated Group
+//===----------------------------------------------------------------------===//
+
+class sve_fp_2op_u_zd<bits<2> sz, bits<3> opc, string asm,
+ ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins zprty:$Zn),
+ asm, "\t$Zd, $Zn",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ let Inst{31-24} = 0b01100101;
+ let Inst{23-22} = sz;
+ let Inst{21-19} = 0b001;
+ let Inst{18-16} = opc;
+ let Inst{15-10} = 0b001100;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_fp_2op_u_zd<bits<3> opc, string asm> {
+ def _H : sve_fp_2op_u_zd<0b01, opc, asm, ZPR16>;
+ def _S : sve_fp_2op_u_zd<0b10, opc, asm, ZPR32>;
+ def _D : sve_fp_2op_u_zd<0b11, opc, asm, ZPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Arithmetic - Binary Predicated Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_bin_pred_arit_log<bits<2> sz8_64, bits<2> fmt, bits<3> opc,
+ string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
+ asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm", "", []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zdn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = sz8_64;
+ let Inst{21} = 0b0;
+ let Inst{20-19} = fmt;
+ let Inst{18-16} = opc;
+ let Inst{15-13} = 0b000;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zm;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_int_bin_pred_log<bits<3> opc, string asm> {
+ def _B : sve_int_bin_pred_arit_log<0b00, 0b11, opc, asm, ZPR8>;
+ def _H : sve_int_bin_pred_arit_log<0b01, 0b11, opc, asm, ZPR16>;
+ def _S : sve_int_bin_pred_arit_log<0b10, 0b11, opc, asm, ZPR32>;
+ def _D : sve_int_bin_pred_arit_log<0b11, 0b11, opc, asm, ZPR64>;
+}
+
+multiclass sve_int_bin_pred_arit_0<bits<3> opc, string asm> {
+ def _B : sve_int_bin_pred_arit_log<0b00, 0b00, opc, asm, ZPR8>;
+ def _H : sve_int_bin_pred_arit_log<0b01, 0b00, opc, asm, ZPR16>;
+ def _S : sve_int_bin_pred_arit_log<0b10, 0b00, opc, asm, ZPR32>;
+ def _D : sve_int_bin_pred_arit_log<0b11, 0b00, opc, asm, ZPR64>;
+}
+
+multiclass sve_int_bin_pred_arit_1<bits<3> opc, string asm> {
+ def _B : sve_int_bin_pred_arit_log<0b00, 0b01, opc, asm, ZPR8>;
+ def _H : sve_int_bin_pred_arit_log<0b01, 0b01, opc, asm, ZPR16>;
+ def _S : sve_int_bin_pred_arit_log<0b10, 0b01, opc, asm, ZPR32>;
+ def _D : sve_int_bin_pred_arit_log<0b11, 0b01, opc, asm, ZPR64>;
+}
+
+multiclass sve_int_bin_pred_arit_2<bits<3> opc, string asm> {
+ def _B : sve_int_bin_pred_arit_log<0b00, 0b10, opc, asm, ZPR8>;
+ def _H : sve_int_bin_pred_arit_log<0b01, 0b10, opc, asm, ZPR16>;
+ def _S : sve_int_bin_pred_arit_log<0b10, 0b10, opc, asm, ZPR32>;
+ def _D : sve_int_bin_pred_arit_log<0b11, 0b10, opc, asm, ZPR64>;
+}
+
+// Special case for divides which are not defined for 8b/16b elements.
+multiclass sve_int_bin_pred_arit_2_div<bits<3> opc, string asm> {
+ def _S : sve_int_bin_pred_arit_log<0b10, 0b10, opc, asm, ZPR32>;
+ def _D : sve_int_bin_pred_arit_log<0b11, 0b10, opc, asm, ZPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Multiply-Add Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_mladdsub_vvv_pred<bits<2> sz8_64, bits<1> opc, string asm,
+ ZPRRegOp zprty>
+: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm, zprty:$Za),
+ asm, "\t$Zdn, $Pg/m, $Zm, $Za",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zdn;
+ bits<5> Za;
+ bits<5> Zm;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = sz8_64;
+ let Inst{21} = 0b0;
+ let Inst{20-16} = Zm;
+ let Inst{15-14} = 0b11;
+ let Inst{13} = opc;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Za;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_int_mladdsub_vvv_pred<bits<1> opc, string asm> {
+ def _B : sve_int_mladdsub_vvv_pred<0b00, opc, asm, ZPR8>;
+ def _H : sve_int_mladdsub_vvv_pred<0b01, opc, asm, ZPR16>;
+ def _S : sve_int_mladdsub_vvv_pred<0b10, opc, asm, ZPR32>;
+ def _D : sve_int_mladdsub_vvv_pred<0b11, opc, asm, ZPR64>;
+}
+
+class sve_int_mlas_vvv_pred<bits<2> sz8_64, bits<1> opc, string asm,
+ ZPRRegOp zprty>
+: I<(outs zprty:$Zda), (ins PPR3bAny:$Pg, zprty:$_Zda, zprty:$Zn, zprty:$Zm),
+ asm, "\t$Zda, $Pg/m, $Zn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zda;
+ bits<5> Zm;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = sz8_64;
+ let Inst{21} = 0b0;
+ let Inst{20-16} = Zm;
+ let Inst{15-14} = 0b01;
+ let Inst{13} = opc;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+}
+
+multiclass sve_int_mlas_vvv_pred<bits<1> opc, string asm> {
+ def _B : sve_int_mlas_vvv_pred<0b00, opc, asm, ZPR8>;
+ def _H : sve_int_mlas_vvv_pred<0b01, opc, asm, ZPR16>;
+ def _S : sve_int_mlas_vvv_pred<0b10, opc, asm, ZPR32>;
+ def _D : sve_int_mlas_vvv_pred<0b11, opc, asm, ZPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Dot Product Group
+//===----------------------------------------------------------------------===//
+
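+// SDOT/UDOT: four-way widening dot product. A .S accumulator takes .B
+// sources and a .D accumulator takes .H sources; the U bit selects the
+// unsigned variant.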
+class sve_intx_dot<bit sz, bit U, string asm, ZPRRegOp zprty1,
+ ZPRRegOp zprty2>
+: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty2:$Zm), asm,
+ "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
+ bits<5> Zda;
+ bits<5> Zn;
+ bits<5> Zm;
+ let Inst{31-23} = 0b010001001;
+ let Inst{22} = sz;
+ let Inst{21} = 0;
+ let Inst{20-16} = Zm;
+ let Inst{15-11} = 0;
+ let Inst{10} = U;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+}
+
+multiclass sve_intx_dot<bit opc, string asm> {
+ def _S : sve_intx_dot<0b0, opc, asm, ZPR32, ZPR8>;
+ def _D : sve_intx_dot<0b1, opc, asm, ZPR64, ZPR16>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Dot Product Group - Indexed Group
+//===----------------------------------------------------------------------===//
+
+class sve_intx_dot_by_indexed_elem<bit sz, bit U, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2,
+ ZPRRegOp zprty3, Operand itype>
+: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty3:$Zm, itype:$iop),
+ asm, "\t$Zda, $Zn, $Zm$iop",
+ "", []>, Sched<[]> {
+ bits<5> Zda;
+ bits<5> Zn;
+ let Inst{31-23} = 0b010001001;
+ let Inst{22} = sz;
+ let Inst{21} = 0b1;
+ let Inst{15-11} = 0;
+ let Inst{10} = U;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+}
+
+multiclass sve_intx_dot_by_indexed_elem<bit opc, string asm> {
+ def _S : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS> {
+ bits<2> iop;
+ bits<3> Zm;
+ let Inst{20-19} = iop;
+ let Inst{18-16} = Zm;
+ }
+ def _D : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD> {
+ bits<1> iop;
+ bits<4> Zm;
+ let Inst{20} = iop;
+ let Inst{19-16} = Zm;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Arithmetic - Unary Predicated Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_un_pred_arit<bits<2> sz8_64, bits<4> opc,
+ string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, zprty:$Zn),
+ asm, "\t$Zd, $Pg/m, $Zn",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zd;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-20} = 0b01;
+ let Inst{19} = opc{0};
+ let Inst{18-16} = opc{3-1};
+ let Inst{15-13} = 0b101;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
+}
+
+multiclass sve_int_un_pred_arit_0<bits<3> opc, string asm> {
+ def _B : sve_int_un_pred_arit<0b00, { opc, 0b0 }, asm, ZPR8>;
+ def _H : sve_int_un_pred_arit<0b01, { opc, 0b0 }, asm, ZPR16>;
+ def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>;
+ def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
+}
+
+multiclass sve_int_un_pred_arit_0_h<bits<3> opc, string asm> {
+ def _H : sve_int_un_pred_arit<0b01, { opc, 0b0 }, asm, ZPR16>;
+ def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>;
+ def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
+}
+
+multiclass sve_int_un_pred_arit_0_w<bits<3> opc, string asm> {
+ def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>;
+ def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
+}
+
+multiclass sve_int_un_pred_arit_0_d<bits<3> opc, string asm> {
+ def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
+}
+
+multiclass sve_int_un_pred_arit_1<bits<3> opc, string asm> {
+ def _B : sve_int_un_pred_arit<0b00, { opc, 0b1 }, asm, ZPR8>;
+ def _H : sve_int_un_pred_arit<0b01, { opc, 0b1 }, asm, ZPR16>;
+ def _S : sve_int_un_pred_arit<0b10, { opc, 0b1 }, asm, ZPR32>;
+ def _D : sve_int_un_pred_arit<0b11, { opc, 0b1 }, asm, ZPR64>;
+}
+
+multiclass sve_int_un_pred_arit_1_fp<bits<3> opc, string asm> {
+ def _H : sve_int_un_pred_arit<0b01, { opc, 0b1 }, asm, ZPR16>;
+ def _S : sve_int_un_pred_arit<0b10, { opc, 0b1 }, asm, ZPR32>;
+ def _D : sve_int_un_pred_arit<0b11, { opc, 0b1 }, asm, ZPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Wide Immediate - Unpredicated Group
+//===----------------------------------------------------------------------===//
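+// DUP (immediate): broadcast a signed 8-bit immediate, optionally shifted
+// left by 8 (the "sh" bit), to all elements. "fmov Zd, #0.0" is accepted as
+// an alias for the integer zero form.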
+class sve_int_dup_imm<bits<2> sz8_64, string asm,
+ ZPRRegOp zprty, Operand immtype>
+: I<(outs zprty:$Zd), (ins immtype:$imm),
+ asm, "\t$Zd, $imm",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<9> imm;
+ let Inst{31-24} = 0b00100101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-14} = 0b11100011;
+ let Inst{13} = imm{8}; // sh
+ let Inst{12-5} = imm{7-0}; // imm8
+ let Inst{4-0} = Zd;
+
+ let isReMaterializable = 1;
+}
+
+multiclass sve_int_dup_imm<string asm> {
+ def _B : sve_int_dup_imm<0b00, asm, ZPR8, cpy_imm8_opt_lsl_i8>;
+ def _H : sve_int_dup_imm<0b01, asm, ZPR16, cpy_imm8_opt_lsl_i16>;
+ def _S : sve_int_dup_imm<0b10, asm, ZPR32, cpy_imm8_opt_lsl_i32>;
+ def _D : sve_int_dup_imm<0b11, asm, ZPR64, cpy_imm8_opt_lsl_i64>;
+
+ def : InstAlias<"mov $Zd, $imm",
+ (!cast<Instruction>(NAME # _B) ZPR8:$Zd, cpy_imm8_opt_lsl_i8:$imm), 1>;
+ def : InstAlias<"mov $Zd, $imm",
+ (!cast<Instruction>(NAME # _H) ZPR16:$Zd, cpy_imm8_opt_lsl_i16:$imm), 1>;
+ def : InstAlias<"mov $Zd, $imm",
+ (!cast<Instruction>(NAME # _S) ZPR32:$Zd, cpy_imm8_opt_lsl_i32:$imm), 1>;
+ def : InstAlias<"mov $Zd, $imm",
+ (!cast<Instruction>(NAME # _D) ZPR64:$Zd, cpy_imm8_opt_lsl_i64:$imm), 1>;
+
+ def : InstAlias<"fmov $Zd, #0.0",
+ (!cast<Instruction>(NAME # _H) ZPR16:$Zd, 0, 0), 1>;
+ def : InstAlias<"fmov $Zd, #0.0",
+ (!cast<Instruction>(NAME # _S) ZPR32:$Zd, 0, 0), 1>;
+ def : InstAlias<"fmov $Zd, #0.0",
+ (!cast<Instruction>(NAME # _D) ZPR64:$Zd, 0, 0), 1>;
+}
+
+class sve_int_dup_fpimm<bits<2> sz8_64, Operand fpimmtype,
+ string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins fpimmtype:$imm8),
+ asm, "\t$Zd, $imm8",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<8> imm8;
+ let Inst{31-24} = 0b00100101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-14} = 0b11100111;
+ let Inst{13} = 0b0;
+ let Inst{12-5} = imm8;
+ let Inst{4-0} = Zd;
+
+ let isReMaterializable = 1;
+}
+
+multiclass sve_int_dup_fpimm<string asm> {
+ def _H : sve_int_dup_fpimm<0b01, fpimm16, asm, ZPR16>;
+ def _S : sve_int_dup_fpimm<0b10, fpimm32, asm, ZPR32>;
+ def _D : sve_int_dup_fpimm<0b11, fpimm64, asm, ZPR64>;
+
+ def : InstAlias<"fmov $Zd, $imm8",
+ (!cast<Instruction>(NAME # _H) ZPR16:$Zd, fpimm16:$imm8), 1>;
+ def : InstAlias<"fmov $Zd, $imm8",
+ (!cast<Instruction>(NAME # _S) ZPR32:$Zd, fpimm32:$imm8), 1>;
+ def : InstAlias<"fmov $Zd, $imm8",
+ (!cast<Instruction>(NAME # _D) ZPR64:$Zd, fpimm64:$imm8), 1>;
+}
+
+class sve_int_arith_imm0<bits<2> sz8_64, bits<3> opc, string asm,
+ ZPRRegOp zprty, Operand immtype>
+: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, immtype:$imm),
+ asm, "\t$Zdn, $_Zdn, $imm",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zdn;
+ bits<9> imm;
+ let Inst{31-24} = 0b00100101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-19} = 0b100;
+ let Inst{18-16} = opc;
+ let Inst{15-14} = 0b11;
+ let Inst{13} = imm{8}; // sh
+ let Inst{12-5} = imm{7-0}; // imm8
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_int_arith_imm0<bits<3> opc, string asm> {
+ def _B : sve_int_arith_imm0<0b00, opc, asm, ZPR8, addsub_imm8_opt_lsl_i8>;
+ def _H : sve_int_arith_imm0<0b01, opc, asm, ZPR16, addsub_imm8_opt_lsl_i16>;
+ def _S : sve_int_arith_imm0<0b10, opc, asm, ZPR32, addsub_imm8_opt_lsl_i32>;
+ def _D : sve_int_arith_imm0<0b11, opc, asm, ZPR64, addsub_imm8_opt_lsl_i64>;
+}
+
+class sve_int_arith_imm<bits<2> sz8_64, bits<6> opc, string asm,
+ ZPRRegOp zprty, Operand immtype>
+: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, immtype:$imm),
+ asm, "\t$Zdn, $_Zdn, $imm",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zdn;
+ bits<8> imm;
+ let Inst{31-24} = 0b00100101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-16} = opc;
+ let Inst{15-13} = 0b110;
+ let Inst{12-5} = imm;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_int_arith_imm1<bits<2> opc, string asm, Operand immtype> {
+ def _B : sve_int_arith_imm<0b00, { 0b1010, opc }, asm, ZPR8, immtype>;
+ def _H : sve_int_arith_imm<0b01, { 0b1010, opc }, asm, ZPR16, immtype>;
+ def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, immtype>;
+ def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, immtype>;
+}
+
+multiclass sve_int_arith_imm2<string asm> {
+ def _B : sve_int_arith_imm<0b00, 0b110000, asm, ZPR8, simm8>;
+ def _H : sve_int_arith_imm<0b01, 0b110000, asm, ZPR16, simm8>;
+ def _S : sve_int_arith_imm<0b10, 0b110000, asm, ZPR32, simm8>;
+ def _D : sve_int_arith_imm<0b11, 0b110000, asm, ZPR64, simm8>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Bitwise Logical - Unpredicated Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_bin_cons_log<bits<2> opc, string asm>
+: I<(outs ZPR64:$Zd), (ins ZPR64:$Zn, ZPR64:$Zm),
+ asm, "\t$Zd, $Zn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zm;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = opc{1-0};
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Zm;
+ let Inst{15-10} = 0b001100;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Wide Immediate - Predicated Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_dup_fpimm_pred<bits<2> sz, Operand fpimmtype,
+ string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPRAny:$Pg, fpimmtype:$imm8),
+ asm, "\t$Zd, $Pg/m, $imm8",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pg;
+ bits<5> Zd;
+ bits<8> imm8;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz;
+ let Inst{21-20} = 0b01;
+ let Inst{19-16} = Pg;
+ let Inst{15-13} = 0b110;
+ let Inst{12-5} = imm8;
+ let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
+}
+
+multiclass sve_int_dup_fpimm_pred<string asm> {
+ def _H : sve_int_dup_fpimm_pred<0b01, fpimm16, asm, ZPR16>;
+ def _S : sve_int_dup_fpimm_pred<0b10, fpimm32, asm, ZPR32>;
+ def _D : sve_int_dup_fpimm_pred<0b11, fpimm64, asm, ZPR64>;
+
+ def : InstAlias<"fmov $Zd, $Pg/m, $imm8",
+ (!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, fpimm16:$imm8), 1>;
+ def : InstAlias<"fmov $Zd, $Pg/m, $imm8",
+ (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, fpimm32:$imm8), 1>;
+ def : InstAlias<"fmov $Zd, $Pg/m, $imm8",
+ (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, fpimm64:$imm8), 1>;
+}
+
+class sve_int_dup_imm_pred<bits<2> sz8_64, bit m, string asm,
+ ZPRRegOp zprty, string pred_qual, dag iops>
+: I<(outs zprty:$Zd), iops,
+ asm, "\t$Zd, $Pg"#pred_qual#", $imm",
+ "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<4> Pg;
+ bits<9> imm;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-20} = 0b01;
+ let Inst{19-16} = Pg;
+ let Inst{15} = 0b0;
+ let Inst{14} = m;
+ let Inst{13} = imm{8}; // sh
+ let Inst{12-5} = imm{7-0}; // imm8
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_dup_imm_pred_merge<string asm> {
+ let Constraints = "$Zd = $_Zd" in {
+ def _B : sve_int_dup_imm_pred<0b00, 1, asm, ZPR8, "/m", (ins ZPR8:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm)>;
+ def _H : sve_int_dup_imm_pred<0b01, 1, asm, ZPR16, "/m", (ins ZPR16:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm)>;
+ def _S : sve_int_dup_imm_pred<0b10, 1, asm, ZPR32, "/m", (ins ZPR32:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm)>;
+ def _D : sve_int_dup_imm_pred<0b11, 1, asm, ZPR64, "/m", (ins ZPR64:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm)>;
+ }
+
+ def : InstAlias<"mov $Zd, $Pg/m, $imm",
+ (!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm), 1>;
+ def : InstAlias<"mov $Zd, $Pg/m, $imm",
+ (!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm), 1>;
+ def : InstAlias<"mov $Zd, $Pg/m, $imm",
+ (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm), 1>;
+ def : InstAlias<"mov $Zd, $Pg/m, $imm",
+ (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm), 1>;
+
+ def : InstAlias<"fmov $Zd, $Pg/m, #0.0",
+ (!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, 0, 0), 0>;
+ def : InstAlias<"fmov $Zd, $Pg/m, #0.0",
+ (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, 0, 0), 0>;
+ def : InstAlias<"fmov $Zd, $Pg/m, #0.0",
+ (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, 0, 0), 0>;
+}
+
+multiclass sve_int_dup_imm_pred_zero<string asm> {
+ def _B : sve_int_dup_imm_pred<0b00, 0, asm, ZPR8, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm)>;
+ def _H : sve_int_dup_imm_pred<0b01, 0, asm, ZPR16, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm)>;
+ def _S : sve_int_dup_imm_pred<0b10, 0, asm, ZPR32, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm)>;
+ def _D : sve_int_dup_imm_pred<0b11, 0, asm, ZPR64, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm)>;
+
+ def : InstAlias<"mov $Zd, $Pg/z, $imm",
+ (!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm), 1>;
+ def : InstAlias<"mov $Zd, $Pg/z, $imm",
+ (!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm), 1>;
+ def : InstAlias<"mov $Zd, $Pg/z, $imm",
+ (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm), 1>;
+ def : InstAlias<"mov $Zd, $Pg/z, $imm",
+ (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm), 1>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Compare - Vectors Group
+//===----------------------------------------------------------------------===//
+
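+// Integer compares produce a predicate result and also set the condition
+// flags (see the NZCV def below). The _wide multiclasses compare .B/.H/.S
+// elements against the overlapping .D elements of the second source.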
+class sve_int_cmp<bit cmp_1, bits<2> sz8_64, bits<3> opc, string asm,
+ PPRRegOp pprty, ZPRRegOp zprty1, ZPRRegOp zprty2>
+: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty1:$Zn, zprty2:$Zm),
+ asm, "\t$Pd, $Pg/z, $Zn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pd;
+ bits<3> Pg;
+ bits<5> Zm;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00100100;
+ let Inst{23-22} = sz8_64;
+ let Inst{21} = 0b0;
+ let Inst{20-16} = Zm;
+ let Inst{15} = opc{2};
+ let Inst{14} = cmp_1;
+ let Inst{13} = opc{1};
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4} = opc{0};
+ let Inst{3-0} = Pd;
+
+ let Defs = [NZCV];
+}
+
+multiclass sve_int_cmp_0<bits<3> opc, string asm> {
+ def _B : sve_int_cmp<0b0, 0b00, opc, asm, PPR8, ZPR8, ZPR8>;
+ def _H : sve_int_cmp<0b0, 0b01, opc, asm, PPR16, ZPR16, ZPR16>;
+ def _S : sve_int_cmp<0b0, 0b10, opc, asm, PPR32, ZPR32, ZPR32>;
+ def _D : sve_int_cmp<0b0, 0b11, opc, asm, PPR64, ZPR64, ZPR64>;
+}
+
+multiclass sve_int_cmp_0_wide<bits<3> opc, string asm> {
+ def _B : sve_int_cmp<0b0, 0b00, opc, asm, PPR8, ZPR8, ZPR64>;
+ def _H : sve_int_cmp<0b0, 0b01, opc, asm, PPR16, ZPR16, ZPR64>;
+ def _S : sve_int_cmp<0b0, 0b10, opc, asm, PPR32, ZPR32, ZPR64>;
+}
+
+multiclass sve_int_cmp_1_wide<bits<3> opc, string asm> {
+ def _B : sve_int_cmp<0b1, 0b00, opc, asm, PPR8, ZPR8, ZPR64>;
+ def _H : sve_int_cmp<0b1, 0b01, opc, asm, PPR16, ZPR16, ZPR64>;
+ def _S : sve_int_cmp<0b1, 0b10, opc, asm, PPR32, ZPR32, ZPR64>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Compare - Signed Immediate Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_scmp_vi<bits<2> sz8_64, bits<3> opc, string asm, PPRRegOp pprty,
+ ZPRRegOp zprty,
+ Operand immtype>
+: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty:$Zn, immtype:$imm5),
+ asm, "\t$Pd, $Pg/z, $Zn, $imm5",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pd;
+ bits<3> Pg;
+ bits<5> Zn;
+ bits<5> imm5;
+ let Inst{31-24} = 0b00100101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21} = 0b0;
+ let Inst{20-16} = imm5;
+ let Inst{15} = opc{2};
+ let Inst{14} = 0b0;
+ let Inst{13} = opc{1};
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4} = opc{0};
+ let Inst{3-0} = Pd;
+
+ let Defs = [NZCV];
+}
+
+multiclass sve_int_scmp_vi<bits<3> opc, string asm> {
+ def _B : sve_int_scmp_vi<0b00, opc, asm, PPR8, ZPR8, simm5_32b>;
+ def _H : sve_int_scmp_vi<0b01, opc, asm, PPR16, ZPR16, simm5_32b>;
+ def _S : sve_int_scmp_vi<0b10, opc, asm, PPR32, ZPR32, simm5_32b>;
+ def _D : sve_int_scmp_vi<0b11, opc, asm, PPR64, ZPR64, simm5_64b>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Compare - Unsigned Immediate Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_ucmp_vi<bits<2> sz8_64, bits<2> opc, string asm, PPRRegOp pprty,
+ ZPRRegOp zprty, Operand immtype>
+: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty:$Zn, immtype:$imm7),
+ asm, "\t$Pd, $Pg/z, $Zn, $imm7",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pd;
+ bits<3> Pg;
+ bits<5> Zn;
+ bits<7> imm7;
+ let Inst{31-24} = 0b00100100;
+ let Inst{23-22} = sz8_64;
+ let Inst{21} = 1;
+ let Inst{20-14} = imm7;
+ let Inst{13} = opc{1};
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4} = opc{0};
+ let Inst{3-0} = Pd;
+
+ let Defs = [NZCV];
+}
+
+multiclass sve_int_ucmp_vi<bits<2> opc, string asm> {
+ def _B : sve_int_ucmp_vi<0b00, opc, asm, PPR8, ZPR8, imm0_127>;
+ def _H : sve_int_ucmp_vi<0b01, opc, asm, PPR16, ZPR16, imm0_127>;
+ def _S : sve_int_ucmp_vi<0b10, opc, asm, PPR32, ZPR32, imm0_127>;
+ def _D : sve_int_ucmp_vi<0b11, opc, asm, PPR64, ZPR64, imm0_127>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SVE Floating Point Fast Reduction Group
+//===----------------------------------------------------------------------===//
+
+class sve_fp_fast_red<bits<2> sz, bits<3> opc, string asm,
+ ZPRRegOp zprty, RegisterClass dstRegClass>
+: I<(outs dstRegClass:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn),
+ asm, "\t$Vd, $Pg, $Zn",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zn;
+ bits<5> Vd;
+ bits<3> Pg;
+ let Inst{31-24} = 0b01100101;
+ let Inst{23-22} = sz;
+ let Inst{21-19} = 0b000;
+ let Inst{18-16} = opc;
+ let Inst{15-13} = 0b001;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Vd;
+}
+
+multiclass sve_fp_fast_red<bits<3> opc, string asm> {
+ def _H : sve_fp_fast_red<0b01, opc, asm, ZPR16, FPR16>;
+ def _S : sve_fp_fast_red<0b10, opc, asm, ZPR32, FPR32>;
+ def _D : sve_fp_fast_red<0b11, opc, asm, ZPR64, FPR64>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SVE Floating Point Accumulating Reduction Group
+//===----------------------------------------------------------------------===//
+
+class sve_fp_2op_p_vd<bits<2> sz, bits<3> opc, string asm,
+ ZPRRegOp zprty, RegisterClass dstRegClass>
+: I<(outs dstRegClass:$Vdn), (ins PPR3bAny:$Pg, dstRegClass:$_Vdn, zprty:$Zm),
+ asm, "\t$Vdn, $Pg, $_Vdn, $Zm",
+ "",
+ []>,
+ Sched<[]> {
+ bits<3> Pg;
+ bits<5> Vdn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b01100101;
+ let Inst{23-22} = sz;
+ let Inst{21-19} = 0b011;
+ let Inst{18-16} = opc;
+ let Inst{15-13} = 0b001;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zm;
+ let Inst{4-0} = Vdn;
+
+ let Constraints = "$Vdn = $_Vdn";
+}
+
+multiclass sve_fp_2op_p_vd<bits<3> opc, string asm> {
+ def _H : sve_fp_2op_p_vd<0b01, opc, asm, ZPR16, FPR16>;
+ def _S : sve_fp_2op_p_vd<0b10, opc, asm, ZPR32, FPR32>;
+ def _D : sve_fp_2op_p_vd<0b11, opc, asm, ZPR64, FPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Floating Point Compare - Vectors Group
+//===----------------------------------------------------------------------===//
+
+class sve_fp_3op_p_pd<bits<2> sz, bits<3> opc, string asm, PPRRegOp pprty,
+ ZPRRegOp zprty>
+: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty:$Zn, zprty:$Zm),
+ asm, "\t$Pd, $Pg/z, $Zn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pd;
+ bits<3> Pg;
+ bits<5> Zm;
+ bits<5> Zn;
+ let Inst{31-24} = 0b01100101;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b0;
+ let Inst{20-16} = Zm;
+ let Inst{15} = opc{2};
+ let Inst{14} = 0b1;
+ let Inst{13} = opc{1};
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4} = opc{0};
+ let Inst{3-0} = Pd;
+}
+
+multiclass sve_fp_3op_p_pd<bits<3> opc, string asm> {
+ def _H : sve_fp_3op_p_pd<0b01, opc, asm, PPR16, ZPR16>;
+ def _S : sve_fp_3op_p_pd<0b10, opc, asm, PPR32, ZPR32>;
+ def _D : sve_fp_3op_p_pd<0b11, opc, asm, PPR64, ZPR64>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SVE Floating Point Compare - with Zero Group
+//===----------------------------------------------------------------------===//
+
+class sve_fp_2op_p_pd<bits<2> sz, bits<3> opc, string asm, PPRRegOp pprty,
+ ZPRRegOp zprty>
+: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty:$Zn),
+ asm, "\t$Pd, $Pg/z, $Zn, #0.0",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pd;
+ bits<3> Pg;
+ bits<5> Zn;
+ let Inst{31-24} = 0b01100101;
+ let Inst{23-22} = sz;
+ let Inst{21-18} = 0b0100;
+ let Inst{17-16} = opc{2-1};
+ let Inst{15-13} = 0b001;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4} = opc{0};
+ let Inst{3-0} = Pd;
+}
+
+multiclass sve_fp_2op_p_pd<bits<3> opc, string asm> {
+ def _H : sve_fp_2op_p_pd<0b01, opc, asm, PPR16, ZPR16>;
+ def _S : sve_fp_2op_p_pd<0b10, opc, asm, PPR32, ZPR32>;
+ def _D : sve_fp_2op_p_pd<0b11, opc, asm, PPR64, ZPR64>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SVE Index Generation Group
+//===----------------------------------------------------------------------===//
+
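+// INDEX: materialise an arithmetic series, element i = base + i * step, with
+// immediate/register combinations of base and step (ii, ir, ri, rr).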
+class sve_int_index_ii<bits<2> sz8_64, string asm, ZPRRegOp zprty,
+ Operand imm_ty>
+: I<(outs zprty:$Zd), (ins imm_ty:$imm5, imm_ty:$imm5b),
+ asm, "\t$Zd, $imm5, $imm5b",
+ "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> imm5;
+ bits<5> imm5b;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = sz8_64;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = imm5b;
+ let Inst{15-10} = 0b010000;
+ let Inst{9-5} = imm5;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_index_ii<string asm> {
+ def _B : sve_int_index_ii<0b00, asm, ZPR8, simm5_32b>;
+ def _H : sve_int_index_ii<0b01, asm, ZPR16, simm5_32b>;
+ def _S : sve_int_index_ii<0b10, asm, ZPR32, simm5_32b>;
+ def _D : sve_int_index_ii<0b11, asm, ZPR64, simm5_64b>;
+}
+
+class sve_int_index_ir<bits<2> sz8_64, string asm, ZPRRegOp zprty,
+ RegisterClass srcRegType, Operand imm_ty>
+: I<(outs zprty:$Zd), (ins imm_ty:$imm5, srcRegType:$Rm),
+ asm, "\t$Zd, $imm5, $Rm",
+ "", []>, Sched<[]> {
+ bits<5> Rm;
+ bits<5> Zd;
+ bits<5> imm5;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = sz8_64;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Rm;
+ let Inst{15-10} = 0b010010;
+ let Inst{9-5} = imm5;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_index_ir<string asm> {
+ def _B : sve_int_index_ir<0b00, asm, ZPR8, GPR32, simm5_32b>;
+ def _H : sve_int_index_ir<0b01, asm, ZPR16, GPR32, simm5_32b>;
+ def _S : sve_int_index_ir<0b10, asm, ZPR32, GPR32, simm5_32b>;
+ def _D : sve_int_index_ir<0b11, asm, ZPR64, GPR64, simm5_64b>;
+}
+
+class sve_int_index_ri<bits<2> sz8_64, string asm, ZPRRegOp zprty,
+ RegisterClass srcRegType, Operand imm_ty>
+: I<(outs zprty:$Zd), (ins srcRegType:$Rn, imm_ty:$imm5),
+ asm, "\t$Zd, $Rn, $imm5",
+ "", []>, Sched<[]> {
+ bits<5> Rn;
+ bits<5> Zd;
+ bits<5> imm5;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = sz8_64;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = imm5;
+ let Inst{15-10} = 0b010001;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_index_ri<string asm> {
+ def _B : sve_int_index_ri<0b00, asm, ZPR8, GPR32, simm5_32b>;
+ def _H : sve_int_index_ri<0b01, asm, ZPR16, GPR32, simm5_32b>;
+ def _S : sve_int_index_ri<0b10, asm, ZPR32, GPR32, simm5_32b>;
+ def _D : sve_int_index_ri<0b11, asm, ZPR64, GPR64, simm5_64b>;
+}
+
+class sve_int_index_rr<bits<2> sz8_64, string asm, ZPRRegOp zprty,
+ RegisterClass srcRegType>
+: I<(outs zprty:$Zd), (ins srcRegType:$Rn, srcRegType:$Rm),
+ asm, "\t$Zd, $Rn, $Rm",
+ "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Rm;
+ bits<5> Rn;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = sz8_64;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Rm;
+ let Inst{15-10} = 0b010011;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_index_rr<string asm> {
+ def _B : sve_int_index_rr<0b00, asm, ZPR8, GPR32>;
+ def _H : sve_int_index_rr<0b01, asm, ZPR16, GPR32>;
+ def _S : sve_int_index_rr<0b10, asm, ZPR32, GPR32>;
+ def _D : sve_int_index_rr<0b11, asm, ZPR64, GPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Bitwise Shift - Predicated Group
+//===----------------------------------------------------------------------===//
+
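+// Predicated shifts by immediate: the element size and shift amount are
+// encoded jointly in tsz8_64:imm3, which is why the per-size multiclasses
+// below override individual Inst bits; the vecshiftL*/vecshiftR* operands
+// take care of encoding the amount.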
+class sve_int_bin_pred_shift_imm<bits<4> tsz8_64, bits<3> opc, string asm,
+ ZPRRegOp zprty, Operand immtype>
+: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, immtype:$imm),
+ asm, "\t$Zdn, $Pg/m, $_Zdn, $imm",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zdn;
+ bits<6> imm;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = tsz8_64{3-2};
+ let Inst{21-19} = 0b000;
+ let Inst{18-16} = opc;
+ let Inst{15-13} = 0b100;
+ let Inst{12-10} = Pg;
+ let Inst{9-8} = tsz8_64{1-0};
+ let Inst{7-5} = imm{2-0}; // imm3
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_int_bin_pred_shift_imm_left<bits<3> opc, string asm> {
+ def _B : sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
+ def _H : sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
+ let Inst{8} = imm{3};
+ }
+ def _S : sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
+ let Inst{9-8} = imm{4-3};
+ }
+ def _D : sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
+ let Inst{22} = imm{5};
+ let Inst{9-8} = imm{4-3};
+ }
+}
+
+multiclass sve_int_bin_pred_shift_imm_right<bits<3> opc, string asm> {
+ def _B : sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
+ def _H : sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
+ let Inst{8} = imm{3};
+ }
+ def _S : sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
+ let Inst{9-8} = imm{4-3};
+ }
+ def _D : sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
+ let Inst{22} = imm{5};
+ let Inst{9-8} = imm{4-3};
+ }
+}
+
+class sve_int_bin_pred_shift<bits<2> sz8_64, bit wide, bits<3> opc,
+ string asm, ZPRRegOp zprty, ZPRRegOp zprty2>
+: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty2:$Zm),
+ asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zdn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-20} = 0b01;
+ let Inst{19} = wide;
+ let Inst{18-16} = opc;
+ let Inst{15-13} = 0b100;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zm;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_int_bin_pred_shift<bits<3> opc, string asm> {
+ def _B : sve_int_bin_pred_shift<0b00, 0b0, opc, asm, ZPR8, ZPR8>;
+ def _H : sve_int_bin_pred_shift<0b01, 0b0, opc, asm, ZPR16, ZPR16>;
+ def _S : sve_int_bin_pred_shift<0b10, 0b0, opc, asm, ZPR32, ZPR32>;
+ def _D : sve_int_bin_pred_shift<0b11, 0b0, opc, asm, ZPR64, ZPR64>;
+}
+
+multiclass sve_int_bin_pred_shift_wide<bits<3> opc, string asm> {
+ def _B : sve_int_bin_pred_shift<0b00, 0b1, opc, asm, ZPR8, ZPR64>;
+ def _H : sve_int_bin_pred_shift<0b01, 0b1, opc, asm, ZPR16, ZPR64>;
+ def _S : sve_int_bin_pred_shift<0b10, 0b1, opc, asm, ZPR32, ZPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Shift - Unpredicated Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_bin_cons_shift_wide<bits<2> sz8_64, bits<2> opc, string asm,
+ ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins zprty:$Zn, ZPR64:$Zm),
+ asm, "\t$Zd, $Zn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zm;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = sz8_64;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Zm;
+ let Inst{15-12} = 0b1000;
+ let Inst{11-10} = opc;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_bin_cons_shift_wide<bits<2> opc, string asm> {
+ def _B : sve_int_bin_cons_shift_wide<0b00, opc, asm, ZPR8>;
+ def _H : sve_int_bin_cons_shift_wide<0b01, opc, asm, ZPR16>;
+ def _S : sve_int_bin_cons_shift_wide<0b10, opc, asm, ZPR32>;
+}
+
+class sve_int_bin_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
+ ZPRRegOp zprty, Operand immtype>
+: I<(outs zprty:$Zd), (ins zprty:$Zn, immtype:$imm),
+ asm, "\t$Zd, $Zn, $imm",
+ "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ bits<6> imm;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = tsz8_64{3-2};
+ let Inst{21} = 0b1;
+ let Inst{20-19} = tsz8_64{1-0};
+ let Inst{18-16} = imm{2-0}; // imm3
+ let Inst{15-12} = 0b1001;
+ let Inst{11-10} = opc;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_bin_cons_shift_imm_left<bits<2> opc, string asm> {
+ def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
+ def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
+ let Inst{19} = imm{3};
+ }
+ def _S : sve_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
+ let Inst{20-19} = imm{4-3};
+ }
+ def _D : sve_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
+ let Inst{22} = imm{5};
+ let Inst{20-19} = imm{4-3};
+ }
+}
+
+multiclass sve_int_bin_cons_shift_imm_right<bits<2> opc, string asm> {
+ def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
+ def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
+ let Inst{19} = imm{3};
+ }
+ def _S : sve_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
+ let Inst{20-19} = imm{4-3};
+ }
+ def _D : sve_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
+ let Inst{22} = imm{5};
+ let Inst{20-19} = imm{4-3};
+ }
+}
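// A usage sketch (names and opcode bits are illustrative): the unpredicated
// shift-by-immediate forms would be instantiated along the lines of
//   defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr">;
//   defm LSL_ZZI : sve_int_bin_cons_shift_imm_left<0b11, "lsl">;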
+//===----------------------------------------------------------------------===//
+// SVE Memory - Store Group
+//===----------------------------------------------------------------------===//
+
+class sve_mem_cst_si<bits<2> msz, bits<2> esz, string asm,
+ RegisterOperand VecList>
+: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4),
+ asm, "\t$Zt, $Pg, [$Rn, $imm4, mul vl]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<5> Zt;
+ bits<4> imm4;
+ let Inst{31-25} = 0b1110010;
+ let Inst{24-23} = msz;
+ let Inst{22-21} = esz;
+ let Inst{20} = 0;
+ let Inst{19-16} = imm4;
+ let Inst{15-13} = 0b111;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayStore = 1;
+}
+
+multiclass sve_mem_cst_si<bits<2> msz, bits<2> esz, string asm,
+ RegisterOperand listty, ZPRRegOp zprty>
+{
+ def NAME : sve_mem_cst_si<msz, esz, asm, listty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $imm4, mul vl]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn]",
+ (!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+}
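// A usage sketch (the ST1B instantiation is an assumption, not part of this
// patch): a contiguous byte store would be built from this multiclass as
//   defm ST1B : sve_mem_cst_si<0b00, 0b00, "st1b", Z_b, ZPR8>;
// so that "st1b z0.b, p0, [x0, #1, mul vl]" and the offset-less alias
// "st1b z0.b, p0, [x0]" both assemble to the same encoding.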
+
+class sve_mem_est_si<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
+ string asm, Operand immtype>
+: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, immtype:$imm4),
+ asm, "\t$Zt, $Pg, [$Rn, $imm4, mul vl]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<5> Zt;
+ bits<4> imm4;
+ let Inst{31-25} = 0b1110010;
+ let Inst{24-23} = sz;
+ let Inst{22-21} = nregs;
+ let Inst{20} = 1;
+ let Inst{19-16} = imm4;
+ let Inst{15-13} = 0b111;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayStore = 1;
+}
+
+multiclass sve_mem_est_si<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
+ string asm, Operand immtype> {
+ def NAME : sve_mem_est_si<sz, nregs, VecList, asm, immtype>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn]",
+ (!cast<Instruction>(NAME) VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+}
+
+class sve_mem_est_ss<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
+ string asm, RegisterOperand gprty>
+: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
+ asm, "\t$Zt, $Pg, [$Rn, $Rm]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rm;
+ bits<5> Rn;
+ bits<5> Zt;
+ let Inst{31-25} = 0b1110010;
+ let Inst{24-23} = sz;
+ let Inst{22-21} = nregs;
+ let Inst{20-16} = Rm;
+ let Inst{15-13} = 0b011;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayStore = 1;
+}
+
+class sve_mem_cst_ss_base<bits<4> dtype, string asm,
+ RegisterOperand listty, RegisterOperand gprty>
+: I<(outs), (ins listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
+ asm, "\t$Zt, $Pg, [$Rn, $Rm]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rm;
+ bits<5> Rn;
+ bits<5> Zt;
+ let Inst{31-25} = 0b1110010;
+ let Inst{24-21} = dtype;
+ let Inst{20-16} = Rm;
+ let Inst{15-13} = 0b010;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayStore = 1;
+}
+
+multiclass sve_mem_cst_ss<bits<4> dtype, string asm,
+ RegisterOperand listty, ZPRRegOp zprty,
+ RegisterOperand gprty> {
+ def NAME : sve_mem_cst_ss_base<dtype, asm, listty, gprty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Rm]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
+}
+
+class sve_mem_cstnt_si<bits<2> msz, string asm, RegisterOperand VecList>
+: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4),
+ asm, "\t$Zt, $Pg, [$Rn, $imm4, mul vl]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<5> Zt;
+ bits<4> imm4;
+ let Inst{31-25} = 0b1110010;
+ let Inst{24-23} = msz;
+ let Inst{22-20} = 0b001;
+ let Inst{19-16} = imm4;
+ let Inst{15-13} = 0b111;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayStore = 1;
+}
+
+multiclass sve_mem_cstnt_si<bits<2> msz, string asm, RegisterOperand listty,
+ ZPRRegOp zprty> {
+ def NAME : sve_mem_cstnt_si<msz, asm, listty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $imm4, mul vl]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn]",
+ (!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+}
+
+class sve_mem_cstnt_ss_base<bits<2> msz, string asm, RegisterOperand listty,
+ RegisterOperand gprty>
+: I<(outs), (ins listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
+ asm, "\t$Zt, $Pg, [$Rn, $Rm]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rm;
+ bits<5> Rn;
+ bits<5> Zt;
+ let Inst{31-25} = 0b1110010;
+ let Inst{24-23} = msz;
+ let Inst{22-21} = 0b00;
+ let Inst{20-16} = Rm;
+ let Inst{15-13} = 0b011;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayStore = 1;
+}
+
+multiclass sve_mem_cstnt_ss<bits<2> msz, string asm, RegisterOperand listty,
+ ZPRRegOp zprty, RegisterOperand gprty> {
+ def NAME : sve_mem_cstnt_ss_base<msz, asm, listty, gprty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Rm]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
+}
+
+class sve_mem_sst_sv<bits<3> opc, bit xs, bit scaled, string asm,
+ RegisterOperand VecList, RegisterOperand zprext>
+: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
+ asm, "\t$Zt, $Pg, [$Rn, $Zm]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<5> Zm;
+ bits<5> Zt;
+ let Inst{31-25} = 0b1110010;
+ let Inst{24-22} = opc;
+ let Inst{21} = scaled;
+ let Inst{20-16} = Zm;
+ let Inst{15} = 0b1;
+ let Inst{14} = xs;
+ let Inst{13} = 0;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayStore = 1;
+}
+
+multiclass sve_mem_sst_sv_32_scaled<bits<3> opc, string asm,
+ RegisterOperand listty,
+ ZPRRegOp zprty,
+ RegisterOperand sxtw_opnd,
+ RegisterOperand uxtw_opnd > {
+ def _UXTW_SCALED : sve_mem_sst_sv<opc, 0, 1, asm, listty, uxtw_opnd>;
+ def _SXTW_SCALED : sve_mem_sst_sv<opc, 1, 1, asm, listty, sxtw_opnd>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _UXTW_SCALED) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _SXTW_SCALED) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
+}
+
+multiclass sve_mem_sst_sv_32_unscaled<bits<3> opc, string asm,
+ RegisterOperand listty,
+ ZPRRegOp zprty,
+ RegisterOperand sxtw_opnd,
+ RegisterOperand uxtw_opnd> {
+ def _UXTW : sve_mem_sst_sv<opc, 0, 0, asm, listty, uxtw_opnd>;
+ def _SXTW : sve_mem_sst_sv<opc, 1, 0, asm, listty, sxtw_opnd>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _UXTW) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _SXTW) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
+}
+
+class sve_mem_sst_sv2<bits<2> msz, bit scaled, string asm,
+ RegisterOperand zprext>
+: I<(outs), (ins Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
+ asm, "\t$Zt, $Pg, [$Rn, $Zm]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<5> Zm;
+ bits<5> Zt;
+ let Inst{31-25} = 0b1110010;
+ let Inst{24-23} = msz;
+ let Inst{22} = 0b0;
+ let Inst{21} = scaled;
+ let Inst{20-16} = Zm;
+ let Inst{15-13} = 0b101;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayStore = 1;
+}
+
+multiclass sve_mem_sst_sv_64_scaled<bits<2> msz, string asm,
+ RegisterOperand zprext> {
+ def "" : sve_mem_sst_sv2<msz, 1, asm, zprext>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>;
+
+}
+
+multiclass sve_mem_sst_sv_64_unscaled<bits<2> msz, string asm> {
+ def "" : sve_mem_sst_sv2<msz, 0, asm, ZPR64ExtLSL8>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>;
+}
+
+class sve_mem_sst_vi<bits<3> opc, string asm, ZPRRegOp zprty,
+ RegisterOperand VecList, Operand imm_ty>
+: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, zprty:$Zn, imm_ty:$imm5),
+ asm, "\t$Zt, $Pg, [$Zn, $imm5]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> imm5;
+ bits<5> Zn;
+ bits<5> Zt;
+ let Inst{31-25} = 0b1110010;
+ let Inst{24-23} = opc{2-1};
+ let Inst{22} = 0b1;
+ let Inst{21} = opc{0};
+ let Inst{20-16} = imm5;
+ let Inst{15-13} = 0b101;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zt;
+
+ let mayStore = 1;
+}
+
+multiclass sve_mem_sst_vi_ptrs<bits<3> opc, string asm, RegisterOperand listty,
+ ZPRRegOp zprty, Operand imm_ty> {
+ def _IMM : sve_mem_sst_vi<opc, asm, zprty, listty, imm_ty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
+ (!cast<Instruction>(NAME # _IMM) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, 0), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $imm5]",
+ (!cast<Instruction>(NAME # _IMM) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, imm_ty:$imm5), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
+ (!cast<Instruction>(NAME # _IMM) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, 0), 1>;
+}
+
+class sve_mem_z_spill<string asm>
+: I<(outs), (ins ZPRAny:$Zt, GPR64sp:$Rn, simm9:$imm9),
+ asm, "\t$Zt, [$Rn, $imm9, mul vl]",
+ "",
+ []>, Sched<[]> {
+ bits<5> Rn;
+ bits<5> Zt;
+ bits<9> imm9;
+ let Inst{31-22} = 0b1110010110;
+ let Inst{21-16} = imm9{8-3};
+ let Inst{15-13} = 0b010;
+ let Inst{12-10} = imm9{2-0};
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayStore = 1;
+}
+
+multiclass sve_mem_z_spill<string asm> {
+ def NAME : sve_mem_z_spill<asm>;
+
+ def : InstAlias<asm # "\t$Zt, [$Rn]",
+ (!cast<Instruction>(NAME) ZPRAny:$Zt, GPR64sp:$Rn, 0), 1>;
+}
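// A usage sketch (the STR_ZXI name is an assumption): the vector spill form is
// expected to be instantiated simply as
//   defm STR_ZXI : sve_mem_z_spill<"str">;
// accepting "str z0, [x0, #-256, mul vl]" as well as the "[x0]" alias above.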
+
+class sve_mem_p_spill<string asm>
+: I<(outs), (ins PPRAny:$Pt, GPR64sp:$Rn, simm9:$imm9),
+ asm, "\t$Pt, [$Rn, $imm9, mul vl]",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pt;
+ bits<5> Rn;
+ bits<9> imm9;
+ let Inst{31-22} = 0b1110010110;
+ let Inst{21-16} = imm9{8-3};
+ let Inst{15-13} = 0b000;
+ let Inst{12-10} = imm9{2-0};
+ let Inst{9-5} = Rn;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = Pt;
+
+ let mayStore = 1;
+}
+
+multiclass sve_mem_p_spill<string asm> {
+ def NAME : sve_mem_p_spill<asm>;
+
+ def : InstAlias<asm # "\t$Pt, [$Rn]",
+ (!cast<Instruction>(NAME) PPRAny:$Pt, GPR64sp:$Rn, 0), 1>;
+}
+
+//===----------------------------------------------------------------------===//
// SVE Permute - Predicates Group
//===----------------------------------------------------------------------===//
@@ -100,4 +2866,1254 @@ multiclass sve_int_perm_bin_perm_pp<bits<3> opc, string asm> {
def _H : sve_int_perm_bin_perm_pp<opc, 0b01, asm, PPR16>;
def _S : sve_int_perm_bin_perm_pp<opc, 0b10, asm, PPR32>;
def _D : sve_int_perm_bin_perm_pp<opc, 0b11, asm, PPR64>;
-}
-}
\ No newline at end of file
+}
+
+class sve_int_perm_punpk<bit opc, string asm>
+: I<(outs PPR16:$Pd), (ins PPR8:$Pn),
+ asm, "\t$Pd, $Pn",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pd;
+ bits<4> Pn;
+ let Inst{31-17} = 0b000001010011000;
+ let Inst{16} = opc;
+ let Inst{15-9} = 0b0100000;
+ let Inst{8-5} = Pn;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = Pd;
+}
+
+class sve_int_rdffr_pred<bit s, string asm>
+: I<(outs PPR8:$Pd), (ins PPRAny:$Pg),
+ asm, "\t$Pd, $Pg/z",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pd;
+ bits<4> Pg;
+ let Inst{31-23} = 0b001001010;
+ let Inst{22} = s;
+ let Inst{21-9} = 0b0110001111000;
+ let Inst{8-5} = Pg;
+ let Inst{4} = 0;
+ let Inst{3-0} = Pd;
+
+ let Defs = !if(!eq (s, 1), [NZCV], []);
+ let Uses = [FFR];
+}
+
+class sve_int_rdffr_unpred<string asm> : I<
+ (outs PPR8:$Pd), (ins),
+ asm, "\t$Pd",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pd;
+ let Inst{31-4} = 0b0010010100011001111100000000;
+ let Inst{3-0} = Pd;
+
+ let Uses = [FFR];
+}
+
+class sve_int_wrffr<string asm>
+: I<(outs), (ins PPR8:$Pn),
+ asm, "\t$Pn",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pn;
+ let Inst{31-9} = 0b00100101001010001001000;
+ let Inst{8-5} = Pn;
+ let Inst{4-0} = 0b00000;
+
+ let hasSideEffects = 1;
+ let Defs = [FFR];
+}
+
+class sve_int_setffr<string asm>
+: I<(outs), (ins),
+ asm, "",
+ "",
+ []>, Sched<[]> {
+ let Inst{31-0} = 0b00100101001011001001000000000000;
+
+ let hasSideEffects = 1;
+ let Defs = [FFR];
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Permute Vector - Predicated Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_perm_clast_rz<bits<2> sz8_64, bit ab, string asm,
+ ZPRRegOp zprty, RegisterClass rt>
+: I<(outs rt:$Rdn), (ins PPR3bAny:$Pg, rt:$_Rdn, zprty:$Zm),
+ asm, "\t$Rdn, $Pg, $_Rdn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rdn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-17} = 0b11000;
+ let Inst{16} = ab;
+ let Inst{15-13} = 0b101;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zm;
+ let Inst{4-0} = Rdn;
+
+ let Constraints = "$Rdn = $_Rdn";
+}
+
+multiclass sve_int_perm_clast_rz<bit ab, string asm> {
+ def _B : sve_int_perm_clast_rz<0b00, ab, asm, ZPR8, GPR32>;
+ def _H : sve_int_perm_clast_rz<0b01, ab, asm, ZPR16, GPR32>;
+ def _S : sve_int_perm_clast_rz<0b10, ab, asm, ZPR32, GPR32>;
+ def _D : sve_int_perm_clast_rz<0b11, ab, asm, ZPR64, GPR64>;
+}
+
+class sve_int_perm_clast_vz<bits<2> sz8_64, bit ab, string asm,
+ ZPRRegOp zprty, RegisterClass rt>
+: I<(outs rt:$Vdn), (ins PPR3bAny:$Pg, rt:$_Vdn, zprty:$Zm),
+ asm, "\t$Vdn, $Pg, $_Vdn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Vdn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-17} = 0b10101;
+ let Inst{16} = ab;
+ let Inst{15-13} = 0b100;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zm;
+ let Inst{4-0} = Vdn;
+
+ let Constraints = "$Vdn = $_Vdn";
+}
+
+multiclass sve_int_perm_clast_vz<bit ab, string asm> {
+ def _B : sve_int_perm_clast_vz<0b00, ab, asm, ZPR8, FPR8>;
+ def _H : sve_int_perm_clast_vz<0b01, ab, asm, ZPR16, FPR16>;
+ def _S : sve_int_perm_clast_vz<0b10, ab, asm, ZPR32, FPR32>;
+ def _D : sve_int_perm_clast_vz<0b11, ab, asm, ZPR64, FPR64>;
+}
+
+class sve_int_perm_clast_zz<bits<2> sz8_64, bit ab, string asm,
+ ZPRRegOp zprty>
+: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
+ asm, "\t$Zdn, $Pg, $_Zdn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zdn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-17} = 0b10100;
+ let Inst{16} = ab;
+ let Inst{15-13} = 0b100;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zm;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_int_perm_clast_zz<bit ab, string asm> {
+ def _B : sve_int_perm_clast_zz<0b00, ab, asm, ZPR8>;
+ def _H : sve_int_perm_clast_zz<0b01, ab, asm, ZPR16>;
+ def _S : sve_int_perm_clast_zz<0b10, ab, asm, ZPR32>;
+ def _D : sve_int_perm_clast_zz<0b11, ab, asm, ZPR64>;
+}
+
+class sve_int_perm_last_r<bits<2> sz8_64, bit ab, string asm,
+ ZPRRegOp zprty, RegisterClass resultRegType>
+: I<(outs resultRegType:$Rd), (ins PPR3bAny:$Pg, zprty:$Zn),
+ asm, "\t$Rd, $Pg, $Zn",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rd;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-17} = 0b10000;
+ let Inst{16} = ab;
+ let Inst{15-13} = 0b101;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass sve_int_perm_last_r<bit ab, string asm> {
+ def _B : sve_int_perm_last_r<0b00, ab, asm, ZPR8, GPR32>;
+ def _H : sve_int_perm_last_r<0b01, ab, asm, ZPR16, GPR32>;
+ def _S : sve_int_perm_last_r<0b10, ab, asm, ZPR32, GPR32>;
+ def _D : sve_int_perm_last_r<0b11, ab, asm, ZPR64, GPR64>;
+}
+
+class sve_int_perm_last_v<bits<2> sz8_64, bit ab, string asm,
+ ZPRRegOp zprty, RegisterClass dstRegtype>
+: I<(outs dstRegtype:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn),
+ asm, "\t$Vd, $Pg, $Zn",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Vd;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-17} = 0b10001;
+ let Inst{16} = ab;
+ let Inst{15-13} = 0b100;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Vd;
+}
+
+multiclass sve_int_perm_last_v<bit ab, string asm> {
+ def _B : sve_int_perm_last_v<0b00, ab, asm, ZPR8, FPR8>;
+ def _H : sve_int_perm_last_v<0b01, ab, asm, ZPR16, FPR16>;
+ def _S : sve_int_perm_last_v<0b10, ab, asm, ZPR32, FPR32>;
+ def _D : sve_int_perm_last_v<0b11, ab, asm, ZPR64, FPR64>;
+}
+
+class sve_int_perm_splice<bits<2> sz8_64, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
+ asm, "\t$Zdn, $Pg, $_Zdn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zdn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-13} = 0b101100100;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zm;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_int_perm_splice<string asm> {
+ def _B : sve_int_perm_splice<0b00, asm, ZPR8>;
+ def _H : sve_int_perm_splice<0b01, asm, ZPR16>;
+ def _S : sve_int_perm_splice<0b10, asm, ZPR32>;
+ def _D : sve_int_perm_splice<0b11, asm, ZPR64>;
+}
+
+class sve_int_perm_rev<bits<2> sz8_64, bits<2> opc, string asm,
+ ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, zprty:$Zn),
+ asm, "\t$Zd, $Pg/m, $Zn",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<3> Pg;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-18} = 0b1001;
+ let Inst{17-16} = opc;
+ let Inst{15-13} = 0b100;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
+}
+
+multiclass sve_int_perm_rev_rbit<string asm> {
+ def _B : sve_int_perm_rev<0b00, 0b11, asm, ZPR8>;
+ def _H : sve_int_perm_rev<0b01, 0b11, asm, ZPR16>;
+ def _S : sve_int_perm_rev<0b10, 0b11, asm, ZPR32>;
+ def _D : sve_int_perm_rev<0b11, 0b11, asm, ZPR64>;
+}
+
+multiclass sve_int_perm_rev_revb<string asm> {
+ def _H : sve_int_perm_rev<0b01, 0b00, asm, ZPR16>;
+ def _S : sve_int_perm_rev<0b10, 0b00, asm, ZPR32>;
+ def _D : sve_int_perm_rev<0b11, 0b00, asm, ZPR64>;
+}
+
+multiclass sve_int_perm_rev_revh<string asm> {
+ def _S : sve_int_perm_rev<0b10, 0b01, asm, ZPR32>;
+ def _D : sve_int_perm_rev<0b11, 0b01, asm, ZPR64>;
+}
+
+multiclass sve_int_perm_rev_revw<string asm> {
+ def _D : sve_int_perm_rev<0b11, 0b10, asm, ZPR64>;
+}
+
+class sve_int_perm_cpy_r<bits<2> sz8_64, string asm, ZPRRegOp zprty,
+ RegisterClass srcRegType>
+: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, srcRegType:$Rn),
+ asm, "\t$Zd, $Pg/m, $Rn",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<5> Zd;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-13} = 0b101000101;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
+}
+
+multiclass sve_int_perm_cpy_r<string asm> {
+ def _B : sve_int_perm_cpy_r<0b00, asm, ZPR8, GPR32sp>;
+ def _H : sve_int_perm_cpy_r<0b01, asm, ZPR16, GPR32sp>;
+ def _S : sve_int_perm_cpy_r<0b10, asm, ZPR32, GPR32sp>;
+ def _D : sve_int_perm_cpy_r<0b11, asm, ZPR64, GPR64sp>;
+
+ def : InstAlias<"mov $Zd, $Pg/m, $Rn",
+ (!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPR3bAny:$Pg, GPR32sp:$Rn), 1>;
+ def : InstAlias<"mov $Zd, $Pg/m, $Rn",
+ (!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPR3bAny:$Pg, GPR32sp:$Rn), 1>;
+ def : InstAlias<"mov $Zd, $Pg/m, $Rn",
+ (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPR3bAny:$Pg, GPR32sp:$Rn), 1>;
+ def : InstAlias<"mov $Zd, $Pg/m, $Rn",
+ (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPR3bAny:$Pg, GPR64sp:$Rn), 1>;
+}
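// A usage sketch (the CPY instantiation is an assumption): with
//   defm CPY_ZPmR : sve_int_perm_cpy_r<"cpy">;
// the aliases above let "mov z0.b, p0/m, w0" assemble to the same encoding as
// "cpy z0.b, p0/m, w0".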
+
+class sve_int_perm_cpy_v<bits<2> sz8_64, string asm, ZPRRegOp zprty,
+ RegisterClass srcRegtype>
+: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, srcRegtype:$Vn),
+ asm, "\t$Zd, $Pg/m, $Vn",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Vn;
+ bits<5> Zd;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-13} = 0b100000100;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Vn;
+ let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
+}
+
+multiclass sve_int_perm_cpy_v<string asm> {
+ def _B : sve_int_perm_cpy_v<0b00, asm, ZPR8, FPR8>;
+ def _H : sve_int_perm_cpy_v<0b01, asm, ZPR16, FPR16>;
+ def _S : sve_int_perm_cpy_v<0b10, asm, ZPR32, FPR32>;
+ def _D : sve_int_perm_cpy_v<0b11, asm, ZPR64, FPR64>;
+
+ def : InstAlias<"mov $Zd, $Pg/m, $Vn",
+ (!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPR3bAny:$Pg, FPR8:$Vn), 1>;
+ def : InstAlias<"mov $Zd, $Pg/m, $Vn",
+ (!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPR3bAny:$Pg, FPR16:$Vn), 1>;
+ def : InstAlias<"mov $Zd, $Pg/m, $Vn",
+ (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPR3bAny:$Pg, FPR32:$Vn), 1>;
+ def : InstAlias<"mov $Zd, $Pg/m, $Vn",
+ (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPR3bAny:$Pg, FPR64:$Vn), 1>;
+}
+
+class sve_int_perm_compact<bit sz, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zn),
+ asm, "\t$Zd, $Pg, $Zn",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zd;
+ bits<5> Zn;
+ let Inst{31-23} = 0b000001011;
+ let Inst{22} = sz;
+ let Inst{21-13} = 0b100001100;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_perm_compact<string asm> {
+ def _S : sve_int_perm_compact<0b0, asm, ZPR32>;
+ def _D : sve_int_perm_compact<0b1, asm, ZPR64>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SVE Memory - Contiguous Load Group
+//===----------------------------------------------------------------------===//
+
+class sve_mem_cld_si_base<bits<4> dtype, bit nf, string asm,
+ RegisterOperand VecList>
+: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4),
+ asm, "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<5> Zt;
+ bits<4> imm4;
+ let Inst{31-25} = 0b1010010;
+ let Inst{24-21} = dtype;
+ let Inst{20} = nf;
+ let Inst{19-16} = imm4;
+ let Inst{15-13} = 0b101;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+ let Uses = !if(!eq(nf, 1), [FFR], []);
+ let Defs = !if(!eq(nf, 1), [FFR], []);
+}
+
+multiclass sve_mem_cld_si_base<bits<4> dtype, bit nf, string asm,
+ RegisterOperand listty, ZPRRegOp zprty> {
+ def _REAL : sve_mem_cld_si_base<dtype, nf, asm, listty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
+ (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
+ (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
+ (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+}
+
+multiclass sve_mem_cld_si<bits<4> dtype, string asm, RegisterOperand listty,
+ ZPRRegOp zprty>
+: sve_mem_cld_si_base<dtype, 0, asm, listty, zprty>;
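// A usage sketch (instruction name and dtype bits are illustrative): a
// predicated contiguous byte load would instantiate this as
//   defm LD1B : sve_mem_cld_si<0b0000, "ld1b", Z_b, ZPR8>;
// while the non-faulting variants reuse the same base class with nf = 1 via
// sve_mem_cldnf_si further below.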
+
+class sve_mem_cldnt_si_base<bits<2> msz, string asm, RegisterOperand VecList>
+: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4),
+ asm, "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zt;
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<4> imm4;
+ let Inst{31-25} = 0b1010010;
+ let Inst{24-23} = msz;
+ let Inst{22-20} = 0b000;
+ let Inst{19-16} = imm4;
+ let Inst{15-13} = 0b111;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+}
+
+multiclass sve_mem_cldnt_si<bits<2> msz, string asm, RegisterOperand listty,
+ ZPRRegOp zprty> {
+ def NAME : sve_mem_cldnt_si_base<msz, asm, listty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
+ (!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+}
+
+class sve_mem_cldnt_ss_base<bits<2> msz, string asm, RegisterOperand VecList,
+ RegisterOperand gprty>
+: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
+ asm, "\t$Zt, $Pg/z, [$Rn, $Rm]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rm;
+ bits<5> Rn;
+ bits<5> Zt;
+ let Inst{31-25} = 0b1010010;
+ let Inst{24-23} = msz;
+ let Inst{22-21} = 0b00;
+ let Inst{20-16} = Rm;
+ let Inst{15-13} = 0b110;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+}
+
+multiclass sve_mem_cldnt_ss<bits<2> msz, string asm, RegisterOperand listty,
+ ZPRRegOp zprty, RegisterOperand gprty> {
+ def NAME : sve_mem_cldnt_ss_base<msz, asm, listty, gprty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
+}
+
+class sve_mem_ldqr_si<bits<2> sz, string asm, RegisterOperand VecList>
+: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s16:$imm4),
+ asm, "\t$Zt, $Pg/z, [$Rn, $imm4]", "", []>, Sched<[]> {
+ bits<5> Zt;
+ bits<5> Rn;
+ bits<3> Pg;
+ bits<4> imm4;
+ let Inst{31-25} = 0b1010010;
+ let Inst{24-23} = sz;
+ let Inst{22-20} = 0;
+ let Inst{19-16} = imm4;
+ let Inst{15-13} = 0b001;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+}
+
+multiclass sve_mem_ldqr_si<bits<2> sz, string asm, RegisterOperand listty,
+ ZPRRegOp zprty> {
+ def NAME : sve_mem_ldqr_si<sz, asm, listty>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
+ (!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s16:$imm4), 0>;
+}
+
+class sve_mem_ldqr_ss<bits<2> sz, string asm, RegisterOperand VecList,
+ RegisterOperand gprty>
+: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
+ asm, "\t$Zt, $Pg/z, [$Rn, $Rm]", "", []>, Sched<[]> {
+ bits<5> Zt;
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31-25} = 0b1010010;
+ let Inst{24-23} = sz;
+ let Inst{22-21} = 0;
+ let Inst{20-16} = Rm;
+ let Inst{15-13} = 0;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+}
+
+multiclass sve_mem_ldqr_ss<bits<2> sz, string asm, RegisterOperand listty,
+ ZPRRegOp zprty, RegisterOperand gprty> {
+ def NAME : sve_mem_ldqr_ss<sz, asm, listty, gprty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
+}
+
+class sve_mem_ld_dup<bits<2> dtypeh, bits<2> dtypel, string asm,
+ RegisterOperand VecList, Operand immtype>
+: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, immtype:$imm6),
+ asm, "\t$Zt, $Pg/z, [$Rn, $imm6]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<5> Zt;
+ bits<6> imm6;
+ let Inst{31-25} = 0b1000010;
+ let Inst{24-23} = dtypeh;
+ let Inst{22} = 1;
+ let Inst{21-16} = imm6;
+ let Inst{15} = 0b1;
+ let Inst{14-13} = dtypel;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+}
+
+multiclass sve_mem_ld_dup<bits<2> dtypeh, bits<2> dtypel, string asm,
+ RegisterOperand zlistty, ZPRRegOp zprty, Operand immtype> {
+ def NAME : sve_mem_ld_dup<dtypeh, dtypel, asm, zlistty, immtype>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm6]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, immtype:$imm6), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
+ (!cast<Instruction>(NAME) zlistty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+}
+
+class sve_mem_cld_ss_base<bits<4> dtype, bit ff, dag iops, string asm,
+ RegisterOperand VecList>
+: I<(outs VecList:$Zt), iops,
+ asm, "\t$Zt, $Pg/z, [$Rn, $Rm]",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zt;
+ bits<3> Pg;
+ bits<5> Rm;
+ bits<5> Rn;
+ let Inst{31-25} = 0b1010010;
+ let Inst{24-21} = dtype;
+ let Inst{20-16} = Rm;
+ let Inst{15-14} = 0b01;
+ let Inst{13} = ff;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+ let Uses = !if(!eq(ff, 1), [FFR], []);
+ let Defs = !if(!eq(ff, 1), [FFR], []);
+}
+
+multiclass sve_mem_cld_ss<bits<4> dtype, string asm, RegisterOperand listty,
+ ZPRRegOp zprty, RegisterOperand gprty> {
+ def "" : sve_mem_cld_ss_base<dtype, 0, (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
+ asm, listty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
+}
+
+multiclass sve_mem_cldff_ss<bits<4> dtype, string asm, RegisterOperand listty,
+ ZPRRegOp zprty, RegisterOperand gprty> {
+ def _REAL : sve_mem_cld_ss_base<dtype, 1, (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
+ asm, listty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]",
+ (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
+ (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, XZR), 1>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
+ (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, XZR), 0>;
+}
+
+multiclass sve_mem_cldnf_si<bits<4> dtype, string asm, RegisterOperand listty,
+ ZPRRegOp zprty>
+: sve_mem_cld_si_base<dtype, 1, asm, listty, zprty>;
+
+class sve_mem_eld_si<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
+ string asm, Operand immtype>
+: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, immtype:$imm4),
+ asm, "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zt;
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<4> imm4;
+ let Inst{31-25} = 0b1010010;
+ let Inst{24-23} = sz;
+ let Inst{22-21} = nregs;
+ let Inst{20} = 0;
+ let Inst{19-16} = imm4;
+ let Inst{15-13} = 0b111;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+}
+
+multiclass sve_mem_eld_si<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
+ string asm, Operand immtype> {
+ def NAME : sve_mem_eld_si<sz, nregs, VecList, asm, immtype>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
+ (!cast<Instruction>(NAME) VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+}
+
+class sve_mem_eld_ss<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
+ string asm, RegisterOperand gprty>
+: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
+ asm, "\t$Zt, $Pg/z, [$Rn, $Rm]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rm;
+ bits<5> Rn;
+ bits<5> Zt;
+ let Inst{31-25} = 0b1010010;
+ let Inst{24-23} = sz;
+ let Inst{22-21} = nregs;
+ let Inst{20-16} = Rm;
+ let Inst{15-13} = 0b110;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Memory - 32-bit Gather and Unsized Contiguous Group
+//===----------------------------------------------------------------------===//
+
+// bit xs is '1' if offsets are signed
+// bit scaled is '1' if the offsets are scaled
+class sve_mem_32b_gld_sv<bits<4> opc, bit xs, bit scaled, string asm,
+ RegisterOperand zprext>
+: I<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
+ asm, "\t$Zt, $Pg/z, [$Rn, $Zm]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<5> Zm;
+ bits<5> Zt;
+ let Inst{31-25} = 0b1000010;
+ let Inst{24-23} = opc{3-2};
+ let Inst{22} = xs;
+ let Inst{21} = scaled;
+ let Inst{20-16} = Zm;
+ let Inst{15} = 0b0;
+ let Inst{14-13} = opc{1-0};
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+ let Defs = !if(!eq(opc{0}, 1), [FFR], []);
+ let Uses = !if(!eq(opc{0}, 1), [FFR], []);
+}
+
+multiclass sve_mem_32b_gld_sv_32_scaled<bits<4> opc, string asm,
+ RegisterOperand sxtw_opnd,
+ RegisterOperand uxtw_opnd> {
+ def _UXTW_SCALED_REAL : sve_mem_32b_gld_sv<opc, 0, 1, asm, uxtw_opnd>;
+ def _SXTW_SCALED_REAL : sve_mem_32b_gld_sv<opc, 1, 1, asm, sxtw_opnd>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _UXTW_SCALED_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _SXTW_SCALED_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
+}
+
+multiclass sve_mem_32b_gld_vs_32_unscaled<bits<4> opc, string asm,
+ RegisterOperand sxtw_opnd,
+ RegisterOperand uxtw_opnd> {
+ def _UXTW_REAL : sve_mem_32b_gld_sv<opc, 0, 0, asm, uxtw_opnd>;
+ def _SXTW_REAL : sve_mem_32b_gld_sv<opc, 1, 0, asm, sxtw_opnd>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _UXTW_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _SXTW_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
+}
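// A usage sketch (names, opcode and operand classes are assumptions): a 32-bit
// gather word load with extended offsets would combine the multiclasses above as
//   defm GLD1W : sve_mem_32b_gld_sv_32_scaled<0b1010, "ld1w", ZPR32ExtSXTW32, ZPR32ExtUXTW32>;
//   defm GLD1W : sve_mem_32b_gld_vs_32_unscaled<0b1010, "ld1w", ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
// with opc{0} = 1 variants (e.g. ldff1w) additionally defining and using FFR.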
+
+
+class sve_mem_32b_gld_vi<bits<4> opc, string asm, Operand imm_ty>
+: I<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5),
+ asm, "\t$Zt, $Pg/z, [$Zn, $imm5]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zn;
+ bits<5> Zt;
+ bits<5> imm5;
+ let Inst{31-25} = 0b1000010;
+ let Inst{24-23} = opc{3-2};
+ let Inst{22-21} = 0b01;
+ let Inst{20-16} = imm5;
+ let Inst{15} = 0b1;
+ let Inst{14-13} = opc{1-0};
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+ let Defs = !if(!eq(opc{0}, 1), [FFR], []);
+ let Uses = !if(!eq(opc{0}, 1), [FFR], []);
+}
+
+multiclass sve_mem_32b_gld_vi_32_ptrs<bits<4> opc, string asm, Operand imm_ty> {
+ def _IMM_REAL : sve_mem_32b_gld_vi<opc, asm, imm_ty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
+ (!cast<Instruction>(NAME # _IMM_REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $imm5]",
+ (!cast<Instruction>(NAME # _IMM_REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
+ (!cast<Instruction>(NAME # _IMM_REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>;
+}
+
+class sve_mem_prfm_si<bits<2> msz, string asm>
+: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, simm6s1:$imm6),
+ asm, "\t$prfop, $Pg, [$Rn, $imm6, mul vl]",
+ "",
+ []>, Sched<[]> {
+ bits<5> Rn;
+ bits<3> Pg;
+ bits<6> imm6;
+ bits<4> prfop;
+ let Inst{31-22} = 0b1000010111;
+ let Inst{21-16} = imm6;
+ let Inst{15} = 0b0;
+ let Inst{14-13} = msz;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = prfop;
+
+ let hasSideEffects = 1;
+}
+
+multiclass sve_mem_prfm_si<bits<2> msz, string asm> {
+ def NAME : sve_mem_prfm_si<msz, asm>;
+
+ def : InstAlias<asm # "\t$prfop, $Pg, [$Rn]",
+ (!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+}
+
+class sve_mem_prfm_ss<bits<3> opc, string asm, RegisterOperand gprty>
+: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
+ asm, "\t$prfop, $Pg, [$Rn, $Rm]",
+ "",
+ []>, Sched<[]> {
+ bits<5> Rm;
+ bits<5> Rn;
+ bits<3> Pg;
+ bits<4> prfop;
+ let Inst{31-25} = 0b1000010;
+ let Inst{24-23} = opc{2-1};
+ let Inst{22-21} = 0b00;
+ let Inst{20-16} = Rm;
+ let Inst{15} = 0b1;
+ let Inst{14} = opc{0};
+ let Inst{13} = 0b0;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = prfop;
+
+ let hasSideEffects = 1;
+}
+
+class sve_mem_32b_prfm_sv<bits<2> msz, bit xs, string asm,
+ RegisterOperand zprext>
+: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
+ asm, "\t$prfop, $Pg, [$Rn, $Zm]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<5> Zm;
+ bits<4> prfop;
+ let Inst{31-23} = 0b100001000;
+ let Inst{22} = xs;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Zm;
+ let Inst{15} = 0b0;
+ let Inst{14-13} = msz;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = prfop;
+
+ let hasSideEffects = 1;
+}
+
+multiclass sve_mem_32b_prfm_sv_scaled<bits<2> msz, string asm,
+ RegisterOperand sxtw_opnd,
+ RegisterOperand uxtw_opnd> {
+ def _UXTW_SCALED : sve_mem_32b_prfm_sv<msz, 0, asm, uxtw_opnd>;
+ def _SXTW_SCALED : sve_mem_32b_prfm_sv<msz, 1, asm, sxtw_opnd>;
+}
+
+class sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty>
+: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5),
+ asm, "\t$prfop, $Pg, [$Zn, $imm5]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zn;
+ bits<5> imm5;
+ bits<4> prfop;
+ let Inst{31-25} = 0b1000010;
+ let Inst{24-23} = msz;
+ let Inst{22-21} = 0b00;
+ let Inst{20-16} = imm5;
+ let Inst{15-13} = 0b111;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = prfop;
+}
+
+multiclass sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty> {
+ def NAME : sve_mem_32b_prfm_vi<msz, asm, imm_ty>;
+
+ def : InstAlias<asm # "\t$prfop, $Pg, [$Zn]",
+ (!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>;
+}
+
+class sve_mem_z_fill<string asm>
+: I<(outs ZPRAny:$Zt), (ins GPR64sp:$Rn, simm9:$imm9),
+ asm, "\t$Zt, [$Rn, $imm9, mul vl]",
+ "",
+ []>, Sched<[]> {
+ bits<5> Rn;
+ bits<5> Zt;
+ bits<9> imm9;
+ let Inst{31-22} = 0b1000010110;
+ let Inst{21-16} = imm9{8-3};
+ let Inst{15-13} = 0b010;
+ let Inst{12-10} = imm9{2-0};
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+}
+
+multiclass sve_mem_z_fill<string asm> {
+ def NAME : sve_mem_z_fill<asm>;
+
+ def : InstAlias<asm # "\t$Zt, [$Rn]",
+ (!cast<Instruction>(NAME) ZPRAny:$Zt, GPR64sp:$Rn, 0), 1>;
+}
+
+class sve_mem_p_fill<string asm>
+: I<(outs PPRAny:$Pt), (ins GPR64sp:$Rn, simm9:$imm9),
+ asm, "\t$Pt, [$Rn, $imm9, mul vl]",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pt;
+ bits<5> Rn;
+ bits<9> imm9;
+ let Inst{31-22} = 0b1000010110;
+ let Inst{21-16} = imm9{8-3};
+ let Inst{15-13} = 0b000;
+ let Inst{12-10} = imm9{2-0};
+ let Inst{9-5} = Rn;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = Pt;
+
+ let mayLoad = 1;
+}
+
+multiclass sve_mem_p_fill<string asm> {
+ def NAME : sve_mem_p_fill<asm>;
+
+ def : InstAlias<asm # "\t$Pt, [$Rn]",
+ (!cast<Instruction>(NAME) PPRAny:$Pt, GPR64sp:$Rn, 0), 1>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Memory - 64-bit Gather Group
+//===----------------------------------------------------------------------===//
+
+// bit xs is '1' if offsets are signed
+// bit scaled is '1' if the offsets are scaled
+// bit lsl is '0' if the offsets are extended (uxtw/sxtw), '1' if shifted (lsl)
+class sve_mem_64b_gld_sv<bits<4> opc, bit xs, bit scaled, bit lsl, string asm,
+ RegisterOperand zprext>
+: I<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
+ asm, "\t$Zt, $Pg/z, [$Rn, $Zm]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<5> Zm;
+ bits<5> Zt;
+ let Inst{31-25} = 0b1100010;
+ let Inst{24-23} = opc{3-2};
+ let Inst{22} = xs;
+ let Inst{21} = scaled;
+ let Inst{20-16} = Zm;
+ let Inst{15} = lsl;
+ let Inst{14-13} = opc{1-0};
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+ let Defs = !if(!eq(opc{0}, 1), [FFR], []);
+ let Uses = !if(!eq(opc{0}, 1), [FFR], []);
+}
+
+multiclass sve_mem_64b_gld_sv_32_scaled<bits<4> opc, string asm,
+ RegisterOperand sxtw_opnd,
+ RegisterOperand uxtw_opnd> {
+ def _UXTW_SCALED_REAL : sve_mem_64b_gld_sv<opc, 0, 1, 0, asm, uxtw_opnd>;
+ def _SXTW_SCALED_REAL : sve_mem_64b_gld_sv<opc, 1, 1, 0, asm, sxtw_opnd>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _UXTW_SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _SXTW_SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
+}
+
+multiclass sve_mem_64b_gld_vs_32_unscaled<bits<4> opc, string asm,
+ RegisterOperand sxtw_opnd,
+ RegisterOperand uxtw_opnd> {
+ def _UXTW_REAL : sve_mem_64b_gld_sv<opc, 0, 0, 0, asm, uxtw_opnd>;
+ def _SXTW_REAL : sve_mem_64b_gld_sv<opc, 1, 0, 0, asm, sxtw_opnd>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _UXTW_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _SXTW_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
+}
+
+multiclass sve_mem_64b_gld_sv2_64_scaled<bits<4> opc, string asm,
+ RegisterOperand zprext> {
+ def _SCALED_REAL : sve_mem_64b_gld_sv<opc, 1, 1, 1, asm, zprext>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>;
+}
+
+multiclass sve_mem_64b_gld_vs2_64_unscaled<bits<4> opc, string asm> {
+ def _REAL : sve_mem_64b_gld_sv<opc, 1, 0, 1, asm, ZPR64ExtLSL8>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>;
+}
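// A usage sketch (name and opcode bits are illustrative): the 64-bit gather
// doubleword load with LSL-scaled 64-bit offsets would be
//   defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", ZPR64ExtLSL64>;
// where lsl = 1 selects the shifted rather than sign/zero-extended offset form.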
+
+class sve_mem_64b_gld_vi<bits<4> opc, string asm, Operand imm_ty>
+: I<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5),
+ asm, "\t$Zt, $Pg/z, [$Zn, $imm5]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zn;
+ bits<5> Zt;
+ bits<5> imm5;
+ let Inst{31-25} = 0b1100010;
+ let Inst{24-23} = opc{3-2};
+ let Inst{22-21} = 0b01;
+ let Inst{20-16} = imm5;
+ let Inst{15} = 0b1;
+ let Inst{14-13} = opc{1-0};
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+ let Defs = !if(!eq(opc{0}, 1), [FFR], []);
+ let Uses = !if(!eq(opc{0}, 1), [FFR], []);
+}
+
+multiclass sve_mem_64b_gld_vi_64_ptrs<bits<4> opc, string asm, Operand imm_ty> {
+ def _IMM_REAL : sve_mem_64b_gld_vi<opc, asm, imm_ty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
+ (!cast<Instruction>(NAME # _IMM_REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $imm5]",
+ (!cast<Instruction>(NAME # _IMM_REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
+ (!cast<Instruction>(NAME # _IMM_REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>;
+}
+
+// bit lsl is '0' if the offsets are extended (uxtw/sxtw), '1' if shifted (lsl)
+class sve_mem_64b_prfm_sv<bits<2> msz, bit xs, bit lsl, string asm,
+ RegisterOperand zprext>
+: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
+ asm, "\t$prfop, $Pg, [$Rn, $Zm]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<5> Zm;
+ bits<4> prfop;
+ let Inst{31-23} = 0b110001000;
+ let Inst{22} = xs;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Zm;
+ let Inst{15} = lsl;
+ let Inst{14-13} = msz;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = prfop;
+
+ let hasSideEffects = 1;
+}
+
+multiclass sve_mem_64b_prfm_sv_ext_scaled<bits<2> msz, string asm,
+ RegisterOperand sxtw_opnd,
+ RegisterOperand uxtw_opnd> {
+ def _UXTW_SCALED : sve_mem_64b_prfm_sv<msz, 0, 0, asm, uxtw_opnd>;
+ def _SXTW_SCALED : sve_mem_64b_prfm_sv<msz, 1, 0, asm, sxtw_opnd>;
+}
+
+multiclass sve_mem_64b_prfm_sv_lsl_scaled<bits<2> msz, string asm,
+ RegisterOperand zprext> {
+ def NAME : sve_mem_64b_prfm_sv<msz, 1, 1, asm, zprext>;
+}
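// A usage sketch (the PRFB name is an assumption): the LSL-scaled 64-bit
// prefetch form would be instantiated as, e.g.,
//   defm PRFB_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b00, "prfb", ZPR64ExtLSL8>;
// where msz gives the prefetched element size and lsl = 1 marks the shifted offset.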
+
+
+class sve_mem_64b_prfm_vi<bits<2> msz, string asm, Operand imm_ty>
+: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5),
+ asm, "\t$prfop, $Pg, [$Zn, $imm5]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zn;
+ bits<5> imm5;
+ bits<4> prfop;
+ let Inst{31-25} = 0b1100010;
+ let Inst{24-23} = msz;
+ let Inst{22-21} = 0b00;
+ let Inst{20-16} = imm5;
+ let Inst{15-13} = 0b111;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = prfop;
+
+ let hasSideEffects = 1;
+}
+
+multiclass sve_mem_64b_prfm_vi<bits<2> msz, string asm, Operand imm_ty> {
+ def NAME : sve_mem_64b_prfm_vi<msz, asm, imm_ty>;
+
+ def : InstAlias<asm # "\t$prfop, $Pg, [$Zn]",
+ (!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SVE Compute Vector Address Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_bin_cons_misc_0_a<bits<2> opc, bits<2> msz, string asm,
+ ZPRRegOp zprty, RegisterOperand zprext>
+: I<(outs zprty:$Zd), (ins zprty:$Zn, zprext:$Zm),
+ asm, "\t$Zd, [$Zn, $Zm]",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = opc;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Zm;
+ let Inst{15-12} = 0b1010;
+ let Inst{11-10} = msz;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_bin_cons_misc_0_a_uxtw<bits<2> opc, string asm> {
+ def _0 : sve_int_bin_cons_misc_0_a<opc, 0b00, asm, ZPR64, ZPR64ExtUXTW8>;
+ def _1 : sve_int_bin_cons_misc_0_a<opc, 0b01, asm, ZPR64, ZPR64ExtUXTW16>;
+ def _2 : sve_int_bin_cons_misc_0_a<opc, 0b10, asm, ZPR64, ZPR64ExtUXTW32>;
+ def _3 : sve_int_bin_cons_misc_0_a<opc, 0b11, asm, ZPR64, ZPR64ExtUXTW64>;
+}
+
+multiclass sve_int_bin_cons_misc_0_a_sxtw<bits<2> opc, string asm> {
+ def _0 : sve_int_bin_cons_misc_0_a<opc, 0b00, asm, ZPR64, ZPR64ExtSXTW8>;
+ def _1 : sve_int_bin_cons_misc_0_a<opc, 0b01, asm, ZPR64, ZPR64ExtSXTW16>;
+ def _2 : sve_int_bin_cons_misc_0_a<opc, 0b10, asm, ZPR64, ZPR64ExtSXTW32>;
+ def _3 : sve_int_bin_cons_misc_0_a<opc, 0b11, asm, ZPR64, ZPR64ExtSXTW64>;
+}
+
+multiclass sve_int_bin_cons_misc_0_a_32_lsl<bits<2> opc, string asm> {
+ def _0 : sve_int_bin_cons_misc_0_a<opc, 0b00, asm, ZPR32, ZPR32ExtLSL8>;
+ def _1 : sve_int_bin_cons_misc_0_a<opc, 0b01, asm, ZPR32, ZPR32ExtLSL16>;
+ def _2 : sve_int_bin_cons_misc_0_a<opc, 0b10, asm, ZPR32, ZPR32ExtLSL32>;
+ def _3 : sve_int_bin_cons_misc_0_a<opc, 0b11, asm, ZPR32, ZPR32ExtLSL64>;
+}
+
+multiclass sve_int_bin_cons_misc_0_a_64_lsl<bits<2> opc, string asm> {
+ def _0 : sve_int_bin_cons_misc_0_a<opc, 0b00, asm, ZPR64, ZPR64ExtLSL8>;
+ def _1 : sve_int_bin_cons_misc_0_a<opc, 0b01, asm, ZPR64, ZPR64ExtLSL16>;
+ def _2 : sve_int_bin_cons_misc_0_a<opc, 0b10, asm, ZPR64, ZPR64ExtLSL32>;
+ def _3 : sve_int_bin_cons_misc_0_a<opc, 0b11, asm, ZPR64, ZPR64ExtLSL64>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Misc - Unpredicated Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_bin_cons_misc_0_b<bits<2> sz, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
+ asm, "\t$Zd, $Zn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zm;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Zm;
+ let Inst{15-10} = 0b101100;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_bin_cons_misc_0_b<string asm> {
+ def _H : sve_int_bin_cons_misc_0_b<0b01, asm, ZPR16>;
+ def _S : sve_int_bin_cons_misc_0_b<0b10, asm, ZPR32>;
+ def _D : sve_int_bin_cons_misc_0_b<0b11, asm, ZPR64>;
+}
+
+class sve_int_bin_cons_misc_0_c<bits<8> opc, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins zprty:$Zn),
+ asm, "\t$Zd, $Zn",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = opc{7-6};
+ let Inst{21} = 0b1;
+ let Inst{20-16} = opc{5-1};
+ let Inst{15-11} = 0b10111;
+ let Inst{10} = opc{0};
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Reduction Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_reduce<bits<2> sz8_32, bits<2> fmt, bits<3> opc, string asm,
+ ZPRRegOp zprty, RegisterClass regtype>
+: I<(outs regtype:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn),
+ asm, "\t$Vd, $Pg, $Zn",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Vd;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = sz8_32;
+ let Inst{21} = 0b0;
+ let Inst{20-19} = fmt;
+ let Inst{18-16} = opc;
+ let Inst{15-13} = 0b001;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Vd;
+}
+
+multiclass sve_int_reduce_0_saddv<bits<3> opc, string asm> {
+ def _B : sve_int_reduce<0b00, 0b00, opc, asm, ZPR8, FPR64>;
+ def _H : sve_int_reduce<0b01, 0b00, opc, asm, ZPR16, FPR64>;
+ def _S : sve_int_reduce<0b10, 0b00, opc, asm, ZPR32, FPR64>;
+}
+
+multiclass sve_int_reduce_0_uaddv<bits<3> opc, string asm> {
+ def _B : sve_int_reduce<0b00, 0b00, opc, asm, ZPR8, FPR64>;
+ def _H : sve_int_reduce<0b01, 0b00, opc, asm, ZPR16, FPR64>;
+ def _S : sve_int_reduce<0b10, 0b00, opc, asm, ZPR32, FPR64>;
+ def _D : sve_int_reduce<0b11, 0b00, opc, asm, ZPR64, FPR64>;
+}
+
+multiclass sve_int_reduce_1<bits<3> opc, string asm> {
+ def _B : sve_int_reduce<0b00, 0b01, opc, asm, ZPR8, FPR8>;
+ def _H : sve_int_reduce<0b01, 0b01, opc, asm, ZPR16, FPR16>;
+ def _S : sve_int_reduce<0b10, 0b01, opc, asm, ZPR32, FPR32>;
+ def _D : sve_int_reduce<0b11, 0b01, opc, asm, ZPR64, FPR64>;
+}
+
+multiclass sve_int_reduce_2<bits<3> opc, string asm> {
+ def _B : sve_int_reduce<0b00, 0b11, opc, asm, ZPR8, FPR8>;
+ def _H : sve_int_reduce<0b01, 0b11, opc, asm, ZPR16, FPR16>;
+ def _S : sve_int_reduce<0b10, 0b11, opc, asm, ZPR32, FPR32>;
+ def _D : sve_int_reduce<0b11, 0b11, opc, asm, ZPR64, FPR64>;
+}
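// A usage sketch (names and opcode bits are assumptions): the integer reduction
// multiclasses would be instantiated along the lines of
//   defm SADDV_VPZ : sve_int_reduce_0_saddv<0b000, "saddv">;
//   defm UADDV_VPZ : sve_int_reduce_0_uaddv<0b001, "uaddv">;
//   defm SMAXV_VPZ : sve_int_reduce_1<0b000, "smaxv">;
// each reducing into the scalar FPR width given in the defs above.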
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
index e65ba1f2401d..23cc21ce2e7c 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
@@ -53,6 +53,14 @@ namespace llvm {
#include "AArch64GenSystemOperands.inc"
}
}
+
+namespace llvm {
+ namespace AArch64TSB {
+#define GET_TSB_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
+}
+
namespace llvm {
namespace AArch64PRFM {
#define GET_PRFM_IMPL
@@ -61,6 +69,27 @@ namespace llvm {
}
namespace llvm {
+ namespace AArch64SVEPRFM {
+#define GET_SVEPRFM_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
+}
+
+namespace llvm {
+ namespace AArch64SVEPredPattern {
+#define GET_SVEPREDPAT_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
+}
+
+namespace llvm {
+ namespace AArch64ExactFPImm {
+#define GET_EXACTFPIMM_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
+}
+
+namespace llvm {
namespace AArch64PState {
#define GET_PSTATE_IMPL
#include "AArch64GenSystemOperands.inc"
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index c1c799b7b349..2874c4ab42ea 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -285,6 +285,8 @@ struct SysAlias {
struct SysAliasReg : SysAlias {
bool NeedsReg;
SysAliasReg(const char *N, uint16_t E, bool R) : SysAlias(N, E), NeedsReg(R) {};
+ SysAliasReg(const char *N, uint16_t E, bool R, FeatureBitset F) : SysAlias(N, E, F),
+ NeedsReg(R) {};
};
namespace AArch64AT{
@@ -327,6 +329,14 @@ namespace AArch64ISB {
#include "AArch64GenSystemOperands.inc"
}
+namespace AArch64TSB {
+ struct TSB : SysAlias {
+ using SysAlias::SysAlias;
+ };
+ #define GET_TSB_DECL
+ #include "AArch64GenSystemOperands.inc"
+}
+
namespace AArch64PRFM {
struct PRFM : SysAlias {
using SysAlias::SysAlias;
@@ -335,6 +345,33 @@ namespace AArch64PRFM {
#include "AArch64GenSystemOperands.inc"
}
+namespace AArch64SVEPRFM {
+ struct SVEPRFM : SysAlias {
+ using SysAlias::SysAlias;
+ };
+#define GET_SVEPRFM_DECL
+#include "AArch64GenSystemOperands.inc"
+}
+
+namespace AArch64SVEPredPattern {
+ struct SVEPREDPAT {
+ const char *Name;
+ uint16_t Encoding;
+ };
+#define GET_SVEPREDPAT_DECL
+#include "AArch64GenSystemOperands.inc"
+}
+
+namespace AArch64ExactFPImm {
+ struct ExactFPImm {
+ const char *Name;
+ int Enum;
+ const char *Repr;
+ };
+#define GET_EXACTFPIMM_DECL
+#include "AArch64GenSystemOperands.inc"
+}
+
namespace AArch64PState {
struct PState : SysAlias{
using SysAlias::SysAlias;
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 0ddc43ad5033..796766d94622 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -11,7 +11,6 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPU_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPU_H
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
@@ -50,9 +49,9 @@ FunctionPass *createSIOptimizeExecMaskingPreRAPass();
FunctionPass *createSIFixSGPRCopiesPass();
FunctionPass *createSIMemoryLegalizerPass();
FunctionPass *createSIDebuggerInsertNopsPass();
-FunctionPass *createSIInsertWaitsPass();
FunctionPass *createSIInsertWaitcntsPass();
FunctionPass *createSIFixWWMLivenessPass();
+FunctionPass *createSIFormMemoryClausesPass();
FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &);
FunctionPass *createAMDGPUUseNativeCallsPass();
FunctionPass *createAMDGPUCodeGenPreparePass();
@@ -74,6 +73,14 @@ ModulePass *createAMDGPULowerIntrinsicsPass();
void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
extern char &AMDGPULowerIntrinsicsID;
+FunctionPass *createAMDGPULowerKernelArgumentsPass();
+void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &);
+extern char &AMDGPULowerKernelArgumentsID;
+
+ModulePass *createAMDGPULowerKernelAttributesPass();
+void initializeAMDGPULowerKernelAttributesPass(PassRegistry &);
+extern char &AMDGPULowerKernelAttributesID;
+
void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
extern char &AMDGPURewriteOutArgumentsID;
@@ -134,6 +141,9 @@ extern char &AMDGPUSimplifyLibCallsID;
void initializeAMDGPUUseNativeCallsPass(PassRegistry &);
extern char &AMDGPUUseNativeCallsID;
+void initializeAMDGPUPerfHintAnalysisPass(PassRegistry &);
+extern char &AMDGPUPerfHintAnalysisID;
+
// Passes common to R600 and SI
FunctionPass *createAMDGPUPromoteAlloca();
void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
@@ -144,7 +154,7 @@ FunctionPass *createAMDGPUISelDag(
TargetMachine *TM = nullptr,
CodeGenOpt::Level OptLevel = CodeGenOpt::Default);
ModulePass *createAMDGPUAlwaysInlinePass(bool GlobalOpt = true);
-ModulePass *createAMDGPUOpenCLImageTypeLoweringPass();
+ModulePass *createR600OpenCLImageTypeLoweringPass();
FunctionPass *createAMDGPUAnnotateUniformValues();
ModulePass* createAMDGPUUnifyMetadataPass();
@@ -169,12 +179,12 @@ extern char &SIMemoryLegalizerID;
void initializeSIDebuggerInsertNopsPass(PassRegistry&);
extern char &SIDebuggerInsertNopsID;
-void initializeSIInsertWaitsPass(PassRegistry&);
-extern char &SIInsertWaitsID;
-
void initializeSIInsertWaitcntsPass(PassRegistry&);
extern char &SIInsertWaitcntsID;
+void initializeSIFormMemoryClausesPass(PassRegistry&);
+extern char &SIFormMemoryClausesID;
+
void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&);
extern char &AMDGPUUnifyDivergentExitNodesID;
@@ -222,8 +232,11 @@ struct AMDGPUAS {
MAX_COMMON_ADDRESS = 5,
GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0).
- CONSTANT_ADDRESS = 2, ///< Address space for constant memory (VTX2)
+ CONSTANT_ADDRESS = 4, ///< Address space for constant memory (VTX2)
LOCAL_ADDRESS = 3, ///< Address space for local memory.
+
+ CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory
+
/// Address space for direct addressable parameter memory (CONST0)
PARAM_D_ADDRESS = 6,
/// Address space for indirect addressable parameter memory (VTX1)
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index c02d0a131041..16c2a366db28 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -7,58 +7,30 @@
//
//===------------------------------------------------------------===//
+include "llvm/TableGen/SearchableTable.td"
include "llvm/Target/Target.td"
+include "AMDGPUFeatures.td"
//===------------------------------------------------------------===//
// Subtarget Features (device properties)
//===------------------------------------------------------------===//
-def FeatureFP64 : SubtargetFeature<"fp64",
- "FP64",
- "true",
- "Enable double precision operations"
->;
-
-def FeatureFMA : SubtargetFeature<"fmaf",
- "FMA",
- "true",
- "Enable single precision FMA (not as fast as mul+add, but fused)"
->;
-
def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf",
"FastFMAF32",
"true",
"Assuming f32 fma is at least as fast as mul + add"
>;
-def HalfRate64Ops : SubtargetFeature<"half-rate-64-ops",
- "HalfRate64Ops",
- "true",
- "Most fp64 instructions are half rate instead of quarter"
->;
-
-def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst",
- "R600ALUInst",
- "false",
- "Older version of ALU instructions encoding"
->;
-
-def FeatureVertexCache : SubtargetFeature<"HasVertexCache",
- "HasVertexCache",
+def FeatureMIMG_R128 : SubtargetFeature<"mimg-r128",
+ "MIMG_R128",
"true",
- "Specify use of dedicated vertex cache"
+ "Support 128-bit texture resources"
>;
-def FeatureCaymanISA : SubtargetFeature<"caymanISA",
- "CaymanISA",
- "true",
- "Use Cayman ISA"
->;
-
-def FeatureCFALUBug : SubtargetFeature<"cfalubug",
- "CFALUBug",
+def HalfRate64Ops : SubtargetFeature<"half-rate-64-ops",
+ "HalfRate64Ops",
"true",
- "GPU has CF_ALU bug"
+ "Most fp64 instructions are half rate instead of quarter"
>;
def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
@@ -121,6 +93,12 @@ def FeatureMadMixInsts : SubtargetFeature<"mad-mix-insts",
"Has v_mad_mix_f32, v_mad_mixlo_f16, v_mad_mixhi_f16 instructions"
>;
+def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts",
+ "HasFmaMixInsts",
+ "true",
+ "Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions"
+>;
+
// XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support
// XNACK. The current default kernel driver setting is:
// - graphics ring: XNACK disabled
@@ -140,27 +118,6 @@ def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
"VI SGPR initialization bug requiring a fixed SGPR allocation size"
>;
-class SubtargetFeatureFetchLimit <string Value> :
- SubtargetFeature <"fetch"#Value,
- "TexVTXClauseSize",
- Value,
- "Limit the maximum number of fetches in a clause to "#Value
->;
-
-def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">;
-def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">;
-
-class SubtargetFeatureWavefrontSize <int Value> : SubtargetFeature<
- "wavefrontsize"#Value,
- "WavefrontSize",
- !cast<string>(Value),
- "The number of threads per wavefront"
->;
-
-def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>;
-def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>;
-def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>;
-
class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
"ldsbankcount"#Value,
"LDSBankCount",
@@ -171,19 +128,6 @@ class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>;
def FeatureLDSBankCount32 : SubtargetFeatureLDSBankCount<32>;
-class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature<
- "localmemorysize"#Value,
- "LocalMemorySize",
- !cast<string>(Value),
- "The size of local memory in bytes"
->;
-
-def FeatureGCN : SubtargetFeature<"gcn",
- "IsGCN",
- "true",
- "GCN or newer GPU"
->;
-
def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding",
"GCN3Encoding",
"true",
@@ -244,6 +188,12 @@ def FeatureScalarStores : SubtargetFeature<"scalar-stores",
"Has store scalar memory instructions"
>;
+def FeatureScalarAtomics : SubtargetFeature<"scalar-atomics",
+ "HasScalarAtomics",
+ "true",
+ "Has atomic scalar memory instructions"
+>;
+
def FeatureSDWA : SubtargetFeature<"sdwa",
"HasSDWA",
"true",
@@ -292,6 +242,27 @@ def FeatureIntClamp : SubtargetFeature<"int-clamp-insts",
"Support clamp for integer destination"
>;
+def FeatureUnpackedD16VMem : SubtargetFeature<"unpacked-d16-vmem",
+ "HasUnpackedD16VMem",
+ "true",
+ "Has unpacked d16 vmem instructions"
+>;
+
+def FeatureDLInsts : SubtargetFeature<"dl-insts",
+ "HasDLInsts",
+ "true",
+ "Has deep learning instructions"
+>;
+
+def FeatureD16PreservesUnusedBits : SubtargetFeature<
+ "d16-preserves-unused-bits",
+ "D16PreservesUnusedBits",
+ "true",
+ "If present, then instructions defined by HasD16LoadStore predicate preserve "
+ "unused bits. Otherwise instructions defined by HasD16LoadStore predicate "
+ "zero unused bits."
+>;
+
//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
@@ -329,12 +300,6 @@ def FeatureFP16Denormals : SubtargetFeature<"fp16-denormals",
[FeatureFP64FP16Denormals]
>;
-def FeatureDX10Clamp : SubtargetFeature<"dx10-clamp",
- "DX10Clamp",
- "true",
- "clamp modifier clamps NaNs to 0.0"
->;
-
def FeatureFPExceptions : SubtargetFeature<"fp-exceptions",
"FPExceptions",
"true",
@@ -377,12 +342,6 @@ def FeatureDumpCodeLower : SubtargetFeature <"dumpcode",
"Dump MachineInstrs in the CodeEmitter"
>;
-def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca",
- "EnablePromoteAlloca",
- "true",
- "Enable promote alloca pass"
->;
-
// XXX - This should probably be removed once enabled by default
def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt",
"EnableLoadStoreOpt",
@@ -408,6 +367,12 @@ def FeatureEnableSIScheduler : SubtargetFeature<"si-scheduler",
"Enable SI Machine Scheduler"
>;
+def FeatureEnableDS128 : SubtargetFeature<"enable-ds128",
+ "EnableDS128",
+ "true",
+ "Use ds_{read|write}_b128"
+>;
+
// Unless +-flat-for-global is specified, turn on FlatForGlobal for
// all OS-es on VI and newer hardware to avoid assertion failures due
// to missing ADDR64 variants of MUBUF instructions.
@@ -440,46 +405,30 @@ def FeatureDisable : SubtargetFeature<"",
"Dummy feature to disable assembler instructions"
>;
-class SubtargetFeatureGeneration <string Value,
- list<SubtargetFeature> Implies> :
- SubtargetFeature <Value, "Gen", "AMDGPUSubtarget::"#Value,
- Value#" GPU generation", Implies>;
-
-def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>;
-def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>;
-def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>;
-
-def FeatureR600 : SubtargetFeatureGeneration<"R600",
- [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]
->;
-
-def FeatureR700 : SubtargetFeatureGeneration<"R700",
- [FeatureFetchLimit16, FeatureLocalMemorySize0]
->;
-
-def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN",
- [FeatureFetchLimit16, FeatureLocalMemorySize32768]
+def FeatureGCN : SubtargetFeature<"gcn",
+ "IsGCN",
+ "true",
+ "GCN or newer GPU"
>;
-def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
- [FeatureFetchLimit16, FeatureWavefrontSize64,
- FeatureLocalMemorySize32768]
->;
+class GCNSubtargetFeatureGeneration <string Value,
+ list<SubtargetFeature> Implies> :
+ SubtargetFeatureGeneration <Value, "GCNSubtarget", Implies>;
-def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
- [FeatureFP64, FeatureLocalMemorySize32768,
+def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
+ [FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128,
FeatureWavefrontSize64, FeatureGCN,
FeatureLDSBankCount32, FeatureMovrel]
>;
-def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
- [FeatureFP64, FeatureLocalMemorySize65536,
+def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
+ [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128,
FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace,
FeatureCIInsts, FeatureMovrel]
>;
-def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
- [FeatureFP64, FeatureLocalMemorySize65536,
+def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
+ [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128,
FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel,
@@ -489,7 +438,7 @@ def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
]
>;
-def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9",
+def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
[FeatureFP64, FeatureLocalMemorySize65536,
FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
@@ -498,7 +447,7 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9",
FeatureFastFMAF32, FeatureDPP, FeatureIntClamp,
FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst,
FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
- FeatureAddNoCarryInsts
+ FeatureAddNoCarryInsts, FeatureScalarAtomics
]
>;
@@ -534,7 +483,8 @@ def FeatureISAVersion7_0_1 : SubtargetFeatureISAVersion <7,0,1,
def FeatureISAVersion7_0_2 : SubtargetFeatureISAVersion <7,0,2,
[FeatureSeaIslands,
- FeatureLDSBankCount16]>;
+ FeatureLDSBankCount16,
+ FeatureFastFMAF32]>;
def FeatureISAVersion7_0_3 : SubtargetFeatureISAVersion <7,0,3,
[FeatureSeaIslands,
@@ -544,26 +494,24 @@ def FeatureISAVersion7_0_4 : SubtargetFeatureISAVersion <7,0,4,
[FeatureSeaIslands,
FeatureLDSBankCount32]>;
-def FeatureISAVersion8_0_0 : SubtargetFeatureISAVersion <8,0,0,
- [FeatureVolcanicIslands,
- FeatureLDSBankCount32,
- FeatureSGPRInitBug]>;
-
def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1,
[FeatureVolcanicIslands,
FeatureFastFMAF32,
HalfRate64Ops,
FeatureLDSBankCount32,
- FeatureXNACK]>;
+ FeatureXNACK,
+ FeatureUnpackedD16VMem]>;
def FeatureISAVersion8_0_2 : SubtargetFeatureISAVersion <8,0,2,
[FeatureVolcanicIslands,
FeatureLDSBankCount32,
- FeatureSGPRInitBug]>;
+ FeatureSGPRInitBug,
+ FeatureUnpackedD16VMem]>;
def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3,
[FeatureVolcanicIslands,
- FeatureLDSBankCount32]>;
+ FeatureLDSBankCount32,
+ FeatureUnpackedD16VMem]>;
def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0,
[FeatureVolcanicIslands,
@@ -573,14 +521,28 @@ def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0,
def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0,
[FeatureGFX9,
FeatureMadMixInsts,
- FeatureLDSBankCount32
- ]>;
+ FeatureLDSBankCount32,
+ FeatureD16PreservesUnusedBits]>;
def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion <9,0,2,
[FeatureGFX9,
FeatureMadMixInsts,
- FeatureLDSBankCount32
- ]>;
+ FeatureLDSBankCount32,
+ FeatureXNACK,
+ FeatureD16PreservesUnusedBits]>;
+
+def FeatureISAVersion9_0_4 : SubtargetFeatureISAVersion <9,0,4,
+ [FeatureGFX9,
+ FeatureLDSBankCount32,
+ FeatureFmaMixInsts,
+ FeatureD16PreservesUnusedBits]>;
+
+def FeatureISAVersion9_0_6 : SubtargetFeatureISAVersion <9,0,6,
+ [FeatureGFX9,
+ HalfRate64Ops,
+ FeatureFmaMixInsts,
+ FeatureLDSBankCount32,
+ FeatureDLInsts]>;
//===----------------------------------------------------------------------===//
// Debugger related subtarget features.
@@ -593,13 +555,6 @@ def FeatureDebuggerInsertNops : SubtargetFeature<
"Insert one nop instruction for each high level source statement"
>;
-def FeatureDebuggerReserveRegs : SubtargetFeature<
- "amdgpu-debugger-reserve-regs",
- "DebuggerReserveRegs",
- "true",
- "Reserve registers for debugger usage"
->;
-
def FeatureDebuggerEmitPrologue : SubtargetFeature<
"amdgpu-debugger-emit-prologue",
"DebuggerEmitPrologue",
@@ -675,6 +630,7 @@ def AMDGPU : Target {
SDWA9AsmParserVariant,
DPPAsmParserVariant];
let AssemblyWriters = [AMDGPUAsmWriter];
+ let AllowRegisterRenaming = 1;
}
// Dummy Instruction itineraries for pseudo instructions
@@ -685,8 +641,6 @@ def NullALU : InstrItinClass;
// Predicate helper class
//===----------------------------------------------------------------------===//
-def TruePredicate : Predicate<"true">;
-
def isSICI : Predicate<
"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS"
@@ -715,6 +669,13 @@ def HasFlatScratchInsts : Predicate<"Subtarget->hasFlatScratchInsts()">,
def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">,
AssemblerPredicate<"FeatureGFX9Insts">;
+def HasUnpackedD16VMem : Predicate<"Subtarget->hasUnpackedD16VMem()">,
+ AssemblerPredicate<"FeatureUnpackedD16VMem">;
+def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">,
+ AssemblerPredicate<"!FeatureUnpackedD16VMem">;
+
+def D16PreservesUnusedBits : Predicate<"Subtarget->d16PreservesUnusedBits()">,
+ AssemblerPredicate<"FeatureD16PreservesUnusedBits">;
def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">;
def NotLDSRequiresM0Init : Predicate<"!Subtarget->ldsRequiresM0Init()">;
@@ -733,6 +694,9 @@ def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">,
def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">,
AssemblerPredicate<"FeatureVOP3P">;
+def NotHasVOP3PInsts : Predicate<"!Subtarget->hasVOP3PInsts()">,
+ AssemblerPredicate<"!FeatureVOP3P">;
+
def HasSDWA : Predicate<"Subtarget->hasSDWA()">,
AssemblerPredicate<"FeatureSDWA,FeatureVolcanicIslands">;
@@ -748,38 +712,35 @@ def HasIntClamp : Predicate<"Subtarget->hasIntClamp()">,
def HasMadMixInsts : Predicate<"Subtarget->hasMadMixInsts()">,
AssemblerPredicate<"FeatureMadMixInsts">;
-def EnableLateCFGStructurize : Predicate<
- "EnableLateStructurizeCFG">;
+def HasScalarAtomics : Predicate<"Subtarget->hasScalarAtomics()">,
+ AssemblerPredicate<"FeatureScalarAtomics">;
-// Exists to help track down where SubtargetPredicate isn't set rather
-// than letting tablegen crash with an unhelpful error.
-def InvalidPred : Predicate<"predicate not set on instruction or pattern">;
-
-class PredicateControl {
- Predicate SubtargetPredicate = InvalidPred;
- Predicate SIAssemblerPredicate = isSICI;
- Predicate VIAssemblerPredicate = isVI;
- list<Predicate> AssemblerPredicates = [];
- Predicate AssemblerPredicate = TruePredicate;
- list<Predicate> OtherPredicates = [];
- list<Predicate> Predicates = !listconcat([SubtargetPredicate,
- AssemblerPredicate],
- AssemblerPredicates,
- OtherPredicates);
-}
+def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">;
+def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">;
+def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">,
+ AssemblerPredicate<"FeatureVGPRIndexMode">;
+def HasMovrel : Predicate<"Subtarget->hasMovrel()">,
+ AssemblerPredicate<"FeatureMovrel">;
+
+def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">,
+ AssemblerPredicate<"FeatureFmaMixInsts">;
-class AMDGPUPat<dag pattern, dag result> : Pat<pattern, result>,
- PredicateControl;
+def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">,
+ AssemblerPredicate<"FeatureDLInsts">;
+def EnableLateCFGStructurize : Predicate<
+ "EnableLateStructurizeCFG">;
+
// Include AMDGPU TD files
-include "R600Schedule.td"
-include "R600Processors.td"
include "SISchedule.td"
include "GCNProcessors.td"
include "AMDGPUInstrInfo.td"
include "AMDGPUIntrinsics.td"
+include "SIIntrinsics.td"
include "AMDGPURegisterInfo.td"
include "AMDGPURegisterBanks.td"
include "AMDGPUInstructions.td"
+include "SIInstrInfo.td"
include "AMDGPUCallingConv.td"
+include "AMDGPUSearchableTables.td"
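FeatureD16PreservesUnusedBits, defined above, only states whether d16 memory instructions keep or zero the destination bits they do not write. A minimal C++ model of that behavior, illustrative only; the simulateD16LoadLo helper is hypothetical and assumes the load targets the low half of a 32-bit register:

#include <cstdint>

// Sketch of the semantics described by FeatureD16PreservesUnusedBits: a d16
// load writes a 16-bit value into the low half of a 32-bit VGPR, and the
// unused upper half is either kept or zeroed depending on the subtarget.
uint32_t simulateD16LoadLo(uint32_t OldVGPR, uint16_t Loaded,
                           bool D16PreservesUnusedBits) {
  if (D16PreservesUnusedBits)
    return (OldVGPR & 0xFFFF0000u) | Loaded; // unused bits preserved
  return Loaded;                             // unused bits zeroed
}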
diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
index 392b011e387c..ef4b69d09d9f 100644
--- a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
@@ -61,7 +61,7 @@ AMDGPUAAResult::ASAliasRulesTy::ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Ar
/* Region */ {NoAlias , NoAlias , NoAlias , NoAlias , MayAlias, MayAlias}
};
static const AliasResult ASAliasRulesGenIsZero[6][6] = {
- /* Flat Global Constant Group Region Private */
+ /* Flat Global Region Group Constant Private */
/* Flat */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias},
/* Global */ {MayAlias, MayAlias, NoAlias , NoAlias , NoAlias , NoAlias},
/* Constant */ {MayAlias, NoAlias , MayAlias, NoAlias , NoAlias, NoAlias},
@@ -72,9 +72,9 @@ AMDGPUAAResult::ASAliasRulesTy::ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Ar
assert(AS.MAX_COMMON_ADDRESS <= 5);
if (AS.FLAT_ADDRESS == 0) {
assert(AS.GLOBAL_ADDRESS == 1 &&
- AS.REGION_ADDRESS == 4 &&
+ AS.REGION_ADDRESS == 2 &&
AS.LOCAL_ADDRESS == 3 &&
- AS.CONSTANT_ADDRESS == 2 &&
+ AS.CONSTANT_ADDRESS == 4 &&
AS.PRIVATE_ADDRESS == 5);
ASAliasRules = &ASAliasRulesGenIsZero;
} else {
@@ -115,7 +115,8 @@ bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc,
bool OrLocal) {
const Value *Base = GetUnderlyingObject(Loc.Ptr, DL);
- if (Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS) {
+ if (Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS ||
+ Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS_32BIT) {
return true;
}
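Under the generation-is-zero numbering asserted above (FLAT 0, GLOBAL 1, REGION 2, LOCAL 3, CONSTANT 4, PRIVATE 5, plus the new CONSTANT_ADDRESS_32BIT 6), the widened pointsToConstantMemory test treats both constant address spaces as read-only. A small self-contained sketch of that check, with plain enumerators standing in for the AMDGPUAS fields:

// Illustrative only: address space numbers as asserted in the hunk above.
enum AMDGPUAddrSpace : unsigned {
  FLAT_ADDRESS = 0,
  GLOBAL_ADDRESS = 1,
  REGION_ADDRESS = 2,
  LOCAL_ADDRESS = 3,
  CONSTANT_ADDRESS = 4,
  PRIVATE_ADDRESS = 5,
  CONSTANT_ADDRESS_32BIT = 6
};

// Mirrors the updated pointsToConstantMemory test: both constant address
// spaces are assumed to point at read-only memory.
inline bool pointsToConstantAddrSpace(unsigned AS) {
  return AS == CONSTANT_ADDRESS || AS == CONSTANT_ADDRESS_32BIT;
}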
diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index c27425443abc..d4bbb2c1eb8d 100644
--- a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -14,6 +14,9 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/Cloning.h"
@@ -30,13 +33,18 @@ static cl::opt<bool> StressCalls(
class AMDGPUAlwaysInline : public ModulePass {
bool GlobalOpt;
+ void recursivelyVisitUsers(GlobalValue &GV,
+ SmallPtrSetImpl<Function *> &FuncsToAlwaysInline);
public:
static char ID;
AMDGPUAlwaysInline(bool GlobalOpt = false) :
ModulePass(ID), GlobalOpt(GlobalOpt) { }
bool runOnModule(Module &M) override;
- StringRef getPassName() const override { return "AMDGPU Always Inline Pass"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
};
} // End anonymous namespace
@@ -46,15 +54,53 @@ INITIALIZE_PASS(AMDGPUAlwaysInline, "amdgpu-always-inline",
char AMDGPUAlwaysInline::ID = 0;
+void AMDGPUAlwaysInline::recursivelyVisitUsers(
+ GlobalValue &GV,
+ SmallPtrSetImpl<Function *> &FuncsToAlwaysInline) {
+ SmallVector<User *, 16> Stack;
+
+ SmallPtrSet<const Value *, 8> Visited;
+
+ for (User *U : GV.users())
+ Stack.push_back(U);
+
+ while (!Stack.empty()) {
+ User *U = Stack.pop_back_val();
+ if (!Visited.insert(U).second)
+ continue;
+
+ if (Instruction *I = dyn_cast<Instruction>(U)) {
+ Function *F = I->getParent()->getParent();
+ if (!AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
+ FuncsToAlwaysInline.insert(F);
+ Stack.push_back(F);
+ }
+
+ // No need to look at further users, but we do need to inline any callers.
+ continue;
+ }
+
+ for (User *UU : U->users())
+ Stack.push_back(UU);
+ }
+}
+
bool AMDGPUAlwaysInline::runOnModule(Module &M) {
+ AMDGPUAS AMDGPUAS = AMDGPU::getAMDGPUAS(M);
+
std::vector<GlobalAlias*> AliasesToRemove;
- std::vector<Function *> FuncsToClone;
+
+ SmallPtrSet<Function *, 8> FuncsToAlwaysInline;
+ SmallPtrSet<Function *, 8> FuncsToNoInline;
for (GlobalAlias &A : M.aliases()) {
if (Function* F = dyn_cast<Function>(A.getAliasee())) {
A.replaceAllUsesWith(F);
AliasesToRemove.push_back(&A);
}
+
+ // FIXME: If the aliasee isn't a function, it's some kind of constant expr
+ // cast that won't be inlined through.
}
if (GlobalOpt) {
@@ -63,31 +109,51 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) {
}
}
- auto NewAttr = StressCalls ? Attribute::NoInline : Attribute::AlwaysInline;
- auto IncompatAttr
- = StressCalls ? Attribute::AlwaysInline : Attribute::NoInline;
-
- for (Function &F : M) {
- if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty() &&
- !F.hasFnAttribute(IncompatAttr))
- FuncsToClone.push_back(&F);
- }
-
- for (Function *F : FuncsToClone) {
- ValueToValueMapTy VMap;
- Function *NewFunc = CloneFunction(F, VMap);
- NewFunc->setLinkage(GlobalValue::InternalLinkage);
- F->replaceAllUsesWith(NewFunc);
+ // Always force inlining of any function that uses an LDS global address. This
+ // is something of a workaround because we don't have a way of supporting LDS
+ // objects defined in functions. LDS is always allocated by a kernel, and it
+ // is difficult to manage LDS usage if a function may be used by multiple
+ // kernels.
+ //
+ // OpenCL doesn't allow declaring LDS in non-kernels, so in practice this
+ // should only appear when IPO passes manage to move LDS defined in a kernel
+ // into a single user function.
+
+ for (GlobalVariable &GV : M.globals()) {
+ // TODO: Region address
+ unsigned AS = GV.getType()->getAddressSpace();
+ if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS.REGION_ADDRESS)
+ continue;
+
+ recursivelyVisitUsers(GV, FuncsToAlwaysInline);
}
- for (Function &F : M) {
- if (F.hasLocalLinkage() && !F.hasFnAttribute(IncompatAttr)) {
- F.addFnAttr(NewAttr);
+ if (!AMDGPUTargetMachine::EnableFunctionCalls || StressCalls) {
+ auto IncompatAttr
+ = StressCalls ? Attribute::AlwaysInline : Attribute::NoInline;
+
+ for (Function &F : M) {
+ if (!F.isDeclaration() && !F.use_empty() &&
+ !F.hasFnAttribute(IncompatAttr)) {
+ if (StressCalls) {
+ if (!FuncsToAlwaysInline.count(&F))
+ FuncsToNoInline.insert(&F);
+ } else
+ FuncsToAlwaysInline.insert(&F);
+ }
}
}
- return false;
+
+ for (Function *F : FuncsToAlwaysInline)
+ F->addFnAttr(Attribute::AlwaysInline);
+
+ for (Function *F : FuncsToNoInline)
+ F->addFnAttr(Attribute::NoInline);
+
+ return !FuncsToAlwaysInline.empty() || !FuncsToNoInline.empty();
}
ModulePass *llvm::createAMDGPUAlwaysInlinePass(bool GlobalOpt) {
return new AMDGPUAlwaysInline(GlobalOpt);
}
+
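The traversal added in recursivelyVisitUsers walks up from an LDS global: each instruction user pins its enclosing non-entry function as always-inline, and that function is pushed back on the worklist so its callers get pinned too, all the way up to a kernel. A minimal model of that closure, using hypothetical plain-C++ types rather than the LLVM IR classes:

#include <set>
#include <string>
#include <vector>

// Hypothetical call-graph node; Callers are the functions that reference F.
struct Fn {
  std::string Name;
  bool IsEntryFunction = false;      // kernels are never forced inline
  std::vector<Fn *> Callers;
};

// Marks every non-entry function that reaches the LDS global, directly or
// through a chain of calls, for always-inlining.
void markInlineClosure(const std::vector<Fn *> &DirectLDSUsers,
                       std::set<Fn *> &FuncsToAlwaysInline) {
  std::vector<Fn *> Stack(DirectLDSUsers.begin(), DirectLDSUsers.end());
  std::set<Fn *> Visited;
  while (!Stack.empty()) {
    Fn *F = Stack.back();
    Stack.pop_back();
    if (!Visited.insert(F).second || F->IsEntryFunction)
      continue;
    FuncsToAlwaysInline.insert(F);
    for (Fn *Caller : F->Callers)    // propagate up the call chain
      Stack.push_back(Caller);
  }
}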
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index ce17202f3414..1a70833a4472 100644
--- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -219,7 +219,7 @@ static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
}
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
- const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+ const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
bool HasFlat = ST.hasFlatAddressSpace();
bool HasApertureRegs = ST.hasApertureRegs();
SmallPtrSet<const Constant *, 8> ConstantExprVisited;
diff --git a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
index dcca3a2fab96..7465cf22b5a4 100644
--- a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
@@ -55,9 +55,6 @@ void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const {
<< " DispatchID: " << FI.second.DispatchID
<< " FlatScratchInit: " << FI.second.FlatScratchInit
<< " PrivateSegmentSize: " << FI.second.PrivateSegmentSize
- << " GridWorkgroupCountX: " << FI.second.GridWorkGroupCountX
- << " GridWorkgroupCountY: " << FI.second.GridWorkGroupCountY
- << " GridWorkgroupCountZ: " << FI.second.GridWorkGroupCountZ
<< " WorkGroupIDX: " << FI.second.WorkGroupIDX
<< " WorkGroupIDY: " << FI.second.WorkGroupIDY
<< " WorkGroupIDZ: " << FI.second.WorkGroupIDZ
diff --git a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index bf9635549a8c..f0e6d1b83f15 100644
--- a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -18,7 +18,7 @@ namespace llvm {
class Function;
class raw_ostream;
-class SISubtarget;
+class GCNSubtarget;
class TargetMachine;
class TargetRegisterClass;
class TargetRegisterInfo;
@@ -111,9 +111,6 @@ struct AMDGPUFunctionArgInfo {
ArgDescriptor DispatchID;
ArgDescriptor FlatScratchInit;
ArgDescriptor PrivateSegmentSize;
- ArgDescriptor GridWorkGroupCountX;
- ArgDescriptor GridWorkGroupCountY;
- ArgDescriptor GridWorkGroupCountZ;
// System SGPRs in kernels.
ArgDescriptor WorkGroupIDX;
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index fda6252f46e3..e62e5d52ad74 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1,4 +1,4 @@
-//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assebly printer --------------------===//
+//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer -------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -21,7 +21,9 @@
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "InstPrinter/AMDGPUInstPrinter.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
+#include "R600AsmPrinter.h"
#include "R600Defines.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
@@ -32,7 +34,6 @@
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
@@ -40,6 +41,7 @@
#include "llvm/Support/AMDGPUMetadata.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
using namespace llvm;
using namespace llvm::AMDGPU;
@@ -65,7 +67,7 @@ using namespace llvm::AMDGPU;
// instructions to run at the double precision rate for the device so it's
// probably best to just report no single precision denormals.
static uint32_t getFPMode(const MachineFunction &F) {
- const SISubtarget& ST = F.getSubtarget<SISubtarget>();
+ const GCNSubtarget& ST = F.getSubtarget<GCNSubtarget>();
// TODO: Is there any real use for the flush in only / flush out only modes?
uint32_t FP32Denormals =
@@ -88,7 +90,7 @@ createAMDGPUAsmPrinterPass(TargetMachine &tm,
extern "C" void LLVMInitializeAMDGPUAsmPrinter() {
TargetRegistry::RegisterAsmPrinter(getTheAMDGPUTarget(),
- createAMDGPUAsmPrinterPass);
+ llvm::createR600AsmPrinterPass);
TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(),
createAMDGPUAsmPrinterPass);
}
@@ -114,7 +116,8 @@ AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
}
void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
- if (TM.getTargetTriple().getArch() != Triple::amdgcn)
+ if (IsaInfo::hasCodeObjectV3(getSTI()) &&
+ TM.getTargetTriple().getOS() == Triple::AMDHSA)
return;
if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
@@ -127,10 +130,6 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
readPALMetadata(M);
- // Deprecated notes are not emitted for code object v3.
- if (IsaInfo::hasCodeObjectV3(getSTI()->getFeatureBits()))
- return;
-
// HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2.
if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1);
@@ -142,7 +141,9 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
}
void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
- if (TM.getTargetTriple().getArch() != Triple::amdgcn)
+ // TODO: Add metadata to code object v3.
+ if (IsaInfo::hasCodeObjectV3(getSTI()) &&
+ TM.getTargetTriple().getOS() == Triple::AMDHSA)
return;
// Following code requires TargetStreamer to be present.
@@ -189,37 +190,82 @@ bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
}
void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
- const AMDGPUMachineFunction *MFI = MF->getInfo<AMDGPUMachineFunction>();
- if (!MFI->isEntryFunction())
+ const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
+ if (!MFI.isEntryFunction())
+ return;
+ if (IsaInfo::hasCodeObjectV3(getSTI()) &&
+ TM.getTargetTriple().getOS() == Triple::AMDHSA)
return;
- const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
- amd_kernel_code_t KernelCode;
- if (STM.isAmdCodeObjectV2(*MF)) {
+ const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
+ const Function &F = MF->getFunction();
+ if (STM.isAmdCodeObjectV2(F) &&
+ (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
+ F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
+ amd_kernel_code_t KernelCode;
getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
-
- OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
getTargetStreamer()->EmitAMDKernelCodeT(KernelCode);
}
if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
return;
- HSAMetadataStream.emitKernel(MF->getFunction(),
- getHSACodeProps(*MF, CurrentProgramInfo),
- getHSADebugProps(*MF, CurrentProgramInfo));
+ HSAMetadataStream.emitKernel(*MF, CurrentProgramInfo);
+}
+
+void AMDGPUAsmPrinter::EmitFunctionBodyEnd() {
+ const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
+ if (!MFI.isEntryFunction())
+ return;
+ if (!IsaInfo::hasCodeObjectV3(getSTI()) ||
+ TM.getTargetTriple().getOS() != Triple::AMDHSA)
+ return;
+
+ auto &Streamer = getTargetStreamer()->getStreamer();
+ auto &Context = Streamer.getContext();
+ auto &ObjectFileInfo = *Context.getObjectFileInfo();
+ auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
+
+ Streamer.PushSection();
+ Streamer.SwitchSection(&ReadOnlySection);
+
+ // CP microcode requires the kernel descriptor to be allocated on 64 byte
+ // alignment.
+ Streamer.EmitValueToAlignment(64, 0, 1, 0);
+ if (ReadOnlySection.getAlignment() < 64)
+ ReadOnlySection.setAlignment(64);
+
+ SmallString<128> KernelName;
+ getNameWithPrefix(KernelName, &MF->getFunction());
+ getTargetStreamer()->EmitAmdhsaKernelDescriptor(
+ *getSTI(), KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
+ CurrentProgramInfo.NumVGPRsForWavesPerEU,
+ CurrentProgramInfo.NumSGPRsForWavesPerEU -
+ IsaInfo::getNumExtraSGPRs(getSTI()->getFeatureBits(),
+ CurrentProgramInfo.VCCUsed,
+ CurrentProgramInfo.FlatUsed),
+ CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
+ hasXNACK(*getSTI()));
+
+ Streamer.PopSection();
}
void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
+ if (IsaInfo::hasCodeObjectV3(getSTI()) &&
+ TM.getTargetTriple().getOS() == Triple::AMDHSA) {
+ AsmPrinter::EmitFunctionEntryLabel();
+ return;
+ }
+
const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
- const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
- if (MFI->isEntryFunction() && STM.isAmdCodeObjectV2(*MF)) {
+ const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
+ if (MFI->isEntryFunction() && STM.isAmdCodeObjectV2(MF->getFunction())) {
SmallString<128> SymbolName;
getNameWithPrefix(SymbolName, &MF->getFunction()),
getTargetStreamer()->EmitAMDGPUSymbolType(
SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
}
- const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>();
+ const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>();
if (STI.dumpCode()) {
// Disassemble function name label to text.
DisasmLines.push_back(MF->getName().str() + ":");
@@ -231,7 +277,7 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
}
void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const {
- const AMDGPUSubtarget &STI = MBB.getParent()->getSubtarget<AMDGPUSubtarget>();
+ const GCNSubtarget &STI = MBB.getParent()->getSubtarget<GCNSubtarget>();
if (STI.dumpCode() && !isBlockOnlyReachableByFallthrough(&MBB)) {
// Write a line for the basic block label if it is not only fallthrough.
DisasmLines.push_back(
@@ -283,11 +329,66 @@ void AMDGPUAsmPrinter::emitCommonFunctionComments(
uint32_t NumVGPR,
uint32_t NumSGPR,
uint64_t ScratchSize,
- uint64_t CodeSize) {
+ uint64_t CodeSize,
+ const AMDGPUMachineFunction *MFI) {
OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
+ OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
+ false);
+}
+
+uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
+ const MachineFunction &MF) const {
+ const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+ uint16_t KernelCodeProperties = 0;
+
+ if (MFI.hasPrivateSegmentBuffer()) {
+ KernelCodeProperties |=
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
+ }
+ if (MFI.hasDispatchPtr()) {
+ KernelCodeProperties |=
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
+ }
+ if (MFI.hasQueuePtr()) {
+ KernelCodeProperties |=
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
+ }
+ if (MFI.hasKernargSegmentPtr()) {
+ KernelCodeProperties |=
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
+ }
+ if (MFI.hasDispatchID()) {
+ KernelCodeProperties |=
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
+ }
+ if (MFI.hasFlatScratchInit()) {
+ KernelCodeProperties |=
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
+ }
+
+ return KernelCodeProperties;
+}
+
+amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
+ const MachineFunction &MF,
+ const SIProgramInfo &PI) const {
+ amdhsa::kernel_descriptor_t KernelDescriptor;
+ memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor));
+
+ assert(isUInt<32>(PI.ScratchSize));
+ assert(isUInt<32>(PI.ComputePGMRSrc1));
+ assert(isUInt<32>(PI.ComputePGMRSrc2));
+
+ KernelDescriptor.group_segment_fixed_size = PI.LDSSize;
+ KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
+ KernelDescriptor.compute_pgm_rsrc1 = PI.ComputePGMRSrc1;
+ KernelDescriptor.compute_pgm_rsrc2 = PI.ComputePGMRSrc2;
+ KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
+
+ return KernelDescriptor;
}
bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
@@ -301,32 +402,29 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
SetupMachineFunction(MF);
- const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
MCContext &Context = getObjFileLowering().getContext();
- if (!STM.isAmdHsaOS()) {
+ // FIXME: This should be an explicit check for Mesa.
+ if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
MCSectionELF *ConfigSection =
Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
OutStreamer->SwitchSection(ConfigSection);
}
- if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
- if (MFI->isEntryFunction()) {
- getSIProgramInfo(CurrentProgramInfo, MF);
- } else {
- auto I = CallGraphResourceInfo.insert(
- std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
- SIFunctionResourceInfo &Info = I.first->second;
- assert(I.second && "should only be called once per function");
- Info = analyzeResourceUsage(MF);
- }
-
- if (STM.isAmdPalOS())
- EmitPALMetadata(MF, CurrentProgramInfo);
- if (!STM.isAmdHsaOS()) {
- EmitProgramInfoSI(MF, CurrentProgramInfo);
- }
+ if (MFI->isEntryFunction()) {
+ getSIProgramInfo(CurrentProgramInfo, MF);
} else {
- EmitProgramInfoR600(MF);
+ auto I = CallGraphResourceInfo.insert(
+ std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
+ SIFunctionResourceInfo &Info = I.first->second;
+ assert(I.second && "should only be called once per function");
+ Info = analyzeResourceUsage(MF);
+ }
+
+ if (STM.isAmdPalOS())
+ EmitPALMetadata(MF, CurrentProgramInfo);
+ else if (!STM.isAmdHsaOS()) {
+ EmitProgramInfoSI(MF, CurrentProgramInfo);
}
DisasmLines.clear();
@@ -340,84 +438,74 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
OutStreamer->SwitchSection(CommentSection);
- if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
- if (!MFI->isEntryFunction()) {
- OutStreamer->emitRawComment(" Function info:", false);
- SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()];
- emitCommonFunctionComments(
- Info.NumVGPR,
- Info.getTotalNumSGPRs(MF.getSubtarget<SISubtarget>()),
- Info.PrivateSegmentSize,
- getFunctionCodeSize(MF));
- return false;
- }
-
- OutStreamer->emitRawComment(" Kernel info:", false);
- emitCommonFunctionComments(CurrentProgramInfo.NumVGPR,
- CurrentProgramInfo.NumSGPR,
- CurrentProgramInfo.ScratchSize,
- getFunctionCodeSize(MF));
-
- OutStreamer->emitRawComment(
- " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
- OutStreamer->emitRawComment(
- " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
- OutStreamer->emitRawComment(
- " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
- " bytes/workgroup (compile time only)", false);
-
- OutStreamer->emitRawComment(
- " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
- OutStreamer->emitRawComment(
- " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);
-
- OutStreamer->emitRawComment(
- " NumSGPRsForWavesPerEU: " +
- Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
- OutStreamer->emitRawComment(
- " NumVGPRsForWavesPerEU: " +
- Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
-
- OutStreamer->emitRawComment(
- " ReservedVGPRFirst: " + Twine(CurrentProgramInfo.ReservedVGPRFirst),
- false);
- OutStreamer->emitRawComment(
- " ReservedVGPRCount: " + Twine(CurrentProgramInfo.ReservedVGPRCount),
- false);
-
- if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) {
- OutStreamer->emitRawComment(
- " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
- Twine(CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
- OutStreamer->emitRawComment(
- " DebuggerPrivateSegmentBufferSGPR: s" +
- Twine(CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR), false);
- }
+ if (!MFI->isEntryFunction()) {
+ OutStreamer->emitRawComment(" Function info:", false);
+ SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()];
+ emitCommonFunctionComments(
+ Info.NumVGPR,
+ Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
+ Info.PrivateSegmentSize,
+ getFunctionCodeSize(MF), MFI);
+ return false;
+ }
+ OutStreamer->emitRawComment(" Kernel info:", false);
+ emitCommonFunctionComments(CurrentProgramInfo.NumVGPR,
+ CurrentProgramInfo.NumSGPR,
+ CurrentProgramInfo.ScratchSize,
+ getFunctionCodeSize(MF), MFI);
+
+ OutStreamer->emitRawComment(
+ " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
+ OutStreamer->emitRawComment(
+ " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
+ OutStreamer->emitRawComment(
+ " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
+ " bytes/workgroup (compile time only)", false);
+
+ OutStreamer->emitRawComment(
+ " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
+ OutStreamer->emitRawComment(
+ " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);
+
+ OutStreamer->emitRawComment(
+ " NumSGPRsForWavesPerEU: " +
+ Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
+ OutStreamer->emitRawComment(
+ " NumVGPRsForWavesPerEU: " +
+ Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
+
+ OutStreamer->emitRawComment(
+ " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
+
+ if (MF.getSubtarget<GCNSubtarget>().debuggerEmitPrologue()) {
OutStreamer->emitRawComment(
- " COMPUTE_PGM_RSRC2:USER_SGPR: " +
- Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);
- OutStreamer->emitRawComment(
- " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
- Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false);
+ " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
+ Twine(CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
OutStreamer->emitRawComment(
- " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
- Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
- OutStreamer->emitRawComment(
- " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
- Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
- OutStreamer->emitRawComment(
- " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
- Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
- OutStreamer->emitRawComment(
- " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
- Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),
- false);
- } else {
- R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
- OutStreamer->emitRawComment(
- Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->CFStackSize)));
+ " DebuggerPrivateSegmentBufferSGPR: s" +
+ Twine(CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR), false);
}
+
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:USER_SGPR: " +
+ Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
+ Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false);
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
+ Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
+ Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
+ Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
+ Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),
+ false);
}
if (STM.dumpCode()) {
@@ -440,67 +528,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
return false;
}
-void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
- unsigned MaxGPR = 0;
- bool killPixel = false;
- const R600Subtarget &STM = MF.getSubtarget<R600Subtarget>();
- const R600RegisterInfo *RI = STM.getRegisterInfo();
- const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
-
- for (const MachineBasicBlock &MBB : MF) {
- for (const MachineInstr &MI : MBB) {
- if (MI.getOpcode() == AMDGPU::KILLGT)
- killPixel = true;
- unsigned numOperands = MI.getNumOperands();
- for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
- const MachineOperand &MO = MI.getOperand(op_idx);
- if (!MO.isReg())
- continue;
- unsigned HWReg = RI->getHWRegIndex(MO.getReg());
-
- // Register with value > 127 aren't GPR
- if (HWReg > 127)
- continue;
- MaxGPR = std::max(MaxGPR, HWReg);
- }
- }
- }
-
- unsigned RsrcReg;
- if (STM.getGeneration() >= R600Subtarget::EVERGREEN) {
- // Evergreen / Northern Islands
- switch (MF.getFunction().getCallingConv()) {
- default: LLVM_FALLTHROUGH;
- case CallingConv::AMDGPU_CS: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
- case CallingConv::AMDGPU_GS: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
- case CallingConv::AMDGPU_PS: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break;
- case CallingConv::AMDGPU_VS: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break;
- }
- } else {
- // R600 / R700
- switch (MF.getFunction().getCallingConv()) {
- default: LLVM_FALLTHROUGH;
- case CallingConv::AMDGPU_GS: LLVM_FALLTHROUGH;
- case CallingConv::AMDGPU_CS: LLVM_FALLTHROUGH;
- case CallingConv::AMDGPU_VS: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break;
- case CallingConv::AMDGPU_PS: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break;
- }
- }
-
- OutStreamer->EmitIntValue(RsrcReg, 4);
- OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) |
- S_STACK_SIZE(MFI->CFStackSize), 4);
- OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
- OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
-
- if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
- OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
- OutStreamer->EmitIntValue(alignTo(MFI->getLDSSize(), 4) >> 2, 4);
- }
-}
-
uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
- const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = STM.getInstrInfo();
uint64_t CodeSize = 0;
@@ -510,7 +539,7 @@ uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const
// TODO: CodeSize should account for multiple functions.
// TODO: Should we count size of debug info?
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
CodeSize += TII->getInstSizeInBytes(MI);
@@ -531,30 +560,10 @@ static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
return false;
}
-static unsigned getNumExtraSGPRs(const SISubtarget &ST,
- bool VCCUsed,
- bool FlatScrUsed) {
- unsigned ExtraSGPRs = 0;
- if (VCCUsed)
- ExtraSGPRs = 2;
-
- if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) {
- if (FlatScrUsed)
- ExtraSGPRs = 4;
- } else {
- if (ST.isXNACKEnabled())
- ExtraSGPRs = 4;
-
- if (FlatScrUsed)
- ExtraSGPRs = 6;
- }
-
- return ExtraSGPRs;
-}
-
int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs(
- const SISubtarget &ST) const {
- return NumExplicitSGPR + getNumExtraSGPRs(ST, UsesVCC, UsesFlatScratch);
+ const GCNSubtarget &ST) const {
+ return NumExplicitSGPR + IsaInfo::getNumExtraSGPRs(ST.getFeatureBits(),
+ UsesVCC, UsesFlatScratch);
}
AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
@@ -562,7 +571,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
SIFunctionResourceInfo Info;
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
const MachineRegisterInfo &MRI = MF.getRegInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
@@ -586,6 +595,8 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
Info.PrivateSegmentSize = FrameInfo.getStackSize();
+ if (MFI->isStackRealigned())
+ Info.PrivateSegmentSize += FrameInfo.getMaxAlignment();
Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
@@ -649,7 +660,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
continue;
case AMDGPU::NoRegister:
- assert(MI.isDebugValue());
+ assert(MI.isDebugInstr());
continue;
case AMDGPU::VCC:
@@ -663,6 +674,11 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
case AMDGPU::FLAT_SCR_HI:
continue;
+ case AMDGPU::XNACK_MASK:
+ case AMDGPU::XNACK_MASK_LO:
+ case AMDGPU::XNACK_MASK_HI:
+ llvm_unreachable("xnack_mask registers should not be used");
+
case AMDGPU::TBA:
case AMDGPU::TBA_LO:
case AMDGPU::TBA_HI:
@@ -742,8 +758,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
// conservative guesses.
// 48 SGPRs - vcc, - flat_scr, -xnack
- int MaxSGPRGuess = 47 - getNumExtraSGPRs(ST, true,
- ST.hasFlatAddressSpace());
+ int MaxSGPRGuess =
+ 47 - IsaInfo::getNumExtraSGPRs(ST.getFeatureBits(), true,
+ ST.hasFlatAddressSpace());
MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
MaxVGPR = std::max(MaxVGPR, 23);
@@ -798,15 +815,16 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
MF.getFunction().getContext().diagnose(DiagStackSize);
}
- const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
const SIInstrInfo *TII = STM.getInstrInfo();
const SIRegisterInfo *RI = &TII->getRegisterInfo();
- unsigned ExtraSGPRs = getNumExtraSGPRs(STM,
- ProgInfo.VCCUsed,
- ProgInfo.FlatUsed);
- unsigned ExtraVGPRs = STM.getReservedNumVGPRs(MF);
+ // TODO(scott.linder): The calculations related to SGPR/VGPR blocks are
+ // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
+ // unified.
+ unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
+ STM.getFeatureBits(), ProgInfo.VCCUsed, ProgInfo.FlatUsed);
// Check the addressable register limit before we add ExtraSGPRs.
if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
@@ -827,7 +845,19 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// Account for extra SGPRs and VGPRs reserved for debugger use.
ProgInfo.NumSGPR += ExtraSGPRs;
- ProgInfo.NumVGPR += ExtraVGPRs;
+
+ // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
+ // dispatch registers are function args.
+ unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
+ for (auto &Arg : MF.getFunction().args()) {
+ unsigned NumRegs = (Arg.getType()->getPrimitiveSizeInBits() + 31) / 32;
+ if (Arg.hasAttribute(Attribute::InReg))
+ WaveDispatchNumSGPR += NumRegs;
+ else
+ WaveDispatchNumVGPR += NumRegs;
+ }
+ ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR);
+ ProgInfo.NumVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);
// Adjust number of registers used to meet default/requested minimum/maximum
// number of waves per execution unit request.
@@ -875,19 +905,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
Ctx.diagnose(Diag);
}
- // SGPRBlocks is actual number of SGPR blocks minus 1.
- ProgInfo.SGPRBlocks = alignTo(ProgInfo.NumSGPRsForWavesPerEU,
- STM.getSGPREncodingGranule());
- ProgInfo.SGPRBlocks = ProgInfo.SGPRBlocks / STM.getSGPREncodingGranule() - 1;
-
- // VGPRBlocks is actual number of VGPR blocks minus 1.
- ProgInfo.VGPRBlocks = alignTo(ProgInfo.NumVGPRsForWavesPerEU,
- STM.getVGPREncodingGranule());
- ProgInfo.VGPRBlocks = ProgInfo.VGPRBlocks / STM.getVGPREncodingGranule() - 1;
-
- // Record first reserved VGPR and number of reserved VGPRs.
- ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? ProgInfo.NumVGPR : 0;
- ProgInfo.ReservedVGPRCount = STM.getReservedNumVGPRs(MF);
+ ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks(
+ STM.getFeatureBits(), ProgInfo.NumSGPRsForWavesPerEU);
+ ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks(
+ STM.getFeatureBits(), ProgInfo.NumVGPRsForWavesPerEU);
// Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
// DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
@@ -909,7 +930,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.DX10Clamp = STM.enableDX10Clamp();
unsigned LDSAlignShift;
- if (STM.getGeneration() < SISubtarget::SEA_ISLANDS) {
+ if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
// LDS is allocated in 64 dword blocks.
LDSAlignShift = 8;
} else {
@@ -954,7 +975,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.ComputePGMRSrc2 =
S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
- S_00B84C_TRAP_HANDLER(STM.isTrapHandlerEnabled()) |
+ // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
+ S_00B84C_TRAP_HANDLER(STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled()) |
S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
@@ -981,7 +1003,7 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) {
void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
const SIProgramInfo &CurrentProgramInfo) {
- const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
@@ -1002,26 +1024,21 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
OutStreamer->EmitIntValue(RsrcReg, 4);
OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
- unsigned Rsrc2Val = 0;
if (STM.isVGPRSpillingEnabled(MF.getFunction())) {
OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
- if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
- Rsrc2Val = S_00B84C_SCRATCH_EN(CurrentProgramInfo.ScratchBlocks > 0);
- }
- if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
- OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
- OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4);
- OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
- OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
- Rsrc2Val |= S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks);
- }
- if (Rsrc2Val) {
- OutStreamer->EmitIntValue(RsrcReg + 4 /*rsrc2*/, 4);
- OutStreamer->EmitIntValue(Rsrc2Val, 4);
}
}
+ if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
+ OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
+ OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks), 4);
+ OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
+ OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4);
+ OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
+ OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
+ }
+
OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4);
OutStreamer->EmitIntValue(MFI->getNumSpilledSGPRs(), 4);
OutStreamer->EmitIntValue(R_SPILLED_VGPRS, 4);
@@ -1114,8 +1131,12 @@ static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
const SIProgramInfo &CurrentProgramInfo,
const MachineFunction &MF) const {
+ const Function &F = MF.getFunction();
+ assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
+ F.getCallingConv() == CallingConv::SPIR_KERNEL);
+
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
AMDGPU::initDefaultAMDKernelCodeT(Out, STM.getFeatureBits());
@@ -1151,21 +1172,6 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
if (MFI->hasFlatScratchInit())
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
- if (MFI->hasGridWorkgroupCountX()) {
- Out.code_properties |=
- AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X;
- }
-
- if (MFI->hasGridWorkgroupCountY()) {
- Out.code_properties |=
- AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y;
- }
-
- if (MFI->hasGridWorkgroupCountZ()) {
- Out.code_properties |=
- AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z;
- }
-
if (MFI->hasDispatchPtr())
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
@@ -1175,20 +1181,17 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
if (STM.isXNACKEnabled())
Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
- // FIXME: Should use getKernArgSize
- Out.kernarg_segment_byte_size =
- STM.getKernArgSegmentSize(MF, MFI->getABIArgOffset());
+ unsigned MaxKernArgAlign;
+ Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
- Out.reserved_vgpr_first = CurrentProgramInfo.ReservedVGPRFirst;
- Out.reserved_vgpr_count = CurrentProgramInfo.ReservedVGPRCount;
// These alignment values are specified in powers of two, so alignment =
// 2^n. The minimum alignment is 2^4 = 16.
Out.kernarg_segment_alignment = std::max((size_t)4,
- countTrailingZeros(MFI->getMaxKernArgAlign()));
+ countTrailingZeros(MaxKernArgAlign));
if (STM.debuggerEmitPrologue()) {
Out.debug_wavefront_private_segment_offset_sgpr =
@@ -1198,55 +1201,6 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
}
}
-AMDGPU::HSAMD::Kernel::CodeProps::Metadata AMDGPUAsmPrinter::getHSACodeProps(
- const MachineFunction &MF,
- const SIProgramInfo &ProgramInfo) const {
- const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
- const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
- HSAMD::Kernel::CodeProps::Metadata HSACodeProps;
-
- HSACodeProps.mKernargSegmentSize =
- STM.getKernArgSegmentSize(MF, MFI.getABIArgOffset());
- HSACodeProps.mGroupSegmentFixedSize = ProgramInfo.LDSSize;
- HSACodeProps.mPrivateSegmentFixedSize = ProgramInfo.ScratchSize;
- HSACodeProps.mKernargSegmentAlign =
- std::max(uint32_t(4), MFI.getMaxKernArgAlign());
- HSACodeProps.mWavefrontSize = STM.getWavefrontSize();
- HSACodeProps.mNumSGPRs = CurrentProgramInfo.NumSGPR;
- HSACodeProps.mNumVGPRs = CurrentProgramInfo.NumVGPR;
- HSACodeProps.mMaxFlatWorkGroupSize = MFI.getMaxFlatWorkGroupSize();
- HSACodeProps.mIsDynamicCallStack = ProgramInfo.DynamicCallStack;
- HSACodeProps.mIsXNACKEnabled = STM.isXNACKEnabled();
- HSACodeProps.mNumSpilledSGPRs = MFI.getNumSpilledSGPRs();
- HSACodeProps.mNumSpilledVGPRs = MFI.getNumSpilledVGPRs();
-
- return HSACodeProps;
-}
-
-AMDGPU::HSAMD::Kernel::DebugProps::Metadata AMDGPUAsmPrinter::getHSADebugProps(
- const MachineFunction &MF,
- const SIProgramInfo &ProgramInfo) const {
- const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
- HSAMD::Kernel::DebugProps::Metadata HSADebugProps;
-
- if (!STM.debuggerSupported())
- return HSADebugProps;
-
- HSADebugProps.mDebuggerABIVersion.push_back(1);
- HSADebugProps.mDebuggerABIVersion.push_back(0);
- HSADebugProps.mReservedNumVGPRs = ProgramInfo.ReservedVGPRCount;
- HSADebugProps.mReservedFirstVGPR = ProgramInfo.ReservedVGPRFirst;
-
- if (STM.debuggerEmitPrologue()) {
- HSADebugProps.mPrivateSegmentBufferSGPR =
- ProgramInfo.DebuggerPrivateSegmentBufferSGPR;
- HSADebugProps.mWavefrontPrivateSegmentOffsetSGPR =
- ProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
- }
-
- return HSADebugProps;
-}
-
bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
unsigned AsmVariant,
const char *ExtraCode, raw_ostream &O) {
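The kernarg_segment_alignment field set in getAmdKernelCode above is stored as a base-2 exponent with a minimum of 2^4 = 16 bytes. A standalone sketch of that encoding (not part of the patch; the countTrailingZerosU32 helper stands in for llvm::countTrailingZeros):

// Standalone sketch: how a byte alignment is encoded into the power-of-two
// kernarg_segment_alignment field, clamped to a minimum exponent of 4.
#include <algorithm>
#include <cstdint>
#include <cstdio>

static unsigned countTrailingZerosU32(uint32_t V) {
  unsigned N = 0;
  while (V && !(V & 1)) { V >>= 1; ++N; }
  return N;
}

int main() {
  for (uint32_t Align : {1u, 4u, 8u, 16u, 32u, 256u}) {
    // Alignment is stored as an exponent; the minimum is 2^4 = 16 bytes.
    unsigned Field = std::max(4u, countTrailingZerosU32(Align));
    std::printf("align %3u -> field %u (effective %u bytes)\n",
                Align, Field, 1u << Field);
  }
  return 0;
}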
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 51d48a0c7320..22982d912c70 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief AMDGPU Assembly printer class.
+/// AMDGPU Assembly printer class.
//
//===----------------------------------------------------------------------===//
@@ -17,9 +17,11 @@
#include "AMDGPU.h"
#include "AMDKernelCodeT.h"
-#include "MCTargetDesc/AMDGPUHSAMetadataStreamer.h"
+#include "AMDGPUHSAMetadataStreamer.h"
+#include "SIProgramInfo.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include <cstddef>
#include <cstdint>
#include <limits>
@@ -29,9 +31,10 @@
namespace llvm {
+class AMDGPUMachineFunction;
class AMDGPUTargetStreamer;
class MCOperand;
-class SISubtarget;
+class GCNSubtarget;
class AMDGPUAsmPrinter final : public AsmPrinter {
private:
@@ -47,68 +50,7 @@ private:
bool HasDynamicallySizedStack = false;
bool HasRecursion = false;
- int32_t getTotalNumSGPRs(const SISubtarget &ST) const;
- };
-
- // Track resource usage for kernels / entry functions.
- struct SIProgramInfo {
- // Fields set in PGM_RSRC1 pm4 packet.
- uint32_t VGPRBlocks = 0;
- uint32_t SGPRBlocks = 0;
- uint32_t Priority = 0;
- uint32_t FloatMode = 0;
- uint32_t Priv = 0;
- uint32_t DX10Clamp = 0;
- uint32_t DebugMode = 0;
- uint32_t IEEEMode = 0;
- uint64_t ScratchSize = 0;
-
- uint64_t ComputePGMRSrc1 = 0;
-
- // Fields set in PGM_RSRC2 pm4 packet.
- uint32_t LDSBlocks = 0;
- uint32_t ScratchBlocks = 0;
-
- uint64_t ComputePGMRSrc2 = 0;
-
- uint32_t NumVGPR = 0;
- uint32_t NumSGPR = 0;
- uint32_t LDSSize = 0;
- bool FlatUsed = false;
-
- // Number of SGPRs that meets number of waves per execution unit request.
- uint32_t NumSGPRsForWavesPerEU = 0;
-
- // Number of VGPRs that meets number of waves per execution unit request.
- uint32_t NumVGPRsForWavesPerEU = 0;
-
- // If ReservedVGPRCount is 0 then must be 0. Otherwise, this is the first
- // fixed VGPR number reserved.
- uint16_t ReservedVGPRFirst = 0;
-
- // The number of consecutive VGPRs reserved.
- uint16_t ReservedVGPRCount = 0;
-
- // Fixed SGPR number used to hold wave scratch offset for entire kernel
- // execution, or std::numeric_limits<uint16_t>::max() if the register is not
- // used or not known.
- uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR =
- std::numeric_limits<uint16_t>::max();
-
- // Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire
- // kernel execution, or std::numeric_limits<uint16_t>::max() if the register
- // is not used or not known.
- uint16_t DebuggerPrivateSegmentBufferSGPR =
- std::numeric_limits<uint16_t>::max();
-
- // Whether there is recursion, dynamic allocas, indirect calls or some other
- // reason there may be statically unknown stack usage.
- bool DynamicCallStack = false;
-
- // Bonus information for debugging.
- bool VCCUsed = false;
-
- SIProgramInfo() = default;
+ int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const;
};
SIProgramInfo CurrentProgramInfo;
@@ -128,16 +70,8 @@ private:
unsigned &NumSGPR,
unsigned &NumVGPR) const;
- AMDGPU::HSAMD::Kernel::CodeProps::Metadata getHSACodeProps(
- const MachineFunction &MF,
- const SIProgramInfo &ProgramInfo) const;
- AMDGPU::HSAMD::Kernel::DebugProps::Metadata getHSADebugProps(
- const MachineFunction &MF,
- const SIProgramInfo &ProgramInfo) const;
-
- /// \brief Emit register usage information so that the GPU driver
+ /// Emit register usage information so that the GPU driver
/// can correctly setup the GPU state.
- void EmitProgramInfoR600(const MachineFunction &MF);
void EmitProgramInfoSI(const MachineFunction &MF,
const SIProgramInfo &KernelInfo);
void EmitPALMetadata(const MachineFunction &MF,
@@ -145,7 +79,15 @@ private:
void emitCommonFunctionComments(uint32_t NumVGPR,
uint32_t NumSGPR,
uint64_t ScratchSize,
- uint64_t CodeSize);
+ uint64_t CodeSize,
+ const AMDGPUMachineFunction* MFI);
+
+ uint16_t getAmdhsaKernelCodeProperties(
+ const MachineFunction &MF) const;
+
+ amdhsa::kernel_descriptor_t getAmdhsaKernelDescriptor(
+ const MachineFunction &MF,
+ const SIProgramInfo &PI) const;
public:
explicit AMDGPUAsmPrinter(TargetMachine &TM,
@@ -160,16 +102,16 @@ public:
bool doFinalization(Module &M) override;
bool runOnMachineFunction(MachineFunction &MF) override;
- /// \brief Wrapper for MCInstLowering.lowerOperand() for the tblgen'erated
+ /// Wrapper for MCInstLowering.lowerOperand() for the tblgen'erated
/// pseudo lowering.
bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
- /// \brief Lower the specified LLVM Constant to an MCExpr.
+ /// Lower the specified LLVM Constant to an MCExpr.
/// The AsmPrinter::lowerConstant does not know how to lower
/// addrspacecast, therefore it should be lowered by this function.
const MCExpr *lowerConstant(const Constant *CV) override;
- /// \brief tblgen'erated driver function for lowering simple MI->MC pseudo
+ /// tblgen'erated driver function for lowering simple MI->MC pseudo
/// instructions.
bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
const MachineInstr *MI);
@@ -179,6 +121,8 @@ public:
void EmitFunctionBodyStart() override;
+ void EmitFunctionBodyEnd() override;
+
void EmitFunctionEntryLabel() override;
void EmitBasicBlockStart(const MachineBasicBlock &MBB) const override;
diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 5a9138731934..18c7df0d94f2 100644
--- a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -20,6 +20,7 @@
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -32,13 +33,17 @@ AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
const Value *Val, unsigned VReg) const {
+ // FIXME: Add support for non-void returns.
+ if (Val)
+ return false;
+
MIRBuilder.buildInstr(AMDGPU::S_ENDPGM);
return true;
}
unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
Type *ParamTy,
- unsigned Offset) const {
+ uint64_t Offset) const {
MachineFunction &MF = MIRBuilder.getMF();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -61,7 +66,8 @@ unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
}
void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder,
- Type *ParamTy, unsigned Offset,
+ Type *ParamTy, uint64_t Offset,
+ unsigned Align,
unsigned DstReg) const {
MachineFunction &MF = MIRBuilder.getMF();
const Function &F = MF.getFunction();
@@ -69,7 +75,6 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder,
PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUASI.CONSTANT_ADDRESS);
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
- unsigned Align = DL.getABITypeAlignment(ParamTy);
unsigned PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset);
MachineMemOperand *MMO =
@@ -84,12 +89,16 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder,
bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
const Function &F,
ArrayRef<unsigned> VRegs) const {
+ // AMDGPU_GS and AMDGPU_HS are not supported yet.
+ if (F.getCallingConv() == CallingConv::AMDGPU_GS ||
+ F.getCallingConv() == CallingConv::AMDGPU_HS)
+ return false;
MachineFunction &MF = MIRBuilder.getMF();
- const SISubtarget *Subtarget = static_cast<const SISubtarget *>(&MF.getSubtarget());
+ const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
MachineRegisterInfo &MRI = MF.getRegInfo();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
+ const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
const DataLayout &DL = F.getParent()->getDataLayout();
SmallVector<CCValAssign, 16> ArgLocs;
@@ -116,7 +125,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
if (Info->hasKernargSegmentPtr()) {
unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
- const LLT P2 = LLT::pointer(2, 64);
+ const LLT P2 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
unsigned VReg = MRI.createGenericVirtualRegister(P2);
MRI.addLiveIn(InputPtrReg, VReg);
MIRBuilder.getMBB().addLiveIn(InputPtrReg);
@@ -136,49 +145,106 @@ bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
CCInfo.AllocateReg(FlatScratchInitReg);
}
+ // The infrastructure for normal calling convention lowering is essentially
+ // useless for kernels. We want to avoid any kind of legalization or argument
+ // splitting.
+ if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL) {
+ unsigned i = 0;
+ const unsigned KernArgBaseAlign = 16;
+ const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
+ uint64_t ExplicitArgOffset = 0;
+
+ // TODO: Align down to dword alignment and extract bits for extending loads.
+ for (auto &Arg : F.args()) {
+ Type *ArgTy = Arg.getType();
+ unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
+ if (AllocSize == 0)
+ continue;
+
+ unsigned ABIAlign = DL.getABITypeAlignment(ArgTy);
+
+ uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
+ ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
+
+ unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset);
+ ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy));
+ lowerParameter(MIRBuilder, ArgTy, ArgOffset, Align, VRegs[i]);
+ ++i;
+ }
+
+ return true;
+ }
+
unsigned NumArgs = F.arg_size();
Function::const_arg_iterator CurOrigArg = F.arg_begin();
const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>();
+ unsigned PSInputNum = 0;
+ BitVector Skipped(NumArgs);
for (unsigned i = 0; i != NumArgs; ++i, ++CurOrigArg) {
EVT ValEVT = TLI.getValueType(DL, CurOrigArg->getType());
// We can only handle simple value types at the moment.
- if (!ValEVT.isSimple())
- return false;
- MVT ValVT = ValEVT.getSimpleVT();
ISD::ArgFlagsTy Flags;
ArgInfo OrigArg{VRegs[i], CurOrigArg->getType()};
setArgFlags(OrigArg, i + 1, DL, F);
Flags.setOrigAlign(DL.getABITypeAlignment(CurOrigArg->getType()));
+
+ if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
+ !OrigArg.Flags.isInReg() && !OrigArg.Flags.isByVal() &&
+ PSInputNum <= 15) {
+ if (CurOrigArg->use_empty() && !Info->isPSInputAllocated(PSInputNum)) {
+ Skipped.set(i);
+ ++PSInputNum;
+ continue;
+ }
+
+ Info->markPSInputAllocated(PSInputNum);
+ if (!CurOrigArg->use_empty())
+ Info->markPSInputEnabled(PSInputNum);
+
+ ++PSInputNum;
+ }
+
CCAssignFn *AssignFn = CCAssignFnForCall(F.getCallingConv(),
/*IsVarArg=*/false);
- bool Res =
- AssignFn(i, ValVT, ValVT, CCValAssign::Full, OrigArg.Flags, CCInfo);
- // Fail if we don't know how to handle this type.
- if (Res)
- return false;
+ if (ValEVT.isVector()) {
+ EVT ElemVT = ValEVT.getVectorElementType();
+ if (!ValEVT.isSimple())
+ return false;
+ MVT ValVT = ElemVT.getSimpleVT();
+ bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full,
+ OrigArg.Flags, CCInfo);
+ if (!Res)
+ return false;
+ } else {
+ MVT ValVT = ValEVT.getSimpleVT();
+ if (!ValEVT.isSimple())
+ return false;
+ bool Res =
+ AssignFn(i, ValVT, ValVT, CCValAssign::Full, OrigArg.Flags, CCInfo);
+
+ // Fail if we don't know how to handle this type.
+ if (Res)
+ return false;
+ }
}
Function::const_arg_iterator Arg = F.arg_begin();
- if (F.getCallingConv() == CallingConv::AMDGPU_VS) {
- for (unsigned i = 0; i != NumArgs; ++i, ++Arg) {
- CCValAssign &VA = ArgLocs[i];
- MRI.addLiveIn(VA.getLocReg(), VRegs[i]);
+ if (F.getCallingConv() == CallingConv::AMDGPU_VS ||
+ F.getCallingConv() == CallingConv::AMDGPU_PS) {
+ for (unsigned i = 0, OrigArgIdx = 0;
+ OrigArgIdx != NumArgs && i != ArgLocs.size(); ++Arg, ++OrigArgIdx) {
+ if (Skipped.test(OrigArgIdx))
+ continue;
+ CCValAssign &VA = ArgLocs[i++];
+ MRI.addLiveIn(VA.getLocReg(), VRegs[OrigArgIdx]);
MIRBuilder.getMBB().addLiveIn(VA.getLocReg());
- MIRBuilder.buildCopy(VRegs[i], VA.getLocReg());
+ MIRBuilder.buildCopy(VRegs[OrigArgIdx], VA.getLocReg());
}
return true;
}
- for (unsigned i = 0; i != NumArgs; ++i, ++Arg) {
- // FIXME: We should be getting DebugInfo from the arguments some how.
- CCValAssign &VA = ArgLocs[i];
- lowerParameter(MIRBuilder, Arg->getType(),
- VA.getLocMemOffset() +
- Subtarget->getExplicitKernelArgOffset(MF), VRegs[i]);
- }
-
- return true;
+ return false;
}
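In the AMDGPU_KERNEL path added to lowerFormalArguments above, each argument offset is the ABI-aligned running offset plus the subtarget's explicit base offset, and the load alignment is whatever the 16-byte kernarg segment alignment still guarantees at that offset. A standalone sketch of that bookkeeping (not part of the patch), with simplified alignTo/minAlign helpers and a made-up (size, ABI align) argument list:

// Standalone sketch of the kernarg offset/alignment bookkeeping, assuming a
// 16-byte aligned kernarg segment and a hypothetical argument list.
#include <cstdint>
#include <cstdio>
#include <vector>

static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

// Largest power of two dividing both A and B (what llvm::MinAlign computes).
static uint64_t minAlign(uint64_t A, uint64_t B) {
  uint64_t V = A | B;
  return V & (~V + 1);
}

int main() {
  const uint64_t KernArgBaseAlign = 16; // segment base alignment
  const uint64_t BaseOffset = 0;        // getExplicitKernelArgOffset(F)
  struct Arg { uint64_t Size, ABIAlign; };
  std::vector<Arg> Args = {{4, 4}, {8, 8}, {2, 2}, {16, 16}};

  uint64_t ExplicitArgOffset = 0;
  for (const Arg &A : Args) {
    uint64_t ArgOffset = alignTo(ExplicitArgOffset, A.ABIAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, A.ABIAlign) + A.Size;
    // The load may only assume the alignment implied by the segment base
    // alignment and the offset into the segment.
    uint64_t LoadAlign = minAlign(KernArgBaseAlign, ArgOffset);
    std::printf("size %2llu -> offset %2llu, load align %2llu\n",
                (unsigned long long)A.Size, (unsigned long long)ArgOffset,
                (unsigned long long)LoadAlign);
  }
  return 0;
}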
diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.h b/lib/Target/AMDGPU/AMDGPUCallLowering.h
index 251cb7a2c440..f51cb6abbf65 100644
--- a/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -26,10 +26,11 @@ class AMDGPUCallLowering: public CallLowering {
AMDGPUAS AMDGPUASI;
unsigned lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy,
- unsigned Offset) const;
+ uint64_t Offset) const;
void lowerParameter(MachineIRBuilder &MIRBuilder, Type *ParamTy,
- unsigned Offset, unsigned DstReg) const;
+ uint64_t Offset, unsigned Align,
+ unsigned DstReg) const;
public:
AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);
diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td
index c1c066fd1404..68bc7fdd9961 100644
--- a/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -85,22 +85,6 @@ def RetCC_SI_Shader : CallingConv<[
]>>
]>;
-// Calling convention for R600
-def CC_R600 : CallingConv<[
- CCIfInReg<CCIfType<[v4f32, v4i32] , CCAssignToReg<[
- T0_XYZW, T1_XYZW, T2_XYZW, T3_XYZW, T4_XYZW, T5_XYZW, T6_XYZW, T7_XYZW,
- T8_XYZW, T9_XYZW, T10_XYZW, T11_XYZW, T12_XYZW, T13_XYZW, T14_XYZW, T15_XYZW,
- T16_XYZW, T17_XYZW, T18_XYZW, T19_XYZW, T20_XYZW, T21_XYZW, T22_XYZW,
- T23_XYZW, T24_XYZW, T25_XYZW, T26_XYZW, T27_XYZW, T28_XYZW, T29_XYZW,
- T30_XYZW, T31_XYZW, T32_XYZW
- ]>>>
-]>;
-
-// Calling convention for compute kernels
-def CC_AMDGPU_Kernel : CallingConv<[
- CCCustom<"allocateKernArg">
-]>;
-
def CSR_AMDGPU_VGPRs_24_255 : CalleeSavedRegs<
(sequence "VGPR%u", 24, 255)
>;
@@ -127,7 +111,7 @@ def CC_AMDGPU_Func : CallingConv<[
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
- CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>,
+ CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>,
CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
@@ -144,30 +128,16 @@ def RetCC_AMDGPU_Func : CallingConv<[
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
- CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>
+ CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>
]>;
def CC_AMDGPU : CallingConv<[
- CCIf<"static_cast<const AMDGPUSubtarget&>"
- "(State.getMachineFunction().getSubtarget()).getGeneration() >="
- "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
- "!AMDGPU::isShader(State.getCallingConv())",
- CCDelegateTo<CC_AMDGPU_Kernel>>,
- CCIf<"static_cast<const AMDGPUSubtarget&>"
- "(State.getMachineFunction().getSubtarget()).getGeneration() < "
- "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
- "!AMDGPU::isShader(State.getCallingConv())",
- CCDelegateTo<CC_AMDGPU_Kernel>>,
- CCIf<"static_cast<const AMDGPUSubtarget&>"
+ CCIf<"static_cast<const GCNSubtarget&>"
"(State.getMachineFunction().getSubtarget()).getGeneration() >= "
"AMDGPUSubtarget::SOUTHERN_ISLANDS",
CCDelegateTo<CC_SI>>,
- CCIf<"static_cast<const AMDGPUSubtarget&>"
+ CCIf<"static_cast<const GCNSubtarget&>"
"(State.getMachineFunction().getSubtarget()).getGeneration() >= "
"AMDGPUSubtarget::SOUTHERN_ISLANDS && State.getCallingConv() == CallingConv::C",
- CCDelegateTo<CC_AMDGPU_Func>>,
- CCIf<"static_cast<const AMDGPUSubtarget&>"
- "(State.getMachineFunction().getSubtarget()).getGeneration() < "
- "AMDGPUSubtarget::SOUTHERN_ISLANDS",
- CCDelegateTo<CC_R600>>
+ CCDelegateTo<CC_AMDGPU_Func>>
]>;
diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index b17b67167666..5713b7b7f9a8 100644
--- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -17,8 +17,10 @@
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
@@ -48,15 +50,22 @@ using namespace llvm;
namespace {
+static cl::opt<bool> WidenLoads(
+ "amdgpu-codegenprepare-widen-constant-loads",
+ cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
+ cl::ReallyHidden,
+ cl::init(true));
+
class AMDGPUCodeGenPrepare : public FunctionPass,
public InstVisitor<AMDGPUCodeGenPrepare, bool> {
- const SISubtarget *ST = nullptr;
+ const GCNSubtarget *ST = nullptr;
+ AssumptionCache *AC = nullptr;
DivergenceAnalysis *DA = nullptr;
Module *Mod = nullptr;
bool HasUnsafeFPMath = false;
AMDGPUAS AMDGPUASI;
- /// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to
+ /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
/// binary operation \p V.
///
/// \returns Binary operation \p V.
@@ -80,7 +89,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
/// false otherwise.
bool needsPromotionToI32(const Type *T) const;
- /// \brief Promotes uniform binary operation \p I to equivalent 32 bit binary
+ /// Promotes uniform binary operation \p I to equivalent 32 bit binary
/// operation.
///
/// \details \p I's base element bit width must be greater than 1 and less
@@ -93,7 +102,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
/// false otherwise.
bool promoteUniformOpToI32(BinaryOperator &I) const;
- /// \brief Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
+ /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
///
/// \details \p I's base element bit width must be greater than 1 and less
/// than or equal 16. Promotion is done by sign or zero extending operands to
@@ -102,7 +111,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
/// \returns True.
bool promoteUniformOpToI32(ICmpInst &I) const;
- /// \brief Promotes uniform 'select' operation \p I to 32 bit 'select'
+ /// Promotes uniform 'select' operation \p I to 32 bit 'select'
/// operation.
///
/// \details \p I's base element bit width must be greater than 1 and less
@@ -113,7 +122,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
/// \returns True.
bool promoteUniformOpToI32(SelectInst &I) const;
- /// \brief Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
+ /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
/// intrinsic.
///
/// \details \p I's base element bit width must be greater than 1 and less
@@ -125,7 +134,17 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
///
/// \returns True.
bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
- /// \brief Widen a scalar load.
+
+ /// Expands 24 bit div or rem.
+ Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
+ Value *Num, Value *Den,
+ bool IsDiv, bool IsSigned) const;
+
+ /// Expands 32 bit div or rem.
+ Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
+ Value *Num, Value *Den) const;
+
+ /// Widen a scalar load.
///
/// \details Widen a scalar load for uniform, small type loads from constant
//  memory to a full 32 bits and then truncate the input to allow a scalar
@@ -157,6 +176,7 @@ public:
StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<DivergenceAnalysis>();
AU.setPreservesAll();
}
@@ -250,7 +270,9 @@ bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
"I does not need promotion to i32");
if (I.getOpcode() == Instruction::SDiv ||
- I.getOpcode() == Instruction::UDiv)
+ I.getOpcode() == Instruction::UDiv ||
+ I.getOpcode() == Instruction::SRem ||
+ I.getOpcode() == Instruction::URem)
return false;
IRBuilder<> Builder(&I);
@@ -372,13 +394,18 @@ bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
return true;
}
-static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
+static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
if (!CNum)
- return false;
+ return HasDenormals;
+
+ if (UnsafeDiv)
+ return true;
+
+ bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);
// Reciprocal f32 is handled separately without denormals.
- return UnsafeDiv || CNum->isExactlyValue(+1.0);
+ return HasDenormals ^ IsOne;
}
// Insert an intrinsic for fast fdiv for safe math situations where we can
@@ -404,7 +431,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
FMF.allowReciprocal();
// With UnsafeDiv node will be optimized to just rcp and mul.
- if (ST->hasFP32Denormals() || UnsafeDiv)
+ if (UnsafeDiv)
return false;
IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
@@ -418,6 +445,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
Value *NewFDiv = nullptr;
+ bool HasDenormals = ST->hasFP32Denormals();
if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
NewFDiv = UndefValue::get(VT);
@@ -428,7 +456,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
Value *DenEltI = Builder.CreateExtractElement(Den, I);
Value *NewElt;
- if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
+ if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasDenormals)) {
NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
} else {
NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
@@ -437,7 +465,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
}
} else {
- if (!shouldKeepFDivF32(Num, UnsafeDiv))
+ if (!shouldKeepFDivF32(Num, UnsafeDiv, HasDenormals))
NewFDiv = Builder.CreateCall(Decl, { Num, Den });
}
@@ -447,7 +475,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
FDiv.eraseFromParent();
}
- return true;
+ return !!NewFDiv;
}
static bool hasUnsafeFPMath(const Function &F) {
@@ -455,18 +483,324 @@ static bool hasUnsafeFPMath(const Function &F) {
return Attr.getValueAsString() == "true";
}
+static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
+ Value *LHS, Value *RHS) {
+ Type *I32Ty = Builder.getInt32Ty();
+ Type *I64Ty = Builder.getInt64Ty();
+
+ Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
+ Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
+ Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
+ Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
+ Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
+ Hi = Builder.CreateTrunc(Hi, I32Ty);
+ return std::make_pair(Lo, Hi);
+}
+
+static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
+ return getMul64(Builder, LHS, RHS).second;
+}
+
+// The fractional part of a float is enough to accurately represent up to
+// a 24-bit signed integer.
+Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
+ BinaryOperator &I,
+ Value *Num, Value *Den,
+ bool IsDiv, bool IsSigned) const {
+ assert(Num->getType()->isIntegerTy(32));
+
+ const DataLayout &DL = Mod->getDataLayout();
+ unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
+ if (LHSSignBits < 9)
+ return nullptr;
+
+ unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
+ if (RHSSignBits < 9)
+ return nullptr;
+
+
+ unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
+ unsigned DivBits = 32 - SignBits;
+ if (IsSigned)
+ ++DivBits;
+
+ Type *Ty = Num->getType();
+ Type *I32Ty = Builder.getInt32Ty();
+ Type *F32Ty = Builder.getFloatTy();
+ ConstantInt *One = Builder.getInt32(1);
+ Value *JQ = One;
+
+ if (IsSigned) {
+ // char|short jq = ia ^ ib;
+ JQ = Builder.CreateXor(Num, Den);
+
+ // jq = jq >> (bitsize - 2)
+ JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));
+
+ // jq = jq | 0x1
+ JQ = Builder.CreateOr(JQ, One);
+ }
+
+ // int ia = (int)LHS;
+ Value *IA = Num;
+
+ // int ib = (int)RHS;
+ Value *IB = Den;
+
+ // float fa = (float)ia;
+ Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
+ : Builder.CreateUIToFP(IA, F32Ty);
+
+ // float fb = (float)ib;
+ Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty)
+ : Builder.CreateUIToFP(IB,F32Ty);
+
+ Value *RCP = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), FB);
+ Value *FQM = Builder.CreateFMul(FA, RCP);
+
+ // fq = trunc(fqm);
+ CallInst* FQ = Builder.CreateIntrinsic(Intrinsic::trunc, { FQM });
+ FQ->copyFastMathFlags(Builder.getFastMathFlags());
+
+ // float fqneg = -fq;
+ Value *FQNeg = Builder.CreateFNeg(FQ);
+
+ // float fr = mad(fqneg, fb, fa);
+ Value *FR = Builder.CreateIntrinsic(Intrinsic::amdgcn_fmad_ftz,
+ { FQNeg, FB, FA }, FQ);
+
+ // int iq = (int)fq;
+ Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
+ : Builder.CreateFPToUI(FQ, I32Ty);
+
+ // fr = fabs(fr);
+ FR = Builder.CreateIntrinsic(Intrinsic::fabs, { FR }, FQ);
+
+ // fb = fabs(fb);
+ FB = Builder.CreateIntrinsic(Intrinsic::fabs, { FB }, FQ);
+
+ // int cv = fr >= fb;
+ Value *CV = Builder.CreateFCmpOGE(FR, FB);
+
+ // jq = (cv ? jq : 0);
+ JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));
+
+ // dst = iq + jq;
+ Value *Div = Builder.CreateAdd(IQ, JQ);
+
+ Value *Res = Div;
+ if (!IsDiv) {
+ // Rem needs compensation, it's easier to recompute it
+ Value *Rem = Builder.CreateMul(Div, Den);
+ Res = Builder.CreateSub(Num, Rem);
+ }
+
+ // Truncate to number of bits this divide really is.
+ if (IsSigned) {
+ Res = Builder.CreateTrunc(Res, Builder.getIntNTy(DivBits));
+ Res = Builder.CreateSExt(Res, Ty);
+ } else {
+ ConstantInt *TruncMask = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
+ Res = Builder.CreateAnd(Res, TruncMask);
+ }
+
+ return Res;
+}
+
+Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
+ BinaryOperator &I,
+ Value *Num, Value *Den) const {
+ Instruction::BinaryOps Opc = I.getOpcode();
+ assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
+ Opc == Instruction::SRem || Opc == Instruction::SDiv);
+
+ FastMathFlags FMF;
+ FMF.setFast();
+ Builder.setFastMathFlags(FMF);
+
+ if (isa<Constant>(Den))
+ return nullptr; // Keep it for optimization
+
+ bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
+ bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
+
+ Type *Ty = Num->getType();
+ Type *I32Ty = Builder.getInt32Ty();
+ Type *F32Ty = Builder.getFloatTy();
+
+ if (Ty->getScalarSizeInBits() < 32) {
+ if (IsSigned) {
+ Num = Builder.CreateSExt(Num, I32Ty);
+ Den = Builder.CreateSExt(Den, I32Ty);
+ } else {
+ Num = Builder.CreateZExt(Num, I32Ty);
+ Den = Builder.CreateZExt(Den, I32Ty);
+ }
+ }
+
+ if (Value *Res = expandDivRem24(Builder, I, Num, Den, IsDiv, IsSigned)) {
+ Res = Builder.CreateTrunc(Res, Ty);
+ return Res;
+ }
+
+ ConstantInt *Zero = Builder.getInt32(0);
+ ConstantInt *One = Builder.getInt32(1);
+ ConstantInt *MinusOne = Builder.getInt32(~0);
+
+ Value *Sign = nullptr;
+ if (IsSigned) {
+ ConstantInt *K31 = Builder.getInt32(31);
+ Value *LHSign = Builder.CreateAShr(Num, K31);
+ Value *RHSign = Builder.CreateAShr(Den, K31);
+ // Remainder sign is the same as LHS
+ Sign = IsDiv ? Builder.CreateXor(LHSign, RHSign) : LHSign;
+
+ Num = Builder.CreateAdd(Num, LHSign);
+ Den = Builder.CreateAdd(Den, RHSign);
+
+ Num = Builder.CreateXor(Num, LHSign);
+ Den = Builder.CreateXor(Den, RHSign);
+ }
+
+ // RCP = URECIP(Den) = 2^32 / Den + e
+ // e is rounding error.
+ Value *DEN_F32 = Builder.CreateUIToFP(Den, F32Ty);
+ Value *RCP_F32 = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), DEN_F32);
+ Constant *UINT_MAX_PLUS_1 = ConstantFP::get(F32Ty, BitsToFloat(0x4f800000));
+ Value *RCP_SCALE = Builder.CreateFMul(RCP_F32, UINT_MAX_PLUS_1);
+ Value *RCP = Builder.CreateFPToUI(RCP_SCALE, I32Ty);
+
+ // RCP_LO, RCP_HI = mul(RCP, Den)
+ Value *RCP_LO, *RCP_HI;
+ std::tie(RCP_LO, RCP_HI) = getMul64(Builder, RCP, Den);
+
+ // NEG_RCP_LO = -RCP_LO
+ Value *NEG_RCP_LO = Builder.CreateNeg(RCP_LO);
+
+ // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
+ Value *RCP_HI_0_CC = Builder.CreateICmpEQ(RCP_HI, Zero);
+ Value *ABS_RCP_LO = Builder.CreateSelect(RCP_HI_0_CC, NEG_RCP_LO, RCP_LO);
+
+ // Calculate the rounding error from the URECIP instruction
+ // E = mulhu(ABS_RCP_LO, RCP)
+ Value *E = getMulHu(Builder, ABS_RCP_LO, RCP);
+
+ // RCP_A_E = RCP + E
+ Value *RCP_A_E = Builder.CreateAdd(RCP, E);
+
+ // RCP_S_E = RCP - E
+ Value *RCP_S_E = Builder.CreateSub(RCP, E);
+
+ // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
+ Value *Tmp0 = Builder.CreateSelect(RCP_HI_0_CC, RCP_A_E, RCP_S_E);
+
+ // Quotient = mulhu(Tmp0, Num)
+ Value *Quotient = getMulHu(Builder, Tmp0, Num);
+
+ // Num_S_Remainder = Quotient * Den
+ Value *Num_S_Remainder = Builder.CreateMul(Quotient, Den);
+
+ // Remainder = Num - Num_S_Remainder
+ Value *Remainder = Builder.CreateSub(Num, Num_S_Remainder);
+
+ // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
+ Value *Rem_GE_Den_CC = Builder.CreateICmpUGE(Remainder, Den);
+ Value *Remainder_GE_Den = Builder.CreateSelect(Rem_GE_Den_CC, MinusOne, Zero);
+
+ // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
+ Value *Num_GE_Num_S_Rem_CC = Builder.CreateICmpUGE(Num, Num_S_Remainder);
+ Value *Remainder_GE_Zero = Builder.CreateSelect(Num_GE_Num_S_Rem_CC,
+ MinusOne, Zero);
+
+ // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
+ Value *Tmp1 = Builder.CreateAnd(Remainder_GE_Den, Remainder_GE_Zero);
+ Value *Tmp1_0_CC = Builder.CreateICmpEQ(Tmp1, Zero);
+
+ Value *Res;
+ if (IsDiv) {
+ // Quotient_A_One = Quotient + 1
+ Value *Quotient_A_One = Builder.CreateAdd(Quotient, One);
+
+ // Quotient_S_One = Quotient - 1
+ Value *Quotient_S_One = Builder.CreateSub(Quotient, One);
+
+ // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
+ Value *Div = Builder.CreateSelect(Tmp1_0_CC, Quotient, Quotient_A_One);
+
+ // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
+ Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Div, Quotient_S_One);
+ } else {
+ // Remainder_S_Den = Remainder - Den
+ Value *Remainder_S_Den = Builder.CreateSub(Remainder, Den);
+
+ // Remainder_A_Den = Remainder + Den
+ Value *Remainder_A_Den = Builder.CreateAdd(Remainder, Den);
+
+ // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
+ Value *Rem = Builder.CreateSelect(Tmp1_0_CC, Remainder, Remainder_S_Den);
+
+ // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
+ Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Rem, Remainder_A_Den);
+ }
+
+ if (IsSigned) {
+ Res = Builder.CreateXor(Res, Sign);
+ Res = Builder.CreateSub(Res, Sign);
+ }
+
+ Res = Builder.CreateTrunc(Res, Ty);
+
+ return Res;
+}
+
bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
+ if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
+ DA->isUniform(&I) && promoteUniformOpToI32(I))
+ return true;
+
bool Changed = false;
+ Instruction::BinaryOps Opc = I.getOpcode();
+ Type *Ty = I.getType();
+ Value *NewDiv = nullptr;
+ if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
+ Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
+ Ty->getScalarSizeInBits() <= 32) {
+ Value *Num = I.getOperand(0);
+ Value *Den = I.getOperand(1);
+ IRBuilder<> Builder(&I);
+ Builder.SetCurrentDebugLocation(I.getDebugLoc());
- if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
- DA->isUniform(&I))
- Changed |= promoteUniformOpToI32(I);
+ if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
+ NewDiv = UndefValue::get(VT);
+
+ for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
+ Value *NumEltN = Builder.CreateExtractElement(Num, N);
+ Value *DenEltN = Builder.CreateExtractElement(Den, N);
+ Value *NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
+ if (!NewElt)
+ NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
+ NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
+ }
+ } else {
+ NewDiv = expandDivRem32(Builder, I, Num, Den);
+ }
+
+ if (NewDiv) {
+ I.replaceAllUsesWith(NewDiv);
+ I.eraseFromParent();
+ Changed = true;
+ }
+ }
return Changed;
}
-bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
- if (I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
+bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
+ if (!WidenLoads)
+ return false;
+
+ if ((I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+ I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
canWidenScalarExtLoad(I)) {
IRBuilder<> Builder(&I);
Builder.SetCurrentDebugLocation(I.getDebugLoc());
@@ -474,7 +808,28 @@ bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
Type *I32Ty = Builder.getInt32Ty();
Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
Value *BitCast= Builder.CreateBitCast(I.getPointerOperand(), PT);
- Value *WidenLoad = Builder.CreateLoad(BitCast);
+ LoadInst *WidenLoad = Builder.CreateLoad(BitCast);
+ WidenLoad->copyMetadata(I);
+
+ // If we have range metadata, we need to convert the type, and not make
+ // assumptions about the high bits.
+ if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
+ ConstantInt *Lower =
+ mdconst::extract<ConstantInt>(Range->getOperand(0));
+
+ if (Lower->getValue().isNullValue()) {
+ WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
+ } else {
+ Metadata *LowAndHigh[] = {
+ ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
+ // Don't make assumptions about the high bits.
+ ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
+ };
+
+ WidenLoad->setMetadata(LLVMContext::MD_range,
+ MDNode::get(Mod->getContext(), LowAndHigh));
+ }
+ }
int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
Type *IntNTy = Builder.getIntNTy(TySize);
@@ -540,10 +895,12 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
if (!TPC)
return false;
- const TargetMachine &TM = TPC->getTM<TargetMachine>();
- ST = &TM.getSubtarget<SISubtarget>(F);
+ const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
+ ST = &TM.getSubtarget<GCNSubtarget>(F);
+ AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
DA = &getAnalysis<DivergenceAnalysis>();
HasUnsafeFPMath = hasUnsafeFPMath(F);
+ AMDGPUASI = TM.getAMDGPUAS();
bool MadeChange = false;
@@ -560,6 +917,7 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
"AMDGPU IR optimizations", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
false, false)
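expandDivRem24 above depends on a float's 24-bit significand representing small integers exactly, so a quotient of at most 24 bits can be formed in single precision and then corrected. A standalone check of that precision claim (not the full division algorithm, and not part of the patch):

// Standalone check: every integer of magnitude up to 2^24 converts to float
// and back without loss; 2^24 + 1 is the first one that does not.
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t FirstInexact = 0;
  for (uint32_t I = 0; I <= (1u << 24) + 1; ++I) {
    if (static_cast<uint32_t>(static_cast<float>(I)) != I) {
      FirstInexact = I;
      break;
    }
  }
  std::printf("first integer not exactly representable as float: %u\n",
              FirstInexact); // prints 16777217, i.e. 2^24 + 1
  return 0;
}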
diff --git a/lib/Target/AMDGPU/AMDGPUFeatures.td b/lib/Target/AMDGPU/AMDGPUFeatures.td
new file mode 100644
index 000000000000..b375cae9018e
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUFeatures.td
@@ -0,0 +1,60 @@
+//===-- AMDGPUFeatures.td - AMDGPU Feature Definitions -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def FeatureFP64 : SubtargetFeature<"fp64",
+ "FP64",
+ "true",
+ "Enable double precision operations"
+>;
+
+def FeatureFMA : SubtargetFeature<"fmaf",
+ "FMA",
+ "true",
+ "Enable single precision FMA (not as fast as mul+add, but fused)"
+>;
+
+class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature<
+ "localmemorysize"#Value,
+ "LocalMemorySize",
+ !cast<string>(Value),
+ "The size of local memory in bytes"
+>;
+
+def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>;
+def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>;
+def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>;
+
+class SubtargetFeatureWavefrontSize <int Value> : SubtargetFeature<
+ "wavefrontsize"#Value,
+ "WavefrontSize",
+ !cast<string>(Value),
+ "The number of threads per wavefront"
+>;
+
+def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>;
+def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>;
+def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>;
+
+class SubtargetFeatureGeneration <string Value, string Subtarget,
+ list<SubtargetFeature> Implies> :
+ SubtargetFeature <Value, "Gen", Subtarget#"::"#Value,
+ Value#" GPU generation", Implies>;
+
+def FeatureDX10Clamp : SubtargetFeature<"dx10-clamp",
+ "DX10Clamp",
+ "true",
+ "clamp modifier clamps NaNs to 0.0"
+>;
+
+def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca",
+ "EnablePromoteAlloca",
+ "true",
+ "Enable promote alloca pass"
+>;
+
diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/lib/Target/AMDGPU/AMDGPUFrameLowering.h
index 91fe921bfeec..ee836bf8a631 100644
--- a/lib/Target/AMDGPU/AMDGPUFrameLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Interface to describe a layout of a stack frame on an AMDGPU target.
+/// Interface to describe a layout of a stack frame on an AMDGPU target.
//
//===----------------------------------------------------------------------===//
@@ -19,7 +19,7 @@
namespace llvm {
-/// \brief Information about the stack frame layout on the AMDGPU targets.
+/// Information about the stack frame layout on the AMDGPU targets.
///
/// It holds the direction of the stack growth, the known stack alignment on
/// entry to each function, and the offset to the locals area.
diff --git a/lib/Target/AMDGPU/AMDGPUGISel.td b/lib/Target/AMDGPU/AMDGPUGISel.td
new file mode 100644
index 000000000000..ba735390f679
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -0,0 +1,138 @@
+//===-- AMDGPUGIsel.td - AMDGPU GlobalISel Patterns---------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file contains patterns that should only be used by GlobalISel. For
+// example, patterns for V_* instructions that have S_* equivalents.
+// SelectionDAG does not support selecting V_* instructions.
+//===----------------------------------------------------------------------===//
+
+include "AMDGPU.td"
+
+def sd_vsrc0 : ComplexPattern<i32, 1, "">;
+def gi_vsrc0 :
+ GIComplexOperandMatcher<s32, "selectVSRC0">,
+ GIComplexPatternEquiv<sd_vsrc0>;
+
+def sd_vcsrc : ComplexPattern<i32, 1, "">;
+def gi_vcsrc :
+ GIComplexOperandMatcher<s32, "selectVCSRC">,
+ GIComplexPatternEquiv<sd_vcsrc>;
+
+def gi_vop3mods0 :
+ GIComplexOperandMatcher<s32, "selectVOP3Mods0">,
+ GIComplexPatternEquiv<VOP3Mods0>;
+
+def gi_vop3mods :
+ GIComplexOperandMatcher<s32, "selectVOP3Mods">,
+ GIComplexPatternEquiv<VOP3Mods>;
+
+def gi_vop3omods :
+ GIComplexOperandMatcher<s32, "selectVOP3OMods">,
+ GIComplexPatternEquiv<VOP3OMods>;
+
+class GISelSop2Pat <
+ SDPatternOperator node,
+ Instruction inst,
+ ValueType dst_vt,
+ ValueType src0_vt = dst_vt, ValueType src1_vt = src0_vt> : GCNPat <
+
+ (dst_vt (node (src0_vt SReg_32:$src0), (src1_vt SReg_32:$src1))),
+ (inst src0_vt:$src0, src1_vt:$src1)
+>;
+
+class GISelVop2Pat <
+ SDPatternOperator node,
+ Instruction inst,
+ ValueType dst_vt,
+ ValueType src0_vt = dst_vt, ValueType src1_vt = src0_vt> : GCNPat <
+
+ (dst_vt (node (src0_vt (sd_vsrc0 src0_vt:$src0)), (src1_vt VGPR_32:$src1))),
+ (inst src0_vt:$src0, src1_vt:$src1)
+>;
+
+class GISelVop2CommutePat <
+ SDPatternOperator node,
+ Instruction inst,
+ ValueType dst_vt,
+ ValueType src0_vt = dst_vt, ValueType src1_vt = src0_vt> : GCNPat <
+
+ (dst_vt (node (src1_vt VGPR_32:$src1), (src0_vt (sd_vsrc0 src0_vt:$src0)))),
+ (inst src0_vt:$src0, src1_vt:$src1)
+>;
+
+class GISelVop3Pat2 <
+ SDPatternOperator node,
+ Instruction inst,
+ ValueType dst_vt,
+ ValueType src0_vt = dst_vt, ValueType src1_vt = src0_vt> : GCNPat <
+
+ (dst_vt (node (src0_vt (sd_vcsrc src0_vt:$src0)), (src1_vt (sd_vcsrc src1_vt:$src1)))),
+ (inst src0_vt:$src0, src1_vt:$src1)
+>;
+
+class GISelVop3Pat2CommutePat <
+ SDPatternOperator node,
+ Instruction inst,
+ ValueType dst_vt,
+ ValueType src0_vt = dst_vt, ValueType src1_vt = src0_vt> : GCNPat <
+
+ (dst_vt (node (src0_vt (sd_vcsrc src0_vt:$src0)), (src1_vt (sd_vcsrc src1_vt:$src1)))),
+ (inst src0_vt:$src1, src1_vt:$src0)
+>;
+
+class GISelVop3Pat2ModsPat <
+ SDPatternOperator node,
+ Instruction inst,
+ ValueType dst_vt,
+ ValueType src0_vt = dst_vt, ValueType src1_vt = src0_vt> : GCNPat <
+
+ (dst_vt (node (src0_vt (VOP3Mods0 src0_vt:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omods)),
+ (src1_vt (VOP3Mods src1_vt:$src1, i32:$src1_modifiers)))),
+ (inst i32:$src0_modifiers, src0_vt:$src0,
+ i32:$src1_modifiers, src1_vt:$src1, $clamp, $omods)
+>;
+
+multiclass GISelVop2IntrPat <
+ SDPatternOperator node, Instruction inst,
+ ValueType dst_vt, ValueType src_vt = dst_vt> {
+
+ def : GISelVop2Pat <node, inst, dst_vt, src_vt>;
+
+ // FIXME: Intrinsics aren't marked as commutable, so we need to add an explicit
+ // pattern to handle commuting. This is another reason why legalizing to a
+ // generic machine instruction may be better than matching the intrinsic
+ // directly.
+ def : GISelVop2CommutePat <node, inst, dst_vt, src_vt>;
+}
+
+def : GISelSop2Pat <or, S_OR_B32, i32>;
+def : GISelVop2Pat <or, V_OR_B32_e32, i32>;
+
+def : GISelSop2Pat <sra, S_ASHR_I32, i32>;
+let AddedComplexity = 100 in {
+let SubtargetPredicate = isSICI in {
+def : GISelVop2Pat <sra, V_ASHR_I32_e32, i32>;
+}
+def : GISelVop2CommutePat <sra, V_ASHRREV_I32_e32, i32>;
+}
+def : GISelVop3Pat2CommutePat <sra, V_ASHRREV_I32_e64, i32>;
+
+// FIXME: Select directly to _e32 so we don't need to deal with modifiers.
+// FIXME: We can't re-use SelectionDAG patterns here because they match
+// against a custom SDNode and we would need to create a generic machine
+// instruction that is equivalent to the custom SDNode. This would also require
+// us to custom legalize the intrinsic to the new generic machine instruction,
+// but I can't get custom legalizing of intrinsic to work and I'm not sure if
+// this is even supported yet.
+defm : GISelVop2IntrPat <
+ int_amdgcn_cvt_pkrtz, V_CVT_PKRTZ_F16_F32_e32, v2f16, f32>;
+
+defm : GISelVop2IntrPat <int_maxnum, V_MAX_F32_e32, f32>;
+def : GISelVop3Pat2ModsPat <int_maxnum, V_MAX_F64, f64>;
+defm : GISelVop2IntrPat <int_minnum, V_MIN_F32_e32, f32>;
+def : GISelVop3Pat2ModsPat <int_minnum, V_MIN_F64, f64>;
diff --git a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
index bf7deb500d1a..3a58c6c6a29f 100644
--- a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
+++ b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
@@ -16,41 +16,89 @@ namespace AMDGPU {
enum PartialMappingIdx {
None = - 1,
- PM_SGPR32 = 0,
- PM_SGPR64 = 1,
- PM_VGPR32 = 2,
- PM_VGPR64 = 3
+ PM_SGPR1 = 0,
+ PM_SGPR16 = 4,
+ PM_SGPR32 = 5,
+ PM_SGPR64 = 6,
+ PM_SGPR128 = 7,
+ PM_SGPR256 = 8,
+ PM_SGPR512 = 9,
+ PM_VGPR1 = 10,
+ PM_VGPR16 = 14,
+ PM_VGPR32 = 15,
+ PM_VGPR64 = 16,
+ PM_VGPR128 = 17,
+ PM_VGPR256 = 18,
+ PM_VGPR512 = 19,
+ PM_SGPR96 = 20,
+ PM_VGPR96 = 21
};
const RegisterBankInfo::PartialMapping PartMappings[] {
// StartIdx, Length, RegBank
+ {0, 1, SCCRegBank},
+ {0, 16, SGPRRegBank},
{0, 32, SGPRRegBank},
{0, 64, SGPRRegBank},
+ {0, 128, SGPRRegBank},
+ {0, 256, SGPRRegBank},
+ {0, 512, SGPRRegBank},
+ {0, 1, SGPRRegBank},
+ {0, 16, VGPRRegBank},
{0, 32, VGPRRegBank},
- {0, 64, VGPRRegBank}
+ {0, 64, VGPRRegBank},
+ {0, 128, VGPRRegBank},
+ {0, 256, VGPRRegBank},
+ {0, 512, VGPRRegBank},
+ {0, 96, SGPRRegBank},
+ {0, 96, VGPRRegBank},
};
const RegisterBankInfo::ValueMapping ValMappings[] {
- // SGPR 32-bit
{&PartMappings[0], 1},
- // SGPR 64-bit
+ {nullptr, 0},
+ {nullptr, 0},
+ {nullptr, 0},
{&PartMappings[1], 1},
- // VGPR 32-bit
{&PartMappings[2], 1},
- // VGPR 64-bit
- {&PartMappings[3], 1}
+ {&PartMappings[3], 1},
+ {&PartMappings[4], 1},
+ {&PartMappings[5], 1},
+ {&PartMappings[6], 1},
+ {&PartMappings[7], 1},
+ {nullptr, 0},
+ {nullptr, 0},
+ {nullptr, 0},
+ {&PartMappings[8], 1},
+ {&PartMappings[9], 1},
+ {&PartMappings[10], 1},
+ {&PartMappings[11], 1},
+ {&PartMappings[12], 1},
+ {&PartMappings[13], 1},
+ {&PartMappings[14], 1},
+ {&PartMappings[15], 1}
};
enum ValueMappingIdx {
SGPRStartIdx = 0,
- VGPRStartIdx = 2
+ VGPRStartIdx = 10
};
const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID,
unsigned Size) {
- assert(Size % 32 == 0);
- unsigned Idx = BankID == AMDGPU::SGPRRegBankID ? SGPRStartIdx : VGPRStartIdx;
- Idx += (Size / 32) - 1;
+ unsigned Idx;
+ switch (Size) {
+ case 1:
+ Idx = BankID == AMDGPU::SCCRegBankID ? PM_SGPR1 : PM_VGPR1;
+ break;
+ case 96:
+ Idx = BankID == AMDGPU::SGPRRegBankID ? PM_SGPR96 : PM_VGPR96;
+ break;
+ default:
+ Idx = BankID == AMDGPU::VGPRRegBankID ? VGPRStartIdx : SGPRStartIdx;
+ Idx += Log2_32_Ceil(Size);
+ break;
+ }
return &ValMappings[Idx];
}
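getValueMapping above now indexes ValMappings by adding ceil(log2(Size)) to the bank's start index, which is why PM_SGPR16 is 4, PM_SGPR32 is 5, and so on up to PM_SGPR512 = 9 (the VGPR mappings start at 10), with 1-bit and 96-bit values special-cased. A standalone sketch of that index arithmetic (not part of the patch; log2Ceil stands in for llvm::Log2_32_Ceil):

// Standalone sketch of the PartialMappingIdx computation used above.
#include <cstdint>
#include <cstdio>

static unsigned log2Ceil(uint32_t V) { // V >= 1
  unsigned L = 0;
  while ((1u << L) < V)
    ++L;
  return L;
}

int main() {
  const unsigned SGPRStartIdx = 0, VGPRStartIdx = 10;
  for (uint32_t Size : {16u, 32u, 64u, 128u, 256u, 512u}) {
    unsigned L = log2Ceil(Size);
    // Matches PM_SGPR16..PM_SGPR512 (4..9) and PM_VGPR16..PM_VGPR512 (14..19).
    std::printf("size %3u -> SGPR idx %2u, VGPR idx %2u\n",
                Size, SGPRStartIdx + L, VGPRStartIdx + L);
  }
  return 0;
}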
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.cpp b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 463e700f13b7..01ef346f74ee 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.cpp
+++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -8,13 +8,17 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief AMDGPU HSA Metadata Streamer.
+/// AMDGPU HSA Metadata Streamer.
///
//
//===----------------------------------------------------------------------===//
#include "AMDGPUHSAMetadataStreamer.h"
#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIProgramInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Module.h"
@@ -196,6 +200,57 @@ std::vector<uint32_t> MetadataStreamer::getWorkGroupDimensions(
return Dims;
}
+Kernel::CodeProps::Metadata MetadataStreamer::getHSACodeProps(
+ const MachineFunction &MF,
+ const SIProgramInfo &ProgramInfo) const {
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
+ const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+ HSAMD::Kernel::CodeProps::Metadata HSACodeProps;
+ const Function &F = MF.getFunction();
+
+ assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
+ F.getCallingConv() == CallingConv::SPIR_KERNEL);
+
+ unsigned MaxKernArgAlign;
+ HSACodeProps.mKernargSegmentSize = STM.getKernArgSegmentSize(F,
+ MaxKernArgAlign);
+ HSACodeProps.mGroupSegmentFixedSize = ProgramInfo.LDSSize;
+ HSACodeProps.mPrivateSegmentFixedSize = ProgramInfo.ScratchSize;
+ HSACodeProps.mKernargSegmentAlign = std::max(MaxKernArgAlign, 4u);
+ HSACodeProps.mWavefrontSize = STM.getWavefrontSize();
+ HSACodeProps.mNumSGPRs = ProgramInfo.NumSGPR;
+ HSACodeProps.mNumVGPRs = ProgramInfo.NumVGPR;
+ HSACodeProps.mMaxFlatWorkGroupSize = MFI.getMaxFlatWorkGroupSize();
+ HSACodeProps.mIsDynamicCallStack = ProgramInfo.DynamicCallStack;
+ HSACodeProps.mIsXNACKEnabled = STM.isXNACKEnabled();
+ HSACodeProps.mNumSpilledSGPRs = MFI.getNumSpilledSGPRs();
+ HSACodeProps.mNumSpilledVGPRs = MFI.getNumSpilledVGPRs();
+
+ return HSACodeProps;
+}
+
+Kernel::DebugProps::Metadata MetadataStreamer::getHSADebugProps(
+ const MachineFunction &MF,
+ const SIProgramInfo &ProgramInfo) const {
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
+ HSAMD::Kernel::DebugProps::Metadata HSADebugProps;
+
+ if (!STM.debuggerSupported())
+ return HSADebugProps;
+
+ HSADebugProps.mDebuggerABIVersion.push_back(1);
+ HSADebugProps.mDebuggerABIVersion.push_back(0);
+
+ if (STM.debuggerEmitPrologue()) {
+ HSADebugProps.mPrivateSegmentBufferSGPR =
+ ProgramInfo.DebuggerPrivateSegmentBufferSGPR;
+ HSADebugProps.mWavefrontPrivateSegmentOffsetSGPR =
+ ProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
+ }
+
+ return HSADebugProps;
+}
+
void MetadataStreamer::emitVersion() {
auto &Version = HSAMetadata.mVersion;
@@ -255,32 +310,7 @@ void MetadataStreamer::emitKernelArgs(const Function &Func) {
for (auto &Arg : Func.args())
emitKernelArg(Arg);
- // TODO: What about other languages?
- if (!Func.getParent()->getNamedMetadata("opencl.ocl.version"))
- return;
-
- auto &DL = Func.getParent()->getDataLayout();
- auto Int64Ty = Type::getInt64Ty(Func.getContext());
-
- emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetX);
- emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetY);
- emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetZ);
-
- auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(),
- AMDGPUASI.GLOBAL_ADDRESS);
- auto CallsPrintf = Func.getParent()->getNamedMetadata("llvm.printf.fmts");
- if (CallsPrintf)
- emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer);
- if (Func.hasFnAttribute("calls-enqueue-kernel")) {
- if (!CallsPrintf) {
- // Emit a dummy argument so that the remaining hidden arguments
- // have a fixed position relative to the first hidden argument.
- // This is to facilitate library code to access hidden arguments.
- emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone);
- }
- emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenDefaultQueue);
- emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenCompletionAction);
- }
+ emitHiddenKernelArgs(Func);
}
void MetadataStreamer::emitKernelArg(const Argument &Arg) {
@@ -320,13 +350,26 @@ void MetadataStreamer::emitKernelArg(const Argument &Arg) {
if (Node && ArgNo < Node->getNumOperands())
TypeQual = cast<MDString>(Node->getOperand(ArgNo))->getString();
- emitKernelArg(Func->getParent()->getDataLayout(), Arg.getType(),
- getValueKind(Arg.getType(), TypeQual, BaseTypeName), Name,
- TypeName, BaseTypeName, AccQual, TypeQual);
+ Type *Ty = Arg.getType();
+ const DataLayout &DL = Func->getParent()->getDataLayout();
+
+ unsigned PointeeAlign = 0;
+ if (auto PtrTy = dyn_cast<PointerType>(Ty)) {
+ if (PtrTy->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS) {
+ PointeeAlign = Arg.getParamAlignment();
+ if (PointeeAlign == 0)
+ PointeeAlign = DL.getABITypeAlignment(PtrTy->getElementType());
+ }
+ }
+
+ emitKernelArg(DL, Ty, getValueKind(Arg.getType(), TypeQual, BaseTypeName),
+ PointeeAlign, Name, TypeName, BaseTypeName, AccQual, TypeQual);
}
void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty,
- ValueKind ValueKind, StringRef Name,
+ ValueKind ValueKind,
+ unsigned PointeeAlign,
+ StringRef Name,
StringRef TypeName, StringRef BaseTypeName,
StringRef AccQual, StringRef TypeQual) {
HSAMetadata.mKernels.back().mArgs.push_back(Kernel::Arg::Metadata());
@@ -338,12 +381,7 @@ void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty,
Arg.mAlign = DL.getABITypeAlignment(Ty);
Arg.mValueKind = ValueKind;
Arg.mValueType = getValueType(Ty, BaseTypeName);
-
- if (auto PtrTy = dyn_cast<PointerType>(Ty)) {
- auto ElTy = PtrTy->getElementType();
- if (PtrTy->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS && ElTy->isSized())
- Arg.mPointeeAlign = DL.getABITypeAlignment(ElTy);
- }
+ Arg.mPointeeAlign = PointeeAlign;
if (auto PtrTy = dyn_cast<PointerType>(Ty))
Arg.mAddrSpaceQual = getAddressSpaceQualifer(PtrTy->getAddressSpace());
@@ -366,6 +404,48 @@ void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty,
}
}
+void MetadataStreamer::emitHiddenKernelArgs(const Function &Func) {
+ int HiddenArgNumBytes =
+ getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0);
+
+ if (!HiddenArgNumBytes)
+ return;
+
+ auto &DL = Func.getParent()->getDataLayout();
+ auto Int64Ty = Type::getInt64Ty(Func.getContext());
+
+ if (HiddenArgNumBytes >= 8)
+ emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetX);
+ if (HiddenArgNumBytes >= 16)
+ emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetY);
+ if (HiddenArgNumBytes >= 24)
+ emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetZ);
+
+ auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(),
+ AMDGPUASI.GLOBAL_ADDRESS);
+
+ // Emit "printf buffer" argument if printf is used, otherwise emit dummy
+ // "none" argument.
+ if (HiddenArgNumBytes >= 32) {
+ if (Func.getParent()->getNamedMetadata("llvm.printf.fmts"))
+ emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer);
+ else
+ emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone);
+ }
+
+ // Emit "default queue" and "completion action" arguments if enqueue kernel is
+ // used, otherwise emit dummy "none" arguments.
+ if (HiddenArgNumBytes >= 48) {
+ if (Func.hasFnAttribute("calls-enqueue-kernel")) {
+ emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenDefaultQueue);
+ emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenCompletionAction);
+ } else {
+ emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone);
+ emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone);
+ }
+ }
+}
+
void MetadataStreamer::begin(const Module &Mod) {
AMDGPUASI = getAMDGPUAS(Mod);
emitVersion();
@@ -383,13 +463,14 @@ void MetadataStreamer::end() {
verify(HSAMetadataString);
}
-void MetadataStreamer::emitKernel(
- const Function &Func,
- const Kernel::CodeProps::Metadata &CodeProps,
- const Kernel::DebugProps::Metadata &DebugProps) {
+void MetadataStreamer::emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo) {
+ auto &Func = MF.getFunction();
if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL)
return;
+ auto CodeProps = getHSACodeProps(MF, ProgramInfo);
+ auto DebugProps = getHSADebugProps(MF, ProgramInfo);
+
HSAMetadata.mKernels.push_back(Kernel::Metadata());
auto &Kernel = HSAMetadata.mKernels.back();
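emitHiddenKernelArgs above keys everything off the "amdgpu-implicitarg-num-bytes" attribute: each 8-byte threshold adds one more hidden argument, with "none" placeholders keeping later hidden arguments at fixed offsets. A standalone sketch of that threshold logic (not part of the patch; the hiddenArgs helper and its flags are illustrative):

// Standalone sketch of the hidden-argument thresholds, assuming the caller
// already read the attribute value and the printf / enqueue-kernel flags.
#include <cstdio>
#include <string>
#include <vector>

static std::vector<std::string> hiddenArgs(int HiddenArgNumBytes,
                                           bool UsesPrintf, bool UsesEnqueue) {
  std::vector<std::string> Args;
  if (HiddenArgNumBytes >= 8)  Args.push_back("HiddenGlobalOffsetX");
  if (HiddenArgNumBytes >= 16) Args.push_back("HiddenGlobalOffsetY");
  if (HiddenArgNumBytes >= 24) Args.push_back("HiddenGlobalOffsetZ");
  // Printf buffer, or a placeholder so later arguments keep their offsets.
  if (HiddenArgNumBytes >= 32)
    Args.push_back(UsesPrintf ? "HiddenPrintfBuffer" : "HiddenNone");
  // Default queue and completion action, or two placeholders.
  if (HiddenArgNumBytes >= 48) {
    if (UsesEnqueue) {
      Args.push_back("HiddenDefaultQueue");
      Args.push_back("HiddenCompletionAction");
    } else {
      Args.push_back("HiddenNone");
      Args.push_back("HiddenNone");
    }
  }
  return Args;
}

int main() {
  for (const std::string &A :
       hiddenArgs(/*HiddenArgNumBytes=*/48, /*UsesPrintf=*/true,
                  /*UsesEnqueue=*/false))
    std::printf("%s\n", A.c_str());
  return 0;
}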
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.h b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index bd6515521a74..3424c956d781 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.h
+++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief AMDGPU HSA Metadata Streamer.
+/// AMDGPU HSA Metadata Streamer.
///
//
//===----------------------------------------------------------------------===//
@@ -28,6 +28,7 @@ class DataLayout;
class Function;
class MDNode;
class Module;
+struct SIProgramInfo;
class Type;
namespace AMDGPU {
@@ -55,6 +56,13 @@ private:
std::vector<uint32_t> getWorkGroupDimensions(MDNode *Node) const;
+ Kernel::CodeProps::Metadata getHSACodeProps(
+ const MachineFunction &MF,
+ const SIProgramInfo &ProgramInfo) const;
+ Kernel::DebugProps::Metadata getHSADebugProps(
+ const MachineFunction &MF,
+ const SIProgramInfo &ProgramInfo) const;
+
void emitVersion();
void emitPrintf(const Module &Mod);
@@ -68,10 +76,13 @@ private:
void emitKernelArg(const Argument &Arg);
void emitKernelArg(const DataLayout &DL, Type *Ty, ValueKind ValueKind,
+ unsigned PointeeAlign = 0,
StringRef Name = "", StringRef TypeName = "",
StringRef BaseTypeName = "", StringRef AccQual = "",
StringRef TypeQual = "");
+ void emitHiddenKernelArgs(const Function &Func);
+
public:
MetadataStreamer() = default;
~MetadataStreamer() = default;
@@ -84,9 +95,7 @@ public:
void end();
- void emitKernel(const Function &Func,
- const Kernel::CodeProps::Metadata &CodeProps,
- const Kernel::DebugProps::Metadata &DebugProps);
+ void emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo);
};
} // end namespace HSAMD
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index f4776adb069c..f25f4d4693ea 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -8,7 +8,7 @@
//==-----------------------------------------------------------------------===//
//
/// \file
-/// \brief Defines an instruction selector for the AMDGPU target.
+/// Defines an instruction selector for the AMDGPU target.
//
//===----------------------------------------------------------------------===//
@@ -16,6 +16,7 @@
#include "AMDGPUArgumentUsageInfo.h"
#include "AMDGPUISelLowering.h" // For AMDGPUISD
#include "AMDGPUInstrInfo.h"
+#include "AMDGPUPerfHintAnalysis.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
@@ -24,15 +25,16 @@
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -43,6 +45,7 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
@@ -68,7 +71,7 @@ namespace {
class AMDGPUDAGToDAGISel : public SelectionDAGISel {
// Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
// make the right decision when generating code for different targets.
- const AMDGPUSubtarget *Subtarget;
+ const GCNSubtarget *Subtarget;
AMDGPUAS AMDGPUASI;
bool EnableLateStructurizeCFG;
@@ -83,6 +86,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AMDGPUArgumentUsageInfo>();
+ AU.addRequired<AMDGPUPerfHintAnalysis>();
+ AU.addRequired<DivergenceAnalysis>();
SelectionDAGISel::getAnalysisUsage(AU);
}
@@ -98,20 +103,12 @@ private:
std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
bool isNoNanSrc(SDValue N) const;
bool isInlineImmediate(const SDNode *N) const;
- bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs,
- const R600InstrInfo *TII);
- bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
- bool FoldDotOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
- bool isConstantLoad(const MemSDNode *N, int cbID) const;
bool isUniformBr(const SDNode *N) const;
SDNode *glueCopyToM0(SDNode *N) const;
const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
- bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
- bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
- SDValue& Offset);
virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
bool isDSOffsetLegal(const SDValue &Base, unsigned Offset,
@@ -162,6 +159,7 @@ private:
bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
bool &Imm) const;
+ SDValue Expand32BitAddress(SDValue Addr) const;
bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset,
bool &Imm) const;
bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
@@ -216,7 +214,7 @@ private:
void SelectS_BFE(SDNode *N);
bool isCBranchSCC(const SDNode *N) const;
void SelectBRCOND(SDNode *N);
- void SelectFMAD(SDNode *N);
+ void SelectFMAD_FMA(SDNode *N);
void SelectATOMIC_CMP_SWAP(SDNode *N);
protected:
@@ -225,9 +223,18 @@ protected:
};
class R600DAGToDAGISel : public AMDGPUDAGToDAGISel {
+ const R600Subtarget *Subtarget;
+ AMDGPUAS AMDGPUASI;
+
+ bool isConstantLoad(const MemSDNode *N, int cbID) const;
+ bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
+ bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
+ SDValue& Offset);
public:
explicit R600DAGToDAGISel(TargetMachine *TM, CodeGenOpt::Level OptLevel) :
- AMDGPUDAGToDAGISel(TM, OptLevel) {}
+ AMDGPUDAGToDAGISel(TM, OptLevel) {
+ AMDGPUASI = AMDGPU::getAMDGPUAS(*TM);
+ }
void Select(SDNode *N) override;
@@ -235,6 +242,11 @@ public:
SDValue &Offset) override;
bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
SDValue &Offset) override;
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+protected:
+ // Include the pieces autogenerated from the target description.
+#include "R600GenDAGISel.inc"
};
} // end anonymous namespace
@@ -242,17 +254,19 @@ public:
INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "isel",
"AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
+INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "isel",
"AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
-/// \brief This pass converts a legalized DAG into a AMDGPU-specific
+/// This pass converts a legalized DAG into an AMDGPU-specific
// DAG, ready for instruction scheduling.
FunctionPass *llvm::createAMDGPUISelDag(TargetMachine *TM,
CodeGenOpt::Level OptLevel) {
return new AMDGPUDAGToDAGISel(TM, OptLevel);
}
-/// \brief This pass converts a legalized DAG into a R600-specific
+/// This pass converts a legalized DAG into an R600-specific
// DAG, ready for instruction scheduling.
FunctionPass *llvm::createR600ISelDag(TargetMachine *TM,
CodeGenOpt::Level OptLevel) {
@@ -260,7 +274,7 @@ FunctionPass *llvm::createR600ISelDag(TargetMachine *TM,
}
bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
- Subtarget = &MF.getSubtarget<AMDGPUSubtarget>();
+ Subtarget = &MF.getSubtarget<GCNSubtarget>();
return SelectionDAGISel::runOnMachineFunction(MF);
}
@@ -276,8 +290,7 @@ bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
}
bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
- const SIInstrInfo *TII
- = static_cast<const SISubtarget *>(Subtarget)->getInstrInfo();
+ const SIInstrInfo *TII = Subtarget->getInstrInfo();
if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
return TII->isInlineConstant(C->getAPIntValue());
@@ -288,7 +301,7 @@ bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
return false;
}
-/// \brief Determine the register class for \p OpNo
+/// Determine the register class for \p OpNo
/// \returns The register class of the virtual register that will be used for
 /// the given operand number \p OpNo or NULL if the register class cannot be
/// determined.
@@ -303,7 +316,7 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
}
const SIRegisterInfo *TRI
- = static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo();
+ = static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo();
return TRI->getPhysRegClass(Reg);
}
@@ -394,7 +407,6 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
EVT VT = N->getValueType(0);
unsigned NumVectorElts = VT.getVectorNumElements();
EVT EltVT = VT.getVectorElementType();
- const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
SDLoc DL(N);
SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
@@ -420,10 +432,9 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
IsRegSeq = false;
break;
}
+ unsigned Sub = AMDGPURegisterInfo::getSubRegFromChannel(i);
RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
- RegSeqArgs[1 + (2 * i) + 1] =
- CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL,
- MVT::i32);
+ RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
}
if (NOps != NumVectorElts) {
// Fill in the missing undef elements if this was a scalar_to_vector.
@@ -431,9 +442,10 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
DL, EltVT);
for (unsigned i = NOps; i < NumVectorElts; ++i) {
+ unsigned Sub = AMDGPURegisterInfo::getSubRegFromChannel(i);
RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
RegSeqArgs[1 + (2 * i) + 1] =
- CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, MVT::i32);
+ CurDAG->getTargetConstant(Sub, DL, MVT::i32);
}
}
@@ -450,7 +462,10 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
}
if (isa<AtomicSDNode>(N) ||
- (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC))
+ (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
+ Opc == AMDGPUISD::ATOMIC_LOAD_FADD ||
+ Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
+ Opc == AMDGPUISD::ATOMIC_LOAD_FMAX))
N = glueCopyToM0(N);
switch (Opc) {
@@ -487,9 +502,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
case ISD::BUILD_VECTOR: {
EVT VT = N->getValueType(0);
unsigned NumVectorElts = VT.getVectorNumElements();
-
- if (VT == MVT::v2i16 || VT == MVT::v2f16) {
- if (Opc == ISD::BUILD_VECTOR) {
+ if (VT.getScalarSizeInBits() == 16) {
+ if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
uint32_t LHSVal, RHSVal;
if (getConstantValue(N->getOperand(0), LHSVal) &&
getConstantValue(N->getOperand(1), RHSVal)) {
@@ -559,7 +573,9 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
return;
}
case ISD::LOAD:
- case ISD::STORE: {
+ case ISD::STORE:
+ case ISD::ATOMIC_LOAD:
+ case ISD::ATOMIC_STORE: {
N = glueCopyToM0(N);
break;
}
@@ -619,7 +635,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectBRCOND(N);
return;
case ISD::FMAD:
- SelectFMAD(N);
+ case ISD::FMA:
+ SelectFMAD_FMA(N);
return;
case AMDGPUISD::ATOMIC_CMP_SWAP:
SelectATOMIC_CMP_SWAP(N);
@@ -629,15 +646,6 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectCode(N);
}
-bool AMDGPUDAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const {
- if (!N->readMem())
- return false;
- if (CbId == -1)
- return N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS;
-
- return N->getAddressSpace() == AMDGPUASI.CONSTANT_BUFFER_0 + CbId;
-}
-
bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
const Instruction *Term = BB->getTerminator();
@@ -653,26 +661,6 @@ StringRef AMDGPUDAGToDAGISel::getPassName() const {
// Complex Patterns
//===----------------------------------------------------------------------===//
-bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
- SDValue& IntPtr) {
- if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) {
- IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr),
- true);
- return true;
- }
- return false;
-}
-
-bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
- SDValue& BaseReg, SDValue &Offset) {
- if (!isa<ConstantSDNode>(Addr)) {
- BaseReg = Addr;
- Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true);
- return true;
- }
- return false;
-}
-
bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
SDValue &Offset) {
return false;
@@ -684,11 +672,11 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
SDLoc DL(Addr);
if ((C = dyn_cast<ConstantSDNode>(Addr))) {
- Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
+ Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
} else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
(C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
- Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
+ Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
} else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
(C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
@@ -759,12 +747,11 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
if (ProduceCarry) {
// Replace the carry-use
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(AddHi, 1));
+ ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
}
// Replace the remaining uses.
- CurDAG->ReplaceAllUsesWith(N, RegSequence);
- CurDAG->RemoveDeadNode(N);
+ ReplaceNode(N, RegSequence);
}
void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
@@ -1410,7 +1397,7 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
return false;
SDLoc SL(ByteOffsetNode);
- AMDGPUSubtarget::Generation Gen = Subtarget->getGeneration();
+ GCNSubtarget::Generation Gen = Subtarget->getGeneration();
int64_t ByteOffset = C->getSExtValue();
int64_t EncodedOffset = AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset);
@@ -1435,19 +1422,45 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
return true;
}
+SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
+ if (Addr.getValueType() != MVT::i32)
+ return Addr;
+
+ // Zero-extend a 32-bit address.
+ SDLoc SL(Addr);
+
+ const MachineFunction &MF = CurDAG->getMachineFunction();
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ unsigned AddrHiVal = Info->get32BitAddressHighBits();
+ SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
+
+ const SDValue Ops[] = {
+ CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
+ Addr,
+ CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
+ SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
+ 0),
+ CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
+ };
+
+ return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
+ Ops), 0);
+}
+
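Conceptually the REG_SEQUENCE above widens a 32-bit SMRD base into a 64-bit SGPR pair, with the high half supplied by SIMachineFunctionInfo::get32BitAddressHighBits(). A rough scalar model of the value it produces (an illustration, not the actual node construction):

#include <cstdint>

static uint64_t expand32BitAddress(uint32_t Addr32, uint32_t AddrHiVal) {
  // sub0 carries the original 32-bit address, sub1 the S_MOV_B32 of AddrHiVal.
  return (uint64_t(AddrHiVal) << 32) | uint64_t(Addr32);
}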
bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
SDValue &Offset, bool &Imm) const {
SDLoc SL(Addr);
+
if (CurDAG->isBaseWithConstantOffset(Addr)) {
SDValue N0 = Addr.getOperand(0);
SDValue N1 = Addr.getOperand(1);
if (SelectSMRDOffset(N1, Offset, Imm)) {
- SBase = N0;
+ SBase = Expand32BitAddress(N0);
return true;
}
}
- SBase = Addr;
+ SBase = Expand32BitAddress(Addr);
Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
Imm = true;
return true;
@@ -1651,7 +1664,7 @@ bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
return true;
if (VT == MVT::i64) {
- auto ST = static_cast<const SISubtarget *>(Subtarget);
+ auto ST = static_cast<const GCNSubtarget *>(Subtarget);
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64();
@@ -1674,15 +1687,39 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
unsigned CondReg = UseSCCBr ? AMDGPU::SCC : AMDGPU::VCC;
SDLoc SL(N);
+ if (!UseSCCBr) {
+ // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
+ // analyzed what generates the vcc value, so we do not know whether vcc
+ // bits for disabled lanes are 0. Thus we need to mask out bits for
+ // disabled lanes.
+ //
+ // For the case that we select S_CBRANCH_SCC1 and it gets
+ // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
+    // SIInstrInfo::moveToVALU which inserts the S_AND.
+ //
+ // We could add an analysis of what generates the vcc value here and omit
+    // the S_AND when it is unnecessary. But it would be better to add a separate
+ // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
+ // catches both cases.
+ Cond = SDValue(CurDAG->getMachineNode(AMDGPU::S_AND_B64, SL, MVT::i1,
+ CurDAG->getRegister(AMDGPU::EXEC, MVT::i1),
+ Cond),
+ 0);
+ }
+
SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
N->getOperand(2), // Basic Block
VCC.getValue(0));
}
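Put differently, before S_CBRANCH_VCCNZ the condition is masked so that stale bits in lanes disabled by EXEC cannot flip the branch. A minimal model of what the inserted S_AND_B64 computes (illustrative only; maskedBranchCondition is not part of the patch):

#include <cstdint>

static uint64_t maskedBranchCondition(uint64_t VCC, uint64_t EXEC) {
  // Only lanes enabled in EXEC contribute to the branch decision.
  return VCC & EXEC;
}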
-void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) {
+void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
MVT VT = N->getSimpleValueType(0);
- if (VT != MVT::f32 || !Subtarget->hasMadMixInsts()) {
+ bool IsFMA = N->getOpcode() == ISD::FMA;
+ if (VT != MVT::f32 || (!Subtarget->hasMadMixInsts() &&
+ !Subtarget->hasFmaMixInsts()) ||
+ ((IsFMA && Subtarget->hasMadMixInsts()) ||
+ (!IsFMA && Subtarget->hasFmaMixInsts()))) {
SelectCode(N);
return;
}
@@ -1692,13 +1729,13 @@ void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) {
SDValue Src2 = N->getOperand(2);
unsigned Src0Mods, Src1Mods, Src2Mods;
- // Avoid using v_mad_mix_f32 unless there is actually an operand using the
- // conversion from f16.
+ // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an operand
+ // using the conversion from f16.
bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods);
bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);
- assert(!Subtarget->hasFP32Denormals() &&
+ assert((IsFMA || !Subtarget->hasFP32Denormals()) &&
"fmad selected with denormals enabled");
// TODO: We can select this with f32 denormals enabled if all the sources are
// converted from f16 (in which case fmad isn't legal).
@@ -1714,7 +1751,9 @@ void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) {
Zero, Zero
};
- CurDAG->SelectNodeTo(N, AMDGPU::V_MAD_MIX_F32, MVT::f32, Ops);
+ CurDAG->SelectNodeTo(N,
+ IsFMA ? AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32,
+ MVT::f32, Ops);
} else {
SelectCode(N);
}
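The early-out condition above reads more easily as a positive predicate: selection proceeds only for f32, and only when the node kind matches the mix family the subtarget provides. A hedged restatement (useMixedPrecision is a hypothetical helper equivalent to the negation of the bail-out, with the f32 check kept separate):

static bool useMixedPrecision(bool IsFMA, bool HasMadMix, bool HasFmaMix) {
  // f32 FMA needs v_fma_mix_f32, f32 FMAD needs v_mad_mix_f32.
  return IsFMA ? (HasFmaMix && !HasMadMix) : (HasMadMix && !HasFmaMix);
}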
@@ -2100,6 +2139,41 @@ void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
} while (IsModified);
}
+bool R600DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+ Subtarget = &MF.getSubtarget<R600Subtarget>();
+ return SelectionDAGISel::runOnMachineFunction(MF);
+}
+
+bool R600DAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const {
+ if (!N->readMem())
+ return false;
+ if (CbId == -1)
+ return N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+ N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT;
+
+ return N->getAddressSpace() == AMDGPUASI.CONSTANT_BUFFER_0 + CbId;
+}
+
+bool R600DAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
+ SDValue& IntPtr) {
+ if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) {
+ IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr),
+ true);
+ return true;
+ }
+ return false;
+}
+
+bool R600DAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
+ SDValue& BaseReg, SDValue &Offset) {
+ if (!isa<ConstantSDNode>(Addr)) {
+ BaseReg = Addr;
+ Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true);
+ return true;
+ }
+ return false;
+}
+
void R600DAGToDAGISel::Select(SDNode *N) {
unsigned int Opc = N->getOpcode();
if (N->isMachineOpcode()) {
@@ -2120,12 +2194,12 @@ void R600DAGToDAGISel::Select(SDNode *N) {
// pass. We want to avoid 128 bits copies as much as possible because they
// can't be bundled by our scheduler.
switch(NumVectorElts) {
- case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
+ case 2: RegClassID = R600::R600_Reg64RegClassID; break;
case 4:
if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR)
- RegClassID = AMDGPU::R600_Reg128VerticalRegClassID;
+ RegClassID = R600::R600_Reg128VerticalRegClassID;
else
- RegClassID = AMDGPU::R600_Reg128RegClassID;
+ RegClassID = R600::R600_Reg128RegClassID;
break;
default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
}
@@ -2143,11 +2217,11 @@ bool R600DAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
SDLoc DL(Addr);
if ((C = dyn_cast<ConstantSDNode>(Addr))) {
- Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
+ Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
} else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
(C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
- Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
+ Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
} else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
(C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
@@ -2178,7 +2252,7 @@ bool R600DAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
&& isInt<16>(IMMOffset->getZExtValue())) {
Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
SDLoc(CurDAG->getEntryNode()),
- AMDGPU::ZERO, MVT::i32);
+ R600::ZERO, MVT::i32);
Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
MVT::i32);
return true;
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 49929441ef21..b201126c593b 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief This is the parent TargetLowering class for hardware code gen
+/// This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//
@@ -25,9 +25,12 @@
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "R600MachineFunctionInfo.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -38,18 +41,6 @@
#include "llvm/Support/KnownBits.h"
using namespace llvm;
-static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- MachineFunction &MF = State.getMachineFunction();
- AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
-
- uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(),
- ArgFlags.getOrigAlign());
- State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return true;
-}
-
static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State,
@@ -71,7 +62,9 @@ static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
case MVT::i64:
case MVT::f64:
case MVT::v2i32:
- case MVT::v2f32: {
+ case MVT::v2f32:
+ case MVT::v4i16:
+ case MVT::v4f16: {
// Up to SGPR0-SGPR39
return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
&AMDGPU::SGPR_64RegClass, 20);
@@ -92,7 +85,9 @@ static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
case MVT::i64:
case MVT::f64:
case MVT::v2i32:
- case MVT::v2f32: {
+ case MVT::v2f32:
+ case MVT::v4i16:
+ case MVT::v4f16: {
return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
&AMDGPU::VReg_64RegClass, 31);
}
@@ -324,10 +319,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FLOG, MVT::f32, Custom);
setOperationAction(ISD::FLOG10, MVT::f32, Custom);
- if (Subtarget->has16BitInsts()) {
- setOperationAction(ISD::FLOG, MVT::f16, Custom);
- setOperationAction(ISD::FLOG10, MVT::f16, Custom);
- }
setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
@@ -335,10 +326,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FREM, MVT::f32, Custom);
setOperationAction(ISD::FREM, MVT::f64, Custom);
- // v_mad_f32 does not support denormals according to some sources.
- if (!Subtarget->hasFP32Denormals())
- setOperationAction(ISD::FMAD, MVT::f32, Legal);
-
// Expand to fneg + fadd.
setOperationAction(ISD::FSUB, MVT::f64, Expand);
@@ -353,19 +340,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
- if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
- setOperationAction(ISD::FCEIL, MVT::f64, Custom);
- setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
- setOperationAction(ISD::FRINT, MVT::f64, Custom);
- setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
- }
-
- if (!Subtarget->hasBFI()) {
- // fcopysign can be done in a single instruction with BFI.
- setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
- setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
- }
-
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
@@ -389,13 +363,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BSWAP, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
setOperationAction(ISD::CTLZ, VT, Expand);
- }
-
- if (!Subtarget->hasBCNT(32))
- setOperationAction(ISD::CTPOP, MVT::i32, Expand);
- if (!Subtarget->hasBCNT(64))
- setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+ // AMDGPU uses ADDC/SUBC/ADDE/SUBE
+ setOperationAction(ISD::ADDC, VT, Legal);
+ setOperationAction(ISD::SUBC, VT, Legal);
+ setOperationAction(ISD::ADDE, VT, Legal);
+ setOperationAction(ISD::SUBE, VT, Legal);
+ }
// The hardware supports 32-bit ROTR, but not ROTL.
setOperationAction(ISD::ROTL, MVT::i32, Expand);
@@ -416,28 +390,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SMAX, MVT::i32, Legal);
setOperationAction(ISD::UMAX, MVT::i32, Legal);
- if (Subtarget->hasFFBH())
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
-
- if (Subtarget->hasFFBL())
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
-
setOperationAction(ISD::CTTZ, MVT::i64, Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
setOperationAction(ISD::CTLZ, MVT::i64, Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
- // We only really have 32-bit BFE instructions (and 16-bit on VI).
- //
- // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
- // effort to match them now. We want this to be false for i64 cases when the
- // extraction isn't restricted to the upper or lower half. Ideally we would
- // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
- // span the midpoint are probably relatively rare, so don't worry about them
- // for now.
- if (Subtarget->hasBFE())
- setHasExtractBitsInsn(true);
-
static const MVT::SimpleValueType VectorIntTypes[] = {
MVT::v2i32, MVT::v4i32
};
@@ -468,10 +425,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
setOperationAction(ISD::SDIVREM, VT, Custom);
setOperationAction(ISD::UDIVREM, VT, Expand);
- setOperationAction(ISD::ADDC, VT, Expand);
- setOperationAction(ISD::SUBC, VT, Expand);
- setOperationAction(ISD::ADDE, VT, Expand);
- setOperationAction(ISD::SUBE, VT, Expand);
setOperationAction(ISD::SELECT, VT, Expand);
setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
@@ -546,11 +499,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
// vector compares until that is fixed.
setHasMultipleConditionRegisters(true);
- // SI at least has hardware support for floating point exceptions, but no way
- // of using or handling them is implemented. They are also optional in OpenCL
- // (Section 7.3)
- setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
-
PredictableSelectIsExpensive = false;
// We want to find all load dependencies for long chains of stores to enable
@@ -573,6 +521,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SRL);
+ setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::MULHU);
setTargetDAGCombine(ISD::MULHS);
@@ -607,6 +556,7 @@ static bool fnegFoldsIntoOp(unsigned Opc) {
case ISD::FNEARBYINT:
case AMDGPUISD::RCP:
case AMDGPUISD::RCP_LEGACY:
+ case AMDGPUISD::RCP_IFLAG:
case AMDGPUISD::SIN_HW:
case AMDGPUISD::FMUL_LEGACY:
case AMDGPUISD::FMIN_LEGACY:
@@ -748,6 +698,37 @@ bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
return true;
}
+bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const {
+ switch (N->getOpcode()) {
+ default:
+ return false;
+ case ISD::EntryToken:
+ case ISD::TokenFactor:
+ return true;
+ case ISD::INTRINSIC_WO_CHAIN:
+ {
+ unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ switch (IntrID) {
+ default:
+ return false;
+ case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_readlane:
+ return true;
+ }
+ }
+ break;
+ case ISD::LOAD:
+ {
+ const LoadSDNode * L = dyn_cast<LoadSDNode>(N);
+ if (L->getMemOperand()->getAddrSpace()
+ == AMDGPUASI.CONSTANT_ADDRESS_32BIT)
+ return true;
+ return false;
+ }
+ break;
+ }
+}
+
//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//
@@ -832,17 +813,6 @@ bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
return isZExtFree(Val.getValueType(), VT2);
}
-// v_mad_mix* support a conversion from f16 to f32.
-//
-// There is only one special case when denormals are enabled we don't currently,
-// where this is OK to use.
-bool AMDGPUTargetLowering::isFPExtFoldable(unsigned Opcode,
- EVT DestVT, EVT SrcVT) const {
- return Opcode == ISD::FMAD && Subtarget->hasMadMixInsts() &&
- DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
- SrcVT.getScalarType() == MVT::f16;
-}
-
bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
// There aren't really 64-bit registers, but pairs of 32-bit ones and only a
// limited number of native 64-bit operations. Shrinking an operation to fit
@@ -862,7 +832,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
switch (CC) {
case CallingConv::AMDGPU_KERNEL:
case CallingConv::SPIR_KERNEL:
- return CC_AMDGPU_Kernel;
+ llvm_unreachable("kernels should not be handled here");
case CallingConv::AMDGPU_VS:
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
@@ -885,7 +855,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
switch (CC) {
case CallingConv::AMDGPU_KERNEL:
case CallingConv::SPIR_KERNEL:
- return CC_AMDGPU_Kernel;
+ llvm_unreachable("kernels should not be handled here");
case CallingConv::AMDGPU_VS:
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
@@ -929,74 +899,118 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
/// for each individual part is i8. We pass the memory type as LocVT to the
/// calling convention analysis function and the register type (Ins[x].VT) as
/// the ValVT.
-void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
- const SmallVectorImpl<ISD::InputArg> &Ins) const {
- for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
- const ISD::InputArg &In = Ins[i];
- EVT MemVT;
-
- unsigned NumRegs = getNumRegisters(State.getContext(), In.ArgVT);
-
- if (!Subtarget->isAmdHsaOS() &&
- (In.ArgVT == MVT::i16 || In.ArgVT == MVT::i8 || In.ArgVT == MVT::f16)) {
- // The ABI says the caller will extend these values to 32-bits.
- MemVT = In.ArgVT.isInteger() ? MVT::i32 : MVT::f32;
- } else if (NumRegs == 1) {
- // This argument is not split, so the IR type is the memory type.
- assert(!In.Flags.isSplit());
- if (In.ArgVT.isExtended()) {
- // We have an extended type, like i24, so we should just use the register type
- MemVT = In.VT;
- } else {
- MemVT = In.ArgVT;
- }
- } else if (In.ArgVT.isVector() && In.VT.isVector() &&
- In.ArgVT.getScalarType() == In.VT.getScalarType()) {
- assert(In.ArgVT.getVectorNumElements() > In.VT.getVectorNumElements());
- // We have a vector value which has been split into a vector with
- // the same scalar type, but fewer elements. This should handle
- // all the floating-point vector types.
- MemVT = In.VT;
- } else if (In.ArgVT.isVector() &&
- In.ArgVT.getVectorNumElements() == NumRegs) {
- // This arg has been split so that each element is stored in a separate
- // register.
- MemVT = In.ArgVT.getScalarType();
- } else if (In.ArgVT.isExtended()) {
- // We have an extended type, like i65.
- MemVT = In.VT;
- } else {
- unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs;
- assert(In.ArgVT.getStoreSizeInBits() % NumRegs == 0);
- if (In.VT.isInteger()) {
- MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
- } else if (In.VT.isVector()) {
- assert(!In.VT.getScalarType().isFloatingPoint());
- unsigned NumElements = In.VT.getVectorNumElements();
- assert(MemoryBits % NumElements == 0);
- // This vector type has been split into another vector type with
- // a different elements size.
- EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
- MemoryBits / NumElements);
- MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
+void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
+ CCState &State,
+ const SmallVectorImpl<ISD::InputArg> &Ins) const {
+ const MachineFunction &MF = State.getMachineFunction();
+ const Function &Fn = MF.getFunction();
+ LLVMContext &Ctx = Fn.getParent()->getContext();
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
+ const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
+
+ unsigned MaxAlign = 1;
+ uint64_t ExplicitArgOffset = 0;
+ const DataLayout &DL = Fn.getParent()->getDataLayout();
+
+ unsigned InIndex = 0;
+
+ for (const Argument &Arg : Fn.args()) {
+ Type *BaseArgTy = Arg.getType();
+ unsigned Align = DL.getABITypeAlignment(BaseArgTy);
+ MaxAlign = std::max(Align, MaxAlign);
+ unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy);
+
+ uint64_t ArgOffset = alignTo(ExplicitArgOffset, Align) + ExplicitOffset;
+ ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
+
+ // We're basically throwing away everything passed into us and starting over
+ // to get accurate in-memory offsets. The "PartOffset" is completely useless
+ // to us as computed in Ins.
+ //
+ // We also need to figure out what type legalization is trying to do to get
+ // the correct memory offsets.
+
+ SmallVector<EVT, 16> ValueVTs;
+ SmallVector<uint64_t, 16> Offsets;
+ ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
+
+ for (unsigned Value = 0, NumValues = ValueVTs.size();
+ Value != NumValues; ++Value) {
+ uint64_t BasePartOffset = Offsets[Value];
+
+ EVT ArgVT = ValueVTs[Value];
+ EVT MemVT = ArgVT;
+ MVT RegisterVT =
+ getRegisterTypeForCallingConv(Ctx, ArgVT);
+ unsigned NumRegs =
+ getNumRegistersForCallingConv(Ctx, ArgVT);
+
+ if (!Subtarget->isAmdHsaOS() &&
+ (ArgVT == MVT::i16 || ArgVT == MVT::i8 || ArgVT == MVT::f16)) {
+ // The ABI says the caller will extend these values to 32-bits.
+ MemVT = ArgVT.isInteger() ? MVT::i32 : MVT::f32;
+ } else if (NumRegs == 1) {
+ // This argument is not split, so the IR type is the memory type.
+ if (ArgVT.isExtended()) {
+ // We have an extended type, like i24, so we should just use the
+ // register type.
+ MemVT = RegisterVT;
+ } else {
+ MemVT = ArgVT;
+ }
+ } else if (ArgVT.isVector() && RegisterVT.isVector() &&
+ ArgVT.getScalarType() == RegisterVT.getScalarType()) {
+ assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
+ // We have a vector value which has been split into a vector with
+ // the same scalar type, but fewer elements. This should handle
+ // all the floating-point vector types.
+ MemVT = RegisterVT;
+ } else if (ArgVT.isVector() &&
+ ArgVT.getVectorNumElements() == NumRegs) {
+ // This arg has been split so that each element is stored in a separate
+ // register.
+ MemVT = ArgVT.getScalarType();
+ } else if (ArgVT.isExtended()) {
+ // We have an extended type, like i65.
+ MemVT = RegisterVT;
} else {
- llvm_unreachable("cannot deduce memory type.");
+ unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
+ assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
+ if (RegisterVT.isInteger()) {
+ MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
+ } else if (RegisterVT.isVector()) {
+ assert(!RegisterVT.getScalarType().isFloatingPoint());
+ unsigned NumElements = RegisterVT.getVectorNumElements();
+ assert(MemoryBits % NumElements == 0);
+ // This vector type has been split into another vector type with
+          // a different element size.
+ EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
+ MemoryBits / NumElements);
+ MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
+ } else {
+ llvm_unreachable("cannot deduce memory type.");
+ }
}
- }
- // Convert one element vectors to scalar.
- if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
- MemVT = MemVT.getScalarType();
+ // Convert one element vectors to scalar.
+ if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
+ MemVT = MemVT.getScalarType();
- if (MemVT.isExtended()) {
- // This should really only happen if we have vec3 arguments
- assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
- MemVT = MemVT.getPow2VectorType(State.getContext());
- }
+ if (MemVT.isExtended()) {
+ // This should really only happen if we have vec3 arguments
+ assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
+ MemVT = MemVT.getPow2VectorType(State.getContext());
+ }
- assert(MemVT.isSimple());
- allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags,
- State);
+ unsigned PartOffset = 0;
+ for (unsigned i = 0; i != NumRegs; ++i) {
+ State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
+ BasePartOffset + PartOffset,
+ MemVT.getSimpleVT(),
+ CCValAssign::Full));
+ PartOffset += MemVT.getStoreSize();
+ }
+ }
}
}
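The two alignTo lines above are the whole layout rule: each explicit argument lands at the next offset aligned for its type, and ExplicitArgOffset then advances past its allocation size. A small self-contained sketch with assumed argument types (placeArg and alignUp are illustrative helpers, not part of the patch):

#include <cstdint>

static uint64_t alignUp(uint64_t X, uint64_t A) { return (X + A - 1) / A * A; }

// For hypothetical kernel arguments (i32 a, <2 x float> b, i8 c) and
// ExplicitOffset == 0 this reproduces the offsets computed in the loop above:
//   a: Align 4, AllocSize 4 -> ArgOffset 0,  ExplicitArgOffset becomes 4
//   b: Align 8, AllocSize 8 -> ArgOffset 8,  ExplicitArgOffset becomes 16
//   c: Align 1, AllocSize 1 -> ArgOffset 16, ExplicitArgOffset becomes 17
static uint64_t placeArg(uint64_t &ExplicitArgOffset, uint64_t Align,
                         uint64_t AllocSize, uint64_t ExplicitOffset) {
  uint64_t ArgOffset = alignUp(ExplicitArgOffset, Align) + ExplicitOffset;
  ExplicitArgOffset = alignUp(ExplicitArgOffset, Align) + AllocSize;
  return ArgOffset;
}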
@@ -1178,7 +1192,15 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = G->getGlobal();
- if (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS) {
+ if (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS ||
+ G->getAddressSpace() == AMDGPUASI.REGION_ADDRESS) {
+ if (!MFI->isEntryFunction()) {
+ const Function &Fn = DAG.getMachineFunction().getFunction();
+ DiagnosticInfoUnsupported BadLDSDecl(
+ Fn, "local memory global used by non-kernel function", SDLoc(Op).getDebugLoc());
+ DAG.getContext()->diagnose(BadLDSDecl);
+ }
+
// XXX: What does the value of G->getOffset() mean?
assert(G->getOffset() == 0 &&
"Do not know what to do with an non-zero offset");
@@ -1201,6 +1223,16 @@ SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
SelectionDAG &DAG) const {
SmallVector<SDValue, 8> Args;
+ EVT VT = Op.getValueType();
+ if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SDLoc SL(Op);
+ SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
+ SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));
+
+ SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
+ return DAG.getNode(ISD::BITCAST, SL, VT, BV);
+ }
+
for (const SDUse &U : Op->ops())
DAG.ExtractVectorElements(U.get(), Args);
@@ -1219,7 +1251,7 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
}
-/// \brief Generate Min/Max node
+/// Generate Min/Max node
SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
SDValue LHS, SDValue RHS,
SDValue True, SDValue False,
@@ -1985,7 +2017,7 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
- // Extend back to to 64-bits.
+ // Extend back to 64-bits.
SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
@@ -2806,28 +2838,6 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
SN->getBasePtr(), SN->getMemOperand());
}
-SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N,
- DAGCombinerInfo &DCI) const {
- ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
- if (!CSrc)
- return SDValue();
-
- const APFloat &F = CSrc->getValueAPF();
- APFloat Zero = APFloat::getZero(F.getSemantics());
- APFloat::cmpResult Cmp0 = F.compare(Zero);
- if (Cmp0 == APFloat::cmpLessThan ||
- (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
- return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
- }
-
- APFloat One(F.getSemantics(), "1.0");
- APFloat::cmpResult Cmp1 = F.compare(One);
- if (Cmp1 == APFloat::cmpGreaterThan)
- return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
-
- return SDValue(CSrc, 0);
-}
-
// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
// issues.
@@ -2903,7 +2913,7 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
SDValue X = LHS->getOperand(0);
if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
- isTypeLegal(MVT::v2i16)) {
+ isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
// Prefer build_vector as the canonical form if packed types are legal.
     // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
@@ -3017,6 +3027,92 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
}
+SDValue AMDGPUTargetLowering::performTruncateCombine(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ SDLoc SL(N);
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(0);
+
+ // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
+ if (Src.getOpcode() == ISD::BITCAST) {
+ SDValue Vec = Src.getOperand(0);
+ if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
+ SDValue Elt0 = Vec.getOperand(0);
+ EVT EltVT = Elt0.getValueType();
+ if (VT.getSizeInBits() <= EltVT.getSizeInBits()) {
+ if (EltVT.isFloatingPoint()) {
+ Elt0 = DAG.getNode(ISD::BITCAST, SL,
+ EltVT.changeTypeToInteger(), Elt0);
+ }
+
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
+ }
+ }
+ }
+
+ // Equivalent of above for accessing the high element of a vector as an
+ // integer operation.
+ // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
+ if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
+ if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
+ if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
+ SDValue BV = stripBitcast(Src.getOperand(0));
+ if (BV.getOpcode() == ISD::BUILD_VECTOR &&
+ BV.getValueType().getVectorNumElements() == 2) {
+ SDValue SrcElt = BV.getOperand(1);
+ EVT SrcEltVT = SrcElt.getValueType();
+ if (SrcEltVT.isFloatingPoint()) {
+ SrcElt = DAG.getNode(ISD::BITCAST, SL,
+ SrcEltVT.changeTypeToInteger(), SrcElt);
+ }
+
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
+ }
+ }
+ }
+ }
+
+ // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
+ //
+ // i16 (trunc (srl i64:x, K)), K <= 16 ->
+ // i16 (trunc (srl (i32 (trunc x), K)))
+ if (VT.getScalarSizeInBits() < 32) {
+ EVT SrcVT = Src.getValueType();
+ if (SrcVT.getScalarSizeInBits() > 32 &&
+ (Src.getOpcode() == ISD::SRL ||
+ Src.getOpcode() == ISD::SRA ||
+ Src.getOpcode() == ISD::SHL)) {
+ SDValue Amt = Src.getOperand(1);
+ KnownBits Known;
+ DAG.computeKnownBits(Amt, Known);
+ unsigned Size = VT.getScalarSizeInBits();
+ if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
+ (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size))) {
+ EVT MidVT = VT.isVector() ?
+ EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ VT.getVectorNumElements()) : MVT::i32;
+
+ EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
+ Src.getOperand(0));
+ DCI.AddToWorklist(Trunc.getNode());
+
+ if (Amt.getValueType() != NewShiftVT) {
+ Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
+ DCI.AddToWorklist(Amt.getNode());
+ }
+
+ SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
+ Trunc, Amt);
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
+ }
+ }
+ }
+
+ return SDValue();
+}
+
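The 64-to-32-bit shrinking in the last case above relies on a simple identity: when producing an i16 and the shift amount is at most 16, the result bits come only from the low 32 bits of the source. A standalone check of that identity (illustrative, not part of the patch):

#include <cstdint>

// Holds for any X when K <= 16: the selected result bits are bits K..K+15,
// all of which survive truncating the source to 32 bits first.
static bool shrinkShrHolds(uint64_t X, unsigned K) {
  return uint16_t(X >> K) == uint16_t(uint32_t(X) >> K);
}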
// We need to specifically handle i64 mul here to avoid unnecessary conversion
// instructions. If we only match on the legalized i64 mul expansion,
// SimplifyDemandedBits will be unable to remove them because there will be
@@ -3058,6 +3154,17 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
+
+ // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
+ // in the source into any_extends if the result of the mul is truncated. Since
+ // we can assume the high bits are whatever we want, use the underlying value
+ // to avoid the unknown high bits from interfering.
+ if (N0.getOpcode() == ISD::ANY_EXTEND)
+ N0 = N0.getOperand(0);
+
+ if (N1.getOpcode() == ISD::ANY_EXTEND)
+ N1 = N1.getOperand(0);
+
SDValue Mul;
if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
@@ -3495,6 +3602,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
case ISD::FSIN:
case AMDGPUISD::RCP:
case AMDGPUISD::RCP_LEGACY:
+ case AMDGPUISD::RCP_IFLAG:
case AMDGPUISD::SIN_HW: {
SDValue CvtSrc = N0.getOperand(0);
if (CvtSrc.getOpcode() == ISD::FNEG) {
@@ -3571,6 +3679,18 @@ SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
}
}
+SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
+ if (!CFP)
+ return SDValue();
+
+ // XXX - Should this flush denormals?
+ const APFloat &Val = CFP->getValueAPF();
+ APFloat One(Val.getSemantics(), "1.0");
+ return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
+}
+
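The fold simply evaluates the reciprocal of the constant operand at compile time; whether the result should additionally be flushed for denormal inputs is left open, per the XXX note. For example, under this fold:

//   rcp(4.0)  -> 0.25
//   rcp(-0.5) -> -2.0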
SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -3617,12 +3737,13 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
// TODO: Generalize and move to DAGCombiner
SDValue Src = N->getOperand(0);
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
- assert(Src.getValueType() == MVT::i64);
- SDLoc SL(N);
- uint64_t CVal = C->getZExtValue();
- return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
- DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
- DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+ if (Src.getValueType() == MVT::i64) {
+ SDLoc SL(N);
+ uint64_t CVal = C->getZExtValue();
+ return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
+ DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
+ DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+ }
}
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
@@ -3656,6 +3777,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
return performSraCombine(N, DCI);
}
+ case ISD::TRUNCATE:
+ return performTruncateCombine(N, DCI);
case ISD::MUL:
return performMulCombine(N, DCI);
case ISD::MULHS:
@@ -3768,18 +3891,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
return performLoadCombine(N, DCI);
case ISD::STORE:
return performStoreCombine(N, DCI);
- case AMDGPUISD::CLAMP:
- return performClampCombine(N, DCI);
- case AMDGPUISD::RCP: {
- if (const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) {
- // XXX - Should this flush denormals?
- const APFloat &Val = CFP->getValueAPF();
- APFloat One(Val.getSemantics(), "1.0");
- return DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
- }
-
- break;
- }
+ case AMDGPUISD::RCP:
+ case AMDGPUISD::RCP_IFLAG:
+ return performRcpCombine(N, DCI);
case ISD::AssertZext:
case ISD::AssertSext:
return performAssertSZExtCombine(N, DCI);
@@ -3856,9 +3970,14 @@ SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
}
uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
- const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
- unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr();
- uint64_t ArgOffset = alignTo(MFI->getABIArgOffset(), Alignment);
+ const MachineFunction &MF, const ImplicitParameter Param) const {
+ const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
+ const AMDGPUSubtarget &ST =
+ AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction());
+ unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction());
+ unsigned Alignment = ST.getAlignmentForImplicitArgPtr();
+ uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
+ ExplicitArgOffset;
switch (Param) {
case GRID_DIM:
return ArgOffset;
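A worked example with assumed values (none of these numbers come from the patch): if getExplicitKernArgSize() returns 20, getAlignmentForImplicitArgPtr() returns 8, and getExplicitKernelArgOffset() returns 0, then ArgOffset = alignTo(20, 8) + 0 = 24, so GRID_DIM is read from byte offset 24 of the kernarg segment and the remaining implicit parameters follow it.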
@@ -3907,6 +4026,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(FMED3)
NODE_NAME_CASE(SMED3)
NODE_NAME_CASE(UMED3)
+ NODE_NAME_CASE(FDOT2)
NODE_NAME_CASE(URECIP)
NODE_NAME_CASE(DIV_SCALE)
NODE_NAME_CASE(DIV_FMAS)
@@ -3917,6 +4037,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(RSQ)
NODE_NAME_CASE(RCP_LEGACY)
NODE_NAME_CASE(RSQ_LEGACY)
+ NODE_NAME_CASE(RCP_IFLAG)
NODE_NAME_CASE(FMUL_LEGACY)
NODE_NAME_CASE(RSQ_CLAMP)
NODE_NAME_CASE(LDEXP)
@@ -3941,6 +4062,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(MAD_I24)
NODE_NAME_CASE(MAD_I64_I32)
NODE_NAME_CASE(MAD_U64_U32)
+ NODE_NAME_CASE(PERM)
NODE_NAME_CASE(TEXTURE_FETCH)
NODE_NAME_CASE(EXPORT)
NODE_NAME_CASE(EXPORT_DONE)
@@ -3957,6 +4079,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CVT_F32_UBYTE2)
NODE_NAME_CASE(CVT_F32_UBYTE3)
NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
+ NODE_NAME_CASE(CVT_PKNORM_I16_F32)
+ NODE_NAME_CASE(CVT_PKNORM_U16_F32)
+ NODE_NAME_CASE(CVT_PK_I16_I32)
+ NODE_NAME_CASE(CVT_PK_U16_U32)
NODE_NAME_CASE(FP_TO_FP16)
NODE_NAME_CASE(FP16_ZEXT)
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
@@ -3976,14 +4102,21 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(LOAD_CONSTANT)
NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3)
+ NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
+ NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
NODE_NAME_CASE(ATOMIC_CMP_SWAP)
NODE_NAME_CASE(ATOMIC_INC)
NODE_NAME_CASE(ATOMIC_DEC)
+ NODE_NAME_CASE(ATOMIC_LOAD_FADD)
+ NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
+ NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
NODE_NAME_CASE(BUFFER_LOAD)
NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
+ NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
NODE_NAME_CASE(BUFFER_STORE)
NODE_NAME_CASE(BUFFER_STORE_FORMAT)
+ NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
@@ -3995,6 +4128,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_ATOMIC_OR)
NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
+
case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
}
return nullptr;
@@ -4108,14 +4242,45 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
Known.Zero.setHighBits(32 - MaxValBits);
break;
}
+ case AMDGPUISD::PERM: {
+ ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+ if (!CMask)
+ return;
+
+ KnownBits LHSKnown, RHSKnown;
+ DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1);
+ DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1);
+ unsigned Sel = CMask->getZExtValue();
+
+ for (unsigned I = 0; I < 32; I += 8) {
+ unsigned SelBits = Sel & 0xff;
+ if (SelBits < 4) {
+ SelBits *= 8;
+ Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
+ Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
+ } else if (SelBits < 7) {
+ SelBits = (SelBits & 3) * 8;
+ Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
+ Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
+ } else if (SelBits == 0x0c) {
+ Known.Zero |= 0xff << I;
+ } else if (SelBits > 0x0c) {
+ Known.One |= 0xff << I;
+ }
+ Sel >>= 8;
+ }
+ break;
+ }
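A standalone model of the byte selection that the PERM known-bits case above assumes; selector byte values 7 through 0x0b are deliberately left unmodeled, so this is a sketch rather than the full V_PERM_B32 semantics (permSelectKnownCases is hypothetical):

#include <cstdint>

static uint32_t permSelectKnownCases(uint32_t Lhs, uint32_t Rhs, uint32_t Sel) {
  uint32_t Result = 0;
  for (unsigned I = 0; I < 32; I += 8) {
    unsigned SelBits = Sel & 0xff;
    uint32_t Byte = 0;                          // selectors 7..0x0b: not modeled
    if (SelBits < 4)                            // bytes 0-3 of the second operand
      Byte = (Rhs >> (SelBits * 8)) & 0xff;
    else if (SelBits < 7)                       // bytes 0-2 of the first operand
      Byte = (Lhs >> ((SelBits & 3) * 8)) & 0xff;
    else if (SelBits == 0x0c)                   // constant 0x00
      Byte = 0x00;
    else if (SelBits > 0x0c)                    // constant 0xff
      Byte = 0xff;
    Result |= Byte << I;
    Sel >>= 8;
  }
  return Result;
}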
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
switch (IID) {
case Intrinsic::amdgcn_mbcnt_lo:
case Intrinsic::amdgcn_mbcnt_hi: {
+ const GCNSubtarget &ST =
+ DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
// These return at most the wavefront size - 1.
unsigned Size = Op.getValueType().getSizeInBits();
- Known.Zero.setHighBits(Size - Subtarget->getWavefrontSizeLog2());
+ Known.Zero.setHighBits(Size - ST.getWavefrontSizeLog2());
break;
}
default:
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 5c31bddd9b1a..a4c3b413e103 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Interface definition of the TargetLowering class that is common
+/// Interface definition of the TargetLowering class that is common
/// to all AMD GPUs.
//
//===----------------------------------------------------------------------===//
@@ -28,6 +28,8 @@ struct ArgDescriptor;
class AMDGPUTargetLowering : public TargetLowering {
private:
+ const AMDGPUSubtarget *Subtarget;
+
/// \returns AMDGPUISD::FFBH_U32 node if the incoming \p Op may have been
/// legalized from a smaller type VT. Need to match pre-legalized type because
/// the generic legalization inserts the add/sub between the select and
@@ -39,12 +41,11 @@ public:
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG);
protected:
- const AMDGPUSubtarget *Subtarget;
AMDGPUAS AMDGPUASI;
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
- /// \brief Split a vector store into multiple scalar stores.
+ /// Split a vector store into multiple scalar stores.
/// \returns The resulting chain.
SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const;
@@ -78,7 +79,6 @@ protected:
bool shouldCombineMemoryType(EVT VT) const;
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
- SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL,
@@ -87,6 +87,7 @@ protected:
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -96,6 +97,7 @@ protected:
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
@@ -108,10 +110,10 @@ protected:
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const;
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const;
- /// \brief Split a vector load into 2 loads of half the vector.
+ /// Split a vector load into 2 loads of half the vector.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const;
- /// \brief Split a vector store into 2 stores of half the vector.
+ /// Split a vector store into 2 stores of half the vector.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
@@ -120,8 +122,11 @@ protected:
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const;
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &Results) const;
- void analyzeFormalArgumentsCompute(CCState &State,
- const SmallVectorImpl<ISD::InputArg> &Ins) const;
+
+ void analyzeFormalArgumentsCompute(
+ CCState &State,
+ const SmallVectorImpl<ISD::InputArg> &Ins) const;
+
public:
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);
@@ -136,6 +141,10 @@ public:
return false;
}
+ static inline SDValue stripBitcast(SDValue Val) {
+ return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
+ }
+
static bool allUsesHaveSourceMods(const SDNode *N,
unsigned CostThreshold = 4);
bool isFAbsFree(EVT VT) const override;
@@ -146,7 +155,6 @@ public:
bool isZExtFree(Type *Src, Type *Dest) const override;
bool isZExtFree(EVT Src, EVT Dest) const override;
bool isZExtFree(SDValue Val, EVT VT2) const override;
- bool isFPExtFoldable(unsigned Opcode, EVT DestVT, EVT SrcVT) const override;
bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
@@ -168,6 +176,7 @@ public:
bool isCheapToSpeculateCttz() const override;
bool isCheapToSpeculateCtlz() const override;
+ bool isSDNodeAlwaysUniform(const SDNode *N) const override;
static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
@@ -224,7 +233,7 @@ public:
virtual SDNode *PostISelFolding(MachineSDNode *N,
SelectionDAG &DAG) const = 0;
- /// \brief Determine which of the bits specified in \p Mask are known to be
+ /// Determine which of the bits specified in \p Mask are known to be
/// either zero or one and return them in the \p KnownZero and \p KnownOne
/// bitsets.
void computeKnownBitsForTargetNode(const SDValue Op,
@@ -237,7 +246,7 @@ public:
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
- /// \brief Helper function that adds Reg to the LiveIn list of the DAG's
+ /// Helper function that adds Reg to the LiveIn list of the DAG's
/// MachineFunction.
///
/// \returns a RegisterSDNode representing Reg if \p RawReg is true, otherwise
@@ -285,9 +294,9 @@ public:
GRID_OFFSET,
};
- /// \brief Helper function that returns the byte offset of the given
+ /// Helper function that returns the byte offset of the given
/// type of implicit parameter.
- uint32_t getImplicitParameterOffset(const AMDGPUMachineFunction *MFI,
+ uint32_t getImplicitParameterOffset(const MachineFunction &MF,
const ImplicitParameter Param) const;
AMDGPUAS getAMDGPUAS() const {
@@ -357,6 +366,7 @@ enum NodeType : unsigned {
FMED3,
SMED3,
UMED3,
+ FDOT2,
URECIP,
DIV_SCALE,
DIV_FMAS,
@@ -372,6 +382,7 @@ enum NodeType : unsigned {
RSQ,
RCP_LEGACY,
RSQ_LEGACY,
+ RCP_IFLAG,
FMUL_LEGACY,
RSQ_CLAMP,
LDEXP,
@@ -396,6 +407,7 @@ enum NodeType : unsigned {
MAD_I64_I32,
MUL_LOHI_I24,
MUL_LOHI_U24,
+ PERM,
TEXTURE_FETCH,
EXPORT, // exp on SI+
EXPORT_DONE, // exp on SI+ with done bit set
@@ -417,6 +429,10 @@ enum NodeType : unsigned {
// Convert two float 32 numbers into a single register holding two packed f16
// with round to zero.
CVT_PKRTZ_F16_F32,
+ CVT_PKNORM_I16_F32,
+ CVT_PKNORM_U16_F32,
+ CVT_PK_I16_I32,
+ CVT_PK_U16_U32,
// Same as the standard node, except the high bits of the resulting integer
// are known 0.
@@ -451,14 +467,21 @@ enum NodeType : unsigned {
LOAD_CONSTANT,
TBUFFER_STORE_FORMAT,
TBUFFER_STORE_FORMAT_X3,
+ TBUFFER_STORE_FORMAT_D16,
TBUFFER_LOAD_FORMAT,
+ TBUFFER_LOAD_FORMAT_D16,
ATOMIC_CMP_SWAP,
ATOMIC_INC,
ATOMIC_DEC,
+ ATOMIC_LOAD_FADD,
+ ATOMIC_LOAD_FMIN,
+ ATOMIC_LOAD_FMAX,
BUFFER_LOAD,
BUFFER_LOAD_FORMAT,
+ BUFFER_LOAD_FORMAT_D16,
BUFFER_STORE,
BUFFER_STORE_FORMAT,
+ BUFFER_STORE_FORMAT_D16,
BUFFER_ATOMIC_SWAP,
BUFFER_ATOMIC_ADD,
BUFFER_ATOMIC_SUB,
@@ -470,6 +493,7 @@ enum NodeType : unsigned {
BUFFER_ATOMIC_OR,
BUFFER_ATOMIC_XOR,
BUFFER_ATOMIC_CMPSWAP,
+
LAST_AMDGPU_ISD_NUMBER
};
diff --git a/lib/Target/AMDGPU/AMDGPUInline.cpp b/lib/Target/AMDGPU/AMDGPUInline.cpp
index ff9e7b50ed5c..35dd9eb0a478 100644
--- a/lib/Target/AMDGPU/AMDGPUInline.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInline.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief This is AMDGPU specific replacement of the standard inliner.
+/// This is an AMDGPU-specific replacement of the standard inliner.
/// The main purpose is to account for the fact that calls are not only expensive
/// on the AMDGPU, but much more expensive if a private memory pointer is
/// passed to a function as an argument. In this situation, we are unable to
@@ -161,8 +161,8 @@ static bool isWrapperOnlyCall(CallSite CS) {
return false;
}
if (isa<ReturnInst>(*std::next(I->getIterator()))) {
- DEBUG(dbgs() << " Wrapper only call detected: "
- << Callee->getName() << '\n');
+ LLVM_DEBUG(dbgs() << " Wrapper only call detected: "
+ << Callee->getName() << '\n');
return true;
}
}
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
index 8156599528c2..07aa7c2cc8ad 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -16,95 +16,36 @@
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;
-#define GET_INSTRINFO_CTOR_DTOR
-#include "AMDGPUGenInstrInfo.inc"
-
// Pin the vtable to this file.
-void AMDGPUInstrInfo::anchor() {}
-
-AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &ST)
- : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
- ST(ST),
- AMDGPUASI(ST.getAMDGPUAS()) {}
-
-// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
-// the first 16 loads will be interleaved with the stores, and the next 16 will
-// be clustered as expected. It should really split into 2 16 store batches.
-//
-// Loads are clustered until this returns false, rather than trying to schedule
-// groups of stores. This also means we have to deal with saying different
-// address space loads should be clustered, and ones which might cause bank
-// conflicts.
-//
-// This might be deprecated so it might not be worth that much effort to fix.
-bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
- int64_t Offset0, int64_t Offset1,
- unsigned NumLoads) const {
- assert(Offset1 > Offset0 &&
- "Second offset should be larger than first offset!");
- // If we have less than 16 loads in a row, and the offsets are within 64
- // bytes, then schedule together.
-
- // A cacheline is 64 bytes (for global memory).
- return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
-}
-
-// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
-enum SIEncodingFamily {
- SI = 0,
- VI = 1,
- SDWA = 2,
- SDWA9 = 3,
- GFX9 = 4
-};
-
-static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) {
- switch (ST.getGeneration()) {
- case AMDGPUSubtarget::SOUTHERN_ISLANDS:
- case AMDGPUSubtarget::SEA_ISLANDS:
- return SIEncodingFamily::SI;
- case AMDGPUSubtarget::VOLCANIC_ISLANDS:
- case AMDGPUSubtarget::GFX9:
- return SIEncodingFamily::VI;
-
- // FIXME: This should never be called for r600 GPUs.
- case AMDGPUSubtarget::R600:
- case AMDGPUSubtarget::R700:
- case AMDGPUSubtarget::EVERGREEN:
- case AMDGPUSubtarget::NORTHERN_ISLANDS:
- return SIEncodingFamily::SI;
- }
-
- llvm_unreachable("Unknown subtarget generation!");
-}
-
-int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const {
- SIEncodingFamily Gen = subtargetEncodingFamily(ST);
+//void AMDGPUInstrInfo::anchor() {}
- if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
- ST.getGeneration() >= AMDGPUSubtarget::GFX9)
- Gen = SIEncodingFamily::GFX9;
+AMDGPUInstrInfo::AMDGPUInstrInfo(const GCNSubtarget &ST) { }
- if (get(Opcode).TSFlags & SIInstrFlags::SDWA)
- Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9
- : SIEncodingFamily::SDWA;
- int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
+// TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence.
+bool AMDGPUInstrInfo::isUniformMMO(const MachineMemOperand *MMO) {
+ const Value *Ptr = MMO->getValue();
+ // UndefValue means this is a load of a kernel input. These are uniform.
+ // Sometimes LDS instructions have constant pointers.
+ // If Ptr is null, then that means this mem operand contains a
+ // PseudoSourceValue like GOT.
+ if (!Ptr || isa<UndefValue>(Ptr) ||
+ isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
+ return true;
- // -1 means that Opcode is already a native instruction.
- if (MCOp == -1)
- return Opcode;
+ if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
+ return true;
- // (uint16_t)-1 means that Opcode is a pseudo instruction that has
- // no encoding in the given subtarget generation.
- if (MCOp == (uint16_t)-1)
- return -1;
+ if (const Argument *Arg = dyn_cast<Argument>(Ptr))
+ return AMDGPU::isArgPassedInSGPR(Arg);
- return MCOp;
+ const Instruction *I = dyn_cast<Instruction>(Ptr);
+ return I && I->getMetadata("amdgpu.uniform");
}
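
A minimal illustrative caller for the new helper (not part of the patch; it mirrors the static isInstrUniform copy in AMDGPUInstructionSelector.cpp below and assumes the usual LLVM CodeGen headers):

    #include "AMDGPUInstrInfo.h"
    #include "llvm/CodeGen/MachineInstr.h"

    // Sketch: a load is treated as uniform iff its single memory operand passes
    // AMDGPUInstrInfo::isUniformMMO (null/undef/constant pointer, the 32-bit
    // constant address space, an SGPR-passed argument, or explicit
    // amdgpu.uniform metadata).
    static bool isUniformLoadLike(const llvm::MachineInstr &MI) {
      if (!MI.hasOneMemOperand())
        return false;
      return llvm::AMDGPUInstrInfo::isUniformMMO(*MI.memoperands_begin());
    }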
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index a9fcd4834638..2f8166da0d33 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Contains the definition of a TargetInstrInfo class that is common
+/// Contains the definition of a TargetInstrInfo class that is common
/// to all AMD GPUs.
//
//===----------------------------------------------------------------------===//
@@ -20,37 +20,43 @@
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
-#define GET_INSTRINFO_HEADER
-#include "AMDGPUGenInstrInfo.inc"
-#undef GET_INSTRINFO_HEADER
-
namespace llvm {
-class AMDGPUSubtarget;
+class GCNSubtarget;
class MachineFunction;
class MachineInstr;
class MachineInstrBuilder;
-class AMDGPUInstrInfo : public AMDGPUGenInstrInfo {
-private:
- const AMDGPUSubtarget &ST;
+class AMDGPUInstrInfo {
+public:
+ explicit AMDGPUInstrInfo(const GCNSubtarget &st);
- virtual void anchor();
-protected:
- AMDGPUAS AMDGPUASI;
+ static bool isUniformMMO(const MachineMemOperand *MMO);
+};
-public:
- explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st);
+namespace AMDGPU {
- bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
- int64_t Offset1, int64_t Offset2,
- unsigned NumLoads) const override;
+struct RsrcIntrinsic {
+ unsigned Intr;
+ uint8_t RsrcArg;
+ bool IsImage;
+};
+const RsrcIntrinsic *lookupRsrcIntrinsic(unsigned Intr);
+
+struct D16ImageDimIntrinsic {
+ unsigned Intr;
+ unsigned D16HelperIntr;
+};
+const D16ImageDimIntrinsic *lookupD16ImageDimIntrinsic(unsigned Intr);
- /// \brief Return a target-specific opcode if Opcode is a pseudo instruction.
- /// Return -1 if the target-specific opcode for the pseudo instruction does
- /// not exist. If Opcode is not a pseudo instruction, this is identity.
- int pseudoToMCOpcode(int Opcode) const;
+struct ImageDimIntrinsicInfo {
+ unsigned Intr;
+ unsigned BaseOpcode;
+ MIMGDim Dim;
};
+const ImageDimIntrinsicInfo *getImageDimIntrinsicInfo(unsigned Intr);
+
+} // end AMDGPU namespace
} // End llvm namespace
#endif
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index c024010f3e96..96b7568eec1f 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -35,6 +35,10 @@ def AMDGPUFPPackOp : SDTypeProfile<1, 2,
[SDTCisFP<1>, SDTCisSameAs<1, 2>]
>;
+def AMDGPUIntPackOp : SDTypeProfile<1, 2,
+ [SDTCisInt<1>, SDTCisSameAs<1, 2>]
+>;
+
def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
[SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
>;
@@ -136,12 +140,18 @@ def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>;
def AMDGPUrcp_legacy : SDNode<"AMDGPUISD::RCP_LEGACY", SDTFPUnaryOp>;
def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>;
+def AMDGPUrcp_iflag : SDNode<"AMDGPUISD::RCP_IFLAG", SDTFPUnaryOp>;
+
// out = 1.0 / sqrt(a) result clamped to +/- max_float.
def AMDGPUrsq_clamp : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>;
def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;
def AMDGPUpkrtz_f16_f32 : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>;
+def AMDGPUpknorm_i16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>;
+def AMDGPUpknorm_u16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>;
+def AMDGPUpk_i16_i32 : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>;
+def AMDGPUpk_u16_u32 : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>;
def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>;
def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>;
@@ -160,8 +170,6 @@ def AMDGPUfmul_legacy : SDNode<"AMDGPUISD::FMUL_LEGACY", SDTFPBinOp,
[SDNPCommutative, SDNPAssociative]
>;
-def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>;
-
// out = min(a, b) a and b are floats, where a nan comparison fails.
def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp,
[]
@@ -333,6 +341,13 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp,
def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
+def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2",
+ SDTypeProfile<1, 3, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>,
+ SDTCisFP<0>, SDTCisVec<1>]>,
+ []>;
+
+def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
+
def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC",
SDTypeProfile<0, 1, [SDTCisInt<0>]>,
[SDNPHasChain, SDNPInGlue]>;
diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 16d240e96196..219d430fbb39 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -17,6 +17,12 @@
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
+#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -30,10 +36,48 @@
using namespace llvm;
+#define GET_GLOBALISEL_IMPL
+#define AMDGPUSubtarget GCNSubtarget
+#include "AMDGPUGenGlobalISel.inc"
+#undef GET_GLOBALISEL_IMPL
+#undef AMDGPUSubtarget
+
AMDGPUInstructionSelector::AMDGPUInstructionSelector(
- const SISubtarget &STI, const AMDGPURegisterBankInfo &RBI)
+ const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
+ const AMDGPUTargetMachine &TM)
: InstructionSelector(), TII(*STI.getInstrInfo()),
- TRI(*STI.getRegisterInfo()), RBI(RBI), AMDGPUASI(STI.getAMDGPUAS()) {}
+ TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
+ STI(STI),
+ EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
+#define GET_GLOBALISEL_PREDICATES_INIT
+#include "AMDGPUGenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATES_INIT
+#define GET_GLOBALISEL_TEMPORARIES_INIT
+#include "AMDGPUGenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_INIT
+ ,AMDGPUASI(STI.getAMDGPUAS())
+{
+}
+
+const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
+
+bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ I.setDesc(TII.get(TargetOpcode::COPY));
+ for (const MachineOperand &MO : I.operands()) {
+ if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+ continue;
+
+ const TargetRegisterClass *RC =
+ TRI.getConstrainedRegClassForOperand(MO, MRI);
+ if (!RC)
+ continue;
+ RBI.constrainGenericRegister(MO.getReg(), *RC, MRI);
+ }
+ return true;
+}
MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
@@ -71,6 +115,10 @@ AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
}
}
+static int64_t getConstant(const MachineInstr *MI) {
+ return MI->getOperand(1).getCImm()->getSExtValue();
+}
+
bool AMDGPUInstructionSelector::selectG_ADD(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
MachineFunction *MF = BB->getParent();
@@ -118,12 +166,144 @@ bool AMDGPUInstructionSelector::selectG_GEP(MachineInstr &I) const {
return selectG_ADD(I);
}
+bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const MachineOperand &MO = I.getOperand(0);
+ const TargetRegisterClass *RC =
+ TRI.getConstrainedRegClassForOperand(MO, MRI);
+ if (RC)
+ RBI.constrainGenericRegister(MO.getReg(), *RC, MRI);
+ I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
+ return true;
+}
+
+bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I,
+ CodeGenCoverage &CoverageInfo) const {
+ unsigned IntrinsicID = I.getOperand(1).getIntrinsicID();
+
+ switch (IntrinsicID) {
+ default:
+ break;
+ case Intrinsic::maxnum:
+ case Intrinsic::minnum:
+ case Intrinsic::amdgcn_cvt_pkrtz:
+ return selectImpl(I, CoverageInfo);
+
+ case Intrinsic::amdgcn_kernarg_segment_ptr: {
+ MachineFunction *MF = I.getParent()->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ const ArgDescriptor *InputPtrReg;
+ const TargetRegisterClass *RC;
+ const DebugLoc &DL = I.getDebugLoc();
+
+ std::tie(InputPtrReg, RC)
+ = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
+ if (!InputPtrReg)
+ report_fatal_error("missing kernarg segment ptr");
+
+ BuildMI(*I.getParent(), &I, DL, TII.get(AMDGPU::COPY))
+ .add(I.getOperand(0))
+ .addReg(MRI.getLiveInVirtReg(InputPtrReg->getRegister()));
+ I.eraseFromParent();
+ return true;
+ }
+ }
+ return false;
+}
+
+static MachineInstr *
+buildEXP(const TargetInstrInfo &TII, MachineInstr *Insert, unsigned Tgt,
+ unsigned Reg0, unsigned Reg1, unsigned Reg2, unsigned Reg3,
+ unsigned VM, bool Compr, unsigned Enabled, bool Done) {
+ const DebugLoc &DL = Insert->getDebugLoc();
+ MachineBasicBlock &BB = *Insert->getParent();
+ unsigned Opcode = Done ? AMDGPU::EXP_DONE : AMDGPU::EXP;
+ return BuildMI(BB, Insert, DL, TII.get(Opcode))
+ .addImm(Tgt)
+ .addReg(Reg0)
+ .addReg(Reg1)
+ .addReg(Reg2)
+ .addReg(Reg3)
+ .addImm(VM)
+ .addImm(Compr)
+ .addImm(Enabled);
+}
+
+bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
+ MachineInstr &I,
+ CodeGenCoverage &CoverageInfo) const {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ unsigned IntrinsicID = I.getOperand(0).getIntrinsicID();
+ switch (IntrinsicID) {
+ case Intrinsic::amdgcn_exp: {
+ int64_t Tgt = getConstant(MRI.getVRegDef(I.getOperand(1).getReg()));
+ int64_t Enabled = getConstant(MRI.getVRegDef(I.getOperand(2).getReg()));
+ int64_t Done = getConstant(MRI.getVRegDef(I.getOperand(7).getReg()));
+ int64_t VM = getConstant(MRI.getVRegDef(I.getOperand(8).getReg()));
+
+ MachineInstr *Exp = buildEXP(TII, &I, Tgt, I.getOperand(3).getReg(),
+ I.getOperand(4).getReg(),
+ I.getOperand(5).getReg(),
+ I.getOperand(6).getReg(),
+ VM, false, Enabled, Done);
+
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI);
+ }
+ case Intrinsic::amdgcn_exp_compr: {
+ const DebugLoc &DL = I.getDebugLoc();
+ int64_t Tgt = getConstant(MRI.getVRegDef(I.getOperand(1).getReg()));
+ int64_t Enabled = getConstant(MRI.getVRegDef(I.getOperand(2).getReg()));
+ unsigned Reg0 = I.getOperand(3).getReg();
+ unsigned Reg1 = I.getOperand(4).getReg();
+ unsigned Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ int64_t Done = getConstant(MRI.getVRegDef(I.getOperand(5).getReg()));
+ int64_t VM = getConstant(MRI.getVRegDef(I.getOperand(6).getReg()));
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
+ MachineInstr *Exp = buildEXP(TII, &I, Tgt, Reg0, Reg1, Undef, Undef, VM,
+ true, Enabled, Done);
+
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI);
+ }
+ }
+ return false;
+}
+
bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
DebugLoc DL = I.getDebugLoc();
+ unsigned StoreSize = RBI.getSizeInBits(I.getOperand(0).getReg(), MRI, TRI);
+ unsigned Opcode;
// FIXME: Select store instruction based on address space
- MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(AMDGPU::FLAT_STORE_DWORD))
+ switch (StoreSize) {
+ default:
+ return false;
+ case 32:
+ Opcode = AMDGPU::FLAT_STORE_DWORD;
+ break;
+ case 64:
+ Opcode = AMDGPU::FLAT_STORE_DWORDX2;
+ break;
+ case 96:
+ Opcode = AMDGPU::FLAT_STORE_DWORDX3;
+ break;
+ case 128:
+ Opcode = AMDGPU::FLAT_STORE_DWORDX4;
+ break;
+ }
+
+ MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(Opcode))
.add(I.getOperand(1))
.add(I.getOperand(0))
.addImm(0) // offset
@@ -143,36 +323,67 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineOperand &ImmOp = I.getOperand(1);
+
+ // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
+ if (ImmOp.isFPImm()) {
+ const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
+ ImmOp.ChangeToImmediate(Imm.getZExtValue());
+ } else if (ImmOp.isCImm()) {
+ ImmOp.ChangeToImmediate(ImmOp.getCImm()->getZExtValue());
+ }
+
unsigned DstReg = I.getOperand(0).getReg();
- unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI);
+ unsigned Size;
+ bool IsSgpr;
+ const RegisterBank *RB = MRI.getRegBankOrNull(I.getOperand(0).getReg());
+ if (RB) {
+ IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID;
+ Size = MRI.getType(DstReg).getSizeInBits();
+ } else {
+ const TargetRegisterClass *RC = TRI.getRegClassForReg(MRI, DstReg);
+ IsSgpr = TRI.isSGPRClass(RC);
+ Size = TRI.getRegSizeInBits(*RC);
+ }
+ if (Size != 32 && Size != 64)
+ return false;
+
+ unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
if (Size == 32) {
- I.setDesc(TII.get(AMDGPU::S_MOV_B32));
+ I.setDesc(TII.get(Opcode));
+ I.addImplicitDefUseOperands(*MF);
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
- assert(Size == 64);
-
DebugLoc DL = I.getDebugLoc();
- unsigned LoReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- unsigned HiReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- const APInt &Imm = I.getOperand(1).getCImm()->getValue();
+ const TargetRegisterClass *RC = IsSgpr ? &AMDGPU::SReg_32_XM0RegClass :
+ &AMDGPU::VGPR_32RegClass;
+ unsigned LoReg = MRI.createVirtualRegister(RC);
+ unsigned HiReg = MRI.createVirtualRegister(RC);
+ const APInt &Imm = APInt(Size, I.getOperand(1).getImm());
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), LoReg)
+ BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
.addImm(Imm.trunc(32).getZExtValue());
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
+ BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
.addImm(Imm.ashr(32).getZExtValue());
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
- .addReg(LoReg)
- .addImm(AMDGPU::sub0)
- .addReg(HiReg)
- .addImm(AMDGPU::sub1);
+ const MachineInstr *RS =
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
+ .addReg(LoReg)
+ .addImm(AMDGPU::sub0)
+ .addReg(HiReg)
+ .addImm(AMDGPU::sub1);
+
// We can't call constrainSelectedInstRegOperands here, because it doesn't
// work for target independent opcodes
I.eraseFromParent();
- return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, MRI);
+ const TargetRegisterClass *DstRC =
+ TRI.getConstrainedRegClassForOperand(RS->getOperand(0), MRI);
+ if (!DstRC)
+ return true;
+ return RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
}
static bool isConstant(const MachineInstr &MI) {
@@ -228,6 +439,9 @@ static bool isInstrUniform(const MachineInstr &MI) {
isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
return true;
+ if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
+ return true;
+
const Instruction *I = dyn_cast<Instruction>(Ptr);
return I && I->getMetadata("amdgpu.uniform");
}
@@ -292,7 +506,8 @@ bool AMDGPUInstructionSelector::selectSMRD(MachineInstr &I,
if (!I.hasOneMemOperand())
return false;
- if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS)
+ if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
+ (*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT)
return false;
if (!isInstrUniform(I))
@@ -303,7 +518,7 @@ bool AMDGPUInstructionSelector::selectSMRD(MachineInstr &I,
MachineBasicBlock *BB = I.getParent();
MachineFunction *MF = BB->getParent();
- const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
+ const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
MachineRegisterInfo &MRI = MF->getRegInfo();
unsigned DstReg = I.getOperand(0).getReg();
const DebugLoc &DL = I.getDebugLoc();
@@ -405,18 +620,30 @@ bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const {
bool AMDGPUInstructionSelector::select(MachineInstr &I,
CodeGenCoverage &CoverageInfo) const {
- if (!isPreISelGenericOpcode(I.getOpcode()))
+ if (!isPreISelGenericOpcode(I.getOpcode())) {
+ if (I.isCopy())
+ return selectCOPY(I);
return true;
+ }
switch (I.getOpcode()) {
default:
- break;
+ return selectImpl(I, CoverageInfo);
case TargetOpcode::G_ADD:
return selectG_ADD(I);
+ case TargetOpcode::G_BITCAST:
+ return selectCOPY(I);
case TargetOpcode::G_CONSTANT:
+ case TargetOpcode::G_FCONSTANT:
return selectG_CONSTANT(I);
case TargetOpcode::G_GEP:
return selectG_GEP(I);
+ case TargetOpcode::G_IMPLICIT_DEF:
+ return selectG_IMPLICIT_DEF(I);
+ case TargetOpcode::G_INTRINSIC:
+ return selectG_INTRINSIC(I, CoverageInfo);
+ case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
+ return selectG_INTRINSIC_W_SIDE_EFFECTS(I, CoverageInfo);
case TargetOpcode::G_LOAD:
return selectG_LOAD(I);
case TargetOpcode::G_STORE:
@@ -424,3 +651,47 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I,
}
return false;
}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
+ }};
+
+}
+
+///
+/// This will select either an SGPR or VGPR operand and will save us from
+/// having to write an extra tablegen pattern.
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // src0_mods
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
+ }};
+}
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
+ }};
+}
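
A toy, self-contained model of the renderer-lambda idiom these selectVOP3* hooks rely on (illustrative only; the names below are invented, and the real ComplexRendererFns type is defined in InstructionSelector.h):

    #include <cstdio>
    #include <functional>
    #include <vector>

    // Stand-in for MachineInstrBuilder: just records appended immediates.
    struct ToyBuilder {
      std::vector<long> Ops;
      ToyBuilder &addImm(long V) { Ops.push_back(V); return *this; }
    };
    using ToyRendererFns = std::vector<std::function<void(ToyBuilder &)>>;

    // Analogue of selectVOP3Mods0: render the matched source operand, then
    // default src0_mods / clamp / omod immediates of zero.
    static ToyRendererFns toyVOP3Mods0(long Root) {
      return {[=](ToyBuilder &B) { B.addImm(Root); },
              [=](ToyBuilder &B) { B.addImm(0); },  // src0_mods
              [=](ToyBuilder &B) { B.addImm(0); },  // clamp
              [=](ToyBuilder &B) { B.addImm(0); }}; // omod
    }

    int main() {
      ToyBuilder B;
      for (auto &Fn : toyVOP3Mods0(42)) // the generated selector applies these in order
        Fn(B);
      std::printf("rendered %zu operands\n", B.Ops.size());
    }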
diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 715c4882f380..68b40b20aca2 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -15,27 +15,39 @@
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRUCTIONSELECTOR_H
#include "AMDGPU.h"
+#include "AMDGPUArgumentUsageInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+namespace {
+#define GET_GLOBALISEL_PREDICATE_BITSET
+#define AMDGPUSubtarget GCNSubtarget
+#include "AMDGPUGenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATE_BITSET
+#undef AMDGPUSubtarget
+}
+
namespace llvm {
class AMDGPUInstrInfo;
class AMDGPURegisterBankInfo;
+class GCNSubtarget;
class MachineInstr;
class MachineOperand;
class MachineRegisterInfo;
class SIInstrInfo;
+class SIMachineFunctionInfo;
class SIRegisterInfo;
-class SISubtarget;
class AMDGPUInstructionSelector : public InstructionSelector {
public:
- AMDGPUInstructionSelector(const SISubtarget &STI,
- const AMDGPURegisterBankInfo &RBI);
+ AMDGPUInstructionSelector(const GCNSubtarget &STI,
+ const AMDGPURegisterBankInfo &RBI,
+ const AMDGPUTargetMachine &TM);
bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override;
+ static const char *getName();
private:
struct GEPInfo {
@@ -46,10 +58,18 @@ private:
GEPInfo(const MachineInstr &GEP) : GEP(GEP), Imm(0) { }
};
+ /// tblgen-erated 'select' implementation.
+ bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
+
MachineOperand getSubOperand64(MachineOperand &MO, unsigned SubIdx) const;
+ bool selectCOPY(MachineInstr &I) const;
bool selectG_CONSTANT(MachineInstr &I) const;
bool selectG_ADD(MachineInstr &I) const;
bool selectG_GEP(MachineInstr &I) const;
+ bool selectG_IMPLICIT_DEF(MachineInstr &I) const;
+ bool selectG_INTRINSIC(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
+ bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I,
+ CodeGenCoverage &CoverageInfo) const;
bool hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const;
void getAddrModeInfo(const MachineInstr &Load, const MachineRegisterInfo &MRI,
SmallVectorImpl<GEPInfo> &AddrInfo) const;
@@ -57,9 +77,35 @@ private:
bool selectG_LOAD(MachineInstr &I) const;
bool selectG_STORE(MachineInstr &I) const;
+ InstructionSelector::ComplexRendererFns
+ selectVCSRC(MachineOperand &Root) const;
+
+ InstructionSelector::ComplexRendererFns
+ selectVSRC0(MachineOperand &Root) const;
+
+ InstructionSelector::ComplexRendererFns
+ selectVOP3Mods0(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectVOP3OMods(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectVOP3Mods(MachineOperand &Root) const;
+
const SIInstrInfo &TII;
const SIRegisterInfo &TRI;
const AMDGPURegisterBankInfo &RBI;
+ const AMDGPUTargetMachine &TM;
+ const GCNSubtarget &STI;
+ bool EnableLateStructurizeCFG;
+#define GET_GLOBALISEL_PREDICATES_DECL
+#define AMDGPUSubtarget GCNSubtarget
+#include "AMDGPUGenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATES_DECL
+#undef AMDGPUSubtarget
+
+#define GET_GLOBALISEL_TEMPORARIES_DECL
+#include "AMDGPUGenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_DECL
+
protected:
AMDGPUAS AMDGPUASI;
};
diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td
index 31f728b0c22f..9426df399597 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -42,6 +42,47 @@ class AMDGPUShaderInst <dag outs, dag ins, string asm = "",
field bits<32> Inst = 0xffffffff;
}
+//===---------------------------------------------------------------------===//
+// Return instruction
+//===---------------------------------------------------------------------===//
+
+class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern>
+: Instruction {
+
+ let Namespace = "AMDGPU";
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ let Pattern = pattern;
+ let AsmString = !strconcat(asmstr, "\n");
+ let isPseudo = 1;
+ let Itinerary = NullALU;
+ bit hasIEEEFlag = 0;
+ bit hasZeroOpFlag = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let isCodeGenOnly = 1;
+}
+
+def TruePredicate : Predicate<"true">;
+
+// Exists to help track down where SubtargetPredicate isn't set rather
+// than letting tablegen crash with an unhelpful error.
+def InvalidPred : Predicate<"predicate not set on instruction or pattern">;
+
+class PredicateControl {
+ Predicate SubtargetPredicate = InvalidPred;
+ list<Predicate> AssemblerPredicates = [];
+ Predicate AssemblerPredicate = TruePredicate;
+ list<Predicate> OtherPredicates = [];
+ list<Predicate> Predicates = !listconcat([SubtargetPredicate,
+ AssemblerPredicate],
+ AssemblerPredicates,
+ OtherPredicates);
+}
+class AMDGPUPat<dag pattern, dag result> : Pat<pattern, result>,
+ PredicateControl;
+
def FP16Denormals : Predicate<"Subtarget->hasFP16Denormals()">;
def FP32Denormals : Predicate<"Subtarget->hasFP32Denormals()">;
def FP64Denormals : Predicate<"Subtarget->hasFP64Denormals()">;
@@ -52,7 +93,6 @@ def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
def FMA : Predicate<"Subtarget->hasFMA()">;
def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
-def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
def u16ImmTarget : AsmOperandClass {
let Name = "U16Imm";
@@ -95,12 +135,6 @@ def brtarget : Operand<OtherVT>;
// Misc. PatFrags
//===----------------------------------------------------------------------===//
-class HasOneUseUnaryOp<SDPatternOperator op> : PatFrag<
- (ops node:$src0),
- (op $src0),
- [{ return N->hasOneUse(); }]
->;
-
class HasOneUseBinOp<SDPatternOperator op> : PatFrag<
(ops node:$src0, node:$src1),
(op $src0, $src1),
@@ -113,8 +147,6 @@ class HasOneUseTernaryOp<SDPatternOperator op> : PatFrag<
[{ return N->hasOneUse(); }]
>;
-def trunc_oneuse : HasOneUseUnaryOp<trunc>;
-
let Properties = [SDNPCommutative, SDNPAssociative] in {
def smax_oneuse : HasOneUseBinOp<smax>;
def smin_oneuse : HasOneUseBinOp<smin>;
@@ -127,6 +159,7 @@ def or_oneuse : HasOneUseBinOp<or>;
def xor_oneuse : HasOneUseBinOp<xor>;
} // Properties = [SDNPCommutative, SDNPAssociative]
+def add_oneuse : HasOneUseBinOp<add>;
def sub_oneuse : HasOneUseBinOp<sub>;
def srl_oneuse : HasOneUseBinOp<srl>;
@@ -240,6 +273,37 @@ def COND_NULL : PatLeaf <
[{(void)N; return false;}]
>;
+//===----------------------------------------------------------------------===//
+// PatLeafs for Texture Constants
+//===----------------------------------------------------------------------===//
+
+def TEX_ARRAY : PatLeaf<
+ (imm),
+ [{uint32_t TType = (uint32_t)N->getZExtValue();
+ return TType == 9 || TType == 10 || TType == 16;
+ }]
+>;
+
+def TEX_RECT : PatLeaf<
+ (imm),
+ [{uint32_t TType = (uint32_t)N->getZExtValue();
+ return TType == 5;
+ }]
+>;
+
+def TEX_SHADOW : PatLeaf<
+ (imm),
+ [{uint32_t TType = (uint32_t)N->getZExtValue();
+ return (TType >= 6 && TType <= 8) || TType == 13;
+ }]
+>;
+
+def TEX_SHADOW_ARRAY : PatLeaf<
+ (imm),
+ [{uint32_t TType = (uint32_t)N->getZExtValue();
+ return TType == 11 || TType == 12 || TType == 17;
+ }]
+>;
//===----------------------------------------------------------------------===//
// Load/Store Pattern Fragments
@@ -249,6 +313,10 @@ class Aligned8Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{
return cast<MemSDNode>(N)->getAlignment() % 8 == 0;
}]>;
+class Aligned16Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{
+ return cast<MemSDNode>(N)->getAlignment() >= 16;
+}]>;
+
class LoadFrag <SDPatternOperator op> : PatFrag<(ops node:$ptr), (op node:$ptr)>;
class StoreFrag<SDPatternOperator op> : PatFrag <
@@ -361,21 +429,31 @@ def az_extloadi8_local : LocalLoad <az_extloadi8>;
def sextloadi8_local : LocalLoad <sextloadi8>;
def az_extloadi16_local : LocalLoad <az_extloadi16>;
def sextloadi16_local : LocalLoad <sextloadi16>;
+def atomic_load_32_local : LocalLoad<atomic_load_32>;
+def atomic_load_64_local : LocalLoad<atomic_load_64>;
def store_local : LocalStore <store>;
def truncstorei8_local : LocalStore <truncstorei8>;
def truncstorei16_local : LocalStore <truncstorei16>;
def store_local_hi16 : StoreHi16 <truncstorei16>, LocalAddress;
def truncstorei8_local_hi16 : StoreHi16<truncstorei8>, LocalAddress;
+def atomic_store_local : LocalStore <atomic_store>;
def load_align8_local : Aligned8Bytes <
(ops node:$ptr), (load_local node:$ptr)
>;
+def load_align16_local : Aligned16Bytes <
+ (ops node:$ptr), (load_local node:$ptr)
+>;
+
def store_align8_local : Aligned8Bytes <
(ops node:$val, node:$ptr), (store_local node:$val, node:$ptr)
>;
+def store_align16_local : Aligned16Bytes <
+ (ops node:$val, node:$ptr), (store_local node:$val, node:$ptr)
+>;
def load_flat : FlatLoad <load>;
def az_extloadi8_flat : FlatLoad <az_extloadi8>;
@@ -571,6 +649,18 @@ multiclass BFIPatterns <Instruction BFI_INT,
(BFI_INT $x, $y, $z)
>;
+ // 64-bit version
+ def : AMDGPUPat <
+ (or (and i64:$y, i64:$x), (and i64:$z, (not i64:$x))),
+ (REG_SEQUENCE RC64,
+ (BFI_INT (i32 (EXTRACT_SUBREG $x, sub0)),
+ (i32 (EXTRACT_SUBREG $y, sub0)),
+ (i32 (EXTRACT_SUBREG $z, sub0))), sub0,
+ (BFI_INT (i32 (EXTRACT_SUBREG $x, sub1)),
+ (i32 (EXTRACT_SUBREG $y, sub1)),
+ (i32 (EXTRACT_SUBREG $z, sub1))), sub1)
+ >;
+
// SHA-256 Ch function
// z ^ (x & (y ^ z))
def : AMDGPUPat <
@@ -578,6 +668,18 @@ multiclass BFIPatterns <Instruction BFI_INT,
(BFI_INT $x, $y, $z)
>;
+ // 64-bit version
+ def : AMDGPUPat <
+ (xor i64:$z, (and i64:$x, (xor i64:$y, i64:$z))),
+ (REG_SEQUENCE RC64,
+ (BFI_INT (i32 (EXTRACT_SUBREG $x, sub0)),
+ (i32 (EXTRACT_SUBREG $y, sub0)),
+ (i32 (EXTRACT_SUBREG $z, sub0))), sub0,
+ (BFI_INT (i32 (EXTRACT_SUBREG $x, sub1)),
+ (i32 (EXTRACT_SUBREG $y, sub1)),
+ (i32 (EXTRACT_SUBREG $z, sub1))), sub1)
+ >;
+
def : AMDGPUPat <
(fcopysign f32:$src0, f32:$src1),
(BFI_INT (LoadImm32 (i32 0x7fffffff)), $src0, $src1)
@@ -611,10 +713,25 @@ multiclass BFIPatterns <Instruction BFI_INT,
// SHA-256 Ma patterns
// ((x & z) | (y & (x | z))) -> BFI_INT (XOR x, y), z, y
-class SHA256MaPattern <Instruction BFI_INT, Instruction XOR> : AMDGPUPat <
- (or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))),
- (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y)
->;
+multiclass SHA256MaPattern <Instruction BFI_INT, Instruction XOR, RegisterClass RC64> {
+ def : AMDGPUPat <
+ (or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))),
+ (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y)
+ >;
+
+ def : AMDGPUPat <
+ (or (and i64:$x, i64:$z), (and i64:$y, (or i64:$x, i64:$z))),
+ (REG_SEQUENCE RC64,
+ (BFI_INT (XOR (i32 (EXTRACT_SUBREG $x, sub0)),
+ (i32 (EXTRACT_SUBREG $y, sub0))),
+ (i32 (EXTRACT_SUBREG $z, sub0)),
+ (i32 (EXTRACT_SUBREG $y, sub0))), sub0,
+ (BFI_INT (XOR (i32 (EXTRACT_SUBREG $x, sub1)),
+ (i32 (EXTRACT_SUBREG $y, sub1))),
+ (i32 (EXTRACT_SUBREG $z, sub1)),
+ (i32 (EXTRACT_SUBREG $y, sub1))), sub1)
+ >;
+}
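
The new 64-bit patterns above lean on two bit identities; a standalone check in plain C++ (illustrative only, not part of the patch):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // BFI_INT semantics: take bits of 'a' where 'mask' is set, bits of 'b' elsewhere.
    static uint32_t bfi(uint32_t Mask, uint32_t A, uint32_t B) {
      return (Mask & A) | (~Mask & B);
    }

    int main() {
      const uint32_t Vals[] = {0u, 1u, 0x80000000u, 0xdeadbeefu, 0xffffffffu};
      for (uint32_t X : Vals)
        for (uint32_t Y : Vals)
          for (uint32_t Z : Vals) {
            // SHA-256 Ch: z ^ (x & (y ^ z)) picks y where x is set, z elsewhere.
            assert((Z ^ (X & (Y ^ Z))) == bfi(X, Y, Z));
            // SHA-256 Ma: (x & z) | (y & (x | z)) == BFI(x ^ y, z, y).
            assert(((X & Z) | (Y & (X | Z))) == bfi(X ^ Y, Z, Y));
          }
      std::puts("Ch/Ma identities hold");
    }

The 64-bit variants simply apply the same 32-bit BFI to each half via EXTRACT_SUBREG/REG_SEQUENCE, which is valid because both identities are purely bitwise.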
// Bitfield extract patterns
@@ -633,14 +750,33 @@ multiclass BFEPattern <Instruction UBFE, Instruction SBFE, Instruction MOV> {
(UBFE $src, $rshift, (MOV (i32 (IMMPopCount $mask))))
>;
+ // x & ((1 << y) - 1)
+ def : AMDGPUPat <
+ (and i32:$src, (add_oneuse (shl_oneuse 1, i32:$width), -1)),
+ (UBFE $src, (MOV (i32 0)), $width)
+ >;
+
+ // x & ~(-1 << y)
+ def : AMDGPUPat <
+ (and i32:$src, (xor_oneuse (shl_oneuse -1, i32:$width), -1)),
+ (UBFE $src, (MOV (i32 0)), $width)
+ >;
+
+ // x & (-1 >> (bitwidth - y))
+ def : AMDGPUPat <
+ (and i32:$src, (srl_oneuse -1, (sub 32, i32:$width))),
+ (UBFE $src, (MOV (i32 0)), $width)
+ >;
+
+ // x << (bitwidth - y) >> (bitwidth - y)
def : AMDGPUPat <
(srl (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)),
- (UBFE $src, (i32 0), $width)
+ (UBFE $src, (MOV (i32 0)), $width)
>;
def : AMDGPUPat <
(sra (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)),
- (SBFE $src, (i32 0), $width)
+ (SBFE $src, (MOV (i32 0)), $width)
>;
}
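
The four new mask forms above all describe the same unsigned bitfield extract of the low 'width' bits; a quick standalone check (illustrative only; shifts are kept in the 1..31 range to avoid undefined behaviour in C++):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint32_t Vals[] = {0u, 1u, 0x80000000u, 0xdeadbeefu, 0xffffffffu};
      for (uint32_t X : Vals)
        for (unsigned W = 1; W < 32; ++W) {
          uint32_t Ref = X & ((1u << W) - 1u);          // x & ((1 << y) - 1)
          assert((X & ~(~0u << W)) == Ref);             // x & ~(-1 << y)
          assert((X & (~0u >> (32 - W))) == Ref);       // x & (-1 >> (32 - y))
          assert(((X << (32 - W)) >> (32 - W)) == Ref); // shift up, logical shift back down
        }
      std::puts("ubfe identities hold");
    }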
@@ -697,11 +833,3 @@ class RsqPat<Instruction RsqInst, ValueType vt> : AMDGPUPat <
(AMDGPUrcp (fsqrt vt:$src)),
(RsqInst $src)
>;
-
-include "R600Instructions.td"
-include "R700Instructions.td"
-include "EvergreenInstructions.td"
-include "CaymanInstructions.td"
-
-include "SIInstrInfo.td"
-
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
index 86dc9bd9ea74..896e2055cf62 100644
--- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
@@ -8,7 +8,7 @@
//==-----------------------------------------------------------------------===//
//
/// \file
-/// \brief AMDGPU Implementation of the IntrinsicInfo class.
+/// AMDGPU Implementation of the IntrinsicInfo class.
//
//===-----------------------------------------------------------------------===//
@@ -25,13 +25,13 @@ AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo()
static const char *const IntrinsicNameTable[] = {
#define GET_INTRINSIC_NAME_TABLE
-#include "AMDGPUGenIntrinsics.inc"
+#include "AMDGPUGenIntrinsicImpl.inc"
#undef GET_INTRINSIC_NAME_TABLE
};
namespace {
#define GET_INTRINSIC_ATTRIBUTES
-#include "AMDGPUGenIntrinsics.inc"
+#include "AMDGPUGenIntrinsicImpl.inc"
#undef GET_INTRINSIC_ATTRIBUTES
}
@@ -80,7 +80,7 @@ unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData,
bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const {
// Overload Table
#define GET_INTRINSIC_OVERLOAD_TABLE
-#include "AMDGPUGenIntrinsics.inc"
+#include "AMDGPUGenIntrinsicImpl.inc"
#undef GET_INTRINSIC_OVERLOAD_TABLE
}
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
index 6cb8b9644642..ef42f9a319af 100644
--- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
@@ -8,7 +8,7 @@
//==-----------------------------------------------------------------------===//
//
/// \file
-/// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class.
+/// Interface for the AMDGPU Implementation of the Intrinsic Info class.
//
//===-----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINTRINSICINFO_H
@@ -24,7 +24,7 @@ namespace AMDGPUIntrinsic {
enum ID {
last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1,
#define GET_INTRINSIC_ENUM_VALUES
-#include "AMDGPUGenIntrinsics.inc"
+#include "AMDGPUGenIntrinsicEnums.inc"
#undef GET_INTRINSIC_ENUM_VALUES
, num_AMDGPU_intrinsics
};
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/lib/Target/AMDGPU/AMDGPUIntrinsics.td
index 18c9bd933af2..230a04628504 100644
--- a/lib/Target/AMDGPU/AMDGPUIntrinsics.td
+++ b/lib/Target/AMDGPU/AMDGPUIntrinsics.td
@@ -13,7 +13,4 @@
let TargetPrefix = "AMDGPU", isTarget = 1 in {
def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>;
- def int_AMDGPU_kilp : Intrinsic<[], [], []>;
}
-
-include "SIIntrinsics.td"
diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index b4704f6feb92..87b072c9ea20 100644
--- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -12,7 +12,9 @@
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//
+#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
+#include "AMDGPUTargetMachine.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
@@ -20,19 +22,46 @@
#include "llvm/Support/Debug.h"
using namespace llvm;
+using namespace LegalizeActions;
-AMDGPULegalizerInfo::AMDGPULegalizerInfo() {
+AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
+ const GCNTargetMachine &TM) {
using namespace TargetOpcode;
- const LLT S1= LLT::scalar(1);
+ auto GetAddrSpacePtr = [&TM](unsigned AS) {
+ return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
+ };
+
+ auto AMDGPUAS = ST.getAMDGPUAS();
+
+ const LLT S1 = LLT::scalar(1);
const LLT V2S16 = LLT::vector(2, 16);
+
const LLT S32 = LLT::scalar(32);
const LLT S64 = LLT::scalar(64);
- const LLT P1 = LLT::pointer(1, 64);
- const LLT P2 = LLT::pointer(2, 64);
+ const LLT S512 = LLT::scalar(512);
+
+ const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
+ const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
+ const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
+ const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS.FLAT_ADDRESS);
+ const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS.PRIVATE_ADDRESS);
+
+ const LLT AddrSpaces[] = {
+ GlobalPtr,
+ ConstantPtr,
+ LocalPtr,
+ FlatPtr,
+ PrivatePtr
+ };
setAction({G_ADD, S32}, Legal);
+ setAction({G_ASHR, S32}, Legal);
+ setAction({G_SUB, S32}, Legal);
+ setAction({G_MUL, S32}, Legal);
setAction({G_AND, S32}, Legal);
+ setAction({G_OR, S32}, Legal);
+ setAction({G_XOR, S32}, Legal);
setAction({G_BITCAST, V2S16}, Legal);
setAction({G_BITCAST, 1, S32}, Legal);
@@ -40,41 +69,88 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() {
setAction({G_BITCAST, S32}, Legal);
setAction({G_BITCAST, 1, V2S16}, Legal);
+ getActionDefinitionsBuilder(G_FCONSTANT)
+ .legalFor({S32, S64});
+
+ // G_IMPLICIT_DEF is a no-op so we can make it legal for any value type that
+ // can fit in a register.
+ // FIXME: We need to legalize several more operations before we can add
+ // a test case for size > 512.
+ getActionDefinitionsBuilder(G_IMPLICIT_DEF)
+ .legalIf([=](const LegalityQuery &Query) {
+ return Query.Types[0].getSizeInBits() <= 512;
+ })
+ .clampScalar(0, S1, S512);
+
+ getActionDefinitionsBuilder(G_CONSTANT)
+ .legalFor({S1, S32, S64});
+
// FIXME: i1 operands to intrinsics should always be legal, but other i1
// values may not be legal. We need to figure out how to distinguish
// between these two scenarios.
setAction({G_CONSTANT, S1}, Legal);
- setAction({G_CONSTANT, S32}, Legal);
- setAction({G_CONSTANT, S64}, Legal);
-
- setAction({G_FCONSTANT, S32}, Legal);
setAction({G_FADD, S32}, Legal);
+ setAction({G_FCMP, S1}, Legal);
+ setAction({G_FCMP, 1, S32}, Legal);
+ setAction({G_FCMP, 1, S64}, Legal);
+
setAction({G_FMUL, S32}, Legal);
- setAction({G_GEP, P1}, Legal);
- setAction({G_GEP, P2}, Legal);
- setAction({G_GEP, 1, S64}, Legal);
+ setAction({G_ZEXT, S64}, Legal);
+ setAction({G_ZEXT, 1, S32}, Legal);
+
+ setAction({G_FPTOSI, S32}, Legal);
+ setAction({G_FPTOSI, 1, S32}, Legal);
+
+ setAction({G_SITOFP, S32}, Legal);
+ setAction({G_SITOFP, 1, S32}, Legal);
+
+ setAction({G_FPTOUI, S32}, Legal);
+ setAction({G_FPTOUI, 1, S32}, Legal);
+
+ for (LLT PtrTy : AddrSpaces) {
+ LLT IdxTy = LLT::scalar(PtrTy.getSizeInBits());
+ setAction({G_GEP, PtrTy}, Legal);
+ setAction({G_GEP, 1, IdxTy}, Legal);
+ }
setAction({G_ICMP, S1}, Legal);
setAction({G_ICMP, 1, S32}, Legal);
- setAction({G_LOAD, P1}, Legal);
- setAction({G_LOAD, P2}, Legal);
- setAction({G_LOAD, S32}, Legal);
- setAction({G_LOAD, 1, P1}, Legal);
- setAction({G_LOAD, 1, P2}, Legal);
- setAction({G_OR, S32}, Legal);
+ getActionDefinitionsBuilder({G_LOAD, G_STORE})
+ .legalIf([=, &ST](const LegalityQuery &Query) {
+ const LLT &Ty0 = Query.Types[0];
+
+ // TODO: Decompose private loads into 4-byte components.
+ // TODO: Illegal flat loads on SI
+ switch (Ty0.getSizeInBits()) {
+ case 32:
+ case 64:
+ case 128:
+ return true;
+
+ case 96:
+ // XXX hasLoadX3
+ return (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS);
+
+ case 256:
+ case 512:
+ // TODO: constant loads
+ default:
+ return false;
+ }
+ });
+
+
setAction({G_SELECT, S32}, Legal);
setAction({G_SELECT, 1, S1}, Legal);
setAction({G_SHL, S32}, Legal);
- setAction({G_STORE, S32}, Legal);
- setAction({G_STORE, 1, P1}, Legal);
// FIXME: When RegBankSelect inserts copies, it will only create new
// registers with scalar types. This means we can end up with
@@ -83,8 +159,54 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() {
// if it sees a generic instruction which isn't legal, so we need to
// tell it that scalar types are legal for pointer operands
setAction({G_GEP, S64}, Legal);
- setAction({G_LOAD, 1, S64}, Legal);
- setAction({G_STORE, 1, S64}, Legal);
+
+ for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
+ getActionDefinitionsBuilder(Op)
+ .legalIf([=](const LegalityQuery &Query) {
+ const LLT &VecTy = Query.Types[1];
+ const LLT &IdxTy = Query.Types[2];
+ return VecTy.getSizeInBits() % 32 == 0 &&
+ VecTy.getSizeInBits() <= 512 &&
+ IdxTy.getSizeInBits() == 32;
+ });
+ }
+
+ // FIXME: Doesn't handle extract of illegal sizes.
+ getActionDefinitionsBuilder({G_EXTRACT, G_INSERT})
+ .legalIf([=](const LegalityQuery &Query) {
+ const LLT &Ty0 = Query.Types[0];
+ const LLT &Ty1 = Query.Types[1];
+ return (Ty0.getSizeInBits() % 32 == 0) &&
+ (Ty1.getSizeInBits() % 32 == 0);
+ });
+
+ // Merge/Unmerge
+ for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
+ unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
+ unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
+
+ getActionDefinitionsBuilder(Op)
+ .legalIf([=](const LegalityQuery &Query) {
+ const LLT &BigTy = Query.Types[BigTyIdx];
+ const LLT &LitTy = Query.Types[LitTyIdx];
+ return BigTy.getSizeInBits() % 32 == 0 &&
+ LitTy.getSizeInBits() % 32 == 0 &&
+ BigTy.getSizeInBits() <= 512;
+ })
+ // Any vectors left are the wrong size. Scalarize them.
+ .fewerElementsIf([](const LegalityQuery &Query) { return true; },
+ [](const LegalityQuery &Query) {
+ return std::make_pair(
+ 0, Query.Types[0].getElementType());
+ })
+ .fewerElementsIf([](const LegalityQuery &Query) { return true; },
+ [](const LegalityQuery &Query) {
+ return std::make_pair(
+ 1, Query.Types[1].getElementType());
+ });
+
+ }
computeTables();
+ verify(*ST.getInstrInfo());
}
diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 291e3361f163..1cbd37c42c4b 100644
--- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -19,12 +19,15 @@
namespace llvm {
+class GCNTargetMachine;
class LLVMContext;
+class GCNSubtarget;
/// This class provides the information for the target register banks.
class AMDGPULegalizerInfo : public LegalizerInfo {
public:
- AMDGPULegalizerInfo();
+ AMDGPULegalizerInfo(const GCNSubtarget &ST,
+ const GCNTargetMachine &TM);
};
} // End llvm namespace.
#endif
diff --git a/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index f594767c8edb..7a7ed7a4f065 100644
--- a/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief This file does AMD library function optimizations.
+/// This file does AMD library function optimizations.
//
//===----------------------------------------------------------------------===//
@@ -765,8 +765,7 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
ArrayRef<double> tmp(DVal);
nval = ConstantDataVector::get(context, tmp);
}
- DEBUG(errs() << "AMDIC: " << *CI
- << " ---> " << *nval << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
replaceCall(nval);
return true;
}
@@ -776,8 +775,7 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
for (int i = 0; i < sz; ++i) {
if (CF->isExactlyValue(ftbl[i].input)) {
Value *nval = ConstantFP::get(CF->getType(), ftbl[i].result);
- DEBUG(errs() << "AMDIC: " << *CI
- << " ---> " << *nval << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
replaceCall(nval);
return true;
}
@@ -798,11 +796,11 @@ bool AMDGPULibCalls::replaceWithNative(CallInst *CI, const FuncInfo &FInfo) {
AMDGPULibFunc nf = FInfo;
nf.setPrefix(AMDGPULibFunc::NATIVE);
if (Constant *FPExpr = getFunction(M, nf)) {
- DEBUG(dbgs() << "AMDIC: " << *CI << " ---> ");
+ LLVM_DEBUG(dbgs() << "AMDIC: " << *CI << " ---> ");
CI->setCalledFunction(FPExpr);
- DEBUG(dbgs() << *CI << '\n');
+ LLVM_DEBUG(dbgs() << *CI << '\n');
return true;
}
@@ -820,8 +818,7 @@ bool AMDGPULibCalls::fold_recip(CallInst *CI, IRBuilder<> &B,
Value *nval = B.CreateFDiv(ConstantFP::get(CF->getType(), 1.0),
opr0,
"recip2div");
- DEBUG(errs() << "AMDIC: " << *CI
- << " ---> " << *nval << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
replaceCall(nval);
return true;
}
@@ -899,7 +896,7 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
if ((CF && CF->isZero()) || (CINT && ci_opr1 == 0) || CZero) {
// pow/powr/pown(x, 0) == 1
- DEBUG(errs() << "AMDIC: " << *CI << " ---> 1\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> 1\n");
Constant *cnval = ConstantFP::get(eltType, 1.0);
if (getVecSize(FInfo) > 1) {
cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
@@ -909,23 +906,21 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
}
if ((CF && CF->isExactlyValue(1.0)) || (CINT && ci_opr1 == 1)) {
// pow/powr/pown(x, 1.0) = x
- DEBUG(errs() << "AMDIC: " << *CI
- << " ---> " << *opr0 << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << "\n");
replaceCall(opr0);
return true;
}
if ((CF && CF->isExactlyValue(2.0)) || (CINT && ci_opr1 == 2)) {
// pow/powr/pown(x, 2.0) = x*x
- DEBUG(errs() << "AMDIC: " << *CI
- << " ---> " << *opr0 << " * " << *opr0 << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " * " << *opr0
+ << "\n");
Value *nval = B.CreateFMul(opr0, opr0, "__pow2");
replaceCall(nval);
return true;
}
if ((CF && CF->isExactlyValue(-1.0)) || (CINT && ci_opr1 == -1)) {
// pow/powr/pown(x, -1.0) = 1.0/x
- DEBUG(errs() << "AMDIC: " << *CI
- << " ---> 1 / " << *opr0 << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> 1 / " << *opr0 << "\n");
Constant *cnval = ConstantFP::get(eltType, 1.0);
if (getVecSize(FInfo) > 1) {
cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
@@ -942,8 +937,8 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
if (Constant *FPExpr = getFunction(M,
AMDGPULibFunc(issqrt ? AMDGPULibFunc::EI_SQRT
: AMDGPULibFunc::EI_RSQRT, FInfo))) {
- DEBUG(errs() << "AMDIC: " << *CI << " ---> "
- << FInfo.getName().c_str() << "(" << *opr0 << ")\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> "
+ << FInfo.getName().c_str() << "(" << *opr0 << ")\n");
Value *nval = CreateCallEx(B,FPExpr, opr0, issqrt ? "__pow2sqrt"
: "__pow2rsqrt");
replaceCall(nval);
@@ -999,8 +994,9 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
}
nval = B.CreateFDiv(cnval, nval, "__1powprod");
}
- DEBUG(errs() << "AMDIC: " << *CI << " ---> "
- << ((ci_opr1 < 0) ? "1/prod(" : "prod(") << *opr0 << ")\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> "
+ << ((ci_opr1 < 0) ? "1/prod(" : "prod(") << *opr0
+ << ")\n");
replaceCall(nval);
return true;
}
@@ -1137,8 +1133,8 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
nval = B.CreateBitCast(nval, opr0->getType());
}
- DEBUG(errs() << "AMDIC: " << *CI << " ---> "
- << "exp2(" << *opr1 << " * log2(" << *opr0 << "))\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> "
+ << "exp2(" << *opr1 << " * log2(" << *opr0 << "))\n");
replaceCall(nval);
return true;
@@ -1155,8 +1151,7 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B,
}
int ci_opr1 = (int)CINT->getSExtValue();
if (ci_opr1 == 1) { // rootn(x, 1) = x
- DEBUG(errs() << "AMDIC: " << *CI
- << " ---> " << *opr0 << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << "\n");
replaceCall(opr0);
return true;
}
@@ -1166,7 +1161,7 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B,
Module *M = CI->getModule();
if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_SQRT,
FInfo))) {
- DEBUG(errs() << "AMDIC: " << *CI << " ---> sqrt(" << *opr0 << ")\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> sqrt(" << *opr0 << ")\n");
Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2sqrt");
replaceCall(nval);
return true;
@@ -1175,13 +1170,13 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B,
Module *M = CI->getModule();
if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_CBRT,
FInfo))) {
- DEBUG(errs() << "AMDIC: " << *CI << " ---> cbrt(" << *opr0 << ")\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> cbrt(" << *opr0 << ")\n");
Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2cbrt");
replaceCall(nval);
return true;
}
} else if (ci_opr1 == -1) { // rootn(x, -1) = 1.0/x
- DEBUG(errs() << "AMDIC: " << *CI << " ---> 1.0 / " << *opr0 << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> 1.0 / " << *opr0 << "\n");
Value *nval = B.CreateFDiv(ConstantFP::get(opr0->getType(), 1.0),
opr0,
"__rootn2div");
@@ -1193,7 +1188,8 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B,
Module *M = CI->getModule();
if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_RSQRT,
FInfo))) {
- DEBUG(errs() << "AMDIC: " << *CI << " ---> rsqrt(" << *opr0 << ")\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> rsqrt(" << *opr0
+ << ")\n");
Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2rsqrt");
replaceCall(nval);
return true;
@@ -1212,22 +1208,22 @@ bool AMDGPULibCalls::fold_fma_mad(CallInst *CI, IRBuilder<> &B,
ConstantFP *CF1 = dyn_cast<ConstantFP>(opr1);
if ((CF0 && CF0->isZero()) || (CF1 && CF1->isZero())) {
// fma/mad(a, b, c) = c if a=0 || b=0
- DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr2 << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr2 << "\n");
replaceCall(opr2);
return true;
}
if (CF0 && CF0->isExactlyValue(1.0f)) {
// fma/mad(a, b, c) = b+c if a=1
- DEBUG(errs() << "AMDIC: " << *CI << " ---> "
- << *opr1 << " + " << *opr2 << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr1 << " + " << *opr2
+ << "\n");
Value *nval = B.CreateFAdd(opr1, opr2, "fmaadd");
replaceCall(nval);
return true;
}
if (CF1 && CF1->isExactlyValue(1.0f)) {
// fma/mad(a, b, c) = a+c if b=1
- DEBUG(errs() << "AMDIC: " << *CI << " ---> "
- << *opr0 << " + " << *opr2 << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " + " << *opr2
+ << "\n");
Value *nval = B.CreateFAdd(opr0, opr2, "fmaadd");
replaceCall(nval);
return true;
@@ -1235,8 +1231,8 @@ bool AMDGPULibCalls::fold_fma_mad(CallInst *CI, IRBuilder<> &B,
if (ConstantFP *CF = dyn_cast<ConstantFP>(opr2)) {
if (CF->isZero()) {
// fma/mad(a, b, c) = a*b if c=0
- DEBUG(errs() << "AMDIC: " << *CI << " ---> "
- << *opr0 << " * " << *opr1 << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " * "
+ << *opr1 << "\n");
Value *nval = B.CreateFMul(opr0, opr1, "fmamul");
replaceCall(nval);
return true;
@@ -1263,8 +1259,8 @@ bool AMDGPULibCalls::fold_sqrt(CallInst *CI, IRBuilder<> &B,
if (Constant *FPExpr = getNativeFunction(
CI->getModule(), AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) {
Value *opr0 = CI->getArgOperand(0);
- DEBUG(errs() << "AMDIC: " << *CI << " ---> "
- << "sqrt(" << *opr0 << ")\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> "
+ << "sqrt(" << *opr0 << ")\n");
Value *nval = CreateCallEx(B,FPExpr, opr0, "__sqrt");
replaceCall(nval);
return true;
@@ -1355,8 +1351,8 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
P = B.CreateAddrSpaceCast(Alloc, PTy);
CallInst *Call = CreateCallEx2(B, Fsincos, UI->getArgOperand(0), P);
- DEBUG(errs() << "AMDIC: fold_sincos (" << *CI << ", " << *UI
- << ") with " << *Call << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: fold_sincos (" << *CI << ", " << *UI << ") with "
+ << *Call << "\n");
if (!isSin) { // CI->cos, UI->sin
B.SetInsertPoint(&*ItOld);
@@ -1719,9 +1715,8 @@ bool AMDGPUSimplifyLibCalls::runOnFunction(Function &F) {
bool Changed = false;
auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- DEBUG(dbgs() << "AMDIC: process function ";
- F.printAsOperand(dbgs(), false, F.getParent());
- dbgs() << '\n';);
+ LLVM_DEBUG(dbgs() << "AMDIC: process function ";
+ F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << '\n';);
if (!EnablePreLink)
Changed |= setFastFlags(F, Options);
@@ -1737,8 +1732,8 @@ bool AMDGPUSimplifyLibCalls::runOnFunction(Function &F) {
Function *Callee = CI->getCalledFunction();
if (Callee == 0) continue;
- DEBUG(dbgs() << "AMDIC: try folding " << *CI << "\n";
- dbgs().flush());
+ LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << "\n";
+ dbgs().flush());
if(Simplifier.fold(CI, AA))
Changed = true;
}
diff --git a/lib/Target/AMDGPU/AMDGPULibFunc.h b/lib/Target/AMDGPU/AMDGPULibFunc.h
index 5405bc645714..fe062384800a 100644
--- a/lib/Target/AMDGPU/AMDGPULibFunc.h
+++ b/lib/Target/AMDGPU/AMDGPULibFunc.h
@@ -1,4 +1,4 @@
-//===-- AMDGPULibFunc.h ---------------------------------------------------===//
+//===-- AMDGPULibFunc.h ----------------------------------------*- C++ -*--===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
index 7e0e9802c0e6..2cec8fe53283 100644
--- a/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
+++ b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
@@ -117,7 +117,6 @@ bool AMDGPULowerIntrinsics::makeLIDRangeMetadata(Function &F) const {
return false;
const TargetMachine &TM = TPC->getTM<TargetMachine>();
- const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(F);
bool Changed = false;
for (auto *U : F.users()) {
@@ -125,7 +124,7 @@ bool AMDGPULowerIntrinsics::makeLIDRangeMetadata(Function &F) const {
if (!CI)
continue;
- Changed |= ST.makeLIDRangeMetadata(CI);
+ Changed |= AMDGPUSubtarget::get(TM, F).makeLIDRangeMetadata(CI);
}
return Changed;
}
diff --git a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
new file mode 100644
index 000000000000..8cc7e38f7b29
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -0,0 +1,264 @@
+//===-- AMDGPULowerKernelArguments.cpp ------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass replaces accesses to kernel arguments with loads from
+/// offsets from the kernarg base pointer.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+
+#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPULowerKernelArguments : public FunctionPass{
+public:
+ static char ID;
+
+ AMDGPULowerKernelArguments() : FunctionPass(ID) {}
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+ AU.setPreservesAll();
+ }
+};
+
+} // end anonymous namespace
+
+bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
+ CallingConv::ID CC = F.getCallingConv();
+ if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
+ return false;
+
+ auto &TPC = getAnalysis<TargetPassConfig>();
+
+ const TargetMachine &TM = TPC.getTM<TargetMachine>();
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ LLVMContext &Ctx = F.getParent()->getContext();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ BasicBlock &EntryBlock = *F.begin();
+ IRBuilder<> Builder(&*EntryBlock.begin());
+
+ const unsigned KernArgBaseAlign = 16; // FIXME: Increase if necessary
+ const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F);
+
+ unsigned MaxAlign;
+ // FIXME: Alignment is broken with explicit arg offset.
+ const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
+ if (TotalKernArgSize == 0)
+ return false;
+
+ CallInst *KernArgSegment =
+ Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, nullptr,
+ F.getName() + ".kernarg.segment");
+
+ KernArgSegment->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
+ KernArgSegment->addAttribute(AttributeList::ReturnIndex,
+ Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
+
+ unsigned AS = KernArgSegment->getType()->getPointerAddressSpace();
+ uint64_t ExplicitArgOffset = 0;
+
+ for (Argument &Arg : F.args()) {
+ Type *ArgTy = Arg.getType();
+ unsigned Align = DL.getABITypeAlignment(ArgTy);
+ unsigned Size = DL.getTypeSizeInBits(ArgTy);
+ unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
+
+ // Clover seems to always pad i8/i16 to i32, but doesn't properly align
+ // them?
+ // Make sure the struct elements have correct size and alignment for ext
+ // args. These seem to be padded up to 4-bytes but not correctly aligned.
+ bool IsExtArg = AllocSize < 32 && (Arg.hasZExtAttr() || Arg.hasSExtAttr()) &&
+ !ST.isAmdHsaOS();
+ if (IsExtArg)
+ AllocSize = 4;
+
+ uint64_t EltOffset = alignTo(ExplicitArgOffset, Align) + BaseOffset;
+ ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
+
+ if (Arg.use_empty())
+ continue;
+
+ if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
+ // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
+ // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
+ // can't represent this with range metadata because it's only allowed for
+ // integer types.
+ if (PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+ ST.getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ continue;
+
+ // FIXME: We can replace this with equivalent alias.scope/noalias
+ // metadata, but this appears to be a lot of work.
+ if (Arg.hasNoAliasAttr())
+ continue;
+ }
+
+ VectorType *VT = dyn_cast<VectorType>(ArgTy);
+ bool IsV3 = VT && VT->getNumElements() == 3;
+ VectorType *V4Ty = nullptr;
+
+ int64_t AlignDownOffset = alignDown(EltOffset, 4);
+ int64_t OffsetDiff = EltOffset - AlignDownOffset;
+ unsigned AdjustedAlign = MinAlign(KernArgBaseAlign, AlignDownOffset);
+
+ Value *ArgPtr;
+ if (Size < 32 && !ArgTy->isAggregateType()) { // FIXME: Handle aggregate types
+ // Since we don't have sub-dword scalar loads, avoid doing an extload by
+ // loading earlier than the argument address, and extracting the relevant
+ // bits.
+ //
+ // Additionally widen any sub-dword load to i32 even if suitably aligned,
+ // so that CSE between different argument loads works easily.
+
+ ArgPtr = Builder.CreateConstInBoundsGEP1_64(
+ KernArgSegment,
+ AlignDownOffset,
+ Arg.getName() + ".kernarg.offset.align.down");
+ ArgPtr = Builder.CreateBitCast(ArgPtr,
+ Builder.getInt32Ty()->getPointerTo(AS),
+ ArgPtr->getName() + ".cast");
+ } else {
+ ArgPtr = Builder.CreateConstInBoundsGEP1_64(
+ KernArgSegment,
+ AlignDownOffset,
+ Arg.getName() + ".kernarg.offset");
+ ArgPtr = Builder.CreateBitCast(ArgPtr, ArgTy->getPointerTo(AS),
+ ArgPtr->getName() + ".cast");
+ }
+
+ assert((!IsExtArg || !IsV3) && "incompatible situation");
+
+ if (IsV3 && Size >= 32) {
+ V4Ty = VectorType::get(VT->getVectorElementType(), 4);
+ // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
+ ArgPtr = Builder.CreateBitCast(ArgPtr, V4Ty->getPointerTo(AS));
+ }
+
+ LoadInst *Load = Builder.CreateAlignedLoad(ArgPtr, AdjustedAlign);
+ Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));
+
+ MDBuilder MDB(Ctx);
+
+ if (isa<PointerType>(ArgTy)) {
+ if (Arg.hasNonNullAttr())
+ Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {}));
+
+ uint64_t DerefBytes = Arg.getDereferenceableBytes();
+ if (DerefBytes != 0) {
+ Load->setMetadata(
+ LLVMContext::MD_dereferenceable,
+ MDNode::get(Ctx,
+ MDB.createConstant(
+ ConstantInt::get(Builder.getInt64Ty(), DerefBytes))));
+ }
+
+ uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
+ if (DerefOrNullBytes != 0) {
+ Load->setMetadata(
+ LLVMContext::MD_dereferenceable_or_null,
+ MDNode::get(Ctx,
+ MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
+ DerefOrNullBytes))));
+ }
+
+ unsigned ParamAlign = Arg.getParamAlignment();
+ if (ParamAlign != 0) {
+ Load->setMetadata(
+ LLVMContext::MD_align,
+ MDNode::get(Ctx,
+ MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
+ ParamAlign))));
+ }
+ }
+
+ // TODO: Convert noalias arg to !noalias
+
+ if (Size < 32 && !ArgTy->isAggregateType()) {
+ if (IsExtArg && OffsetDiff == 0) {
+ Type *I32Ty = Builder.getInt32Ty();
+ bool IsSext = Arg.hasSExtAttr();
+ Metadata *LowAndHigh[] = {
+ ConstantAsMetadata::get(
+ ConstantInt::get(I32Ty, IsSext ? minIntN(Size) : 0)),
+ ConstantAsMetadata::get(
+ ConstantInt::get(I32Ty,
+ IsSext ? maxIntN(Size) + 1 : maxUIntN(Size) + 1))
+ };
+
+ Load->setMetadata(LLVMContext::MD_range, MDNode::get(Ctx, LowAndHigh));
+ }
+
+ Value *ExtractBits = OffsetDiff == 0 ?
+ Load : Builder.CreateLShr(Load, OffsetDiff * 8);
+
+ IntegerType *ArgIntTy = Builder.getIntNTy(Size);
+ Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
+ Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy,
+ Arg.getName() + ".load");
+ Arg.replaceAllUsesWith(NewVal);
+ } else if (IsV3) {
+ Value *Shuf = Builder.CreateShuffleVector(Load, UndefValue::get(V4Ty),
+ {0, 1, 2},
+ Arg.getName() + ".load");
+ Arg.replaceAllUsesWith(Shuf);
+ } else {
+ Load->setName(Arg.getName() + ".load");
+ Arg.replaceAllUsesWith(Load);
+ }
+ }
+
+ KernArgSegment->addAttribute(
+ AttributeList::ReturnIndex,
+ Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
+
+ return true;
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,
+ "AMDGPU Lower Kernel Arguments", false, false)
+INITIALIZE_PASS_END(AMDGPULowerKernelArguments, DEBUG_TYPE, "AMDGPU Lower Kernel Arguments",
+ false, false)
+
+char AMDGPULowerKernelArguments::ID = 0;
+
+FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() {
+ return new AMDGPULowerKernelArguments();
+}
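
The sub-dword argument handling above packs an align-down, a widened i32 load, a shift, and a truncate into IRBuilder calls. As a plain illustration (standalone C++ with hypothetical offsets and values, not LLVM API code), the arithmetic the pass emits amounts to this:

    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Mimics loading an i16 kernel argument at byte offset 6 of the kernarg
    // segment: do an aligned i32 load at offset 4, then shift and truncate,
    // as the pass does for sub-dword, non-aggregate arguments.
    int main() {
      const uint8_t KernArgSegment[16] = {0, 1, 2, 3, 4, 5, 0x34, 0x12,
                                          8, 9, 10, 11, 12, 13, 14, 15};
      const uint64_t EltOffset = 6;                              // argument offset
      const uint64_t AlignDownOffset = EltOffset & ~uint64_t(3); // alignDown(6, 4) == 4
      const uint64_t OffsetDiff = EltOffset - AlignDownOffset;   // 2

      uint32_t Wide; // the widened i32 load
      std::memcpy(&Wide, KernArgSegment + AlignDownOffset, sizeof(Wide));

      // Shift the unwanted low bytes out and truncate to the argument width
      // (assumes a little-endian host, matching the GPU byte order).
      uint16_t Arg = uint16_t(Wide >> (OffsetDiff * 8));
      assert(Arg == 0x1234);
      std::printf("recovered argument: 0x%04x\n", (unsigned)Arg);
      return 0;
    }
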
diff --git a/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
new file mode 100644
index 000000000000..a43dcef4cf0b
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -0,0 +1,270 @@
+//===-- AMDGPULowerKernelAttributes.cpp -----------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass attempts to make use of reqd_work_group_size metadata
+/// to eliminate loads from the dispatch packet and to constant fold OpenCL
+/// get_local_size-like functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
+
+#define DEBUG_TYPE "amdgpu-lower-kernel-attributes"
+
+using namespace llvm;
+
+namespace {
+
+// Field offsets in hsa_kernel_dispatch_packet_t.
+enum DispatchPackedOffsets {
+ WORKGROUP_SIZE_X = 4,
+ WORKGROUP_SIZE_Y = 6,
+ WORKGROUP_SIZE_Z = 8,
+
+ GRID_SIZE_X = 12,
+ GRID_SIZE_Y = 16,
+ GRID_SIZE_Z = 20
+};
+
+class AMDGPULowerKernelAttributes : public ModulePass {
+ Module *Mod = nullptr;
+
+public:
+ static char ID;
+
+ AMDGPULowerKernelAttributes() : ModulePass(ID) {}
+
+ bool processUse(CallInst *CI);
+
+ bool doInitialization(Module &M) override;
+ bool runOnModule(Module &M) override;
+
+ StringRef getPassName() const override {
+ return "AMDGPU Kernel Attributes";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+};
+
+} // end anonymous namespace
+
+bool AMDGPULowerKernelAttributes::doInitialization(Module &M) {
+ Mod = &M;
+ return false;
+}
+
+bool AMDGPULowerKernelAttributes::processUse(CallInst *CI) {
+ Function *F = CI->getParent()->getParent();
+
+ auto MD = F->getMetadata("reqd_work_group_size");
+ const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;
+
+ const bool HasUniformWorkGroupSize =
+ F->getFnAttribute("uniform-work-group-size").getValueAsString() == "true";
+
+ if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize)
+ return false;
+
+ Value *WorkGroupSizeX = nullptr;
+ Value *WorkGroupSizeY = nullptr;
+ Value *WorkGroupSizeZ = nullptr;
+
+ Value *GridSizeX = nullptr;
+ Value *GridSizeY = nullptr;
+ Value *GridSizeZ = nullptr;
+
+ const DataLayout &DL = Mod->getDataLayout();
+
+ // We expect to see several GEP users, casted to the appropriate type and
+ // loaded.
+ for (User *U : CI->users()) {
+ if (!U->hasOneUse())
+ continue;
+
+ int64_t Offset = 0;
+ if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
+ continue;
+
+ auto *BCI = dyn_cast<BitCastInst>(*U->user_begin());
+ if (!BCI || !BCI->hasOneUse())
+ continue;
+
+ auto *Load = dyn_cast<LoadInst>(*BCI->user_begin());
+ if (!Load || !Load->isSimple())
+ continue;
+
+ unsigned LoadSize = DL.getTypeStoreSize(Load->getType());
+
+ // TODO: Handle merged loads.
+ switch (Offset) {
+ case WORKGROUP_SIZE_X:
+ if (LoadSize == 2)
+ WorkGroupSizeX = Load;
+ break;
+ case WORKGROUP_SIZE_Y:
+ if (LoadSize == 2)
+ WorkGroupSizeY = Load;
+ break;
+ case WORKGROUP_SIZE_Z:
+ if (LoadSize == 2)
+ WorkGroupSizeZ = Load;
+ break;
+ case GRID_SIZE_X:
+ if (LoadSize == 4)
+ GridSizeX = Load;
+ break;
+ case GRID_SIZE_Y:
+ if (LoadSize == 4)
+ GridSizeY = Load;
+ break;
+ case GRID_SIZE_Z:
+ if (LoadSize == 4)
+ GridSizeZ = Load;
+ break;
+ default:
+ break;
+ }
+ }
+
+ // Pattern match the code used to handle partial workgroup dispatches in the
+ // library implementation of get_local_size, so the entire function can be
+ // constant folded with a known group size.
+ //
+ // uint r = grid_size - group_id * group_size;
+ // get_local_size = (r < group_size) ? r : group_size;
+ //
+ // If we have uniform-work-group-size (which is the default in OpenCL 1.2),
+ // the grid_size is required to be a multiple of group_size. In this case:
+ //
+ // grid_size - (group_id * group_size) < group_size
+ // ->
+ // grid_size < group_size + (group_id * group_size)
+ //
+ // (grid_size / group_size) < 1 + group_id
+ //
+ // grid_size / group_size is at least 1, so we can conclude the select
+ // condition is false (except for group_id == 0, where the select result is
+ // the same).
+
+ bool MadeChange = false;
+ Value *WorkGroupSizes[3] = { WorkGroupSizeX, WorkGroupSizeY, WorkGroupSizeZ };
+ Value *GridSizes[3] = { GridSizeX, GridSizeY, GridSizeZ };
+
+ for (int I = 0; HasUniformWorkGroupSize && I < 3; ++I) {
+ Value *GroupSize = WorkGroupSizes[I];
+ Value *GridSize = GridSizes[I];
+ if (!GroupSize || !GridSize)
+ continue;
+
+ for (User *U : GroupSize->users()) {
+ auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
+ if (!ZextGroupSize)
+ continue;
+
+ for (User *ZextUser : ZextGroupSize->users()) {
+ auto *SI = dyn_cast<SelectInst>(ZextUser);
+ if (!SI)
+ continue;
+
+ using namespace llvm::PatternMatch;
+ auto GroupIDIntrin = I == 0 ?
+ m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>() :
+ (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>() :
+ m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
+
+ auto SubExpr = m_Sub(m_Specific(GridSize),
+ m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize)));
+
+ ICmpInst::Predicate Pred;
+ if (match(SI,
+ m_Select(m_ICmp(Pred, SubExpr, m_Specific(ZextGroupSize)),
+ SubExpr,
+ m_Specific(ZextGroupSize))) &&
+ Pred == ICmpInst::ICMP_ULT) {
+ if (HasReqdWorkGroupSize) {
+ ConstantInt *KnownSize
+ = mdconst::extract<ConstantInt>(MD->getOperand(I));
+ SI->replaceAllUsesWith(ConstantExpr::getIntegerCast(KnownSize,
+ SI->getType(),
+ false));
+ } else {
+ SI->replaceAllUsesWith(ZextGroupSize);
+ }
+
+ MadeChange = true;
+ }
+ }
+ }
+ }
+
+ if (!HasReqdWorkGroupSize)
+ return MadeChange;
+
+ // Eliminate any other loads we can from the dispatch packet.
+ for (int I = 0; I < 3; ++I) {
+ Value *GroupSize = WorkGroupSizes[I];
+ if (!GroupSize)
+ continue;
+
+ ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
+ GroupSize->replaceAllUsesWith(
+ ConstantExpr::getIntegerCast(KnownSize,
+ GroupSize->getType(),
+ false));
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
+
+// TODO: Move makeLIDRangeMetadata usage into here. Seems to not get
+// TargetPassConfig for subtarget.
+bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
+ StringRef DispatchPtrName
+ = Intrinsic::getName(Intrinsic::amdgcn_dispatch_ptr);
+
+ Function *DispatchPtr = Mod->getFunction(DispatchPtrName);
+ if (!DispatchPtr) // Dispatch ptr not used.
+ return false;
+
+ bool MadeChange = false;
+
+ SmallPtrSet<Instruction *, 4> HandledUses;
+ for (auto *U : DispatchPtr->users()) {
+ CallInst *CI = cast<CallInst>(U);
+ if (HandledUses.insert(CI).second) {
+ if (processUse(CI))
+ MadeChange = true;
+ }
+ }
+
+ return MadeChange;
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
+ "AMDGPU IR optimizations", false, false)
+INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE, "AMDGPU IR optimizations",
+ false, false)
+
+char AMDGPULowerKernelAttributes::ID = 0;
+
+ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
+ return new AMDGPULowerKernelAttributes();
+}
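
The comment block justifying the get_local_size fold can be checked numerically. A small standalone C++ sketch (hypothetical grid and group sizes, not code from this patch) evaluates the library's partial-workgroup expression and confirms it always yields group_size when grid_size is an exact multiple, which is the value the pass substitutes under uniform-work-group-size:

    #include <cassert>
    #include <cstdint>

    // Library pattern for get_local_size with partial-workgroup handling:
    //   r = grid_size - group_id * group_size;
    //   local_size = (r < group_size) ? r : group_size;
    static uint32_t libraryLocalSize(uint32_t GridSize, uint32_t GroupSize,
                                     uint32_t GroupId) {
      uint32_t R = GridSize - GroupId * GroupSize;
      return R < GroupSize ? R : GroupSize;
    }

    int main() {
      const uint32_t GroupSize = 64;
      const uint32_t GridSize = 64 * 100; // uniform work-group size: exact multiple

      // With a uniform work-group size the select always takes the group_size
      // side, so get_local_size can be folded to group_size (or to the
      // reqd_work_group_size constant when that metadata is present).
      for (uint32_t GroupId = 0; GroupId < GridSize / GroupSize; ++GroupId)
        assert(libraryLocalSize(GridSize, GroupSize, GroupId) == GroupSize);
      return 0;
    }
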
diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 23fd8113932c..1876dc3f7122 100644
--- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -8,16 +8,17 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Code to lower AMDGPU MachineInstrs to their corresponding MCInst.
+/// Code to lower AMDGPU MachineInstrs to their corresponding MCInst.
//
//===----------------------------------------------------------------------===//
//
-#include "AMDGPUMCInstLower.h"
#include "AMDGPUAsmPrinter.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "InstPrinter/AMDGPUInstPrinter.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "R600AsmPrinter.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -36,9 +37,43 @@
using namespace llvm;
+namespace {
+
+class AMDGPUMCInstLower {
+ MCContext &Ctx;
+ const TargetSubtargetInfo &ST;
+ const AsmPrinter &AP;
+
+ const MCExpr *getLongBranchBlockExpr(const MachineBasicBlock &SrcBB,
+ const MachineOperand &MO) const;
+
+public:
+ AMDGPUMCInstLower(MCContext &ctx, const TargetSubtargetInfo &ST,
+ const AsmPrinter &AP);
+
+ bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
+
+ /// Lower a MachineInstr to an MCInst
+ void lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+};
+
+class R600MCInstLower : public AMDGPUMCInstLower {
+public:
+ R600MCInstLower(MCContext &ctx, const R600Subtarget &ST,
+ const AsmPrinter &AP);
+
+ /// Lower a MachineInstr to an MCInst
+ void lower(const MachineInstr *MI, MCInst &OutMI) const;
+};
+
+
+} // End anonymous namespace
+
#include "AMDGPUGenMCPseudoLowering.inc"
-AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st,
+AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx,
+ const TargetSubtargetInfo &st,
const AsmPrinter &ap):
Ctx(ctx), ST(st), AP(ap) { }
@@ -129,7 +164,7 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO,
void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
unsigned Opcode = MI->getOpcode();
- const auto *TII = ST.getInstrInfo();
+ const auto *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
// FIXME: Should be able to handle this with emitPseudoExpansionLowering. We
// need to select it to the subtarget specific version, and there's no way to
@@ -169,16 +204,18 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
bool AMDGPUAsmPrinter::lowerOperand(const MachineOperand &MO,
MCOperand &MCOp) const {
- const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>();
+ const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>();
AMDGPUMCInstLower MCInstLowering(OutContext, STI, *this);
return MCInstLowering.lowerOperand(MO, MCOp);
}
-const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) {
+static const MCExpr *lowerAddrSpaceCast(const TargetMachine &TM,
+ const Constant *CV,
+ MCContext &OutContext) {
// TargetMachine does not support llvm-style cast. Use C++-style cast.
// This is safe since TM is always of type AMDGPUTargetMachine or its
// derived class.
- auto *AT = static_cast<AMDGPUTargetMachine*>(&TM);
+ auto &AT = static_cast<const AMDGPUTargetMachine&>(TM);
auto *CE = dyn_cast<ConstantExpr>(CV);
// Lower null pointers in private and local address space.
@@ -187,12 +224,18 @@ const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) {
if (CE && CE->getOpcode() == Instruction::AddrSpaceCast) {
auto Op = CE->getOperand(0);
auto SrcAddr = Op->getType()->getPointerAddressSpace();
- if (Op->isNullValue() && AT->getNullPointerValue(SrcAddr) == 0) {
+ if (Op->isNullValue() && AT.getNullPointerValue(SrcAddr) == 0) {
auto DstAddr = CE->getType()->getPointerAddressSpace();
- return MCConstantExpr::create(AT->getNullPointerValue(DstAddr),
+ return MCConstantExpr::create(AT.getNullPointerValue(DstAddr),
OutContext);
}
}
+ return nullptr;
+}
+
+const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) {
+ if (const MCExpr *E = lowerAddrSpaceCast(TM, CV, OutContext))
+ return E;
return AsmPrinter::lowerConstant(CV);
}
@@ -200,7 +243,7 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
if (emitPseudoExpansionLowering(*OutStreamer, MI))
return;
- const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>();
+ const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>();
AMDGPUMCInstLower MCInstLowering(OutContext, STI, *this);
StringRef Err;
@@ -292,3 +335,47 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
}
}
+
+R600MCInstLower::R600MCInstLower(MCContext &Ctx, const R600Subtarget &ST,
+ const AsmPrinter &AP) :
+ AMDGPUMCInstLower(Ctx, ST, AP) { }
+
+void R600MCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
+ OutMI.setOpcode(MI->getOpcode());
+ for (const MachineOperand &MO : MI->explicit_operands()) {
+ MCOperand MCOp;
+ lowerOperand(MO, MCOp);
+ OutMI.addOperand(MCOp);
+ }
+}
+
+void R600AsmPrinter::EmitInstruction(const MachineInstr *MI) {
+ const R600Subtarget &STI = MF->getSubtarget<R600Subtarget>();
+ R600MCInstLower MCInstLowering(OutContext, STI, *this);
+
+ StringRef Err;
+ if (!STI.getInstrInfo()->verifyInstruction(*MI, Err)) {
+ LLVMContext &C = MI->getParent()->getParent()->getFunction().getContext();
+ C.emitError("Illegal instruction detected: " + Err);
+ MI->print(errs());
+ }
+
+ if (MI->isBundle()) {
+ const MachineBasicBlock *MBB = MI->getParent();
+ MachineBasicBlock::const_instr_iterator I = ++MI->getIterator();
+ while (I != MBB->instr_end() && I->isInsideBundle()) {
+ EmitInstruction(&*I);
+ ++I;
+ }
+ } else {
+ MCInst TmpInst;
+ MCInstLowering.lower(MI, TmpInst);
+ EmitToStreamer(*OutStreamer, TmpInst);
+ }
+}
+
+const MCExpr *R600AsmPrinter::lowerConstant(const Constant *CV) {
+ if (const MCExpr *E = lowerAddrSpaceCast(TM, CV, OutContext))
+ return E;
+ return AsmPrinter::lowerConstant(CV);
+}
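
For the addrspacecast handling factored out into lowerAddrSpaceCast above, a condensed standalone sketch (plain C++, with illustrative address-space numbers and an assumed all-ones null value for local and private, not values quoted from this patch) shows the shape of the decision: only a null source pointer whose address space encodes null as 0 is folded to the destination space's null constant.

    #include <cstdint>
    #include <optional>

    // Illustrative address-space numbering; the real values come from AMDGPUAS.
    enum AddrSpace : unsigned { FLAT = 0, GLOBAL = 1, LOCAL = 3, PRIVATE = 5 };

    // Hypothetical stand-in for AMDGPUTargetMachine::getNullPointerValue:
    // local/private are assumed to use an all-ones null pointer, others 0.
    static int64_t nullPointerValue(unsigned AS) {
      return (AS == LOCAL || AS == PRIVATE) ? -1 : 0;
    }

    // Mirrors the control flow of lowerAddrSpaceCast: fold only a null source
    // whose address space encodes null as 0; otherwise defer to the generic
    // AsmPrinter lowering.
    static std::optional<int64_t> lowerNullAddrSpaceCast(bool SrcIsNull,
                                                         unsigned SrcAS,
                                                         unsigned DstAS) {
      if (SrcIsNull && nullPointerValue(SrcAS) == 0)
        return nullPointerValue(DstAS);
      return std::nullopt;
    }

    int main() {
      // Casting a flat null pointer into LDS yields the LDS null value (-1 here).
      return lowerNullAddrSpaceCast(true, FLAT, LOCAL).value_or(0) == -1 ? 0 : 1;
    }
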
diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/lib/Target/AMDGPU/AMDGPUMCInstLower.h
deleted file mode 100644
index 57d2d85daecd..000000000000
--- a/lib/Target/AMDGPU/AMDGPUMCInstLower.h
+++ /dev/null
@@ -1,46 +0,0 @@
-//===- AMDGPUMCInstLower.h MachineInstr Lowering Interface ------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMCINSTLOWER_H
-#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMCINSTLOWER_H
-
-namespace llvm {
-
-class AMDGPUSubtarget;
-class AsmPrinter;
-class MachineBasicBlock;
-class MachineInstr;
-class MachineOperand;
-class MCContext;
-class MCExpr;
-class MCInst;
-class MCOperand;
-
-class AMDGPUMCInstLower {
- MCContext &Ctx;
- const AMDGPUSubtarget &ST;
- const AsmPrinter &AP;
-
- const MCExpr *getLongBranchBlockExpr(const MachineBasicBlock &SrcBB,
- const MachineOperand &MO) const;
-
-public:
- AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST,
- const AsmPrinter &AP);
-
- bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
-
- /// \brief Lower a MachineInstr to an MCInst
- void lower(const MachineInstr *MI, MCInst &OutMI) const;
-
-};
-
-} // End namespace llvm
-
-#endif
diff --git a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
index 20918233e447..6f44e2dbb2d5 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
@@ -31,6 +31,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Compiler.h"
@@ -658,7 +659,7 @@ RegionMRT *MRT::buildMRT(MachineFunction &MF,
continue;
}
- DEBUG(dbgs() << "Visiting " << printMBBReference(*MBB) << "\n");
+ LLVM_DEBUG(dbgs() << "Visiting " << printMBBReference(*MBB) << "\n");
MBBMRT *NewMBB = new MBBMRT(MBB);
MachineRegion *Region = RegionInfo->getRegionFor(MBB);
@@ -695,18 +696,19 @@ void LinearizedRegion::storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg,
const TargetRegisterInfo *TRI,
PHILinearize &PHIInfo) {
if (TRI->isVirtualRegister(Reg)) {
- DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI)
+ << "\n");
// If this is a source register to a PHI we are chaining, it
// must be live out.
if (PHIInfo.isSource(Reg)) {
- DEBUG(dbgs() << "Add LiveOut (PHI): " << printReg(Reg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "Add LiveOut (PHI): " << printReg(Reg, TRI) << "\n");
addLiveOut(Reg);
} else {
// If this is live out of the MBB
for (auto &UI : MRI->use_operands(Reg)) {
if (UI.getParent()->getParent() != MBB) {
- DEBUG(dbgs() << "Add LiveOut (MBB " << printMBBReference(*MBB)
- << "): " << printReg(Reg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "Add LiveOut (MBB " << printMBBReference(*MBB)
+ << "): " << printReg(Reg, TRI) << "\n");
addLiveOut(Reg);
} else {
// If the use is in the same MBB we have to make sure
@@ -717,8 +719,8 @@ void LinearizedRegion::storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg,
MIE = UseInstr->getParent()->instr_end();
MII != MIE; ++MII) {
if ((&(*MII)) == DefInstr) {
- DEBUG(dbgs() << "Add LiveOut (Loop): " << printReg(Reg, TRI)
- << "\n");
+ LLVM_DEBUG(dbgs() << "Add LiveOut (Loop): " << printReg(Reg, TRI)
+ << "\n");
addLiveOut(Reg);
}
}
@@ -734,11 +736,12 @@ void LinearizedRegion::storeLiveOutRegRegion(RegionMRT *Region, unsigned Reg,
const TargetRegisterInfo *TRI,
PHILinearize &PHIInfo) {
if (TRI->isVirtualRegister(Reg)) {
- DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI)
+ << "\n");
for (auto &UI : MRI->use_operands(Reg)) {
if (!Region->contains(UI.getParent()->getParent())) {
- DEBUG(dbgs() << "Add LiveOut (Region " << (void *)Region
- << "): " << printReg(Reg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "Add LiveOut (Region " << (void *)Region
+ << "): " << printReg(Reg, TRI) << "\n");
addLiveOut(Reg);
}
}
@@ -749,8 +752,8 @@ void LinearizedRegion::storeLiveOuts(MachineBasicBlock *MBB,
const MachineRegisterInfo *MRI,
const TargetRegisterInfo *TRI,
PHILinearize &PHIInfo) {
- DEBUG(dbgs() << "-Store Live Outs Begin (" << printMBBReference(*MBB)
- << ")-\n");
+ LLVM_DEBUG(dbgs() << "-Store Live Outs Begin (" << printMBBReference(*MBB)
+ << ")-\n");
for (auto &II : *MBB) {
for (auto &RI : II.defs()) {
storeLiveOutReg(MBB, RI.getReg(), RI.getParent(), MRI, TRI, PHIInfo);
@@ -774,9 +777,10 @@ void LinearizedRegion::storeLiveOuts(MachineBasicBlock *MBB,
for (int i = 0; i < numPreds; ++i) {
if (getPHIPred(PHI, i) == MBB) {
unsigned PHIReg = getPHISourceReg(PHI, i);
- DEBUG(dbgs() << "Add LiveOut (PhiSource " << printMBBReference(*MBB)
- << " -> " << printMBBReference(*(*SI))
- << "): " << printReg(PHIReg, TRI) << "\n");
+ LLVM_DEBUG(dbgs()
+ << "Add LiveOut (PhiSource " << printMBBReference(*MBB)
+ << " -> " << printMBBReference(*(*SI))
+ << "): " << printReg(PHIReg, TRI) << "\n");
addLiveOut(PHIReg);
}
}
@@ -784,7 +788,7 @@ void LinearizedRegion::storeLiveOuts(MachineBasicBlock *MBB,
}
}
- DEBUG(dbgs() << "-Store Live Outs Endn-\n");
+ LLVM_DEBUG(dbgs() << "-Store Live Outs Endn-\n");
}
void LinearizedRegion::storeMBBLiveOuts(MachineBasicBlock *MBB,
@@ -844,8 +848,8 @@ void LinearizedRegion::storeLiveOuts(RegionMRT *Region,
for (int i = 0; i < numPreds; ++i) {
if (Region->contains(getPHIPred(PHI, i))) {
unsigned PHIReg = getPHISourceReg(PHI, i);
- DEBUG(dbgs() << "Add Region LiveOut (" << (void *)Region
- << "): " << printReg(PHIReg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "Add Region LiveOut (" << (void *)Region
+ << "): " << printReg(PHIReg, TRI) << "\n");
addLiveOut(PHIReg);
}
}
@@ -909,20 +913,21 @@ void LinearizedRegion::replaceRegister(unsigned Register, unsigned NewRegister,
bool IncludeLoopPHI) {
assert(Register != NewRegister && "Cannot replace a reg with itself");
- DEBUG(dbgs() << "Pepareing to replace register (region): "
- << printReg(Register, MRI->getTargetRegisterInfo()) << " with "
- << printReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n");
+ LLVM_DEBUG(
+ dbgs() << "Pepareing to replace register (region): "
+ << printReg(Register, MRI->getTargetRegisterInfo()) << " with "
+ << printReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n");
// If we are replacing outside, we also need to update the LiveOuts
if (ReplaceOutside &&
(isLiveOut(Register) || this->getParent()->isLiveOut(Register))) {
LinearizedRegion *Current = this;
while (Current != nullptr && Current->getEntry() != nullptr) {
- DEBUG(dbgs() << "Region before register replace\n");
- DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo()));
+ LLVM_DEBUG(dbgs() << "Region before register replace\n");
+ LLVM_DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo()));
Current->replaceLiveOut(Register, NewRegister);
- DEBUG(dbgs() << "Region after register replace\n");
- DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo()));
+ LLVM_DEBUG(dbgs() << "Region after register replace\n");
+ LLVM_DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo()));
Current = Current->getParent();
}
}
@@ -946,16 +951,16 @@ void LinearizedRegion::replaceRegister(unsigned Register, unsigned NewRegister,
if (ShouldReplace) {
if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) {
- DEBUG(dbgs() << "Trying to substitute physical register: "
- << printReg(NewRegister, MRI->getTargetRegisterInfo())
- << "\n");
+ LLVM_DEBUG(dbgs() << "Trying to substitute physical register: "
+ << printReg(NewRegister, MRI->getTargetRegisterInfo())
+ << "\n");
llvm_unreachable("Cannot substitute physical registers");
} else {
- DEBUG(dbgs() << "Replacing register (region): "
- << printReg(Register, MRI->getTargetRegisterInfo())
- << " with "
- << printReg(NewRegister, MRI->getTargetRegisterInfo())
- << "\n");
+ LLVM_DEBUG(dbgs() << "Replacing register (region): "
+ << printReg(Register, MRI->getTargetRegisterInfo())
+ << " with "
+ << printReg(NewRegister, MRI->getTargetRegisterInfo())
+ << "\n");
O.setReg(NewRegister);
}
}
@@ -1022,18 +1027,18 @@ void LinearizedRegion::removeFalseRegisterKills(MachineRegisterInfo *MRI) {
if (hasNoDef(Reg, MRI))
continue;
if (!MRI->hasOneDef(Reg)) {
- DEBUG(this->getEntry()->getParent()->dump());
- DEBUG(dbgs() << printReg(Reg, TRI) << "\n");
+ LLVM_DEBUG(this->getEntry()->getParent()->dump());
+ LLVM_DEBUG(dbgs() << printReg(Reg, TRI) << "\n");
}
if (MRI->def_begin(Reg) == MRI->def_end()) {
- DEBUG(dbgs() << "Register "
- << printReg(Reg, MRI->getTargetRegisterInfo())
- << " has NO defs\n");
+ LLVM_DEBUG(dbgs() << "Register "
+ << printReg(Reg, MRI->getTargetRegisterInfo())
+ << " has NO defs\n");
} else if (!MRI->hasOneDef(Reg)) {
- DEBUG(dbgs() << "Register "
- << printReg(Reg, MRI->getTargetRegisterInfo())
- << " has multiple defs\n");
+ LLVM_DEBUG(dbgs() << "Register "
+ << printReg(Reg, MRI->getTargetRegisterInfo())
+ << " has multiple defs\n");
}
assert(MRI->hasOneDef(Reg) && "Register has multiple definitions");
@@ -1041,8 +1046,8 @@ void LinearizedRegion::removeFalseRegisterKills(MachineRegisterInfo *MRI) {
MachineOperand *UseOperand = &(RI);
bool UseIsOutsideDefMBB = Def->getParent()->getParent() != MBB;
if (UseIsOutsideDefMBB && UseOperand->isKill()) {
- DEBUG(dbgs() << "Removing kill flag on register: "
- << printReg(Reg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "Removing kill flag on register: "
+ << printReg(Reg, TRI) << "\n");
UseOperand->setIsKill(false);
}
}
@@ -1415,8 +1420,8 @@ void AMDGPUMachineCFGStructurizer::extractKilledPHIs(MachineBasicBlock *MBB) {
MachineInstr &Instr = *I;
if (Instr.isPHI()) {
unsigned PHIDestReg = getPHIDestReg(Instr);
- DEBUG(dbgs() << "Extractking killed phi:\n");
- DEBUG(Instr.dump());
+ LLVM_DEBUG(dbgs() << "Extractking killed phi:\n");
+ LLVM_DEBUG(Instr.dump());
PHIs.insert(&Instr);
PHIInfo.addDest(PHIDestReg, Instr.getDebugLoc());
storePHILinearizationInfoDest(PHIDestReg, Instr);
@@ -1448,9 +1453,10 @@ bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI,
MachineBasicBlock *SourceMBB,
SmallVector<unsigned, 2> &PHIIndices,
unsigned *ReplaceReg) {
- DEBUG(dbgs() << "Shrink PHI: ");
- DEBUG(PHI.dump());
- DEBUG(dbgs() << " to " << printReg(getPHIDestReg(PHI), TRI) << " = PHI(");
+ LLVM_DEBUG(dbgs() << "Shrink PHI: ");
+ LLVM_DEBUG(PHI.dump());
+ LLVM_DEBUG(dbgs() << " to " << printReg(getPHIDestReg(PHI), TRI)
+ << " = PHI(");
bool Replaced = false;
unsigned NumInputs = getPHINumInputs(PHI);
@@ -1480,8 +1486,8 @@ bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI,
if (SourceMBB) {
MIB.addReg(CombinedSourceReg);
MIB.addMBB(SourceMBB);
- DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", "
- << printMBBReference(*SourceMBB));
+ LLVM_DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", "
+ << printMBBReference(*SourceMBB));
}
for (unsigned i = 0; i < NumInputs; ++i) {
@@ -1492,10 +1498,10 @@ bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI,
MachineBasicBlock *SourcePred = getPHIPred(PHI, i);
MIB.addReg(SourceReg);
MIB.addMBB(SourcePred);
- DEBUG(dbgs() << printReg(SourceReg, TRI) << ", "
- << printMBBReference(*SourcePred));
+ LLVM_DEBUG(dbgs() << printReg(SourceReg, TRI) << ", "
+ << printMBBReference(*SourcePred));
}
- DEBUG(dbgs() << ")\n");
+ LLVM_DEBUG(dbgs() << ")\n");
}
PHI.eraseFromParent();
return Replaced;
@@ -1504,9 +1510,10 @@ bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI,
void AMDGPUMachineCFGStructurizer::replacePHI(
MachineInstr &PHI, unsigned CombinedSourceReg, MachineBasicBlock *LastMerge,
SmallVector<unsigned, 2> &PHIRegionIndices) {
- DEBUG(dbgs() << "Replace PHI: ");
- DEBUG(PHI.dump());
- DEBUG(dbgs() << " with " << printReg(getPHIDestReg(PHI), TRI) << " = PHI(");
+ LLVM_DEBUG(dbgs() << "Replace PHI: ");
+ LLVM_DEBUG(PHI.dump());
+ LLVM_DEBUG(dbgs() << " with " << printReg(getPHIDestReg(PHI), TRI)
+ << " = PHI(");
bool HasExternalEdge = false;
unsigned NumInputs = getPHINumInputs(PHI);
@@ -1523,8 +1530,8 @@ void AMDGPUMachineCFGStructurizer::replacePHI(
getPHIDestReg(PHI));
MIB.addReg(CombinedSourceReg);
MIB.addMBB(LastMerge);
- DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", "
- << printMBBReference(*LastMerge));
+ LLVM_DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", "
+ << printMBBReference(*LastMerge));
for (unsigned i = 0; i < NumInputs; ++i) {
if (isPHIRegionIndex(PHIRegionIndices, i)) {
continue;
@@ -1533,10 +1540,10 @@ void AMDGPUMachineCFGStructurizer::replacePHI(
MachineBasicBlock *SourcePred = getPHIPred(PHI, i);
MIB.addReg(SourceReg);
MIB.addMBB(SourcePred);
- DEBUG(dbgs() << printReg(SourceReg, TRI) << ", "
- << printMBBReference(*SourcePred));
+ LLVM_DEBUG(dbgs() << printReg(SourceReg, TRI) << ", "
+ << printMBBReference(*SourcePred));
}
- DEBUG(dbgs() << ")\n");
+ LLVM_DEBUG(dbgs() << ")\n");
} else {
replaceRegisterWith(getPHIDestReg(PHI), CombinedSourceReg);
}
@@ -1546,9 +1553,9 @@ void AMDGPUMachineCFGStructurizer::replacePHI(
void AMDGPUMachineCFGStructurizer::replaceEntryPHI(
MachineInstr &PHI, unsigned CombinedSourceReg, MachineBasicBlock *IfMBB,
SmallVector<unsigned, 2> &PHIRegionIndices) {
- DEBUG(dbgs() << "Replace entry PHI: ");
- DEBUG(PHI.dump());
- DEBUG(dbgs() << " with ");
+ LLVM_DEBUG(dbgs() << "Replace entry PHI: ");
+ LLVM_DEBUG(PHI.dump());
+ LLVM_DEBUG(dbgs() << " with ");
unsigned NumInputs = getPHINumInputs(PHI);
unsigned NumNonRegionInputs = NumInputs;
@@ -1561,18 +1568,19 @@ void AMDGPUMachineCFGStructurizer::replaceEntryPHI(
if (NumNonRegionInputs == 0) {
auto DestReg = getPHIDestReg(PHI);
replaceRegisterWith(DestReg, CombinedSourceReg);
- DEBUG(dbgs() << " register " << printReg(CombinedSourceReg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << " register " << printReg(CombinedSourceReg, TRI)
+ << "\n");
PHI.eraseFromParent();
} else {
- DEBUG(dbgs() << printReg(getPHIDestReg(PHI), TRI) << " = PHI(");
+ LLVM_DEBUG(dbgs() << printReg(getPHIDestReg(PHI), TRI) << " = PHI(");
MachineBasicBlock *MBB = PHI.getParent();
MachineInstrBuilder MIB =
BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI),
getPHIDestReg(PHI));
MIB.addReg(CombinedSourceReg);
MIB.addMBB(IfMBB);
- DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", "
- << printMBBReference(*IfMBB));
+ LLVM_DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", "
+ << printMBBReference(*IfMBB));
unsigned NumInputs = getPHINumInputs(PHI);
for (unsigned i = 0; i < NumInputs; ++i) {
if (isPHIRegionIndex(PHIRegionIndices, i)) {
@@ -1582,10 +1590,10 @@ void AMDGPUMachineCFGStructurizer::replaceEntryPHI(
MachineBasicBlock *SourcePred = getPHIPred(PHI, i);
MIB.addReg(SourceReg);
MIB.addMBB(SourcePred);
- DEBUG(dbgs() << printReg(SourceReg, TRI) << ", "
- << printMBBReference(*SourcePred));
+ LLVM_DEBUG(dbgs() << printReg(SourceReg, TRI) << ", "
+ << printMBBReference(*SourcePred));
}
- DEBUG(dbgs() << ")\n");
+ LLVM_DEBUG(dbgs() << ")\n");
PHI.eraseFromParent();
}
}
@@ -1607,8 +1615,9 @@ void AMDGPUMachineCFGStructurizer::replaceLiveOutRegs(
}
}
- DEBUG(dbgs() << "Register " << printReg(Reg, TRI) << " is "
- << (IsDead ? "dead" : "alive") << " after PHI replace\n");
+ LLVM_DEBUG(dbgs() << "Register " << printReg(Reg, TRI) << " is "
+ << (IsDead ? "dead" : "alive")
+ << " after PHI replace\n");
if (IsDead) {
LRegion->removeLiveOut(Reg);
}
@@ -1682,8 +1691,8 @@ void AMDGPUMachineCFGStructurizer::rewriteRegionEntryPHIs(LinearizedRegion *Regi
void AMDGPUMachineCFGStructurizer::insertUnconditionalBranch(MachineBasicBlock *MBB,
MachineBasicBlock *Dest,
const DebugLoc &DL) {
- DEBUG(dbgs() << "Inserting unconditional branch: " << MBB->getNumber()
- << " -> " << Dest->getNumber() << "\n");
+ LLVM_DEBUG(dbgs() << "Inserting unconditional branch: " << MBB->getNumber()
+ << " -> " << Dest->getNumber() << "\n");
MachineBasicBlock::instr_iterator Terminator = MBB->getFirstInstrTerminator();
bool HasTerminator = Terminator != MBB->instr_end();
if (HasTerminator) {
@@ -1732,7 +1741,8 @@ AMDGPUMachineCFGStructurizer::createLinearizedExitBlock(RegionMRT *Region) {
MF->insert(ExitIter, LastMerge);
LastMerge->addSuccessor(Exit);
insertUnconditionalBranch(LastMerge, Exit);
- DEBUG(dbgs() << "Created exit block: " << LastMerge->getNumber() << "\n");
+ LLVM_DEBUG(dbgs() << "Created exit block: " << LastMerge->getNumber()
+ << "\n");
}
return LastMerge;
}
@@ -1748,11 +1758,12 @@ void AMDGPUMachineCFGStructurizer::insertMergePHI(MachineBasicBlock *IfBB,
if (MergeBB->succ_begin() == MergeBB->succ_end()) {
return;
}
- DEBUG(dbgs() << "Merge PHI (" << printMBBReference(*MergeBB)
- << "): " << printReg(DestRegister, TRI) << " = PHI("
- << printReg(IfSourceRegister, TRI) << ", "
- << printMBBReference(*IfBB) << printReg(CodeSourceRegister, TRI)
- << ", " << printMBBReference(*CodeBB) << ")\n");
+ LLVM_DEBUG(dbgs() << "Merge PHI (" << printMBBReference(*MergeBB)
+ << "): " << printReg(DestRegister, TRI) << " = PHI("
+ << printReg(IfSourceRegister, TRI) << ", "
+ << printMBBReference(*IfBB)
+ << printReg(CodeSourceRegister, TRI) << ", "
+ << printMBBReference(*CodeBB) << ")\n");
const DebugLoc &DL = MergeBB->findDebugLoc(MergeBB->begin());
MachineInstrBuilder MIB = BuildMI(*MergeBB, MergeBB->instr_begin(), DL,
TII->get(TargetOpcode::PHI), DestRegister);
@@ -1810,8 +1821,8 @@ static void removeExternalCFGEdges(MachineBasicBlock *StartMBB,
for (auto SI : Succs) {
std::pair<MachineBasicBlock *, MachineBasicBlock *> Edge = SI;
- DEBUG(dbgs() << "Removing edge: " << printMBBReference(*Edge.first)
- << " -> " << printMBBReference(*Edge.second) << "\n");
+ LLVM_DEBUG(dbgs() << "Removing edge: " << printMBBReference(*Edge.first)
+ << " -> " << printMBBReference(*Edge.second) << "\n");
Edge.first->removeSuccessor(Edge.second);
}
}
@@ -1844,13 +1855,13 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfBlock(
IfBB->addSuccessor(MergeBB);
IfBB->addSuccessor(CodeBBStart);
- DEBUG(dbgs() << "Created If block: " << IfBB->getNumber() << "\n");
+ LLVM_DEBUG(dbgs() << "Created If block: " << IfBB->getNumber() << "\n");
// Ensure that the MergeBB is a successor of the CodeEndBB.
if (!CodeBBEnd->isSuccessor(MergeBB))
CodeBBEnd->addSuccessor(MergeBB);
- DEBUG(dbgs() << "Moved " << printMBBReference(*CodeBBStart) << " through "
- << printMBBReference(*CodeBBEnd) << "\n");
+ LLVM_DEBUG(dbgs() << "Moved " << printMBBReference(*CodeBBStart)
+ << " through " << printMBBReference(*CodeBBEnd) << "\n");
// If we have a single predecessor we can find a reasonable debug location
MachineBasicBlock *SinglePred =
@@ -1935,16 +1946,18 @@ void AMDGPUMachineCFGStructurizer::rewriteCodeBBTerminator(MachineBasicBlock *Co
MachineInstr *AMDGPUMachineCFGStructurizer::getDefInstr(unsigned Reg) {
if (MRI->def_begin(Reg) == MRI->def_end()) {
- DEBUG(dbgs() << "Register " << printReg(Reg, MRI->getTargetRegisterInfo())
- << " has NO defs\n");
+ LLVM_DEBUG(dbgs() << "Register "
+ << printReg(Reg, MRI->getTargetRegisterInfo())
+ << " has NO defs\n");
} else if (!MRI->hasOneDef(Reg)) {
- DEBUG(dbgs() << "Register " << printReg(Reg, MRI->getTargetRegisterInfo())
- << " has multiple defs\n");
- DEBUG(dbgs() << "DEFS BEGIN:\n");
+ LLVM_DEBUG(dbgs() << "Register "
+ << printReg(Reg, MRI->getTargetRegisterInfo())
+ << " has multiple defs\n");
+ LLVM_DEBUG(dbgs() << "DEFS BEGIN:\n");
for (auto DI = MRI->def_begin(Reg), DE = MRI->def_end(); DI != DE; ++DI) {
- DEBUG(DI->getParent()->dump());
+ LLVM_DEBUG(DI->getParent()->dump());
}
- DEBUG(dbgs() << "DEFS END\n");
+ LLVM_DEBUG(dbgs() << "DEFS END\n");
}
assert(MRI->hasOneDef(Reg) && "Register has multiple definitions");
@@ -1986,7 +1999,7 @@ void AMDGPUMachineCFGStructurizer::insertChainedPHI(MachineBasicBlock *IfBB,
const TargetRegisterClass *RegClass = MRI->getRegClass(DestReg);
unsigned NextDestReg = MRI->createVirtualRegister(RegClass);
bool IsLastDef = PHIInfo.getNumSources(DestReg) == 1;
- DEBUG(dbgs() << "Insert Chained PHI\n");
+ LLVM_DEBUG(dbgs() << "Insert Chained PHI\n");
insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, DestReg, NextDestReg,
SourceReg, IsLastDef);
@@ -2022,16 +2035,16 @@ void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB,
}
for (auto LI : OldLiveOuts) {
- DEBUG(dbgs() << "LiveOut: " << printReg(LI, TRI));
+ LLVM_DEBUG(dbgs() << "LiveOut: " << printReg(LI, TRI));
if (!containsDef(CodeBB, InnerRegion, LI) ||
(!IsSingleBB && (getDefInstr(LI)->getParent() == LRegion->getExit()))) {
// If the register simply lives through the CodeBB, we don't have
// to rewrite anything since the register is not defined in this
// part of the code.
- DEBUG(dbgs() << "- through");
+ LLVM_DEBUG(dbgs() << "- through");
continue;
}
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "\n");
unsigned Reg = LI;
if (/*!PHIInfo.isSource(Reg) &&*/ Reg != InnerRegion->getBBSelectRegOut()) {
// If the register is live out, we do want to create a phi,
@@ -2048,12 +2061,12 @@ void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB,
unsigned IfSourceReg = MRI->createVirtualRegister(RegClass);
// Create initializer, this value is never used, but is needed
// to satisfy SSA.
- DEBUG(dbgs() << "Initializer for reg: " << printReg(Reg) << "\n");
+ LLVM_DEBUG(dbgs() << "Initializer for reg: " << printReg(Reg) << "\n");
TII->materializeImmediate(*IfBB, IfBB->getFirstTerminator(), DebugLoc(),
IfSourceReg, 0);
InnerRegion->replaceRegisterOutsideRegion(Reg, PHIDestReg, true, MRI);
- DEBUG(dbgs() << "Insert Non-Chained Live out PHI\n");
+ LLVM_DEBUG(dbgs() << "Insert Non-Chained Live out PHI\n");
insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, PHIDestReg,
IfSourceReg, Reg, true);
}
@@ -2063,22 +2076,22 @@ void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB,
// is a source block for a definition.
SmallVector<unsigned, 4> Sources;
if (PHIInfo.findSourcesFromMBB(CodeBB, Sources)) {
- DEBUG(dbgs() << "Inserting PHI Live Out from " << printMBBReference(*CodeBB)
- << "\n");
+ LLVM_DEBUG(dbgs() << "Inserting PHI Live Out from "
+ << printMBBReference(*CodeBB) << "\n");
for (auto SI : Sources) {
unsigned DestReg;
PHIInfo.findDest(SI, CodeBB, DestReg);
insertChainedPHI(IfBB, CodeBB, MergeBB, InnerRegion, DestReg, SI);
}
- DEBUG(dbgs() << "Insertion done.\n");
+ LLVM_DEBUG(dbgs() << "Insertion done.\n");
}
- DEBUG(PHIInfo.dump(MRI));
+ LLVM_DEBUG(PHIInfo.dump(MRI));
}
void AMDGPUMachineCFGStructurizer::prunePHIInfo(MachineBasicBlock *MBB) {
- DEBUG(dbgs() << "Before PHI Prune\n");
- DEBUG(PHIInfo.dump(MRI));
+ LLVM_DEBUG(dbgs() << "Before PHI Prune\n");
+ LLVM_DEBUG(PHIInfo.dump(MRI));
SmallVector<std::tuple<unsigned, unsigned, MachineBasicBlock *>, 4>
ElimiatedSources;
for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE;
@@ -2118,8 +2131,8 @@ void AMDGPUMachineCFGStructurizer::prunePHIInfo(MachineBasicBlock *MBB) {
PHIInfo.removeSource(std::get<0>(SourceInfo), std::get<1>(SourceInfo),
std::get<2>(SourceInfo));
}
- DEBUG(dbgs() << "After PHI Prune\n");
- DEBUG(PHIInfo.dump(MRI));
+ LLVM_DEBUG(dbgs() << "After PHI Prune\n");
+ LLVM_DEBUG(PHIInfo.dump(MRI));
}
void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegion,
@@ -2127,8 +2140,8 @@ void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegio
MachineBasicBlock *Entry = CurrentRegion->getEntry();
MachineBasicBlock *Exit = CurrentRegion->getExit();
- DEBUG(dbgs() << "RegionExit: " << Exit->getNumber()
- << " Pred: " << (*(Entry->pred_begin()))->getNumber() << "\n");
+ LLVM_DEBUG(dbgs() << "RegionExit: " << Exit->getNumber() << " Pred: "
+ << (*(Entry->pred_begin()))->getNumber() << "\n");
int NumSources = 0;
auto SE = PHIInfo.sources_end(DestReg);
@@ -2145,7 +2158,7 @@ void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegio
const DebugLoc &DL = Entry->findDebugLoc(Entry->begin());
MachineInstrBuilder MIB = BuildMI(*Entry, Entry->instr_begin(), DL,
TII->get(TargetOpcode::PHI), DestReg);
- DEBUG(dbgs() << "Entry PHI " << printReg(DestReg, TRI) << " = PHI(");
+ LLVM_DEBUG(dbgs() << "Entry PHI " << printReg(DestReg, TRI) << " = PHI(");
unsigned CurrentBackedgeReg = 0;
@@ -2169,19 +2182,19 @@ void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegio
BackedgePHI.addReg(getPHISourceReg(*PHIDefInstr, 1));
BackedgePHI.addMBB((*SRI).second);
CurrentBackedgeReg = NewBackedgeReg;
- DEBUG(dbgs() << "Inserting backedge PHI: "
- << printReg(NewBackedgeReg, TRI) << " = PHI("
- << printReg(CurrentBackedgeReg, TRI) << ", "
- << printMBBReference(*getPHIPred(*PHIDefInstr, 0))
- << ", "
- << printReg(getPHISourceReg(*PHIDefInstr, 1), TRI)
- << ", " << printMBBReference(*(*SRI).second));
+ LLVM_DEBUG(dbgs()
+ << "Inserting backedge PHI: "
+ << printReg(NewBackedgeReg, TRI) << " = PHI("
+ << printReg(CurrentBackedgeReg, TRI) << ", "
+ << printMBBReference(*getPHIPred(*PHIDefInstr, 0)) << ", "
+ << printReg(getPHISourceReg(*PHIDefInstr, 1), TRI) << ", "
+ << printMBBReference(*(*SRI).second));
}
} else {
MIB.addReg(SourceReg);
MIB.addMBB((*SRI).second);
- DEBUG(dbgs() << printReg(SourceReg, TRI) << ", "
- << printMBBReference(*(*SRI).second) << ", ");
+ LLVM_DEBUG(dbgs() << printReg(SourceReg, TRI) << ", "
+ << printMBBReference(*(*SRI).second) << ", ");
}
}
@@ -2189,16 +2202,16 @@ void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegio
if (CurrentBackedgeReg != 0) {
MIB.addReg(CurrentBackedgeReg);
MIB.addMBB(Exit);
- DEBUG(dbgs() << printReg(CurrentBackedgeReg, TRI) << ", "
- << printMBBReference(*Exit) << ")\n");
+ LLVM_DEBUG(dbgs() << printReg(CurrentBackedgeReg, TRI) << ", "
+ << printMBBReference(*Exit) << ")\n");
} else {
- DEBUG(dbgs() << ")\n");
+ LLVM_DEBUG(dbgs() << ")\n");
}
}
}
void AMDGPUMachineCFGStructurizer::createEntryPHIs(LinearizedRegion *CurrentRegion) {
- DEBUG(PHIInfo.dump(MRI));
+ LLVM_DEBUG(PHIInfo.dump(MRI));
for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE;
++DRI) {
@@ -2219,19 +2232,19 @@ void AMDGPUMachineCFGStructurizer::replaceRegisterWith(unsigned Register,
MachineOperand &O = *I;
++I;
if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) {
- DEBUG(dbgs() << "Trying to substitute physical register: "
- << printReg(NewRegister, MRI->getTargetRegisterInfo())
- << "\n");
+ LLVM_DEBUG(dbgs() << "Trying to substitute physical register: "
+ << printReg(NewRegister, MRI->getTargetRegisterInfo())
+ << "\n");
llvm_unreachable("Cannot substitute physical registers");
// We don't handle physical registers, but if we need to
// in the future This is how we do it:
// O.substPhysReg(NewRegister, *TRI);
} else {
- DEBUG(dbgs() << "Replacing register: "
- << printReg(Register, MRI->getTargetRegisterInfo())
- << " with "
- << printReg(NewRegister, MRI->getTargetRegisterInfo())
- << "\n");
+ LLVM_DEBUG(dbgs() << "Replacing register: "
+ << printReg(Register, MRI->getTargetRegisterInfo())
+ << " with "
+ << printReg(NewRegister, MRI->getTargetRegisterInfo())
+ << "\n");
O.setReg(NewRegister);
}
}
@@ -2239,20 +2252,20 @@ void AMDGPUMachineCFGStructurizer::replaceRegisterWith(unsigned Register,
getRegionMRT()->replaceLiveOutReg(Register, NewRegister);
- DEBUG(PHIInfo.dump(MRI));
+ LLVM_DEBUG(PHIInfo.dump(MRI));
}
void AMDGPUMachineCFGStructurizer::resolvePHIInfos(MachineBasicBlock *FunctionEntry) {
- DEBUG(dbgs() << "Resolve PHI Infos\n");
- DEBUG(PHIInfo.dump(MRI));
+ LLVM_DEBUG(dbgs() << "Resolve PHI Infos\n");
+ LLVM_DEBUG(PHIInfo.dump(MRI));
for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE;
++DRI) {
unsigned DestReg = *DRI;
- DEBUG(dbgs() << "DestReg: " << printReg(DestReg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "DestReg: " << printReg(DestReg, TRI) << "\n");
auto SRI = PHIInfo.sources_begin(DestReg);
unsigned SourceReg = (*SRI).first;
- DEBUG(dbgs() << "DestReg: " << printReg(DestReg, TRI)
- << " SourceReg: " << printReg(SourceReg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "DestReg: " << printReg(DestReg, TRI)
+ << " SourceReg: " << printReg(SourceReg, TRI) << "\n");
assert(PHIInfo.sources_end(DestReg) == ++SRI &&
"More than one phi source in entry node");
@@ -2326,9 +2339,9 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion(
MachineOperand RegOp =
MachineOperand::CreateReg(Reg, false, false, true);
ArrayRef<MachineOperand> Cond(RegOp);
- DEBUG(dbgs() << "RegionExitReg: ");
- DEBUG(Cond[0].print(dbgs(), TRI));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "RegionExitReg: ");
+ LLVM_DEBUG(Cond[0].print(dbgs(), TRI));
+ LLVM_DEBUG(dbgs() << "\n");
TII->insertBranch(*RegionExit, CurrentRegion->getEntry(), RegionExit,
Cond, DebugLoc());
RegionExit->addSuccessor(CurrentRegion->getEntry());
@@ -2338,12 +2351,12 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion(
LinearizedRegion InnerRegion(CodeBB, MRI, TRI, PHIInfo);
InnerRegion.setParent(CurrentRegion);
- DEBUG(dbgs() << "Insert BB Select PHI (BB)\n");
+ LLVM_DEBUG(dbgs() << "Insert BB Select PHI (BB)\n");
insertMergePHI(IfBB, CodeBB, MergeBB, BBSelectRegOut, BBSelectRegIn,
CodeBBSelectReg);
InnerRegion.addMBB(MergeBB);
- DEBUG(InnerRegion.print(dbgs(), TRI));
+ LLVM_DEBUG(InnerRegion.print(dbgs(), TRI));
rewriteLiveOutRegs(IfBB, CodeBB, MergeBB, &InnerRegion, CurrentRegion);
extractKilledPHIs(CodeBB);
if (IsRegionEntryBB) {
@@ -2384,16 +2397,16 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion(
CurrentRegion->getRegionMRT()->getEntry()->getNumber());
MachineOperand RegOp = MachineOperand::CreateReg(Reg, false, false, true);
ArrayRef<MachineOperand> Cond(RegOp);
- DEBUG(dbgs() << "RegionExitReg: ");
- DEBUG(Cond[0].print(dbgs(), TRI));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "RegionExitReg: ");
+ LLVM_DEBUG(Cond[0].print(dbgs(), TRI));
+ LLVM_DEBUG(dbgs() << "\n");
TII->insertBranch(*RegionExit, CurrentRegion->getEntry(), RegionExit,
Cond, DebugLoc());
RegionExit->addSuccessor(IfBB);
}
}
CurrentRegion->addMBBs(InnerRegion);
- DEBUG(dbgs() << "Insert BB Select PHI (region)\n");
+ LLVM_DEBUG(dbgs() << "Insert BB Select PHI (region)\n");
insertMergePHI(IfBB, CodeExitBB, MergeBB, BBSelectRegOut, BBSelectRegIn,
CodeBBSelectReg);
@@ -2439,15 +2452,16 @@ void AMDGPUMachineCFGStructurizer::splitLoopPHI(MachineInstr &PHI,
MachineInstrBuilder MIB =
BuildMI(*EntrySucc, EntrySucc->instr_begin(), PHI.getDebugLoc(),
TII->get(TargetOpcode::PHI), NewDestReg);
- DEBUG(dbgs() << "Split Entry PHI " << printReg(NewDestReg, TRI) << " = PHI(");
+ LLVM_DEBUG(dbgs() << "Split Entry PHI " << printReg(NewDestReg, TRI)
+ << " = PHI(");
MIB.addReg(PHISource);
MIB.addMBB(Entry);
- DEBUG(dbgs() << printReg(PHISource, TRI) << ", "
- << printMBBReference(*Entry));
+ LLVM_DEBUG(dbgs() << printReg(PHISource, TRI) << ", "
+ << printMBBReference(*Entry));
MIB.addReg(RegionSourceReg);
MIB.addMBB(RegionSourceMBB);
- DEBUG(dbgs() << " ," << printReg(RegionSourceReg, TRI) << ", "
- << printMBBReference(*RegionSourceMBB) << ")\n");
+ LLVM_DEBUG(dbgs() << " ," << printReg(RegionSourceReg, TRI) << ", "
+ << printMBBReference(*RegionSourceMBB) << ")\n");
}
void AMDGPUMachineCFGStructurizer::splitLoopPHIs(MachineBasicBlock *Entry,
@@ -2480,7 +2494,8 @@ AMDGPUMachineCFGStructurizer::splitExit(LinearizedRegion *LRegion) {
LRegion->addMBB(NewExit);
LRegion->setExit(NewExit);
- DEBUG(dbgs() << "Created new exit block: " << NewExit->getNumber() << "\n");
+ LLVM_DEBUG(dbgs() << "Created new exit block: " << NewExit->getNumber()
+ << "\n");
// Replace any PHI Predecessors in the successor with NewExit
for (auto &II : *Succ) {
@@ -2528,9 +2543,9 @@ AMDGPUMachineCFGStructurizer::splitEntry(LinearizedRegion *LRegion) {
MachineBasicBlock *EntrySucc = split(Entry->getFirstNonPHI());
MachineBasicBlock *Exit = LRegion->getExit();
- DEBUG(dbgs() << "Split " << printMBBReference(*Entry) << " to "
- << printMBBReference(*Entry) << " -> "
- << printMBBReference(*EntrySucc) << "\n");
+ LLVM_DEBUG(dbgs() << "Split " << printMBBReference(*Entry) << " to "
+ << printMBBReference(*Entry) << " -> "
+ << printMBBReference(*EntrySucc) << "\n");
LRegion->addMBB(EntrySucc);
// Make the backedge go to Entry Succ
@@ -2621,21 +2636,21 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
rewriteRegionExitPHIs(Region, LastMerge, LRegion);
removeOldExitPreds(Region);
- DEBUG(PHIInfo.dump(MRI));
+ LLVM_DEBUG(PHIInfo.dump(MRI));
SetVector<MRT *> *Children = Region->getChildren();
- DEBUG(dbgs() << "===========If Region Start===============\n");
+ LLVM_DEBUG(dbgs() << "===========If Region Start===============\n");
if (LRegion->getHasLoop()) {
- DEBUG(dbgs() << "Has Backedge: Yes\n");
+ LLVM_DEBUG(dbgs() << "Has Backedge: Yes\n");
} else {
- DEBUG(dbgs() << "Has Backedge: No\n");
+ LLVM_DEBUG(dbgs() << "Has Backedge: No\n");
}
unsigned BBSelectRegIn;
unsigned BBSelectRegOut;
for (auto CI = Children->begin(), CE = Children->end(); CI != CE; ++CI) {
- DEBUG(dbgs() << "CurrentRegion: \n");
- DEBUG(LRegion->print(dbgs(), TRI));
+ LLVM_DEBUG(dbgs() << "CurrentRegion: \n");
+ LLVM_DEBUG(LRegion->print(dbgs(), TRI));
auto CNI = CI;
++CNI;
@@ -2649,9 +2664,9 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
// We found the block is the exit of an inner region, we need
// to put it in the current linearized region.
- DEBUG(dbgs() << "Linearizing region: ");
- DEBUG(InnerLRegion->print(dbgs(), TRI));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Linearizing region: ");
+ LLVM_DEBUG(InnerLRegion->print(dbgs(), TRI));
+ LLVM_DEBUG(dbgs() << "\n");
MachineBasicBlock *InnerEntry = InnerLRegion->getEntry();
if ((&(*(InnerEntry->getParent()->begin()))) == InnerEntry) {
@@ -2669,10 +2684,10 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
BBSelectRegOut = Child->getBBSelectRegOut();
BBSelectRegIn = Child->getBBSelectRegIn();
- DEBUG(dbgs() << "BBSelectRegIn: " << printReg(BBSelectRegIn, TRI)
- << "\n");
- DEBUG(dbgs() << "BBSelectRegOut: " << printReg(BBSelectRegOut, TRI)
- << "\n");
+ LLVM_DEBUG(dbgs() << "BBSelectRegIn: " << printReg(BBSelectRegIn, TRI)
+ << "\n");
+ LLVM_DEBUG(dbgs() << "BBSelectRegOut: " << printReg(BBSelectRegOut, TRI)
+ << "\n");
MachineBasicBlock *IfEnd = CurrentMerge;
CurrentMerge = createIfRegion(CurrentMerge, InnerLRegion, LRegion,
@@ -2681,7 +2696,7 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
TII->convertNonUniformIfRegion(CurrentMerge, IfEnd);
} else {
MachineBasicBlock *MBB = Child->getMBBMRT()->getMBB();
- DEBUG(dbgs() << "Linearizing block: " << MBB->getNumber() << "\n");
+ LLVM_DEBUG(dbgs() << "Linearizing block: " << MBB->getNumber() << "\n");
if (MBB == getSingleExitNode(*(MBB->getParent()))) {
// If this is the exit block then we need to skip to the next.
@@ -2693,10 +2708,10 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
BBSelectRegOut = Child->getBBSelectRegOut();
BBSelectRegIn = Child->getBBSelectRegIn();
- DEBUG(dbgs() << "BBSelectRegIn: " << printReg(BBSelectRegIn, TRI)
- << "\n");
- DEBUG(dbgs() << "BBSelectRegOut: " << printReg(BBSelectRegOut, TRI)
- << "\n");
+ LLVM_DEBUG(dbgs() << "BBSelectRegIn: " << printReg(BBSelectRegIn, TRI)
+ << "\n");
+ LLVM_DEBUG(dbgs() << "BBSelectRegOut: " << printReg(BBSelectRegOut, TRI)
+ << "\n");
MachineBasicBlock *IfEnd = CurrentMerge;
// This is a basic block that is not part of an inner region, we
@@ -2707,7 +2722,7 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
TII->convertNonUniformIfRegion(CurrentMerge, IfEnd);
}
- DEBUG(PHIInfo.dump(MRI));
+ LLVM_DEBUG(PHIInfo.dump(MRI));
}
}
@@ -2728,7 +2743,7 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
NewInReg, Region->getEntry()->getNumber());
// Need to be careful about updating the registers inside the region.
LRegion->replaceRegisterInsideRegion(InReg, InnerSelectReg, false, MRI);
- DEBUG(dbgs() << "Loop BBSelect Merge PHI:\n");
+ LLVM_DEBUG(dbgs() << "Loop BBSelect Merge PHI:\n");
insertMergePHI(LRegion->getEntry(), LRegion->getExit(), NewSucc,
InnerSelectReg, NewInReg,
LRegion->getRegionMRT()->getInnerOutputRegister());
@@ -2740,11 +2755,11 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
TII->insertReturn(*LastMerge);
}
- DEBUG(Region->getEntry()->getParent()->dump());
- DEBUG(LRegion->print(dbgs(), TRI));
- DEBUG(PHIInfo.dump(MRI));
+ LLVM_DEBUG(Region->getEntry()->getParent()->dump());
+ LLVM_DEBUG(LRegion->print(dbgs(), TRI));
+ LLVM_DEBUG(PHIInfo.dump(MRI));
- DEBUG(dbgs() << "===========If Region End===============\n");
+ LLVM_DEBUG(dbgs() << "===========If Region End===============\n");
Region->setLinearizedRegion(LRegion);
return true;
@@ -2784,12 +2799,12 @@ bool AMDGPUMachineCFGStructurizer::structurizeRegions(RegionMRT *Region,
}
void AMDGPUMachineCFGStructurizer::initFallthroughMap(MachineFunction &MF) {
- DEBUG(dbgs() << "Fallthrough Map:\n");
+ LLVM_DEBUG(dbgs() << "Fallthrough Map:\n");
for (auto &MBBI : MF) {
MachineBasicBlock *MBB = MBBI.getFallThrough();
if (MBB != nullptr) {
- DEBUG(dbgs() << "Fallthrough: " << MBBI.getNumber() << " -> "
- << MBB->getNumber() << "\n");
+ LLVM_DEBUG(dbgs() << "Fallthrough: " << MBBI.getNumber() << " -> "
+ << MBB->getNumber() << "\n");
}
FallthroughMap[&MBBI] = MBB;
}
@@ -2800,8 +2815,8 @@ void AMDGPUMachineCFGStructurizer::createLinearizedRegion(RegionMRT *Region,
LinearizedRegion *LRegion = new LinearizedRegion();
if (SelectOut) {
LRegion->addLiveOut(SelectOut);
- DEBUG(dbgs() << "Add LiveOut (BBSelect): " << printReg(SelectOut, TRI)
- << "\n");
+ LLVM_DEBUG(dbgs() << "Add LiveOut (BBSelect): " << printReg(SelectOut, TRI)
+ << "\n");
}
LRegion->setRegionMRT(Region);
Region->setLinearizedRegion(LRegion);
@@ -2856,26 +2871,26 @@ static void checkRegOnlyPHIInputs(MachineFunction &MF) {
}
bool AMDGPUMachineCFGStructurizer::runOnMachineFunction(MachineFunction &MF) {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
TRI = ST.getRegisterInfo();
MRI = &(MF.getRegInfo());
initFallthroughMap(MF);
checkRegOnlyPHIInputs(MF);
- DEBUG(dbgs() << "----STRUCTURIZER START----\n");
- DEBUG(MF.dump());
+ LLVM_DEBUG(dbgs() << "----STRUCTURIZER START----\n");
+ LLVM_DEBUG(MF.dump());
Regions = &(getAnalysis<MachineRegionInfoPass>().getRegionInfo());
- DEBUG(Regions->dump());
+ LLVM_DEBUG(Regions->dump());
RegionMRT *RTree = MRT::buildMRT(MF, Regions, TII, MRI);
setRegionMRT(RTree);
initializeSelectRegisters(RTree, 0, MRI, TII);
- DEBUG(RTree->dump(TRI));
+ LLVM_DEBUG(RTree->dump(TRI));
bool result = structurizeRegions(RTree, true);
delete RTree;
- DEBUG(dbgs() << "----STRUCTURIZER END----\n");
+ LLVM_DEBUG(dbgs() << "----STRUCTURIZER END----\n");
initFallthroughMap(MF);
return result;
}
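The hunks above mechanically rename the old DEBUG macro to LLVM_DEBUG and reflow the stream arguments. A minimal, hedged sketch of the usage pattern being migrated follows; the pass name and helper function are invented for illustration and are not part of this patch.

// Hedged sketch (not from this patch): the LLVM_DEBUG pattern. DEBUG_TYPE must
// be defined before the macro is used; the output is compiled out of NDEBUG
// builds and printed only under -debug or -debug-only=<DEBUG_TYPE>.
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#define DEBUG_TYPE "my-structurizer"   // hypothetical pass name

static void reportReplacement(unsigned OldReg, unsigned NewReg) {
  LLVM_DEBUG(llvm::dbgs() << "Replacing register " << OldReg << " with "
                          << NewReg << '\n');
}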
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index b7c8c1213537..13b4b50149ce 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -9,20 +9,38 @@
#include "AMDGPUMachineFunction.h"
#include "AMDGPUSubtarget.h"
+#include "AMDGPUPerfHintAnalysis.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
using namespace llvm;
AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
MachineFunctionInfo(),
LocalMemoryObjects(),
- KernArgSize(0),
+ ExplicitKernArgSize(0),
MaxKernArgAlign(0),
LDSSize(0),
- ABIArgOffset(0),
IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())),
- NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) {
+ NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath),
+ MemoryBound(false),
+ WaveLimiter(false) {
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
+
// FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset,
// except reserved size is not correctly aligned.
+ const Function &F = MF.getFunction();
+
+ if (auto *Resolver = MF.getMMI().getResolver()) {
+ if (AMDGPUPerfHintAnalysis *PHA = static_cast<AMDGPUPerfHintAnalysis*>(
+ Resolver->getAnalysisIfAvailable(&AMDGPUPerfHintAnalysisID, true))) {
+ MemoryBound = PHA->isMemoryBound(&F);
+ WaveLimiter = PHA->needsWaveLimiter(&F);
+ }
+ }
+
+ CallingConv::ID CC = F.getCallingConv();
+ if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL)
+ ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign);
}
unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
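The constructor change above caches the MemoryBound and WaveLimiter hints from AMDGPUPerfHintAnalysis and computes ExplicitKernArgSize only for kernel calling conventions. A hedged sketch of how a later machine pass might read those cached flags; the helper name is invented and this is not code from the patch.

// Hedged sketch: consuming the flags cached by the constructor above from a
// MachineFunction pass. getInfo<>() is the standard MachineFunctionInfo
// accessor; adjustForPerfHints is an illustrative name.
#include "AMDGPUMachineFunction.h"
#include "llvm/CodeGen/MachineFunction.h"

static void adjustForPerfHints(llvm::MachineFunction &MF) {
  const auto *MFI = MF.getInfo<llvm::AMDGPUMachineFunction>();
  if (MFI->isMemoryBound() || MFI->needsWaveLimiter()) {
    // A scheduler or wave-limiting pass could bias its decisions here.
  }
}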
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 99bb61b21db0..8d6b871bc03e 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -15,57 +15,43 @@
namespace llvm {
+class GCNSubtarget;
+
class AMDGPUMachineFunction : public MachineFunctionInfo {
/// A map to keep track of local memory objects and their offsets within the
/// local memory space.
SmallDenseMap<const GlobalValue *, unsigned, 4> LocalMemoryObjects;
- uint64_t KernArgSize;
- unsigned MaxKernArgAlign;
+protected:
+ uint64_t ExplicitKernArgSize; // Cache for this.
+ unsigned MaxKernArgAlign; // Cache for this.
/// Number of bytes in the LDS that are being used.
unsigned LDSSize;
- // FIXME: This should probably be removed.
- /// Start of implicit kernel args
- unsigned ABIArgOffset;
-
- // Kernels + shaders. i.e. functions called by the driver and not not called
+ // Kernels + shaders. i.e. functions called by the driver and not called
// by other functions.
bool IsEntryFunction;
bool NoSignedZerosFPMath;
-public:
- AMDGPUMachineFunction(const MachineFunction &MF);
-
- uint64_t allocateKernArg(uint64_t Size, unsigned Align) {
- assert(isPowerOf2_32(Align));
- KernArgSize = alignTo(KernArgSize, Align);
+ // Function may be memory bound.
+ bool MemoryBound;
- uint64_t Result = KernArgSize;
- KernArgSize += Size;
+ // Kernel may need limited waves per EU for better performance.
+ bool WaveLimiter;
- MaxKernArgAlign = std::max(Align, MaxKernArgAlign);
- return Result;
- }
+public:
+ AMDGPUMachineFunction(const MachineFunction &MF);
- uint64_t getKernArgSize() const {
- return KernArgSize;
+ uint64_t getExplicitKernArgSize() const {
+ return ExplicitKernArgSize;
}
unsigned getMaxKernArgAlign() const {
return MaxKernArgAlign;
}
- void setABIArgOffset(unsigned NewOffset) {
- ABIArgOffset = NewOffset;
- }
-
- unsigned getABIArgOffset() const {
- return ABIArgOffset;
- }
-
unsigned getLDSSize() const {
return LDSSize;
}
@@ -78,6 +64,14 @@ public:
return NoSignedZerosFPMath;
}
+ bool isMemoryBound() const {
+ return MemoryBound;
+ }
+
+ bool needsWaveLimiter() const {
+ return WaveLimiter;
+ }
+
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalValue &GV);
};
diff --git a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
index 3164140abe29..7b9f673c418c 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief AMDGPU Machine Module Info.
+/// AMDGPU Machine Module Info.
///
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
index 1a728c6bd04a..1219ab26fb69 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief AMDGPU Machine Module Info.
+/// AMDGPU Machine Module Info.
///
//
//===----------------------------------------------------------------------===//
@@ -30,14 +30,14 @@ private:
// All supported memory/synchronization scopes can be found here:
// http://llvm.org/docs/AMDGPUUsage.html#memory-scopes
- /// \brief Agent synchronization scope ID.
+ /// Agent synchronization scope ID.
SyncScope::ID AgentSSID;
- /// \brief Workgroup synchronization scope ID.
+ /// Workgroup synchronization scope ID.
SyncScope::ID WorkgroupSSID;
- /// \brief Wavefront synchronization scope ID.
+ /// Wavefront synchronization scope ID.
SyncScope::ID WavefrontSSID;
- /// \brief In AMDGPU target synchronization scopes are inclusive, meaning a
+ /// In AMDGPU target synchronization scopes are inclusive, meaning a
/// larger synchronization scope is inclusive of a smaller synchronization
/// scope.
///
@@ -74,7 +74,7 @@ public:
return WavefrontSSID;
}
- /// \brief In AMDGPU target synchronization scopes are inclusive, meaning a
+ /// In AMDGPU target synchronization scopes are inclusive, meaning a
/// larger synchronization scope is inclusive of a smaller synchronization
/// scope.
///
diff --git a/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp b/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
index 7263ba73d155..995d9ae3907f 100644
--- a/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
@@ -15,6 +15,7 @@
#include "AMDGPUMacroFusion.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MacroFusion.h"
@@ -22,7 +23,7 @@ using namespace llvm;
namespace {
-/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused
+/// Check if the instr pair, FirstMI and SecondMI, should be fused
/// together. Given SecondMI, when FirstMI is unspecified, then check if
/// SecondMI may be part of a fused pair at all.
static bool shouldScheduleAdjacent(const TargetInstrInfo &TII_,
diff --git a/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
index bb65636f15af..7bd8533a0ccf 100644
--- a/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
// \file
-// \brief This post-linking pass replaces the function pointer of enqueued
+// This post-linking pass replaces the function pointer of enqueued
// block kernel with a global variable (runtime handle) and adds
// "runtime-handle" attribute to the enqueued block kernel.
//
@@ -36,7 +36,9 @@
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Mangler.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/User.h"
#include "llvm/Pass.h"
@@ -49,7 +51,7 @@ using namespace llvm;
namespace {
-/// \brief Lower enqueued blocks.
+/// Lower enqueued blocks.
class AMDGPUOpenCLEnqueuedBlockLowering : public ModulePass {
public:
static char ID;
@@ -80,49 +82,63 @@ static void collectCallers(Function *F, DenseSet<Function *> &Callers) {
for (auto U : F->users()) {
if (auto *CI = dyn_cast<CallInst>(&*U)) {
auto *Caller = CI->getParent()->getParent();
- if (Callers.count(Caller))
- continue;
- Callers.insert(Caller);
- collectCallers(Caller, Callers);
+ if (Callers.insert(Caller).second)
+ collectCallers(Caller, Callers);
}
}
}
+/// If \p U is an instruction or a constant, collect the functions which
+/// directly or indirectly use it.
+static void collectFunctionUsers(User *U, DenseSet<Function *> &Funcs) {
+ if (auto *I = dyn_cast<Instruction>(U)) {
+ auto *F = I->getParent()->getParent();
+ if (Funcs.insert(F).second)
+ collectCallers(F, Funcs);
+ return;
+ }
+ if (!isa<Constant>(U))
+ return;
+ for (auto UU : U->users())
+ collectFunctionUsers(&*UU, Funcs);
+}
+
bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
DenseSet<Function *> Callers;
auto &C = M.getContext();
bool Changed = false;
for (auto &F : M.functions()) {
if (F.hasFnAttribute("enqueued-block")) {
- if (!F.hasOneUse() || !F.user_begin()->hasOneUse() ||
- !isa<ConstantExpr>(*F.user_begin()) ||
- !isa<ConstantExpr>(*F.user_begin()->user_begin())) {
- continue;
+ if (!F.hasName()) {
+ SmallString<64> Name;
+ Mangler::getNameWithPrefix(Name, "__amdgpu_enqueued_kernel",
+ M.getDataLayout());
+ F.setName(Name);
}
- auto *BitCast = cast<ConstantExpr>(*F.user_begin());
- auto *AddrCast = cast<ConstantExpr>(*BitCast->user_begin());
- auto RuntimeHandle = (F.getName() + "_runtime_handle").str();
+ LLVM_DEBUG(dbgs() << "found enqueued kernel: " << F.getName() << '\n');
+ auto RuntimeHandle = (F.getName() + ".runtime_handle").str();
+ auto T = ArrayType::get(Type::getInt64Ty(C), 2);
auto *GV = new GlobalVariable(
- M, Type::getInt8Ty(C)->getPointerTo(AMDGPUAS::GLOBAL_ADDRESS),
- /*IsConstant=*/true, GlobalValue::ExternalLinkage,
- /*Initializer=*/nullptr, RuntimeHandle, /*InsertBefore=*/nullptr,
- GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS,
- /*IsExternallyInitialized=*/true);
- DEBUG(dbgs() << "runtime handle created: " << *GV << '\n');
- auto *NewPtr = ConstantExpr::getPointerCast(GV, AddrCast->getType());
- AddrCast->replaceAllUsesWith(NewPtr);
- F.addFnAttr("runtime-handle", RuntimeHandle);
- F.setLinkage(GlobalValue::ExternalLinkage);
-
- // Collect direct or indirect callers of enqueue_kernel.
- for (auto U : NewPtr->users()) {
- if (auto *I = dyn_cast<Instruction>(&*U)) {
- auto *F = I->getParent()->getParent();
- Callers.insert(F);
- collectCallers(F, Callers);
- }
+ M, T,
+ /*IsConstant=*/false, GlobalValue::ExternalLinkage,
+ /*Initializer=*/Constant::getNullValue(T), RuntimeHandle,
+ /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
+ AMDGPUAS::GLOBAL_ADDRESS,
+ /*IsExternallyInitialized=*/false);
+ LLVM_DEBUG(dbgs() << "runtime handle created: " << *GV << '\n');
+
+ for (auto U : F.users()) {
+ auto *UU = &*U;
+ if (!isa<ConstantExpr>(UU))
+ continue;
+ collectFunctionUsers(UU, Callers);
+ auto *BitCast = cast<ConstantExpr>(UU);
+ auto *NewPtr = ConstantExpr::getPointerCast(GV, BitCast->getType());
+ BitCast->replaceAllUsesWith(NewPtr);
+ F.addFnAttr("runtime-handle", RuntimeHandle);
+ F.setLinkage(GlobalValue::ExternalLinkage);
+ Changed = true;
}
- Changed = true;
}
}
@@ -130,6 +146,7 @@ bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
if (F->getCallingConv() != CallingConv::AMDGPU_KERNEL)
continue;
F->addFnAttr("calls-enqueue-kernel");
+ LLVM_DEBUG(dbgs() << "mark enqueue_kernel caller:" << F->getName() << '\n');
}
return Changed;
}
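runOnModule above now materializes one runtime handle per enqueued-block kernel and redirects every constant-expression use of the kernel's address to it. A condensed, hedged sketch of that rewrite for a single function; the address-space literal 1 stands in for AMDGPUAS::GLOBAL_ADDRESS and the helper name is invented, so this is not code from the patch.

// Hedged sketch: the shape of the runtime-handle rewrite for one
// enqueued-block kernel F. The handle is a zero-initialized [2 x i64] global
// with external linkage in the global address space (assumed to be 1 here).
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"

static void createRuntimeHandleFor(llvm::Module &M, llvm::Function &F) {
  llvm::LLVMContext &C = M.getContext();
  auto *T = llvm::ArrayType::get(llvm::Type::getInt64Ty(C), 2);
  auto *GV = new llvm::GlobalVariable(
      M, T, /*isConstant=*/false, llvm::GlobalValue::ExternalLinkage,
      llvm::Constant::getNullValue(T), (F.getName() + ".runtime_handle").str(),
      /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
      /*AddressSpace=*/1, /*IsExternallyInitialized=*/false);
  // Redirect constant-expression uses of the kernel address to the handle.
  for (llvm::User *U : F.users())
    if (auto *CE = llvm::dyn_cast<llvm::ConstantExpr>(U))
      CE->replaceAllUsesWith(llvm::ConstantExpr::getPointerCast(GV, CE->getType()));
  F.addFnAttr("runtime-handle", GV->getName());
}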
diff --git a/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
new file mode 100644
index 000000000000..3cfdccc9fe51
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -0,0 +1,397 @@
+//===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Analyzes whether a function is potentially memory bound and whether
+/// a kernel may benefit from limiting the number of waves to reduce cache thrashing.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUPerfHintAnalysis.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-perf-hint"
+
+static cl::opt<unsigned>
+ MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden,
+ cl::desc("Function mem bound threshold in %"));
+
+static cl::opt<unsigned>
+ LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden,
+ cl::desc("Kernel limit wave threshold in %"));
+
+static cl::opt<unsigned>
+ IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden,
+ cl::desc("Indirect access memory instruction weight"));
+
+static cl::opt<unsigned>
+ LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden,
+ cl::desc("Large stride memory access weight"));
+
+static cl::opt<unsigned>
+ LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64), cl::Hidden,
+ cl::desc("Large stride memory access threshold"));
+
+STATISTIC(NumMemBound, "Number of functions marked as memory bound");
+STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");
+
+char llvm::AMDGPUPerfHintAnalysis::ID = 0;
+char &llvm::AMDGPUPerfHintAnalysisID = AMDGPUPerfHintAnalysis::ID;
+
+INITIALIZE_PASS(AMDGPUPerfHintAnalysis, DEBUG_TYPE,
+ "Analysis if a function is memory bound", true, true)
+
+namespace {
+
+struct AMDGPUPerfHint {
+ friend AMDGPUPerfHintAnalysis;
+
+public:
+ AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_,
+ const TargetLowering *TLI_)
+ : FIM(FIM_), DL(nullptr), TLI(TLI_) {}
+
+ void runOnFunction(Function &F);
+
+private:
+ struct MemAccessInfo {
+ const Value *V;
+ const Value *Base;
+ int64_t Offset;
+ MemAccessInfo() : V(nullptr), Base(nullptr), Offset(0) {}
+ bool isLargeStride(MemAccessInfo &Reference) const;
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ Printable print() const {
+ return Printable([this](raw_ostream &OS) {
+ OS << "Value: " << *V << '\n'
+ << "Base: " << *Base << " Offset: " << Offset << '\n';
+ });
+ }
+#endif
+ };
+
+ MemAccessInfo makeMemAccessInfo(Instruction *) const;
+
+ MemAccessInfo LastAccess; // Last memory access info
+
+ AMDGPUPerfHintAnalysis::FuncInfoMap &FIM;
+
+ const DataLayout *DL;
+
+ AMDGPUAS AS;
+
+ const TargetLowering *TLI;
+
+ void visit(const Function &F);
+ static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F);
+ static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F);
+
+ bool isIndirectAccess(const Instruction *Inst) const;
+
+  /// Check if the instruction is a large-stride memory access.
+ /// The purpose is to identify memory access pattern like:
+ /// x = a[i];
+ /// y = a[i+1000];
+ /// z = a[i+2000];
+  /// In the above example, the second and third memory accesses will be
+  /// marked as large-stride memory accesses.
+ bool isLargeStride(const Instruction *Inst);
+
+ bool isGlobalAddr(const Value *V) const;
+ bool isLocalAddr(const Value *V) const;
+ bool isConstantAddr(const Value *V) const;
+};
+
+static const Value *getMemoryInstrPtr(const Instruction *Inst) {
+ if (auto LI = dyn_cast<LoadInst>(Inst)) {
+ return LI->getPointerOperand();
+ }
+ if (auto SI = dyn_cast<StoreInst>(Inst)) {
+ return SI->getPointerOperand();
+ }
+ if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+ return AI->getPointerOperand();
+ }
+ if (auto AI = dyn_cast<AtomicRMWInst>(Inst)) {
+ return AI->getPointerOperand();
+ }
+ if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst)) {
+ return MI->getRawDest();
+ }
+
+ return nullptr;
+}
+
+bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
+ LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n');
+ SmallSet<const Value *, 32> WorkSet;
+ SmallSet<const Value *, 32> Visited;
+ if (const Value *MO = getMemoryInstrPtr(Inst)) {
+ if (isGlobalAddr(MO))
+ WorkSet.insert(MO);
+ }
+
+ while (!WorkSet.empty()) {
+ const Value *V = *WorkSet.begin();
+ WorkSet.erase(*WorkSet.begin());
+ if (!Visited.insert(V).second)
+ continue;
+ LLVM_DEBUG(dbgs() << " check: " << *V << '\n');
+
+ if (auto LD = dyn_cast<LoadInst>(V)) {
+ auto M = LD->getPointerOperand();
+ if (isGlobalAddr(M) || isLocalAddr(M) || isConstantAddr(M)) {
+ LLVM_DEBUG(dbgs() << " is IA\n");
+ return true;
+ }
+ continue;
+ }
+
+ if (auto GEP = dyn_cast<GetElementPtrInst>(V)) {
+ auto P = GEP->getPointerOperand();
+ WorkSet.insert(P);
+ for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I)
+ WorkSet.insert(GEP->getOperand(I));
+ continue;
+ }
+
+ if (auto U = dyn_cast<UnaryInstruction>(V)) {
+ WorkSet.insert(U->getOperand(0));
+ continue;
+ }
+
+ if (auto BO = dyn_cast<BinaryOperator>(V)) {
+ WorkSet.insert(BO->getOperand(0));
+ WorkSet.insert(BO->getOperand(1));
+ continue;
+ }
+
+ if (auto S = dyn_cast<SelectInst>(V)) {
+ WorkSet.insert(S->getFalseValue());
+ WorkSet.insert(S->getTrueValue());
+ continue;
+ }
+
+ if (auto E = dyn_cast<ExtractElementInst>(V)) {
+ WorkSet.insert(E->getVectorOperand());
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << " dropped\n");
+ }
+
+ LLVM_DEBUG(dbgs() << " is not IA\n");
+ return false;
+}
+
+void AMDGPUPerfHint::visit(const Function &F) {
+ auto FIP = FIM.insert(std::make_pair(&F, AMDGPUPerfHintAnalysis::FuncInfo()));
+ if (!FIP.second)
+ return;
+
+ AMDGPUPerfHintAnalysis::FuncInfo &FI = FIP.first->second;
+
+ LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');
+
+ for (auto &B : F) {
+ LastAccess = MemAccessInfo();
+ for (auto &I : B) {
+ if (getMemoryInstrPtr(&I)) {
+ if (isIndirectAccess(&I))
+ ++FI.IAMInstCount;
+ if (isLargeStride(&I))
+ ++FI.LSMInstCount;
+ ++FI.MemInstCount;
+ ++FI.InstCount;
+ continue;
+ }
+ CallSite CS(const_cast<Instruction *>(&I));
+ if (CS) {
+ Function *Callee = CS.getCalledFunction();
+ if (!Callee || Callee->isDeclaration()) {
+ ++FI.InstCount;
+ continue;
+ }
+ if (&F == Callee) // Handle immediate recursion
+ continue;
+
+ visit(*Callee);
+ auto Loc = FIM.find(Callee);
+
+ assert(Loc != FIM.end() && "No func info");
+ FI.MemInstCount += Loc->second.MemInstCount;
+ FI.InstCount += Loc->second.InstCount;
+ FI.IAMInstCount += Loc->second.IAMInstCount;
+ FI.LSMInstCount += Loc->second.LSMInstCount;
+ } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ TargetLoweringBase::AddrMode AM;
+ auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
+ AM.BaseGV = dyn_cast_or_null<GlobalValue>(const_cast<Value *>(Ptr));
+ AM.HasBaseReg = !AM.BaseGV;
+ if (TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(),
+ GEP->getPointerAddressSpace()))
+ // Offset will likely be folded into load or store
+ continue;
+ ++FI.InstCount;
+ } else {
+ ++FI.InstCount;
+ }
+ }
+ }
+}
+
+void AMDGPUPerfHint::runOnFunction(Function &F) {
+ if (FIM.find(&F) != FIM.end())
+ return;
+
+ const Module &M = *F.getParent();
+ DL = &M.getDataLayout();
+ AS = AMDGPU::getAMDGPUAS(M);
+
+ visit(F);
+ auto Loc = FIM.find(&F);
+
+ assert(Loc != FIM.end() && "No func info");
+ LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Loc->second.MemInstCount
+ << '\n'
+ << " IAMInst: " << Loc->second.IAMInstCount << '\n'
+ << " LSMInst: " << Loc->second.LSMInstCount << '\n'
+ << " TotalInst: " << Loc->second.InstCount << '\n');
+
+ auto &FI = Loc->second;
+
+ if (isMemBound(FI)) {
+ LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
+ NumMemBound++;
+ }
+
+ if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(FI)) {
+ LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
+ NumLimitWave++;
+ }
+}
+
+bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
+ return FI.MemInstCount * 100 / FI.InstCount > MemBoundThresh;
+}
+
+bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
+ return ((FI.MemInstCount + FI.IAMInstCount * IAWeight +
+ FI.LSMInstCount * LSWeight) *
+ 100 / FI.InstCount) > LimitWaveThresh;
+}
+
+bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
+ if (auto PT = dyn_cast<PointerType>(V->getType())) {
+ unsigned As = PT->getAddressSpace();
+ // Flat likely points to global too.
+ return As == AS.GLOBAL_ADDRESS || As == AS.FLAT_ADDRESS;
+ }
+ return false;
+}
+
+bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
+ if (auto PT = dyn_cast<PointerType>(V->getType()))
+ return PT->getAddressSpace() == AS.LOCAL_ADDRESS;
+ return false;
+}
+
+bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
+ LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n');
+
+ MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));
+ bool IsLargeStride = MAI.isLargeStride(LastAccess);
+ if (MAI.Base)
+ LastAccess = std::move(MAI);
+
+ return IsLargeStride;
+}
+
+AMDGPUPerfHint::MemAccessInfo
+AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
+ MemAccessInfo MAI;
+ const Value *MO = getMemoryInstrPtr(Inst);
+
+ LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n');
+ // Do not treat local-addr memory access as large stride.
+ if (isLocalAddr(MO))
+ return MAI;
+
+ MAI.V = MO;
+ MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL);
+ return MAI;
+}
+
+bool AMDGPUPerfHint::isConstantAddr(const Value *V) const {
+ if (auto PT = dyn_cast<PointerType>(V->getType())) {
+ unsigned As = PT->getAddressSpace();
+ return As == AS.CONSTANT_ADDRESS || As == AS.CONSTANT_ADDRESS_32BIT;
+ }
+ return false;
+}
+
+bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
+ MemAccessInfo &Reference) const {
+
+ if (!Base || !Reference.Base || Base != Reference.Base)
+ return false;
+
+ uint64_t Diff = Offset > Reference.Offset ? Offset - Reference.Offset
+ : Reference.Offset - Offset;
+ bool Result = Diff > LargeStrideThresh;
+ LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n"
+ << print() << "<=>\n"
+ << Reference.print() << "Result:" << Result << '\n');
+ return Result;
+}
+} // namespace
+
+bool AMDGPUPerfHintAnalysis::runOnFunction(Function &F) {
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ const TargetMachine &TM = TPC->getTM<TargetMachine>();
+ const TargetSubtargetInfo *ST = TM.getSubtargetImpl(F);
+
+ AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering());
+ Analyzer.runOnFunction(F);
+ return false;
+}
+
+bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const {
+ auto FI = FIM.find(F);
+ if (FI == FIM.end())
+ return false;
+
+ return AMDGPUPerfHint::isMemBound(FI->second);
+}
+
+bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const {
+ auto FI = FIM.find(F);
+ if (FI == FIM.end())
+ return false;
+
+ return AMDGPUPerfHint::needLimitWave(FI->second);
+}
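MemAccessInfo::isLargeStride above flags a pair of accesses only when they share a pointer base and their byte offsets differ by more than amdgpu-large-stride-threshold (default 64). A self-contained sketch of that comparison with made-up numbers; it is not code from the patch.

// Hedged sketch: the core of the large-stride test, reduced to plain integers.
// Two accesses count as large stride only if they share a base and their
// offsets differ by more than the threshold.
#include <cstdint>
#include <cstdio>

struct Access { const void *Base; int64_t Offset; };

static bool isLargeStride(const Access &A, const Access &B,
                          uint64_t Thresh = 64) {
  if (!A.Base || !B.Base || A.Base != B.Base)
    return false;
  uint64_t Diff = A.Offset > B.Offset ? A.Offset - B.Offset
                                      : B.Offset - A.Offset;
  return Diff > Thresh;
}

int main() {
  int Buf[4096];                          // hypothetical array 'a'
  Access X{Buf, 0}, Y{Buf, 4000};         // like a[i] and a[i+1000] for i32
  std::printf("%d\n", isLargeStride(X, Y)); // prints 1: 4000-byte stride
}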
diff --git a/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
new file mode 100644
index 000000000000..be7f37cb6815
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
@@ -0,0 +1,55 @@
+//===- AMDGPUPerfHintAnalysis.h - analysis of functions memory traffic ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Analyzes whether a function is potentially memory bound and whether
+/// a kernel may benefit from limiting the number of waves to reduce cache thrashing.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPERFHINTANALYSIS_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPERFHINTANALYSIS_H
+#include "llvm/IR/ValueMap.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+struct AMDGPUPerfHintAnalysis : public FunctionPass {
+ static char ID;
+
+public:
+ AMDGPUPerfHintAnalysis() : FunctionPass(ID) {}
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+
+ bool isMemoryBound(const Function *F) const;
+
+ bool needsWaveLimiter(const Function *F) const;
+
+ struct FuncInfo {
+ unsigned MemInstCount;
+ unsigned InstCount;
+ unsigned IAMInstCount; // Indirect access memory instruction count
+ unsigned LSMInstCount; // Large stride memory instruction count
+ FuncInfo() : MemInstCount(0), InstCount(0), IAMInstCount(0),
+ LSMInstCount(0) {}
+ };
+
+ typedef ValueMap<const Function*, FuncInfo> FuncInfoMap;
+
+private:
+
+ FuncInfoMap FIM;
+};
+} // namespace llvm
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUPERFHINTANALYSIS_H
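The counters declared in FuncInfo above feed two integer-percentage heuristics in the .cpp: a function is memory bound when MemInstCount*100/InstCount exceeds amdgpu-membound-threshold, and a kernel is wave-limited when the weighted sum of memory, indirect-access, and large-stride instructions exceeds amdgpu-limit-wave-threshold. A standalone sketch with invented counts and the default option values; it is not code from the patch.

// Hedged sketch: the two heuristics computed from the FuncInfo counters above,
// with invented counts and the default thresholds/weights.
#include <cstdio>

struct FuncInfo {
  unsigned MemInstCount, InstCount, IAMInstCount, LSMInstCount;
};

int main() {
  const unsigned MemBoundThresh = 50, LimitWaveThresh = 50; // percent
  const unsigned IAWeight = 1000, LSWeight = 1000;
  FuncInfo FI{30, 100, 2, 1};                               // hypothetical

  // 30 * 100 / 100 = 30% <= 50%  -> not memory bound.
  bool MemBound = FI.MemInstCount * 100 / FI.InstCount > MemBoundThresh;

  // (30 + 2*1000 + 1*1000) * 100 / 100 = 3030% > 50% -> limit waves.
  bool LimitWave = (FI.MemInstCount + FI.IAMInstCount * IAWeight +
                    FI.LSMInstCount * LSWeight) * 100 / FI.InstCount
                   > LimitWaveThresh;

  std::printf("memBound=%d limitWave=%d\n", MemBound, LimitWave);
}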
diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 41876ed45c8c..d341fec6296f 100644
--- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -65,6 +65,11 @@ using namespace llvm;
namespace {
+static cl::opt<bool> DisablePromoteAllocaToVector(
+ "disable-promote-alloca-to-vector",
+ cl::desc("Disable promote alloca to vector"),
+ cl::init(false));
+
// FIXME: This can create globals so should be a module pass.
class AMDGPUPromoteAlloca : public FunctionPass {
private:
@@ -147,7 +152,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
IsAMDGCN = TT.getArch() == Triple::amdgcn;
IsAMDHSA = TT.getOS() == Triple::AMDHSA;
- const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
if (!ST.isPromoteAllocaEnabled())
return false;
@@ -169,8 +174,8 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
std::pair<Value *, Value *>
AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
- const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(
- *Builder.GetInsertBlock()->getParent());
+ const Function &F = *Builder.GetInsertBlock()->getParent();
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
if (!IsAMDHSA) {
Function *LocalSizeYFn
@@ -256,8 +261,8 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
}
Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
- const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(
- *Builder.GetInsertBlock()->getParent());
+ const AMDGPUSubtarget &ST =
+ AMDGPUSubtarget::get(*TM, *Builder.GetInsertBlock()->getParent());
Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic;
switch (N) {
@@ -318,18 +323,19 @@ static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
static bool canVectorizeInst(Instruction *Inst, User *User) {
switch (Inst->getOpcode()) {
case Instruction::Load: {
+ // Currently only handle the case where the Pointer Operand is a GEP.
+    // We also cannot vectorize volatile or atomic loads.
LoadInst *LI = cast<LoadInst>(Inst);
- // Currently only handle the case where the Pointer Operand is a GEP so check for that case.
- return isa<GetElementPtrInst>(LI->getPointerOperand()) && !LI->isVolatile();
+ return isa<GetElementPtrInst>(LI->getPointerOperand()) && LI->isSimple();
}
case Instruction::BitCast:
- case Instruction::AddrSpaceCast:
return true;
case Instruction::Store: {
// Must be the stored pointer operand, not a stored value, plus
// since it should be canonical form, the User should be a GEP.
+    // We also cannot vectorize volatile or atomic stores.
StoreInst *SI = cast<StoreInst>(Inst);
- return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && !SI->isVolatile();
+ return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && SI->isSimple();
}
default:
return false;
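The change above replaces the !isVolatile() checks with isSimple(), which also rejects atomic accesses. A minimal sketch of the load-side predicate after the change; the helper name is invented and this is not code from the patch.

// Hedged sketch: the stricter load check. isSimple() excludes both volatile
// and atomic accesses, so atomic loads that !isVolatile() alone would admit
// are now rejected as well.
#include "llvm/IR/Instructions.h"

static bool canVectorizeLoad(const llvm::LoadInst &LI) {
  return llvm::isa<llvm::GetElementPtrInst>(LI.getPointerOperand()) &&
         LI.isSimple(); // !isVolatile() && !isAtomic()
}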
@@ -337,19 +343,25 @@ static bool canVectorizeInst(Instruction *Inst, User *User) {
}
static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) {
+
+ if (DisablePromoteAllocaToVector) {
+ LLVM_DEBUG(dbgs() << " Promotion alloca to vector is disabled\n");
+ return false;
+ }
+
ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());
- DEBUG(dbgs() << "Alloca candidate for vectorization\n");
+ LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");
// FIXME: There is no reason why we can't support larger arrays, we
// are just being conservative for now.
// FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or equivalent. Potentially these
// could also be promoted but we don't currently handle this case
if (!AllocaTy ||
- AllocaTy->getNumElements() > 4 ||
+ AllocaTy->getNumElements() > 16 ||
AllocaTy->getNumElements() < 2 ||
!VectorType::isValidElementType(AllocaTy->getElementType())) {
- DEBUG(dbgs() << " Cannot convert type to vector\n");
+ LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
return false;
}
@@ -370,7 +382,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) {
// If we can't compute a vector index from this GEP, then we can't
// promote this alloca to vector.
if (!Index) {
- DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << '\n');
+ LLVM_DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP
+ << '\n');
return false;
}
@@ -385,8 +398,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) {
VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
- DEBUG(dbgs() << " Converting alloca to vector "
- << *AllocaTy << " -> " << *VectorTy << '\n');
+ LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> "
+ << *VectorTy << '\n');
for (Value *V : WorkList) {
Instruction *Inst = cast<Instruction>(V);
@@ -443,7 +456,8 @@ static bool isCallPromotable(CallInst *CI) {
case Intrinsic::lifetime_end:
case Intrinsic::invariant_start:
case Intrinsic::invariant_end:
- case Intrinsic::invariant_group_barrier:
+ case Intrinsic::launder_invariant_group:
+ case Intrinsic::strip_invariant_group:
case Intrinsic::objectsize:
return true;
default:
@@ -475,7 +489,8 @@ bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
// important part is both must have the same address space at
// the end.
if (OtherObj != BaseAlloca) {
- DEBUG(dbgs() << "Found a binary instruction with another alloca object\n");
+ LLVM_DEBUG(
+ dbgs() << "Found a binary instruction with another alloca object\n");
return false;
}
@@ -588,7 +603,7 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
FunctionType *FTy = F.getFunctionType();
- const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
// If the function has any arguments in the local address space, then it's
// possible these arguments require the entire local memory space, so
@@ -597,8 +612,8 @@ bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) {
LocalMemLimit = 0;
- DEBUG(dbgs() << "Function has local memory argument. Promoting to "
- "local memory disabled.\n");
+ LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
+ "local memory disabled.\n");
return false;
}
}
@@ -667,13 +682,12 @@ bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
LocalMemLimit = MaxSizeWithWaveCount;
- DEBUG(
- dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
- << " Rounding size to " << MaxSizeWithWaveCount
- << " with a maximum occupancy of " << MaxOccupancy << '\n'
- << " and " << (LocalMemLimit - CurrentLocalMemUsage)
- << " available for promotion\n"
- );
+ LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
+ << " bytes of LDS\n"
+ << " Rounding size to " << MaxSizeWithWaveCount
+ << " with a maximum occupancy of " << MaxOccupancy << '\n'
+ << " and " << (LocalMemLimit - CurrentLocalMemUsage)
+ << " available for promotion\n");
return true;
}
@@ -690,7 +704,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
// First try to replace the alloca with a vector
Type *AllocaTy = I.getAllocatedType();
- DEBUG(dbgs() << "Trying to promote " << I << '\n');
+ LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
if (tryPromoteAllocaToVector(&I, AS))
return true; // Promoted to vector.
@@ -706,7 +720,9 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
case CallingConv::SPIR_KERNEL:
break;
default:
- DEBUG(dbgs() << " promote alloca to LDS not supported with calling convention.\n");
+ LLVM_DEBUG(
+ dbgs()
+ << " promote alloca to LDS not supported with calling convention.\n");
return false;
}
@@ -714,8 +730,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
if (!SufficientLDS)
return false;
- const AMDGPUSubtarget &ST =
- TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction);
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, ContainingFunction);
unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
const DataLayout &DL = Mod->getDataLayout();
@@ -735,8 +750,8 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
NewSize += AllocSize;
if (NewSize > LocalMemLimit) {
- DEBUG(dbgs() << " " << AllocSize
- << " bytes of local memory not available to promote\n");
+ LLVM_DEBUG(dbgs() << " " << AllocSize
+ << " bytes of local memory not available to promote\n");
return false;
}
@@ -745,11 +760,11 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
std::vector<Value*> WorkList;
if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
- DEBUG(dbgs() << " Do not know how to convert all uses\n");
+ LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
return false;
}
- DEBUG(dbgs() << "Promoting alloca to local memory\n");
+ LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");
Function *F = I.getParent()->getParent();
@@ -843,31 +858,32 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
continue;
case Intrinsic::memcpy: {
MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
- Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
- MemCpy->getLength(), MemCpy->getAlignment(),
- MemCpy->isVolatile());
+ Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getDestAlignment(),
+ MemCpy->getRawSource(), MemCpy->getSourceAlignment(),
+ MemCpy->getLength(), MemCpy->isVolatile());
Intr->eraseFromParent();
continue;
}
case Intrinsic::memmove: {
MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
- Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getRawSource(),
- MemMove->getLength(), MemMove->getAlignment(),
- MemMove->isVolatile());
+ Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getDestAlignment(),
+ MemMove->getRawSource(), MemMove->getSourceAlignment(),
+ MemMove->getLength(), MemMove->isVolatile());
Intr->eraseFromParent();
continue;
}
case Intrinsic::memset: {
MemSetInst *MemSet = cast<MemSetInst>(Intr);
Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
- MemSet->getLength(), MemSet->getAlignment(),
+ MemSet->getLength(), MemSet->getDestAlignment(),
MemSet->isVolatile());
Intr->eraseFromParent();
continue;
}
case Intrinsic::invariant_start:
case Intrinsic::invariant_end:
- case Intrinsic::invariant_group_barrier:
+ case Intrinsic::launder_invariant_group:
+ case Intrinsic::strip_invariant_group:
Intr->eraseFromParent();
// FIXME: I think the invariant marker should still theoretically apply,
// but the intrinsics need to be changed to accept pointers with any
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 1ed02fae085a..012e4fe200aa 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -14,7 +14,9 @@
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -50,10 +52,38 @@ AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI)
}
-unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &A,
- const RegisterBank &B,
- unsigned Size) const {
- return RegisterBankInfo::copyCost(A, B, Size);
+static bool isConstant(const MachineOperand &MO, int64_t &C) {
+ const MachineFunction *MF = MO.getParent()->getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ const MachineInstr *Def = MRI.getVRegDef(MO.getReg());
+ if (!Def)
+ return false;
+
+ if (Def->getOpcode() == AMDGPU::G_CONSTANT) {
+ C = Def->getOperand(1).getCImm()->getSExtValue();
+ return true;
+ }
+
+ if (Def->getOpcode() == AMDGPU::COPY)
+ return isConstant(Def->getOperand(1), C);
+
+ return false;
+}
+
+unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
+ const RegisterBank &Src,
+ unsigned Size) const {
+ if (Dst.getID() == AMDGPU::SGPRRegBankID &&
+ Src.getID() == AMDGPU::VGPRRegBankID)
+ return std::numeric_limits<unsigned>::max();
+
+ // SGPRRegBank with size 1 is actually vcc or another 64-bit sgpr written by
+ // the valu.
+ if (Size == 1 && Dst.getID() == AMDGPU::SCCRegBankID &&
+ Src.getID() == AMDGPU::SGPRRegBankID)
+ return std::numeric_limits<unsigned>::max();
+
+ return RegisterBankInfo::copyCost(Dst, Src, Size);
}
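copyCost above returns the maximum unsigned value for VGPR-to-SGPR copies (and for SGPR-to-SCC copies of size 1), which makes such cross-bank copies effectively unaffordable when alternative mappings are weighed. A trivial sketch of the sentinel-cost idiom; it is not code from the patch and the parameter names are invented.

// Hedged sketch: the prohibitive-cost idiom used in copyCost above. Returning
// the largest unsigned value marks a copy direction as effectively forbidden.
#include <limits>

static unsigned copyCostSketch(bool DstIsSGPR, bool SrcIsVGPR,
                               unsigned Fallback) {
  if (DstIsSGPR && SrcIsVGPR)
    return std::numeric_limits<unsigned>::max(); // "do not pick this mapping"
  return Fallback;
}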
const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass(
@@ -72,11 +102,11 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
- unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
InstructionMappings AltMappings;
switch (MI.getOpcode()) {
case TargetOpcode::G_LOAD: {
+ unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
// FIXME: Should we be hard coding the size for these mappings?
const InstructionMapping &SSMapping = getInstructionMapping(
1, 1, getOperandsMapping(
@@ -104,6 +134,42 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
return AltMappings;
}
+ case TargetOpcode::G_ICMP: {
+ unsigned Size = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
+ const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
+ getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
+ nullptr, // Predicate operand.
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
+ 4); // Num Operands
+ AltMappings.push_back(&SSMapping);
+
+ const InstructionMapping &SVMapping = getInstructionMapping(2, 1,
+ getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
+ nullptr, // Predicate operand.
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
+ 4); // Num Operands
+ AltMappings.push_back(&SVMapping);
+
+ const InstructionMapping &VSMapping = getInstructionMapping(3, 1,
+ getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
+ nullptr, // Predicate operand.
+ AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
+ 4); // Num Operands
+ AltMappings.push_back(&VSMapping);
+
+ const InstructionMapping &VVMapping = getInstructionMapping(4, 1,
+ getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
+ nullptr, // Predicate operand.
+ AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
+ 4); // Num Operands
+ AltMappings.push_back(&VVMapping);
+
+ return AltMappings;
+ }
default:
break;
}
@@ -120,7 +186,60 @@ static bool isInstrUniform(const MachineInstr &MI) {
return false;
const MachineMemOperand *MMO = *MI.memoperands_begin();
- return AMDGPU::isUniformMMO(MMO);
+ return AMDGPUInstrInfo::isUniformMMO(MMO);
+}
+
+bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) {
+ unsigned Reg = MI.getOperand(i).getReg();
+ const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
+ if (Bank && Bank->getID() != AMDGPU::SGPRRegBankID)
+ return false;
+ }
+ return true;
+}
+
+const RegisterBankInfo::InstructionMapping &
+AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
+
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
+ OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ }
+ return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
+ MI.getNumOperands());
+}
+
+const RegisterBankInfo::InstructionMapping &
+AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
+ unsigned OpdIdx = 0;
+
+ unsigned Size0 = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
+
+ if (MI.getOperand(OpdIdx).isIntrinsicID())
+ OpdsMapping[OpdIdx++] = nullptr;
+
+ unsigned Reg1 = MI.getOperand(OpdIdx).getReg();
+ unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI);
+ unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI);
+ OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1);
+
+ for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) {
+ unsigned Size = getSizeInBits(MI.getOperand(OpdIdx).getReg(), MRI, *TRI);
+ OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ }
+
+ return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
+ MI.getNumOperands());
}
const RegisterBankInfo::InstructionMapping &
@@ -155,6 +274,22 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
// handle that during instruction selection?
}
+unsigned
+AMDGPURegisterBankInfo::getRegBankID(unsigned Reg,
+ const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI,
+ unsigned Default) const {
+
+ const RegisterBank *Bank = getRegBank(Reg, MRI, TRI);
+ return Bank ? Bank->getID() : Default;
+}
+
+///
+/// This function must return a legal mapping, because
+/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
+/// in RegBankSelect::Mode::Fast. Any mapping that would cause a
+/// VGPR-to-SGPR copy to be generated is illegal.
+///
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
@@ -166,16 +301,102 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
const MachineRegisterInfo &MRI = MF.getRegInfo();
SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
- bool IsComplete = true;
switch (MI.getOpcode()) {
default:
- IsComplete = false;
+ return getInvalidInstructionMapping();
+ case AMDGPU::G_ADD:
+ case AMDGPU::G_SUB:
+ case AMDGPU::G_MUL:
+ case AMDGPU::G_AND:
+ case AMDGPU::G_OR:
+ case AMDGPU::G_XOR:
+ case AMDGPU::G_SHL:
+ if (isSALUMapping(MI))
+ return getDefaultMappingSOP(MI);
+ // Fall-through
+
+ case AMDGPU::G_FADD:
+ case AMDGPU::G_FPTOSI:
+ case AMDGPU::G_FPTOUI:
+ case AMDGPU::G_FMUL:
+ return getDefaultMappingVOP(MI);
+ case AMDGPU::G_IMPLICIT_DEF: {
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
+ }
+ case AMDGPU::G_FCONSTANT:
case AMDGPU::G_CONSTANT: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
}
+ case AMDGPU::G_EXTRACT: {
+ unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
+ unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
+ OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
+ OpdsMapping[2] = nullptr;
+ break;
+ }
+ case AMDGPU::G_MERGE_VALUES: {
+ unsigned Bank = isSALUMapping(MI) ?
+ AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
+ unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+
+ OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
+ // Op1 and Dst should use the same register bank.
+ for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
+ OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
+ break;
+ }
+ case AMDGPU::G_BITCAST: {
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
+ break;
+ }
+ case AMDGPU::G_TRUNC: {
+ unsigned Dst = MI.getOperand(0).getReg();
+ unsigned Src = MI.getOperand(1).getReg();
+ unsigned Bank = getRegBankID(Src, MRI, *TRI);
+ unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
+ unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
+ OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
+ OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
+ break;
+ }
+ case AMDGPU::G_ZEXT: {
+ unsigned Dst = MI.getOperand(0).getReg();
+ unsigned Src = MI.getOperand(1).getReg();
+ unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
+ unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
+ unsigned SrcBank = getRegBankID(Src, MRI, *TRI,
+ SrcSize == 1 ? AMDGPU::SGPRRegBankID :
+ AMDGPU::VGPRRegBankID);
+ unsigned DstBank = SrcBank;
+ if (SrcSize == 1) {
+ if (SrcBank == AMDGPU::SGPRRegBankID)
+ DstBank = AMDGPU::VGPRRegBankID;
+ else
+ DstBank = AMDGPU::SGPRRegBankID;
+ }
+
+ OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize);
+ OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank, SrcSize);
+ break;
+ }
+ case AMDGPU::G_FCMP: {
+ unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
+ unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 1);
+ OpdsMapping[1] = nullptr; // Predicate Operand.
+ OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ break;
+ }
case AMDGPU::G_GEP: {
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
if (!MI.getOperand(i).isReg())
@@ -204,24 +425,113 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
- case AMDGPU::G_LOAD:
- return getInstrMappingForLoad(MI);
+ case AMDGPU::G_ICMP: {
+ unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
+ unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
+ unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
+ unsigned Op0Bank = Op2Bank == AMDGPU::SGPRRegBankID &&
+ Op3Bank == AMDGPU::SGPRRegBankID ?
+ AMDGPU::SCCRegBankID : AMDGPU::VGPRRegBankID;
+ OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, 1);
+ OpdsMapping[1] = nullptr; // Predicate Operand.
+ OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
+ OpdsMapping[3] = AMDGPU::getValueMapping(Op3Bank, Size);
+ break;
+ }
+
+
+ case AMDGPU::G_EXTRACT_VECTOR_ELT: {
+ unsigned IdxOp = 2;
+ int64_t Imm;
+ // XXX - Do we really need to fully handle these? The constant case should
+ // be legalized away before RegBankSelect?
+
+    unsigned OutputBankID =
+      isSALUMapping(MI) && isConstant(MI.getOperand(IdxOp), Imm) ?
+      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
+
+    unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
+    OpdsMapping[0] = AMDGPU::getValueMapping(
+      OutputBankID, MRI.getType(MI.getOperand(0).getReg()).getSizeInBits());
+    OpdsMapping[1] = AMDGPU::getValueMapping(
+      OutputBankID, MRI.getType(MI.getOperand(1).getReg()).getSizeInBits());
+
+    // The index can be in either bank if the source vector is in a VGPR.
+    OpdsMapping[2] = AMDGPU::getValueMapping(
+      IdxBank, MRI.getType(MI.getOperand(2).getReg()).getSizeInBits());
+ break;
}
+ case AMDGPU::G_INSERT_VECTOR_ELT: {
+ // XXX - Do we really need to fully handle these? The constant case should
+ // be legalized away before RegBankSelect?
+
+ int64_t Imm;
+
+ unsigned IdxOp = MI.getOpcode() == AMDGPU::G_EXTRACT_VECTOR_ELT ? 2 : 3;
+ unsigned BankID = isSALUMapping(MI) && isConstant(MI.getOperand(IdxOp), Imm) ?
+ AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
+
+ // TODO: Can do SGPR indexing, which would obviate the need for the
+ // isConstant check.
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
+ OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
+ }
- if (!IsComplete) {
- unsigned BankID = AMDGPU::SGPRRegBankID;
- unsigned Size = 0;
- for (unsigned Idx = 0; Idx < MI.getNumOperands(); ++Idx) {
- // If the operand is not a register default to the size of the previous
- // operand.
- // FIXME: Can't we pull the types from the MachineInstr rather than the
- // operands.
- if (MI.getOperand(Idx).isReg())
- Size = getSizeInBits(MI.getOperand(Idx).getReg(), MRI, *TRI);
- OpdsMapping.push_back(AMDGPU::getValueMapping(BankID, Size));
+ break;
+ }
+ case AMDGPU::G_INTRINSIC: {
+ switch (MI.getOperand(1).getIntrinsicID()) {
+ default:
+ return getInvalidInstructionMapping();
+ case Intrinsic::maxnum:
+ case Intrinsic::minnum:
+ case Intrinsic::amdgcn_cvt_pkrtz:
+ return getDefaultMappingVOP(MI);
+ case Intrinsic::amdgcn_kernarg_segment_ptr: {
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ break;
+ }
+ }
+ break;
+ }
+ case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
+ switch (MI.getOperand(0).getIntrinsicID()) {
+ default:
+ return getInvalidInstructionMapping();
+ case Intrinsic::amdgcn_exp_compr:
+ OpdsMapping[0] = nullptr; // IntrinsicID
+ // FIXME: These are immediate values which can't be read from registers.
+ OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ // FIXME: Could we support packed types here?
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ // FIXME: These are immediate values which can't be read from registers.
+ OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ break;
+ case Intrinsic::amdgcn_exp:
+ OpdsMapping[0] = nullptr; // IntrinsicID
+ // FIXME: These are immediate values which can't be read from registers.
+ OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ // FIXME: Could we support packed types here?
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ // FIXME: These are immediate values which can't be read from registers.
+ OpdsMapping[7] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ OpdsMapping[8] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ break;
}
+ break;
+ }
+ case AMDGPU::G_LOAD:
+ return getInstrMappingForLoad(MI);
}
+
return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
MI.getNumOperands());
}
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
index 201fdc1974c6..d48a66589873 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -16,19 +16,15 @@
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#define GET_REGBANK_DECLARATIONS
+#include "AMDGPUGenRegisterBank.inc"
+#undef GET_REGBANK_DECLARATIONS
+
namespace llvm {
class SIRegisterInfo;
class TargetRegisterInfo;
-namespace AMDGPU {
-enum {
- SGPRRegBankID = 0,
- VGPRRegBankID = 1,
- NumRegisterBanks
-};
-} // End AMDGPU namespace.
-
/// This class provides the information for the target register banks.
class AMDGPUGenRegisterBankInfo : public RegisterBankInfo {
@@ -46,6 +42,13 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo {
const RegisterBankInfo::InstructionMapping &
getInstrMappingForLoad(const MachineInstr &MI) const;
+ unsigned getRegBankID(unsigned Reg, const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI,
+ unsigned Default = AMDGPU::VGPRRegBankID) const;
+
+ bool isSALUMapping(const MachineInstr &MI) const;
+ const InstructionMapping &getDefaultMappingSOP(const MachineInstr &MI) const;
+ const InstructionMapping &getDefaultMappingVOP(const MachineInstr &MI) const;
public:
AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI);
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/lib/Target/AMDGPU/AMDGPURegisterBanks.td
index f4428e56035f..7f7f75f65647 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBanks.td
+++ b/lib/Target/AMDGPU/AMDGPURegisterBanks.td
@@ -14,3 +14,5 @@ def SGPRRegBank : RegisterBank<"SGPR",
def VGPRRegBank : RegisterBank<"VGPR",
[VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512]
>;
+
+def SCCRegBank : RegisterBank <"SCC", [SCC_CLASS ]>;
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
index 5e4d33aaa691..50f859addc2b 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
@@ -8,13 +8,15 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Parent TargetRegisterInfo class common to all hw codegen targets.
+/// Parent TargetRegisterInfo class common to all hw codegen targets.
//
//===----------------------------------------------------------------------===//
#include "AMDGPURegisterInfo.h"
#include "AMDGPUTargetMachine.h"
+#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
using namespace llvm;
@@ -25,7 +27,7 @@ AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {}
// they are not supported at this time.
//===----------------------------------------------------------------------===//
-unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
+unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) {
static const unsigned SubRegs[] = {
AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9,
@@ -37,6 +39,13 @@ unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
return SubRegs[Channel];
}
+void AMDGPURegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const {
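+  // Reserve Reg and every register that aliases it, i.e. all register tuples
+  // that contain it.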
+ MCRegAliasIterator R(Reg, this, true);
+
+ for (; R.isValid(); ++R)
+ Reserved.set(*R);
+}
+
#define GET_REGINFO_TARGET_DESC
#include "AMDGPUGenRegisterInfo.inc"
@@ -75,5 +84,6 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
}
unsigned SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
- return AMDGPU::NoRegister;
+ const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ return FuncInfo->getFrameOffsetReg();
}
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
index d8604d2590f1..07de5fc549e2 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.h
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief TargetRegisterInfo interface that is implemented by all hw codegen
+/// TargetRegisterInfo interface that is implemented by all hw codegen
/// targets.
//
//===----------------------------------------------------------------------===//
@@ -21,15 +21,19 @@
namespace llvm {
-class AMDGPUSubtarget;
+class GCNSubtarget;
class TargetInstrInfo;
struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
AMDGPURegisterInfo();
+ bool enableMultipleCopyHints() const override { return true; }
+
/// \returns the sub reg enum value for the given \p Channel
/// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
- unsigned getSubRegFromChannel(unsigned Channel) const;
+ static unsigned getSubRegFromChannel(unsigned Channel);
+
+ void reserveRegisterTuples(BitVector &, unsigned Reg) const;
};
} // End namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.td b/lib/Target/AMDGPU/AMDGPURegisterInfo.td
index 3bbcba826f63..ceabae524414 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.td
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.td
@@ -19,5 +19,4 @@ foreach Index = 0-15 in {
}
-include "R600RegisterInfo.td"
include "SIRegisterInfo.td"
diff --git a/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
index 83e56a9ab495..a861762a8c9e 100644
--- a/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
+++ b/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
@@ -249,8 +249,8 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
SmallVector<Argument *, 4> OutArgs;
for (Argument &Arg : F.args()) {
if (isOutArgumentCandidate(Arg)) {
- DEBUG(dbgs() << "Found possible out argument " << Arg
- << " in function " << F.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Found possible out argument " << Arg
+ << " in function " << F.getName() << '\n');
OutArgs.push_back(&Arg);
}
}
@@ -310,7 +310,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
SI = dyn_cast<StoreInst>(Q.getInst());
if (SI) {
- DEBUG(dbgs() << "Found out argument store: " << *SI << '\n');
+ LLVM_DEBUG(dbgs() << "Found out argument store: " << *SI << '\n');
ReplaceableStores.emplace_back(RI, SI);
} else {
ThisReplaceable = false;
@@ -328,7 +328,8 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
if (llvm::find_if(ValVec,
[OutArg](const std::pair<Argument *, Value *> &Entry) {
return Entry.first == OutArg;}) != ValVec.end()) {
- DEBUG(dbgs() << "Saw multiple out arg stores" << *OutArg << '\n');
+ LLVM_DEBUG(dbgs()
+ << "Saw multiple out arg stores" << *OutArg << '\n');
// It is possible to see stores to the same argument multiple times,
// but we expect these would have been optimized out already.
ThisReplaceable = false;
@@ -358,7 +359,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
F.getFunctionType()->params(),
F.isVarArg());
- DEBUG(dbgs() << "Computed new return type: " << *NewRetTy << '\n');
+ LLVM_DEBUG(dbgs() << "Computed new return type: " << *NewRetTy << '\n');
Function *NewFunc = Function::Create(NewFuncTy, Function::PrivateLinkage,
F.getName() + ".body");
diff --git a/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/lib/Target/AMDGPU/AMDGPUSearchableTables.td
new file mode 100644
index 000000000000..9dbd7751b4d8
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -0,0 +1,77 @@
+//===-- AMDGPUSearchableTables.td - ------------------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Resource intrinsics table.
+//===----------------------------------------------------------------------===//
+
+class RsrcIntrinsic<AMDGPURsrcIntrinsic intr> {
+ Intrinsic Intr = !cast<Intrinsic>(intr);
+ bits<8> RsrcArg = intr.RsrcArg;
+ bit IsImage = intr.IsImage;
+}
+
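+// Generates a searchable table keyed on the intrinsic; the lookup helper is
+// emitted as lookupRsrcIntrinsic().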
+def RsrcIntrinsics : GenericTable {
+ let FilterClass = "RsrcIntrinsic";
+ let Fields = ["Intr", "RsrcArg", "IsImage"];
+
+ let PrimaryKey = ["Intr"];
+ let PrimaryKeyName = "lookupRsrcIntrinsic";
+}
+
+foreach intr = !listconcat(AMDGPUBufferIntrinsics,
+ AMDGPUImageDimIntrinsics,
+ AMDGPUImageDimAtomicIntrinsics) in {
+ def : RsrcIntrinsic<!cast<AMDGPURsrcIntrinsic>(intr)>;
+}
+
+class SourceOfDivergence<Intrinsic intr> {
+ Intrinsic Intr = intr;
+}
+
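+// Table of intrinsics whose results are sources of divergence; the lookup
+// helper is emitted as lookupSourceOfDivergence().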
+def SourcesOfDivergence : GenericTable {
+ let FilterClass = "SourceOfDivergence";
+ let Fields = ["Intr"];
+
+ let PrimaryKey = ["Intr"];
+ let PrimaryKeyName = "lookupSourceOfDivergence";
+}
+
+def : SourceOfDivergence<int_amdgcn_workitem_id_x>;
+def : SourceOfDivergence<int_amdgcn_workitem_id_y>;
+def : SourceOfDivergence<int_amdgcn_workitem_id_z>;
+def : SourceOfDivergence<int_amdgcn_interp_mov>;
+def : SourceOfDivergence<int_amdgcn_interp_p1>;
+def : SourceOfDivergence<int_amdgcn_interp_p2>;
+def : SourceOfDivergence<int_amdgcn_mbcnt_hi>;
+def : SourceOfDivergence<int_amdgcn_mbcnt_lo>;
+def : SourceOfDivergence<int_r600_read_tidig_x>;
+def : SourceOfDivergence<int_r600_read_tidig_y>;
+def : SourceOfDivergence<int_r600_read_tidig_z>;
+def : SourceOfDivergence<int_amdgcn_atomic_inc>;
+def : SourceOfDivergence<int_amdgcn_atomic_dec>;
+def : SourceOfDivergence<int_amdgcn_ds_fadd>;
+def : SourceOfDivergence<int_amdgcn_ds_fmin>;
+def : SourceOfDivergence<int_amdgcn_ds_fmax>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_swap>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_add>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_sub>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_smin>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_umin>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_smax>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_umax>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_and>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_or>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_xor>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_cmpswap>;
+def : SourceOfDivergence<int_amdgcn_ps_live>;
+def : SourceOfDivergence<int_amdgcn_ds_swizzle>;
+
+foreach intr = AMDGPUImageDimAtomicIntrinsics in
+def : SourceOfDivergence<intr>;
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 80feaa44766f..98b49070fa99 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
+/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//
@@ -20,8 +20,10 @@
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>
@@ -32,12 +34,37 @@ using namespace llvm;
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
+#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#undef AMDGPUSubtarget
+#include "R600GenSubtargetInfo.inc"
-AMDGPUSubtarget::~AMDGPUSubtarget() = default;
+GCNSubtarget::~GCNSubtarget() = default;
+
+R600Subtarget &
+R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
+ StringRef GPU, StringRef FS) {
+ SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
+ FullFS += FS;
+ ParseSubtargetFeatures(GPU, FullFS);
+
+  // FIXME: I don't think Evergreen has any useful support for
+ // denormals, but should be checked. Should we issue a warning somewhere
+ // if someone tries to enable these?
+ if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
+ FP32Denormals = false;
+ }
+
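+  // 24-bit multiplies: unsigned from Evergreen onward, signed only with the
+  // Cayman ISA.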
+ HasMulU24 = getGeneration() >= EVERGREEN;
+ HasMulI24 = hasCaymanISA();
+
+ return *this;
+}
-AMDGPUSubtarget &
-AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
+GCNSubtarget &
+GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
StringRef GPU, StringRef FS) {
// Determine default and user-specified characteristics
// On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
@@ -92,26 +119,43 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
HasMovrel = true;
}
+ HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
+
return *this;
}
-AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
- const TargetMachine &TM)
- : AMDGPUGenSubtargetInfo(TT, GPU, FS),
+AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT,
+ const FeatureBitset &FeatureBits) :
+ TargetTriple(TT),
+ SubtargetFeatureBits(FeatureBits),
+ Has16BitInsts(false),
+ HasMadMixInsts(false),
+ FP32Denormals(false),
+ FPExceptions(false),
+ HasSDWA(false),
+ HasVOP3PInsts(false),
+ HasMulI24(true),
+ HasMulU24(true),
+ HasFminFmaxLegacy(true),
+ EnablePromoteAlloca(false),
+ LocalMemorySize(0),
+ WavefrontSize(0)
+ { }
+
+GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
+ const GCNTargetMachine &TM) :
+ AMDGPUGenSubtargetInfo(TT, GPU, FS),
+ AMDGPUSubtarget(TT, getFeatureBits()),
TargetTriple(TT),
- Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
+ Gen(SOUTHERN_ISLANDS),
IsaVersion(ISAVersion0_0_0),
- WavefrontSize(0),
- LocalMemorySize(0),
LDSBankCount(0),
MaxPrivateElementSize(0),
FastFMAF32(false),
HalfRate64Ops(false),
- FP32Denormals(false),
FP64FP16Denormals(false),
- FPExceptions(false),
DX10Clamp(false),
FlatForGlobal(false),
AutoWaitcntBeforeBarrier(false),
@@ -123,57 +167,56 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
EnableXNACK(false),
TrapHandler(false),
DebuggerInsertNops(false),
- DebuggerReserveRegs(false),
DebuggerEmitPrologue(false),
EnableHugePrivateBuffer(false),
EnableVGPRSpilling(false),
- EnablePromoteAlloca(false),
EnableLoadStoreOpt(false),
EnableUnsafeDSOffsetFolding(false),
EnableSIScheduler(false),
+ EnableDS128(false),
DumpCode(false),
FP64(false),
- FMA(false),
- IsGCN(false),
GCN3Encoding(false),
CIInsts(false),
GFX9Insts(false),
SGPRInitBug(false),
HasSMemRealTime(false),
- Has16BitInsts(false),
HasIntClamp(false),
- HasVOP3PInsts(false),
- HasMadMixInsts(false),
+ HasFmaMixInsts(false),
HasMovrel(false),
HasVGPRIndexMode(false),
HasScalarStores(false),
+ HasScalarAtomics(false),
HasInv2PiInlineImm(false),
- HasSDWA(false),
HasSDWAOmod(false),
HasSDWAScalar(false),
HasSDWASdst(false),
HasSDWAMac(false),
HasSDWAOutModsVOPC(false),
HasDPP(false),
+ HasDLInsts(false),
+ D16PreservesUnusedBits(false),
FlatAddressSpace(false),
FlatInstOffsets(false),
FlatGlobalInsts(false),
FlatScratchInsts(false),
AddNoCarryInsts(false),
+ HasUnpackedD16VMem(false),
- R600ALUInst(false),
- CaymanISA(false),
- CFALUBug(false),
- HasVertexCache(false),
- TexVTXClauseSize(0),
ScalarizeGlobal(false),
FeatureDisable(false),
- InstrItins(getInstrItineraryForCPU(GPU)) {
+ InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
+ TLInfo(TM, *this),
+ FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
AS = AMDGPU::getAMDGPUAS(TT);
- initializeSubtargetDependencies(TT, GPU, FS);
+ CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
+ Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
+ RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
+ InstSelector.reset(new AMDGPUInstructionSelector(
+ *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
@@ -198,6 +241,12 @@ unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
return NumWaves;
}
+unsigned
+AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
+ const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
+}
+
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
switch (CC) {
@@ -357,27 +406,64 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
return true;
}
-R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
- const TargetMachine &TM) :
- AMDGPUSubtarget(TT, GPU, FS, TM),
- InstrInfo(*this),
- FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
- TLInfo(TM, *this) {}
+uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
+ unsigned &MaxAlign) const {
+ assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
+ F.getCallingConv() == CallingConv::SPIR_KERNEL);
-SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
- const TargetMachine &TM)
- : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
- FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
- TLInfo(TM, *this) {
- CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
- Legalizer.reset(new AMDGPULegalizerInfo());
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ uint64_t ExplicitArgBytes = 0;
+ MaxAlign = 1;
- RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
- InstSelector.reset(new AMDGPUInstructionSelector(
- *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get())));
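+  // Accumulate the aligned sizes of all explicit arguments and remember the
+  // largest alignment seen.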
+ for (const Argument &Arg : F.args()) {
+ Type *ArgTy = Arg.getType();
+
+ unsigned Align = DL.getABITypeAlignment(ArgTy);
+ uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
+ ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
+ MaxAlign = std::max(MaxAlign, Align);
+ }
+
+ return ExplicitArgBytes;
}
-void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
+unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
+ unsigned &MaxAlign) const {
+ uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
+
+ unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
+
+ uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
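+  // Implicit (hidden) arguments, if any, are appended after the explicit ones
+  // at the implicit argument pointer alignment.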
+ unsigned ImplicitBytes = getImplicitArgNumBytes(F);
+ if (ImplicitBytes != 0) {
+ unsigned Alignment = getAlignmentForImplicitArgPtr();
+ TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
+ }
+
+ // Being able to dereference past the end is useful for emitting scalar loads.
+ return alignTo(TotalSize, 4);
+}
+
+R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
+ const TargetMachine &TM) :
+ R600GenSubtargetInfo(TT, GPU, FS),
+ AMDGPUSubtarget(TT, getFeatureBits()),
+ InstrInfo(*this),
+ FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
+ FMA(false),
+ CaymanISA(false),
+ CFALUBug(false),
+ DX10Clamp(false),
+ HasVertexCache(false),
+ R600ALUInst(false),
+ FP64(false),
+ TexVTXClauseSize(0),
+ Gen(R600),
+ TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
+ InstrItins(getInstrItineraryForCPU(GPU)),
+  AS(AMDGPU::getAMDGPUAS(TT)) { }
+
+void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
unsigned NumRegionInstrs) const {
// Track register pressure so the scheduler can try to decrease
// pressure once register usage is above the threshold defined by
@@ -394,22 +480,12 @@ void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
Policy.ShouldTrackLaneMasks = true;
}
-bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
+bool GCNSubtarget::isVGPRSpillingEnabled(const Function& F) const {
return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}
-unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
- unsigned ExplicitArgBytes) const {
- unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
- if (ImplicitBytes == 0)
- return ExplicitArgBytes;
-
- unsigned Alignment = getAlignmentForImplicitArgPtr();
- return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
-}
-
-unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
- if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
+ if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
if (SGPRs <= 80)
return 10;
if (SGPRs <= 88)
@@ -431,7 +507,7 @@ unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
return 5;
}
-unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
+unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
if (VGPRs <= 24)
return 10;
if (VGPRs <= 28)
@@ -453,7 +529,7 @@ unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
return 1;
}
-unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
+unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
if (MFI.hasFlatScratchInit()) {
if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
@@ -467,7 +543,7 @@ unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
return 2; // VCC.
}
-unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
+unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
const Function &F = MF.getFunction();
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
@@ -517,7 +593,7 @@ unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
MaxAddressableNumSGPRs);
}
-unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
+unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
const Function &F = MF.getFunction();
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
@@ -532,10 +608,6 @@ unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
unsigned Requested = AMDGPU::getIntegerAttribute(
F, "amdgpu-num-vgpr", MaxNumVGPRs);
- // Make sure requested value does not violate subtarget's specifications.
- if (Requested && Requested <= getReservedNumVGPRs(MF))
- Requested = 0;
-
// Make sure requested value is compatible with values implied by
// default/requested minimum/maximum number of waves per execution unit.
if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
@@ -548,7 +620,7 @@ unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
MaxNumVGPRs = Requested;
}
- return MaxNumVGPRs - getReservedNumVGPRs(MF);
+ return MaxNumVGPRs;
}
namespace {
@@ -602,7 +674,21 @@ struct MemOpClusterMutation : ScheduleDAGMutation {
};
} // namespace
-void SISubtarget::getPostRAMutations(
+void GCNSubtarget::getPostRAMutations(
std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}
+
+const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
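+  // amdgcn triples use the GCN subtarget; everything else falls back to R600.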
+ if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
+ return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
+ else
+ return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
+}
+
+const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
+ if (TM.getTargetTriple().getArch() == Triple::amdgcn)
+ return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
+ else
+ return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
+}
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index cf4a691d4b58..623109733651 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -8,7 +8,7 @@
//==-----------------------------------------------------------------------===//
//
/// \file
-/// \brief AMDGPU specific subclass of TargetSubtarget.
+/// AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//
@@ -23,7 +23,6 @@
#include "SIFrameLowering.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
@@ -40,24 +39,216 @@
#define GET_SUBTARGETINFO_HEADER
#include "AMDGPUGenSubtargetInfo.inc"
+#define GET_SUBTARGETINFO_HEADER
+#include "R600GenSubtargetInfo.inc"
namespace llvm {
class StringRef;
-class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
+class AMDGPUSubtarget {
public:
enum Generation {
R600 = 0,
- R700,
- EVERGREEN,
- NORTHERN_ISLANDS,
- SOUTHERN_ISLANDS,
- SEA_ISLANDS,
- VOLCANIC_ISLANDS,
- GFX9,
+ R700 = 1,
+ EVERGREEN = 2,
+ NORTHERN_ISLANDS = 3,
+ SOUTHERN_ISLANDS = 4,
+ SEA_ISLANDS = 5,
+ VOLCANIC_ISLANDS = 6,
+ GFX9 = 7
};
+private:
+ Triple TargetTriple;
+
+protected:
+ const FeatureBitset &SubtargetFeatureBits;
+ bool Has16BitInsts;
+ bool HasMadMixInsts;
+ bool FP32Denormals;
+ bool FPExceptions;
+ bool HasSDWA;
+ bool HasVOP3PInsts;
+ bool HasMulI24;
+ bool HasMulU24;
+ bool HasFminFmaxLegacy;
+ bool EnablePromoteAlloca;
+ int LocalMemorySize;
+ unsigned WavefrontSize;
+
+public:
+ AMDGPUSubtarget(const Triple &TT, const FeatureBitset &FeatureBits);
+
+ static const AMDGPUSubtarget &get(const MachineFunction &MF);
+ static const AMDGPUSubtarget &get(const TargetMachine &TM,
+ const Function &F);
+
+  /// \returns Default flat work group size range for a calling convention.
+ std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
+
+ /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
+ /// for function \p F, or minimum/maximum flat work group sizes explicitly
+ /// requested using "amdgpu-flat-work-group-size" attribute attached to
+ /// function \p F.
+ ///
+ /// \returns Subtarget's default values if explicitly requested values cannot
+ /// be converted to integer, or violate subtarget's specifications.
+ std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
+
+ /// \returns Subtarget's default pair of minimum/maximum number of waves per
+ /// execution unit for function \p F, or minimum/maximum number of waves per
+ /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
+ /// attached to function \p F.
+ ///
+ /// \returns Subtarget's default values if explicitly requested values cannot
+ /// be converted to integer, violate subtarget's specifications, or are not
+ /// compatible with minimum/maximum number of waves limited by flat work group
+ /// size, register usage, and/or lds usage.
+ std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
+
+ /// Return the amount of LDS that can be used that will not restrict the
+ /// occupancy lower than WaveCount.
+ unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
+ const Function &) const;
+
+  /// Inverse of getMaxLocalMemSizeWithWaveCount. Return the maximum wave count
+  /// if the given LDS memory size is the only constraint.
+ unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const;
+
+ unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const;
+
+ bool isAmdHsaOS() const {
+ return TargetTriple.getOS() == Triple::AMDHSA;
+ }
+
+ bool isAmdPalOS() const {
+ return TargetTriple.getOS() == Triple::AMDPAL;
+ }
+
+ bool isMesa3DOS() const {
+ return TargetTriple.getOS() == Triple::Mesa3D;
+ }
+
+ bool isMesaKernel(const Function &F) const {
+ return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
+ }
+
+ bool isAmdCodeObjectV2(const Function &F) const {
+ return isAmdHsaOS() || isMesaKernel(F);
+ }
+
+ bool has16BitInsts() const {
+ return Has16BitInsts;
+ }
+
+ bool hasMadMixInsts() const {
+ return HasMadMixInsts;
+ }
+
+ bool hasFP32Denormals() const {
+ return FP32Denormals;
+ }
+
+ bool hasFPExceptions() const {
+ return FPExceptions;
+ }
+
+ bool hasSDWA() const {
+ return HasSDWA;
+ }
+
+ bool hasVOP3PInsts() const {
+ return HasVOP3PInsts;
+ }
+
+ bool hasMulI24() const {
+ return HasMulI24;
+ }
+
+ bool hasMulU24() const {
+ return HasMulU24;
+ }
+
+ bool hasFminFmaxLegacy() const {
+ return HasFminFmaxLegacy;
+ }
+
+ bool isPromoteAllocaEnabled() const {
+ return EnablePromoteAlloca;
+ }
+
+ unsigned getWavefrontSize() const {
+ return WavefrontSize;
+ }
+
+ int getLocalMemorySize() const {
+ return LocalMemorySize;
+ }
+
+ unsigned getAlignmentForImplicitArgPtr() const {
+ return isAmdHsaOS() ? 8 : 4;
+ }
+
+ /// Returns the offset in bytes from the start of the input buffer
+ /// of the first explicit kernel argument.
+ unsigned getExplicitKernelArgOffset(const Function &F) const {
+ return isAmdCodeObjectV2(F) ? 0 : 36;
+ }
+
+ /// \returns Maximum number of work groups per compute unit supported by the
+ /// subtarget and limited by given \p FlatWorkGroupSize.
+ unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const {
+ return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(SubtargetFeatureBits,
+ FlatWorkGroupSize);
+ }
+
+ /// \returns Minimum flat work group size supported by the subtarget.
+ unsigned getMinFlatWorkGroupSize() const {
+ return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(SubtargetFeatureBits);
+ }
+
+ /// \returns Maximum flat work group size supported by the subtarget.
+ unsigned getMaxFlatWorkGroupSize() const {
+ return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(SubtargetFeatureBits);
+ }
+
+ /// \returns Maximum number of waves per execution unit supported by the
+ /// subtarget and limited by given \p FlatWorkGroupSize.
+ unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const {
+ return AMDGPU::IsaInfo::getMaxWavesPerEU(SubtargetFeatureBits,
+ FlatWorkGroupSize);
+ }
+
+ /// \returns Minimum number of waves per execution unit supported by the
+ /// subtarget.
+ unsigned getMinWavesPerEU() const {
+ return AMDGPU::IsaInfo::getMinWavesPerEU(SubtargetFeatureBits);
+ }
+
+ unsigned getMaxWavesPerEU() const { return 10; }
+
+  /// Creates value range metadata on a workitemid.* intrinsic call or load.
+ bool makeLIDRangeMetadata(Instruction *I) const;
+
+ /// \returns Number of bytes of arguments that are passed to a shader or
+ /// kernel in addition to the explicit ones declared for the function.
+ unsigned getImplicitArgNumBytes(const Function &F) const {
+ if (isMesaKernel(F))
+ return 16;
+ return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
+ }
+ uint64_t getExplicitKernArgSize(const Function &F,
+ unsigned &MaxAlign) const;
+ unsigned getKernArgSegmentSize(const Function &F,
+ unsigned &MaxAlign) const;
+
+ virtual ~AMDGPUSubtarget() {}
+};
+
+class GCNSubtarget : public AMDGPUGenSubtargetInfo,
+ public AMDGPUSubtarget {
+public:
enum {
ISAVersion0_0_0,
ISAVersion6_0_0,
@@ -67,13 +258,14 @@ public:
ISAVersion7_0_2,
ISAVersion7_0_3,
ISAVersion7_0_4,
- ISAVersion8_0_0,
ISAVersion8_0_1,
ISAVersion8_0_2,
ISAVersion8_0_3,
ISAVersion8_1_0,
ISAVersion9_0_0,
- ISAVersion9_0_2
+ ISAVersion9_0_2,
+ ISAVersion9_0_4,
+ ISAVersion9_0_6,
};
enum TrapHandlerAbi {
@@ -96,13 +288,18 @@ public:
LLVMTrapHandlerRegValue = 1
};
+private:
+ /// GlobalISel related APIs.
+ std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
+ std::unique_ptr<InstructionSelector> InstSelector;
+ std::unique_ptr<LegalizerInfo> Legalizer;
+ std::unique_ptr<RegisterBankInfo> RegBankInfo;
+
protected:
// Basic subtarget description.
Triple TargetTriple;
- Generation Gen;
+ unsigned Gen;
unsigned IsaVersion;
- unsigned WavefrontSize;
- int LocalMemorySize;
int LDSBankCount;
unsigned MaxPrivateElementSize;
@@ -111,9 +308,7 @@ protected:
bool HalfRate64Ops;
// Dynamically set bits that enable features.
- bool FP32Denormals;
bool FP64FP16Denormals;
- bool FPExceptions;
bool DX10Clamp;
bool FlatForGlobal;
bool AutoWaitcntBeforeBarrier;
@@ -124,47 +319,48 @@ protected:
bool EnableXNACK;
bool TrapHandler;
bool DebuggerInsertNops;
- bool DebuggerReserveRegs;
bool DebuggerEmitPrologue;
// Used as options.
bool EnableHugePrivateBuffer;
bool EnableVGPRSpilling;
- bool EnablePromoteAlloca;
bool EnableLoadStoreOpt;
bool EnableUnsafeDSOffsetFolding;
bool EnableSIScheduler;
+ bool EnableDS128;
bool DumpCode;
// Subtarget properties set statically by tablegen.
bool FP64;
bool FMA;
+ bool MIMG_R128;
bool IsGCN;
bool GCN3Encoding;
bool CIInsts;
bool GFX9Insts;
bool SGPRInitBug;
bool HasSMemRealTime;
- bool Has16BitInsts;
bool HasIntClamp;
- bool HasVOP3PInsts;
- bool HasMadMixInsts;
+ bool HasFmaMixInsts;
bool HasMovrel;
bool HasVGPRIndexMode;
bool HasScalarStores;
+ bool HasScalarAtomics;
bool HasInv2PiInlineImm;
- bool HasSDWA;
bool HasSDWAOmod;
bool HasSDWAScalar;
bool HasSDWASdst;
bool HasSDWAMac;
bool HasSDWAOutModsVOPC;
bool HasDPP;
+ bool HasDLInsts;
+ bool D16PreservesUnusedBits;
bool FlatAddressSpace;
bool FlatInstOffsets;
bool FlatGlobalInsts;
bool FlatScratchInsts;
bool AddNoCarryInsts;
+ bool HasUnpackedD16VMem;
bool R600ALUInst;
bool CaymanISA;
bool CFALUBug;
@@ -175,67 +371,68 @@ protected:
// Dummy feature to use for assembler in tablegen.
bool FeatureDisable;
- InstrItineraryData InstrItins;
SelectionDAGTargetInfo TSInfo;
AMDGPUAS AS;
+private:
+ SIInstrInfo InstrInfo;
+ SITargetLowering TLInfo;
+ SIFrameLowering FrameLowering;
public:
- AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
- const TargetMachine &TM);
- ~AMDGPUSubtarget() override;
+ GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
+ const GCNTargetMachine &TM);
+ ~GCNSubtarget() override;
- AMDGPUSubtarget &initializeSubtargetDependencies(const Triple &TT,
+ GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
StringRef GPU, StringRef FS);
- const AMDGPUInstrInfo *getInstrInfo() const override = 0;
- const AMDGPUFrameLowering *getFrameLowering() const override = 0;
- const AMDGPUTargetLowering *getTargetLowering() const override = 0;
- const AMDGPURegisterInfo *getRegisterInfo() const override = 0;
+ const SIInstrInfo *getInstrInfo() const override {
+ return &InstrInfo;
+ }
- const InstrItineraryData *getInstrItineraryData() const override {
- return &InstrItins;
+ const SIFrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
}
- // Nothing implemented, just prevent crashes on use.
- const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
- return &TSInfo;
+ const SITargetLowering *getTargetLowering() const override {
+ return &TLInfo;
}
- void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+ const SIRegisterInfo *getRegisterInfo() const override {
+ return &InstrInfo.getRegisterInfo();
+ }
- bool isAmdHsaOS() const {
- return TargetTriple.getOS() == Triple::AMDHSA;
+ const CallLowering *getCallLowering() const override {
+ return CallLoweringInfo.get();
}
- bool isMesa3DOS() const {
- return TargetTriple.getOS() == Triple::Mesa3D;
+ const InstructionSelector *getInstructionSelector() const override {
+ return InstSelector.get();
}
- bool isOpenCLEnv() const {
- return TargetTriple.getEnvironment() == Triple::OpenCL ||
- TargetTriple.getEnvironmentName() == "amdgizcl";
+ const LegalizerInfo *getLegalizerInfo() const override {
+ return Legalizer.get();
}
- bool isAmdPalOS() const {
- return TargetTriple.getOS() == Triple::AMDPAL;
+ const RegisterBankInfo *getRegBankInfo() const override {
+ return RegBankInfo.get();
}
- Generation getGeneration() const {
- return Gen;
+ // Nothing implemented, just prevent crashes on use.
+ const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
}
- unsigned getWavefrontSize() const {
- return WavefrontSize;
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+ Generation getGeneration() const {
+ return (Generation)Gen;
}
unsigned getWavefrontSizeLog2() const {
return Log2_32(WavefrontSize);
}
- int getLocalMemorySize() const {
- return LocalMemorySize;
- }
-
int getLDSBankCount() const {
return LDSBankCount;
}
@@ -248,19 +445,19 @@ public:
return AS;
}
- bool has16BitInsts() const {
- return Has16BitInsts;
- }
-
bool hasIntClamp() const {
return HasIntClamp;
}
- bool hasVOP3PInsts() const {
- return HasVOP3PInsts;
+ bool hasFP64() const {
+ return FP64;
}
- bool hasFP64() const {
+ bool hasMIMG_R128() const {
+ return MIMG_R128;
+ }
+
+ bool hasHWFP64() const {
return FP64;
}
@@ -273,15 +470,15 @@ public:
}
bool hasAddr64() const {
- return (getGeneration() < VOLCANIC_ISLANDS);
+ return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
}
bool hasBFE() const {
- return (getGeneration() >= EVERGREEN);
+ return true;
}
bool hasBFI() const {
- return (getGeneration() >= EVERGREEN);
+ return true;
}
bool hasBFM() const {
@@ -289,62 +486,31 @@ public:
}
bool hasBCNT(unsigned Size) const {
- if (Size == 32)
- return (getGeneration() >= EVERGREEN);
-
- if (Size == 64)
- return (getGeneration() >= SOUTHERN_ISLANDS);
-
- return false;
- }
-
- bool hasMulU24() const {
- return (getGeneration() >= EVERGREEN);
- }
-
- bool hasMulI24() const {
- return (getGeneration() >= SOUTHERN_ISLANDS ||
- hasCaymanISA());
+ return true;
}
bool hasFFBL() const {
- return (getGeneration() >= EVERGREEN);
+ return true;
}
bool hasFFBH() const {
- return (getGeneration() >= EVERGREEN);
+ return true;
}
bool hasMed3_16() const {
- return getGeneration() >= GFX9;
+ return getGeneration() >= AMDGPUSubtarget::GFX9;
}
bool hasMin3Max3_16() const {
- return getGeneration() >= GFX9;
- }
-
- bool hasMadMixInsts() const {
- return HasMadMixInsts;
+ return getGeneration() >= AMDGPUSubtarget::GFX9;
}
- bool hasSBufferLoadStoreAtomicDwordxN() const {
- // Only use the "x1" variants on GFX9 or don't use the buffer variants.
- // For x2 and higher variants, if the accessed region spans 2 VM pages and
- // the second page is unmapped, the hw hangs.
- // TODO: There is one future GFX9 chip that doesn't have this bug.
- return getGeneration() != GFX9;
+ bool hasFmaMixInsts() const {
+ return HasFmaMixInsts;
}
bool hasCARRY() const {
- return (getGeneration() >= EVERGREEN);
- }
-
- bool hasBORROW() const {
- return (getGeneration() >= EVERGREEN);
- }
-
- bool hasCaymanISA() const {
- return CaymanISA;
+ return true;
}
bool hasFMA() const {
@@ -359,10 +525,6 @@ public:
return EnableHugePrivateBuffer;
}
- bool isPromoteAllocaEnabled() const {
- return EnablePromoteAlloca;
- }
-
bool unsafeDSOffsetFoldingEnabled() const {
return EnableUnsafeDSOffsetFolding;
}
@@ -376,23 +538,10 @@ public:
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
const Function &) const;
- /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if
- /// the given LDS memory size is the only constraint.
- unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const;
-
- unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
- const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
- return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
- }
-
bool hasFP16Denormals() const {
return FP64FP16Denormals;
}
- bool hasFP32Denormals() const {
- return FP32Denormals;
- }
-
bool hasFP64Denormals() const {
return FP64FP16Denormals;
}
@@ -401,10 +550,6 @@ public:
return getGeneration() >= AMDGPUSubtarget::GFX9;
}
- bool hasFPExceptions() const {
- return FPExceptions;
- }
-
bool enableDX10Clamp() const {
return DX10Clamp;
}
@@ -417,6 +562,12 @@ public:
return FlatForGlobal;
}
+ /// \returns If target supports ds_read/write_b128 and user enables generation
+ /// of ds_read/write_b128.
+ bool useDS128() const {
+ return CIInsts && EnableDS128;
+ }
+
/// \returns If MUBUF instructions always perform range checking, even for
/// buffer resources used for private memory access.
bool privateMemoryResourceIsRangeChecked() const {
@@ -440,7 +591,7 @@ public:
}
bool hasApertureRegs() const {
- return HasApertureRegs;
+ return HasApertureRegs;
}
bool isTrapHandlerEnabled() const {
@@ -467,6 +618,10 @@ public:
return FlatScratchInsts;
}
+ bool hasFlatLgkmVMemCountInOrder() const {
+ return getGeneration() > GFX9;
+ }
+
bool hasD16LoadStore() const {
return getGeneration() >= GFX9;
}
@@ -481,31 +636,19 @@ public:
return AddNoCarryInsts;
}
- bool isMesaKernel(const MachineFunction &MF) const {
- return isMesa3DOS() && !AMDGPU::isShader(MF.getFunction().getCallingConv());
+ bool hasUnpackedD16VMem() const {
+ return HasUnpackedD16VMem;
}
// Covers VS/PS/CS graphics shaders
- bool isMesaGfxShader(const MachineFunction &MF) const {
- return isMesa3DOS() && AMDGPU::isShader(MF.getFunction().getCallingConv());
- }
-
- bool isAmdCodeObjectV2(const MachineFunction &MF) const {
- return isAmdHsaOS() || isMesaKernel(MF);
+ bool isMesaGfxShader(const Function &F) const {
+ return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
}
bool hasMad64_32() const {
return getGeneration() >= SEA_ISLANDS;
}
- bool hasFminFmaxLegacy() const {
- return getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
- }
-
- bool hasSDWA() const {
- return HasSDWA;
- }
-
bool hasSDWAOmod() const {
return HasSDWAOmod;
}
@@ -526,29 +669,28 @@ public:
return HasSDWAOutModsVOPC;
}
- /// \brief Returns the offset in bytes from the start of the input buffer
- /// of the first explicit kernel argument.
- unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {
- return isAmdCodeObjectV2(MF) ? 0 : 36;
+ bool vmemWriteNeedsExpWaitcnt() const {
+ return getGeneration() < SEA_ISLANDS;
}
- unsigned getAlignmentForImplicitArgPtr() const {
- return isAmdHsaOS() ? 8 : 4;
+ bool hasDLInsts() const {
+ return HasDLInsts;
}
- unsigned getImplicitArgNumBytes(const MachineFunction &MF) const {
- if (isMesaKernel(MF))
- return 16;
- if (isAmdHsaOS() && isOpenCLEnv())
- return 32;
- return 0;
+ bool d16PreservesUnusedBits() const {
+ return D16PreservesUnusedBits;
}
// Scratch is allocated in 256 dword per wave blocks for the entire
// wavefront. When viewed from the perspective of an arbitrary workitem, this
// is 4-byte aligned.
+ //
+ // Only 4-byte alignment is really needed to access anything. Transformations
+ // on the pointer value itself may rely on the alignment / known low bits of
+ // the pointer. Set this to something above the minimum to avoid needing
+ // dynamic realignment in common cases.
unsigned getStackAlignment() const {
- return 4;
+ return 16;
}
bool enableMachineScheduler() const override {
@@ -559,184 +701,43 @@ public:
return true;
}
- void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b;}
- bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal;}
+ void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
+ bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
/// \returns Number of execution units per compute unit supported by the
/// subtarget.
unsigned getEUsPerCU() const {
- return AMDGPU::IsaInfo::getEUsPerCU(getFeatureBits());
- }
-
- /// \returns Maximum number of work groups per compute unit supported by the
- /// subtarget and limited by given \p FlatWorkGroupSize.
- unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const {
- return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(getFeatureBits(),
- FlatWorkGroupSize);
+ return AMDGPU::IsaInfo::getEUsPerCU(MCSubtargetInfo::getFeatureBits());
}
/// \returns Maximum number of waves per compute unit supported by the
/// subtarget without any kind of limitation.
unsigned getMaxWavesPerCU() const {
- return AMDGPU::IsaInfo::getMaxWavesPerCU(getFeatureBits());
+ return AMDGPU::IsaInfo::getMaxWavesPerCU(MCSubtargetInfo::getFeatureBits());
}
/// \returns Maximum number of waves per compute unit supported by the
/// subtarget and limited by given \p FlatWorkGroupSize.
unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const {
- return AMDGPU::IsaInfo::getMaxWavesPerCU(getFeatureBits(),
+ return AMDGPU::IsaInfo::getMaxWavesPerCU(MCSubtargetInfo::getFeatureBits(),
FlatWorkGroupSize);
}
- /// \returns Minimum number of waves per execution unit supported by the
- /// subtarget.
- unsigned getMinWavesPerEU() const {
- return AMDGPU::IsaInfo::getMinWavesPerEU(getFeatureBits());
- }
-
/// \returns Maximum number of waves per execution unit supported by the
/// subtarget without any kind of limitation.
unsigned getMaxWavesPerEU() const {
- return AMDGPU::IsaInfo::getMaxWavesPerEU(getFeatureBits());
- }
-
- /// \returns Maximum number of waves per execution unit supported by the
- /// subtarget and limited by given \p FlatWorkGroupSize.
- unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const {
- return AMDGPU::IsaInfo::getMaxWavesPerEU(getFeatureBits(),
- FlatWorkGroupSize);
- }
-
- /// \returns Minimum flat work group size supported by the subtarget.
- unsigned getMinFlatWorkGroupSize() const {
- return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(getFeatureBits());
- }
-
- /// \returns Maximum flat work group size supported by the subtarget.
- unsigned getMaxFlatWorkGroupSize() const {
- return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(getFeatureBits());
+ return AMDGPU::IsaInfo::getMaxWavesPerEU();
}
/// \returns Number of waves per work group supported by the subtarget and
/// limited by given \p FlatWorkGroupSize.
unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const {
- return AMDGPU::IsaInfo::getWavesPerWorkGroup(getFeatureBits(),
- FlatWorkGroupSize);
+ return AMDGPU::IsaInfo::getWavesPerWorkGroup(
+ MCSubtargetInfo::getFeatureBits(), FlatWorkGroupSize);
}
- /// \returns Default range flat work group size for a calling convention.
- std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
-
- /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
- /// for function \p F, or minimum/maximum flat work group sizes explicitly
- /// requested using "amdgpu-flat-work-group-size" attribute attached to
- /// function \p F.
- ///
- /// \returns Subtarget's default values if explicitly requested values cannot
- /// be converted to integer, or violate subtarget's specifications.
- std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
-
- /// \returns Subtarget's default pair of minimum/maximum number of waves per
- /// execution unit for function \p F, or minimum/maximum number of waves per
- /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
- /// attached to function \p F.
- ///
- /// \returns Subtarget's default values if explicitly requested values cannot
- /// be converted to integer, violate subtarget's specifications, or are not
- /// compatible with minimum/maximum number of waves limited by flat work group
- /// size, register usage, and/or lds usage.
- std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
-
- /// Creates value range metadata on an workitemid.* inrinsic call or load.
- bool makeLIDRangeMetadata(Instruction *I) const;
-};
-
-class R600Subtarget final : public AMDGPUSubtarget {
-private:
- R600InstrInfo InstrInfo;
- R600FrameLowering FrameLowering;
- R600TargetLowering TLInfo;
-
-public:
- R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
- const TargetMachine &TM);
-
- const R600InstrInfo *getInstrInfo() const override {
- return &InstrInfo;
- }
-
- const R600FrameLowering *getFrameLowering() const override {
- return &FrameLowering;
- }
-
- const R600TargetLowering *getTargetLowering() const override {
- return &TLInfo;
- }
-
- const R600RegisterInfo *getRegisterInfo() const override {
- return &InstrInfo.getRegisterInfo();
- }
-
- bool hasCFAluBug() const {
- return CFALUBug;
- }
-
- bool hasVertexCache() const {
- return HasVertexCache;
- }
-
- short getTexVTXClauseSize() const {
- return TexVTXClauseSize;
- }
-};
-
-class SISubtarget final : public AMDGPUSubtarget {
-private:
- SIInstrInfo InstrInfo;
- SIFrameLowering FrameLowering;
- SITargetLowering TLInfo;
-
- /// GlobalISel related APIs.
- std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
- std::unique_ptr<InstructionSelector> InstSelector;
- std::unique_ptr<LegalizerInfo> Legalizer;
- std::unique_ptr<RegisterBankInfo> RegBankInfo;
-
-public:
- SISubtarget(const Triple &TT, StringRef CPU, StringRef FS,
- const TargetMachine &TM);
-
- const SIInstrInfo *getInstrInfo() const override {
- return &InstrInfo;
- }
-
- const SIFrameLowering *getFrameLowering() const override {
- return &FrameLowering;
- }
-
- const SITargetLowering *getTargetLowering() const override {
- return &TLInfo;
- }
-
- const CallLowering *getCallLowering() const override {
- return CallLoweringInfo.get();
- }
-
- const InstructionSelector *getInstructionSelector() const override {
- return InstSelector.get();
- }
-
- const LegalizerInfo *getLegalizerInfo() const override {
- return Legalizer.get();
- }
-
- const RegisterBankInfo *getRegBankInfo() const override {
- return RegBankInfo.get();
- }
-
- const SIRegisterInfo *getRegisterInfo() const override {
- return &InstrInfo.getRegisterInfo();
- }
+ // static wrappers
+ static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
// XXX - Why is this here if it isn't in the default pass set?
bool enableEarlyIfConversion() const override {
@@ -746,7 +747,7 @@ public:
void overrideSchedPolicy(MachineSchedPolicy &Policy,
unsigned NumRegionInstrs) const override;
- bool isVGPRSpillingEnabled(const Function& F) const;
+ bool isVGPRSpillingEnabled(const Function &F) const;
unsigned getMaxNumUserSGPRs() const {
return 16;
@@ -776,6 +777,10 @@ public:
return HasScalarStores;
}
+ bool hasScalarAtomics() const {
+ return HasScalarAtomics;
+ }
+
bool hasInv2PiInlineImm() const {
return HasInv2PiInlineImm;
}
@@ -789,18 +794,13 @@ public:
}
bool debuggerSupported() const {
- return debuggerInsertNops() && debuggerReserveRegs() &&
- debuggerEmitPrologue();
+ return debuggerInsertNops() && debuggerEmitPrologue();
}
bool debuggerInsertNops() const {
return DebuggerInsertNops;
}
- bool debuggerReserveRegs() const {
- return DebuggerReserveRegs;
- }
-
bool debuggerEmitPrologue() const {
return DebuggerEmitPrologue;
}
@@ -829,52 +829,61 @@ public:
return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS;
}
- unsigned getKernArgSegmentSize(const MachineFunction &MF,
- unsigned ExplictArgBytes) const;
-
- /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs
+ /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
+ /// SGPRs
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
- /// Return the maximum number of waves per SIMD for kernels using \p VGPRs VGPRs
+ /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
+ /// VGPRs
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
/// \returns true if the flat_scratch register should be initialized with the
/// pointer to the wave's scratch memory rather than a size and offset.
bool flatScratchIsPointer() const {
+ return getGeneration() >= AMDGPUSubtarget::GFX9;
+ }
+
+ /// \returns true if the machine has merged shaders in which s0-s7 are
+ /// reserved by the hardware and user SGPRs start at s8
+ bool hasMergedShaders() const {
return getGeneration() >= GFX9;
}
/// \returns SGPR allocation granularity supported by the subtarget.
unsigned getSGPRAllocGranule() const {
- return AMDGPU::IsaInfo::getSGPRAllocGranule(getFeatureBits());
+ return AMDGPU::IsaInfo::getSGPRAllocGranule(
+ MCSubtargetInfo::getFeatureBits());
}
/// \returns SGPR encoding granularity supported by the subtarget.
unsigned getSGPREncodingGranule() const {
- return AMDGPU::IsaInfo::getSGPREncodingGranule(getFeatureBits());
+ return AMDGPU::IsaInfo::getSGPREncodingGranule(
+ MCSubtargetInfo::getFeatureBits());
}
/// \returns Total number of SGPRs supported by the subtarget.
unsigned getTotalNumSGPRs() const {
- return AMDGPU::IsaInfo::getTotalNumSGPRs(getFeatureBits());
+ return AMDGPU::IsaInfo::getTotalNumSGPRs(MCSubtargetInfo::getFeatureBits());
}
/// \returns Addressable number of SGPRs supported by the subtarget.
unsigned getAddressableNumSGPRs() const {
- return AMDGPU::IsaInfo::getAddressableNumSGPRs(getFeatureBits());
+ return AMDGPU::IsaInfo::getAddressableNumSGPRs(
+ MCSubtargetInfo::getFeatureBits());
}
/// \returns Minimum number of SGPRs that meets the given number of waves per
/// execution unit requirement supported by the subtarget.
unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
- return AMDGPU::IsaInfo::getMinNumSGPRs(getFeatureBits(), WavesPerEU);
+ return AMDGPU::IsaInfo::getMinNumSGPRs(MCSubtargetInfo::getFeatureBits(),
+ WavesPerEU);
}
/// \returns Maximum number of SGPRs that meets the given number of waves per
/// execution unit requirement supported by the subtarget.
unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
- return AMDGPU::IsaInfo::getMaxNumSGPRs(getFeatureBits(), WavesPerEU,
- Addressable);
+ return AMDGPU::IsaInfo::getMaxNumSGPRs(MCSubtargetInfo::getFeatureBits(),
+ WavesPerEU, Addressable);
}
/// \returns Reserved number of SGPRs for given function \p MF.
@@ -892,39 +901,39 @@ public:
/// \returns VGPR allocation granularity supported by the subtarget.
unsigned getVGPRAllocGranule() const {
- return AMDGPU::IsaInfo::getVGPRAllocGranule(getFeatureBits());
+ return AMDGPU::IsaInfo::getVGPRAllocGranule(
+ MCSubtargetInfo::getFeatureBits());
}
/// \returns VGPR encoding granularity supported by the subtarget.
unsigned getVGPREncodingGranule() const {
- return AMDGPU::IsaInfo::getVGPREncodingGranule(getFeatureBits());
+ return AMDGPU::IsaInfo::getVGPREncodingGranule(
+ MCSubtargetInfo::getFeatureBits());
}
/// \returns Total number of VGPRs supported by the subtarget.
unsigned getTotalNumVGPRs() const {
- return AMDGPU::IsaInfo::getTotalNumVGPRs(getFeatureBits());
+ return AMDGPU::IsaInfo::getTotalNumVGPRs(MCSubtargetInfo::getFeatureBits());
}
/// \returns Addressable number of VGPRs supported by the subtarget.
unsigned getAddressableNumVGPRs() const {
- return AMDGPU::IsaInfo::getAddressableNumVGPRs(getFeatureBits());
+ return AMDGPU::IsaInfo::getAddressableNumVGPRs(
+ MCSubtargetInfo::getFeatureBits());
}
/// \returns Minimum number of VGPRs that meets given number of waves per
/// execution unit requirement supported by the subtarget.
unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
- return AMDGPU::IsaInfo::getMinNumVGPRs(getFeatureBits(), WavesPerEU);
+ return AMDGPU::IsaInfo::getMinNumVGPRs(MCSubtargetInfo::getFeatureBits(),
+ WavesPerEU);
}
/// \returns Maximum number of VGPRs that meets given number of waves per
/// execution unit requirement supported by the subtarget.
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
- return AMDGPU::IsaInfo::getMaxNumVGPRs(getFeatureBits(), WavesPerEU);
- }
-
- /// \returns Reserved number of VGPRs for given function \p MF.
- unsigned getReservedNumVGPRs(const MachineFunction &MF) const {
- return debuggerReserveRegs() ? 4 : 0;
+ return AMDGPU::IsaInfo::getMaxNumVGPRs(MCSubtargetInfo::getFeatureBits(),
+ WavesPerEU);
}
/// \returns Maximum number of VGPRs that meets number of waves per execution
@@ -942,6 +951,119 @@ public:
const override;
};
+class R600Subtarget final : public R600GenSubtargetInfo,
+ public AMDGPUSubtarget {
+private:
+ R600InstrInfo InstrInfo;
+ R600FrameLowering FrameLowering;
+ bool FMA;
+ bool CaymanISA;
+ bool CFALUBug;
+ bool DX10Clamp;
+ bool HasVertexCache;
+ bool R600ALUInst;
+ bool FP64;
+ short TexVTXClauseSize;
+ Generation Gen;
+ R600TargetLowering TLInfo;
+ InstrItineraryData InstrItins;
+ SelectionDAGTargetInfo TSInfo;
+ AMDGPUAS AS;
+
+public:
+ R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
+ const TargetMachine &TM);
+
+ const R600InstrInfo *getInstrInfo() const override { return &InstrInfo; }
+
+ const R600FrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
+ }
+
+ const R600TargetLowering *getTargetLowering() const override {
+ return &TLInfo;
+ }
+
+ const R600RegisterInfo *getRegisterInfo() const override {
+ return &InstrInfo.getRegisterInfo();
+ }
+
+ const InstrItineraryData *getInstrItineraryData() const override {
+ return &InstrItins;
+ }
+
+ // Nothing implemented, just prevent crashes on use.
+ const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
+ }
+
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+ Generation getGeneration() const {
+ return Gen;
+ }
+
+ unsigned getStackAlignment() const {
+ return 4;
+ }
+
+ R600Subtarget &initializeSubtargetDependencies(const Triple &TT,
+ StringRef GPU, StringRef FS);
+
+ bool hasBFE() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasBFI() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasBCNT(unsigned Size) const {
+ if (Size == 32)
+ return (getGeneration() >= EVERGREEN);
+
+ return false;
+ }
+
+ bool hasBORROW() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasCARRY() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasCaymanISA() const {
+ return CaymanISA;
+ }
+
+ bool hasFFBL() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasFFBH() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasFMA() const { return FMA; }
+
+ bool hasCFAluBug() const { return CFALUBug; }
+
+ bool hasVertexCache() const { return HasVertexCache; }
+
+ short getTexVTXClauseSize() const { return TexVTXClauseSize; }
+
+ AMDGPUAS getAMDGPUAS() const { return AS; }
+
+ bool enableMachineScheduler() const override {
+ return true;
+ }
+
+ bool enableSubRegLiveness() const override {
+ return true;
+ }
+};
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 2042dbf6d5e2..2205819c444f 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief The AMDGPU target machine contains all of the hardware specific
+/// The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for R600 and SI GPUs.
//
//===----------------------------------------------------------------------===//
@@ -31,7 +31,6 @@
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
@@ -40,6 +39,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
@@ -79,7 +79,7 @@ static cl::opt<bool> EnableLoadStoreVectorizer(
cl::init(true),
cl::Hidden);
-// Option to to control global loads scalarization
+// Option to control global loads scalarization
static cl::opt<bool> ScalarizeGlobal(
"amdgpu-scalarize-global-loads",
cl::desc("Enable global load scalarization"),
@@ -110,12 +110,6 @@ static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
cl::desc("Enable AMDGPU Alias Analysis"),
cl::init(true));
-// Option to enable new waitcnt insertion pass.
-static cl::opt<bool> EnableSIInsertWaitcntsPass(
- "enable-si-insert-waitcnts",
- cl::desc("Use new waitcnt insertion pass"),
- cl::init(true));
-
// Option to run late CFG structurizer
static cl::opt<bool, true> LateCFGStructurize(
"amdgpu-late-structurize",
@@ -123,16 +117,23 @@ static cl::opt<bool, true> LateCFGStructurize(
cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
cl::Hidden);
-static cl::opt<bool> EnableAMDGPUFunctionCalls(
+static cl::opt<bool, true> EnableAMDGPUFunctionCalls(
"amdgpu-function-calls",
- cl::Hidden,
cl::desc("Enable AMDGPU function call support"),
- cl::init(false));
+ cl::location(AMDGPUTargetMachine::EnableFunctionCalls),
+ cl::init(false),
+ cl::Hidden);
// Enable lib calls simplifications
static cl::opt<bool> EnableLibCallSimplify(
"amdgpu-simplify-libcall",
- cl::desc("Enable mdgpu library simplifications"),
+ cl::desc("Enable amdgpu library simplifications"),
+ cl::init(true),
+ cl::Hidden);
+
+static cl::opt<bool> EnableLowerKernelArguments(
+ "amdgpu-ir-lower-kernel-arguments",
+ cl::desc("Lower kernel argument loads in IR pass"),
cl::init(true),
cl::Hidden);
@@ -147,6 +148,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeR600PacketizerPass(*PR);
initializeR600ExpandSpecialInstrsPassPass(*PR);
initializeR600VectorRegMergerPass(*PR);
+ initializeGlobalISel(*PR);
initializeAMDGPUDAGToDAGISelPass(*PR);
initializeSILowerI1CopiesPass(*PR);
initializeSIFixSGPRCopiesPass(*PR);
@@ -160,6 +162,8 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
initializeAMDGPUAnnotateUniformValuesPass(*PR);
initializeAMDGPUArgumentUsageInfoPass(*PR);
+ initializeAMDGPULowerKernelArgumentsPass(*PR);
+ initializeAMDGPULowerKernelAttributesPass(*PR);
initializeAMDGPULowerIntrinsicsPass(*PR);
initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
initializeAMDGPUPromoteAllocaPass(*PR);
@@ -167,7 +171,6 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeAMDGPURewriteOutArgumentsPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowPass(*PR);
- initializeSIInsertWaitsPass(*PR);
initializeSIInsertWaitcntsPass(*PR);
initializeSIWholeQuadModePass(*PR);
initializeSILowerControlFlowPass(*PR);
@@ -176,6 +179,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeSIDebuggerInsertNopsPass(*PR);
initializeSIOptimizeExecMaskingPass(*PR);
initializeSIFixWWMLivenessPass(*PR);
+ initializeSIFormMemoryClausesPass(*PR);
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
initializeAMDGPUAAWrapperPassPass(*PR);
initializeAMDGPUUseNativeCallsPass(*PR);
@@ -260,24 +264,15 @@ GCNILPSchedRegistry("gcn-ilp",
static StringRef computeDataLayout(const Triple &TT) {
if (TT.getArch() == Triple::r600) {
// 32-bit pointers.
- if (TT.getEnvironmentName() == "amdgiz" ||
- TT.getEnvironmentName() == "amdgizcl")
return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
- "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";
- return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
- "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
+ "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
}
// 32-bit private, local, and region pointers. 64-bit global, constant and
// flat.
- if (TT.getEnvironmentName() == "amdgiz" ||
- TT.getEnvironmentName() == "amdgizcl")
- return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32"
+ return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
"-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
- "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";
- return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
- "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
- "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
+ "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
}
LLVM_READNONE
@@ -317,9 +312,10 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
initAsmInfo();
}
-AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
-
bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
+bool AMDGPUTargetMachine::EnableFunctionCalls = false;
+
+AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
Attribute GPUAttr = F.getFnAttribute("target-cpu");
@@ -412,6 +408,10 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
// Add infer address spaces pass to the opt pipeline after inlining
// but before SROA to increase SROA opportunities.
PM.add(createInferAddressSpacesPass());
+
+ // This should run after inlining to have any chance of doing anything,
+ // and before other cleanup optimizations.
+ PM.add(createAMDGPULowerKernelAttributesPass());
});
}
@@ -449,6 +449,11 @@ const R600Subtarget *R600TargetMachine::getSubtargetImpl(
return I.get();
}
+TargetTransformInfo
+R600TargetMachine::getTargetTransformInfo(const Function &F) {
+ return TargetTransformInfo(R600TTIImpl(this, F));
+}
+
//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//
@@ -461,7 +466,7 @@ GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
CodeGenOpt::Level OL, bool JIT)
: AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
-const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
+const GCNSubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
StringRef GPU = getGPUName(F);
StringRef FS = getFeatureString(F);
@@ -474,7 +479,7 @@ const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
// creation will depend on the TM and the code generation flags on the
// function that reside in TargetOptions.
resetTargetOptions(F);
- I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this);
+ I = llvm::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
}
I->setScalarizeGlobalBehavior(ScalarizeGlobal);
@@ -482,6 +487,11 @@ const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
return I.get();
}
+TargetTransformInfo
+GCNTargetMachine::getTargetTransformInfo(const Function &F) {
+ return TargetTransformInfo(GCNTTIImpl(this, F));
+}
+
//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//
@@ -571,11 +581,6 @@ public:
} // end anonymous namespace
-TargetTransformInfo
-AMDGPUTargetMachine::getTargetTransformInfo(const Function &F) {
- return TargetTransformInfo(AMDGPUTTIImpl(this, F));
-}
-
void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
if (getOptLevel() == CodeGenOpt::Aggressive)
addPass(createGVNPass());
@@ -584,6 +589,7 @@ void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
}
void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
+ addPass(createLICMPass());
addPass(createSeparateConstOffsetFromGEPPass());
addPass(createSpeculativeExecutionPass());
// ReassociateGEPs exposes more opportunites for SLSR. See
@@ -629,7 +635,8 @@ void AMDGPUPassConfig::addIRPasses() {
}
// Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
- addPass(createAMDGPUOpenCLImageTypeLoweringPass());
+ if (TM.getTargetTriple().getArch() == Triple::r600)
+ addPass(createR600OpenCLImageTypeLoweringPass());
// Replace OpenCL enqueued block function pointers with global variables.
addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
@@ -672,6 +679,10 @@ void AMDGPUPassConfig::addIRPasses() {
}
void AMDGPUPassConfig::addCodeGenPrepare() {
+ if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
+ EnableLowerKernelArguments)
+ addPass(createAMDGPULowerKernelArgumentsPass());
+
TargetPassConfig::addCodeGenPrepare();
if (EnableLoadStoreVectorizer)
@@ -739,7 +750,7 @@ TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
MachineSchedContext *C) const {
- const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
if (ST.enableSIScheduler())
return createSIMachineScheduler(C);
return createGCNMaxOccupancyMachineScheduler(C);
@@ -782,7 +793,7 @@ void GCNPassConfig::addMachineSSAOptimization() {
addPass(&SILoadStoreOptimizerID);
if (EnableSDWAPeephole) {
addPass(&SIPeepholeSDWAID);
- addPass(&MachineLICMID);
+ addPass(&EarlyMachineLICMID);
addPass(&MachineCSEID);
addPass(&SIFoldOperandsID);
addPass(&DeadMachineInstructionElimID);
@@ -851,6 +862,8 @@ void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
+ insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID);
+
// This must be run immediately after phi elimination and before
// TwoAddressInstructions, otherwise the processing of the tied operand of
// SI_ELSE will introduce a copy of the tied operand source after the else.
@@ -873,6 +886,10 @@ void GCNPassConfig::addPreSched2() {
}
void GCNPassConfig::addPreEmitPass() {
+ addPass(createSIMemoryLegalizerPass());
+ addPass(createSIInsertWaitcntsPass());
+ addPass(createSIShrinkInstructionsPass());
+
// The hazard recognizer that runs as part of the post-ra scheduler does not
// guarantee to be able to handle all hazards correctly. This is because if there
// are multiple scheduling regions in a basic block, the regions are scheduled
@@ -881,15 +898,12 @@ void GCNPassConfig::addPreEmitPass() {
//
// Here we add a stand-alone hazard recognizer pass which can handle all
// cases.
+ //
+ // FIXME: This stand-alone pass will emit indiv. S_NOP 0, as needed. It would
+ // be better for it to emit S_NOP <N> when possible.
addPass(&PostRAHazardRecognizerID);
- if (EnableSIInsertWaitcntsPass)
- addPass(createSIInsertWaitcntsPass());
- else
- addPass(createSIInsertWaitsPass());
- addPass(createSIShrinkInstructionsPass());
addPass(&SIInsertSkipsPassID);
- addPass(createSIMemoryLegalizerPass());
addPass(createSIDebuggerInsertNopsPass());
addPass(&BranchRelaxationPassID);
}
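
A hedged aside on the hunk above: `-amdgpu-function-calls` is rewired from a plain cl::opt<bool> to one that writes through cl::location() into the static AMDGPUTargetMachine::EnableFunctionCalls. A minimal sketch of that external-storage pattern, assuming only LLVM's CommandLine.h; the names here are invented and are not part of the patch:

#include "llvm/Support/CommandLine.h"

struct MyTargetMachine {
  static bool EnableFeatureX;   // hypothetical counterpart of EnableFunctionCalls
};
bool MyTargetMachine::EnableFeatureX = false;

// The second template argument selects external storage; cl::location() names
// the variable the option parser writes into, so other code can simply read
// the static flag without ever seeing the cl::opt object.
static llvm::cl::opt<bool, true> EnableFeatureXOpt(
    "mytarget-feature-x",
    llvm::cl::desc("Enable hypothetical feature X"),
    llvm::cl::location(MyTargetMachine::EnableFeatureX),
    llvm::cl::init(false),
    llvm::cl::Hidden);
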
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 5f9b2a7fca20..0fe14493fabd 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief The AMDGPU TargetMachine interface definition for hw codgen targets.
+/// The AMDGPU TargetMachine interface definition for hw codegen targets.
//
//===----------------------------------------------------------------------===//
@@ -34,7 +34,6 @@ namespace llvm {
class AMDGPUTargetMachine : public LLVMTargetMachine {
protected:
std::unique_ptr<TargetLoweringObjectFile> TLOF;
- AMDGPUIntrinsicInfo IntrinsicInfo;
AMDGPUAS AS;
StringRef getGPUName(const Function &F) const;
@@ -42,6 +41,7 @@ protected:
public:
static bool EnableLateStructurizeCFG;
+ static bool EnableFunctionCalls;
AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, TargetOptions Options,
@@ -49,13 +49,8 @@ public:
CodeGenOpt::Level OL);
~AMDGPUTargetMachine() override;
- const AMDGPUSubtarget *getSubtargetImpl() const;
- const AMDGPUSubtarget *getSubtargetImpl(const Function &) const override = 0;
-
- const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override {
- return &IntrinsicInfo;
- }
- TargetTransformInfo getTargetTransformInfo(const Function &F) override;
+ const TargetSubtargetInfo *getSubtargetImpl() const;
+ const TargetSubtargetInfo *getSubtargetImpl(const Function &) const override = 0;
TargetLoweringObjectFile *getObjFileLowering() const override {
return TLOF.get();
@@ -91,6 +86,8 @@ public:
const R600Subtarget *getSubtargetImpl(const Function &) const override;
+ TargetTransformInfo getTargetTransformInfo(const Function &F) override;
+
bool isMachineVerifierClean() const override {
return false;
}
@@ -102,7 +99,8 @@ public:
class GCNTargetMachine final : public AMDGPUTargetMachine {
private:
- mutable StringMap<std::unique_ptr<SISubtarget>> SubtargetMap;
+ AMDGPUIntrinsicInfo IntrinsicInfo;
+ mutable StringMap<std::unique_ptr<GCNSubtarget>> SubtargetMap;
public:
GCNTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
@@ -112,7 +110,13 @@ public:
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
- const SISubtarget *getSubtargetImpl(const Function &) const override;
+ const GCNSubtarget *getSubtargetImpl(const Function &) const override;
+
+ TargetTransformInfo getTargetTransformInfo(const Function &F) override;
+
+ const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override {
+ return &IntrinsicInfo;
+ }
bool useIPRA() const override {
return true;
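
GCNTargetMachine above keeps a mutable StringMap of unique_ptr<GCNSubtarget> keyed by the function's CPU and feature string, so a subtarget is built once per attribute combination and reused. A standalone sketch of that cache-on-first-use pattern (plain C++14, invented names, no LLVM types):

#include <map>
#include <memory>
#include <string>

struct Subtarget {
  std::string CPU, FS;
  Subtarget(std::string C, std::string F) : CPU(std::move(C)), FS(std::move(F)) {}
};

class CachingTargetMachine {
  // Keyed by CPU + feature string, mirroring SubtargetMap in the diff.
  mutable std::map<std::string, std::unique_ptr<Subtarget>> SubtargetMap;

public:
  const Subtarget *getSubtarget(const std::string &CPU,
                                const std::string &FS) const {
    auto &Entry = SubtargetMap[CPU + FS];  // one slot per attribute combination
    if (!Entry)                            // construct only on first use
      Entry = std::make_unique<Subtarget>(CPU, FS);
    return Entry.get();
  }
};
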
diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
index ca6210f69298..dd9dc1a88fc2 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file declares the AMDGPU-specific subclass of
+/// This file declares the AMDGPU-specific subclass of
/// TargetLoweringObjectFile.
///
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 77c2d4b956c6..a68b8d03f06e 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -17,12 +17,12 @@
#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUSubtarget.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
@@ -43,6 +43,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
@@ -101,7 +102,7 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
unsigned ThresholdPrivate = UnrollThresholdPrivate;
unsigned ThresholdLocal = UnrollThresholdLocal;
unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
- AMDGPUAS ASST = ST->getAMDGPUAS();
+ const AMDGPUAS &ASST = AMDGPU::getAMDGPUAS(TargetTriple);
for (const BasicBlock *BB : L->getBlocks()) {
const DataLayout &DL = BB->getModule()->getDataLayout();
unsigned LocalGEPsSeen = 0;
@@ -123,8 +124,9 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
continue;
if (dependsOnLocalPhi(L, Br->getCondition())) {
UP.Threshold += UnrollThresholdIf;
- DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
- << " for loop:\n" << *L << " due to " << *Br << '\n');
+ LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
+ << " for loop:\n"
+ << *L << " due to " << *Br << '\n');
if (UP.Threshold >= MaxBoost)
return;
}
@@ -200,61 +202,76 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
// Don't use the maximum allowed value here as it will make some
// programs way too big.
UP.Threshold = Threshold;
- DEBUG(dbgs() << "Set unroll threshold " << Threshold << " for loop:\n"
- << *L << " due to " << *GEP << '\n');
+ LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
+ << " for loop:\n"
+ << *L << " due to " << *GEP << '\n');
if (UP.Threshold >= MaxBoost)
return;
}
}
}
-unsigned AMDGPUTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
+unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
// The concept of vector registers doesn't really exist. Some packed vector
// operations operate on the normal 32-bit registers.
-
- // Number of VGPRs on SI.
- if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
- return 256;
-
- return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
+ return 256;
}
-unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) const {
+unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
// This is really the number of registers to fill when vectorizing /
// interleaving loops, so we lie to avoid trying to use all registers.
return getHardwareNumberOfRegisters(Vec) >> 3;
}
-unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) const {
+unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const {
return 32;
}
-unsigned AMDGPUTTIImpl::getMinVectorRegisterBitWidth() const {
+unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
return 32;
}
-unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
+unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
+ unsigned ChainSizeInBytes,
+ VectorType *VecTy) const {
+ unsigned VecRegBitWidth = VF * LoadSize;
+ if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
+ // TODO: Support element-size less than 32bit?
+ return 128 / LoadSize;
+
+ return VF;
+}
+
+unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
+ unsigned ChainSizeInBytes,
+ VectorType *VecTy) const {
+ unsigned VecRegBitWidth = VF * StoreSize;
+ if (VecRegBitWidth > 128)
+ return 128 / StoreSize;
+
+ return VF;
+}
+
+unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
AMDGPUAS AS = ST->getAMDGPUAS();
if (AddrSpace == AS.GLOBAL_ADDRESS ||
AddrSpace == AS.CONSTANT_ADDRESS ||
- AddrSpace == AS.FLAT_ADDRESS)
- return 128;
- if (AddrSpace == AS.LOCAL_ADDRESS ||
+ AddrSpace == AS.CONSTANT_ADDRESS_32BIT) {
+ return 512;
+ }
+
+ if (AddrSpace == AS.FLAT_ADDRESS ||
+ AddrSpace == AS.LOCAL_ADDRESS ||
AddrSpace == AS.REGION_ADDRESS)
- return 64;
+ return 128;
+
if (AddrSpace == AS.PRIVATE_ADDRESS)
return 8 * ST->getMaxPrivateElementSize();
- if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS &&
- (AddrSpace == AS.PARAM_D_ADDRESS ||
- AddrSpace == AS.PARAM_I_ADDRESS ||
- (AddrSpace >= AS.CONSTANT_BUFFER_0 &&
- AddrSpace <= AS.CONSTANT_BUFFER_15)))
- return 128;
llvm_unreachable("unhandled address space");
}
-bool AMDGPUTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
+bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
unsigned Alignment,
unsigned AddrSpace) const {
// We allow vectorization of flat stores, even though we may need to decompose
@@ -267,19 +284,19 @@ bool AMDGPUTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
return true;
}
-bool AMDGPUTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
unsigned Alignment,
unsigned AddrSpace) const {
return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
-bool AMDGPUTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
unsigned Alignment,
unsigned AddrSpace) const {
return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
-unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
+unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
// Disable unrolling if the loop is not vectorized.
// TODO: Enable this again.
if (VF == 1)
@@ -288,11 +305,14 @@ unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
return 8;
}
-bool AMDGPUTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
+bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
MemIntrinsicInfo &Info) const {
switch (Inst->getIntrinsicID()) {
case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec: {
+ case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_ds_fadd:
+ case Intrinsic::amdgcn_ds_fmin:
+ case Intrinsic::amdgcn_ds_fmax: {
auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
if (!Ordering || !Volatile)
@@ -314,7 +334,7 @@ bool AMDGPUTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
}
}
-int AMDGPUTTIImpl::getArithmeticInstrCost(
+int GCNTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args ) {
@@ -424,7 +444,7 @@ int AMDGPUTTIImpl::getArithmeticInstrCost(
Opd1PropInfo, Opd2PropInfo);
}
-unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) {
+unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode) {
// XXX - For some reason this isn't called for switch.
switch (Opcode) {
case Instruction::Br:
@@ -435,7 +455,38 @@ unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) {
}
}
-int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *Ty,
+ bool IsPairwise) {
+ EVT OrigTy = TLI->getValueType(DL, Ty);
+
+ // Computes cost on targets that have packed math instructions (which support
+ // 16-bit types only).
+ if (IsPairwise ||
+ !ST->hasVOP3PInsts() ||
+ OrigTy.getScalarSizeInBits() != 16)
+ return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise);
+
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ return LT.first * getFullRateInstrCost();
+}
+
+int GCNTTIImpl::getMinMaxReductionCost(Type *Ty, Type *CondTy,
+ bool IsPairwise,
+ bool IsUnsigned) {
+ EVT OrigTy = TLI->getValueType(DL, Ty);
+
+ // Computes cost on targets that have packed math instructions (which support
+ // 16-bit types only).
+ if (IsPairwise ||
+ !ST->hasVOP3PInsts() ||
+ OrigTy.getScalarSizeInBits() != 16)
+ return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned);
+
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ return LT.first * getHalfRateInstrCost();
+}
+
+int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
unsigned Index) {
switch (Opcode) {
case Instruction::ExtractElement:
@@ -460,52 +511,7 @@ int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
}
}
-static bool isIntrinsicSourceOfDivergence(const IntrinsicInst *I) {
- switch (I->getIntrinsicID()) {
- case Intrinsic::amdgcn_workitem_id_x:
- case Intrinsic::amdgcn_workitem_id_y:
- case Intrinsic::amdgcn_workitem_id_z:
- case Intrinsic::amdgcn_interp_mov:
- case Intrinsic::amdgcn_interp_p1:
- case Intrinsic::amdgcn_interp_p2:
- case Intrinsic::amdgcn_mbcnt_hi:
- case Intrinsic::amdgcn_mbcnt_lo:
- case Intrinsic::r600_read_tidig_x:
- case Intrinsic::r600_read_tidig_y:
- case Intrinsic::r600_read_tidig_z:
- case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec:
- case Intrinsic::amdgcn_image_atomic_swap:
- case Intrinsic::amdgcn_image_atomic_add:
- case Intrinsic::amdgcn_image_atomic_sub:
- case Intrinsic::amdgcn_image_atomic_smin:
- case Intrinsic::amdgcn_image_atomic_umin:
- case Intrinsic::amdgcn_image_atomic_smax:
- case Intrinsic::amdgcn_image_atomic_umax:
- case Intrinsic::amdgcn_image_atomic_and:
- case Intrinsic::amdgcn_image_atomic_or:
- case Intrinsic::amdgcn_image_atomic_xor:
- case Intrinsic::amdgcn_image_atomic_inc:
- case Intrinsic::amdgcn_image_atomic_dec:
- case Intrinsic::amdgcn_image_atomic_cmpswap:
- case Intrinsic::amdgcn_buffer_atomic_swap:
- case Intrinsic::amdgcn_buffer_atomic_add:
- case Intrinsic::amdgcn_buffer_atomic_sub:
- case Intrinsic::amdgcn_buffer_atomic_smin:
- case Intrinsic::amdgcn_buffer_atomic_umin:
- case Intrinsic::amdgcn_buffer_atomic_smax:
- case Intrinsic::amdgcn_buffer_atomic_umax:
- case Intrinsic::amdgcn_buffer_atomic_and:
- case Intrinsic::amdgcn_buffer_atomic_or:
- case Intrinsic::amdgcn_buffer_atomic_xor:
- case Intrinsic::amdgcn_buffer_atomic_cmpswap:
- case Intrinsic::amdgcn_ps_live:
- case Intrinsic::amdgcn_ds_swizzle:
- return true;
- default:
- return false;
- }
-}
+
static bool isArgPassedInSGPR(const Argument *A) {
const Function *F = A->getParent();
@@ -535,7 +541,7 @@ static bool isArgPassedInSGPR(const Argument *A) {
/// \returns true if the result of the value could potentially be
/// different across workitems in a wavefront.
-bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
+bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
if (const Argument *A = dyn_cast<Argument>(V))
return !isArgPassedInSGPR(A);
@@ -556,7 +562,7 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
return true;
if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
- return isIntrinsicSourceOfDivergence(Intrinsic);
+ return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
// Assume all function calls are a source of divergence.
if (isa<CallInst>(V) || isa<InvokeInst>(V))
@@ -565,7 +571,7 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
return false;
}
-bool AMDGPUTTIImpl::isAlwaysUniform(const Value *V) const {
+bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
switch (Intrinsic->getIntrinsicID()) {
default:
@@ -578,7 +584,7 @@ bool AMDGPUTTIImpl::isAlwaysUniform(const Value *V) const {
return false;
}
-unsigned AMDGPUTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) {
if (ST->hasVOP3PInsts()) {
VectorType *VT = cast<VectorType>(Tp);
@@ -601,7 +607,7 @@ unsigned AMDGPUTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Inde
return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}
-bool AMDGPUTTIImpl::areInlineCompatible(const Function *Caller,
+bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
const Function *Callee) const {
const TargetMachine &TM = getTLI()->getTargetMachine();
const FeatureBitset &CallerBits =
@@ -613,3 +619,114 @@ bool AMDGPUTTIImpl::areInlineCompatible(const Function *Caller,
FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
return ((RealCallerBits & RealCalleeBits) == RealCalleeBits);
}
+
+void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::UnrollingPreferences &UP) {
+ CommonTTI.getUnrollingPreferences(L, SE, UP);
+}
+
+unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
+ return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
+}
+
+unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
+ return getHardwareNumberOfRegisters(Vec);
+}
+
+unsigned R600TTIImpl::getRegisterBitWidth(bool Vector) const {
+ return 32;
+}
+
+unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
+ return 32;
+}
+
+unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
+ AMDGPUAS AS = ST->getAMDGPUAS();
+ if (AddrSpace == AS.GLOBAL_ADDRESS ||
+ AddrSpace == AS.CONSTANT_ADDRESS)
+ return 128;
+ if (AddrSpace == AS.LOCAL_ADDRESS ||
+ AddrSpace == AS.REGION_ADDRESS)
+ return 64;
+ if (AddrSpace == AS.PRIVATE_ADDRESS)
+ return 32;
+
+ if ((AddrSpace == AS.PARAM_D_ADDRESS ||
+ AddrSpace == AS.PARAM_I_ADDRESS ||
+ (AddrSpace >= AS.CONSTANT_BUFFER_0 &&
+ AddrSpace <= AS.CONSTANT_BUFFER_15)))
+ return 128;
+ llvm_unreachable("unhandled address space");
+}
+
+bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
+ unsigned Alignment,
+ unsigned AddrSpace) const {
+ // We allow vectorization of flat stores, even though we may need to decompose
+ // them later if they may access private memory. We don't have enough context
+ // here, and legalization can handle it.
+ if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS)
+ return false;
+ return true;
+}
+
+bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+ unsigned Alignment,
+ unsigned AddrSpace) const {
+ return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
+}
+
+bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+ unsigned Alignment,
+ unsigned AddrSpace) const {
+ return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
+}
+
+unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
+ // Disable unrolling if the loop is not vectorized.
+ // TODO: Enable this again.
+ if (VF == 1)
+ return 1;
+
+ return 8;
+}
+
+unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode) {
+ // XXX - For some reason this isn't called for switch.
+ switch (Opcode) {
+ case Instruction::Br:
+ case Instruction::Ret:
+ return 10;
+ default:
+ return BaseT::getCFInstrCost(Opcode);
+ }
+}
+
+int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+ unsigned Index) {
+ switch (Opcode) {
+ case Instruction::ExtractElement:
+ case Instruction::InsertElement: {
+ unsigned EltSize
+ = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
+ if (EltSize < 32) {
+ return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
+ }
+
+ // Extracts are just reads of a subregister, so are free. Inserts are
+ // considered free because we don't want to have any cost for scalarizing
+ // operations, and we don't have to copy into a different register class.
+
+ // Dynamic indexing isn't free and is best avoided.
+ return Index == ~0u ? 2 : 0;
+ }
+ default:
+ return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
+ }
+}
+
+void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::UnrollingPreferences &UP) {
+ CommonTTI.getUnrollingPreferences(L, SE, UP);
+}
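
Both new TTI implementations end by forwarding getUnrollingPreferences to a shared AMDGPUTTIImpl member (CommonTTI). A minimal standalone sketch of that composition-over-duplication pattern (invented names, no LLVM dependencies):

#include <iostream>

// Shared, target-independent piece of the cost model.
struct CommonCostModel {
  void tuneUnrollThreshold(unsigned &Threshold) const { Threshold += 100; }
};

// Subtarget-specific models embed the common part and forward to it,
// the way GCNTTIImpl and R600TTIImpl forward to CommonTTI above.
struct GCNCostModel {
  CommonCostModel Common;
  void tuneUnrollThreshold(unsigned &T) const { Common.tuneUnrollThreshold(T); }
};

struct R600CostModel {
  CommonCostModel Common;
  void tuneUnrollThreshold(unsigned &T) const { Common.tuneUnrollThreshold(T); }
};

int main() {
  unsigned T = 300;
  GCNCostModel().tuneUnrollThreshold(T);
  R600CostModel().tuneUnrollThreshold(T);
  std::cout << T << '\n';  // 500: both targets applied the same shared tuning
  return 0;
}
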
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 8899d2c6da8a..8e63d789e17d 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -21,6 +21,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -44,8 +45,26 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
friend BaseT;
- const AMDGPUSubtarget *ST;
+ Triple TargetTriple;
+
+public:
+ explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()),
+ TargetTriple(TM->getTargetTriple()) {}
+
+ void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::UnrollingPreferences &UP);
+};
+
+class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
+ using BaseT = BasicTTIImplBase<GCNTTIImpl>;
+ using TTI = TargetTransformInfo;
+
+ friend BaseT;
+
+ const GCNSubtarget *ST;
const AMDGPUTargetLowering *TLI;
+ AMDGPUTTIImpl CommonTTI;
bool IsGraphicsShader;
const FeatureBitset InlineFeatureIgnoreList = {
@@ -61,7 +80,6 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
AMDGPU::FeatureAutoWaitcntBeforeBarrier,
AMDGPU::FeatureDebuggerEmitPrologue,
AMDGPU::FeatureDebuggerInsertNops,
- AMDGPU::FeatureDebuggerReserveRegs,
// Property of the kernel/environment which can't actually differ.
AMDGPU::FeatureSGPRInitBug,
@@ -73,7 +91,7 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
AMDGPU::HalfRate64Ops
};
- const AMDGPUSubtarget *getST() const { return ST; }
+ const GCNSubtarget *getST() const { return ST; }
const AMDGPUTargetLowering *getTLI() const { return TLI; }
static inline int getFullRateInstrCost() {
@@ -98,10 +116,11 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
}
public:
- explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
+ explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()),
- ST(TM->getSubtargetImpl(F)),
+ ST(static_cast<const GCNSubtarget*>(TM->getSubtargetImpl(F))),
TLI(ST->getTargetLowering()),
+ CommonTTI(TM, F),
IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())) {}
bool hasBranchDivergence() { return true; }
@@ -118,6 +137,12 @@ public:
unsigned getNumberOfRegisters(bool Vector) const;
unsigned getRegisterBitWidth(bool Vector) const;
unsigned getMinVectorRegisterBitWidth() const;
+ unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
+ unsigned ChainSizeInBytes,
+ VectorType *VecTy) const;
+ unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
+ unsigned ChainSizeInBytes,
+ VectorType *VecTy) const;
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
@@ -166,6 +191,53 @@ public:
const Function *Callee) const;
unsigned getInliningThresholdMultiplier() { return 9; }
+
+ int getArithmeticReductionCost(unsigned Opcode,
+ Type *Ty,
+ bool IsPairwise);
+ int getMinMaxReductionCost(Type *Ty, Type *CondTy,
+ bool IsPairwiseForm,
+ bool IsUnsigned);
+};
+
+class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> {
+ using BaseT = BasicTTIImplBase<R600TTIImpl>;
+ using TTI = TargetTransformInfo;
+
+ friend BaseT;
+
+ const R600Subtarget *ST;
+ const AMDGPUTargetLowering *TLI;
+ AMDGPUTTIImpl CommonTTI;
+
+public:
+ explicit R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()),
+ ST(static_cast<const R600Subtarget*>(TM->getSubtargetImpl(F))),
+ TLI(ST->getTargetLowering()),
+ CommonTTI(TM, F) {}
+
+ const R600Subtarget *getST() const { return ST; }
+ const AMDGPUTargetLowering *getTLI() const { return TLI; }
+
+ void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::UnrollingPreferences &UP);
+ unsigned getHardwareNumberOfRegisters(bool Vec) const;
+ unsigned getNumberOfRegisters(bool Vec) const;
+ unsigned getRegisterBitWidth(bool Vector) const;
+ unsigned getMinVectorRegisterBitWidth() const;
+ unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
+ bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, unsigned Alignment,
+ unsigned AddrSpace) const;
+ bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+ unsigned Alignment,
+ unsigned AddrSpace) const;
+ bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+ unsigned Alignment,
+ unsigned AddrSpace) const;
+ unsigned getMaxInterleaveFactor(unsigned VF);
+ unsigned getCFInstrCost(unsigned Opcode);
+ int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
};
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index 6107f3a7dd18..0d3a1673696a 100644
--- a/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -28,6 +28,7 @@
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
@@ -39,7 +40,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils.h"
using namespace llvm;
@@ -144,7 +145,8 @@ static BasicBlock *unifyReturnBlockSet(Function &F,
if (PN)
PN->addIncoming(BB->getTerminator()->getOperand(0), BB);
- BB->getInstList().pop_back(); // Remove the return insn
+ // Remove and delete the return inst.
+ BB->getTerminator()->eraseFromParent();
BranchInst::Create(NewRetBlock, BB);
}
@@ -168,6 +170,9 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
SmallVector<BasicBlock *, 4> ReturningBlocks;
SmallVector<BasicBlock *, 4> UnreachableBlocks;
+ // Dummy return block for infinite loop.
+ BasicBlock *DummyReturnBB = nullptr;
+
for (BasicBlock *BB : PDT.getRoots()) {
if (isa<ReturnInst>(BB->getTerminator())) {
if (!isUniformlyReached(DA, *BB))
@@ -175,6 +180,35 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
} else if (isa<UnreachableInst>(BB->getTerminator())) {
if (!isUniformlyReached(DA, *BB))
UnreachableBlocks.push_back(BB);
+ } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
+
+ ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext());
+ if (DummyReturnBB == nullptr) {
+ DummyReturnBB = BasicBlock::Create(F.getContext(),
+ "DummyReturnBlock", &F);
+ Type *RetTy = F.getReturnType();
+ Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
+ ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
+ ReturningBlocks.push_back(DummyReturnBB);
+ }
+
+ if (BI->isUnconditional()) {
+ BasicBlock *LoopHeaderBB = BI->getSuccessor(0);
+ BI->eraseFromParent(); // Delete the unconditional branch.
+ // Add a new conditional branch with a dummy edge to the return block.
+ BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB);
+ } else { // Conditional branch.
+ // Create a new transition block to hold the conditional branch.
+ BasicBlock *TransitionBB = BasicBlock::Create(F.getContext(),
+ "TransitionBlock", &F);
+
+ // Move BI from BB to the new transition block.
+ BI->removeFromParent();
+ TransitionBB->getInstList().push_back(BI);
+
+ // Create a branch that will always branch to the transition block.
+ BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB);
+ }
}
}
@@ -189,7 +223,8 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
new UnreachableInst(F.getContext(), UnreachableBlock);
for (BasicBlock *BB : UnreachableBlocks) {
- BB->getInstList().pop_back(); // Remove the unreachable inst.
+ // Remove and delete the unreachable inst.
+ BB->getTerminator()->eraseFromParent();
BranchInst::Create(UnreachableBlock, BB);
}
}
@@ -200,7 +235,8 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
Type *RetTy = F.getReturnType();
Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
- UnreachableBlock->getInstList().pop_back(); // Remove the unreachable inst.
+ // Remove and delete the unreachable inst.
+ UnreachableBlock->getTerminator()->eraseFromParent();
Function *UnreachableIntrin =
Intrinsic::getDeclaration(F.getParent(), Intrinsic::amdgcn_unreachable);
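
The loop handling added above replaces an unconditional back-edge with a conditional branch whose extra (never taken) edge targets the dummy return block, so the CFG gains a unified exit. A hedged sketch of that single rewrite step, using the same LLVM APIs the patch itself calls; the helper name is invented:

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// BB currently ends in "br label %LoopHeader"; afterwards it ends in
// "br i1 true, label %LoopHeader, label %DummyReturnBB".
static void addDummyExitEdge(BasicBlock *BB, BasicBlock *LoopHeader,
                             BasicBlock *DummyReturnBB) {
  ConstantInt *BoolTrue = ConstantInt::getTrue(BB->getContext());
  BB->getTerminator()->eraseFromParent();            // remove and delete old br
  BranchInst::Create(LoopHeader, DummyReturnBB, BoolTrue, BB);
}
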
diff --git a/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
index b78568e89cfb..1f6d9234c1ed 100644
--- a/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
+++ b/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
// \file
-// \brief This pass that unifies multiple OpenCL metadata due to linking.
+// This pass unifies multiple OpenCL metadata due to linking.
//
//===----------------------------------------------------------------------===//
@@ -37,7 +37,7 @@ namespace {
} // end namespace kOCLMD
- /// \brief Unify multiple OpenCL metadata due to linking.
+ /// Unify multiple OpenCL metadata due to linking.
class AMDGPUUnifyMetadata : public ModulePass {
public:
static char ID;
@@ -47,7 +47,7 @@ namespace {
private:
bool runOnModule(Module &M) override;
- /// \brief Unify version metadata.
+ /// Unify version metadata.
/// \return true if changes are made.
/// Assume the named metadata has operands each of which is a pair of
/// integer constant, e.g.
@@ -82,7 +82,7 @@ namespace {
return true;
}
- /// \brief Unify version metadata.
+ /// Unify version metadata.
/// \return true if changes are made.
/// Assume the named metadata has operands each of which is a list e.g.
/// !Name = {!n1, !n2}
diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
index 0a0e43123ae0..11cd49e5b3dc 100644
--- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
+++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
@@ -11,6 +11,7 @@
#include "AMDGPUSubtarget.h"
#include "R600InstrInfo.h"
#include "R600RegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -28,12 +29,12 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstddef>
@@ -78,23 +79,18 @@ namespace {
//
//===----------------------------------------------------------------------===//
-#define SHOWNEWINSTR(i) \
- DEBUG(dbgs() << "New instr: " << *i << "\n");
+#define SHOWNEWINSTR(i) LLVM_DEBUG(dbgs() << "New instr: " << *i << "\n");
-#define SHOWNEWBLK(b, msg) \
-DEBUG( \
- dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
- dbgs() << "\n"; \
-);
+#define SHOWNEWBLK(b, msg) \
+ LLVM_DEBUG(dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
+ dbgs() << "\n";);
-#define SHOWBLK_DETAIL(b, msg) \
-DEBUG( \
- if (b) { \
- dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
- b->print(dbgs()); \
- dbgs() << "\n"; \
- } \
-);
+#define SHOWBLK_DETAIL(b, msg) \
+ LLVM_DEBUG(if (b) { \
+ dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
+ b->print(dbgs()); \
+ dbgs() << "\n"; \
+ });
#define INVALIDSCCNUM -1
@@ -158,19 +154,19 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override {
TII = MF.getSubtarget<R600Subtarget>().getInstrInfo();
TRI = &TII->getRegisterInfo();
- DEBUG(MF.dump(););
+ LLVM_DEBUG(MF.dump(););
OrderedBlks.clear();
Visited.clear();
FuncRep = &MF;
MLI = &getAnalysis<MachineLoopInfo>();
- DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI););
+ LLVM_DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI););
MDT = &getAnalysis<MachineDominatorTree>();
- DEBUG(MDT->print(dbgs(), (const Module*)nullptr););
+ LLVM_DEBUG(MDT->print(dbgs(), (const Module *)nullptr););
PDT = &getAnalysis<MachinePostDominatorTree>();
- DEBUG(PDT->print(dbgs()););
+ LLVM_DEBUG(PDT->print(dbgs()););
prepare();
run();
- DEBUG(MF.dump(););
+ LLVM_DEBUG(MF.dump(););
return true;
}
@@ -436,19 +432,19 @@ void AMDGPUCFGStructurizer::reversePredicateSetter(
for (;; --I) {
if (I == MBB.end())
continue;
- if (I->getOpcode() == AMDGPU::PRED_X) {
+ if (I->getOpcode() == R600::PRED_X) {
switch (I->getOperand(2).getImm()) {
- case AMDGPU::PRED_SETE_INT:
- I->getOperand(2).setImm(AMDGPU::PRED_SETNE_INT);
+ case R600::PRED_SETE_INT:
+ I->getOperand(2).setImm(R600::PRED_SETNE_INT);
return;
- case AMDGPU::PRED_SETNE_INT:
- I->getOperand(2).setImm(AMDGPU::PRED_SETE_INT);
+ case R600::PRED_SETNE_INT:
+ I->getOperand(2).setImm(R600::PRED_SETE_INT);
return;
- case AMDGPU::PRED_SETE:
- I->getOperand(2).setImm(AMDGPU::PRED_SETNE);
+ case R600::PRED_SETE:
+ I->getOperand(2).setImm(R600::PRED_SETNE);
return;
- case AMDGPU::PRED_SETNE:
- I->getOperand(2).setImm(AMDGPU::PRED_SETE);
+ case R600::PRED_SETNE:
+ I->getOperand(2).setImm(R600::PRED_SETE);
return;
default:
llvm_unreachable("PRED_X Opcode invalid!");
@@ -517,10 +513,10 @@ void AMDGPUCFGStructurizer::insertCondBranchBefore(
int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) {
switch(OldOpcode) {
- case AMDGPU::JUMP_COND:
- case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
- case AMDGPU::BRANCH_COND_i32:
- case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32;
+ case R600::JUMP_COND:
+ case R600::JUMP: return R600::IF_PREDICATE_SET;
+ case R600::BRANCH_COND_i32:
+ case R600::BRANCH_COND_f32: return R600::IF_LOGICALNZ_f32;
default: llvm_unreachable("internal error");
}
return -1;
@@ -528,10 +524,10 @@ int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) {
int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) {
switch(OldOpcode) {
- case AMDGPU::JUMP_COND:
- case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
- case AMDGPU::BRANCH_COND_i32:
- case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32;
+ case R600::JUMP_COND:
+ case R600::JUMP: return R600::IF_PREDICATE_SET;
+ case R600::BRANCH_COND_i32:
+ case R600::BRANCH_COND_f32: return R600::IF_LOGICALZ_f32;
default: llvm_unreachable("internal error");
}
return -1;
@@ -539,8 +535,8 @@ int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) {
int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) {
switch(OldOpcode) {
- case AMDGPU::JUMP_COND:
- case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32;
+ case R600::JUMP_COND:
+ case R600::JUMP: return R600::CONTINUE_LOGICALNZ_i32;
default: llvm_unreachable("internal error");
}
return -1;
@@ -548,8 +544,8 @@ int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) {
int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) {
switch(OldOpcode) {
- case AMDGPU::JUMP_COND:
- case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32;
+ case R600::JUMP_COND:
+ case R600::JUMP: return R600::CONTINUE_LOGICALZ_i32;
default: llvm_unreachable("internal error");
}
return -1;
@@ -577,9 +573,9 @@ AMDGPUCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB,
bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) {
switch (MI->getOpcode()) {
- case AMDGPU::JUMP_COND:
- case AMDGPU::BRANCH_COND_i32:
- case AMDGPU::BRANCH_COND_f32: return true;
+ case R600::JUMP_COND:
+ case R600::BRANCH_COND_i32:
+ case R600::BRANCH_COND_f32: return true;
default:
return false;
}
@@ -588,8 +584,8 @@ bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) {
bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) {
switch (MI->getOpcode()) {
- case AMDGPU::JUMP:
- case AMDGPU::BRANCH:
+ case R600::JUMP:
+ case R600::BRANCH:
return true;
default:
return false;
@@ -638,7 +634,7 @@ MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) {
MachineBasicBlock::reverse_iterator It = MBB->rbegin();
if (It != MBB->rend()) {
MachineInstr *instr = &(*It);
- if (instr->getOpcode() == AMDGPU::RETURN)
+ if (instr->getOpcode() == R600::RETURN)
return instr;
}
return nullptr;
@@ -650,9 +646,8 @@ bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) {
if (MI)
assert(IsReturn);
else if (IsReturn)
- DEBUG(
- dbgs() << "BB" << MBB->getNumber()
- <<" is return block without RETURN instr\n";);
+ LLVM_DEBUG(dbgs() << "BB" << MBB->getNumber()
+ << " is return block without RETURN instr\n";);
return IsReturn;
}
@@ -692,8 +687,8 @@ void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) {
MachineBasicBlock::iterator E = MBB->end();
MachineBasicBlock::iterator It = Pre;
while (It != E) {
- if (Pre->getOpcode() == AMDGPU::CONTINUE
- && It->getOpcode() == AMDGPU::ENDLOOP)
+ if (Pre->getOpcode() == R600::CONTINUE
+ && It->getOpcode() == R600::ENDLOOP)
ContInstr.push_back(&*Pre);
Pre = It;
++It;
@@ -714,7 +709,7 @@ bool AMDGPUCFGStructurizer::prepare() {
//FIXME: if not reducible flow graph, make it so ???
- DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";);
+ LLVM_DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";);
orderBlocks(FuncRep);
@@ -757,14 +752,14 @@ bool AMDGPUCFGStructurizer::prepare() {
bool AMDGPUCFGStructurizer::run() {
//Assume reducible CFG...
- DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n");
+ LLVM_DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n");
#ifdef STRESSTEST
//Use the worse block ordering to test the algorithm.
ReverseVector(orderedBlks);
#endif
- DEBUG(dbgs() << "Ordered blocks:\n"; printOrderedBlocks(););
+ LLVM_DEBUG(dbgs() << "Ordered blocks:\n"; printOrderedBlocks(););
int NumIter = 0;
bool Finish = false;
MachineBasicBlock *MBB;
@@ -774,10 +769,8 @@ bool AMDGPUCFGStructurizer::run() {
do {
++NumIter;
- DEBUG(
- dbgs() << "numIter = " << NumIter
- << ", numRemaintedBlk = " << NumRemainedBlk << "\n";
- );
+ LLVM_DEBUG(dbgs() << "numIter = " << NumIter
+ << ", numRemaintedBlk = " << NumRemainedBlk << "\n";);
SmallVectorImpl<MachineBasicBlock *>::const_iterator It =
OrderedBlks.begin();
@@ -799,10 +792,8 @@ bool AMDGPUCFGStructurizer::run() {
SccBeginMBB = MBB;
SccNumIter = 0;
SccNumBlk = NumRemainedBlk; // Init to maximum possible number.
- DEBUG(
- dbgs() << "start processing SCC" << getSCCNum(SccBeginMBB);
- dbgs() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "start processing SCC" << getSCCNum(SccBeginMBB);
+ dbgs() << "\n";);
}
if (!isRetiredBlock(MBB))
@@ -817,20 +808,16 @@ bool AMDGPUCFGStructurizer::run() {
++SccNumIter;
int sccRemainedNumBlk = countActiveBlock(SccBeginIter, It);
if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= SccNumBlk) {
- DEBUG(
- dbgs() << "Can't reduce SCC " << getSCCNum(MBB)
- << ", sccNumIter = " << SccNumIter;
- dbgs() << "doesn't make any progress\n";
- );
+ LLVM_DEBUG(dbgs() << "Can't reduce SCC " << getSCCNum(MBB)
+ << ", sccNumIter = " << SccNumIter;
+ dbgs() << "doesn't make any progress\n";);
ContNextScc = true;
} else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < SccNumBlk) {
SccNumBlk = sccRemainedNumBlk;
It = SccBeginIter;
ContNextScc = false;
- DEBUG(
- dbgs() << "repeat processing SCC" << getSCCNum(MBB)
- << "sccNumIter = " << SccNumIter << '\n';
- );
+ LLVM_DEBUG(dbgs() << "repeat processing SCC" << getSCCNum(MBB)
+ << "sccNumIter = " << SccNumIter << '\n';);
} else {
// Finish the current scc.
ContNextScc = true;
@@ -848,9 +835,7 @@ bool AMDGPUCFGStructurizer::run() {
*GraphTraits<MachineFunction *>::nodes_begin(FuncRep);
if (EntryMBB->succ_size() == 0) {
Finish = true;
- DEBUG(
- dbgs() << "Reduce to one block\n";
- );
+ LLVM_DEBUG(dbgs() << "Reduce to one block\n";);
} else {
int NewnumRemainedBlk
= countActiveBlock(OrderedBlks.begin(), OrderedBlks.end());
@@ -860,9 +845,7 @@ bool AMDGPUCFGStructurizer::run() {
NumRemainedBlk = NewnumRemainedBlk;
} else {
MakeProgress = false;
- DEBUG(
- dbgs() << "No progress\n";
- );
+ LLVM_DEBUG(dbgs() << "No progress\n";);
}
}
} while (!Finish && MakeProgress);
@@ -875,9 +858,7 @@ bool AMDGPUCFGStructurizer::run() {
It != E; ++It) {
if ((*It).second && (*It).second->IsRetired) {
assert(((*It).first)->getNumber() != -1);
- DEBUG(
- dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n";);
(*It).first->eraseFromParent(); //Remove from the parent Function.
}
delete (*It).second;
@@ -886,7 +867,7 @@ bool AMDGPUCFGStructurizer::run() {
LLInfoMap.clear();
if (!Finish) {
- DEBUG(FuncRep->viewCFG());
+ LLVM_DEBUG(FuncRep->viewCFG());
report_fatal_error("IRREDUCIBLE_CFG");
}
@@ -920,17 +901,13 @@ int AMDGPUCFGStructurizer::patternMatch(MachineBasicBlock *MBB) {
int NumMatch = 0;
int CurMatch;
- DEBUG(
- dbgs() << "Begin patternMatch BB" << MBB->getNumber() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "Begin patternMatch BB" << MBB->getNumber() << "\n";);
while ((CurMatch = patternMatchGroup(MBB)) > 0)
NumMatch += CurMatch;
- DEBUG(
- dbgs() << "End patternMatch BB" << MBB->getNumber()
- << ", numMatch = " << NumMatch << "\n";
- );
+ LLVM_DEBUG(dbgs() << "End patternMatch BB" << MBB->getNumber()
+ << ", numMatch = " << NumMatch << "\n";);
return NumMatch;
}
@@ -1050,7 +1027,7 @@ int AMDGPUCFGStructurizer::loopendPatternMatch() {
for (MachineLoop *ExaminedLoop : NestedLoops) {
if (ExaminedLoop->getNumBlocks() == 0 || Visited[ExaminedLoop])
continue;
- DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump(););
+ LLVM_DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump(););
int NumBreak = mergeLoop(ExaminedLoop);
if (NumBreak == -1)
break;
@@ -1064,7 +1041,8 @@ int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) {
MBBVector ExitingMBBs;
LoopRep->getExitingBlocks(ExitingMBBs);
assert(!ExitingMBBs.empty() && "Infinite Loop not supported");
- DEBUG(dbgs() << "Loop has " << ExitingMBBs.size() << " exiting blocks\n";);
+ LLVM_DEBUG(dbgs() << "Loop has " << ExitingMBBs.size()
+ << " exiting blocks\n";);
// We assume a single ExitBlk
MBBVector ExitBlks;
LoopRep->getExitBlocks(ExitBlks);
@@ -1106,11 +1084,9 @@ bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak(
if (LoopRep&& LoopRep == MLI->getLoopFor(Src2MBB)) {
MachineBasicBlock *&TheEntry = LLInfoMap[LoopRep];
if (TheEntry) {
- DEBUG(
- dbgs() << "isLoopContBreakBlock yes src1 = BB"
- << Src1MBB->getNumber()
- << " src2 = BB" << Src2MBB->getNumber() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "isLoopContBreakBlock yes src1 = BB"
+ << Src1MBB->getNumber() << " src2 = BB"
+ << Src2MBB->getNumber() << "\n";);
return true;
}
}
@@ -1122,9 +1098,8 @@ int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB,
MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
int Num = handleJumpintoIfImp(HeadMBB, TrueMBB, FalseMBB);
if (Num == 0) {
- DEBUG(
- dbgs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n";
- );
+ LLVM_DEBUG(dbgs() << "handleJumpintoIf swap trueBlk and FalseBlk"
+ << "\n";);
Num = handleJumpintoIfImp(HeadMBB, FalseMBB, TrueMBB);
}
return Num;
@@ -1138,22 +1113,16 @@ int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
//trueBlk could be the common post dominator
DownBlk = TrueMBB;
- DEBUG(
- dbgs() << "handleJumpintoIfImp head = BB" << HeadMBB->getNumber()
- << " true = BB" << TrueMBB->getNumber()
- << ", numSucc=" << TrueMBB->succ_size()
- << " false = BB" << FalseMBB->getNumber() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "handleJumpintoIfImp head = BB" << HeadMBB->getNumber()
+ << " true = BB" << TrueMBB->getNumber()
+ << ", numSucc=" << TrueMBB->succ_size() << " false = BB"
+ << FalseMBB->getNumber() << "\n";);
while (DownBlk) {
- DEBUG(
- dbgs() << "check down = BB" << DownBlk->getNumber();
- );
+ LLVM_DEBUG(dbgs() << "check down = BB" << DownBlk->getNumber(););
if (singlePathTo(FalseMBB, DownBlk) == SinglePath_InPath) {
- DEBUG(
- dbgs() << " working\n";
- );
+ LLVM_DEBUG(dbgs() << " working\n";);
Num += cloneOnSideEntryTo(HeadMBB, TrueMBB, DownBlk);
Num += cloneOnSideEntryTo(HeadMBB, FalseMBB, DownBlk);
@@ -1166,9 +1135,7 @@ int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
break;
}
- DEBUG(
- dbgs() << " not working\n";
- );
+ LLVM_DEBUG(dbgs() << " not working\n";);
DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin()) : nullptr;
} // walk down the postDomTree
@@ -1247,10 +1214,9 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
if (!MigrateFalse && FalseMBB && FalseMBB->pred_size() > 1)
MigrateFalse = true;
- DEBUG(
- dbgs() << "before improveSimpleJumpintoIf: ";
- showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);
- );
+ LLVM_DEBUG(
+ dbgs() << "before improveSimpleJumpintoIf: ";
+ showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0););
// org: headBlk => if () {trueBlk} else {falseBlk} => landBlk
//
@@ -1337,15 +1303,15 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2);
- //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL"
- MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, AMDGPU::ENDIF);
+ //insert R600::ENDIF to avoid special case "input landBlk == NULL"
+ MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, R600::ENDIF);
if (LandBlkHasOtherPred) {
report_fatal_error("Extra register needed to handle CFG");
unsigned CmpResReg =
HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
report_fatal_error("Extra compare instruction needed to handle CFG");
- insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET,
+ insertCondBranchBefore(LandBlk, I, R600::IF_PREDICATE_SET,
CmpResReg, DebugLoc());
}
@@ -1353,7 +1319,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
// cause an assertion failure in the PostRA scheduling pass.
unsigned InitReg =
HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
- insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, InitReg,
+ insertCondBranchBefore(LandBlk, I, R600::IF_PREDICATE_SET, InitReg,
DebugLoc());
if (MigrateTrue) {
@@ -1363,7 +1329,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
// (initVal != 1).
report_fatal_error("Extra register needed to handle CFG");
}
- insertInstrBefore(I, AMDGPU::ELSE);
+ insertInstrBefore(I, R600::ELSE);
if (MigrateFalse) {
migrateInstruction(FalseMBB, LandBlk, I);
@@ -1375,7 +1341,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
if (LandBlkHasOtherPred) {
// add endif
- insertInstrBefore(I, AMDGPU::ENDIF);
+ insertInstrBefore(I, R600::ENDIF);
// put initReg = 2 to other predecessors of landBlk
for (MachineBasicBlock::pred_iterator PI = LandBlk->pred_begin(),
@@ -1385,10 +1351,9 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
report_fatal_error("Extra register needed to handle CFG");
}
}
- DEBUG(
- dbgs() << "result from improveSimpleJumpintoIf: ";
- showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);
- );
+ LLVM_DEBUG(
+ dbgs() << "result from improveSimpleJumpintoIf: ";
+ showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0););
// update landBlk
*LandMBBPtr = LandBlk;
@@ -1398,10 +1363,8 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB,
MachineBasicBlock *SrcMBB) {
- DEBUG(
- dbgs() << "serialPattern BB" << DstMBB->getNumber()
- << " <= BB" << SrcMBB->getNumber() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "serialPattern BB" << DstMBB->getNumber() << " <= BB"
+ << SrcMBB->getNumber() << "\n";);
DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end());
DstMBB->removeSuccessor(SrcMBB, true);
@@ -1416,26 +1379,15 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) {
assert (TrueMBB);
- DEBUG(
- dbgs() << "ifPattern BB" << MBB->getNumber();
- dbgs() << "{ ";
- if (TrueMBB) {
- dbgs() << "BB" << TrueMBB->getNumber();
- }
- dbgs() << " } else ";
- dbgs() << "{ ";
- if (FalseMBB) {
- dbgs() << "BB" << FalseMBB->getNumber();
- }
- dbgs() << " }\n ";
- dbgs() << "landBlock: ";
- if (!LandMBB) {
- dbgs() << "NULL";
- } else {
- dbgs() << "BB" << LandMBB->getNumber();
- }
- dbgs() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "ifPattern BB" << MBB->getNumber(); dbgs() << "{ ";
+ if (TrueMBB) { dbgs() << "BB" << TrueMBB->getNumber(); } dbgs()
+ << " } else ";
+ dbgs() << "{ "; if (FalseMBB) {
+ dbgs() << "BB" << FalseMBB->getNumber();
+ } dbgs() << " }\n ";
+ dbgs() << "landBlock: "; if (!LandMBB) { dbgs() << "NULL"; } else {
+ dbgs() << "BB" << LandMBB->getNumber();
+ } dbgs() << "\n";);
int OldOpcode = BranchMI->getOpcode();
DebugLoc BranchDL = BranchMI->getDebugLoc();
@@ -1462,7 +1414,7 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
}
if (FalseMBB) {
- insertInstrBefore(I, AMDGPU::ELSE);
+ insertInstrBefore(I, R600::ELSE);
MBB->splice(I, FalseMBB, FalseMBB->begin(),
FalseMBB->end());
MBB->removeSuccessor(FalseMBB, true);
@@ -1471,7 +1423,7 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
retireBlock(FalseMBB);
MLI->removeBlock(FalseMBB);
}
- insertInstrBefore(I, AMDGPU::ENDIF);
+ insertInstrBefore(I, R600::ENDIF);
BranchMI->eraseFromParent();
@@ -1481,18 +1433,19 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk,
MachineBasicBlock *LandMBB) {
- DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber()
- << " land = BB" << LandMBB->getNumber() << "\n";);
+ LLVM_DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber()
+ << " land = BB" << LandMBB->getNumber() << "\n";);
- insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc());
- insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc());
+ insertInstrBefore(DstBlk, R600::WHILELOOP, DebugLoc());
+ insertInstrEnd(DstBlk, R600::ENDLOOP, DebugLoc());
DstBlk->replaceSuccessor(DstBlk, LandMBB);
}
void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
MachineBasicBlock *LandMBB) {
- DEBUG(dbgs() << "loopbreakPattern exiting = BB" << ExitingMBB->getNumber()
- << " land = BB" << LandMBB->getNumber() << "\n";);
+ LLVM_DEBUG(dbgs() << "loopbreakPattern exiting = BB"
+ << ExitingMBB->getNumber() << " land = BB"
+ << LandMBB->getNumber() << "\n";);
MachineInstr *BranchMI = getLoopendBlockBranchInstr(ExitingMBB);
assert(BranchMI && isCondBranch(BranchMI));
DebugLoc DL = BranchMI->getDebugLoc();
@@ -1500,9 +1453,9 @@ void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
MachineBasicBlock::iterator I = BranchMI;
if (TrueBranch != LandMBB)
reversePredicateSetter(I, *I->getParent());
- insertCondBranchBefore(ExitingMBB, I, AMDGPU::IF_PREDICATE_SET, AMDGPU::PREDICATE_BIT, DL);
- insertInstrBefore(I, AMDGPU::BREAK);
- insertInstrBefore(I, AMDGPU::ENDIF);
+ insertCondBranchBefore(ExitingMBB, I, R600::IF_PREDICATE_SET, R600::PREDICATE_BIT, DL);
+ insertInstrBefore(I, R600::BREAK);
+ insertInstrBefore(I, R600::ENDIF);
//now branchInst can be erase safely
BranchMI->eraseFromParent();
//now take care of successors, retire blocks
@@ -1511,9 +1464,9 @@ void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB,
MachineBasicBlock *ContMBB) {
- DEBUG(dbgs() << "settleLoopcontBlock conting = BB"
- << ContingMBB->getNumber()
- << ", cont = BB" << ContMBB->getNumber() << "\n";);
+ LLVM_DEBUG(dbgs() << "settleLoopcontBlock conting = BB"
+ << ContingMBB->getNumber() << ", cont = BB"
+ << ContMBB->getNumber() << "\n";);
MachineInstr *MI = getLoopendBlockBranchInstr(ContingMBB);
if (MI) {
@@ -1531,8 +1484,8 @@ void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB,
getBranchZeroOpcode(OldOpcode);
insertCondBranchBefore(I, BranchOpcode, DL);
// insertEnd to ensure phi-moves, if exist, go before the continue-instr.
- insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, DL);
- insertInstrEnd(ContingMBB, AMDGPU::ENDIF, DL);
+ insertInstrEnd(ContingMBB, R600::CONTINUE, DL);
+ insertInstrEnd(ContingMBB, R600::ENDIF, DL);
} else {
int BranchOpcode =
TrueBranch == ContMBB ? getContinueNzeroOpcode(OldOpcode) :
@@ -1547,7 +1500,7 @@ void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB,
// location we've just inserted that reference here so it should be
// representative insertEnd to ensure phi-moves, if exist, go before the
// continue-instr.
- insertInstrEnd(ContingMBB, AMDGPU::CONTINUE,
+ insertInstrEnd(ContingMBB, R600::CONTINUE,
getLastDebugLocInBB(ContingMBB));
}
}
@@ -1587,10 +1540,9 @@ AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB,
numClonedInstr += MBB->size();
- DEBUG(
- dbgs() << "Cloned block: " << "BB"
- << MBB->getNumber() << "size " << MBB->size() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "Cloned block: "
+ << "BB" << MBB->getNumber() << "size " << MBB->size()
+ << "\n";);
SHOWNEWBLK(CloneMBB, "result of Cloned block: ");
@@ -1603,26 +1555,22 @@ void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB,
//look for the input branchinstr, not the AMDGPU branchinstr
MachineInstr *BranchMI = getNormalBlockBranchInstr(SrcMBB);
if (!BranchMI) {
- DEBUG(
- dbgs() << "migrateInstruction don't see branch instr\n";
- );
+ LLVM_DEBUG(dbgs() << "migrateInstruction don't see branch instr\n";);
SpliceEnd = SrcMBB->end();
} else {
- DEBUG(dbgs() << "migrateInstruction see branch instr: " << *BranchMI);
+ LLVM_DEBUG(dbgs() << "migrateInstruction see branch instr: " << *BranchMI);
SpliceEnd = BranchMI;
}
- DEBUG(
- dbgs() << "migrateInstruction before splice dstSize = " << DstMBB->size()
- << "srcSize = " << SrcMBB->size() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "migrateInstruction before splice dstSize = "
+ << DstMBB->size() << "srcSize = " << SrcMBB->size()
+ << "\n";);
//splice insert before insertPos
DstMBB->splice(I, SrcMBB, SrcMBB->begin(), SpliceEnd);
- DEBUG(
- dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size()
- << "srcSize = " << SrcMBB->size() << '\n';
- );
+ LLVM_DEBUG(dbgs() << "migrateInstruction after splice dstSize = "
+ << DstMBB->size() << "srcSize = " << SrcMBB->size()
+ << '\n';);
}
MachineBasicBlock *
@@ -1640,7 +1588,7 @@ AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) {
MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
FuncRep->push_back(DummyExitBlk); //insert to function
SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: ");
- DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";);
+ LLVM_DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";);
LLVMContext &Ctx = LoopHeader->getParent()->getFunction().getContext();
Ctx.emitError("Extra register needed to handle CFG");
return nullptr;
@@ -1653,7 +1601,7 @@ void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) {
// test_fc_do_while_or.c need to fix the upstream on this to remove the loop.
while ((BranchMI = getLoopendBlockBranchInstr(MBB))
&& isUncondBranch(BranchMI)) {
- DEBUG(dbgs() << "Removing uncond branch instr: " << *BranchMI);
+ LLVM_DEBUG(dbgs() << "Removing uncond branch instr: " << *BranchMI);
BranchMI->eraseFromParent();
}
}
@@ -1669,7 +1617,7 @@ void AMDGPUCFGStructurizer::removeRedundantConditionalBranch(
MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB);
assert(BranchMI && isCondBranch(BranchMI));
- DEBUG(dbgs() << "Removing unneeded cond branch instr: " << *BranchMI);
+ LLVM_DEBUG(dbgs() << "Removing unneeded cond branch instr: " << *BranchMI);
BranchMI->eraseFromParent();
SHOWNEWBLK(MBB1, "Removing redundant successor");
MBB->removeSuccessor(MBB1, true);
@@ -1679,7 +1627,7 @@ void AMDGPUCFGStructurizer::addDummyExitBlock(
SmallVectorImpl<MachineBasicBlock*> &RetMBB) {
MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
FuncRep->push_back(DummyExitBlk); //insert to function
- insertInstrEnd(DummyExitBlk, AMDGPU::RETURN);
+ insertInstrEnd(DummyExitBlk, R600::RETURN);
for (SmallVectorImpl<MachineBasicBlock *>::iterator It = RetMBB.begin(),
E = RetMBB.end(); It != E; ++It) {
@@ -1688,10 +1636,8 @@ void AMDGPUCFGStructurizer::addDummyExitBlock(
if (MI)
MI->eraseFromParent();
MBB->addSuccessor(DummyExitBlk);
- DEBUG(
- dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber()
- << " successors\n";
- );
+ LLVM_DEBUG(dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber()
+ << " successors\n";);
}
SHOWNEWBLK(DummyExitBlk, "DummyExitBlock: ");
}
@@ -1710,9 +1656,7 @@ void AMDGPUCFGStructurizer::recordSccnum(MachineBasicBlock *MBB,
}
void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) {
- DEBUG(
- dbgs() << "Retiring BB" << MBB->getNumber() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "Retiring BB" << MBB->getNumber() << "\n";);
BlockInformation *&SrcBlkInfo = BlockInfoMap[MBB];
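The AMDGPUCFGStructurizer hunks above are a mechanical rename of the DEBUG logging macro to LLVM_DEBUG; the logged statements themselves are unchanged apart from reflowing. As a reminder of how the renamed macro behaves, a minimal illustrative sketch (the DEBUG_TYPE string and the helper function are hypothetical, not taken from this file):

    #define DEBUG_TYPE "structurizecfg"        // illustrative pass name
    #include "llvm/Support/Debug.h"
    #include "llvm/Support/raw_ostream.h"

    static void logProgress(int NumIter, int NumRemainedBlk) {
      // Compiled out in release (NDEBUG) builds; at run time the body only
      // executes when -debug or -debug-only=structurizecfg is given.
      LLVM_DEBUG(llvm::dbgs() << "numIter = " << NumIter
                              << ", numRemainedBlk = " << NumRemainedBlk << "\n");
    }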
diff --git a/lib/Target/AMDGPU/AMDKernelCodeT.h b/lib/Target/AMDGPU/AMDKernelCodeT.h
index 5d243e949fd3..289642aaa2d0 100644
--- a/lib/Target/AMDGPU/AMDKernelCodeT.h
+++ b/lib/Target/AMDGPU/AMDKernelCodeT.h
@@ -198,7 +198,7 @@ enum amd_code_property_mask_t {
AMD_CODE_PROPERTY_RESERVED2 = ((1 << AMD_CODE_PROPERTY_RESERVED2_WIDTH) - 1) << AMD_CODE_PROPERTY_RESERVED2_SHIFT
};
-/// @brief The hsa_ext_control_directives_t specifies the values for the HSAIL
+/// The hsa_ext_control_directives_t specifies the values for the HSAIL
/// control directives. These control how the finalizer generates code. This
/// struct is used both as an argument to hsaFinalizeKernel to specify values for
/// the control directives, and is used in HsaKernelCode to record the values of
@@ -551,14 +551,8 @@ typedef struct amd_kernel_code_s {
int64_t kernel_code_prefetch_byte_offset;
uint64_t kernel_code_prefetch_byte_size;
- /// Number of bytes of scratch backing memory required for full
- /// occupancy of target chip. This takes into account the number of
- /// bytes of scratch per work-item, the wavefront size, the maximum
- /// number of wavefronts per CU, and the number of CUs. This is an
- /// upper limit on scratch. If the grid being dispatched is small it
- /// may only need less than this. If the kernel uses no scratch, or
- /// the Finalizer has not computed this value, it must be 0.
- uint64_t max_scratch_backing_memory_byte_size;
+ /// Reserved. Must be 0.
+ uint64_t reserved0;
/// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and
/// COMPUTE_PGM_RSRC2 registers.
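A small, hypothetical consumer-side check for the field renamed above: the deprecated scratch-size slot is now reserved0 and must stay zero. The struct and field names come from the header in this hunk; the checking function itself is illustrative.

    #include "AMDKernelCodeT.h"   // amd_kernel_code_t, as modified above

    static bool hasValidReservedScratchField(const amd_kernel_code_t &Code) {
      // The former max_scratch_backing_memory_byte_size slot is reserved
      // and must be written as 0 by producers.
      return Code.reserved0 == 0;
    }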
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index ebf656c549ec..31e2885c833d 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -12,6 +12,7 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "SIDefines.h"
+#include "SIInstrInfo.h"
#include "Utils/AMDGPUAsmUtils.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDKernelCodeTUtils.h"
@@ -25,7 +26,6 @@
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
@@ -42,9 +42,11 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/AMDGPUMetadata.h"
+#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/TargetRegistry.h"
@@ -60,6 +62,7 @@
using namespace llvm;
using namespace llvm::AMDGPU;
+using namespace llvm::amdhsa;
namespace {
@@ -128,6 +131,7 @@ public:
enum ImmTy {
ImmTyNone,
ImmTyGDS,
+ ImmTyLDS,
ImmTyOffen,
ImmTyIdxen,
ImmTyAddr64,
@@ -138,6 +142,7 @@ public:
ImmTyGLC,
ImmTySLC,
ImmTyTFE,
+ ImmTyD16,
ImmTyClampSI,
ImmTyOModSI,
ImmTyDppCtrl,
@@ -267,7 +272,11 @@ public:
return isOff() || isRegClass(AMDGPU::VGPR_32RegClassID);
}
- bool isSDWARegKind() const;
+ bool isSDWAOperand(MVT type) const;
+ bool isSDWAFP16Operand() const;
+ bool isSDWAFP32Operand() const;
+ bool isSDWAInt16Operand() const;
+ bool isSDWAInt32Operand() const;
bool isImmTy(ImmTy ImmT) const {
return isImm() && Imm.Type == ImmT;
@@ -282,7 +291,7 @@ public:
bool isDMask() const { return isImmTy(ImmTyDMask); }
bool isUNorm() const { return isImmTy(ImmTyUNorm); }
bool isDA() const { return isImmTy(ImmTyDA); }
- bool isR128() const { return isImmTy(ImmTyUNorm); }
+ bool isR128() const { return isImmTy(ImmTyR128); }
bool isLWE() const { return isImmTy(ImmTyLWE); }
bool isOff() const { return isImmTy(ImmTyOff); }
bool isExpTgt() const { return isImmTy(ImmTyExpTgt); }
@@ -298,9 +307,11 @@ public:
bool isOffsetU12() const { return (isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset)) && isUInt<12>(getImm()); }
bool isOffsetS13() const { return (isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset)) && isInt<13>(getImm()); }
bool isGDS() const { return isImmTy(ImmTyGDS); }
+ bool isLDS() const { return isImmTy(ImmTyLDS); }
bool isGLC() const { return isImmTy(ImmTyGLC); }
bool isSLC() const { return isImmTy(ImmTySLC); }
bool isTFE() const { return isImmTy(ImmTyTFE); }
+ bool isD16() const { return isImmTy(ImmTyD16); }
bool isDFMT() const { return isImmTy(ImmTyDFMT) && isUInt<8>(getImm()); }
bool isNFMT() const { return isImmTy(ImmTyNFMT) && isUInt<8>(getImm()); }
bool isBankMask() const { return isImmTy(ImmTyDppBankMask); }
@@ -434,7 +445,7 @@ public:
}
bool isVSrcB32() const {
- return isVCSrcF32() || isLiteralImm(MVT::i32);
+ return isVCSrcF32() || isLiteralImm(MVT::i32) || isExpr();
}
bool isVSrcB64() const {
@@ -451,7 +462,7 @@ public:
}
bool isVSrcF32() const {
- return isVCSrcF32() || isLiteralImm(MVT::f32);
+ return isVCSrcF32() || isLiteralImm(MVT::f32) || isExpr();
}
bool isVSrcF64() const {
@@ -643,6 +654,7 @@ public:
switch (Type) {
case ImmTyNone: OS << "None"; break;
case ImmTyGDS: OS << "GDS"; break;
+ case ImmTyLDS: OS << "LDS"; break;
case ImmTyOffen: OS << "Offen"; break;
case ImmTyIdxen: OS << "Idxen"; break;
case ImmTyAddr64: OS << "Addr64"; break;
@@ -653,6 +665,7 @@ public:
case ImmTyGLC: OS << "GLC"; break;
case ImmTySLC: OS << "SLC"; break;
case ImmTyTFE: OS << "TFE"; break;
+ case ImmTyD16: OS << "D16"; break;
case ImmTyDFMT: OS << "DFMT"; break;
case ImmTyNFMT: OS << "NFMT"; break;
case ImmTyClampSI: OS << "ClampSI"; break;
@@ -815,6 +828,10 @@ public:
class AMDGPUAsmParser : public MCTargetAsmParser {
MCAsmParser &Parser;
+ // Number of extra operands parsed after the first optional operand.
+ // This may be necessary to skip hardcoded mandatory operands.
+ static const unsigned MAX_OPR_LOOKAHEAD = 8;
+
unsigned ForcedEncodingSize = 0;
bool ForcedDPP = false;
bool ForcedSDWA = false;
@@ -830,6 +847,27 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
private:
bool ParseAsAbsoluteExpression(uint32_t &Ret);
+ bool OutOfRangeError(SMRange Range);
+ /// Calculate VGPR/SGPR blocks required for given target, reserved
+ /// registers, and user-specified NextFreeXGPR values.
+ ///
+ /// \param Features [in] Target features, used for bug corrections.
+ /// \param VCCUsed [in] Whether VCC special SGPR is reserved.
+ /// \param FlatScrUsed [in] Whether FLAT_SCRATCH special SGPR is reserved.
+ /// \param XNACKUsed [in] Whether XNACK_MASK special SGPR is reserved.
+ /// \param NextFreeVGPR [in] Max VGPR number referenced, plus one.
+ /// \param VGPRRange [in] Token range, used for VGPR diagnostics.
+ /// \param NextFreeSGPR [in] Max SGPR number referenced, plus one.
+ /// \param SGPRRange [in] Token range, used for SGPR diagnostics.
+ /// \param VGPRBlocks [out] Result VGPR block count.
+ /// \param SGPRBlocks [out] Result SGPR block count.
+ bool calculateGPRBlocks(const FeatureBitset &Features, bool VCCUsed,
+ bool FlatScrUsed, bool XNACKUsed,
+ unsigned NextFreeVGPR, SMRange VGPRRange,
+ unsigned NextFreeSGPR, SMRange SGPRRange,
+ unsigned &VGPRBlocks, unsigned &SGPRBlocks);
+ bool ParseDirectiveAMDGCNTarget();
+ bool ParseDirectiveAMDHSAKernel();
bool ParseDirectiveMajorMinor(uint32_t &Major, uint32_t &Minor);
bool ParseDirectiveHSACodeObjectVersion();
bool ParseDirectiveHSACodeObjectISA();
@@ -848,8 +886,12 @@ private:
bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg,
unsigned& RegNum, unsigned& RegWidth,
unsigned *DwordRegIndex);
+ Optional<StringRef> getGprCountSymbolName(RegisterKind RegKind);
+ void initializeGprCountSymbol(RegisterKind RegKind);
+ bool updateGprCountSymbols(RegisterKind RegKind, unsigned DwordRegIndex,
+ unsigned RegWidth);
void cvtMubufImpl(MCInst &Inst, const OperandVector &Operands,
- bool IsAtomic, bool IsAtomicReturn);
+ bool IsAtomic, bool IsAtomicReturn, bool IsLds = false);
void cvtDSImpl(MCInst &Inst, const OperandVector &Operands,
bool IsGdsHardcoded);
@@ -881,15 +923,37 @@ public:
AMDGPU::IsaInfo::IsaVersion ISA =
AMDGPU::IsaInfo::getIsaVersion(getFeatureBits());
MCContext &Ctx = getContext();
- MCSymbol *Sym =
- Ctx.getOrCreateSymbol(Twine(".option.machine_version_major"));
- Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx));
- Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_minor"));
- Sym->setVariableValue(MCConstantExpr::create(ISA.Minor, Ctx));
- Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping"));
- Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx));
+ if (ISA.Major >= 6 && AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) {
+ MCSymbol *Sym =
+ Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_number"));
+ Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx));
+ } else {
+ MCSymbol *Sym =
+ Ctx.getOrCreateSymbol(Twine(".option.machine_version_major"));
+ Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx));
+ Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_minor"));
+ Sym->setVariableValue(MCConstantExpr::create(ISA.Minor, Ctx));
+ Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping"));
+ Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx));
+ }
+ if (ISA.Major >= 6 && AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) {
+ initializeGprCountSymbol(IS_VGPR);
+ initializeGprCountSymbol(IS_SGPR);
+ } else
+ KernelScope.initialize(getContext());
}
- KernelScope.initialize(getContext());
+ }
+
+ bool hasXNACK() const {
+ return AMDGPU::hasXNACK(getSTI());
+ }
+
+ bool hasMIMG_R128() const {
+ return AMDGPU::hasMIMG_R128(getSTI());
+ }
+
+ bool hasPackedD16() const {
+ return AMDGPU::hasPackedD16(getSTI());
}
bool isSI() const {
@@ -1025,6 +1089,11 @@ private:
bool validateConstantBusLimitations(const MCInst &Inst);
bool validateEarlyClobberLimitations(const MCInst &Inst);
bool validateIntClampSupported(const MCInst &Inst);
+ bool validateMIMGAtomicDMask(const MCInst &Inst);
+ bool validateMIMGGatherDMask(const MCInst &Inst);
+ bool validateMIMGDataSize(const MCInst &Inst);
+ bool validateMIMGR128(const MCInst &Inst);
+ bool validateMIMGD16(const MCInst &Inst);
bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const;
@@ -1037,6 +1106,7 @@ private:
public:
OperandMatchResultTy parseOptionalOperand(OperandVector &Operands);
+ OperandMatchResultTy parseOptionalOpr(OperandVector &Operands);
OperandMatchResultTy parseExpTgt(OperandVector &Operands);
OperandMatchResultTy parseSendMsgOp(OperandVector &Operands);
@@ -1060,17 +1130,12 @@ public:
void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); }
void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); }
void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); }
+ void cvtMubufLds(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false, true); }
void cvtMtbuf(MCInst &Inst, const OperandVector &Operands);
AMDGPUOperand::Ptr defaultGLC() const;
AMDGPUOperand::Ptr defaultSLC() const;
- AMDGPUOperand::Ptr defaultTFE() const;
- AMDGPUOperand::Ptr defaultDMask() const;
- AMDGPUOperand::Ptr defaultUNorm() const;
- AMDGPUOperand::Ptr defaultDA() const;
- AMDGPUOperand::Ptr defaultR128() const;
- AMDGPUOperand::Ptr defaultLWE() const;
AMDGPUOperand::Ptr defaultSMRDOffset8() const;
AMDGPUOperand::Ptr defaultSMRDOffset20() const;
AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const;
@@ -1276,15 +1341,31 @@ bool AMDGPUOperand::isRegClass(unsigned RCID) const {
return isRegKind() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg());
}
-bool AMDGPUOperand::isSDWARegKind() const {
+bool AMDGPUOperand::isSDWAOperand(MVT type) const {
if (AsmParser->isVI())
return isVReg();
else if (AsmParser->isGFX9())
- return isRegKind();
+ return isRegKind() || isInlinableImm(type);
else
return false;
}
+bool AMDGPUOperand::isSDWAFP16Operand() const {
+ return isSDWAOperand(MVT::f16);
+}
+
+bool AMDGPUOperand::isSDWAFP32Operand() const {
+ return isSDWAOperand(MVT::f32);
+}
+
+bool AMDGPUOperand::isSDWAInt16Operand() const {
+ return isSDWAOperand(MVT::i16);
+}
+
+bool AMDGPUOperand::isSDWAInt32Operand() const {
+ return isSDWAOperand(MVT::i32);
+}
+
uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const
{
assert(isImmTy(ImmTyNone) && Imm.Mods.hasFPModifiers());
@@ -1516,12 +1597,15 @@ static unsigned getSpecialRegForName(StringRef RegName) {
.Case("exec", AMDGPU::EXEC)
.Case("vcc", AMDGPU::VCC)
.Case("flat_scratch", AMDGPU::FLAT_SCR)
+ .Case("xnack_mask", AMDGPU::XNACK_MASK)
.Case("m0", AMDGPU::M0)
.Case("scc", AMDGPU::SCC)
.Case("tba", AMDGPU::TBA)
.Case("tma", AMDGPU::TMA)
.Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
.Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
+ .Case("xnack_mask_lo", AMDGPU::XNACK_MASK_LO)
+ .Case("xnack_mask_hi", AMDGPU::XNACK_MASK_HI)
.Case("vcc_lo", AMDGPU::VCC_LO)
.Case("vcc_hi", AMDGPU::VCC_HI)
.Case("exec_lo", AMDGPU::EXEC_LO)
@@ -1559,6 +1643,11 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth,
RegWidth = 2;
return true;
}
+ if (Reg == AMDGPU::XNACK_MASK_LO && Reg1 == AMDGPU::XNACK_MASK_HI) {
+ Reg = AMDGPU::XNACK_MASK;
+ RegWidth = 2;
+ return true;
+ }
if (Reg == AMDGPU::VCC_LO && Reg1 == AMDGPU::VCC_HI) {
Reg = AMDGPU::VCC;
RegWidth = 2;
@@ -1717,6 +1806,54 @@ bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg,
return true;
}
+Optional<StringRef>
+AMDGPUAsmParser::getGprCountSymbolName(RegisterKind RegKind) {
+ switch (RegKind) {
+ case IS_VGPR:
+ return StringRef(".amdgcn.next_free_vgpr");
+ case IS_SGPR:
+ return StringRef(".amdgcn.next_free_sgpr");
+ default:
+ return None;
+ }
+}
+
+void AMDGPUAsmParser::initializeGprCountSymbol(RegisterKind RegKind) {
+ auto SymbolName = getGprCountSymbolName(RegKind);
+ assert(SymbolName && "initializing invalid register kind");
+ MCSymbol *Sym = getContext().getOrCreateSymbol(*SymbolName);
+ Sym->setVariableValue(MCConstantExpr::create(0, getContext()));
+}
+
+bool AMDGPUAsmParser::updateGprCountSymbols(RegisterKind RegKind,
+ unsigned DwordRegIndex,
+ unsigned RegWidth) {
+ // Symbols are only defined for GCN targets
+ if (AMDGPU::IsaInfo::getIsaVersion(getFeatureBits()).Major < 6)
+ return true;
+
+ auto SymbolName = getGprCountSymbolName(RegKind);
+ if (!SymbolName)
+ return true;
+ MCSymbol *Sym = getContext().getOrCreateSymbol(*SymbolName);
+
+ int64_t NewMax = DwordRegIndex + RegWidth - 1;
+ int64_t OldCount;
+
+ if (!Sym->isVariable())
+ return !Error(getParser().getTok().getLoc(),
+ ".amdgcn.next_free_{v,s}gpr symbols must be variable");
+ if (!Sym->getVariableValue(false)->evaluateAsAbsolute(OldCount))
+ return !Error(
+ getParser().getTok().getLoc(),
+ ".amdgcn.next_free_{v,s}gpr symbols must be absolute expressions");
+
+ if (OldCount <= NewMax)
+ Sym->setVariableValue(MCConstantExpr::create(NewMax + 1, getContext()));
+
+ return true;
+}
+
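updateGprCountSymbols above keeps the .amdgcn.next_free_{v,s}gpr symbols at one past the highest register index referenced so far. A minimal sketch of that bookkeeping, assuming a plain integer counter instead of an MCSymbol (the helper name is illustrative):

    #include <cstdint>

    // NextFree mirrors ".amdgcn.next_free_vgpr" / ".amdgcn.next_free_sgpr".
    static void noteRegisterUse(int64_t &NextFree, unsigned DwordRegIndex,
                                unsigned RegWidth) {
      const int64_t NewMax = DwordRegIndex + RegWidth - 1; // highest index used
      if (NextFree <= NewMax)
        NextFree = NewMax + 1;                             // one past the max
    }

For example, a reference covering registers 8..11 (DwordRegIndex 8, RegWidth 4) raises the counter to 12.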
std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() {
const auto &Tok = Parser.getTok();
SMLoc StartLoc = Tok.getLoc();
@@ -1727,7 +1864,11 @@ std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() {
if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth, &DwordRegIndex)) {
return nullptr;
}
- KernelScope.usesRegister(RegKind, DwordRegIndex, RegWidth);
+ if (AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) {
+ if (!updateGprCountSymbols(RegKind, DwordRegIndex, RegWidth))
+ return nullptr;
+ } else
+ KernelScope.usesRegister(RegKind, DwordRegIndex, RegWidth);
return AMDGPUOperand::CreateReg(this, Reg, StartLoc, EndLoc, false);
}
@@ -2234,6 +2375,111 @@ bool AMDGPUAsmParser::validateIntClampSupported(const MCInst &Inst) {
return true;
}
+bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst) {
+
+ const unsigned Opc = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opc);
+
+ if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0)
+ return true;
+
+ int VDataIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
+ int DMaskIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dmask);
+ int TFEIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::tfe);
+
+ assert(VDataIdx != -1);
+ assert(DMaskIdx != -1);
+ assert(TFEIdx != -1);
+
+ unsigned VDataSize = AMDGPU::getRegOperandSize(getMRI(), Desc, VDataIdx);
+ unsigned TFESize = Inst.getOperand(TFEIdx).getImm()? 1 : 0;
+ unsigned DMask = Inst.getOperand(DMaskIdx).getImm() & 0xf;
+ if (DMask == 0)
+ DMask = 1;
+
+ unsigned DataSize =
+ (Desc.TSFlags & SIInstrFlags::Gather4) ? 4 : countPopulation(DMask);
+ if (hasPackedD16()) {
+ int D16Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::d16);
+ if (D16Idx >= 0 && Inst.getOperand(D16Idx).getImm())
+ DataSize = (DataSize + 1) / 2;
+ }
+
+ return (VDataSize / 4) == DataSize + TFESize;
+}
+
+bool AMDGPUAsmParser::validateMIMGAtomicDMask(const MCInst &Inst) {
+
+ const unsigned Opc = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opc);
+
+ if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0)
+ return true;
+ if (!Desc.mayLoad() || !Desc.mayStore())
+ return true; // Not atomic
+
+ int DMaskIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dmask);
+ unsigned DMask = Inst.getOperand(DMaskIdx).getImm() & 0xf;
+
+ // This is an incomplete check because image_atomic_cmpswap
+ // may only use 0x3 and 0xf while other atomic operations
+ // may use 0x1 and 0x3. However these limitations are
+ // verified when we check that dmask matches dst size.
+ return DMask == 0x1 || DMask == 0x3 || DMask == 0xf;
+}
+
+bool AMDGPUAsmParser::validateMIMGGatherDMask(const MCInst &Inst) {
+
+ const unsigned Opc = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opc);
+
+ if ((Desc.TSFlags & SIInstrFlags::Gather4) == 0)
+ return true;
+
+ int DMaskIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dmask);
+ unsigned DMask = Inst.getOperand(DMaskIdx).getImm() & 0xf;
+
+ // GATHER4 instructions use dmask in a different fashion compared to
+ // other MIMG instructions. The only useful DMASK values are
+ // 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
+ // (red,red,red,red) etc.) The ISA document doesn't mention
+ // this.
+ return DMask == 0x1 || DMask == 0x2 || DMask == 0x4 || DMask == 0x8;
+}
+
+bool AMDGPUAsmParser::validateMIMGR128(const MCInst &Inst) {
+
+ const unsigned Opc = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opc);
+
+ if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0)
+ return true;
+
+ int Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::r128);
+ assert(Idx != -1);
+
+ bool R128 = (Inst.getOperand(Idx).getImm() != 0);
+
+ return !R128 || hasMIMG_R128();
+}
+
+bool AMDGPUAsmParser::validateMIMGD16(const MCInst &Inst) {
+
+ const unsigned Opc = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opc);
+
+ if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0)
+ return true;
+
+ int D16Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::d16);
+ if (D16Idx >= 0 && Inst.getOperand(D16Idx).getImm()) {
+ if (isCI() || isSI())
+ return false;
+ }
+
+ return true;
+}
+
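Taken together, the MIMG validators above enforce a simple size rule: the vdata operand must provide popcount(dmask) dwords of data (a fixed 4 for gather4), halved when packed d16 is in effect, plus one extra dword when tfe is set; atomics and gather4 additionally restrict which dmask values are legal. A hedged, standalone re-derivation of the size part (function and parameter names are illustrative, not the parser's API):

    #include <bitset>

    static unsigned expectedVDataDwords(unsigned DMask, bool IsGather4,
                                        bool TFE, bool PackedD16) {
      DMask &= 0xf;
      if (DMask == 0)
        DMask = 1;                                 // dmask 0 still writes one channel
      unsigned Data = IsGather4 ? 4u
                                : (unsigned)std::bitset<4>(DMask).count();
      if (PackedD16)
        Data = (Data + 1) / 2;                     // two 16-bit channels per dword
      return Data + (TFE ? 1u : 0u);               // tfe appends one extra dword
    }

Atomics further restrict dmask to 0x1, 0x3 or 0xf, and gather4 to a single set bit (0x1/0x2/0x4/0x8), as the other two validators check.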
bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
const SMLoc &IDLoc) {
if (!validateConstantBusLimitations(Inst)) {
@@ -2251,6 +2497,32 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
"integer clamping is not supported on this GPU");
return false;
}
+ if (!validateMIMGR128(Inst)) {
+ Error(IDLoc,
+ "r128 modifier is not supported on this GPU");
+ return false;
+ }
+ // For MUBUF/MTBUF d16 is a part of opcode, so there is nothing to validate.
+ if (!validateMIMGD16(Inst)) {
+ Error(IDLoc,
+ "d16 modifier is not supported on this GPU");
+ return false;
+ }
+ if (!validateMIMGDataSize(Inst)) {
+ Error(IDLoc,
+ "image data size does not match dmask and tfe");
+ return false;
+ }
+ if (!validateMIMGAtomicDMask(Inst)) {
+ Error(IDLoc,
+ "invalid atomic image dmask");
+ return false;
+ }
+ if (!validateMIMGGatherDMask(Inst)) {
+ Error(IDLoc,
+ "invalid image_gather dmask: only one bit must be set");
+ return false;
+ }
return true;
}
@@ -2355,6 +2627,320 @@ bool AMDGPUAsmParser::ParseDirectiveMajorMinor(uint32_t &Major,
return false;
}
+bool AMDGPUAsmParser::ParseDirectiveAMDGCNTarget() {
+ if (getSTI().getTargetTriple().getArch() != Triple::amdgcn)
+ return TokError("directive only supported for amdgcn architecture");
+
+ std::string Target;
+
+ SMLoc TargetStart = getTok().getLoc();
+ if (getParser().parseEscapedString(Target))
+ return true;
+ SMRange TargetRange = SMRange(TargetStart, getTok().getLoc());
+
+ std::string ExpectedTarget;
+ raw_string_ostream ExpectedTargetOS(ExpectedTarget);
+ IsaInfo::streamIsaVersion(&getSTI(), ExpectedTargetOS);
+
+ if (Target != ExpectedTargetOS.str())
+ return getParser().Error(TargetRange.Start, "target must match options",
+ TargetRange);
+
+ getTargetStreamer().EmitDirectiveAMDGCNTarget(Target);
+ return false;
+}
+
+bool AMDGPUAsmParser::OutOfRangeError(SMRange Range) {
+ return getParser().Error(Range.Start, "value out of range", Range);
+}
+
+bool AMDGPUAsmParser::calculateGPRBlocks(
+ const FeatureBitset &Features, bool VCCUsed, bool FlatScrUsed,
+ bool XNACKUsed, unsigned NextFreeVGPR, SMRange VGPRRange,
+ unsigned NextFreeSGPR, SMRange SGPRRange, unsigned &VGPRBlocks,
+ unsigned &SGPRBlocks) {
+ // TODO(scott.linder): These calculations are duplicated from
+ // AMDGPUAsmPrinter::getSIProgramInfo and could be unified.
+ IsaInfo::IsaVersion Version = IsaInfo::getIsaVersion(Features);
+
+ unsigned NumVGPRs = NextFreeVGPR;
+ unsigned NumSGPRs = NextFreeSGPR;
+ unsigned MaxAddressableNumSGPRs = IsaInfo::getAddressableNumSGPRs(Features);
+
+ if (Version.Major >= 8 && !Features.test(FeatureSGPRInitBug) &&
+ NumSGPRs > MaxAddressableNumSGPRs)
+ return OutOfRangeError(SGPRRange);
+
+ NumSGPRs +=
+ IsaInfo::getNumExtraSGPRs(Features, VCCUsed, FlatScrUsed, XNACKUsed);
+
+ if ((Version.Major <= 7 || Features.test(FeatureSGPRInitBug)) &&
+ NumSGPRs > MaxAddressableNumSGPRs)
+ return OutOfRangeError(SGPRRange);
+
+ if (Features.test(FeatureSGPRInitBug))
+ NumSGPRs = IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
+
+ VGPRBlocks = IsaInfo::getNumVGPRBlocks(Features, NumVGPRs);
+ SGPRBlocks = IsaInfo::getNumSGPRBlocks(Features, NumSGPRs);
+
+ return false;
+}
+
+bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
+ if (getSTI().getTargetTriple().getArch() != Triple::amdgcn)
+ return TokError("directive only supported for amdgcn architecture");
+
+ if (getSTI().getTargetTriple().getOS() != Triple::AMDHSA)
+ return TokError("directive only supported for amdhsa OS");
+
+ StringRef KernelName;
+ if (getParser().parseIdentifier(KernelName))
+ return true;
+
+ kernel_descriptor_t KD = getDefaultAmdhsaKernelDescriptor();
+
+ StringSet<> Seen;
+
+ IsaInfo::IsaVersion IVersion =
+ IsaInfo::getIsaVersion(getSTI().getFeatureBits());
+
+ SMRange VGPRRange;
+ uint64_t NextFreeVGPR = 0;
+ SMRange SGPRRange;
+ uint64_t NextFreeSGPR = 0;
+ unsigned UserSGPRCount = 0;
+ bool ReserveVCC = true;
+ bool ReserveFlatScr = true;
+ bool ReserveXNACK = hasXNACK();
+
+ while (true) {
+ while (getLexer().is(AsmToken::EndOfStatement))
+ Lex();
+
+ if (getLexer().isNot(AsmToken::Identifier))
+ return TokError("expected .amdhsa_ directive or .end_amdhsa_kernel");
+
+ StringRef ID = getTok().getIdentifier();
+ SMRange IDRange = getTok().getLocRange();
+ Lex();
+
+ if (ID == ".end_amdhsa_kernel")
+ break;
+
+ if (Seen.find(ID) != Seen.end())
+ return TokError(".amdhsa_ directives cannot be repeated");
+ Seen.insert(ID);
+
+ SMLoc ValStart = getTok().getLoc();
+ int64_t IVal;
+ if (getParser().parseAbsoluteExpression(IVal))
+ return true;
+ SMLoc ValEnd = getTok().getLoc();
+ SMRange ValRange = SMRange(ValStart, ValEnd);
+
+ if (IVal < 0)
+ return OutOfRangeError(ValRange);
+
+ uint64_t Val = IVal;
+
+#define PARSE_BITS_ENTRY(FIELD, ENTRY, VALUE, RANGE) \
+ if (!isUInt<ENTRY##_WIDTH>(VALUE)) \
+ return OutOfRangeError(RANGE); \
+ AMDHSA_BITS_SET(FIELD, ENTRY, VALUE);
+
+ if (ID == ".amdhsa_group_segment_fixed_size") {
+ if (!isUInt<sizeof(KD.group_segment_fixed_size) * CHAR_BIT>(Val))
+ return OutOfRangeError(ValRange);
+ KD.group_segment_fixed_size = Val;
+ } else if (ID == ".amdhsa_private_segment_fixed_size") {
+ if (!isUInt<sizeof(KD.private_segment_fixed_size) * CHAR_BIT>(Val))
+ return OutOfRangeError(ValRange);
+ KD.private_segment_fixed_size = Val;
+ } else if (ID == ".amdhsa_user_sgpr_private_segment_buffer") {
+ PARSE_BITS_ENTRY(KD.kernel_code_properties,
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER,
+ Val, ValRange);
+ UserSGPRCount++;
+ } else if (ID == ".amdhsa_user_sgpr_dispatch_ptr") {
+ PARSE_BITS_ENTRY(KD.kernel_code_properties,
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, Val,
+ ValRange);
+ UserSGPRCount++;
+ } else if (ID == ".amdhsa_user_sgpr_queue_ptr") {
+ PARSE_BITS_ENTRY(KD.kernel_code_properties,
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR, Val,
+ ValRange);
+ UserSGPRCount++;
+ } else if (ID == ".amdhsa_user_sgpr_kernarg_segment_ptr") {
+ PARSE_BITS_ENTRY(KD.kernel_code_properties,
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR,
+ Val, ValRange);
+ UserSGPRCount++;
+ } else if (ID == ".amdhsa_user_sgpr_dispatch_id") {
+ PARSE_BITS_ENTRY(KD.kernel_code_properties,
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID, Val,
+ ValRange);
+ UserSGPRCount++;
+ } else if (ID == ".amdhsa_user_sgpr_flat_scratch_init") {
+ PARSE_BITS_ENTRY(KD.kernel_code_properties,
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, Val,
+ ValRange);
+ UserSGPRCount++;
+ } else if (ID == ".amdhsa_user_sgpr_private_segment_size") {
+ PARSE_BITS_ENTRY(KD.kernel_code_properties,
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE,
+ Val, ValRange);
+ UserSGPRCount++;
+ } else if (ID == ".amdhsa_system_sgpr_private_segment_wavefront_offset") {
+ PARSE_BITS_ENTRY(
+ KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_SGPR_PRIVATE_SEGMENT_WAVEFRONT_OFFSET, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_system_sgpr_workgroup_id_x") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_system_sgpr_workgroup_id_y") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_system_sgpr_workgroup_id_z") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_system_sgpr_workgroup_info") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_system_vgpr_workitem_id") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_next_free_vgpr") {
+ VGPRRange = ValRange;
+ NextFreeVGPR = Val;
+ } else if (ID == ".amdhsa_next_free_sgpr") {
+ SGPRRange = ValRange;
+ NextFreeSGPR = Val;
+ } else if (ID == ".amdhsa_reserve_vcc") {
+ if (!isUInt<1>(Val))
+ return OutOfRangeError(ValRange);
+ ReserveVCC = Val;
+ } else if (ID == ".amdhsa_reserve_flat_scratch") {
+ if (IVersion.Major < 7)
+ return getParser().Error(IDRange.Start, "directive requires gfx7+",
+ IDRange);
+ if (!isUInt<1>(Val))
+ return OutOfRangeError(ValRange);
+ ReserveFlatScr = Val;
+ } else if (ID == ".amdhsa_reserve_xnack_mask") {
+ if (IVersion.Major < 8)
+ return getParser().Error(IDRange.Start, "directive requires gfx8+",
+ IDRange);
+ if (!isUInt<1>(Val))
+ return OutOfRangeError(ValRange);
+ ReserveXNACK = Val;
+ } else if (ID == ".amdhsa_float_round_mode_32") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
+ COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32, Val, ValRange);
+ } else if (ID == ".amdhsa_float_round_mode_16_64") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
+ COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64, Val, ValRange);
+ } else if (ID == ".amdhsa_float_denorm_mode_32") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
+ COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32, Val, ValRange);
+ } else if (ID == ".amdhsa_float_denorm_mode_16_64") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
+ COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_dx10_clamp") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
+ COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP, Val, ValRange);
+ } else if (ID == ".amdhsa_ieee_mode") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE,
+ Val, ValRange);
+ } else if (ID == ".amdhsa_fp16_overflow") {
+ if (IVersion.Major < 9)
+ return getParser().Error(IDRange.Start, "directive requires gfx9+",
+ IDRange);
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_FP16_OVFL, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_exception_fp_ieee_invalid_op") {
+ PARSE_BITS_ENTRY(
+ KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_exception_fp_denorm_src") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE,
+ Val, ValRange);
+ } else if (ID == ".amdhsa_exception_fp_ieee_div_zero") {
+ PARSE_BITS_ENTRY(
+ KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_exception_fp_ieee_overflow") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW,
+ Val, ValRange);
+ } else if (ID == ".amdhsa_exception_fp_ieee_underflow") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW,
+ Val, ValRange);
+ } else if (ID == ".amdhsa_exception_fp_ieee_inexact") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT,
+ Val, ValRange);
+ } else if (ID == ".amdhsa_exception_int_div_zero") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO,
+ Val, ValRange);
+ } else {
+ return getParser().Error(IDRange.Start,
+ "unknown .amdhsa_kernel directive", IDRange);
+ }
+
+#undef PARSE_BITS_ENTRY
+ }
+
+ if (Seen.find(".amdhsa_next_free_vgpr") == Seen.end())
+ return TokError(".amdhsa_next_free_vgpr directive is required");
+
+ if (Seen.find(".amdhsa_next_free_sgpr") == Seen.end())
+ return TokError(".amdhsa_next_free_sgpr directive is required");
+
+ unsigned VGPRBlocks;
+ unsigned SGPRBlocks;
+ if (calculateGPRBlocks(getFeatureBits(), ReserveVCC, ReserveFlatScr,
+ ReserveXNACK, NextFreeVGPR, VGPRRange, NextFreeSGPR,
+ SGPRRange, VGPRBlocks, SGPRBlocks))
+ return true;
+
+ if (!isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_WIDTH>(
+ VGPRBlocks))
+ return OutOfRangeError(VGPRRange);
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
+ COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT, VGPRBlocks);
+
+ if (!isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_WIDTH>(
+ SGPRBlocks))
+ return OutOfRangeError(SGPRRange);
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
+ COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT,
+ SGPRBlocks);
+
+ if (!isUInt<COMPUTE_PGM_RSRC2_USER_SGPR_COUNT_WIDTH>(UserSGPRCount))
+ return TokError("too many user SGPRs enabled");
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc2, COMPUTE_PGM_RSRC2_USER_SGPR_COUNT,
+ UserSGPRCount);
+
+ getTargetStreamer().EmitAmdhsaKernelDescriptor(
+ getSTI(), KernelName, KD, NextFreeVGPR, NextFreeSGPR, ReserveVCC,
+ ReserveFlatScr, ReserveXNACK);
+ return false;
+}
+
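Each .amdhsa_* directive above funnels through PARSE_BITS_ENTRY, which is just a field-width range check followed by a masked update of the packed descriptor word. A generic sketch of that pattern, with illustrative names in place of the real *_SHIFT/*_WIDTH enumerators and an assert where the parser reports OutOfRangeError:

    #include <cassert>
    #include <cstdint>

    static void setDescriptorField(uint32_t &Dst, unsigned Shift,
                                   unsigned Width, uint64_t Val) {
      const uint64_t Max = (1ull << Width) - 1;    // field-width range check
      assert(Val <= Max && "value out of range for kernel descriptor field");
      Dst = (Dst & ~uint32_t(Max << Shift)) | uint32_t(Val << Shift);
    }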
bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectVersion() {
uint32_t Major;
uint32_t Minor;
@@ -2421,6 +3007,13 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() {
bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID,
amd_kernel_code_t &Header) {
+ // max_scratch_backing_memory_byte_size is deprecated. Ignore it while parsing
+ // assembly for backwards compatibility.
+ if (ID == "max_scratch_backing_memory_byte_size") {
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
SmallString<40> ErrStr;
raw_svector_ostream Err(ErrStr);
if (!parseAmdKernelCodeField(ID, getParser(), Header, Err)) {
@@ -2467,7 +3060,8 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaKernel() {
getTargetStreamer().EmitAMDGPUSymbolType(KernelName,
ELF::STT_AMDGPU_HSA_KERNEL);
Lex();
- KernelScope.initialize(getContext());
+ if (!AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI()))
+ KernelScope.initialize(getContext());
return false;
}
@@ -2571,20 +3165,28 @@ bool AMDGPUAsmParser::ParseDirectivePALMetadata() {
bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
StringRef IDVal = DirectiveID.getString();
- if (IDVal == ".hsa_code_object_version")
- return ParseDirectiveHSACodeObjectVersion();
+ if (AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) {
+ if (IDVal == ".amdgcn_target")
+ return ParseDirectiveAMDGCNTarget();
+
+ if (IDVal == ".amdhsa_kernel")
+ return ParseDirectiveAMDHSAKernel();
+ } else {
+ if (IDVal == ".hsa_code_object_version")
+ return ParseDirectiveHSACodeObjectVersion();
- if (IDVal == ".hsa_code_object_isa")
- return ParseDirectiveHSACodeObjectISA();
+ if (IDVal == ".hsa_code_object_isa")
+ return ParseDirectiveHSACodeObjectISA();
- if (IDVal == ".amd_kernel_code_t")
- return ParseDirectiveAMDKernelCodeT();
+ if (IDVal == ".amd_kernel_code_t")
+ return ParseDirectiveAMDKernelCodeT();
- if (IDVal == ".amdgpu_hsa_kernel")
- return ParseDirectiveAMDGPUHsaKernel();
+ if (IDVal == ".amdgpu_hsa_kernel")
+ return ParseDirectiveAMDGPUHsaKernel();
- if (IDVal == ".amd_amdgpu_isa")
- return ParseDirectiveISAVersion();
+ if (IDVal == ".amd_amdgpu_isa")
+ return ParseDirectiveISAVersion();
+ }
if (IDVal == AMDGPU::HSAMD::AssemblerDirectiveBegin)
return ParseDirectiveHSAMetadata();
@@ -2612,6 +3214,10 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
case AMDGPU::TMA_LO:
case AMDGPU::TMA_HI:
return !isGFX9();
+ case AMDGPU::XNACK_MASK:
+ case AMDGPU::XNACK_MASK_LO:
+ case AMDGPU::XNACK_MASK_HI:
+ return !isCI() && !isSI() && hasXNACK();
default:
break;
}
@@ -3158,7 +3764,10 @@ bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset,
HwReg.IsSymbolic = true;
HwReg.Id = ID_UNKNOWN_;
const StringRef tok = Parser.getTok().getString();
- for (int i = ID_SYMBOLIC_FIRST_; i < ID_SYMBOLIC_LAST_; ++i) {
+ int Last = ID_SYMBOLIC_LAST_;
+ if (isSI() || isCI() || isVI())
+ Last = ID_SYMBOLIC_FIRST_GFX9_;
+ for (int i = ID_SYMBOLIC_FIRST_; i < Last; ++i) {
if (tok == IdSymbolic[i]) {
HwReg.Id = i;
break;
@@ -3859,7 +4468,7 @@ AMDGPUAsmParser::parseSwizzleOp(OperandVector &Operands) {
} else {
// Swizzle "offset" operand is optional.
// If it is omitted, try parsing other optional operands.
- return parseOptionalOperand(Operands);
+ return parseOptionalOpr(Operands);
}
}
@@ -3907,13 +4516,13 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSLC() const {
return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTySLC);
}
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultTFE() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyTFE);
-}
-
void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
const OperandVector &Operands,
- bool IsAtomic, bool IsAtomicReturn) {
+ bool IsAtomic,
+ bool IsAtomicReturn,
+ bool IsLds) {
+ bool IsLdsOpcode = IsLds;
+ bool HasLdsModifier = false;
OptionalImmIndexMap OptionalIdx;
assert(IsAtomicReturn ? IsAtomic : true);
@@ -3932,6 +4541,8 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
continue;
}
+ HasLdsModifier = Op.isLDS();
+
// Handle tokens like 'offen' which are sometimes hard-coded into the
// asm string. There are no MCInst operands for these.
if (Op.isToken()) {
@@ -3943,6 +4554,21 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
OptionalIdx[Op.getImmTy()] = i;
}
+ // This is a workaround for an llvm quirk which may result in an
+ // incorrect instruction selection. The lds and non-lds versions of
+ // MUBUF instructions are identical except that the lds versions
+ // have a mandatory 'lds' modifier. However, this modifier follows
+ // the optional modifiers, so the llvm asm matcher regards 'lds'
+ // as an optional one too. As a result, the lds version of an
+ // opcode may be selected even if it has no 'lds' modifier.
+ if (IsLdsOpcode && !HasLdsModifier) {
+ int NoLdsOpcode = AMDGPU::getMUBUFNoLdsInst(Inst.getOpcode());
+ if (NoLdsOpcode != -1) { // Got lds version - correct it.
+ Inst.setOpcode(NoLdsOpcode);
+ IsLdsOpcode = false;
+ }
+ }
+
// Copy $vdata_in operand and insert as $vdata for MUBUF_Atomic RTN insns.
if (IsAtomicReturn) {
MCInst::iterator I = Inst.begin(); // $vdata_in is always at the beginning.
@@ -3954,7 +4580,10 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
}
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
+
+ if (!IsLdsOpcode) { // tfe is not legal with lds opcodes
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
+ }
}
void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) {
@@ -4009,7 +4638,8 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands,
if (IsAtomic) {
// Add src, same as dst
- ((AMDGPUOperand &)*Operands[I]).addRegOperands(Inst, 1);
+ assert(Desc.getNumDefs() == 1);
+ ((AMDGPUOperand &)*Operands[I - 1]).addRegOperands(Inst, 1);
}
OptionalImmIndexMap OptionalIdx;
@@ -4018,9 +4648,8 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands,
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
// Add the register arguments
- if (Op.isRegOrImm()) {
- Op.addRegOrImmOperands(Inst, 1);
- continue;
+ if (Op.isReg()) {
+ Op.addRegOperands(Inst, 1);
} else if (Op.isImmModifier()) {
OptionalIdx[Op.getImmTy()] = I;
} else {
@@ -4031,37 +4660,18 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands,
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDMask);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyD16);
}
void AMDGPUAsmParser::cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands) {
cvtMIMG(Inst, Operands, true);
}
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDMask() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDMask);
-}
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultUNorm() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyUNorm);
-}
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDA() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDA);
-}
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultR128() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyR128);
-}
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultLWE() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyLWE);
-}
-
//===----------------------------------------------------------------------===//
// smrd
//===----------------------------------------------------------------------===//
@@ -4148,6 +4758,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"offset0", AMDGPUOperand::ImmTyOffset0, false, nullptr},
{"offset1", AMDGPUOperand::ImmTyOffset1, false, nullptr},
{"gds", AMDGPUOperand::ImmTyGDS, true, nullptr},
+ {"lds", AMDGPUOperand::ImmTyLDS, true, nullptr},
{"offset", AMDGPUOperand::ImmTyOffset, false, nullptr},
{"inst_offset", AMDGPUOperand::ImmTyInstOffset, false, nullptr},
{"dfmt", AMDGPUOperand::ImmTyDFMT, false, nullptr},
@@ -4155,6 +4766,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"glc", AMDGPUOperand::ImmTyGLC, true, nullptr},
{"slc", AMDGPUOperand::ImmTySLC, true, nullptr},
{"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr},
+ {"d16", AMDGPUOperand::ImmTyD16, true, nullptr},
{"high", AMDGPUOperand::ImmTyHigh, true, nullptr},
{"clamp", AMDGPUOperand::ImmTyClampSI, true, nullptr},
{"omod", AMDGPUOperand::ImmTyOModSI, false, ConvertOmodMul},
@@ -4162,6 +4774,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"da", AMDGPUOperand::ImmTyDA, true, nullptr},
{"r128", AMDGPUOperand::ImmTyR128, true, nullptr},
{"lwe", AMDGPUOperand::ImmTyLWE, true, nullptr},
+ {"d16", AMDGPUOperand::ImmTyD16, true, nullptr},
{"dmask", AMDGPUOperand::ImmTyDMask, false, nullptr},
{"row_mask", AMDGPUOperand::ImmTyDppRowMask, false, nullptr},
{"bank_mask", AMDGPUOperand::ImmTyDppBankMask, false, nullptr},
@@ -4179,6 +4792,39 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
};
OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) {
+ unsigned size = Operands.size();
+ assert(size > 0);
+
+ OperandMatchResultTy res = parseOptionalOpr(Operands);
+
+ // This is a hack to allow hardcoded mandatory operands that follow
+ // optional operands.
+ //
+ // The current design assumes that all operands after the first optional
+ // operand are also optional. However, the implementation of some
+ // instructions violates this rule (see e.g. flat/global atomics, which
+ // have hardcoded 'glc' operands).
+ //
+ // To alleviate this problem, we have to (implicitly) parse extra operands
+ // to make sure the autogenerated parser of custom operands never hits a
+ // hardcoded mandatory operand.
+
+ if (size == 1 || ((AMDGPUOperand &)*Operands[size - 1]).isRegKind()) {
+
+ // We have parsed the first optional operand.
+ // Parse as many operands as necessary to skip all mandatory operands.
+
+ for (unsigned i = 0; i < MAX_OPR_LOOKAHEAD; ++i) {
+ if (res != MatchOperand_Success ||
+ getLexer().is(AsmToken::EndOfStatement)) break;
+ if (getLexer().is(AsmToken::Comma)) Parser.Lex();
+ res = parseOptionalOpr(Operands);
+ }
+ }
+
+ return res;
+}
+
+OperandMatchResultTy AMDGPUAsmParser::parseOptionalOpr(OperandVector &Operands) {
OperandMatchResultTy res;
for (const OptionalOperand &Op : AMDGPUOptionalOperandTable) {
// try to parse any optional operand here
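A concrete illustration of the problem the lookahead above works around (a hypothetical example, roughly GFX9 syntax, not taken from the patch): in 'global_atomic_add v1, v[2:3], v4, off offset:16 glc' the optional 'offset:' modifier is followed by the hardcoded 'glc' of the returning atomic, so once the first optional operand has been parsed the loop keeps consuming up to MAX_OPR_LOOKAHEAD further operands instead of leaving 'glc' for the autogenerated matcher to reject.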
@@ -4341,12 +4987,14 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI);
}
- // special case v_mac_{f16, f32}:
+ // Special case v_mac_{f16, f32} and v_fmac_f32 (gfx906):
// it has src2 register operand that is tied to dst operand
// we don't allow modifiers for this operand in assembler so src2_modifiers
- // should be 0
- if (Opc == AMDGPU::V_MAC_F32_e64_si || Opc == AMDGPU::V_MAC_F32_e64_vi ||
- Opc == AMDGPU::V_MAC_F16_e64_vi) {
+ // should be 0.
+ if (Opc == AMDGPU::V_MAC_F32_e64_si ||
+ Opc == AMDGPU::V_MAC_F32_e64_vi ||
+ Opc == AMDGPU::V_MAC_F16_e64_vi ||
+ Opc == AMDGPU::V_FMAC_F32_e64_vi) {
auto it = Inst.begin();
std::advance(it, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers));
it = Inst.insert(it, MCOperand::createImm(0)); // no modifiers for src2
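In other words, for these opcodes the assembler never parses a src2_modifiers operand (src2 is tied to the destination and modifiers are not accepted), so a literal 0 is spliced into the MCInst at the src2_modifiers position to keep the operand list aligned with the MCInstrDesc.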
@@ -4448,21 +5096,23 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst,
//===----------------------------------------------------------------------===//
bool AMDGPUOperand::isDPPCtrl() const {
+ using namespace AMDGPU::DPP;
+
bool result = isImm() && getImmTy() == ImmTyDppCtrl && isUInt<9>(getImm());
if (result) {
int64_t Imm = getImm();
- return ((Imm >= 0x000) && (Imm <= 0x0ff)) ||
- ((Imm >= 0x101) && (Imm <= 0x10f)) ||
- ((Imm >= 0x111) && (Imm <= 0x11f)) ||
- ((Imm >= 0x121) && (Imm <= 0x12f)) ||
- (Imm == 0x130) ||
- (Imm == 0x134) ||
- (Imm == 0x138) ||
- (Imm == 0x13c) ||
- (Imm == 0x140) ||
- (Imm == 0x141) ||
- (Imm == 0x142) ||
- (Imm == 0x143);
+ return (Imm >= DppCtrl::QUAD_PERM_FIRST && Imm <= DppCtrl::QUAD_PERM_LAST) ||
+ (Imm >= DppCtrl::ROW_SHL_FIRST && Imm <= DppCtrl::ROW_SHL_LAST) ||
+ (Imm >= DppCtrl::ROW_SHR_FIRST && Imm <= DppCtrl::ROW_SHR_LAST) ||
+ (Imm >= DppCtrl::ROW_ROR_FIRST && Imm <= DppCtrl::ROW_ROR_LAST) ||
+ (Imm == DppCtrl::WAVE_SHL1) ||
+ (Imm == DppCtrl::WAVE_ROL1) ||
+ (Imm == DppCtrl::WAVE_SHR1) ||
+ (Imm == DppCtrl::WAVE_ROR1) ||
+ (Imm == DppCtrl::ROW_MIRROR) ||
+ (Imm == DppCtrl::ROW_HALF_MIRROR) ||
+ (Imm == DppCtrl::BCAST15) ||
+ (Imm == DppCtrl::BCAST31);
}
return false;
}
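For reference, a few worked encodings implied by the named constants (the values match the magic numbers removed above; illustration only):

    quad_perm:[0,1,2,3] -> four 2-bit selects in bits [7:0] = 0x0e4
    row_shl:3           -> DppCtrl::ROW_SHL0 | 3           = 0x103
    wave_ror:1          -> DppCtrl::WAVE_ROR1              = 0x13c
    row_bcast:31        -> DppCtrl::BCAST31                = 0x143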
@@ -4481,6 +5131,8 @@ bool AMDGPUOperand::isU16Imm() const {
OperandMatchResultTy
AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
+ using namespace AMDGPU::DPP;
+
SMLoc S = Parser.getTok().getLoc();
StringRef Prefix;
int64_t Int;
@@ -4492,10 +5144,10 @@ AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
}
if (Prefix == "row_mirror") {
- Int = 0x140;
+ Int = DppCtrl::ROW_MIRROR;
Parser.Lex();
} else if (Prefix == "row_half_mirror") {
- Int = 0x141;
+ Int = DppCtrl::ROW_HALF_MIRROR;
Parser.Lex();
} else {
// Check to prevent parseDPPCtrlOps from eating invalid tokens
@@ -4547,24 +5199,24 @@ AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
return MatchOperand_ParseFail;
if (Prefix == "row_shl" && 1 <= Int && Int <= 15) {
- Int |= 0x100;
+ Int |= DppCtrl::ROW_SHL0;
} else if (Prefix == "row_shr" && 1 <= Int && Int <= 15) {
- Int |= 0x110;
+ Int |= DppCtrl::ROW_SHR0;
} else if (Prefix == "row_ror" && 1 <= Int && Int <= 15) {
- Int |= 0x120;
+ Int |= DppCtrl::ROW_ROR0;
} else if (Prefix == "wave_shl" && 1 == Int) {
- Int = 0x130;
+ Int = DppCtrl::WAVE_SHL1;
} else if (Prefix == "wave_rol" && 1 == Int) {
- Int = 0x134;
+ Int = DppCtrl::WAVE_ROL1;
} else if (Prefix == "wave_shr" && 1 == Int) {
- Int = 0x138;
+ Int = DppCtrl::WAVE_SHR1;
} else if (Prefix == "wave_ror" && 1 == Int) {
- Int = 0x13C;
+ Int = DppCtrl::WAVE_ROR1;
} else if (Prefix == "row_bcast") {
if (Int == 15) {
- Int = 0x142;
+ Int = DppCtrl::BCAST15;
} else if (Int == 31) {
- Int = 0x143;
+ Int = DppCtrl::BCAST31;
} else {
return MatchOperand_ParseFail;
}
@@ -4742,7 +5394,7 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
}
}
if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
- Op.addRegWithInputModsOperands(Inst, 2);
+ Op.addRegOrImmWithInputModsOperands(Inst, 2);
} else if (Op.isImm()) {
// Handle optional arguments
OptionalIdx[Op.getImmTy()] = I;
@@ -4824,6 +5476,8 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op,
return Operand.isAddr64() ? Match_Success : Match_InvalidOperand;
case MCK_gds:
return Operand.isGDS() ? Match_Success : Match_InvalidOperand;
+ case MCK_lds:
+ return Operand.isLDS() ? Match_Success : Match_InvalidOperand;
case MCK_glc:
return Operand.isGLC() ? Match_Success : Match_InvalidOperand;
case MCK_idxen:
diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td
index 2230457b3a9b..b87c47a6b9ee 100644
--- a/lib/Target/AMDGPU/BUFInstructions.td
+++ b/lib/Target/AMDGPU/BUFInstructions.td
@@ -52,14 +52,19 @@ class getAddrName<int addrKind> {
"")))));
}
-class MUBUFAddr64Table <bit is_addr64, string suffix = ""> {
+class MUBUFAddr64Table <bit is_addr64, string Name> {
bit IsAddr64 = is_addr64;
- string OpName = NAME # suffix;
+ string OpName = Name;
}
-class MTBUFAddr64Table <bit is_addr64, string suffix = ""> {
+class MUBUFLdsTable <bit is_lds, string Name> {
+ bit IsLds = is_lds;
+ string OpName = Name;
+}
+
+class MTBUFAddr64Table <bit is_addr64, string Name> {
bit IsAddr64 = is_addr64;
- string OpName = NAME # suffix;
+ string OpName = Name;
}
//===----------------------------------------------------------------------===//
@@ -137,17 +142,17 @@ class getMTBUFInsDA<list<RegisterClass> vdataList,
RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
dag InsNoData = !if(!empty(vaddrList),
(ins SReg_128:$srsrc, SCSrc_b32:$soffset,
- offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe),
+ offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, SLC:$slc, TFE:$tfe),
(ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset,
- offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe)
+ offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, SLC:$slc, TFE:$tfe)
);
dag InsData = !if(!empty(vaddrList),
(ins vdataClass:$vdata, SReg_128:$srsrc,
SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc,
- slc:$slc, tfe:$tfe),
+ SLC:$slc, TFE:$tfe),
(ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc,
- slc:$slc, tfe:$tfe)
+ SLC:$slc, TFE:$tfe)
);
dag ret = !if(!empty(vdataList), InsNoData, InsData);
}
@@ -214,13 +219,13 @@ multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
[(set load_vt:$vdata,
(ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$dfmt,
i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>,
- MTBUFAddr64Table<0>;
+ MTBUFAddr64Table<0, NAME>;
def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
[(set load_vt:$vdata,
(ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset,
i8:$dfmt, i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>,
- MTBUFAddr64Table<1>;
+ MTBUFAddr64Table<1, NAME>;
def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
def _IDXEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
@@ -260,13 +265,13 @@ multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
[(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc,
i1:$slc, i1:$tfe))]>,
- MTBUFAddr64Table<0>;
+ MTBUFAddr64Table<0, NAME>;
def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
[(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc,
i1:$slc, i1:$tfe))]>,
- MTBUFAddr64Table<1>;
+ MTBUFAddr64Table<1, NAME>;
def _OFFEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
def _IDXEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
@@ -310,6 +315,7 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins,
bits<1> offen = 0;
bits<1> idxen = 0;
bits<1> addr64 = 0;
+ bits<1> lds = 0;
bits<1> has_vdata = 1;
bits<1> has_vaddr = 1;
bits<1> has_glc = 1;
@@ -336,7 +342,6 @@ class MUBUF_Real <bits<7> op, MUBUF_Pseudo ps> :
bits<12> offset;
bits<1> glc;
- bits<1> lds = 0;
bits<8> vaddr;
bits<8> vdata;
bits<7> srsrc;
@@ -371,31 +376,35 @@ class MUBUF_Invalidate <string opName, SDPatternOperator node> :
}
class getMUBUFInsDA<list<RegisterClass> vdataList,
- list<RegisterClass> vaddrList=[]> {
+ list<RegisterClass> vaddrList=[],
+ bit isLds = 0> {
RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList));
RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
dag InsNoData = !if(!empty(vaddrList),
(ins SReg_128:$srsrc, SCSrc_b32:$soffset,
- offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe),
+ offset:$offset, GLC:$glc, SLC:$slc),
(ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset,
- offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe)
+ offset:$offset, GLC:$glc, SLC:$slc)
);
dag InsData = !if(!empty(vaddrList),
(ins vdataClass:$vdata, SReg_128:$srsrc,
- SCSrc_b32:$soffset, offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe),
+ SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc),
(ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
- SCSrc_b32:$soffset, offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe)
+ SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc)
);
- dag ret = !if(!empty(vdataList), InsNoData, InsData);
+ dag ret = !con(
+ !if(!empty(vdataList), InsNoData, InsData),
+ !if(isLds, (ins), (ins TFE:$tfe))
+ );
}
-class getMUBUFIns<int addrKind, list<RegisterClass> vdataList=[]> {
+class getMUBUFIns<int addrKind, list<RegisterClass> vdataList=[], bit isLds = 0> {
dag ret =
- !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList>.ret,
- !if(!eq(addrKind, BUFAddrKind.OffEn), getMUBUFInsDA<vdataList, [VGPR_32]>.ret,
- !if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFInsDA<vdataList, [VGPR_32]>.ret,
- !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VReg_64]>.ret,
- !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64]>.ret,
+ !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList, [], isLds>.ret,
+ !if(!eq(addrKind, BUFAddrKind.OffEn), getMUBUFInsDA<vdataList, [VGPR_32], isLds>.ret,
+ !if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFInsDA<vdataList, [VGPR_32], isLds>.ret,
+ !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VReg_64], isLds>.ret,
+ !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64], isLds>.ret,
(ins))))));
}
@@ -426,20 +435,29 @@ class MUBUF_Load_Pseudo <string opName,
int addrKind,
RegisterClass vdataClass,
bit HasTiedDest = 0,
+ bit isLds = 0,
list<dag> pattern=[],
// Workaround bug bz30254
int addrKindCopy = addrKind>
: MUBUF_Pseudo<opName,
(outs vdataClass:$vdata),
- !con(getMUBUFIns<addrKindCopy>.ret, !if(HasTiedDest, (ins vdataClass:$vdata_in), (ins))),
- " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe",
+ !con(getMUBUFIns<addrKindCopy, [], isLds>.ret,
+ !if(HasTiedDest, (ins vdataClass:$vdata_in), (ins))),
+ " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc" #
+ !if(isLds, " lds", "$tfe"),
pattern>,
MUBUF_SetupAddr<addrKindCopy> {
- let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
+ let PseudoInstr = opName # !if(isLds, "_lds", "") #
+ "_" # getAddrName<addrKindCopy>.ret;
+ let AsmMatchConverter = !if(isLds, "cvtMubufLds", "cvtMubuf");
+
let Constraints = !if(HasTiedDest, "$vdata = $vdata_in", "");
let mayLoad = 1;
let mayStore = 0;
let maybeAtomic = 1;
+ let Uses = !if(isLds, [EXEC, M0], [EXEC]);
+ let has_tfe = !if(isLds, 0, 1);
+ let lds = isLds;
}
// FIXME: tfe can't be an operand because it requires a separate
@@ -447,32 +465,45 @@ class MUBUF_Load_Pseudo <string opName,
multiclass MUBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
ValueType load_vt = i32,
SDPatternOperator ld = null_frag,
- bit TiedDest = 0> {
+ bit TiedDest = 0,
+ bit isLds = 0> {
def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
- TiedDest,
- [(set load_vt:$vdata,
- (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))]>,
- MUBUFAddr64Table<0>;
+ TiedDest, isLds,
+ !if(isLds,
+ [],
+ [(set load_vt:$vdata,
+ (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))])>,
+ MUBUFAddr64Table<0, NAME # !if(isLds, "_LDS", "")>;
def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
- TiedDest,
- [(set load_vt:$vdata,
- (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))]>,
- MUBUFAddr64Table<1>;
+ TiedDest, isLds,
+ !if(isLds,
+ [],
+ [(set load_vt:$vdata,
+ (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))])>,
+ MUBUFAddr64Table<1, NAME # !if(isLds, "_LDS", "")>;
- def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest>;
- def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest>;
- def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest>;
+ def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest, isLds>;
+ def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest, isLds>;
+ def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest, isLds>;
let DisableWQM = 1 in {
- def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, TiedDest>;
- def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest>;
- def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest>;
- def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest>;
+ def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, TiedDest, isLds>;
+ def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest, isLds>;
+ def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest, isLds>;
+ def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest, isLds>;
}
}
+multiclass MUBUF_Pseudo_Loads_Lds<string opName, RegisterClass vdataClass,
+ ValueType load_vt = i32,
+ SDPatternOperator ld_nolds = null_frag,
+ SDPatternOperator ld_lds = null_frag> {
+ defm NAME : MUBUF_Pseudo_Loads<opName, vdataClass, load_vt, ld_nolds>;
+ defm _LDS : MUBUF_Pseudo_Loads<opName, vdataClass, load_vt, ld_lds, 0, 1>;
+}
+
class MUBUF_Store_Pseudo <string opName,
int addrKind,
RegisterClass vdataClass,
@@ -499,12 +530,12 @@ multiclass MUBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
[(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>,
- MUBUFAddr64Table<0>;
+ MUBUFAddr64Table<0, NAME>;
def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
[(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>,
- MUBUFAddr64Table<1>;
+ MUBUFAddr64Table<1, NAME>;
def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
def _IDXEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
@@ -518,6 +549,23 @@ multiclass MUBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
}
}
+class MUBUF_Pseudo_Store_Lds<string opName>
+ : MUBUF_Pseudo<opName,
+ (outs),
+ (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc),
+ " $srsrc, $soffset$offset lds$glc$slc"> {
+ let mayLoad = 0;
+ let mayStore = 1;
+ let maybeAtomic = 1;
+
+ let has_vdata = 0;
+ let has_vaddr = 0;
+ let has_tfe = 0;
+ let lds = 1;
+
+ let Uses = [EXEC, M0];
+ let AsmMatchConverter = "cvtMubufLds";
+}
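Given the asm string above, the expected assembler syntax is along the lines of 'buffer_store_lds_dword s[8:11], s2 offset:4 lds' (with optional glc/slc following), i.e. no vdata or vaddr operands, since the data is sourced from LDS and addressed through M0.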
class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in,
list<RegisterClass> vaddrList=[]> {
@@ -525,15 +573,15 @@ class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in,
dag ret = !if(vdata_in,
!if(!empty(vaddrList),
(ins vdataClass:$vdata_in,
- SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc),
+ SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, SLC:$slc),
(ins vdataClass:$vdata_in, vaddrClass:$vaddr,
- SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc)
+ SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, SLC:$slc)
),
!if(!empty(vaddrList),
(ins vdataClass:$vdata,
- SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc),
+ SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, SLC:$slc),
(ins vdataClass:$vdata, vaddrClass:$vaddr,
- SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc)
+ SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, SLC:$slc)
));
}
@@ -618,9 +666,9 @@ multiclass MUBUF_Pseudo_Atomics <string opName,
SDPatternOperator atomic> {
def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass>,
- MUBUFAddr64Table <0>;
+ MUBUFAddr64Table <0, NAME>;
def _ADDR64 : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass>,
- MUBUFAddr64Table <1>;
+ MUBUFAddr64Table <1, NAME>;
def _OFFEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
def _IDXEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
def _BOTHEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
@@ -629,13 +677,13 @@ multiclass MUBUF_Pseudo_Atomics <string opName,
[(set vdataType:$vdata,
(atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$slc),
vdataType:$vdata_in))]>,
- MUBUFAddr64Table <0, "_RTN">;
+ MUBUFAddr64Table <0, NAME # "_RTN">;
def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
[(set vdataType:$vdata,
(atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc),
vdataType:$vdata_in))]>,
- MUBUFAddr64Table <1, "_RTN">;
+ MUBUFAddr64Table <1, NAME # "_RTN">;
def _OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
def _IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
@@ -647,7 +695,7 @@ multiclass MUBUF_Pseudo_Atomics <string opName,
// MUBUF Instructions
//===----------------------------------------------------------------------===//
-defm BUFFER_LOAD_FORMAT_X : MUBUF_Pseudo_Loads <
+defm BUFFER_LOAD_FORMAT_X : MUBUF_Pseudo_Loads_Lds <
"buffer_load_format_x", VGPR_32
>;
defm BUFFER_LOAD_FORMAT_XY : MUBUF_Pseudo_Loads <
@@ -671,19 +719,74 @@ defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Pseudo_Stores <
defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Pseudo_Stores <
"buffer_store_format_xyzw", VReg_128
>;
-defm BUFFER_LOAD_UBYTE : MUBUF_Pseudo_Loads <
+
+let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in {
+ defm BUFFER_LOAD_FORMAT_D16_X_gfx80 : MUBUF_Pseudo_Loads <
+ "buffer_load_format_d16_x", VGPR_32
+ >;
+ defm BUFFER_LOAD_FORMAT_D16_XY_gfx80 : MUBUF_Pseudo_Loads <
+ "buffer_load_format_d16_xy", VReg_64
+ >;
+ defm BUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MUBUF_Pseudo_Loads <
+ "buffer_load_format_d16_xyz", VReg_96
+ >;
+ defm BUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MUBUF_Pseudo_Loads <
+ "buffer_load_format_d16_xyzw", VReg_128
+ >;
+ defm BUFFER_STORE_FORMAT_D16_X_gfx80 : MUBUF_Pseudo_Stores <
+ "buffer_store_format_d16_x", VGPR_32
+ >;
+ defm BUFFER_STORE_FORMAT_D16_XY_gfx80 : MUBUF_Pseudo_Stores <
+ "buffer_store_format_d16_xy", VReg_64
+ >;
+ defm BUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MUBUF_Pseudo_Stores <
+ "buffer_store_format_d16_xyz", VReg_96
+ >;
+ defm BUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MUBUF_Pseudo_Stores <
+ "buffer_store_format_d16_xyzw", VReg_128
+ >;
+} // End HasUnpackedD16VMem.
+
+let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in {
+ defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Pseudo_Loads <
+ "buffer_load_format_d16_x", VGPR_32
+ >;
+ defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Pseudo_Loads <
+ "buffer_load_format_d16_xy", VGPR_32
+ >;
+ defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Pseudo_Loads <
+ "buffer_load_format_d16_xyz", VReg_64
+ >;
+ defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Pseudo_Loads <
+ "buffer_load_format_d16_xyzw", VReg_64
+ >;
+ defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Pseudo_Stores <
+ "buffer_store_format_d16_x", VGPR_32
+ >;
+ defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Pseudo_Stores <
+ "buffer_store_format_d16_xy", VGPR_32
+ >;
+ defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Pseudo_Stores <
+ "buffer_store_format_d16_xyz", VReg_64
+ >;
+ defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Pseudo_Stores <
+ "buffer_store_format_d16_xyzw", VReg_64
+ >;
+} // End HasPackedD16VMem.
+
+defm BUFFER_LOAD_UBYTE : MUBUF_Pseudo_Loads_Lds <
"buffer_load_ubyte", VGPR_32, i32, mubuf_az_extloadi8
>;
-defm BUFFER_LOAD_SBYTE : MUBUF_Pseudo_Loads <
+defm BUFFER_LOAD_SBYTE : MUBUF_Pseudo_Loads_Lds <
"buffer_load_sbyte", VGPR_32, i32, mubuf_sextloadi8
>;
-defm BUFFER_LOAD_USHORT : MUBUF_Pseudo_Loads <
+defm BUFFER_LOAD_USHORT : MUBUF_Pseudo_Loads_Lds <
"buffer_load_ushort", VGPR_32, i32, mubuf_az_extloadi16
>;
-defm BUFFER_LOAD_SSHORT : MUBUF_Pseudo_Loads <
+defm BUFFER_LOAD_SSHORT : MUBUF_Pseudo_Loads_Lds <
"buffer_load_sshort", VGPR_32, i32, mubuf_sextloadi16
>;
-defm BUFFER_LOAD_DWORD : MUBUF_Pseudo_Loads <
+defm BUFFER_LOAD_DWORD : MUBUF_Pseudo_Loads_Lds <
"buffer_load_dword", VGPR_32, i32, mubuf_load
>;
defm BUFFER_LOAD_DWORDX2 : MUBUF_Pseudo_Loads <
@@ -695,6 +798,22 @@ defm BUFFER_LOAD_DWORDX3 : MUBUF_Pseudo_Loads <
defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads <
"buffer_load_dwordx4", VReg_128, v4i32, mubuf_load
>;
+
+// This is not described in AMD documentation,
+// but 'lds' versions of these opcodes are available
+// in at least GFX8+ chips. See Bug 37653.
+let SubtargetPredicate = isVI in {
+defm BUFFER_LOAD_DWORDX2_LDS : MUBUF_Pseudo_Loads <
+ "buffer_load_dwordx2", VReg_64, v2i32, null_frag, 0, 1
+>;
+defm BUFFER_LOAD_DWORDX3_LDS : MUBUF_Pseudo_Loads <
+ "buffer_load_dwordx3", VReg_96, untyped, null_frag, 0, 1
+>;
+defm BUFFER_LOAD_DWORDX4_LDS : MUBUF_Pseudo_Loads <
+ "buffer_load_dwordx4", VReg_128, v4i32, null_frag, 0, 1
+>;
+}
+
defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores <
"buffer_store_byte", VGPR_32, i32, truncstorei8_global
>;
@@ -792,6 +911,10 @@ defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics <
"buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global
>;
+let SubtargetPredicate = isVI in {
+def BUFFER_STORE_LDS_DWORD : MUBUF_Pseudo_Store_Lds <"buffer_store_lds_dword">;
+}
+
let SubtargetPredicate = isSI in { // isn't on CI & VI
/*
defm BUFFER_ATOMIC_RSUB : MUBUF_Pseudo_Atomics <"buffer_atomic_rsub">;
@@ -842,6 +965,13 @@ defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Pseudo_Stores <
"buffer_store_short_d16_hi", VGPR_32, i32
>;
+defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Pseudo_Loads <
+ "buffer_load_format_d16_hi_x", VGPR_32
+>;
+defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores <
+ "buffer_store_format_d16_hi_x", VGPR_32
+>;
+
} // End HasD16LoadStore
def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1",
@@ -860,6 +990,28 @@ defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy",
defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_128>;
defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>;
+let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in {
+ defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>;
+ defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VReg_64>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_96>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_128>;
+ defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>;
+ defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VReg_64>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_96>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_128>;
+} // End HasUnpackedD16VMem.
+
+let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in {
+ defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>;
+ defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VGPR_32>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_64>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_64>;
+ defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>;
+ defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VGPR_32>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_64>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_64>;
+} // End HasPackedD16VMem.
+
let SubtargetPredicate = isCIVI in {
//===----------------------------------------------------------------------===//
@@ -922,6 +1074,19 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, f32, "BUFFER_LOAD_FORMAT_X">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_XY">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
+
+let SubtargetPredicate = HasUnpackedD16VMem in {
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">;
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i32, "BUFFER_LOAD_FORMAT_D16_XY_gfx80">;
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4i32, "BUFFER_LOAD_FORMAT_D16_XYZW_gfx80">;
+} // End HasUnpackedD16VMem.
+
+let SubtargetPredicate = HasPackedD16VMem in {
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X">;
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2f16, "BUFFER_LOAD_FORMAT_D16_XY">;
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4f16, "BUFFER_LOAD_FORMAT_D16_XYZW">;
+} // End HasPackedD16VMem.
+
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
@@ -969,6 +1134,19 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, f32, "BUFFER_STORE_FORMAT_X">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
+
+let SubtargetPredicate = HasUnpackedD16VMem in {
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X_gfx80">;
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2i32, "BUFFER_STORE_FORMAT_D16_XY_gfx80">;
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4i32, "BUFFER_STORE_FORMAT_D16_XYZW_gfx80">;
+} // End HasUnpackedD16VMem.
+
+let SubtargetPredicate = HasPackedD16VMem in {
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X">;
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2f16, "BUFFER_STORE_FORMAT_D16_XY">;
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4f16, "BUFFER_STORE_FORMAT_D16_XYZW">;
+} // End HasPackedD16VMem.
+
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
@@ -1210,7 +1388,7 @@ defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, i
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSET, v2i32, load_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_OFFSET, v4i32, load_private>;
-let OtherPredicates = [HasD16LoadStore] in {
+let OtherPredicates = [D16PreservesUnusedBits] in {
defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, i16, load_private>;
defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, i16, az_extloadi8_private>;
defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, i16, sextloadi8_private>;
@@ -1325,7 +1503,7 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, BUFFER_STORE_DWORDX2_OF
defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OFFSET, v4i32, store_private>;
-let OtherPredicates = [HasD16LoadStore] in {
+let OtherPredicates = [D16PreservesUnusedBits] in {
// Hiding the extract high pattern in the PatFrag seems to not
// automatically increase the complexity.
let AddedComplexity = 1 in {
@@ -1382,6 +1560,18 @@ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, f32, "TBUFFER_LOAD_FORMAT_X">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2f32, "TBUFFER_LOAD_FORMAT_XY">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW">;
+let SubtargetPredicate = HasUnpackedD16VMem in {
+ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">;
+ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2i32, "TBUFFER_LOAD_FORMAT_D16_XY_gfx80">;
+ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4i32, "TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80">;
+} // End HasUnpackedD16VMem.
+
+let SubtargetPredicate = HasPackedD16VMem in {
+ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X">;
+ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2f16, "TBUFFER_LOAD_FORMAT_D16_XY">;
+ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4f16, "TBUFFER_LOAD_FORMAT_D16_XYZW">;
+} // End HasPackedD16VMem.
+
multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode> {
def : GCNPat<
@@ -1431,6 +1621,18 @@ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2f32, "TBUFFER_STORE_FORMAT_XY"
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4f32, "TBUFFER_STORE_FORMAT_XYZ">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4f32, "TBUFFER_STORE_FORMAT_XYZW">;
+let SubtargetPredicate = HasUnpackedD16VMem in {
+ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X_gfx80">;
+ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2i32, "TBUFFER_STORE_FORMAT_D16_XY_gfx80">;
+ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v4i32, "TBUFFER_STORE_FORMAT_D16_XYZW_gfx80">;
+} // End HasUnpackedD16VMem.
+
+let SubtargetPredicate = HasPackedD16VMem in {
+ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X">;
+ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2f16, "TBUFFER_STORE_FORMAT_D16_XY">;
+ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v4f16, "TBUFFER_STORE_FORMAT_D16_XYZW">;
+} // End HasPackedD16VMem.
+
//===----------------------------------------------------------------------===//
// Target instructions, move to the appropriate target TD file
//===----------------------------------------------------------------------===//
@@ -1451,7 +1653,7 @@ class MUBUF_Real_si <bits<7> op, MUBUF_Pseudo ps> :
let Inst{13} = ps.idxen;
let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
let Inst{15} = ps.addr64;
- let Inst{16} = lds;
+ let Inst{16} = !if(ps.lds, 1, 0);
let Inst{24-18} = op;
let Inst{31-26} = 0x38; //encoding
let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
@@ -1470,6 +1672,31 @@ multiclass MUBUF_Real_AllAddr_si<bits<7> op> {
def _BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
}
+multiclass MUBUF_Real_AllAddr_Lds_si<bits<7> op> {
+
+ def _OFFSET_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>,
+ MUBUFLdsTable<0, NAME # "_OFFSET_si">;
+ def _ADDR64_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64")>,
+ MUBUFLdsTable<0, NAME # "_ADDR64_si">;
+ def _OFFEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>,
+ MUBUFLdsTable<0, NAME # "_OFFEN_si">;
+ def _IDXEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>,
+ MUBUFLdsTable<0, NAME # "_IDXEN_si">;
+ def _BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>,
+ MUBUFLdsTable<0, NAME # "_BOTHEN_si">;
+
+ def _LDS_OFFSET_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>,
+ MUBUFLdsTable<1, NAME # "_OFFSET_si">;
+ def _LDS_ADDR64_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_ADDR64")>,
+ MUBUFLdsTable<1, NAME # "_ADDR64_si">;
+ def _LDS_OFFEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>,
+ MUBUFLdsTable<1, NAME # "_OFFEN_si">;
+ def _LDS_IDXEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>,
+ MUBUFLdsTable<1, NAME # "_IDXEN_si">;
+ def _LDS_BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>,
+ MUBUFLdsTable<1, NAME # "_BOTHEN_si">;
+}
+
multiclass MUBUF_Real_Atomic_si<bits<7> op> : MUBUF_Real_AllAddr_si<op> {
def _OFFSET_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>;
def _ADDR64_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64_RTN")>;
@@ -1478,7 +1705,7 @@ multiclass MUBUF_Real_Atomic_si<bits<7> op> : MUBUF_Real_AllAddr_si<op> {
def _BOTHEN_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>;
}
-defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_si <0x00>;
+defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_Lds_si <0x00>;
defm BUFFER_LOAD_FORMAT_XY : MUBUF_Real_AllAddr_si <0x01>;
defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Real_AllAddr_si <0x02>;
defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Real_AllAddr_si <0x03>;
@@ -1486,11 +1713,11 @@ defm BUFFER_STORE_FORMAT_X : MUBUF_Real_AllAddr_si <0x04>;
defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_si <0x05>;
defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_si <0x06>;
defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_si <0x07>;
-defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_si <0x08>;
-defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_si <0x09>;
-defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_si <0x0a>;
-defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_si <0x0b>;
-defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_si <0x0c>;
+defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_Lds_si <0x08>;
+defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_Lds_si <0x09>;
+defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_Lds_si <0x0a>;
+defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_Lds_si <0x0b>;
+defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_Lds_si <0x0c>;
defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_si <0x0d>;
defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_si <0x0e>;
defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_si <0x0f>;
@@ -1575,7 +1802,7 @@ multiclass MTBUF_Real_AllAddr_si<bits<3> op> {
defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_si <0>;
defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_si <1>;
-//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_si <2>;
+defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_si <2>;
defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_si <3>;
defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_si <4>;
defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_si <5>;
@@ -1610,7 +1837,7 @@ class MUBUF_Real_vi <bits<7> op, MUBUF_Pseudo ps> :
let Inst{12} = ps.offen;
let Inst{13} = ps.idxen;
let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
- let Inst{16} = lds;
+ let Inst{16} = !if(ps.lds, 1, 0);
let Inst{17} = !if(ps.has_slc, slc, ?);
let Inst{24-18} = op;
let Inst{31-26} = 0x38; //encoding
@@ -1628,6 +1855,56 @@ multiclass MUBUF_Real_AllAddr_vi<bits<7> op> {
def _BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
}
+multiclass MUBUF_Real_AllAddr_Lds_vi<bits<7> op> {
+
+ def _OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>,
+ MUBUFLdsTable<0, NAME # "_OFFSET_vi">;
+ def _OFFEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>,
+ MUBUFLdsTable<0, NAME # "_OFFEN_vi">;
+ def _IDXEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>,
+ MUBUFLdsTable<0, NAME # "_IDXEN_vi">;
+ def _BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>,
+ MUBUFLdsTable<0, NAME # "_BOTHEN_vi">;
+
+ def _LDS_OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>,
+ MUBUFLdsTable<1, NAME # "_OFFSET_vi">;
+ def _LDS_OFFEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>,
+ MUBUFLdsTable<1, NAME # "_OFFEN_vi">;
+ def _LDS_IDXEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>,
+ MUBUFLdsTable<1, NAME # "_IDXEN_vi">;
+ def _LDS_BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>,
+ MUBUFLdsTable<1, NAME # "_BOTHEN_vi">;
+}
+
+class MUBUF_Real_gfx80 <bits<7> op, MUBUF_Pseudo ps> :
+ MUBUF_Real<op, ps>,
+ Enc64,
+ SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX80> {
+ let AssemblerPredicate=HasUnpackedD16VMem;
+ let DecoderNamespace="GFX80_UNPACKED";
+
+ let Inst{11-0} = !if(ps.has_offset, offset, ?);
+ let Inst{12} = ps.offen;
+ let Inst{13} = ps.idxen;
+ let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
+ let Inst{16} = !if(ps.lds, 1, 0);
+ let Inst{17} = !if(ps.has_slc, slc, ?);
+ let Inst{24-18} = op;
+ let Inst{31-26} = 0x38; //encoding
+ let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
+ let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+ let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
+ let Inst{55} = !if(ps.has_tfe, tfe, ?);
+ let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
+}
+
+multiclass MUBUF_Real_AllAddr_gfx80<bits<7> op> {
+ def _OFFSET_gfx80 : MUBUF_Real_gfx80 <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>;
+ def _OFFEN_gfx80 : MUBUF_Real_gfx80 <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>;
+ def _IDXEN_gfx80 : MUBUF_Real_gfx80 <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>;
+ def _BOTHEN_gfx80 : MUBUF_Real_gfx80 <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
+}
+
multiclass MUBUF_Real_Atomic_vi<bits<7> op> :
MUBUF_Real_AllAddr_vi<op> {
def _OFFSET_RTN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>;
@@ -1636,7 +1913,7 @@ multiclass MUBUF_Real_Atomic_vi<bits<7> op> :
def _BOTHEN_RTN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>;
}
-defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_vi <0x00>;
+defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_Lds_vi <0x00>;
defm BUFFER_LOAD_FORMAT_XY : MUBUF_Real_AllAddr_vi <0x01>;
defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Real_AllAddr_vi <0x02>;
defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Real_AllAddr_vi <0x03>;
@@ -1644,14 +1921,34 @@ defm BUFFER_STORE_FORMAT_X : MUBUF_Real_AllAddr_vi <0x04>;
defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_vi <0x05>;
defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_vi <0x06>;
defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_vi <0x07>;
-defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_vi <0x10>;
-defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_vi <0x11>;
-defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_vi <0x12>;
-defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_vi <0x13>;
-defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_vi <0x14>;
-defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_vi <0x15>;
-defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_vi <0x16>;
-defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_vi <0x17>;
+let SubtargetPredicate = HasUnpackedD16VMem in {
+ defm BUFFER_LOAD_FORMAT_D16_X_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x08>;
+ defm BUFFER_LOAD_FORMAT_D16_XY_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x09>;
+ defm BUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0a>;
+ defm BUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0b>;
+ defm BUFFER_STORE_FORMAT_D16_X_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0c>;
+ defm BUFFER_STORE_FORMAT_D16_XY_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0d>;
+ defm BUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0e>;
+ defm BUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0f>;
+} // End HasUnpackedD16VMem.
+let SubtargetPredicate = HasPackedD16VMem in {
+ defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Real_AllAddr_vi <0x08>;
+ defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Real_AllAddr_vi <0x09>;
+ defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_vi <0x0a>;
+ defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_vi <0x0b>;
+ defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Real_AllAddr_vi <0x0c>;
+ defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Real_AllAddr_vi <0x0d>;
+ defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_vi <0x0e>;
+ defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_vi <0x0f>;
+} // End HasPackedD16VMem.
+defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_Lds_vi <0x10>;
+defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_Lds_vi <0x11>;
+defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_Lds_vi <0x12>;
+defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_Lds_vi <0x13>;
+defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_Lds_vi <0x14>;
+defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_Lds_vi <0x15>;
+defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_Lds_vi <0x16>;
+defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_Lds_vi <0x17>;
defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_vi <0x18>;
defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_vi <0x19>;
defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_vi <0x1a>;
@@ -1668,6 +1965,9 @@ defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Real_AllAddr_vi <0x23>;
defm BUFFER_LOAD_SHORT_D16 : MUBUF_Real_AllAddr_vi <0x24>;
defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Real_AllAddr_vi <0x25>;
+defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_vi <0x26>;
+defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_vi <0x27>;
+
defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomic_vi <0x40>;
defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomic_vi <0x41>;
defm BUFFER_ATOMIC_ADD : MUBUF_Real_Atomic_vi <0x42>;
@@ -1696,6 +1996,8 @@ defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_vi <0x6a>;
defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomic_vi <0x6b>;
defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomic_vi <0x6c>;
+def BUFFER_STORE_LDS_DWORD_vi : MUBUF_Real_vi <0x3d, BUFFER_STORE_LDS_DWORD>;
+
def BUFFER_WBINVL1_vi : MUBUF_Real_vi <0x3e, BUFFER_WBINVL1>;
def BUFFER_WBINVL1_VOL_vi : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>;
@@ -1729,11 +2031,61 @@ multiclass MTBUF_Real_AllAddr_vi<bits<4> op> {
def _BOTHEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>;
}
-defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_vi <0>;
-defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_vi <1>;
-//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <2>;
-defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <3>;
-defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_vi <4>;
-defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_vi <5>;
-defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <6>;
-defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <7>;
+class MTBUF_Real_gfx80 <bits<4> op, MTBUF_Pseudo ps> :
+ MTBUF_Real<ps>,
+ Enc64,
+ SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX80> {
+ let AssemblerPredicate=HasUnpackedD16VMem;
+ let DecoderNamespace="GFX80_UNPACKED";
+
+ let Inst{11-0} = !if(ps.has_offset, offset, ?);
+ let Inst{12} = ps.offen;
+ let Inst{13} = ps.idxen;
+ let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
+ let Inst{18-15} = op;
+ let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value);
+ let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value);
+ let Inst{31-26} = 0x3a; //encoding
+ let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
+ let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+ let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
+ let Inst{54} = !if(ps.has_slc, slc, ?);
+ let Inst{55} = !if(ps.has_tfe, tfe, ?);
+ let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
+}
+
+multiclass MTBUF_Real_AllAddr_gfx80<bits<4> op> {
+ def _OFFSET_gfx80 : MTBUF_Real_gfx80 <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>;
+ def _OFFEN_gfx80 : MTBUF_Real_gfx80 <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>;
+ def _IDXEN_gfx80 : MTBUF_Real_gfx80 <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>;
+ def _BOTHEN_gfx80 : MTBUF_Real_gfx80 <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>;
+}
+
+defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_vi <0x00>;
+defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_vi <0x01>;
+defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <0x02>;
+defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <0x03>;
+defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_vi <0x04>;
+defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_vi <0x05>;
+defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <0x06>;
+defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <0x07>;
+let SubtargetPredicate = HasUnpackedD16VMem in {
+ defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x08>;
+ defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x09>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0a>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0b>;
+ defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0c>;
+ defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0d>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0e>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0f>;
+} // End HasUnpackedD16VMem.
+let SubtargetPredicate = HasPackedD16VMem in {
+ defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Real_AllAddr_vi <0x08>;
+ defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Real_AllAddr_vi <0x09>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_vi <0x0a>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_vi <0x0b>;
+ defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Real_AllAddr_vi <0x0c>;
+ defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Real_AllAddr_vi <0x0d>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_vi <0x0e>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_vi <0x0f>;
+} // End HasPackedD16VMem.
diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt
index 3a8503030414..174b2df15300 100644
--- a/lib/Target/AMDGPU/CMakeLists.txt
+++ b/lib/Target/AMDGPU/CMakeLists.txt
@@ -1,18 +1,33 @@
set(LLVM_TARGET_DEFINITIONS AMDGPU.td)
-tablegen(LLVM AMDGPUGenRegisterInfo.inc -gen-register-info)
-tablegen(LLVM AMDGPUGenInstrInfo.inc -gen-instr-info)
-tablegen(LLVM AMDGPUGenDAGISel.inc -gen-dag-isel)
-tablegen(LLVM AMDGPUGenCallingConv.inc -gen-callingconv)
-tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget)
-tablegen(LLVM AMDGPUGenIntrinsics.inc -gen-tgt-intrinsic)
-tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter)
-tablegen(LLVM AMDGPUGenDFAPacketizer.inc -gen-dfa-packetizer)
-tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer)
tablegen(LLVM AMDGPUGenAsmMatcher.inc -gen-asm-matcher)
+tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer)
+tablegen(LLVM AMDGPUGenCallingConv.inc -gen-callingconv)
+tablegen(LLVM AMDGPUGenDAGISel.inc -gen-dag-isel)
tablegen(LLVM AMDGPUGenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM AMDGPUGenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM AMDGPUGenIntrinsicEnums.inc -gen-tgt-intrinsic-enums)
+tablegen(LLVM AMDGPUGenIntrinsicImpl.inc -gen-tgt-intrinsic-impl)
+tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter)
tablegen(LLVM AMDGPUGenMCPseudoLowering.inc -gen-pseudo-lowering)
tablegen(LLVM AMDGPUGenRegisterBank.inc -gen-register-bank)
+tablegen(LLVM AMDGPUGenRegisterInfo.inc -gen-register-info)
+tablegen(LLVM AMDGPUGenSearchableTables.inc -gen-searchable-tables)
+tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget)
+
+set(LLVM_TARGET_DEFINITIONS AMDGPUGISel.td)
+tablegen(LLVM AMDGPUGenGlobalISel.inc -gen-global-isel)
+
+set(LLVM_TARGET_DEFINITIONS R600.td)
+tablegen(LLVM R600GenAsmWriter.inc -gen-asm-writer)
+tablegen(LLVM R600GenCallingConv.inc -gen-callingconv)
+tablegen(LLVM R600GenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM R600GenDFAPacketizer.inc -gen-dfa-packetizer)
+tablegen(LLVM R600GenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM R600GenMCCodeEmitter.inc -gen-emitter)
+tablegen(LLVM R600GenRegisterInfo.inc -gen-register-info)
+tablegen(LLVM R600GenSubtargetInfo.inc -gen-subtarget)
+
add_public_tablegen_target(AMDGPUCommonTableGen)
add_llvm_target(AMDGPUCodeGen
@@ -25,6 +40,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUCallLowering.cpp
AMDGPUCodeGenPrepare.cpp
AMDGPUFrameLowering.cpp
+ AMDGPUHSAMetadataStreamer.cpp
AMDGPUInstrInfo.cpp
AMDGPUInstructionSelector.cpp
AMDGPUIntrinsicInfo.cpp
@@ -34,13 +50,14 @@ add_llvm_target(AMDGPUCodeGen
AMDGPULibCalls.cpp
AMDGPULibFunc.cpp
AMDGPULowerIntrinsics.cpp
+ AMDGPULowerKernelArguments.cpp
+ AMDGPULowerKernelAttributes.cpp
AMDGPUMachineCFGStructurizer.cpp
AMDGPUMachineFunction.cpp
AMDGPUMachineModuleInfo.cpp
AMDGPUMacroFusion.cpp
AMDGPUMCInstLower.cpp
AMDGPUOpenCLEnqueuedBlockLowering.cpp
- AMDGPUOpenCLImageTypeLoweringPass.cpp
AMDGPUPromoteAlloca.cpp
AMDGPURegAsmNames.inc.cpp
AMDGPURegisterBankInfo.cpp
@@ -53,12 +70,14 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUUnifyDivergentExitNodes.cpp
AMDGPUUnifyMetadata.cpp
AMDGPUInline.cpp
+ AMDGPUPerfHintAnalysis.cpp
AMDILCFGStructurizer.cpp
GCNHazardRecognizer.cpp
GCNIterativeScheduler.cpp
GCNMinRegStrategy.cpp
GCNRegPressure.cpp
GCNSchedStrategy.cpp
+ R600AsmPrinter.cpp
R600ClauseMergePass.cpp
R600ControlFlowFinalizer.cpp
R600EmitClauseMarkers.cpp
@@ -68,6 +87,7 @@ add_llvm_target(AMDGPUCodeGen
R600ISelLowering.cpp
R600MachineFunctionInfo.cpp
R600MachineScheduler.cpp
+ R600OpenCLImageTypeLoweringPass.cpp
R600OptimizeVectorRegisters.cpp
R600Packetizer.cpp
R600RegisterInfo.cpp
@@ -77,10 +97,10 @@ add_llvm_target(AMDGPUCodeGen
SIFixVGPRCopies.cpp
SIFixWWMLiveness.cpp
SIFoldOperands.cpp
+ SIFormMemoryClauses.cpp
SIFrameLowering.cpp
SIInsertSkips.cpp
SIInsertWaitcnts.cpp
- SIInsertWaits.cpp
SIInstrInfo.cpp
SIISelLowering.cpp
SILoadStoreOptimizer.cpp
@@ -99,8 +119,8 @@ add_llvm_target(AMDGPUCodeGen
)
add_subdirectory(AsmParser)
-add_subdirectory(InstPrinter)
add_subdirectory(Disassembler)
-add_subdirectory(TargetInfo)
+add_subdirectory(InstPrinter)
add_subdirectory(MCTargetDesc)
+add_subdirectory(TargetInfo)
add_subdirectory(Utils)
diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td
index f898fd7948cc..cdc6ab9412e6 100644
--- a/lib/Target/AMDGPU/DSInstructions.td
+++ b/lib/Target/AMDGPU/DSInstructions.td
@@ -440,7 +440,7 @@ defm DS_XOR_RTN_B32 : DS_1A1D_RET_mc<"ds_xor_rtn_b32", VGPR_32, "ds_xor_b32">;
defm DS_MSKOR_RTN_B32 : DS_1A2D_RET_mc<"ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">;
defm DS_CMPST_RTN_B32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">;
defm DS_CMPST_RTN_F32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">;
-defm DS_MIN_RTN_F32 : DS_1A1D_RET_mc <"ds_min_rtn_f32", VGPR_32, "ds_min_f32">;
+defm DS_MIN_RTN_F32 : DS_1A1D_RET_mc<"ds_min_rtn_f32", VGPR_32, "ds_min_f32">;
defm DS_MAX_RTN_F32 : DS_1A1D_RET_mc<"ds_max_rtn_f32", VGPR_32, "ds_max_f32">;
defm DS_WRXCHG_RTN_B32 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b32">;
@@ -584,6 +584,8 @@ def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32",
int_amdgcn_ds_bpermute>;
}
+def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">;
+
} // let SubtargetPredicate = isVI
//===----------------------------------------------------------------------===//
@@ -600,8 +602,6 @@ class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
(inst $ptr, (as_i16imm $offset), (i1 0))
>;
-// FIXME: Passing name of PatFrag in workaround. Why doesn't
-// !cast<PatFrag>(frag.NAME#"_m0") work!?
multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
let OtherPredicates = [LDSRequiresM0Init] in {
@@ -609,7 +609,7 @@ multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
- def : DSReadPat<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+ def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
}
}
@@ -647,14 +647,17 @@ defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">;
defm : DSReadPat_mc <DS_READ_U16, i32, "az_extloadi16_local">;
defm : DSReadPat_mc <DS_READ_U16, i16, "load_local">;
defm : DSReadPat_mc <DS_READ_B32, i32, "load_local">;
+defm : DSReadPat_mc <DS_READ_B32, i32, "atomic_load_32_local">;
+defm : DSReadPat_mc <DS_READ_B64, i64, "atomic_load_64_local">;
let AddedComplexity = 100 in {
defm : DSReadPat_mc <DS_READ_B64, v2i32, "load_align8_local">;
+defm : DSReadPat_mc <DS_READ_B128, v4i32, "load_align16_local">;
} // End AddedComplexity = 100
-let OtherPredicates = [HasD16LoadStore] in {
+let OtherPredicates = [D16PreservesUnusedBits] in {
let AddedComplexity = 100 in {
defm : DSReadPat_Hi16<DS_READ_U16_D16_HI, load_local>;
defm : DSReadPat_Hi16<DS_READ_U8_D16_HI, az_extloadi8_local>;
@@ -678,7 +681,24 @@ multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
- def : DSWritePat<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+ def : DSWritePat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+ }
+}
+
+// Irritatingly, atomic_store reverses the order of operands from a
+// normal store.
+class DSAtomicWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
+ (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
+ (inst $ptr, $value, (as_i16imm $offset), (i1 0))
+>;
+
+multiclass DSAtomicWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
+ let OtherPredicates = [LDSRequiresM0Init] in {
+ def : DSAtomicWritePat<inst, vt, !cast<PatFrag>(frag#"_m0")>;
+ }
+
+ let OtherPredicates = [NotLDSRequiresM0Init] in {
+ def : DSAtomicWritePat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
}
}
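A short gloss on the "reversed" order mentioned above: a plain store node carries (value, addr), so the ordinary DSWritePat (defined earlier in this file) matches the value first, while atomic_store carries (addr, value); DSAtomicWritePat therefore matches the DS1Addr1Offset address first and re-emits '(inst $ptr, $value, ...)' so the selected DS_WRITE_* instruction still receives its operands in the usual order.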
@@ -687,8 +707,10 @@ defm : DSWritePat_mc <DS_WRITE_B16, i32, "truncstorei16_local">;
defm : DSWritePat_mc <DS_WRITE_B8, i16, "truncstorei8_local">;
defm : DSWritePat_mc <DS_WRITE_B16, i16, "store_local">;
defm : DSWritePat_mc <DS_WRITE_B32, i32, "store_local">;
+defm : DSAtomicWritePat_mc <DS_WRITE_B32, i32, "atomic_store_local">;
+defm : DSAtomicWritePat_mc <DS_WRITE_B64, i64, "atomic_store_local">;
-let OtherPredicates = [HasD16LoadStore] in {
+let OtherPredicates = [D16PreservesUnusedBits] in {
def : DSWritePat <DS_WRITE_B16_D16_HI, i32, store_local_hi16>;
def : DSWritePat <DS_WRITE_B8_D16_HI, i32, truncstorei8_local_hi16>;
}
@@ -720,6 +742,8 @@ def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32_gfx9, store_local>;
let AddedComplexity = 100 in {
defm : DSWritePat_mc <DS_WRITE_B64, v2i32, "store_align8_local">;
+defm : DSWritePat_mc <DS_WRITE_B128, v4i32, "store_align16_local">;
+
} // End AddedComplexity = 100
class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
@@ -732,7 +756,8 @@ multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
- def : DSAtomicRetPat<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+ def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
+ !cast<PatFrag>(frag)>;
}
}
@@ -749,7 +774,8 @@ multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, ValueType vt, string frag> {
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
- def : DSAtomicCmpXChg<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+ def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
+ !cast<PatFrag>(frag)>;
}
}
@@ -769,6 +795,9 @@ defm : DSAtomicRetPat_mc<DS_MAX_RTN_I32, i32, "atomic_load_max_local">;
defm : DSAtomicRetPat_mc<DS_MIN_RTN_U32, i32, "atomic_load_umin_local">;
defm : DSAtomicRetPat_mc<DS_MAX_RTN_U32, i32, "atomic_load_umax_local">;
defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B32, i32, "atomic_cmp_swap_local">;
+defm : DSAtomicRetPat_mc<DS_MIN_RTN_F32, f32, "atomic_load_fmin_local">;
+defm : DSAtomicRetPat_mc<DS_MAX_RTN_F32, f32, "atomic_load_fmax_local">;
+defm : DSAtomicRetPat_mc<DS_ADD_RTN_F32, f32, "atomic_load_fadd_local">;
// 64-bit atomics.
defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B64, i64, "atomic_swap_local">;
@@ -1123,6 +1152,7 @@ def DS_XOR_SRC2_B32_vi : DS_Real_vi<0x8b, DS_XOR_SRC2_B32>;
def DS_WRITE_SRC2_B32_vi : DS_Real_vi<0x8d, DS_WRITE_SRC2_B32>;
def DS_MIN_SRC2_F32_vi : DS_Real_vi<0x92, DS_MIN_SRC2_F32>;
def DS_MAX_SRC2_F32_vi : DS_Real_vi<0x93, DS_MAX_SRC2_F32>;
+def DS_ADD_SRC2_F32_vi : DS_Real_vi<0x95, DS_ADD_SRC2_F32>;
def DS_ADD_SRC2_U64_vi : DS_Real_vi<0xc0, DS_ADD_SRC2_U64>;
def DS_SUB_SRC2_U64_vi : DS_Real_vi<0xc1, DS_SUB_SRC2_U64>;
def DS_RSUB_SRC2_U64_vi : DS_Real_vi<0xc2, DS_RSUB_SRC2_U64>;
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 47a2d3f2fdc5..f3de903f21b2 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -20,7 +20,9 @@
#include "Disassembler/AMDGPUDisassembler.h"
#include "AMDGPU.h"
#include "AMDGPURegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm-c/Disassembler.h"
#include "llvm/ADT/APInt.h"
@@ -198,6 +200,21 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Res = tryDecodeInst(DecoderTableSDWA964, MI, QW, Address);
if (Res) { IsSDWA = true; break; }
+
+ if (STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem]) {
+ Res = tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address);
+ if (Res)
+ break;
+ }
+
+ // Some GFX9 subtargets repurposed the v_mad_mix_f32, v_mad_mixlo_f16 and
+ // v_mad_mixhi_f16 for FMA variants. Try to decode using this special
+ // table first so we print the correct name.
+ if (STI.getFeatureBits()[AMDGPU::FeatureFmaMixInsts]) {
+ Res = tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address);
+ if (Res)
+ break;
+ }
}
// Reinitialize Bytes as DPP64 could have eaten too much
@@ -228,7 +245,8 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
if (Res && (MI.getOpcode() == AMDGPU::V_MAC_F32_e64_vi ||
MI.getOpcode() == AMDGPU::V_MAC_F32_e64_si ||
- MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi)) {
+ MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi ||
+ MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_vi)) {
// Insert dummy unused src2_modifiers.
insertNamedMCOperand(MI, MCOperand::createImm(0),
AMDGPU::OpName::src2_modifiers);
@@ -241,7 +259,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
if (Res && IsSDWA)
Res = convertSDWAInst(MI);
- Size = Res ? (MaxInstBytesNum - Bytes.size()) : 0;
+ // If the opcode was not recognized, assume a Size of 4 bytes
+ // (unless fewer bytes are left).
+ Size = Res ? (MaxInstBytesNum - Bytes.size())
+ : std::min((size_t)4, Bytes_.size());
return Res;
}
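
The Size fallback above matters to callers that advance through a byte buffer by the reported Size. A minimal sketch of such a driver loop follows; the overload used (two raw_ostream sinks) and all names are assumptions based on the 2018-era MCDisassembler interface, not part of this patch.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCInst.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Walk a buffer, decoding one instruction at a time. If an undecodable
// opcode reported Size == 0, Idx would never advance; reporting
// min(4, bytes left) lets the loop skip the unknown word and resume.
static void disassembleAll(const MCDisassembler &Disasm,
                           ArrayRef<uint8_t> Bytes, uint64_t Addr) {
  uint64_t Idx = 0;
  while (Idx < Bytes.size()) {
    MCInst Inst;
    uint64_t Size = 0;
    MCDisassembler::DecodeStatus S =
        Disasm.getInstruction(Inst, Size, Bytes.slice(Idx), Addr + Idx,
                              nulls(), nulls());
    if (S == MCDisassembler::Fail && Size == 0)
      break; // defensive: should not happen once the fallback is in place
    Idx += Size;
  }
}
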
@@ -264,26 +285,70 @@ DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
return MCDisassembler::Success;
}
+// Note that MIMG format provides no information about VADDR size.
+// Consequently, decoded instructions always show address
+// as if it has 1 dword, which may not actually be the case.
DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
+
+ int VDstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::vdst);
+
int VDataIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::vdata);
int DMaskIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::dmask);
+
+ int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::tfe);
+ int D16Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::d16);
+
+ assert(VDataIdx != -1);
+ assert(DMaskIdx != -1);
+ assert(TFEIdx != -1);
+
+ bool IsAtomic = (VDstIdx != -1);
+ bool IsGather4 = MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::Gather4;
+
unsigned DMask = MI.getOperand(DMaskIdx).getImm() & 0xf;
if (DMask == 0)
return MCDisassembler::Success;
- unsigned ChannelCount = countPopulation(DMask);
- if (ChannelCount == 1)
+ unsigned DstSize = IsGather4 ? 4 : countPopulation(DMask);
+ if (DstSize == 1)
return MCDisassembler::Success;
- int NewOpcode = AMDGPU::getMaskedMIMGOp(*MCII, MI.getOpcode(), ChannelCount);
- assert(NewOpcode != -1 && "could not find matching mimg channel instruction");
+ bool D16 = D16Idx >= 0 && MI.getOperand(D16Idx).getImm();
+ if (D16 && AMDGPU::hasPackedD16(STI)) {
+ DstSize = (DstSize + 1) / 2;
+ }
+
+ // FIXME: Add tfe support
+ if (MI.getOperand(TFEIdx).getImm())
+ return MCDisassembler::Success;
+
+ int NewOpcode = -1;
+
+ if (IsGather4) {
+ if (D16 && AMDGPU::hasPackedD16(STI))
+ NewOpcode = AMDGPU::getMaskedMIMGOp(MI.getOpcode(), 2);
+ else
+ return MCDisassembler::Success;
+ } else {
+ NewOpcode = AMDGPU::getMaskedMIMGOp(MI.getOpcode(), DstSize);
+ if (NewOpcode == -1)
+ return MCDisassembler::Success;
+ }
+
auto RCID = MCII->get(NewOpcode).OpInfo[VDataIdx].RegClass;
- // Widen the register to the correct number of enabled channels.
+ // Get first subregister of VData
unsigned Vdata0 = MI.getOperand(VDataIdx).getReg();
+ unsigned VdataSub0 = MRI.getSubReg(Vdata0, AMDGPU::sub0);
+ Vdata0 = (VdataSub0 != 0)? VdataSub0 : Vdata0;
+
+ // Widen the register to the correct number of enabled channels.
auto NewVdata = MRI.getMatchingSuperReg(Vdata0, AMDGPU::sub0,
&MRI.getRegClass(RCID));
if (NewVdata == AMDGPU::NoRegister) {
@@ -297,6 +362,12 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
// how it is usually emitted because the number of register components is not
// in the instruction encoding.
MI.getOperand(VDataIdx) = MCOperand::createReg(NewVdata);
+
+ if (IsAtomic) {
+ // Atomic operations have an additional operand (a copy of data)
+ MI.getOperand(VDstIdx) = MCOperand::createReg(NewVdata);
+ }
+
return MCDisassembler::Success;
}
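
The dmask handling introduced above, restated as a standalone helper (the function and its name are illustrative only): gather4 always produces four channels, otherwise the width is the population count of the low four dmask bits, halved (rounded up) when packed D16 is in use.

#include "llvm/Support/MathExtras.h"

// Destination width in dwords implied by a MIMG dmask (sketch).
static unsigned mimgDstDwords(unsigned DMask, bool IsGather4, bool PackedD16) {
  unsigned Dwords = IsGather4 ? 4 : llvm::countPopulation(DMask & 0xf);
  if (PackedD16)                  // two 16-bit components share one dword
    Dwords = (Dwords + 1) / 2;
  return Dwords;
}
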
@@ -690,9 +761,8 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
switch (Val) {
case 102: return createRegOperand(FLAT_SCR_LO);
case 103: return createRegOperand(FLAT_SCR_HI);
- // ToDo: no support for xnack_mask_lo/_hi register
- case 104:
- case 105: break;
+ case 104: return createRegOperand(XNACK_MASK_LO);
+ case 105: return createRegOperand(XNACK_MASK_HI);
case 106: return createRegOperand(VCC_LO);
case 107: return createRegOperand(VCC_HI);
case 108: assert(!isGFX9()); return createRegOperand(TBA_LO);
@@ -722,6 +792,7 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
switch (Val) {
case 102: return createRegOperand(FLAT_SCR);
+ case 104: return createRegOperand(XNACK_MASK);
case 106: return createRegOperand(VCC);
case 108: assert(!isGFX9()); return createRegOperand(TBA);
case 110: assert(!isGFX9()); return createRegOperand(TMA);
@@ -732,8 +803,9 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
}
MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width,
- unsigned Val) const {
+ const unsigned Val) const {
using namespace AMDGPU::SDWA;
+ using namespace AMDGPU::EncValues;
if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) {
// XXX: static_cast<int> is needed to avoid stupid warning:
@@ -754,7 +826,15 @@ MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width,
Val - SDWA9EncValues::SRC_TTMP_MIN);
}
- return decodeSpecialReg32(Val - SDWA9EncValues::SRC_SGPR_MIN);
+ const unsigned SVal = Val - SDWA9EncValues::SRC_SGPR_MIN;
+
+ if (INLINE_INTEGER_C_MIN <= SVal && SVal <= INLINE_INTEGER_C_MAX)
+ return decodeIntImmed(SVal);
+
+ if (INLINE_FLOATING_C_MIN <= SVal && SVal <= INLINE_FLOATING_C_MAX)
+ return decodeFPImmed(Width, SVal);
+
+ return decodeSpecialReg32(SVal);
} else if (STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]) {
return createRegOperand(getVgprClassId(Width), Val);
}
@@ -815,6 +895,9 @@ bool AMDGPUSymbolizer::tryAddingSymbolicOperand(MCInst &Inst,
}
auto *Symbols = static_cast<SectionSymbolsTy *>(DisInfo);
+ if (!Symbols)
+ return false;
+
auto Result = std::find_if(Symbols->begin(), Symbols->end(),
[Value](const SymbolInfoTy& Val) {
return std::get<0>(Val) == static_cast<uint64_t>(Value)
diff --git a/lib/Target/AMDGPU/EvergreenInstructions.td b/lib/Target/AMDGPU/EvergreenInstructions.td
index 5e26f97b0c86..944f4ffe598d 100644
--- a/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -15,7 +15,6 @@
def isEG : Predicate<
"Subtarget->getGeneration() >= AMDGPUSubtarget::EVERGREEN && "
- "Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS && "
"!Subtarget->hasCaymanISA()"
>;
@@ -693,7 +692,7 @@ def : EGOrCaymanPat<(fp_to_sint f32:$src0), (FLT_TO_INT_eg (TRUNC $src0))>;
def : EGOrCaymanPat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>;
// SHA-256 Patterns
-def : SHA256MaPattern <BFI_INT_eg, XOR_INT>;
+defm : SHA256MaPattern <BFI_INT_eg, XOR_INT, R600_Reg64>;
def EG_ExportSwz : ExportSwzInst {
let Word1{19-16} = 0; // BURST_COUNT
diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td
index 693869128081..3ef473b7fd96 100644
--- a/lib/Target/AMDGPU/FLATInstructions.td
+++ b/lib/Target/AMDGPU/FLATInstructions.td
@@ -135,7 +135,7 @@ class FLAT_Load_Pseudo <string opName, RegisterClass regClass,
!con((ins VReg_64:$vaddr),
!if(EnableSaddr, (ins SReg_64:$saddr), (ins))),
(ins !if(HasSignedOffset,offset_s13,offset_u12):$offset)),
- (ins GLC:$glc, slc:$slc)),
+ (ins GLC:$glc, SLC:$slc)),
!if(HasTiedOutput, (ins regClass:$vdst_in), (ins))),
" $vdst, $vaddr"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc"> {
let has_data = 0;
@@ -158,7 +158,7 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
!con((ins VReg_64:$vaddr, vdataClass:$vdata),
!if(EnableSaddr, (ins SReg_64:$saddr), (ins))),
(ins !if(HasSignedOffset,offset_s13,offset_u12):$offset)),
- (ins GLC:$glc, slc:$slc)),
+ (ins GLC:$glc, SLC:$slc)),
" $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc"> {
let mayLoad = 0;
let mayStore = 1;
@@ -188,8 +188,8 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass,
opName,
(outs regClass:$vdst),
!if(EnableSaddr,
- (ins SReg_32_XEXEC_HI:$saddr, offset_s13:$offset, GLC:$glc, slc:$slc),
- (ins VGPR_32:$vaddr, offset_s13:$offset, GLC:$glc, slc:$slc)),
+ (ins SReg_32_XEXEC_HI:$saddr, offset_s13:$offset, GLC:$glc, SLC:$slc),
+ (ins VGPR_32:$vaddr, offset_s13:$offset, GLC:$glc, SLC:$slc)),
" $vdst, "#!if(EnableSaddr, "off", "$vaddr")#!if(EnableSaddr, ", $saddr", ", off")#"$offset$glc$slc"> {
let has_data = 0;
let mayLoad = 1;
@@ -204,8 +204,8 @@ class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit En
opName,
(outs),
!if(EnableSaddr,
- (ins vdataClass:$vdata, SReg_32_XEXEC_HI:$saddr, offset_s13:$offset, GLC:$glc, slc:$slc),
- (ins vdataClass:$vdata, VGPR_32:$vaddr, offset_s13:$offset, GLC:$glc, slc:$slc)),
+ (ins vdataClass:$vdata, SReg_32_XEXEC_HI:$saddr, offset_s13:$offset, GLC:$glc, SLC:$slc),
+ (ins vdataClass:$vdata, VGPR_32:$vaddr, offset_s13:$offset, GLC:$glc, SLC:$slc)),
" "#!if(EnableSaddr, "off", "$vaddr")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc"> {
let mayLoad = 0;
let mayStore = 1;
@@ -260,7 +260,7 @@ multiclass FLAT_Atomic_Pseudo<
RegisterClass data_rc = vdst_rc> {
def "" : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
- (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, slc:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, SLC:$slc),
" $vaddr, $vdata$offset$slc">,
AtomicNoRet <opName, 0> {
let PseudoInstr = NAME;
@@ -268,7 +268,7 @@ multiclass FLAT_Atomic_Pseudo<
def _RTN : FLAT_AtomicRet_Pseudo <opName,
(outs vdst_rc:$vdst),
- (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, slc:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, SLC:$slc),
" $vdst, $vaddr, $vdata$offset glc$slc",
[(set vt:$vdst,
(atomic (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
@@ -285,7 +285,7 @@ multiclass FLAT_Global_Atomic_Pseudo<
def "" : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
- (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, slc:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc),
" $vaddr, $vdata, off$offset$slc">,
AtomicNoRet <opName, 0> {
let has_saddr = 1;
@@ -294,7 +294,7 @@ multiclass FLAT_Global_Atomic_Pseudo<
def _RTN : FLAT_AtomicRet_Pseudo <opName,
(outs vdst_rc:$vdst),
- (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, slc:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc),
" $vdst, $vaddr, $vdata, off$offset glc$slc",
[(set vt:$vdst,
(atomic (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
@@ -304,7 +304,7 @@ multiclass FLAT_Global_Atomic_Pseudo<
def _SADDR : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
- (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, slc:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc),
" $vaddr, $vdata, $saddr$offset$slc">,
AtomicNoRet <opName#"_saddr", 0> {
let has_saddr = 1;
@@ -314,7 +314,7 @@ multiclass FLAT_Global_Atomic_Pseudo<
def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName,
(outs vdst_rc:$vdst),
- (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, slc:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc),
" $vdst, $vaddr, $vdata, $saddr$offset glc$slc">,
AtomicNoRet <opName#"_saddr", 1> {
let has_saddr = 1;
@@ -780,7 +780,7 @@ def : FlatAtomicPat <FLAT_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>;
def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>;
-let OtherPredicates = [HasD16LoadStore] in {
+let OtherPredicates = [D16PreservesUnusedBits] in {
def : FlatStorePat <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>;
def : FlatStorePat <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>;
@@ -824,7 +824,7 @@ def : FlatStoreSignedPat <GLOBAL_STORE_DWORD, store_global, i32>;
def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX2, store_global, v2i32>;
def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX4, store_global, v4i32>;
-let OtherPredicates = [HasD16LoadStore] in {
+let OtherPredicates = [D16PreservesUnusedBits] in {
def : FlatStoreSignedPat <GLOBAL_STORE_SHORT_D16_HI, truncstorei16_hi16_global, i32>;
def : FlatStoreSignedPat <GLOBAL_STORE_BYTE_D16_HI, truncstorei8_hi16_global, i32>;
diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index dd515b0bf2f1..f236f10ba75a 100644
--- a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -16,6 +16,7 @@
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -39,7 +40,7 @@ using namespace llvm;
GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
CurrCycleInstr(nullptr),
MF(MF),
- ST(MF.getSubtarget<SISubtarget>()),
+ ST(MF.getSubtarget<GCNSubtarget>()),
TII(*ST.getInstrInfo()),
TRI(TII.getRegisterInfo()),
ClauseUses(TRI.getNumRegUnits()),
@@ -355,13 +356,13 @@ int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
}
int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
int WaitStatesNeeded = 0;
WaitStatesNeeded = checkSoftClauseHazards(SMRD);
// This SMRD hazard only affects SI.
- if (ST.getGeneration() != SISubtarget::SOUTHERN_ISLANDS)
+ if (ST.getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS)
return WaitStatesNeeded;
// A read of an SGPR by SMRD instruction requires 4 wait states when the
@@ -398,7 +399,7 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
}
int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
- if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+ if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
return 0;
int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
@@ -634,7 +635,7 @@ int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
}
int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) {
- if (MI->isDebugValue())
+ if (MI->isDebugInstr())
return 0;
const SIRegisterInfo *TRI = ST.getRegisterInfo();
diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.h b/lib/Target/AMDGPU/GCNHazardRecognizer.h
index f9a6e395a454..ca17e7cb6018 100644
--- a/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -28,7 +28,7 @@ class MachineRegisterInfo;
class ScheduleDAG;
class SIInstrInfo;
class SIRegisterInfo;
-class SISubtarget;
+class GCNSubtarget;
class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
// This variable stores the instruction that has been emitted this cycle. It
@@ -37,7 +37,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
MachineInstr *CurrCycleInstr;
std::list<MachineInstr*> EmittedInstrs;
const MachineFunction &MF;
- const SISubtarget &ST;
+ const GCNSubtarget &ST;
const SIInstrInfo &TII;
const SIRegisterInfo &TRI;
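
The two renames running through this file (and the scheduler files below) are mechanical: SISubtarget becomes GCNSubtarget, and isDebugValue() becomes the broader isDebugInstr(), which also skips other debug-only pseudo instructions such as DBG_LABEL. A minimal sketch of the new usage pattern; the helper functions are illustrative, not from the patch.

#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
using namespace llvm;

static bool runsOnSI(const MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  // Generation checks now go through the AMDGPUSubtarget enum.
  return ST.getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS;
}

static unsigned countRealInstrs(const MachineBasicBlock &MBB) {
  unsigned N = 0;
  for (const MachineInstr &MI : MBB)
    if (!MI.isDebugInstr())   // skips DBG_VALUE, DBG_LABEL, ...
      ++N;
  return N;
}
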
diff --git a/lib/Target/AMDGPU/GCNILPSched.cpp b/lib/Target/AMDGPU/GCNILPSched.cpp
index ba8211b189cf..651091d44136 100644
--- a/lib/Target/AMDGPU/GCNILPSched.cpp
+++ b/lib/Target/AMDGPU/GCNILPSched.cpp
@@ -149,9 +149,9 @@ static int BUCompareLatency(const SUnit *left, const SUnit *right) {
int LDepth = left->getDepth();
int RDepth = right->getDepth();
if (LDepth != RDepth) {
- DEBUG(dbgs() << " Comparing latency of SU (" << left->NodeNum
- << ") depth " << LDepth << " vs SU (" << right->NodeNum
- << ") depth " << RDepth << "\n");
+ LLVM_DEBUG(dbgs() << " Comparing latency of SU (" << left->NodeNum
+ << ") depth " << LDepth << " vs SU (" << right->NodeNum
+ << ") depth " << RDepth << "\n");
return LDepth < RDepth ? 1 : -1;
}
if (left->Latency != right->Latency)
@@ -169,9 +169,9 @@ const SUnit *GCNILPScheduler::pickBest(const SUnit *left, const SUnit *right)
if (!DisableSchedCriticalPath) {
int spread = (int)left->getDepth() - (int)right->getDepth();
if (std::abs(spread) > MaxReorderWindow) {
- DEBUG(dbgs() << "Depth of SU(" << left->NodeNum << "): "
- << left->getDepth() << " != SU(" << right->NodeNum << "): "
- << right->getDepth() << "\n");
+ LLVM_DEBUG(dbgs() << "Depth of SU(" << left->NodeNum << "): "
+ << left->getDepth() << " != SU(" << right->NodeNum
+ << "): " << right->getDepth() << "\n");
return left->getDepth() < right->getDepth() ? right : left;
}
}
@@ -324,19 +324,18 @@ GCNILPScheduler::schedule(ArrayRef<const SUnit*> BotRoots,
if (AvailQueue.empty())
break;
- DEBUG(
- dbgs() << "\n=== Picking candidate\n"
- "Ready queue:";
- for (auto &C : AvailQueue)
- dbgs() << ' ' << C.SU->NodeNum;
- dbgs() << '\n';
- );
+ LLVM_DEBUG(dbgs() << "\n=== Picking candidate\n"
+ "Ready queue:";
+ for (auto &C
+ : AvailQueue) dbgs()
+ << ' ' << C.SU->NodeNum;
+ dbgs() << '\n';);
auto C = pickCandidate();
assert(C);
AvailQueue.remove(*C);
auto SU = C->SU;
- DEBUG(dbgs() << "Selected "; SU->dump(&DAG));
+ LLVM_DEBUG(dbgs() << "Selected "; SU->dump(&DAG));
advanceToCycle(SU->getHeight());
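
The DEBUG -> LLVM_DEBUG rewrites in this and the following files are the tree-wide rename of the debug-output macro; the new name avoids clashing with other projects' DEBUG macros. A small sketch of the idiom, assuming a file-local DEBUG_TYPE (the tag value here is illustrative):

#define DEBUG_TYPE "machine-scheduler"   // illustrative tag
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

static void reportPick(unsigned NodeNum, unsigned Depth) {
  // Compiled out in release builds; enabled by -debug or
  // -debug-only=machine-scheduler in assert-enabled builds.
  LLVM_DEBUG(dbgs() << "Selected SU(" << NodeNum << ") depth " << Depth
                    << '\n');
}
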
diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index a0e4f7ff24cb..15366d66bd85 100644
--- a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -11,6 +11,7 @@
#include "AMDGPUSubtarget.h"
#include "GCNRegPressure.h"
#include "GCNSchedStrategy.h"
+#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
@@ -19,6 +20,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -68,14 +70,14 @@ static void printRegion(raw_ostream &OS,
auto I = Begin;
MaxInstNum = std::max(MaxInstNum, 1u);
for (; I != End && MaxInstNum; ++I, --MaxInstNum) {
- if (!I->isDebugValue() && LIS)
+ if (!I->isDebugInstr() && LIS)
OS << LIS->getInstructionIndex(*I);
OS << '\t' << *I;
}
if (I != End) {
OS << "\t...\n";
I = std::prev(End);
- if (!I->isDebugValue() && LIS)
+ if (!I->isDebugInstr() && LIS)
OS << LIS->getInstructionIndex(*I);
OS << '\t' << *I;
}
@@ -106,7 +108,7 @@ static void printLivenessInfo(raw_ostream &OS,
LLVM_DUMP_METHOD
void GCNIterativeScheduler::printRegions(raw_ostream &OS) const {
- const auto &ST = MF.getSubtarget<SISubtarget>();
+ const auto &ST = MF.getSubtarget<GCNSubtarget>();
for (const auto R : Regions) {
OS << "Region to schedule ";
printRegion(OS, R->Begin, R->End, LIS, 1);
@@ -130,7 +132,7 @@ LLVM_DUMP_METHOD
void GCNIterativeScheduler::printSchedRP(raw_ostream &OS,
const GCNRegPressure &Before,
const GCNRegPressure &After) const {
- const auto &ST = MF.getSubtarget<SISubtarget>();
+ const auto &ST = MF.getSubtarget<GCNSubtarget>();
OS << "RP before: ";
Before.print(OS, &ST);
OS << "RP after: ";
@@ -199,8 +201,8 @@ public:
void schedule() {
assert(Sch.RegionBegin == Rgn.Begin && Sch.RegionEnd == Rgn.End);
- DEBUG(dbgs() << "\nScheduling ";
- printRegion(dbgs(), Rgn.Begin, Rgn.End, Sch.LIS, 2));
+ LLVM_DEBUG(dbgs() << "\nScheduling ";
+ printRegion(dbgs(), Rgn.Begin, Rgn.End, Sch.LIS, 2));
Sch.BaseClass::schedule();
// Unfortunatelly placeDebugValues incorrectly modifies RegionEnd, restore
@@ -310,14 +312,13 @@ void GCNIterativeScheduler::enterRegion(MachineBasicBlock *BB, // overriden
void GCNIterativeScheduler::schedule() { // overriden
// do nothing
- DEBUG(
- printLivenessInfo(dbgs(), RegionBegin, RegionEnd, LIS);
- if (!Regions.empty() && Regions.back()->Begin == RegionBegin) {
- dbgs() << "Max RP: ";
- Regions.back()->MaxPressure.print(dbgs(), &MF.getSubtarget<SISubtarget>());
- }
- dbgs() << '\n';
- );
+ LLVM_DEBUG(printLivenessInfo(dbgs(), RegionBegin, RegionEnd, LIS);
+ if (!Regions.empty() && Regions.back()->Begin == RegionBegin) {
+ dbgs() << "Max RP: ";
+ Regions.back()->MaxPressure.print(
+ dbgs(), &MF.getSubtarget<GCNSubtarget>());
+ } dbgs()
+ << '\n';);
}
void GCNIterativeScheduler::finalizeSchedule() { // overriden
@@ -383,10 +384,10 @@ void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule,
if (MI != &*Top) {
BB->remove(MI);
BB->insert(Top, MI);
- if (!MI->isDebugValue())
+ if (!MI->isDebugInstr())
LIS->handleMove(*MI, true);
}
- if (!MI->isDebugValue()) {
+ if (!MI->isDebugInstr()) {
// Reset read - undef flags and update them later.
for (auto &Op : MI->operands())
if (Op.isReg() && Op.isDef())
@@ -417,7 +418,7 @@ void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule,
#ifndef NDEBUG
const auto RegionMaxRP = getRegionPressure(R);
- const auto &ST = MF.getSubtarget<SISubtarget>();
+ const auto &ST = MF.getSubtarget<GCNSubtarget>();
#endif
assert((SchedMaxRP == RegionMaxRP && (MaxRP.empty() || SchedMaxRP == MaxRP))
|| (dbgs() << "Max RP mismatch!!!\n"
@@ -432,8 +433,8 @@ void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule,
// Sort recorded regions by pressure - highest at the front
void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) {
- const auto &ST = MF.getSubtarget<SISubtarget>();
- std::sort(Regions.begin(), Regions.end(),
+ const auto &ST = MF.getSubtarget<GCNSubtarget>();
+ llvm::sort(Regions.begin(), Regions.end(),
[&ST, TargetOcc](const Region *R1, const Region *R2) {
return R2->MaxPressure.less(ST, R1->MaxPressure, TargetOcc);
});
@@ -450,24 +451,24 @@ void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) {
// BestSchedules aren't deleted on fail.
unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
// TODO: assert Regions are sorted descending by pressure
- const auto &ST = MF.getSubtarget<SISubtarget>();
+ const auto &ST = MF.getSubtarget<GCNSubtarget>();
const auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
- DEBUG(dbgs() << "Trying to to improve occupancy, target = " << TargetOcc
- << ", current = " << Occ << '\n');
+ LLVM_DEBUG(dbgs() << "Trying to improve occupancy, target = " << TargetOcc
+ << ", current = " << Occ << '\n');
auto NewOcc = TargetOcc;
for (auto R : Regions) {
if (R->MaxPressure.getOccupancy(ST) >= NewOcc)
break;
- DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3);
- printLivenessInfo(dbgs(), R->Begin, R->End, LIS));
+ LLVM_DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3);
+ printLivenessInfo(dbgs(), R->Begin, R->End, LIS));
BuildDAG DAG(*R, *this);
const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this);
const auto MaxRP = getSchedulePressure(*R, MinSchedule);
- DEBUG(dbgs() << "Occupancy improvement attempt:\n";
- printSchedRP(dbgs(), R->MaxPressure, MaxRP));
+ LLVM_DEBUG(dbgs() << "Occupancy improvement attempt:\n";
+ printSchedRP(dbgs(), R->MaxPressure, MaxRP));
NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST));
if (NewOcc <= Occ)
@@ -475,15 +476,21 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
setBestSchedule(*R, MinSchedule, MaxRP);
}
- DEBUG(dbgs() << "New occupancy = " << NewOcc
- << ", prev occupancy = " << Occ << '\n');
+ LLVM_DEBUG(dbgs() << "New occupancy = " << NewOcc
+ << ", prev occupancy = " << Occ << '\n');
+ if (NewOcc > Occ) {
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ MFI->increaseOccupancy(MF, NewOcc);
+ }
+
return std::max(NewOcc, Occ);
}
void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
bool TryMaximizeOccupancy) {
- const auto &ST = MF.getSubtarget<SISubtarget>();
- auto TgtOcc = ST.getOccupancyWithLocalMemSize(MF);
+ const auto &ST = MF.getSubtarget<GCNSubtarget>();
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ auto TgtOcc = MFI->getMinAllowedOccupancy();
sortRegionsByPressure(TgtOcc);
auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
@@ -496,9 +503,11 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
const int NumPasses = Occ < TgtOcc ? 2 : 1;
TgtOcc = std::min(Occ, TgtOcc);
- DEBUG(dbgs() << "Scheduling using default scheduler, "
- "target occupancy = " << TgtOcc << '\n');
+ LLVM_DEBUG(dbgs() << "Scheduling using default scheduler, "
+ "target occupancy = "
+ << TgtOcc << '\n');
GCNMaxOccupancySchedStrategy LStrgy(Context);
+ unsigned FinalOccupancy = std::min(Occ, MFI->getOccupancy());
for (int I = 0; I < NumPasses; ++I) {
// running first pass with TargetOccupancy = 0 mimics previous scheduling
@@ -509,30 +518,33 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
Ovr.schedule();
const auto RP = getRegionPressure(*R);
- DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
+ LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
if (RP.getOccupancy(ST) < TgtOcc) {
- DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
+ LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
if (R->BestSchedule.get() &&
R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) {
- DEBUG(dbgs() << ", scheduling minimal register\n");
+ LLVM_DEBUG(dbgs() << ", scheduling minimal register\n");
scheduleBest(*R);
} else {
- DEBUG(dbgs() << ", restoring\n");
+ LLVM_DEBUG(dbgs() << ", restoring\n");
Ovr.restoreOrder();
assert(R->MaxPressure.getOccupancy(ST) >= TgtOcc);
}
}
+ FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(ST));
}
}
+ MFI->limitOccupancy(FinalOccupancy);
}
///////////////////////////////////////////////////////////////////////////////
// Minimal Register Strategy
void GCNIterativeScheduler::scheduleMinReg(bool force) {
- const auto &ST = MF.getSubtarget<SISubtarget>();
- const auto TgtOcc = ST.getOccupancyWithLocalMemSize(MF);
+ const auto &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const auto TgtOcc = MFI->getOccupancy();
sortRegionsByPressure(TgtOcc);
auto MaxPressure = Regions.front()->MaxPressure;
@@ -544,7 +556,7 @@ void GCNIterativeScheduler::scheduleMinReg(bool force) {
const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this);
const auto RP = getSchedulePressure(*R, MinSchedule);
- DEBUG(if (R->MaxPressure.less(ST, RP, TgtOcc)) {
+ LLVM_DEBUG(if (R->MaxPressure.less(ST, RP, TgtOcc)) {
dbgs() << "\nWarning: Pressure becomes worse after minreg!";
printSchedRP(dbgs(), R->MaxPressure, RP);
});
@@ -553,7 +565,7 @@ void GCNIterativeScheduler::scheduleMinReg(bool force) {
break;
scheduleRegion(*R, MinSchedule, RP);
- DEBUG(printSchedResult(dbgs(), R, RP));
+ LLVM_DEBUG(printSchedResult(dbgs(), R, RP));
MaxPressure = RP;
}
@@ -564,9 +576,9 @@ void GCNIterativeScheduler::scheduleMinReg(bool force) {
void GCNIterativeScheduler::scheduleILP(
bool TryMaximizeOccupancy) {
- const auto &ST = MF.getSubtarget<SISubtarget>();
- auto TgtOcc = std::min(ST.getOccupancyWithLocalMemSize(MF),
- ST.getWavesPerEU(MF.getFunction()).second);
+ const auto &ST = MF.getSubtarget<GCNSubtarget>();
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ auto TgtOcc = MFI->getMinAllowedOccupancy();
sortRegionsByPressure(TgtOcc);
auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
@@ -575,26 +587,30 @@ void GCNIterativeScheduler::scheduleILP(
Occ = tryMaximizeOccupancy(TgtOcc);
TgtOcc = std::min(Occ, TgtOcc);
- DEBUG(dbgs() << "Scheduling using default scheduler, "
- "target occupancy = " << TgtOcc << '\n');
+ LLVM_DEBUG(dbgs() << "Scheduling using default scheduler, "
+ "target occupancy = "
+ << TgtOcc << '\n');
+ unsigned FinalOccupancy = std::min(Occ, MFI->getOccupancy());
for (auto R : Regions) {
BuildDAG DAG(*R, *this);
const auto ILPSchedule = makeGCNILPScheduler(DAG.getBottomRoots(), *this);
const auto RP = getSchedulePressure(*R, ILPSchedule);
- DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
+ LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
if (RP.getOccupancy(ST) < TgtOcc) {
- DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
+ LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
if (R->BestSchedule.get() &&
R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) {
- DEBUG(dbgs() << ", scheduling minimal register\n");
+ LLVM_DEBUG(dbgs() << ", scheduling minimal register\n");
scheduleBest(*R);
}
} else {
scheduleRegion(*R, ILPSchedule, RP);
- DEBUG(printSchedResult(dbgs(), R, RP));
+ LLVM_DEBUG(printSchedResult(dbgs(), R, RP));
+ FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(ST));
}
}
+ MFI->limitOccupancy(FinalOccupancy);
}
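
Besides the occupancy bookkeeping moved into SIMachineFunctionInfo, sortRegionsByPressure switches from std::sort to llvm::sort. To my understanding, llvm::sort (llvm/ADT/STLExtras.h) behaves like std::sort but, in LLVM_ENABLE_EXPENSIVE_CHECKS builds, shuffles the range first so comparators that are not strict weak orderings, or that depend on input order, surface as failures. A trivial sketch:

#include "llvm/ADT/STLExtras.h"
#include <functional>
#include <vector>

// Highest value first, analogous to sortRegionsByPressure above.
static void sortDescending(std::vector<unsigned> &Pressures) {
  llvm::sort(Pressures.begin(), Pressures.end(), std::greater<unsigned>());
}
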
diff --git a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
index 9904b5f0f4ba..192d534bb9cf 100644
--- a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
+++ b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
@@ -142,35 +142,38 @@ GCNMinRegScheduler::Candidate* GCNMinRegScheduler::pickCandidate() {
unsigned Num = RQ.size();
if (Num == 1) break;
- DEBUG(dbgs() << "\nSelecting max priority candidates among " << Num << '\n');
+ LLVM_DEBUG(dbgs() << "\nSelecting max priority candidates among " << Num
+ << '\n');
Num = findMax(Num, [=](const Candidate &C) { return C.Priority; });
if (Num == 1) break;
- DEBUG(dbgs() << "\nSelecting min non-ready producing candidate among "
- << Num << '\n');
+ LLVM_DEBUG(dbgs() << "\nSelecting min non-ready producing candidate among "
+ << Num << '\n');
Num = findMax(Num, [=](const Candidate &C) {
auto SU = C.SU;
int Res = getNotReadySuccessors(SU);
- DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would left non-ready "
- << Res << " successors, metric = " << -Res << '\n');
+ LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would left non-ready "
+ << Res << " successors, metric = " << -Res << '\n');
return -Res;
});
if (Num == 1) break;
- DEBUG(dbgs() << "\nSelecting most producing candidate among "
- << Num << '\n');
+ LLVM_DEBUG(dbgs() << "\nSelecting most producing candidate among " << Num
+ << '\n');
Num = findMax(Num, [=](const Candidate &C) {
auto SU = C.SU;
auto Res = getReadySuccessors(SU);
- DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would make ready "
- << Res << " successors, metric = " << Res << '\n');
+ LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would make ready " << Res
+ << " successors, metric = " << Res << '\n');
return Res;
});
if (Num == 1) break;
Num = Num ? Num : RQ.size();
- DEBUG(dbgs() << "\nCan't find best candidate, selecting in program order among "
- << Num << '\n');
+ LLVM_DEBUG(
+ dbgs()
+ << "\nCan't find best candidate, selecting in program order among "
+ << Num << '\n');
Num = findMax(Num, [=](const Candidate &C) { return -(int64_t)C.SU->NodeNum; });
assert(Num == 1);
} while (false);
@@ -202,17 +205,17 @@ void GCNMinRegScheduler::bumpPredsPriority(const SUnit *SchedSU, int Priority) {
Worklist.push_back(P.getSUnit());
}
}
- DEBUG(dbgs() << "Make the predecessors of SU(" << SchedSU->NodeNum
- << ")'s non-ready successors of " << Priority
- << " priority in ready queue: ");
+ LLVM_DEBUG(dbgs() << "Make the predecessors of SU(" << SchedSU->NodeNum
+ << ")'s non-ready successors of " << Priority
+ << " priority in ready queue: ");
const auto SetEnd = Set.end();
for (auto &C : RQ) {
if (Set.find(C.SU) != SetEnd) {
C.Priority = Priority;
- DEBUG(dbgs() << " SU(" << C.SU->NodeNum << ')');
+ LLVM_DEBUG(dbgs() << " SU(" << C.SU->NodeNum << ')');
}
}
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
}
void GCNMinRegScheduler::releaseSuccessors(const SUnit* SU, int Priority) {
@@ -243,19 +246,19 @@ GCNMinRegScheduler::schedule(ArrayRef<const SUnit*> TopRoots,
releaseSuccessors(&DAG.EntrySU, StepNo);
while (!RQ.empty()) {
- DEBUG(
- dbgs() << "\n=== Picking candidate, Step = " << StepNo << "\n"
- "Ready queue:";
- for (auto &C : RQ)
- dbgs() << ' ' << C.SU->NodeNum << "(P" << C.Priority << ')';
- dbgs() << '\n';
- );
+ LLVM_DEBUG(dbgs() << "\n=== Picking candidate, Step = " << StepNo
+ << "\n"
+ "Ready queue:";
+ for (auto &C
+ : RQ) dbgs()
+ << ' ' << C.SU->NodeNum << "(P" << C.Priority << ')';
+ dbgs() << '\n';);
auto C = pickCandidate();
assert(C);
RQ.remove(*C);
auto SU = C->SU;
- DEBUG(dbgs() << "Selected "; SU->dump(&DAG));
+ LLVM_DEBUG(dbgs() << "Selected "; SU->dump(&DAG));
releaseSuccessors(SU, StepNo);
Schedule.push_back(SU);
diff --git a/lib/Target/AMDGPU/GCNProcessors.td b/lib/Target/AMDGPU/GCNProcessors.td
index b2a3f652abd8..d76acfa24f90 100644
--- a/lib/Target/AMDGPU/GCNProcessors.td
+++ b/lib/Target/AMDGPU/GCNProcessors.td
@@ -93,14 +93,6 @@ def : ProcessorModel<"bonaire", SIQuarterSpeedModel,
// GCN GFX8 (Volcanic Islands (VI)).
//===----------------------------------------------------------------------===//
-def : ProcessorModel<"gfx800", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_0]
->;
-
-def : ProcessorModel<"iceland", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_0]
->;
-
def : ProcessorModel<"gfx801", SIQuarterSpeedModel,
[FeatureISAVersion8_0_1]
>;
@@ -113,6 +105,10 @@ def : ProcessorModel<"gfx802", SIQuarterSpeedModel,
[FeatureISAVersion8_0_2]
>;
+def : ProcessorModel<"iceland", SIQuarterSpeedModel,
+ [FeatureISAVersion8_0_2]
+>;
+
def : ProcessorModel<"tonga", SIQuarterSpeedModel,
[FeatureISAVersion8_0_2]
>;
@@ -152,3 +148,11 @@ def : ProcessorModel<"gfx900", SIQuarterSpeedModel,
def : ProcessorModel<"gfx902", SIQuarterSpeedModel,
[FeatureISAVersion9_0_2]
>;
+
+def : ProcessorModel<"gfx904", SIQuarterSpeedModel,
+ [FeatureISAVersion9_0_4]
+>;
+
+def : ProcessorModel<"gfx906", SIQuarterSpeedModel,
+ [FeatureISAVersion9_0_6]
+>;
diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp
index 992bb7cceb6f..3d8cacc4f02c 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -19,6 +19,7 @@
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
@@ -131,7 +132,7 @@ void GCNRegPressure::inc(unsigned Reg,
}
}
-bool GCNRegPressure::less(const SISubtarget &ST,
+bool GCNRegPressure::less(const GCNSubtarget &ST,
const GCNRegPressure& O,
unsigned MaxOccupancy) const {
const auto SGPROcc = std::min(MaxOccupancy,
@@ -177,7 +178,7 @@ bool GCNRegPressure::less(const SISubtarget &ST,
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD
-void GCNRegPressure::print(raw_ostream &OS, const SISubtarget *ST) const {
+void GCNRegPressure::print(raw_ostream &OS, const GCNSubtarget *ST) const {
OS << "VGPRs: " << getVGPRNum();
if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGPRNum()) << ')';
OS << ", SGPRs: " << getSGPRNum();
@@ -283,24 +284,33 @@ GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI,
return LiveRegs;
}
-void GCNUpwardRPTracker::reset(const MachineInstr &MI,
- const LiveRegSet *LiveRegsCopy) {
- MRI = &MI.getParent()->getParent()->getRegInfo();
+void GCNRPTracker::reset(const MachineInstr &MI,
+ const LiveRegSet *LiveRegsCopy,
+ bool After) {
+ const MachineFunction &MF = *MI.getMF();
+ MRI = &MF.getRegInfo();
if (LiveRegsCopy) {
if (&LiveRegs != LiveRegsCopy)
LiveRegs = *LiveRegsCopy;
} else {
- LiveRegs = getLiveRegsAfter(MI, LIS);
+ LiveRegs = After ? getLiveRegsAfter(MI, LIS)
+ : getLiveRegsBefore(MI, LIS);
}
+
MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs);
}
+void GCNUpwardRPTracker::reset(const MachineInstr &MI,
+ const LiveRegSet *LiveRegsCopy) {
+ GCNRPTracker::reset(MI, LiveRegsCopy, true);
+}
+
void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
assert(MRI && "call reset first");
LastTrackedMI = &MI;
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
return;
auto const RegUses = collectVirtualRegUses(MI, LIS, *MRI);
@@ -348,13 +358,7 @@ bool GCNDownwardRPTracker::reset(const MachineInstr &MI,
NextMI = skipDebugInstructionsForward(NextMI, MBBEnd);
if (NextMI == MBBEnd)
return false;
- if (LiveRegsCopy) {
- if (&LiveRegs != LiveRegsCopy)
- LiveRegs = *LiveRegsCopy;
- } else {
- LiveRegs = getLiveRegsBefore(*NextMI, LIS);
- }
- MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs);
+ GCNRPTracker::reset(*NextMI, LiveRegsCopy, false);
return true;
}
diff --git a/lib/Target/AMDGPU/GCNRegPressure.h b/lib/Target/AMDGPU/GCNRegPressure.h
index e418aa0fe911..357d3b7b2334 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/lib/Target/AMDGPU/GCNRegPressure.h
@@ -49,7 +49,7 @@ struct GCNRegPressure {
unsigned getVGPRTuplesWeight() const { return Value[VGPR_TUPLE]; }
unsigned getSGPRTuplesWeight() const { return Value[SGPR_TUPLE]; }
- unsigned getOccupancy(const SISubtarget &ST) const {
+ unsigned getOccupancy(const GCNSubtarget &ST) const {
return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()),
ST.getOccupancyWithNumVGPRs(getVGPRNum()));
}
@@ -59,11 +59,11 @@ struct GCNRegPressure {
LaneBitmask NewMask,
const MachineRegisterInfo &MRI);
- bool higherOccupancy(const SISubtarget &ST, const GCNRegPressure& O) const {
+ bool higherOccupancy(const GCNSubtarget &ST, const GCNRegPressure& O) const {
return getOccupancy(ST) > O.getOccupancy(ST);
}
- bool less(const SISubtarget &ST, const GCNRegPressure& O,
+ bool less(const GCNSubtarget &ST, const GCNRegPressure& O,
unsigned MaxOccupancy = std::numeric_limits<unsigned>::max()) const;
bool operator==(const GCNRegPressure &O) const {
@@ -74,7 +74,7 @@ struct GCNRegPressure {
return !(*this == O);
}
- void print(raw_ostream &OS, const SISubtarget *ST = nullptr) const;
+ void print(raw_ostream &OS, const GCNSubtarget *ST = nullptr) const;
void dump() const { print(dbgs()); }
private:
@@ -106,6 +106,9 @@ protected:
GCNRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {}
+ void reset(const MachineInstr &MI, const LiveRegSet *LiveRegsCopy,
+ bool After);
+
public:
// live regs for the current state
const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; }
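
The occupancy query in GCNRegPressure simply takes the worse of the SGPR- and VGPR-derived limits. As a hedged numeric illustration (GFX9-style budgets assumed, roughly 256 VGPRs and a 10-wave cap per SIMD): a region using 40 VGPRs allows about floor(256/40) = 6 waves, while 24 SGPRs would still allow 10, so the reported occupancy is min(10, 6) = 6.

#include <algorithm>

// Sketch of the rule in getOccupancy(): both register files must fit.
static unsigned occupancy(unsigned WavesBySGPRs, unsigned WavesByVGPRs) {
  return std::min(WavesBySGPRs, WavesByVGPRs);   // e.g. min(10, 6) == 6
}
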
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index d414b899050a..f09b7f6cff22 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -28,18 +28,6 @@ GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
const MachineSchedContext *C) :
GenericScheduler(C), TargetOccupancy(0), MF(nullptr) { }
-static unsigned getMaxWaves(unsigned SGPRs, unsigned VGPRs,
- const MachineFunction &MF) {
-
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- unsigned MinRegOccupancy = std::min(ST.getOccupancyWithNumSGPRs(SGPRs),
- ST.getOccupancyWithNumVGPRs(VGPRs));
- return std::min(MinRegOccupancy,
- ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
- MF.getFunction()));
-}
-
void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
GenericScheduler::initialize(DAG);
@@ -47,7 +35,7 @@ void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
MF = &DAG->MF;
- const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
// FIXME: This is also necessary, because some passes that run after
// scheduling and before regalloc increase register pressure.
@@ -81,7 +69,7 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
Cand.AtTop = AtTop;
// getDownwardPressure() and getUpwardPressure() make temporary changes to
- // the the tracker, so we need to pass those function a non-const copy.
+ // the tracker, so we need to pass those function a non-const copy.
RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
std::vector<unsigned> Pressure;
@@ -200,34 +188,30 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
// See if BotCand is still valid (because we previously scheduled from Top).
- DEBUG(dbgs() << "Picking from Bot:\n");
+ LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
if (!BotCand.isValid() || BotCand.SU->isScheduled ||
BotCand.Policy != BotPolicy) {
BotCand.reset(CandPolicy());
pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand);
assert(BotCand.Reason != NoCand && "failed to find the first candidate");
} else {
- DEBUG(traceCandidate(BotCand));
+ LLVM_DEBUG(traceCandidate(BotCand));
}
// Check if the top Q has a better candidate.
- DEBUG(dbgs() << "Picking from Top:\n");
+ LLVM_DEBUG(dbgs() << "Picking from Top:\n");
if (!TopCand.isValid() || TopCand.SU->isScheduled ||
TopCand.Policy != TopPolicy) {
TopCand.reset(CandPolicy());
pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand);
assert(TopCand.Reason != NoCand && "failed to find the first candidate");
} else {
- DEBUG(traceCandidate(TopCand));
+ LLVM_DEBUG(traceCandidate(TopCand));
}
// Pick best from BotCand and TopCand.
- DEBUG(
- dbgs() << "Top Cand: ";
- traceCandidate(TopCand);
- dbgs() << "Bot Cand: ";
- traceCandidate(BotCand);
- );
+ LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
+ dbgs() << "Bot Cand: "; traceCandidate(BotCand););
SchedCandidate Cand;
if (TopCand.Reason == BotCand.Reason) {
Cand = BotCand;
@@ -256,10 +240,7 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
}
}
}
- DEBUG(
- dbgs() << "Picking: ";
- traceCandidate(Cand);
- );
+ LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
IsTopNode = Cand.AtTop;
return Cand.SU;
@@ -305,20 +286,20 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) {
if (SU->isBottomReady())
Bot.removeReady(SU);
- DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr());
+ LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
+ << *SU->getInstr());
return SU;
}
GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C,
std::unique_ptr<MachineSchedStrategy> S) :
ScheduleDAGMILive(C, std::move(S)),
- ST(MF.getSubtarget<SISubtarget>()),
+ ST(MF.getSubtarget<GCNSubtarget>()),
MFI(*MF.getInfo<SIMachineFunctionInfo>()),
- StartingOccupancy(ST.getOccupancyWithLocalMemSize(MFI.getLDSSize(),
- MF.getFunction())),
+ StartingOccupancy(MFI.getOccupancy()),
MinOccupancy(StartingOccupancy), Stage(0), RegionIdx(0) {
- DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
+ LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
}
void GCNScheduleDAGMILive::schedule() {
@@ -338,12 +319,12 @@ void GCNScheduleDAGMILive::schedule() {
if (LIS) {
PressureBefore = Pressure[RegionIdx];
- DEBUG(dbgs() << "Pressure before scheduling:\nRegion live-ins:";
- GCNRPTracker::printLiveRegs(dbgs(), LiveIns[RegionIdx], MRI);
- dbgs() << "Region live-in pressure: ";
- llvm::getRegPressure(MRI, LiveIns[RegionIdx]).print(dbgs());
- dbgs() << "Region register pressure: ";
- PressureBefore.print(dbgs()));
+ LLVM_DEBUG(dbgs() << "Pressure before scheduling:\nRegion live-ins:";
+ GCNRPTracker::printLiveRegs(dbgs(), LiveIns[RegionIdx], MRI);
+ dbgs() << "Region live-in pressure: ";
+ llvm::getRegPressure(MRI, LiveIns[RegionIdx]).print(dbgs());
+ dbgs() << "Region register pressure: ";
+ PressureBefore.print(dbgs()));
}
ScheduleDAGMILive::schedule();
@@ -356,45 +337,54 @@ void GCNScheduleDAGMILive::schedule() {
GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
auto PressureAfter = getRealRegPressure();
- DEBUG(dbgs() << "Pressure after scheduling: "; PressureAfter.print(dbgs()));
+ LLVM_DEBUG(dbgs() << "Pressure after scheduling: ";
+ PressureAfter.print(dbgs()));
if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
PressureAfter.getVGPRNum() <= S.VGPRCriticalLimit) {
Pressure[RegionIdx] = PressureAfter;
- DEBUG(dbgs() << "Pressure in desired limits, done.\n");
+ LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
return;
}
- unsigned WavesAfter = getMaxWaves(PressureAfter.getSGPRNum(),
- PressureAfter.getVGPRNum(), MF);
- unsigned WavesBefore = getMaxWaves(PressureBefore.getSGPRNum(),
- PressureBefore.getVGPRNum(), MF);
- DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore <<
- ", after " << WavesAfter << ".\n");
+ unsigned Occ = MFI.getOccupancy();
+ unsigned WavesAfter = std::min(Occ, PressureAfter.getOccupancy(ST));
+ unsigned WavesBefore = std::min(Occ, PressureBefore.getOccupancy(ST));
+ LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
+ << ", after " << WavesAfter << ".\n");
// We could not keep current target occupancy because of the just scheduled
// region. Record new occupancy for next scheduling cycle.
unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
+ // Allow memory bound functions to drop to 4 waves if not limited by an
+ // attribute.
+ if (WavesAfter < WavesBefore && WavesAfter < MinOccupancy &&
+ WavesAfter >= MFI.getMinAllowedOccupancy()) {
+ LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
+ << MFI.getMinAllowedOccupancy() << " waves\n");
+ NewOccupancy = WavesAfter;
+ }
if (NewOccupancy < MinOccupancy) {
MinOccupancy = NewOccupancy;
- DEBUG(dbgs() << "Occupancy lowered for the function to "
- << MinOccupancy << ".\n");
+ MFI.limitOccupancy(MinOccupancy);
+ LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
+ << MinOccupancy << ".\n");
}
- if (WavesAfter >= WavesBefore) {
+ if (WavesAfter >= MinOccupancy) {
Pressure[RegionIdx] = PressureAfter;
return;
}
- DEBUG(dbgs() << "Attempting to revert scheduling.\n");
+ LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
RegionEnd = RegionBegin;
for (MachineInstr *MI : Unsched) {
- if (MI->isDebugValue())
+ if (MI->isDebugInstr())
continue;
if (MI->getIterator() != RegionEnd) {
BB->remove(MI);
BB->insert(RegionEnd, MI);
- if (!MI->isDebugValue())
+ if (!MI->isDebugInstr())
LIS->handleMove(*MI, true);
}
// Reset read-undef flags and update them later.
@@ -403,7 +393,7 @@ void GCNScheduleDAGMILive::schedule() {
Op.setIsUndef(false);
RegisterOperands RegOpers;
RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
- if (!MI->isDebugValue()) {
+ if (!MI->isDebugInstr()) {
if (ShouldTrackLaneMasks) {
// Adjust liveness and add missing dead+read-undef flags.
SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
@@ -415,7 +405,7 @@ void GCNScheduleDAGMILive::schedule() {
}
RegionEnd = MI->getIterator();
++RegionEnd;
- DEBUG(dbgs() << "Scheduling " << *MI);
+ LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
}
RegionBegin = Unsched.front()->getIterator();
Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
@@ -490,7 +480,7 @@ void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) {
void GCNScheduleDAGMILive::finalizeSchedule() {
GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
- DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
+ LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
LiveIns.resize(Regions.size());
Pressure.resize(Regions.size());
@@ -509,9 +499,10 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
if (!LIS || StartingOccupancy <= MinOccupancy)
break;
- DEBUG(dbgs()
- << "Retrying function scheduling with lowest recorded occupancy "
- << MinOccupancy << ".\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "Retrying function scheduling with lowest recorded occupancy "
+ << MinOccupancy << ".\n");
S.setTargetOccupancy(MinOccupancy);
}
@@ -537,12 +528,13 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
continue;
}
- DEBUG(dbgs() << "********** MI Scheduling **********\n");
- DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*MBB) << " "
- << MBB->getName() << "\n From: " << *begin() << " To: ";
- if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
- else dbgs() << "End";
- dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
+ LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
+ LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*MBB) << " "
+ << MBB->getName() << "\n From: " << *begin()
+ << " To: ";
+ if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
+ else dbgs() << "End";
+ dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
schedule();
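
Pulled out of the diff context, the accept/revert logic added to GCNScheduleDAGMILive::schedule() reads roughly as below. This is a sketch with illustrative names; the real code also records the result through SIMachineFunctionInfo::limitOccupancy and reverts the instruction order when the schedule is rejected.

#include <algorithm>
#include <utility>

// Returns the minimum occupancy to record for this region and whether the
// freshly computed schedule should be kept.
static std::pair<unsigned, bool>
judgeRegion(unsigned WavesBefore, unsigned WavesAfter,
            unsigned MinOccupancy, unsigned MinAllowed) {
  unsigned NewOcc = std::max(WavesAfter, WavesBefore);
  // Memory-bound functions may drop below the current minimum, but never
  // below their min-allowed occupancy.
  if (WavesAfter < WavesBefore && WavesAfter < MinOccupancy &&
      WavesAfter >= MinAllowed)
    NewOcc = WavesAfter;
  unsigned Min = std::min(MinOccupancy, NewOcc);
  return {Min, WavesAfter >= Min};   // false -> revert to the old order
}
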
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.h b/lib/Target/AMDGPU/GCNSchedStrategy.h
index 060d2ca72d93..3ac6af89cb9b 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -21,7 +21,7 @@ namespace llvm {
class SIMachineFunctionInfo;
class SIRegisterInfo;
-class SISubtarget;
+class GCNSubtarget;
/// This is a minimal scheduler strategy. The main difference between this
/// and the GenericScheduler is that GCNSchedStrategy uses different
@@ -62,9 +62,9 @@ public:
class GCNScheduleDAGMILive : public ScheduleDAGMILive {
- const SISubtarget &ST;
+ const GCNSubtarget &ST;
- const SIMachineFunctionInfo &MFI;
+ SIMachineFunctionInfo &MFI;
// Occupancy target at the beginning of function scheduling cycle.
unsigned StartingOccupancy;
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
index bf57f88bef91..db908368a179 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
@@ -217,6 +217,11 @@ void AMDGPUInstPrinter::printLWE(const MCInst *MI, unsigned OpNo,
printNamedBit(MI, OpNo, O, "lwe");
}
+void AMDGPUInstPrinter::printD16(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ printNamedBit(MI, OpNo, O, "d16");
+}
+
void AMDGPUInstPrinter::printExpCompr(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -267,6 +272,9 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
case AMDGPU::FLAT_SCR:
O << "flat_scratch";
return;
+ case AMDGPU::XNACK_MASK:
+ O << "xnack_mask";
+ return;
case AMDGPU::VCC_LO:
O << "vcc_lo";
return;
@@ -297,6 +305,12 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
case AMDGPU::FLAT_SCR_HI:
O << "flat_scratch_hi";
return;
+ case AMDGPU::XNACK_MASK_LO:
+ O << "xnack_mask_lo";
+ return;
+ case AMDGPU::XNACK_MASK_HI:
+ O << "xnack_mask_hi";
+ return;
case AMDGPU::FP_REG:
case AMDGPU::SP_REG:
case AMDGPU::SCRATCH_WAVE_OFFSET_REG:
@@ -371,6 +385,16 @@ void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo,
printOperand(MI, OpNo, STI, O);
}
+void AMDGPUInstPrinter::printVINTRPDst(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ if (AMDGPU::isSI(STI) || AMDGPU::isCI(STI))
+ O << " ";
+ else
+ O << "_e32 ";
+
+ printOperand(MI, OpNo, STI, O);
+}
+
void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -486,11 +510,6 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- if (!STI.getFeatureBits()[AMDGPU::FeatureGCN]) {
- static_cast<R600InstPrinter*>(this)->printOperand(MI, OpNo, O);
- return;
- }
-
if (OpNo >= MI->getNumOperands()) {
O << "/*Missing OP" << OpNo << "*/";
return;
@@ -612,40 +631,45 @@ void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI,
void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
+ using namespace AMDGPU::DPP;
+
unsigned Imm = MI->getOperand(OpNo).getImm();
- if (Imm <= 0x0ff) {
+ if (Imm <= DppCtrl::QUAD_PERM_LAST) {
O << " quad_perm:[";
O << formatDec(Imm & 0x3) << ',';
O << formatDec((Imm & 0xc) >> 2) << ',';
O << formatDec((Imm & 0x30) >> 4) << ',';
O << formatDec((Imm & 0xc0) >> 6) << ']';
- } else if ((Imm >= 0x101) && (Imm <= 0x10f)) {
+ } else if ((Imm >= DppCtrl::ROW_SHL_FIRST) &&
+ (Imm <= DppCtrl::ROW_SHL_LAST)) {
O << " row_shl:";
printU4ImmDecOperand(MI, OpNo, O);
- } else if ((Imm >= 0x111) && (Imm <= 0x11f)) {
+ } else if ((Imm >= DppCtrl::ROW_SHR_FIRST) &&
+ (Imm <= DppCtrl::ROW_SHR_LAST)) {
O << " row_shr:";
printU4ImmDecOperand(MI, OpNo, O);
- } else if ((Imm >= 0x121) && (Imm <= 0x12f)) {
+ } else if ((Imm >= DppCtrl::ROW_ROR_FIRST) &&
+ (Imm <= DppCtrl::ROW_ROR_LAST)) {
O << " row_ror:";
printU4ImmDecOperand(MI, OpNo, O);
- } else if (Imm == 0x130) {
+ } else if (Imm == DppCtrl::WAVE_SHL1) {
O << " wave_shl:1";
- } else if (Imm == 0x134) {
+ } else if (Imm == DppCtrl::WAVE_ROL1) {
O << " wave_rol:1";
- } else if (Imm == 0x138) {
+ } else if (Imm == DppCtrl::WAVE_SHR1) {
O << " wave_shr:1";
- } else if (Imm == 0x13c) {
+ } else if (Imm == DppCtrl::WAVE_ROR1) {
O << " wave_ror:1";
- } else if (Imm == 0x140) {
+ } else if (Imm == DppCtrl::ROW_MIRROR) {
O << " row_mirror";
- } else if (Imm == 0x141) {
+ } else if (Imm == DppCtrl::ROW_HALF_MIRROR) {
O << " row_half_mirror";
- } else if (Imm == 0x142) {
+ } else if (Imm == DppCtrl::BCAST15) {
O << " row_bcast:15";
- } else if (Imm == 0x143) {
+ } else if (Imm == DppCtrl::BCAST31) {
O << " row_bcast:31";
} else {
- llvm_unreachable("Invalid dpp_ctrl value");
+ O << " /* Invalid dpp_ctrl value */";
}
}
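
The magic dpp_ctrl numbers above become named DppCtrl constants. For the quad_perm branch, each lane selector occupies two bits, least-significant lane first; a worked example (the helper is illustrative):

// quad_perm lane extraction, matching the printing above.
static unsigned dppLane(unsigned Imm, unsigned I) { return (Imm >> (2 * I)) & 0x3; }
// Imm = 0xE4 = 0b11'10'01'00  ->  quad_perm:[0,1,2,3]  (identity)
// Imm = 0x1B = 0b00'01'10'11  ->  quad_perm:[3,2,1,0]  (reversed)
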
@@ -936,11 +960,6 @@ void AMDGPUInstPrinter::printVGPRIndexMode(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- if (!STI.getFeatureBits()[AMDGPU::FeatureGCN]) {
- static_cast<R600InstPrinter*>(this)->printMemOperand(MI, OpNo, O);
- return;
- }
-
printOperand(MI, OpNo, STI, O);
O << ", ";
printOperand(MI, OpNo + 1, STI, O);
@@ -966,16 +985,6 @@ void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo,
O << Asm;
}
-void AMDGPUInstPrinter::printAbs(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printAbs(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printClamp(MI, OpNo, O);
-}
-
void AMDGPUInstPrinter::printHigh(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -1002,70 +1011,6 @@ void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo,
O << " div:2";
}
-void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printLiteral(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printLast(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printNeg(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printOMOD(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printRel(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printUpdateExecMask(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printUpdatePred(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printWrite(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printWrite(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printBankSwizzle(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printRSel(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printRSel(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printCT(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printCT(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printKCache(MI, OpNo, O);
-}
-
void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -1254,7 +1199,10 @@ void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo,
const unsigned Width = ((SImm16 & WIDTH_M1_MASK_) >> WIDTH_M1_SHIFT_) + 1;
O << "hwreg(";
- if (ID_SYMBOLIC_FIRST_ <= Id && Id < ID_SYMBOLIC_LAST_) {
+ unsigned Last = ID_SYMBOLIC_LAST_;
+ if (AMDGPU::isSI(STI) || AMDGPU::isCI(STI) || AMDGPU::isVI(STI))
+ Last = ID_SYMBOLIC_FIRST_GFX9_;
+ if (ID_SYMBOLIC_FIRST_ <= Id && Id < Last && IdSymbolic[Id]) {
O << IdSymbolic[Id];
} else {
O << Id;
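For context, the simm16 operand printed by printHwreg packs three fields: the hardware register ID, a bit offset, and a width minus one (hence the +1 above). A hedged sketch of the unpacking, assuming the ID/OFFSET/WIDTH_M1 shift and mask values from SIDefines.h; the exact bit positions are an assumption here, not spelled out in this hunk:

// Assumed layout: ID in bits [5:0], OFFSET in bits [10:6], WIDTH_M1 in bits [15:11].
struct HwregFields { unsigned Id, Offset, Width; };

static HwregFields decodeHwreg(unsigned SImm16) {
  HwregFields F;
  F.Id     =  SImm16        & 0x3f;        // ID field
  F.Offset = (SImm16 >> 6)  & 0x1f;        // OFFSET field
  F.Width  = ((SImm16 >> 11) & 0x1f) + 1;  // WIDTH_M1 field, plus one
  return F;
}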
@@ -1267,6 +1215,13 @@ void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo,
#include "AMDGPUGenAsmWriter.inc"
+void R600InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+ StringRef Annot, const MCSubtargetInfo &STI) {
+ O.flush();
+ printInstruction(MI, O);
+ printAnnotation(O, Annot);
+}
+
void R600InstPrinter::printAbs(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
AMDGPUInstPrinter::printIfSet(MI, OpNo, O, '|');
@@ -1385,7 +1340,7 @@ void R600InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
if (Op.isReg()) {
switch (Op.getReg()) {
// This is the default predicate state, so we don't need to print it.
- case AMDGPU::PRED_SEL_OFF:
+ case R600::PRED_SEL_OFF:
break;
default:
@@ -1461,3 +1416,5 @@ void R600InstPrinter::printWrite(const MCInst *MI, unsigned OpNo,
O << " (MASKED)";
}
}
+
+#include "R600GenAsmWriter.inc"
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
index d97f04689e18..11a496a38b2c 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
@@ -84,6 +84,8 @@ private:
raw_ostream &O);
void printLWE(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printD16(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printExpCompr(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printExpVM(const MCInst *MI, unsigned OpNo,
@@ -96,6 +98,8 @@ private:
void printRegOperand(unsigned RegNo, raw_ostream &O);
void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printVINTRPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI,
raw_ostream &O);
void printImmediateV216(uint32_t Imm, const MCSubtargetInfo &STI,
@@ -214,13 +218,16 @@ protected:
raw_ostream &O);
};
-// FIXME: R600 specific parts of AMDGPUInstrPrinter should be moved here, and
-// MCTargetDesc should be using R600InstPrinter for the R600 target.
-class R600InstPrinter : public AMDGPUInstPrinter {
+class R600InstPrinter : public MCInstPrinter {
public:
R600InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
const MCRegisterInfo &MRI)
- : AMDGPUInstPrinter(MAI, MII, MRI) {}
+ : MCInstPrinter(MAI, MII, MRI) {}
+
+ void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+ const MCSubtargetInfo &STI) override;
+ void printInstruction(const MCInst *MI, raw_ostream &O);
+ static const char *getRegisterName(unsigned RegNo);
void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printBankSwizzle(const MCInst *MI, unsigned OpNo, raw_ostream &O);
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index 778d4a7ba9d0..abc88c02adca 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -26,14 +26,14 @@ namespace {
class AMDGPUAsmBackend : public MCAsmBackend {
public:
- AMDGPUAsmBackend(const Target &T)
- : MCAsmBackend() {}
+ AMDGPUAsmBackend(const Target &T) : MCAsmBackend(support::little) {}
unsigned getNumFixupKinds() const override { return AMDGPU::NumTargetFixupKinds; };
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsResolved) const override;
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const override;
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const override {
@@ -43,10 +43,13 @@ public:
MCInst &Res) const override {
llvm_unreachable("Not implemented");
}
- bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
+ bool mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const override {
+ return false;
+ }
unsigned getMinimumNopSize() const override;
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
};
@@ -103,7 +106,8 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
void AMDGPUAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target,
MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) const {
+ bool IsResolved,
+ const MCSubtargetInfo *STI) const {
Value = adjustFixupValue(Fixup, Value, &Asm.getContext());
if (!Value)
return; // Doesn't change encoding.
@@ -140,11 +144,11 @@ unsigned AMDGPUAsmBackend::getMinimumNopSize() const {
return 4;
}
-bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+bool AMDGPUAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
// If the count is not 4-byte aligned, we must be writing data into the text
// section (otherwise we have unaligned instructions, and thus have far
// bigger problems), so just write zeros instead.
- OW->WriteZeros(Count % 4);
+ OS.write_zeros(Count % 4);
// We are properly aligned, so write NOPs as requested.
Count /= 4;
@@ -154,7 +158,7 @@ bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
const uint32_t Encoded_S_NOP_0 = 0xbf800000;
for (uint64_t I = 0; I != Count; ++I)
- OW->write32(Encoded_S_NOP_0);
+ support::endian::write<uint32_t>(OS, Encoded_S_NOP_0, Endian);
return true;
}
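With MCObjectWriter dropped from the interface, writeNopData now targets a plain raw_ostream, so the 32-bit s_nop word is emitted through EndianStream with an explicit endianness. A self-contained sketch of the same padding scheme (the free-function name is illustrative, not part of the patch):

#include "llvm/Support/EndianStream.h"
#include "llvm/Support/raw_ostream.h"

// Pad to a 4-byte boundary with zeros, then fill the rest with s_nop words
// written in the target's little-endian byte order.
static void writeNopPadding(llvm::raw_ostream &OS, uint64_t Count) {
  const uint32_t Encoded_S_NOP_0 = 0xbf800000;
  OS.write_zeros(Count % 4);                       // unaligned tail: raw zeros
  for (uint64_t I = 0, E = Count / 4; I != E; ++I)
    llvm::support::endian::write<uint32_t>(OS, Encoded_S_NOP_0,
                                           llvm::support::little);
}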
@@ -189,18 +193,18 @@ public:
}
}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createAMDGPUELFObjectWriter(Is64Bit, OSABI, HasRelocationAddend, OS);
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createAMDGPUELFObjectWriter(Is64Bit, OSABI, HasRelocationAddend);
}
};
} // end anonymous namespace
MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
const MCTargetOptions &Options) {
// Use 64-bit ELF for amdgcn
- return new ELFAMDGPUAsmBackend(T, TT);
+ return new ELFAMDGPUAsmBackend(T, STI.getTargetTriple());
}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index e443b0729606..07bef9103c0d 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -66,6 +66,8 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
return ELF::R_AMDGPU_REL32_LO;
case MCSymbolRefExpr::VK_AMDGPU_REL32_HI:
return ELF::R_AMDGPU_REL32_HI;
+ case MCSymbolRefExpr::VK_AMDGPU_REL64:
+ return ELF::R_AMDGPU_REL64;
}
switch (Fixup.getKind()) {
@@ -82,11 +84,9 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
llvm_unreachable("unhandled relocation type");
}
-std::unique_ptr<MCObjectWriter>
+std::unique_ptr<MCObjectTargetWriter>
llvm::createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI,
- bool HasRelocationAddend,
- raw_pwrite_stream &OS) {
- auto MOTW = llvm::make_unique<AMDGPUELFObjectWriter>(Is64Bit, OSABI,
- HasRelocationAddend);
- return createELFObjectWriter(std::move(MOTW), OS, true);
+ bool HasRelocationAddend) {
+ return llvm::make_unique<AMDGPUELFObjectWriter>(Is64Bit, OSABI,
+ HasRelocationAddend);
}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
index 1497edc7a054..c627a08e7463 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
@@ -12,37 +12,28 @@
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCObjectWriter.h"
using namespace llvm;
-AMDGPUELFStreamer::AMDGPUELFStreamer(const Triple &T, MCContext &Context,
- std::unique_ptr<MCAsmBackend> MAB,
- raw_pwrite_stream &OS,
- std::unique_ptr<MCCodeEmitter> Emitter)
- : MCELFStreamer(Context, std::move(MAB), OS, std::move(Emitter)) {
- unsigned Arch = ELF::EF_AMDGPU_ARCH_NONE;
- switch (T.getArch()) {
- case Triple::r600:
- Arch = ELF::EF_AMDGPU_ARCH_R600;
- break;
- case Triple::amdgcn:
- Arch = ELF::EF_AMDGPU_ARCH_GCN;
- break;
- default:
- break;
- }
+namespace {
+
+class AMDGPUELFStreamer : public MCELFStreamer {
+public:
+ AMDGPUELFStreamer(const Triple &T, MCContext &Context,
+ std::unique_ptr<MCAsmBackend> MAB,
+ std::unique_ptr<MCObjectWriter> OW,
+ std::unique_ptr<MCCodeEmitter> Emitter)
+ : MCELFStreamer(Context, std::move(MAB), std::move(OW),
+ std::move(Emitter)) {}
+};
- MCAssembler &MCA = getAssembler();
- unsigned EFlags = MCA.getELFHeaderEFlags();
- EFlags &= ~ELF::EF_AMDGPU_ARCH;
- EFlags |= Arch;
- MCA.setELFHeaderEFlags(EFlags);
}
MCELFStreamer *llvm::createAMDGPUELFStreamer(
const Triple &T, MCContext &Context, std::unique_ptr<MCAsmBackend> MAB,
- raw_pwrite_stream &OS, std::unique_ptr<MCCodeEmitter> Emitter,
+ std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter,
bool RelaxAll) {
- return new AMDGPUELFStreamer(T, Context, std::move(MAB), OS,
+ return new AMDGPUELFStreamer(T, Context, std::move(MAB), std::move(OW),
std::move(Emitter));
}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
index 0cc0a4c5cd5d..41e9063a759e 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
@@ -23,16 +23,9 @@ class MCCodeEmitter;
class MCContext;
class MCSubtargetInfo;
-class AMDGPUELFStreamer : public MCELFStreamer {
-public:
- AMDGPUELFStreamer(const Triple &T, MCContext &Context,
- std::unique_ptr<MCAsmBackend> MAB, raw_pwrite_stream &OS,
- std::unique_ptr<MCCodeEmitter> Emitter);
-};
-
MCELFStreamer *createAMDGPUELFStreamer(const Triple &T, MCContext &Context,
std::unique_ptr<MCAsmBackend> MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter,
bool RelaxAll);
} // namespace llvm.
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index 521b3b39bba2..cae7a7a6c7e7 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief CodeEmitter interface for R600 and SI codegen.
+/// CodeEmitter interface for R600 and SI codegen.
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
index 1b062064ace1..dcc10a032afe 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief CodeEmitter interface for R600 and SI codegen.
+/// CodeEmitter interface for R600 and SI codegen.
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 2b321c04fb30..c579c7d60e16 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief This file provides AMDGPU specific target descriptions.
+/// This file provides AMDGPU specific target descriptions.
//
//===----------------------------------------------------------------------===//
@@ -22,6 +22,7 @@
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -37,9 +38,17 @@ using namespace llvm;
#define GET_SUBTARGETINFO_MC_DESC
#include "AMDGPUGenSubtargetInfo.inc"
+#define NoSchedModel NoSchedModelR600
+#define GET_SUBTARGETINFO_MC_DESC
+#include "R600GenSubtargetInfo.inc"
+#undef NoSchedModelR600
+
#define GET_REGINFO_MC_DESC
#include "AMDGPUGenRegisterInfo.inc"
+#define GET_REGINFO_MC_DESC
+#include "R600GenRegisterInfo.inc"
+
static MCInstrInfo *createAMDGPUMCInstrInfo() {
MCInstrInfo *X = new MCInstrInfo();
InitAMDGPUMCInstrInfo(X);
@@ -48,12 +57,17 @@ static MCInstrInfo *createAMDGPUMCInstrInfo() {
static MCRegisterInfo *createAMDGPUMCRegisterInfo(const Triple &TT) {
MCRegisterInfo *X = new MCRegisterInfo();
- InitAMDGPUMCRegisterInfo(X, 0);
+ if (TT.getArch() == Triple::r600)
+ InitR600MCRegisterInfo(X, 0);
+ else
+ InitAMDGPUMCRegisterInfo(X, 0);
return X;
}
static MCSubtargetInfo *
createAMDGPUMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
+ if (TT.getArch() == Triple::r600)
+ return createR600MCSubtargetInfoImpl(TT, CPU, FS);
return createAMDGPUMCSubtargetInfoImpl(TT, CPU, FS);
}
@@ -62,8 +76,10 @@ static MCInstPrinter *createAMDGPUMCInstPrinter(const Triple &T,
const MCAsmInfo &MAI,
const MCInstrInfo &MII,
const MCRegisterInfo &MRI) {
- return T.getArch() == Triple::r600 ? new R600InstPrinter(MAI, MII, MRI) :
- new AMDGPUInstPrinter(MAI, MII, MRI);
+ if (T.getArch() == Triple::r600)
+ return new R600InstPrinter(MAI, MII, MRI);
+ else
+ return new AMDGPUInstPrinter(MAI, MII, MRI);
}
static MCTargetStreamer *createAMDGPUAsmTargetStreamer(MCStreamer &S,
@@ -76,23 +92,25 @@ static MCTargetStreamer *createAMDGPUAsmTargetStreamer(MCStreamer &S,
static MCTargetStreamer * createAMDGPUObjectTargetStreamer(
MCStreamer &S,
const MCSubtargetInfo &STI) {
- return new AMDGPUTargetELFStreamer(S);
+ return new AMDGPUTargetELFStreamer(S, STI);
}
static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
std::unique_ptr<MCAsmBackend> &&MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&Emitter,
bool RelaxAll) {
- return createAMDGPUELFStreamer(T, Context, std::move(MAB), OS,
+ return createAMDGPUELFStreamer(T, Context, std::move(MAB), std::move(OW),
std::move(Emitter), RelaxAll);
}
extern "C" void LLVMInitializeAMDGPUTargetMC() {
+
+ TargetRegistry::RegisterMCInstrInfo(getTheGCNTarget(), createAMDGPUMCInstrInfo);
+ TargetRegistry::RegisterMCInstrInfo(getTheAMDGPUTarget(), createR600MCInstrInfo);
for (Target *T : {&getTheAMDGPUTarget(), &getTheGCNTarget()}) {
RegisterMCAsmInfo<AMDGPUMCAsmInfo> X(*T);
- TargetRegistry::RegisterMCInstrInfo(*T, createAMDGPUMCInstrInfo);
TargetRegistry::RegisterMCRegInfo(*T, createAMDGPUMCRegisterInfo);
TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo);
TargetRegistry::RegisterMCInstPrinter(*T, createAMDGPUMCInstPrinter);
@@ -103,6 +121,8 @@ extern "C" void LLVMInitializeAMDGPUTargetMC() {
// R600 specific registration
TargetRegistry::RegisterMCCodeEmitter(getTheAMDGPUTarget(),
createR600MCCodeEmitter);
+ TargetRegistry::RegisterObjectTargetStreamer(
+ getTheAMDGPUTarget(), createAMDGPUObjectTargetStreamer);
// GCN specific registration
TargetRegistry::RegisterMCCodeEmitter(getTheGCNTarget(),
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
index 0b3563303ad0..f3628d96d6e9 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Provides AMDGPU specific target descriptions.
+/// Provides AMDGPU specific target descriptions.
//
//===----------------------------------------------------------------------===//
//
@@ -25,7 +25,7 @@ class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
-class MCObjectWriter;
+class MCObjectTargetWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
class MCTargetOptions;
@@ -40,24 +40,30 @@ Target &getTheGCNTarget();
MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
MCContext &Ctx);
+MCInstrInfo *createR600MCInstrInfo();
MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
MCContext &Ctx);
-MCAsmBackend *createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
+MCAsmBackend *createAMDGPUAsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
+ const MCRegisterInfo &MRI,
const MCTargetOptions &Options);
-std::unique_ptr<MCObjectWriter>
+std::unique_ptr<MCObjectTargetWriter>
createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI,
- bool HasRelocationAddend, raw_pwrite_stream &OS);
+ bool HasRelocationAddend);
} // End llvm namespace
#define GET_REGINFO_ENUM
#include "AMDGPUGenRegisterInfo.inc"
#undef GET_REGINFO_ENUM
+#define GET_REGINFO_ENUM
+#include "R600GenRegisterInfo.inc"
+#undef GET_REGINFO_ENUM
+
#define GET_INSTRINFO_ENUM
#define GET_INSTRINFO_OPERAND_ENUM
#define GET_INSTRINFO_SCHED_ENUM
@@ -66,9 +72,20 @@ createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI,
#undef GET_INSTRINFO_OPERAND_ENUM
#undef GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_OPERAND_ENUM
+#define GET_INSTRINFO_SCHED_ENUM
+#include "R600GenInstrInfo.inc"
+#undef GET_INSTRINFO_SCHED_ENUM
+#undef GET_INSTRINFO_OPERAND_ENUM
+#undef GET_INSTRINFO_ENUM
#define GET_SUBTARGETINFO_ENUM
#include "AMDGPUGenSubtargetInfo.inc"
#undef GET_SUBTARGETINFO_ENUM
+#define GET_SUBTARGETINFO_ENUM
+#include "R600GenSubtargetInfo.inc"
+#undef GET_SUBTARGETINFO_ENUM
+
#endif
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index d897956daccf..6a41e3f650bc 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -39,6 +39,84 @@ using namespace llvm::AMDGPU;
// AMDGPUTargetStreamer
//===----------------------------------------------------------------------===//
+static const struct {
+ const char *Name;
+ unsigned Mach;
+} MachTable[] = {
+ // Radeon HD 2000/3000 Series (R600).
+ { "r600", ELF::EF_AMDGPU_MACH_R600_R600 },
+ { "r630", ELF::EF_AMDGPU_MACH_R600_R630 },
+ { "rs880", ELF::EF_AMDGPU_MACH_R600_RS880 },
+ { "rv670", ELF::EF_AMDGPU_MACH_R600_RV670 },
+ // Radeon HD 4000 Series (R700).
+ { "rv710", ELF::EF_AMDGPU_MACH_R600_RV710 },
+ { "rv730", ELF::EF_AMDGPU_MACH_R600_RV730 },
+ { "rv770", ELF::EF_AMDGPU_MACH_R600_RV770 },
+ // Radeon HD 5000 Series (Evergreen).
+ { "cedar", ELF::EF_AMDGPU_MACH_R600_CEDAR },
+ { "cypress", ELF::EF_AMDGPU_MACH_R600_CYPRESS },
+ { "juniper", ELF::EF_AMDGPU_MACH_R600_JUNIPER },
+ { "redwood", ELF::EF_AMDGPU_MACH_R600_REDWOOD },
+ { "sumo", ELF::EF_AMDGPU_MACH_R600_SUMO },
+ // Radeon HD 6000 Series (Northern Islands).
+ { "barts", ELF::EF_AMDGPU_MACH_R600_BARTS },
+ { "caicos", ELF::EF_AMDGPU_MACH_R600_CAICOS },
+ { "cayman", ELF::EF_AMDGPU_MACH_R600_CAYMAN },
+ { "turks", ELF::EF_AMDGPU_MACH_R600_TURKS },
+ // AMDGCN GFX6.
+ { "gfx600", ELF::EF_AMDGPU_MACH_AMDGCN_GFX600 },
+ { "tahiti", ELF::EF_AMDGPU_MACH_AMDGCN_GFX600 },
+ { "gfx601", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 },
+ { "hainan", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 },
+ { "oland", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 },
+ { "pitcairn", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 },
+ { "verde", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 },
+ // AMDGCN GFX7.
+ { "gfx700", ELF::EF_AMDGPU_MACH_AMDGCN_GFX700 },
+ { "kaveri", ELF::EF_AMDGPU_MACH_AMDGCN_GFX700 },
+ { "gfx701", ELF::EF_AMDGPU_MACH_AMDGCN_GFX701 },
+ { "hawaii", ELF::EF_AMDGPU_MACH_AMDGCN_GFX701 },
+ { "gfx702", ELF::EF_AMDGPU_MACH_AMDGCN_GFX702 },
+ { "gfx703", ELF::EF_AMDGPU_MACH_AMDGCN_GFX703 },
+ { "kabini", ELF::EF_AMDGPU_MACH_AMDGCN_GFX703 },
+ { "mullins", ELF::EF_AMDGPU_MACH_AMDGCN_GFX703 },
+ { "gfx704", ELF::EF_AMDGPU_MACH_AMDGCN_GFX704 },
+ { "bonaire", ELF::EF_AMDGPU_MACH_AMDGCN_GFX704 },
+ // AMDGCN GFX8.
+ { "gfx801", ELF::EF_AMDGPU_MACH_AMDGCN_GFX801 },
+ { "carrizo", ELF::EF_AMDGPU_MACH_AMDGCN_GFX801 },
+ { "gfx802", ELF::EF_AMDGPU_MACH_AMDGCN_GFX802 },
+ { "iceland", ELF::EF_AMDGPU_MACH_AMDGCN_GFX802 },
+ { "tonga", ELF::EF_AMDGPU_MACH_AMDGCN_GFX802 },
+ { "gfx803", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 },
+ { "fiji", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 },
+ { "polaris10", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 },
+ { "polaris11", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 },
+ { "gfx810", ELF::EF_AMDGPU_MACH_AMDGCN_GFX810 },
+ { "stoney", ELF::EF_AMDGPU_MACH_AMDGCN_GFX810 },
+ // AMDGCN GFX9.
+ { "gfx900", ELF::EF_AMDGPU_MACH_AMDGCN_GFX900 },
+ { "gfx902", ELF::EF_AMDGPU_MACH_AMDGCN_GFX902 },
+ { "gfx904", ELF::EF_AMDGPU_MACH_AMDGCN_GFX904 },
+ { "gfx906", ELF::EF_AMDGPU_MACH_AMDGCN_GFX906 },
+ // Not specified processor.
+ { nullptr, ELF::EF_AMDGPU_MACH_NONE }
+};
+
+unsigned AMDGPUTargetStreamer::getMACH(StringRef GPU) const {
+ auto Entry = MachTable;
+ for (; Entry->Name && GPU != Entry->Name; ++Entry)
+ ;
+ return Entry->Mach;
+}
+
+const char *AMDGPUTargetStreamer::getMachName(unsigned Mach) {
+ auto Entry = MachTable;
+ for (; Entry->Name && Mach != Entry->Mach; ++Entry)
+ ;
+ return Entry->Name;
+}
+
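getMACH and getMachName are plain linear scans over the null-terminated MachTable, so an unknown GPU name resolves to EF_AMDGPU_MACH_NONE and an unknown mach value to a null name. A hedged usage sketch of the public lookup; the surrounding printing is illustrative only:

#include "AMDGPUTargetStreamer.h"
#include "llvm/Support/raw_ostream.h"

// Map an EF_AMDGPU_MACH_* e_flags value back to the first GPU name listed for
// it in MachTable; nullptr means the value is not in the table.
static void dumpMachName(unsigned Mach) {
  if (const char *Name = llvm::AMDGPUTargetStreamer::getMachName(Mach))
    llvm::errs() << Name << '\n';                   // e.g. "gfx900"
  else
    llvm::errs() << "<unknown mach " << Mach << ">\n";
}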
bool AMDGPUTargetStreamer::EmitHSAMetadata(StringRef HSAMetadataString) {
HSAMD::Metadata HSAMetadata;
if (HSAMD::fromString(HSAMetadataString, HSAMetadata))
@@ -55,9 +133,12 @@ AMDGPUTargetAsmStreamer::AMDGPUTargetAsmStreamer(MCStreamer &S,
formatted_raw_ostream &OS)
: AMDGPUTargetStreamer(S), OS(OS) { }
-void
-AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectVersion(uint32_t Major,
- uint32_t Minor) {
+void AMDGPUTargetAsmStreamer::EmitDirectiveAMDGCNTarget(StringRef Target) {
+ OS << "\t.amdgcn_target \"" << Target << "\"\n";
+}
+
+void AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectVersion(
+ uint32_t Major, uint32_t Minor) {
OS << "\t.hsa_code_object_version " <<
Twine(Major) << "," << Twine(Minor) << '\n';
}
@@ -118,12 +199,157 @@ bool AMDGPUTargetAsmStreamer::EmitPALMetadata(
return true;
}
+void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
+ const MCSubtargetInfo &STI, StringRef KernelName,
+ const amdhsa::kernel_descriptor_t &KD, uint64_t NextVGPR, uint64_t NextSGPR,
+ bool ReserveVCC, bool ReserveFlatScr, bool ReserveXNACK) {
+ amdhsa::kernel_descriptor_t DefaultKD = getDefaultAmdhsaKernelDescriptor();
+
+ IsaInfo::IsaVersion IVersion = IsaInfo::getIsaVersion(STI.getFeatureBits());
+
+ OS << "\t.amdhsa_kernel " << KernelName << '\n';
+
+#define PRINT_IF_NOT_DEFAULT(STREAM, DIRECTIVE, KERNEL_DESC, \
+ DEFAULT_KERNEL_DESC, MEMBER_NAME, FIELD_NAME) \
+ if (AMDHSA_BITS_GET(KERNEL_DESC.MEMBER_NAME, FIELD_NAME) != \
+ AMDHSA_BITS_GET(DEFAULT_KERNEL_DESC.MEMBER_NAME, FIELD_NAME)) \
+ STREAM << "\t\t" << DIRECTIVE << " " \
+ << AMDHSA_BITS_GET(KERNEL_DESC.MEMBER_NAME, FIELD_NAME) << '\n';
+
+ if (KD.group_segment_fixed_size != DefaultKD.group_segment_fixed_size)
+ OS << "\t\t.amdhsa_group_segment_fixed_size " << KD.group_segment_fixed_size
+ << '\n';
+ if (KD.private_segment_fixed_size != DefaultKD.private_segment_fixed_size)
+ OS << "\t\t.amdhsa_private_segment_fixed_size "
+ << KD.private_segment_fixed_size << '\n';
+
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_user_sgpr_private_segment_buffer", KD, DefaultKD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_user_sgpr_dispatch_ptr", KD, DefaultKD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_user_sgpr_queue_ptr", KD, DefaultKD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_user_sgpr_kernarg_segment_ptr", KD, DefaultKD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_user_sgpr_dispatch_id", KD, DefaultKD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_user_sgpr_flat_scratch_init", KD, DefaultKD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_user_sgpr_private_segment_size", KD, DefaultKD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_system_sgpr_private_segment_wavefront_offset", KD, DefaultKD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_PRIVATE_SEGMENT_WAVEFRONT_OFFSET);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_sgpr_workgroup_id_x", KD, DefaultKD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_sgpr_workgroup_id_y", KD, DefaultKD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_sgpr_workgroup_id_z", KD, DefaultKD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_sgpr_workgroup_info", KD, DefaultKD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_vgpr_workitem_id", KD, DefaultKD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID);
+
+ // These directives are required.
+ OS << "\t\t.amdhsa_next_free_vgpr " << NextVGPR << '\n';
+ OS << "\t\t.amdhsa_next_free_sgpr " << NextSGPR << '\n';
+
+ if (!ReserveVCC)
+ OS << "\t\t.amdhsa_reserve_vcc " << ReserveVCC << '\n';
+ if (IVersion.Major >= 7 && !ReserveFlatScr)
+ OS << "\t\t.amdhsa_reserve_flat_scratch " << ReserveFlatScr << '\n';
+ if (IVersion.Major >= 8 && ReserveXNACK != hasXNACK(STI))
+ OS << "\t\t.amdhsa_reserve_xnack_mask " << ReserveXNACK << '\n';
+
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_round_mode_32", KD, DefaultKD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_round_mode_16_64", KD, DefaultKD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_denorm_mode_32", KD, DefaultKD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_denorm_mode_16_64", KD, DefaultKD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_dx10_clamp", KD, DefaultKD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_ieee_mode", KD, DefaultKD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE);
+ if (IVersion.Major >= 9)
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_fp16_overflow", KD, DefaultKD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_FP16_OVFL);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_exception_fp_ieee_invalid_op", KD, DefaultKD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_exception_fp_denorm_src", KD, DefaultKD, compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_exception_fp_ieee_div_zero", KD, DefaultKD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_exception_fp_ieee_overflow", KD, DefaultKD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_exception_fp_ieee_underflow", KD, DefaultKD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_exception_fp_ieee_inexact", KD, DefaultKD, compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_exception_int_div_zero", KD, DefaultKD, compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO);
+#undef PRINT_IF_NOT_DEFAULT
+
+ OS << "\t.end_amdhsa_kernel\n";
+}
+
//===----------------------------------------------------------------------===//
// AMDGPUTargetELFStreamer
//===----------------------------------------------------------------------===//
-AMDGPUTargetELFStreamer::AMDGPUTargetELFStreamer(MCStreamer &S)
- : AMDGPUTargetStreamer(S), Streamer(S) {}
+AMDGPUTargetELFStreamer::AMDGPUTargetELFStreamer(
+ MCStreamer &S, const MCSubtargetInfo &STI)
+ : AMDGPUTargetStreamer(S), Streamer(S) {
+ MCAssembler &MCA = getStreamer().getAssembler();
+ unsigned EFlags = MCA.getELFHeaderEFlags();
+
+ EFlags &= ~ELF::EF_AMDGPU_MACH;
+ EFlags |= getMACH(STI.getCPU());
+
+ EFlags &= ~ELF::EF_AMDGPU_XNACK;
+ if (AMDGPU::hasXNACK(STI))
+ EFlags |= ELF::EF_AMDGPU_XNACK;
+
+ MCA.setELFHeaderEFlags(EFlags);
+}
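The constructor above takes over what AMDGPUELFStreamer used to do: it rewrites the ELF header e_flags with a read-modify-write over the MACH and XNACK bit fields. Restated compactly, grounded in the lines above (this runs inside the constructor, so MCA, STI and getMACH are the names already in scope there):

// Clear each field before OR-ing in the new value so repeated updates never
// accumulate stale bits.
unsigned EFlags = MCA.getELFHeaderEFlags();
EFlags = (EFlags & ~ELF::EF_AMDGPU_MACH) | getMACH(STI.getCPU());
EFlags &= ~ELF::EF_AMDGPU_XNACK;
if (AMDGPU::hasXNACK(STI))
  EFlags |= ELF::EF_AMDGPU_XNACK;
MCA.setELFHeaderEFlags(EFlags);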
MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() {
return static_cast<MCELFStreamer &>(Streamer);
@@ -150,9 +376,10 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUNote(
S.PopSection();
}
-void
-AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion(uint32_t Major,
- uint32_t Minor) {
+void AMDGPUTargetELFStreamer::EmitDirectiveAMDGCNTarget(StringRef Target) {}
+
+void AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion(
+ uint32_t Major, uint32_t Minor) {
EmitAMDGPUNote(
MCConstantExpr::create(8, getContext()),
@@ -207,7 +434,7 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName,
unsigned Type) {
MCSymbolELF *Symbol = cast<MCSymbolELF>(
getStreamer().getContext().getOrCreateSymbol(SymbolName));
- Symbol->setType(ELF::STT_AMDGPU_HSA_KERNEL);
+ Symbol->setType(Type);
}
bool AMDGPUTargetELFStreamer::EmitISAVersion(StringRef IsaVersionString) {
@@ -271,3 +498,46 @@ bool AMDGPUTargetELFStreamer::EmitPALMetadata(
);
return true;
}
+
+void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor(
+ const MCSubtargetInfo &STI, StringRef KernelName,
+ const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
+ uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
+ bool ReserveXNACK) {
+ auto &Streamer = getStreamer();
+ auto &Context = Streamer.getContext();
+
+ MCSymbolELF *KernelDescriptorSymbol = cast<MCSymbolELF>(
+ Context.getOrCreateSymbol(Twine(KernelName) + Twine(".kd")));
+ KernelDescriptorSymbol->setBinding(ELF::STB_GLOBAL);
+ KernelDescriptorSymbol->setType(ELF::STT_OBJECT);
+ KernelDescriptorSymbol->setSize(
+ MCConstantExpr::create(sizeof(KernelDescriptor), Context));
+
+ MCSymbolELF *KernelCodeSymbol = cast<MCSymbolELF>(
+ Context.getOrCreateSymbol(Twine(KernelName)));
+ KernelCodeSymbol->setBinding(ELF::STB_LOCAL);
+
+ Streamer.EmitLabel(KernelDescriptorSymbol);
+ Streamer.EmitBytes(StringRef(
+ (const char*)&(KernelDescriptor),
+ offsetof(amdhsa::kernel_descriptor_t, kernel_code_entry_byte_offset)));
+ // FIXME: Remove the use of VK_AMDGPU_REL64 in the expression below. The
+ // expression being created is:
+ // (start of kernel code) - (start of kernel descriptor)
+ // It implies R_AMDGPU_REL64, but ends up being R_AMDGPU_ABS64.
+ Streamer.EmitValue(MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(
+ KernelCodeSymbol, MCSymbolRefExpr::VK_AMDGPU_REL64, Context),
+ MCSymbolRefExpr::create(
+ KernelDescriptorSymbol, MCSymbolRefExpr::VK_None, Context),
+ Context),
+ sizeof(KernelDescriptor.kernel_code_entry_byte_offset));
+ Streamer.EmitBytes(StringRef(
+ (const char*)&(KernelDescriptor) +
+ offsetof(amdhsa::kernel_descriptor_t, kernel_code_entry_byte_offset) +
+ sizeof(KernelDescriptor.kernel_code_entry_byte_offset),
+ sizeof(KernelDescriptor) -
+ offsetof(amdhsa::kernel_descriptor_t, kernel_code_entry_byte_offset) -
+ sizeof(KernelDescriptor.kernel_code_entry_byte_offset)));
+}
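EmitAmdhsaKernelDescriptor splits the descriptor into three spans so that kernel_code_entry_byte_offset can be emitted as a (kernel code - descriptor) symbol difference instead of raw bytes. A sketch of the arithmetic, assuming the amdhsa::kernel_descriptor_t layout from AMDHSAKernelDescriptor.h, where the offset field is 8 bytes wide:

#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include <cstddef>

// Byte ranges of the descriptor as emitted above:
//   [0, PreLen)                 raw bytes before the entry-offset field
//   [PreLen, PreLen + FieldLen) the relocatable symbol-difference value
//   [PreLen + FieldLen, Size)   the remaining raw bytes
constexpr size_t PreLen = offsetof(llvm::amdhsa::kernel_descriptor_t,
                                   kernel_code_entry_byte_offset);
constexpr size_t FieldLen =
    sizeof(llvm::amdhsa::kernel_descriptor_t::kernel_code_entry_byte_offset);
constexpr size_t Size = sizeof(llvm::amdhsa::kernel_descriptor_t);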
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index 0919b754480d..472da1b73593 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -14,6 +14,7 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/AMDGPUMetadata.h"
+#include "llvm/Support/AMDHSAKernelDescriptor.h"
namespace llvm {
#include "AMDGPUPTNote.h"
@@ -30,9 +31,17 @@ class AMDGPUTargetStreamer : public MCTargetStreamer {
protected:
MCContext &getContext() const { return Streamer.getContext(); }
+ /// \returns Equivalent EF_AMDGPU_MACH_* value for given \p GPU name.
+ unsigned getMACH(StringRef GPU) const;
+
public:
+ /// \returns Equivalent GPU name for an EF_AMDGPU_MACH_* value.
+ static const char *getMachName(unsigned Mach);
+
AMDGPUTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+ virtual void EmitDirectiveAMDGCNTarget(StringRef Target) = 0;
+
virtual void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
uint32_t Minor) = 0;
@@ -56,12 +65,21 @@ public:
/// \returns True on success, false on failure.
virtual bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) = 0;
+
+ virtual void EmitAmdhsaKernelDescriptor(
+ const MCSubtargetInfo &STI, StringRef KernelName,
+ const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
+ uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
+ bool ReserveXNACK) = 0;
};
class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {
formatted_raw_ostream &OS;
public:
AMDGPUTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
+
+ void EmitDirectiveAMDGCNTarget(StringRef Target) override;
+
void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
uint32_t Minor) override;
@@ -81,6 +99,12 @@ public:
/// \returns True on success, false on failure.
bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) override;
+
+ void EmitAmdhsaKernelDescriptor(
+ const MCSubtargetInfo &STI, StringRef KernelName,
+ const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
+ uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
+ bool ReserveXNACK) override;
};
class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
@@ -90,10 +114,12 @@ class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
function_ref<void(MCELFStreamer &)> EmitDesc);
public:
- AMDGPUTargetELFStreamer(MCStreamer &S);
+ AMDGPUTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI);
MCELFStreamer &getStreamer();
+ void EmitDirectiveAMDGCNTarget(StringRef Target) override;
+
void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
uint32_t Minor) override;
@@ -113,6 +139,12 @@ public:
/// \returns True on success, false on failure.
bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) override;
+
+ void EmitAmdhsaKernelDescriptor(
+ const MCSubtargetInfo &STI, StringRef KernelName,
+ const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
+ uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
+ bool ReserveXNACK) override;
};
}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt b/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt
index f9cb4678dc51..2d201bbbd7b8 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt
@@ -2,11 +2,11 @@ add_llvm_library(LLVMAMDGPUDesc
AMDGPUAsmBackend.cpp
AMDGPUELFObjectWriter.cpp
AMDGPUELFStreamer.cpp
- AMDGPUHSAMetadataStreamer.cpp
AMDGPUMCAsmInfo.cpp
AMDGPUMCCodeEmitter.cpp
AMDGPUMCTargetDesc.cpp
AMDGPUTargetStreamer.cpp
R600MCCodeEmitter.cpp
+ R600MCTargetDesc.cpp
SIMCCodeEmitter.cpp
)
diff --git a/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
index eab90e1d344c..28d4bc1829e2 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -9,13 +9,12 @@
//
/// \file
///
-/// \brief The R600 code emitter produces machine code that can be executed
+/// The R600 code emitter produces machine code that can be executed
/// directly on the GPU device.
//
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/AMDGPUFixupKinds.h"
-#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "R600Defines.h"
#include "llvm/MC/MCCodeEmitter.h"
@@ -36,30 +35,40 @@ using namespace llvm;
namespace {
-class R600MCCodeEmitter : public AMDGPUMCCodeEmitter {
+class R600MCCodeEmitter : public MCCodeEmitter {
const MCRegisterInfo &MRI;
+ const MCInstrInfo &MCII;
public:
R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri)
- : AMDGPUMCCodeEmitter(mcii), MRI(mri) {}
+ : MRI(mri), MCII(mcii) {}
R600MCCodeEmitter(const R600MCCodeEmitter &) = delete;
R600MCCodeEmitter &operator=(const R600MCCodeEmitter &) = delete;
- /// \brief Encode the instruction and write it to the OS.
+ /// Encode the instruction and write it to the OS.
void encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const override;
+ const MCSubtargetInfo &STI) const;
/// \returns the encoding for an MCOperand.
uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const override;
+ const MCSubtargetInfo &STI) const;
private:
+
void Emit(uint32_t value, raw_ostream &OS) const;
void Emit(uint64_t value, raw_ostream &OS) const;
unsigned getHWReg(unsigned regNo) const;
+
+ uint64_t getBinaryCodeForInstr(const MCInst &MI,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
+ void verifyInstructionPredicates(const MCInst &MI,
+ uint64_t AvailableFeatures) const;
+
};
} // end anonymous namespace
@@ -94,16 +103,16 @@ void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
computeAvailableFeatures(STI.getFeatureBits()));
const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
- if (MI.getOpcode() == AMDGPU::RETURN ||
- MI.getOpcode() == AMDGPU::FETCH_CLAUSE ||
- MI.getOpcode() == AMDGPU::ALU_CLAUSE ||
- MI.getOpcode() == AMDGPU::BUNDLE ||
- MI.getOpcode() == AMDGPU::KILL) {
+ if (MI.getOpcode() == R600::RETURN ||
+ MI.getOpcode() == R600::FETCH_CLAUSE ||
+ MI.getOpcode() == R600::ALU_CLAUSE ||
+ MI.getOpcode() == R600::BUNDLE ||
+ MI.getOpcode() == R600::KILL) {
return;
} else if (IS_VTX(Desc)) {
uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups, STI);
uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset
- if (!(STI.getFeatureBits()[AMDGPU::FeatureCaymanISA])) {
+ if (!(STI.getFeatureBits()[R600::FeatureCaymanISA])) {
InstWord2 |= 1 << 19; // Mega-Fetch bit
}
@@ -136,7 +145,7 @@ void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
Emit((uint32_t) 0, OS);
} else {
uint64_t Inst = getBinaryCodeForInstr(MI, Fixups, STI);
- if ((STI.getFeatureBits()[AMDGPU::FeatureR600ALUInst]) &&
+ if ((STI.getFeatureBits()[R600::FeatureR600ALUInst]) &&
((Desc.TSFlags & R600_InstFlag::OP1) ||
Desc.TSFlags & R600_InstFlag::OP2)) {
uint64_t ISAOpCode = Inst & (0x3FFULL << 39);
@@ -148,11 +157,11 @@ void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
}
void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const {
- support::endian::Writer<support::little>(OS).write(Value);
+ support::endian::write(OS, Value, support::little);
}
void R600MCCodeEmitter::Emit(uint64_t Value, raw_ostream &OS) const {
- support::endian::Writer<support::little>(OS).write(Value);
+ support::endian::write(OS, Value, support::little);
}
unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const {
@@ -186,4 +195,4 @@ uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI,
}
#define ENABLE_INSTR_PREDICATE_VERIFIER
-#include "AMDGPUGenMCCodeEmitter.inc"
+#include "R600GenMCCodeEmitter.inc"
diff --git a/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp b/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp
new file mode 100644
index 000000000000..1c99a708e5ac
--- /dev/null
+++ b/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp
@@ -0,0 +1,27 @@
+//===-- R600MCTargetDesc.cpp - R600 Target Descriptions -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This file provides R600 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMCTargetDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "R600GenInstrInfo.inc"
+
+MCInstrInfo *llvm::createR600MCInstrInfo() {
+ MCInstrInfo *X = new MCInstrInfo();
+ InitR600MCInstrInfo(X);
+ return X;
+}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index 94c0157edeb5..36913bd04274 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief The SI code emitter produces machine code that can be executed
+/// The SI code emitter produces machine code that can be executed
/// directly on the GPU device.
//
//===----------------------------------------------------------------------===//
@@ -43,7 +43,7 @@ namespace {
class SIMCCodeEmitter : public AMDGPUMCCodeEmitter {
const MCRegisterInfo &MRI;
- /// \brief Encode an fp or int literal
+ /// Encode an fp or int literal
uint32_t getLitEncoding(const MCOperand &MO, const MCOperandInfo &OpInfo,
const MCSubtargetInfo &STI) const;
@@ -54,7 +54,7 @@ public:
SIMCCodeEmitter(const SIMCCodeEmitter &) = delete;
SIMCCodeEmitter &operator=(const SIMCCodeEmitter &) = delete;
- /// \brief Encode the instruction and write it to the OS.
+ /// Encode the instruction and write it to the OS.
void encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const override;
@@ -64,7 +64,7 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const override;
- /// \brief Use a fixup to encode the simm16 field for SOPP branch
+ /// Use a fixup to encode the simm16 field for SOPP branch
/// instructions.
unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
@@ -335,13 +335,24 @@ SIMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo,
const MCOperand &MO = MI.getOperand(OpNo);
- unsigned Reg = MO.getReg();
- RegEnc |= MRI.getEncodingValue(Reg);
- RegEnc &= SDWA9EncValues::SRC_VGPR_MASK;
- if (AMDGPU::isSGPR(AMDGPU::mc2PseudoReg(Reg), &MRI)) {
- RegEnc |= SDWA9EncValues::SRC_SGPR_MASK;
+ if (MO.isReg()) {
+ unsigned Reg = MO.getReg();
+ RegEnc |= MRI.getEncodingValue(Reg);
+ RegEnc &= SDWA9EncValues::SRC_VGPR_MASK;
+ if (AMDGPU::isSGPR(AMDGPU::mc2PseudoReg(Reg), &MRI)) {
+ RegEnc |= SDWA9EncValues::SRC_SGPR_MASK;
+ }
+ return RegEnc;
+ } else {
+ const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+ uint32_t Enc = getLitEncoding(MO, Desc.OpInfo[OpNo], STI);
+ if (Enc != ~0U && Enc != 255) {
+ return Enc | SDWA9EncValues::SRC_SGPR_MASK;
+ }
}
- return RegEnc;
+
+ llvm_unreachable("Unsupported operand kind");
+ return 0;
}
unsigned
@@ -427,3 +438,6 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
llvm_unreachable("Encoding of this operand type is not supported yet.");
return 0;
}
+
+#define ENABLE_INSTR_PREDICATE_VERIFIER
+#include "AMDGPUGenMCCodeEmitter.inc"
diff --git a/lib/Target/AMDGPU/MIMGInstructions.td b/lib/Target/AMDGPU/MIMGInstructions.td
index 30a2df510386..1e0bc62c45a6 100644
--- a/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/lib/Target/AMDGPU/MIMGInstructions.td
@@ -7,9 +7,63 @@
//
//===----------------------------------------------------------------------===//
-class MIMG_Mask <string op, int channels> {
- string Op = op;
- int Channels = channels;
+// MIMG-specific encoding families to distinguish between semantically
+// equivalent machine instructions with different encoding.
+//
+// - MIMGEncGfx6: encoding introduced with gfx6 (obsoleted for atomics in gfx8)
+// - MIMGEncGfx8: encoding introduced with gfx8 for atomics
+class MIMGEncoding;
+
+def MIMGEncGfx6 : MIMGEncoding;
+def MIMGEncGfx8 : MIMGEncoding;
+
+def MIMGEncoding : GenericEnum {
+ let FilterClass = "MIMGEncoding";
+}
+
+// Represent an ISA-level opcode, independent of the encoding and the
+// vdata/vaddr size.
+class MIMGBaseOpcode {
+ MIMGBaseOpcode BaseOpcode = !cast<MIMGBaseOpcode>(NAME);
+ bit Store = 0;
+ bit Atomic = 0;
+ bit AtomicX2 = 0; // (f)cmpswap
+ bit Sampler = 0;
+ bits<8> NumExtraArgs = 0;
+ bit Gradients = 0;
+ bit Coordinates = 1;
+ bit LodOrClampOrMip = 0;
+ bit HasD16 = 0;
+}
+
+def MIMGBaseOpcode : GenericEnum {
+ let FilterClass = "MIMGBaseOpcode";
+}
+
+def MIMGBaseOpcodesTable : GenericTable {
+ let FilterClass = "MIMGBaseOpcode";
+ let CppTypeName = "MIMGBaseOpcodeInfo";
+ let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler",
+ "NumExtraArgs", "Gradients", "Coordinates", "LodOrClampOrMip",
+ "HasD16"];
+ GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode;
+
+ let PrimaryKey = ["BaseOpcode"];
+ let PrimaryKeyName = "getMIMGBaseOpcodeInfo";
+}
+
+def MIMGDim : GenericEnum {
+ let FilterClass = "AMDGPUDimProps";
+}
+
+def MIMGDimInfoTable : GenericTable {
+ let FilterClass = "AMDGPUDimProps";
+ let CppTypeName = "MIMGDimInfo";
+ let Fields = ["Dim", "NumCoords", "NumGradients", "DA"];
+ GenericEnum TypeOf_Dim = MIMGDim;
+
+ let PrimaryKey = ["Dim"];
+ let PrimaryKeyName = "getMIMGDimInfo";
}
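The GenericTable and SearchIndex records above ask TableGen to emit C++ lookup tables keyed as the PrimaryKey fields describe, with the accessor names taken from PrimaryKeyName and from the SearchIndex def. A hedged usage sketch; the namespace and exact signatures are assumptions inferred from these definitions, not shown in this hunk:

// Given a MIMG machine opcode, recover the encoding-independent base opcode
// and its properties via the generated tables.
static void inspectMIMG(unsigned Opcode) {
  if (const llvm::AMDGPU::MIMGInfo *Info = llvm::AMDGPU::getMIMGInfo(Opcode)) {
    const llvm::AMDGPU::MIMGBaseOpcodeInfo *Base =
        llvm::AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
    bool IsAtomic = Base->Atomic;        // flags mirror the MIMGBaseOpcode bits
    unsigned VData = Info->VDataDwords;  // register-size keys from the MIMG class
    (void)IsAtomic; (void)VData;
  }
}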
class mimg <bits<7> si, bits<7> vi = si> {
@@ -17,254 +71,372 @@ class mimg <bits<7> si, bits<7> vi = si> {
field bits<7> VI = vi;
}
-class MIMG_Helper <dag outs, dag ins, string asm,
- string dns=""> : MIMG<outs, ins, asm,[]> {
+class MIMG <dag outs, string dns = "">
+ : InstSI <outs, (ins), "", []> {
+
+ let VM_CNT = 1;
+ let EXP_CNT = 1;
+ let MIMG = 1;
+ let Uses = [EXEC];
let mayLoad = 1;
let mayStore = 0;
let hasPostISelHook = 1;
+ let SchedRW = [WriteVMEM];
+ let UseNamedOperandTable = 1;
+ let hasSideEffects = 0; // XXX ????
+
+ let SubtargetPredicate = isGCN;
let DecoderNamespace = dns;
let isAsmParserOnly = !if(!eq(dns,""), 1, 0);
let AsmMatchConverter = "cvtMIMG";
let usesCustomInserter = 1;
- let SchedRW = [WriteVMEM];
+
+ Instruction Opcode = !cast<Instruction>(NAME);
+ MIMGBaseOpcode BaseOpcode;
+ MIMGEncoding MIMGEncoding = MIMGEncGfx6;
+ bits<8> VDataDwords;
+ bits<8> VAddrDwords;
+}
+
+def MIMGInfoTable : GenericTable {
+ let FilterClass = "MIMG";
+ let CppTypeName = "MIMGInfo";
+ let Fields = ["Opcode", "BaseOpcode", "MIMGEncoding", "VDataDwords", "VAddrDwords"];
+ GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode;
+ GenericEnum TypeOf_MIMGEncoding = MIMGEncoding;
+
+ let PrimaryKey = ["BaseOpcode", "MIMGEncoding", "VDataDwords", "VAddrDwords"];
+ let PrimaryKeyName = "getMIMGOpcodeHelper";
+}
+
+def getMIMGInfo : SearchIndex {
+ let Table = MIMGInfoTable;
+ let Key = ["Opcode"];
}
class MIMG_NoSampler_Helper <bits<7> op, string asm,
RegisterClass dst_rc,
RegisterClass addr_rc,
- string dns=""> : MIMG_Helper <
- (outs dst_rc:$vdata),
- (ins addr_rc:$vaddr, SReg_256:$srsrc,
- dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc,
- r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
- asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da",
- dns>, MIMGe<op> {
+ string dns="">
+ : MIMG <(outs dst_rc:$vdata), dns>,
+ MIMGe<op> {
let ssamp = 0;
+ let d16 = !if(BaseOpcode.HasD16, ?, 0);
+
+ let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc,
+ DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
+ R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
+ #!if(BaseOpcode.HasD16, "$d16", "");
}
multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm,
- RegisterClass dst_rc,
- int channels> {
- def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32,
- !if(!eq(channels, 1), "AMDGPU", "")>,
- MIMG_Mask<asm#"_V1", channels>;
- def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>,
- MIMG_Mask<asm#"_V2", channels>;
- def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>,
- MIMG_Mask<asm#"_V4", channels>;
-}
+ RegisterClass dst_rc,
+ bit enableDisasm> {
+ let VAddrDwords = 1 in
+ def NAME # _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32,
+ !if(enableDisasm, "AMDGPU", "")>;
+ let VAddrDwords = 2 in
+ def NAME # _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>;
+ let VAddrDwords = 3 in
+ def NAME # _V3 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_96>;
+ let VAddrDwords = 4 in
+ def NAME # _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>;
+}
+
+multiclass MIMG_NoSampler <bits<7> op, string asm, bit has_d16, bit mip = 0,
+ bit isResInfo = 0> {
+ def "" : MIMGBaseOpcode {
+ let Coordinates = !if(isResInfo, 0, 1);
+ let LodOrClampOrMip = mip;
+ let HasD16 = has_d16;
+ }
-multiclass MIMG_NoSampler <bits<7> op, string asm> {
- defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1>;
- defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 2>;
- defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 3>;
- defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 4>;
+ let BaseOpcode = !cast<MIMGBaseOpcode>(NAME),
+ mayLoad = !if(isResInfo, 0, 1) in {
+ let VDataDwords = 1 in
+ defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1>;
+ let VDataDwords = 2 in
+ defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 0>;
+ let VDataDwords = 3 in
+ defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0>;
+ let VDataDwords = 4 in
+ defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0>;
+ }
}
class MIMG_Store_Helper <bits<7> op, string asm,
RegisterClass data_rc,
RegisterClass addr_rc,
- string dns = ""> : MIMG_Helper <
- (outs),
- (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
- dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc,
- r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
- asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da", dns>, MIMGe<op> {
+ string dns = "">
+ : MIMG <(outs), dns>,
+ MIMGe<op> {
let ssamp = 0;
- let mayLoad = 1; // TableGen requires this for matching with the intrinsics
+ let d16 = !if(BaseOpcode.HasD16, ?, 0);
+
+ let mayLoad = 0;
let mayStore = 1;
- let hasSideEffects = 1;
+ let hasSideEffects = 0;
let hasPostISelHook = 0;
let DisableWQM = 1;
+
+ let InOperandList = !con((ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
+ DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
+ R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
+ #!if(BaseOpcode.HasD16, "$d16", "");
}
multiclass MIMG_Store_Addr_Helper <bits<7> op, string asm,
RegisterClass data_rc,
- int channels> {
- def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32,
- !if(!eq(channels, 1), "AMDGPU", "")>,
- MIMG_Mask<asm#"_V1", channels>;
- def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>,
- MIMG_Mask<asm#"_V2", channels>;
- def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>,
- MIMG_Mask<asm#"_V4", channels>;
-}
+ bit enableDisasm> {
+ let VAddrDwords = 1 in
+ def NAME # _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32,
+ !if(enableDisasm, "AMDGPU", "")>;
+ let VAddrDwords = 2 in
+ def NAME # _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>;
+ let VAddrDwords = 3 in
+ def NAME # _V3 : MIMG_Store_Helper <op, asm, data_rc, VReg_96>;
+ let VAddrDwords = 4 in
+ def NAME # _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>;
+}
+
+multiclass MIMG_Store <bits<7> op, string asm, bit has_d16, bit mip = 0> {
+ def "" : MIMGBaseOpcode {
+ let Store = 1;
+ let LodOrClampOrMip = mip;
+ let HasD16 = has_d16;
+ }
-multiclass MIMG_Store <bits<7> op, string asm> {
- defm _V1 : MIMG_Store_Addr_Helper <op, asm, VGPR_32, 1>;
- defm _V2 : MIMG_Store_Addr_Helper <op, asm, VReg_64, 2>;
- defm _V3 : MIMG_Store_Addr_Helper <op, asm, VReg_96, 3>;
- defm _V4 : MIMG_Store_Addr_Helper <op, asm, VReg_128, 4>;
+ let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in {
+ let VDataDwords = 1 in
+ defm _V1 : MIMG_Store_Addr_Helper <op, asm, VGPR_32, 1>;
+ let VDataDwords = 2 in
+ defm _V2 : MIMG_Store_Addr_Helper <op, asm, VReg_64, 0>;
+ let VDataDwords = 3 in
+ defm _V3 : MIMG_Store_Addr_Helper <op, asm, VReg_96, 0>;
+ let VDataDwords = 4 in
+ defm _V4 : MIMG_Store_Addr_Helper <op, asm, VReg_128, 0>;
+ }
}
class MIMG_Atomic_Helper <string asm, RegisterClass data_rc,
- RegisterClass addr_rc> : MIMG_Helper <
- (outs data_rc:$vdst),
- (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
- dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc,
- r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
- asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
- > {
+ RegisterClass addr_rc, string dns="",
+ bit enableDasm = 0>
+ : MIMG <(outs data_rc:$vdst), !if(enableDasm, dns, "")> {
+ let mayLoad = 1;
let mayStore = 1;
- let hasSideEffects = 1;
+ let hasSideEffects = 1; // FIXME: Remove this
let hasPostISelHook = 0;
let DisableWQM = 1;
let Constraints = "$vdst = $vdata";
let AsmMatchConverter = "cvtMIMGAtomic";
-}
-class MIMG_Atomic_Real_si<mimg op, string name, string asm,
- RegisterClass data_rc, RegisterClass addr_rc> :
- MIMG_Atomic_Helper<asm, data_rc, addr_rc>,
- SIMCInstr<name, SIEncodingFamily.SI>,
- MIMGe<op.SI> {
- let isCodeGenOnly = 0;
- let AssemblerPredicates = [isSICI];
- let DecoderNamespace = "SICI";
- let DisableDecoder = DisableSIDecoder;
-}
-
-class MIMG_Atomic_Real_vi<mimg op, string name, string asm,
- RegisterClass data_rc, RegisterClass addr_rc> :
- MIMG_Atomic_Helper<asm, data_rc, addr_rc>,
- SIMCInstr<name, SIEncodingFamily.VI>,
- MIMGe<op.VI> {
- let isCodeGenOnly = 0;
- let AssemblerPredicates = [isVI];
- let DecoderNamespace = "VI";
- let DisableDecoder = DisableVIDecoder;
-}
-
-multiclass MIMG_Atomic_Helper_m <mimg op, string name, string asm,
- RegisterClass data_rc, RegisterClass addr_rc> {
- let isPseudo = 1, isCodeGenOnly = 1 in {
- def "" : MIMG_Atomic_Helper<asm, data_rc, addr_rc>,
- SIMCInstr<name, SIEncodingFamily.NONE>;
+ let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
+ DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
+ R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da);
+ let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da";
+}
+
+multiclass MIMG_Atomic_Helper_m <mimg op, string asm, RegisterClass data_rc,
+ RegisterClass addr_rc, bit enableDasm = 0> {
+ let ssamp = 0, d16 = 0 in {
+ def _si : MIMG_Atomic_Helper<asm, data_rc, addr_rc, "SICI", enableDasm>,
+ SIMCInstr<NAME, SIEncodingFamily.SI>,
+ MIMGe<op.SI> {
+ let AssemblerPredicates = [isSICI];
+ let DisableDecoder = DisableSIDecoder;
+ }
+
+ def _vi : MIMG_Atomic_Helper<asm, data_rc, addr_rc, "VI", enableDasm>,
+ SIMCInstr<NAME, SIEncodingFamily.VI>,
+ MIMGe<op.VI> {
+ let AssemblerPredicates = [isVI];
+ let DisableDecoder = DisableVIDecoder;
+ let MIMGEncoding = MIMGEncGfx8;
+ }
}
+}
- let ssamp = 0 in {
- def _si : MIMG_Atomic_Real_si<op, name, asm, data_rc, addr_rc>;
+multiclass MIMG_Atomic_Addr_Helper_m <mimg op, string asm,
+ RegisterClass data_rc,
+ bit enableDasm = 0> {
+  // _V* variants have different address sizes, but the size is not encoded,
+  // so only one variant can be disassembled. V1 looks the safest to decode.
+ let VAddrDwords = 1 in
+ defm _V1 : MIMG_Atomic_Helper_m <op, asm, data_rc, VGPR_32, enableDasm>;
+ let VAddrDwords = 2 in
+ defm _V2 : MIMG_Atomic_Helper_m <op, asm, data_rc, VReg_64>;
+ let VAddrDwords = 3 in
+ defm _V3 : MIMG_Atomic_Helper_m <op, asm, data_rc, VReg_96>;
+ let VAddrDwords = 4 in
+ defm _V4 : MIMG_Atomic_Helper_m <op, asm, data_rc, VReg_128>;
+}
+
+multiclass MIMG_Atomic <mimg op, string asm, bit isCmpSwap = 0> { // 64-bit atomics
+ def "" : MIMGBaseOpcode {
+ let Atomic = 1;
+ let AtomicX2 = isCmpSwap;
+ }
- def _vi : MIMG_Atomic_Real_vi<op, name, asm, data_rc, addr_rc>;
+ let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in {
+    // _V* variants have different dst sizes, but the size is encoded implicitly,
+    // using dmask and tfe. Only the 32-bit variant is registered with the
+    // disassembler; the other variants are reconstructed by the disassembler
+    // using dmask and tfe.
+ let VDataDwords = !if(isCmpSwap, 2, 1) in
+ defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_64, VGPR_32), 1>;
+ let VDataDwords = !if(isCmpSwap, 4, 2) in
+ defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_128, VReg_64)>;
}
}
-multiclass MIMG_Atomic <mimg op, string asm, RegisterClass data_rc = VGPR_32> {
- defm _V1 : MIMG_Atomic_Helper_m <op, asm # "_V1", asm, data_rc, VGPR_32>;
- defm _V2 : MIMG_Atomic_Helper_m <op, asm # "_V2", asm, data_rc, VReg_64>;
- defm _V4 : MIMG_Atomic_Helper_m <op, asm # "_V3", asm, data_rc, VReg_128>;
+class MIMG_Sampler_Helper <bits<7> op, string asm, RegisterClass dst_rc,
+ RegisterClass src_rc, string dns="">
+ : MIMG <(outs dst_rc:$vdata), dns>,
+ MIMGe<op> {
+ let d16 = !if(BaseOpcode.HasD16, ?, 0);
+
+ let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
+ DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
+ R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da"
+ #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
+class MIMGAddrSize<int dw, bit enable_disasm> {
+ int NumWords = dw;
+
+ RegisterClass RegClass = !if(!le(NumWords, 0), ?,
+ !if(!eq(NumWords, 1), VGPR_32,
+ !if(!eq(NumWords, 2), VReg_64,
+ !if(!eq(NumWords, 3), VReg_96,
+ !if(!eq(NumWords, 4), VReg_128,
+ !if(!le(NumWords, 8), VReg_256,
+ !if(!le(NumWords, 16), VReg_512, ?)))))));
+
+ // Whether the instruction variant with this vaddr size should be enabled for
+ // the auto-generated disassembler.
+ bit Disassemble = enable_disasm;
+}
+
+// Return whether a value inside the range [min, max] (endpoints inclusive)
+// is in the given list.
+class isRangeInList<int min, int max, list<int> lst> {
+ bit ret = !foldl(0, lst, lhs, y, !or(lhs, !and(!le(min, y), !le(y, max))));
+}
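
A minimal C++ rendering of the same predicate (illustrative only, not part of the
backend) may make the !foldl reduction easier to read: it ORs one bounds check per
list element, exactly as the TableGen fold does.

#include <algorithm>
#include <vector>

// True if any value v in `lst` satisfies min <= v <= max, mirroring the
// !foldl/!or/!and reduction in isRangeInList above.
static bool isRangeInList(int min, int max, const std::vector<int> &lst) {
  return std::any_of(lst.begin(), lst.end(),
                     [&](int v) { return min <= v && v <= max; });
}
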
+
+class MIMGAddrSizes_tmp<list<MIMGAddrSize> lst, int min> {
+ list<MIMGAddrSize> List = lst;
+ int Min = min;
+}
+
+class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample> {
+ // List of all possible numbers of address words, taking all combinations of
+ // A16 and image dimension into account (note: no MSAA, since this is for
+ // sample/gather ops).
+ list<int> AllNumAddrWords =
+ !foreach(dw, !if(sample.Gradients,
+ !if(!eq(sample.LodOrClamp, ""),
+ [2, 3, 4, 5, 6, 7, 9],
+ [2, 3, 4, 5, 7, 8, 10]),
+ !if(!eq(sample.LodOrClamp, ""),
+ [1, 2, 3],
+ [1, 2, 3, 4])),
+ !add(dw, !size(sample.ExtraAddrArgs)));
+
+ // Generate machine instructions based on possible register classes for the
+ // required numbers of address words. The disassembler defaults to the
+ // smallest register class.
+ list<MIMGAddrSize> MachineInstrs =
+ !foldl(MIMGAddrSizes_tmp<[], 0>, [1, 2, 3, 4, 8, 16], lhs, dw,
+ !if(isRangeInList<lhs.Min, dw, AllNumAddrWords>.ret,
+ MIMGAddrSizes_tmp<
+ !listconcat(lhs.List, [MIMGAddrSize<dw, !empty(lhs.List)>]),
+ !if(!eq(dw, 3), 3, !add(dw, 1))>, // we still need _V4 for codegen w/ 3 dwords
+ lhs)).List;
}
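
The !foldl above walks the candidate sizes 1, 2, 3, 4, 8 and 16 dwords and keeps a
size whenever some required address-word count falls between the running cut-off and
that size; only the first kept size is flagged for the disassembler, and a selected
3-dword size deliberately leaves the cut-off at 3 so a 4-dword variant is still
generated for codegen. A hedged C++ sketch of that selection (hypothetical helper,
shown only to make the fold easier to follow):

#include <vector>

struct AddrSize {
  int NumWords;
  bool Disassemble;
};

// Mirror of the MIMG_Sampler_AddrSizes fold: pick the register-class sizes
// (in dwords) that cover every required address-word count.
static std::vector<AddrSize>
selectAddrSizes(const std::vector<int> &AllNumAddrWords) {
  static const int Buckets[] = {1, 2, 3, 4, 8, 16};
  std::vector<AddrSize> Out;
  int Min = 0;
  for (int DW : Buckets) {
    bool Needed = false;
    for (int N : AllNumAddrWords)
      Needed |= (Min <= N && N <= DW);
    if (Needed) {
      Out.push_back({DW, Out.empty()}); // first entry gets Disassemble = true
      Min = (DW == 3) ? 3 : DW + 1;     // keep _V4 reachable after a 3-dword hit
    }
  }
  return Out;
}
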
-class MIMG_Sampler_Helper <bits<7> op, string asm,
- RegisterClass dst_rc,
- RegisterClass src_rc,
- bit wqm,
- string dns=""> : MIMG_Helper <
- (outs dst_rc:$vdata),
- (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
- dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc,
- r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
- asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da",
- dns>, MIMGe<op> {
- let WQM = wqm;
+multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm,
+ AMDGPUSampleVariant sample, RegisterClass dst_rc,
+ bit enableDisasm = 0> {
+ foreach addr = MIMG_Sampler_AddrSizes<sample>.MachineInstrs in {
+ let VAddrDwords = addr.NumWords in
+ def _V # addr.NumWords
+ : MIMG_Sampler_Helper <op, asm, dst_rc, addr.RegClass,
+ !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
+ }
}
-multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm,
- RegisterClass dst_rc,
- int channels, bit wqm> {
- def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VGPR_32, wqm,
- !if(!eq(channels, 1), "AMDGPU", "")>,
- MIMG_Mask<asm#"_V1", channels>;
- def _V2 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_64, wqm>,
- MIMG_Mask<asm#"_V2", channels>;
- def _V4 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_128, wqm>,
- MIMG_Mask<asm#"_V4", channels>;
- def _V8 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_256, wqm>,
- MIMG_Mask<asm#"_V8", channels>;
- def _V16 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_512, wqm>,
- MIMG_Mask<asm#"_V16", channels>;
-}
-
-multiclass MIMG_Sampler <bits<7> op, string asm, bit wqm=0> {
- defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, wqm>;
- defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, wqm>;
- defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, wqm>;
- defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, wqm>;
-}
-
-multiclass MIMG_Sampler_WQM <bits<7> op, string asm> : MIMG_Sampler<op, asm, 1>;
-
-class MIMG_Gather_Helper <bits<7> op, string asm,
- RegisterClass dst_rc,
- RegisterClass src_rc, bit wqm> : MIMG <
- (outs dst_rc:$vdata),
- (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
- dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc,
- r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
- asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da",
- []>, MIMGe<op> {
- let mayLoad = 1;
- let mayStore = 0;
+class MIMG_Sampler_BaseOpcode<AMDGPUSampleVariant sample>
+ : MIMGBaseOpcode {
+ let Sampler = 1;
+ let NumExtraArgs = !size(sample.ExtraAddrArgs);
+ let Gradients = sample.Gradients;
+ let LodOrClampOrMip = !ne(sample.LodOrClamp, "");
+}
- // DMASK was repurposed for GATHER4. 4 components are always
- // returned and DMASK works like a swizzle - it selects
- // the component to fetch. The only useful DMASK values are
- // 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
- // (red,red,red,red) etc.) The ISA document doesn't mention
- // this.
- // Therefore, disable all code which updates DMASK by setting this:
- let Gather4 = 1;
- let hasPostISelHook = 0;
- let WQM = wqm;
+multiclass MIMG_Sampler <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
+ bit isGetLod = 0,
+ string asm = "image_sample"#sample.LowerCaseMod> {
+ def "" : MIMG_Sampler_BaseOpcode<sample> {
+ let HasD16 = !if(isGetLod, 0, 1);
+ }
- let isAsmParserOnly = 1; // TBD: fix it later
+ let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
+ mayLoad = !if(isGetLod, 0, 1) in {
+ let VDataDwords = 1 in
+ defm _V1 : MIMG_Sampler_Src_Helper<op, asm, sample, VGPR_32, 1>;
+ let VDataDwords = 2 in
+ defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>;
+ let VDataDwords = 3 in
+ defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96>;
+ let VDataDwords = 4 in
+ defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>;
+ }
}
-multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm,
- RegisterClass dst_rc,
- int channels, bit wqm> {
- def _V1 : MIMG_Gather_Helper <op, asm, dst_rc, VGPR_32, wqm>,
- MIMG_Mask<asm#"_V1", channels>;
- def _V2 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_64, wqm>,
- MIMG_Mask<asm#"_V2", channels>;
- def _V4 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_128, wqm>,
- MIMG_Mask<asm#"_V4", channels>;
- def _V8 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_256, wqm>,
- MIMG_Mask<asm#"_V8", channels>;
- def _V16 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_512, wqm>,
- MIMG_Mask<asm#"_V16", channels>;
-}
+multiclass MIMG_Sampler_WQM <bits<7> op, AMDGPUSampleVariant sample>
+ : MIMG_Sampler<op, sample, 1>;
-multiclass MIMG_Gather <bits<7> op, string asm, bit wqm=0> {
- defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, wqm>;
- defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, wqm>;
- defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, wqm>;
- defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, wqm>;
+multiclass MIMG_Gather <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
+ string asm = "image_gather4"#sample.LowerCaseMod> {
+ def "" : MIMG_Sampler_BaseOpcode<sample> {
+ let HasD16 = 1;
+ }
+
+ let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
+ Gather4 = 1, hasPostISelHook = 0 in {
+ let VDataDwords = 2 in
+ defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; /* for packed D16 only */
+ let VDataDwords = 4 in
+ defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 1>;
+ }
}
-multiclass MIMG_Gather_WQM <bits<7> op, string asm> : MIMG_Gather<op, asm, 1>;
+multiclass MIMG_Gather_WQM <bits<7> op, AMDGPUSampleVariant sample>
+ : MIMG_Gather<op, sample, 1>;
//===----------------------------------------------------------------------===//
// MIMG Instructions
//===----------------------------------------------------------------------===//
-let SubtargetPredicate = isGCN in {
-defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "image_load">;
-defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip">;
-//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"image_load_pck", 0x00000002>;
-//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"image_load_pck_sgn", 0x00000003>;
-//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"image_load_mip_pck", 0x00000004>;
-//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"image_load_mip_pck_sgn", 0x00000005>;
-defm IMAGE_STORE : MIMG_Store <0x00000008, "image_store">;
-defm IMAGE_STORE_MIP : MIMG_Store <0x00000009, "image_store_mip">;
-//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 0x0000000a>;
-//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>;
-
-let mayLoad = 0, mayStore = 0 in {
-defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">;
-}
+defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "image_load", 1>;
+defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip", 1, 1>;
+defm IMAGE_LOAD_PCK : MIMG_NoSampler <0x00000002, "image_load_pck", 0>;
+defm IMAGE_LOAD_PCK_SGN : MIMG_NoSampler <0x00000003, "image_load_pck_sgn", 0>;
+defm IMAGE_LOAD_MIP_PCK : MIMG_NoSampler <0x00000004, "image_load_mip_pck", 0, 1>;
+defm IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoSampler <0x00000005, "image_load_mip_pck_sgn", 0, 1>;
+defm IMAGE_STORE : MIMG_Store <0x00000008, "image_store", 1>;
+defm IMAGE_STORE_MIP : MIMG_Store <0x00000009, "image_store_mip", 1, 1>;
+defm IMAGE_STORE_PCK : MIMG_Store <0x0000000a, "image_store_pck", 0>;
+defm IMAGE_STORE_MIP_PCK : MIMG_Store <0x0000000b, "image_store_mip_pck", 0, 1>;
+
+defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo", 0, 1, 1>;
defm IMAGE_ATOMIC_SWAP : MIMG_Atomic <mimg<0x0f, 0x10>, "image_atomic_swap">;
-defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimg<0x10, 0x11>, "image_atomic_cmpswap", VReg_64>;
+defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimg<0x10, 0x11>, "image_atomic_cmpswap", 1>;
defm IMAGE_ATOMIC_ADD : MIMG_Atomic <mimg<0x11, 0x12>, "image_atomic_add">;
defm IMAGE_ATOMIC_SUB : MIMG_Atomic <mimg<0x12, 0x13>, "image_atomic_sub">;
//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>; -- not on VI
@@ -277,397 +449,101 @@ defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimg<0x19>, "image_atomic_or">;
defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimg<0x1a>, "image_atomic_xor">;
defm IMAGE_ATOMIC_INC : MIMG_Atomic <mimg<0x1b>, "image_atomic_inc">;
defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimg<0x1c>, "image_atomic_dec">;
-//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>; -- not on VI
+//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d, 1>; -- not on VI
//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -- not on VI
//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -- not on VI
-defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, "image_sample">;
-defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, "image_sample_cl">;
-defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "image_sample_d">;
-defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, "image_sample_d_cl">;
-defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "image_sample_l">;
-defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, "image_sample_b">;
-defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, "image_sample_b_cl">;
-defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, "image_sample_lz">;
-defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, "image_sample_c">;
-defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, "image_sample_c_cl">;
-defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "image_sample_c_d">;
-defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, "image_sample_c_d_cl">;
-defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "image_sample_c_l">;
-defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, "image_sample_c_b">;
-defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, "image_sample_c_b_cl">;
-defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, "image_sample_c_lz">;
-defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, "image_sample_o">;
-defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, "image_sample_cl_o">;
-defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, "image_sample_d_o">;
-defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, "image_sample_d_cl_o">;
-defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, "image_sample_l_o">;
-defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, "image_sample_b_o">;
-defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, "image_sample_b_cl_o">;
-defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, "image_sample_lz_o">;
-defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, "image_sample_c_o">;
-defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, "image_sample_c_cl_o">;
-defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, "image_sample_c_d_o">;
-defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, "image_sample_c_d_cl_o">;
-defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, "image_sample_c_l_o">;
-defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, "image_sample_c_b_o">;
-defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, "image_sample_c_b_cl_o">;
-defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, "image_sample_c_lz_o">;
-defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, "image_gather4">;
-defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, "image_gather4_cl">;
-defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "image_gather4_l">;
-defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, "image_gather4_b">;
-defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, "image_gather4_b_cl">;
-defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "image_gather4_lz">;
-defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, "image_gather4_c">;
-defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, "image_gather4_c_cl">;
-defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "image_gather4_c_l">;
-defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, "image_gather4_c_b">;
-defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, "image_gather4_c_b_cl">;
-defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "image_gather4_c_lz">;
-defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, "image_gather4_o">;
-defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, "image_gather4_cl_o">;
-defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "image_gather4_l_o">;
-defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, "image_gather4_b_o">;
-defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, "image_gather4_b_cl_o">;
-defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "image_gather4_lz_o">;
-defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, "image_gather4_c_o">;
-defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, "image_gather4_c_cl_o">;
-defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "image_gather4_c_l_o">;
-defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, "image_gather4_c_b_o">;
-defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, "image_gather4_c_b_cl_o">;
-defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">;
-
-let mayLoad = 0, mayStore = 0 in {
-defm IMAGE_GET_LOD : MIMG_Sampler_WQM <0x00000060, "image_get_lod">;
-}
-
-defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "image_sample_cd">;
-defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, "image_sample_cd_cl">;
-defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "image_sample_c_cd">;
-defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, "image_sample_c_cd_cl">;
-defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, "image_sample_cd_o">;
-defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, "image_sample_cd_cl_o">;
-defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, "image_sample_c_cd_o">;
-defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o">;
+defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, AMDGPUSample>;
+defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, AMDGPUSample_cl>;
+defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, AMDGPUSample_d>;
+defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, AMDGPUSample_d_cl>;
+defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, AMDGPUSample_l>;
+defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, AMDGPUSample_b>;
+defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, AMDGPUSample_b_cl>;
+defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, AMDGPUSample_lz>;
+defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, AMDGPUSample_c>;
+defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, AMDGPUSample_c_cl>;
+defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, AMDGPUSample_c_d>;
+defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, AMDGPUSample_c_d_cl>;
+defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, AMDGPUSample_c_l>;
+defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, AMDGPUSample_c_b>;
+defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, AMDGPUSample_c_b_cl>;
+defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, AMDGPUSample_c_lz>;
+defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, AMDGPUSample_o>;
+defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, AMDGPUSample_cl_o>;
+defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, AMDGPUSample_d_o>;
+defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, AMDGPUSample_d_cl_o>;
+defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, AMDGPUSample_l_o>;
+defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, AMDGPUSample_b_o>;
+defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, AMDGPUSample_b_cl_o>;
+defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, AMDGPUSample_lz_o>;
+defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, AMDGPUSample_c_o>;
+defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, AMDGPUSample_c_cl_o>;
+defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, AMDGPUSample_c_d_o>;
+defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, AMDGPUSample_c_d_cl_o>;
+defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, AMDGPUSample_c_l_o>;
+defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, AMDGPUSample_c_b_cl_o>;
+defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, AMDGPUSample_c_b_o>;
+defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, AMDGPUSample_c_lz_o>;
+defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, AMDGPUSample>;
+defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, AMDGPUSample_cl>;
+defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, AMDGPUSample_l>;
+defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, AMDGPUSample_b>;
+defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, AMDGPUSample_b_cl>;
+defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, AMDGPUSample_lz>;
+defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, AMDGPUSample_c>;
+defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, AMDGPUSample_c_cl>;
+defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, AMDGPUSample_c_l>;
+defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, AMDGPUSample_c_b>;
+defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, AMDGPUSample_c_b_cl>;
+defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, AMDGPUSample_c_lz>;
+defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, AMDGPUSample_o>;
+defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, AMDGPUSample_cl_o>;
+defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, AMDGPUSample_l_o>;
+defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, AMDGPUSample_b_o>;
+defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, AMDGPUSample_b_cl_o>;
+defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, AMDGPUSample_lz_o>;
+defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, AMDGPUSample_c_o>;
+defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, AMDGPUSample_c_cl_o>;
+defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, AMDGPUSample_c_l_o>;
+defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, AMDGPUSample_c_b_o>;
+defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, AMDGPUSample_c_b_cl_o>;
+defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, AMDGPUSample_c_lz_o>;
+
+defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, AMDGPUSample, 1, 1, "image_get_lod">;
+
+defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, AMDGPUSample_cd>;
+defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, AMDGPUSample_cd_cl>;
+defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, AMDGPUSample_c_cd>;
+defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, AMDGPUSample_c_cd_cl>;
+defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, AMDGPUSample_cd_o>;
+defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, AMDGPUSample_cd_cl_o>;
+defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, AMDGPUSample_c_cd_o>;
+defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, AMDGPUSample_c_cd_cl_o>;
//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>;
//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>;
+
+/********** ========================================= **********/
+/********** Table of dimension-aware image intrinsics **********/
+/********** ========================================= **********/
+
+class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {
+ Intrinsic Intr = I;
+ MIMGBaseOpcode BaseOpcode = !cast<MIMGBaseOpcode>(!strconcat("IMAGE_", I.P.OpMod));
+ AMDGPUDimProps Dim = I.P.Dim;
}
-/********** ======================= **********/
-/********** Image sampling patterns **********/
-/********** ======================= **********/
-
-// Image + sampler
-class SampleRawPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : GCNPat <
- (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm,
- i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe),
- (opcode $addr, $rsrc, $sampler,
- (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc),
- (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da))
->;
-
-multiclass SampleRawPatterns<SDPatternOperator name, string opcode> {
- def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V1), i32>;
- def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>;
- def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>;
- def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V8), v8i32>;
- def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V16), v16i32>;
-}
-
-// Image + sampler for amdgcn
-// TODO:
-// 1. Handle half data type like v4f16, and add D16 bit support;
-// 2. Handle v4i32 rsrc type (Register Class for the instruction to be SReg_128).
-// 3. Add A16 support when we pass address of half type.
-multiclass AMDGCNSamplePattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> {
- def : GCNPat<
- (dt (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i1:$unorm, i1:$glc,
- i1:$slc, i1:$lwe, i1:$da)),
- (opcode $addr, $rsrc, $sampler,
- (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc),
- 0, 0, (as_i1imm $lwe), (as_i1imm $da))
- >;
-}
-
-multiclass AMDGCNSampleDataPatterns<SDPatternOperator name, string opcode, ValueType dt> {
- defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V1), dt, f32>;
- defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V2), dt, v2f32>;
- defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V4), dt, v4f32>;
- defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V8), dt, v8f32>;
- defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V16), dt, v16f32>;
-}
-
-// TODO: support v3f32.
-multiclass AMDGCNSamplePatterns<SDPatternOperator name, string opcode> {
- defm : AMDGCNSampleDataPatterns<name, !cast<string>(opcode # _V1), f32>;
- defm : AMDGCNSampleDataPatterns<name, !cast<string>(opcode # _V2), v2f32>;
- defm : AMDGCNSampleDataPatterns<name, !cast<string>(opcode # _V4), v4f32>;
-}
-
-// Image only
-class ImagePattern<SDPatternOperator name, MIMG opcode, ValueType vt> : GCNPat <
- (name vt:$addr, v8i32:$rsrc, imm:$dmask, imm:$unorm,
- imm:$r128, imm:$da, imm:$glc, imm:$slc, imm:$tfe, imm:$lwe),
- (opcode $addr, $rsrc,
- (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc),
- (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da))
->;
-
-multiclass ImagePatterns<SDPatternOperator name, string opcode> {
- def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V1), i32>;
- def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>;
- def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>;
-}
-
-multiclass ImageLoadPattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> {
- def : GCNPat <
- (dt (name vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc, i1:$lwe,
- i1:$da)),
- (opcode $addr, $rsrc,
- (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc),
- 0, 0, (as_i1imm $lwe), (as_i1imm $da))
- >;
-}
-
-multiclass ImageLoadDataPatterns<SDPatternOperator name, string opcode, ValueType dt> {
- defm : ImageLoadPattern<name, !cast<MIMG>(opcode # _V1), dt, i32>;
- defm : ImageLoadPattern<name, !cast<MIMG>(opcode # _V2), dt, v2i32>;
- defm : ImageLoadPattern<name, !cast<MIMG>(opcode # _V4), dt, v4i32>;
-}
-
-// TODO: support v3f32.
-multiclass ImageLoadPatterns<SDPatternOperator name, string opcode> {
- defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V1), f32>;
- defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V2), v2f32>;
- defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V4), v4f32>;
-}
-
-multiclass ImageStorePattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> {
- def : GCNPat <
- (name dt:$data, vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc,
- i1:$lwe, i1:$da),
- (opcode $data, $addr, $rsrc,
- (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc),
- 0, 0, (as_i1imm $lwe), (as_i1imm $da))
- >;
-}
-
-multiclass ImageStoreDataPatterns<SDPatternOperator name, string opcode, ValueType dt> {
- defm : ImageStorePattern<name, !cast<MIMG>(opcode # _V1), dt, i32>;
- defm : ImageStorePattern<name, !cast<MIMG>(opcode # _V2), dt, v2i32>;
- defm : ImageStorePattern<name, !cast<MIMG>(opcode # _V4), dt, v4i32>;
-}
-
-// TODO: support v3f32.
-multiclass ImageStorePatterns<SDPatternOperator name, string opcode> {
- defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V1), f32>;
- defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V2), v2f32>;
- defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V4), v4f32>;
-}
-
-class ImageAtomicPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : GCNPat <
- (name i32:$vdata, vt:$addr, v8i32:$rsrc, imm:$r128, imm:$da, imm:$slc),
- (opcode $vdata, $addr, $rsrc, 1, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da))
->;
-
-multiclass ImageAtomicPatterns<SDPatternOperator name, string opcode> {
- def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V1), i32>;
- def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V2), v2i32>;
- def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V4), v4i32>;
-}
-
-class ImageAtomicCmpSwapPattern<MIMG opcode, ValueType vt> : GCNPat <
- (int_amdgcn_image_atomic_cmpswap i32:$vsrc, i32:$vcmp, vt:$addr, v8i32:$rsrc,
- imm:$r128, imm:$da, imm:$slc),
- (EXTRACT_SUBREG
- (opcode (REG_SEQUENCE VReg_64, $vsrc, sub0, $vcmp, sub1),
- $addr, $rsrc, 3, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da)),
- sub0)
->;
-
-// ======= amdgcn Image Intrinsics ==============
-
-// Image load
-defm : ImageLoadPatterns<int_amdgcn_image_load, "IMAGE_LOAD">;
-defm : ImageLoadPatterns<int_amdgcn_image_load_mip, "IMAGE_LOAD_MIP">;
-defm : ImageLoadPatterns<int_amdgcn_image_getresinfo, "IMAGE_GET_RESINFO">;
-
-// Image store
-defm : ImageStorePatterns<int_amdgcn_image_store, "IMAGE_STORE">;
-defm : ImageStorePatterns<int_amdgcn_image_store_mip, "IMAGE_STORE_MIP">;
-
-// Basic sample
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample, "IMAGE_SAMPLE">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cl, "IMAGE_SAMPLE_CL">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d, "IMAGE_SAMPLE_D">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d_cl, "IMAGE_SAMPLE_D_CL">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_l, "IMAGE_SAMPLE_L">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b, "IMAGE_SAMPLE_B">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b_cl, "IMAGE_SAMPLE_B_CL">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_lz, "IMAGE_SAMPLE_LZ">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd, "IMAGE_SAMPLE_CD">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd_cl, "IMAGE_SAMPLE_CD_CL">;
-
-// Sample with comparison
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c, "IMAGE_SAMPLE_C">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cl, "IMAGE_SAMPLE_C_CL">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d, "IMAGE_SAMPLE_C_D">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d_cl, "IMAGE_SAMPLE_C_D_CL">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_l, "IMAGE_SAMPLE_C_L">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b, "IMAGE_SAMPLE_C_B">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b_cl, "IMAGE_SAMPLE_C_B_CL">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_lz, "IMAGE_SAMPLE_C_LZ">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd, "IMAGE_SAMPLE_C_CD">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd_cl, "IMAGE_SAMPLE_C_CD_CL">;
-
-// Sample with offsets
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_o, "IMAGE_SAMPLE_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cl_o, "IMAGE_SAMPLE_CL_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d_o, "IMAGE_SAMPLE_D_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d_cl_o, "IMAGE_SAMPLE_D_CL_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_l_o, "IMAGE_SAMPLE_L_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b_o, "IMAGE_SAMPLE_B_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b_cl_o, "IMAGE_SAMPLE_B_CL_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_lz_o, "IMAGE_SAMPLE_LZ_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd_o, "IMAGE_SAMPLE_CD_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd_cl_o, "IMAGE_SAMPLE_CD_CL_O">;
-
-// Sample with comparison and offsets
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_o, "IMAGE_SAMPLE_C_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cl_o, "IMAGE_SAMPLE_C_CL_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d_o, "IMAGE_SAMPLE_C_D_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d_cl_o, "IMAGE_SAMPLE_C_D_CL_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_l_o, "IMAGE_SAMPLE_C_L_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b_o, "IMAGE_SAMPLE_C_B_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b_cl_o, "IMAGE_SAMPLE_C_B_CL_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_lz_o, "IMAGE_SAMPLE_C_LZ_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd_o, "IMAGE_SAMPLE_C_CD_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd_cl_o, "IMAGE_SAMPLE_C_CD_CL_O">;
-
-// Gather opcodes
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4, "IMAGE_GATHER4">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_cl, "IMAGE_GATHER4_CL">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_l, "IMAGE_GATHER4_L">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b, "IMAGE_GATHER4_B">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b_cl, "IMAGE_GATHER4_B_CL">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_lz, "IMAGE_GATHER4_LZ">;
-
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c, "IMAGE_GATHER4_C">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_cl, "IMAGE_GATHER4_C_CL">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_l, "IMAGE_GATHER4_C_L">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b, "IMAGE_GATHER4_C_B">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b_cl, "IMAGE_GATHER4_C_B_CL">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_lz, "IMAGE_GATHER4_C_LZ">;
-
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_o, "IMAGE_GATHER4_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_cl_o, "IMAGE_GATHER4_CL_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_l_o, "IMAGE_GATHER4_L_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b_o, "IMAGE_GATHER4_B_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b_cl_o, "IMAGE_GATHER4_B_CL_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_lz_o, "IMAGE_GATHER4_LZ_O">;
-
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_o, "IMAGE_GATHER4_C_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_cl_o, "IMAGE_GATHER4_C_CL_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_l_o, "IMAGE_GATHER4_C_L_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b_o, "IMAGE_GATHER4_C_B_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b_cl_o, "IMAGE_GATHER4_C_B_CL_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_lz_o, "IMAGE_GATHER4_C_LZ_O">;
-
-defm : AMDGCNSamplePatterns<int_amdgcn_image_getlod, "IMAGE_GET_LOD">;
-
-// Image atomics
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_swap, "IMAGE_ATOMIC_SWAP">;
-def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V1, i32>;
-def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V2, v2i32>;
-def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V4, v4i32>;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_add, "IMAGE_ATOMIC_ADD">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_sub, "IMAGE_ATOMIC_SUB">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_smin, "IMAGE_ATOMIC_SMIN">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_umin, "IMAGE_ATOMIC_UMIN">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_smax, "IMAGE_ATOMIC_SMAX">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_umax, "IMAGE_ATOMIC_UMAX">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_and, "IMAGE_ATOMIC_AND">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_or, "IMAGE_ATOMIC_OR">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_xor, "IMAGE_ATOMIC_XOR">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_inc, "IMAGE_ATOMIC_INC">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_dec, "IMAGE_ATOMIC_DEC">;
-
-/* SIsample for simple 1D texture lookup */
-def : GCNPat <
- (SIsample i32:$addr, v8i32:$rsrc, v4i32:$sampler, imm),
- (IMAGE_SAMPLE_V4_V1 $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0)
->;
-
-class SamplePattern<SDNode name, MIMG opcode, ValueType vt> : GCNPat <
- (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, imm),
- (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0)
->;
-
-class SampleRectPattern<SDNode name, MIMG opcode, ValueType vt> : GCNPat <
- (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_RECT),
- (opcode $addr, $rsrc, $sampler, 0xf, 1, 0, 0, 0, 0, 0, 0)
->;
-
-class SampleArrayPattern<SDNode name, MIMG opcode, ValueType vt> : GCNPat <
- (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_ARRAY),
- (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1)
->;
-
-class SampleShadowPattern<SDNode name, MIMG opcode,
- ValueType vt> : GCNPat <
- (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW),
- (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0)
->;
-
-class SampleShadowArrayPattern<SDNode name, MIMG opcode,
- ValueType vt> : GCNPat <
- (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY),
- (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1)
->;
-
-/* SIsample* for texture lookups consuming more address parameters */
-multiclass SamplePatterns<MIMG sample, MIMG sample_c, MIMG sample_l,
- MIMG sample_c_l, MIMG sample_b, MIMG sample_c_b,
-MIMG sample_d, MIMG sample_c_d, ValueType addr_type> {
- def : SamplePattern <SIsample, sample, addr_type>;
- def : SampleRectPattern <SIsample, sample, addr_type>;
- def : SampleArrayPattern <SIsample, sample, addr_type>;
- def : SampleShadowPattern <SIsample, sample_c, addr_type>;
- def : SampleShadowArrayPattern <SIsample, sample_c, addr_type>;
-
- def : SamplePattern <SIsamplel, sample_l, addr_type>;
- def : SampleArrayPattern <SIsamplel, sample_l, addr_type>;
- def : SampleShadowPattern <SIsamplel, sample_c_l, addr_type>;
- def : SampleShadowArrayPattern <SIsamplel, sample_c_l, addr_type>;
-
- def : SamplePattern <SIsampleb, sample_b, addr_type>;
- def : SampleArrayPattern <SIsampleb, sample_b, addr_type>;
- def : SampleShadowPattern <SIsampleb, sample_c_b, addr_type>;
- def : SampleShadowArrayPattern <SIsampleb, sample_c_b, addr_type>;
-
- def : SamplePattern <SIsampled, sample_d, addr_type>;
- def : SampleArrayPattern <SIsampled, sample_d, addr_type>;
- def : SampleShadowPattern <SIsampled, sample_c_d, addr_type>;
- def : SampleShadowArrayPattern <SIsampled, sample_c_d, addr_type>;
-}
-
-defm : SamplePatterns<IMAGE_SAMPLE_V4_V2, IMAGE_SAMPLE_C_V4_V2,
- IMAGE_SAMPLE_L_V4_V2, IMAGE_SAMPLE_C_L_V4_V2,
- IMAGE_SAMPLE_B_V4_V2, IMAGE_SAMPLE_C_B_V4_V2,
- IMAGE_SAMPLE_D_V4_V2, IMAGE_SAMPLE_C_D_V4_V2,
- v2i32>;
-defm : SamplePatterns<IMAGE_SAMPLE_V4_V4, IMAGE_SAMPLE_C_V4_V4,
- IMAGE_SAMPLE_L_V4_V4, IMAGE_SAMPLE_C_L_V4_V4,
- IMAGE_SAMPLE_B_V4_V4, IMAGE_SAMPLE_C_B_V4_V4,
- IMAGE_SAMPLE_D_V4_V4, IMAGE_SAMPLE_C_D_V4_V4,
- v4i32>;
-defm : SamplePatterns<IMAGE_SAMPLE_V4_V8, IMAGE_SAMPLE_C_V4_V8,
- IMAGE_SAMPLE_L_V4_V8, IMAGE_SAMPLE_C_L_V4_V8,
- IMAGE_SAMPLE_B_V4_V8, IMAGE_SAMPLE_C_B_V4_V8,
- IMAGE_SAMPLE_D_V4_V8, IMAGE_SAMPLE_C_D_V4_V8,
- v8i32>;
-defm : SamplePatterns<IMAGE_SAMPLE_V4_V16, IMAGE_SAMPLE_C_V4_V16,
- IMAGE_SAMPLE_L_V4_V16, IMAGE_SAMPLE_C_L_V4_V16,
- IMAGE_SAMPLE_B_V4_V16, IMAGE_SAMPLE_C_B_V4_V16,
- IMAGE_SAMPLE_D_V4_V16, IMAGE_SAMPLE_C_D_V4_V16,
- v16i32>;
+def ImageDimIntrinsicTable : GenericTable {
+ let FilterClass = "ImageDimIntrinsicInfo";
+ let Fields = ["Intr", "BaseOpcode", "Dim"];
+ GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode;
+ GenericEnum TypeOf_Dim = MIMGDim;
+
+ let PrimaryKey = ["Intr"];
+ let PrimaryKeyName = "getImageDimIntrinsicInfo";
+ let PrimaryKeyEarlyOut = 1;
+}
+
+foreach intr = !listconcat(AMDGPUImageDimIntrinsics,
+ AMDGPUImageDimAtomicIntrinsics) in {
+ def : ImageDimIntrinsicInfo<intr>;
+}
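
ImageDimIntrinsicTable is a GenericTable keyed on Intr, so TableGen emits a sorted
row array plus a lookup function named by PrimaryKeyName. The shape of that lookup
is roughly the following self-contained sketch; the real rows, field types and
function live in the generated AMDGPUGenSearchableTables.inc, so everything below
is illustrative rather than the emitted code.

#include <algorithm>
#include <iterator>

struct ImageDimIntrinsicInfo {
  unsigned Intr;       // intrinsic ID, the primary key
  unsigned BaseOpcode; // MIMGBaseOpcode enum value
  unsigned Dim;        // MIMGDim enum value
};

// Rows are kept sorted by Intr; TableGen fills them in from the
// ImageDimIntrinsicInfo records defined by the foreach above.
static const ImageDimIntrinsicInfo ImageDimIntrinsicTable[] = {
  {0, 0, 0}, // placeholder row
};

static const ImageDimIntrinsicInfo *getImageDimIntrinsicInfo(unsigned Intr) {
  const auto *I = std::lower_bound(
      std::begin(ImageDimIntrinsicTable), std::end(ImageDimIntrinsicTable), Intr,
      [](const ImageDimIntrinsicInfo &Row, unsigned Key) { return Row.Intr < Key; });
  if (I != std::end(ImageDimIntrinsicTable) && I->Intr == Intr)
    return I;
  return nullptr;
}
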
diff --git a/lib/Target/AMDGPU/Processors.td b/lib/Target/AMDGPU/Processors.td
deleted file mode 100644
index d50dae78e247..000000000000
--- a/lib/Target/AMDGPU/Processors.td
+++ /dev/null
@@ -1,12 +0,0 @@
-//===-- Processors.td - AMDGPU Processor definitions ----------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-FIXME: Deleting this file broke buildbots that don't do full rebuilds. This
-file is no longer used by the backend, so it can be deleted once all
-the buildbots update their dependencies.
diff --git a/lib/Target/AMDGPU/R600.td b/lib/Target/AMDGPU/R600.td
new file mode 100644
index 000000000000..5c9c1c1ed504
--- /dev/null
+++ b/lib/Target/AMDGPU/R600.td
@@ -0,0 +1,54 @@
+//===-- R600.td - R600 Tablegen files ----------------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+def R600InstrInfo : InstrInfo {
+ let guessInstructionProperties = 1;
+ let noNamedPositionallyEncodedOperands = 1;
+}
+
+def R600 : Target {
+ let InstructionSet = R600InstrInfo;
+ let AllowRegisterRenaming = 1;
+}
+
+let Namespace = "R600" in {
+
+foreach Index = 0-15 in {
+ def sub#Index : SubRegIndex<32, !shl(Index, 5)>;
+}
+
+include "R600RegisterInfo.td"
+
+}
+
+def NullALU : InstrItinClass;
+def ALU_NULL : FuncUnit;
+
+include "AMDGPUFeatures.td"
+include "R600Schedule.td"
+include "R600Processors.td"
+include "AMDGPUInstrInfo.td"
+include "AMDGPUInstructions.td"
+include "R600Instructions.td"
+include "R700Instructions.td"
+include "EvergreenInstructions.td"
+include "CaymanInstructions.td"
+
+// Calling convention for R600
+def CC_R600 : CallingConv<[
+ CCIfInReg<CCIfType<[v4f32, v4i32] , CCAssignToReg<[
+ T0_XYZW, T1_XYZW, T2_XYZW, T3_XYZW, T4_XYZW, T5_XYZW, T6_XYZW, T7_XYZW,
+ T8_XYZW, T9_XYZW, T10_XYZW, T11_XYZW, T12_XYZW, T13_XYZW, T14_XYZW, T15_XYZW,
+ T16_XYZW, T17_XYZW, T18_XYZW, T19_XYZW, T20_XYZW, T21_XYZW, T22_XYZW,
+ T23_XYZW, T24_XYZW, T25_XYZW, T26_XYZW, T27_XYZW, T28_XYZW, T29_XYZW,
+ T30_XYZW, T31_XYZW, T32_XYZW
+ ]>>>
+]>;
diff --git a/lib/Target/AMDGPU/R600AsmPrinter.cpp b/lib/Target/AMDGPU/R600AsmPrinter.cpp
new file mode 100644
index 000000000000..68f8c30775b8
--- /dev/null
+++ b/lib/Target/AMDGPU/R600AsmPrinter.cpp
@@ -0,0 +1,133 @@
+//===-- R600AsmPrinter.cpp - R600 Assembly printer -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// The R600AsmPrinter is used to print both the assembly string and the
+/// binary code. When passed an MCAsmStreamer it prints assembly, and when
+/// passed an MCObjectStreamer it outputs binary code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600AsmPrinter.h"
+#include "AMDGPUSubtarget.h"
+#include "R600Defines.h"
+#include "R600MachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+using namespace llvm;
+
+AsmPrinter *
+llvm::createR600AsmPrinterPass(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> &&Streamer) {
+ return new R600AsmPrinter(TM, std::move(Streamer));
+}
+
+R600AsmPrinter::R600AsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)) { }
+
+StringRef R600AsmPrinter::getPassName() const {
+ return "R600 Assembly Printer";
+}
+
+void R600AsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
+ unsigned MaxGPR = 0;
+ bool killPixel = false;
+ const R600Subtarget &STM = MF.getSubtarget<R600Subtarget>();
+ const R600RegisterInfo *RI = STM.getRegisterInfo();
+ const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+
+ for (const MachineBasicBlock &MBB : MF) {
+ for (const MachineInstr &MI : MBB) {
+ if (MI.getOpcode() == R600::KILLGT)
+ killPixel = true;
+ unsigned numOperands = MI.getNumOperands();
+ for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
+ const MachineOperand &MO = MI.getOperand(op_idx);
+ if (!MO.isReg())
+ continue;
+ unsigned HWReg = RI->getHWRegIndex(MO.getReg());
+
+      // Registers with a value > 127 aren't GPRs.
+ if (HWReg > 127)
+ continue;
+ MaxGPR = std::max(MaxGPR, HWReg);
+ }
+ }
+ }
+
+ unsigned RsrcReg;
+ if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) {
+ // Evergreen / Northern Islands
+ switch (MF.getFunction().getCallingConv()) {
+ default: LLVM_FALLTHROUGH;
+ case CallingConv::AMDGPU_CS: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
+ case CallingConv::AMDGPU_GS: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
+ case CallingConv::AMDGPU_PS: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break;
+ case CallingConv::AMDGPU_VS: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break;
+ }
+ } else {
+ // R600 / R700
+ switch (MF.getFunction().getCallingConv()) {
+ default: LLVM_FALLTHROUGH;
+ case CallingConv::AMDGPU_GS: LLVM_FALLTHROUGH;
+ case CallingConv::AMDGPU_CS: LLVM_FALLTHROUGH;
+ case CallingConv::AMDGPU_VS: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break;
+ case CallingConv::AMDGPU_PS: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break;
+ }
+ }
+
+ OutStreamer->EmitIntValue(RsrcReg, 4);
+ OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) |
+ S_STACK_SIZE(MFI->CFStackSize), 4);
+ OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
+ OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
+
+ if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
+ OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
+ OutStreamer->EmitIntValue(alignTo(MFI->getLDSSize(), 4) >> 2, 4);
+ }
+}
+
+bool R600AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+
+
+  // Functions need to be cacheline (256B) aligned.
+ MF.ensureAlignment(8);
+
+ SetupMachineFunction(MF);
+
+ MCContext &Context = getObjFileLowering().getContext();
+ MCSectionELF *ConfigSection =
+ Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
+ OutStreamer->SwitchSection(ConfigSection);
+
+ EmitProgramInfoR600(MF);
+
+ EmitFunctionBody();
+
+ if (isVerbose()) {
+ MCSectionELF *CommentSection =
+ Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
+ OutStreamer->SwitchSection(CommentSection);
+
+ R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+ OutStreamer->emitRawComment(
+ Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->CFStackSize)));
+ }
+
+ return false;
+}
+
diff --git a/lib/Target/AMDGPU/R600AsmPrinter.h b/lib/Target/AMDGPU/R600AsmPrinter.h
new file mode 100644
index 000000000000..079fc707b03c
--- /dev/null
+++ b/lib/Target/AMDGPU/R600AsmPrinter.h
@@ -0,0 +1,46 @@
+//===-- R600AsmPrinter.h - Print R600 assembly code -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// R600 Assembly printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600ASMPRINTER_H
+#define LLVM_LIB_TARGET_AMDGPU_R600ASMPRINTER_H
+
+#include "llvm/CodeGen/AsmPrinter.h"
+
+namespace llvm {
+
+class R600AsmPrinter final : public AsmPrinter {
+
+public:
+ explicit R600AsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer);
+ StringRef getPassName() const override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ /// Implemented in AMDGPUMCInstLower.cpp
+ void EmitInstruction(const MachineInstr *MI) override;
+ /// Lower the specified LLVM Constant to an MCExpr.
+  /// AsmPrinter::lowerConstant does not know how to lower an addrspacecast,
+  /// so it is lowered by this function instead.
+ const MCExpr *lowerConstant(const Constant *CV) override;
+
+private:
+ void EmitProgramInfoR600(const MachineFunction &MF);
+};
+
+AsmPrinter *
+createR600AsmPrinterPass(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> &&Streamer);
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_R600ASMPRINTER_H
diff --git a/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/lib/Target/AMDGPU/R600ClauseMergePass.cpp
index 5e1ba6b506da..0c62d6a4b3d9 100644
--- a/lib/Target/AMDGPU/R600ClauseMergePass.cpp
+++ b/lib/Target/AMDGPU/R600ClauseMergePass.cpp
@@ -19,6 +19,7 @@
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -33,8 +34,8 @@ namespace {
static bool isCFAlu(const MachineInstr &MI) {
switch (MI.getOpcode()) {
- case AMDGPU::CF_ALU:
- case AMDGPU::CF_ALU_PUSH_BEFORE:
+ case R600::CF_ALU:
+ case R600::CF_ALU_PUSH_BEFORE:
return true;
default:
return false;
@@ -84,20 +85,20 @@ char &llvm::R600ClauseMergePassID = R600ClauseMergePass::ID;
unsigned R600ClauseMergePass::getCFAluSize(const MachineInstr &MI) const {
assert(isCFAlu(MI));
return MI
- .getOperand(TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::COUNT))
+ .getOperand(TII->getOperandIdx(MI.getOpcode(), R600::OpName::COUNT))
.getImm();
}
bool R600ClauseMergePass::isCFAluEnabled(const MachineInstr &MI) const {
assert(isCFAlu(MI));
return MI
- .getOperand(TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::Enabled))
+ .getOperand(TII->getOperandIdx(MI.getOpcode(), R600::OpName::Enabled))
.getImm();
}
void R600ClauseMergePass::cleanPotentialDisabledCFAlu(
MachineInstr &CFAlu) const {
- int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT);
+ int CntIdx = TII->getOperandIdx(R600::CF_ALU, R600::OpName::COUNT);
MachineBasicBlock::iterator I = CFAlu, E = CFAlu.getParent()->end();
I++;
do {
@@ -116,46 +117,46 @@ void R600ClauseMergePass::cleanPotentialDisabledCFAlu(
bool R600ClauseMergePass::mergeIfPossible(MachineInstr &RootCFAlu,
const MachineInstr &LatrCFAlu) const {
assert(isCFAlu(RootCFAlu) && isCFAlu(LatrCFAlu));
- int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT);
+ int CntIdx = TII->getOperandIdx(R600::CF_ALU, R600::OpName::COUNT);
unsigned RootInstCount = getCFAluSize(RootCFAlu),
LaterInstCount = getCFAluSize(LatrCFAlu);
unsigned CumuledInsts = RootInstCount + LaterInstCount;
if (CumuledInsts >= TII->getMaxAlusPerClause()) {
- DEBUG(dbgs() << "Excess inst counts\n");
+ LLVM_DEBUG(dbgs() << "Excess inst counts\n");
return false;
}
- if (RootCFAlu.getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE)
+ if (RootCFAlu.getOpcode() == R600::CF_ALU_PUSH_BEFORE)
return false;
// Is KCache Bank 0 compatible ?
int Mode0Idx =
- TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE0);
+ TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_MODE0);
int KBank0Idx =
- TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK0);
+ TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_BANK0);
int KBank0LineIdx =
- TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR0);
+ TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_ADDR0);
if (LatrCFAlu.getOperand(Mode0Idx).getImm() &&
RootCFAlu.getOperand(Mode0Idx).getImm() &&
(LatrCFAlu.getOperand(KBank0Idx).getImm() !=
RootCFAlu.getOperand(KBank0Idx).getImm() ||
LatrCFAlu.getOperand(KBank0LineIdx).getImm() !=
RootCFAlu.getOperand(KBank0LineIdx).getImm())) {
- DEBUG(dbgs() << "Wrong KC0\n");
+ LLVM_DEBUG(dbgs() << "Wrong KC0\n");
return false;
}
// Is KCache Bank 1 compatible ?
int Mode1Idx =
- TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE1);
+ TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_MODE1);
int KBank1Idx =
- TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK1);
+ TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_BANK1);
int KBank1LineIdx =
- TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR1);
+ TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_ADDR1);
if (LatrCFAlu.getOperand(Mode1Idx).getImm() &&
RootCFAlu.getOperand(Mode1Idx).getImm() &&
(LatrCFAlu.getOperand(KBank1Idx).getImm() !=
RootCFAlu.getOperand(KBank1Idx).getImm() ||
LatrCFAlu.getOperand(KBank1LineIdx).getImm() !=
RootCFAlu.getOperand(KBank1LineIdx).getImm())) {
- DEBUG(dbgs() << "Wrong KC0\n");
+      LLVM_DEBUG(dbgs() << "Wrong KC1\n");
return false;
}
if (LatrCFAlu.getOperand(Mode0Idx).getImm()) {
diff --git a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
index 0e788df1c9c0..a19020276f35 100644
--- a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
@@ -19,6 +19,7 @@
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
@@ -93,7 +94,7 @@ bool CFStack::branchStackContains(CFStack::StackItem Item) {
}
bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
- if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() &&
+ if (Opcode == R600::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() &&
getLoopDepth() > 1)
return true;
@@ -102,10 +103,10 @@ bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
switch(Opcode) {
default: return false;
- case AMDGPU::CF_ALU_PUSH_BEFORE:
- case AMDGPU::CF_ALU_ELSE_AFTER:
- case AMDGPU::CF_ALU_BREAK:
- case AMDGPU::CF_ALU_CONTINUE:
+ case R600::CF_ALU_PUSH_BEFORE:
+ case R600::CF_ALU_ELSE_AFTER:
+ case R600::CF_ALU_BREAK:
+ case R600::CF_ALU_CONTINUE:
if (CurrentSubEntries == 0)
return false;
if (ST->getWavefrontSize() == 64) {
@@ -136,7 +137,7 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
return 0;
case CFStack::FIRST_NON_WQM_PUSH:
assert(!ST->hasCaymanISA());
- if (ST->getGeneration() <= R600Subtarget::R700) {
+ if (ST->getGeneration() <= AMDGPUSubtarget::R700) {
// +1 For the push operation.
// +2 Extra space required.
return 3;
@@ -149,7 +150,7 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
return 2;
}
case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
- assert(ST->getGeneration() >= R600Subtarget::EVERGREEN);
+ assert(ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN);
// +1 For the push operation.
// +1 Extra space required.
return 2;
@@ -167,8 +168,8 @@ void CFStack::updateMaxStackSize() {
void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
CFStack::StackItem Item = CFStack::ENTRY;
switch(Opcode) {
- case AMDGPU::CF_PUSH_EG:
- case AMDGPU::CF_ALU_PUSH_BEFORE:
+ case R600::CF_PUSH_EG:
+ case R600::CF_ALU_PUSH_BEFORE:
if (!isWQM) {
if (!ST->hasCaymanISA() &&
!branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
@@ -176,7 +177,7 @@ void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
// See comment in
// CFStack::getSubEntrySize()
else if (CurrentEntries > 0 &&
- ST->getGeneration() > R600Subtarget::EVERGREEN &&
+ ST->getGeneration() > AMDGPUSubtarget::EVERGREEN &&
!ST->hasCaymanISA() &&
!branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
@@ -239,8 +240,8 @@ private:
bool IsTrivialInst(MachineInstr &MI) const {
switch (MI.getOpcode()) {
- case AMDGPU::KILL:
- case AMDGPU::RETURN:
+ case R600::KILL:
+ case R600::RETURN:
return true;
default:
return false;
@@ -249,44 +250,44 @@ private:
const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
unsigned Opcode = 0;
- bool isEg = (ST->getGeneration() >= R600Subtarget::EVERGREEN);
+ bool isEg = (ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN);
switch (CFI) {
case CF_TC:
- Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
+ Opcode = isEg ? R600::CF_TC_EG : R600::CF_TC_R600;
break;
case CF_VC:
- Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600;
+ Opcode = isEg ? R600::CF_VC_EG : R600::CF_VC_R600;
break;
case CF_CALL_FS:
- Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600;
+ Opcode = isEg ? R600::CF_CALL_FS_EG : R600::CF_CALL_FS_R600;
break;
case CF_WHILE_LOOP:
- Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600;
+ Opcode = isEg ? R600::WHILE_LOOP_EG : R600::WHILE_LOOP_R600;
break;
case CF_END_LOOP:
- Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600;
+ Opcode = isEg ? R600::END_LOOP_EG : R600::END_LOOP_R600;
break;
case CF_LOOP_BREAK:
- Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600;
+ Opcode = isEg ? R600::LOOP_BREAK_EG : R600::LOOP_BREAK_R600;
break;
case CF_LOOP_CONTINUE:
- Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600;
+ Opcode = isEg ? R600::CF_CONTINUE_EG : R600::CF_CONTINUE_R600;
break;
case CF_JUMP:
- Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600;
+ Opcode = isEg ? R600::CF_JUMP_EG : R600::CF_JUMP_R600;
break;
case CF_ELSE:
- Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600;
+ Opcode = isEg ? R600::CF_ELSE_EG : R600::CF_ELSE_R600;
break;
case CF_POP:
- Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
+ Opcode = isEg ? R600::POP_EG : R600::POP_R600;
break;
case CF_END:
if (ST->hasCaymanISA()) {
- Opcode = AMDGPU::CF_END_CM;
+ Opcode = R600::CF_END_CM;
break;
}
- Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600;
+ Opcode = isEg ? R600::CF_END_EG : R600::CF_END_R600;
break;
}
assert (Opcode && "No opcode selected");
@@ -304,21 +305,21 @@ private:
continue;
if (MO.isDef()) {
unsigned Reg = MO.getReg();
- if (AMDGPU::R600_Reg128RegClass.contains(Reg))
+ if (R600::R600_Reg128RegClass.contains(Reg))
DstMI = Reg;
else
DstMI = TRI->getMatchingSuperReg(Reg,
- TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
- &AMDGPU::R600_Reg128RegClass);
+ AMDGPURegisterInfo::getSubRegFromChannel(TRI->getHWRegChan(Reg)),
+ &R600::R600_Reg128RegClass);
}
if (MO.isUse()) {
unsigned Reg = MO.getReg();
- if (AMDGPU::R600_Reg128RegClass.contains(Reg))
+ if (R600::R600_Reg128RegClass.contains(Reg))
SrcMI = Reg;
else
SrcMI = TRI->getMatchingSuperReg(Reg,
- TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
- &AMDGPU::R600_Reg128RegClass);
+ AMDGPURegisterInfo::getSubRegFromChannel(TRI->getHWRegChan(Reg)),
+ &R600::R600_Reg128RegClass);
}
}
if ((DstRegs.find(SrcMI) == DstRegs.end())) {
@@ -358,15 +359,15 @@ private:
void getLiteral(MachineInstr &MI, std::vector<MachineOperand *> &Lits) const {
static const unsigned LiteralRegs[] = {
- AMDGPU::ALU_LITERAL_X,
- AMDGPU::ALU_LITERAL_Y,
- AMDGPU::ALU_LITERAL_Z,
- AMDGPU::ALU_LITERAL_W
+ R600::ALU_LITERAL_X,
+ R600::ALU_LITERAL_Y,
+ R600::ALU_LITERAL_Z,
+ R600::ALU_LITERAL_W
};
const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs =
TII->getSrcs(MI);
for (const auto &Src:Srcs) {
- if (Src.first->getReg() != AMDGPU::ALU_LITERAL_X)
+ if (Src.first->getReg() != R600::ALU_LITERAL_X)
continue;
int64_t Imm = Src.second;
std::vector<MachineOperand *>::iterator It =
@@ -376,7 +377,7 @@ private:
// Get corresponding Operand
MachineOperand &Operand = MI.getOperand(
- TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::literal));
+ TII->getOperandIdx(MI.getOpcode(), R600::OpName::literal));
if (It != Lits.end()) {
// Reuse existing literal reg
@@ -399,7 +400,7 @@ private:
unsigned LiteralPair0 = Literals[i];
unsigned LiteralPair1 = (i + 1 < e)?Literals[i + 1]:0;
InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(),
- TII->get(AMDGPU::LITERALS))
+ TII->get(R600::LITERALS))
.addImm(LiteralPair0)
.addImm(LiteralPair1);
}
@@ -441,7 +442,7 @@ private:
}
for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
MachineInstrBuilder MILit = BuildMI(MBB, I, I->getDebugLoc(),
- TII->get(AMDGPU::LITERALS));
+ TII->get(R600::LITERALS));
if (Literals[i]->isImm()) {
MILit.addImm(Literals[i]->getImm());
} else {
@@ -470,7 +471,7 @@ private:
unsigned &CfCount) {
CounterPropagateAddr(*Clause.first, CfCount);
MachineBasicBlock *BB = Clause.first->getParent();
- BuildMI(BB, DL, TII->get(AMDGPU::FETCH_CLAUSE)).addImm(CfCount);
+ BuildMI(BB, DL, TII->get(R600::FETCH_CLAUSE)).addImm(CfCount);
for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
BB->splice(InsertPos, BB, Clause.second[i]);
}
@@ -482,7 +483,7 @@ private:
Clause.first->getOperand(0).setImm(0);
CounterPropagateAddr(*Clause.first, CfCount);
MachineBasicBlock *BB = Clause.first->getParent();
- BuildMI(BB, DL, TII->get(AMDGPU::ALU_CLAUSE)).addImm(CfCount);
+ BuildMI(BB, DL, TII->get(R600::ALU_CLAUSE)).addImm(CfCount);
for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
BB->splice(InsertPos, BB, Clause.second[i]);
}
@@ -531,7 +532,7 @@ public:
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
I != E;) {
if (TII->usesTextureCache(*I) || TII->usesVertexCache(*I)) {
- DEBUG(dbgs() << CfCount << ":"; I->dump(););
+ LLVM_DEBUG(dbgs() << CfCount << ":"; I->dump(););
FetchClauses.push_back(MakeFetchClause(MBB, I));
CfCount++;
LastAlu.back() = nullptr;
@@ -539,33 +540,34 @@ public:
}
MachineBasicBlock::iterator MI = I;
- if (MI->getOpcode() != AMDGPU::ENDIF)
+ if (MI->getOpcode() != R600::ENDIF)
LastAlu.back() = nullptr;
- if (MI->getOpcode() == AMDGPU::CF_ALU)
+ if (MI->getOpcode() == R600::CF_ALU)
LastAlu.back() = &*MI;
I++;
bool RequiresWorkAround =
CFStack.requiresWorkAroundForInst(MI->getOpcode());
switch (MI->getOpcode()) {
- case AMDGPU::CF_ALU_PUSH_BEFORE:
+ case R600::CF_ALU_PUSH_BEFORE:
if (RequiresWorkAround) {
- DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n");
- BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG))
+ LLVM_DEBUG(dbgs()
+ << "Applying bug work-around for ALU_PUSH_BEFORE\n");
+ BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(R600::CF_PUSH_EG))
.addImm(CfCount + 1)
.addImm(1);
- MI->setDesc(TII->get(AMDGPU::CF_ALU));
+ MI->setDesc(TII->get(R600::CF_ALU));
CfCount++;
- CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
+ CFStack.pushBranch(R600::CF_PUSH_EG);
} else
- CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);
+ CFStack.pushBranch(R600::CF_ALU_PUSH_BEFORE);
LLVM_FALLTHROUGH;
- case AMDGPU::CF_ALU:
+ case R600::CF_ALU:
I = MI;
AluClauses.push_back(MakeALUClause(MBB, I));
- DEBUG(dbgs() << CfCount << ":"; MI->dump(););
+ LLVM_DEBUG(dbgs() << CfCount << ":"; MI->dump(););
CfCount++;
break;
- case AMDGPU::WHILELOOP: {
+ case R600::WHILELOOP: {
CFStack.pushLoop();
MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
getHWInstrDesc(CF_WHILE_LOOP))
@@ -578,7 +580,7 @@ public:
CfCount++;
break;
}
- case AMDGPU::ENDLOOP: {
+ case R600::ENDLOOP: {
CFStack.popLoop();
std::pair<unsigned, std::set<MachineInstr *>> Pair =
std::move(LoopStack.back());
@@ -590,19 +592,19 @@ public:
CfCount++;
break;
}
- case AMDGPU::IF_PREDICATE_SET: {
+ case R600::IF_PREDICATE_SET: {
LastAlu.push_back(nullptr);
MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
getHWInstrDesc(CF_JUMP))
.addImm(0)
.addImm(0);
IfThenElseStack.push_back(MIb);
- DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
+ LLVM_DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
MI->eraseFromParent();
CfCount++;
break;
}
- case AMDGPU::ELSE: {
+ case R600::ELSE: {
MachineInstr * JumpInst = IfThenElseStack.back();
IfThenElseStack.pop_back();
CounterPropagateAddr(*JumpInst, CfCount);
@@ -610,13 +612,13 @@ public:
getHWInstrDesc(CF_ELSE))
.addImm(0)
.addImm(0);
- DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
+ LLVM_DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
IfThenElseStack.push_back(MIb);
MI->eraseFromParent();
CfCount++;
break;
}
- case AMDGPU::ENDIF: {
+ case R600::ENDIF: {
CFStack.popBranch();
if (LastAlu.back()) {
ToPopAfter.push_back(LastAlu.back());
@@ -626,7 +628,7 @@ public:
.addImm(CfCount + 1)
.addImm(1);
(void)MIb;
- DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
+ LLVM_DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
CfCount++;
}
@@ -638,7 +640,7 @@ public:
MI->eraseFromParent();
break;
}
- case AMDGPU::BREAK: {
+ case R600::BREAK: {
CfCount ++;
MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
getHWInstrDesc(CF_LOOP_BREAK))
@@ -647,7 +649,7 @@ public:
MI->eraseFromParent();
break;
}
- case AMDGPU::CONTINUE: {
+ case R600::CONTINUE: {
MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
getHWInstrDesc(CF_LOOP_CONTINUE))
.addImm(0);
@@ -656,12 +658,12 @@ public:
CfCount++;
break;
}
- case AMDGPU::RETURN: {
+ case R600::RETURN: {
DebugLoc DL = MBB.findDebugLoc(MI);
BuildMI(MBB, MI, DL, getHWInstrDesc(CF_END));
CfCount++;
if (CfCount % 2) {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::PAD));
+ BuildMI(MBB, I, DL, TII->get(R600::PAD));
CfCount++;
}
MI->eraseFromParent();
@@ -673,7 +675,7 @@ public:
}
default:
if (TII->isExport(MI->getOpcode())) {
- DEBUG(dbgs() << CfCount << ":"; MI->dump(););
+ LLVM_DEBUG(dbgs() << CfCount << ":"; MI->dump(););
CfCount++;
}
break;
@@ -682,7 +684,7 @@ public:
for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) {
MachineInstr *Alu = ToPopAfter[i];
BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu),
- TII->get(AMDGPU::CF_ALU_POP_AFTER))
+ TII->get(R600::CF_ALU_POP_AFTER))
.addImm(Alu->getOperand(0).getImm())
.addImm(Alu->getOperand(1).getImm())
.addImm(Alu->getOperand(2).getImm())
diff --git a/lib/Target/AMDGPU/R600Defines.h b/lib/Target/AMDGPU/R600Defines.h
index 534461adc59f..0d33d82e8e0f 100644
--- a/lib/Target/AMDGPU/R600Defines.h
+++ b/lib/Target/AMDGPU/R600Defines.h
@@ -23,7 +23,7 @@
#define MO_FLAG_LAST (1 << 6)
#define NUM_MO_FLAGS 7
-/// \brief Helper for getting the operand index for the instruction flags
+/// Helper for getting the operand index for the instruction flags
/// operand.
#define GET_FLAG_OPERAND_IDX(Flags) (((Flags) >> 7) & 0x3)
@@ -52,7 +52,7 @@ namespace R600_InstFlag {
#define HAS_NATIVE_OPERANDS(Flags) ((Flags) & R600_InstFlag::NATIVE_OPERANDS)
-/// \brief Defines for extracting register information from register encoding
+/// Defines for extracting register information from register encoding
#define HW_REG_MASK 0x1ff
#define HW_CHAN_SHIFT 9
diff --git a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
index 0d8ccd088ec4..1683fe6c9a57 100644
--- a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
+++ b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
@@ -19,6 +19,7 @@
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600RegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -51,12 +52,12 @@ private:
unsigned OccupiedDwords(MachineInstr &MI) const {
switch (MI.getOpcode()) {
- case AMDGPU::INTERP_PAIR_XY:
- case AMDGPU::INTERP_PAIR_ZW:
- case AMDGPU::INTERP_VEC_LOAD:
- case AMDGPU::DOT_4:
+ case R600::INTERP_PAIR_XY:
+ case R600::INTERP_PAIR_ZW:
+ case R600::INTERP_VEC_LOAD:
+ case R600::DOT_4:
return 4;
- case AMDGPU::KILL:
+ case R600::KILL:
return 0;
default:
break;
@@ -76,7 +77,7 @@ private:
E = MI.operands_end();
It != E; ++It) {
MachineOperand &MO = *It;
- if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X)
+ if (MO.isReg() && MO.getReg() == R600::ALU_LITERAL_X)
++NumLiteral;
}
return 1 + NumLiteral;
@@ -88,12 +89,12 @@ private:
if (TII->isVector(MI) || TII->isCubeOp(MI.getOpcode()))
return true;
switch (MI.getOpcode()) {
- case AMDGPU::PRED_X:
- case AMDGPU::INTERP_PAIR_XY:
- case AMDGPU::INTERP_PAIR_ZW:
- case AMDGPU::INTERP_VEC_LOAD:
- case AMDGPU::COPY:
- case AMDGPU::DOT_4:
+ case R600::PRED_X:
+ case R600::INTERP_PAIR_XY:
+ case R600::INTERP_PAIR_ZW:
+ case R600::INTERP_VEC_LOAD:
+ case R600::COPY:
+ case R600::DOT_4:
return true;
default:
return false;
@@ -102,9 +103,9 @@ private:
bool IsTrivialInst(MachineInstr &MI) const {
switch (MI.getOpcode()) {
- case AMDGPU::KILL:
- case AMDGPU::RETURN:
- case AMDGPU::IMPLICIT_DEF:
+ case R600::KILL:
+ case R600::RETURN:
+ case R600::IMPLICIT_DEF:
return true;
default:
return false;
@@ -131,16 +132,16 @@ private:
bool UpdateInstr = true) const {
std::vector<std::pair<unsigned, unsigned>> UsedKCache;
- if (!TII->isALUInstr(MI.getOpcode()) && MI.getOpcode() != AMDGPU::DOT_4)
+ if (!TII->isALUInstr(MI.getOpcode()) && MI.getOpcode() != R600::DOT_4)
return true;
const SmallVectorImpl<std::pair<MachineOperand *, int64_t>> &Consts =
TII->getSrcs(MI);
assert(
- (TII->isALUInstr(MI.getOpcode()) || MI.getOpcode() == AMDGPU::DOT_4) &&
+ (TII->isALUInstr(MI.getOpcode()) || MI.getOpcode() == R600::DOT_4) &&
"Can't assign Const");
for (unsigned i = 0, n = Consts.size(); i < n; ++i) {
- if (Consts[i].first->getReg() != AMDGPU::ALU_CONST)
+ if (Consts[i].first->getReg() != R600::ALU_CONST)
continue;
unsigned Sel = Consts[i].second;
unsigned Chan = Sel & 3, Index = ((Sel >> 2) - 512) & 31;
@@ -171,16 +172,16 @@ private:
return true;
for (unsigned i = 0, j = 0, n = Consts.size(); i < n; ++i) {
- if (Consts[i].first->getReg() != AMDGPU::ALU_CONST)
+ if (Consts[i].first->getReg() != R600::ALU_CONST)
continue;
switch(UsedKCache[j].first) {
case 0:
Consts[i].first->setReg(
- AMDGPU::R600_KC0RegClass.getRegister(UsedKCache[j].second));
+ R600::R600_KC0RegClass.getRegister(UsedKCache[j].second));
break;
case 1:
Consts[i].first->setReg(
- AMDGPU::R600_KC1RegClass.getRegister(UsedKCache[j].second));
+ R600::R600_KC1RegClass.getRegister(UsedKCache[j].second));
break;
default:
llvm_unreachable("Wrong Cache Line");
@@ -252,7 +253,7 @@ private:
break;
if (AluInstCount > TII->getMaxAlusPerClause())
break;
- if (I->getOpcode() == AMDGPU::PRED_X) {
+ if (I->getOpcode() == R600::PRED_X) {
// We put PRED_X in its own clause to ensure that ifcvt won't create
// clauses with more than 128 insts.
// IfCvt is indeed checking that "then" and "else" branches of an if
@@ -288,7 +289,7 @@ private:
AluInstCount += OccupiedDwords(*I);
}
unsigned Opcode = PushBeforeModifier ?
- AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU;
+ R600::CF_ALU_PUSH_BEFORE : R600::CF_ALU;
BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode))
// We don't use the ADDR field until R600ControlFlowFinalizer pass, where
// it is safe to assume it is 0. However if we always put 0 here, the ifcvt
@@ -321,7 +322,7 @@ public:
BB != BB_E; ++BB) {
MachineBasicBlock &MBB = *BB;
MachineBasicBlock::iterator I = MBB.begin();
- if (I != MBB.end() && I->getOpcode() == AMDGPU::CF_ALU)
+ if (I != MBB.end() && I->getOpcode() == R600::CF_ALU)
continue; // BB was already parsed
for (MachineBasicBlock::iterator E = MBB.end(); I != E;) {
if (isALU(*I)) {
diff --git a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
index ffea231ee4d0..b924ff019dd1 100644
--- a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
+++ b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
@@ -21,6 +21,7 @@
#include "R600RegisterInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -95,16 +96,16 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
// Expand LDS_*_RET instructions
if (TII->isLDSRetInstr(MI.getOpcode())) {
- int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
+ int DstIdx = TII->getOperandIdx(MI.getOpcode(), R600::OpName::dst);
assert(DstIdx != -1);
MachineOperand &DstOp = MI.getOperand(DstIdx);
MachineInstr *Mov = TII->buildMovInstr(&MBB, I,
- DstOp.getReg(), AMDGPU::OQAP);
- DstOp.setReg(AMDGPU::OQAP);
+ DstOp.getReg(), R600::OQAP);
+ DstOp.setReg(R600::OQAP);
int LDSPredSelIdx = TII->getOperandIdx(MI.getOpcode(),
- AMDGPU::OpName::pred_sel);
+ R600::OpName::pred_sel);
int MovPredSelIdx = TII->getOperandIdx(Mov->getOpcode(),
- AMDGPU::OpName::pred_sel);
+ R600::OpName::pred_sel);
// Copy the pred_sel bit
Mov->getOperand(MovPredSelIdx).setReg(
MI.getOperand(LDSPredSelIdx).getReg());
@@ -113,7 +114,7 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
switch (MI.getOpcode()) {
default: break;
// Expand PRED_X to one of the PRED_SET instructions.
- case AMDGPU::PRED_X: {
+ case R600::PRED_X: {
uint64_t Flags = MI.getOperand(3).getImm();
// The native opcode used by PRED_X is stored as an immediate in the
// third operand.
@@ -121,17 +122,18 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
MI.getOperand(2).getImm(), // opcode
MI.getOperand(0).getReg(), // dst
MI.getOperand(1).getReg(), // src0
- AMDGPU::ZERO); // src1
+ R600::ZERO); // src1
TII->addFlag(*PredSet, 0, MO_FLAG_MASK);
if (Flags & MO_FLAG_PUSH) {
- TII->setImmOperand(*PredSet, AMDGPU::OpName::update_exec_mask, 1);
+ TII->setImmOperand(*PredSet, R600::OpName::update_exec_mask, 1);
} else {
- TII->setImmOperand(*PredSet, AMDGPU::OpName::update_pred, 1);
+ TII->setImmOperand(*PredSet, R600::OpName::update_pred, 1);
}
MI.eraseFromParent();
continue;
}
- case AMDGPU::DOT_4: {
+ case R600::DOT_4: {
+
const R600RegisterInfo &TRI = TII->getRegisterInfo();
unsigned DstReg = MI.getOperand(0).getReg();
@@ -140,7 +142,7 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
for (unsigned Chan = 0; Chan < 4; ++Chan) {
bool Mask = (Chan != TRI.getHWRegChan(DstReg));
unsigned SubDstReg =
- AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
+ R600::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
MachineInstr *BMI =
TII->buildSlotOfVectorInstruction(MBB, &MI, Chan, SubDstReg);
if (Chan > 0) {
@@ -155,10 +157,10 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
// While not strictly necessary from hw point of view, we force
// all src operands of a dot4 inst to belong to the same slot.
unsigned Src0 = BMI->getOperand(
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0))
+ TII->getOperandIdx(Opcode, R600::OpName::src0))
.getReg();
unsigned Src1 = BMI->getOperand(
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1))
+ TII->getOperandIdx(Opcode, R600::OpName::src1))
.getReg();
(void) Src0;
(void) Src1;
@@ -205,26 +207,26 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
// T0_W = CUBE T1_Y, T1_Z
for (unsigned Chan = 0; Chan < 4; Chan++) {
unsigned DstReg = MI.getOperand(
- TII->getOperandIdx(MI, AMDGPU::OpName::dst)).getReg();
+ TII->getOperandIdx(MI, R600::OpName::dst)).getReg();
unsigned Src0 = MI.getOperand(
- TII->getOperandIdx(MI, AMDGPU::OpName::src0)).getReg();
+ TII->getOperandIdx(MI, R600::OpName::src0)).getReg();
unsigned Src1 = 0;
// Determine the correct source registers
if (!IsCube) {
- int Src1Idx = TII->getOperandIdx(MI, AMDGPU::OpName::src1);
+ int Src1Idx = TII->getOperandIdx(MI, R600::OpName::src1);
if (Src1Idx != -1) {
Src1 = MI.getOperand(Src1Idx).getReg();
}
}
if (IsReduction) {
- unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
+ unsigned SubRegIndex = AMDGPURegisterInfo::getSubRegFromChannel(Chan);
Src0 = TRI.getSubReg(Src0, SubRegIndex);
Src1 = TRI.getSubReg(Src1, SubRegIndex);
} else if (IsCube) {
static const int CubeSrcSwz[] = {2, 2, 0, 1};
- unsigned SubRegIndex0 = TRI.getSubRegFromChannel(CubeSrcSwz[Chan]);
- unsigned SubRegIndex1 = TRI.getSubRegFromChannel(CubeSrcSwz[3 - Chan]);
+ unsigned SubRegIndex0 = AMDGPURegisterInfo::getSubRegFromChannel(CubeSrcSwz[Chan]);
+ unsigned SubRegIndex1 = AMDGPURegisterInfo::getSubRegFromChannel(CubeSrcSwz[3 - Chan]);
Src1 = TRI.getSubReg(Src0, SubRegIndex1);
Src0 = TRI.getSubReg(Src0, SubRegIndex0);
}
@@ -233,14 +235,14 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
bool Mask = false;
bool NotLast = true;
if (IsCube) {
- unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
+ unsigned SubRegIndex = AMDGPURegisterInfo::getSubRegFromChannel(Chan);
DstReg = TRI.getSubReg(DstReg, SubRegIndex);
} else {
// Mask the write if the original instruction does not write to
// the current Channel.
Mask = (Chan != TRI.getHWRegChan(DstReg));
unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;
- DstReg = AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
+ DstReg = R600::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
}
// Set the IsLast bit
@@ -249,11 +251,11 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
// Add the new instruction
unsigned Opcode = MI.getOpcode();
switch (Opcode) {
- case AMDGPU::CUBE_r600_pseudo:
- Opcode = AMDGPU::CUBE_r600_real;
+ case R600::CUBE_r600_pseudo:
+ Opcode = R600::CUBE_r600_real;
break;
- case AMDGPU::CUBE_eg_pseudo:
- Opcode = AMDGPU::CUBE_eg_real;
+ case R600::CUBE_eg_pseudo:
+ Opcode = R600::CUBE_eg_real;
break;
default:
break;
@@ -270,12 +272,12 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
if (NotLast) {
TII->addFlag(*NewMI, 0, MO_FLAG_NOT_LAST);
}
- SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::clamp);
- SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::literal);
- SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_abs);
- SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_abs);
- SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_neg);
- SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_neg);
+ SetFlagInNewMI(NewMI, &MI, R600::OpName::clamp);
+ SetFlagInNewMI(NewMI, &MI, R600::OpName::literal);
+ SetFlagInNewMI(NewMI, &MI, R600::OpName::src0_abs);
+ SetFlagInNewMI(NewMI, &MI, R600::OpName::src1_abs);
+ SetFlagInNewMI(NewMI, &MI, R600::OpName::src0_neg);
+ SetFlagInNewMI(NewMI, &MI, R600::OpName::src1_neg);
}
MI.eraseFromParent();
}
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index 66291d0be4e6..113d6249fa60 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -8,18 +8,18 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Custom DAG lowering for R600
+/// Custom DAG lowering for R600
//
//===----------------------------------------------------------------------===//
#include "R600ISelLowering.h"
#include "AMDGPUFrameLowering.h"
-#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600FrameLowering.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
@@ -35,13 +35,13 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include <cassert>
#include <cstdint>
#include <iterator>
@@ -50,17 +50,19 @@
using namespace llvm;
+#include "R600GenCallingConv.inc"
+
R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
const R600Subtarget &STI)
- : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
- addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
- addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
- addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
- addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
- addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
- addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
+ : AMDGPUTargetLowering(TM, STI), Subtarget(&STI), Gen(STI.getGeneration()) {
+ addRegisterClass(MVT::f32, &R600::R600_Reg32RegClass);
+ addRegisterClass(MVT::i32, &R600::R600_Reg32RegClass);
+ addRegisterClass(MVT::v2f32, &R600::R600_Reg64RegClass);
+ addRegisterClass(MVT::v2i32, &R600::R600_Reg64RegClass);
+ addRegisterClass(MVT::v4f32, &R600::R600_Reg128RegClass);
+ addRegisterClass(MVT::v4i32, &R600::R600_Reg128RegClass);
- computeRegisterProperties(STI.getRegisterInfo());
+ computeRegisterProperties(Subtarget->getRegisterInfo());
// Legalize loads and stores to the private address space.
setOperationAction(ISD::LOAD, MVT::i32, Custom);
@@ -147,6 +149,11 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FSUB, MVT::f32, Expand);
+ setOperationAction(ISD::FCEIL, MVT::f64, Custom);
+ setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
+ setOperationAction(ISD::FRINT, MVT::f64, Custom);
+ setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
+
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
@@ -216,6 +223,34 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FMA, MVT::f64, Expand);
}
+ // FIXME: This was moved from AMDGPUTargetLowering, I'm not sure if we
+ // need it for R600.
+ if (!Subtarget->hasFP32Denormals())
+ setOperationAction(ISD::FMAD, MVT::f32, Legal);
+
+ if (!Subtarget->hasBFI()) {
+ // fcopysign can be done in a single instruction with BFI.
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ }
+
+ if (!Subtarget->hasBCNT(32))
+ setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+
+ if (!Subtarget->hasBCNT(64))
+ setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+
+ if (Subtarget->hasFFBH())
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
+
+ if (Subtarget->hasFFBL())
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
+
+ // FIXME: This was moved from AMDGPUTargetLowering, I'm not sure if we
+ // need it for R600.
+ if (Subtarget->hasBFE())
+ setHasExtractBitsInsn(true);
+
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
@@ -245,14 +280,10 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::LOAD);
}
-const R600Subtarget *R600TargetLowering::getSubtarget() const {
- return static_cast<const R600Subtarget *>(Subtarget);
-}
-
static inline bool isEOP(MachineBasicBlock::iterator I) {
if (std::next(I) == I->getParent()->end())
return false;
- return std::next(I)->getOpcode() == AMDGPU::RETURN;
+ return std::next(I)->getOpcode() == R600::RETURN;
}
MachineBasicBlock *
@@ -261,24 +292,24 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
MachineBasicBlock::iterator I = MI;
- const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
+ const R600InstrInfo *TII = Subtarget->getInstrInfo();
switch (MI.getOpcode()) {
default:
// Replace LDS_*_RET instruction that don't have any uses with the
// equivalent LDS_*_NORET instruction.
if (TII->isLDSRetInstr(MI.getOpcode())) {
- int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
+ int DstIdx = TII->getOperandIdx(MI.getOpcode(), R600::OpName::dst);
assert(DstIdx != -1);
MachineInstrBuilder NewMI;
// FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
// LDS_1A2D support and remove this special case.
if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) ||
- MI.getOpcode() == AMDGPU::LDS_CMPST_RET)
+ MI.getOpcode() == R600::LDS_CMPST_RET)
return BB;
NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
- TII->get(AMDGPU::getLDSNoRetOp(MI.getOpcode())));
+ TII->get(R600::getLDSNoRetOp(MI.getOpcode())));
for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) {
NewMI.add(MI.getOperand(i));
}
@@ -286,31 +317,24 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
}
break;
- case AMDGPU::CLAMP_R600: {
- MachineInstr *NewMI = TII->buildDefaultInstruction(
- *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
- MI.getOperand(1).getReg());
- TII->addFlag(*NewMI, 0, MO_FLAG_CLAMP);
- break;
- }
- case AMDGPU::FABS_R600: {
+ case R600::FABS_R600: {
MachineInstr *NewMI = TII->buildDefaultInstruction(
- *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
+ *BB, I, R600::MOV, MI.getOperand(0).getReg(),
MI.getOperand(1).getReg());
TII->addFlag(*NewMI, 0, MO_FLAG_ABS);
break;
}
- case AMDGPU::FNEG_R600: {
+ case R600::FNEG_R600: {
MachineInstr *NewMI = TII->buildDefaultInstruction(
- *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
+ *BB, I, R600::MOV, MI.getOperand(0).getReg(),
MI.getOperand(1).getReg());
TII->addFlag(*NewMI, 0, MO_FLAG_NEG);
break;
}
- case AMDGPU::MASK_WRITE: {
+ case R600::MASK_WRITE: {
unsigned maskedRegister = MI.getOperand(0).getReg();
assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
@@ -318,7 +342,7 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
break;
}
- case AMDGPU::MOV_IMM_F32:
+ case R600::MOV_IMM_F32:
TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1)
.getFPImm()
->getValueAPF()
@@ -326,39 +350,39 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
.getZExtValue());
break;
- case AMDGPU::MOV_IMM_I32:
+ case R600::MOV_IMM_I32:
TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(),
MI.getOperand(1).getImm());
break;
- case AMDGPU::MOV_IMM_GLOBAL_ADDR: {
+ case R600::MOV_IMM_GLOBAL_ADDR: {
//TODO: Perhaps combine this instruction with the next if possible
auto MIB = TII->buildDefaultInstruction(
- *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_LITERAL_X);
- int Idx = TII->getOperandIdx(*MIB, AMDGPU::OpName::literal);
+ *BB, MI, R600::MOV, MI.getOperand(0).getReg(), R600::ALU_LITERAL_X);
+ int Idx = TII->getOperandIdx(*MIB, R600::OpName::literal);
//TODO: Ugh this is rather ugly
MIB->getOperand(Idx) = MI.getOperand(1);
break;
}
- case AMDGPU::CONST_COPY: {
+ case R600::CONST_COPY: {
MachineInstr *NewMI = TII->buildDefaultInstruction(
- *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_CONST);
- TII->setImmOperand(*NewMI, AMDGPU::OpName::src0_sel,
+ *BB, MI, R600::MOV, MI.getOperand(0).getReg(), R600::ALU_CONST);
+ TII->setImmOperand(*NewMI, R600::OpName::src0_sel,
MI.getOperand(1).getImm());
break;
}
- case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
- case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
- case AMDGPU::RAT_WRITE_CACHELESS_128_eg:
+ case R600::RAT_WRITE_CACHELESS_32_eg:
+ case R600::RAT_WRITE_CACHELESS_64_eg:
+ case R600::RAT_WRITE_CACHELESS_128_eg:
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
.add(MI.getOperand(0))
.add(MI.getOperand(1))
.addImm(isEOP(I)); // Set End of program bit
break;
- case AMDGPU::RAT_STORE_TYPED_eg:
+ case R600::RAT_STORE_TYPED_eg:
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
.add(MI.getOperand(0))
.add(MI.getOperand(1))
@@ -366,49 +390,49 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
.addImm(isEOP(I)); // Set End of program bit
break;
- case AMDGPU::BRANCH:
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
+ case R600::BRANCH:
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP))
.add(MI.getOperand(0));
break;
- case AMDGPU::BRANCH_COND_f32: {
+ case R600::BRANCH_COND_f32: {
MachineInstr *NewMI =
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
- AMDGPU::PREDICATE_BIT)
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::PRED_X),
+ R600::PREDICATE_BIT)
.add(MI.getOperand(1))
- .addImm(AMDGPU::PRED_SETNE)
+ .addImm(R600::PRED_SETNE)
.addImm(0); // Flags
TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP_COND))
.add(MI.getOperand(0))
- .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
+ .addReg(R600::PREDICATE_BIT, RegState::Kill);
break;
}
- case AMDGPU::BRANCH_COND_i32: {
+ case R600::BRANCH_COND_i32: {
MachineInstr *NewMI =
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
- AMDGPU::PREDICATE_BIT)
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::PRED_X),
+ R600::PREDICATE_BIT)
.add(MI.getOperand(1))
- .addImm(AMDGPU::PRED_SETNE_INT)
+ .addImm(R600::PRED_SETNE_INT)
.addImm(0); // Flags
TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP_COND))
.add(MI.getOperand(0))
- .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
+ .addReg(R600::PREDICATE_BIT, RegState::Kill);
break;
}
- case AMDGPU::EG_ExportSwz:
- case AMDGPU::R600_ExportSwz: {
+ case R600::EG_ExportSwz:
+ case R600::R600_ExportSwz: {
// Instruction is left unmodified if its not the last one of its type
bool isLastInstructionOfItsType = true;
unsigned InstExportType = MI.getOperand(1).getImm();
for (MachineBasicBlock::iterator NextExportInst = std::next(I),
EndBlock = BB->end(); NextExportInst != EndBlock;
NextExportInst = std::next(NextExportInst)) {
- if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
- NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
+ if (NextExportInst->getOpcode() == R600::EG_ExportSwz ||
+ NextExportInst->getOpcode() == R600::R600_ExportSwz) {
unsigned CurrentInstExportType = NextExportInst->getOperand(1)
.getImm();
if (CurrentInstExportType == InstExportType) {
@@ -420,7 +444,7 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
bool EOP = isEOP(I);
if (!EOP && !isLastInstructionOfItsType)
return BB;
- unsigned CfInst = (MI.getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
+ unsigned CfInst = (MI.getOpcode() == R600::EG_ExportSwz) ? 84 : 40;
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
.add(MI.getOperand(0))
.add(MI.getOperand(1))
@@ -433,7 +457,7 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
.addImm(EOP);
break;
}
- case AMDGPU::RETURN: {
+ case R600::RETURN: {
return BB;
}
}
@@ -478,7 +502,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
unsigned IntrinsicID =
cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
switch (IntrinsicID) {
- case AMDGPUIntrinsic::r600_store_swizzle: {
+ case Intrinsic::r600_store_swizzle: {
SDLoc DL(Op);
const SDValue Args[8] = {
Chain,
@@ -505,14 +529,14 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
EVT VT = Op.getValueType();
SDLoc DL(Op);
switch (IntrinsicID) {
- case AMDGPUIntrinsic::r600_tex:
- case AMDGPUIntrinsic::r600_texc: {
+ case Intrinsic::r600_tex:
+ case Intrinsic::r600_texc: {
unsigned TextureOp;
switch (IntrinsicID) {
- case AMDGPUIntrinsic::r600_tex:
+ case Intrinsic::r600_tex:
TextureOp = 0;
break;
- case AMDGPUIntrinsic::r600_texc:
+ case Intrinsic::r600_texc:
TextureOp = 1;
break;
default:
@@ -542,7 +566,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
};
return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
}
- case AMDGPUIntrinsic::r600_dot4: {
+ case Intrinsic::r600_dot4: {
SDValue Args[8] = {
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
DAG.getConstant(0, DL, MVT::i32)),
@@ -566,7 +590,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
case Intrinsic::r600_implicitarg_ptr: {
MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUASI.PARAM_I_ADDRESS);
- uint32_t ByteOffset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
+ uint32_t ByteOffset = getImplicitParameterOffset(MF, FIRST_IMPLICIT);
return DAG.getConstant(ByteOffset, DL, PtrVT);
}
case Intrinsic::r600_read_ngroups_x:
@@ -589,23 +613,23 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
return LowerImplicitParameter(DAG, VT, DL, 8);
case Intrinsic::r600_read_tgid_x:
- return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T1_X, VT);
+ return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
+ R600::T1_X, VT);
case Intrinsic::r600_read_tgid_y:
- return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T1_Y, VT);
+ return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
+ R600::T1_Y, VT);
case Intrinsic::r600_read_tgid_z:
- return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T1_Z, VT);
+ return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
+ R600::T1_Z, VT);
case Intrinsic::r600_read_tidig_x:
- return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T0_X, VT);
+ return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
+ R600::T0_X, VT);
case Intrinsic::r600_read_tidig_y:
- return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T0_Y, VT);
+ return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
+ R600::T0_Y, VT);
case Intrinsic::r600_read_tidig_z:
- return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T0_Z, VT);
+ return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
+ R600::T0_Z, VT);
case Intrinsic::r600_recipsqrt_ieee:
return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
@@ -755,7 +779,7 @@ SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
SDValue TrigVal = DAG.getNode(TrigNode, DL, VT,
DAG.getNode(ISD::FADD, DL, VT, FractPart,
DAG.getConstantFP(-0.5, DL, MVT::f32)));
- if (Gen >= R600Subtarget::R700)
+ if (Gen >= AMDGPUSubtarget::R700)
return TrigVal;
// On R600 hw, COS/SIN input must be between -Pi and Pi.
return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
@@ -1527,7 +1551,7 @@ SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue R600TargetLowering::lowerFrameIndex(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
- const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
+ const R600FrameLowering *TFL = Subtarget->getFrameLowering();
FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
@@ -1539,6 +1563,28 @@ SDValue R600TargetLowering::lowerFrameIndex(SDValue Op,
Op.getValueType());
}
+CCAssignFn *R600TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
+ bool IsVarArg) const {
+ switch (CC) {
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
+ case CallingConv::C:
+ case CallingConv::Fast:
+ case CallingConv::Cold:
+ llvm_unreachable("kernels should not be handled here");
+ case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_GS:
+ case CallingConv::AMDGPU_PS:
+ case CallingConv::AMDGPU_CS:
+ case CallingConv::AMDGPU_HS:
+ case CallingConv::AMDGPU_ES:
+ case CallingConv::AMDGPU_LS:
+ return CC_R600;
+ default:
+ report_fatal_error("Unsupported calling convention.");
+ }
+}
+
/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
@@ -1550,8 +1596,6 @@ SDValue R600TargetLowering::LowerFormalArguments(
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
MachineFunction &MF = DAG.getMachineFunction();
- R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
-
SmallVector<ISD::InputArg, 8> LocalIns;
if (AMDGPU::isShader(CallConv)) {
@@ -1571,7 +1615,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
}
if (AMDGPU::isShader(CallConv)) {
- unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
+ unsigned Reg = MF.addLiveIn(VA.getLocReg(), &R600::R600_Reg128RegClass);
SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
InVals.push_back(Register);
continue;
@@ -1602,19 +1646,18 @@ SDValue R600TargetLowering::LowerFormalArguments(
unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
unsigned PartOffset = VA.getLocMemOffset();
- unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF) + VA.getLocMemOffset();
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
SDValue Arg = DAG.getLoad(
ISD::UNINDEXED, Ext, VT, DL, Chain,
- DAG.getConstant(Offset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), PtrInfo,
+ DAG.getConstant(PartOffset, DL, MVT::i32), DAG.getUNDEF(MVT::i32),
+ PtrInfo,
MemVT, /* Alignment = */ 4, MachineMemOperand::MONonTemporal |
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
// 4 is the preferred alignment for the CONSTANT memory space.
InVals.push_back(Arg);
- MFI->setABIArgOffset(Offset + MemVT.getStoreSize());
}
return Chain;
}
@@ -1989,26 +2032,26 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
SDValue &Src, SDValue &Neg, SDValue &Abs,
SDValue &Sel, SDValue &Imm,
SelectionDAG &DAG) const {
- const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
+ const R600InstrInfo *TII = Subtarget->getInstrInfo();
if (!Src.isMachineOpcode())
return false;
switch (Src.getMachineOpcode()) {
- case AMDGPU::FNEG_R600:
+ case R600::FNEG_R600:
if (!Neg.getNode())
return false;
Src = Src.getOperand(0);
Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
return true;
- case AMDGPU::FABS_R600:
+ case R600::FABS_R600:
if (!Abs.getNode())
return false;
Src = Src.getOperand(0);
Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
return true;
- case AMDGPU::CONST_COPY: {
+ case R600::CONST_COPY: {
unsigned Opcode = ParentNode->getMachineOpcode();
- bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
+ bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1;
if (!Sel.getNode())
return false;
@@ -2019,17 +2062,17 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
// Gather constants values
int SrcIndices[] = {
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
+ TII->getOperandIdx(Opcode, R600::OpName::src0),
+ TII->getOperandIdx(Opcode, R600::OpName::src1),
+ TII->getOperandIdx(Opcode, R600::OpName::src2),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_X),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_Y),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_Z),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_W),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_X),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_Y),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_Z),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_W)
};
std::vector<unsigned> Consts;
for (int OtherSrcIdx : SrcIndices) {
@@ -2042,7 +2085,7 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
}
if (RegisterSDNode *Reg =
dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
- if (Reg->getReg() == AMDGPU::ALU_CONST) {
+ if (Reg->getReg() == R600::ALU_CONST) {
ConstantSDNode *Cst
= cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
Consts.push_back(Cst->getZExtValue());
@@ -2057,30 +2100,30 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
}
Sel = CstOffset;
- Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
+ Src = DAG.getRegister(R600::ALU_CONST, MVT::f32);
return true;
}
- case AMDGPU::MOV_IMM_GLOBAL_ADDR:
+ case R600::MOV_IMM_GLOBAL_ADDR:
// Check if the Imm slot is used. Taken from below.
if (cast<ConstantSDNode>(Imm)->getZExtValue())
return false;
Imm = Src.getOperand(0);
- Src = DAG.getRegister(AMDGPU::ALU_LITERAL_X, MVT::i32);
+ Src = DAG.getRegister(R600::ALU_LITERAL_X, MVT::i32);
return true;
- case AMDGPU::MOV_IMM_I32:
- case AMDGPU::MOV_IMM_F32: {
- unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
+ case R600::MOV_IMM_I32:
+ case R600::MOV_IMM_F32: {
+ unsigned ImmReg = R600::ALU_LITERAL_X;
uint64_t ImmValue = 0;
- if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
+ if (Src.getMachineOpcode() == R600::MOV_IMM_F32) {
ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
float FloatValue = FPC->getValueAPF().convertToFloat();
if (FloatValue == 0.0) {
- ImmReg = AMDGPU::ZERO;
+ ImmReg = R600::ZERO;
} else if (FloatValue == 0.5) {
- ImmReg = AMDGPU::HALF;
+ ImmReg = R600::HALF;
} else if (FloatValue == 1.0) {
- ImmReg = AMDGPU::ONE;
+ ImmReg = R600::ONE;
} else {
ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
}
@@ -2088,9 +2131,9 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
uint64_t Value = C->getZExtValue();
if (Value == 0) {
- ImmReg = AMDGPU::ZERO;
+ ImmReg = R600::ZERO;
} else if (Value == 1) {
- ImmReg = AMDGPU::ONE_INT;
+ ImmReg = R600::ONE_INT;
} else {
ImmValue = Value;
}
@@ -2099,7 +2142,7 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
// Check that we aren't already using an immediate.
// XXX: It's possible for an instruction to have more than one
// immediate operand, but this is not supported yet.
- if (ImmReg == AMDGPU::ALU_LITERAL_X) {
+ if (ImmReg == R600::ALU_LITERAL_X) {
if (!Imm.getNode())
return false;
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
@@ -2116,10 +2159,10 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
}
}
-/// \brief Fold the instructions after selecting them
+/// Fold the instructions after selecting them
SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
SelectionDAG &DAG) const {
- const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
+ const R600InstrInfo *TII = Subtarget->getInstrInfo();
if (!Node->isMachineOpcode())
return Node;
@@ -2128,36 +2171,36 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());
- if (Opcode == AMDGPU::DOT_4) {
+ if (Opcode == R600::DOT_4) {
int OperandIdx[] = {
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
+ TII->getOperandIdx(Opcode, R600::OpName::src0_X),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_Y),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_Z),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_W),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_X),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_Y),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_Z),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_W)
};
int NegIdx[] = {
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
+ TII->getOperandIdx(Opcode, R600::OpName::src0_neg_X),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_neg_Y),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_neg_Z),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_neg_W),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_neg_X),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_neg_Y),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_neg_Z),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_neg_W)
};
int AbsIdx[] = {
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
+ TII->getOperandIdx(Opcode, R600::OpName::src0_abs_X),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_abs_Y),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_abs_Z),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_abs_W),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_abs_X),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_abs_Y),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_abs_Z),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_abs_W)
};
for (unsigned i = 0; i < 8; i++) {
if (OperandIdx[i] < 0)
@@ -2165,7 +2208,7 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
SDValue &Src = Ops[OperandIdx[i] - 1];
SDValue &Neg = Ops[NegIdx[i] - 1];
SDValue &Abs = Ops[AbsIdx[i] - 1];
- bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
+ bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1;
int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
if (HasDst)
SelIdx--;
@@ -2173,42 +2216,28 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
}
- } else if (Opcode == AMDGPU::REG_SEQUENCE) {
+ } else if (Opcode == R600::REG_SEQUENCE) {
for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
SDValue &Src = Ops[i];
if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
}
- } else if (Opcode == AMDGPU::CLAMP_R600) {
- SDValue Src = Node->getOperand(0);
- if (!Src.isMachineOpcode() ||
- !TII->hasInstrModifiers(Src.getMachineOpcode()))
- return Node;
- int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
- AMDGPU::OpName::clamp);
- if (ClampIdx < 0)
- return Node;
- SDLoc DL(Node);
- std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
- Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32);
- return DAG.getMachineNode(Src.getMachineOpcode(), DL,
- Node->getVTList(), Ops);
} else {
if (!TII->hasInstrModifiers(Opcode))
return Node;
int OperandIdx[] = {
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
+ TII->getOperandIdx(Opcode, R600::OpName::src0),
+ TII->getOperandIdx(Opcode, R600::OpName::src1),
+ TII->getOperandIdx(Opcode, R600::OpName::src2)
};
int NegIdx[] = {
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
+ TII->getOperandIdx(Opcode, R600::OpName::src0_neg),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_neg),
+ TII->getOperandIdx(Opcode, R600::OpName::src2_neg)
};
int AbsIdx[] = {
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_abs),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_abs),
-1
};
for (unsigned i = 0; i < 3; i++) {
@@ -2218,9 +2247,9 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
SDValue &Neg = Ops[NegIdx[i] - 1];
SDValue FakeAbs;
SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
- bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
+ bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1;
int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
- int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
+ int ImmIdx = TII->getOperandIdx(Opcode, R600::OpName::literal);
if (HasDst) {
SelIdx--;
ImmIdx--;
diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h
index 2a774693f02b..907d1f10e151 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/lib/Target/AMDGPU/R600ISelLowering.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief R600 DAG Lowering interface definition
+/// R600 DAG Lowering interface definition
//
//===----------------------------------------------------------------------===//
@@ -23,6 +23,8 @@ class R600InstrInfo;
class R600Subtarget;
class R600TargetLowering final : public AMDGPUTargetLowering {
+
+ const R600Subtarget *Subtarget;
public:
R600TargetLowering(const TargetMachine &TM, const R600Subtarget &STI);
@@ -36,6 +38,7 @@ public:
void ReplaceNodeResults(SDNode * N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
+ CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
diff --git a/lib/Target/AMDGPU/R600InstrFormats.td b/lib/Target/AMDGPU/R600InstrFormats.td
index 61106ed42e64..687a9affa138 100644
--- a/lib/Target/AMDGPU/R600InstrFormats.td
+++ b/lib/Target/AMDGPU/R600InstrFormats.td
@@ -11,10 +11,10 @@
//
//===----------------------------------------------------------------------===//
-def isR600 : Predicate<"Subtarget->getGeneration() <= R600Subtarget::R700">;
+def isR600 : Predicate<"Subtarget->getGeneration() <= AMDGPUSubtarget::R700">;
def isR600toCayman : Predicate<
- "Subtarget->getGeneration() <= R600Subtarget::NORTHERN_ISLANDS">;
+ "Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS">;
class R600Pat<dag pattern, dag result> : AMDGPUPat<pattern, result> {
let SubtargetPredicate = isR600toCayman;
@@ -41,7 +41,7 @@ class InstR600 <dag outs, dag ins, string asm, list<dag> pattern,
bit LDS_1A2D = 0;
let SubtargetPredicate = isR600toCayman;
- let Namespace = "AMDGPU";
+ let Namespace = "R600";
let OutOperandList = outs;
let InOperandList = ins;
let AsmString = asm;
diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp
index 23e646c8147c..5397e779474c 100644
--- a/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief R600 Implementation of TargetInstrInfo.
+/// R600 Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//
@@ -19,6 +19,7 @@
#include "R600Defines.h"
#include "R600FrameLowering.h"
#include "R600RegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallSet.h"
@@ -44,10 +45,15 @@
using namespace llvm;
#define GET_INSTRINFO_CTOR_DTOR
-#include "AMDGPUGenDFAPacketizer.inc"
+#include "R600GenDFAPacketizer.inc"
+
+#define GET_INSTRINFO_CTOR_DTOR
+#define GET_INSTRMAP_INFO
+#define GET_INSTRINFO_NAMED_OPS
+#include "R600GenInstrInfo.inc"
R600InstrInfo::R600InstrInfo(const R600Subtarget &ST)
- : AMDGPUInstrInfo(ST), RI(), ST(ST) {}
+ : R600GenInstrInfo(-1, -1), RI(), ST(ST) {}
bool R600InstrInfo::isVector(const MachineInstr &MI) const {
return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR;
@@ -58,31 +64,31 @@ void R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
const DebugLoc &DL, unsigned DestReg,
unsigned SrcReg, bool KillSrc) const {
unsigned VectorComponents = 0;
- if ((AMDGPU::R600_Reg128RegClass.contains(DestReg) ||
- AMDGPU::R600_Reg128VerticalRegClass.contains(DestReg)) &&
- (AMDGPU::R600_Reg128RegClass.contains(SrcReg) ||
- AMDGPU::R600_Reg128VerticalRegClass.contains(SrcReg))) {
+ if ((R600::R600_Reg128RegClass.contains(DestReg) ||
+ R600::R600_Reg128VerticalRegClass.contains(DestReg)) &&
+ (R600::R600_Reg128RegClass.contains(SrcReg) ||
+ R600::R600_Reg128VerticalRegClass.contains(SrcReg))) {
VectorComponents = 4;
- } else if((AMDGPU::R600_Reg64RegClass.contains(DestReg) ||
- AMDGPU::R600_Reg64VerticalRegClass.contains(DestReg)) &&
- (AMDGPU::R600_Reg64RegClass.contains(SrcReg) ||
- AMDGPU::R600_Reg64VerticalRegClass.contains(SrcReg))) {
+ } else if((R600::R600_Reg64RegClass.contains(DestReg) ||
+ R600::R600_Reg64VerticalRegClass.contains(DestReg)) &&
+ (R600::R600_Reg64RegClass.contains(SrcReg) ||
+ R600::R600_Reg64VerticalRegClass.contains(SrcReg))) {
VectorComponents = 2;
}
if (VectorComponents > 0) {
for (unsigned I = 0; I < VectorComponents; I++) {
- unsigned SubRegIndex = RI.getSubRegFromChannel(I);
- buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
+ unsigned SubRegIndex = AMDGPURegisterInfo::getSubRegFromChannel(I);
+ buildDefaultInstruction(MBB, MI, R600::MOV,
RI.getSubReg(DestReg, SubRegIndex),
RI.getSubReg(SrcReg, SubRegIndex))
.addReg(DestReg,
RegState::Define | RegState::Implicit);
}
} else {
- MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
+ MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, R600::MOV,
DestReg, SrcReg);
- NewMI->getOperand(getOperandIdx(*NewMI, AMDGPU::OpName::src0))
+ NewMI->getOperand(getOperandIdx(*NewMI, R600::OpName::src0))
.setIsKill(KillSrc);
}
}
@@ -103,9 +109,9 @@ bool R600InstrInfo::isMov(unsigned Opcode) const {
switch(Opcode) {
default:
return false;
- case AMDGPU::MOV:
- case AMDGPU::MOV_IMM_F32:
- case AMDGPU::MOV_IMM_I32:
+ case R600::MOV:
+ case R600::MOV_IMM_F32:
+ case R600::MOV_IMM_I32:
return true;
}
}
@@ -117,10 +123,10 @@ bool R600InstrInfo::isReductionOp(unsigned Opcode) const {
bool R600InstrInfo::isCubeOp(unsigned Opcode) const {
switch(Opcode) {
default: return false;
- case AMDGPU::CUBE_r600_pseudo:
- case AMDGPU::CUBE_r600_real:
- case AMDGPU::CUBE_eg_pseudo:
- case AMDGPU::CUBE_eg_real:
+ case R600::CUBE_r600_pseudo:
+ case R600::CUBE_r600_real:
+ case R600::CUBE_eg_pseudo:
+ case R600::CUBE_eg_real:
return true;
}
}
@@ -148,7 +154,7 @@ bool R600InstrInfo::isLDSInstr(unsigned Opcode) const {
}
bool R600InstrInfo::isLDSRetInstr(unsigned Opcode) const {
- return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) != -1;
+ return isLDSInstr(Opcode) && getOperandIdx(Opcode, R600::OpName::dst) != -1;
}
bool R600InstrInfo::canBeConsideredALU(const MachineInstr &MI) const {
@@ -157,12 +163,12 @@ bool R600InstrInfo::canBeConsideredALU(const MachineInstr &MI) const {
if (isVector(MI) || isCubeOp(MI.getOpcode()))
return true;
switch (MI.getOpcode()) {
- case AMDGPU::PRED_X:
- case AMDGPU::INTERP_PAIR_XY:
- case AMDGPU::INTERP_PAIR_ZW:
- case AMDGPU::INTERP_VEC_LOAD:
- case AMDGPU::COPY:
- case AMDGPU::DOT_4:
+ case R600::PRED_X:
+ case R600::INTERP_PAIR_XY:
+ case R600::INTERP_PAIR_ZW:
+ case R600::INTERP_VEC_LOAD:
+ case R600::COPY:
+ case R600::DOT_4:
return true;
default:
return false;
@@ -172,7 +178,7 @@ bool R600InstrInfo::canBeConsideredALU(const MachineInstr &MI) const {
bool R600InstrInfo::isTransOnly(unsigned Opcode) const {
if (ST.hasCaymanISA())
return false;
- return (get(Opcode).getSchedClass() == AMDGPU::Sched::TransALU);
+ return (get(Opcode).getSchedClass() == R600::Sched::TransALU);
}
bool R600InstrInfo::isTransOnly(const MachineInstr &MI) const {
@@ -180,7 +186,7 @@ bool R600InstrInfo::isTransOnly(const MachineInstr &MI) const {
}
bool R600InstrInfo::isVectorOnly(unsigned Opcode) const {
- return (get(Opcode).getSchedClass() == AMDGPU::Sched::VecALU);
+ return (get(Opcode).getSchedClass() == R600::Sched::VecALU);
}
bool R600InstrInfo::isVectorOnly(const MachineInstr &MI) const {
@@ -214,8 +220,8 @@ bool R600InstrInfo::usesTextureCache(const MachineInstr &MI) const {
bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const {
switch (Opcode) {
- case AMDGPU::KILLGT:
- case AMDGPU::GROUP_BARRIER:
+ case R600::KILLGT:
+ case R600::GROUP_BARRIER:
return true;
default:
return false;
@@ -223,11 +229,11 @@ bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const {
}
bool R600InstrInfo::usesAddressRegister(MachineInstr &MI) const {
- return MI.findRegisterUseOperandIdx(AMDGPU::AR_X) != -1;
+ return MI.findRegisterUseOperandIdx(R600::AR_X) != -1;
}
bool R600InstrInfo::definesAddressRegister(MachineInstr &MI) const {
- return MI.findRegisterDefOperandIdx(AMDGPU::AR_X) != -1;
+ return MI.findRegisterDefOperandIdx(R600::AR_X) != -1;
}
bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const {
@@ -241,7 +247,7 @@ bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const {
TargetRegisterInfo::isVirtualRegister(I->getReg()))
continue;
- if (AMDGPU::R600_LDS_SRC_REGRegClass.contains(I->getReg()))
+ if (R600::R600_LDS_SRC_REGRegClass.contains(I->getReg()))
return true;
}
return false;
@@ -249,17 +255,17 @@ bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const {
int R600InstrInfo::getSelIdx(unsigned Opcode, unsigned SrcIdx) const {
static const unsigned SrcSelTable[][2] = {
- {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel},
- {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel},
- {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel},
- {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X},
- {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y},
- {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z},
- {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W},
- {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X},
- {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y},
- {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z},
- {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W}
+ {R600::OpName::src0, R600::OpName::src0_sel},
+ {R600::OpName::src1, R600::OpName::src1_sel},
+ {R600::OpName::src2, R600::OpName::src2_sel},
+ {R600::OpName::src0_X, R600::OpName::src0_sel_X},
+ {R600::OpName::src0_Y, R600::OpName::src0_sel_Y},
+ {R600::OpName::src0_Z, R600::OpName::src0_sel_Z},
+ {R600::OpName::src0_W, R600::OpName::src0_sel_W},
+ {R600::OpName::src1_X, R600::OpName::src1_sel_X},
+ {R600::OpName::src1_Y, R600::OpName::src1_sel_Y},
+ {R600::OpName::src1_Z, R600::OpName::src1_sel_Z},
+ {R600::OpName::src1_W, R600::OpName::src1_sel_W}
};
for (const auto &Row : SrcSelTable) {
@@ -274,23 +280,23 @@ SmallVector<std::pair<MachineOperand *, int64_t>, 3>
R600InstrInfo::getSrcs(MachineInstr &MI) const {
SmallVector<std::pair<MachineOperand *, int64_t>, 3> Result;
- if (MI.getOpcode() == AMDGPU::DOT_4) {
+ if (MI.getOpcode() == R600::DOT_4) {
static const unsigned OpTable[8][2] = {
- {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X},
- {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y},
- {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z},
- {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W},
- {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X},
- {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y},
- {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z},
- {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W},
+ {R600::OpName::src0_X, R600::OpName::src0_sel_X},
+ {R600::OpName::src0_Y, R600::OpName::src0_sel_Y},
+ {R600::OpName::src0_Z, R600::OpName::src0_sel_Z},
+ {R600::OpName::src0_W, R600::OpName::src0_sel_W},
+ {R600::OpName::src1_X, R600::OpName::src1_sel_X},
+ {R600::OpName::src1_Y, R600::OpName::src1_sel_Y},
+ {R600::OpName::src1_Z, R600::OpName::src1_sel_Z},
+ {R600::OpName::src1_W, R600::OpName::src1_sel_W},
};
for (unsigned j = 0; j < 8; j++) {
MachineOperand &MO =
MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][0]));
unsigned Reg = MO.getReg();
- if (Reg == AMDGPU::ALU_CONST) {
+ if (Reg == R600::ALU_CONST) {
MachineOperand &Sel =
MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1]));
Result.push_back(std::make_pair(&MO, Sel.getImm()));
@@ -302,9 +308,9 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const {
}
static const unsigned OpTable[3][2] = {
- {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel},
- {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel},
- {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel},
+ {R600::OpName::src0, R600::OpName::src0_sel},
+ {R600::OpName::src1, R600::OpName::src1_sel},
+ {R600::OpName::src2, R600::OpName::src2_sel},
};
for (unsigned j = 0; j < 3; j++) {
@@ -313,15 +319,15 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const {
break;
MachineOperand &MO = MI.getOperand(SrcIdx);
unsigned Reg = MO.getReg();
- if (Reg == AMDGPU::ALU_CONST) {
+ if (Reg == R600::ALU_CONST) {
MachineOperand &Sel =
MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1]));
Result.push_back(std::make_pair(&MO, Sel.getImm()));
continue;
}
- if (Reg == AMDGPU::ALU_LITERAL_X) {
+ if (Reg == R600::ALU_LITERAL_X) {
MachineOperand &Operand =
- MI.getOperand(getOperandIdx(MI.getOpcode(), AMDGPU::OpName::literal));
+ MI.getOperand(getOperandIdx(MI.getOpcode(), R600::OpName::literal));
if (Operand.isImm()) {
Result.push_back(std::make_pair(&MO, Operand.getImm()));
continue;
@@ -345,7 +351,7 @@ R600InstrInfo::ExtractSrcs(MachineInstr &MI,
++i;
unsigned Reg = Src.first->getReg();
int Index = RI.getEncodingValue(Reg) & 0xff;
- if (Reg == AMDGPU::OQAP) {
+ if (Reg == R600::OQAP) {
Result.push_back(std::make_pair(Index, 0U));
}
if (PV.find(Reg) != PV.end()) {
@@ -435,7 +441,7 @@ unsigned R600InstrInfo::isLegalUpTo(
const std::pair<int, unsigned> &Src = Srcs[j];
if (Src.first < 0 || Src.first == 255)
continue;
- if (Src.first == GET_REG_INDEX(RI.getEncodingValue(AMDGPU::OQAP))) {
+ if (Src.first == GET_REG_INDEX(RI.getEncodingValue(R600::OQAP))) {
if (Swz[i] != R600InstrInfo::ALU_VEC_012_SCL_210 &&
Swz[i] != R600InstrInfo::ALU_VEC_021_SCL_122) {
// The value from output queue A (denoted by register OQAP) can
@@ -541,7 +547,7 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG,
for (unsigned i = 0, e = IG.size(); i < e; ++i) {
IGSrcs.push_back(ExtractSrcs(*IG[i], PV, ConstCount));
unsigned Op = getOperandIdx(IG[i]->getOpcode(),
- AMDGPU::OpName::bank_swizzle);
+ R600::OpName::bank_swizzle);
ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle)
IG[i]->getOperand(Op).getImm());
}
@@ -610,14 +616,14 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector<MachineInstr *> &MIs)
continue;
for (const auto &Src : getSrcs(MI)) {
- if (Src.first->getReg() == AMDGPU::ALU_LITERAL_X)
+ if (Src.first->getReg() == R600::ALU_LITERAL_X)
Literals.insert(Src.second);
if (Literals.size() > 4)
return false;
- if (Src.first->getReg() == AMDGPU::ALU_CONST)
+ if (Src.first->getReg() == R600::ALU_CONST)
Consts.push_back(Src.second);
- if (AMDGPU::R600_KC0RegClass.contains(Src.first->getReg()) ||
- AMDGPU::R600_KC1RegClass.contains(Src.first->getReg())) {
+ if (R600::R600_KC0RegClass.contains(Src.first->getReg()) ||
+ R600::R600_KC1RegClass.contains(Src.first->getReg())) {
unsigned Index = RI.getEncodingValue(Src.first->getReg()) & 0xff;
unsigned Chan = RI.getHWRegChan(Src.first->getReg());
Consts.push_back((Index << 2) | Chan);
@@ -636,7 +642,7 @@ R600InstrInfo::CreateTargetScheduleState(const TargetSubtargetInfo &STI) const {
static bool
isPredicateSetter(unsigned Opcode) {
switch (Opcode) {
- case AMDGPU::PRED_X:
+ case R600::PRED_X:
return true;
default:
return false;
@@ -658,12 +664,12 @@ findFirstPredicateSetterFrom(MachineBasicBlock &MBB,
static
bool isJump(unsigned Opcode) {
- return Opcode == AMDGPU::JUMP || Opcode == AMDGPU::JUMP_COND;
+ return Opcode == R600::JUMP || Opcode == R600::JUMP_COND;
}
static bool isBranch(unsigned Opcode) {
- return Opcode == AMDGPU::BRANCH || Opcode == AMDGPU::BRANCH_COND_i32 ||
- Opcode == AMDGPU::BRANCH_COND_f32;
+ return Opcode == R600::BRANCH || Opcode == R600::BRANCH_COND_i32 ||
+ Opcode == R600::BRANCH_COND_f32;
}
bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
@@ -678,7 +684,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
if (I == MBB.end())
return false;
- // AMDGPU::BRANCH* instructions are only available after isel and are not
+ // R600::BRANCH* instructions are only available after isel and are not
// handled
if (isBranch(I->getOpcode()))
return true;
@@ -687,7 +693,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
}
// Remove successive JUMP
- while (I != MBB.begin() && std::prev(I)->getOpcode() == AMDGPU::JUMP) {
+ while (I != MBB.begin() && std::prev(I)->getOpcode() == R600::JUMP) {
MachineBasicBlock::iterator PriorI = std::prev(I);
if (AllowModify)
I->removeFromParent();
@@ -698,10 +704,10 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
// If there is only one terminator instruction, process it.
unsigned LastOpc = LastInst.getOpcode();
if (I == MBB.begin() || !isJump((--I)->getOpcode())) {
- if (LastOpc == AMDGPU::JUMP) {
+ if (LastOpc == R600::JUMP) {
TBB = LastInst.getOperand(0).getMBB();
return false;
- } else if (LastOpc == AMDGPU::JUMP_COND) {
+ } else if (LastOpc == R600::JUMP_COND) {
auto predSet = I;
while (!isPredicateSetter(predSet->getOpcode())) {
predSet = --I;
@@ -709,7 +715,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
TBB = LastInst.getOperand(0).getMBB();
Cond.push_back(predSet->getOperand(1));
Cond.push_back(predSet->getOperand(2));
- Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false));
+ Cond.push_back(MachineOperand::CreateReg(R600::PRED_SEL_ONE, false));
return false;
}
return true; // Can't handle indirect branch.
@@ -720,7 +726,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
unsigned SecondLastOpc = SecondLastInst.getOpcode();
// If the block ends with a B and a Bcc, handle it.
- if (SecondLastOpc == AMDGPU::JUMP_COND && LastOpc == AMDGPU::JUMP) {
+ if (SecondLastOpc == R600::JUMP_COND && LastOpc == R600::JUMP) {
auto predSet = --I;
while (!isPredicateSetter(predSet->getOpcode())) {
predSet = --I;
@@ -729,7 +735,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
FBB = LastInst.getOperand(0).getMBB();
Cond.push_back(predSet->getOperand(1));
Cond.push_back(predSet->getOperand(2));
- Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false));
+ Cond.push_back(MachineOperand::CreateReg(R600::PRED_SEL_ONE, false));
return false;
}
@@ -741,8 +747,8 @@ static
MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) {
for (MachineBasicBlock::reverse_iterator It = MBB.rbegin(), E = MBB.rend();
It != E; ++It) {
- if (It->getOpcode() == AMDGPU::CF_ALU ||
- It->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE)
+ if (It->getOpcode() == R600::CF_ALU ||
+ It->getOpcode() == R600::CF_ALU_PUSH_BEFORE)
return It.getReverse();
}
return MBB.end();
@@ -759,7 +765,7 @@ unsigned R600InstrInfo::insertBranch(MachineBasicBlock &MBB,
if (!FBB) {
if (Cond.empty()) {
- BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB);
+ BuildMI(&MBB, DL, get(R600::JUMP)).addMBB(TBB);
return 1;
} else {
MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end());
@@ -767,14 +773,14 @@ unsigned R600InstrInfo::insertBranch(MachineBasicBlock &MBB,
addFlag(*PredSet, 0, MO_FLAG_PUSH);
PredSet->getOperand(2).setImm(Cond[1].getImm());
- BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND))
+ BuildMI(&MBB, DL, get(R600::JUMP_COND))
.addMBB(TBB)
- .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
+ .addReg(R600::PREDICATE_BIT, RegState::Kill);
MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
if (CfAlu == MBB.end())
return 1;
- assert (CfAlu->getOpcode() == AMDGPU::CF_ALU);
- CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE));
+ assert (CfAlu->getOpcode() == R600::CF_ALU);
+ CfAlu->setDesc(get(R600::CF_ALU_PUSH_BEFORE));
return 1;
}
} else {
@@ -782,15 +788,15 @@ unsigned R600InstrInfo::insertBranch(MachineBasicBlock &MBB,
assert(PredSet && "No previous predicate !");
addFlag(*PredSet, 0, MO_FLAG_PUSH);
PredSet->getOperand(2).setImm(Cond[1].getImm());
- BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND))
+ BuildMI(&MBB, DL, get(R600::JUMP_COND))
.addMBB(TBB)
- .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
- BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB);
+ .addReg(R600::PREDICATE_BIT, RegState::Kill);
+ BuildMI(&MBB, DL, get(R600::JUMP)).addMBB(FBB);
MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
if (CfAlu == MBB.end())
return 2;
- assert (CfAlu->getOpcode() == AMDGPU::CF_ALU);
- CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE));
+ assert (CfAlu->getOpcode() == R600::CF_ALU);
+ CfAlu->setDesc(get(R600::CF_ALU_PUSH_BEFORE));
return 2;
}
}
@@ -811,18 +817,18 @@ unsigned R600InstrInfo::removeBranch(MachineBasicBlock &MBB,
switch (I->getOpcode()) {
default:
return 0;
- case AMDGPU::JUMP_COND: {
+ case R600::JUMP_COND: {
MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
clearFlag(*predSet, 0, MO_FLAG_PUSH);
I->eraseFromParent();
MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
if (CfAlu == MBB.end())
break;
- assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE);
- CfAlu->setDesc(get(AMDGPU::CF_ALU));
+ assert (CfAlu->getOpcode() == R600::CF_ALU_PUSH_BEFORE);
+ CfAlu->setDesc(get(R600::CF_ALU));
break;
}
- case AMDGPU::JUMP:
+ case R600::JUMP:
I->eraseFromParent();
break;
}
@@ -836,18 +842,18 @@ unsigned R600InstrInfo::removeBranch(MachineBasicBlock &MBB,
// FIXME: only one case??
default:
return 1;
- case AMDGPU::JUMP_COND: {
+ case R600::JUMP_COND: {
MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
clearFlag(*predSet, 0, MO_FLAG_PUSH);
I->eraseFromParent();
MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
if (CfAlu == MBB.end())
break;
- assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE);
- CfAlu->setDesc(get(AMDGPU::CF_ALU));
+ assert (CfAlu->getOpcode() == R600::CF_ALU_PUSH_BEFORE);
+ CfAlu->setDesc(get(R600::CF_ALU));
break;
}
- case AMDGPU::JUMP:
+ case R600::JUMP:
I->eraseFromParent();
break;
}
@@ -862,9 +868,9 @@ bool R600InstrInfo::isPredicated(const MachineInstr &MI) const {
unsigned Reg = MI.getOperand(idx).getReg();
switch (Reg) {
default: return false;
- case AMDGPU::PRED_SEL_ONE:
- case AMDGPU::PRED_SEL_ZERO:
- case AMDGPU::PREDICATE_BIT:
+ case R600::PRED_SEL_ONE:
+ case R600::PRED_SEL_ZERO:
+ case R600::PREDICATE_BIT:
return true;
}
}
@@ -875,9 +881,9 @@ bool R600InstrInfo::isPredicable(const MachineInstr &MI) const {
// be predicated. Until we have proper support for instruction clauses in the
// backend, we will mark KILL* instructions as unpredicable.
- if (MI.getOpcode() == AMDGPU::KILLGT) {
+ if (MI.getOpcode() == R600::KILLGT) {
return false;
- } else if (MI.getOpcode() == AMDGPU::CF_ALU) {
+ } else if (MI.getOpcode() == R600::CF_ALU) {
// If the clause start in the middle of MBB then the MBB has more
// than a single clause, unable to predicate several clauses.
if (MI.getParent()->begin() != MachineBasicBlock::const_iterator(MI))
@@ -887,7 +893,7 @@ bool R600InstrInfo::isPredicable(const MachineInstr &MI) const {
} else if (isVector(MI)) {
return false;
} else {
- return AMDGPUInstrInfo::isPredicable(MI);
+ return TargetInstrInfo::isPredicable(MI);
}
}
@@ -928,17 +934,17 @@ bool
R600InstrInfo::reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
MachineOperand &MO = Cond[1];
switch (MO.getImm()) {
- case AMDGPU::PRED_SETE_INT:
- MO.setImm(AMDGPU::PRED_SETNE_INT);
+ case R600::PRED_SETE_INT:
+ MO.setImm(R600::PRED_SETNE_INT);
break;
- case AMDGPU::PRED_SETNE_INT:
- MO.setImm(AMDGPU::PRED_SETE_INT);
+ case R600::PRED_SETNE_INT:
+ MO.setImm(R600::PRED_SETE_INT);
break;
- case AMDGPU::PRED_SETE:
- MO.setImm(AMDGPU::PRED_SETNE);
+ case R600::PRED_SETE:
+ MO.setImm(R600::PRED_SETNE);
break;
- case AMDGPU::PRED_SETNE:
- MO.setImm(AMDGPU::PRED_SETE);
+ case R600::PRED_SETNE:
+ MO.setImm(R600::PRED_SETE);
break;
default:
return true;
@@ -946,11 +952,11 @@ R600InstrInfo::reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) con
MachineOperand &MO2 = Cond[2];
switch (MO2.getReg()) {
- case AMDGPU::PRED_SEL_ZERO:
- MO2.setReg(AMDGPU::PRED_SEL_ONE);
+ case R600::PRED_SEL_ZERO:
+ MO2.setReg(R600::PRED_SEL_ONE);
break;
- case AMDGPU::PRED_SEL_ONE:
- MO2.setReg(AMDGPU::PRED_SEL_ZERO);
+ case R600::PRED_SEL_ONE:
+ MO2.setReg(R600::PRED_SEL_ZERO);
break;
default:
return true;
@@ -967,22 +973,22 @@ bool R600InstrInfo::PredicateInstruction(MachineInstr &MI,
ArrayRef<MachineOperand> Pred) const {
int PIdx = MI.findFirstPredOperandIdx();
- if (MI.getOpcode() == AMDGPU::CF_ALU) {
+ if (MI.getOpcode() == R600::CF_ALU) {
MI.getOperand(8).setImm(0);
return true;
}
- if (MI.getOpcode() == AMDGPU::DOT_4) {
- MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_X))
+ if (MI.getOpcode() == R600::DOT_4) {
+ MI.getOperand(getOperandIdx(MI, R600::OpName::pred_sel_X))
.setReg(Pred[2].getReg());
- MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_Y))
+ MI.getOperand(getOperandIdx(MI, R600::OpName::pred_sel_Y))
.setReg(Pred[2].getReg());
- MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_Z))
+ MI.getOperand(getOperandIdx(MI, R600::OpName::pred_sel_Z))
.setReg(Pred[2].getReg());
- MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_W))
+ MI.getOperand(getOperandIdx(MI, R600::OpName::pred_sel_W))
.setReg(Pred[2].getReg());
MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
- MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit);
+ MIB.addReg(R600::PREDICATE_BIT, RegState::Implicit);
return true;
}
@@ -990,7 +996,7 @@ bool R600InstrInfo::PredicateInstruction(MachineInstr &MI,
MachineOperand &PMO = MI.getOperand(PIdx);
PMO.setReg(Pred[2].getReg());
MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
- MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit);
+ MIB.addReg(R600::PREDICATE_BIT, RegState::Implicit);
return true;
}
@@ -1020,20 +1026,20 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
default: {
MachineBasicBlock *MBB = MI.getParent();
int OffsetOpIdx =
- AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::addr);
+ R600::getNamedOperandIdx(MI.getOpcode(), R600::OpName::addr);
// addr is a custom operand with multiple MI operands, and only the
// first MI operand is given a name.
int RegOpIdx = OffsetOpIdx + 1;
int ChanOpIdx =
- AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::chan);
+ R600::getNamedOperandIdx(MI.getOpcode(), R600::OpName::chan);
if (isRegisterLoad(MI)) {
int DstOpIdx =
- AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
+ R600::getNamedOperandIdx(MI.getOpcode(), R600::OpName::dst);
unsigned RegIndex = MI.getOperand(RegOpIdx).getImm();
unsigned Channel = MI.getOperand(ChanOpIdx).getImm();
unsigned Address = calculateIndirectAddress(RegIndex, Channel);
unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg();
- if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) {
+ if (OffsetReg == R600::INDIRECT_BASE_ADDR) {
buildMovInstr(MBB, MI, MI.getOperand(DstOpIdx).getReg(),
getIndirectAddrRegClass()->getRegister(Address));
} else {
@@ -1042,12 +1048,12 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
}
} else if (isRegisterStore(MI)) {
int ValOpIdx =
- AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::val);
+ R600::getNamedOperandIdx(MI.getOpcode(), R600::OpName::val);
unsigned RegIndex = MI.getOperand(RegOpIdx).getImm();
unsigned Channel = MI.getOperand(ChanOpIdx).getImm();
unsigned Address = calculateIndirectAddress(RegIndex, Channel);
unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg();
- if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) {
+ if (OffsetReg == R600::INDIRECT_BASE_ADDR) {
buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address),
MI.getOperand(ValOpIdx).getReg());
} else {
@@ -1062,15 +1068,15 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MBB->erase(MI);
return true;
}
- case AMDGPU::R600_EXTRACT_ELT_V2:
- case AMDGPU::R600_EXTRACT_ELT_V4:
+ case R600::R600_EXTRACT_ELT_V2:
+ case R600::R600_EXTRACT_ELT_V4:
buildIndirectRead(MI.getParent(), MI, MI.getOperand(0).getReg(),
RI.getHWRegIndex(MI.getOperand(1).getReg()), // Address
MI.getOperand(2).getReg(),
RI.getHWRegChan(MI.getOperand(1).getReg()));
break;
- case AMDGPU::R600_INSERT_ELT_V2:
- case AMDGPU::R600_INSERT_ELT_V4:
+ case R600::R600_INSERT_ELT_V2:
+ case R600::R600_INSERT_ELT_V4:
buildIndirectWrite(MI.getParent(), MI, MI.getOperand(2).getReg(), // Value
RI.getHWRegIndex(MI.getOperand(1).getReg()), // Address
MI.getOperand(3).getReg(), // Offset
@@ -1082,7 +1088,8 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
}
void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved,
- const MachineFunction &MF) const {
+ const MachineFunction &MF,
+ const R600RegisterInfo &TRI) const {
const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
const R600FrameLowering *TFL = ST.getFrameLowering();
@@ -1093,17 +1100,15 @@ void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved,
return;
for (int Index = getIndirectIndexBegin(MF); Index <= End; ++Index) {
- unsigned SuperReg = AMDGPU::R600_Reg128RegClass.getRegister(Index);
- Reserved.set(SuperReg);
for (unsigned Chan = 0; Chan < StackWidth; ++Chan) {
- unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister((4 * Index) + Chan);
- Reserved.set(Reg);
+ unsigned Reg = R600::R600_TReg32RegClass.getRegister((4 * Index) + Chan);
+ TRI.reserveRegisterTuples(Reserved, Reg);
}
}
}
const TargetRegisterClass *R600InstrInfo::getIndirectAddrRegClass() const {
- return &AMDGPU::R600_TReg32_XRegClass;
+ return &R600::R600_TReg32_XRegClass;
}
MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
@@ -1121,20 +1126,20 @@ MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
unsigned AddrReg;
switch (AddrChan) {
default: llvm_unreachable("Invalid Channel");
- case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break;
- case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break;
- case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break;
- case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break;
+ case 0: AddrReg = R600::R600_AddrRegClass.getRegister(Address); break;
+ case 1: AddrReg = R600::R600_Addr_YRegClass.getRegister(Address); break;
+ case 2: AddrReg = R600::R600_Addr_ZRegClass.getRegister(Address); break;
+ case 3: AddrReg = R600::R600_Addr_WRegClass.getRegister(Address); break;
}
- MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
- AMDGPU::AR_X, OffsetReg);
- setImmOperand(*MOVA, AMDGPU::OpName::write, 0);
+ MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, R600::MOVA_INT_eg,
+ R600::AR_X, OffsetReg);
+ setImmOperand(*MOVA, R600::OpName::write, 0);
- MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV,
+ MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, R600::MOV,
AddrReg, ValueReg)
- .addReg(AMDGPU::AR_X,
+ .addReg(R600::AR_X,
RegState::Implicit | RegState::Kill);
- setImmOperand(*Mov, AMDGPU::OpName::dst_rel, 1);
+ setImmOperand(*Mov, R600::OpName::dst_rel, 1);
return Mov;
}
@@ -1153,21 +1158,21 @@ MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB,
unsigned AddrReg;
switch (AddrChan) {
default: llvm_unreachable("Invalid Channel");
- case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break;
- case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break;
- case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break;
- case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break;
+ case 0: AddrReg = R600::R600_AddrRegClass.getRegister(Address); break;
+ case 1: AddrReg = R600::R600_Addr_YRegClass.getRegister(Address); break;
+ case 2: AddrReg = R600::R600_Addr_ZRegClass.getRegister(Address); break;
+ case 3: AddrReg = R600::R600_Addr_WRegClass.getRegister(Address); break;
}
- MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
- AMDGPU::AR_X,
+ MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, R600::MOVA_INT_eg,
+ R600::AR_X,
OffsetReg);
- setImmOperand(*MOVA, AMDGPU::OpName::write, 0);
- MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV,
+ setImmOperand(*MOVA, R600::OpName::write, 0);
+ MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, R600::MOV,
ValueReg,
AddrReg)
- .addReg(AMDGPU::AR_X,
+ .addReg(R600::AR_X,
RegState::Implicit | RegState::Kill);
- setImmOperand(*Mov, AMDGPU::OpName::src0_rel, 1);
+ setImmOperand(*Mov, R600::OpName::src0_rel, 1);
return Mov;
}
@@ -1265,7 +1270,7 @@ MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MB
//XXX: The r600g finalizer expects this to be 1, once we've moved the
//scheduling to the backend, we can change the default to 0.
MIB.addImm(1) // $last
- .addReg(AMDGPU::PRED_SEL_OFF) // $pred_sel
+ .addReg(R600::PRED_SEL_OFF) // $pred_sel
.addImm(0) // $literal
.addImm(0); // $bank_swizzle
@@ -1286,23 +1291,23 @@ MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MB
static unsigned getSlotedOps(unsigned Op, unsigned Slot) {
switch (Op) {
- OPERAND_CASE(AMDGPU::OpName::update_exec_mask)
- OPERAND_CASE(AMDGPU::OpName::update_pred)
- OPERAND_CASE(AMDGPU::OpName::write)
- OPERAND_CASE(AMDGPU::OpName::omod)
- OPERAND_CASE(AMDGPU::OpName::dst_rel)
- OPERAND_CASE(AMDGPU::OpName::clamp)
- OPERAND_CASE(AMDGPU::OpName::src0)
- OPERAND_CASE(AMDGPU::OpName::src0_neg)
- OPERAND_CASE(AMDGPU::OpName::src0_rel)
- OPERAND_CASE(AMDGPU::OpName::src0_abs)
- OPERAND_CASE(AMDGPU::OpName::src0_sel)
- OPERAND_CASE(AMDGPU::OpName::src1)
- OPERAND_CASE(AMDGPU::OpName::src1_neg)
- OPERAND_CASE(AMDGPU::OpName::src1_rel)
- OPERAND_CASE(AMDGPU::OpName::src1_abs)
- OPERAND_CASE(AMDGPU::OpName::src1_sel)
- OPERAND_CASE(AMDGPU::OpName::pred_sel)
+ OPERAND_CASE(R600::OpName::update_exec_mask)
+ OPERAND_CASE(R600::OpName::update_pred)
+ OPERAND_CASE(R600::OpName::write)
+ OPERAND_CASE(R600::OpName::omod)
+ OPERAND_CASE(R600::OpName::dst_rel)
+ OPERAND_CASE(R600::OpName::clamp)
+ OPERAND_CASE(R600::OpName::src0)
+ OPERAND_CASE(R600::OpName::src0_neg)
+ OPERAND_CASE(R600::OpName::src0_rel)
+ OPERAND_CASE(R600::OpName::src0_abs)
+ OPERAND_CASE(R600::OpName::src0_sel)
+ OPERAND_CASE(R600::OpName::src1)
+ OPERAND_CASE(R600::OpName::src1_neg)
+ OPERAND_CASE(R600::OpName::src1_rel)
+ OPERAND_CASE(R600::OpName::src1_abs)
+ OPERAND_CASE(R600::OpName::src1_sel)
+ OPERAND_CASE(R600::OpName::pred_sel)
default:
llvm_unreachable("Wrong Operand");
}
@@ -1313,39 +1318,39 @@ static unsigned getSlotedOps(unsigned Op, unsigned Slot) {
MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction(
MachineBasicBlock &MBB, MachineInstr *MI, unsigned Slot, unsigned DstReg)
const {
- assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented");
+ assert (MI->getOpcode() == R600::DOT_4 && "Not Implemented");
unsigned Opcode;
- if (ST.getGeneration() <= R600Subtarget::R700)
- Opcode = AMDGPU::DOT4_r600;
+ if (ST.getGeneration() <= AMDGPUSubtarget::R700)
+ Opcode = R600::DOT4_r600;
else
- Opcode = AMDGPU::DOT4_eg;
+ Opcode = R600::DOT4_eg;
MachineBasicBlock::iterator I = MI;
MachineOperand &Src0 = MI->getOperand(
- getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src0, Slot)));
+ getOperandIdx(MI->getOpcode(), getSlotedOps(R600::OpName::src0, Slot)));
MachineOperand &Src1 = MI->getOperand(
- getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src1, Slot)));
+ getOperandIdx(MI->getOpcode(), getSlotedOps(R600::OpName::src1, Slot)));
MachineInstr *MIB = buildDefaultInstruction(
MBB, I, Opcode, DstReg, Src0.getReg(), Src1.getReg());
static const unsigned Operands[14] = {
- AMDGPU::OpName::update_exec_mask,
- AMDGPU::OpName::update_pred,
- AMDGPU::OpName::write,
- AMDGPU::OpName::omod,
- AMDGPU::OpName::dst_rel,
- AMDGPU::OpName::clamp,
- AMDGPU::OpName::src0_neg,
- AMDGPU::OpName::src0_rel,
- AMDGPU::OpName::src0_abs,
- AMDGPU::OpName::src0_sel,
- AMDGPU::OpName::src1_neg,
- AMDGPU::OpName::src1_rel,
- AMDGPU::OpName::src1_abs,
- AMDGPU::OpName::src1_sel,
+ R600::OpName::update_exec_mask,
+ R600::OpName::update_pred,
+ R600::OpName::write,
+ R600::OpName::omod,
+ R600::OpName::dst_rel,
+ R600::OpName::clamp,
+ R600::OpName::src0_neg,
+ R600::OpName::src0_rel,
+ R600::OpName::src0_abs,
+ R600::OpName::src0_sel,
+ R600::OpName::src1_neg,
+ R600::OpName::src1_rel,
+ R600::OpName::src1_abs,
+ R600::OpName::src1_sel,
};
MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(),
- getSlotedOps(AMDGPU::OpName::pred_sel, Slot)));
- MIB->getOperand(getOperandIdx(Opcode, AMDGPU::OpName::pred_sel))
+ getSlotedOps(R600::OpName::pred_sel, Slot)));
+ MIB->getOperand(getOperandIdx(Opcode, R600::OpName::pred_sel))
.setReg(MO.getReg());
for (unsigned i = 0; i < 14; i++) {
@@ -1362,16 +1367,16 @@ MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB,
MachineBasicBlock::iterator I,
unsigned DstReg,
uint64_t Imm) const {
- MachineInstr *MovImm = buildDefaultInstruction(BB, I, AMDGPU::MOV, DstReg,
- AMDGPU::ALU_LITERAL_X);
- setImmOperand(*MovImm, AMDGPU::OpName::literal, Imm);
+ MachineInstr *MovImm = buildDefaultInstruction(BB, I, R600::MOV, DstReg,
+ R600::ALU_LITERAL_X);
+ setImmOperand(*MovImm, R600::OpName::literal, Imm);
return MovImm;
}
MachineInstr *R600InstrInfo::buildMovInstr(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
unsigned DstReg, unsigned SrcReg) const {
- return buildDefaultInstruction(*MBB, I, AMDGPU::MOV, DstReg, SrcReg);
+ return buildDefaultInstruction(*MBB, I, R600::MOV, DstReg, SrcReg);
}
int R600InstrInfo::getOperandIdx(const MachineInstr &MI, unsigned Op) const {
@@ -1379,7 +1384,7 @@ int R600InstrInfo::getOperandIdx(const MachineInstr &MI, unsigned Op) const {
}
int R600InstrInfo::getOperandIdx(unsigned Opcode, unsigned Op) const {
- return AMDGPU::getNamedOperandIdx(Opcode, Op);
+ return R600::getNamedOperandIdx(Opcode, Op);
}
void R600InstrInfo::setImmOperand(MachineInstr &MI, unsigned Op,
@@ -1406,25 +1411,25 @@ MachineOperand &R600InstrInfo::getFlagOp(MachineInstr &MI, unsigned SrcIdx,
bool IsOP3 = (TargetFlags & R600_InstFlag::OP3) == R600_InstFlag::OP3;
switch (Flag) {
case MO_FLAG_CLAMP:
- FlagIndex = getOperandIdx(MI, AMDGPU::OpName::clamp);
+ FlagIndex = getOperandIdx(MI, R600::OpName::clamp);
break;
case MO_FLAG_MASK:
- FlagIndex = getOperandIdx(MI, AMDGPU::OpName::write);
+ FlagIndex = getOperandIdx(MI, R600::OpName::write);
break;
case MO_FLAG_NOT_LAST:
case MO_FLAG_LAST:
- FlagIndex = getOperandIdx(MI, AMDGPU::OpName::last);
+ FlagIndex = getOperandIdx(MI, R600::OpName::last);
break;
case MO_FLAG_NEG:
switch (SrcIdx) {
case 0:
- FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src0_neg);
+ FlagIndex = getOperandIdx(MI, R600::OpName::src0_neg);
break;
case 1:
- FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src1_neg);
+ FlagIndex = getOperandIdx(MI, R600::OpName::src1_neg);
break;
case 2:
- FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src2_neg);
+ FlagIndex = getOperandIdx(MI, R600::OpName::src2_neg);
break;
}
break;
@@ -1435,10 +1440,10 @@ MachineOperand &R600InstrInfo::getFlagOp(MachineInstr &MI, unsigned SrcIdx,
(void)IsOP3;
switch (SrcIdx) {
case 0:
- FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src0_abs);
+ FlagIndex = getOperandIdx(MI, R600::OpName::src0_abs);
break;
case 1:
- FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src1_abs);
+ FlagIndex = getOperandIdx(MI, R600::OpName::src1_abs);
break;
}
break;
@@ -1499,15 +1504,15 @@ unsigned R600InstrInfo::getAddressSpaceForPseudoSourceKind(
switch (Kind) {
case PseudoSourceValue::Stack:
case PseudoSourceValue::FixedStack:
- return AMDGPUASI.PRIVATE_ADDRESS;
+ return ST.getAMDGPUAS().PRIVATE_ADDRESS;
case PseudoSourceValue::ConstantPool:
case PseudoSourceValue::GOT:
case PseudoSourceValue::JumpTable:
case PseudoSourceValue::GlobalValueCallEntry:
case PseudoSourceValue::ExternalSymbolCallEntry:
case PseudoSourceValue::TargetCustom:
- return AMDGPUASI.CONSTANT_ADDRESS;
+ return ST.getAMDGPUAS().CONSTANT_ADDRESS;
}
llvm_unreachable("Invalid pseudo source kind");
- return AMDGPUASI.PRIVATE_ADDRESS;
+ return ST.getAMDGPUAS().PRIVATE_ADDRESS;
}
diff --git a/lib/Target/AMDGPU/R600InstrInfo.h b/lib/Target/AMDGPU/R600InstrInfo.h
index abaa37450758..7a3dece31665 100644
--- a/lib/Target/AMDGPU/R600InstrInfo.h
+++ b/lib/Target/AMDGPU/R600InstrInfo.h
@@ -8,15 +8,18 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Interface definition for R600InstrInfo
+/// Interface definition for R600InstrInfo
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AMDGPU_R600INSTRINFO_H
#define LLVM_LIB_TARGET_AMDGPU_R600INSTRINFO_H
-#include "AMDGPUInstrInfo.h"
#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "R600GenInstrInfo.inc"
namespace llvm {
@@ -34,7 +37,7 @@ class MachineInstr;
class MachineInstrBuilder;
class R600Subtarget;
-class R600InstrInfo final : public AMDGPUInstrInfo {
+class R600InstrInfo final : public R600GenInstrInfo {
private:
const R600RegisterInfo RI;
const R600Subtarget &ST;
@@ -150,7 +153,7 @@ public:
/// Same but using const index set instead of MI set.
bool fitsConstReadLimitations(const std::vector<unsigned>&) const;
- /// \brief Vector instructions are instructions that must fill all
+ /// Vector instructions are instructions that must fill all
/// instruction slots within an instruction group.
bool isVector(const MachineInstr &MI) const;
@@ -209,9 +212,10 @@ public:
bool expandPostRAPseudo(MachineInstr &MI) const override;
- /// \brief Reserve the registers that may be accesed using indirect addressing.
+/// Reserve the registers that may be accessed using indirect addressing.
void reserveIndirectRegisters(BitVector &Reserved,
- const MachineFunction &MF) const;
+ const MachineFunction &MF,
+ const R600RegisterInfo &TRI) const;
/// Calculate the "Indirect Address" for the given \p RegIndex and
/// \p Channel
@@ -235,7 +239,7 @@ public:
/// read or write or -1 if indirect addressing is not used by this program.
int getIndirectIndexEnd(const MachineFunction &MF) const;
- /// \brief Build instruction(s) for an indirect register write.
+ /// Build instruction(s) for an indirect register write.
///
/// \returns The instruction that performs the indirect register write
MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
@@ -243,7 +247,7 @@ public:
unsigned ValueReg, unsigned Address,
unsigned OffsetReg) const;
- /// \brief Build instruction(s) for an indirect register read.
+ /// Build instruction(s) for an indirect register read.
///
/// \returns The instruction that performs the indirect register read
MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
@@ -281,23 +285,23 @@ public:
MachineBasicBlock::iterator I,
unsigned DstReg, unsigned SrcReg) const;
- /// \brief Get the index of Op in the MachineInstr.
+ /// Get the index of Op in the MachineInstr.
///
/// \returns -1 if the Instruction does not contain the specified \p Op.
int getOperandIdx(const MachineInstr &MI, unsigned Op) const;
- /// \brief Get the index of \p Op for the given Opcode.
+ /// Get the index of \p Op for the given Opcode.
///
/// \returns -1 if the Instruction does not contain the specified \p Op.
int getOperandIdx(unsigned Opcode, unsigned Op) const;
- /// \brief Helper function for setting instruction flag values.
+ /// Helper function for setting instruction flag values.
void setImmOperand(MachineInstr &MI, unsigned Op, int64_t Imm) const;
- ///\brief Add one of the MO_FLAG* flags to the specified \p Operand.
+/// Add one of the MO_FLAG* flags to the specified \p Operand.
void addFlag(MachineInstr &MI, unsigned Operand, unsigned Flag) const;
- ///\brief Determine if the specified \p Flag is set on this \p Operand.
+/// Determine if the specified \p Flag is set on this \p Operand.
bool isFlagSet(const MachineInstr &MI, unsigned Operand, unsigned Flag) const;
/// \param SrcIdx The register source to set the flag on (e.g src0, src1, src2)
@@ -307,7 +311,7 @@ public:
MachineOperand &getFlagOp(MachineInstr &MI, unsigned SrcIdx = 0,
unsigned Flag = 0) const;
- /// \brief Clear the specified flag on the instruction.
+ /// Clear the specified flag on the instruction.
void clearFlag(MachineInstr &MI, unsigned Operand, unsigned Flag) const;
// Helper functions that check the opcode for status information
@@ -323,7 +327,7 @@ public:
PseudoSourceValue::PSVKind Kind) const override;
};
-namespace AMDGPU {
+namespace R600 {
int getLDSNoRetOp(uint16_t Opcode);
diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td
index 801e4e61fca6..7bf174f4cd86 100644
--- a/lib/Target/AMDGPU/R600Instructions.td
+++ b/lib/Target/AMDGPU/R600Instructions.td
@@ -12,20 +12,19 @@
//
//===----------------------------------------------------------------------===//
-include "R600Intrinsics.td"
include "R600InstrFormats.td"
// FIXME: Should not be arbitrarily split from other R600 inst classes.
class R600WrapperInst <dag outs, dag ins, string asm = "", list<dag> pattern = []> :
AMDGPUInst<outs, ins, asm, pattern>, PredicateControl {
let SubtargetPredicate = isR600toCayman;
+ let Namespace = "R600";
}
class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern = []> :
InstR600 <outs, ins, asm, pattern, NullALU> {
- let Namespace = "AMDGPU";
}
def MEMxi : Operand<iPTR> {
@@ -81,11 +80,18 @@ def ADDRDWord : ComplexPattern<i32, 1, "SelectADDRDWord", [], []>;
def ADDRVTX_READ : ComplexPattern<i32, 2, "SelectADDRVTX_READ", [], []>;
def ADDRGA_CONST_OFFSET : ComplexPattern<i32, 1, "SelectGlobalValueConstantOffset", [], []>;
def ADDRGA_VAR_OFFSET : ComplexPattern<i32, 2, "SelectGlobalValueVariableOffset", [], []>;
+def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
def R600_Pred : PredicateOperand<i32, (ops R600_Predicate),
(ops PRED_SEL_OFF)>;
+let isTerminator = 1, isReturn = 1, hasCtrlDep = 1,
+ usesCustomInserter = 1, Namespace = "R600" in {
+ def RETURN : ILFormat<(outs), (ins variable_ops),
+ "RETURN", [(AMDGPUendpgm)]
+ >;
+}
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
@@ -219,34 +225,6 @@ class R600_REDUCTION <bits<11> inst, dag ins, string asm, list<dag> pattern,
} // End mayLoad = 1, mayStore = 0, hasSideEffects = 0
-def TEX_SHADOW : PatLeaf<
- (imm),
- [{uint32_t TType = (uint32_t)N->getZExtValue();
- return (TType >= 6 && TType <= 8) || TType == 13;
- }]
->;
-
-def TEX_RECT : PatLeaf<
- (imm),
- [{uint32_t TType = (uint32_t)N->getZExtValue();
- return TType == 5;
- }]
->;
-
-def TEX_ARRAY : PatLeaf<
- (imm),
- [{uint32_t TType = (uint32_t)N->getZExtValue();
- return TType == 9 || TType == 10 || TType == 16;
- }]
->;
-
-def TEX_SHADOW_ARRAY : PatLeaf<
- (imm),
- [{uint32_t TType = (uint32_t)N->getZExtValue();
- return TType == 11 || TType == 12 || TType == 17;
- }]
->;
-
class EG_CF_RAT <bits <8> cfinst, bits <6> ratinst, bits<4> ratid, bits<4> mask,
dag outs, dag ins, string asm, list<dag> pattern> :
InstR600ISA <outs, ins, asm, pattern>,
@@ -357,6 +335,8 @@ def vtx_id2_load : LoadVtxId2 <load>;
// R600 SDNodes
//===----------------------------------------------------------------------===//
+let Namespace = "R600" in {
+
def INTERP_PAIR_XY : AMDGPUShaderInst <
(outs R600_TReg32_X:$dst0, R600_TReg32_Y:$dst1),
(ins i32imm:$src0, R600_TReg32_Y:$src1, R600_TReg32_X:$src2),
@@ -369,6 +349,8 @@ def INTERP_PAIR_ZW : AMDGPUShaderInst <
"INTERP_PAIR_ZW $src0 $src1 $src2 : $dst0 dst1",
[]>;
+}
+
def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS",
SDTypeProfile<1, -1, [SDTCisInt<0>, SDTCisPtrTy<1>]>,
[SDNPVariadic]
@@ -416,11 +398,15 @@ def : R600Pat<(TEXTURE_FETCH (i32 TextureOp), vt:$SRC_GPR,
// Interpolation Instructions
//===----------------------------------------------------------------------===//
+let Namespace = "R600" in {
+
def INTERP_VEC_LOAD : AMDGPUShaderInst <
(outs R600_Reg128:$dst),
(ins i32imm:$src0),
"INTERP_LOAD $src0 : $dst">;
+}
+
def INTERP_XY : R600_2OP <0xD6, "INTERP_XY", []> {
let bank_swizzle = 5;
}
@@ -660,14 +646,7 @@ def PAD : R600WrapperInst <(outs), (ins), "PAD", [] > {
let isCodeGenOnly = 1, isPseudo = 1 in {
-let usesCustomInserter = 1 in {
-
-class CLAMP <RegisterClass rc> : AMDGPUShaderInst <
- (outs rc:$dst),
- (ins rc:$src0),
- "CLAMP $dst, $src0",
- [(set f32:$dst, (AMDGPUclamp f32:$src0))]
->;
+let Namespace = "R600", usesCustomInserter = 1 in {
class FABS <RegisterClass rc> : AMDGPUShaderInst <
(outs rc:$dst),
@@ -799,7 +778,9 @@ class MOV_IMM <ValueType vt, Operand immType> : R600WrapperInst <
(ins immType:$imm),
"",
[]
->;
+> {
+ let Namespace = "R600";
+}
} // end let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1
@@ -1014,7 +995,7 @@ class CNDGE_Common <bits<5> inst> : R600_3OP <
}
-let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in {
+let isCodeGenOnly = 1, isPseudo = 1, Namespace = "R600" in {
class R600_VEC2OP<list<dag> pattern> : InstR600 <(outs R600_Reg32:$dst), (ins
// Slot X
UEM:$update_exec_mask_X, UP:$update_pred_X, WRITE:$write_X,
@@ -1193,7 +1174,6 @@ class COS_Common <bits<11> inst> : R600_1OP <
let Itinerary = TransALU;
}
-def CLAMP_R600 : CLAMP <R600_Reg32>;
def FABS_R600 : FABS<R600_Reg32>;
def FNEG_R600 : FNEG<R600_Reg32>;
@@ -1334,7 +1314,9 @@ let Predicates = [isR600] in {
// Regist loads and stores - for indirect addressing
//===----------------------------------------------------------------------===//
+let Namespace = "R600" in {
defm R600_ : RegisterLoadStore <R600_Reg32, FRAMEri, ADDRIndirect>;
+}
// Hardcode channel to 0
// NOTE: LSHR is not available here. LSHR is per family instruction
@@ -1386,11 +1368,12 @@ let usesCustomInserter = 1 in {
let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in {
-def MASK_WRITE : AMDGPUShaderInst <
+def MASK_WRITE : InstR600 <
(outs),
(ins R600_Reg32:$src),
"MASK_WRITE $src",
- []
+ [],
+ NullALU
>;
} // End mayLoad = 0, mayStore = 0, hasSideEffects = 1
@@ -1421,7 +1404,7 @@ def TXD_SHADOW: InstR600 <
// Constant Buffer Addressing Support
//===----------------------------------------------------------------------===//
-let usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in {
+let usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "R600" in {
def CONST_COPY : Instruction {
let OutOperandList = (outs R600_Reg32:$dst);
let InOperandList = (ins i32imm:$src);
@@ -1544,23 +1527,6 @@ let Inst{63-32} = Word1;
//===---------------------------------------------------------------------===//
// Flow and Program control Instructions
//===---------------------------------------------------------------------===//
-class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern>
-: Instruction {
-
- let Namespace = "AMDGPU";
- dag OutOperandList = outs;
- dag InOperandList = ins;
- let Pattern = pattern;
- let AsmString = !strconcat(asmstr, "\n");
- let isPseudo = 1;
- let Itinerary = NullALU;
- bit hasIEEEFlag = 0;
- bit hasZeroOpFlag = 0;
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
- let isCodeGenOnly = 1;
-}
multiclass BranchConditional<SDNode Op, RegisterClass rci, RegisterClass rcf> {
def _i32 : ILFormat<(outs),
@@ -1592,23 +1558,14 @@ multiclass BranchInstr2<string name> {
// Custom Inserter for Branches and returns, this eventually will be a
// separate pass
//===---------------------------------------------------------------------===//
-let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in {
+let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1,
+ Namespace = "R600" in {
def BRANCH : ILFormat<(outs), (ins brtarget:$target),
"; Pseudo unconditional branch instruction",
[(br bb:$target)]>;
defm BRANCH_COND : BranchConditional<IL_brcond, R600_Reg32, R600_Reg32>;
}
-//===---------------------------------------------------------------------===//
-// Return instruction
-//===---------------------------------------------------------------------===//
-let isTerminator = 1, isReturn = 1, hasCtrlDep = 1,
- usesCustomInserter = 1 in {
- def RETURN : ILFormat<(outs), (ins variable_ops),
- "RETURN", [(AMDGPUendpgm)]
- >;
-}
-
//===----------------------------------------------------------------------===//
// Branch Instructions
//===----------------------------------------------------------------------===//
@@ -1738,13 +1695,8 @@ def : R600Pat <
>;
// KIL Patterns
-def KILP : R600Pat <
- (int_AMDGPU_kilp),
- (MASK_WRITE (KILLGT (f32 ONE), (f32 ZERO)))
->;
-
def KIL : R600Pat <
- (int_AMDGPU_kill f32:$src0),
+ (int_r600_kill f32:$src0),
(MASK_WRITE (KILLGT (f32 ZERO), $src0))
>;
diff --git a/lib/Target/AMDGPU/R600Intrinsics.td b/lib/Target/AMDGPU/R600Intrinsics.td
deleted file mode 100644
index 4c9e1e8a5434..000000000000
--- a/lib/Target/AMDGPU/R600Intrinsics.td
+++ /dev/null
@@ -1,67 +0,0 @@
-//===-- R600Intrinsics.td - R600 Instrinsic defs -------*- tablegen -*-----===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// R600 Intrinsic Definitions
-//
-//===----------------------------------------------------------------------===//
-
-class TextureIntrinsicFloatInput : Intrinsic<[llvm_v4f32_ty], [
- llvm_v4f32_ty, // Coord
- llvm_i32_ty, // offset_x
- llvm_i32_ty, // offset_y,
- llvm_i32_ty, // offset_z,
- llvm_i32_ty, // resource_id
- llvm_i32_ty, // samplerid
- llvm_i32_ty, // coord_type_x
- llvm_i32_ty, // coord_type_y
- llvm_i32_ty, // coord_type_z
- llvm_i32_ty], // coord_type_w
- [IntrNoMem]
->;
-
-class TextureIntrinsicInt32Input : Intrinsic<[llvm_v4i32_ty], [
- llvm_v4i32_ty, // Coord
- llvm_i32_ty, // offset_x
- llvm_i32_ty, // offset_y,
- llvm_i32_ty, // offset_z,
- llvm_i32_ty, // resource_id
- llvm_i32_ty, // samplerid
- llvm_i32_ty, // coord_type_x
- llvm_i32_ty, // coord_type_y
- llvm_i32_ty, // coord_type_z
- llvm_i32_ty], // coord_type_w
- [IntrNoMem]
->;
-
-let TargetPrefix = "r600", isTarget = 1 in {
-
-def int_r600_store_swizzle :
- Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []
->;
-
-def int_r600_store_stream_output : Intrinsic<
- [], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []
->;
-
-def int_r600_tex : TextureIntrinsicFloatInput;
-def int_r600_texc : TextureIntrinsicFloatInput;
-def int_r600_txl : TextureIntrinsicFloatInput;
-def int_r600_txlc : TextureIntrinsicFloatInput;
-def int_r600_txb : TextureIntrinsicFloatInput;
-def int_r600_txbc : TextureIntrinsicFloatInput;
-def int_r600_txf : TextureIntrinsicInt32Input;
-def int_r600_txq : TextureIntrinsicInt32Input;
-def int_r600_ddx : TextureIntrinsicFloatInput;
-def int_r600_ddy : TextureIntrinsicFloatInput;
-
-def int_r600_dot4 : Intrinsic<[llvm_float_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable]
->;
-
-} // End TargetPrefix = "r600", isTarget = 1
diff --git a/lib/Target/AMDGPU/R600MachineScheduler.cpp b/lib/Target/AMDGPU/R600MachineScheduler.cpp
index a7e540f9d14d..a1429a2ac50f 100644
--- a/lib/Target/AMDGPU/R600MachineScheduler.cpp
+++ b/lib/Target/AMDGPU/R600MachineScheduler.cpp
@@ -8,13 +8,14 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief R600 Machine Scheduler interface
+/// R600 Machine Scheduler interface
//
//===----------------------------------------------------------------------===//
#include "R600MachineScheduler.h"
#include "AMDGPUSubtarget.h"
#include "R600InstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Pass.h"
@@ -78,7 +79,7 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
AllowSwitchFromAlu = true;
} else {
unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
- DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" );
+ LLVM_DEBUG(dbgs() << NeededWF << " approx. Wavefronts Required\n");
// We assume the local GPR requirements to be "dominated" by the requirement
// of the TEX clause (which consumes 128 bits regs) ; ALU inst before and
// after TEX are indeed likely to consume or generate values from/for the
@@ -124,26 +125,24 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
NextInstKind = IDOther;
}
- DEBUG(
- if (SU) {
- dbgs() << " ** Pick node **\n";
- SU->dump(DAG);
- } else {
- dbgs() << "NO NODE \n";
- for (unsigned i = 0; i < DAG->SUnits.size(); i++) {
- const SUnit &S = DAG->SUnits[i];
- if (!S.isScheduled)
- S.dump(DAG);
- }
- }
- );
+ LLVM_DEBUG(if (SU) {
+ dbgs() << " ** Pick node **\n";
+ SU->dump(DAG);
+ } else {
+ dbgs() << "NO NODE \n";
+ for (unsigned i = 0; i < DAG->SUnits.size(); i++) {
+ const SUnit &S = DAG->SUnits[i];
+ if (!S.isScheduled)
+ S.dump(DAG);
+ }
+ });
return SU;
}
void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
if (NextInstKind != CurInstKind) {
- DEBUG(dbgs() << "Instruction Type Switch\n");
+ LLVM_DEBUG(dbgs() << "Instruction Type Switch\n");
if (NextInstKind != IDAlu)
OccupedSlotsMask |= 31;
CurEmitted = 0;
@@ -163,7 +162,7 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(),
E = SU->getInstr()->operands_end(); It != E; ++It) {
MachineOperand &MO = *It;
- if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X)
+ if (MO.isReg() && MO.getReg() == R600::ALU_LITERAL_X)
++CurEmitted;
}
}
@@ -172,8 +171,7 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
++CurEmitted;
}
-
- DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");
+ LLVM_DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");
if (CurInstKind != IDFetch) {
MoveUnits(Pending[IDFetch], Available[IDFetch]);
@@ -183,18 +181,18 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
static bool
isPhysicalRegCopy(MachineInstr *MI) {
- if (MI->getOpcode() != AMDGPU::COPY)
+ if (MI->getOpcode() != R600::COPY)
return false;
return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg());
}
void R600SchedStrategy::releaseTopNode(SUnit *SU) {
- DEBUG(dbgs() << "Top Releasing ";SU->dump(DAG););
+ LLVM_DEBUG(dbgs() << "Top Releasing "; SU->dump(DAG););
}
void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
- DEBUG(dbgs() << "Bottom Releasing ";SU->dump(DAG););
+ LLVM_DEBUG(dbgs() << "Bottom Releasing "; SU->dump(DAG););
if (isPhysicalRegCopy(SU->getInstr())) {
PhysicalRegCopy.push_back(SU);
return;
@@ -226,14 +224,14 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
return AluTrans;
switch (MI->getOpcode()) {
- case AMDGPU::PRED_X:
+ case R600::PRED_X:
return AluPredX;
- case AMDGPU::INTERP_PAIR_XY:
- case AMDGPU::INTERP_PAIR_ZW:
- case AMDGPU::INTERP_VEC_LOAD:
- case AMDGPU::DOT_4:
+ case R600::INTERP_PAIR_XY:
+ case R600::INTERP_PAIR_ZW:
+ case R600::INTERP_VEC_LOAD:
+ case R600::DOT_4:
return AluT_XYZW;
- case AMDGPU::COPY:
+ case R600::COPY:
if (MI->getOperand(1).isUndef()) {
// MI will become a KILL, don't considers it in scheduling
return AluDiscarded;
@@ -248,7 +246,7 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
if(TII->isVector(*MI) ||
TII->isCubeOp(MI->getOpcode()) ||
TII->isReductionOp(MI->getOpcode()) ||
- MI->getOpcode() == AMDGPU::GROUP_BARRIER) {
+ MI->getOpcode() == R600::GROUP_BARRIER) {
return AluT_XYZW;
}
@@ -259,13 +257,13 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
// Is the result already assigned to a channel ?
unsigned DestSubReg = MI->getOperand(0).getSubReg();
switch (DestSubReg) {
- case AMDGPU::sub0:
+ case R600::sub0:
return AluT_X;
- case AMDGPU::sub1:
+ case R600::sub1:
return AluT_Y;
- case AMDGPU::sub2:
+ case R600::sub2:
return AluT_Z;
- case AMDGPU::sub3:
+ case R600::sub3:
return AluT_W;
default:
break;
@@ -273,16 +271,16 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
// Is the result already member of a X/Y/Z/W class ?
unsigned DestReg = MI->getOperand(0).getReg();
- if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) ||
- regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass))
+ if (regBelongsToClass(DestReg, &R600::R600_TReg32_XRegClass) ||
+ regBelongsToClass(DestReg, &R600::R600_AddrRegClass))
return AluT_X;
- if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass))
+ if (regBelongsToClass(DestReg, &R600::R600_TReg32_YRegClass))
return AluT_Y;
- if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass))
+ if (regBelongsToClass(DestReg, &R600::R600_TReg32_ZRegClass))
return AluT_Z;
- if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass))
+ if (regBelongsToClass(DestReg, &R600::R600_TReg32_WRegClass))
return AluT_W;
- if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass))
+ if (regBelongsToClass(DestReg, &R600::R600_Reg128RegClass))
return AluT_XYZW;
// LDS src registers cannot be used in the Trans slot.
@@ -303,13 +301,13 @@ int R600SchedStrategy::getInstKind(SUnit* SU) {
}
switch (Opcode) {
- case AMDGPU::PRED_X:
- case AMDGPU::COPY:
- case AMDGPU::CONST_COPY:
- case AMDGPU::INTERP_PAIR_XY:
- case AMDGPU::INTERP_PAIR_ZW:
- case AMDGPU::INTERP_VEC_LOAD:
- case AMDGPU::DOT_4:
+ case R600::PRED_X:
+ case R600::COPY:
+ case R600::CONST_COPY:
+ case R600::INTERP_PAIR_XY:
+ case R600::INTERP_PAIR_ZW:
+ case R600::INTERP_VEC_LOAD:
+ case R600::DOT_4:
return IDAlu;
default:
return IDOther;
@@ -345,17 +343,17 @@ void R600SchedStrategy::LoadAlu() {
}
void R600SchedStrategy::PrepareNextSlot() {
- DEBUG(dbgs() << "New Slot\n");
+ LLVM_DEBUG(dbgs() << "New Slot\n");
assert (OccupedSlotsMask && "Slot wasn't filled");
OccupedSlotsMask = 0;
-// if (HwGen == R600Subtarget::NORTHERN_ISLANDS)
+// if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS)
// OccupedSlotsMask |= 16;
InstructionsGroupCandidate.clear();
LoadAlu();
}
void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
- int DstIndex = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
+ int DstIndex = TII->getOperandIdx(MI->getOpcode(), R600::OpName::dst);
if (DstIndex == -1) {
return;
}
@@ -372,16 +370,16 @@ void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
// Constrains the regclass of DestReg to assign it to Slot
switch (Slot) {
case 0:
- MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_XRegClass);
+ MRI->constrainRegClass(DestReg, &R600::R600_TReg32_XRegClass);
break;
case 1:
- MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_YRegClass);
+ MRI->constrainRegClass(DestReg, &R600::R600_TReg32_YRegClass);
break;
case 2:
- MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass);
+ MRI->constrainRegClass(DestReg, &R600::R600_TReg32_ZRegClass);
break;
case 3:
- MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_WRegClass);
+ MRI->constrainRegClass(DestReg, &R600::R600_TReg32_WRegClass);
break;
}
}
@@ -461,7 +459,7 @@ SUnit* R600SchedStrategy::pickOther(int QID) {
}
if (!AQ.empty()) {
SU = AQ.back();
- AQ.resize(AQ.size() - 1);
+ AQ.pop_back();
}
return SU;
}
diff --git a/lib/Target/AMDGPU/R600MachineScheduler.h b/lib/Target/AMDGPU/R600MachineScheduler.h
index 9a6770570477..8a9a8d3d1e23 100644
--- a/lib/Target/AMDGPU/R600MachineScheduler.h
+++ b/lib/Target/AMDGPU/R600MachineScheduler.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief R600 Machine Scheduler interface
+/// R600 Machine Scheduler interface
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp b/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp
index cd71f19760b9..7de5e2c9577d 100644
--- a/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp
+++ b/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp
@@ -1,4 +1,4 @@
-//===- AMDGPUOpenCLImageTypeLoweringPass.cpp ------------------------------===//
+//===- R600OpenCLImageTypeLoweringPass.cpp ------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -153,7 +153,7 @@ PushArgMD(KernelArgMD &MD, const MDVector &V) {
namespace {
-class AMDGPUOpenCLImageTypeLoweringPass : public ModulePass {
+class R600OpenCLImageTypeLoweringPass : public ModulePass {
static char ID;
LLVMContext *Context;
@@ -364,7 +364,7 @@ class AMDGPUOpenCLImageTypeLoweringPass : public ModulePass {
}
public:
- AMDGPUOpenCLImageTypeLoweringPass() : ModulePass(ID) {}
+ R600OpenCLImageTypeLoweringPass() : ModulePass(ID) {}
bool runOnModule(Module &M) override {
Context = &M.getContext();
@@ -376,14 +376,14 @@ public:
}
StringRef getPassName() const override {
- return "AMDGPU OpenCL Image Type Pass";
+ return "R600 OpenCL Image Type Pass";
}
};
} // end anonymous namespace
-char AMDGPUOpenCLImageTypeLoweringPass::ID = 0;
+char R600OpenCLImageTypeLoweringPass::ID = 0;
-ModulePass *llvm::createAMDGPUOpenCLImageTypeLoweringPass() {
- return new AMDGPUOpenCLImageTypeLoweringPass();
+ModulePass *llvm::createR600OpenCLImageTypeLoweringPass() {
+ return new R600OpenCLImageTypeLoweringPass();
}
diff --git a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
index 4a14d95f1cc4..692451cb8fe0 100644
--- a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
+++ b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -31,6 +31,7 @@
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
@@ -78,7 +79,7 @@ public:
std::vector<unsigned> UndefReg;
RegSeqInfo(MachineRegisterInfo &MRI, MachineInstr *MI) : Instr(MI) {
- assert(MI->getOpcode() == AMDGPU::REG_SEQUENCE);
+ assert(MI->getOpcode() == R600::REG_SEQUENCE);
for (unsigned i = 1, e = Instr->getNumOperands(); i < e; i+=2) {
MachineOperand &MO = Instr->getOperand(i);
unsigned Chan = Instr->getOperand(i + 1).getImm();
@@ -158,8 +159,8 @@ bool R600VectorRegMerger::canSwizzle(const MachineInstr &MI)
if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST)
return true;
switch (MI.getOpcode()) {
- case AMDGPU::R600_ExportSwz:
- case AMDGPU::EG_ExportSwz:
+ case R600::R600_ExportSwz:
+ case R600::EG_ExportSwz:
return true;
default:
return false;
@@ -212,12 +213,12 @@ MachineInstr *R600VectorRegMerger::RebuildVector(
std::vector<unsigned> UpdatedUndef = BaseRSI->UndefReg;
for (DenseMap<unsigned, unsigned>::iterator It = RSI->RegToChan.begin(),
E = RSI->RegToChan.end(); It != E; ++It) {
- unsigned DstReg = MRI->createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
+ unsigned DstReg = MRI->createVirtualRegister(&R600::R600_Reg128RegClass);
unsigned SubReg = (*It).first;
unsigned Swizzle = (*It).second;
unsigned Chan = getReassignedChan(RemapChan, Swizzle);
- MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::INSERT_SUBREG),
+ MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(R600::INSERT_SUBREG),
DstReg)
.addReg(SrcVec)
.addReg(SubReg)
@@ -228,20 +229,20 @@ MachineInstr *R600VectorRegMerger::RebuildVector(
UpdatedUndef.erase(ChanPos);
assert(!is_contained(UpdatedUndef, Chan) &&
"UpdatedUndef shouldn't contain Chan more than once!");
- DEBUG(dbgs() << " ->"; Tmp->dump(););
+ LLVM_DEBUG(dbgs() << " ->"; Tmp->dump(););
(void)Tmp;
SrcVec = DstReg;
}
MachineInstr *NewMI =
- BuildMI(MBB, Pos, DL, TII->get(AMDGPU::COPY), Reg).addReg(SrcVec);
- DEBUG(dbgs() << " ->"; NewMI->dump(););
+ BuildMI(MBB, Pos, DL, TII->get(R600::COPY), Reg).addReg(SrcVec);
+ LLVM_DEBUG(dbgs() << " ->"; NewMI->dump(););
- DEBUG(dbgs() << " Updating Swizzle:\n");
+ LLVM_DEBUG(dbgs() << " Updating Swizzle:\n");
for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg),
E = MRI->use_instr_end(); It != E; ++It) {
- DEBUG(dbgs() << " ";(*It).dump(); dbgs() << " ->");
+ LLVM_DEBUG(dbgs() << " "; (*It).dump(); dbgs() << " ->");
SwizzleInput(*It, RemapChan);
- DEBUG((*It).dump());
+ LLVM_DEBUG((*It).dump());
}
RSI->Instr->eraseFromParent();
@@ -353,7 +354,7 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
for (MachineBasicBlock::iterator MII = MB->begin(), MIIE = MB->end();
MII != MIIE; ++MII) {
MachineInstr &MI = *MII;
- if (MI.getOpcode() != AMDGPU::REG_SEQUENCE) {
+ if (MI.getOpcode() != R600::REG_SEQUENCE) {
if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) {
unsigned Reg = MI.getOperand(1).getReg();
for (MachineRegisterInfo::def_instr_iterator
@@ -372,14 +373,14 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
if (!areAllUsesSwizzeable(Reg))
continue;
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Trying to optimize ";
MI.dump();
});
RegSeqInfo CandidateRSI;
std::vector<std::pair<unsigned, unsigned>> RemapChan;
- DEBUG(dbgs() << "Using common slots...\n";);
+ LLVM_DEBUG(dbgs() << "Using common slots...\n";);
if (tryMergeUsingCommonSlot(RSI, CandidateRSI, RemapChan)) {
// Remove CandidateRSI mapping
RemoveMI(CandidateRSI.Instr);
@@ -387,7 +388,7 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
trackRSI(RSI);
continue;
}
- DEBUG(dbgs() << "Using free slots...\n";);
+ LLVM_DEBUG(dbgs() << "Using free slots...\n";);
RemapChan.clear();
if (tryMergeUsingFreeSlot(RSI, CandidateRSI, RemapChan)) {
RemoveMI(CandidateRSI.Instr);
diff --git a/lib/Target/AMDGPU/R600Packetizer.cpp b/lib/Target/AMDGPU/R600Packetizer.cpp
index 7340318d2d88..612c62b514fd 100644
--- a/lib/Target/AMDGPU/R600Packetizer.cpp
+++ b/lib/Target/AMDGPU/R600Packetizer.cpp
@@ -17,6 +17,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "R600InstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/DFAPacketizer.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -83,39 +84,39 @@ private:
LastDstChan = BISlot;
if (TII->isPredicated(*BI))
continue;
- int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write);
+ int OperandIdx = TII->getOperandIdx(BI->getOpcode(), R600::OpName::write);
if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0)
continue;
- int DstIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::dst);
+ int DstIdx = TII->getOperandIdx(BI->getOpcode(), R600::OpName::dst);
if (DstIdx == -1) {
continue;
}
unsigned Dst = BI->getOperand(DstIdx).getReg();
if (isTrans || TII->isTransOnly(*BI)) {
- Result[Dst] = AMDGPU::PS;
+ Result[Dst] = R600::PS;
continue;
}
- if (BI->getOpcode() == AMDGPU::DOT4_r600 ||
- BI->getOpcode() == AMDGPU::DOT4_eg) {
- Result[Dst] = AMDGPU::PV_X;
+ if (BI->getOpcode() == R600::DOT4_r600 ||
+ BI->getOpcode() == R600::DOT4_eg) {
+ Result[Dst] = R600::PV_X;
continue;
}
- if (Dst == AMDGPU::OQAP) {
+ if (Dst == R600::OQAP) {
continue;
}
unsigned PVReg = 0;
switch (TRI.getHWRegChan(Dst)) {
case 0:
- PVReg = AMDGPU::PV_X;
+ PVReg = R600::PV_X;
break;
case 1:
- PVReg = AMDGPU::PV_Y;
+ PVReg = R600::PV_Y;
break;
case 2:
- PVReg = AMDGPU::PV_Z;
+ PVReg = R600::PV_Z;
break;
case 3:
- PVReg = AMDGPU::PV_W;
+ PVReg = R600::PV_W;
break;
default:
llvm_unreachable("Invalid Chan");
@@ -128,9 +129,9 @@ private:
void substitutePV(MachineInstr &MI, const DenseMap<unsigned, unsigned> &PVs)
const {
unsigned Ops[] = {
- AMDGPU::OpName::src0,
- AMDGPU::OpName::src1,
- AMDGPU::OpName::src2
+ R600::OpName::src0,
+ R600::OpName::src1,
+ R600::OpName::src2
};
for (unsigned i = 0; i < 3; i++) {
int OperandIdx = TII->getOperandIdx(MI.getOpcode(), Ops[i]);
@@ -170,7 +171,7 @@ public:
return true;
if (!TII->isALUInstr(MI.getOpcode()))
return true;
- if (MI.getOpcode() == AMDGPU::GROUP_BARRIER)
+ if (MI.getOpcode() == R600::GROUP_BARRIER)
return true;
// XXX: This can be removed once the packetizer properly handles all the
// LDS instruction group restrictions.
@@ -184,8 +185,8 @@ public:
if (getSlot(*MII) == getSlot(*MIJ))
ConsideredInstUsesAlreadyWrittenVectorElement = true;
// Do MII and MIJ share the same pred_sel?
- int OpI = TII->getOperandIdx(MII->getOpcode(), AMDGPU::OpName::pred_sel),
- OpJ = TII->getOperandIdx(MIJ->getOpcode(), AMDGPU::OpName::pred_sel);
+ int OpI = TII->getOperandIdx(MII->getOpcode(), R600::OpName::pred_sel),
+ OpJ = TII->getOperandIdx(MIJ->getOpcode(), R600::OpName::pred_sel);
unsigned PredI = (OpI > -1)?MII->getOperand(OpI).getReg():0,
PredJ = (OpJ > -1)?MIJ->getOperand(OpJ).getReg():0;
if (PredI != PredJ)
@@ -219,7 +220,7 @@ public:
}
void setIsLastBit(MachineInstr *MI, unsigned Bit) const {
- unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::last);
+ unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), R600::OpName::last);
MI->getOperand(LastOp).setImm(Bit);
}
@@ -236,7 +237,7 @@ public:
if (ConsideredInstUsesAlreadyWrittenVectorElement &&
!TII->isVectorOnly(MI) && VLIW5) {
isTransSlot = true;
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Considering as Trans Inst :";
MI.dump();
});
@@ -249,7 +250,7 @@ public:
// Are the Constants limitations met ?
CurrentPacketMIs.push_back(&MI);
if (!TII->fitsConstReadLimitations(CurrentPacketMIs)) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Couldn't pack :\n";
MI.dump();
dbgs() << "with the following packets :\n";
@@ -266,7 +267,7 @@ public:
// Is there a BankSwizzle set that meet Read Port limitations ?
if (!TII->fitsReadPortLimitations(CurrentPacketMIs,
PV, BS, isTransSlot)) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Couldn't pack :\n";
MI.dump();
dbgs() << "with the following packets :\n";
@@ -300,11 +301,11 @@ public:
for (unsigned i = 0, e = CurrentPacketMIs.size(); i < e; i++) {
MachineInstr *MI = CurrentPacketMIs[i];
unsigned Op = TII->getOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::bank_swizzle);
+ R600::OpName::bank_swizzle);
MI->getOperand(Op).setImm(BS[i]);
}
unsigned Op =
- TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::bank_swizzle);
+ TII->getOperandIdx(MI.getOpcode(), R600::OpName::bank_swizzle);
MI.getOperand(Op).setImm(BS.back());
if (!CurrentPacketMIs.empty())
setIsLastBit(CurrentPacketMIs.back(), 0);
@@ -333,6 +334,7 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
// DFA state table should not be empty.
assert(Packetizer.getResourceTracker() && "Empty DFA table!");
+ assert(Packetizer.getResourceTracker()->getInstrItins());
if (Packetizer.getResourceTracker()->getInstrItins()->isEmpty())
return false;
@@ -352,8 +354,8 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
MachineBasicBlock::iterator End = MBB->end();
MachineBasicBlock::iterator MI = MBB->begin();
while (MI != End) {
- if (MI->isKill() || MI->getOpcode() == AMDGPU::IMPLICIT_DEF ||
- (MI->getOpcode() == AMDGPU::CF_ALU && !MI->getOperand(8).getImm())) {
+ if (MI->isKill() || MI->getOpcode() == R600::IMPLICIT_DEF ||
+ (MI->getOpcode() == R600::CF_ALU && !MI->getOperand(8).getImm())) {
MachineBasicBlock::iterator DeleteMI = MI;
++MI;
MBB->erase(DeleteMI);
diff --git a/lib/Target/AMDGPU/R600Processors.td b/lib/Target/AMDGPU/R600Processors.td
index 89194dc1bdf6..f39b3dc1bfd4 100644
--- a/lib/Target/AMDGPU/R600Processors.td
+++ b/lib/Target/AMDGPU/R600Processors.td
@@ -7,6 +7,62 @@
//
//===----------------------------------------------------------------------===//
+class SubtargetFeatureFetchLimit <string Value> :
+ SubtargetFeature <"fetch"#Value,
+ "TexVTXClauseSize",
+ Value,
+ "Limit the maximum number of fetches in a clause to "#Value
+>;
+
+def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst",
+ "R600ALUInst",
+ "false",
+ "Older version of ALU instructions encoding"
+>;
+
+def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">;
+def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">;
+
+def FeatureVertexCache : SubtargetFeature<"HasVertexCache",
+ "HasVertexCache",
+ "true",
+ "Specify use of dedicated vertex cache"
+>;
+
+def FeatureCaymanISA : SubtargetFeature<"caymanISA",
+ "CaymanISA",
+ "true",
+ "Use Cayman ISA"
+>;
+
+def FeatureCFALUBug : SubtargetFeature<"cfalubug",
+ "CFALUBug",
+ "true",
+ "GPU has CF_ALU bug"
+>;
+
+class R600SubtargetFeatureGeneration <string Value,
+ list<SubtargetFeature> Implies> :
+ SubtargetFeatureGeneration <Value, "R600Subtarget", Implies>;
+
+def FeatureR600 : R600SubtargetFeatureGeneration<"R600",
+ [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]
+>;
+
+def FeatureR700 : R600SubtargetFeatureGeneration<"R700",
+ [FeatureFetchLimit16, FeatureLocalMemorySize0]
+>;
+
+def FeatureEvergreen : R600SubtargetFeatureGeneration<"EVERGREEN",
+ [FeatureFetchLimit16, FeatureLocalMemorySize32768]
+>;
+
+def FeatureNorthernIslands : R600SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
+ [FeatureFetchLimit16, FeatureWavefrontSize64,
+ FeatureLocalMemorySize32768]
+>;
+
+
//===----------------------------------------------------------------------===//
// Radeon HD 2000/3000 Series (R600).
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/R600RegisterInfo.cpp b/lib/Target/AMDGPU/R600RegisterInfo.cpp
index 7501facb0cba..38933e7616a0 100644
--- a/lib/Target/AMDGPU/R600RegisterInfo.cpp
+++ b/lib/Target/AMDGPU/R600RegisterInfo.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief R600 implementation of the TargetRegisterInfo class.
+/// R600 implementation of the TargetRegisterInfo class.
//
//===----------------------------------------------------------------------===//
@@ -17,47 +17,51 @@
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
using namespace llvm;
-R600RegisterInfo::R600RegisterInfo() : AMDGPURegisterInfo() {
+R600RegisterInfo::R600RegisterInfo() : R600GenRegisterInfo(0) {
RCW.RegWeight = 0;
RCW.WeightLimit = 0;
}
+#define GET_REGINFO_TARGET_DESC
+#include "R600GenRegisterInfo.inc"
+
BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
const R600InstrInfo *TII = ST.getInstrInfo();
- Reserved.set(AMDGPU::ZERO);
- Reserved.set(AMDGPU::HALF);
- Reserved.set(AMDGPU::ONE);
- Reserved.set(AMDGPU::ONE_INT);
- Reserved.set(AMDGPU::NEG_HALF);
- Reserved.set(AMDGPU::NEG_ONE);
- Reserved.set(AMDGPU::PV_X);
- Reserved.set(AMDGPU::ALU_LITERAL_X);
- Reserved.set(AMDGPU::ALU_CONST);
- Reserved.set(AMDGPU::PREDICATE_BIT);
- Reserved.set(AMDGPU::PRED_SEL_OFF);
- Reserved.set(AMDGPU::PRED_SEL_ZERO);
- Reserved.set(AMDGPU::PRED_SEL_ONE);
- Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
-
- for (TargetRegisterClass::iterator I = AMDGPU::R600_AddrRegClass.begin(),
- E = AMDGPU::R600_AddrRegClass.end(); I != E; ++I) {
- Reserved.set(*I);
+ reserveRegisterTuples(Reserved, R600::ZERO);
+ reserveRegisterTuples(Reserved, R600::HALF);
+ reserveRegisterTuples(Reserved, R600::ONE);
+ reserveRegisterTuples(Reserved, R600::ONE_INT);
+ reserveRegisterTuples(Reserved, R600::NEG_HALF);
+ reserveRegisterTuples(Reserved, R600::NEG_ONE);
+ reserveRegisterTuples(Reserved, R600::PV_X);
+ reserveRegisterTuples(Reserved, R600::ALU_LITERAL_X);
+ reserveRegisterTuples(Reserved, R600::ALU_CONST);
+ reserveRegisterTuples(Reserved, R600::PREDICATE_BIT);
+ reserveRegisterTuples(Reserved, R600::PRED_SEL_OFF);
+ reserveRegisterTuples(Reserved, R600::PRED_SEL_ZERO);
+ reserveRegisterTuples(Reserved, R600::PRED_SEL_ONE);
+ reserveRegisterTuples(Reserved, R600::INDIRECT_BASE_ADDR);
+
+ for (TargetRegisterClass::iterator I = R600::R600_AddrRegClass.begin(),
+ E = R600::R600_AddrRegClass.end(); I != E; ++I) {
+ reserveRegisterTuples(Reserved, *I);
}
- TII->reserveIndirectRegisters(Reserved, MF);
+ TII->reserveIndirectRegisters(Reserved, MF, *this);
return Reserved;
}
// Dummy to not crash RegisterClassInfo.
-static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister;
+static const MCPhysReg CalleeSavedReg = R600::NoRegister;
const MCPhysReg *R600RegisterInfo::getCalleeSavedRegs(
const MachineFunction *) const {
@@ -65,7 +69,7 @@ const MCPhysReg *R600RegisterInfo::getCalleeSavedRegs(
}
unsigned R600RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
- return AMDGPU::NoRegister;
+ return R600::NoRegister;
}
unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const {
@@ -80,7 +84,7 @@ const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass(
MVT VT) const {
switch(VT.SimpleTy) {
default:
- case MVT::i32: return &AMDGPU::R600_TReg32RegClass;
+ case MVT::i32: return &R600::R600_TReg32RegClass;
}
}
@@ -93,9 +97,9 @@ bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const {
assert(!TargetRegisterInfo::isVirtualRegister(Reg));
switch (Reg) {
- case AMDGPU::OQAP:
- case AMDGPU::OQBP:
- case AMDGPU::AR_X:
+ case R600::OQAP:
+ case R600::OQBP:
+ case R600::AR_X:
return false;
default:
return true;
@@ -108,3 +112,10 @@ void R600RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
RegScavenger *RS) const {
llvm_unreachable("Subroutines not supported yet");
}
+
+void R600RegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const {
+ MCRegAliasIterator R(Reg, this, true);
+
+ for (; R.isValid(); ++R)
+ Reserved.set(*R);
+}
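The new reserveRegisterTuples() helper walks MCRegAliasIterator with IncludeSelf set, so reserving a register also reserves every register that aliases it, including the wider tuples it belongs to. A hedged sketch of the same walk used as a query (the helper name is illustrative, not part of the patch):

  #include "llvm/ADT/BitVector.h"
  #include "llvm/CodeGen/TargetRegisterInfo.h"

  // Returns true if Reg or any register aliasing it is already reserved.
  static bool anyAliasReserved(const llvm::BitVector &Reserved, unsigned Reg,
                               const llvm::TargetRegisterInfo *TRI) {
    for (llvm::MCRegAliasIterator R(Reg, TRI, /*IncludeSelf=*/true); R.isValid(); ++R)
      if (Reserved.test(*R))
        return true;
    return false;
  }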
diff --git a/lib/Target/AMDGPU/R600RegisterInfo.h b/lib/Target/AMDGPU/R600RegisterInfo.h
index f0d9644b02f2..c4c77172b299 100644
--- a/lib/Target/AMDGPU/R600RegisterInfo.h
+++ b/lib/Target/AMDGPU/R600RegisterInfo.h
@@ -8,20 +8,19 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Interface definition for R600RegisterInfo
+/// Interface definition for R600RegisterInfo
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AMDGPU_R600REGISTERINFO_H
#define LLVM_LIB_TARGET_AMDGPU_R600REGISTERINFO_H
-#include "AMDGPURegisterInfo.h"
+#define GET_REGINFO_HEADER
+#include "R600GenRegisterInfo.inc"
namespace llvm {
-class AMDGPUSubtarget;
-
-struct R600RegisterInfo final : public AMDGPURegisterInfo {
+struct R600RegisterInfo final : public R600GenRegisterInfo {
RegClassWeight RCW;
R600RegisterInfo();
@@ -30,12 +29,12 @@ struct R600RegisterInfo final : public AMDGPURegisterInfo {
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
unsigned getFrameRegister(const MachineFunction &MF) const override;
- /// \brief get the HW encoding for a register's channel.
+ /// get the HW encoding for a register's channel.
unsigned getHWRegChan(unsigned reg) const;
unsigned getHWRegIndex(unsigned Reg) const;
- /// \brief get the register class of the specified type to use in the
+ /// get the register class of the specified type to use in the
/// CFGStructurizer
const TargetRegisterClass *getCFGStructurizerRegClass(MVT VT) const;
@@ -49,6 +48,8 @@ struct R600RegisterInfo final : public AMDGPURegisterInfo {
void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
unsigned FIOperandNum,
RegScavenger *RS = nullptr) const override;
+
+ void reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const;
};
} // End namespace llvm
diff --git a/lib/Target/AMDGPU/R600RegisterInfo.td b/lib/Target/AMDGPU/R600RegisterInfo.td
index 84ab328bdb2b..02164b74a01b 100644
--- a/lib/Target/AMDGPU/R600RegisterInfo.td
+++ b/lib/Target/AMDGPU/R600RegisterInfo.td
@@ -245,7 +245,7 @@ def R600_Reg128Vertical : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
(add V0123_W, V0123_Z, V0123_Y, V0123_X)
>;
-def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
+def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32, i64, f64], 64,
(add (sequence "T%u_XY", 0, 63))>;
def R600_Reg64Vertical : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index 150d8c3dc3d3..74f1bd8fb986 100644
--- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -18,6 +18,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
@@ -37,7 +38,6 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <utility>
@@ -133,7 +133,7 @@ INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE,
char SIAnnotateControlFlow::ID = 0;
-/// \brief Initialize all the types and constants used in the pass
+/// Initialize all the types and constants used in the pass
bool SIAnnotateControlFlow::doInitialization(Module &M) {
LLVMContext &Context = M.getContext();
@@ -157,29 +157,29 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) {
return false;
}
-/// \brief Is the branch condition uniform or did the StructurizeCFG pass
+/// Is the branch condition uniform or did the StructurizeCFG pass
/// consider it as such?
bool SIAnnotateControlFlow::isUniform(BranchInst *T) {
return DA->isUniform(T->getCondition()) ||
T->getMetadata("structurizecfg.uniform") != nullptr;
}
-/// \brief Is BB the last block saved on the stack ?
+/// Is BB the last block saved on the stack ?
bool SIAnnotateControlFlow::isTopOfStack(BasicBlock *BB) {
return !Stack.empty() && Stack.back().first == BB;
}
-/// \brief Pop the last saved value from the control flow stack
+/// Pop the last saved value from the control flow stack
Value *SIAnnotateControlFlow::popSaved() {
return Stack.pop_back_val().second;
}
-/// \brief Push a BB and saved value to the control flow stack
+/// Push a BB and saved value to the control flow stack
void SIAnnotateControlFlow::push(BasicBlock *BB, Value *Saved) {
Stack.push_back(std::make_pair(BB, Saved));
}
-/// \brief Can the condition represented by this PHI node treated like
+/// Can the condition represented by this PHI node be treated like
/// an "Else" block?
bool SIAnnotateControlFlow::isElse(PHINode *Phi) {
BasicBlock *IDom = DT->getNode(Phi->getParent())->getIDom()->getBlock();
@@ -198,14 +198,14 @@ bool SIAnnotateControlFlow::isElse(PHINode *Phi) {
return true;
}
-// \brief Erase "Phi" if it is not used any more
+// Erase "Phi" if it is not used any more
void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {
if (RecursivelyDeleteDeadPHINode(Phi)) {
- DEBUG(dbgs() << "Erased unused condition phi\n");
+ LLVM_DEBUG(dbgs() << "Erased unused condition phi\n");
}
}
-/// \brief Open a new "If" block
+/// Open a new "If" block
void SIAnnotateControlFlow::openIf(BranchInst *Term) {
if (isUniform(Term))
return;
@@ -215,7 +215,7 @@ void SIAnnotateControlFlow::openIf(BranchInst *Term) {
push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
}
-/// \brief Close the last "If" block and open a new "Else" block
+/// Close the last "If" block and open a new "Else" block
void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
if (isUniform(Term)) {
return;
@@ -225,7 +225,7 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
}
-/// \brief Recursively handle the condition leading to a loop
+/// Recursively handle the condition leading to a loop
Value *SIAnnotateControlFlow::handleLoopCondition(
Value *Cond, PHINode *Broken, llvm::Loop *L, BranchInst *Term,
SmallVectorImpl<WeakTrackingVH> &LoopPhiConditions) {
@@ -322,7 +322,7 @@ Value *SIAnnotateControlFlow::handleLoopCondition(
llvm_unreachable("Unhandled loop condition!");
}
-/// \brief Handle a back edge (loop)
+/// Handle a back edge (loop)
void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
if (isUniform(Term))
return;
@@ -353,7 +353,7 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
push(Term->getSuccessor(0), Arg);
}
-/// \brief Close the last opened control flow
+/// Close the last opened control flow
void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
llvm::Loop *L = LI->getLoopFor(BB);
@@ -381,7 +381,7 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
CallInst::Create(EndCf, Exec, "", FirstInsertionPt);
}
-/// \brief Annotate the control flow with intrinsics so the backend can
+/// Annotate the control flow with intrinsics so the backend can
/// recognize if/then/else and loops.
bool SIAnnotateControlFlow::runOnFunction(Function &F) {
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
@@ -422,11 +422,15 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) {
openIf(Term);
}
- assert(Stack.empty());
+ if (!Stack.empty()) {
+ // CFG was probably not structured.
+ report_fatal_error("failed to annotate CFG");
+ }
+
return true;
}
-/// \brief Create the annotation pass
+/// Create the annotation pass
FunctionPass *llvm::createSIAnnotateControlFlowPass() {
return new SIAnnotateControlFlow();
}
diff --git a/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp b/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
index b5c439b21b89..7e884ad93a23 100644
--- a/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
+++ b/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Inserts one nop instruction for each high level source statement for
+/// Inserts one nop instruction for each high level source statement for
/// debugger usage.
///
/// Tools, such as a debugger, need to pause execution based on user input (i.e.
@@ -21,6 +21,7 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -62,7 +63,7 @@ FunctionPass *llvm::createSIDebuggerInsertNopsPass() {
bool SIDebuggerInsertNops::runOnMachineFunction(MachineFunction &MF) {
// Skip this pass if "amdgpu-debugger-insert-nops" attribute was not
// specified.
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (!ST.debuggerInsertNops())
return false;
@@ -78,8 +79,8 @@ bool SIDebuggerInsertNops::runOnMachineFunction(MachineFunction &MF) {
for (auto &MBB : MF) {
for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
- // Skip DBG_VALUE instructions and instructions without location.
- if (MI->isDebugValue() || !MI->getDebugLoc())
+ // Skip debug instructions and instructions without location.
+ if (MI->isDebugInstr() || !MI->getDebugLoc())
continue;
// Insert nop instruction if line number does not have nop inserted.
diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h
index a9f6069e798a..a6d28d6999e5 100644
--- a/lib/Target/AMDGPU/SIDefines.h
+++ b/lib/Target/AMDGPU/SIDefines.h
@@ -85,7 +85,10 @@ enum : uint64_t {
ClampHi = UINT64_C(1) << 48,
// Is a packed VOP3P instruction.
- IsPacked = UINT64_C(1) << 49
+ IsPacked = UINT64_C(1) << 49,
+
+ // Is a D16 buffer instruction.
+ D16Buf = UINT64_C(1) << 50
};
// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
@@ -137,7 +140,6 @@ namespace AMDGPU {
OPERAND_INPUT_MODS,
// Operand for SDWA instructions
- OPERAND_SDWA_SRC,
OPERAND_SDWA_VOPC_DST,
/// Operand with 32-bit immediate that uses the constant bus.
@@ -146,6 +148,13 @@ namespace AMDGPU {
};
}
+namespace SIStackID {
+enum StackTypes : uint8_t {
+ SCRATCH = 0,
+ SGPR_SPILL = 1
+};
+}
+
// Input operand modifiers bit-masks
// NEG and SEXT share same bit-mask because they can't be set simultaneously.
namespace SISrcMods {
@@ -273,8 +282,9 @@ enum Id { // HwRegCode, (6) [5:0]
ID_GPR_ALLOC = 5,
ID_LDS_ALLOC = 6,
ID_IB_STS = 7,
- ID_SYMBOLIC_LAST_ = 8,
ID_MEM_BASES = 15,
+ ID_SYMBOLIC_FIRST_GFX9_ = ID_MEM_BASES,
+ ID_SYMBOLIC_LAST_ = 16,
ID_SHIFT_ = 0,
ID_WIDTH_ = 6,
ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_)
@@ -375,6 +385,44 @@ enum SDWA9EncValues{
};
} // namespace SDWA
+
+namespace DPP {
+
+enum DppCtrl {
+ QUAD_PERM_FIRST = 0,
+ QUAD_PERM_LAST = 0xFF,
+ DPP_UNUSED1 = 0x100,
+ ROW_SHL0 = 0x100,
+ ROW_SHL_FIRST = 0x101,
+ ROW_SHL_LAST = 0x10F,
+ DPP_UNUSED2 = 0x110,
+ ROW_SHR0 = 0x110,
+ ROW_SHR_FIRST = 0x111,
+ ROW_SHR_LAST = 0x11F,
+ DPP_UNUSED3 = 0x120,
+ ROW_ROR0 = 0x120,
+ ROW_ROR_FIRST = 0x121,
+ ROW_ROR_LAST = 0x12F,
+ WAVE_SHL1 = 0x130,
+ DPP_UNUSED4_FIRST = 0x131,
+ DPP_UNUSED4_LAST = 0x133,
+ WAVE_ROL1 = 0x134,
+ DPP_UNUSED5_FIRST = 0x135,
+ DPP_UNUSED5_LAST = 0x137,
+ WAVE_SHR1 = 0x138,
+ DPP_UNUSED6_FIRST = 0x139,
+ DPP_UNUSED6_LAST = 0x13B,
+ WAVE_ROR1 = 0x13C,
+ DPP_UNUSED7_FIRST = 0x13D,
+ DPP_UNUSED7_LAST = 0x13F,
+ ROW_MIRROR = 0x140,
+ ROW_HALF_MIRROR = 0x141,
+ BCAST15 = 0x142,
+ BCAST31 = 0x143,
+ DPP_LAST = BCAST31
+};
+
+} // namespace DPP
} // namespace AMDGPU
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028
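The new DPP::DppCtrl enum names the ranges of the dpp_ctrl field: values up to QUAD_PERM_LAST are quad permutations, followed by the row shift/rotate, wave shift/rotate, mirror and broadcast controls. As a hedged illustration (the packing below is an assumption based on the 0x00-0xFF quad_perm range, not code from this patch), a quad_perm control is built from four 2-bit lane selectors:

  #include <cstdint>

  // Pack four 2-bit lane selectors into a quad_perm dpp_ctrl value
  // (assumed layout: lane 0 in bits [1:0] ... lane 3 in bits [7:6]).
  static uint32_t encodeQuadPerm(uint32_t L0, uint32_t L1, uint32_t L2, uint32_t L3) {
    return (L0 & 3) | ((L1 & 3) << 2) | ((L2 & 3) << 4) | ((L3 & 3) << 6);
  }
  // encodeQuadPerm(0, 1, 2, 3) == 0xE4 would be the identity permutation.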
diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 8b155c2d2780..566e0d3febc7 100644
--- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -69,6 +69,7 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
@@ -81,7 +82,6 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CodeGen.h"
@@ -110,12 +110,7 @@ namespace {
class SIFixSGPRCopies : public MachineFunctionPass {
MachineDominatorTree *MDT;
- MachinePostDominatorTree *MPDT;
- DenseMap<MachineBasicBlock *, SetVector<MachineBasicBlock*>> PDF;
- void computePDF(MachineFunction * MF);
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- void printPDF();
-#endif
+
public:
static char ID;
@@ -128,8 +123,6 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
- AU.addRequired<MachinePostDominatorTree>();
- AU.addPreserved<MachinePostDominatorTree>();
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -417,6 +410,12 @@ bool searchPredecessors(const MachineBasicBlock *MBB,
return false;
}
+static bool predsHasDivergentTerminator(MachineBasicBlock *MBB,
+ const TargetRegisterInfo *TRI) {
+ return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) {
+ return hasTerminatorThatModifiesExec(*MBB, *TRI); });
+}
+
// Checks if there is potential path From instruction To instruction.
// If CutOff is specified and it sits in between of that path we ignore
// a higher portion of the path and report it is not reachable.
@@ -515,9 +514,9 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
if (MDT.dominates(MI1, MI2)) {
if (!intereferes(MI2, MI1)) {
- DEBUG(dbgs() << "Erasing from "
- << printMBBReference(*MI2->getParent()) << " "
- << *MI2);
+ LLVM_DEBUG(dbgs()
+ << "Erasing from "
+ << printMBBReference(*MI2->getParent()) << " " << *MI2);
MI2->eraseFromParent();
Defs.erase(I2++);
Changed = true;
@@ -525,9 +524,9 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
}
} else if (MDT.dominates(MI2, MI1)) {
if (!intereferes(MI1, MI2)) {
- DEBUG(dbgs() << "Erasing from "
- << printMBBReference(*MI1->getParent()) << " "
- << *MI1);
+ LLVM_DEBUG(dbgs()
+ << "Erasing from "
+ << printMBBReference(*MI1->getParent()) << " " << *MI1);
MI1->eraseFromParent();
Defs.erase(I1++);
Changed = true;
@@ -543,11 +542,12 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
MachineBasicBlock::iterator I = MBB->getFirstNonPHI();
if (!intereferes(MI1, I) && !intereferes(MI2, I)) {
- DEBUG(dbgs() << "Erasing from "
- << printMBBReference(*MI1->getParent()) << " " << *MI1
- << "and moving from "
- << printMBBReference(*MI2->getParent()) << " to "
- << printMBBReference(*I->getParent()) << " " << *MI2);
+ LLVM_DEBUG(dbgs()
+ << "Erasing from "
+ << printMBBReference(*MI1->getParent()) << " " << *MI1
+ << "and moving from "
+ << printMBBReference(*MI2->getParent()) << " to "
+ << printMBBReference(*I->getParent()) << " " << *MI2);
I->getParent()->splice(I, MI2->getParent(), MI2);
MI1->eraseFromParent();
Defs.erase(I1++);
@@ -567,47 +567,12 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
return Changed;
}
-void SIFixSGPRCopies::computePDF(MachineFunction *MF) {
- MachineFunction::iterator B = MF->begin();
- MachineFunction::iterator E = MF->end();
- for (; B != E; ++B) {
- if (B->succ_size() > 1) {
- for (auto S : B->successors()) {
- MachineDomTreeNode *runner = MPDT->getNode(&*S);
- MachineDomTreeNode *sentinel = MPDT->getNode(&*B)->getIDom();
- while (runner && runner != sentinel) {
- PDF[runner->getBlock()].insert(&*B);
- runner = runner->getIDom();
- }
- }
- }
- }
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void SIFixSGPRCopies::printPDF() {
- dbgs() << "\n######## PostDominanceFrontiers set #########\n";
- for (auto &I : PDF) {
- dbgs() << "PDF[ " << I.first->getNumber() << "] : ";
- for (auto &J : I.second) {
- dbgs() << J->getNumber() << ' ';
- }
- dbgs() << '\n';
- }
- dbgs() << "\n##############################################\n";
-}
-#endif
-
bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
MachineRegisterInfo &MRI = MF.getRegInfo();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
MDT = &getAnalysis<MachineDominatorTree>();
- MPDT = &getAnalysis<MachinePostDominatorTree>();
- PDF.clear();
- computePDF(&MF);
- DEBUG(printPDF());
SmallVector<MachineInstr *, 16> Worklist;
@@ -661,28 +626,17 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
break;
- // We don't need to fix the PHI if all the source blocks
- // have no divergent control dependecies
+ // We don't need to fix the PHI if the common dominator of the
+ // two incoming blocks terminates with a uniform branch.
bool HasVGPROperand = phiHasVGPROperands(MI, MRI, TRI, TII);
- if (!HasVGPROperand) {
- bool Uniform = true;
- MachineBasicBlock * Join = MI.getParent();
- for (auto &O : MI.explicit_operands()) {
- if (O.isMBB()) {
- MachineBasicBlock * Source = O.getMBB();
- SetVector<MachineBasicBlock*> &SourcePDF = PDF[Source];
- SetVector<MachineBasicBlock*> &JoinPDF = PDF[Join];
- SetVector<MachineBasicBlock*> CDList;
- for (auto &I : SourcePDF) {
- if (!JoinPDF.count(I) || /* back edge */MDT->dominates(Join, I)) {
- if (hasTerminatorThatModifiesExec(*I, *TRI))
- Uniform = false;
- }
- }
- }
- }
- if (Uniform) {
- DEBUG(dbgs() << "Not fixing PHI for uniform branch: " << MI << '\n');
+ if (MI.getNumExplicitOperands() == 5 && !HasVGPROperand) {
+ MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB();
+ MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB();
+
+ if (!predsHasDivergentTerminator(MBB0, TRI) &&
+ !predsHasDivergentTerminator(MBB1, TRI)) {
+ LLVM_DEBUG(dbgs()
+ << "Not fixing PHI for uniform branch: " << MI << '\n');
break;
}
}
@@ -722,7 +676,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
SmallSet<unsigned, 8> Visited;
if (HasVGPROperand || !phiHasBreakDef(MI, MRI, Visited)) {
- DEBUG(dbgs() << "Fixing PHI: " << MI);
+ LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
TII->moveToVALU(MI);
}
break;
@@ -734,7 +688,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
continue;
}
- DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);
+ LLVM_DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);
TII->moveToVALU(MI);
break;
@@ -745,7 +699,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
Src1RC = MRI.getRegClass(MI.getOperand(2).getReg());
if (TRI->isSGPRClass(DstRC) &&
(TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) {
- DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
+ LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
TII->moveToVALU(MI);
}
break;
diff --git a/lib/Target/AMDGPU/SIFixVGPRCopies.cpp b/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
index 3d3121788b5e..15ba78edf919 100644
--- a/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
@@ -8,13 +8,14 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Add implicit use of exec to vector register copies.
+/// Add implicit use of exec to vector register copies.
///
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
using namespace llvm;
@@ -46,7 +47,7 @@ char SIFixVGPRCopies::ID = 0;
char &llvm::SIFixVGPRCopiesID = SIFixVGPRCopies::ID;
bool SIFixVGPRCopies::runOnMachineFunction(MachineFunction &MF) {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
bool Changed = false;
@@ -58,7 +59,7 @@ bool SIFixVGPRCopies::runOnMachineFunction(MachineFunction &MF) {
if (TII->isVGPRCopy(MI) && !MI.readsRegister(AMDGPU::EXEC, TRI)) {
MI.addOperand(MF,
MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
- DEBUG(dbgs() << "Add exec use to " << MI);
+ LLVM_DEBUG(dbgs() << "Add exec use to " << MI);
Changed = true;
}
break;
diff --git a/lib/Target/AMDGPU/SIFixWWMLiveness.cpp b/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
index 3493c7775f0c..5d613d8874fa 100644
--- a/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
+++ b/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Computations in WWM can overwrite values in inactive channels for
+/// Computations in WWM can overwrite values in inactive channels for
/// variables that the register allocator thinks are dead. This pass adds fake
/// uses of those variables to WWM instructions to make sure that they aren't
/// overwritten.
@@ -55,6 +55,7 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SparseBitVector.h"
#include "llvm/CodeGen/LiveIntervals.h"
@@ -184,7 +185,7 @@ bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) {
// This doesn't actually need LiveIntervals, but we can preserve them.
LIS = getAnalysisIfAvailable<LiveIntervals>();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index 783181980342..338cabcb906b 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -13,6 +13,7 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -75,7 +76,7 @@ public:
MachineRegisterInfo *MRI;
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
- const SISubtarget *ST;
+ const GCNSubtarget *ST;
void foldOperand(MachineOperand &OpToFold,
MachineInstr *UseMI,
@@ -127,14 +128,18 @@ static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
unsigned Opc = UseMI.getOpcode();
switch (Opc) {
case AMDGPU::V_MAC_F32_e64:
- case AMDGPU::V_MAC_F16_e64: {
+ case AMDGPU::V_MAC_F16_e64:
+ case AMDGPU::V_FMAC_F32_e64: {
// Special case for mac. Since this is replaced with mad when folded into
// src2, we need to check the legality for the final instruction.
int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
if (static_cast<int>(OpNo) == Src2Idx) {
+ bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
- const MCInstrDesc &MadDesc
- = TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
+
+ unsigned Opc = IsFMA ?
+ AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
+ const MCInstrDesc &MadDesc = TII->get(Opc);
return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
}
return false;
@@ -155,6 +160,35 @@ static bool updateOperand(FoldCandidate &Fold,
assert(Old.isReg());
if (Fold.isImm()) {
+ if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked) {
+ // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
+ // already set.
+ unsigned Opcode = MI->getOpcode();
+ int OpNo = MI->getOperandNo(&Old);
+ int ModIdx = -1;
+ if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
+ ModIdx = AMDGPU::OpName::src0_modifiers;
+ else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
+ ModIdx = AMDGPU::OpName::src1_modifiers;
+ else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
+ ModIdx = AMDGPU::OpName::src2_modifiers;
+ assert(ModIdx != -1);
+ ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
+ MachineOperand &Mod = MI->getOperand(ModIdx);
+ unsigned Val = Mod.getImm();
+ if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1))
+ return false;
+ // If upper part is all zero we do not need op_sel_hi.
+ if (!isUInt<16>(Fold.ImmToFold)) {
+ if (!(Fold.ImmToFold & 0xffff)) {
+ Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
+ Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
+ Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
+ return true;
+ }
+ Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
+ }
+ }
Old.ChangeToImmediate(Fold.ImmToFold);
return true;
}
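The block added above folds an immediate into a packed (VOP3P) operand by steering op_sel/op_sel_hi: if the value does not fit in 16 bits but its low half is zero, the high half is folded and OP_SEL_0 is set. A simplified standalone sketch of that low/high-half decision (plain integers, illustrative names; the full patch additionally clears OP_SEL_1 for wide values):

  #include <cstdint>

  // Decide which 16-bit half of a 32-bit immediate can be folded into a
  // packed operand. Returns false when both halves carry significant bits.
  static bool splitPackedImm(uint64_t Imm, uint16_t &Out, bool &UseHighHalf) {
    if (Imm <= 0xffffu) {          // fits entirely in the low half
      Out = static_cast<uint16_t>(Imm);
      UseHighHalf = false;
      return true;
    }
    if ((Imm & 0xffffu) == 0) {    // only the high half is populated
      Out = static_cast<uint16_t>((Imm >> 16) & 0xffffu);
      UseHighHalf = true;          // corresponds to setting OP_SEL_0 above
      return true;
    }
    return false;
  }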
@@ -195,13 +229,17 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
// Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
unsigned Opc = MI->getOpcode();
- if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64) &&
+ if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
+ Opc == AMDGPU::V_FMAC_F32_e64) &&
(int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
+ bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
+ unsigned NewOpc = IsFMA ?
+ AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
// Check if changing this to a v_mad_{f16, f32} instruction will allow us
// to fold the operand.
- MI->setDesc(TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16));
+ MI->setDesc(TII->get(NewOpc));
bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
if (FoldAsMAD) {
MI->untieRegOperand(OpNo);
@@ -345,6 +383,7 @@ void SIFoldOperands::foldOperand(
// Don't fold into target independent nodes. Target independent opcodes
// don't have defined register classes.
if (UseDesc.isVariadic() ||
+ UseOp.isImplicit() ||
UseDesc.OpInfo[UseOpIdx].RegClass == -1)
return;
}
@@ -470,7 +509,8 @@ static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
MachineOperand &Op) {
if (Op.isReg()) {
// If this has a subregister, it obviously is a register source.
- if (Op.getSubReg() != AMDGPU::NoSubRegister)
+ if (Op.getSubReg() != AMDGPU::NoSubRegister ||
+ !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
return &Op;
MachineInstr *Def = MRI.getVRegDef(Op.getReg());
@@ -598,14 +638,14 @@ static bool tryFoldInst(const SIInstrInfo *TII,
const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
if (Src1->isIdenticalTo(*Src0)) {
- DEBUG(dbgs() << "Folded " << *MI << " into ");
+ LLVM_DEBUG(dbgs() << "Folded " << *MI << " into ");
int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
if (Src2Idx != -1)
MI->RemoveOperand(Src2Idx);
MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
mutateCopyOp(*MI, TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY
: getMovOpc(false)));
- DEBUG(dbgs() << *MI << '\n');
+ LLVM_DEBUG(dbgs() << *MI << '\n');
return true;
}
}
@@ -646,7 +686,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
// be folded due to multiple uses or operand constraints.
if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) {
- DEBUG(dbgs() << "Constant folded " << *UseMI <<'\n');
+ LLVM_DEBUG(dbgs() << "Constant folded " << *UseMI << '\n');
// Some constant folding cases change the same immediate's use to a new
// instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user
@@ -713,8 +753,9 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
// copies.
MRI->clearKillFlags(Fold.OpToFold->getReg());
}
- DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
- static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n');
+ LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
+ << static_cast<int>(Fold.UseOpNo) << " of "
+ << *Fold.UseMI << '\n');
tryFoldInst(TII, Fold.UseMI);
} else if (Fold.isCommuted()) {
// Restoring instruction's original operand order if fold has failed.
@@ -794,7 +835,8 @@ bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
if (!DefClamp)
return false;
- DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def << '\n');
+ LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def
+ << '\n');
// Clamp is applied after omod, so it is OK if omod is set.
DefClamp->setImm(1);
@@ -917,7 +959,7 @@ bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
return false;
- DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');
+ LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');
DefOMod->setImm(OMod);
MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
@@ -930,7 +972,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
return false;
MRI = &MF.getRegInfo();
- ST = &MF.getSubtarget<SISubtarget>();
+ ST = &MF.getSubtarget<GCNSubtarget>();
TII = ST->getInstrInfo();
TRI = &TII->getRegisterInfo();
diff --git a/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
new file mode 100644
index 000000000000..cd14239de822
--- /dev/null
+++ b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -0,0 +1,398 @@
+//===-- SIFormMemoryClauses.cpp -------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass creates bundles of SMEM and VMEM instructions forming memory
+/// clauses if XNACK is enabled. Def operands of clauses are marked as early
+/// clobber to make sure we will not override any source within a clause.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "GCNRegPressure.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-form-memory-clauses"
+
+// Clauses longer than 15 instructions would overflow one of the counters
+// and stall. They can stall even earlier if there are outstanding counters.
+static cl::opt<unsigned>
+MaxClause("amdgpu-max-memory-clause", cl::Hidden, cl::init(15),
+ cl::desc("Maximum length of a memory clause, instructions"));
+
+namespace {
+
+class SIFormMemoryClauses : public MachineFunctionPass {
+ typedef DenseMap<unsigned, std::pair<unsigned, LaneBitmask>> RegUse;
+
+public:
+ static char ID;
+
+public:
+ SIFormMemoryClauses() : MachineFunctionPass(ID) {
+ initializeSIFormMemoryClausesPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "SI Form memory clauses";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LiveIntervals>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ template <typename Callable>
+ void forAllLanes(unsigned Reg, LaneBitmask LaneMask, Callable Func) const;
+
+ bool canBundle(const MachineInstr &MI, RegUse &Defs, RegUse &Uses) const;
+ bool checkPressure(const MachineInstr &MI, GCNDownwardRPTracker &RPT);
+ void collectRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses) const;
+ bool processRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses,
+ GCNDownwardRPTracker &RPT);
+
+ const GCNSubtarget *ST;
+ const SIRegisterInfo *TRI;
+ const MachineRegisterInfo *MRI;
+ SIMachineFunctionInfo *MFI;
+
+ unsigned LastRecordedOccupancy;
+ unsigned MaxVGPRs;
+ unsigned MaxSGPRs;
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SIFormMemoryClauses, DEBUG_TYPE,
+ "SI Form memory clauses", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(SIFormMemoryClauses, DEBUG_TYPE,
+ "SI Form memory clauses", false, false)
+
+
+char SIFormMemoryClauses::ID = 0;
+
+char &llvm::SIFormMemoryClausesID = SIFormMemoryClauses::ID;
+
+FunctionPass *llvm::createSIFormMemoryClausesPass() {
+ return new SIFormMemoryClauses();
+}
+
+static bool isVMEMClauseInst(const MachineInstr &MI) {
+ return SIInstrInfo::isFLAT(MI) || SIInstrInfo::isVMEM(MI);
+}
+
+static bool isSMEMClauseInst(const MachineInstr &MI) {
+ return SIInstrInfo::isSMRD(MI);
+}
+
+// There is no sense in creating store clauses: they do not define anything,
+// thus there is nothing to mark early-clobber.
+static bool isValidClauseInst(const MachineInstr &MI, bool IsVMEMClause) {
+ if (MI.isDebugValue() || MI.isBundled())
+ return false;
+ if (!MI.mayLoad() || MI.mayStore())
+ return false;
+ if (AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1 ||
+ AMDGPU::getAtomicRetOp(MI.getOpcode()) != -1)
+ return false;
+ if (IsVMEMClause && !isVMEMClauseInst(MI))
+ return false;
+ if (!IsVMEMClause && !isSMEMClauseInst(MI))
+ return false;
+ return true;
+}
+
+static unsigned getMopState(const MachineOperand &MO) {
+ unsigned S = 0;
+ if (MO.isImplicit())
+ S |= RegState::Implicit;
+ if (MO.isDead())
+ S |= RegState::Dead;
+ if (MO.isUndef())
+ S |= RegState::Undef;
+ if (MO.isKill())
+ S |= RegState::Kill;
+ if (MO.isEarlyClobber())
+ S |= RegState::EarlyClobber;
+ if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()) && MO.isRenamable())
+ S |= RegState::Renamable;
+ return S;
+}
+
+template <typename Callable>
+void SIFormMemoryClauses::forAllLanes(unsigned Reg, LaneBitmask LaneMask,
+ Callable Func) const {
+ if (LaneMask.all() || TargetRegisterInfo::isPhysicalRegister(Reg) ||
+ LaneMask == MRI->getMaxLaneMaskForVReg(Reg)) {
+ Func(0);
+ return;
+ }
+
+ const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+ unsigned E = TRI->getNumSubRegIndices();
+ SmallVector<unsigned, AMDGPU::NUM_TARGET_SUBREGS> CoveringSubregs;
+ for (unsigned Idx = 1; Idx < E; ++Idx) {
+ // Is this index even compatible with the given class?
+ if (TRI->getSubClassWithSubReg(RC, Idx) != RC)
+ continue;
+ LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx);
+ // Early exit if we found a perfect match.
+ if (SubRegMask == LaneMask) {
+ Func(Idx);
+ return;
+ }
+
+ if ((SubRegMask & ~LaneMask).any() || (SubRegMask & LaneMask).none())
+ continue;
+
+ CoveringSubregs.push_back(Idx);
+ }
+
+ llvm::sort(CoveringSubregs.begin(), CoveringSubregs.end(),
+ [this](unsigned A, unsigned B) {
+ LaneBitmask MaskA = TRI->getSubRegIndexLaneMask(A);
+ LaneBitmask MaskB = TRI->getSubRegIndexLaneMask(B);
+ unsigned NA = MaskA.getNumLanes();
+ unsigned NB = MaskB.getNumLanes();
+ if (NA != NB)
+ return NA > NB;
+ return MaskA.getHighestLane() > MaskB.getHighestLane();
+ });
+
+ for (unsigned Idx : CoveringSubregs) {
+ LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx);
+ if ((SubRegMask & ~LaneMask).any() || (SubRegMask & LaneMask).none())
+ continue;
+
+ Func(Idx);
+ LaneMask &= ~SubRegMask;
+ if (LaneMask.none())
+ return;
+ }
+
+ llvm_unreachable("Failed to find all subregs to cover lane mask");
+}
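forAllLanes() above decomposes a lane mask into a covering set of subregister indices: it prefers an exact match and otherwise picks compatible candidates greedily, largest lane count first. A hedged generic sketch of the same greedy-cover idea over plain bitmasks (illustrative only, no LLVM types):

  #include <algorithm>
  #include <cstdint>
  #include <vector>

  // Pick candidate masks until Target is fully covered, trying the candidates
  // with the most set bits first; candidates must lie inside Target.
  static std::vector<uint32_t> greedyCover(uint32_t Target,
                                           std::vector<uint32_t> Candidates) {
    std::sort(Candidates.begin(), Candidates.end(),
              [](uint32_t A, uint32_t B) {
                return __builtin_popcount(A) > __builtin_popcount(B);
              });
    std::vector<uint32_t> Picked;
    for (uint32_t C : Candidates) {
      if ((C & ~Target) != 0 || (C & Target) == 0)
        continue;                  // outside the remaining Target or no overlap
      Picked.push_back(C);
      Target &= ~C;
      if (Target == 0)
        break;
    }
    return Picked;
  }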
+
+// Returns false if there is a use of a def already in the map.
+// In this case we must break the clause.
+bool SIFormMemoryClauses::canBundle(const MachineInstr &MI,
+ RegUse &Defs, RegUse &Uses) const {
+ // Check interference with defs.
+ for (const MachineOperand &MO : MI.operands()) {
+ // TODO: Prologue/Epilogue Insertion pass does not process bundled
+ // instructions.
+ if (MO.isFI())
+ return false;
+
+ if (!MO.isReg())
+ continue;
+
+ unsigned Reg = MO.getReg();
+
+ // If it is tied we will need to write the same register as we read.
+ if (MO.isTied())
+ return false;
+
+ RegUse &Map = MO.isDef() ? Uses : Defs;
+ auto Conflict = Map.find(Reg);
+ if (Conflict == Map.end())
+ continue;
+
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ return false;
+
+ LaneBitmask Mask = TRI->getSubRegIndexLaneMask(MO.getSubReg());
+ if ((Conflict->second.second & Mask).any())
+ return false;
+ }
+
+ return true;
+}
+
+// Since all defs in the clause are early clobber we can run out of registers.
+// Returns false if register pressure would hit the limit if the instruction
+// were bundled into the memory clause.
+bool SIFormMemoryClauses::checkPressure(const MachineInstr &MI,
+ GCNDownwardRPTracker &RPT) {
+ // NB: skip advanceBeforeNext() call. Since all defs will be marked
+ // early-clobber they will all stay alive at least to the end of the
+ // clause. Therefore we should not decrease pressure even if the load
+ // pointer becomes dead and could otherwise be reused for the destination.
+ RPT.advanceToNext();
+ GCNRegPressure MaxPressure = RPT.moveMaxPressure();
+ unsigned Occupancy = MaxPressure.getOccupancy(*ST);
+ if (Occupancy >= MFI->getMinAllowedOccupancy() &&
+ MaxPressure.getVGPRNum() <= MaxVGPRs &&
+ MaxPressure.getSGPRNum() <= MaxSGPRs) {
+ LastRecordedOccupancy = Occupancy;
+ return true;
+ }
+ return false;
+}
+
+// Collect register defs and uses along with their lane masks and states.
+void SIFormMemoryClauses::collectRegUses(const MachineInstr &MI,
+ RegUse &Defs, RegUse &Uses) const {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+
+ LaneBitmask Mask = TargetRegisterInfo::isVirtualRegister(Reg) ?
+ TRI->getSubRegIndexLaneMask(MO.getSubReg()) :
+ LaneBitmask::getAll();
+ RegUse &Map = MO.isDef() ? Defs : Uses;
+
+ auto Loc = Map.find(Reg);
+ unsigned State = getMopState(MO);
+ if (Loc == Map.end()) {
+ Map[Reg] = std::make_pair(State, Mask);
+ } else {
+ Loc->second.first |= State;
+ Loc->second.second |= Mask;
+ }
+ }
+}
+
+// Check register def/use conflicts, occupancy limits and collect def/use maps.
+// Return true if the instruction can be bundled with the previous one. If it
+// cannot, the def/use maps are not updated.
+bool SIFormMemoryClauses::processRegUses(const MachineInstr &MI,
+ RegUse &Defs, RegUse &Uses,
+ GCNDownwardRPTracker &RPT) {
+ if (!canBundle(MI, Defs, Uses))
+ return false;
+
+ if (!checkPressure(MI, RPT))
+ return false;
+
+ collectRegUses(MI, Defs, Uses);
+ return true;
+}
+
+bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ ST = &MF.getSubtarget<GCNSubtarget>();
+ if (!ST->isXNACKEnabled())
+ return false;
+
+ const SIInstrInfo *TII = ST->getInstrInfo();
+ TRI = ST->getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ MFI = MF.getInfo<SIMachineFunctionInfo>();
+ LiveIntervals *LIS = &getAnalysis<LiveIntervals>();
+ SlotIndexes *Ind = LIS->getSlotIndexes();
+ bool Changed = false;
+
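+  // Use the number of allocatable 32-bit registers as the pressure limits for
+  // a clause.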
+ MaxVGPRs = TRI->getAllocatableSet(MF, &AMDGPU::VGPR_32RegClass).count();
+ MaxSGPRs = TRI->getAllocatableSet(MF, &AMDGPU::SGPR_32RegClass).count();
+
+ for (MachineBasicBlock &MBB : MF) {
+ MachineBasicBlock::instr_iterator Next;
+ for (auto I = MBB.instr_begin(), E = MBB.instr_end(); I != E; I = Next) {
+ MachineInstr &MI = *I;
+ Next = std::next(I);
+
+ bool IsVMEM = isVMEMClauseInst(MI);
+
+ if (!isValidClauseInst(MI, IsVMEM))
+ continue;
+
+ RegUse Defs, Uses;
+ GCNDownwardRPTracker RPT(*LIS);
+ RPT.reset(MI);
+
+ if (!processRegUses(MI, Defs, Uses, RPT))
+ continue;
+
+ unsigned Length = 1;
+ for ( ; Next != E && Length < MaxClause; ++Next) {
+ if (!isValidClauseInst(*Next, IsVMEM))
+ break;
+
+        // A load from a pointer which was loaded inside the same bundle makes
+        // the clause impossible, since we would need to write and read the
+        // same register inside it. In this case processRegUses returns false.
+ if (!processRegUses(*Next, Defs, Uses, RPT))
+ break;
+
+ ++Length;
+ }
+ if (Length < 2)
+ continue;
+
+ Changed = true;
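+      // Clamp the function's occupancy to the value the clause was verified
+      // against.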
+ MFI->limitOccupancy(LastRecordedOccupancy);
+
+ auto B = BuildMI(MBB, I, DebugLoc(), TII->get(TargetOpcode::BUNDLE));
+ Ind->insertMachineInstrInMaps(*B);
+
+ for (auto BI = I; BI != Next; ++BI) {
+ BI->bundleWithPred();
+ Ind->removeSingleMachineInstrFromMaps(*BI);
+
+ for (MachineOperand &MO : BI->defs())
+ if (MO.readsReg())
+ MO.setIsInternalRead(true);
+ }
+
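+      // Add the clause's combined register operands to the BUNDLE header:
+      // defs are added as early-clobber and uses drop any kill flags.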
+ for (auto &&R : Defs) {
+ forAllLanes(R.first, R.second.second, [&R, &B](unsigned SubReg) {
+ unsigned S = R.second.first | RegState::EarlyClobber;
+ if (!SubReg)
+ S &= ~(RegState::Undef | RegState::Dead);
+ B.addDef(R.first, S, SubReg);
+ });
+ }
+
+ for (auto &&R : Uses) {
+ forAllLanes(R.first, R.second.second, [&R, &B](unsigned SubReg) {
+ B.addUse(R.first, R.second.first & ~RegState::Kill, SubReg);
+ });
+ }
+
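+      // Recompute live intervals for every virtual register touched by the
+      // bundle; registers both defined and used are handled once via Defs.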
+ for (auto &&R : Defs) {
+ unsigned Reg = R.first;
+ Uses.erase(Reg);
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ continue;
+ LIS->removeInterval(Reg);
+ LIS->createAndComputeVirtRegInterval(Reg);
+ }
+
+ for (auto &&R : Uses) {
+ unsigned Reg = R.first;
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ continue;
+ LIS->removeInterval(Reg);
+ LIS->createAndComputeVirtRegInterval(Reg);
+ }
+ }
+ }
+
+ return Changed;
+}
diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp
index 89bb98dbd028..ac0ef90f25a4 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -12,7 +12,9 @@
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -21,19 +23,19 @@
using namespace llvm;
-static ArrayRef<MCPhysReg> getAllSGPR128(const SISubtarget &ST,
+static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
const MachineFunction &MF) {
return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
ST.getMaxNumSGPRs(MF) / 4);
}
-static ArrayRef<MCPhysReg> getAllSGPRs(const SISubtarget &ST,
+static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
const MachineFunction &MF) {
return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
ST.getMaxNumSGPRs(MF));
}
-void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST,
+void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
MachineFunction &MF,
MachineBasicBlock &MBB) const {
const SIInstrInfo *TII = ST.getInstrInfo();
@@ -96,7 +98,7 @@ void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST,
}
unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
- const SISubtarget &ST,
+ const GCNSubtarget &ST,
const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
SIMachineFunctionInfo *MFI,
@@ -147,7 +149,7 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
// SGPRs.
std::pair<unsigned, unsigned>
SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
- const SISubtarget &ST,
+ const GCNSubtarget &ST,
const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
SIMachineFunctionInfo *MFI,
@@ -218,7 +220,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
// Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was
// specified.
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (ST.debuggerEmitPrologue())
emitDebuggerPrologue(MF, MBB);
@@ -235,6 +237,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
+ const Function &F = MF.getFunction();
// We need to do the replacement of the private segment buffer and wave offset
// register even if there are no stack objects. There could be stores to undef
@@ -286,7 +289,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
- if (ST.isAmdCodeObjectV2(MF)) {
+ if (ST.isAmdCodeObjectV2(F)) {
PreloadedPrivateBufferReg = MFI->getPreloadedReg(
AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
}
@@ -305,7 +308,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
}
if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
- assert(ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF));
+ assert(ST.isAmdCodeObjectV2(F) || ST.isMesaGfxShader(F));
MRI.addLiveIn(PreloadedPrivateBufferReg);
MBB.addLiveIn(PreloadedPrivateBufferReg);
}
@@ -330,7 +333,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
bool CopyBuffer = ResourceRegUsed &&
PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
- ST.isAmdCodeObjectV2(MF) &&
+ ST.isAmdCodeObjectV2(F) &&
ScratchRsrcReg != PreloadedPrivateBufferReg;
// This needs to be careful of the copying order to avoid overwriting one of
@@ -361,13 +364,14 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
}
// Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
-void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST,
+void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
MachineFunction &MF, MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI,
MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg,
unsigned ScratchRsrcReg) const {
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+ const Function &Fn = MF.getFunction();
DebugLoc DL;
if (ST.isAmdPalOS()) {
@@ -387,12 +391,27 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST,
const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
BuildMI(MBB, I, DL, GetPC64, Rsrc01);
}
+ auto GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
+ if (ST.hasMergedShaders()) {
+ switch (MF.getFunction().getCallingConv()) {
+ case CallingConv::AMDGPU_HS:
+ case CallingConv::AMDGPU_GS:
+ // Low GIT address is passed in s8 rather than s0 for an LS+HS or
+ // ES+GS merged shader on gfx9+.
+ GitPtrLo = AMDGPU::SGPR8;
+ break;
+ default:
+ break;
+ }
+ }
+ MF.getRegInfo().addLiveIn(GitPtrLo);
+ MF.front().addLiveIn(GitPtrLo);
BuildMI(MBB, I, DL, SMovB32, RsrcLo)
- .addReg(AMDGPU::SGPR0) // Low address passed in
+ .addReg(GitPtrLo)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
// We now have the GIT ptr - now get the scratch descriptor from the entry
- // at offset 0.
+ // at offset 0 (or offset 16 for a compute shader).
PointerType *PtrTy =
PointerType::get(Type::getInt64Ty(MF.getFunction().getContext()),
AMDGPUAS::CONSTANT_ADDRESS);
@@ -403,17 +422,18 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST,
MachineMemOperand::MOInvariant |
MachineMemOperand::MODereferenceable,
0, 0);
+ unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
.addReg(Rsrc01)
- .addImm(0) // offset
+ .addImm(Offset) // offset
.addImm(0) // glc
.addReg(ScratchRsrcReg, RegState::ImplicitDefine)
.addMemOperand(MMO);
return;
}
- if (ST.isMesaGfxShader(MF)
+ if (ST.isMesaGfxShader(Fn)
|| (PreloadedPrivateBufferReg == AMDGPU::NoRegister)) {
- assert(!ST.isAmdCodeObjectV2(MF));
+ assert(!ST.isAmdCodeObjectV2(Fn));
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
@@ -474,17 +494,52 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST,
}
}
+// Find a scratch register that we can use at the start of the prologue to
+// re-align the stack pointer. We avoid using callee-save registers since they
+// may appear to be free when this is called from canUseAsPrologue (during
+// shrink wrapping), but then no longer be free when this is called from
+// emitPrologue.
+//
+// FIXME: This is a bit conservative, since in the above case we could use one
+// of the callee-save registers as a scratch temp to re-align the stack pointer,
+// but we would then have to make sure that we were in fact saving at least one
+// callee-save register in the prologue, which is additional complexity that
+// doesn't seem worth the benefit.
+static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock &MBB) {
+ MachineFunction *MF = MBB.getParent();
+
+ const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo();
+ LivePhysRegs LiveRegs(TRI);
+ LiveRegs.addLiveIns(MBB);
+
+ // Mark callee saved registers as used so we will not choose them.
+ const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF);
+ for (unsigned i = 0; CSRegs[i]; ++i)
+ LiveRegs.addReg(CSRegs[i]);
+
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ for (unsigned Reg : AMDGPU::SReg_32_XM0RegClass) {
+ if (LiveRegs.available(MRI, Reg))
+ return Reg;
+ }
+
+ return AMDGPU::NoRegister;
+}
+
void SIFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
if (FuncInfo->isEntryFunction()) {
emitEntryFunctionPrologue(MF, MBB);
return;
}
const MachineFrameInfo &MFI = MF.getFrameInfo();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();
@@ -492,8 +547,34 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock::iterator MBBI = MBB.begin();
DebugLoc DL;
+ // XXX - Is this the right predicate?
+
bool NeedFP = hasFP(MF);
- if (NeedFP) {
+ uint32_t NumBytes = MFI.getStackSize();
+ uint32_t RoundedSize = NumBytes;
+ const bool NeedsRealignment = TRI.needsStackRealignment(MF);
+
+ if (NeedsRealignment) {
+ assert(NeedFP);
+ const unsigned Alignment = MFI.getMaxAlignment();
+
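+    // Reserve extra space for the worst-case padding the realignment may need.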
+ RoundedSize += Alignment;
+
+ unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(MBB);
+ assert(ScratchSPReg != AMDGPU::NoRegister);
+
+    // s_add_u32 tmp_reg, s32, (Alignment - 1) * WavefrontSize
+    // s_and_b32 fp_reg, tmp_reg, 0b111...0000
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
+ .addReg(StackPtrReg)
+ .addImm((Alignment - 1) * ST.getWavefrontSize())
+ .setMIFlag(MachineInstr::FrameSetup);
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
+ .addReg(ScratchSPReg, RegState::Kill)
+ .addImm(-Alignment * ST.getWavefrontSize())
+ .setMIFlag(MachineInstr::FrameSetup);
+ FuncInfo->setIsStackRealigned(true);
+ } else if (NeedFP) {
// If we need a base pointer, set it up here. It's whatever the value of
// the stack pointer is at this point. Any variable size objects will be
// allocated after this, so we can still use the base pointer to reference
@@ -503,11 +584,10 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlag(MachineInstr::FrameSetup);
}
- uint32_t NumBytes = MFI.getStackSize();
- if (NumBytes != 0 && hasSP(MF)) {
+ if (RoundedSize != 0 && hasSP(MF)) {
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
.addReg(StackPtrReg)
- .addImm(NumBytes * ST.getWavefrontSize())
+ .addImm(RoundedSize * ST.getWavefrontSize())
.setMIFlag(MachineInstr::FrameSetup);
}
@@ -527,7 +607,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
if (FuncInfo->isEntryFunction())
return;
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
@@ -553,10 +633,12 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
// it's really whether we need SP to be accurate or not.
if (NumBytes != 0 && hasSP(MF)) {
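+    // Include the padding added in the prologue when the stack was realigned.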
+ uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
+ NumBytes + MFI.getMaxAlignment() : NumBytes;
+
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
.addReg(StackPtrReg)
- .addImm(NumBytes * ST.getWavefrontSize())
- .setMIFlag(MachineInstr::FrameDestroy);
+ .addImm(RoundedSize * ST.getWavefrontSize());
}
}
@@ -572,7 +654,7 @@ static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg) const {
- const SIRegisterInfo *RI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
+ const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
FrameReg = RI->getFrameRegister(MF);
return MF.getFrameInfo().getObjectOffset(FI);
@@ -586,7 +668,7 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
if (!MFI.hasStackObjects())
return;
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
@@ -611,6 +693,7 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
if (TII->isSGPRSpill(MI)) {
int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
+ assert(MFI.getStackID(FI) == SIStackID::SGPR_SPILL);
if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) {
bool Spilled = TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS);
(void)Spilled;
@@ -667,7 +750,7 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
if (Amount == 0)
return MBB.erase(I);
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const DebugLoc &DL = I->getDebugLoc();
unsigned Opc = I->getOpcode();
@@ -696,7 +779,7 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -746,7 +829,8 @@ bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
}
bool SIFrameLowering::hasSP(const MachineFunction &MF) const {
+ const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
// All stack operations are relative to the frame offset SGPR.
const MachineFrameInfo &MFI = MF.getFrameInfo();
- return MFI.hasCalls() || MFI.hasVarSizedObjects();
+ return MFI.hasCalls() || MFI.hasVarSizedObjects() || TRI->needsStackRealignment(MF);
}
diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h
index df6f1632a316..2f35b3631cdc 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/lib/Target/AMDGPU/SIFrameLowering.h
@@ -17,7 +17,7 @@ namespace llvm {
class SIInstrInfo;
class SIMachineFunctionInfo;
class SIRegisterInfo;
-class SISubtarget;
+class GCNSubtarget;
class SIFrameLowering final : public AMDGPUFrameLowering {
public:
@@ -48,29 +48,29 @@ public:
MachineBasicBlock::iterator MI) const override;
private:
- void emitFlatScratchInit(const SISubtarget &ST,
+ void emitFlatScratchInit(const GCNSubtarget &ST,
MachineFunction &MF,
MachineBasicBlock &MBB) const;
unsigned getReservedPrivateSegmentBufferReg(
- const SISubtarget &ST,
+ const GCNSubtarget &ST,
const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
SIMachineFunctionInfo *MFI,
MachineFunction &MF) const;
std::pair<unsigned, unsigned> getReservedPrivateSegmentWaveByteOffsetReg(
- const SISubtarget &ST,
+ const GCNSubtarget &ST,
const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
SIMachineFunctionInfo *MFI,
MachineFunction &MF) const;
- /// \brief Emits debugger prologue.
+ /// Emits debugger prologue.
void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const;
// Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
- void emitEntryFunctionScratchSetup(const SISubtarget &ST, MachineFunction &MF,
+ void emitEntryFunctionScratchSetup(const GCNSubtarget &ST, MachineFunction &MF,
MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI,
MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg,
unsigned ScratchRsrcReg) const;
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 50ee88fa635a..5b7fc2656a20 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Custom DAG lowering for SI
+/// Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//
@@ -26,6 +26,7 @@
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
@@ -49,7 +50,6 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
@@ -73,6 +73,7 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
@@ -111,8 +112,9 @@ static unsigned findFirstFreeSGPR(CCState &CCInfo) {
}
SITargetLowering::SITargetLowering(const TargetMachine &TM,
- const SISubtarget &STI)
- : AMDGPUTargetLowering(TM, STI) {
+ const GCNSubtarget &STI)
+ : AMDGPUTargetLowering(TM, STI),
+ Subtarget(&STI) {
addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
@@ -138,14 +140,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->has16BitInsts()) {
addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
- }
- if (Subtarget->hasVOP3PInsts()) {
+    // Unless there are also VOP3P operations, not all operations are really legal.
addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
+ addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
+ addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
}
- computeRegisterProperties(STI.getRegisterInfo());
+ computeRegisterProperties(Subtarget->getRegisterInfo());
// We need to custom lower vector stores from local memory
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
@@ -173,7 +176,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
- setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
setOperationAction(ISD::SELECT, MVT::i1, Promote);
setOperationAction(ISD::SELECT, MVT::i64, Custom);
@@ -205,13 +207,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
setOperationAction(ISD::BRCOND, MVT::Other, Custom);
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
@@ -231,13 +237,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SUBCARRY, MVT::i64, Legal);
#endif
- //setOperationAction(ISD::ADDC, MVT::i64, Expand);
- //setOperationAction(ISD::SUBC, MVT::i64, Expand);
-
// We only support LOAD/STORE and vector manipulation ops for vectors
// with > 4 elements.
for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
- MVT::v2i64, MVT::v2f64}) {
+ MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16 }) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -260,6 +263,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
}
+ setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
+
// TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
// is expanded to avoid having two separate loops in case the index is a VGPR.
@@ -284,12 +289,30 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
+
// Avoid stack access for these.
// TODO: Generalize to more vector types.
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
+
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom);
+
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
// and output demarshalling
@@ -301,7 +324,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
- if (getSubtarget()->hasFlatAddressSpace()) {
+ if (Subtarget->hasFlatAddressSpace()) {
setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
}
@@ -314,13 +337,56 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::TRAP, MVT::Other, Custom);
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);
+ if (Subtarget->has16BitInsts()) {
+ setOperationAction(ISD::FLOG, MVT::f16, Custom);
+ setOperationAction(ISD::FLOG10, MVT::f16, Custom);
+ }
+
+ // v_mad_f32 does not support denormals according to some sources.
+ if (!Subtarget->hasFP32Denormals())
+ setOperationAction(ISD::FMAD, MVT::f32, Legal);
+
+ if (!Subtarget->hasBFI()) {
+ // fcopysign can be done in a single instruction with BFI.
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ }
+
+ if (!Subtarget->hasBCNT(32))
+ setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+
+ if (!Subtarget->hasBCNT(64))
+ setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+
+ if (Subtarget->hasFFBH())
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
+
+ if (Subtarget->hasFFBL())
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
+
+ // We only really have 32-bit BFE instructions (and 16-bit on VI).
+ //
+ // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
+ // effort to match them now. We want this to be false for i64 cases when the
+ // extraction isn't restricted to the upper or lower half. Ideally we would
+ // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
+ // span the midpoint are probably relatively rare, so don't worry about them
+ // for now.
+ if (Subtarget->hasBFE())
+ setHasExtractBitsInsn(true);
+
setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
- if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) {
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
setOperationAction(ISD::FRINT, MVT::f64, Legal);
+ } else {
+ setOperationAction(ISD::FCEIL, MVT::f64, Custom);
+ setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
+ setOperationAction(ISD::FRINT, MVT::f64, Custom);
+ setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
}
setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
@@ -357,6 +423,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
setOperationAction(ISD::CTLZ, MVT::i16, Promote);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
+ setOperationAction(ISD::CTPOP, MVT::i16, Promote);
setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
@@ -406,10 +473,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FMA, MVT::f16, Legal);
if (!Subtarget->hasFP16Denormals())
setOperationAction(ISD::FMAD, MVT::f16, Legal);
- }
- if (Subtarget->hasVOP3PInsts()) {
- for (MVT VT : {MVT::v2i16, MVT::v2f16}) {
+ for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -436,6 +501,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::Constant, MVT::v2i16, Legal);
setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);
+ setOperationAction(ISD::UNDEF, MVT::v2i16, Legal);
+ setOperationAction(ISD::UNDEF, MVT::v2f16, Legal);
+
setOperationAction(ISD::STORE, MVT::v2i16, Promote);
AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
setOperationAction(ISD::STORE, MVT::v2f16, Promote);
@@ -452,11 +520,38 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
setOperationAction(ISD::XOR, MVT::v2i16, Promote);
AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
- setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
- AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
- setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
- AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
+ setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
+ setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
+
+ setOperationAction(ISD::STORE, MVT::v4i16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
+ setOperationAction(ISD::STORE, MVT::v4f16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
+
+ setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
+ setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
+
+ setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand);
+
+ if (!Subtarget->hasVOP3PInsts()) {
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
+ }
+
+ setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
+ // This isn't really legal, but this avoids the legalizer unrolling it (and
+ // allows matching fneg (fabs x) patterns)
+ setOperationAction(ISD::FABS, MVT::v2f16, Legal);
+ }
+
+ if (Subtarget->hasVOP3PInsts()) {
setOperationAction(ISD::ADD, MVT::v2i16, Legal);
setOperationAction(ISD::SUB, MVT::v2i16, Legal);
setOperationAction(ISD::MUL, MVT::v2i16, Legal);
@@ -469,26 +564,51 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
setOperationAction(ISD::FADD, MVT::v2f16, Legal);
- setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
setOperationAction(ISD::FMA, MVT::v2f16, Legal);
setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);
-
- // This isn't really legal, but this avoids the legalizer unrolling it (and
- // allows matching fneg (fabs x) patterns)
- setOperationAction(ISD::FABS, MVT::v2f16, Legal);
+ setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
- setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
+ setOperationAction(ISD::SHL, MVT::v4i16, Custom);
+ setOperationAction(ISD::SRA, MVT::v4i16, Custom);
+ setOperationAction(ISD::SRL, MVT::v4i16, Custom);
+ setOperationAction(ISD::ADD, MVT::v4i16, Custom);
+ setOperationAction(ISD::SUB, MVT::v4i16, Custom);
+ setOperationAction(ISD::MUL, MVT::v4i16, Custom);
+
+ setOperationAction(ISD::SMIN, MVT::v4i16, Custom);
+ setOperationAction(ISD::SMAX, MVT::v4i16, Custom);
+ setOperationAction(ISD::UMIN, MVT::v4i16, Custom);
+ setOperationAction(ISD::UMAX, MVT::v4i16, Custom);
+
+ setOperationAction(ISD::FADD, MVT::v4f16, Custom);
+ setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
+ setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
+ setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
+
+ setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
+ setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
+ }
+
+ setOperationAction(ISD::FNEG, MVT::v4f16, Custom);
+ setOperationAction(ISD::FABS, MVT::v4f16, Custom);
+
+ if (Subtarget->has16BitInsts()) {
+ setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
+ setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
} else {
+ // Legalization hack.
setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
setOperationAction(ISD::SELECT, MVT::v2f16, Custom);
+
+ setOperationAction(ISD::FNEG, MVT::v2f16, Custom);
+ setOperationAction(ISD::FABS, MVT::v2f16, Custom);
}
for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
@@ -503,6 +623,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FMINNUM);
setTargetDAGCombine(ISD::FMAXNUM);
+ setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::SMIN);
setTargetDAGCombine(ISD::SMAX);
setTargetDAGCombine(ISD::UMIN);
@@ -540,16 +661,33 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
setSchedulingPreference(Sched::RegPressure);
+
+ // SI at least has hardware support for floating point exceptions, but no way
+ // of using or handling them is implemented. They are also optional in OpenCL
+ // (Section 7.3)
+ setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
}
-const SISubtarget *SITargetLowering::getSubtarget() const {
- return static_cast<const SISubtarget *>(Subtarget);
+const GCNSubtarget *SITargetLowering::getSubtarget() const {
+ return Subtarget;
}
//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//
+// v_mad_mix* support a conversion from f16 to f32.
+//
+// There is only one special case where this is OK to use when denormals are
+// enabled, and we don't currently handle it.
+bool SITargetLowering::isFPExtFoldable(unsigned Opcode,
+ EVT DestVT, EVT SrcVT) const {
+ return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
+ (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
+ DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
+ SrcVT.getScalarType() == MVT::f16;
+}
+
bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
// SI has some legal vector types, but no legal vector operations. Say no
// shuffles are legal in order to prefer scalarizing some vector operations.
@@ -560,9 +698,55 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &CI,
MachineFunction &MF,
unsigned IntrID) const {
+ if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
+ AMDGPU::lookupRsrcIntrinsic(IntrID)) {
+ AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
+ (Intrinsic::ID)IntrID);
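+    // ReadNone rsrc intrinsics do not access memory, so no mem operand info is
+    // needed.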
+ if (Attr.hasFnAttribute(Attribute::ReadNone))
+ return false;
+
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ if (RsrcIntr->IsImage) {
+ Info.ptrVal = MFI->getImagePSV(
+ *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
+ CI.getArgOperand(RsrcIntr->RsrcArg));
+ Info.align = 0;
+ } else {
+ Info.ptrVal = MFI->getBufferPSV(
+ *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
+ CI.getArgOperand(RsrcIntr->RsrcArg));
+ }
+
+ Info.flags = MachineMemOperand::MODereferenceable;
+ if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(CI.getType());
+ Info.flags |= MachineMemOperand::MOLoad;
+ } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
+ Info.flags |= MachineMemOperand::MOStore;
+ } else {
+ // Atomic
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(CI.getType());
+ Info.flags = MachineMemOperand::MOLoad |
+ MachineMemOperand::MOStore |
+ MachineMemOperand::MODereferenceable;
+
+ // XXX - Should this be volatile without known ordering?
+ Info.flags |= MachineMemOperand::MOVolatile;
+ }
+ return true;
+ }
+
switch (IntrID) {
case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec: {
+ case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_ds_fadd:
+ case Intrinsic::amdgcn_ds_fmin:
+ case Intrinsic::amdgcn_ds_fmax: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
@@ -575,6 +759,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
+
default:
return false;
}
@@ -585,7 +770,10 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
Type *&AccessTy) const {
switch (II->getIntrinsicID()) {
case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec: {
+ case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_ds_fadd:
+ case Intrinsic::amdgcn_ds_fmin:
+ case Intrinsic::amdgcn_ds_fmax: {
Value *Ptr = II->getArgOperand(0);
AccessTy = II->getType();
Ops.push_back(Ptr);
@@ -675,7 +863,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
if (AS == AMDGPUASI.GLOBAL_ADDRESS)
return isLegalGlobalAddressingMode(AM);
- if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
+ if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
+ AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) {
// If the offset isn't a multiple of 4, it probably isn't going to be
// correctly aligned.
// FIXME: Can we get the real alignment here?
@@ -686,19 +875,19 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
// will use a MUBUF load.
// FIXME?: We also need to do this if unaligned, but we don't know the
// alignment here.
- if (DL.getTypeStoreSize(Ty) < 4)
+ if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
return isLegalGlobalAddressingMode(AM);
- if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
+ if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
// SMRD instructions have an 8-bit, dword offset on SI.
if (!isUInt<8>(AM.BaseOffs / 4))
return false;
- } else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) {
+ } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
// On CI+, this can also be a 32-bit literal constant offset. If it fits
// in 8-bits, it can use a smaller encoding.
if (!isUInt<32>(AM.BaseOffs / 4))
return false;
- } else if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
// On VI, these use the SMEM format and the offset is 20-bit in bytes.
if (!isUInt<20>(AM.BaseOffs))
return false;
@@ -798,7 +987,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
// If we have an uniform constant load, it still requires using a slow
// buffer instruction if unaligned.
if (IsFast) {
- *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS) ?
+ *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS ||
+ AddrSpace == AMDGPUASI.CONSTANT_ADDRESS_32BIT) ?
(Align % 4 == 0) : true;
}
@@ -841,7 +1031,8 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) {
return AS == AMDGPUASI.GLOBAL_ADDRESS ||
AS == AMDGPUASI.FLAT_ADDRESS ||
- AS == AMDGPUASI.CONSTANT_ADDRESS;
+ AS == AMDGPUASI.CONSTANT_ADDRESS ||
+ AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT;
}
bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
@@ -853,7 +1044,7 @@ bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
const MemSDNode *MemNode = cast<MemSDNode>(N);
const Value *Ptr = MemNode->getMemOperand()->getValue();
- const Instruction *I = dyn_cast<Instruction>(Ptr);
+ const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
return I && I->getMetadata("amdgpu.noclobber");
}
@@ -870,7 +1061,7 @@ bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
const MemSDNode *MemNode = cast<MemSDNode>(N);
- return AMDGPU::isUniformMMO(MemNode->getMemOperand());
+ return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
}
TargetLoweringBase::LegalizeTypeAction
@@ -932,14 +1123,13 @@ SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
- return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
- DAG.getConstant(Offset, SL, PtrVT));
+ return DAG.getObjectPtrOffset(SL, BasePtr, Offset);
}
SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
const SDLoc &SL) const {
- auto MFI = DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
- uint64_t Offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
+ uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(),
+ FIRST_IMPLICIT);
return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
}
@@ -966,18 +1156,42 @@ SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
SDValue SITargetLowering::lowerKernargMemParameter(
SelectionDAG &DAG, EVT VT, EVT MemVT,
const SDLoc &SL, SDValue Chain,
- uint64_t Offset, bool Signed,
+ uint64_t Offset, unsigned Align, bool Signed,
const ISD::InputArg *Arg) const {
- const DataLayout &DL = DAG.getDataLayout();
Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
- unsigned Align = DL.getABITypeAlignment(Ty);
+ // Try to avoid using an extload by loading earlier than the argument address,
+ // and extracting the relevant bits. The load should hopefully be merged with
+ // the previous argument.
+ if (MemVT.getStoreSize() < 4 && Align < 4) {
+ // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
+ int64_t AlignDownOffset = alignDown(Offset, 4);
+ int64_t OffsetDiff = Offset - AlignDownOffset;
+
+ EVT IntVT = MemVT.changeTypeToInteger();
+
+ // TODO: If we passed in the base kernel offset we could have a better
+ // alignment than 4, but we don't really need it.
+ SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
+ SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4,
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant);
+
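+    // Shift the dword load right so the argument's bytes land in the low bits,
+    // then truncate and bitcast to the argument's memory type.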
+ SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
+ SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
+
+ SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
+ ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
+ ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
+
+
+ }
SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
- MachineMemOperand::MONonTemporal |
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
@@ -1052,36 +1266,51 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
FunctionType *FType,
SIMachineFunctionInfo *Info) {
for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
- const ISD::InputArg &Arg = Ins[I];
+ const ISD::InputArg *Arg = &Ins[I];
// First check if it's a PS input addr.
- if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() &&
- !Arg.Flags.isByVal() && PSInputNum <= 15) {
+ if (CallConv == CallingConv::AMDGPU_PS &&
+ !Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) {
+
+ bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
+
+ // Inconveniently only the first part of the split is marked as isSplit,
+ // so skip to the end. We only want to increment PSInputNum once for the
+ // entire split argument.
+ if (Arg->Flags.isSplit()) {
+ while (!Arg->Flags.isSplitEnd()) {
+ assert(!Arg->VT.isVector() &&
+ "unexpected vector split in ps argument type");
+ if (!SkipArg)
+ Splits.push_back(*Arg);
+ Arg = &Ins[++I];
+ }
+ }
- if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
+ if (SkipArg) {
// We can safely skip PS inputs.
- Skipped.set(I);
+ Skipped.set(Arg->getOrigArgIndex());
++PSInputNum;
continue;
}
Info->markPSInputAllocated(PSInputNum);
- if (Arg.Used)
+ if (Arg->Used)
Info->markPSInputEnabled(PSInputNum);
++PSInputNum;
}
// Second split vertices into their elements.
- if (Arg.VT.isVector()) {
- ISD::InputArg NewArg = Arg;
+ if (Arg->VT.isVector()) {
+ ISD::InputArg NewArg = *Arg;
NewArg.Flags.setSplit();
- NewArg.VT = Arg.VT.getVectorElementType();
+ NewArg.VT = Arg->VT.getVectorElementType();
// We REALLY want the ORIGINAL number of vertex elements here, e.g. a
// three or five element vertex only needs three or five registers,
// NOT four or eight.
- Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
+ Type *ParamType = FType->getParamType(Arg->getOrigArgIndex());
unsigned NumElements = ParamType->getVectorNumElements();
for (unsigned J = 0; J != NumElements; ++J) {
@@ -1089,7 +1318,7 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
NewArg.PartOffset += NewArg.VT.getStoreSize();
}
} else {
- Splits.push_back(Arg);
+ Splits.push_back(*Arg);
}
}
}
@@ -1347,8 +1576,8 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
// the scratch registers to pass in.
bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- if (ST.isAmdCodeObjectV2(MF)) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ if (ST.isAmdCodeObjectV2(MF.getFunction())) {
if (RequiresStackAccess) {
// If we have stack objects, we unquestionably need the private buffer
// resource. For the Code Object V2 ABI, this will be the first 4 user
@@ -1460,12 +1689,12 @@ SDValue SITargetLowering::LowerFormalArguments(
const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
MachineFunction &MF = DAG.getMachineFunction();
+ const Function &Fn = MF.getFunction();
FunctionType *FType = MF.getFunction().getFunctionType();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
- const Function &Fn = MF.getFunction();
DiagnosticInfoUnsupported NoGraphicsHSA(
Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
DAG.getContext()->diagnose(NoGraphicsHSA);
@@ -1562,9 +1791,16 @@ SDValue SITargetLowering::LowerFormalArguments(
SmallVector<SDValue, 16> Chains;
- for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
+ // FIXME: This is the minimum kernel argument alignment. We should improve
+ // this to the maximum alignment of the arguments.
+ //
+  // FIXME: Alignment of explicit arguments is totally broken with a non-0
+  // explicit kern arg offset.
+ const unsigned KernelArgBaseAlign = 16;
+
+ for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
const ISD::InputArg &Arg = Ins[i];
- if (Skipped[i]) {
+ if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
InVals.push_back(DAG.getUNDEF(Arg.VT));
continue;
}
@@ -1576,19 +1812,16 @@ SDValue SITargetLowering::LowerFormalArguments(
VT = Ins[i].VT;
EVT MemVT = VA.getLocVT();
- const uint64_t Offset = Subtarget->getExplicitKernelArgOffset(MF) +
- VA.getLocMemOffset();
- Info->setABIArgOffset(Offset + MemVT.getStoreSize());
+ const uint64_t Offset = VA.getLocMemOffset();
+ unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
- // The first 36 bytes of the input buffer contains information about
- // thread group and global sizes.
SDValue Arg = lowerKernargMemParameter(
- DAG, VT, MemVT, DL, Chain, Offset, Ins[i].Flags.isSExt(), &Ins[i]);
+ DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
Chains.push_back(Arg.getValue(1));
auto *ParamTy =
dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
- if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
+ if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
// On SI local pointers are just offsets into LDS, so they are always
// less than 16-bits. On CI and newer they could potentially be
@@ -1696,7 +1929,7 @@ SDValue SITargetLowering::LowerFormalArguments(
auto &ArgUsageInfo =
DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
- ArgUsageInfo.setFuncArgInfo(MF.getFunction(), Info->getArgInfo());
+ ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
unsigned StackArgSize = CCInfo.getNextStackOffset();
Info->setBytesInStackArgArea(StackArgSize);
@@ -1841,8 +2074,7 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// FIXME: Does sret work properly?
if (!Info->isEntryFunction()) {
- const SIRegisterInfo *TRI
- = static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo();
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
const MCPhysReg *I =
TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
if (I) {
@@ -1944,8 +2176,7 @@ void SITargetLowering::passSpecialInputs(
SelectionDAG &DAG = CLI.DAG;
const SDLoc &DL = CLI.DL;
- const SISubtarget *ST = getSubtarget();
- const SIRegisterInfo *TRI = ST->getRegisterInfo();
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
auto &ArgUsageInfo =
DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
@@ -2138,6 +2369,13 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
"unsupported required tail call to function ");
}
+ if (AMDGPU::isShader(MF.getFunction().getCallingConv())) {
+ // Note the issue is with the CC of the calling function, not of the call
+ // itself.
+ return lowerUnhandledCall(CLI, InVals,
+ "unsupported call from graphics shader of function ");
+ }
+
// The first 4 bytes are reserved for the callee's emergency stack slot.
const unsigned CalleeUsableStackOffset = 4;
@@ -2383,7 +2621,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
// Add a register mask operand representing the call-preserved registers.
- const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
+ auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
@@ -2443,7 +2681,7 @@ unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
}
- if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
+ if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
report_fatal_error(Twine("invalid register \""
+ StringRef(RegName) + "\" for subtarget."));
@@ -2517,7 +2755,8 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
unsigned PhiReg,
unsigned InitSaveExecReg,
int Offset,
- bool UseGPRIdxMode) {
+ bool UseGPRIdxMode,
+ bool IsIndirectSrc) {
MachineBasicBlock::iterator I = LoopBB.begin();
unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
@@ -2546,6 +2785,12 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
.addReg(CurrentIdxReg)
.addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
+ // Update EXEC, save the original EXEC value to VCC.
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
+ .addReg(CondReg, RegState::Kill);
+
+ MRI.setSimpleHint(NewExec, CondReg);
+
if (UseGPRIdxMode) {
unsigned IdxReg;
if (Offset == 0) {
@@ -2556,11 +2801,13 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
.addReg(CurrentIdxReg, RegState::Kill)
.addImm(Offset);
}
-
- MachineInstr *SetIdx =
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_IDX))
- .addReg(IdxReg, RegState::Kill);
- SetIdx->getOperand(2).setIsUndef();
+ unsigned IdxMode = IsIndirectSrc ?
+ VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
+ MachineInstr *SetOn =
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
+ .addReg(IdxReg, RegState::Kill)
+ .addImm(IdxMode);
+ SetOn->getOperand(3).setIsUndef();
} else {
// Move index from VCC into M0
if (Offset == 0) {
@@ -2573,12 +2820,6 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
}
}
- // Update EXEC, save the original EXEC value to VCC.
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
- .addReg(CondReg, RegState::Kill);
-
- MRI.setSimpleHint(NewExec, CondReg);
-
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
MachineInstr *InsertPt =
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
@@ -2606,7 +2847,8 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
unsigned InitResultReg,
unsigned PhiReg,
int Offset,
- bool UseGPRIdxMode) {
+ bool UseGPRIdxMode,
+ bool IsIndirectSrc) {
MachineFunction *MF = MBB.getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
const DebugLoc &DL = MI.getDebugLoc();
@@ -2645,7 +2887,7 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
InitResultReg, DstReg, PhiReg, TmpExec,
- Offset, UseGPRIdxMode);
+ Offset, UseGPRIdxMode, IsIndirectSrc);
MachineBasicBlock::iterator First = RemainderBB->begin();
BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
@@ -2730,7 +2972,7 @@ static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
// Control flow needs to be inserted if indexing with a VGPR.
static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
MachineBasicBlock &MBB,
- const SISubtarget &ST) {
+ const GCNSubtarget &ST) {
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
MachineFunction *MF = MBB.getParent();
@@ -2780,17 +3022,8 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
- if (UseGPRIdxMode) {
- MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
- .addImm(0) // Reset inside loop.
- .addImm(VGPRIndexMode::SRC0_ENABLE);
- SetOn->getOperand(3).setIsUndef();
-
- // Disable again after the loop.
- BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
- }
-
- auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset, UseGPRIdxMode);
+ auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
+ Offset, UseGPRIdxMode, true);
MachineBasicBlock *LoopBB = InsPt->getParent();
if (UseGPRIdxMode) {
@@ -2798,6 +3031,7 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
.addReg(SrcReg, RegState::Undef, SubReg)
.addReg(SrcReg, RegState::Implicit)
.addReg(AMDGPU::M0, RegState::Implicit);
+ BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
} else {
BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
.addReg(SrcReg, RegState::Undef, SubReg)
@@ -2829,7 +3063,7 @@ static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
MachineBasicBlock &MBB,
- const SISubtarget &ST) {
+ const GCNSubtarget &ST) {
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
MachineFunction *MF = MBB.getParent();
@@ -2898,22 +3132,10 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
const DebugLoc &DL = MI.getDebugLoc();
- if (UseGPRIdxMode) {
- MachineBasicBlock::iterator I(&MI);
-
- MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
- .addImm(0) // Reset inside loop.
- .addImm(VGPRIndexMode::DST_ENABLE);
- SetOn->getOperand(3).setIsUndef();
-
- // Disable again after the loop.
- BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
- }
-
unsigned PhiReg = MRI.createVirtualRegister(VecRC);
auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
- Offset, UseGPRIdxMode);
+ Offset, UseGPRIdxMode, false);
MachineBasicBlock *LoopBB = InsPt->getParent();
if (UseGPRIdxMode) {
@@ -2923,6 +3145,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
.addReg(Dst, RegState::ImplicitDefine)
.addReg(PhiReg, RegState::Implicit)
.addReg(AMDGPU::M0, RegState::Implicit);
+ BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
} else {
const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
@@ -2946,24 +3169,12 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
if (TII->isMIMG(MI)) {
- if (!MI.memoperands_empty())
- return BB;
+ if (MI.memoperands_empty() && MI.mayLoadOrStore()) {
+ report_fatal_error("missing mem operand from MIMG instruction");
+ }
// Add a memoperand for mimg instructions so that they aren't assumed to
// be ordered memory instructions.
- MachinePointerInfo PtrInfo(MFI->getImagePSV());
- MachineMemOperand::Flags Flags = MachineMemOperand::MODereferenceable;
- if (MI.mayStore())
- Flags |= MachineMemOperand::MOStore;
-
- if (MI.mayLoad())
- Flags |= MachineMemOperand::MOLoad;
-
- if (Flags != MachineMemOperand::MODereferenceable) {
- auto MMO = MF->getMachineMemOperand(PtrInfo, Flags, 0, 0);
- MI.addMemOperand(*MF, MMO);
- }
-
return BB;
}
@@ -3145,8 +3356,13 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
case AMDGPU::ADJCALLSTACKDOWN: {
const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
MachineInstrBuilder MIB(*MF, &MI);
+
+ // Add an implicit use of the frame offset reg to prevent the restore copy
+ // inserted after the call from being reordered after stack operations in
+ // the caller's frame.
MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
- .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
+ .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit)
+ .addReg(Info->getFrameOffsetReg(), RegState::Implicit);
return BB;
}
case AMDGPU::SI_CALL_ISEL:
@@ -3236,12 +3452,17 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
VT = VT.getScalarType();
switch (VT.getSimpleVT().SimpleTy) {
- case MVT::f32:
+ case MVT::f32: {
// This is as fast on some subtargets. However, we always have full rate f32
// mad available which returns the same result as the separate operations
// which we should prefer over fma. We can't use this if we want to support
// denormals, so only report this in these cases.
- return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32();
+ if (Subtarget->hasFP32Denormals())
+ return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
+
+ // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
+ return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
+ }
case MVT::f64:
return true;
case MVT::f16:
@@ -3257,6 +3478,49 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//
+// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
+// wider vector type is legal.
+SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned Opc = Op.getOpcode();
+ EVT VT = Op.getValueType();
+ assert(VT == MVT::v4f16);
+
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
+
+ SDLoc SL(Op);
+ SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
+ Op->getFlags());
+ SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
+ Op->getFlags());
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
+}
+
+// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
+// wider vector type is legal.
+SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned Opc = Op.getOpcode();
+ EVT VT = Op.getValueType();
+ assert(VT == MVT::v4i16 || VT == MVT::v4f16);
+
+ SDValue Lo0, Hi0;
+ std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
+ SDValue Lo1, Hi1;
+ std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
+
+ SDLoc SL(Op);
+
+ SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
+ Op->getFlags());
+ SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
+ Op->getFlags());
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
+}
+
SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
@@ -3289,15 +3553,105 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT:
return lowerEXTRACT_VECTOR_ELT(Op, DAG);
+ case ISD::BUILD_VECTOR:
+ return lowerBUILD_VECTOR(Op, DAG);
case ISD::FP_ROUND:
return lowerFP_ROUND(Op, DAG);
case ISD::TRAP:
- case ISD::DEBUGTRAP:
return lowerTRAP(Op, DAG);
+ case ISD::DEBUGTRAP:
+ return lowerDEBUGTRAP(Op, DAG);
+ case ISD::FABS:
+ case ISD::FNEG:
+ return splitUnaryVectorOp(Op, DAG);
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ case ISD::SMIN:
+ case ISD::SMAX:
+ case ISD::UMIN:
+ case ISD::UMAX:
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case ISD::FADD:
+ case ISD::FMUL:
+ return splitBinaryVectorOp(Op, DAG);
}
return SDValue();
}
+static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
+ const SDLoc &DL,
+ SelectionDAG &DAG, bool Unpacked) {
+ if (!LoadVT.isVector())
+ return Result;
+
+ if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
+ // Truncate to v2i16/v4i16.
+ EVT IntLoadVT = LoadVT.changeTypeToInteger();
+
+ // Work around the legalizer not scalarizing truncate after vector op
+ // legalization by not creating an intermediate vector trunc.
+ SmallVector<SDValue, 4> Elts;
+ DAG.ExtractVectorElements(Result, Elts);
+ for (SDValue &Elt : Elts)
+ Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
+
+ Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
+
+ // Bitcast to original type (v2f16/v4f16).
+ return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
+ }
+
+ // Cast back to the original packed type.
+ return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
+}
+
+SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
+ MemSDNode *M,
+ SelectionDAG &DAG,
+ bool IsIntrinsic) const {
+ SDLoc DL(M);
+ SmallVector<SDValue, 10> Ops;
+ Ops.reserve(M->getNumOperands());
+
+ Ops.push_back(M->getOperand(0));
+ if (IsIntrinsic)
+ Ops.push_back(DAG.getConstant(Opcode, DL, MVT::i32));
+
+ // Skip 1, as it is the intrinsic ID.
+ for (unsigned I = 2, E = M->getNumOperands(); I != E; ++I)
+ Ops.push_back(M->getOperand(I));
+
+ bool Unpacked = Subtarget->hasUnpackedD16VMem();
+ EVT LoadVT = M->getValueType(0);
+
+ EVT EquivLoadVT = LoadVT;
+ if (Unpacked && LoadVT.isVector()) {
+ EquivLoadVT = LoadVT.isVector() ?
+ EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ LoadVT.getVectorNumElements()) : LoadVT;
+ }
+
+ // Change from v4f16/v2f16 to EquivLoadVT.
+ SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
+
+ SDValue Load
+ = DAG.getMemIntrinsicNode(
+ IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
+ VTList, Ops, M->getMemoryVT(),
+ M->getMemOperand());
+ if (!Unpacked) // Just adjusted the opcode.
+ return Load;
+
+ SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
+
+ return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
+}
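As a rough standalone sketch of the unpacked-D16 repacking performed by adjustLoadValueTypeImpl above (scalar code with made-up values, not DAG nodes): hardware with unpacked D16 memory returns one 32-bit dword per f16 lane, and the low 16 bits of each dword are truncated and packed back into the original v2f16 bit pattern.

    #include <cstdint>
    #include <cstdio>

    int main() {
      // One dword per f16 lane, as an unpacked-D16 load would return them.
      uint32_t Unpacked[2] = {0x00003c00, 0x00004000}; // 1.0h, 2.0h
      // Truncate each dword to 16 bits and repack into the v2f16 bit pattern.
      uint32_t Packed = (Unpacked[0] & 0xffff) | ((Unpacked[1] & 0xffff) << 16);
      std::printf("%08x\n", Packed); // prints 40003c00, i.e. <1.0, 2.0> as v2f16
      return 0;
    }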
+
void SITargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
@@ -3314,7 +3668,8 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
}
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
- if (IID == Intrinsic::amdgcn_cvt_pkrtz) {
+ switch (IID) {
+ case Intrinsic::amdgcn_cvt_pkrtz: {
SDValue Src0 = N->getOperand(1);
SDValue Src1 = N->getOperand(2);
SDLoc SL(N);
@@ -3323,6 +3678,38 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
return;
}
+ case Intrinsic::amdgcn_cvt_pknorm_i16:
+ case Intrinsic::amdgcn_cvt_pknorm_u16:
+ case Intrinsic::amdgcn_cvt_pk_i16:
+ case Intrinsic::amdgcn_cvt_pk_u16: {
+ SDValue Src0 = N->getOperand(1);
+ SDValue Src1 = N->getOperand(2);
+ SDLoc SL(N);
+ unsigned Opcode;
+
+ if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
+ Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
+ else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
+ Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
+ else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
+ Opcode = AMDGPUISD::CVT_PK_I16_I32;
+ else
+ Opcode = AMDGPUISD::CVT_PK_U16_U32;
+
+ SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
+ Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
+ return;
+ }
+ }
+ break;
+ }
+ case ISD::INTRINSIC_W_CHAIN: {
+ if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(1));
+ return;
+ }
+
break;
}
case ISD::SELECT: {
@@ -3347,12 +3734,38 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
return;
}
+ case ISD::FNEG: {
+ if (N->getValueType(0) != MVT::v2f16)
+ break;
+
+ SDLoc SL(N);
+ SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
+
+ SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
+ BC,
+ DAG.getConstant(0x80008000, SL, MVT::i32));
+ Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
+ return;
+ }
+ case ISD::FABS: {
+ if (N->getValueType(0) != MVT::v2f16)
+ break;
+
+ SDLoc SL(N);
+ SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
+
+ SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
+ BC,
+ DAG.getConstant(0x7fff7fff, SL, MVT::i32));
+ Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
+ return;
+ }
default:
break;
}
}
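A standalone illustration of the v2f16 FNEG/FABS expansions above (the input value is invented for the example): both f16 sign bits sit at bit 15 of their half, so negation is an i32 XOR with 0x80008000 and fabs is an AND with 0x7fff7fff.

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t V = 0xc0003c00;                // <1.0, -2.0> packed as v2f16 bits
      std::printf("%08x\n", V ^ 0x80008000u); // 4000bc00 = <-1.0, 2.0> (fneg)
      std::printf("%08x\n", V & 0x7fff7fffu); // 40003c00 = < 1.0,  2.0> (fabs)
      return 0;
    }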
-/// \brief Helper function for LowerBRCOND
+/// Helper function for LowerBRCOND
static SDNode *findUser(SDValue Value, unsigned Opcode) {
SDNode *Parent = Value.getNode();
@@ -3417,13 +3830,15 @@ void SITargetLowering::createDebuggerPrologueStackObjects(
bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
const Triple &TT = getTargetMachine().getTargetTriple();
- return GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
+ return (GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+ GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
AMDGPU::shouldEmitConstantsToTextSection(TT);
}
bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
- GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
+ GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+ GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
!shouldEmitFixup(GV) &&
!getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
}
@@ -3560,40 +3975,37 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
- MachineFunction &MF = DAG.getMachineFunction();
SDValue Chain = Op.getOperand(0);
- unsigned TrapID = Op.getOpcode() == ISD::DEBUGTRAP ?
- SISubtarget::TrapIDLLVMDebugTrap : SISubtarget::TrapIDLLVMTrap;
-
- if (Subtarget->getTrapHandlerAbi() == SISubtarget::TrapHandlerAbiHsa &&
- Subtarget->isTrapHandlerEnabled()) {
- SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- unsigned UserSGPR = Info->getQueuePtrUserSGPR();
- assert(UserSGPR != AMDGPU::NoRegister);
-
- SDValue QueuePtr = CreateLiveInRegister(
- DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
-
- SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
-
- SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
- QueuePtr, SDValue());
+ if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
+ !Subtarget->isTrapHandlerEnabled())
+ return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
- SDValue Ops[] = {
- ToReg,
- DAG.getTargetConstant(TrapID, SL, MVT::i16),
- SGPR01,
- ToReg.getValue(1)
- };
+ MachineFunction &MF = DAG.getMachineFunction();
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ unsigned UserSGPR = Info->getQueuePtrUserSGPR();
+ assert(UserSGPR != AMDGPU::NoRegister);
+ SDValue QueuePtr = CreateLiveInRegister(
+ DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
+ SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
+ SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
+ QueuePtr, SDValue());
+ SDValue Ops[] = {
+ ToReg,
+ DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16),
+ SGPR01,
+ ToReg.getValue(1)
+ };
+ return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
+}
- return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
- }
+SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue Chain = Op.getOperand(0);
+ MachineFunction &MF = DAG.getMachineFunction();
- switch (TrapID) {
- case SISubtarget::TrapIDLLVMTrap:
- return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
- case SISubtarget::TrapIDLLVMDebugTrap: {
+ if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
+ !Subtarget->isTrapHandlerEnabled()) {
DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
"debugtrap handler not supported",
Op.getDebugLoc(),
@@ -3602,11 +4014,12 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
Ctx.diagnose(NoTrap);
return Chain;
}
- default:
- llvm_unreachable("unsupported trap handler type!");
- }
- return Chain;
+ SDValue Ops[] = {
+ Chain,
+ DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16)
+ };
+ return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
@@ -3719,34 +4132,78 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
+ SDValue Vec = Op.getOperand(0);
+ SDValue InsVal = Op.getOperand(1);
SDValue Idx = Op.getOperand(2);
+ EVT VecVT = Vec.getValueType();
+ EVT EltVT = VecVT.getVectorElementType();
+ unsigned VecSize = VecVT.getSizeInBits();
+ unsigned EltSize = EltVT.getSizeInBits();
+
+
+ assert(VecSize <= 64);
+
+ unsigned NumElts = VecVT.getVectorNumElements();
+ SDLoc SL(Op);
+ auto KIdx = dyn_cast<ConstantSDNode>(Idx);
+
+ if (NumElts == 4 && EltSize == 16 && KIdx) {
+ SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
+
+ SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
+ DAG.getConstant(0, SL, MVT::i32));
+ SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
+ DAG.getConstant(1, SL, MVT::i32));
+
+ SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
+ SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
+
+ unsigned Idx = KIdx->getZExtValue();
+ bool InsertLo = Idx < 2;
+ SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
+ InsertLo ? LoVec : HiVec,
+ DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
+ DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
+
+ InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
+
+ SDValue Concat = InsertLo ?
+ DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
+ DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
+
+ return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
+ }
+
if (isa<ConstantSDNode>(Idx))
return SDValue();
+ MVT IntVT = MVT::getIntegerVT(VecSize);
+
// Avoid stack access for dynamic indexing.
- SDLoc SL(Op);
- SDValue Vec = Op.getOperand(0);
- SDValue Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Op.getOperand(1));
+ SDValue Val = InsVal;
+ if (InsVal.getValueType() == MVT::f16)
+ Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal);
// v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
- SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Val);
+ SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val);
- // Convert vector index to bit-index.
- SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx,
- DAG.getConstant(16, SL, MVT::i32));
+ assert(isPowerOf2_32(EltSize));
+ SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
- SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
+ // Convert vector index to bit-index.
+ SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
- SDValue BFM = DAG.getNode(ISD::SHL, SL, MVT::i32,
- DAG.getConstant(0xffff, SL, MVT::i32),
+ SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
+ SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
+ DAG.getConstant(0xffff, SL, IntVT),
ScaledIdx);
- SDValue LHS = DAG.getNode(ISD::AND, SL, MVT::i32, BFM, ExtVal);
- SDValue RHS = DAG.getNode(ISD::AND, SL, MVT::i32,
- DAG.getNOT(SL, BFM, MVT::i32), BCVec);
+ SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
+ SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
+ DAG.getNOT(SL, BFM, IntVT), BCVec);
- SDValue BFI = DAG.getNode(ISD::OR, SL, MVT::i32, LHS, RHS);
- return DAG.getNode(ISD::BITCAST, SL, Op.getValueType(), BFI);
+ SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
+ return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
}
SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
@@ -3756,51 +4213,87 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
EVT ResultVT = Op.getValueType();
SDValue Vec = Op.getOperand(0);
SDValue Idx = Op.getOperand(1);
+ EVT VecVT = Vec.getValueType();
+ unsigned VecSize = VecVT.getSizeInBits();
+ EVT EltVT = VecVT.getVectorElementType();
+ assert(VecSize <= 64);
DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
- // Make sure we we do any optimizations that will make it easier to fold
+ // Make sure we do any optimizations that will make it easier to fold
// source modifiers before obscuring it with bit operations.
// XXX - Why doesn't this get called when vector_shuffle is expanded?
if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
return Combined;
- if (const ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
- SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
+ unsigned EltSize = EltVT.getSizeInBits();
+ assert(isPowerOf2_32(EltSize));
- if (CIdx->getZExtValue() == 1) {
- Result = DAG.getNode(ISD::SRL, SL, MVT::i32, Result,
- DAG.getConstant(16, SL, MVT::i32));
- } else {
- assert(CIdx->getZExtValue() == 0);
- }
+ MVT IntVT = MVT::getIntegerVT(VecSize);
+ SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
+
+ // Convert vector index to bit-index (* EltSize)
+ SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
- if (ResultVT.bitsLT(MVT::i32))
- Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);
+ SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
+ SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
+
+ if (ResultVT == MVT::f16) {
+ SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
}
- SDValue Sixteen = DAG.getConstant(16, SL, MVT::i32);
+ return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
+}
- // Convert vector index to bit-index.
- SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, Sixteen);
+SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ EVT VT = Op.getValueType();
+
+ if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
+
+ // Turn into pair of packed build_vectors.
+ // TODO: Special case for constants that can be materialized with s_mov_b64.
+ SDValue Lo = DAG.getBuildVector(HalfVT, SL,
+ { Op.getOperand(0), Op.getOperand(1) });
+ SDValue Hi = DAG.getBuildVector(HalfVT, SL,
+ { Op.getOperand(2), Op.getOperand(3) });
+
+ SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
+ SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
+
+ SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
+ return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
+ }
- SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
- SDValue Elt = DAG.getNode(ISD::SRL, SL, MVT::i32, BC, ScaledIdx);
+ assert(VT == MVT::v2f16 || VT == MVT::v2i16);
- SDValue Result = Elt;
- if (ResultVT.bitsLT(MVT::i32))
- Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);
+ SDValue Lo = Op.getOperand(0);
+ SDValue Hi = Op.getOperand(1);
- return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
+ Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
+ Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
+
+ Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
+ Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
+
+ SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
+ DAG.getConstant(16, SL, MVT::i32));
+
+ SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
+
+ return DAG.getNode(ISD::BITCAST, SL, VT, Or);
}
bool
SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
// We can fold offsets for anything that doesn't require a GOT relocation.
return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
- GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
+ GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+ GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
!shouldEmitGOTReloc(GA->getGlobal());
}
@@ -3853,6 +4346,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
const GlobalValue *GV = GSD->getGlobal();
if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
+ GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT &&
GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS &&
// FIXME: It isn't correct to rely on the type of the pointer. This should
// be removed when address space 0 is 64-bit.
@@ -3905,7 +4399,7 @@ SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
unsigned Offset) const {
SDLoc SL(Op);
SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
- DAG.getEntryNode(), Offset, false);
+ DAG.getEntryNode(), Offset, 4, false);
// The local size values will have the hi 16-bits as zero.
return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
DAG.getValueType(VT));
@@ -3929,6 +4423,245 @@ static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
return DAG.getUNDEF(VT);
}
+static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
+ ArrayRef<SDValue> Elts) {
+ assert(!Elts.empty());
+ MVT Type;
+ unsigned NumElts;
+
+ if (Elts.size() == 1) {
+ Type = MVT::f32;
+ NumElts = 1;
+ } else if (Elts.size() == 2) {
+ Type = MVT::v2f32;
+ NumElts = 2;
+ } else if (Elts.size() <= 4) {
+ Type = MVT::v4f32;
+ NumElts = 4;
+ } else if (Elts.size() <= 8) {
+ Type = MVT::v8f32;
+ NumElts = 8;
+ } else {
+ assert(Elts.size() <= 16);
+ Type = MVT::v16f32;
+ NumElts = 16;
+ }
+
+ SmallVector<SDValue, 16> VecElts(NumElts);
+ for (unsigned i = 0; i < Elts.size(); ++i) {
+ SDValue Elt = Elts[i];
+ if (Elt.getValueType() != MVT::f32)
+ Elt = DAG.getBitcast(MVT::f32, Elt);
+ VecElts[i] = Elt;
+ }
+ for (unsigned i = Elts.size(); i < NumElts; ++i)
+ VecElts[i] = DAG.getUNDEF(MVT::f32);
+
+ if (NumElts == 1)
+ return VecElts[0];
+ return DAG.getBuildVector(Type, DL, VecElts);
+}
+
+static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
+ SDValue *GLC, SDValue *SLC) {
+ auto CachePolicyConst = dyn_cast<ConstantSDNode>(CachePolicy.getNode());
+ if (!CachePolicyConst)
+ return false;
+
+ uint64_t Value = CachePolicyConst->getZExtValue();
+ SDLoc DL(CachePolicy);
+ if (GLC) {
+ *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
+ Value &= ~(uint64_t)0x1;
+ }
+ if (SLC) {
+ *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
+ Value &= ~(uint64_t)0x2;
+ }
+
+ return Value == 0;
+}
+
+SDValue SITargetLowering::lowerImage(SDValue Op,
+ const AMDGPU::ImageDimIntrinsicInfo *Intr,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ MachineFunction &MF = DAG.getMachineFunction();
+ const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
+ AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
+ const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
+
+ SmallVector<EVT, 2> ResultTypes(Op->value_begin(), Op->value_end());
+ bool IsD16 = false;
+ SDValue VData;
+ int NumVDataDwords;
+ unsigned AddrIdx; // Index of first address argument
+ unsigned DMask;
+
+ if (BaseOpcode->Atomic) {
+ VData = Op.getOperand(2);
+
+ bool Is64Bit = VData.getValueType() == MVT::i64;
+ if (BaseOpcode->AtomicX2) {
+ SDValue VData2 = Op.getOperand(3);
+ VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
+ {VData, VData2});
+ if (Is64Bit)
+ VData = DAG.getBitcast(MVT::v4i32, VData);
+
+ ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
+ DMask = Is64Bit ? 0xf : 0x3;
+ NumVDataDwords = Is64Bit ? 4 : 2;
+ AddrIdx = 4;
+ } else {
+ DMask = Is64Bit ? 0x3 : 0x1;
+ NumVDataDwords = Is64Bit ? 2 : 1;
+ AddrIdx = 3;
+ }
+ } else {
+ unsigned DMaskIdx;
+
+ if (BaseOpcode->Store) {
+ VData = Op.getOperand(2);
+
+ MVT StoreVT = VData.getSimpleValueType();
+ if (StoreVT.getScalarType() == MVT::f16) {
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
+ !BaseOpcode->HasD16)
+ return Op; // D16 is unsupported for this instruction
+
+ IsD16 = true;
+ VData = handleD16VData(VData, DAG);
+ }
+
+ NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
+ DMaskIdx = 3;
+ } else {
+ MVT LoadVT = Op.getSimpleValueType();
+ if (LoadVT.getScalarType() == MVT::f16) {
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
+ !BaseOpcode->HasD16)
+ return Op; // D16 is unsupported for this instruction
+
+ IsD16 = true;
+ if (LoadVT.isVector() && Subtarget->hasUnpackedD16VMem())
+ ResultTypes[0] = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32;
+ }
+
+ NumVDataDwords = (ResultTypes[0].getSizeInBits() + 31) / 32;
+ DMaskIdx = isa<MemSDNode>(Op) ? 2 : 1;
+ }
+
+ auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
+ if (!DMaskConst)
+ return Op;
+
+ AddrIdx = DMaskIdx + 1;
+ DMask = DMaskConst->getZExtValue();
+ if (!DMask && !BaseOpcode->Store) {
+ // Eliminate no-op loads. Stores with dmask == 0 are *not* no-op: they
+ // store the channels' default values.
+ SDValue Undef = DAG.getUNDEF(Op.getValueType());
+ if (isa<MemSDNode>(Op))
+ return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
+ return Undef;
+ }
+ }
+
+ unsigned NumVAddrs = BaseOpcode->NumExtraArgs +
+ (BaseOpcode->Gradients ? DimInfo->NumGradients : 0) +
+ (BaseOpcode->Coordinates ? DimInfo->NumCoords : 0) +
+ (BaseOpcode->LodOrClampOrMip ? 1 : 0);
+ SmallVector<SDValue, 4> VAddrs;
+ for (unsigned i = 0; i < NumVAddrs; ++i)
+ VAddrs.push_back(Op.getOperand(AddrIdx + i));
+ SDValue VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
+
+ SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
+ SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
+ unsigned CtrlIdx; // Index of texfailctrl argument
+ SDValue Unorm;
+ if (!BaseOpcode->Sampler) {
+ Unorm = True;
+ CtrlIdx = AddrIdx + NumVAddrs + 1;
+ } else {
+ auto UnormConst =
+ dyn_cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2));
+ if (!UnormConst)
+ return Op;
+
+ Unorm = UnormConst->getZExtValue() ? True : False;
+ CtrlIdx = AddrIdx + NumVAddrs + 3;
+ }
+
+ SDValue TexFail = Op.getOperand(CtrlIdx);
+ auto TexFailConst = dyn_cast<ConstantSDNode>(TexFail.getNode());
+ if (!TexFailConst || TexFailConst->getZExtValue() != 0)
+ return Op;
+
+ SDValue GLC;
+ SDValue SLC;
+ if (BaseOpcode->Atomic) {
+ GLC = True; // TODO no-return optimization
+ if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC))
+ return Op;
+ } else {
+ if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC))
+ return Op;
+ }
+
+ SmallVector<SDValue, 14> Ops;
+ if (BaseOpcode->Store || BaseOpcode->Atomic)
+ Ops.push_back(VData); // vdata
+ Ops.push_back(VAddr);
+ Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs)); // rsrc
+ if (BaseOpcode->Sampler)
+ Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs + 1)); // sampler
+ Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
+ Ops.push_back(Unorm);
+ Ops.push_back(GLC);
+ Ops.push_back(SLC);
+ Ops.push_back(False); // r128
+ Ops.push_back(False); // tfe
+ Ops.push_back(False); // lwe
+ Ops.push_back(DimInfo->DA ? True : False);
+ if (BaseOpcode->HasD16)
+ Ops.push_back(IsD16 ? True : False);
+ if (isa<MemSDNode>(Op))
+ Ops.push_back(Op.getOperand(0)); // chain
+
+ int NumVAddrDwords = VAddr.getValueType().getSizeInBits() / 32;
+ int Opcode = -1;
+
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ Opcode = AMDGPU::getMIMGOpcode(Intr->BaseOpcode, AMDGPU::MIMGEncGfx8,
+ NumVDataDwords, NumVAddrDwords);
+ if (Opcode == -1)
+ Opcode = AMDGPU::getMIMGOpcode(Intr->BaseOpcode, AMDGPU::MIMGEncGfx6,
+ NumVDataDwords, NumVAddrDwords);
+ assert(Opcode != -1);
+
+ MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
+ if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
+ MachineInstr::mmo_iterator MemRefs = MF.allocateMemRefsArray(1);
+ *MemRefs = MemOp->getMemOperand();
+ NewNode->setMemRefs(MemRefs, MemRefs + 1);
+ }
+
+ if (BaseOpcode->AtomicX2) {
+ SmallVector<SDValue, 1> Elt;
+ DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
+ return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
+ } else if (IsD16 && !BaseOpcode->Store) {
+ MVT LoadVT = Op.getSimpleValueType();
+ SDValue Adjusted = adjustLoadValueTypeImpl(
+ SDValue(NewNode, 0), LoadVT, DL, DAG, Subtarget->hasUnpackedD16VMem());
+ return DAG.getMergeValues({Adjusted, SDValue(NewNode, 1)}, DL);
+ }
+
+ return SDValue(NewNode, 0);
+}
+
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
@@ -3942,14 +4675,14 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
switch (IntrinsicID) {
case Intrinsic::amdgcn_implicit_buffer_ptr: {
- if (getSubtarget()->isAmdCodeObjectV2(MF))
+ if (getSubtarget()->isAmdCodeObjectV2(MF.getFunction()))
return emitNonHSAIntrinsicError(DAG, DL, VT);
return getPreloadedValue(DAG, *MFI, VT,
AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
}
case Intrinsic::amdgcn_dispatch_ptr:
case Intrinsic::amdgcn_queue_ptr: {
- if (!Subtarget->isAmdCodeObjectV2(MF)) {
+ if (!Subtarget->isAmdCodeObjectV2(MF.getFunction())) {
DiagnosticInfoUnsupported BadIntrin(
MF.getFunction(), "unsupported hsa intrinsic without hsa target",
DL.getDebugLoc());
@@ -3979,16 +4712,16 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_rsq:
return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
case Intrinsic::amdgcn_rsq_legacy:
- if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
return emitRemovedIntrinsicError(DAG, DL, VT);
return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
case Intrinsic::amdgcn_rcp_legacy:
- if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
return emitRemovedIntrinsicError(DAG, DL, VT);
return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
case Intrinsic::amdgcn_rsq_clamp: {
- if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
Type *Type = VT.getTypeForEVT(*DAG.getContext());
@@ -4006,37 +4739,37 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::NGROUPS_X, false);
+ SI::KernelInputOffsets::NGROUPS_X, 4, false);
case Intrinsic::r600_read_ngroups_y:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::NGROUPS_Y, false);
+ SI::KernelInputOffsets::NGROUPS_Y, 4, false);
case Intrinsic::r600_read_ngroups_z:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::NGROUPS_Z, false);
+ SI::KernelInputOffsets::NGROUPS_Z, 4, false);
case Intrinsic::r600_read_global_size_x:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
+ SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false);
case Intrinsic::r600_read_global_size_y:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
+ SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false);
case Intrinsic::r600_read_global_size_z:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
+ SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false);
case Intrinsic::r600_read_local_size_x:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
@@ -4125,7 +4858,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
case Intrinsic::amdgcn_log_clamp: {
- if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
return SDValue();
DiagnosticInfoUnsupported BadIntrin(
@@ -4210,6 +4943,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_fmed3:
return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+ case Intrinsic::amdgcn_fdot2:
+ return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_fmul_legacy:
return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
Op.getOperand(1), Op.getOperand(2));
@@ -4221,10 +4957,27 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_ubfe:
return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
- case Intrinsic::amdgcn_cvt_pkrtz: {
- // FIXME: Stop adding cast if v2f16 legal.
+ case Intrinsic::amdgcn_cvt_pkrtz:
+ case Intrinsic::amdgcn_cvt_pknorm_i16:
+ case Intrinsic::amdgcn_cvt_pknorm_u16:
+ case Intrinsic::amdgcn_cvt_pk_i16:
+ case Intrinsic::amdgcn_cvt_pk_u16: {
+ // FIXME: Stop adding cast if v2f16/v2i16 are legal.
EVT VT = Op.getValueType();
- SDValue Node = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, MVT::i32,
+ unsigned Opcode;
+
+ if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
+ Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
+ else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
+ Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
+ else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
+ Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
+ else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
+ Opcode = AMDGPUISD::CVT_PK_I16_I32;
+ else
+ Opcode = AMDGPUISD::CVT_PK_U16_U32;
+
+ SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
Op.getOperand(1), Op.getOperand(2));
return DAG.getNode(ISD::BITCAST, DL, VT, Node);
}
@@ -4238,17 +4991,14 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
0);
}
- case Intrinsic::amdgcn_image_getlod:
- case Intrinsic::amdgcn_image_getresinfo: {
- unsigned Idx = (IntrinsicID == Intrinsic::amdgcn_image_getresinfo) ? 3 : 4;
-
- // Replace dmask with everything disabled with undef.
- const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(Idx));
- if (!DMask || DMask->isNullValue())
- return DAG.getUNDEF(Op.getValueType());
- return SDValue();
- }
+ case Intrinsic::amdgcn_fmad_ftz:
+ return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
+ Op.getOperand(2), Op.getOperand(3));
default:
+ if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
+ AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
+ return lowerImage(Op, ImageDimIntr, DAG);
+
return Op;
}
}
@@ -4257,14 +5007,34 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
SDLoc DL(Op);
- MachineFunction &MF = DAG.getMachineFunction();
switch (IntrID) {
case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec: {
+ case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_ds_fadd:
+ case Intrinsic::amdgcn_ds_fmin:
+ case Intrinsic::amdgcn_ds_fmax: {
MemSDNode *M = cast<MemSDNode>(Op);
- unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ?
- AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC;
+ unsigned Opc;
+ switch (IntrID) {
+ case Intrinsic::amdgcn_atomic_inc:
+ Opc = AMDGPUISD::ATOMIC_INC;
+ break;
+ case Intrinsic::amdgcn_atomic_dec:
+ Opc = AMDGPUISD::ATOMIC_DEC;
+ break;
+ case Intrinsic::amdgcn_ds_fadd:
+ Opc = AMDGPUISD::ATOMIC_LOAD_FADD;
+ break;
+ case Intrinsic::amdgcn_ds_fmin:
+ Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
+ break;
+ case Intrinsic::amdgcn_ds_fmax:
+ Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
+ break;
+ default:
+ llvm_unreachable("Unknown intrinsic!");
+ }
SDValue Ops[] = {
M->getOperand(0), // Chain
M->getOperand(2), // Ptr
@@ -4284,21 +5054,28 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(5), // glc
Op.getOperand(6) // slc
};
- SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
EVT VT = Op.getValueType();
EVT IntVT = VT.changeTypeToInteger();
+ auto *M = cast<MemSDNode>(Op);
+ EVT LoadVT = Op.getValueType();
+ bool IsD16 = LoadVT.getScalarType() == MVT::f16;
+ if (IsD16)
+ return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG);
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(MFI->getBufferPSV()),
- MachineMemOperand::MOLoad,
- VT.getStoreSize(), VT.getStoreSize());
-
- return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO);
+ return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
+ M->getMemOperand());
}
case Intrinsic::amdgcn_tbuffer_load: {
+ MemSDNode *M = cast<MemSDNode>(Op);
+ EVT LoadVT = Op.getValueType();
+ bool IsD16 = LoadVT.getScalarType() == MVT::f16;
+ if (IsD16) {
+ return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG);
+ }
+
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // rsrc
@@ -4312,14 +5089,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(10) // slc
};
- EVT VT = Op.getOperand(2).getValueType();
-
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(),
- MachineMemOperand::MOLoad,
- VT.getStoreSize(), VT.getStoreSize());
return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
- Op->getVTList(), Ops, VT, MMO);
+ Op->getVTList(), Ops, LoadVT,
+ M->getMemOperand());
}
case Intrinsic::amdgcn_buffer_atomic_swap:
case Intrinsic::amdgcn_buffer_atomic_add:
@@ -4339,14 +5111,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(5), // offset
Op.getOperand(6) // slc
};
- EVT VT = Op.getOperand(3).getValueType();
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(),
- MachineMemOperand::MOLoad |
- MachineMemOperand::MOStore |
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOVolatile,
- VT.getStoreSize(), 4);
+ EVT VT = Op.getValueType();
+
+ auto *M = cast<MemSDNode>(Op);
unsigned Opcode = 0;
switch (IntrID) {
@@ -4384,7 +5151,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
llvm_unreachable("unhandled atomic opcode");
}
- return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, MMO);
+ return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
+ M->getMemOperand());
}
case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
@@ -4397,78 +5165,46 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(6), // offset
Op.getOperand(7) // slc
};
- EVT VT = Op.getOperand(4).getValueType();
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(),
- MachineMemOperand::MOLoad |
- MachineMemOperand::MOStore |
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOVolatile,
- VT.getStoreSize(), 4);
+ EVT VT = Op.getValueType();
+ auto *M = cast<MemSDNode>(Op);
return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
- Op->getVTList(), Ops, VT, MMO);
+ Op->getVTList(), Ops, VT, M->getMemOperand());
}
- // Basic sample.
- case Intrinsic::amdgcn_image_sample:
- case Intrinsic::amdgcn_image_sample_cl:
- case Intrinsic::amdgcn_image_sample_d:
- case Intrinsic::amdgcn_image_sample_d_cl:
- case Intrinsic::amdgcn_image_sample_l:
- case Intrinsic::amdgcn_image_sample_b:
- case Intrinsic::amdgcn_image_sample_b_cl:
- case Intrinsic::amdgcn_image_sample_lz:
- case Intrinsic::amdgcn_image_sample_cd:
- case Intrinsic::amdgcn_image_sample_cd_cl:
-
- // Sample with comparison.
- case Intrinsic::amdgcn_image_sample_c:
- case Intrinsic::amdgcn_image_sample_c_cl:
- case Intrinsic::amdgcn_image_sample_c_d:
- case Intrinsic::amdgcn_image_sample_c_d_cl:
- case Intrinsic::amdgcn_image_sample_c_l:
- case Intrinsic::amdgcn_image_sample_c_b:
- case Intrinsic::amdgcn_image_sample_c_b_cl:
- case Intrinsic::amdgcn_image_sample_c_lz:
- case Intrinsic::amdgcn_image_sample_c_cd:
- case Intrinsic::amdgcn_image_sample_c_cd_cl:
-
- // Sample with offsets.
- case Intrinsic::amdgcn_image_sample_o:
- case Intrinsic::amdgcn_image_sample_cl_o:
- case Intrinsic::amdgcn_image_sample_d_o:
- case Intrinsic::amdgcn_image_sample_d_cl_o:
- case Intrinsic::amdgcn_image_sample_l_o:
- case Intrinsic::amdgcn_image_sample_b_o:
- case Intrinsic::amdgcn_image_sample_b_cl_o:
- case Intrinsic::amdgcn_image_sample_lz_o:
- case Intrinsic::amdgcn_image_sample_cd_o:
- case Intrinsic::amdgcn_image_sample_cd_cl_o:
-
- // Sample with comparison and offsets.
- case Intrinsic::amdgcn_image_sample_c_o:
- case Intrinsic::amdgcn_image_sample_c_cl_o:
- case Intrinsic::amdgcn_image_sample_c_d_o:
- case Intrinsic::amdgcn_image_sample_c_d_cl_o:
- case Intrinsic::amdgcn_image_sample_c_l_o:
- case Intrinsic::amdgcn_image_sample_c_b_o:
- case Intrinsic::amdgcn_image_sample_c_b_cl_o:
- case Intrinsic::amdgcn_image_sample_c_lz_o:
- case Intrinsic::amdgcn_image_sample_c_cd_o:
- case Intrinsic::amdgcn_image_sample_c_cd_cl_o: {
- // Replace dmask with everything disabled with undef.
- const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(5));
- if (!DMask || DMask->isNullValue()) {
- SDValue Undef = DAG.getUNDEF(Op.getValueType());
- return DAG.getMergeValues({ Undef, Op.getOperand(0) }, SDLoc(Op));
- }
+ default:
+ if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
+ AMDGPU::getImageDimIntrinsicInfo(IntrID))
+ return lowerImage(Op, ImageDimIntr, DAG);
return SDValue();
}
- default:
- return SDValue();
+}
+
+SDValue SITargetLowering::handleD16VData(SDValue VData,
+ SelectionDAG &DAG) const {
+ EVT StoreVT = VData.getValueType();
+
+ // No change for f16 and legal vector D16 types.
+ if (!StoreVT.isVector())
+ return VData;
+
+ SDLoc DL(VData);
+ assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");
+
+ if (Subtarget->hasUnpackedD16VMem()) {
+ // We need to unpack the packed data to store.
+ EVT IntStoreVT = StoreVT.changeTypeToInteger();
+ SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
+
+ EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ StoreVT.getVectorNumElements());
+ SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
+ return DAG.UnrollVectorOp(ZExt.getNode());
}
+
+ assert(isTypeLegal(StoreVT));
+ return VData;
}
SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
@@ -4558,7 +5294,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
}
case Intrinsic::amdgcn_s_barrier: {
if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
if (WGSize <= ST.getWavefrontSize())
return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
@@ -4613,9 +5349,13 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
}
case Intrinsic::amdgcn_tbuffer_store: {
+ SDValue VData = Op.getOperand(2);
+ bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
+ if (IsD16)
+ VData = handleD16VData(VData, DAG);
SDValue Ops[] = {
Chain,
- Op.getOperand(2), // vdata
+ VData, // vdata
Op.getOperand(3), // rsrc
Op.getOperand(4), // vindex
Op.getOperand(5), // voffset
@@ -4626,42 +5366,133 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Op.getOperand(10), // glc
Op.getOperand(11) // slc
};
- EVT VT = Op.getOperand(3).getValueType();
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(),
- MachineMemOperand::MOStore,
- VT.getStoreSize(), 4);
- return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
- Op->getVTList(), Ops, VT, MMO);
+ unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
+ AMDGPUISD::TBUFFER_STORE_FORMAT;
+ MemSDNode *M = cast<MemSDNode>(Op);
+ return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
+ M->getMemoryVT(), M->getMemOperand());
}
case Intrinsic::amdgcn_buffer_store:
case Intrinsic::amdgcn_buffer_store_format: {
+ SDValue VData = Op.getOperand(2);
+ bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
+ if (IsD16)
+ VData = handleD16VData(VData, DAG);
SDValue Ops[] = {
Chain,
- Op.getOperand(2), // vdata
+ VData, // vdata
Op.getOperand(3), // rsrc
Op.getOperand(4), // vindex
Op.getOperand(5), // offset
Op.getOperand(6), // glc
Op.getOperand(7) // slc
};
- EVT VT = Op.getOperand(3).getValueType();
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(),
- MachineMemOperand::MOStore |
- MachineMemOperand::MODereferenceable,
- VT.getStoreSize(), 4);
+ unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
+ AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
+ Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
+ MemSDNode *M = cast<MemSDNode>(Op);
+ return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
+ M->getMemoryVT(), M->getMemOperand());
+ }
+ default: {
+ if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
+ AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
+ return lowerImage(Op, ImageDimIntr, DAG);
- unsigned Opcode = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
- AMDGPUISD::BUFFER_STORE :
- AMDGPUISD::BUFFER_STORE_FORMAT;
- return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, MMO);
+ return Op;
}
+ }
+}
- default:
+static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
+ ISD::LoadExtType ExtType, SDValue Op,
+ const SDLoc &SL, EVT VT) {
+ if (VT.bitsLT(Op.getValueType()))
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
+
+ switch (ExtType) {
+ case ISD::SEXTLOAD:
+ return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
+ case ISD::ZEXTLOAD:
+ return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
+ case ISD::EXTLOAD:
+ return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
+ case ISD::NON_EXTLOAD:
return Op;
}
+
+ llvm_unreachable("invalid ext type");
+}
+
+SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ if (Ld->getAlignment() < 4 || Ld->isDivergent())
+ return SDValue();
+
+ // FIXME: Constant loads should all be marked invariant.
+ unsigned AS = Ld->getAddressSpace();
+ if (AS != AMDGPUASI.CONSTANT_ADDRESS &&
+ AS != AMDGPUASI.CONSTANT_ADDRESS_32BIT &&
+ (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
+ return SDValue();
+
+ // Don't do this early, since it may interfere with adjacent load merging for
+ // illegal types. We can avoid losing alignment information for exotic types
+ // pre-legalize.
+ EVT MemVT = Ld->getMemoryVT();
+ if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
+ MemVT.getSizeInBits() >= 32)
+ return SDValue();
+
+ SDLoc SL(Ld);
+
+ assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
+ "unexpected vector extload");
+
+ // TODO: Drop only high part of range.
+ SDValue Ptr = Ld->getBasePtr();
+ SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
+ MVT::i32, SL, Ld->getChain(), Ptr,
+ Ld->getOffset(),
+ Ld->getPointerInfo(), MVT::i32,
+ Ld->getAlignment(),
+ Ld->getMemOperand()->getFlags(),
+ Ld->getAAInfo(),
+ nullptr); // Drop ranges
+
+ EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
+ if (MemVT.isFloatingPoint()) {
+ assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
+ "unexpected fp extload");
+ TruncVT = MemVT.changeTypeToInteger();
+ }
+
+ SDValue Cvt = NewLoad;
+ if (Ld->getExtensionType() == ISD::SEXTLOAD) {
+ Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
+ DAG.getValueType(TruncVT));
+ } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
+ Ld->getExtensionType() == ISD::NON_EXTLOAD) {
+ Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
+ } else {
+ assert(Ld->getExtensionType() == ISD::EXTLOAD);
+ }
+
+ EVT VT = Ld->getValueType(0);
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+
+ DCI.AddToWorklist(Cvt.getNode());
+
+ // We may need to handle exotic cases, such as i16->i64 extloads, so insert
+ // the appropriate extension from the 32-bit load.
+ Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
+ DCI.AddToWorklist(Cvt.getNode());
+
+ // Handle conversion back to floating point if necessary.
+ Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
+
+ return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
}
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
@@ -4700,9 +5531,10 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
"Custom lowering for non-i32 vectors hasn't been implemented.");
+ unsigned Alignment = Load->getAlignment();
unsigned AS = Load->getAddressSpace();
if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
- AS, Load->getAlignment())) {
+ AS, Alignment)) {
SDValue Ops[2];
std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
return DAG.getMergeValues(Ops, DL);
@@ -4717,24 +5549,32 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
unsigned NumElements = MemVT.getVectorNumElements();
- if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
- if (isMemOpUniform(Load))
+
+ if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
+ AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) {
+ if (!Op->isDivergent() && Alignment >= 4)
return SDValue();
// Non-uniform loads will be selected to MUBUF instructions, so they
// have the same legalization requirements as global and private
// loads.
//
}
- if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS) {
- if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) &&
- !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load))
+
+ if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
+ AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT ||
+ AS == AMDGPUASI.GLOBAL_ADDRESS) {
+ if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
+ !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
+ Alignment >= 4)
return SDValue();
// Non-uniform loads will be selected to MUBUF instructions, so they
// have the same legalization requirements as global and private
// loads.
//
}
- if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS ||
+ if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
+ AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT ||
+ AS == AMDGPUASI.GLOBAL_ADDRESS ||
AS == AMDGPUASI.FLAT_ADDRESS) {
if (NumElements > 4)
return SplitVectorLoad(Op, DAG);
@@ -4761,21 +5601,20 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("unsupported private_element_size");
}
} else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
- if (NumElements > 2)
- return SplitVectorLoad(Op, DAG);
-
- if (NumElements == 2)
+ // Use ds_read_b128 if possible.
+ if (Subtarget->useDS128() && Load->getAlignment() >= 16 &&
+ MemVT.getStoreSize() == 16)
return SDValue();
- // If properly aligned, if we split we might be able to use ds_read_b64.
- return SplitVectorLoad(Op, DAG);
+ if (NumElements > 2)
+ return SplitVectorLoad(Op, DAG);
}
return SDValue();
}
SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
- if (Op.getValueType() != MVT::i64)
- return SDValue();
+ EVT VT = Op.getValueType();
+ assert(VT.getSizeInBits() == 64);
SDLoc DL(Op);
SDValue Cond = Op.getOperand(0);
@@ -4797,7 +5636,7 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
- return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
+ return DAG.getNode(ISD::BITCAST, DL, VT, Res);
}
// Catch division cases where we can use shortcuts with rcp and rsq
@@ -4809,8 +5648,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
SDValue RHS = Op.getOperand(1);
EVT VT = Op.getValueType();
const SDNodeFlags Flags = Op->getFlags();
- bool Unsafe = DAG.getTarget().Options.UnsafeFPMath ||
- Flags.hasUnsafeAlgebra() || Flags.hasAllowReciprocal();
+ bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal();
if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
return SDValue();
@@ -5067,7 +5905,7 @@ SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
SDValue Scale;
- if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
+ if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
// Work around a hardware bug on SI where the condition output from div_scale
// is not usable.
@@ -5165,14 +6003,14 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("unsupported private_element_size");
}
} else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
+ // Use ds_write_b128 if possible.
+ if (Subtarget->useDS128() && Store->getAlignment() >= 16 &&
+ VT.getStoreSize() == 16)
+ return SDValue();
+
if (NumElements > 2)
return SplitVectorStore(Op, DAG);
-
- if (NumElements == 2)
- return Op;
-
- // If properly aligned, if we split we might be able to use ds_write_b64.
- return SplitVectorStore(Op, DAG);
+ return SDValue();
} else {
llvm_unreachable("unhandled address space");
}
@@ -5246,7 +6084,7 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
// easier if i8 vectors weren't promoted to i32 vectors, particularly after
// types are legalized. v4i8 -> v4f32 is probably the only case to worry
// about in practice.
- if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) {
+ if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
DCI.AddToWorklist(Cvt.getNode());
@@ -5389,6 +6227,71 @@ static bool isBoolSGPR(SDValue V) {
return false;
}
+// If a constant has all zeroes or all ones within each byte, return it.
+// Otherwise return 0.
+static uint32_t getConstantPermuteMask(uint32_t C) {
+ // 0xff for any zero byte in the mask
+ uint32_t ZeroByteMask = 0;
+ if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
+ if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
+ if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
+ if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
+ uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
+ if ((NonZeroByteMask & C) != NonZeroByteMask)
+ return 0; // Partial bytes selected.
+ return C;
+}
+
+// Check if a node selects whole bytes from its operand 0 starting at a byte
+// boundary while masking the rest. Returns the select mask as used by
+// v_perm_b32, or all-ones (~0) if the node does not match.
+// Note byte select encoding:
+// value 0-3 selects corresponding source byte;
+// value 0xc selects zero;
+// value 0xff selects 0xff.
+static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) {
+ assert(V.getValueSizeInBits() == 32);
+
+ if (V.getNumOperands() != 2)
+ return ~0;
+
+ ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
+ if (!N1)
+ return ~0;
+
+ uint32_t C = N1->getZExtValue();
+
+ switch (V.getOpcode()) {
+ default:
+ break;
+ case ISD::AND:
+ if (uint32_t ConstMask = getConstantPermuteMask(C)) {
+ return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
+ }
+ break;
+
+ case ISD::OR:
+ if (uint32_t ConstMask = getConstantPermuteMask(C)) {
+ return (0x03020100 & ~ConstMask) | ConstMask;
+ }
+ break;
+
+ case ISD::SHL:
+ if (C % 8)
+ return ~0;
+
+ return uint32_t((0x030201000c0c0c0cull << C) >> 32);
+
+ case ISD::SRL:
+ if (C % 8)
+ return ~0;
+
+ return uint32_t(0x0c0c0c0c03020100ull >> C);
+ }
+
+ return ~0;
+}
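To make the encoding concrete, here is a small standalone check of what getPermuteMask computes for two hypothetical nodes, an i32 AND with 0x0000ffff and a left shift by 16 (the constants and the program are illustrative only):

    #include <cstdint>
    #include <cstdio>

    int main() {
      // and x, 0x0000ffff: bytes 1:0 come from x, bytes 3:2 are zero (0x0c).
      uint32_t AndSel = (0x03020100u & 0x0000ffffu) | (0x0c0c0c0cu & ~0x0000ffffu);
      // shl x, 16: bytes 3:2 come from x bytes 1:0, bytes 1:0 are zero.
      uint32_t ShlSel = uint32_t((0x030201000c0c0c0cull << 16) >> 32);
      std::printf("%08x %08x\n", AndSel, ShlSel); // prints 0c0c0100 01000c0c
      return 0;
    }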
+
SDValue SITargetLowering::performAndCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (DCI.isBeforeLegalize())
@@ -5435,6 +6338,20 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
}
}
}
+
+ // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
+ if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
+ isa<ConstantSDNode>(LHS.getOperand(2))) {
+ uint32_t Sel = getConstantPermuteMask(Mask);
+ if (!Sel)
+ return SDValue();
+
+ // Select 0xc for all zero bytes
+ Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
+ SDLoc DL(N);
+ return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
+ LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
+ }
}
// (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
@@ -5487,6 +6404,54 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
}
+ // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
+ N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
+ uint32_t LHSMask = getPermuteMask(DAG, LHS);
+ uint32_t RHSMask = getPermuteMask(DAG, RHS);
+ if (LHSMask != ~0u && RHSMask != ~0u) {
+ // Canonicalize the expression in an attempt to have fewer unique masks
+ // and therefore fewer registers used to hold the masks.
+ if (LHSMask > RHSMask) {
+ std::swap(LHSMask, RHSMask);
+ std::swap(LHS, RHS);
+ }
+
+ // Select 0xc for each lane used from the source operand. Zero has the 0xc
+ // mask set, 0xff has 0xff in the mask, and actual lanes are in the 0-3 range.
+ uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
+ uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
+
+ // Check if we need to combine values from two sources within a byte.
+ if (!(LHSUsedLanes & RHSUsedLanes) &&
+ // If we select the high and low words, keep it for SDWA.
+ // TODO: teach SDWA to work with v_perm_b32 and remove the check.
+ !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
+ // Each byte of each mask is either a selector in the 0-3 range or has the
+ // higher bits set, in which case it is 0xff (select 0xff) or 0x0c (select
+ // zero). If 0x0c appears in either mask for a byte, the result byte must be
+ // 0x0c; otherwise the mask byte that is not 0xff wins. ANDing both masks
+ // gives the correct result, except that bytes involving 0x0c must be
+ // corrected back to exactly 0x0c.
+ uint32_t Mask = LHSMask & RHSMask;
+ for (unsigned I = 0; I < 32; I += 8) {
+ uint32_t ByteSel = 0xff << I;
+ if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
+ Mask &= (0x0c << I) & 0xffffffff;
+ }
+
+ // Add 4 to each active LHS lane. It will not affect any existing 0xff
+ // or 0x0c.
+ uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
+ SDLoc DL(N);
+
+ return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
+ LHS.getOperand(0), RHS.getOperand(0),
+ DAG.getConstant(Sel, DL, MVT::i32));
+ }
+ }
+ }
+
return SDValue();
}
@@ -5522,6 +6487,60 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
return SDValue();
}
+ // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
+ if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
+ LHS.getOpcode() == AMDGPUISD::PERM &&
+ isa<ConstantSDNode>(LHS.getOperand(2))) {
+ uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
+ if (!Sel)
+ return SDValue();
+
+ Sel |= LHS.getConstantOperandVal(2);
+ SDLoc DL(N);
+ return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
+ LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
+ }
+
+ // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
+ N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
+ uint32_t LHSMask = getPermuteMask(DAG, LHS);
+ uint32_t RHSMask = getPermuteMask(DAG, RHS);
+ if (LHSMask != ~0u && RHSMask != ~0u) {
+ // Canonicalize the expression in an attempt to have fewer unique masks
+ // and therefore fewer registers used to hold the masks.
+ if (LHSMask > RHSMask) {
+ std::swap(LHSMask, RHSMask);
+ std::swap(LHS, RHS);
+ }
+
+ // Select 0xc for each lane used from the source operand. Zero has the 0xc
+ // mask set, 0xff has 0xff in the mask, and actual lanes are in the 0-3 range.
+ uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
+ uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
+
+ // Check if we need to combine values from two sources within a byte.
+ if (!(LHSUsedLanes & RHSUsedLanes) &&
+ // If we select the high and low words, keep it for SDWA.
+ // TODO: teach SDWA to work with v_perm_b32 and remove the check.
+ !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
+ // Kill zero bytes selected by the other mask. The zero value is 0xc.
+ LHSMask &= ~RHSUsedLanes;
+ RHSMask &= ~LHSUsedLanes;
+ // Add 4 to each active LHS lane
+ LHSMask |= LHSUsedLanes & 0x04040404;
+ // Combine masks
+ uint32_t Sel = LHSMask | RHSMask;
+ SDLoc DL(N);
+
+ return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
+ LHS.getOperand(0), RHS.getOperand(0),
+ DAG.getConstant(Sel, DL, MVT::i32));
+ }
+ }
+ }
+
if (VT != MVT::i64)
return SDValue();
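A concrete, hand-worked instance of the mask combination used by this OR combine and the analogous AND combine above (a standalone C++14 sketch, not part of the patch; it assumes the two masks have already passed the disjoint-lanes and SDWA checks):

#include <cstdint>

// Mirrors the mask arithmetic above: canonicalize, kill bytes owned by the
// other operand, tag LHS lanes with +4, then merge.
constexpr uint32_t combinePermMasks(uint32_t LHSMask, uint32_t RHSMask) {
  if (LHSMask > RHSMask) {
    uint32_t T = LHSMask; LHSMask = RHSMask; RHSMask = T;
  }
  uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0cu) & 0x0c0c0c0cu;
  uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0cu) & 0x0c0c0c0cu;
  LHSMask &= ~RHSUsedLanes;
  RHSMask &= ~LHSUsedLanes;
  LHSMask |= LHSUsedLanes & 0x04040404u;
  return LHSMask | RHSMask;
}

// (x & 0x00ff00ff) has permute mask 0x0c020c00 and (y & 0xff00ff00) has
// 0x030c010c (see getPermuteMask); their OR folds into one v_perm_b32 whose
// selector interleaves bytes from both sources.
static_assert(combinePermMasks(0x0c020c00u, 0x030c010cu) == 0x07020500u,
              "byte-interleaving OR folds to selector 0x07020500");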
@@ -5628,6 +6647,7 @@ static bool fp16SrcZerosHighBits(unsigned Opc) {
case AMDGPUISD::FMAD_FTZ:
case AMDGPUISD::RCP:
case AMDGPUISD::RSQ:
+ case AMDGPUISD::RCP_IFLAG:
case AMDGPUISD::LDEXP:
return true;
default:
@@ -5680,6 +6700,23 @@ SDValue SITargetLowering::performClassCombine(SDNode *N,
return SDValue();
}
+SDValue SITargetLowering::performRcpCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+
+ if (N0.isUndef())
+ return N0;
+
+ if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
+ N0.getOpcode() == ISD::SINT_TO_FP)) {
+ return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
+ N->getFlags());
+ }
+
+ return AMDGPUTargetLowering::performRcpCombine(N, DCI);
+}
+
static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
return true;
@@ -5688,7 +6725,7 @@ static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
}
static bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
- const SISubtarget *ST, unsigned MaxDepth=5) {
+ const GCNSubtarget *ST, unsigned MaxDepth=5) {
// If source is a result of another standard FP operation it is already in
// canonical form.
@@ -5946,7 +6983,7 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
- VT != MVT::f64 &&
+ !VT.isVector() && VT != MVT::f64 &&
((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) {
// max(max(a, b), c) -> max3(a, b, c)
// min(min(a, b), c) -> min3(a, b, c)
@@ -6066,15 +7103,87 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
SDValue SITargetLowering::performExtractVectorEltCombine(
SDNode *N, DAGCombinerInfo &DCI) const {
SDValue Vec = N->getOperand(0);
-
SelectionDAG &DAG = DCI.DAG;
- if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) {
+
+ EVT VecVT = Vec.getValueType();
+ EVT EltVT = VecVT.getVectorElementType();
+
+ if ((Vec.getOpcode() == ISD::FNEG ||
+ Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
SDLoc SL(N);
EVT EltVT = N->getValueType(0);
SDValue Idx = N->getOperand(1);
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
Vec.getOperand(0), Idx);
- return DAG.getNode(ISD::FNEG, SL, EltVT, Elt);
+ return DAG.getNode(Vec.getOpcode(), SL, EltVT, Elt);
+ }
+
+ // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
+ // =>
+ // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
+ // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
+ // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
+ if (Vec.hasOneUse() && DCI.isBeforeLegalize()) {
+ SDLoc SL(N);
+ EVT EltVT = N->getValueType(0);
+ SDValue Idx = N->getOperand(1);
+ unsigned Opc = Vec.getOpcode();
+
+ switch(Opc) {
+ default:
+ return SDValue();
+ // TODO: Support other binary operations.
+ case ISD::FADD:
+ case ISD::ADD:
+ case ISD::UMIN:
+ case ISD::UMAX:
+ case ISD::SMIN:
+ case ISD::SMAX:
+ case ISD::FMAXNUM:
+ case ISD::FMINNUM:
+ return DAG.getNode(Opc, SL, EltVT,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
+ Vec.getOperand(0), Idx),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
+ Vec.getOperand(1), Idx));
+ }
+ }
+
+ if (!DCI.isBeforeLegalize())
+ return SDValue();
+
+ unsigned VecSize = VecVT.getSizeInBits();
+ unsigned EltSize = EltVT.getSizeInBits();
+
+ // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
+ // elements. This exposes more load reduction opportunities by replacing
+ // multiple small extract_vector_elements with a single 32-bit extract.
+ auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (EltSize <= 16 &&
+ EltVT.isByteSized() &&
+ VecSize > 32 &&
+ VecSize % 32 == 0 &&
+ Idx) {
+ EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
+
+ unsigned BitIndex = Idx->getZExtValue() * EltSize;
+ unsigned EltIdx = BitIndex / 32;
+ unsigned LeftoverBitIdx = BitIndex % 32;
+ SDLoc SL(N);
+
+ SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
+ DCI.AddToWorklist(Cast.getNode());
+
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
+ DAG.getConstant(EltIdx, SL, MVT::i32));
+ DCI.AddToWorklist(Elt.getNode());
+ SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
+ DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
+ DCI.AddToWorklist(Srl.getNode());
+
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, EltVT.changeTypeToInteger(), Srl);
+ DCI.AddToWorklist(Trunc.getNode());
+ return DAG.getNode(ISD::BITCAST, SL, EltVT, Trunc);
}
return SDValue();
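A small standalone check of the index arithmetic used above (not part of the patch): extracting element 5 from a v8i16 value lands in the third 32-bit word of the bitcast, 16 bits up, so the combine emits (trunc i16 (srl (extract_vector_elt (bitcast x to v4i32), 2), 16)).

// BitIndex / EltIdx / LeftoverBitIdx for (extract_vector_elt v8i16:x, 5)
// with 16-bit elements, as computed by the combine above.
constexpr unsigned EltSize = 16;
constexpr unsigned BitIndex = 5 * EltSize;         // 80
constexpr unsigned EltIdx = BitIndex / 32;         // 2
constexpr unsigned LeftoverBitIdx = BitIndex % 32; // 16
static_assert(EltIdx == 2 && LeftoverBitIdx == 16, "element 5 of v8i16");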
@@ -6135,8 +7244,8 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
const TargetOptions &Options = DAG.getTarget().Options;
if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
- (N0->getFlags().hasUnsafeAlgebra() &&
- N1->getFlags().hasUnsafeAlgebra())) &&
+ (N0->getFlags().hasAllowContract() &&
+ N1->getFlags().hasAllowContract())) &&
isFMAFasterThanFMulAndFAdd(VT)) {
return ISD::FMA;
}
@@ -6192,7 +7301,7 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
return SDValue();
}
- if (VT != MVT::i32)
+ if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
return SDValue();
// add x, zext (setcc) => addcarry x, 0, setcc
@@ -6368,6 +7477,79 @@ SDValue SITargetLowering::performFSubCombine(SDNode *N,
return SDValue();
}
+SDValue SITargetLowering::performFMACombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+ SDLoc SL(N);
+
+ if (!Subtarget->hasDLInsts() || VT != MVT::f32)
+ return SDValue();
+
+ // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
+ // FDOT2((V2F16)S0, (V2F16)S1, (F32)z)
+ SDValue Op1 = N->getOperand(0);
+ SDValue Op2 = N->getOperand(1);
+ SDValue FMA = N->getOperand(2);
+
+ if (FMA.getOpcode() != ISD::FMA ||
+ Op1.getOpcode() != ISD::FP_EXTEND ||
+ Op2.getOpcode() != ISD::FP_EXTEND)
+ return SDValue();
+
+ // fdot2_f32_f16 always flushes fp32 denormal operands and the output to
+ // zero, regardless of the denorm mode setting. Therefore, unsafe-fp-math or
+ // fp-contract is sufficient to allow generating fdot2.
+ const TargetOptions &Options = DAG.getTarget().Options;
+ if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
+ (N->getFlags().hasAllowContract() &&
+ FMA->getFlags().hasAllowContract())) {
+ Op1 = Op1.getOperand(0);
+ Op2 = Op2.getOperand(0);
+ if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ SDValue Vec1 = Op1.getOperand(0);
+ SDValue Idx1 = Op1.getOperand(1);
+ SDValue Vec2 = Op2.getOperand(0);
+
+ SDValue FMAOp1 = FMA.getOperand(0);
+ SDValue FMAOp2 = FMA.getOperand(1);
+ SDValue FMAAcc = FMA.getOperand(2);
+
+ if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
+ FMAOp2.getOpcode() != ISD::FP_EXTEND)
+ return SDValue();
+
+ FMAOp1 = FMAOp1.getOperand(0);
+ FMAOp2 = FMAOp2.getOperand(0);
+ if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ SDValue Vec3 = FMAOp1.getOperand(0);
+ SDValue Vec4 = FMAOp2.getOperand(0);
+ SDValue Idx2 = FMAOp1.getOperand(1);
+
+ if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
+ // Idx1 and Idx2 cannot be the same.
+ Idx1 == Idx2)
+ return SDValue();
+
+ if (Vec1 == Vec2 || Vec3 == Vec4)
+ return SDValue();
+
+ if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
+ return SDValue();
+
+ if ((Vec1 == Vec3 && Vec2 == Vec4) ||
+ (Vec1 == Vec4 && Vec2 == Vec3))
+ return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc);
+ }
+ return SDValue();
+}
+
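In source terms the tree matched above is a two-element dot product accumulated into an f32. A standalone sketch, with plain floats standing in for the fpext'ed v2f16 lanes (ax/ay from one source, bx/by from the other):

#include <cmath>

// fma(ax, bx, fma(ay, by, acc)): with contraction allowed (or unsafe-fp-math)
// and a subtarget that has DL instructions, the combine rewrites this whole
// tree into a single AMDGPUISD::FDOT2 of the two v2f16 sources plus acc.
float dot2Shape(float ax, float ay, float bx, float by, float acc) {
  return std::fma(ax, bx, std::fma(ay, by, acc));
}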
SDValue SITargetLowering::performSetCCCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -6387,23 +7569,49 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
}
}
- if (CRHS && VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
- isBoolSGPR(LHS.getOperand(0))) {
- // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
- // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
- // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
- // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
- if ((CRHS->isAllOnesValue() &&
- (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
- (CRHS->isNullValue() &&
- (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
- return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
- DAG.getConstant(-1, SL, MVT::i1));
- if ((CRHS->isAllOnesValue() &&
- (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
- (CRHS->isNullValue() &&
- (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
- return LHS.getOperand(0);
+ if (CRHS) {
+ if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
+ isBoolSGPR(LHS.getOperand(0))) {
+ // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
+ // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
+ // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
+ // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
+ if ((CRHS->isAllOnesValue() &&
+ (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
+ (CRHS->isNullValue() &&
+ (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
+ return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
+ DAG.getConstant(-1, SL, MVT::i1));
+ if ((CRHS->isAllOnesValue() &&
+ (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
+ (CRHS->isNullValue() &&
+ (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
+ return LHS.getOperand(0);
+ }
+
+ uint64_t CRHSVal = CRHS->getZExtValue();
+ if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
+ LHS.getOpcode() == ISD::SELECT &&
+ isa<ConstantSDNode>(LHS.getOperand(1)) &&
+ isa<ConstantSDNode>(LHS.getOperand(2)) &&
+ LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
+ isBoolSGPR(LHS.getOperand(0))) {
+ // Given CT != CF:
+ // setcc (select cc, CT, CF), CF, eq => xor cc, -1
+ // setcc (select cc, CT, CF), CF, ne => cc
+ // setcc (select cc, CT, CF), CT, ne => xor cc, -1
+ // setcc (select cc, CT, CF), CT, eq => cc
+ uint64_t CT = LHS.getConstantOperandVal(1);
+ uint64_t CF = LHS.getConstantOperandVal(2);
+
+ if ((CF == CRHSVal && CC == ISD::SETEQ) ||
+ (CT == CRHSVal && CC == ISD::SETNE))
+ return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
+ DAG.getConstant(-1, SL, MVT::i1));
+ if ((CF == CRHSVal && CC == ISD::SETNE) ||
+ (CT == CRHSVal && CC == ISD::SETEQ))
+ return LHS.getOperand(0);
+ }
}
if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
@@ -6472,6 +7680,29 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
return SDValue();
}
+SDValue SITargetLowering::performClampCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
+ if (!CSrc)
+ return SDValue();
+
+ const APFloat &F = CSrc->getValueAPF();
+ APFloat Zero = APFloat::getZero(F.getSemantics());
+ APFloat::cmpResult Cmp0 = F.compare(Zero);
+ if (Cmp0 == APFloat::cmpLessThan ||
+ (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
+ return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
+ }
+
+ APFloat One(F.getSemantics(), "1.0");
+ APFloat::cmpResult Cmp1 = F.compare(One);
+ if (Cmp1 == APFloat::cmpGreaterThan)
+ return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
+
+ return SDValue(CSrc, 0);
+}
+
+
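A standalone sketch of the constant fold above, with plain float in place of APFloat and dx10Clamp standing in for Subtarget->enableDX10Clamp():

#include <cmath>

float foldClampConstant(float C, bool dx10Clamp) {
  // Below zero, or NaN when the DX10 clamp mode maps NaN to 0.
  if (C < 0.0f || (std::isnan(C) && dx10Clamp))
    return 0.0f;
  if (C > 1.0f)
    return 1.0f;
  // Already within [0, 1], or a NaN that must survive without DX10 clamp.
  return C;
}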
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
switch (N->getOpcode()) {
@@ -6503,7 +7734,13 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performMinMaxCombine(N, DCI);
break;
}
- case ISD::LOAD:
+ case ISD::FMA:
+ return performFMACombine(N, DCI);
+ case ISD::LOAD: {
+ if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
+ return Widened;
+ LLVM_FALLTHROUGH;
+ }
case ISD::STORE:
case ISD::ATOMIC_LOAD:
case ISD::ATOMIC_STORE:
@@ -6521,7 +7758,10 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case ISD::ATOMIC_LOAD_UMIN:
case ISD::ATOMIC_LOAD_UMAX:
case AMDGPUISD::ATOMIC_INC:
- case AMDGPUISD::ATOMIC_DEC: // TODO: Target mem intrinsics.
+ case AMDGPUISD::ATOMIC_DEC:
+ case AMDGPUISD::ATOMIC_LOAD_FADD:
+ case AMDGPUISD::ATOMIC_LOAD_FMIN:
+ case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics.
if (DCI.isBeforeLegalize())
break;
return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
@@ -6537,11 +7777,13 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performClassCombine(N, DCI);
case ISD::FCANONICALIZE:
return performFCanonicalizeCombine(N, DCI);
- case AMDGPUISD::FRACT:
case AMDGPUISD::RCP:
+ return performRcpCombine(N, DCI);
+ case AMDGPUISD::FRACT:
case AMDGPUISD::RSQ:
case AMDGPUISD::RCP_LEGACY:
case AMDGPUISD::RSQ_LEGACY:
+ case AMDGPUISD::RCP_IFLAG:
case AMDGPUISD::RSQ_CLAMP:
case AMDGPUISD::LDEXP: {
SDValue Src = N->getOperand(0);
@@ -6561,6 +7803,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performFMed3Combine(N, DCI);
case AMDGPUISD::CVT_PKRTZ_F16_F32:
return performCvtPkRTZCombine(N, DCI);
+ case AMDGPUISD::CLAMP:
+ return performClampCombine(N, DCI);
case ISD::SCALAR_TO_VECTOR: {
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
@@ -6587,7 +7831,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
-/// \brief Helper function for adjustWritemask
+/// Helper function for adjustWritemask
static unsigned SubIdx2Lane(unsigned Idx) {
switch (Idx) {
default: return 0;
@@ -6598,12 +7842,19 @@ static unsigned SubIdx2Lane(unsigned Idx) {
}
}
-/// \brief Adjust the writemask of MIMG instructions
+/// Adjust the writemask of MIMG instructions
SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
SelectionDAG &DAG) const {
+ unsigned Opcode = Node->getMachineOpcode();
+
+ // Subtract 1 because the vdata output is not a MachineSDNode operand.
+ int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
+ if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
+ return Node; // not implemented for D16
+
SDNode *Users[4] = { nullptr };
unsigned Lane = 0;
- unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3;
+ unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
unsigned NewDmask = 0;
bool HasChain = Node->getNumValues() > 1;
@@ -6653,9 +7904,7 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
unsigned BitsSet = countPopulation(NewDmask);
- const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
- int NewOpcode = AMDGPU::getMaskedMIMGOp(*TII,
- Node->getMachineOpcode(), BitsSet);
+ int NewOpcode = AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), BitsSet);
assert(NewOpcode != -1 &&
NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
"failed to find equivalent MIMG op");
@@ -6720,7 +7969,7 @@ static bool isFrameIndexOp(SDValue Op) {
return isa<FrameIndexSDNode>(Op);
}
-/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
+/// Legalize target independent instructions (e.g. INSERT_SUBREG)
/// with frame index operands.
/// LLVM assumes that inputs to these instructions are registers.
SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
@@ -6767,7 +8016,7 @@ SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
return DAG.UpdateNodeOperands(Node, Ops);
}
-/// \brief Fold the instructions after selecting them.
+/// Fold the instructions after selecting them.
/// Returns null if users were already updated.
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
SelectionDAG &DAG) const {
@@ -6841,7 +8090,7 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
return Node;
}
-/// \brief Assign the register class depending on the number of
+/// Assign the register class depending on the number of
/// bits set in the writemask
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const {
@@ -6928,7 +8177,7 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
}
-/// \brief Return a resource descriptor with the 'Add TID' bit enabled
+/// Return a resource descriptor with the 'Add TID' bit enabled
/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
/// of the resource descriptor) to create an offset, which is added to
/// the resource pointer.
@@ -6970,11 +8219,11 @@ std::pair<unsigned, const TargetRegisterClass *>
SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
MVT VT) const {
- if (!isTypeLegal(VT))
- return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
-
+ const TargetRegisterClass *RC = nullptr;
if (Constraint.size() == 1) {
switch (Constraint[0]) {
+ default:
+ return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
case 's':
case 'r':
switch (VT.getSizeInBits()) {
@@ -6982,40 +8231,56 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, nullptr);
case 32:
case 16:
- return std::make_pair(0U, &AMDGPU::SReg_32_XM0RegClass);
+ RC = &AMDGPU::SReg_32_XM0RegClass;
+ break;
case 64:
- return std::make_pair(0U, &AMDGPU::SGPR_64RegClass);
+ RC = &AMDGPU::SGPR_64RegClass;
+ break;
case 128:
- return std::make_pair(0U, &AMDGPU::SReg_128RegClass);
+ RC = &AMDGPU::SReg_128RegClass;
+ break;
case 256:
- return std::make_pair(0U, &AMDGPU::SReg_256RegClass);
+ RC = &AMDGPU::SReg_256RegClass;
+ break;
case 512:
- return std::make_pair(0U, &AMDGPU::SReg_512RegClass);
+ RC = &AMDGPU::SReg_512RegClass;
+ break;
}
-
+ break;
case 'v':
switch (VT.getSizeInBits()) {
default:
return std::make_pair(0U, nullptr);
case 32:
case 16:
- return std::make_pair(0U, &AMDGPU::VGPR_32RegClass);
+ RC = &AMDGPU::VGPR_32RegClass;
+ break;
case 64:
- return std::make_pair(0U, &AMDGPU::VReg_64RegClass);
+ RC = &AMDGPU::VReg_64RegClass;
+ break;
case 96:
- return std::make_pair(0U, &AMDGPU::VReg_96RegClass);
+ RC = &AMDGPU::VReg_96RegClass;
+ break;
case 128:
- return std::make_pair(0U, &AMDGPU::VReg_128RegClass);
+ RC = &AMDGPU::VReg_128RegClass;
+ break;
case 256:
- return std::make_pair(0U, &AMDGPU::VReg_256RegClass);
+ RC = &AMDGPU::VReg_256RegClass;
+ break;
case 512:
- return std::make_pair(0U, &AMDGPU::VReg_512RegClass);
+ RC = &AMDGPU::VReg_512RegClass;
+ break;
}
+ break;
}
+ // We actually support i128, i16 and f16 as inline parameters
+ // even if they are not reported as legal
+ if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
+ VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
+ return std::make_pair(0U, RC);
}
if (Constraint.size() > 1) {
- const TargetRegisterClass *RC = nullptr;
if (Constraint[1] == 'v') {
RC = &AMDGPU::VGPR_32RegClass;
} else if (Constraint[1] == 's') {
@@ -7052,8 +8317,7 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
MachineRegisterInfo &MRI = MF.getRegInfo();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
const MachineFrameInfo &MFI = MF.getFrameInfo();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
if (Info->isEntryFunction()) {
// Callable functions have fixed registers used for stack access.
@@ -7083,6 +8347,8 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
Info->getScratchWaveOffsetReg());
+ Info->limitOccupancy(MF);
+
TargetLoweringBase::finalizeLowering(MF);
}
@@ -7103,3 +8369,69 @@ void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
// calculation won't overflow, so assume the sign bit is never set.
Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
}
+
+bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
+ FunctionLoweringInfo * FLI, DivergenceAnalysis * DA) const
+{
+ switch (N->getOpcode()) {
+ case ISD::Register:
+ case ISD::CopyFromReg:
+ {
+ const RegisterSDNode *R = nullptr;
+ if (N->getOpcode() == ISD::Register) {
+ R = dyn_cast<RegisterSDNode>(N);
+ }
+ else {
+ R = dyn_cast<RegisterSDNode>(N->getOperand(1));
+ }
+ if (R)
+ {
+ const MachineFunction * MF = FLI->MF;
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
+ unsigned Reg = R->getReg();
+ if (TRI.isPhysicalRegister(Reg))
+ return TRI.isVGPR(MRI, Reg);
+
+ if (MRI.isLiveIn(Reg)) {
+ // workitem.id.x workitem.id.y workitem.id.z
+ // Any VGPR formal argument is also considered divergent
+ if (TRI.isVGPR(MRI, Reg))
+ return true;
+ // Formal arguments of non-entry functions
+ // are conservatively considered divergent
+ else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
+ return true;
+ }
+ return !DA || DA->isDivergent(FLI->getValueFromVirtualReg(Reg));
+ }
+ }
+ break;
+ case ISD::LOAD: {
+ const LoadSDNode *L = dyn_cast<LoadSDNode>(N);
+ if (L->getMemOperand()->getAddrSpace() ==
+ Subtarget->getAMDGPUAS().PRIVATE_ADDRESS)
+ return true;
+ } break;
+ case ISD::CALLSEQ_END:
+ return true;
+ break;
+ case ISD::INTRINSIC_WO_CHAIN:
+ {
+
+ }
+ return AMDGPU::isIntrinsicSourceOfDivergence(
+ cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
+ case ISD::INTRINSIC_W_CHAIN:
+ return AMDGPU::isIntrinsicSourceOfDivergence(
+ cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
+ // In some cases intrinsics that are a source of divergence have been
+ // lowered to AMDGPUISD so we also need to check those too.
+ case AMDGPUISD::INTERP_MOV:
+ case AMDGPUISD::INTERP_P1:
+ case AMDGPUISD::INTERP_P2:
+ return true;
+ }
+ return false;
+}
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index b48e67f7563a..ad049f2a71c3 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief SI DAG Lowering interface definition
+/// SI DAG Lowering interface definition
//
//===----------------------------------------------------------------------===//
@@ -22,12 +22,15 @@
namespace llvm {
class SITargetLowering final : public AMDGPUTargetLowering {
+private:
+ const GCNSubtarget *Subtarget;
+
SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL,
SDValue Chain, uint64_t Offset) const;
SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const;
SDValue lowerKernargMemParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
const SDLoc &SL, SDValue Chain,
- uint64_t Offset, bool Signed,
+ uint64_t Offset, unsigned Align, bool Signed,
const ISD::InputArg *Arg = nullptr) const;
SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
@@ -42,10 +45,14 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SelectionDAG &DAG) const override;
SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
MVT VT, unsigned Offset) const;
+ SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr,
+ SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
@@ -60,7 +67,13 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
- /// \brief Converts \p Op, which must be of floating point type, to the
+ SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M,
+ SelectionDAG &DAG,
+ bool IsIntrinsic = false) const;
+
+ SDValue handleD16VData(SDValue VData, SelectionDAG &DAG) const;
+
+ /// Converts \p Op, which must be of floating point type, to the
/// floating point type \p VT, by either extending or truncating it.
SDValue getFPExtOrFPTrunc(SelectionDAG &DAG,
SDValue Op,
@@ -71,7 +84,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Val,
bool Signed, const ISD::InputArg *Arg = nullptr) const;
- /// \brief Custom lowering for ISD::FP_ROUND for MVT::f16.
+ /// Custom lowering for ISD::FP_ROUND for MVT::f16.
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue getSegmentAperture(unsigned AS, const SDLoc &DL,
@@ -80,7 +93,9 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const;
SDNode *adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
@@ -121,8 +136,11 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performFMACombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
bool isLegalFlatAddressingMode(const AddrMode &AM) const;
bool isLegalGlobalAddressingMode(const AddrMode &AM) const;
@@ -145,9 +163,11 @@ class SITargetLowering final : public AMDGPUTargetLowering {
bool shouldEmitPCReloc(const GlobalValue *GV) const;
public:
- SITargetLowering(const TargetMachine &tm, const SISubtarget &STI);
+ SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI);
- const SISubtarget *getSubtarget() const;
+ const GCNSubtarget *getSubtarget() const;
+
+ bool isFPExtFoldable(unsigned Opcode, EVT DestVT, EVT SrcVT) const override;
bool isShuffleMaskLegal(ArrayRef<int> /*Mask*/, EVT /*VT*/) const override;
@@ -255,7 +275,10 @@ public:
EVT VT) const override;
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override;
bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+ SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
+ SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
@@ -284,6 +307,9 @@ public:
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
+
+ bool isSDNodeSourceOfDivergence(const SDNode *N,
+ FunctionLoweringInfo *FLI, DivergenceAnalysis *DA) const override;
};
} // End namespace llvm
diff --git a/lib/Target/AMDGPU/SIInsertSkips.cpp b/lib/Target/AMDGPU/SIInsertSkips.cpp
index a2f844d7854e..61c8f359e168 100644
--- a/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief This pass inserts branches on the 0 exec mask over divergent branches
+/// This pass inserts branches on the 0 exec mask over divergent branches
/// branches when it's expected that jumping over the untaken control flow will
/// be cheaper than having every workitem no-op through it.
//
@@ -18,6 +18,7 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -210,65 +211,73 @@ void SIInsertSkips::kill(MachineInstr &MI) {
switch (MI.getOperand(2).getImm()) {
case ISD::SETOEQ:
case ISD::SETEQ:
- Opcode = AMDGPU::V_CMPX_EQ_F32_e32;
+ Opcode = AMDGPU::V_CMPX_EQ_F32_e64;
break;
case ISD::SETOGT:
case ISD::SETGT:
- Opcode = AMDGPU::V_CMPX_LT_F32_e32;
+ Opcode = AMDGPU::V_CMPX_LT_F32_e64;
break;
case ISD::SETOGE:
case ISD::SETGE:
- Opcode = AMDGPU::V_CMPX_LE_F32_e32;
+ Opcode = AMDGPU::V_CMPX_LE_F32_e64;
break;
case ISD::SETOLT:
case ISD::SETLT:
- Opcode = AMDGPU::V_CMPX_GT_F32_e32;
+ Opcode = AMDGPU::V_CMPX_GT_F32_e64;
break;
case ISD::SETOLE:
case ISD::SETLE:
- Opcode = AMDGPU::V_CMPX_GE_F32_e32;
+ Opcode = AMDGPU::V_CMPX_GE_F32_e64;
break;
case ISD::SETONE:
case ISD::SETNE:
- Opcode = AMDGPU::V_CMPX_LG_F32_e32;
+ Opcode = AMDGPU::V_CMPX_LG_F32_e64;
break;
case ISD::SETO:
- Opcode = AMDGPU::V_CMPX_O_F32_e32;
+ Opcode = AMDGPU::V_CMPX_O_F32_e64;
break;
case ISD::SETUO:
- Opcode = AMDGPU::V_CMPX_U_F32_e32;
+ Opcode = AMDGPU::V_CMPX_U_F32_e64;
break;
case ISD::SETUEQ:
- Opcode = AMDGPU::V_CMPX_NLG_F32_e32;
+ Opcode = AMDGPU::V_CMPX_NLG_F32_e64;
break;
case ISD::SETUGT:
- Opcode = AMDGPU::V_CMPX_NGE_F32_e32;
+ Opcode = AMDGPU::V_CMPX_NGE_F32_e64;
break;
case ISD::SETUGE:
- Opcode = AMDGPU::V_CMPX_NGT_F32_e32;
+ Opcode = AMDGPU::V_CMPX_NGT_F32_e64;
break;
case ISD::SETULT:
- Opcode = AMDGPU::V_CMPX_NLE_F32_e32;
+ Opcode = AMDGPU::V_CMPX_NLE_F32_e64;
break;
case ISD::SETULE:
- Opcode = AMDGPU::V_CMPX_NLT_F32_e32;
+ Opcode = AMDGPU::V_CMPX_NLT_F32_e64;
break;
case ISD::SETUNE:
- Opcode = AMDGPU::V_CMPX_NEQ_F32_e32;
+ Opcode = AMDGPU::V_CMPX_NEQ_F32_e64;
break;
default:
llvm_unreachable("invalid ISD:SET cond code");
}
- // TODO: Allow this:
- if (!MI.getOperand(0).isReg() ||
- !TRI->isVGPR(MBB.getParent()->getRegInfo(),
- MI.getOperand(0).getReg()))
- llvm_unreachable("SI_KILL operand should be a VGPR");
-
- BuildMI(MBB, &MI, DL, TII->get(Opcode))
- .add(MI.getOperand(1))
- .add(MI.getOperand(0));
+ assert(MI.getOperand(0).isReg());
+
+ if (TRI->isVGPR(MBB.getParent()->getRegInfo(),
+ MI.getOperand(0).getReg())) {
+ Opcode = AMDGPU::getVOPe32(Opcode);
+ BuildMI(MBB, &MI, DL, TII->get(Opcode))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(0));
+ } else {
+ BuildMI(MBB, &MI, DL, TII->get(Opcode))
+ .addReg(AMDGPU::VCC, RegState::Define)
+ .addImm(0) // src0 modifiers
+ .add(MI.getOperand(1))
+ .addImm(0) // src1 modifiers
+ .add(MI.getOperand(0))
+ .addImm(0); // omod
+ }
break;
}
case AMDGPU::SI_KILL_I1_TERMINATOR: {
@@ -330,7 +339,7 @@ bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
}
bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
SkipThreshold = SkipThresholdFlag;
diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 6bbe5979316d..d456e3d9b94d 100644
--- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Insert wait instructions for memory reads and writes.
+/// Insert wait instructions for memory reads and writes.
///
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
@@ -40,6 +40,7 @@
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
@@ -50,9 +51,21 @@
#include <utility>
#include <vector>
+using namespace llvm;
+
#define DEBUG_TYPE "si-insert-waitcnts"
-using namespace llvm;
+DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
+ "Force emit s_waitcnt expcnt(0) instrs");
+DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
+ "Force emit s_waitcnt lgkmcnt(0) instrs");
+DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
+ "Force emit s_waitcnt vmcnt(0) instrs");
+
+static cl::opt<unsigned> ForceEmitZeroFlag(
+ "amdgpu-waitcnt-forcezero",
+ cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
+ cl::init(0), cl::Hidden);
namespace {
@@ -115,15 +128,15 @@ enum RegisterMapping {
(w) = (enum WaitEventType)((w) + 1))
// This is a per-basic-block object that maintains current score brackets
-// of each wait-counter, and a per-register scoreboard for each wait-couner.
+// of each wait counter, and a per-register scoreboard for each wait counter.
// We also maintain the latest score for every event type that can change the
// waitcnt in order to know if there are multiple types of events within
// the brackets. When multiple types of event happen in the bracket,
-// wait-count may get decreased out of order, therefore we need to put in
+// wait count may get decreased out of order, therefore we need to put in
// "s_waitcnt 0" before use.
class BlockWaitcntBrackets {
public:
- BlockWaitcntBrackets() {
+ BlockWaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
T = (enum InstCounterType)(T + 1)) {
memset(VgprScores[T], 0, sizeof(VgprScores[T]));
@@ -301,6 +314,7 @@ public:
void dump() { print(dbgs()); }
private:
+ const GCNSubtarget *ST = nullptr;
bool WaitAtBeginning = false;
bool RevisitLoop = false;
bool MixedExpTypes = false;
@@ -332,14 +346,12 @@ public:
void incIterCnt() { IterCnt++; }
void resetIterCnt() { IterCnt = 0; }
- int32_t getIterCnt() { return IterCnt; }
+ unsigned getIterCnt() { return IterCnt; }
void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
MachineInstr *getWaitcnt() const { return LfWaitcnt; }
- void print() {
- DEBUG(dbgs() << " iteration " << IterCnt << '\n';);
- }
+ void print() { LLVM_DEBUG(dbgs() << " iteration " << IterCnt << '\n';); }
private:
// s_waitcnt added at the end of the loop footer to stabilize wait scores
@@ -352,7 +364,7 @@ private:
class SIInsertWaitcnts : public MachineFunctionPass {
private:
- const SISubtarget *ST = nullptr;
+ const GCNSubtarget *ST = nullptr;
const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI = nullptr;
const MachineRegisterInfo *MRI = nullptr;
@@ -361,22 +373,31 @@ private:
AMDGPUAS AMDGPUASI;
DenseSet<MachineBasicBlock *> BlockVisitedSet;
- DenseSet<MachineInstr *> CompilerGeneratedWaitcntSet;
+ DenseSet<MachineInstr *> TrackedWaitcntSet;
DenseSet<MachineInstr *> VCCZBugHandledSet;
DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
BlockWaitcntBracketsMap;
- DenseSet<MachineBasicBlock *> BlockWaitcntProcessedSet;
+ std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;
DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;
std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;
+ // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
+ // because of amdgpu-waitcnt-forcezero flag
+ bool ForceEmitZeroWaitcnts;
+ bool ForceEmitWaitcnt[NUM_INST_CNTS];
+
public:
static char ID;
- SIInsertWaitcnts() : MachineFunctionPass(ID) {}
+ SIInsertWaitcnts() : MachineFunctionPass(ID) {
+ (void)ForceExpCounter;
+ (void)ForceLgkmCounter;
+ (void)ForceVMCounter;
+ }
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -397,15 +418,53 @@ public:
llvm::make_unique<BlockWaitcntBrackets>(*Bracket));
}
+ bool isForceEmitWaitcnt() const {
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1))
+ if (ForceEmitWaitcnt[T])
+ return true;
+ return false;
+ }
+
+ void setForceEmitWaitcnt() {
+// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
+// For debug builds, get the debug counter info and adjust if need be
+#ifndef NDEBUG
+ if (DebugCounter::isCounterSet(ForceExpCounter) &&
+ DebugCounter::shouldExecute(ForceExpCounter)) {
+ ForceEmitWaitcnt[EXP_CNT] = true;
+ } else {
+ ForceEmitWaitcnt[EXP_CNT] = false;
+ }
+
+ if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
+ DebugCounter::shouldExecute(ForceLgkmCounter)) {
+ ForceEmitWaitcnt[LGKM_CNT] = true;
+ } else {
+ ForceEmitWaitcnt[LGKM_CNT] = false;
+ }
+
+ if (DebugCounter::isCounterSet(ForceVMCounter) &&
+ DebugCounter::shouldExecute(ForceVMCounter)) {
+ ForceEmitWaitcnt[VM_CNT] = true;
+ } else {
+ ForceEmitWaitcnt[VM_CNT] = false;
+ }
+#endif // NDEBUG
+ }
+
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
- MachineInstr *generateSWaitCntInstBefore(MachineInstr &MI,
- BlockWaitcntBrackets *ScoreBrackets);
- void updateEventWaitCntAfter(MachineInstr &Inst,
+ void generateWaitcntInstBefore(MachineInstr &MI,
+ BlockWaitcntBrackets *ScoreBrackets);
+ void updateEventWaitcntAfter(MachineInstr &Inst,
BlockWaitcntBrackets *ScoreBrackets);
void mergeInputScoreBrackets(MachineBasicBlock &Block);
- MachineBasicBlock *loopBottom(const MachineLoop *Loop);
+ bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
+ unsigned countNumBottomBlocks(const MachineLoop *Loop);
void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
+ bool isWaitcntStronger(unsigned LHS, unsigned RHS);
+ unsigned combineWaitcnt(unsigned LHS, unsigned RHS);
};
} // end anonymous namespace
@@ -459,7 +518,7 @@ void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
const MachineRegisterInfo *MRI,
unsigned OpNo, int32_t Val) {
RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
- DEBUG({
+ LLVM_DEBUG({
const MachineOperand &Opnd = MI->getOperand(OpNo);
assert(TRI->isVGPR(*MRI, Opnd.getReg()));
});
@@ -681,14 +740,17 @@ unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
const int32_t LB = getScoreLB(T);
const int32_t UB = getScoreUB(T);
if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
- if (T == VM_CNT && hasPendingFlat()) {
- // If there is a pending FLAT operation, and this is a VM waitcnt,
- // then we need to force a waitcnt 0 for VM.
+ if ((T == VM_CNT || T == LGKM_CNT) &&
+ hasPendingFlat() &&
+ !ST->hasFlatLgkmVMemCountInOrder()) {
+ // If there is a pending FLAT operation, and this is a VMem or LGKM
+ // waitcnt and the target can report early completion, then we need
+ // to force a waitcnt 0.
NeedWait = CNT_MASK(T);
setScoreLB(T, getScoreUB(T));
} else if (counterOutOfOrder(T)) {
// Counter can get decremented out-of-order when there
- // are multiple types event in the brack. Also emit an s_wait counter
+ // are multiple event types in the bracket. Also emit an s_wait counter
// with a conservative value of 0 for the counter.
NeedWait = CNT_MASK(T);
setScoreLB(T, getScoreUB(T));
@@ -789,7 +851,30 @@ static bool readsVCCZ(const MachineInstr &MI) {
!MI.getOperand(1).isUndef();
}
-/// \brief Generate s_waitcnt instruction to be placed before cur_Inst.
+/// Given two wait count encodings, check whether LHS is stronger than RHS.
+bool SIInsertWaitcnts::isWaitcntStronger(unsigned LHS, unsigned RHS) {
+ if (AMDGPU::decodeVmcnt(IV, LHS) > AMDGPU::decodeVmcnt(IV, RHS))
+ return false;
+ if (AMDGPU::decodeLgkmcnt(IV, LHS) > AMDGPU::decodeLgkmcnt(IV, RHS))
+ return false;
+ if (AMDGPU::decodeExpcnt(IV, LHS) > AMDGPU::decodeExpcnt(IV, RHS))
+ return false;
+ return true;
+}
+
+/// Given two wait count encodings, create a new encoding that is stronger
+/// than or equal to both.
+unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) {
+ unsigned VmCnt = std::min(AMDGPU::decodeVmcnt(IV, LHS),
+ AMDGPU::decodeVmcnt(IV, RHS));
+ unsigned LgkmCnt = std::min(AMDGPU::decodeLgkmcnt(IV, LHS),
+ AMDGPU::decodeLgkmcnt(IV, RHS));
+ unsigned ExpCnt = std::min(AMDGPU::decodeExpcnt(IV, LHS),
+ AMDGPU::decodeExpcnt(IV, RHS));
+ return AMDGPU::encodeWaitcnt(IV, VmCnt, ExpCnt, LgkmCnt);
+}
+
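A standalone sketch of the two helpers above, using a simple struct instead of the packed s_waitcnt immediate (the real code round-trips through AMDGPU::encodeWaitcnt and the decodeVmcnt/decodeExpcnt/decodeLgkmcnt helpers):

#include <algorithm>

struct Waitcnt { unsigned VmCnt, ExpCnt, LgkmCnt; };

// "Stronger" means it waits at least as hard: every field is <= the other's.
bool isStronger(const Waitcnt &LHS, const Waitcnt &RHS) {
  return LHS.VmCnt <= RHS.VmCnt && LHS.ExpCnt <= RHS.ExpCnt &&
         LHS.LgkmCnt <= RHS.LgkmCnt;
}

// Combining takes the minimum of each counter, so the result is stronger
// than or equal to both inputs.
Waitcnt combine(const Waitcnt &LHS, const Waitcnt &RHS) {
  return {std::min(LHS.VmCnt, RHS.VmCnt), std::min(LHS.ExpCnt, RHS.ExpCnt),
          std::min(LHS.LgkmCnt, RHS.LgkmCnt)};
}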
+/// Generate s_waitcnt instruction to be placed before cur_Inst.
/// Instructions of a given type are returned in order,
/// but instructions of different types can complete out of order.
/// We rely on this in-order completion
@@ -799,23 +884,29 @@ static bool readsVCCZ(const MachineInstr &MI) {
/// and if so what the value of each counter is.
/// The "score bracket" is bound by the lower bound and upper bound
/// scores (*_score_LB and *_score_ub respectively).
-MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
+void SIInsertWaitcnts::generateWaitcntInstBefore(
MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
// To emit, or not to emit - that's the question!
// Start with an assumption that there is no need to emit.
- unsigned int EmitSwaitcnt = 0;
- // s_waitcnt instruction to return; default is NULL.
- MachineInstr *SWaitInst = nullptr;
+ unsigned int EmitWaitcnt = 0;
+
// No need to wait before a phi. If a phi-move exists, then the wait should
// have been inserted before the move. If a phi-move does not exist, then the
// wait should be inserted before the real use. The same is true for
// sc-merge. It is not a coincidence that all these cases correspond to the
// instructions that are skipped in the assembling loop.
bool NeedLineMapping = false; // TODO: Check on this.
- if (MI.isDebugValue() &&
+
+ // ForceEmitZeroWaitcnt: force a single s_waitcnt 0 due to hw bug
+ bool ForceEmitZeroWaitcnt = false;
+
+ setForceEmitWaitcnt();
+ bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
+
+ if (MI.isDebugInstr() &&
// TODO: any other opcode?
!NeedLineMapping) {
- return SWaitInst;
+ return;
}
// See if an s_waitcnt is forced at block entry, or is needed at
@@ -826,7 +917,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
ScoreBrackets->clearWaitAtBeginning();
for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
T = (enum InstCounterType)(T + 1)) {
- EmitSwaitcnt |= CNT_MASK(T);
+ EmitWaitcnt |= CNT_MASK(T);
ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
}
}
@@ -836,21 +927,20 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
- EmitSwaitcnt |=
+ EmitWaitcnt |=
ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
}
// All waits must be resolved at call return.
// NOTE: this could be improved with knowledge of all call sites or
// with knowledge of the called routines.
- if (MI.getOpcode() == AMDGPU::RETURN ||
- MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
+ if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
T = (enum InstCounterType)(T + 1)) {
if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
- EmitSwaitcnt |= CNT_MASK(T);
+ EmitWaitcnt |= CNT_MASK(T);
}
}
}
@@ -861,7 +951,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
AMDGPU::SendMsg::ID_GS_DONE)) {
if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
- EmitSwaitcnt |= CNT_MASK(VM_CNT);
+ EmitWaitcnt |= CNT_MASK(VM_CNT);
}
}
#if 0 // TODO: the following blocks of logic when we have fence.
@@ -879,11 +969,11 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
case SCMEM_LDS:
if (group_is_multi_wave ||
context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
- EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
+ EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
ScoreBrackets->getScoreUB(LGKM_CNT));
// LDS may have to wait for VM_CNT after buffer load to LDS
if (target_info->HasBufferLoadToLDS()) {
- EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
+ EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
ScoreBrackets->getScoreUB(VM_CNT));
}
}
@@ -891,9 +981,9 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
case SCMEM_GDS:
if (group_is_multi_wave || fence_is_global) {
- EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
+ EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
ScoreBrackets->getScoreUB(EXP_CNT));
- EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
+ EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
ScoreBrackets->getScoreUB(LGKM_CNT));
}
break;
@@ -903,9 +993,9 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
case SCMEM_RING:
case SCMEM_SCATTER:
if (group_is_multi_wave || fence_is_global) {
- EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
+ EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
ScoreBrackets->getScoreUB(EXP_CNT));
- EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
+ EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
ScoreBrackets->getScoreUB(VM_CNT));
}
break;
@@ -926,13 +1016,13 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
// Export and GDS are tracked individually, either may trigger a waitcnt
// for EXEC.
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK));
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS));
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS));
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK));
}
@@ -947,7 +1037,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
if (ScoreBrackets->getScoreUB(EXP_CNT) >
ScoreBrackets->getScoreLB(EXP_CNT)) {
ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
- EmitSwaitcnt |= CNT_MASK(EXP_CNT);
+ EmitWaitcnt |= CNT_MASK(EXP_CNT);
}
}
#endif
@@ -965,7 +1055,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
continue;
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
// VM_CNT is only relevant to vgpr or LDS.
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
}
@@ -977,10 +1067,10 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
if (TRI->isVGPR(MRIA, Op.getReg())) {
// VM_CNT is only relevant to vgpr or LDS.
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
}
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
}
}
@@ -999,9 +1089,9 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
if (AS != AMDGPUASI.LOCAL_ADDRESS)
continue;
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
}
}
@@ -1012,38 +1102,35 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
if (TRI->isVGPR(MRIA, Def.getReg())) {
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
}
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
}
} // End of for loop that looks at all dest operands.
}
- // TODO: Tie force zero to a compiler triage option.
- bool ForceZero = false;
-
// Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
// occurs before the instruction. Doing it here prevents any additional
// S_WAITCNTs from being emitted if the instruction was marked as
// requiring a WAITCNT beforehand.
if (MI.getOpcode() == AMDGPU::S_BARRIER &&
!ST->hasAutoWaitcntBeforeBarrier()) {
- EmitSwaitcnt |=
+ EmitWaitcnt |=
ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
}
// TODO: Remove this work-around, enable the assert for Bug 457939
// after fixing the scheduler. Also, the Shader Compiler code is
// independent of target.
- if (readsVCCZ(MI) && ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
+ if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
if (ScoreBrackets->getScoreLB(LGKM_CNT) <
ScoreBrackets->getScoreUB(LGKM_CNT) &&
ScoreBrackets->hasPendingSMEM()) {
@@ -1052,17 +1139,20 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
// block, so if we only wait on LGKM here, we might end up with
// another s_waitcnt inserted right after this if there are non-LGKM
// instructions still outstanding.
- ForceZero = true;
- EmitSwaitcnt = true;
+ // FIXME: this is too conservative / the comment is wrong.
+ // We don't wait on everything at the end of the block and we combine
+ // waitcnts so we should never have back-to-back waitcnts.
+ ForceEmitZeroWaitcnt = true;
+ EmitWaitcnt = true;
}
}
// Does this operand processing indicate s_wait counter update?
- if (EmitSwaitcnt) {
+ if (EmitWaitcnt || IsForceEmitWaitcnt) {
int CntVal[NUM_INST_CNTS];
bool UseDefaultWaitcntStrategy = true;
- if (ForceZero) {
+ if (ForceEmitZeroWaitcnt || ForceEmitZeroWaitcnts) {
// Force all waitcnts to 0.
for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
T = (enum InstCounterType)(T + 1)) {
@@ -1077,7 +1167,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
if (UseDefaultWaitcntStrategy) {
for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
T = (enum InstCounterType)(T + 1)) {
- if (EmitSwaitcnt & CNT_MASK(T)) {
+ if (EmitWaitcnt & CNT_MASK(T)) {
int Delta =
ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
int MaxDelta = ScoreBrackets->getWaitCountMax(T);
@@ -1087,7 +1177,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
ScoreBrackets->setScoreLB(
T, ScoreBrackets->getScoreUB(T) - MaxDelta);
}
- EmitSwaitcnt &= ~CNT_MASK(T);
+ EmitWaitcnt &= ~CNT_MASK(T);
}
CntVal[T] = Delta;
} else {
@@ -1099,10 +1189,11 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
}
// If we are not waiting on any counter we can skip the wait altogether.
- if (EmitSwaitcnt != 0) {
+ if (EmitWaitcnt != 0 || IsForceEmitWaitcnt) {
MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
- if (!OldWaitcnt || (AMDGPU::decodeVmcnt(IV, Imm) !=
+ if (!OldWaitcnt ||
+ (AMDGPU::decodeVmcnt(IV, Imm) !=
(CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
(AMDGPU::decodeExpcnt(IV, Imm) !=
(CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
@@ -1114,39 +1205,80 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
BlockWaitcntBrackets *ScoreBracket =
BlockWaitcntBracketsMap[TBB].get();
if (!ScoreBracket) {
- assert(BlockVisitedSet.find(TBB) == BlockVisitedSet.end());
+ assert(!BlockVisitedSet.count(TBB));
BlockWaitcntBracketsMap[TBB] =
- llvm::make_unique<BlockWaitcntBrackets>();
+ llvm::make_unique<BlockWaitcntBrackets>(ST);
ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
}
ScoreBracket->setRevisitLoop(true);
- DEBUG(dbgs() << "set-revisit: block"
- << ContainingLoop->getHeader()->getNumber() << '\n';);
+ LLVM_DEBUG(dbgs()
+ << "set-revisit2: Block"
+ << ContainingLoop->getHeader()->getNumber() << '\n';);
}
}
// Update an existing waitcount, or make a new one.
- MachineFunction &MF = *MI.getParent()->getParent();
- if (OldWaitcnt && OldWaitcnt->getOpcode() != AMDGPU::S_WAITCNT) {
- SWaitInst = OldWaitcnt;
- } else {
- SWaitInst = MF.CreateMachineInstr(TII->get(AMDGPU::S_WAITCNT),
- MI.getDebugLoc());
- CompilerGeneratedWaitcntSet.insert(SWaitInst);
- }
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV,
+ ForceEmitWaitcnt[VM_CNT] ? 0 : CntVal[VM_CNT],
+ ForceEmitWaitcnt[EXP_CNT] ? 0 : CntVal[EXP_CNT],
+ ForceEmitWaitcnt[LGKM_CNT] ? 0 : CntVal[LGKM_CNT]);
+ // We don't remove waitcnts that existed prior to the waitcnt
+ // pass. Check if the waitcnt to-be-inserted can be avoided
+ // or if the prev waitcnt can be updated.
+ bool insertSWaitInst = true;
+ for (MachineBasicBlock::iterator I = MI.getIterator(),
+ B = MI.getParent()->begin();
+ insertSWaitInst && I != B; --I) {
+ if (I == MI.getIterator())
+ continue;
- const MachineOperand &Op =
- MachineOperand::CreateImm(AMDGPU::encodeWaitcnt(
- IV, CntVal[VM_CNT], CntVal[EXP_CNT], CntVal[LGKM_CNT]));
- SWaitInst->addOperand(MF, Op);
+ switch (I->getOpcode()) {
+ case AMDGPU::S_WAITCNT:
+ if (isWaitcntStronger(I->getOperand(0).getImm(), Enc))
+ insertSWaitInst = false;
+ else if (!OldWaitcnt) {
+ OldWaitcnt = &*I;
+ Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc);
+ }
+ break;
+ // TODO: skip over instructions which never require wait.
+ }
+ break;
+ }
+ if (insertSWaitInst) {
+ if (OldWaitcnt && OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT) {
+ if (ForceEmitZeroWaitcnts)
+ LLVM_DEBUG(
+ dbgs()
+ << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n");
+ if (IsForceEmitWaitcnt)
+ LLVM_DEBUG(dbgs()
+ << "Force emit a s_waitcnt due to debug counter\n");
+
+ OldWaitcnt->getOperand(0).setImm(Enc);
+ if (!OldWaitcnt->getParent())
+ MI.getParent()->insert(MI, OldWaitcnt);
+
+ LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
+ << "Old Instr: " << MI << '\n'
+ << "New Instr: " << *OldWaitcnt << '\n');
+ } else {
+ auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
+ MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+ .addImm(Enc);
+ TrackedWaitcntSet.insert(SWaitInst);
+
+ LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
+ << "Old Instr: " << MI << '\n'
+ << "New Instr: " << *SWaitInst << '\n');
+ }
+ }
if (CntVal[EXP_CNT] == 0) {
ScoreBrackets->setMixedExpTypes(false);
}
}
}
-
- return SWaitInst;
}
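
The back-scan above leans on two helpers defined outside this hunk, isWaitcntStronger and combineWaitcnt. As a rough, self-contained sketch of the intended semantics (illustrative only, not the pass's own code): one wait is at least as strong as another when every counter field allows no more outstanding operations, and combining two waits keeps the stricter (minimum) value per field.

    // Illustrative sketch; field names and the "stronger"/"combine" rules here
    // are assumptions drawn from the surrounding patch, not the in-tree helpers.
    #include <algorithm>
    #include <cstdio>

    struct ToyWaitcnt { unsigned VmCnt, ExpCnt, LgkmCnt; };

    // A is "stronger" than B if it waits at least as hard on every counter.
    bool isStronger(const ToyWaitcnt &A, const ToyWaitcnt &B) {
      return A.VmCnt <= B.VmCnt && A.ExpCnt <= B.ExpCnt && A.LgkmCnt <= B.LgkmCnt;
    }

    // Combining keeps the per-field minimum, i.e. the stricter bound.
    ToyWaitcnt combine(const ToyWaitcnt &A, const ToyWaitcnt &B) {
      return {std::min(A.VmCnt, B.VmCnt), std::min(A.ExpCnt, B.ExpCnt),
              std::min(A.LgkmCnt, B.LgkmCnt)};
    }

    int main() {
      ToyWaitcnt Old{0, 7, 15}, New{3, 7, 0};
      ToyWaitcnt Merged = combine(Old, New); // -> {0, 7, 0}
      std::printf("old stronger than new: %d, merged vm/exp/lgkm: %u/%u/%u\n",
                  isStronger(Old, New), Merged.VmCnt, Merged.ExpCnt, Merged.LgkmCnt);
    }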
void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
@@ -1180,7 +1312,7 @@ bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
return false;
}
-void SIInsertWaitcnts::updateEventWaitCntAfter(
+void SIInsertWaitcnts::updateEventWaitcntAfter(
MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
// Now look at the instruction opcode. If it is a memory access
// instruction, update the upper-bound of the appropriate counter's
@@ -1214,7 +1346,7 @@ void SIInsertWaitcnts::updateEventWaitCntAfter(
Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
- if ( // TODO: assumed yes -- target_info->MemWriteNeedsExpWait() &&
+ if (ST->vmemWriteNeedsExpWaitcnt() &&
(Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
}
@@ -1247,27 +1379,37 @@ void SIInsertWaitcnts::updateEventWaitCntAfter(
}
}
+// Merge the score brackets of the Block's predecessors;
+// this merged score bracket is used when adding waitcnts to the Block.
void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
int32_t MaxPending[NUM_INST_CNTS] = {0};
int32_t MaxFlat[NUM_INST_CNTS] = {0};
bool MixedExpTypes = false;
- // Clear the score bracket state.
- ScoreBrackets->clear();
-
- // Compute the number of pending elements on block entry.
+ // For single basic block loops, we need to retain the Block's
+ // score bracket to have accurate Pred info. So, make a copy of Block's
+ // score bracket, clear() it (which retains several important bits of info),
+ // populate, and then replace en masse. For non-single basic block loops,
+ // just clear Block's current score bracket and repopulate in-place.
+ bool IsSelfPred;
+ std::unique_ptr<BlockWaitcntBrackets> S;
+
+ IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block))
+ != Block.pred_end();
+ if (IsSelfPred) {
+ S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
+ ScoreBrackets = S.get();
+ }
- // IMPORTANT NOTE: If iterative handling of loops is added, the code will
- // need to handle single BBs with backedges to themselves. This means that
- // they will need to retain and not clear their initial state.
+ ScoreBrackets->clear();
// See if there are any uninitialized predecessors. If so, emit an
// s_waitcnt 0 at the beginning of the block.
- for (MachineBasicBlock *pred : Block.predecessors()) {
+ for (MachineBasicBlock *Pred : Block.predecessors()) {
BlockWaitcntBrackets *PredScoreBrackets =
- BlockWaitcntBracketsMap[pred].get();
- bool Visited = BlockVisitedSet.find(pred) != BlockVisitedSet.end();
+ BlockWaitcntBracketsMap[Pred].get();
+ bool Visited = BlockVisitedSet.count(Pred);
if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
continue;
}
@@ -1306,7 +1448,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
for (MachineBasicBlock *Pred : Block.predecessors()) {
BlockWaitcntBrackets *PredScoreBrackets =
BlockWaitcntBracketsMap[Pred].get();
- bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end();
+ bool Visited = BlockVisitedSet.count(Pred);
if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
continue;
}
@@ -1354,7 +1496,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
// Set the register scoreboard.
for (MachineBasicBlock *Pred : Block.predecessors()) {
- if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
+ if (!BlockVisitedSet.count(Pred)) {
continue;
}
@@ -1468,7 +1610,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
// sequencing predecessors, because changes to EXEC require waitcnts due to
// the delayed nature of these operations.
for (MachineBasicBlock *Pred : Block.predecessors()) {
- if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
+ if (!BlockVisitedSet.count(Pred)) {
continue;
}
@@ -1496,17 +1638,36 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
}
}
}
+
+  // If this is a single-block loop, update the score brackets. Not needed for
+  // other blocks, as the merge above was done in-place.
+ if (IsSelfPred) {
+ BlockWaitcntBracketsMap[&Block] = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
+ }
}
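
Viewed abstractly, the merge above computes each counter's pending count on block entry as the maximum over all visited predecessors; the self-predecessor copy exists so that a single-block loop's own previous state can take part in that maximum before the brackets are cleared. A toy sketch of just that shape (placeholder types, not the BlockWaitcntBrackets API):

    // Toy illustration of the per-counter max merge over visited predecessors.
    #include <algorithm>
    #include <vector>

    struct ToyBrackets { int Pending[3]; };  // VM, EXP, LGKM pending counts

    ToyBrackets mergePreds(const std::vector<ToyBrackets> &VisitedPreds) {
      ToyBrackets Out{};  // zero-initialized entry state
      for (const ToyBrackets &P : VisitedPreds)
        for (int T = 0; T < 3; ++T)
          Out.Pending[T] = std::max(Out.Pending[T], P.Pending[T]);
      return Out;
    }

    int main() {
      // For a single-block loop, the block's own snapshot would simply be one
      // of the entries passed in here.
      ToyBrackets A{{2, 0, 1}}, B{{0, 3, 0}};
      ToyBrackets Entry = mergePreds({A, B});  // -> {2, 3, 1}
      return Entry.Pending[1] == 3 ? 0 : 1;
    }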
-/// Return the "bottom" block of a loop. This differs from
-/// MachineLoop::getBottomBlock in that it works even if the loop is
-/// discontiguous.
-MachineBasicBlock *SIInsertWaitcnts::loopBottom(const MachineLoop *Loop) {
- MachineBasicBlock *Bottom = Loop->getHeader();
- for (MachineBasicBlock *MBB : Loop->blocks())
- if (MBB->getNumber() > Bottom->getNumber())
- Bottom = MBB;
- return Bottom;
+/// Return true if the given basic block is a "bottom" block of a loop.
+/// This works even if the loop is discontiguous. This also handles
+/// multiple back-edges for the same "header" block of a loop.
+bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
+ const MachineBasicBlock *Block) {
+ for (MachineBasicBlock *MBB : Loop->blocks()) {
+ if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
+ return true;
+ }
+ }
+ return false;
+}
+
+/// Count the number of "bottom" basic blocks of a loop.
+unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) {
+ unsigned Count = 0;
+ for (MachineBasicBlock *MBB : Loop->blocks()) {
+ if (MBB->isSuccessor(Loop->getHeader())) {
+ Count++;
+ }
+ }
+ return Count;
}
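
On a plain adjacency-list view of the CFG, a "bottom" block is simply a loop block with an edge back to the header, and the revisit cap applied later in insertWaitcntInBlock (iteration count greater than the number of bottom blocks plus one) falls out directly. A small illustration with a toy CFG instead of MachineLoop/MachineBasicBlock:

    // Toy CFG sketch of the bottom-block count and the (n+1) revisit cap;
    // not the MachineLoop-based implementation above.
    #include <cstdio>
    #include <vector>

    // Succ[B] lists the successors of block B.
    unsigned countBottomBlocks(const std::vector<std::vector<int>> &Succ,
                               const std::vector<int> &LoopBlocks, int Header) {
      unsigned Count = 0;
      for (int B : LoopBlocks)
        for (int S : Succ[B])
          if (S == Header) {
            ++Count;
            break;
          }
      return Count;
    }

    int main() {
      // Loop {1, 2, 3} with header 1 and two back-edges: 2 -> 1 and 3 -> 1.
      std::vector<std::vector<int>> Succ = {{1}, {2, 3}, {1}, {1, 4}, {}};
      unsigned N = countBottomBlocks(Succ, {1, 2, 3}, 1);  // == 2
      std::printf("bottom blocks = %u, revisit cap = %u\n", N, N + 1);
    }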
// Generate s_waitcnt instructions where needed.
@@ -1517,8 +1678,8 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
- DEBUG({
- dbgs() << "Block" << Block.getNumber();
+ LLVM_DEBUG({
+ dbgs() << "*** Block" << Block.getNumber() << " ***";
ScoreBrackets->dump();
});
@@ -1528,16 +1689,16 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
MachineInstr &Inst = *Iter;
// Remove any previously existing waitcnts.
if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
- // TODO: Register the old waitcnt and optimize the following waitcnts.
- // Leaving the previously existing waitcnts is conservatively correct.
- if (CompilerGeneratedWaitcntSet.find(&Inst) ==
- CompilerGeneratedWaitcntSet.end())
+ // Leave pre-existing waitcnts, but note their existence via setWaitcnt.
+ // Remove the waitcnt-pass-generated waitcnts; the pass will add them back
+ // as needed.
+ if (!TrackedWaitcntSet.count(&Inst))
++Iter;
else {
- ScoreBrackets->setWaitcnt(&Inst);
++Iter;
Inst.removeFromParent();
}
+ ScoreBrackets->setWaitcnt(&Inst);
continue;
}
@@ -1550,29 +1711,20 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
bool VCCZBugWorkAround = false;
if (readsVCCZ(Inst) &&
- (VCCZBugHandledSet.find(&Inst) == VCCZBugHandledSet.end())) {
+ (!VCCZBugHandledSet.count(&Inst))) {
if (ScoreBrackets->getScoreLB(LGKM_CNT) <
ScoreBrackets->getScoreUB(LGKM_CNT) &&
ScoreBrackets->hasPendingSMEM()) {
- if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS)
+ if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
VCCZBugWorkAround = true;
}
}
// Generate an s_waitcnt instruction to be placed before
// cur_Inst, if needed.
- MachineInstr *SWaitInst = generateSWaitCntInstBefore(Inst, ScoreBrackets);
-
- if (SWaitInst) {
- Block.insert(Inst, SWaitInst);
- if (ScoreBrackets->getWaitcnt() != SWaitInst) {
- DEBUG(dbgs() << "insertWaitcntInBlock\n"
- << "Old Instr: " << Inst << '\n'
- << "New Instr: " << *SWaitInst << '\n';);
- }
- }
+ generateWaitcntInstBefore(Inst, ScoreBrackets);
- updateEventWaitCntAfter(Inst, ScoreBrackets);
+ updateEventWaitcntAfter(Inst, ScoreBrackets);
#if 0 // TODO: implement resource type check controlled by options with ub = LB.
// If this instruction generates a S_SETVSKIP because it is an
@@ -1587,10 +1739,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
ScoreBrackets->clearWaitcnt();
- if (SWaitInst) {
- DEBUG({ SWaitInst->print(dbgs() << '\n'); });
- }
- DEBUG({
+ LLVM_DEBUG({
Inst.print(dbgs());
ScoreBrackets->dump();
});
@@ -1627,21 +1776,22 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
// Check if we need to force convergence at loop footer.
MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
- if (ContainingLoop && loopBottom(ContainingLoop) == &Block) {
+ if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
WaitcntData->print();
- DEBUG(dbgs() << '\n';);
+ LLVM_DEBUG(dbgs() << '\n';);
// The iterative waitcnt insertion algorithm aims for optimal waitcnt
- // placement and doesn't always guarantee convergence for a loop. Each
- // loop should take at most 2 iterations for it to converge naturally.
- // When this max is reached and result doesn't converge, we force
- // convergence by inserting a s_waitcnt at the end of loop footer.
- if (WaitcntData->getIterCnt() > 2) {
+ // placement, but doesn't guarantee convergence for a loop. Each
+ // loop should take at most (n+1) iterations for it to converge naturally,
+ // where n is the number of bottom blocks. If this threshold is reached and
+ // the result hasn't converged, then we force convergence by inserting
+    // an s_waitcnt at the end of the loop footer.
+ if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) {
// To ensure convergence, need to make wait events at loop footer be no
// more than those from the previous iteration.
- // As a simplification, Instead of tracking individual scores and
- // generate the precise wait count, just wait on 0.
+ // As a simplification, instead of tracking individual scores and
+ // generating the precise wait count, just wait on 0.
bool HasPending = false;
MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
@@ -1649,16 +1799,16 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
HasPending = true;
+ break;
}
}
if (HasPending) {
if (!SWaitInst) {
- SWaitInst = Block.getParent()->CreateMachineInstr(
- TII->get(AMDGPU::S_WAITCNT), DebugLoc());
- CompilerGeneratedWaitcntSet.insert(SWaitInst);
- const MachineOperand &Op = MachineOperand::CreateImm(0);
- SWaitInst->addOperand(MF, Op);
+ SWaitInst = BuildMI(Block, Block.getFirstNonPHI(),
+ DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+ .addImm(0);
+ TrackedWaitcntSet.insert(SWaitInst);
#if 0 // TODO: Format the debug output
OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
OutputTransformAdd(SWaitInst, context);
@@ -1670,7 +1820,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
}
if (SWaitInst) {
- DEBUG({
+ LLVM_DEBUG({
SWaitInst->print(dbgs());
dbgs() << "\nAdjusted score board:";
ScoreBrackets->dump();
@@ -1678,7 +1828,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
// Add this waitcnt to the block. It is either newly created or
// created in previous iterations and added back since block traversal
- // always remove waitcnt.
+ // always removes waitcnts.
insertWaitcntBeforeCF(Block, SWaitInst);
WaitcntData->setWaitcnt(SWaitInst);
}
@@ -1687,7 +1837,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
}
bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
- ST = &MF.getSubtarget<SISubtarget>();
+ ST = &MF.getSubtarget<GCNSubtarget>();
TII = ST->getInstrInfo();
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
@@ -1696,6 +1846,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
AMDGPUASI = ST->getAMDGPUAS();
+ ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1))
+ ForceEmitWaitcnt[T] = false;
+
HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
@@ -1712,6 +1867,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
RegisterEncoding.SGPRL =
RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
+ TrackedWaitcntSet.clear();
+ BlockVisitedSet.clear();
+ VCCZBugHandledSet.clear();
+ LoopWaitcntDataMap.clear();
+ BlockWaitcntProcessedSet.clear();
+
// Walk over the blocks in reverse post-dominator order, inserting
// s_waitcnt where needed.
ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
@@ -1726,7 +1887,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
if (!ScoreBrackets) {
- BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>();
+ BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(ST);
ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
}
ScoreBrackets->setPostOrder(MBB.getNumber());
@@ -1737,22 +1898,30 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
// If we are walking into the block from before the loop, then guarantee
// at least 1 re-walk over the loop to propagate the information, even if
// no S_WAITCNT instructions were generated.
- if (ContainingLoop && ContainingLoop->getHeader() == &MBB && J < I &&
- (BlockWaitcntProcessedSet.find(&MBB) ==
- BlockWaitcntProcessedSet.end())) {
- BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
- DEBUG(dbgs() << "set-revisit: block"
- << ContainingLoop->getHeader()->getNumber() << '\n';);
+ if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
+ unsigned Count = countNumBottomBlocks(ContainingLoop);
+
+ // If the loop has multiple back-edges, and so more than one "bottom"
+      // basic block, we have to guarantee a re-walk over every block.
+ if ((std::count(BlockWaitcntProcessedSet.begin(),
+ BlockWaitcntProcessedSet.end(), &MBB) < (int)Count)) {
+ BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
+ LLVM_DEBUG(dbgs() << "set-revisit1: Block"
+ << ContainingLoop->getHeader()->getNumber() << '\n';);
+ }
}
// Walk over the instructions.
insertWaitcntInBlock(MF, MBB);
- // Flag that waitcnts have been processed at least once.
- BlockWaitcntProcessedSet.insert(&MBB);
+ // Record that waitcnts have been processed at least once for this block.
+ BlockWaitcntProcessedSet.push_back(&MBB);
- // See if we want to revisit the loop.
- if (ContainingLoop && loopBottom(ContainingLoop) == &MBB) {
+ // See if we want to revisit the loop. If a loop has multiple back-edges,
+ // we shouldn't revisit the same "bottom" basic block.
+ if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
+ std::count(BlockWaitcntProcessedSet.begin(),
+ BlockWaitcntProcessedSet.end(), &MBB) == 1) {
MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
if (EntrySB && EntrySB->getRevisitLoop()) {
@@ -1772,7 +1941,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
}
LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
WaitcntData->incIterCnt();
- DEBUG(dbgs() << "revisit: block" << EntryBB->getNumber() << '\n';);
+ LLVM_DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
continue;
} else {
LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
@@ -1837,7 +2006,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
if (!MFI->isEntryFunction()) {
// Wait for any outstanding memory operations that the input registers may
- // depend on. We can't track them and it's better to to the wait after the
+    // depend on. We can't track them and it's better to do the wait after the
// costly call sequence.
// TODO: Could insert earlier and schedule more liberally with operations
diff --git a/lib/Target/AMDGPU/SIInsertWaits.cpp b/lib/Target/AMDGPU/SIInsertWaits.cpp
deleted file mode 100644
index b074b95c2d3c..000000000000
--- a/lib/Target/AMDGPU/SIInsertWaits.cpp
+++ /dev/null
@@ -1,703 +0,0 @@
-//===- SILowerControlFlow.cpp - Use predicates for control flow -----------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief Insert wait instructions for memory reads and writes.
-///
-/// Memory reads and writes are issued asynchronously, so we need to insert
-/// S_WAITCNT instructions when we want to access any of their results or
-/// overwrite any register that's used asynchronously.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "SIDefines.h"
-#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
-#include "SIRegisterInfo.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/MC/MCInstrDesc.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <cstring>
-#include <utility>
-
-#define DEBUG_TYPE "si-insert-waits"
-
-using namespace llvm;
-
-namespace {
-
-/// \brief One variable for each of the hardware counters
-using Counters = union {
- struct {
- unsigned VM;
- unsigned EXP;
- unsigned LGKM;
- } Named;
- unsigned Array[3];
-};
-
-using InstType = enum {
- OTHER,
- SMEM,
- VMEM
-};
-
-using RegCounters = Counters[512];
-using RegInterval = std::pair<unsigned, unsigned>;
-
-class SIInsertWaits : public MachineFunctionPass {
-private:
- const SISubtarget *ST = nullptr;
- const SIInstrInfo *TII = nullptr;
- const SIRegisterInfo *TRI = nullptr;
- const MachineRegisterInfo *MRI;
- AMDGPU::IsaInfo::IsaVersion ISA;
-
- /// \brief Constant zero value
- static const Counters ZeroCounts;
-
- /// \brief Hardware limits
- Counters HardwareLimits;
-
- /// \brief Counter values we have already waited on.
- Counters WaitedOn;
-
- /// \brief Counter values that we must wait on before the next counter
- /// increase.
- Counters DelayedWaitOn;
-
- /// \brief Counter values for last instruction issued.
- Counters LastIssued;
-
- /// \brief Registers used by async instructions.
- RegCounters UsedRegs;
-
- /// \brief Registers defined by async instructions.
- RegCounters DefinedRegs;
-
- /// \brief Different export instruction types seen since last wait.
- unsigned ExpInstrTypesSeen = 0;
-
- /// \brief Type of the last opcode.
- InstType LastOpcodeType;
-
- bool LastInstWritesM0;
-
- /// Whether or not we have flat operations outstanding.
- bool IsFlatOutstanding;
-
- /// \brief Whether the machine function returns void
- bool ReturnsVoid;
-
- /// Whether the VCCZ bit is possibly corrupt
- bool VCCZCorrupt = false;
-
- /// \brief Get increment/decrement amount for this instruction.
- Counters getHwCounts(MachineInstr &MI);
-
- /// \brief Is operand relevant for async execution?
- bool isOpRelevant(MachineOperand &Op);
-
- /// \brief Get register interval an operand affects.
- RegInterval getRegInterval(const TargetRegisterClass *RC,
- const MachineOperand &Reg) const;
-
- /// \brief Handle instructions async components
- void pushInstruction(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- const Counters& Increment);
-
- /// \brief Insert the actual wait instruction
- bool insertWait(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- const Counters &Counts);
-
- /// \brief Handle existing wait instructions (from intrinsics)
- void handleExistingWait(MachineBasicBlock::iterator I);
-
- /// \brief Do we need def2def checks?
- bool unorderedDefines(MachineInstr &MI);
-
- /// \brief Resolve all operand dependencies to counter requirements
- Counters handleOperands(MachineInstr &MI);
-
- /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
- void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
-
- /// Return true if there are LGKM instrucitons that haven't been waited on
- /// yet.
- bool hasOutstandingLGKM() const;
-
-public:
- static char ID;
-
- SIInsertWaits() : MachineFunctionPass(ID) {}
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- StringRef getPassName() const override {
- return "SI insert wait instructions";
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-};
-
-} // end anonymous namespace
-
-INITIALIZE_PASS_BEGIN(SIInsertWaits, DEBUG_TYPE,
- "SI Insert Waits", false, false)
-INITIALIZE_PASS_END(SIInsertWaits, DEBUG_TYPE,
- "SI Insert Waits", false, false)
-
-char SIInsertWaits::ID = 0;
-
-char &llvm::SIInsertWaitsID = SIInsertWaits::ID;
-
-FunctionPass *llvm::createSIInsertWaitsPass() {
- return new SIInsertWaits();
-}
-
-const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
-
-static bool readsVCCZ(const MachineInstr &MI) {
- unsigned Opc = MI.getOpcode();
- return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
- !MI.getOperand(1).isUndef();
-}
-
-bool SIInsertWaits::hasOutstandingLGKM() const {
- return WaitedOn.Named.LGKM != LastIssued.Named.LGKM;
-}
-
-Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
- uint64_t TSFlags = MI.getDesc().TSFlags;
- Counters Result = { { 0, 0, 0 } };
-
- Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
-
- // Only consider stores or EXP for EXP_CNT
- Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT) && MI.mayStore();
-
- // LGKM may uses larger values
- if (TSFlags & SIInstrFlags::LGKM_CNT) {
-
- if (TII->isSMRD(MI)) {
-
- if (MI.getNumOperands() != 0) {
- assert(MI.getOperand(0).isReg() &&
- "First LGKM operand must be a register!");
-
- // XXX - What if this is a write into a super register?
- const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0);
- unsigned Size = TRI->getRegSizeInBits(*RC);
- Result.Named.LGKM = Size > 32 ? 2 : 1;
- } else {
- // s_dcache_inv etc. do not have a a destination register. Assume we
- // want a wait on these.
- // XXX - What is the right value?
- Result.Named.LGKM = 1;
- }
- } else {
- // DS
- Result.Named.LGKM = 1;
- }
-
- } else {
- Result.Named.LGKM = 0;
- }
-
- return Result;
-}
-
-bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
- // Constants are always irrelevant
- if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
- return false;
-
- // Defines are always relevant
- if (Op.isDef())
- return true;
-
- // For exports all registers are relevant.
- // TODO: Skip undef/disabled registers.
- MachineInstr &MI = *Op.getParent();
- if (TII->isEXP(MI))
- return true;
-
- // For stores the stored value is also relevant
- if (!MI.getDesc().mayStore())
- return false;
-
- // Check if this operand is the value being stored.
- // Special case for DS/FLAT instructions, since the address
- // operand comes before the value operand and it may have
- // multiple data operands.
-
- if (TII->isDS(MI)) {
- MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
- if (Data0 && Op.isIdenticalTo(*Data0))
- return true;
-
- MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1);
- return Data1 && Op.isIdenticalTo(*Data1);
- }
-
- if (TII->isFLAT(MI)) {
- MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
- if (Data && Op.isIdenticalTo(*Data))
- return true;
- }
-
- // NOTE: This assumes that the value operand is before the
- // address operand, and that there is only one value operand.
- for (MachineInstr::mop_iterator I = MI.operands_begin(),
- E = MI.operands_end(); I != E; ++I) {
-
- if (I->isReg() && I->isUse())
- return Op.isIdenticalTo(*I);
- }
-
- return false;
-}
-
-RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
- const MachineOperand &Reg) const {
- unsigned Size = TRI->getRegSizeInBits(*RC);
- assert(Size >= 32);
-
- RegInterval Result;
- Result.first = TRI->getEncodingValue(Reg.getReg());
- Result.second = Result.first + Size / 32;
-
- return Result;
-}
-
-void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- const Counters &Increment) {
- // Get the hardware counter increments and sum them up
- Counters Limit = ZeroCounts;
- unsigned Sum = 0;
-
- if (TII->mayAccessFlatAddressSpace(*I))
- IsFlatOutstanding = true;
-
- for (unsigned i = 0; i < 3; ++i) {
- LastIssued.Array[i] += Increment.Array[i];
- if (Increment.Array[i])
- Limit.Array[i] = LastIssued.Array[i];
- Sum += Increment.Array[i];
- }
-
- // If we don't increase anything then that's it
- if (Sum == 0) {
- LastOpcodeType = OTHER;
- return;
- }
-
- if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
- // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
- // or SMEM clause, respectively.
- //
- // The temporary workaround is to break the clauses with S_NOP.
- //
- // The proper solution would be to allocate registers such that all source
- // and destination registers don't overlap, e.g. this is illegal:
- // r0 = load r2
- // r2 = load r0
- if (LastOpcodeType == VMEM && Increment.Named.VM) {
- // Insert a NOP to break the clause.
- BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
- .addImm(0);
- LastInstWritesM0 = false;
- }
-
- if (TII->isSMRD(*I))
- LastOpcodeType = SMEM;
- else if (Increment.Named.VM)
- LastOpcodeType = VMEM;
- }
-
- // Remember which export instructions we have seen
- if (Increment.Named.EXP) {
- ExpInstrTypesSeen |= TII->isEXP(*I) ? 1 : 2;
- }
-
- for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
- MachineOperand &Op = I->getOperand(i);
- if (!isOpRelevant(Op))
- continue;
-
- const TargetRegisterClass *RC = TII->getOpRegClass(*I, i);
- RegInterval Interval = getRegInterval(RC, Op);
- for (unsigned j = Interval.first; j < Interval.second; ++j) {
-
- // Remember which registers we define
- if (Op.isDef())
- DefinedRegs[j] = Limit;
-
- // and which one we are using
- if (Op.isUse())
- UsedRegs[j] = Limit;
- }
- }
-}
-
-bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- const Counters &Required) {
- // End of program? No need to wait on anything
- // A function not returning void needs to wait, because other bytecode will
- // be appended after it and we don't know what it will be.
- if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid)
- return false;
-
- // Figure out if the async instructions execute in order
- bool Ordered[3];
-
- // VM_CNT is always ordered except when there are flat instructions, which
- // can return out of order.
- Ordered[0] = !IsFlatOutstanding;
-
- // EXP_CNT is unordered if we have both EXP & VM-writes
- Ordered[1] = ExpInstrTypesSeen == 3;
-
- // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS
- Ordered[2] = false;
-
- // The values we are going to put into the S_WAITCNT instruction
- Counters Counts = HardwareLimits;
-
- // Do we really need to wait?
- bool NeedWait = false;
-
- for (unsigned i = 0; i < 3; ++i) {
- if (Required.Array[i] <= WaitedOn.Array[i])
- continue;
-
- NeedWait = true;
-
- if (Ordered[i]) {
- unsigned Value = LastIssued.Array[i] - Required.Array[i];
-
- // Adjust the value to the real hardware possibilities.
- Counts.Array[i] = std::min(Value, HardwareLimits.Array[i]);
- } else
- Counts.Array[i] = 0;
-
- // Remember on what we have waited on.
- WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
- }
-
- if (!NeedWait)
- return false;
-
- // Reset EXP_CNT instruction types
- if (Counts.Named.EXP == 0)
- ExpInstrTypesSeen = 0;
-
- // Build the wait instruction
- BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
- .addImm(AMDGPU::encodeWaitcnt(ISA,
- Counts.Named.VM,
- Counts.Named.EXP,
- Counts.Named.LGKM));
-
- LastOpcodeType = OTHER;
- LastInstWritesM0 = false;
- IsFlatOutstanding = false;
- return true;
-}
-
-/// \brief helper function for handleOperands
-static void increaseCounters(Counters &Dst, const Counters &Src) {
- for (unsigned i = 0; i < 3; ++i)
- Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
-}
-
-/// \brief check whether any of the counters is non-zero
-static bool countersNonZero(const Counters &Counter) {
- for (unsigned i = 0; i < 3; ++i)
- if (Counter.Array[i])
- return true;
- return false;
-}
-
-void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) {
- assert(I->getOpcode() == AMDGPU::S_WAITCNT);
-
- unsigned Imm = I->getOperand(0).getImm();
- Counters Counts, WaitOn;
-
- Counts.Named.VM = AMDGPU::decodeVmcnt(ISA, Imm);
- Counts.Named.EXP = AMDGPU::decodeExpcnt(ISA, Imm);
- Counts.Named.LGKM = AMDGPU::decodeLgkmcnt(ISA, Imm);
-
- for (unsigned i = 0; i < 3; ++i) {
- if (Counts.Array[i] <= LastIssued.Array[i])
- WaitOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
- else
- WaitOn.Array[i] = 0;
- }
-
- increaseCounters(DelayedWaitOn, WaitOn);
-}
-
-Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
- Counters Result = ZeroCounts;
-
- // For each register affected by this instruction increase the result
- // sequence.
- //
- // TODO: We could probably just look at explicit operands if we removed VCC /
- // EXEC from SMRD dest reg classes.
- for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
- MachineOperand &Op = MI.getOperand(i);
- if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
- continue;
-
- const TargetRegisterClass *RC = TII->getOpRegClass(MI, i);
- RegInterval Interval = getRegInterval(RC, Op);
- for (unsigned j = Interval.first; j < Interval.second; ++j) {
- if (Op.isDef()) {
- increaseCounters(Result, UsedRegs[j]);
- increaseCounters(Result, DefinedRegs[j]);
- }
-
- if (Op.isUse())
- increaseCounters(Result, DefinedRegs[j]);
- }
- }
-
- return Result;
-}
-
-void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) {
- if (ST->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
- return;
-
- // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
- if (LastInstWritesM0 && (I->getOpcode() == AMDGPU::S_SENDMSG || I->getOpcode() == AMDGPU::S_SENDMSGHALT)) {
- BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
- LastInstWritesM0 = false;
- return;
- }
-
- // Set whether this instruction sets M0
- LastInstWritesM0 = false;
-
- unsigned NumOperands = I->getNumOperands();
- for (unsigned i = 0; i < NumOperands; i++) {
- const MachineOperand &Op = I->getOperand(i);
-
- if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0)
- LastInstWritesM0 = true;
- }
-}
-
-/// Return true if \p MBB has one successor immediately following, and is its
-/// only predecessor
-static bool hasTrivialSuccessor(const MachineBasicBlock &MBB) {
- if (MBB.succ_size() != 1)
- return false;
-
- const MachineBasicBlock *Succ = *MBB.succ_begin();
- return (Succ->pred_size() == 1) && MBB.isLayoutSuccessor(Succ);
-}
-
-// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
-// around other non-memory instructions.
-bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
- bool Changes = false;
-
- ST = &MF.getSubtarget<SISubtarget>();
- TII = ST->getInstrInfo();
- TRI = &TII->getRegisterInfo();
- MRI = &MF.getRegInfo();
- ISA = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-
- HardwareLimits.Named.VM = AMDGPU::getVmcntBitMask(ISA);
- HardwareLimits.Named.EXP = AMDGPU::getExpcntBitMask(ISA);
- HardwareLimits.Named.LGKM = AMDGPU::getLgkmcntBitMask(ISA);
-
- WaitedOn = ZeroCounts;
- DelayedWaitOn = ZeroCounts;
- LastIssued = ZeroCounts;
- LastOpcodeType = OTHER;
- LastInstWritesM0 = false;
- IsFlatOutstanding = false;
- ReturnsVoid = MFI->returnsVoid();
-
- memset(&UsedRegs, 0, sizeof(UsedRegs));
- memset(&DefinedRegs, 0, sizeof(DefinedRegs));
-
- SmallVector<MachineInstr *, 4> RemoveMI;
- SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
-
- bool HaveScalarStores = false;
-
- for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
- BI != BE; ++BI) {
- MachineBasicBlock &MBB = *BI;
-
- for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
- I != E; ++I) {
- if (!HaveScalarStores && TII->isScalarStore(*I))
- HaveScalarStores = true;
-
- if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
- // There is a hardware bug on CI/SI where SMRD instruction may corrupt
- // vccz bit, so when we detect that an instruction may read from a
- // corrupt vccz bit, we need to:
- // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD operations to
- // complete.
- // 2. Restore the correct value of vccz by writing the current value
- // of vcc back to vcc.
-
- if (TII->isSMRD(I->getOpcode())) {
- VCCZCorrupt = true;
- } else if (!hasOutstandingLGKM() && I->modifiesRegister(AMDGPU::VCC, TRI)) {
- // FIXME: We only care about SMRD instructions here, not LDS or GDS.
- // Whenever we store a value in vcc, the correct value of vccz is
- // restored.
- VCCZCorrupt = false;
- }
-
- // Check if we need to apply the bug work-around
- if (VCCZCorrupt && readsVCCZ(*I)) {
- DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n');
-
- // Wait on everything, not just LGKM. vccz reads usually come from
- // terminators, and we always wait on everything at the end of the
- // block, so if we only wait on LGKM here, we might end up with
- // another s_waitcnt inserted right after this if there are non-LGKM
- // instructions still outstanding.
- insertWait(MBB, I, LastIssued);
-
- // Restore the vccz bit. Any time a value is written to vcc, the vcc
- // bit is updated, so we can restore the bit by reading the value of
- // vcc and then writing it back to the register.
- BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
- AMDGPU::VCC)
- .addReg(AMDGPU::VCC);
- }
- }
-
- // Record pre-existing, explicitly requested waits
- if (I->getOpcode() == AMDGPU::S_WAITCNT) {
- handleExistingWait(*I);
- RemoveMI.push_back(&*I);
- continue;
- }
-
- Counters Required;
-
- // Wait for everything before a barrier.
- //
- // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
- // but we also want to wait for any other outstanding transfers before
- // signalling other hardware blocks
- if ((I->getOpcode() == AMDGPU::S_BARRIER &&
- !ST->hasAutoWaitcntBeforeBarrier()) ||
- I->getOpcode() == AMDGPU::S_SENDMSG ||
- I->getOpcode() == AMDGPU::S_SENDMSGHALT)
- Required = LastIssued;
- else
- Required = handleOperands(*I);
-
- Counters Increment = getHwCounts(*I);
-
- if (countersNonZero(Required) || countersNonZero(Increment))
- increaseCounters(Required, DelayedWaitOn);
-
- Changes |= insertWait(MBB, I, Required);
-
- pushInstruction(MBB, I, Increment);
- handleSendMsg(MBB, I);
-
- if (I->getOpcode() == AMDGPU::S_ENDPGM ||
- I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
- EndPgmBlocks.push_back(&MBB);
- }
-
- // Wait for everything at the end of the MBB. If there is only one
- // successor, we can defer this until the uses there.
- if (!hasTrivialSuccessor(MBB))
- Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
- }
-
- if (HaveScalarStores) {
- // If scalar writes are used, the cache must be flushed or else the next
- // wave to reuse the same scratch memory can be clobbered.
- //
- // Insert s_dcache_wb at wave termination points if there were any scalar
- // stores, and only if the cache hasn't already been flushed. This could be
- // improved by looking across blocks for flushes in postdominating blocks
- // from the stores but an explicitly requested flush is probably very rare.
- for (MachineBasicBlock *MBB : EndPgmBlocks) {
- bool SeenDCacheWB = false;
-
- for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
- I != E; ++I) {
- if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
- SeenDCacheWB = true;
- else if (TII->isScalarStore(*I))
- SeenDCacheWB = false;
-
- // FIXME: It would be better to insert this before a waitcnt if any.
- if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
- I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && !SeenDCacheWB) {
- Changes = true;
- BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
- }
- }
- }
- }
-
- for (MachineInstr *I : RemoveMI)
- I->eraseFromParent();
-
- if (!MFI->isEntryFunction()) {
- // Wait for any outstanding memory operations that the input registers may
- // depend on. We can't track them and it's better to to the wait after the
- // costly call sequence.
-
- // TODO: Could insert earlier and schedule more liberally with operations
- // that only use caller preserved registers.
- MachineBasicBlock &EntryBB = MF.front();
- BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
- .addImm(0);
-
- Changes = true;
- }
-
- return Changes;
-}
diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td
index 25917cc06e6a..b73d30940fc3 100644
--- a/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/lib/Target/AMDGPU/SIInstrFormats.td
@@ -12,16 +12,16 @@
//===----------------------------------------------------------------------===//
def isGCN : Predicate<"Subtarget->getGeneration() "
- ">= SISubtarget::SOUTHERN_ISLANDS">,
+ ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">,
AssemblerPredicate<"FeatureGCN">;
def isSI : Predicate<"Subtarget->getGeneration() "
- "== SISubtarget::SOUTHERN_ISLANDS">,
+ "== AMDGPUSubtarget::SOUTHERN_ISLANDS">,
AssemblerPredicate<"FeatureSouthernIslands">;
class InstSI <dag outs, dag ins, string asm = "",
list<dag> pattern = []> :
- AMDGPUInst<outs, ins, asm, pattern>, PredicateControl {
+ AMDGPUInst<outs, ins, asm, pattern>, GCNPredicateControl {
let SubtargetPredicate = isGCN;
// Low bits - basic encoding information.
@@ -118,6 +118,9 @@ class InstSI <dag outs, dag ins, string asm = "",
// This bit indicates that this is a packed VOP3P instruction
field bit IsPacked = 0;
+ // This bit indicates that this is a D16 buffer instruction.
+ field bit D16Buf = 0;
+
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = SALU;
let TSFlags{1} = VALU;
@@ -173,6 +176,8 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{49} = IsPacked;
+ let TSFlags{50} = D16Buf;
+
let SchedRW = [Write32Bit];
field bits<1> DisableSIDecoder = 0;
@@ -181,6 +186,9 @@ class InstSI <dag outs, dag ins, string asm = "",
let isAsmParserOnly = !if(!eq(DisableDecoder{0}, {0}), 0, 1);
let AsmVariantName = AMDGPUAsmVariants.Default;
+
+ // Avoid changing source registers in a way that violates constant bus read limitations.
+ let hasExtraSrcRegAllocReq = !if(VOP1,1,!if(VOP2,1,!if(VOP3,1,!if(VOPC,1,!if(SDWA,1, !if(VALU,1,0))))));
}
class PseudoInstSI<dag outs, dag ins, list<dag> pattern = [], string asm = "">
@@ -247,6 +255,7 @@ class MIMGe <bits<7> op> : Enc64 {
bits<1> tfe;
bits<1> lwe;
bits<1> slc;
+ bit d16;
bits<8> vaddr;
bits<7> srsrc;
bits<7> ssamp;
@@ -265,6 +274,7 @@ class MIMGe <bits<7> op> : Enc64 {
let Inst{47-40} = vdata;
let Inst{52-48} = srsrc{6-2};
let Inst{57-53} = ssamp{6-2};
+ let Inst{63} = d16;
}
class EXPe : Enc64 {
@@ -309,6 +319,7 @@ class VINTRPCommon <dag outs, dag ins, string asm, list<dag> pattern> :
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
+ let VALU = 1;
}
class EXPCommon<dag outs, dag ins, string asm, list<dag> pattern> :
@@ -323,15 +334,3 @@ class EXPCommon<dag outs, dag ins, string asm, list<dag> pattern> :
}
} // End Uses = [EXEC]
-
-class MIMG <dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI <outs, ins, asm, pattern> {
-
- let VM_CNT = 1;
- let EXP_CNT = 1;
- let MIMG = 1;
- let Uses = [EXEC];
-
- let UseNamedOperandTable = 1;
- let hasSideEffects = 0; // XXX ????
-}
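
The new D16Buf field is assigned TSFlags bit 50, directly after IsPacked in bit 49. As a rough C++ illustration of querying one bit of such a packed flag word (the constant name below is a placeholder, not the target's real SIInstrFlags enumerator):

    // Illustration only: testing a single 64-bit TSFlags bit.
    #include <cstdint>

    constexpr std::uint64_t TOY_D16_BUF = UINT64_C(1) << 50;  // bit index from the patch

    constexpr bool isD16Buf(std::uint64_t TSFlags) {
      return (TSFlags & TOY_D16_BUF) != 0;
    }

    static_assert(isD16Buf(UINT64_C(1) << 50), "bit 50 should report D16Buf");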
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 61967605432e..6c85c92454c3 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -8,17 +8,19 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief SI Implementation of TargetInstrInfo.
+/// SI Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//
#include "SIInstrInfo.h"
#include "AMDGPU.h"
+#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "GCNHazardRecognizer.h"
#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
@@ -37,7 +39,6 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -53,6 +54,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
@@ -62,6 +64,19 @@
using namespace llvm;
+#define GET_INSTRINFO_CTOR_DTOR
+#include "AMDGPUGenInstrInfo.inc"
+
+namespace llvm {
+namespace AMDGPU {
+#define GET_D16ImageDimIntrinsics_IMPL
+#define GET_ImageDimIntrinsicTable_IMPL
+#define GET_RsrcIntrinsics_IMPL
+#include "AMDGPUGenSearchableTables.inc"
+}
+}
+
+
// Must be at least 4 to be able to branch over minimum unconditional branch
// code. This is only for making it possible to write reasonably small tests for
// long branches.
@@ -69,8 +84,9 @@ static cl::opt<unsigned>
BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
cl::desc("Restrict range of branch instructions (DEBUG)"));
-SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
- : AMDGPUInstrInfo(ST), RI(ST), ST(ST) {}
+SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
+ : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
+ RI(ST), ST(ST) {}
//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
@@ -89,7 +105,7 @@ static SDValue findChainOperand(SDNode *Load) {
return LastOp;
}
-/// \brief Returns true if both nodes have the same value for the given
+/// Returns true if both nodes have the same value for the given
/// operand \p Op, or if both nodes do not have this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
unsigned Opc0 = N0->getMachineOpcode();
@@ -437,6 +453,28 @@ bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
}
+// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
+// the first 16 loads will be interleaved with the stores, and the next 16 will
+// be clustered as expected. It should really split into two batches of 16 stores.
+//
+// Loads are clustered until this returns false, rather than trying to schedule
+// groups of stores. This also means we have to deal with saying different
+// address space loads should be clustered, and ones which might cause bank
+// conflicts.
+//
+// This might be deprecated so it might not be worth that much effort to fix.
+bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
+ int64_t Offset0, int64_t Offset1,
+ unsigned NumLoads) const {
+ assert(Offset1 > Offset0 &&
+ "Second offset should be larger than first offset!");
+  // If we have 16 or fewer loads in a row, and the offsets are within 64
+ // bytes, then schedule together.
+
+ // A cacheline is 64 bytes (for global memory).
+ return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
+}
+
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const DebugLoc &DL, unsigned DestReg,
@@ -827,10 +865,6 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
DebugLoc DL = MBB.findDebugLoc(MI);
- assert(SrcReg != MFI->getStackPtrOffsetReg() &&
- SrcReg != MFI->getFrameOffsetReg() &&
- SrcReg != MFI->getScratchWaveOffsetReg());
-
unsigned Size = FrameInfo.getObjectSize(FrameIndex);
unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
MachinePointerInfo PtrInfo
@@ -864,7 +898,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
// needing them, and need to ensure that the reserved registers are
// correctly handled.
- FrameInfo.setStackID(FrameIndex, 1);
+ FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL);
if (ST.hasScalarStores()) {
// m0 is used for offset to scalar stores if used to spill.
Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
@@ -960,7 +994,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
}
- FrameInfo.setStackID(FrameIndex, 1);
+ FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL);
MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
.addFrameIndex(FrameIndex) // addr
.addMemOperand(MMO)
@@ -1001,7 +1035,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
unsigned FrameOffset, unsigned Size) const {
MachineFunction *MF = MBB.getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
- const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
DebugLoc DL = MBB.findDebugLoc(MI);
unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
unsigned WavefrontSize = ST.getWavefrontSize();
@@ -1137,7 +1171,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MBB.findDebugLoc(MI);
switch (MI.getOpcode()) {
- default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
+ default: return TargetInstrInfo::expandPostRAPseudo(MI);
case AMDGPU::S_MOV_B64_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
@@ -1269,6 +1303,22 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(AMDGPU::S_MOV_B64));
break;
}
+ case TargetOpcode::BUNDLE: {
+ if (!MI.mayLoad())
+ return false;
+
+ // If it is a load it must be a memory clause
+ for (MachineBasicBlock::instr_iterator I = MI.getIterator();
+ I->isBundledWithSucc(); ++I) {
+ I->unbundleFromSucc();
+ for (MachineOperand &MO : I->operands())
+ if (MO.isReg())
+ MO.setIsInternalRead(false);
+ }
+
+ MI.eraseFromParent();
+ break;
+ }
}
return true;
}
@@ -1887,16 +1937,16 @@ unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind(
switch(Kind) {
case PseudoSourceValue::Stack:
case PseudoSourceValue::FixedStack:
- return AMDGPUASI.PRIVATE_ADDRESS;
+ return ST.getAMDGPUAS().PRIVATE_ADDRESS;
case PseudoSourceValue::ConstantPool:
case PseudoSourceValue::GOT:
case PseudoSourceValue::JumpTable:
case PseudoSourceValue::GlobalValueCallEntry:
case PseudoSourceValue::ExternalSymbolCallEntry:
case PseudoSourceValue::TargetCustom:
- return AMDGPUASI.CONSTANT_ADDRESS;
+ return ST.getAMDGPUAS().CONSTANT_ADDRESS;
}
- return AMDGPUASI.FLAT_ADDRESS;
+ return ST.getAMDGPUAS().FLAT_ADDRESS;
}
static void removeModOperands(MachineInstr &MI) {
@@ -2165,20 +2215,24 @@ static int64_t getFoldableImm(const MachineOperand* MO) {
MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
MachineInstr &MI,
LiveVariables *LV) const {
+ unsigned Opc = MI.getOpcode();
bool IsF16 = false;
+ bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64;
- switch (MI.getOpcode()) {
+ switch (Opc) {
default:
return nullptr;
case AMDGPU::V_MAC_F16_e64:
IsF16 = true;
LLVM_FALLTHROUGH;
case AMDGPU::V_MAC_F32_e64:
+ case AMDGPU::V_FMAC_F32_e64:
break;
case AMDGPU::V_MAC_F16_e32:
IsF16 = true;
LLVM_FALLTHROUGH;
- case AMDGPU::V_MAC_F32_e32: {
+ case AMDGPU::V_MAC_F32_e32:
+ case AMDGPU::V_FMAC_F32_e32: {
int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::src0);
const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
@@ -2203,7 +2257,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
- if (!Src0Mods && !Src1Mods && !Clamp && !Omod &&
+ if (!IsFMA && !Src0Mods && !Src1Mods && !Clamp && !Omod &&
// If we have an SGPR input, we will violate the constant bus restriction.
(!Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
if (auto Imm = getFoldableImm(Src2)) {
@@ -2234,8 +2288,10 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
}
}
- return BuildMI(*MBB, MI, MI.getDebugLoc(),
- get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32))
+ assert((!IsFMA || !IsF16) && "fmac only expected with f32");
+ unsigned NewOpc = IsFMA ? AMDGPU::V_FMA_F32 :
+ (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32);
+ return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
.add(*Dst)
.addImm(Src0Mods ? Src0Mods->getImm() : 0)
.add(*Src0)
@@ -2339,6 +2395,15 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
}
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
+ if (isUInt<16>(Imm)) {
+ int16_t Trunc = static_cast<int16_t>(Imm);
+ return ST.has16BitInsts() &&
+ AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
+ }
+ if (!(Imm & 0xffff)) {
+ return ST.has16BitInsts() &&
+ AMDGPU::isInlinableLiteral16(Imm >> 16, ST.hasInv2PiInlineImm());
+ }
uint32_t Trunc = static_cast<uint32_t>(Imm);
return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
}
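
The added OPERAND_REG_INLINE_C_V2INT16/V2FP16 cases roughly mean: a packed 32-bit immediate can use an inline constant when one 16-bit half is zero and the other half is itself an inlinable 16-bit literal; otherwise the check falls through to the existing isInlinableLiteralV216 path. A hedged sketch of just that selection (the 16-bit test below is a simplified stand-in for AMDGPU::isInlinableLiteral16, and the has16BitInsts() subtarget condition is omitted):

    // Sketch of the half-selection logic only; the inlinability predicate is a
    // simplified stand-in (integer inline range -16..64), not the real check.
    #include <cstdint>

    bool toyIsInlinable16(int16_t V) { return V >= -16 && V <= 64; }

    bool toyIsInlinableV2I16(uint32_t Imm) {
      if (Imm <= UINT16_MAX)                       // high half is zero
        return toyIsInlinable16(static_cast<int16_t>(Imm));
      if ((Imm & 0xffff) == 0)                     // low half is zero
        return toyIsInlinable16(static_cast<int16_t>(Imm >> 16));
      return false;  // would fall through to the V216 literal check
    }

    int main() {
      return toyIsInlinableV2I16(0x00400000u) ? 0 : 1;  // high half = 64 -> inline
    }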
@@ -2711,14 +2776,16 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
- // Verify VOP*
- if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI)) {
+ // Verify VOP*. Ignore multiple sgpr operands on writelane.
+ if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
+ && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
// Only look at the true operands. Only a real operand can use the constant
// bus, and we don't want to check pseudo-operands like the source modifier
// flags.
const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
unsigned ConstantBusCount = 0;
+ unsigned LiteralCount = 0;
if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
++ConstantBusCount;
@@ -2738,6 +2805,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
SGPRUsed = MO.getReg();
} else {
++ConstantBusCount;
+ ++LiteralCount;
}
}
}
@@ -2745,6 +2813,11 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
ErrInfo = "VOP* instruction uses the constant bus more than once";
return false;
}
+
+ if (isVOP3(MI) && LiteralCount) {
+ ErrInfo = "VOP3 instruction uses literal";
+ return false;
+ }
}
// Verify misc. restrictions on specific instructions.
@@ -2842,7 +2915,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
- if (isFLAT(MI) && !MF->getSubtarget<SISubtarget>().hasFlatInstOffsets()) {
+ if (isFLAT(MI) && !MF->getSubtarget<GCNSubtarget>().hasFlatInstOffsets()) {
const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
if (Offset->getImm() != 0) {
ErrInfo = "subtarget does not support offsets in flat instructions";
@@ -2850,6 +2923,22 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
+ const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
+ if (DppCt) {
+ using namespace AMDGPU::DPP;
+
+ unsigned DC = DppCt->getImm();
+ if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
+ DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
+ (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
+ (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
+ (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
+ (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST)) {
+ ErrInfo = "Invalid dpp_ctrl value";
+ return false;
+ }
+ }
+
return true;
}
@@ -3147,6 +3236,29 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
legalizeOpWithMove(MI, Src0Idx);
}
+ // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
+ // both the value to write (src0) and lane select (src1). Fix up non-SGPR
+ // src0/src1 with V_READFIRSTLANE.
+ if (Opc == AMDGPU::V_WRITELANE_B32) {
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+ MachineOperand &Src0 = MI.getOperand(Src0Idx);
+ const DebugLoc &DL = MI.getDebugLoc();
+ if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
+ .add(Src0);
+ Src0.ChangeToRegister(Reg, false);
+ }
+ if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ const DebugLoc &DL = MI.getDebugLoc();
+ BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
+ .add(Src1);
+ Src1.ChangeToRegister(Reg, false);
+ }
+ return;
+ }
+
// VOP2 src0 instructions support all operand types, so we don't need to check
// their legality. If src1 is already legal, we don't need to do anything.
if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
@@ -3261,6 +3373,13 @@ unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
unsigned DstReg = MRI.createVirtualRegister(SRC);
unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
+ if (SubRegs == 1) {
+ BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
+ get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+ .addReg(SrcReg);
+ return DstReg;
+ }
+
SmallVector<unsigned, 8> SRegs;
for (unsigned i = 0; i < SubRegs; ++i) {
unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
@@ -3438,6 +3557,14 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
return;
}
+ // Legalize SI_INIT_M0
+ if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
+ MachineOperand &Src = MI.getOperand(0);
+ if (Src.isReg() && RI.hasVGPRs(MRI.getRegClass(Src.getReg())))
+ Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
+ return;
+ }
+
// Legalize MIMG and MUBUF/MTBUF for shaders.
//
// Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
@@ -3539,8 +3666,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
} else {
// This instruction is the _OFFSET variant, so we need to convert it to
// ADDR64.
- assert(MBB.getParent()->getSubtarget<SISubtarget>().getGeneration()
- < SISubtarget::VOLCANIC_ISLANDS &&
+ assert(MBB.getParent()->getSubtarget<GCNSubtarget>().getGeneration()
+ < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
"FIXME: Need to emit flat atomics here");
MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
@@ -3676,37 +3803,37 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
continue;
case AMDGPU::S_LSHL_B32:
- if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
swapOperands(Inst);
}
break;
case AMDGPU::S_ASHR_I32:
- if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
swapOperands(Inst);
}
break;
case AMDGPU::S_LSHR_B32:
- if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
swapOperands(Inst);
}
break;
case AMDGPU::S_LSHL_B64:
- if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_LSHLREV_B64;
swapOperands(Inst);
}
break;
case AMDGPU::S_ASHR_I64:
- if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_ASHRREV_I64;
swapOperands(Inst);
}
break;
case AMDGPU::S_LSHR_B64:
- if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_LSHRREV_B64;
swapOperands(Inst);
}
@@ -3756,39 +3883,49 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
// FIXME: This isn't safe because the addressing mode doesn't work
// correctly if vaddr is negative.
//
- // FIXME: Handle v_add_u32 and VOP3 form. Also don't rely on immediate
- // being in src0.
- //
// FIXME: Should probably be done somewhere else, maybe SIFoldOperands.
//
// See if we can extract an immediate offset by recognizing one of these:
// V_ADD_I32_e32 dst, imm, src1
// V_ADD_I32_e32 dst, (S_MOV_B32 imm), src1
// V_ADD will be removed by "Remove dead machine instructions".
- if (Add && Add->getOpcode() == AMDGPU::V_ADD_I32_e32) {
- const MachineOperand *Src =
- getNamedOperand(*Add, AMDGPU::OpName::src0);
-
- if (Src->isReg()) {
- auto Mov = MRI.getUniqueVRegDef(Src->getReg());
- if (Mov && Mov->getOpcode() == AMDGPU::S_MOV_B32)
- Src = &Mov->getOperand(1);
- }
-
- if (Src) {
- if (Src->isImm())
- Offset = Src->getImm();
- else if (Src->isCImm())
- Offset = Src->getCImm()->getZExtValue();
- }
+ if (Add &&
+ (Add->getOpcode() == AMDGPU::V_ADD_I32_e32 ||
+ Add->getOpcode() == AMDGPU::V_ADD_U32_e64)) {
+ static const unsigned SrcNames[2] = {
+ AMDGPU::OpName::src0,
+ AMDGPU::OpName::src1,
+ };
+
+ // Find a literal offset in one of the source operands.
+ for (int i = 0; i < 2; i++) {
+ const MachineOperand *Src =
+ getNamedOperand(*Add, SrcNames[i]);
+
+ if (Src->isReg()) {
+ auto Mov = MRI.getUniqueVRegDef(Src->getReg());
+ if (Mov && Mov->getOpcode() == AMDGPU::S_MOV_B32)
+ Src = &Mov->getOperand(1);
+ }
+
+ if (Src) {
+ if (Src->isImm())
+ Offset = Src->getImm();
+ else if (Src->isCImm())
+ Offset = Src->getCImm()->getZExtValue();
+ }
+
+ if (Offset && isLegalMUBUFImmOffset(Offset)) {
+ VAddr = getNamedOperand(*Add, SrcNames[!i]);
+ break;
+ }
- if (Offset && isLegalMUBUFImmOffset(Offset))
- VAddr = getNamedOperand(*Add, AMDGPU::OpName::src1);
- else
Offset = 0;
+ }
}
- BuildMI(*MBB, Inst, Inst.getDebugLoc(),
+ MachineInstr *NewInstr =
+ BuildMI(*MBB, Inst, Inst.getDebugLoc(),
get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), VDst)
.add(*VAddr) // vaddr
.add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc
@@ -3797,12 +3934,17 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
.addImm(getNamedOperand(Inst, AMDGPU::OpName::glc)->getImm())
.addImm(0) // slc
.addImm(0) // tfe
- .setMemRefs(Inst.memoperands_begin(), Inst.memoperands_end());
+ .setMemRefs(Inst.memoperands_begin(), Inst.memoperands_end())
+ .getInstr();
MRI.replaceRegWith(getNamedOperand(Inst, AMDGPU::OpName::sdst)->getReg(),
VDst);
addUsersToMoveToVALUWorklist(VDst, MRI, Worklist);
Inst.eraseFromParent();
+
+ // Legalize all operands other than the offset. Notably, convert the srsrc
+ // into SGPRs using v_readfirstlane if needed.
+ legalizeOperands(*NewInstr);
continue;
}
}
@@ -3884,6 +4026,13 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
MRI.clearKillFlags(Inst.getOperand(1).getReg());
Inst.getOperand(0).setReg(DstReg);
+
+ // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
+ // these are deleted later, but at -O0 it would leave a suspicious
+ // looking illegal copy of an undef register.
+ for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
+ Inst.RemoveOperand(I);
+ Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
continue;
}
@@ -3975,17 +4124,23 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
- unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor)
- .add(Src0)
- .add(Src1);
+ unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ if (ST.hasDLInsts()) {
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
+ .add(Src0)
+ .add(Src1);
+ } else {
+ unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor)
+ .add(Src0)
+ .add(Src1);
- unsigned Not = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), Not)
- .addReg(Xor);
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), NewDest)
+ .addReg(Xor);
+ }
- MRI.replaceRegWith(Dest.getReg(), Not);
- addUsersToMoveToVALUWorklist(Not, MRI, Worklist);
+ MRI.replaceRegWith(Dest.getReg(), NewDest);
+ addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
}
void SIInstrInfo::splitScalar64BitUnaryOp(
@@ -4478,12 +4633,12 @@ uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
if (ST.isAmdHsaOS()) {
// Set ATC = 1. GFX9 doesn't have this bit.
- if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS)
+ if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
RsrcDataFormat |= (1ULL << 56);
// Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
// BTW, it disables TC L2 and therefore decreases performance.
- if (ST.getGeneration() == SISubtarget::VOLCANIC_ISLANDS)
+ if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
RsrcDataFormat |= (2ULL << 59);
}
@@ -4496,7 +4651,7 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const {
0xffffffff; // Size;
// GFX9 doesn't have ELEMENT_SIZE.
- if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
}
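For context (not part of this change), the element-size encoding above is plain log2 arithmetic; as a worked example, assuming a maximum private element size of 4 bytes:

  // Log2_32(4) == 2, so the value OR'd into Rsrc23 at
  // RSRC_ELEMENT_SIZE_SHIFT would be 2 - 1 == 1.
  uint64_t EltSizeValue = Log2_32(4) - 1; // == 1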
@@ -4506,7 +4661,7 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const {
// If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
// Clear them unless we want a huge stride.
- if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
return Rsrc23;
@@ -4531,7 +4686,7 @@ unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
return AMDGPU::NoRegister;
assert(!MI.memoperands_empty() &&
- (*MI.memoperands_begin())->getAddrSpace() == AMDGPUASI.PRIVATE_ADDRESS);
+ (*MI.memoperands_begin())->getAddrSpace() == ST.getAMDGPUAS().PRIVATE_ADDRESS);
FrameIndex = Addr->getIndex();
return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
@@ -4598,12 +4753,12 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
if (DescSize != 0 && DescSize != 4)
return DescSize;
+ if (isFixedSize(MI))
+ return DescSize;
+
// 4-byte instructions may have a 32-bit literal encoded after them. Check
// operands that could ever be literals.
if (isVALU(MI) || isSALU(MI)) {
- if (isFixedSize(MI))
- return DescSize;
-
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
if (Src0Idx == -1)
return 4; // No operands.
@@ -4650,7 +4805,7 @@ bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
return true;
for (const MachineMemOperand *MMO : MI.memoperands()) {
- if (MMO->getAddrSpace() == AMDGPUASI.FLAT_ADDRESS)
+ if (MMO->getAddrSpace() == ST.getAMDGPUAS().FLAT_ADDRESS)
return true;
}
return false;
@@ -4817,3 +4972,70 @@ const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) con
llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
}
}
+
+bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
+ if (!isSMRD(MI))
+ return false;
+
+ // Check that it is using a buffer resource.
+ int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
+ if (Idx == -1) // e.g. s_memtime
+ return false;
+
+ const auto RCID = MI.getDesc().OpInfo[Idx].RegClass;
+ return RCID == AMDGPU::SReg_128RegClassID;
+}
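For context (not part of this change), a minimal sketch of how a pass might use this helper, assuming a SIInstrInfo pointer TII and a MachineBasicBlock MBB are in scope:

  // Count SMRD instructions that read through a buffer resource, i.e. whose
  // sbase operand uses the SReg_128 register class.
  unsigned NumBufferSMRDs = 0;
  for (const MachineInstr &MI : MBB)
    if (TII->isBufferSMRD(MI))
      ++NumBufferSMRDs;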
+
+// This must be kept in sync with the SIEncodingFamily definition in SIInstrInfo.td
+enum SIEncodingFamily {
+ SI = 0,
+ VI = 1,
+ SDWA = 2,
+ SDWA9 = 3,
+ GFX80 = 4,
+ GFX9 = 5
+};
+
+static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) {
+ switch (ST.getGeneration()) {
+ default:
+ break;
+ case AMDGPUSubtarget::SOUTHERN_ISLANDS:
+ case AMDGPUSubtarget::SEA_ISLANDS:
+ return SIEncodingFamily::SI;
+ case AMDGPUSubtarget::VOLCANIC_ISLANDS:
+ case AMDGPUSubtarget::GFX9:
+ return SIEncodingFamily::VI;
+ }
+ llvm_unreachable("Unknown subtarget generation!");
+}
+
+int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
+ SIEncodingFamily Gen = subtargetEncodingFamily(ST);
+
+ if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
+ ST.getGeneration() >= AMDGPUSubtarget::GFX9)
+ Gen = SIEncodingFamily::GFX9;
+
+ if (get(Opcode).TSFlags & SIInstrFlags::SDWA)
+ Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9
+ : SIEncodingFamily::SDWA;
+ // Adjust the encoding family to GFX80 for D16 buffer instructions when the
+ // subtarget has the UnpackedD16VMem feature.
+ // TODO: remove this when we discard GFX80 encoding.
+ if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
+ Gen = SIEncodingFamily::GFX80;
+
+ int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
+
+ // -1 means that Opcode is already a native instruction.
+ if (MCOp == -1)
+ return Opcode;
+
+ // (uint16_t)-1 means that Opcode is a pseudo instruction that has
+ // no encoding in the given subtarget generation.
+ if (MCOp == (uint16_t)-1)
+ return -1;
+
+ return MCOp;
+}
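For context (not part of this change), a minimal sketch of how a caller might interpret the return convention above, assuming a SIInstrInfo pointer TII, a MachineInstr MI being lowered, and an MCInst OutMI being built:

  // pseudoToMCOpcode() returns the opcode unchanged for native instructions,
  // the subtarget-specific MC opcode for pseudo instructions, and -1 when the
  // pseudo has no encoding in the current subtarget generation.
  int MCOp = TII->pseudoToMCOpcode(MI.getOpcode());
  if (MCOp == -1)
    report_fatal_error("pseudo instruction has no encoding on this subtarget");
  OutMI.setOpcode(MCOp);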
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index 24ee843e6ade..0a735257d34e 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Interface definition for SIInstrInfo.
+/// Interface definition for SIInstrInfo.
//
//===----------------------------------------------------------------------===//
@@ -31,20 +31,23 @@
#include <cassert>
#include <cstdint>
+#define GET_INSTRINFO_HEADER
+#include "AMDGPUGenInstrInfo.inc"
+
namespace llvm {
class APInt;
class MachineRegisterInfo;
class RegScavenger;
-class SISubtarget;
+class GCNSubtarget;
class TargetRegisterClass;
-class SIInstrInfo final : public AMDGPUInstrInfo {
+class SIInstrInfo final : public AMDGPUGenInstrInfo {
private:
const SIRegisterInfo RI;
- const SISubtarget &ST;
+ const GCNSubtarget &ST;
- // The the inverse predicate should have the negative value.
+ // The inverse predicate should have the negative value.
enum BranchPredicate {
INVALID_BR = 0,
SCC_TRUE = 1,
@@ -144,7 +147,7 @@ public:
MO_REL32_HI = 5
};
- explicit SIInstrInfo(const SISubtarget &ST);
+ explicit SIInstrInfo(const GCNSubtarget &ST);
const SIRegisterInfo &getRegisterInfo() const {
return RI;
@@ -163,7 +166,10 @@ public:
bool shouldClusterMemOps(MachineInstr &FirstLdSt, unsigned BaseReg1,
MachineInstr &SecondLdSt, unsigned BaseReg2,
- unsigned NumLoads) const final;
+ unsigned NumLoads) const override;
+
+ bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0,
+ int64_t Offset1, unsigned NumLoads) const override;
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
@@ -203,7 +209,7 @@ public:
bool expandPostRAPseudo(MachineInstr &MI) const override;
- // \brief Returns an opcode that can be used to move a value to a \p DstRC
+ // Returns an opcode that can be used to move a value to a \p DstRC
// register. If there is no hardware instruction that can store to \p
// DstRC, then AMDGPU::COPY is returned.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const;
@@ -419,18 +425,7 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::SMRD;
}
- bool isBufferSMRD(const MachineInstr &MI) const {
- if (!isSMRD(MI))
- return false;
-
- // Check that it is using a buffer resource.
- int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
- if (Idx == -1) // e.g. s_memtime
- return false;
-
- const auto RCID = MI.getDesc().OpInfo[Idx].RegClass;
- return RCID == AMDGPU::SReg_128RegClassID;
- }
+ bool isBufferSMRD(const MachineInstr &MI) const;
static bool isDS(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::DS;
@@ -674,16 +669,16 @@ public:
bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
const MachineOperand &MO) const;
- /// \brief Return true if this 64-bit VALU instruction has a 32-bit encoding.
+ /// Return true if this 64-bit VALU instruction has a 32-bit encoding.
/// This function will return false if you pass it a 32-bit instruction.
bool hasVALU32BitEncoding(unsigned Opcode) const;
- /// \brief Returns true if this operand uses the constant bus.
+ /// Returns true if this operand uses the constant bus.
bool usesConstantBus(const MachineRegisterInfo &MRI,
const MachineOperand &MO,
const MCOperandInfo &OpInfo) const;
- /// \brief Return true if this instruction has any modifiers.
+ /// Return true if this instruction has any modifiers.
/// e.g. src[012]_mod, omod, clamp.
bool hasModifiers(unsigned Opcode) const;
@@ -696,7 +691,7 @@ public:
unsigned getVALUOp(const MachineInstr &MI) const;
- /// \brief Return the correct register class for \p OpNo. For target-specific
+ /// Return the correct register class for \p OpNo. For target-specific
/// instructions, this will return the register class that has been defined
/// in tablegen. For generic instructions, like REG_SEQUENCE it will return
/// the register class of its machine operand.
@@ -704,7 +699,7 @@ public:
const TargetRegisterClass *getOpRegClass(const MachineInstr &MI,
unsigned OpNo) const;
- /// \brief Return the size in bytes of the operand OpNo on the given
+ /// Return the size in bytes of the operand OpNo on the given
// instruction opcode.
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const {
const MCOperandInfo &OpInfo = get(Opcode).OpInfo[OpNo];
@@ -718,7 +713,7 @@ public:
return RI.getRegSizeInBits(*RI.getRegClass(OpInfo.RegClass)) / 8;
}
- /// \brief This form should usually be preferred since it handles operands
+ /// This form should usually be preferred since it handles operands
/// with unknown register classes.
unsigned getOpSize(const MachineInstr &MI, unsigned OpNo) const {
return RI.getRegSizeInBits(*getOpRegClass(MI, OpNo)) / 8;
@@ -728,7 +723,7 @@ public:
/// to read a VGPR.
bool canReadVGPR(const MachineInstr &MI, unsigned OpNo) const;
- /// \brief Legalize the \p OpIndex operand of this instruction by inserting
+ /// Legalize the \p OpIndex operand of this instruction by inserting
/// a MOV. For example:
/// ADD_I32_e32 VGPR0, 15
/// to
@@ -739,29 +734,29 @@ public:
/// instead of MOV.
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const;
- /// \brief Check if \p MO is a legal operand if it was the \p OpIdx Operand
+ /// Check if \p MO is a legal operand if it was the \p OpIdx Operand
/// for \p MI.
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
const MachineOperand *MO = nullptr) const;
- /// \brief Check if \p MO would be a valid operand for the given operand
+ /// Check if \p MO would be a valid operand for the given operand
/// definition \p OpInfo. Note this does not attempt to validate constant bus
/// restrictions (e.g. literal constant usage).
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI,
const MCOperandInfo &OpInfo,
const MachineOperand &MO) const;
- /// \brief Check if \p MO (a register operand) is a legal register for the
+ /// Check if \p MO (a register operand) is a legal register for the
/// given operand description.
bool isLegalRegOperand(const MachineRegisterInfo &MRI,
const MCOperandInfo &OpInfo,
const MachineOperand &MO) const;
- /// \brief Legalize operands in \p MI by either commuting it or inserting a
+ /// Legalize operands in \p MI by either commuting it or inserting a
/// copy of src1.
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const;
- /// \brief Fix operands in \p MI to satisfy constant bus requirements.
+ /// Fix operands in \p MI to satisfy constant bus requirements.
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const;
/// Copy a value from a VGPR (\p SrcReg) to SGPR. This function can only
@@ -779,11 +774,11 @@ public:
MachineOperand &Op, MachineRegisterInfo &MRI,
const DebugLoc &DL) const;
- /// \brief Legalize all operands in this instruction. This function may
+ /// Legalize all operands in this instruction. This function may
/// create new instructions and insert them before \p MI.
void legalizeOperands(MachineInstr &MI) const;
- /// \brief Replace this instruction's opcode with the equivalent VALU
+ /// Replace this instruction's opcode with the equivalent VALU
/// opcode. This function will also move the users of \p MI to the
/// VALU if necessary.
void moveToVALU(MachineInstr &MI) const;
@@ -795,11 +790,11 @@ public:
MachineBasicBlock::iterator MI) const override;
void insertReturn(MachineBasicBlock &MBB) const;
- /// \brief Return the number of wait states that result from executing this
+ /// Return the number of wait states that result from executing this
/// instruction.
unsigned getNumWaitStates(const MachineInstr &MI) const;
- /// \brief Returns the operand named \p Op. If \p MI does not have an
+ /// Returns the operand named \p Op. If \p MI does not have an
/// operand named \c Op, this function returns nullptr.
LLVM_READONLY
MachineOperand *getNamedOperand(MachineInstr &MI, unsigned OperandName) const;
@@ -822,7 +817,7 @@ public:
bool isLowLatencyInstruction(const MachineInstr &MI) const;
bool isHighLatencyInstruction(const MachineInstr &MI) const;
- /// \brief Return the descriptor of the target-specific machine instruction
+ /// Return the descriptor of the target-specific machine instruction
/// that corresponds to the specified pseudo or native opcode.
const MCInstrDesc &getMCOpcodeFromPseudo(unsigned Opcode) const {
return get(pseudoToMCOpcode(Opcode));
@@ -867,7 +862,7 @@ public:
bool isBasicBlockPrologue(const MachineInstr &MI) const override;
- /// \brief Return a partially built integer add instruction without carry.
+ /// Return a partially built integer add instruction without carry.
/// Caller must add source operands.
/// For pre-GFX9 it will generate unused carry destination operand.
/// TODO: After GFX9 it should return a no-carry operation.
@@ -882,6 +877,12 @@ public:
static bool isLegalMUBUFImmOffset(unsigned Imm) {
return isUInt<12>(Imm);
}
+
+ /// Return a target-specific opcode if Opcode is a pseudo instruction.
+ /// Return -1 if the target-specific opcode for the pseudo instruction does
+ /// not exist. If Opcode is not a pseudo instruction, it is returned unchanged.
+ int pseudoToMCOpcode(int Opcode) const;
+
};
namespace AMDGPU {
@@ -908,6 +909,9 @@ namespace AMDGPU {
int getAddr64Inst(uint16_t Opcode);
LLVM_READONLY
+ int getMUBUFNoLdsInst(uint16_t Opcode);
+
+ LLVM_READONLY
int getAtomicRetOp(uint16_t Opcode);
LLVM_READONLY
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index fc2d35d873aa..8fa37aa83dae 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -7,16 +7,21 @@
//
//===----------------------------------------------------------------------===//
def isCI : Predicate<"Subtarget->getGeneration() "
- ">= SISubtarget::SEA_ISLANDS">;
+ ">= AMDGPUSubtarget::SEA_ISLANDS">;
def isCIOnly : Predicate<"Subtarget->getGeneration() =="
- "SISubtarget::SEA_ISLANDS">,
+ "AMDGPUSubtarget::SEA_ISLANDS">,
AssemblerPredicate <"FeatureSeaIslands">;
def isVIOnly : Predicate<"Subtarget->getGeneration() =="
- "SISubtarget::VOLCANIC_ISLANDS">,
+ "AMDGPUSubtarget::VOLCANIC_ISLANDS">,
AssemblerPredicate <"FeatureVolcanicIslands">;
def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">;
+class GCNPredicateControl : PredicateControl {
+ Predicate SIAssemblerPredicate = isSICI;
+ Predicate VIAssemblerPredicate = isVI;
+}
+
// Except for the NONE field, this must be kept in sync with the
// SIEncodingFamily enum in AMDGPUInstrInfo.cpp
def SIEncodingFamily {
@@ -25,13 +30,16 @@ def SIEncodingFamily {
int VI = 1;
int SDWA = 2;
int SDWA9 = 3;
- int GFX9 = 4;
+ int GFX80 = 4;
+ int GFX9 = 5;
}
//===----------------------------------------------------------------------===//
// SI DAG Nodes
//===----------------------------------------------------------------------===//
+def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>;
+
def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT",
SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i32>]>,
[SDNPMayLoad, SDNPMemOperand]
@@ -45,22 +53,41 @@ def SIatomic_dec : SDNode<"AMDGPUISD::ATOMIC_DEC", SDTAtomic2,
[SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
>;
-def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT",
- SDTypeProfile<1, 9,
- [ // vdata
- SDTCisVT<1, v4i32>, // rsrc
- SDTCisVT<2, i32>, // vindex(VGPR)
- SDTCisVT<3, i32>, // voffset(VGPR)
- SDTCisVT<4, i32>, // soffset(SGPR)
- SDTCisVT<5, i32>, // offset(imm)
- SDTCisVT<6, i32>, // dfmt(imm)
- SDTCisVT<7, i32>, // nfmt(imm)
- SDTCisVT<8, i32>, // glc(imm)
- SDTCisVT<9, i32> // slc(imm)
- ]>,
- [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
+def SDTAtomic2_f32 : SDTypeProfile<1, 2, [
+ SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1>
+]>;
+
+def SIatomic_fadd : SDNode<"AMDGPUISD::ATOMIC_LOAD_FADD", SDTAtomic2_f32,
+ [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIatomic_fmin : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMIN", SDTAtomic2_f32,
+ [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
>;
+def SIatomic_fmax : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMAX", SDTAtomic2_f32,
+ [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SDTbuffer_load : SDTypeProfile<1, 9,
+ [ // vdata
+ SDTCisVT<1, v4i32>, // rsrc
+ SDTCisVT<2, i32>, // vindex(VGPR)
+ SDTCisVT<3, i32>, // voffset(VGPR)
+ SDTCisVT<4, i32>, // soffset(SGPR)
+ SDTCisVT<5, i32>, // offset(imm)
+ SDTCisVT<6, i32>, // dfmt(imm)
+ SDTCisVT<7, i32>, // nfmt(imm)
+ SDTCisVT<8, i32>, // glc(imm)
+ SDTCisVT<9, i32> // slc(imm)
+ ]>;
+
+def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT", SDTbuffer_load,
+ [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>;
+def SItbuffer_load_d16 : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT_D16",
+ SDTbuffer_load,
+ [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>;
+
def SDTtbuffer_store : SDTypeProfile<0, 10,
[ // vdata
SDTCisVT<1, v4i32>, // rsrc
@@ -79,6 +106,9 @@ def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", SDTtbuffer_store
def SItbuffer_store_x3 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_X3",
SDTtbuffer_store,
[SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
+def SItbuffer_store_d16 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_D16",
+ SDTtbuffer_store,
+ [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
def SDTBufferLoad : SDTypeProfile<1, 5,
[ // vdata
@@ -92,6 +122,9 @@ def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad,
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad,
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_format_d16 : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT_D16",
+ SDTBufferLoad,
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
def SDTBufferStore : SDTypeProfile<0, 6,
[ // vdata
@@ -102,9 +135,13 @@ def SDTBufferStore : SDTypeProfile<0, 6,
SDTCisVT<5, i1>]>; // slc
def SIbuffer_store : SDNode <"AMDGPUISD::BUFFER_STORE", SDTBufferStore,
- [SDNPMemOperand, SDNPHasChain, SDNPMayStore]>;
-def SIbuffer_store_format : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT", SDTBufferStore,
- [SDNPMemOperand, SDNPHasChain, SDNPMayStore]>;
+ [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
+def SIbuffer_store_format : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT",
+ SDTBufferStore,
+ [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
+def SIbuffer_store_format_d16 : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT_D16",
+ SDTBufferStore,
+ [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
class SDBufferAtomic<string opcode> : SDNode <opcode,
SDTypeProfile<1, 5,
@@ -140,21 +177,41 @@ def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP",
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
>;
-class SDSample<string opcode> : SDNode <opcode,
- SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v8i32>,
- SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]>
->;
-
-def SIsample : SDSample<"AMDGPUISD::SAMPLE">;
-def SIsampleb : SDSample<"AMDGPUISD::SAMPLEB">;
-def SIsampled : SDSample<"AMDGPUISD::SAMPLED">;
-def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">;
-
def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET",
SDTypeProfile<1, 2, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>
>;
//===----------------------------------------------------------------------===//
+// ValueType helpers
+//===----------------------------------------------------------------------===//
+
+// Returns 1 if the source arguments have modifiers, 0 if they do not.
+// XXX - do f16 instructions?
+class isFloatType<ValueType SrcVT> {
+ bit ret =
+ !if(!eq(SrcVT.Value, f16.Value), 1,
+ !if(!eq(SrcVT.Value, f32.Value), 1,
+ !if(!eq(SrcVT.Value, f64.Value), 1,
+ !if(!eq(SrcVT.Value, v2f16.Value), 1,
+ 0))));
+}
+
+class isIntType<ValueType SrcVT> {
+ bit ret =
+ !if(!eq(SrcVT.Value, i16.Value), 1,
+ !if(!eq(SrcVT.Value, i32.Value), 1,
+ !if(!eq(SrcVT.Value, i64.Value), 1,
+ 0)));
+}
+
+class isPackedType<ValueType SrcVT> {
+ bit ret =
+ !if(!eq(SrcVT.Value, v2i16.Value), 1,
+ !if(!eq(SrcVT.Value, v2f16.Value), 1, 0)
+ );
+}
+
+//===----------------------------------------------------------------------===//
// PatFrags for global memory operations
//===----------------------------------------------------------------------===//
@@ -163,6 +220,9 @@ defm atomic_dec_global : global_binary_atomic_op<SIatomic_dec>;
def atomic_inc_local : local_binary_atomic_op<SIatomic_inc>;
def atomic_dec_local : local_binary_atomic_op<SIatomic_dec>;
+def atomic_load_fadd_local : local_binary_atomic_op<SIatomic_fadd>;
+def atomic_load_fmin_local : local_binary_atomic_op<SIatomic_fmin>;
+def atomic_load_fmax_local : local_binary_atomic_op<SIatomic_fmax>;
//===----------------------------------------------------------------------===//
// SDNodes PatFrags for loads/stores with a glue input.
@@ -178,6 +238,10 @@ def AMDGPUld_glue : SDNode <"ISD::LOAD", SDTLoad,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
>;
+def AMDGPUatomic_ld_glue : SDNode <"ISD::ATOMIC_LOAD", SDTAtomicLoad,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
+>;
+
def unindexedload_glue : PatFrag <(ops node:$ptr), (AMDGPUld_glue node:$ptr), [{
return cast<LoadSDNode>(N)->getAddressingMode() == ISD::UNINDEXED;
}]>;
@@ -186,6 +250,18 @@ def load_glue : PatFrag <(ops node:$ptr), (unindexedload_glue node:$ptr), [{
return cast<LoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD;
}]>;
+def atomic_load_32_glue : PatFrag<(ops node:$ptr),
+ (AMDGPUatomic_ld_glue node:$ptr)> {
+ let IsAtomic = 1;
+ let MemoryVT = i32;
+}
+
+def atomic_load_64_glue : PatFrag<(ops node:$ptr),
+ (AMDGPUatomic_ld_glue node:$ptr)> {
+ let IsAtomic = 1;
+ let MemoryVT = i64;
+}
+
def extload_glue : PatFrag<(ops node:$ptr), (load_glue node:$ptr), [{
return cast<LoadSDNode>(N)->getExtensionType() == ISD::EXTLOAD;
}]>;
@@ -219,6 +295,9 @@ def sextloadi16_glue : PatFrag<(ops node:$ptr), (sextload_glue node:$ptr), [{
def load_glue_align8 : Aligned8Bytes <
(ops node:$ptr), (load_glue node:$ptr)
>;
+def load_glue_align16 : Aligned16Bytes <
+ (ops node:$ptr), (load_glue node:$ptr)
+>;
def load_local_m0 : LoadFrag<load_glue>, LocalAddress;
@@ -227,12 +306,23 @@ def sextloadi16_local_m0 : LoadFrag<sextloadi16_glue>, LocalAddress;
def az_extloadi8_local_m0 : LoadFrag<az_extloadi8_glue>, LocalAddress;
def az_extloadi16_local_m0 : LoadFrag<az_extloadi16_glue>, LocalAddress;
def load_align8_local_m0 : LoadFrag <load_glue_align8>, LocalAddress;
+def load_align16_local_m0 : LoadFrag <load_glue_align16>, LocalAddress;
+def atomic_load_32_local_m0 : LoadFrag<atomic_load_32_glue>, LocalAddress;
+def atomic_load_64_local_m0 : LoadFrag<atomic_load_64_glue>, LocalAddress;
def AMDGPUst_glue : SDNode <"ISD::STORE", SDTStore,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue]
>;
+def AMDGPUatomic_st_glue : SDNode <"ISD::ATOMIC_STORE", SDTAtomicStore,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue]
+>;
+
+def atomic_store_glue : PatFrag<(ops node:$ptr, node:$val),
+ (AMDGPUatomic_st_glue node:$ptr, node:$val)> {
+}
+
def unindexedstore_glue : PatFrag<(ops node:$val, node:$ptr),
(AMDGPUst_glue node:$val, node:$ptr), [{
return cast<StoreSDNode>(N)->getAddressingMode() == ISD::UNINDEXED;
@@ -262,11 +352,17 @@ def store_glue_align8 : Aligned8Bytes <
(ops node:$value, node:$ptr), (store_glue node:$value, node:$ptr)
>;
+def store_glue_align16 : Aligned16Bytes <
+ (ops node:$value, node:$ptr), (store_glue node:$value, node:$ptr)
+>;
+
def store_local_m0 : StoreFrag<store_glue>, LocalAddress;
def truncstorei8_local_m0 : StoreFrag<truncstorei8_glue>, LocalAddress;
def truncstorei16_local_m0 : StoreFrag<truncstorei16_glue>, LocalAddress;
+def atomic_store_local_m0 : StoreFrag<AMDGPUatomic_st_glue>, LocalAddress;
def store_align8_local_m0 : StoreFrag<store_glue_align8>, LocalAddress;
+def store_align16_local_m0 : StoreFrag<store_glue_align16>, LocalAddress;
def si_setcc_uniform : PatFrag <
(ops node:$lhs, node:$rhs, node:$cond),
@@ -297,10 +393,11 @@ def lshl_rev : PatFrag <
(shl $src0, $src1)
>;
-multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0> {
+multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0,
+ SDTypeProfile tc = SDTAtomic2> {
def _glue : SDNode <
- !if(is_amdgpu, "AMDGPUISD", "ISD")#"::ATOMIC_"#op_name, SDTAtomic2,
+ !if(is_amdgpu, "AMDGPUISD", "ISD")#"::ATOMIC_"#op_name, tc,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
>;
@@ -319,6 +416,9 @@ defm atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">;
defm atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">;
defm atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">;
defm atomic_swap : SIAtomicM0Glue2 <"SWAP">;
+defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 1, SDTAtomic2_f32>;
+defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32>;
+defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32>;
def atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
@@ -368,6 +468,12 @@ return CurDAG->getTargetConstant(
N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64);
}]>;
+class bitextract_imm<int bitnum> : SDNodeXForm<imm, [{
+ uint64_t Imm = N->getZExtValue();
+ unsigned Bit = (Imm >> }] # bitnum # [{ ) & 1;
+ return CurDAG->getTargetConstant(Bit, SDLoc(N), MVT::i1);
+}]>;
+
def SIMM16bit : PatLeaf <(imm),
[{return isInt<16>(N->getSExtValue());}]
>;
@@ -381,7 +487,7 @@ class InlineFPImm <ValueType vt> : PatLeaf <(vt fpimm), [{
}]>;
class VGPRImm <dag frag> : PatLeaf<frag, [{
- if (Subtarget->getGeneration() < SISubtarget::SOUTHERN_ISLANDS) {
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) {
return false;
}
const SIRegisterInfo *SIRI =
@@ -552,19 +658,18 @@ def ExpSrc3 : RegisterOperand<VGPR_32> {
let ParserMatchClass = VReg32OrOffClass;
}
-class SDWASrc : RegisterOperand<VS_32> {
+class SDWASrc<ValueType vt> : RegisterOperand<VS_32> {
let OperandNamespace = "AMDGPU";
- let OperandType = "OPERAND_SDWA_SRC";
+ string Type = !if(isFloatType<vt>.ret, "FP", "INT");
+ let OperandType = "OPERAND_REG_INLINE_C_"#Type#vt.Size;
+ let DecoderMethod = "decodeSDWASrc"#vt.Size;
let EncoderMethod = "getSDWASrcEncoding";
}
-def SDWASrc32 : SDWASrc {
- let DecoderMethod = "decodeSDWASrc32";
-}
-
-def SDWASrc16 : SDWASrc {
- let DecoderMethod = "decodeSDWASrc16";
-}
+def SDWASrc_i32 : SDWASrc<i32>;
+def SDWASrc_i16 : SDWASrc<i16>;
+def SDWASrc_f32 : SDWASrc<f32>;
+def SDWASrc_f16 : SDWASrc<f16>;
def SDWAVopcDst : VOPDstOperand<SReg_64> {
let OperandNamespace = "AMDGPU";
@@ -637,19 +742,20 @@ def clampmod : NamedOperandBit<"ClampSI", NamedMatchClass<"ClampSI">>;
def highmod : NamedOperandBit<"High", NamedMatchClass<"High">>;
def GLC : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>;
-def slc : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>;
-def tfe : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>;
-def unorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>;
-def da : NamedOperandBit<"DA", NamedMatchClass<"DA">>;
-def r128 : NamedOperandBit<"R128", NamedMatchClass<"R128">>;
-def lwe : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>;
+def SLC : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>;
+def TFE : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>;
+def UNorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>;
+def DA : NamedOperandBit<"DA", NamedMatchClass<"DA">>;
+def R128 : NamedOperandBit<"R128", NamedMatchClass<"R128">>;
+def D16 : NamedOperandBit<"D16", NamedMatchClass<"D16">>;
+def LWE : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>;
def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>;
def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>;
def DFMT : NamedOperandU8<"DFMT", NamedMatchClass<"DFMT">>;
def NFMT : NamedOperandU8<"NFMT", NamedMatchClass<"NFMT">>;
-def dmask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>;
+def DMask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>;
def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>;
def row_mask : NamedOperandU32<"RowMask", NamedMatchClass<"RowMask">>;
@@ -747,16 +853,23 @@ class OpSelModsMatchClass : AsmOperandClass {
def IntOpSelModsMatchClass : OpSelModsMatchClass;
def IntOpSelMods : InputMods<IntOpSelModsMatchClass>;
-def FPRegSDWAInputModsMatchClass : AsmOperandClass {
- let Name = "SDWARegWithFPInputMods";
- let ParserMethod = "parseRegWithFPInputMods";
- let PredicateMethod = "isSDWARegKind";
+class FPSDWAInputModsMatchClass <int opSize> : AsmOperandClass {
+ let Name = "SDWAWithFP"#opSize#"InputMods";
+ let ParserMethod = "parseRegOrImmWithFPInputMods";
+ let PredicateMethod = "isSDWAFP"#opSize#"Operand";
}
-def FPRegSDWAInputMods : InputMods <FPRegSDWAInputModsMatchClass> {
+def FP16SDWAInputModsMatchClass : FPSDWAInputModsMatchClass<16>;
+def FP32SDWAInputModsMatchClass : FPSDWAInputModsMatchClass<32>;
+
+class FPSDWAInputMods <FPSDWAInputModsMatchClass matchClass> :
+ InputMods <matchClass> {
let PrintMethod = "printOperandAndFPInputMods";
}
+def FP16SDWAInputMods : FPSDWAInputMods<FP16SDWAInputModsMatchClass>;
+def FP32SDWAInputMods : FPSDWAInputMods<FP32SDWAInputModsMatchClass>;
+
def FPVRegInputModsMatchClass : AsmOperandClass {
let Name = "VRegWithFPInputMods";
let ParserMethod = "parseRegWithFPInputMods";
@@ -767,17 +880,23 @@ def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> {
let PrintMethod = "printOperandAndFPInputMods";
}
-
-def IntRegSDWAInputModsMatchClass : AsmOperandClass {
- let Name = "SDWARegWithIntInputMods";
- let ParserMethod = "parseRegWithIntInputMods";
- let PredicateMethod = "isSDWARegKind";
+class IntSDWAInputModsMatchClass <int opSize> : AsmOperandClass {
+ let Name = "SDWAWithInt"#opSize#"InputMods";
+ let ParserMethod = "parseRegOrImmWithIntInputMods";
+ let PredicateMethod = "isSDWAInt"#opSize#"Operand";
}
-def IntRegSDWAInputMods : InputMods <IntRegSDWAInputModsMatchClass> {
+def Int16SDWAInputModsMatchClass : IntSDWAInputModsMatchClass<16>;
+def Int32SDWAInputModsMatchClass : IntSDWAInputModsMatchClass<32>;
+
+class IntSDWAInputMods <IntSDWAInputModsMatchClass matchClass> :
+ InputMods <matchClass> {
let PrintMethod = "printOperandAndIntInputMods";
}
+def Int16SDWAInputMods : IntSDWAInputMods<Int16SDWAInputModsMatchClass>;
+def Int32SDWAInputMods : IntSDWAInputMods<Int32SDWAInputModsMatchClass>;
+
def IntVRegInputModsMatchClass : AsmOperandClass {
let Name = "VRegWithIntInputMods";
let ParserMethod = "parseRegWithIntInputMods";
@@ -1023,7 +1142,12 @@ class getVregSrcForVT<ValueType VT> {
}
class getSDWASrcForVT <ValueType VT> {
- RegisterOperand ret = !if(!eq(VT.Size, 16), SDWASrc16, SDWASrc32);
+ bit isFP = !if(!eq(VT.Value, f16.Value), 1,
+ !if(!eq(VT.Value, f32.Value), 1,
+ 0));
+ RegisterOperand retFlt = !if(!eq(VT.Size, 16), SDWASrc_f16, SDWASrc_f32);
+ RegisterOperand retInt = !if(!eq(VT.Size, 16), SDWASrc_i16, SDWASrc_i32);
+ RegisterOperand ret = !if(isFP, retFlt, retInt);
}
// Returns the register class to use for sources of VOP3 instructions for the
@@ -1064,32 +1188,6 @@ class getVOP3SrcForVT<ValueType VT> {
);
}
-// Returns 1 if the source arguments have modifiers, 0 if they do not.
-// XXX - do f16 instructions?
-class isFloatType<ValueType SrcVT> {
- bit ret =
- !if(!eq(SrcVT.Value, f16.Value), 1,
- !if(!eq(SrcVT.Value, f32.Value), 1,
- !if(!eq(SrcVT.Value, f64.Value), 1,
- !if(!eq(SrcVT.Value, v2f16.Value), 1,
- 0))));
-}
-
-class isIntType<ValueType SrcVT> {
- bit ret =
- !if(!eq(SrcVT.Value, i16.Value), 1,
- !if(!eq(SrcVT.Value, i32.Value), 1,
- !if(!eq(SrcVT.Value, i64.Value), 1,
- 0)));
-}
-
-class isPackedType<ValueType SrcVT> {
- bit ret =
- !if(!eq(SrcVT.Value, v2i16.Value), 1,
- !if(!eq(SrcVT.Value, v2f16.Value), 1, 0)
- );
-}
-
// Float or packed int
class isModifierType<ValueType SrcVT> {
bit ret =
@@ -1134,11 +1232,10 @@ class getSrcModExt <ValueType VT> {
// Return the type of the input modifiers operand for the specified input operand of SDWA
class getSrcModSDWA <ValueType VT> {
- bit isFP = !if(!eq(VT.Value, f16.Value), 1,
- !if(!eq(VT.Value, f32.Value), 1,
- !if(!eq(VT.Value, f64.Value), 1,
- 0)));
- Operand ret = !if(isFP, FPRegSDWAInputMods, IntRegSDWAInputMods);
+ Operand ret = !if(!eq(VT.Value, f16.Value), FP16SDWAInputMods,
+ !if(!eq(VT.Value, f32.Value), FP32SDWAInputMods,
+ !if(!eq(VT.Value, i16.Value), Int16SDWAInputMods,
+ Int32SDWAInputMods)));
}
// Returns the input arguments for VOP[12C] instructions for the given SrcVT.
@@ -1733,6 +1830,9 @@ def VOP_I32_F32_I32_I32 : VOPProfile <[i32, f32, i32, i32]>;
def VOP_I64_I64_I32_I64 : VOPProfile <[i64, i64, i32, i64]>;
def VOP_V4I32_I64_I32_V4I32 : VOPProfile <[v4i32, i64, i32, v4i32]>;
+def VOP_F32_V2F16_V2F16_F32 : VOPProfile <[f32, v2f16, v2f16, f32]>;
+def VOP_I32_V2I16_V2I16_I32 : VOPProfile <[i32, v2i16, v2i16, i32]>;
+
class Commutable_REV <string revOp, bit isOrig> {
string RevOp = revOp;
bit IsOrig = isOrig;
@@ -1747,6 +1847,8 @@ class AtomicNoRet <string noRetOp, bit isRet> {
// Interpolation opcodes
//===----------------------------------------------------------------------===//
+class VINTRPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVINTRPDst">;
+
class VINTRP_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
VINTRPCommon <outs, ins, "", pattern>,
SIMCInstr<opName, SIEncodingFamily.NONE> {
@@ -1823,38 +1925,6 @@ def getBasicFromSDWAOp : InstrMapping {
let ValueCols = [["Default"]];
}
-def getMaskedMIMGOp1 : InstrMapping {
- let FilterClass = "MIMG_Mask";
- let RowFields = ["Op"];
- let ColFields = ["Channels"];
- let KeyCol = ["1"];
- let ValueCols = [["2"], ["3"], ["4"] ];
-}
-
-def getMaskedMIMGOp2 : InstrMapping {
- let FilterClass = "MIMG_Mask";
- let RowFields = ["Op"];
- let ColFields = ["Channels"];
- let KeyCol = ["2"];
- let ValueCols = [["1"], ["3"], ["4"] ];
-}
-
-def getMaskedMIMGOp3 : InstrMapping {
- let FilterClass = "MIMG_Mask";
- let RowFields = ["Op"];
- let ColFields = ["Channels"];
- let KeyCol = ["3"];
- let ValueCols = [["1"], ["2"], ["4"] ];
-}
-
-def getMaskedMIMGOp4 : InstrMapping {
- let FilterClass = "MIMG_Mask";
- let RowFields = ["Op"];
- let ColFields = ["Channels"];
- let KeyCol = ["4"];
- let ValueCols = [["1"], ["2"], ["3"] ];
-}
-
// Maps a commuted opcode to its original version
def getCommuteOrig : InstrMapping {
let FilterClass = "Commutable_REV";
@@ -1882,6 +1952,11 @@ def getMCOpcodeGen : InstrMapping {
[!cast<string>(SIEncodingFamily.VI)],
[!cast<string>(SIEncodingFamily.SDWA)],
[!cast<string>(SIEncodingFamily.SDWA9)],
+ // GFX80 encoding is added to work around a multiple matching
+ // issue for buffer instructions with unpacked d16 data. This
+ // does not actually change the encoding, and thus may be
+ // removed later.
+ [!cast<string>(SIEncodingFamily.GFX80)],
[!cast<string>(SIEncodingFamily.GFX9)]];
}
@@ -1902,6 +1977,14 @@ def getAddr64Inst : InstrMapping {
let ValueCols = [["1"]];
}
+def getMUBUFNoLdsInst : InstrMapping {
+ let FilterClass = "MUBUFLdsTable";
+ let RowFields = ["OpName"];
+ let ColFields = ["IsLds"];
+ let KeyCol = ["1"];
+ let ValueCols = [["0"]];
+}
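For context (not part of this change), a minimal sketch of how the C++ lookup generated from this mapping might be used, assuming a SIInstrInfo pointer TII, a MachineInstr MI, and the usual TableGen convention of returning -1 when no row matches:

  // Given the opcode of an LDS-variant MUBUF instruction, switch the
  // instruction over to the equivalent non-LDS opcode.
  int NoLdsOpc = AMDGPU::getMUBUFNoLdsInst(MI.getOpcode());
  if (NoLdsOpc != -1)
    MI.setDesc(TII->get(NoLdsOpc));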
+
// Maps an atomic opcode to its version with a return value.
def getAtomicRetOp : InstrMapping {
let FilterClass = "AtomicNoRet";
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 9740a18b7248..c3f8bfb53ef4 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -11,18 +11,10 @@
// that are not yet supported remain commented out.
//===----------------------------------------------------------------------===//
-def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">;
-def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">;
-def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">,
- AssemblerPredicate<"FeatureVGPRIndexMode">;
-def HasMovrel : Predicate<"Subtarget->hasMovrel()">,
- AssemblerPredicate<"FeatureMovrel">;
-
-class GCNPat<dag pattern, dag result> : AMDGPUPat<pattern, result> {
+class GCNPat<dag pattern, dag result> : Pat<pattern, result>, GCNPredicateControl {
let SubtargetPredicate = isGCN;
}
-
include "VOPInstructions.td"
include "SOPInstructions.td"
include "SMInstructions.td"
@@ -40,15 +32,18 @@ defm EXP_DONE : EXP_m<1, AMDGPUexport_done>;
// VINTRP Instructions
//===----------------------------------------------------------------------===//
+// Used to inject printing of "_e32" suffix for VI (there are "_e64" variants for VI)
+def VINTRPDst : VINTRPDstOperand <VGPR_32>;
+
let Uses = [M0, EXEC] in {
// FIXME: Specify SchedRW for VINTRP instructions.
multiclass V_INTERP_P1_F32_m : VINTRP_m <
0x00000000,
- (outs VGPR_32:$vdst),
+ (outs VINTRPDst:$vdst),
(ins VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
- "v_interp_p1_f32 $vdst, $vsrc, $attr$attrchan",
+ "v_interp_p1_f32$vdst, $vsrc, $attr$attrchan",
[(set f32:$vdst, (AMDGPUinterp_p1 f32:$vsrc, (i32 imm:$attrchan),
(i32 imm:$attr)))]
>;
@@ -69,9 +64,9 @@ let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in {
defm V_INTERP_P2_F32 : VINTRP_m <
0x00000001,
- (outs VGPR_32:$vdst),
+ (outs VINTRPDst:$vdst),
(ins VGPR_32:$src0, VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
- "v_interp_p2_f32 $vdst, $vsrc, $attr$attrchan",
+ "v_interp_p2_f32$vdst, $vsrc, $attr$attrchan",
[(set f32:$vdst, (AMDGPUinterp_p2 f32:$src0, f32:$vsrc, (i32 imm:$attrchan),
(i32 imm:$attr)))]>;
@@ -79,9 +74,9 @@ defm V_INTERP_P2_F32 : VINTRP_m <
defm V_INTERP_MOV_F32 : VINTRP_m <
0x00000002,
- (outs VGPR_32:$vdst),
+ (outs VINTRPDst:$vdst),
(ins InterpSlot:$vsrc, Attr:$attr, AttrChan:$attrchan),
- "v_interp_mov_f32 $vdst, $vsrc, $attr$attrchan",
+ "v_interp_mov_f32$vdst, $vsrc, $attr$attrchan",
[(set f32:$vdst, (AMDGPUinterp_mov (i32 imm:$vsrc), (i32 imm:$attrchan),
(i32 imm:$attr)))]>;
@@ -186,6 +181,7 @@ def S_XOR_B64_term : PseudoInstSI<(outs SReg_64:$dst),
let SALU = 1;
let isAsCheapAsAMove = 1;
let isTerminator = 1;
+ let Defs = [SCC];
}
def S_ANDN2_B64_term : PseudoInstSI<(outs SReg_64:$dst),
@@ -246,7 +242,6 @@ def SI_IF: CFPseudoInstSI <
def SI_ELSE : CFPseudoInstSI <
(outs SReg_64:$dst),
(ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
- let Constraints = "$src = $dst";
let Size = 12;
let hasSideEffects = 1;
}
@@ -296,14 +291,21 @@ def SI_ELSE_BREAK : CFPseudoInstSI <
let isReMaterializable = 1;
}
-let Uses = [EXEC], Defs = [EXEC,VCC] in {
+let Uses = [EXEC] in {
multiclass PseudoInstKill <dag ins> {
+ // Even though this pseudo can usually be expanded without an SCC def, we
+ // conservatively assume that it has an SCC def, both because it is sometimes
+ // required in degenerate cases (when V_CMPX cannot be used due to constant
+ // bus limitations) and because it allows us to avoid having to track SCC
+ // liveness across basic blocks.
+ let Defs = [EXEC,VCC,SCC] in
def _PSEUDO : PseudoInstSI <(outs), ins> {
let isConvergent = 1;
let usesCustomInserter = 1;
}
+ let Defs = [EXEC,VCC,SCC] in
def _TERMINATOR : SPseudoInstSI <(outs), ins> {
let isTerminator = 1;
}
@@ -312,6 +314,7 @@ multiclass PseudoInstKill <dag ins> {
defm SI_KILL_I1 : PseudoInstKill <(ins SSrc_b64:$src, i1imm:$killvalue)>;
defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;
+let Defs = [EXEC,VCC] in
def SI_ILLEGAL_COPY : SPseudoInstSI <
(outs unknown:$dst), (ins unknown:$src),
[], " ; illegal copy $src to $dst">;
@@ -371,6 +374,7 @@ def SI_RETURN_TO_EPILOG : SPseudoInstSI <
let isReturn = 1;
let hasNoSchedulingInfo = 1;
let DisableWQM = 1;
+ let FixedSize = 1;
}
// Return for returning function calls.
@@ -449,7 +453,7 @@ def ADJCALLSTACKDOWN : SPseudoInstSI<
let usesCustomInserter = 1;
}
-let Defs = [M0, EXEC],
+let Defs = [M0, EXEC, SCC],
UseNamedOperandTable = 1 in {
class SI_INDIRECT_SRC<RegisterClass rc> : VPseudoInstSI <
@@ -569,11 +573,6 @@ def : GCNPat<
(SI_ELSE $src, $target, 0)
>;
-def : GCNPat <
- (int_AMDGPU_kilp),
- (SI_KILL_I1_PSEUDO (i1 0), 0)
->;
-
def : Pat <
// -1.0 as i32 (LowerINTRINSIC_VOID converts all other constants to -1.0)
(AMDGPUkill (i32 -1082130432)),
@@ -643,6 +642,11 @@ def : GCNPat <
>;
def : GCNPat <
+ (f32 (f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))),
+ (V_CVT_F32_F16_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0)), DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+def : GCNPat <
(f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))),
(V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
@@ -700,15 +704,19 @@ multiclass FMADPat <ValueType vt, Instruction inst> {
defm : FMADPat <f16, V_MAC_F16_e64>;
defm : FMADPat <f32, V_MAC_F32_e64>;
-class FMADModsPat<Instruction inst, SDPatternOperator mad_opr> : GCNPat<
- (f32 (mad_opr (VOP3Mods f32:$src0, i32:$src0_mod),
- (VOP3Mods f32:$src1, i32:$src1_mod),
- (VOP3Mods f32:$src2, i32:$src2_mod))),
+class FMADModsPat<Instruction inst, SDPatternOperator mad_opr, ValueType Ty>
+ : GCNPat<
+ (Ty (mad_opr (VOP3Mods Ty:$src0, i32:$src0_mod),
+ (VOP3Mods Ty:$src1, i32:$src1_mod),
+ (VOP3Mods Ty:$src2, i32:$src2_mod))),
(inst $src0_mod, $src0, $src1_mod, $src1,
$src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
-def : FMADModsPat<V_MAD_F32, AMDGPUfmad_ftz>;
+def : FMADModsPat<V_MAD_F32, AMDGPUfmad_ftz, f32>;
+def : FMADModsPat<V_MAD_F16, AMDGPUfmad_ftz, f16> {
+ let SubtargetPredicate = Has16BitInsts;
+}
multiclass SelectPat <ValueType vt, Instruction inst> {
def : GCNPat <
@@ -726,6 +734,10 @@ def : GCNPat <
(i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)),
(V_BCNT_U32_B32_e64 $popcnt, $val)
>;
+def : GCNPat <
+ (i16 (add (i16 (trunc (ctpop i32:$popcnt))), i16:$val)),
+ (V_BCNT_U32_B32_e64 $popcnt, $val)
+>;
/********** ============================================ **********/
/********** Extraction, Insertion, Building and Casting **********/
@@ -795,6 +807,27 @@ foreach Index = 0-15 in {
>;
}
+
+def : Pat <
+ (extract_subvector v4i16:$vec, (i32 0)),
+ (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub0))
+>;
+
+def : Pat <
+ (extract_subvector v4i16:$vec, (i32 2)),
+ (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub1))
+>;
+
+def : Pat <
+ (extract_subvector v4f16:$vec, (i32 0)),
+ (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub0))
+>;
+
+def : Pat <
+ (extract_subvector v4f16:$vec, (i32 2)),
+ (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1))
+>;
+
let SubtargetPredicate = isGCN in {
// FIXME: Why do only some of these type combinations for SReg and
@@ -834,6 +867,26 @@ def : BitConvert <f64, v2f32, VReg_64>;
def : BitConvert <v2f32, f64, VReg_64>;
def : BitConvert <f64, v2i32, VReg_64>;
def : BitConvert <v2i32, f64, VReg_64>;
+
+// FIXME: Make SGPR
+def : BitConvert <v2i32, v4f16, VReg_64>;
+def : BitConvert <v4f16, v2i32, VReg_64>;
+def : BitConvert <v2i32, v4f16, VReg_64>;
+def : BitConvert <v2i32, v4i16, VReg_64>;
+def : BitConvert <v4i16, v2i32, VReg_64>;
+def : BitConvert <v2f32, v4f16, VReg_64>;
+def : BitConvert <v4f16, v2f32, VReg_64>;
+def : BitConvert <v2f32, v4i16, VReg_64>;
+def : BitConvert <v4i16, v2f32, VReg_64>;
+def : BitConvert <v4i16, f64, VReg_64>;
+def : BitConvert <v4f16, f64, VReg_64>;
+def : BitConvert <f64, v4i16, VReg_64>;
+def : BitConvert <f64, v4f16, VReg_64>;
+def : BitConvert <v4i16, i64, VReg_64>;
+def : BitConvert <v4f16, i64, VReg_64>;
+def : BitConvert <i64, v4i16, VReg_64>;
+def : BitConvert <i64, v4f16, VReg_64>;
+
def : BitConvert <v4i32, v4f32, VReg_128>;
def : BitConvert <v4f32, v4i32, VReg_128>;
@@ -876,11 +929,13 @@ def : ClampPat<V_MAX_F32_e64, f32>;
def : ClampPat<V_MAX_F64, f64>;
def : ClampPat<V_MAX_F16_e64, f16>;
+let SubtargetPredicate = HasVOP3PInsts in {
def : GCNPat <
(v2f16 (AMDGPUclamp (VOP3PMods v2f16:$src0, i32:$src0_modifiers))),
(V_PK_MAX_F16 $src0_modifiers, $src0,
$src0_modifiers, $src0, DSTCLAMP.ENABLE)
>;
+}
/********** ================================ **********/
/********** Floating point absolute/negative **********/
@@ -906,7 +961,7 @@ def : GCNPat <
def : GCNPat <
(fabs f32:$src),
- (V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x7fffffff)))
+ (S_AND_B32 $src, (S_MOV_B32 (i32 0x7fffffff)))
>;
def : GCNPat <
@@ -967,12 +1022,12 @@ def : GCNPat <
def : GCNPat <
(fneg f16:$src),
- (V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x00008000)))
+ (S_XOR_B32 $src, (S_MOV_B32 (i32 0x00008000)))
>;
def : GCNPat <
(fabs f16:$src),
- (V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x00007fff)))
+ (S_AND_B32 $src, (S_MOV_B32 (i32 0x00007fff)))
>;
def : GCNPat <
@@ -982,12 +1037,12 @@ def : GCNPat <
def : GCNPat <
(fneg v2f16:$src),
- (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80008000)), $src)
+ (S_XOR_B32 $src, (S_MOV_B32 (i32 0x80008000)))
>;
def : GCNPat <
(fabs v2f16:$src),
- (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), $src)
+ (S_AND_B32 $src, (S_MOV_B32 (i32 0x7fff7fff)))
>;
// This is really (fneg (fabs v2f16:$src))
@@ -996,7 +1051,12 @@ def : GCNPat <
// VOP3P instructions, so it is turned into the bit op.
def : GCNPat <
(fneg (v2f16 (bitconvert (and_oneuse i32:$src, 0x7fff7fff)))),
- (S_OR_B32 (S_MOV_B32 (i32 0x80008000)), $src) // Set sign bit
+ (S_OR_B32 $src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
+>;
+
+def : GCNPat <
+ (fneg (v2f16 (fabs v2f16:$src))),
+ (S_OR_B32 $src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
>;
/********** ================== **********/
@@ -1097,6 +1157,7 @@ let SubtargetPredicate = isGCN in {
def : IMad24Pat<V_MAD_I32_I24, 1>;
def : UMad24Pat<V_MAD_U32_U24, 1>;
+// FIXME: This should only be done for VALU inputs
defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>;
def : ROTRPattern <V_ALIGNBIT_B32>;
@@ -1337,11 +1398,13 @@ def : GCNPat<
(V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)
>;
+let SubtargetPredicate = HasVOP3PInsts in {
def : GCNPat<
(fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
(V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE)
>;
}
+}
let OtherPredicates = [NoFP32Denormals] in {
def : GCNPat<
@@ -1371,6 +1434,16 @@ def : GCNPat<
>;
}
+let OtherPredicates = [HasDLInsts] in {
+def : GCNPat <
+ (fma (f32 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3NoMods f32:$src2))),
+ (V_FMAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
+ SRCMODS.NONE, $src2, $clamp, $omod)
+>;
+} // End OtherPredicates = [HasDLInsts]
+
// Allow integer inputs
class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPat<
@@ -1381,11 +1454,6 @@ class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPa
def : ExpPattern<AMDGPUexport, i32, EXP>;
def : ExpPattern<AMDGPUexport_done, i32, EXP_DONE>;
-def : GCNPat <
- (v2i16 (build_vector i16:$src0, i16:$src1)),
- (v2i16 (S_PACK_LL_B32_B16 $src0, $src1))
->;
-
// COPY_TO_REGCLASS is a workaround for a tablegen bug with the multiple
// outputs of S_LSHL_B32 caused by its implicit scc def.
def : GCNPat <
@@ -1393,6 +1461,13 @@ def : GCNPat <
(v2i16 (COPY_TO_REGCLASS (S_LSHL_B32 i16:$src1, (i16 16)), SReg_32_XM0))
>;
+
+let SubtargetPredicate = HasVOP3PInsts in {
+def : GCNPat <
+ (v2i16 (build_vector i16:$src0, i16:$src1)),
+ (v2i16 (S_PACK_LL_B32_B16 $src0, $src1))
+>;
+
// With multiple uses of the shift, this will duplicate the shift and
// increase register pressure.
def : GCNPat <
@@ -1400,6 +1475,7 @@ def : GCNPat <
(v2i16 (S_PACK_LH_B32_B16 i16:$src0, i32:$src1))
>;
+
def : GCNPat <
(v2i16 (build_vector (i16 (trunc (srl_oneuse i32:$src0, (i32 16)))),
(i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))),
@@ -1412,6 +1488,9 @@ def : GCNPat <
(v2f16 (S_PACK_LL_B32_B16 $src0, $src1))
>;
+} // End SubtargetPredicate = HasVOP3PInsts
+
+
// def : GCNPat <
// (v2f16 (scalar_to_vector f16:$src0)),
// (COPY $src0)
@@ -1422,6 +1501,16 @@ def : GCNPat <
// (COPY $src0)
// >;
+def : GCNPat <
+ (v4i16 (scalar_to_vector i16:$src0)),
+ (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
+>;
+
+def : GCNPat <
+ (v4f16 (scalar_to_vector f16:$src0)),
+ (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
+>;
+
//===----------------------------------------------------------------------===//
// Fract Patterns
//===----------------------------------------------------------------------===//
@@ -1486,7 +1575,7 @@ defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>;
defm : BFEPattern <V_BFE_U32, V_BFE_I32, S_MOV_B32>;
-def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>;
+defm : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64, SReg_64>;
def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>;
def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>;
diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 84cd47a101a8..4b537540046f 100644
--- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -45,6 +45,7 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
@@ -102,7 +103,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
};
private:
- const SISubtarget *STM = nullptr;
+ const GCNSubtarget *STM = nullptr;
const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI = nullptr;
MachineRegisterInfo *MRI = nullptr;
@@ -137,7 +138,7 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
- StringRef getPassName() const override { return "SI Load / Store Optimizer"; }
+ StringRef getPassName() const override { return "SI Load Store Optimizer"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
@@ -150,10 +151,10 @@ public:
} // end anonymous namespace.
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
- "SI Load / Store Optimizer", false, false)
+ "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
- "SI Load / Store Optimizer", false, false)
+ "SI Load Store Optimizer", false, false)
char SILoadStoreOptimizer::ID = 0;
@@ -173,10 +174,18 @@ static void moveInstsAfter(MachineBasicBlock::iterator I,
}
}
-static void addDefsToList(const MachineInstr &MI, DenseSet<unsigned> &Defs) {
- // XXX: Should this be looking for implicit defs?
- for (const MachineOperand &Def : MI.defs())
- Defs.insert(Def.getReg());
+static void addDefsUsesToList(const MachineInstr &MI,
+ DenseSet<unsigned> &RegDefs,
+ DenseSet<unsigned> &PhysRegUses) {
+ for (const MachineOperand &Op : MI.operands()) {
+ if (Op.isReg()) {
+ if (Op.isDef())
+ RegDefs.insert(Op.getReg());
+ else if (Op.readsReg() &&
+ TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
+ PhysRegUses.insert(Op.getReg());
+ }
+ }
}
static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
@@ -194,16 +203,24 @@ static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
// already in the list. Returns true in that case.
static bool
addToListsIfDependent(MachineInstr &MI,
- DenseSet<unsigned> &Defs,
+ DenseSet<unsigned> &RegDefs,
+ DenseSet<unsigned> &PhysRegUses,
SmallVectorImpl<MachineInstr*> &Insts) {
for (MachineOperand &Use : MI.operands()) {
// If one of the defs is read, then there is a use of Def between I and the
// instruction that I will potentially be merged with. We will need to move
// this instruction after the merged instructions.
-
- if (Use.isReg() && Use.readsReg() && Defs.count(Use.getReg())) {
+ //
+ // Similarly, if there is a def which is read by an instruction that is to
+ // be moved for merging, then we need to move the def-instruction as well.
+ // This can only happen for physical registers such as M0; virtual
+ // registers are in SSA form.
+ if (Use.isReg() &&
+ ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
+ (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
+ PhysRegUses.count(Use.getReg())))) {
Insts.push_back(&MI);
- addDefsToList(MI, Defs);
+ addDefsUsesToList(MI, RegDefs, PhysRegUses);
return true;
}
}
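
A minimal stand-alone sketch of the dependency tracking the new helpers implement, using hypothetical stand-in types (Reg/Operand/Inst) rather than the real MachineInstr/MachineOperand: an instruction must join the move list if it reads a register the moving set defines, or if it defines a physical register (such as M0) that the moving set reads.

  #include <set>
  #include <vector>
  using Reg = unsigned;
  struct Operand { Reg R; bool IsDef; bool IsPhys; };
  struct Inst { std::vector<Operand> Ops; };

  // Mirrors addDefsUsesToList: record defs and physical-register uses.
  static void addDefsUses(const Inst &I, std::set<Reg> &RegDefs,
                          std::set<Reg> &PhysRegUses) {
    for (const Operand &Op : I.Ops) {
      if (Op.IsDef)
        RegDefs.insert(Op.R);
      else if (Op.IsPhys)
        PhysRegUses.insert(Op.R);
    }
  }

  // Mirrors the condition in addToListsIfDependent: true if In must be
  // moved along with the instructions already collected.
  static bool dependsOnMovingSet(const Inst &In, const std::set<Reg> &RegDefs,
                                 const std::set<Reg> &PhysRegUses) {
    for (const Operand &Op : In.Ops) {
      if (!Op.IsDef && RegDefs.count(Op.R))
        return true;  // reads a value the moving set defines
      if (Op.IsDef && Op.IsPhys && PhysRegUses.count(Op.R))
        return true;  // redefines a physreg (e.g. M0) the moving set reads
    }
    return false;
  }
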
@@ -332,8 +349,9 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
++MBBI;
- DenseSet<unsigned> DefsToMove;
- addDefsToList(*CI.I, DefsToMove);
+ DenseSet<unsigned> RegDefsToMove;
+ DenseSet<unsigned> PhysRegUsesToMove;
+ addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
for ( ; MBBI != E; ++MBBI) {
if (MBBI->getOpcode() != CI.I->getOpcode()) {
@@ -356,14 +374,15 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
// #2. Add this instruction to the move list and then we will check
// if condition #2 holds once we have selected the matching instruction.
CI.InstsToMove.push_back(&*MBBI);
- addDefsToList(*MBBI, DefsToMove);
+ addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
continue;
}
// When we match I with another DS instruction we will be moving I down
// to the location of the matched instruction; any uses of I will need to
// be moved down as well.
- addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove);
+ addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
+ CI.InstsToMove);
continue;
}
@@ -377,7 +396,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
// DS_WRITE_B32 addr, f(w), idx1
// where the DS_READ_B32 ends up in InstsToMove and therefore prevents
// merging of the two writes.
- if (addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove))
+ if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
+ CI.InstsToMove))
continue;
bool Match = true;
@@ -436,7 +456,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
// down past this instruction.
// Check if we can move I across MBBI and if we can move all I's users
if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
- !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
+ !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
break;
}
return false;
@@ -496,13 +516,15 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
unsigned BaseReg = AddrReg->getReg();
unsigned BaseRegFlags = 0;
if (CI.BaseOff) {
+ unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
+ .addImm(CI.BaseOff);
+
BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BaseRegFlags = RegState::Kill;
- unsigned AddOpc = STM->hasAddNoCarry() ?
- AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32;
- BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg)
- .addImm(CI.BaseOff)
+ TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
+ .addReg(ImmReg)
.addReg(AddrReg->getReg());
}
@@ -532,7 +554,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
CI.I->eraseFromParent();
CI.Paired->eraseFromParent();
- DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
+ LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
return Next;
}
@@ -556,7 +578,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
// Be sure to use .addOperand(), and not .addReg() with these. We want to be
// sure we preserve the subregister index and any register flags set on them.
- const MachineOperand *Addr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
+ const MachineOperand *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
const MachineOperand *Data1
= TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
@@ -579,17 +601,19 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
const MCInstrDesc &Write2Desc = TII->get(Opc);
DebugLoc DL = CI.I->getDebugLoc();
- unsigned BaseReg = Addr->getReg();
+ unsigned BaseReg = AddrReg->getReg();
unsigned BaseRegFlags = 0;
if (CI.BaseOff) {
+ unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
+ .addImm(CI.BaseOff);
+
BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BaseRegFlags = RegState::Kill;
- unsigned AddOpc = STM->hasAddNoCarry() ?
- AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32;
- BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg)
- .addImm(CI.BaseOff)
- .addReg(Addr->getReg());
+ TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
+ .addReg(ImmReg)
+ .addReg(AddrReg->getReg());
}
MachineInstrBuilder Write2 =
@@ -608,7 +632,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
CI.I->eraseFromParent();
CI.Paired->eraseFromParent();
- DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
+ LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
return Next;
}
@@ -849,9 +873,8 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
continue;
}
- if (STM->hasSBufferLoadStoreAtomicDwordxN() &&
- (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
- Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM)) {
+ if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
+ Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) {
// EltSize is in units of the offset encoding.
CI.InstClass = S_BUFFER_LOAD_IMM;
CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
@@ -916,7 +939,7 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
- STM = &MF.getSubtarget<SISubtarget>();
+ STM = &MF.getSubtarget<GCNSubtarget>();
if (!STM->loadStoreOptEnabled())
return false;
@@ -928,7 +951,7 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
assert(MRI->isSSA() && "Must be run on SSA");
- DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
+ LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
bool Modified = false;
diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp
index a9af83323976..ad30317c344c 100644
--- a/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief This pass lowers the pseudo control flow instructions to real
+/// This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
@@ -51,6 +51,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/LiveIntervals.h"
@@ -343,11 +344,49 @@ void SILowerControlFlow::emitBreak(MachineInstr &MI) {
}
void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
- MI.setDesc(TII->get(AMDGPU::S_OR_B64));
+ MachineBasicBlock &MBB = *MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ auto Dst = MI.getOperand(0).getReg();
+
+ // Skip ANDing with exec if the break condition is already masked by exec
+ // because it is a V_CMP in the same basic block. (We know the break
+ // condition operand was an i1 in IR, so if it is a VALU instruction it must
+ // be one with a carry-out.)
+ bool SkipAnding = false;
+ if (MI.getOperand(1).isReg()) {
+ if (MachineInstr *Def = MRI->getUniqueVRegDef(MI.getOperand(1).getReg())) {
+ SkipAnding = Def->getParent() == MI.getParent()
+ && SIInstrInfo::isVALU(*Def);
+ }
+ }
+
+ // AND the break condition operand with exec, then OR that into the "loop
+ // exit" mask.
+ MachineInstr *And = nullptr, *Or = nullptr;
+ if (!SkipAnding) {
+ And = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
+ .addReg(AMDGPU::EXEC)
+ .add(MI.getOperand(1));
+ Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
+ .addReg(Dst)
+ .add(MI.getOperand(2));
+ } else
+ Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2));
+
+ if (LIS) {
+ if (And)
+ LIS->InsertMachineInstrInMaps(*And);
+ LIS->ReplaceMachineInstrInMaps(MI, *Or);
+ }
+
+ MI.eraseFromParent();
}
void SILowerControlFlow::emitElseBreak(MachineInstr &MI) {
- MI.setDesc(TII->get(AMDGPU::S_OR_B64));
+ // Lowered in the same way as emitIfBreak above.
+ emitIfBreak(MI);
}
void SILowerControlFlow::emitLoop(MachineInstr &MI) {
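
A small runnable model of the decision made by the new emitIfBreak lowering; the register names and the emit() helper are illustrative only, but the control flow mirrors the patch.

  #include <cstdio>
  using Reg = const char *;

  static void emit(const char *Opc, Reg Dst, Reg Src0, Reg Src1) {
    std::printf("  %%%s = %s %%%s, %%%s\n", Dst, Opc, Src0, Src1);
  }

  // CondIsVALUCmpInSameBlock plays the role of SkipAnding: a V_CMP in the
  // same block already produces a result masked by exec.
  void lowerIfBreakSketch(Reg Dst, Reg Cond, Reg ExitMask,
                          bool CondIsVALUCmpInSameBlock) {
    if (!CondIsVALUCmpInSameBlock) {
      emit("S_AND_B64", Dst, "exec", Cond);  // mask the break condition by exec
      emit("S_OR_B64", Dst, Dst, ExitMask);  // accumulate into the loop-exit mask
    } else {
      emit("S_OR_B64", Dst, Cond, ExitMask); // the AND with exec is redundant
    }
  }
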
@@ -414,8 +453,8 @@ void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
return;
for (const auto &SrcOp : Def->explicit_operands())
- if (SrcOp.isUse() && (!SrcOp.isReg() ||
- TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) ||
+ if (SrcOp.isReg() && SrcOp.isUse() &&
+ (TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) ||
SrcOp.getReg() == AMDGPU::EXEC))
Src.push_back(SrcOp);
}
@@ -447,7 +486,7 @@ void SILowerControlFlow::combineMasks(MachineInstr &MI) {
}
bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp
index da57b90dd8c4..ecc6cff407e1 100644
--- a/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -17,6 +17,8 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "Utils/AMDGPULaneDominator.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -64,7 +66,7 @@ FunctionPass *llvm::createSILowerI1CopiesPass() {
bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
MachineRegisterInfo &MRI = MF.getRegInfo();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const TargetRegisterInfo *TRI = &TII->getRegisterInfo();
@@ -141,7 +143,8 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
DefInst->getOperand(3).getReg()) &&
TRI->getCommonSubClass(
MRI.getRegClass(DefInst->getOperand(3).getReg()),
- &AMDGPU::SGPR_64RegClass)) {
+ &AMDGPU::SGPR_64RegClass) &&
+ AMDGPU::laneDominates(DefInst->getParent(), &MBB)) {
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64))
.add(Dst)
.addReg(AMDGPU::EXEC)
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 6013ebc81d9f..0d5ff75e37ed 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -11,6 +11,7 @@
#include "AMDGPUArgumentUsageInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -28,17 +29,12 @@ using namespace llvm;
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
: AMDGPUMachineFunction(MF),
- BufferPSV(*(MF.getSubtarget().getInstrInfo())),
- ImagePSV(*(MF.getSubtarget().getInstrInfo())),
PrivateSegmentBuffer(false),
DispatchPtr(false),
QueuePtr(false),
KernargSegmentPtr(false),
DispatchID(false),
FlatScratchInit(false),
- GridWorkgroupCountX(false),
- GridWorkgroupCountY(false),
- GridWorkgroupCountZ(false),
WorkGroupIDX(false),
WorkGroupIDY(false),
WorkGroupIDZ(false),
@@ -49,12 +45,26 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
WorkItemIDZ(false),
ImplicitBufferPtr(false),
ImplicitArgPtr(false),
- GITPtrHigh(0xffffffff) {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ GITPtrHigh(0xffffffff),
+ HighBitsOf32BitAddress(0) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const Function &F = MF.getFunction();
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
WavesPerEU = ST.getWavesPerEU(F);
+ Occupancy = getMaxWavesPerEU();
+ limitOccupancy(MF);
+ CallingConv::ID CC = F.getCallingConv();
+
+ if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
+ if (!F.arg_empty())
+ KernargSegmentPtr = true;
+ WorkGroupIDX = true;
+ WorkItemIDX = true;
+ } else if (CC == CallingConv::AMDGPU_PS) {
+ PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
+ }
+
if (!isEntryFunction()) {
// Non-entry functions have no special inputs for now, other than
// registers required for scratch access.
@@ -71,18 +81,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (F.hasFnAttribute("amdgpu-implicitarg-ptr"))
ImplicitArgPtr = true;
} else {
- if (F.hasFnAttribute("amdgpu-implicitarg-ptr"))
- KernargSegmentPtr = true;
- }
-
- CallingConv::ID CC = F.getCallingConv();
- if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
- if (!F.arg_empty())
+ if (F.hasFnAttribute("amdgpu-implicitarg-ptr")) {
KernargSegmentPtr = true;
- WorkGroupIDX = true;
- WorkItemIDX = true;
- } else if (CC == CallingConv::AMDGPU_PS) {
- PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
+ MaxKernArgAlign = std::max(ST.getAlignmentForImplicitArgPtr(),
+ MaxKernArgAlign);
+ }
}
if (ST.debuggerEmitPrologue()) {
@@ -134,7 +137,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
}
}
- bool IsCOV2 = ST.isAmdCodeObjectV2(MF);
+ bool IsCOV2 = ST.isAmdCodeObjectV2(F);
if (IsCOV2) {
if (HasStackObjects || MaySpill)
PrivateSegmentBuffer = true;
@@ -147,7 +150,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (F.hasFnAttribute("amdgpu-dispatch-id"))
DispatchID = true;
- } else if (ST.isMesaGfxShader(MF)) {
+ } else if (ST.isMesaGfxShader(F)) {
if (HasStackObjects || MaySpill)
ImplicitBufferPtr = true;
}
@@ -166,6 +169,18 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
StringRef S = A.getValueAsString();
if (!S.empty())
S.consumeInteger(0, GITPtrHigh);
+
+ A = F.getFnAttribute("amdgpu-32bit-address-high-bits");
+ S = A.getValueAsString();
+ if (!S.empty())
+ S.consumeInteger(0, HighBitsOf32BitAddress);
+}
+
+void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) {
+ limitOccupancy(getMaxWavesPerEU());
+ const GCNSubtarget& ST = MF.getSubtarget<GCNSubtarget>();
+ limitOccupancy(ST.getOccupancyWithLocalMemSize(getLDSSize(),
+ MF.getFunction()));
}
unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
@@ -238,7 +253,7 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
if (!SpillLanes.empty())
return true;
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -269,10 +284,9 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
}
Optional<int> CSRSpillFI;
- if (FrameInfo.hasCalls() && CSRegs && isCalleeSavedReg(CSRegs, LaneVGPR)) {
- // TODO: Should this be a CreateSpillStackObject? This is technically a
- // weird CSR spill.
- CSRSpillFI = FrameInfo.CreateStackObject(4, 4, false);
+ if ((FrameInfo.hasCalls() || !isEntryFunction()) && CSRegs &&
+ isCalleeSavedReg(CSRegs, LaneVGPR)) {
+ CSRSpillFI = FrameInfo.CreateSpillStackObject(4, 4);
}
SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, CSRSpillFI));
@@ -295,3 +309,29 @@ void SIMachineFunctionInfo::removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI)
for (auto &R : SGPRToVGPRSpills)
MFI.RemoveStackObject(R.first);
}
+
+
+/// \returns VGPR used for \p Dim's work item ID.
+unsigned SIMachineFunctionInfo::getWorkItemIDVGPR(unsigned Dim) const {
+ switch (Dim) {
+ case 0:
+ assert(hasWorkItemIDX());
+ return AMDGPU::VGPR0;
+ case 1:
+ assert(hasWorkItemIDY());
+ return AMDGPU::VGPR1;
+ case 2:
+ assert(hasWorkItemIDZ());
+ return AMDGPU::VGPR2;
+ }
+ llvm_unreachable("unexpected dimension");
+}
+
+MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {
+ assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
+ return AMDGPU::SGPR0 + NumUserSGPRs;
+}
+
+MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const {
+ return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
+}
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 5dde72910ee3..ef91d1e43075 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -16,7 +16,9 @@
#include "AMDGPUArgumentUsageInfo.h"
#include "AMDGPUMachineFunction.h"
+#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Optional.h"
@@ -38,8 +40,9 @@ class TargetRegisterClass;
class AMDGPUImagePseudoSourceValue : public PseudoSourceValue {
public:
+ // TODO: Is the img rsrc useful?
explicit AMDGPUImagePseudoSourceValue(const TargetInstrInfo &TII) :
- PseudoSourceValue(PseudoSourceValue::TargetCustom, TII) { }
+ PseudoSourceValue(PseudoSourceValue::TargetCustom, TII) {}
bool isConstant(const MachineFrameInfo *) const override {
// This should probably be true for most images, but we will start by being
@@ -48,15 +51,11 @@ public:
}
bool isAliased(const MachineFrameInfo *) const override {
- // FIXME: If we ever change image intrinsics to accept fat pointers, then
- // this could be true for some cases.
- return false;
+ return true;
}
bool mayAlias(const MachineFrameInfo *) const override {
- // FIXME: If we ever change image intrinsics to accept fat pointers, then
- // this could be true for some cases.
- return false;
+ return true;
}
};
@@ -72,15 +71,11 @@ public:
}
bool isAliased(const MachineFrameInfo *) const override {
- // FIXME: If we ever change image intrinsics to accept fat pointers, then
- // this could be true for some cases.
- return false;
+ return true;
}
bool mayAlias(const MachineFrameInfo *) const override {
- // FIXME: If we ever change image intrinsics to accept fat pointers, then
- // this could be true for some cases.
- return false;
+ return true;
}
};
@@ -135,8 +130,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
// Stack object indices for work item IDs.
std::array<int, 3> DebuggerWorkItemIDStackObjectIndices = {{0, 0, 0}};
- AMDGPUBufferPseudoSourceValue BufferPSV;
- AMDGPUImagePseudoSourceValue ImagePSV;
+ DenseMap<const Value *,
+ std::unique_ptr<const AMDGPUBufferPseudoSourceValue>> BufferPSVs;
+ DenseMap<const Value *,
+ std::unique_ptr<const AMDGPUImagePseudoSourceValue>> ImagePSVs;
private:
unsigned LDSWaveSpillSize = 0;
@@ -146,6 +143,7 @@ private:
bool HasSpilledSGPRs = false;
bool HasSpilledVGPRs = false;
bool HasNonSpillStackObjects = false;
+ bool IsStackRealigned = false;
unsigned NumSpilledSGPRs = 0;
unsigned NumSpilledVGPRs = 0;
@@ -157,9 +155,6 @@ private:
bool KernargSegmentPtr : 1;
bool DispatchID : 1;
bool FlatScratchInit : 1;
- bool GridWorkgroupCountX : 1;
- bool GridWorkgroupCountY : 1;
- bool GridWorkgroupCountZ : 1;
// Feature bits required for inputs passed in system SGPRs.
bool WorkGroupIDX : 1; // Always initialized.
@@ -186,25 +181,25 @@ private:
// current hardware only allows a 16 bit value.
unsigned GITPtrHigh;
- MCPhysReg getNextUserSGPR() const {
- assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
- return AMDGPU::SGPR0 + NumUserSGPRs;
- }
+ unsigned HighBitsOf32BitAddress;
- MCPhysReg getNextSystemSGPR() const {
- return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
- }
+ // Current recorded maximum possible occupancy.
+ unsigned Occupancy;
+
+ MCPhysReg getNextUserSGPR() const;
+
+ MCPhysReg getNextSystemSGPR() const;
public:
struct SpilledReg {
- unsigned VGPR = AMDGPU::NoRegister;
+ unsigned VGPR = 0;
int Lane = -1;
SpilledReg() = default;
SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) {}
bool hasLane() { return Lane != -1;}
- bool hasReg() { return VGPR != AMDGPU::NoRegister;}
+ bool hasReg() { return VGPR != 0;}
};
struct SGPRSpillVGPRCSR {
@@ -244,8 +239,8 @@ public:
bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI);
- bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; }
- unsigned getTIDReg() const { return TIDReg; }
+ bool hasCalculatedTID() const { return TIDReg != 0; };
+ unsigned getTIDReg() const { return TIDReg; };
void setTIDReg(unsigned Reg) { TIDReg = Reg; }
unsigned getBytesInStackArgArea() const {
@@ -338,18 +333,6 @@ public:
return FlatScratchInit;
}
- bool hasGridWorkgroupCountX() const {
- return GridWorkgroupCountX;
- }
-
- bool hasGridWorkgroupCountY() const {
- return GridWorkgroupCountY;
- }
-
- bool hasGridWorkgroupCountZ() const {
- return GridWorkgroupCountZ;
- }
-
bool hasWorkGroupIDX() const {
return WorkGroupIDX;
}
@@ -411,6 +394,10 @@ public:
return GITPtrHigh;
}
+ unsigned get32BitAddressHighBits() const {
+ return HighBitsOf32BitAddress;
+ }
+
unsigned getNumUserSGPRs() const {
return NumUserSGPRs;
}
@@ -423,14 +410,14 @@ public:
return ArgInfo.PrivateSegmentWaveByteOffset.getRegister();
}
- /// \brief Returns the physical register reserved for use as the resource
+ /// Returns the physical register reserved for use as the resource
/// descriptor for scratch accesses.
unsigned getScratchRSrcReg() const {
return ScratchRSrcReg;
}
void setScratchRSrcReg(unsigned Reg) {
- assert(Reg != AMDGPU::NoRegister && "Should never be unset");
+ assert(Reg != 0 && "Should never be unset");
ScratchRSrcReg = Reg;
}
@@ -443,6 +430,7 @@ public:
}
void setStackPtrOffsetReg(unsigned Reg) {
+ assert(Reg != 0 && "Should never be unset");
StackPtrOffsetReg = Reg;
}
@@ -455,7 +443,7 @@ public:
}
void setScratchWaveOffsetReg(unsigned Reg) {
- assert(Reg != AMDGPU::NoRegister && "Should never be unset");
+ assert(Reg != 0 && "Should never be unset");
ScratchWaveOffsetReg = Reg;
if (isEntryFunction())
FrameOffsetReg = ScratchWaveOffsetReg;
@@ -493,6 +481,14 @@ public:
HasNonSpillStackObjects = StackObject;
}
+ bool isStackRealigned() const {
+ return IsStackRealigned;
+ }
+
+ void setIsStackRealigned(bool Realigned = true) {
+ IsStackRealigned = Realigned;
+ }
+
unsigned getNumSpilledSGPRs() const {
return NumSpilledSGPRs;
}
@@ -575,7 +571,7 @@ public:
return DebuggerWorkGroupIDStackObjectIndices[Dim];
}
- /// \brief Sets stack object index for \p Dim's work group ID to \p ObjectIdx.
+ /// Sets stack object index for \p Dim's work group ID to \p ObjectIdx.
void setDebuggerWorkGroupIDStackObjectIndex(unsigned Dim, int ObjectIdx) {
assert(Dim < 3);
DebuggerWorkGroupIDStackObjectIndices[Dim] = ObjectIdx;
@@ -587,7 +583,7 @@ public:
return DebuggerWorkItemIDStackObjectIndices[Dim];
}
- /// \brief Sets stack object index for \p Dim's work item ID to \p ObjectIdx.
+ /// Sets stack object index for \p Dim's work item ID to \p ObjectIdx.
void setDebuggerWorkItemIDStackObjectIndex(unsigned Dim, int ObjectIdx) {
assert(Dim < 3);
DebuggerWorkItemIDStackObjectIndices[Dim] = ObjectIdx;
@@ -610,31 +606,51 @@ public:
}
/// \returns VGPR used for \p Dim's work item ID.
- unsigned getWorkItemIDVGPR(unsigned Dim) const {
- switch (Dim) {
- case 0:
- assert(hasWorkItemIDX());
- return AMDGPU::VGPR0;
- case 1:
- assert(hasWorkItemIDY());
- return AMDGPU::VGPR1;
- case 2:
- assert(hasWorkItemIDZ());
- return AMDGPU::VGPR2;
- }
- llvm_unreachable("unexpected dimension");
- }
+ unsigned getWorkItemIDVGPR(unsigned Dim) const;
unsigned getLDSWaveSpillSize() const {
return LDSWaveSpillSize;
}
- const AMDGPUBufferPseudoSourceValue *getBufferPSV() const {
- return &BufferPSV;
+ const AMDGPUBufferPseudoSourceValue *getBufferPSV(const SIInstrInfo &TII,
+ const Value *BufferRsrc) {
+ assert(BufferRsrc);
+ auto PSV = BufferPSVs.try_emplace(
+ BufferRsrc,
+ llvm::make_unique<AMDGPUBufferPseudoSourceValue>(TII));
+ return PSV.first->second.get();
+ }
+
+ const AMDGPUImagePseudoSourceValue *getImagePSV(const SIInstrInfo &TII,
+ const Value *ImgRsrc) {
+ assert(ImgRsrc);
+ auto PSV = ImagePSVs.try_emplace(
+ ImgRsrc,
+ llvm::make_unique<AMDGPUImagePseudoSourceValue>(TII));
+ return PSV.first->second.get();
+ }
+
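
The buffer and image pseudo source values are now created lazily, one per resource Value. A small sketch of the try_emplace idiom used above, with hypothetical Key/Widget types standing in for the Value*/PSV pair:

  #include "llvm/ADT/DenseMap.h"
  #include <memory>

  struct Widget { int Id; };

  // The freshly built object is inserted only if Key is not already present;
  // otherwise it is discarded and the existing entry is returned, so repeated
  // lookups for the same resource yield the same pointer.
  const Widget *getOrCreate(llvm::DenseMap<int, std::unique_ptr<Widget>> &Map,
                            int Key) {
    auto It = Map.try_emplace(Key, std::make_unique<Widget>(Widget{Key}));
    return It.first->second.get();
  }
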
+ unsigned getOccupancy() const {
+ return Occupancy;
+ }
+
+ unsigned getMinAllowedOccupancy() const {
+ if (!isMemoryBound() && !needsWaveLimiter())
+ return Occupancy;
+ return (Occupancy < 4) ? Occupancy : 4;
+ }
+
+ void limitOccupancy(const MachineFunction &MF);
+
+ void limitOccupancy(unsigned Limit) {
+ if (Occupancy > Limit)
+ Occupancy = Limit;
}
- const AMDGPUImagePseudoSourceValue *getImagePSV() const {
- return &ImagePSV;
+ void increaseOccupancy(const MachineFunction &MF, unsigned Limit) {
+ if (Occupancy < Limit)
+ Occupancy = Limit;
+ limitOccupancy(MF);
}
};
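
A compact model of the occupancy bookkeeping added here, assuming hypothetical limits in place of the subtarget queries: the recorded value only ever drops through limitOccupancy, and increaseOccupancy raises it but immediately re-clamps against the LDS-derived limit.

  #include <algorithm>

  struct OccupancyTracker {
    unsigned Occupancy;  // current recorded maximum waves per EU
    explicit OccupancyTracker(unsigned MaxWavesPerEU) : Occupancy(MaxWavesPerEU) {}

    void limitOccupancy(unsigned Limit) {       // never raises the value
      Occupancy = std::min(Occupancy, Limit);
    }
    void increaseOccupancy(unsigned Target, unsigned LDSLimit) {
      Occupancy = std::max(Occupancy, Target);  // allow a higher target...
      limitOccupancy(LDSLimit);                 // ...then re-clamp to the LDS limit
    }
  };

  // Example: start at 10 waves/EU, LDS usage caps us at 8, a heuristic lowers
  // the target to 4, and a later attempt to raise it back is still capped at 8.
  //   OccupancyTracker T(10);
  //   T.limitOccupancy(8);         // Occupancy == 8
  //   T.limitOccupancy(4);         // Occupancy == 4
  //   T.increaseOccupancy(10, 8);  // Occupancy == 8
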
diff --git a/lib/Target/AMDGPU/SIMachineScheduler.cpp b/lib/Target/AMDGPU/SIMachineScheduler.cpp
index 6b67b76652ed..18754442898f 100644
--- a/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief SI Machine Scheduler interface
+/// SI Machine Scheduler interface
//
//===----------------------------------------------------------------------===//
@@ -16,6 +16,7 @@
#include "AMDGPU.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LiveInterval.h"
@@ -154,6 +155,8 @@ static const char *getReasonStr(SIScheduleCandReason Reason) {
#endif
+namespace llvm {
+namespace SISched {
static bool tryLess(int TryVal, int CandVal,
SISchedulerCandidate &TryCand,
SISchedulerCandidate &Cand,
@@ -187,6 +190,8 @@ static bool tryGreater(int TryVal, int CandVal,
Cand.setRepeat(Reason);
return false;
}
+} // end namespace SISched
+} // end namespace llvm
// SIScheduleBlock //
@@ -212,7 +217,8 @@ void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
}
if (Cand.SGPRUsage > 60 &&
- tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, TryCand, Cand, RegUsage))
+ SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage,
+ TryCand, Cand, RegUsage))
return;
// Schedule low latency instructions as top as possible.
@@ -230,21 +236,22 @@ void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
// could go quite high, thus above the arbitrary limit of 60 will encourage
// using the already loaded constants (in order to release some SGPRs) before
// loading more.
- if (tryLess(TryCand.HasLowLatencyNonWaitedParent,
- Cand.HasLowLatencyNonWaitedParent,
- TryCand, Cand, SIScheduleCandReason::Depth))
+ if (SISched::tryLess(TryCand.HasLowLatencyNonWaitedParent,
+ Cand.HasLowLatencyNonWaitedParent,
+ TryCand, Cand, SIScheduleCandReason::Depth))
return;
- if (tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
- TryCand, Cand, SIScheduleCandReason::Depth))
+ if (SISched::tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
+ TryCand, Cand, SIScheduleCandReason::Depth))
return;
if (TryCand.IsLowLatency &&
- tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
- TryCand, Cand, SIScheduleCandReason::Depth))
+ SISched::tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
+ TryCand, Cand, SIScheduleCandReason::Depth))
return;
- if (tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage))
+ if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage,
+ TryCand, Cand, RegUsage))
return;
// Fall through to original instruction order.
@@ -1201,7 +1208,7 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria
NextReservedID = 1;
NextNonReservedID = DAGSize + 1;
- DEBUG(dbgs() << "Coloring the graph\n");
+ LLVM_DEBUG(dbgs() << "Coloring the graph\n");
if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
colorHighLatenciesGroups();
@@ -1258,13 +1265,11 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria
SIScheduleBlock *Block = CurrentBlocks[i];
Block->finalizeUnits();
}
- DEBUG(
- dbgs() << "Blocks created:\n\n";
- for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
- SIScheduleBlock *Block = CurrentBlocks[i];
- Block->printDebug(true);
- }
- );
+ LLVM_DEBUG(dbgs() << "Blocks created:\n\n";
+ for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
+ SIScheduleBlock *Block = CurrentBlocks[i];
+ Block->printDebug(true);
+ });
}
// Two functions taken from Codegen/MachineScheduler.cpp
@@ -1274,7 +1279,7 @@ static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,
MachineBasicBlock::const_iterator End) {
for (; I != End; ++I) {
- if (!I->isDebugValue())
+ if (!I->isDebugInstr())
break;
}
return I;
@@ -1284,7 +1289,7 @@ void SIScheduleBlockCreator::topologicalSort() {
unsigned DAGSize = CurrentBlocks.size();
std::vector<int> WorkList;
- DEBUG(dbgs() << "Topological Sort\n");
+ LLVM_DEBUG(dbgs() << "Topological Sort\n");
WorkList.reserve(DAGSize);
TopDownIndex2Block.resize(DAGSize);
@@ -1331,11 +1336,11 @@ void SIScheduleBlockCreator::topologicalSort() {
void SIScheduleBlockCreator::scheduleInsideBlocks() {
unsigned DAGSize = CurrentBlocks.size();
- DEBUG(dbgs() << "\nScheduling Blocks\n\n");
+ LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n");
// We produce a valid schedule such that a Block corresponds
// to a range of instructions.
- DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
+ LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
for (unsigned i = 0, e = DAGSize; i != e; ++i) {
SIScheduleBlock *Block = CurrentBlocks[i];
Block->fastSchedule();
@@ -1389,7 +1394,7 @@ void SIScheduleBlockCreator::scheduleInsideBlocks() {
Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
}
- DEBUG(dbgs() << "Restoring MI Pos\n");
+ LLVM_DEBUG(dbgs() << "Restoring MI Pos\n");
// Restore old ordering (which prevents a LIS->handleMove bug).
for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
MachineBasicBlock::iterator POld = PosOld[i-1];
@@ -1403,12 +1408,10 @@ void SIScheduleBlockCreator::scheduleInsideBlocks() {
}
}
- DEBUG(
- for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
- SIScheduleBlock *Block = CurrentBlocks[i];
- Block->printDebug(true);
- }
- );
+ LLVM_DEBUG(for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
+ SIScheduleBlock *Block = CurrentBlocks[i];
+ Block->printDebug(true);
+ });
}
void SIScheduleBlockCreator::fillStats() {
@@ -1559,13 +1562,10 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
blockScheduled(Block);
}
- DEBUG(
- dbgs() << "Block Order:";
- for (SIScheduleBlock* Block : BlocksScheduled) {
- dbgs() << ' ' << Block->getID();
- }
- dbgs() << '\n';
- );
+ LLVM_DEBUG(dbgs() << "Block Order:"; for (SIScheduleBlock *Block
+ : BlocksScheduled) {
+ dbgs() << ' ' << Block->getID();
+ } dbgs() << '\n';);
}
bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
@@ -1576,19 +1576,19 @@ bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
}
// Try to hide high latencies.
- if (tryLess(TryCand.LastPosHighLatParentScheduled,
- Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
+ if (SISched::tryLess(TryCand.LastPosHighLatParentScheduled,
+ Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
return true;
// Schedule high latencies early so you can hide them better.
- if (tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
- TryCand, Cand, Latency))
+ if (SISched::tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
+ TryCand, Cand, Latency))
return true;
- if (TryCand.IsHighLatency && tryGreater(TryCand.Height, Cand.Height,
- TryCand, Cand, Depth))
+ if (TryCand.IsHighLatency && SISched::tryGreater(TryCand.Height, Cand.Height,
+ TryCand, Cand, Depth))
return true;
- if (tryGreater(TryCand.NumHighLatencySuccessors,
- Cand.NumHighLatencySuccessors,
- TryCand, Cand, Successor))
+ if (SISched::tryGreater(TryCand.NumHighLatencySuccessors,
+ Cand.NumHighLatencySuccessors,
+ TryCand, Cand, Successor))
return true;
return false;
}
@@ -1600,17 +1600,17 @@ bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
return true;
}
- if (tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
- TryCand, Cand, RegUsage))
+ if (SISched::tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
+ TryCand, Cand, RegUsage))
return true;
- if (tryGreater(TryCand.NumSuccessors > 0,
- Cand.NumSuccessors > 0,
- TryCand, Cand, Successor))
+ if (SISched::tryGreater(TryCand.NumSuccessors > 0,
+ Cand.NumSuccessors > 0,
+ TryCand, Cand, Successor))
return true;
- if (tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
+ if (SISched::tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
return true;
- if (tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
- TryCand, Cand, RegUsage))
+ if (SISched::tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
+ TryCand, Cand, RegUsage))
return true;
return false;
}
@@ -1628,18 +1628,17 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
maxVregUsage = VregCurrentUsage;
if (SregCurrentUsage > maxSregUsage)
maxSregUsage = SregCurrentUsage;
- DEBUG(
- dbgs() << "Picking New Blocks\n";
- dbgs() << "Available: ";
- for (SIScheduleBlock* Block : ReadyBlocks)
- dbgs() << Block->getID() << ' ';
- dbgs() << "\nCurrent Live:\n";
- for (unsigned Reg : LiveRegs)
- dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
- dbgs() << '\n';
- dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
- dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';
- );
+ LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: ";
+ for (SIScheduleBlock *Block
+ : ReadyBlocks) dbgs()
+ << Block->getID() << ' ';
+ dbgs() << "\nCurrent Live:\n";
+ for (unsigned Reg
+ : LiveRegs) dbgs()
+ << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+ dbgs() << '\n';
+ dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
+ dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';);
Cand.Block = nullptr;
for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
@@ -1671,20 +1670,18 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
if (TryCand.Reason != NoCand) {
Cand.setBest(TryCand);
Best = I;
- DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
- << getReasonStr(Cand.Reason) << '\n');
+ LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
+ << getReasonStr(Cand.Reason) << '\n');
}
}
- DEBUG(
- dbgs() << "Picking: " << Cand.Block->getID() << '\n';
- dbgs() << "Is a block with high latency instruction: "
- << (Cand.IsHighLatency ? "yes\n" : "no\n");
- dbgs() << "Position of last high latency dependency: "
- << Cand.LastPosHighLatParentScheduled << '\n';
- dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
- dbgs() << '\n';
- );
+ LLVM_DEBUG(dbgs() << "Picking: " << Cand.Block->getID() << '\n';
+ dbgs() << "Is a block with high latency instruction: "
+ << (Cand.IsHighLatency ? "yes\n" : "no\n");
+ dbgs() << "Position of last high latency dependency: "
+ << Cand.LastPosHighLatParentScheduled << '\n';
+ dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
+ dbgs() << '\n';);
Block = Cand.Block;
ReadyBlocks.erase(Best);
@@ -1933,13 +1930,10 @@ void SIScheduleDAGMI::schedule()
{
SmallVector<SUnit*, 8> TopRoots, BotRoots;
SIScheduleBlockResult Best, Temp;
- DEBUG(dbgs() << "Preparing Scheduling\n");
+ LLVM_DEBUG(dbgs() << "Preparing Scheduling\n");
buildDAGWithRegPressure();
- DEBUG(
- for(SUnit& SU : SUnits)
- SU.dumpAll(this)
- );
+ LLVM_DEBUG(for (SUnit &SU : SUnits) SU.dumpAll(this));
topologicalSort();
findRootsAndBiasEdges(TopRoots, BotRoots);
@@ -2041,15 +2035,15 @@ void SIScheduleDAGMI::schedule()
scheduleMI(SU, true);
- DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
- << *SU->getInstr());
+ LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
+ << *SU->getInstr());
}
assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
placeDebugValues();
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "*** Final schedule for "
<< printMBBReference(*begin()->getParent()) << " ***\n";
dumpSchedule();
diff --git a/lib/Target/AMDGPU/SIMachineScheduler.h b/lib/Target/AMDGPU/SIMachineScheduler.h
index d824e38504e6..0ce68ac6a897 100644
--- a/lib/Target/AMDGPU/SIMachineScheduler.h
+++ b/lib/Target/AMDGPU/SIMachineScheduler.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief SI Machine Scheduler interface
+/// SI Machine Scheduler interface
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index c73fb10b7ea0..938cdaf1ef8f 100644
--- a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Memory legalizer - implements memory model. More information can be
+/// Memory legalizer - implements memory model. More information can be
/// found here:
/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
@@ -19,7 +19,9 @@
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -36,6 +38,7 @@
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <list>
@@ -47,42 +50,142 @@ using namespace llvm::AMDGPU;
namespace {
+LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
+
+/// Memory operation flags. Can be ORed together.
+enum class SIMemOp {
+ NONE = 0u,
+ LOAD = 1u << 0,
+ STORE = 1u << 1,
+ LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
+};
+
+/// Position to insert a new instruction relative to an existing
+/// instruction.
+enum class Position {
+ BEFORE,
+ AFTER
+};
+
+/// The atomic synchronization scopes supported by the AMDGPU target.
+enum class SIAtomicScope {
+ NONE,
+ SINGLETHREAD,
+ WAVEFRONT,
+ WORKGROUP,
+ AGENT,
+ SYSTEM
+};
+
+/// The distinct address spaces supported by the AMDGPU target for
+/// atomic memory operations. Can be ORed together.
+enum class SIAtomicAddrSpace {
+ NONE = 0u,
+ GLOBAL = 1u << 0,
+ LDS = 1u << 1,
+ SCRATCH = 1u << 2,
+ GDS = 1u << 3,
+ OTHER = 1u << 4,
+
+ /// The address spaces that can be accessed by a FLAT instruction.
+ FLAT = GLOBAL | LDS | SCRATCH,
+
+ /// The address spaces that support atomic instructions.
+ ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
+
+ /// All address spaces.
+ ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
+
+ LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
+};
+
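
A self-contained illustration of how a scoped enum marked with LLVM_MARK_AS_BITMASK_ENUM behaves; the values mirror SIAtomicAddrSpace above, but the enum and namespace are hypothetical.

  #include "llvm/ADT/BitmaskEnum.h"
  #include <cassert>

  namespace example {
  LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

  enum class AddrSpace : unsigned {
    NONE    = 0u,
    GLOBAL  = 1u << 0,
    LDS     = 1u << 1,
    SCRATCH = 1u << 2,
    GDS     = 1u << 3,
    FLAT    = GLOBAL | LDS | SCRATCH,   // composite mask
    LLVM_MARK_AS_BITMASK_ENUM(GDS)
  };

  inline void usage() {
    AddrSpace AS = AddrSpace::GLOBAL | AddrSpace::LDS;  // ORing is allowed
    assert((AS & AddrSpace::FLAT) == AS);               // both bits lie in FLAT
    assert((AS & AddrSpace::GDS) == AddrSpace::NONE);   // GDS is not accessed
  }
  } // namespace example
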
+/// Sets named bit \p BitName to "true" if present in instruction \p MI.
+/// \returns Returns true if \p MI is modified, false otherwise.
+template <uint16_t BitName>
+bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
+ int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
+ if (BitIdx == -1)
+ return false;
+
+ MachineOperand &Bit = MI->getOperand(BitIdx);
+ if (Bit.getImm() != 0)
+ return false;
+
+ Bit.setImm(1);
+ return true;
+}
+
class SIMemOpInfo final {
private:
- SyncScope::ID SSID = SyncScope::System;
+
+ friend class SIMemOpAccess;
+
AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
+ SIAtomicScope Scope = SIAtomicScope::SYSTEM;
+ SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
+ SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
+ bool IsCrossAddressSpaceOrdering = false;
bool IsNonTemporal = false;
- SIMemOpInfo(SyncScope::ID SSID, AtomicOrdering Ordering)
- : SSID(SSID), Ordering(Ordering) {}
-
- SIMemOpInfo(SyncScope::ID SSID, AtomicOrdering Ordering,
- AtomicOrdering FailureOrdering, bool IsNonTemporal = false)
- : SSID(SSID), Ordering(Ordering), FailureOrdering(FailureOrdering),
- IsNonTemporal(IsNonTemporal) {}
-
- /// \returns Info constructed from \p MI, which has at least machine memory
- /// operand.
- static Optional<SIMemOpInfo> constructFromMIWithMMO(
- const MachineBasicBlock::iterator &MI);
+ SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
+ SIAtomicScope Scope = SIAtomicScope::SYSTEM,
+ SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
+ SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
+ bool IsCrossAddressSpaceOrdering = true,
+ AtomicOrdering FailureOrdering =
+ AtomicOrdering::SequentiallyConsistent,
+ bool IsNonTemporal = false)
+ : Ordering(Ordering), FailureOrdering(FailureOrdering),
+ Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
+ InstrAddrSpace(InstrAddrSpace),
+ IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
+ IsNonTemporal(IsNonTemporal) {
+ // There is also no cross address space ordering if the ordering
+ // address space is the same as the instruction address space and
+ // only contains a single address space.
+ if ((OrderingAddrSpace == InstrAddrSpace) &&
+ isPowerOf2_32(uint32_t(InstrAddrSpace)))
+ IsCrossAddressSpaceOrdering = false;
+ }
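
A worked example of the single-address-space check in the constructor above; the bit values are hypothetical but follow the SIAtomicAddrSpace layout (GLOBAL, LDS, SCRATCH as separate bits), and isPowerOf2_32 is the same helper the patch uses.

  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  #include <cstdint>

  inline void crossAddrSpaceExample() {
    const uint32_t Global = 1u << 0;
    const uint32_t Flat = (1u << 0) | (1u << 1) | (1u << 2); // GLOBAL|LDS|SCRATCH

    // Ordering and instruction masks identical and only one bit set: the
    // ordering cannot cross address spaces, so the flag is cleared.
    assert(llvm::isPowerOf2_32(Global));

    // A FLAT access spans three address spaces; even with identical masks the
    // power-of-two test fails and cross-address-space ordering is kept.
    assert(!llvm::isPowerOf2_32(Flat));
  }
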
public:
- /// \returns Synchronization scope ID of the machine instruction used to
+ /// \returns Atomic synchronization scope of the machine instruction used to
/// create this SIMemOpInfo.
- SyncScope::ID getSSID() const {
- return SSID;
+ SIAtomicScope getScope() const {
+ return Scope;
}
+
/// \returns Ordering constraint of the machine instruction used to
/// create this SIMemOpInfo.
AtomicOrdering getOrdering() const {
return Ordering;
}
+
/// \returns Failure ordering constraint of the machine instruction used to
/// create this SIMemOpInfo.
AtomicOrdering getFailureOrdering() const {
return FailureOrdering;
}
+
+ /// \returns The address spaces accessed by the machine
+ /// instruction used to create this SIMemOpInfo.
+ SIAtomicAddrSpace getInstrAddrSpace() const {
+ return InstrAddrSpace;
+ }
+
+ /// \returns The address spaces that must be ordered by the machine
+ /// instruction used to create this SIMemOpInfo.
+ SIAtomicAddrSpace getOrderingAddrSpace() const {
+ return OrderingAddrSpace;
+ }
+
+ /// \returns True iff memory ordering of operations on
+ /// different address spaces is required.
+ bool getIsCrossAddressSpaceOrdering() const {
+ return IsCrossAddressSpaceOrdering;
+ }
+
/// \returns True if memory access of the machine instruction used to
/// create this SIMemOpInfo is non-temporal, false otherwise.
bool isNonTemporal() const {
@@ -95,109 +198,198 @@ public:
return Ordering != AtomicOrdering::NotAtomic;
}
+};
+
+class SIMemOpAccess final {
+private:
+
+ AMDGPUAS SIAddrSpaceInfo;
+ AMDGPUMachineModuleInfo *MMI = nullptr;
+
+ /// Reports unsupported message \p Msg for \p MI to LLVM context.
+ void reportUnsupported(const MachineBasicBlock::iterator &MI,
+ const char *Msg) const;
+
+ /// Inspects the target synchronization scope \p SSID and determines
+ /// the SI atomic scope it corresponds to, the address spaces it
+ /// covers, and whether the memory ordering applies between address
+ /// spaces.
+ Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
+ toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const;
+
+ /// \returns A bit set of the address spaces accessed by \p AS.
+ SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
+
+ /// \returns Info constructed from \p MI, which has at least machine memory
+ /// operand.
+ Optional<SIMemOpInfo> constructFromMIWithMMO(
+ const MachineBasicBlock::iterator &MI) const;
+
+public:
+ /// Construct class to support accessing the machine memory operands
+ /// of instructions in the machine function \p MF.
+ SIMemOpAccess(MachineFunction &MF);
+
/// \returns Load info if \p MI is a load operation, "None" otherwise.
- static Optional<SIMemOpInfo> getLoadInfo(
- const MachineBasicBlock::iterator &MI);
+ Optional<SIMemOpInfo> getLoadInfo(
+ const MachineBasicBlock::iterator &MI) const;
+
/// \returns Store info if \p MI is a store operation, "None" otherwise.
- static Optional<SIMemOpInfo> getStoreInfo(
- const MachineBasicBlock::iterator &MI);
+ Optional<SIMemOpInfo> getStoreInfo(
+ const MachineBasicBlock::iterator &MI) const;
+
/// \returns Atomic fence info if \p MI is an atomic fence operation,
/// "None" otherwise.
- static Optional<SIMemOpInfo> getAtomicFenceInfo(
- const MachineBasicBlock::iterator &MI);
- /// \returns Atomic cmpxchg info if \p MI is an atomic cmpxchg operation,
- /// "None" otherwise.
- static Optional<SIMemOpInfo> getAtomicCmpxchgInfo(
- const MachineBasicBlock::iterator &MI);
- /// \returns Atomic rmw info if \p MI is an atomic rmw operation,
- /// "None" otherwise.
- static Optional<SIMemOpInfo> getAtomicRmwInfo(
- const MachineBasicBlock::iterator &MI);
+ Optional<SIMemOpInfo> getAtomicFenceInfo(
+ const MachineBasicBlock::iterator &MI) const;
- /// \brief Reports unknown synchronization scope used in \p MI to LLVM
- /// context.
- static void reportUnknownSyncScope(
- const MachineBasicBlock::iterator &MI);
+ /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
+ /// rmw operation, "None" otherwise.
+ Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
+ const MachineBasicBlock::iterator &MI) const;
};
-class SIMemoryLegalizer final : public MachineFunctionPass {
-private:
- /// \brief Machine module info.
- const AMDGPUMachineModuleInfo *MMI = nullptr;
+class SICacheControl {
+protected:
- /// \brief Instruction info.
+ /// Instruction info.
const SIInstrInfo *TII = nullptr;
- /// \brief Immediate for "vmcnt(0)".
- unsigned Vmcnt0Immediate = 0;
+ IsaInfo::IsaVersion IV;
- /// \brief Opcode for cache invalidation instruction (L1).
- unsigned Wbinvl1Opcode = 0;
+ SICacheControl(const GCNSubtarget &ST);
- /// \brief List of atomic pseudo instructions.
- std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
+public:
- /// \brief Sets named bit (BitName) to "true" if present in \p MI. Returns
- /// true if \p MI is modified, false otherwise.
- template <uint16_t BitName>
- bool enableNamedBit(const MachineBasicBlock::iterator &MI) const {
- int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
- if (BitIdx == -1)
- return false;
+ /// Create a cache control for the subtarget \p ST.
+ static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
+
+ /// Update \p MI memory load instruction to bypass any caches up to
+ /// the \p Scope memory scope for address spaces \p
+ /// AddrSpace. Return true iff the instruction was modified.
+ virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const = 0;
+
+ /// Update \p MI memory instruction to indicate it is
+ /// nontemporal. Return true iff the instruction was modified.
+ virtual bool enableNonTemporal(const MachineBasicBlock::iterator &MI)
+ const = 0;
+
+ /// Inserts any necessary instructions at position \p Pos relative
+ /// to instruction \p MI to ensure any caches associated with
+ /// address spaces \p AddrSpace for memory scopes up to memory scope
+ /// \p Scope are invalidated. Returns true iff any instructions
+ /// inserted.
+ virtual bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const = 0;
+
+ /// Inserts any necessary instructions at position \p Pos relative
+ /// to instruction \p MI to ensure memory instructions of kind \p Op
+ /// associated with address spaces \p AddrSpace have completed as
+ /// observed by other memory instructions executing in memory scope
+ /// \p Scope. \p IsCrossAddrSpaceOrdering indicates if the memory
+ /// ordering is between address spaces. Returns true iff any
+ /// instructions inserted.
+ virtual bool insertWait(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ SIMemOp Op,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const = 0;
+
+ /// Virtual destructor to allow derivations to be deleted.
+ virtual ~SICacheControl() = default;
- MachineOperand &Bit = MI->getOperand(BitIdx);
- if (Bit.getImm() != 0)
- return false;
+};
- Bit.setImm(1);
- return true;
- }
+class SIGfx6CacheControl : public SICacheControl {
+protected:
- /// \brief Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
+ /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
/// is modified, false otherwise.
bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
return enableNamedBit<AMDGPU::OpName::glc>(MI);
}
- /// \brief Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
+ /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
/// is modified, false otherwise.
bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
return enableNamedBit<AMDGPU::OpName::slc>(MI);
}
- /// \brief Inserts "buffer_wbinvl1_vol" instruction \p Before or after \p MI.
- /// Always returns true.
- bool insertBufferWbinvl1Vol(MachineBasicBlock::iterator &MI,
- bool Before = true) const;
- /// \brief Inserts "s_waitcnt vmcnt(0)" instruction \p Before or after \p MI.
- /// Always returns true.
- bool insertWaitcntVmcnt0(MachineBasicBlock::iterator &MI,
- bool Before = true) const;
+public:
+
+ SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {};
+
+ bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override;
+
+ bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;
+
+ bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const override;
+
+ bool insertWait(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ SIMemOp Op,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const override;
+};
+
+class SIGfx7CacheControl : public SIGfx6CacheControl {
+public:
- /// \brief Removes all processed atomic pseudo instructions from the current
+ SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {};
+
+ bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const override;
+
+};
+
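
A hedged sketch of how a client such as the legalizer's expandLoad might drive this interface for an acquire load; the calls use the virtual methods declared above, but the surrounding control flow is illustrative, not the patch's actual implementation.

  bool expandAcquireLoadSketch(const SIMemOpInfo &MOI,
                               MachineBasicBlock::iterator &MI,
                               SICacheControl &CC) {
    bool Changed = false;
    // Bypass caches up to the requested scope so the load sees remote writes.
    Changed |= CC.enableLoadCacheBypass(MI, MOI.getScope(),
                                        MOI.getOrderingAddrSpace());
    // After the load, wait for it to complete and drop stale cache lines so
    // that subsequent accesses observe writes made before the paired release.
    Changed |= CC.insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
                             SIMemOp::LOAD,
                             MOI.getIsCrossAddressSpaceOrdering(),
                             Position::AFTER);
    Changed |= CC.insertCacheInvalidate(MI, MOI.getScope(),
                                        MOI.getOrderingAddrSpace(),
                                        Position::AFTER);
    return Changed;
  }
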
+class SIMemoryLegalizer final : public MachineFunctionPass {
+private:
+
+ /// Cache Control.
+ std::unique_ptr<SICacheControl> CC = nullptr;
+
+ /// List of atomic pseudo instructions.
+ std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
+
+ /// Return true iff instruction \p MI is an atomic instruction that
+ /// returns a result.
+ bool isAtomicRet(const MachineInstr &MI) const {
+ return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1;
+ }
+
+ /// Removes all processed atomic pseudo instructions from the current
/// function. Returns true if current function is modified, false otherwise.
bool removeAtomicPseudoMIs();
- /// \brief Expands load operation \p MI. Returns true if instructions are
+ /// Expands load operation \p MI. Returns true if instructions are
/// added/deleted or \p MI is modified, false otherwise.
bool expandLoad(const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI);
- /// \brief Expands store operation \p MI. Returns true if instructions are
+ /// Expands store operation \p MI. Returns true if instructions are
/// added/deleted or \p MI is modified, false otherwise.
bool expandStore(const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI);
- /// \brief Expands atomic fence operation \p MI. Returns true if
+ /// Expands atomic fence operation \p MI. Returns true if
/// instructions are added/deleted or \p MI is modified, false otherwise.
bool expandAtomicFence(const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI);
- /// \brief Expands atomic cmpxchg operation \p MI. Returns true if
- /// instructions are added/deleted or \p MI is modified, false otherwise.
- bool expandAtomicCmpxchg(const SIMemOpInfo &MOI,
- MachineBasicBlock::iterator &MI);
- /// \brief Expands atomic rmw operation \p MI. Returns true if
+ /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
/// instructions are added/deleted or \p MI is modified, false otherwise.
- bool expandAtomicRmw(const SIMemOpInfo &MOI,
- MachineBasicBlock::iterator &MI);
+ bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
+ MachineBasicBlock::iterator &MI);
public:
static char ID;
@@ -218,48 +410,129 @@ public:
} // end namespace anonymous
-/* static */
-Optional<SIMemOpInfo> SIMemOpInfo::constructFromMIWithMMO(
- const MachineBasicBlock::iterator &MI) {
- assert(MI->getNumMemOperands() > 0);
+void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
+ const char *Msg) const {
+ const Function &Func = MI->getParent()->getParent()->getFunction();
+ DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
+ Func.getContext().diagnose(Diag);
+}
- const MachineFunction *MF = MI->getParent()->getParent();
- const AMDGPUMachineModuleInfo *MMI =
- &MF->getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
+Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
+SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
+ SIAtomicAddrSpace InstrScope) const {
+ /// TODO: For now assume OpenCL memory model which treats each
+ /// address space as having a separate happens-before relation, and
+ /// so an instruction only has ordering with respect to the address
+ /// space it accesses, and if it accesses multiple address spaces it
+ /// does not require ordering of operations in different address
+ /// spaces.
+ if (SSID == SyncScope::System)
+ return std::make_tuple(SIAtomicScope::SYSTEM,
+ SIAtomicAddrSpace::ATOMIC & InstrScope,
+ false);
+ if (SSID == MMI->getAgentSSID())
+ return std::make_tuple(SIAtomicScope::AGENT,
+ SIAtomicAddrSpace::ATOMIC & InstrScope,
+ false);
+ if (SSID == MMI->getWorkgroupSSID())
+ return std::make_tuple(SIAtomicScope::WORKGROUP,
+ SIAtomicAddrSpace::ATOMIC & InstrScope,
+ false);
+ if (SSID == MMI->getWavefrontSSID())
+ return std::make_tuple(SIAtomicScope::WAVEFRONT,
+ SIAtomicAddrSpace::ATOMIC & InstrScope,
+ false);
+ if (SSID == SyncScope::SingleThread)
+ return std::make_tuple(SIAtomicScope::SINGLETHREAD,
+ SIAtomicAddrSpace::ATOMIC & InstrScope,
+ false);
+ /// TODO: To support the HSA Memory Model, additional memory scopes
+ /// that do require cross address space ordering need to be added.
+ return None;
+}
+
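For context, the scope IDs compared above ("agent", "workgroup", "wavefront") are target sync scopes registered with the LLVMContext. A minimal sketch, assuming only the public LLVMContext API and the scope names this target uses, of how such IDs are obtained:

#include "llvm/IR/LLVMContext.h"

// Sketch: look up (or register) the named AMDGPU sync scopes so that MMO
// sync scope IDs can later be compared against them, as toSIAtomicScope does.
static void lookupAMDGPUSyncScopes(llvm::LLVMContext &Ctx) {
  llvm::SyncScope::ID AgentSSID = Ctx.getOrInsertSyncScopeID("agent");
  llvm::SyncScope::ID WorkgroupSSID = Ctx.getOrInsertSyncScopeID("workgroup");
  llvm::SyncScope::ID WavefrontSSID = Ctx.getOrInsertSyncScopeID("wavefront");
  (void)AgentSSID; (void)WorkgroupSSID; (void)WavefrontSSID;
}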
+SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
+ if (AS == SIAddrSpaceInfo.FLAT_ADDRESS)
+ return SIAtomicAddrSpace::FLAT;
+ if (AS == SIAddrSpaceInfo.GLOBAL_ADDRESS)
+ return SIAtomicAddrSpace::GLOBAL;
+ if (AS == SIAddrSpaceInfo.LOCAL_ADDRESS)
+ return SIAtomicAddrSpace::LDS;
+ if (AS == SIAddrSpaceInfo.PRIVATE_ADDRESS)
+ return SIAtomicAddrSpace::SCRATCH;
+ if (AS == SIAddrSpaceInfo.REGION_ADDRESS)
+ return SIAtomicAddrSpace::GDS;
+
+ return SIAtomicAddrSpace::OTHER;
+}
+
+SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
+ SIAddrSpaceInfo = getAMDGPUAS(MF.getTarget());
+ MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
+}
+
+Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
+ const MachineBasicBlock::iterator &MI) const {
+ assert(MI->getNumMemOperands() > 0);
SyncScope::ID SSID = SyncScope::SingleThread;
AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
+ SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
bool IsNonTemporal = true;
// Validator should check whether or not MMOs cover the entire set of
// locations accessed by the memory instruction.
for (const auto &MMO : MI->memoperands()) {
- const auto &IsSyncScopeInclusion =
- MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
- if (!IsSyncScopeInclusion) {
- reportUnknownSyncScope(MI);
- return None;
- }
-
- SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
- Ordering =
- isStrongerThan(Ordering, MMO->getOrdering()) ?
- Ordering : MMO->getOrdering();
- FailureOrdering =
- isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
- FailureOrdering : MMO->getFailureOrdering();
+ IsNonTemporal &= MMO->isNonTemporal();
+ InstrAddrSpace |=
+ toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
+ AtomicOrdering OpOrdering = MMO->getOrdering();
+ if (OpOrdering != AtomicOrdering::NotAtomic) {
+ const auto &IsSyncScopeInclusion =
+ MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
+ if (!IsSyncScopeInclusion) {
+ reportUnsupported(MI,
+ "Unsupported non-inclusive atomic synchronization scope");
+ return None;
+ }
- if (!(MMO->getFlags() & MachineMemOperand::MONonTemporal))
- IsNonTemporal = false;
+ SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
+ Ordering =
+ isStrongerThan(Ordering, OpOrdering) ?
+ Ordering : MMO->getOrdering();
+ assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
+ MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
+ FailureOrdering =
+ isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
+ FailureOrdering : MMO->getFailureOrdering();
+ }
}
- return SIMemOpInfo(SSID, Ordering, FailureOrdering, IsNonTemporal);
+ SIAtomicScope Scope = SIAtomicScope::NONE;
+ SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
+ bool IsCrossAddressSpaceOrdering = false;
+ if (Ordering != AtomicOrdering::NotAtomic) {
+ auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
+ if (!ScopeOrNone) {
+ reportUnsupported(MI, "Unsupported atomic synchronization scope");
+ return None;
+ }
+ std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
+ ScopeOrNone.getValue();
+ if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
+ ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
+ reportUnsupported(MI, "Unsupported atomic address space");
+ return None;
+ }
+ }
+ return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
+ IsCrossAddressSpaceOrdering, FailureOrdering, IsNonTemporal);
}
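The loop above reduces the orderings of all memory operands to the strongest one seen. A small self-contained sketch of that reduction, with hypothetical operand orderings and using only llvm/Support/AtomicOrdering.h:

#include "llvm/Support/AtomicOrdering.h"
#include <initializer_list>

// Sketch: fold a list of per-operand orderings down to the strongest one,
// mirroring the isStrongerThan-based merge in constructFromMIWithMMO.
static llvm::AtomicOrdering
mergeOrderings(std::initializer_list<llvm::AtomicOrdering> Ops) {
  llvm::AtomicOrdering Merged = llvm::AtomicOrdering::NotAtomic;
  for (llvm::AtomicOrdering Op : Ops)
    if (Op != llvm::AtomicOrdering::NotAtomic)
      Merged = llvm::isStrongerThan(Merged, Op) ? Merged : Op;
  return Merged;
}
// For example, mergeOrderings({llvm::AtomicOrdering::Monotonic,
// llvm::AtomicOrdering::Acquire}) yields Acquire for that hypothetical pair.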
-/* static */
-Optional<SIMemOpInfo> SIMemOpInfo::getLoadInfo(
- const MachineBasicBlock::iterator &MI) {
+Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
+ const MachineBasicBlock::iterator &MI) const {
assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
if (!(MI->mayLoad() && !MI->mayStore()))
@@ -267,15 +540,13 @@ Optional<SIMemOpInfo> SIMemOpInfo::getLoadInfo(
// Be conservative if there are no memory operands.
if (MI->getNumMemOperands() == 0)
- return SIMemOpInfo(SyncScope::System,
- AtomicOrdering::SequentiallyConsistent);
+ return SIMemOpInfo();
- return SIMemOpInfo::constructFromMIWithMMO(MI);
+ return constructFromMIWithMMO(MI);
}
-/* static */
-Optional<SIMemOpInfo> SIMemOpInfo::getStoreInfo(
- const MachineBasicBlock::iterator &MI) {
+Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
+ const MachineBasicBlock::iterator &MI) const {
assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
if (!(!MI->mayLoad() && MI->mayStore()))
@@ -283,30 +554,46 @@ Optional<SIMemOpInfo> SIMemOpInfo::getStoreInfo(
// Be conservative if there are no memory operands.
if (MI->getNumMemOperands() == 0)
- return SIMemOpInfo(SyncScope::System,
- AtomicOrdering::SequentiallyConsistent);
+ return SIMemOpInfo();
- return SIMemOpInfo::constructFromMIWithMMO(MI);
+ return constructFromMIWithMMO(MI);
}
-/* static */
-Optional<SIMemOpInfo> SIMemOpInfo::getAtomicFenceInfo(
- const MachineBasicBlock::iterator &MI) {
+Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
+ const MachineBasicBlock::iterator &MI) const {
assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
return None;
- SyncScope::ID SSID =
- static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
AtomicOrdering Ordering =
- static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
- return SIMemOpInfo(SSID, Ordering);
+ static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
+
+ SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
+ auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
+ if (!ScopeOrNone) {
+ reportUnsupported(MI, "Unsupported atomic synchronization scope");
+ return None;
+ }
+
+ SIAtomicScope Scope = SIAtomicScope::NONE;
+ SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
+ bool IsCrossAddressSpaceOrdering = false;
+ std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
+ ScopeOrNone.getValue();
+
+ if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
+ ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
+ reportUnsupported(MI, "Unsupported atomic address space");
+ return None;
+ }
+
+ return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
+ IsCrossAddressSpaceOrdering);
}
-/* static */
-Optional<SIMemOpInfo> SIMemOpInfo::getAtomicCmpxchgInfo(
- const MachineBasicBlock::iterator &MI) {
+Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
+ const MachineBasicBlock::iterator &MI) const {
assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
if (!(MI->mayLoad() && MI->mayStore()))
@@ -314,68 +601,251 @@ Optional<SIMemOpInfo> SIMemOpInfo::getAtomicCmpxchgInfo(
// Be conservative if there are no memory operands.
if (MI->getNumMemOperands() == 0)
- return SIMemOpInfo(SyncScope::System,
- AtomicOrdering::SequentiallyConsistent,
- AtomicOrdering::SequentiallyConsistent);
+ return SIMemOpInfo();
- return SIMemOpInfo::constructFromMIWithMMO(MI);
+ return constructFromMIWithMMO(MI);
+}
+
+SICacheControl::SICacheControl(const GCNSubtarget &ST) {
+ TII = ST.getInstrInfo();
+ IV = IsaInfo::getIsaVersion(ST.getFeatureBits());
}
/* static */
-Optional<SIMemOpInfo> SIMemOpInfo::getAtomicRmwInfo(
- const MachineBasicBlock::iterator &MI) {
- assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
+std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
+ GCNSubtarget::Generation Generation = ST.getGeneration();
+ if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ return make_unique<SIGfx6CacheControl>(ST);
+ return make_unique<SIGfx7CacheControl>(ST);
+}
- if (!(MI->mayLoad() && MI->mayStore()))
- return None;
+bool SIGfx6CacheControl::enableLoadCacheBypass(
+ const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const {
+ assert(MI->mayLoad() && !MI->mayStore());
+ bool Changed = false;
- // Be conservative if there are no memory operands.
- if (MI->getNumMemOperands() == 0)
- return SIMemOpInfo(SyncScope::System,
- AtomicOrdering::SequentiallyConsistent);
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ /// TODO: Do not set glc for rmw atomic operations as they
+ /// implicitly bypass the L1 cache.
- return SIMemOpInfo::constructFromMIWithMMO(MI);
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ Changed |= enableGLCBit(MI);
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No cache to bypass.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory caches
+ /// to be bypassed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+
+ return Changed;
}
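The (AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE tests used throughout these hooks rely on SIAtomicAddrSpace being a bitmask enum. A minimal sketch of the same membership test, using a hypothetical stand-in enum rather than the one defined earlier in this file:

#include "llvm/ADT/BitmaskEnum.h"

namespace demo {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

// Sketch: a bitmask enum in the style of SIAtomicAddrSpace, plus the
// "does this access touch the global address space?" test used above.
enum class DemoAddrSpace {
  NONE = 0,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue = */ SCRATCH)
};

inline bool touchesGlobal(DemoAddrSpace AS) {
  return (AS & DemoAddrSpace::GLOBAL) != DemoAddrSpace::NONE;
}

} // namespace demo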
-/* static */
-void SIMemOpInfo::reportUnknownSyncScope(
- const MachineBasicBlock::iterator &MI) {
- DiagnosticInfoUnsupported Diag(MI->getParent()->getParent()->getFunction(),
- "Unsupported synchronization scope");
- LLVMContext *CTX = &MI->getParent()->getParent()->getFunction().getContext();
- CTX->diagnose(Diag);
+bool SIGfx6CacheControl::enableNonTemporal(
+ const MachineBasicBlock::iterator &MI) const {
+ assert(MI->mayLoad() ^ MI->mayStore());
+ bool Changed = false;
+
+ /// TODO: Do not enableGLCBit if rmw atomic.
+ Changed |= enableGLCBit(MI);
+ Changed |= enableSLCBit(MI);
+
+ return Changed;
}
-bool SIMemoryLegalizer::insertBufferWbinvl1Vol(MachineBasicBlock::iterator &MI,
- bool Before) const {
+bool SIGfx6CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const {
+ bool Changed = false;
+
MachineBasicBlock &MBB = *MI->getParent();
DebugLoc DL = MI->getDebugLoc();
- if (!Before)
+ if (Pos == Position::AFTER)
++MI;
- BuildMI(MBB, MI, DL, TII->get(Wbinvl1Opcode));
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
+ Changed = true;
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No cache to invalidate.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory cache
+ /// to be flushed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
- if (!Before)
+ if (Pos == Position::AFTER)
--MI;
- return true;
+ return Changed;
}
-bool SIMemoryLegalizer::insertWaitcntVmcnt0(MachineBasicBlock::iterator &MI,
- bool Before) const {
+bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ SIMemOp Op,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const {
+ bool Changed = false;
+
MachineBasicBlock &MBB = *MI->getParent();
DebugLoc DL = MI->getDebugLoc();
- if (!Before)
+ if (Pos == Position::AFTER)
++MI;
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Vmcnt0Immediate);
+ bool VMCnt = false;
+ bool LGKMCnt = false;
+ bool EXPCnt = false;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ VMCnt = true;
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // The L1 cache keeps all memory operations in order for
+ // wavefronts in the same work-group.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ case SIAtomicScope::WORKGROUP:
+ // If no cross address space ordering then an LDS waitcnt is not
+ // needed as LDS operations for all waves are executed in a
+ // total global ordering as observed by all waves. Required if
+ // also synchronizing with global/GDS memory as LDS operations
+ // could be reordered with respect to later global/GDS memory
+ // operations of the same wave.
+ LGKMCnt = IsCrossAddrSpaceOrdering;
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // The LDS keeps all memory operations in order for
+ // the same wavefront.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ // If no cross address space ordering then a GDS waitcnt is not
+ // needed as GDS operations for all waves are executed in a
+ // total global ordering as observed by all waves. Required if
+ // also synchronizing with global/LDS memory as GDS operations
+ // could be reordered with respect to later global/LDS memory
+ // operations of the same wave.
+ EXPCnt = IsCrossAddrSpaceOrdering;
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // The GDS keeps all memory operations in order for
+ // the same work-group.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
- if (!Before)
+ if (VMCnt || LGKMCnt || EXPCnt) {
+ unsigned WaitCntImmediate =
+ AMDGPU::encodeWaitcnt(IV,
+ VMCnt ? 0 : getVmcntBitMask(IV),
+ EXPCnt ? 0 : getExpcntBitMask(IV),
+ LGKMCnt ? 0 : getLgkmcntBitMask(IV));
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
+ Changed = true;
+ }
+
+ if (Pos == Position::AFTER)
--MI;
- return true;
+ return Changed;
+}
+
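The S_WAITCNT immediate assembled above waits on a counter by encoding 0 into that counter's field and the counter's full bit mask (meaning no wait) into the others. A hedged sketch of the vmcnt(0)-only case, assuming the AMDGPUBaseInfo.h helpers already used in this file:

#include "Utils/AMDGPUBaseInfo.h"

// Sketch: build an S_WAITCNT immediate that waits only for outstanding
// vector memory operations and leaves expcnt/lgkmcnt unconstrained.
static unsigned
buildVmcnt0Immediate(const llvm::AMDGPU::IsaInfo::IsaVersion &IV) {
  using namespace llvm::AMDGPU;
  return encodeWaitcnt(IV,
                       /*Vmcnt=*/0,
                       /*Expcnt=*/getExpcntBitMask(IV),
                       /*Lgkmcnt=*/getLgkmcntBitMask(IV));
}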
+bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const {
+ bool Changed = false;
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ if (Pos == Position::AFTER)
+ ++MI;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1_VOL));
+ Changed = true;
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No cache to invalidate.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory cache
+ /// to be flushed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+
+ if (Pos == Position::AFTER)
+ --MI;
+
+ return Changed;
}
bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
@@ -396,37 +866,38 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
bool Changed = false;
if (MOI.isAtomic()) {
- if (MOI.getSSID() == SyncScope::System ||
- MOI.getSSID() == MMI->getAgentSSID()) {
- if (MOI.getOrdering() == AtomicOrdering::Acquire ||
- MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
- Changed |= enableGLCBit(MI);
-
- if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
- Changed |= insertWaitcntVmcnt0(MI);
-
- if (MOI.getOrdering() == AtomicOrdering::Acquire ||
- MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
- Changed |= insertWaitcntVmcnt0(MI, false);
- Changed |= insertBufferWbinvl1Vol(MI, false);
- }
-
- return Changed;
+ if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
+ MOI.getOrdering() == AtomicOrdering::Acquire ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
+ Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace());
}
- if (MOI.getSSID() == SyncScope::SingleThread ||
- MOI.getSSID() == MMI->getWorkgroupSSID() ||
- MOI.getSSID() == MMI->getWavefrontSSID()) {
- return Changed;
+ if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
+ Changed |= CC->insertWait(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ SIMemOp::LOAD | SIMemOp::STORE,
+ MOI.getIsCrossAddressSpaceOrdering(),
+ Position::BEFORE);
+
+ if (MOI.getOrdering() == AtomicOrdering::Acquire ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
+ Changed |= CC->insertWait(MI, MOI.getScope(),
+ MOI.getInstrAddrSpace(),
+ SIMemOp::LOAD,
+ MOI.getIsCrossAddressSpaceOrdering(),
+ Position::AFTER);
+ Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ Position::AFTER);
}
- llvm_unreachable("Unsupported synchronization scope");
+ return Changed;
}
// Atomic instructions do not have the nontemporal attribute.
if (MOI.isNonTemporal()) {
- Changed |= enableGLCBit(MI);
- Changed |= enableSLCBit(MI);
+ Changed |= CC->enableNonTemporal(MI);
return Changed;
}
@@ -440,28 +911,20 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
bool Changed = false;
if (MOI.isAtomic()) {
- if (MOI.getSSID() == SyncScope::System ||
- MOI.getSSID() == MMI->getAgentSSID()) {
- if (MOI.getOrdering() == AtomicOrdering::Release ||
- MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
- Changed |= insertWaitcntVmcnt0(MI);
+ if (MOI.getOrdering() == AtomicOrdering::Release ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
+ Changed |= CC->insertWait(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ SIMemOp::LOAD | SIMemOp::STORE,
+ MOI.getIsCrossAddressSpaceOrdering(),
+ Position::BEFORE);
- return Changed;
- }
-
- if (MOI.getSSID() == SyncScope::SingleThread ||
- MOI.getSSID() == MMI->getWorkgroupSSID() ||
- MOI.getSSID() == MMI->getWavefrontSSID()) {
- return Changed;
- }
-
- llvm_unreachable("Unsupported synchronization scope");
+ return Changed;
}
// Atomic instructions do not have the nontemporal attribute.
if (MOI.isNonTemporal()) {
- Changed |= enableGLCBit(MI);
- Changed |= enableSLCBit(MI);
+ Changed |= CC->enableNonTemporal(MI);
return Changed;
}
@@ -472,111 +935,74 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI) {
assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
+ AtomicPseudoMIs.push_back(MI);
bool Changed = false;
if (MOI.isAtomic()) {
- if (MOI.getSSID() == SyncScope::System ||
- MOI.getSSID() == MMI->getAgentSSID()) {
- if (MOI.getOrdering() == AtomicOrdering::Acquire ||
- MOI.getOrdering() == AtomicOrdering::Release ||
- MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
- MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
- Changed |= insertWaitcntVmcnt0(MI);
-
- if (MOI.getOrdering() == AtomicOrdering::Acquire ||
- MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
- MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
- Changed |= insertBufferWbinvl1Vol(MI);
-
- AtomicPseudoMIs.push_back(MI);
- return Changed;
- }
+ if (MOI.getOrdering() == AtomicOrdering::Acquire ||
+ MOI.getOrdering() == AtomicOrdering::Release ||
+ MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
+ /// TODO: This relies on a barrier always generating a waitcnt
+ /// for LDS to ensure it is not reordered with the completion of
+ /// the preceding LDS operations. If the barrier had a memory
+ /// ordering and memory scope, then the library would not need to
+ /// generate a fence. Support for the barrier could be added in this
+ /// file, and SIInsertWaitcnt.cpp could then stop unconditionally
+ /// adding a waitcnt before an S_BARRIER.
+ Changed |= CC->insertWait(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ SIMemOp::LOAD | SIMemOp::STORE,
+ MOI.getIsCrossAddressSpaceOrdering(),
+ Position::BEFORE);
+
+ if (MOI.getOrdering() == AtomicOrdering::Acquire ||
+ MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
+ Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ Position::BEFORE);
- if (MOI.getSSID() == SyncScope::SingleThread ||
- MOI.getSSID() == MMI->getWorkgroupSSID() ||
- MOI.getSSID() == MMI->getWavefrontSSID()) {
- AtomicPseudoMIs.push_back(MI);
- return Changed;
- }
-
- SIMemOpInfo::reportUnknownSyncScope(MI);
- }
-
- return Changed;
-}
-
-bool SIMemoryLegalizer::expandAtomicCmpxchg(const SIMemOpInfo &MOI,
- MachineBasicBlock::iterator &MI) {
- assert(MI->mayLoad() && MI->mayStore());
-
- bool Changed = false;
-
- if (MOI.isAtomic()) {
- if (MOI.getSSID() == SyncScope::System ||
- MOI.getSSID() == MMI->getAgentSSID()) {
- if (MOI.getOrdering() == AtomicOrdering::Release ||
- MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
- MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
- MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
- Changed |= insertWaitcntVmcnt0(MI);
-
- if (MOI.getOrdering() == AtomicOrdering::Acquire ||
- MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
- MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
- MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
- MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
- Changed |= insertWaitcntVmcnt0(MI, false);
- Changed |= insertBufferWbinvl1Vol(MI, false);
- }
-
- return Changed;
- }
-
- if (MOI.getSSID() == SyncScope::SingleThread ||
- MOI.getSSID() == MMI->getWorkgroupSSID() ||
- MOI.getSSID() == MMI->getWavefrontSSID()) {
- Changed |= enableGLCBit(MI);
- return Changed;
- }
-
- llvm_unreachable("Unsupported synchronization scope");
+ return Changed;
}
return Changed;
}
-bool SIMemoryLegalizer::expandAtomicRmw(const SIMemOpInfo &MOI,
- MachineBasicBlock::iterator &MI) {
+bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
+ MachineBasicBlock::iterator &MI) {
assert(MI->mayLoad() && MI->mayStore());
bool Changed = false;
if (MOI.isAtomic()) {
- if (MOI.getSSID() == SyncScope::System ||
- MOI.getSSID() == MMI->getAgentSSID()) {
- if (MOI.getOrdering() == AtomicOrdering::Release ||
- MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
- MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
- Changed |= insertWaitcntVmcnt0(MI);
-
- if (MOI.getOrdering() == AtomicOrdering::Acquire ||
- MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
- MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
- Changed |= insertWaitcntVmcnt0(MI, false);
- Changed |= insertBufferWbinvl1Vol(MI, false);
- }
-
- return Changed;
+ if (MOI.getOrdering() == AtomicOrdering::Release ||
+ MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
+ MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
+ Changed |= CC->insertWait(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ SIMemOp::LOAD | SIMemOp::STORE,
+ MOI.getIsCrossAddressSpaceOrdering(),
+ Position::BEFORE);
+
+ if (MOI.getOrdering() == AtomicOrdering::Acquire ||
+ MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
+ MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
+ MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
+ Changed |= CC->insertWait(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ isAtomicRet(*MI) ? SIMemOp::LOAD :
+ SIMemOp::STORE,
+ MOI.getIsCrossAddressSpaceOrdering(),
+ Position::AFTER);
+ Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ Position::AFTER);
}
- if (MOI.getSSID() == SyncScope::SingleThread ||
- MOI.getSSID() == MMI->getWorkgroupSSID() ||
- MOI.getSSID() == MMI->getWavefrontSSID()) {
- Changed |= enableGLCBit(MI);
- return Changed;
- }
-
- llvm_unreachable("Unsupported synchronization scope");
+ return Changed;
}
return Changed;
@@ -584,32 +1010,23 @@ bool SIMemoryLegalizer::expandAtomicRmw(const SIMemOpInfo &MOI,
bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- const IsaInfo::IsaVersion IV = IsaInfo::getIsaVersion(ST.getFeatureBits());
-
- MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
- TII = ST.getInstrInfo();
- Vmcnt0Immediate =
- AMDGPU::encodeWaitcnt(IV, 0, getExpcntBitMask(IV), getLgkmcntBitMask(IV));
- Wbinvl1Opcode = ST.getGeneration() <= AMDGPUSubtarget::SOUTHERN_ISLANDS ?
- AMDGPU::BUFFER_WBINVL1 : AMDGPU::BUFFER_WBINVL1_VOL;
+ SIMemOpAccess MOA(MF);
+ CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
for (auto &MBB : MF) {
for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
continue;
- if (const auto &MOI = SIMemOpInfo::getLoadInfo(MI))
+ if (const auto &MOI = MOA.getLoadInfo(MI))
Changed |= expandLoad(MOI.getValue(), MI);
- else if (const auto &MOI = SIMemOpInfo::getStoreInfo(MI))
+ else if (const auto &MOI = MOA.getStoreInfo(MI))
Changed |= expandStore(MOI.getValue(), MI);
- else if (const auto &MOI = SIMemOpInfo::getAtomicFenceInfo(MI))
+ else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
Changed |= expandAtomicFence(MOI.getValue(), MI);
- else if (const auto &MOI = SIMemOpInfo::getAtomicCmpxchgInfo(MI))
- Changed |= expandAtomicCmpxchg(MOI.getValue(), MI);
- else if (const auto &MOI = SIMemOpInfo::getAtomicRmwInfo(MI))
- Changed |= expandAtomicRmw(MOI.getValue(), MI);
+ else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
+ Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
}
}
diff --git a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index 2dc6f2702b3b..ebcad30a1866 100644
--- a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -10,6 +10,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -76,7 +77,7 @@ static unsigned isCopyToExec(const MachineInstr &MI) {
case AMDGPU::COPY:
case AMDGPU::S_MOV_B64: {
const MachineOperand &Dst = MI.getOperand(0);
- if (Dst.isReg() && Dst.getReg() == AMDGPU::EXEC)
+ if (Dst.isReg() && Dst.getReg() == AMDGPU::EXEC && MI.getOperand(1).isReg())
return MI.getOperand(1).getReg();
break;
}
@@ -208,7 +209,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
@@ -243,11 +244,11 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
// Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec
if (CopyToExecInst->getOperand(1).isKill() &&
isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) {
- DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst);
+ LLVM_DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst);
PrepareExecInst->getOperand(0).setReg(AMDGPU::EXEC);
- DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');
+ LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');
CopyToExecInst->eraseFromParent();
}
@@ -257,7 +258,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
if (isLiveOut(MBB, CopyToExec)) {
// The copied register is live out and has a second use in another block.
- DEBUG(dbgs() << "Exec copy source register is live out\n");
+ LLVM_DEBUG(dbgs() << "Exec copy source register is live out\n");
continue;
}
@@ -269,7 +270,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
= std::next(CopyFromExecInst->getIterator()), JE = I->getIterator();
J != JE; ++J) {
if (SaveExecInst && J->readsRegister(AMDGPU::EXEC, TRI)) {
- DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
+ LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
// Make sure this is inserted after any VALU ops that may have been
// scheduled in between.
SaveExecInst = nullptr;
@@ -280,8 +281,8 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
if (J->modifiesRegister(CopyToExec, TRI)) {
if (SaveExecInst) {
- DEBUG(dbgs() << "Multiple instructions modify "
- << printReg(CopyToExec, TRI) << '\n');
+ LLVM_DEBUG(dbgs() << "Multiple instructions modify "
+ << printReg(CopyToExec, TRI) << '\n');
SaveExecInst = nullptr;
break;
}
@@ -292,10 +293,11 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
if (ReadsCopyFromExec) {
SaveExecInst = &*J;
- DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
+ LLVM_DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
continue;
} else {
- DEBUG(dbgs() << "Instruction does not read exec copy: " << *J << '\n');
+ LLVM_DEBUG(dbgs()
+ << "Instruction does not read exec copy: " << *J << '\n');
break;
}
} else if (ReadsCopyFromExec && !SaveExecInst) {
@@ -307,8 +309,8 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
// spill %sgpr0_sgpr1
// %sgpr2_sgpr3 = S_AND_B64 %sgpr0_sgpr1
//
- DEBUG(dbgs() << "Found second use of save inst candidate: "
- << *J << '\n');
+ LLVM_DEBUG(dbgs() << "Found second use of save inst candidate: " << *J
+ << '\n');
break;
}
@@ -321,7 +323,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
if (!SaveExecInst)
continue;
- DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');
+ LLVM_DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');
MachineOperand &Src0 = SaveExecInst->getOperand(1);
MachineOperand &Src1 = SaveExecInst->getOperand(2);
diff --git a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
index 83074773c495..7b678d12ba81 100644
--- a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief This pass removes redundant S_OR_B64 instructions enabling lanes in
+/// This pass removes redundant S_OR_B64 instructions enabling lanes in
/// the exec. If two SI_END_CF (lowered as S_OR_B64) come together without any
/// vector instructions between them we can only keep outer SI_END_CF, given
/// that CFG is structured and exec bits of the outer end statement are always
@@ -23,6 +23,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -106,7 +107,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -134,7 +135,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
}
while (I != E) {
- if (I->isDebugValue()) {
+ if (I->isDebugInstr()) {
I = std::next(I);
continue;
}
@@ -143,7 +144,8 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
I->hasUnmodeledSideEffects() || I->hasOrderedMemoryRef())
break;
- DEBUG(dbgs() << "Removing no effect instruction: " << *I << '\n');
+ LLVM_DEBUG(dbgs()
+ << "Removing no effect instruction: " << *I << '\n');
for (auto &Op : I->operands()) {
if (Op.isReg())
@@ -193,7 +195,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
!getOrExecSource(*NextLead, *TII, MRI))
continue;
- DEBUG(dbgs() << "Redundant EXEC = S_OR_B64 found: " << *Lead << '\n');
+ LLVM_DEBUG(dbgs() << "Redundant EXEC = S_OR_B64 found: " << *Lead << '\n');
auto SaveExec = getOrExecSource(*Lead, *TII, MRI);
unsigned SaveExecReg = getOrNonExecReg(*Lead, *TII);
@@ -224,7 +226,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
break;
}
- DEBUG(dbgs() << "Redundant EXEC COPY: " << *SaveExec << '\n');
+ LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *SaveExec << '\n');
}
if (SafeToReplace) {
diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 5ed7fdf220bf..0e000b72962e 100644
--- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -25,6 +25,7 @@
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
@@ -39,6 +40,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
@@ -86,11 +88,11 @@ public:
}
bool runOnMachineFunction(MachineFunction &MF) override;
- void matchSDWAOperands(MachineFunction &MF);
+ void matchSDWAOperands(MachineBasicBlock &MBB);
std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
- bool isConvertibleToSDWA(const MachineInstr &MI, const SISubtarget &ST) const;
+ bool isConvertibleToSDWA(const MachineInstr &MI, const GCNSubtarget &ST) const;
bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
- void legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const;
+ void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
StringRef getPassName() const override { return "SI Peephole SDWA"; }
@@ -218,7 +220,7 @@ FunctionPass *llvm::createSIPeepholeSDWAPass() {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-static raw_ostream& operator<<(raw_ostream &OS, const SdwaSel &Sel) {
+static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
switch(Sel) {
case BYTE_0: OS << "BYTE_0"; break;
case BYTE_1: OS << "BYTE_1"; break;
@@ -366,18 +368,53 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
// Find operand in instruction that matches source operand and replace it with
// target operand. Set corresponding src_sel
-
+ bool IsPreserveSrc = false;
MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
MachineOperand *SrcMods =
TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
assert(Src && (Src->isReg() || Src->isImm()));
if (!isSameReg(*Src, *getReplacedOperand())) {
- // If this is not src0 then it should be src1
+ // If this is not src0 then it could be src1
Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
+ if (!Src ||
+ !isSameReg(*Src, *getReplacedOperand())) {
+ // It's possible this Src is a tied operand for
+ // UNUSED_PRESERVE, in which case we can either
+ // abandon the peephole attempt, or if legal we can
+ // copy the target operand into the tied slot
+ // if the preserve operation will effectively cause the same
+ // result by overwriting the rest of the dst.
+ MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+ MachineOperand *DstUnused =
+ TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
+
+ if (Dst &&
+ DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
+ // This will work if the tied src is accessing WORD_0, and the dst is
+ // writing WORD_1. Modifiers don't matter because all the bits that
+ // would be impacted are being overwritten by the dst.
+ // Any other case will not work.
+ SdwaSel DstSel = static_cast<SdwaSel>(
+ TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
+ if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
+ getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
+ IsPreserveSrc = true;
+ auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::vdst);
+ auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
+ Src = &MI.getOperand(TiedIdx);
+ SrcSel = nullptr;
+ SrcMods = nullptr;
+ } else {
+ // Not legal to convert this src
+ return false;
+ }
+ }
+ }
assert(Src && Src->isReg());
if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
@@ -388,11 +425,14 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
return false;
}
- assert(isSameReg(*Src, *getReplacedOperand()) && SrcSel && SrcMods);
+ assert(isSameReg(*Src, *getReplacedOperand()) &&
+ (IsPreserveSrc || (SrcSel && SrcMods)));
}
copyRegOperand(*Src, *getTargetOperand());
- SrcSel->setImm(getSrcSel());
- SrcMods->setImm(getSrcMods(TII, Src));
+ if (!IsPreserveSrc) {
+ SrcSel->setImm(getSrcSel());
+ SrcMods->setImm(getSrcMods(TII, Src));
+ }
getTargetOperand()->setIsKill(false);
return true;
}
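The WORD_0/WORD_1 restriction above can be pictured with plain integer arithmetic. A small self-contained sketch, with hypothetical values rather than LLVM API, of why a dst_sel:WORD_1 write under UNUSED_PRESERVE leaves the low half that a src_sel:WORD_0 operand reads untouched:

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t TiedOldValue = 0x0000BEEF; // register carried in the tied operand
  uint32_t SdwaResult   = 0x1234;     // 16-bit result produced by the SDWA op
  // dst_sel:WORD_1 with dst_unused:UNUSED_PRESERVE: the result lands in the
  // high half and the low half is preserved from the tied operand, so a
  // source read through src_sel:WORD_0 still sees the original bits.
  uint32_t NewDst = (SdwaResult << 16) | (TiedOldValue & 0xFFFFu);
  std::printf("0x%08X\n", NewDst); // prints 0x1234BEEF
  return 0;
}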
@@ -661,7 +701,7 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
- if (TRI->isPhysicalRegister(Src1->getReg()) ||
+ if (TRI->isPhysicalRegister(ValSrc->getReg()) ||
TRI->isPhysicalRegister(Dst->getReg()))
break;
@@ -739,8 +779,8 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
// TODO: add support for non-SDWA instructions as OtherInst.
// For now this only works with SDWA instructions. For regular instructions
- // there is no way to determine if instruction write only 8/16/24-bit out of
- // full register size and all registers are at min 32-bit wide.
+ // there is no way to determine if the instruction writes only 8/16/24-bit
+ // out of full register size and all registers are at min 32-bit wide.
if (!TII->isSDWA(*OtherInst))
break;
@@ -804,20 +844,18 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
return std::unique_ptr<SDWAOperand>(nullptr);
}
-void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
- for (MachineBasicBlock &MBB : MF) {
- for (MachineInstr &MI : MBB) {
- if (auto Operand = matchSDWAOperand(MI)) {
- DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
- SDWAOperands[&MI] = std::move(Operand);
- ++NumSDWAPatternsFound;
- }
+void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
+ for (MachineInstr &MI : MBB) {
+ if (auto Operand = matchSDWAOperand(MI)) {
+ LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
+ SDWAOperands[&MI] = std::move(Operand);
+ ++NumSDWAPatternsFound;
}
}
}
bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI,
- const SISubtarget &ST) const {
+ const GCNSubtarget &ST) const {
// Check if this is already an SDWA instruction
unsigned Opc = MI.getOpcode();
if (TII->isSDWA(Opc))
@@ -854,11 +892,18 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI,
Opc == AMDGPU::V_MAC_F32_e32))
return false;
+ // FIXME: has SDWA but require handling of implicit VCC use
+ if (Opc == AMDGPU::V_CNDMASK_B32_e32)
+ return false;
+
return true;
}
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
const SDWAOperandsVector &SDWAOperands) {
+
+ LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);
+
// Convert to sdwa
int SDWAOpcode;
unsigned Opcode = MI.getOpcode();
@@ -984,9 +1029,29 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
}
}
- // Apply all sdwa operand pattenrs
+ // Check for a preserved register that needs to be copied.
+ auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
+ if (DstUnused &&
+ DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
+ // We expect, if we are here, that the instruction was already in its SDWA form,
+ // with a tied operand.
+ assert(Dst && Dst->isTied());
+ assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
+ // We also expect a vdst, since sdst can't preserve.
+ auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
+ assert(PreserveDstIdx != -1);
+
+ auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
+ auto Tied = MI.getOperand(TiedIdx);
+
+ SDWAInst.add(Tied);
+ SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
+ }
+
+ // Apply all sdwa operand patterns.
bool Converted = false;
for (auto &Operand : SDWAOperands) {
+ LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
// There should be no intersection between SDWA operands and potential MIs
// e.g.:
// v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
@@ -1007,8 +1072,7 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
return false;
}
- DEBUG(dbgs() << "Convert instruction:" << MI
- << "Into:" << *SDWAInst << '\n');
+ LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
++NumSDWAInstructionsPeepholed;
MI.eraseFromParent();
@@ -1017,7 +1081,8 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
// If an instruction was converted to SDWA it should not have immediates or SGPR
// operands (allowed one SGPR on GFX9). Copy its scalar operands into VGPRs.
-void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const {
+void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
+ const GCNSubtarget &ST) const {
const MCInstrDesc &Desc = TII->get(MI.getOpcode());
unsigned ConstantBusCount = 0;
for (MachineOperand &Op : MI.explicit_uses()) {
@@ -1048,7 +1113,7 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const SISubtarget
}
bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (!ST.hasSDWA() || skipFunction(MF.getFunction()))
return false;
@@ -1058,35 +1123,36 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
TII = ST.getInstrInfo();
// Find all SDWA operands in MF.
- bool Changed = false;
bool Ret = false;
- do {
- matchSDWAOperands(MF);
-
- for (const auto &OperandPair : SDWAOperands) {
- const auto &Operand = OperandPair.second;
- MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
- if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
- PotentialMatches[PotentialMI].push_back(Operand.get());
+ for (MachineBasicBlock &MBB : MF) {
+ bool Changed = false;
+ do {
+ matchSDWAOperands(MBB);
+
+ for (const auto &OperandPair : SDWAOperands) {
+ const auto &Operand = OperandPair.second;
+ MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
+ if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
+ PotentialMatches[PotentialMI].push_back(Operand.get());
+ }
}
- }
- for (auto &PotentialPair : PotentialMatches) {
- MachineInstr &PotentialMI = *PotentialPair.first;
- convertToSDWA(PotentialMI, PotentialPair.second);
- }
-
- PotentialMatches.clear();
- SDWAOperands.clear();
+ for (auto &PotentialPair : PotentialMatches) {
+ MachineInstr &PotentialMI = *PotentialPair.first;
+ convertToSDWA(PotentialMI, PotentialPair.second);
+ }
- Changed = !ConvertedInstructions.empty();
+ PotentialMatches.clear();
+ SDWAOperands.clear();
- if (Changed)
- Ret = true;
+ Changed = !ConvertedInstructions.empty();
- while (!ConvertedInstructions.empty())
- legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
- } while (Changed);
+ if (Changed)
+ Ret = true;
+ while (!ConvertedInstructions.empty())
+ legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
+ } while (Changed);
+ }
return Ret;
}
diff --git a/lib/Target/AMDGPU/SIProgramInfo.h b/lib/Target/AMDGPU/SIProgramInfo.h
new file mode 100644
index 000000000000..383f6b575808
--- /dev/null
+++ b/lib/Target/AMDGPU/SIProgramInfo.h
@@ -0,0 +1,77 @@
+//===--- SIProgramInfo.h ----------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Defines struct to track resource usage for kernels and entry functions.
+///
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIPROGRAMINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_SIPROGRAMINFO_H
+
+#include <cstdint>
+#include <limits>
+namespace llvm {
+
+/// Track resource usage for kernels / entry functions.
+struct SIProgramInfo {
+ // Fields set in PGM_RSRC1 pm4 packet.
+ uint32_t VGPRBlocks = 0;
+ uint32_t SGPRBlocks = 0;
+ uint32_t Priority = 0;
+ uint32_t FloatMode = 0;
+ uint32_t Priv = 0;
+ uint32_t DX10Clamp = 0;
+ uint32_t DebugMode = 0;
+ uint32_t IEEEMode = 0;
+ uint64_t ScratchSize = 0;
+
+ uint64_t ComputePGMRSrc1 = 0;
+
+ // Fields set in PGM_RSRC2 pm4 packet.
+ uint32_t LDSBlocks = 0;
+ uint32_t ScratchBlocks = 0;
+
+ uint64_t ComputePGMRSrc2 = 0;
+
+ uint32_t NumVGPR = 0;
+ uint32_t NumSGPR = 0;
+ uint32_t LDSSize = 0;
+ bool FlatUsed = false;
+
+ // Number of SGPRs that meets the number of waves per execution unit request.
+ uint32_t NumSGPRsForWavesPerEU = 0;
+
+ // Number of VGPRs that meets the number of waves per execution unit request.
+ uint32_t NumVGPRsForWavesPerEU = 0;
+
+ // Fixed SGPR number used to hold wave scratch offset for entire kernel
+ // execution, or std::numeric_limits<uint16_t>::max() if the register is not
+ // used or not known.
+ uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR =
+ std::numeric_limits<uint16_t>::max();
+
+ // Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire
+ // kernel execution, or std::numeric_limits<uint16_t>::max() if the register
+ // is not used or not known.
+ uint16_t DebuggerPrivateSegmentBufferSGPR =
+ std::numeric_limits<uint16_t>::max();
+
+ // Whether there is recursion, dynamic allocas, indirect calls or some other
+ // reason there may be statically unknown stack usage.
+ bool DynamicCallStack = false;
+
+ // Bonus information for debugging.
+ bool VCCUsed = false;
+
+ SIProgramInfo() = default;
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_SIPROGRAMINFO_H
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 65cdc13e03cd..624607f6ea54 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -8,14 +8,16 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief SI implementation of the TargetRegisterInfo class.
+/// SI implementation of the TargetRegisterInfo class.
//
//===----------------------------------------------------------------------===//
#include "SIRegisterInfo.h"
+#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"
@@ -54,7 +56,7 @@ static cl::opt<bool> EnableSpillSGPRToVGPR(
cl::ReallyHidden,
cl::init(true));
-SIRegisterInfo::SIRegisterInfo(const SISubtarget &ST) :
+SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) :
AMDGPURegisterInfo(),
SGPRPressureSets(getNumRegPressureSets()),
VGPRPressureSets(getNumRegPressureSets()),
@@ -101,17 +103,10 @@ SIRegisterInfo::SIRegisterInfo(const SISubtarget &ST) :
VGPRSetID < NumRegPressureSets);
}
-void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const {
- MCRegAliasIterator R(Reg, this, true);
-
- for (; R.isValid(); ++R)
- Reserved.set(*R);
-}
-
unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
const MachineFunction &MF) const {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
@@ -136,7 +131,7 @@ static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) {
unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
const MachineFunction &MF) const {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF));
return AMDGPU::SGPR_32RegClass.getRegister(Reg);
}
@@ -163,6 +158,9 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
+ // Reserve xnack_mask registers - support is not implemented in Codegen.
+ reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);
+
// Reserve Trap Handler registers - support is not implemented in Codegen.
reserveRegisterTuples(Reserved, AMDGPU::TBA);
reserveRegisterTuples(Reserved, AMDGPU::TMA);
@@ -175,7 +173,7 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
@@ -255,7 +253,7 @@ bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
// create a virtual register for it during frame index elimination, so the
// scavenger is directly needed.
return MF.getFrameInfo().hasStackObjects() &&
- MF.getSubtarget<SISubtarget>().hasScalarStores() &&
+ MF.getSubtarget<GCNSubtarget>().hasScalarStores() &&
MF.getInfo<SIMachineFunctionInfo>()->hasSpilledSGPRs();
}
@@ -310,7 +308,7 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
DL = Ins->getDebugLoc();
MachineFunction *MF = MBB->getParent();
- const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
+ const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = Subtarget.getInstrInfo();
if (Offset == 0) {
@@ -339,7 +337,7 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
MachineBasicBlock *MBB = MI.getParent();
MachineFunction *MF = MBB->getParent();
- const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
+ const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = Subtarget.getInstrInfo();
#ifndef NDEBUG
@@ -526,7 +524,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
RegScavenger *RS) const {
MachineBasicBlock *MBB = MI->getParent();
MachineFunction *MF = MI->getParent()->getParent();
- const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const MachineFrameInfo &MFI = MF->getFrameInfo();
@@ -534,22 +532,29 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
const DebugLoc &DL = MI->getDebugLoc();
bool IsStore = Desc.mayStore();
- bool RanOutOfSGPRs = false;
bool Scavenged = false;
unsigned SOffset = ScratchOffsetReg;
+ const unsigned EltSize = 4;
const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
- unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / 32;
- unsigned Size = NumSubRegs * 4;
+ unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / (EltSize * CHAR_BIT);
+ unsigned Size = NumSubRegs * EltSize;
int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
- const int64_t OriginalImmOffset = Offset;
+ int64_t ScratchOffsetRegDelta = 0;
unsigned Align = MFI.getObjectAlignment(Index);
const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
- if (!isUInt<12>(Offset + Size)) {
+ assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset");
+
+ if (!isUInt<12>(Offset + Size - EltSize)) {
SOffset = AMDGPU::NoRegister;
+ // We currently only support spilling VGPRs to EltSize boundaries, meaning
+ // we can simplify the adjustment of Offset here to just scale with
+ // WavefrontSize.
+ Offset *= ST.getWavefrontSize();
+
// We don't have access to the register scavenger if this function is called
// during PEI::scavengeFrameVirtualRegs().
if (RS)
@@ -563,8 +568,8 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
// add the offset directly to the ScratchOffset register, and then
// subtract the offset after the spill to return ScratchOffset to it's
// original value.
- RanOutOfSGPRs = true;
SOffset = ScratchOffsetReg;
+ ScratchOffsetRegDelta = Offset;
} else {
Scavenged = true;
}
@@ -576,8 +581,6 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
Offset = 0;
}
- const unsigned EltSize = 4;
-
for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) {
unsigned SubReg = NumSubRegs == 1 ?
ValueReg : getSubReg(ValueReg, getSubRegFromChannel(i));
@@ -609,11 +612,11 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
}
- if (RanOutOfSGPRs) {
+ if (ScratchOffsetRegDelta != 0) {
// Subtract the offset we added to the ScratchOffset register.
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg)
- .addReg(ScratchOffsetReg)
- .addImm(OriginalImmOffset);
+ .addReg(ScratchOffsetReg)
+ .addImm(ScratchOffsetRegDelta);
}
}
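The Offset *= ST.getWavefrontSize() step above converts a per-lane byte offset into whole-wave bytes before it is folded into the scratch wave offset register (and subtracted again afterwards, as the S_SUB_U32 shows). A tiny arithmetic sketch, assuming a wavefront size of 64 and hypothetical offsets:

#include <cstdint>
#include <cstdio>

int main() {
  const int64_t WavefrontSize = 64; // assumption: wave64 GCN hardware
  int64_t PerLaneOffset = 8192;     // per-lane byte offset of the spill slot
  // Too large for the 12-bit MUBUF immediate, so the offset is applied to the
  // scratch wave offset register after scaling to whole-wave bytes.
  int64_t ScratchOffsetRegDelta = PerLaneOffset * WavefrontSize;
  std::printf("delta added to (and later subtracted from) soffset: %lld\n",
              (long long)ScratchOffsetRegDelta);
  return 0;
}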
@@ -640,6 +643,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
MachineBasicBlock *MBB = MI->getParent();
MachineFunction *MF = MBB->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ DenseSet<unsigned> SGPRSpillVGPRDefinedSet;
ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
= MFI->getSGPRToVGPRSpills(Index);
@@ -648,7 +652,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
return false;
MachineRegisterInfo &MRI = MF->getRegInfo();
- const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
unsigned SuperReg = MI->getOperand(0).getReg();
@@ -661,6 +665,10 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
if (SpillToSMEM && OnlyToVGPR)
return false;
+ assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() &&
+ SuperReg != MFI->getFrameOffsetReg() &&
+ SuperReg != MFI->getScratchWaveOffsetReg()));
+
assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
unsigned OffsetReg = AMDGPU::M0;
@@ -736,11 +744,21 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
if (SpillToVGPR) {
SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
+ // During SGPR spilling to VGPR, determine if the VGPR is defined. The
+ // only circumstance in which we say it is undefined is when it is the
+ // first spill to this VGPR in the first basic block.
+ bool VGPRDefined = true;
+ if (MBB == &MF->front())
+ VGPRDefined = !SGPRSpillVGPRDefinedSet.insert(Spill.VGPR).second;
+
+ // Mark the "old value of vgpr" input undef only if this is the first sgpr
+ // spill to this specific vgpr in the first basic block.
BuildMI(*MBB, MI, DL,
TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
Spill.VGPR)
.addReg(SubReg, getKillRegState(IsKill))
- .addImm(Spill.Lane);
+ .addImm(Spill.Lane)
+ .addReg(Spill.VGPR, VGPRDefined ? 0 : RegState::Undef);
// FIXME: Since this spills to another register instead of an actual
// frame index, we should delete the frame index when all references to
@@ -812,7 +830,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
return false;
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
- const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const DebugLoc &DL = MI->getDebugLoc();
@@ -972,7 +990,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
MachineBasicBlock *MBB = MI->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
- const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
@@ -1051,8 +1069,8 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
// Convert to an absolute stack address by finding the offset from the
// scratch wave base and scaling by the wave size.
//
- // In an entry function/kernel the stack address is already the absolute
- // address relative to the the scratch wave offset.
+ // In an entry function/kernel the stack address is already the
+ // absolute address relative to the scratch wave offset.
unsigned DiffReg
= MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
@@ -1219,6 +1237,8 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
&AMDGPU::VReg_512RegClass,
&AMDGPU::SReg_512RegClass,
&AMDGPU::SCC_CLASSRegClass,
+ &AMDGPU::Pseudo_SReg_32RegClass,
+ &AMDGPU::Pseudo_SReg_128RegClass,
};
for (const TargetRegisterClass *BaseClass : BaseClasses) {
@@ -1355,7 +1375,7 @@ bool SIRegisterInfo::shouldRewriteCopySrc(
return getCommonSubClass(DefRC, SrcRC) != nullptr;
}
-/// \brief Returns a register that is not used at any point in the function.
+/// Returns a register that is not used at any point in the function.
/// If all registers are used, then this function will return
// AMDGPU::NoRegister.
unsigned
@@ -1483,7 +1503,9 @@ SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
unsigned Reg) const {
- return hasVGPRs(getRegClassForReg(MRI, Reg));
+ const TargetRegisterClass * RC = getRegClassForReg(MRI, Reg);
+ assert(RC && "Register class for the reg not found");
+ return hasVGPRs(RC);
}
bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
@@ -1510,7 +1532,7 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
@@ -1545,3 +1567,34 @@ const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
return Empty;
return AMDGPURegisterInfo::getRegUnitPressureSets(RegUnit);
}
+
+unsigned SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
+ // Not a callee saved register.
+ return AMDGPU::SGPR30_SGPR31;
+}
+
+const TargetRegisterClass *
+SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
+ const MachineRegisterInfo &MRI) const {
+ unsigned Size = getRegSizeInBits(MO.getReg(), MRI);
+ const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg());
+ if (!RB)
+ return nullptr;
+
+ switch (Size) {
+ case 32:
+ return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
+ &AMDGPU::SReg_32_XM0RegClass;
+ case 64:
+ return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass :
+ &AMDGPU::SReg_64_XEXECRegClass;
+ case 96:
+ return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_96RegClass :
+ nullptr;
+ case 128:
+ return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass :
+ &AMDGPU::SReg_128RegClass;
+ default:
+ llvm_unreachable("not implemented");
+ }
+}
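
For readers following the spillSGPR change above: the first write of a lane into a given spill VGPR in the entry block is the only point where the VGPR's previous contents are genuinely undefined, so only that first V_WRITELANE marks its "old value of vgpr" input undef. A minimal standalone sketch of that bookkeeping, using simplified stand-in types rather than the actual LLVM MIR API:

    #include <set>

    // Hypothetical, simplified stand-ins for the MIR types used by the patch.
    struct Spill { unsigned VGPR; unsigned Lane; };

    // Tracks which spill VGPRs have already received a lane in the entry block.
    static std::set<unsigned> SeenInEntryBlock;

    // Returns true if the VGPR must be treated as already defined, i.e. the
    // writelane may NOT mark its previous value undef.
    bool vgprAlreadyDefined(const Spill &S, bool InEntryBlock) {
      if (!InEntryBlock)
        return true;  // conservatively treat it as defined elsewhere
      // insert() reports whether the element was newly inserted; a successful
      // insert means this is the first spill into this VGPR in the entry
      // block, so its previous value is undefined.
      return !SeenInEntryBlock.insert(S.VGPR).second;
    }
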
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h
index bf814b6974a8..5a51b67ca719 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Interface definition for SIRegisterInfo
+/// Interface definition for SIRegisterInfo
//
//===----------------------------------------------------------------------===//
@@ -16,15 +16,14 @@
#define LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H
#include "AMDGPURegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
namespace llvm {
+class GCNSubtarget;
class LiveIntervals;
class MachineRegisterInfo;
-class SISubtarget;
class SIMachineFunctionInfo;
class SIRegisterInfo final : public AMDGPURegisterInfo {
@@ -36,11 +35,10 @@ private:
bool SpillSGPRToVGPR;
bool SpillSGPRToSMEM;
- void reserveRegisterTuples(BitVector &, unsigned Reg) const;
void classifyPressureSet(unsigned PSetID, unsigned Reg,
BitVector &PressureSets) const;
public:
- SIRegisterInfo(const SISubtarget &ST);
+ SIRegisterInfo(const GCNSubtarget &ST);
bool spillSGPRToVGPR() const {
return SpillSGPRToVGPR;
@@ -126,7 +124,7 @@ public:
return getEncodingValue(Reg) & 0xff;
}
- /// \brief Return the 'base' register class for this register.
+ /// Return the 'base' register class for this register.
/// e.g. SGPR0 => SReg_32, VGPR => VGPR_32 SGPR0_SGPR1 -> SReg_32, etc.
const TargetRegisterClass *getPhysRegClass(unsigned Reg) const;
@@ -224,10 +222,11 @@ public:
const int *getRegUnitPressureSets(unsigned RegUnit) const override;
- unsigned getReturnAddressReg(const MachineFunction &MF) const {
- // Not a callee saved register.
- return AMDGPU::SGPR30_SGPR31;
- }
+ unsigned getReturnAddressReg(const MachineFunction &MF) const;
+
+ const TargetRegisterClass *
+ getConstrainedRegClassForOperand(const MachineOperand &MO,
+ const MachineRegisterInfo &MRI) const override;
private:
void buildSpillLoadStore(MachineBasicBlock::iterator MI,
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td
index dd0efef7f91b..f87a0763b353 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -76,6 +76,16 @@ def SRC_SHARED_LIMIT : SIReg<"src_shared_limit", 236>;
def SRC_PRIVATE_BASE : SIReg<"src_private_base", 237>;
def SRC_PRIVATE_LIMIT : SIReg<"src_private_limit", 238>;
+def XNACK_MASK_LO : SIReg<"xnack_mask_lo", 104>;
+def XNACK_MASK_HI : SIReg<"xnack_mask_hi", 105>;
+
+def XNACK_MASK : RegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]>,
+ DwarfRegAlias<XNACK_MASK_LO> {
+ let Namespace = "AMDGPU";
+ let SubRegIndices = [sub0, sub1];
+ let HWEncoding = 104;
+}
+
// Trap handler registers
def TBA_LO : SIReg<"tba_lo", 108>;
def TBA_HI : SIReg<"tba_hi", 109>;
@@ -394,7 +404,7 @@ def Pseudo_SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16],
let CopyCost = -1;
}
-def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64], 32,
+def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64, v2f64], 32,
(add PRIVATE_RSRC_REG)> {
let isAllocatable = 0;
let CopyCost = -1;
@@ -403,7 +413,7 @@ def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64], 32,
// Subset of SReg_32 without M0 for SMRD instructions and alike.
// See comments in SIInstructions.td for more info.
def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI,
+ (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI,
TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT,
SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT)> {
let AllocationPriority = 7;
@@ -425,22 +435,22 @@ def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
let AllocationPriority = 7;
}
-def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)> {
+def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add SGPR_64Regs)> {
let CopyCost = 1;
let AllocationPriority = 8;
}
-def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add TTMP_64Regs)> {
+def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add TTMP_64Regs)> {
let isAllocatable = 0;
}
-def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32,
- (add SGPR_64, VCC, FLAT_SCR, TTMP_64, TBA, TMA)> {
+def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32,
+ (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA)> {
let CopyCost = 1;
let AllocationPriority = 8;
}
-def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32,
+def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32,
(add SReg_64_XEXEC, EXEC)> {
let CopyCost = 1;
let AllocationPriority = 8;
@@ -457,7 +467,7 @@ def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128R
let isAllocatable = 0;
}
-def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32,
+def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64, v2f64], 32,
(add SGPR_128, TTMP_128)> {
let AllocationPriority = 10;
}
@@ -495,7 +505,7 @@ def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
}
// Register class for all vector registers (VGPRs + Interploation Registers)
-def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 32, (add VGPR_64)> {
+def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32, (add VGPR_64)> {
let Size = 64;
// Requires 2 v_mov_b32 to copy
diff --git a/lib/Target/AMDGPU/SISchedule.td b/lib/Target/AMDGPU/SISchedule.td
index 0f02f5825cb0..7af69cb6a46d 100644
--- a/lib/Target/AMDGPU/SISchedule.td
+++ b/lib/Target/AMDGPU/SISchedule.td
@@ -46,7 +46,7 @@ def Write64Bit : SchedWrite;
// instructions)
class SISchedMachineModel : SchedMachineModel {
- let CompleteModel = 1;
+ let CompleteModel = 0;
// MicroOpBufferSize = 1 means that instructions will always be added
// the ready queue when they become available. This exposes them
// to the register pressure analysis.
diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 41f989ad3228..4189bcce52ea 100644
--- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -10,9 +10,9 @@
//
#include "AMDGPU.h"
-#include "AMDGPUMCInstLower.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -64,17 +64,6 @@ FunctionPass *llvm::createSIShrinkInstructionsPass() {
return new SIShrinkInstructions();
}
-static bool isVGPR(const MachineOperand *MO, const SIRegisterInfo &TRI,
- const MachineRegisterInfo &MRI) {
- if (!MO->isReg())
- return false;
-
- if (TargetRegisterInfo::isVirtualRegister(MO->getReg()))
- return TRI.hasVGPRs(MRI.getRegClass(MO->getReg()));
-
- return TRI.hasVGPRs(TRI.getPhysRegClass(MO->getReg()));
-}
-
static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
const SIRegisterInfo &TRI,
const MachineRegisterInfo &MRI) {
@@ -92,14 +81,18 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
case AMDGPU::V_ADDC_U32_e64:
case AMDGPU::V_SUBB_U32_e64:
- if (TII->getNamedOperand(MI, AMDGPU::OpName::src1)->isImm())
+ case AMDGPU::V_SUBBREV_U32_e64: {
+ const MachineOperand *Src1
+ = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ if (!Src1->isReg() || !TRI.isVGPR(MRI, Src1->getReg()))
return false;
// Additional verification is needed for sdst/src2.
return true;
-
+ }
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_MAC_F16_e64:
- if (!isVGPR(Src2, TRI, MRI) ||
+ case AMDGPU::V_FMAC_F32_e64:
+ if (!Src2->isReg() || !TRI.isVGPR(MRI, Src2->getReg()) ||
TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
return false;
break;
@@ -110,7 +103,7 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
}
const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
- if (Src1 && (!isVGPR(Src1, TRI, MRI) ||
+ if (Src1 && (!Src1->isReg() || !TRI.isVGPR(MRI, Src1->getReg()) ||
TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
return false;
@@ -124,7 +117,7 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
!TII->hasModifiersSet(MI, AMDGPU::OpName::clamp);
}
-/// \brief This function checks \p MI for operands defined by a move immediate
+/// This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instructions.
static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
@@ -290,7 +283,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
return false;
MachineRegisterInfo &MRI = MF.getRegInfo();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
@@ -442,7 +435,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
// VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
//
// So, instead of forcing the instruction to write to VCC, we provide
- // a hint to the register allocator to use VCC and then we we will run
+ // a hint to the register allocator to use VCC and then we will run
// this pass again after RA and shrink it if it outputs to VCC.
MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC);
continue;
@@ -493,7 +486,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
}
// We can shrink this instruction
- DEBUG(dbgs() << "Shrinking " << MI);
+ LLVM_DEBUG(dbgs() << "Shrinking " << MI);
MachineInstrBuilder Inst32 =
BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));
@@ -537,9 +530,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
MI.eraseFromParent();
foldImmediates(*Inst32, TII, MRI);
- DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
-
-
+ LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
}
}
return false;
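
The canShrink changes above all enforce the same encoding rule: the 32-bit (e32) VOP2 forms have no source-modifier bits and their second source must be a VGPR, so a 64-bit instruction is only shrunk when src1 is a plain, unmodified VGPR. A rough illustration of that predicate, with an invented operand model standing in for the MachineOperand/SIRegisterInfo::isVGPR calls shown in the diff:

    // Hypothetical operand model; the real code queries MachineOperand and
    // SIRegisterInfo::isVGPR as in the hunks above.
    struct Operand {
      bool IsReg;
      bool IsVGPR;        // register class contains VGPRs
      bool HasModifiers;  // abs/neg/sext source modifiers present
    };

    // e32 VOP2 encodings only accept an unmodified VGPR in the vsrc1 slot.
    bool src1AllowsShrink(const Operand &Src1) {
      return Src1.IsReg && Src1.IsVGPR && !Src1.HasModifiers;
    }
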
diff --git a/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 53aefe829737..879726b1528c 100644
--- a/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief This pass adds instructions to enable whole quad mode for pixel
+/// This pass adds instructions to enable whole quad mode for pixel
/// shaders, and whole wavefront mode for all programs.
///
/// Whole quad mode is required for derivative computations, but it interferes
@@ -60,6 +60,7 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
@@ -325,9 +326,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
unsigned Opcode = MI.getOpcode();
char Flags = 0;
- if (TII->isDS(Opcode) && CallingConv == CallingConv::AMDGPU_PS) {
- Flags = StateWQM;
- } else if (TII->isWQM(Opcode)) {
+ if (TII->isWQM(Opcode)) {
// Sampling instructions don't need to produce results for all pixels
// in a quad, they just require all inputs of a quad to have been
// computed for derivatives.
@@ -454,6 +453,11 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
if (II.Needs != 0)
markInstructionUses(MI, II.Needs, Worklist);
+
+ // Ensure we process a block containing WWM, even if it does not require any
+ // WQM transitions.
+ if (II.Needs & StateWWM)
+ BI.Needs |= StateWWM;
}
void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
@@ -681,7 +685,8 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
return;
- DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB) << ":\n");
+ LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
+ << ":\n");
unsigned SavedWQMReg = 0;
unsigned SavedNonWWMReg = 0;
@@ -844,7 +849,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
LowerToCopyInstrs.clear();
CallingConv = MF.getFunction().getCallingConv();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
@@ -884,7 +889,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
}
}
- DEBUG(printInfo());
+ LLVM_DEBUG(printInfo());
lowerCopyInstrs();
diff --git a/lib/Target/AMDGPU/SMInstructions.td b/lib/Target/AMDGPU/SMInstructions.td
index 8f347986eb8a..7485326017b2 100644
--- a/lib/Target/AMDGPU/SMInstructions.td
+++ b/lib/Target/AMDGPU/SMInstructions.td
@@ -63,6 +63,18 @@ class SM_Real <SM_Pseudo ps>
bits<1> imm = !if(ps.has_offset, ps.offset_is_imm, 0);
}
+class SM_Probe_Pseudo <string opName, dag ins, bit isImm>
+ : SM_Pseudo<opName, (outs), ins, " $sdata, $sbase, $offset"> {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let has_glc = 0;
+ let LGKM_CNT = 0;
+ let ScalarStore = 0;
+ let hasSideEffects = 1;
+ let offset_is_imm = isImm;
+ let PseudoInstr = opName # !if(isImm, "_IMM", "_SGPR");
+}
+
class SM_Load_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]>
: SM_Pseudo<opName, outs, ins, asmOps, pattern> {
RegisterClass BaseClass;
@@ -81,6 +93,18 @@ class SM_Store_Pseudo <string opName, dag ins, string asmOps, list<dag> pattern
let ScalarStore = 1;
}
+class SM_Discard_Pseudo <string opName, dag ins, bit isImm>
+ : SM_Pseudo<opName, (outs), ins, " $sbase, $offset"> {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let has_glc = 0;
+ let has_sdst = 0;
+ let ScalarStore = 0;
+ let hasSideEffects = 1;
+ let offset_is_imm = isImm;
+ let PseudoInstr = opName # !if(isImm, "_IMM", "_SGPR");
+}
+
multiclass SM_Pseudo_Loads<string opName,
RegisterClass baseClass,
RegisterClass dstClass> {
@@ -125,6 +149,11 @@ multiclass SM_Pseudo_Stores<string opName,
}
}
+multiclass SM_Pseudo_Discards<string opName> {
+ def _IMM : SM_Discard_Pseudo <opName, (ins SReg_64:$sbase, smrd_offset_20:$offset), 1>;
+ def _SGPR : SM_Discard_Pseudo <opName, (ins SReg_64:$sbase, SReg_32:$offset), 0>;
+}
+
class SM_Time_Pseudo<string opName, SDPatternOperator node> : SM_Pseudo<
opName, (outs SReg_64_XEXEC:$sdst), (ins),
" $sdst", [(set i64:$sdst, (node))]> {
@@ -144,6 +173,60 @@ class SM_Inval_Pseudo <string opName, SDPatternOperator node> : SM_Pseudo<
let has_offset = 0;
}
+multiclass SM_Pseudo_Probe<string opName, RegisterClass baseClass> {
+ def _IMM : SM_Probe_Pseudo <opName, (ins i8imm:$sdata, baseClass:$sbase, smrd_offset_20:$offset), 1>;
+ def _SGPR : SM_Probe_Pseudo <opName, (ins i8imm:$sdata, baseClass:$sbase, SReg_32:$offset), 0>;
+}
+
+//===----------------------------------------------------------------------===//
+// Scalar Atomic Memory Classes
+//===----------------------------------------------------------------------===//
+
+class SM_Atomic_Pseudo <string opName,
+ dag outs, dag ins, string asmOps, bit isRet>
+ : SM_Pseudo<opName, outs, ins, asmOps, []> {
+
+ bit glc = isRet;
+
+ let mayLoad = 1;
+ let mayStore = 1;
+ let has_glc = 1;
+
+ // Should these be set?
+ let ScalarStore = 1;
+ let hasSideEffects = 1;
+ let maybeAtomic = 1;
+}
+
+class SM_Pseudo_Atomic<string opName,
+ RegisterClass baseClass,
+ RegisterClass dataClass,
+ bit isImm,
+ bit isRet> :
+ SM_Atomic_Pseudo<opName,
+ !if(isRet, (outs dataClass:$sdst), (outs)),
+ !if(isImm,
+ (ins dataClass:$sdata, baseClass:$sbase, smrd_offset_20:$offset),
+ (ins dataClass:$sdata, baseClass:$sbase, SReg_32:$offset)),
+ !if(isRet, " $sdst", " $sdata") # ", $sbase, $offset" # !if(isRet, " glc", ""),
+ isRet> {
+ let offset_is_imm = isImm;
+ let PseudoInstr = opName # !if(isImm,
+ !if(isRet, "_IMM_RTN", "_IMM"),
+ !if(isRet, "_SGPR_RTN", "_SGPR"));
+
+ let Constraints = !if(isRet, "$sdst = $sdata", "");
+ let DisableEncoding = !if(isRet, "$sdata", "");
+}
+
+multiclass SM_Pseudo_Atomics<string opName,
+ RegisterClass baseClass,
+ RegisterClass dataClass> {
+ def _IMM : SM_Pseudo_Atomic <opName, baseClass, dataClass, 1, 0>;
+ def _SGPR : SM_Pseudo_Atomic <opName, baseClass, dataClass, 0, 0>;
+ def _IMM_RTN : SM_Pseudo_Atomic <opName, baseClass, dataClass, 1, 1>;
+ def _SGPR_RTN : SM_Pseudo_Atomic <opName, baseClass, dataClass, 0, 1>;
+}
//===----------------------------------------------------------------------===//
// Scalar Memory Instructions
@@ -211,9 +294,85 @@ let SubtargetPredicate = isVI in {
def S_DCACHE_WB : SM_Inval_Pseudo <"s_dcache_wb", int_amdgcn_s_dcache_wb>;
def S_DCACHE_WB_VOL : SM_Inval_Pseudo <"s_dcache_wb_vol", int_amdgcn_s_dcache_wb_vol>;
def S_MEMREALTIME : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime>;
-} // SubtargetPredicate = isVI
+defm S_ATC_PROBE : SM_Pseudo_Probe <"s_atc_probe", SReg_64>;
+defm S_ATC_PROBE_BUFFER : SM_Pseudo_Probe <"s_atc_probe_buffer", SReg_128>;
+} // SubtargetPredicate = isVI
+let SubtargetPredicate = HasFlatScratchInsts, Uses = [FLAT_SCR] in {
+defm S_SCRATCH_LOAD_DWORD : SM_Pseudo_Loads <"s_scratch_load_dword", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_SCRATCH_LOAD_DWORDX2 : SM_Pseudo_Loads <"s_scratch_load_dwordx2", SReg_64, SReg_64_XEXEC>;
+defm S_SCRATCH_LOAD_DWORDX4 : SM_Pseudo_Loads <"s_scratch_load_dwordx4", SReg_64, SReg_128>;
+
+defm S_SCRATCH_STORE_DWORD : SM_Pseudo_Stores <"s_scratch_store_dword", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_SCRATCH_STORE_DWORDX2 : SM_Pseudo_Stores <"s_scratch_store_dwordx2", SReg_64, SReg_64_XEXEC>;
+defm S_SCRATCH_STORE_DWORDX4 : SM_Pseudo_Stores <"s_scratch_store_dwordx4", SReg_64, SReg_128>;
+} // SubtargetPredicate = HasFlatScratchInsts
+
+let SubtargetPredicate = HasScalarAtomics in {
+
+defm S_BUFFER_ATOMIC_SWAP : SM_Pseudo_Atomics <"s_buffer_atomic_swap", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_CMPSWAP : SM_Pseudo_Atomics <"s_buffer_atomic_cmpswap", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_ADD : SM_Pseudo_Atomics <"s_buffer_atomic_add", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_SUB : SM_Pseudo_Atomics <"s_buffer_atomic_sub", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_SMIN : SM_Pseudo_Atomics <"s_buffer_atomic_smin", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_UMIN : SM_Pseudo_Atomics <"s_buffer_atomic_umin", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_SMAX : SM_Pseudo_Atomics <"s_buffer_atomic_smax", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_UMAX : SM_Pseudo_Atomics <"s_buffer_atomic_umax", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_AND : SM_Pseudo_Atomics <"s_buffer_atomic_and", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_OR : SM_Pseudo_Atomics <"s_buffer_atomic_or", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_XOR : SM_Pseudo_Atomics <"s_buffer_atomic_xor", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_INC : SM_Pseudo_Atomics <"s_buffer_atomic_inc", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_DEC : SM_Pseudo_Atomics <"s_buffer_atomic_dec", SReg_128, SReg_32_XM0_XEXEC>;
+
+defm S_BUFFER_ATOMIC_SWAP_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_swap_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_CMPSWAP_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_cmpswap_x2", SReg_128, SReg_128>;
+defm S_BUFFER_ATOMIC_ADD_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_add_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_SUB_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_sub_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_smin_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_umin_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_SMAX_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_smax_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_UMAX_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_umax_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_AND_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_and_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_OR_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_or_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_XOR_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_xor_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_INC_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_inc_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_DEC_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_dec_x2", SReg_128, SReg_64_XEXEC>;
+
+defm S_ATOMIC_SWAP : SM_Pseudo_Atomics <"s_atomic_swap", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_CMPSWAP : SM_Pseudo_Atomics <"s_atomic_cmpswap", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_ADD : SM_Pseudo_Atomics <"s_atomic_add", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_SUB : SM_Pseudo_Atomics <"s_atomic_sub", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_SMIN : SM_Pseudo_Atomics <"s_atomic_smin", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_UMIN : SM_Pseudo_Atomics <"s_atomic_umin", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_SMAX : SM_Pseudo_Atomics <"s_atomic_smax", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_UMAX : SM_Pseudo_Atomics <"s_atomic_umax", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_AND : SM_Pseudo_Atomics <"s_atomic_and", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_OR : SM_Pseudo_Atomics <"s_atomic_or", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_XOR : SM_Pseudo_Atomics <"s_atomic_xor", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_INC : SM_Pseudo_Atomics <"s_atomic_inc", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_DEC : SM_Pseudo_Atomics <"s_atomic_dec", SReg_64, SReg_32_XM0_XEXEC>;
+
+defm S_ATOMIC_SWAP_X2 : SM_Pseudo_Atomics <"s_atomic_swap_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_CMPSWAP_X2 : SM_Pseudo_Atomics <"s_atomic_cmpswap_x2", SReg_64, SReg_128>;
+defm S_ATOMIC_ADD_X2 : SM_Pseudo_Atomics <"s_atomic_add_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_SUB_X2 : SM_Pseudo_Atomics <"s_atomic_sub_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_SMIN_X2 : SM_Pseudo_Atomics <"s_atomic_smin_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_UMIN_X2 : SM_Pseudo_Atomics <"s_atomic_umin_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_SMAX_X2 : SM_Pseudo_Atomics <"s_atomic_smax_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_UMAX_X2 : SM_Pseudo_Atomics <"s_atomic_umax_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_AND_X2 : SM_Pseudo_Atomics <"s_atomic_and_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_OR_X2 : SM_Pseudo_Atomics <"s_atomic_or_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_XOR_X2 : SM_Pseudo_Atomics <"s_atomic_xor_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_INC_X2 : SM_Pseudo_Atomics <"s_atomic_inc_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_DEC_X2 : SM_Pseudo_Atomics <"s_atomic_dec_x2", SReg_64, SReg_64_XEXEC>;
+
+} // let SubtargetPredicate = HasScalarAtomics
+
+let SubtargetPredicate = isGFX9 in {
+defm S_DCACHE_DISCARD : SM_Pseudo_Discards <"s_dcache_discard">;
+defm S_DCACHE_DISCARD_X2 : SM_Pseudo_Discards <"s_dcache_discard_x2">;
+}
//===----------------------------------------------------------------------===//
// Scalar Memory Patterns
@@ -223,11 +382,9 @@ def S_MEMREALTIME : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime>
def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
auto Ld = cast<LoadSDNode>(N);
return Ld->getAlignment() >= 4 &&
- ((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
- static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N)) ||
+ ((((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) || (Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT)) && !N->isDivergent()) ||
(Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS &&
- !Ld->isVolatile() &&
- static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N) &&
+ !Ld->isVolatile() && !N->isDivergent() &&
static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)));
}]>;
@@ -407,6 +564,11 @@ multiclass SM_Real_Stores_vi<bits<8> op, string ps,
}
}
+multiclass SM_Real_Probe_vi<bits<8> op, string ps> {
+ def _IMM_vi : SMEM_Real_Store_vi <op, !cast<SM_Probe_Pseudo>(ps#_IMM)>;
+ def _SGPR_vi : SMEM_Real_Store_vi <op, !cast<SM_Probe_Pseudo>(ps#_SGPR)>;
+}
+
defm S_LOAD_DWORD : SM_Real_Loads_vi <0x00, "S_LOAD_DWORD">;
defm S_LOAD_DWORDX2 : SM_Real_Loads_vi <0x01, "S_LOAD_DWORDX2">;
defm S_LOAD_DWORDX4 : SM_Real_Loads_vi <0x02, "S_LOAD_DWORDX4">;
@@ -434,6 +596,103 @@ def S_DCACHE_WB_VOL_vi : SMEM_Real_vi <0x23, S_DCACHE_WB_VOL>;
def S_MEMTIME_vi : SMEM_Real_vi <0x24, S_MEMTIME>;
def S_MEMREALTIME_vi : SMEM_Real_vi <0x25, S_MEMREALTIME>;
+defm S_SCRATCH_LOAD_DWORD : SM_Real_Loads_vi <0x05, "S_SCRATCH_LOAD_DWORD">;
+defm S_SCRATCH_LOAD_DWORDX2 : SM_Real_Loads_vi <0x06, "S_SCRATCH_LOAD_DWORDX2">;
+defm S_SCRATCH_LOAD_DWORDX4 : SM_Real_Loads_vi <0x07, "S_SCRATCH_LOAD_DWORDX4">;
+
+defm S_SCRATCH_STORE_DWORD : SM_Real_Stores_vi <0x15, "S_SCRATCH_STORE_DWORD">;
+defm S_SCRATCH_STORE_DWORDX2 : SM_Real_Stores_vi <0x16, "S_SCRATCH_STORE_DWORDX2">;
+defm S_SCRATCH_STORE_DWORDX4 : SM_Real_Stores_vi <0x17, "S_SCRATCH_STORE_DWORDX4">;
+
+defm S_ATC_PROBE : SM_Real_Probe_vi <0x26, "S_ATC_PROBE">;
+defm S_ATC_PROBE_BUFFER : SM_Real_Probe_vi <0x27, "S_ATC_PROBE_BUFFER">;
+
+//===----------------------------------------------------------------------===//
+// GFX9
+//===----------------------------------------------------------------------===//
+
+class SMEM_Atomic_Real_vi <bits<8> op, SM_Atomic_Pseudo ps>
+ : SMEM_Real_vi <op, ps> {
+
+ bits<7> sdata;
+
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+
+ let glc = ps.glc;
+ let Inst{12-6} = !if(glc, sdst{6-0}, sdata{6-0});
+}
+
+multiclass SM_Real_Atomics_vi<bits<8> op, string ps> {
+ def _IMM_vi : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_IMM)>;
+ def _SGPR_vi : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR)>;
+ def _IMM_RTN_vi : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_IMM_RTN)>;
+ def _SGPR_RTN_vi : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR_RTN)>;
+}
+
+defm S_BUFFER_ATOMIC_SWAP : SM_Real_Atomics_vi <0x40, "S_BUFFER_ATOMIC_SWAP">;
+defm S_BUFFER_ATOMIC_CMPSWAP : SM_Real_Atomics_vi <0x41, "S_BUFFER_ATOMIC_CMPSWAP">;
+defm S_BUFFER_ATOMIC_ADD : SM_Real_Atomics_vi <0x42, "S_BUFFER_ATOMIC_ADD">;
+defm S_BUFFER_ATOMIC_SUB : SM_Real_Atomics_vi <0x43, "S_BUFFER_ATOMIC_SUB">;
+defm S_BUFFER_ATOMIC_SMIN : SM_Real_Atomics_vi <0x44, "S_BUFFER_ATOMIC_SMIN">;
+defm S_BUFFER_ATOMIC_UMIN : SM_Real_Atomics_vi <0x45, "S_BUFFER_ATOMIC_UMIN">;
+defm S_BUFFER_ATOMIC_SMAX : SM_Real_Atomics_vi <0x46, "S_BUFFER_ATOMIC_SMAX">;
+defm S_BUFFER_ATOMIC_UMAX : SM_Real_Atomics_vi <0x47, "S_BUFFER_ATOMIC_UMAX">;
+defm S_BUFFER_ATOMIC_AND : SM_Real_Atomics_vi <0x48, "S_BUFFER_ATOMIC_AND">;
+defm S_BUFFER_ATOMIC_OR : SM_Real_Atomics_vi <0x49, "S_BUFFER_ATOMIC_OR">;
+defm S_BUFFER_ATOMIC_XOR : SM_Real_Atomics_vi <0x4a, "S_BUFFER_ATOMIC_XOR">;
+defm S_BUFFER_ATOMIC_INC : SM_Real_Atomics_vi <0x4b, "S_BUFFER_ATOMIC_INC">;
+defm S_BUFFER_ATOMIC_DEC : SM_Real_Atomics_vi <0x4c, "S_BUFFER_ATOMIC_DEC">;
+
+defm S_BUFFER_ATOMIC_SWAP_X2 : SM_Real_Atomics_vi <0x60, "S_BUFFER_ATOMIC_SWAP_X2">;
+defm S_BUFFER_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_vi <0x61, "S_BUFFER_ATOMIC_CMPSWAP_X2">;
+defm S_BUFFER_ATOMIC_ADD_X2 : SM_Real_Atomics_vi <0x62, "S_BUFFER_ATOMIC_ADD_X2">;
+defm S_BUFFER_ATOMIC_SUB_X2 : SM_Real_Atomics_vi <0x63, "S_BUFFER_ATOMIC_SUB_X2">;
+defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Real_Atomics_vi <0x64, "S_BUFFER_ATOMIC_SMIN_X2">;
+defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Real_Atomics_vi <0x65, "S_BUFFER_ATOMIC_UMIN_X2">;
+defm S_BUFFER_ATOMIC_SMAX_X2 : SM_Real_Atomics_vi <0x66, "S_BUFFER_ATOMIC_SMAX_X2">;
+defm S_BUFFER_ATOMIC_UMAX_X2 : SM_Real_Atomics_vi <0x67, "S_BUFFER_ATOMIC_UMAX_X2">;
+defm S_BUFFER_ATOMIC_AND_X2 : SM_Real_Atomics_vi <0x68, "S_BUFFER_ATOMIC_AND_X2">;
+defm S_BUFFER_ATOMIC_OR_X2 : SM_Real_Atomics_vi <0x69, "S_BUFFER_ATOMIC_OR_X2">;
+defm S_BUFFER_ATOMIC_XOR_X2 : SM_Real_Atomics_vi <0x6a, "S_BUFFER_ATOMIC_XOR_X2">;
+defm S_BUFFER_ATOMIC_INC_X2 : SM_Real_Atomics_vi <0x6b, "S_BUFFER_ATOMIC_INC_X2">;
+defm S_BUFFER_ATOMIC_DEC_X2 : SM_Real_Atomics_vi <0x6c, "S_BUFFER_ATOMIC_DEC_X2">;
+
+defm S_ATOMIC_SWAP : SM_Real_Atomics_vi <0x80, "S_ATOMIC_SWAP">;
+defm S_ATOMIC_CMPSWAP : SM_Real_Atomics_vi <0x81, "S_ATOMIC_CMPSWAP">;
+defm S_ATOMIC_ADD : SM_Real_Atomics_vi <0x82, "S_ATOMIC_ADD">;
+defm S_ATOMIC_SUB : SM_Real_Atomics_vi <0x83, "S_ATOMIC_SUB">;
+defm S_ATOMIC_SMIN : SM_Real_Atomics_vi <0x84, "S_ATOMIC_SMIN">;
+defm S_ATOMIC_UMIN : SM_Real_Atomics_vi <0x85, "S_ATOMIC_UMIN">;
+defm S_ATOMIC_SMAX : SM_Real_Atomics_vi <0x86, "S_ATOMIC_SMAX">;
+defm S_ATOMIC_UMAX : SM_Real_Atomics_vi <0x87, "S_ATOMIC_UMAX">;
+defm S_ATOMIC_AND : SM_Real_Atomics_vi <0x88, "S_ATOMIC_AND">;
+defm S_ATOMIC_OR : SM_Real_Atomics_vi <0x89, "S_ATOMIC_OR">;
+defm S_ATOMIC_XOR : SM_Real_Atomics_vi <0x8a, "S_ATOMIC_XOR">;
+defm S_ATOMIC_INC : SM_Real_Atomics_vi <0x8b, "S_ATOMIC_INC">;
+defm S_ATOMIC_DEC : SM_Real_Atomics_vi <0x8c, "S_ATOMIC_DEC">;
+
+defm S_ATOMIC_SWAP_X2 : SM_Real_Atomics_vi <0xa0, "S_ATOMIC_SWAP_X2">;
+defm S_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_vi <0xa1, "S_ATOMIC_CMPSWAP_X2">;
+defm S_ATOMIC_ADD_X2 : SM_Real_Atomics_vi <0xa2, "S_ATOMIC_ADD_X2">;
+defm S_ATOMIC_SUB_X2 : SM_Real_Atomics_vi <0xa3, "S_ATOMIC_SUB_X2">;
+defm S_ATOMIC_SMIN_X2 : SM_Real_Atomics_vi <0xa4, "S_ATOMIC_SMIN_X2">;
+defm S_ATOMIC_UMIN_X2 : SM_Real_Atomics_vi <0xa5, "S_ATOMIC_UMIN_X2">;
+defm S_ATOMIC_SMAX_X2 : SM_Real_Atomics_vi <0xa6, "S_ATOMIC_SMAX_X2">;
+defm S_ATOMIC_UMAX_X2 : SM_Real_Atomics_vi <0xa7, "S_ATOMIC_UMAX_X2">;
+defm S_ATOMIC_AND_X2 : SM_Real_Atomics_vi <0xa8, "S_ATOMIC_AND_X2">;
+defm S_ATOMIC_OR_X2 : SM_Real_Atomics_vi <0xa9, "S_ATOMIC_OR_X2">;
+defm S_ATOMIC_XOR_X2 : SM_Real_Atomics_vi <0xaa, "S_ATOMIC_XOR_X2">;
+defm S_ATOMIC_INC_X2 : SM_Real_Atomics_vi <0xab, "S_ATOMIC_INC_X2">;
+defm S_ATOMIC_DEC_X2 : SM_Real_Atomics_vi <0xac, "S_ATOMIC_DEC_X2">;
+
+multiclass SM_Real_Discard_vi<bits<8> op, string ps> {
+ def _IMM_vi : SMEM_Real_vi <op, !cast<SM_Discard_Pseudo>(ps#_IMM)>;
+ def _SGPR_vi : SMEM_Real_vi <op, !cast<SM_Discard_Pseudo>(ps#_SGPR)>;
+}
+
+defm S_DCACHE_DISCARD : SM_Real_Discard_vi <0x28, "S_DCACHE_DISCARD">;
+defm S_DCACHE_DISCARD_X2 : SM_Real_Discard_vi <0x29, "S_DCACHE_DISCARD_X2">;
//===----------------------------------------------------------------------===//
// CI
@@ -502,7 +761,7 @@ let AddedComplexity = SM_LOAD_PATTERN.AddedComplexity in {
class SMRD_Pattern_ci <string Instr, ValueType vt> : GCNPat <
(smrd_load (SMRDImm32 i64:$sbase, i32:$offset)),
- (vt (!cast<SM_Pseudo>(Instr#"_IMM_ci") $sbase, $offset, 0))> {
+ (vt (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, 0))> {
let OtherPredicates = [isCIOnly];
}
diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td
index 02a95a4b6f24..6f5db9644c86 100644
--- a/lib/Target/AMDGPU/SOPInstructions.td
+++ b/lib/Target/AMDGPU/SOPInstructions.td
@@ -19,17 +19,28 @@ def GPRIdxMode : Operand<i32> {
let OperandType = "OPERAND_IMMEDIATE";
}
+class SOP_Pseudo<string opName, dag outs, dag ins, string asmOps,
+ list<dag> pattern=[]> :
+ InstSI<outs, ins, "", pattern>,
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
+
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ let SubtargetPredicate = isGCN;
+
+ string Mnemonic = opName;
+ string AsmOperands = asmOps;
+
+ bits<1> has_sdst = 0;
+}
+
//===----------------------------------------------------------------------===//
// SOP1 Instructions
//===----------------------------------------------------------------------===//
class SOP1_Pseudo <string opName, dag outs, dag ins,
string asmOps, list<dag> pattern=[]> :
- InstSI <outs, ins, "", pattern>,
- SIMCInstr<opName, SIEncodingFamily.NONE> {
- let isPseudo = 1;
- let isCodeGenOnly = 1;
- let SubtargetPredicate = isGCN;
+ SOP_Pseudo<opName, outs, ins, asmOps, pattern> {
let mayLoad = 0;
let mayStore = 0;
@@ -40,9 +51,6 @@ class SOP1_Pseudo <string opName, dag outs, dag ins,
let Size = 4;
let UseNamedOperandTable = 1;
- string Mnemonic = opName;
- string AsmOperands = asmOps;
-
bits<1> has_src0 = 1;
bits<1> has_sdst = 1;
}
@@ -247,17 +255,25 @@ def S_SET_GPR_IDX_IDX : SOP1_0_32<"s_set_gpr_idx_idx"> {
}
}
+let SubtargetPredicate = isGFX9 in {
+ let hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC] in {
+ def S_ANDN1_SAVEEXEC_B64 : SOP1_64<"s_andn1_saveexec_b64">;
+ def S_ORN1_SAVEEXEC_B64 : SOP1_64<"s_orn1_saveexec_b64">;
+ def S_ANDN1_WREXEC_B64 : SOP1_64<"s_andn1_wrexec_b64">;
+ def S_ANDN2_WREXEC_B64 : SOP1_64<"s_andn2_wrexec_b64">;
+ } // End hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC]
+
+ def S_BITREPLICATE_B64_B32 : SOP1_64_32<"s_bitreplicate_b64_b32">;
+} // End SubtargetPredicate = isGFX9
+
//===----------------------------------------------------------------------===//
// SOP2 Instructions
//===----------------------------------------------------------------------===//
class SOP2_Pseudo<string opName, dag outs, dag ins,
string asmOps, list<dag> pattern=[]> :
- InstSI<outs, ins, "", pattern>,
- SIMCInstr<opName, SIEncodingFamily.NONE> {
- let isPseudo = 1;
- let isCodeGenOnly = 1;
- let SubtargetPredicate = isGCN;
+ SOP_Pseudo<opName, outs, ins, asmOps, pattern> {
+
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -266,10 +282,7 @@ class SOP2_Pseudo<string opName, dag outs, dag ins,
let SchedRW = [WriteSALU];
let UseNamedOperandTable = 1;
- string Mnemonic = opName;
- string AsmOperands = asmOps;
-
- bits<1> has_sdst = 1;
+ let has_sdst = 1;
// Pseudo instructions have no encodings, but adding this field here allows
// us to do:
@@ -279,7 +292,7 @@ class SOP2_Pseudo<string opName, dag outs, dag ins,
// let Size = 4; // Do we need size here?
}
-class SOP2_Real<bits<7> op, SOP2_Pseudo ps> :
+class SOP2_Real<bits<7> op, SOP_Pseudo ps> :
InstSI <ps.OutOperandList, ps.InOperandList,
ps.Mnemonic # " " # ps.AsmOperands, []>,
Enc32 {
@@ -482,6 +495,16 @@ let SubtargetPredicate = isGFX9 in {
def S_PACK_LL_B32_B16 : SOP2_32<"s_pack_ll_b32_b16">;
def S_PACK_LH_B32_B16 : SOP2_32<"s_pack_lh_b32_b16">;
def S_PACK_HH_B32_B16 : SOP2_32<"s_pack_hh_b32_b16">;
+
+ let Defs = [SCC] in {
+ def S_LSHL1_ADD_U32 : SOP2_32<"s_lshl1_add_u32">;
+ def S_LSHL2_ADD_U32 : SOP2_32<"s_lshl2_add_u32">;
+ def S_LSHL3_ADD_U32 : SOP2_32<"s_lshl3_add_u32">;
+ def S_LSHL4_ADD_U32 : SOP2_32<"s_lshl4_add_u32">;
+ } // End Defs = [SCC]
+
+ def S_MUL_HI_U32 : SOP2_32<"s_mul_hi_u32">;
+ def S_MUL_HI_I32 : SOP2_32<"s_mul_hi_i32">;
}
//===----------------------------------------------------------------------===//
@@ -659,6 +682,16 @@ def S_SETREG_IMM32_B32 : SOPK_Pseudo <
} // End hasSideEffects = 1
+let SubtargetPredicate = isGFX9 in {
+ def S_CALL_B64 : SOPK_Pseudo<
+ "s_call_b64",
+ (outs SReg_64:$sdst),
+ (ins s16imm:$simm16),
+ "$sdst, $simm16"> {
+ let isCall = 1;
+ }
+}
+
//===----------------------------------------------------------------------===//
// SOPC Instructions
//===----------------------------------------------------------------------===//
@@ -806,6 +839,13 @@ def S_ENDPGM_SAVED : SOPP <0x0000001B, (ins), "s_endpgm_saved"> {
}
}
+let SubtargetPredicate = isGFX9 in {
+ let isBarrier = 1, isReturn = 1, simm16 = 0 in {
+ def S_ENDPGM_ORDERED_PS_DONE :
+ SOPP<0x01e, (ins), "s_endpgm_ordered_ps_done">;
+ } // End isBarrier = 1, isReturn = 1, simm16 = 0
+} // End SubtargetPredicate = isGFX9
+
let isBranch = 1, SchedRW = [WriteBranch] in {
def S_BRANCH : SOPP <
0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16",
@@ -1312,3 +1352,26 @@ def S_SETREG_B32_vi : SOPK_Real_vi <0x12, S_SETREG_B32>;
//def S_GETREG_REGRD_B32_vi : SOPK_Real_vi <0x13, S_GETREG_REGRD_B32>; // see pseudo for comments
def S_SETREG_IMM32_B32_vi : SOPK_Real64<0x14, S_SETREG_IMM32_B32>,
Select_vi<S_SETREG_IMM32_B32.Mnemonic>;
+
+def S_CALL_B64_vi : SOPK_Real_vi <0x15, S_CALL_B64>;
+
+//===----------------------------------------------------------------------===//
+// SOP1 - GFX9.
+//===----------------------------------------------------------------------===//
+
+def S_ANDN1_SAVEEXEC_B64_vi : SOP1_Real_vi<0x33, S_ANDN1_SAVEEXEC_B64>;
+def S_ORN1_SAVEEXEC_B64_vi : SOP1_Real_vi<0x34, S_ORN1_SAVEEXEC_B64>;
+def S_ANDN1_WREXEC_B64_vi : SOP1_Real_vi<0x35, S_ANDN1_WREXEC_B64>;
+def S_ANDN2_WREXEC_B64_vi : SOP1_Real_vi<0x36, S_ANDN2_WREXEC_B64>;
+def S_BITREPLICATE_B64_B32_vi : SOP1_Real_vi<0x37, S_BITREPLICATE_B64_B32>;
+
+//===----------------------------------------------------------------------===//
+// SOP2 - GFX9.
+//===----------------------------------------------------------------------===//
+
+def S_LSHL1_ADD_U32_vi : SOP2_Real_vi<0x2e, S_LSHL1_ADD_U32>;
+def S_LSHL2_ADD_U32_vi : SOP2_Real_vi<0x2f, S_LSHL2_ADD_U32>;
+def S_LSHL3_ADD_U32_vi : SOP2_Real_vi<0x30, S_LSHL3_ADD_U32>;
+def S_LSHL4_ADD_U32_vi : SOP2_Real_vi<0x31, S_LSHL4_ADD_U32>;
+def S_MUL_HI_U32_vi : SOP2_Real_vi<0x2c, S_MUL_HI_U32>;
+def S_MUL_HI_I32_vi : SOP2_Real_vi<0x2d, S_MUL_HI_I32>;
diff --git a/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
index f61e2e413ad4..e4c442db3016 100644
--- a/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
+++ b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
@@ -16,19 +16,19 @@
using namespace llvm;
-/// \brief The target which supports all AMD GPUs. This will eventually
+/// The target which supports all AMD GPUs. This will eventually
/// be deprecated and there will be a R600 target and a GCN target.
Target &llvm::getTheAMDGPUTarget() {
static Target TheAMDGPUTarget;
return TheAMDGPUTarget;
}
-/// \brief The target for GCN GPUs
+/// The target for GCN GPUs
Target &llvm::getTheGCNTarget() {
static Target TheGCNTarget;
return TheGCNTarget;
}
-/// \brief Extern function to initialize the targets for the AMDGPU backend
+/// Extern function to initialize the targets for the AMDGPU backend
extern "C" void LLVMInitializeAMDGPUTargetInfo() {
RegisterTarget<Triple::r600, false> R600(getTheAMDGPUTarget(), "r600",
"AMD GPUs HD2XXX-HD6XXX", "AMDGPU");
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index 03b11ae80500..9eb4c6513cce 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -61,7 +61,15 @@ const char* const IdSymbolic[] = {
"HW_REG_HW_ID",
"HW_REG_GPR_ALLOC",
"HW_REG_LDS_ALLOC",
- "HW_REG_IB_STS"
+ "HW_REG_IB_STS",
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ "HW_REG_SH_MEM_BASES"
};
} // namespace Hwreg
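
The added nullptr entries appear to keep IdSymbolic indexed directly by hardware-register ID, leaving the unnamed IDs between HW_REG_IB_STS and HW_REG_SH_MEM_BASES as gaps that a consumer null-checks before printing. A hedged sketch of that lookup pattern (the table and helper below are illustrative, not the assembler's actual printer):

    #include <cstddef>
    #include <string>

    // Illustrative table shaped like Hwreg::IdSymbolic: the index plays the
    // role of the hwreg ID, with nullptr for IDs that have no symbolic name.
    static const char *const IdSymbolicSketch[] = {
        nullptr, "HW_REG_MODE", "HW_REG_STATUS", /* ... */ "HW_REG_IB_STS",
        nullptr, nullptr, /* unnamed IDs */ "HW_REG_SH_MEM_BASES"};

    std::string hwregName(unsigned Id) {
      const size_t N = sizeof(IdSymbolicSketch) / sizeof(IdSymbolicSketch[0]);
      if (Id < N && IdSymbolicSketch[Id])
        return IdSymbolicSketch[Id];
      return "hwreg(" + std::to_string(Id) + ")";  // fall back to the raw ID
    }
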
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 125a3b22d0cf..3fd3c75874a3 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUBaseInfo.h"
+#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPU.h"
#include "SIDefines.h"
#include "llvm/ADT/StringRef.h"
@@ -52,7 +53,7 @@ unsigned getBitMask(unsigned Shift, unsigned Width) {
return ((1 << Width) - 1) << Shift;
}
-/// \brief Packs \p Src into \p Dst for given bit \p Shift and bit \p Width.
+/// Packs \p Src into \p Dst for given bit \p Shift and bit \p Width.
///
/// \returns Packed \p Dst.
unsigned packBits(unsigned Src, unsigned Dst, unsigned Shift, unsigned Width) {
@@ -61,7 +62,7 @@ unsigned packBits(unsigned Src, unsigned Dst, unsigned Shift, unsigned Width) {
return Dst;
}
-/// \brief Unpacks bits from \p Src for given bit \p Shift and bit \p Width.
+/// Unpacks bits from \p Src for given bit \p Shift and bit \p Width.
///
/// \returns Unpacked bits.
unsigned unpackBits(unsigned Src, unsigned Shift, unsigned Width) {
@@ -96,64 +97,34 @@ unsigned getVmcntBitWidthHi() { return 2; }
namespace llvm {
-static cl::opt<bool> EnablePackedInlinableLiterals(
- "enable-packed-inlinable-literals",
- cl::desc("Enable packed inlinable literals (v2f16, v2i16)"),
- cl::init(false));
-
namespace AMDGPU {
-LLVM_READNONE
-static inline Channels indexToChannel(unsigned Channel) {
- switch (Channel) {
- case 1:
- return AMDGPU::Channels_1;
- case 2:
- return AMDGPU::Channels_2;
- case 3:
- return AMDGPU::Channels_3;
- case 4:
- return AMDGPU::Channels_4;
- default:
- llvm_unreachable("invalid MIMG channel");
- }
-}
+struct MIMGInfo {
+ uint16_t Opcode;
+ uint16_t BaseOpcode;
+ uint8_t MIMGEncoding;
+ uint8_t VDataDwords;
+ uint8_t VAddrDwords;
+};
+#define GET_MIMGBaseOpcodesTable_IMPL
+#define GET_MIMGDimInfoTable_IMPL
+#define GET_MIMGInfoTable_IMPL
+#include "AMDGPUGenSearchableTables.inc"
-// FIXME: Need to handle d16 images correctly.
-static unsigned rcToChannels(unsigned RCID) {
- switch (RCID) {
- case AMDGPU::VGPR_32RegClassID:
- return 1;
- case AMDGPU::VReg_64RegClassID:
- return 2;
- case AMDGPU::VReg_96RegClassID:
- return 3;
- case AMDGPU::VReg_128RegClassID:
- return 4;
- default:
- llvm_unreachable("invalid MIMG register class");
- }
+int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
+ unsigned VDataDwords, unsigned VAddrDwords) {
+ const MIMGInfo *Info = getMIMGOpcodeHelper(BaseOpcode, MIMGEncoding,
+ VDataDwords, VAddrDwords);
+ return Info ? Info->Opcode : -1;
}
-int getMaskedMIMGOp(const MCInstrInfo &MII, unsigned Opc, unsigned NewChannels) {
- AMDGPU::Channels Channel = AMDGPU::indexToChannel(NewChannels);
- unsigned OrigChannels = rcToChannels(MII.get(Opc).OpInfo[0].RegClass);
- if (NewChannels == OrigChannels)
- return Opc;
-
- switch (OrigChannels) {
- case 1:
- return AMDGPU::getMaskedMIMGOp1(Opc, Channel);
- case 2:
- return AMDGPU::getMaskedMIMGOp2(Opc, Channel);
- case 3:
- return AMDGPU::getMaskedMIMGOp3(Opc, Channel);
- case 4:
- return AMDGPU::getMaskedMIMGOp4(Opc, Channel);
- default:
- llvm_unreachable("invalid MIMG channel");
- }
+int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels) {
+ const MIMGInfo *OrigInfo = getMIMGInfo(Opc);
+ const MIMGInfo *NewInfo =
+ getMIMGOpcodeHelper(OrigInfo->BaseOpcode, OrigInfo->MIMGEncoding,
+ NewChannels, OrigInfo->VAddrDwords);
+ return NewInfo ? NewInfo->Opcode : -1;
}
// Wrapper for Tablegen'd function. enum Subtarget is not defined in any
@@ -183,10 +154,10 @@ IsaVersion getIsaVersion(const FeatureBitset &Features) {
return {7, 0, 3};
if (Features.test(FeatureISAVersion7_0_4))
return {7, 0, 4};
+ if (Features.test(FeatureSeaIslands))
+ return {7, 0, 0};
// GCN GFX8 (Volcanic Islands (VI)).
- if (Features.test(FeatureISAVersion8_0_0))
- return {8, 0, 0};
if (Features.test(FeatureISAVersion8_0_1))
return {8, 0, 1};
if (Features.test(FeatureISAVersion8_0_2))
@@ -195,14 +166,22 @@ IsaVersion getIsaVersion(const FeatureBitset &Features) {
return {8, 0, 3};
if (Features.test(FeatureISAVersion8_1_0))
return {8, 1, 0};
+ if (Features.test(FeatureVolcanicIslands))
+ return {8, 0, 0};
// GCN GFX9.
if (Features.test(FeatureISAVersion9_0_0))
return {9, 0, 0};
if (Features.test(FeatureISAVersion9_0_2))
return {9, 0, 2};
+ if (Features.test(FeatureISAVersion9_0_4))
+ return {9, 0, 4};
+ if (Features.test(FeatureISAVersion9_0_6))
+ return {9, 0, 6};
+ if (Features.test(FeatureGFX9))
+ return {9, 0, 0};
- if (!Features.test(FeatureGCN) || Features.test(FeatureSouthernIslands))
+ if (Features.test(FeatureSouthernIslands))
return {0, 0, 0};
return {7, 0, 0};
}
@@ -219,11 +198,15 @@ void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream) {
<< ISAVersion.Major
<< ISAVersion.Minor
<< ISAVersion.Stepping;
+
+ if (hasXNACK(*STI))
+ Stream << "+xnack";
+
Stream.flush();
}
-bool hasCodeObjectV3(const FeatureBitset &Features) {
- return Features.test(FeatureCodeObjectV3);
+bool hasCodeObjectV3(const MCSubtargetInfo *STI) {
+ return STI->getFeatureBits().test(FeatureCodeObjectV3);
}
unsigned getWavefrontSize(const FeatureBitset &Features) {
@@ -260,7 +243,7 @@ unsigned getMaxWorkGroupsPerCU(const FeatureBitset &Features,
}
unsigned getMaxWavesPerCU(const FeatureBitset &Features) {
- return getMaxWavesPerEU(Features) * getEUsPerCU(Features);
+ return getMaxWavesPerEU() * getEUsPerCU(Features);
}
unsigned getMaxWavesPerCU(const FeatureBitset &Features,
@@ -272,9 +255,7 @@ unsigned getMinWavesPerEU(const FeatureBitset &Features) {
return 1;
}
-unsigned getMaxWavesPerEU(const FeatureBitset &Features) {
- if (!Features.test(FeatureGCN))
- return 8;
+unsigned getMaxWavesPerEU() {
// FIXME: Need to take scratch memory into account.
return 10;
}
@@ -330,11 +311,13 @@ unsigned getAddressableNumSGPRs(const FeatureBitset &Features) {
unsigned getMinNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU) {
assert(WavesPerEU != 0);
- if (WavesPerEU >= getMaxWavesPerEU(Features))
+ if (WavesPerEU >= getMaxWavesPerEU())
return 0;
- unsigned MinNumSGPRs =
- alignDown(getTotalNumSGPRs(Features) / (WavesPerEU + 1),
- getSGPRAllocGranule(Features)) + 1;
+
+ unsigned MinNumSGPRs = getTotalNumSGPRs(Features) / (WavesPerEU + 1);
+ if (Features.test(FeatureTrapHandler))
+ MinNumSGPRs -= std::min(MinNumSGPRs, (unsigned)TRAP_NUM_SGPRS);
+ MinNumSGPRs = alignDown(MinNumSGPRs, getSGPRAllocGranule(Features)) + 1;
return std::min(MinNumSGPRs, getAddressableNumSGPRs(Features));
}
@@ -343,14 +326,49 @@ unsigned getMaxNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU,
assert(WavesPerEU != 0);
IsaVersion Version = getIsaVersion(Features);
- unsigned MaxNumSGPRs = alignDown(getTotalNumSGPRs(Features) / WavesPerEU,
- getSGPRAllocGranule(Features));
unsigned AddressableNumSGPRs = getAddressableNumSGPRs(Features);
if (Version.Major >= 8 && !Addressable)
AddressableNumSGPRs = 112;
+ unsigned MaxNumSGPRs = getTotalNumSGPRs(Features) / WavesPerEU;
+ if (Features.test(FeatureTrapHandler))
+ MaxNumSGPRs -= std::min(MaxNumSGPRs, (unsigned)TRAP_NUM_SGPRS);
+ MaxNumSGPRs = alignDown(MaxNumSGPRs, getSGPRAllocGranule(Features));
return std::min(MaxNumSGPRs, AddressableNumSGPRs);
}
+unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed,
+ bool FlatScrUsed, bool XNACKUsed) {
+ unsigned ExtraSGPRs = 0;
+ if (VCCUsed)
+ ExtraSGPRs = 2;
+
+ IsaVersion Version = getIsaVersion(Features);
+ if (Version.Major < 8) {
+ if (FlatScrUsed)
+ ExtraSGPRs = 4;
+ } else {
+ if (XNACKUsed)
+ ExtraSGPRs = 4;
+
+ if (FlatScrUsed)
+ ExtraSGPRs = 6;
+ }
+
+ return ExtraSGPRs;
+}
+
+unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed,
+ bool FlatScrUsed) {
+ return getNumExtraSGPRs(Features, VCCUsed, FlatScrUsed,
+ Features[AMDGPU::FeatureXNACK]);
+}
+
+unsigned getNumSGPRBlocks(const FeatureBitset &Features, unsigned NumSGPRs) {
+ NumSGPRs = alignTo(std::max(1u, NumSGPRs), getSGPREncodingGranule(Features));
+ // SGPRBlocks is actual number of SGPR blocks minus 1.
+ return NumSGPRs / getSGPREncodingGranule(Features) - 1;
+}
+
unsigned getVGPRAllocGranule(const FeatureBitset &Features) {
return 4;
}
@@ -370,7 +388,7 @@ unsigned getAddressableNumVGPRs(const FeatureBitset &Features) {
unsigned getMinNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU) {
assert(WavesPerEU != 0);
- if (WavesPerEU >= getMaxWavesPerEU(Features))
+ if (WavesPerEU >= getMaxWavesPerEU())
return 0;
unsigned MinNumVGPRs =
alignDown(getTotalNumVGPRs(Features) / (WavesPerEU + 1),
@@ -387,6 +405,12 @@ unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU) {
return std::min(MaxNumVGPRs, AddressableNumVGPRs);
}
+unsigned getNumVGPRBlocks(const FeatureBitset &Features, unsigned NumVGPRs) {
+ NumVGPRs = alignTo(std::max(1u, NumVGPRs), getVGPREncodingGranule(Features));
+ // VGPRBlocks is actual number of VGPR blocks minus 1.
+ return NumVGPRs / getVGPREncodingGranule(Features) - 1;
+}
+
} // end namespace IsaInfo
void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
@@ -396,7 +420,7 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
memset(&Header, 0, sizeof(Header));
Header.amd_kernel_code_version_major = 1;
- Header.amd_kernel_code_version_minor = 1;
+ Header.amd_kernel_code_version_minor = 2;
Header.amd_machine_kind = 1; // AMD_MACHINE_KIND_AMDGPU
Header.amd_machine_version_major = ISA.Major;
Header.amd_machine_version_minor = ISA.Minor;
@@ -416,6 +440,21 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
Header.private_segment_alignment = 4;
}
+amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor() {
+ amdhsa::kernel_descriptor_t KD;
+ memset(&KD, 0, sizeof(KD));
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64,
+ amdhsa::FLOAT_DENORM_MODE_FLUSH_NONE);
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP, 1);
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE, 1);
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, 1);
+ return KD;
+}
+
bool isGroupSegment(const GlobalValue *GV) {
return GV->getType()->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
}
@@ -425,7 +464,8 @@ bool isGlobalSegment(const GlobalValue *GV) {
}
bool isReadOnlySegment(const GlobalValue *GV) {
- return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
+ return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
+ GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
}
bool shouldEmitConstantsToTextSection(const Triple &TT) {
@@ -598,6 +638,18 @@ bool isEntryFunctionCC(CallingConv::ID CC) {
}
}
+bool hasXNACK(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureXNACK];
+}
+
+bool hasMIMG_R128(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureMIMG_R128];
+}
+
+bool hasPackedD16(const MCSubtargetInfo &STI) {
+ return !STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem];
+}
+
bool isSI(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureSouthernIslands];
}
@@ -681,6 +733,8 @@ bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) {
case node: return isGFX9(STI) ? node##_gfx9 : node##_vi;
unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
+ if (STI.getTargetTriple().getArch() == Triple::r600)
+ return Reg;
MAP_REG2REG
}
@@ -837,9 +891,6 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) {
bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) {
assert(HasInv2Pi);
- if (!EnablePackedInlinableLiterals)
- return false;
-
int16_t Lo16 = static_cast<int16_t>(Literal);
int16_t Hi16 = static_cast<int16_t>(Literal >> 16);
return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi);
@@ -871,24 +922,6 @@ bool isArgPassedInSGPR(const Argument *A) {
}
}
-// TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence.
-bool isUniformMMO(const MachineMemOperand *MMO) {
- const Value *Ptr = MMO->getValue();
- // UndefValue means this is a load of a kernel input. These are uniform.
- // Sometimes LDS instructions have constant pointers.
- // If Ptr is null, then that means this mem operand contains a
- // PseudoSourceValue like GOT.
- if (!Ptr || isa<UndefValue>(Ptr) ||
- isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
- return true;
-
- if (const Argument *Arg = dyn_cast<Argument>(Ptr))
- return isArgPassedInSGPR(Arg);
-
- const Instruction *I = dyn_cast<Instruction>(Ptr);
- return I && I->getMetadata("amdgpu.uniform");
-}
-
int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
if (isGCN3Encoding(ST))
return ByteOffset;
@@ -909,18 +942,10 @@ namespace llvm {
namespace AMDGPU {
AMDGPUAS getAMDGPUAS(Triple T) {
- auto Env = T.getEnvironmentName();
AMDGPUAS AS;
- if (Env == "amdgiz" || Env == "amdgizcl") {
- AS.FLAT_ADDRESS = 0;
- AS.PRIVATE_ADDRESS = 5;
- AS.REGION_ADDRESS = 4;
- }
- else {
- AS.FLAT_ADDRESS = 4;
- AS.PRIVATE_ADDRESS = 0;
- AS.REGION_ADDRESS = 5;
- }
+ AS.FLAT_ADDRESS = 0;
+ AS.PRIVATE_ADDRESS = 5;
+ AS.REGION_ADDRESS = 2;
return AS;
}
@@ -931,5 +956,21 @@ AMDGPUAS getAMDGPUAS(const TargetMachine &M) {
AMDGPUAS getAMDGPUAS(const Module &M) {
return getAMDGPUAS(Triple(M.getTargetTriple()));
}
+
+namespace {
+
+struct SourceOfDivergence {
+ unsigned Intr;
+};
+const SourceOfDivergence *lookupSourceOfDivergence(unsigned Intr);
+
+#define GET_SourcesOfDivergence_IMPL
+#include "AMDGPUGenSearchableTables.inc"
+
+} // end anonymous namespace
+
+bool isIntrinsicSourceOfDivergence(unsigned IntrID) {
+ return lookupSourceOfDivergence(IntrID);
+}
} // namespace AMDGPU
} // namespace llvm
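
The new getNumSGPRBlocks/getNumVGPRBlocks helpers both encode the same convention: round the register count up to the encoding granule and store the block count minus one. A small worked example, with the granule passed in explicitly and using an SGPR granule of 8 and a VGPR granule of 4 purely for illustration:

    #include <algorithm>
    #include <cassert>

    // Mirrors the helpers added above, with the granule supplied directly
    // instead of being derived from the subtarget feature bits.
    unsigned numRegBlocks(unsigned NumRegs, unsigned Granule) {
      NumRegs = ((std::max(1u, NumRegs) + Granule - 1) / Granule) * Granule;
      return NumRegs / Granule - 1;  // the field stores "blocks minus one"
    }

    int main() {
      assert(numRegBlocks(13, 8) == 1);  // 13 SGPRs -> 16 -> 2 blocks -> encode 1
      assert(numRegBlocks(0, 4) == 0);   // at least one VGPR block is reported
      return 0;
    }
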
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index a215b445378e..70681c271697 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -16,6 +16,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include <cstdint>
@@ -28,24 +29,31 @@ class Argument;
class FeatureBitset;
class Function;
class GlobalValue;
-class MachineMemOperand;
class MCContext;
class MCRegisterClass;
class MCRegisterInfo;
class MCSection;
class MCSubtargetInfo;
+class MachineMemOperand;
class Triple;
namespace AMDGPU {
+
+#define GET_MIMGBaseOpcode_DECL
+#define GET_MIMGDim_DECL
+#define GET_MIMGEncoding_DECL
+#include "AMDGPUGenSearchableTables.inc"
+
namespace IsaInfo {
enum {
// The closed Vulkan driver sets 96, which limits the wave count to 8 but
// doesn't spill SGPRs as much as when 80 is set.
- FIXED_NUM_SGPRS_FOR_INIT_BUG = 96
+ FIXED_NUM_SGPRS_FOR_INIT_BUG = 96,
+ TRAP_NUM_SGPRS = 16
};
-/// \brief Instruction set architecture version.
+/// Instruction set architecture version.
struct IsaVersion {
unsigned Major;
unsigned Minor;
@@ -55,12 +63,12 @@ struct IsaVersion {
/// \returns Isa version for given subtarget \p Features.
IsaVersion getIsaVersion(const FeatureBitset &Features);
-/// \brief Streams isa version string for given subtarget \p STI into \p Stream.
+/// Streams isa version string for given subtarget \p STI into \p Stream.
void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream);
-/// \returns True if given subtarget \p Features support code object version 3,
+/// \returns True if given subtarget \p STI supports code object version 3,
/// false otherwise.
-bool hasCodeObjectV3(const FeatureBitset &Features);
+bool hasCodeObjectV3(const MCSubtargetInfo *STI);
/// \returns Wavefront size for given subtarget \p Features.
unsigned getWavefrontSize(const FeatureBitset &Features);
@@ -92,7 +100,7 @@ unsigned getMinWavesPerEU(const FeatureBitset &Features);
/// \returns Maximum number of waves per execution unit for given subtarget \p
/// Features without any kind of limitation.
-unsigned getMaxWavesPerEU(const FeatureBitset &Features);
+unsigned getMaxWavesPerEU();
/// \returns Maximum number of waves per execution unit for given subtarget \p
/// Features and limited by given \p FlatWorkGroupSize.
@@ -131,6 +139,22 @@ unsigned getMinNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU);
unsigned getMaxNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU,
bool Addressable);
+/// \returns Number of extra SGPRs implicitly required by given subtarget \p
+/// Features when the given special registers are used.
+unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed,
+ bool FlatScrUsed, bool XNACKUsed);
+
+/// \returns Number of extra SGPRs implicitly required by given subtarget \p
+/// Features when the given special registers are used. XNACK is inferred from
+/// \p Features.
+unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed,
+ bool FlatScrUsed);
+
+/// \returns Number of SGPR blocks needed for given subtarget \p Features when
+/// \p NumSGPRs are used. \p NumSGPRs should already include any special
+/// register counts.
+unsigned getNumSGPRBlocks(const FeatureBitset &Features, unsigned NumSGPRs);
+
/// \returns VGPR allocation granularity for given subtarget \p Features.
unsigned getVGPRAllocGranule(const FeatureBitset &Features);
@@ -151,20 +175,57 @@ unsigned getMinNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU);
/// execution unit requirement for given subtarget \p Features.
unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU);
+/// \returns Number of VGPR blocks needed for given subtarget \p Features when
+/// \p NumVGPRs are used.
+unsigned getNumVGPRBlocks(const FeatureBitset &Features, unsigned NumVGPRs);

+
} // end namespace IsaInfo
LLVM_READONLY
int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx);
+struct MIMGBaseOpcodeInfo {
+ MIMGBaseOpcode BaseOpcode;
+ bool Store;
+ bool Atomic;
+ bool AtomicX2;
+ bool Sampler;
+
+ uint8_t NumExtraArgs;
+ bool Gradients;
+ bool Coordinates;
+ bool LodOrClampOrMip;
+ bool HasD16;
+};
+
+LLVM_READONLY
+const MIMGBaseOpcodeInfo *getMIMGBaseOpcodeInfo(unsigned BaseOpcode);
+
+struct MIMGDimInfo {
+ MIMGDim Dim;
+ uint8_t NumCoords;
+ uint8_t NumGradients;
+ bool DA;
+};
+
LLVM_READONLY
-int getMaskedMIMGOp(const MCInstrInfo &MII,
- unsigned Opc, unsigned NewChannels);
+const MIMGDimInfo *getMIMGDimInfo(unsigned Dim);
+
+LLVM_READONLY
+int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
+ unsigned VDataDwords, unsigned VAddrDwords);
+
+LLVM_READONLY
+int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels);
+
LLVM_READONLY
int getMCOpcode(uint16_t Opcode, unsigned Gen);
void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
const FeatureBitset &Features);
+amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor();
+
bool isGroupSegment(const GlobalValue *GV);
bool isGlobalSegment(const GlobalValue *GV);
bool isReadOnlySegment(const GlobalValue *GV);
@@ -216,7 +277,7 @@ unsigned decodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt);
/// \returns Decoded Lgkmcnt from given \p Waitcnt for given isa \p Version.
unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt);
-/// \brief Decodes Vmcnt, Expcnt and Lgkmcnt from given \p Waitcnt for given isa
+/// Decodes Vmcnt, Expcnt and Lgkmcnt from given \p Waitcnt for given isa
/// \p Version, and writes decoded values into \p Vmcnt, \p Expcnt and
/// \p Lgkmcnt respectively.
///
@@ -240,7 +301,7 @@ unsigned encodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
unsigned encodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
unsigned Lgkmcnt);
-/// \brief Encodes \p Vmcnt, \p Expcnt and \p Lgkmcnt into Waitcnt for given isa
+/// Encodes \p Vmcnt, \p Expcnt and \p Lgkmcnt into Waitcnt for given isa
/// \p Version.
///
/// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are encoded as follows:
@@ -278,41 +339,45 @@ inline bool isKernel(CallingConv::ID CC) {
}
}
+bool hasXNACK(const MCSubtargetInfo &STI);
+bool hasMIMG_R128(const MCSubtargetInfo &STI);
+bool hasPackedD16(const MCSubtargetInfo &STI);
+
bool isSI(const MCSubtargetInfo &STI);
bool isCI(const MCSubtargetInfo &STI);
bool isVI(const MCSubtargetInfo &STI);
bool isGFX9(const MCSubtargetInfo &STI);
-/// \brief Is Reg - scalar register
+/// Is \p Reg a scalar register?
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
-/// \brief Is there any intersection between registers
+/// Is there any intersection between registers
bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI);
/// If \p Reg is a pseudo reg, return the correct hardware register given
/// \p STI otherwise return \p Reg.
unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI);
-/// \brief Convert hardware register \p Reg to a pseudo register
+/// Convert hardware register \p Reg to a pseudo register
LLVM_READNONE
unsigned mc2PseudoReg(unsigned Reg);
-/// \brief Can this operand also contain immediate values?
+/// Can this operand also contain immediate values?
bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo);
-/// \brief Is this floating-point operand?
+/// Is this floating-point operand?
bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo);
-/// \brief Does this opearnd support only inlinable literals?
+/// Does this operand support only inlinable literals?
bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo);
-/// \brief Get the size in bits of a register from the register class \p RC.
+/// Get the size in bits of a register from the register class \p RC.
unsigned getRegBitWidth(unsigned RCID);
-/// \brief Get the size in bits of a register from the register class \p RC.
+/// Get the size in bits of a register from the register class \p RC.
unsigned getRegBitWidth(const MCRegisterClass &RC);
-/// \brief Get size of register operand
+/// Get size of register operand
unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
unsigned OpNo);
@@ -349,7 +414,7 @@ inline unsigned getOperandSize(const MCInstrDesc &Desc, unsigned OpNo) {
return getOperandSize(Desc.OpInfo[OpNo]);
}
-/// \brief Is this literal inlinable
+/// Is this literal inlinable
LLVM_READNONE
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi);
@@ -363,7 +428,6 @@ LLVM_READNONE
bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi);
bool isArgPassedInSGPR(const Argument *Arg);
-bool isUniformMMO(const MachineMemOperand *MMO);
/// \returns The encoding that will be used for \p ByteOffset in the SMRD
/// offset field.
@@ -374,6 +438,9 @@ int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
/// not the encoded offset.
bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
+/// \returns true if the intrinsic is divergent
+bool isIntrinsicSourceOfDivergence(unsigned IntrID);
+
} // end namespace AMDGPU
} // end namespace llvm
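The new SGPR accounting helpers split the "extra SGPRs for VCC / flat scratch / XNACK" computation from the block rounding. A hedged usage sketch is shown below, assuming a consumer such as a program-resource emitter; the computeSGPRBlocks wrapper name is illustrative only, and only the two helpers declared above are taken from the header.

#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/MC/SubtargetFeature.h"

// Sketch: total SGPR blocks = (explicitly used SGPRs + implicitly required
// extra SGPRs) rounded up to the allocation granule by getNumSGPRBlocks().
static unsigned computeSGPRBlocks(const llvm::FeatureBitset &Features,
                                  unsigned NumExplicitSGPRs, bool VCCUsed,
                                  bool FlatScrUsed, bool XNACKUsed) {
  unsigned Extra = llvm::AMDGPU::IsaInfo::getNumExtraSGPRs(
      Features, VCCUsed, FlatScrUsed, XNACKUsed);
  return llvm::AMDGPU::IsaInfo::getNumSGPRBlocks(Features,
                                                 NumExplicitSGPRs + Extra);
}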
diff --git a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp
new file mode 100644
index 000000000000..1924f71f11c8
--- /dev/null
+++ b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp
@@ -0,0 +1,75 @@
+//===-- AMDGPULaneDominator.cpp - Determine Lane Dominators ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// MBB A lane-dominates MBB B if
+// 1. A dominates B in the usual sense, i.e. every path from the entry to B
+// goes through A, and
+// 2. whenever B executes, every active lane during that execution of B was
+// also active during the most recent execution of A.
+//
+// The simplest example where A dominates B but does not lane-dominate it is
+// where A is a loop:
+//
+// |
+// +--+
+// A |
+// +--+
+// |
+// B
+//
+// Unfortunately, the second condition is not fully captured by the control
+// flow graph when it is unstructured (as may happen when branch conditions are
+// uniform).
+//
+// The following replacement of the second condition is a conservative
+// approximation. It is an equivalent condition when the CFG is fully
+// structured:
+//
+// 2'. every cycle in the CFG that contains A also contains B.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPULaneDominator.h"
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+
+namespace llvm {
+
+namespace AMDGPU {
+
+// Given machine basic blocks A and B where A dominates B, check whether
+// A lane-dominates B.
+//
+// The check is conservative, i.e. there can be false-negatives.
+bool laneDominates(MachineBasicBlock *A, MachineBasicBlock *B) {
+ // Check whether A is reachable from itself without going through B.
+ DenseSet<MachineBasicBlock *> Reachable;
+ SmallVector<MachineBasicBlock *, 8> Stack;
+
+ Stack.push_back(A);
+ do {
+ MachineBasicBlock *MBB = Stack.back();
+ Stack.pop_back();
+
+ for (MachineBasicBlock *Succ : MBB->successors()) {
+ if (Succ == A)
+ return false;
+ if (Succ != B && Reachable.insert(Succ).second)
+ Stack.push_back(Succ);
+ }
+ } while (!Stack.empty());
+
+ return true;
+}
+
+} // namespace AMDGPU
+
+} // namespace llvm
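The loop example from the header comment above can be reproduced with a tiny stand-alone harness. The sketch below mirrors the same reachability check on a plain adjacency list instead of MachineBasicBlocks; the Node encoding and the two toy CFGs are hypothetical, introduced only for illustration. In the first CFG, A is a self-loop that dominates B, so A can reach itself without passing through B and the check correctly reports that A does not lane-dominate B.

#include <cassert>
#include <set>
#include <vector>

// Toy CFG: node index -> successor indices. Node 0 = entry, 1 = A, 2 = B.
using CFG = std::vector<std::vector<int>>;

// Same conservative check as laneDominates(), on plain ints:
// A lane-dominates B only if A cannot reach itself without going through B.
static bool laneDominates(const CFG &G, int A, int B) {
  std::set<int> Reachable;
  std::vector<int> Stack{A};
  while (!Stack.empty()) {
    int N = Stack.back();
    Stack.pop_back();
    for (int Succ : G[N]) {
      if (Succ == A)
        return false;              // a cycle through A that avoids B
      if (Succ != B && Reachable.insert(Succ).second)
        Stack.push_back(Succ);
    }
  }
  return true;
}

int main() {
  // entry -> A, A -> {A, B}: A dominates B but does not lane-dominate it.
  CFG Loop = {{1}, {1, 2}, {}};
  assert(!laneDominates(Loop, /*A=*/1, /*B=*/2));

  // entry -> A -> B -> A: every cycle containing A also contains B.
  CFG Cycle = {{1}, {2}, {1}};
  assert(laneDominates(Cycle, /*A=*/1, /*B=*/2));
}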
diff --git a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h
new file mode 100644
index 000000000000..4f33a89a364b
--- /dev/null
+++ b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h
@@ -0,0 +1,24 @@
+//===- AMDGPULaneDominator.h ------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
+
+namespace llvm {
+
+class MachineBasicBlock;
+
+namespace AMDGPU {
+
+bool laneDominates(MachineBasicBlock *MBBA, MachineBasicBlock *MBBB);
+
+} // end namespace AMDGPU
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
index 991408c81c92..9f0a4d29b5e4 100644
--- a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
@@ -73,7 +73,6 @@ FIELD2(amd_machine_version_stepping, machine_version_stepping, amd_machine_ve
FIELD(kernel_code_entry_byte_offset),
FIELD(kernel_code_prefetch_byte_size),
-FIELD(max_scratch_backing_memory_byte_size),
COMPPGM1(granulated_workitem_vgpr_count, compute_pgm_rsrc1_vgprs, VGPRS),
COMPPGM1(granulated_wavefront_sgpr_count, compute_pgm_rsrc1_sgprs, SGPRS),
diff --git a/lib/Target/AMDGPU/Utils/CMakeLists.txt b/lib/Target/AMDGPU/Utils/CMakeLists.txt
index 01b80ebe8d3d..c5ed32e46821 100644
--- a/lib/Target/AMDGPU/Utils/CMakeLists.txt
+++ b/lib/Target/AMDGPU/Utils/CMakeLists.txt
@@ -2,4 +2,5 @@ add_llvm_library(LLVMAMDGPUUtils
AMDGPUBaseInfo.cpp
AMDKernelCodeTUtils.cpp
AMDGPUAsmUtils.cpp
+ AMDGPULaneDominator.cpp
)
diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td
index ff2bd2454400..4c7a92219755 100644
--- a/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/lib/Target/AMDGPU/VOP1Instructions.td
@@ -40,17 +40,9 @@ class VOP1_SDWA9Ae <bits<8> op, VOPProfile P> : VOP_SDWA9Ae <P> {
}
class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1Only = 0> :
- InstSI <P.Outs32, P.Ins32, "", pattern>,
- VOP <opName>,
- SIMCInstr <!if(VOP1Only, opName, opName#"_e32"), SIEncodingFamily.NONE>,
- MnemonicAlias<!if(VOP1Only, opName, opName#"_e32"), opName> {
+ VOP_Pseudo <opName, !if(VOP1Only, "", "_e32"), P, P.Outs32, P.Ins32, "", pattern> {
- let isPseudo = 1;
- let isCodeGenOnly = 1;
- let UseNamedOperandTable = 1;
-
- string Mnemonic = opName;
- string AsmOperands = P.Asm32;
+ let AsmOperands = P.Asm32;
let Size = 4;
let mayLoad = 0;
@@ -63,8 +55,6 @@ class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1On
let Uses = [EXEC];
let AsmVariantName = AMDGPUAsmVariants.Default;
-
- VOPProfile Pfl = P;
}
class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily> :
@@ -86,6 +76,7 @@ class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily> :
let TSFlags = ps.TSFlags;
let UseNamedOperandTable = ps.UseNamedOperandTable;
let Uses = ps.Uses;
+ let Defs = ps.Defs;
}
class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
@@ -202,13 +193,14 @@ defm V_TRUNC_F32 : VOP1Inst <"v_trunc_f32", VOP_F32_F32, ftrunc>;
defm V_CEIL_F32 : VOP1Inst <"v_ceil_f32", VOP_F32_F32, fceil>;
defm V_RNDNE_F32 : VOP1Inst <"v_rndne_f32", VOP_F32_F32, frint>;
defm V_FLOOR_F32 : VOP1Inst <"v_floor_f32", VOP_F32_F32, ffloor>;
-defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, fexp2>;
let SchedRW = [WriteQuarterRate32] in {
+defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, fexp2>;
defm V_LOG_F32 : VOP1Inst <"v_log_f32", VOP_F32_F32, flog2>;
defm V_RCP_F32 : VOP1Inst <"v_rcp_f32", VOP_F32_F32, AMDGPUrcp>;
-defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32>;
+defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32, AMDGPUrcp_iflag>;
defm V_RSQ_F32 : VOP1Inst <"v_rsq_f32", VOP_F32_F32, AMDGPUrsq>;
+defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, fsqrt>;
} // End SchedRW = [WriteQuarterRate32]
let SchedRW = [WriteDouble] in {
@@ -216,8 +208,6 @@ defm V_RCP_F64 : VOP1Inst <"v_rcp_f64", VOP_F64_F64, AMDGPUrcp>;
defm V_RSQ_F64 : VOP1Inst <"v_rsq_f64", VOP_F64_F64, AMDGPUrsq>;
} // End SchedRW = [WriteDouble];
-defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, fsqrt>;
-
let SchedRW = [WriteDouble] in {
defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, fsqrt>;
} // End SchedRW = [WriteDouble]
@@ -232,9 +222,9 @@ defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32>;
defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32>;
defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32>;
defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32>;
-defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>;
let SchedRW = [WriteDoubleAdd] in {
+defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>;
defm V_FREXP_MANT_F64 : VOP1Inst <"v_frexp_mant_f64", VOP_F64_F64, int_amdgcn_frexp_mant>;
defm V_FRACT_F64 : VOP1Inst <"v_fract_f64", VOP_F64_F64, AMDGPUfract>;
} // End SchedRW = [WriteDoubleAdd]
@@ -298,9 +288,7 @@ defm V_MOVRELS_B32 : VOP1Inst <"v_movrels_b32", VOP_I32_VI32_NO_EXT>;
defm V_MOVRELSD_B32 : VOP1Inst <"v_movrelsd_b32", VOP_NO_EXT<VOP_I32_I32>>;
} // End Uses = [M0, EXEC]
-let SchedRW = [WriteQuarterRate32] in {
defm V_MOV_FED_B32 : VOP1Inst <"v_mov_fed_b32", VOP_I32_I32>;
-}
// These instruction only exist on SI and CI
let SubtargetPredicate = isSICI in {
@@ -344,11 +332,15 @@ defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>;
defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>;
defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>;
defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>;
+let SchedRW = [WriteQuarterRate32] in {
defm V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>;
defm V_SQRT_F16 : VOP1Inst <"v_sqrt_f16", VOP_F16_F16, fsqrt>;
defm V_RSQ_F16 : VOP1Inst <"v_rsq_f16", VOP_F16_F16, AMDGPUrsq>;
defm V_LOG_F16 : VOP1Inst <"v_log_f16", VOP_F16_F16, flog2>;
defm V_EXP_F16 : VOP1Inst <"v_exp_f16", VOP_F16_F16, fexp2>;
+defm V_SIN_F16 : VOP1Inst <"v_sin_f16", VOP_F16_F16, AMDGPUsin>;
+defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
+} // End SchedRW = [WriteQuarterRate32]
defm V_FREXP_MANT_F16 : VOP1Inst <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
defm V_FREXP_EXP_I16_F16 : VOP1Inst <"v_frexp_exp_i16_f16", VOP_I16_F16, int_amdgcn_frexp_exp>;
defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16, ffloor>;
@@ -356,8 +348,6 @@ defm V_CEIL_F16 : VOP1Inst <"v_ceil_f16", VOP_F16_F16, fceil>;
defm V_TRUNC_F16 : VOP1Inst <"v_trunc_f16", VOP_F16_F16, ftrunc>;
defm V_RNDNE_F16 : VOP1Inst <"v_rndne_f16", VOP_F16_F16, frint>;
defm V_FRACT_F16 : VOP1Inst <"v_fract_f16", VOP_F16_F16, AMDGPUfract>;
-defm V_SIN_F16 : VOP1Inst <"v_sin_f16", VOP_F16_F16, AMDGPUsin>;
-defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
}
@@ -392,6 +382,12 @@ let SubtargetPredicate = isGFX9 in {
def V_SWAP_B32 : VOP1_Pseudo <"v_swap_b32", VOP_SWAP_I32, [], 1>;
}
+defm V_SCREEN_PARTITION_4SE_B32 : VOP1Inst <"v_screen_partition_4se_b32", VOP_I32_I32>;
+
+defm V_SAT_PK_U8_I16 : VOP1Inst<"v_sat_pk_u8_i16", VOP_I32_I32>;
+defm V_CVT_NORM_I16_F16 : VOP1Inst<"v_cvt_norm_i16_f16", VOP_I16_F16>;
+defm V_CVT_NORM_U16_F16 : VOP1Inst<"v_cvt_norm_u16_f16", VOP_I16_F16>;
+
} // End SubtargetPredicate = isGFX9
//===----------------------------------------------------------------------===//
@@ -521,7 +517,7 @@ multiclass VOP1Only_Real_vi <bits<10> op> {
}
}
-multiclass VOP1_Real_vi <bits<10> op> {
+multiclass VOP1_Real_e32e64_vi <bits<10> op> {
let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
def _e32_vi :
VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>,
@@ -530,6 +526,10 @@ multiclass VOP1_Real_vi <bits<10> op> {
VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
VOP3e_vi <!add(0x140, op), !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
+}
+
+multiclass VOP1_Real_vi <bits<10> op> {
+ defm NAME : VOP1_Real_e32e64_vi <op>;
def _sdwa_vi :
VOP_SDWA_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
@@ -593,9 +593,9 @@ defm V_FRACT_F64 : VOP1_Real_vi <0x32>;
defm V_FREXP_EXP_I32_F32 : VOP1_Real_vi <0x33>;
defm V_FREXP_MANT_F32 : VOP1_Real_vi <0x34>;
defm V_CLREXCP : VOP1_Real_vi <0x35>;
-defm V_MOVRELD_B32 : VOP1_Real_vi <0x36>;
-defm V_MOVRELS_B32 : VOP1_Real_vi <0x37>;
-defm V_MOVRELSD_B32 : VOP1_Real_vi <0x38>;
+defm V_MOVRELD_B32 : VOP1_Real_e32e64_vi <0x36>;
+defm V_MOVRELS_B32 : VOP1_Real_e32e64_vi <0x37>;
+defm V_MOVRELSD_B32 : VOP1_Real_e32e64_vi <0x38>;
defm V_TRUNC_F64 : VOP1_Real_vi <0x17>;
defm V_CEIL_F64 : VOP1_Real_vi <0x18>;
defm V_FLOOR_F64 : VOP1_Real_vi <0x1A>;
@@ -622,6 +622,10 @@ defm V_SIN_F16 : VOP1_Real_vi <0x49>;
defm V_COS_F16 : VOP1_Real_vi <0x4a>;
defm V_SWAP_B32 : VOP1Only_Real_vi <0x51>;
+defm V_SAT_PK_U8_I16 : VOP1_Real_vi<0x4f>;
+defm V_CVT_NORM_I16_F16 : VOP1_Real_vi<0x4d>;
+defm V_CVT_NORM_U16_F16 : VOP1_Real_vi<0x4e>;
+
// Copy of v_mov_b32 with $vdst as a use operand for use with VGPR
// indexing mode. vdst can't be treated as a def for codegen purposes,
// and an implicit use and def of the super register should be added.
@@ -694,3 +698,23 @@ def : GCNPat <
>;
} // End OtherPredicates = [isVI]
+
+//===----------------------------------------------------------------------===//
+// GFX9
+//===----------------------------------------------------------------------===//
+
+multiclass VOP1_Real_gfx9 <bits<10> op> {
+ let AssemblerPredicates = [isGFX9], DecoderNamespace = "GFX9" in {
+ defm NAME : VOP1_Real_e32e64_vi <op>;
+ }
+
+ def _sdwa_gfx9 :
+ VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
+
+ // For now left dpp only for asm/dasm
+ // TODO: add corresponding pseudo
+ def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>;
+}
+
+defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>;
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td
index ef90b68db1a8..5ec1a15c5cd2 100644
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@@ -61,17 +61,9 @@ class VOP2_SDWA9Ae <bits<6> op, VOPProfile P> : VOP_SDWA9Ae <P> {
}
class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suffix = "_e32"> :
- InstSI <P.Outs32, P.Ins32, "", pattern>,
- VOP <opName>,
- SIMCInstr <opName#suffix, SIEncodingFamily.NONE>,
- MnemonicAlias<opName#suffix, opName> {
+ VOP_Pseudo <opName, suffix, P, P.Outs32, P.Ins32, "", pattern> {
- let isPseudo = 1;
- let isCodeGenOnly = 1;
- let UseNamedOperandTable = 1;
-
- string Mnemonic = opName;
- string AsmOperands = P.Asm32;
+ let AsmOperands = P.Asm32;
let Size = 4;
let mayLoad = 0;
@@ -84,8 +76,6 @@ class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suf
let Uses = [EXEC];
let AsmVariantName = AMDGPUAsmVariants.Default;
-
- VOPProfile Pfl = P;
}
class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily> :
@@ -107,6 +97,7 @@ class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily> :
let TSFlags = ps.TSFlags;
let UseNamedOperandTable = ps.UseNamedOperandTable;
let Uses = ps.Uses;
+ let Defs = ps.Defs;
}
class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
@@ -177,6 +168,10 @@ multiclass VOP2eInst <string opName,
let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]) in {
def _e32 : VOP2_Pseudo <opName, P>,
Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
+
+ def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
+ let AsmMatchConverter = "cvtSdwaVOP2b";
+ }
}
def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
@@ -303,12 +298,30 @@ def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
let Src0RC32 = VCSrc_b32; // See comment in def VOP2b_I32_I1_I32_I32_I1 above.
let Asm32 = "$vdst, $src0, $src1, vcc";
let Asm64 = "$vdst, $src0, $src1, $src2";
+ let AsmSDWA = "$vdst, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
+ let AsmSDWA9 = "$vdst, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
+ let AsmDPP = "$vdst, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
+
let Outs32 = (outs DstRC:$vdst);
let Outs64 = (outs DstRC:$vdst);
// Suppress src2 implied by type since the 32-bit encoding uses an
// implicit VCC use.
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1);
+
+ let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
+ Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
+ clampmod:$clamp,
+ dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel, src1_sel:$src1_sel);
+
+ let InsDPP = (ins DstRCDPP:$old,
+ Src0DPP:$src0,
+ Src1DPP:$src1,
+ dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+ let HasExt = 1;
+ let HasSDWA9 = 1;
}
def VOP_READLANE : VOPProfile<[i32, i32, i32]> {
@@ -322,15 +335,17 @@ def VOP_READLANE : VOPProfile<[i32, i32, i32]> {
let HasSDWA9 = 0;
}
-def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> {
+def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> {
let Outs32 = (outs VGPR_32:$vdst);
let Outs64 = Outs32;
- let Ins32 = (ins SCSrc_b32:$src0, SCSrc_b32:$src1);
+ let Ins32 = (ins SCSrc_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in);
let Ins64 = Ins32;
let Asm32 = " $vdst, $src0, $src1";
let Asm64 = Asm32;
let HasExt = 0;
let HasSDWA9 = 0;
+ let HasSrc2 = 0;
+ let HasSrc2Mods = 0;
}
//===----------------------------------------------------------------------===//
@@ -398,7 +413,10 @@ let isConvergent = 1, Uses = []<Register> in {
def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE,
[(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))], "">;
-def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, [], "">;
+let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
+def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE,
+ [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))], "">;
+} // End $vdst = $vdst_in, DisableEncoding $vdst_in
} // End isConvergent = 1
defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_NO_EXT<VOP_I32_I32_I32>>;
@@ -407,11 +425,11 @@ defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32
defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>;
defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>;
defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT<VOP_I32_F32_I32>>; // TODO: set "Uses = dst"
-defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>;
-defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>;
+defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpknorm_i16_f32>;
+defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpknorm_u16_f32>;
defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpkrtz_f16_f32>;
-defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_I32_I32_I32>>;
-defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32>>;
+defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_I32_I32_I32>, AMDGPUpk_u16_u32>;
+defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32>, AMDGPUpk_i16_i32>;
} // End SubtargetPredicate = isGCN
@@ -473,6 +491,19 @@ defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>;
} // End SubtargetPredicate = Has16BitInsts
+let SubtargetPredicate = HasDLInsts in {
+
+defm V_XNOR_B32 : VOP2Inst <"v_xnor_b32", VOP_I32_I32_I32>;
+
+let Constraints = "$vdst = $src2",
+ DisableEncoding="$src2",
+ isConvertibleToThreeAddress = 1,
+ isCommutable = 1 in {
+defm V_FMAC_F32 : VOP2Inst <"v_fmac_f32", VOP_MAC_F32>;
+}
+
+} // End SubtargetPredicate = HasDLInsts
+
// Note: 16-bit instructions produce a 0 result in the high 16-bits.
multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst> {
@@ -639,7 +670,7 @@ defm V_SUBBREV_U32 : VOP2be_Real_e32e64_si <0x2a>;
defm V_READLANE_B32 : VOP2_Real_si <0x01>;
-let InOperandList = (ins SSrc_b32:$src0, SCSrc_b32:$src1) in {
+let InOperandList = (ins SSrc_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) in {
defm V_WRITELANE_B32 : VOP2_Real_si <0x02>;
}
@@ -824,7 +855,7 @@ multiclass VOP2_Real_e32e64_vi <bits<6> op> :
def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
}
-defm V_CNDMASK_B32 : Base_VOP2_Real_e32e64_vi <0x0>;
+defm V_CNDMASK_B32 : VOP2_Real_e32e64_vi <0x0>;
defm V_ADD_F32 : VOP2_Real_e32e64_vi <0x1>;
defm V_SUB_F32 : VOP2_Real_e32e64_vi <0x2>;
defm V_SUBREV_F32 : VOP2_Real_e32e64_vi <0x3>;
@@ -926,3 +957,10 @@ def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>;
def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>;
} // End SubtargetPredicate = isVI
+
+let SubtargetPredicate = HasDLInsts in {
+
+defm V_FMAC_F32 : VOP2_Real_e32e64_vi <0x3b>;
+defm V_XNOR_B32 : VOP2_Real_e32e64_vi <0x3d>;
+
+} // End SubtargetPredicate = HasDLInsts
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td
index aedbfa015bf6..17ae08dc6267 100644
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -153,19 +153,24 @@ class getVOP3VCC<VOPProfile P, SDPatternOperator node> {
(i1 VCC)))];
}
-class VOP3Features<bit Clamp, bit OpSel> {
+class VOP3Features<bit Clamp, bit OpSel, bit Packed> {
bit HasClamp = Clamp;
bit HasOpSel = OpSel;
+ bit IsPacked = Packed;
}
-def VOP3_REGULAR : VOP3Features<0, 0>;
-def VOP3_CLAMP : VOP3Features<1, 0>;
-def VOP3_OPSEL : VOP3Features<1, 1>;
+def VOP3_REGULAR : VOP3Features<0, 0, 0>;
+def VOP3_CLAMP : VOP3Features<1, 0, 0>;
+def VOP3_OPSEL : VOP3Features<1, 1, 0>;
+def VOP3_PACKED : VOP3Features<1, 1, 1>;
class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProfile<P.ArgVT> {
let HasClamp = !if(Features.HasClamp, 1, P.HasClamp);
let HasOpSel = !if(Features.HasOpSel, 1, P.HasOpSel);
+ let IsPacked = !if(Features.IsPacked, 1, P.IsPacked);
+
+ let HasModifiers = !if(Features.IsPacked, 1, P.HasModifiers);
// FIXME: Hack to stop printing _e64
let Outs64 = (outs DstRC.RegClass:$vdst);
@@ -283,10 +288,10 @@ def V_MAD_F32 : VOP3Inst <"v_mad_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fmad>;
def V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
def V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
def V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fma>;
-def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>;
def V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>;
let SchedRW = [WriteDoubleAdd] in {
+def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>;
def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, fadd, 1>;
def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, fmul, 1>;
def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum, 1>;
@@ -355,14 +360,12 @@ def V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPU
def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1> {
let SchedRW = [WriteFloatFMA, WriteSALU];
- let hasExtraSrcRegAllocReq = 1;
let AsmMatchConverter = "";
}
// Double precision division pre-scale.
def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, [], 1> {
let SchedRW = [WriteDouble, WriteSALU];
- let hasExtraSrcRegAllocReq = 1;
let AsmMatchConverter = "";
}
@@ -376,6 +379,7 @@ def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I3
let SchedRW = [WriteDouble];
}
+let SchedRW = [Write64Bit] in {
// These instructions only exist on SI and CI
let SubtargetPredicate = isSICI in {
def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>>;
@@ -389,17 +393,17 @@ def V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>>;
def V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile<VOP_I64_I32_I64>>;
def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>>;
} // End SubtargetPredicate = isVI
-
+} // End SchedRW = [Write64Bit]
let SubtargetPredicate = isCIVI in {
-let Constraints = "@earlyclobber $vdst" in {
+let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in {
def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>;
def V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_V4I32, VOP3_CLAMP>>;
-} // End Constraints = "@earlyclobber $vdst"
+} // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32]
let isCommutable = 1 in {
-let SchedRW = [WriteDouble, WriteSALU] in {
+let SchedRW = [WriteQuarterRate32, WriteSALU] in {
def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
} // End SchedRW = [WriteDouble, WriteSALU]
@@ -408,16 +412,16 @@ def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
} // End SubtargetPredicate = isCIVI
-let SubtargetPredicate = Has16BitInsts in {
-
-let renamedInGFX9 = 1 in {
-def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>;
+def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup> {
+ let Predicates = [Has16BitInsts, isVIOnly];
}
-let SubtargetPredicate = isGFX9 in {
-def V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
+def V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9",
+ VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUdiv_fixup> {
+ let renamedInGFX9 = 1;
+ let Predicates = [Has16BitInsts, isGFX9];
}
-let isCommutable = 1 in {
+let SubtargetPredicate = Has16BitInsts, isCommutable = 1 in {
let renamedInGFX9 = 1 in {
def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
@@ -438,15 +442,14 @@ def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f1
def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>>;
def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>;
-} // End isCommutable = 1
-} // End SubtargetPredicate = Has16BitInsts
+} // End SubtargetPredicate = Has16BitInsts, isCommutable = 1
let SubtargetPredicate = isVI in {
def V_INTERP_P1_F32_e64 : VOP3Interp <"v_interp_p1_f32", VOP3_INTERP>;
def V_INTERP_P2_F32_e64 : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>;
def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>;
-def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUperm>;
} // End SubtargetPredicate = isVI
let Predicates = [Has16BitInsts] in {
@@ -697,7 +700,7 @@ multiclass VOP3Interp_F16_Real_vi<bits<10> op> {
let AssemblerPredicates = [isGFX9], DecoderNamespace = "GFX9" in {
multiclass VOP3_F16_Real_gfx9<bits<10> op, string OpName, string AsmName> {
- def _vi : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>,
+ def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>,
VOP3e_vi <op, !cast<VOP3_Pseudo>(OpName).Pfl> {
VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName);
let AsmString = AsmName # ps.AsmOperands;
@@ -705,7 +708,7 @@ multiclass VOP3_F16_Real_gfx9<bits<10> op, string OpName, string AsmName> {
}
multiclass VOP3OpSel_F16_Real_gfx9<bits<10> op, string AsmName> {
- def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX9>,
+ def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX9>,
VOP3OpSel_gfx9 <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
VOP3_Pseudo ps = !cast<VOP3_Pseudo>(NAME);
let AsmString = AsmName # ps.AsmOperands;
@@ -713,7 +716,7 @@ multiclass VOP3OpSel_F16_Real_gfx9<bits<10> op, string AsmName> {
}
multiclass VOP3Interp_F16_Real_gfx9<bits<10> op, string OpName, string AsmName> {
- def _vi : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>,
+ def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>,
VOP3Interp_vi <op, !cast<VOP3_Pseudo>(OpName).Pfl> {
VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName);
let AsmString = AsmName # ps.AsmOperands;
@@ -721,9 +724,9 @@ multiclass VOP3Interp_F16_Real_gfx9<bits<10> op, string OpName, string AsmName>
}
multiclass VOP3_Real_gfx9<bits<10> op, string AsmName> {
- def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX9>,
- VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
- VOP3_Pseudo ps = !cast<VOP3_Pseudo>(NAME);
+ def _gfx9 : VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.GFX9>,
+ VOP3e_vi <op, !cast<VOP_Pseudo>(NAME).Pfl> {
+ VOP_Pseudo ps = !cast<VOP_Pseudo>(NAME);
let AsmString = AsmName # ps.AsmOperands;
}
}
diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td
index eeee8b36c175..5c78ada3211e 100644
--- a/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -68,6 +68,67 @@ def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I1
def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>;
def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>;
+multiclass MadFmaMixPats<SDPatternOperator fma_like,
+ Instruction mix_inst,
+ Instruction mixlo_inst,
+ Instruction mixhi_inst> {
+ def : GCNPat <
+ (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
+ (mixlo_inst $src0_modifiers, $src0,
+ $src1_modifiers, $src1,
+ $src2_modifiers, $src2,
+ DSTCLAMP.NONE,
+ (i32 (IMPLICIT_DEF)))
+ >;
+
+ // FIXME: Special case handling for maxhi (especially for clamp)
+ // because dealing with the write to high half of the register is
+ // difficult.
+ def : GCNPat <
+ (build_vector f16:$elt0, (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
+ (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ $src1_modifiers, $src1,
+ $src2_modifiers, $src2,
+ DSTCLAMP.NONE,
+ $elt0))
+ >;
+
+ def : GCNPat <
+ (build_vector
+ f16:$elt0,
+ (AMDGPUclamp (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
+ (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ $src1_modifiers, $src1,
+ $src2_modifiers, $src2,
+ DSTCLAMP.ENABLE,
+ $elt0))
+ >;
+
+ def : GCNPat <
+ (AMDGPUclamp (build_vector
+ (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))),
+ (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))),
+ (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0,
+ $hi_src1_modifiers, $hi_src1,
+ $hi_src2_modifiers, $hi_src2,
+ DSTCLAMP.ENABLE,
+ (mixlo_inst $lo_src0_modifiers, $lo_src0,
+ $lo_src1_modifiers, $lo_src1,
+ $lo_src2_modifiers, $lo_src2,
+ DSTCLAMP.ENABLE,
+ (i32 (IMPLICIT_DEF)))))
+ >;
+}
let SubtargetPredicate = HasMadMixInsts in {
// These are VOP3a-like opcodes which accept no omod.
@@ -84,68 +145,41 @@ def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16
}
}
-def : GCNPat <
- (f16 (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
- (V_MAD_MIXLO_F16 $src0_modifiers, $src0,
- $src1_modifiers, $src1,
- $src2_modifiers, $src2,
- DSTCLAMP.NONE,
- (i32 (IMPLICIT_DEF)))
->;
+defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
+} // End SubtargetPredicate = HasMadMixInsts
-// FIXME: Special case handling for maxhi (especially for clamp)
-// because dealing with the write to high half of the register is
-// difficult.
-def : GCNPat <
- (build_vector f16:$elt0, (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
- (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0,
- $src1_modifiers, $src1,
- $src2_modifiers, $src2,
- DSTCLAMP.NONE,
- $elt0))
->;
-def : GCNPat <
- (build_vector
- f16:$elt0,
- (AMDGPUclamp (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
- (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0,
- $src1_modifiers, $src1,
- $src2_modifiers, $src2,
- DSTCLAMP.ENABLE,
- $elt0))
->;
+// Essentially the same as the mad_mix versions
+let SubtargetPredicate = HasFmaMixInsts in {
+let isCommutable = 1 in {
+def V_FMA_MIX_F32 : VOP3_VOP3PInst<"v_fma_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
-def : GCNPat <
- (AMDGPUclamp (build_vector
- (fpround (fmad (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))),
- (fpround (fmad (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))),
- (v2f16 (V_MAD_MIXHI_F16 $hi_src0_modifiers, $hi_src0,
- $hi_src1_modifiers, $hi_src1,
- $hi_src2_modifiers, $hi_src2,
- DSTCLAMP.ENABLE,
- (V_MAD_MIXLO_F16 $lo_src0_modifiers, $lo_src0,
- $lo_src1_modifiers, $lo_src1,
- $lo_src2_modifiers, $lo_src2,
- DSTCLAMP.ENABLE,
- (i32 (IMPLICIT_DEF)))))
->;
+// Clamp modifier is applied after conversion to f16.
+def V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
+
+let ClampLo = 0, ClampHi = 1 in {
+def V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
+}
+}
+
+defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
+}
-} // End SubtargetPredicate = [HasMadMixInsts]
+let SubtargetPredicate = HasDLInsts in {
+
+def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>, AMDGPUfdot2>;
+def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2>;
+def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2>;
+def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4>;
+def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4>;
+def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8>;
+def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8>;
+
+} // End SubtargetPredicate = HasDLInsts
multiclass VOP3P_Real_vi<bits<10> op> {
- def _vi : VOP3P_Real<!cast<VOP3P_Pseudo>(NAME), SIEncodingFamily.VI>,
- VOP3Pe <op, !cast<VOP3P_Pseudo>(NAME).Pfl> {
+ def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
+ VOP3Pe <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
let AssemblerPredicates = [HasVOP3PInsts];
let DecoderNamespace = "VI";
}
@@ -172,6 +206,33 @@ defm V_PK_MUL_F16 : VOP3P_Real_vi <0x390>;
defm V_PK_MIN_F16 : VOP3P_Real_vi <0x391>;
defm V_PK_MAX_F16 : VOP3P_Real_vi <0x392>;
+
+let SubtargetPredicate = HasMadMixInsts in {
defm V_MAD_MIX_F32 : VOP3P_Real_vi <0x3a0>;
defm V_MAD_MIXLO_F16 : VOP3P_Real_vi <0x3a1>;
defm V_MAD_MIXHI_F16 : VOP3P_Real_vi <0x3a2>;
+}
+
+let SubtargetPredicate = HasFmaMixInsts in {
+let DecoderNamespace = "GFX9_DL" in {
+// The mad_mix instructions were renamed and their behaviors changed,
+// but the opcode stayed the same so we need to put these in a
+// different DecoderNamespace to avoid the ambiguity.
+defm V_FMA_MIX_F32 : VOP3P_Real_vi <0x3a0>;
+defm V_FMA_MIXLO_F16 : VOP3P_Real_vi <0x3a1>;
+defm V_FMA_MIXHI_F16 : VOP3P_Real_vi <0x3a2>;
+}
+}
+
+
+let SubtargetPredicate = HasDLInsts in {
+
+defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x3a3>;
+defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x3a6>;
+defm V_DOT2_U32_U16 : VOP3P_Real_vi <0x3a7>;
+defm V_DOT4_I32_I8 : VOP3P_Real_vi <0x3a8>;
+defm V_DOT4_U32_U8 : VOP3P_Real_vi <0x3a9>;
+defm V_DOT8_I32_I4 : VOP3P_Real_vi <0x3aa>;
+defm V_DOT8_U32_U4 : VOP3P_Real_vi <0x3ab>;
+
+} // End SubtargetPredicate = HasDLInsts
diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td
index 146870e21531..cc6b8116afee 100644
--- a/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/lib/Target/AMDGPU/VOPCInstructions.td
@@ -30,8 +30,8 @@ class VOPC_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> {
let Inst{31-25} = 0x3e; // encoding
// VOPC disallows dst_sel and dst_unused as they have no effect on destination
- let Inst{42-40} = SDWA.DWORD;
- let Inst{44-43} = SDWA.UNUSED_PRESERVE;
+ let Inst{42-40} = 0;
+ let Inst{44-43} = 0;
}
class VOPC_SDWA9e <bits<8> op, VOPProfile P> : VOP_SDWA9Be <P> {
@@ -106,6 +106,7 @@ class VOPC_Real <VOPC_Pseudo ps, int EncodingFamily> :
let TSFlags = ps.TSFlags;
let UseNamedOperandTable = ps.UseNamedOperandTable;
let Uses = ps.Uses;
+ let Defs = ps.Defs;
}
class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td
index f24ff5ce8dea..f0f7f259f71d 100644
--- a/lib/Target/AMDGPU/VOPInstructions.td
+++ b/lib/Target/AMDGPU/VOPInstructions.td
@@ -38,6 +38,23 @@ class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> :
let Uses = [EXEC];
}
+class VOP_Pseudo <string opName, string suffix, VOPProfile P, dag outs, dag ins,
+ string asm, list<dag> pattern> :
+ InstSI <outs, ins, asm, pattern>,
+ VOP <opName>,
+ SIMCInstr <opName#suffix, SIEncodingFamily.NONE>,
+ MnemonicAlias<opName#suffix, opName> {
+
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ let UseNamedOperandTable = 1;
+
+ string Mnemonic = opName;
+ VOPProfile Pfl = P;
+
+ string AsmOperands;
+}
+
class VOP3Common <dag outs, dag ins, string asm = "",
list<dag> pattern = [], bit HasMods = 0,
bit VOP3Only = 0> :
@@ -66,26 +83,18 @@ class VOP3Common <dag outs, dag ins, string asm = "",
class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
bit VOP3Only = 0, bit isVOP3P = 0, bit isVop3OpSel = 0> :
- InstSI <P.Outs64,
- !if(isVop3OpSel,
- P.InsVOP3OpSel,
- !if(!and(isVOP3P, P.IsPacked), P.InsVOP3P, P.Ins64)),
- "",
- pattern>,
- VOP <opName>,
- SIMCInstr<opName#"_e64", SIEncodingFamily.NONE>,
- MnemonicAlias<opName#"_e64", opName> {
+ VOP_Pseudo <opName, "_e64", P, P.Outs64,
+ !if(isVop3OpSel,
+ P.InsVOP3OpSel,
+ !if(!and(isVOP3P, P.IsPacked), P.InsVOP3P, P.Ins64)),
+ "", pattern> {
- let isPseudo = 1;
- let isCodeGenOnly = 1;
- let UseNamedOperandTable = 1;
let VOP3_OPSEL = isVop3OpSel;
let IsPacked = P.IsPacked;
- string Mnemonic = opName;
- string AsmOperands = !if(isVop3OpSel,
- P.AsmVOP3OpSel,
- !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64));
+ let AsmOperands = !if(isVop3OpSel,
+ P.AsmVOP3OpSel,
+ !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64));
let Size = 8;
let mayLoad = 0;
@@ -120,8 +129,6 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
!if(!or(P.HasModifiers, !or(P.HasOMod, P.HasIntClamp)),
"cvtVOP3",
""));
-
- VOPProfile Pfl = P;
}
class VOP3P_Pseudo <string opName, VOPProfile P, list<dag> pattern = []> :
@@ -129,7 +136,7 @@ class VOP3P_Pseudo <string opName, VOPProfile P, list<dag> pattern = []> :
let VOP3P = 1;
}
-class VOP3_Real <VOP3_Pseudo ps, int EncodingFamily> :
+class VOP3_Real <VOP_Pseudo ps, int EncodingFamily> :
InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
SIMCInstr <ps.PseudoInstr, EncodingFamily> {
@@ -149,13 +156,14 @@ class VOP3_Real <VOP3_Pseudo ps, int EncodingFamily> :
let TSFlags = ps.TSFlags;
let UseNamedOperandTable = ps.UseNamedOperandTable;
let Uses = ps.Uses;
+ let Defs = ps.Defs;
VOPProfile Pfl = ps.Pfl;
}
// XXX - Is there any reason to distingusih this from regular VOP3
// here?
-class VOP3P_Real<VOP3P_Pseudo ps, int EncodingFamily> :
+class VOP3P_Real<VOP_Pseudo ps, int EncodingFamily> :
VOP3_Real<ps, EncodingFamily>;
class VOP3a<VOPProfile P> : Enc64 {
@@ -324,13 +332,13 @@ class VOP_SDWAe<VOPProfile P> : Enc64 {
bits<1> clamp;
let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
- let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, SDWA.DWORD);
- let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE);
+ let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, 0);
+ let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, 0);
let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0);
- let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD);
+ let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, 0);
let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0);
let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0);
- let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD);
+ let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, 0);
let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0);
let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0);
}
@@ -358,11 +366,11 @@ class VOP_SDWA9e<VOPProfile P> : Enc64 {
bits<1> src1_sgpr;
let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
- let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD);
+ let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, 0);
let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0);
let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0);
let Inst{55} = !if(P.HasSrc0, src0{8}, 0);
- let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD);
+ let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, 0);
let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0);
let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0);
let Inst{63} = 0; // src1_sgpr - should be specified in subclass
@@ -375,8 +383,8 @@ class VOP_SDWA9Ae<VOPProfile P> : VOP_SDWA9e<P> {
bits<1> clamp;
bits<2> omod;
- let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, SDWA.DWORD);
- let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE);
+ let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, 0);
+ let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, 0);
let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0);
let Inst{47-46} = !if(P.HasSDWAOMod, omod{1-0}, 0);
}
diff --git a/lib/Target/ARC/ARCAsmPrinter.cpp b/lib/Target/ARC/ARCAsmPrinter.cpp
index af9dd968b7a6..8c13da0484fd 100644
--- a/lib/Target/ARC/ARCAsmPrinter.cpp
+++ b/lib/Target/ARC/ARCAsmPrinter.cpp
@@ -25,7 +25,6 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
@@ -34,6 +33,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include <algorithm>
using namespace llvm;
diff --git a/lib/Target/ARC/ARCBranchFinalize.cpp b/lib/Target/ARC/ARCBranchFinalize.cpp
index 9341e7bdda41..3b410fa383b7 100644
--- a/lib/Target/ARC/ARCBranchFinalize.cpp
+++ b/lib/Target/ARC/ARCBranchFinalize.cpp
@@ -112,7 +112,7 @@ static unsigned getCmpForPseudo(MachineInstr *MI) {
}
void ARCBranchFinalize::replaceWithBRcc(MachineInstr *MI) const {
- DEBUG(dbgs() << "Replacing pseudo branch with BRcc\n");
+ LLVM_DEBUG(dbgs() << "Replacing pseudo branch with BRcc\n");
unsigned CC = getCCForBRcc(MI->getOperand(3).getImm());
if (CC != -1U) {
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
@@ -128,8 +128,8 @@ void ARCBranchFinalize::replaceWithBRcc(MachineInstr *MI) const {
}
void ARCBranchFinalize::replaceWithCmpBcc(MachineInstr *MI) const {
- DEBUG(dbgs() << "Branch: " << *MI << "\n");
- DEBUG(dbgs() << "Replacing pseudo branch with Cmp + Bcc\n");
+ LLVM_DEBUG(dbgs() << "Branch: " << *MI << "\n");
+ LLVM_DEBUG(dbgs() << "Replacing pseudo branch with Cmp + Bcc\n");
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII->get(getCmpForPseudo(MI)))
.addReg(MI->getOperand(1).getReg())
@@ -141,8 +141,8 @@ void ARCBranchFinalize::replaceWithCmpBcc(MachineInstr *MI) const {
}
bool ARCBranchFinalize::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "Running ARC Branch Finalize on "
- << MF.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Running ARC Branch Finalize on " << MF.getName()
+ << "\n");
std::vector<MachineInstr *> Branches;
bool Changed = false;
unsigned MaxSize = 0;
@@ -156,7 +156,7 @@ bool ARCBranchFinalize::runOnMachineFunction(MachineFunction &MF) {
for (auto &MI : MBB) {
unsigned Size = TII->getInstSizeInBytes(MI);
if (Size > 8 || Size == 0) {
- DEBUG(dbgs() << "Unknown (or size 0) size for: " << MI << "\n");
+ LLVM_DEBUG(dbgs() << "Unknown (or size 0) size for: " << MI << "\n");
} else {
MaxSize += Size;
}
@@ -172,8 +172,8 @@ bool ARCBranchFinalize::runOnMachineFunction(MachineFunction &MF) {
isInt<9>(MaxSize) ? replaceWithBRcc(P.first) : replaceWithCmpBcc(P.first);
}
- DEBUG(dbgs() << "Estimated function size for " << MF.getName()
- << ": " << MaxSize << "\n");
+ LLVM_DEBUG(dbgs() << "Estimated function size for " << MF.getName() << ": "
+ << MaxSize << "\n");
return Changed;
}
diff --git a/lib/Target/ARC/ARCFrameLowering.cpp b/lib/Target/ARC/ARCFrameLowering.cpp
index 195a781950be..ca59cb2baaa7 100644
--- a/lib/Target/ARC/ARCFrameLowering.cpp
+++ b/lib/Target/ARC/ARCFrameLowering.cpp
@@ -59,8 +59,8 @@ static void generateStackAdjustment(MachineBasicBlock &MBB,
Positive = true;
}
- DEBUG(dbgs() << "Internal: adjust stack by: " << Amount << "," << AbsAmount
- << "\n");
+ LLVM_DEBUG(dbgs() << "Internal: adjust stack by: " << Amount << ","
+ << AbsAmount << "\n");
assert((AbsAmount % 4 == 0) && "Stack adjustments must be 4-byte aligned.");
if (isUInt<6>(AbsAmount))
@@ -88,8 +88,7 @@ determineLastCalleeSave(const std::vector<CalleeSavedInfo> &CSI) {
void ARCFrameLowering::determineCalleeSaves(MachineFunction &MF,
BitVector &SavedRegs,
RegScavenger *RS) const {
- DEBUG(dbgs() << "Determine Callee Saves: " << MF.getName()
- << "\n");
+ LLVM_DEBUG(dbgs() << "Determine Callee Saves: " << MF.getName() << "\n");
TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
SavedRegs.set(ARC::BLINK);
}
@@ -115,7 +114,7 @@ void ARCFrameLowering::adjustStackToMatchRecords(
/// registers onto the stack, when enough callee saved registers are required.
void ARCFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- DEBUG(dbgs() << "Emit Prologue: " << MF.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Emit Prologue: " << MF.getName() << "\n");
auto *AFI = MF.getInfo<ARCFunctionInfo>();
MachineModuleInfo &MMI = MF.getMMI();
MCContext &Context = MMI.getContext();
@@ -133,7 +132,7 @@ void ARCFrameLowering::emitPrologue(MachineFunction &MF,
unsigned AlreadyAdjusted = 0;
if (MF.getFunction().isVarArg()) {
// Add in the varargs area here first.
- DEBUG(dbgs() << "Varargs\n");
+ LLVM_DEBUG(dbgs() << "Varargs\n");
unsigned VarArgsBytes = MFI.getObjectSize(AFI->getVarArgsFrameIndex());
BuildMI(MBB, MBBI, dl, TII->get(ARC::SUB_rru6))
.addReg(ARC::SP)
@@ -141,7 +140,7 @@ void ARCFrameLowering::emitPrologue(MachineFunction &MF,
.addImm(VarArgsBytes);
}
if (hasFP(MF)) {
- DEBUG(dbgs() << "Saving FP\n");
+ LLVM_DEBUG(dbgs() << "Saving FP\n");
BuildMI(MBB, MBBI, dl, TII->get(ARC::ST_AW_rs9))
.addReg(ARC::SP, RegState::Define)
.addReg(ARC::FP)
@@ -150,7 +149,7 @@ void ARCFrameLowering::emitPrologue(MachineFunction &MF,
AlreadyAdjusted += 4;
}
if (UseSaveRestoreFunclet && Last > ARC::R14) {
- DEBUG(dbgs() << "Creating store funclet.\n");
+ LLVM_DEBUG(dbgs() << "Creating store funclet.\n");
// BL to __save_r13_to_<TRI->getRegAsmName()>
StackSlotsUsedByFunclet = Last - ARC::R12;
BuildMI(MBB, MBBI, dl, TII->get(ARC::PUSH_S_BLINK));
@@ -166,20 +165,20 @@ void ARCFrameLowering::emitPrologue(MachineFunction &MF,
}
// If we haven't saved BLINK, but we need to...do that now.
if (MFI.hasCalls() && !SavedBlink) {
- DEBUG(dbgs() << "Creating save blink.\n");
+ LLVM_DEBUG(dbgs() << "Creating save blink.\n");
BuildMI(MBB, MBBI, dl, TII->get(ARC::PUSH_S_BLINK));
AlreadyAdjusted += 4;
}
if (AFI->MaxCallStackReq > 0)
MFI.setStackSize(MFI.getStackSize() + AFI->MaxCallStackReq);
// We have already saved some of the stack...
- DEBUG(dbgs() << "Adjusting stack by: "
- << (MFI.getStackSize() - AlreadyAdjusted) << "\n");
+ LLVM_DEBUG(dbgs() << "Adjusting stack by: "
+ << (MFI.getStackSize() - AlreadyAdjusted) << "\n");
generateStackAdjustment(MBB, MBBI, *ST.getInstrInfo(), dl,
-(MFI.getStackSize() - AlreadyAdjusted), ARC::SP);
if (hasFP(MF)) {
- DEBUG(dbgs() << "Setting FP from SP.\n");
+ LLVM_DEBUG(dbgs() << "Setting FP from SP.\n");
BuildMI(MBB, MBBI, dl,
TII->get(isUInt<6>(MFI.getStackSize()) ? ARC::ADD_rru6
: ARC::ADD_rrlimm),
@@ -235,7 +234,7 @@ void ARCFrameLowering::emitPrologue(MachineFunction &MF,
/// registers onto the stack, when enough callee saved registers are required.
void ARCFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- DEBUG(dbgs() << "Emit Epilogue: " << MF.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Emit Epilogue: " << MF.getName() << "\n");
auto *AFI = MF.getInfo<ARCFunctionInfo>();
const ARCInstrInfo *TII = MF.getSubtarget<ARCSubtarget>().getInstrInfo();
MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
@@ -304,7 +303,7 @@ void ARCFrameLowering::emitEpilogue(MachineFunction &MF,
// Relieve the varargs area if necessary.
if (MF.getFunction().isVarArg()) {
// Add in the varargs area here first.
- DEBUG(dbgs() << "Varargs\n");
+ LLVM_DEBUG(dbgs() << "Varargs\n");
unsigned VarArgsBytes = MFI.getObjectSize(AFI->getVarArgsFrameIndex());
BuildMI(MBB, MBBI, MBB.findDebugLoc(MBBI), TII->get(ARC::ADD_rru6))
.addReg(ARC::SP)
@@ -334,16 +333,16 @@ bool ARCFrameLowering::assignCalleeSavedSpillSlots(
if (hasFP(MF)) {
// Create a fixed slot for FP
int StackObj = MFI.CreateFixedSpillStackObject(4, CurOffset, true);
- DEBUG(dbgs() << "Creating fixed object (" << StackObj << ") for FP at "
- << CurOffset << "\n");
+ LLVM_DEBUG(dbgs() << "Creating fixed object (" << StackObj << ") for FP at "
+ << CurOffset << "\n");
(void)StackObj;
CurOffset -= 4;
}
if (MFI.hasCalls() || (UseSaveRestoreFunclet && Last > ARC::R14)) {
// Create a fixed slot for BLINK.
int StackObj = MFI.CreateFixedSpillStackObject(4, CurOffset, true);
- DEBUG(dbgs() << "Creating fixed object (" << StackObj << ") for BLINK at "
- << CurOffset << "\n");
+ LLVM_DEBUG(dbgs() << "Creating fixed object (" << StackObj
+ << ") for BLINK at " << CurOffset << "\n");
(void)StackObj;
CurOffset -= 4;
}
@@ -366,12 +365,12 @@ bool ARCFrameLowering::assignCalleeSavedSpillSlots(
continue;
if (I.getFrameIdx() == 0) {
I.setFrameIdx(MFI.CreateFixedSpillStackObject(4, CurOffset, true));
- DEBUG(dbgs() << "Creating fixed object (" << I.getFrameIdx()
- << ") for other register at " << CurOffset << "\n");
+ LLVM_DEBUG(dbgs() << "Creating fixed object (" << I.getFrameIdx()
+ << ") for other register at " << CurOffset << "\n");
} else {
MFI.setObjectOffset(I.getFrameIdx(), CurOffset);
- DEBUG(dbgs() << "Updating fixed object (" << I.getFrameIdx()
- << ") for other register at " << CurOffset << "\n");
+ LLVM_DEBUG(dbgs() << "Updating fixed object (" << I.getFrameIdx()
+ << ") for other register at " << CurOffset << "\n");
}
CurOffset -= 4;
}
@@ -382,8 +381,8 @@ bool ARCFrameLowering::spillCalleeSavedRegisters(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
const std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo *TRI) const {
- DEBUG(dbgs() << "Spill callee saved registers: "
- << MBB.getParent()->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Spill callee saved registers: "
+ << MBB.getParent()->getName() << "\n");
// There are routines for saving at least 3 registers (r13 to r15, etc.)
unsigned Last = determineLastCalleeSave(CSI);
if (UseSaveRestoreFunclet && Last > ARC::R14) {
@@ -399,8 +398,8 @@ bool ARCFrameLowering::spillCalleeSavedRegisters(
bool ARCFrameLowering::restoreCalleeSavedRegisters(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI) const {
- DEBUG(dbgs() << "Restore callee saved registers: "
- << MBB.getParent()->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Restore callee saved registers: "
+ << MBB.getParent()->getName() << "\n");
// There are routines for saving at least 3 registers (r13 to r15, etc.)
unsigned Last = determineLastCalleeSave(CSI);
if (UseSaveRestoreFunclet && Last > ARC::R14) {
@@ -414,16 +413,17 @@ bool ARCFrameLowering::restoreCalleeSavedRegisters(
void ARCFrameLowering::processFunctionBeforeFrameFinalized(
MachineFunction &MF, RegScavenger *RS) const {
const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
- DEBUG(dbgs() << "Process function before frame finalized: "
- << MF.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Process function before frame finalized: "
+ << MF.getName() << "\n");
MachineFrameInfo &MFI = MF.getFrameInfo();
- DEBUG(dbgs() << "Current stack size: " << MFI.getStackSize() << "\n");
+ LLVM_DEBUG(dbgs() << "Current stack size: " << MFI.getStackSize() << "\n");
const TargetRegisterClass *RC = &ARC::GPR32RegClass;
if (MFI.hasStackObjects()) {
int RegScavFI = MFI.CreateStackObject(
RegInfo->getSpillSize(*RC), RegInfo->getSpillAlignment(*RC), false);
RS->addScavengingFrameIndex(RegScavFI);
- DEBUG(dbgs() << "Created scavenging index RegScavFI=" << RegScavFI << "\n");
+ LLVM_DEBUG(dbgs() << "Created scavenging index RegScavFI=" << RegScavFI
+ << "\n");
}
}
@@ -440,7 +440,7 @@ static void emitRegUpdate(MachineBasicBlock &MBB,
MachineBasicBlock::iterator ARCFrameLowering::eliminateCallFramePseudoInstr(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
- DEBUG(dbgs() << "EmitCallFramePseudo: " << MF.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "EmitCallFramePseudo: " << MF.getName() << "\n");
const ARCInstrInfo *TII = MF.getSubtarget<ARCSubtarget>().getInstrInfo();
MachineInstr &Old = *I;
DebugLoc dl = Old.getDebugLoc();
diff --git a/lib/Target/ARC/ARCISelLowering.cpp b/lib/Target/ARC/ARCISelLowering.cpp
index 5991838a15c4..bf98af801406 100644
--- a/lib/Target/ARC/ARCISelLowering.cpp
+++ b/lib/Target/ARC/ARCISelLowering.cpp
@@ -72,7 +72,7 @@ static ARCCC::CondCode ISDCCtoARCCC(ISD::CondCode isdCC) {
ARCTargetLowering::ARCTargetLowering(const TargetMachine &TM,
const ARCSubtarget &Subtarget)
- : TargetLowering(TM), TM(TM), Subtarget(Subtarget) {
+ : TargetLowering(TM), Subtarget(Subtarget) {
// Set up the register classes.
addRegisterClass(MVT::i32, &ARC::GPR32RegClass);
@@ -486,8 +486,8 @@ SDValue ARCTargetLowering::LowerCallArguments(
EVT RegVT = VA.getLocVT();
switch (RegVT.getSimpleVT().SimpleTy) {
default: {
- DEBUG(errs() << "LowerFormalArguments Unhandled argument type: "
- << (unsigned)RegVT.getSimpleVT().SimpleTy << "\n");
+ LLVM_DEBUG(errs() << "LowerFormalArguments Unhandled argument type: "
+ << (unsigned)RegVT.getSimpleVT().SimpleTy << "\n");
llvm_unreachable("Unhandled LowerFormalArguments type.");
}
case MVT::i32:
diff --git a/lib/Target/ARC/ARCISelLowering.h b/lib/Target/ARC/ARCISelLowering.h
index cb06e9dcd79f..fec01b13a866 100644
--- a/lib/Target/ARC/ARCISelLowering.h
+++ b/lib/Target/ARC/ARCISelLowering.h
@@ -76,7 +76,6 @@ public:
Instruction *I = nullptr) const override;
private:
- const TargetMachine &TM;
const ARCSubtarget &Subtarget;
// Lower Operand helpers
diff --git a/lib/Target/ARC/ARCInstrFormats.td b/lib/Target/ARC/ARCInstrFormats.td
index 50edddd4ea9f..0a49b83ef16a 100644
--- a/lib/Target/ARC/ARCInstrFormats.td
+++ b/lib/Target/ARC/ARCInstrFormats.td
@@ -62,7 +62,7 @@ class InstARC<int sz, dag outs, dag ins, string asmstr, list<dag> pattern>
let Namespace = "ARC";
dag OutOperandList = outs;
dag InOperandList = ins;
- let AsmString = asmstr;
+ let AsmString = asmstr;
let Pattern = pattern;
let Size = sz;
}
diff --git a/lib/Target/ARC/ARCInstrInfo.cpp b/lib/Target/ARC/ARCInstrInfo.cpp
index a299e32c03a0..a8084f16893b 100644
--- a/lib/Target/ARC/ARCInstrInfo.cpp
+++ b/lib/Target/ARC/ARCInstrInfo.cpp
@@ -103,6 +103,10 @@ static ARCCC::CondCode GetOppositeBranchCondition(ARCCC::CondCode CC) {
return ARCCC::LE;
case ARCCC::GE:
return ARCCC::LT;
+ case ARCCC::VS:
+ return ARCCC::VC;
+ case ARCCC::VC:
+ return ARCCC::VS;
case ARCCC::LT:
return ARCCC::GE;
case ARCCC::LE:
@@ -169,7 +173,7 @@ bool ARCInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
bool CantAnalyze = false;
// Skip over DEBUG values and predicated nonterminators.
- while (I->isDebugValue() || !I->isTerminator()) {
+ while (I->isDebugInstr() || !I->isTerminator()) {
if (I == MBB.begin())
return false;
--I;
@@ -294,8 +298,8 @@ void ARCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
"Only support 4-byte stores to stack now.");
assert(ARC::GPR32RegClass.hasSubClassEq(RC) &&
"Only support GPR32 stores to stack now.");
- DEBUG(dbgs() << "Created store reg=" << printReg(SrcReg, TRI)
- << " to FrameIndex=" << FrameIndex << "\n");
+ LLVM_DEBUG(dbgs() << "Created store reg=" << printReg(SrcReg, TRI)
+ << " to FrameIndex=" << FrameIndex << "\n");
BuildMI(MBB, I, dl, get(ARC::ST_rs9))
.addReg(SrcReg, getKillRegState(isKill))
.addFrameIndex(FrameIndex)
@@ -321,8 +325,8 @@ void ARCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
"Only support 4-byte loads from stack now.");
assert(ARC::GPR32RegClass.hasSubClassEq(RC) &&
"Only support GPR32 stores to stack now.");
- DEBUG(dbgs() << "Created load reg=" << printReg(DestReg, TRI)
- << " from FrameIndex=" << FrameIndex << "\n");
+ LLVM_DEBUG(dbgs() << "Created load reg=" << printReg(DestReg, TRI)
+ << " from FrameIndex=" << FrameIndex << "\n");
BuildMI(MBB, I, dl, get(ARC::LD_rs9))
.addReg(DestReg, RegState::Define)
.addFrameIndex(FrameIndex)
diff --git a/lib/Target/ARC/ARCInstrInfo.td b/lib/Target/ARC/ARCInstrInfo.td
index edd853fe150d..525098c4ff66 100644
--- a/lib/Target/ARC/ARCInstrInfo.td
+++ b/lib/Target/ARC/ARCInstrInfo.td
@@ -125,18 +125,36 @@ multiclass ArcBinaryInst<bits<5> major, bits<6> mincode,
(ins GPR32:$B, GPR32:$C),
!strconcat(opasm, "\t$A, $B, $C"),
[]>;
+ def _f_rrr : F32_DOP_RR<major, mincode, 1, (outs GPR32:$A),
+ (ins GPR32:$B, GPR32:$C),
+ !strconcat(opasm, ".f\t$A, $B, $C"),
+ []>
+ { let Defs = [STATUS32]; }
// 2 register with unsigned 6-bit immediate variant.
def _rru6 : F32_DOP_RU6<major, mincode, 0, (outs GPR32:$A),
(ins GPR32:$B, immU6:$U6),
!strconcat(opasm, "\t$A, $B, $U6"),
[]>;
+ def _f_rru6 : F32_DOP_RU6<major, mincode, 1, (outs GPR32:$A),
+ (ins GPR32:$B, immU6:$U6),
+ !strconcat(opasm, ".f\t$A, $B, $U6"),
+ []>
+ { let Defs = [STATUS32]; }
+
// 2 register with 32-bit immediate variant.
def _rrlimm : F32_DOP_RLIMM<major, mincode, 0,
- (outs GPR32:$A),
- (ins GPR32:$B, i32imm:$LImm),
- !strconcat(opasm, "\t$A, $B, $LImm"),
- []>;
+ (outs GPR32:$A),
+ (ins GPR32:$B, i32imm:$LImm),
+ !strconcat(opasm, "\t$A, $B, $LImm"),
+ []>;
+ def _f_rrlimm : F32_DOP_RLIMM<major, mincode, 1,
+ (outs GPR32:$A),
+ (ins GPR32:$B, i32imm:$LImm),
+ !strconcat(opasm, ".f\t$A, $B, $LImm"),
+ []>
+ { let Defs = [STATUS32]; }
+
// 2 matched-register with signed 12-bit immediate variant (add r0, r0, -1).
def _rrs12 : F32_DOP_RS12<major, mincode, 0,
(outs GPR32:$B),
@@ -144,6 +162,12 @@ multiclass ArcBinaryInst<bits<5> major, bits<6> mincode,
!strconcat(opasm, "\t$B, $in, $S12"),
[]>
{ let Constraints = "$B = $in"; }
+ def _f_rrs12 : F32_DOP_RS12<major, mincode, 1,
+ (outs GPR32:$B),
+ (ins GPR32:$in, immS<12>:$S12),
+ !strconcat(opasm, ".f\t$B, $in, $S12"),
+ []>
+ { let Constraints = "$B = $in"; let Defs = [STATUS32]; }
}
// Special multivariant GEN4 DOP format instruction that take 2 registers.
@@ -168,6 +192,10 @@ multiclass ArcUnaryInst<bits<5> major, bits<6> subop,
string opasm> {
def _rr : F32_SOP_RR<major, subop, 0, (outs GPR32:$B), (ins GPR32:$C),
!strconcat(opasm, "\t$B, $C"), []>;
+
+ def _f_rr : F32_SOP_RR<major, subop, 1, (outs GPR32:$B), (ins GPR32:$C),
+ !strconcat(opasm, ".f\t$B, $C"), []>
+ { let Defs = [STATUS32]; }
}
@@ -328,11 +356,19 @@ let isBranch = 1, isTerminator = 1 in {
{ let Size = 8; }
} // let isBranch, isTerminator
-// Indirect, unconditional Jump.
-let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in
-def J : F32_DOP_RR<0b00100, 0b100000, 0,
- (outs), (ins GPR32:$C),
- "j\t[$C]", [(brind i32:$C)]>;
+// Unconditional Jump.
+let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
+ // Indirect.
+ let isIndirectBranch = 1 in
+ def J : F32_DOP_RR<0b00100, 0b100000, 0,
+ (outs), (ins GPR32:$C),
+ "j\t[$C]", [(brind i32:$C)]>;
+
+ // Direct.
+ def J_LImm : F32_DOP_RLIMM<0b00100, 0b100000, 0,
+ (outs), (ins i32imm:$LImm),
+ "j\t$LImm", []>;
+}
// Call instructions.
let isCall = 1, isBarrier = 1, Defs = [BLINK], Uses = [SP] in {
@@ -344,6 +380,10 @@ let isCall = 1, isBarrier = 1, Defs = [BLINK], Uses = [SP] in {
let isIndirectBranch = 1 in
def JL : F32_DOP_RR<0b00100, 0b100010, 0, (outs), (ins GPR32:$C),
"jl\t[$C]", [(ARCJumpLink i32:$C)]>;
+
+ // Direct unconditional call.
+ def JL_LImm : F32_DOP_RLIMM<0b00100, 0b100010, 0, (outs), (ins i32imm:$LImm),
+ "jl\t$LImm", []>;
} // let isCall, isBarrier, Defs, Uses
// Pattern to generate BL instruction.
diff --git a/lib/Target/ARC/ARCMCInstLower.cpp b/lib/Target/ARC/ARCMCInstLower.cpp
index 4658388924ec..43b087a57204 100644
--- a/lib/Target/ARC/ARCMCInstLower.cpp
+++ b/lib/Target/ARC/ARCMCInstLower.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains code to lower ARC MachineInstrs to their
+/// This file contains code to lower ARC MachineInstrs to their
/// corresponding MCInst records.
///
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARC/ARCMCInstLower.h b/lib/Target/ARC/ARCMCInstLower.h
index 22e15cdb351e..9a698f26334a 100644
--- a/lib/Target/ARC/ARCMCInstLower.h
+++ b/lib/Target/ARC/ARCMCInstLower.h
@@ -23,7 +23,7 @@ class MachineFunction;
class Mangler;
class AsmPrinter;
-/// \brief This class is used to lower an MachineInstr into an MCInst.
+/// This class is used to lower an MachineInstr into an MCInst.
class LLVM_LIBRARY_VISIBILITY ARCMCInstLower {
using MachineOperandType = MachineOperand::MachineOperandType;
MCContext *Ctx;
diff --git a/lib/Target/ARC/ARCMachineFunctionInfo.h b/lib/Target/ARC/ARCMachineFunctionInfo.h
index bfb3fdef5ebf..95ad294e3668 100644
--- a/lib/Target/ARC/ARCMachineFunctionInfo.h
+++ b/lib/Target/ARC/ARCMachineFunctionInfo.h
@@ -25,16 +25,15 @@ class ARCFunctionInfo : public MachineFunctionInfo {
virtual void anchor();
bool ReturnStackOffsetSet;
int VarArgsFrameIndex;
- unsigned VarArgFrameBytes;
unsigned ReturnStackOffset;
public:
ARCFunctionInfo()
- : ReturnStackOffsetSet(false), VarArgsFrameIndex(0), VarArgFrameBytes(0),
+ : ReturnStackOffsetSet(false), VarArgsFrameIndex(0),
ReturnStackOffset(-1U), MaxCallStackReq(0) {}
explicit ARCFunctionInfo(MachineFunction &MF)
- : ReturnStackOffsetSet(false), VarArgsFrameIndex(0), VarArgFrameBytes(0),
+ : ReturnStackOffsetSet(false), VarArgsFrameIndex(0),
ReturnStackOffset(-1U), MaxCallStackReq(0) {
// Functions are 4-byte (2**2) aligned.
MF.setAlignment(2);
diff --git a/lib/Target/ARC/ARCRegisterInfo.cpp b/lib/Target/ARC/ARCRegisterInfo.cpp
index cb9f89d3499b..38ea3c93a2d4 100644
--- a/lib/Target/ARC/ARCRegisterInfo.cpp
+++ b/lib/Target/ARC/ARCRegisterInfo.cpp
@@ -66,9 +66,9 @@ static void ReplaceFrameIndex(MachineBasicBlock::iterator II,
MBB.getParent()->getSubtarget().getRegisterInfo();
BaseReg = RS->scavengeRegister(&ARC::GPR32RegClass, II, SPAdj);
assert(BaseReg && "Register scavenging failed.");
- DEBUG(dbgs() << "Scavenged register " << printReg(BaseReg, TRI)
- << " for FrameReg=" << printReg(FrameReg, TRI)
- << "+Offset=" << Offset << "\n");
+ LLVM_DEBUG(dbgs() << "Scavenged register " << printReg(BaseReg, TRI)
+ << " for FrameReg=" << printReg(FrameReg, TRI)
+ << "+Offset=" << Offset << "\n");
(void)TRI;
RS->setRegUsed(BaseReg);
}
@@ -88,7 +88,7 @@ static void ReplaceFrameIndex(MachineBasicBlock::iterator II,
assert((Offset % 2 == 0) && "LDH needs 2 byte alignment.");
case ARC::LDB_rs9:
case ARC::LDB_X_rs9:
- DEBUG(dbgs() << "Building LDFI\n");
+ LLVM_DEBUG(dbgs() << "Building LDFI\n");
BuildMI(MBB, II, dl, TII.get(MI.getOpcode()), Reg)
.addReg(BaseReg, KillState)
.addImm(Offset)
@@ -99,7 +99,7 @@ static void ReplaceFrameIndex(MachineBasicBlock::iterator II,
case ARC::STH_rs9:
assert((Offset % 2 == 0) && "STH needs 2 byte alignment.");
case ARC::STB_rs9:
- DEBUG(dbgs() << "Building STFI\n");
+ LLVM_DEBUG(dbgs() << "Building STFI\n");
BuildMI(MBB, II, dl, TII.get(MI.getOpcode()))
.addReg(Reg, getKillRegState(MI.getOperand(0).isKill()))
.addReg(BaseReg, KillState)
@@ -107,7 +107,7 @@ static void ReplaceFrameIndex(MachineBasicBlock::iterator II,
.addMemOperand(*MI.memoperands_begin());
break;
case ARC::GETFI:
- DEBUG(dbgs() << "Building GETFI\n");
+ LLVM_DEBUG(dbgs() << "Building GETFI\n");
BuildMI(MBB, II, dl,
TII.get(isUInt<6>(Offset) ? ARC::ADD_rru6 : ARC::ADD_rrlimm))
.addReg(Reg, RegState::Define)
@@ -175,14 +175,14 @@ void ARCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int StackSize = MF.getFrameInfo().getStackSize();
int LocalFrameSize = MF.getFrameInfo().getLocalFrameSize();
- DEBUG(dbgs() << "\nFunction : " << MF.getName() << "\n");
- DEBUG(dbgs() << "<--------->\n");
- DEBUG(dbgs() << MI << "\n");
- DEBUG(dbgs() << "FrameIndex : " << FrameIndex << "\n");
- DEBUG(dbgs() << "ObjSize : " << ObjSize << "\n");
- DEBUG(dbgs() << "FrameOffset : " << Offset << "\n");
- DEBUG(dbgs() << "StackSize : " << StackSize << "\n");
- DEBUG(dbgs() << "LocalFrameSize : " << LocalFrameSize << "\n");
+ LLVM_DEBUG(dbgs() << "\nFunction : " << MF.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "<--------->\n");
+ LLVM_DEBUG(dbgs() << MI << "\n");
+ LLVM_DEBUG(dbgs() << "FrameIndex : " << FrameIndex << "\n");
+ LLVM_DEBUG(dbgs() << "ObjSize : " << ObjSize << "\n");
+ LLVM_DEBUG(dbgs() << "FrameOffset : " << Offset << "\n");
+ LLVM_DEBUG(dbgs() << "StackSize : " << StackSize << "\n");
+ LLVM_DEBUG(dbgs() << "LocalFrameSize : " << LocalFrameSize << "\n");
(void)LocalFrameSize;
// Special handling of DBG_VALUE instructions.
@@ -200,8 +200,8 @@ void ARCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// ldb needs no alignment,
// ldh needs 2 byte alignment
// ld needs 4 byte alignment
- DEBUG(dbgs() << "Offset : " << Offset << "\n"
- << "<--------->\n");
+ LLVM_DEBUG(dbgs() << "Offset : " << Offset << "\n"
+ << "<--------->\n");
unsigned Reg = MI.getOperand(0).getReg();
assert(ARC::GPR32RegClass.contains(Reg) && "Unexpected register operand");
diff --git a/lib/Target/ARC/CMakeLists.txt b/lib/Target/ARC/CMakeLists.txt
index b862a5e61e07..5a7d9eee5ff4 100644
--- a/lib/Target/ARC/CMakeLists.txt
+++ b/lib/Target/ARC/CMakeLists.txt
@@ -1,12 +1,13 @@
set(LLVM_TARGET_DEFINITIONS ARC.td)
-tablegen(LLVM ARCGenRegisterInfo.inc -gen-register-info)
-tablegen(LLVM ARCGenInstrInfo.inc -gen-instr-info)
-tablegen(LLVM ARCGenDisassemblerTables.inc -gen-disassembler)
tablegen(LLVM ARCGenAsmWriter.inc -gen-asm-writer)
-tablegen(LLVM ARCGenDAGISel.inc -gen-dag-isel)
tablegen(LLVM ARCGenCallingConv.inc -gen-callingconv)
+tablegen(LLVM ARCGenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM ARCGenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM ARCGenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM ARCGenRegisterInfo.inc -gen-register-info)
tablegen(LLVM ARCGenSubtargetInfo.inc -gen-subtarget)
+
add_public_tablegen_target(ARCCommonTableGen)
add_llvm_target(ARCCodeGen
@@ -24,7 +25,7 @@ add_llvm_target(ARCCodeGen
ARCTargetMachine.cpp
)
+add_subdirectory(Disassembler)
add_subdirectory(InstPrinter)
-add_subdirectory(TargetInfo)
add_subdirectory(MCTargetDesc)
-add_subdirectory(Disassembler)
+add_subdirectory(TargetInfo)
diff --git a/lib/Target/ARC/Disassembler/ARCDisassembler.cpp b/lib/Target/ARC/Disassembler/ARCDisassembler.cpp
index dd181767d81a..3fc5a033dd5d 100644
--- a/lib/Target/ARC/Disassembler/ARCDisassembler.cpp
+++ b/lib/Target/ARC/Disassembler/ARCDisassembler.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file is part of the ARC Disassembler.
+/// This file is part of the ARC Disassembler.
///
//===----------------------------------------------------------------------===//
@@ -31,7 +31,7 @@ using DecodeStatus = MCDisassembler::DecodeStatus;
namespace {
-/// \brief A disassembler class for ARC.
+/// A disassembler class for ARC.
class ARCDisassembler : public MCDisassembler {
public:
std::unique_ptr<MCInstrInfo const> const MCII;
@@ -122,7 +122,7 @@ static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder) {
if (RegNo >= 32) {
- DEBUG(dbgs() << "Not a GPR32 register.");
+ LLVM_DEBUG(dbgs() << "Not a GPR32 register.");
return MCDisassembler::Fail;
}
@@ -222,7 +222,7 @@ static DecodeStatus DecodeStLImmInstruction(MCInst &Inst, uint64_t Insn,
unsigned SrcC, DstB, LImm;
DstB = decodeBField(Insn);
if (DstB != 62) {
- DEBUG(dbgs() << "Decoding StLImm found non-limm register.");
+ LLVM_DEBUG(dbgs() << "Decoding StLImm found non-limm register.");
return MCDisassembler::Fail;
}
SrcC = decodeCField(Insn);
@@ -237,10 +237,10 @@ static DecodeStatus DecodeLdLImmInstruction(MCInst &Inst, uint64_t Insn,
uint64_t Address,
const void *Decoder) {
unsigned DstA, SrcB, LImm;
- DEBUG(dbgs() << "Decoding LdLImm:\n");
+ LLVM_DEBUG(dbgs() << "Decoding LdLImm:\n");
SrcB = decodeBField(Insn);
if (SrcB != 62) {
- DEBUG(dbgs() << "Decoding LdLImm found non-limm register.");
+ LLVM_DEBUG(dbgs() << "Decoding LdLImm found non-limm register.");
return MCDisassembler::Fail;
}
DstA = decodeAField(Insn);
@@ -255,13 +255,13 @@ static DecodeStatus DecodeLdRLImmInstruction(MCInst &Inst, uint64_t Insn,
uint64_t Address,
const void *Decoder) {
unsigned DstA, SrcB;
- DEBUG(dbgs() << "Decoding LdRLimm\n");
+ LLVM_DEBUG(dbgs() << "Decoding LdRLimm\n");
DstA = decodeAField(Insn);
DecodeGPR32RegisterClass(Inst, DstA, Address, Decoder);
SrcB = decodeBField(Insn);
DecodeGPR32RegisterClass(Inst, SrcB, Address, Decoder);
if (decodeCField(Insn) != 62) {
- DEBUG(dbgs() << "Decoding LdRLimm found non-limm register.");
+ LLVM_DEBUG(dbgs() << "Decoding LdRLimm found non-limm register.");
return MCDisassembler::Fail;
}
Inst.addOperand(MCOperand::createImm((uint32_t)(Insn >> 32)));
@@ -271,7 +271,7 @@ static DecodeStatus DecodeLdRLImmInstruction(MCInst &Inst, uint64_t Insn,
static DecodeStatus DecodeMoveHRegInstruction(MCInst &Inst, uint64_t Insn,
uint64_t Address,
const void *Decoder) {
- DEBUG(dbgs() << "Decoding MOV_S h-register\n");
+ LLVM_DEBUG(dbgs() << "Decoding MOV_S h-register\n");
using Field = decltype(Insn);
Field h = fieldFromInstruction(Insn, 5, 3) |
(fieldFromInstruction(Insn, 0, 2) << 3);
@@ -322,10 +322,10 @@ DecodeStatus ARCDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
Result =
decodeInstruction(DecoderTable64, Instr, Insn64, Address, this, STI);
if (Success == Result) {
- DEBUG(dbgs() << "Successfully decoded 64-bit instruction.");
+ LLVM_DEBUG(dbgs() << "Successfully decoded 64-bit instruction.");
return Result;
}
- DEBUG(dbgs() << "Not a 64-bit instruction, falling back to 32-bit.");
+ LLVM_DEBUG(dbgs() << "Not a 64-bit instruction, falling back to 32-bit.");
}
uint32_t Insn32;
if (!readInstruction32(Bytes, Address, Size, Insn32)) {
@@ -342,10 +342,12 @@ DecodeStatus ARCDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
Result =
decodeInstruction(DecoderTable48, Instr, Insn48, Address, this, STI);
if (Success == Result) {
- DEBUG(dbgs() << "Successfully decoded 16-bit instruction with limm.");
+ LLVM_DEBUG(
+ dbgs() << "Successfully decoded 16-bit instruction with limm.");
return Result;
}
- DEBUG(dbgs() << "Not a 16-bit instruction with limm, try without it.");
+ LLVM_DEBUG(
+ dbgs() << "Not a 16-bit instruction with limm, try without it.");
}
uint32_t Insn16;
diff --git a/lib/Target/ARC/InstPrinter/ARCInstPrinter.cpp b/lib/Target/ARC/InstPrinter/ARCInstPrinter.cpp
index 4760ac4456d0..0c627d04698b 100644
--- a/lib/Target/ARC/InstPrinter/ARCInstPrinter.cpp
+++ b/lib/Target/ARC/InstPrinter/ARCInstPrinter.cpp
@@ -43,9 +43,8 @@ static const char *ARCBRCondCodeToString(ARCCC::BRCondCode BRCC) {
return "lo";
case ARCCC::BRHS:
return "hs";
- default:
- llvm_unreachable("Unhandled ARCCC::BRCondCode");
}
+ llvm_unreachable("Unhandled ARCCC::BRCondCode");
}
static const char *ARCCondCodeToString(ARCCC::CondCode CC) {
@@ -66,6 +65,10 @@ static const char *ARCCondCodeToString(ARCCC::CondCode CC) {
return "gt";
case ARCCC::GE:
return "ge";
+ case ARCCC::VS:
+ return "vs";
+ case ARCCC::VC:
+ return "vc";
case ARCCC::LT:
return "lt";
case ARCCC::LE:
diff --git a/lib/Target/ARC/InstPrinter/ARCInstPrinter.h b/lib/Target/ARC/InstPrinter/ARCInstPrinter.h
index e26c08104e23..bb3898a67cef 100644
--- a/lib/Target/ARC/InstPrinter/ARCInstPrinter.h
+++ b/lib/Target/ARC/InstPrinter/ARCInstPrinter.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains the declaration of the ARCInstPrinter class,
+/// This file contains the declaration of the ARCInstPrinter class,
/// which is used to print ARC MCInst to a .s file.
///
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARC/MCTargetDesc/ARCInfo.h b/lib/Target/ARC/MCTargetDesc/ARCInfo.h
index b9ed99885702..401b4c5e6613 100644
--- a/lib/Target/ARC/MCTargetDesc/ARCInfo.h
+++ b/lib/Target/ARC/MCTargetDesc/ARCInfo.h
@@ -30,6 +30,8 @@ enum CondCode {
N = 0x4,
LO = 0x5,
HS = 0x6,
+ VS = 0x7,
+ VC = 0x8,
GT = 0x9,
GE = 0xa,
LT = 0xb,
diff --git a/lib/Target/ARM/A15SDOptimizer.cpp b/lib/Target/ARM/A15SDOptimizer.cpp
index 16d5f74d19e3..be88fe4ddb14 100644
--- a/lib/Target/ARM/A15SDOptimizer.cpp
+++ b/lib/Target/ARM/A15SDOptimizer.cpp
@@ -180,7 +180,7 @@ void A15SDOptimizer::eraseInstrWithNoUses(MachineInstr *MI) {
SmallVector<MachineInstr *, 8> Front;
DeadInstr.insert(MI);
- DEBUG(dbgs() << "Deleting base instruction " << *MI << "\n");
+ LLVM_DEBUG(dbgs() << "Deleting base instruction " << *MI << "\n");
Front.push_back(MI);
while (Front.size() != 0) {
@@ -232,7 +232,7 @@ void A15SDOptimizer::eraseInstrWithNoUses(MachineInstr *MI) {
if (!IsDead) continue;
- DEBUG(dbgs() << "Deleting instruction " << *Def << "\n");
+ LLVM_DEBUG(dbgs() << "Deleting instruction " << *Def << "\n");
DeadInstr.insert(Def);
}
}
@@ -264,7 +264,7 @@ unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) {
// Is it a subreg copy of ssub_0?
if (EC && EC->isCopy() &&
EC->getOperand(1).getSubReg() == ARM::ssub_0) {
- DEBUG(dbgs() << "Found a subreg copy: " << *SPRMI);
+ LLVM_DEBUG(dbgs() << "Found a subreg copy: " << *SPRMI);
// Find the thing we're subreg copying out of - is it of the same
// regclass as DPRMI? (i.e. a DPR or QPR).
@@ -272,8 +272,8 @@ unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) {
const TargetRegisterClass *TRC =
MRI->getRegClass(MI->getOperand(1).getReg());
if (TRC->hasSuperClassEq(MRI->getRegClass(FullReg))) {
- DEBUG(dbgs() << "Subreg copy is compatible - returning ");
- DEBUG(dbgs() << printReg(FullReg) << "\n");
+ LLVM_DEBUG(dbgs() << "Subreg copy is compatible - returning ");
+ LLVM_DEBUG(dbgs() << printReg(FullReg) << "\n");
eraseInstrWithNoUses(MI);
return FullReg;
}
@@ -387,7 +387,7 @@ void A15SDOptimizer::elideCopiesAndPHIs(MachineInstr *MI,
continue;
Front.push_back(NewMI);
} else {
- DEBUG(dbgs() << "Found partial copy" << *MI <<"\n");
+ LLVM_DEBUG(dbgs() << "Found partial copy" << *MI << "\n");
Outs.push_back(MI);
}
}
@@ -642,9 +642,8 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) {
// to find.
MRI->constrainRegClass(NewReg, MRI->getRegClass((*I)->getReg()));
- DEBUG(dbgs() << "Replacing operand "
- << **I << " with "
- << printReg(NewReg) << "\n");
+ LLVM_DEBUG(dbgs() << "Replacing operand " << **I << " with "
+ << printReg(NewReg) << "\n");
(*I)->substVirtReg(NewReg, 0, *TRI);
}
}
@@ -661,14 +660,15 @@ bool A15SDOptimizer::runOnMachineFunction(MachineFunction &Fn) {
const ARMSubtarget &STI = Fn.getSubtarget<ARMSubtarget>();
// Since the A15SDOptimizer pass can insert VDUP instructions, it can only be
// enabled when NEON is available.
- if (!(STI.isCortexA15() && STI.hasNEON()))
+ if (!(STI.useSplatVFPToNeon() && STI.hasNEON()))
return false;
+
TII = STI.getInstrInfo();
TRI = STI.getRegisterInfo();
MRI = &Fn.getRegInfo();
bool Modified = false;
- DEBUG(dbgs() << "Running on function " << Fn.getName()<< "\n");
+ LLVM_DEBUG(dbgs() << "Running on function " << Fn.getName() << "\n");
DeadInstr.clear();
Replacements.clear();
diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h
index 9ffb4c2055f9..b5cc45c5cc94 100644
--- a/lib/Target/ARM/ARM.h
+++ b/lib/Target/ARM/ARM.h
@@ -15,6 +15,7 @@
#ifndef LLVM_LIB_TARGET_ARM_ARM_H
#define LLVM_LIB_TARGET_ARM_ARM_H
+#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Support/CodeGen.h"
#include <functional>
#include <vector>
@@ -35,11 +36,14 @@ class MachineInstr;
class MCInst;
class PassRegistry;
+
+Pass *createARMParallelDSPPass();
FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM,
CodeGenOpt::Level OptLevel);
FunctionPass *createA15SDOptimizerPass();
FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false);
FunctionPass *createARMExpandPseudoPass();
+FunctionPass *createARMCodeGenPreparePass();
FunctionPass *createARMConstantIslandPass();
FunctionPass *createMLxExpansionPass();
FunctionPass *createThumb2ITBlockPass();
@@ -57,8 +61,11 @@ void computeBlockSize(MachineFunction *MF, MachineBasicBlock *MBB,
BasicBlockInfo &BBI);
std::vector<BasicBlockInfo> computeAllBlockSizes(MachineFunction *MF);
+
+void initializeARMParallelDSPPass(PassRegistry &);
void initializeARMLoadStoreOptPass(PassRegistry &);
void initializeARMPreAllocLoadStoreOptPass(PassRegistry &);
+void initializeARMCodeGenPreparePass(PassRegistry &);
void initializeARMConstantIslandsPass(PassRegistry &);
void initializeARMExpandPseudoPass(PassRegistry &);
void initializeThumb2SizeReducePass(PassRegistry &);
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index c9766aa2161a..2e62a0790418 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -109,10 +109,16 @@ def Feature8MSecExt : SubtargetFeature<"8msecext", "Has8MSecExt", "true",
"Enable support for ARMv8-M "
"Security Extensions">;
+def FeatureSHA2 : SubtargetFeature<"sha2", "HasSHA2", "true",
+ "Enable SHA1 and SHA256 support", [FeatureNEON]>;
+
+def FeatureAES : SubtargetFeature<"aes", "HasAES", "true",
+ "Enable AES support", [FeatureNEON]>;
+
def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
"Enable support for "
"Cryptography extensions",
- [FeatureNEON]>;
+ [FeatureNEON, FeatureSHA2, FeatureAES]>;
def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
"Enable support for CRC instructions">;
@@ -135,6 +141,10 @@ def FeatureFPAO : SubtargetFeature<"fpao", "HasFPAO", "true",
def FeatureFuseAES : SubtargetFeature<"fuse-aes", "HasFuseAES", "true",
"CPU fuses AES crypto operations">;
+// Fast execution of bottom and top halves of literal generation
+def FeatureFuseLiterals : SubtargetFeature<"fuse-literals", "HasFuseLiterals", "true",
+ "CPU fuses literal generation operations">;
+
// The way of reading thread pointer
def FeatureReadTp : SubtargetFeature<"read-tp-hard", "ReadTPHard", "true",
"Reading thread pointer from register">;
@@ -189,6 +199,13 @@ def FeatureDontWidenVMOVS : SubtargetFeature<"dont-widen-vmovs",
"DontWidenVMOVS", "true",
"Don't widen VMOVS to VMOVD">;
+// Some targets (e.g. Cortex-A15) prefer to avoid mixing operations on different
+// VFP register widths.
+def FeatureSplatVFPToNeon : SubtargetFeature<"splat-vfp-neon",
+ "SplatVFPToNeon", "true",
+ "Splat register from VFP to NEON",
+ [FeatureDontWidenVMOVS]>;
+
// Whether or not it is profitable to expand VFP/NEON MLA/MLS instructions.
def FeatureExpandMLx : SubtargetFeature<"expand-fp-mlx",
"ExpandMLx", "true",
@@ -330,6 +347,10 @@ def FeatureNoPostRASched : SubtargetFeature<"disable-postra-scheduler",
"DisablePostRAScheduler", "true",
"Don't schedule again after register allocation">;
+// Enable use of alias analysis during code generation
+def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
+ "Use alias analysis during codegen">;
+
//===----------------------------------------------------------------------===//
// ARM architecture class
//
@@ -415,6 +436,10 @@ def HasV8_3aOps : SubtargetFeature<"v8.3a", "HasV8_3aOps", "true",
"Support ARM v8.3a instructions",
[HasV8_2aOps]>;
+def HasV8_4aOps : SubtargetFeature<"v8.4a", "HasV8_4aOps", "true",
+ "Support ARM v8.4a instructions",
+ [HasV8_3aOps, FeatureDotProd]>;
+
//===----------------------------------------------------------------------===//
// ARM Processor subtarget features.
//
@@ -507,7 +532,8 @@ def ARMv5te : Architecture<"armv5te", "ARMv5te", [HasV5TEOps]>;
def ARMv5tej : Architecture<"armv5tej", "ARMv5tej", [HasV5TEOps]>;
-def ARMv6 : Architecture<"armv6", "ARMv6", [HasV6Ops]>;
+def ARMv6 : Architecture<"armv6", "ARMv6", [HasV6Ops,
+ FeatureDSP]>;
def ARMv6t2 : Architecture<"armv6t2", "ARMv6t2", [HasV6T2Ops,
FeatureDSP]>;
@@ -521,13 +547,15 @@ def ARMv6m : Architecture<"armv6-m", "ARMv6m", [HasV6MOps,
FeatureNoARM,
ModeThumb,
FeatureDB,
- FeatureMClass]>;
+ FeatureMClass,
+ FeatureStrictAlign]>;
def ARMv6sm : Architecture<"armv6s-m", "ARMv6sm", [HasV6MOps,
FeatureNoARM,
ModeThumb,
FeatureDB,
- FeatureMClass]>;
+ FeatureMClass,
+ FeatureStrictAlign]>;
def ARMv7a : Architecture<"armv7-a", "ARMv7a", [HasV7Ops,
FeatureNEON,
@@ -617,6 +645,20 @@ def ARMv83a : Architecture<"armv8.3-a", "ARMv83a", [HasV8_3aOps,
FeatureCRC,
FeatureRAS]>;
+def ARMv84a : Architecture<"armv8.4-a", "ARMv84a", [HasV8_4aOps,
+ FeatureAClass,
+ FeatureDB,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeatureDSP,
+ FeatureTrustZone,
+ FeatureMP,
+ FeatureVirtualization,
+ FeatureCrypto,
+ FeatureCRC,
+ FeatureRAS,
+ FeatureDotProd]>;
+
def ARMv8r : Architecture<"armv8-r", "ARMv8r", [HasV8Ops,
FeatureRClass,
FeatureDB,
@@ -637,7 +679,8 @@ def ARMv8mBaseline : Architecture<"armv8-m.base", "ARMv8mBaseline",
FeatureV7Clrex,
Feature8MSecExt,
FeatureAcquireRelease,
- FeatureMClass]>;
+ FeatureMClass,
+ FeatureStrictAlign]>;
def ARMv8mMainline : Architecture<"armv8-m.main", "ARMv8mMainline",
[HasV8MMainlineOps,
@@ -787,6 +830,7 @@ def : ProcessorModel<"cortex-a12", CortexA9Model, [ARMv7a, ProcA12,
def : ProcessorModel<"cortex-a15", CortexA9Model, [ARMv7a, ProcA15,
FeatureDontWidenVMOVS,
+ FeatureSplatVFPToNeon,
FeatureHasRetAddrStack,
FeatureMuxedUnits,
FeatureTrustZone,
@@ -991,6 +1035,12 @@ def : ProcNoItin<"exynos-m3", [ARMv8a, ProcExynosM1,
FeatureCrypto,
FeatureCRC]>;
+def : ProcNoItin<"exynos-m4", [ARMv8a, ProcExynosM1,
+ FeatureHWDivThumb,
+ FeatureHWDivARM,
+ FeatureCrypto,
+ FeatureCRC]>;
+
def : ProcNoItin<"kryo", [ARMv8a, ProcKryo,
FeatureHWDivThumb,
FeatureHWDivARM,
@@ -998,7 +1048,9 @@ def : ProcNoItin<"kryo", [ARMv8a, ProcKryo,
FeatureCRC]>;
def : ProcessorModel<"cortex-r52", CortexR52Model, [ARMv8r, ProcR52,
- FeatureFPAO]>;
+ FeatureUseMISched,
+ FeatureFPAO,
+ FeatureUseAA]>;
//===----------------------------------------------------------------------===//
// Register File Description
@@ -1042,4 +1094,5 @@ def ARM : Target {
let AssemblyWriters = [ARMAsmWriter];
let AssemblyParsers = [ARMAsmParser];
let AssemblyParserVariants = [ARMAsmParserVariant];
+ let AllowRegisterRenaming = 1;
}
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index d3d79fe975bb..2196f9b47f3b 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -235,6 +235,15 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
}
}
+MCSymbol *ARMAsmPrinter::GetCPISymbol(unsigned CPID) const {
+ // The AsmPrinter::GetCPISymbol superclass method tries to use CPID as an
+ // index into MachineConstantPool, which isn't in sync with the indexes used here.
+ const DataLayout &DL = getDataLayout();
+ return OutContext.getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) +
+ "CPI" + Twine(getFunctionNumber()) + "_" +
+ Twine(CPID));
+}
+
//===--------------------------------------------------------------------===//
MCSymbol *ARMAsmPrinter::
@@ -545,29 +554,6 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
}
- if (TT.isOSBinFormatCOFF()) {
- const auto &TLOF =
- static_cast<const TargetLoweringObjectFileCOFF &>(getObjFileLowering());
-
- std::string Flags;
- raw_string_ostream OS(Flags);
-
- for (const auto &Function : M)
- TLOF.emitLinkerFlagsForGlobal(OS, &Function);
- for (const auto &Global : M.globals())
- TLOF.emitLinkerFlagsForGlobal(OS, &Global);
- for (const auto &Alias : M.aliases())
- TLOF.emitLinkerFlagsForGlobal(OS, &Alias);
-
- OS.flush();
-
- // Output collected flags
- if (!Flags.empty()) {
- OutStreamer->SwitchSection(TLOF.getDrectveSection());
- OutStreamer->EmitBytes(Flags);
- }
- }
-
// The last attribute to be emitted is ABI_optimization_goals
MCTargetStreamer &TS = *OutStreamer->getTargetStreamer();
ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
@@ -1086,6 +1072,8 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
unsigned StartOp = 2 + 2;
// Use all the operands.
unsigned NumOffset = 0;
+ // Amount of SP adjustment folded into a push.
+ unsigned Pad = 0;
switch (Opc) {
default:
@@ -1107,6 +1095,16 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
// temporary to workaround PR11902.
if (MO.isImplicit())
continue;
+ // Registers pushed as a part of folding an SP update into the
+ // push instruction are marked as undef and should not be
+ // restored when unwinding, because the function can modify the
+ // corresponding stack slots.
+ if (MO.isUndef()) {
+ assert(RegList.empty() &&
+ "Pad registers must come before restored ones");
+ Pad += 4;
+ continue;
+ }
RegList.push_back(MO.getReg());
}
break;
@@ -1118,8 +1116,12 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
RegList.push_back(SrcReg);
break;
}
- if (MAI->getExceptionHandlingType() == ExceptionHandling::ARM)
+ if (MAI->getExceptionHandlingType() == ExceptionHandling::ARM) {
ATS.emitRegSave(RegList, Opc == ARM::VSTMDDB_UPD);
+ // Account for the SP adjustment, folded into the push.
+ if (Pad)
+ ATS.emitPad(Pad);
+ }
} else {
// Changes of stack / frame pointer.
if (SrcReg == ARM::SP) {
diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h
index 7b811b18f74a..0ba4bc05d6f7 100644
--- a/lib/Target/ARM/ARMAsmPrinter.h
+++ b/lib/Target/ARM/ARMAsmPrinter.h
@@ -101,7 +101,9 @@ public:
void EmitEndOfAsmFile(Module &M) override;
void EmitXXStructor(const DataLayout &DL, const Constant *CV) override;
void EmitGlobalVariable(const GlobalVariable *GV) override;
-
+
+ MCSymbol *GetCPISymbol(unsigned CPID) const override;
+
// lowerOperand - Convert a MachineOperand into the equivalent MCOperand.
bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp);
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 8c1727724a9e..b1c2031c7d7b 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -331,7 +331,7 @@ bool ARMBaseInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
bool CantAnalyze = false;
// Skip over DEBUG values and predicated nonterminators.
- while (I->isDebugValue() || !I->isTerminator()) {
+ while (I->isDebugInstr() || !I->isTerminator()) {
if (I == MBB.begin())
return false;
--I;
@@ -935,6 +935,25 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
Mov->addRegisterKilled(SrcReg, TRI);
}
+bool ARMBaseInstrInfo::isCopyInstr(const MachineInstr &MI,
+ const MachineOperand *&Src,
+ const MachineOperand *&Dest) const {
+ // VMOVRRD is also a copy instruction, but it requires a special way of
+ // handling, so we do not consider this more complex form of copy here. To
+ // recognize such instructions, the isExtractSubregLike MI interface
+ // function could be used.
+ // VORRq is considered a move only if its two inputs are the same register.
+ if (!MI.isMoveReg() ||
+ (MI.getOpcode() == ARM::VORRq &&
+ MI.getOperand(1).getReg() != MI.getOperand(2).getReg()))
+ return false;
+ Dest = &MI.getOperand(0);
+ Src = &MI.getOperand(1);
+ return true;
+}
+
const MachineInstrBuilder &
ARMBaseInstrInfo::AddDReg(MachineInstrBuilder &MIB, unsigned Reg,
unsigned SubIdx, unsigned State,
@@ -963,6 +982,17 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MFI.getObjectSize(FI), Align);
switch (TRI->getSpillSize(*RC)) {
+ case 2:
+ if (ARM::HPRRegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, I, DL, get(ARM::VSTRH))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
+ } else
+ llvm_unreachable("Unknown reg class!");
+ break;
case 4:
if (ARM::GPRRegClass.hasSubClassEq(RC)) {
BuildMI(MBB, I, DL, get(ARM::STRi12))
@@ -1161,6 +1191,16 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MFI.getObjectSize(FI), Align);
switch (TRI->getSpillSize(*RC)) {
+ case 2:
+ if (ARM::HPRRegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, I, DL, get(ARM::VLDRH), DestReg)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
+ } else
+ llvm_unreachable("Unknown reg class!");
+ break;
case 4:
if (ARM::GPRRegClass.hasSubClassEq(RC)) {
BuildMI(MBB, I, DL, get(ARM::LDRi12), DestReg)
@@ -1168,7 +1208,6 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
.addImm(0)
.addMemOperand(MMO)
.add(predOps(ARMCC::AL));
-
} else if (ARM::SPRRegClass.hasSubClassEq(RC)) {
BuildMI(MBB, I, DL, get(ARM::VLDRS), DestReg)
.addFrameIndex(FI)
@@ -1321,7 +1360,13 @@ unsigned ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
}
break;
case ARM::VLD1q64:
+ case ARM::VLD1d8TPseudo:
+ case ARM::VLD1d16TPseudo:
+ case ARM::VLD1d32TPseudo:
case ARM::VLD1d64TPseudo:
+ case ARM::VLD1d8QPseudo:
+ case ARM::VLD1d16QPseudo:
+ case ARM::VLD1d32QPseudo:
case ARM::VLD1d64QPseudo:
if (MI.getOperand(1).isFI() && MI.getOperand(0).getSubReg() == 0) {
FrameIndex = MI.getOperand(1).getIndex();
@@ -1345,7 +1390,7 @@ unsigned ARMBaseInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
return MI.mayLoad() && hasLoadFromStackSlot(MI, Dummy, FrameIndex);
}
-/// \brief Expands MEMCPY to either LDMIA/STMIA or LDMIA_UPD/STMID_UPD
+/// Expands MEMCPY to either LDMIA/STMIA or LDMIA_UPD/STMID_UPD
/// depending on whether the result is used.
void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
bool isThumb1 = Subtarget.isThumb1Only();
@@ -1358,7 +1403,6 @@ void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
MachineInstrBuilder LDM, STM;
if (isThumb1 || !MI->getOperand(1).isDead()) {
MachineOperand LDWb(MI->getOperand(1));
- LDWb.setIsRenamable(false);
LDM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2LDMIA_UPD
: isThumb1 ? ARM::tLDMIA_UPD
: ARM::LDMIA_UPD))
@@ -1369,7 +1413,6 @@ void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
if (isThumb1 || !MI->getOperand(0).isDead()) {
MachineOperand STWb(MI->getOperand(0));
- STWb.setIsRenamable(false);
STM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2STMIA_UPD
: isThumb1 ? ARM::tSTMIA_UPD
: ARM::STMIA_UPD))
@@ -1379,11 +1422,9 @@ void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
}
MachineOperand LDBase(MI->getOperand(3));
- LDBase.setIsRenamable(false);
LDM.add(LDBase).add(predOps(ARMCC::AL));
MachineOperand STBase(MI->getOperand(2));
- STBase.setIsRenamable(false);
STM.add(STBase).add(predOps(ARMCC::AL));
// Sort the scratch registers into ascending order.
@@ -1391,12 +1432,12 @@ void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
SmallVector<unsigned, 6> ScratchRegs;
for(unsigned I = 5; I < MI->getNumOperands(); ++I)
ScratchRegs.push_back(MI->getOperand(I).getReg());
- std::sort(ScratchRegs.begin(), ScratchRegs.end(),
- [&TRI](const unsigned &Reg1,
- const unsigned &Reg2) -> bool {
- return TRI.getEncodingValue(Reg1) <
- TRI.getEncodingValue(Reg2);
- });
+ llvm::sort(ScratchRegs.begin(), ScratchRegs.end(),
+ [&TRI](const unsigned &Reg1,
+ const unsigned &Reg2) -> bool {
+ return TRI.getEncodingValue(Reg1) <
+ TRI.getEncodingValue(Reg2);
+ });
for (const auto &Reg : ScratchRegs) {
LDM.addReg(Reg, RegState::Define);
@@ -1453,7 +1494,7 @@ bool ARMBaseInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return false;
// All clear, widen the COPY.
- DEBUG(dbgs() << "widening: " << MI);
+ LLVM_DEBUG(dbgs() << "widening: " << MI);
MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
// Get rid of the old implicit-def of DstRegD. Leave it if it defines a Q-reg
@@ -1482,7 +1523,7 @@ bool ARMBaseInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.addRegisterKilled(SrcRegS, TRI, true);
}
- DEBUG(dbgs() << "replaced by: " << MI);
+ LLVM_DEBUG(dbgs() << "replaced by: " << MI);
return true;
}
@@ -1659,7 +1700,7 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr &MI0,
}
for (unsigned i = 3, e = MI0.getNumOperands(); i != e; ++i) {
- // %12 = PICLDR %11, 0, pred:14, pred:%noreg
+ // %12 = PICLDR %11, 0, 14, %noreg
const MachineOperand &MO0 = MI0.getOperand(i);
const MachineOperand &MO1 = MI1.getOperand(i);
if (!MO0.isIdenticalTo(MO1))
@@ -1799,7 +1840,7 @@ bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
// considered a scheduling hazard, which is wrong. It should be the actual
// instruction preceding the dbg_value instruction(s), just like it is
// when debug info is not present.
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
return false;
// Terminators and labels can't be scheduled around.
@@ -1813,8 +1854,8 @@ bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
// to the t2IT instruction. The added compile time and complexity does not
// seem worth it.
MachineBasicBlock::const_iterator I = MI;
- // Make sure to skip any dbg_value instructions
- while (++I != MBB->end() && I->isDebugValue())
+ // Make sure to skip any debug instructions
+ while (++I != MBB->end() && I->isDebugInstr())
;
if (I != MBB->end() && I->getOpcode() == ARM::t2IT)
return true;
@@ -2277,9 +2318,9 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget,
--CurRegEnc) {
unsigned CurReg = RegClass->getRegister(CurRegEnc);
if (!IsPop) {
- // Pushing any register is completely harmless, mark the
- // register involved as undef since we don't care about it in
- // the slightest.
+ // Pushing any register is completely harmless; mark the register involved
+ // as undef since we don't care about its value and must not restore it
+ // during stack unwinding.
RegList.push_back(MachineOperand::CreateReg(CurReg, false, false,
false, false, true));
--RegsNeeded;
@@ -2409,6 +2450,14 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
NumBits = 8;
Scale = 4;
break;
+ case ARMII::AddrMode5FP16:
+ ImmIdx = FrameRegIdx+1;
+ InstrOffs = ARM_AM::getAM5Offset(MI.getOperand(ImmIdx).getImm());
+ if (ARM_AM::getAM5Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+ InstrOffs *= -1;
+ NumBits = 8;
+ Scale = 2;
+ break;
default:
llvm_unreachable("Unsupported addressing mode!");
}
@@ -2534,14 +2583,28 @@ inline static ARMCC::CondCodes getSwappedCondition(ARMCC::CondCodes CC) {
}
}
+/// getCmpToAddCondition - assume the flags are set by CMP(a,b), return
+/// the condition code if we modify the instructions such that flags are
+/// set by ADD(a,b,X).
+inline static ARMCC::CondCodes getCmpToAddCondition(ARMCC::CondCodes CC) {
+ switch (CC) {
+ default: return ARMCC::AL;
+ case ARMCC::HS: return ARMCC::LO;
+ case ARMCC::LO: return ARMCC::HS;
+ case ARMCC::VS: return ARMCC::VS;
+ case ARMCC::VC: return ARMCC::VC;
+ }
+}
+
/// isRedundantFlagInstr - check whether the first instruction, whose only
/// purpose is to update flags, can be made redundant.
/// CMPrr can be made redundant by SUBrr if the operands are the same.
/// CMPri can be made redundant by SUBri if the operands are the same.
+/// CMPrr(r0, r1) can be made redundant by ADDr[ri](r0, r1, X).
/// This function can be extended later on.
-inline static bool isRedundantFlagInstr(MachineInstr *CmpI, unsigned SrcReg,
- unsigned SrcReg2, int ImmValue,
- MachineInstr *OI) {
+inline static bool isRedundantFlagInstr(const MachineInstr *CmpI,
+ unsigned SrcReg, unsigned SrcReg2,
+ int ImmValue, const MachineInstr *OI) {
if ((CmpI->getOpcode() == ARM::CMPrr ||
CmpI->getOpcode() == ARM::t2CMPrr) &&
(OI->getOpcode() == ARM::SUBrr ||
@@ -2559,6 +2622,14 @@ inline static bool isRedundantFlagInstr(MachineInstr *CmpI, unsigned SrcReg,
OI->getOperand(1).getReg() == SrcReg &&
OI->getOperand(2).getImm() == ImmValue)
return true;
+
+ if ((CmpI->getOpcode() == ARM::CMPrr || CmpI->getOpcode() == ARM::t2CMPrr) &&
+ (OI->getOpcode() == ARM::ADDrr || OI->getOpcode() == ARM::t2ADDrr ||
+ OI->getOpcode() == ARM::ADDri || OI->getOpcode() == ARM::t2ADDri) &&
+ OI->getOperand(0).isReg() && OI->getOperand(1).isReg() &&
+ OI->getOperand(0).getReg() == SrcReg &&
+ OI->getOperand(1).getReg() == SrcReg2)
+ return true;
return false;
}
@@ -2661,17 +2732,18 @@ bool ARMBaseInstrInfo::optimizeCompareInstr(
if (I == B) return false;
// There are two possible candidates which can be changed to set CPSR:
- // One is MI, the other is a SUB instruction.
- // For CMPrr(r1,r2), we are looking for SUB(r1,r2) or SUB(r2,r1).
+ // One is MI, the other is a SUB or ADD instruction.
+ // For CMPrr(r1,r2), we are looking for SUB(r1,r2), SUB(r2,r1), or
+ // ADDr[ri](r1, r2, X).
// For CMPri(r1, CmpValue), we are looking for SUBri(r1, CmpValue).
- MachineInstr *Sub = nullptr;
+ MachineInstr *SubAdd = nullptr;
if (SrcReg2 != 0)
// MI is not a candidate for CMPrr.
MI = nullptr;
else if (MI->getParent() != CmpInstr.getParent() || CmpValue != 0) {
// Conservatively refuse to convert an instruction which isn't in the same
// BB as the comparison.
- // For CMPri w/ CmpValue != 0, a Sub may still be a candidate.
+ // For CMPri w/ CmpValue != 0, a SubAdd may still be a candidate.
// Thus we cannot return here.
if (CmpInstr.getOpcode() == ARM::CMPri ||
CmpInstr.getOpcode() == ARM::t2CMPri)
@@ -2716,11 +2788,20 @@ bool ARMBaseInstrInfo::optimizeCompareInstr(
}
// Check that CPSR isn't set between the comparison instruction and the one we
- // want to change. At the same time, search for Sub.
+ // want to change. At the same time, search for SubAdd.
const TargetRegisterInfo *TRI = &getRegisterInfo();
- --I;
- for (; I != E; --I) {
- const MachineInstr &Instr = *I;
+ do {
+ const MachineInstr &Instr = *--I;
+
+ // Check whether CmpInstr can be made redundant by the current instruction.
+ if (isRedundantFlagInstr(&CmpInstr, SrcReg, SrcReg2, CmpValue, &Instr)) {
+ SubAdd = &*I;
+ break;
+ }
+
+ // Allow E (which was initially MI) to be SubAdd but do not search before E.
+ if (I == E)
+ break;
if (Instr.modifiesRegister(ARM::CPSR, TRI) ||
Instr.readsRegister(ARM::CPSR, TRI))
@@ -2728,23 +2809,14 @@ bool ARMBaseInstrInfo::optimizeCompareInstr(
// change. We can't do this transformation.
return false;
- // Check whether CmpInstr can be made redundant by the current instruction.
- if (isRedundantFlagInstr(&CmpInstr, SrcReg, SrcReg2, CmpValue, &*I)) {
- Sub = &*I;
- break;
- }
-
- if (I == B)
- // The 'and' is below the comparison instruction.
- return false;
- }
+ } while (I != B);
// Return false if no candidates exist.
- if (!MI && !Sub)
+ if (!MI && !SubAdd)
return false;
// The single candidate is called MI.
- if (!MI) MI = Sub;
+ if (!MI) MI = SubAdd;
// We can't use a predicated instruction - it doesn't always write the flags.
if (isPredicated(*MI))
@@ -2802,25 +2874,31 @@ bool ARMBaseInstrInfo::optimizeCompareInstr(
break;
}
- if (Sub) {
- ARMCC::CondCodes NewCC = getSwappedCondition(CC);
- if (NewCC == ARMCC::AL)
- return false;
+ if (SubAdd) {
// If we have SUB(r1, r2) and CMP(r2, r1), the condition code based
// on CMP needs to be updated to be based on SUB.
+ // If we have ADD(r1, r2, X) and CMP(r1, r2), the condition code also
+ // needs to be modified.
// Push the condition code operands to OperandsToUpdate.
// If it is safe to remove CmpInstr, the condition code of these
// operands will be modified.
- if (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
- Sub->getOperand(2).getReg() == SrcReg) {
+ unsigned Opc = SubAdd->getOpcode();
+ bool IsSub = Opc == ARM::SUBrr || Opc == ARM::t2SUBrr ||
+ Opc == ARM::SUBri || Opc == ARM::t2SUBri;
+ if (!IsSub || (SrcReg2 != 0 && SubAdd->getOperand(1).getReg() == SrcReg2 &&
+ SubAdd->getOperand(2).getReg() == SrcReg)) {
// VSel doesn't support condition code update.
if (IsInstrVSel)
return false;
+ // Ensure we can swap the condition.
+ ARMCC::CondCodes NewCC = (IsSub ? getSwappedCondition(CC) : getCmpToAddCondition(CC));
+ if (NewCC == ARMCC::AL)
+ return false;
OperandsToUpdate.push_back(
std::make_pair(&((*I).getOperand(IO - 1)), NewCC));
}
} else {
- // No Sub, so this is x = <op> y, z; cmp x, 0.
+ // No SubAdd, so this is x = <op> y, z; cmp x, 0.
switch (CC) {
case ARMCC::EQ: // Z
case ARMCC::NE: // Z
@@ -2874,6 +2952,23 @@ bool ARMBaseInstrInfo::optimizeCompareInstr(
return true;
}
+bool ARMBaseInstrInfo::shouldSink(const MachineInstr &MI) const {
+ // Do not sink MI if it might be used to optimize a redundant compare.
+ // We heuristically only look at the instruction immediately following MI to
+ // avoid potentially searching the entire basic block.
+ if (isPredicated(MI))
+ return true;
+ MachineBasicBlock::const_iterator Next = &MI;
+ ++Next;
+ unsigned SrcReg, SrcReg2;
+ int CmpMask, CmpValue;
+ if (Next != MI.getParent()->end() &&
+ analyzeCompare(*Next, SrcReg, SrcReg2, CmpMask, CmpValue) &&
+ isRedundantFlagInstr(&*Next, SrcReg, SrcReg2, CmpValue, &MI))
+ return false;
+ return true;
+}
+
bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
unsigned Reg,
MachineRegisterInfo *MRI) const {
@@ -3467,8 +3562,8 @@ bool ARMBaseInstrInfo::isLDMBaseRegInList(const MachineInstr &MI) const {
}
unsigned
ARMBaseInstrInfo::getLDMVariableDefsSize(const MachineInstr &MI) const {
- // ins GPR:$Rn, pred:$p (2xOp), reglist:$regs, variable_ops
- // (outs GPR:$wb), (ins GPR:$Rn, pred:$p (2xOp), reglist:$regs, variable_ops)
+ // ins GPR:$Rn, $p (2xOp), reglist:$regs, variable_ops
+ // (outs GPR:$wb), (ins GPR:$Rn, $p (2xOp), reglist:$regs, variable_ops)
return MI.getNumOperands() + 1 - MI.getDesc().getNumOperands();
}
@@ -4142,8 +4237,12 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
case ARM::VLD3d8Pseudo:
case ARM::VLD3d16Pseudo:
case ARM::VLD3d32Pseudo:
+ case ARM::VLD1d8TPseudo:
+ case ARM::VLD1d16TPseudo:
+ case ARM::VLD1d32TPseudo:
case ARM::VLD1d64TPseudo:
case ARM::VLD1d64TPseudoWB_fixed:
+ case ARM::VLD1d64TPseudoWB_register:
case ARM::VLD3d8Pseudo_UPD:
case ARM::VLD3d16Pseudo_UPD:
case ARM::VLD3d32Pseudo_UPD:
@@ -4159,8 +4258,28 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
case ARM::VLD4d8Pseudo:
case ARM::VLD4d16Pseudo:
case ARM::VLD4d32Pseudo:
+ case ARM::VLD1d8QPseudo:
+ case ARM::VLD1d16QPseudo:
+ case ARM::VLD1d32QPseudo:
case ARM::VLD1d64QPseudo:
case ARM::VLD1d64QPseudoWB_fixed:
+ case ARM::VLD1d64QPseudoWB_register:
+ case ARM::VLD1q8HighQPseudo:
+ case ARM::VLD1q8LowQPseudo_UPD:
+ case ARM::VLD1q8HighTPseudo:
+ case ARM::VLD1q8LowTPseudo_UPD:
+ case ARM::VLD1q16HighQPseudo:
+ case ARM::VLD1q16LowQPseudo_UPD:
+ case ARM::VLD1q16HighTPseudo:
+ case ARM::VLD1q16LowTPseudo_UPD:
+ case ARM::VLD1q32HighQPseudo:
+ case ARM::VLD1q32LowQPseudo_UPD:
+ case ARM::VLD1q32HighTPseudo:
+ case ARM::VLD1q32LowTPseudo_UPD:
+ case ARM::VLD1q64HighQPseudo:
+ case ARM::VLD1q64LowQPseudo_UPD:
+ case ARM::VLD1q64HighTPseudo:
+ case ARM::VLD1q64LowTPseudo_UPD:
case ARM::VLD4d8Pseudo_UPD:
case ARM::VLD4d16Pseudo_UPD:
case ARM::VLD4d32Pseudo_UPD:
@@ -4191,12 +4310,30 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
case ARM::VLD2DUPd8wb_register:
case ARM::VLD2DUPd16wb_register:
case ARM::VLD2DUPd32wb_register:
+ case ARM::VLD2DUPq8EvenPseudo:
+ case ARM::VLD2DUPq8OddPseudo:
+ case ARM::VLD2DUPq16EvenPseudo:
+ case ARM::VLD2DUPq16OddPseudo:
+ case ARM::VLD2DUPq32EvenPseudo:
+ case ARM::VLD2DUPq32OddPseudo:
+ case ARM::VLD3DUPq8EvenPseudo:
+ case ARM::VLD3DUPq8OddPseudo:
+ case ARM::VLD3DUPq16EvenPseudo:
+ case ARM::VLD3DUPq16OddPseudo:
+ case ARM::VLD3DUPq32EvenPseudo:
+ case ARM::VLD3DUPq32OddPseudo:
case ARM::VLD4DUPd8Pseudo:
case ARM::VLD4DUPd16Pseudo:
case ARM::VLD4DUPd32Pseudo:
case ARM::VLD4DUPd8Pseudo_UPD:
case ARM::VLD4DUPd16Pseudo_UPD:
case ARM::VLD4DUPd32Pseudo_UPD:
+ case ARM::VLD4DUPq8EvenPseudo:
+ case ARM::VLD4DUPq8OddPseudo:
+ case ARM::VLD4DUPq16EvenPseudo:
+ case ARM::VLD4DUPq16OddPseudo:
+ case ARM::VLD4DUPq32EvenPseudo:
+ case ARM::VLD4DUPq32OddPseudo:
case ARM::VLD1LNq8Pseudo:
case ARM::VLD1LNq16Pseudo:
case ARM::VLD1LNq32Pseudo:
@@ -4864,12 +5001,14 @@ bool ARMBaseInstrInfo::getRegSequenceLikeInputs(
// Populate the InputRegs accordingly.
// rY
const MachineOperand *MOReg = &MI.getOperand(1);
- InputRegs.push_back(
- RegSubRegPairAndIdx(MOReg->getReg(), MOReg->getSubReg(), ARM::ssub_0));
+ if (!MOReg->isUndef())
+ InputRegs.push_back(RegSubRegPairAndIdx(MOReg->getReg(),
+ MOReg->getSubReg(), ARM::ssub_0));
// rZ
MOReg = &MI.getOperand(2);
- InputRegs.push_back(
- RegSubRegPairAndIdx(MOReg->getReg(), MOReg->getSubReg(), ARM::ssub_1));
+ if (!MOReg->isUndef())
+ InputRegs.push_back(RegSubRegPairAndIdx(MOReg->getReg(),
+ MOReg->getSubReg(), ARM::ssub_1));
return true;
}
llvm_unreachable("Target dependent opcode missing");
@@ -4888,6 +5027,8 @@ bool ARMBaseInstrInfo::getExtractSubregLikeInputs(
// rX = EXTRACT_SUBREG dZ, ssub_0
// rY = EXTRACT_SUBREG dZ, ssub_1
const MachineOperand &MOReg = MI.getOperand(2);
+ if (MOReg.isUndef())
+ return false;
InputReg.Reg = MOReg.getReg();
InputReg.SubReg = MOReg.getSubReg();
InputReg.SubIdx = DefIdx == 0 ? ARM::ssub_0 : ARM::ssub_1;
@@ -4907,6 +5048,8 @@ bool ARMBaseInstrInfo::getInsertSubregLikeInputs(
// dX = VSETLNi32 dY, rZ, imm
const MachineOperand &MOBaseReg = MI.getOperand(1);
const MachineOperand &MOInsertedReg = MI.getOperand(2);
+ if (MOInsertedReg.isUndef())
+ return false;
const MachineOperand &MOIndex = MI.getOperand(3);
BaseReg.Reg = MOBaseReg.getReg();
BaseReg.SubReg = MOBaseReg.getSubReg();
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index d375f40d6e14..b54be15097b1 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -201,6 +201,9 @@ public:
const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
+ bool isCopyInstr(const MachineInstr &MI, const MachineOperand *&Src,
+ const MachineOperand *&Dest) const override;
+
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
unsigned SrcReg, bool isKill, int FrameIndex,
@@ -215,6 +218,8 @@ public:
bool expandPostRAPseudo(MachineInstr &MI) const override;
+ bool shouldSink(const MachineInstr &MI) const override;
+
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
unsigned DestReg, unsigned SubIdx,
const MachineInstr &Orig,
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index 4b9a4376adf8..43e8b7d66c62 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -838,10 +838,10 @@ bool ARMBaseRegisterInfo::shouldCoalesce(MachineInstr *MI,
auto AFI = MF->getInfo<ARMFunctionInfo>();
auto It = AFI->getCoalescedWeight(MBB);
- DEBUG(dbgs() << "\tARM::shouldCoalesce - Coalesced Weight: "
- << It->second << "\n");
- DEBUG(dbgs() << "\tARM::shouldCoalesce - Reg Weight: "
- << NewRCWeight.RegWeight << "\n");
+ LLVM_DEBUG(dbgs() << "\tARM::shouldCoalesce - Coalesced Weight: "
+ << It->second << "\n");
+ LLVM_DEBUG(dbgs() << "\tARM::shouldCoalesce - Reg Weight: "
+ << NewRCWeight.RegWeight << "\n");
// This number is the largest round number that meets the criteria:
// (1) addresses PR18825
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h
index 5801e6bdbd0e..f755f66a0f3a 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -154,6 +154,7 @@ public:
void updateRegAllocHint(unsigned Reg, unsigned NewReg,
MachineFunction &MF) const override;
+ bool enableMultipleCopyHints() const override { return true; }
bool hasBasePointer(const MachineFunction &MF) const;
@@ -200,7 +201,7 @@ public:
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS = nullptr) const override;
- /// \brief SrcRC and DstRC will be morphed into NewRC if this returns true
+ /// SrcRC and DstRC will be morphed into NewRC if this returns true
bool shouldCoalesce(MachineInstr *MI,
const TargetRegisterClass *SrcRC,
unsigned SubReg,
diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp
index eab4b3b13f31..47f998b696f5 100644
--- a/lib/Target/ARM/ARMCallLowering.cpp
+++ b/lib/Target/ARM/ARMCallLowering.cpp
@@ -31,7 +31,6 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
@@ -43,6 +42,7 @@
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/LowLevelTypeImpl.h"
+#include "llvm/Support/MachineValueType.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -469,7 +469,12 @@ bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
if (!MBB.empty())
MIRBuilder.setInstr(*MBB.begin());
- return handleAssignments(MIRBuilder, ArgInfos, ArgHandler);
+ if (!handleAssignments(MIRBuilder, ArgInfos, ArgHandler))
+ return false;
+
+ // Move back to the end of the basic block.
+ MIRBuilder.setMBB(MBB);
+ return true;
}
namespace {
@@ -521,7 +526,7 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
if (CalleeReg && !TRI->isPhysicalRegister(CalleeReg))
MIB->getOperand(0).setReg(constrainOperandRegClass(
MF, *TRI, MRI, *STI.getInstrInfo(), *STI.getRegBankInfo(),
- *MIB.getInstr(), MIB->getDesc(), CalleeReg, 0));
+ *MIB.getInstr(), MIB->getDesc(), Callee, 0));
}
SmallVector<ArgInfo, 8> ArgInfos;
diff --git a/lib/Target/ARM/ARMCallingConv.h b/lib/Target/ARM/ARMCallingConv.h
index 284b67fd59b6..63bf48abb7ac 100644
--- a/lib/Target/ARM/ARMCallingConv.h
+++ b/lib/Target/ARM/ARMCallingConv.h
@@ -217,12 +217,15 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT,
break;
}
+ case MVT::f16:
case MVT::f32:
RegList = SRegList;
break;
+ case MVT::v4f16:
case MVT::f64:
RegList = DRegList;
break;
+ case MVT::v8f16:
case MVT::v2f64:
RegList = QRegList;
break;
diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td
index dcfd6518a840..f173e423f3e4 100644
--- a/lib/Target/ARM/ARMCallingConv.td
+++ b/lib/Target/ARM/ARMCallingConv.td
@@ -160,8 +160,8 @@ def CC_ARM_AAPCS : CallingConv<[
CCIfNest<CCAssignToReg<[R12]>>,
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
// Pass SwiftSelf in a callee saved register.
CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
@@ -176,8 +176,8 @@ def CC_ARM_AAPCS : CallingConv<[
def RetCC_ARM_AAPCS : CallingConv<[
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
// Pass SwiftSelf in a callee saved register.
CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
@@ -187,6 +187,7 @@ def RetCC_ARM_AAPCS : CallingConv<[
CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
CCIfType<[f32], CCBitConvertToType<i32>>,
+
CCDelegateTo<RetCC_ARM_AAPCS_Common>
]>;
@@ -200,8 +201,8 @@ def CC_ARM_AAPCS_VFP : CallingConv<[
CCIfByVal<CCPassByVal<4, 4>>,
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
// Pass SwiftSelf in a callee saved register.
CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
@@ -221,8 +222,8 @@ def CC_ARM_AAPCS_VFP : CallingConv<[
def RetCC_ARM_AAPCS_VFP : CallingConv<[
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
// Pass SwiftSelf in a callee saved register.
CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
@@ -233,7 +234,7 @@ def RetCC_ARM_AAPCS_VFP : CallingConv<[
CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
- S9, S10, S11, S12, S13, S14, S15]>>,
+ S9, S10, S11, S12, S13, S14, S15]>>,
CCDelegateTo<RetCC_ARM_AAPCS_Common>
]>;
diff --git a/lib/Target/ARM/ARMCodeGenPrepare.cpp b/lib/Target/ARM/ARMCodeGenPrepare.cpp
new file mode 100644
index 000000000000..24071277427a
--- /dev/null
+++ b/lib/Target/ARM/ARMCodeGenPrepare.cpp
@@ -0,0 +1,750 @@
+//===----- ARMCodeGenPrepare.cpp ------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass inserts intrinsics to handle small types that would otherwise be
+/// promoted during legalization. Here we can manually promote types or insert
+/// intrinsics which can handle narrow types that aren't supported by the
+/// register classes.
+//
+//===----------------------------------------------------------------------===//
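+
+// An illustrative sketch (not taken from an actual test) of the kind of
+// rewrite this pass aims for: a chain of narrow, unsigned operations such as
+//
+//   %t = add i8 %a, %b
+//   %c = icmp ult i8 %t, %limit
+//
+// is recomputed on i32 values, with zexts inserted at the leaves and truncs
+// at any store/return/call roots, or replaced with the ARM parallel DSP
+// intrinsics when the narrow wrapping behaviour has to be preserved.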
+
+#include "ARM.h"
+#include "ARMSubtarget.h"
+#include "ARMTargetMachine.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+
+#define DEBUG_TYPE "arm-codegenprepare"
+
+using namespace llvm;
+
+static cl::opt<bool>
+DisableCGP("arm-disable-cgp", cl::Hidden, cl::init(true),
+ cl::desc("Disable ARM specific CodeGenPrepare pass"));
+
+static cl::opt<bool>
+EnableDSP("arm-enable-scalar-dsp", cl::Hidden, cl::init(false),
+ cl::desc("Use DSP instructions for scalar operations"));
+
+static cl::opt<bool>
+EnableDSPWithImms("arm-enable-scalar-dsp-imms", cl::Hidden, cl::init(false),
+ cl::desc("Use DSP instructions for scalar operations\
+ with immediate operands"));
+
+namespace {
+
+class IRPromoter {
+ SmallPtrSet<Value*, 8> NewInsts;
+ SmallVector<Instruction*, 4> InstsToRemove;
+ Module *M = nullptr;
+ LLVMContext &Ctx;
+
+public:
+ IRPromoter(Module *M) : M(M), Ctx(M->getContext()) { }
+
+ void Cleanup() {
+ for (auto *I : InstsToRemove) {
+ LLVM_DEBUG(dbgs() << "ARM CGP: Removing " << *I << "\n");
+ I->dropAllReferences();
+ I->eraseFromParent();
+ }
+ InstsToRemove.clear();
+ NewInsts.clear();
+ }
+
+ void Mutate(Type *OrigTy,
+ SmallPtrSetImpl<Value*> &Visited,
+ SmallPtrSetImpl<Value*> &Leaves,
+ SmallPtrSetImpl<Instruction*> &Roots);
+};
+
+class ARMCodeGenPrepare : public FunctionPass {
+ const ARMSubtarget *ST = nullptr;
+ IRPromoter *Promoter = nullptr;
+ std::set<Value*> AllVisited;
+ Type *OrigTy = nullptr;
+ unsigned TypeSize = 0;
+
+ bool isNarrowInstSupported(Instruction *I);
+ bool isSupportedValue(Value *V);
+ bool isLegalToPromote(Value *V);
+ bool TryToPromote(Value *V);
+
+public:
+ static char ID;
+
+ ARMCodeGenPrepare() : FunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+ }
+
+ StringRef getPassName() const override { return "ARM IR optimizations"; }
+
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+ bool doFinalization(Module &M) override;
+};
+
+}
+
+/// Return true if the given value can generate sign bits.
+static bool isSigned(Value *V) {
+ if (!isa<Instruction>(V))
+ return false;
+
+ unsigned Opc = cast<Instruction>(V)->getOpcode();
+ return Opc == Instruction::AShr || Opc == Instruction::SDiv ||
+ Opc == Instruction::SRem;
+}
+
+/// Some instructions can use 8- and 16-bit operands, and we don't need to
+/// promote anything larger. We disallow booleans to make life easier when
+/// dealing with icmps but allow any other integer that is <= 16 bits. Void
+/// types are accepted so we can handle switches.
+static bool isSupportedType(Value *V) {
+ if (V->getType()->isVoidTy())
+ return true;
+
+ const IntegerType *IntTy = dyn_cast<IntegerType>(V->getType());
+ if (!IntTy)
+ return false;
+
+ // Don't try to promote boolean values.
+ if (IntTy->getBitWidth() == 1)
+ return false;
+
+ if (auto *ZExt = dyn_cast<ZExtInst>(V))
+ return isSupportedType(ZExt->getOperand(0));
+
+ return IntTy->getBitWidth() <= 16;
+}
+
+/// Return true if V will require any promoted values to be truncated for the
+/// use to be valid.
+static bool isSink(Value *V) {
+ auto UsesNarrowValue = [](Value *V) {
+ return V->getType()->getScalarSizeInBits() <= 32;
+ };
+
+ if (auto *Store = dyn_cast<StoreInst>(V))
+ return UsesNarrowValue(Store->getValueOperand());
+ if (auto *Return = dyn_cast<ReturnInst>(V))
+ return UsesNarrowValue(Return->getReturnValue());
+
+ return isa<CallInst>(V);
+}
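+
+// For example (hypothetical IR): "store i8 %v, i8* %p", "ret i8 %v" and
+// passing %v to a call are all sinks, because a value that has been promoted
+// to i32 must be truncated back to its original width before reaching them.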
+
+/// Return true if the given value is a leaf that will need to be zext'd.
+static bool isSource(Value *V) {
+ if (isa<Argument>(V) && isSupportedType(V))
+ return true;
+ else if (isa<TruncInst>(V))
+ return true;
+ else if (auto *ZExt = dyn_cast<ZExtInst>(V))
+ // ZExt can be a leaf if it's the only user of a load.
+ return isa<LoadInst>(ZExt->getOperand(0)) &&
+ ZExt->getOperand(0)->hasOneUse();
+ else if (auto *Call = dyn_cast<CallInst>(V))
+ return Call->hasRetAttr(Attribute::AttrKind::ZExt);
+ else if (auto *Load = dyn_cast<LoadInst>(V)) {
+ if (!isa<IntegerType>(Load->getType()))
+ return false;
+ // A load is a leaf, unless it's already just being zext'd.
+ if (Load->hasOneUse() && isa<ZExtInst>(*Load->use_begin()))
+ return false;
+
+ return true;
+ }
+ return false;
+}
+
+/// Return whether the instruction can be promoted without any modifications to
+/// its operands or result.
+static bool isSafeOverflow(Instruction *I) {
+ if (isa<OverflowingBinaryOperator>(I) && I->hasNoUnsignedWrap())
+ return true;
+
+ unsigned Opc = I->getOpcode();
+ if (Opc == Instruction::Add || Opc == Instruction::Sub) {
+ // We don't care if the add or sub could wrap if the value is decreasing
+ // and is only being used by an unsigned compare.
+ if (!I->hasOneUse() ||
+ !isa<ICmpInst>(*I->user_begin()) ||
+ !isa<ConstantInt>(I->getOperand(1)))
+ return false;
+
+ auto *CI = cast<ICmpInst>(*I->user_begin());
+ if (CI->isSigned())
+ return false;
+
+ bool NegImm = cast<ConstantInt>(I->getOperand(1))->isNegative();
+ bool IsDecreasing = ((Opc == Instruction::Sub) && !NegImm) ||
+ ((Opc == Instruction::Add) && NegImm);
+ if (!IsDecreasing)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "ARM CGP: Allowing safe overflow for " << *I << "\n");
+ return true;
+ }
+
+ // Otherwise, if an instruction is using a negative immediate we will need
+ // to fix it up during the promotion.
+ for (auto &Op : I->operands()) {
+ if (auto *Const = dyn_cast<ConstantInt>(Op))
+ if (Const->isNegative())
+ return false;
+ }
+ return false;
+}
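+
+// A sketch of the pattern treated as safe above (hypothetical IR):
+//
+//   %dec = sub i16 %x, 1
+//   %cmp = icmp ult i16 %dec, %n
+//
+// The value only decreases and is only consumed by an unsigned compare, so
+// any wrap the sub could produce is tolerated after promotion.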
+
+static bool shouldPromote(Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+
+ if (!isa<IntegerType>(V->getType()))
+ return false;
+
+ if (isa<StoreInst>(I) || isa<TerminatorInst>(I) || isa<TruncInst>(I) ||
+ isa<ICmpInst>(I))
+ return false;
+
+ if (auto *ZExt = dyn_cast<ZExtInst>(I))
+ return !ZExt->getDestTy()->isIntegerTy(32);
+
+ return true;
+}
+
+/// Return whether we can safely mutate V's type to ExtTy without having to be
+/// concerned with zero extending or truncation.
+static bool isPromotedResultSafe(Value *V) {
+ if (!isa<Instruction>(V))
+ return true;
+
+ if (isSigned(V))
+ return false;
+
+ // If I is only being used by something that will require its value to be
+ // truncated, then we don't care about the promoted result.
+ auto *I = cast<Instruction>(V);
+ if (I->hasOneUse() && isSink(*I->use_begin()))
+ return true;
+
+ if (isa<OverflowingBinaryOperator>(I))
+ return isSafeOverflow(I);
+ return true;
+}
+
+/// Return the intrinsic for the instruction that can perform the same
+/// operation but on a narrow type. This uses the parallel DSP intrinsics
+/// on scalar values.
+static Intrinsic::ID getNarrowIntrinsic(Instruction *I, unsigned TypeSize) {
+ // Whether we use the signed or unsigned versions of these intrinsics
+ // doesn't matter because we're not using the GE bits that they set in
+ // the APSR.
+ switch(I->getOpcode()) {
+ default:
+ break;
+ case Instruction::Add:
+ return TypeSize == 16 ? Intrinsic::arm_uadd16 :
+ Intrinsic::arm_uadd8;
+ case Instruction::Sub:
+ return TypeSize == 16 ? Intrinsic::arm_usub16 :
+ Intrinsic::arm_usub8;
+ }
+ llvm_unreachable("unhandled opcode for narrow intrinsic");
+}
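+
+// For instance (a hypothetical example), a 16-bit add whose result might wrap
+// is not mutated to i32 directly; it is replaced with a call such as
+//
+//   %r = call i32 @llvm.arm.uadd16(i32 %a, i32 %b)
+//
+// which performs independent 16-bit additions on each halfword, preserving
+// the original narrow wrapping semantics in the low halfword.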
+
+void IRPromoter::Mutate(Type *OrigTy,
+ SmallPtrSetImpl<Value*> &Visited,
+ SmallPtrSetImpl<Value*> &Leaves,
+ SmallPtrSetImpl<Instruction*> &Roots) {
+ IRBuilder<> Builder{Ctx};
+ Type *ExtTy = Type::getInt32Ty(M->getContext());
+ unsigned TypeSize = OrigTy->getPrimitiveSizeInBits();
+ SmallPtrSet<Value*, 8> Promoted;
+ LLVM_DEBUG(dbgs() << "ARM CGP: Promoting use-def chains to from " << TypeSize
+ << " to 32-bits\n");
+
+ auto ReplaceAllUsersOfWith = [&](Value *From, Value *To) {
+ SmallVector<Instruction*, 4> Users;
+ Instruction *InstTo = dyn_cast<Instruction>(To);
+ for (Use &U : From->uses()) {
+ auto *User = cast<Instruction>(U.getUser());
+ if (InstTo && User->isIdenticalTo(InstTo))
+ continue;
+ Users.push_back(User);
+ }
+
+ for (auto &U : Users)
+ U->replaceUsesOfWith(From, To);
+ };
+
+ auto FixConst = [&](ConstantInt *Const, Instruction *I) {
+ Constant *NewConst = nullptr;
+ if (isSafeOverflow(I)) {
+ NewConst = (Const->isNegative()) ?
+ ConstantExpr::getSExt(Const, ExtTy) :
+ ConstantExpr::getZExt(Const, ExtTy);
+ } else {
+ uint64_t NewVal = *Const->getValue().getRawData();
+ if (Const->getType() == Type::getInt16Ty(Ctx))
+ NewVal &= 0xFFFF;
+ else
+ NewVal &= 0xFF;
+ NewConst = ConstantInt::get(ExtTy, NewVal);
+ }
+ I->replaceUsesOfWith(Const, NewConst);
+ };
+
+ auto InsertDSPIntrinsic = [&](Instruction *I) {
+ LLVM_DEBUG(dbgs() << "ARM CGP: Inserting DSP intrinsic for "
+ << *I << "\n");
+ Function *DSPInst =
+ Intrinsic::getDeclaration(M, getNarrowIntrinsic(I, TypeSize));
+ Builder.SetInsertPoint(I);
+ Builder.SetCurrentDebugLocation(I->getDebugLoc());
+ Value *Args[] = { I->getOperand(0), I->getOperand(1) };
+ CallInst *Call = Builder.CreateCall(DSPInst, Args);
+ ReplaceAllUsersOfWith(I, Call);
+ InstsToRemove.push_back(I);
+ NewInsts.insert(Call);
+ };
+
+ auto InsertZExt = [&](Value *V, Instruction *InsertPt) {
+ LLVM_DEBUG(dbgs() << "ARM CGP: Inserting ZExt for " << *V << "\n");
+ Builder.SetInsertPoint(InsertPt);
+ if (auto *I = dyn_cast<Instruction>(V))
+ Builder.SetCurrentDebugLocation(I->getDebugLoc());
+ auto *ZExt = cast<Instruction>(Builder.CreateZExt(V, ExtTy));
+ if (isa<Argument>(V))
+ ZExt->moveBefore(InsertPt);
+ else
+ ZExt->moveAfter(InsertPt);
+ ReplaceAllUsersOfWith(V, ZExt);
+ NewInsts.insert(ZExt);
+ };
+
+ // First, insert extending instructions between the leaves and their users.
+ LLVM_DEBUG(dbgs() << "ARM CGP: Promoting leaves:\n");
+ for (auto V : Leaves) {
+ LLVM_DEBUG(dbgs() << " - " << *V << "\n");
+ if (auto *ZExt = dyn_cast<ZExtInst>(V))
+ ZExt->mutateType(ExtTy);
+ else if (auto *I = dyn_cast<Instruction>(V))
+ InsertZExt(I, I);
+ else if (auto *Arg = dyn_cast<Argument>(V)) {
+ BasicBlock &BB = Arg->getParent()->front();
+ InsertZExt(Arg, &*BB.getFirstInsertionPt());
+ } else {
+ llvm_unreachable("unhandled leaf that needs extending");
+ }
+ Promoted.insert(V);
+ }
+
+ LLVM_DEBUG(dbgs() << "ARM CGP: Mutating the tree..\n");
+ // Then mutate the types of the instructions within the tree. Here we handle
+ // constant operands.
+ for (auto *V : Visited) {
+ if (Leaves.count(V))
+ continue;
+
+ if (!isa<Instruction>(V))
+ continue;
+
+ auto *I = cast<Instruction>(V);
+ if (Roots.count(I))
+ continue;
+
+ for (auto &U : I->operands()) {
+ if ((U->getType() == ExtTy) || !isSupportedType(&*U))
+ continue;
+
+ if (auto *Const = dyn_cast<ConstantInt>(&*U))
+ FixConst(Const, I);
+ else if (isa<UndefValue>(&*U))
+ U->mutateType(ExtTy);
+ }
+
+ if (shouldPromote(I)) {
+ I->mutateType(ExtTy);
+ Promoted.insert(I);
+ }
+ }
+
+ // Now we need to remove any zexts that have become unnecessary, as well
+ // as insert any intrinsics.
+ for (auto *V : Visited) {
+ if (Leaves.count(V))
+ continue;
+ if (auto *ZExt = dyn_cast<ZExtInst>(V)) {
+ if (ZExt->getDestTy() != ExtTy) {
+ ZExt->mutateType(ExtTy);
+ Promoted.insert(ZExt);
+ }
+ else if (ZExt->getSrcTy() == ExtTy) {
+ ReplaceAllUsersOfWith(V, ZExt->getOperand(0));
+ InstsToRemove.push_back(ZExt);
+ }
+ continue;
+ }
+
+ if (!shouldPromote(V) || isPromotedResultSafe(V))
+ continue;
+
+ // Replace unsafe instructions with appropriate intrinsic calls.
+ InsertDSPIntrinsic(cast<Instruction>(V));
+ }
+
+ LLVM_DEBUG(dbgs() << "ARM CGP: Fixing up the roots:\n");
+ // Fix up any stores or returns that use the results of the promoted
+ // chain.
+ for (auto I : Roots) {
+ LLVM_DEBUG(dbgs() << " - " << *I << "\n");
+ Type *TruncTy = OrigTy;
+ if (auto *Store = dyn_cast<StoreInst>(I)) {
+ auto *PtrTy = cast<PointerType>(Store->getPointerOperandType());
+ TruncTy = PtrTy->getElementType();
+ } else if (isa<ReturnInst>(I)) {
+ Function *F = I->getParent()->getParent();
+ TruncTy = F->getFunctionType()->getReturnType();
+ }
+
+ for (unsigned i = 0; i < I->getNumOperands(); ++i) {
+ Value *V = I->getOperand(i);
+ if (Promoted.count(V) || NewInsts.count(V)) {
+ if (auto *Op = dyn_cast<Instruction>(V)) {
+
+ if (auto *Call = dyn_cast<CallInst>(I))
+ TruncTy = Call->getFunctionType()->getParamType(i);
+
+ if (TruncTy == ExtTy)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "ARM CGP: Creating " << *TruncTy
+ << " Trunc for " << *Op << "\n");
+ Builder.SetInsertPoint(Op);
+ auto *Trunc = cast<Instruction>(Builder.CreateTrunc(Op, TruncTy));
+ Trunc->moveBefore(I);
+ I->setOperand(i, Trunc);
+ NewInsts.insert(Trunc);
+ }
+ }
+ }
+ }
+ LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete.\n");
+}
+
+bool ARMCodeGenPrepare::isNarrowInstSupported(Instruction *I) {
+ if (!ST->hasDSP() || !EnableDSP || !isSupportedType(I))
+ return false;
+
+ if (ST->isThumb() && !ST->hasThumb2())
+ return false;
+
+ if (I->getOpcode() != Instruction::Add && I->getOpcode() != Instruction::Sub)
+ return false;
+
+ // TODO
+ // Would it be profitable? For Thumb code, these parallel DSP instructions
+ // are only Thumb-2, so we wouldn't be able to dual issue on Cortex-M33. For
+ // Cortex-A, specifically Cortex-A72, the latency is double and throughput is
+ // halved. They also do not take immediates as operands.
+ for (auto &Op : I->operands()) {
+ if (isa<Constant>(Op)) {
+ if (!EnableDSPWithImms)
+ return false;
+ }
+ }
+ return true;
+}
+
+/// We accept most instructions, as well as Arguments and ConstantInts. We
+/// disallow casts other than zext and trunc, and we only allow calls if their
+/// return value is zeroext. We don't allow opcodes that can introduce sign
+/// bits.
+bool ARMCodeGenPrepare::isSupportedValue(Value *V) {
+ LLVM_DEBUG(dbgs() << "ARM CGP: Is " << *V << " supported?\n");
+
+ // Non-instruction values that we can handle.
+ if (isa<ConstantInt>(V) || isa<Argument>(V))
+ return true;
+
+ // Memory instructions
+ if (isa<StoreInst>(V) || isa<LoadInst>(V) || isa<GetElementPtrInst>(V))
+ return true;
+
+ // Branches and targets.
+ if (auto *ICmp = dyn_cast<ICmpInst>(V))
+ return ICmp->isEquality() || !ICmp->isSigned();
+
+ if (isa<BranchInst>(V) || isa<SwitchInst>(V) || isa<BasicBlock>(V))
+ return true;
+
+ if (isa<PHINode>(V) || isa<SelectInst>(V) || isa<ReturnInst>(V))
+ return true;
+
+ // Special cases for calls as we need to check for zeroext
+ // TODO We should accept calls even if they don't have zeroext, as they can
+ // still be roots.
+ if (auto *Call = dyn_cast<CallInst>(V))
+ return Call->hasRetAttr(Attribute::AttrKind::ZExt);
+ else if (auto *Cast = dyn_cast<CastInst>(V)) {
+ if (isa<ZExtInst>(Cast))
+ return Cast->getDestTy()->getScalarSizeInBits() <= 32;
+ else if (auto *Trunc = dyn_cast<TruncInst>(V))
+ return Trunc->getDestTy()->getScalarSizeInBits() <= TypeSize;
+ else {
+ LLVM_DEBUG(dbgs() << "ARM CGP: No, unsupported cast.\n");
+ return false;
+ }
+ } else if (!isa<BinaryOperator>(V)) {
+ LLVM_DEBUG(dbgs() << "ARM CGP: No, not a binary operator.\n");
+ return false;
+ }
+
+ bool res = !isSigned(V);
+ if (!res)
+ LLVM_DEBUG(dbgs() << "ARM CGP: No, it's a signed instruction.\n");
+ return res;
+}
+
+/// Check that the type of V would be promoted and that the original type is
+/// smaller than the targeted promoted type. Check that we're not trying to
+/// promote something larger than our base 'TypeSize' type.
+bool ARMCodeGenPrepare::isLegalToPromote(Value *V) {
+ if (!isSupportedType(V))
+ return false;
+
+ unsigned VSize = 0;
+ if (auto *Ld = dyn_cast<LoadInst>(V)) {
+ auto *PtrTy = cast<PointerType>(Ld->getPointerOperandType());
+ VSize = PtrTy->getElementType()->getPrimitiveSizeInBits();
+ } else if (auto *ZExt = dyn_cast<ZExtInst>(V)) {
+ VSize = ZExt->getOperand(0)->getType()->getPrimitiveSizeInBits();
+ } else {
+ VSize = V->getType()->getPrimitiveSizeInBits();
+ }
+
+ if (VSize > TypeSize)
+ return false;
+
+ if (isPromotedResultSafe(V))
+ return true;
+
+ if (auto *I = dyn_cast<Instruction>(V))
+ return isNarrowInstSupported(I);
+
+ return false;
+}
+
+bool ARMCodeGenPrepare::TryToPromote(Value *V) {
+ OrigTy = V->getType();
+ TypeSize = OrigTy->getPrimitiveSizeInBits();
+
+ if (!isSupportedValue(V) || !shouldPromote(V) || !isLegalToPromote(V))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "ARM CGP: TryToPromote: " << *V << "\n");
+
+ SetVector<Value*> WorkList;
+ SmallPtrSet<Value*, 8> Leaves;
+ SmallPtrSet<Instruction*, 4> Roots;
+ WorkList.insert(V);
+ SmallPtrSet<Value*, 16> CurrentVisited;
+ CurrentVisited.clear();
+
+ // Return true if the given value can be, or has already been, visited.
+ // Add V to the worklist if needed.
+ auto AddLegalInst = [&](Value *V) {
+ if (CurrentVisited.count(V))
+ return true;
+
+ if (!isSupportedValue(V) || (shouldPromote(V) && !isLegalToPromote(V))) {
+ LLVM_DEBUG(dbgs() << "ARM CGP: Can't handle: " << *V << "\n");
+ return false;
+ }
+
+ WorkList.insert(V);
+ return true;
+ };
+
+ // Iterate through, and add to, a tree of operands and users in the use-def chain.
+ while (!WorkList.empty()) {
+ Value *V = WorkList.back();
+ WorkList.pop_back();
+ if (CurrentVisited.count(V))
+ continue;
+
+ if (!isa<Instruction>(V) && !isSource(V))
+ continue;
+
+ // If we've already visited this value from somewhere, bail now because
+ // the tree has already been explored.
+ // TODO: This could limit the transform, i.e. if we try to promote something
+ // from an i8 and fail first, before trying an i16.
+ if (AllVisited.count(V)) {
+ LLVM_DEBUG(dbgs() << "ARM CGP: Already visited this: " << *V << "\n");
+ return false;
+ }
+
+ CurrentVisited.insert(V);
+ AllVisited.insert(V);
+
+ // Calls can be both sources and sinks.
+ if (isSink(V))
+ Roots.insert(cast<Instruction>(V));
+ if (isSource(V))
+ Leaves.insert(V);
+ else if (auto *I = dyn_cast<Instruction>(V)) {
+ // Visit operands of any instruction visited.
+ for (auto &U : I->operands()) {
+ if (!AddLegalInst(U))
+ return false;
+ }
+ }
+
+ // Don't visit users of a node which isn't going to be mutated unless it's a
+ // source.
+ if (isSource(V) || shouldPromote(V)) {
+ for (Use &U : V->uses()) {
+ if (!AddLegalInst(U.getUser()))
+ return false;
+ }
+ }
+ }
+
+ unsigned NumToPromote = 0;
+ unsigned Cost = 0;
+ for (auto *V : CurrentVisited) {
+ // Truncs will cause a uxt, and arguments without zeroext will often require
+ // a uxt somewhere.
+ if (isa<TruncInst>(V))
+ ++Cost;
+ else if (auto *Arg = dyn_cast<Argument>(V)) {
+ if (!Arg->hasZExtAttr())
+ ++Cost;
+ }
+
+ // Mem ops can automatically be extended/truncated and non-instructions
+ // don't need anything done.
+ if (Leaves.count(V) || isa<StoreInst>(V) || !isa<Instruction>(V))
+ continue;
+
+ // Will need to truncate call args and returns.
+ if (Roots.count(cast<Instruction>(V))) {
+ ++Cost;
+ continue;
+ }
+
+ if (shouldPromote(V))
+ ++NumToPromote;
+ }
+
+ LLVM_DEBUG(dbgs() << "ARM CGP: Visited nodes:\n";
+ for (auto *I : CurrentVisited)
+ I->dump();
+ );
+ LLVM_DEBUG(dbgs() << "ARM CGP: Cost of promoting " << NumToPromote
+ << " instructions = " << Cost << "\n");
+ if (Cost > NumToPromote || (NumToPromote == 0))
+ return false;
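+
+ // A rough illustration of the heuristic above (hypothetical numbers): two
+ // i8 adds fed by loads and consumed only by an unsigned icmp give
+ // NumToPromote == 2 and Cost == 0, so promotion proceeds, whereas a chain
+ // containing nothing promotable is always rejected.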
+
+ Promoter->Mutate(OrigTy, CurrentVisited, Leaves, Roots);
+ return true;
+}
+
+bool ARMCodeGenPrepare::doInitialization(Module &M) {
+ Promoter = new IRPromoter(&M);
+ return false;
+}
+
+bool ARMCodeGenPrepare::runOnFunction(Function &F) {
+ if (skipFunction(F) || DisableCGP)
+ return false;
+
+ auto *TPC = &getAnalysis<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ const TargetMachine &TM = TPC->getTM<TargetMachine>();
+ ST = &TM.getSubtarget<ARMSubtarget>(F);
+ bool MadeChange = false;
+ LLVM_DEBUG(dbgs() << "ARM CGP: Running on " << F.getName() << "\n");
+
+ // Search up from icmps to try to promote their operands.
+ for (BasicBlock &BB : F) {
+ auto &Insts = BB.getInstList();
+ for (auto &I : Insts) {
+ if (AllVisited.count(&I))
+ continue;
+
+ if (isa<ICmpInst>(I)) {
+ auto &CI = cast<ICmpInst>(I);
+
+ // Skip signed or pointer compares
+ if (CI.isSigned() || !isa<IntegerType>(CI.getOperand(0)->getType()))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "ARM CGP: Searching from: " << CI << "\n");
+ for (auto &Op : CI.operands()) {
+ if (auto *I = dyn_cast<Instruction>(Op)) {
+ if (isa<ZExtInst>(I))
+ MadeChange |= TryToPromote(I->getOperand(0));
+ else
+ MadeChange |= TryToPromote(I);
+ }
+ }
+ }
+ }
+ Promoter->Cleanup();
+ LLVM_DEBUG(if (verifyFunction(F, &dbgs())) {
+ dbgs();
+ report_fatal_error("Broken function after type promotion");
+ });
+ }
+ if (MadeChange)
+ LLVM_DEBUG(dbgs() << "After ARMCodeGenPrepare: " << F << "\n");
+
+ return MadeChange;
+}
+
+bool ARMCodeGenPrepare::doFinalization(Module &M) {
+ delete Promoter;
+ return false;
+}
+
+INITIALIZE_PASS_BEGIN(ARMCodeGenPrepare, DEBUG_TYPE,
+ "ARM IR optimizations", false, false)
+INITIALIZE_PASS_END(ARMCodeGenPrepare, DEBUG_TYPE, "ARM IR optimizations",
+ false, false)
+
+char ARMCodeGenPrepare::ID = 0;
+
+FunctionPass *llvm::createARMCodeGenPreparePass() {
+ return new ARMCodeGenPrepare();
+}
diff --git a/lib/Target/ARM/ARMComputeBlockSize.cpp b/lib/Target/ARM/ARMComputeBlockSize.cpp
index 2e97b99b05a7..b263e9d86c42 100644
--- a/lib/Target/ARM/ARMComputeBlockSize.cpp
+++ b/lib/Target/ARM/ARMComputeBlockSize.cpp
@@ -35,6 +35,7 @@ mayOptimizeThumb2Instruction(const MachineInstr *MI) {
case ARM::tBcc:
// optimizeThumb2JumpTables.
case ARM::t2BR_JT:
+ case ARM::tBR_JTr:
return true;
}
return false;
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index 8baee1ce281d..de08eb8c6985 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -35,6 +35,7 @@
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCInstrDesc.h"
@@ -301,7 +302,7 @@ void ARMConstantIslands::verify() {
return BBInfo[LHS.getNumber()].postOffset() <
BBInfo[RHS.getNumber()].postOffset();
}));
- DEBUG(dbgs() << "Verifying " << CPUsers.size() << " CP users.\n");
+ LLVM_DEBUG(dbgs() << "Verifying " << CPUsers.size() << " CP users.\n");
for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) {
CPUser &U = CPUsers[i];
unsigned UserOffset = getUserOffset(U);
@@ -309,12 +310,12 @@ void ARMConstantIslands::verify() {
// adjustment.
if (isCPEntryInRange(U.MI, UserOffset, U.CPEMI, U.getMaxDisp()+2, U.NegOk,
/* DoDump = */ true)) {
- DEBUG(dbgs() << "OK\n");
+ LLVM_DEBUG(dbgs() << "OK\n");
continue;
}
- DEBUG(dbgs() << "Out of range.\n");
+ LLVM_DEBUG(dbgs() << "Out of range.\n");
dumpBBs();
- DEBUG(MF->dump());
+ LLVM_DEBUG(MF->dump());
llvm_unreachable("Constant pool entry out of range!");
}
#endif
@@ -323,7 +324,7 @@ void ARMConstantIslands::verify() {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// print block size and offset information - debugging
LLVM_DUMP_METHOD void ARMConstantIslands::dumpBBs() {
- DEBUG({
+ LLVM_DEBUG({
for (unsigned J = 0, E = BBInfo.size(); J !=E; ++J) {
const BasicBlockInfo &BBI = BBInfo[J];
dbgs() << format("%08x %bb.%u\t", BBI.Offset, J)
@@ -340,9 +341,9 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
MF = &mf;
MCP = mf.getConstantPool();
- DEBUG(dbgs() << "***** ARMConstantIslands: "
- << MCP->getConstants().size() << " CP entries, aligned to "
- << MCP->getConstantPoolAlignment() << " bytes *****\n");
+ LLVM_DEBUG(dbgs() << "***** ARMConstantIslands: "
+ << MCP->getConstants().size() << " CP entries, aligned to "
+ << MCP->getConstantPoolAlignment() << " bytes *****\n");
STI = &static_cast<const ARMSubtarget &>(MF->getSubtarget());
TII = STI->getInstrInfo();
@@ -393,7 +394,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
// constant pool users.
initializeFunctionInfo(CPEMIs);
CPEMIs.clear();
- DEBUG(dumpBBs());
+ LLVM_DEBUG(dumpBBs());
// Functions with jump tables need an alignment of 4 because they use the ADR
// instruction, which aligns the PC to 4 bytes before adding an offset.
@@ -407,7 +408,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
// is no change.
unsigned NoCPIters = 0, NoBRIters = 0;
while (true) {
- DEBUG(dbgs() << "Beginning CP iteration #" << NoCPIters << '\n');
+ LLVM_DEBUG(dbgs() << "Beginning CP iteration #" << NoCPIters << '\n');
bool CPChange = false;
for (unsigned i = 0, e = CPUsers.size(); i != e; ++i)
// For most inputs, it converges in no more than 5 iterations.
@@ -416,19 +417,19 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
CPChange |= handleConstantPoolUser(i, NoCPIters >= CPMaxIteration / 2);
if (CPChange && ++NoCPIters > CPMaxIteration)
report_fatal_error("Constant Island pass failed to converge!");
- DEBUG(dumpBBs());
+ LLVM_DEBUG(dumpBBs());
// Clear NewWaterList now. If we split a block for branches, it should
// appear as "new water" for the next iteration of constant pool placement.
NewWaterList.clear();
- DEBUG(dbgs() << "Beginning BR iteration #" << NoBRIters << '\n');
+ LLVM_DEBUG(dbgs() << "Beginning BR iteration #" << NoBRIters << '\n');
bool BRChange = false;
for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i)
BRChange |= fixupImmediateBr(ImmBranches[i]);
if (BRChange && ++NoBRIters > 30)
report_fatal_error("Branch Fix Up pass failed to converge!");
- DEBUG(dumpBBs());
+ LLVM_DEBUG(dumpBBs());
if (!CPChange && !BRChange)
break;
@@ -464,7 +465,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
}
}
- DEBUG(dbgs() << '\n'; dumpBBs());
+ LLVM_DEBUG(dbgs() << '\n'; dumpBBs());
BBInfo.clear();
WaterList.clear();
@@ -479,7 +480,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
return MadeChange;
}
-/// \brief Perform the initial placement of the regular constant pool entries.
+/// Perform the initial placement of the regular constant pool entries.
/// To start with, we put them all at the end of the function.
void
ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs) {
@@ -510,7 +511,6 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs)
const DataLayout &TD = MF->getDataLayout();
for (unsigned i = 0, e = CPs.size(); i != e; ++i) {
unsigned Size = TD.getTypeAllocSize(CPs[i].getType());
- assert(Size >= 4 && "Too small constant pool entry");
unsigned Align = CPs[i].getAlignment();
assert(isPowerOf2_32(Align) && "Invalid alignment");
// Verify that all constant pool entries are a multiple of their alignment.
@@ -534,13 +534,13 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs)
// Add a new CPEntry, but no corresponding CPUser yet.
CPEntries.emplace_back(1, CPEntry(CPEMI, i));
++NumCPEs;
- DEBUG(dbgs() << "Moved CPI#" << i << " to end of function, size = "
- << Size << ", align = " << Align <<'\n');
+ LLVM_DEBUG(dbgs() << "Moved CPI#" << i << " to end of function, size = "
+ << Size << ", align = " << Align << '\n');
}
- DEBUG(BB->dump());
+ LLVM_DEBUG(BB->dump());
}
-/// \brief Do initial placement of the jump tables. Because Thumb2's TBB and TBH
+/// Do initial placement of the jump tables. Because Thumb2's TBB and TBH
/// instructions can be made more efficient if the jump table immediately
/// follows the instruction, it's best to place them immediately next to their
/// jumps to begin with. In almost all cases they'll never be moved from that
@@ -701,7 +701,7 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
WaterList.push_back(&MBB);
for (MachineInstr &I : MBB) {
- if (I.isDebugValue())
+ if (I.isDebugInstr())
continue;
unsigned Opc = I.getOpcode();
@@ -820,6 +820,11 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
Scale = 4; // +-(offset_8*4)
NegOk = true;
break;
+ case ARM::VLDRH:
+ Bits = 8;
+ Scale = 2; // +-(offset_8*2)
+ NegOk = true;
+ break;
case ARM::tLDRHi:
Bits = 5;
@@ -1066,7 +1071,7 @@ bool ARMConstantIslands::isCPEntryInRange(MachineInstr *MI, unsigned UserOffset,
unsigned CPEOffset = getOffsetOf(CPEMI);
if (DoDump) {
- DEBUG({
+ LLVM_DEBUG({
unsigned Block = MI->getParent()->getNumber();
const BasicBlockInfo &BBI = BBInfo[Block];
dbgs() << "User of CPE#" << CPEMI->getOperand(0).getImm()
@@ -1159,7 +1164,7 @@ int ARMConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset) {
// Check to see if the CPE is already in-range.
if (isCPEntryInRange(UserMI, UserOffset, CPEMI, U.getMaxDisp(), U.NegOk,
true)) {
- DEBUG(dbgs() << "In range\n");
+ LLVM_DEBUG(dbgs() << "In range\n");
return 1;
}
@@ -1175,8 +1180,8 @@ int ARMConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset) {
continue;
if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.getMaxDisp(),
U.NegOk)) {
- DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#"
- << CPEs[i].CPI << "\n");
+ LLVM_DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#"
+ << CPEs[i].CPI << "\n");
// Point the CPUser node to the replacement
U.CPEMI = CPEs[i].CPEMI;
// Change the CPI in the instruction operand to refer to the clone.
@@ -1261,8 +1266,8 @@ bool ARMConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset,
// This is the least amount of required padding seen so far.
BestGrowth = Growth;
WaterIter = IP;
- DEBUG(dbgs() << "Found water after " << printMBBReference(*WaterBB)
- << " Growth=" << Growth << '\n');
+ LLVM_DEBUG(dbgs() << "Found water after " << printMBBReference(*WaterBB)
+ << " Growth=" << Growth << '\n');
if (CloserWater && WaterBB == U.MI->getParent())
return true;
@@ -1305,8 +1310,8 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
unsigned CPEOffset = UserBBI.postOffset(CPELogAlign) + Delta;
if (isOffsetInRange(UserOffset, CPEOffset, U)) {
- DEBUG(dbgs() << "Split at end of " << printMBBReference(*UserMBB)
- << format(", expected CPE offset %#x\n", CPEOffset));
+ LLVM_DEBUG(dbgs() << "Split at end of " << printMBBReference(*UserMBB)
+ << format(", expected CPE offset %#x\n", CPEOffset));
NewMBB = &*++UserMBB->getIterator();
// Add an unconditional branch from UserMBB to fallthrough block. Record
// it for branch lengthening; this new branch will not get out of range,
@@ -1349,18 +1354,17 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
unsigned KnownBits = UserBBI.internalKnownBits();
unsigned UPad = UnknownPadding(LogAlign, KnownBits);
unsigned BaseInsertOffset = UserOffset + U.getMaxDisp() - UPad;
- DEBUG(dbgs() << format("Split in middle of big block before %#x",
- BaseInsertOffset));
+ LLVM_DEBUG(dbgs() << format("Split in middle of big block before %#x",
+ BaseInsertOffset));
// The 4 in the following is for the unconditional branch we'll be inserting
// (allows for long branch on Thumb1). Alignment of the island is handled
// inside isOffsetInRange.
BaseInsertOffset -= 4;
- DEBUG(dbgs() << format(", adjusted to %#x", BaseInsertOffset)
- << " la=" << LogAlign
- << " kb=" << KnownBits
- << " up=" << UPad << '\n');
+ LLVM_DEBUG(dbgs() << format(", adjusted to %#x", BaseInsertOffset)
+ << " la=" << LogAlign << " kb=" << KnownBits
+ << " up=" << UPad << '\n');
// This could point off the end of the block if we've already got constant
// pool entries following this block; only the last one is in the water list.
@@ -1373,7 +1377,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
BaseInsertOffset =
std::max(UserBBI.postOffset() - UPad - 8,
UserOffset + TII->getInstSizeInBytes(*UserMI) + 1);
- DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset));
+ LLVM_DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset));
}
unsigned EndInsertOffset = BaseInsertOffset + 4 + UPad +
CPEMI->getOperand(2).getImm();
@@ -1417,8 +1421,8 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
}
// We really must not split an IT block.
- DEBUG(unsigned PredReg;
- assert(!isThumb || getITInstrPredicate(*MI, PredReg) == ARMCC::AL));
+ LLVM_DEBUG(unsigned PredReg; assert(
+ !isThumb || getITInstrPredicate(*MI, PredReg) == ARMCC::AL));
NewMBB = splitBlockBeforeInstr(&*MI);
}
@@ -1452,7 +1456,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex,
MachineBasicBlock *NewMBB;
water_iterator IP;
if (findAvailableWater(U, UserOffset, IP, CloserWater)) {
- DEBUG(dbgs() << "Found water in range\n");
+ LLVM_DEBUG(dbgs() << "Found water in range\n");
MachineBasicBlock *WaterBB = *IP;
// If the original WaterList entry was "new water" on this iteration,
@@ -1465,7 +1469,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex,
NewMBB = &*++WaterBB->getIterator();
} else {
// No water found.
- DEBUG(dbgs() << "No water found\n");
+ LLVM_DEBUG(dbgs() << "No water found\n");
createNewWater(CPUserIndex, UserOffset, NewMBB);
// splitBlockBeforeInstr adds to WaterList, which is important when it is
@@ -1481,6 +1485,12 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex,
// We are adding new water. Update NewWaterList.
NewWaterList.insert(NewIsland);
}
+ // Always align the new block because CP entries can be smaller than 4
+ // bytes. Be careful not to decrease the existing alignment, e.g. NewMBB may
+ // be an already aligned constant pool block.
+ const unsigned Align = isThumb ? 1 : 2;
+ if (NewMBB->getAlignment() < Align)
+ NewMBB->setAlignment(Align);
// Remove the original WaterList entry; we want subsequent insertions in
// this vicinity to go after the one we're about to insert. This
@@ -1522,8 +1532,9 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex,
break;
}
- DEBUG(dbgs() << " Moved CPE to #" << ID << " CPI=" << CPI
- << format(" offset=%#x\n", BBInfo[NewIsland->getNumber()].Offset));
+ LLVM_DEBUG(
+ dbgs() << " Moved CPE to #" << ID << " CPI=" << CPI
+ << format(" offset=%#x\n", BBInfo[NewIsland->getNumber()].Offset));
return true;
}
@@ -1578,11 +1589,11 @@ bool ARMConstantIslands::isBBInRange(MachineInstr *MI,MachineBasicBlock *DestBB,
unsigned BrOffset = getOffsetOf(MI) + PCAdj;
unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset;
- DEBUG(dbgs() << "Branch of destination " << printMBBReference(*DestBB)
- << " from " << printMBBReference(*MI->getParent())
- << " max delta=" << MaxDisp << " from " << getOffsetOf(MI)
- << " to " << DestOffset << " offset "
- << int(DestOffset - BrOffset) << "\t" << *MI);
+ LLVM_DEBUG(dbgs() << "Branch of destination " << printMBBReference(*DestBB)
+ << " from " << printMBBReference(*MI->getParent())
+ << " max delta=" << MaxDisp << " from " << getOffsetOf(MI)
+ << " to " << DestOffset << " offset "
+ << int(DestOffset - BrOffset) << "\t" << *MI);
if (BrOffset <= DestOffset) {
// Branch before the Dest.
@@ -1629,7 +1640,7 @@ ARMConstantIslands::fixupUnconditionalBr(ImmBranch &Br) {
HasFarJump = true;
++NumUBrFixed;
- DEBUG(dbgs() << " Changed B to long jump " << *MI);
+ LLVM_DEBUG(dbgs() << " Changed B to long jump " << *MI);
return true;
}
@@ -1673,8 +1684,9 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) {
// b L1
MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB();
if (isBBInRange(MI, NewDest, Br.MaxDisp)) {
- DEBUG(dbgs() << " Invert Bcc condition and swap its destination with "
- << *BMI);
+ LLVM_DEBUG(
+ dbgs() << " Invert Bcc condition and swap its destination with "
+ << *BMI);
BMI->getOperand(0).setMBB(DestBB);
MI->getOperand(0).setMBB(NewDest);
MI->getOperand(1).setImm(CC);
@@ -1700,9 +1712,9 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) {
}
MachineBasicBlock *NextBB = &*++MBB->getIterator();
- DEBUG(dbgs() << " Insert B to " << printMBBReference(*DestBB)
- << " also invert condition and change dest. to "
- << printMBBReference(*NextBB) << "\n");
+ LLVM_DEBUG(dbgs() << " Insert B to " << printMBBReference(*DestBB)
+ << " also invert condition and change dest. to "
+ << printMBBReference(*NextBB) << "\n");
// Insert a new conditional branch and a new unconditional branch.
// Also update the ImmBranch as well as adding a new entry for the new branch.
@@ -1795,7 +1807,7 @@ bool ARMConstantIslands::optimizeThumb2Instructions() {
// FIXME: Check if offset is multiple of scale if scale is not 4.
if (isCPEntryInRange(U.MI, UserOffset, U.CPEMI, MaxOffs, false, true)) {
- DEBUG(dbgs() << "Shrink: " << *U.MI);
+ LLVM_DEBUG(dbgs() << "Shrink: " << *U.MI);
U.MI->setDesc(TII->get(NewOpc));
MachineBasicBlock *MBB = U.MI->getParent();
BBInfo[MBB->getNumber()].Size -= 2;
@@ -1839,7 +1851,7 @@ bool ARMConstantIslands::optimizeThumb2Branches() {
unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale;
MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB();
if (isBBInRange(Br.MI, DestBB, MaxOffs)) {
- DEBUG(dbgs() << "Shrink branch: " << *Br.MI);
+ LLVM_DEBUG(dbgs() << "Shrink branch: " << *Br.MI);
Br.MI->setDesc(TII->get(NewOpc));
MachineBasicBlock *MBB = Br.MI->getParent();
BBInfo[MBB->getNumber()].Size -= 2;
@@ -1883,7 +1895,7 @@ bool ARMConstantIslands::optimizeThumb2Branches() {
CmpMI->getOperand(1).getImm() == 0 &&
isARMLowRegister(Reg)) {
MachineBasicBlock *MBB = Br.MI->getParent();
- DEBUG(dbgs() << "Fold: " << *CmpMI << " and: " << *Br.MI);
+ LLVM_DEBUG(dbgs() << "Fold: " << *CmpMI << " and: " << *Br.MI);
MachineInstr *NewBR =
BuildMI(*MBB, CmpMI, Br.MI->getDebugLoc(), TII->get(NewOpc))
.addReg(Reg).addMBB(DestBB,Br.MI->getOperand(0).getTargetFlags());
@@ -1918,7 +1930,7 @@ static bool isSimpleIndexCalc(MachineInstr &I, unsigned EntryReg,
return true;
}
-/// \brief While trying to form a TBB/TBH instruction, we may (if the table
+/// While trying to form a TBB/TBH instruction, we may (if the table
/// doesn't immediately follow the BR_JT) need access to the start of the
/// jump-table. We know one instruction that produces such a register; this
/// function works out whether that definition can be preserved to the BR_JT,
@@ -2006,7 +2018,7 @@ bool ARMConstantIslands::preserveBaseRegister(MachineInstr *JumpMI,
return true;
}
-/// \brief Returns whether CPEMI is the first instruction in the block
+/// Returns whether CPEMI is the first instruction in the block
/// immediately following JTMI (assumed to be a TBB or TBH terminator). If so,
/// we can switch the first register to PC and usually remove the address
/// calculation that preceded it.
@@ -2052,7 +2064,7 @@ static void RemoveDeadAddBetweenLEAAndJT(MachineInstr *LEAMI,
}
}
- DEBUG(dbgs() << "Removing Dead Add: " << *RemovableAdd);
+ LLVM_DEBUG(dbgs() << "Removing Dead Add: " << *RemovableAdd);
RemovableAdd->eraseFromParent();
DeadSize += 4;
}
@@ -2198,7 +2210,7 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() {
DeadSize += 4;
}
- DEBUG(dbgs() << "Shrink JT: " << *MI);
+ LLVM_DEBUG(dbgs() << "Shrink JT: " << *MI);
MachineInstr *CPEMI = User.CPEMI;
unsigned Opc = ByteOk ? ARM::t2TBB_JT : ARM::t2TBH_JT;
if (!isThumb2)
@@ -2212,7 +2224,7 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() {
.addReg(IdxReg, getKillRegState(IdxRegKill))
.addJumpTableIndex(JTI, JTOP.getTargetFlags())
.addImm(CPEMI->getOperand(0).getImm());
- DEBUG(dbgs() << printMBBReference(*MBB) << ": " << *NewJTMI);
+ LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << ": " << *NewJTMI);
unsigned JTOpc = ByteOk ? ARM::JUMPTABLE_TBB : ARM::JUMPTABLE_TBH;
CPEMI->setDesc(TII->get(JTOpc));
diff --git a/lib/Target/ARM/ARMConstantPoolValue.cpp b/lib/Target/ARM/ARMConstantPoolValue.cpp
index 39ae02af513b..236c4fab2a5c 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.cpp
+++ b/lib/Target/ARM/ARMConstantPoolValue.cpp
@@ -14,6 +14,7 @@
#include "ARMConstantPoolValue.h"
#include "llvm/ADT/FoldingSet.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalValue.h"
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index b14b2c6a813f..5dac6ec0b799 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -110,6 +110,9 @@ namespace {
// OddDblSpc depending on the lane number operand.
enum NEONRegSpacing {
SingleSpc,
+ SingleLowSpc, // Single spacing, low registers, three and four vectors.
+ SingleHighQSpc, // Single spacing, high registers, four vectors.
+ SingleHighTSpc, // Single spacing, high registers, three vectors.
EvenDblSpc,
OddDblSpc
};
@@ -154,10 +157,41 @@ static const NEONLdStTableEntry NEONLdStTable[] = {
{ ARM::VLD1LNq8Pseudo, ARM::VLD1LNd8, true, false, false, EvenDblSpc, 1, 8 ,true},
{ ARM::VLD1LNq8Pseudo_UPD, ARM::VLD1LNd8_UPD, true, true, true, EvenDblSpc, 1, 8 ,true},
+{ ARM::VLD1d16QPseudo, ARM::VLD1d16Q, true, false, false, SingleSpc, 4, 4 ,false},
+{ ARM::VLD1d16TPseudo, ARM::VLD1d16T, true, false, false, SingleSpc, 3, 4 ,false},
+{ ARM::VLD1d32QPseudo, ARM::VLD1d32Q, true, false, false, SingleSpc, 4, 2 ,false},
+{ ARM::VLD1d32TPseudo, ARM::VLD1d32T, true, false, false, SingleSpc, 3, 2 ,false},
{ ARM::VLD1d64QPseudo, ARM::VLD1d64Q, true, false, false, SingleSpc, 4, 1 ,false},
{ ARM::VLD1d64QPseudoWB_fixed, ARM::VLD1d64Qwb_fixed, true, true, false, SingleSpc, 4, 1 ,false},
+{ ARM::VLD1d64QPseudoWB_register, ARM::VLD1d64Qwb_register, true, true, true, SingleSpc, 4, 1 ,false},
{ ARM::VLD1d64TPseudo, ARM::VLD1d64T, true, false, false, SingleSpc, 3, 1 ,false},
{ ARM::VLD1d64TPseudoWB_fixed, ARM::VLD1d64Twb_fixed, true, true, false, SingleSpc, 3, 1 ,false},
+{ ARM::VLD1d64TPseudoWB_register, ARM::VLD1d64Twb_register, true, true, true, SingleSpc, 3, 1 ,false},
+{ ARM::VLD1d8QPseudo, ARM::VLD1d8Q, true, false, false, SingleSpc, 4, 8 ,false},
+{ ARM::VLD1d8TPseudo, ARM::VLD1d8T, true, false, false, SingleSpc, 3, 8 ,false},
+{ ARM::VLD1q16HighQPseudo, ARM::VLD1d16Q, true, false, false, SingleHighQSpc, 4, 4 ,false},
+{ ARM::VLD1q16HighTPseudo, ARM::VLD1d16T, true, false, false, SingleHighTSpc, 3, 4 ,false},
+{ ARM::VLD1q16LowQPseudo_UPD, ARM::VLD1d16Qwb_fixed, true, true, true, SingleLowSpc, 4, 4 ,false},
+{ ARM::VLD1q16LowTPseudo_UPD, ARM::VLD1d16Twb_fixed, true, true, true, SingleLowSpc, 3, 4 ,false},
+{ ARM::VLD1q32HighQPseudo, ARM::VLD1d32Q, true, false, false, SingleHighQSpc, 4, 2 ,false},
+{ ARM::VLD1q32HighTPseudo, ARM::VLD1d32T, true, false, false, SingleHighTSpc, 3, 2 ,false},
+{ ARM::VLD1q32LowQPseudo_UPD, ARM::VLD1d32Qwb_fixed, true, true, true, SingleLowSpc, 4, 2 ,false},
+{ ARM::VLD1q32LowTPseudo_UPD, ARM::VLD1d32Twb_fixed, true, true, true, SingleLowSpc, 3, 2 ,false},
+{ ARM::VLD1q64HighQPseudo, ARM::VLD1d64Q, true, false, false, SingleHighQSpc, 4, 1 ,false},
+{ ARM::VLD1q64HighTPseudo, ARM::VLD1d64T, true, false, false, SingleHighTSpc, 3, 1 ,false},
+{ ARM::VLD1q64LowQPseudo_UPD, ARM::VLD1d64Qwb_fixed, true, true, true, SingleLowSpc, 4, 1 ,false},
+{ ARM::VLD1q64LowTPseudo_UPD, ARM::VLD1d64Twb_fixed, true, true, true, SingleLowSpc, 3, 1 ,false},
+{ ARM::VLD1q8HighQPseudo, ARM::VLD1d8Q, true, false, false, SingleHighQSpc, 4, 8 ,false},
+{ ARM::VLD1q8HighTPseudo, ARM::VLD1d8T, true, false, false, SingleHighTSpc, 3, 8 ,false},
+{ ARM::VLD1q8LowQPseudo_UPD, ARM::VLD1d8Qwb_fixed, true, true, true, SingleLowSpc, 4, 8 ,false},
+{ ARM::VLD1q8LowTPseudo_UPD, ARM::VLD1d8Twb_fixed, true, true, true, SingleLowSpc, 3, 8 ,false},
+
+{ ARM::VLD2DUPq16EvenPseudo, ARM::VLD2DUPd16x2, true, false, false, EvenDblSpc, 2, 4 ,false},
+{ ARM::VLD2DUPq16OddPseudo, ARM::VLD2DUPd16x2, true, false, false, OddDblSpc, 2, 4 ,false},
+{ ARM::VLD2DUPq32EvenPseudo, ARM::VLD2DUPd32x2, true, false, false, EvenDblSpc, 2, 2 ,false},
+{ ARM::VLD2DUPq32OddPseudo, ARM::VLD2DUPd32x2, true, false, false, OddDblSpc, 2, 2 ,false},
+{ ARM::VLD2DUPq8EvenPseudo, ARM::VLD2DUPd8x2, true, false, false, EvenDblSpc, 2, 8 ,false},
+{ ARM::VLD2DUPq8OddPseudo, ARM::VLD2DUPd8x2, true, false, false, OddDblSpc, 2, 8 ,false},
{ ARM::VLD2LNd16Pseudo, ARM::VLD2LNd16, true, false, false, SingleSpc, 2, 4 ,true},
{ ARM::VLD2LNd16Pseudo_UPD, ARM::VLD2LNd16_UPD, true, true, true, SingleSpc, 2, 4 ,true},
@@ -186,6 +220,12 @@ static const NEONLdStTableEntry NEONLdStTable[] = {
{ ARM::VLD3DUPd32Pseudo_UPD, ARM::VLD3DUPd32_UPD, true, true, true, SingleSpc, 3, 2,true},
{ ARM::VLD3DUPd8Pseudo, ARM::VLD3DUPd8, true, false, false, SingleSpc, 3, 8,true},
{ ARM::VLD3DUPd8Pseudo_UPD, ARM::VLD3DUPd8_UPD, true, true, true, SingleSpc, 3, 8,true},
+{ ARM::VLD3DUPq16EvenPseudo, ARM::VLD3DUPq16, true, false, false, EvenDblSpc, 3, 4 ,true},
+{ ARM::VLD3DUPq16OddPseudo, ARM::VLD3DUPq16, true, false, false, OddDblSpc, 3, 4 ,true},
+{ ARM::VLD3DUPq32EvenPseudo, ARM::VLD3DUPq32, true, false, false, EvenDblSpc, 3, 2 ,true},
+{ ARM::VLD3DUPq32OddPseudo, ARM::VLD3DUPq32, true, false, false, OddDblSpc, 3, 2 ,true},
+{ ARM::VLD3DUPq8EvenPseudo, ARM::VLD3DUPq8, true, false, false, EvenDblSpc, 3, 8 ,true},
+{ ARM::VLD3DUPq8OddPseudo, ARM::VLD3DUPq8, true, false, false, OddDblSpc, 3, 8 ,true},
{ ARM::VLD3LNd16Pseudo, ARM::VLD3LNd16, true, false, false, SingleSpc, 3, 4 ,true},
{ ARM::VLD3LNd16Pseudo_UPD, ARM::VLD3LNd16_UPD, true, true, true, SingleSpc, 3, 4 ,true},
@@ -221,6 +261,12 @@ static const NEONLdStTableEntry NEONLdStTable[] = {
{ ARM::VLD4DUPd32Pseudo_UPD, ARM::VLD4DUPd32_UPD, true, true, true, SingleSpc, 4, 2,true},
{ ARM::VLD4DUPd8Pseudo, ARM::VLD4DUPd8, true, false, false, SingleSpc, 4, 8,true},
{ ARM::VLD4DUPd8Pseudo_UPD, ARM::VLD4DUPd8_UPD, true, true, true, SingleSpc, 4, 8,true},
+{ ARM::VLD4DUPq16EvenPseudo, ARM::VLD4DUPq16, true, false, false, EvenDblSpc, 4, 4 ,true},
+{ ARM::VLD4DUPq16OddPseudo, ARM::VLD4DUPq16, true, false, false, OddDblSpc, 4, 4 ,true},
+{ ARM::VLD4DUPq32EvenPseudo, ARM::VLD4DUPq32, true, false, false, EvenDblSpc, 4, 2 ,true},
+{ ARM::VLD4DUPq32OddPseudo, ARM::VLD4DUPq32, true, false, false, OddDblSpc, 4, 2 ,true},
+{ ARM::VLD4DUPq8EvenPseudo, ARM::VLD4DUPq8, true, false, false, EvenDblSpc, 4, 8 ,true},
+{ ARM::VLD4DUPq8OddPseudo, ARM::VLD4DUPq8, true, false, false, OddDblSpc, 4, 8 ,true},
{ ARM::VLD4LNd16Pseudo, ARM::VLD4LNd16, true, false, false, SingleSpc, 4, 4 ,true},
{ ARM::VLD4LNd16Pseudo_UPD, ARM::VLD4LNd16_UPD, true, true, true, SingleSpc, 4, 4 ,true},
@@ -257,12 +303,34 @@ static const NEONLdStTableEntry NEONLdStTable[] = {
{ ARM::VST1LNq8Pseudo, ARM::VST1LNd8, false, false, false, EvenDblSpc, 1, 8 ,true},
{ ARM::VST1LNq8Pseudo_UPD, ARM::VST1LNd8_UPD, false, true, true, EvenDblSpc, 1, 8 ,true},
+{ ARM::VST1d16QPseudo, ARM::VST1d16Q, false, false, false, SingleSpc, 4, 4 ,false},
+{ ARM::VST1d16TPseudo, ARM::VST1d16T, false, false, false, SingleSpc, 3, 4 ,false},
+{ ARM::VST1d32QPseudo, ARM::VST1d32Q, false, false, false, SingleSpc, 4, 2 ,false},
+{ ARM::VST1d32TPseudo, ARM::VST1d32T, false, false, false, SingleSpc, 3, 2 ,false},
{ ARM::VST1d64QPseudo, ARM::VST1d64Q, false, false, false, SingleSpc, 4, 1 ,false},
{ ARM::VST1d64QPseudoWB_fixed, ARM::VST1d64Qwb_fixed, false, true, false, SingleSpc, 4, 1 ,false},
{ ARM::VST1d64QPseudoWB_register, ARM::VST1d64Qwb_register, false, true, true, SingleSpc, 4, 1 ,false},
{ ARM::VST1d64TPseudo, ARM::VST1d64T, false, false, false, SingleSpc, 3, 1 ,false},
{ ARM::VST1d64TPseudoWB_fixed, ARM::VST1d64Twb_fixed, false, true, false, SingleSpc, 3, 1 ,false},
{ ARM::VST1d64TPseudoWB_register, ARM::VST1d64Twb_register, false, true, true, SingleSpc, 3, 1 ,false},
+{ ARM::VST1d8QPseudo, ARM::VST1d8Q, false, false, false, SingleSpc, 4, 8 ,false},
+{ ARM::VST1d8TPseudo, ARM::VST1d8T, false, false, false, SingleSpc, 3, 8 ,false},
+{ ARM::VST1q16HighQPseudo, ARM::VST1d16Q, false, false, false, SingleHighQSpc, 4, 4 ,false},
+{ ARM::VST1q16HighTPseudo, ARM::VST1d16T, false, false, false, SingleHighTSpc, 3, 4 ,false},
+{ ARM::VST1q16LowQPseudo_UPD, ARM::VST1d16Qwb_fixed, false, true, true, SingleLowSpc, 4, 4 ,false},
+{ ARM::VST1q16LowTPseudo_UPD, ARM::VST1d16Twb_fixed, false, true, true, SingleLowSpc, 3, 4 ,false},
+{ ARM::VST1q32HighQPseudo, ARM::VST1d32Q, false, false, false, SingleHighQSpc, 4, 2 ,false},
+{ ARM::VST1q32HighTPseudo, ARM::VST1d32T, false, false, false, SingleHighTSpc, 3, 2 ,false},
+{ ARM::VST1q32LowQPseudo_UPD, ARM::VST1d32Qwb_fixed, false, true, true, SingleLowSpc, 4, 2 ,false},
+{ ARM::VST1q32LowTPseudo_UPD, ARM::VST1d32Twb_fixed, false, true, true, SingleLowSpc, 3, 2 ,false},
+{ ARM::VST1q64HighQPseudo, ARM::VST1d64Q, false, false, false, SingleHighQSpc, 4, 1 ,false},
+{ ARM::VST1q64HighTPseudo, ARM::VST1d64T, false, false, false, SingleHighTSpc, 3, 1 ,false},
+{ ARM::VST1q64LowQPseudo_UPD, ARM::VST1d64Qwb_fixed, false, true, true, SingleLowSpc, 4, 1 ,false},
+{ ARM::VST1q64LowTPseudo_UPD, ARM::VST1d64Twb_fixed, false, true, true, SingleLowSpc, 3, 1 ,false},
+{ ARM::VST1q8HighQPseudo, ARM::VST1d8Q, false, false, false, SingleHighQSpc, 4, 8 ,false},
+{ ARM::VST1q8HighTPseudo, ARM::VST1d8T, false, false, false, SingleHighTSpc, 3, 8 ,false},
+{ ARM::VST1q8LowQPseudo_UPD, ARM::VST1d8Qwb_fixed, false, true, true, SingleLowSpc, 4, 8 ,false},
+{ ARM::VST1q8LowTPseudo_UPD, ARM::VST1d8Twb_fixed, false, true, true, SingleLowSpc, 3, 8 ,false},
{ ARM::VST2LNd16Pseudo, ARM::VST2LNd16, false, false, false, SingleSpc, 2, 4 ,true},
{ ARM::VST2LNd16Pseudo_UPD, ARM::VST2LNd16_UPD, false, true, true, SingleSpc, 2, 4 ,true},
@@ -347,11 +415,11 @@ static const NEONLdStTableEntry NEONLdStTable[] = {
static const NEONLdStTableEntry *LookupNEONLdSt(unsigned Opcode) {
#ifndef NDEBUG
// Make sure the table is sorted.
- static bool TableChecked = false;
- if (!TableChecked) {
+ static std::atomic<bool> TableChecked(false);
+ if (!TableChecked.load(std::memory_order_relaxed)) {
assert(std::is_sorted(std::begin(NEONLdStTable), std::end(NEONLdStTable)) &&
"NEONLdStTable is not sorted!");
- TableChecked = true;
+ TableChecked.store(true, std::memory_order_relaxed);
}
#endif
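
The change above swaps a plain static bool for std::atomic<bool> so the one-time "table is sorted" assertion stays safe if several threads run the expander concurrently. A minimal standalone sketch of the same relaxed check-once pattern (the Entry type and table contents are illustrative, not the real NEONLdStTable):

    #include <algorithm>
    #include <atomic>
    #include <cassert>
    #include <iterator>

    struct Entry {
      unsigned PseudoOpc;
      unsigned RealOpc;
      bool operator<(const Entry &Other) const { return PseudoOpc < Other.PseudoOpc; }
    };

    static const Entry Table[] = {{1, 101}, {2, 102}, {7, 107}};

    static const Entry *lookup(unsigned Opc) {
    #ifndef NDEBUG
      // Relaxed ordering is fine: the assertion is idempotent, so racing
      // threads may at worst re-check the table, never skip the check.
      static std::atomic<bool> TableChecked(false);
      if (!TableChecked.load(std::memory_order_relaxed)) {
        assert(std::is_sorted(std::begin(Table), std::end(Table)) &&
               "Table is not sorted!");
        TableChecked.store(true, std::memory_order_relaxed);
      }
    #endif
      auto I = std::lower_bound(std::begin(Table), std::end(Table), Entry{Opc, 0});
      return (I != std::end(Table) && I->PseudoOpc == Opc) ? &*I : nullptr;
    }
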
@@ -368,11 +436,21 @@ static const NEONLdStTableEntry *LookupNEONLdSt(unsigned Opcode) {
static void GetDSubRegs(unsigned Reg, NEONRegSpacing RegSpc,
const TargetRegisterInfo *TRI, unsigned &D0,
unsigned &D1, unsigned &D2, unsigned &D3) {
- if (RegSpc == SingleSpc) {
+ if (RegSpc == SingleSpc || RegSpc == SingleLowSpc) {
D0 = TRI->getSubReg(Reg, ARM::dsub_0);
D1 = TRI->getSubReg(Reg, ARM::dsub_1);
D2 = TRI->getSubReg(Reg, ARM::dsub_2);
D3 = TRI->getSubReg(Reg, ARM::dsub_3);
+ } else if (RegSpc == SingleHighQSpc) {
+ D0 = TRI->getSubReg(Reg, ARM::dsub_4);
+ D1 = TRI->getSubReg(Reg, ARM::dsub_5);
+ D2 = TRI->getSubReg(Reg, ARM::dsub_6);
+ D3 = TRI->getSubReg(Reg, ARM::dsub_7);
+ } else if (RegSpc == SingleHighTSpc) {
+ D0 = TRI->getSubReg(Reg, ARM::dsub_3);
+ D1 = TRI->getSubReg(Reg, ARM::dsub_4);
+ D2 = TRI->getSubReg(Reg, ARM::dsub_5);
+ D3 = TRI->getSubReg(Reg, ARM::dsub_6);
} else if (RegSpc == EvenDblSpc) {
D0 = TRI->getSubReg(Reg, ARM::dsub_0);
D1 = TRI->getSubReg(Reg, ARM::dsub_2);
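
The three new spacings only move the starting D subregister of the four-register run GetDSubRegs hands back; the double-spaced cases further down stride by two instead. A hedged sketch of the mapping (the enum is a stand-in for the real NEONRegSpacing and the indices mirror dsub_0..dsub_7):

    enum RegSpacingSketch { SingleSpc, SingleLowSpc, SingleHighTSpc, SingleHighQSpc };

    // First dsub_N index used for each spacing; the run is always four
    // consecutive D subregisters, dsub_First .. dsub_First+3.
    constexpr unsigned firstDSubIndex(RegSpacingSketch Spc) {
      switch (Spc) {
      case SingleSpc:
      case SingleLowSpc:   return 0; // dsub_0 .. dsub_3 (low half of the tuple)
      case SingleHighTSpc: return 3; // dsub_3 .. dsub_6 (high three of a QQQ reg)
      case SingleHighQSpc: return 4; // dsub_4 .. dsub_7 (high half of a QQQQ reg)
      }
      return 0;
    }

    static_assert(firstDSubIndex(SingleHighQSpc) == 4, "high Q half starts at dsub_4");
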
@@ -404,15 +482,31 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
bool DstIsDead = MI.getOperand(OpIdx).isDead();
unsigned DstReg = MI.getOperand(OpIdx++).getReg();
- unsigned D0, D1, D2, D3;
- GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3);
- MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead));
- if (NumRegs > 1 && TableEntry->copyAllListRegs)
- MIB.addReg(D1, RegState::Define | getDeadRegState(DstIsDead));
- if (NumRegs > 2 && TableEntry->copyAllListRegs)
- MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead));
- if (NumRegs > 3 && TableEntry->copyAllListRegs)
- MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead));
+ if (TableEntry->RealOpc == ARM::VLD2DUPd8x2 ||
+ TableEntry->RealOpc == ARM::VLD2DUPd16x2 ||
+ TableEntry->RealOpc == ARM::VLD2DUPd32x2) {
+ unsigned SubRegIndex;
+ if (RegSpc == EvenDblSpc) {
+ SubRegIndex = ARM::dsub_0;
+ } else {
+ assert(RegSpc == OddDblSpc && "Unexpected spacing!");
+ SubRegIndex = ARM::dsub_1;
+ }
+ unsigned SubReg = TRI->getSubReg(DstReg, SubRegIndex);
+ unsigned DstRegPair = TRI->getMatchingSuperReg(SubReg, ARM::dsub_0,
+ &ARM::DPairSpcRegClass);
+ MIB.addReg(DstRegPair, RegState::Define | getDeadRegState(DstIsDead));
+ } else {
+ unsigned D0, D1, D2, D3;
+ GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3);
+ MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead));
+ if (NumRegs > 1 && TableEntry->copyAllListRegs)
+ MIB.addReg(D1, RegState::Define | getDeadRegState(DstIsDead));
+ if (NumRegs > 2 && TableEntry->copyAllListRegs)
+ MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead));
+ if (NumRegs > 3 && TableEntry->copyAllListRegs)
+ MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead));
+ }
if (TableEntry->isUpdating)
MIB.add(MI.getOperand(OpIdx++));
@@ -420,16 +514,45 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
// Copy the addrmode6 operands.
MIB.add(MI.getOperand(OpIdx++));
MIB.add(MI.getOperand(OpIdx++));
+
// Copy the am6offset operand.
- if (TableEntry->hasWritebackOperand)
- MIB.add(MI.getOperand(OpIdx++));
+ if (TableEntry->hasWritebackOperand) {
+ // TODO: The writing-back pseudo instructions we translate here are all
+ // defined to take am6offset nodes that are capable of representing both
+ // fixed and register forms. Some real instructions, however, do not rely
+ // on am6offset and have separate definitions for such forms. When this is
+ // the case, fixed forms do not take any offset nodes, so we skip the
+ // offset operand here for such instructions. Once all real and pseudo
+ // writing-back instructions are rewritten without use of am6offset nodes,
+ // this code will go away.
+ const MachineOperand &AM6Offset = MI.getOperand(OpIdx++);
+ if (TableEntry->RealOpc == ARM::VLD1d8Qwb_fixed ||
+ TableEntry->RealOpc == ARM::VLD1d16Qwb_fixed ||
+ TableEntry->RealOpc == ARM::VLD1d32Qwb_fixed ||
+ TableEntry->RealOpc == ARM::VLD1d64Qwb_fixed ||
+ TableEntry->RealOpc == ARM::VLD1d8Twb_fixed ||
+ TableEntry->RealOpc == ARM::VLD1d16Twb_fixed ||
+ TableEntry->RealOpc == ARM::VLD1d32Twb_fixed ||
+ TableEntry->RealOpc == ARM::VLD1d64Twb_fixed) {
+ assert(AM6Offset.getReg() == 0 &&
+ "A fixed writing-back pseudo instruction provides an offset "
+ "register!");
+ } else {
+ MIB.add(AM6Offset);
+ }
+ }
// For an instruction writing double-spaced subregs, the pseudo instruction
// has an extra operand that is a use of the super-register. Record the
// operand index and skip over it.
unsigned SrcOpIdx = 0;
- if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc)
- SrcOpIdx = OpIdx++;
+ if (TableEntry->RealOpc != ARM::VLD2DUPd8x2 &&
+ TableEntry->RealOpc != ARM::VLD2DUPd16x2 &&
+ TableEntry->RealOpc != ARM::VLD2DUPd32x2) {
+ if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc ||
+ RegSpc == SingleLowSpc || RegSpc == SingleHighQSpc ||
+ RegSpc == SingleHighTSpc)
+ SrcOpIdx = OpIdx++;
+ }
// Copy the predicate operands.
MIB.add(MI.getOperand(OpIdx++));
@@ -472,9 +595,31 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) {
// Copy the addrmode6 operands.
MIB.add(MI.getOperand(OpIdx++));
MIB.add(MI.getOperand(OpIdx++));
- // Copy the am6offset operand.
- if (TableEntry->hasWritebackOperand)
- MIB.add(MI.getOperand(OpIdx++));
+
+ if (TableEntry->hasWritebackOperand) {
+ // TODO: The writing-back pseudo instructions we translate here are all
+ // defined to take am6offset nodes that are capable of representing both
+ // fixed and register forms. Some real instructions, however, do not rely
+ // on am6offset and have separate definitions for such forms. When this is
+ // the case, fixed forms do not take any offset nodes, so we skip the
+ // offset operand here for such instructions. Once all real and pseudo
+ // writing-back instructions are rewritten without use of am6offset nodes,
+ // this code will go away.
+ const MachineOperand &AM6Offset = MI.getOperand(OpIdx++);
+ if (TableEntry->RealOpc == ARM::VST1d8Qwb_fixed ||
+ TableEntry->RealOpc == ARM::VST1d16Qwb_fixed ||
+ TableEntry->RealOpc == ARM::VST1d32Qwb_fixed ||
+ TableEntry->RealOpc == ARM::VST1d64Qwb_fixed ||
+ TableEntry->RealOpc == ARM::VST1d8Twb_fixed ||
+ TableEntry->RealOpc == ARM::VST1d16Twb_fixed ||
+ TableEntry->RealOpc == ARM::VST1d32Twb_fixed ||
+ TableEntry->RealOpc == ARM::VST1d64Twb_fixed) {
+ assert(AM6Offset.getReg() == 0 &&
+ "A fixed writing-back pseudo instruction provides an offset "
+ "register!");
+ } else {
+ MIB.add(AM6Offset);
+ }
+ }
bool SrcIsKill = MI.getOperand(OpIdx).isKill();
bool SrcIsUndef = MI.getOperand(OpIdx).isUndef();
@@ -608,7 +753,6 @@ void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI,
MIB.add(MI.getOperand(OpIdx++));
if (IsExt) {
MachineOperand VdSrc(MI.getOperand(OpIdx++));
- VdSrc.setIsRenamable(false);
MIB.add(VdSrc);
}
@@ -620,7 +764,6 @@ void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI,
// Copy the other source register operand.
MachineOperand VmSrc(MI.getOperand(OpIdx++));
- VmSrc.setIsRenamable(false);
MIB.add(VmSrc);
// Copy the predicate operands.
@@ -1470,7 +1613,6 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
// Copy the destination register.
MachineOperand Dst(MI.getOperand(OpIdx++));
- Dst.setIsRenamable(false);
MIB.add(Dst);
// Copy the predicate operands.
@@ -1504,8 +1646,12 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::VLD3d8Pseudo:
case ARM::VLD3d16Pseudo:
case ARM::VLD3d32Pseudo:
+ case ARM::VLD1d8TPseudo:
+ case ARM::VLD1d16TPseudo:
+ case ARM::VLD1d32TPseudo:
case ARM::VLD1d64TPseudo:
case ARM::VLD1d64TPseudoWB_fixed:
+ case ARM::VLD1d64TPseudoWB_register:
case ARM::VLD3d8Pseudo_UPD:
case ARM::VLD3d16Pseudo_UPD:
case ARM::VLD3d32Pseudo_UPD:
@@ -1521,8 +1667,28 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::VLD4d8Pseudo:
case ARM::VLD4d16Pseudo:
case ARM::VLD4d32Pseudo:
+ case ARM::VLD1d8QPseudo:
+ case ARM::VLD1d16QPseudo:
+ case ARM::VLD1d32QPseudo:
case ARM::VLD1d64QPseudo:
case ARM::VLD1d64QPseudoWB_fixed:
+ case ARM::VLD1d64QPseudoWB_register:
+ case ARM::VLD1q8HighQPseudo:
+ case ARM::VLD1q8LowQPseudo_UPD:
+ case ARM::VLD1q8HighTPseudo:
+ case ARM::VLD1q8LowTPseudo_UPD:
+ case ARM::VLD1q16HighQPseudo:
+ case ARM::VLD1q16LowQPseudo_UPD:
+ case ARM::VLD1q16HighTPseudo:
+ case ARM::VLD1q16LowTPseudo_UPD:
+ case ARM::VLD1q32HighQPseudo:
+ case ARM::VLD1q32LowQPseudo_UPD:
+ case ARM::VLD1q32HighTPseudo:
+ case ARM::VLD1q32LowTPseudo_UPD:
+ case ARM::VLD1q64HighQPseudo:
+ case ARM::VLD1q64LowQPseudo_UPD:
+ case ARM::VLD1q64HighTPseudo:
+ case ARM::VLD1q64LowTPseudo_UPD:
case ARM::VLD4d8Pseudo_UPD:
case ARM::VLD4d16Pseudo_UPD:
case ARM::VLD4d32Pseudo_UPD:
@@ -1547,6 +1713,24 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::VLD4DUPd8Pseudo_UPD:
case ARM::VLD4DUPd16Pseudo_UPD:
case ARM::VLD4DUPd32Pseudo_UPD:
+ case ARM::VLD2DUPq8EvenPseudo:
+ case ARM::VLD2DUPq8OddPseudo:
+ case ARM::VLD2DUPq16EvenPseudo:
+ case ARM::VLD2DUPq16OddPseudo:
+ case ARM::VLD2DUPq32EvenPseudo:
+ case ARM::VLD2DUPq32OddPseudo:
+ case ARM::VLD3DUPq8EvenPseudo:
+ case ARM::VLD3DUPq8OddPseudo:
+ case ARM::VLD3DUPq16EvenPseudo:
+ case ARM::VLD3DUPq16OddPseudo:
+ case ARM::VLD3DUPq32EvenPseudo:
+ case ARM::VLD3DUPq32OddPseudo:
+ case ARM::VLD4DUPq8EvenPseudo:
+ case ARM::VLD4DUPq8OddPseudo:
+ case ARM::VLD4DUPq16EvenPseudo:
+ case ARM::VLD4DUPq16OddPseudo:
+ case ARM::VLD4DUPq32EvenPseudo:
+ case ARM::VLD4DUPq32OddPseudo:
ExpandVLD(MBBI);
return true;
@@ -1562,6 +1746,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::VST3d8Pseudo:
case ARM::VST3d16Pseudo:
case ARM::VST3d32Pseudo:
+ case ARM::VST1d8TPseudo:
+ case ARM::VST1d16TPseudo:
+ case ARM::VST1d32TPseudo:
case ARM::VST1d64TPseudo:
case ARM::VST3d8Pseudo_UPD:
case ARM::VST3d16Pseudo_UPD:
@@ -1580,12 +1767,31 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::VST4d8Pseudo:
case ARM::VST4d16Pseudo:
case ARM::VST4d32Pseudo:
+ case ARM::VST1d8QPseudo:
+ case ARM::VST1d16QPseudo:
+ case ARM::VST1d32QPseudo:
case ARM::VST1d64QPseudo:
case ARM::VST4d8Pseudo_UPD:
case ARM::VST4d16Pseudo_UPD:
case ARM::VST4d32Pseudo_UPD:
case ARM::VST1d64QPseudoWB_fixed:
case ARM::VST1d64QPseudoWB_register:
+ case ARM::VST1q8HighQPseudo:
+ case ARM::VST1q8LowQPseudo_UPD:
+ case ARM::VST1q8HighTPseudo:
+ case ARM::VST1q8LowTPseudo_UPD:
+ case ARM::VST1q16HighQPseudo:
+ case ARM::VST1q16LowQPseudo_UPD:
+ case ARM::VST1q16HighTPseudo:
+ case ARM::VST1q16LowTPseudo_UPD:
+ case ARM::VST1q32HighQPseudo:
+ case ARM::VST1q32LowQPseudo_UPD:
+ case ARM::VST1q32HighTPseudo:
+ case ARM::VST1q32LowTPseudo_UPD:
+ case ARM::VST1q64HighQPseudo:
+ case ARM::VST1q64LowQPseudo_UPD:
+ case ARM::VST1q64HighTPseudo:
+ case ARM::VST1q64LowTPseudo_UPD:
case ARM::VST4q8Pseudo_UPD:
case ARM::VST4q16Pseudo_UPD:
case ARM::VST4q32Pseudo_UPD:
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index 60048d4453d8..26d4aaa12acf 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -41,7 +41,6 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
@@ -75,6 +74,7 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
@@ -2352,8 +2352,8 @@ bool ARMFastISel::SelectCall(const Instruction *I,
for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
i != e; ++i) {
// If we're lowering a memory intrinsic instead of a regular call, skip the
- // last two arguments, which shouldn't be passed to the underlying function.
- if (IntrMemName && e-i <= 2)
+ // last argument, which shouldn't be passed to the underlying function.
+ if (IntrMemName && e - i <= 1)
break;
ISD::ArgFlagsTy Flags;
@@ -2546,7 +2546,8 @@ bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) {
if (!ARMComputeAddress(MTI.getRawDest(), Dest) ||
!ARMComputeAddress(MTI.getRawSource(), Src))
return false;
- unsigned Alignment = MTI.getAlignment();
+ unsigned Alignment = MinAlign(MTI.getDestAlignment(),
+ MTI.getSourceAlignment());
if (ARMTryEmitSmallMemCpy(Dest, Src, Len, Alignment))
return true;
}
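
This follows the IR-level change that dropped the alignment argument from the memcpy/memmove intrinsics (which is also why the earlier hunk now skips only the last call argument) and attached alignment to the pointer operands instead, so destination and source alignments can differ. MinAlign picks the conservative common value; a sketch of its behaviour, assuming power-of-two alignments as LLVM uses:

    #include <cstdint>

    // Conservative common alignment: the lowest set bit of (A | B), which is
    // simply min(A, B) when both inputs are powers of two.
    constexpr uint64_t minAlignSketch(uint64_t A, uint64_t B) {
      return (A | B) & (1 + ~(A | B));
    }

    static_assert(minAlignSketch(16, 4) == 4, "destination 16, source 4 -> use 4");
    static_assert(minAlignSketch(8, 8) == 8, "equal alignments are preserved");
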
@@ -2912,7 +2913,7 @@ static const struct FoldableLoadExtendsStruct {
{ { ARM::UXTB, ARM::t2UXTB }, 0, 1, MVT::i8 }
};
-/// \brief The specified machine instr operand is a vreg, and that
+/// The specified machine instr operand is a vreg, and that
/// vreg is being provided by the specified load instruction. If possible,
/// try to fold the load as an operand to the instruction, returning true if
/// successful.
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index 4ff864ac6ccd..af983ce2606a 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -87,6 +87,18 @@ bool ARMFrameLowering::noFramePointerElim(const MachineFunction &MF) const {
MF.getSubtarget<ARMSubtarget>().useFastISel();
}
+/// Returns true if the target can safely skip saving callee-saved registers
+/// for noreturn nounwind functions.
+bool ARMFrameLowering::enableCalleeSaveSkip(const MachineFunction &MF) const {
+ assert(MF.getFunction().hasFnAttribute(Attribute::NoReturn) &&
+ MF.getFunction().hasFnAttribute(Attribute::NoUnwind) &&
+ !MF.getFunction().hasFnAttribute(Attribute::UWTable));
+
+ // Frame pointer and link register are not treated as normal CSRs, so we
+ // can always skip CSR saves for non-returning functions.
+ return true;
+}
+
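
A hedged illustration of the functions this hook targets: a noreturn, nounwind function never hands control (or register contents) back to its caller, so pushing callee-saved registers in its prologue is wasted work. The log_fatal helper below is hypothetical, only there to give the function a body:

    void log_fatal(const char *Msg); // hypothetical logging helper

    // Built with -fno-exceptions (nounwind); nothing after the call can ever
    // observe r4-r11/lr, so their saves can be skipped in the prologue.
    [[noreturn]] void die(const char *Msg) {
      log_fatal(Msg);
      __builtin_trap();
    }
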
/// hasFP - Return true if the specified function should have a dedicated frame
/// pointer register. This is true if the function has variable sized allocas
/// or if frame pointer elimination is disabled.
@@ -209,7 +221,8 @@ static bool WindowsRequiresStackProbe(const MachineFunction &MF,
F.getFnAttribute("stack-probe-size")
.getValueAsString()
.getAsInteger(0, StackProbeSize);
- return StackSizeInBytes >= StackProbeSize;
+ return (StackSizeInBytes >= StackProbeSize) &&
+ !F.hasFnAttribute("no-stack-arg-probe");
}
namespace {
@@ -918,15 +931,17 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF,
return FPOffset;
}
}
- } else if (AFI->isThumb2Function()) {
+ } else if (AFI->isThumbFunction()) {
+ // Prefer SP to base pointer, if the offset is suitably aligned and in
+ // range, as the effective range of the immediate offset is bigger when
+ // basing off SP.
// Use add <rd>, sp, #<imm8>
// ldr <rd>, [sp, #<imm8>]
- // if at all possible to save space.
if (Offset >= 0 && (Offset & 3) == 0 && Offset <= 1020)
return Offset;
// In Thumb2 mode, the negative offset is very limited. Try to avoid
// out of range references. ldr <rt>,[<rn>, #-<imm8>]
- if (FPOffset >= -255 && FPOffset < 0) {
+ if (AFI->isThumb2Function() && FPOffset >= -255 && FPOffset < 0) {
FrameReg = RegInfo->getFrameRegister(MF);
return FPOffset;
}
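
The numbers behind the preference: Thumb SP-relative adds and loads take an 8-bit immediate scaled by 4, giving a positive, word-aligned range up to 255 * 4 = 1020 bytes, whereas the Thumb2 negative FP-relative form only reaches down to -255 and Thumb1 has no negative immediate form at all. The same bounds written out as checks (illustrative only):

    constexpr bool fitsThumbSPOffset(int Offset) {
      return Offset >= 0 && (Offset & 3) == 0 && Offset <= 255 * 4;
    }
    constexpr bool fitsThumb2NegFPOffset(int FPOffset) {
      return FPOffset >= -255 && FPOffset < 0;
    }
    static_assert(fitsThumbSPOffset(1020) && !fitsThumbSPOffset(1024), "");
    static_assert(fitsThumb2NegFPOffset(-255) && !fitsThumb2NegFPOffset(-256), "");
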
@@ -991,8 +1006,8 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB,
if (Regs.empty())
continue;
- std::sort(Regs.begin(), Regs.end(), [&](const RegAndKill &LHS,
- const RegAndKill &RHS) {
+ llvm::sort(Regs.begin(), Regs.end(), [&](const RegAndKill &LHS,
+ const RegAndKill &RHS) {
return TRI.getEncodingValue(LHS.first) < TRI.getEncodingValue(RHS.first);
});
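
The std::sort -> llvm::sort switch here (and in the pop path below) is mechanical: llvm::sort is the project's wrapper which, with expensive checks enabled, shuffles the range before sorting so comparators that are not strict weak orderings show up as nondeterministic failures. A sketch of that idea, not the actual wrapper:

    #include <algorithm>
    #include <random>

    // Shuffle first, then sort: the final order now depends only on the
    // comparator being a genuine strict weak ordering, so a buggy comparator
    // produces visibly unstable results in tests.
    template <typename Iter, typename Compare>
    void checkedSort(Iter First, Iter Last, Compare Cmp) {
      std::mt19937 Gen(std::random_device{}());
      std::shuffle(First, Last, Gen);
      std::sort(First, Last, Cmp);
    }
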
@@ -1065,6 +1080,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
!isTrap && STI.hasV5TOps()) {
if (MBB.succ_empty()) {
Reg = ARM::PC;
+ // Fold the return instruction into the LDM.
DeleteRet = true;
LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_RET : ARM::LDMIA_RET;
// We 'restore' LR into PC so it is not live out of the return block:
@@ -1072,7 +1088,6 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
Info.setRestored(false);
} else
LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_UPD : ARM::LDMIA_UPD;
- // Fold the return instruction into the LDM.
}
// If NoGap is true, pop consecutive registers and then leave the rest
@@ -1088,7 +1103,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
if (Regs.empty())
continue;
- std::sort(Regs.begin(), Regs.end(), [&](unsigned LHS, unsigned RHS) {
+ llvm::sort(Regs.begin(), Regs.end(), [&](unsigned LHS, unsigned RHS) {
return TRI.getEncodingValue(LHS) < TRI.getEncodingValue(RHS);
});
@@ -1605,6 +1620,17 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
(MFI.hasVarSizedObjects() || RegInfo->needsStackRealignment(MF)))
SavedRegs.set(ARM::R4);
+ // If a stack probe will be emitted, spill R4 and LR, since they are
+ // clobbered by the stack probe call.
+ // This is a safe, conservative estimate: the actual probe is emitted
+ // based on the size of the local objects alone, whereas this estimate
+ // also includes the varargs store size.
+ if (STI.isTargetWindows() &&
+ WindowsRequiresStackProbe(MF, MFI.estimateStackSize(MF))) {
+ SavedRegs.set(ARM::R4);
+ SavedRegs.set(ARM::LR);
+ }
+
if (AFI->isThumb1OnlyFunction()) {
// Spill LR if Thumb1 function uses variable length argument lists.
if (AFI->getArgRegsSaveSize() > 0)
@@ -1797,34 +1823,36 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
for (unsigned Reg : {ARM::R0, ARM::R1, ARM::R2, ARM::R3}) {
if (!MF.getRegInfo().isLiveIn(Reg)) {
--EntryRegDeficit;
- DEBUG(dbgs() << printReg(Reg, TRI)
- << " is unused argument register, EntryRegDeficit = "
- << EntryRegDeficit << "\n");
+ LLVM_DEBUG(dbgs()
+ << printReg(Reg, TRI)
+ << " is unused argument register, EntryRegDeficit = "
+ << EntryRegDeficit << "\n");
}
}
// Unused return registers can be clobbered in the epilogue for free.
int ExitRegDeficit = AFI->getReturnRegsCount() - 4;
- DEBUG(dbgs() << AFI->getReturnRegsCount()
- << " return regs used, ExitRegDeficit = " << ExitRegDeficit
- << "\n");
+ LLVM_DEBUG(dbgs() << AFI->getReturnRegsCount()
+ << " return regs used, ExitRegDeficit = "
+ << ExitRegDeficit << "\n");
int RegDeficit = std::max(EntryRegDeficit, ExitRegDeficit);
- DEBUG(dbgs() << "RegDeficit = " << RegDeficit << "\n");
+ LLVM_DEBUG(dbgs() << "RegDeficit = " << RegDeficit << "\n");
// r4-r6 can be used in the prologue if they are pushed by the first push
// instruction.
for (unsigned Reg : {ARM::R4, ARM::R5, ARM::R6}) {
if (SavedRegs.test(Reg)) {
--RegDeficit;
- DEBUG(dbgs() << printReg(Reg, TRI)
- << " is saved low register, RegDeficit = " << RegDeficit
- << "\n");
+ LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
+ << " is saved low register, RegDeficit = "
+ << RegDeficit << "\n");
} else {
AvailableRegs.push_back(Reg);
- DEBUG(dbgs()
- << printReg(Reg, TRI)
- << " is non-saved low register, adding to AvailableRegs\n");
+ LLVM_DEBUG(
+ dbgs()
+ << printReg(Reg, TRI)
+ << " is non-saved low register, adding to AvailableRegs\n");
}
}
@@ -1832,12 +1860,13 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (!HasFP) {
if (SavedRegs.test(ARM::R7)) {
--RegDeficit;
- DEBUG(dbgs() << "%r7 is saved low register, RegDeficit = "
- << RegDeficit << "\n");
+ LLVM_DEBUG(dbgs() << "%r7 is saved low register, RegDeficit = "
+ << RegDeficit << "\n");
} else {
AvailableRegs.push_back(ARM::R7);
- DEBUG(dbgs()
- << "%r7 is non-saved low register, adding to AvailableRegs\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "%r7 is non-saved low register, adding to AvailableRegs\n");
}
}
@@ -1845,9 +1874,9 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
for (unsigned Reg : {ARM::R8, ARM::R9, ARM::R10, ARM::R11}) {
if (SavedRegs.test(Reg)) {
++RegDeficit;
- DEBUG(dbgs() << printReg(Reg, TRI)
- << " is saved high register, RegDeficit = " << RegDeficit
- << "\n");
+ LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
+ << " is saved high register, RegDeficit = "
+ << RegDeficit << "\n");
}
}
@@ -1859,11 +1888,11 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
MF.getFrameInfo().isReturnAddressTaken())) {
if (SavedRegs.test(ARM::LR)) {
--RegDeficit;
- DEBUG(dbgs() << "%lr is saved register, RegDeficit = " << RegDeficit
- << "\n");
+ LLVM_DEBUG(dbgs() << "%lr is saved register, RegDeficit = "
+ << RegDeficit << "\n");
} else {
AvailableRegs.push_back(ARM::LR);
- DEBUG(dbgs() << "%lr is not saved, adding to AvailableRegs\n");
+ LLVM_DEBUG(dbgs() << "%lr is not saved, adding to AvailableRegs\n");
}
}
@@ -1872,11 +1901,11 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
// instructions. This might not reduce RegDeficit all the way to zero,
// because we can only guarantee that r4-r6 are available, but r8-r11 may
// need saving.
- DEBUG(dbgs() << "Final RegDeficit = " << RegDeficit << "\n");
+ LLVM_DEBUG(dbgs() << "Final RegDeficit = " << RegDeficit << "\n");
for (; RegDeficit > 0 && !AvailableRegs.empty(); --RegDeficit) {
unsigned Reg = AvailableRegs.pop_back_val();
- DEBUG(dbgs() << "Spilling " << printReg(Reg, TRI)
- << " to make up reg deficit\n");
+ LLVM_DEBUG(dbgs() << "Spilling " << printReg(Reg, TRI)
+ << " to make up reg deficit\n");
SavedRegs.set(Reg);
NumGPRSpills++;
CS1Spilled = true;
@@ -1887,7 +1916,8 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (Reg == ARM::LR)
LRSpilled = true;
}
- DEBUG(dbgs() << "After adding spills, RegDeficit = " << RegDeficit << "\n");
+ LLVM_DEBUG(dbgs() << "After adding spills, RegDeficit = " << RegDeficit
+ << "\n");
}
// If LR is not spilled, but at least one of R4, R5, R6, and R7 is spilled.
@@ -1908,7 +1938,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
// If stack and double are 8-byte aligned and we are spilling an odd number
// of GPRs, spill one extra callee save GPR so we won't have to pad between
// the integer and double callee save areas.
- DEBUG(dbgs() << "NumGPRSpills = " << NumGPRSpills << "\n");
+ LLVM_DEBUG(dbgs() << "NumGPRSpills = " << NumGPRSpills << "\n");
unsigned TargetAlign = getStackAlignment();
if (TargetAlign >= 8 && (NumGPRSpills & 1)) {
if (CS1Spilled && !UnspilledCS1GPRs.empty()) {
@@ -1920,8 +1950,8 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
(STI.isTargetWindows() && Reg == ARM::R11) ||
isARMLowRegister(Reg) || Reg == ARM::LR) {
SavedRegs.set(Reg);
- DEBUG(dbgs() << "Spilling " << printReg(Reg, TRI)
- << " to make up alignment\n");
+ LLVM_DEBUG(dbgs() << "Spilling " << printReg(Reg, TRI)
+ << " to make up alignment\n");
if (!MRI.isReserved(Reg) && !MRI.isPhysRegUsed(Reg))
ExtraCSSpill = true;
break;
@@ -1930,8 +1960,8 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
} else if (!UnspilledCS2GPRs.empty() && !AFI->isThumb1OnlyFunction()) {
unsigned Reg = UnspilledCS2GPRs.front();
SavedRegs.set(Reg);
- DEBUG(dbgs() << "Spilling " << printReg(Reg, TRI)
- << " to make up alignment\n");
+ LLVM_DEBUG(dbgs() << "Spilling " << printReg(Reg, TRI)
+ << " to make up alignment\n");
if (!MRI.isReserved(Reg) && !MRI.isPhysRegUsed(Reg))
ExtraCSSpill = true;
}
@@ -2118,8 +2148,10 @@ void ARMFrameLowering::adjustForSegmentedStacks(
uint64_t StackSize = MFI.getStackSize();
- // Do not generate a prologue for functions with a stack of size zero
- if (StackSize == 0)
+ // Do not generate a prologue for leaf functions with a stack of size zero.
+ // For non-leaf functions we have to allow for the possibility that the
+ // call is to a non-split function, as in PR37807.
+ if (StackSize == 0 && !MFI.hasTailCall())
return;
// Use R4 and R5 as scratch registers.
diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h
index 1f18e2bf80c4..e994cab28fe7 100644
--- a/lib/Target/ARM/ARMFrameLowering.h
+++ b/lib/Target/ARM/ARMFrameLowering.h
@@ -44,6 +44,8 @@ public:
bool noFramePointerElim(const MachineFunction &MF) const override;
+ bool enableCalleeSaveSkip(const MachineFunction &MF) const override;
+
bool hasFP(const MachineFunction &MF) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;
bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override;
diff --git a/lib/Target/ARM/ARMHazardRecognizer.cpp b/lib/Target/ARM/ARMHazardRecognizer.cpp
index f878bf9937a4..d5dacbe08770 100644
--- a/lib/Target/ARM/ARMHazardRecognizer.cpp
+++ b/lib/Target/ARM/ARMHazardRecognizer.cpp
@@ -37,7 +37,7 @@ ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
MachineInstr *MI = SU->getInstr();
- if (!MI->isDebugValue()) {
+ if (!MI->isDebugInstr()) {
// Look for special VMLA / VMLS hazards. A VMUL / VADD / VSUB following
// a VMLA / VMLS will cause 4 cycle stall.
const MCInstrDesc &MCID = MI->getDesc();
@@ -81,7 +81,7 @@ void ARMHazardRecognizer::Reset() {
void ARMHazardRecognizer::EmitInstruction(SUnit *SU) {
MachineInstr *MI = SU->getInstr();
- if (!MI->isDebugValue()) {
+ if (!MI->isDebugInstr()) {
LastMI = MI;
FpMLxStalls = 0;
}
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 8d32510e2004..081d4ff033bd 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -97,6 +97,8 @@ public:
return SelectImmShifterOperand(N, A, B, false);
}
+ bool SelectAddLikeOr(SDNode *Parent, SDValue N, SDValue &Out);
+
bool SelectAddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm);
bool SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset, SDValue &Opc);
@@ -118,8 +120,10 @@ public:
SDValue &Offset, SDValue &Opc);
bool SelectAddrMode3Offset(SDNode *Op, SDValue N,
SDValue &Offset, SDValue &Opc);
- bool SelectAddrMode5(SDValue N, SDValue &Base,
- SDValue &Offset);
+ bool IsAddressingMode5(SDValue N, SDValue &Base, SDValue &Offset,
+ int Lwb, int Upb, bool FP16);
+ bool SelectAddrMode5(SDValue N, SDValue &Base, SDValue &Offset);
+ bool SelectAddrMode5FP16(SDValue N, SDValue &Base, SDValue &Offset);
bool SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,SDValue &Align);
bool SelectAddrMode6Offset(SDNode *Op, SDValue N, SDValue &Offset);
@@ -199,10 +203,11 @@ private:
/// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs
/// should be 1, 2, 3 or 4. The opcode array specifies the instructions used
- /// for loading D registers. (Q registers are not supported.)
- void SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
- const uint16_t *DOpcodes,
- const uint16_t *QOpcodes = nullptr);
+ /// for loading D registers.
+ void SelectVLDDup(SDNode *N, bool IsIntrinsic, bool isUpdating,
+ unsigned NumVecs, const uint16_t *DOpcodes,
+ const uint16_t *QOpcodes0 = nullptr,
+ const uint16_t *QOpcodes1 = nullptr);
/// Try to select SBFX/UBFX instructions for ARM.
bool tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned);
@@ -281,7 +286,7 @@ static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
isInt32Immediate(N->getOperand(1).getNode(), Imm);
}
-/// \brief Check whether a particular node is a constant value representable as
+/// Check whether a particular node is a constant value representable as
/// (N * Scale) where (N in [\p RangeMin, \p RangeMax).
///
/// \param ScaledConstant [out] - On success, the pre-scaled constant value.
@@ -498,7 +503,7 @@ bool ARMDAGToDAGISel::canExtractShiftFromMul(const SDValue &N,
void ARMDAGToDAGISel::replaceDAGValue(const SDValue &N, SDValue M) {
CurDAG->RepositionNode(N.getNode()->getIterator(), M.getNode());
- CurDAG->ReplaceAllUsesWith(N, M);
+ ReplaceUses(N, M);
}
bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N,
@@ -567,6 +572,14 @@ bool ARMDAGToDAGISel::SelectRegShifterOperand(SDValue N,
return true;
}
+// Determine whether an ISD::OR's operands are suitable to turn the operation
+// into an addition, which often has more compact encodings.
+bool ARMDAGToDAGISel::SelectAddLikeOr(SDNode *Parent, SDValue N, SDValue &Out) {
+ assert(Parent->getOpcode() == ISD::OR && "unexpected parent");
+ Out = N;
+ return CurDAG->haveNoCommonBitsSet(N, Parent->getOperand(1));
+}
+
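
The identity behind SelectAddLikeOr: if two operands share no set bits there are no carries, so OR and ADD produce the same value, and the ADD form tends to have more compact encodings and folds into addressing modes. A self-contained check:

    #include <cassert>
    #include <cstdint>

    // When (A & B) == 0, A + B == (A | B): the addition cannot carry.
    bool addLikeOr(uint32_t A, uint32_t B) { return (A & B) == 0; }

    int main() {
      uint32_t A = 0xFF00, B = 0x00F0;
      assert(addLikeOr(A, B) && (A | B) == A + B);
      return 0;
    }
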
bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N,
SDValue &Base,
@@ -886,8 +899,8 @@ bool ARMDAGToDAGISel::SelectAddrMode3Offset(SDNode *Op, SDValue N,
return true;
}
-bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
- SDValue &Base, SDValue &Offset) {
+bool ARMDAGToDAGISel::IsAddressingMode5(SDValue N, SDValue &Base, SDValue &Offset,
+ int Lwb, int Upb, bool FP16) {
if (!CurDAG->isBaseWithConstantOffset(N)) {
Base = N;
if (N.getOpcode() == ISD::FrameIndex) {
@@ -907,8 +920,9 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
// If the RHS is +/- imm8, fold into addr mode.
int RHSC;
- if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/4,
- -256 + 1, 256, RHSC)) {
+ const int Scale = FP16 ? 2 : 4;
+
+ if (isScaledConstantInRange(N.getOperand(1), Scale, Lwb, Upb, RHSC)) {
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(Base)->getIndex();
@@ -921,17 +935,43 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
AddSub = ARM_AM::sub;
RHSC = -RHSC;
}
- Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(AddSub, RHSC),
- SDLoc(N), MVT::i32);
+
+ if (FP16)
+ Offset = CurDAG->getTargetConstant(ARM_AM::getAM5FP16Opc(AddSub, RHSC),
+ SDLoc(N), MVT::i32);
+ else
+ Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(AddSub, RHSC),
+ SDLoc(N), MVT::i32);
+
return true;
}
Base = N;
- Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0),
- SDLoc(N), MVT::i32);
+
+ if (FP16)
+ Offset = CurDAG->getTargetConstant(ARM_AM::getAM5FP16Opc(ARM_AM::add, 0),
+ SDLoc(N), MVT::i32);
+ else
+ Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0),
+ SDLoc(N), MVT::i32);
+
return true;
}
+bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
+ SDValue &Base, SDValue &Offset) {
+ int Lwb = -256 + 1;
+ int Upb = 256;
+ return IsAddressingMode5(N, Base, Offset, Lwb, Upb, /*FP16=*/ false);
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode5FP16(SDValue N,
+ SDValue &Base, SDValue &Offset) {
+ int Lwb = -512 + 1;
+ int Upb = 512;
+ return IsAddressingMode5(N, Base, Offset, Lwb, Upb, /*FP16=*/ true);
+}
+
bool ARMDAGToDAGISel::SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,
SDValue &Align) {
Addr = N;
@@ -1467,7 +1507,7 @@ bool ARMDAGToDAGISel::tryT2IndexedLoad(SDNode *N) {
return false;
}
-/// \brief Form a GPRPair pseudo register from a pair of GPR regs.
+/// Form a GPRPair pseudo register from a pair of GPR regs.
SDNode *ARMDAGToDAGISel::createGPRPairNode(EVT VT, SDValue V0, SDValue V1) {
SDLoc dl(V0.getNode());
SDValue RegClass =
@@ -1478,7 +1518,7 @@ SDNode *ARMDAGToDAGISel::createGPRPairNode(EVT VT, SDValue V0, SDValue V1) {
return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
}
-/// \brief Form a D register from a pair of S registers.
+/// Form a D register from a pair of S registers.
SDNode *ARMDAGToDAGISel::createSRegPairNode(EVT VT, SDValue V0, SDValue V1) {
SDLoc dl(V0.getNode());
SDValue RegClass =
@@ -1489,7 +1529,7 @@ SDNode *ARMDAGToDAGISel::createSRegPairNode(EVT VT, SDValue V0, SDValue V1) {
return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
}
-/// \brief Form a quad register from a pair of D registers.
+/// Form a quad register from a pair of D registers.
SDNode *ARMDAGToDAGISel::createDRegPairNode(EVT VT, SDValue V0, SDValue V1) {
SDLoc dl(V0.getNode());
SDValue RegClass = CurDAG->getTargetConstant(ARM::QPRRegClassID, dl,
@@ -1500,7 +1540,7 @@ SDNode *ARMDAGToDAGISel::createDRegPairNode(EVT VT, SDValue V0, SDValue V1) {
return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
}
-/// \brief Form 4 consecutive D registers from a pair of Q registers.
+/// Form 4 consecutive D registers from a pair of Q registers.
SDNode *ARMDAGToDAGISel::createQRegPairNode(EVT VT, SDValue V0, SDValue V1) {
SDLoc dl(V0.getNode());
SDValue RegClass = CurDAG->getTargetConstant(ARM::QQPRRegClassID, dl,
@@ -1511,7 +1551,7 @@ SDNode *ARMDAGToDAGISel::createQRegPairNode(EVT VT, SDValue V0, SDValue V1) {
return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
}
-/// \brief Form 4 consecutive S registers.
+/// Form 4 consecutive S registers.
SDNode *ARMDAGToDAGISel::createQuadSRegsNode(EVT VT, SDValue V0, SDValue V1,
SDValue V2, SDValue V3) {
SDLoc dl(V0.getNode());
@@ -1526,7 +1566,7 @@ SDNode *ARMDAGToDAGISel::createQuadSRegsNode(EVT VT, SDValue V0, SDValue V1,
return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
}
-/// \brief Form 4 consecutive D registers.
+/// Form 4 consecutive D registers.
SDNode *ARMDAGToDAGISel::createQuadDRegsNode(EVT VT, SDValue V0, SDValue V1,
SDValue V2, SDValue V3) {
SDLoc dl(V0.getNode());
@@ -1541,7 +1581,7 @@ SDNode *ARMDAGToDAGISel::createQuadDRegsNode(EVT VT, SDValue V0, SDValue V1,
return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
}
-/// \brief Form 4 consecutive Q registers.
+/// Form 4 consecutive Q registers.
SDNode *ARMDAGToDAGISel::createQuadQRegsNode(EVT VT, SDValue V0, SDValue V1,
SDValue V2, SDValue V3) {
SDLoc dl(V0.getNode());
@@ -1708,7 +1748,9 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
SDLoc dl(N);
SDValue MemAddr, Align;
- unsigned AddrOpIdx = isUpdating ? 1 : 2;
+ bool IsIntrinsic = !isUpdating; // By coincidence, all supported updating
+ // nodes are not intrinsics.
+ unsigned AddrOpIdx = IsIntrinsic ? 2 : 1;
if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
return;
@@ -1732,9 +1774,7 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
case MVT::v4f32:
case MVT::v4i32: OpcodeIndex = 2; break;
case MVT::v2f64:
- case MVT::v2i64: OpcodeIndex = 3;
- assert(NumVecs == 1 && "v2i64 type only supported for VLD1");
- break;
+ case MVT::v2i64: OpcodeIndex = 3; break;
}
EVT ResTy;
@@ -1765,15 +1805,17 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
Ops.push_back(Align);
if (isUpdating) {
SDValue Inc = N->getOperand(AddrOpIdx + 1);
- // FIXME: VLD1/VLD2 fixed increment doesn't need Reg0. Remove the reg0
- // case entirely when the rest are updated to that form, too.
bool IsImmUpdate = isPerfectIncrement(Inc, VT, NumVecs);
- if ((NumVecs <= 2) && !IsImmUpdate)
- Opc = getVLDSTRegisterUpdateOpcode(Opc);
- // FIXME: We use a VLD1 for v1i64 even if the pseudo says vld2/3/4, so
- // check for that explicitly too. Horribly hacky, but temporary.
- if ((NumVecs > 2 && !isVLDfixed(Opc)) || !IsImmUpdate)
- Ops.push_back(IsImmUpdate ? Reg0 : Inc);
+ if (!IsImmUpdate) {
+ // We use a VLD1 for v1i64 even if the pseudo says vld2/3/4, so
+ // check for the opcode rather than the number of vector elements.
+ if (isVLDfixed(Opc))
+ Opc = getVLDSTRegisterUpdateOpcode(Opc);
+ Ops.push_back(Inc);
+ // VLD1/VLD2 fixed increment does not need Reg0 so only include it in
+ // the operands if not such an opcode.
+ } else if (!isVLDfixed(Opc))
+ Ops.push_back(Reg0);
}
Ops.push_back(Pred);
Ops.push_back(Reg0);
@@ -1844,7 +1886,9 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
SDLoc dl(N);
SDValue MemAddr, Align;
- unsigned AddrOpIdx = isUpdating ? 1 : 2;
+ bool IsIntrinsic = !isUpdating; // By coincidence, all supported updating
+ // nodes are not intrinsics.
+ unsigned AddrOpIdx = IsIntrinsic ? 2 : 1;
unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
return;
@@ -1862,19 +1906,19 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
default: llvm_unreachable("unhandled vst type");
// Double-register operations:
case MVT::v8i8: OpcodeIndex = 0; break;
+ case MVT::v4f16:
case MVT::v4i16: OpcodeIndex = 1; break;
case MVT::v2f32:
case MVT::v2i32: OpcodeIndex = 2; break;
case MVT::v1i64: OpcodeIndex = 3; break;
// Quad-register operations:
case MVT::v16i8: OpcodeIndex = 0; break;
+ case MVT::v8f16:
case MVT::v8i16: OpcodeIndex = 1; break;
case MVT::v4f32:
case MVT::v4i32: OpcodeIndex = 2; break;
case MVT::v2f64:
- case MVT::v2i64: OpcodeIndex = 3;
- assert(NumVecs == 1 && "v2i64 type only supported for VST1");
- break;
+ case MVT::v2i64: OpcodeIndex = 3; break;
}
std::vector<EVT> ResTys;
@@ -1919,16 +1963,17 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
Ops.push_back(Align);
if (isUpdating) {
SDValue Inc = N->getOperand(AddrOpIdx + 1);
- // FIXME: VST1/VST2 fixed increment doesn't need Reg0. Remove the reg0
- // case entirely when the rest are updated to that form, too.
bool IsImmUpdate = isPerfectIncrement(Inc, VT, NumVecs);
- if (NumVecs <= 2 && !IsImmUpdate)
- Opc = getVLDSTRegisterUpdateOpcode(Opc);
- // FIXME: We use a VST1 for v1i64 even if the pseudo says vld2/3/4, so
- // check for that explicitly too. Horribly hacky, but temporary.
- if (!IsImmUpdate)
+ if (!IsImmUpdate) {
+ // We use a VST1 for v1i64 even if the pseudo says VST2/3/4, so
+ // check for the opcode rather than the number of vector elements.
+ if (isVSTfixed(Opc))
+ Opc = getVLDSTRegisterUpdateOpcode(Opc);
Ops.push_back(Inc);
- else if (NumVecs > 2 && !isVSTfixed(Opc))
+ }
+ // VST1/VST2 fixed increment does not need Reg0 so only include it in
+ // the operands if not such an opcode.
+ else if (!isVSTfixed(Opc))
Ops.push_back(Reg0);
}
Ops.push_back(SrcReg);
@@ -1993,7 +2038,9 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
SDLoc dl(N);
SDValue MemAddr, Align;
- unsigned AddrOpIdx = isUpdating ? 1 : 2;
+ bool IsIntrinsic = !isUpdating; // By coincidence, all supported updating
+ // nodes are not intrinsics.
+ unsigned AddrOpIdx = IsIntrinsic ? 2 : 1;
unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
return;
@@ -2109,21 +2156,22 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
CurDAG->RemoveDeadNode(N);
}
-void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
+void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic,
+ bool isUpdating, unsigned NumVecs,
const uint16_t *DOpcodes,
- const uint16_t *QOpcodes) {
+ const uint16_t *QOpcodes0,
+ const uint16_t *QOpcodes1) {
assert(NumVecs >= 1 && NumVecs <= 4 && "VLDDup NumVecs out-of-range");
SDLoc dl(N);
SDValue MemAddr, Align;
- if (!SelectAddrMode6(N, N->getOperand(1), MemAddr, Align))
+ unsigned AddrOpIdx = IsIntrinsic ? 2 : 1;
+ if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
return;
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
-
SDValue Chain = N->getOperand(0);
EVT VT = N->getValueType(0);
+ bool is64BitVector = VT.is64BitVector();
unsigned Alignment = 0;
if (NumVecs != 3) {
@@ -2140,49 +2188,84 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
}
Align = CurDAG->getTargetConstant(Alignment, dl, MVT::i32);
- unsigned Opc;
+ unsigned OpcodeIndex;
switch (VT.getSimpleVT().SimpleTy) {
default: llvm_unreachable("unhandled vld-dup type");
- case MVT::v8i8: Opc = DOpcodes[0]; break;
- case MVT::v16i8: Opc = QOpcodes[0]; break;
- case MVT::v4i16: Opc = DOpcodes[1]; break;
- case MVT::v8i16: Opc = QOpcodes[1]; break;
+ case MVT::v8i8:
+ case MVT::v16i8: OpcodeIndex = 0; break;
+ case MVT::v4i16:
+ case MVT::v8i16: OpcodeIndex = 1; break;
case MVT::v2f32:
- case MVT::v2i32: Opc = DOpcodes[2]; break;
+ case MVT::v2i32:
case MVT::v4f32:
- case MVT::v4i32: Opc = QOpcodes[2]; break;
- }
-
- SDValue Pred = getAL(CurDAG, dl);
- SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
- SmallVector<SDValue, 6> Ops;
- Ops.push_back(MemAddr);
- Ops.push_back(Align);
- if (isUpdating) {
- // fixed-stride update instructions don't have an explicit writeback
- // operand. It's implicit in the opcode itself.
- SDValue Inc = N->getOperand(2);
- bool IsImmUpdate =
- isPerfectIncrement(Inc, VT.getVectorElementType(), NumVecs);
- if (NumVecs <= 2 && !IsImmUpdate)
- Opc = getVLDSTRegisterUpdateOpcode(Opc);
- if (!IsImmUpdate)
- Ops.push_back(Inc);
- // FIXME: VLD3 and VLD4 haven't been updated to that form yet.
- else if (NumVecs > 2)
- Ops.push_back(Reg0);
+ case MVT::v4i32: OpcodeIndex = 2; break;
+ case MVT::v1f64:
+ case MVT::v1i64: OpcodeIndex = 3; break;
}
- Ops.push_back(Pred);
- Ops.push_back(Reg0);
- Ops.push_back(Chain);
unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs;
+ if (!is64BitVector)
+ ResTyElts *= 2;
+ EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts);
+
std::vector<EVT> ResTys;
- ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,ResTyElts));
+ ResTys.push_back(ResTy);
if (isUpdating)
ResTys.push_back(MVT::i32);
ResTys.push_back(MVT::Other);
- SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+ SDValue Pred = getAL(CurDAG, dl);
+ SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+
+ SDNode *VLdDup;
+ if (is64BitVector || NumVecs == 1) {
+ SmallVector<SDValue, 6> Ops;
+ Ops.push_back(MemAddr);
+ Ops.push_back(Align);
+ unsigned Opc = is64BitVector ? DOpcodes[OpcodeIndex] :
+ QOpcodes0[OpcodeIndex];
+ if (isUpdating) {
+ // fixed-stride update instructions don't have an explicit writeback
+ // operand. It's implicit in the opcode itself.
+ SDValue Inc = N->getOperand(2);
+ bool IsImmUpdate =
+ isPerfectIncrement(Inc, VT.getVectorElementType(), NumVecs);
+ if (NumVecs <= 2 && !IsImmUpdate)
+ Opc = getVLDSTRegisterUpdateOpcode(Opc);
+ if (!IsImmUpdate)
+ Ops.push_back(Inc);
+ // FIXME: VLD3 and VLD4 haven't been updated to that form yet.
+ else if (NumVecs > 2)
+ Ops.push_back(Reg0);
+ }
+ Ops.push_back(Pred);
+ Ops.push_back(Reg0);
+ Ops.push_back(Chain);
+ VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+ } else if (NumVecs == 2) {
+ const SDValue OpsA[] = { MemAddr, Align, Pred, Reg0, Chain };
+ SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex],
+ dl, ResTys, OpsA);
+
+ Chain = SDValue(VLdA, 1);
+ const SDValue OpsB[] = { MemAddr, Align, Pred, Reg0, Chain };
+ VLdDup = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, OpsB);
+ } else {
+ SDValue ImplDef =
+ SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, ResTy), 0);
+ const SDValue OpsA[] = { MemAddr, Align, ImplDef, Pred, Reg0, Chain };
+ SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex],
+ dl, ResTys, OpsA);
+
+ SDValue SuperReg = SDValue(VLdA, 0);
+ Chain = SDValue(VLdA, 1);
+ const SDValue OpsB[] = { MemAddr, Align, SuperReg, Pred, Reg0, Chain };
+ VLdDup = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, OpsB);
+ }
+
+ // Transfer memoperands.
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
cast<MachineSDNode>(VLdDup)->setMemRefs(MemOp, MemOp + 1);
// Extract the subregisters.
@@ -2191,10 +2274,11 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
} else {
SDValue SuperReg = SDValue(VLdDup, 0);
static_assert(ARM::dsub_7 == ARM::dsub_0 + 7, "Unexpected subreg numbering");
- unsigned SubIdx = ARM::dsub_0;
- for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+ unsigned SubIdx = is64BitVector ? ARM::dsub_0 : ARM::qsub_0;
+ for (unsigned Vec = 0; Vec != NumVecs; ++Vec) {
ReplaceUses(SDValue(N, Vec),
CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg));
+ }
}
ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1));
if (isUpdating)
@@ -2253,6 +2337,7 @@ bool ARMDAGToDAGISel::tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned) {
return true;
}
+ assert(LSB + Width + 1 <= 32 && "Shouldn't create an invalid ubfx");
SDValue Ops[] = { N->getOperand(0).getOperand(0),
CurDAG->getTargetConstant(LSB, dl, MVT::i32),
CurDAG->getTargetConstant(Width, dl, MVT::i32),
@@ -2277,6 +2362,7 @@ bool ARMDAGToDAGISel::tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned) {
if (LSB < 0)
return false;
SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+ assert(LSB + Width + 1 <= 32 && "Shouldn't create an invalid ubfx");
SDValue Ops[] = { N->getOperand(0).getOperand(0),
CurDAG->getTargetConstant(LSB, dl, MVT::i32),
CurDAG->getTargetConstant(Width, dl, MVT::i32),
@@ -2298,6 +2384,7 @@ bool ARMDAGToDAGISel::tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned) {
// Note: The width operand is encoded as width-1.
unsigned Width = MSB - LSB;
SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+ assert(Srl_imm + Width + 1 <= 32 && "Shouldn't create an invalid ubfx");
SDValue Ops[] = { N->getOperand(0).getOperand(0),
CurDAG->getTargetConstant(Srl_imm, dl, MVT::i32),
CurDAG->getTargetConstant(Width, dl, MVT::i32),
@@ -2318,6 +2405,7 @@ bool ARMDAGToDAGISel::tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned) {
return false;
SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+ assert(LSB + Width <= 32 && "Shouldn't create an invalid ubfx");
SDValue Ops[] = { N->getOperand(0).getOperand(0),
CurDAG->getTargetConstant(LSB, dl, MVT::i32),
CurDAG->getTargetConstant(Width - 1, dl, MVT::i32),
@@ -2427,7 +2515,7 @@ void ARMDAGToDAGISel::SelectCMPZ(SDNode *N, bool &SwitchEQNEToPLMI) {
SDValue X = And.getOperand(0);
auto C = dyn_cast<ConstantSDNode>(And.getOperand(1));
- if (!C || !X->hasOneUse())
+ if (!C)
return;
auto Range = getContiguousRangeOfSetBits(C->getAPIntValue());
if (!Range)
@@ -2765,7 +2853,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
}
}
case ARMISD::SUBE: {
- if (!Subtarget->hasV6Ops())
+ if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
break;
// Look for a pattern to match SMMLS
// (sube a, (smul_loHi a, b), (subc 0, (smul_LOhi(a, b))))
@@ -3026,14 +3114,14 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VLD1DUPd32 };
static const uint16_t QOpcodes[] = { ARM::VLD1DUPq8, ARM::VLD1DUPq16,
ARM::VLD1DUPq32 };
- SelectVLDDup(N, false, 1, DOpcodes, QOpcodes);
+ SelectVLDDup(N, /* IsIntrinsic= */ false, false, 1, DOpcodes, QOpcodes);
return;
}
case ARMISD::VLD2DUP: {
static const uint16_t Opcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16,
ARM::VLD2DUPd32 };
- SelectVLDDup(N, false, 2, Opcodes);
+ SelectVLDDup(N, /* IsIntrinsic= */ false, false, 2, Opcodes);
return;
}
@@ -3041,7 +3129,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo,
ARM::VLD3DUPd16Pseudo,
ARM::VLD3DUPd32Pseudo };
- SelectVLDDup(N, false, 3, Opcodes);
+ SelectVLDDup(N, /* IsIntrinsic= */ false, false, 3, Opcodes);
return;
}
@@ -3049,7 +3137,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo,
ARM::VLD4DUPd16Pseudo,
ARM::VLD4DUPd32Pseudo };
- SelectVLDDup(N, false, 4, Opcodes);
+ SelectVLDDup(N, /* IsIntrinsic= */ false, false, 4, Opcodes);
return;
}
@@ -3060,7 +3148,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t QOpcodes[] = { ARM::VLD1DUPq8wb_fixed,
ARM::VLD1DUPq16wb_fixed,
ARM::VLD1DUPq32wb_fixed };
- SelectVLDDup(N, true, 1, DOpcodes, QOpcodes);
+ SelectVLDDup(N, /* IsIntrinsic= */ false, true, 1, DOpcodes, QOpcodes);
return;
}
@@ -3068,7 +3156,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t Opcodes[] = { ARM::VLD2DUPd8wb_fixed,
ARM::VLD2DUPd16wb_fixed,
ARM::VLD2DUPd32wb_fixed };
- SelectVLDDup(N, true, 2, Opcodes);
+ SelectVLDDup(N, /* IsIntrinsic= */ false, true, 2, Opcodes);
return;
}
@@ -3076,7 +3164,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo_UPD,
ARM::VLD3DUPd16Pseudo_UPD,
ARM::VLD3DUPd32Pseudo_UPD };
- SelectVLDDup(N, true, 3, Opcodes);
+ SelectVLDDup(N, /* IsIntrinsic= */ false, true, 3, Opcodes);
return;
}
@@ -3084,7 +3172,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo_UPD,
ARM::VLD4DUPd16Pseudo_UPD,
ARM::VLD4DUPd32Pseudo_UPD };
- SelectVLDDup(N, true, 4, Opcodes);
+ SelectVLDDup(N, /* IsIntrinsic= */ false, true, 4, Opcodes);
return;
}
@@ -3407,6 +3495,51 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
return;
}
+ case Intrinsic::arm_neon_vld1x2: {
+ static const uint16_t DOpcodes[] = { ARM::VLD1q8, ARM::VLD1q16,
+ ARM::VLD1q32, ARM::VLD1q64 };
+ static const uint16_t QOpcodes[] = { ARM::VLD1d8QPseudo,
+ ARM::VLD1d16QPseudo,
+ ARM::VLD1d32QPseudo,
+ ARM::VLD1d64QPseudo };
+ SelectVLD(N, false, 2, DOpcodes, QOpcodes, nullptr);
+ return;
+ }
+
+ case Intrinsic::arm_neon_vld1x3: {
+ static const uint16_t DOpcodes[] = { ARM::VLD1d8TPseudo,
+ ARM::VLD1d16TPseudo,
+ ARM::VLD1d32TPseudo,
+ ARM::VLD1d64TPseudo };
+ static const uint16_t QOpcodes0[] = { ARM::VLD1q8LowTPseudo_UPD,
+ ARM::VLD1q16LowTPseudo_UPD,
+ ARM::VLD1q32LowTPseudo_UPD,
+ ARM::VLD1q64LowTPseudo_UPD };
+ static const uint16_t QOpcodes1[] = { ARM::VLD1q8HighTPseudo,
+ ARM::VLD1q16HighTPseudo,
+ ARM::VLD1q32HighTPseudo,
+ ARM::VLD1q64HighTPseudo };
+ SelectVLD(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
+ }
+
+ case Intrinsic::arm_neon_vld1x4: {
+ static const uint16_t DOpcodes[] = { ARM::VLD1d8QPseudo,
+ ARM::VLD1d16QPseudo,
+ ARM::VLD1d32QPseudo,
+ ARM::VLD1d64QPseudo };
+ static const uint16_t QOpcodes0[] = { ARM::VLD1q8LowQPseudo_UPD,
+ ARM::VLD1q16LowQPseudo_UPD,
+ ARM::VLD1q32LowQPseudo_UPD,
+ ARM::VLD1q64LowQPseudo_UPD };
+ static const uint16_t QOpcodes1[] = { ARM::VLD1q8HighQPseudo,
+ ARM::VLD1q16HighQPseudo,
+ ARM::VLD1q32HighQPseudo,
+ ARM::VLD1q64HighQPseudo };
+ SelectVLD(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
+ }
+
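
These cases (and the matching vst1x2/x3/x4 ones further down) back the contiguous multi-vector loads and stores that arm_neon.h exposes as the vld1/vst1 _x2/_x3/_x4 families. A hedged usage sketch, assuming a Clang new enough to provide the intrinsic when targeting ARM with NEON:

    #include <arm_neon.h>

    // Loads 16 consecutive floats -- four q registers' worth -- in one call.
    float32x4x4_t loadBlock(const float *P) {
      return vld1q_f32_x4(P);
    }
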
case Intrinsic::arm_neon_vld2: {
static const uint16_t DOpcodes[] = { ARM::VLD2d8, ARM::VLD2d16,
ARM::VLD2d32, ARM::VLD1q64 };
@@ -3446,6 +3579,52 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
return;
}
+ case Intrinsic::arm_neon_vld2dup: {
+ static const uint16_t DOpcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16,
+ ARM::VLD2DUPd32, ARM::VLD1q64 };
+ static const uint16_t QOpcodes0[] = { ARM::VLD2DUPq8EvenPseudo,
+ ARM::VLD2DUPq16EvenPseudo,
+ ARM::VLD2DUPq32EvenPseudo };
+ static const uint16_t QOpcodes1[] = { ARM::VLD2DUPq8OddPseudo,
+ ARM::VLD2DUPq16OddPseudo,
+ ARM::VLD2DUPq32OddPseudo };
+ SelectVLDDup(N, /* IsIntrinsic= */ true, false, 2,
+ DOpcodes, QOpcodes0, QOpcodes1);
+ return;
+ }
+
+ case Intrinsic::arm_neon_vld3dup: {
+ static const uint16_t DOpcodes[] = { ARM::VLD3DUPd8Pseudo,
+ ARM::VLD3DUPd16Pseudo,
+ ARM::VLD3DUPd32Pseudo,
+ ARM::VLD1d64TPseudo };
+ static const uint16_t QOpcodes0[] = { ARM::VLD3DUPq8EvenPseudo,
+ ARM::VLD3DUPq16EvenPseudo,
+ ARM::VLD3DUPq32EvenPseudo };
+ static const uint16_t QOpcodes1[] = { ARM::VLD3DUPq8OddPseudo,
+ ARM::VLD3DUPq16OddPseudo,
+ ARM::VLD3DUPq32OddPseudo };
+ SelectVLDDup(N, /* IsIntrinsic= */ true, false, 3,
+ DOpcodes, QOpcodes0, QOpcodes1);
+ return;
+ }
+
+ case Intrinsic::arm_neon_vld4dup: {
+ static const uint16_t DOpcodes[] = { ARM::VLD4DUPd8Pseudo,
+ ARM::VLD4DUPd16Pseudo,
+ ARM::VLD4DUPd32Pseudo,
+ ARM::VLD1d64QPseudo };
+ static const uint16_t QOpcodes0[] = { ARM::VLD4DUPq8EvenPseudo,
+ ARM::VLD4DUPq16EvenPseudo,
+ ARM::VLD4DUPq32EvenPseudo };
+ static const uint16_t QOpcodes1[] = { ARM::VLD4DUPq8OddPseudo,
+ ARM::VLD4DUPq16OddPseudo,
+ ARM::VLD4DUPq32OddPseudo };
+ SelectVLDDup(N, /* IsIntrinsic= */ true, false, 4,
+ DOpcodes, QOpcodes0, QOpcodes1);
+ return;
+ }
+
case Intrinsic::arm_neon_vld2lane: {
static const uint16_t DOpcodes[] = { ARM::VLD2LNd8Pseudo,
ARM::VLD2LNd16Pseudo,
@@ -3485,6 +3664,51 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
return;
}
+ case Intrinsic::arm_neon_vst1x2: {
+ static const uint16_t DOpcodes[] = { ARM::VST1q8, ARM::VST1q16,
+ ARM::VST1q32, ARM::VST1q64 };
+ static const uint16_t QOpcodes[] = { ARM::VST1d8QPseudo,
+ ARM::VST1d16QPseudo,
+ ARM::VST1d32QPseudo,
+ ARM::VST1d64QPseudo };
+ SelectVST(N, false, 2, DOpcodes, QOpcodes, nullptr);
+ return;
+ }
+
+ case Intrinsic::arm_neon_vst1x3: {
+ static const uint16_t DOpcodes[] = { ARM::VST1d8TPseudo,
+ ARM::VST1d16TPseudo,
+ ARM::VST1d32TPseudo,
+ ARM::VST1d64TPseudo };
+ static const uint16_t QOpcodes0[] = { ARM::VST1q8LowTPseudo_UPD,
+ ARM::VST1q16LowTPseudo_UPD,
+ ARM::VST1q32LowTPseudo_UPD,
+ ARM::VST1q64LowTPseudo_UPD };
+ static const uint16_t QOpcodes1[] = { ARM::VST1q8HighTPseudo,
+ ARM::VST1q16HighTPseudo,
+ ARM::VST1q32HighTPseudo,
+ ARM::VST1q64HighTPseudo };
+ SelectVST(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
+ }
+
+ case Intrinsic::arm_neon_vst1x4: {
+ static const uint16_t DOpcodes[] = { ARM::VST1d8QPseudo,
+ ARM::VST1d16QPseudo,
+ ARM::VST1d32QPseudo,
+ ARM::VST1d64QPseudo };
+ static const uint16_t QOpcodes0[] = { ARM::VST1q8LowQPseudo_UPD,
+ ARM::VST1q16LowQPseudo_UPD,
+ ARM::VST1q32LowQPseudo_UPD,
+ ARM::VST1q64LowQPseudo_UPD };
+ static const uint16_t QOpcodes1[] = { ARM::VST1q8HighQPseudo,
+ ARM::VST1q16HighQPseudo,
+ ARM::VST1q32HighQPseudo,
+ ARM::VST1q64HighQPseudo };
+ SelectVST(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
+ }
+
case Intrinsic::arm_neon_vst2: {
static const uint16_t DOpcodes[] = { ARM::VST2d8, ARM::VST2d16,
ARM::VST2d32, ARM::VST1q64 };
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index aeda7c06a27a..47222a66f798 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -53,7 +53,6 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -97,6 +96,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -308,13 +308,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setCmpLibcallCC(LC.Op, LC.Cond);
}
}
-
- // Set the correct calling convention for ARMv7k WatchOS. It's just
- // AAPCS_VFP for functions as simple as libcalls.
- if (Subtarget->isTargetWatchABI()) {
- for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i)
- setLibcallCallingConv((RTLIB::Libcall)i, CallingConv::ARM_AAPCS_VFP);
- }
}
// These libcalls are not available in 32-bit.
@@ -522,6 +515,16 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::f64, &ARM::DPRRegClass);
}
+ if (Subtarget->hasFullFP16()) {
+ addRegisterClass(MVT::f16, &ARM::HPRRegClass);
+ setOperationAction(ISD::BITCAST, MVT::i16, Custom);
+ setOperationAction(ISD::BITCAST, MVT::i32, Custom);
+ setOperationAction(ISD::BITCAST, MVT::f16, Custom);
+
+ setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
+ }
+
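
With +fullfp16 the scalar f16 type gets its own register class and several operations become legal directly on half precision, so simple arithmetic no longer has to widen to float and narrow back. A hedged source-level illustration, assuming a Clang build that offers _Float16 for the ARM target:

    // Under -march=armv8.2-a+fp16 this multiply can stay in half precision
    // (e.g. a single VMUL.F16) instead of a convert/multiply/convert sequence.
    _Float16 halfScale(_Float16 X) {
      return X * (_Float16)0.5f;
    }
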
for (MVT VT : MVT::vector_valuetypes()) {
for (MVT InnerVT : MVT::vector_valuetypes()) {
setTruncStoreAction(VT, InnerVT, Expand);
@@ -558,6 +561,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
addQRTypeForNEON(MVT::v4i32);
addQRTypeForNEON(MVT::v2i64);
+ if (Subtarget->hasFullFP16()) {
+ addQRTypeForNEON(MVT::v8f16);
+ addDRTypeForNEON(MVT::v4f16);
+ }
+
// v2f64 is legal so that QR subregs can be extracted as f64 elements, but
// neither Neon nor VFP support any arithmetic operations on it.
// The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
@@ -820,10 +828,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SRA, MVT::i64, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
- setOperationAction(ISD::ADDC, MVT::i32, Custom);
- setOperationAction(ISD::ADDE, MVT::i32, Custom);
- setOperationAction(ISD::SUBC, MVT::i32, Custom);
- setOperationAction(ISD::SUBE, MVT::i32, Custom);
+ // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
+ if (Subtarget->isThumb1Only()) {
+ setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
+ setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+ setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
+ }
if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
@@ -949,7 +959,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
- if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment())
+ if (Subtarget->isTargetWindows())
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
else
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
@@ -1036,13 +1046,18 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+ if (Subtarget->hasFullFP16()) {
+ setOperationAction(ISD::SETCC, MVT::f16, Expand);
+ setOperationAction(ISD::SELECT, MVT::f16, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
+ }
- // Thumb-1 cannot currently select ARMISD::SUBE.
- if (!Subtarget->isThumb1Only())
- setOperationAction(ISD::SETCCE, MVT::i32, Custom);
+ setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom);
setOperationAction(ISD::BRCOND, MVT::Other, Custom);
setOperationAction(ISD::BR_CC, MVT::i32, Custom);
+ if (Subtarget->hasFullFP16())
+ setOperationAction(ISD::BR_CC, MVT::f16, Custom);
setOperationAction(ISD::BR_CC, MVT::f32, Custom);
setOperationAction(ISD::BR_CC, MVT::f64, Custom);
setOperationAction(ISD::BR_JT, MVT::Other, Custom);
@@ -1121,6 +1136,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
if (Subtarget->hasNEON()) {
// vmin and vmax aren't available in a scalar form, so we use
// a NEON instruction with an undef lane instead.
+ setOperationAction(ISD::FMINNAN, MVT::f16, Legal);
+ setOperationAction(ISD::FMAXNAN, MVT::f16, Legal);
setOperationAction(ISD::FMINNAN, MVT::f32, Legal);
setOperationAction(ISD::FMAXNAN, MVT::f32, Legal);
setOperationAction(ISD::FMINNAN, MVT::v2f32, Legal);
@@ -1259,6 +1276,9 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD";
case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR";
+ case ARMISD::VMOVhr: return "ARMISD::VMOVhr";
+ case ARMISD::VMOVrh: return "ARMISD::VMOVrh";
+ case ARMISD::VMOVSR: return "ARMISD::VMOVSR";
case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP";
@@ -1337,6 +1357,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::SMLALDX: return "ARMISD::SMLALDX";
case ARMISD::SMLSLD: return "ARMISD::SMLSLD";
case ARMISD::SMLSLDX: return "ARMISD::SMLSLDX";
+ case ARMISD::SMMLAR: return "ARMISD::SMMLAR";
+ case ARMISD::SMMLSR: return "ARMISD::SMMLSR";
case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR";
case ARMISD::BFI: return "ARMISD::BFI";
case ARMISD::VORRIMM: return "ARMISD::VORRIMM";
@@ -2465,12 +2487,37 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
assert(VA.isRegLoc() && "Can only return in registers!");
SDValue Arg = OutVals[realRVLocIdx];
+ bool ReturnF16 = false;
+
+ if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) {
+ // Half-precision return values can be returned like this:
+ //
+      // t11: f16 = fadd ...
+ // t12: i16 = bitcast t11
+ // t13: i32 = zero_extend t12
+ // t14: f32 = bitcast t13 <~~~~~~~ Arg
+ //
+ // to avoid code generation for bitcasts, we simply set Arg to the node
+ // that produces the f16 value, t11 in this case.
+ //
+ if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
+ SDValue ZE = Arg.getOperand(0);
+ if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
+ SDValue BC = ZE.getOperand(0);
+ if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
+ Arg = BC.getOperand(0);
+ ReturnF16 = true;
+ }
+ }
+ }
+ }
switch (VA.getLocInfo()) {
default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::BCvt:
- Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
+ if (!ReturnF16)
+ Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
break;
}
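
As a standalone illustration (not part of the patch itself), a source-level function like the one below would exercise the half-precision return path above; the __fp16 type, the hard-float ABI and the FullFP16 target are assumptions made only for this sketch. Without the fold, the f16 result travels through the i16/i32/f32 bitcast chain shown in the comment; with it, the value is returned directly as f16.

// Hypothetical example; __fp16 and the target configuration are assumptions.
__fp16 scale(__fp16 a, __fp16 b) {
  return a * b; // half-precision result returned in s0 when FullFP16 is used
}
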
@@ -2518,7 +2565,8 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// Guarantee that all emitted copies are
// stuck together, avoiding something bad.
Flag = Chain.getValue(1);
- RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(),
+ ReturnF16 ? MVT::f16 : VA.getLocVT()));
}
const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
const MCPhysReg *I =
@@ -2738,7 +2786,7 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
}
-/// \brief Convert a TLS address reference into the correct sequence of loads
+/// Convert a TLS address reference into the correct sequence of loads
/// and calls to compute the variable's address for Darwin, and return an
/// SDValue containing the final node.
@@ -2959,7 +3007,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
SDValue
ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
- if (DAG.getTarget().Options.EmulatedTLS)
+ if (DAG.getTarget().useEmulatedTLS())
return LowerToTLSEmulatedModel(GA, DAG);
if (Subtarget->isTargetDarwin())
@@ -3675,11 +3723,14 @@ SDValue ARMTargetLowering::LowerFormalArguments(
} else {
const TargetRegisterClass *RC;
- if (RegVT == MVT::f32)
+
+ if (RegVT == MVT::f16)
+ RC = &ARM::HPRRegClass;
+ else if (RegVT == MVT::f32)
RC = &ARM::SPRRegClass;
- else if (RegVT == MVT::f64)
+ else if (RegVT == MVT::f64 || RegVT == MVT::v4f16)
RC = &ARM::DPRRegClass;
- else if (RegVT == MVT::v2f64)
+ else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16)
RC = &ARM::QPRRegClass;
else if (RegVT == MVT::i32)
RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
@@ -3799,8 +3850,8 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
const SDLoc &dl) const {
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
unsigned C = RHSC->getZExtValue();
- if (!isLegalICmpImmediate(C)) {
- // Constant does not fit, try adjusting it by one?
+ if (!isLegalICmpImmediate((int32_t)C)) {
+ // Constant does not fit, try adjusting it by one.
switch (CC) {
default: break;
case ISD::SETLT:
@@ -3940,6 +3991,29 @@ ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
break;
+ case ISD::UMULO:
+ // We generate a UMUL_LOHI and then check if the high word is 0.
+ ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
+ Value = DAG.getNode(ISD::UMUL_LOHI, dl,
+ DAG.getVTList(Op.getValueType(), Op.getValueType()),
+ LHS, RHS);
+ OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
+ DAG.getConstant(0, dl, MVT::i32));
+ Value = Value.getValue(0); // We only want the low 32 bits for the result.
+ break;
+ case ISD::SMULO:
+ // We generate a SMUL_LOHI and then check if all the bits of the high word
+ // are the same as the sign bit of the low word.
+ ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
+ Value = DAG.getNode(ISD::SMUL_LOHI, dl,
+ DAG.getVTList(Op.getValueType(), Op.getValueType()),
+ LHS, RHS);
+ OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
+ DAG.getNode(ISD::SRA, dl, Op.getValueType(),
+ Value.getValue(0),
+ DAG.getConstant(31, dl, MVT::i32)));
+ Value = Value.getValue(0); // We only want the low 32 bits for the result.
+ break;
} // switch (...)
return std::make_pair(Value, OverflowCmp);
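
As a standalone sketch of the overflow tests these two cases implement (plain C++, not LLVM code), unsigned multiply overflow is detected by a non-zero high word of the 64-bit product, and signed overflow by the high word differing from the sign extension of the low word:

// Minimal sketch of the UMULO/SMULO checks above; not LLVM code.
#include <cassert>
#include <cstdint>

static bool umulOverflows(uint32_t a, uint32_t b) {
  uint64_t p = (uint64_t)a * b;
  return (uint32_t)(p >> 32) != 0;             // high word compared against 0
}

static bool smulOverflows(int32_t a, int32_t b) {
  int64_t p = (int64_t)a * b;
  uint32_t hi = (uint32_t)((uint64_t)p >> 32);
  uint32_t lo = (uint32_t)p;
  return hi != (uint32_t)((int32_t)lo >> 31);  // high word vs. sign of low word
}

int main() {
  assert(umulOverflows(0x10000u, 0x10000u));   // 2^32 does not fit in 32 bits
  assert(!umulOverflows(0xFFFFu, 0xFFFFu));
  assert(smulOverflows(0x40000000, 4));        // 2^32 as a signed result
  assert(!smulOverflows(-2, 3));
  return 0;
}
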
@@ -3973,11 +4047,12 @@ static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry,
SDLoc DL(BoolCarry);
EVT CarryVT = BoolCarry.getValueType();
- APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
// This converts the boolean value carry into the carry flag by doing
- // ARMISD::ADDC Carry, ~0
- return DAG.getNode(ARMISD::ADDC, DL, DAG.getVTList(CarryVT, MVT::i32),
- BoolCarry, DAG.getConstant(NegOne, DL, CarryVT));
+ // ARMISD::SUBC Carry, 1
+ SDValue Carry = DAG.getNode(ARMISD::SUBC, DL,
+ DAG.getVTList(CarryVT, MVT::i32),
+ BoolCarry, DAG.getConstant(1, DL, CarryVT));
+ return Carry.getValue(1);
}
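
The SUBC-with-1 trick above works because ARM subtraction sets the carry flag when no borrow occurs, i.e. when the first operand is greater than or equal to the second; for a boolean in {0, 1}, subtracting 1 therefore leaves C equal to the boolean. A minimal standalone sketch of that invariant (not LLVM code; the helper merely models the ARM carry convention):

#include <cassert>
#include <cstdint>

// ARM-style carry for a - b: C = 1 when no borrow occurs (a >= b).
static bool carryAfterSub(uint32_t a, uint32_t b) { return a >= b; }

int main() {
  assert(carryAfterSub(1, 1) == true);   // boolean carry 1 -> C = 1
  assert(carryAfterSub(0, 1) == false);  // boolean carry 0 -> C = 0
  return 0;
}
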
static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT,
@@ -4313,6 +4388,48 @@ static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
return false;
}
+// Check if a condition of the type x < k ? k : x can be converted into a
+// bit operation instead of conditional moves.
+// Currently this is allowed given:
+// - The conditions and values match up
+// - k is 0 or -1 (all ones)
+// This function will not check the last condition; that's up to the caller.
+// It returns true if the transformation can be made, and in that case
+// returns x in V, and k in SatK.
+static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
+ SDValue &SatK)
+{
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+ SDValue TrueVal = Op.getOperand(2);
+ SDValue FalseVal = Op.getOperand(3);
+
+ SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS)
+ ? &RHS
+ : nullptr;
+
+  // No constant operand in the comparison; early out.
+ if (!K)
+ return false;
+
+ SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
+ V = (KTmp == TrueVal) ? FalseVal : TrueVal;
+ SDValue VTmp = (K && *K == LHS) ? RHS : LHS;
+
+  // If the constant on the left and right side, or the variable on the left
+  // and right, does not match, early out.
+ if (*K != KTmp || V != VTmp)
+ return false;
+
+ if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
+ SatK = *K;
+ return true;
+ }
+
+ return false;
+}
+
SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc dl(Op);
@@ -4331,6 +4448,25 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
}
+ // Try to convert expressions of the form x < k ? k : x (and similar forms)
+  // into more efficient bit operations, which is possible when k is 0 or -1.
+  // On ARM and Thumb-2, which have a flexible second operand, this results in
+  // a single instruction. On Thumb the shift and the bit operation will be
+  // two instructions.
+  // Only allow this transformation on full-width (32-bit) operations.
+ SDValue LowerSatConstant;
+ if (VT == MVT::i32 &&
+ isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
+ SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
+ DAG.getConstant(31, dl, VT));
+ if (isNullConstant(LowerSatConstant)) {
+ SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
+ DAG.getAllOnesConstant(dl, VT));
+ return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
+ } else if (isAllOnesConstant(LowerSatConstant))
+ return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
+ }
+
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
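
A standalone sketch of the clamp-from-below equivalence used above (plain C++, not LLVM code; it assumes an arithmetic right shift of negative signed values, which is what ISD::SRA provides and what ARM implements):

#include <cassert>
#include <cstdint>

static int32_t clampAtZero(int32_t x) {     // x < 0 ? 0 : x
  return x & ~(x >> 31);                    // AND with the inverted sign mask
}

static int32_t clampAtMinusOne(int32_t x) { // x < -1 ? -1 : x
  return x | (x >> 31);                     // OR with the sign mask
}

int main() {
  assert(clampAtZero(-5) == 0 && clampAtZero(7) == 7);
  assert(clampAtMinusOne(-5) == -1 && clampAtMinusOne(7) == 7);
  return 0;
}
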
@@ -4380,9 +4516,12 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
bool InvalidOnQNaN;
FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN);
- // Try to generate VMAXNM/VMINNM on ARMv8.
- if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
- TrueVal.getValueType() == MVT::f64)) {
+ // Normalize the fp compare. If RHS is zero we keep it there so we match
+ // CMPFPw0 instead of CMPFP.
+ if (Subtarget->hasFPARMv8() && !isFloatingPointZero(RHS) &&
+ (TrueVal.getValueType() == MVT::f16 ||
+ TrueVal.getValueType() == MVT::f32 ||
+ TrueVal.getValueType() == MVT::f64)) {
bool swpCmpOps = false;
bool swpVselOps = false;
checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
@@ -4532,10 +4671,14 @@ SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue Dest = Op.getOperand(2);
SDLoc dl(Op);
- // Optimize {s|u}{add|sub}.with.overflow feeding into a branch instruction.
+ // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
+ // instruction.
unsigned Opc = Cond.getOpcode();
- if (Cond.getResNo() == 1 && (Opc == ISD::SADDO || Opc == ISD::UADDO ||
- Opc == ISD::SSUBO || Opc == ISD::USUBO)) {
+ bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
+ !Subtarget->isThumb1Only();
+ if (Cond.getResNo() == 1 &&
+ (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
+ Opc == ISD::USUBO || OptimizeMul)) {
// Only lower legal XALUO ops.
if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
return SDValue();
@@ -4579,11 +4722,15 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
}
}
- // Optimize {s|u}{add|sub}.with.overflow feeding into a branch instruction.
+ // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
+ // instruction.
unsigned Opc = LHS.getOpcode();
+ bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
+ !Subtarget->isThumb1Only();
if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
(Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
- Opc == ISD::USUBO) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ Opc == ISD::USUBO || OptimizeMul) &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE)) {
// Only lower legal XALUO ops.
if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
return SDValue();
@@ -4614,8 +4761,6 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
Chain, Dest, ARMcc, CCR, Cmp);
}
- assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
-
if (getTargetMachine().Options.UnsafeFPMath &&
(CC == ISD::SETEQ || CC == ISD::SETOEQ ||
CC == ISD::SETNE || CC == ISD::SETUNE)) {
@@ -4979,7 +5124,8 @@ static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
/// operand type is illegal (e.g., v2f32 for a target that doesn't support
/// vectors), since the legalizer won't know what to do with that.
-static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
+static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDLoc dl(N);
SDValue Op = N->getOperand(0);
@@ -4988,8 +5134,78 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
// source or destination of the bit convert.
EVT SrcVT = Op.getValueType();
EVT DstVT = N->getValueType(0);
- assert((SrcVT == MVT::i64 || DstVT == MVT::i64) &&
- "ExpandBITCAST called for non-i64 type");
+ const bool HasFullFP16 = Subtarget->hasFullFP16();
+
+ if (SrcVT == MVT::f32 && DstVT == MVT::i32) {
+ // FullFP16: half values are passed in S-registers, and we don't
+    // need any of the bitcasts and moves:
+ //
+ // t2: f32,ch = CopyFromReg t0, Register:f32 %0
+ // t5: i32 = bitcast t2
+ // t18: f16 = ARMISD::VMOVhr t5
+ if (Op.getOpcode() != ISD::CopyFromReg ||
+ Op.getValueType() != MVT::f32)
+ return SDValue();
+
+ auto Move = N->use_begin();
+ if (Move->getOpcode() != ARMISD::VMOVhr)
+ return SDValue();
+
+ SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) };
+ SDValue Copy = DAG.getNode(ISD::CopyFromReg, SDLoc(Op), MVT::f16, Ops);
+ DAG.ReplaceAllUsesWith(*Move, &Copy);
+ return Copy;
+ }
+
+ if (SrcVT == MVT::i16 && DstVT == MVT::f16) {
+ if (!HasFullFP16)
+ return SDValue();
+ // SoftFP: read half-precision arguments:
+ //
+ // t2: i32,ch = ...
+ // t7: i16 = truncate t2 <~~~~ Op
+ // t8: f16 = bitcast t7 <~~~~ N
+ //
+ if (Op.getOperand(0).getValueType() == MVT::i32)
+ return DAG.getNode(ARMISD::VMOVhr, SDLoc(Op),
+ MVT::f16, Op.getOperand(0));
+
+ return SDValue();
+ }
+
+ // Half-precision return values
+ if (SrcVT == MVT::f16 && DstVT == MVT::i16) {
+ if (!HasFullFP16)
+ return SDValue();
+ //
+ // t11: f16 = fadd t8, t10
+ // t12: i16 = bitcast t11 <~~~ SDNode N
+ // t13: i32 = zero_extend t12
+ // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t13
+ // t17: ch = ARMISD::RET_FLAG t16, Register:i32 %r0, t16:1
+ //
+ // transform this into:
+ //
+ // t20: i32 = ARMISD::VMOVrh t11
+ // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t20
+ //
+ auto ZeroExtend = N->use_begin();
+ if (N->use_size() != 1 || ZeroExtend->getOpcode() != ISD::ZERO_EXTEND ||
+ ZeroExtend->getValueType(0) != MVT::i32)
+ return SDValue();
+
+ auto Copy = ZeroExtend->use_begin();
+ if (Copy->getOpcode() == ISD::CopyToReg &&
+ Copy->use_begin()->getOpcode() == ARMISD::RET_FLAG) {
+ SDValue Cvt = DAG.getNode(ARMISD::VMOVrh, SDLoc(Op), MVT::i32, Op);
+ DAG.ReplaceAllUsesWith(*ZeroExtend, &Cvt);
+ return Cvt;
+ }
+ return SDValue();
+ }
+
+ if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
+ return SDValue();
// Turn i64->f64 into VMOVDRR.
if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
@@ -5566,16 +5782,22 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
return Result;
}
-static SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
SDValue Carry = Op.getOperand(2);
SDValue Cond = Op.getOperand(3);
SDLoc DL(Op);
- assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
+ assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
+
+  // ARMISD::SUBE expects a carry, not a borrow like ISD::SUBCARRY, so we
+  // have to invert the carry first.
+ Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
+ DAG.getConstant(1, DL, MVT::i32), Carry);
+ // This converts the boolean value carry into the carry flag.
+ Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
- assert(Carry.getOpcode() != ISD::CARRY_FALSE);
SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);
@@ -5731,23 +5953,34 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *ST) const {
- bool IsDouble = Op.getValueType() == MVT::f64;
+ EVT VT = Op.getValueType();
+ bool IsDouble = (VT == MVT::f64);
ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
const APFloat &FPVal = CFP->getValueAPF();
// Prevent floating-point constants from using literal loads
// when execute-only is enabled.
if (ST->genExecuteOnly()) {
+ // If we can represent the constant as an immediate, don't lower it
+ if (isFPImmLegal(FPVal, VT))
+ return Op;
+ // Otherwise, construct as integer, and move to float register
APInt INTVal = FPVal.bitcastToAPInt();
SDLoc DL(CFP);
- if (IsDouble) {
- SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
- SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
- if (!ST->isLittle())
- std::swap(Lo, Hi);
- return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
- } else {
- return DAG.getConstant(INTVal, DL, MVT::i32);
+ switch (VT.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("Unknown floating point type!");
+ break;
+ case MVT::f64: {
+ SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
+ SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
+ if (!ST->isLittle())
+ std::swap(Lo, Hi);
+ return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
+ }
+ case MVT::f32:
+ return DAG.getNode(ARMISD::VMOVSR, DL, VT,
+ DAG.getConstant(INTVal, DL, MVT::i32));
}
}
@@ -6598,10 +6831,9 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
}
// Final sanity check before we try to actually produce a shuffle.
- DEBUG(
- for (auto Src : Sources)
- assert(Src.ShuffleVec.getValueType() == ShuffleVT);
- );
+  LLVM_DEBUG(for (auto Src : Sources)
+                 assert(Src.ShuffleVec.getValueType() == ShuffleVT););
// The stars all align, our next step is to produce the mask for the shuffle.
SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
@@ -7490,39 +7722,15 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
return N0;
}
-static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
- EVT VT = Op.getNode()->getValueType(0);
- SDVTList VTs = DAG.getVTList(VT, MVT::i32);
-
- unsigned Opc;
- bool ExtraOp = false;
- switch (Op.getOpcode()) {
- default: llvm_unreachable("Invalid code");
- case ISD::ADDC: Opc = ARMISD::ADDC; break;
- case ISD::ADDE: Opc = ARMISD::ADDE; ExtraOp = true; break;
- case ISD::SUBC: Opc = ARMISD::SUBC; break;
- case ISD::SUBE: Opc = ARMISD::SUBE; ExtraOp = true; break;
- }
-
- if (!ExtraOp)
- return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
- Op.getOperand(1));
- return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
- Op.getOperand(1), Op.getOperand(2));
-}
-
static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
SDNode *N = Op.getNode();
EVT VT = N->getValueType(0);
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
SDValue Carry = Op.getOperand(2);
- EVT CarryVT = Carry.getValueType();
SDLoc DL(Op);
- APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
-
SDValue Result;
if (Op.getOpcode() == ISD::ADDCARRY) {
// This converts the boolean value carry into the carry flag.
@@ -7530,7 +7738,7 @@ static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
// Do the addition proper using the carry flag we wanted.
Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0),
- Op.getOperand(1), Carry.getValue(1));
+ Op.getOperand(1), Carry);
// Now convert the carry flag into a boolean value.
Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
@@ -7544,7 +7752,7 @@ static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
// Do the subtraction proper using the carry flag we wanted.
Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0),
- Op.getOperand(1), Carry.getValue(1));
+ Op.getOperand(1), Carry);
// Now convert the carry flag into a boolean value.
Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
@@ -7851,7 +8059,7 @@ static SDValue LowerFPOWI(SDValue Op, const ARMSubtarget &Subtarget,
}
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
- DEBUG(dbgs() << "Lowering node: "; Op.dump());
+ LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
switch (Op.getOpcode()) {
default: llvm_unreachable("Don't know how to custom lower this!");
case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
@@ -7879,7 +8087,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
Subtarget);
- case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG);
+ case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
case ISD::SHL:
case ISD::SRL:
case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);
@@ -7892,7 +8100,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
case ISD::SETCC: return LowerVSETCC(Op, DAG);
- case ISD::SETCCE: return LowerSETCCE(Op, DAG);
+ case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
@@ -7909,10 +8117,6 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
return LowerDIV_Windows(Op, DAG, /* Signed */ false);
return LowerUDIV(Op, DAG);
- case ISD::ADDC:
- case ISD::ADDE:
- case ISD::SUBC:
- case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
case ISD::ADDCARRY:
case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
case ISD::SADDO:
@@ -7927,7 +8131,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SDIVREM:
case ISD::UDIVREM: return LowerDivRem(Op, DAG);
case ISD::DYNAMIC_STACKALLOC:
- if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment())
+ if (Subtarget->isTargetWindows())
return LowerDYNAMIC_STACKALLOC(Op, DAG);
llvm_unreachable("Don't know how to custom lower this!");
case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
@@ -7981,7 +8185,7 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
ExpandREAD_REGISTER(N, Results, DAG);
break;
case ISD::BITCAST:
- Res = ExpandBITCAST(N, DAG);
+ Res = ExpandBITCAST(N, DAG, Subtarget);
break;
case ISD::SRL:
case ISD::SRA:
@@ -9055,8 +9259,6 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
// Thumb1 post-indexed loads are really just single-register LDMs.
case ARM::tLDR_postidx: {
MachineOperand Def(MI.getOperand(1));
- if (TargetRegisterInfo::isPhysicalRegister(Def.getReg()))
- Def.setIsRenamable(false);
BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
.add(Def) // Rn_wb
.add(MI.getOperand(2)) // Rn
@@ -9323,7 +9525,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
}
}
-/// \brief Attaches vregs to MEMCPY that it will use as scratch registers
+/// Attaches vregs to MEMCPY that it will use as scratch registers
/// when it is expanded into LDM/STM. This is done as a post-isel lowering
/// instead of as a custom inserter because we need the use list from the SDNode.
static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
@@ -9860,7 +10062,7 @@ static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
return resNode;
}
-static SDValue AddCombineTo64bitMLAL(SDNode *AddeNode,
+static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
// Look for multiply add opportunities.
@@ -9877,49 +10079,61 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddeNode,
// V V
// ADDE <- hiAdd
//
- assert(AddeNode->getOpcode() == ARMISD::ADDE && "Expect an ADDE");
-
- assert(AddeNode->getNumOperands() == 3 &&
- AddeNode->getOperand(2).getValueType() == MVT::i32 &&
+ // In the special case where only the higher part of a signed result is used
+ // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts
+ // a constant with the exact value of 0x80000000, we recognize we are dealing
+ // with a "rounded multiply and add" (or subtract) and transform it into
+  // either an ARMISD::SMMLAR or an ARMISD::SMMLSR, respectively.
+
+ assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
+ AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
+ "Expect an ADDE or SUBE");
+
+ assert(AddeSubeNode->getNumOperands() == 3 &&
+ AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
"ADDE node has the wrong inputs");
- // Check that we are chained to the right ADDC node.
- SDNode* AddcNode = AddeNode->getOperand(2).getNode();
- if (AddcNode->getOpcode() != ARMISD::ADDC)
+ // Check that we are chained to the right ADDC or SUBC node.
+ SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
+ if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
+ AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
+ (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
+ AddcSubcNode->getOpcode() != ARMISD::SUBC))
return SDValue();
- SDValue AddcOp0 = AddcNode->getOperand(0);
- SDValue AddcOp1 = AddcNode->getOperand(1);
+ SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
+ SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);
// Check if the two operands are from the same mul_lohi node.
- if (AddcOp0.getNode() == AddcOp1.getNode())
+ if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
return SDValue();
- assert(AddcNode->getNumValues() == 2 &&
- AddcNode->getValueType(0) == MVT::i32 &&
+ assert(AddcSubcNode->getNumValues() == 2 &&
+ AddcSubcNode->getValueType(0) == MVT::i32 &&
"Expect ADDC with two result values. First: i32");
// Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
  // may be an SMLAL, which multiplies two 16-bit values.
- if (AddcOp0->getOpcode() != ISD::UMUL_LOHI &&
- AddcOp0->getOpcode() != ISD::SMUL_LOHI &&
- AddcOp1->getOpcode() != ISD::UMUL_LOHI &&
- AddcOp1->getOpcode() != ISD::SMUL_LOHI)
- return AddCombineTo64BitSMLAL16(AddcNode, AddeNode, DCI, Subtarget);
+ if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
+ AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
+ AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
+ AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
+ AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
+ return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);
// Check for the triangle shape.
- SDValue AddeOp0 = AddeNode->getOperand(0);
- SDValue AddeOp1 = AddeNode->getOperand(1);
+ SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
+ SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);
- // Make sure that the ADDE operands are not coming from the same node.
- if (AddeOp0.getNode() == AddeOp1.getNode())
+ // Make sure that the ADDE/SUBE operands are not coming from the same node.
+ if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
return SDValue();
- // Find the MUL_LOHI node walking up ADDE's operands.
+ // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
bool IsLeftOperandMUL = false;
- SDValue MULOp = findMUL_LOHI(AddeOp0);
+ SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
if (MULOp == SDValue())
- MULOp = findMUL_LOHI(AddeOp1);
+ MULOp = findMUL_LOHI(AddeSubeOp1);
else
IsLeftOperandMUL = true;
if (MULOp == SDValue())
@@ -9930,63 +10144,88 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddeNode,
unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
// Figure out the high and low input values to the MLAL node.
- SDValue* HiAdd = nullptr;
- SDValue* LoMul = nullptr;
- SDValue* LowAdd = nullptr;
+ SDValue *HiAddSub = nullptr;
+ SDValue *LoMul = nullptr;
+ SDValue *LowAddSub = nullptr;
- // Ensure that ADDE is from high result of ISD::xMUL_LOHI.
- if ((AddeOp0 != MULOp.getValue(1)) && (AddeOp1 != MULOp.getValue(1)))
+ // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
+ if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
return SDValue();
if (IsLeftOperandMUL)
- HiAdd = &AddeOp1;
+ HiAddSub = &AddeSubeOp1;
else
- HiAdd = &AddeOp0;
+ HiAddSub = &AddeSubeOp0;
+  // Ensure that LoMul and LowAddSub are taken from the correct ISD::SMUL_LOHI
+  // node whose low result is fed to the ADDC/SUBC we are checking.
- // Ensure that LoMul and LowAdd are taken from correct ISD::SMUL_LOHI node
- // whose low result is fed to the ADDC we are checking.
-
- if (AddcOp0 == MULOp.getValue(0)) {
- LoMul = &AddcOp0;
- LowAdd = &AddcOp1;
+ if (AddcSubcOp0 == MULOp.getValue(0)) {
+ LoMul = &AddcSubcOp0;
+ LowAddSub = &AddcSubcOp1;
}
- if (AddcOp1 == MULOp.getValue(0)) {
- LoMul = &AddcOp1;
- LowAdd = &AddcOp0;
+ if (AddcSubcOp1 == MULOp.getValue(0)) {
+ LoMul = &AddcSubcOp1;
+ LowAddSub = &AddcSubcOp0;
}
if (!LoMul)
return SDValue();
- // If HiAdd is the same node as ADDC or is a predecessor of ADDC the
- // replacement below will create a cycle.
- if (AddcNode == HiAdd->getNode() ||
- AddcNode->isPredecessorOf(HiAdd->getNode()))
+ // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
+ // the replacement below will create a cycle.
+ if (AddcSubcNode == HiAddSub->getNode() ||
+ AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
return SDValue();
// Create the merged node.
SelectionDAG &DAG = DCI.DAG;
- // Build operand list.
+ // Start building operand list.
SmallVector<SDValue, 8> Ops;
Ops.push_back(LoMul->getOperand(0));
Ops.push_back(LoMul->getOperand(1));
- Ops.push_back(*LowAdd);
- Ops.push_back(*HiAdd);
- SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcNode),
+ // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
+ // the case, we must be doing signed multiplication and only use the higher
+  // part of the result of the MLAL; furthermore, the LowAddSub must be a
+  // constant addition or subtraction with the exact value of 0x80000000.
+ if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
+ FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
+ LowAddSub->getNode()->getOpcode() == ISD::Constant &&
+ static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
+ 0x80000000) {
+ Ops.push_back(*HiAddSub);
+ if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
+ FinalOpc = ARMISD::SMMLSR;
+ } else {
+ FinalOpc = ARMISD::SMMLAR;
+ }
+ SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
+
+ return SDValue(AddeSubeNode, 0);
+ } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
+ // SMMLS is generated during instruction selection and the rest of this
+ // function can not handle the case where AddcSubcNode is a SUBC.
+ return SDValue();
+
+ // Finish building the operand list for {U/S}MLAL
+ Ops.push_back(*LowAddSub);
+ Ops.push_back(*HiAddSub);
+
+ SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
DAG.getVTList(MVT::i32, MVT::i32), Ops);
// Replace the ADDs' nodes uses by the MLA node's values.
SDValue HiMLALResult(MLALNode.getNode(), 1);
- DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
SDValue LoMLALResult(MLALNode.getNode(), 0);
- DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
// Return original node to notify the driver to stop replacing.
- return SDValue(AddeNode, 0);
+ return SDValue(AddeSubeNode, 0);
}
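
The 0x80000000 addend recognized in the combine above corresponds to rounding the top half of a 64-bit signed product to nearest rather than truncating it, which is the operation SMMULR/SMMLAR/SMMLSR perform. A standalone sketch of that shape (plain C++, not LLVM code; edge cases such as the extreme product are ignored here):

#include <cassert>
#include <cstdint>

// Rounded high-word multiply: only the top 32 bits of the product are used,
// after adding 0x80000000 to its low half.
static int32_t smmulr(int32_t a, int32_t b) {
  int64_t p = (int64_t)a * b;
  return (int32_t)((p + 0x80000000LL) >> 32);
}

int main() {
  assert(smmulr(0x60000000, 2) == 1); // high word of 0xC0000000 + 0x80000000
  assert(smmulr(0x10000000, 2) == 0); // below the rounding point, stays 0
  return 0;
}
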
static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
@@ -10071,13 +10310,13 @@ static SDValue PerformAddcSubcCombine(SDNode *N,
const ARMSubtarget *Subtarget) {
SelectionDAG &DAG(DCI.DAG);
- if (N->getOpcode() == ARMISD::ADDC) {
- // (ADDC (ADDE 0, 0, C), -1) -> C
+ if (N->getOpcode() == ARMISD::SUBC) {
+ // (SUBC (ADDE 0, 0, C), 1) -> C
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
if (LHS->getOpcode() == ARMISD::ADDE &&
isNullConstant(LHS->getOperand(0)) &&
- isNullConstant(LHS->getOperand(1)) && isAllOnesConstant(RHS)) {
+ isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
}
}
@@ -10095,12 +10334,15 @@ static SDValue PerformAddcSubcCombine(SDNode *N,
}
}
}
+
return SDValue();
}
-static SDValue PerformAddeSubeCombine(SDNode *N, SelectionDAG &DAG,
+static SDValue PerformAddeSubeCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
if (Subtarget->isThumb1Only()) {
+ SelectionDAG &DAG = DCI.DAG;
SDValue RHS = N->getOperand(1);
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
int64_t imm = C->getSExtValue();
@@ -10118,6 +10360,8 @@ static SDValue PerformAddeSubeCombine(SDNode *N, SelectionDAG &DAG,
N->getOperand(0), RHS, N->getOperand(2));
}
}
+ } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
+ return AddCombineTo64bitMLAL(N, DCI, Subtarget);
}
return SDValue();
}
@@ -10130,7 +10374,7 @@ static SDValue PerformADDECombine(SDNode *N,
const ARMSubtarget *Subtarget) {
// Only ARM and Thumb2 support UMLAL/SMLAL.
if (Subtarget->isThumb1Only())
- return PerformAddeSubeCombine(N, DCI.DAG, Subtarget);
+ return PerformAddeSubeCombine(N, DCI, Subtarget);
// Only perform the checks after legalize when the pattern is available.
if (DCI.isBeforeLegalize()) return SDValue();
@@ -10201,7 +10445,14 @@ static SDValue PerformSHLSimplify(SDNode *N,
case ISD::XOR:
case ISD::SETCC:
case ARMISD::CMP:
- // Check that its not already using a shl.
+ // Check that the user isn't already using a constant because there
+ // aren't any instructions that support an immediate operand and a
+ // shifted operand.
+ if (isa<ConstantSDNode>(U->getOperand(0)) ||
+ isa<ConstantSDNode>(U->getOperand(1)))
+ return SDValue();
+
+ // Check that it's not already using a shift.
if (U->getOperand(0).getOpcode() == ISD::SHL ||
U->getOperand(1).getOpcode() == ISD::SHL)
return SDValue();
@@ -10223,8 +10474,6 @@ static SDValue PerformSHLSimplify(SDNode *N,
if (!C1ShlC2 || !C2)
return SDValue();
- DEBUG(dbgs() << "Trying to simplify shl: "; N->dump());
-
APInt C2Int = C2->getAPIntValue();
APInt C1Int = C1ShlC2->getAPIntValue();
@@ -10238,12 +10487,12 @@ static SDValue PerformSHLSimplify(SDNode *N,
C1Int.lshrInPlace(C2Int);
// The immediates are encoded as an 8-bit value that can be rotated.
- unsigned Zeros = C1Int.countLeadingZeros() + C1Int.countTrailingZeros();
- if (C1Int.getBitWidth() - Zeros > 8)
- return SDValue();
+ auto LargeImm = [](const APInt &Imm) {
+ unsigned Zeros = Imm.countLeadingZeros() + Imm.countTrailingZeros();
+ return Imm.getBitWidth() - Zeros > 8;
+ };
- Zeros = C2Int.countLeadingZeros() + C2Int.countTrailingZeros();
- if (C2Int.getBitWidth() - Zeros > 8)
+ if (LargeImm(C1Int) || LargeImm(C2Int))
return SDValue();
SelectionDAG &DAG = DCI.DAG;
@@ -10254,6 +10503,10 @@ static SDValue PerformSHLSimplify(SDNode *N,
// Shift left to compensate for the lshr of C1Int.
SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));
+ LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
+ SHL.dump(); N->dump());
+ LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
+
DAG.ReplaceAllUsesWith(SDValue(N, 0), Res);
return SDValue(N, 0);
}
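
The rewrite performed by PerformSHLSimplify relies on a simple identity: when the constant combined with (x << C2) is divisible by 1 << C2, the binary operation can be applied before the shift with the constant divided down, which may then fit ARM's rotated 8-bit immediate encoding. A trivial standalone check of the identity (not LLVM code; the constants 12 and 2 are illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  // C1 = 12, C2 = 2: (x << 2) + 12 == (x + (12 >> 2)) << 2 for all x.
  for (uint32_t x = 0; x < 1000; ++x)
    assert(((x << 2) + 12) == ((x + 3) << 2));
  return 0;
}
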
@@ -10423,6 +10676,83 @@ static SDValue PerformMULCombine(SDNode *N,
return SDValue();
}
+static SDValue CombineANDShift(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+ // Allow DAGCombine to pattern-match before we touch the canonical form.
+ if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+ return SDValue();
+
+ if (N->getValueType(0) != MVT::i32)
+ return SDValue();
+
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!N1C)
+ return SDValue();
+
+ uint32_t C1 = (uint32_t)N1C->getZExtValue();
+ // Don't transform uxtb/uxth.
+ if (C1 == 255 || C1 == 65535)
+ return SDValue();
+
+ SDNode *N0 = N->getOperand(0).getNode();
+ if (!N0->hasOneUse())
+ return SDValue();
+
+ if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
+ return SDValue();
+
+ bool LeftShift = N0->getOpcode() == ISD::SHL;
+
+ ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+ if (!N01C)
+ return SDValue();
+
+ uint32_t C2 = (uint32_t)N01C->getZExtValue();
+ if (!C2 || C2 >= 32)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
+
+ // We have a pattern of the form "(and (shl x, c2) c1)" or
+ // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
+ // transform to a pair of shifts, to save materializing c1.
+
+ // First pattern: right shift, and c1+1 is a power of two.
+ // FIXME: Also check reversed pattern (left shift, and ~c1+1 is a power
+ // of two).
+ // FIXME: Use demanded bits?
+ if (!LeftShift && isMask_32(C1)) {
+ uint32_t C3 = countLeadingZeros(C1);
+ if (C2 < C3) {
+ SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
+ DAG.getConstant(C3 - C2, DL, MVT::i32));
+ return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
+ DAG.getConstant(C3, DL, MVT::i32));
+ }
+ }
+
+ // Second pattern: left shift, and (c1>>c2)+1 is a power of two.
+ // FIXME: Also check reversed pattern (right shift, and ~(c1<<c2)+1
+ // is a power of two).
+ // FIXME: Use demanded bits?
+ if (LeftShift && isShiftedMask_32(C1)) {
+ uint32_t C3 = countLeadingZeros(C1);
+ if (C2 + C3 < 32 && C1 == ((-1U << (C2 + C3)) >> C3)) {
+ SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
+ DAG.getConstant(C2 + C3, DL, MVT::i32));
+ return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
+ DAG.getConstant(C3, DL, MVT::i32));
+ }
+ }
+
+ // FIXME: Transform "(and (shl x, c2) c1)" ->
+ // "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate than
+ // c1.
+ return SDValue();
+}
+
static SDValue PerformANDCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
@@ -10464,6 +10794,10 @@ static SDValue PerformANDCombine(SDNode *N,
return Result;
}
+ if (Subtarget->isThumb1Only())
+ if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
+ return Result;
+
return SDValue();
}
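
For the first pattern handled by CombineANDShift, the shift-then-mask really is equivalent to a pair of shifts, which avoids materializing the mask constant on Thumb-1. A standalone sketch with one concrete choice of constants (c1 = 0xff, c2 = 4, hence c3 = countLeadingZeros(c1) = 24; these values are illustrative, not taken from the patch):

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t c1 = 0xFF, c2 = 4, c3 = 24; // c3 = countLeadingZeros(c1)
  const uint32_t vals[] = {0u, 0x12345678u, 0xDEADBEEFu, 0x80000001u, 0xFFFFFFFFu};
  for (uint32_t x : vals)
    assert(((x >> c2) & c1) == ((x << (c3 - c2)) >> c3)); // two shifts, no mask
  return 0;
}
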
@@ -11012,7 +11346,7 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N,
return DAG.getNode(ISD::BITCAST, dl, VT, BV);
}
-/// \brief Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
+/// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
static SDValue
PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
// ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
@@ -11228,6 +11562,12 @@ static SDValue CombineBaseUpdate(SDNode *N,
NumVecs = 3; break;
case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD;
NumVecs = 4; break;
+ case Intrinsic::arm_neon_vld2dup:
+ case Intrinsic::arm_neon_vld3dup:
+ case Intrinsic::arm_neon_vld4dup:
+ // TODO: Support updating VLDxDUP nodes. For now, we just skip
+ // combining base updates for such intrinsics.
+ continue;
case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
NumVecs = 2; isLaneOp = true; break;
case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
@@ -12306,6 +12646,89 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
}
}
+ if (!VT.isInteger())
+ return SDValue();
+
+ // Materialize a boolean comparison for integers so we can avoid branching.
+ if (isNullConstant(FalseVal)) {
+ if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
+ if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
+        // If x == y then x - y == 0 and ARM's CLZ will return 32; shifting
+        // that right by 5 bits gives 1, otherwise it gives 0.
+ // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
+ SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
+ Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
+ DAG.getConstant(5, dl, MVT::i32));
+ } else {
+ // CMOV 0, 1, ==, (CMPZ x, y) ->
+ // (ADDCARRY (SUB x, y), t:0, t:1)
+ // where t = (SUBCARRY 0, (SUB x, y), 0)
+ //
+ // The SUBCARRY computes 0 - (x - y) and this will give a borrow when
+ // x != y. In other words, a carry C == 1 when x == y, C == 0
+ // otherwise.
+ // The final ADDCARRY computes
+ // x - y + (0 - (x - y)) + C == C
+ SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+ SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
+        // ISD::SUBCARRY returns a borrow, but we actually want the carry here.
+ SDValue Carry =
+ DAG.getNode(ISD::SUB, dl, MVT::i32,
+ DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
+ Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry);
+ }
+ } else if (CC == ARMCC::NE && LHS != RHS &&
+ (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
+ // This seems pointless but will allow us to combine it further below.
+ // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUB x, y), z, !=, (CMPZ x, y)
+ SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
+ Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
+ N->getOperand(3), Cmp);
+ }
+ } else if (isNullConstant(TrueVal)) {
+ if (CC == ARMCC::EQ && LHS != RHS &&
+ (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
+      // This seems pointless but will allow us to combine it further below.
+      // Note that we change == to != as this is the dual of the case above.
+ // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUB x, y), z, !=, (CMPZ x, y)
+ SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
+ Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
+ DAG.getConstant(ARMCC::NE, dl, MVT::i32),
+ N->getOperand(3), Cmp);
+ }
+ }
+
+ // On Thumb1, the DAG above may be further combined if z is a power of 2
+ // (z == 2 ^ K).
+ // CMOV (SUB x, y), z, !=, (CMPZ x, y) ->
+ // merge t3, t4
+ // where t1 = (SUBCARRY (SUB x, y), z, 0)
+ // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1)
+ // t3 = if K != 0 then (SHL t2:0, K) else t2:0
+ // t4 = (SUB 1, t2:1) [ we want a carry, not a borrow ]
+ const APInt *TrueConst;
+ if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
+ (FalseVal.getOpcode() == ISD::SUB) && (FalseVal.getOperand(0) == LHS) &&
+ (FalseVal.getOperand(1) == RHS) &&
+ (TrueConst = isPowerOf2Constant(TrueVal))) {
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+ unsigned ShiftAmount = TrueConst->logBase2();
+ if (ShiftAmount)
+ TrueVal = DAG.getConstant(1, dl, VT);
+ SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
+ Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1));
+ // Make it a carry, not a borrow.
+ SDValue Carry = DAG.getNode(
+ ISD::SUB, dl, VT, DAG.getConstant(1, dl, MVT::i32), Res.getValue(1));
+ Res = DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Res, Carry);
+
+ if (ShiftAmount)
+ Res = DAG.getNode(ISD::SHL, dl, VT, Res,
+ DAG.getConstant(ShiftAmount, dl, MVT::i32));
+ }
+
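
The CLZ-based materialization above can be checked with a small standalone sketch (plain C++, not LLVM code). The portable clz below mirrors ARM's CLZ instruction, which returns 32 for a zero input; __builtin_clz is deliberately avoided because its behaviour on zero is undefined:

#include <cassert>
#include <cstdint>

static uint32_t clz32(uint32_t v) {
  uint32_t n = 0;
  while (n < 32 && !(v & 0x80000000u)) { v <<= 1; ++n; }
  return n; // 32 for v == 0, matching the ARM CLZ instruction
}

// CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
static uint32_t isEqual(uint32_t x, uint32_t y) { return clz32(x - y) >> 5; }

int main() {
  assert(isEqual(42, 42) == 1);
  assert(isEqual(42, 43) == 0);
  return 0;
}
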
if (Res.getNode()) {
KnownBits Known;
DAG.computeKnownBits(SDValue(N,0), Known);
@@ -12338,7 +12761,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
case ARMISD::ADDC:
case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);
- case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI.DAG, Subtarget);
+ case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);
case ARMISD::BFI: return PerformBFICombine(N, DCI);
case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
@@ -12424,13 +12847,22 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::INTRINSIC_W_CHAIN:
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
case Intrinsic::arm_neon_vld1:
+ case Intrinsic::arm_neon_vld1x2:
+ case Intrinsic::arm_neon_vld1x3:
+ case Intrinsic::arm_neon_vld1x4:
case Intrinsic::arm_neon_vld2:
case Intrinsic::arm_neon_vld3:
case Intrinsic::arm_neon_vld4:
case Intrinsic::arm_neon_vld2lane:
case Intrinsic::arm_neon_vld3lane:
case Intrinsic::arm_neon_vld4lane:
+ case Intrinsic::arm_neon_vld2dup:
+ case Intrinsic::arm_neon_vld3dup:
+ case Intrinsic::arm_neon_vld4dup:
case Intrinsic::arm_neon_vst1:
+ case Intrinsic::arm_neon_vst1x2:
+ case Intrinsic::arm_neon_vst1x3:
+ case Intrinsic::arm_neon_vst1x4:
case Intrinsic::arm_neon_vst2:
case Intrinsic::arm_neon_vst3:
case Intrinsic::arm_neon_vst4:
@@ -12454,6 +12886,10 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
unsigned,
unsigned,
bool *Fast) const {
+  // Depends on what it gets converted into if the type is weird.
+ if (!VT.isSimple())
+ return false;
+
  // The AllowsUnaligned flag models the SCTLR.A setting in ARM CPUs.
bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
@@ -12560,6 +12996,24 @@ bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
return false;
}
+bool ARMTargetLowering::isFNegFree(EVT VT) const {
+ if (!VT.isSimple())
+ return false;
+
+ // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
+ // negate values directly (fneg is free). So, we don't want to let the DAG
+ // combiner rewrite fneg into xors and some other instructions. For f16 and
+  // FullFP16 argument passing, some bitcast nodes may be introduced, which
+  // would trigger that rewrite, so we avoid it by reporting fneg as free here.
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: break;
+ case MVT::f16:
+ return Subtarget->hasFullFP16();
+ }
+
+ return false;
+}
+
bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
EVT VT = ExtVal.getValueType();
@@ -12828,9 +13282,11 @@ bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
// Thumb2 and ARM modes can use cmn for negative immediates.
if (!Subtarget->isThumb())
- return ARM_AM::getSOImmVal(std::abs(Imm)) != -1;
+ return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
+ ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
if (Subtarget->isThumb2())
- return ARM_AM::getT2SOImmVal(std::abs(Imm)) != -1;
+ return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
+ ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
// Thumb1 doesn't have cmn, and only 8-bit immediates.
return Imm >= 0 && Imm <= 255;
}
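
The reason the rewritten check also tries the negated immediate is that a comparison against -Imm can be emitted as CMN against Imm, since CMN sets the flags for an addition. A short standalone check of the identity being relied on (not LLVM code; the value -10 is illustrative):

#include <cassert>
#include <cstdint>

int main() {
  // cmp x, #-10 is not encodable, but x == -10 iff x + 10 == 0, which CMN tests.
  for (int32_t x = -1000; x <= 1000; ++x)
    assert((x == -10) == (x + 10 == 0));
  return 0;
}
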
@@ -13262,8 +13718,14 @@ RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
return RCPair(0U, &ARM::QPR_8RegClass);
break;
case 't':
+ if (VT == MVT::Other)
+ break;
if (VT == MVT::f32 || VT == MVT::i32)
return RCPair(0U, &ARM::SPRRegClass);
+ if (VT.getSizeInBits() == 64)
+ return RCPair(0U, &ARM::DPR_VFP2RegClass);
+ if (VT.getSizeInBits() == 128)
+ return RCPair(0U, &ARM::QPR_VFP2RegClass);
break;
}
}
@@ -13593,6 +14055,20 @@ ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
+ if (DAG.getMachineFunction().getFunction().hasFnAttribute(
+ "no-stack-arg-probe")) {
+ unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
+ Chain = SP.getValue(1);
+ SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
+ if (Align)
+ SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
+ DAG.getConstant(-(uint64_t)Align, DL, MVT::i32));
+ Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
+ SDValue Ops[2] = { SP, Chain };
+ return DAG.getMergeValues(Ops, DL);
+ }
+
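
With the "no-stack-arg-probe" attribute the allocation above is done inline: the stack pointer is moved down by the requested size and then rounded down to the requested alignment, with no __chkstk-style probing. A standalone sketch of that arithmetic (not LLVM code; the concrete numbers are illustrative):

#include <cassert>
#include <cstdint>

static uint32_t allocSP(uint32_t sp, uint32_t size, uint32_t align) {
  uint32_t p = sp - size;
  if (align)
    p &= ~(align - 1); // same as ANDing with -(uint64_t)Align truncated to i32
  return p;
}

int main() {
  assert(allocSP(0x1000, 24, 16) == 0xFE0); // 0x1000 - 24 = 0xFE8, aligned down
  return 0;
}
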
SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
DAG.getConstant(2, DL, MVT::i32));
@@ -13656,6 +14132,8 @@ bool ARM::isBitFieldInvertedMask(unsigned v) {
bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
if (!Subtarget->hasVFP3())
return false;
+ if (VT == MVT::f16 && Subtarget->hasFullFP16())
+ return ARM_AM::getFP16Imm(Imm) != -1;
if (VT == MVT::f32)
return ARM_AM::getFP32Imm(Imm) != -1;
if (VT == MVT::f64 && !Subtarget->isFPOnlySP())
@@ -13677,7 +14155,10 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::arm_neon_vld4:
case Intrinsic::arm_neon_vld2lane:
case Intrinsic::arm_neon_vld3lane:
- case Intrinsic::arm_neon_vld4lane: {
+ case Intrinsic::arm_neon_vld4lane:
+ case Intrinsic::arm_neon_vld2dup:
+ case Intrinsic::arm_neon_vld3dup:
+ case Intrinsic::arm_neon_vld4dup: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
// Conservatively set memVT to the entire set of vectors loaded.
auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
@@ -13691,6 +14172,21 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MOLoad;
return true;
}
+ case Intrinsic::arm_neon_vld1x2:
+ case Intrinsic::arm_neon_vld1x3:
+ case Intrinsic::arm_neon_vld1x4: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ // Conservatively set memVT to the entire set of vectors loaded.
+ auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
+ uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
+ Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+ Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
+ Info.offset = 0;
+ Info.align = 0;
+ // volatile loads with NEON intrinsics not supported
+ Info.flags = MachineMemOperand::MOLoad;
+ return true;
+ }
case Intrinsic::arm_neon_vst1:
case Intrinsic::arm_neon_vst2:
case Intrinsic::arm_neon_vst3:
@@ -13717,6 +14213,27 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MOStore;
return true;
}
+ case Intrinsic::arm_neon_vst1x2:
+ case Intrinsic::arm_neon_vst1x3:
+ case Intrinsic::arm_neon_vst1x4: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ // Conservatively set memVT to the entire set of vectors stored.
+ auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
+ unsigned NumElts = 0;
+ for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
+ Type *ArgTy = I.getArgOperand(ArgI)->getType();
+ if (!ArgTy->isVectorTy())
+ break;
+ NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
+ }
+ Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.align = 0;
+ // volatile stores with NEON intrinsics not supported
+ Info.flags = MachineMemOperand::MOStore;
+ return true;
+ }
case Intrinsic::arm_ldaex:
case Intrinsic::arm_ldrex: {
auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
@@ -13768,7 +14285,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return false;
}
-/// \brief Returns true if it is beneficial to convert a load of a constant
+/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
@@ -14064,7 +14581,7 @@ bool ARMTargetLowering::isLegalInterleavedAccessType(
return VecSize == 64 || VecSize % 128 == 0;
}
-/// \brief Lower an interleaved load into a vldN intrinsic.
+/// Lower an interleaved load into a vldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
@@ -14182,7 +14699,7 @@ bool ARMTargetLowering::lowerInterleavedLoad(
return true;
}
-/// \brief Lower an interleaved store into a vstN intrinsic.
+/// Lower an interleaved store into a vstN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
@@ -14380,7 +14897,19 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
return (Members > 0 && Members <= 4);
}
-/// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
+/// Return the correct alignment for the current calling convention.
+unsigned
+ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy,
+ DataLayout DL) const {
+ if (!ArgTy->isVectorTy())
+ return DL.getABITypeAlignment(ArgTy);
+
+ // Avoid over-aligning vector parameters. It would require realigning the
+ // stack and waste space for no real benefit.
+ return std::min(DL.getABITypeAlignment(ArgTy), DL.getStackAlignment());
+}
+
+/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
/// passing according to AAPCS rules.
bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
@@ -14392,7 +14921,7 @@ bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
HABaseType Base = HA_UNKNOWN;
uint64_t Members = 0;
bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
- DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
+ LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
return IsHA || IsIntArray;
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index bf63dfae4407..50b4c2977fb5 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -21,7 +21,6 @@
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/ValueTypes.h"
@@ -31,6 +30,7 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/MachineValueType.h"
#include <utility>
namespace llvm {
@@ -102,6 +102,7 @@ class VectorType;
VMOVRRD, // double to two gprs.
VMOVDRR, // Two gprs to double.
+ VMOVSR, // move gpr to single, used for f32 literal constructed in a gpr
EH_SJLJ_SETJMP, // SjLj exception handling setjmp.
EH_SJLJ_LONGJMP, // SjLj exception handling longjmp.
@@ -171,6 +172,10 @@ class VectorType;
// Vector move f32 immediate:
VMOVFPIMM,
+ // Move H <-> R, clearing top 16 bits
+ VMOVrh,
+ VMOVhr,
+
// Vector duplicate:
VDUP,
VDUPLANE,
@@ -203,6 +208,8 @@ class VectorType;
SMLALDX, // Signed multiply accumulate long dual exchange
SMLSLD, // Signed multiply subtract long dual
SMLSLDX, // Signed multiply subtract long dual exchange
+ SMMLAR, // Signed multiply long, round and add
+ SMMLSR, // Signed multiply long, subtract and round
// Operands of the standard BUILD_VECTOR node are not legalized, which
// is fine if BUILD_VECTORs are always lowered to shuffles or other
@@ -325,6 +332,7 @@ class VectorType;
bool isTruncateFree(Type *SrcTy, Type *DstTy) const override;
bool isTruncateFree(EVT SrcVT, EVT DstVT) const override;
bool isZExtFree(SDValue Val, EVT VT2) const override;
+ bool isFNegFree(EVT VT) const override;
bool isVectorLoadExtDesirable(SDValue ExtVal) const override;
@@ -346,7 +354,7 @@ class VectorType;
bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const;
- /// \brief Returns true if the addresing mode representing by AM is legal
+  /// Returns true if the addressing mode represented by AM is legal
/// for the Thumb1 target, for a load/store of the specified type.
bool isLegalT1ScaledAddressingMode(const AddrMode &AM, EVT VT) const;
@@ -474,7 +482,7 @@ class VectorType;
MachineFunction &MF,
unsigned Intrinsic) const override;
- /// \brief Returns true if it is beneficial to convert a load of a constant
+ /// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
@@ -484,7 +492,7 @@ class VectorType;
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
unsigned Index) const override;
- /// \brief Returns true if an argument of type Ty needs to be passed in a
+ /// Returns true if an argument of type Ty needs to be passed in a
/// contiguous block of registers in calling convention CallConv.
bool functionArgumentNeedsConsecutiveRegisters(
Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override;
@@ -571,6 +579,10 @@ class VectorType;
void finalizeLowering(MachineFunction &MF) const override;
+ /// Return the correct alignment for the current calling convention.
+ unsigned getABIAlignmentForCallingConv(Type *ArgTy,
+ DataLayout DL) const override;
+
protected:
std::pair<const TargetRegisterClass *, uint8_t>
findRepresentativeClass(const TargetRegisterInfo *TRI,
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index f7c6c32eb4dc..70aded247f65 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -108,6 +108,7 @@ def AddrModeT2_so : AddrMode<13>;
def AddrModeT2_pc : AddrMode<14>;
def AddrModeT2_i8s4 : AddrMode<15>;
def AddrMode_i12 : AddrMode<16>;
+def AddrMode5FP16 : AddrMode<17>;
// Load / store index mode.
class IndexMode<bits<2> val> {
@@ -1023,6 +1024,12 @@ class Thumb2DSPPat<dag pattern, dag result> : Pat<pattern, result> {
class Thumb2DSPMulPat<dag pattern, dag result> : Pat<pattern, result> {
list<Predicate> Predicates = [IsThumb2, UseMulOps, HasDSP];
}
+class FP16Pat<dag pattern, dag result> : Pat<pattern, result> {
+ list<Predicate> Predicates = [HasFP16];
+}
+class FullFP16Pat<dag pattern, dag result> : Pat<pattern, result> {
+ list<Predicate> Predicates = [HasFullFP16];
+}
//===----------------------------------------------------------------------===//
// Thumb Instruction Format Definitions.
//
@@ -1527,7 +1534,7 @@ class ASI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
class AHI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
InstrItinClass itin,
string opc, string asm, list<dag> pattern>
- : VFPI<oops, iops, AddrMode5, 4, IndexModeNone,
+ : VFPI<oops, iops, AddrMode5FP16, 4, IndexModeNone,
VFPLdStFrm, itin, opc, asm, "", pattern> {
list<Predicate> Predicates = [HasFullFP16];
diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp
index a0e2ac4cbc6f..397c9dadb4ac 100644
--- a/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/lib/Target/ARM/ARMInstrInfo.cpp
@@ -135,3 +135,31 @@ void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI) const {
.setMemRefs(MI->memoperands_begin(), MI->memoperands_end())
.add(predOps(ARMCC::AL));
}
+
+std::pair<unsigned, unsigned>
+ARMInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
+ const unsigned Mask = ARMII::MO_OPTION_MASK;
+ return std::make_pair(TF & Mask, TF & ~Mask);
+}
+
+ArrayRef<std::pair<unsigned, const char *>>
+ARMInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
+ using namespace ARMII;
+
+ static const std::pair<unsigned, const char *> TargetFlags[] = {
+ {MO_LO16, "arm-lo16"}, {MO_HI16, "arm-hi16"}};
+ return makeArrayRef(TargetFlags);
+}
+
+ArrayRef<std::pair<unsigned, const char *>>
+ARMInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
+ using namespace ARMII;
+
+ static const std::pair<unsigned, const char *> TargetFlags[] = {
+ {MO_GOT, "arm-got"},
+ {MO_SBREL, "arm-sbrel"},
+ {MO_DLLIMPORT, "arm-dllimport"},
+ {MO_SECREL, "arm-secrel"},
+ {MO_NONLAZY, "arm-nonlazy"}};
+ return makeArrayRef(TargetFlags);
+}
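For context on the two hooks just added: the option mask splits a machine-operand target-flag word into a mutually exclusive "direct" part (lo16/hi16) and independent bitmask flags. A self-contained sketch with made-up flag values (the real ARMII constants are defined elsewhere in the backend and are not reproduced here):

  #include <cstdio>
  #include <utility>

  int main() {
    const unsigned OptionMask = 0x3;          // stand-in for ARMII::MO_OPTION_MASK
    const unsigned LO16 = 0x1, NonLazy = 0x8; // stand-ins for MO_LO16 / MO_NONLAZY
    unsigned TF = LO16 | NonLazy;
    std::pair<unsigned, unsigned> Split(TF & OptionMask, TF & ~OptionMask);
    std::printf("direct=0x%x bitmask=0x%x\n", Split.first, Split.second); // 0x1 0x8
    return 0;
  }

This decomposition is what lets MIR serialization print the flags by the names registered above ("arm-lo16", "arm-got", and so on).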
diff --git a/lib/Target/ARM/ARMInstrInfo.h b/lib/Target/ARM/ARMInstrInfo.h
index c87fb97448c9..c54c987134df 100644
--- a/lib/Target/ARM/ARMInstrInfo.h
+++ b/lib/Target/ARM/ARMInstrInfo.h
@@ -38,6 +38,13 @@ public:
///
const ARMRegisterInfo &getRegisterInfo() const override { return RI; }
+ std::pair<unsigned, unsigned>
+ decomposeMachineOperandsTargetFlags(unsigned TF) const override;
+ ArrayRef<std::pair<unsigned, const char *>>
+ getSerializableDirectMachineOperandTargetFlags() const override;
+ ArrayRef<std::pair<unsigned, const char *>>
+ getSerializableBitmaskMachineOperandTargetFlags() const override;
+
private:
void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override;
};
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index eb8526bfeadf..d4c342cee5c0 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -105,6 +105,14 @@ def ARMSmlaldx : SDNode<"ARMISD::SMLALDX", SDT_LongMac>;
def ARMSmlsld : SDNode<"ARMISD::SMLSLD", SDT_LongMac>;
def ARMSmlsldx : SDNode<"ARMISD::SMLSLDX", SDT_LongMac>;
+def SDT_MulHSR : SDTypeProfile<1, 3, [SDTCisVT<0,i32>,
+ SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>]>;
+
+def ARMsmmlar : SDNode<"ARMISD::SMMLAR", SDT_MulHSR>;
+def ARMsmmlsr : SDNode<"ARMISD::SMMLSR", SDT_MulHSR>;
+
// Node definitions.
def ARMWrapper : SDNode<"ARMISD::Wrapper", SDTIntUnaryOp>;
def ARMWrapperPIC : SDNode<"ARMISD::WrapperPIC", SDTIntUnaryOp>;
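For reference, a scalar sketch of the rounding multiply-accumulate forms the new ARMsmmlar/ARMsmmlsr nodes model, following the usual ARM definition (background material, not code from this patch; assumes arithmetic right shift of int64_t):

  #include <cstdint>

  // smmlar: Rd = (Ra*2^32 + Rn*Rm + 0x80000000) >> 32
  static int32_t smmlar_ref(int32_t Ra, int32_t Rn, int32_t Rm) {
    int64_t Acc = (int64_t)Ra * (1LL << 32) + (int64_t)Rn * (int64_t)Rm;
    return (int32_t)((Acc + 0x80000000LL) >> 32);
  }

  // smmlsr: Rd = (Ra*2^32 - Rn*Rm + 0x80000000) >> 32
  static int32_t smmlsr_ref(int32_t Ra, int32_t Rn, int32_t Rm) {
    int64_t Acc = (int64_t)Ra * (1LL << 32) - (int64_t)Rn * (int64_t)Rm;
    return (int32_t)((Acc + 0x80000000LL) >> 32);
  }

SMMULR further down is selected as (ARMsmmlar Rn, Rm, (i32 0)), i.e. the accumulating form with a zero accumulator.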
@@ -245,6 +253,8 @@ def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">,
AssemblerPredicate<"HasV8_2aOps", "armv8.2a">;
def HasV8_3a : Predicate<"Subtarget->hasV8_3aOps()">,
AssemblerPredicate<"HasV8_3aOps", "armv8.3a">;
+def HasV8_4a : Predicate<"Subtarget->hasV8_4aOps()">,
+ AssemblerPredicate<"HasV8_4aOps", "armv8.4a">;
def NoVFP : Predicate<"!Subtarget->hasVFP2()">;
def HasVFP2 : Predicate<"Subtarget->hasVFP2()">,
AssemblerPredicate<"FeatureVFP2", "VFP2">;
@@ -259,6 +269,10 @@ def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">,
AssemblerPredicate<"FeatureFPARMv8", "FPARMv8">;
def HasNEON : Predicate<"Subtarget->hasNEON()">,
AssemblerPredicate<"FeatureNEON", "NEON">;
+def HasSHA2 : Predicate<"Subtarget->hasSHA2()">,
+ AssemblerPredicate<"FeatureSHA2", "sha2">;
+def HasAES : Predicate<"Subtarget->hasAES()">,
+ AssemblerPredicate<"FeatureAES", "aes">;
def HasCrypto : Predicate<"Subtarget->hasCrypto()">,
AssemblerPredicate<"FeatureCrypto", "crypto">;
def HasDotProd : Predicate<"Subtarget->hasDotProd()">,
@@ -875,6 +889,16 @@ def bf_inv_mask_imm : Operand<i32>,
let PrintMethod = "printBitfieldInvMaskImmOperand";
let DecoderMethod = "DecodeBitfieldMaskOperand";
let ParserMatchClass = BitfieldAsmOperand;
+ let GISelPredicateCode = [{
+    // There are better methods of implementing this check. IntImmLeaf<> would be
+ // equivalent and have less boilerplate but we need a test for C++
+ // predicates and this one causes new rules to be imported into GlobalISel
+ // without requiring additional features first.
+ const auto &MO = MI.getOperand(1);
+ if (!MO.isCImm())
+ return false;
+ return ARM::isBitFieldInvertedMask(MO.getCImm()->getZExtValue());
+ }];
}
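For context, the masks this predicate accepts are the BFC/BFI-style "inverted" masks: the complement of a single contiguous run of set bits. A standalone restatement of the check (my own sketch, not the ARM::isBitFieldInvertedMask implementation):

  #include <cstdint>

  // True when ~V is one contiguous, non-empty run of ones, e.g. 0xffff00ff
  // (clears one byte) passes, while 0xff00ff00 (two separate runs) does not.
  static bool isInvertedBitFieldMask(uint32_t V) {
    if (V == 0xffffffffu)
      return false;                   // nothing would be cleared
    uint32_t M = ~V;                  // the bits the instruction clears
    uint32_t Low = M & (0u - M);      // lowest set bit of M
    return (M & (M + Low)) == 0;      // contiguous-run check
  }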
def imm1_32_XFORM: SDNodeXForm<imm, [{
@@ -1996,6 +2020,7 @@ def : InstAlias<"wfi$p", (HINT 3, pred:$p)>, Requires<[IsARM, HasV6K]>;
def : InstAlias<"sev$p", (HINT 4, pred:$p)>, Requires<[IsARM, HasV6K]>;
def : InstAlias<"sevl$p", (HINT 5, pred:$p)>, Requires<[IsARM, HasV8]>;
def : InstAlias<"esb$p", (HINT 16, pred:$p)>, Requires<[IsARM, HasRAS]>;
+def : InstAlias<"csdb$p", (HINT 20, pred:$p)>, Requires<[IsARM, HasV6K]>;
def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel",
"\t$Rd, $Rn, $Rm",
@@ -3331,7 +3356,7 @@ defm sysSTM : arm_ldst_mult<"stm", " ^", 0, 1, LdStMulFrm, IIC_iStore_m,
// Move Instructions.
//
-let hasSideEffects = 0 in
+let hasSideEffects = 0, isMoveReg = 1 in
def MOVr : AsI1<0b1101, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMOVr,
"mov", "\t$Rd, $Rm", []>, UnaryDP, Sched<[WriteALU]> {
bits<4> Rd;
@@ -3904,6 +3929,8 @@ def MVNr : AsI1<0b1111, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMVNr,
let Inst{11-4} = 0b00000000;
let Inst{15-12} = Rd;
let Inst{3-0} = Rm;
+
+ let Unpredictable{19-16} = 0b1111;
}
def MVNsi : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_imm:$shift),
DPSoRegImmFrm, IIC_iMVNsr, "mvn", "\t$Rd, $shift",
@@ -3917,10 +3944,12 @@ def MVNsi : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_imm:$shift),
let Inst{11-5} = shift{11-5};
let Inst{4} = 0;
let Inst{3-0} = shift{3-0};
+
+ let Unpredictable{19-16} = 0b1111;
}
-def MVNsr : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_reg:$shift),
+def MVNsr : AsI1<0b1111, (outs GPRnopc:$Rd), (ins so_reg_reg:$shift),
DPSoRegRegFrm, IIC_iMVNsr, "mvn", "\t$Rd, $shift",
- [(set GPR:$Rd, (not so_reg_reg:$shift))]>, UnaryDP,
+ [(set GPRnopc:$Rd, (not so_reg_reg:$shift))]>, UnaryDP,
Sched<[WriteALU]> {
bits<4> Rd;
bits<12> shift;
@@ -3932,6 +3961,8 @@ def MVNsr : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_reg:$shift),
let Inst{6-5} = shift{6-5};
let Inst{4} = 1;
let Inst{3-0} = shift{3-0};
+
+ let Unpredictable{19-16} = 0b1111;
}
let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
def MVNi : AsI1<0b1111, (outs GPR:$Rd), (ins mod_imm:$imm), DPFrm,
@@ -4143,7 +4174,8 @@ def SMMUL : AMul2I <0b0111010, 0b0001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
}
def SMMULR : AMul2I <0b0111010, 0b0011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
- IIC_iMUL32, "smmulr", "\t$Rd, $Rn, $Rm", []>,
+ IIC_iMUL32, "smmulr", "\t$Rd, $Rn, $Rm",
+ [(set GPR:$Rd, (ARMsmmlar GPR:$Rn, GPR:$Rm, (i32 0)))]>,
Requires<[IsARM, HasV6]>,
Sched<[WriteMUL32, ReadMUL, ReadMUL]> {
let Inst{15-12} = 0b1111;
@@ -4158,7 +4190,8 @@ def SMMLA : AMul2Ia <0b0111010, 0b0001, (outs GPR:$Rd),
def SMMLAR : AMul2Ia <0b0111010, 0b0011, (outs GPR:$Rd),
(ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
- IIC_iMAC32, "smmlar", "\t$Rd, $Rn, $Rm, $Ra", []>,
+ IIC_iMAC32, "smmlar", "\t$Rd, $Rn, $Rm, $Ra",
+ [(set GPR:$Rd, (ARMsmmlar GPR:$Rn, GPR:$Rm, GPR:$Ra))]>,
Requires<[IsARM, HasV6]>,
Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
@@ -4170,7 +4203,8 @@ def SMMLS : AMul2Ia <0b0111010, 0b1101, (outs GPR:$Rd),
def SMMLSR : AMul2Ia <0b0111010, 0b1111, (outs GPR:$Rd),
(ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
- IIC_iMAC32, "smmlsr", "\t$Rd, $Rn, $Rm, $Ra", []>,
+ IIC_iMAC32, "smmlsr", "\t$Rd, $Rn, $Rm, $Ra",
+ [(set GPR:$Rd, (ARMsmmlsr GPR:$Rn, GPR:$Rm, GPR:$Ra))]>,
Requires<[IsARM, HasV6]>,
Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
@@ -4785,6 +4819,15 @@ def instsyncb_opt : Operand<i32> {
let DecoderMethod = "DecodeInstSyncBarrierOption";
}
+def TraceSyncBarrierOptOperand : AsmOperandClass {
+ let Name = "TraceSyncBarrierOpt";
+ let ParserMethod = "parseTraceSyncBarrierOptOperand";
+}
+def tsb_opt : Operand<i32> {
+ let PrintMethod = "printTraceSyncBOption";
+ let ParserMatchClass = TraceSyncBarrierOptOperand;
+}
+
// Memory barriers protect the atomic sequences
let hasSideEffects = 1 in {
def DMB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
@@ -4811,6 +4854,13 @@ def ISB : AInoP<(outs), (ins instsyncb_opt:$opt), MiscFrm, NoItinerary,
let Inst{31-4} = 0xf57ff06;
let Inst{3-0} = opt;
}
+
+let hasNoSchedulingInfo = 1 in
+def TSB : AInoP<(outs), (ins tsb_opt:$opt), MiscFrm, NoItinerary,
+ "tsb", "\t$opt", []>, Requires<[IsARM, HasV8_4a]> {
+ let Inst{31-0} = 0xe320f012;
+}
+
}
let usesCustomInserter = 1, Defs = [CPSR] in {
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index cd67dded5853..4525eec8da03 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -48,46 +48,28 @@ def nImmVMOVI32 : Operand<i32> {
let ParserMatchClass = nImmVMOVI32AsmOperand;
}
-def nImmVMOVI16AsmOperandByteReplicate :
- AsmOperandClass {
- let Name = "NEONi16vmovByteReplicate";
- let PredicateMethod = "isNEONi16ByteReplicate";
- let RenderMethod = "addNEONvmovByteReplicateOperands";
-}
-def nImmVMOVI32AsmOperandByteReplicate :
- AsmOperandClass {
- let Name = "NEONi32vmovByteReplicate";
- let PredicateMethod = "isNEONi32ByteReplicate";
- let RenderMethod = "addNEONvmovByteReplicateOperands";
-}
-def nImmVMVNI16AsmOperandByteReplicate :
- AsmOperandClass {
- let Name = "NEONi16invByteReplicate";
- let PredicateMethod = "isNEONi16ByteReplicate";
- let RenderMethod = "addNEONinvByteReplicateOperands";
-}
-def nImmVMVNI32AsmOperandByteReplicate :
- AsmOperandClass {
- let Name = "NEONi32invByteReplicate";
- let PredicateMethod = "isNEONi32ByteReplicate";
- let RenderMethod = "addNEONinvByteReplicateOperands";
-}
-
-def nImmVMOVI16ByteReplicate : Operand<i32> {
- let PrintMethod = "printNEONModImmOperand";
- let ParserMatchClass = nImmVMOVI16AsmOperandByteReplicate;
+class nImmVMOVIAsmOperandReplicate<ValueType From, ValueType To>
+ : AsmOperandClass {
+ let Name = "NEONi" # To.Size # "vmovi" # From.Size # "Replicate";
+ let PredicateMethod = "isNEONmovReplicate<" # From.Size # ", " # To.Size # ">";
+ let RenderMethod = "addNEONvmovi" # From.Size # "ReplicateOperands";
}
-def nImmVMOVI32ByteReplicate : Operand<i32> {
- let PrintMethod = "printNEONModImmOperand";
- let ParserMatchClass = nImmVMOVI32AsmOperandByteReplicate;
+
+class nImmVINVIAsmOperandReplicate<ValueType From, ValueType To>
+ : AsmOperandClass {
+ let Name = "NEONi" # To.Size # "invi" # From.Size # "Replicate";
+ let PredicateMethod = "isNEONinvReplicate<" # From.Size # ", " # To.Size # ">";
+ let RenderMethod = "addNEONinvi" # From.Size # "ReplicateOperands";
}
-def nImmVMVNI16ByteReplicate : Operand<i32> {
+
+class nImmVMOVIReplicate<ValueType From, ValueType To> : Operand<i32> {
let PrintMethod = "printNEONModImmOperand";
- let ParserMatchClass = nImmVMVNI16AsmOperandByteReplicate;
+ let ParserMatchClass = nImmVMOVIAsmOperandReplicate<From, To>;
}
-def nImmVMVNI32ByteReplicate : Operand<i32> {
+
+class nImmVINVIReplicate<ValueType From, ValueType To> : Operand<i32> {
let PrintMethod = "printNEONModImmOperand";
- let ParserMatchClass = nImmVMVNI32AsmOperandByteReplicate;
+ let ParserMatchClass = nImmVINVIAsmOperandReplicate<From, To>;
}
def nImmVMOVI32NegAsmOperand : AsmOperandClass { let Name = "NEONi32vmovNeg"; }
@@ -227,7 +209,7 @@ def VecListDPairSpacedAllLanesAsmOperand : AsmOperandClass {
let ParserMethod = "parseVectorList";
let RenderMethod = "addVecListOperands";
}
-def VecListDPairSpacedAllLanes : RegisterOperand<DPair,
+def VecListDPairSpacedAllLanes : RegisterOperand<DPairSpc,
"printVectorListTwoSpacedAllLanes"> {
let ParserMatchClass = VecListDPairSpacedAllLanesAsmOperand;
}
@@ -788,10 +770,22 @@ defm VLD1d16Twb : VLD1D3WB<{0,1,0,?}, "16", addrmode6align64>;
defm VLD1d32Twb : VLD1D3WB<{1,0,0,?}, "32", addrmode6align64>;
defm VLD1d64Twb : VLD1D3WB<{1,1,0,?}, "64", addrmode6align64>;
+def VLD1d8TPseudo : VLDQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1d16TPseudo : VLDQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1d32TPseudo : VLDQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
def VLD1d64TPseudo : VLDQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
def VLD1d64TPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
def VLD1d64TPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1q8HighTPseudo : VLDQQQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1q8LowTPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1q16HighTPseudo : VLDQQQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1q16LowTPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1q32HighTPseudo : VLDQQQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1q32LowTPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1q64HighTPseudo : VLDQQQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1q64LowTPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+
// ...with 4 registers
class VLD1D4<bits<4> op7_4, string Dt, Operand AddrMode>
: NLdSt<0, 0b10, 0b0010, op7_4, (outs VecListFourD:$Vd),
@@ -829,10 +823,22 @@ defm VLD1d16Qwb : VLD1D4WB<{0,1,?,?}, "16", addrmode6align64or128or256>;
defm VLD1d32Qwb : VLD1D4WB<{1,0,?,?}, "32", addrmode6align64or128or256>;
defm VLD1d64Qwb : VLD1D4WB<{1,1,?,?}, "64", addrmode6align64or128or256>;
+def VLD1d8QPseudo : VLDQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1d16QPseudo : VLDQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1d32QPseudo : VLDQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
def VLD1d64QPseudo : VLDQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
def VLD1d64QPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
def VLD1d64QPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1q8LowQPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1q8HighQPseudo : VLDQQQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1q16LowQPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1q16HighQPseudo : VLDQQQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1q32LowQPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1q32HighQPseudo : VLDQQQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1q64LowQPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1q64HighQPseudo : VLDQQQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+
// VLD2 : Vector Load (multiple 2-element structures)
class VLD2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy,
InstrItinClass itin, Operand AddrMode>
@@ -1512,6 +1518,13 @@ def VLD2DUPd16x2 : VLD2DUP<{0,1,1,?}, "16", VecListDPairSpacedAllLanes,
def VLD2DUPd32x2 : VLD2DUP<{1,0,1,?}, "32", VecListDPairSpacedAllLanes,
addrmode6dupalign64>;
+def VLD2DUPq8EvenPseudo : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq8OddPseudo : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq16EvenPseudo : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq16OddPseudo : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq32EvenPseudo : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq32OddPseudo : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+
// ...with address register writeback:
multiclass VLD2DUPWB<bits<4> op7_4, string Dt, RegisterOperand VdTy,
Operand AddrMode> {
@@ -1572,6 +1585,13 @@ def VLD3DUPq8 : VLD3DUP<{0,0,1,?}, "8">;
def VLD3DUPq16 : VLD3DUP<{0,1,1,?}, "16">;
def VLD3DUPq32 : VLD3DUP<{1,0,1,?}, "32">;
+def VLD3DUPq8EvenPseudo : VLDQQQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>;
+def VLD3DUPq8OddPseudo : VLDQQQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>;
+def VLD3DUPq16EvenPseudo : VLDQQQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>;
+def VLD3DUPq16OddPseudo : VLDQQQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>;
+def VLD3DUPq32EvenPseudo : VLDQQQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>;
+def VLD3DUPq32OddPseudo : VLDQQQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>;
+
// ...with address register writeback:
class VLD3DUPWB<bits<4> op7_4, string Dt, Operand AddrMode>
: NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb),
@@ -1618,6 +1638,13 @@ def VLD4DUPq8 : VLD4DUP<{0,0,1,?}, "8">;
def VLD4DUPq16 : VLD4DUP<{0,1,1,?}, "16">;
def VLD4DUPq32 : VLD4DUP<{1,?,1,?}, "32"> { let Inst{6} = Rn{5}; }
+def VLD4DUPq8EvenPseudo : VLDQQQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>;
+def VLD4DUPq8OddPseudo : VLDQQQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>;
+def VLD4DUPq16EvenPseudo : VLDQQQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>;
+def VLD4DUPq16OddPseudo : VLDQQQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>;
+def VLD4DUPq32EvenPseudo : VLDQQQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>;
+def VLD4DUPq32OddPseudo : VLDQQQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>;
+
// ...with address register writeback:
class VLD4DUPWB<bits<4> op7_4, string Dt>
: NLdSt<1, 0b10, 0b1111, op7_4,
@@ -1795,10 +1822,22 @@ defm VST1d16Twb : VST1D3WB<{0,1,0,?}, "16", addrmode6align64>;
defm VST1d32Twb : VST1D3WB<{1,0,0,?}, "32", addrmode6align64>;
defm VST1d64Twb : VST1D3WB<{1,1,0,?}, "64", addrmode6align64>;
+def VST1d8TPseudo : VSTQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+def VST1d16TPseudo : VSTQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+def VST1d32TPseudo : VSTQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
def VST1d64TPseudo : VSTQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
def VST1d64TPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x3u>, Sched<[WriteVST3]>;
def VST1d64TPseudoWB_register : VSTQQWBPseudo<IIC_VST1x3u>, Sched<[WriteVST3]>;
+def VST1q8HighTPseudo : VSTQQQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+def VST1q8LowTPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+def VST1q16HighTPseudo : VSTQQQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+def VST1q16LowTPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+def VST1q32HighTPseudo : VSTQQQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+def VST1q32LowTPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+def VST1q64HighTPseudo : VSTQQQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+def VST1q64LowTPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+
// ...with 4 registers
class VST1D4<bits<4> op7_4, string Dt, Operand AddrMode>
: NLdSt<0, 0b00, 0b0010, op7_4, (outs),
@@ -1838,10 +1877,22 @@ defm VST1d16Qwb : VST1D4WB<{0,1,?,?}, "16", addrmode6align64or128or256>;
defm VST1d32Qwb : VST1D4WB<{1,0,?,?}, "32", addrmode6align64or128or256>;
defm VST1d64Qwb : VST1D4WB<{1,1,?,?}, "64", addrmode6align64or128or256>;
+def VST1d8QPseudo : VSTQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+def VST1d16QPseudo : VSTQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+def VST1d32QPseudo : VSTQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
def VST1d64QPseudo : VSTQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
def VST1d64QPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x4u>, Sched<[WriteVST4]>;
def VST1d64QPseudoWB_register : VSTQQWBPseudo<IIC_VST1x4u>, Sched<[WriteVST4]>;
+def VST1q8HighQPseudo : VSTQQQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+def VST1q8LowQPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+def VST1q16HighQPseudo : VSTQQQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+def VST1q16LowQPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+def VST1q32HighQPseudo : VSTQQQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+def VST1q32LowQPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+def VST1q64HighQPseudo : VSTQQQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+def VST1q64LowQPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+
// VST2 : Vector Store (multiple 2-element structures)
class VST2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy,
InstrItinClass itin, Operand AddrMode>
@@ -4700,37 +4751,59 @@ def : Pat<(v4f32 (fma (fneg QPR:$Vn), QPR:$Vm, QPR:$src1)),
// We put them in the VFPV8 decoder namespace because the ARM and Thumb
// encodings are the same and thus no further bit twiddling is necessary
// in the disassembler.
-let Predicates = [HasDotProd], DecoderNamespace = "VFPV8" in {
-
-def VUDOTD : N3Vnp<0b11000, 0b10, 0b1101, 0b0, 0b1,
- (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm),
- N3RegFrm, IIC_VDOTPROD, "vudot", "u8", []>;
-def VSDOTD : N3Vnp<0b11000, 0b10, 0b1101, 0b0, 0b0,
- (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm),
- N3RegFrm, IIC_VDOTPROD, "vsdot", "s8", []>;
-def VUDOTQ : N3Vnp<0b11000, 0b10, 0b1101, 0b1, 0b1,
- (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm),
- N3RegFrm, IIC_VDOTPROD, "vudot", "u8", []>;
-def VSDOTQ : N3Vnp<0b11000, 0b10, 0b1101, 0b1, 0b0,
- (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm),
- N3RegFrm, IIC_VDOTPROD, "vsdot", "s8", []>;
+class VDOT<bit op6, bit op4, RegisterClass RegTy, string Asm, string AsmTy,
+ ValueType AccumTy, ValueType InputTy,
+ SDPatternOperator OpNode> :
+ N3Vnp<0b11000, 0b10, 0b1101, op6, op4, (outs RegTy:$dst),
+ (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm), N3RegFrm, IIC_VDOTPROD,
+ Asm, AsmTy,
+ [(set (AccumTy RegTy:$dst),
+ (OpNode (AccumTy RegTy:$Vd),
+ (InputTy RegTy:$Vn),
+ (InputTy RegTy:$Vm)))]> {
+ let Predicates = [HasDotProd];
+ let DecoderNamespace = "VFPV8";
+ let Constraints = "$dst = $Vd";
+}
+
+def VUDOTD : VDOT<0, 1, DPR, "vudot", "u8", v2i32, v8i8, int_arm_neon_udot>;
+def VSDOTD : VDOT<0, 0, DPR, "vsdot", "s8", v2i32, v8i8, int_arm_neon_sdot>;
+def VUDOTQ : VDOT<1, 1, QPR, "vudot", "u8", v4i32, v16i8, int_arm_neon_udot>;
+def VSDOTQ : VDOT<1, 0, QPR, "vsdot", "s8", v4i32, v16i8, int_arm_neon_sdot>;
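As background for the new dot-product intrinsic patterns: each 32-bit accumulator lane receives the sum of four byte-by-byte products from the corresponding input bytes. A scalar reference sketch for one D register of vudot.u8 (illustration only; lane and byte layout assumed little-endian):

  #include <cstddef>
  #include <cstdint>

  // Acc has 2 x i32 lanes; A and B each hold 8 bytes (one 64-bit D register).
  static void vudot_u8_ref(uint32_t Acc[2], const uint8_t A[8], const uint8_t B[8]) {
    for (size_t Lane = 0; Lane < 2; ++Lane)
      for (size_t I = 0; I < 4; ++I)
        Acc[Lane] += (uint32_t)A[4 * Lane + I] * (uint32_t)B[4 * Lane + I];
  }

vsdot.s8 is the same computation with signed bytes and a signed widening multiply.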
// Indexed dot product instructions:
-class DOTI<string opc, string dt, bit Q, bit U, RegisterClass Ty> :
- N3Vnp<0b11100, 0b10, 0b1101, Q, U,
- (outs Ty:$Vd), (ins Ty:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
- N3RegFrm, IIC_VDOTPROD, opc, dt, []> {
- bit lane;
- let Inst{5} = lane;
- let AsmString = !strconcat(opc, ".", dt, "\t$Vd, $Vn, $Vm$lane");
+multiclass DOTI<string opc, string dt, bit Q, bit U, RegisterClass Ty,
+ ValueType AccumType, ValueType InputType, SDPatternOperator OpNode,
+ dag RHS> {
+ def "" : N3Vnp<0b11100, 0b10, 0b1101, Q, U, (outs Ty:$dst),
+ (ins Ty:$Vd, Ty:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+ N3RegFrm, IIC_VDOTPROD, opc, dt, []> {
+ bit lane;
+ let Inst{5} = lane;
+ let AsmString = !strconcat(opc, ".", dt, "\t$Vd, $Vn, $Vm$lane");
+ let Constraints = "$dst = $Vd";
+ let Predicates = [HasDotProd];
+ let DecoderNamespace = "VFPV8";
+ }
+
+ def : Pat<
+ (AccumType (OpNode (AccumType Ty:$Vd),
+ (InputType Ty:$Vn),
+ (InputType (bitconvert (AccumType
+ (NEONvduplane (AccumType Ty:$Vm),
+ VectorIndex32:$lane)))))),
+ (!cast<Instruction>(NAME) Ty:$Vd, Ty:$Vn, RHS, VectorIndex32:$lane)>;
}
-def VUDOTDI : DOTI<"vudot", "u8", 0b0, 0b1, DPR>;
-def VSDOTDI : DOTI<"vsdot", "s8", 0b0, 0b0, DPR>;
-def VUDOTQI : DOTI<"vudot", "u8", 0b1, 0b1, QPR>;
-def VSDOTQI : DOTI<"vsdot", "s8", 0b1, 0b0, QPR>;
+defm VUDOTDI : DOTI<"vudot", "u8", 0b0, 0b1, DPR, v2i32, v8i8,
+ int_arm_neon_udot, (v2i32 DPR_VFP2:$Vm)>;
+defm VSDOTDI : DOTI<"vsdot", "s8", 0b0, 0b0, DPR, v2i32, v8i8,
+ int_arm_neon_sdot, (v2i32 DPR_VFP2:$Vm)>;
+defm VUDOTQI : DOTI<"vudot", "u8", 0b1, 0b1, QPR, v4i32, v16i8,
+ int_arm_neon_udot, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>;
+defm VSDOTQI : DOTI<"vsdot", "s8", 0b1, 0b0, QPR, v4i32, v16i8,
+ int_arm_neon_sdot, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>;
-} // HasDotProd
// ARMv8.3 complex operations
class BaseN3VCP8ComplexTied<bit op21, bit op4, bit s, bit q,
@@ -5340,23 +5413,19 @@ defm VABDLs : N3VLIntExt_QHS<0,1,0b0111,0, IIC_VSUBi4Q,
defm VABDLu : N3VLIntExt_QHS<1,1,0b0111,0, IIC_VSUBi4Q,
"vabdl", "u", int_arm_neon_vabdu, zext, 1>;
+def : Pat<(v8i16 (abs (sub (zext (v8i8 DPR:$opA)), (zext (v8i8 DPR:$opB))))),
+ (VABDLuv8i16 DPR:$opA, DPR:$opB)>;
+def : Pat<(v4i32 (abs (sub (zext (v4i16 DPR:$opA)), (zext (v4i16 DPR:$opB))))),
+ (VABDLuv4i32 DPR:$opA, DPR:$opB)>;
+
+// ISD::ABS is not legal for v2i64, so VABDL needs to be matched from the
+// shift/xor pattern for ABS.
+
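The shift/xor pattern referred to above is the standard branch-free absolute value. A scalar sketch of the identity the remaining v2i64 pattern matches (illustration only; assumes arithmetic right shift of int64_t):

  #include <cstdint>

  // abs(x) == (x + (x >> 63)) ^ (x >> 63); Sign is 0 for x >= 0 and -1 for x < 0.
  static int64_t abs_via_shift_xor(int64_t X) {
    int64_t Sign = X >> 63;    // corresponds to abd_shr with shift == 63
    return (X + Sign) ^ Sign;  // the add/xor shape matched by the pattern
  }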
def abd_shr :
PatFrag<(ops node:$in1, node:$in2, node:$shift),
(NEONvshrs (sub (zext node:$in1),
(zext node:$in2)), (i32 $shift))>;
-def : Pat<(xor (v4i32 (bitconvert (v8i16 (abd_shr (v8i8 DPR:$opA), (v8i8 DPR:$opB), 15)))),
- (v4i32 (bitconvert (v8i16 (add (sub (zext (v8i8 DPR:$opA)),
- (zext (v8i8 DPR:$opB))),
- (v8i16 (abd_shr (v8i8 DPR:$opA), (v8i8 DPR:$opB), 15))))))),
- (VABDLuv8i16 DPR:$opA, DPR:$opB)>;
-
-def : Pat<(xor (v4i32 (abd_shr (v4i16 DPR:$opA), (v4i16 DPR:$opB), 31)),
- (v4i32 (add (sub (zext (v4i16 DPR:$opA)),
- (zext (v4i16 DPR:$opB))),
- (abd_shr (v4i16 DPR:$opA), (v4i16 DPR:$opB), 31)))),
- (VABDLuv4i32 DPR:$opA, DPR:$opB)>;
-
def : Pat<(xor (v4i32 (bitconvert (v2i64 (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)))),
(v4i32 (bitconvert (v2i64 (add (sub (zext (v2i32 DPR:$opA)),
(zext (v2i32 DPR:$opB))),
@@ -5933,34 +6002,57 @@ def VMOVv4f32 : N1ModImm<1, 0b000, 0b1111, 0, 1, 0, 1, (outs QPR:$Vd),
} // isReMaterializable, isAsCheapAsAMove
// Add support for bytes replication feature, so it could be GAS compatible.
-// E.g. instructions below:
-// "vmov.i32 d0, 0xffffffff"
-// "vmov.i32 d0, 0xabababab"
-// "vmov.i16 d0, 0xabab"
-// are incorrect, but we could deal with such cases.
-// For last two instructions, for example, it should emit:
-// "vmov.i8 d0, 0xab"
-def : NEONInstAlias<"vmov${p}.i16 $Vd, $Vm",
- (VMOVv8i8 DPR:$Vd, nImmVMOVI16ByteReplicate:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmov${p}.i32 $Vd, $Vm",
- (VMOVv8i8 DPR:$Vd, nImmVMOVI32ByteReplicate:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmov${p}.i16 $Vd, $Vm",
- (VMOVv16i8 QPR:$Vd, nImmVMOVI16ByteReplicate:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmov${p}.i32 $Vd, $Vm",
- (VMOVv16i8 QPR:$Vd, nImmVMOVI32ByteReplicate:$Vm, pred:$p)>;
-
-// Also add same support for VMVN instructions. So instruction:
-// "vmvn.i32 d0, 0xabababab"
-// actually means:
-// "vmov.i8 d0, 0x54"
-def : NEONInstAlias<"vmvn${p}.i16 $Vd, $Vm",
- (VMOVv8i8 DPR:$Vd, nImmVMVNI16ByteReplicate:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmvn${p}.i32 $Vd, $Vm",
- (VMOVv8i8 DPR:$Vd, nImmVMVNI32ByteReplicate:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmvn${p}.i16 $Vd, $Vm",
- (VMOVv16i8 QPR:$Vd, nImmVMVNI16ByteReplicate:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmvn${p}.i32 $Vd, $Vm",
- (VMOVv16i8 QPR:$Vd, nImmVMVNI32ByteReplicate:$Vm, pred:$p)>;
+multiclass NEONImmReplicateI8InstAlias<ValueType To> {
+ // E.g. instructions below:
+ // "vmov.i32 d0, #0xffffffff"
+ // "vmov.i32 d0, #0xabababab"
+ // "vmov.i16 d0, #0xabab"
+ // are incorrect, but we could deal with such cases.
+ // For last two instructions, for example, it should emit:
+ // "vmov.i8 d0, #0xab"
+ def : NEONInstAlias<"vmov${p}.i" # To.Size # " $Vd, $Vm",
+ (VMOVv8i8 DPR:$Vd, nImmVMOVIReplicate<i8, To>:$Vm, pred:$p)>;
+ def : NEONInstAlias<"vmov${p}.i" # To.Size # " $Vd, $Vm",
+ (VMOVv16i8 QPR:$Vd, nImmVMOVIReplicate<i8, To>:$Vm, pred:$p)>;
+ // Also add same support for VMVN instructions. So instruction:
+ // "vmvn.i32 d0, #0xabababab"
+ // actually means:
+ // "vmov.i8 d0, #0x54"
+ def : NEONInstAlias<"vmvn${p}.i" # To.Size # " $Vd, $Vm",
+ (VMOVv8i8 DPR:$Vd, nImmVINVIReplicate<i8, To>:$Vm, pred:$p)>;
+ def : NEONInstAlias<"vmvn${p}.i" # To.Size # " $Vd, $Vm",
+ (VMOVv16i8 QPR:$Vd, nImmVINVIReplicate<i8, To>:$Vm, pred:$p)>;
+}
+
+defm : NEONImmReplicateI8InstAlias<i16>;
+defm : NEONImmReplicateI8InstAlias<i32>;
+defm : NEONImmReplicateI8InstAlias<i64>;
+
+// Similar to above for types other than i8, e.g.:
+// "vmov.i32 d0, #0xab00ab00" -> "vmov.i16 d0, #0xab00"
+// "vmvn.i64 q0, #0xab000000ab000000" -> "vmvn.i32 q0, #0xab000000"
+// In this case we do not canonicalize VMVN to VMOV
+multiclass NEONImmReplicateInstAlias<ValueType From, NeonI V8, NeonI V16,
+ NeonI NV8, NeonI NV16, ValueType To> {
+ def : NEONInstAlias<"vmov${p}.i" # To.Size # " $Vd, $Vm",
+ (V8 DPR:$Vd, nImmVMOVIReplicate<From, To>:$Vm, pred:$p)>;
+ def : NEONInstAlias<"vmov${p}.i" # To.Size # " $Vd, $Vm",
+ (V16 QPR:$Vd, nImmVMOVIReplicate<From, To>:$Vm, pred:$p)>;
+ def : NEONInstAlias<"vmvn${p}.i" # To.Size # " $Vd, $Vm",
+ (NV8 DPR:$Vd, nImmVMOVIReplicate<From, To>:$Vm, pred:$p)>;
+ def : NEONInstAlias<"vmvn${p}.i" # To.Size # " $Vd, $Vm",
+ (NV16 QPR:$Vd, nImmVMOVIReplicate<From, To>:$Vm, pred:$p)>;
+}
+
+defm : NEONImmReplicateInstAlias<i16, VMOVv4i16, VMOVv8i16,
+ VMVNv4i16, VMVNv8i16, i32>;
+defm : NEONImmReplicateInstAlias<i16, VMOVv4i16, VMOVv8i16,
+ VMVNv4i16, VMVNv8i16, i64>;
+defm : NEONImmReplicateInstAlias<i32, VMOVv2i32, VMOVv4i32,
+ VMVNv2i32, VMVNv4i32, i64>;
+// TODO: add "VMOV <-> VMVN" conversion for cases like
+// "vmov.i32 d0, #0xffaaffaa" -> "vmvn.i16 d0, #0x55"
+// "vmvn.i32 d0, #0xaaffaaff" -> "vmov.i16 d0, #0xff00"
// On some CPUs the two instructions "vmov.i32 dD, #0" and "vmov.i32 qD, #0"
// require zero cycles to execute so they should be used wherever possible for
@@ -6865,6 +6957,17 @@ class N3VSPat<SDNode OpNode, NeonI Inst>
(v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
SPR:$b, ssub_0)), DPR_VFP2)), ssub_0)>;
+class N3VSPatFP16<SDNode OpNode, NeonI Inst>
+ : NEONFPPat<(f16 (OpNode HPR:$a, HPR:$b)),
+ (EXTRACT_SUBREG
+ (v4f16 (COPY_TO_REGCLASS (Inst
+ (INSERT_SUBREG
+ (v4f16 (COPY_TO_REGCLASS (v4f16 (IMPLICIT_DEF)), DPR_VFP2)),
+ HPR:$a, ssub_0),
+ (INSERT_SUBREG
+ (v4f16 (COPY_TO_REGCLASS (v4f16 (IMPLICIT_DEF)), DPR_VFP2)),
+ HPR:$b, ssub_0)), DPR_VFP2)), ssub_0)>;
+
class N3VSMulOpPat<SDNode MulNode, SDNode OpNode, NeonI Inst>
: NEONFPPat<(f32 (OpNode SPR:$acc, (f32 (MulNode SPR:$a, SPR:$b)))),
(EXTRACT_SUBREG
@@ -6907,6 +7010,8 @@ def : N3VSMulOpPat<fmul, fsub, VFMSfd>,
Requires<[HasVFP4, UseNEONForFP, UseFusedMAC]>;
def : N2VSPat<fabs, VABSfd>;
def : N2VSPat<fneg, VNEGfd>;
+def : N3VSPatFP16<fmaxnan, VMAXhd>, Requires<[HasFullFP16]>;
+def : N3VSPatFP16<fminnan, VMINhd>, Requires<[HasFullFP16]>;
def : N3VSPat<fmaxnan, VMAXfd>, Requires<[HasNEON]>;
def : N3VSPat<fminnan, VMINfd>, Requires<[HasNEON]>;
def : NVCVTFIPat<fp_to_sint, VCVTf2sd>;
@@ -6930,6 +7035,9 @@ def : VFPPat<(f64 (uint_to_fp (extractelt (v4i32 QPR:$src), imm:$lane))),
def : Pat<(f32 (bitconvert GPR:$a)),
(EXTRACT_SUBREG (VMOVDRR GPR:$a, GPR:$a), ssub_0)>,
Requires<[HasNEON, DontUseVMOVSR]>;
+def : Pat<(arm_vmovsr GPR:$a),
+ (EXTRACT_SUBREG (VMOVDRR GPR:$a, GPR:$a), ssub_0)>,
+ Requires<[HasNEON, DontUseVMOVSR]>;
//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
@@ -6966,9 +7074,11 @@ def : Pat<(f64 (bitconvert (v1i64 DPR:$src))), (f64 DPR:$src)>;
let Predicates = [IsLE] in {
def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (f64 DPR:$src)>;
def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (f64 DPR:$src)>;
+ def : Pat<(f64 (bitconvert (v4f16 DPR:$src))), (f64 DPR:$src)>;
def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (f64 DPR:$src)>;
def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (f64 DPR:$src)>;
def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (v2f32 DPR:$src)>;
+ def : Pat<(v4f16 (bitconvert (f64 DPR:$src))), (v4f16 DPR:$src)>;
def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (v2f32 DPR:$src)>;
}
def : Pat<(v2f32 (bitconvert (v2i32 DPR:$src))), (v2f32 DPR:$src)>;
@@ -6997,6 +7107,7 @@ let Predicates = [IsLE] in {
def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>;
def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>;
def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>;
+ def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (v8f16 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>;
@@ -7014,6 +7125,7 @@ def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>;
let Predicates = [IsLE] in {
def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>;
def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (v2f64 QPR:$src)>;
def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>;
def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>;
}
@@ -7039,6 +7151,7 @@ let Predicates = [IsBE] in {
def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (VREV64d8 DPR:$src)>;
def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (VREV32d8 DPR:$src)>;
def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>;
+ def : Pat<(f64 (bitconvert (v4f16 DPR:$src))), (VREV64d16 DPR:$src)>;
def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>;
def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (VREV64d8 DPR:$src)>;
def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>;
@@ -7060,6 +7173,7 @@ let Predicates = [IsBE] in {
def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (VREV32q16 QPR:$src)>;
def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (VREV16q8 QPR:$src)>;
def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>;
+ def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>;
def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (VREV32q16 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (VREV64q8 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (VREV32q8 QPR:$src)>;
@@ -7068,10 +7182,12 @@ let Predicates = [IsBE] in {
def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (VREV32q8 QPR:$src)>;
def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>;
def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (VREV32q16 QPR:$src)>;
def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (VREV32q8 QPR:$src)>;
def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>;
def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>;
def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (VREV64q16 QPR:$src)>;
def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (VREV64q8 QPR:$src)>;
def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>;
}
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index c2bcc087e077..88aab47a79bf 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -270,6 +270,14 @@ def t_addrmode_sp : MemOperand,
let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
}
+// Inspects parent to determine whether an or instruction can be implemented as
+// an add (i.e. whether we know overflow won't occur in the add).
+def AddLikeOrOp : ComplexPattern<i32, 1, "SelectAddLikeOr", [],
+ [SDNPWantParent]>;
+
+// Pattern to exclude immediates from matching
+def non_imm32 : PatLeaf<(i32 GPR), [{ return !isa<ConstantSDNode>(N); }]>;
+
//===----------------------------------------------------------------------===//
// Miscellaneous Instructions.
//
@@ -997,6 +1005,15 @@ let isAdd = 1 in {
}
}
+// Thumb has more flexible short encodings for ADD than ORR, so use those where
+// possible.
+def : T1Pat<(or AddLikeOrOp:$Rn, imm0_7:$imm), (tADDi3 $Rn, imm0_7:$imm)>;
+
+def : T1Pat<(or AddLikeOrOp:$Rn, imm8_255:$imm), (tADDi8 $Rn, imm8_255:$imm)>;
+
+def : T1Pat<(or AddLikeOrOp:$Rn, tGPR:$Rm), (tADDrr $Rn, $Rm)>;
+
+
def : tInstAlias <"add${s}${p} $Rdn, $Rm",
(tADDrr tGPR:$Rdn,s_cc_out:$s, tGPR:$Rdn, tGPR:$Rm, pred:$p)>;
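The AddLikeOrOp patterns above (and their Thumb2 counterparts later in this diff) rest on a simple identity: when the two operands have no set bits in common, no carries can occur, so OR and ADD produce the same value and the shorter ADD encodings are safe substitutes. A scalar sketch of the property (illustration only):

  #include <cassert>
  #include <cstdint>

  // The documented contract of AddLikeOrOp is that the OR can be implemented
  // as an ADD; disjoint operands are the textbook case of that.
  static uint32_t or_as_add(uint32_t A, uint32_t B) {
    assert((A & B) == 0 && "operands must not share set bits");
    return A + B; // equal to A | B under the assertion
  }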
@@ -1154,7 +1171,7 @@ def : tInstAlias <"movs $Rdn, $imm",
// A7-73: MOV(2) - mov setting flag.
-let hasSideEffects = 0 in {
+let hasSideEffects = 0, isMoveReg = 1 in {
def tMOVr : Thumb1pI<(outs GPR:$Rd), (ins GPR:$Rm), AddrModeNone,
2, IIC_iMOVr,
"mov", "\t$Rd, $Rm", "", []>,
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 4592249f5795..c7133b6483ef 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -2104,6 +2104,12 @@ def : t2InstSubst<"sub${s}${p}.w $rd, $rn, $imm",
(t2ADDri GPRnopc:$rd, GPRnopc:$rn, t2_so_imm_neg:$imm, pred:$p, s_cc_out:$s)>;
def : t2InstSubst<"subw${p} $rd, $rn, $imm",
(t2ADDri12 GPRnopc:$rd, GPR:$rn, t2_so_imm_neg:$imm, pred:$p)>;
+def : t2InstSubst<"subw${p} $Rd, $Rn, $imm",
+ (t2ADDri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>;
+def : t2InstSubst<"sub${s}${p} $rd, $rn, $imm",
+ (t2ADDri GPRnopc:$rd, GPRnopc:$rn, t2_so_imm_neg:$imm, pred:$p, s_cc_out:$s)>;
+def : t2InstSubst<"sub${p} $rd, $rn, $imm",
+ (t2ADDri12 GPRnopc:$rd, GPR:$rn, t2_so_imm_neg:$imm, pred:$p)>;
// RSB
defm t2RSB : T2I_rbin_irs <0b1110, "rsb", sub>;
@@ -2594,6 +2600,18 @@ def : T2Pat<(or rGPR:$src, t2_so_imm_not:$imm),
def : T2Pat<(t2_so_imm_not:$src),
(t2MVNi t2_so_imm_not:$src)>;
+// There are shorter Thumb encodings for ADD than ORR, so to increase
+// Thumb2SizeReduction's chances later on we select a t2ADD for an or where
+// possible.
+def : T2Pat<(or AddLikeOrOp:$Rn, t2_so_imm:$imm),
+ (t2ADDri $Rn, t2_so_imm:$imm)>;
+
+def : T2Pat<(or AddLikeOrOp:$Rn, imm0_4095:$Rm),
+ (t2ADDri12 $Rn, imm0_4095:$Rm)>;
+
+def : T2Pat<(or AddLikeOrOp:$Rn, non_imm32:$Rm),
+ (t2ADDrr $Rn, $Rm)>;
+
//===----------------------------------------------------------------------===//
// Multiply Instructions.
//
@@ -2661,7 +2679,9 @@ class T2SMMUL<bits<4> op7_4, string opc, list<dag> pattern>
}
def t2SMMUL : T2SMMUL<0b0000, "smmul", [(set rGPR:$Rd, (mulhs rGPR:$Rn,
rGPR:$Rm))]>;
-def t2SMMULR : T2SMMUL<0b0001, "smmulr", []>;
+def t2SMMULR :
+ T2SMMUL<0b0001, "smmulr",
+ [(set rGPR:$Rd, (ARMsmmlar rGPR:$Rn, rGPR:$Rm, (i32 0)))]>;
class T2FourRegSMMLA<bits<3> op22_20, bits<4> op7_4, string opc,
list<dag> pattern>
@@ -2677,9 +2697,11 @@ class T2FourRegSMMLA<bits<3> op22_20, bits<4> op7_4, string opc,
def t2SMMLA : T2FourRegSMMLA<0b101, 0b0000, "smmla",
[(set rGPR:$Rd, (add (mulhs rGPR:$Rm, rGPR:$Rn), rGPR:$Ra))]>;
-def t2SMMLAR: T2FourRegSMMLA<0b101, 0b0001, "smmlar", []>;
+def t2SMMLAR: T2FourRegSMMLA<0b101, 0b0001, "smmlar",
+ [(set rGPR:$Rd, (ARMsmmlar rGPR:$Rn, rGPR:$Rm, rGPR:$Ra))]>;
def t2SMMLS: T2FourRegSMMLA<0b110, 0b0000, "smmls", []>;
-def t2SMMLSR: T2FourRegSMMLA<0b110, 0b0001, "smmlsr", []>;
+def t2SMMLSR: T2FourRegSMMLA<0b110, 0b0001, "smmlsr",
+ [(set rGPR:$Rd, (ARMsmmlsr rGPR:$Rn, rGPR:$Rm, rGPR:$Ra))]>;
class T2ThreeRegSMUL<bits<3> op22_20, bits<2> op5_4, string opc,
list<dag> pattern>
@@ -3193,6 +3215,12 @@ def t2ISB : T2I<(outs), (ins instsyncb_opt:$opt), NoItinerary,
let Inst{31-4} = 0xf3bf8f6;
let Inst{3-0} = opt;
}
+
+let hasNoSchedulingInfo = 1 in
+def t2TSB : T2I<(outs), (ins tsb_opt:$opt), NoItinerary,
+ "tsb", "\t$opt", []>, Requires<[IsThumb, HasV8_4a]> {
+ let Inst{31-0} = 0xf3af8012;
+}
}
class T2I_ldrex<bits<4> opcod, dag oops, dag iops, AddrMode am, int sz,
@@ -3696,6 +3724,8 @@ def : t2InstAlias<"esb$p.w", (t2HINT 16, pred:$p), 1> {
def : t2InstAlias<"esb$p", (t2HINT 16, pred:$p), 0> {
let Predicates = [IsThumb2, HasRAS];
}
+def : t2InstAlias<"csdb$p.w", (t2HINT 20, pred:$p), 0>;
+def : t2InstAlias<"csdb$p", (t2HINT 20, pred:$p), 1>;
def t2DBG : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "dbg", "\t$opt",
[(int_arm_dbg imm0_15:$opt)]> {
@@ -4713,12 +4743,24 @@ def : t2InstSubst<"bic${s}${p} $Rd, $Rn, $imm",
def : t2InstSubst<"bic${s}${p} $Rdn, $imm",
(t2ANDri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_not:$imm,
pred:$p, cc_out:$s)>;
+def : t2InstSubst<"bic${s}${p}.w $Rd, $Rn, $imm",
+ (t2ANDri rGPR:$Rd, rGPR:$Rn, t2_so_imm_not:$imm,
+ pred:$p, cc_out:$s)>;
+def : t2InstSubst<"bic${s}${p}.w $Rdn, $imm",
+ (t2ANDri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_not:$imm,
+ pred:$p, cc_out:$s)>;
def : t2InstSubst<"and${s}${p} $Rd, $Rn, $imm",
(t2BICri rGPR:$Rd, rGPR:$Rn, t2_so_imm_not:$imm,
pred:$p, cc_out:$s)>;
def : t2InstSubst<"and${s}${p} $Rdn, $imm",
(t2BICri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_not:$imm,
pred:$p, cc_out:$s)>;
+def : t2InstSubst<"and${s}${p}.w $Rd, $Rn, $imm",
+ (t2BICri rGPR:$Rd, rGPR:$Rn, t2_so_imm_not:$imm,
+ pred:$p, cc_out:$s)>;
+def : t2InstSubst<"and${s}${p}.w $Rdn, $imm",
+ (t2BICri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_not:$imm,
+ pred:$p, cc_out:$s)>;
// And ORR <--> ORN
def : t2InstSubst<"orn${s}${p} $Rd, $Rn, $imm",
(t2ORRri rGPR:$Rd, rGPR:$Rn, t2_so_imm_not:$imm,
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index 22e157a7480b..2f14b78c91fd 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -17,11 +17,19 @@ def SDT_VMOVDRR : SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1, i32>,
def SDT_VMOVRRD : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>,
SDTCisVT<2, f64>]>;
+def SDT_VMOVSR : SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisVT<1, i32>]>;
+
def arm_fmstat : SDNode<"ARMISD::FMSTAT", SDTNone, [SDNPInGlue, SDNPOutGlue]>;
def arm_cmpfp : SDNode<"ARMISD::CMPFP", SDT_ARMFCmp, [SDNPOutGlue]>;
def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0", SDT_CMPFP0, [SDNPOutGlue]>;
def arm_fmdrr : SDNode<"ARMISD::VMOVDRR", SDT_VMOVDRR>;
def arm_fmrrd : SDNode<"ARMISD::VMOVRRD", SDT_VMOVRRD>;
+def arm_vmovsr : SDNode<"ARMISD::VMOVSR", SDT_VMOVSR>;
+
+def SDT_VMOVhr : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, i32>] >;
+def SDT_VMOVrh : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisFP<1>] >;
+def arm_vmovhr : SDNode<"ARMISD::VMOVhr", SDT_VMOVhr>;
+def arm_vmovrh : SDNode<"ARMISD::VMOVrh", SDT_VMOVrh>;
//===----------------------------------------------------------------------===//
// Operand Definitions.
@@ -39,7 +47,7 @@ def vfp_f16imm : Operand<f16>,
}], SDNodeXForm<fpimm, [{
APFloat InVal = N->getValueAPF();
uint32_t enc = ARM_AM::getFP16Imm(InVal);
- return CurDAG->getTargetConstant(enc, MVT::i32);
+ return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
}]>> {
let PrintMethod = "printFPImmOperand";
let ParserMatchClass = FPImmOperand;
@@ -69,10 +77,19 @@ def vfp_f64imm : Operand<f64>,
let ParserMatchClass = FPImmOperand;
}
+def alignedload16 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() >= 2;
+}]>;
+
def alignedload32 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
return cast<LoadSDNode>(N)->getAlignment() >= 4;
}]>;
+def alignedstore16 : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAlignment() >= 2;
+}]>;
+
def alignedstore32 : PatFrag<(ops node:$val, node:$ptr),
(store node:$val, node:$ptr), [{
return cast<StoreSDNode>(N)->getAlignment() >= 4;
@@ -113,9 +130,9 @@ def VLDRS : ASI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5:$addr),
let D = VFPNeonDomain;
}
-def VLDRH : AHI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5fp16:$addr),
+def VLDRH : AHI5<0b1101, 0b01, (outs HPR:$Sd), (ins addrmode5fp16:$addr),
IIC_fpLoad16, "vldr", ".16\t$Sd, $addr",
- []>,
+ [(set HPR:$Sd, (alignedload16 addrmode5fp16:$addr))]>,
Requires<[HasFullFP16]>;
} // End of 'let canFoldAsLoad = 1, isReMaterializable = 1 in'
@@ -132,9 +149,9 @@ def VSTRS : ASI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5:$addr),
let D = VFPNeonDomain;
}
-def VSTRH : AHI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5fp16:$addr),
+def VSTRH : AHI5<0b1101, 0b00, (outs), (ins HPR:$Sd, addrmode5fp16:$addr),
IIC_fpStore16, "vstr", ".16\t$Sd, $addr",
- []>,
+ [(alignedstore16 HPR:$Sd, addrmode5fp16:$addr)]>,
Requires<[HasFullFP16]>;
//===----------------------------------------------------------------------===//
@@ -335,9 +352,9 @@ def VADDS : ASbIn<0b11100, 0b11, 0, 0,
let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VADDH : AHbI<0b11100, 0b11, 0, 0,
- (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpALU16, "vadd", ".f16\t$Sd, $Sn, $Sm",
- []>,
+ [(set HPR:$Sd, (fadd HPR:$Sn, HPR:$Sm))]>,
Sched<[WriteFPALU32]>;
let TwoOperandAliasConstraint = "$Dn = $Dd" in
@@ -360,9 +377,9 @@ def VSUBS : ASbIn<0b11100, 0b11, 1, 0,
let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VSUBH : AHbI<0b11100, 0b11, 1, 0,
- (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpALU16, "vsub", ".f16\t$Sd, $Sn, $Sm",
- []>,
+ [(set HPR:$Sd, (fsub HPR:$Sn, HPR:$Sm))]>,
Sched<[WriteFPALU32]>;
let TwoOperandAliasConstraint = "$Dn = $Dd" in
@@ -381,9 +398,9 @@ def VDIVS : ASbI<0b11101, 0b00, 0, 0,
let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VDIVH : AHbI<0b11101, 0b00, 0, 0,
- (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpDIV16, "vdiv", ".f16\t$Sd, $Sn, $Sm",
- []>,
+ [(set HPR:$Sd, (fdiv HPR:$Sn, HPR:$Sm))]>,
Sched<[WriteFPDIV32]>;
let TwoOperandAliasConstraint = "$Dn = $Dd" in
@@ -406,9 +423,9 @@ def VMULS : ASbIn<0b11100, 0b10, 0, 0,
let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VMULH : AHbI<0b11100, 0b10, 0, 0,
- (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpMUL16, "vmul", ".f16\t$Sd, $Sn, $Sm",
- []>,
+ [(set HPR:$Sd, (fmul HPR:$Sn, HPR:$Sm))]>,
Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>;
def VNMULD : ADbI<0b11100, 0b10, 1, 0,
@@ -428,18 +445,18 @@ def VNMULS : ASbI<0b11100, 0b10, 1, 0,
}
def VNMULH : AHbI<0b11100, 0b10, 1, 0,
- (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpMUL16, "vnmul", ".f16\t$Sd, $Sn, $Sm",
- []>,
+ [(set HPR:$Sd, (fneg (fmul HPR:$Sn, HPR:$Sm)))]>,
Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>;
multiclass vsel_inst<string op, bits<2> opc, int CC> {
let DecoderNamespace = "VFPV8", PostEncoderMethod = "",
Uses = [CPSR], AddedComplexity = 4 in {
def H : AHbInp<0b11100, opc, 0,
- (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
NoItinerary, !strconcat("vsel", op, ".f16\t$Sd, $Sn, $Sm"),
- []>,
+ [(set HPR:$Sd, (ARMcmov HPR:$Sm, HPR:$Sn, CC))]>,
Requires<[HasFullFP16]>;
def S : ASbInp<0b11100, opc, 0,
@@ -465,9 +482,9 @@ defm VSELVS : vsel_inst<"vs", 0b01, 6>;
multiclass vmaxmin_inst<string op, bit opc, SDNode SD> {
let DecoderNamespace = "VFPV8", PostEncoderMethod = "" in {
def H : AHbInp<0b11101, 0b00, opc,
- (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
NoItinerary, !strconcat(op, ".f16\t$Sd, $Sn, $Sm"),
- []>,
+ [(set HPR:$Sd, (SD HPR:$Sn, HPR:$Sm))]>,
Requires<[HasFullFP16]>;
def S : ASbInp<0b11101, 0b00, opc,
@@ -511,9 +528,9 @@ def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0,
}
def VCMPEH : AHuI<0b11101, 0b11, 0b0100, 0b11, 0,
- (outs), (ins SPR:$Sd, SPR:$Sm),
+ (outs), (ins HPR:$Sd, HPR:$Sm),
IIC_fpCMP16, "vcmpe", ".f16\t$Sd, $Sm",
- []>;
+ [(arm_cmpfp HPR:$Sd, HPR:$Sm, (i32 1))]>;
def VCMPD : ADuI<0b11101, 0b11, 0b0100, 0b01, 0,
(outs), (ins DPR:$Dd, DPR:$Dm),
@@ -530,9 +547,9 @@ def VCMPS : ASuI<0b11101, 0b11, 0b0100, 0b01, 0,
}
def VCMPH : AHuI<0b11101, 0b11, 0b0100, 0b01, 0,
- (outs), (ins SPR:$Sd, SPR:$Sm),
+ (outs), (ins HPR:$Sd, HPR:$Sm),
IIC_fpCMP16, "vcmp", ".f16\t$Sd, $Sm",
- []>;
+ [(arm_cmpfp HPR:$Sd, HPR:$Sm, (i32 0))]>;
} // Defs = [FPSCR_NZCV]
//===----------------------------------------------------------------------===//
@@ -580,9 +597,9 @@ def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0,
}
def VCMPEZH : AHuI<0b11101, 0b11, 0b0101, 0b11, 0,
- (outs), (ins SPR:$Sd),
+ (outs), (ins HPR:$Sd),
IIC_fpCMP16, "vcmpe", ".f16\t$Sd, #0",
- []> {
+ [(arm_cmpfp0 HPR:$Sd, (i32 1))]> {
let Inst{3-0} = 0b0000;
let Inst{5} = 0;
}
@@ -608,9 +625,9 @@ def VCMPZS : ASuI<0b11101, 0b11, 0b0101, 0b01, 0,
}
def VCMPZH : AHuI<0b11101, 0b11, 0b0101, 0b01, 0,
- (outs), (ins SPR:$Sd),
+ (outs), (ins HPR:$Sd),
IIC_fpCMP16, "vcmp", ".f16\t$Sd, #0",
- []> {
+ [(arm_cmpfp0 HPR:$Sd, (i32 0))]> {
let Inst{3-0} = 0b0000;
let Inst{5} = 0;
}
@@ -658,20 +675,29 @@ def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm,
let Predicates = [HasVFP2, HasDPVFP];
}
-// Between half, single and double-precision. For disassembly only.
-
+// Between half, single and double-precision.
def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm",
- [/* For disassembly only; pattern left blank */]>,
+ [/* Intentionally left blank, see patterns below */]>,
Requires<[HasFP16]>,
Sched<[WriteFPCVT]>;
+def : FullFP16Pat<(f32 (fpextend HPR:$Sm)),
+ (VCVTBHS (COPY_TO_REGCLASS HPR:$Sm, SPR))>;
+def : FP16Pat<(f16_to_fp GPR:$a),
+ (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>;
+
def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm",
- [/* For disassembly only; pattern left blank */]>,
+ [/* Intentionally left blank, see patterns below */]>,
Requires<[HasFP16]>,
Sched<[WriteFPCVT]>;
+def : FullFP16Pat<(f16 (fpround SPR:$Sm)),
+ (COPY_TO_REGCLASS (VCVTBSH SPR:$Sm), HPR)>;
+def : FP16Pat<(fp_to_f16 SPR:$a),
+ (i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>;
+
def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm",
[/* For disassembly only; pattern left blank */]>,
@@ -687,7 +713,8 @@ def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0,
(outs DPR:$Dd), (ins SPR:$Sm),
NoItinerary, "vcvtb", ".f64.f16\t$Dd, $Sm",
- []>, Requires<[HasFPARMv8, HasDPVFP]>,
+ [/* Intentionally left blank, see patterns below */]>,
+ Requires<[HasFPARMv8, HasDPVFP]>,
Sched<[WriteFPCVT]> {
// Instruction operands.
bits<5> Sm;
@@ -697,10 +724,16 @@ def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0,
let Inst{5} = Sm{0};
}
+def : FullFP16Pat<(f64 (fpextend HPR:$Sm)),
+ (VCVTBHD (COPY_TO_REGCLASS HPR:$Sm, SPR))>;
+def : FP16Pat<(f64 (f16_to_fp GPR:$a)),
+ (VCVTBHD (COPY_TO_REGCLASS GPR:$a, SPR))>;
+
def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0,
(outs SPR:$Sd), (ins DPR:$Dm),
NoItinerary, "vcvtb", ".f16.f64\t$Sd, $Dm",
- []>, Requires<[HasFPARMv8, HasDPVFP]> {
+ [/* Intentionally left blank, see patterns below */]>,
+ Requires<[HasFPARMv8, HasDPVFP]> {
// Instruction operands.
bits<5> Sd;
bits<5> Dm;
@@ -712,6 +745,11 @@ def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0,
let Inst{22} = Sd{0};
}
+def : FullFP16Pat<(f16 (fpround DPR:$Dm)),
+ (COPY_TO_REGCLASS (VCVTBDH DPR:$Dm), HPR)>;
+def : FP16Pat<(fp_to_f16 (f64 DPR:$a)),
+ (i32 (COPY_TO_REGCLASS (VCVTBDH DPR:$a), GPR))>;
+
def VCVTTHD : ADuI<0b11101, 0b11, 0b0010, 0b11, 0,
(outs DPR:$Dd), (ins SPR:$Sm),
NoItinerary, "vcvtt", ".f64.f16\t$Dd, $Sm",
@@ -739,23 +777,11 @@ def VCVTTDH : ADuI<0b11101, 0b11, 0b0011, 0b11, 0,
let Inst{5} = Dm{4};
}
-def : Pat<(fp_to_f16 SPR:$a),
- (i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>;
-
-def : Pat<(fp_to_f16 (f64 DPR:$a)),
- (i32 (COPY_TO_REGCLASS (VCVTBDH DPR:$a), GPR))>;
-
-def : Pat<(f16_to_fp GPR:$a),
- (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>;
-
-def : Pat<(f64 (f16_to_fp GPR:$a)),
- (VCVTBHD (COPY_TO_REGCLASS GPR:$a, SPR))>;
-
multiclass vcvt_inst<string opc, bits<2> rm,
SDPatternOperator node = null_frag> {
let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in {
def SH : AHuInp<0b11101, 0b11, 0b1100, 0b11, 0,
- (outs SPR:$Sd), (ins SPR:$Sm),
+ (outs SPR:$Sd), (ins HPR:$Sm),
NoItinerary, !strconcat("vcvt", opc, ".s32.f16\t$Sd, $Sm"),
[]>,
Requires<[HasFullFP16]> {
@@ -763,7 +789,7 @@ multiclass vcvt_inst<string opc, bits<2> rm,
}
def UH : AHuInp<0b11101, 0b11, 0b1100, 0b01, 0,
- (outs SPR:$Sd), (ins SPR:$Sm),
+ (outs SPR:$Sd), (ins HPR:$Sm),
NoItinerary, !strconcat("vcvt", opc, ".u32.f16\t$Sd, $Sm"),
[]>,
Requires<[HasFullFP16]> {
@@ -818,6 +844,17 @@ multiclass vcvt_inst<string opc, bits<2> rm,
}
let Predicates = [HasFPARMv8] in {
+ let Predicates = [HasFullFP16] in {
+ def : Pat<(i32 (fp_to_sint (node HPR:$a))),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(NAME#"SH") HPR:$a),
+ GPR)>;
+
+ def : Pat<(i32 (fp_to_uint (node HPR:$a))),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(NAME#"UH") HPR:$a),
+ GPR)>;
+ }
def : Pat<(i32 (fp_to_sint (node SPR:$a))),
(COPY_TO_REGCLASS
(!cast<Instruction>(NAME#"SS") SPR:$a),
@@ -859,9 +896,9 @@ def VNEGS : ASuIn<0b11101, 0b11, 0b0001, 0b01, 0,
}
def VNEGH : AHuI<0b11101, 0b11, 0b0001, 0b01, 0,
- (outs SPR:$Sd), (ins SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sm),
IIC_fpUNA16, "vneg", ".f16\t$Sd, $Sm",
- []>;
+ [(set HPR:$Sd, (fneg HPR:$Sm))]>;
multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node> {
def H : AHuI<0b11101, 0b11, 0b0110, 0b11, 0,
@@ -940,7 +977,7 @@ multiclass vrint_inst_anpm<string opc, bits<2> rm,
}
defm VRINTA : vrint_inst_anpm<"a", 0b00, fround>;
-defm VRINTN : vrint_inst_anpm<"n", 0b01>;
+defm VRINTN : vrint_inst_anpm<"n", 0b01, int_arm_neon_vrintn>;
defm VRINTP : vrint_inst_anpm<"p", 0b10, fceil>;
defm VRINTM : vrint_inst_anpm<"m", 0b11, ffloor>;
@@ -962,6 +999,7 @@ def VSQRTH : AHuI<0b11101, 0b11, 0b0001, 0b11, 0,
[]>;
let hasSideEffects = 0 in {
+let isMoveReg = 1 in {
def VMOVD : ADuI<0b11101, 0b11, 0b0000, 0b01, 0,
(outs DPR:$Dd), (ins DPR:$Dm),
IIC_fpUNA64, "vmov", ".f64\t$Dd, $Dm", []>;
@@ -969,6 +1007,7 @@ def VMOVD : ADuI<0b11101, 0b11, 0b0000, 0b01, 0,
def VMOVS : ASuI<0b11101, 0b11, 0b0000, 0b01, 0,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpUNA32, "vmov", ".f32\t$Sd, $Sm", []>;
+} // isMoveReg
let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in {
def VMOVH : ASuInp<0b11101, 0b11, 0b0000, 0b01, 0,
@@ -987,6 +1026,7 @@ def VINSH : ASuInp<0b11101, 0b11, 0b0000, 0b11, 0,
// FP <-> GPR Copies. Int <-> FP Conversions.
//
+let isMoveReg = 1 in {
def VMOVRS : AVConv2I<0b11100001, 0b1010,
(outs GPR:$Rt), (ins SPR:$Sn),
IIC_fpMOVSI, "vmov", "\t$Rt, $Sn",
@@ -1032,6 +1072,8 @@ def VMOVSR : AVConv4I<0b11100000, 0b1010,
// pipelines.
let D = VFPNeonDomain;
}
+} // isMoveReg
+def : Pat<(arm_vmovsr GPR:$Rt), (VMOVSR GPR:$Rt)>, Requires<[HasVFP2, UseVMOVSR]>;
let hasSideEffects = 0 in {
def VMOVRRD : AVConv3I<0b11000101, 0b1011,
@@ -1160,9 +1202,9 @@ def VMOVSRR : AVConv5I<0b11000100, 0b1010,
// Move H->R, clearing top 16 bits
def VMOVRH : AVConv2I<0b11100001, 0b1001,
- (outs GPR:$Rt), (ins SPR:$Sn),
+ (outs GPR:$Rt), (ins HPR:$Sn),
IIC_fpMOVSI, "vmov", ".f16\t$Rt, $Sn",
- []>,
+ [(set GPR:$Rt, (arm_vmovrh HPR:$Sn))]>,
Requires<[HasFullFP16]>,
Sched<[WriteFPMOV]> {
// Instruction operands.
@@ -1180,9 +1222,9 @@ def VMOVRH : AVConv2I<0b11100001, 0b1001,
// Move R->H, clearing top 16 bits
def VMOVHR : AVConv4I<0b11100000, 0b1001,
- (outs SPR:$Sn), (ins GPR:$Rt),
+ (outs HPR:$Sn), (ins GPR:$Rt),
IIC_fpMOVIS, "vmov", ".f16\t$Sn, $Rt",
- []>,
+ [(set HPR:$Sn, (arm_vmovhr GPR:$Rt))]>,
Requires<[HasFullFP16]>,
Sched<[WriteFPMOV]> {
// Instruction operands.
@@ -1297,13 +1339,16 @@ def : VFPNoNEONPat<(f32 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))),
(VSITOS (VLDRS addrmode5:$a))>;
def VSITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001,
- (outs SPR:$Sd), (ins SPR:$Sm),
+ (outs HPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTIH, "vcvt", ".f16.s32\t$Sd, $Sm",
[]>,
Sched<[WriteFPCVT]> {
let Inst{7} = 1; // s32
}
+def : VFPNoNEONPat<(f16 (sint_to_fp GPR:$a)),
+ (VSITOH (COPY_TO_REGCLASS GPR:$a, SPR))>;
+
def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
(outs DPR:$Dd), (ins SPR:$Sm),
IIC_fpCVTID, "vcvt", ".f64.u32\t$Dd, $Sm",
@@ -1339,13 +1384,16 @@ def : VFPNoNEONPat<(f32 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))),
(VUITOS (VLDRS addrmode5:$a))>;
def VUITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001,
- (outs SPR:$Sd), (ins SPR:$Sm),
+ (outs HPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTIH, "vcvt", ".f16.u32\t$Sd, $Sm",
[]>,
Sched<[WriteFPCVT]> {
let Inst{7} = 0; // u32
}
+def : VFPNoNEONPat<(f16 (uint_to_fp GPR:$a)),
+ (VUITOH (COPY_TO_REGCLASS GPR:$a, SPR))>;
+
// FP -> Int:
class AVConv1IsD_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
@@ -1440,13 +1488,16 @@ def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_sint (f32 SPR:$a))),
(VSTRS (VTOSIZS SPR:$a), addrmode5:$ptr)>;
def VTOSIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001,
- (outs SPR:$Sd), (ins SPR:$Sm),
+ (outs SPR:$Sd), (ins HPR:$Sm),
IIC_fpCVTHI, "vcvt", ".s32.f16\t$Sd, $Sm",
[]>,
Sched<[WriteFPCVT]> {
let Inst{7} = 1; // Z bit
}
+def : VFPNoNEONPat<(i32 (fp_to_sint HPR:$a)),
+ (COPY_TO_REGCLASS (VTOSIZH HPR:$a), GPR)>;
+
def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
(outs SPR:$Sd), (ins DPR:$Dm),
IIC_fpCVTDI, "vcvt", ".u32.f64\t$Sd, $Dm",
@@ -1483,13 +1534,16 @@ def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_uint (f32 SPR:$a))),
(VSTRS (VTOUIZS SPR:$a), addrmode5:$ptr)>;
def VTOUIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001,
- (outs SPR:$Sd), (ins SPR:$Sm),
+ (outs SPR:$Sd), (ins HPR:$Sm),
IIC_fpCVTHI, "vcvt", ".u32.f16\t$Sd, $Sm",
[]>,
Sched<[WriteFPCVT]> {
let Inst{7} = 1; // Z bit
}
+def : VFPNoNEONPat<(i32 (fp_to_uint HPR:$a)),
+ (COPY_TO_REGCLASS (VTOUIZH HPR:$a), GPR)>;
+
// And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR.
let Uses = [FPSCR] in {
def VTOSIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011,
@@ -1773,9 +1827,10 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
}
def VMLAH : AHbI<0b11100, 0b00, 0, 0,
- (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpMAC16, "vmla", ".f16\t$Sd, $Sn, $Sm",
- []>,
+ [(set HPR:$Sd, (fadd_mlx (fmul_su HPR:$Sn, HPR:$Sm),
+ HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
@@ -1785,6 +1840,10 @@ def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
(VMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>;
+def : Pat<(fadd_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
+ (VMLAH HPR:$dstin, HPR:$a, HPR:$b)>,
+ Requires<[HasFullFP16,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>;
+
def VMLSD : ADbI<0b11100, 0b00, 1, 0,
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
@@ -1809,9 +1868,10 @@ def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
}
def VMLSH : AHbI<0b11100, 0b00, 1, 0,
- (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpMAC16, "vmls", ".f16\t$Sd, $Sn, $Sm",
- []>,
+ [(set HPR:$Sd, (fadd_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
+ HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
@@ -1821,6 +1881,9 @@ def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
(VMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+def : Pat<(fsub_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
+ (VMLSH HPR:$dstin, HPR:$a, HPR:$b)>,
+ Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
def VNMLAD : ADbI<0b11100, 0b01, 1, 0,
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
@@ -1845,9 +1908,10 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
}
def VNMLAH : AHbI<0b11100, 0b01, 1, 0,
- (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpMAC16, "vnmla", ".f16\t$Sd, $Sn, $Sm",
- []>,
+ [(set HPR:$Sd, (fsub_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
+ HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
@@ -1858,6 +1922,9 @@ def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
(VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+def : Pat<(fsub_mlx (fneg (fmul_su HPR:$a, HPR:$b)), HPR:$dstin),
+ (VNMLAH HPR:$dstin, HPR:$a, HPR:$b)>,
+ Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
// (-dst - (a * b)) -> -(dst + (a * b))
def : Pat<(fsub_mlx (fneg DPR:$dstin), (fmul_su DPR:$a, (f64 DPR:$b))),
@@ -1866,6 +1933,9 @@ def : Pat<(fsub_mlx (fneg DPR:$dstin), (fmul_su DPR:$a, (f64 DPR:$b))),
def : Pat<(fsub_mlx (fneg SPR:$dstin), (fmul_su SPR:$a, SPR:$b)),
(VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+def : Pat<(fsub_mlx (fneg HPR:$dstin), (fmul_su HPR:$a, HPR:$b)),
+ (VNMLAH HPR:$dstin, HPR:$a, HPR:$b)>,
+ Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
def VNMLSD : ADbI<0b11100, 0b01, 0, 0,
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
@@ -1889,9 +1959,9 @@ def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
}
def VNMLSH : AHbI<0b11100, 0b01, 0, 0,
- (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpMAC16, "vnmls", ".f16\t$Sd, $Sn, $Sm",
- []>,
+ [(set HPR:$Sd, (fsub_mlx (fmul_su HPR:$Sn, HPR:$Sm), HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
@@ -1901,6 +1971,9 @@ def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
(VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+def : Pat<(fsub_mlx (fmul_su HPR:$a, HPR:$b), HPR:$dstin),
+ (VNMLSH HPR:$dstin, HPR:$a, HPR:$b)>,
+ Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
//===----------------------------------------------------------------------===//
// Fused FP Multiply-Accumulate Operations.
@@ -1927,9 +2000,10 @@ def VFMAS : ASbIn<0b11101, 0b10, 0, 0,
}
def VFMAH : AHbI<0b11101, 0b10, 0, 0,
- (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpFMAC16, "vfma", ".f16\t$Sd, $Sn, $Sm",
- []>,
+ [(set HPR:$Sd, (fadd_mlx (fmul_su HPR:$Sn, HPR:$Sm),
+ HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFusedMAC]>,
Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
@@ -1940,6 +2014,9 @@ def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
(VFMAS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
+def : Pat<(fadd_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
+ (VFMAH HPR:$dstin, HPR:$a, HPR:$b)>,
+ Requires<[HasFullFP16,DontUseNEONForFP,UseFusedMAC]>;
// Match @llvm.fma.* intrinsics
// (fma x, y, z) -> (vfms z, x, y)
@@ -1972,9 +2049,10 @@ def VFMSS : ASbIn<0b11101, 0b10, 1, 0,
}
def VFMSH : AHbI<0b11101, 0b10, 1, 0,
- (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpFMAC16, "vfms", ".f16\t$Sd, $Sn, $Sm",
- []>,
+ [(set HPR:$Sd, (fadd_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
+ HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFusedMAC]>,
Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
@@ -1985,6 +2063,9 @@ def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
(VFMSS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
+def : Pat<(fsub_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
+ (VFMSH HPR:$dstin, HPR:$a, HPR:$b)>,
+ Requires<[HasFullFP16,DontUseNEONForFP,UseFusedMAC]>;
// Match @llvm.fma.* intrinsics
// (fma (fneg x), y, z) -> (vfms z, x, y)
@@ -2024,9 +2105,10 @@ def VFNMAS : ASbI<0b11101, 0b01, 1, 0,
}
def VFNMAH : AHbI<0b11101, 0b01, 1, 0,
- (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpFMAC16, "vfnma", ".f16\t$Sd, $Sn, $Sm",
- []>,
+ [(set HPR:$Sd, (fsub_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
+ HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFusedMAC]>,
Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
@@ -2075,9 +2157,9 @@ def VFNMSS : ASbI<0b11101, 0b01, 0, 0,
}
def VFNMSH : AHbI<0b11101, 0b01, 0, 0,
- (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpFMAC16, "vfnms", ".f16\t$Sd, $Sn, $Sm",
- []>,
+ [(set HPR:$Sd, (fsub_mlx (fmul_su HPR:$Sn, HPR:$Sm), HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFusedMAC]>,
Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
@@ -2269,10 +2351,11 @@ def FCONSTS : VFPAI<(outs SPR:$Sd), (ins vfp_f32imm:$imm),
let Inst{3-0} = imm{3-0};
}
-def FCONSTH : VFPAI<(outs SPR:$Sd), (ins vfp_f16imm:$imm),
+def FCONSTH : VFPAI<(outs HPR:$Sd), (ins vfp_f16imm:$imm),
VFPMiscFrm, IIC_fpUNA16,
"vmov", ".f16\t$Sd, $imm",
- []>, Requires<[HasFullFP16]> {
+ [(set HPR:$Sd, vfp_f16imm:$imm)]>,
+ Requires<[HasFullFP16]> {
bits<5> Sd;
bits<8> imm;
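The HPR patterns added above let half-precision arithmetic select directly to the f16 VFP instructions (vneg.f16, vmla.f16/vfma.f16, vmov.f16 and the f16 conversions) instead of going through single precision. A minimal source-level sketch, assuming Clang's __fp16 extension and a target with the FullFP16 feature enabled; which of vmla or vfma is chosen depends on the UseFPVMLx/UseFusedMAC predicates above, so treat this as illustrative only:

__fp16 negate(__fp16 x) { return -x; }          // can now select to vneg.f16
__fp16 mla(__fp16 acc, __fp16 a, __fp16 b) {    // vmla.f16 or vfma.f16,
  return acc + a * b;                           // depending on FP contraction
}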
diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp
index b0fd0b476920..6692a4d41420 100644
--- a/lib/Target/ARM/ARMInstructionSelector.cpp
+++ b/lib/Target/ARM/ARMInstructionSelector.cpp
@@ -117,39 +117,47 @@ ARMInstructionSelector::ARMInstructionSelector(const ARMBaseTargetMachine &TM,
{
}
-static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
- MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
- const RegisterBankInfo &RBI) {
- unsigned DstReg = I.getOperand(0).getReg();
- if (TargetRegisterInfo::isPhysicalRegister(DstReg))
- return true;
-
- const RegisterBank *RegBank = RBI.getRegBank(DstReg, MRI, TRI);
- (void)RegBank;
+static const TargetRegisterClass *guessRegClass(unsigned Reg,
+ MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI,
+ const RegisterBankInfo &RBI) {
+ const RegisterBank *RegBank = RBI.getRegBank(Reg, MRI, TRI);
assert(RegBank && "Can't get reg bank for virtual register");
- const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
+ const unsigned Size = MRI.getType(Reg).getSizeInBits();
assert((RegBank->getID() == ARM::GPRRegBankID ||
RegBank->getID() == ARM::FPRRegBankID) &&
"Unsupported reg bank");
- const TargetRegisterClass *RC = &ARM::GPRRegClass;
-
if (RegBank->getID() == ARM::FPRRegBankID) {
- if (DstSize == 32)
- RC = &ARM::SPRRegClass;
- else if (DstSize == 64)
- RC = &ARM::DPRRegClass;
+ if (Size == 32)
+ return &ARM::SPRRegClass;
+ else if (Size == 64)
+ return &ARM::DPRRegClass;
+ else if (Size == 128)
+ return &ARM::QPRRegClass;
else
llvm_unreachable("Unsupported destination size");
}
+ return &ARM::GPRRegClass;
+}
+
+static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
+ MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
+ const RegisterBankInfo &RBI) {
+ unsigned DstReg = I.getOperand(0).getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+ return true;
+
+ const TargetRegisterClass *RC = guessRegClass(DstReg, MRI, TRI, RBI);
+
// No need to constrain SrcReg. It will get constrained when
// we hit another of its uses or its defs.
// Copies do not have constraints.
if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) {
- DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
- << " operand\n");
+ LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+ << " operand\n");
return false;
}
return true;
@@ -393,12 +401,12 @@ bool ARMInstructionSelector::validReg(MachineRegisterInfo &MRI, unsigned Reg,
unsigned ExpectedSize,
unsigned ExpectedRegBankID) const {
if (MRI.getType(Reg).getSizeInBits() != ExpectedSize) {
- DEBUG(dbgs() << "Unexpected size for register");
+ LLVM_DEBUG(dbgs() << "Unexpected size for register");
return false;
}
if (RBI.getRegBank(Reg, MRI, TRI)->getID() != ExpectedRegBankID) {
- DEBUG(dbgs() << "Unexpected register bank for register");
+ LLVM_DEBUG(dbgs() << "Unexpected register bank for register");
return false;
}
@@ -490,13 +498,13 @@ bool ARMInstructionSelector::insertComparison(CmpConstants Helper, InsertInfo I,
bool ARMInstructionSelector::selectGlobal(MachineInstrBuilder &MIB,
MachineRegisterInfo &MRI) const {
if ((STI.isROPI() || STI.isRWPI()) && !STI.isTargetELF()) {
- DEBUG(dbgs() << "ROPI and RWPI only supported for ELF\n");
+ LLVM_DEBUG(dbgs() << "ROPI and RWPI only supported for ELF\n");
return false;
}
auto GV = MIB->getOperand(1).getGlobal();
if (GV->isThreadLocal()) {
- DEBUG(dbgs() << "TLS variables not supported yet\n");
+ LLVM_DEBUG(dbgs() << "TLS variables not supported yet\n");
return false;
}
@@ -505,7 +513,7 @@ bool ARMInstructionSelector::selectGlobal(MachineInstrBuilder &MIB,
bool UseMovt = STI.useMovt(MF);
- unsigned Size = TM.getPointerSize();
+ unsigned Size = TM.getPointerSize(0);
unsigned Alignment = 4;
auto addOpsForConstantPoolLoad = [&MF, Alignment,
@@ -548,7 +556,7 @@ bool ARMInstructionSelector::selectGlobal(MachineInstrBuilder &MIB,
if (Indirect)
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getGOT(MF), MachineMemOperand::MOLoad,
- TM.getPointerSize(), Alignment));
+ TM.getProgramPointerSize(), Alignment));
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}
@@ -601,7 +609,7 @@ bool ARMInstructionSelector::selectGlobal(MachineInstrBuilder &MIB,
else
MIB->setDesc(TII.get(ARM::LDRLIT_ga_abs));
} else {
- DEBUG(dbgs() << "Object format not supported yet\n");
+ LLVM_DEBUG(dbgs() << "Object format not supported yet\n");
return false;
}
@@ -670,14 +678,6 @@ bool ARMInstructionSelector::select(MachineInstr &I,
}
using namespace TargetOpcode;
- if (I.getOpcode() == G_CONSTANT) {
- // Pointer constants should be treated the same as 32-bit integer constants.
- // Change the type and let TableGen handle it.
- unsigned ResultReg = I.getOperand(0).getReg();
- LLT Ty = MRI.getType(ResultReg);
- if (Ty.isPointer())
- MRI.setType(ResultReg, LLT::scalar(32));
- }
if (selectImpl(I, CoverageInfo))
return true;
@@ -693,7 +693,7 @@ bool ARMInstructionSelector::select(MachineInstr &I,
LLT DstTy = MRI.getType(I.getOperand(0).getReg());
// FIXME: Smaller destination sizes coming soon!
if (DstTy.getSizeInBits() != 32) {
- DEBUG(dbgs() << "Unsupported destination size for extension");
+ LLVM_DEBUG(dbgs() << "Unsupported destination size for extension");
return false;
}
@@ -735,7 +735,7 @@ bool ARMInstructionSelector::select(MachineInstr &I,
break;
}
default:
- DEBUG(dbgs() << "Unsupported source size for extension");
+ LLVM_DEBUG(dbgs() << "Unsupported source size for extension");
return false;
}
break;
@@ -776,18 +776,45 @@ bool ARMInstructionSelector::select(MachineInstr &I,
}
if (SrcRegBank.getID() != DstRegBank.getID()) {
- DEBUG(dbgs() << "G_TRUNC/G_ANYEXT operands on different register banks\n");
+ LLVM_DEBUG(
+ dbgs() << "G_TRUNC/G_ANYEXT operands on different register banks\n");
return false;
}
if (SrcRegBank.getID() != ARM::GPRRegBankID) {
- DEBUG(dbgs() << "G_TRUNC/G_ANYEXT on non-GPR not supported yet\n");
+ LLVM_DEBUG(dbgs() << "G_TRUNC/G_ANYEXT on non-GPR not supported yet\n");
return false;
}
I.setDesc(TII.get(COPY));
return selectCopy(I, TII, MRI, TRI, RBI);
}
+ case G_CONSTANT: {
+ if (!MRI.getType(I.getOperand(0).getReg()).isPointer()) {
+ // Non-pointer constants should be handled by TableGen.
+ LLVM_DEBUG(dbgs() << "Unsupported constant type\n");
+ return false;
+ }
+
+ auto &Val = I.getOperand(1);
+ if (Val.isCImm()) {
+ if (!Val.getCImm()->isZero()) {
+ LLVM_DEBUG(dbgs() << "Unsupported pointer constant value\n");
+ return false;
+ }
+ Val.ChangeToImmediate(0);
+ } else {
+ assert(Val.isImm() && "Unexpected operand for G_CONSTANT");
+ if (Val.getImm() != 0) {
+ LLVM_DEBUG(dbgs() << "Unsupported pointer constant value\n");
+ return false;
+ }
+ }
+
+ I.setDesc(TII.get(ARM::MOVi));
+ MIB.add(predOps(ARMCC::AL)).add(condCodeOp());
+ break;
+ }
case G_INTTOPTR:
case G_PTRTOINT: {
auto SrcReg = I.getOperand(1).getReg();
@@ -797,13 +824,15 @@ bool ARMInstructionSelector::select(MachineInstr &I,
const auto &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
if (SrcRegBank.getID() != DstRegBank.getID()) {
- DEBUG(dbgs()
- << "G_INTTOPTR/G_PTRTOINT operands on different register banks\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "G_INTTOPTR/G_PTRTOINT operands on different register banks\n");
return false;
}
if (SrcRegBank.getID() != ARM::GPRRegBankID) {
- DEBUG(dbgs() << "G_INTTOPTR/G_PTRTOINT on non-GPR not supported yet\n");
+ LLVM_DEBUG(
+ dbgs() << "G_INTTOPTR/G_PTRTOINT on non-GPR not supported yet\n");
return false;
}
@@ -824,11 +853,11 @@ bool ARMInstructionSelector::select(MachineInstr &I,
unsigned Size = MRI.getType(OpReg).getSizeInBits();
if (Size == 64 && STI.isFPOnlySP()) {
- DEBUG(dbgs() << "Subtarget only supports single precision");
+ LLVM_DEBUG(dbgs() << "Subtarget only supports single precision");
return false;
}
if (Size != 32 && Size != 64) {
- DEBUG(dbgs() << "Unsupported size for G_FCMP operand");
+ LLVM_DEBUG(dbgs() << "Unsupported size for G_FCMP operand");
return false;
}
@@ -859,7 +888,7 @@ bool ARMInstructionSelector::select(MachineInstr &I,
case G_LOAD: {
const auto &MemOp = **I.memoperands_begin();
if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) {
- DEBUG(dbgs() << "Atomic load/store not supported yet\n");
+ LLVM_DEBUG(dbgs() << "Atomic load/store not supported yet\n");
return false;
}
@@ -896,7 +925,7 @@ bool ARMInstructionSelector::select(MachineInstr &I,
}
case G_BRCOND: {
if (!validReg(MRI, I.getOperand(0).getReg(), 1, ARM::GPRRegBankID)) {
- DEBUG(dbgs() << "Unsupported condition register for G_BRCOND");
+ LLVM_DEBUG(dbgs() << "Unsupported condition register for G_BRCOND");
return false;
}
@@ -917,6 +946,17 @@ bool ARMInstructionSelector::select(MachineInstr &I,
I.eraseFromParent();
return true;
}
+ case G_PHI: {
+ I.setDesc(TII.get(PHI));
+
+ unsigned DstReg = I.getOperand(0).getReg();
+ const TargetRegisterClass *RC = guessRegClass(DstReg, MRI, TRI, RBI);
+ if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) {
+ break;
+ }
+
+ return true;
+ }
default:
return false;
}
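With the G_CONSTANT change above, pointer constants are no longer retyped to s32 before selection: a null pointer constant is selected directly to a mov of #0 (ARM::MOVi), and any non-zero pointer constant is rejected. A hypothetical source-level illustration (the function name is made up):

// With the selector change above, the null pointer below stays pointer-typed
// through GlobalISel and is materialized as "mov r0, #0".
int *returnNull() { return nullptr; }

The new guessRegClass() helper also gains a 128-bit (QPR) case and is reused to constrain G_PHI results to a concrete register class.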
diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp
index 8cff1f0869d0..891418306903 100644
--- a/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -23,6 +23,7 @@
#include "llvm/IR/Type.h"
using namespace llvm;
+using namespace LegalizeActions;
/// FIXME: The following static functions are SizeChangeStrategy functions
/// that are meant to temporarily mimic the behaviour of the old legalization
@@ -40,7 +41,7 @@ addAndInterleaveWithUnsupported(LegalizerInfo::SizeAndActionsVec &result,
result.push_back(v[i]);
if (i + 1 < v[i].first && i + 1 < v.size() &&
v[i + 1].first != v[i].first + 1)
- result.push_back({v[i].first + 1, LegalizerInfo::Unsupported});
+ result.push_back({v[i].first + 1, Unsupported});
}
}
@@ -48,27 +49,14 @@ static LegalizerInfo::SizeAndActionsVec
widen_8_16(const LegalizerInfo::SizeAndActionsVec &v) {
assert(v.size() >= 1);
assert(v[0].first > 17);
- LegalizerInfo::SizeAndActionsVec result = {
- {1, LegalizerInfo::Unsupported},
- {8, LegalizerInfo::WidenScalar}, {9, LegalizerInfo::Unsupported},
- {16, LegalizerInfo::WidenScalar}, {17, LegalizerInfo::Unsupported}};
+ LegalizerInfo::SizeAndActionsVec result = {{1, Unsupported},
+ {8, WidenScalar},
+ {9, Unsupported},
+ {16, WidenScalar},
+ {17, Unsupported}};
addAndInterleaveWithUnsupported(result, v);
auto Largest = result.back().first;
- result.push_back({Largest + 1, LegalizerInfo::Unsupported});
- return result;
-}
-
-static LegalizerInfo::SizeAndActionsVec
-widen_1_8_16(const LegalizerInfo::SizeAndActionsVec &v) {
- assert(v.size() >= 1);
- assert(v[0].first > 17);
- LegalizerInfo::SizeAndActionsVec result = {
- {1, LegalizerInfo::WidenScalar}, {2, LegalizerInfo::Unsupported},
- {8, LegalizerInfo::WidenScalar}, {9, LegalizerInfo::Unsupported},
- {16, LegalizerInfo::WidenScalar}, {17, LegalizerInfo::Unsupported}};
- addAndInterleaveWithUnsupported(result, v);
- auto Largest = result.back().first;
- result.push_back({Largest + 1, LegalizerInfo::Unsupported});
+ result.push_back({Largest + 1, Unsupported});
return result;
}
@@ -87,30 +75,21 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
const LLT s32 = LLT::scalar(32);
const LLT s64 = LLT::scalar(64);
- setAction({G_GLOBAL_VALUE, p0}, Legal);
- setAction({G_FRAME_INDEX, p0}, Legal);
+ getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});
+ getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});
- for (unsigned Op : {G_LOAD, G_STORE}) {
- for (auto Ty : {s1, s8, s16, s32, p0})
- setAction({Op, Ty}, Legal);
- setAction({Op, 1, p0}, Legal);
- }
-
- for (unsigned Op : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR}) {
- if (Op != G_ADD)
- setLegalizeScalarToDifferentSizeStrategy(
- Op, 0, widenToLargerTypesUnsupportedOtherwise);
- setAction({Op, s32}, Legal);
- }
+ getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
+ .legalFor({s32})
+ .minScalar(0, s32);
- for (unsigned Op : {G_SDIV, G_UDIV}) {
- setLegalizeScalarToDifferentSizeStrategy(Op, 0,
- widenToLargerTypesUnsupportedOtherwise);
- if (ST.hasDivideInARMMode())
- setAction({Op, s32}, Legal);
- else
- setAction({Op, s32}, Libcall);
- }
+ if (ST.hasDivideInARMMode())
+ getActionDefinitionsBuilder({G_SDIV, G_UDIV})
+ .legalFor({s32})
+ .clampScalar(0, s32, s32);
+ else
+ getActionDefinitionsBuilder({G_SDIV, G_UDIV})
+ .libcallFor({s32})
+ .clampScalar(0, s32, s32);
for (unsigned Op : {G_SREM, G_UREM}) {
setLegalizeScalarToDifferentSizeStrategy(Op, 0, widen_8_16);
@@ -122,74 +101,96 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
setAction({Op, s32}, Libcall);
}
- for (unsigned Op : {G_SEXT, G_ZEXT, G_ANYEXT}) {
- setAction({Op, s32}, Legal);
- }
+ getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
+ .legalForCartesianProduct({s32}, {s1, s8, s16});
+
+ getActionDefinitionsBuilder(G_INTTOPTR).legalFor({{p0, s32}});
+ getActionDefinitionsBuilder(G_PTRTOINT).legalFor({{s32, p0}});
- setAction({G_INTTOPTR, p0}, Legal);
- setAction({G_INTTOPTR, 1, s32}, Legal);
+ getActionDefinitionsBuilder({G_ASHR, G_LSHR, G_SHL}).legalFor({s32});
- setAction({G_PTRTOINT, s32}, Legal);
- setAction({G_PTRTOINT, 1, p0}, Legal);
+ getActionDefinitionsBuilder(G_GEP).legalFor({{p0, s32}});
- for (unsigned Op : {G_ASHR, G_LSHR, G_SHL})
- setAction({Op, s32}, Legal);
+ getActionDefinitionsBuilder(G_SELECT).legalForCartesianProduct({s32, p0},
+ {s1});
- setAction({G_GEP, p0}, Legal);
- setAction({G_GEP, 1, s32}, Legal);
+ getActionDefinitionsBuilder(G_BRCOND).legalFor({s1});
- setAction({G_SELECT, s32}, Legal);
- setAction({G_SELECT, p0}, Legal);
- setAction({G_SELECT, 1, s1}, Legal);
+ getActionDefinitionsBuilder(G_CONSTANT)
+ .legalFor({s32, p0})
+ .clampScalar(0, s32, s32);
- setAction({G_BRCOND, s1}, Legal);
+ getActionDefinitionsBuilder(G_ICMP)
+ .legalForCartesianProduct({s1}, {s32, p0})
+ .minScalar(1, s32);
- setAction({G_CONSTANT, s32}, Legal);
- setAction({G_CONSTANT, p0}, Legal);
- setLegalizeScalarToDifferentSizeStrategy(G_CONSTANT, 0, widen_1_8_16);
+ // We're keeping these builders around because we'll want to add support for
+ // floating point to them.
+ auto &LoadStoreBuilder =
+ getActionDefinitionsBuilder({G_LOAD, G_STORE})
+ .legalForCartesianProduct({s1, s8, s16, s32, p0}, {p0});
- setAction({G_ICMP, s1}, Legal);
- setLegalizeScalarToDifferentSizeStrategy(G_ICMP, 1,
- widenToLargerTypesUnsupportedOtherwise);
- for (auto Ty : {s32, p0})
- setAction({G_ICMP, 1, Ty}, Legal);
+ auto &PhiBuilder =
+ getActionDefinitionsBuilder(G_PHI).legalFor({s32, p0}).minScalar(0, s32);
if (!ST.useSoftFloat() && ST.hasVFP2()) {
- for (unsigned BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV})
- for (auto Ty : {s32, s64})
- setAction({BinOp, Ty}, Legal);
+ getActionDefinitionsBuilder(
+ {G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FCONSTANT, G_FNEG})
+ .legalFor({s32, s64});
+
+ LoadStoreBuilder.legalFor({{s64, p0}});
+ PhiBuilder.legalFor({s64});
+
+ getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct({s1},
+ {s32, s64});
- setAction({G_LOAD, s64}, Legal);
- setAction({G_STORE, s64}, Legal);
+ getActionDefinitionsBuilder(G_MERGE_VALUES).legalFor({{s64, s32}});
+ getActionDefinitionsBuilder(G_UNMERGE_VALUES).legalFor({{s32, s64}});
- setAction({G_FCMP, s1}, Legal);
- setAction({G_FCMP, 1, s32}, Legal);
- setAction({G_FCMP, 1, s64}, Legal);
+ getActionDefinitionsBuilder(G_FPEXT).legalFor({{s64, s32}});
+ getActionDefinitionsBuilder(G_FPTRUNC).legalFor({{s32, s64}});
- setAction({G_MERGE_VALUES, s64}, Legal);
- setAction({G_MERGE_VALUES, 1, s32}, Legal);
- setAction({G_UNMERGE_VALUES, s32}, Legal);
- setAction({G_UNMERGE_VALUES, 1, s64}, Legal);
+ getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
+ .legalForCartesianProduct({s32}, {s32, s64});
+ getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
+ .legalForCartesianProduct({s32, s64}, {s32});
} else {
- for (unsigned BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV})
- for (auto Ty : {s32, s64})
- setAction({BinOp, Ty}, Libcall);
+ getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV})
+ .libcallFor({s32, s64});
+
+ LoadStoreBuilder.maxScalar(0, s32);
+
+ for (auto Ty : {s32, s64})
+ setAction({G_FNEG, Ty}, Lower);
- setAction({G_FCMP, s1}, Legal);
- setAction({G_FCMP, 1, s32}, Custom);
- setAction({G_FCMP, 1, s64}, Custom);
+ getActionDefinitionsBuilder(G_FCONSTANT).customFor({s32, s64});
+
+ getActionDefinitionsBuilder(G_FCMP).customForCartesianProduct({s1},
+ {s32, s64});
if (AEABI(ST))
setFCmpLibcallsAEABI();
else
setFCmpLibcallsGNU();
+
+ getActionDefinitionsBuilder(G_FPEXT).libcallFor({{s64, s32}});
+ getActionDefinitionsBuilder(G_FPTRUNC).libcallFor({{s32, s64}});
+
+ getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
+ .libcallForCartesianProduct({s32}, {s32, s64});
+ getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
+ .libcallForCartesianProduct({s32, s64}, {s32});
}
- for (unsigned Op : {G_FREM, G_FPOW})
- for (auto Ty : {s32, s64})
- setAction({Op, Ty}, Libcall);
+ if (!ST.useSoftFloat() && ST.hasVFP4())
+ getActionDefinitionsBuilder(G_FMA).legalFor({s32, s64});
+ else
+ getActionDefinitionsBuilder(G_FMA).libcallFor({s32, s64});
+
+ getActionDefinitionsBuilder({G_FREM, G_FPOW}).libcallFor({s32, s64});
computeTables();
+ verify(*ST.getInstrInfo());
}
void ARMLegalizerInfo::setFCmpLibcallsAEABI() {
@@ -305,6 +306,7 @@ bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI,
using namespace TargetOpcode;
MIRBuilder.setInstr(MI);
+ LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
switch (MI.getOpcode()) {
default:
@@ -321,7 +323,6 @@ bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI,
// Our divmod libcalls return a struct containing the quotient and the
// remainder. We need to create a virtual register for it.
- auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
Type *ArgTy = Type::getInt32Ty(Ctx);
StructType *RetTy = StructType::get(Ctx, {ArgTy, ArgTy}, /* Packed */ true);
auto RetVal = MRI.createGenericVirtualRegister(
@@ -362,7 +363,6 @@ bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI,
return true;
}
- auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
assert((OpSize == 32 || OpSize == 64) && "Unsupported operand size");
auto *ArgTy = OpSize == 32 ? Type::getFloatTy(Ctx) : Type::getDoubleTy(Ctx);
auto *RetTy = Type::getInt32Ty(Ctx);
@@ -407,6 +407,14 @@ bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI,
}
break;
}
+ case G_FCONSTANT: {
+ // Convert to integer constants, while preserving the binary representation.
+ auto AsInteger =
+ MI.getOperand(1).getFPImm()->getValueAPF().bitcastToAPInt();
+ MIRBuilder.buildConstant(MI.getOperand(0).getReg(),
+ *ConstantInt::get(Ctx, AsInteger));
+ break;
+ }
}
MI.eraseFromParent();
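The new G_FCONSTANT case above legalizes soft-float FP constants by rebuilding them as integer constants with the same bit pattern (via APFloat::bitcastToAPInt). A standalone sketch of that bit-preserving conversion in plain C++, not LLVM API, assuming IEEE-754 single precision:

#include <cstdint>
#include <cstring>

// Reinterpret a float's bits as a 32-bit integer, mirroring what the
// G_FCONSTANT legalization does with bitcastToAPInt().
static uint32_t bitsOf(float F) {
  uint32_t Bits;
  static_assert(sizeof(Bits) == sizeof(F), "size mismatch");
  std::memcpy(&Bits, &F, sizeof(Bits));
  return Bits; // e.g. bitsOf(1.0f) == 0x3F800000
}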
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 8b3a2e223796..901138dbdfd5 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -1198,7 +1198,7 @@ findIncDecBefore(MachineBasicBlock::iterator MBBI, unsigned Reg,
// Skip debug values.
MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
- while (PrevMBBI->isDebugValue() && PrevMBBI != BeginMBBI)
+ while (PrevMBBI->isDebugInstr() && PrevMBBI != BeginMBBI)
--PrevMBBI;
Offset = isIncrementOrDecrement(*PrevMBBI, Reg, Pred, PredReg);
@@ -1214,7 +1214,7 @@ findIncDecAfter(MachineBasicBlock::iterator MBBI, unsigned Reg,
MachineBasicBlock::iterator EndMBBI = MBB.end();
MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
// Skip debug values.
- while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
+ while (NextMBBI != EndMBBI && NextMBBI->isDebugInstr())
++NextMBBI;
if (NextMBBI == EndMBBI)
return EndMBBI;
@@ -1807,7 +1807,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
MBBI = I;
--Position;
// Fallthrough to look into existing chain.
- } else if (MBBI->isDebugValue()) {
+ } else if (MBBI->isDebugInstr()) {
continue;
} else if (MBBI->getOpcode() == ARM::t2LDRDi8 ||
MBBI->getOpcode() == ARM::t2STRDi8) {
@@ -1834,7 +1834,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
auto LessThan = [](const MergeCandidate* M0, const MergeCandidate *M1) {
return M0->InsertPos < M1->InsertPos;
};
- std::sort(Candidates.begin(), Candidates.end(), LessThan);
+ llvm::sort(Candidates.begin(), Candidates.end(), LessThan);
// Go through list of candidates and merge.
bool Changed = false;
@@ -1891,8 +1891,8 @@ bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
MBBI->getOpcode() == ARM::tBX_RET ||
MBBI->getOpcode() == ARM::MOVPCLR)) {
MachineBasicBlock::iterator PrevI = std::prev(MBBI);
- // Ignore any DBG_VALUE instructions.
- while (PrevI->isDebugValue() && PrevI != MBB.begin())
+ // Ignore any debug instructions.
+ while (PrevI->isDebugInstr() && PrevI != MBB.begin())
--PrevI;
MachineInstr &PrevMI = *PrevI;
unsigned Opcode = PrevMI.getOpcode();
@@ -2063,7 +2063,7 @@ static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
// Are there stores / loads / calls between them?
SmallSet<unsigned, 4> AddedRegPressure;
while (++I != E) {
- if (I->isDebugValue() || MemOps.count(&*I))
+ if (I->isDebugInstr() || MemOps.count(&*I))
continue;
if (I->isCall() || I->isTerminator() || I->hasUnmodeledSideEffects())
return false;
@@ -2172,13 +2172,13 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
bool RetVal = false;
// Sort by offset (in reverse order).
- std::sort(Ops.begin(), Ops.end(),
- [](const MachineInstr *LHS, const MachineInstr *RHS) {
- int LOffset = getMemoryOpOffset(*LHS);
- int ROffset = getMemoryOpOffset(*RHS);
- assert(LHS == RHS || LOffset != ROffset);
- return LOffset > ROffset;
- });
+ llvm::sort(Ops.begin(), Ops.end(),
+ [](const MachineInstr *LHS, const MachineInstr *RHS) {
+ int LOffset = getMemoryOpOffset(*LHS);
+ int ROffset = getMemoryOpOffset(*RHS);
+ assert(LHS == RHS || LOffset != ROffset);
+ return LOffset > ROffset;
+ });
// The loads / stores of the same base are in order. Scan them from first to
// last and check for the following:
@@ -2253,7 +2253,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
// This is the new location for the loads / stores.
MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp;
while (InsertPos != MBB->end() &&
- (MemOps.count(&*InsertPos) || InsertPos->isDebugValue()))
+ (MemOps.count(&*InsertPos) || InsertPos->isDebugInstr()))
++InsertPos;
// If we are moving a pair of loads / stores, see if it makes sense
@@ -2291,7 +2291,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
MIB.addReg(0);
MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
MIB.setMemRefs(Op0->mergeMemRefsWith(*Op1));
- DEBUG(dbgs() << "Formed " << *MIB << "\n");
+ LLVM_DEBUG(dbgs() << "Formed " << *MIB << "\n");
++NumLDRDFormed;
} else {
MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
@@ -2305,7 +2305,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
MIB.addReg(0);
MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
MIB.setMemRefs(Op0->mergeMemRefsWith(*Op1));
- DEBUG(dbgs() << "Formed " << *MIB << "\n");
+ LLVM_DEBUG(dbgs() << "Formed " << *MIB << "\n");
++NumSTRDFormed;
}
MBB->erase(Op0);
@@ -2355,7 +2355,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
break;
}
- if (!MI.isDebugValue())
+ if (!MI.isDebugInstr())
MI2LocMap[&MI] = ++Loc;
if (!isMemoryOp(MI))
diff --git a/lib/Target/ARM/ARMMacroFusion.cpp b/lib/Target/ARM/ARMMacroFusion.cpp
index 5c9aad417ceb..d11fe9d5c502 100644
--- a/lib/Target/ARM/ARMMacroFusion.cpp
+++ b/lib/Target/ARM/ARMMacroFusion.cpp
@@ -19,7 +19,48 @@
namespace llvm {
-/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused
+// Fuse AES crypto encoding or decoding.
+static bool isAESPair(const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ // Assume the 1st instr to be a wildcard if it is unspecified.
+ unsigned FirstOpcode =
+ FirstMI ? FirstMI->getOpcode()
+ : static_cast<unsigned>(ARM::INSTRUCTION_LIST_END);
+ unsigned SecondOpcode = SecondMI.getOpcode();
+
+ switch(SecondOpcode) {
+ // AES encode.
+ case ARM::AESMC :
+ return FirstOpcode == ARM::AESE ||
+ FirstOpcode == ARM::INSTRUCTION_LIST_END;
+ // AES decode.
+ case ARM::AESIMC:
+ return FirstOpcode == ARM::AESD ||
+ FirstOpcode == ARM::INSTRUCTION_LIST_END;
+ }
+
+ return false;
+}
+
+// Fuse literal generation.
+static bool isLiteralsPair(const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ // Assume the 1st instr to be a wildcard if it is unspecified.
+ unsigned FirstOpcode =
+ FirstMI ? FirstMI->getOpcode()
+ : static_cast<unsigned>(ARM::INSTRUCTION_LIST_END);
+ unsigned SecondOpcode = SecondMI.getOpcode();
+
+ // 32 bit immediate.
+ if ((FirstOpcode == ARM::INSTRUCTION_LIST_END ||
+ FirstOpcode == ARM::MOVi16) &&
+ SecondOpcode == ARM::MOVTi16)
+ return true;
+
+ return false;
+}
+
+/// Check if the instr pair, FirstMI and SecondMI, should be fused
/// together. Given SecondMI, when FirstMI is unspecified, then check if
/// SecondMI may be part of a fused pair at all.
static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
@@ -28,24 +69,10 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
const MachineInstr &SecondMI) {
const ARMSubtarget &ST = static_cast<const ARMSubtarget&>(TSI);
- // Assume wildcards for unspecified instrs.
- unsigned FirstOpcode =
- FirstMI ? FirstMI->getOpcode()
- : static_cast<unsigned>(ARM::INSTRUCTION_LIST_END);
- unsigned SecondOpcode = SecondMI.getOpcode();
-
- if (ST.hasFuseAES())
- // Fuse AES crypto operations.
- switch(SecondOpcode) {
- // AES encode.
- case ARM::AESMC :
- return FirstOpcode == ARM::AESE ||
- FirstOpcode == ARM::INSTRUCTION_LIST_END;
- // AES decode.
- case ARM::AESIMC:
- return FirstOpcode == ARM::AESD ||
- FirstOpcode == ARM::INSTRUCTION_LIST_END;
- }
+ if (ST.hasFuseAES() && isAESPair(FirstMI, SecondMI))
+ return true;
+ if (ST.hasFuseLiterals() && isLiteralsPair(FirstMI, SecondMI))
+ return true;
return false;
}
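The new isLiteralsPair() predicate fuses the MOVW/MOVT pair (MOVi16 followed by MOVTi16) used to materialize a 32-bit immediate, so the scheduler keeps the two halves adjacent. A small reference sketch of how such an immediate splits across the pair (plain C++, illustrative only):

#include <cstdint>
#include <utility>

// A 32-bit immediate is built as MOVW (writes the low halfword, zeroing the
// rest of the register) followed by MOVT (writes the high halfword).
static std::pair<uint16_t, uint16_t> splitImm32(uint32_t Imm) {
  uint16_t Lo = static_cast<uint16_t>(Imm & 0xFFFFu); // MOVW operand
  uint16_t Hi = static_cast<uint16_t>(Imm >> 16);     // MOVT operand
  return {Lo, Hi};
}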
diff --git a/lib/Target/ARM/ARMParallelDSP.cpp b/lib/Target/ARM/ARMParallelDSP.cpp
new file mode 100644
index 000000000000..9d5478b76c18
--- /dev/null
+++ b/lib/Target/ARM/ARMParallelDSP.cpp
@@ -0,0 +1,672 @@
+//===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Armv6 introduced instructions to perform 32-bit SIMD operations. The
+/// purpose of this pass is to do some IR pattern matching to create ACLE
+/// DSP intrinsics, which map onto these 32-bit SIMD operations.
+/// This pass runs only when unaligned accesses are supported/enabled.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/NoFolder.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "ARM.h"
+#include "ARMSubtarget.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "arm-parallel-dsp"
+
+STATISTIC(NumSMLAD , "Number of smlad instructions generated");
+
+namespace {
+ struct OpChain;
+ struct BinOpChain;
+ struct Reduction;
+
+ using OpChainList = SmallVector<std::unique_ptr<OpChain>, 8>;
+ using ReductionList = SmallVector<Reduction, 8>;
+ using ValueList = SmallVector<Value*, 8>;
+ using MemInstList = SmallVector<Instruction*, 8>;
+ using PMACPair = std::pair<BinOpChain*,BinOpChain*>;
+ using PMACPairList = SmallVector<PMACPair, 8>;
+ using Instructions = SmallVector<Instruction*,16>;
+ using MemLocList = SmallVector<MemoryLocation, 4>;
+
+ struct OpChain {
+ Instruction *Root;
+ ValueList AllValues;
+ MemInstList VecLd; // List of all load instructions.
+ MemLocList MemLocs; // All memory locations read by this tree.
+ bool ReadOnly = true;
+
+ OpChain(Instruction *I, ValueList &vl) : Root(I), AllValues(vl) { }
+ virtual ~OpChain() = default;
+
+ void SetMemoryLocations() {
+ const auto Size = MemoryLocation::UnknownSize;
+ for (auto *V : AllValues) {
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ if (I->mayWriteToMemory())
+ ReadOnly = false;
+ if (auto *Ld = dyn_cast<LoadInst>(V))
+ MemLocs.push_back(MemoryLocation(Ld->getPointerOperand(), Size));
+ }
+ }
+ }
+
+ unsigned size() const { return AllValues.size(); }
+ };
+
+ // 'BinOpChain' and 'Reduction' are just some bookkeeping data structures.
+ // 'Reduction' contains the phi-node and accumulator statement from where we
+ // start pattern matching, and 'BinOpChain' the multiplication
+ // instructions that are candidates for parallel execution.
+ struct BinOpChain : public OpChain {
+ ValueList LHS; // List of all (narrow) left hand operands.
+ ValueList RHS; // List of all (narrow) right hand operands.
+
+ BinOpChain(Instruction *I, ValueList &lhs, ValueList &rhs) :
+ OpChain(I, lhs), LHS(lhs), RHS(rhs) {
+ for (auto *V : RHS)
+ AllValues.push_back(V);
+ }
+ };
+
+ struct Reduction {
+ PHINode *Phi; // The Phi-node from where we start
+ // pattern matching.
+ Instruction *AccIntAdd; // The accumulating integer add statement,
+ // i.e, the reduction statement.
+
+ OpChainList MACCandidates; // The MAC candidates associated with
+ // this reduction statement.
+ Reduction (PHINode *P, Instruction *Acc) : Phi(P), AccIntAdd(Acc) { };
+ };
+
+ class ARMParallelDSP : public LoopPass {
+ ScalarEvolution *SE;
+ AliasAnalysis *AA;
+ TargetLibraryInfo *TLI;
+ DominatorTree *DT;
+ LoopInfo *LI;
+ Loop *L;
+ const DataLayout *DL;
+ Module *M;
+
+ bool InsertParallelMACs(Reduction &Reduction, PMACPairList &PMACPairs);
+ bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
+ PMACPairList CreateParallelMACPairs(OpChainList &Candidates);
+ Instruction *CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
+ Instruction *Acc, Instruction *InsertAfter);
+
+ /// Try to match and generate: SMLAD, SMLADX - Signed Multiply Accumulate
+ /// Dual performs two signed 16x16-bit multiplications. It adds the
+ /// products to a 32-bit accumulate operand. Optionally, the instruction can
+ /// exchange the halfwords of the second operand before performing the
+ /// arithmetic.
+ bool MatchSMLAD(Function &F);
+
+ public:
+ static char ID;
+
+ ARMParallelDSP() : LoopPass(ID) { }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ LoopPass::getAnalysisUsage(AU);
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetPassConfig>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.setPreservesCFG();
+ }
+
+ bool runOnLoop(Loop *TheLoop, LPPassManager &) override {
+ L = TheLoop;
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &TPC = getAnalysis<TargetPassConfig>();
+
+ BasicBlock *Header = TheLoop->getHeader();
+ if (!Header)
+ return false;
+
+ // TODO: We assume the loop header and latch to be the same block.
+ // This is not a fundamental restriction, but lifting this would just
+ // require more work to do the transformation and then patch up the CFG.
+ if (Header != TheLoop->getLoopLatch()) {
+ LLVM_DEBUG(dbgs() << "The loop header is not the loop latch: not "
+ "running pass ARMParallelDSP\n");
+ return false;
+ }
+
+ Function &F = *Header->getParent();
+ M = F.getParent();
+ DL = &M->getDataLayout();
+
+ auto &TM = TPC.getTM<TargetMachine>();
+ auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
+
+ if (!ST->allowsUnalignedMem()) {
+ LLVM_DEBUG(dbgs() << "Unaligned memory access not supported: not "
+ "running pass ARMParallelDSP\n");
+ return false;
+ }
+
+ if (!ST->hasDSP()) {
+ LLVM_DEBUG(dbgs() << "DSP extension not enabled: not running pass "
+ "ARMParallelDSP\n");
+ return false;
+ }
+
+ LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI);
+ bool Changes = false;
+
+ LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n\n");
+ Changes = MatchSMLAD(F);
+ return Changes;
+ }
+ };
+}
+
+// MaxBitwidth: the maximum supported bitwidth of the elements in the DSP
+// instructions, which is set to 16. So here we should collect all i8 and i16
+// narrow operations.
+// TODO: we currently only collect i16, and will support i8 later, so that's
+// why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth.
+template<unsigned MaxBitWidth>
+static bool IsNarrowSequence(Value *V, ValueList &VL) {
+ LLVM_DEBUG(dbgs() << "Is narrow sequence? "; V->dump());
+ ConstantInt *CInt;
+
+ if (match(V, m_ConstantInt(CInt))) {
+ // TODO: if a constant is used, it needs to fit within the bit width.
+ return false;
+ }
+
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+
+ Value *Val, *LHS, *RHS;
+ if (match(V, m_Trunc(m_Value(Val)))) {
+ if (cast<TruncInst>(I)->getDestTy()->getIntegerBitWidth() == MaxBitWidth)
+ return IsNarrowSequence<MaxBitWidth>(Val, VL);
+ } else if (match(V, m_Add(m_Value(LHS), m_Value(RHS)))) {
+ // TODO: we need to implement sadd16/sadd8 for this, which would enable us to
+ // also do the rewrite for smlad8.ll, but it is unsupported for now.
+ LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump());
+ return false;
+ } else if (match(V, m_ZExtOrSExt(m_Value(Val)))) {
+ if (cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() != MaxBitWidth) {
+ LLVM_DEBUG(dbgs() << "No, wrong SrcTy size: " <<
+ cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() << "\n");
+ return false;
+ }
+
+ if (match(Val, m_Load(m_Value()))) {
+ LLVM_DEBUG(dbgs() << "Yes, found narrow Load:\t"; Val->dump());
+ VL.push_back(Val);
+ VL.push_back(I);
+ return true;
+ }
+ }
+ LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump());
+ return false;
+}
+
+// Element-by-element comparison of Value lists returning true if they are
+// instructions with the same opcode or constants with the same value.
+static bool AreSymmetrical(const ValueList &VL0,
+ const ValueList &VL1) {
+ if (VL0.size() != VL1.size()) {
+ LLVM_DEBUG(dbgs() << "Muls are mismatching operand list lengths: "
+ << VL0.size() << " != " << VL1.size() << "\n");
+ return false;
+ }
+
+ const unsigned Pairs = VL0.size();
+ LLVM_DEBUG(dbgs() << "Number of operand pairs: " << Pairs << "\n");
+
+ for (unsigned i = 0; i < Pairs; ++i) {
+ const Value *V0 = VL0[i];
+ const Value *V1 = VL1[i];
+ const auto *Inst0 = dyn_cast<Instruction>(V0);
+ const auto *Inst1 = dyn_cast<Instruction>(V1);
+
+ LLVM_DEBUG(dbgs() << "Pair " << i << ":\n";
+ dbgs() << "mul1: "; V0->dump();
+ dbgs() << "mul2: "; V1->dump());
+
+ if (!Inst0 || !Inst1)
+ return false;
+
+ if (Inst0->isSameOperationAs(Inst1)) {
+ LLVM_DEBUG(dbgs() << "OK: same operation found!\n");
+ continue;
+ }
+
+ const APInt *C0, *C1;
+ if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1))
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "OK: found symmetrical operand lists.\n");
+ return true;
+}
+
+template<typename MemInst>
+static bool AreSequentialAccesses(MemInst *MemOp0, MemInst *MemOp1,
+ MemInstList &VecMem, const DataLayout &DL,
+ ScalarEvolution &SE) {
+ if (!MemOp0->isSimple() || !MemOp1->isSimple()) {
+ LLVM_DEBUG(dbgs() << "No, not touching volatile access\n");
+ return false;
+ }
+ if (isConsecutiveAccess(MemOp0, MemOp1, DL, SE)) {
+ VecMem.push_back(MemOp0);
+ VecMem.push_back(MemOp1);
+ LLVM_DEBUG(dbgs() << "OK: accesses are consecutive.\n");
+ return true;
+ }
+ LLVM_DEBUG(dbgs() << "No, accesses aren't consecutive.\n");
+ return false;
+}
+
+bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1,
+ MemInstList &VecMem) {
+ if (!Ld0 || !Ld1)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Are consecutive loads:\n";
+ dbgs() << "Ld0:"; Ld0->dump();
+ dbgs() << "Ld1:"; Ld1->dump();
+ );
+
+ if (!Ld0->hasOneUse() || !Ld1->hasOneUse()) {
+ LLVM_DEBUG(dbgs() << "No, load has more than one use.\n");
+ return false;
+ }
+
+ return AreSequentialAccesses<LoadInst>(Ld0, Ld1, VecMem, *DL, *SE);
+}
+
+PMACPairList
+ARMParallelDSP::CreateParallelMACPairs(OpChainList &Candidates) {
+ const unsigned Elems = Candidates.size();
+ PMACPairList PMACPairs;
+
+ if (Elems < 2)
+ return PMACPairs;
+
+ // TODO: for now we simply try to match consecutive pairs i and i+1.
+ // We can compare all elements, but then we need to compare and evaluate
+ // different solutions.
+ for(unsigned i=0; i<Elems-1; i+=2) {
+ BinOpChain *PMul0 = static_cast<BinOpChain*>(Candidates[i].get());
+ BinOpChain *PMul1 = static_cast<BinOpChain*>(Candidates[i+1].get());
+ const Instruction *Mul0 = PMul0->Root;
+ const Instruction *Mul1 = PMul1->Root;
+
+ if (Mul0 == Mul1)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "\nCheck parallel muls:\n";
+ dbgs() << "- "; Mul0->dump();
+ dbgs() << "- "; Mul1->dump());
+
+ const ValueList &Mul0_LHS = PMul0->LHS;
+ const ValueList &Mul0_RHS = PMul0->RHS;
+ const ValueList &Mul1_LHS = PMul1->LHS;
+ const ValueList &Mul1_RHS = PMul1->RHS;
+
+ if (!AreSymmetrical(Mul0_LHS, Mul1_LHS) ||
+ !AreSymmetrical(Mul0_RHS, Mul1_RHS))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "OK: mul operands list match:\n");
+ // The first elements of each vector should be loads with sexts. If we find
+ // that they are two pairs of consecutive loads, then these can be transformed
+ // into two wider loads and the users can be replaced with DSP
+ // intrinsics.
+ for (unsigned x = 0; x < Mul0_LHS.size(); x += 2) {
+ auto *Ld0 = dyn_cast<LoadInst>(Mul0_LHS[x]);
+ auto *Ld1 = dyn_cast<LoadInst>(Mul1_LHS[x]);
+ auto *Ld2 = dyn_cast<LoadInst>(Mul0_RHS[x]);
+ auto *Ld3 = dyn_cast<LoadInst>(Mul1_RHS[x]);
+
+ LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n";
+ dbgs() << "\t mul1: "; Mul0_LHS[x]->dump();
+ dbgs() << "\t mul2: "; Mul1_LHS[x]->dump();
+ dbgs() << "and operands " << x + 2 << ":\n";
+ dbgs() << "\t mul1: "; Mul0_RHS[x]->dump();
+ dbgs() << "\t mul2: "; Mul1_RHS[x]->dump());
+
+ if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd) &&
+ AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
+ LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
+ PMACPairs.push_back(std::make_pair(PMul0, PMul1));
+ }
+ }
+ }
+ return PMACPairs;
+}
+
+bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction,
+ PMACPairList &PMACPairs) {
+ Instruction *Acc = Reduction.Phi;
+ Instruction *InsertAfter = Reduction.AccIntAdd;
+
+ for (auto &Pair : PMACPairs) {
+ LLVM_DEBUG(dbgs() << "Found parallel MACs!!\n";
+ dbgs() << "- "; Pair.first->Root->dump();
+ dbgs() << "- "; Pair.second->Root->dump());
+ auto *VecLd0 = cast<LoadInst>(Pair.first->VecLd[0]);
+ auto *VecLd1 = cast<LoadInst>(Pair.second->VecLd[0]);
+ Acc = CreateSMLADCall(VecLd0, VecLd1, Acc, InsertAfter);
+ InsertAfter = Acc;
+ }
+
+ if (Acc != Reduction.Phi) {
+ LLVM_DEBUG(dbgs() << "Replace Accumulate: "; Acc->dump());
+ Reduction.AccIntAdd->replaceAllUsesWith(Acc);
+ return true;
+ }
+ return false;
+}
+
+static void MatchReductions(Function &F, Loop *TheLoop, BasicBlock *Header,
+ ReductionList &Reductions) {
+ RecurrenceDescriptor RecDesc;
+ const bool HasFnNoNaNAttr =
+ F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
+ const BasicBlock *Latch = TheLoop->getLoopLatch();
+
+ // We need a preheader as getIncomingValueForBlock assumes there is one.
+ if (!TheLoop->getLoopPreheader()) {
+ LLVM_DEBUG(dbgs() << "No preheader found, bailing out\n");
+ return;
+ }
+
+ for (PHINode &Phi : Header->phis()) {
+ const auto *Ty = Phi.getType();
+ if (!Ty->isIntegerTy(32))
+ continue;
+
+ const bool IsReduction =
+ RecurrenceDescriptor::AddReductionVar(&Phi,
+ RecurrenceDescriptor::RK_IntegerAdd,
+ TheLoop, HasFnNoNaNAttr, RecDesc);
+ if (!IsReduction)
+ continue;
+
+ Instruction *Acc = dyn_cast<Instruction>(Phi.getIncomingValueForBlock(Latch));
+ if (!Acc)
+ continue;
+
+ Reductions.push_back(Reduction(&Phi, Acc));
+ }
+
+ LLVM_DEBUG(
+ dbgs() << "\nAccumulating integer additions (reductions) found:\n";
+ for (auto &R : Reductions) {
+ dbgs() << "- "; R.Phi->dump();
+ dbgs() << "-> "; R.AccIntAdd->dump();
+ }
+ );
+}
+
+static void AddMACCandidate(OpChainList &Candidates,
+ const Instruction *Acc,
+ Value *MulOp0, Value *MulOp1, int MulOpNum) {
+ Instruction *Mul = dyn_cast<Instruction>(Acc->getOperand(MulOpNum));
+ LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump());
+ ValueList LHS;
+ ValueList RHS;
+ if (IsNarrowSequence<16>(MulOp0, LHS) &&
+ IsNarrowSequence<16>(MulOp1, RHS)) {
+ LLVM_DEBUG(dbgs() << "OK, found narrow mul: "; Mul->dump());
+ Candidates.push_back(make_unique<BinOpChain>(Mul, LHS, RHS));
+ }
+}
+
+static void MatchParallelMACSequences(Reduction &R,
+ OpChainList &Candidates) {
+ const Instruction *Acc = R.AccIntAdd;
+ Value *A, *MulOp0, *MulOp1;
+ LLVM_DEBUG(dbgs() << "\n- Analysing:\t"; Acc->dump());
+
+ // Pattern 1: the accumulator is the RHS of the add.
+ while(match(Acc, m_Add(m_Mul(m_Value(MulOp0), m_Value(MulOp1)),
+ m_Value(A)))){
+ AddMACCandidate(Candidates, Acc, MulOp0, MulOp1, 0);
+ Acc = dyn_cast<Instruction>(A);
+ }
+ // Pattern 2: the accumulator is the LHS of the add.
+ while(match(Acc, m_Add(m_Value(A),
+ m_Mul(m_Value(MulOp0), m_Value(MulOp1))))) {
+ AddMACCandidate(Candidates, Acc, MulOp0, MulOp1, 1);
+ Acc = dyn_cast<Instruction>(A);
+ }
+
+ // The last mul in the chain has a slightly different pattern:
+ // the mul is the first operand of the add.
+ if (match(Acc, m_Add(m_Mul(m_Value(MulOp0), m_Value(MulOp1)), m_Value(A))))
+ AddMACCandidate(Candidates, Acc, MulOp0, MulOp1, 0);
+
+ // Because we start at the bottom of the chain, and we work our way up,
+ // the muls are added in reverse program order to the list.
+ std::reverse(Candidates.begin(), Candidates.end());
+}
+
+// Collects all instructions that are not part of the MAC chains, which is the
+// set of instructions that can potentially alias with the MAC operands.
+static void AliasCandidates(BasicBlock *Header, Instructions &Reads,
+ Instructions &Writes) {
+ for (auto &I : *Header) {
+ if (I.mayReadFromMemory())
+ Reads.push_back(&I);
+ if (I.mayWriteToMemory())
+ Writes.push_back(&I);
+ }
+}
+
+// Check whether statements in the basic block that write to memory alias with
+// the memory locations accessed by the MAC-chains.
+// TODO: we need the read statements when we accept more complicated chains.
+static bool AreAliased(AliasAnalysis *AA, Instructions &Reads,
+ Instructions &Writes, OpChainList &MACCandidates) {
+ LLVM_DEBUG(dbgs() << "Alias checks:\n");
+ for (auto &MAC : MACCandidates) {
+ LLVM_DEBUG(dbgs() << "mul: "; MAC->Root->dump());
+
+ // At the moment, we only allow simple chains that consist of reads and
+ // accumulate their result with an integer add; such chains don't write to
+ // memory, and we simply bail if they do.
+ if (!MAC->ReadOnly)
+ return true;
+
+ // Now for all writes in the basic block, check that they don't alias with
+ // the memory locations accessed by our MAC-chain:
+ for (auto *I : Writes) {
+ LLVM_DEBUG(dbgs() << "- "; I->dump());
+ assert(MAC->MemLocs.size() >= 2 && "expecting at least 2 memlocs");
+ for (auto &MemLoc : MAC->MemLocs) {
+ if (isModOrRefSet(intersectModRef(AA->getModRefInfo(I, MemLoc),
+ ModRefInfo::ModRef))) {
+ LLVM_DEBUG(dbgs() << "Yes, aliases found\n");
+ return true;
+ }
+ }
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "OK: no aliases found!\n");
+ return false;
+}
+
+static bool CheckMACMemory(OpChainList &Candidates) {
+ for (auto &C : Candidates) {
+ // A mul has 2 operands, and a narrow op consists of a sext and a load; thus
+ // we expect at least 4 items in this operand value list.
+ if (C->size() < 4) {
+ LLVM_DEBUG(dbgs() << "Operand list too short.\n");
+ return false;
+ }
+ C->SetMemoryLocations();
+ ValueList &LHS = static_cast<BinOpChain*>(C.get())->LHS;
+ ValueList &RHS = static_cast<BinOpChain*>(C.get())->RHS;
+
+ // Use +=2 to skip over the expected extend instructions.
+ for (unsigned i = 0, e = LHS.size(); i < e; i += 2) {
+ if (!isa<LoadInst>(LHS[i]) || !isa<LoadInst>(RHS[i]))
+ return false;
+ }
+ }
+ return true;
+}
+
+// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
+// multiplications.
+// To use SMLAD:
+// 1) we first need to find integer add reduction PHIs,
+// 2) then from the PHI, look for this pattern:
+//
+// acc0 = phi i32 [0, %entry], [%acc1, %loop.body]
+// ld0 = load i16
+// sext0 = sext i16 %ld0 to i32
+// ld1 = load i16
+// sext1 = sext i16 %ld1 to i32
+// mul0 = mul %sext0, %sext1
+// ld2 = load i16
+// sext2 = sext i16 %ld2 to i32
+// ld3 = load i16
+// sext3 = sext i16 %ld3 to i32
+// mul1 = mul i32 %sext2, %sext3
+// add0 = add i32 %mul0, %acc0
+// acc1 = add i32 %add0, %mul1
+//
+// Which can be selected to:
+//
+// ldr.h r0
+// ldr.h r1
+// smlad r2, r0, r1, r2
+//
+// If constants are used instead of loads, these will need to be hoisted
+// out and into a register.
+//
+// If loop invariants are used instead of loads, these need to be packed
+// before the loop begins.
+//
+bool ARMParallelDSP::MatchSMLAD(Function &F) {
+ BasicBlock *Header = L->getHeader();
+ LLVM_DEBUG(dbgs() << "= Matching SMLAD =\n";
+ dbgs() << "Header block:\n"; Header->dump();
+ dbgs() << "Loop info:\n\n"; L->dump());
+
+ bool Changed = false;
+ ReductionList Reductions;
+ MatchReductions(F, L, Header, Reductions);
+
+ for (auto &R : Reductions) {
+ OpChainList MACCandidates;
+ MatchParallelMACSequences(R, MACCandidates);
+ if (!CheckMACMemory(MACCandidates))
+ continue;
+
+ R.MACCandidates = std::move(MACCandidates);
+
+ LLVM_DEBUG(dbgs() << "MAC candidates:\n";
+ for (auto &M : R.MACCandidates)
+ M->Root->dump();
+ dbgs() << "\n";);
+ }
+
+ // Collect all instructions that may read or write memory. Our alias
+ // analysis checks bail out if any of these instructions aliases with an
+ // instruction from the MAC-chain.
+ Instructions Reads, Writes;
+ AliasCandidates(Header, Reads, Writes);
+
+ for (auto &R : Reductions) {
+ if (AreAliased(AA, Reads, Writes, R.MACCandidates))
+ return false;
+ PMACPairList PMACPairs = CreateParallelMACPairs(R.MACCandidates);
+ Changed |= InsertParallelMACs(R, PMACPairs);
+ }
+
+ LLVM_DEBUG(if (Changed) dbgs() << "Header block:\n"; Header->dump(););
+ return Changed;
+}
+
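For reference, a minimal source-level sketch of the kind of reduction loop the pattern documented above MatchSMLAD describes (illustration only, not part of the patch; the function name and array types are hypothetical): a 32-bit accumulator fed by products of sign-extended 16-bit loads, which SMLAD can consume two lanes at a time.

#include <cstdint>

// Hypothetical example: both arrays hold int16_t and the trip count is even,
// so each iteration produces the mul0/mul1 and add0/acc1 chain shown above.
int32_t dot_product(const int16_t *a, const int16_t *b, int n) {
  int32_t acc = 0;
  for (int i = 0; i < n; i += 2) {
    acc += (int32_t)a[i] * (int32_t)b[i];
    acc += (int32_t)a[i + 1] * (int32_t)b[i + 1];
  }
  return acc;
}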
+static void CreateLoadIns(IRBuilder<NoFolder> &IRB, Instruction *Acc,
+ LoadInst **VecLd) {
+ const Type *AccTy = Acc->getType();
+ const unsigned AddrSpace = (*VecLd)->getPointerAddressSpace();
+
+ Value *VecPtr = IRB.CreateBitCast((*VecLd)->getPointerOperand(),
+ AccTy->getPointerTo(AddrSpace));
+ *VecLd = IRB.CreateAlignedLoad(VecPtr, (*VecLd)->getAlignment());
+}
+
+Instruction *ARMParallelDSP::CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
+ Instruction *Acc,
+ Instruction *InsertAfter) {
+ LLVM_DEBUG(dbgs() << "Create SMLAD intrinsic using:\n";
+ dbgs() << "- "; VecLd0->dump();
+ dbgs() << "- "; VecLd1->dump();
+ dbgs() << "- "; Acc->dump());
+
+ IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
+ ++BasicBlock::iterator(InsertAfter));
+
+ // Replace the reduction chain with an intrinsic call
+ CreateLoadIns(Builder, Acc, &VecLd0);
+ CreateLoadIns(Builder, Acc, &VecLd1);
+ Value* Args[] = { VecLd0, VecLd1, Acc };
+ Function *SMLAD = Intrinsic::getDeclaration(M, Intrinsic::arm_smlad);
+ CallInst *Call = Builder.CreateCall(SMLAD, Args);
+ NumSMLAD++;
+ return Call;
+}
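The call emitted by CreateSMLADCall performs a dual 16-bit multiply-accumulate per iteration. A scalar sketch of that semantics, assuming the standard SMLAD definition (accumulator plus the products of the signed low and high halves); this is an illustration only, and smlad_reference is a hypothetical name:

#include <cstdint>

// Scalar reference for one llvm.arm.smlad(a, b, acc) result: multiply both
// signed 16-bit halves of a and b and add both products to the accumulator.
int32_t smlad_reference(uint32_t a, uint32_t b, int32_t acc) {
  int16_t alo = (int16_t)(a & 0xffff), ahi = (int16_t)(a >> 16);
  int16_t blo = (int16_t)(b & 0xffff), bhi = (int16_t)(b >> 16);
  return acc + (int32_t)alo * blo + (int32_t)ahi * bhi;
}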
+
+Pass *llvm::createARMParallelDSPPass() {
+ return new ARMParallelDSP();
+}
+
+char ARMParallelDSP::ID = 0;
+
+INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
+ "Transform loops to use DSP intrinsics", false, false)
+INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
+ "Transform loops to use DSP intrinsics", false, false)
diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp
index fad0e98285e6..0e16d6bcfe2b 100644
--- a/lib/Target/ARM/ARMRegisterBankInfo.cpp
+++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp
@@ -175,15 +175,20 @@ const RegisterBank &ARMRegisterBankInfo::getRegBankFromRegClass(
switch (RC.getID()) {
case GPRRegClassID:
+ case GPRwithAPSRRegClassID:
case GPRnopcRegClassID:
+ case rGPRRegClassID:
case GPRspRegClassID:
case tGPR_and_tcGPRRegClassID:
+ case tcGPRRegClassID:
case tGPRRegClassID:
return getRegBank(ARM::GPRRegBankID);
+ case HPRRegClassID:
case SPR_8RegClassID:
case SPRRegClassID:
case DPR_8RegClassID:
case DPRRegClassID:
+ case QPRRegClassID:
return getRegBank(ARM::FPRRegBankID);
default:
llvm_unreachable("Unsupported register kind");
@@ -263,13 +268,74 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case G_FADD:
case G_FSUB:
case G_FMUL:
- case G_FDIV: {
+ case G_FDIV:
+ case G_FNEG: {
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
OperandsMapping = Ty.getSizeInBits() == 64
? &ARM::ValueMappings[ARM::DPR3OpsIdx]
: &ARM::ValueMappings[ARM::SPR3OpsIdx];
break;
}
+ case G_FMA: {
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+ OperandsMapping =
+ Ty.getSizeInBits() == 64
+ ? getOperandsMapping({&ARM::ValueMappings[ARM::DPR3OpsIdx],
+ &ARM::ValueMappings[ARM::DPR3OpsIdx],
+ &ARM::ValueMappings[ARM::DPR3OpsIdx],
+ &ARM::ValueMappings[ARM::DPR3OpsIdx]})
+ : getOperandsMapping({&ARM::ValueMappings[ARM::SPR3OpsIdx],
+ &ARM::ValueMappings[ARM::SPR3OpsIdx],
+ &ARM::ValueMappings[ARM::SPR3OpsIdx],
+ &ARM::ValueMappings[ARM::SPR3OpsIdx]});
+ break;
+ }
+ case G_FPEXT: {
+ LLT ToTy = MRI.getType(MI.getOperand(0).getReg());
+ LLT FromTy = MRI.getType(MI.getOperand(1).getReg());
+ if (ToTy.getSizeInBits() == 64 && FromTy.getSizeInBits() == 32)
+ OperandsMapping =
+ getOperandsMapping({&ARM::ValueMappings[ARM::DPR3OpsIdx],
+ &ARM::ValueMappings[ARM::SPR3OpsIdx]});
+ break;
+ }
+ case G_FPTRUNC: {
+ LLT ToTy = MRI.getType(MI.getOperand(0).getReg());
+ LLT FromTy = MRI.getType(MI.getOperand(1).getReg());
+ if (ToTy.getSizeInBits() == 32 && FromTy.getSizeInBits() == 64)
+ OperandsMapping =
+ getOperandsMapping({&ARM::ValueMappings[ARM::SPR3OpsIdx],
+ &ARM::ValueMappings[ARM::DPR3OpsIdx]});
+ break;
+ }
+ case G_FPTOSI:
+ case G_FPTOUI: {
+ LLT ToTy = MRI.getType(MI.getOperand(0).getReg());
+ LLT FromTy = MRI.getType(MI.getOperand(1).getReg());
+ if ((FromTy.getSizeInBits() == 32 || FromTy.getSizeInBits() == 64) &&
+ ToTy.getSizeInBits() == 32)
+ OperandsMapping =
+ FromTy.getSizeInBits() == 64
+ ? getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx],
+ &ARM::ValueMappings[ARM::DPR3OpsIdx]})
+ : getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx],
+ &ARM::ValueMappings[ARM::SPR3OpsIdx]});
+ break;
+ }
+ case G_SITOFP:
+ case G_UITOFP: {
+ LLT ToTy = MRI.getType(MI.getOperand(0).getReg());
+ LLT FromTy = MRI.getType(MI.getOperand(1).getReg());
+ if (FromTy.getSizeInBits() == 32 &&
+ (ToTy.getSizeInBits() == 32 || ToTy.getSizeInBits() == 64))
+ OperandsMapping =
+ ToTy.getSizeInBits() == 64
+ ? getOperandsMapping({&ARM::ValueMappings[ARM::DPR3OpsIdx],
+ &ARM::ValueMappings[ARM::GPR3OpsIdx]})
+ : getOperandsMapping({&ARM::ValueMappings[ARM::SPR3OpsIdx],
+ &ARM::ValueMappings[ARM::GPR3OpsIdx]});
+ break;
+ }
case G_CONSTANT:
case G_FRAME_INDEX:
case G_GLOBAL_VALUE:
diff --git a/lib/Target/ARM/ARMRegisterBanks.td b/lib/Target/ARM/ARMRegisterBanks.td
index 7cd2d60d36a4..6e3834da3bb5 100644
--- a/lib/Target/ARM/ARMRegisterBanks.td
+++ b/lib/Target/ARM/ARMRegisterBanks.td
@@ -11,4 +11,4 @@
//===----------------------------------------------------------------------===//
def GPRRegBank : RegisterBank<"GPRB", [GPR, GPRwithAPSR]>;
-def FPRRegBank : RegisterBank<"FPRB", [SPR, DPR]>;
+def FPRRegBank : RegisterBank<"FPRB", [HPR, SPR, DPR, QPR]>;
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
index 14526b777c70..dc56186cb54a 100644
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -307,6 +307,18 @@ def SPR : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 31)> {
let DiagnosticString = "operand must be a register in range [s0, s31]";
}
+def HPR : RegisterClass<"ARM", [f16], 32, (sequence "S%u", 0, 31)> {
+ let AltOrders = [(add (decimate HPR, 2), SPR),
+ (add (decimate HPR, 4),
+ (decimate HPR, 2),
+ (decimate (rotl HPR, 1), 4),
+ (decimate (rotl HPR, 1), 2))];
+ let AltOrderSelect = [{
+ return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF);
+ }];
+ let DiagnosticString = "operand must be a register in range [s0, s31]";
+}
+
// Subset of SPR which can be used as a source of NEON scalars for 16-bit
// operations
def SPR_8 : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 15)> {
diff --git a/lib/Target/ARM/ARMScheduleA57.td b/lib/Target/ARM/ARMScheduleA57.td
index 1ed9e14dfcd6..63f975ba6e39 100644
--- a/lib/Target/ARM/ARMScheduleA57.td
+++ b/lib/Target/ARM/ARMScheduleA57.td
@@ -92,6 +92,9 @@ def CortexA57Model : SchedMachineModel {
// Enable partial & runtime unrolling.
let LoopMicroOpBufferSize = 16;
let CompleteModel = 1;
+
+ // FIXME: Remove when all errors have been fixed.
+ let FullInstRWOverlapCheck = 0;
}
//===----------------------------------------------------------------------===//
@@ -125,8 +128,9 @@ def : InstRW<[WriteNoop], (instregex "(t)?BKPT$", "(t2)?CDP(2)?$",
"(t2)?CPS[123]p$", "(t2)?DBG$", "(t2)?DMB$", "(t2)?DSB$", "ERET$",
"(t2|t)?HINT$", "(t)?HLT$", "(t2)?HVC$", "(t2)?ISB$", "ITasm$",
"(t2)?RFE(DA|DB|IA|IB)", "(t)?SETEND", "(t2)?SETPAN", "(t2)?SMC", "SPACE",
- "(t2)?SRS(DA|DB|IA|IB)", "SWP(B)?", "t?TRAP", "UDF$", "t2DCPS", "t2SG",
- "t2TT", "tCPS", "CMP_SWAP", "t?SVC", "t2IT", "CompilerBarrier")>;
+ "(t2)?SRS(DA|DB|IA|IB)", "SWP(B)?", "t?TRAP", "(t2|t)?UDF$", "t2DCPS", "t2SG",
+ "t2TT", "tCPS", "CMP_SWAP", "t?SVC", "t2IT", "CompilerBarrier",
+ "t__brkdiv0")>;
def : InstRW<[WriteNoop], (instregex "VMRS", "VMSR", "FMSTAT")>;
@@ -146,7 +150,7 @@ def : InstRW<[WriteNoop], (instregex "FLDM", "FSTM")>;
// Pseudos
def : InstRW<[WriteNoop], (instregex "(t2)?ABS$",
"(t)?ADJCALLSTACKDOWN$", "(t)?ADJCALLSTACKUP$", "(t2|t)?Int_eh_sjlj",
- "tLDRpci_pic", "t2SUBS_PC_LR",
+ "tLDRpci_pic", "(t2)?SUBS_PC_LR",
"JUMPTABLE", "tInt_WIN_eh_sjlj_longjmp",
"VLD(1|2)LN(d|q)(WB_fixed_|WB_register_)?Asm",
"VLD(3|4)(DUP|LN)?(d|q)(WB_fixed_|WB_register_)?Asm",
@@ -279,6 +283,9 @@ def A57WriteMLA : SchedWriteRes<[A57UnitM]> { let Latency = 3; }
def A57WriteMLAL : SchedWriteRes<[A57UnitM]> { let Latency = 4; }
def A57ReadMLA : SchedReadAdvance<2, [A57WriteMLA, A57WriteMLAL]>;
+def : InstRW<[A57WriteMLA],
+ (instregex "t2SMLAD", "t2SMLADX", "t2SMLSD", "t2SMLSDX")>;
+
def : SchedAlias<WriteMAC16, A57WriteMLA>;
def : SchedAlias<WriteMAC32, A57WriteMLA>;
def : SchedAlias<ReadMAC, A57ReadMLA>;
@@ -587,6 +594,8 @@ def : InstRW<[A57WriteLDM], (instregex "(t|t2|sys)?LDM(IA|DA|DB|IB)$")>;
def : InstRW<[A57WriteLDM_Upd],
(instregex "(t|t2|sys)?LDM(IA_UPD|DA_UPD|DB_UPD|IB_UPD|IA_RET)", "tPOP")>;
+def : InstRW<[A57Write_5cyc_1L], (instregex "VLLDM")>;
+
// --- 3.9 Store Instructions ---
// Store, immed offset
@@ -705,6 +714,8 @@ def : InstRW<[A57WriteSTM], (instregex "(t2|sys|t)?STM(IA|DA|DB|IB)$")>;
def : InstRW<[A57WrBackOne, A57WriteSTM_Upd],
(instregex "(t2|sys|t)?STM(IA_UPD|DA_UPD|DB_UPD|IB_UPD)", "tPUSH")>;
+def : InstRW<[A57Write_5cyc_1S], (instregex "VLSTM")>;
+
// --- 3.10 FP Data Processing Instructions ---
def : SchedAlias<WriteFPALU32, A57Write_5cyc_1V>;
def : SchedAlias<WriteFPALU64, A57Write_5cyc_1V>;
@@ -722,9 +733,11 @@ def : InstRW<[A57WriteVcmp],
// fp convert
def : InstRW<[A57Write_5cyc_1V], (instregex
"VCVT(A|N|P|M)(SH|UH|SS|US|SD|UD)", "VCVT(BDH|THD|TDH)")>;
-
+def : InstRW<[A57Write_5cyc_1V], (instregex "VTOSLS", "VTOUHS", "VTOULS")>;
def : SchedAlias<WriteFPCVT, A57Write_5cyc_1V>;
+def : InstRW<[A57Write_5cyc_1V], (instregex "VJCVT")>;
+
// FP round to integral
def : InstRW<[A57Write_5cyc_1V], (instregex "VRINT(A|N|P|M|Z|R|X)(H|S|D)$")>;
@@ -734,6 +747,8 @@ def : SchedAlias<WriteFPDIV64, A57Write_32cyc_1W>;
def : SchedAlias<WriteFPSQRT32, A57Write_17cyc_1W>;
def : SchedAlias<WriteFPSQRT64, A57Write_32cyc_1W>;
+def : InstRW<[A57Write_17cyc_1W], (instregex "VSQRTH")>;
+
// FP max/min
def : InstRW<[A57Write_5cyc_1V], (instregex "VMAX", "VMIN")>;
@@ -767,6 +782,13 @@ def : SchedAlias<WriteFPMAC32, A57WriteVFMA>;
def : SchedAlias<WriteFPMAC64, A57WriteVFMA>;
def : SchedAlias<ReadFPMAC, A57ReadVFMA5>;
+// VMLAH/VMLSH are not bound to scheduling classes by default, so bind them here explicitly:
+def : InstRW<[A57WriteVFMA, A57ReadVFMA5, ReadFPMUL, ReadFPMUL],
+ (instregex "VMLAH", "VMLSH", "VNMLAH", "VNMLSH")>;
+
+def : InstRW<[A57WriteVMUL],
+ (instregex "VUDOTD", "VSDOTD", "VUDOTQ", "VSDOTQ")>;
+
def : InstRW<[A57Write_3cyc_1V], (instregex "VNEG")>;
def : InstRW<[A57Write_3cyc_1V], (instregex "VSEL")>;
@@ -775,6 +797,8 @@ def : InstRW<[A57Write_3cyc_1V], (instregex "VSEL")>;
def : InstRW<[A57Write_3cyc_1V], (instregex "FCONST(D|S|H)")>;
def : InstRW<[A57Write_3cyc_1V], (instregex "VMOV(D|S|H)(cc)?$")>;
+def : InstRW<[A57Write_3cyc_1V], (instregex "VINSH")>;
+
// 5cyc L for FP transfer, vfp to core reg,
// 5cyc L for FP transfer, core reg to vfp
def : SchedAlias<WriteFPMOV, A57Write_5cyc_1L>;
@@ -1062,6 +1086,11 @@ def A57ReadVQDMLAL_VecInt : SchedReadVariant<[
def : InstRW<[A57WriteVQDMLAL_VecInt, A57ReadVQDMLAL_VecInt],
(instregex "VQDMLAL", "VQDMLSL")>;
+// Vector Saturating Rounding Doubling Multiply Accumulate/Subtract Long
+// Scheduling info from VQDMLAL/VQDMLSL
+def : InstRW<[A57WriteVQDMLAL_VecInt, A57ReadVQDMLAL_VecInt],
+ (instregex "VQRDMLAH", "VQRDMLSH")>;
+
// ASIMD multiply long
// 5cyc F0 for r0px, 4cyc F0 for r1p0 and later
def A57WriteVMULL_VecInt : SchedWriteVariant<[
@@ -1126,6 +1155,8 @@ def : InstRW<[A57Write_3cyc_1V], (instregex "VABS(fd|fq|hd|hq)")>;
def : InstRW<[A57Write_5cyc_1V], (instregex "VABD(fd|fq|hd|hq)",
"VADD(fd|fq|hd|hq)", "VPADD(f|h)", "VSUB(fd|fq|hd|hq)")>;
+def : InstRW<[A57Write_5cyc_1V], (instregex "VCADD", "VCMLA")>;
+
// ASIMD FP compare
def : InstRW<[A57Write_5cyc_1V], (instregex "VAC(GE|GT|LE|LT)",
"VC(EQ|GE|GT|LE)(fd|fq|hd|hq)")>;
@@ -1184,7 +1215,7 @@ def : InstRW<[A57Write_3cyc_1V], (instregex "VEXT(d|q)(8|16|32|64)")>;
// ASIMD move, immed
def : InstRW<[A57Write_3cyc_1V], (instregex
"VMOV(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v1i64|v2i64|v2f32|v4f32)",
- "VMOVQ0")>;
+ "VMOVD0", "VMOVQ0")>;
// ASIMD move, narrowing
def : InstRW<[A57Write_3cyc_1V], (instregex "VMOVN")>;
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index 4e72b13d94cb..fc301c589269 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -1898,6 +1898,9 @@ def CortexA9Model : SchedMachineModel {
// FIXME: Many vector operations were never given an itinerary. We
// haven't mapped these to the new model either.
let CompleteModel = 0;
+
+ // FIXME: Remove when all errors have been fixed.
+ let FullInstRWOverlapCheck = 0;
}
//===----------------------------------------------------------------------===//
@@ -1993,15 +1996,15 @@ def : WriteRes<WriteVST4, []>;
// Reserve A9UnitFP for 2 consecutive cycles.
def A9Write2V4 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
let Latency = 4;
- let ResourceCycles = [2];
+ let ResourceCycles = [2, 1];
}
def A9Write2V7 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
let Latency = 7;
- let ResourceCycles = [2];
+ let ResourceCycles = [2, 1];
}
def A9Write2V9 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
let Latency = 9;
- let ResourceCycles = [2];
+ let ResourceCycles = [2, 1];
}
// Branches don't have a def operand but still consume resources.
@@ -2534,8 +2537,7 @@ def : SchedAlias<WriteCMPsr, A9WriteALU>;
def : InstRW< [A9WriteIsr], (instregex "MOVsr", "MOVsi", "MVNsr", "MOVCCsi",
"MOVCCsr")>;
def : InstRW< [WriteALU, A9ReadALU], (instregex "MVNr")>;
-def : InstRW< [A9WriteI2], (instregex "MOVCCi32imm", "MOVi32imm",
- "MOV_ga_dyn")>;
+def : InstRW< [A9WriteI2], (instregex "MOVCCi32imm", "MOVi32imm")>;
def : InstRW< [A9WriteI2pc], (instregex "MOV_ga_pcrel")>;
def : InstRW< [A9WriteI2ld], (instregex "MOV_ga_pcrel_ldr")>;
@@ -2548,12 +2550,12 @@ def : InstRW< [A9WriteM],
"SMMLA", "SMMLAR", "SMMLS", "SMMLSR")>;
def : InstRW< [A9WriteM, A9WriteMHi],
(instregex "SMULL", "SMULLv5", "UMULL", "UMULLv5", "SMLAL$", "UMLAL",
- "UMAAL", "SMLALv5", "UMLALv5", "UMAALv5", "SMLALBB", "SMLALBT", "SMLALTB",
+ "UMAAL", "SMLALv5", "UMLALv5", "SMLALBB", "SMLALBT", "SMLALTB",
"SMLALTT")>;
// FIXME: These instructions used to have NoItinerary. Just copied the one from above.
def : InstRW< [A9WriteM, A9WriteMHi],
(instregex "SMLAD", "SMLADX", "SMLALD", "SMLALDX", "SMLSD", "SMLSDX",
- "SMLSLD", "SMLLDX", "SMUAD", "SMUADX", "SMUSD", "SMUSDX")>;
+ "SMLSLD", "SMLSLDX", "SMUAD", "SMUADX", "SMUSD", "SMUSDX")>;
def : InstRW<[A9WriteM16, A9WriteM16Hi],
(instregex "SMULBB", "SMULBT", "SMULTB", "SMULTT", "SMULWB", "SMULWT")>;
diff --git a/lib/Target/ARM/ARMScheduleR52.td b/lib/Target/ARM/ARMScheduleR52.td
index ca3172808d36..11bce45161b3 100644
--- a/lib/Target/ARM/ARMScheduleR52.td
+++ b/lib/Target/ARM/ARMScheduleR52.td
@@ -217,12 +217,11 @@ def : InstRW<[R52WriteALU_EX2, R52Read_EX1, R52Read_ISS],
"t2SXTB", "t2SXTH", "t2SXTB16", "t2UXTB", "t2UXTH", "t2UXTB16")>;
def : InstRW<[R52WriteALU_EX1, R52Read_ISS],
- (instregex "MOVCCi32imm", "MOVi32imm", "MOV_ga_dyn", "t2MOVCCi",
- "t2MOVi", "t2MOV_ga_dyn")>;
+ (instregex "MOVCCi32imm", "MOVi32imm", "t2MOVCCi", "t2MOVi")>;
def : InstRW<[R52WriteALU_EX2, R52Read_EX1],
- (instregex "MOV_ga_pcrel", "t2MOV_ga_pcrel")>;
+ (instregex "MOV_ga_pcrel$")>;
def : InstRW<[R52WriteLd,R52Read_ISS],
- (instregex "MOV_ga_pcrel_ldr", "t2MOV_ga_pcrel_ldr")>;
+ (instregex "MOV_ga_pcrel_ldr")>;
def : InstRW<[R52WriteALU_EX2, R52Read_EX1, R52Read_EX1], (instregex "SEL", "t2SEL")>;
@@ -257,12 +256,12 @@ def : InstRW< [R52WriteALU_EX2, R52Read_EX1, R52Read_EX1],
// Sum of Absolute Difference
def : InstRW< [R52WriteALU_WRI, R52Read_ISS, R52Read_ISS, R52Read_ISS],
- (instregex "USAD8", "t2USAD8", "tUSAD8","USADA8", "t2USADA8", "tUSADA8") >;
+ (instregex "USAD8", "t2USAD8", "USADA8", "t2USADA8") >;
// Integer Multiply
def : InstRW<[R52WriteMAC, R52Read_ISS, R52Read_ISS],
- (instregex "MULS", "MUL", "SMMUL", "SMMULR", "SMULBB", "SMULBT",
- "SMULTB", "SMULTT", "SMULWB", "SMULWT", "SMUSD", "SMUSDXi", "t2MUL",
+ (instregex "MUL", "SMMUL", "SMMULR", "SMULBB", "SMULBT",
+ "SMULTB", "SMULTT", "SMULWB", "SMULWT", "SMUSD", "SMUSDX", "t2MUL",
"t2SMMUL", "t2SMMULR", "t2SMULBB", "t2SMULBT", "t2SMULTB", "t2SMULTT",
"t2SMULWB", "t2SMULWT", "t2SMUSD")>;
@@ -270,17 +269,17 @@ def : InstRW<[R52WriteMAC, R52Read_ISS, R52Read_ISS],
// Even for 64-bit accumulation (or Long), the single MAC is used (not ALUs).
// The store pipeline is used partly for 64-bit operations.
def : InstRW<[R52WriteMAC, R52Read_ISS, R52Read_ISS, R52Read_ISS],
- (instregex "MLAS", "MLA", "MLS", "SMMLA", "SMMLAR", "SMMLS", "SMMLSR",
- "t2MLA", "t2MLS", "t2MLAS", "t2SMMLA", "t2SMMLAR", "t2SMMLS", "t2SMMLSR",
+ (instregex "MLA", "MLS", "SMMLA", "SMMLAR", "SMMLS", "SMMLSR",
+ "t2MLA", "t2MLS", "t2SMMLA", "t2SMMLAR", "t2SMMLS", "t2SMMLSR",
"SMUAD", "SMUADX", "t2SMUAD", "t2SMUADX",
"SMLABB", "SMLABT", "SMLATB", "SMLATT", "SMLSD", "SMLSDX",
"SMLAWB", "SMLAWT", "t2SMLABB", "t2SMLABT", "t2SMLATB", "t2SMLATT",
"t2SMLSD", "t2SMLSDX", "t2SMLAWB", "t2SMLAWT",
"SMLAD", "SMLADX", "t2SMLAD", "t2SMLADX",
"SMULL$", "UMULL$", "t2SMULL$", "t2UMULL$",
- "SMLALS", "UMLALS", "SMLAL", "UMLAL", "MLALBB", "SMLALBT",
+ "SMLAL", "UMLAL", "SMLALBT",
"SMLALTB", "SMLALTT", "SMLALD", "SMLALDX", "SMLSLD", "SMLSLDX",
- "UMAAL", "t2SMLALS", "t2UMLALS", "t2SMLAL", "t2UMLAL", "t2MLALBB",
+ "UMAAL", "t2SMLAL", "t2UMLAL",
"t2SMLALBT", "t2SMLALTB", "t2SMLALTT", "t2SMLALD", "t2SMLALDX",
"t2SMLSLD", "t2SMLSLDX", "t2UMAAL")>;
@@ -301,31 +300,31 @@ def : InstRW<[R52WriteLd, R52WriteAdr, R52Read_ISS, R52Read_ISS],
"LDRBT_POST$", "LDR(T|BT)_POST_(REG|IMM)", "LDRHT(i|r)",
"t2LD(R|RB|RH)_(PRE|POST)", "t2LD(R|RB|RH)T",
"LDR(SH|SB)(_POST|_PRE)", "t2LDR(SH|SB)(_POST|_PRE)",
- "LDRS(B|H)T(i|r)", "t2LDRS(B|H)T(i|r)", "t2LDRS(B|H)T",
+ "LDRS(B|H)T(i|r)", "t2LDRS(B|H)T(i|r)?",
"LDRD_(POST|PRE)", "t2LDRD_(POST|PRE)")>;
def : InstRW<[R52WriteALU_EX2, R52Read_EX1], (instregex "MOVS?sr", "t2MOVS?sr")>;
def : InstRW<[R52WriteALU_WRI, R52Read_EX2], (instregex "MOVT", "t2MOVT")>;
-def : InstRW<[R52WriteALU_EX2, R52Read_EX1], (instregex "AD(C|D)S?ri","ANDS?ri",
+def : InstRW<[R52WriteALU_EX2, R52Read_EX1], (instregex "AD(C|D)S?ri", "ANDS?ri",
"BICS?ri", "CLZ", "EORri", "MVNS?r", "ORRri", "RSBS?ri", "RSCri", "SBCri",
"t2AD(C|D)S?ri", "t2ANDS?ri", "t2BICS?ri","t2CLZ", "t2EORri", "t2MVN",
"t2ORRri", "t2RSBS?ri", "t2SBCri")>;
def : InstRW<[R52WriteALU_EX2, R52Read_EX1, R52Read_EX1], (instregex "AD(C|D)S?rr",
- "ANDS?rr", "BICS?rr", "CRC*", "EORrr", "ORRrr", "RSBrr", "RSCrr", "SBCrr",
+ "ANDS?rr", "BICS?rr", "CRC", "EORrr", "ORRrr", "RSBrr", "RSCrr", "SBCrr",
"t2AD(C|D)S?rr", "t2ANDS?rr", "t2BICS?rr", "t2CRC", "t2EORrr", "t2SBCrr")>;
def : InstRW<[R52WriteALU_EX2, R52Read_EX1, R52Read_ISS], (instregex "AD(C|D)S?rsi",
"ANDS?rsi", "BICS?rsi", "EORrsi", "ORRrsi", "RSBrsi", "RSCrsi", "SBCrsi",
- "t2AD(|D)S?rsi", "t2ANDS?rsi", "t2BICS?rsi", "t2EORrsi", "t2ORRrsi", "t2RSBrsi", "t2SBCrsi")>;
+ "t2AD(C|D)S?rs", "t2ANDS?rs", "t2BICS?rs", "t2EORrs", "t2ORRrs", "t2RSBrs", "t2SBCrs")>;
def : InstRW<[R52WriteALU_EX2, R52Read_EX1, R52Read_ISS, R52Read_ISS],
(instregex "AD(C|D)S?rsr", "ANDS?rsr", "BICS?rsr", "EORrsr", "MVNS?sr",
- "ORRrsrr", "RSBrsr", "RSCrsr", "SBCrsr")>;
+ "ORRrsr", "RSBrsr", "RSCrsr", "SBCrsr")>;
def : InstRW<[R52WriteALU_EX1],
- (instregex "ADR", "MOVSi", "MOVSsi", "MOVST?i16*", "MVNS?s?i", "t2MOVS?si")>;
+ (instregex "ADR", "MOVsi", "MVNS?s?i", "t2MOVS?si")>;
def : InstRW<[R52WriteALU_EX1, R52Read_ISS], (instregex "ASRi", "RORS?i")>;
def : InstRW<[R52WriteALU_EX1, R52Read_ISS, R52Read_ISS],
@@ -484,7 +483,7 @@ def : InstRW<[R52WriteILDM, R52Read_ISS],
def : InstRW<[R52WriteILDM, R52WriteAdr, R52Read_ISS],
(instregex "LDM(IA|DA|DB|IB)_UPD", "(t2|sys|t)LDM(IA|DA|DB|IB)_UPD")>;
def : InstRW<[R52WriteILDM, R52WriteAdr, R52Read_ISS],
- (instregex "LDMIA_RET", "(t|t2)LDMIA_RET", "POP", "tPOP")>;
+ (instregex "LDMIA_RET", "(t|t2)LDMIA_RET", "tPOP")>;
// Integer Store, Single Element
def : InstRW<[R52WriteLd, R52Read_ISS, R52Read_EX2],
@@ -500,7 +499,7 @@ def : InstRW<[R52WriteLd, R52WriteAdr, R52Read_ISS, R52Read_EX2],
// Integer Store, Dual
def : InstRW<[R52WriteLd, R52Read_ISS, R52Read_EX2],
- (instregex "STRD$", "t2STRDi8", "STL", "t2STRD$", "t2STL")>;
+ (instregex "STRD$", "t2STRDi8", "STL", "t2STL")>;
def : InstRW<[R52WriteLd, R52WriteAdr, R52Read_ISS, R52Read_EX2],
(instregex "(t2|t)STRD_(POST|PRE)", "STRD_(POST|PRE)")>;
@@ -508,11 +507,11 @@ def : InstRW<[R52WriteISTM, R52Read_ISS, R52Read_EX2],
(instregex "STM(IB|IA|DB|DA)$", "(t2|sys|t)STM(IB|IA|DB|DA)$")>;
def : InstRW<[R52WriteISTM, R52WriteAdr, R52Read_ISS, R52Read_EX2],
(instregex "STM(IB|IA|DB|DA)_UPD", "(t2|sys|t)STM(IB|IA|DB|DA)_UPD",
- "PUSH", "tPUSH")>;
+ "tPUSH")>;
// LDRLIT pseudo instructions, they expand to LDR + PICADD
def : InstRW<[R52WriteLd],
- (instregex "t?LDRLIT_ga_abs", "t?LDRLIT_ga_pcrel")>;
+ (instregex "t?LDRLIT_ga_abs", "t?LDRLIT_ga_pcrel$")>;
// LDRLIT_ga_pcrel_ldr expands to LDR + PICLDR
def : InstRW<[R52WriteLd], (instregex "LDRLIT_ga_pcrel_ldr")>;
@@ -530,7 +529,7 @@ def : InstRW<[R52Write2FPALU_F5, R52Read_F1], (instregex "VABS(fq|hq)")>;
def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F1], (instregex "(VACGE|VACGT)(fd|hd)")>;
def : InstRW<[R52Write2FPALU_F3, R52Read_F1, R52Read_F1], (instregex "(VACGE|VACGT)(fq|hq)")>;
-def : InstRW<[R52WriteFPALU_F5, R52Read_F1, R52Read_F1], (instregex "(VADD|VSUB)(D|S|H|fd|hd)")>;
+def : InstRW<[R52WriteFPALU_F5, R52Read_F1, R52Read_F1], (instregex "(VADD|VSUB)(D|S|H|fd|hd)$")>;
def : InstRW<[R52Write2FPALU_F5, R52Read_F1, R52Read_F1], (instregex "(VADD|VSUB)(fq|hq)")>;
def : InstRW<[R52WriteFPLd_F4, R52Read_ISS, R52Read_F1], (instregex "VLDR")>;
@@ -792,8 +791,6 @@ def : InstRW<[R52Write2FPALU_F3, R52Read_F2], (instregex "VBICi(v8i16|v4i32)")>;
def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F2, R52Read_F2], (instregex "(VBIF|VBIT|VBSL)d")>;
def : InstRW<[R52Write2FPALU_F3, R52Read_F1, R52Read_F2, R52Read_F2], (instregex "(VBIF|VBIT|VBSL)q")>;
-def : InstRW<[R52Write2FPALU_F3, R52Read_F2], (instregex "VBICi(v8i16|v4i32)")>;
-
def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F1],
(instregex "(VCEQ|VCGE|VCGT|VCLE|VCLT|VCLZ|VCMP|VCMPE|VCNT)")>;
def : InstRW<[R52WriteFPALU_F5, R52Read_F1, R52Read_F1],
diff --git a/lib/Target/ARM/ARMScheduleSwift.td b/lib/Target/ARM/ARMScheduleSwift.td
index b838688c6f04..87984648139b 100644
--- a/lib/Target/ARM/ARMScheduleSwift.td
+++ b/lib/Target/ARM/ARMScheduleSwift.td
@@ -44,6 +44,9 @@ def SwiftModel : SchedMachineModel {
let LoadLatency = 3;
let MispredictPenalty = 14; // A branch direction mispredict.
let CompleteModel = 0; // FIXME: Remove if all instructions are covered.
+
+ // FIXME: Remove when all errors have been fixed.
+ let FullInstRWOverlapCheck = 0;
}
// Swift predicates.
@@ -161,12 +164,12 @@ let SchedModel = SwiftModel in {
"t2UXTB16")>;
// Pseudo instructions.
def : InstRW<[SwiftWriteP01OneCycle2x],
- (instregex "MOVCCi32imm", "MOVi32imm", "MOV_ga_dyn", "t2MOVCCi32imm",
- "t2MOVi32imm", "t2MOV_ga_dyn")>;
+ (instregex "MOVCCi32imm", "MOVi32imm", "t2MOVCCi32imm",
+ "t2MOVi32imm")>;
def : InstRW<[SwiftWriteP01OneCycle3x],
(instregex "MOV_ga_pcrel", "t2MOV_ga_pcrel", "t2MOVi16_ga_pcrel")>;
def : InstRW<[SwiftWriteP01OneCycle2x_load],
- (instregex "MOV_ga_pcrel_ldr", "t2MOV_ga_pcrel_ldr")>;
+ (instregex "MOV_ga_pcrel_ldr")>;
def SwiftWriteP0TwoCycleTwoUops : WriteSequence<[SwiftWriteP0OneCycle], 2>;
@@ -218,8 +221,8 @@ let SchedModel = SwiftModel in {
// 4.2.12 Integer Multiply (32-bit result)
// Two sources.
def : InstRW< [SwiftWriteP0FourCycle],
- (instregex "MULS", "MUL", "SMMUL", "SMMULR", "SMULBB", "SMULBT",
- "SMULTB", "SMULTT", "SMULWB", "SMULWT", "SMUSD", "SMUSDXi", "t2MUL",
+ (instregex "MUL", "SMMUL", "SMMULR", "SMULBB", "SMULBT",
+ "SMULTB", "SMULTT", "SMULWB", "SMULWT", "SMUSD", "SMUSDX", "t2MUL",
"t2SMMUL", "t2SMMULR", "t2SMULBB", "t2SMULBT", "t2SMULTB", "t2SMULTT",
"t2SMULWB", "t2SMULWT", "t2SMUSD")>;
@@ -241,8 +244,8 @@ let SchedModel = SwiftModel in {
// Multiply accumulate, three sources
def : InstRW< [SwiftPredP0P01FourFiveCycle, ReadALU, ReadALU,
SwiftReadAdvanceFourCyclesPred],
- (instregex "MLAS", "MLA", "MLS", "SMMLA", "SMMLAR", "SMMLS", "SMMLSR",
- "t2MLA", "t2MLS", "t2MLAS", "t2SMMLA", "t2SMMLAR", "t2SMMLS",
+ (instregex "MLA", "MLS", "SMMLA", "SMMLAR", "SMMLS", "SMMLSR",
+ "t2MLA", "t2MLS", "t2SMMLA", "t2SMMLAR", "t2SMMLS",
"t2SMMLSR")>;
// 4.2.13 Integer Multiply (32-bit result, Q flag)
@@ -302,9 +305,9 @@ let SchedModel = SwiftModel in {
// We are being a bit inaccurate here.
def : InstRW< [SwiftWrite5Cycle, Swift2P03P01FiveCycle, ReadALU, ReadALU,
SchedReadAdvance<4>, SchedReadAdvance<3>],
- (instregex "SMLALS", "UMLALS", "SMLAL", "UMLAL", "MLALBB", "SMLALBT",
+ (instregex "SMLAL", "UMLAL", "SMLALBT",
"SMLALTB", "SMLALTT", "SMLALD", "SMLALDX", "SMLSLD", "SMLSLDX",
- "UMAAL", "t2SMLALS", "t2UMLALS", "t2SMLAL", "t2UMLAL", "t2MLALBB", "t2SMLALBT",
+ "UMAAL", "t2SMLAL", "t2UMLAL", "t2SMLALBB", "t2SMLALBT",
"t2SMLALTB", "t2SMLALTT", "t2SMLALD", "t2SMLALDX", "t2SMLSLD", "t2SMLSLDX",
"t2UMAAL")>;
@@ -366,7 +369,7 @@ let SchedModel = SwiftModel in {
"t2LD(R|RB|RH)_(PRE|POST)", "t2LD(R|RB|RH)T")>;
def : InstRW<[SwiftWriteP2P01P01FourCycle, SwiftWrBackOne],
(instregex "LDR(SH|SB)(_POST|_PRE)", "t2LDR(SH|SB)(_POST|_PRE)",
- "LDRS(B|H)T(i|r)", "t2LDRS(B|H)T(i|r)", "t2LDRS(B|H)T")>;
+ "LDRS(B|H)T(i|r)", "t2LDRS(B|H)T(i|r)?")>;
// 4.2.21 Integer Dual Load
// Not accurate.
@@ -483,7 +486,7 @@ let SchedModel = SwiftModel in {
(instregex /*"t2LDMIA_RET", "tLDMIA_RET", "LDMIA_RET",*/
"LDM(IA|DA|DB|IB)_UPD", "(t2|sys|t)LDM(IA|DA|DB|IB)_UPD")>;
def : InstRW<[SwiftWriteLDMAddrWB, SwiftWriteLM, SwiftWriteP1TwoCycle],
- (instregex "LDMIA_RET", "(t|t2)LDMIA_RET", "POP", "tPOP")>;
+ (instregex "LDMIA_RET", "(t|t2)LDMIA_RET", "tPOP")>;
// 4.2.23 Integer Store, Single Element
def : InstRW<[SwiftWriteP2],
(instregex "PICSTR", "STR(i12|rs)", "STRB(i12|rs)", "STRH$", "STREX",
@@ -533,7 +536,7 @@ let SchedModel = SwiftModel in {
(instregex "STM(IB|IA|DB|DA)$", "(t2|sys|t)STM(IB|IA|DB|DA)$")>;
def : InstRW<[SwiftWriteP01OneCycle, SwiftWriteSTM],
(instregex "STM(IB|IA|DB|DA)_UPD", "(t2|sys|t)STM(IB|IA|DB|DA)_UPD",
- "PUSH", "tPUSH")>;
+ "tPUSH")>;
// LDRLIT pseudo instructions, they expand to LDR + PICADD
def : InstRW<[SwiftWriteP2ThreeCycle, WriteALU],
@@ -549,14 +552,14 @@ let SchedModel = SwiftModel in {
// 4.2.27 Not issued
def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; }
- def : InstRW<[WriteNoop], (instregex "t2IT", "IT", "NOP")>;
+ def : InstRW<[WriteNoop], (instregex "t2IT", "IT")>;
// 4.2.28 Advanced SIMD, Integer, 2 cycle
def : InstRW<[SwiftWriteP0TwoCycle],
(instregex "VADDv", "VSUBv", "VNEG(s|f|v)", "VADDL", "VSUBL",
"VADDW", "VSUBW", "VHADD", "VHSUB", "VRHADD", "VPADDi",
"VPADDL", "VAND", "VBIC", "VEOR", "VORN", "VORR", "VTST",
- "VSHL", "VSHR(s|u)", "VSHLL", "VQSHL", "VQSHLU", "VBIF",
+ "VSHL", "VSHR(s|u)", "VSHLL", "VQSHL(s|u)", "VBIF",
"VBIT", "VBSL", "VSLI", "VSRI", "VCLS", "VCLZ", "VCNT")>;
def : InstRW<[SwiftWriteP1TwoCycle],
@@ -566,7 +569,7 @@ let SchedModel = SwiftModel in {
// 4.2.30 Advanced SIMD, Integer with Accumulate
def : InstRW<[SwiftWriteP0FourCycle],
(instregex "VABA", "VABAL", "VPADAL", "VRSRA", "VSRA", "VACGE", "VACGT",
- "VACLE", "VACLT", "VCEQ", "VCGE", "VCGT", "VCLE", "VCLT", "VRSHL",
+ "VCEQ", "VCGE", "VCGT", "VCLE", "VCLT", "VRSHL",
"VQRSHL", "VRSHR(u|s)", "VABS(f|v)", "VQABS", "VQNEG", "VQADD",
"VQSUB")>;
def : InstRW<[SwiftWriteP1FourCycle],
@@ -623,12 +626,12 @@ let SchedModel = SwiftModel in {
// 4.2.37 Advanced SIMD and VFP, Move
def : InstRW<[SwiftWriteP0TwoCycle],
(instregex "VMOVv", "VMOV(S|D)$", "VMOV(S|D)cc",
- "VMVNv", "VMVN(d|q)", "VMVN(S|D)cc",
+ "VMVNv", "VMVN(d|q)",
"FCONST(D|S)")>;
def : InstRW<[SwiftWriteP1TwoCycle], (instregex "VMOVN", "VMOVL")>;
def : InstRW<[WriteSequence<[SwiftWriteP0FourCycle, SwiftWriteP1TwoCycle]>],
(instregex "VQMOVN")>;
- def : InstRW<[SwiftWriteP1TwoCycle], (instregex "VDUPLN", "VDUPf")>;
+ def : InstRW<[SwiftWriteP1TwoCycle], (instregex "VDUPLN")>;
def : InstRW<[WriteSequence<[SwiftWriteP2FourCycle, SwiftWriteP1TwoCycle]>],
(instregex "VDUP(8|16|32)")>;
def : InstRW<[SwiftWriteP2ThreeCycle], (instregex "VMOVRS$")>;
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 23027e92481f..f42cbbda1b71 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -302,6 +302,8 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
}
}
+bool ARMSubtarget::isTargetHardFloat() const { return TM.isTargetHardFloat(); }
+
bool ARMSubtarget::isAPCS_ABI() const {
assert(TM.TargetABI != ARMBaseTargetMachine::ARM_ABI_UNKNOWN);
return TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_APCS;
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index eedb675a3304..74aee9a8ed38 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -105,6 +105,7 @@ protected:
ARMv81a,
ARMv82a,
ARMv83a,
+ ARMv84a,
ARMv8a,
ARMv8mBaseline,
ARMv8mMainline,
@@ -151,6 +152,7 @@ protected:
bool HasV8_1aOps = false;
bool HasV8_2aOps = false;
bool HasV8_3aOps = false;
+ bool HasV8_4aOps = false;
bool HasV8MBaselineOps = false;
bool HasV8MMainlineOps = false;
@@ -198,6 +200,9 @@ protected:
/// register allocation.
bool DisablePostRAScheduler = false;
+ /// UseAA - True if using AA during codegen (DAGCombine, MISched, etc)
+ bool UseAA = false;
+
/// HasThumb2 - True if Thumb2 instructions are supported.
bool HasThumb2 = false;
@@ -296,6 +301,12 @@ protected:
/// Has8MSecExt - if true, processor supports ARMv8-M Security Extensions
bool Has8MSecExt = false;
+ /// HasSHA2 - if true, processor supports SHA1 and SHA256
+ bool HasSHA2 = false;
+
+ /// HasAES - if true, processor supports AES
+ bool HasAES = false;
+
/// HasCrypto - if true, processor supports Cryptography extensions
bool HasCrypto = false;
@@ -316,6 +327,10 @@ protected:
/// pairs faster.
bool HasFuseAES = false;
+ /// HasFuseLiterals - if true, processor executes back to back
+ /// bottom and top halves of literal generation faster.
+ bool HasFuseLiterals = false;
+
/// If true, if conversion may decide to leave some instructions unpredicated.
bool IsProfitableToUnpredicate = false;
@@ -341,9 +356,12 @@ protected:
/// If true, the AGU and NEON/FPU units are multiplexed.
bool HasMuxedUnits = false;
- /// If true, VMOVS will never be widened to VMOVD
+ /// If true, VMOVS will never be widened to VMOVD.
bool DontWidenVMOVS = false;
+ /// If true, splat a register between VFP and NEON instructions.
+ bool SplatVFPToNeon = false;
+
/// If true, run the MLx expansion pass.
bool ExpandMLx = false;
@@ -510,6 +528,7 @@ public:
bool hasV8_1aOps() const { return HasV8_1aOps; }
bool hasV8_2aOps() const { return HasV8_2aOps; }
bool hasV8_3aOps() const { return HasV8_3aOps; }
+ bool hasV8_4aOps() const { return HasV8_4aOps; }
bool hasV8MBaselineOps() const { return HasV8MBaselineOps; }
bool hasV8MMainlineOps() const { return HasV8MMainlineOps; }
@@ -535,6 +554,8 @@ public:
bool hasVFP4() const { return HasVFPv4; }
bool hasFPARMv8() const { return HasFPARMv8; }
bool hasNEON() const { return HasNEON; }
+ bool hasSHA2() const { return HasSHA2; }
+ bool hasAES() const { return HasAES; }
bool hasCrypto() const { return HasCrypto; }
bool hasDotProd() const { return HasDotProd; }
bool hasCRC() const { return HasCRC; }
@@ -577,6 +598,7 @@ public:
bool hasSlowLoadDSubregister() const { return SlowLoadDSubregister; }
bool hasMuxedUnits() const { return HasMuxedUnits; }
bool dontWidenVMOVS() const { return DontWidenVMOVS; }
+ bool useSplatVFPToNeon() const { return SplatVFPToNeon; }
bool useNEONForFPMovs() const { return UseNEONForFPMovs; }
bool checkVLDnAccessAlignment() const { return CheckVLDnAlign; }
bool nonpipelinedVFP() const { return NonpipelinedVFP; }
@@ -598,8 +620,9 @@ public:
bool hasFullFP16() const { return HasFullFP16; }
bool hasFuseAES() const { return HasFuseAES; }
- /// \brief Return true if the CPU supports any kind of instruction fusion.
- bool hasFusion() const { return hasFuseAES(); }
+ bool hasFuseLiterals() const { return HasFuseLiterals; }
+ /// Return true if the CPU supports any kind of instruction fusion.
+ bool hasFusion() const { return hasFuseAES() || hasFuseLiterals(); }
const Triple &getTargetTriple() const { return TargetTriple; }
@@ -652,13 +675,7 @@ public:
!isTargetDarwin() && !isTargetWindows();
}
- bool isTargetHardFloat() const {
- // FIXME: this is invalid for WindowsCE
- return TargetTriple.getEnvironment() == Triple::GNUEABIHF ||
- TargetTriple.getEnvironment() == Triple::MuslEABIHF ||
- TargetTriple.getEnvironment() == Triple::EABIHF ||
- isTargetWindows() || isAAPCS16_ABI();
- }
+ bool isTargetHardFloat() const;
bool isTargetAndroid() const { return TargetTriple.isAndroid(); }
@@ -723,6 +740,10 @@ public:
/// True for some subtargets at > -O0.
bool enablePostRAScheduler() const override;
+ /// Enable use of alias analysis during code generation (during MI
+ /// scheduling, DAGCombine, etc.).
+ bool useAA() const override { return UseAA; }
+
// enableAtomicExpand- True if we need to expand our atomics.
bool enableAtomicExpand() const override;
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index 0f6d1eddc985..519f789fc215 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -22,7 +22,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/CodeGen/ExecutionDepsFix.h"
+#include "llvm/CodeGen/ExecutionDomainFix.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
@@ -34,7 +34,6 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/DataLayout.h"
@@ -45,6 +44,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetParser.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Scalar.h"
#include <cassert>
@@ -75,7 +75,7 @@ EnableGlobalMerge("arm-global-merge", cl::Hidden,
cl::desc("Enable the global merge pass"));
namespace llvm {
- void initializeARMExecutionDepsFixPass(PassRegistry&);
+ void initializeARMExecutionDomainFixPass(PassRegistry&);
}
extern "C" void LLVMInitializeARMTarget() {
@@ -89,8 +89,10 @@ extern "C" void LLVMInitializeARMTarget() {
initializeGlobalISel(Registry);
initializeARMLoadStoreOptPass(Registry);
initializeARMPreAllocLoadStoreOptPass(Registry);
+ initializeARMParallelDSPPass(Registry);
+ initializeARMCodeGenPreparePass(Registry);
initializeARMConstantIslandsPass(Registry);
- initializeARMExecutionDepsFixPass(Registry);
+ initializeARMExecutionDomainFixPass(Registry);
initializeARMExpandPseudoPass(Registry);
initializeThumb2SizeReducePass(Registry);
}
@@ -214,11 +216,7 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT,
// Default to triple-appropriate float ABI
if (Options.FloatABIType == FloatABI::Default) {
- if (TargetTriple.getEnvironment() == Triple::GNUEABIHF ||
- TargetTriple.getEnvironment() == Triple::MuslEABIHF ||
- TargetTriple.getEnvironment() == Triple::EABIHF ||
- TargetTriple.isOSWindows() ||
- TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16)
+ if (isTargetHardFloat())
this->Options.FloatABIType = FloatABI::Hard;
else
this->Options.FloatABIType = FloatABI::Soft;
@@ -238,6 +236,11 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT,
this->Options.EABIVersion = EABI::EABI5;
}
+ if (TT.isOSBinFormatMachO()) {
+ this->Options.TrapUnreachable = true;
+ this->Options.NoTrapAfterNoreturn = true;
+ }
+
initAsmInfo();
}
@@ -344,6 +347,7 @@ public:
}
void addIRPasses() override;
+ void addCodeGenPrepare() override;
bool addPreISel() override;
bool addInstSelector() override;
bool addIRTranslator() override;
@@ -355,20 +359,23 @@ public:
void addPreEmitPass() override;
};
-class ARMExecutionDepsFix : public ExecutionDepsFix {
+class ARMExecutionDomainFix : public ExecutionDomainFix {
public:
static char ID;
- ARMExecutionDepsFix() : ExecutionDepsFix(ID, ARM::DPRRegClass) {}
+ ARMExecutionDomainFix() : ExecutionDomainFix(ID, ARM::DPRRegClass) {}
StringRef getPassName() const override {
- return "ARM Execution Dependency Fix";
+ return "ARM Execution Domain Fix";
}
};
-char ARMExecutionDepsFix::ID;
+char ARMExecutionDomainFix::ID;
} // end anonymous namespace
-INITIALIZE_PASS(ARMExecutionDepsFix, "arm-execution-deps-fix",
- "ARM Execution Dependency Fix", false, false)
+INITIALIZE_PASS_BEGIN(ARMExecutionDomainFix, "arm-execution-domain-fix",
+ "ARM Execution Domain Fix", false, false)
+INITIALIZE_PASS_DEPENDENCY(ReachingDefAnalysis)
+INITIALIZE_PASS_END(ARMExecutionDomainFix, "arm-execution-domain-fix",
+ "ARM Execution Domain Fix", false, false)
TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) {
return new ARMPassConfig(*this, PM);
@@ -397,7 +404,16 @@ void ARMPassConfig::addIRPasses() {
addPass(createInterleavedAccessPass());
}
+void ARMPassConfig::addCodeGenPrepare() {
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createARMCodeGenPreparePass());
+ TargetPassConfig::addCodeGenPrepare();
+}
+
bool ARMPassConfig::addPreISel() {
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createARMParallelDSPPass());
+
if ((TM->getOptLevel() != CodeGenOpt::None &&
EnableGlobalMerge == cl::BOU_UNSET) ||
EnableGlobalMerge == cl::BOU_TRUE) {
@@ -462,7 +478,8 @@ void ARMPassConfig::addPreSched2() {
if (EnableARMLoadStoreOpt)
addPass(createARMLoadStoreOptimizationPass());
- addPass(new ARMExecutionDepsFix());
+ addPass(new ARMExecutionDomainFix());
+ addPass(createBreakFalseDeps());
}
// Expand some pseudo instructions into multiple instructions to allow
diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h
index 2072bb731f0a..2c791998e702 100644
--- a/lib/Target/ARM/ARMTargetMachine.h
+++ b/lib/Target/ARM/ARMTargetMachine.h
@@ -61,6 +61,16 @@ public:
TargetLoweringObjectFile *getObjFileLowering() const override {
return TLOF.get();
}
+
+ bool isTargetHardFloat() const {
+ return TargetTriple.getEnvironment() == Triple::GNUEABIHF ||
+ TargetTriple.getEnvironment() == Triple::MuslEABIHF ||
+ TargetTriple.getEnvironment() == Triple::EABIHF ||
+ (TargetTriple.isOSBinFormatMachO() &&
+ TargetTriple.getSubArch() == Triple::ARMSubArch_v7em) ||
+ TargetTriple.isOSWindows() ||
+ TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16;
+ }
};
/// ARM/Thumb little endian target machine.
diff --git a/lib/Target/ARM/ARMTargetObjectFile.cpp b/lib/Target/ARM/ARMTargetObjectFile.cpp
index 88bab64ffaf2..d0620761ea9c 100644
--- a/lib/Target/ARM/ARMTargetObjectFile.cpp
+++ b/lib/Target/ARM/ARMTargetObjectFile.cpp
@@ -40,9 +40,6 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx,
if (isAAPCS_ABI) {
LSDASection = nullptr;
}
-
- AttributesSection =
- getContext().getELFSection(".ARM.attributes", ELF::SHT_ARM_ATTRIBUTES, 0);
}
const MCExpr *ARMElfTargetObjectFile::getTTypeGlobalReference(
diff --git a/lib/Target/ARM/ARMTargetObjectFile.h b/lib/Target/ARM/ARMTargetObjectFile.h
index bd7aa1cfe02b..0dc0882809c0 100644
--- a/lib/Target/ARM/ARMTargetObjectFile.h
+++ b/lib/Target/ARM/ARMTargetObjectFile.h
@@ -16,9 +16,6 @@
namespace llvm {
class ARMElfTargetObjectFile : public TargetLoweringObjectFileELF {
-protected:
- const MCSection *AttributesSection = nullptr;
-
public:
ARMElfTargetObjectFile()
: TargetLoweringObjectFileELF() {
@@ -33,7 +30,7 @@ public:
MachineModuleInfo *MMI,
MCStreamer &Streamer) const override;
- /// \brief Describe a TLS variable address within debug info.
+ /// Describe a TLS variable address within debug info.
const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override;
MCSection *getExplicitSectionGlobal(const GlobalObject *GO, SectionKind Kind,
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 43d7888075b5..f8cae31641ff 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -15,7 +15,6 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
@@ -26,6 +25,7 @@
#include "llvm/IR/Type.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
@@ -126,6 +126,10 @@ int ARMTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
return 0;
}
+ // xor a, -1 can always be folded to MVN
+ if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
+ return 0;
+
return getIntImmCost(Imm, Ty);
}
@@ -351,7 +355,7 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
- // On NEON a a vector select gets lowered to vbsl.
+ // On NEON a vector select gets lowered to vbsl.
if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) {
// Lowering of some vector selects is currently far from perfect.
static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
@@ -396,8 +400,8 @@ int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) {
- // We only handle costs of reverse and alternate shuffles for now.
- if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate)
+ // We only handle costs of reverse and select shuffles for now.
+ if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Select)
return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
if (Kind == TTI::SK_Reverse) {
@@ -422,9 +426,9 @@ int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}
- if (Kind == TTI::SK_Alternate) {
- static const CostTblEntry NEONAltShuffleTbl[] = {
- // Alt shuffle cost table for ARM. Cost is the number of instructions
+ if (Kind == TTI::SK_Select) {
+ static const CostTblEntry NEONSelShuffleTbl[] = {
+ // Select shuffle cost table for ARM. Cost is the number of instructions
// required to create the shuffled vector.
{ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
@@ -441,7 +445,7 @@ int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
- if (const auto *Entry = CostTableLookup(NEONAltShuffleTbl,
+ if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
ISD::VECTOR_SHUFFLE, LT.second))
return LT.first * Entry->Cost;
return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
@@ -579,9 +583,9 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
SmallVector<BasicBlock*, 4> ExitingBlocks;
L->getExitingBlocks(ExitingBlocks);
- DEBUG(dbgs() << "Loop has:\n"
- << "Blocks: " << L->getNumBlocks() << "\n"
- << "Exit blocks: " << ExitingBlocks.size() << "\n");
+ LLVM_DEBUG(dbgs() << "Loop has:\n"
+ << "Blocks: " << L->getNumBlocks() << "\n"
+ << "Exit blocks: " << ExitingBlocks.size() << "\n");
// Only allow another exit other than the latch. This acts as an early exit
// as it mirrors the profitability calculation of the runtime unroller.
@@ -612,12 +616,14 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
}
}
- DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
+ LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
UP.Partial = true;
UP.Runtime = true;
UP.UnrollRemainder = true;
UP.DefaultUnrollRuntimeCount = 4;
+ UP.UnrollAndJam = true;
+ UP.UnrollAndJamInnerLoopThreshold = 60;
// Force unrolling small loops can be very useful because of the branch
// taken cost of the backedge.
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 97b642c99f80..807d62547337 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -527,6 +527,7 @@ class ARMAsmParser : public MCTargetAsmParser {
OperandMatchResultTy parseCoprocRegOperand(OperandVector &);
OperandMatchResultTy parseCoprocOptionOperand(OperandVector &);
OperandMatchResultTy parseMemBarrierOptOperand(OperandVector &);
+ OperandMatchResultTy parseTraceSyncBarrierOptOperand(OperandVector &);
OperandMatchResultTy parseInstSyncBarrierOptOperand(OperandVector &);
OperandMatchResultTy parseProcIFlagsOperand(OperandVector &);
OperandMatchResultTy parseMSRMaskOperand(OperandVector &);
@@ -561,6 +562,8 @@ class ARMAsmParser : public MCTargetAsmParser {
bool shouldOmitPredicateOperand(StringRef Mnemonic, OperandVector &Operands);
bool isITBlockTerminator(MCInst &Inst) const;
void fixupGNULDRDAlias(StringRef Mnemonic, OperandVector &Operands);
+ bool validateLDRDSTRD(MCInst &Inst, const OperandVector &Operands,
+ bool Load, bool ARMMode, bool Writeback);
public:
enum ARMMatchResultTy {
@@ -644,6 +647,7 @@ class ARMOperand : public MCParsedAsmOperand {
k_Immediate,
k_MemBarrierOpt,
k_InstSyncBarrierOpt,
+ k_TraceSyncBarrierOpt,
k_Memory,
k_PostIndexRegister,
k_MSRMask,
@@ -694,6 +698,10 @@ class ARMOperand : public MCParsedAsmOperand {
ARM_ISB::InstSyncBOpt Val;
};
+ struct TSBOptOp {
+ ARM_TSB::TraceSyncBOpt Val;
+ };
+
struct IFlagsOp {
ARM_PROC::IFlags Val;
};
@@ -790,6 +798,7 @@ class ARMOperand : public MCParsedAsmOperand {
struct CoprocOptionOp CoprocOption;
struct MBOptOp MBOpt;
struct ISBOptOp ISBOpt;
+ struct TSBOptOp TSBOpt;
struct ITMaskOp ITMask;
struct IFlagsOp IFlags;
struct MMaskOp MMask;
@@ -879,6 +888,11 @@ public:
return ISBOpt.Val;
}
+ ARM_TSB::TraceSyncBOpt getTraceSyncBarrierOpt() const {
+ assert(Kind == k_TraceSyncBarrierOpt && "Invalid access!");
+ return TSBOpt.Val;
+ }
+
ARM_PROC::IFlags getProcIFlags() const {
assert(Kind == k_ProcIFlags && "Invalid access!");
return IFlags.Val;
@@ -1028,7 +1042,12 @@ public:
if (!isImm()) return false;
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
if (!CE) return false;
- int64_t Value = -CE->getValue();
+ // isImm0_4095Neg is used with 32-bit immediates only.
+ // 32-bit immediates are zero-extended to 64 bits when parsed,
+ // so a plain -CE->getValue() yields a large negative number,
+ // not the small positive number intended.
+ if ((CE->getValue() >> 32) > 0) return false;
+ uint32_t Value = -static_cast<uint32_t>(CE->getValue());
return Value > 0 && Value < 4096;
}
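A quick illustration (not part of the patch; the variable names are hypothetical) of why the negation must be done at 32-bit width: a 32-bit "-1" arrives zero-extended, and negating it as a 64-bit value produces a large negative number instead of the intended small positive one.

#include <cstdint>

// Hypothetical walk-through of the widths involved.
const uint64_t Parsed = 0xFFFFFFFFULL;           // "-1" written as a 32-bit immediate
const int64_t  OldNegation = -(int64_t)Parsed;   // -4294967295: fails Value > 0
const uint32_t NewNegation = -(uint32_t)Parsed;  // 1: inside [1, 4095] as intended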
@@ -1150,10 +1169,31 @@ public:
bool isToken() const override { return Kind == k_Token; }
bool isMemBarrierOpt() const { return Kind == k_MemBarrierOpt; }
bool isInstSyncBarrierOpt() const { return Kind == k_InstSyncBarrierOpt; }
- bool isMem() const override { return Kind == k_Memory; }
+ bool isTraceSyncBarrierOpt() const { return Kind == k_TraceSyncBarrierOpt; }
+ bool isMem() const override {
+ if (Kind != k_Memory)
+ return false;
+ if (Memory.BaseRegNum &&
+ !ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Memory.BaseRegNum))
+ return false;
+ if (Memory.OffsetRegNum &&
+ !ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Memory.OffsetRegNum))
+ return false;
+ return true;
+ }
bool isShifterImm() const { return Kind == k_ShifterImmediate; }
- bool isRegShiftedReg() const { return Kind == k_ShiftedRegister; }
- bool isRegShiftedImm() const { return Kind == k_ShiftedImmediate; }
+ bool isRegShiftedReg() const {
+ return Kind == k_ShiftedRegister &&
+ ARMMCRegisterClasses[ARM::GPRRegClassID].contains(
+ RegShiftedReg.SrcReg) &&
+ ARMMCRegisterClasses[ARM::GPRRegClassID].contains(
+ RegShiftedReg.ShiftReg);
+ }
+ bool isRegShiftedImm() const {
+ return Kind == k_ShiftedImmediate &&
+ ARMMCRegisterClasses[ARM::GPRRegClassID].contains(
+ RegShiftedImm.SrcReg);
+ }
bool isRotImm() const { return Kind == k_RotateImmediate; }
bool isModImm() const { return Kind == k_ModifiedImmediate; }
@@ -1192,9 +1232,12 @@ public:
bool isConstantPoolImm() const { return Kind == k_ConstantPoolImmediate; }
bool isBitfield() const { return Kind == k_BitfieldDescriptor; }
- bool isPostIdxRegShifted() const { return Kind == k_PostIndexRegister; }
+ bool isPostIdxRegShifted() const {
+ return Kind == k_PostIndexRegister &&
+ ARMMCRegisterClasses[ARM::GPRRegClassID].contains(PostIdxReg.RegNum);
+ }
bool isPostIdxReg() const {
- return Kind == k_PostIndexRegister && PostIdxReg.ShiftTy ==ARM_AM::no_shift;
+ return isPostIdxRegShifted() && PostIdxReg.ShiftTy == ARM_AM::no_shift;
}
bool isMemNoOffset(bool alignOK = false, unsigned Alignment = 0) const {
if (!isMem())
@@ -1331,10 +1374,10 @@ public:
}
bool isAM3Offset() const {
- if (Kind != k_Immediate && Kind != k_PostIndexRegister)
+ if (isPostIdxReg())
+ return true;
+ if (!isImm())
return false;
- if (Kind == k_PostIndexRegister)
- return PostIdxReg.ShiftTy == ARM_AM::no_shift;
// Immediate offset in range [-255, 255].
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
if (!CE) return false;
@@ -1834,7 +1877,22 @@ public:
return ARM_AM::isNEONi32splat(~Value);
}
- bool isNEONByteReplicate(unsigned NumBytes) const {
+ static bool isValidNEONi32vmovImm(int64_t Value) {
+ // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X,
+ // for VMOV/VMVN only, 00Xf or 0Xff are also accepted.
+ return ((Value & 0xffffffffffffff00) == 0) ||
+ ((Value & 0xffffffffffff00ff) == 0) ||
+ ((Value & 0xffffffffff00ffff) == 0) ||
+ ((Value & 0xffffffff00ffffff) == 0) ||
+ ((Value & 0xffffffffffff00ff) == 0xff) ||
+ ((Value & 0xffffffffff00ffff) == 0xffff);
+ }
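A rough standalone sketch (not part of the patch; ValidI32VmovImm is a hypothetical mirror of the member function above) showing which 32-bit values the masks accept: a single non-zero byte, or the two trailing-ones forms used by VMOV/VMVN.

#include <cassert>
#include <cstdint>

// Hypothetical standalone mirror of the masks above, plus sample values.
static bool ValidI32VmovImm(uint64_t V) {
  return ((V & 0xffffffffffffff00) == 0) || ((V & 0xffffffffffff00ff) == 0) ||
         ((V & 0xffffffffff00ffff) == 0) || ((V & 0xffffffff00ffffff) == 0) ||
         ((V & 0xffffffffffff00ff) == 0xff) || ((V & 0xffffffffff00ffff) == 0xffff);
}

int main() {
  assert(ValidI32VmovImm(0x00005600));   // single non-zero byte
  assert(ValidI32VmovImm(0x000034ff));   // 0x00XXff form
  assert(ValidI32VmovImm(0x0012ffff));   // 0x00XXffff form
  assert(!ValidI32VmovImm(0x00123400));  // two arbitrary non-zero bytes: rejected
  return 0;
}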
+
+ bool isNEONReplicate(unsigned Width, unsigned NumElems, bool Inv) const {
+ assert((Width == 8 || Width == 16 || Width == 32) &&
+ "Invalid element width");
+ assert(NumElems * Width <= 64 && "Invalid result width");
+
if (!isImm())
return false;
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
@@ -1844,18 +1902,49 @@ public:
int64_t Value = CE->getValue();
if (!Value)
return false; // Don't bother with zero.
+ if (Inv)
+ Value = ~Value;
- unsigned char B = Value & 0xff;
- for (unsigned i = 1; i < NumBytes; ++i) {
- Value >>= 8;
- if ((Value & 0xff) != B)
+ uint64_t Mask = (1ull << Width) - 1;
+ uint64_t Elem = Value & Mask;
+ if (Width == 16 && (Elem & 0x00ff) != 0 && (Elem & 0xff00) != 0)
+ return false;
+ if (Width == 32 && !isValidNEONi32vmovImm(Elem))
+ return false;
+
+ for (unsigned i = 1; i < NumElems; ++i) {
+ Value >>= Width;
+ if ((Value & Mask) != Elem)
return false;
}
return true;
}
- bool isNEONi16ByteReplicate() const { return isNEONByteReplicate(2); }
- bool isNEONi32ByteReplicate() const { return isNEONByteReplicate(4); }
+ bool isNEONByteReplicate(unsigned NumBytes) const {
+ return isNEONReplicate(8, NumBytes, false);
+ }
+
+ static void checkNeonReplicateArgs(unsigned FromW, unsigned ToW) {
+ assert((FromW == 8 || FromW == 16 || FromW == 32) &&
+ "Invalid source width");
+ assert((ToW == 16 || ToW == 32 || ToW == 64) &&
+ "Invalid destination width");
+ assert(FromW < ToW && "ToW is not less than FromW");
+ }
+
+ template<unsigned FromW, unsigned ToW>
+ bool isNEONmovReplicate() const {
+ checkNeonReplicateArgs(FromW, ToW);
+ if (ToW == 64 && isNEONi64splat())
+ return false;
+ return isNEONReplicate(FromW, ToW / FromW, false);
+ }
+
+ template<unsigned FromW, unsigned ToW>
+ bool isNEONinvReplicate() const {
+ checkNeonReplicateArgs(FromW, ToW);
+ return isNEONReplicate(FromW, ToW / FromW, true);
+ }
bool isNEONi32vmov() const {
if (isNEONByteReplicate(4))
@@ -1866,16 +1955,7 @@ public:
// Must be a constant.
if (!CE)
return false;
- int64_t Value = CE->getValue();
- // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X,
- // for VMOV/VMVN only, 00Xf or 0Xff are also accepted.
- // FIXME: This is probably wrong and a copy and paste from previous example
- return (Value >= 0 && Value < 256) ||
- (Value >= 0x0100 && Value <= 0xff00) ||
- (Value >= 0x010000 && Value <= 0xff0000) ||
- (Value >= 0x01000000 && Value <= 0xff000000) ||
- (Value >= 0x01ff && Value <= 0xffff && (Value & 0xff) == 0xff) ||
- (Value >= 0x01ffff && Value <= 0xffffff && (Value & 0xffff) == 0xffff);
+ return isValidNEONi32vmovImm(CE->getValue());
}
bool isNEONi32vmovNeg() const {
@@ -1883,16 +1963,7 @@ public:
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
// Must be a constant.
if (!CE) return false;
- int64_t Value = ~CE->getValue();
- // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X,
- // for VMOV/VMVN only, 00Xf or 0Xff are also accepted.
- // FIXME: This is probably wrong and a copy and paste from previous example
- return (Value >= 0 && Value < 256) ||
- (Value >= 0x0100 && Value <= 0xff00) ||
- (Value >= 0x010000 && Value <= 0xff0000) ||
- (Value >= 0x01000000 && Value <= 0xff000000) ||
- (Value >= 0x01ff && Value <= 0xffff && (Value & 0xff) == 0xff) ||
- (Value >= 0x01ffff && Value <= 0xffffff && (Value & 0xffff) == 0xffff);
+ return isValidNEONi32vmovImm(~CE->getValue());
}
bool isNEONi64splat() const {
@@ -2189,7 +2260,7 @@ public:
// The operand is actually an imm0_4095, but we have its
// negation in the assembly source, so twiddle it here.
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(-CE->getValue()));
+ Inst.addOperand(MCOperand::createImm(-(uint32_t)CE->getValue()));
}
void addUnsignedOffset_b8s2Operands(MCInst &Inst, unsigned N) const {
@@ -2234,6 +2305,11 @@ public:
Inst.addOperand(MCOperand::createImm(unsigned(getInstSyncBarrierOpt())));
}
+ void addTraceSyncBarrierOptOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createImm(unsigned(getTraceSyncBarrierOpt())));
+ }
+
void addMemNoOffsetOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
@@ -2710,62 +2786,87 @@ public:
Inst.addOperand(MCOperand::createImm(Value));
}
- void addNEONinvByteReplicateOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
+ void addNEONi8ReplicateOperands(MCInst &Inst, bool Inv) const {
// The immediate encodes the type of constant as well as the value.
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- unsigned Value = CE->getValue();
assert((Inst.getOpcode() == ARM::VMOVv8i8 ||
Inst.getOpcode() == ARM::VMOVv16i8) &&
- "All vmvn instructions that wants to replicate non-zero byte "
- "always must be replaced with VMOVv8i8 or VMOVv16i8.");
- unsigned B = ((~Value) & 0xff);
+ "All instructions that wants to replicate non-zero byte "
+ "always must be replaced with VMOVv8i8 or VMOVv16i8.");
+ unsigned Value = CE->getValue();
+ if (Inv)
+ Value = ~Value;
+ unsigned B = Value & 0xff;
B |= 0xe00; // cmode = 0b1110
Inst.addOperand(MCOperand::createImm(B));
}
- void addNEONi32vmovOperands(MCInst &Inst, unsigned N) const {
+ void addNEONinvi8ReplicateOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- // The immediate encodes the type of constant as well as the value.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- unsigned Value = CE->getValue();
+ addNEONi8ReplicateOperands(Inst, true);
+ }
+
+ static unsigned encodeNeonVMOVImmediate(unsigned Value) {
if (Value >= 256 && Value <= 0xffff)
Value = (Value >> 8) | ((Value & 0xff) ? 0xc00 : 0x200);
else if (Value > 0xffff && Value <= 0xffffff)
Value = (Value >> 16) | ((Value & 0xff) ? 0xd00 : 0x400);
else if (Value > 0xffffff)
Value = (Value >> 24) | 0x600;
- Inst.addOperand(MCOperand::createImm(Value));
+ return Value;
}
- void addNEONvmovByteReplicateOperands(MCInst &Inst, unsigned N) const {
+ void addNEONi32vmovOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
// The immediate encodes the type of constant as well as the value.
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- unsigned Value = CE->getValue();
- assert((Inst.getOpcode() == ARM::VMOVv8i8 ||
- Inst.getOpcode() == ARM::VMOVv16i8) &&
- "All instructions that wants to replicate non-zero byte "
- "always must be replaced with VMOVv8i8 or VMOVv16i8.");
- unsigned B = Value & 0xff;
- B |= 0xe00; // cmode = 0b1110
- Inst.addOperand(MCOperand::createImm(B));
+ unsigned Value = encodeNeonVMOVImmediate(CE->getValue());
+ Inst.addOperand(MCOperand::createImm(Value));
+ }
+
+ void addNEONvmovi8ReplicateOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addNEONi8ReplicateOperands(Inst, false);
+ }
+
+ void addNEONvmovi16ReplicateOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ assert((Inst.getOpcode() == ARM::VMOVv4i16 ||
+ Inst.getOpcode() == ARM::VMOVv8i16 ||
+ Inst.getOpcode() == ARM::VMVNv4i16 ||
+ Inst.getOpcode() == ARM::VMVNv8i16) &&
+ "All instructions that want to replicate non-zero half-word "
+ "always must be replaced with V{MOV,MVN}v{4,8}i16.");
+ uint64_t Value = CE->getValue();
+ unsigned Elem = Value & 0xffff;
+ if (Elem >= 256)
+ Elem = (Elem >> 8) | 0x200;
+ Inst.addOperand(MCOperand::createImm(Elem));
}
void addNEONi32vmovNegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
// The immediate encodes the type of constant as well as the value.
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- unsigned Value = ~CE->getValue();
- if (Value >= 256 && Value <= 0xffff)
- Value = (Value >> 8) | ((Value & 0xff) ? 0xc00 : 0x200);
- else if (Value > 0xffff && Value <= 0xffffff)
- Value = (Value >> 16) | ((Value & 0xff) ? 0xd00 : 0x400);
- else if (Value > 0xffffff)
- Value = (Value >> 24) | 0x600;
+ unsigned Value = encodeNeonVMOVImmediate(~CE->getValue());
Inst.addOperand(MCOperand::createImm(Value));
}
+ void addNEONvmovi32ReplicateOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ assert((Inst.getOpcode() == ARM::VMOVv2i32 ||
+ Inst.getOpcode() == ARM::VMOVv4i32 ||
+ Inst.getOpcode() == ARM::VMVNv2i32 ||
+ Inst.getOpcode() == ARM::VMVNv4i32) &&
+ "All instructions that want to replicate non-zero word "
+ "always must be replaced with V{MOV,MVN}v{2,4}i32.");
+ uint64_t Value = CE->getValue();
+ unsigned Elem = encodeNeonVMOVImmediate(Value & 0xffffffff);
+ Inst.addOperand(MCOperand::createImm(Elem));
+ }
+
void addNEONi64splatOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
// The immediate encodes the type of constant as well as the value.
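The new encodeNeonVMOVImmediate helper introduced in this hunk folds the byte position of a 32-bit immediate into the cmode bits, so its callers no longer repeat the same cascade. A minimal standalone sketch of that folding, with worked immediates whose expected results are derived from the code above rather than taken from the patch:

  #include <cassert>
  #include <cstdint>

  // Same cascade as encodeNeonVMOVImmediate: bits [7:0] carry the byte, the
  // bits above them select the cmode that says how the byte expands into a
  // 32-bit element.
  static unsigned encodeSketch(unsigned Value) {
    if (Value >= 256 && Value <= 0xffff)
      Value = (Value >> 8) | ((Value & 0xff) ? 0xc00 : 0x200);
    else if (Value > 0xffff && Value <= 0xffffff)
      Value = (Value >> 16) | ((Value & 0xff) ? 0xd00 : 0x400);
    else if (Value > 0xffffff)
      Value = (Value >> 24) | 0x600;
    return Value;
  }

  int main() {
    assert(encodeSketch(0x45u) == 0x45);        // byte in bits [7:0]
    assert(encodeSketch(0x4500u) == 0x245);     // byte in bits [15:8]
    assert(encodeSketch(0x450000u) == 0x445);   // byte in bits [23:16]
    assert(encodeSketch(0x45000000u) == 0x645); // byte in bits [31:24]
    return 0;
  }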
@@ -3064,6 +3165,15 @@ public:
return Op;
}
+ static std::unique_ptr<ARMOperand>
+ CreateTraceSyncBarrierOpt(ARM_TSB::TraceSyncBOpt Opt, SMLoc S) {
+ auto Op = make_unique<ARMOperand>(k_TraceSyncBarrierOpt);
+ Op->TSBOpt.Val = Opt;
+ Op->StartLoc = S;
+ Op->EndLoc = S;
+ return Op;
+ }
+
static std::unique_ptr<ARMOperand> CreateProcIFlags(ARM_PROC::IFlags IFlags,
SMLoc S) {
auto Op = make_unique<ARMOperand>(k_ProcIFlags);
@@ -3133,6 +3243,9 @@ void ARMOperand::print(raw_ostream &OS) const {
case k_InstSyncBarrierOpt:
OS << "<ARM_ISB::" << InstSyncBOptToString(getInstSyncBarrierOpt()) << ">";
break;
+ case k_TraceSyncBarrierOpt:
+ OS << "<ARM_TSB::" << TraceSyncBOptToString(getTraceSyncBarrierOpt()) << ">";
+ break;
case k_Memory:
OS << "<memory "
<< " base:" << Memory.BaseRegNum;
@@ -4122,6 +4235,24 @@ ARMAsmParser::parseMemBarrierOptOperand(OperandVector &Operands) {
return MatchOperand_Success;
}
+OperandMatchResultTy
+ARMAsmParser::parseTraceSyncBarrierOptOperand(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+ SMLoc S = Parser.getTok().getLoc();
+ const AsmToken &Tok = Parser.getTok();
+
+ if (Tok.isNot(AsmToken::Identifier))
+ return MatchOperand_NoMatch;
+
+ if (!Tok.getString().equals_lower("csync"))
+ return MatchOperand_NoMatch;
+
+ Parser.Lex(); // Eat identifier token.
+
+ Operands.push_back(ARMOperand::CreateTraceSyncBarrierOpt(ARM_TSB::CSYNC, S));
+ return MatchOperand_Success;
+}
+
/// parseInstSyncBarrierOptOperand - Try to parse ISB inst sync barrier options.
OperandMatchResultTy
ARMAsmParser::parseInstSyncBarrierOptOperand(OperandVector &Operands) {
@@ -4215,6 +4346,18 @@ ARMAsmParser::parseMSRMaskOperand(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
SMLoc S = Parser.getTok().getLoc();
const AsmToken &Tok = Parser.getTok();
+
+ if (Tok.is(AsmToken::Integer)) {
+ int64_t Val = Tok.getIntVal();
+ if (Val > 255 || Val < 0) {
+ return MatchOperand_NoMatch;
+ }
+ unsigned SYSmvalue = Val & 0xFF;
+ Parser.Lex();
+ Operands.push_back(ARMOperand::CreateMSRMask(SYSmvalue, S));
+ return MatchOperand_Success;
+ }
+
if (!Tok.is(AsmToken::Identifier))
return MatchOperand_NoMatch;
StringRef Mask = Tok.getString();
@@ -5450,7 +5593,7 @@ bool ARMAsmParser::parsePrefix(ARMMCExpr::VariantKind &RefKind) {
return false;
}
-/// \brief Given a mnemonic, split out possible predication code and carry
+/// Given a mnemonic, split out possible predication code and carry
/// setting letters to form a canonical mnemonic and flags.
//
// FIXME: Would be nice to autogen this.
@@ -5541,7 +5684,7 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
return Mnemonic;
}
-/// \brief Given a canonical mnemonic, determine if the instruction ever allows
+/// Given a canonical mnemonic, determine if the instruction ever allows
/// inclusion of carry set or predication code operands.
//
// FIXME: It would be nice to autogen this.
@@ -5585,6 +5728,7 @@ void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,
Mnemonic != "isb" && Mnemonic != "pld" && Mnemonic != "pli" &&
Mnemonic != "pldw" && Mnemonic != "ldc2" && Mnemonic != "ldc2l" &&
Mnemonic != "stc2" && Mnemonic != "stc2l" &&
+ Mnemonic != "tsb" &&
!Mnemonic.startswith("rfe") && !Mnemonic.startswith("srs");
} else if (isThumbOne()) {
if (hasV6MOps())
@@ -5595,7 +5739,7 @@ void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,
CanAcceptPredicationCode = true;
}
-// \brief Some Thumb instructions have two operand forms that are not
+// Some Thumb instructions have two operand forms that are not
// available as three operand, convert to two operand form if possible.
//
// FIXME: We would really like to be able to tablegen'erate this.
@@ -6214,6 +6358,65 @@ bool ARMAsmParser::validatetSTMRegList(const MCInst &Inst,
return false;
}
+bool ARMAsmParser::validateLDRDSTRD(MCInst &Inst,
+ const OperandVector &Operands,
+ bool Load, bool ARMMode, bool Writeback) {
+ unsigned RtIndex = Load || !Writeback ? 0 : 1;
+ unsigned Rt = MRI->getEncodingValue(Inst.getOperand(RtIndex).getReg());
+ unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(RtIndex + 1).getReg());
+
+ if (ARMMode) {
+ // Rt can't be R14.
+ if (Rt == 14)
+ return Error(Operands[3]->getStartLoc(),
+ "Rt can't be R14");
+
+ // Rt must be even-numbered.
+ if ((Rt & 1) == 1)
+ return Error(Operands[3]->getStartLoc(),
+ "Rt must be even-numbered");
+
+ // Rt2 must be Rt + 1.
+ if (Rt2 != Rt + 1) {
+ if (Load)
+ return Error(Operands[3]->getStartLoc(),
+ "destination operands must be sequential");
+ else
+ return Error(Operands[3]->getStartLoc(),
+ "source operands must be sequential");
+ }
+
+ // FIXME: Diagnose m == 15
+ // FIXME: Diagnose ldrd with m == t || m == t2.
+ }
+
+ if (!ARMMode && Load) {
+ if (Rt2 == Rt)
+ return Error(Operands[3]->getStartLoc(),
+ "destination operands can't be identical");
+ }
+
+ if (Writeback) {
+ unsigned Rn = MRI->getEncodingValue(Inst.getOperand(3).getReg());
+
+ if (Rn == Rt || Rn == Rt2) {
+ if (Load)
+ return Error(Operands[3]->getStartLoc(),
+ "base register needs to be different from destination "
+ "registers");
+ else
+ return Error(Operands[3]->getStartLoc(),
+ "source register and base register can't be identical");
+ }
+
+ // FIXME: Diagnose ldrd/strd with writeback and n == 15.
+ // (Except the immediate form of ldrd?)
+ }
+
+ return false;
+}
+
+
// FIXME: We would really like to be able to tablegen'erate this.
bool ARMAsmParser::validateInstruction(MCInst &Inst,
const OperandVector &Operands) {
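The new validateLDRDSTRD helper collapses several per-opcode checks into one set of register constraints. A standalone restatement of those constraints (my paraphrase of the function above, using plain 0-15 register encodings where 14 is LR):

  #include <string>

  // Returns an empty string when the operands are acceptable, otherwise the
  // diagnostic the assembler would emit.
  std::string checkLdrdStrd(unsigned Rt, unsigned Rt2, unsigned Rn,
                            bool Load, bool ARMMode, bool Writeback) {
    if (ARMMode) {
      if (Rt == 14)
        return "Rt can't be R14";
      if (Rt & 1)
        return "Rt must be even-numbered";
      if (Rt2 != Rt + 1)
        return Load ? "destination operands must be sequential"
                    : "source operands must be sequential";
    }
    if (!ARMMode && Load && Rt2 == Rt)
      return "destination operands can't be identical";
    if (Writeback && (Rn == Rt || Rn == Rt2))
      return Load ? "base register needs to be different from destination registers"
                  : "source register and base register can't be identical";
    return "";
  }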
@@ -6227,7 +6430,8 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
// The instruction must be predicable.
if (!MCID.isPredicable())
return Error(Loc, "instructions in IT block must be predicable");
- unsigned Cond = Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm();
+ ARMCC::CondCodes Cond = ARMCC::CondCodes(
+ Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm());
if (Cond != currentITCond()) {
// Find the condition code Operand to get its SMLoc information.
SMLoc CondLoc;
@@ -6235,9 +6439,9 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
if (static_cast<ARMOperand &>(*Operands[I]).isCondCode())
CondLoc = Operands[I]->getStartLoc();
return Error(CondLoc, "incorrect condition in IT block; got '" +
- StringRef(ARMCondCodeToString(ARMCC::CondCodes(Cond))) +
- "', but expected '" +
- ARMCondCodeToString(ARMCC::CondCodes(currentITCond())) + "'");
+ StringRef(ARMCondCodeToString(Cond)) +
+ "', but expected '" +
+ ARMCondCodeToString(currentITCond()) + "'");
}
// Check for non-'al' condition codes outside of the IT block.
} else if (isThumbTwo() && MCID.isPredicable() &&
@@ -6259,51 +6463,43 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
const unsigned Opcode = Inst.getOpcode();
switch (Opcode) {
+ case ARM::t2IT: {
+ // Encoding is unpredictable if it ever results in a notional 'NV'
+ // predicate. Since we don't parse 'NV' directly this means an 'AL'
+ // predicate with an "else" mask bit.
+ unsigned Cond = Inst.getOperand(0).getImm();
+ unsigned Mask = Inst.getOperand(1).getImm();
+
+ // Mask hasn't been modified to the IT instruction encoding yet so
+ // conditions only allowing a 't' are a block of 1s starting at bit 3
+ // followed by all 0s. Easiest way is to just list the 4 possibilities.
+ if (Cond == ARMCC::AL && Mask != 8 && Mask != 12 && Mask != 14 &&
+ Mask != 15)
+ return Error(Loc, "unpredictable IT predicate sequence");
+ break;
+ }
case ARM::LDRD:
+ if (validateLDRDSTRD(Inst, Operands, /*Load*/true, /*ARMMode*/true,
+ /*Writeback*/false))
+ return true;
+ break;
case ARM::LDRD_PRE:
- case ARM::LDRD_POST: {
- const unsigned RtReg = Inst.getOperand(0).getReg();
-
- // Rt can't be R14.
- if (RtReg == ARM::LR)
- return Error(Operands[3]->getStartLoc(),
- "Rt can't be R14");
-
- const unsigned Rt = MRI->getEncodingValue(RtReg);
- // Rt must be even-numbered.
- if ((Rt & 1) == 1)
- return Error(Operands[3]->getStartLoc(),
- "Rt must be even-numbered");
-
- // Rt2 must be Rt + 1.
- const unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
- if (Rt2 != Rt + 1)
- return Error(Operands[3]->getStartLoc(),
- "destination operands must be sequential");
-
- if (Opcode == ARM::LDRD_PRE || Opcode == ARM::LDRD_POST) {
- const unsigned Rn = MRI->getEncodingValue(Inst.getOperand(3).getReg());
- // For addressing modes with writeback, the base register needs to be
- // different from the destination registers.
- if (Rn == Rt || Rn == Rt2)
- return Error(Operands[3]->getStartLoc(),
- "base register needs to be different from destination "
- "registers");
- }
-
- return false;
- }
+ case ARM::LDRD_POST:
+ if (validateLDRDSTRD(Inst, Operands, /*Load*/true, /*ARMMode*/true,
+ /*Writeback*/true))
+ return true;
+ break;
case ARM::t2LDRDi8:
+ if (validateLDRDSTRD(Inst, Operands, /*Load*/true, /*ARMMode*/false,
+ /*Writeback*/false))
+ return true;
+ break;
case ARM::t2LDRD_PRE:
- case ARM::t2LDRD_POST: {
- // Rt2 must be different from Rt.
- unsigned Rt = MRI->getEncodingValue(Inst.getOperand(0).getReg());
- unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
- if (Rt2 == Rt)
- return Error(Operands[3]->getStartLoc(),
- "destination operands can't be identical");
- return false;
- }
+ case ARM::t2LDRD_POST:
+ if (validateLDRDSTRD(Inst, Operands, /*Load*/true, /*ARMMode*/false,
+ /*Writeback*/true))
+ return true;
+ break;
case ARM::t2BXJ: {
const unsigned RmReg = Inst.getOperand(0).getReg();
// Rm = SP is no longer unpredictable in v8-A
@@ -6312,35 +6508,39 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
"r13 (SP) is an unpredictable operand to BXJ");
return false;
}
- case ARM::STRD: {
- // Rt2 must be Rt + 1.
- unsigned Rt = MRI->getEncodingValue(Inst.getOperand(0).getReg());
- unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
- if (Rt2 != Rt + 1)
- return Error(Operands[3]->getStartLoc(),
- "source operands must be sequential");
- return false;
- }
+ case ARM::STRD:
+ if (validateLDRDSTRD(Inst, Operands, /*Load*/false, /*ARMMode*/true,
+ /*Writeback*/false))
+ return true;
+ break;
case ARM::STRD_PRE:
- case ARM::STRD_POST: {
- // Rt2 must be Rt + 1.
- unsigned Rt = MRI->getEncodingValue(Inst.getOperand(1).getReg());
- unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(2).getReg());
- if (Rt2 != Rt + 1)
- return Error(Operands[3]->getStartLoc(),
- "source operands must be sequential");
- return false;
- }
+ case ARM::STRD_POST:
+ if (validateLDRDSTRD(Inst, Operands, /*Load*/false, /*ARMMode*/true,
+ /*Writeback*/true))
+ return true;
+ break;
+ case ARM::t2STRD_PRE:
+ case ARM::t2STRD_POST:
+ if (validateLDRDSTRD(Inst, Operands, /*Load*/false, /*ARMMode*/false,
+ /*Writeback*/true))
+ return true;
+ break;
case ARM::STR_PRE_IMM:
case ARM::STR_PRE_REG:
+ case ARM::t2STR_PRE:
case ARM::STR_POST_IMM:
case ARM::STR_POST_REG:
+ case ARM::t2STR_POST:
case ARM::STRH_PRE:
+ case ARM::t2STRH_PRE:
case ARM::STRH_POST:
+ case ARM::t2STRH_POST:
case ARM::STRB_PRE_IMM:
case ARM::STRB_PRE_REG:
+ case ARM::t2STRB_PRE:
case ARM::STRB_POST_IMM:
- case ARM::STRB_POST_REG: {
+ case ARM::STRB_POST_REG:
+ case ARM::t2STRB_POST: {
// Rt must be different from Rn.
const unsigned Rt = MRI->getEncodingValue(Inst.getOperand(1).getReg());
const unsigned Rn = MRI->getEncodingValue(Inst.getOperand(2).getReg());
@@ -6352,18 +6552,28 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
}
case ARM::LDR_PRE_IMM:
case ARM::LDR_PRE_REG:
+ case ARM::t2LDR_PRE:
case ARM::LDR_POST_IMM:
case ARM::LDR_POST_REG:
+ case ARM::t2LDR_POST:
case ARM::LDRH_PRE:
+ case ARM::t2LDRH_PRE:
case ARM::LDRH_POST:
+ case ARM::t2LDRH_POST:
case ARM::LDRSH_PRE:
+ case ARM::t2LDRSH_PRE:
case ARM::LDRSH_POST:
+ case ARM::t2LDRSH_POST:
case ARM::LDRB_PRE_IMM:
case ARM::LDRB_PRE_REG:
+ case ARM::t2LDRB_PRE:
case ARM::LDRB_POST_IMM:
case ARM::LDRB_POST_REG:
+ case ARM::t2LDRB_POST:
case ARM::LDRSB_PRE:
- case ARM::LDRSB_POST: {
+ case ARM::t2LDRSB_PRE:
+ case ARM::LDRSB_POST:
+ case ARM::t2LDRSB_POST: {
// Rt must be different from Rn.
const unsigned Rt = MRI->getEncodingValue(Inst.getOperand(0).getReg());
const unsigned Rn = MRI->getEncodingValue(Inst.getOperand(2).getReg());
@@ -6374,7 +6584,9 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
return false;
}
case ARM::SBFX:
- case ARM::UBFX: {
+ case ARM::t2SBFX:
+ case ARM::UBFX:
+ case ARM::t2UBFX: {
// Width must be in range [1, 32-lsb].
unsigned LSB = Inst.getOperand(2).getImm();
unsigned Widthm1 = Inst.getOperand(3).getImm();
@@ -6592,19 +6804,40 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
break;
}
case ARM::HINT:
- case ARM::t2HINT:
- if (hasRAS()) {
- // ESB is not predicable (pred must be AL)
- unsigned Imm8 = Inst.getOperand(0).getImm();
- unsigned Pred = Inst.getOperand(1).getImm();
- if (Imm8 == 0x10 && Pred != ARMCC::AL)
- return Error(Operands[1]->getStartLoc(), "instruction 'esb' is not "
- "predicable, but condition "
- "code specified");
- }
- // Without the RAS extension, this behaves as any other unallocated hint.
+ case ARM::t2HINT: {
+ unsigned Imm8 = Inst.getOperand(0).getImm();
+ unsigned Pred = Inst.getOperand(1).getImm();
+ // ESB is not predicable (pred must be AL). Without the RAS extension, this
+ // behaves as any other unallocated hint.
+ if (Imm8 == 0x10 && Pred != ARMCC::AL && hasRAS())
+ return Error(Operands[1]->getStartLoc(), "instruction 'esb' is not "
+ "predicable, but condition "
+ "code specified");
+ if (Imm8 == 0x14 && Pred != ARMCC::AL)
+ return Error(Operands[1]->getStartLoc(), "instruction 'csdb' is not "
+ "predicable, but condition "
+ "code specified");
+ break;
+ }
+ case ARM::VMOVRRS: {
+ // Source registers must be sequential.
+ const unsigned Sm = MRI->getEncodingValue(Inst.getOperand(2).getReg());
+ const unsigned Sm1 = MRI->getEncodingValue(Inst.getOperand(3).getReg());
+ if (Sm1 != Sm + 1)
+ return Error(Operands[5]->getStartLoc(),
+ "source operands must be sequential");
break;
}
+ case ARM::VMOVSRR: {
+ // Destination registers must be sequential.
+ const unsigned Sm = MRI->getEncodingValue(Inst.getOperand(0).getReg());
+ const unsigned Sm1 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
+ if (Sm1 != Sm + 1)
+ return Error(Operands[3]->getStartLoc(),
+ "destination operands must be sequential");
+ break;
+ }
+ }
return false;
}
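The t2IT case added above rejects an AL firstcond combined with any 'else' slot, since that would require the architecturally removed 'NV' predicate. A small sketch of the same test, assuming the mask is still in the pre-encoding form the parser sees at this point:

  #include <cassert>

  // Only the four "all then" masks are legal when the leading condition is AL;
  // any other mask contains at least one 'e' slot.
  static bool isUnpredictableITForAL(unsigned Mask) {
    return Mask != 8 && Mask != 12 && Mask != 14 && Mask != 15;
  }

  int main() {
    assert(!isUnpredictableITForAL(8)); // accepted by the check above
    assert(isUnpredictableITForAL(4));  // would be diagnosed
    return 0;
  }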
@@ -10173,10 +10406,11 @@ ARMAsmParser::FilterNearMisses(SmallVectorImpl<NearMissInfo> &NearMissesIn,
Message.Message = "too many operands for instruction";
} else {
Message.Message = "invalid operand for instruction";
- DEBUG(dbgs() << "Missing diagnostic string for operand class " <<
- getMatchClassName((MatchClassKind)I.getOperandClass())
- << I.getOperandClass() << ", error " << I.getOperandError()
- << ", opcode " << MII.getName(I.getOpcode()) << "\n");
+ LLVM_DEBUG(
+ dbgs() << "Missing diagnostic string for operand class "
+ << getMatchClassName((MatchClassKind)I.getOperandClass())
+ << I.getOperandClass() << ", error " << I.getOperandError()
+ << ", opcode " << MII.getName(I.getOpcode()) << "\n");
}
NearMissesOut.emplace_back(Message);
break;
@@ -10203,6 +10437,8 @@ ARMAsmParser::FilterNearMisses(SmallVectorImpl<NearMissInfo> &NearMissesIn,
if (!isThumb() && (MissingFeatures & Feature_IsThumb2) &&
(MissingFeatures & ~(Feature_IsThumb2 | Feature_IsThumb)))
break;
+ if (isMClass() && (MissingFeatures & Feature_HasNEON))
+ break;
NearMissMessage Message;
Message.Loc = IDLoc;

diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt
index 014ac2ae8b48..d635c0add577 100644
--- a/lib/Target/ARM/CMakeLists.txt
+++ b/lib/Target/ARM/CMakeLists.txt
@@ -1,19 +1,20 @@
set(LLVM_TARGET_DEFINITIONS ARM.td)
-tablegen(LLVM ARMGenRegisterBank.inc -gen-register-bank)
+tablegen(LLVM ARMGenAsmMatcher.inc -gen-asm-matcher)
+tablegen(LLVM ARMGenAsmWriter.inc -gen-asm-writer)
+tablegen(LLVM ARMGenCallingConv.inc -gen-callingconv)
+tablegen(LLVM ARMGenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM ARMGenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM ARMGenFastISel.inc -gen-fast-isel)
tablegen(LLVM ARMGenGlobalISel.inc -gen-global-isel)
-tablegen(LLVM ARMGenRegisterInfo.inc -gen-register-info)
tablegen(LLVM ARMGenInstrInfo.inc -gen-instr-info)
tablegen(LLVM ARMGenMCCodeEmitter.inc -gen-emitter)
tablegen(LLVM ARMGenMCPseudoLowering.inc -gen-pseudo-lowering)
-tablegen(LLVM ARMGenAsmWriter.inc -gen-asm-writer)
-tablegen(LLVM ARMGenAsmMatcher.inc -gen-asm-matcher)
-tablegen(LLVM ARMGenDAGISel.inc -gen-dag-isel)
-tablegen(LLVM ARMGenFastISel.inc -gen-fast-isel)
-tablegen(LLVM ARMGenCallingConv.inc -gen-callingconv)
+tablegen(LLVM ARMGenRegisterBank.inc -gen-register-bank)
+tablegen(LLVM ARMGenRegisterInfo.inc -gen-register-info)
tablegen(LLVM ARMGenSubtargetInfo.inc -gen-subtarget)
-tablegen(LLVM ARMGenDisassemblerTables.inc -gen-disassembler)
tablegen(LLVM ARMGenSystemRegister.inc -gen-searchable-tables)
+
add_public_tablegen_target(ARMCommonTableGen)
add_llvm_target(ARMCodeGen
@@ -22,6 +23,7 @@ add_llvm_target(ARMCodeGen
ARMBaseInstrInfo.cpp
ARMBaseRegisterInfo.cpp
ARMCallLowering.cpp
+ ARMCodeGenPrepare.cpp
ARMConstantIslandPass.cpp
ARMConstantPoolValue.cpp
ARMExpandPseudoInsts.cpp
@@ -33,6 +35,7 @@ add_llvm_target(ARMCodeGen
ARMISelLowering.cpp
ARMInstrInfo.cpp
ARMLegalizerInfo.cpp
+ ARMParallelDSP.cpp
ARMLoadStoreOptimizer.cpp
ARMMCInstLower.cpp
ARMMachineFunctionInfo.cpp
@@ -55,9 +58,9 @@ add_llvm_target(ARMCodeGen
ARMComputeBlockSize.cpp
)
-add_subdirectory(TargetInfo)
add_subdirectory(AsmParser)
add_subdirectory(Disassembler)
add_subdirectory(InstPrinter)
add_subdirectory(MCTargetDesc)
+add_subdirectory(TargetInfo)
add_subdirectory(Utils)
diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index 53c635877675..4733cf49827e 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -158,6 +158,8 @@ static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeHPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo,
@@ -657,6 +659,8 @@ ThumbDisassembler::AddThumbPredicate(MCInst &MI) const {
void ThumbDisassembler::UpdateThumbVFPPredicate(MCInst &MI) const {
unsigned CC;
CC = ITBlock.getITCC();
+ if (CC == 0xF)
+ CC = ARMCC::AL;
if (ITBlock.instrInITBlock())
ITBlock.advanceITState();
@@ -727,10 +731,13 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
// code and mask operands so that we can apply them correctly
// to the subsequent instructions.
if (MI.getOpcode() == ARM::t2IT) {
-
unsigned Firstcond = MI.getOperand(0).getImm();
unsigned Mask = MI.getOperand(1).getImm();
ITBlock.setITState(Firstcond, Mask);
+
+    // An IT instruction that would give an 'NV' predicate is unpredictable.
+ if (Firstcond == ARMCC::AL && !isPowerOf2_32(Mask))
+ CS << "unpredictable IT predicate sequence";
}
return Result;
@@ -996,6 +1003,11 @@ static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo,
return MCDisassembler::Success;
}
+static DecodeStatus DecodeHPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
+ return DecodeSPRRegisterClass(Inst, RegNo, Address, Decoder);
+}
+
static const uint16_t DPRDecoderTable[] = {
ARM::D0, ARM::D1, ARM::D2, ARM::D3,
ARM::D4, ARM::D5, ARM::D6, ARM::D7,
@@ -4142,7 +4154,6 @@ static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Val,
case 0x8a: // msplim_ns
case 0x8b: // psplim_ns
case 0x91: // basepri_ns
- case 0x92: // basepri_max_ns
case 0x93: // faultmask_ns
if (!(FeatureBits[ARM::HasV8MMainlineOps]))
return MCDisassembler::Fail;
@@ -4158,7 +4169,9 @@ static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Val,
return MCDisassembler::Fail;
break;
default:
- return MCDisassembler::Fail;
+ // Architecturally defined as unpredictable
+ S = MCDisassembler::SoftFail;
+ break;
}
if (Inst.getOpcode() == ARM::t2MSR_M) {
@@ -4198,15 +4211,8 @@ static DecodeStatus DecodeBankedReg(MCInst &Inst, unsigned Val,
// The table of encodings for these banked registers comes from B9.2.3 of the
// ARM ARM. There are patterns, but nothing regular enough to make this logic
// neater. So by fiat, these values are UNPREDICTABLE:
- if (!R) {
- if (SysM == 0x7 || SysM == 0xf || SysM == 0x18 || SysM == 0x19 ||
- SysM == 0x1a || SysM == 0x1b)
- return MCDisassembler::SoftFail;
- } else {
- if (SysM != 0xe && SysM != 0x10 && SysM != 0x12 && SysM != 0x14 &&
- SysM != 0x16 && SysM != 0x1c && SysM != 0x1e)
- return MCDisassembler::SoftFail;
- }
+ if (!ARMBankedReg::lookupBankedRegByEncoding((R << 5) | SysM))
+ return MCDisassembler::Fail;
Inst.addOperand(MCOperand::createImm(Val));
return MCDisassembler::Success;
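The rewritten DecodeBankedReg drops the hand-written SYSm range checks in favour of a lookup into the tablegen'd banked-register table, keyed by the R bit packed above the 5-bit SYSm field. A sketch of just that key construction (the lookup function itself is generated, so only its use is assumed here):

  // Packs R:SYSm into the 6-bit key passed to lookupBankedRegByEncoding.
  static unsigned bankedRegEncoding(bool R, unsigned SysM) {
    return (unsigned(R) << 5) | (SysM & 0x1f);
  }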
diff --git a/lib/Target/ARM/Disassembler/LLVMBuild.txt b/lib/Target/ARM/Disassembler/LLVMBuild.txt
index a64a8a970c05..48eef05e4f2d 100644
--- a/lib/Target/ARM/Disassembler/LLVMBuild.txt
+++ b/lib/Target/ARM/Disassembler/LLVMBuild.txt
@@ -19,5 +19,5 @@
type = Library
name = ARMDisassembler
parent = ARM
-required_libraries = ARMDesc ARMInfo MCDisassembler Support
+required_libraries = ARMDesc ARMInfo MCDisassembler Support ARMUtils
add_to_library_groups = ARM
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
index 4fc67a4f6eb5..75ed40c18fa2 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -13,8 +13,6 @@
#include "ARMInstPrinter.h"
#include "Utils/ARMBaseInfo.h"
-#include "ARMBaseRegisterInfo.h"
-#include "ARMBaseRegisterInfo.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "llvm/MC/MCAsmInfo.h"
@@ -271,6 +269,10 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
}
break;
}
+ case ARM::TSB:
+ case ARM::t2TSB:
+ O << "\ttsb\tcsync";
+ return;
}
if (!printAliasInstr(MI, STI, O))
@@ -698,6 +700,13 @@ void ARMInstPrinter::printInstSyncBOption(const MCInst *MI, unsigned OpNum,
O << ARM_ISB::InstSyncBOptToString(val);
}
+void ARMInstPrinter::printTraceSyncBOption(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ unsigned val = MI->getOperand(OpNum).getImm();
+ O << ARM_TSB::TraceSyncBOptToString(val);
+}
+
void ARMInstPrinter::printShiftImmOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -825,7 +834,8 @@ void ARMInstPrinter::printMSRMaskOperand(const MCInst *MI, unsigned OpNum,
return;
}
- llvm_unreachable("Unexpected mask value!");
+ O << SYSm;
+
return;
}
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
index 7dc311229cca..afc8515136bc 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
@@ -94,6 +94,8 @@ public:
const MCSubtargetInfo &STI, raw_ostream &O);
void printInstSyncBOption(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printTraceSyncBOption(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printShiftImmOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
void printPKHLSLShiftImm(const MCInst *MI, unsigned OpNum,
diff --git a/lib/Target/ARM/LLVMBuild.txt b/lib/Target/ARM/LLVMBuild.txt
index a450acc5e13a..78d28427f3d9 100644
--- a/lib/Target/ARM/LLVMBuild.txt
+++ b/lib/Target/ARM/LLVMBuild.txt
@@ -31,5 +31,5 @@ has_jit = 1
type = Library
name = ARMCodeGen
parent = ARM
-required_libraries = ARMAsmPrinter ARMDesc ARMInfo Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target GlobalISel ARMUtils
+required_libraries = ARMAsmPrinter ARMDesc ARMInfo Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target GlobalISel ARMUtils TransformUtils
add_to_library_groups = ARM
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index 1cb9dd44f789..f524a0081301 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -31,6 +31,7 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCValue.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/EndianStream.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/TargetParser.h"
@@ -155,7 +156,8 @@ const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
"Invalid kind!");
- return (IsLittleEndian ? InfosLE : InfosBE)[Kind - FirstTargetFixupKind];
+ return (Endian == support::little ? InfosLE
+ : InfosBE)[Kind - FirstTargetFixupKind];
}
void ARMAsmBackend::handleAssemblerFlag(MCAssemblerFlag Flag) {
@@ -171,9 +173,10 @@ void ARMAsmBackend::handleAssemblerFlag(MCAssemblerFlag Flag) {
}
}
-unsigned ARMAsmBackend::getRelaxedOpcode(unsigned Op) const {
- bool HasThumb2 = STI->getFeatureBits()[ARM::FeatureThumb2];
- bool HasV8MBaselineOps = STI->getFeatureBits()[ARM::HasV8MBaselineOps];
+unsigned ARMAsmBackend::getRelaxedOpcode(unsigned Op,
+ const MCSubtargetInfo &STI) const {
+ bool HasThumb2 = STI.getFeatureBits()[ARM::FeatureThumb2];
+ bool HasV8MBaselineOps = STI.getFeatureBits()[ARM::HasV8MBaselineOps];
switch (Op) {
default:
@@ -193,8 +196,9 @@ unsigned ARMAsmBackend::getRelaxedOpcode(unsigned Op) const {
}
}
-bool ARMAsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
- if (getRelaxedOpcode(Inst.getOpcode()) != Inst.getOpcode())
+bool ARMAsmBackend::mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const {
+ if (getRelaxedOpcode(Inst.getOpcode(), STI) != Inst.getOpcode())
return true;
return false;
}
@@ -239,7 +243,7 @@ const char *ARMAsmBackend::reasonForFixupRelaxation(const MCFixup &Fixup,
}
case ARM::fixup_arm_thumb_cb: {
// If we have a Thumb CBZ or CBNZ instruction and its target is the next
- // instruction it is is actually out of range for the instruction.
+ // instruction it is actually out of range for the instruction.
// It will be changed to a NOP.
int64_t Offset = (Value & ~1);
if (Offset == 2)
@@ -261,7 +265,7 @@ bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
void ARMAsmBackend::relaxInstruction(const MCInst &Inst,
const MCSubtargetInfo &STI,
MCInst &Res) const {
- unsigned RelaxedOp = getRelaxedOpcode(Inst.getOpcode());
+ unsigned RelaxedOp = getRelaxedOpcode(Inst.getOpcode(), STI);
// Sanity check w/ diagnostic if we get here w/ a bogus instruction.
if (RelaxedOp == Inst.getOpcode()) {
@@ -289,7 +293,7 @@ void ARMAsmBackend::relaxInstruction(const MCInst &Inst,
Res.setOpcode(RelaxedOp);
}
-bool ARMAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+bool ARMAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
const uint16_t Thumb1_16bitNopEncoding = 0x46c0; // using MOV r8,r8
const uint16_t Thumb2_16bitNopEncoding = 0xbf00; // NOP
const uint32_t ARMv4_NopEncoding = 0xe1a00000; // using MOV r0,r0
@@ -299,9 +303,9 @@ bool ARMAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
hasNOP() ? Thumb2_16bitNopEncoding : Thumb1_16bitNopEncoding;
uint64_t NumNops = Count / 2;
for (uint64_t i = 0; i != NumNops; ++i)
- OW->write16(nopEncoding);
+ support::endian::write(OS, nopEncoding, Endian);
if (Count & 1)
- OW->write8(0);
+ OS << '\0';
return true;
}
// ARM mode
@@ -309,21 +313,20 @@ bool ARMAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
hasNOP() ? ARMv6T2_NopEncoding : ARMv4_NopEncoding;
uint64_t NumNops = Count / 4;
for (uint64_t i = 0; i != NumNops; ++i)
- OW->write32(nopEncoding);
+ support::endian::write(OS, nopEncoding, Endian);
// FIXME: should this function return false when unable to write exactly
// 'Count' bytes with NOP encodings?
switch (Count % 4) {
default:
break; // No leftover bytes to write
case 1:
- OW->write8(0);
+ OS << '\0';
break;
case 2:
- OW->write16(0);
+ OS.write("\0\0", 2);
break;
case 3:
- OW->write16(0);
- OW->write8(0xa0);
+ OS.write("\0\0\xa0", 3);
break;
}
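writeNopData now writes straight to a raw_ostream, emitting whole NOP words through support::endian::write and padding the one-to-three leftover bytes by hand. A rough standalone equivalent using std::ostream (the MOV r0,r0 encoding comes from the ARMv4 constant earlier in this function; the byte swapping is spelled out instead of using the LLVM helper):

  #include <cstdint>
  #include <ostream>

  void writeArmNopPadding(std::ostream &OS, uint64_t Count, bool LittleEndian) {
    const uint32_t Nop = 0xe1a00000; // MOV r0,r0
    for (uint64_t I = 0; I != Count / 4; ++I) {
      char Bytes[4];
      for (int B = 0; B < 4; ++B)
        Bytes[B] = char((Nop >> (LittleEndian ? 8 * B : 8 * (3 - B))) & 0xff);
      OS.write(Bytes, 4);
    }
    switch (Count % 4) { // leftover bytes, mirroring the switch above
    case 1: OS.write("\0", 1);       break;
    case 2: OS.write("\0\0", 2);     break;
    case 3: OS.write("\0\0\xa0", 3); break;
    default:                         break;
    }
  }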
@@ -360,7 +363,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
const MCFixup &Fixup,
const MCValue &Target, uint64_t Value,
bool IsResolved, MCContext &Ctx,
- bool IsLittleEndian) const {
+ const MCSubtargetInfo* STI) const {
unsigned Kind = Fixup.getKind();
// MachO tries to make .o files that look vaguely pre-linked, so for MOVW/MOVT
@@ -389,6 +392,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
case FK_SecRel_4:
return Value;
case ARM::fixup_arm_movt_hi16:
+ assert(STI != nullptr);
if (IsResolved || !STI->getTargetTriple().isOSBinFormatELF())
Value >>= 16;
LLVM_FALLTHROUGH;
@@ -401,6 +405,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
return Value;
}
case ARM::fixup_t2_movt_hi16:
+ assert(STI != nullptr);
if (IsResolved || !STI->getTargetTriple().isOSBinFormatELF())
Value >>= 16;
LLVM_FALLTHROUGH;
@@ -414,7 +419,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
// inst{14-12} = Mid3;
// inst{7-0} = Lo8;
Value = (Hi4 << 16) | (i << 26) | (Mid3 << 12) | (Lo8);
- return swapHalfWords(Value, IsLittleEndian);
+ return swapHalfWords(Value, Endian == support::little);
}
case ARM::fixup_arm_ldst_pcrel_12:
// ARM PC-relative values are offset by 8.
@@ -437,7 +442,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
// Same addressing mode as fixup_arm_pcrel_10,
// but with 16-bit halfwords swapped.
if (Kind == ARM::fixup_t2_ldst_pcrel_12)
- return swapHalfWords(Value, IsLittleEndian);
+ return swapHalfWords(Value, Endian == support::little);
return Value;
}
@@ -470,7 +475,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
out |= (Value & 0x700) << 4;
out |= (Value & 0x0FF);
- return swapHalfWords(out, IsLittleEndian);
+ return swapHalfWords(out, Endian == support::little);
}
case ARM::fixup_arm_condbranch:
@@ -487,6 +492,11 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
return 0xffffff & ((Value - 8) >> 2);
case ARM::fixup_t2_uncondbranch: {
Value = Value - 4;
+ if (!isInt<25>(Value)) {
+ Ctx.reportError(Fixup.getLoc(), "Relocation out of range");
+ return 0;
+ }
+
Value >>= 1; // Low bit is not encoded.
uint32_t out = 0;
@@ -502,10 +512,15 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
out |= (Value & 0x1FF800) << 5; // imm6 field
out |= (Value & 0x0007FF); // imm11 field
- return swapHalfWords(out, IsLittleEndian);
+ return swapHalfWords(out, Endian == support::little);
}
case ARM::fixup_t2_condbranch: {
Value = Value - 4;
+ if (!isInt<21>(Value)) {
+ Ctx.reportError(Fixup.getLoc(), "Relocation out of range");
+ return 0;
+ }
+
Value >>= 1; // Low bit is not encoded.
uint64_t out = 0;
@@ -515,12 +530,14 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
out |= (Value & 0x1F800) << 5; // imm6 field
out |= (Value & 0x007FF); // imm11 field
- return swapHalfWords(out, IsLittleEndian);
+ return swapHalfWords(out, Endian == support::little);
}
case ARM::fixup_arm_thumb_bl: {
- // FIXME: We get both thumb1 and thumb2 in here, so we can only check for
- // the less strict thumb2 value.
- if (!isInt<26>(Value - 4)) {
+ if (!isInt<25>(Value - 4) ||
+ (!STI->getFeatureBits()[ARM::FeatureThumb2] &&
+ !STI->getFeatureBits()[ARM::HasV8MBaselineOps] &&
+ !STI->getFeatureBits()[ARM::HasV6MOps] &&
+ !isInt<23>(Value - 4))) {
Ctx.reportError(Fixup.getLoc(), "Relocation out of range");
return 0;
}
@@ -549,7 +566,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
uint32_t FirstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10Bits);
uint32_t SecondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) |
(uint16_t)imm11Bits);
- return joinHalfWords(FirstHalf, SecondHalf, IsLittleEndian);
+ return joinHalfWords(FirstHalf, SecondHalf, Endian == support::little);
}
case ARM::fixup_arm_thumb_blx: {
// The value doesn't encode the low two bits (always zero) and is offset by
@@ -585,12 +602,13 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
uint32_t FirstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10HBits);
uint32_t SecondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) |
((uint16_t)imm10LBits) << 1);
- return joinHalfWords(FirstHalf, SecondHalf, IsLittleEndian);
+ return joinHalfWords(FirstHalf, SecondHalf, Endian == support::little);
}
case ARM::fixup_thumb_adr_pcrel_10:
case ARM::fixup_arm_thumb_cp:
// On CPUs supporting Thumb2, this will be relaxed to an ldr.w, otherwise we
// could have an error on our hands.
+ assert(STI != nullptr);
if (!STI->getFeatureBits()[ARM::FeatureThumb2] && IsResolved) {
const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
if (FixupDiagnostic) {
@@ -615,6 +633,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
}
case ARM::fixup_arm_thumb_br:
// Offset by 4 and don't encode the lower bit, which is always 0.
+ assert(STI != nullptr);
if (!STI->getFeatureBits()[ARM::FeatureThumb2] &&
!STI->getFeatureBits()[ARM::HasV8MBaselineOps]) {
const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
@@ -626,6 +645,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
return ((Value - 4) >> 1) & 0x7ff;
case ARM::fixup_arm_thumb_bcc:
// Offset by 4 and don't encode the lower bit, which is always 0.
+ assert(STI != nullptr);
if (!STI->getFeatureBits()[ARM::FeatureThumb2]) {
const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
if (FixupDiagnostic) {
@@ -673,7 +693,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
// Same addressing mode as fixup_arm_pcrel_10, but with 16-bit halfwords
// swapped.
if (Kind == ARM::fixup_t2_pcrel_10)
- return swapHalfWords(Value, IsLittleEndian);
+ return swapHalfWords(Value, Endian == support::little);
return Value;
}
@@ -704,7 +724,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
// Same addressing mode as fixup_arm_pcrel_9, but with 16-bit halfwords
// swapped.
if (Kind == ARM::fixup_t2_pcrel_9)
- return swapHalfWords(Value, IsLittleEndian);
+ return swapHalfWords(Value, Endian == support::little);
return Value;
}
@@ -730,7 +750,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
EncValue |= (Value & 0x800) << 15;
EncValue |= (Value & 0x700) << 4;
EncValue |= (Value & 0xff);
- return swapHalfWords(EncValue, IsLittleEndian);
+ return swapHalfWords(EncValue, Endian == support::little);
}
}
}
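Most of the Thumb2 fixups above finish by calling swapHalfWords with Endian == support::little instead of the old IsLittleEndian flag. The helper itself is outside this hunk; as far as I can tell it simply exchanges the two 16-bit halves on little-endian targets, roughly:

  #include <cstdint>

  // Assumed behaviour of swapHalfWords: a Thumb2 instruction is a pair of
  // 16-bit units, so on little-endian targets the 32-bit fixup value has its
  // halfwords exchanged before being OR'd into the instruction bytes.
  static uint32_t swapHalfWordsSketch(uint32_t Value, bool IsLittleEndian) {
    if (!IsLittleEndian)
      return Value;
    return (Value << 16) | (Value >> 16);
  }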
@@ -755,7 +775,7 @@ bool ARMAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
// Create relocations for unconditional branches to function symbols with
// different execution mode in ELF binaries.
if (Sym && Sym->isELF()) {
- unsigned Type = dyn_cast<MCSymbolELF>(Sym)->getType();
+ unsigned Type = cast<MCSymbolELF>(Sym)->getType();
if ((Type == ELF::STT_FUNC || Type == ELF::STT_GNU_IFUNC)) {
if (Asm.isThumbFunc(Sym) && (FixupKind == ARM::fixup_arm_uncondbranch))
return true;
@@ -882,11 +902,11 @@ static unsigned getFixupKindContainerSizeBytes(unsigned Kind) {
void ARMAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target,
MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) const {
+ bool IsResolved,
+ const MCSubtargetInfo* STI) const {
unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
MCContext &Ctx = Asm.getContext();
- Value = adjustFixupValue(Asm, Fixup, Target, Value, IsResolved, Ctx,
- IsLittleEndian);
+ Value = adjustFixupValue(Asm, Fixup, Target, Value, IsResolved, Ctx, STI);
if (!Value)
return; // Doesn't change encoding.
@@ -895,7 +915,7 @@ void ARMAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
// Used to point to big endian bytes.
unsigned FullSizeBytes;
- if (!IsLittleEndian) {
+ if (Endian == support::big) {
FullSizeBytes = getFixupKindContainerSizeBytes(Fixup.getKind());
assert((Offset + FullSizeBytes) <= Data.size() && "Invalid fixup size!");
assert(NumBytes <= FullSizeBytes && "Invalid fixup size!");
@@ -905,14 +925,14 @@ void ARMAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
// the fixup value. The Value has been "split up" into the appropriate
// bitfields above.
for (unsigned i = 0; i != NumBytes; ++i) {
- unsigned Idx = IsLittleEndian ? i : (FullSizeBytes - 1 - i);
+ unsigned Idx = Endian == support::little ? i : (FullSizeBytes - 1 - i);
Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff);
}
}
namespace CU {
-/// \brief Compact unwind encoding values.
+/// Compact unwind encoding values.
enum CompactUnwindEncodings {
UNWIND_ARM_MODE_MASK = 0x0F000000,
UNWIND_ARM_MODE_FRAME = 0x01000000,
@@ -1153,52 +1173,39 @@ static MachO::CPUSubTypeARM getMachOSubTypeFromArch(StringRef Arch) {
}
}
-MCAsmBackend *llvm::createARMAsmBackend(const Target &T,
- const MCRegisterInfo &MRI,
- const Triple &TheTriple, StringRef CPU,
- const MCTargetOptions &Options,
- bool isLittle) {
+static MCAsmBackend *createARMAsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
+ const MCRegisterInfo &MRI,
+ const MCTargetOptions &Options,
+ support::endianness Endian) {
+ const Triple &TheTriple = STI.getTargetTriple();
switch (TheTriple.getObjectFormat()) {
default:
llvm_unreachable("unsupported object format");
case Triple::MachO: {
MachO::CPUSubTypeARM CS = getMachOSubTypeFromArch(TheTriple.getArchName());
- return new ARMAsmBackendDarwin(T, TheTriple, MRI, CS);
+ return new ARMAsmBackendDarwin(T, STI, MRI, CS);
}
case Triple::COFF:
assert(TheTriple.isOSWindows() && "non-Windows ARM COFF is not supported");
- return new ARMAsmBackendWinCOFF(T, TheTriple);
+ return new ARMAsmBackendWinCOFF(T, STI);
case Triple::ELF:
assert(TheTriple.isOSBinFormatELF() && "using ELF for non-ELF target");
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
- return new ARMAsmBackendELF(T, TheTriple, OSABI, isLittle);
+ return new ARMAsmBackendELF(T, STI, OSABI, Endian);
}
}
MCAsmBackend *llvm::createARMLEAsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
const MCTargetOptions &Options) {
- return createARMAsmBackend(T, MRI, TT, CPU, Options, true);
+ return createARMAsmBackend(T, STI, MRI, Options, support::little);
}
MCAsmBackend *llvm::createARMBEAsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
const MCTargetOptions &Options) {
- return createARMAsmBackend(T, MRI, TT, CPU, Options, false);
-}
-
-MCAsmBackend *llvm::createThumbLEAsmBackend(const Target &T,
- const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
- const MCTargetOptions &Options) {
- return createARMAsmBackend(T, MRI, TT, CPU, Options, true);
-}
-
-MCAsmBackend *llvm::createThumbBEAsmBackend(const Target &T,
- const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
- const MCTargetOptions &Options) {
- return createARMAsmBackend(T, MRI, TT, CPU, Options, false);
+ return createARMAsmBackend(T, STI, MRI, Options, support::big);
}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
index 02374966dafe..88c476bf65f4 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
@@ -19,22 +19,24 @@
namespace llvm {
class ARMAsmBackend : public MCAsmBackend {
- const MCSubtargetInfo *STI;
+ // The STI from the target triple the MCAsmBackend was instantiated with
+  // The STI from the target triple the MCAsmBackend was instantiated with;
+  // note that MCFragments may have a different local STI that should be
+  // used in preference.
bool isThumbMode; // Currently emitting Thumb code.
- bool IsLittleEndian; // Big or little endian.
public:
- ARMAsmBackend(const Target &T, const Triple &TT, bool IsLittle)
- : MCAsmBackend(), STI(ARM_MC::createARMMCSubtargetInfo(TT, "", "")),
- isThumbMode(TT.getArchName().startswith("thumb")),
- IsLittleEndian(IsLittle) {}
-
- ~ARMAsmBackend() override { delete STI; }
+ ARMAsmBackend(const Target &T, const MCSubtargetInfo &STI,
+ support::endianness Endian)
+ : MCAsmBackend(Endian), STI(STI),
+ isThumbMode(STI.getTargetTriple().isThumb()) {}
unsigned getNumFixupKinds() const override {
return ARM::NumTargetFixupKinds;
}
- bool hasNOP() const { return STI->getFeatureBits()[ARM::HasV6T2Ops]; }
+ // FIXME: this should be calculated per fragment as the STI may be
+ // different.
+ bool hasNOP() const { return STI.getFeatureBits()[ARM::HasV6T2Ops]; }
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
@@ -44,15 +46,17 @@ public:
unsigned adjustFixupValue(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, uint64_t Value,
bool IsResolved, MCContext &Ctx,
- bool IsLittleEndian) const;
+ const MCSubtargetInfo *STI) const;
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsResolved) const override;
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const override;
- unsigned getRelaxedOpcode(unsigned Op) const;
+ unsigned getRelaxedOpcode(unsigned Op, const MCSubtargetInfo &STI) const;
- bool mayNeedRelaxation(const MCInst &Inst) const override;
+ bool mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const override;
const char *reasonForFixupRelaxation(const MCFixup &Fixup,
uint64_t Value) const;
@@ -64,14 +68,13 @@ public:
void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
MCInst &Res) const override;
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
void handleAssemblerFlag(MCAssemblerFlag Flag) override;
unsigned getPointerSize() const { return 4; }
bool isThumb() const { return isThumbMode; }
void setIsThumb(bool it) { isThumbMode = it; }
- bool isLittle() const { return IsLittleEndian; }
};
} // end namespace llvm
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
index f05e3a6f1160..de1bfaf203e4 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
@@ -19,14 +19,13 @@ class ARMAsmBackendDarwin : public ARMAsmBackend {
const MCRegisterInfo &MRI;
public:
const MachO::CPUSubTypeARM Subtype;
- ARMAsmBackendDarwin(const Target &T, const Triple &TT,
+ ARMAsmBackendDarwin(const Target &T, const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI, MachO::CPUSubTypeARM st)
- : ARMAsmBackend(T, TT, /* IsLittleEndian */ true), MRI(MRI), Subtype(st) {
- }
+ : ARMAsmBackend(T, STI, support::little), MRI(MRI), Subtype(st) {}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createARMMachObjectWriter(OS, /*Is64Bit=*/false, MachO::CPU_TYPE_ARM,
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createARMMachObjectWriter(/*Is64Bit=*/false, MachO::CPU_TYPE_ARM,
Subtype);
}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h
index d0f5419a1b0f..86a583b19cf7 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h
@@ -20,13 +20,13 @@ namespace {
class ARMAsmBackendELF : public ARMAsmBackend {
public:
uint8_t OSABI;
- ARMAsmBackendELF(const Target &T, const Triple &TT, uint8_t OSABI,
- bool IsLittle)
- : ARMAsmBackend(T, TT, IsLittle), OSABI(OSABI) {}
+ ARMAsmBackendELF(const Target &T, const MCSubtargetInfo &STI, uint8_t OSABI,
+ support::endianness Endian)
+ : ARMAsmBackend(T, STI, Endian), OSABI(OSABI) {}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createARMELFObjectWriter(OS, OSABI, isLittle());
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createARMELFObjectWriter(OSABI);
}
};
}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h
index 53b9c29446a3..553922d20f43 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h
@@ -17,11 +17,11 @@ using namespace llvm;
namespace {
class ARMAsmBackendWinCOFF : public ARMAsmBackend {
public:
- ARMAsmBackendWinCOFF(const Target &T, const Triple &TheTriple)
- : ARMAsmBackend(T, TheTriple, true) {}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createARMWinCOFFObjectWriter(OS, /*Is64Bit=*/false);
+ ARMAsmBackendWinCOFF(const Target &T, const MCSubtargetInfo &STI)
+ : ARMAsmBackend(T, STI, support::little) {}
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createARMWinCOFFObjectWriter(/*Is64Bit=*/false);
}
};
}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
index c4480e3da505..b918006fe9e3 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -98,6 +98,20 @@ namespace ARM_MB {
}
} // namespace ARM_MB
+namespace ARM_TSB {
+ enum TraceSyncBOpt {
+ CSYNC = 0
+ };
+
+ inline static const char *TraceSyncBOptToString(unsigned val) {
+ switch (val) {
+ default:
+ llvm_unreachable("Unknown trace synchronization barrier operation");
+ case CSYNC: return "csync";
+ }
+ }
+} // namespace ARM_TSB
+
namespace ARM_ISB {
enum InstSyncBOpt {
RESERVED_0 = 0,
@@ -186,7 +200,8 @@ namespace ARMII {
AddrModeT2_so = 13,
AddrModeT2_pc = 14, // +/- i12 for pc relative data
AddrModeT2_i8s4 = 15, // i8 * 4
- AddrMode_i12 = 16
+ AddrMode_i12 = 16,
+ AddrMode5FP16 = 17 // i8 * 2
};
inline static const char *AddrModeToString(AddrMode addrmode) {
@@ -197,6 +212,7 @@ namespace ARMII {
case AddrMode3: return "AddrMode3";
case AddrMode4: return "AddrMode4";
case AddrMode5: return "AddrMode5";
+ case AddrMode5FP16: return "AddrMode5FP16";
case AddrMode6: return "AddrMode6";
case AddrModeT1_1: return "AddrModeT1_1";
case AddrModeT1_2: return "AddrModeT1_2";
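The new AddrMode5FP16 entry is documented only by its "// i8 * 2" comment: the 8-bit immediate is scaled by two, giving half-precision loads and stores a reach of up to 510 bytes from the base register. A tiny sketch of that scaling (the add/subtract split is an assumption based on how the other AddrMode5 form behaves):

  #include <cstdint>

  // imm8 scaled by 2; the U bit (Add) chooses the sign of the offset.
  static int32_t addrMode5FP16Offset(unsigned Imm8, bool Add) {
    int32_t Offset = int32_t(Imm8 & 0xff) * 2; // 0 .. 510
    return Add ? Offset : -Offset;
  }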
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index 3cd52fe1e7eb..dfa339091a7b 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -236,9 +236,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
}
}
-std::unique_ptr<MCObjectWriter>
-llvm::createARMELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI,
- bool IsLittleEndian) {
- return createELFObjectWriter(llvm::make_unique<ARMELFObjectWriter>(OSABI), OS,
- IsLittleEndian);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createARMELFObjectWriter(uint8_t OSABI) {
+ return llvm::make_unique<ARMELFObjectWriter>(OSABI);
}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index d465da1a7bb1..3373d691db50 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -33,6 +33,7 @@
#include "llvm/MC/MCFragment.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionELF.h"
@@ -441,9 +442,9 @@ public:
friend class ARMTargetELFStreamer;
ARMELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS, std::unique_ptr<MCCodeEmitter> Emitter,
+ std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter,
bool IsThumb)
- : MCELFStreamer(Context, std::move(TAB), OS, std::move(Emitter)),
+ : MCELFStreamer(Context, std::move(TAB), std::move(OW), std::move(Emitter)),
IsThumb(IsThumb) {
EHReset();
}
@@ -512,9 +513,11 @@ public:
assert(IsThumb);
EmitThumbMappingSymbol();
+ // Thumb wide instructions are emitted as a pair of 16-bit words of the
+ // appropriate endianness.
for (unsigned II = 0, IE = Size; II != IE; II = II + 2) {
- const unsigned I0 = LittleEndian ? II + 0 : (Size - II - 1);
- const unsigned I1 = LittleEndian ? II + 1 : (Size - II - 2);
+ const unsigned I0 = LittleEndian ? II + 0 : II + 1;
+ const unsigned I1 = LittleEndian ? II + 1 : II + 0;
Buffer[Size - II - 2] = uint8_t(Inst >> I0 * CHAR_BIT);
Buffer[Size - II - 1] = uint8_t(Inst >> I1 * CHAR_BIT);
}
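The loop above was changed so that big-endian (BE8) output keeps each 16-bit unit of a wide Thumb instruction in instruction order rather than byte-reversing the whole word. A standalone rerun of the new loop, with a made-up instruction word 0xAABBCCDD whose leading halfword sits in the high 16 bits, shows the byte order each mode now produces:

  #include <cassert>
  #include <climits>
  #include <cstdint>

  static void emitThumbWide(uint32_t Inst, bool LittleEndian, uint8_t Buffer[4]) {
    const unsigned Size = 4;
    for (unsigned II = 0; II != Size; II += 2) {
      const unsigned I0 = LittleEndian ? II + 0 : II + 1;
      const unsigned I1 = LittleEndian ? II + 1 : II + 0;
      Buffer[Size - II - 2] = uint8_t(Inst >> (I0 * CHAR_BIT));
      Buffer[Size - II - 1] = uint8_t(Inst >> (I1 * CHAR_BIT));
    }
  }

  int main() {
    uint8_t LE[4], BE[4];
    emitThumbWide(0xAABBCCDD, /*LittleEndian=*/true, LE);
    emitThumbWide(0xAABBCCDD, /*LittleEndian=*/false, BE);
    // Little-endian: each halfword byte-swapped, leading halfword first.
    assert(LE[0] == 0xBB && LE[1] == 0xAA && LE[2] == 0xDD && LE[3] == 0xCC);
    // Big-endian (BE8 code): halfwords stay big-endian, leading halfword first.
    assert(BE[0] == 0xAA && BE[1] == 0xBB && BE[2] == 0xCC && BE[3] == 0xDD);
    return 0;
  }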
@@ -856,6 +859,8 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() {
case ARM::ArchKind::ARMV8A:
case ARM::ArchKind::ARMV8_1A:
case ARM::ArchKind::ARMV8_2A:
+ case ARM::ArchKind::ARMV8_3A:
+ case ARM::ArchKind::ARMV8_4A:
setAttributeItem(CPU_arch_profile, ApplicationProfile, false);
setAttributeItem(ARM_ISA_use, Allowed, false);
setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
@@ -1066,7 +1071,7 @@ void ARMTargetELFStreamer::finishAttributeSection() {
if (Contents.empty())
return;
- std::sort(Contents.begin(), Contents.end(), AttributeItem::LessTag);
+ llvm::sort(Contents.begin(), Contents.end(), AttributeItem::LessTag);
ARMELFStreamer &Streamer = getStreamer();
@@ -1492,10 +1497,10 @@ MCTargetStreamer *createARMObjectTargetStreamer(MCStreamer &S,
MCELFStreamer *createARMELFStreamer(MCContext &Context,
std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter,
bool RelaxAll, bool IsThumb) {
- ARMELFStreamer *S = new ARMELFStreamer(Context, std::move(TAB), OS,
+ ARMELFStreamer *S = new ARMELFStreamer(Context, std::move(TAB), std::move(OW),
std::move(Emitter), IsThumb);
// FIXME: This should eventually end up somewhere else where more
// intelligent flag decisions can be made. For now we are just maintaining
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index f1f35f409900..0dab789505d5 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -1520,7 +1520,7 @@ unsigned ARMMCCodeEmitter::
getBitfieldInvertedMaskOpValue(const MCInst &MI, unsigned Op,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
- // 10 bits. lower 5 bits are are the lsb of the mask, high five bits are the
+ // 10 bits. lower 5 bits are the lsb of the mask, high five bits are the
// msb of the mask.
const MCOperand &MO = MI.getOperand(Op);
uint32_t v = ~MO.getImm();
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index ae5bc723ee5f..46434007a854 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -21,6 +21,7 @@
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -140,17 +141,21 @@ std::string ARM_MC::ParseARMTriple(const Triple &TT, StringRef CPU) {
ARMArchFeature = (ARMArchFeature + "+" + ARM::getArchName(ArchID)).str();
if (TT.isThumb()) {
- if (ARMArchFeature.empty())
- ARMArchFeature = "+thumb-mode,+v4t";
- else
- ARMArchFeature += ",+thumb-mode,+v4t";
+ if (!ARMArchFeature.empty())
+ ARMArchFeature += ",";
+ ARMArchFeature += "+thumb-mode,+v4t";
}
if (TT.isOSNaCl()) {
- if (ARMArchFeature.empty())
- ARMArchFeature = "+nacl-trap";
- else
- ARMArchFeature += ",+nacl-trap";
+ if (!ARMArchFeature.empty())
+ ARMArchFeature += ",";
+ ARMArchFeature += "+nacl-trap";
+ }
+
+ if (TT.isOSWindows()) {
+ if (!ARMArchFeature.empty())
+ ARMArchFeature += ",";
+ ARMArchFeature += "+noarm";
}
return ARMArchFeature;
@@ -201,21 +206,21 @@ static MCAsmInfo *createARMMCAsmInfo(const MCRegisterInfo &MRI,
static MCStreamer *createELFStreamer(const Triple &T, MCContext &Ctx,
std::unique_ptr<MCAsmBackend> &&MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&Emitter,
bool RelaxAll) {
return createARMELFStreamer(
- Ctx, std::move(MAB), OS, std::move(Emitter), false,
+ Ctx, std::move(MAB), std::move(OW), std::move(Emitter), false,
(T.getArch() == Triple::thumb || T.getArch() == Triple::thumbeb));
}
static MCStreamer *
createARMMachOStreamer(MCContext &Ctx, std::unique_ptr<MCAsmBackend> &&MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&Emitter, bool RelaxAll,
bool DWARFMustBeAtTheEnd) {
- return createMachOStreamer(Ctx, std::move(MAB), OS, std::move(Emitter), false,
- DWARFMustBeAtTheEnd);
+ return createMachOStreamer(Ctx, std::move(MAB), std::move(OW),
+ std::move(Emitter), false, DWARFMustBeAtTheEnd);
}
static MCInstPrinter *createARMMCInstPrinter(const Triple &T,
@@ -338,19 +343,12 @@ extern "C" void LLVMInitializeARMTargetMC() {
for (Target *T : {&getTheThumbLETarget(), &getTheThumbBETarget()})
TargetRegistry::RegisterMCInstrAnalysis(*T, createThumbMCInstrAnalysis);
- // Register the MC Code Emitter
- for (Target *T : {&getTheARMLETarget(), &getTheThumbLETarget()})
+ for (Target *T : {&getTheARMLETarget(), &getTheThumbLETarget()}) {
TargetRegistry::RegisterMCCodeEmitter(*T, createARMLEMCCodeEmitter);
- for (Target *T : {&getTheARMBETarget(), &getTheThumbBETarget()})
+ TargetRegistry::RegisterMCAsmBackend(*T, createARMLEAsmBackend);
+ }
+ for (Target *T : {&getTheARMBETarget(), &getTheThumbBETarget()}) {
TargetRegistry::RegisterMCCodeEmitter(*T, createARMBEMCCodeEmitter);
-
- // Register the asm backend.
- TargetRegistry::RegisterMCAsmBackend(getTheARMLETarget(),
- createARMLEAsmBackend);
- TargetRegistry::RegisterMCAsmBackend(getTheARMBETarget(),
- createARMBEAsmBackend);
- TargetRegistry::RegisterMCAsmBackend(getTheThumbLETarget(),
- createThumbLEAsmBackend);
- TargetRegistry::RegisterMCAsmBackend(getTheThumbBETarget(),
- createThumbBEAsmBackend);
+ TargetRegistry::RegisterMCAsmBackend(*T, createARMBEAsmBackend);
+ }
}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
index 0fb97e5fee97..3ee004592ac6 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
@@ -25,6 +25,7 @@ class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
class MCInstPrinter;
+class MCObjectTargetWriter;
class MCObjectWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
@@ -68,52 +69,34 @@ MCCodeEmitter *createARMBEMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
MCContext &Ctx);
-MCAsmBackend *createARMAsmBackend(const Target &T, const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
- const MCTargetOptions &Options,
- bool IsLittleEndian);
-
-MCAsmBackend *createARMLEAsmBackend(const Target &T, const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
+MCAsmBackend *createARMLEAsmBackend(const Target &T, const MCSubtargetInfo &STI,
+ const MCRegisterInfo &MRI,
const MCTargetOptions &Options);
-MCAsmBackend *createARMBEAsmBackend(const Target &T, const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
+MCAsmBackend *createARMBEAsmBackend(const Target &T, const MCSubtargetInfo &STI,
+ const MCRegisterInfo &MRI,
const MCTargetOptions &Options);
-MCAsmBackend *createThumbLEAsmBackend(const Target &T,
- const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
- const MCTargetOptions &Options);
-
-MCAsmBackend *createThumbBEAsmBackend(const Target &T,
- const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
- const MCTargetOptions &Options);
-
// Construct a PE/COFF machine code streamer which will generate a PE/COFF
// object file.
MCStreamer *createARMWinCOFFStreamer(MCContext &Context,
std::unique_ptr<MCAsmBackend> &&MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&Emitter,
bool RelaxAll,
bool IncrementalLinkerCompatible);
/// Construct an ARM ELF object writer.
-std::unique_ptr<MCObjectWriter> createARMELFObjectWriter(raw_pwrite_stream &OS,
- uint8_t OSABI,
- bool IsLittleEndian);
+std::unique_ptr<MCObjectTargetWriter> createARMELFObjectWriter(uint8_t OSABI);
/// Construct an ARM Mach-O object writer.
-std::unique_ptr<MCObjectWriter> createARMMachObjectWriter(raw_pwrite_stream &OS,
- bool Is64Bit,
- uint32_t CPUType,
- uint32_t CPUSubtype);
+std::unique_ptr<MCObjectTargetWriter>
+createARMMachObjectWriter(bool Is64Bit, uint32_t CPUType,
+ uint32_t CPUSubtype);
/// Construct an ARM PE/COFF object writer.
-std::unique_ptr<MCObjectWriter>
-createARMWinCOFFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit);
+std::unique_ptr<MCObjectTargetWriter>
+createARMWinCOFFObjectWriter(bool Is64Bit);
/// Construct ARM Mach-O relocation info.
MCRelocationInfo *createARMMachORelocationInfo(MCContext &Ctx);
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
index 521ae5337e7a..4b4956e914f2 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -484,10 +484,8 @@ void ARMMachObjectWriter::recordRelocation(MachObjectWriter *Writer,
Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
}
-std::unique_ptr<MCObjectWriter>
-llvm::createARMMachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit,
- uint32_t CPUType, uint32_t CPUSubtype) {
- return createMachObjectWriter(
- llvm::make_unique<ARMMachObjectWriter>(Is64Bit, CPUType, CPUSubtype), OS,
- /*IsLittleEndian=*/true);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createARMMachObjectWriter(bool Is64Bit, uint32_t CPUType,
+ uint32_t CPUSubtype) {
+ return llvm::make_unique<ARMMachObjectWriter>(Is64Bit, CPUType, CPUSubtype);
}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
index 5e09b126f43f..8ae713b7b489 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
@@ -91,10 +91,9 @@ bool ARMWinCOFFObjectWriter::recordRelocation(const MCFixup &Fixup) const {
namespace llvm {
-std::unique_ptr<MCObjectWriter>
-createARMWinCOFFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit) {
- auto MOTW = llvm::make_unique<ARMWinCOFFObjectWriter>(Is64Bit);
- return createWinCOFFObjectWriter(std::move(MOTW), OS);
+std::unique_ptr<MCObjectTargetWriter>
+createARMWinCOFFObjectWriter(bool Is64Bit) {
+ return llvm::make_unique<ARMWinCOFFObjectWriter>(Is64Bit);
}
} // end namespace llvm
diff --git a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
index a2424e1abab3..32cb3dcdcad8 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
@@ -10,6 +10,7 @@
#include "ARMMCTargetDesc.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCWinCOFFStreamer.h"
using namespace llvm;
@@ -18,8 +19,9 @@ namespace {
class ARMWinCOFFStreamer : public MCWinCOFFStreamer {
public:
ARMWinCOFFStreamer(MCContext &C, std::unique_ptr<MCAsmBackend> AB,
- std::unique_ptr<MCCodeEmitter> CE, raw_pwrite_stream &OS)
- : MCWinCOFFStreamer(C, std::move(AB), std::move(CE), OS) {}
+ std::unique_ptr<MCCodeEmitter> CE,
+ std::unique_ptr<MCObjectWriter> OW)
+ : MCWinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW)) {}
void EmitAssemblerFlag(MCAssemblerFlag Flag) override;
void EmitThumbFunc(MCSymbol *Symbol) override;
@@ -48,10 +50,11 @@ void ARMWinCOFFStreamer::FinishImpl() {
MCStreamer *llvm::createARMWinCOFFStreamer(
MCContext &Context, std::unique_ptr<MCAsmBackend> &&MAB,
- raw_pwrite_stream &OS, std::unique_ptr<MCCodeEmitter> &&Emitter,
- bool RelaxAll, bool IncrementalLinkerCompatible) {
- auto *S =
- new ARMWinCOFFStreamer(Context, std::move(MAB), std::move(Emitter), OS);
+ std::unique_ptr<MCObjectWriter> &&OW,
+ std::unique_ptr<MCCodeEmitter> &&Emitter, bool RelaxAll,
+ bool IncrementalLinkerCompatible) {
+ auto *S = new ARMWinCOFFStreamer(Context, std::move(MAB), std::move(Emitter),
+ std::move(OW));
S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible);
return S;
}
diff --git a/lib/Target/ARM/MCTargetDesc/CMakeLists.txt b/lib/Target/ARM/MCTargetDesc/CMakeLists.txt
index 9582e8cbef47..cb5742ccc6e3 100644
--- a/lib/Target/ARM/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/ARM/MCTargetDesc/CMakeLists.txt
@@ -1,7 +1,6 @@
add_llvm_library(LLVMARMDesc
ARMAsmBackend.cpp
ARMELFObjectWriter.cpp
- ARMELFObjectWriter.cpp
ARMELFStreamer.cpp
ARMMachObjectWriter.cpp
ARMMachORelocationInfo.cpp
diff --git a/lib/Target/ARM/MLxExpansionPass.cpp b/lib/Target/ARM/MLxExpansionPass.cpp
index 153e7b1e2197..637e4a44c428 100644
--- a/lib/Target/ARM/MLxExpansionPass.cpp
+++ b/lib/Target/ARM/MLxExpansionPass.cpp
@@ -309,17 +309,17 @@ MLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
}
MIB.addImm(Pred).addReg(PredReg);
- DEBUG({
- dbgs() << "Expanding: " << *MI;
- dbgs() << " to:\n";
- MachineBasicBlock::iterator MII = MI;
- MII = std::prev(MII);
- MachineInstr &MI2 = *MII;
- MII = std::prev(MII);
- MachineInstr &MI1 = *MII;
- dbgs() << " " << MI1;
- dbgs() << " " << MI2;
- });
+ LLVM_DEBUG({
+ dbgs() << "Expanding: " << *MI;
+ dbgs() << " to:\n";
+ MachineBasicBlock::iterator MII = MI;
+ MII = std::prev(MII);
+ MachineInstr &MI2 = *MII;
+ MII = std::prev(MII);
+ MachineInstr &MI1 = *MII;
+ dbgs() << " " << MI1;
+ dbgs() << " " << MI2;
+ });
MI->eraseFromParent();
++NumExpand;
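
The DEBUG -> LLVM_DEBUG change above, repeated throughout this import, is the standard llvm/Support/Debug.h pattern. A minimal, self-contained sketch; the helper name and DEBUG_TYPE below are made up for illustration and are not taken from this patch:

    #include "llvm/Support/Debug.h"
    #include "llvm/Support/raw_ostream.h"
    #define DEBUG_TYPE "mlx-expand" // hypothetical debug type, for illustration only

    static void traceExpansion(unsigned NumExpanded) {
      // Printed only in +Asserts builds when -debug or -debug-only=mlx-expand is
      // passed; with NDEBUG, LLVM_DEBUG(...) expands to nothing, so the body must
      // be free of side effects.
      LLVM_DEBUG(llvm::dbgs() << "expanded " << NumExpanded << " MLx instructions\n");
    }
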
diff --git a/lib/Target/ARM/README.txt b/lib/Target/ARM/README.txt
index 549af00fcc99..def67cfae727 100644
--- a/lib/Target/ARM/README.txt
+++ b/lib/Target/ARM/README.txt
@@ -502,7 +502,7 @@ those operations and the ARMv6 scalar versions.
//===---------------------------------------------------------------------===//
Split out LDR (literal) from normal ARM LDR instruction. Also consider splitting
-LDR into imm12 and so_reg forms. This allows us to clean up some code. e.g.
+LDR into imm12 and so_reg forms. This allows us to clean up some code. e.g.
ARMLoadStoreOptimizer does not need to look at LDR (literal) and LDR (so_reg)
while ARMConstantIslandPass only need to worry about LDR (literal).
diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index ba00b3d79da9..a65e22fd86e8 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -611,6 +611,12 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
unsigned TemporaryReg = 0;
BitVector PopFriendly =
TRI.getAllocatableSet(MF, TRI.getRegClass(ARM::tGPRRegClassID));
+ // R7 may be used as a frame pointer and is hence marked as not generally
+ // allocatable; however, there is no reason not to use it as a temporary for
+ // restoring LR.
+ if (STI.useR7AsFramePointer())
+ PopFriendly.set(ARM::R7);
+
assert(PopFriendly.any() && "No allocatable pop-friendly register?!");
// Rebuild the GPRs from the high registers because they are removed
// form the GPR reg class for thumb1.
@@ -622,17 +628,20 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
GPRsNoLRSP.reset(ARM::PC);
findTemporariesForLR(GPRsNoLRSP, PopFriendly, UsedRegs, PopReg, TemporaryReg);
- // If we couldn't find a pop-friendly register, restore LR before popping the
- // other callee-saved registers, so we can use one of them as a temporary.
+ // If we couldn't find a pop-friendly register, try restoring LR before
+ // popping the other callee-saved registers, so we could use one of them as a
+ // temporary.
bool UseLDRSP = false;
if (!PopReg && MBBI != MBB.begin()) {
auto PrevMBBI = MBBI;
PrevMBBI--;
if (PrevMBBI->getOpcode() == ARM::tPOP) {
- MBBI = PrevMBBI;
- UsedRegs.stepBackward(*MBBI);
+ UsedRegs.stepBackward(*PrevMBBI);
findTemporariesForLR(GPRsNoLRSP, PopFriendly, UsedRegs, PopReg, TemporaryReg);
- UseLDRSP = true;
+ if (PopReg) {
+ MBBI = PrevMBBI;
+ UseLDRSP = true;
+ }
}
}
diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp
index 49645834e2de..11aa285fc939 100644
--- a/lib/Target/ARM/Thumb1InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -109,11 +109,11 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
unsigned DestReg, int FI,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
- assert((RC == &ARM::tGPRRegClass ||
+ assert((RC->hasSuperClassEq(&ARM::tGPRRegClass) ||
(TargetRegisterInfo::isPhysicalRegister(DestReg) &&
isARMLowRegister(DestReg))) && "Unknown regclass!");
- if (RC == &ARM::tGPRRegClass ||
+ if (RC->hasSuperClassEq(&ARM::tGPRRegClass) ||
(TargetRegisterInfo::isPhysicalRegister(DestReg) &&
isARMLowRegister(DestReg))) {
DebugLoc DL;
@@ -141,3 +141,16 @@ void Thumb1InstrInfo::expandLoadStackGuard(
else
expandLoadStackGuardBase(MI, ARM::tLDRLIT_ga_abs, ARM::tLDRi);
}
+
+bool Thumb1InstrInfo::canCopyGluedNodeDuringSchedule(SDNode *N) const {
+ // In Thumb1 the scheduler may need to schedule a cross-copy between GPRs and CPSR,
+ // but this is not always possible there, so allow the scheduler to clone tADCS and tSBCS
+ // even if they have glue.
+ // FIXME: Actually implement the cross-copy where it is possible (post v6),
+ // because these copies entail more spilling.
+ unsigned Opcode = N->getMachineOpcode();
+ if (Opcode == ARM::tADCS || Opcode == ARM::tSBCS)
+ return true;
+
+ return false;
+}
diff --git a/lib/Target/ARM/Thumb1InstrInfo.h b/lib/Target/ARM/Thumb1InstrInfo.h
index e8d9a9c4ff14..9f04a3ed262f 100644
--- a/lib/Target/ARM/Thumb1InstrInfo.h
+++ b/lib/Target/ARM/Thumb1InstrInfo.h
@@ -53,6 +53,7 @@ public:
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
+ bool canCopyGluedNodeDuringSchedule(SDNode *N) const override;
private:
void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override;
};
diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp
index 04bdd91b53e6..e0a5f7f04fa9 100644
--- a/lib/Target/ARM/Thumb2ITBlockPass.cpp
+++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -183,7 +183,7 @@ Thumb2ITBlockPass::MoveCopyOutOfITBlock(MachineInstr *MI,
// If not, then there is nothing to be gained by moving the copy.
MachineBasicBlock::iterator I = MI; ++I;
MachineBasicBlock::iterator E = MI->getParent()->end();
- while (I != E && I->isDebugValue())
+ while (I != E && I->isDebugInstr())
++I;
if (I != E) {
unsigned NPredReg = 0;
@@ -237,7 +237,7 @@ bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) {
// block so check the instruction we just put in the block.
for (; MBBI != E && Pos &&
(!MI->isBranch() && !MI->isReturn()) ; ++MBBI) {
- if (MBBI->isDebugValue())
+ if (MBBI->isDebugInstr())
continue;
MachineInstr *NMI = &*MBBI;
diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp
index c5eb14f3e608..d5f0ba9ee485 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -82,7 +82,7 @@ Thumb2InstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
MachineBasicBlock::iterator E = MBB->begin();
unsigned Count = 4; // At most 4 instructions in an IT block.
while (Count && MBBI != E) {
- if (MBBI->isDebugValue()) {
+ if (MBBI->isDebugInstr()) {
--MBBI;
continue;
}
@@ -109,7 +109,7 @@ Thumb2InstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
bool
Thumb2InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI) const {
- while (MBBI->isDebugValue()) {
+ while (MBBI->isDebugInstr()) {
++MBBI;
if (MBBI == MBB.end())
return false;
@@ -489,7 +489,8 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
Offset += MI.getOperand(FrameRegIdx+1).getImm();
unsigned PredReg;
- if (Offset == 0 && getInstrPredicate(MI, PredReg) == ARMCC::AL) {
+ if (Offset == 0 && getInstrPredicate(MI, PredReg) == ARMCC::AL &&
+ !MI.definesRegister(ARM::CPSR)) {
// Turn it into a move.
MI.setDesc(TII.get(ARM::tMOVr));
MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
@@ -600,6 +601,20 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
Offset = -Offset;
isSub = true;
}
+ } else if (AddrMode == ARMII::AddrMode5FP16) {
+ // VFP address mode.
+ const MachineOperand &OffOp = MI.getOperand(FrameRegIdx+1);
+ int InstrOffs = ARM_AM::getAM5FP16Offset(OffOp.getImm());
+ if (ARM_AM::getAM5FP16Op(OffOp.getImm()) == ARM_AM::sub)
+ InstrOffs *= -1;
+ NumBits = 8;
+ Scale = 2;
+ Offset += InstrOffs * 2;
+ assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!");
+ if (Offset < 0) {
+ Offset = -Offset;
+ isSub = true;
+ }
} else if (AddrMode == ARMII::AddrModeT2_i8s4) {
Offset += MI.getOperand(FrameRegIdx + 1).getImm() * 4;
NumBits = 10; // 8 bits scaled by 4
diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp
index 5357e26856ea..abf54ba7e87c 100644
--- a/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -610,7 +610,8 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
// Transfer MI flags.
MIB.setMIFlags(MI->getFlags());
- DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB);
+ LLVM_DEBUG(errs() << "Converted 32-bit: " << *MI
+ << " to 16-bit: " << *MIB);
MBB.erase_instr(MI);
++NumLdSts;
@@ -657,7 +658,8 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
// Transfer MI flags.
MIB.setMIFlags(MI->getFlags());
- DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " <<*MIB);
+ LLVM_DEBUG(errs() << "Converted 32-bit: " << *MI
+ << " to 16-bit: " << *MIB);
MBB.erase_instr(MI);
++NumNarrows;
@@ -826,7 +828,8 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
// Transfer MI flags.
MIB.setMIFlags(MI->getFlags());
- DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB);
+ LLVM_DEBUG(errs() << "Converted 32-bit: " << *MI
+ << " to 16-bit: " << *MIB);
MBB.erase_instr(MI);
++Num2Addrs;
@@ -933,7 +936,8 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
// Transfer MI flags.
MIB.setMIFlags(MI->getFlags());
- DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB);
+ LLVM_DEBUG(errs() << "Converted 32-bit: " << *MI
+ << " to 16-bit: " << *MIB);
MBB.erase_instr(MI);
++NumNarrows;
@@ -1033,7 +1037,7 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
BundleMI = MI;
continue;
}
- if (MI->isDebugValue())
+ if (MI->isDebugInstr())
continue;
LiveCPSR = UpdateCPSRUse(*MI, LiveCPSR);
diff --git a/lib/Target/ARM/ThumbRegisterInfo.cpp b/lib/Target/ARM/ThumbRegisterInfo.cpp
index d190edf5913c..e4bdd40fb743 100644
--- a/lib/Target/ARM/ThumbRegisterInfo.cpp
+++ b/lib/Target/ARM/ThumbRegisterInfo.cpp
@@ -475,7 +475,7 @@ bool ThumbRegisterInfo::saveScavengerRegister(
// before that instead and adjust the UseMI.
bool done = false;
for (MachineBasicBlock::iterator II = I; !done && II != UseMI ; ++II) {
- if (II->isDebugValue())
+ if (II->isDebugInstr())
continue;
// If this instruction affects R12, adjust our restore point.
for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) {
@@ -517,25 +517,13 @@ void ThumbRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
unsigned VReg = 0;
const ARMBaseInstrInfo &TII = *STI.getInstrInfo();
- ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
DebugLoc dl = MI.getDebugLoc();
MachineInstrBuilder MIB(*MBB.getParent(), &MI);
- unsigned FrameReg = ARM::SP;
+ unsigned FrameReg;
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
- int Offset = MF.getFrameInfo().getObjectOffset(FrameIndex) +
- MF.getFrameInfo().getStackSize() + SPAdj;
-
- if (MF.getFrameInfo().hasVarSizedObjects()) {
- assert(SPAdj == 0 && STI.getFrameLowering()->hasFP(MF) && "Unexpected");
- // There are alloca()'s in this function, must reference off the frame
- // pointer or base pointer instead.
- if (!hasBasePointer(MF)) {
- FrameReg = getFrameRegister(MF);
- Offset -= AFI->getFramePtrSpillOffset();
- } else
- FrameReg = BasePtr;
- }
+ const ARMFrameLowering *TFI = getFrameLowering(MF);
+ int Offset = TFI->ResolveFrameIndexReference(MF, FrameIndex, FrameReg, SPAdj);
// PEI::scavengeFrameVirtualRegs() cannot accurately track SPAdj because the
// call frame setup/destroy instructions have already been eliminated. That
@@ -560,7 +548,7 @@ void ThumbRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
}
// Modify MI as necessary to handle as much of 'Offset' as possible
- assert(AFI->isThumbFunction() &&
+ assert(MF.getInfo<ARMFunctionInfo>()->isThumbFunction() &&
"This eliminateFrameIndex only supports Thumb1!");
if (rewriteFrameIndex(MI, FIOperandNum, FrameReg, Offset, TII))
return;
diff --git a/lib/Target/AVR/AVR.h b/lib/Target/AVR/AVR.h
index 2535b63dccdd..48327fd377b2 100644
--- a/lib/Target/AVR/AVR.h
+++ b/lib/Target/AVR/AVR.h
@@ -37,8 +37,10 @@ void initializeAVRRelaxMemPass(PassRegistry&);
/// Contains the AVR backend.
namespace AVR {
+/// An enumeration of all of the supported AVR address spaces.
enum AddressSpace { DataMemory, ProgramMemory };
+/// Checks if a given type is a pointer to program memory.
template <typename T> bool isProgramMemoryAddress(T *V) {
return cast<PointerType>(V->getType())->getAddressSpace() == ProgramMemory;
}
diff --git a/lib/Target/AVR/AVRISelDAGToDAG.cpp b/lib/Target/AVR/AVRISelDAGToDAG.cpp
index 462a7d57d2de..b0b23effc6c6 100644
--- a/lib/Target/AVR/AVRISelDAGToDAG.cpp
+++ b/lib/Target/AVR/AVRISelDAGToDAG.cpp
@@ -519,12 +519,9 @@ bool AVRDAGToDAGISel::selectMultiplication(llvm::SDNode *N) {
}
void AVRDAGToDAGISel::Select(SDNode *N) {
- // Dump information about the Node being selected
- DEBUG(errs() << "Selecting: "; N->dump(CurDAG); errs() << "\n");
-
// If we have a custom node, we already have selected!
if (N->isMachineOpcode()) {
- DEBUG(errs() << "== "; N->dump(CurDAG); errs() << "\n");
+ LLVM_DEBUG(errs() << "== "; N->dump(CurDAG); errs() << "\n");
N->setNodeId(-1);
return;
}
diff --git a/lib/Target/AVR/AVRISelLowering.cpp b/lib/Target/AVR/AVRISelLowering.cpp
index d9e27e91405c..c1515571aae5 100644
--- a/lib/Target/AVR/AVRISelLowering.cpp
+++ b/lib/Target/AVR/AVRISelLowering.cpp
@@ -345,6 +345,9 @@ SDValue AVRTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
case MVT::i64:
LC = IsSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64;
break;
+ case MVT::i128:
+ LC = IsSigned ? RTLIB::SDIVREM_I128 : RTLIB::UDIVREM_I128;
+ break;
}
SDValue InChain = DAG.getEntryNode();
@@ -867,10 +870,12 @@ bool AVRTargetLowering::isOffsetFoldingLegal(
/// For each argument in a function store the number of pieces it is composed
/// of.
-static void parseFunctionArgs(const Function *F, const DataLayout *TD,
+static void parseFunctionArgs(const SmallVectorImpl<ISD::InputArg> &Ins,
SmallVectorImpl<unsigned> &Out) {
- for (Argument const &Arg : F->args()) {
- unsigned Bytes = (TD->getTypeSizeInBits(Arg.getType()) + 7) / 8;
+ for (const ISD::InputArg &Arg : Ins) {
+ if (Arg.PartOffset > 0) continue;
+ unsigned Bytes = (Arg.ArgVT.getSizeInBits() + 7) / 8;
+
Out.push_back((Bytes + 1) / 2);
}
}
@@ -938,7 +943,7 @@ static void analyzeStandardArguments(TargetLowering::CallLoweringInfo *CLI,
parseExternFuncCallArgs(*Outs, Args);
} else {
assert(F != nullptr && "function should not be null");
- parseFunctionArgs(F, TD, Args);
+ parseFunctionArgs(*Ins, Args);
}
unsigned RegsLeft = array_lengthof(RegList8), ValNo = 0;
diff --git a/lib/Target/AVR/AVRInstrInfo.cpp b/lib/Target/AVR/AVRInstrInfo.cpp
index 1a89a13693e1..0c32334167f0 100644
--- a/lib/Target/AVR/AVRInstrInfo.cpp
+++ b/lib/Target/AVR/AVRInstrInfo.cpp
@@ -273,7 +273,7 @@ bool AVRInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
while (I != MBB.begin()) {
--I;
- if (I->isDebugValue()) {
+ if (I->isDebugInstr()) {
continue;
}
@@ -444,7 +444,7 @@ unsigned AVRInstrInfo::removeBranch(MachineBasicBlock &MBB,
while (I != MBB.begin()) {
--I;
- if (I->isDebugValue()) {
+ if (I->isDebugInstr()) {
continue;
}
//:TODO: add here the missing jmp instructions once they are implemented
diff --git a/lib/Target/AVR/AVRInstrInfo.td b/lib/Target/AVR/AVRInstrInfo.td
index 7d1bfc8d85e0..a2129cc0e2e9 100644
--- a/lib/Target/AVR/AVRInstrInfo.td
+++ b/lib/Target/AVR/AVRInstrInfo.td
@@ -1228,9 +1228,23 @@ isReMaterializable = 1 in
[(set i16:$dst, (load addr:$memri))]>,
Requires<[HasSRAM]>;
+ // An identical pseudo instruction to LDDWRdPtrQ, except restricted to the Y
+ // register and without the @earlyclobber flag.
+ //
+ // Used to work around a bug caused by the register allocator not being able
+ // to handle the expansion of a COPY into a machine instruction that has an
+ // earlyclobber flag. This is because the register allocator will try to
+ // expand a copy from a register slot into an earlyclobber instruction.
+ // Instructions that are earlyclobber need to be in a dedicated earlyclobber slot.
+ //
+ // This pseudo instruction can be used before the AVR pseudo expansion pass to
+ // get a frame index load without directly using earlyclobber instructions.
+ //
+ // The pseudo expansion pass trivially expands this into LDDWRdPtrQ.
+ //
+ // This instruction may be removed once PR13375 is fixed.
let mayLoad = 1,
- hasSideEffects = 0,
- Constraints = "@earlyclobber $dst" in
+ hasSideEffects = 0 in
def LDDWRdYQ : Pseudo<(outs DREGS:$dst),
(ins memri:$memri),
"lddw\t$dst, $memri",
diff --git a/lib/Target/AVR/AVRRegisterInfo.h b/lib/Target/AVR/AVRRegisterInfo.h
index f8fefb859682..104b336b9c48 100644
--- a/lib/Target/AVR/AVRRegisterInfo.h
+++ b/lib/Target/AVR/AVRRegisterInfo.h
@@ -51,6 +51,11 @@ public:
/// Splits a 16-bit `DREGS` register into the lo/hi register pair.
/// \param Reg A 16-bit register to split.
void splitReg(unsigned Reg, unsigned &LoReg, unsigned &HiReg) const;
+
+ bool trackLivenessAfterRegAlloc(const MachineFunction &) const override {
+ return true;
+ }
+
};
} // end namespace llvm
diff --git a/lib/Target/AVR/AVRTargetMachine.cpp b/lib/Target/AVR/AVRTargetMachine.cpp
index f9a738b2182c..74300d9a451c 100644
--- a/lib/Target/AVR/AVRTargetMachine.cpp
+++ b/lib/Target/AVR/AVRTargetMachine.cpp
@@ -25,7 +25,7 @@
namespace llvm {
-static const char *AVRDataLayout = "e-p:16:8-i8:8-i16:8-i32:8-i64:8-f32:8-f64:8-n8-a:8";
+static const char *AVRDataLayout = "e-P1-p:16:8-i8:8-i16:8-i32:8-i64:8-f32:8-f64:8-n8-a:8";
/// Processes a CPU name.
static StringRef getCPU(StringRef CPU) {
diff --git a/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
index b527ad3e0b14..d57cc098497f 100644
--- a/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
+++ b/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
@@ -482,7 +482,7 @@ bool AVRAsmParser::tryParseRelocExpression(OperandVector &Operands) {
}
bool AVRAsmParser::parseOperand(OperandVector &Operands) {
- DEBUG(dbgs() << "parseOperand\n");
+ LLVM_DEBUG(dbgs() << "parseOperand\n");
switch (getLexer().getKind()) {
default:
@@ -527,7 +527,7 @@ bool AVRAsmParser::parseOperand(OperandVector &Operands) {
OperandMatchResultTy
AVRAsmParser::parseMemriOperand(OperandVector &Operands) {
- DEBUG(dbgs() << "parseMemriOperand()\n");
+ LLVM_DEBUG(dbgs() << "parseMemriOperand()\n");
SMLoc E, S;
MCExpr const *Expression;
diff --git a/lib/Target/AVR/CMakeLists.txt b/lib/Target/AVR/CMakeLists.txt
index af89a2476c4c..40ce548bff2c 100644
--- a/lib/Target/AVR/CMakeLists.txt
+++ b/lib/Target/AVR/CMakeLists.txt
@@ -1,19 +1,15 @@
set(LLVM_TARGET_DEFINITIONS AVR.td)
tablegen(LLVM AVRGenAsmMatcher.inc -gen-asm-matcher)
-tablegen(LLVM AVRGenRegisterInfo.inc -gen-register-info)
-tablegen(LLVM AVRGenInstrInfo.inc -gen-instr-info)
-tablegen(LLVM AVRGenDisassemblerTables.inc -gen-disassembler)
-tablegen(LLVM AVRGenMCCodeEmitter.inc -gen-emitter)
tablegen(LLVM AVRGenAsmWriter.inc -gen-asm-writer)
-tablegen(LLVM AVRGenAsmMatcher.inc -gen-asm-matcher)
-tablegen(LLVM AVRGenDAGISel.inc -gen-dag-isel)
tablegen(LLVM AVRGenCallingConv.inc -gen-callingconv)
tablegen(LLVM AVRGenDAGISel.inc -gen-dag-isel)
tablegen(LLVM AVRGenDisassemblerTables.inc -gen-disassembler)
tablegen(LLVM AVRGenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM AVRGenMCCodeEmitter.inc -gen-emitter)
tablegen(LLVM AVRGenRegisterInfo.inc -gen-register-info)
tablegen(LLVM AVRGenSubtargetInfo.inc -gen-subtarget)
+
add_public_tablegen_target(AVRCommonTableGen)
add_llvm_target(AVRCodeGen
diff --git a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
index 2d9dd4f8f83f..f81a57dd71e3 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
+++ b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
@@ -352,15 +352,16 @@ void AVRAsmBackend::adjustFixupValue(const MCFixup &Fixup,
}
}
-std::unique_ptr<MCObjectWriter>
-AVRAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
- return createAVRELFObjectWriter(OS,
- MCELFObjectTargetWriter::getOSABI(OSType));
+std::unique_ptr<MCObjectTargetWriter>
+AVRAsmBackend::createObjectTargetWriter() const {
+ return createAVRELFObjectWriter(MCELFObjectTargetWriter::getOSABI(OSType));
}
void AVRAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
- const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsPCRel) const {
+ const MCValue &Target,
+ MutableArrayRef<char> Data, uint64_t Value,
+ bool IsResolved,
+ const MCSubtargetInfo *STI) const {
adjustFixupValue(Fixup, Target, Value, &Asm.getContext());
if (Value == 0)
return; // Doesn't change encoding.
@@ -453,13 +454,13 @@ MCFixupKindInfo const &AVRAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
return Infos[Kind - FirstTargetFixupKind];
}
-bool AVRAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+bool AVRAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
// If the count is not 2-byte aligned, we must be writing data into the text
// section (otherwise we have unaligned instructions, and thus have far
// bigger problems), so just write zeros instead.
assert((Count % 2) == 0 && "NOP instructions must be 2 bytes");
- OW->WriteZeros(Count);
+ OS.write_zeros(Count);
return true;
}
@@ -476,10 +477,10 @@ bool AVRAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
}
}
-MCAsmBackend *createAVRAsmBackend(const Target &T, const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
+MCAsmBackend *createAVRAsmBackend(const Target &T, const MCSubtargetInfo &STI,
+ const MCRegisterInfo &MRI,
const llvm::MCTargetOptions &TO) {
- return new AVRAsmBackend(TT.getOS());
+ return new AVRAsmBackend(STI.getTargetTriple().getOS());
}
} // end of namespace llvm
diff --git a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
index af615df033aa..d48077c3ab8e 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
+++ b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
@@ -31,18 +31,19 @@ struct MCFixupKindInfo;
/// Utilities for manipulating generated AVR machine code.
class AVRAsmBackend : public MCAsmBackend {
public:
-
AVRAsmBackend(Triple::OSType OSType)
- : MCAsmBackend(), OSType(OSType) {}
+ : MCAsmBackend(support::little), OSType(OSType) {}
void adjustFixupValue(const MCFixup &Fixup, const MCValue &Target,
uint64_t &Value, MCContext *Ctx = nullptr) const;
- std::unique_ptr<MCObjectWriter> createObjectWriter(raw_pwrite_stream &OS) const override;
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override;
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsPCRel) const override;
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const override;
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
@@ -50,7 +51,10 @@ public:
return AVR::NumTargetFixupKinds;
}
- bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
+ bool mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const override {
+ return false;
+ }
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
const MCRelaxableFragment *DF,
@@ -62,7 +66,7 @@ public:
void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
MCInst &Res) const override {}
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target) override;
diff --git a/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp b/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
index 412f66fbcf22..4a921a1601a9 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
+++ b/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
@@ -152,10 +152,8 @@ unsigned AVRELFObjectWriter::getRelocType(MCContext &Ctx,
}
}
-std::unique_ptr<MCObjectWriter>
-createAVRELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI) {
- std::unique_ptr<MCELFObjectTargetWriter> MOTW(new AVRELFObjectWriter(OSABI));
- return createELFObjectWriter(std::move(MOTW), OS, true);
+std::unique_ptr<MCObjectTargetWriter> createAVRELFObjectWriter(uint8_t OSABI) {
+ return make_unique<AVRELFObjectWriter>(OSABI);
}
} // end of namespace llvm
diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp b/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp
index c60ea7a92e6f..861acd47347f 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp
+++ b/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp
@@ -17,6 +17,7 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCObjectWriter.h"
using namespace llvm;
@@ -43,9 +44,10 @@ void AVRMCELFStreamer::EmitValueForModiferKind(
namespace llvm {
MCStreamer *createAVRELFStreamer(Triple const &TT, MCContext &Context,
std::unique_ptr<MCAsmBackend> MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> CE) {
- return new AVRMCELFStreamer(Context, std::move(MAB), OS, std::move(CE));
+ return new AVRMCELFStreamer(Context, std::move(MAB), std::move(OW),
+ std::move(CE));
}
} // end namespace llvm
diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h b/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h
index 398b409f4586..12e805fc7d13 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h
+++ b/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h
@@ -16,6 +16,7 @@
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
namespace llvm {
@@ -27,16 +28,18 @@ class AVRMCELFStreamer : public MCELFStreamer {
public:
AVRMCELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter)
- : MCELFStreamer(Context, std::move(TAB), OS, std::move(Emitter)),
+ : MCELFStreamer(Context, std::move(TAB), std::move(OW),
+ std::move(Emitter)),
MCII(createAVRMCInstrInfo()) {}
AVRMCELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter,
MCAssembler *Assembler)
- : MCELFStreamer(Context, std::move(TAB), OS, std::move(Emitter)),
+ : MCELFStreamer(Context, std::move(TAB), std::move(OW),
+ std::move(Emitter)),
MCII(createAVRMCInstrInfo()) {}
void EmitValueForModiferKind(
@@ -46,7 +49,7 @@ public:
MCStreamer *createAVRELFStreamer(Triple const &TT, MCContext &Context,
std::unique_ptr<MCAsmBackend> MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> CE);
} // end namespace llvm
diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp b/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
index dd65a4312077..8c39b5f4039e 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
+++ b/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
@@ -70,11 +70,11 @@ static MCInstPrinter *createAVRMCInstPrinter(const Triple &T,
static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
std::unique_ptr<MCAsmBackend> &&MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&Emitter,
bool RelaxAll) {
- return createELFStreamer(Context, std::move(MAB), OS,
- std::move(Emitter), RelaxAll);
+ return createELFStreamer(Context, std::move(MAB), std::move(OW),
+ std::move(Emitter), RelaxAll);
}
static MCTargetStreamer *
diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h b/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h
index 5615fd72e456..a764f15bd065 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h
+++ b/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h
@@ -24,8 +24,9 @@ class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
-class MCObjectWriter;
+class MCObjectTargetWriter;
class MCRegisterInfo;
+class MCSubtargetInfo;
class MCTargetOptions;
class StringRef;
class Target;
@@ -42,13 +43,12 @@ MCCodeEmitter *createAVRMCCodeEmitter(const MCInstrInfo &MCII,
MCContext &Ctx);
/// Creates an assembly backend for AVR.
-MCAsmBackend *createAVRAsmBackend(const Target &T, const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
+MCAsmBackend *createAVRAsmBackend(const Target &T, const MCSubtargetInfo &STI,
+ const MCRegisterInfo &MRI,
const llvm::MCTargetOptions &TO);
/// Creates an ELF object writer for AVR.
-std::unique_ptr<MCObjectWriter>
-createAVRELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI);
+std::unique_ptr<MCObjectTargetWriter> createAVRELFObjectWriter(uint8_t OSABI);
} // end namespace llvm
diff --git a/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
index deaa11325809..496f2befde58 100644
--- a/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
+++ b/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
@@ -460,7 +460,7 @@ bool BPFAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
} else if (BPFOperand::isValidIdAtStart (Name))
Operands.push_back(BPFOperand::createToken(Name, NameLoc));
else
- return true;
+ return Error(NameLoc, "invalid register/token name");
while (!getLexer().is(AsmToken::EndOfStatement)) {
// Attempt to parse token as operator
@@ -472,8 +472,10 @@ bool BPFAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
continue;
// Attempt to parse token as an immediate
- if (parseImmediate(Operands) != MatchOperand_Success)
- return true;
+ if (parseImmediate(Operands) != MatchOperand_Success) {
+ SMLoc Loc = getLexer().getLoc();
+ return Error(Loc, "unexpected token");
+ }
}
if (getLexer().isNot(AsmToken::EndOfStatement)) {
diff --git a/lib/Target/BPF/BPF.h b/lib/Target/BPF/BPF.h
index 4a0cb20357c8..76d3e1ca5f6f 100644
--- a/lib/Target/BPF/BPF.h
+++ b/lib/Target/BPF/BPF.h
@@ -17,6 +17,11 @@ namespace llvm {
class BPFTargetMachine;
FunctionPass *createBPFISelDag(BPFTargetMachine &TM);
+FunctionPass *createBPFMIPeepholePass();
+FunctionPass *createBPFMIPreEmitPeepholePass();
+
+void initializeBPFMIPeepholePass(PassRegistry&);
+void initializeBPFMIPreEmitPeepholePass(PassRegistry&);
}
#endif
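
createBPFMIPeepholePass and createBPFMIPreEmitPeepholePass are ordinary machine-function pass factories, and the initialize* declarations let the passes be registered with the PassRegistry at target-initialization time. As a rough sketch of how such a pass is typically wired into a target's pass pipeline; the BPFPassConfig hook shown here is an assumption based on common LLVM target convention, not something taken from this patch:

    // Hypothetical wiring; the real hook placement in the BPF backend may differ.
    void BPFPassConfig::addMachineSSAOptimization() {
      TargetPassConfig::addMachineSSAOptimization();
      if (getOptLevel() != CodeGenOpt::None)
        addPass(createBPFMIPeepholePass());
    }
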
diff --git a/lib/Target/BPF/BPF.td b/lib/Target/BPF/BPF.td
index 2d0c22a3a516..877bd15f4f2b 100644
--- a/lib/Target/BPF/BPF.td
+++ b/lib/Target/BPF/BPF.td
@@ -26,6 +26,12 @@ def : Proc<"probe", []>;
def DummyFeature : SubtargetFeature<"dummy", "isDummyMode",
"true", "unused feature">;
+def ALU32 : SubtargetFeature<"alu32", "HasAlu32", "true",
+ "Enable ALU32 instructions">;
+
+def DwarfRIS : SubtargetFeature<"dwarfris", "UseDwarfRIS", "true",
+ "Disable MCAsmInfo DwarfUsesRelocationsAcrossSections">;
+
def BPFInstPrinter : AsmWriter {
string AsmWriterClassName = "InstPrinter";
bit isMCAsmWriter = 1;
diff --git a/lib/Target/BPF/BPFCallingConv.td b/lib/Target/BPF/BPFCallingConv.td
index 8cec6fa54698..637f9752ec42 100644
--- a/lib/Target/BPF/BPFCallingConv.td
+++ b/lib/Target/BPF/BPFCallingConv.td
@@ -26,4 +26,24 @@ def CC_BPF64 : CallingConv<[
CCAssignToStack<8, 8>
]>;
+// Return-value convention when -mattr=+alu32 enabled
+def RetCC_BPF32 : CallingConv<[
+ CCIfType<[i32], CCAssignToRegWithShadow<[W0], [R0]>>,
+ CCIfType<[i64], CCAssignToRegWithShadow<[R0], [W0]>>
+]>;
+
+// Calling convention when -mattr=+alu32 enabled
+def CC_BPF32 : CallingConv<[
+ // i32 arguments get passed in 32-bit sub-registers (W), shadowing the 64-bit R registers
+ CCIfType<[i32], CCAssignToRegWithShadow<[W1, W2, W3, W4, W5],
+ [R1, R2, R3, R4, R5]>>,
+
+ // All arguments get passed in integer registers if there is space.
+ CCIfType<[i64], CCAssignToRegWithShadow<[R1, R2, R3, R4, R5],
+ [W1, W2, W3, W4, W5]>>,
+
+ // Could be assigned to the stack in 8-byte aligned units, but unsupported
+ CCAssignToStack<8, 8>
+]>;
+
def CSR : CalleeSavedRegs<(add R6, R7, R8, R9, R10)>;
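
CC_BPF32 and RetCC_BPF32 are only selected when the alu32 subtarget feature is enabled; without it the existing CC_BPF64/RetCC_BPF64 conventions still apply. As a usage sketch (the input file name is a placeholder):

    llc -march=bpf -mattr=+alu32 -filetype=asm sample.ll

Note on CCAssignToRegWithShadow: assigning an i32 argument to W1 also marks the shadowed R1 as used, so 32-bit and 64-bit arguments consume the same argument slots.
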
diff --git a/lib/Target/BPF/BPFISelDAGToDAG.cpp b/lib/Target/BPF/BPFISelDAGToDAG.cpp
index 61b04d1f2a13..8b9bc08e144f 100644
--- a/lib/Target/BPF/BPFISelDAGToDAG.cpp
+++ b/lib/Target/BPF/BPFISelDAGToDAG.cpp
@@ -39,8 +39,14 @@ using namespace llvm;
namespace {
class BPFDAGToDAGISel : public SelectionDAGISel {
+
+ /// Subtarget - Keep a pointer to the BPFSubtarget around so that we can
+ /// make the right decision when generating code for different subtargets.
+ const BPFSubtarget *Subtarget;
+
public:
- explicit BPFDAGToDAGISel(BPFTargetMachine &TM) : SelectionDAGISel(TM) {
+ explicit BPFDAGToDAGISel(BPFTargetMachine &TM)
+ : SelectionDAGISel(TM), Subtarget(nullptr) {
curr_func_ = nullptr;
}
@@ -48,6 +54,12 @@ public:
return "BPF DAG->DAG Pattern Instruction Selection";
}
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ // Reset the subtarget each time through.
+ Subtarget = &MF.getSubtarget<BPFSubtarget>();
+ return SelectionDAGISel::runOnMachineFunction(MF);
+ }
+
void PreprocessISelDAG() override;
bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintCode,
@@ -65,9 +77,9 @@ private:
bool SelectFIAddr(SDValue Addr, SDValue &Base, SDValue &Offset);
// Node preprocessing cases
- void PreprocessLoad(SDNode *Node, SelectionDAG::allnodes_iterator I);
+ void PreprocessLoad(SDNode *Node, SelectionDAG::allnodes_iterator &I);
void PreprocessCopyToReg(SDNode *Node);
- void PreprocessTrunc(SDNode *Node, SelectionDAG::allnodes_iterator I);
+ void PreprocessTrunc(SDNode *Node, SelectionDAG::allnodes_iterator &I);
// Find constants from a constant structure
typedef std::vector<unsigned char> val_vec_type;
@@ -176,12 +188,9 @@ bool BPFDAGToDAGISel::SelectInlineAsmMemoryOperand(
void BPFDAGToDAGISel::Select(SDNode *Node) {
unsigned Opcode = Node->getOpcode();
- // Dump information about the Node being selected
- DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n');
-
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
- DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
return;
}
@@ -241,7 +250,7 @@ void BPFDAGToDAGISel::Select(SDNode *Node) {
}
void BPFDAGToDAGISel::PreprocessLoad(SDNode *Node,
- SelectionDAG::allnodes_iterator I) {
+ SelectionDAG::allnodes_iterator &I) {
union {
uint8_t c[8];
uint16_t s;
@@ -268,7 +277,7 @@ void BPFDAGToDAGISel::PreprocessLoad(SDNode *Node,
if (OP1N->getOpcode() <= ISD::BUILTIN_OP_END || OP1N->getNumOperands() == 0)
return;
- DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n');
const GlobalAddressSDNode *GADN =
dyn_cast<GlobalAddressSDNode>(OP1N->getOperand(0).getNode());
@@ -278,7 +287,7 @@ void BPFDAGToDAGISel::PreprocessLoad(SDNode *Node,
getConstantFieldValue(GADN, CDN->getZExtValue(), size, new_val.c);
} else if (LDAddrNode->getOpcode() > ISD::BUILTIN_OP_END &&
LDAddrNode->getNumOperands() > 0) {
- DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n');
SDValue OP1 = LDAddrNode->getOperand(0);
if (const GlobalAddressSDNode *GADN =
@@ -301,8 +310,8 @@ void BPFDAGToDAGISel::PreprocessLoad(SDNode *Node,
val = new_val.d;
}
- DEBUG(dbgs() << "Replacing load of size " << size << " with constant " << val
- << '\n');
+ LLVM_DEBUG(dbgs() << "Replacing load of size " << size << " with constant "
+ << val << '\n');
SDValue NVal = CurDAG->getConstant(val, DL, MVT::i64);
// After replacement, the current node is dead, we need to
@@ -418,8 +427,8 @@ bool BPFDAGToDAGISel::fillGenericConstant(const DataLayout &DL,
if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
uint64_t val = CI->getZExtValue();
- DEBUG(dbgs() << "Byte array at offset " << Offset << " with value " << val
- << '\n');
+ LLVM_DEBUG(dbgs() << "Byte array at offset " << Offset << " with value "
+ << val << '\n');
if (Size > 8 || (Size & (Size - 1)))
return false;
@@ -508,17 +517,49 @@ void BPFDAGToDAGISel::PreprocessCopyToReg(SDNode *Node) {
break;
}
- DEBUG(dbgs() << "Find Load Value to VReg "
- << TargetRegisterInfo::virtReg2Index(RegN->getReg()) << '\n');
+ LLVM_DEBUG(dbgs() << "Find Load Value to VReg "
+ << TargetRegisterInfo::virtReg2Index(RegN->getReg())
+ << '\n');
load_to_vreg_[RegN->getReg()] = mem_load_op;
}
void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node,
- SelectionDAG::allnodes_iterator I) {
+ SelectionDAG::allnodes_iterator &I) {
ConstantSDNode *MaskN = dyn_cast<ConstantSDNode>(Node->getOperand(1));
if (!MaskN)
return;
+ // The Reg operand should be a virtual register, which is defined
+ // outside the current basic block. The DAG combiner has done a pretty
+ // good job of removing truncations inside a single basic block, except
+ // when the Reg operand comes from bpf_load_[byte | half | word], for
+ // which the generic optimizer doesn't know that the results are
+ // zero extended.
+ SDValue BaseV = Node->getOperand(0);
+ if (BaseV.getOpcode() == ISD::INTRINSIC_W_CHAIN) {
+ unsigned IntNo = cast<ConstantSDNode>(BaseV->getOperand(1))->getZExtValue();
+ uint64_t MaskV = MaskN->getZExtValue();
+
+ if (!((IntNo == Intrinsic::bpf_load_byte && MaskV == 0xFF) ||
+ (IntNo == Intrinsic::bpf_load_half && MaskV == 0xFFFF) ||
+ (IntNo == Intrinsic::bpf_load_word && MaskV == 0xFFFFFFFF)))
+ return;
+
+ LLVM_DEBUG(dbgs() << "Remove the redundant AND operation in: ";
+ Node->dump(); dbgs() << '\n');
+
+ I--;
+ CurDAG->ReplaceAllUsesWith(SDValue(Node, 0), BaseV);
+ I++;
+ CurDAG->DeleteNode(Node);
+
+ return;
+ }
+
+ // Multiple basic blocks case.
+ if (BaseV.getOpcode() != ISD::CopyFromReg)
+ return;
+
unsigned match_load_op = 0;
switch (MaskN->getZExtValue()) {
default:
@@ -534,19 +575,12 @@ void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node,
break;
}
- // The Reg operand should be a virtual register, which is defined
- // outside the current basic block. DAG combiner has done a pretty
- // good job in removing truncating inside a single basic block.
- SDValue BaseV = Node->getOperand(0);
- if (BaseV.getOpcode() != ISD::CopyFromReg)
- return;
-
const RegisterSDNode *RegN =
dyn_cast<RegisterSDNode>(BaseV.getNode()->getOperand(1));
if (!RegN || !TargetRegisterInfo::isVirtualRegister(RegN->getReg()))
return;
unsigned AndOpReg = RegN->getReg();
- DEBUG(dbgs() << "Examine " << printReg(AndOpReg) << '\n');
+ LLVM_DEBUG(dbgs() << "Examine " << printReg(AndOpReg) << '\n');
// Examine the PHI insns in the MachineBasicBlock to find out the
// definitions of this virtual register. At this stage (DAG2DAG
@@ -576,8 +610,8 @@ void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node,
// %2 = PHI %0, <%bb.1>, %1, <%bb.3>
// Trace each incoming definition, e.g., (%0, %bb.1) and (%1, %bb.3)
// The AND operation can be removed if both %0 in %bb.1 and %1 in
- // %bb.3 are defined with with a load matching the MaskN.
- DEBUG(dbgs() << "Check PHI Insn: "; MII->dump(); dbgs() << '\n');
+ // %bb.3 are defined with a load matching the MaskN.
+ LLVM_DEBUG(dbgs() << "Check PHI Insn: "; MII->dump(); dbgs() << '\n');
unsigned PrevReg = -1;
for (unsigned i = 0; i < MII->getNumOperands(); ++i) {
const MachineOperand &MOP = MII->getOperand(i);
@@ -593,8 +627,8 @@ void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node,
}
}
- DEBUG(dbgs() << "Remove the redundant AND operation in: "; Node->dump();
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "Remove the redundant AND operation in: "; Node->dump();
+ dbgs() << '\n');
I--;
CurDAG->ReplaceAllUsesWith(SDValue(Node, 0), BaseV);
diff --git a/lib/Target/BPF/BPFISelLowering.cpp b/lib/Target/BPF/BPFISelLowering.cpp
index 3ea96e3148f2..9272cf692dc9 100644
--- a/lib/Target/BPF/BPFISelLowering.cpp
+++ b/lib/Target/BPF/BPFISelLowering.cpp
@@ -33,6 +33,10 @@ using namespace llvm;
#define DEBUG_TYPE "bpf-lower"
+static cl::opt<bool> BPFExpandMemcpyInOrder("bpf-expand-memcpy-in-order",
+ cl::Hidden, cl::init(false),
+ cl::desc("Expand memcpy into load/store pairs in order"));
+
static void fail(const SDLoc &DL, SelectionDAG &DAG, const Twine &Msg) {
MachineFunction &MF = DAG.getMachineFunction();
DAG.getContext()->diagnose(
@@ -57,6 +61,8 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
// Set up the register classes.
addRegisterClass(MVT::i64, &BPF::GPRRegClass);
+ if (STI.getHasAlu32())
+ addRegisterClass(MVT::i32, &BPF::GPR32RegClass);
// Compute derived properties from the register classes
computeRegisterProperties(STI.getRegisterInfo());
@@ -67,9 +73,6 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
setOperationAction(ISD::BRIND, MVT::Other, Expand);
setOperationAction(ISD::BRCOND, MVT::Other, Expand);
- setOperationAction(ISD::SETCC, MVT::i64, Expand);
- setOperationAction(ISD::SELECT, MVT::i64, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
@@ -77,32 +80,39 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
- setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
- setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
- setOperationAction(ISD::SREM, MVT::i64, Expand);
- setOperationAction(ISD::UREM, MVT::i64, Expand);
-
- setOperationAction(ISD::MULHU, MVT::i64, Expand);
- setOperationAction(ISD::MULHS, MVT::i64, Expand);
- setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
- setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+ for (auto VT : { MVT::i32, MVT::i64 }) {
+ if (VT == MVT::i32 && !STI.getHasAlu32())
+ continue;
- setOperationAction(ISD::ADDC, MVT::i64, Expand);
- setOperationAction(ISD::ADDE, MVT::i64, Expand);
- setOperationAction(ISD::SUBC, MVT::i64, Expand);
- setOperationAction(ISD::SUBE, MVT::i64, Expand);
+ setOperationAction(ISD::SDIVREM, VT, Expand);
+ setOperationAction(ISD::UDIVREM, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::UREM, VT, Expand);
+ setOperationAction(ISD::MULHU, VT, Expand);
+ setOperationAction(ISD::MULHS, VT, Expand);
+ setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::ROTR, VT, Expand);
+ setOperationAction(ISD::ROTL, VT, Expand);
+ setOperationAction(ISD::SHL_PARTS, VT, Expand);
+ setOperationAction(ISD::SRL_PARTS, VT, Expand);
+ setOperationAction(ISD::SRA_PARTS, VT, Expand);
+ setOperationAction(ISD::CTPOP, VT, Expand);
+
+ setOperationAction(ISD::SETCC, VT, Expand);
+ setOperationAction(ISD::SELECT, VT, Expand);
+ setOperationAction(ISD::SELECT_CC, VT, Custom);
+ }
- setOperationAction(ISD::ROTR, MVT::i64, Expand);
- setOperationAction(ISD::ROTL, MVT::i64, Expand);
- setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
- setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
- setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
+ if (STI.getHasAlu32()) {
+ setOperationAction(ISD::BSWAP, MVT::i32, Promote);
+ setOperationAction(ISD::BR_CC, MVT::i32, Promote);
+ }
setOperationAction(ISD::CTTZ, MVT::i64, Custom);
setOperationAction(ISD::CTLZ, MVT::i64, Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
- setOperationAction(ISD::CTPOP, MVT::i64, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
@@ -126,12 +136,33 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
setMinFunctionAlignment(3);
setPrefFunctionAlignment(3);
- // inline memcpy() for kernel to see explicit copy
- MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 128;
- MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 128;
- MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 128;
+ if (BPFExpandMemcpyInOrder) {
+ // LLVM generic code will try to expand memcpy into load/store pairs at this
+ // stage, which is before quite a few IR optimization passes; as a result the
+ // loads and stores could be moved apart from each other, which will cause
+ // trouble for the memcpy pattern matcher inside kernel eBPF JIT
+ // compilers.
+ //
+ // When -bpf-expand-memcpy-in-order is specified, we want to defer the
+ // expansion of memcpy to a later stage in the IR optimization pipeline so the
+ // load/store pairs won't be touched and can be kept in order. Hence, we set
+ // MaxStoresPerMem* to zero to disable the generic getMemcpyLoadsAndStores
+ // code path, and ask LLVM to use the target expander EmitTargetCodeForMemcpy.
+ MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 0;
+ MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 0;
+ MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 0;
+ } else {
+ // Inline memcpy() so that the kernel can see the explicit copy.
+ unsigned CommonMaxStores =
+ STI.getSelectionDAGInfo()->getCommonMaxStoresPerMemFunc();
+
+ MaxStoresPerMemset = MaxStoresPerMemsetOptSize = CommonMaxStores;
+ MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = CommonMaxStores;
+ MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = CommonMaxStores;
+ }
// CPU/Feature control
+ HasAlu32 = STI.getHasAlu32();
HasJmpExt = STI.getHasJmpExt();
}
@@ -189,26 +220,29 @@ SDValue BPFTargetLowering::LowerFormalArguments(
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
- CCInfo.AnalyzeFormalArguments(Ins, CC_BPF64);
+ CCInfo.AnalyzeFormalArguments(Ins, getHasAlu32() ? CC_BPF32 : CC_BPF64);
for (auto &VA : ArgLocs) {
if (VA.isRegLoc()) {
// Arguments passed in registers
EVT RegVT = VA.getLocVT();
- switch (RegVT.getSimpleVT().SimpleTy) {
+ MVT::SimpleValueType SimpleTy = RegVT.getSimpleVT().SimpleTy;
+ switch (SimpleTy) {
default: {
errs() << "LowerFormalArguments Unhandled argument type: "
<< RegVT.getEVTString() << '\n';
llvm_unreachable(0);
}
+ case MVT::i32:
case MVT::i64:
- unsigned VReg = RegInfo.createVirtualRegister(&BPF::GPRRegClass);
+ unsigned VReg = RegInfo.createVirtualRegister(SimpleTy == MVT::i64 ?
+ &BPF::GPRRegClass :
+ &BPF::GPR32RegClass);
RegInfo.addLiveIn(VA.getLocReg(), VReg);
SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, RegVT);
- // If this is an 8/16/32-bit value, it is really passed promoted to 64
- // bits. Insert an assert[sz]ext to capture this, then truncate to the
- // right size.
+ // If this is a value that has been promoted to a wider type, insert an
+ // assert[sz]ext to capture this, then truncate to the right size.
if (VA.getLocInfo() == CCValAssign::SExt)
ArgValue = DAG.getNode(ISD::AssertSext, DL, RegVT, ArgValue,
DAG.getValueType(VA.getValVT()));
@@ -220,6 +254,8 @@ SDValue BPFTargetLowering::LowerFormalArguments(
ArgValue = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), ArgValue);
InVals.push_back(ArgValue);
+
+ break;
}
} else {
fail(DL, DAG, "defined with too many args");
@@ -264,7 +300,7 @@ SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
- CCInfo.AnalyzeCallOperands(Outs, CC_BPF64);
+ CCInfo.AnalyzeCallOperands(Outs, getHasAlu32() ? CC_BPF32 : CC_BPF64);
unsigned NumBytes = CCInfo.getNextStackOffset();
@@ -388,7 +424,7 @@ BPFTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
}
// Analize return values.
- CCInfo.AnalyzeReturn(Outs, RetCC_BPF64);
+ CCInfo.AnalyzeReturn(Outs, getHasAlu32() ? RetCC_BPF32 : RetCC_BPF64);
SDValue Flag;
SmallVector<SDValue, 4> RetOps(1, Chain);
@@ -432,7 +468,7 @@ SDValue BPFTargetLowering::LowerCallResult(
return DAG.getCopyFromReg(Chain, DL, 1, Ins[0].VT, InFlag).getValue(1);
}
- CCInfo.AnalyzeCallResult(Ins, RetCC_BPF64);
+ CCInfo.AnalyzeCallResult(Ins, getHasAlu32() ? RetCC_BPF32 : RetCC_BPF64);
// Copy all of the result registers out of their specified physreg.
for (auto &Val : RVLocs) {
@@ -485,8 +521,7 @@ SDValue BPFTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
if (!getHasJmpExt())
NegateCC(LHS, RHS, CC);
- SDValue TargetCC = DAG.getConstant(CC, DL, MVT::i64);
-
+ SDValue TargetCC = DAG.getConstant(CC, DL, LHS.getValueType());
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
SDValue Ops[] = {LHS, RHS, TargetCC, TrueV, FalseV};
@@ -507,6 +542,8 @@ const char *BPFTargetLowering::getTargetNodeName(unsigned Opcode) const {
return "BPFISD::BR_CC";
case BPFISD::Wrapper:
return "BPFISD::Wrapper";
+ case BPFISD::MEMCPY:
+ return "BPFISD::MEMCPY";
}
return nullptr;
}
@@ -523,14 +560,90 @@ SDValue BPFTargetLowering::LowerGlobalAddress(SDValue Op,
return DAG.getNode(BPFISD::Wrapper, DL, MVT::i64, GA);
}
+unsigned
+BPFTargetLowering::EmitSubregExt(MachineInstr &MI, MachineBasicBlock *BB,
+ unsigned Reg, bool isSigned) const {
+ const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
+ const TargetRegisterClass *RC = getRegClassFor(MVT::i64);
+ int RShiftOp = isSigned ? BPF::SRA_ri : BPF::SRL_ri;
+ MachineFunction *F = BB->getParent();
+ DebugLoc DL = MI.getDebugLoc();
+
+ MachineRegisterInfo &RegInfo = F->getRegInfo();
+ unsigned PromotedReg0 = RegInfo.createVirtualRegister(RC);
+ unsigned PromotedReg1 = RegInfo.createVirtualRegister(RC);
+ unsigned PromotedReg2 = RegInfo.createVirtualRegister(RC);
+ BuildMI(BB, DL, TII.get(BPF::MOV_32_64), PromotedReg0).addReg(Reg);
+ BuildMI(BB, DL, TII.get(BPF::SLL_ri), PromotedReg1)
+ .addReg(PromotedReg0).addImm(32);
+ BuildMI(BB, DL, TII.get(RShiftOp), PromotedReg2)
+ .addReg(PromotedReg1).addImm(32);
+
+ return PromotedReg2;
+}
+
+MachineBasicBlock *
+BPFTargetLowering::EmitInstrWithCustomInserterMemcpy(MachineInstr &MI,
+ MachineBasicBlock *BB)
+ const {
+ MachineFunction *MF = MI.getParent()->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineInstrBuilder MIB(*MF, MI);
+ unsigned ScratchReg;
+
+ // This function does custom insertion when lowering BPFISD::MEMCPY, which by
+ // memcpy semantics only has two register operands: the copy source address
+ // and the copy destination address.
+ //
+ // Because we will expand BPFISD::MEMCPY into load/store pairs, we need a
+ // third, scratch register to serve as the destination register of the loads
+ // and the source register of the stores.
+ //
+ // The scratch register here carries the Define | Dead | EarlyClobber flags.
+ // The EarlyClobber flag has the semantic property that the operand it is
+ // attached to is clobbered before the rest of the inputs are read. Hence it
+ // must be unique among the operands to the instruction. The Define flag is
+ // needed to convince the machine verifier that an Undef value isn't a problem
+ // as we are loading memory into it anyway. The Dead flag is needed as the
+ // value in scratch isn't supposed to be used by any other instruction.
+ ScratchReg = MRI.createVirtualRegister(&BPF::GPRRegClass);
+ MIB.addReg(ScratchReg,
+ RegState::Define | RegState::Dead | RegState::EarlyClobber);
+
+ return BB;
+}
+
MachineBasicBlock *
BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
- bool isSelectOp = MI.getOpcode() == BPF::Select;
+ unsigned Opc = MI.getOpcode();
+ bool isSelectRROp = (Opc == BPF::Select ||
+ Opc == BPF::Select_64_32 ||
+ Opc == BPF::Select_32 ||
+ Opc == BPF::Select_32_64);
+
+ bool isMemcpyOp = Opc == BPF::MEMCPY;
+
+#ifndef NDEBUG
+ bool isSelectRIOp = (Opc == BPF::Select_Ri ||
+ Opc == BPF::Select_Ri_64_32 ||
+ Opc == BPF::Select_Ri_32 ||
+ Opc == BPF::Select_Ri_32_64);
+
+ assert((isSelectRROp || isSelectRIOp || isMemcpyOp) &&
+ "Unexpected instr type to insert");
+#endif
+
+ if (isMemcpyOp)
+ return EmitInstrWithCustomInserterMemcpy(MI, BB);
- assert((isSelectOp || MI.getOpcode() == BPF::Select_Ri) && "Unexpected instr type to insert");
+ bool is32BitCmp = (Opc == BPF::Select_32 ||
+ Opc == BPF::Select_32_64 ||
+ Opc == BPF::Select_Ri_32 ||
+ Opc == BPF::Select_Ri_32_64);
// To "insert" a SELECT instruction, we actually have to insert the diamond
// control-flow pattern. The incoming instruction knows the destination vreg
@@ -561,56 +674,72 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
BB->addSuccessor(Copy1MBB);
// Insert Branch if Flag
- unsigned LHS = MI.getOperand(1).getReg();
int CC = MI.getOperand(3).getImm();
int NewCC;
switch (CC) {
case ISD::SETGT:
- NewCC = isSelectOp ? BPF::JSGT_rr : BPF::JSGT_ri;
+ NewCC = isSelectRROp ? BPF::JSGT_rr : BPF::JSGT_ri;
break;
case ISD::SETUGT:
- NewCC = isSelectOp ? BPF::JUGT_rr : BPF::JUGT_ri;
+ NewCC = isSelectRROp ? BPF::JUGT_rr : BPF::JUGT_ri;
break;
case ISD::SETGE:
- NewCC = isSelectOp ? BPF::JSGE_rr : BPF::JSGE_ri;
+ NewCC = isSelectRROp ? BPF::JSGE_rr : BPF::JSGE_ri;
break;
case ISD::SETUGE:
- NewCC = isSelectOp ? BPF::JUGE_rr : BPF::JUGE_ri;
+ NewCC = isSelectRROp ? BPF::JUGE_rr : BPF::JUGE_ri;
break;
case ISD::SETEQ:
- NewCC = isSelectOp ? BPF::JEQ_rr : BPF::JEQ_ri;
+ NewCC = isSelectRROp ? BPF::JEQ_rr : BPF::JEQ_ri;
break;
case ISD::SETNE:
- NewCC = isSelectOp ? BPF::JNE_rr : BPF::JNE_ri;
+ NewCC = isSelectRROp ? BPF::JNE_rr : BPF::JNE_ri;
break;
case ISD::SETLT:
- NewCC = isSelectOp ? BPF::JSLT_rr : BPF::JSLT_ri;
+ NewCC = isSelectRROp ? BPF::JSLT_rr : BPF::JSLT_ri;
break;
case ISD::SETULT:
- NewCC = isSelectOp ? BPF::JULT_rr : BPF::JULT_ri;
+ NewCC = isSelectRROp ? BPF::JULT_rr : BPF::JULT_ri;
break;
case ISD::SETLE:
- NewCC = isSelectOp ? BPF::JSLE_rr : BPF::JSLE_ri;
+ NewCC = isSelectRROp ? BPF::JSLE_rr : BPF::JSLE_ri;
break;
case ISD::SETULE:
- NewCC = isSelectOp ? BPF::JULE_rr : BPF::JULE_ri;
+ NewCC = isSelectRROp ? BPF::JULE_rr : BPF::JULE_ri;
break;
default:
report_fatal_error("unimplemented select CondCode " + Twine(CC));
}
- if (isSelectOp)
- BuildMI(BB, DL, TII.get(NewCC))
- .addReg(LHS)
- .addReg(MI.getOperand(2).getReg())
- .addMBB(Copy1MBB);
- else {
+
+ unsigned LHS = MI.getOperand(1).getReg();
+ bool isSignedCmp = (CC == ISD::SETGT ||
+ CC == ISD::SETGE ||
+ CC == ISD::SETLT ||
+ CC == ISD::SETLE);
+
+  // eBPF currently only has 64-bit comparisons, so any 32-bit comparison
+  // operand needs to be promoted. However, if a 32-bit operand is itself a
+  // destination register, it has already been implicitly zero-extended and no
+  // explicit zero-extension sequence is needed for it.
+  //
+  // We simply emit the extension in all cases in this method, and try to
+  // remove the unnecessary ones later in the BPFMIPeephole pass.
+ if (is32BitCmp)
+ LHS = EmitSubregExt(MI, BB, LHS, isSignedCmp);
+
+ if (isSelectRROp) {
+ unsigned RHS = MI.getOperand(2).getReg();
+
+ if (is32BitCmp)
+ RHS = EmitSubregExt(MI, BB, RHS, isSignedCmp);
+
+ BuildMI(BB, DL, TII.get(NewCC)).addReg(LHS).addReg(RHS).addMBB(Copy1MBB);
+ } else {
int64_t imm32 = MI.getOperand(2).getImm();
// sanity check before we build J*_ri instruction.
assert (isInt<32>(imm32));
BuildMI(BB, DL, TII.get(NewCC))
- .addReg(LHS)
- .addImm(imm32)
- .addMBB(Copy1MBB);
+ .addReg(LHS).addImm(imm32).addMBB(Copy1MBB);
}
// Copy0MBB:
@@ -634,3 +763,13 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
+
+EVT BPFTargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
+ EVT VT) const {
+ return getHasAlu32() ? MVT::i32 : MVT::i64;
+}
+
+MVT BPFTargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
+ EVT VT) const {
+ return (getHasAlu32() && VT == MVT::i32) ? MVT::i32 : MVT::i64;
+}
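
As an aside, here is a minimal host-side sketch (not part of the patch) of the shift-based promotion EmitSubregExt emits for 32-bit comparison operands: move the 32-bit value into a 64-bit register, shift left by 32, then shift right by 32, arithmetically for signed compares and logically for unsigned ones.

#include <cassert>
#include <cstdint>

// Models the MOV_32_64 / SLL_ri 32 / SRA_ri-or-SRL_ri 32 sequence on plain
// host integers.
static uint64_t promote32(uint32_t SubReg, bool IsSigned) {
  uint64_t Promoted = SubReg;   // MOV_32_64: upper half starts out zero
  Promoted <<= 32;              // SLL_ri ..., 32
  if (IsSigned)
    return static_cast<uint64_t>(static_cast<int64_t>(Promoted) >> 32); // SRA_ri
  return Promoted >> 32;                                                // SRL_ri
}

int main() {
  assert(promote32(0xfffffffeu, /*IsSigned=*/true)  == 0xfffffffffffffffeULL);
  assert(promote32(0xfffffffeu, /*IsSigned=*/false) == 0x00000000fffffffeULL);
  return 0;
}
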
diff --git a/lib/Target/BPF/BPFISelLowering.h b/lib/Target/BPF/BPFISelLowering.h
index 6ca2594a7e88..0aa8b9ac57ac 100644
--- a/lib/Target/BPF/BPFISelLowering.h
+++ b/lib/Target/BPF/BPFISelLowering.h
@@ -28,7 +28,8 @@ enum NodeType : unsigned {
CALL,
SELECT_CC,
BR_CC,
- Wrapper
+ Wrapper,
+ MEMCPY
};
}
@@ -54,10 +55,17 @@ public:
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const override;
+ bool getHasAlu32() const { return HasAlu32; }
bool getHasJmpExt() const { return HasJmpExt; }
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+ EVT VT) const override;
+
+ MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override;
+
private:
// Control Instruction Selection Features
+ bool HasAlu32;
bool HasJmpExt;
SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
@@ -100,6 +108,14 @@ private:
Type *Ty) const override {
return true;
}
+
+ unsigned EmitSubregExt(MachineInstr &MI, MachineBasicBlock *BB, unsigned Reg,
+ bool isSigned) const;
+
+ MachineBasicBlock * EmitInstrWithCustomInserterMemcpy(MachineInstr &MI,
+ MachineBasicBlock *BB)
+ const;
+
};
}
diff --git a/lib/Target/BPF/BPFInstrInfo.cpp b/lib/Target/BPF/BPFInstrInfo.cpp
index 5351cfa95020..4d47debdaa74 100644
--- a/lib/Target/BPF/BPFInstrInfo.cpp
+++ b/lib/Target/BPF/BPFInstrInfo.cpp
@@ -36,10 +36,92 @@ void BPFInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (BPF::GPRRegClass.contains(DestReg, SrcReg))
BuildMI(MBB, I, DL, get(BPF::MOV_rr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
+ else if (BPF::GPR32RegClass.contains(DestReg, SrcReg))
+ BuildMI(MBB, I, DL, get(BPF::MOV_rr_32), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
else
llvm_unreachable("Impossible reg-to-reg copy");
}
+void BPFInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
+ unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ uint64_t CopyLen = MI->getOperand(2).getImm();
+ uint64_t Alignment = MI->getOperand(3).getImm();
+ unsigned ScratchReg = MI->getOperand(4).getReg();
+ MachineBasicBlock *BB = MI->getParent();
+ DebugLoc dl = MI->getDebugLoc();
+ unsigned LdOpc, StOpc;
+
+ switch (Alignment) {
+ case 1:
+ LdOpc = BPF::LDB;
+ StOpc = BPF::STB;
+ break;
+ case 2:
+ LdOpc = BPF::LDH;
+ StOpc = BPF::STH;
+ break;
+ case 4:
+ LdOpc = BPF::LDW;
+ StOpc = BPF::STW;
+ break;
+ case 8:
+ LdOpc = BPF::LDD;
+ StOpc = BPF::STD;
+ break;
+ default:
+ llvm_unreachable("unsupported memcpy alignment");
+ }
+
+ unsigned IterationNum = CopyLen >> Log2_64(Alignment);
+ for(unsigned I = 0; I < IterationNum; ++I) {
+ BuildMI(*BB, MI, dl, get(LdOpc))
+ .addReg(ScratchReg, RegState::Define).addReg(SrcReg)
+ .addImm(I * Alignment);
+ BuildMI(*BB, MI, dl, get(StOpc))
+ .addReg(ScratchReg, RegState::Kill).addReg(DstReg)
+ .addImm(I * Alignment);
+ }
+
+ unsigned BytesLeft = CopyLen & (Alignment - 1);
+ unsigned Offset = IterationNum * Alignment;
+ bool Hanging4Byte = BytesLeft & 0x4;
+ bool Hanging2Byte = BytesLeft & 0x2;
+ bool Hanging1Byte = BytesLeft & 0x1;
+ if (Hanging4Byte) {
+ BuildMI(*BB, MI, dl, get(BPF::LDW))
+ .addReg(ScratchReg, RegState::Define).addReg(SrcReg).addImm(Offset);
+ BuildMI(*BB, MI, dl, get(BPF::STW))
+ .addReg(ScratchReg, RegState::Kill).addReg(DstReg).addImm(Offset);
+ Offset += 4;
+ }
+ if (Hanging2Byte) {
+ BuildMI(*BB, MI, dl, get(BPF::LDH))
+ .addReg(ScratchReg, RegState::Define).addReg(SrcReg).addImm(Offset);
+ BuildMI(*BB, MI, dl, get(BPF::STH))
+ .addReg(ScratchReg, RegState::Kill).addReg(DstReg).addImm(Offset);
+ Offset += 2;
+ }
+ if (Hanging1Byte) {
+ BuildMI(*BB, MI, dl, get(BPF::LDB))
+ .addReg(ScratchReg, RegState::Define).addReg(SrcReg).addImm(Offset);
+ BuildMI(*BB, MI, dl, get(BPF::STB))
+ .addReg(ScratchReg, RegState::Kill).addReg(DstReg).addImm(Offset);
+ }
+
+ BB->erase(MI);
+}
+
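
For reference, a small self-contained sketch (host C++, not the patch itself) of the copy plan the expansion above computes: the length is split into full copies of the alignment width plus at most one hanging 4-, 2- and 1-byte copy for the remainder.

#include <cstdint>
#include <cstdio>

struct CopyPlan {
  uint64_t FullCopies;               // IterationNum in expandMEMCPY
  bool Hanging4, Hanging2, Hanging1;
};

// Mirrors the arithmetic in expandMEMCPY; Alignment must be 1, 2, 4 or 8.
static CopyPlan planMemcpy(uint64_t CopyLen, uint64_t Alignment) {
  CopyPlan P;
  P.FullCopies = CopyLen / Alignment;             // CopyLen >> Log2_64(Alignment)
  uint64_t BytesLeft = CopyLen & (Alignment - 1);
  P.Hanging4 = BytesLeft & 0x4;
  P.Hanging2 = BytesLeft & 0x2;
  P.Hanging1 = BytesLeft & 0x1;
  return P;
}

int main() {
  CopyPlan P = planMemcpy(15, 8); // one LDD/STD pair, then a 4-, 2- and 1-byte copy
  std::printf("full=%llu 4:%d 2:%d 1:%d\n", (unsigned long long)P.FullCopies,
              P.Hanging4, P.Hanging2, P.Hanging1);
  return 0;
}
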
+bool BPFInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ if (MI.getOpcode() == BPF::MEMCPY) {
+ expandMEMCPY(MI);
+ return true;
+ }
+
+ return false;
+}
+
void BPFInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
unsigned SrcReg, bool IsKill, int FI,
@@ -54,6 +136,11 @@ void BPFInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
.addReg(SrcReg, getKillRegState(IsKill))
.addFrameIndex(FI)
.addImm(0);
+ else if (RC == &BPF::GPR32RegClass)
+ BuildMI(MBB, I, DL, get(BPF::STW32))
+ .addReg(SrcReg, getKillRegState(IsKill))
+ .addFrameIndex(FI)
+ .addImm(0);
else
llvm_unreachable("Can't store this register to stack slot");
}
@@ -69,6 +156,8 @@ void BPFInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
if (RC == &BPF::GPRRegClass)
BuildMI(MBB, I, DL, get(BPF::LDD), DestReg).addFrameIndex(FI).addImm(0);
+ else if (RC == &BPF::GPR32RegClass)
+ BuildMI(MBB, I, DL, get(BPF::LDW32), DestReg).addFrameIndex(FI).addImm(0);
else
llvm_unreachable("Can't load this register from stack slot");
}
@@ -83,7 +172,7 @@ bool BPFInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I = MBB.end();
while (I != MBB.begin()) {
--I;
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
// Working from the bottom, when we see a non-terminator
@@ -158,7 +247,7 @@ unsigned BPFInstrInfo::removeBranch(MachineBasicBlock &MBB,
while (I != MBB.begin()) {
--I;
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
if (I->getOpcode() != BPF::JMP)
break;
diff --git a/lib/Target/BPF/BPFInstrInfo.h b/lib/Target/BPF/BPFInstrInfo.h
index f591f48a89a6..fb65a86a6d18 100644
--- a/lib/Target/BPF/BPFInstrInfo.h
+++ b/lib/Target/BPF/BPFInstrInfo.h
@@ -34,6 +34,8 @@ public:
const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
+
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, unsigned SrcReg,
bool isKill, int FrameIndex,
@@ -55,6 +57,9 @@ public:
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
const DebugLoc &DL,
int *BytesAdded = nullptr) const override;
+private:
+ void expandMEMCPY(MachineBasicBlock::iterator) const;
+
};
}
diff --git a/lib/Target/BPF/BPFInstrInfo.td b/lib/Target/BPF/BPFInstrInfo.td
index 126d55fc28de..aaef5fb706e0 100644
--- a/lib/Target/BPF/BPFInstrInfo.td
+++ b/lib/Target/BPF/BPFInstrInfo.td
@@ -28,6 +28,10 @@ def SDT_BPFBrCC : SDTypeProfile<0, 4, [SDTCisSameAs<0, 1>,
SDTCisVT<3, OtherVT>]>;
def SDT_BPFWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
SDTCisPtrTy<0>]>;
+def SDT_BPFMEMCPY : SDTypeProfile<0, 4, [SDTCisVT<0, i64>,
+ SDTCisVT<1, i64>,
+ SDTCisVT<2, i64>,
+ SDTCisVT<3, i64>]>;
def BPFcall : SDNode<"BPFISD::CALL", SDT_BPFCall,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
@@ -43,8 +47,13 @@ def BPFbrcc : SDNode<"BPFISD::BR_CC", SDT_BPFBrCC,
def BPFselectcc : SDNode<"BPFISD::SELECT_CC", SDT_BPFSelectCC, [SDNPInGlue]>;
def BPFWrapper : SDNode<"BPFISD::Wrapper", SDT_BPFWrapper>;
+def BPFmemcpy : SDNode<"BPFISD::MEMCPY", SDT_BPFMEMCPY,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+ SDNPMayStore, SDNPMayLoad]>;
def BPFIsLittleEndian : Predicate<"CurDAG->getDataLayout().isLittleEndian()">;
def BPFIsBigEndian : Predicate<"!CurDAG->getDataLayout().isLittleEndian()">;
+def BPFHasALU32 : Predicate<"Subtarget->getHasAlu32()">;
+def BPFNoALU32 : Predicate<"!Subtarget->getHasAlu32()">;
def brtarget : Operand<OtherVT> {
let PrintMethod = "printBrTargetOperand";
@@ -57,6 +66,8 @@ def u64imm : Operand<i64> {
def i64immSExt32 : PatLeaf<(i64 imm),
[{return isInt<32>(N->getSExtValue()); }]>;
+def i32immSExt32 : PatLeaf<(i32 imm),
+ [{return isInt<32>(N->getSExtValue()); }]>;
// Addressing modes.
def ADDRri : ComplexPattern<i64, 2, "SelectAddr", [], []>;
@@ -218,7 +229,7 @@ multiclass ALU<BPFArithOp Opc, string OpcodeStr, SDNode OpNode> {
(outs GPR32:$dst),
(ins GPR32:$src2, i32imm:$imm),
"$dst "#OpcodeStr#" $imm",
- [(set GPR32:$dst, (OpNode GPR32:$src2, i32:$imm))]>;
+ [(set GPR32:$dst, (OpNode GPR32:$src2, i32immSExt32:$imm))]>;
}
let Constraints = "$dst = $src2" in {
@@ -292,7 +303,7 @@ def MOV_ri_32 : ALU_RI<BPF_ALU, BPF_MOV,
(outs GPR32:$dst),
(ins i32imm:$imm),
"$dst = $imm",
- [(set GPR32:$dst, (i32 i32:$imm))]>;
+ [(set GPR32:$dst, (i32 i32immSExt32:$imm))]>;
}
def FI_ri
@@ -347,9 +358,11 @@ class STORE<BPFWidthModifer SizeOp, string OpcodeStr, list<dag> Pattern>
class STOREi64<BPFWidthModifer Opc, string OpcodeStr, PatFrag OpNode>
: STORE<Opc, OpcodeStr, [(OpNode i64:$src, ADDRri:$addr)]>;
-def STW : STOREi64<BPF_W, "u32", truncstorei32>;
-def STH : STOREi64<BPF_H, "u16", truncstorei16>;
-def STB : STOREi64<BPF_B, "u8", truncstorei8>;
+let Predicates = [BPFNoALU32] in {
+ def STW : STOREi64<BPF_W, "u32", truncstorei32>;
+ def STH : STOREi64<BPF_H, "u16", truncstorei16>;
+ def STB : STOREi64<BPF_B, "u8", truncstorei8>;
+}
def STD : STOREi64<BPF_DW, "u64", store>;
// LOAD instructions
@@ -371,9 +384,13 @@ class LOAD<BPFWidthModifer SizeOp, string OpcodeStr, list<dag> Pattern>
class LOADi64<BPFWidthModifer SizeOp, string OpcodeStr, PatFrag OpNode>
: LOAD<SizeOp, OpcodeStr, [(set i64:$dst, (OpNode ADDRri:$addr))]>;
-def LDW : LOADi64<BPF_W, "u32", zextloadi32>;
-def LDH : LOADi64<BPF_H, "u16", zextloadi16>;
-def LDB : LOADi64<BPF_B, "u8", zextloadi8>;
+
+let Predicates = [BPFNoALU32] in {
+ def LDW : LOADi64<BPF_W, "u32", zextloadi32>;
+ def LDH : LOADi64<BPF_H, "u16", zextloadi16>;
+ def LDB : LOADi64<BPF_B, "u8", zextloadi8>;
+}
+
def LDD : LOADi64<BPF_DW, "u64", load>;
class BRANCH<BPFJumpOp Opc, string OpcodeStr, list<dag> Pattern>
@@ -456,7 +473,7 @@ let isReturn = 1, isTerminator = 1, hasDelaySlot=0, isBarrier = 1,
}
// ADJCALLSTACKDOWN/UP pseudo insns
-let Defs = [R11], Uses = [R11] in {
+let Defs = [R11], Uses = [R11], isCodeGenOnly = 1 in {
def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
"#ADJCALLSTACKDOWN $amt1 $amt2",
[(BPFcallseq_start timm:$amt1, timm:$amt2)]>;
@@ -465,7 +482,7 @@ def ADJCALLSTACKUP : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
[(BPFcallseq_end timm:$amt1, timm:$amt2)]>;
}
-let usesCustomInserter = 1 in {
+let usesCustomInserter = 1, isCodeGenOnly = 1 in {
def Select : Pseudo<(outs GPR:$dst),
(ins GPR:$lhs, GPR:$rhs, i64imm:$imm, GPR:$src, GPR:$src2),
"# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2",
@@ -476,6 +493,36 @@ let usesCustomInserter = 1 in {
"# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2",
[(set i64:$dst,
(BPFselectcc i64:$lhs, (i64immSExt32:$rhs), (i64 imm:$imm), i64:$src, i64:$src2))]>;
+ def Select_64_32 : Pseudo<(outs GPR32:$dst),
+ (ins GPR:$lhs, GPR:$rhs, i64imm:$imm, GPR32:$src, GPR32:$src2),
+ "# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2",
+ [(set i32:$dst,
+ (BPFselectcc i64:$lhs, i64:$rhs, (i64 imm:$imm), i32:$src, i32:$src2))]>;
+ def Select_Ri_64_32 : Pseudo<(outs GPR32:$dst),
+ (ins GPR:$lhs, i64imm:$rhs, i64imm:$imm, GPR32:$src, GPR32:$src2),
+ "# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2",
+ [(set i32:$dst,
+ (BPFselectcc i64:$lhs, (i64immSExt32:$rhs), (i64 imm:$imm), i32:$src, i32:$src2))]>;
+ def Select_32 : Pseudo<(outs GPR32:$dst),
+ (ins GPR32:$lhs, GPR32:$rhs, i32imm:$imm, GPR32:$src, GPR32:$src2),
+ "# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2",
+ [(set i32:$dst,
+ (BPFselectcc i32:$lhs, i32:$rhs, (i32 imm:$imm), i32:$src, i32:$src2))]>;
+ def Select_Ri_32 : Pseudo<(outs GPR32:$dst),
+ (ins GPR32:$lhs, i32imm:$rhs, i32imm:$imm, GPR32:$src, GPR32:$src2),
+ "# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2",
+ [(set i32:$dst,
+ (BPFselectcc i32:$lhs, (i32immSExt32:$rhs), (i32 imm:$imm), i32:$src, i32:$src2))]>;
+ def Select_32_64 : Pseudo<(outs GPR:$dst),
+ (ins GPR32:$lhs, GPR32:$rhs, i32imm:$imm, GPR:$src, GPR:$src2),
+ "# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2",
+ [(set i64:$dst,
+ (BPFselectcc i32:$lhs, i32:$rhs, (i32 imm:$imm), i64:$src, i64:$src2))]>;
+ def Select_Ri_32_64 : Pseudo<(outs GPR:$dst),
+ (ins GPR32:$lhs, i32imm:$rhs, i32imm:$imm, GPR:$src, GPR:$src2),
+ "# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2",
+ [(set i64:$dst,
+ (BPFselectcc i32:$lhs, (i32immSExt32:$rhs), (i32 imm:$imm), i64:$src, i64:$src2))]>;
}
// load 64-bit global addr into register
@@ -492,9 +539,11 @@ def : Pat<(BPFcall imm:$dst), (JAL imm:$dst)>;
def : Pat<(BPFcall GPR:$dst), (JALX GPR:$dst)>;
// Loads
-def : Pat<(extloadi8 ADDRri:$src), (i64 (LDB ADDRri:$src))>;
-def : Pat<(extloadi16 ADDRri:$src), (i64 (LDH ADDRri:$src))>;
-def : Pat<(extloadi32 ADDRri:$src), (i64 (LDW ADDRri:$src))>;
+let Predicates = [BPFNoALU32] in {
+ def : Pat<(i64 (extloadi8 ADDRri:$src)), (i64 (LDB ADDRri:$src))>;
+ def : Pat<(i64 (extloadi16 ADDRri:$src)), (i64 (LDH ADDRri:$src))>;
+ def : Pat<(i64 (extloadi32 ADDRri:$src)), (i64 (LDW ADDRri:$src))>;
+}
// Atomics
class XADD<BPFWidthModifer SizeOp, string OpcodeStr, PatFrag OpNode>
@@ -581,3 +630,102 @@ def LD_ABS_W : LOAD_ABS<BPF_W, "u32", int_bpf_load_word>;
def LD_IND_B : LOAD_IND<BPF_B, "u8", int_bpf_load_byte>;
def LD_IND_H : LOAD_IND<BPF_H, "u16", int_bpf_load_half>;
def LD_IND_W : LOAD_IND<BPF_W, "u32", int_bpf_load_word>;
+
+let isCodeGenOnly = 1 in {
+ def MOV_32_64 : ALU_RR<BPF_ALU, BPF_MOV,
+ (outs GPR:$dst), (ins GPR32:$src),
+ "$dst = $src", []>;
+}
+
+def : Pat<(i64 (sext GPR32:$src)),
+ (SRA_ri (SLL_ri (MOV_32_64 GPR32:$src), 32), 32)>;
+
+def : Pat<(i64 (zext GPR32:$src)),
+ (SRL_ri (SLL_ri (MOV_32_64 GPR32:$src), 32), 32)>;
+
+// For i64 -> i32 truncation, use the 32-bit subregister directly.
+def : Pat<(i32 (trunc GPR:$src)),
+ (i32 (EXTRACT_SUBREG GPR:$src, sub_32))>;
+
+// For i32 -> i64 anyext, we don't care about the high bits.
+def : Pat<(i64 (anyext GPR32:$src)),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>;
+
+class STORE32<BPFWidthModifer SizeOp, string OpcodeStr, list<dag> Pattern>
+ : TYPE_LD_ST<BPF_MEM.Value, SizeOp.Value,
+ (outs),
+ (ins GPR32:$src, MEMri:$addr),
+ "*("#OpcodeStr#" *)($addr) = $src",
+ Pattern> {
+ bits<4> src;
+ bits<20> addr;
+
+ let Inst{51-48} = addr{19-16}; // base reg
+ let Inst{55-52} = src;
+ let Inst{47-32} = addr{15-0}; // offset
+ let BPFClass = BPF_STX;
+}
+
+class STOREi32<BPFWidthModifer Opc, string OpcodeStr, PatFrag OpNode>
+ : STORE32<Opc, OpcodeStr, [(OpNode i32:$src, ADDRri:$addr)]>;
+
+let Predicates = [BPFHasALU32], DecoderNamespace = "BPFALU32" in {
+ def STW32 : STOREi32<BPF_W, "u32", store>;
+ def STH32 : STOREi32<BPF_H, "u16", truncstorei16>;
+ def STB32 : STOREi32<BPF_B, "u8", truncstorei8>;
+}
+
+class LOAD32<BPFWidthModifer SizeOp, string OpcodeStr, list<dag> Pattern>
+ : TYPE_LD_ST<BPF_MEM.Value, SizeOp.Value,
+ (outs GPR32:$dst),
+ (ins MEMri:$addr),
+ "$dst = *("#OpcodeStr#" *)($addr)",
+ Pattern> {
+ bits<4> dst;
+ bits<20> addr;
+
+ let Inst{51-48} = dst;
+ let Inst{55-52} = addr{19-16};
+ let Inst{47-32} = addr{15-0};
+ let BPFClass = BPF_LDX;
+}
+
+class LOADi32<BPFWidthModifer SizeOp, string OpcodeStr, PatFrag OpNode>
+ : LOAD32<SizeOp, OpcodeStr, [(set i32:$dst, (OpNode ADDRri:$addr))]>;
+
+let Predicates = [BPFHasALU32], DecoderNamespace = "BPFALU32" in {
+ def LDW32 : LOADi32<BPF_W, "u32", load>;
+ def LDH32 : LOADi32<BPF_H, "u16", zextloadi16>;
+ def LDB32 : LOADi32<BPF_B, "u8", zextloadi8>;
+}
+
+let Predicates = [BPFHasALU32] in {
+ def : Pat<(truncstorei8 GPR:$src, ADDRri:$dst),
+ (STB32 (EXTRACT_SUBREG GPR:$src, sub_32), ADDRri:$dst)>;
+ def : Pat<(truncstorei16 GPR:$src, ADDRri:$dst),
+ (STH32 (EXTRACT_SUBREG GPR:$src, sub_32), ADDRri:$dst)>;
+ def : Pat<(truncstorei32 GPR:$src, ADDRri:$dst),
+ (STW32 (EXTRACT_SUBREG GPR:$src, sub_32), ADDRri:$dst)>;
+ def : Pat<(i32 (extloadi8 ADDRri:$src)), (i32 (LDB32 ADDRri:$src))>;
+ def : Pat<(i32 (extloadi16 ADDRri:$src)), (i32 (LDH32 ADDRri:$src))>;
+ def : Pat<(i64 (zextloadi8 ADDRri:$src)),
+ (SUBREG_TO_REG (i64 0), (LDB32 ADDRri:$src), sub_32)>;
+ def : Pat<(i64 (zextloadi16 ADDRri:$src)),
+ (SUBREG_TO_REG (i64 0), (LDH32 ADDRri:$src), sub_32)>;
+ def : Pat<(i64 (zextloadi32 ADDRri:$src)),
+ (SUBREG_TO_REG (i64 0), (LDW32 ADDRri:$src), sub_32)>;
+ def : Pat<(i64 (extloadi8 ADDRri:$src)),
+ (SUBREG_TO_REG (i64 0), (LDB32 ADDRri:$src), sub_32)>;
+ def : Pat<(i64 (extloadi16 ADDRri:$src)),
+ (SUBREG_TO_REG (i64 0), (LDH32 ADDRri:$src), sub_32)>;
+ def : Pat<(i64 (extloadi32 ADDRri:$src)),
+ (SUBREG_TO_REG (i64 0), (LDW32 ADDRri:$src), sub_32)>;
+}
+
+let usesCustomInserter = 1, isCodeGenOnly = 1 in {
+ def MEMCPY : Pseudo<
+ (outs),
+ (ins GPR:$dst, GPR:$src, i64imm:$len, i64imm:$align, variable_ops),
+ "#memcpy dst: $dst, src: $src, len: $len, align: $align",
+ [(BPFmemcpy GPR:$dst, GPR:$src, imm:$len, imm:$align)]>;
+}
diff --git a/lib/Target/BPF/BPFMIPeephole.cpp b/lib/Target/BPF/BPFMIPeephole.cpp
new file mode 100644
index 000000000000..9e984d0facfb
--- /dev/null
+++ b/lib/Target/BPF/BPFMIPeephole.cpp
@@ -0,0 +1,284 @@
+//===-------------- BPFMIPeephole.cpp - MI Peephole Cleanups -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs peephole optimizations to clean up ugly code sequences
+// at the MachineInstr layer.
+//
+// Currently, two optimizations are implemented:
+//  - A pre-RA MachineSSA pass that eliminates type promotion sequences, i.e.
+//    sequences that zero-extend 32-bit subregisters into 64-bit registers,
+//    when the compiler can prove the subregister is defined by 32-bit
+//    operations, in which case the upper half of the underlying 64-bit
+//    register is already implicitly zeroed.
+//
+//  - A post-RA PreEmit pass that does a final cleanup of redundant
+//    instructions generated by suboptimal register allocation on
+//    subregisters.
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFInstrInfo.h"
+#include "BPFTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "bpf-mi-zext-elim"
+
+STATISTIC(ZExtElemNum, "Number of zero extension shifts eliminated");
+
+namespace {
+
+struct BPFMIPeephole : public MachineFunctionPass {
+
+ static char ID;
+ const BPFInstrInfo *TII;
+ MachineFunction *MF;
+ MachineRegisterInfo *MRI;
+
+ BPFMIPeephole() : MachineFunctionPass(ID) {
+ initializeBPFMIPeepholePass(*PassRegistry::getPassRegistry());
+ }
+
+private:
+ // Initialize class variables.
+ void initialize(MachineFunction &MFParm);
+
+ bool isMovFrom32Def(MachineInstr *MovMI);
+ bool eliminateZExtSeq(void);
+
+public:
+
+ // Main entry point for this pass.
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ initialize(MF);
+
+ return eliminateZExtSeq();
+ }
+};
+
+// Initialize class variables.
+void BPFMIPeephole::initialize(MachineFunction &MFParm) {
+ MF = &MFParm;
+ MRI = &MF->getRegInfo();
+ TII = MF->getSubtarget<BPFSubtarget>().getInstrInfo();
+ LLVM_DEBUG(dbgs() << "*** BPF MachineSSA peephole pass ***\n\n");
+}
+
+bool BPFMIPeephole::isMovFrom32Def(MachineInstr *MovMI)
+{
+ MachineInstr *DefInsn = MRI->getVRegDef(MovMI->getOperand(1).getReg());
+
+  if (!DefInsn)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "  Def of Mov Src:");
+  LLVM_DEBUG(DefInsn->dump());
+
+ if (DefInsn->isPHI()) {
+ for (unsigned i = 1, e = DefInsn->getNumOperands(); i < e; i += 2) {
+ MachineOperand &opnd = DefInsn->getOperand(i);
+
+ if (!opnd.isReg())
+ return false;
+
+ MachineInstr *PhiDef = MRI->getVRegDef(opnd.getReg());
+ // quick check on PHI incoming definitions.
+ if (!PhiDef || PhiDef->isPHI() || PhiDef->getOpcode() == BPF::COPY)
+ return false;
+ }
+ }
+
+ if (DefInsn->getOpcode() == BPF::COPY) {
+ MachineOperand &opnd = DefInsn->getOperand(1);
+
+ if (!opnd.isReg())
+ return false;
+
+ unsigned Reg = opnd.getReg();
+ if ((TargetRegisterInfo::isVirtualRegister(Reg) &&
+ MRI->getRegClass(Reg) == &BPF::GPRRegClass))
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << " One ZExt elim sequence identified.\n");
+
+ return true;
+}
+
+bool BPFMIPeephole::eliminateZExtSeq(void) {
+ MachineInstr* ToErase = nullptr;
+ bool Eliminated = false;
+
+ for (MachineBasicBlock &MBB : *MF) {
+ for (MachineInstr &MI : MBB) {
+ // If the previous instruction was marked for elimination, remove it now.
+ if (ToErase) {
+ ToErase->eraseFromParent();
+ ToErase = nullptr;
+ }
+
+ // Eliminate the 32-bit to 64-bit zero extension sequence when possible.
+ //
+ // MOV_32_64 rB, wA
+ // SLL_ri rB, rB, 32
+ // SRL_ri rB, rB, 32
+ if (MI.getOpcode() == BPF::SRL_ri &&
+ MI.getOperand(2).getImm() == 32) {
+ unsigned DstReg = MI.getOperand(0).getReg();
+ unsigned ShfReg = MI.getOperand(1).getReg();
+ MachineInstr *SllMI = MRI->getVRegDef(ShfReg);
+
+ LLVM_DEBUG(dbgs() << "Starting SRL found:");
+ LLVM_DEBUG(MI.dump());
+
+ if (!SllMI ||
+ SllMI->isPHI() ||
+ SllMI->getOpcode() != BPF::SLL_ri ||
+ SllMI->getOperand(2).getImm() != 32)
+ continue;
+
+ LLVM_DEBUG(dbgs() << " SLL found:");
+ LLVM_DEBUG(SllMI->dump());
+
+ MachineInstr *MovMI = MRI->getVRegDef(SllMI->getOperand(1).getReg());
+ if (!MovMI ||
+ MovMI->isPHI() ||
+ MovMI->getOpcode() != BPF::MOV_32_64)
+ continue;
+
+ LLVM_DEBUG(dbgs() << " Type cast Mov found:");
+ LLVM_DEBUG(MovMI->dump());
+
+ unsigned SubReg = MovMI->getOperand(1).getReg();
+ if (!isMovFrom32Def(MovMI)) {
+ LLVM_DEBUG(dbgs()
+ << " One ZExt elim sequence failed qualifying elim.\n");
+ continue;
+ }
+
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(BPF::SUBREG_TO_REG), DstReg)
+ .addImm(0).addReg(SubReg).addImm(BPF::sub_32);
+
+ SllMI->eraseFromParent();
+ MovMI->eraseFromParent();
+        // MI is the right shift; we can't erase it in its own iteration.
+        // Record it in ToErase and erase it in the next iteration.
+ ToErase = &MI;
+ ZExtElemNum++;
+ Eliminated = true;
+ }
+ }
+ }
+
+ return Eliminated;
+}
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(BPFMIPeephole, DEBUG_TYPE,
+ "BPF MachineSSA Peephole Optimization", false, false)
+
+char BPFMIPeephole::ID = 0;
+FunctionPass* llvm::createBPFMIPeepholePass() { return new BPFMIPeephole(); }
+
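
To illustrate why eliminateZExtSeq above is sound (a host-side sketch, not part of the pass): when the 64-bit register is fed by a 32-bit alu32 definition, its upper half is already zero, so the SLL 32 / SRL 32 pair being removed computes the identity.

#include <cassert>
#include <cstdint>

int main() {
  // A value coming from a 32-bit (alu32) definition: upper half implicitly 0.
  uint64_t FromAlu32Def = 0x00000000deadbeefULL;
  // The MOV_32_64 / SLL_ri 32 / SRL_ri 32 zero-extension sequence.
  uint64_t AfterZExtSeq = (FromAlu32Def << 32) >> 32;
  assert(AfterZExtSeq == FromAlu32Def); // redundant, hence eliminated
  return 0;
}
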
+STATISTIC(RedundantMovElemNum, "Number of redundant moves eliminated");
+
+namespace {
+
+struct BPFMIPreEmitPeephole : public MachineFunctionPass {
+
+ static char ID;
+ MachineFunction *MF;
+ const TargetRegisterInfo *TRI;
+
+ BPFMIPreEmitPeephole() : MachineFunctionPass(ID) {
+ initializeBPFMIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
+ }
+
+private:
+ // Initialize class variables.
+ void initialize(MachineFunction &MFParm);
+
+ bool eliminateRedundantMov(void);
+
+public:
+
+ // Main entry point for this pass.
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ initialize(MF);
+
+ return eliminateRedundantMov();
+ }
+};
+
+// Initialize class variables.
+void BPFMIPreEmitPeephole::initialize(MachineFunction &MFParm) {
+ MF = &MFParm;
+ TRI = MF->getSubtarget<BPFSubtarget>().getRegisterInfo();
+ LLVM_DEBUG(dbgs() << "*** BPF PreEmit peephole pass ***\n\n");
+}
+
+bool BPFMIPreEmitPeephole::eliminateRedundantMov(void) {
+ MachineInstr* ToErase = nullptr;
+ bool Eliminated = false;
+
+ for (MachineBasicBlock &MBB : *MF) {
+ for (MachineInstr &MI : MBB) {
+ // If the previous instruction was marked for elimination, remove it now.
+ if (ToErase) {
+ LLVM_DEBUG(dbgs() << " Redundant Mov Eliminated:");
+ LLVM_DEBUG(ToErase->dump());
+ ToErase->eraseFromParent();
+ ToErase = nullptr;
+ }
+
+ // Eliminate identical move:
+ //
+ // MOV rA, rA
+ //
+      // This is particularly likely to happen when sub-register support is
+      // enabled. The special type-cast instruction MOV_32_64 involves
+      // different register classes on the src (i32) and dst (i64), so the
+      // register allocator could generate useless instructions because of
+      // this.
+ if (MI.getOpcode() == BPF::MOV_32_64) {
+ unsigned dst = MI.getOperand(0).getReg();
+ unsigned dst_sub = TRI->getSubReg(dst, BPF::sub_32);
+ unsigned src = MI.getOperand(1).getReg();
+
+ if (dst_sub != src)
+ continue;
+
+ ToErase = &MI;
+ RedundantMovElemNum++;
+ Eliminated = true;
+ }
+ }
+ }
+
+ return Eliminated;
+}
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(BPFMIPreEmitPeephole, "bpf-mi-pemit-peephole",
+ "BPF PreEmit Peephole Optimization", false, false)
+
+char BPFMIPreEmitPeephole::ID = 0;
+FunctionPass* llvm::createBPFMIPreEmitPeepholePass()
+{
+ return new BPFMIPreEmitPeephole();
+}
diff --git a/lib/Target/BPF/BPFRegisterInfo.cpp b/lib/Target/BPF/BPFRegisterInfo.cpp
index 6f7067816098..635c11113151 100644
--- a/lib/Target/BPF/BPFRegisterInfo.cpp
+++ b/lib/Target/BPF/BPFRegisterInfo.cpp
@@ -37,8 +37,8 @@ BPFRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
BitVector BPFRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
- Reserved.set(BPF::R10); // R10 is read only frame pointer
- Reserved.set(BPF::R11); // R11 is pseudo stack pointer
+ markSuperRegs(Reserved, BPF::W10); // [W|R]10 is read only frame pointer
+ markSuperRegs(Reserved, BPF::W11); // [W|R]11 is pseudo stack pointer
return Reserved;
}
diff --git a/lib/Target/BPF/BPFRegisterInfo.h b/lib/Target/BPF/BPFRegisterInfo.h
index 4202850e9eb9..bb0d6bcf5450 100644
--- a/lib/Target/BPF/BPFRegisterInfo.h
+++ b/lib/Target/BPF/BPFRegisterInfo.h
@@ -29,6 +29,8 @@ struct BPFRegisterInfo : public BPFGenRegisterInfo {
BitVector getReservedRegs(const MachineFunction &MF) const override;
+ bool enableMultipleCopyHints() const override { return true; }
+
void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
unsigned FIOperandNum,
RegScavenger *RS = nullptr) const override;
diff --git a/lib/Target/BPF/BPFSelectionDAGInfo.cpp b/lib/Target/BPF/BPFSelectionDAGInfo.cpp
new file mode 100644
index 000000000000..24d5f59bbfd7
--- /dev/null
+++ b/lib/Target/BPF/BPFSelectionDAGInfo.cpp
@@ -0,0 +1,43 @@
+//===-- BPFSelectionDAGInfo.cpp - BPF SelectionDAG Info -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the BPFSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPFTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/DerivedTypes.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "bpf-selectiondag-info"
+
+SDValue BPFSelectionDAGInfo::EmitTargetCodeForMemcpy(
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
+ // Requires the copy size to be a constant.
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+ if (!ConstantSize)
+ return SDValue();
+
+ unsigned CopyLen = ConstantSize->getZExtValue();
+ unsigned StoresNumEstimate = alignTo(CopyLen, Align) >> Log2_32(Align);
+ // Impose the same copy length limit as MaxStoresPerMemcpy.
+ if (StoresNumEstimate > getCommonMaxStoresPerMemFunc())
+ return SDValue();
+
+ SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+
+ Dst = DAG.getNode(BPFISD::MEMCPY, dl, VTs, Chain, Dst, Src,
+ DAG.getConstant(CopyLen, dl, MVT::i64),
+ DAG.getConstant(Align, dl, MVT::i64));
+
+ return Dst.getValue(0);
+}
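
A minimal sketch (host C++, assuming Align is a power of two) of the size gate above: custom lowering is only attempted when the estimated store count stays within the same 128-entry limit exposed as getCommonMaxStoresPerMemFunc().

#include <cassert>
#include <cstdint>

static bool shouldCustomLowerMemcpy(uint64_t CopyLen, uint64_t Align,
                                    uint64_t MaxStores = 128) {
  // Estimate = alignTo(CopyLen, Align) / Align, i.e. the number of load/store
  // pairs the MEMCPY pseudo would expand to.
  uint64_t Estimate = (CopyLen + Align - 1) / Align;
  return Estimate <= MaxStores;
}

int main() {
  assert(shouldCustomLowerMemcpy(1024, 8));  // 128 stores: still accepted
  assert(!shouldCustomLowerMemcpy(1032, 8)); // 129 stores: fall back
  return 0;
}
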
diff --git a/lib/Target/BPF/BPFSelectionDAGInfo.h b/lib/Target/BPF/BPFSelectionDAGInfo.h
new file mode 100644
index 000000000000..19d3c5769573
--- /dev/null
+++ b/lib/Target/BPF/BPFSelectionDAGInfo.h
@@ -0,0 +1,36 @@
+//===-- BPFSelectionDAGInfo.h - BPF SelectionDAG Info -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the BPF subclass for SelectionDAGTargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFSELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_BPF_BPFSELECTIONDAGINFO_H
+
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+
+namespace llvm {
+
+class BPFSelectionDAGInfo : public SelectionDAGTargetInfo {
+public:
+ SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile,
+ bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) const override;
+
+ unsigned getCommonMaxStoresPerMemFunc() const { return 128; }
+
+};
+
+}
+
+#endif
diff --git a/lib/Target/BPF/BPFSubtarget.cpp b/lib/Target/BPF/BPFSubtarget.cpp
index 42ca87f9ef67..56780bd9d46f 100644
--- a/lib/Target/BPF/BPFSubtarget.cpp
+++ b/lib/Target/BPF/BPFSubtarget.cpp
@@ -30,11 +30,14 @@ BPFSubtarget &BPFSubtarget::initializeSubtargetDependencies(StringRef CPU,
StringRef FS) {
initializeEnvironment();
initSubtargetFeatures(CPU, FS);
+ ParseSubtargetFeatures(CPU, FS);
return *this;
}
void BPFSubtarget::initializeEnvironment() {
HasJmpExt = false;
+ HasAlu32 = false;
+ UseDwarfRIS = false;
}
void BPFSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
diff --git a/lib/Target/BPF/BPFSubtarget.h b/lib/Target/BPF/BPFSubtarget.h
index fa1f24443bc3..60e56435fe4c 100644
--- a/lib/Target/BPF/BPFSubtarget.h
+++ b/lib/Target/BPF/BPFSubtarget.h
@@ -17,6 +17,7 @@
#include "BPFFrameLowering.h"
#include "BPFISelLowering.h"
#include "BPFInstrInfo.h"
+#include "BPFSelectionDAGInfo.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DataLayout.h"
@@ -33,7 +34,7 @@ class BPFSubtarget : public BPFGenSubtargetInfo {
BPFInstrInfo InstrInfo;
BPFFrameLowering FrameLowering;
BPFTargetLowering TLInfo;
- SelectionDAGTargetInfo TSInfo;
+ BPFSelectionDAGInfo TSInfo;
private:
void initializeEnvironment();
@@ -47,6 +48,12 @@ protected:
// whether the cpu supports jmp ext
bool HasJmpExt;
+ // whether the cpu supports alu32 instructions.
+ bool HasAlu32;
+
+ // whether we should enable MCAsmInfo DwarfUsesRelocationsAcrossSections
+ bool UseDwarfRIS;
+
public:
// This constructor initializes the data members to match that
// of the specified triple.
@@ -59,6 +66,8 @@ public:
// subtarget options. Definition of function is auto generated by tblgen.
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
bool getHasJmpExt() const { return HasJmpExt; }
+ bool getHasAlu32() const { return HasAlu32; }
+ bool getUseDwarfRIS() const { return UseDwarfRIS; }
const BPFInstrInfo *getInstrInfo() const override { return &InstrInfo; }
const BPFFrameLowering *getFrameLowering() const override {
@@ -67,7 +76,7 @@ public:
const BPFTargetLowering *getTargetLowering() const override {
return &TLInfo;
}
- const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+ const BPFSelectionDAGInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
const TargetRegisterInfo *getRegisterInfo() const override {
diff --git a/lib/Target/BPF/BPFTargetMachine.cpp b/lib/Target/BPF/BPFTargetMachine.cpp
index 60672fa2684b..84d89bff74fe 100644
--- a/lib/Target/BPF/BPFTargetMachine.cpp
+++ b/lib/Target/BPF/BPFTargetMachine.cpp
@@ -13,6 +13,7 @@
#include "BPFTargetMachine.h"
#include "BPF.h"
+#include "MCTargetDesc/BPFMCAsmInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -22,11 +23,18 @@
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
+static cl::opt<bool>
+    DisableMIPeephole("disable-bpf-peephole", cl::Hidden,
+                      cl::desc("Disable machine peepholes for BPF"));
+
extern "C" void LLVMInitializeBPFTarget() {
// Register the target.
RegisterTargetMachine<BPFTargetMachine> X(getTheBPFleTarget());
RegisterTargetMachine<BPFTargetMachine> Y(getTheBPFbeTarget());
RegisterTargetMachine<BPFTargetMachine> Z(getTheBPFTarget());
+
+ PassRegistry &PR = *PassRegistry::getPassRegistry();
+ initializeBPFMIPeepholePass(PR);
}
// DataLayout: little or big endian
@@ -61,6 +69,9 @@ BPFTargetMachine::BPFTargetMachine(const Target &T, const Triple &TT,
TLOF(make_unique<TargetLoweringObjectFileELF>()),
Subtarget(TT, CPU, FS, *this) {
initAsmInfo();
+
+ BPFMCAsmInfo *MAI = static_cast<BPFMCAsmInfo *>(const_cast<MCAsmInfo *>(AsmInfo));
+ MAI->setDwarfUsesRelocationsAcrossSections(!Subtarget.getUseDwarfRIS());
}
namespace {
// BPF Code Generator Pass Configuration Options.
@@ -74,6 +85,8 @@ public:
}
bool addInstSelector() override;
+ void addMachineSSAOptimization() override;
+ void addPreEmitPass() override;
};
}
@@ -88,3 +101,21 @@ bool BPFPassConfig::addInstSelector() {
return false;
}
+
+void BPFPassConfig::addMachineSSAOptimization() {
+  // The default implementation must be called first, as we want the eBPF
+  // peephole pass to run last.
+ TargetPassConfig::addMachineSSAOptimization();
+
+ const BPFSubtarget *Subtarget = getBPFTargetMachine().getSubtargetImpl();
+ if (Subtarget->getHasAlu32() && !DisableMIPeephole)
+ addPass(createBPFMIPeepholePass());
+}
+
+void BPFPassConfig::addPreEmitPass() {
+ const BPFSubtarget *Subtarget = getBPFTargetMachine().getSubtargetImpl();
+
+ if (getOptLevel() != CodeGenOpt::None)
+ if (Subtarget->getHasAlu32() && !DisableMIPeephole)
+ addPass(createBPFMIPreEmitPeepholePass());
+}
diff --git a/lib/Target/BPF/CMakeLists.txt b/lib/Target/BPF/CMakeLists.txt
index 1e4b685a6092..ee01b4b7b805 100644
--- a/lib/Target/BPF/CMakeLists.txt
+++ b/lib/Target/BPF/CMakeLists.txt
@@ -1,14 +1,15 @@
set(LLVM_TARGET_DEFINITIONS BPF.td)
-tablegen(LLVM BPFGenRegisterInfo.inc -gen-register-info)
-tablegen(LLVM BPFGenInstrInfo.inc -gen-instr-info)
-tablegen(LLVM BPFGenDisassemblerTables.inc -gen-disassembler)
-tablegen(LLVM BPFGenAsmWriter.inc -gen-asm-writer)
tablegen(LLVM BPFGenAsmMatcher.inc -gen-asm-matcher)
+tablegen(LLVM BPFGenAsmWriter.inc -gen-asm-writer)
+tablegen(LLVM BPFGenCallingConv.inc -gen-callingconv)
tablegen(LLVM BPFGenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM BPFGenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM BPFGenInstrInfo.inc -gen-instr-info)
tablegen(LLVM BPFGenMCCodeEmitter.inc -gen-emitter)
-tablegen(LLVM BPFGenCallingConv.inc -gen-callingconv)
+tablegen(LLVM BPFGenRegisterInfo.inc -gen-register-info)
tablegen(LLVM BPFGenSubtargetInfo.inc -gen-subtarget)
+
add_public_tablegen_target(BPFCommonTableGen)
add_llvm_target(BPFCodeGen
@@ -19,12 +20,14 @@ add_llvm_target(BPFCodeGen
BPFISelLowering.cpp
BPFMCInstLower.cpp
BPFRegisterInfo.cpp
+ BPFSelectionDAGInfo.cpp
BPFSubtarget.cpp
BPFTargetMachine.cpp
+ BPFMIPeephole.cpp
)
add_subdirectory(AsmParser)
add_subdirectory(Disassembler)
add_subdirectory(InstPrinter)
-add_subdirectory(TargetInfo)
add_subdirectory(MCTargetDesc)
+add_subdirectory(TargetInfo)
diff --git a/lib/Target/BPF/Disassembler/BPFDisassembler.cpp b/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
index 6fc87d79c439..e7790ddb3d7e 100644
--- a/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
+++ b/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
@@ -35,6 +35,34 @@ namespace {
/// A disassembler class for BPF.
class BPFDisassembler : public MCDisassembler {
public:
+ enum BPF_CLASS {
+ BPF_LD = 0x0,
+ BPF_LDX = 0x1,
+ BPF_ST = 0x2,
+ BPF_STX = 0x3,
+ BPF_ALU = 0x4,
+ BPF_JMP = 0x5,
+ BPF_RES = 0x6,
+ BPF_ALU64 = 0x7
+ };
+
+ enum BPF_SIZE {
+ BPF_W = 0x0,
+ BPF_H = 0x1,
+ BPF_B = 0x2,
+ BPF_DW = 0x3
+ };
+
+ enum BPF_MODE {
+ BPF_IMM = 0x0,
+ BPF_ABS = 0x1,
+ BPF_IND = 0x2,
+ BPF_MEM = 0x3,
+ BPF_LEN = 0x4,
+ BPF_MSH = 0x5,
+ BPF_XADD = 0x6
+ };
+
BPFDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
: MCDisassembler(STI, Ctx) {}
~BPFDisassembler() override = default;
@@ -43,6 +71,10 @@ public:
ArrayRef<uint8_t> Bytes, uint64_t Address,
raw_ostream &VStream,
raw_ostream &CStream) const override;
+
+ uint8_t getInstClass(uint64_t Inst) const { return (Inst >> 56) & 0x7; };
+ uint8_t getInstSize(uint64_t Inst) const { return (Inst >> 59) & 0x3; };
+ uint8_t getInstMode(uint64_t Inst) const { return (Inst >> 61) & 0x7; };
};
} // end anonymous namespace
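
A standalone sketch (host C++, assuming the instruction word is normalized the way readInstruction64 produces it, with the opcode in the top byte) of the field extraction done by the new getInstClass/getInstSize/getInstMode helpers:

#include <cassert>
#include <cstdint>

int main() {
  // eBPF opcode 0x61 encodes BPF_LDX | BPF_MEM | BPF_W; with the opcode in
  // the top byte of the 64-bit instruction word, the helpers reduce to:
  uint64_t Insn = 0x6100000000000000ULL;
  uint8_t Class = (Insn >> 56) & 0x7; // getInstClass -> BPF_LDX (0x1)
  uint8_t Size  = (Insn >> 59) & 0x3; // getInstSize  -> BPF_W   (0x0)
  uint8_t Mode  = (Insn >> 61) & 0x7; // getInstMode  -> BPF_MEM (0x3)
  assert(Class == 0x1 && Size == 0x0 && Mode == 0x3);
  return 0;
}
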
@@ -141,8 +173,17 @@ DecodeStatus BPFDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
Result = readInstruction64(Bytes, Address, Size, Insn, IsLittleEndian);
if (Result == MCDisassembler::Fail) return MCDisassembler::Fail;
- Result = decodeInstruction(DecoderTableBPF64, Instr, Insn,
- Address, this, STI);
+ uint8_t InstClass = getInstClass(Insn);
+ if ((InstClass == BPF_LDX || InstClass == BPF_STX) &&
+ getInstSize(Insn) != BPF_DW &&
+ getInstMode(Insn) == BPF_MEM &&
+ STI.getFeatureBits()[BPF::ALU32])
+ Result = decodeInstruction(DecoderTableBPFALU3264, Instr, Insn, Address,
+ this, STI);
+ else
+ Result = decodeInstruction(DecoderTableBPF64, Instr, Insn, Address, this,
+ STI);
+
if (Result == MCDisassembler::Fail) return MCDisassembler::Fail;
switch (Instr.getOpcode()) {
diff --git a/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp b/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp
index 1f4ef098403d..20627da38817 100644
--- a/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp
+++ b/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp
@@ -12,7 +12,6 @@
//===----------------------------------------------------------------------===//
#include "BPFInstPrinter.h"
-#include "BPF.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
diff --git a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
index e6ea92e08364..6c255e9ef780 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
@@ -12,6 +12,7 @@
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/Support/EndianStream.h"
#include <cassert>
#include <cstdint>
@@ -21,18 +22,16 @@ namespace {
class BPFAsmBackend : public MCAsmBackend {
public:
- bool IsLittleEndian;
-
- BPFAsmBackend(bool IsLittleEndian)
- : MCAsmBackend(), IsLittleEndian(IsLittleEndian) {}
+ BPFAsmBackend(support::endianness Endian) : MCAsmBackend(Endian) {}
~BPFAsmBackend() override = default;
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsResolved) const override;
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const override;
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override;
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override;
// No instruction requires relaxation
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
@@ -43,22 +42,25 @@ public:
unsigned getNumFixupKinds() const override { return 1; }
- bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
+ bool mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const override {
+ return false;
+ }
void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
MCInst &Res) const override {}
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
};
} // end anonymous namespace
-bool BPFAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+bool BPFAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
if ((Count % 8) != 0)
return false;
for (uint64_t i = 0; i < Count; i += 8)
- OW->write64(0x15000000);
+ support::endian::write<uint64_t>(OS, 0x15000000, Endian);
return true;
}
@@ -66,19 +68,17 @@ bool BPFAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
void BPFAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target,
MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) const {
+ bool IsResolved,
+ const MCSubtargetInfo *STI) const {
if (Fixup.getKind() == FK_SecRel_4 || Fixup.getKind() == FK_SecRel_8) {
assert(Value == 0);
- } else if (Fixup.getKind() == FK_Data_4 || Fixup.getKind() == FK_Data_8) {
- unsigned Size = Fixup.getKind() == FK_Data_4 ? 4 : 8;
-
- for (unsigned i = 0; i != Size; ++i) {
- unsigned Idx = IsLittleEndian ? i : Size - i - 1;
- Data[Fixup.getOffset() + Idx] = uint8_t(Value >> (i * 8));
- }
+ } else if (Fixup.getKind() == FK_Data_4) {
+ support::endian::write<uint32_t>(&Data[Fixup.getOffset()], Value, Endian);
+ } else if (Fixup.getKind() == FK_Data_8) {
+ support::endian::write<uint64_t>(&Data[Fixup.getOffset()], Value, Endian);
} else if (Fixup.getKind() == FK_PCRel_4) {
Value = (uint32_t)((Value - 8) / 8);
- if (IsLittleEndian) {
+ if (Endian == support::little) {
Data[Fixup.getOffset() + 1] = 0x10;
support::endian::write32le(&Data[Fixup.getOffset() + 4], Value);
} else {
@@ -88,31 +88,26 @@ void BPFAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
} else {
assert(Fixup.getKind() == FK_PCRel_2);
Value = (uint16_t)((Value - 8) / 8);
- if (IsLittleEndian) {
- Data[Fixup.getOffset() + 2] = Value & 0xFF;
- Data[Fixup.getOffset() + 3] = Value >> 8;
- } else {
- Data[Fixup.getOffset() + 2] = Value >> 8;
- Data[Fixup.getOffset() + 3] = Value & 0xFF;
- }
+ support::endian::write<uint16_t>(&Data[Fixup.getOffset() + 2], Value,
+ Endian);
}
}
-std::unique_ptr<MCObjectWriter>
-BPFAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
- return createBPFELFObjectWriter(OS, 0, IsLittleEndian);
+std::unique_ptr<MCObjectTargetWriter>
+BPFAsmBackend::createObjectTargetWriter() const {
+ return createBPFELFObjectWriter(0);
}
MCAsmBackend *llvm::createBPFAsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
- const MCTargetOptions&) {
- return new BPFAsmBackend(/*IsLittleEndian=*/true);
+ const MCTargetOptions &) {
+ return new BPFAsmBackend(support::little);
}
MCAsmBackend *llvm::createBPFbeAsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
- const MCTargetOptions&) {
- return new BPFAsmBackend(/*IsLittleEndian=*/false);
+ const MCTargetOptions &) {
+ return new BPFAsmBackend(support::big);
}
diff --git a/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp b/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
index f7de612dab15..134e890dfe49 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
@@ -54,9 +54,7 @@ unsigned BPFELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
}
}
-std::unique_ptr<MCObjectWriter>
-llvm::createBPFELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI,
- bool IsLittleEndian) {
- return createELFObjectWriter(llvm::make_unique<BPFELFObjectWriter>(OSABI), OS,
- IsLittleEndian);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createBPFELFObjectWriter(uint8_t OSABI) {
+ return llvm::make_unique<BPFELFObjectWriter>(OSABI);
}
diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h b/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
index fd7c97bf1f0a..171f7f607ff4 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
+++ b/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
@@ -44,6 +44,10 @@ public:
// line numbers, etc.
CodePointerSize = 8;
}
+
+ void setDwarfUsesRelocationsAcrossSections(bool enable) {
+ DwarfUsesRelocationsAcrossSections = enable;
+ }
};
}
diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp b/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
index b4ecfdee7bff..437f658caf6e 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
@@ -122,44 +122,35 @@ void BPFMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
computeAvailableFeatures(STI.getFeatureBits()));
unsigned Opcode = MI.getOpcode();
- support::endian::Writer<support::little> LE(OS);
- support::endian::Writer<support::big> BE(OS);
+ support::endian::Writer OSE(OS,
+ IsLittleEndian ? support::little : support::big);
if (Opcode == BPF::LD_imm64 || Opcode == BPF::LD_pseudo) {
uint64_t Value = getBinaryCodeForInstr(MI, Fixups, STI);
- LE.write<uint8_t>(Value >> 56);
+ OS << char(Value >> 56);
if (IsLittleEndian)
- LE.write<uint8_t>((Value >> 48) & 0xff);
+ OS << char((Value >> 48) & 0xff);
else
- LE.write<uint8_t>(SwapBits((Value >> 48) & 0xff));
- LE.write<uint16_t>(0);
- if (IsLittleEndian)
- LE.write<uint32_t>(Value & 0xffffFFFF);
- else
- BE.write<uint32_t>(Value & 0xffffFFFF);
+ OS << char(SwapBits((Value >> 48) & 0xff));
+ OSE.write<uint16_t>(0);
+ OSE.write<uint32_t>(Value & 0xffffFFFF);
const MCOperand &MO = MI.getOperand(1);
uint64_t Imm = MO.isImm() ? MO.getImm() : 0;
- LE.write<uint8_t>(0);
- LE.write<uint8_t>(0);
- LE.write<uint16_t>(0);
- if (IsLittleEndian)
- LE.write<uint32_t>(Imm >> 32);
- else
- BE.write<uint32_t>(Imm >> 32);
+ OSE.write<uint8_t>(0);
+ OSE.write<uint8_t>(0);
+ OSE.write<uint16_t>(0);
+ OSE.write<uint32_t>(Imm >> 32);
} else {
// Get instruction encoding and emit it
uint64_t Value = getBinaryCodeForInstr(MI, Fixups, STI);
- LE.write<uint8_t>(Value >> 56);
- if (IsLittleEndian) {
- LE.write<uint8_t>((Value >> 48) & 0xff);
- LE.write<uint16_t>((Value >> 32) & 0xffff);
- LE.write<uint32_t>(Value & 0xffffFFFF);
- } else {
- LE.write<uint8_t>(SwapBits((Value >> 48) & 0xff));
- BE.write<uint16_t>((Value >> 32) & 0xffff);
- BE.write<uint32_t>(Value & 0xffffFFFF);
- }
+ OS << char(Value >> 56);
+ if (IsLittleEndian)
+ OS << char((Value >> 48) & 0xff);
+ else
+ OS << char(SwapBits((Value >> 48) & 0xff));
+ OSE.write<uint16_t>((Value >> 32) & 0xffff);
+ OSE.write<uint32_t>(Value & 0xffffFFFF);
}
}
diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
index cbf1ea7d7fb8..834b57527882 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
@@ -52,10 +52,10 @@ static MCSubtargetInfo *createBPFMCSubtargetInfo(const Triple &TT,
static MCStreamer *createBPFMCStreamer(const Triple &T, MCContext &Ctx,
std::unique_ptr<MCAsmBackend> &&MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&Emitter,
bool RelaxAll) {
- return createELFStreamer(Ctx, std::move(MAB), OS, std::move(Emitter),
+ return createELFStreamer(Ctx, std::move(MAB), std::move(OW), std::move(Emitter),
RelaxAll);
}
diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h
index 6466042f6929..6d2f0a1601e6 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h
+++ b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h
@@ -24,7 +24,7 @@ class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
-class MCObjectWriter;
+class MCObjectTargetWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
class MCTargetOptions;
@@ -45,16 +45,14 @@ MCCodeEmitter *createBPFbeMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
MCContext &Ctx);
-MCAsmBackend *createBPFAsmBackend(const Target &T, const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
+MCAsmBackend *createBPFAsmBackend(const Target &T, const MCSubtargetInfo &STI,
+ const MCRegisterInfo &MRI,
const MCTargetOptions &Options);
-MCAsmBackend *createBPFbeAsmBackend(const Target &T, const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
+MCAsmBackend *createBPFbeAsmBackend(const Target &T, const MCSubtargetInfo &STI,
+ const MCRegisterInfo &MRI,
const MCTargetOptions &Options);
-std::unique_ptr<MCObjectWriter> createBPFELFObjectWriter(raw_pwrite_stream &OS,
- uint8_t OSABI,
- bool IsLittleEndian);
+std::unique_ptr<MCObjectTargetWriter> createBPFELFObjectWriter(uint8_t OSABI);
}
// Defines symbolic names for BPF registers. This defines a mapping from
diff --git a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
index 387296c69c39..92bda224f3dc 100644
--- a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
+++ b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
@@ -118,7 +118,6 @@ class HexagonAsmParser : public MCTargetAsmParser {
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
bool ParseDirectiveSubsection(SMLoc L);
- bool ParseDirectiveValue(unsigned Size, SMLoc L);
bool ParseDirectiveComm(bool IsLocal, SMLoc L);
bool RegisterMatchesArch(unsigned MatchNum) const;
@@ -165,6 +164,10 @@ public:
MCB.setOpcode(Hexagon::BUNDLE);
setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
+ Parser.addAliasForDirective(".half", ".2byte");
+ Parser.addAliasForDirective(".hword", ".2byte");
+ Parser.addAliasForDirective(".word", ".4byte");
+
MCAsmParserExtension::Initialize(_Parser);
}
@@ -462,9 +465,9 @@ void HexagonOperand::print(raw_ostream &OS) const {
}
bool HexagonAsmParser::finishBundle(SMLoc IDLoc, MCStreamer &Out) {
- DEBUG(dbgs() << "Bundle:");
- DEBUG(MCB.dump_pretty(dbgs()));
- DEBUG(dbgs() << "--\n");
+ LLVM_DEBUG(dbgs() << "Bundle:");
+ LLVM_DEBUG(MCB.dump_pretty(dbgs()));
+ LLVM_DEBUG(dbgs() << "--\n");
MCB.setLoc(IDLoc);
// Check the bundle for errors.
@@ -506,16 +509,19 @@ bool HexagonAsmParser::matchBundleOptions() {
"supported with this architecture";
StringRef Option = Parser.getTok().getString();
auto IDLoc = Parser.getTok().getLoc();
- if (Option.compare_lower("endloop0") == 0)
+ if (Option.compare_lower("endloop01") == 0) {
+ HexagonMCInstrInfo::setInnerLoop(MCB);
+ HexagonMCInstrInfo::setOuterLoop(MCB);
+ } else if (Option.compare_lower("endloop0") == 0) {
HexagonMCInstrInfo::setInnerLoop(MCB);
- else if (Option.compare_lower("endloop1") == 0)
+ } else if (Option.compare_lower("endloop1") == 0) {
HexagonMCInstrInfo::setOuterLoop(MCB);
- else if (Option.compare_lower("mem_noshuf") == 0)
+ } else if (Option.compare_lower("mem_noshuf") == 0) {
if (getSTI().getFeatureBits()[Hexagon::FeatureMemNoShuf])
HexagonMCInstrInfo::setMemReorderDisabled(MCB);
else
return getParser().Error(IDLoc, MemNoShuffMsg);
- else
+ } else
return getParser().Error(IDLoc, llvm::Twine("'") + Option +
"' is not a valid bundle option");
Lex();
@@ -554,9 +560,9 @@ bool HexagonAsmParser::matchOneInstruction(MCInst &MCI, SMLoc IDLoc,
canonicalizeImmediates(MCI);
result = processInstruction(MCI, InstOperands, IDLoc);
- DEBUG(dbgs() << "Insn:");
- DEBUG(MCI.dump_pretty(dbgs()));
- DEBUG(dbgs() << "\n\n");
+ LLVM_DEBUG(dbgs() << "Insn:");
+ LLVM_DEBUG(MCI.dump_pretty(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n\n");
MCI.setLoc(IDLoc);
}
@@ -648,11 +654,6 @@ bool HexagonAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
/// ParseDirective parses the Hexagon specific directives
bool HexagonAsmParser::ParseDirective(AsmToken DirectiveID) {
StringRef IDVal = DirectiveID.getIdentifier();
- if ((IDVal.lower() == ".word") || (IDVal.lower() == ".4byte"))
- return ParseDirectiveValue(4, DirectiveID.getLoc());
- if (IDVal.lower() == ".short" || IDVal.lower() == ".hword" ||
- IDVal.lower() == ".half")
- return ParseDirectiveValue(2, DirectiveID.getLoc());
if (IDVal.lower() == ".falign")
return ParseDirectiveFalign(256, DirectiveID.getLoc());
if ((IDVal.lower() == ".lcomm") || (IDVal.lower() == ".lcommon"))
@@ -720,39 +721,6 @@ bool HexagonAsmParser::ParseDirectiveFalign(unsigned Size, SMLoc L) {
return false;
}
-/// ::= .word [ expression (, expression)* ]
-bool HexagonAsmParser::ParseDirectiveValue(unsigned Size, SMLoc L) {
- if (getLexer().isNot(AsmToken::EndOfStatement)) {
- while (true) {
- const MCExpr *Value;
- SMLoc ExprLoc = L;
- if (getParser().parseExpression(Value))
- return true;
-
- // Special case constant expressions to match code generator.
- if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value)) {
- assert(Size <= 8 && "Invalid size");
- uint64_t IntValue = MCE->getValue();
- if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue))
- return Error(ExprLoc, "literal value out of range for directive");
- getStreamer().EmitIntValue(IntValue, Size);
- } else
- getStreamer().EmitValue(Value, Size);
-
- if (getLexer().is(AsmToken::EndOfStatement))
- break;
-
- // FIXME: Improve diagnostic.
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("unexpected token in directive");
- Lex();
- }
- }
-
- Lex();
- return false;
-}
-
// This is largely a copy of AsmParser's ParseDirectiveComm extended to
// accept a 3rd argument, AccessAlignment which indicates the smallest
// memory access made to the symbol, expressed in bytes. If no
@@ -1293,9 +1261,9 @@ unsigned HexagonAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
return Match_Success;
}
- DEBUG(dbgs() << "Unmatched Operand:");
- DEBUG(Op->dump());
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Unmatched Operand:");
+ LLVM_DEBUG(Op->dump());
+ LLVM_DEBUG(dbgs() << "\n");
return Match_InvalidOperand;
}
@@ -1333,6 +1301,17 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
}
break;
+ case Hexagon::J2_trap1:
+ if (!getSTI().getFeatureBits()[Hexagon::ArchV65]) {
+ MCOperand &Rx = Inst.getOperand(0);
+ MCOperand &Ry = Inst.getOperand(1);
+ if (Rx.getReg() != Hexagon::R0 || Ry.getReg() != Hexagon::R0) {
+ Error(IDLoc, "trap1 can only have register r0 as operand");
+ return Match_InvalidOperand;
+ }
+ }
+ break;
+
case Hexagon::A2_iconst: {
Inst.setOpcode(Hexagon::A2_addi);
MCOperand Reg = Inst.getOperand(0);
diff --git a/lib/Target/Hexagon/BitTracker.cpp b/lib/Target/Hexagon/BitTracker.cpp
index 15d6a05a0078..69529b0d1162 100644
--- a/lib/Target/Hexagon/BitTracker.cpp
+++ b/lib/Target/Hexagon/BitTracker.cpp
@@ -779,15 +779,18 @@ bool BT::UseQueueType::Cmp::operator()(const MachineInstr *InstA,
return BA->getNumber() > BB->getNumber();
}
- MachineBasicBlock::const_iterator ItA = InstA->getIterator();
- MachineBasicBlock::const_iterator ItB = InstB->getIterator();
- MachineBasicBlock::const_iterator End = BA->end();
- while (ItA != End) {
- if (ItA == ItB)
- return false; // ItA was before ItB.
- ++ItA;
- }
- return true;
+ auto getDist = [this] (const MachineInstr *MI) {
+ auto F = Dist.find(MI);
+ if (F != Dist.end())
+ return F->second;
+ MachineBasicBlock::const_iterator I = MI->getParent()->begin();
+ MachineBasicBlock::const_iterator E = MI->getIterator();
+ unsigned D = std::distance(I, E);
+ Dist.insert(std::make_pair(MI, D));
+ return D;
+ };
+
+ return getDist(InstA) > getDist(InstB);
}
// Main W-Z implementation.
@@ -840,7 +843,7 @@ void BT::visitPHI(const MachineInstr &PI) {
void BT::visitNonBranch(const MachineInstr &MI) {
if (Trace)
dbgs() << "Visit MI(" << printMBBReference(*MI.getParent()) << "): " << MI;
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
return;
assert(!MI.isBranch() && "Unexpected branch instruction");
@@ -1138,6 +1141,7 @@ void BT::run() {
runEdgeQueue(BlockScanned);
runUseQueue();
}
+ UseQ.reset();
if (Trace)
print_cells(dbgs() << "Cells after propagation:\n");
diff --git a/lib/Target/Hexagon/BitTracker.h b/lib/Target/Hexagon/BitTracker.h
index 5df6b61710f6..058225c0d812 100644
--- a/lib/Target/Hexagon/BitTracker.h
+++ b/lib/Target/Hexagon/BitTracker.h
@@ -13,6 +13,7 @@
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include <cassert>
#include <cstdint>
@@ -28,7 +29,6 @@ class ConstantInt;
class MachineRegisterInfo;
class MachineBasicBlock;
class MachineFunction;
-class MachineInstr;
class raw_ostream;
class TargetRegisterClass;
class TargetRegisterInfo;
@@ -73,6 +73,8 @@ private:
// Priority queue of instructions using modified registers, ordered by
// their relative position in a basic block.
struct UseQueueType {
+ UseQueueType() : Uses(Dist) {}
+
unsigned size() const {
return Uses.size();
}
@@ -90,12 +92,18 @@ private:
Set.erase(front());
Uses.pop();
}
+ void reset() {
+ Dist.clear();
+ }
private:
struct Cmp {
+ Cmp(DenseMap<const MachineInstr*,unsigned> &Map) : Dist(Map) {}
bool operator()(const MachineInstr *MI, const MachineInstr *MJ) const;
+ DenseMap<const MachineInstr*,unsigned> &Dist;
};
std::priority_queue<MachineInstr*, std::vector<MachineInstr*>, Cmp> Uses;
- DenseSet<MachineInstr*> Set; // Set to avoid adding duplicate entries.
+ DenseSet<const MachineInstr*> Set; // Set to avoid adding duplicate entries.
+ DenseMap<const MachineInstr*,unsigned> Dist;
};
void reset();
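
The Cmp comparator above now orders queued instructions by a cached distance from the start of their basic block instead of rescanning the block on every comparison. A minimal sketch of the same memoization pattern using plain STL containers (Item, UseQueue and the sample sizes are illustrative stand-ins, not the LLVM classes):

#include <algorithm>
#include <cstddef>
#include <iterator>
#include <list>
#include <queue>
#include <unordered_map>
#include <vector>

struct Item { int Id; };

struct UseQueue {
  std::list<Item> Block;                                // stand-in for a basic block
  std::unordered_map<const Item *, std::size_t> Dist;   // cached positions

  std::size_t getDist(const Item *I) {
    auto F = Dist.find(I);
    if (F != Dist.end())
      return F->second;
    auto It = std::find_if(Block.begin(), Block.end(),
                           [I](const Item &X) { return &X == I; });
    std::size_t D = static_cast<std::size_t>(std::distance(Block.begin(), It));
    Dist.emplace(I, D);
    return D;
  }

  // The comparator only holds a reference to the owning queue so it can reuse
  // (and fill) the position cache; elements earlier in the block pop first.
  struct Cmp {
    UseQueue &Q;
    bool operator()(const Item *A, const Item *B) const {
      return Q.getDist(A) > Q.getDist(B);
    }
  };

  std::priority_queue<const Item *, std::vector<const Item *>, Cmp> Uses{Cmp{*this}};

  void reset() { Dist.clear(); }    // drop stale positions between propagation runs
};

int main() {
  UseQueue Q;
  for (int i = 0; i < 4; ++i)
    Q.Block.push_back({i});
  for (const Item &I : Q.Block)
    Q.Uses.push(&I);
  while (!Q.Uses.empty())           // pops in block order: 0, 1, 2, 3
    Q.Uses.pop();
}

The cached positions are only valid for one propagation run, which is presumably why the patch also adds the UseQ.reset() call after the queues are drained in BT::run().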
diff --git a/lib/Target/Hexagon/CMakeLists.txt b/lib/Target/Hexagon/CMakeLists.txt
index 1c36093923ac..a9f606c54eb1 100644
--- a/lib/Target/Hexagon/CMakeLists.txt
+++ b/lib/Target/Hexagon/CMakeLists.txt
@@ -2,6 +2,7 @@ set(LLVM_TARGET_DEFINITIONS Hexagon.td)
tablegen(LLVM HexagonGenAsmMatcher.inc -gen-asm-matcher)
tablegen(LLVM HexagonGenAsmWriter.inc -gen-asm-writer)
+tablegen(LLVM HexagonGenCallingConv.inc -gen-callingconv)
tablegen(LLVM HexagonGenDAGISel.inc -gen-dag-isel)
tablegen(LLVM HexagonGenDFAPacketizer.inc -gen-dfa-packetizer)
tablegen(LLVM HexagonGenDisassemblerTables.inc -gen-disassembler)
@@ -9,6 +10,7 @@ tablegen(LLVM HexagonGenInstrInfo.inc -gen-instr-info)
tablegen(LLVM HexagonGenMCCodeEmitter.inc -gen-emitter)
tablegen(LLVM HexagonGenRegisterInfo.inc -gen-register-info)
tablegen(LLVM HexagonGenSubtargetInfo.inc -gen-subtarget)
+
add_public_tablegen_target(HexagonCommonTableGen)
add_llvm_target(HexagonCodeGen
@@ -59,6 +61,7 @@ add_llvm_target(HexagonCodeGen
HexagonTargetTransformInfo.cpp
HexagonVectorLoopCarriedReuse.cpp
HexagonVectorPrint.cpp
+ HexagonVExtract.cpp
HexagonVLIWPacketizer.cpp
RDFCopy.cpp
RDFDeadCode.cpp
@@ -68,7 +71,7 @@ add_llvm_target(HexagonCodeGen
)
add_subdirectory(AsmParser)
-add_subdirectory(TargetInfo)
-add_subdirectory(MCTargetDesc)
add_subdirectory(Disassembler)
+add_subdirectory(MCTargetDesc)
+add_subdirectory(TargetInfo)
diff --git a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
index 481b692ae8bf..1a619ebda84e 100644
--- a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
+++ b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -40,7 +40,7 @@ using DecodeStatus = MCDisassembler::DecodeStatus;
namespace {
-/// \brief Hexagon disassembler for all Hexagon platforms.
+/// Hexagon disassembler for all Hexagon platforms.
class HexagonDisassembler : public MCDisassembler {
public:
std::unique_ptr<MCInstrInfo const> const MCII;
@@ -127,12 +127,18 @@ static DecodeStatus DecodeHvxQRRegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeGuestRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeGuestRegs64RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
static DecodeStatus unsignedImmDecoder(MCInst &MI, unsigned tmp,
uint64_t Address, const void *Decoder);
@@ -783,3 +789,55 @@ static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
HexagonMCInstrInfo::addConstant(MI, Extended, Disassembler.getContext());
return MCDisassembler::Success;
}
+
+static DecodeStatus DecodeGuestRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t /*Address*/,
+ const void *Decoder) {
+ using namespace Hexagon;
+
+ static const MCPhysReg GuestRegDecoderTable[] = {
+ /* 0 */ GELR, GSR, GOSP, G3,
+ /* 4 */ G4, G5, G6, G7,
+ /* 8 */ G8, G9, G10, G11,
+ /* 12 */ G12, G13, G14, G15,
+ /* 16 */ GPMUCNT4, GPMUCNT5, GPMUCNT6, GPMUCNT7,
+ /* 20 */ G20, G21, G22, G23,
+ /* 24 */ GPCYCLELO, GPCYCLEHI, GPMUCNT0, GPMUCNT1,
+ /* 28 */ GPMUCNT2, GPMUCNT3, G30, G31
+ };
+
+ if (RegNo >= array_lengthof(GuestRegDecoderTable))
+ return MCDisassembler::Fail;
+ if (GuestRegDecoderTable[RegNo] == Hexagon::NoRegister)
+ return MCDisassembler::Fail;
+
+ unsigned Register = GuestRegDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Register));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeGuestRegs64RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t /*Address*/,
+ const void *Decoder) {
+ using namespace Hexagon;
+
+ static const MCPhysReg GuestReg64DecoderTable[] = {
+ /* 0 */ G1_0, 0, G3_2, 0,
+ /* 4 */ G5_4, 0, G7_6, 0,
+ /* 8 */ G9_8, 0, G11_10, 0,
+ /* 12 */ G13_12, 0, G15_14, 0,
+ /* 16 */ G17_16, 0, G19_18, 0,
+ /* 20 */ G21_20, 0, G23_22, 0,
+ /* 24 */ G25_24, 0, G27_26, 0,
+ /* 28 */ G29_28, 0, G31_30, 0
+ };
+
+ if (RegNo >= array_lengthof(GuestReg64DecoderTable))
+ return MCDisassembler::Fail;
+ if (GuestReg64DecoderTable[RegNo] == Hexagon::NoRegister)
+ return MCDisassembler::Fail;
+
+ unsigned Register = GuestReg64DecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Register));
+ return MCDisassembler::Success;
+}
diff --git a/lib/Target/Hexagon/Hexagon.h b/lib/Target/Hexagon/Hexagon.h
index 66b387b62c6c..6ec52d18cdc4 100644
--- a/lib/Target/Hexagon/Hexagon.h
+++ b/lib/Target/Hexagon/Hexagon.h
@@ -49,7 +49,7 @@
namespace llvm {
class HexagonTargetMachine;
- /// \brief Creates a Hexagon-specific Target Transformation Info pass.
+ /// Creates a Hexagon-specific Target Transformation Info pass.
ImmutablePass *createHexagonTargetTransformInfoPass(const HexagonTargetMachine *TM);
} // end namespace llvm;
diff --git a/lib/Target/Hexagon/Hexagon.td b/lib/Target/Hexagon/Hexagon.td
index 6292e2a7a4ea..69e263a425f8 100644
--- a/lib/Target/Hexagon/Hexagon.td
+++ b/lib/Target/Hexagon/Hexagon.td
@@ -36,32 +36,36 @@ def ExtensionHVXV62: SubtargetFeature<"hvxv62", "HexagonHVXVersion",
def ExtensionHVXV65: SubtargetFeature<"hvxv65", "HexagonHVXVersion",
"Hexagon::ArchEnum::V65", "Hexagon HVX instructions",
[ExtensionHVX,ExtensionHVXV60, ExtensionHVXV62]>;
-def ExtensionHVX64B
- : SubtargetFeature<"hvx-length64b", "UseHVX64BOps", "true",
- "Hexagon HVX 64B instructions", [ExtensionHVX]>;
-def ExtensionHVX128B
- : SubtargetFeature<"hvx-length128b", "UseHVX128BOps", "true",
- "Hexagon HVX 128B instructions", [ExtensionHVX]>;
-
-// This is an alias to ExtensionHVX128B to accept the hvx-double as
-// an acceptable subtarget feature.
-def ExtensionHVXDbl
- : SubtargetFeature<"hvx-double", "UseHVX128BOps", "true",
- "Hexagon HVX 128B instructions", [ExtensionHVX128B]>;
+def ExtensionHVX64B: SubtargetFeature<"hvx-length64b", "UseHVX64BOps",
+ "true", "Hexagon HVX 64B instructions", [ExtensionHVX]>;
+def ExtensionHVX128B: SubtargetFeature<"hvx-length128b", "UseHVX128BOps",
+ "true", "Hexagon HVX 128B instructions", [ExtensionHVX]>;
+
+def FeaturePackets: SubtargetFeature<"packets", "UsePackets", "true",
+ "Support for instruction packets">;
def FeatureLongCalls: SubtargetFeature<"long-calls", "UseLongCalls", "true",
"Use constant-extended calls">;
def FeatureMemNoShuf: SubtargetFeature<"mem_noshuf", "HasMemNoShuf", "false",
"Supports mem_noshuf feature">;
-def FeatureDuplex : SubtargetFeature<"duplex", "EnableDuplex", "true",
+def FeatureMemops: SubtargetFeature<"memops", "UseMemops", "true",
+ "Use memop instructions">;
+def FeatureNVJ: SubtargetFeature<"nvj", "UseNewValueJumps", "true",
+ "Support for new-value jumps", [FeaturePackets]>;
+def FeatureNVS: SubtargetFeature<"nvs", "UseNewValueStores", "true",
+ "Support for new-value stores", [FeaturePackets]>;
+def FeatureSmallData: SubtargetFeature<"small-data", "UseSmallData", "true",
+ "Allow GP-relative addressing of global variables">;
+def FeatureDuplex: SubtargetFeature<"duplex", "EnableDuplex", "true",
"Enable generation of duplex instruction">;
+def FeatureReservedR19: SubtargetFeature<"reserved-r19", "ReservedR19",
+ "true", "Reserve register R19">;
//===----------------------------------------------------------------------===//
// Hexagon Instruction Predicate Definitions.
//===----------------------------------------------------------------------===//
-def UseMEMOP : Predicate<"HST->useMemOps()">;
-def IEEERndNearV5T : Predicate<"HST->modeIEEERndNear()">;
+def UseMEMOPS : Predicate<"HST->useMemops()">;
def UseHVX64B : Predicate<"HST->useHVX64BOps()">,
AssemblerPredicate<"ExtensionHVX64B">;
def UseHVX128B : Predicate<"HST->useHVX128BOps()">,
@@ -75,10 +79,8 @@ def UseHVXV62 : Predicate<"HST->useHVXOps()">,
def UseHVXV65 : Predicate<"HST->useHVXOps()">,
AssemblerPredicate<"ExtensionHVXV65">;
-def Hvx64 : HwMode<"+hvx-length64b">;
-def Hvx64old : HwMode<"-hvx-double">;
-def Hvx128 : HwMode<"+hvx-length128b">;
-def Hvx128old : HwMode<"+hvx-double">;
+def Hvx64: HwMode<"+hvx-length64b">;
+def Hvx128: HwMode<"+hvx-length128b">;
//===----------------------------------------------------------------------===//
// Classes used for relation maps.
@@ -300,8 +302,10 @@ include "HexagonDepITypes.td"
include "HexagonInstrFormats.td"
include "HexagonDepInstrFormats.td"
include "HexagonDepInstrInfo.td"
+include "HexagonCallingConv.td"
include "HexagonPseudo.td"
include "HexagonPatterns.td"
+include "HexagonPatternsHVX.td"
include "HexagonPatternsV65.td"
include "HexagonDepMappings.td"
include "HexagonIntrinsics.td"
@@ -318,19 +322,34 @@ class Proc<string Name, SchedMachineModel Model,
list<SubtargetFeature> Features>
: ProcessorModel<Name, Model, Features>;
+def : Proc<"generic", HexagonModelV60,
+ [ArchV4, ArchV5, ArchV55, ArchV60,
+ FeatureDuplex, FeatureMemops, FeatureNVJ, FeatureNVS,
+ FeaturePackets, FeatureSmallData]>;
def : Proc<"hexagonv4", HexagonModelV4,
- [ArchV4, FeatureDuplex]>;
+ [ArchV4,
+ FeatureDuplex, FeatureMemops, FeatureNVJ, FeatureNVS,
+ FeaturePackets, FeatureSmallData]>;
def : Proc<"hexagonv5", HexagonModelV4,
- [ArchV4, ArchV5, FeatureDuplex]>;
+ [ArchV4, ArchV5,
+ FeatureDuplex, FeatureMemops, FeatureNVJ, FeatureNVS,
+ FeaturePackets, FeatureSmallData]>;
def : Proc<"hexagonv55", HexagonModelV55,
- [ArchV4, ArchV5, ArchV55, FeatureDuplex]>;
+ [ArchV4, ArchV5, ArchV55,
+ FeatureDuplex, FeatureMemops, FeatureNVJ, FeatureNVS,
+ FeaturePackets, FeatureSmallData]>;
def : Proc<"hexagonv60", HexagonModelV60,
- [ArchV4, ArchV5, ArchV55, ArchV60, FeatureDuplex]>;
+ [ArchV4, ArchV5, ArchV55, ArchV60,
+ FeatureDuplex, FeatureMemops, FeatureNVJ, FeatureNVS,
+ FeaturePackets, FeatureSmallData]>;
def : Proc<"hexagonv62", HexagonModelV62,
- [ArchV4, ArchV5, ArchV55, ArchV60, ArchV62, FeatureDuplex]>;
+ [ArchV4, ArchV5, ArchV55, ArchV60, ArchV62,
+ FeatureDuplex, FeatureMemops, FeatureNVJ, FeatureNVS,
+ FeaturePackets, FeatureSmallData]>;
def : Proc<"hexagonv65", HexagonModelV65,
[ArchV4, ArchV5, ArchV55, ArchV60, ArchV62, ArchV65,
- FeatureMemNoShuf, FeatureDuplex]>;
+ FeatureDuplex, FeatureMemNoShuf, FeatureMemops, FeatureNVJ,
+ FeatureNVS, FeaturePackets, FeatureSmallData]>;
//===----------------------------------------------------------------------===//
// Declare the target which we are implementing
@@ -357,4 +376,5 @@ def Hexagon : Target {
let AssemblyParsers = [HexagonAsmParser];
let AssemblyParserVariants = [HexagonAsmParserVariant];
let AssemblyWriters = [HexagonAsmWriter];
+ let AllowRegisterRenaming = 1;
}
diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
index 68b1fe6bf4b1..0ac83ea7c5fc 100644
--- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp
+++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -62,10 +62,6 @@ void HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI,
#define DEBUG_TYPE "asm-printer"
-static cl::opt<bool> AlignCalls(
- "hexagon-align-calls", cl::Hidden, cl::init(true),
- cl::desc("Insert falign after call instruction for Hexagon target"));
-
// Given a scalar register return its pair.
inline static unsigned getHexagonRegisterPair(unsigned Reg,
const MCRegisterInfo *RI) {
@@ -76,16 +72,13 @@ inline static unsigned getHexagonRegisterPair(unsigned Reg,
return Pair;
}
-HexagonAsmPrinter::HexagonAsmPrinter(TargetMachine &TM,
- std::unique_ptr<MCStreamer> Streamer)
- : AsmPrinter(TM, std::move(Streamer)) {}
-
void HexagonAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
raw_ostream &O) {
const MachineOperand &MO = MI->getOperand(OpNo);
switch (MO.getType()) {
- default: llvm_unreachable ("<unknown operand type>");
+ default:
+ llvm_unreachable ("<unknown operand type>");
case MachineOperand::MO_Register:
O << HexagonInstPrinter::getRegisterName(MO.getReg());
return;
@@ -112,8 +105,8 @@ void HexagonAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
// for the case in which the basic block is reachable by a fall through but
// through an indirect from a jump table. In this case, the jump table
// will contain a label not defined by AsmPrinter.
-bool HexagonAsmPrinter::
-isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const {
+bool HexagonAsmPrinter::isBlockOnlyReachableByFallthrough(
+ const MachineBasicBlock *MBB) const {
if (MBB->hasAddressTaken())
return false;
return AsmPrinter::isBlockOnlyReachableByFallthrough(MBB);
@@ -167,7 +160,8 @@ bool HexagonAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
}
bool HexagonAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
- unsigned OpNo, unsigned AsmVariant,
+ unsigned OpNo,
+ unsigned AsmVariant,
const char *ExtraCode,
raw_ostream &O) {
if (ExtraCode && ExtraCode[0])
@@ -183,10 +177,10 @@ bool HexagonAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
if (Offset.isImm()) {
if (Offset.getImm())
- O << " + #" << Offset.getImm();
- }
- else
+ O << "+#" << Offset.getImm();
+ } else {
llvm_unreachable("Unimplemented");
+ }
return false;
}
@@ -285,7 +279,8 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
unsigned VectorSize = HRI.getRegSizeInBits(Hexagon::HvxVRRegClass) / 8;
switch (Inst.getOpcode()) {
- default: return;
+ default:
+ return;
case Hexagon::A2_iconst: {
Inst.setOpcode(Hexagon::A2_addi);
@@ -300,30 +295,40 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
break;
}
- case Hexagon::A2_tfrf:
+ case Hexagon::A2_tfrf: {
+ const MCConstantExpr *Zero = MCConstantExpr::create(0, OutContext);
Inst.setOpcode(Hexagon::A2_paddif);
- Inst.addOperand(MCOperand::createExpr(MCConstantExpr::create(0, OutContext)));
+ Inst.addOperand(MCOperand::createExpr(Zero));
break;
+ }
- case Hexagon::A2_tfrt:
+ case Hexagon::A2_tfrt: {
+ const MCConstantExpr *Zero = MCConstantExpr::create(0, OutContext);
Inst.setOpcode(Hexagon::A2_paddit);
- Inst.addOperand(MCOperand::createExpr(MCConstantExpr::create(0, OutContext)));
+ Inst.addOperand(MCOperand::createExpr(Zero));
break;
+ }
- case Hexagon::A2_tfrfnew:
+ case Hexagon::A2_tfrfnew: {
+ const MCConstantExpr *Zero = MCConstantExpr::create(0, OutContext);
Inst.setOpcode(Hexagon::A2_paddifnew);
- Inst.addOperand(MCOperand::createExpr(MCConstantExpr::create(0, OutContext)));
+ Inst.addOperand(MCOperand::createExpr(Zero));
break;
+ }
- case Hexagon::A2_tfrtnew:
+ case Hexagon::A2_tfrtnew: {
+ const MCConstantExpr *Zero = MCConstantExpr::create(0, OutContext);
Inst.setOpcode(Hexagon::A2_padditnew);
- Inst.addOperand(MCOperand::createExpr(MCConstantExpr::create(0, OutContext)));
+ Inst.addOperand(MCOperand::createExpr(Zero));
break;
+ }
- case Hexagon::A2_zxtb:
+ case Hexagon::A2_zxtb: {
+ const MCConstantExpr *C255 = MCConstantExpr::create(255, OutContext);
Inst.setOpcode(Hexagon::A2_andir);
- Inst.addOperand(MCOperand::createExpr(MCConstantExpr::create(255, OutContext)));
+ Inst.addOperand(MCOperand::createExpr(C255));
break;
+ }
// "$dst = CONST64(#$src1)",
case Hexagon::CONST64:
@@ -525,10 +530,12 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
bool Success = MO.getExpr()->evaluateAsAbsolute(Imm);
if (Success && Imm < 0) {
const MCExpr *MOne = MCConstantExpr::create(-1, OutContext);
- TmpInst.addOperand(MCOperand::createExpr(HexagonMCExpr::create(MOne, OutContext)));
+ const HexagonMCExpr *E = HexagonMCExpr::create(MOne, OutContext);
+ TmpInst.addOperand(MCOperand::createExpr(E));
} else {
const MCExpr *Zero = MCConstantExpr::create(0, OutContext);
- TmpInst.addOperand(MCOperand::createExpr(HexagonMCExpr::create(Zero, OutContext)));
+ const HexagonMCExpr *E = HexagonMCExpr::create(Zero, OutContext);
+ TmpInst.addOperand(MCOperand::createExpr(E));
}
TmpInst.addOperand(MO);
MappedInst = TmpInst;
@@ -569,9 +576,9 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
MO.setReg(High);
// Add a new operand for the second register in the pair.
MappedInst.addOperand(MCOperand::createReg(Low));
- MappedInst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrptnew)
- ? Hexagon::C2_ccombinewnewt
- : Hexagon::C2_ccombinewnewf);
+ MappedInst.setOpcode(Inst.getOpcode() == Hexagon::A2_tfrptnew
+ ? Hexagon::C2_ccombinewnewt
+ : Hexagon::C2_ccombinewnewf);
return;
}
@@ -615,6 +622,7 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
MappedInst = TmpInst;
return;
}
+
case Hexagon::V6_vdd0: {
MCInst TmpInst;
assert (Inst.getOperand(0).isReg() &&
@@ -627,6 +635,7 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
MappedInst = TmpInst;
return;
}
+
case Hexagon::V6_vL32Ub_pi:
case Hexagon::V6_vL32b_cur_pi:
case Hexagon::V6_vL32b_nt_cur_pi:
@@ -735,12 +744,10 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
case Hexagon::V6_vS32b_srls_pi:
MappedInst = ScaleVectorOffset(Inst, 2, VectorSize, OutContext);
return;
-
}
}
-/// printMachineInstruction -- Print out a single Hexagon MI in Darwin syntax to
-/// the current output stream.
+/// Print out a single Hexagon MI to the current output stream.
void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCInst MCB;
MCB.setOpcode(Hexagon::BUNDLE);
@@ -748,21 +755,27 @@ void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) {
const MCInstrInfo &MCII = *Subtarget->getInstrInfo();
if (MI->isBundle()) {
+ assert(Subtarget->usePackets() && "Support for packets is disabled");
const MachineBasicBlock* MBB = MI->getParent();
MachineBasicBlock::const_instr_iterator MII = MI->getIterator();
for (++MII; MII != MBB->instr_end() && MII->isInsideBundle(); ++MII)
- if (!MII->isDebugValue() && !MII->isImplicitDef())
+ if (!MII->isDebugInstr() && !MII->isImplicitDef())
HexagonLowerToMC(MCII, &*MII, MCB, *this);
- }
- else
+ } else {
HexagonLowerToMC(MCII, MI, MCB, *this);
+ }
+
+ const MachineFunction &MF = *MI->getParent()->getParent();
+ const auto &HII = *MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
+ if (MI->isBundle() && HII.getBundleNoShuf(*MI))
+ HexagonMCInstrInfo::setMemReorderDisabled(MCB);
- bool Ok = HexagonMCInstrInfo::canonicalizePacket(
- MCII, *Subtarget, OutStreamer->getContext(), MCB, nullptr);
- assert(Ok);
- (void)Ok;
- if(HexagonMCInstrInfo::bundleSize(MCB) == 0)
+ MCContext &Ctx = OutStreamer->getContext();
+ bool Ok = HexagonMCInstrInfo::canonicalizePacket(MCII, *Subtarget, Ctx,
+ MCB, nullptr);
+ assert(Ok); (void)Ok;
+ if (HexagonMCInstrInfo::bundleSize(MCB) == 0)
return;
OutStreamer->EmitInstruction(MCB, getSubtargetInfo());
}
diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.h b/lib/Target/Hexagon/HexagonAsmPrinter.h
index 4b8865672cf4..d0629d173a65 100755
--- a/lib/Target/Hexagon/HexagonAsmPrinter.h
+++ b/lib/Target/Hexagon/HexagonAsmPrinter.h
@@ -18,7 +18,8 @@
#include "HexagonSubtarget.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineFunction.h"
-#include <memory>
+#include "llvm/MC/MCStreamer.h"
+#include <utility>
namespace llvm {
@@ -32,7 +33,8 @@ class TargetMachine;
public:
explicit HexagonAsmPrinter(TargetMachine &TM,
- std::unique_ptr<MCStreamer> Streamer);
+ std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)) {}
bool runOnMachineFunction(MachineFunction &Fn) override {
Subtarget = &Fn.getSubtarget<HexagonSubtarget>();
@@ -43,13 +45,11 @@ class TargetMachine;
return "Hexagon Assembly Printer";
}
- bool isBlockOnlyReachableByFallthrough(
- const MachineBasicBlock *MBB) const override;
+ bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB)
+ const override;
void EmitInstruction(const MachineInstr *MI) override;
-
- void HexagonProcessInstruction(MCInst &Inst,
- const MachineInstr &MBB);
+ void HexagonProcessInstruction(MCInst &Inst, const MachineInstr &MBB);
void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
@@ -58,8 +58,6 @@ class TargetMachine;
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
unsigned AsmVariant, const char *ExtraCode,
raw_ostream &OS) override;
-
- static const char *getRegisterName(unsigned RegNo);
};
} // end namespace llvm
diff --git a/lib/Target/Hexagon/HexagonBitSimplify.cpp b/lib/Target/Hexagon/HexagonBitSimplify.cpp
index 9e73766b6fdc..4791b067aa8d 100644
--- a/lib/Target/Hexagon/HexagonBitSimplify.cpp
+++ b/lib/Target/Hexagon/HexagonBitSimplify.cpp
@@ -184,9 +184,7 @@ namespace {
public:
static char ID;
- HexagonBitSimplify() : MachineFunctionPass(ID) {
- initializeHexagonBitSimplifyPass(*PassRegistry::getPassRegistry());
- }
+ HexagonBitSimplify() : MachineFunctionPass(ID) {}
StringRef getPassName() const override {
return "Hexagon bit simplification";
@@ -257,10 +255,10 @@ namespace {
char HexagonBitSimplify::ID = 0;
-INITIALIZE_PASS_BEGIN(HexagonBitSimplify, "hexbit",
+INITIALIZE_PASS_BEGIN(HexagonBitSimplify, "hexagon-bit-simplify",
"Hexagon bit simplification", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_END(HexagonBitSimplify, "hexbit",
+INITIALIZE_PASS_END(HexagonBitSimplify, "hexagon-bit-simplify",
"Hexagon bit simplification", false, false)
bool HexagonBitSimplify::visitBlock(MachineBasicBlock &B, Transformation &T,
@@ -622,7 +620,7 @@ bool HexagonBitSimplify::getUsedBitsInStore(unsigned Opc, BitVector &Bits,
// operand may be a subregister of a larger register, while Bits would
// correspond to the larger register in its entirety. Because of that,
// the parameter Begin can be used to indicate which bit of Bits should be
-// considered the LSB of of the operand.
+// considered the LSB of the operand.
bool HexagonBitSimplify::getUsedBits(unsigned Opc, unsigned OpN,
BitVector &Bits, uint16_t Begin, const HexagonInstrInfo &HII) {
using namespace Hexagon;
@@ -2452,7 +2450,7 @@ bool BitSimplification::simplifyExtractLow(MachineInstr *MI,
if (Len == RW)
return false;
- DEBUG({
+ LLVM_DEBUG({
dbgs() << __func__ << " on reg: " << printReg(RD.Reg, &HRI, RD.Sub)
<< ", MI: " << *MI;
dbgs() << "Cell: " << RC << '\n';
@@ -2646,7 +2644,7 @@ bool HexagonBitSimplify::runOnMachineFunction(MachineFunction &MF) {
const HexagonEvaluator HE(HRI, MRI, HII, MF);
BitTracker BT(HE, MF);
- DEBUG(BT.trace(true));
+ LLVM_DEBUG(BT.trace(true));
BT.run();
MachineBasicBlock &Entry = MF.front();
@@ -2977,7 +2975,8 @@ void HexagonLoopRescheduling::moveGroup(InstrGroup &G, MachineBasicBlock &LB,
}
bool HexagonLoopRescheduling::processLoop(LoopCand &C) {
- DEBUG(dbgs() << "Processing loop in " << printMBBReference(*C.LB) << "\n");
+ LLVM_DEBUG(dbgs() << "Processing loop in " << printMBBReference(*C.LB)
+ << "\n");
std::vector<PhiInfo> Phis;
for (auto &I : *C.LB) {
if (!I.isPHI())
@@ -3001,7 +3000,7 @@ bool HexagonLoopRescheduling::processLoop(LoopCand &C) {
Phis.push_back(PhiInfo(I, *C.LB));
}
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Phis: {";
for (auto &I : Phis) {
dbgs() << ' ' << printReg(I.DefR, HRI) << "=phi("
@@ -3122,7 +3121,7 @@ bool HexagonLoopRescheduling::processLoop(LoopCand &C) {
Groups.push_back(G);
}
- DEBUG({
+ LLVM_DEBUG({
for (unsigned i = 0, n = Groups.size(); i < n; ++i) {
InstrGroup &G = Groups[i];
dbgs() << "Group[" << i << "] inp: "
@@ -3190,7 +3189,7 @@ bool HexagonLoopRescheduling::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
const HexagonEvaluator HE(*HRI, *MRI, *HII, MF);
BitTracker BT(HE, MF);
- DEBUG(BT.trace(true));
+ LLVM_DEBUG(BT.trace(true));
BT.run();
BTP = &BT;
diff --git a/lib/Target/Hexagon/HexagonBitTracker.cpp b/lib/Target/Hexagon/HexagonBitTracker.cpp
index b6e220beb0c6..e13cfd3f655a 100644
--- a/lib/Target/Hexagon/HexagonBitTracker.cpp
+++ b/lib/Target/Hexagon/HexagonBitTracker.cpp
@@ -325,7 +325,7 @@ bool HexagonEvaluator::evaluate(const MachineInstr &MI,
int FI = op(1).getIndex();
int Off = op(2).getImm();
unsigned A = MFI.getObjectAlignment(FI) + std::abs(Off);
- unsigned L = Log2_32(A);
+ unsigned L = countTrailingZeros(A);
RegisterCell RC = RegisterCell::self(Reg[0].Reg, W0);
RC.fill(0, L, BT::BitValue::Zero);
return rr0(RC, Outputs);
diff --git a/lib/Target/Hexagon/HexagonBlockRanges.cpp b/lib/Target/Hexagon/HexagonBlockRanges.cpp
index ff915ca59dae..48a4505458ae 100644
--- a/lib/Target/Hexagon/HexagonBlockRanges.cpp
+++ b/lib/Target/Hexagon/HexagonBlockRanges.cpp
@@ -85,7 +85,7 @@ void HexagonBlockRanges::RangeList::unionize(bool MergeAdjacent) {
if (empty())
return;
- std::sort(begin(), end());
+ llvm::sort(begin(), end());
iterator Iter = begin();
while (Iter != end()-1) {
@@ -160,7 +160,7 @@ HexagonBlockRanges::InstrIndexMap::InstrIndexMap(MachineBasicBlock &B)
IndexType Idx = IndexType::First;
First = Idx;
for (auto &In : B) {
- if (In.isDebugValue())
+ if (In.isDebugInstr())
continue;
assert(getIndex(&In) == IndexType::None && "Instruction already in map");
Map.insert(std::make_pair(Idx, &In));
@@ -314,7 +314,7 @@ void HexagonBlockRanges::computeInitialLiveRanges(InstrIndexMap &IndexMap,
RegisterSet Defs, Clobbers;
for (auto &In : B) {
- if (In.isDebugValue())
+ if (In.isDebugInstr())
continue;
IndexType Index = IndexMap.getIndex(&In);
// Process uses first.
@@ -422,10 +422,10 @@ void HexagonBlockRanges::computeInitialLiveRanges(InstrIndexMap &IndexMap,
HexagonBlockRanges::RegToRangeMap HexagonBlockRanges::computeLiveMap(
InstrIndexMap &IndexMap) {
RegToRangeMap LiveMap;
- DEBUG(dbgs() << __func__ << ": index map\n" << IndexMap << '\n');
+ LLVM_DEBUG(dbgs() << __func__ << ": index map\n" << IndexMap << '\n');
computeInitialLiveRanges(IndexMap, LiveMap);
- DEBUG(dbgs() << __func__ << ": live map\n"
- << PrintRangeMap(LiveMap, TRI) << '\n');
+ LLVM_DEBUG(dbgs() << __func__ << ": live map\n"
+ << PrintRangeMap(LiveMap, TRI) << '\n');
return LiveMap;
}
@@ -486,8 +486,8 @@ HexagonBlockRanges::RegToRangeMap HexagonBlockRanges::computeDeadMap(
if (TargetRegisterInfo::isVirtualRegister(P.first.Reg))
addDeadRanges(P.first);
- DEBUG(dbgs() << __func__ << ": dead map\n"
- << PrintRangeMap(DeadMap, TRI) << '\n');
+ LLVM_DEBUG(dbgs() << __func__ << ": dead map\n"
+ << PrintRangeMap(DeadMap, TRI) << '\n');
return DeadMap;
}
diff --git a/lib/Target/Hexagon/HexagonBranchRelaxation.cpp b/lib/Target/Hexagon/HexagonBranchRelaxation.cpp
index 84af4b14b9f7..2fa7888dd02b 100644
--- a/lib/Target/Hexagon/HexagonBranchRelaxation.cpp
+++ b/lib/Target/Hexagon/HexagonBranchRelaxation.cpp
@@ -90,7 +90,7 @@ FunctionPass *llvm::createHexagonBranchRelaxation() {
}
bool HexagonBranchRelaxation::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "****** Hexagon Branch Relaxation ******\n");
+ LLVM_DEBUG(dbgs() << "****** Hexagon Branch Relaxation ******\n");
auto &HST = MF.getSubtarget<HexagonSubtarget>();
HII = HST.getInstrInfo();
@@ -114,8 +114,12 @@ void HexagonBranchRelaxation::computeOffset(MachineFunction &MF,
InstOffset = (InstOffset + ByteAlign) & ~(ByteAlign);
}
OffsetMap[&B] = InstOffset;
- for (auto &MI : B.instrs())
+ for (auto &MI : B.instrs()) {
InstOffset += HII->getSize(MI);
+ // Assume that all extendable branches will be extended.
+ if (MI.isBranch() && HII->isExtendable(MI))
+ InstOffset += HEXAGON_INSTR_SIZE;
+ }
}
}
@@ -145,6 +149,9 @@ bool HexagonBranchRelaxation::isJumpOutOfRange(MachineInstr &MI,
if (FirstTerm == B.instr_end())
return false;
+ if (HII->isExtended(MI))
+ return false;
+
unsigned InstOffset = BlockToInstOffset[&B];
unsigned Distance = 0;
@@ -193,14 +200,14 @@ bool HexagonBranchRelaxation::reGenerateBranch(MachineFunction &MF,
for (auto &MI : B) {
if (!MI.isBranch() || !isJumpOutOfRange(MI, BlockToInstOffset))
continue;
- DEBUG(dbgs() << "Long distance jump. isExtendable("
- << HII->isExtendable(MI) << ") isConstExtended("
- << HII->isConstExtended(MI) << ") " << MI);
+ LLVM_DEBUG(dbgs() << "Long distance jump. isExtendable("
+ << HII->isExtendable(MI) << ") isConstExtended("
+ << HII->isConstExtended(MI) << ") " << MI);
// Since we have not merged HW loops relaxation into
// this code (yet), soften our approach for the moment.
if (!HII->isExtendable(MI) && !HII->isExtended(MI)) {
- DEBUG(dbgs() << "\tUnderimplemented relax branch instruction.\n");
+ LLVM_DEBUG(dbgs() << "\tUnderimplemented relax branch instruction.\n");
} else {
// Find which operand is expandable.
int ExtOpNum = HII->getCExtOpNum(MI);
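
The offset computation in computeOffset is deliberately pessimistic: every extendable branch is counted as if it will need a constant extender, so the later out-of-range check can only over-estimate distances. A small standalone model of that accounting (the Insn struct and the 4-byte instruction word are illustrative stand-ins, not the LLVM types):

#include <cstdint>
#include <vector>

struct Insn {
  unsigned Size;          // encoded size in bytes
  bool IsBranch;
  bool IsExtendable;      // may need a constant extender if out of range
};

constexpr unsigned InstrWord = 4;   // stand-in for HEXAGON_INSTR_SIZE

uint64_t blockSize(const std::vector<Insn> &Block) {
  uint64_t Off = 0;
  for (const Insn &I : Block) {
    Off += I.Size;
    if (I.IsBranch && I.IsExtendable)
      Off += InstrWord;             // pessimistically assume it gets extended
  }
  return Off;
}

Over-estimating here is safe: at worst a branch is relaxed unnecessarily, whereas an under-estimate could leave a branch too short to reach its target.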
diff --git a/lib/Target/Hexagon/HexagonCallingConv.td b/lib/Target/Hexagon/HexagonCallingConv.td
new file mode 100644
index 000000000000..ed2f87570d6b
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonCallingConv.td
@@ -0,0 +1,134 @@
+//===- HexagonCallingConv.td ----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+class CCIfArgIsVarArg<CCAction A>
+ : CCIf<"State.isVarArg() && "
+ "ValNo >= static_cast<HexagonCCState&>(State)"
+ ".getNumNamedVarArgParams()", A>;
+
+def CC_HexagonStack: CallingConv<[
+ CCIfType<[i32,v2i16,v4i8],
+ CCAssignToStack<4,4>>,
+ CCIfType<[i64,v2i32,v4i16,v8i8],
+ CCAssignToStack<8,8>>
+]>;
+
+def CC_Hexagon: CallingConv<[
+ CCIfType<[i1,i8,i16],
+ CCPromoteToType<i32>>,
+ CCIfType<[f32],
+ CCBitConvertToType<i32>>,
+ CCIfType<[f64],
+ CCBitConvertToType<i64>>,
+
+ CCIfByVal<
+ CCPassByVal<8,8>>,
+ CCIfArgIsVarArg<
+ CCDelegateTo<CC_HexagonStack>>,
+
+ // Pass split values in pairs, allocate odd register if necessary.
+ CCIfType<[i32],
+ CCIfSplit<
+ CCCustom<"CC_SkipOdd">>>,
+
+ CCIfType<[i32,v2i16,v4i8],
+ CCAssignToReg<[R0,R1,R2,R3,R4,R5]>>,
+ // Make sure to allocate any skipped 32-bit register, so it does not get
+ // allocated to a subsequent 32-bit value.
+ CCIfType<[i64,v2i32,v4i16,v8i8],
+ CCCustom<"CC_SkipOdd">>,
+ CCIfType<[i64,v2i32,v4i16,v8i8],
+ CCAssignToReg<[D0,D1,D2]>>,
+
+ CCDelegateTo<CC_HexagonStack>
+]>;
+
+def RetCC_Hexagon: CallingConv<[
+ CCIfType<[i1,i8,i16],
+ CCPromoteToType<i32>>,
+ CCIfType<[f32],
+ CCBitConvertToType<i32>>,
+ CCIfType<[f64],
+ CCBitConvertToType<i64>>,
+
+ // Small structures are returned in a pair of registers, (which is
+ // always r1:0). In such case, what is returned are two i32 values
+ // without any additional information (in ArgFlags) stating that
+ // they are parts of a structure. Because of that there is no way
+ // to differentiate that situation from an attempt to return two
+ // values, so always assign R0 and R1.
+ CCIfSplit<
+ CCAssignToReg<[R0,R1]>>,
+ CCIfType<[i32,v2i16,v4i8],
+ CCAssignToReg<[R0,R1]>>,
+ CCIfType<[i64,v2i32,v4i16,v8i8],
+ CCAssignToReg<[D0]>>
+]>;
+
+
+class CCIfHvx64<CCAction A>
+ : CCIf<"State.getMachineFunction().getSubtarget<HexagonSubtarget>()"
+ ".useHVX64BOps()", A>;
+
+class CCIfHvx128<CCAction A>
+ : CCIf<"State.getMachineFunction().getSubtarget<HexagonSubtarget>()"
+ ".useHVX128BOps()", A>;
+
+def CC_Hexagon_HVX: CallingConv<[
+ // HVX 64-byte mode
+ CCIfHvx64<
+ CCIfType<[v16i32,v32i16,v64i8],
+ CCAssignToReg<[V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15]>>>,
+ CCIfHvx64<
+ CCIfType<[v32i32,v64i16,v128i8],
+ CCAssignToReg<[W0,W1,W2,W3,W4,W5,W6,W7]>>>,
+ CCIfHvx64<
+ CCIfType<[v16i32,v32i16,v64i8],
+ CCAssignToStack<64,64>>>,
+ CCIfHvx64<
+ CCIfType<[v32i32,v64i16,v128i8],
+ CCAssignToStack<128,64>>>,
+
+ // HVX 128-byte mode
+ CCIfHvx128<
+ CCIfType<[v32i32,v64i16,v128i8],
+ CCAssignToReg<[V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15]>>>,
+ CCIfHvx128<
+ CCIfType<[v64i32,v128i16,v256i8],
+ CCAssignToReg<[W0,W1,W2,W3,W4,W5,W6,W7]>>>,
+ CCIfHvx128<
+ CCIfType<[v32i32,v64i16,v128i8],
+ CCAssignToStack<128,128>>>,
+ CCIfHvx128<
+ CCIfType<[v64i32,v128i16,v256i8],
+ CCAssignToStack<256,128>>>,
+
+ CCDelegateTo<CC_Hexagon>
+]>;
+
+def RetCC_Hexagon_HVX: CallingConv<[
+ // HVX 64-byte mode
+ CCIfHvx64<
+ CCIfType<[v16i32,v32i16,v64i8],
+ CCAssignToReg<[V0]>>>,
+ CCIfHvx64<
+ CCIfType<[v32i32,v64i16,v128i8],
+ CCAssignToReg<[W0]>>>,
+
+ // HVX 128-byte mode
+ CCIfHvx128<
+ CCIfType<[v32i32,v64i16,v128i8],
+ CCAssignToReg<[V0]>>>,
+ CCIfHvx128<
+ CCIfType<[v64i32,v128i16,v256i8],
+ CCAssignToReg<[W0]>>>,
+
+ CCDelegateTo<RetCC_Hexagon>
+]>;
+
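
The new calling-convention description assigns 32-bit values to R0..R5, 64-bit values to the even/odd pairs D0..D2 (R1:0, R3:2, R5:4), and falls back to the stack, with the CC_SkipOdd hook allocating a skipped odd register so it is not handed to a later 32-bit value. A rough behavioural model of that assignment in plain C++ (register names are labels only, and the exact interaction with the real CCState machinery may differ):

#include <iostream>
#include <string>
#include <vector>

struct Assignment { std::string Loc; };

std::vector<Assignment> assignArgs(const std::vector<unsigned> &BitWidths) {
  std::vector<Assignment> Out;
  unsigned NextR = 0;       // next free 32-bit register R<NextR>, R0..R5 available
  unsigned StackOff = 0;
  for (unsigned W : BitWidths) {
    if (W <= 32) {
      if (NextR <= 5) {
        Out.push_back({"R" + std::to_string(NextR++)});
      } else {
        Out.push_back({"stack+" + std::to_string(StackOff)});
        StackOff += 4;
      }
    } else {                // 64-bit value: needs the even/odd pair D(NextR/2)
      if (NextR % 2)        // burn the odd register, as CC_SkipOdd does
        ++NextR;
      if (NextR + 1 <= 5) {
        Out.push_back({"D" + std::to_string(NextR / 2)});
        NextR += 2;
      } else {
        StackOff = (StackOff + 7) & ~7u;    // 8-byte aligned stack slot
        Out.push_back({"stack+" + std::to_string(StackOff)});
        StackOff += 8;
      }
    }
  }
  return Out;
}

int main() {
  for (const Assignment &A : assignArgs({32, 64, 32, 64, 32}))
    std::cout << A.Loc << ' ';
  std::cout << '\n';        // prints: R0 D1 R4 stack+0 stack+8
}

The HVX conventions follow the same shape with V/W registers and 64- or 128-byte stack slots, selected by the CCIfHvx64/CCIfHvx128 predicates.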
diff --git a/lib/Target/Hexagon/HexagonCommonGEP.cpp b/lib/Target/Hexagon/HexagonCommonGEP.cpp
index 7e3d049d337f..f315e24eba62 100644
--- a/lib/Target/Hexagon/HexagonCommonGEP.cpp
+++ b/lib/Target/Hexagon/HexagonCommonGEP.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -36,7 +37,6 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
@@ -342,7 +342,7 @@ bool HexagonCommonGEP::isHandledGepForm(GetElementPtrInst *GepI) {
void HexagonCommonGEP::processGepInst(GetElementPtrInst *GepI,
ValueToNodeMap &NM) {
- DEBUG(dbgs() << "Visiting GEP: " << *GepI << '\n');
+ LLVM_DEBUG(dbgs() << "Visiting GEP: " << *GepI << '\n');
GepNode *N = new (*Mem) GepNode;
Value *PtrOp = GepI->getPointerOperand();
uint32_t InBounds = GepI->isInBounds() ? GepNode::InBounds : 0;
@@ -426,7 +426,7 @@ void HexagonCommonGEP::collect() {
}
}
- DEBUG(dbgs() << "Gep nodes after initial collection:\n" << Nodes);
+ LLVM_DEBUG(dbgs() << "Gep nodes after initial collection:\n" << Nodes);
}
static void invert_find_roots(const NodeVect &Nodes, NodeChildrenMap &NCM,
@@ -575,7 +575,7 @@ void HexagonCommonGEP::common() {
}
}
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Gep node equality:\n";
for (NodePairSet::iterator I = Eq.begin(), E = Eq.end(); I != E; ++I)
dbgs() << "{ " << I->first << ", " << I->second << " }\n";
@@ -642,7 +642,7 @@ void HexagonCommonGEP::common() {
N->Parent = Rep;
}
- DEBUG(dbgs() << "Gep nodes after commoning:\n" << Nodes);
+ LLVM_DEBUG(dbgs() << "Gep nodes after commoning:\n" << Nodes);
// Finally, erase the nodes that are no longer used.
NodeSet Erase;
@@ -662,35 +662,35 @@ void HexagonCommonGEP::common() {
NodeVect::iterator NewE = remove_if(Nodes, in_set(Erase));
Nodes.resize(std::distance(Nodes.begin(), NewE));
- DEBUG(dbgs() << "Gep nodes after post-commoning cleanup:\n" << Nodes);
+ LLVM_DEBUG(dbgs() << "Gep nodes after post-commoning cleanup:\n" << Nodes);
}
template <typename T>
static BasicBlock *nearest_common_dominator(DominatorTree *DT, T &Blocks) {
- DEBUG({
- dbgs() << "NCD of {";
- for (typename T::iterator I = Blocks.begin(), E = Blocks.end();
- I != E; ++I) {
- if (!*I)
- continue;
- BasicBlock *B = cast<BasicBlock>(*I);
- dbgs() << ' ' << B->getName();
- }
- dbgs() << " }\n";
- });
+ LLVM_DEBUG({
+ dbgs() << "NCD of {";
+ for (typename T::iterator I = Blocks.begin(), E = Blocks.end(); I != E;
+ ++I) {
+ if (!*I)
+ continue;
+ BasicBlock *B = cast<BasicBlock>(*I);
+ dbgs() << ' ' << B->getName();
+ }
+ dbgs() << " }\n";
+ });
- // Allow null basic blocks in Blocks. In such cases, return nullptr.
- typename T::iterator I = Blocks.begin(), E = Blocks.end();
- if (I == E || !*I)
+ // Allow null basic blocks in Blocks. In such cases, return nullptr.
+ typename T::iterator I = Blocks.begin(), E = Blocks.end();
+ if (I == E || !*I)
+ return nullptr;
+ BasicBlock *Dom = cast<BasicBlock>(*I);
+ while (++I != E) {
+ BasicBlock *B = cast_or_null<BasicBlock>(*I);
+ Dom = B ? DT->findNearestCommonDominator(Dom, B) : nullptr;
+ if (!Dom)
return nullptr;
- BasicBlock *Dom = cast<BasicBlock>(*I);
- while (++I != E) {
- BasicBlock *B = cast_or_null<BasicBlock>(*I);
- Dom = B ? DT->findNearestCommonDominator(Dom, B) : nullptr;
- if (!Dom)
- return nullptr;
}
- DEBUG(dbgs() << "computed:" << Dom->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "computed:" << Dom->getName() << '\n');
return Dom;
}
@@ -753,7 +753,7 @@ static bool is_empty(const BasicBlock *B) {
BasicBlock *HexagonCommonGEP::recalculatePlacement(GepNode *Node,
NodeChildrenMap &NCM, NodeToValueMap &Loc) {
- DEBUG(dbgs() << "Loc for node:" << Node << '\n');
+ LLVM_DEBUG(dbgs() << "Loc for node:" << Node << '\n');
// Recalculate the placement for Node, assuming that the locations of
// its children in Loc are valid.
// Return nullptr if there is no valid placement for Node (for example, it
@@ -820,7 +820,7 @@ BasicBlock *HexagonCommonGEP::recalculatePlacement(GepNode *Node,
BasicBlock *HexagonCommonGEP::recalculatePlacementRec(GepNode *Node,
NodeChildrenMap &NCM, NodeToValueMap &Loc) {
- DEBUG(dbgs() << "LocRec begin for node:" << Node << '\n');
+ LLVM_DEBUG(dbgs() << "LocRec begin for node:" << Node << '\n');
// Recalculate the placement of Node, after recursively recalculating the
// placements of all its children.
NodeChildrenMap::iterator CF = NCM.find(Node);
@@ -830,7 +830,7 @@ BasicBlock *HexagonCommonGEP::recalculatePlacementRec(GepNode *Node,
recalculatePlacementRec(*I, NCM, Loc);
}
BasicBlock *LB = recalculatePlacement(Node, NCM, Loc);
- DEBUG(dbgs() << "LocRec end for node:" << Node << '\n');
+ LLVM_DEBUG(dbgs() << "LocRec end for node:" << Node << '\n');
return LB;
}
@@ -952,8 +952,8 @@ namespace {
void HexagonCommonGEP::separateChainForNode(GepNode *Node, Use *U,
NodeToValueMap &Loc) {
User *R = U->getUser();
- DEBUG(dbgs() << "Separating chain for node (" << Node << ") user: "
- << *R << '\n');
+ LLVM_DEBUG(dbgs() << "Separating chain for node (" << Node << ") user: " << *R
+ << '\n');
BasicBlock *PB = cast<Instruction>(R)->getParent();
GepNode *N = Node;
@@ -996,7 +996,7 @@ void HexagonCommonGEP::separateChainForNode(GepNode *Node, Use *U,
// Should at least have U in NewUs.
NewNode->Flags |= GepNode::Used;
- DEBUG(dbgs() << "new node: " << NewNode << " " << *NewNode << '\n');
+ LLVM_DEBUG(dbgs() << "new node: " << NewNode << " " << *NewNode << '\n');
assert(!NewUs.empty());
Uses[NewNode] = NewUs;
}
@@ -1007,7 +1007,7 @@ void HexagonCommonGEP::separateConstantChains(GepNode *Node,
NodeSet Ns;
nodes_for_root(Node, NCM, Ns);
- DEBUG(dbgs() << "Separating constant chains for node: " << Node << '\n');
+ LLVM_DEBUG(dbgs() << "Separating constant chains for node: " << Node << '\n');
// Collect all used nodes together with the uses from loads and stores,
// where the GEP node could be folded into the load/store instruction.
NodeToUsesMap FNs; // Foldable nodes.
@@ -1044,7 +1044,7 @@ void HexagonCommonGEP::separateConstantChains(GepNode *Node,
FNs.insert(std::make_pair(N, LSs));
}
- DEBUG(dbgs() << "Nodes with foldable users:\n" << FNs);
+ LLVM_DEBUG(dbgs() << "Nodes with foldable users:\n" << FNs);
for (NodeToUsesMap::iterator I = FNs.begin(), E = FNs.end(); I != E; ++I) {
GepNode *N = I->first;
@@ -1066,32 +1066,33 @@ void HexagonCommonGEP::computeNodePlacement(NodeToValueMap &Loc) {
for (NodeVect::iterator I = Roots.begin(), E = Roots.end(); I != E; ++I)
recalculatePlacementRec(*I, NCM, Loc);
- DEBUG(dbgs() << "Initial node placement:\n" << LocationAsBlock(Loc));
+ LLVM_DEBUG(dbgs() << "Initial node placement:\n" << LocationAsBlock(Loc));
if (OptEnableInv) {
for (NodeVect::iterator I = Roots.begin(), E = Roots.end(); I != E; ++I)
adjustForInvariance(*I, NCM, Loc);
- DEBUG(dbgs() << "Node placement after adjustment for invariance:\n"
- << LocationAsBlock(Loc));
+ LLVM_DEBUG(dbgs() << "Node placement after adjustment for invariance:\n"
+ << LocationAsBlock(Loc));
}
if (OptEnableConst) {
for (NodeVect::iterator I = Roots.begin(), E = Roots.end(); I != E; ++I)
separateConstantChains(*I, NCM, Loc);
}
- DEBUG(dbgs() << "Node use information:\n" << Uses);
+ LLVM_DEBUG(dbgs() << "Node use information:\n" << Uses);
// At the moment, there is no further refinement of the initial placement.
// Such a refinement could include splitting the nodes if they are placed
// too far from some of its users.
- DEBUG(dbgs() << "Final node placement:\n" << LocationAsBlock(Loc));
+ LLVM_DEBUG(dbgs() << "Final node placement:\n" << LocationAsBlock(Loc));
}
Value *HexagonCommonGEP::fabricateGEP(NodeVect &NA, BasicBlock::iterator At,
BasicBlock *LocB) {
- DEBUG(dbgs() << "Fabricating GEP in " << LocB->getName()
- << " for nodes:\n" << NA);
+ LLVM_DEBUG(dbgs() << "Fabricating GEP in " << LocB->getName()
+ << " for nodes:\n"
+ << NA);
unsigned Num = NA.size();
GepNode *RN = NA[0];
assert((RN->Flags & GepNode::Root) && "Creating GEP for non-root");
@@ -1128,7 +1129,7 @@ Value *HexagonCommonGEP::fabricateGEP(NodeVect &NA, BasicBlock::iterator At,
Type *ElTy = cast<PointerType>(InpTy->getScalarType())->getElementType();
NewInst = GetElementPtrInst::Create(ElTy, Input, A, "cgep", &*At);
NewInst->setIsInBounds(RN->Flags & GepNode::InBounds);
- DEBUG(dbgs() << "new GEP: " << *NewInst << '\n');
+ LLVM_DEBUG(dbgs() << "new GEP: " << *NewInst << '\n');
Input = NewInst;
} while (nax <= Num);
@@ -1161,7 +1162,7 @@ void HexagonCommonGEP::getAllUsersForNode(GepNode *Node, ValueVect &Values,
}
void HexagonCommonGEP::materialize(NodeToValueMap &Loc) {
- DEBUG(dbgs() << "Nodes before materialization:\n" << Nodes << '\n');
+ LLVM_DEBUG(dbgs() << "Nodes before materialization:\n" << Nodes << '\n');
NodeChildrenMap NCM;
NodeVect Roots;
// Compute the inversion again, since computing placement could alter
diff --git a/lib/Target/Hexagon/HexagonConstExtenders.cpp b/lib/Target/Hexagon/HexagonConstExtenders.cpp
index 294a6da69f51..cbce61bc63c9 100644
--- a/lib/Target/Hexagon/HexagonConstExtenders.cpp
+++ b/lib/Target/Hexagon/HexagonConstExtenders.cpp
@@ -39,31 +39,57 @@ namespace llvm {
FunctionPass *createHexagonConstExtenders();
}
+static int32_t adjustUp(int32_t V, uint8_t A, uint8_t O) {
+ assert(isPowerOf2_32(A));
+ int32_t U = (V & -A) + O;
+ return U >= V ? U : U+A;
+}
+
+static int32_t adjustDown(int32_t V, uint8_t A, uint8_t O) {
+ assert(isPowerOf2_32(A));
+ int32_t U = (V & -A) + O;
+ return U <= V ? U : U-A;
+}
+
namespace {
struct OffsetRange {
+ // The range of values between Min and Max that are of form Align*N+Offset,
+ // for some integer N. Min and Max are required to be of that form as well,
+ // except in the case of an empty range.
int32_t Min = INT_MIN, Max = INT_MAX;
uint8_t Align = 1;
+ uint8_t Offset = 0;
OffsetRange() = default;
- OffsetRange(int32_t L, int32_t H, uint8_t A)
- : Min(L), Max(H), Align(A) {}
+ OffsetRange(int32_t L, int32_t H, uint8_t A, uint8_t O = 0)
+ : Min(L), Max(H), Align(A), Offset(O) {}
OffsetRange &intersect(OffsetRange A) {
- Align = std::max(Align, A.Align);
- Min = std::max(Min, A.Min);
- Max = std::min(Max, A.Max);
+ if (Align < A.Align)
+ std::swap(*this, A);
+
+ // Align >= A.Align.
+ if (Offset >= A.Offset && (Offset - A.Offset) % A.Align == 0) {
+ Min = adjustUp(std::max(Min, A.Min), Align, Offset);
+ Max = adjustDown(std::min(Max, A.Max), Align, Offset);
+ } else {
+ // Make an empty range.
+ Min = 0;
+ Max = -1;
+ }
// Canonicalize empty ranges.
if (Min > Max)
std::tie(Min, Max, Align) = std::make_tuple(0, -1, 1);
return *this;
}
OffsetRange &shift(int32_t S) {
- assert(alignTo(std::abs(S), Align) == uint64_t(std::abs(S)));
Min += S;
Max += S;
+ Offset = (Offset+S) % Align;
return *this;
}
OffsetRange &extendBy(int32_t D) {
// If D < 0, extend Min, otherwise extend Max.
+ assert(D % Align == 0);
if (D < 0)
Min = (INT_MIN-D < Min) ? Min+D : INT_MIN;
else
@@ -74,7 +100,7 @@ namespace {
return Min > Max;
}
bool contains(int32_t V) const {
- return Min <= V && V <= Max && (V % Align) == 0;
+ return Min <= V && V <= Max && (V-Offset) % Align == 0;
}
bool operator==(const OffsetRange &R) const {
return Min == R.Min && Max == R.Max && Align == R.Align;
@@ -408,7 +434,8 @@ namespace {
raw_ostream &operator<< (raw_ostream &OS, const OffsetRange &OR) {
if (OR.Min > OR.Max)
OS << '!';
- OS << '[' << OR.Min << ',' << OR.Max << "]a" << unsigned(OR.Align);
+ OS << '[' << OR.Min << ',' << OR.Max << "]a" << unsigned(OR.Align)
+ << '+' << unsigned(OR.Offset);
return OS;
}
@@ -703,9 +730,21 @@ bool HCE::ExtRoot::operator< (const HCE::ExtRoot &ER) const {
}
case MachineOperand::MO_ExternalSymbol:
return StringRef(V.SymbolName) < StringRef(ER.V.SymbolName);
- case MachineOperand::MO_GlobalAddress:
- assert(V.GV->hasName() && ER.V.GV->hasName());
- return V.GV->getName() < ER.V.GV->getName();
+ case MachineOperand::MO_GlobalAddress: {
+ // Global values may not have names, so compare their positions
+ // in the parent module.
+ const Module &M = *V.GV->getParent();
+ auto FindPos = [&M] (const GlobalValue &V) {
+ unsigned P = 0;
+ for (const GlobalValue &T : M.global_values()) {
+ if (&T == &V)
+ return P;
+ P++;
+ }
+ llvm_unreachable("Global value not found in module");
+ };
+ return FindPos(*V.GV) < FindPos(*ER.V.GV);
+ }
case MachineOperand::MO_BlockAddress: {
const BasicBlock *ThisB = V.BA->getBasicBlock();
const BasicBlock *OtherB = ER.V.BA->getBasicBlock();
@@ -999,15 +1038,19 @@ unsigned HCE::getDirectRegReplacement(unsigned ExtOpc) const {
return 0;
}
-// Return the allowable deviation from the current value of Rb which the
+// Return the allowable deviation from the current value of Rb (i.e. the
+// range of values that can be added to the current value) which the
// instruction MI can accommodate.
// The instruction MI is a user of register Rb, which is defined via an
// extender. It may be possible for MI to be tweaked to work for a register
// defined with a slightly different value. For example
-// ... = L2_loadrub_io Rb, 0
+// ... = L2_loadrub_io Rb, 1
// can be modifed to be
-// ... = L2_loadrub_io Rb', 1
-// if Rb' = Rb-1.
+// ... = L2_loadrub_io Rb', 0
+// if Rb' = Rb+1.
+// The range for Rb would be [Min+1, Max+1], where [Min, Max] is a range
+// for L2_loadrub with offset 0. That means that Rb could be replaced with
+// Rc, where Rc-Rb belongs to [Min+1, Max+1].
OffsetRange HCE::getOffsetRange(Register Rb, const MachineInstr &MI) const {
unsigned Opc = MI.getOpcode();
// Instructions that are constant-extended may be replaced with something
@@ -1109,6 +1152,13 @@ void HCE::recordExtender(MachineInstr &MI, unsigned OpNum) {
bool IsLoad = MI.mayLoad();
bool IsStore = MI.mayStore();
+ // Fixed stack slots have negative indexes, and they cannot be used
+ // with TRI::stackSlot2Index and TRI::index2StackSlot. This is somewhat
+ // unfortunate, but should not be a frequent thing.
+ for (MachineOperand &Op : MI.operands())
+ if (Op.isFI() && Op.getIndex() < 0)
+ return;
+
if (IsLoad || IsStore) {
unsigned AM = HII->getAddrMode(MI);
switch (AM) {
@@ -1220,7 +1270,7 @@ void HCE::assignInits(const ExtRoot &ER, unsigned Begin, unsigned End,
if (!ED.IsDef)
continue;
ExtValue EV(ED);
- DEBUG(dbgs() << " =" << I << ". " << EV << " " << ED << '\n');
+ LLVM_DEBUG(dbgs() << " =" << I << ". " << EV << " " << ED << '\n');
assert(ED.Rd.Reg != 0);
Ranges[I-Begin] = getOffsetRange(ED.Rd).shift(EV.Offset);
// A2_tfrsi is a special case: it will be replaced with A2_addi, which
@@ -1240,7 +1290,7 @@ void HCE::assignInits(const ExtRoot &ER, unsigned Begin, unsigned End,
if (ED.IsDef)
continue;
ExtValue EV(ED);
- DEBUG(dbgs() << " " << I << ". " << EV << " " << ED << '\n');
+ LLVM_DEBUG(dbgs() << " " << I << ". " << EV << " " << ED << '\n');
OffsetRange Dev = getOffsetRange(ED);
Ranges[I-Begin].intersect(Dev.shift(EV.Offset));
}
@@ -1252,7 +1302,7 @@ void HCE::assignInits(const ExtRoot &ER, unsigned Begin, unsigned End,
for (unsigned I = Begin; I != End; ++I)
RangeMap[Ranges[I-Begin]].insert(I);
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Ranges\n";
for (unsigned I = Begin; I != End; ++I)
dbgs() << " " << I << ". " << Ranges[I-Begin] << '\n';
@@ -1280,11 +1330,17 @@ void HCE::assignInits(const ExtRoot &ER, unsigned Begin, unsigned End,
SmallVector<RangeTree::Node*,8> Nodes;
Tree.order(Nodes);
- auto MaxAlign = [](const SmallVectorImpl<RangeTree::Node*> &Nodes) {
- uint8_t Align = 1;
- for (RangeTree::Node *N : Nodes)
- Align = std::max(Align, N->Range.Align);
- return Align;
+ auto MaxAlign = [](const SmallVectorImpl<RangeTree::Node*> &Nodes,
+ uint8_t Align, uint8_t Offset) {
+ for (RangeTree::Node *N : Nodes) {
+ if (N->Range.Align <= Align || N->Range.Offset < Offset)
+ continue;
+ if ((N->Range.Offset - Offset) % Align != 0)
+ continue;
+ Align = N->Range.Align;
+ Offset = N->Range.Offset;
+ }
+ return std::make_pair(Align, Offset);
};
// Construct the set of all potential definition points from the endpoints
@@ -1294,14 +1350,14 @@ void HCE::assignInits(const ExtRoot &ER, unsigned Begin, unsigned End,
std::set<int32_t> CandSet;
for (RangeTree::Node *N : Nodes) {
const OffsetRange &R = N->Range;
- uint8_t A0 = MaxAlign(Tree.nodesWith(R.Min, false));
+ auto P0 = MaxAlign(Tree.nodesWith(R.Min, false), R.Align, R.Offset);
CandSet.insert(R.Min);
- if (R.Align < A0)
- CandSet.insert(R.Min < 0 ? -alignDown(-R.Min, A0) : alignTo(R.Min, A0));
- uint8_t A1 = MaxAlign(Tree.nodesWith(R.Max, false));
+ if (R.Align < P0.first)
+ CandSet.insert(adjustUp(R.Min, P0.first, P0.second));
+ auto P1 = MaxAlign(Tree.nodesWith(R.Max, false), R.Align, R.Offset);
CandSet.insert(R.Max);
- if (R.Align < A1)
- CandSet.insert(R.Max < 0 ? -alignTo(-R.Max, A1) : alignDown(R.Max, A1));
+ if (R.Align < P1.first)
+ CandSet.insert(adjustDown(R.Max, P1.first, P1.second));
}
// Build the assignment map: candidate C -> { list of extender indexes }.
@@ -1340,7 +1396,7 @@ void HCE::assignInits(const ExtRoot &ER, unsigned Begin, unsigned End,
}
}
- DEBUG(dbgs() << "IMap (before fixup) = " << PrintIMap(IMap, *HRI));
+ LLVM_DEBUG(dbgs() << "IMap (before fixup) = " << PrintIMap(IMap, *HRI));
// There is some ambiguity in what initializer should be used, if the
// descriptor's subexpression is non-trivial: it can be the entire
@@ -1359,10 +1415,50 @@ void HCE::assignInits(const ExtRoot &ER, unsigned Begin, unsigned End,
AssignmentMap::iterator F = IMap.find({EV, ExtExpr()});
if (F == IMap.end())
continue;
+
// Finally, check if all extenders have the same value as the initializer.
- auto SameValue = [&EV,this](unsigned I) {
+ // Make sure that extenders that are a part of a stack address are not
+ // merged with those that aren't. Stack addresses need an offset field
+ // (to be used by frame index elimination), while non-stack expressions
+ // can be replaced with forms (such as rr) that do not have such a field.
+ // Example:
+ //
+ // Collected 3 extenders
+ // =2. imm:0 off:32968 bb#2: %7 = ## + __ << 0, def
+ // 0. imm:0 off:267 bb#0: __ = ## + SS#1 << 0
+ // 1. imm:0 off:267 bb#1: __ = ## + SS#1 << 0
+ // Ranges
+ // 0. [-756,267]a1+0
+ // 1. [-756,267]a1+0
+ // 2. [201,65735]a1+0
+ // RangeMap
+ // [-756,267]a1+0 -> 0 1
+ // [201,65735]a1+0 -> 2
+ // IMap (before fixup) = {
+ // [imm:0 off:267, ## + __ << 0] -> { 2 }
+ // [imm:0 off:267, ## + SS#1 << 0] -> { 0 1 }
+ // }
+ // IMap (after fixup) = {
+ // [imm:0 off:267, ## + __ << 0] -> { 2 0 1 }
+ // [imm:0 off:267, ## + SS#1 << 0] -> { }
+ // }
+ // Inserted def in bb#0 for initializer: [imm:0 off:267, ## + __ << 0]
+ // %12:intregs = A2_tfrsi 267
+ //
+ // The result was
+ // %12:intregs = A2_tfrsi 267
+ // S4_pstorerbt_rr %3, %12, %stack.1, 0, killed %4
+ // Which became
+ // r0 = #267
+ // if (p0.new) memb(r0+r29<<#4) = r2
+
+ bool IsStack = any_of(F->second, [this](unsigned I) {
+ return Extenders[I].Expr.Rs.isSlot();
+ });
+ auto SameValue = [&EV,this,IsStack](unsigned I) {
const ExtDesc &ED = Extenders[I];
- return ExtValue(ED).Offset == EV.Offset;
+ return ED.Expr.Rs.isSlot() == IsStack &&
+ ExtValue(ED).Offset == EV.Offset;
};
if (all_of(P.second, SameValue)) {
F->second.insert(P.second.begin(), P.second.end());
@@ -1370,7 +1466,7 @@ void HCE::assignInits(const ExtRoot &ER, unsigned Begin, unsigned End,
}
}
- DEBUG(dbgs() << "IMap (after fixup) = " << PrintIMap(IMap, *HRI));
+ LLVM_DEBUG(dbgs() << "IMap (after fixup) = " << PrintIMap(IMap, *HRI));
}
void HCE::calculatePlacement(const ExtenderInit &ExtI, const IndexList &Refs,
@@ -1473,9 +1569,9 @@ HCE::Register HCE::insertInitializer(Loc DefL, const ExtenderInit &ExtI) {
assert(InitI);
(void)InitI;
- DEBUG(dbgs() << "Inserted def in bb#" << MBB.getNumber()
- << " for initializer: " << PrintInit(ExtI, *HRI)
- << "\n " << *InitI);
+ LLVM_DEBUG(dbgs() << "Inserted def in bb#" << MBB.getNumber()
+ << " for initializer: " << PrintInit(ExtI, *HRI) << "\n "
+ << *InitI);
return { DefR, 0 };
}
@@ -1618,7 +1714,7 @@ bool HCE::replaceInstrExpr(const ExtDesc &ED, const ExtenderInit &ExtI,
assert(IdxOpc == Hexagon::A2_addi);
// Clamp Diff to the 16 bit range.
- int32_t D = isInt<16>(Diff) ? Diff : (Diff > 32767 ? 32767 : -32767);
+ int32_t D = isInt<16>(Diff) ? Diff : (Diff > 0 ? 32767 : -32768);
BuildMI(MBB, At, dl, HII->get(IdxOpc))
.add(MI.getOperand(0))
.add(MachineOperand(ExtR))
@@ -1626,11 +1722,13 @@ bool HCE::replaceInstrExpr(const ExtDesc &ED, const ExtenderInit &ExtI,
Diff -= D;
#ifndef NDEBUG
// Make sure the output is within allowable range for uses.
+ // "Diff" is a difference in the "opposite direction", i.e. Ext - DefV,
+ // not DefV - Ext, as the getOffsetRange would calculate.
OffsetRange Uses = getOffsetRange(MI.getOperand(0));
- if (!Uses.contains(Diff))
- dbgs() << "Diff: " << Diff << " out of range " << Uses
+ if (!Uses.contains(-Diff))
+ dbgs() << "Diff: " << -Diff << " out of range " << Uses
<< " for " << MI;
- assert(Uses.contains(Diff));
+ assert(Uses.contains(-Diff));
#endif
MBB.erase(MI);
return true;
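
  // The hunk above corrects the saturation of an out-of-range Diff (the old
  // code clamped negative values to -32767 rather than -32768) and notes that
  // the residual difference is checked against the use range with its sign
  // flipped. A minimal standalone sketch of that saturating clamp follows;
  // clampToS16 is an illustrative name, not an LLVM API.
  #include <cassert>
  #include <cstdint>

  static int32_t clampToS16(int32_t Diff) {
    // Equivalent of isInt<16>(Diff): the value fits in a signed 16-bit field.
    if (Diff >= -32768 && Diff <= 32767)
      return Diff;
    // Out of range: saturate toward the correct end, noting the asymmetric
    // limits of a signed 16-bit field (-32768 .. 32767).
    return Diff > 0 ? 32767 : -32768;
  }

  int main() {
    assert(clampToS16(100) == 100);
    assert(clampToS16(40000) == 32767);   // residual becomes 40000 - 32767
    assert(clampToS16(-40000) == -32768); // residual becomes -40000 - (-32768)
    return 0;
  }
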
@@ -1726,8 +1824,8 @@ bool HCE::replaceInstr(unsigned Idx, Register ExtR, const ExtenderInit &ExtI) {
ExtValue EV(ED);
int32_t Diff = EV.Offset - DefV.Offset;
const MachineInstr &MI = *ED.UseMI;
- DEBUG(dbgs() << __func__ << " Idx:" << Idx << " ExtR:"
- << PrintRegister(ExtR, *HRI) << " Diff:" << Diff << '\n');
+ LLVM_DEBUG(dbgs() << __func__ << " Idx:" << Idx << " ExtR:"
+ << PrintRegister(ExtR, *HRI) << " Diff:" << Diff << '\n');
// These two addressing modes must be converted into indexed forms
// regardless of what the initializer looks like.
@@ -1833,7 +1931,7 @@ const MachineOperand &HCE::getStoredValueOp(const MachineInstr &MI) const {
bool HCE::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
- DEBUG(MF.print(dbgs() << "Before " << getPassName() << '\n', nullptr));
+ LLVM_DEBUG(MF.print(dbgs() << "Before " << getPassName() << '\n', nullptr));
HII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
HRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
@@ -1842,13 +1940,13 @@ bool HCE::runOnMachineFunction(MachineFunction &MF) {
AssignmentMap IMap;
collect(MF);
- std::sort(Extenders.begin(), Extenders.end(),
+ llvm::sort(Extenders.begin(), Extenders.end(),
[](const ExtDesc &A, const ExtDesc &B) {
return ExtValue(A) < ExtValue(B);
});
bool Changed = false;
- DEBUG(dbgs() << "Collected " << Extenders.size() << " extenders\n");
+ LLVM_DEBUG(dbgs() << "Collected " << Extenders.size() << " extenders\n");
for (unsigned I = 0, E = Extenders.size(); I != E; ) {
unsigned B = I;
const ExtRoot &T = Extenders[B].getOp();
@@ -1860,7 +1958,7 @@ bool HCE::runOnMachineFunction(MachineFunction &MF) {
Changed |= replaceExtenders(IMap);
}
- DEBUG({
+ LLVM_DEBUG({
if (Changed)
MF.print(dbgs() << "After " << getPassName() << '\n', nullptr);
else
diff --git a/lib/Target/Hexagon/HexagonConstPropagation.cpp b/lib/Target/Hexagon/HexagonConstPropagation.cpp
index 8ac96f3a4bfa..8f22a71dc1f3 100644
--- a/lib/Target/Hexagon/HexagonConstPropagation.cpp
+++ b/lib/Target/Hexagon/HexagonConstPropagation.cpp
@@ -617,7 +617,7 @@ void MachineConstPropagator::CellMap::print(raw_ostream &os,
void MachineConstPropagator::visitPHI(const MachineInstr &PN) {
const MachineBasicBlock *MB = PN.getParent();
unsigned MBN = MB->getNumber();
- DEBUG(dbgs() << "Visiting FI(" << printMBBReference(*MB) << "): " << PN);
+ LLVM_DEBUG(dbgs() << "Visiting FI(" << printMBBReference(*MB) << "): " << PN);
const MachineOperand &MD = PN.getOperand(0);
Register DefR(MD);
@@ -642,8 +642,8 @@ Bottomize:
const MachineBasicBlock *PB = PN.getOperand(i+1).getMBB();
unsigned PBN = PB->getNumber();
if (!EdgeExec.count(CFGEdge(PBN, MBN))) {
- DEBUG(dbgs() << " edge " << printMBBReference(*PB) << "->"
- << printMBBReference(*MB) << " not executable\n");
+ LLVM_DEBUG(dbgs() << " edge " << printMBBReference(*PB) << "->"
+ << printMBBReference(*MB) << " not executable\n");
continue;
}
const MachineOperand &SO = PN.getOperand(i);
@@ -658,8 +658,9 @@ Bottomize:
LatticeCell SrcC;
bool Eval = MCE.evaluate(UseR, Cells.get(UseR.Reg), SrcC);
- DEBUG(dbgs() << " edge from " << printMBBReference(*PB) << ": "
- << printReg(UseR.Reg, &MCE.TRI, UseR.SubReg) << SrcC << '\n');
+ LLVM_DEBUG(dbgs() << " edge from " << printMBBReference(*PB) << ": "
+ << printReg(UseR.Reg, &MCE.TRI, UseR.SubReg) << SrcC
+ << '\n');
Changed |= Eval ? DefC.meet(SrcC)
: DefC.setBottom();
Cells.update(DefR.Reg, DefC);
@@ -671,11 +672,11 @@ Bottomize:
}
void MachineConstPropagator::visitNonBranch(const MachineInstr &MI) {
- DEBUG(dbgs() << "Visiting MI(" << printMBBReference(*MI.getParent())
- << "): " << MI);
+ LLVM_DEBUG(dbgs() << "Visiting MI(" << printMBBReference(*MI.getParent())
+ << "): " << MI);
CellMap Outputs;
bool Eval = MCE.evaluate(MI, Cells, Outputs);
- DEBUG({
+ LLVM_DEBUG({
if (Eval) {
dbgs() << " outputs:";
for (auto &I : Outputs)
@@ -713,7 +714,7 @@ void MachineConstPropagator::visitNonBranch(const MachineInstr &MI) {
}
}
-// \brief Starting at a given branch, visit remaining branches in the block.
+// Starting at a given branch, visit remaining branches in the block.
// Traverse over the subsequent branches for as long as the preceding one
// can fall through. Add all the possible targets to the flow work queue,
// including the potential fall-through to the layout-successor block.
@@ -728,8 +729,8 @@ void MachineConstPropagator::visitBranchesFrom(const MachineInstr &BrI) {
while (It != End) {
const MachineInstr &MI = *It;
InstrExec.insert(&MI);
- DEBUG(dbgs() << "Visiting " << (EvalOk ? "BR" : "br") << "("
- << printMBBReference(B) << "): " << MI);
+ LLVM_DEBUG(dbgs() << "Visiting " << (EvalOk ? "BR" : "br") << "("
+ << printMBBReference(B) << "): " << MI);
// Do not evaluate subsequent branches if the evaluation of any of the
// previous branches failed. Keep iterating over the branches only
// to mark them as executable.
@@ -763,23 +764,23 @@ void MachineConstPropagator::visitBranchesFrom(const MachineInstr &BrI) {
// last one set "FallsThru", then add an edge to the layout successor
// to the targets.
Targets.clear();
- DEBUG(dbgs() << " failed to evaluate a branch...adding all CFG "
- "successors\n");
+ LLVM_DEBUG(dbgs() << " failed to evaluate a branch...adding all CFG "
+ "successors\n");
for (const MachineBasicBlock *SB : B.successors())
Targets.insert(SB);
}
for (const MachineBasicBlock *TB : Targets) {
unsigned TBN = TB->getNumber();
- DEBUG(dbgs() << " pushing edge " << printMBBReference(B) << " -> "
- << printMBBReference(*TB) << "\n");
+ LLVM_DEBUG(dbgs() << " pushing edge " << printMBBReference(B) << " -> "
+ << printMBBReference(*TB) << "\n");
FlowQ.push(CFGEdge(MBN, TBN));
}
}
void MachineConstPropagator::visitUsesOf(unsigned Reg) {
- DEBUG(dbgs() << "Visiting uses of " << printReg(Reg, &MCE.TRI)
- << Cells.get(Reg) << '\n');
+ LLVM_DEBUG(dbgs() << "Visiting uses of " << printReg(Reg, &MCE.TRI)
+ << Cells.get(Reg) << '\n');
for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) {
    // Do not process non-executable instructions. They can become executable
    // later (via a flow-edge in the work queue). In such a case, the instruc-
@@ -799,7 +800,7 @@ bool MachineConstPropagator::computeBlockSuccessors(const MachineBasicBlock *MB,
SetVector<const MachineBasicBlock*> &Targets) {
MachineBasicBlock::const_iterator FirstBr = MB->end();
for (const MachineInstr &MI : *MB) {
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
if (MI.isBranch()) {
FirstBr = MI.getIterator();
@@ -814,7 +815,7 @@ bool MachineConstPropagator::computeBlockSuccessors(const MachineBasicBlock *MB,
for (MachineBasicBlock::const_iterator I = FirstBr; I != End; ++I) {
const MachineInstr &MI = *I;
// Can there be debug instructions between branches?
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
if (!InstrExec.count(&MI))
continue;
@@ -870,10 +871,10 @@ void MachineConstPropagator::propagate(MachineFunction &MF) {
CFGEdge Edge = FlowQ.front();
FlowQ.pop();
- DEBUG(dbgs() << "Picked edge "
- << printMBBReference(*MF.getBlockNumbered(Edge.first)) << "->"
- << printMBBReference(*MF.getBlockNumbered(Edge.second))
- << '\n');
+ LLVM_DEBUG(
+ dbgs() << "Picked edge "
+ << printMBBReference(*MF.getBlockNumbered(Edge.first)) << "->"
+ << printMBBReference(*MF.getBlockNumbered(Edge.second)) << '\n');
if (Edge.first != EntryNum)
if (EdgeExec.count(Edge))
continue;
@@ -896,7 +897,7 @@ void MachineConstPropagator::propagate(MachineFunction &MF) {
// If the successor block just became executable, visit all instructions.
// To see if this is the first time we're visiting it, check the first
// non-debug instruction to see if it is executable.
- while (It != End && It->isDebugValue())
+ while (It != End && It->isDebugInstr())
++It;
assert(It == End || !It->isPHI());
// If this block has been visited, go on to the next one.
@@ -905,7 +906,7 @@ void MachineConstPropagator::propagate(MachineFunction &MF) {
// For now, scan all non-branch instructions. Branches require different
// processing.
while (It != End && !It->isBranch()) {
- if (!It->isDebugValue()) {
+ if (!It->isDebugInstr()) {
InstrExec.insert(&*It);
visitNonBranch(*It);
}
@@ -927,7 +928,7 @@ void MachineConstPropagator::propagate(MachineFunction &MF) {
}
} // while (FlowQ)
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Cells after propagation:\n";
Cells.print(dbgs(), MCE.TRI);
dbgs() << "Dead CFG edges:\n";
@@ -1042,7 +1043,7 @@ bool MachineConstPropagator::rewrite(MachineFunction &MF) {
// This is the constant propagation algorithm as described by Wegman-Zadeck.
// Most of the terminology comes from there.
bool MachineConstPropagator::run(MachineFunction &MF) {
- DEBUG(MF.print(dbgs() << "Starting MachineConstPropagator\n", nullptr));
+ LLVM_DEBUG(MF.print(dbgs() << "Starting MachineConstPropagator\n", nullptr));
MRI = &MF.getRegInfo();
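
  // Since the comment above names the Wegman-Zadeck formulation, here is a
  // minimal sketch of the flat constant lattice and its meet operation that
  // drives this style of sparse propagation. The Cell type is an illustrative
  // stand-in, not the pass's LatticeCell (which tracks richer value sets).
  #include <cstdint>

  struct Cell {
    enum Kind { Top, Const, Bottom } K = Top;
    int64_t Value = 0;

    // Combine information from another incoming value (e.g. a PHI input).
    // Returns true if this cell changed, which would re-queue its users.
    bool meet(const Cell &Other) {
      if (K == Bottom || Other.K == Top)
        return false;            // already at bottom, or no new information
      if (K == Top) {
        *this = Other;           // first real value seen: adopt it
        return true;
      }
      if (Other.K == Const && Other.Value == Value)
        return false;            // same constant: nothing changes
      K = Bottom;                // conflicting constants (or bottom) -> bottom
      return true;
    }
  };
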
@@ -1054,7 +1055,7 @@ bool MachineConstPropagator::run(MachineFunction &MF) {
propagate(MF);
bool Changed = rewrite(MF);
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "End of MachineConstPropagator (Changed=" << Changed << ")\n";
if (Changed)
MF.print(dbgs(), nullptr);
@@ -1880,10 +1881,7 @@ namespace {
public:
static char ID;
- HexagonConstPropagation() : MachineFunctionPass(ID) {
- PassRegistry &Registry = *PassRegistry::getPassRegistry();
- initializeHexagonConstPropagationPass(Registry);
- }
+ HexagonConstPropagation() : MachineFunctionPass(ID) {}
StringRef getPassName() const override {
return "Hexagon Constant Propagation";
@@ -1903,8 +1901,8 @@ namespace {
char HexagonConstPropagation::ID = 0;
-INITIALIZE_PASS(HexagonConstPropagation, "hcp", "Hexagon Constant Propagation",
- false, false)
+INITIALIZE_PASS(HexagonConstPropagation, "hexagon-constp",
+ "Hexagon Constant Propagation", false, false)
HexagonConstEvaluator::HexagonConstEvaluator(MachineFunction &Fn)
: MachineConstEvaluator(Fn),
@@ -2022,6 +2020,8 @@ bool HexagonConstEvaluator::evaluate(const MachineInstr &MI,
case Hexagon::A2_combineii: // combine(#s8Ext, #s8)
case Hexagon::A4_combineii: // combine(#s8, #u6Ext)
{
+ if (!MI.getOperand(1).isImm() || !MI.getOperand(2).isImm())
+ return false;
uint64_t Hi = MI.getOperand(1).getImm();
uint64_t Lo = MI.getOperand(2).getImm();
uint64_t Res = (Hi << 32) | (Lo & 0xFFFFFFFF);
@@ -2631,6 +2631,8 @@ bool HexagonConstEvaluator::evaluateHexLogical(const MachineInstr &MI,
Eval = evaluateANDrr(R1, Register(Src2), Inputs, RC);
break;
case Hexagon::A2_andir: {
+ if (!Src2.isImm())
+ return false;
APInt A(32, Src2.getImm(), true);
Eval = evaluateANDri(R1, A, Inputs, RC);
break;
@@ -2640,6 +2642,8 @@ bool HexagonConstEvaluator::evaluateHexLogical(const MachineInstr &MI,
Eval = evaluateORrr(R1, Register(Src2), Inputs, RC);
break;
case Hexagon::A2_orir: {
+ if (!Src2.isImm())
+ return false;
APInt A(32, Src2.getImm(), true);
Eval = evaluateORri(R1, A, Inputs, RC);
break;
@@ -2775,7 +2779,7 @@ bool HexagonConstEvaluator::rewriteHexConstDefs(MachineInstr &MI,
AllDefs = false;
// Some diagnostics.
- // DEBUG({...}) gets confused with all this code as an argument.
+ // LLVM_DEBUG({...}) gets confused with all this code as an argument.
#ifndef NDEBUG
bool Debugging = DebugFlag && isCurrentDebugType(DEBUG_TYPE);
if (Debugging) {
@@ -2920,7 +2924,7 @@ bool HexagonConstEvaluator::rewriteHexConstDefs(MachineInstr &MI,
ChangedNum++;
}
- DEBUG({
+ LLVM_DEBUG({
if (!NewInstrs.empty()) {
MachineFunction &MF = *MI.getParent()->getParent();
dbgs() << "In function: " << MF.getName() << "\n";
@@ -3087,7 +3091,7 @@ bool HexagonConstEvaluator::rewriteHexConstUses(MachineInstr &MI,
MO.setIsKill(false);
}
- DEBUG({
+ LLVM_DEBUG({
if (NewMI) {
dbgs() << "Rewrite: for " << MI;
if (NewMI != &MI)
@@ -3127,7 +3131,7 @@ bool HexagonConstEvaluator::rewriteHexBranch(MachineInstr &BrI,
if (BrI.getOpcode() == Hexagon::J2_jump)
return false;
- DEBUG(dbgs() << "Rewrite(" << printMBBReference(B) << "):" << BrI);
+ LLVM_DEBUG(dbgs() << "Rewrite(" << printMBBReference(B) << "):" << BrI);
bool Rewritten = false;
if (NumTargets > 0) {
assert(!FallsThru && "This should have been checked before");
diff --git a/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/lib/Target/Hexagon/HexagonCopyToCombine.cpp
index 087a77203fcb..fccde96d8a32 100644
--- a/lib/Target/Hexagon/HexagonCopyToCombine.cpp
+++ b/lib/Target/Hexagon/HexagonCopyToCombine.cpp
@@ -300,7 +300,7 @@ bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr &I1,
// * reads I2's def reg
// * or has unmodelled side effects
// we can't move I2 across it.
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
if (isUnsafeToMoveAcross(*I, I2UseReg, I2DestReg, TRI)) {
@@ -358,7 +358,7 @@ bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr &I1,
// to remove the implicit killed %d4 operand. For now, we are
// conservative and disallow the move.
// we can't move I1 across it.
- if (MI.isDebugValue()) {
+ if (MI.isDebugInstr()) {
if (MI.readsRegister(I1DestReg, TRI)) // Move this instruction after I2.
DbgMItoMove.push_back(&MI);
continue;
@@ -396,7 +396,7 @@ void
HexagonCopyToCombine::findPotentialNewifiableTFRs(MachineBasicBlock &BB) {
DenseMap<unsigned, MachineInstr *> LastDef;
for (MachineInstr &MI : BB) {
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
// Mark TFRs that feed a potential new value store as such.
@@ -423,7 +423,7 @@ HexagonCopyToCombine::findPotentialNewifiableTFRs(MachineBasicBlock &BB) {
MachineBasicBlock::iterator It(DefInst);
unsigned NumInstsToDef = 0;
while (&*It != &MI) {
- if (!It->isDebugValue())
+ if (!It->isDebugInstr())
++NumInstsToDef;
++It;
}
@@ -489,7 +489,7 @@ bool HexagonCopyToCombine::runOnMachineFunction(MachineFunction &MF) {
MI != End;) {
MachineInstr &I1 = *MI++;
- if (I1.isDebugValue())
+ if (I1.isDebugInstr())
continue;
// Don't combine a TFR whose user could be newified (instructions that
@@ -526,7 +526,7 @@ MachineInstr *HexagonCopyToCombine::findPairable(MachineInstr &I1,
bool &DoInsertAtI1,
bool AllowC64) {
MachineBasicBlock::iterator I2 = std::next(MachineBasicBlock::iterator(I1));
- while (I2 != I1.getParent()->end() && I2->isDebugValue())
+ while (I2 != I1.getParent()->end() && I2->isDebugInstr())
++I2;
unsigned I1DestReg = I1.getOperand(0).getReg();
@@ -649,7 +649,7 @@ void HexagonCopyToCombine::emitConst64(MachineBasicBlock::iterator &InsertPt,
unsigned DoubleDestReg,
MachineOperand &HiOperand,
MachineOperand &LoOperand) {
- DEBUG(dbgs() << "Found a CONST64\n");
+ LLVM_DEBUG(dbgs() << "Found a CONST64\n");
DebugLoc DL = InsertPt->getDebugLoc();
MachineBasicBlock *BB = InsertPt->getParent();
diff --git a/lib/Target/Hexagon/HexagonDepArch.td b/lib/Target/Hexagon/HexagonDepArch.td
index 87dcd966f2ed..3594379aa841 100644
--- a/lib/Target/Hexagon/HexagonDepArch.td
+++ b/lib/Target/Hexagon/HexagonDepArch.td
@@ -11,14 +11,14 @@
def ArchV65: SubtargetFeature<"v65", "HexagonArchVersion", "Hexagon::ArchEnum::V65", "Enable Hexagon V65 architecture">;
-def HasV65T : Predicate<"HST->hasV65TOps()">, AssemblerPredicate<"ArchV65">;
+def HasV65 : Predicate<"HST->hasV65Ops()">, AssemblerPredicate<"ArchV65">;
def ArchV62: SubtargetFeature<"v62", "HexagonArchVersion", "Hexagon::ArchEnum::V62", "Enable Hexagon V62 architecture">;
-def HasV62T : Predicate<"HST->hasV62TOps()">, AssemblerPredicate<"ArchV62">;
+def HasV62 : Predicate<"HST->hasV62Ops()">, AssemblerPredicate<"ArchV62">;
def ArchV60: SubtargetFeature<"v60", "HexagonArchVersion", "Hexagon::ArchEnum::V60", "Enable Hexagon V60 architecture">;
-def HasV60T : Predicate<"HST->hasV60TOps()">, AssemblerPredicate<"ArchV60">;
+def HasV60 : Predicate<"HST->hasV60Ops()">, AssemblerPredicate<"ArchV60">;
def ArchV55: SubtargetFeature<"v55", "HexagonArchVersion", "Hexagon::ArchEnum::V55", "Enable Hexagon V55 architecture">;
-def HasV55T : Predicate<"HST->hasV55TOps()">, AssemblerPredicate<"ArchV55">;
+def HasV55 : Predicate<"HST->hasV55Ops()">, AssemblerPredicate<"ArchV55">;
def ArchV4: SubtargetFeature<"v4", "HexagonArchVersion", "Hexagon::ArchEnum::V4", "Enable Hexagon V4 architecture">;
-def HasV4T : Predicate<"HST->hasV4TOps()">, AssemblerPredicate<"ArchV4">;
+def HasV4 : Predicate<"HST->hasV4Ops()">, AssemblerPredicate<"ArchV4">;
def ArchV5: SubtargetFeature<"v5", "HexagonArchVersion", "Hexagon::ArchEnum::V5", "Enable Hexagon V5 architecture">;
-def HasV5T : Predicate<"HST->hasV5TOps()">, AssemblerPredicate<"ArchV5">;
+def HasV5 : Predicate<"HST->hasV5Ops()">, AssemblerPredicate<"ArchV5">;
diff --git a/lib/Target/Hexagon/HexagonDepIICScalar.td b/lib/Target/Hexagon/HexagonDepIICScalar.td
index 083ec7753e04..931504b56ccb 100644
--- a/lib/Target/Hexagon/HexagonDepIICScalar.td
+++ b/lib/Target/Hexagon/HexagonDepIICScalar.td
@@ -10,21 +10,17 @@
//===----------------------------------------------------------------------===//
-def tc_0077f68c : InstrItinClass;
def tc_00afc57e : InstrItinClass;
def tc_00e7c26e : InstrItinClass;
def tc_03220ffa : InstrItinClass;
def tc_038a1342 : InstrItinClass;
def tc_04c9decc : InstrItinClass;
def tc_05b6c987 : InstrItinClass;
-def tc_0a2b8c7c : InstrItinClass;
def tc_0cd51c76 : InstrItinClass;
def tc_0dc560de : InstrItinClass;
def tc_0fc1ae07 : InstrItinClass;
def tc_10b97e27 : InstrItinClass;
-def tc_128f96e3 : InstrItinClass;
def tc_1372bca1 : InstrItinClass;
-def tc_1432937d : InstrItinClass;
def tc_14cd4cfa : InstrItinClass;
def tc_15411484 : InstrItinClass;
def tc_16d0d8d5 : InstrItinClass;
@@ -32,18 +28,14 @@ def tc_181af5d0 : InstrItinClass;
def tc_1853ea6d : InstrItinClass;
def tc_1b82a277 : InstrItinClass;
def tc_1b9c9ee5 : InstrItinClass;
-def tc_1c0005f9 : InstrItinClass;
def tc_1d5a38a8 : InstrItinClass;
def tc_1e856f58 : InstrItinClass;
-def tc_20280784 : InstrItinClass;
def tc_234a11a5 : InstrItinClass;
def tc_238d91d2 : InstrItinClass;
def tc_29175780 : InstrItinClass;
-def tc_29641329 : InstrItinClass;
def tc_2a160009 : InstrItinClass;
def tc_2b2f4060 : InstrItinClass;
def tc_2b6f77c6 : InstrItinClass;
-def tc_2e00db30 : InstrItinClass;
def tc_2f185f5c : InstrItinClass;
def tc_2fc0c436 : InstrItinClass;
def tc_351fed2d : InstrItinClass;
@@ -71,22 +63,19 @@ def tc_51b866be : InstrItinClass;
def tc_523fcf30 : InstrItinClass;
def tc_5274e61a : InstrItinClass;
def tc_52d7bbea : InstrItinClass;
-def tc_53173427 : InstrItinClass;
def tc_53bc8a6a : InstrItinClass;
def tc_53bdb2f6 : InstrItinClass;
def tc_540fdfbc : InstrItinClass;
def tc_55050d58 : InstrItinClass;
-def tc_56d25411 : InstrItinClass;
def tc_57288781 : InstrItinClass;
def tc_594ab548 : InstrItinClass;
+def tc_59a01ead : InstrItinClass;
def tc_5acef64a : InstrItinClass;
def tc_5ba5997d : InstrItinClass;
def tc_5eb851fc : InstrItinClass;
def tc_5f6847a1 : InstrItinClass;
def tc_60571023 : InstrItinClass;
def tc_609d2efe : InstrItinClass;
-def tc_60d76817 : InstrItinClass;
-def tc_60f5738d : InstrItinClass;
def tc_63fe3df7 : InstrItinClass;
def tc_66888ded : InstrItinClass;
def tc_6792d5ff : InstrItinClass;
@@ -96,6 +85,7 @@ def tc_6aa5711a : InstrItinClass;
def tc_6ac37025 : InstrItinClass;
def tc_6ebb4a12 : InstrItinClass;
def tc_6efc556e : InstrItinClass;
+def tc_6fa4db47 : InstrItinClass;
def tc_73043bf4 : InstrItinClass;
def tc_746baa8e : InstrItinClass;
def tc_74e47fd9 : InstrItinClass;
@@ -103,18 +93,16 @@ def tc_7934b9df : InstrItinClass;
def tc_7a830544 : InstrItinClass;
def tc_7f881c76 : InstrItinClass;
def tc_84df2cd3 : InstrItinClass;
-def tc_85523bcb : InstrItinClass;
def tc_855b0b61 : InstrItinClass;
def tc_87735c3b : InstrItinClass;
-def tc_88fa1a78 : InstrItinClass;
def tc_897d1a9d : InstrItinClass;
def tc_8b15472a : InstrItinClass;
-def tc_8bb285ec : InstrItinClass;
def tc_8fd5f294 : InstrItinClass;
def tc_8fe6b782 : InstrItinClass;
def tc_90f3e30c : InstrItinClass;
def tc_976ddc4f : InstrItinClass;
def tc_97743097 : InstrItinClass;
+def tc_994333cd : InstrItinClass;
def tc_999d32db : InstrItinClass;
def tc_99be14ca : InstrItinClass;
def tc_9c00ce8d : InstrItinClass;
@@ -133,7 +121,6 @@ def tc_adb14c66 : InstrItinClass;
def tc_b13761ae : InstrItinClass;
def tc_b166348b : InstrItinClass;
def tc_b44c6e2a : InstrItinClass;
-def tc_b5a33b22 : InstrItinClass;
def tc_b77c481f : InstrItinClass;
def tc_b7dd427e : InstrItinClass;
def tc_b9488031 : InstrItinClass;
@@ -141,7 +128,6 @@ def tc_b9c0b731 : InstrItinClass;
def tc_b9c4623f : InstrItinClass;
def tc_bad2bcaf : InstrItinClass;
def tc_bcc96cee : InstrItinClass;
-def tc_bd90564c : InstrItinClass;
def tc_bde7aaf4 : InstrItinClass;
def tc_be706f30 : InstrItinClass;
def tc_c2f7d806 : InstrItinClass;
@@ -166,24 +152,20 @@ def tc_d9f95eef : InstrItinClass;
def tc_daa058fa : InstrItinClass;
def tc_dbdffe3d : InstrItinClass;
def tc_e0739b8c : InstrItinClass;
-def tc_e1e0a2dc : InstrItinClass;
def tc_e1e99bfa : InstrItinClass;
def tc_e216a5db : InstrItinClass;
def tc_e421e012 : InstrItinClass;
-def tc_e6b38e01 : InstrItinClass;
def tc_e7624c08 : InstrItinClass;
def tc_e7d02c66 : InstrItinClass;
def tc_e913dc32 : InstrItinClass;
def tc_e9c822f7 : InstrItinClass;
def tc_e9fae2d6 : InstrItinClass;
-def tc_ef20db1c : InstrItinClass;
def tc_ef52ed71 : InstrItinClass;
def tc_ef84f62f : InstrItinClass;
def tc_f2704b9a : InstrItinClass;
def tc_f3eaa14b : InstrItinClass;
def tc_f47d212f : InstrItinClass;
def tc_f49e76f4 : InstrItinClass;
-def tc_f4f43fb5 : InstrItinClass;
def tc_f7dd9c9f : InstrItinClass;
def tc_f86c328a : InstrItinClass;
def tc_f8eeed7a : InstrItinClass;
@@ -192,21 +174,17 @@ def tc_ff9ee76e : InstrItinClass;
class DepScalarItinV4 {
list<InstrItinData> DepScalarItinV4_list = [
- InstrItinData <tc_0077f68c, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_00afc57e, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_00e7c26e, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_03220ffa, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_038a1342, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_04c9decc, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_05b6c987, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_0a2b8c7c, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_0cd51c76, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_0dc560de, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_0fc1ae07, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_10b97e27, [InstrStage<1, [SLOT2]>]>,
- InstrItinData <tc_128f96e3, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_1372bca1, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_1432937d, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_14cd4cfa, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_15411484, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_16d0d8d5, [InstrStage<1, [SLOT2, SLOT3]>]>,
@@ -214,18 +192,14 @@ class DepScalarItinV4 {
InstrItinData <tc_1853ea6d, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_1b82a277, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_1b9c9ee5, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_1c0005f9, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_1d5a38a8, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_1e856f58, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_20280784, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_234a11a5, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_238d91d2, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_29175780, [InstrStage<1, [SLOT3]>]>,
- InstrItinData <tc_29641329, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_2a160009, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_2b2f4060, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
InstrItinData <tc_2b6f77c6, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_2e00db30, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_2f185f5c, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_2fc0c436, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_351fed2d, [InstrStage<1, [SLOT2, SLOT3]>]>,
@@ -253,22 +227,19 @@ class DepScalarItinV4 {
InstrItinData <tc_523fcf30, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_5274e61a, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_52d7bbea, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData <tc_53173427, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_53bc8a6a, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_53bdb2f6, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_540fdfbc, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_55050d58, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_56d25411, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_57288781, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_594ab548, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_59a01ead, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_5acef64a, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_5ba5997d, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
InstrItinData <tc_5eb851fc, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_5f6847a1, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
InstrItinData <tc_60571023, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_609d2efe, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_60d76817, [InstrStage<1, [SLOT3]>]>,
- InstrItinData <tc_60f5738d, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_63fe3df7, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_66888ded, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_6792d5ff, [InstrStage<1, [SLOT2, SLOT3]>]>,
@@ -278,6 +249,7 @@ class DepScalarItinV4 {
InstrItinData <tc_6ac37025, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_6ebb4a12, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
InstrItinData <tc_6efc556e, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_6fa4db47, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_73043bf4, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_746baa8e, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_74e47fd9, [InstrStage<1, [SLOT0, SLOT1]>]>,
@@ -285,18 +257,16 @@ class DepScalarItinV4 {
InstrItinData <tc_7a830544, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_7f881c76, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_84df2cd3, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_85523bcb, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_855b0b61, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_87735c3b, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_88fa1a78, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_897d1a9d, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_8b15472a, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_8bb285ec, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_8fd5f294, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_8fe6b782, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
InstrItinData <tc_90f3e30c, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_976ddc4f, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_97743097, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_994333cd, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_999d32db, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_99be14ca, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_9c00ce8d, [InstrStage<1, [SLOT2, SLOT3]>]>,
@@ -315,7 +285,6 @@ class DepScalarItinV4 {
InstrItinData <tc_b13761ae, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_b166348b, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_b44c6e2a, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_b5a33b22, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_b77c481f, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_b7dd427e, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_b9488031, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
@@ -323,7 +292,6 @@ class DepScalarItinV4 {
InstrItinData <tc_b9c4623f, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_bad2bcaf, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_bcc96cee, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_bd90564c, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_bde7aaf4, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_be706f30, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_c2f7d806, [InstrStage<1, [SLOT2, SLOT3]>]>,
@@ -348,24 +316,20 @@ class DepScalarItinV4 {
InstrItinData <tc_daa058fa, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_dbdffe3d, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_e0739b8c, [InstrStage<1, [SLOT2]>]>,
- InstrItinData <tc_e1e0a2dc, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_e1e99bfa, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_e216a5db, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_e421e012, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_e6b38e01, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_e7624c08, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_e7d02c66, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_e913dc32, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_e9c822f7, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_e9fae2d6, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_ef20db1c, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_ef52ed71, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_ef84f62f, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_f2704b9a, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_f3eaa14b, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_f47d212f, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_f49e76f4, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_f4f43fb5, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_f7dd9c9f, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_f86c328a, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_f8eeed7a, [InstrStage<1, [SLOT2, SLOT3]>]>,
@@ -375,21 +339,17 @@ class DepScalarItinV4 {
class DepScalarItinV5 {
list<InstrItinData> DepScalarItinV5_list = [
- InstrItinData <tc_0077f68c, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_00afc57e, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_00e7c26e, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_03220ffa, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_038a1342, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_04c9decc, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_05b6c987, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_0a2b8c7c, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_0cd51c76, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_0dc560de, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_0fc1ae07, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_10b97e27, [InstrStage<1, [SLOT2]>]>,
- InstrItinData <tc_128f96e3, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_1372bca1, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_1432937d, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_14cd4cfa, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_15411484, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_16d0d8d5, [InstrStage<1, [SLOT2, SLOT3]>]>,
@@ -397,18 +357,14 @@ class DepScalarItinV5 {
InstrItinData <tc_1853ea6d, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_1b82a277, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_1b9c9ee5, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_1c0005f9, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_1d5a38a8, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_1e856f58, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_20280784, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_234a11a5, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_238d91d2, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_29175780, [InstrStage<1, [SLOT3]>]>,
- InstrItinData <tc_29641329, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_2a160009, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_2b2f4060, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
InstrItinData <tc_2b6f77c6, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_2e00db30, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_2f185f5c, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_2fc0c436, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_351fed2d, [InstrStage<1, [SLOT2, SLOT3]>]>,
@@ -436,22 +392,19 @@ class DepScalarItinV5 {
InstrItinData <tc_523fcf30, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_5274e61a, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_52d7bbea, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData <tc_53173427, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_53bc8a6a, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_53bdb2f6, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_540fdfbc, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_55050d58, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_56d25411, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_57288781, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_594ab548, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_59a01ead, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_5acef64a, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_5ba5997d, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
InstrItinData <tc_5eb851fc, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_5f6847a1, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
InstrItinData <tc_60571023, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_609d2efe, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_60d76817, [InstrStage<1, [SLOT3]>]>,
- InstrItinData <tc_60f5738d, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_63fe3df7, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_66888ded, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_6792d5ff, [InstrStage<1, [SLOT2, SLOT3]>]>,
@@ -461,6 +414,7 @@ class DepScalarItinV5 {
InstrItinData <tc_6ac37025, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_6ebb4a12, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
InstrItinData <tc_6efc556e, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_6fa4db47, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_73043bf4, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_746baa8e, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_74e47fd9, [InstrStage<1, [SLOT0, SLOT1]>]>,
@@ -468,18 +422,16 @@ class DepScalarItinV5 {
InstrItinData <tc_7a830544, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_7f881c76, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_84df2cd3, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_85523bcb, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_855b0b61, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_87735c3b, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_88fa1a78, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_897d1a9d, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_8b15472a, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_8bb285ec, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_8fd5f294, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_8fe6b782, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
InstrItinData <tc_90f3e30c, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_976ddc4f, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_97743097, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_994333cd, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_999d32db, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_99be14ca, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_9c00ce8d, [InstrStage<1, [SLOT2, SLOT3]>]>,
@@ -498,7 +450,6 @@ class DepScalarItinV5 {
InstrItinData <tc_b13761ae, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_b166348b, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_b44c6e2a, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_b5a33b22, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_b77c481f, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_b7dd427e, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_b9488031, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
@@ -506,7 +457,6 @@ class DepScalarItinV5 {
InstrItinData <tc_b9c4623f, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_bad2bcaf, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_bcc96cee, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_bd90564c, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_bde7aaf4, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_be706f30, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_c2f7d806, [InstrStage<1, [SLOT2, SLOT3]>]>,
@@ -531,24 +481,20 @@ class DepScalarItinV5 {
InstrItinData <tc_daa058fa, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_dbdffe3d, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_e0739b8c, [InstrStage<1, [SLOT2]>]>,
- InstrItinData <tc_e1e0a2dc, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_e1e99bfa, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_e216a5db, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_e421e012, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_e6b38e01, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_e7624c08, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_e7d02c66, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_e913dc32, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_e9c822f7, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_e9fae2d6, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_ef20db1c, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_ef52ed71, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_ef84f62f, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_f2704b9a, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_f3eaa14b, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_f47d212f, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_f49e76f4, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_f4f43fb5, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_f7dd9c9f, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_f86c328a, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_f8eeed7a, [InstrStage<1, [SLOT2, SLOT3]>]>,
@@ -558,10 +504,6 @@ class DepScalarItinV5 {
class DepScalarItinV55 {
list<InstrItinData> DepScalarItinV55_list = [
- InstrItinData <tc_0077f68c, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2],
- [Hex_FWD]>,
-
InstrItinData <tc_00afc57e, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
@@ -586,10 +528,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0a2b8c7c, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_0cd51c76, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -606,18 +544,10 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT2]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_128f96e3, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1372bca1, /*tc_3stall*/
[InstrStage<1, [SLOT0]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1432937d, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_14cd4cfa, /*tc_2early*/
[InstrStage<1, [SLOT2]>], [2],
[Hex_FWD]>,
@@ -646,10 +576,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1c0005f9, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1d5a38a8, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -658,10 +584,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_20280784, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_234a11a5, /*tc_3x*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
@@ -674,10 +596,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_29641329, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_2a160009, /*tc_2early*/
[InstrStage<1, [SLOT0]>], [],
[]>,
@@ -690,10 +608,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2e00db30, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [],
- []>,
-
InstrItinData <tc_2f185f5c, /*tc_2early*/
[InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
@@ -802,10 +716,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
[]>,
- InstrItinData <tc_53173427, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_53bc8a6a, /*tc_2early*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -822,10 +732,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_56d25411, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_57288781, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
@@ -834,6 +740,10 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_59a01ead, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_5acef64a, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -858,14 +768,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_60d76817, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [],
- []>,
-
- InstrItinData <tc_60f5738d, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [1],
- [Hex_FWD]>,
-
InstrItinData <tc_63fe3df7, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 3, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -902,6 +804,10 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
[]>,
+ InstrItinData <tc_6fa4db47, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_73043bf4, /*tc_2early*/
[InstrStage<1, [SLOT3]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
@@ -930,10 +836,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_85523bcb, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_855b0b61, /*tc_2early*/
[InstrStage<1, [SLOT2, SLOT3]>], [1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -942,10 +844,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_88fa1a78, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_897d1a9d, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -954,10 +852,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_8bb285ec, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1],
- [Hex_FWD]>,
-
InstrItinData <tc_8fd5f294, /*tc_3x*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -978,6 +872,10 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT2]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_994333cd, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_999d32db, /*tc_2early*/
[InstrStage<1, [SLOT2]>], [1],
[Hex_FWD]>,
@@ -1050,10 +948,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b5a33b22, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_b77c481f, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1082,10 +976,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bd90564c, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_bde7aaf4, /*tc_3stall*/
[InstrStage<1, [SLOT0]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1182,10 +1072,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT2]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e1e0a2dc, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [],
- []>,
-
InstrItinData <tc_e1e99bfa, /*tc_2early*/
[InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
@@ -1198,10 +1084,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e6b38e01, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_e7624c08, /*tc_3stall*/
[InstrStage<1, [SLOT0]>], [3],
[Hex_FWD]>,
@@ -1222,10 +1104,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_ef20db1c, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_ef52ed71, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1250,10 +1128,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_f4f43fb5, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_f7dd9c9f, /*tc_st*/
[InstrStage<1, [SLOT0]>], [1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1278,10 +1152,6 @@ class DepScalarItinV55 {
class DepScalarItinV60 {
list<InstrItinData> DepScalarItinV60_list = [
- InstrItinData <tc_0077f68c, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2],
- [Hex_FWD]>,
-
InstrItinData <tc_00afc57e, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
@@ -1306,10 +1176,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0a2b8c7c, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_0cd51c76, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1326,18 +1192,10 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT2]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_128f96e3, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1372bca1, /*tc_3stall*/
[InstrStage<1, [SLOT0]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1432937d, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_14cd4cfa, /*tc_2early*/
[InstrStage<1, [SLOT2]>], [2],
[Hex_FWD]>,
@@ -1366,10 +1224,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1c0005f9, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1d5a38a8, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1378,10 +1232,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_20280784, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_234a11a5, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
@@ -1394,10 +1244,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_29641329, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_2a160009, /*tc_2early*/
[InstrStage<1, [SLOT0]>], [],
[]>,
@@ -1410,10 +1256,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2e00db30, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [],
- []>,
-
InstrItinData <tc_2f185f5c, /*tc_2early*/
[InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
@@ -1522,10 +1364,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
[]>,
- InstrItinData <tc_53173427, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_53bc8a6a, /*tc_2early*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1542,10 +1380,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_56d25411, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_57288781, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
@@ -1554,6 +1388,10 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_59a01ead, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_5acef64a, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1578,14 +1416,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_60d76817, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [],
- []>,
-
- InstrItinData <tc_60f5738d, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [1],
- [Hex_FWD]>,
-
InstrItinData <tc_63fe3df7, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 3, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1622,6 +1452,10 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
[]>,
+ InstrItinData <tc_6fa4db47, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_73043bf4, /*tc_2early*/
[InstrStage<1, [SLOT3]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
@@ -1650,10 +1484,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_85523bcb, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_855b0b61, /*tc_2early*/
[InstrStage<1, [SLOT2, SLOT3]>], [1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1662,10 +1492,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_88fa1a78, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_897d1a9d, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1674,10 +1500,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_8bb285ec, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1],
- [Hex_FWD]>,
-
InstrItinData <tc_8fd5f294, /*tc_3x*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1698,6 +1520,10 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT2]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_994333cd, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_999d32db, /*tc_2early*/
[InstrStage<1, [SLOT2]>], [1],
[Hex_FWD]>,
@@ -1770,10 +1596,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b5a33b22, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_b77c481f, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1802,10 +1624,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bd90564c, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_bde7aaf4, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1902,10 +1720,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT2]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e1e0a2dc, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [],
- []>,
-
InstrItinData <tc_e1e99bfa, /*tc_2early*/
[InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
@@ -1918,10 +1732,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e6b38e01, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_e7624c08, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3],
[Hex_FWD]>,
@@ -1942,10 +1752,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_ef20db1c, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_ef52ed71, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1970,10 +1776,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_f4f43fb5, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_f7dd9c9f, /*tc_st*/
[InstrStage<1, [SLOT0]>], [1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1996,765 +1798,8 @@ class DepScalarItinV60 {
];
}
-class DepScalarItinV60se {
- list<InstrItinData> DepScalarItinV60se_list = [
- InstrItinData <tc_0077f68c, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2],
- [Hex_FWD]>,
-
- InstrItinData <tc_00afc57e, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_00e7c26e, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1],
- [Hex_FWD]>,
-
- InstrItinData <tc_03220ffa, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_038a1342, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_04c9decc, /*tc_3stall*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_05b6c987, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_0a2b8c7c, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_0cd51c76, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_0dc560de, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_0fc1ae07, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [2],
- [Hex_FWD]>,
-
- InstrItinData <tc_10b97e27, /*tc_2early*/
- [InstrStage<1, [SLOT2], 0>,
- InstrStage<1, [CVI_ST]>], [2, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_128f96e3, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_1372bca1, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_1432937d, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_14cd4cfa, /*tc_2early*/
- [InstrStage<1, [SLOT2], 0>,
- InstrStage<1, [CVI_ST]>], [2],
- [Hex_FWD]>,
-
- InstrItinData <tc_15411484, /*tc_2early*/
- [InstrStage<1, [SLOT2], 0>,
- InstrStage<1, [CVI_ST]>], [1],
- [Hex_FWD]>,
-
- InstrItinData <tc_16d0d8d5, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_181af5d0, /*tc_2early*/
- [InstrStage<1, [SLOT2], 0>,
- InstrStage<1, [CVI_ST]>], [3, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_1853ea6d, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_1b82a277, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3],
- [Hex_FWD]>,
-
- InstrItinData <tc_1b9c9ee5, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_1c0005f9, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_1d5a38a8, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_1e856f58, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_20280784, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_234a11a5, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_238d91d2, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_29175780, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_29641329, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_2a160009, /*tc_2early*/
- [InstrStage<1, [SLOT0], 0>,
- InstrStage<1, [CVI_ST]>], [],
- []>,
-
- InstrItinData <tc_2b2f4060, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_2b6f77c6, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_2e00db30, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [],
- []>,
-
- InstrItinData <tc_2f185f5c, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3], 0>,
- InstrStage<1, [CVI_ST]>], [2, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_2fc0c436, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_351fed2d, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_3669266a, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3], 0>,
- InstrStage<1, [CVI_ST]>], [2],
- [Hex_FWD]>,
-
- InstrItinData <tc_367f7f3d, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [],
- []>,
-
- InstrItinData <tc_36c68ad1, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [],
- []>,
-
- InstrItinData <tc_395dc00f, /*tc_newvjump*/
- [InstrStage<1, [SLOT0], 0>,
- InstrStage<1, [CVI_ST]>], [3, 3, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_3bc2c5d3, /*tc_newvjump*/
- [InstrStage<1, [SLOT0], 0>,
- InstrStage<1, [CVI_ST]>], [2],
- [Hex_FWD]>,
-
- InstrItinData <tc_3cb8ea06, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3], 0>,
- InstrStage<1, [CVI_ST]>], [1, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_3d04548d, /*tc_newvjump*/
- [InstrStage<1, [SLOT0], 0>,
- InstrStage<1, [CVI_ST]>], [3, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_3da80ba5, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [1],
- [Hex_FWD]>,
-
- InstrItinData <tc_3e07fb90, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_41d5298e, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_4403ca65, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_44126683, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_452f85af, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
- [Hex_FWD]>,
-
- InstrItinData <tc_481e5e5c, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_49eb22c8, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3], 0>,
- InstrStage<1, [CVI_ST]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_4ca572d4, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [],
- []>,
-
- InstrItinData <tc_4d9914c9, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [1, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_4d99bca9, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_4f7cd700, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [2, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_513bef45, /*tc_newvjump*/
- [InstrStage<1, [SLOT0], 0>,
- InstrStage<1, [CVI_ST]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_51b866be, /*tc_newvjump*/
- [InstrStage<1, [SLOT0], 0>,
- InstrStage<1, [CVI_ST]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_523fcf30, /*tc_3stall*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_5274e61a, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_52d7bbea, /*tc_2early*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
- InstrStage<1, [CVI_ST]>], [],
- []>,
-
- InstrItinData <tc_53173427, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_53bc8a6a, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_53bdb2f6, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_540fdfbc, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_55050d58, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_56d25411, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_57288781, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_594ab548, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_5acef64a, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_5ba5997d, /*tc_2*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_5eb851fc, /*tc_newvjump*/
- [InstrStage<1, [SLOT0], 0>,
- InstrStage<1, [CVI_ST]>], [2, 3, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_5f6847a1, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_60571023, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_609d2efe, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_60d76817, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [],
- []>,
-
- InstrItinData <tc_60f5738d, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [1],
- [Hex_FWD]>,
-
- InstrItinData <tc_63fe3df7, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 3, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_66888ded, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_6792d5ff, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_681a2300, /*tc_3stall*/
- [InstrStage<1, [SLOT2], 0>,
- InstrStage<1, [CVI_ST]>], [2],
- [Hex_FWD]>,
-
- InstrItinData <tc_68cb12ce, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_6aa5711a, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_6ac37025, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_6ebb4a12, /*tc_2early*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_6efc556e, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
- []>,
-
- InstrItinData <tc_73043bf4, /*tc_2early*/
- [InstrStage<1, [SLOT3], 0>,
- InstrStage<1, [CVI_ST]>], [1, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_746baa8e, /*tc_newvjump*/
- [InstrStage<1, [SLOT0], 0>,
- InstrStage<1, [CVI_ST]>], [3, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_74e47fd9, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 3, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_7934b9df, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [2, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_7a830544, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_7f881c76, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_84df2cd3, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_85523bcb, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_855b0b61, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3], 0>,
- InstrStage<1, [CVI_ST]>], [1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_87735c3b, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_88fa1a78, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_897d1a9d, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_8b15472a, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_8bb285ec, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1],
- [Hex_FWD]>,
-
- InstrItinData <tc_8fd5f294, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_8fe6b782, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_90f3e30c, /*tc_2early*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_976ddc4f, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_97743097, /*tc_2early*/
- [InstrStage<1, [SLOT2], 0>,
- InstrStage<1, [CVI_ST]>], [2, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_999d32db, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [1],
- [Hex_FWD]>,
-
- InstrItinData <tc_99be14ca, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3], 0>,
- InstrStage<1, [CVI_ST]>], [1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_9c00ce8d, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_9c98e8af, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_9d5941c7, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_9ef61e5c, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_9faf76ae, /*tc_2early*/
- [InstrStage<1, [SLOT2], 0>,
- InstrStage<1, [CVI_ST]>], [1],
- [Hex_FWD]>,
-
- InstrItinData <tc_9fdb5406, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_a21dc435, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_a27582fa, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3], 0>,
- InstrStage<1, [CVI_ST]>], [2],
- [Hex_FWD]>,
-
- InstrItinData <tc_a46f0df5, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3], 0>,
- InstrStage<1, [CVI_ST]>], [3, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_a788683e, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_a8acdac0, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_a904d137, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_adb14c66, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_b13761ae, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [],
- []>,
-
- InstrItinData <tc_b166348b, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_b44c6e2a, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_b5a33b22, /*tc_2early*/
- [InstrStage<1, [SLOT2], 0>,
- InstrStage<1, [CVI_ST]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_b77c481f, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_b7dd427e, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_b9488031, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_b9c0b731, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_b9c4623f, /*tc_2*/
- [InstrStage<1, [SLOT3]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_bad2bcaf, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_bcc96cee, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_bd90564c, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_bde7aaf4, /*tc_newvjump*/
- [InstrStage<1, [SLOT0], 0>,
- InstrStage<1, [CVI_ST]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_be706f30, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_c2f7d806, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_c5e2426d, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [2, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_c6aa82f7, /*tc_2early*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_c6ce9b3f, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_c6ebf8dd, /*tc_3stall*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_c74f796f, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_c82dc1ff, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [1],
- [Hex_FWD]>,
-
- InstrItinData <tc_caaebcba, /*tc_3stall*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_cd7374a0, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_cde8b071, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_cf47a43f, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_cf59f215, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [2, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_d088982c, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_d1090e34, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_d24b2d85, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 3, 1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_d580173f, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_d6bf0472, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_d9709180, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_d9f95eef, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_daa058fa, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_dbdffe3d, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_e0739b8c, /*tc_2early*/
- [InstrStage<1, [SLOT2], 0>,
- InstrStage<1, [CVI_ST]>], [2, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_e1e0a2dc, /*tc_3stall*/
- [InstrStage<1, [SLOT2], 0>,
- InstrStage<1, [CVI_ST]>], [],
- []>,
-
- InstrItinData <tc_e1e99bfa, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3], 0>,
- InstrStage<1, [CVI_ST]>], [2, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_e216a5db, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [4, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_e421e012, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_e6b38e01, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_e7624c08, /*tc_newvjump*/
- [InstrStage<1, [SLOT0], 0>,
- InstrStage<1, [CVI_ST]>], [3],
- [Hex_FWD]>,
-
- InstrItinData <tc_e7d02c66, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_e913dc32, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_e9c822f7, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3],
- [Hex_FWD]>,
-
- InstrItinData <tc_e9fae2d6, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3], 0>,
- InstrStage<1, [CVI_ST]>], [2, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_ef20db1c, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_ef52ed71, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_ef84f62f, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_f2704b9a, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_f3eaa14b, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_f47d212f, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_f49e76f4, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_f4f43fb5, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_f7dd9c9f, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_f86c328a, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_f8eeed7a, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_fcab4871, /*tc_newvjump*/
- [InstrStage<1, [SLOT0], 0>,
- InstrStage<1, [CVI_ST]>], [],
- []>,
-
- InstrItinData <tc_ff9ee76e, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2, 3],
- [Hex_FWD, Hex_FWD]>
- ];
-}
-
class DepScalarItinV62 {
list<InstrItinData> DepScalarItinV62_list = [
- InstrItinData <tc_0077f68c, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2],
- [Hex_FWD]>,
-
InstrItinData <tc_00afc57e, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
@@ -2779,10 +1824,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0a2b8c7c, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_0cd51c76, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -2799,18 +1840,10 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT2]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_128f96e3, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1372bca1, /*tc_3stall*/
[InstrStage<1, [SLOT0]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1432937d, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_14cd4cfa, /*tc_2early*/
[InstrStage<1, [SLOT2]>], [2],
[Hex_FWD]>,
@@ -2839,10 +1872,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1c0005f9, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1d5a38a8, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -2851,10 +1880,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_20280784, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_234a11a5, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
@@ -2867,10 +1892,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_29641329, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_2a160009, /*tc_2early*/
[InstrStage<1, [SLOT0]>], [],
[]>,
@@ -2883,10 +1904,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2e00db30, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [],
- []>,
-
InstrItinData <tc_2f185f5c, /*tc_3*/
[InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
@@ -2995,10 +2012,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
[]>,
- InstrItinData <tc_53173427, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_53bc8a6a, /*tc_2early*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3015,10 +2028,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_56d25411, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_57288781, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
@@ -3027,6 +2036,10 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_59a01ead, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_5acef64a, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3051,14 +2064,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_60d76817, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [],
- []>,
-
- InstrItinData <tc_60f5738d, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [1],
- [Hex_FWD]>,
-
InstrItinData <tc_63fe3df7, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 3, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3095,6 +2100,10 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
[]>,
+ InstrItinData <tc_6fa4db47, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_73043bf4, /*tc_2early*/
[InstrStage<1, [SLOT3]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
@@ -3123,10 +2132,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_85523bcb, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_855b0b61, /*tc_2early*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3135,10 +2140,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_88fa1a78, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_897d1a9d, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3147,10 +2148,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_8bb285ec, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1],
- [Hex_FWD]>,
-
InstrItinData <tc_8fd5f294, /*tc_3x*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3167,6 +2164,10 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_994333cd, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_97743097, /*tc_2early*/
[InstrStage<1, [SLOT2]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
@@ -3243,10 +2244,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b5a33b22, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_b77c481f, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3275,10 +2272,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bd90564c, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_bde7aaf4, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3375,10 +2368,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT2]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e1e0a2dc, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [],
- []>,
-
InstrItinData <tc_e1e99bfa, /*tc_2early*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
@@ -3391,10 +2380,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e6b38e01, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_e7624c08, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3],
[Hex_FWD]>,
@@ -3415,10 +2400,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_ef20db1c, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_ef52ed71, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3443,10 +2424,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_f4f43fb5, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_f7dd9c9f, /*tc_st*/
[InstrStage<1, [SLOT0]>], [1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3471,10 +2448,6 @@ class DepScalarItinV62 {
class DepScalarItinV65 {
list<InstrItinData> DepScalarItinV65_list = [
- InstrItinData <tc_0077f68c, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2],
- [Hex_FWD]>,
-
InstrItinData <tc_00afc57e, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
@@ -3499,10 +2472,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0a2b8c7c, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_0cd51c76, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3519,18 +2488,10 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT2]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_128f96e3, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1372bca1, /*tc_3stall*/
[InstrStage<1, [SLOT0]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1432937d, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_14cd4cfa, /*tc_2early*/
[InstrStage<1, [SLOT2]>], [2],
[Hex_FWD]>,
@@ -3559,10 +2520,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1c0005f9, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1d5a38a8, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3571,10 +2528,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_20280784, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_234a11a5, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
@@ -3587,10 +2540,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_29641329, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_2a160009, /*tc_2early*/
[InstrStage<1, [SLOT0]>], [],
[]>,
@@ -3603,10 +2552,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2e00db30, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [],
- []>,
-
InstrItinData <tc_2f185f5c, /*tc_3*/
[InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
@@ -3715,10 +2660,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
[]>,
- InstrItinData <tc_53173427, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_53bc8a6a, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3735,10 +2676,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_56d25411, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_57288781, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
@@ -3747,6 +2684,10 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_59a01ead, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_5acef64a, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3771,14 +2712,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_60d76817, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [],
- []>,
-
- InstrItinData <tc_60f5738d, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [1],
- [Hex_FWD]>,
-
InstrItinData <tc_63fe3df7, /*tc_latepredldaia*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 4, 3, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3815,6 +2748,10 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
[]>,
+ InstrItinData <tc_6fa4db47, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_73043bf4, /*tc_1*/
[InstrStage<1, [SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
@@ -3843,10 +2780,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_85523bcb, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_855b0b61, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3855,10 +2788,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_88fa1a78, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_897d1a9d, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3867,10 +2796,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_8bb285ec, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1],
- [Hex_FWD]>,
-
InstrItinData <tc_8fd5f294, /*tc_3x*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3891,6 +2816,10 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT2]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_994333cd, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_999d32db, /*tc_3stall*/
[InstrStage<1, [SLOT2]>], [1],
[Hex_FWD]>,
@@ -3963,10 +2892,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b5a33b22, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [4, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_b77c481f, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3995,10 +2920,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bd90564c, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_bde7aaf4, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -4095,10 +3016,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT2]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e1e0a2dc, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [],
- []>,
-
InstrItinData <tc_e1e99bfa, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
@@ -4111,10 +3028,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e6b38e01, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_e7624c08, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3],
[Hex_FWD]>,
@@ -4135,10 +3048,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_ef20db1c, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_ef52ed71, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -4163,10 +3072,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_f4f43fb5, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_f7dd9c9f, /*tc_st*/
[InstrStage<1, [SLOT0]>], [1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
diff --git a/lib/Target/Hexagon/HexagonDepInstrInfo.td b/lib/Target/Hexagon/HexagonDepInstrInfo.td
index 6e16762ac0eb..b6824fa33106 100644
--- a/lib/Target/Hexagon/HexagonDepInstrInfo.td
+++ b/lib/Target/Hexagon/HexagonDepInstrInfo.td
@@ -991,7 +991,7 @@ def A2_roundsat : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = round($Rss32):sat",
-tc_c2f7d806, TypeS_2op>, Enc_90cd8b, Requires<[HasV5T]> {
+tc_c2f7d806, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10001000110;
let hasNewValue = 1;
@@ -3301,7 +3301,7 @@ def A5_ACS : HInst<
(outs DoubleRegs:$Rxx32, PredRegs:$Pe4),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32,$Pe4 = vacsh($Rss32,$Rtt32)",
-tc_caaebcba, TypeM>, Enc_831a7d, Requires<[HasV55T]> {
+tc_caaebcba, TypeM>, Enc_831a7d, Requires<[HasV55]> {
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010101;
@@ -3314,7 +3314,7 @@ def A5_vaddhubs : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rd32 = vaddhub($Rss32,$Rtt32):sat",
-tc_2b6f77c6, TypeS_3op>, Enc_d2216a, Requires<[HasV5T]> {
+tc_2b6f77c6, TypeS_3op>, Enc_d2216a, Requires<[HasV5]> {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000001010;
@@ -3327,7 +3327,7 @@ def A6_vcmpbeq_notany : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Pd4 = !any8(vcmpb.eq($Rss32,$Rtt32))",
-tc_55050d58, TypeALU64>, Enc_fcf7a7, Requires<[HasV65T]> {
+tc_55050d58, TypeALU64>, Enc_fcf7a7, Requires<[HasV65]> {
let Inst{7-2} = 0b001000;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b11010010000;
@@ -3336,7 +3336,7 @@ def A6_vminub_RdP : HInst<
(outs DoubleRegs:$Rdd32, PredRegs:$Pe4),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32,$Pe4 = vminub($Rtt32,$Rss32)",
-tc_ef84f62f, TypeM>, Enc_d2c7f1, Requires<[HasV62T]> {
+tc_ef84f62f, TypeM>, Enc_d2c7f1, Requires<[HasV62]> {
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010111;
@@ -4059,7 +4059,7 @@ def F2_conv_d2df : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = convert_d2df($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5]> {
let Inst{13-5} = 0b000000011;
let Inst{31-21} = 0b10000000111;
let isFP = 1;
@@ -4069,7 +4069,7 @@ def F2_conv_d2sf : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = convert_d2sf($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10001000010;
let hasNewValue = 1;
@@ -4081,7 +4081,7 @@ def F2_conv_df2d : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = convert_df2d($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5]> {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10000000111;
let isFP = 1;
@@ -4091,7 +4091,7 @@ def F2_conv_df2d_chop : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = convert_df2d($Rss32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5]> {
let Inst{13-5} = 0b000000110;
let Inst{31-21} = 0b10000000111;
let isFP = 1;
@@ -4101,7 +4101,7 @@ def F2_conv_df2sf : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = convert_df2sf($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10001000000;
let hasNewValue = 1;
@@ -4113,7 +4113,7 @@ def F2_conv_df2ud : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = convert_df2ud($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5]> {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10000000111;
let isFP = 1;
@@ -4123,7 +4123,7 @@ def F2_conv_df2ud_chop : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = convert_df2ud($Rss32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5]> {
let Inst{13-5} = 0b000000111;
let Inst{31-21} = 0b10000000111;
let isFP = 1;
@@ -4133,7 +4133,7 @@ def F2_conv_df2uw : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = convert_df2uw($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10001000011;
let hasNewValue = 1;
@@ -4145,7 +4145,7 @@ def F2_conv_df2uw_chop : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = convert_df2uw($Rss32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10001000101;
let hasNewValue = 1;
@@ -4157,7 +4157,7 @@ def F2_conv_df2w : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = convert_df2w($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10001000100;
let hasNewValue = 1;
@@ -4169,7 +4169,7 @@ def F2_conv_df2w_chop : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = convert_df2w($Rss32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10001000111;
let hasNewValue = 1;
@@ -4181,7 +4181,7 @@ def F2_conv_sf2d : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = convert_sf2d($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5]> {
let Inst{13-5} = 0b000000100;
let Inst{31-21} = 0b10000100100;
let isFP = 1;
@@ -4191,7 +4191,7 @@ def F2_conv_sf2d_chop : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = convert_sf2d($Rs32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5]> {
let Inst{13-5} = 0b000000110;
let Inst{31-21} = 0b10000100100;
let isFP = 1;
@@ -4201,7 +4201,7 @@ def F2_conv_sf2df : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = convert_sf2df($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5]> {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10000100100;
let isFP = 1;
@@ -4211,7 +4211,7 @@ def F2_conv_sf2ud : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = convert_sf2ud($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5]> {
let Inst{13-5} = 0b000000011;
let Inst{31-21} = 0b10000100100;
let isFP = 1;
@@ -4221,7 +4221,7 @@ def F2_conv_sf2ud_chop : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = convert_sf2ud($Rs32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5]> {
let Inst{13-5} = 0b000000101;
let Inst{31-21} = 0b10000100100;
let isFP = 1;
@@ -4231,7 +4231,7 @@ def F2_conv_sf2uw : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = convert_sf2uw($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5]> {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10001011011;
let hasNewValue = 1;
@@ -4243,7 +4243,7 @@ def F2_conv_sf2uw_chop : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = convert_sf2uw($Rs32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5]> {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10001011011;
let hasNewValue = 1;
@@ -4255,7 +4255,7 @@ def F2_conv_sf2w : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = convert_sf2w($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5]> {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10001011100;
let hasNewValue = 1;
@@ -4267,7 +4267,7 @@ def F2_conv_sf2w_chop : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = convert_sf2w($Rs32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5]> {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10001011100;
let hasNewValue = 1;
@@ -4279,7 +4279,7 @@ def F2_conv_ud2df : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = convert_ud2df($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5]> {
let Inst{13-5} = 0b000000010;
let Inst{31-21} = 0b10000000111;
let isFP = 1;
@@ -4289,7 +4289,7 @@ def F2_conv_ud2sf : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = convert_ud2sf($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10001000001;
let hasNewValue = 1;
@@ -4301,7 +4301,7 @@ def F2_conv_uw2df : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = convert_uw2df($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5]> {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10000100100;
let isFP = 1;
@@ -4311,7 +4311,7 @@ def F2_conv_uw2sf : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = convert_uw2sf($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5]> {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10001011001;
let hasNewValue = 1;
@@ -4323,7 +4323,7 @@ def F2_conv_w2df : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = convert_w2df($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5]> {
let Inst{13-5} = 0b000000010;
let Inst{31-21} = 0b10000100100;
let isFP = 1;
@@ -4333,7 +4333,7 @@ def F2_conv_w2sf : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = convert_w2sf($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5]> {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10001011010;
let hasNewValue = 1;
@@ -4345,7 +4345,7 @@ def F2_dfclass : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, u5_0Imm:$Ii),
"$Pd4 = dfclass($Rss32,#$Ii)",
-tc_7a830544, TypeALU64>, Enc_1f19b5, Requires<[HasV5T]> {
+tc_7a830544, TypeALU64>, Enc_1f19b5, Requires<[HasV5]> {
let Inst{4-2} = 0b100;
let Inst{13-10} = 0b0000;
let Inst{31-21} = 0b11011100100;
@@ -4356,7 +4356,7 @@ def F2_dfcmpeq : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Pd4 = dfcmp.eq($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7, Requires<[HasV5T]> {
+tc_1e856f58, TypeALU64>, Enc_fcf7a7, Requires<[HasV5]> {
let Inst{7-2} = 0b000000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010010111;
@@ -4368,7 +4368,7 @@ def F2_dfcmpge : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Pd4 = dfcmp.ge($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7, Requires<[HasV5T]> {
+tc_1e856f58, TypeALU64>, Enc_fcf7a7, Requires<[HasV5]> {
let Inst{7-2} = 0b010000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010010111;
@@ -4380,7 +4380,7 @@ def F2_dfcmpgt : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Pd4 = dfcmp.gt($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7, Requires<[HasV5T]> {
+tc_1e856f58, TypeALU64>, Enc_fcf7a7, Requires<[HasV5]> {
let Inst{7-2} = 0b001000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010010111;
@@ -4392,7 +4392,7 @@ def F2_dfcmpuo : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Pd4 = dfcmp.uo($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7, Requires<[HasV5T]> {
+tc_1e856f58, TypeALU64>, Enc_fcf7a7, Requires<[HasV5]> {
let Inst{7-2} = 0b011000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010010111;
@@ -4404,7 +4404,7 @@ def F2_dfimm_n : HInst<
(outs DoubleRegs:$Rdd32),
(ins u10_0Imm:$Ii),
"$Rdd32 = dfmake(#$Ii):neg",
-tc_234a11a5, TypeALU64>, Enc_e6c957, Requires<[HasV5T]> {
+tc_234a11a5, TypeALU64>, Enc_e6c957, Requires<[HasV5]> {
let Inst{20-16} = 0b00000;
let Inst{31-22} = 0b1101100101;
let prefersSlot3 = 1;
@@ -4413,7 +4413,7 @@ def F2_dfimm_p : HInst<
(outs DoubleRegs:$Rdd32),
(ins u10_0Imm:$Ii),
"$Rdd32 = dfmake(#$Ii):pos",
-tc_234a11a5, TypeALU64>, Enc_e6c957, Requires<[HasV5T]> {
+tc_234a11a5, TypeALU64>, Enc_e6c957, Requires<[HasV5]> {
let Inst{20-16} = 0b00000;
let Inst{31-22} = 0b1101100100;
let prefersSlot3 = 1;
@@ -4422,7 +4422,7 @@ def F2_sfadd : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = sfadd($Rs32,$Rt32)",
-tc_6792d5ff, TypeM>, Enc_5ab2be, Requires<[HasV5T]> {
+tc_6792d5ff, TypeM>, Enc_5ab2be, Requires<[HasV5]> {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101011000;
@@ -4436,7 +4436,7 @@ def F2_sfclass : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, u5_0Imm:$Ii),
"$Pd4 = sfclass($Rs32,#$Ii)",
-tc_7a830544, TypeS_2op>, Enc_83ee64, Requires<[HasV5T]> {
+tc_7a830544, TypeS_2op>, Enc_83ee64, Requires<[HasV5]> {
let Inst{7-2} = 0b000000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10000101111;
@@ -4447,7 +4447,7 @@ def F2_sfcmpeq : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = sfcmp.eq($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e, Requires<[HasV5T]> {
+tc_1e856f58, TypeS_3op>, Enc_c2b48e, Requires<[HasV5]> {
let Inst{7-2} = 0b011000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000111111;
@@ -4459,7 +4459,7 @@ def F2_sfcmpge : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = sfcmp.ge($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e, Requires<[HasV5T]> {
+tc_1e856f58, TypeS_3op>, Enc_c2b48e, Requires<[HasV5]> {
let Inst{7-2} = 0b000000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000111111;
@@ -4471,7 +4471,7 @@ def F2_sfcmpgt : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = sfcmp.gt($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e, Requires<[HasV5T]> {
+tc_1e856f58, TypeS_3op>, Enc_c2b48e, Requires<[HasV5]> {
let Inst{7-2} = 0b100000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000111111;
@@ -4483,7 +4483,7 @@ def F2_sfcmpuo : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = sfcmp.uo($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e, Requires<[HasV5T]> {
+tc_1e856f58, TypeS_3op>, Enc_c2b48e, Requires<[HasV5]> {
let Inst{7-2} = 0b001000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000111111;
@@ -4495,7 +4495,7 @@ def F2_sffixupd : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = sffixupd($Rs32,$Rt32)",
-tc_6792d5ff, TypeM>, Enc_5ab2be, Requires<[HasV5T]> {
+tc_6792d5ff, TypeM>, Enc_5ab2be, Requires<[HasV5]> {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101011110;
@@ -4507,7 +4507,7 @@ def F2_sffixupn : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = sffixupn($Rs32,$Rt32)",
-tc_6792d5ff, TypeM>, Enc_5ab2be, Requires<[HasV5T]> {
+tc_6792d5ff, TypeM>, Enc_5ab2be, Requires<[HasV5]> {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101011110;
@@ -4519,7 +4519,7 @@ def F2_sffixupr : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = sffixupr($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5]> {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10001011101;
let hasNewValue = 1;
@@ -4530,7 +4530,7 @@ def F2_sffma : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += sfmpy($Rs32,$Rt32)",
-tc_d580173f, TypeM>, Enc_2ae154, Requires<[HasV5T]> {
+tc_d580173f, TypeM>, Enc_2ae154, Requires<[HasV5]> {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111000;
@@ -4544,7 +4544,7 @@ def F2_sffma_lib : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += sfmpy($Rs32,$Rt32):lib",
-tc_d580173f, TypeM>, Enc_2ae154, Requires<[HasV5T]> {
+tc_d580173f, TypeM>, Enc_2ae154, Requires<[HasV5]> {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111000;
@@ -4558,7 +4558,7 @@ def F2_sffma_sc : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32, PredRegs:$Pu4),
"$Rx32 += sfmpy($Rs32,$Rt32,$Pu4):scale",
-tc_038a1342, TypeM>, Enc_437f33, Requires<[HasV5T]> {
+tc_038a1342, TypeM>, Enc_437f33, Requires<[HasV5]> {
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111011;
@@ -4572,7 +4572,7 @@ def F2_sffms : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= sfmpy($Rs32,$Rt32)",
-tc_d580173f, TypeM>, Enc_2ae154, Requires<[HasV5T]> {
+tc_d580173f, TypeM>, Enc_2ae154, Requires<[HasV5]> {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111000;
@@ -4586,7 +4586,7 @@ def F2_sffms_lib : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= sfmpy($Rs32,$Rt32):lib",
-tc_d580173f, TypeM>, Enc_2ae154, Requires<[HasV5T]> {
+tc_d580173f, TypeM>, Enc_2ae154, Requires<[HasV5]> {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111000;
@@ -4600,7 +4600,7 @@ def F2_sfimm_n : HInst<
(outs IntRegs:$Rd32),
(ins u10_0Imm:$Ii),
"$Rd32 = sfmake(#$Ii):neg",
-tc_234a11a5, TypeALU64>, Enc_6c9440, Requires<[HasV5T]> {
+tc_234a11a5, TypeALU64>, Enc_6c9440, Requires<[HasV5]> {
let Inst{20-16} = 0b00000;
let Inst{31-22} = 0b1101011001;
let hasNewValue = 1;
@@ -4611,7 +4611,7 @@ def F2_sfimm_p : HInst<
(outs IntRegs:$Rd32),
(ins u10_0Imm:$Ii),
"$Rd32 = sfmake(#$Ii):pos",
-tc_234a11a5, TypeALU64>, Enc_6c9440, Requires<[HasV5T]> {
+tc_234a11a5, TypeALU64>, Enc_6c9440, Requires<[HasV5]> {
let Inst{20-16} = 0b00000;
let Inst{31-22} = 0b1101011000;
let hasNewValue = 1;
@@ -4622,7 +4622,7 @@ def F2_sfinvsqrta : HInst<
(outs IntRegs:$Rd32, PredRegs:$Pe4),
(ins IntRegs:$Rs32),
"$Rd32,$Pe4 = sfinvsqrta($Rs32)",
-tc_4d99bca9, TypeS_2op>, Enc_890909, Requires<[HasV5T]> {
+tc_4d99bca9, TypeS_2op>, Enc_890909, Requires<[HasV5]> {
let Inst{13-7} = 0b0000000;
let Inst{31-21} = 0b10001011111;
let hasNewValue = 1;
@@ -4634,7 +4634,7 @@ def F2_sfmax : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = sfmax($Rs32,$Rt32)",
-tc_976ddc4f, TypeM>, Enc_5ab2be, Requires<[HasV5T]> {
+tc_976ddc4f, TypeM>, Enc_5ab2be, Requires<[HasV5]> {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101011100;
@@ -4648,7 +4648,7 @@ def F2_sfmin : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = sfmin($Rs32,$Rt32)",
-tc_976ddc4f, TypeM>, Enc_5ab2be, Requires<[HasV5T]> {
+tc_976ddc4f, TypeM>, Enc_5ab2be, Requires<[HasV5]> {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101011100;
@@ -4662,7 +4662,7 @@ def F2_sfmpy : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = sfmpy($Rs32,$Rt32)",
-tc_6792d5ff, TypeM>, Enc_5ab2be, Requires<[HasV5T]> {
+tc_6792d5ff, TypeM>, Enc_5ab2be, Requires<[HasV5]> {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101011010;
@@ -4676,7 +4676,7 @@ def F2_sfrecipa : HInst<
(outs IntRegs:$Rd32, PredRegs:$Pe4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32,$Pe4 = sfrecipa($Rs32,$Rt32)",
-tc_9c00ce8d, TypeM>, Enc_a94f3b, Requires<[HasV5T]> {
+tc_9c00ce8d, TypeM>, Enc_a94f3b, Requires<[HasV5]> {
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101011111;
@@ -4689,7 +4689,7 @@ def F2_sfsub : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = sfsub($Rs32,$Rt32)",
-tc_6792d5ff, TypeM>, Enc_5ab2be, Requires<[HasV5T]> {
+tc_6792d5ff, TypeM>, Enc_5ab2be, Requires<[HasV5]> {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101011000;
@@ -4698,6 +4698,44 @@ let opNewValue = 0;
let isFP = 1;
let Uses = [USR];
}
+def G4_tfrgcpp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins GuestRegs64:$Gss32),
+"$Rdd32 = $Gss32",
+tc_6fa4db47, TypeCR>, Enc_0aa344 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01101000001;
+}
+def G4_tfrgcrr : HInst<
+(outs IntRegs:$Rd32),
+(ins GuestRegs:$Gs32),
+"$Rd32 = $Gs32",
+tc_6fa4db47, TypeCR>, Enc_44271f {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01101010001;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def G4_tfrgpcp : HInst<
+(outs GuestRegs64:$Gdd32),
+(ins DoubleRegs:$Rss32),
+"$Gdd32 = $Rss32",
+tc_994333cd, TypeCR>, Enc_ed5027 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01100011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def G4_tfrgrcr : HInst<
+(outs GuestRegs:$Gd32),
+(ins IntRegs:$Rs32),
+"$Gd32 = $Rs32",
+tc_994333cd, TypeCR>, Enc_621fba {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01100010000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
def J2_call : HInst<
(outs),
(ins a30_2Imm:$Ii),
@@ -4905,7 +4943,7 @@ def J2_jumpf_nopred_map : HInst<
(outs),
(ins PredRegs:$Pu4, b15_2Imm:$Ii),
"if (!$Pu4) jump $Ii",
-tc_e9fae2d6, TypeMAPPING>, Requires<[HasV60T]> {
+tc_e9fae2d6, TypeMAPPING>, Requires<[HasV60]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -4967,7 +5005,7 @@ def J2_jumpfpt : HInst<
(outs),
(ins PredRegs:$Pu4, b30_2Imm:$Ii),
"if (!$Pu4) jump:t $Ii",
-tc_e1e99bfa, TypeJ>, Enc_daea09, Requires<[HasV60T]>, PredNewRel {
+tc_e1e99bfa, TypeJ>, Enc_daea09, Requires<[HasV60]>, PredNewRel {
let Inst{0-0} = 0b0;
let Inst{12-10} = 0b100;
let Inst{21-21} = 0b1;
@@ -5029,7 +5067,7 @@ def J2_jumprf_nopred_map : HInst<
(outs),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if (!$Pu4) jumpr $Rs32",
-tc_e0739b8c, TypeMAPPING>, Requires<[HasV60T]> {
+tc_e0739b8c, TypeMAPPING>, Requires<[HasV60]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -5077,7 +5115,7 @@ def J2_jumprfpt : HInst<
(outs),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if (!$Pu4) jumpr:t $Rs32",
-tc_97743097, TypeJ>, Enc_88d4d9, Requires<[HasV60T]>, PredNewRel {
+tc_97743097, TypeJ>, Enc_88d4d9, Requires<[HasV60]>, PredNewRel {
let Inst{7-0} = 0b00000000;
let Inst{13-10} = 0b0100;
let Inst{31-21} = 0b01010011011;
@@ -5222,7 +5260,7 @@ def J2_jumprt_nopred_map : HInst<
(outs),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if ($Pu4) jumpr $Rs32",
-tc_e0739b8c, TypeMAPPING>, Requires<[HasV60T]> {
+tc_e0739b8c, TypeMAPPING>, Requires<[HasV60]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -5268,7 +5306,7 @@ def J2_jumprtpt : HInst<
(outs),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if ($Pu4) jumpr:t $Rs32",
-tc_97743097, TypeJ>, Enc_88d4d9, Requires<[HasV60T]>, PredNewRel {
+tc_97743097, TypeJ>, Enc_88d4d9, Requires<[HasV60]>, PredNewRel {
let Inst{7-0} = 0b00000000;
let Inst{13-10} = 0b0100;
let Inst{31-21} = 0b01010011010;
@@ -5347,7 +5385,7 @@ def J2_jumpt_nopred_map : HInst<
(outs),
(ins PredRegs:$Pu4, b15_2Imm:$Ii),
"if ($Pu4) jump $Ii",
-tc_e9fae2d6, TypeMAPPING>, Requires<[HasV60T]> {
+tc_e9fae2d6, TypeMAPPING>, Requires<[HasV60]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -5407,7 +5445,7 @@ def J2_jumptpt : HInst<
(outs),
(ins PredRegs:$Pu4, b30_2Imm:$Ii),
"if ($Pu4) jump:t $Ii",
-tc_e1e99bfa, TypeJ>, Enc_daea09, Requires<[HasV60T]>, PredNewRel {
+tc_e1e99bfa, TypeJ>, Enc_daea09, Requires<[HasV60]>, PredNewRel {
let Inst{0-0} = 0b0;
let Inst{12-10} = 0b100;
let Inst{21-21} = 0b0;
@@ -5631,6 +5669,30 @@ let Inst{13-13} = 0b0;
let Inst{31-16} = 0b0101010000000000;
let isSolo = 1;
}
+def J2_trap1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, u8_0Imm:$Ii),
+"trap1($Rx32,#$Ii)",
+tc_59a01ead, TypeJ>, Enc_33f8ba {
+let Inst{1-0} = 0b00;
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01010100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isSolo = 1;
+let Uses = [GOSP];
+let Defs = [GOSP, PC];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def J2_trap1_noregmap : HInst<
+(outs),
+(ins u8_0Imm:$Ii),
+"trap1(#$Ii)",
+tc_59a01ead, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
def J4_cmpeq_f_jumpnv_nt : HInst<
(outs),
(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
@@ -13334,7 +13396,7 @@ def L4_return_map_to_raw_f : HInst<
(outs),
(ins PredRegs:$Pv4),
"if (!$Pv4) dealloc_return",
-tc_513bef45, TypeMAPPING>, Requires<[HasV65T]> {
+tc_513bef45, TypeMAPPING>, Requires<[HasV65]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -13342,7 +13404,7 @@ def L4_return_map_to_raw_fnew_pnt : HInst<
(outs),
(ins PredRegs:$Pv4),
"if (!$Pv4.new) dealloc_return:nt",
-tc_395dc00f, TypeMAPPING>, Requires<[HasV65T]> {
+tc_395dc00f, TypeMAPPING>, Requires<[HasV65]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -13350,7 +13412,7 @@ def L4_return_map_to_raw_fnew_pt : HInst<
(outs),
(ins PredRegs:$Pv4),
"if (!$Pv4.new) dealloc_return:t",
-tc_395dc00f, TypeMAPPING>, Requires<[HasV65T]> {
+tc_395dc00f, TypeMAPPING>, Requires<[HasV65]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -13358,7 +13420,7 @@ def L4_return_map_to_raw_t : HInst<
(outs),
(ins PredRegs:$Pv4),
"if ($Pv4) dealloc_return",
-tc_3bc2c5d3, TypeMAPPING>, Requires<[HasV65T]> {
+tc_3bc2c5d3, TypeMAPPING>, Requires<[HasV65]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -13366,7 +13428,7 @@ def L4_return_map_to_raw_tnew_pnt : HInst<
(outs),
(ins PredRegs:$Pv4),
"if ($Pv4.new) dealloc_return:nt",
-tc_e7624c08, TypeMAPPING>, Requires<[HasV65T]> {
+tc_e7624c08, TypeMAPPING>, Requires<[HasV65]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -13374,7 +13436,7 @@ def L4_return_map_to_raw_tnew_pt : HInst<
(outs),
(ins PredRegs:$Pv4),
"if ($Pv4.new) dealloc_return:t",
-tc_e7624c08, TypeMAPPING>, Requires<[HasV65T]> {
+tc_e7624c08, TypeMAPPING>, Requires<[HasV65]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -13528,7 +13590,7 @@ def L6_deallocframe_map_to_raw : HInst<
(outs),
(ins),
"deallocframe",
-tc_d1090e34, TypeMAPPING>, Requires<[HasV65T]> {
+tc_d1090e34, TypeMAPPING>, Requires<[HasV65]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -13536,7 +13598,7 @@ def L6_return_map_to_raw : HInst<
(outs),
(ins),
"dealloc_return",
-tc_3d04548d, TypeMAPPING>, Requires<[HasV65T]> {
+tc_3d04548d, TypeMAPPING>, Requires<[HasV65]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -16916,7 +16978,7 @@ def M4_cmpyi_whc : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rd32 = cmpyiwh($Rss32,$Rt32*):<<1:rnd:sat",
-tc_8fd5f294, TypeS_3op>, Enc_3d5b28, Requires<[HasV5T]> {
+tc_8fd5f294, TypeS_3op>, Enc_3d5b28, Requires<[HasV5]> {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000101000;
@@ -16942,7 +17004,7 @@ def M4_cmpyr_whc : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rd32 = cmpyrwh($Rss32,$Rt32*):<<1:rnd:sat",
-tc_8fd5f294, TypeS_3op>, Enc_3d5b28, Requires<[HasV5T]> {
+tc_8fd5f294, TypeS_3op>, Enc_3d5b28, Requires<[HasV5]> {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000101000;
@@ -17295,7 +17357,7 @@ def M5_vdmacbsu : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vdmpybsu($Rss32,$Rtt32):sat",
-tc_e913dc32, TypeM>, Enc_88c16c, Requires<[HasV5T]> {
+tc_e913dc32, TypeM>, Enc_88c16c, Requires<[HasV5]> {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010001;
@@ -17307,7 +17369,7 @@ def M5_vdmpybsu : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vdmpybsu($Rss32,$Rtt32):sat",
-tc_8fd5f294, TypeM>, Enc_a56825, Requires<[HasV5T]> {
+tc_8fd5f294, TypeM>, Enc_a56825, Requires<[HasV5]> {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000101;
@@ -17402,7 +17464,7 @@ def M6_vabsdiffb : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vabsdiffb($Rtt32,$Rss32)",
-tc_f49e76f4, TypeM>, Enc_ea23e4, Requires<[HasV62T]> {
+tc_f49e76f4, TypeM>, Enc_ea23e4, Requires<[HasV62]> {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000111;
@@ -17412,7 +17474,7 @@ def M6_vabsdiffub : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vabsdiffub($Rtt32,$Rss32)",
-tc_f49e76f4, TypeM>, Enc_ea23e4, Requires<[HasV62T]> {
+tc_f49e76f4, TypeM>, Enc_ea23e4, Requires<[HasV62]> {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000101;
@@ -18142,7 +18204,7 @@ def S2_asr_i_p_rnd : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rdd32 = asr($Rss32,#$Ii):rnd",
-tc_2b6f77c6, TypeS_2op>, Enc_5eac98, Requires<[HasV5T]> {
+tc_2b6f77c6, TypeS_2op>, Enc_5eac98, Requires<[HasV5]> {
let Inst{7-5} = 0b111;
let Inst{31-21} = 0b10000000110;
let prefersSlot3 = 1;
@@ -18151,7 +18213,7 @@ def S2_asr_i_p_rnd_goodsyntax : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rdd32 = asrrnd($Rss32,#$Ii)",
-tc_2b6f77c6, TypeS_2op>, Requires<[HasV5T]> {
+tc_2b6f77c6, TypeS_2op>, Requires<[HasV5]> {
let isPseudo = 1;
}
def S2_asr_i_r : HInst<
@@ -25086,7 +25148,7 @@ def S5_asrhub_rnd_sat : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
"$Rd32 = vasrhub($Rss32,#$Ii):raw",
-tc_2b6f77c6, TypeS_2op>, Enc_11a146, Requires<[HasV5T]> {
+tc_2b6f77c6, TypeS_2op>, Enc_11a146, Requires<[HasV5]> {
let Inst{7-5} = 0b100;
let Inst{13-12} = 0b00;
let Inst{31-21} = 0b10001000011;
@@ -25099,7 +25161,7 @@ def S5_asrhub_rnd_sat_goodsyntax : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
"$Rd32 = vasrhub($Rss32,#$Ii):rnd:sat",
-tc_2b6f77c6, TypeS_2op>, Requires<[HasV5T]> {
+tc_2b6f77c6, TypeS_2op>, Requires<[HasV5]> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -25108,7 +25170,7 @@ def S5_asrhub_sat : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
"$Rd32 = vasrhub($Rss32,#$Ii):sat",
-tc_2b6f77c6, TypeS_2op>, Enc_11a146, Requires<[HasV5T]> {
+tc_2b6f77c6, TypeS_2op>, Enc_11a146, Requires<[HasV5]> {
let Inst{7-5} = 0b101;
let Inst{13-12} = 0b00;
let Inst{31-21} = 0b10001000011;
@@ -25121,7 +25183,7 @@ def S5_popcountp : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = popcount($Rss32)",
-tc_00afc57e, TypeS_2op>, Enc_90cd8b, Requires<[HasV5T]> {
+tc_00afc57e, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
let Inst{13-5} = 0b000000011;
let Inst{31-21} = 0b10001000011;
let hasNewValue = 1;
@@ -25132,7 +25194,7 @@ def S5_vasrhrnd : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
"$Rdd32 = vasrh($Rss32,#$Ii):raw",
-tc_2b6f77c6, TypeS_2op>, Enc_12b6e9, Requires<[HasV5T]> {
+tc_2b6f77c6, TypeS_2op>, Enc_12b6e9, Requires<[HasV5]> {
let Inst{7-5} = 0b000;
let Inst{13-12} = 0b00;
let Inst{31-21} = 0b10000000001;
@@ -25142,14 +25204,14 @@ def S5_vasrhrnd_goodsyntax : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
"$Rdd32 = vasrh($Rss32,#$Ii):rnd",
-tc_2b6f77c6, TypeS_2op>, Requires<[HasV5T]> {
+tc_2b6f77c6, TypeS_2op>, Requires<[HasV5]> {
let isPseudo = 1;
}
def S6_allocframe_to_raw : HInst<
(outs),
(ins u11_3Imm:$Ii),
"allocframe(#$Ii)",
-tc_e216a5db, TypeMAPPING>, Requires<[HasV65T]> {
+tc_e216a5db, TypeMAPPING>, Requires<[HasV65]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -25157,7 +25219,7 @@ def S6_rol_i_p : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rdd32 = rol($Rss32,#$Ii)",
-tc_55050d58, TypeS_2op>, Enc_5eac98, Requires<[HasV60T]> {
+tc_55050d58, TypeS_2op>, Enc_5eac98, Requires<[HasV60]> {
let Inst{7-5} = 0b011;
let Inst{31-21} = 0b10000000000;
}
@@ -25165,7 +25227,7 @@ def S6_rol_i_p_acc : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rxx32 += rol($Rss32,#$Ii)",
-tc_41d5298e, TypeS_2op>, Enc_70fb07, Requires<[HasV60T]> {
+tc_41d5298e, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> {
let Inst{7-5} = 0b111;
let Inst{31-21} = 0b10000010000;
let prefersSlot3 = 1;
@@ -25175,7 +25237,7 @@ def S6_rol_i_p_and : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rxx32 &= rol($Rss32,#$Ii)",
-tc_41d5298e, TypeS_2op>, Enc_70fb07, Requires<[HasV60T]> {
+tc_41d5298e, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> {
let Inst{7-5} = 0b011;
let Inst{31-21} = 0b10000010010;
let prefersSlot3 = 1;
@@ -25185,7 +25247,7 @@ def S6_rol_i_p_nac : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rxx32 -= rol($Rss32,#$Ii)",
-tc_41d5298e, TypeS_2op>, Enc_70fb07, Requires<[HasV60T]> {
+tc_41d5298e, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> {
let Inst{7-5} = 0b011;
let Inst{31-21} = 0b10000010000;
let prefersSlot3 = 1;
@@ -25195,7 +25257,7 @@ def S6_rol_i_p_or : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rxx32 |= rol($Rss32,#$Ii)",
-tc_41d5298e, TypeS_2op>, Enc_70fb07, Requires<[HasV60T]> {
+tc_41d5298e, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> {
let Inst{7-5} = 0b111;
let Inst{31-21} = 0b10000010010;
let prefersSlot3 = 1;
@@ -25205,7 +25267,7 @@ def S6_rol_i_p_xacc : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rxx32 ^= rol($Rss32,#$Ii)",
-tc_41d5298e, TypeS_2op>, Enc_70fb07, Requires<[HasV60T]> {
+tc_41d5298e, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> {
let Inst{7-5} = 0b011;
let Inst{31-21} = 0b10000010100;
let prefersSlot3 = 1;
@@ -25215,7 +25277,7 @@ def S6_rol_i_r : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rd32 = rol($Rs32,#$Ii)",
-tc_55050d58, TypeS_2op>, Enc_a05677, Requires<[HasV60T]> {
+tc_55050d58, TypeS_2op>, Enc_a05677, Requires<[HasV60]> {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001100000;
@@ -25226,7 +25288,7 @@ def S6_rol_i_r_acc : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rx32 += rol($Rs32,#$Ii)",
-tc_41d5298e, TypeS_2op>, Enc_28a2dc, Requires<[HasV60T]> {
+tc_41d5298e, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001110000;
@@ -25239,7 +25301,7 @@ def S6_rol_i_r_and : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rx32 &= rol($Rs32,#$Ii)",
-tc_41d5298e, TypeS_2op>, Enc_28a2dc, Requires<[HasV60T]> {
+tc_41d5298e, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001110010;
@@ -25252,7 +25314,7 @@ def S6_rol_i_r_nac : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rx32 -= rol($Rs32,#$Ii)",
-tc_41d5298e, TypeS_2op>, Enc_28a2dc, Requires<[HasV60T]> {
+tc_41d5298e, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001110000;
@@ -25265,7 +25327,7 @@ def S6_rol_i_r_or : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rx32 |= rol($Rs32,#$Ii)",
-tc_41d5298e, TypeS_2op>, Enc_28a2dc, Requires<[HasV60T]> {
+tc_41d5298e, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001110010;
@@ -25278,7 +25340,7 @@ def S6_rol_i_r_xacc : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rx32 ^= rol($Rs32,#$Ii)",
-tc_41d5298e, TypeS_2op>, Enc_28a2dc, Requires<[HasV60T]> {
+tc_41d5298e, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001110100;
@@ -25291,7 +25353,7 @@ def S6_vsplatrbp : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = vsplatb($Rs32)",
-tc_be706f30, TypeS_2op>, Enc_3a3d62, Requires<[HasV62T]> {
+tc_be706f30, TypeS_2op>, Enc_3a3d62, Requires<[HasV62]> {
let Inst{13-5} = 0b000000100;
let Inst{31-21} = 0b10000100010;
}
@@ -25299,7 +25361,7 @@ def S6_vtrunehb_ppp : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vtrunehb($Rss32,$Rtt32)",
-tc_55050d58, TypeS_3op>, Enc_a56825, Requires<[HasV62T]> {
+tc_55050d58, TypeS_3op>, Enc_a56825, Requires<[HasV62]> {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000001100;
@@ -25308,7 +25370,7 @@ def S6_vtrunohb_ppp : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vtrunohb($Rss32,$Rtt32)",
-tc_55050d58, TypeS_3op>, Enc_a56825, Requires<[HasV62T]> {
+tc_55050d58, TypeS_3op>, Enc_a56825, Requires<[HasV62]> {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000001100;
@@ -26288,7 +26350,7 @@ def V6_ldntnt0 : HInst<
(outs HvxVR:$Vd32),
(ins IntRegs:$Rt32),
"$Vd32 = vmem($Rt32):nt",
-PSEUDO, TypeMAPPING>, Requires<[HasV62T]> {
+PSEUDO, TypeMAPPING>, Requires<[HasV62]> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -30301,7 +30363,7 @@ def V6_vasrhbrndsat_alt : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
"$Vd32 = vasrhb($Vu32,$Vv32,$Rt8):rnd:sat",
-tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60T]> {
+tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -30335,7 +30397,7 @@ def V6_vasrhubrndsat_alt : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
"$Vd32 = vasrhub($Vu32,$Vv32,$Rt8):rnd:sat",
-tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60T]> {
+tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -30357,7 +30419,7 @@ def V6_vasrhubsat_alt : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
"$Vd32 = vasrhub($Vu32,$Vv32,$Rt8):sat",
-tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60T]> {
+tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -30500,7 +30562,7 @@ def V6_vasrwh_alt : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
"$Vd32 = vasrwh($Vu32,$Vv32,$Rt8)",
-tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60T]> {
+tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -30522,7 +30584,7 @@ def V6_vasrwhrndsat_alt : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
"$Vd32 = vasrwh($Vu32,$Vv32,$Rt8):rnd:sat",
-tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60T]> {
+tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -30544,7 +30606,7 @@ def V6_vasrwhsat_alt : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
"$Vd32 = vasrwh($Vu32,$Vv32,$Rt8):sat",
-tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60T]> {
+tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -30578,7 +30640,7 @@ def V6_vasrwuhsat_alt : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
"$Vd32 = vasrwuh($Vu32,$Vv32,$Rt8):sat",
-tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60T]> {
+tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -36942,7 +37004,7 @@ def Y5_l2fetch : HInst<
(outs),
(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
"l2fetch($Rs32,$Rtt32)",
-tc_daa058fa, TypeST>, Enc_e6abcf, Requires<[HasV5T]> {
+tc_daa058fa, TypeST>, Enc_e6abcf, Requires<[HasV5]> {
let Inst{7-0} = 0b00000000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10100110100;
diff --git a/lib/Target/Hexagon/HexagonDepMappings.td b/lib/Target/Hexagon/HexagonDepMappings.td
index 7a156c39da9c..03c504ff0b08 100644
--- a/lib/Target/Hexagon/HexagonDepMappings.td
+++ b/lib/Target/Hexagon/HexagonDepMappings.td
@@ -26,6 +26,7 @@ def J2_jumpf_nopred_mapAlias : InstAlias<"if (!$Pu4) jump $Ii", (J2_jumpf PredRe
def J2_jumprf_nopred_mapAlias : InstAlias<"if (!$Pu4) jumpr $Rs32", (J2_jumprf PredRegs:$Pu4, IntRegs:$Rs32)>;
def J2_jumprt_nopred_mapAlias : InstAlias<"if ($Pu4) jumpr $Rs32", (J2_jumprt PredRegs:$Pu4, IntRegs:$Rs32)>;
def J2_jumpt_nopred_mapAlias : InstAlias<"if ($Pu4) jump $Ii", (J2_jumpt PredRegs:$Pu4, b30_2Imm:$Ii)>;
+def J2_trap1_noregmapAlias : InstAlias<"trap1(#$Ii)", (J2_trap1 R0, u8_0Imm:$Ii)>;
def L2_loadalignb_zomapAlias : InstAlias<"$Ryy32 = memb_fifo($Rs32)", (L2_loadalignb_io DoubleRegs:$Ryy32, IntRegs:$Rs32, 0)>;
def L2_loadalignh_zomapAlias : InstAlias<"$Ryy32 = memh_fifo($Rs32)", (L2_loadalignh_io DoubleRegs:$Ryy32, IntRegs:$Rs32, 0)>;
def L2_loadbsw2_zomapAlias : InstAlias<"$Rd32 = membh($Rs32)", (L2_loadbsw2_io IntRegs:$Rd32, IntRegs:$Rs32, 0)>;
diff --git a/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
index 0f1b9a4733c5..557e6384be6a 100644
--- a/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
+++ b/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
@@ -100,7 +100,7 @@ namespace llvm {
} // end namespace llvm
static cl::opt<bool> EnableHexagonBP("enable-hexagon-br-prob", cl::Hidden,
- cl::init(false), cl::desc("Enable branch probability info"));
+ cl::init(true), cl::desc("Enable branch probability info"));
static cl::opt<unsigned> SizeLimit("eif-limit", cl::init(6), cl::Hidden,
cl::desc("Size limit in Hexagon early if-conversion"));
static cl::opt<bool> SkipExitBranches("eif-no-loop-exit", cl::init(false),
@@ -191,6 +191,7 @@ namespace {
bool isProfitable(const FlowPattern &FP) const;
bool isPredicableStore(const MachineInstr *MI) const;
bool isSafeToSpeculate(const MachineInstr *MI) const;
+ bool isPredicate(unsigned R) const;
unsigned getCondStoreOpcode(unsigned Opc, bool IfTrue) const;
void predicateInstr(MachineBasicBlock *ToB, MachineBasicBlock::iterator At,
@@ -207,7 +208,6 @@ namespace {
void removeBlock(MachineBasicBlock *B);
void eliminatePhis(MachineBasicBlock *B);
- void replacePhiEdges(MachineBasicBlock *OldB, MachineBasicBlock *NewB);
void mergeBlocks(MachineBasicBlock *PredB, MachineBasicBlock *SuccB);
void simplifyFlowGraph(const FlowPattern &FP);
@@ -238,11 +238,12 @@ bool HexagonEarlyIfConversion::isPreheader(const MachineBasicBlock *B) const {
bool HexagonEarlyIfConversion::matchFlowPattern(MachineBasicBlock *B,
MachineLoop *L, FlowPattern &FP) {
- DEBUG(dbgs() << "Checking flow pattern at " << printMBBReference(*B) << "\n");
+ LLVM_DEBUG(dbgs() << "Checking flow pattern at " << printMBBReference(*B)
+ << "\n");
// Interested only in conditional branches, no .new, no new-value, etc.
// Check the terminators directly, it's easier than handling all responses
- // from AnalyzeBranch.
+ // from analyzeBranch.
MachineBasicBlock *TB = nullptr, *FB = nullptr;
MachineBasicBlock::const_iterator T1I = B->getFirstTerminator();
if (T1I == B->end())
@@ -325,17 +326,17 @@ bool HexagonEarlyIfConversion::matchFlowPattern(MachineBasicBlock *B,
}
// Don't try to predicate loop preheaders.
if ((TB && isPreheader(TB)) || (FB && isPreheader(FB))) {
- DEBUG(dbgs() << "One of blocks " << PrintMB(TB) << ", " << PrintMB(FB)
- << " is a loop preheader. Skipping.\n");
+ LLVM_DEBUG(dbgs() << "One of blocks " << PrintMB(TB) << ", " << PrintMB(FB)
+ << " is a loop preheader. Skipping.\n");
return false;
}
FP = FlowPattern(B, PredR, TB, FB, JB);
- DEBUG(dbgs() << "Detected " << PrintFP(FP, *TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "Detected " << PrintFP(FP, *TRI) << "\n");
return true;
}
-// KLUDGE: HexagonInstrInfo::AnalyzeBranch won't work on a block that
+// KLUDGE: HexagonInstrInfo::analyzeBranch won't work on a block that
// contains EH_LABEL.
bool HexagonEarlyIfConversion::hasEHLabel(const MachineBasicBlock *B) const {
for (auto &I : *B)
@@ -344,7 +345,7 @@ bool HexagonEarlyIfConversion::hasEHLabel(const MachineBasicBlock *B) const {
return false;
}
-// KLUDGE: HexagonInstrInfo::AnalyzeBranch may be unable to recognize
+// KLUDGE: HexagonInstrInfo::analyzeBranch may be unable to recognize
// that a block can never fall-through.
bool HexagonEarlyIfConversion::hasUncondBranch(const MachineBasicBlock *B)
const {
@@ -367,7 +368,7 @@ bool HexagonEarlyIfConversion::isValidCandidate(const MachineBasicBlock *B)
return false;
for (auto &MI : *B) {
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
if (MI.isConditionalBranch())
return false;
@@ -387,13 +388,8 @@ bool HexagonEarlyIfConversion::isValidCandidate(const MachineBasicBlock *B)
unsigned R = MO.getReg();
if (!TargetRegisterInfo::isVirtualRegister(R))
continue;
- switch (MRI->getRegClass(R)->getID()) {
- case Hexagon::PredRegsRegClassID:
- case Hexagon::HvxQRRegClassID:
- break;
- default:
- continue;
- }
+ if (!isPredicate(R))
+ continue;
for (auto U = MRI->use_begin(R); U != MRI->use_end(); ++U)
if (U->getParent()->isPHI())
return false;
@@ -443,8 +439,7 @@ bool HexagonEarlyIfConversion::isValid(const FlowPattern &FP) const {
if (usesUndefVReg(&MI))
return false;
unsigned DefR = MI.getOperand(0).getReg();
- const TargetRegisterClass *RC = MRI->getRegClass(DefR);
- if (RC == &Hexagon::PredRegsRegClass)
+ if (isPredicate(DefR))
return false;
}
}
@@ -500,7 +495,7 @@ unsigned HexagonEarlyIfConversion::countPredicateDefs(
unsigned R = MO.getReg();
if (!TargetRegisterInfo::isVirtualRegister(R))
continue;
- if (MRI->getRegClass(R) == &Hexagon::PredRegsRegClass)
+ if (isPredicate(R))
PredDefs++;
}
}
@@ -508,10 +503,21 @@ unsigned HexagonEarlyIfConversion::countPredicateDefs(
}
bool HexagonEarlyIfConversion::isProfitable(const FlowPattern &FP) const {
+ BranchProbability JumpProb(1, 10);
+ BranchProbability Prob(9, 10);
+ if (MBPI && FP.TrueB && !FP.FalseB &&
+ (MBPI->getEdgeProbability(FP.SplitB, FP.TrueB) < JumpProb ||
+ MBPI->getEdgeProbability(FP.SplitB, FP.TrueB) > Prob))
+ return false;
+
+ if (MBPI && !FP.TrueB && FP.FalseB &&
+ (MBPI->getEdgeProbability(FP.SplitB, FP.FalseB) < JumpProb ||
+ MBPI->getEdgeProbability(FP.SplitB, FP.FalseB) > Prob))
+ return false;
+
if (FP.TrueB && FP.FalseB) {
    // Do not IfConvert if the branch is one-sided.
if (MBPI) {
- BranchProbability Prob(9, 10);
if (MBPI->getEdgeProbability(FP.SplitB, FP.TrueB) > Prob)
return false;
if (MBPI->getEdgeProbability(FP.SplitB, FP.FalseB) > Prob)
@@ -546,8 +552,9 @@ bool HexagonEarlyIfConversion::isProfitable(const FlowPattern &FP) const {
};
unsigned Spare = 0;
unsigned TotalIn = TotalCount(FP.TrueB, Spare) + TotalCount(FP.FalseB, Spare);
- DEBUG(dbgs() << "Total number of instructions to be predicated/speculated: "
- << TotalIn << ", spare room: " << Spare << "\n");
+ LLVM_DEBUG(
+ dbgs() << "Total number of instructions to be predicated/speculated: "
+ << TotalIn << ", spare room: " << Spare << "\n");
if (TotalIn >= SizeLimit+Spare)
return false;
@@ -574,12 +581,13 @@ bool HexagonEarlyIfConversion::isProfitable(const FlowPattern &FP) const {
PredDefs += countPredicateDefs(SB);
}
}
- DEBUG(dbgs() << "Total number of extra muxes from converted phis: "
- << TotalPh << "\n");
+ LLVM_DEBUG(dbgs() << "Total number of extra muxes from converted phis: "
+ << TotalPh << "\n");
if (TotalIn+TotalPh >= SizeLimit+Spare)
return false;
- DEBUG(dbgs() << "Total number of predicate registers: " << PredDefs << "\n");
+ LLVM_DEBUG(dbgs() << "Total number of predicate registers: " << PredDefs
+ << "\n");
if (PredDefs > 4)
return false;
@@ -620,11 +628,11 @@ bool HexagonEarlyIfConversion::visitBlock(MachineBasicBlock *B,
return Changed;
if (!isValid(FP)) {
- DEBUG(dbgs() << "Conversion is not valid\n");
+ LLVM_DEBUG(dbgs() << "Conversion is not valid\n");
return Changed;
}
if (!isProfitable(FP)) {
- DEBUG(dbgs() << "Conversion is not profitable\n");
+ LLVM_DEBUG(dbgs() << "Conversion is not profitable\n");
return Changed;
}
@@ -635,8 +643,9 @@ bool HexagonEarlyIfConversion::visitBlock(MachineBasicBlock *B,
bool HexagonEarlyIfConversion::visitLoop(MachineLoop *L) {
MachineBasicBlock *HB = L ? L->getHeader() : nullptr;
- DEBUG((L ? dbgs() << "Visiting loop H:" << PrintMB(HB)
- : dbgs() << "Visiting function") << "\n");
+ LLVM_DEBUG((L ? dbgs() << "Visiting loop H:" << PrintMB(HB)
+ : dbgs() << "Visiting function")
+ << "\n");
bool Changed = false;
if (L) {
for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I)
@@ -680,10 +689,18 @@ bool HexagonEarlyIfConversion::isSafeToSpeculate(const MachineInstr *MI)
return false;
if (MI->hasUnmodeledSideEffects())
return false;
+ if (MI->getOpcode() == TargetOpcode::LIFETIME_END)
+ return false;
return true;
}
+bool HexagonEarlyIfConversion::isPredicate(unsigned R) const {
+ const TargetRegisterClass *RC = MRI->getRegClass(R);
+ return RC == &Hexagon::PredRegsRegClass ||
+ RC == &Hexagon::HvxQRRegClass;
+}
+
unsigned HexagonEarlyIfConversion::getCondStoreOpcode(unsigned Opc,
bool IfTrue) const {
return HII->getCondOpcode(Opc, !IfTrue);
@@ -745,7 +762,7 @@ void HexagonEarlyIfConversion::predicateInstr(MachineBasicBlock *ToB,
void HexagonEarlyIfConversion::predicateBlockNB(MachineBasicBlock *ToB,
MachineBasicBlock::iterator At, MachineBasicBlock *FromB,
unsigned PredR, bool IfTrue) {
- DEBUG(dbgs() << "Predicating block " << PrintMB(FromB) << "\n");
+ LLVM_DEBUG(dbgs() << "Predicating block " << PrintMB(FromB) << "\n");
MachineBasicBlock::iterator End = FromB->getFirstTerminator();
MachineBasicBlock::iterator I, NextI;
@@ -765,9 +782,11 @@ unsigned HexagonEarlyIfConversion::buildMux(MachineBasicBlock *B,
unsigned Opc = 0;
switch (DRC->getID()) {
case Hexagon::IntRegsRegClassID:
+ case Hexagon::IntRegsLow8RegClassID:
Opc = Hexagon::C2_mux;
break;
case Hexagon::DoubleRegsRegClassID:
+ case Hexagon::GeneralDoubleLow8RegsRegClassID:
Opc = Hexagon::PS_pselect;
break;
case Hexagon::HvxVRRegClassID:
@@ -935,7 +954,7 @@ void HexagonEarlyIfConversion::convert(const FlowPattern &FP) {
}
void HexagonEarlyIfConversion::removeBlock(MachineBasicBlock *B) {
- DEBUG(dbgs() << "Removing block " << PrintMB(B) << "\n");
+ LLVM_DEBUG(dbgs() << "Removing block " << PrintMB(B) << "\n");
// Transfer the immediate dominator information from B to its descendants.
MachineDomTreeNode *N = MDT->getNode(B);
@@ -965,7 +984,7 @@ void HexagonEarlyIfConversion::removeBlock(MachineBasicBlock *B) {
}
void HexagonEarlyIfConversion::eliminatePhis(MachineBasicBlock *B) {
- DEBUG(dbgs() << "Removing phi nodes from block " << PrintMB(B) << "\n");
+ LLVM_DEBUG(dbgs() << "Removing phi nodes from block " << PrintMB(B) << "\n");
MachineBasicBlock::iterator I, NextI, NonPHI = B->getFirstNonPHI();
for (I = B->begin(); I != NonPHI; I = NextI) {
NextI = std::next(I);
@@ -990,34 +1009,16 @@ void HexagonEarlyIfConversion::eliminatePhis(MachineBasicBlock *B) {
}
}
-void HexagonEarlyIfConversion::replacePhiEdges(MachineBasicBlock *OldB,
- MachineBasicBlock *NewB) {
- for (auto I = OldB->succ_begin(), E = OldB->succ_end(); I != E; ++I) {
- MachineBasicBlock *SB = *I;
- MachineBasicBlock::iterator P, N = SB->getFirstNonPHI();
- for (P = SB->begin(); P != N; ++P) {
- MachineInstr &PN = *P;
- for (MachineOperand &MO : PN.operands())
- if (MO.isMBB() && MO.getMBB() == OldB)
- MO.setMBB(NewB);
- }
- }
-}
-
void HexagonEarlyIfConversion::mergeBlocks(MachineBasicBlock *PredB,
MachineBasicBlock *SuccB) {
- DEBUG(dbgs() << "Merging blocks " << PrintMB(PredB) << " and "
- << PrintMB(SuccB) << "\n");
+ LLVM_DEBUG(dbgs() << "Merging blocks " << PrintMB(PredB) << " and "
+ << PrintMB(SuccB) << "\n");
bool TermOk = hasUncondBranch(SuccB);
eliminatePhis(SuccB);
HII->removeBranch(*PredB);
PredB->removeSuccessor(SuccB);
PredB->splice(PredB->end(), SuccB, SuccB->begin(), SuccB->end());
- MachineBasicBlock::succ_iterator I, E = SuccB->succ_end();
- for (I = SuccB->succ_begin(); I != E; ++I)
- PredB->addSuccessor(*I);
- PredB->normalizeSuccProbs();
- replacePhiEdges(SuccB, PredB);
+ PredB->transferSuccessorsAndUpdatePHIs(SuccB);
removeBlock(SuccB);
if (!TermOk)
PredB->updateTerminator();
@@ -1039,7 +1040,7 @@ void HexagonEarlyIfConversion::simplifyFlowGraph(const FlowPattern &FP) {
// By now, the split block has only one successor (SB), and SB has only
// one predecessor. We can try to merge them. We will need to update ter-
- // minators in FP.Split+SB, and that requires working AnalyzeBranch, which
+ // minators in FP.Split+SB, and that requires working analyzeBranch, which
// fails on Hexagon for blocks that have EH_LABELs. However, if SB ends
// with an unconditional branch, we won't need to touch the terminators.
if (!hasEHLabel(SB) || hasUncondBranch(SB))
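The HexagonEarlyIfConv hunk above adds a profitability gate: a one-sided flow pattern is rejected when the edge probability toward the lone side lies outside the (1/10, 9/10) window, so heavily biased branches are left for the branch predictor instead of being predicated. Below is a minimal standalone sketch of that window check using plain numerator/denominator pairs rather than LLVM's BranchProbability; the names and the helper are illustrative only, not the pass's API.

#include <utility>

// Edge probability as a numerator/denominator pair, mirroring the (1,10)
// and (9,10) thresholds used in the hunk above.
using Prob = std::pair<unsigned, unsigned>;

static bool lessThan(Prob A, Prob B) {
  // A.first/A.second < B.first/B.second, compared without floating point.
  return 1ULL * A.first * B.second < 1ULL * B.first * A.second;
}

// Not profitable when only one side of the branch exists and its edge
// probability falls outside the (1/10, 9/10) window.
bool worthConvertingOneSided(Prob EdgeProb) {
  const Prob JumpProb{1, 10}, HighProb{9, 10};
  return !lessThan(EdgeProb, JumpProb) && !lessThan(HighProb, EdgeProb);
}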
diff --git a/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/lib/Target/Hexagon/HexagonExpandCondsets.cpp
index c2feaf5737b2..7e774674e0c0 100644
--- a/lib/Target/Hexagon/HexagonExpandCondsets.cpp
+++ b/lib/Target/Hexagon/HexagonExpandCondsets.cpp
@@ -316,8 +316,10 @@ void HexagonExpandCondsets::updateKillFlags(unsigned Reg) {
auto KillAt = [this,Reg] (SlotIndex K, LaneBitmask LM) -> void {
// Set the <kill> flag on a use of Reg whose lane mask is contained in LM.
MachineInstr *MI = LIS->getInstructionFromIndex(K);
- for (auto &Op : MI->operands()) {
- if (!Op.isReg() || !Op.isUse() || Op.getReg() != Reg)
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI->getOperand(i);
+ if (!Op.isReg() || !Op.isUse() || Op.getReg() != Reg ||
+ MI->isRegTiedToDefOperand(i))
continue;
LaneBitmask SLM = getLaneMask(Reg, Op.getSubReg());
if ((SLM & LM) == SLM) {
@@ -497,14 +499,18 @@ void HexagonExpandCondsets::updateDeadsInRange(unsigned Reg, LaneBitmask LM,
if (!Op.isReg() || !DefRegs.count(Op))
continue;
if (Op.isDef()) {
- ImpUses.insert({Op, i});
+ // Tied defs will always have corresponding uses, so no extra
+ // implicit uses are needed.
+ if (!Op.isTied())
+ ImpUses.insert({Op, i});
} else {
// This function can be called for the same register with different
// lane masks. If the def in this instruction was for the whole
// register, we can get here more than once. Avoid adding multiple
// implicit uses (or adding an implicit use when an explicit one is
// present).
- ImpUses.erase(Op);
+ if (Op.isTied())
+ ImpUses.erase(Op);
}
}
if (ImpUses.empty())
@@ -545,7 +551,14 @@ void HexagonExpandCondsets::removeInstr(MachineInstr &MI) {
void HexagonExpandCondsets::updateLiveness(std::set<unsigned> &RegSet,
bool Recalc, bool UpdateKills, bool UpdateDeads) {
UpdateKills |= UpdateDeads;
- for (auto R : RegSet) {
+ for (unsigned R : RegSet) {
+ if (!TargetRegisterInfo::isVirtualRegister(R)) {
+ assert(TargetRegisterInfo::isPhysicalRegister(R));
+ // There shouldn't be any physical registers as operands, except
+ // possibly reserved registers.
+ assert(MRI->isReserved(R));
+ continue;
+ }
if (Recalc)
recalculateLiveInterval(R);
if (UpdateKills)
@@ -641,7 +654,7 @@ MachineInstr *HexagonExpandCondsets::genCondTfrFor(MachineOperand &SrcOp,
.add(SrcOp);
}
- DEBUG(dbgs() << "created an initial copy: " << *MIB);
+ LLVM_DEBUG(dbgs() << "created an initial copy: " << *MIB);
return &*MIB;
}
@@ -654,8 +667,8 @@ bool HexagonExpandCondsets::split(MachineInstr &MI,
return false;
TfrCounter++;
}
- DEBUG(dbgs() << "\nsplitting " << printMBBReference(*MI.getParent()) << ": "
- << MI);
+ LLVM_DEBUG(dbgs() << "\nsplitting " << printMBBReference(*MI.getParent())
+ << ": " << MI);
MachineOperand &MD = MI.getOperand(0); // Definition
MachineOperand &MP = MI.getOperand(1); // Predicate register
assert(MD.isDef());
@@ -932,8 +945,8 @@ bool HexagonExpandCondsets::predicate(MachineInstr &TfrI, bool Cond,
unsigned Opc = TfrI.getOpcode();
(void)Opc;
assert(Opc == Hexagon::A2_tfrt || Opc == Hexagon::A2_tfrf);
- DEBUG(dbgs() << "\nattempt to predicate if-" << (Cond ? "true" : "false")
- << ": " << TfrI);
+ LLVM_DEBUG(dbgs() << "\nattempt to predicate if-" << (Cond ? "true" : "false")
+ << ": " << TfrI);
MachineOperand &MD = TfrI.getOperand(0);
MachineOperand &MP = TfrI.getOperand(1);
@@ -954,7 +967,7 @@ bool HexagonExpandCondsets::predicate(MachineInstr &TfrI, bool Cond,
if (!DefI || !isPredicable(DefI))
return false;
- DEBUG(dbgs() << "Source def: " << *DefI);
+ LLVM_DEBUG(dbgs() << "Source def: " << *DefI);
// Collect the information about registers defined and used between the
// DefI and the TfrI.
@@ -1039,8 +1052,8 @@ bool HexagonExpandCondsets::predicate(MachineInstr &TfrI, bool Cond,
if (!canMoveMemTo(*DefI, TfrI, true))
CanDown = false;
- DEBUG(dbgs() << "Can move up: " << (CanUp ? "yes" : "no")
- << ", can move down: " << (CanDown ? "yes\n" : "no\n"));
+ LLVM_DEBUG(dbgs() << "Can move up: " << (CanUp ? "yes" : "no")
+ << ", can move down: " << (CanDown ? "yes\n" : "no\n"));
MachineBasicBlock::iterator PastDefIt = std::next(DefIt);
if (CanUp)
predicateAt(MD, *DefI, PastDefIt, MP, Cond, UpdRegs);
@@ -1135,10 +1148,10 @@ bool HexagonExpandCondsets::coalesceRegisters(RegisterRef R1, RegisterRef R2) {
return false;
bool Overlap = L1.overlaps(L2);
- DEBUG(dbgs() << "compatible registers: ("
- << (Overlap ? "overlap" : "disjoint") << ")\n "
- << printReg(R1.Reg, TRI, R1.Sub) << " " << L1 << "\n "
- << printReg(R2.Reg, TRI, R2.Sub) << " " << L2 << "\n");
+ LLVM_DEBUG(dbgs() << "compatible registers: ("
+ << (Overlap ? "overlap" : "disjoint") << ")\n "
+ << printReg(R1.Reg, TRI, R1.Sub) << " " << L1 << "\n "
+ << printReg(R2.Reg, TRI, R2.Sub) << " " << L2 << "\n");
if (R1.Sub || R2.Sub)
return false;
if (Overlap)
@@ -1171,7 +1184,7 @@ bool HexagonExpandCondsets::coalesceRegisters(RegisterRef R1, RegisterRef R2) {
LIS->removeInterval(R2.Reg);
updateKillFlags(R1.Reg);
- DEBUG(dbgs() << "coalesced: " << L1 << "\n");
+ LLVM_DEBUG(dbgs() << "coalesced: " << L1 << "\n");
L1.verify();
return true;
@@ -1252,8 +1265,8 @@ bool HexagonExpandCondsets::runOnMachineFunction(MachineFunction &MF) {
LIS = &getAnalysis<LiveIntervals>();
MRI = &MF.getRegInfo();
- DEBUG(LIS->print(dbgs() << "Before expand-condsets\n",
- MF.getFunction().getParent()));
+ LLVM_DEBUG(LIS->print(dbgs() << "Before expand-condsets\n",
+ MF.getFunction().getParent()));
bool Changed = false;
std::set<unsigned> CoalUpd, PredUpd;
@@ -1280,8 +1293,8 @@ bool HexagonExpandCondsets::runOnMachineFunction(MachineFunction &MF) {
if (!CoalUpd.count(Op.getReg()))
KillUpd.insert(Op.getReg());
updateLiveness(KillUpd, false, true, false);
- DEBUG(LIS->print(dbgs() << "After coalescing\n",
- MF.getFunction().getParent()));
+ LLVM_DEBUG(
+ LIS->print(dbgs() << "After coalescing\n", MF.getFunction().getParent()));
// First, simply split all muxes into a pair of conditional transfers
// and update the live intervals to reflect the new arrangement. The
@@ -1297,8 +1310,8 @@ bool HexagonExpandCondsets::runOnMachineFunction(MachineFunction &MF) {
// predication, and after splitting they are difficult to recalculate
// (because of predicated defs), so make sure they are left untouched.
// Predication does not use live intervals.
- DEBUG(LIS->print(dbgs() << "After splitting\n",
- MF.getFunction().getParent()));
+ LLVM_DEBUG(
+ LIS->print(dbgs() << "After splitting\n", MF.getFunction().getParent()));
// Traverse all blocks and collapse predicable instructions feeding
// conditional transfers into predicated instructions.
@@ -1306,13 +1319,13 @@ bool HexagonExpandCondsets::runOnMachineFunction(MachineFunction &MF) {
// cases that were not created in the previous step.
for (auto &B : MF)
Changed |= predicateInBlock(B, PredUpd);
- DEBUG(LIS->print(dbgs() << "After predicating\n",
- MF.getFunction().getParent()));
+ LLVM_DEBUG(LIS->print(dbgs() << "After predicating\n",
+ MF.getFunction().getParent()));
PredUpd.insert(CoalUpd.begin(), CoalUpd.end());
updateLiveness(PredUpd, true, true, true);
- DEBUG({
+ LLVM_DEBUG({
if (Changed)
LIS->print(dbgs() << "After expand-condsets\n",
MF.getFunction().getParent());
@@ -1324,7 +1337,6 @@ bool HexagonExpandCondsets::runOnMachineFunction(MachineFunction &MF) {
//===----------------------------------------------------------------------===//
// Public Constructor Functions
//===----------------------------------------------------------------------===//
-
FunctionPass *llvm::createHexagonExpandCondsets() {
return new HexagonExpandCondsets();
}
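The HexagonExpandCondsets hunks above both special-case tied operands: a use that is tied to a def in the same instruction cannot be marked as a kill, and a tied def already implies a use, so no extra implicit use is inserted for it. The following is a small sketch of that filter over a simplified operand record; the struct and function are illustrative stand-ins, not MachineOperand or the pass's code.

#include <vector>

// Simplified stand-in for a machine operand; the real pass queries
// MachineInstr/MachineOperand for this information.
struct Operand {
  unsigned Reg = 0;
  bool IsUse = false;
  bool IsTiedToDef = false;  // tied to a def in the same instruction
  bool Kill = false;
};

// Set the kill flag on uses of Reg, but never on a use tied to a def: the
// same instruction rewrites the register, so its value does not die there.
void setKillOnUntiedUses(std::vector<Operand> &Ops, unsigned Reg) {
  for (Operand &Op : Ops) {
    if (!Op.IsUse || Op.Reg != Reg || Op.IsTiedToDef)
      continue;
    Op.Kill = true;
  }
}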
diff --git a/lib/Target/Hexagon/HexagonFixupHwLoops.cpp b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
index a842b672736c..e9067e2285a8 100644
--- a/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
+++ b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
@@ -20,6 +20,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/PassSupport.h"
using namespace llvm;
@@ -59,12 +60,12 @@ namespace {
}
private:
- /// \brief Check the offset between each loop instruction and
+ /// Check the offset between each loop instruction and
/// the loop basic block to determine if we can use the LOOP instruction
/// or if we need to set the LC/SA registers explicitly.
bool fixupLoopInstrs(MachineFunction &MF);
- /// \brief Replace loop instruction with the constant extended
+ /// Replace loop instruction with the constant extended
/// version if the loop label is too far from the loop instruction.
void useExtLoopInstr(MachineFunction &MF,
MachineBasicBlock::iterator &MII);
@@ -80,7 +81,7 @@ FunctionPass *llvm::createHexagonFixupHwLoops() {
return new HexagonFixupHwLoops();
}
-/// \brief Returns true if the instruction is a hardware loop instruction.
+/// Returns true if the instruction is a hardware loop instruction.
static bool isHardwareLoop(const MachineInstr &MI) {
return MI.getOpcode() == Hexagon::J2_loop0r ||
MI.getOpcode() == Hexagon::J2_loop0i ||
@@ -94,7 +95,7 @@ bool HexagonFixupHwLoops::runOnMachineFunction(MachineFunction &MF) {
return fixupLoopInstrs(MF);
}
-/// \brief For Hexagon, if the loop label is to far from the
+/// For Hexagon, if the loop label is too far from the
/// loop instruction then we need to set the LC0 and SA0 registers
/// explicitly instead of using LOOP(start,count). This function
/// checks the distance, and generates register assignments if needed.
@@ -137,7 +138,7 @@ bool HexagonFixupHwLoops::fixupLoopInstrs(MachineFunction &MF) {
MachineBasicBlock::iterator MII = MBB.begin();
MachineBasicBlock::iterator MIE = MBB.end();
while (MII != MIE) {
- InstOffset += HII->getSize(*MII);
+ unsigned InstSize = HII->getSize(*MII);
if (MII->isMetaInstruction()) {
++MII;
continue;
@@ -145,8 +146,10 @@ bool HexagonFixupHwLoops::fixupLoopInstrs(MachineFunction &MF) {
if (isHardwareLoop(*MII)) {
assert(MII->getOperand(0).isMBB() &&
"Expect a basic block as loop operand");
- int diff = InstOffset - BlockToInstOffset[MII->getOperand(0).getMBB()];
- if ((unsigned)abs(diff) > MaxLoopRange) {
+ MachineBasicBlock *TargetBB = MII->getOperand(0).getMBB();
+ unsigned Diff = AbsoluteDifference(InstOffset,
+ BlockToInstOffset[TargetBB]);
+ if (Diff > MaxLoopRange) {
useExtLoopInstr(MF, MII);
MII = MBB.erase(MII);
Changed = true;
@@ -156,13 +159,14 @@ bool HexagonFixupHwLoops::fixupLoopInstrs(MachineFunction &MF) {
} else {
++MII;
}
+ InstOffset += InstSize;
}
}
return Changed;
}
-/// \brief Replace loop instructions with the constant extended version.
+/// Replace loop instructions with the constant extended version.
void HexagonFixupHwLoops::useExtLoopInstr(MachineFunction &MF,
MachineBasicBlock::iterator &MII) {
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
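The fixupLoopInstrs hunk above computes the distance between the LOOP instruction's byte offset and the target block's offset as an unsigned difference (via AbsoluteDifference from llvm/Support/MathExtras.h) instead of casting a signed subtraction through abs(). A self-contained sketch of the same range test follows; the function and parameter names are illustrative, not the pass's.

#include <algorithm>
#include <cstdint>

// Unsigned distance between two byte offsets: max - min, no signed overflow.
static uint64_t absoluteDifference(uint64_t A, uint64_t B) {
  return std::max(A, B) - std::min(A, B);
}

// True when a LOOP instruction at InstOffset can still reach a loop head at
// TargetOffset; otherwise the pass rewrites it to the constant-extended form.
bool loopTargetInRange(uint64_t InstOffset, uint64_t TargetOffset,
                       uint64_t MaxLoopRange) {
  return absoluteDifference(InstOffset, TargetOffset) <= MaxLoopRange;
}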
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp
index 65a2fc35b11b..97b02e2b34cb 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -442,7 +442,7 @@ void HexagonFrameLowering::findShrunkPrologEpilog(MachineFunction &MF,
if (needsStackFrame(I, CSR, HRI))
SFBlocks.push_back(&I);
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Blocks needing SF: {";
for (auto &B : SFBlocks)
dbgs() << " " << printMBBReference(*B);
@@ -465,7 +465,7 @@ void HexagonFrameLowering::findShrunkPrologEpilog(MachineFunction &MF,
if (!PDomB)
break;
}
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Computed dom block: ";
if (DomB)
dbgs() << printMBBReference(*DomB);
@@ -483,11 +483,11 @@ void HexagonFrameLowering::findShrunkPrologEpilog(MachineFunction &MF,
// Make sure that DomB dominates PDomB and PDomB post-dominates DomB.
if (!MDT.dominates(DomB, PDomB)) {
- DEBUG(dbgs() << "Dom block does not dominate pdom block\n");
+ LLVM_DEBUG(dbgs() << "Dom block does not dominate pdom block\n");
return;
}
if (!MPT.dominates(PDomB, DomB)) {
- DEBUG(dbgs() << "PDom block does not post-dominate dom block\n");
+ LLVM_DEBUG(dbgs() << "PDom block does not post-dominate dom block\n");
return;
}
@@ -1396,7 +1396,7 @@ static void dump_registers(BitVector &Regs, const TargetRegisterInfo &TRI) {
bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF,
const TargetRegisterInfo *TRI, std::vector<CalleeSavedInfo> &CSI) const {
- DEBUG(dbgs() << __func__ << " on " << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << __func__ << " on " << MF.getName() << '\n');
MachineFrameInfo &MFI = MF.getFrameInfo();
BitVector SRegs(Hexagon::NUM_TARGET_REGS);
@@ -1406,15 +1406,16 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF,
// (1) For each callee-saved register, add that register and all of its
// sub-registers to SRegs.
- DEBUG(dbgs() << "Initial CS registers: {");
+ LLVM_DEBUG(dbgs() << "Initial CS registers: {");
for (unsigned i = 0, n = CSI.size(); i < n; ++i) {
unsigned R = CSI[i].getReg();
- DEBUG(dbgs() << ' ' << printReg(R, TRI));
+ LLVM_DEBUG(dbgs() << ' ' << printReg(R, TRI));
for (MCSubRegIterator SR(R, TRI, true); SR.isValid(); ++SR)
SRegs[*SR] = true;
}
- DEBUG(dbgs() << " }\n");
- DEBUG(dbgs() << "SRegs.1: "; dump_registers(SRegs, *TRI); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " }\n");
+ LLVM_DEBUG(dbgs() << "SRegs.1: "; dump_registers(SRegs, *TRI);
+ dbgs() << "\n");
// (2) For each reserved register, remove that register and all of its
// sub- and super-registers from SRegs.
@@ -1424,8 +1425,10 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF,
for (MCSuperRegIterator SR(R, TRI, true); SR.isValid(); ++SR)
SRegs[*SR] = false;
}
- DEBUG(dbgs() << "Res: "; dump_registers(Reserved, *TRI); dbgs() << "\n");
- DEBUG(dbgs() << "SRegs.2: "; dump_registers(SRegs, *TRI); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Res: "; dump_registers(Reserved, *TRI);
+ dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "SRegs.2: "; dump_registers(SRegs, *TRI);
+ dbgs() << "\n");
// (3) Collect all registers that have at least one sub-register in SRegs,
// and also have no sub-registers that are reserved. These will be the can-
@@ -1446,11 +1449,13 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF,
break;
}
}
- DEBUG(dbgs() << "TmpSup: "; dump_registers(TmpSup, *TRI); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "TmpSup: "; dump_registers(TmpSup, *TRI);
+ dbgs() << "\n");
// (4) Include all super-registers found in (3) into SRegs.
SRegs |= TmpSup;
- DEBUG(dbgs() << "SRegs.4: "; dump_registers(SRegs, *TRI); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "SRegs.4: "; dump_registers(SRegs, *TRI);
+ dbgs() << "\n");
// (5) For each register R in SRegs, if any super-register of R is in SRegs,
// remove R from SRegs.
@@ -1463,7 +1468,8 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF,
break;
}
}
- DEBUG(dbgs() << "SRegs.5: "; dump_registers(SRegs, *TRI); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "SRegs.5: "; dump_registers(SRegs, *TRI);
+ dbgs() << "\n");
// Now, for each register that has a fixed stack slot, create the stack
// object for it.
@@ -1501,7 +1507,7 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF,
SRegs[R] = false;
}
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "CS information: {";
for (unsigned i = 0, n = CSI.size(); i < n; ++i) {
int FI = CSI[i].getFrameIdx();
@@ -1706,11 +1712,6 @@ bool HexagonFrameLowering::expandStoreVec2(MachineBasicBlock &B,
for (auto R = B.begin(); R != It; ++R) {
Clobbers.clear();
LPR.stepForward(*R, Clobbers);
- // Dead defs are recorded in Clobbers, but are not automatically removed
- // from the live set.
- for (auto &C : Clobbers)
- if (C.second->isReg() && C.second->isDead())
- LPR.removeReg(C.first);
}
DebugLoc DL = MI->getDebugLoc();
@@ -1867,11 +1868,11 @@ bool HexagonFrameLowering::expandSpillMacros(MachineFunction &MF,
Changed |= expandCopy(B, I, MRI, HII, NewRegs);
break;
case Hexagon::STriw_pred:
- case Hexagon::STriw_mod:
+ case Hexagon::STriw_ctr:
Changed |= expandStoreInt(B, I, MRI, HII, NewRegs);
break;
case Hexagon::LDriw_pred:
- case Hexagon::LDriw_mod:
+ case Hexagon::LDriw_ctr:
Changed |= expandLoadInt(B, I, MRI, HII, NewRegs);
break;
case Hexagon::PS_vstorerq_ai:
@@ -1914,7 +1915,7 @@ void HexagonFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (OptimizeSpillSlots && !isOptNone(MF))
optimizeSpillSlots(MF, NewRegs);
- // We need to reserve a a spill slot if scavenging could potentially require
+ // We need to reserve a spill slot if scavenging could potentially require
// spilling a scavenged register.
if (!NewRegs.empty() || mayOverflowFrameOffset(MF)) {
MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -2026,8 +2027,8 @@ void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF,
auto P = BlockIndexes.insert(
std::make_pair(&B, HexagonBlockRanges::InstrIndexMap(B)));
auto &IndexMap = P.first->second;
- DEBUG(dbgs() << "Index map for " << printMBBReference(B) << "\n"
- << IndexMap << '\n');
+ LLVM_DEBUG(dbgs() << "Index map for " << printMBBReference(B) << "\n"
+ << IndexMap << '\n');
for (auto &In : B) {
int LFI, SFI;
@@ -2134,7 +2135,7 @@ void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF,
}
}
- DEBUG({
+ LLVM_DEBUG({
for (auto &P : FIRangeMap) {
dbgs() << "fi#" << P.first;
if (BadFIs.count(P.first))
@@ -2173,7 +2174,7 @@ void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF,
}
}
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Block-to-FI map (* -- live-on-exit):\n";
for (auto &P : BlockFIMap) {
auto &FIs = P.second;
@@ -2200,16 +2201,16 @@ void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF,
HexagonBlockRanges::InstrIndexMap &IM = F->second;
HexagonBlockRanges::RegToRangeMap LM = HBR.computeLiveMap(IM);
HexagonBlockRanges::RegToRangeMap DM = HBR.computeDeadMap(IM, LM);
- DEBUG(dbgs() << printMBBReference(B) << " dead map\n"
- << HexagonBlockRanges::PrintRangeMap(DM, HRI));
+ LLVM_DEBUG(dbgs() << printMBBReference(B) << " dead map\n"
+ << HexagonBlockRanges::PrintRangeMap(DM, HRI));
for (auto FI : BlockFIMap[&B]) {
if (BadFIs.count(FI))
continue;
- DEBUG(dbgs() << "Working on fi#" << FI << '\n');
+ LLVM_DEBUG(dbgs() << "Working on fi#" << FI << '\n');
HexagonBlockRanges::RangeList &RL = FIRangeMap[FI].Map[&B];
for (auto &Range : RL) {
- DEBUG(dbgs() << "--Examining range:" << RL << '\n');
+ LLVM_DEBUG(dbgs() << "--Examining range:" << RL << '\n');
if (!IndexType::isInstr(Range.start()) ||
!IndexType::isInstr(Range.end()))
continue;
@@ -2224,7 +2225,8 @@ void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF,
auto *RC = HII.getRegClass(SI.getDesc(), 2, &HRI, MF);
// The this-> is needed to unconfuse MSVC.
unsigned FoundR = this->findPhysReg(MF, Range, IM, DM, RC);
- DEBUG(dbgs() << "Replacement reg:" << printReg(FoundR, &HRI) << '\n');
+ LLVM_DEBUG(dbgs() << "Replacement reg:" << printReg(FoundR, &HRI)
+ << '\n');
if (FoundR == 0)
continue;
#ifndef NDEBUG
diff --git a/lib/Target/Hexagon/HexagonGatherPacketize.cpp b/lib/Target/Hexagon/HexagonGatherPacketize.cpp
index 253f09d12839..63ec9c3d3124 100644
--- a/lib/Target/Hexagon/HexagonGatherPacketize.cpp
+++ b/lib/Target/Hexagon/HexagonGatherPacketize.cpp
@@ -62,7 +62,7 @@ bool HexagonGatherPacketize::runOnMachineFunction(MachineFunction &Fn) {
if (!EnableGatherPacketize)
return false;
auto &ST = Fn.getSubtarget<HexagonSubtarget>();
- bool HasV65 = ST.hasV65TOps();
+ bool HasV65 = ST.hasV65Ops();
bool UseHVX = ST.useHVXOps();
if (!(HasV65 & UseHVX))
return false;
diff --git a/lib/Target/Hexagon/HexagonGenInsert.cpp b/lib/Target/Hexagon/HexagonGenInsert.cpp
index c1841d735b8c..2582a021e956 100644
--- a/lib/Target/Hexagon/HexagonGenInsert.cpp
+++ b/lib/Target/Hexagon/HexagonGenInsert.cpp
@@ -55,6 +55,12 @@ static cl::opt<unsigned> VRegDistCutoff("insert-dist-cutoff", cl::init(30U),
cl::Hidden, cl::ZeroOrMore, cl::desc("Vreg distance cutoff for insert "
"generation."));
+// Limit the container sizes for extreme cases where we run out of memory.
+static cl::opt<unsigned> MaxORLSize("insert-max-orl", cl::init(4096),
+ cl::Hidden, cl::ZeroOrMore, cl::desc("Maximum size of OrderedRegisterList"));
+static cl::opt<unsigned> MaxIFMSize("insert-max-ifmap", cl::init(1024),
+ cl::Hidden, cl::ZeroOrMore, cl::desc("Maximum size of IFMap"));
+
static cl::opt<bool> OptTiming("insert-timing", cl::init(false), cl::Hidden,
cl::ZeroOrMore, cl::desc("Enable timing of insert generation"));
static cl::opt<bool> OptTimingDetail("insert-timing-detail", cl::init(false),
@@ -86,6 +92,7 @@ namespace {
struct RegisterSet : private BitVector {
RegisterSet() = default;
explicit RegisterSet(unsigned s, bool t = false) : BitVector(s, t) {}
+ RegisterSet(const RegisterSet &RS) : BitVector(RS) {}
using BitVector::clear;
@@ -370,9 +377,11 @@ namespace {
class OrderedRegisterList {
using ListType = std::vector<unsigned>;
+ const unsigned MaxSize;
public:
- OrderedRegisterList(const RegisterOrdering &RO) : Ord(RO) {}
+ OrderedRegisterList(const RegisterOrdering &RO)
+ : MaxSize(MaxORLSize), Ord(RO) {}
void insert(unsigned VR);
void remove(unsigned VR);
@@ -433,12 +442,17 @@ void OrderedRegisterList::insert(unsigned VR) {
Seq.push_back(VR);
else
Seq.insert(L, VR);
+
+ unsigned S = Seq.size();
+ if (S > MaxSize)
+ Seq.resize(MaxSize);
+ assert(Seq.size() <= MaxSize);
}
void OrderedRegisterList::remove(unsigned VR) {
iterator L = std::lower_bound(Seq.begin(), Seq.end(), VR, Ord);
- assert(L != Seq.end());
- Seq.erase(L);
+ if (L != Seq.end())
+ Seq.erase(L);
}
namespace {
@@ -618,7 +632,7 @@ void HexagonGenInsert::buildOrderingBT(RegisterOrdering &RB,
SortableVectorType VRs;
for (RegisterOrdering::iterator I = RB.begin(), E = RB.end(); I != E; ++I)
VRs.push_back(I->first);
- std::sort(VRs.begin(), VRs.end(), LexCmp);
+ llvm::sort(VRs.begin(), VRs.end(), LexCmp);
// Transfer the results to the outgoing register ordering.
for (unsigned i = 0, n = VRs.size(); i < n; ++i)
RO.insert(std::make_pair(VRs[i], i));
@@ -950,6 +964,9 @@ void HexagonGenInsert::collectInBlock(MachineBasicBlock *B,
continue;
findRecordInsertForms(VR, AVs);
+ // Stop if the map size is too large.
+ if (IFMap.size() > MaxIFMSize)
+ return;
}
}
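The HexagonGenInsert hunks above bound two containers (OrderedRegisterList via MaxORLSize, IFMap via MaxIFMSize) so pathological inputs cannot exhaust memory: insertions past the cap are trimmed from the tail, and removal tolerates an element that was already dropped. A small sketch of such a capped, ordered list is shown below; it is an illustration of the idea, not the pass's class.

#include <algorithm>
#include <cstddef>
#include <vector>

// Sorted vector with a hard size cap: growth past MaxSize is trimmed from
// the tail, and remove() tolerates values that were already dropped.
class CappedOrderedList {
  std::vector<unsigned> Seq;  // kept sorted ascending
  const std::size_t MaxSize;

public:
  explicit CappedOrderedList(std::size_t Cap) : MaxSize(Cap) {}

  void insert(unsigned V) {
    auto It = std::lower_bound(Seq.begin(), Seq.end(), V);
    Seq.insert(It, V);
    if (Seq.size() > MaxSize)       // drop the largest entries past the cap
      Seq.resize(MaxSize);
  }

  void remove(unsigned V) {
    auto It = std::lower_bound(Seq.begin(), Seq.end(), V);
    if (It != Seq.end() && *It == V)  // may have been trimmed by the cap
      Seq.erase(It);
  }

  std::size_t size() const { return Seq.size(); }
};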
diff --git a/lib/Target/Hexagon/HexagonGenMux.cpp b/lib/Target/Hexagon/HexagonGenMux.cpp
index 5a001d6ed9c1..e5af96468af1 100644
--- a/lib/Target/Hexagon/HexagonGenMux.cpp
+++ b/lib/Target/Hexagon/HexagonGenMux.cpp
@@ -40,6 +40,7 @@
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/MathExtras.h"
#include <algorithm>
#include <cassert>
@@ -56,6 +57,11 @@ namespace llvm {
} // end namespace llvm
+// Initialize this to 0 to always prefer generating mux by default.
+static cl::opt<unsigned> MinPredDist("hexagon-gen-mux-threshold", cl::Hidden,
+ cl::init(0), cl::desc("Minimum distance between predicate definition and "
+ "farther of the two predicated uses"));
+
namespace {
class HexagonGenMux : public MachineFunctionPass {
@@ -269,11 +275,13 @@ bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) {
// There is now a complete definition of DR, i.e. we have the predicate
// register, the definition if-true, and definition if-false.
- // First, check if both definitions are far enough from the definition
+ // First, check if the definitions are far enough from the definition
// of the predicate register.
unsigned MinX = std::min(CI.TrueX, CI.FalseX);
unsigned MaxX = std::max(CI.TrueX, CI.FalseX);
- unsigned SearchX = (MaxX > 4) ? MaxX-4 : 0;
+ // Specifically, check if the predicate definition is within a prescribed
+ // distance from the farther of the two predicated instructions.
+ unsigned SearchX = (MaxX >= MinPredDist) ? MaxX-MinPredDist : 0;
bool NearDef = false;
for (unsigned X = SearchX; X < MaxX; ++X) {
const DefUseInfo &DU = DUM.lookup(X);
@@ -348,7 +356,7 @@ bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) {
return false;
};
for (auto I = B.rbegin(), E = B.rend(); I != E; ++I) {
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
// This isn't 100% accurate, but it's safe.
// It won't detect (as a kill) a case like this
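The HexagonGenMux hunk above replaces the hard-coded 4-instruction window with the MinPredDist option: a mux is only rejected when the predicate register is defined within that many instructions before the farther of the two predicated transfers (the default of 0 disables the check entirely). A sketch of the window computation on plain instruction indices follows; the names are illustrative.

#include <algorithm>

// Given the indices of the two predicated transfers and the minimum allowed
// distance, return the first index to scan for a predicate definition that
// would sit "too close" to the later transfer.
unsigned firstIndexToScan(unsigned TrueIdx, unsigned FalseIdx,
                          unsigned MinPredDist) {
  unsigned MaxX = std::max(TrueIdx, FalseIdx);
  // Clamp at 0 so a large MinPredDist scans the whole block prefix;
  // MinPredDist == 0 yields an empty scan range, i.e. always allow the mux.
  return (MaxX >= MinPredDist) ? MaxX - MinPredDist : 0;
}
// The pass then checks indices [firstIndexToScan(...), MaxX) for a definition
// of the predicate register; finding one suppresses mux generation there.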
diff --git a/lib/Target/Hexagon/HexagonGenPredicate.cpp b/lib/Target/Hexagon/HexagonGenPredicate.cpp
index 9288ed03d4d2..c0d2de90467a 100644
--- a/lib/Target/Hexagon/HexagonGenPredicate.cpp
+++ b/lib/Target/Hexagon/HexagonGenPredicate.cpp
@@ -222,13 +222,12 @@ void HexagonGenPredicate::collectPredicateGPR(MachineFunction &MF) {
}
void HexagonGenPredicate::processPredicateGPR(const Register &Reg) {
- DEBUG(dbgs() << __func__ << ": "
- << printReg(Reg.R, TRI, Reg.S) << "\n");
+ LLVM_DEBUG(dbgs() << __func__ << ": " << printReg(Reg.R, TRI, Reg.S) << "\n");
using use_iterator = MachineRegisterInfo::use_iterator;
use_iterator I = MRI->use_begin(Reg.R), E = MRI->use_end();
if (I == E) {
- DEBUG(dbgs() << "Dead reg: " << printReg(Reg.R, TRI, Reg.S) << '\n');
+ LLVM_DEBUG(dbgs() << "Dead reg: " << printReg(Reg.R, TRI, Reg.S) << '\n');
MachineInstr *DefI = MRI->getVRegDef(Reg.R);
DefI->eraseFromParent();
return;
@@ -250,7 +249,7 @@ Register HexagonGenPredicate::getPredRegFor(const Register &Reg) {
if (F != G2P.end())
return F->second;
- DEBUG(dbgs() << __func__ << ": " << PrintRegister(Reg, *TRI));
+ LLVM_DEBUG(dbgs() << __func__ << ": " << PrintRegister(Reg, *TRI));
MachineInstr *DefI = MRI->getVRegDef(Reg.R);
assert(DefI);
unsigned Opc = DefI->getOpcode();
@@ -258,7 +257,7 @@ Register HexagonGenPredicate::getPredRegFor(const Register &Reg) {
assert(DefI->getOperand(0).isDef() && DefI->getOperand(1).isUse());
Register PR = DefI->getOperand(1);
G2P.insert(std::make_pair(Reg, PR));
- DEBUG(dbgs() << " -> " << PrintRegister(PR, *TRI) << '\n');
+ LLVM_DEBUG(dbgs() << " -> " << PrintRegister(PR, *TRI) << '\n');
return PR;
}
@@ -274,7 +273,8 @@ Register HexagonGenPredicate::getPredRegFor(const Register &Reg) {
BuildMI(B, std::next(DefIt), DL, TII->get(TargetOpcode::COPY), NewPR)
.addReg(Reg.R, 0, Reg.S);
G2P.insert(std::make_pair(Reg, Register(NewPR)));
- DEBUG(dbgs() << " -> !" << PrintRegister(Register(NewPR), *TRI) << '\n');
+ LLVM_DEBUG(dbgs() << " -> !" << PrintRegister(Register(NewPR), *TRI)
+ << '\n');
return Register(NewPR);
}
@@ -364,7 +364,7 @@ bool HexagonGenPredicate::isScalarPred(Register PredReg) {
}
bool HexagonGenPredicate::convertToPredForm(MachineInstr *MI) {
- DEBUG(dbgs() << __func__ << ": " << MI << " " << *MI);
+ LLVM_DEBUG(dbgs() << __func__ << ": " << MI << " " << *MI);
unsigned Opc = MI->getOpcode();
assert(isConvertibleToPredForm(MI));
@@ -426,7 +426,7 @@ bool HexagonGenPredicate::convertToPredForm(MachineInstr *MI) {
Register Pred = getPredRegFor(GPR);
MIB.addReg(Pred.R, 0, Pred.S);
}
- DEBUG(dbgs() << "generated: " << *MIB);
+ LLVM_DEBUG(dbgs() << "generated: " << *MIB);
// Generate a copy-out: NewGPR = NewPR, and replace all uses of OutR
// with NewGPR.
@@ -449,7 +449,7 @@ bool HexagonGenPredicate::convertToPredForm(MachineInstr *MI) {
}
bool HexagonGenPredicate::eliminatePredCopies(MachineFunction &MF) {
- DEBUG(dbgs() << __func__ << "\n");
+ LLVM_DEBUG(dbgs() << __func__ << "\n");
const TargetRegisterClass *PredRC = &Hexagon::PredRegsRegClass;
bool Changed = false;
VectOfInst Erase;
diff --git a/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/lib/Target/Hexagon/HexagonHardwareLoops.cpp
index 715fd52f3acd..0e33976a58ac 100644
--- a/lib/Target/Hexagon/HexagonHardwareLoops.cpp
+++ b/lib/Target/Hexagon/HexagonHardwareLoops.cpp
@@ -168,7 +168,7 @@ namespace {
}
};
- /// \brief Find the register that contains the loop controlling
+ /// Find the register that contains the loop controlling
/// induction variable.
/// If successful, it will return true and set the \p Reg, \p IVBump
/// and \p IVOp arguments. Otherwise it will return false.
@@ -183,19 +183,19 @@ namespace {
bool findInductionRegister(MachineLoop *L, unsigned &Reg,
int64_t &IVBump, MachineInstr *&IVOp) const;
- /// \brief Return the comparison kind for the specified opcode.
+ /// Return the comparison kind for the specified opcode.
Comparison::Kind getComparisonKind(unsigned CondOpc,
MachineOperand *InitialValue,
const MachineOperand *Endvalue,
int64_t IVBump) const;
- /// \brief Analyze the statements in a loop to determine if the loop
+ /// Analyze the statements in a loop to determine if the loop
/// has a computable trip count and, if so, return a value that represents
/// the trip count expression.
CountValue *getLoopTripCount(MachineLoop *L,
SmallVectorImpl<MachineInstr *> &OldInsts);
- /// \brief Return the expression that represents the number of times
+ /// Return the expression that represents the number of times
/// a loop iterates. The function takes the operands that represent the
/// loop start value, loop end value, and induction value. Based upon
/// these operands, the function attempts to compute the trip count.
@@ -206,64 +206,64 @@ namespace {
const MachineOperand *End, unsigned IVReg,
int64_t IVBump, Comparison::Kind Cmp) const;
- /// \brief Return true if the instruction is not valid within a hardware
+ /// Return true if the instruction is not valid within a hardware
/// loop.
bool isInvalidLoopOperation(const MachineInstr *MI,
bool IsInnerHWLoop) const;
- /// \brief Return true if the loop contains an instruction that inhibits
+ /// Return true if the loop contains an instruction that inhibits
/// using the hardware loop.
bool containsInvalidInstruction(MachineLoop *L, bool IsInnerHWLoop) const;
- /// \brief Given a loop, check if we can convert it to a hardware loop.
+ /// Given a loop, check if we can convert it to a hardware loop.
/// If so, then perform the conversion and return true.
bool convertToHardwareLoop(MachineLoop *L, bool &L0used, bool &L1used);
- /// \brief Return true if the instruction is now dead.
+ /// Return true if the instruction is now dead.
bool isDead(const MachineInstr *MI,
SmallVectorImpl<MachineInstr *> &DeadPhis) const;
- /// \brief Remove the instruction if it is now dead.
+ /// Remove the instruction if it is now dead.
void removeIfDead(MachineInstr *MI);
- /// \brief Make sure that the "bump" instruction executes before the
+ /// Make sure that the "bump" instruction executes before the
/// compare. We need that for the IV fixup, so that the compare
/// instruction would not use a bumped value that has not yet been
/// defined. If the instructions are out of order, try to reorder them.
bool orderBumpCompare(MachineInstr *BumpI, MachineInstr *CmpI);
- /// \brief Return true if MO and MI pair is visited only once. If visited
+ /// Return true if MO and MI pair is visited only once. If visited
/// more than once, this indicates there is recursion. In such a case,
/// return false.
bool isLoopFeeder(MachineLoop *L, MachineBasicBlock *A, MachineInstr *MI,
const MachineOperand *MO,
LoopFeederMap &LoopFeederPhi) const;
- /// \brief Return true if the Phi may generate a value that may underflow,
+ /// Return true if the Phi may generate a value that may underflow,
/// or may wrap.
bool phiMayWrapOrUnderflow(MachineInstr *Phi, const MachineOperand *EndVal,
MachineBasicBlock *MBB, MachineLoop *L,
LoopFeederMap &LoopFeederPhi) const;
- /// \brief Return true if the induction variable may underflow an unsigned
+ /// Return true if the induction variable may underflow an unsigned
/// value in the first iteration.
bool loopCountMayWrapOrUnderFlow(const MachineOperand *InitVal,
const MachineOperand *EndVal,
MachineBasicBlock *MBB, MachineLoop *L,
LoopFeederMap &LoopFeederPhi) const;
- /// \brief Check if the given operand has a compile-time known constant
+ /// Check if the given operand has a compile-time known constant
/// value. Return true if yes, and false otherwise. When returning true, set
/// Val to the corresponding constant value.
bool checkForImmediate(const MachineOperand &MO, int64_t &Val) const;
- /// \brief Check if the operand has a compile-time known constant value.
+ /// Check if the operand has a compile-time known constant value.
bool isImmediate(const MachineOperand &MO) const {
int64_t V;
return checkForImmediate(MO, V);
}
- /// \brief Return the immediate for the specified operand.
+ /// Return the immediate for the specified operand.
int64_t getImmediate(const MachineOperand &MO) const {
int64_t V;
if (!checkForImmediate(MO, V))
@@ -271,12 +271,12 @@ namespace {
return V;
}
- /// \brief Reset the given machine operand to now refer to a new immediate
+ /// Reset the given machine operand to now refer to a new immediate
/// value. Assumes that the operand was already referencing an immediate
/// value, either directly, or via a register.
void setImmediate(MachineOperand &MO, int64_t Val);
- /// \brief Fix the data flow of the induction variable.
+ /// Fix the data flow of the induction variable.
/// The desired flow is: phi ---> bump -+-> comparison-in-latch.
/// |
/// +-> back to phi
@@ -297,7 +297,7 @@ namespace {
/// cannot be adjusted to reflect the post-bump value.
bool fixupInductionVariable(MachineLoop *L);
- /// \brief Given a loop, if it does not have a preheader, create one.
+ /// Given a loop, if it does not have a preheader, create one.
/// Return the block that is the preheader.
MachineBasicBlock *createPreheaderForLoop(MachineLoop *L);
};
@@ -307,7 +307,7 @@ namespace {
int HexagonHardwareLoops::Counter = 0;
#endif
- /// \brief Abstraction for a trip count of a loop. A smaller version
+ /// Abstraction for a trip count of a loop. A smaller version
/// of the MachineOperand class without the concerns of changing the
/// operand representation.
class CountValue {
@@ -376,7 +376,7 @@ FunctionPass *llvm::createHexagonHardwareLoops() {
}
bool HexagonHardwareLoops::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "********* Hexagon Hardware Loops *********\n");
+ LLVM_DEBUG(dbgs() << "********* Hexagon Hardware Loops *********\n");
if (skipFunction(MF.getFunction()))
return false;
@@ -556,7 +556,7 @@ HexagonHardwareLoops::getComparisonKind(unsigned CondOpc,
return Cmp;
}
-/// \brief Analyze the statements in a loop to determine if the loop has
+/// Analyze the statements in a loop to determine if the loop has
/// a computable trip count and, if so, return a value that represents
/// the trip count expression.
///
@@ -718,7 +718,7 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L,
return computeCount(L, InitialValue, EndValue, IVReg, IVBump, Cmp);
}
-/// \brief Helper function that returns the expression that represents the
+/// Helper function that returns the expression that represents the
/// number of times a loop iterates. The function takes the operands that
/// represent the loop start value, loop end value, and induction value.
/// Based upon these operands, the function attempts to compute the trip count.
@@ -928,6 +928,7 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop,
// 'Add' instruction.
const MachineInstr *EndValInstr = MRI->getVRegDef(End->getReg());
if (EndValInstr->getOpcode() == Hexagon::A2_addi &&
+ EndValInstr->getOperand(1).getSubReg() == 0 &&
EndValInstr->getOperand(2).getImm() == StartV) {
DistR = EndValInstr->getOperand(1).getReg();
} else {
@@ -984,7 +985,7 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop,
return new CountValue(CountValue::CV_Register, CountR, CountSR);
}
-/// \brief Return true if the operation is invalid within hardware loop.
+/// Return true if the operation is invalid within hardware loop.
bool HexagonHardwareLoops::isInvalidLoopOperation(const MachineInstr *MI,
bool IsInnerHWLoop) const {
// Call is not allowed because the callee may use a hardware loop except for
@@ -1006,19 +1007,20 @@ bool HexagonHardwareLoops::isInvalidLoopOperation(const MachineInstr *MI,
return false;
}
-/// \brief Return true if the loop contains an instruction that inhibits
+/// Return true if the loop contains an instruction that inhibits
/// the use of the hardware loop instruction.
bool HexagonHardwareLoops::containsInvalidInstruction(MachineLoop *L,
bool IsInnerHWLoop) const {
const std::vector<MachineBasicBlock *> &Blocks = L->getBlocks();
- DEBUG(dbgs() << "\nhw_loop head, " << printMBBReference(*Blocks[0]));
+ LLVM_DEBUG(dbgs() << "\nhw_loop head, " << printMBBReference(*Blocks[0]));
for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
MachineBasicBlock *MBB = Blocks[i];
for (MachineBasicBlock::iterator
MII = MBB->begin(), E = MBB->end(); MII != E; ++MII) {
const MachineInstr *MI = &*MII;
if (isInvalidLoopOperation(MI, IsInnerHWLoop)) {
- DEBUG(dbgs()<< "\nCannot convert to hw_loop due to:"; MI->dump(););
+ LLVM_DEBUG(dbgs() << "\nCannot convert to hw_loop due to:";
+ MI->dump(););
return true;
}
}
@@ -1026,7 +1028,7 @@ bool HexagonHardwareLoops::containsInvalidInstruction(MachineLoop *L,
return false;
}
-/// \brief Returns true if the instruction is dead. This was essentially
+/// Returns true if the instruction is dead. This was essentially
/// copied from DeadMachineInstructionElim::isDead, but with special cases
/// for inline asm, physical registers and instructions with side effects
/// removed.
@@ -1083,7 +1085,7 @@ void HexagonHardwareLoops::removeIfDead(MachineInstr *MI) {
SmallVector<MachineInstr*, 1> DeadPhis;
if (isDead(MI, DeadPhis)) {
- DEBUG(dbgs() << "HW looping will remove: " << *MI);
+ LLVM_DEBUG(dbgs() << "HW looping will remove: " << *MI);
// It is possible that some DBG_VALUE instructions refer to this
// instruction. Examine each def operand for such references;
@@ -1112,7 +1114,7 @@ void HexagonHardwareLoops::removeIfDead(MachineInstr *MI) {
}
}
-/// \brief Check if the loop is a candidate for converting to a hardware
+/// Check if the loop is a candidate for converting to a hardware
/// loop. If so, then perform the transformation.
///
/// This function works on innermost loops first. A loop can be converted
@@ -1237,7 +1239,7 @@ bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L,
LoopStart = TopBlock;
// Convert the loop to a hardware loop.
- DEBUG(dbgs() << "Change to hardware loop at "; L->dump());
+ LLVM_DEBUG(dbgs() << "Change to hardware loop at "; L->dump());
DebugLoc DL;
if (InsertPos != Preheader->end())
DL = InsertPos->getDebugLoc();
@@ -1367,7 +1369,7 @@ bool HexagonHardwareLoops::isLoopFeeder(MachineLoop *L, MachineBasicBlock *A,
LoopFeederMap &LoopFeederPhi) const {
if (LoopFeederPhi.find(MO->getReg()) == LoopFeederPhi.end()) {
const std::vector<MachineBasicBlock *> &Blocks = L->getBlocks();
- DEBUG(dbgs() << "\nhw_loop head, " << printMBBReference(*Blocks[0]));
+ LLVM_DEBUG(dbgs() << "\nhw_loop head, " << printMBBReference(*Blocks[0]));
// Ignore all BBs that form Loop.
for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
MachineBasicBlock *MBB = Blocks[i];
@@ -1768,16 +1770,16 @@ bool HexagonHardwareLoops::fixupInductionVariable(MachineLoop *L) {
for (unsigned i = 1, n = PredDef->getNumOperands(); i < n; ++i) {
MachineOperand &MO = PredDef->getOperand(i);
if (MO.isReg() && MO.getReg() == RB.first) {
- DEBUG(dbgs() << "\n DefMI(" << i << ") = "
- << *(MRI->getVRegDef(I->first)));
+ LLVM_DEBUG(dbgs() << "\n DefMI(" << i
+ << ") = " << *(MRI->getVRegDef(I->first)));
if (IndI)
return false;
IndI = MRI->getVRegDef(I->first);
IndMO = &MO;
} else if (MO.isReg()) {
- DEBUG(dbgs() << "\n DefMI(" << i << ") = "
- << *(MRI->getVRegDef(MO.getReg())));
+ LLVM_DEBUG(dbgs() << "\n DefMI(" << i
+ << ") = " << *(MRI->getVRegDef(MO.getReg())));
if (nonIndI)
return false;
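Aside from the comment and debug-macro churn, the one functional change in this file is the new getSubReg() == 0 guard in computeCount. A rough sketch of the invariant it enforces; takeWholeRegister is an illustrative helper and not part of the patch, while MachineOperand is the real CodeGen class:

#include "llvm/CodeGen/MachineOperand.h"

// Reusing an operand's register wholesale is only safe when the operand
// carries no subregister index; otherwise it names just a part of the
// value and the computed distance register would be wrong.
static bool takeWholeRegister(const llvm::MachineOperand &MO, unsigned &Reg) {
  if (!MO.isReg() || MO.getSubReg() != 0)
    return false;
  Reg = MO.getReg();
  return true;
}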
diff --git a/lib/Target/Hexagon/HexagonHazardRecognizer.cpp b/lib/Target/Hexagon/HexagonHazardRecognizer.cpp
index 036b18678709..44f1f554c662 100644
--- a/lib/Target/Hexagon/HexagonHazardRecognizer.cpp
+++ b/lib/Target/Hexagon/HexagonHazardRecognizer.cpp
@@ -26,11 +26,13 @@ using namespace llvm;
#define DEBUG_TYPE "post-RA-sched"
void HexagonHazardRecognizer::Reset() {
- DEBUG(dbgs() << "Reset hazard recognizer\n");
+ LLVM_DEBUG(dbgs() << "Reset hazard recognizer\n");
Resources->clearResources();
PacketNum = 0;
UsesDotCur = nullptr;
DotCurPNum = -1;
+ UsesLoad = false;
+ PrefVectorStoreNew = nullptr;
RegDefs.clear();
}
@@ -41,7 +43,7 @@ HexagonHazardRecognizer::getHazardType(SUnit *SU, int stalls) {
return NoHazard;
if (!Resources->canReserveResources(*MI)) {
- DEBUG(dbgs() << "*** Hazard in cycle " << PacketNum << ", " << *MI);
+ LLVM_DEBUG(dbgs() << "*** Hazard in cycle " << PacketNum << ", " << *MI);
HazardType RetVal = Hazard;
if (TII->mayBeNewStore(*MI)) {
// Make sure the register to be stored is defined by an instruction in the
@@ -57,14 +59,16 @@ HexagonHazardRecognizer::getHazardType(SUnit *SU, int stalls) {
MI->getDebugLoc());
if (Resources->canReserveResources(*NewMI))
RetVal = NoHazard;
- DEBUG(dbgs() << "*** Try .new version? " << (RetVal == NoHazard) << "\n");
+ LLVM_DEBUG(dbgs() << "*** Try .new version? " << (RetVal == NoHazard)
+ << "\n");
MF->DeleteMachineInstr(NewMI);
}
return RetVal;
}
if (SU == UsesDotCur && DotCurPNum != (int)PacketNum) {
- DEBUG(dbgs() << "*** .cur Hazard in cycle " << PacketNum << ", " << *MI);
+ LLVM_DEBUG(dbgs() << "*** .cur Hazard in cycle " << PacketNum << ", "
+ << *MI);
return Hazard;
}
@@ -72,21 +76,33 @@ HexagonHazardRecognizer::getHazardType(SUnit *SU, int stalls) {
}
void HexagonHazardRecognizer::AdvanceCycle() {
- DEBUG(dbgs() << "Advance cycle, clear state\n");
+ LLVM_DEBUG(dbgs() << "Advance cycle, clear state\n");
Resources->clearResources();
if (DotCurPNum != -1 && DotCurPNum != (int)PacketNum) {
UsesDotCur = nullptr;
DotCurPNum = -1;
}
+ UsesLoad = false;
+ PrefVectorStoreNew = nullptr;
PacketNum++;
RegDefs.clear();
}
-/// If a packet contains a dot cur instruction, then we may prefer the
-/// instruction that can use the dot cur result. Or, if the use
-/// isn't scheduled in the same packet, then prefer other instructions
-/// in the subsequent packet.
+/// Handle the cases when we prefer one instruction over another. Case 1 - we
+/// prefer not to generate multiple loads in the packet to avoid a potential
+/// bank conflict. Case 2 - if a packet contains a dot cur instruction, then we
+/// prefer the instruction that can use the dot cur result. However, if the use
+/// is not scheduled in the same packet, then prefer other instructions in the
+/// subsequent packet. Case 3 - we prefer a vector store that can be converted
+/// to a .new store. The packetizer will not generate the .new store if the
+/// store doesn't have resources to fit in the packet (but the .new store may
+/// have resources). We attempt to schedule the store as soon as possible to
+/// help packetize the two instructions together.
bool HexagonHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
+ if (PrefVectorStoreNew != nullptr && PrefVectorStoreNew != SU)
+ return true;
+ if (UsesLoad && SU->isInstr() && SU->getInstr()->mayLoad())
+ return true;
return UsesDotCur && ((SU == UsesDotCur) ^ (DotCurPNum == (int)PacketNum));
}
@@ -118,17 +134,16 @@ void HexagonHazardRecognizer::EmitInstruction(SUnit *SU) {
}
else
Resources->reserveResources(*MI);
- DEBUG(dbgs() << " Add instruction " << *MI);
+ LLVM_DEBUG(dbgs() << " Add instruction " << *MI);
// When scheduling a dot cur instruction, check if there is an instruction
// that can use the dot cur in the same packet. If so, we'll attempt to
- // schedule it before other instructions. We only do this if the use has
- // the same height as the dot cur. Otherwise, we may miss scheduling an
- // instruction with a greater height, which is more important.
+ // schedule it before other instructions. We only do this if the load has a
+ // single zero-latency use.
if (TII->mayBeCurLoad(*MI))
for (auto &S : SU->Succs)
if (S.isAssignedRegDep() && S.getLatency() == 0 &&
- SU->getHeight() == S.getSUnit()->getHeight()) {
+ S.getSUnit()->NumPredsLeft == 1) {
UsesDotCur = S.getSUnit();
DotCurPNum = PacketNum;
break;
@@ -137,4 +152,15 @@ void HexagonHazardRecognizer::EmitInstruction(SUnit *SU) {
UsesDotCur = nullptr;
DotCurPNum = -1;
}
+
+ UsesLoad = MI->mayLoad();
+
+ if (TII->isHVXVec(*MI) && !MI->mayLoad() && !MI->mayStore())
+ for (auto &S : SU->Succs)
+ if (S.isAssignedRegDep() && S.getLatency() == 0 &&
+ TII->mayBeNewStore(*S.getSUnit()->getInstr()) &&
+ Resources->canReserveResources(*S.getSUnit()->getInstr())) {
+ PrefVectorStoreNew = S.getSUnit();
+ break;
+ }
}
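The reworked ShouldPreferAnother above folds three scheduling preferences into one hook. A restated sketch of its contract with illustrative parameter names (prefersAnother, PrefStore, PacketHasLoad are not the real identifiers); SUnit and MachineInstr are the actual scheduler types, and returning true only deprioritises the candidate rather than forbidding it:

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/ScheduleDAG.h"

static bool prefersAnother(const llvm::SUnit *SU, const llvm::SUnit *PrefStore,
                           bool PacketHasLoad) {
  // Case 3: hold other candidates back until the preferred .new store is picked.
  if (PrefStore && PrefStore != SU)
    return true;
  // Case 1: avoid packing a second load into the packet (bank-conflict risk).
  if (PacketHasLoad && SU->isInstr() && SU->getInstr()->mayLoad())
    return true;
  // Case 2 (the dot cur preference) is omitted here; see the hunk above.
  return false;
}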
diff --git a/lib/Target/Hexagon/HexagonHazardRecognizer.h b/lib/Target/Hexagon/HexagonHazardRecognizer.h
index 70efcb7a9f76..2874d73ce819 100644
--- a/lib/Target/Hexagon/HexagonHazardRecognizer.h
+++ b/lib/Target/Hexagon/HexagonHazardRecognizer.h
@@ -23,13 +23,21 @@ namespace llvm {
class HexagonHazardRecognizer : public ScheduleHazardRecognizer {
DFAPacketizer *Resources;
const HexagonInstrInfo *TII;
- unsigned PacketNum;
+ unsigned PacketNum = 0;
// If the packet contains a potential dot cur instruction. This is
// used for the scheduling priority function.
- SUnit *UsesDotCur;
+ SUnit *UsesDotCur = nullptr;
// The packet number when a dot cur is emitted. If its use is not generated
// in the same packet, then try to wait another cycle before emitting.
- int DotCurPNum;
+ int DotCurPNum = -1;
+ // Whether the packet contains a load. Used to restrict another load, if possible.
+ bool UsesLoad = false;
+ // Check if we should prefer a vector store that will become a .new version.
+ // The .new store uses different resources than a normal store, and the
+ // packetizer will not generate the .new if the regular store does not have
+ // resources available (even if the .new version does). To help, the scheduler
+ // attempts to schedule the .new as soon as possible in the packet.
+ SUnit *PrefVectorStoreNew = nullptr;
// The set of registers defined by instructions in the current packet.
SmallSet<unsigned, 8> RegDefs;
@@ -37,8 +45,7 @@ public:
HexagonHazardRecognizer(const InstrItineraryData *II,
const HexagonInstrInfo *HII,
const HexagonSubtarget &ST)
- : Resources(ST.createDFAPacketizer(II)), TII(HII), PacketNum(0),
- UsesDotCur(nullptr), DotCurPNum(-1) { }
+ : Resources(ST.createDFAPacketizer(II)), TII(HII) { }
~HexagonHazardRecognizer() override {
if (Resources)
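The header change above swaps constructor mem-initializers for default member initializers. A tiny sketch of the idiom under an assumed type (Recognizer and its members are illustrative, not the real class); the constructor now only mentions the members whose values depend on its arguments:

struct Recognizer {
  unsigned PacketNum = 0;    // defaults live next to the declarations
  int DotCurPNum = -1;
  void *Resources = nullptr; // set from a constructor argument below
  explicit Recognizer(void *R) : Resources(R) {}
};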
diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index a6ac4e3df745..efb4c2eb0fc3 100644
--- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -64,51 +64,6 @@ FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM,
}
}
-// Intrinsics that return a a predicate.
-static bool doesIntrinsicReturnPredicate(unsigned ID) {
- switch (ID) {
- default:
- return false;
- case Intrinsic::hexagon_C2_cmpeq:
- case Intrinsic::hexagon_C2_cmpgt:
- case Intrinsic::hexagon_C2_cmpgtu:
- case Intrinsic::hexagon_C2_cmpgtup:
- case Intrinsic::hexagon_C2_cmpgtp:
- case Intrinsic::hexagon_C2_cmpeqp:
- case Intrinsic::hexagon_C2_bitsset:
- case Intrinsic::hexagon_C2_bitsclr:
- case Intrinsic::hexagon_C2_cmpeqi:
- case Intrinsic::hexagon_C2_cmpgti:
- case Intrinsic::hexagon_C2_cmpgtui:
- case Intrinsic::hexagon_C2_cmpgei:
- case Intrinsic::hexagon_C2_cmpgeui:
- case Intrinsic::hexagon_C2_cmplt:
- case Intrinsic::hexagon_C2_cmpltu:
- case Intrinsic::hexagon_C2_bitsclri:
- case Intrinsic::hexagon_C2_and:
- case Intrinsic::hexagon_C2_or:
- case Intrinsic::hexagon_C2_xor:
- case Intrinsic::hexagon_C2_andn:
- case Intrinsic::hexagon_C2_not:
- case Intrinsic::hexagon_C2_orn:
- case Intrinsic::hexagon_C2_pxfer_map:
- case Intrinsic::hexagon_C2_any8:
- case Intrinsic::hexagon_C2_all8:
- case Intrinsic::hexagon_A2_vcmpbeq:
- case Intrinsic::hexagon_A2_vcmpbgtu:
- case Intrinsic::hexagon_A2_vcmpheq:
- case Intrinsic::hexagon_A2_vcmphgt:
- case Intrinsic::hexagon_A2_vcmphgtu:
- case Intrinsic::hexagon_A2_vcmpweq:
- case Intrinsic::hexagon_A2_vcmpwgt:
- case Intrinsic::hexagon_A2_vcmpwgtu:
- case Intrinsic::hexagon_C2_tfrrp:
- case Intrinsic::hexagon_S2_tstbit_i:
- case Intrinsic::hexagon_S2_tstbit_r:
- return true;
- }
-}
-
void HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, const SDLoc &dl) {
SDValue Chain = LD->getChain();
SDValue Base = LD->getBasePtr();
@@ -138,12 +93,18 @@ void HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, const SDLoc &dl) {
Opcode = IsValidInc ? Hexagon::L2_loadrh_pi : Hexagon::L2_loadrh_io;
break;
case MVT::i32:
+ case MVT::f32:
+ case MVT::v2i16:
+ case MVT::v4i8:
Opcode = IsValidInc ? Hexagon::L2_loadri_pi : Hexagon::L2_loadri_io;
break;
case MVT::i64:
+ case MVT::f64:
+ case MVT::v2i32:
+ case MVT::v4i16:
+ case MVT::v8i8:
Opcode = IsValidInc ? Hexagon::L2_loadrd_pi : Hexagon::L2_loadrd_io;
break;
- // 64B
case MVT::v64i8:
case MVT::v32i16:
case MVT::v16i32:
@@ -223,7 +184,6 @@ void HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, const SDLoc &dl) {
CurDAG->RemoveDeadNode(LD);
}
-
MachineSDNode *HexagonDAGToDAGISel::LoadInstrForLoadIntrinsic(SDNode *IntN) {
if (IntN->getOpcode() != ISD::INTRINSIC_W_CHAIN)
return nullptr;
@@ -241,35 +201,14 @@ MachineSDNode *HexagonDAGToDAGISel::LoadInstrForLoadIntrinsic(SDNode *IntN) {
};
auto FLC = LoadPciMap.find(IntNo);
if (FLC != LoadPciMap.end()) {
- SDNode *Mod = CurDAG->getMachineNode(Hexagon::A2_tfrrcr, dl, MVT::i32,
- IntN->getOperand(4));
EVT ValTy = (IntNo == Intrinsic::hexagon_circ_ldd) ? MVT::i64 : MVT::i32;
EVT RTys[] = { ValTy, MVT::i32, MVT::Other };
// Operands: { Base, Increment, Modifier, Chain }
auto Inc = cast<ConstantSDNode>(IntN->getOperand(5));
SDValue I = CurDAG->getTargetConstant(Inc->getSExtValue(), dl, MVT::i32);
MachineSDNode *Res = CurDAG->getMachineNode(FLC->second, dl, RTys,
- { IntN->getOperand(2), I, SDValue(Mod,0), IntN->getOperand(0) });
- return Res;
- }
-
- static std::map<unsigned,unsigned> LoadPbrMap = {
- { Intrinsic::hexagon_brev_ldb, Hexagon::L2_loadrb_pbr },
- { Intrinsic::hexagon_brev_ldub, Hexagon::L2_loadrub_pbr },
- { Intrinsic::hexagon_brev_ldh, Hexagon::L2_loadrh_pbr },
- { Intrinsic::hexagon_brev_lduh, Hexagon::L2_loadruh_pbr },
- { Intrinsic::hexagon_brev_ldw, Hexagon::L2_loadri_pbr },
- { Intrinsic::hexagon_brev_ldd, Hexagon::L2_loadrd_pbr },
- };
- auto FLB = LoadPbrMap.find(IntNo);
- if (FLB != LoadPbrMap.end()) {
- SDNode *Mod = CurDAG->getMachineNode(Hexagon::A2_tfrrcr, dl, MVT::i32,
- IntN->getOperand(4));
- EVT ValTy = (IntNo == Intrinsic::hexagon_brev_ldd) ? MVT::i64 : MVT::i32;
- EVT RTys[] = { ValTy, MVT::i32, MVT::Other };
- // Operands: { Base, Modifier, Chain }
- MachineSDNode *Res = CurDAG->getMachineNode(FLB->second, dl, RTys,
- { IntN->getOperand(2), SDValue(Mod,0), IntN->getOperand(0) });
+ { IntN->getOperand(2), I, IntN->getOperand(4),
+ IntN->getOperand(0) });
return Res;
}
@@ -343,14 +282,10 @@ bool HexagonDAGToDAGISel::tryLoadOfLoadIntrinsic(LoadSDNode *N) {
// a sign-extending intrinsic into (or the other way around).
ISD::LoadExtType IntExt;
switch (cast<ConstantSDNode>(C->getOperand(1))->getZExtValue()) {
- case Intrinsic::hexagon_brev_ldub:
- case Intrinsic::hexagon_brev_lduh:
case Intrinsic::hexagon_circ_ldub:
case Intrinsic::hexagon_circ_lduh:
IntExt = ISD::ZEXTLOAD;
break;
- case Intrinsic::hexagon_brev_ldw:
- case Intrinsic::hexagon_brev_ldd:
case Intrinsic::hexagon_circ_ldw:
case Intrinsic::hexagon_circ_ldd:
IntExt = ISD::NON_EXTLOAD;
@@ -378,6 +313,134 @@ bool HexagonDAGToDAGISel::tryLoadOfLoadIntrinsic(LoadSDNode *N) {
CurDAG->RemoveDeadNode(C);
return true;
}
+ return false;
+}
+
+// Convert the bit-reverse load intrinsic to the appropriate target instruction.
+bool HexagonDAGToDAGISel::SelectBrevLdIntrinsic(SDNode *IntN) {
+ if (IntN->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+ return false;
+
+ const SDLoc &dl(IntN);
+ unsigned IntNo = cast<ConstantSDNode>(IntN->getOperand(1))->getZExtValue();
+
+ static const std::map<unsigned, unsigned> LoadBrevMap = {
+ { Intrinsic::hexagon_L2_loadrb_pbr, Hexagon::L2_loadrb_pbr },
+ { Intrinsic::hexagon_L2_loadrub_pbr, Hexagon::L2_loadrub_pbr },
+ { Intrinsic::hexagon_L2_loadrh_pbr, Hexagon::L2_loadrh_pbr },
+ { Intrinsic::hexagon_L2_loadruh_pbr, Hexagon::L2_loadruh_pbr },
+ { Intrinsic::hexagon_L2_loadri_pbr, Hexagon::L2_loadri_pbr },
+ { Intrinsic::hexagon_L2_loadrd_pbr, Hexagon::L2_loadrd_pbr }
+ };
+ auto FLI = LoadBrevMap.find(IntNo);
+ if (FLI != LoadBrevMap.end()) {
+ EVT ValTy =
+ (IntNo == Intrinsic::hexagon_L2_loadrd_pbr) ? MVT::i64 : MVT::i32;
+ EVT RTys[] = { ValTy, MVT::i32, MVT::Other };
+ // Operands of Intrinsic: {chain, enum ID of intrinsic, baseptr,
+ // modifier}.
+ // Operands of target instruction: { Base, Modifier, Chain }.
+ MachineSDNode *Res = CurDAG->getMachineNode(
+ FLI->second, dl, RTys,
+ {IntN->getOperand(2), IntN->getOperand(3), IntN->getOperand(0)});
+
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemIntrinsicSDNode>(IntN)->getMemOperand();
+ Res->setMemRefs(MemOp, MemOp + 1);
+
+ ReplaceUses(SDValue(IntN, 0), SDValue(Res, 0));
+ ReplaceUses(SDValue(IntN, 1), SDValue(Res, 1));
+ ReplaceUses(SDValue(IntN, 2), SDValue(Res, 2));
+ CurDAG->RemoveDeadNode(IntN);
+ return true;
+ }
+ return false;
+}
+
+/// Generate a machine instruction node for the new circular buffer intrinsics.
+/// The new versions use a CSx register instead of the K field.
+bool HexagonDAGToDAGISel::SelectNewCircIntrinsic(SDNode *IntN) {
+ if (IntN->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+ return false;
+
+ SDLoc DL(IntN);
+ unsigned IntNo = cast<ConstantSDNode>(IntN->getOperand(1))->getZExtValue();
+ SmallVector<SDValue, 7> Ops;
+
+ static std::map<unsigned,unsigned> LoadNPcMap = {
+ { Intrinsic::hexagon_L2_loadrub_pci, Hexagon::PS_loadrub_pci },
+ { Intrinsic::hexagon_L2_loadrb_pci, Hexagon::PS_loadrb_pci },
+ { Intrinsic::hexagon_L2_loadruh_pci, Hexagon::PS_loadruh_pci },
+ { Intrinsic::hexagon_L2_loadrh_pci, Hexagon::PS_loadrh_pci },
+ { Intrinsic::hexagon_L2_loadri_pci, Hexagon::PS_loadri_pci },
+ { Intrinsic::hexagon_L2_loadrd_pci, Hexagon::PS_loadrd_pci },
+ { Intrinsic::hexagon_L2_loadrub_pcr, Hexagon::PS_loadrub_pcr },
+ { Intrinsic::hexagon_L2_loadrb_pcr, Hexagon::PS_loadrb_pcr },
+ { Intrinsic::hexagon_L2_loadruh_pcr, Hexagon::PS_loadruh_pcr },
+ { Intrinsic::hexagon_L2_loadrh_pcr, Hexagon::PS_loadrh_pcr },
+ { Intrinsic::hexagon_L2_loadri_pcr, Hexagon::PS_loadri_pcr },
+ { Intrinsic::hexagon_L2_loadrd_pcr, Hexagon::PS_loadrd_pcr }
+ };
+ auto FLI = LoadNPcMap.find (IntNo);
+ if (FLI != LoadNPcMap.end()) {
+ EVT ValTy = MVT::i32;
+ if (IntNo == Intrinsic::hexagon_L2_loadrd_pci ||
+ IntNo == Intrinsic::hexagon_L2_loadrd_pcr)
+ ValTy = MVT::i64;
+ EVT RTys[] = { ValTy, MVT::i32, MVT::Other };
+ // Handle load.*_pci case which has 6 operands.
+ if (IntN->getNumOperands() == 6) {
+ auto Inc = cast<ConstantSDNode>(IntN->getOperand(3));
+ SDValue I = CurDAG->getTargetConstant(Inc->getSExtValue(), DL, MVT::i32);
+ // Operands: { Base, Increment, Modifier, Start, Chain }.
+ Ops = { IntN->getOperand(2), I, IntN->getOperand(4), IntN->getOperand(5),
+ IntN->getOperand(0) };
+ } else
+ // Handle load.*_pcr case which has 5 operands.
+ // Operands: { Base, Modifier, Start, Chain }.
+ Ops = { IntN->getOperand(2), IntN->getOperand(3), IntN->getOperand(4),
+ IntN->getOperand(0) };
+ MachineSDNode *Res = CurDAG->getMachineNode(FLI->second, DL, RTys, Ops);
+ ReplaceUses(SDValue(IntN, 0), SDValue(Res, 0));
+ ReplaceUses(SDValue(IntN, 1), SDValue(Res, 1));
+ ReplaceUses(SDValue(IntN, 2), SDValue(Res, 2));
+ CurDAG->RemoveDeadNode(IntN);
+ return true;
+ }
+
+ static std::map<unsigned,unsigned> StoreNPcMap = {
+ { Intrinsic::hexagon_S2_storerb_pci, Hexagon::PS_storerb_pci },
+ { Intrinsic::hexagon_S2_storerh_pci, Hexagon::PS_storerh_pci },
+ { Intrinsic::hexagon_S2_storerf_pci, Hexagon::PS_storerf_pci },
+ { Intrinsic::hexagon_S2_storeri_pci, Hexagon::PS_storeri_pci },
+ { Intrinsic::hexagon_S2_storerd_pci, Hexagon::PS_storerd_pci },
+ { Intrinsic::hexagon_S2_storerb_pcr, Hexagon::PS_storerb_pcr },
+ { Intrinsic::hexagon_S2_storerh_pcr, Hexagon::PS_storerh_pcr },
+ { Intrinsic::hexagon_S2_storerf_pcr, Hexagon::PS_storerf_pcr },
+ { Intrinsic::hexagon_S2_storeri_pcr, Hexagon::PS_storeri_pcr },
+ { Intrinsic::hexagon_S2_storerd_pcr, Hexagon::PS_storerd_pcr }
+ };
+ auto FSI = StoreNPcMap.find (IntNo);
+ if (FSI != StoreNPcMap.end()) {
+ EVT RTys[] = { MVT::i32, MVT::Other };
+ // Handle store.*_pci case which has 7 operands.
+ if (IntN->getNumOperands() == 7) {
+ auto Inc = cast<ConstantSDNode>(IntN->getOperand(3));
+ SDValue I = CurDAG->getTargetConstant(Inc->getSExtValue(), DL, MVT::i32);
+ // Operands: { Base, Increment, Modifier, Value, Start, Chain }.
+ Ops = { IntN->getOperand(2), I, IntN->getOperand(4), IntN->getOperand(5),
+ IntN->getOperand(6), IntN->getOperand(0) };
+ } else
+ // Handle store.*_pcr case which has 6 operands.
+ // Operands: { Base, Modifier, Value, Start, Chain }.
+ Ops = { IntN->getOperand(2), IntN->getOperand(3), IntN->getOperand(4),
+ IntN->getOperand(5), IntN->getOperand(0) };
+ MachineSDNode *Res = CurDAG->getMachineNode(FSI->second, DL, RTys, Ops);
+ ReplaceUses(SDValue(IntN, 0), SDValue(Res, 0));
+ ReplaceUses(SDValue(IntN, 1), SDValue(Res, 1));
+ CurDAG->RemoveDeadNode(IntN);
+ return true;
+ }
return false;
}
@@ -385,9 +448,9 @@ bool HexagonDAGToDAGISel::tryLoadOfLoadIntrinsic(LoadSDNode *N) {
void HexagonDAGToDAGISel::SelectLoad(SDNode *N) {
SDLoc dl(N);
LoadSDNode *LD = cast<LoadSDNode>(N);
- ISD::MemIndexedMode AM = LD->getAddressingMode();
// Handle indexed loads.
+ ISD::MemIndexedMode AM = LD->getAddressingMode();
if (AM != ISD::UNINDEXED) {
SelectIndexedLoad(LD, dl);
return;
@@ -422,9 +485,16 @@ void HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, const SDLoc &dl) {
Opcode = IsValidInc ? Hexagon::S2_storerh_pi : Hexagon::S2_storerh_io;
break;
case MVT::i32:
+ case MVT::f32:
+ case MVT::v2i16:
+ case MVT::v4i8:
Opcode = IsValidInc ? Hexagon::S2_storeri_pi : Hexagon::S2_storeri_io;
break;
case MVT::i64:
+ case MVT::f64:
+ case MVT::v2i32:
+ case MVT::v4i16:
+ case MVT::v8i8:
Opcode = IsValidInc ? Hexagon::S2_storerd_pi : Hexagon::S2_storerd_io;
break;
case MVT::v64i8:
@@ -488,9 +558,9 @@ void HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, const SDLoc &dl) {
void HexagonDAGToDAGISel::SelectStore(SDNode *N) {
SDLoc dl(N);
StoreSDNode *ST = cast<StoreSDNode>(N);
- ISD::MemIndexedMode AM = ST->getAddressingMode();
// Handle indexed stores.
+ ISD::MemIndexedMode AM = ST->getAddressingMode();
if (AM != ISD::UNINDEXED) {
SelectIndexedStore(ST, dl);
return;
@@ -553,85 +623,6 @@ void HexagonDAGToDAGISel::SelectSHL(SDNode *N) {
return Default();
}
-
-//
-// If there is an zero_extend followed an intrinsic in DAG (this means - the
-// result of the intrinsic is predicate); convert the zero_extend to
-// transfer instruction.
-//
-// Zero extend -> transfer is lowered here. Otherwise, zero_extend will be
-// converted into a MUX as predicate registers defined as 1 bit in the
-// compiler. Architecture defines them as 8-bit registers.
-// We want to preserve all the lower 8-bits and, not just 1 LSB bit.
-//
-void HexagonDAGToDAGISel::SelectZeroExtend(SDNode *N) {
- SDLoc dl(N);
-
- SDValue Op0 = N->getOperand(0);
- EVT OpVT = Op0.getValueType();
- unsigned OpBW = OpVT.getSizeInBits();
-
- // Special handling for zero-extending a vector of booleans.
- if (OpVT.isVector() && OpVT.getVectorElementType() == MVT::i1 && OpBW <= 64) {
- SDNode *Mask = CurDAG->getMachineNode(Hexagon::C2_mask, dl, MVT::i64, Op0);
- unsigned NE = OpVT.getVectorNumElements();
- EVT ExVT = N->getValueType(0);
- unsigned ES = ExVT.getScalarSizeInBits();
- uint64_t MV = 0, Bit = 1;
- for (unsigned i = 0; i < NE; ++i) {
- MV |= Bit;
- Bit <<= ES;
- }
- SDValue Ones = CurDAG->getTargetConstant(MV, dl, MVT::i64);
- SDNode *OnesReg = CurDAG->getMachineNode(Hexagon::CONST64, dl,
- MVT::i64, Ones);
- if (ExVT.getSizeInBits() == 32) {
- SDNode *And = CurDAG->getMachineNode(Hexagon::A2_andp, dl, MVT::i64,
- SDValue(Mask,0), SDValue(OnesReg,0));
- SDValue SubR = CurDAG->getTargetConstant(Hexagon::isub_lo, dl, MVT::i32);
- ReplaceNode(N, CurDAG->getMachineNode(Hexagon::EXTRACT_SUBREG, dl, ExVT,
- SDValue(And, 0), SubR));
- return;
- }
- ReplaceNode(N,
- CurDAG->getMachineNode(Hexagon::A2_andp, dl, ExVT,
- SDValue(Mask, 0), SDValue(OnesReg, 0)));
- return;
- }
-
- SDNode *Int = N->getOperand(0).getNode();
- if ((Int->getOpcode() == ISD::INTRINSIC_WO_CHAIN)) {
- unsigned ID = cast<ConstantSDNode>(Int->getOperand(0))->getZExtValue();
- if (doesIntrinsicReturnPredicate(ID)) {
- // Now we need to differentiate target data types.
- if (N->getValueType(0) == MVT::i64) {
- // Convert the zero_extend to Rs = Pd followed by A2_combinew(0,Rs).
- SDValue TargetConst0 = CurDAG->getTargetConstant(0, dl, MVT::i32);
- SDNode *Result_1 = CurDAG->getMachineNode(Hexagon::C2_tfrpr, dl,
- MVT::i32, SDValue(Int, 0));
- SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::A2_tfrsi, dl,
- MVT::i32, TargetConst0);
- SDNode *Result_3 = CurDAG->getMachineNode(Hexagon::A2_combinew, dl,
- MVT::i64, MVT::Other,
- SDValue(Result_2, 0),
- SDValue(Result_1, 0));
- ReplaceNode(N, Result_3);
- return;
- }
- if (N->getValueType(0) == MVT::i32) {
- // Convert the zero_extend to Rs = Pd
- SDNode* RsPd = CurDAG->getMachineNode(Hexagon::C2_tfrpr, dl,
- MVT::i32, SDValue(Int, 0));
- ReplaceNode(N, RsPd);
- return;
- }
- llvm_unreachable("Unexpected value type");
- }
- }
- SelectCode(N);
-}
-
-
//
// Handling intrinsics for circular load and bitreverse load.
//
@@ -642,6 +633,13 @@ void HexagonDAGToDAGISel::SelectIntrinsicWChain(SDNode *N) {
return;
}
+ // Handle bit-reverse load intrinsics.
+ if (SelectBrevLdIntrinsic(N))
+ return;
+
+ if (SelectNewCircIntrinsic(N))
+ return;
+
unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
if (IntNo == Intrinsic::hexagon_V6_vgathermw ||
IntNo == Intrinsic::hexagon_V6_vgathermw_128B ||
@@ -735,7 +733,6 @@ void HexagonDAGToDAGISel::SelectConstant(SDNode *N) {
SelectCode(N);
}
-
void HexagonDAGToDAGISel::SelectFrameIndex(SDNode *N) {
MachineFrameInfo &MFI = MF->getFrameInfo();
const HexagonFrameLowering *HFI = HST->getFrameLowering();
@@ -765,20 +762,113 @@ void HexagonDAGToDAGISel::SelectFrameIndex(SDNode *N) {
ReplaceNode(N, R);
}
+void HexagonDAGToDAGISel::SelectAddSubCarry(SDNode *N) {
+ unsigned OpcCarry = N->getOpcode() == HexagonISD::ADDC ? Hexagon::A4_addp_c
+ : Hexagon::A4_subp_c;
+ SDNode *C = CurDAG->getMachineNode(OpcCarry, SDLoc(N), N->getVTList(),
+ { N->getOperand(0), N->getOperand(1),
+ N->getOperand(2) });
+ ReplaceNode(N, C);
+}
-void HexagonDAGToDAGISel::SelectBitcast(SDNode *N) {
- EVT SVT = N->getOperand(0).getValueType();
- EVT DVT = N->getValueType(0);
- if (!SVT.isVector() || !DVT.isVector() ||
- SVT.getVectorElementType() == MVT::i1 ||
- DVT.getVectorElementType() == MVT::i1 ||
- SVT.getSizeInBits() != DVT.getSizeInBits()) {
- SelectCode(N);
- return;
+void HexagonDAGToDAGISel::SelectVAlign(SDNode *N) {
+ MVT ResTy = N->getValueType(0).getSimpleVT();
+ if (HST->isHVXVectorType(ResTy, true))
+ return SelectHvxVAlign(N);
+
+ const SDLoc &dl(N);
+ unsigned VecLen = ResTy.getSizeInBits();
+ if (VecLen == 32) {
+ SDValue Ops[] = {
+ CurDAG->getTargetConstant(Hexagon::DoubleRegsRegClassID, dl, MVT::i32),
+ N->getOperand(0),
+ CurDAG->getTargetConstant(Hexagon::isub_hi, dl, MVT::i32),
+ N->getOperand(1),
+ CurDAG->getTargetConstant(Hexagon::isub_lo, dl, MVT::i32)
+ };
+ SDNode *R = CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl,
+ MVT::i64, Ops);
+
+ // Shift right by "(Addr & 0x3) * 8" bits, i.e. by (Addr & 0x3) bytes.
+ SDValue M0 = CurDAG->getTargetConstant(0x18, dl, MVT::i32);
+ SDValue M1 = CurDAG->getTargetConstant(0x03, dl, MVT::i32);
+ SDNode *C = CurDAG->getMachineNode(Hexagon::S4_andi_asl_ri, dl, MVT::i32,
+ M0, N->getOperand(2), M1);
+ SDNode *S = CurDAG->getMachineNode(Hexagon::S2_lsr_r_p, dl, MVT::i64,
+ SDValue(R, 0), SDValue(C, 0));
+ SDValue E = CurDAG->getTargetExtractSubreg(Hexagon::isub_lo, dl, ResTy,
+ SDValue(S, 0));
+ ReplaceNode(N, E.getNode());
+ } else {
+ assert(VecLen == 64);
+ SDNode *Pu = CurDAG->getMachineNode(Hexagon::C2_tfrrp, dl, MVT::v8i1,
+ N->getOperand(2));
+ SDNode *VA = CurDAG->getMachineNode(Hexagon::S2_valignrb, dl, ResTy,
+ N->getOperand(0), N->getOperand(1),
+ SDValue(Pu,0));
+ ReplaceNode(N, VA);
}
+}
+
+void HexagonDAGToDAGISel::SelectVAlignAddr(SDNode *N) {
+ const SDLoc &dl(N);
+ SDValue A = N->getOperand(1);
+ int Mask = -cast<ConstantSDNode>(A.getNode())->getSExtValue();
+ assert(isPowerOf2_32(-Mask));
+
+ SDValue M = CurDAG->getTargetConstant(Mask, dl, MVT::i32);
+ SDNode *AA = CurDAG->getMachineNode(Hexagon::A2_andir, dl, MVT::i32,
+ N->getOperand(0), M);
+ ReplaceNode(N, AA);
+}
+
+// Handle these nodes here to avoid having to write patterns for all
+// combinations of input/output types. In all cases, the resulting
+// instruction is the same.
+void HexagonDAGToDAGISel::SelectTypecast(SDNode *N) {
+ SDValue Op = N->getOperand(0);
+ MVT OpTy = Op.getValueType().getSimpleVT();
+ SDNode *T = CurDAG->MorphNodeTo(N, N->getOpcode(),
+ CurDAG->getVTList(OpTy), {Op});
+ ReplaceNode(T, Op.getNode());
+}
+
+void HexagonDAGToDAGISel::SelectP2D(SDNode *N) {
+ MVT ResTy = N->getValueType(0).getSimpleVT();
+ SDNode *T = CurDAG->getMachineNode(Hexagon::C2_mask, SDLoc(N), ResTy,
+ N->getOperand(0));
+ ReplaceNode(N, T);
+}
+
+void HexagonDAGToDAGISel::SelectD2P(SDNode *N) {
+ const SDLoc &dl(N);
+ MVT ResTy = N->getValueType(0).getSimpleVT();
+ SDValue Zero = CurDAG->getTargetConstant(0, dl, MVT::i32);
+ SDNode *T = CurDAG->getMachineNode(Hexagon::A4_vcmpbgtui, dl, ResTy,
+ N->getOperand(0), Zero);
+ ReplaceNode(N, T);
+}
+
+void HexagonDAGToDAGISel::SelectV2Q(SDNode *N) {
+ const SDLoc &dl(N);
+ MVT ResTy = N->getValueType(0).getSimpleVT();
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(N,0), N->getOperand(0));
- CurDAG->RemoveDeadNode(N);
+ SDValue C = CurDAG->getTargetConstant(-1, dl, MVT::i32);
+ SDNode *R = CurDAG->getMachineNode(Hexagon::A2_tfrsi, dl, MVT::i32, C);
+ SDNode *T = CurDAG->getMachineNode(Hexagon::V6_vandvrt, dl, ResTy,
+ N->getOperand(0), SDValue(R,0));
+ ReplaceNode(N, T);
+}
+
+void HexagonDAGToDAGISel::SelectQ2V(SDNode *N) {
+ const SDLoc &dl(N);
+ MVT ResTy = N->getValueType(0).getSimpleVT();
+
+ SDValue C = CurDAG->getTargetConstant(-1, dl, MVT::i32);
+ SDNode *R = CurDAG->getMachineNode(Hexagon::A2_tfrsi, dl, MVT::i32, C);
+ SDNode *T = CurDAG->getMachineNode(Hexagon::V6_vandqrt, dl, ResTy,
+ N->getOperand(0), SDValue(R,0));
+ ReplaceNode(N, T);
}
void HexagonDAGToDAGISel::Select(SDNode *N) {
@@ -789,13 +879,21 @@ void HexagonDAGToDAGISel::Select(SDNode *N) {
case ISD::Constant: return SelectConstant(N);
case ISD::ConstantFP: return SelectConstantFP(N);
case ISD::FrameIndex: return SelectFrameIndex(N);
- case ISD::BITCAST: return SelectBitcast(N);
case ISD::SHL: return SelectSHL(N);
case ISD::LOAD: return SelectLoad(N);
case ISD::STORE: return SelectStore(N);
- case ISD::ZERO_EXTEND: return SelectZeroExtend(N);
case ISD::INTRINSIC_W_CHAIN: return SelectIntrinsicWChain(N);
case ISD::INTRINSIC_WO_CHAIN: return SelectIntrinsicWOChain(N);
+
+ case HexagonISD::ADDC:
+ case HexagonISD::SUBC: return SelectAddSubCarry(N);
+ case HexagonISD::VALIGN: return SelectVAlign(N);
+ case HexagonISD::VALIGNADDR: return SelectVAlignAddr(N);
+ case HexagonISD::TYPECAST: return SelectTypecast(N);
+ case HexagonISD::P2D: return SelectP2D(N);
+ case HexagonISD::D2P: return SelectD2P(N);
+ case HexagonISD::Q2V: return SelectQ2V(N);
+ case HexagonISD::V2Q: return SelectV2Q(N);
}
if (HST->useHVXOps()) {
@@ -1240,7 +1338,7 @@ bool HexagonDAGToDAGISel::SelectAnyImmediate(SDValue &N, SDValue &R,
}
case HexagonISD::JT:
case HexagonISD::CP:
- // These are assumed to always be aligned at at least 8-byte boundary.
+ // These are assumed to always be aligned to at least an 8-byte boundary.
if (LogAlign > 3)
return false;
R = N.getOperand(0);
@@ -1252,7 +1350,7 @@ bool HexagonDAGToDAGISel::SelectAnyImmediate(SDValue &N, SDValue &R,
R = N;
return true;
case ISD::BlockAddress:
- // Block address is always aligned at at least 4-byte boundary.
+ // Block address is always aligned to at least a 4-byte boundary.
if (LogAlign > 2 || !IsAligned(cast<BlockAddressSDNode>(N)->getOffset()))
return false;
R = N;
@@ -1345,9 +1443,13 @@ bool HexagonDAGToDAGISel::DetectUseSxtw(SDValue &N, SDValue &R) {
EVT T = Opc == ISD::SIGN_EXTEND
? N.getOperand(0).getValueType()
: cast<VTSDNode>(N.getOperand(1))->getVT();
- if (T.getSizeInBits() != 32)
+ unsigned SW = T.getSizeInBits();
+ if (SW == 32)
+ R = N.getOperand(0);
+ else if (SW < 32)
+ R = N;
+ else
return false;
- R = N.getOperand(0);
break;
}
case ISD::LOAD: {
@@ -1361,6 +1463,13 @@ bool HexagonDAGToDAGISel::DetectUseSxtw(SDValue &N, SDValue &R) {
R = N;
break;
}
+ case ISD::SRA: {
+ auto *S = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ if (!S || S->getZExtValue() != 32)
+ return false;
+ R = N;
+ break;
+ }
default:
return false;
}
@@ -1500,7 +1609,7 @@ static bool isOpcodeHandled(const SDNode *N) {
}
}
-/// \brief Return the weight of an SDNode
+/// Return the weight of an SDNode
int HexagonDAGToDAGISel::getWeight(SDNode *N) {
if (!isOpcodeHandled(N))
return 1;
@@ -1799,15 +1908,15 @@ SDValue HexagonDAGToDAGISel::balanceSubTree(SDNode *N, bool TopLevel) {
RootHeights[N] = std::max(getHeight(N->getOperand(0).getNode()),
getHeight(N->getOperand(1).getNode())) + 1;
- DEBUG(dbgs() << "--> No need to balance root (Weight=" << Weight
- << " Height=" << RootHeights[N] << "): ");
- DEBUG(N->dump());
+ LLVM_DEBUG(dbgs() << "--> No need to balance root (Weight=" << Weight
+ << " Height=" << RootHeights[N] << "): ");
+ LLVM_DEBUG(N->dump(CurDAG));
return SDValue(N, 0);
}
- DEBUG(dbgs() << "** Balancing root node: ");
- DEBUG(N->dump());
+ LLVM_DEBUG(dbgs() << "** Balancing root node: ");
+ LLVM_DEBUG(N->dump(CurDAG));
unsigned NOpcode = N->getOpcode();
@@ -1855,7 +1964,7 @@ SDValue HexagonDAGToDAGISel::balanceSubTree(SDNode *N, bool TopLevel) {
// Whoops, this node was RAUWd by one of the balanceSubTree calls we
// made. Our worklist isn't up to date anymore.
// Restart the whole process.
- DEBUG(dbgs() << "--> Subtree was RAUWd. Restarting...\n");
+ LLVM_DEBUG(dbgs() << "--> Subtree was RAUWd. Restarting...\n");
return balanceSubTree(N, TopLevel);
}
@@ -1926,15 +2035,15 @@ SDValue HexagonDAGToDAGISel::balanceSubTree(SDNode *N, bool TopLevel) {
}
}
- DEBUG(dbgs() << "--> Current height=" << NodeHeights[SDValue(N, 0)]
- << " weight=" << CurrentWeight << " imbalanced="
- << Imbalanced << "\n");
+ LLVM_DEBUG(dbgs() << "--> Current height=" << NodeHeights[SDValue(N, 0)]
+ << " weight=" << CurrentWeight
+ << " imbalanced=" << Imbalanced << "\n");
// Transform MUL(x, C * 2^Y) + SHL(z, Y) -> SHL(ADD(MUL(x, C), z), Y)
// This factors out a shift in order to match memw(a<<Y+b).
if (CanFactorize && (willShiftRightEliminate(Mul1.Value, MaxPowerOf2) ||
willShiftRightEliminate(Mul2.Value, MaxPowerOf2))) {
- DEBUG(dbgs() << "--> Found common factor for two MUL children!\n");
+ LLVM_DEBUG(dbgs() << "--> Found common factor for two MUL children!\n");
int Weight = Mul1.Weight + Mul2.Weight;
int Height = std::max(NodeHeights[Mul1.Value], NodeHeights[Mul2.Value]) + 1;
SDValue Mul1Factored = factorOutPowerOf2(Mul1.Value, MaxPowerOf2);
@@ -1968,9 +2077,9 @@ SDValue HexagonDAGToDAGISel::balanceSubTree(SDNode *N, bool TopLevel) {
if (getUsesInFunction(GANode->getGlobal()) == 1 && Offset->hasOneUse() &&
getTargetLowering()->isOffsetFoldingLegal(GANode)) {
- DEBUG(dbgs() << "--> Combining GA and offset (" << Offset->getSExtValue()
- << "): ");
- DEBUG(GANode->dump());
+ LLVM_DEBUG(dbgs() << "--> Combining GA and offset ("
+ << Offset->getSExtValue() << "): ");
+ LLVM_DEBUG(GANode->dump(CurDAG));
SDValue NewTGA =
CurDAG->getTargetGlobalAddress(GANode->getGlobal(), SDLoc(GA.Value),
@@ -2014,7 +2123,7 @@ SDValue HexagonDAGToDAGISel::balanceSubTree(SDNode *N, bool TopLevel) {
// If this is the top level and we haven't factored out a shift, we should try
// to move a constant to the bottom to match addressing modes like memw(rX+C)
if (TopLevel && !CanFactorize && Leaves.hasConst()) {
- DEBUG(dbgs() << "--> Pushing constant to tip of tree.");
+ LLVM_DEBUG(dbgs() << "--> Pushing constant to tip of tree.");
Leaves.pushToBottom(Leaves.pop());
}
@@ -2041,7 +2150,7 @@ SDValue HexagonDAGToDAGISel::balanceSubTree(SDNode *N, bool TopLevel) {
// Make sure that none of these nodes have been RAUW'd
if ((RootWeights.count(V0.getNode()) && RootWeights[V0.getNode()] == -2) ||
(RootWeights.count(V1.getNode()) && RootWeights[V1.getNode()] == -2)) {
- DEBUG(dbgs() << "--> Subtree was RAUWd. Restarting...\n");
+ LLVM_DEBUG(dbgs() << "--> Subtree was RAUWd. Restarting...\n");
return balanceSubTree(N, TopLevel);
}
@@ -2075,9 +2184,9 @@ SDValue HexagonDAGToDAGISel::balanceSubTree(SDNode *N, bool TopLevel) {
int Weight = V0Weight + V1Weight;
Leaves.push(WeightedLeaf(NewNode, Weight, L0.InsertionOrder));
- DEBUG(dbgs() << "--> Built new node (Weight=" << Weight << ",Height="
- << Height << "):\n");
- DEBUG(NewNode.dump());
+ LLVM_DEBUG(dbgs() << "--> Built new node (Weight=" << Weight
+ << ",Height=" << Height << "):\n");
+ LLVM_DEBUG(NewNode.dump());
}
assert(Leaves.size() == 1);
@@ -2101,15 +2210,15 @@ SDValue HexagonDAGToDAGISel::balanceSubTree(SDNode *N, bool TopLevel) {
}
if (N != NewRoot.getNode()) {
- DEBUG(dbgs() << "--> Root is now: ");
- DEBUG(NewRoot.dump());
+ LLVM_DEBUG(dbgs() << "--> Root is now: ");
+ LLVM_DEBUG(NewRoot.dump());
// Replace all uses of old root by new root
CurDAG->ReplaceAllUsesWith(N, NewRoot.getNode());
// Mark that we have RAUW'd N
RootWeights[N] = -2;
} else {
- DEBUG(dbgs() << "--> Root unchanged.\n");
+ LLVM_DEBUG(dbgs() << "--> Root unchanged.\n");
}
RootWeights[NewRoot.getNode()] = Leaves.top().Weight;
@@ -2132,8 +2241,8 @@ void HexagonDAGToDAGISel::rebalanceAddressTrees() {
if (RootWeights.count(BasePtr.getNode()))
continue;
- DEBUG(dbgs() << "** Rebalancing address calculation in node: ");
- DEBUG(N->dump());
+ LLVM_DEBUG(dbgs() << "** Rebalancing address calculation in node: ");
+ LLVM_DEBUG(N->dump(CurDAG));
// FindRoots
SmallVector<SDNode *, 4> Worklist;
@@ -2173,8 +2282,8 @@ void HexagonDAGToDAGISel::rebalanceAddressTrees() {
N = CurDAG->UpdateNodeOperands(N, N->getOperand(0), N->getOperand(1),
NewBasePtr, N->getOperand(3));
- DEBUG(dbgs() << "--> Final node: ");
- DEBUG(N->dump());
+ LLVM_DEBUG(dbgs() << "--> Final node: ");
+ LLVM_DEBUG(N->dump(CurDAG));
}
CurDAG->RemoveDeadNodes();
@@ -2182,4 +2291,3 @@ void HexagonDAGToDAGISel::rebalanceAddressTrees() {
RootHeights.clear();
RootWeights.clear();
}
-
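Most of the new Select* handlers added in this file share one shape: build a single target machine node with the same value types and splice it in for the generic node. A condensed sketch under an assumed opcode parameter (selectUnarySketch and TargetOpc are illustrative); getMachineNode, ReplaceAllUsesWith and RemoveDeadNode are real SelectionDAG calls, and the ReplaceNode helper used in the handlers above combines the last two steps in one call:

#include "llvm/CodeGen/SelectionDAG.h"

static void selectUnarySketch(llvm::SelectionDAG &DAG, llvm::SDNode *N,
                              unsigned TargetOpc) {
  llvm::SDLoc dl(N);
  llvm::MVT ResTy = N->getSimpleValueType(0);
  // Build the machine node, then replace and delete the generic node.
  llvm::SDNode *T = DAG.getMachineNode(TargetOpc, dl, ResTy, N->getOperand(0));
  DAG.ReplaceAllUsesWith(N, T);
  DAG.RemoveDeadNode(N);
}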
diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.h b/lib/Target/Hexagon/HexagonISelDAGToDAG.h
index fc66940ee52d..f4f09dd4e758 100644
--- a/lib/Target/Hexagon/HexagonISelDAGToDAG.h
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.h
@@ -90,6 +90,8 @@ public:
unsigned ConstraintID,
std::vector<SDValue> &OutOps) override;
bool tryLoadOfLoadIntrinsic(LoadSDNode *N);
+ bool SelectBrevLdIntrinsic(SDNode *IntN);
+ bool SelectNewCircIntrinsic(SDNode *IntN);
void SelectLoad(SDNode *N);
void SelectIndexedLoad(LoadSDNode *LD, const SDLoc &dl);
void SelectIndexedStore(StoreSDNode *ST, const SDLoc &dl);
@@ -100,10 +102,17 @@ public:
void SelectIntrinsicWOChain(SDNode *N);
void SelectConstant(SDNode *N);
void SelectConstantFP(SDNode *N);
- void SelectBitcast(SDNode *N);
void SelectV65Gather(SDNode *N);
void SelectV65GatherPred(SDNode *N);
void SelectHVXDualOutput(SDNode *N);
+ void SelectAddSubCarry(SDNode *N);
+ void SelectVAlign(SDNode *N);
+ void SelectVAlignAddr(SDNode *N);
+ void SelectTypecast(SDNode *N);
+ void SelectP2D(SDNode *N);
+ void SelectD2P(SDNode *N);
+ void SelectQ2V(SDNode *N);
+ void SelectV2Q(SDNode *N);
// Include the declarations autogenerated from the selection patterns.
#define GET_DAGISEL_DECL
@@ -122,6 +131,7 @@ private:
void SelectHvxShuffle(SDNode *N);
void SelectHvxRor(SDNode *N);
+ void SelectHvxVAlign(SDNode *N);
bool keepsLowBits(const SDValue &Val, unsigned NumBits, SDValue &Src);
bool isAlignedMemNode(const MemSDNode *N) const;
diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
index 740861851185..8aef9b4560d5 100644
--- a/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
@@ -11,6 +11,7 @@
#include "HexagonISelDAGToDAG.h"
#include "HexagonISelLowering.h"
#include "HexagonTargetMachine.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/IR/Intrinsics.h"
@@ -94,18 +95,13 @@ namespace {
// Benes network is a forward delta network immediately followed by
// a reverse delta network.
+enum class ColorKind { None, Red, Black };
// Graph coloring utility used to partition nodes into two groups:
// they will correspond to nodes routed to the upper and lower networks.
struct Coloring {
- enum : uint8_t {
- None = 0,
- Red,
- Black
- };
-
using Node = int;
- using MapType = std::map<Node,uint8_t>;
+ using MapType = std::map<Node, ColorKind>;
static constexpr Node Ignore = Node(-1);
Coloring(ArrayRef<Node> Ord) : Order(Ord) {
@@ -118,10 +114,10 @@ struct Coloring {
return Colors;
}
- uint8_t other(uint8_t Color) {
- if (Color == None)
- return Red;
- return Color == Red ? Black : Red;
+ ColorKind other(ColorKind Color) {
+ if (Color == ColorKind::None)
+ return ColorKind::Red;
+ return Color == ColorKind::Red ? ColorKind::Black : ColorKind::Red;
}
void dump() const;
@@ -139,28 +135,28 @@ private:
return (Pos < Num/2) ? Pos + Num/2 : Pos - Num/2;
}
- uint8_t getColor(Node N) {
+ ColorKind getColor(Node N) {
auto F = Colors.find(N);
- return F != Colors.end() ? F->second : (uint8_t)None;
+ return F != Colors.end() ? F->second : ColorKind::None;
}
- std::pair<bool,uint8_t> getUniqueColor(const NodeSet &Nodes);
+ std::pair<bool, ColorKind> getUniqueColor(const NodeSet &Nodes);
void build();
bool color();
};
} // namespace
-std::pair<bool,uint8_t> Coloring::getUniqueColor(const NodeSet &Nodes) {
- uint8_t Color = None;
+std::pair<bool, ColorKind> Coloring::getUniqueColor(const NodeSet &Nodes) {
+ auto Color = ColorKind::None;
for (Node N : Nodes) {
- uint8_t ColorN = getColor(N);
- if (ColorN == None)
+ ColorKind ColorN = getColor(N);
+ if (ColorN == ColorKind::None)
continue;
- if (Color == None)
+ if (Color == ColorKind::None)
Color = ColorN;
- else if (Color != None && Color != ColorN)
- return { false, None };
+ else if (Color != ColorKind::None && Color != ColorN)
+ return { false, ColorKind::None };
}
return { true, Color };
}
@@ -245,12 +241,12 @@ bool Coloring::color() {
// Coloring failed. Split this node.
Node C = conj(N);
- uint8_t ColorN = other(None);
- uint8_t ColorC = other(ColorN);
+ ColorKind ColorN = other(ColorKind::None);
+ ColorKind ColorC = other(ColorN);
NodeSet &Cs = Edges[C];
NodeSet CopyNs = Ns;
for (Node M : CopyNs) {
- uint8_t ColorM = getColor(M);
+ ColorKind ColorM = getColor(M);
if (ColorM == ColorC) {
// Connect M with C, disconnect M from N.
Cs.insert(M);
@@ -263,10 +259,10 @@ bool Coloring::color() {
Colors[C] = ColorC;
}
- // Explicitly assign "None" all all uncolored nodes.
+ // Explicitly assign "None" to all uncolored nodes.
for (unsigned I = 0; I != Order.size(); ++I)
if (Colors.count(I) == 0)
- Colors[I] = None;
+ Colors[I] = ColorKind::None;
return true;
}
@@ -296,10 +292,21 @@ void Coloring::dump() const {
}
dbgs() << " }\n";
- static const char *const Names[] = { "None", "Red", "Black" };
+ auto ColorKindToName = [](ColorKind C) {
+ switch (C) {
+ case ColorKind::None:
+ return "None";
+ case ColorKind::Red:
+ return "Red";
+ case ColorKind::Black:
+ return "Black";
+ }
+ llvm_unreachable("all ColorKinds should be handled by the switch above");
+ };
+
dbgs() << " Colors: {\n";
for (auto C : Colors)
- dbgs() << " " << C.first << " -> " << Names[C.second] << "\n";
+ dbgs() << " " << C.first << " -> " << ColorKindToName(C.second) << "\n";
dbgs() << " }\n}\n";
}
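The switch-based ColorKindToName lambda above replaces the old array lookup because the enum became scoped. A minimal standalone illustration of that constraint; only ColorKind mirrors the patch, the toName helper is illustrative:

enum class ColorKind { None, Red, Black };

static const char *toName(ColorKind C) {
  // A scoped enumerator no longer converts implicitly to an index, so
  // Names[C] would not compile; an exhaustive switch takes its place.
  switch (C) {
  case ColorKind::None:  return "None";
  case ColorKind::Red:   return "Red";
  case ColorKind::Black: return "Black";
  }
  return "unknown"; // unreachable with a well-formed ColorKind
}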
@@ -471,21 +478,21 @@ bool ReverseDeltaNetwork::route(ElemType *P, RowType *T, unsigned Size,
if (M.empty())
return false;
- uint8_t ColorUp = Coloring::None;
+ ColorKind ColorUp = ColorKind::None;
for (ElemType J = 0; J != Num; ++J) {
ElemType I = P[J];
// I is the position in the input,
// J is the position in the output.
if (I == Ignore)
continue;
- uint8_t C = M.at(I);
- if (C == Coloring::None)
+ ColorKind C = M.at(I);
+ if (C == ColorKind::None)
continue;
// During "Step", inputs cannot switch halves, so if the "up" color
// is still unknown, make sure that it is selected in such a way that
// "I" will stay in the same half.
bool InpUp = I < Num/2;
- if (ColorUp == Coloring::None)
+ if (ColorUp == ColorKind::None)
ColorUp = InpUp ? C : G.other(C);
if ((C == ColorUp) != InpUp) {
// If I should go to a different half than where it is now, give up.
@@ -545,16 +552,16 @@ bool BenesNetwork::route(ElemType *P, RowType *T, unsigned Size,
// Both assignments, i.e. Red->Up and Red->Down are valid, but they will
// result in different controls. Let's pick the one where the first
// control will be "Pass".
- uint8_t ColorUp = Coloring::None;
+ ColorKind ColorUp = ColorKind::None;
for (ElemType J = 0; J != Num; ++J) {
ElemType I = P[J];
if (I == Ignore)
continue;
- uint8_t C = M.at(I);
- if (C == Coloring::None)
+ ColorKind C = M.at(I);
+ if (C == ColorKind::None)
continue;
- if (ColorUp == Coloring::None) {
- ColorUp = (I < Num/2) ? Coloring::Red : Coloring::Black;
+ if (ColorUp == ColorKind::None) {
+ ColorUp = (I < Num / 2) ? ColorKind::Red : ColorKind::Black;
}
unsigned CI = (I < Num/2) ? I+Num/2 : I-Num/2;
if (C == ColorUp) {
@@ -769,6 +776,13 @@ struct ShuffleMask {
size_t H = Mask.size()/2;
return ShuffleMask(Mask.take_back(H));
}
+
+ void print(raw_ostream &OS) const {
+ OS << "MinSrc:" << MinSrc << ", MaxSrc:" << MaxSrc << " {";
+ for (int M : Mask)
+ OS << ' ' << M;
+ OS << " }";
+ }
};
} // namespace
@@ -806,6 +820,7 @@ namespace llvm {
void selectShuffle(SDNode *N);
void selectRor(SDNode *N);
+ void selectVAlign(SDNode *N);
private:
void materialize(const ResultStack &Results);
@@ -821,7 +836,6 @@ namespace llvm {
MutableArrayRef<int> NewMask, unsigned Options = None);
OpRef packp(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results,
MutableArrayRef<int> NewMask);
- OpRef zerous(ShuffleMask SM, OpRef Va, ResultStack &Results);
OpRef vmuxs(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
ResultStack &Results);
OpRef vmuxp(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
@@ -905,42 +919,55 @@ static bool isPermutation(ArrayRef<int> Mask) {
}
bool HvxSelector::selectVectorConstants(SDNode *N) {
- // Constant vectors are generated as loads from constant pools.
- // Since they are generated during the selection process, the main
- // selection algorithm is not aware of them. Select them directly
- // here.
- SmallVector<SDNode*,4> Loads;
- SmallVector<SDNode*,16> WorkQ;
+ // Constant vectors are generated as loads from constant pools or as
+ // splats of a constant value. Since they are generated during the
+ // selection process, the main selection algorithm is not aware of them.
+ // Select them directly here.
+ SmallVector<SDNode*,4> Nodes;
+ SetVector<SDNode*> WorkQ;
+
+ // The one-use test for VSPLATW's operand may fail due to dead nodes
+ // left over in the DAG.
+ DAG.RemoveDeadNodes();
// The DAG can change (due to CSE) during selection, so cache all the
// unselected nodes first to avoid traversing a mutating DAG.
- auto IsLoadToSelect = [] (SDNode *N) {
- if (!N->isMachineOpcode() && N->getOpcode() == ISD::LOAD) {
- SDValue Addr = cast<LoadSDNode>(N)->getBasePtr();
- unsigned AddrOpc = Addr.getOpcode();
- if (AddrOpc == HexagonISD::AT_PCREL || AddrOpc == HexagonISD::CP)
- if (Addr.getOperand(0).getOpcode() == ISD::TargetConstantPool)
- return true;
+ auto IsNodeToSelect = [] (SDNode *N) {
+ if (N->isMachineOpcode())
+ return false;
+ switch (N->getOpcode()) {
+ case HexagonISD::VZERO:
+ case HexagonISD::VSPLATW:
+ return true;
+ case ISD::LOAD: {
+ SDValue Addr = cast<LoadSDNode>(N)->getBasePtr();
+ unsigned AddrOpc = Addr.getOpcode();
+ if (AddrOpc == HexagonISD::AT_PCREL || AddrOpc == HexagonISD::CP)
+ if (Addr.getOperand(0).getOpcode() == ISD::TargetConstantPool)
+ return true;
+ }
+ break;
}
- return false;
+ // Make sure to select the operand of VSPLATW.
+ bool IsSplatOp = N->hasOneUse() &&
+ N->use_begin()->getOpcode() == HexagonISD::VSPLATW;
+ return IsSplatOp;
};
- WorkQ.push_back(N);
+ WorkQ.insert(N);
for (unsigned i = 0; i != WorkQ.size(); ++i) {
SDNode *W = WorkQ[i];
- if (IsLoadToSelect(W)) {
- Loads.push_back(W);
- continue;
- }
+ if (IsNodeToSelect(W))
+ Nodes.push_back(W);
for (unsigned j = 0, f = W->getNumOperands(); j != f; ++j)
- WorkQ.push_back(W->getOperand(j).getNode());
+ WorkQ.insert(W->getOperand(j).getNode());
}
- for (SDNode *L : Loads)
+ for (SDNode *L : Nodes)
ISel.Select(L);
- return !Loads.empty();
+ return !Nodes.empty();
}
void HvxSelector::materialize(const ResultStack &Results) {
@@ -977,15 +1004,11 @@ void HvxSelector::materialize(const ResultStack &Results) {
MVT OpTy = Op.getValueType().getSimpleVT();
if (Part != OpRef::Whole) {
assert(Part == OpRef::LoHalf || Part == OpRef::HiHalf);
- if (Op.getOpcode() == HexagonISD::VCOMBINE) {
- Op = (Part == OpRef::HiHalf) ? Op.getOperand(0) : Op.getOperand(1);
- } else {
- MVT HalfTy = MVT::getVectorVT(OpTy.getVectorElementType(),
- OpTy.getVectorNumElements()/2);
- unsigned Sub = (Part == OpRef::LoHalf) ? Hexagon::vsub_lo
- : Hexagon::vsub_hi;
- Op = DAG.getTargetExtractSubreg(Sub, dl, HalfTy, Op);
- }
+ MVT HalfTy = MVT::getVectorVT(OpTy.getVectorElementType(),
+ OpTy.getVectorNumElements()/2);
+ unsigned Sub = (Part == OpRef::LoHalf) ? Hexagon::vsub_lo
+ : Hexagon::vsub_hi;
+ Op = DAG.getTargetExtractSubreg(Sub, dl, HalfTy, Op);
}
Ops.push_back(Op);
} // for (Node : Results)
@@ -1031,25 +1054,53 @@ OpRef HvxSelector::packs(ShuffleMask SM, OpRef Va, OpRef Vb,
int VecLen = SM.Mask.size();
MVT Ty = getSingleVT(MVT::i8);
- if (SM.MaxSrc - SM.MinSrc < int(HwLen)) {
- if (SM.MaxSrc < int(HwLen)) {
- memcpy(NewMask.data(), SM.Mask.data(), sizeof(int)*VecLen);
- return Va;
+ auto IsExtSubvector = [] (ShuffleMask M) {
+ assert(M.MinSrc >= 0 && M.MaxSrc >= 0);
+ for (int I = 0, E = M.Mask.size(); I != E; ++I) {
+ if (M.Mask[I] >= 0 && M.Mask[I]-I != M.MinSrc)
+ return false;
}
- if (SM.MinSrc >= int(HwLen)) {
- for (int I = 0; I != VecLen; ++I) {
- int M = SM.Mask[I];
- if (M != -1)
- M -= HwLen;
- NewMask[I] = M;
+ return true;
+ };
+
+ if (SM.MaxSrc - SM.MinSrc < int(HwLen)) {
+ if (SM.MinSrc == 0 || SM.MinSrc == int(HwLen) || !IsExtSubvector(SM)) {
+ // If the mask picks elements from only one of the operands, return
+ // that operand, and update the mask to use index 0 to refer to the
+ // first element of that operand.
+ // If the mask extracts a subvector, it will be handled below, so
+ // skip it here.
+ if (SM.MaxSrc < int(HwLen)) {
+ memcpy(NewMask.data(), SM.Mask.data(), sizeof(int)*VecLen);
+ return Va;
+ }
+ if (SM.MinSrc >= int(HwLen)) {
+ for (int I = 0; I != VecLen; ++I) {
+ int M = SM.Mask[I];
+ if (M != -1)
+ M -= HwLen;
+ NewMask[I] = M;
+ }
+ return Vb;
}
- return Vb;
+ }
+ int MinSrc = SM.MinSrc;
+ if (SM.MaxSrc < int(HwLen)) {
+ Vb = Va;
+ } else if (SM.MinSrc > int(HwLen)) {
+ Va = Vb;
+ MinSrc = SM.MinSrc - HwLen;
}
const SDLoc &dl(Results.InpNode);
- SDValue S = DAG.getTargetConstant(SM.MinSrc, dl, MVT::i32);
- if (isUInt<3>(SM.MinSrc)) {
- Results.push(Hexagon::V6_valignbi, Ty, {Vb, Va, S});
+ if (isUInt<3>(MinSrc) || isUInt<3>(HwLen-MinSrc)) {
+ bool IsRight = isUInt<3>(MinSrc); // Right align.
+ SDValue S = DAG.getTargetConstant(IsRight ? MinSrc : HwLen-MinSrc,
+ dl, MVT::i32);
+ unsigned Opc = IsRight ? Hexagon::V6_valignbi
+ : Hexagon::V6_vlalignbi;
+ Results.push(Opc, Ty, {Vb, Va, S});
} else {
+ SDValue S = DAG.getTargetConstant(MinSrc, dl, MVT::i32);
Results.push(Hexagon::A2_tfrsi, MVT::i32, {S});
unsigned Top = Results.top();
Results.push(Hexagon::V6_valignb, Ty, {Vb, Va, OpRef::res(Top)});
@@ -1139,25 +1190,6 @@ OpRef HvxSelector::packp(ShuffleMask SM, OpRef Va, OpRef Vb,
return concat(Out[0], Out[1], Results);
}
-OpRef HvxSelector::zerous(ShuffleMask SM, OpRef Va, ResultStack &Results) {
- DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
-
- int VecLen = SM.Mask.size();
- SmallVector<uint8_t,128> UsedBytes(VecLen);
- bool HasUnused = false;
- for (int I = 0; I != VecLen; ++I) {
- if (SM.Mask[I] != -1)
- UsedBytes[I] = 0xFF;
- else
- HasUnused = true;
- }
- if (!HasUnused)
- return Va;
- SDValue B = getVectorConstant(UsedBytes, SDLoc(Results.InpNode));
- Results.push(Hexagon::V6_vand, getSingleVT(MVT::i8), {Va, OpRef(B)});
- return OpRef::res(Results.top());
-}
-
OpRef HvxSelector::vmuxs(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
ResultStack &Results) {
DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
@@ -1279,6 +1311,8 @@ OpRef HvxSelector::shuffp2(ShuffleMask SM, OpRef Va, OpRef Vb,
return shuffp1(ShuffleMask(PackedMask), P, Results);
SmallVector<int,256> MaskL(VecLen), MaskR(VecLen);
+ splitMask(SM.Mask, MaskL, MaskR);
+
OpRef L = shuffp1(ShuffleMask(MaskL), Va, Results);
OpRef R = shuffp1(ShuffleMask(MaskR), Vb, Results);
if (!L.isValid() || !R.isValid())
@@ -1934,7 +1968,6 @@ void HvxSelector::selectShuffle(SDNode *N) {
// If the mask is all -1's, generate "undef".
if (!UseLeft && !UseRight) {
ISel.ReplaceNode(N, ISel.selectUndef(SDLoc(SN), ResTy).getNode());
- DAG.RemoveDeadNode(N);
return;
}
@@ -1976,8 +2009,8 @@ void HvxSelector::selectRor(SDNode *N) {
SDNode *NewN = nullptr;
if (auto *CN = dyn_cast<ConstantSDNode>(RotV.getNode())) {
- unsigned S = CN->getZExtValue();
- if (S % HST.getVectorLength() == 0) {
+ unsigned S = CN->getZExtValue() % HST.getVectorLength();
+ if (S == 0) {
NewN = VecV.getNode();
} else if (isUInt<3>(S)) {
SDValue C = DAG.getTargetConstant(S, dl, MVT::i32);
@@ -1990,6 +2023,15 @@ void HvxSelector::selectRor(SDNode *N) {
NewN = DAG.getMachineNode(Hexagon::V6_vror, dl, Ty, {VecV, RotV});
ISel.ReplaceNode(N, NewN);
+}
+
+void HvxSelector::selectVAlign(SDNode *N) {
+ SDValue Vv = N->getOperand(0);
+ SDValue Vu = N->getOperand(1);
+ SDValue Rt = N->getOperand(2);
+ SDNode *NewN = DAG.getMachineNode(Hexagon::V6_valignb, SDLoc(N),
+ N->getValueType(0), {Vv, Vu, Rt});
+ ISel.ReplaceNode(N, NewN);
DAG.RemoveDeadNode(N);
}
@@ -2001,7 +2043,15 @@ void HexagonDAGToDAGISel::SelectHvxRor(SDNode *N) {
HvxSelector(*this, *CurDAG).selectRor(N);
}
+void HexagonDAGToDAGISel::SelectHvxVAlign(SDNode *N) {
+ HvxSelector(*this, *CurDAG).selectVAlign(N);
+}
+
void HexagonDAGToDAGISel::SelectV65GatherPred(SDNode *N) {
+ if (!HST->usePackets()) {
+ report_fatal_error("Support for gather requires packets, "
+ "which are disabled");
+ }
const SDLoc &dl(N);
SDValue Chain = N->getOperand(0);
SDValue Address = N->getOperand(2);
@@ -2037,11 +2087,14 @@ void HexagonDAGToDAGISel::SelectV65GatherPred(SDNode *N) {
MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
- ReplaceUses(N, Result);
- CurDAG->RemoveDeadNode(N);
+ ReplaceNode(N, Result);
}
void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) {
+ if (!HST->usePackets()) {
+ report_fatal_error("Support for gather requires packets, "
+ "which are disabled");
+ }
const SDLoc &dl(N);
SDValue Chain = N->getOperand(0);
SDValue Address = N->getOperand(2);
@@ -2076,8 +2129,7 @@ void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) {
MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
- ReplaceUses(N, Result);
- CurDAG->RemoveDeadNode(N);
+ ReplaceNode(N, Result);
}
void HexagonDAGToDAGISel::SelectHVXDualOutput(SDNode *N) {
@@ -2120,5 +2172,3 @@ void HexagonDAGToDAGISel::SelectHVXDualOutput(SDNode *N) {
ReplaceUses(SDValue(N, 1), SDValue(Result, 1));
CurDAG->RemoveDeadNode(N);
}
-
-
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index 0e0da2ddc400..604d84994b6c 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -40,6 +40,7 @@
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
@@ -103,427 +104,52 @@ static cl::opt<int> MaxStoresPerMemsetOptSizeCL("max-store-memset-Os",
cl::Hidden, cl::ZeroOrMore, cl::init(4),
cl::desc("Max #stores to inline memset"));
+static cl::opt<bool> AlignLoads("hexagon-align-loads",
+ cl::Hidden, cl::init(false),
+ cl::desc("Rewrite unaligned loads as a pair of aligned loads"));
+
namespace {
class HexagonCCState : public CCState {
- unsigned NumNamedVarArgParams;
+ unsigned NumNamedVarArgParams = 0;
public:
- HexagonCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
+ HexagonCCState(CallingConv::ID CC, bool IsVarArg, MachineFunction &MF,
SmallVectorImpl<CCValAssign> &locs, LLVMContext &C,
- int NumNamedVarArgParams)
- : CCState(CC, isVarArg, MF, locs, C),
- NumNamedVarArgParams(NumNamedVarArgParams) {}
-
+ unsigned NumNamedArgs)
+ : CCState(CC, IsVarArg, MF, locs, C),
+ NumNamedVarArgParams(NumNamedArgs) {}
unsigned getNumNamedVarArgParams() const { return NumNamedVarArgParams; }
};
- enum StridedLoadKind {
- Even = 0,
- Odd,
- NoPattern
- };
-
} // end anonymous namespace
-// Implement calling convention for Hexagon.
-
-static const MVT LegalV64[] = { MVT::v64i8, MVT::v32i16, MVT::v16i32 };
-static const MVT LegalW64[] = { MVT::v128i8, MVT::v64i16, MVT::v32i32 };
-static const MVT LegalV128[] = { MVT::v128i8, MVT::v64i16, MVT::v32i32 };
-static const MVT LegalW128[] = { MVT::v256i8, MVT::v128i16, MVT::v64i32 };
-
-static bool
-CC_Hexagon(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
-
-static bool
-CC_Hexagon32(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
-
-static bool
-CC_Hexagon64(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
-
-static bool
-CC_HexagonVector(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
-
-static bool
-RetCC_Hexagon(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
-
-static bool
-RetCC_Hexagon32(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
-
-static bool
-RetCC_Hexagon64(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
-
-static bool
-RetCC_HexagonVector(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
-
-static bool
-CC_Hexagon_VarArg (unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- HexagonCCState &HState = static_cast<HexagonCCState &>(State);
-
- if (ValNo < HState.getNumNamedVarArgParams()) {
- // Deal with named arguments.
- return CC_Hexagon(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State);
- }
-
- // Deal with un-named arguments.
- unsigned Offset;
- if (ArgFlags.isByVal()) {
- // If pass-by-value, the size allocated on stack is decided
- // by ArgFlags.getByValSize(), not by the size of LocVT.
- Offset = State.AllocateStack(ArgFlags.getByValSize(),
- ArgFlags.getByValAlign());
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- if (LocVT == MVT::i1 || LocVT == MVT::i8 || LocVT == MVT::i16) {
- LocVT = MVT::i32;
- ValVT = MVT::i32;
- if (ArgFlags.isSExt())
- LocInfo = CCValAssign::SExt;
- else if (ArgFlags.isZExt())
- LocInfo = CCValAssign::ZExt;
- else
- LocInfo = CCValAssign::AExt;
- }
- if (LocVT == MVT::i32 || LocVT == MVT::f32) {
- Offset = State.AllocateStack(4, 4);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- if (LocVT == MVT::i64 || LocVT == MVT::f64) {
- Offset = State.AllocateStack(8, 8);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- if (LocVT == MVT::v2i64 || LocVT == MVT::v4i32 || LocVT == MVT::v8i16 ||
- LocVT == MVT::v16i8) {
- Offset = State.AllocateStack(16, 16);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- if (LocVT == MVT::v4i64 || LocVT == MVT::v8i32 || LocVT == MVT::v16i16 ||
- LocVT == MVT::v32i8) {
- Offset = State.AllocateStack(32, 32);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- if (LocVT == MVT::v16i32 || LocVT == MVT::v32i16 ||
- LocVT == MVT::v64i8 || LocVT == MVT::v512i1) {
- Offset = State.AllocateStack(64, 64);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- if (LocVT == MVT::v32i32 || LocVT == MVT::v64i16 ||
- LocVT == MVT::v128i8 || LocVT == MVT::v1024i1) {
- Offset = State.AllocateStack(128, 128);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- if (LocVT == MVT::v64i32 || LocVT == MVT::v128i16 ||
- LocVT == MVT::v256i8) {
- Offset = State.AllocateStack(256, 256);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
-
- llvm_unreachable(nullptr);
-}
-
-static bool CC_Hexagon (unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) {
- if (ArgFlags.isByVal()) {
- // Passed on stack.
- unsigned Offset = State.AllocateStack(ArgFlags.getByValSize(),
- ArgFlags.getByValAlign());
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
-
- if (LocVT == MVT::i1) {
- LocVT = MVT::i32;
- } else if (LocVT == MVT::i8 || LocVT == MVT::i16) {
- LocVT = MVT::i32;
- ValVT = MVT::i32;
- if (ArgFlags.isSExt())
- LocInfo = CCValAssign::SExt;
- else if (ArgFlags.isZExt())
- LocInfo = CCValAssign::ZExt;
- else
- LocInfo = CCValAssign::AExt;
- } else if (LocVT == MVT::v4i8 || LocVT == MVT::v2i16) {
- LocVT = MVT::i32;
- LocInfo = CCValAssign::BCvt;
- } else if (LocVT == MVT::v8i8 || LocVT == MVT::v4i16 || LocVT == MVT::v2i32) {
- LocVT = MVT::i64;
- LocInfo = CCValAssign::BCvt;
- }
-
- if (LocVT == MVT::i32 || LocVT == MVT::f32) {
- if (!CC_Hexagon32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
- return false;
- }
-
- if (LocVT == MVT::i64 || LocVT == MVT::f64) {
- if (!CC_Hexagon64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
- return false;
- }
-
- if (LocVT == MVT::v8i32 || LocVT == MVT::v16i16 || LocVT == MVT::v32i8) {
- unsigned Offset = State.AllocateStack(ArgFlags.getByValSize(), 32);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
-
- auto &HST = State.getMachineFunction().getSubtarget<HexagonSubtarget>();
- if (HST.isHVXVectorType(LocVT)) {
- if (!CC_HexagonVector(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
- return false;
- }
-
- return true; // CC didn't match.
-}
+// Implement calling convention for Hexagon.
-static bool CC_Hexagon32(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- static const MCPhysReg RegList[] = {
- Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4,
- Hexagon::R5
+static bool CC_SkipOdd(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ static const MCPhysReg ArgRegs[] = {
+ Hexagon::R0, Hexagon::R1, Hexagon::R2,
+ Hexagon::R3, Hexagon::R4, Hexagon::R5
};
- if (unsigned Reg = State.AllocateReg(RegList)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
+ const unsigned NumArgRegs = array_lengthof(ArgRegs);
+ unsigned RegNum = State.getFirstUnallocated(ArgRegs);
- unsigned Offset = State.AllocateStack(4, 4);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
-}
-
-static bool CC_Hexagon64(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- if (unsigned Reg = State.AllocateReg(Hexagon::D0)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
-
- static const MCPhysReg RegList1[] = {
- Hexagon::D1, Hexagon::D2
- };
- static const MCPhysReg RegList2[] = {
- Hexagon::R1, Hexagon::R3
- };
- if (unsigned Reg = State.AllocateReg(RegList1, RegList2)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
+ // RegNum is an index into ArgRegs: skip a register if RegNum is odd.
+ if (RegNum != NumArgRegs && RegNum % 2 == 1)
+ State.AllocateReg(ArgRegs[RegNum]);
- unsigned Offset = State.AllocateStack(8, 8, Hexagon::D2);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+ // Always return false here, as this function only makes sure that the first
+ // unallocated register has an even register number and does not actually
+ // allocate a register for the current argument.
return false;
}
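A minimal sketch of the register-parity adjustment CC_SkipOdd performs, modeled outside the patch with a plain bitset in place of CCState's allocation state; ArgRegs mirrors the list above, while skipOddRegister and the bitset are hypothetical illustration names.

#include <array>
#include <bitset>
#include <cstddef>

constexpr std::array<const char *, 6> ArgRegs = {"R0", "R1", "R2",
                                                 "R3", "R4", "R5"};

// If the first free register has an odd index, mark it used so the next
// argument (e.g. a 64-bit value) starts on an even register pair such as
// R3:R2. Nothing is assigned to the current argument here.
void skipOddRegister(std::bitset<6> &Allocated) {
  std::size_t FirstFree = 0;
  while (FirstFree != ArgRegs.size() && Allocated[FirstFree])
    ++FirstFree;
  if (FirstFree != ArgRegs.size() && FirstFree % 2 == 1)
    Allocated.set(FirstFree);
}

Like the real hook, the sketch always "falls through" so the normal allocation rules still place the argument afterwards.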
-static bool CC_HexagonVector(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- static const MCPhysReg VecLstS[] = {
- Hexagon::V0, Hexagon::V1, Hexagon::V2, Hexagon::V3, Hexagon::V4,
- Hexagon::V5, Hexagon::V6, Hexagon::V7, Hexagon::V8, Hexagon::V9,
- Hexagon::V10, Hexagon::V11, Hexagon::V12, Hexagon::V13, Hexagon::V14,
- Hexagon::V15
- };
- static const MCPhysReg VecLstD[] = {
- Hexagon::W0, Hexagon::W1, Hexagon::W2, Hexagon::W3, Hexagon::W4,
- Hexagon::W5, Hexagon::W6, Hexagon::W7
- };
- auto &MF = State.getMachineFunction();
- auto &HST = MF.getSubtarget<HexagonSubtarget>();
-
- if (HST.useHVX64BOps() &&
- (LocVT == MVT::v16i32 || LocVT == MVT::v32i16 ||
- LocVT == MVT::v64i8 || LocVT == MVT::v512i1)) {
- if (unsigned Reg = State.AllocateReg(VecLstS)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
- unsigned Offset = State.AllocateStack(64, 64);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- if (HST.useHVX64BOps() && (LocVT == MVT::v32i32 ||
- LocVT == MVT::v64i16 || LocVT == MVT::v128i8)) {
- if (unsigned Reg = State.AllocateReg(VecLstD)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
- unsigned Offset = State.AllocateStack(128, 128);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- // 128B Mode
- if (HST.useHVX128BOps() && (LocVT == MVT::v64i32 ||
- LocVT == MVT::v128i16 || LocVT == MVT::v256i8)) {
- if (unsigned Reg = State.AllocateReg(VecLstD)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
- unsigned Offset = State.AllocateStack(256, 256);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- if (HST.useHVX128BOps() &&
- (LocVT == MVT::v32i32 || LocVT == MVT::v64i16 ||
- LocVT == MVT::v128i8 || LocVT == MVT::v1024i1)) {
- if (unsigned Reg = State.AllocateReg(VecLstS)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
- unsigned Offset = State.AllocateStack(128, 128);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- return true;
-}
-
-static bool RetCC_Hexagon(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- auto &MF = State.getMachineFunction();
- auto &HST = MF.getSubtarget<HexagonSubtarget>();
-
- if (LocVT == MVT::i1) {
- // Return values of type MVT::i1 still need to be assigned to R0, but
- // the value type needs to remain i1. LowerCallResult will deal with it,
- // but it needs to recognize i1 as the value type.
- LocVT = MVT::i32;
- } else if (LocVT == MVT::i8 || LocVT == MVT::i16) {
- LocVT = MVT::i32;
- ValVT = MVT::i32;
- if (ArgFlags.isSExt())
- LocInfo = CCValAssign::SExt;
- else if (ArgFlags.isZExt())
- LocInfo = CCValAssign::ZExt;
- else
- LocInfo = CCValAssign::AExt;
- } else if (LocVT == MVT::v4i8 || LocVT == MVT::v2i16) {
- LocVT = MVT::i32;
- LocInfo = CCValAssign::BCvt;
- } else if (LocVT == MVT::v8i8 || LocVT == MVT::v4i16 || LocVT == MVT::v2i32) {
- LocVT = MVT::i64;
- LocInfo = CCValAssign::BCvt;
- } else if (LocVT == MVT::v64i8 || LocVT == MVT::v32i16 ||
- LocVT == MVT::v16i32 || LocVT == MVT::v512i1) {
- LocVT = MVT::v16i32;
- ValVT = MVT::v16i32;
- LocInfo = CCValAssign::Full;
- } else if (LocVT == MVT::v128i8 || LocVT == MVT::v64i16 ||
- LocVT == MVT::v32i32 ||
- (LocVT == MVT::v1024i1 && HST.useHVX128BOps())) {
- LocVT = MVT::v32i32;
- ValVT = MVT::v32i32;
- LocInfo = CCValAssign::Full;
- } else if (LocVT == MVT::v256i8 || LocVT == MVT::v128i16 ||
- LocVT == MVT::v64i32) {
- LocVT = MVT::v64i32;
- ValVT = MVT::v64i32;
- LocInfo = CCValAssign::Full;
- }
- if (LocVT == MVT::i32 || LocVT == MVT::f32) {
- if (!RetCC_Hexagon32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
- return false;
- }
-
- if (LocVT == MVT::i64 || LocVT == MVT::f64) {
- if (!RetCC_Hexagon64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
- return false;
- }
- if (LocVT == MVT::v16i32 || LocVT == MVT::v32i32 || LocVT == MVT::v64i32) {
- if (!RetCC_HexagonVector(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
- return false;
- }
- return true; // CC didn't match.
-}
-
-static bool RetCC_Hexagon32(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- if (LocVT == MVT::i32 || LocVT == MVT::f32) {
- // Note that use of registers beyond R1 is not ABI compliant. However there
- // are (experimental) IR passes which generate internal functions that
- // return structs using these additional registers.
- static const uint16_t RegList[] = { Hexagon::R0, Hexagon::R1,
- Hexagon::R2, Hexagon::R3,
- Hexagon::R4, Hexagon::R5 };
- if (unsigned Reg = State.AllocateReg(RegList)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
- }
-
- return true;
-}
-
-static bool RetCC_Hexagon64(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- if (LocVT == MVT::i64 || LocVT == MVT::f64) {
- if (unsigned Reg = State.AllocateReg(Hexagon::D0)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
- }
-
- return true;
-}
+#include "HexagonGenCallingConv.inc"
-static bool RetCC_HexagonVector(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- auto &MF = State.getMachineFunction();
- auto &HST = MF.getSubtarget<HexagonSubtarget>();
-
- if (LocVT == MVT::v16i32) {
- if (unsigned Reg = State.AllocateReg(Hexagon::V0)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
- } else if (LocVT == MVT::v32i32) {
- unsigned Req = HST.useHVX128BOps() ? Hexagon::V0 : Hexagon::W0;
- if (unsigned Reg = State.AllocateReg(Req)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
- } else if (LocVT == MVT::v64i32) {
- if (unsigned Reg = State.AllocateReg(Hexagon::W0)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
- }
-
- return true;
-}
void HexagonTargetLowering::promoteLdStType(MVT VT, MVT PromotedLdStVT) {
if (VT != PromotedLdStVT) {
@@ -558,11 +184,14 @@ static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
bool
HexagonTargetLowering::CanLowerReturn(
- CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
+ CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const {
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
+ CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
+
+ if (MF.getSubtarget<HexagonSubtarget>().useHVXOps())
+ return CCInfo.CheckReturn(Outs, RetCC_Hexagon_HVX);
return CCInfo.CheckReturn(Outs, RetCC_Hexagon);
}
@@ -571,7 +200,7 @@ HexagonTargetLowering::CanLowerReturn(
// the value is stored in memory pointed by a pointer passed by caller.
SDValue
HexagonTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
- bool isVarArg,
+ bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &dl, SelectionDAG &DAG) const {
@@ -579,11 +208,14 @@ HexagonTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SmallVector<CCValAssign, 16> RVLocs;
// CCState - Info about the registers and stack slot.
- CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
// Analyze return values of ISD::RET
- CCInfo.AnalyzeReturn(Outs, RetCC_Hexagon);
+ if (Subtarget.useHVXOps())
+ CCInfo.AnalyzeReturn(Outs, RetCC_Hexagon_HVX);
+ else
+ CCInfo.AnalyzeReturn(Outs, RetCC_Hexagon);
SDValue Flag;
SmallVector<SDValue, 4> RetOps(1, Chain);
@@ -624,17 +256,20 @@ bool HexagonTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
/// being lowered. Returns a SDNode with the same number of values as the
/// ISD::CALL.
SDValue HexagonTargetLowering::LowerCallResult(
- SDValue Chain, SDValue Glue, CallingConv::ID CallConv, bool isVarArg,
+ SDValue Chain, SDValue Glue, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
const SmallVectorImpl<SDValue> &OutVals, SDValue Callee) const {
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
- CCInfo.AnalyzeCallResult(Ins, RetCC_Hexagon);
+ if (Subtarget.useHVXOps())
+ CCInfo.AnalyzeCallResult(Ins, RetCC_Hexagon_HVX);
+ else
+ CCInfo.AnalyzeCallResult(Ins, RetCC_Hexagon);
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
@@ -683,67 +318,57 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
- bool &IsTailCall = CLI.IsTailCall;
CallingConv::ID CallConv = CLI.CallConv;
bool IsVarArg = CLI.IsVarArg;
bool DoesNotReturn = CLI.DoesNotReturn;
- bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
+ bool IsStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet();
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
auto PtrVT = getPointerTy(MF.getDataLayout());
- // Check for varargs.
- unsigned NumNamedVarArgParams = -1U;
- if (GlobalAddressSDNode *GAN = dyn_cast<GlobalAddressSDNode>(Callee)) {
- const GlobalValue *GV = GAN->getGlobal();
- Callee = DAG.getTargetGlobalAddress(GV, dl, MVT::i32);
- if (const Function* F = dyn_cast<Function>(GV)) {
- // If a function has zero args and is a vararg function, that's
- // disallowed so it must be an undeclared function. Do not assume
- // varargs if the callee is undefined.
- if (F->isVarArg() && F->getFunctionType()->getNumParams() != 0)
- NumNamedVarArgParams = F->getFunctionType()->getNumParams();
- }
- }
+ unsigned NumParams = CLI.CS.getInstruction()
+ ? CLI.CS.getFunctionType()->getNumParams()
+ : 0;
+ if (GlobalAddressSDNode *GAN = dyn_cast<GlobalAddressSDNode>(Callee))
+ Callee = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, MVT::i32);
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
- HexagonCCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
- *DAG.getContext(), NumNamedVarArgParams);
+ HexagonCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext(),
+ NumParams);
- if (IsVarArg)
- CCInfo.AnalyzeCallOperands(Outs, CC_Hexagon_VarArg);
+ if (Subtarget.useHVXOps())
+ CCInfo.AnalyzeCallOperands(Outs, CC_Hexagon_HVX);
else
CCInfo.AnalyzeCallOperands(Outs, CC_Hexagon);
auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
if (Attr.getValueAsString() == "true")
- IsTailCall = false;
+ CLI.IsTailCall = false;
- if (IsTailCall) {
+ if (CLI.IsTailCall) {
bool StructAttrFlag = MF.getFunction().hasStructRetAttr();
- IsTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
- IsVarArg, IsStructRet,
- StructAttrFlag,
- Outs, OutVals, Ins, DAG);
+ CLI.IsTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
+ IsVarArg, IsStructRet, StructAttrFlag, Outs,
+ OutVals, Ins, DAG);
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
if (VA.isMemLoc()) {
- IsTailCall = false;
+ CLI.IsTailCall = false;
break;
}
}
- DEBUG(dbgs() << (IsTailCall ? "Eligible for Tail Call\n"
- : "Argument must be passed on stack. "
- "Not eligible for Tail Call\n"));
+ LLVM_DEBUG(dbgs() << (CLI.IsTailCall ? "Eligible for Tail Call\n"
+ : "Argument must be passed on stack. "
+ "Not eligible for Tail Call\n"));
}
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
SmallVector<std::pair<unsigned, SDValue>, 16> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
- auto &HRI = *Subtarget.getRegisterInfo();
+ const HexagonRegisterInfo &HRI = *Subtarget.getRegisterInfo();
SDValue StackPtr =
DAG.getCopyFromReg(Chain, dl, HRI.getStackRegister(), PtrVT);
@@ -789,7 +414,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
VA.getLocVT().getStoreSizeInBits() >> 3);
if (Flags.isByVal()) {
// The argument is a struct passed by value. According to LLVM, "Arg"
- // is is pointer.
+ // is a pointer.
MemOpChains.push_back(CreateCopyOfByValArgument(Arg, MemAddr, Chain,
Flags, DAG, dl));
} else {
@@ -807,14 +432,10 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
}
- if (NeedsArgAlign && Subtarget.hasV60TOps()) {
- DEBUG(dbgs() << "Function needs byte stack align due to call args\n");
- // V6 vectors passed by value have 64 or 128 byte alignment depending
- // on whether we are 64 byte vector mode or 128 byte.
- bool UseHVX128B = Subtarget.useHVX128BOps();
- assert(Subtarget.useHVXOps());
- const unsigned ObjAlign = UseHVX128B ? 128 : 64;
- LargestAlignSeen = std::max(LargestAlignSeen, ObjAlign);
+ if (NeedsArgAlign && Subtarget.hasV60Ops()) {
+ LLVM_DEBUG(dbgs() << "Function needs byte stack align due to call args\n");
+ unsigned VecAlign = HRI.getSpillAlignment(Hexagon::HvxVRRegClass);
+ LargestAlignSeen = std::max(LargestAlignSeen, VecAlign);
MFI.ensureMaxAlignment(LargestAlignSeen);
}
// Transform all store nodes into one single node because all store
@@ -823,7 +444,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
SDValue Glue;
- if (!IsTailCall) {
+ if (!CLI.IsTailCall) {
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
Glue = Chain.getValue(1);
}
@@ -832,7 +453,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// chain and flag operands which copy the outgoing args into registers.
// The Glue is necessary since all emitted instructions must be
// stuck together.
- if (!IsTailCall) {
+ if (!CLI.IsTailCall) {
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
RegsToPass[i].second, Glue);
@@ -891,7 +512,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (Glue.getNode())
Ops.push_back(Glue);
- if (IsTailCall) {
+ if (CLI.IsTailCall) {
MFI.setHasTailCall();
return DAG.getNode(HexagonISD::TC_RETURN, dl, NodeTys, Ops);
}
@@ -916,66 +537,36 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
InVals, OutVals, Callee);
}
-static bool getIndexedAddressParts(SDNode *Ptr, EVT VT,
- SDValue &Base, SDValue &Offset,
- bool &IsInc, SelectionDAG &DAG) {
- if (Ptr->getOpcode() != ISD::ADD)
- return false;
-
- auto &HST = static_cast<const HexagonSubtarget&>(DAG.getSubtarget());
-
- bool ValidHVX128BType =
- HST.useHVX128BOps() && (VT == MVT::v32i32 ||
- VT == MVT::v64i16 || VT == MVT::v128i8);
- bool ValidHVXType =
- HST.useHVX64BOps() && (VT == MVT::v16i32 ||
- VT == MVT::v32i16 || VT == MVT::v64i8);
-
- if (ValidHVX128BType || ValidHVXType || VT == MVT::i64 || VT == MVT::i32 ||
- VT == MVT::i16 || VT == MVT::i8) {
- IsInc = (Ptr->getOpcode() == ISD::ADD);
- Base = Ptr->getOperand(0);
- Offset = Ptr->getOperand(1);
- // Ensure that Offset is a constant.
- return isa<ConstantSDNode>(Offset);
- }
-
- return false;
-}
-
-/// getPostIndexedAddressParts - returns true by value, base pointer and
-/// offset pointer and addressing mode by reference if this node can be
-/// combined with a load / store to form a post-indexed load / store.
+/// Returns true by value, base pointer and offset pointer and addressing
+/// mode by reference if this node can be combined with a load / store to
+/// form a post-indexed load / store.
bool HexagonTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
- SDValue &Base,
- SDValue &Offset,
- ISD::MemIndexedMode &AM,
- SelectionDAG &DAG) const
-{
- EVT VT;
-
- if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
- VT = LD->getMemoryVT();
- } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
- VT = ST->getMemoryVT();
- if (ST->getValue().getValueType() == MVT::i64 && ST->isTruncatingStore())
- return false;
- } else {
+ SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const {
+ LSBaseSDNode *LSN = dyn_cast<LSBaseSDNode>(N);
+ if (!LSN)
+ return false;
+ EVT VT = LSN->getMemoryVT();
+ if (!VT.isSimple())
+ return false;
+ bool IsLegalType = VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 ||
+ VT == MVT::i64 || VT == MVT::f32 || VT == MVT::f64 ||
+ VT == MVT::v2i16 || VT == MVT::v2i32 || VT == MVT::v4i8 ||
+ VT == MVT::v4i16 || VT == MVT::v8i8 ||
+ Subtarget.isHVXVectorType(VT.getSimpleVT());
+ if (!IsLegalType)
return false;
- }
- bool IsInc = false;
- bool isLegal = getIndexedAddressParts(Op, VT, Base, Offset, IsInc, DAG);
- if (isLegal) {
- auto &HII = *Subtarget.getInstrInfo();
- int32_t OffsetVal = cast<ConstantSDNode>(Offset.getNode())->getSExtValue();
- if (HII.isValidAutoIncImm(VT, OffsetVal)) {
- AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
- return true;
- }
- }
+ if (Op->getOpcode() != ISD::ADD)
+ return false;
+ Base = Op->getOperand(0);
+ Offset = Op->getOperand(1);
+ if (!isa<ConstantSDNode>(Offset.getNode()))
+ return false;
+ AM = ISD::POST_INC;
- return false;
+ int32_t V = cast<ConstantSDNode>(Offset.getNode())->getSExtValue();
+ return Subtarget.getInstrInfo()->isValidAutoIncImm(VT, V);
}
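The rewritten hook above reduces to: the pointer must be (ADD base, constant) and the constant must be a valid auto-increment immediate for the access type. A rough standalone model of that shape (outside the patch): AddExpr, postIncOffset, and the 4-bit signed scaled range are hypothetical stand-ins; the real bound comes from isValidAutoIncImm and varies with the access size.

#include <cstdint>
#include <optional>

// Stand-in for a pointer of the form (ADD Base, Offset) with a constant Offset.
struct AddExpr {
  std::int64_t Base;
  std::int64_t Offset;
};

// Accept the offset only if it is a multiple of the access size and the
// scaled value fits a small signed field (assumed 4-bit here).
std::optional<std::int64_t> postIncOffset(const AddExpr &Ptr,
                                          unsigned AccessBytes) {
  if (AccessBytes == 0 || Ptr.Offset % AccessBytes != 0)
    return std::nullopt;
  std::int64_t Scaled = Ptr.Offset / AccessBytes;
  if (Scaled < -8 || Scaled > 7)
    return std::nullopt;
  return Ptr.Offset;
}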
SDValue
@@ -1080,7 +671,7 @@ HexagonTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
if (A == 0)
A = HFI.getStackAlignment();
- DEBUG({
+ LLVM_DEBUG({
dbgs () << __func__ << " Align: " << A << " Size: ";
Size.getNode()->dump(&DAG);
dbgs() << "\n";
@@ -1095,20 +686,22 @@ HexagonTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
}
SDValue HexagonTargetLowering::LowerFormalArguments(
- SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
- MachineRegisterInfo &RegInfo = MF.getRegInfo();
- auto &FuncInfo = *MF.getInfo<HexagonMachineFunctionInfo>();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
- *DAG.getContext());
+ HexagonCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext(),
+ MF.getFunction().getFunctionType()->getNumParams());
- CCInfo.AnalyzeFormalArguments(Ins, CC_Hexagon);
+ if (Subtarget.useHVXOps())
+ CCInfo.AnalyzeFormalArguments(Ins, CC_Hexagon_HVX);
+ else
+ CCInfo.AnalyzeFormalArguments(Ins, CC_Hexagon);
// For LLVM, in the case when returning a struct by value (>8byte),
// the first argument is a pointer that points to the location on caller's
@@ -1117,110 +710,62 @@ SDValue HexagonTargetLowering::LowerFormalArguments(
// equal to) 8 bytes. If not, no address will be passed into callee and
 // callee returns the result directly through R0/R1.
- SmallVector<SDValue, 8> MemOps;
+ auto &HMFI = *MF.getInfo<HexagonMachineFunctionInfo>();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
ISD::ArgFlagsTy Flags = Ins[i].Flags;
- unsigned ObjSize;
- unsigned StackLocation;
- int FI;
-
- if ( (VA.isRegLoc() && !Flags.isByVal())
- || (VA.isRegLoc() && Flags.isByVal() && Flags.getByValSize() > 8)) {
- // Arguments passed in registers
- // 1. int, long long, ptr args that get allocated in register.
- // 2. Large struct that gets an register to put its address in.
- EVT RegVT = VA.getLocVT();
- if (RegVT == MVT::i8 || RegVT == MVT::i16 ||
- RegVT == MVT::i32 || RegVT == MVT::f32) {
- unsigned VReg =
- RegInfo.createVirtualRegister(&Hexagon::IntRegsRegClass);
- RegInfo.addLiveIn(VA.getLocReg(), VReg);
- if (VA.getLocInfo() == CCValAssign::BCvt)
- RegVT = VA.getValVT();
- SDValue Copy = DAG.getCopyFromReg(Chain, dl, VReg, RegVT);
- // Treat values of type MVT::i1 specially: they are passed in
- // registers of type i32, but they need to remain as values of
- // type i1 for consistency of the argument lowering.
- if (VA.getValVT() == MVT::i1) {
- // Generate a copy into a predicate register and use the value
- // of the register as the "InVal".
- unsigned PReg =
- RegInfo.createVirtualRegister(&Hexagon::PredRegsRegClass);
- SDNode *T = DAG.getMachineNode(Hexagon::C2_tfrrp, dl, MVT::i1,
- Copy.getValue(0));
- Copy = DAG.getCopyToReg(Copy.getValue(1), dl, PReg, SDValue(T, 0));
- Copy = DAG.getCopyFromReg(Copy, dl, PReg, MVT::i1);
- }
- InVals.push_back(Copy);
- Chain = Copy.getValue(1);
- } else if (RegVT == MVT::i64 || RegVT == MVT::f64) {
- unsigned VReg =
- RegInfo.createVirtualRegister(&Hexagon::DoubleRegsRegClass);
- RegInfo.addLiveIn(VA.getLocReg(), VReg);
- if (VA.getLocInfo() == CCValAssign::BCvt)
- RegVT = VA.getValVT();
- InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
-
- // Single Vector
- } else if ((RegVT == MVT::v16i32 ||
- RegVT == MVT::v32i16 || RegVT == MVT::v64i8)) {
- unsigned VReg =
- RegInfo.createVirtualRegister(&Hexagon::HvxVRRegClass);
- RegInfo.addLiveIn(VA.getLocReg(), VReg);
- InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
- } else if (Subtarget.useHVX128BOps() &&
- ((RegVT == MVT::v32i32 ||
- RegVT == MVT::v64i16 || RegVT == MVT::v128i8))) {
- unsigned VReg =
- RegInfo.createVirtualRegister(&Hexagon::HvxVRRegClass);
- RegInfo.addLiveIn(VA.getLocReg(), VReg);
- InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
-
- // Double Vector
- } else if ((RegVT == MVT::v32i32 ||
- RegVT == MVT::v64i16 || RegVT == MVT::v128i8)) {
- unsigned VReg =
- RegInfo.createVirtualRegister(&Hexagon::HvxWRRegClass);
- RegInfo.addLiveIn(VA.getLocReg(), VReg);
- InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
- } else if (Subtarget.useHVX128BOps() &&
- ((RegVT == MVT::v64i32 ||
- RegVT == MVT::v128i16 || RegVT == MVT::v256i8))) {
- unsigned VReg =
- RegInfo.createVirtualRegister(&Hexagon::HvxWRRegClass);
- RegInfo.addLiveIn(VA.getLocReg(), VReg);
- InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
- } else if (RegVT == MVT::v512i1 || RegVT == MVT::v1024i1) {
- assert(0 && "need to support VecPred regs");
- unsigned VReg =
- RegInfo.createVirtualRegister(&Hexagon::HvxQRRegClass);
- RegInfo.addLiveIn(VA.getLocReg(), VReg);
- InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+ bool ByVal = Flags.isByVal();
+
+ // Arguments passed in registers:
+ // 1. 32- and 64-bit values and HVX vectors are passed directly,
+ // 2. Large structs are passed via an address, and the address is
+ // passed in a register.
+ if (VA.isRegLoc() && ByVal && Flags.getByValSize() <= 8)
+ llvm_unreachable("ByValSize must be bigger than 8 bytes");
+
+ bool InReg = VA.isRegLoc() &&
+ (!ByVal || (ByVal && Flags.getByValSize() > 8));
+
+ if (InReg) {
+ MVT RegVT = VA.getLocVT();
+ if (VA.getLocInfo() == CCValAssign::BCvt)
+ RegVT = VA.getValVT();
+
+ const TargetRegisterClass *RC = getRegClassFor(RegVT);
+ unsigned VReg = MRI.createVirtualRegister(RC);
+ SDValue Copy = DAG.getCopyFromReg(Chain, dl, VReg, RegVT);
+
+ // Treat values of type MVT::i1 specially: they are passed in
+ // registers of type i32, but they need to remain as values of
+ // type i1 for consistency of the argument lowering.
+ if (VA.getValVT() == MVT::i1) {
+ assert(RegVT.getSizeInBits() <= 32);
+ SDValue T = DAG.getNode(ISD::AND, dl, RegVT,
+ Copy, DAG.getConstant(1, dl, RegVT));
+ Copy = DAG.getSetCC(dl, MVT::i1, T, DAG.getConstant(0, dl, RegVT),
+ ISD::SETNE);
} else {
- assert (0);
+#ifndef NDEBUG
+ unsigned RegSize = RegVT.getSizeInBits();
+ assert(RegSize == 32 || RegSize == 64 ||
+ Subtarget.isHVXVectorType(RegVT));
+#endif
}
- } else if (VA.isRegLoc() && Flags.isByVal() && Flags.getByValSize() <= 8) {
- assert (0 && "ByValSize must be bigger than 8 bytes");
+ InVals.push_back(Copy);
+ MRI.addLiveIn(VA.getLocReg(), VReg);
} else {
- // Sanity check.
- assert(VA.isMemLoc());
-
- if (Flags.isByVal()) {
- // If it's a byval parameter, then we need to compute the
- // "real" size, not the size of the pointer.
- ObjSize = Flags.getByValSize();
- } else {
- ObjSize = VA.getLocVT().getStoreSizeInBits() >> 3;
- }
+ assert(VA.isMemLoc() && "Argument should be passed in memory");
- StackLocation = HEXAGON_LRFP_SIZE + VA.getLocMemOffset();
- // Create the frame index object for this incoming parameter...
- FI = MFI.CreateFixedObject(ObjSize, StackLocation, true);
+ // If it's a byval parameter, then we need to compute the
+ // "real" size, not the size of the pointer.
+ unsigned ObjSize = Flags.isByVal()
+ ? Flags.getByValSize()
+ : VA.getLocVT().getStoreSizeInBits() / 8;
- // Create the SelectionDAG nodes cordl, responding to a load
- // from this parameter.
+ // Create the frame index object for this incoming parameter.
+ int Offset = HEXAGON_LRFP_SIZE + VA.getLocMemOffset();
+ int FI = MFI.CreateFixedObject(ObjSize, Offset, true);
SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
if (Flags.isByVal()) {
@@ -1229,22 +774,19 @@ SDValue HexagonTargetLowering::LowerFormalArguments(
// location.
InVals.push_back(FIN);
} else {
- InVals.push_back(
- DAG.getLoad(VA.getValVT(), dl, Chain, FIN, MachinePointerInfo()));
+ SDValue L = DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
+ MachinePointerInfo::getFixedStack(MF, FI, 0));
+ InVals.push_back(L);
}
}
}
- if (!MemOps.empty())
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
- if (isVarArg) {
+ if (IsVarArg) {
// This will point to the next argument passed via stack.
- int FrameIndex = MFI.CreateFixedObject(Hexagon_PointerSize,
- HEXAGON_LRFP_SIZE +
- CCInfo.getNextStackOffset(),
- true);
- FuncInfo.setVarArgsFrameIndex(FrameIndex);
+ int Offset = HEXAGON_LRFP_SIZE + CCInfo.getNextStackOffset();
+ int FI = MFI.CreateFixedObject(Hexagon_PointerSize, Offset, true);
+ HMFI.setVarArgsFrameIndex(FI);
}
return Chain;
@@ -1262,66 +804,62 @@ HexagonTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachinePointerInfo(SV));
}
-static bool isSExtFree(SDValue N) {
- // A sign-extend of a truncate of a sign-extend is free.
- if (N.getOpcode() == ISD::TRUNCATE &&
- N.getOperand(0).getOpcode() == ISD::AssertSext)
- return true;
- // We have sign-extended loads.
- if (N.getOpcode() == ISD::LOAD)
- return true;
- return false;
-}
-
SDValue HexagonTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
- SDLoc dl(Op);
-
+ const SDLoc &dl(Op);
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
- if (Subtarget.useHVXOps() && Subtarget.isHVXVectorType(ty(LHS)))
- return LowerHvxSetCC(Op, DAG);
-
- SDValue Cmp = Op.getOperand(2);
- ISD::CondCode CC = cast<CondCodeSDNode>(Cmp)->get();
-
- EVT VT = Op.getValueType();
- EVT LHSVT = LHS.getValueType();
- EVT RHSVT = RHS.getValueType();
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ MVT ResTy = ty(Op);
+ MVT OpTy = ty(LHS);
- if (LHSVT == MVT::v2i16) {
- assert(ISD::isSignedIntSetCC(CC) || ISD::isUnsignedIntSetCC(CC));
- unsigned ExtOpc = ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND
- : ISD::ZERO_EXTEND;
- SDValue LX = DAG.getNode(ExtOpc, dl, MVT::v2i32, LHS);
- SDValue RX = DAG.getNode(ExtOpc, dl, MVT::v2i32, RHS);
- SDValue SC = DAG.getNode(ISD::SETCC, dl, MVT::v2i1, LX, RX, Cmp);
- return SC;
+ if (OpTy == MVT::v2i16 || OpTy == MVT::v4i8) {
+ MVT ElemTy = OpTy.getVectorElementType();
+ assert(ElemTy.isScalarInteger());
+ MVT WideTy = MVT::getVectorVT(MVT::getIntegerVT(2*ElemTy.getSizeInBits()),
+ OpTy.getVectorNumElements());
+ return DAG.getSetCC(dl, ResTy,
+ DAG.getSExtOrTrunc(LHS, SDLoc(LHS), WideTy),
+ DAG.getSExtOrTrunc(RHS, SDLoc(RHS), WideTy), CC);
}
// Treat all other vector types as legal.
- if (VT.isVector())
+ if (ResTy.isVector())
return Op;
- // Equals and not equals should use sign-extend, not zero-extend, since
- // we can represent small negative values in the compare instructions.
+ // Comparisons of short integers should use sign-extend, not zero-extend,
+ // since we can represent small negative values in the compare instructions.
// The LLVM default is to use zero-extend arbitrarily in these cases.
- if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
- (RHSVT == MVT::i8 || RHSVT == MVT::i16) &&
- (LHSVT == MVT::i8 || LHSVT == MVT::i16)) {
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS);
- if (C && C->getAPIntValue().isNegative()) {
- LHS = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, LHS);
- RHS = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, RHS);
- return DAG.getNode(ISD::SETCC, dl, Op.getValueType(),
- LHS, RHS, Op.getOperand(2));
- }
- if (isSExtFree(LHS) || isSExtFree(RHS)) {
- LHS = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, LHS);
- RHS = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, RHS);
- return DAG.getNode(ISD::SETCC, dl, Op.getValueType(),
- LHS, RHS, Op.getOperand(2));
+ auto isSExtFree = [this](SDValue N) {
+ switch (N.getOpcode()) {
+ case ISD::TRUNCATE: {
+ // A sign-extend of a truncate of a sign-extend is free.
+ SDValue Op = N.getOperand(0);
+ if (Op.getOpcode() != ISD::AssertSext)
+ return false;
+ EVT OrigTy = cast<VTSDNode>(Op.getOperand(1))->getVT();
+ unsigned ThisBW = ty(N).getSizeInBits();
+ unsigned OrigBW = OrigTy.getSizeInBits();
+ // The type that was sign-extended to get the AssertSext must be
+ // narrower than the type of N (so that N still has the same value
+ // as the original).
+ return ThisBW >= OrigBW;
+ }
+ case ISD::LOAD:
+ // We have sign-extended loads.
+ return true;
}
+ return false;
+ };
+
+ if (OpTy == MVT::i8 || OpTy == MVT::i16) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS);
+ bool IsNegative = C && C->getAPIntValue().isNegative();
+ if (IsNegative || isSExtFree(LHS) || isSExtFree(RHS))
+ return DAG.getSetCC(dl, ResTy,
+ DAG.getSExtOrTrunc(LHS, SDLoc(LHS), MVT::i32),
+ DAG.getSExtOrTrunc(RHS, SDLoc(RHS), MVT::i32), CC);
}
+
return SDValue();
}
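A small standalone check (outside the patch) of why the lowering above widens i8/i16 compare operands with sign-extension rather than zero-extension: zero-extending changes the value of a small negative operand, sign-extending does not. Plain C++; the literal values are only an example.

#include <cassert>
#include <cstdint>

int main() {
  std::int8_t A = -1;                 // an i8 operand holding a small negative
  // Zero-extending to 32 bits turns -1 into 255, so a compare against -1 breaks.
  assert(static_cast<std::uint32_t>(static_cast<std::uint8_t>(A)) == 255u);
  // Sign-extending preserves the value, so the widened compare still matches.
  assert(static_cast<std::int32_t>(A) == -1);
  return 0;
}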
@@ -1393,8 +931,7 @@ HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
else if (isVTi1Type)
T = DAG.getTargetConstantPool(CVal, ValTy, Align, Offset, TF);
else
- T = DAG.getTargetConstantPool(CPN->getConstVal(), ValTy, Align, Offset,
- TF);
+ T = DAG.getTargetConstantPool(CPN->getConstVal(), ValTy, Align, Offset, TF);
assert(cast<ConstantPoolSDNode>(T)->getTargetFlags() == TF &&
"Inconsistent target flag encountered");
@@ -1480,7 +1017,7 @@ HexagonTargetLowering::LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const {
if (RM == Reloc::Static) {
SDValue GA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
const GlobalObject *GO = GV->getBaseObject();
- if (GO && HLOF.isGlobalInSmallSection(GO, HTM))
+ if (GO && Subtarget.useSmallData() && HLOF.isGlobalInSmallSection(GO, HTM))
return DAG.getNode(HexagonISD::CONST32_GP, dl, PtrVT, GA);
return DAG.getNode(HexagonISD::CONST32, dl, PtrVT, GA);
}
@@ -1688,13 +1225,15 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
const HexagonSubtarget &ST)
: TargetLowering(TM), HTM(static_cast<const HexagonTargetMachine&>(TM)),
Subtarget(ST) {
- bool IsV4 = !Subtarget.hasV5TOps();
+ bool IsV4 = !Subtarget.hasV5Ops();
auto &HRI = *Subtarget.getRegisterInfo();
setPrefLoopAlignment(4);
setPrefFunctionAlignment(4);
setMinFunctionAlignment(2);
setStackPointerRegisterToSaveRestore(HRI.getStackRegister());
+ setBooleanContents(TargetLoweringBase::UndefinedBooleanContent);
+ setBooleanVectorContents(TargetLoweringBase::UndefinedBooleanContent);
setMaxAtomicSizeInBitsSupported(64);
setMinCmpXchgSizeInBits(32);
@@ -1728,45 +1267,11 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v4i16, &Hexagon::DoubleRegsRegClass);
addRegisterClass(MVT::v2i32, &Hexagon::DoubleRegsRegClass);
- if (Subtarget.hasV5TOps()) {
+ if (Subtarget.hasV5Ops()) {
addRegisterClass(MVT::f32, &Hexagon::IntRegsRegClass);
addRegisterClass(MVT::f64, &Hexagon::DoubleRegsRegClass);
}
- if (Subtarget.hasV60TOps()) {
- if (Subtarget.useHVX64BOps()) {
- addRegisterClass(MVT::v64i8, &Hexagon::HvxVRRegClass);
- addRegisterClass(MVT::v32i16, &Hexagon::HvxVRRegClass);
- addRegisterClass(MVT::v16i32, &Hexagon::HvxVRRegClass);
- addRegisterClass(MVT::v128i8, &Hexagon::HvxWRRegClass);
- addRegisterClass(MVT::v64i16, &Hexagon::HvxWRRegClass);
- addRegisterClass(MVT::v32i32, &Hexagon::HvxWRRegClass);
- // These "short" boolean vector types should be legal because
- // they will appear as results of vector compares. If they were
- // not legal, type legalization would try to make them legal
- // and that would require using operations that do not use or
- // produce such types. That, in turn, would imply using custom
- // nodes, which would be unoptimizable by the DAG combiner.
- // The idea is to rely on target-independent operations as much
- // as possible.
- addRegisterClass(MVT::v16i1, &Hexagon::HvxQRRegClass);
- addRegisterClass(MVT::v32i1, &Hexagon::HvxQRRegClass);
- addRegisterClass(MVT::v64i1, &Hexagon::HvxQRRegClass);
- addRegisterClass(MVT::v512i1, &Hexagon::HvxQRRegClass);
- } else if (Subtarget.useHVX128BOps()) {
- addRegisterClass(MVT::v128i8, &Hexagon::HvxVRRegClass);
- addRegisterClass(MVT::v64i16, &Hexagon::HvxVRRegClass);
- addRegisterClass(MVT::v32i32, &Hexagon::HvxVRRegClass);
- addRegisterClass(MVT::v256i8, &Hexagon::HvxWRRegClass);
- addRegisterClass(MVT::v128i16, &Hexagon::HvxWRRegClass);
- addRegisterClass(MVT::v64i32, &Hexagon::HvxWRRegClass);
- addRegisterClass(MVT::v32i1, &Hexagon::HvxQRRegClass);
- addRegisterClass(MVT::v64i1, &Hexagon::HvxQRRegClass);
- addRegisterClass(MVT::v128i1, &Hexagon::HvxQRRegClass);
- addRegisterClass(MVT::v1024i1, &Hexagon::HvxQRRegClass);
- }
- }
-
//
// Handling of scalar operations.
//
@@ -1801,13 +1306,16 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
// Hexagon needs to optimize cases with negative constants.
- setOperationAction(ISD::SETCC, MVT::i8, Custom);
- setOperationAction(ISD::SETCC, MVT::i16, Custom);
+ setOperationAction(ISD::SETCC, MVT::i8, Custom);
+ setOperationAction(ISD::SETCC, MVT::i16, Custom);
+ setOperationAction(ISD::SETCC, MVT::v4i8, Custom);
+ setOperationAction(ISD::SETCC, MVT::v2i16, Custom);
// VASTART needs to be custom lowered to use the VarArgsFrameIndex.
setOperationAction(ISD::VASTART, MVT::Other, Custom);
setOperationAction(ISD::VAEND, MVT::Other, Expand);
setOperationAction(ISD::VAARG, MVT::Other, Expand);
+ setOperationAction(ISD::VACOPY, MVT::Other, Expand);
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
@@ -1819,35 +1327,21 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setMinimumJumpTableEntries(std::numeric_limits<int>::max());
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
- // Hexagon has instructions for add/sub with carry. The problem with
- // modeling these instructions is that they produce 2 results: Rdd and Px.
- // To model the update of Px, we will have to use Defs[p0..p3] which will
- // cause any predicate live range to spill. So, we pretend we dont't have
- // these instructions.
- setOperationAction(ISD::ADDE, MVT::i8, Expand);
- setOperationAction(ISD::ADDE, MVT::i16, Expand);
- setOperationAction(ISD::ADDE, MVT::i32, Expand);
- setOperationAction(ISD::ADDE, MVT::i64, Expand);
- setOperationAction(ISD::SUBE, MVT::i8, Expand);
- setOperationAction(ISD::SUBE, MVT::i16, Expand);
- setOperationAction(ISD::SUBE, MVT::i32, Expand);
- setOperationAction(ISD::SUBE, MVT::i64, Expand);
- setOperationAction(ISD::ADDC, MVT::i8, Expand);
- setOperationAction(ISD::ADDC, MVT::i16, Expand);
- setOperationAction(ISD::ADDC, MVT::i32, Expand);
- setOperationAction(ISD::ADDC, MVT::i64, Expand);
- setOperationAction(ISD::SUBC, MVT::i8, Expand);
- setOperationAction(ISD::SUBC, MVT::i16, Expand);
- setOperationAction(ISD::SUBC, MVT::i32, Expand);
- setOperationAction(ISD::SUBC, MVT::i64, Expand);
-
- // Only add and sub that detect overflow are the saturating ones.
+ setOperationAction(ISD::ABS, MVT::i32, Legal);
+ setOperationAction(ISD::ABS, MVT::i64, Legal);
+
+ // Hexagon has A4_addp_c and A4_subp_c that take and generate a carry bit,
+ // but they only operate on i64.
for (MVT VT : MVT::integer_valuetypes()) {
- setOperationAction(ISD::UADDO, VT, Expand);
- setOperationAction(ISD::SADDO, VT, Expand);
- setOperationAction(ISD::USUBO, VT, Expand);
- setOperationAction(ISD::SSUBO, VT, Expand);
+ setOperationAction(ISD::UADDO, VT, Expand);
+ setOperationAction(ISD::USUBO, VT, Expand);
+ setOperationAction(ISD::SADDO, VT, Expand);
+ setOperationAction(ISD::SSUBO, VT, Expand);
+ setOperationAction(ISD::ADDCARRY, VT, Expand);
+ setOperationAction(ISD::SUBCARRY, VT, Expand);
}
+ setOperationAction(ISD::ADDCARRY, MVT::i64, Custom);
+ setOperationAction(ISD::SUBCARRY, MVT::i64, Custom);
setOperationAction(ISD::CTLZ, MVT::i8, Promote);
setOperationAction(ISD::CTLZ, MVT::i16, Promote);
@@ -1865,22 +1359,21 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
setOperationAction(ISD::BSWAP, MVT::i32, Legal);
setOperationAction(ISD::BSWAP, MVT::i64, Legal);
- setOperationAction(ISD::MUL, MVT::i64, Legal);
for (unsigned IntExpOp :
- { ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM,
- ISD::SDIVREM, ISD::UDIVREM, ISD::ROTL, ISD::ROTR,
- ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS,
- ISD::SMUL_LOHI, ISD::UMUL_LOHI }) {
- setOperationAction(IntExpOp, MVT::i32, Expand);
- setOperationAction(IntExpOp, MVT::i64, Expand);
+ {ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM,
+ ISD::SDIVREM, ISD::UDIVREM, ISD::ROTL, ISD::ROTR,
+ ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS,
+ ISD::SMUL_LOHI, ISD::UMUL_LOHI}) {
+ for (MVT VT : MVT::integer_valuetypes())
+ setOperationAction(IntExpOp, VT, Expand);
}
for (unsigned FPExpOp :
{ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FSINCOS,
ISD::FPOW, ISD::FCOPYSIGN}) {
- setOperationAction(FPExpOp, MVT::f32, Expand);
- setOperationAction(FPExpOp, MVT::f64, Expand);
+ for (MVT VT : MVT::fp_valuetypes())
+ setOperationAction(FPExpOp, VT, Expand);
}
// No extending loads from i32.
@@ -1920,10 +1413,9 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
// either "custom" or "legal" for specific cases.
static const unsigned VectExpOps[] = {
// Integer arithmetic:
- ISD::ADD, ISD::SUB, ISD::MUL, ISD::SDIV, ISD::UDIV,
- ISD::SREM, ISD::UREM, ISD::SDIVREM, ISD::UDIVREM, ISD::ADDC,
- ISD::SUBC, ISD::SADDO, ISD::UADDO, ISD::SSUBO, ISD::USUBO,
- ISD::SMUL_LOHI, ISD::UMUL_LOHI,
+ ISD::ADD, ISD::SUB, ISD::MUL, ISD::SDIV, ISD::UDIV,
+ ISD::SREM, ISD::UREM, ISD::SDIVREM, ISD::UDIVREM, ISD::SADDO,
+ ISD::UADDO, ISD::SSUBO, ISD::USUBO, ISD::SMUL_LOHI, ISD::UMUL_LOHI,
// Logical/bit:
ISD::AND, ISD::OR, ISD::XOR, ISD::ROTL, ISD::ROTR,
ISD::CTPOP, ISD::CTLZ, ISD::CTTZ,
@@ -1970,16 +1462,16 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
// Extending loads from (native) vectors of i8 into (native) vectors of i16
// are legal.
- setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, MVT::v2i8, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, MVT::v2i8, Legal);
setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i16, MVT::v2i8, Legal);
setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, MVT::v2i8, Legal);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Legal);
setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Legal);
setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Legal);
// Types natively supported:
- for (MVT NativeVT : {MVT::v32i1, MVT::v64i1, MVT::v4i8, MVT::v8i8, MVT::v2i16,
- MVT::v4i16, MVT::v1i32, MVT::v2i32, MVT::v1i64}) {
+ for (MVT NativeVT : {MVT::v8i1, MVT::v4i1, MVT::v2i1, MVT::v4i8,
+ MVT::v8i8, MVT::v2i16, MVT::v4i16, MVT::v2i32}) {
setOperationAction(ISD::BUILD_VECTOR, NativeVT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, NativeVT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, NativeVT, Custom);
@@ -1995,19 +1487,34 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::XOR, NativeVT, Legal);
}
+ // Custom lower unaligned loads.
+ for (MVT VecVT : {MVT::i32, MVT::v4i8, MVT::i64, MVT::v8i8,
+ MVT::v2i16, MVT::v4i16, MVT::v2i32}) {
+ setOperationAction(ISD::LOAD, VecVT, Custom);
+ }
+
+ for (MVT VT : {MVT::v2i16, MVT::v4i8, MVT::v2i32, MVT::v4i16, MVT::v2i32}) {
+ setCondCodeAction(ISD::SETLT, VT, Expand);
+ setCondCodeAction(ISD::SETLE, VT, Expand);
+ setCondCodeAction(ISD::SETULT, VT, Expand);
+ setCondCodeAction(ISD::SETULE, VT, Expand);
+ }
+
+ // Custom-lower bitcasts from i8 to v8i1.
+ setOperationAction(ISD::BITCAST, MVT::i8, Custom);
setOperationAction(ISD::SETCC, MVT::v2i16, Custom);
setOperationAction(ISD::VSELECT, MVT::v2i16, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
- auto setPromoteTo = [this] (unsigned Opc, MVT FromTy, MVT ToTy) {
- setOperationAction(Opc, FromTy, Promote);
- AddPromotedToType(Opc, FromTy, ToTy);
- };
-
// Subtarget-specific operation actions.
//
- if (Subtarget.hasV5TOps()) {
+ if (Subtarget.hasV60Ops()) {
+ setOperationAction(ISD::ROTL, MVT::i32, Custom);
+ setOperationAction(ISD::ROTL, MVT::i64, Custom);
+ }
+ if (Subtarget.hasV5Ops()) {
setOperationAction(ISD::FMA, MVT::f64, Expand);
setOperationAction(ISD::FADD, MVT::f64, Expand);
setOperationAction(ISD::FSUB, MVT::f64, Expand);
@@ -2061,71 +1568,14 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
// Handling of indexed loads/stores: default is "expand".
//
- for (MVT VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
+ for (MVT VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64, MVT::f32, MVT::f64,
+ MVT::v2i16, MVT::v2i32, MVT::v4i8, MVT::v4i16, MVT::v8i8}) {
setIndexedLoadAction(ISD::POST_INC, VT, Legal);
setIndexedStoreAction(ISD::POST_INC, VT, Legal);
}
- if (Subtarget.useHVXOps()) {
- bool Use64b = Subtarget.useHVX64BOps();
- ArrayRef<MVT> LegalV = Use64b ? LegalV64 : LegalV128;
- ArrayRef<MVT> LegalW = Use64b ? LegalW64 : LegalW128;
- MVT ByteV = Use64b ? MVT::v64i8 : MVT::v128i8;
- MVT ByteW = Use64b ? MVT::v128i8 : MVT::v256i8;
-
- setOperationAction(ISD::VECTOR_SHUFFLE, ByteV, Legal);
- setOperationAction(ISD::VECTOR_SHUFFLE, ByteW, Legal);
- setOperationAction(ISD::CONCAT_VECTORS, ByteW, Legal);
- setOperationAction(ISD::AND, ByteV, Legal);
- setOperationAction(ISD::OR, ByteV, Legal);
- setOperationAction(ISD::XOR, ByteV, Legal);
-
- for (MVT T : LegalV) {
- setIndexedLoadAction(ISD::POST_INC, T, Legal);
- setIndexedStoreAction(ISD::POST_INC, T, Legal);
-
- setOperationAction(ISD::ADD, T, Legal);
- setOperationAction(ISD::SUB, T, Legal);
- if (T != ByteV) {
- setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, T, Legal);
- setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, T, Legal);
- }
-
- setOperationAction(ISD::MUL, T, Custom);
- setOperationAction(ISD::SETCC, T, Custom);
- setOperationAction(ISD::BUILD_VECTOR, T, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, T, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, T, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, T, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, T, Custom);
- if (T != ByteV)
- setOperationAction(ISD::ANY_EXTEND_VECTOR_INREG, T, Custom);
- }
-
- for (MVT T : LegalV) {
- if (T == ByteV)
- continue;
- // Promote all shuffles and concats to operate on vectors of bytes.
- setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteV);
- setPromoteTo(ISD::CONCAT_VECTORS, T, ByteV);
- setPromoteTo(ISD::AND, T, ByteV);
- setPromoteTo(ISD::OR, T, ByteV);
- setPromoteTo(ISD::XOR, T, ByteV);
- }
-
- for (MVT T : LegalW) {
- // Custom-lower BUILD_VECTOR for vector pairs. The standard (target-
- // independent) handling of it would convert it to a load, which is
- // not always the optimal choice.
- setOperationAction(ISD::BUILD_VECTOR, T, Custom);
-
- if (T == ByteW)
- continue;
- // Promote all shuffles and concats to operate on vectors of bytes.
- setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteW);
- setPromoteTo(ISD::CONCAT_VECTORS, T, ByteW);
- }
- }
+ if (Subtarget.useHVXOps())
+ initializeHVXLowering();
computeRegisterProperties(&HRI);
@@ -2195,7 +1645,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setLibcallName(RTLIB::DIV_F32, "__hexagon_divsf3");
}
- if (Subtarget.hasV5TOps()) {
+ if (Subtarget.hasV5Ops()) {
if (FastMath)
setLibcallName(RTLIB::SQRT_F32, "__hexagon_fast2_sqrtf");
else
@@ -2242,6 +1692,8 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((HexagonISD::NodeType)Opcode) {
+ case HexagonISD::ADDC: return "HexagonISD::ADDC";
+ case HexagonISD::SUBC: return "HexagonISD::SUBC";
case HexagonISD::ALLOCA: return "HexagonISD::ALLOCA";
case HexagonISD::AT_GOT: return "HexagonISD::AT_GOT";
case HexagonISD::AT_PCREL: return "HexagonISD::AT_PCREL";
@@ -2255,16 +1707,12 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
case HexagonISD::CP: return "HexagonISD::CP";
case HexagonISD::DCFETCH: return "HexagonISD::DCFETCH";
case HexagonISD::EH_RETURN: return "HexagonISD::EH_RETURN";
+ case HexagonISD::TSTBIT: return "HexagonISD::TSTBIT";
case HexagonISD::EXTRACTU: return "HexagonISD::EXTRACTU";
- case HexagonISD::EXTRACTURP: return "HexagonISD::EXTRACTURP";
case HexagonISD::INSERT: return "HexagonISD::INSERT";
- case HexagonISD::INSERTRP: return "HexagonISD::INSERTRP";
case HexagonISD::JT: return "HexagonISD::JT";
case HexagonISD::RET_FLAG: return "HexagonISD::RET_FLAG";
case HexagonISD::TC_RETURN: return "HexagonISD::TC_RETURN";
- case HexagonISD::VCOMBINE: return "HexagonISD::VCOMBINE";
- case HexagonISD::VPACKE: return "HexagonISD::VPACKE";
- case HexagonISD::VPACKO: return "HexagonISD::VPACKO";
case HexagonISD::VASL: return "HexagonISD::VASL";
case HexagonISD::VASR: return "HexagonISD::VASR";
case HexagonISD::VLSR: return "HexagonISD::VLSR";
@@ -2274,11 +1722,97 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
case HexagonISD::VROR: return "HexagonISD::VROR";
case HexagonISD::READCYCLE: return "HexagonISD::READCYCLE";
case HexagonISD::VZERO: return "HexagonISD::VZERO";
+ case HexagonISD::VSPLATW: return "HexagonISD::VSPLATW";
+ case HexagonISD::D2P: return "HexagonISD::D2P";
+ case HexagonISD::P2D: return "HexagonISD::P2D";
+ case HexagonISD::V2Q: return "HexagonISD::V2Q";
+ case HexagonISD::Q2V: return "HexagonISD::Q2V";
+ case HexagonISD::QCAT: return "HexagonISD::QCAT";
+ case HexagonISD::QTRUE: return "HexagonISD::QTRUE";
+ case HexagonISD::QFALSE: return "HexagonISD::QFALSE";
+ case HexagonISD::TYPECAST: return "HexagonISD::TYPECAST";
+ case HexagonISD::VALIGN: return "HexagonISD::VALIGN";
+ case HexagonISD::VALIGNADDR: return "HexagonISD::VALIGNADDR";
case HexagonISD::OP_END: break;
}
return nullptr;
}
+// Bit-reverse Load Intrinsic: Check if the instruction is a bit reverse load
+// intrinsic.
+static bool isBrevLdIntrinsic(const Value *Inst) {
+ unsigned ID = cast<IntrinsicInst>(Inst)->getIntrinsicID();
+ return (ID == Intrinsic::hexagon_L2_loadrd_pbr ||
+ ID == Intrinsic::hexagon_L2_loadri_pbr ||
+ ID == Intrinsic::hexagon_L2_loadrh_pbr ||
+ ID == Intrinsic::hexagon_L2_loadruh_pbr ||
+ ID == Intrinsic::hexagon_L2_loadrb_pbr ||
+ ID == Intrinsic::hexagon_L2_loadrub_pbr);
+}
+
+// Bit-reverse Load Intrinsic: Crawl up and figure out the object from the
+// previous instruction. So far we only handle bitcast, extractvalue and
+// bit-reverse load intrinsic instructions. Should we handle CGEP?
+static Value *getBrevLdObject(Value *V) {
+ if (Operator::getOpcode(V) == Instruction::ExtractValue ||
+ Operator::getOpcode(V) == Instruction::BitCast)
+ V = cast<Operator>(V)->getOperand(0);
+ else if (isa<IntrinsicInst>(V) && isBrevLdIntrinsic(V))
+ V = cast<Instruction>(V)->getOperand(0);
+ return V;
+}
+
+// Bit-reverse Load Intrinsic: For a PHI Node return either an incoming edge or
+// a back edge. If the back edge comes from the intrinsic itself, the incoming
+// edge is returned.
+static Value *returnEdge(const PHINode *PN, Value *IntrBaseVal) {
+ const BasicBlock *Parent = PN->getParent();
+ int Idx = -1;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i < e; ++i) {
+ BasicBlock *Blk = PN->getIncomingBlock(i);
+    // Determine if the back edge originates from the intrinsic.
+ if (Blk == Parent) {
+ Value *BackEdgeVal = PN->getIncomingValue(i);
+ Value *BaseVal;
+      // Loop until we return the same Value or we hit IntrBaseVal.
+ do {
+ BaseVal = BackEdgeVal;
+ BackEdgeVal = getBrevLdObject(BackEdgeVal);
+ } while ((BaseVal != BackEdgeVal) && (IntrBaseVal != BackEdgeVal));
+      // If getBrevLdObject returns IntrBaseVal, we should return the
+ // incoming edge.
+ if (IntrBaseVal == BackEdgeVal)
+ continue;
+ Idx = i;
+ break;
+    } else // Record this incoming edge.
+ Idx = i;
+ }
+ assert(Idx >= 0 && "Unexpected index to incoming argument in PHI");
+ return PN->getIncomingValue(Idx);
+}
+
+// Bit-reverse Load Intrinsic: Figure out the underlying object the base
+// pointer points to, for the bit-reverse load intrinsic. Setting this to
+// memoperand might help alias analysis to figure out the dependencies.
+static Value *getUnderLyingObjectForBrevLdIntr(Value *V) {
+ Value *IntrBaseVal = V;
+ Value *BaseVal;
+  // Loop until we return the same Value, which implies we have either found
+  // the object or hit a PHI.
+ do {
+ BaseVal = V;
+ V = getBrevLdObject(V);
+ } while (BaseVal != V);
+
+ // Identify the object from PHINode.
+ if (const PHINode *PN = dyn_cast<PHINode>(V))
+ return returnEdge(PN, IntrBaseVal);
+  // For non-PHI nodes, the object is the last value returned by
+  // getBrevLdObject.
+ else
+ return V;
+}
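Both returnEdge and getUnderLyingObjectForBrevLdIntr rely on the same fixpoint walk: peel one wrapper at a time until the value stops changing. A minimal standalone sketch of that pattern, not part of this patch (stripToBase and StripOne are hypothetical names; StripOne stands in for getBrevLdObject):

// Not part of the patch: generic fixpoint walk. StripOne peels one
// bitcast / extractvalue / intrinsic layer, or returns its argument
// unchanged when there is nothing left to peel.
template <typename T, typename StripFn>
T *stripToBase(T *V, StripFn StripOne) {
  T *Prev;
  do {
    Prev = V;
    V = StripOne(V);   // peel a single layer
  } while (V != Prev); // stop once the value no longer changes
  return V;
}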
+
/// Given an intrinsic, checks if on the target the intrinsic will need to map
/// to a MemIntrinsicNode (touches memory). If this is the case, it returns
/// true and stores the intrinsic information into the IntrinsicInfo that was
@@ -2288,6 +1822,32 @@ bool HexagonTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
MachineFunction &MF,
unsigned Intrinsic) const {
switch (Intrinsic) {
+ case Intrinsic::hexagon_L2_loadrd_pbr:
+ case Intrinsic::hexagon_L2_loadri_pbr:
+ case Intrinsic::hexagon_L2_loadrh_pbr:
+ case Intrinsic::hexagon_L2_loadruh_pbr:
+ case Intrinsic::hexagon_L2_loadrb_pbr:
+ case Intrinsic::hexagon_L2_loadrub_pbr: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
+ auto &Cont = I.getCalledFunction()->getParent()->getContext();
+ // The intrinsic function call is of the form { ElTy, i8* }
+ // @llvm.hexagon.L2.loadXX.pbr(i8*, i32). The pointer and memory access type
+ // should be derived from ElTy.
+ PointerType *PtrTy = I.getCalledFunction()
+ ->getReturnType()
+ ->getContainedType(0)
+ ->getPointerTo();
+ Info.memVT = MVT::getVT(PtrTy->getElementType());
+ llvm::Value *BasePtrVal = I.getOperand(0);
+ Info.ptrVal = getUnderLyingObjectForBrevLdIntr(BasePtrVal);
+      // The offset value comes through the Modifier register. For now, assume
+      // the offset is 0.
+ Info.offset = 0;
+ Info.align = DL.getABITypeAlignment(Info.memVT.getTypeForEVT(Cont));
+ Info.flags = MachineMemOperand::MOLoad;
+ return true;
+ }
case Intrinsic::hexagon_V6_vgathermw:
case Intrinsic::hexagon_V6_vgathermw_128B:
case Intrinsic::hexagon_V6_vgathermh:
@@ -2319,17 +1879,13 @@ bool HexagonTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
}
bool HexagonTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
- EVT MTy1 = EVT::getEVT(Ty1);
- EVT MTy2 = EVT::getEVT(Ty2);
- if (!MTy1.isSimple() || !MTy2.isSimple())
- return false;
- return (MTy1.getSimpleVT() == MVT::i64) && (MTy2.getSimpleVT() == MVT::i32);
+ return isTruncateFree(EVT::getEVT(Ty1), EVT::getEVT(Ty2));
}
bool HexagonTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
if (!VT1.isSimple() || !VT2.isSimple())
return false;
- return (VT1.getSimpleVT() == MVT::i64) && (VT2.getSimpleVT() == MVT::i32);
+ return VT1.getSimpleVT() == MVT::i64 && VT2.getSimpleVT() == MVT::i32;
}
bool HexagonTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
@@ -2372,126 +1928,199 @@ HexagonTargetLowering::getPreferredVectorAction(EVT VT) const {
return TargetLoweringBase::TypeSplitVector;
}
+std::pair<SDValue, int>
+HexagonTargetLowering::getBaseAndOffset(SDValue Addr) const {
+ if (Addr.getOpcode() == ISD::ADD) {
+ SDValue Op1 = Addr.getOperand(1);
+ if (auto *CN = dyn_cast<const ConstantSDNode>(Op1.getNode()))
+ return { Addr.getOperand(0), CN->getSExtValue() };
+ }
+ return { Addr, 0 };
+}
+
// Lower a vector shuffle (V1, V2, V3). V1 and V2 are the two vectors
// to select data from, V3 is the permutation.
SDValue
HexagonTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG)
const {
- const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
- SDValue V1 = Op.getOperand(0);
- SDValue V2 = Op.getOperand(1);
- SDLoc dl(Op);
- EVT VT = Op.getValueType();
+ const auto *SVN = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> AM = SVN->getMask();
+ assert(AM.size() <= 8 && "Unexpected shuffle mask");
+ unsigned VecLen = AM.size();
- if (V2.isUndef())
- V2 = V1;
-
- if (SVN->isSplat()) {
- int Lane = SVN->getSplatIndex();
- if (Lane == -1) Lane = 0;
-
- // Test if V1 is a SCALAR_TO_VECTOR.
- if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
- return DAG.getNode(HexagonISD::VSPLAT, dl, VT, V1.getOperand(0));
-
- // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
- // (and probably will turn into a SCALAR_TO_VECTOR once legalization
- // reaches it).
- if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
- !isa<ConstantSDNode>(V1.getOperand(0))) {
- bool IsScalarToVector = true;
- for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i) {
- if (!V1.getOperand(i).isUndef()) {
- IsScalarToVector = false;
- break;
- }
- }
- if (IsScalarToVector)
- return DAG.getNode(HexagonISD::VSPLAT, dl, VT, V1.getOperand(0));
+ MVT VecTy = ty(Op);
+ assert(!Subtarget.isHVXVectorType(VecTy, true) &&
+ "HVX shuffles should be legal");
+ assert(VecTy.getSizeInBits() <= 64 && "Unexpected vector length");
+
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ const SDLoc &dl(Op);
+
+  // If the inputs are not the same as the output, bail. This is not an
+  // error situation, but it complicates the handling, and the default
+  // expansion (into BUILD_VECTOR) should be adequate.
+ if (ty(Op0) != VecTy || ty(Op1) != VecTy)
+ return SDValue();
+
+ // Normalize the mask so that the first non-negative index comes from
+ // the first operand.
+ SmallVector<int,8> Mask(AM.begin(), AM.end());
+ unsigned F = llvm::find_if(AM, [](int M) { return M >= 0; }) - AM.data();
+ if (F == AM.size())
+ return DAG.getUNDEF(VecTy);
+ if (AM[F] >= int(VecLen)) {
+ ShuffleVectorSDNode::commuteMask(Mask);
+ std::swap(Op0, Op1);
+ }
+
+ // Express the shuffle mask in terms of bytes.
+ SmallVector<int,8> ByteMask;
+ unsigned ElemBytes = VecTy.getVectorElementType().getSizeInBits() / 8;
+ for (unsigned i = 0, e = Mask.size(); i != e; ++i) {
+ int M = Mask[i];
+ if (M < 0) {
+ for (unsigned j = 0; j != ElemBytes; ++j)
+ ByteMask.push_back(-1);
+ } else {
+ for (unsigned j = 0; j != ElemBytes; ++j)
+ ByteMask.push_back(M*ElemBytes + j);
}
- return DAG.getNode(HexagonISD::VSPLAT, dl, VT,
- DAG.getConstant(Lane, dl, MVT::i32));
}
+ assert(ByteMask.size() <= 8);
+
+ // All non-undef (non-negative) indexes are well within [0..127], so they
+ // fit in a single byte. Build two 64-bit words:
+ // - MaskIdx where each byte is the corresponding index (for non-negative
+ // indexes), and 0xFF for negative indexes, and
+ // - MaskUnd that has 0xFF for each negative index.
+ uint64_t MaskIdx = 0;
+ uint64_t MaskUnd = 0;
+ for (unsigned i = 0, e = ByteMask.size(); i != e; ++i) {
+ unsigned S = 8*i;
+ uint64_t M = ByteMask[i] & 0xFF;
+ if (M == 0xFF)
+ MaskUnd |= M << S;
+ MaskIdx |= M << S;
+ }
+
+ if (ByteMask.size() == 4) {
+ // Identity.
+ if (MaskIdx == (0x03020100 | MaskUnd))
+ return Op0;
+ // Byte swap.
+ if (MaskIdx == (0x00010203 | MaskUnd)) {
+ SDValue T0 = DAG.getBitcast(MVT::i32, Op0);
+ SDValue T1 = DAG.getNode(ISD::BSWAP, dl, MVT::i32, T0);
+ return DAG.getBitcast(VecTy, T1);
+ }
- // FIXME: We need to support more general vector shuffles. See
- // below the comment from the ARM backend that deals in the general
- // case with the vector shuffles. For now, let expand handle these.
- return SDValue();
+ // Byte packs.
+ SDValue Concat10 = DAG.getNode(HexagonISD::COMBINE, dl,
+ typeJoin({ty(Op1), ty(Op0)}), {Op1, Op0});
+ if (MaskIdx == (0x06040200 | MaskUnd))
+ return getInstr(Hexagon::S2_vtrunehb, dl, VecTy, {Concat10}, DAG);
+ if (MaskIdx == (0x07050301 | MaskUnd))
+ return getInstr(Hexagon::S2_vtrunohb, dl, VecTy, {Concat10}, DAG);
+
+ SDValue Concat01 = DAG.getNode(HexagonISD::COMBINE, dl,
+ typeJoin({ty(Op0), ty(Op1)}), {Op0, Op1});
+ if (MaskIdx == (0x02000604 | MaskUnd))
+ return getInstr(Hexagon::S2_vtrunehb, dl, VecTy, {Concat01}, DAG);
+ if (MaskIdx == (0x03010705 | MaskUnd))
+ return getInstr(Hexagon::S2_vtrunohb, dl, VecTy, {Concat01}, DAG);
+ }
+
+ if (ByteMask.size() == 8) {
+ // Identity.
+ if (MaskIdx == (0x0706050403020100ull | MaskUnd))
+ return Op0;
+ // Byte swap.
+ if (MaskIdx == (0x0001020304050607ull | MaskUnd)) {
+ SDValue T0 = DAG.getBitcast(MVT::i64, Op0);
+ SDValue T1 = DAG.getNode(ISD::BSWAP, dl, MVT::i64, T0);
+ return DAG.getBitcast(VecTy, T1);
+ }
- // If the shuffle is not directly supported and it has 4 elements, use
- // the PerfectShuffle-generated table to synthesize it from other shuffles.
-}
+ // Halfword picks.
+ if (MaskIdx == (0x0d0c050409080100ull | MaskUnd))
+ return getInstr(Hexagon::S2_shuffeh, dl, VecTy, {Op1, Op0}, DAG);
+ if (MaskIdx == (0x0f0e07060b0a0302ull | MaskUnd))
+ return getInstr(Hexagon::S2_shuffoh, dl, VecTy, {Op1, Op0}, DAG);
+ if (MaskIdx == (0x0d0c090805040100ull | MaskUnd))
+ return getInstr(Hexagon::S2_vtrunewh, dl, VecTy, {Op1, Op0}, DAG);
+ if (MaskIdx == (0x0f0e0b0a07060302ull | MaskUnd))
+ return getInstr(Hexagon::S2_vtrunowh, dl, VecTy, {Op1, Op0}, DAG);
+ if (MaskIdx == (0x0706030205040100ull | MaskUnd)) {
+ VectorPair P = opSplit(Op0, dl, DAG);
+ return getInstr(Hexagon::S2_packhl, dl, VecTy, {P.second, P.first}, DAG);
+ }
-// If BUILD_VECTOR has same base element repeated several times,
-// report true.
-static bool isCommonSplatElement(BuildVectorSDNode *BVN) {
- unsigned NElts = BVN->getNumOperands();
- SDValue V0 = BVN->getOperand(0);
+ // Byte packs.
+ if (MaskIdx == (0x0e060c040a020800ull | MaskUnd))
+ return getInstr(Hexagon::S2_shuffeb, dl, VecTy, {Op1, Op0}, DAG);
+ if (MaskIdx == (0x0f070d050b030901ull | MaskUnd))
+ return getInstr(Hexagon::S2_shuffob, dl, VecTy, {Op1, Op0}, DAG);
+ }
- for (unsigned i = 1, e = NElts; i != e; ++i) {
- if (BVN->getOperand(i) != V0)
- return false;
+ return SDValue();
+}
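A small worked example of the MaskIdx/MaskUnd encoding used above, written as standalone C++ (not part of this patch): for a v4i8 byte-swap mask {3,2,1,0}, the per-byte indexes pack into the constant 0x00010203 that the byte-swap case tests for (for i8 elements, the element mask and the byte mask coincide).

// Not part of the patch: worked example of the byte-mask encoding.
#include <cassert>
#include <cstdint>
int main() {
  int ByteMask[4] = {3, 2, 1, 0};      // v4i8 byte-swap shuffle mask
  uint64_t MaskIdx = 0, MaskUnd = 0;
  for (unsigned i = 0; i != 4; ++i) {
    uint64_t M = ByteMask[i] & 0xFF;   // -1 (undef) would become 0xFF
    if (M == 0xFF)
      MaskUnd |= M << (8 * i);
    MaskIdx |= M << (8 * i);
  }
  // Matches the "byte swap" pattern checked in LowerVECTOR_SHUFFLE.
  assert(MaskIdx == 0x00010203 && MaskUnd == 0);
}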
+
+// Create a Hexagon-specific node for shifting a vector by an integer.
+SDValue
+HexagonTargetLowering::getVectorShiftByInt(SDValue Op, SelectionDAG &DAG)
+ const {
+ if (auto *BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode())) {
+ if (SDValue S = BVN->getSplatValue()) {
+ unsigned NewOpc;
+ switch (Op.getOpcode()) {
+ case ISD::SHL:
+ NewOpc = HexagonISD::VASL;
+ break;
+ case ISD::SRA:
+ NewOpc = HexagonISD::VASR;
+ break;
+ case ISD::SRL:
+ NewOpc = HexagonISD::VLSR;
+ break;
+ default:
+ llvm_unreachable("Unexpected shift opcode");
+ }
+ return DAG.getNode(NewOpc, SDLoc(Op), ty(Op), Op.getOperand(0), S);
+ }
}
- return true;
+
+ return SDValue();
}
-// Lower a vector shift. Try to convert
-// <VT> = SHL/SRA/SRL <VT> by <VT> to Hexagon specific
-// <VT> = SHL/SRA/SRL <VT> by <IT/i32>.
SDValue
HexagonTargetLowering::LowerVECTOR_SHIFT(SDValue Op, SelectionDAG &DAG) const {
- BuildVectorSDNode *BVN = nullptr;
- SDValue V1 = Op.getOperand(0);
- SDValue V2 = Op.getOperand(1);
- SDValue V3;
- SDLoc dl(Op);
- EVT VT = Op.getValueType();
+ return getVectorShiftByInt(Op, DAG);
+}
- if ((BVN = dyn_cast<BuildVectorSDNode>(V1.getNode())) &&
- isCommonSplatElement(BVN))
- V3 = V2;
- else if ((BVN = dyn_cast<BuildVectorSDNode>(V2.getNode())) &&
- isCommonSplatElement(BVN))
- V3 = V1;
- else
- return SDValue();
+SDValue
+HexagonTargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
+ if (isa<ConstantSDNode>(Op.getOperand(1).getNode()))
+ return Op;
+ return SDValue();
+}
- SDValue CommonSplat = BVN->getOperand(0);
- SDValue Result;
+SDValue
+HexagonTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
+ MVT ResTy = ty(Op);
+ SDValue InpV = Op.getOperand(0);
+ MVT InpTy = ty(InpV);
+ assert(ResTy.getSizeInBits() == InpTy.getSizeInBits());
+ const SDLoc &dl(Op);
- if (VT.getSimpleVT() == MVT::v4i16) {
- switch (Op.getOpcode()) {
- case ISD::SRA:
- Result = DAG.getNode(HexagonISD::VASR, dl, VT, V3, CommonSplat);
- break;
- case ISD::SHL:
- Result = DAG.getNode(HexagonISD::VASL, dl, VT, V3, CommonSplat);
- break;
- case ISD::SRL:
- Result = DAG.getNode(HexagonISD::VLSR, dl, VT, V3, CommonSplat);
- break;
- default:
- return SDValue();
- }
- } else if (VT.getSimpleVT() == MVT::v2i32) {
- switch (Op.getOpcode()) {
- case ISD::SRA:
- Result = DAG.getNode(HexagonISD::VASR, dl, VT, V3, CommonSplat);
- break;
- case ISD::SHL:
- Result = DAG.getNode(HexagonISD::VASL, dl, VT, V3, CommonSplat);
- break;
- case ISD::SRL:
- Result = DAG.getNode(HexagonISD::VLSR, dl, VT, V3, CommonSplat);
- break;
- default:
- return SDValue();
- }
- } else {
- return SDValue();
+ // Handle conversion from i8 to v8i1.
+ if (ResTy == MVT::v8i1) {
+ SDValue Sc = DAG.getBitcast(tyScalar(InpTy), InpV);
+ SDValue Ext = DAG.getZExtOrTrunc(Sc, dl, MVT::i32);
+ return getInstr(Hexagon::C2_tfrrp, dl, ResTy, Ext, DAG);
}
- return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+ return SDValue();
}
bool
@@ -2509,9 +2138,10 @@ HexagonTargetLowering::getBuildVectorConstInts(ArrayRef<SDValue> Values,
Consts[i] = ConstantInt::get(IntTy, 0);
continue;
}
+ // Make sure to always cast to IntTy.
if (auto *CN = dyn_cast<ConstantSDNode>(V.getNode())) {
const ConstantInt *CI = CN->getConstantIntValue();
- Consts[i] = const_cast<ConstantInt*>(CI);
+ Consts[i] = ConstantInt::get(IntTy, CI->getValue().getSExtValue());
} else if (auto *CN = dyn_cast<ConstantFPSDNode>(V.getNode())) {
const ConstantFP *CF = CN->getConstantFPValue();
APInt A = CF->getValueAPF().bitcastToAPInt();
@@ -2550,8 +2180,8 @@ HexagonTargetLowering::buildVector32(ArrayRef<SDValue> Elem, const SDLoc &dl,
Consts[1]->getZExtValue() << 16;
return DAG.getBitcast(MVT::v2i16, DAG.getConstant(V, dl, MVT::i32));
}
- SDValue N = getNode(Hexagon::A2_combine_ll, dl, MVT::i32,
- {Elem[1], Elem[0]}, DAG);
+ SDValue N = getInstr(Hexagon::A2_combine_ll, dl, MVT::i32,
+ {Elem[1], Elem[0]}, DAG);
return DAG.getBitcast(MVT::v2i16, N);
}
@@ -2596,7 +2226,7 @@ HexagonTargetLowering::buildVector32(ArrayRef<SDValue> Elem, const SDLoc &dl,
SDValue B0 = DAG.getNode(ISD::OR, dl, MVT::i32, {Vs[0], T0});
SDValue B1 = DAG.getNode(ISD::OR, dl, MVT::i32, {Vs[2], T1});
- SDValue R = getNode(Hexagon::A2_combine_ll, dl, MVT::i32, {B1, B0}, DAG);
+ SDValue R = getInstr(Hexagon::A2_combine_ll, dl, MVT::i32, {B1, B0}, DAG);
return DAG.getBitcast(MVT::v4i8, R);
}
@@ -2651,7 +2281,7 @@ HexagonTargetLowering::buildVector64(ArrayRef<SDValue> Elem, const SDLoc &dl,
uint64_t Mask = (ElemTy == MVT::i8) ? 0xFFull
: (ElemTy == MVT::i16) ? 0xFFFFull : 0xFFFFFFFFull;
for (unsigned i = 0; i != Num; ++i)
- Val = (Val << W) | (Consts[i]->getZExtValue() & Mask);
+ Val = (Val << W) | (Consts[Num-1-i]->getZExtValue() & Mask);
SDValue V0 = DAG.getConstant(Val, dl, MVT::i64);
return DAG.getBitcast(VecTy, V0);
}
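The changed line above reverses the packing order so that element 0 of the build vector ends up in the least significant lane of the 64-bit immediate. A standalone check of that ordering, not part of this patch:

// Not part of the patch: element packing order for a constant v4i16.
#include <cassert>
#include <cstdint>
int main() {
  uint64_t Elems[4] = {0x1111, 0x2222, 0x3333, 0x4444};
  unsigned W = 16, Num = 4;
  uint64_t Val = 0;
  for (unsigned i = 0; i != Num; ++i)
    Val = (Val << W) | (Elems[Num - 1 - i] & 0xFFFF); // highest element first
  assert(Val == 0x4444333322221111ULL); // element 0 in the low halfword
}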
@@ -2677,8 +2307,56 @@ HexagonTargetLowering::extractVector(SDValue VecV, SDValue IdxV,
unsigned VecWidth = VecTy.getSizeInBits();
unsigned ValWidth = ValTy.getSizeInBits();
unsigned ElemWidth = VecTy.getVectorElementType().getSizeInBits();
- assert(VecWidth == 32 || VecWidth == 64);
assert((VecWidth % ElemWidth) == 0);
+ auto *IdxN = dyn_cast<ConstantSDNode>(IdxV);
+
+ // Special case for v{8,4,2}i1 (the only boolean vectors legal in Hexagon
+ // without any coprocessors).
+ if (ElemWidth == 1) {
+ assert(VecWidth == VecTy.getVectorNumElements() && "Sanity failure");
+ assert(VecWidth == 8 || VecWidth == 4 || VecWidth == 2);
+ // Check if this is an extract of the lowest bit.
+ if (IdxN) {
+ // Extracting the lowest bit is a no-op, but it changes the type,
+ // so it must be kept as an operation to avoid errors related to
+ // type mismatches.
+ if (IdxN->isNullValue() && ValTy.getSizeInBits() == 1)
+ return DAG.getNode(HexagonISD::TYPECAST, dl, MVT::i1, VecV);
+ }
+
+ // If the value extracted is a single bit, use tstbit.
+ if (ValWidth == 1) {
+ SDValue A0 = getInstr(Hexagon::C2_tfrpr, dl, MVT::i32, {VecV}, DAG);
+ SDValue M0 = DAG.getConstant(8 / VecWidth, dl, MVT::i32);
+ SDValue I0 = DAG.getNode(ISD::MUL, dl, MVT::i32, IdxV, M0);
+ return DAG.getNode(HexagonISD::TSTBIT, dl, MVT::i1, A0, I0);
+ }
+
+ // Each bool vector (v2i1, v4i1, v8i1) always occupies 8 bits in
+ // a predicate register. The elements of the vector are repeated
+ // in the register (if necessary) so that the total number is 8.
+      // The extracted subvector needs to be expanded in the same way.
+ unsigned Scale = VecWidth / ValWidth;
+
+ // Generate (p2d VecV) >> 8*Idx to move the interesting bytes to
+ // position 0.
+ assert(ty(IdxV) == MVT::i32);
+ SDValue S0 = DAG.getNode(ISD::MUL, dl, MVT::i32, IdxV,
+ DAG.getConstant(8*Scale, dl, MVT::i32));
+ SDValue T0 = DAG.getNode(HexagonISD::P2D, dl, MVT::i64, VecV);
+ SDValue T1 = DAG.getNode(ISD::SRL, dl, MVT::i64, T0, S0);
+ while (Scale > 1) {
+ // The longest possible subvector is at most 32 bits, so it is always
+ // contained in the low subregister.
+ T1 = DAG.getTargetExtractSubreg(Hexagon::isub_lo, dl, MVT::i32, T1);
+ T1 = expandPredicate(T1, dl, DAG);
+ Scale /= 2;
+ }
+
+ return DAG.getNode(HexagonISD::D2P, dl, ResTy, T1);
+ }
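The i1 handling above relies on a fixed layout of boolean vectors in the 8-bit predicate register: a vNi1 element is replicated 8/N times, so element Idx starts at bit (8/N)*Idx, which is exactly the index fed to TSTBIT. A standalone sanity check of that arithmetic, not part of this patch:

// Not part of the patch: bit index fed to TSTBIT for a single-bit extract.
#include <cassert>
int main() {
  unsigned VecWidth = 4;                // extracting from a v4i1
  unsigned Idx = 2;                     // element number 2
  unsigned Bit = (8 / VecWidth) * Idx;  // the MUL computed before TSTBIT
  assert(Bit == 4);                     // element 2 of v4i1 lives at bit 4
}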
+
+ assert(VecWidth == 32 || VecWidth == 64);
// Cast everything to scalar integer types.
MVT ScalarTy = tyScalar(VecTy);
@@ -2687,8 +2365,8 @@ HexagonTargetLowering::extractVector(SDValue VecV, SDValue IdxV,
SDValue WidthV = DAG.getConstant(ValWidth, dl, MVT::i32);
SDValue ExtV;
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(IdxV)) {
- unsigned Off = C->getZExtValue() * ElemWidth;
+ if (IdxN) {
+ unsigned Off = IdxN->getZExtValue() * ElemWidth;
if (VecWidth == 64 && ValWidth == 32) {
assert(Off == 0 || Off == 32);
unsigned SubIdx = Off == 0 ? Hexagon::isub_lo : Hexagon::isub_hi;
@@ -2707,11 +2385,8 @@ HexagonTargetLowering::extractVector(SDValue VecV, SDValue IdxV,
IdxV = DAG.getZExtOrTrunc(IdxV, dl, MVT::i32);
SDValue OffV = DAG.getNode(ISD::MUL, dl, MVT::i32, IdxV,
DAG.getConstant(ElemWidth, dl, MVT::i32));
- // EXTRACTURP takes width/offset in a 64-bit pair.
- SDValue CombV = DAG.getNode(HexagonISD::COMBINE, dl, MVT::i64,
- {WidthV, OffV});
- ExtV = DAG.getNode(HexagonISD::EXTRACTURP, dl, ScalarTy,
- {VecV, CombV});
+ ExtV = DAG.getNode(HexagonISD::EXTRACTU, dl, ScalarTy,
+ {VecV, WidthV, OffV});
}
// Cast ExtV to the requested result type.
@@ -2725,6 +2400,33 @@ HexagonTargetLowering::insertVector(SDValue VecV, SDValue ValV, SDValue IdxV,
const SDLoc &dl, MVT ValTy,
SelectionDAG &DAG) const {
MVT VecTy = ty(VecV);
+ if (VecTy.getVectorElementType() == MVT::i1) {
+ MVT ValTy = ty(ValV);
+ assert(ValTy.getVectorElementType() == MVT::i1);
+ SDValue ValR = DAG.getNode(HexagonISD::P2D, dl, MVT::i64, ValV);
+ unsigned VecLen = VecTy.getVectorNumElements();
+ unsigned Scale = VecLen / ValTy.getVectorNumElements();
+ assert(Scale > 1);
+
+ for (unsigned R = Scale; R > 1; R /= 2) {
+ ValR = contractPredicate(ValR, dl, DAG);
+ ValR = DAG.getNode(HexagonISD::COMBINE, dl, MVT::i64,
+ DAG.getUNDEF(MVT::i32), ValR);
+ }
+ // The longest possible subvector is at most 32 bits, so it is always
+ // contained in the low subregister.
+ ValR = DAG.getTargetExtractSubreg(Hexagon::isub_lo, dl, MVT::i32, ValR);
+
+ unsigned ValBytes = 64 / Scale;
+ SDValue Width = DAG.getConstant(ValBytes*8, dl, MVT::i32);
+ SDValue Idx = DAG.getNode(ISD::MUL, dl, MVT::i32, IdxV,
+ DAG.getConstant(8, dl, MVT::i32));
+ SDValue VecR = DAG.getNode(HexagonISD::P2D, dl, MVT::i64, VecV);
+ SDValue Ins = DAG.getNode(HexagonISD::INSERT, dl, MVT::i32,
+ {VecR, ValR, Width, Idx});
+ return DAG.getNode(HexagonISD::D2P, dl, VecTy, Ins);
+ }
+
unsigned VecWidth = VecTy.getSizeInBits();
unsigned ValWidth = ValTy.getSizeInBits();
assert(VecWidth == 32 || VecWidth == 64);
@@ -2752,17 +2454,32 @@ HexagonTargetLowering::insertVector(SDValue VecV, SDValue ValV, SDValue IdxV,
if (ty(IdxV) != MVT::i32)
IdxV = DAG.getZExtOrTrunc(IdxV, dl, MVT::i32);
SDValue OffV = DAG.getNode(ISD::MUL, dl, MVT::i32, IdxV, WidthV);
- // INSERTRP takes width/offset in a 64-bit pair.
- SDValue CombV = DAG.getNode(HexagonISD::COMBINE, dl, MVT::i64,
- {WidthV, OffV});
- InsV = DAG.getNode(HexagonISD::INSERTRP, dl, ScalarTy,
- {VecV, ValV, CombV});
+ InsV = DAG.getNode(HexagonISD::INSERT, dl, ScalarTy,
+ {VecV, ValV, WidthV, OffV});
}
return DAG.getNode(ISD::BITCAST, dl, VecTy, InsV);
}
SDValue
+HexagonTargetLowering::expandPredicate(SDValue Vec32, const SDLoc &dl,
+ SelectionDAG &DAG) const {
+ assert(ty(Vec32).getSizeInBits() == 32);
+ if (isUndef(Vec32))
+ return DAG.getUNDEF(MVT::i64);
+ return getInstr(Hexagon::S2_vsxtbh, dl, MVT::i64, {Vec32}, DAG);
+}
+
+SDValue
+HexagonTargetLowering::contractPredicate(SDValue Vec64, const SDLoc &dl,
+ SelectionDAG &DAG) const {
+ assert(ty(Vec64).getSizeInBits() == 64);
+ if (isUndef(Vec64))
+ return DAG.getUNDEF(MVT::i32);
+ return getInstr(Hexagon::S2_vtrunehb, dl, MVT::i32, {Vec64}, DAG);
+}
+
+SDValue
HexagonTargetLowering::getZero(const SDLoc &dl, MVT Ty, SelectionDAG &DAG)
const {
if (Ty.isVector()) {
@@ -2784,18 +2501,34 @@ SDValue
HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
MVT VecTy = ty(Op);
unsigned BW = VecTy.getSizeInBits();
-
- if (Subtarget.useHVXOps() && Subtarget.isHVXVectorType(VecTy, true))
- return LowerHvxBuildVector(Op, DAG);
-
- if (BW == 32 || BW == 64) {
- const SDLoc &dl(Op);
- SmallVector<SDValue,8> Ops;
- for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i)
- Ops.push_back(Op.getOperand(i));
- if (BW == 32)
- return buildVector32(Ops, dl, VecTy, DAG);
+ const SDLoc &dl(Op);
+ SmallVector<SDValue,8> Ops;
+ for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i)
+ Ops.push_back(Op.getOperand(i));
+
+ if (BW == 32)
+ return buildVector32(Ops, dl, VecTy, DAG);
+ if (BW == 64)
return buildVector64(Ops, dl, VecTy, DAG);
+
+ if (VecTy == MVT::v8i1 || VecTy == MVT::v4i1 || VecTy == MVT::v2i1) {
+ // For each i1 element in the resulting predicate register, put 1
+ // shifted by the index of the element into a general-purpose register,
+ // then or them together and transfer it back into a predicate register.
+ SDValue Rs[8];
+ SDValue Z = getZero(dl, MVT::i32, DAG);
+ // Always produce 8 bits, repeat inputs if necessary.
+ unsigned Rep = 8 / VecTy.getVectorNumElements();
+ for (unsigned i = 0; i != 8; ++i) {
+ SDValue S = DAG.getConstant(1ull << i, dl, MVT::i32);
+ Rs[i] = DAG.getSelect(dl, MVT::i32, Ops[i/Rep], S, Z);
+ }
+ for (ArrayRef<SDValue> A(Rs); A.size() != 1; A = A.drop_back(A.size()/2)) {
+ for (unsigned i = 0, e = A.size()/2; i != e; ++i)
+ Rs[i] = DAG.getNode(ISD::OR, dl, MVT::i32, Rs[2*i], Rs[2*i+1]);
+ }
+ // Move the value directly to a predicate register.
+ return getInstr(Hexagon::C2_tfrrp, dl, VecTy, {Rs[0]}, DAG);
}
return SDValue();
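The predicate BUILD_VECTOR path above reduces eight one-bit-per-word values with a pairwise OR tree. The same reduction on plain integers, as a standalone check (not part of this patch):

// Not part of the patch: pairwise OR reduction of eight per-bit words.
#include <cassert>
#include <cstdint>
int main() {
  uint32_t Rs[8];
  bool In[8] = {1, 0, 1, 1, 0, 0, 0, 1};
  for (unsigned i = 0; i != 8; ++i)
    Rs[i] = In[i] ? (1u << i) : 0;       // each word holds at most one set bit
  for (unsigned n = 8; n != 1; n /= 2)   // halve the number of words per round
    for (unsigned i = 0; i != n / 2; ++i)
      Rs[i] = Rs[2 * i] | Rs[2 * i + 1];
  assert(Rs[0] == 0x8D);                 // 0b10001101: bits 0, 2, 3, 7 set
}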
@@ -2805,14 +2538,64 @@ SDValue
HexagonTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
SelectionDAG &DAG) const {
MVT VecTy = ty(Op);
- assert(!Subtarget.useHVXOps() || !Subtarget.isHVXVectorType(VecTy));
-
+ const SDLoc &dl(Op);
if (VecTy.getSizeInBits() == 64) {
assert(Op.getNumOperands() == 2);
- return DAG.getNode(HexagonISD::COMBINE, SDLoc(Op), VecTy, Op.getOperand(1),
+ return DAG.getNode(HexagonISD::COMBINE, dl, VecTy, Op.getOperand(1),
Op.getOperand(0));
}
+ MVT ElemTy = VecTy.getVectorElementType();
+ if (ElemTy == MVT::i1) {
+ assert(VecTy == MVT::v2i1 || VecTy == MVT::v4i1 || VecTy == MVT::v8i1);
+ MVT OpTy = ty(Op.getOperand(0));
+ // Scale is how many times the operands need to be contracted to match
+ // the representation in the target register.
+ unsigned Scale = VecTy.getVectorNumElements() / OpTy.getVectorNumElements();
+ assert(Scale == Op.getNumOperands() && Scale > 1);
+
+ // First, convert all bool vectors to integers, then generate pairwise
+ // inserts to form values of doubled length. Up until there are only
+ // two values left to concatenate, all of these values will fit in a
+ // 32-bit integer, so keep them as i32 to use 32-bit inserts.
+ SmallVector<SDValue,4> Words[2];
+ unsigned IdxW = 0;
+
+ for (SDValue P : Op.getNode()->op_values()) {
+ SDValue W = DAG.getNode(HexagonISD::P2D, dl, MVT::i64, P);
+ for (unsigned R = Scale; R > 1; R /= 2) {
+ W = contractPredicate(W, dl, DAG);
+ W = DAG.getNode(HexagonISD::COMBINE, dl, MVT::i64,
+ DAG.getUNDEF(MVT::i32), W);
+ }
+ W = DAG.getTargetExtractSubreg(Hexagon::isub_lo, dl, MVT::i32, W);
+ Words[IdxW].push_back(W);
+ }
+
+ while (Scale > 2) {
+ SDValue WidthV = DAG.getConstant(64 / Scale, dl, MVT::i32);
+ Words[IdxW ^ 1].clear();
+
+ for (unsigned i = 0, e = Words[IdxW].size(); i != e; i += 2) {
+ SDValue W0 = Words[IdxW][i], W1 = Words[IdxW][i+1];
+ // Insert W1 into W0 right next to the significant bits of W0.
+ SDValue T = DAG.getNode(HexagonISD::INSERT, dl, MVT::i32,
+ {W0, W1, WidthV, WidthV});
+ Words[IdxW ^ 1].push_back(T);
+ }
+ IdxW ^= 1;
+ Scale /= 2;
+ }
+
+ // Another sanity check. At this point there should only be two words
+ // left, and Scale should be 2.
+ assert(Scale == 2 && Words[IdxW].size() == 2);
+
+ SDValue WW = DAG.getNode(HexagonISD::COMBINE, dl, MVT::i64,
+ Words[IdxW][1], Words[IdxW][0]);
+ return DAG.getNode(HexagonISD::D2P, dl, VecTy, WW);
+ }
+
return SDValue();
}
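The i1 CONCAT_VECTORS path keeps the partial results as 32-bit words and repeatedly inserts the next word just above the significant bits of the previous one, combining the last two with COMBINE. The same pairwise-insert idea on plain integers (not part of this patch; insertBits is a hypothetical helper mirroring HexagonISD::INSERT):

// Not part of the patch: concatenating four 16-bit chunks by pairwise insert.
#include <cassert>
#include <cstdint>
static uint32_t insertBits(uint32_t Dst, uint32_t Src, unsigned Width,
                           unsigned Off) {
  uint32_t Mask = ((1u << Width) - 1) << Off;
  return (Dst & ~Mask) | ((Src << Off) & Mask);
}
int main() {
  uint32_t W[4] = {0x1111, 0x2222, 0x3333, 0x4444}; // low 16 bits significant
  uint32_t P0 = insertBits(W[0], W[1], 16, 16);     // 0x22221111
  uint32_t P1 = insertBits(W[2], W[3], 16, 16);     // 0x44443333
  uint64_t WW = ((uint64_t)P1 << 32) | P0;          // the final COMBINE
  assert(WW == 0x4444333322221111ULL);
}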
@@ -2820,10 +2603,6 @@ SDValue
HexagonTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
SDValue Vec = Op.getOperand(0);
- MVT VecTy = ty(Vec);
- if (Subtarget.useHVXOps() && Subtarget.isHVXVectorType(VecTy))
- return LowerHvxExtractElement(Op, DAG);
-
MVT ElemTy = ty(Vec).getVectorElementType();
return extractVector(Vec, Op.getOperand(1), SDLoc(Op), ElemTy, ty(Op), DAG);
}
@@ -2831,31 +2610,20 @@ HexagonTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue
HexagonTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
SelectionDAG &DAG) const {
- SDValue Vec = Op.getOperand(0);
- MVT VecTy = ty(Vec);
- if (Subtarget.useHVXOps() && Subtarget.isHVXVectorType(VecTy))
- return LowerHvxExtractSubvector(Op, DAG);
-
- return extractVector(Vec, Op.getOperand(1), SDLoc(Op), ty(Op), ty(Op), DAG);
+ return extractVector(Op.getOperand(0), Op.getOperand(1), SDLoc(Op),
+ ty(Op), ty(Op), DAG);
}
SDValue
HexagonTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
- MVT VecTy = ty(Op);
- if (Subtarget.useHVXOps() && Subtarget.isHVXVectorType(VecTy))
- return LowerHvxInsertElement(Op, DAG);
-
return insertVector(Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
- SDLoc(Op), VecTy.getVectorElementType(), DAG);
+ SDLoc(Op), ty(Op).getVectorElementType(), DAG);
}
SDValue
HexagonTargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
SelectionDAG &DAG) const {
- if (Subtarget.useHVXOps() && Subtarget.isHVXVectorType(ty(Op)))
- return LowerHvxInsertSubvector(Op, DAG);
-
SDValue ValV = Op.getOperand(1);
return insertVector(Op.getOperand(0), ValV, Op.getOperand(2),
SDLoc(Op), ty(ValV), DAG);
@@ -2875,6 +2643,109 @@ HexagonTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
}
SDValue
+HexagonTargetLowering::LowerUnalignedLoad(SDValue Op, SelectionDAG &DAG)
+ const {
+ LoadSDNode *LN = cast<LoadSDNode>(Op.getNode());
+ unsigned HaveAlign = LN->getAlignment();
+ MVT LoadTy = ty(Op);
+ unsigned NeedAlign = Subtarget.getTypeAlignment(LoadTy);
+ if (HaveAlign >= NeedAlign)
+ return Op;
+
+ const SDLoc &dl(Op);
+ const DataLayout &DL = DAG.getDataLayout();
+ LLVMContext &Ctx = *DAG.getContext();
+ unsigned AS = LN->getAddressSpace();
+
+  // If load aligning is disabled or the load can be broken up into two
+ // smaller legal loads, do the default (target-independent) expansion.
+ bool DoDefault = false;
+ // Handle it in the default way if this is an indexed load.
+ if (!LN->isUnindexed())
+ DoDefault = true;
+
+ if (!AlignLoads) {
+ if (allowsMemoryAccess(Ctx, DL, LN->getMemoryVT(), AS, HaveAlign))
+ return Op;
+ DoDefault = true;
+ }
+ if (!DoDefault && 2*HaveAlign == NeedAlign) {
+ // The PartTy is the equivalent of "getLoadableTypeOfSize(HaveAlign)".
+ MVT PartTy = HaveAlign <= 8 ? MVT::getIntegerVT(8*HaveAlign)
+ : MVT::getVectorVT(MVT::i8, HaveAlign);
+ DoDefault = allowsMemoryAccess(Ctx, DL, PartTy, AS, HaveAlign);
+ }
+ if (DoDefault) {
+ std::pair<SDValue, SDValue> P = expandUnalignedLoad(LN, DAG);
+ return DAG.getMergeValues({P.first, P.second}, dl);
+ }
+
+  // The code below generates two loads, both aligned to NeedAlign and
+  // spaced NeedAlign apart. For the two loads to cover the
+ // bits that need to be loaded (and without overlapping), the size of
+ // the loads should be equal to NeedAlign. This is true for all loadable
+ // types, but add an assertion in case something changes in the future.
+ assert(LoadTy.getSizeInBits() == 8*NeedAlign);
+
+ unsigned LoadLen = NeedAlign;
+ SDValue Base = LN->getBasePtr();
+ SDValue Chain = LN->getChain();
+ auto BO = getBaseAndOffset(Base);
+ unsigned BaseOpc = BO.first.getOpcode();
+ if (BaseOpc == HexagonISD::VALIGNADDR && BO.second % LoadLen == 0)
+ return Op;
+
+ if (BO.second % LoadLen != 0) {
+ BO.first = DAG.getNode(ISD::ADD, dl, MVT::i32, BO.first,
+ DAG.getConstant(BO.second % LoadLen, dl, MVT::i32));
+ BO.second -= BO.second % LoadLen;
+ }
+ SDValue BaseNoOff = (BaseOpc != HexagonISD::VALIGNADDR)
+ ? DAG.getNode(HexagonISD::VALIGNADDR, dl, MVT::i32, BO.first,
+ DAG.getConstant(NeedAlign, dl, MVT::i32))
+ : BO.first;
+ SDValue Base0 = DAG.getMemBasePlusOffset(BaseNoOff, BO.second, dl);
+ SDValue Base1 = DAG.getMemBasePlusOffset(BaseNoOff, BO.second+LoadLen, dl);
+
+ MachineMemOperand *WideMMO = nullptr;
+ if (MachineMemOperand *MMO = LN->getMemOperand()) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ WideMMO = MF.getMachineMemOperand(MMO->getPointerInfo(), MMO->getFlags(),
+ 2*LoadLen, LoadLen, MMO->getAAInfo(), MMO->getRanges(),
+ MMO->getSyncScopeID(), MMO->getOrdering(),
+ MMO->getFailureOrdering());
+ }
+
+ SDValue Load0 = DAG.getLoad(LoadTy, dl, Chain, Base0, WideMMO);
+ SDValue Load1 = DAG.getLoad(LoadTy, dl, Chain, Base1, WideMMO);
+
+ SDValue Aligned = DAG.getNode(HexagonISD::VALIGN, dl, LoadTy,
+ {Load1, Load0, BaseNoOff.getOperand(0)});
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ Load0.getValue(1), Load1.getValue(1));
+ SDValue M = DAG.getMergeValues({Aligned, NewChain}, dl);
+ return M;
+}
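The expansion above replaces one unaligned load with two naturally aligned loads that together cover the requested bytes, plus a VALIGN that picks them out. The address arithmetic on a concrete example, not part of this patch:

// Not part of the patch: address arithmetic for the two covering loads.
#include <cassert>
#include <cstdint>
int main() {
  uint64_t P = 0x1006, NeedAlign = 8;  // unaligned 8-byte load at address P
  uint64_t Base0 = P & -NeedAlign;     // aligned-down base (what VALIGNADDR does)
  uint64_t Base1 = Base0 + NeedAlign;  // second covering load
  assert(Base0 == 0x1000 && Base1 == 0x1008);
  assert(P % NeedAlign == 6);          // VALIGN starts selecting at this byte
}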
+
+SDValue
+HexagonTargetLowering::LowerAddSubCarry(SDValue Op, SelectionDAG &DAG) const {
+ const SDLoc &dl(Op);
+ unsigned Opc = Op.getOpcode();
+ SDValue X = Op.getOperand(0), Y = Op.getOperand(1), C = Op.getOperand(2);
+
+ if (Opc == ISD::ADDCARRY)
+ return DAG.getNode(HexagonISD::ADDC, dl, Op.getNode()->getVTList(),
+ { X, Y, C });
+
+ EVT CarryTy = C.getValueType();
+ SDValue SubC = DAG.getNode(HexagonISD::SUBC, dl, Op.getNode()->getVTList(),
+ { X, Y, DAG.getLogicalNOT(dl, C, CarryTy) });
+ SDValue Out[] = { SubC.getValue(0),
+ DAG.getLogicalNOT(dl, SubC.getValue(1), CarryTy) };
+ return DAG.getMergeValues(Out, dl);
+}
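SUBCARRY is lowered via the usual two's-complement identity: X - Y - Bin equals X + ~Y + (1 - Bin), and the carry-out of that addition is the complement of the borrow-out, which is why both the incoming and outgoing carries are negated above. An exhaustive 8-bit check of the identity, not part of this patch:

// Not part of the patch: borrow/carry identity behind LowerAddSubCarry.
#include <cassert>
#include <cstdint>
int main() {
  for (unsigned X = 0; X != 256; ++X)
    for (unsigned Y = 0; Y != 256; ++Y)
      for (unsigned Bin = 0; Bin != 2; ++Bin) {
        unsigned Sum = X + (0xFF & ~Y) + (1 - Bin); // ADDC with inverted carry-in
        unsigned Borrow = (X < Y + Bin) ? 1 : 0;    // true borrow-out of X-Y-Bin
        assert((Sum & 0xFF) == ((X - Y - Bin) & 0xFF));
        assert(((Sum >> 8) & 1) == 1 - Borrow);     // carry-out == !borrow-out
      }
}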
+
+SDValue
HexagonTargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
SDValue Offset = Op.getOperand(1);
@@ -2904,6 +2775,17 @@ HexagonTargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
SDValue
HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
unsigned Opc = Op.getOpcode();
+
+ // Handle INLINEASM first.
+ if (Opc == ISD::INLINEASM)
+ return LowerINLINEASM(Op, DAG);
+
+ if (isHvxOperation(Op)) {
+ // If HVX lowering returns nothing, try the default lowering.
+ if (SDValue V = LowerHvxOperation(Op, DAG))
+ return V;
+ }
+
switch (Opc) {
default:
#ifndef NDEBUG
@@ -2919,13 +2801,17 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
+ case ISD::BITCAST: return LowerBITCAST(Op, DAG);
+ case ISD::LOAD: return LowerUnalignedLoad(Op, DAG);
+ case ISD::ADDCARRY:
+ case ISD::SUBCARRY: return LowerAddSubCarry(Op, DAG);
case ISD::SRA:
case ISD::SHL:
case ISD::SRL: return LowerVECTOR_SHIFT(Op, DAG);
+ case ISD::ROTL: return LowerROTL(Op, DAG);
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
- // Frame & Return address. Currently unimplemented.
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
@@ -2939,17 +2825,35 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::VSELECT: return LowerVSELECT(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
- case ISD::INLINEASM: return LowerINLINEASM(Op, DAG);
case ISD::PREFETCH: return LowerPREFETCH(Op, DAG);
case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG);
- case ISD::MUL:
- if (Subtarget.useHVXOps())
- return LowerHvxMul(Op, DAG);
break;
}
+
return SDValue();
}
+void
+HexagonTargetLowering::ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const {
+ const SDLoc &dl(N);
+ switch (N->getOpcode()) {
+ case ISD::SRL:
+ case ISD::SRA:
+ case ISD::SHL:
+ return;
+ case ISD::BITCAST:
+ // Handle a bitcast from v8i1 to i8.
+ if (N->getValueType(0) == MVT::i8) {
+ SDValue P = getInstr(Hexagon::C2_tfrpr, dl, MVT::i32,
+ N->getOperand(0), DAG);
+ Results.push_back(P);
+ }
+ break;
+ }
+}
+
/// Returns relocation base for the given PIC jumptable.
SDValue
HexagonTargetLowering::getPICJumpTableRelocBase(SDValue Table,
@@ -3023,7 +2927,7 @@ HexagonTargetLowering::getRegForInlineAsmConstraint(
case 512:
return {0u, &Hexagon::HvxVRRegClass};
case 1024:
- if (Subtarget.hasV60TOps() && Subtarget.useHVX128BOps())
+ if (Subtarget.hasV60Ops() && Subtarget.useHVX128BOps())
return {0u, &Hexagon::HvxVRRegClass};
return {0u, &Hexagon::HvxWRRegClass};
case 2048:
@@ -3042,7 +2946,7 @@ HexagonTargetLowering::getRegForInlineAsmConstraint(
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool HexagonTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
- return Subtarget.hasV5TOps();
+ return Subtarget.hasV5Ops();
}
/// isLegalAddressingMode - Return true if the addressing mode represented by
@@ -3104,9 +3008,9 @@ bool HexagonTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
bool HexagonTargetLowering::IsEligibleForTailCallOptimization(
SDValue Callee,
CallingConv::ID CalleeCC,
- bool isVarArg,
- bool isCalleeStructRet,
- bool isCallerStructRet,
+ bool IsVarArg,
+ bool IsCalleeStructRet,
+ bool IsCallerStructRet,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -3137,12 +3041,12 @@ bool HexagonTargetLowering::IsEligibleForTailCallOptimization(
}
// Do not tail call optimize vararg calls.
- if (isVarArg)
+ if (IsVarArg)
return false;
// Also avoid tail call optimization if either caller or callee uses struct
// return semantics.
- if (isCalleeStructRet || isCallerStructRet)
+ if (IsCalleeStructRet || IsCallerStructRet)
return false;
// In addition to the cases above, we also disable Tail Call Optimization if
@@ -3185,54 +3089,25 @@ bool HexagonTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
unsigned AS, unsigned Align, bool *Fast) const {
if (Fast)
*Fast = false;
-
- switch (VT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::v64i8:
- case MVT::v128i8:
- case MVT::v256i8:
- case MVT::v32i16:
- case MVT::v64i16:
- case MVT::v128i16:
- case MVT::v16i32:
- case MVT::v32i32:
- case MVT::v64i32:
- return true;
- }
- return false;
+ return Subtarget.isHVXVectorType(VT.getSimpleVT());
}
std::pair<const TargetRegisterClass*, uint8_t>
HexagonTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
MVT VT) const {
- const TargetRegisterClass *RRC = nullptr;
+ if (Subtarget.isHVXVectorType(VT, true)) {
+ unsigned BitWidth = VT.getSizeInBits();
+ unsigned VecWidth = Subtarget.getVectorLength() * 8;
- uint8_t Cost = 1;
- switch (VT.SimpleTy) {
- default:
- return TargetLowering::findRepresentativeClass(TRI, VT);
- case MVT::v64i8:
- case MVT::v32i16:
- case MVT::v16i32:
- RRC = &Hexagon::HvxVRRegClass;
- break;
- case MVT::v128i8:
- case MVT::v64i16:
- case MVT::v32i32:
- if (Subtarget.hasV60TOps() && Subtarget.useHVXOps() &&
- Subtarget.useHVX128BOps())
- RRC = &Hexagon::HvxVRRegClass;
- else
- RRC = &Hexagon::HvxWRRegClass;
- break;
- case MVT::v256i8:
- case MVT::v128i16:
- case MVT::v64i32:
- RRC = &Hexagon::HvxWRRegClass;
- break;
+ if (VT.getVectorElementType() == MVT::i1)
+ return std::make_pair(&Hexagon::HvxQRRegClass, 1);
+ if (BitWidth == VecWidth)
+ return std::make_pair(&Hexagon::HvxVRRegClass, 1);
+ assert(BitWidth == 2 * VecWidth);
+ return std::make_pair(&Hexagon::HvxWRRegClass, 1);
}
- return std::make_pair(RRC, Cost);
+
+ return TargetLowering::findRepresentativeClass(TRI, VT);
}
Value *HexagonTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h
index 732834b464b4..3d94bd1ff6ed 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/lib/Target/Hexagon/HexagonISelLowering.h
@@ -18,12 +18,12 @@
#include "Hexagon.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/ISDOpcodes.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/InlineAsm.h"
+#include "llvm/Support/MachineValueType.h"
#include <cstdint>
#include <utility>
@@ -36,6 +36,8 @@ namespace HexagonISD {
CONST32 = OP_BEGIN,
CONST32_GP, // For marking data present in GP.
+ ADDC, // Add with carry: (X, Y, Cin) -> (X+Y, Cout).
+ SUBC, // Sub with carry: (X, Y, Cin) -> (X+~Y+Cin, Cout).
ALLOCA,
AT_GOT, // Index in GOT.
@@ -51,18 +53,15 @@ namespace HexagonISD {
CP, // Constant pool.
COMBINE,
- VSPLAT,
+ VSPLAT, // Generic splat, selection depends on argument/return
+ // types.
VASL,
VASR,
VLSR,
+ TSTBIT,
INSERT,
- INSERTRP,
EXTRACTU,
- EXTRACTURP,
- VCOMBINE,
- VPACKE,
- VPACKO,
VEXTRACTW,
VINSERTW0,
VROR,
@@ -70,8 +69,24 @@ namespace HexagonISD {
EH_RETURN,
DCFETCH,
READCYCLE,
+ D2P, // Convert 8-byte value to 8-bit predicate register. [*]
+ P2D, // Convert 8-bit predicate register to 8-byte value. [*]
+ V2Q, // Convert HVX vector to a vector predicate reg. [*]
+ Q2V, // Convert vector predicate to an HVX vector. [*]
+ // [*] The equivalence is defined as "Q <=> (V != 0)",
+ // where the != operation compares bytes.
+ // Note: V != 0 is implemented as V >u 0.
+ QCAT,
+ QTRUE,
+ QFALSE,
VZERO,
-
+ VSPLATW, // HVX splat of a 32-bit word with an arbitrary result type.
+ TYPECAST, // No-op that's used to convert between different legal
+ // types in a register.
+ VALIGN, // Align two vectors (in Op0, Op1) to one that would have
+ // been loaded from address in Op2.
+ VALIGNADDR, // Align vector address: Op0 & -Op1, except when it is
+ // an address in a vector load, then it's a no-op.
OP_END
};
@@ -110,6 +125,10 @@ namespace HexagonISD {
bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
bool isTruncateFree(EVT VT1, EVT VT2) const override;
+ bool isCheapToSpeculateCttz() const override { return true; }
+ bool isCheapToSpeculateCtlz() const override { return true; }
+ bool isCtlzFast() const override { return true; }
+
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
/// Return true if an FMA operation is faster than a pair of mul and add
@@ -127,6 +146,9 @@ namespace HexagonISD {
const override;
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+ void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const override;
+
const char *getTargetNodeName(unsigned Opcode) const override;
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
@@ -137,6 +159,13 @@ namespace HexagonISD {
SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHIFT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerANY_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerUnalignedLoad(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerAddSubCarry(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const;
@@ -284,6 +313,9 @@ namespace HexagonISD {
}
private:
+ void initializeHVXLowering();
+ std::pair<SDValue,int> getBaseAndOffset(SDValue Addr) const;
+
bool getBuildVectorConstInts(ArrayRef<SDValue> Values, MVT VecTy,
SelectionDAG &DAG,
MutableArrayRef<ConstantInt*> Consts) const;
@@ -295,13 +327,19 @@ namespace HexagonISD {
MVT ValTy, MVT ResTy, SelectionDAG &DAG) const;
SDValue insertVector(SDValue VecV, SDValue ValV, SDValue IdxV,
const SDLoc &dl, MVT ValTy, SelectionDAG &DAG) const;
+ SDValue expandPredicate(SDValue Vec32, const SDLoc &dl,
+ SelectionDAG &DAG) const;
+ SDValue contractPredicate(SDValue Vec64, const SDLoc &dl,
+ SelectionDAG &DAG) const;
+ SDValue getVectorShiftByInt(SDValue Op, SelectionDAG &DAG) const;
+
bool isUndef(SDValue Op) const {
if (Op.isMachineOpcode())
return Op.getMachineOpcode() == TargetOpcode::IMPLICIT_DEF;
return Op.getOpcode() == ISD::UNDEF;
}
- SDValue getNode(unsigned MachineOpc, const SDLoc &dl, MVT Ty,
- ArrayRef<SDValue> Ops, SelectionDAG &DAG) const {
+ SDValue getInstr(unsigned MachineOpc, const SDLoc &dl, MVT Ty,
+ ArrayRef<SDValue> Ops, SelectionDAG &DAG) const {
SDNode *N = DAG.getMachineNode(MachineOpc, dl, Ty, Ops);
return SDValue(N, 0);
}
@@ -328,7 +366,8 @@ namespace HexagonISD {
MVT tyVector(MVT Ty, MVT ElemTy) const {
if (Ty.isVector() && Ty.getVectorElementType() == ElemTy)
return Ty;
- unsigned TyWidth = Ty.getSizeInBits(), ElemWidth = ElemTy.getSizeInBits();
+ unsigned TyWidth = Ty.getSizeInBits();
+ unsigned ElemWidth = ElemTy.getSizeInBits();
assert((TyWidth % ElemWidth) == 0);
return MVT::getVectorVT(ElemTy, TyWidth/ElemWidth);
}
@@ -343,31 +382,66 @@ namespace HexagonISD {
VectorPair opSplit(SDValue Vec, const SDLoc &dl, SelectionDAG &DAG) const;
SDValue opCastElem(SDValue Vec, MVT ElemTy, SelectionDAG &DAG) const;
+ bool isHvxSingleTy(MVT Ty) const;
+ bool isHvxPairTy(MVT Ty) const;
SDValue convertToByteIndex(SDValue ElemIdx, MVT ElemTy,
SelectionDAG &DAG) const;
SDValue getIndexInWord32(SDValue Idx, MVT ElemTy, SelectionDAG &DAG) const;
SDValue getByteShuffle(const SDLoc &dl, SDValue Op0, SDValue Op1,
ArrayRef<int> Mask, SelectionDAG &DAG) const;
- MVT getVecBoolVT() const;
-
- SDValue buildHvxVectorSingle(ArrayRef<SDValue> Values, const SDLoc &dl,
- MVT VecTy, SelectionDAG &DAG) const;
+ SDValue buildHvxVectorReg(ArrayRef<SDValue> Values, const SDLoc &dl,
+ MVT VecTy, SelectionDAG &DAG) const;
SDValue buildHvxVectorPred(ArrayRef<SDValue> Values, const SDLoc &dl,
MVT VecTy, SelectionDAG &DAG) const;
+ SDValue createHvxPrefixPred(SDValue PredV, const SDLoc &dl,
+ unsigned BitBytes, bool ZeroFill,
+ SelectionDAG &DAG) const;
+ SDValue extractHvxElementReg(SDValue VecV, SDValue IdxV, const SDLoc &dl,
+ MVT ResTy, SelectionDAG &DAG) const;
+ SDValue extractHvxElementPred(SDValue VecV, SDValue IdxV, const SDLoc &dl,
+ MVT ResTy, SelectionDAG &DAG) const;
+ SDValue insertHvxElementReg(SDValue VecV, SDValue IdxV, SDValue ValV,
+ const SDLoc &dl, SelectionDAG &DAG) const;
+ SDValue insertHvxElementPred(SDValue VecV, SDValue IdxV, SDValue ValV,
+ const SDLoc &dl, SelectionDAG &DAG) const;
+ SDValue extractHvxSubvectorReg(SDValue VecV, SDValue IdxV, const SDLoc &dl,
+ MVT ResTy, SelectionDAG &DAG) const;
+ SDValue extractHvxSubvectorPred(SDValue VecV, SDValue IdxV, const SDLoc &dl,
+ MVT ResTy, SelectionDAG &DAG) const;
+ SDValue insertHvxSubvectorReg(SDValue VecV, SDValue SubV, SDValue IdxV,
+ const SDLoc &dl, SelectionDAG &DAG) const;
+ SDValue insertHvxSubvectorPred(SDValue VecV, SDValue SubV, SDValue IdxV,
+ const SDLoc &dl, SelectionDAG &DAG) const;
+ SDValue extendHvxVectorPred(SDValue VecV, const SDLoc &dl, MVT ResTy,
+ bool ZeroExt, SelectionDAG &DAG) const;
SDValue LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxConcatVectors(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxExtractElement(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxInsertElement(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxExtractSubvector(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxInsertSubvector(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerHvxAnyExt(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxSignExt(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxZeroExt(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxCttz(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxMul(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxMulh(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxSetCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxExtend(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxShift(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue SplitHvxPairOp(SDValue Op, SelectionDAG &DAG) const;
+ SDValue SplitHvxMemOp(SDValue Op, SelectionDAG &DAG) const;
std::pair<const TargetRegisterClass*, uint8_t>
findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT)
const override;
+
+ bool isHvxOperation(SDValue Op) const;
+ SDValue LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const;
};
} // end namespace llvm
diff --git a/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
index 51480d09d734..2566194ca9c6 100644
--- a/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -10,9 +10,192 @@
#include "HexagonISelLowering.h"
#include "HexagonRegisterInfo.h"
#include "HexagonSubtarget.h"
+#include "llvm/Support/CommandLine.h"
using namespace llvm;
+static const MVT LegalV64[] = { MVT::v64i8, MVT::v32i16, MVT::v16i32 };
+static const MVT LegalW64[] = { MVT::v128i8, MVT::v64i16, MVT::v32i32 };
+static const MVT LegalV128[] = { MVT::v128i8, MVT::v64i16, MVT::v32i32 };
+static const MVT LegalW128[] = { MVT::v256i8, MVT::v128i16, MVT::v64i32 };
+
+
+void
+HexagonTargetLowering::initializeHVXLowering() {
+ if (Subtarget.useHVX64BOps()) {
+ addRegisterClass(MVT::v64i8, &Hexagon::HvxVRRegClass);
+ addRegisterClass(MVT::v32i16, &Hexagon::HvxVRRegClass);
+ addRegisterClass(MVT::v16i32, &Hexagon::HvxVRRegClass);
+ addRegisterClass(MVT::v128i8, &Hexagon::HvxWRRegClass);
+ addRegisterClass(MVT::v64i16, &Hexagon::HvxWRRegClass);
+ addRegisterClass(MVT::v32i32, &Hexagon::HvxWRRegClass);
+ // These "short" boolean vector types should be legal because
+ // they will appear as results of vector compares. If they were
+ // not legal, type legalization would try to make them legal
+ // and that would require using operations that do not use or
+ // produce such types. That, in turn, would imply using custom
+ // nodes, which would be unoptimizable by the DAG combiner.
+ // The idea is to rely on target-independent operations as much
+ // as possible.
+ addRegisterClass(MVT::v16i1, &Hexagon::HvxQRRegClass);
+ addRegisterClass(MVT::v32i1, &Hexagon::HvxQRRegClass);
+ addRegisterClass(MVT::v64i1, &Hexagon::HvxQRRegClass);
+ addRegisterClass(MVT::v512i1, &Hexagon::HvxQRRegClass);
+ } else if (Subtarget.useHVX128BOps()) {
+ addRegisterClass(MVT::v128i8, &Hexagon::HvxVRRegClass);
+ addRegisterClass(MVT::v64i16, &Hexagon::HvxVRRegClass);
+ addRegisterClass(MVT::v32i32, &Hexagon::HvxVRRegClass);
+ addRegisterClass(MVT::v256i8, &Hexagon::HvxWRRegClass);
+ addRegisterClass(MVT::v128i16, &Hexagon::HvxWRRegClass);
+ addRegisterClass(MVT::v64i32, &Hexagon::HvxWRRegClass);
+ addRegisterClass(MVT::v32i1, &Hexagon::HvxQRRegClass);
+ addRegisterClass(MVT::v64i1, &Hexagon::HvxQRRegClass);
+ addRegisterClass(MVT::v128i1, &Hexagon::HvxQRRegClass);
+ addRegisterClass(MVT::v1024i1, &Hexagon::HvxQRRegClass);
+ }
+
+ // Set up operation actions.
+
+ bool Use64b = Subtarget.useHVX64BOps();
+ ArrayRef<MVT> LegalV = Use64b ? LegalV64 : LegalV128;
+ ArrayRef<MVT> LegalW = Use64b ? LegalW64 : LegalW128;
+ MVT ByteV = Use64b ? MVT::v64i8 : MVT::v128i8;
+ MVT ByteW = Use64b ? MVT::v128i8 : MVT::v256i8;
+
+ auto setPromoteTo = [this] (unsigned Opc, MVT FromTy, MVT ToTy) {
+ setOperationAction(Opc, FromTy, Promote);
+ AddPromotedToType(Opc, FromTy, ToTy);
+ };
+
+ setOperationAction(ISD::VECTOR_SHUFFLE, ByteV, Legal);
+ setOperationAction(ISD::VECTOR_SHUFFLE, ByteW, Legal);
+
+ for (MVT T : LegalV) {
+ setIndexedLoadAction(ISD::POST_INC, T, Legal);
+ setIndexedStoreAction(ISD::POST_INC, T, Legal);
+
+ setOperationAction(ISD::AND, T, Legal);
+ setOperationAction(ISD::OR, T, Legal);
+ setOperationAction(ISD::XOR, T, Legal);
+ setOperationAction(ISD::ADD, T, Legal);
+ setOperationAction(ISD::SUB, T, Legal);
+ setOperationAction(ISD::CTPOP, T, Legal);
+ setOperationAction(ISD::CTLZ, T, Legal);
+ if (T != ByteV) {
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, T, Legal);
+ setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, T, Legal);
+ setOperationAction(ISD::BSWAP, T, Legal);
+ }
+
+ setOperationAction(ISD::CTTZ, T, Custom);
+ setOperationAction(ISD::LOAD, T, Custom);
+ setOperationAction(ISD::MUL, T, Custom);
+ setOperationAction(ISD::MULHS, T, Custom);
+ setOperationAction(ISD::MULHU, T, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, T, Custom);
+ // Make concat-vectors custom to handle concats of more than 2 vectors.
+ setOperationAction(ISD::CONCAT_VECTORS, T, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, T, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, T, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, T, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, T, Custom);
+ setOperationAction(ISD::ANY_EXTEND, T, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, T, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, T, Custom);
+ if (T != ByteV) {
+ setOperationAction(ISD::ANY_EXTEND_VECTOR_INREG, T, Custom);
+ // HVX only has shifts of words and halfwords.
+ setOperationAction(ISD::SRA, T, Custom);
+ setOperationAction(ISD::SHL, T, Custom);
+ setOperationAction(ISD::SRL, T, Custom);
+
+ // Promote all shuffles to operate on vectors of bytes.
+ setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteV);
+ }
+
+ setCondCodeAction(ISD::SETNE, T, Expand);
+ setCondCodeAction(ISD::SETLE, T, Expand);
+ setCondCodeAction(ISD::SETGE, T, Expand);
+ setCondCodeAction(ISD::SETLT, T, Expand);
+ setCondCodeAction(ISD::SETULE, T, Expand);
+ setCondCodeAction(ISD::SETUGE, T, Expand);
+ setCondCodeAction(ISD::SETULT, T, Expand);
+ }
+
+ for (MVT T : LegalW) {
+ // Custom-lower BUILD_VECTOR for vector pairs. The standard (target-
+ // independent) handling of it would convert it to a load, which is
+ // not always the optimal choice.
+ setOperationAction(ISD::BUILD_VECTOR, T, Custom);
+ // Make concat-vectors custom to handle concats of more than 2 vectors.
+ setOperationAction(ISD::CONCAT_VECTORS, T, Custom);
+
+ // Custom-lower these operations for pairs. Expand them into a concat
+ // of the corresponding operations on individual vectors.
+ setOperationAction(ISD::ANY_EXTEND, T, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, T, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, T, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, T, Custom);
+ setOperationAction(ISD::ANY_EXTEND_VECTOR_INREG, T, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, T, Legal);
+ setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, T, Legal);
+
+ setOperationAction(ISD::LOAD, T, Custom);
+ setOperationAction(ISD::STORE, T, Custom);
+ setOperationAction(ISD::CTLZ, T, Custom);
+ setOperationAction(ISD::CTTZ, T, Custom);
+ setOperationAction(ISD::CTPOP, T, Custom);
+
+ setOperationAction(ISD::ADD, T, Legal);
+ setOperationAction(ISD::SUB, T, Legal);
+ setOperationAction(ISD::MUL, T, Custom);
+ setOperationAction(ISD::MULHS, T, Custom);
+ setOperationAction(ISD::MULHU, T, Custom);
+ setOperationAction(ISD::AND, T, Custom);
+ setOperationAction(ISD::OR, T, Custom);
+ setOperationAction(ISD::XOR, T, Custom);
+ setOperationAction(ISD::SETCC, T, Custom);
+ setOperationAction(ISD::VSELECT, T, Custom);
+ if (T != ByteW) {
+ setOperationAction(ISD::SRA, T, Custom);
+ setOperationAction(ISD::SHL, T, Custom);
+ setOperationAction(ISD::SRL, T, Custom);
+
+ // Promote all shuffles to operate on vectors of bytes.
+ setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteW);
+ }
+ }
+
+ // Boolean vectors.
+
+ for (MVT T : LegalW) {
+ // Boolean types for vector pairs will overlap with the boolean
+ // types for single vectors, e.g.
+ // v64i8 -> v64i1 (single)
+ // v64i16 -> v64i1 (pair)
+ // Set these actions first, and allow the single actions to overwrite
+ // any duplicates.
+ MVT BoolW = MVT::getVectorVT(MVT::i1, T.getVectorNumElements());
+ setOperationAction(ISD::SETCC, BoolW, Custom);
+ setOperationAction(ISD::AND, BoolW, Custom);
+ setOperationAction(ISD::OR, BoolW, Custom);
+ setOperationAction(ISD::XOR, BoolW, Custom);
+ }
+
+ for (MVT T : LegalV) {
+ MVT BoolV = MVT::getVectorVT(MVT::i1, T.getVectorNumElements());
+ setOperationAction(ISD::BUILD_VECTOR, BoolV, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, BoolV, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, BoolV, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, BoolV, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, BoolV, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, BoolV, Custom);
+ setOperationAction(ISD::AND, BoolV, Legal);
+ setOperationAction(ISD::OR, BoolV, Legal);
+ setOperationAction(ISD::XOR, BoolV, Legal);
+ }
+}
+
SDValue
HexagonTargetLowering::getInt(unsigned IntId, MVT ResTy, ArrayRef<SDValue> Ops,
const SDLoc &dl, SelectionDAG &DAG) const {
@@ -75,9 +258,23 @@ HexagonTargetLowering::VectorPair
HexagonTargetLowering::opSplit(SDValue Vec, const SDLoc &dl,
SelectionDAG &DAG) const {
TypePair Tys = typeSplit(ty(Vec));
+ if (Vec.getOpcode() == HexagonISD::QCAT)
+ return VectorPair(Vec.getOperand(0), Vec.getOperand(1));
return DAG.SplitVector(Vec, dl, Tys.first, Tys.second);
}
+bool
+HexagonTargetLowering::isHvxSingleTy(MVT Ty) const {
+ return Subtarget.isHVXVectorType(Ty) &&
+ Ty.getSizeInBits() == 8 * Subtarget.getVectorLength();
+}
+
+bool
+HexagonTargetLowering::isHvxPairTy(MVT Ty) const {
+ return Subtarget.isHVXVectorType(Ty) &&
+ Ty.getSizeInBits() == 16 * Subtarget.getVectorLength();
+}
+
SDValue
HexagonTargetLowering::convertToByteIndex(SDValue ElemIdx, MVT ElemTy,
SelectionDAG &DAG) const {
@@ -141,36 +338,16 @@ HexagonTargetLowering::getByteShuffle(const SDLoc &dl, SDValue Op0,
opCastElem(Op1, MVT::i8, DAG), ByteMask);
}
-MVT
-HexagonTargetLowering::getVecBoolVT() const {
- return MVT::getVectorVT(MVT::i1, 8*Subtarget.getVectorLength());
-}
-
SDValue
-HexagonTargetLowering::buildHvxVectorSingle(ArrayRef<SDValue> Values,
- const SDLoc &dl, MVT VecTy,
- SelectionDAG &DAG) const {
+HexagonTargetLowering::buildHvxVectorReg(ArrayRef<SDValue> Values,
+ const SDLoc &dl, MVT VecTy,
+ SelectionDAG &DAG) const {
unsigned VecLen = Values.size();
MachineFunction &MF = DAG.getMachineFunction();
MVT ElemTy = VecTy.getVectorElementType();
unsigned ElemWidth = ElemTy.getSizeInBits();
unsigned HwLen = Subtarget.getVectorLength();
- SmallVector<ConstantInt*, 128> Consts(VecLen);
- bool AllConst = getBuildVectorConstInts(Values, VecTy, DAG, Consts);
- if (AllConst) {
- if (llvm::all_of(Consts, [](ConstantInt *CI) { return CI->isZero(); }))
- return getZero(dl, VecTy, DAG);
-
- ArrayRef<Constant*> Tmp((Constant**)Consts.begin(),
- (Constant**)Consts.end());
- Constant *CV = ConstantVector::get(Tmp);
- unsigned Align = HwLen;
- SDValue CP = LowerConstantPool(DAG.getConstantPool(CV, VecTy, Align), DAG);
- return DAG.getLoad(VecTy, dl, DAG.getEntryNode(), CP,
- MachinePointerInfo::getConstantPool(MF), Align);
- }
-
unsigned ElemSize = ElemWidth / 8;
assert(ElemSize*VecLen == HwLen);
SmallVector<SDValue,32> Words;
@@ -187,12 +364,47 @@ HexagonTargetLowering::buildHvxVectorSingle(ArrayRef<SDValue> Values,
Words.assign(Values.begin(), Values.end());
}
+ unsigned NumWords = Words.size();
+ bool IsSplat = true, IsUndef = true;
+ SDValue SplatV;
+ for (unsigned i = 0; i != NumWords && IsSplat; ++i) {
+ if (isUndef(Words[i]))
+ continue;
+ IsUndef = false;
+ if (!SplatV.getNode())
+ SplatV = Words[i];
+ else if (SplatV != Words[i])
+ IsSplat = false;
+ }
+ if (IsUndef)
+ return DAG.getUNDEF(VecTy);
+ if (IsSplat) {
+ assert(SplatV.getNode());
+ auto *IdxN = dyn_cast<ConstantSDNode>(SplatV.getNode());
+ if (IdxN && IdxN->isNullValue())
+ return getZero(dl, VecTy, DAG);
+ return DAG.getNode(HexagonISD::VSPLATW, dl, VecTy, SplatV);
+ }
+
+ // Delay recognizing constant vectors until here, so that we can generate
+ // a vsplat.
+ SmallVector<ConstantInt*, 128> Consts(VecLen);
+ bool AllConst = getBuildVectorConstInts(Values, VecTy, DAG, Consts);
+ if (AllConst) {
+ ArrayRef<Constant*> Tmp((Constant**)Consts.begin(),
+ (Constant**)Consts.end());
+ Constant *CV = ConstantVector::get(Tmp);
+ unsigned Align = HwLen;
+ SDValue CP = LowerConstantPool(DAG.getConstantPool(CV, VecTy, Align), DAG);
+ return DAG.getLoad(VecTy, dl, DAG.getEntryNode(), CP,
+ MachinePointerInfo::getConstantPool(MF), Align);
+ }
+
// Construct two halves in parallel, then or them together.
assert(4*Words.size() == Subtarget.getVectorLength());
- SDValue HalfV0 = getNode(Hexagon::V6_vd0, dl, VecTy, {}, DAG);
- SDValue HalfV1 = getNode(Hexagon::V6_vd0, dl, VecTy, {}, DAG);
+ SDValue HalfV0 = getInstr(Hexagon::V6_vd0, dl, VecTy, {}, DAG);
+ SDValue HalfV1 = getInstr(Hexagon::V6_vd0, dl, VecTy, {}, DAG);
SDValue S = DAG.getConstant(4, dl, MVT::i32);
- unsigned NumWords = Words.size();
for (unsigned i = 0; i != NumWords/2; ++i) {
SDValue N = DAG.getNode(HexagonISD::VINSERTW0, dl, VecTy,
{HalfV0, Words[i]});
@@ -209,6 +421,95 @@ HexagonTargetLowering::buildHvxVectorSingle(ArrayRef<SDValue> Values,
}
SDValue
+HexagonTargetLowering::createHvxPrefixPred(SDValue PredV, const SDLoc &dl,
+ unsigned BitBytes, bool ZeroFill, SelectionDAG &DAG) const {
+ MVT PredTy = ty(PredV);
+ unsigned HwLen = Subtarget.getVectorLength();
+ MVT ByteTy = MVT::getVectorVT(MVT::i8, HwLen);
+
+ if (Subtarget.isHVXVectorType(PredTy, true)) {
+    // Move the vector predicate PredV to a vector register, and scale it
+    // down to match the representation (bytes per type element) that the
+    // caller's wider vector uses. The scaling down will pick every 2nd or
+    // 4th (every Scale-th in general) element and put them at the front of
+    // the resulting vector. The caller can then insert this prefix into its
+    // own byte vector (e.g. the Q2V of a target vector). To avoid having an
+    // operation that generates an illegal type (short vector), generate a
+    // full size vector.
+    //
+ SDValue T = DAG.getNode(HexagonISD::Q2V, dl, ByteTy, PredV);
+ SmallVector<int,128> Mask(HwLen);
+ // Scale = BitBytes(PredV) / Given BitBytes.
+ unsigned Scale = HwLen / (PredTy.getVectorNumElements() * BitBytes);
+ unsigned BlockLen = PredTy.getVectorNumElements() * BitBytes;
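+    // E.g. with HwLen = 64, a v16i1 predicate and BitBytes = 2:
+    // Scale = 2 and BlockLen = 32, so the mask built below becomes
+    // (0, 2, ..., 62, 1, 3, ..., 63), i.e. every Scale-th byte of T is
+    // moved to the front BlockLen positions of the shuffle result.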
+
+ for (unsigned i = 0; i != HwLen; ++i) {
+ unsigned Num = i % Scale;
+ unsigned Off = i / Scale;
+ Mask[BlockLen*Num + Off] = i;
+ }
+ SDValue S = DAG.getVectorShuffle(ByteTy, dl, T, DAG.getUNDEF(ByteTy), Mask);
+ if (!ZeroFill)
+ return S;
+ // Fill the bytes beyond BlockLen with 0s.
+ MVT BoolTy = MVT::getVectorVT(MVT::i1, HwLen);
+ SDValue Q = getInstr(Hexagon::V6_pred_scalar2, dl, BoolTy,
+ {DAG.getConstant(BlockLen, dl, MVT::i32)}, DAG);
+ SDValue M = DAG.getNode(HexagonISD::Q2V, dl, ByteTy, Q);
+ return DAG.getNode(ISD::AND, dl, ByteTy, S, M);
+ }
+
+ // Make sure that this is a valid scalar predicate.
+ assert(PredTy == MVT::v2i1 || PredTy == MVT::v4i1 || PredTy == MVT::v8i1);
+
+ unsigned Bytes = 8 / PredTy.getVectorNumElements();
+ SmallVector<SDValue,4> Words[2];
+ unsigned IdxW = 0;
+
+ auto Lo32 = [&DAG, &dl] (SDValue P) {
+ return DAG.getTargetExtractSubreg(Hexagon::isub_lo, dl, MVT::i32, P);
+ };
+ auto Hi32 = [&DAG, &dl] (SDValue P) {
+ return DAG.getTargetExtractSubreg(Hexagon::isub_hi, dl, MVT::i32, P);
+ };
+
+ SDValue W0 = isUndef(PredV)
+ ? DAG.getUNDEF(MVT::i64)
+ : DAG.getNode(HexagonISD::P2D, dl, MVT::i64, PredV);
+ Words[IdxW].push_back(Hi32(W0));
+ Words[IdxW].push_back(Lo32(W0));
+
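+  // Each pass below doubles the number of bytes per original predicate bit.
+  // E.g. a v4i1 predicate starts at Bytes = 2; with BitBytes = 8 it takes
+  // two rounds (2 -> 4 -> 8): the first expands each word via
+  // expandPredicate, the second simply duplicates the words.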
+ while (Bytes < BitBytes) {
+ IdxW ^= 1;
+ Words[IdxW].clear();
+
+ if (Bytes < 4) {
+ for (const SDValue &W : Words[IdxW ^ 1]) {
+ SDValue T = expandPredicate(W, dl, DAG);
+ Words[IdxW].push_back(Hi32(T));
+ Words[IdxW].push_back(Lo32(T));
+ }
+ } else {
+ for (const SDValue &W : Words[IdxW ^ 1]) {
+ Words[IdxW].push_back(W);
+ Words[IdxW].push_back(W);
+ }
+ }
+ Bytes *= 2;
+ }
+
+ assert(Bytes == BitBytes);
+
+ SDValue Vec = ZeroFill ? getZero(dl, ByteTy, DAG) : DAG.getUNDEF(ByteTy);
+ SDValue S4 = DAG.getConstant(HwLen-4, dl, MVT::i32);
+ for (const SDValue &W : Words[IdxW]) {
+ Vec = DAG.getNode(HexagonISD::VROR, dl, ByteTy, Vec, S4);
+ Vec = DAG.getNode(HexagonISD::VINSERTW0, dl, ByteTy, Vec, W);
+ }
+
+ return Vec;
+}
+
+SDValue
HexagonTargetLowering::buildHvxVectorPred(ArrayRef<SDValue> Values,
const SDLoc &dl, MVT VecTy,
SelectionDAG &DAG) const {
@@ -218,6 +519,18 @@ HexagonTargetLowering::buildHvxVectorPred(ArrayRef<SDValue> Values,
unsigned HwLen = Subtarget.getVectorLength();
assert(VecLen <= HwLen || VecLen == 8*HwLen);
SmallVector<SDValue,128> Bytes;
+ bool AllT = true, AllF = true;
+
+ auto IsTrue = [] (SDValue V) {
+ if (const auto *N = dyn_cast<ConstantSDNode>(V.getNode()))
+ return !N->isNullValue();
+ return false;
+ };
+ auto IsFalse = [] (SDValue V) {
+ if (const auto *N = dyn_cast<ConstantSDNode>(V.getNode()))
+ return N->isNullValue();
+ return false;
+ };
if (VecLen <= HwLen) {
// In the hardware, each bit of a vector predicate corresponds to a byte
@@ -226,8 +539,11 @@ HexagonTargetLowering::buildHvxVectorPred(ArrayRef<SDValue> Values,
assert(HwLen % VecLen == 0);
unsigned BitBytes = HwLen / VecLen;
for (SDValue V : Values) {
+ AllT &= IsTrue(V);
+ AllF &= IsFalse(V);
+
SDValue Ext = !V.isUndef() ? DAG.getZExtOrTrunc(V, dl, MVT::i8)
- : DAG.getConstant(0, dl, MVT::i8);
+ : DAG.getUNDEF(MVT::i8);
for (unsigned B = 0; B != BitBytes; ++B)
Bytes.push_back(Ext);
}
@@ -243,8 +559,11 @@ HexagonTargetLowering::buildHvxVectorPred(ArrayRef<SDValue> Values,
break;
}
SDValue F = Values[I+B];
+ AllT &= IsTrue(F);
+ AllF &= IsFalse(F);
+
SDValue Ext = (B < 8) ? DAG.getZExtOrTrunc(F, dl, MVT::i8)
- : DAG.getConstant(0, dl, MVT::i8);
+ : DAG.getUNDEF(MVT::i8);
Bytes.push_back(Ext);
// Verify that the rest of values in the group are the same as the
// first.
@@ -253,53 +572,25 @@ HexagonTargetLowering::buildHvxVectorPred(ArrayRef<SDValue> Values,
}
}
- MVT ByteTy = MVT::getVectorVT(MVT::i8, HwLen);
- SDValue ByteVec = buildHvxVectorSingle(Bytes, dl, ByteTy, DAG);
- SDValue Cmp = DAG.getSetCC(dl, VecTy, ByteVec, getZero(dl, ByteTy, DAG),
- ISD::SETUGT);
- return Cmp;
-}
-
-SDValue
-HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG)
- const {
- const SDLoc &dl(Op);
- MVT VecTy = ty(Op);
-
- unsigned Size = Op.getNumOperands();
- SmallVector<SDValue,128> Ops;
- for (unsigned i = 0; i != Size; ++i)
- Ops.push_back(Op.getOperand(i));
-
- if (VecTy.getVectorElementType() == MVT::i1)
- return buildHvxVectorPred(Ops, dl, VecTy, DAG);
+ if (AllT)
+ return DAG.getNode(HexagonISD::QTRUE, dl, VecTy);
+ if (AllF)
+ return DAG.getNode(HexagonISD::QFALSE, dl, VecTy);
- if (VecTy.getSizeInBits() == 16*Subtarget.getVectorLength()) {
- ArrayRef<SDValue> A(Ops);
- MVT SingleTy = typeSplit(VecTy).first;
- SDValue V0 = buildHvxVectorSingle(A.take_front(Size/2), dl, SingleTy, DAG);
- SDValue V1 = buildHvxVectorSingle(A.drop_front(Size/2), dl, SingleTy, DAG);
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VecTy, V0, V1);
- }
-
- return buildHvxVectorSingle(Ops, dl, VecTy, DAG);
+ MVT ByteTy = MVT::getVectorVT(MVT::i8, HwLen);
+ SDValue ByteVec = buildHvxVectorReg(Bytes, dl, ByteTy, DAG);
+ return DAG.getNode(HexagonISD::V2Q, dl, VecTy, ByteVec);
}
SDValue
-HexagonTargetLowering::LowerHvxExtractElement(SDValue Op, SelectionDAG &DAG)
- const {
- // Change the type of the extracted element to i32.
- SDValue VecV = Op.getOperand(0);
+HexagonTargetLowering::extractHvxElementReg(SDValue VecV, SDValue IdxV,
+ const SDLoc &dl, MVT ResTy, SelectionDAG &DAG) const {
MVT ElemTy = ty(VecV).getVectorElementType();
+
unsigned ElemWidth = ElemTy.getSizeInBits();
assert(ElemWidth >= 8 && ElemWidth <= 32);
(void)ElemWidth;
- const SDLoc &dl(Op);
- SDValue IdxV = Op.getOperand(1);
- if (ty(IdxV) != MVT::i32)
- IdxV = DAG.getBitcast(MVT::i32, IdxV);
-
SDValue ByteIdx = convertToByteIndex(IdxV, ElemTy, DAG);
SDValue ExWord = DAG.getNode(HexagonISD::VEXTRACTW, dl, MVT::i32,
{VecV, ByteIdx});
@@ -316,13 +607,29 @@ HexagonTargetLowering::LowerHvxExtractElement(SDValue Op, SelectionDAG &DAG)
}
SDValue
-HexagonTargetLowering::LowerHvxInsertElement(SDValue Op, SelectionDAG &DAG)
- const {
- const SDLoc &dl(Op);
- SDValue VecV = Op.getOperand(0);
- SDValue ValV = Op.getOperand(1);
- SDValue IdxV = Op.getOperand(2);
+HexagonTargetLowering::extractHvxElementPred(SDValue VecV, SDValue IdxV,
+ const SDLoc &dl, MVT ResTy, SelectionDAG &DAG) const {
+ // Implement other return types if necessary.
+ assert(ResTy == MVT::i1);
+
+ unsigned HwLen = Subtarget.getVectorLength();
+ MVT ByteTy = MVT::getVectorVT(MVT::i8, HwLen);
+ SDValue ByteVec = DAG.getNode(HexagonISD::Q2V, dl, ByteTy, VecV);
+
+ unsigned Scale = HwLen / ty(VecV).getVectorNumElements();
+ SDValue ScV = DAG.getConstant(Scale, dl, MVT::i32);
+ IdxV = DAG.getNode(ISD::MUL, dl, MVT::i32, IdxV, ScV);
+
+ SDValue ExtB = extractHvxElementReg(ByteVec, IdxV, dl, MVT::i32, DAG);
+ SDValue Zero = DAG.getTargetConstant(0, dl, MVT::i32);
+ return getInstr(Hexagon::C2_cmpgtui, dl, MVT::i1, {ExtB, Zero}, DAG);
+}
+
+SDValue
+HexagonTargetLowering::insertHvxElementReg(SDValue VecV, SDValue IdxV,
+ SDValue ValV, const SDLoc &dl, SelectionDAG &DAG) const {
MVT ElemTy = ty(VecV).getVectorElementType();
+
unsigned ElemWidth = ElemTy.getSizeInBits();
assert(ElemWidth >= 8 && ElemWidth <= 32);
(void)ElemWidth;
@@ -336,7 +643,7 @@ HexagonTargetLowering::LowerHvxInsertElement(SDValue Op, SelectionDAG &DAG)
SDValue RotV = DAG.getNode(HexagonISD::VROR, dl, VecTy, {VecV, MaskV});
SDValue InsV = DAG.getNode(HexagonISD::VINSERTW0, dl, VecTy, {RotV, ValV});
SDValue SubV = DAG.getNode(ISD::SUB, dl, MVT::i32,
- {DAG.getConstant(HwLen/4, dl, MVT::i32), MaskV});
+ {DAG.getConstant(HwLen, dl, MVT::i32), MaskV});
SDValue TorV = DAG.getNode(HexagonISD::VROR, dl, VecTy, {InsV, SubV});
return TorV;
};
@@ -349,9 +656,8 @@ HexagonTargetLowering::LowerHvxInsertElement(SDValue Op, SelectionDAG &DAG)
// 1. Extract the existing word from the target vector.
SDValue WordIdx = DAG.getNode(ISD::SRL, dl, MVT::i32,
{ByteIdx, DAG.getConstant(2, dl, MVT::i32)});
- SDValue Ex0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
- {opCastElem(VecV, MVT::i32, DAG), WordIdx});
- SDValue Ext = LowerHvxExtractElement(Ex0, DAG);
+ SDValue Ext = extractHvxElementReg(opCastElem(VecV, MVT::i32, DAG), WordIdx,
+ dl, MVT::i32, DAG);
// 2. Treating the extracted word as a 32-bit vector, insert the given
// value into it.
@@ -365,55 +671,531 @@ HexagonTargetLowering::LowerHvxInsertElement(SDValue Op, SelectionDAG &DAG)
}
SDValue
+HexagonTargetLowering::insertHvxElementPred(SDValue VecV, SDValue IdxV,
+ SDValue ValV, const SDLoc &dl, SelectionDAG &DAG) const {
+ unsigned HwLen = Subtarget.getVectorLength();
+ MVT ByteTy = MVT::getVectorVT(MVT::i8, HwLen);
+ SDValue ByteVec = DAG.getNode(HexagonISD::Q2V, dl, ByteTy, VecV);
+
+ unsigned Scale = HwLen / ty(VecV).getVectorNumElements();
+ SDValue ScV = DAG.getConstant(Scale, dl, MVT::i32);
+ IdxV = DAG.getNode(ISD::MUL, dl, MVT::i32, IdxV, ScV);
+ ValV = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, ValV);
+
+ SDValue InsV = insertHvxElementReg(ByteVec, IdxV, ValV, dl, DAG);
+ return DAG.getNode(HexagonISD::V2Q, dl, ty(VecV), InsV);
+}
+
+SDValue
+HexagonTargetLowering::extractHvxSubvectorReg(SDValue VecV, SDValue IdxV,
+ const SDLoc &dl, MVT ResTy, SelectionDAG &DAG) const {
+ MVT VecTy = ty(VecV);
+ unsigned HwLen = Subtarget.getVectorLength();
+ unsigned Idx = cast<ConstantSDNode>(IdxV.getNode())->getZExtValue();
+ MVT ElemTy = VecTy.getVectorElementType();
+ unsigned ElemWidth = ElemTy.getSizeInBits();
+
+ // If the source vector is a vector pair, get the single vector containing
+ // the subvector of interest. The subvector will never overlap two single
+ // vectors.
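+  // E.g. with HwLen = 64, extracting a v2i32 at index 18 from a v32i32
+  // pair falls into the high half (18*32 >= 512), so vsub_hi is taken and
+  // the index becomes 18 - 16 = 2 within that single v16i32 vector.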
+ if (isHvxPairTy(VecTy)) {
+ unsigned SubIdx;
+ if (Idx * ElemWidth >= 8*HwLen) {
+ SubIdx = Hexagon::vsub_hi;
+ Idx -= VecTy.getVectorNumElements() / 2;
+ } else {
+ SubIdx = Hexagon::vsub_lo;
+ }
+ VecTy = typeSplit(VecTy).first;
+ VecV = DAG.getTargetExtractSubreg(SubIdx, dl, VecTy, VecV);
+ if (VecTy == ResTy)
+ return VecV;
+ }
+
+  // The only meaningful subvectors of a single HVX vector are those that
+  // fit in a scalar register or a register pair (32 or 64 bits).
+ assert(ResTy.getSizeInBits() == 32 || ResTy.getSizeInBits() == 64);
+
+ MVT WordTy = tyVector(VecTy, MVT::i32);
+ SDValue WordVec = DAG.getBitcast(WordTy, VecV);
+ unsigned WordIdx = (Idx*ElemWidth) / 32;
+
+ SDValue W0Idx = DAG.getConstant(WordIdx, dl, MVT::i32);
+ SDValue W0 = extractHvxElementReg(WordVec, W0Idx, dl, MVT::i32, DAG);
+ if (ResTy.getSizeInBits() == 32)
+ return DAG.getBitcast(ResTy, W0);
+
+ SDValue W1Idx = DAG.getConstant(WordIdx+1, dl, MVT::i32);
+ SDValue W1 = extractHvxElementReg(WordVec, W1Idx, dl, MVT::i32, DAG);
+ SDValue WW = DAG.getNode(HexagonISD::COMBINE, dl, MVT::i64, {W1, W0});
+ return DAG.getBitcast(ResTy, WW);
+}
+
+SDValue
+HexagonTargetLowering::extractHvxSubvectorPred(SDValue VecV, SDValue IdxV,
+ const SDLoc &dl, MVT ResTy, SelectionDAG &DAG) const {
+ MVT VecTy = ty(VecV);
+ unsigned HwLen = Subtarget.getVectorLength();
+ MVT ByteTy = MVT::getVectorVT(MVT::i8, HwLen);
+ SDValue ByteVec = DAG.getNode(HexagonISD::Q2V, dl, ByteTy, VecV);
+ // IdxV is required to be a constant.
+ unsigned Idx = cast<ConstantSDNode>(IdxV.getNode())->getZExtValue();
+
+ unsigned ResLen = ResTy.getVectorNumElements();
+ unsigned BitBytes = HwLen / VecTy.getVectorNumElements();
+ unsigned Offset = Idx * BitBytes;
+ SDValue Undef = DAG.getUNDEF(ByteTy);
+ SmallVector<int,128> Mask;
+
+ if (Subtarget.isHVXVectorType(ResTy, true)) {
+ // Converting between two vector predicates. Since the result is shorter
+ // than the source, it will correspond to a vector predicate with the
+ // relevant bits replicated. The replication count is the ratio of the
+ // source and target vector lengths.
+ unsigned Rep = VecTy.getVectorNumElements() / ResLen;
+ assert(isPowerOf2_32(Rep) && HwLen % Rep == 0);
+ for (unsigned i = 0; i != HwLen/Rep; ++i) {
+ for (unsigned j = 0; j != Rep; ++j)
+ Mask.push_back(i + Offset);
+ }
+ SDValue ShuffV = DAG.getVectorShuffle(ByteTy, dl, ByteVec, Undef, Mask);
+ return DAG.getNode(HexagonISD::V2Q, dl, ResTy, ShuffV);
+ }
+
+ // Converting between a vector predicate and a scalar predicate. In the
+ // vector predicate, a group of BitBytes bits will correspond to a single
+ // i1 element of the source vector type. Those bits will all have the same
+ // value. The same will be true for ByteVec, where each byte corresponds
+ // to a bit in the vector predicate.
+ // The algorithm is to traverse the ByteVec, going over the i1 values from
+ // the source vector, and generate the corresponding representation in an
+ // 8-byte vector. To avoid repeated extracts from ByteVec, shuffle the
+ // elements so that the interesting 8 bytes will be in the low end of the
+ // vector.
+ unsigned Rep = 8 / ResLen;
+  // Make sure the output fills the entire vector register, so repeat the
+  // 8-byte groups as many times as necessary.
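+  // E.g. extracting a v8i1 from a v64i1 source with HwLen = 64 gives
+  // BitBytes = 1 and Rep = 1, so the mask is (Idx, Idx+1, ..., Idx+7)
+  // repeated 8 times, replicating the 8 interesting bytes across the
+  // whole register.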
+ for (unsigned r = 0; r != HwLen/ResLen; ++r) {
+ // This will generate the indexes of the 8 interesting bytes.
+ for (unsigned i = 0; i != ResLen; ++i) {
+ for (unsigned j = 0; j != Rep; ++j)
+ Mask.push_back(Offset + i*BitBytes);
+ }
+ }
+
+ SDValue Zero = getZero(dl, MVT::i32, DAG);
+ SDValue ShuffV = DAG.getVectorShuffle(ByteTy, dl, ByteVec, Undef, Mask);
+ // Combine the two low words from ShuffV into a v8i8, and byte-compare
+ // them against 0.
+ SDValue W0 = DAG.getNode(HexagonISD::VEXTRACTW, dl, MVT::i32, {ShuffV, Zero});
+ SDValue W1 = DAG.getNode(HexagonISD::VEXTRACTW, dl, MVT::i32,
+ {ShuffV, DAG.getConstant(4, dl, MVT::i32)});
+ SDValue Vec64 = DAG.getNode(HexagonISD::COMBINE, dl, MVT::v8i8, {W1, W0});
+ return getInstr(Hexagon::A4_vcmpbgtui, dl, ResTy,
+ {Vec64, DAG.getTargetConstant(0, dl, MVT::i32)}, DAG);
+}
+
+SDValue
+HexagonTargetLowering::insertHvxSubvectorReg(SDValue VecV, SDValue SubV,
+ SDValue IdxV, const SDLoc &dl, SelectionDAG &DAG) const {
+ MVT VecTy = ty(VecV);
+ MVT SubTy = ty(SubV);
+ unsigned HwLen = Subtarget.getVectorLength();
+ MVT ElemTy = VecTy.getVectorElementType();
+ unsigned ElemWidth = ElemTy.getSizeInBits();
+
+ bool IsPair = isHvxPairTy(VecTy);
+ MVT SingleTy = MVT::getVectorVT(ElemTy, (8*HwLen)/ElemWidth);
+ // The two single vectors that VecV consists of, if it's a pair.
+ SDValue V0, V1;
+ SDValue SingleV = VecV;
+ SDValue PickHi;
+
+ if (IsPair) {
+ V0 = DAG.getTargetExtractSubreg(Hexagon::vsub_lo, dl, SingleTy, VecV);
+ V1 = DAG.getTargetExtractSubreg(Hexagon::vsub_hi, dl, SingleTy, VecV);
+
+ SDValue HalfV = DAG.getConstant(SingleTy.getVectorNumElements(),
+ dl, MVT::i32);
+ PickHi = DAG.getSetCC(dl, MVT::i1, IdxV, HalfV, ISD::SETUGT);
+ if (isHvxSingleTy(SubTy)) {
+ if (const auto *CN = dyn_cast<const ConstantSDNode>(IdxV.getNode())) {
+ unsigned Idx = CN->getZExtValue();
+ assert(Idx == 0 || Idx == VecTy.getVectorNumElements()/2);
+ unsigned SubIdx = (Idx == 0) ? Hexagon::vsub_lo : Hexagon::vsub_hi;
+ return DAG.getTargetInsertSubreg(SubIdx, dl, VecTy, VecV, SubV);
+ }
+ // If IdxV is not a constant, generate the two variants: with the
+ // SubV as the high and as the low subregister, and select the right
+ // pair based on the IdxV.
+ SDValue InLo = DAG.getNode(ISD::CONCAT_VECTORS, dl, VecTy, {SubV, V1});
+ SDValue InHi = DAG.getNode(ISD::CONCAT_VECTORS, dl, VecTy, {V0, SubV});
+ return DAG.getNode(ISD::SELECT, dl, VecTy, PickHi, InHi, InLo);
+ }
+ // The subvector being inserted must be entirely contained in one of
+ // the vectors V0 or V1. Set SingleV to the correct one, and update
+ // IdxV to be the index relative to the beginning of that vector.
+ SDValue S = DAG.getNode(ISD::SUB, dl, MVT::i32, IdxV, HalfV);
+ IdxV = DAG.getNode(ISD::SELECT, dl, MVT::i32, PickHi, S, IdxV);
+ SingleV = DAG.getNode(ISD::SELECT, dl, SingleTy, PickHi, V1, V0);
+ }
+
+  // The only meaningful subvectors of a single HVX vector are those that
+  // fit in a scalar register or a register pair (32 or 64 bits).
+ assert(SubTy.getSizeInBits() == 32 || SubTy.getSizeInBits() == 64);
+ // Convert IdxV to be index in bytes.
+ auto *IdxN = dyn_cast<ConstantSDNode>(IdxV.getNode());
+ if (!IdxN || !IdxN->isNullValue()) {
+ IdxV = DAG.getNode(ISD::MUL, dl, MVT::i32, IdxV,
+ DAG.getConstant(ElemWidth/8, dl, MVT::i32));
+ SingleV = DAG.getNode(HexagonISD::VROR, dl, SingleTy, SingleV, IdxV);
+ }
+ // When inserting a single word, the rotation back to the original position
+ // would be by HwLen-Idx, but if two words are inserted, it will need to be
+ // by (HwLen-4)-Idx.
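+  // E.g. with HwLen = 64 and a byte index of 12, a single-word insert is
+  // rotated back by 64-12 = 52 bytes, while a two-word insert is rotated
+  // back by (64-4)-12 = 48 bytes.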
+ unsigned RolBase = HwLen;
+ if (VecTy.getSizeInBits() == 32) {
+ SDValue V = DAG.getBitcast(MVT::i32, SubV);
+ SingleV = DAG.getNode(HexagonISD::VINSERTW0, dl, SingleTy, V);
+ } else {
+ SDValue V = DAG.getBitcast(MVT::i64, SubV);
+ SDValue R0 = DAG.getTargetExtractSubreg(Hexagon::isub_lo, dl, MVT::i32, V);
+ SDValue R1 = DAG.getTargetExtractSubreg(Hexagon::isub_hi, dl, MVT::i32, V);
+ SingleV = DAG.getNode(HexagonISD::VINSERTW0, dl, SingleTy, SingleV, R0);
+ SingleV = DAG.getNode(HexagonISD::VROR, dl, SingleTy, SingleV,
+ DAG.getConstant(4, dl, MVT::i32));
+ SingleV = DAG.getNode(HexagonISD::VINSERTW0, dl, SingleTy, SingleV, R1);
+ RolBase = HwLen-4;
+ }
+ // If the vector wasn't ror'ed, don't ror it back.
+ if (RolBase != 4 || !IdxN || !IdxN->isNullValue()) {
+ SDValue RolV = DAG.getNode(ISD::SUB, dl, MVT::i32,
+ DAG.getConstant(RolBase, dl, MVT::i32), IdxV);
+ SingleV = DAG.getNode(HexagonISD::VROR, dl, SingleTy, SingleV, RolV);
+ }
+
+ if (IsPair) {
+ SDValue InLo = DAG.getNode(ISD::CONCAT_VECTORS, dl, VecTy, {SingleV, V1});
+ SDValue InHi = DAG.getNode(ISD::CONCAT_VECTORS, dl, VecTy, {V0, SingleV});
+ return DAG.getNode(ISD::SELECT, dl, VecTy, PickHi, InHi, InLo);
+ }
+ return SingleV;
+}
+
+SDValue
+HexagonTargetLowering::insertHvxSubvectorPred(SDValue VecV, SDValue SubV,
+ SDValue IdxV, const SDLoc &dl, SelectionDAG &DAG) const {
+ MVT VecTy = ty(VecV);
+ MVT SubTy = ty(SubV);
+ assert(Subtarget.isHVXVectorType(VecTy, true));
+ // VecV is an HVX vector predicate. SubV may be either an HVX vector
+ // predicate as well, or it can be a scalar predicate.
+
+ unsigned VecLen = VecTy.getVectorNumElements();
+ unsigned HwLen = Subtarget.getVectorLength();
+ assert(HwLen % VecLen == 0 && "Unexpected vector type");
+
+ unsigned Scale = VecLen / SubTy.getVectorNumElements();
+ unsigned BitBytes = HwLen / VecLen;
+ unsigned BlockLen = HwLen / Scale;
+
+ MVT ByteTy = MVT::getVectorVT(MVT::i8, HwLen);
+ SDValue ByteVec = DAG.getNode(HexagonISD::Q2V, dl, ByteTy, VecV);
+ SDValue ByteSub = createHvxPrefixPred(SubV, dl, BitBytes, false, DAG);
+ SDValue ByteIdx;
+
+ auto *IdxN = dyn_cast<ConstantSDNode>(IdxV.getNode());
+ if (!IdxN || !IdxN->isNullValue()) {
+ ByteIdx = DAG.getNode(ISD::MUL, dl, MVT::i32, IdxV,
+ DAG.getConstant(BitBytes, dl, MVT::i32));
+ ByteVec = DAG.getNode(HexagonISD::VROR, dl, ByteTy, ByteVec, ByteIdx);
+ }
+
+ // ByteVec is the target vector VecV rotated in such a way that the
+ // subvector should be inserted at index 0. Generate a predicate mask
+ // and use vmux to do the insertion.
+ MVT BoolTy = MVT::getVectorVT(MVT::i1, HwLen);
+ SDValue Q = getInstr(Hexagon::V6_pred_scalar2, dl, BoolTy,
+ {DAG.getConstant(BlockLen, dl, MVT::i32)}, DAG);
+ ByteVec = getInstr(Hexagon::V6_vmux, dl, ByteTy, {Q, ByteSub, ByteVec}, DAG);
+ // Rotate ByteVec back, and convert to a vector predicate.
+ if (!IdxN || !IdxN->isNullValue()) {
+ SDValue HwLenV = DAG.getConstant(HwLen, dl, MVT::i32);
+ SDValue ByteXdi = DAG.getNode(ISD::SUB, dl, MVT::i32, HwLenV, ByteIdx);
+ ByteVec = DAG.getNode(HexagonISD::VROR, dl, ByteTy, ByteVec, ByteXdi);
+ }
+ return DAG.getNode(HexagonISD::V2Q, dl, VecTy, ByteVec);
+}
+
+SDValue
+HexagonTargetLowering::extendHvxVectorPred(SDValue VecV, const SDLoc &dl,
+ MVT ResTy, bool ZeroExt, SelectionDAG &DAG) const {
+ // Sign- and any-extending of a vector predicate to a vector register is
+ // equivalent to Q2V. For zero-extensions, generate a vmux between 0 and
+ // a vector of 1s (where the 1s are of type matching the vector type).
+ assert(Subtarget.isHVXVectorType(ResTy));
+ if (!ZeroExt)
+ return DAG.getNode(HexagonISD::Q2V, dl, ResTy, VecV);
+
+ assert(ty(VecV).getVectorNumElements() == ResTy.getVectorNumElements());
+ SDValue True = DAG.getNode(HexagonISD::VSPLAT, dl, ResTy,
+ DAG.getConstant(1, dl, MVT::i32));
+ SDValue False = getZero(dl, ResTy, DAG);
+ return DAG.getSelect(dl, ResTy, VecV, True, False);
+}
+
+SDValue
+HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG)
+ const {
+ const SDLoc &dl(Op);
+ MVT VecTy = ty(Op);
+
+ unsigned Size = Op.getNumOperands();
+ SmallVector<SDValue,128> Ops;
+ for (unsigned i = 0; i != Size; ++i)
+ Ops.push_back(Op.getOperand(i));
+
+ if (VecTy.getVectorElementType() == MVT::i1)
+ return buildHvxVectorPred(Ops, dl, VecTy, DAG);
+
+ if (VecTy.getSizeInBits() == 16*Subtarget.getVectorLength()) {
+ ArrayRef<SDValue> A(Ops);
+ MVT SingleTy = typeSplit(VecTy).first;
+ SDValue V0 = buildHvxVectorReg(A.take_front(Size/2), dl, SingleTy, DAG);
+ SDValue V1 = buildHvxVectorReg(A.drop_front(Size/2), dl, SingleTy, DAG);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VecTy, V0, V1);
+ }
+
+ return buildHvxVectorReg(Ops, dl, VecTy, DAG);
+}
+
+SDValue
+HexagonTargetLowering::LowerHvxConcatVectors(SDValue Op, SelectionDAG &DAG)
+ const {
+ // Vector concatenation of two integer (non-bool) vectors does not need
+ // special lowering. Custom-lower concats of bool vectors and expand
+ // concats of more than 2 vectors.
+ MVT VecTy = ty(Op);
+ const SDLoc &dl(Op);
+ unsigned NumOp = Op.getNumOperands();
+ if (VecTy.getVectorElementType() != MVT::i1) {
+ if (NumOp == 2)
+ return Op;
+ // Expand the other cases into a build-vector.
+ SmallVector<SDValue,8> Elems;
+ for (SDValue V : Op.getNode()->ops())
+ DAG.ExtractVectorElements(V, Elems);
+ // A vector of i16 will be broken up into a build_vector of i16's.
+ // This is a problem, since at the time of operation legalization,
+ // all operations are expected to be type-legalized, and i16 is not
+ // a legal type. If any of the extracted elements is not of a valid
+ // type, sign-extend it to a valid one.
+ for (unsigned i = 0, e = Elems.size(); i != e; ++i) {
+ SDValue V = Elems[i];
+ MVT Ty = ty(V);
+ if (!isTypeLegal(Ty)) {
+ EVT NTy = getTypeToTransformTo(*DAG.getContext(), Ty);
+ if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ Elems[i] = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, NTy,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NTy,
+ V.getOperand(0), V.getOperand(1)),
+ DAG.getValueType(Ty));
+ continue;
+ }
+ // A few less complicated cases.
+ if (V.getOpcode() == ISD::Constant)
+ Elems[i] = DAG.getSExtOrTrunc(V, dl, NTy);
+ else if (V.isUndef())
+ Elems[i] = DAG.getUNDEF(NTy);
+ else
+ llvm_unreachable("Unexpected vector element");
+ }
+ }
+ return DAG.getBuildVector(VecTy, dl, Elems);
+ }
+
+ assert(VecTy.getVectorElementType() == MVT::i1);
+ unsigned HwLen = Subtarget.getVectorLength();
+ assert(isPowerOf2_32(NumOp) && HwLen % NumOp == 0);
+
+ SDValue Op0 = Op.getOperand(0);
+
+ // If the operands are HVX types (i.e. not scalar predicates), then
+ // defer the concatenation, and create QCAT instead.
+ if (Subtarget.isHVXVectorType(ty(Op0), true)) {
+ if (NumOp == 2)
+ return DAG.getNode(HexagonISD::QCAT, dl, VecTy, Op0, Op.getOperand(1));
+
+ ArrayRef<SDUse> U(Op.getNode()->ops());
+ SmallVector<SDValue,4> SV(U.begin(), U.end());
+ ArrayRef<SDValue> Ops(SV);
+
+ MVT HalfTy = typeSplit(VecTy).first;
+ SDValue V0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfTy,
+ Ops.take_front(NumOp/2));
+ SDValue V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfTy,
+ Ops.take_back(NumOp/2));
+ return DAG.getNode(HexagonISD::QCAT, dl, VecTy, V0, V1);
+ }
+
+ // Count how many bytes (in a vector register) each bit in VecTy
+ // corresponds to.
+ unsigned BitBytes = HwLen / VecTy.getVectorNumElements();
+
+ SmallVector<SDValue,8> Prefixes;
+ for (SDValue V : Op.getNode()->op_values()) {
+ SDValue P = createHvxPrefixPred(V, dl, BitBytes, true, DAG);
+ Prefixes.push_back(P);
+ }
+
+ unsigned InpLen = ty(Op.getOperand(0)).getVectorNumElements();
+ MVT ByteTy = MVT::getVectorVT(MVT::i8, HwLen);
+ SDValue S = DAG.getConstant(InpLen*BitBytes, dl, MVT::i32);
+ SDValue Res = getZero(dl, ByteTy, DAG);
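+  // Each prefix has its InpLen*BitBytes meaningful bytes at the front and
+  // zeros beyond (ZeroFill above). OR the prefixes together in reverse
+  // order, rotating the accumulated result between steps, so that the
+  // bytes for operand 0 end up at the lowest positions of the result.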
+ for (unsigned i = 0, e = Prefixes.size(); i != e; ++i) {
+ Res = DAG.getNode(HexagonISD::VROR, dl, ByteTy, Res, S);
+ Res = DAG.getNode(ISD::OR, dl, ByteTy, Res, Prefixes[e-i-1]);
+ }
+ return DAG.getNode(HexagonISD::V2Q, dl, VecTy, Res);
+}
+
+SDValue
+HexagonTargetLowering::LowerHvxExtractElement(SDValue Op, SelectionDAG &DAG)
+ const {
+ // Change the type of the extracted element to i32.
+ SDValue VecV = Op.getOperand(0);
+ MVT ElemTy = ty(VecV).getVectorElementType();
+ const SDLoc &dl(Op);
+ SDValue IdxV = Op.getOperand(1);
+ if (ElemTy == MVT::i1)
+ return extractHvxElementPred(VecV, IdxV, dl, ty(Op), DAG);
+
+ return extractHvxElementReg(VecV, IdxV, dl, ty(Op), DAG);
+}
+
+SDValue
+HexagonTargetLowering::LowerHvxInsertElement(SDValue Op, SelectionDAG &DAG)
+ const {
+ const SDLoc &dl(Op);
+ SDValue VecV = Op.getOperand(0);
+ SDValue ValV = Op.getOperand(1);
+ SDValue IdxV = Op.getOperand(2);
+ MVT ElemTy = ty(VecV).getVectorElementType();
+ if (ElemTy == MVT::i1)
+ return insertHvxElementPred(VecV, IdxV, ValV, dl, DAG);
+
+ return insertHvxElementReg(VecV, IdxV, ValV, dl, DAG);
+}
+
+SDValue
HexagonTargetLowering::LowerHvxExtractSubvector(SDValue Op, SelectionDAG &DAG)
const {
SDValue SrcV = Op.getOperand(0);
MVT SrcTy = ty(SrcV);
- unsigned SrcElems = SrcTy.getVectorNumElements();
+ MVT DstTy = ty(Op);
SDValue IdxV = Op.getOperand(1);
unsigned Idx = cast<ConstantSDNode>(IdxV.getNode())->getZExtValue();
- MVT DstTy = ty(Op);
- assert(Idx == 0 || DstTy.getVectorNumElements() % Idx == 0);
+ assert(Idx % DstTy.getVectorNumElements() == 0);
+ (void)Idx;
const SDLoc &dl(Op);
- if (Idx == 0)
- return DAG.getTargetExtractSubreg(Hexagon::vsub_lo, dl, DstTy, SrcV);
- if (Idx == SrcElems/2)
- return DAG.getTargetExtractSubreg(Hexagon::vsub_hi, dl, DstTy, SrcV);
- return SDValue();
+
+ MVT ElemTy = SrcTy.getVectorElementType();
+ if (ElemTy == MVT::i1)
+ return extractHvxSubvectorPred(SrcV, IdxV, dl, DstTy, DAG);
+
+ return extractHvxSubvectorReg(SrcV, IdxV, dl, DstTy, DAG);
}
SDValue
HexagonTargetLowering::LowerHvxInsertSubvector(SDValue Op, SelectionDAG &DAG)
const {
- // Idx may be variable.
+ // Idx does not need to be a constant.
+ SDValue VecV = Op.getOperand(0);
+ SDValue ValV = Op.getOperand(1);
SDValue IdxV = Op.getOperand(2);
- auto *IdxN = dyn_cast<ConstantSDNode>(IdxV.getNode());
- if (!IdxN)
- return SDValue();
- unsigned Idx = IdxN->getZExtValue();
- SDValue DstV = Op.getOperand(0);
- SDValue SrcV = Op.getOperand(1);
- MVT DstTy = ty(DstV);
- MVT SrcTy = ty(SrcV);
- unsigned DstElems = DstTy.getVectorNumElements();
- unsigned SrcElems = SrcTy.getVectorNumElements();
- if (2*SrcElems != DstElems)
- return SDValue();
+ const SDLoc &dl(Op);
+ MVT VecTy = ty(VecV);
+ MVT ElemTy = VecTy.getVectorElementType();
+ if (ElemTy == MVT::i1)
+ return insertHvxSubvectorPred(VecV, ValV, IdxV, dl, DAG);
+
+ return insertHvxSubvectorReg(VecV, ValV, IdxV, dl, DAG);
+}
+
+SDValue
+HexagonTargetLowering::LowerHvxAnyExt(SDValue Op, SelectionDAG &DAG) const {
+ // Lower any-extends of boolean vectors to sign-extends, since they
+ // translate directly to Q2V. Zero-extending could also be done equally
+ // fast, but Q2V is used/recognized in more places.
+ // For all other vectors, use zero-extend.
+ MVT ResTy = ty(Op);
+ SDValue InpV = Op.getOperand(0);
+ MVT ElemTy = ty(InpV).getVectorElementType();
+ if (ElemTy == MVT::i1 && Subtarget.isHVXVectorType(ResTy))
+ return LowerHvxSignExt(Op, DAG);
+ return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(Op), ResTy, InpV);
+}
+
+SDValue
+HexagonTargetLowering::LowerHvxSignExt(SDValue Op, SelectionDAG &DAG) const {
+ MVT ResTy = ty(Op);
+ SDValue InpV = Op.getOperand(0);
+ MVT ElemTy = ty(InpV).getVectorElementType();
+ if (ElemTy == MVT::i1 && Subtarget.isHVXVectorType(ResTy))
+ return extendHvxVectorPred(InpV, SDLoc(Op), ty(Op), false, DAG);
+ return Op;
+}
+
+SDValue
+HexagonTargetLowering::LowerHvxZeroExt(SDValue Op, SelectionDAG &DAG) const {
+ MVT ResTy = ty(Op);
+ SDValue InpV = Op.getOperand(0);
+ MVT ElemTy = ty(InpV).getVectorElementType();
+ if (ElemTy == MVT::i1 && Subtarget.isHVXVectorType(ResTy))
+ return extendHvxVectorPred(InpV, SDLoc(Op), ty(Op), true, DAG);
+ return Op;
+}
+SDValue
+HexagonTargetLowering::LowerHvxCttz(SDValue Op, SelectionDAG &DAG) const {
+ // Lower vector CTTZ into a computation using CTLZ (Hacker's Delight):
+ // cttz(x) = bitwidth(x) - ctlz(~x & (x-1))
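+  // E.g. for an 8-bit element x = 0b00001000:
+  //   ~x & (x-1) = 0b11110111 & 0b00000111 = 0b00000111,
+  //   ctlz = 5, so cttz(x) = 8 - 5 = 3.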
const SDLoc &dl(Op);
- if (Idx == 0)
- return DAG.getTargetInsertSubreg(Hexagon::vsub_lo, dl, DstTy, DstV, SrcV);
- if (Idx == SrcElems)
- return DAG.getTargetInsertSubreg(Hexagon::vsub_hi, dl, DstTy, DstV, SrcV);
- return SDValue();
+ MVT ResTy = ty(Op);
+ SDValue InpV = Op.getOperand(0);
+ assert(ResTy == ty(InpV));
+
+ // Calculate the vectors of 1 and bitwidth(x).
+ MVT ElemTy = ty(InpV).getVectorElementType();
+ unsigned ElemWidth = ElemTy.getSizeInBits();
+ // Using uint64_t because a shift by 32 can happen.
+ uint64_t Splat1 = 0, SplatW = 0;
+ assert(isPowerOf2_32(ElemWidth) && ElemWidth <= 32);
+ for (unsigned i = 0; i != 32/ElemWidth; ++i) {
+ Splat1 = (Splat1 << ElemWidth) | 1;
+ SplatW = (SplatW << ElemWidth) | ElemWidth;
+ }
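+  // E.g. for ElemWidth = 8 this yields Splat1 = 0x01010101 and
+  // SplatW = 0x08080808; for ElemWidth = 32, Splat1 = 1 and SplatW = 32.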
+ SDValue Vec1 = DAG.getNode(HexagonISD::VSPLATW, dl, ResTy,
+ DAG.getConstant(uint32_t(Splat1), dl, MVT::i32));
+ SDValue VecW = DAG.getNode(HexagonISD::VSPLATW, dl, ResTy,
+ DAG.getConstant(uint32_t(SplatW), dl, MVT::i32));
+ SDValue VecN1 = DAG.getNode(HexagonISD::VSPLATW, dl, ResTy,
+ DAG.getConstant(-1, dl, MVT::i32));
+ // Do not use DAG.getNOT, because that would create BUILD_VECTOR with
+ // a BITCAST. Here we can skip the BITCAST (so we don't have to handle
+ // it separately in custom combine or selection).
+ SDValue A = DAG.getNode(ISD::AND, dl, ResTy,
+ {DAG.getNode(ISD::XOR, dl, ResTy, {InpV, VecN1}),
+ DAG.getNode(ISD::SUB, dl, ResTy, {InpV, Vec1})});
+ return DAG.getNode(ISD::SUB, dl, ResTy,
+ {VecW, DAG.getNode(ISD::CTLZ, dl, ResTy, A)});
}
SDValue
HexagonTargetLowering::LowerHvxMul(SDValue Op, SelectionDAG &DAG) const {
MVT ResTy = ty(Op);
- if (!ResTy.isVector())
- return SDValue();
+ assert(ResTy.isVector() && isHvxSingleTy(ResTy));
const SDLoc &dl(Op);
SmallVector<int,256> ShuffMask;
@@ -423,18 +1205,14 @@ HexagonTargetLowering::LowerHvxMul(SDValue Op, SelectionDAG &DAG) const {
SDValue Vt = Op.getOperand(1);
switch (ElemTy.SimpleTy) {
- case MVT::i8:
- case MVT::i16: {
+ case MVT::i8: {
// For i8 vectors Vs = (a0, a1, ...), Vt = (b0, b1, ...),
// V6_vmpybv Vs, Vt produces a pair of i16 vectors Hi:Lo,
// where Lo = (a0*b0, a2*b2, ...), Hi = (a1*b1, a3*b3, ...).
- // For i16, use V6_vmpyhv, which behaves in an analogous way to
- // V6_vmpybv: results Lo and Hi are products of even/odd elements
- // respectively.
MVT ExtTy = typeExtElem(ResTy, 2);
unsigned MpyOpc = ElemTy == MVT::i8 ? Hexagon::V6_vmpybv
: Hexagon::V6_vmpyhv;
- SDValue M = getNode(MpyOpc, dl, ExtTy, {Vs, Vt}, DAG);
+ SDValue M = getInstr(MpyOpc, dl, ExtTy, {Vs, Vt}, DAG);
// Discard high halves of the resulting values, collect the low halves.
for (unsigned I = 0; I < VecLen; I += 2) {
@@ -442,18 +1220,24 @@ HexagonTargetLowering::LowerHvxMul(SDValue Op, SelectionDAG &DAG) const {
ShuffMask.push_back(I+VecLen); // Pick odd element.
}
VectorPair P = opSplit(opCastElem(M, ElemTy, DAG), dl, DAG);
- return getByteShuffle(dl, P.first, P.second, ShuffMask, DAG);
+ SDValue BS = getByteShuffle(dl, P.first, P.second, ShuffMask, DAG);
+ return DAG.getBitcast(ResTy, BS);
}
+ case MVT::i16:
+ // For i16 there is V6_vmpyih, which acts exactly like the MUL opcode.
+ // (There is also V6_vmpyhv, which behaves in an analogous way to
+ // V6_vmpybv.)
+ return getInstr(Hexagon::V6_vmpyih, dl, ResTy, {Vs, Vt}, DAG);
case MVT::i32: {
// Use the following sequence for signed word multiply:
// T0 = V6_vmpyiowh Vs, Vt
// T1 = V6_vaslw T0, 16
// T2 = V6_vmpyiewuh_acc T1, Vs, Vt
SDValue S16 = DAG.getConstant(16, dl, MVT::i32);
- SDValue T0 = getNode(Hexagon::V6_vmpyiowh, dl, ResTy, {Vs, Vt}, DAG);
- SDValue T1 = getNode(Hexagon::V6_vaslw, dl, ResTy, {T0, S16}, DAG);
- SDValue T2 = getNode(Hexagon::V6_vmpyiewuh_acc, dl, ResTy,
- {T1, Vs, Vt}, DAG);
+ SDValue T0 = getInstr(Hexagon::V6_vmpyiowh, dl, ResTy, {Vs, Vt}, DAG);
+ SDValue T1 = getInstr(Hexagon::V6_vaslw, dl, ResTy, {T0, S16}, DAG);
+ SDValue T2 = getInstr(Hexagon::V6_vmpyiewuh_acc, dl, ResTy,
+ {T1, Vs, Vt}, DAG);
return T2;
}
default:
@@ -463,78 +1247,109 @@ HexagonTargetLowering::LowerHvxMul(SDValue Op, SelectionDAG &DAG) const {
}
SDValue
-HexagonTargetLowering::LowerHvxSetCC(SDValue Op, SelectionDAG &DAG) const {
- MVT VecTy = ty(Op.getOperand(0));
- assert(VecTy == ty(Op.getOperand(1)));
-
- SDValue Cmp = Op.getOperand(2);
- ISD::CondCode CC = cast<CondCodeSDNode>(Cmp)->get();
- bool Negate = false, Swap = false;
-
- // HVX has instructions for SETEQ, SETGT, SETUGT. The other comparisons
- // can be arranged as operand-swapped/negated versions of these. Since
- // the generated code will have the original CC expressed as
- // (negate (swap-op NewCmp)),
- // the condition code for the NewCmp should be calculated from the original
- // CC by applying these operations in the reverse order.
- //
- // This could also be done through setCondCodeAction, but for negation it
- // uses a xor with a vector of -1s, which it obtains from BUILD_VECTOR.
- // That is far too expensive for what can be done with a single instruction.
-
- switch (CC) {
- case ISD::SETNE: // !eq
- case ISD::SETLE: // !gt
- case ISD::SETGE: // !lt
- case ISD::SETULE: // !ugt
- case ISD::SETUGE: // !ult
- CC = ISD::getSetCCInverse(CC, true);
- Negate = true;
- break;
- default:
- break;
+HexagonTargetLowering::LowerHvxMulh(SDValue Op, SelectionDAG &DAG) const {
+ MVT ResTy = ty(Op);
+ assert(ResTy.isVector());
+ const SDLoc &dl(Op);
+ SmallVector<int,256> ShuffMask;
+
+ MVT ElemTy = ResTy.getVectorElementType();
+ unsigned VecLen = ResTy.getVectorNumElements();
+ SDValue Vs = Op.getOperand(0);
+ SDValue Vt = Op.getOperand(1);
+ bool IsSigned = Op.getOpcode() == ISD::MULHS;
+
+ if (ElemTy == MVT::i8 || ElemTy == MVT::i16) {
+ // For i8 vectors Vs = (a0, a1, ...), Vt = (b0, b1, ...),
+ // V6_vmpybv Vs, Vt produces a pair of i16 vectors Hi:Lo,
+ // where Lo = (a0*b0, a2*b2, ...), Hi = (a1*b1, a3*b3, ...).
+ // For i16, use V6_vmpyhv, which behaves in an analogous way to
+ // V6_vmpybv: results Lo and Hi are products of even/odd elements
+ // respectively.
+ MVT ExtTy = typeExtElem(ResTy, 2);
+ unsigned MpyOpc = ElemTy == MVT::i8
+ ? (IsSigned ? Hexagon::V6_vmpybv : Hexagon::V6_vmpyubv)
+ : (IsSigned ? Hexagon::V6_vmpyhv : Hexagon::V6_vmpyuhv);
+ SDValue M = getInstr(MpyOpc, dl, ExtTy, {Vs, Vt}, DAG);
+
+ // Discard low halves of the resulting values, collect the high halves.
+ for (unsigned I = 0; I < VecLen; I += 2) {
+ ShuffMask.push_back(I+1); // Pick even element.
+ ShuffMask.push_back(I+VecLen+1); // Pick odd element.
+ }
+ VectorPair P = opSplit(opCastElem(M, ElemTy, DAG), dl, DAG);
+ SDValue BS = getByteShuffle(dl, P.first, P.second, ShuffMask, DAG);
+ return DAG.getBitcast(ResTy, BS);
}
- switch (CC) {
- case ISD::SETLT: // swap gt
- case ISD::SETULT: // swap ugt
- CC = ISD::getSetCCSwappedOperands(CC);
- Swap = true;
- break;
- default:
- break;
+ assert(ElemTy == MVT::i32);
+ SDValue S16 = DAG.getConstant(16, dl, MVT::i32);
+
+ if (IsSigned) {
+ // mulhs(Vs,Vt) =
+ // = [(Hi(Vs)*2^16 + Lo(Vs)) *s (Hi(Vt)*2^16 + Lo(Vt))] >> 32
+ // = [Hi(Vs)*2^16 *s Hi(Vt)*2^16 + Hi(Vs) *su Lo(Vt)*2^16
+ // + Lo(Vs) *us (Hi(Vt)*2^16 + Lo(Vt))] >> 32
+ // = [Hi(Vs) *s Hi(Vt)*2^32 + Hi(Vs) *su Lo(Vt)*2^16
+ // + Lo(Vs) *us Vt] >> 32
+ // The low half of Lo(Vs)*Lo(Vt) will be discarded (it's not added to
+ // anything, so it cannot produce any carry over to higher bits),
+ // so everything in [] can be shifted by 16 without loss of precision.
+ // = [Hi(Vs) *s Hi(Vt)*2^16 + Hi(Vs)*su Lo(Vt) + Lo(Vs)*Vt >> 16] >> 16
+ // = [Hi(Vs) *s Hi(Vt)*2^16 + Hi(Vs)*su Lo(Vt) + V6_vmpyewuh(Vs,Vt)] >> 16
+ // Denote Hi(Vs) = Vs':
+ // = [Vs'*s Hi(Vt)*2^16 + Vs' *su Lo(Vt) + V6_vmpyewuh(Vt,Vs)] >> 16
+ // = Vs'*s Hi(Vt) + (V6_vmpyiewuh(Vs',Vt) + V6_vmpyewuh(Vt,Vs)) >> 16
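+    // Sanity check: for Vs = Vt = 2^16, Hi = 1 and Lo = 0, so only the
+    // Hi(Vs)*Hi(Vt) term survives and mulhs = 1, which matches
+    // (2^16 * 2^16) >> 32.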
+ SDValue T0 = getInstr(Hexagon::V6_vmpyewuh, dl, ResTy, {Vt, Vs}, DAG);
+ // Get Vs':
+ SDValue S0 = getInstr(Hexagon::V6_vasrw, dl, ResTy, {Vs, S16}, DAG);
+ SDValue T1 = getInstr(Hexagon::V6_vmpyiewuh_acc, dl, ResTy,
+ {T0, S0, Vt}, DAG);
+ // Shift by 16:
+ SDValue S2 = getInstr(Hexagon::V6_vasrw, dl, ResTy, {T1, S16}, DAG);
+ // Get Vs'*Hi(Vt):
+ SDValue T2 = getInstr(Hexagon::V6_vmpyiowh, dl, ResTy, {S0, Vt}, DAG);
+ // Add:
+ SDValue T3 = DAG.getNode(ISD::ADD, dl, ResTy, {S2, T2});
+ return T3;
}
- assert(CC == ISD::SETEQ || CC == ISD::SETGT || CC == ISD::SETUGT);
+ // Unsigned mulhw. (Would expansion using signed mulhw be better?)
- MVT ElemTy = VecTy.getVectorElementType();
- unsigned ElemWidth = ElemTy.getSizeInBits();
- assert(isPowerOf2_32(ElemWidth));
-
- auto getIdx = [] (unsigned Code) {
- static const unsigned Idx[] = { ISD::SETEQ, ISD::SETGT, ISD::SETUGT };
- for (unsigned I = 0, E = array_lengthof(Idx); I != E; ++I)
- if (Code == Idx[I])
- return I;
- llvm_unreachable("Unhandled CondCode");
+ auto LoVec = [&DAG,ResTy,dl] (SDValue Pair) {
+ return DAG.getTargetExtractSubreg(Hexagon::vsub_lo, dl, ResTy, Pair);
};
-
- static unsigned OpcTable[3][3] = {
- // SETEQ SETGT, SETUGT
- /* Byte */ { Hexagon::V6_veqb, Hexagon::V6_vgtb, Hexagon::V6_vgtub },
- /* Half */ { Hexagon::V6_veqh, Hexagon::V6_vgth, Hexagon::V6_vgtuh },
- /* Word */ { Hexagon::V6_veqw, Hexagon::V6_vgtw, Hexagon::V6_vgtuw }
+ auto HiVec = [&DAG,ResTy,dl] (SDValue Pair) {
+ return DAG.getTargetExtractSubreg(Hexagon::vsub_hi, dl, ResTy, Pair);
};
- unsigned CmpOpc = OpcTable[Log2_32(ElemWidth)-3][getIdx(CC)];
-
- MVT ResTy = ty(Op);
- const SDLoc &dl(Op);
- SDValue OpL = Swap ? Op.getOperand(1) : Op.getOperand(0);
- SDValue OpR = Swap ? Op.getOperand(0) : Op.getOperand(1);
- SDValue CmpV = getNode(CmpOpc, dl, ResTy, {OpL, OpR}, DAG);
- return Negate ? getNode(Hexagon::V6_pred_not, dl, ResTy, {CmpV}, DAG)
- : CmpV;
+ MVT PairTy = typeJoin({ResTy, ResTy});
+ SDValue P = getInstr(Hexagon::V6_lvsplatw, dl, ResTy,
+ {DAG.getConstant(0x02020202, dl, MVT::i32)}, DAG);
+ // Multiply-unsigned halfwords:
+ // LoVec = Vs.uh[2i] * Vt.uh[2i],
+ // HiVec = Vs.uh[2i+1] * Vt.uh[2i+1]
+ SDValue T0 = getInstr(Hexagon::V6_vmpyuhv, dl, PairTy, {Vs, Vt}, DAG);
+ // The low halves in the LoVec of the pair can be discarded. They are
+ // not added to anything (in the full-precision product), so they cannot
+ // produce a carry into the higher bits.
+ SDValue T1 = getInstr(Hexagon::V6_vlsrw, dl, ResTy, {LoVec(T0), S16}, DAG);
+ // Swap low and high halves in Vt, and do the halfword multiplication
+ // to get products Vs.uh[2i] * Vt.uh[2i+1] and Vs.uh[2i+1] * Vt.uh[2i].
+ SDValue D0 = getInstr(Hexagon::V6_vdelta, dl, ResTy, {Vt, P}, DAG);
+ SDValue T2 = getInstr(Hexagon::V6_vmpyuhv, dl, PairTy, {Vs, D0}, DAG);
+ // T2 has mixed products of halfwords: Lo(Vt)*Hi(Vs) and Hi(Vt)*Lo(Vs).
+ // These products are words, but cannot be added directly because the
+ // sums could overflow. Add these products, by halfwords, where each sum
+ // of a pair of halfwords gives a word.
+ SDValue T3 = getInstr(Hexagon::V6_vadduhw, dl, PairTy,
+ {LoVec(T2), HiVec(T2)}, DAG);
+ // Add the high halfwords from the products of the low halfwords.
+ SDValue T4 = DAG.getNode(ISD::ADD, dl, ResTy, {T1, LoVec(T3)});
+ SDValue T5 = getInstr(Hexagon::V6_vlsrw, dl, ResTy, {T4, S16}, DAG);
+ SDValue T6 = DAG.getNode(ISD::ADD, dl, ResTy, {HiVec(T0), HiVec(T3)});
+ SDValue T7 = DAG.getNode(ISD::ADD, dl, ResTy, {T5, T6});
+ return T7;
}
SDValue
@@ -543,3 +1358,163 @@ HexagonTargetLowering::LowerHvxExtend(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG);
return DAG.getZeroExtendVectorInReg(Op.getOperand(0), SDLoc(Op), ty(Op));
}
+
+SDValue
+HexagonTargetLowering::LowerHvxShift(SDValue Op, SelectionDAG &DAG) const {
+ if (SDValue S = getVectorShiftByInt(Op, DAG))
+ return S;
+ return Op;
+}
+
+SDValue
+HexagonTargetLowering::SplitHvxPairOp(SDValue Op, SelectionDAG &DAG) const {
+ assert(!Op.isMachineOpcode());
+ SmallVector<SDValue,2> OpsL, OpsH;
+ const SDLoc &dl(Op);
+
+ auto SplitVTNode = [&DAG,this] (const VTSDNode *N) {
+ MVT Ty = typeSplit(N->getVT().getSimpleVT()).first;
+ SDValue TV = DAG.getValueType(Ty);
+ return std::make_pair(TV, TV);
+ };
+
+ for (SDValue A : Op.getNode()->ops()) {
+ VectorPair P = Subtarget.isHVXVectorType(ty(A), true)
+ ? opSplit(A, dl, DAG)
+ : std::make_pair(A, A);
+ // Special case for type operand.
+ if (Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+ if (const auto *N = dyn_cast<const VTSDNode>(A.getNode()))
+ P = SplitVTNode(N);
+ }
+ OpsL.push_back(P.first);
+ OpsH.push_back(P.second);
+ }
+
+ MVT ResTy = ty(Op);
+ MVT HalfTy = typeSplit(ResTy).first;
+ SDValue L = DAG.getNode(Op.getOpcode(), dl, HalfTy, OpsL);
+ SDValue H = DAG.getNode(Op.getOpcode(), dl, HalfTy, OpsH);
+ SDValue S = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResTy, L, H);
+ return S;
+}
+
+SDValue
+HexagonTargetLowering::SplitHvxMemOp(SDValue Op, SelectionDAG &DAG) const {
+ LSBaseSDNode *BN = cast<LSBaseSDNode>(Op.getNode());
+ assert(BN->isUnindexed());
+ MVT MemTy = BN->getMemoryVT().getSimpleVT();
+ if (!isHvxPairTy(MemTy))
+ return Op;
+
+ const SDLoc &dl(Op);
+ unsigned HwLen = Subtarget.getVectorLength();
+ MVT SingleTy = typeSplit(MemTy).first;
+ SDValue Chain = BN->getChain();
+ SDValue Base0 = BN->getBasePtr();
+ SDValue Base1 = DAG.getMemBasePlusOffset(Base0, HwLen, dl);
+
+ MachineMemOperand *MOp0 = nullptr, *MOp1 = nullptr;
+ if (MachineMemOperand *MMO = BN->getMemOperand()) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MOp0 = MF.getMachineMemOperand(MMO, 0, HwLen);
+ MOp1 = MF.getMachineMemOperand(MMO, HwLen, HwLen);
+ }
+
+ unsigned MemOpc = BN->getOpcode();
+ SDValue NewOp;
+
+ if (MemOpc == ISD::LOAD) {
+ SDValue Load0 = DAG.getLoad(SingleTy, dl, Chain, Base0, MOp0);
+ SDValue Load1 = DAG.getLoad(SingleTy, dl, Chain, Base1, MOp1);
+ NewOp = DAG.getMergeValues(
+ { DAG.getNode(ISD::CONCAT_VECTORS, dl, MemTy, Load0, Load1),
+ DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ Load0.getValue(1), Load1.getValue(1)) }, dl);
+ } else {
+ assert(MemOpc == ISD::STORE);
+ VectorPair Vals = opSplit(cast<StoreSDNode>(Op)->getValue(), dl, DAG);
+ SDValue Store0 = DAG.getStore(Chain, dl, Vals.first, Base0, MOp0);
+ SDValue Store1 = DAG.getStore(Chain, dl, Vals.second, Base1, MOp1);
+ NewOp = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store0, Store1);
+ }
+
+ return NewOp;
+}
+
+SDValue
+HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const {
+ unsigned Opc = Op.getOpcode();
+ bool IsPairOp = isHvxPairTy(ty(Op)) ||
+ llvm::any_of(Op.getNode()->ops(), [this] (SDValue V) {
+ return isHvxPairTy(ty(V));
+ });
+
+ if (IsPairOp) {
+ switch (Opc) {
+ default:
+ break;
+ case ISD::LOAD:
+ case ISD::STORE:
+ return SplitHvxMemOp(Op, DAG);
+ case ISD::CTPOP:
+ case ISD::CTLZ:
+ case ISD::CTTZ:
+ case ISD::MUL:
+ case ISD::MULHS:
+ case ISD::MULHU:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::SRA:
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SETCC:
+ case ISD::VSELECT:
+ case ISD::SIGN_EXTEND_INREG:
+ return SplitHvxPairOp(Op, DAG);
+ }
+ }
+
+ switch (Opc) {
+ default:
+ break;
+ case ISD::BUILD_VECTOR: return LowerHvxBuildVector(Op, DAG);
+ case ISD::CONCAT_VECTORS: return LowerHvxConcatVectors(Op, DAG);
+ case ISD::INSERT_SUBVECTOR: return LowerHvxInsertSubvector(Op, DAG);
+ case ISD::INSERT_VECTOR_ELT: return LowerHvxInsertElement(Op, DAG);
+ case ISD::EXTRACT_SUBVECTOR: return LowerHvxExtractSubvector(Op, DAG);
+ case ISD::EXTRACT_VECTOR_ELT: return LowerHvxExtractElement(Op, DAG);
+
+ case ISD::ANY_EXTEND: return LowerHvxAnyExt(Op, DAG);
+ case ISD::SIGN_EXTEND: return LowerHvxSignExt(Op, DAG);
+ case ISD::ZERO_EXTEND: return LowerHvxZeroExt(Op, DAG);
+ case ISD::CTTZ: return LowerHvxCttz(Op, DAG);
+ case ISD::SRA:
+ case ISD::SHL:
+ case ISD::SRL: return LowerHvxShift(Op, DAG);
+ case ISD::MUL: return LowerHvxMul(Op, DAG);
+ case ISD::MULHS:
+ case ISD::MULHU: return LowerHvxMulh(Op, DAG);
+ case ISD::ANY_EXTEND_VECTOR_INREG: return LowerHvxExtend(Op, DAG);
+ case ISD::SETCC:
+ case ISD::INTRINSIC_VOID: return Op;
+ // Unaligned loads will be handled by the default lowering.
+ case ISD::LOAD: return SDValue();
+ }
+#ifndef NDEBUG
+ Op.dumpr(&DAG);
+#endif
+ llvm_unreachable("Unhandled HVX operation");
+}
+
+bool
+HexagonTargetLowering::isHvxOperation(SDValue Op) const {
+ // If the type of the result, or any operand type are HVX vector types,
+ // this is an HVX operation.
+ return Subtarget.isHVXVectorType(ty(Op), true) ||
+ llvm::any_of(Op.getNode()->ops(),
+ [this] (SDValue V) {
+ return Subtarget.isHVXVectorType(ty(V), true);
+ });
+}
diff --git a/lib/Target/Hexagon/HexagonInstrFormatsV60.td b/lib/Target/Hexagon/HexagonInstrFormatsV60.td
index 14bda0e0107d..1347a655353f 100644
--- a/lib/Target/Hexagon/HexagonInstrFormatsV60.td
+++ b/lib/Target/Hexagon/HexagonInstrFormatsV60.td
@@ -19,4 +19,4 @@ class CVI_VA_Resource<dag outs, dag ins, string asmstr,
list<dag> pattern = [], string cstr = "",
InstrItinClass itin = CVI_VA>
: InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VA>,
- OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+ OpcodeHexagon, Requires<[HasV60, UseHVX]>;
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index b82a0157e81f..6019c7c5d024 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -34,7 +34,6 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
@@ -49,6 +48,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -134,7 +134,7 @@ static unsigned nonDbgMICount(MachineBasicBlock::const_instr_iterator MIB,
MachineBasicBlock::const_instr_iterator MIE) {
unsigned Count = 0;
for (; MIB != MIE; ++MIB) {
- if (!MIB->isDebugValue())
+ if (!MIB->isDebugInstr())
++Count;
}
return Count;
@@ -144,9 +144,9 @@ static unsigned nonDbgMICount(MachineBasicBlock::const_instr_iterator MIB,
/// On Hexagon, we have two instructions used to set-up the hardware loop
/// (LOOP0, LOOP1) with corresponding endloop (ENDLOOP0, ENDLOOP1) instructions
/// to indicate the end of a loop.
-static MachineInstr *findLoopInstr(MachineBasicBlock *BB, unsigned EndLoopOp,
- MachineBasicBlock *TargetBB,
- SmallPtrSet<MachineBasicBlock *, 8> &Visited) {
+MachineInstr *HexagonInstrInfo::findLoopInstr(MachineBasicBlock *BB,
+ unsigned EndLoopOp, MachineBasicBlock *TargetBB,
+ SmallPtrSet<MachineBasicBlock *, 8> &Visited) const {
unsigned LOOPi;
unsigned LOOPr;
if (EndLoopOp == Hexagon::ENDLOOP0) {
@@ -240,41 +240,41 @@ static bool isDuplexPairMatch(unsigned Ga, unsigned Gb) {
unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
switch (MI.getOpcode()) {
- default:
- break;
- case Hexagon::L2_loadri_io:
- case Hexagon::L2_loadrd_io:
- case Hexagon::V6_vL32b_ai:
- case Hexagon::V6_vL32b_nt_ai:
- case Hexagon::V6_vL32Ub_ai:
- case Hexagon::LDriw_pred:
- case Hexagon::LDriw_mod:
- case Hexagon::PS_vloadrq_ai:
- case Hexagon::PS_vloadrw_ai:
- case Hexagon::PS_vloadrw_nt_ai: {
- const MachineOperand OpFI = MI.getOperand(1);
- if (!OpFI.isFI())
- return 0;
- const MachineOperand OpOff = MI.getOperand(2);
- if (!OpOff.isImm() || OpOff.getImm() != 0)
- return 0;
- FrameIndex = OpFI.getIndex();
- return MI.getOperand(0).getReg();
- }
+ default:
+ break;
+ case Hexagon::L2_loadri_io:
+ case Hexagon::L2_loadrd_io:
+ case Hexagon::V6_vL32b_ai:
+ case Hexagon::V6_vL32b_nt_ai:
+ case Hexagon::V6_vL32Ub_ai:
+ case Hexagon::LDriw_pred:
+ case Hexagon::LDriw_ctr:
+ case Hexagon::PS_vloadrq_ai:
+ case Hexagon::PS_vloadrw_ai:
+ case Hexagon::PS_vloadrw_nt_ai: {
+ const MachineOperand OpFI = MI.getOperand(1);
+ if (!OpFI.isFI())
+ return 0;
+ const MachineOperand OpOff = MI.getOperand(2);
+ if (!OpOff.isImm() || OpOff.getImm() != 0)
+ return 0;
+ FrameIndex = OpFI.getIndex();
+ return MI.getOperand(0).getReg();
+ }
- case Hexagon::L2_ploadrit_io:
- case Hexagon::L2_ploadrif_io:
- case Hexagon::L2_ploadrdt_io:
- case Hexagon::L2_ploadrdf_io: {
- const MachineOperand OpFI = MI.getOperand(2);
- if (!OpFI.isFI())
- return 0;
- const MachineOperand OpOff = MI.getOperand(3);
- if (!OpOff.isImm() || OpOff.getImm() != 0)
- return 0;
- FrameIndex = OpFI.getIndex();
- return MI.getOperand(0).getReg();
- }
+ case Hexagon::L2_ploadrit_io:
+ case Hexagon::L2_ploadrif_io:
+ case Hexagon::L2_ploadrdt_io:
+ case Hexagon::L2_ploadrdf_io: {
+ const MachineOperand OpFI = MI.getOperand(2);
+ if (!OpFI.isFI())
+ return 0;
+ const MachineOperand OpOff = MI.getOperand(3);
+ if (!OpOff.isImm() || OpOff.getImm() != 0)
+ return 0;
+ FrameIndex = OpFI.getIndex();
+ return MI.getOperand(0).getReg();
+ }
}
return 0;
@@ -288,48 +288,84 @@ unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
unsigned HexagonInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
switch (MI.getOpcode()) {
- default:
- break;
- case Hexagon::S2_storerb_io:
- case Hexagon::S2_storerh_io:
- case Hexagon::S2_storeri_io:
- case Hexagon::S2_storerd_io:
- case Hexagon::V6_vS32b_ai:
- case Hexagon::V6_vS32Ub_ai:
- case Hexagon::STriw_pred:
- case Hexagon::STriw_mod:
- case Hexagon::PS_vstorerq_ai:
- case Hexagon::PS_vstorerw_ai: {
- const MachineOperand &OpFI = MI.getOperand(0);
- if (!OpFI.isFI())
- return 0;
- const MachineOperand &OpOff = MI.getOperand(1);
- if (!OpOff.isImm() || OpOff.getImm() != 0)
- return 0;
- FrameIndex = OpFI.getIndex();
- return MI.getOperand(2).getReg();
+ default:
+ break;
+ case Hexagon::S2_storerb_io:
+ case Hexagon::S2_storerh_io:
+ case Hexagon::S2_storeri_io:
+ case Hexagon::S2_storerd_io:
+ case Hexagon::V6_vS32b_ai:
+ case Hexagon::V6_vS32Ub_ai:
+ case Hexagon::STriw_pred:
+ case Hexagon::STriw_ctr:
+ case Hexagon::PS_vstorerq_ai:
+ case Hexagon::PS_vstorerw_ai: {
+ const MachineOperand &OpFI = MI.getOperand(0);
+ if (!OpFI.isFI())
+ return 0;
+ const MachineOperand &OpOff = MI.getOperand(1);
+ if (!OpOff.isImm() || OpOff.getImm() != 0)
+ return 0;
+ FrameIndex = OpFI.getIndex();
+ return MI.getOperand(2).getReg();
+ }
+
+ case Hexagon::S2_pstorerbt_io:
+ case Hexagon::S2_pstorerbf_io:
+ case Hexagon::S2_pstorerht_io:
+ case Hexagon::S2_pstorerhf_io:
+ case Hexagon::S2_pstorerit_io:
+ case Hexagon::S2_pstorerif_io:
+ case Hexagon::S2_pstorerdt_io:
+ case Hexagon::S2_pstorerdf_io: {
+ const MachineOperand &OpFI = MI.getOperand(1);
+ if (!OpFI.isFI())
+ return 0;
+ const MachineOperand &OpOff = MI.getOperand(2);
+ if (!OpOff.isImm() || OpOff.getImm() != 0)
+ return 0;
+ FrameIndex = OpFI.getIndex();
+ return MI.getOperand(3).getReg();
+ }
}
- case Hexagon::S2_pstorerbt_io:
- case Hexagon::S2_pstorerbf_io:
- case Hexagon::S2_pstorerht_io:
- case Hexagon::S2_pstorerhf_io:
- case Hexagon::S2_pstorerit_io:
- case Hexagon::S2_pstorerif_io:
- case Hexagon::S2_pstorerdt_io:
- case Hexagon::S2_pstorerdf_io: {
- const MachineOperand &OpFI = MI.getOperand(1);
- if (!OpFI.isFI())
- return 0;
- const MachineOperand &OpOff = MI.getOperand(2);
- if (!OpOff.isImm() || OpOff.getImm() != 0)
- return 0;
- FrameIndex = OpFI.getIndex();
- return MI.getOperand(3).getReg();
+ return 0;
+}
+
+/// Check if the instruction, or the bundle of instructions it heads, contains
+/// a load from a stack slot. If so, return the frame index and the machine
+/// memory operand of that instruction.
+bool HexagonInstrInfo::hasLoadFromStackSlot(const MachineInstr &MI,
+ const MachineMemOperand *&MMO,
+ int &FrameIndex) const {
+ if (MI.isBundle()) {
+ const MachineBasicBlock *MBB = MI.getParent();
+ MachineBasicBlock::const_instr_iterator MII = MI.getIterator();
+ for (++MII; MII != MBB->instr_end() && MII->isInsideBundle(); ++MII)
+ if (TargetInstrInfo::hasLoadFromStackSlot(*MII, MMO, FrameIndex))
+ return true;
+ return false;
}
+
+ return TargetInstrInfo::hasLoadFromStackSlot(MI, MMO, FrameIndex);
+}
+
+/// Check if the instruction, or the bundle of instructions it heads, contains
+/// a store to a stack slot. If so, return the frame index and the machine
+/// memory operand of that instruction.
+bool HexagonInstrInfo::hasStoreToStackSlot(const MachineInstr &MI,
+ const MachineMemOperand *&MMO,
+ int &FrameIndex) const {
+ if (MI.isBundle()) {
+ const MachineBasicBlock *MBB = MI.getParent();
+ MachineBasicBlock::const_instr_iterator MII = MI.getIterator();
+ for (++MII; MII != MBB->instr_end() && MII->isInsideBundle(); ++MII)
+ if (TargetInstrInfo::hasStoreToStackSlot(*MII, MMO, FrameIndex))
+ return true;
+ return false;
}
- return 0;
+ return TargetInstrInfo::hasStoreToStackSlot(MI, MMO, FrameIndex);
}
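
The two overrides above handle bundles by walking the instructions that follow the bundle header (those for which isInsideBundle() is true) and delegating each member to the TargetInstrInfo default query. A standalone model of that walk, with a toy record standing in for MachineInstr and a boolean flag standing in for the base-class stack-slot query (nothing here is LLVM API):

#include <cassert>
#include <vector>

struct Instr {
  bool InsideBundle;     // true for members of the bundle that precedes them
  bool TouchesStackSlot; // stand-in for the per-instruction stack-slot query
};

// Returns true if any instruction inside the bundle headed at HeaderIdx
// accesses a stack slot, mirroring the loop in the overrides above.
static bool bundleHasStackAccess(const std::vector<Instr> &Block, size_t HeaderIdx) {
  for (size_t I = HeaderIdx + 1; I < Block.size() && Block[I].InsideBundle; ++I)
    if (Block[I].TouchesStackSlot)
      return true;
  return false;
}

int main() {
  std::vector<Instr> Block = {{false, false},  // bundle header
                              {true, false},   // member with no stack access
                              {true, true},    // member that reloads from a slot
                              {false, false}}; // next, unbundled instruction
  assert(bundleHasStackAccess(Block, 0));
  return 0;
}
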
/// This function can analyze one/two way branching only and should (mostly) be
@@ -383,7 +419,7 @@ bool HexagonInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
I = MBB.instr_end();
--I;
- while (I->isDebugValue()) {
+ while (I->isDebugInstr()) {
if (I == MBB.instr_begin())
return false;
--I;
@@ -394,7 +430,7 @@ bool HexagonInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
// Delete the J2_jump if it's equivalent to a fall-through.
if (AllowModify && JumpToBlock &&
MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
- DEBUG(dbgs() << "\nErasing the jump to successor block\n";);
+ LLVM_DEBUG(dbgs() << "\nErasing the jump to successor block\n";);
I->eraseFromParent();
I = MBB.instr_end();
if (I == MBB.instr_begin())
@@ -463,8 +499,8 @@ bool HexagonInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
Cond.push_back(LastInst->getOperand(1));
return false;
}
- DEBUG(dbgs() << "\nCant analyze " << printMBBReference(MBB)
- << " with one jump\n";);
+    LLVM_DEBUG(dbgs() << "\nCan't analyze " << printMBBReference(MBB)
+ << " with one jump\n";);
// Otherwise, don't know what this is.
return true;
}
@@ -511,8 +547,8 @@ bool HexagonInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
FBB = LastInst->getOperand(0).getMBB();
return false;
}
- DEBUG(dbgs() << "\nCant analyze " << printMBBReference(MBB)
- << " with two jumps";);
+    LLVM_DEBUG(dbgs() << "\nCan't analyze " << printMBBReference(MBB)
+ << " with two jumps";);
// Otherwise, can't handle this.
return true;
}
@@ -521,12 +557,12 @@ unsigned HexagonInstrInfo::removeBranch(MachineBasicBlock &MBB,
int *BytesRemoved) const {
assert(!BytesRemoved && "code size not handled");
- DEBUG(dbgs() << "\nRemoving branches out of " << printMBBReference(MBB));
+ LLVM_DEBUG(dbgs() << "\nRemoving branches out of " << printMBBReference(MBB));
MachineBasicBlock::iterator I = MBB.end();
unsigned Count = 0;
while (I != MBB.begin()) {
--I;
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
// Only removing branches from end of MBB.
if (!I->isBranch())
@@ -593,7 +629,8 @@ unsigned HexagonInstrInfo::insertBranch(MachineBasicBlock &MBB,
// (ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset)
// (ins IntRegs:$src1, u5Imm:$src2, brtarget:$offset)
unsigned Flags1 = getUndefRegState(Cond[1].isUndef());
- DEBUG(dbgs() << "\nInserting NVJump for " << printMBBReference(MBB););
+ LLVM_DEBUG(dbgs() << "\nInserting NVJump for "
+ << printMBBReference(MBB););
if (Cond[2].isReg()) {
unsigned Flags2 = getUndefRegState(Cond[2].isUndef());
BuildMI(&MBB, DL, get(BccOpc)).addReg(Cond[1].getReg(), Flags1).
@@ -864,7 +901,7 @@ void HexagonInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
.addFrameIndex(FI).addImm(0)
.addReg(SrcReg, KillFlag).addMemOperand(MMO);
} else if (Hexagon::ModRegsRegClass.hasSubClassEq(RC)) {
- BuildMI(MBB, I, DL, get(Hexagon::STriw_mod))
+ BuildMI(MBB, I, DL, get(Hexagon::STriw_ctr))
.addFrameIndex(FI).addImm(0)
.addReg(SrcReg, KillFlag).addMemOperand(MMO);
} else if (Hexagon::HvxQRRegClass.hasSubClassEq(RC)) {
@@ -926,7 +963,7 @@ void HexagonInstrInfo::loadRegFromStackSlot(
BuildMI(MBB, I, DL, get(Hexagon::LDriw_pred), DestReg)
.addFrameIndex(FI).addImm(0).addMemOperand(MMO);
} else if (Hexagon::ModRegsRegClass.hasSubClassEq(RC)) {
- BuildMI(MBB, I, DL, get(Hexagon::LDriw_mod), DestReg)
+ BuildMI(MBB, I, DL, get(Hexagon::LDriw_ctr), DestReg)
.addFrameIndex(FI).addImm(0).addMemOperand(MMO);
} else if (Hexagon::HvxQRRegClass.hasSubClassEq(RC)) {
BuildMI(MBB, I, DL, get(Hexagon::PS_vloadrq_ai), DestReg)
@@ -980,6 +1017,20 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
DebugLoc DL = MI.getDebugLoc();
unsigned Opc = MI.getOpcode();
+ auto RealCirc = [&](unsigned Opc, bool HasImm, unsigned MxOp) {
+ unsigned Mx = MI.getOperand(MxOp).getReg();
+ unsigned CSx = (Mx == Hexagon::M0 ? Hexagon::CS0 : Hexagon::CS1);
+ BuildMI(MBB, MI, DL, get(Hexagon::A2_tfrrcr), CSx)
+ .add(MI.getOperand((HasImm ? 5 : 4)));
+ auto MIB = BuildMI(MBB, MI, DL, get(Opc)).add(MI.getOperand(0))
+ .add(MI.getOperand(1)).add(MI.getOperand(2)).add(MI.getOperand(3));
+ if (HasImm)
+ MIB.add(MI.getOperand(4));
+ MIB.addReg(CSx, RegState::Implicit);
+ MBB.erase(MI);
+ return true;
+ };
+
switch (Opc) {
case TargetOpcode::COPY: {
MachineOperand &MD = MI.getOperand(0);
@@ -1088,6 +1139,28 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MBB.erase(MI);
return true;
}
+ case Hexagon::PS_qtrue: {
+ BuildMI(MBB, MI, DL, get(Hexagon::V6_veqw), MI.getOperand(0).getReg())
+ .addReg(Hexagon::V0, RegState::Undef)
+ .addReg(Hexagon::V0, RegState::Undef);
+ MBB.erase(MI);
+ return true;
+ }
+ case Hexagon::PS_qfalse: {
+ BuildMI(MBB, MI, DL, get(Hexagon::V6_vgtw), MI.getOperand(0).getReg())
+ .addReg(Hexagon::V0, RegState::Undef)
+ .addReg(Hexagon::V0, RegState::Undef);
+ MBB.erase(MI);
+ return true;
+ }
+ case Hexagon::PS_vdd0: {
+ unsigned Vd = MI.getOperand(0).getReg();
+ BuildMI(MBB, MI, DL, get(Hexagon::V6_vsubw_dv), Vd)
+ .addReg(Vd, RegState::Undef)
+ .addReg(Vd, RegState::Undef);
+ MBB.erase(MI);
+ return true;
+ }
case Hexagon::PS_vmulw: {
// Expand a 64-bit vector multiply into 2 32-bit scalar multiplies.
unsigned DstReg = MI.getOperand(0).getReg();
@@ -1344,6 +1417,50 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MBB.erase(MI);
return true;
+ case Hexagon::PS_loadrub_pci:
+ return RealCirc(Hexagon::L2_loadrub_pci, /*HasImm*/true, /*MxOp*/4);
+ case Hexagon::PS_loadrb_pci:
+ return RealCirc(Hexagon::L2_loadrb_pci, /*HasImm*/true, /*MxOp*/4);
+ case Hexagon::PS_loadruh_pci:
+ return RealCirc(Hexagon::L2_loadruh_pci, /*HasImm*/true, /*MxOp*/4);
+ case Hexagon::PS_loadrh_pci:
+ return RealCirc(Hexagon::L2_loadrh_pci, /*HasImm*/true, /*MxOp*/4);
+ case Hexagon::PS_loadri_pci:
+ return RealCirc(Hexagon::L2_loadri_pci, /*HasImm*/true, /*MxOp*/4);
+ case Hexagon::PS_loadrd_pci:
+ return RealCirc(Hexagon::L2_loadrd_pci, /*HasImm*/true, /*MxOp*/4);
+ case Hexagon::PS_loadrub_pcr:
+ return RealCirc(Hexagon::L2_loadrub_pcr, /*HasImm*/false, /*MxOp*/3);
+ case Hexagon::PS_loadrb_pcr:
+ return RealCirc(Hexagon::L2_loadrb_pcr, /*HasImm*/false, /*MxOp*/3);
+ case Hexagon::PS_loadruh_pcr:
+ return RealCirc(Hexagon::L2_loadruh_pcr, /*HasImm*/false, /*MxOp*/3);
+ case Hexagon::PS_loadrh_pcr:
+ return RealCirc(Hexagon::L2_loadrh_pcr, /*HasImm*/false, /*MxOp*/3);
+ case Hexagon::PS_loadri_pcr:
+ return RealCirc(Hexagon::L2_loadri_pcr, /*HasImm*/false, /*MxOp*/3);
+ case Hexagon::PS_loadrd_pcr:
+ return RealCirc(Hexagon::L2_loadrd_pcr, /*HasImm*/false, /*MxOp*/3);
+ case Hexagon::PS_storerb_pci:
+ return RealCirc(Hexagon::S2_storerb_pci, /*HasImm*/true, /*MxOp*/3);
+ case Hexagon::PS_storerh_pci:
+ return RealCirc(Hexagon::S2_storerh_pci, /*HasImm*/true, /*MxOp*/3);
+ case Hexagon::PS_storerf_pci:
+ return RealCirc(Hexagon::S2_storerf_pci, /*HasImm*/true, /*MxOp*/3);
+ case Hexagon::PS_storeri_pci:
+ return RealCirc(Hexagon::S2_storeri_pci, /*HasImm*/true, /*MxOp*/3);
+ case Hexagon::PS_storerd_pci:
+ return RealCirc(Hexagon::S2_storerd_pci, /*HasImm*/true, /*MxOp*/3);
+ case Hexagon::PS_storerb_pcr:
+ return RealCirc(Hexagon::S2_storerb_pcr, /*HasImm*/false, /*MxOp*/2);
+ case Hexagon::PS_storerh_pcr:
+ return RealCirc(Hexagon::S2_storerh_pcr, /*HasImm*/false, /*MxOp*/2);
+ case Hexagon::PS_storerf_pcr:
+ return RealCirc(Hexagon::S2_storerf_pcr, /*HasImm*/false, /*MxOp*/2);
+ case Hexagon::PS_storeri_pcr:
+ return RealCirc(Hexagon::S2_storeri_pcr, /*HasImm*/false, /*MxOp*/2);
+ case Hexagon::PS_storerd_pcr:
+ return RealCirc(Hexagon::S2_storerd_pcr, /*HasImm*/false, /*MxOp*/2);
}
return false;
@@ -1393,7 +1510,7 @@ bool HexagonInstrInfo::PredicateInstruction(
MachineInstr &MI, ArrayRef<MachineOperand> Cond) const {
if (Cond.empty() || isNewValueJump(Cond[0].getImm()) ||
isEndLoopN(Cond[0].getImm())) {
- DEBUG(dbgs() << "\nCannot predicate:"; MI.dump(););
+ LLVM_DEBUG(dbgs() << "\nCannot predicate:"; MI.dump(););
return false;
}
int Opc = MI.getOpcode();
@@ -1483,7 +1600,7 @@ bool HexagonInstrInfo::isPredicable(const MachineInstr &MI) const {
}
// HVX loads are not predicable on v60, but are on v62.
- if (!Subtarget.hasV62TOps()) {
+ if (!Subtarget.hasV62Ops()) {
switch (MI.getOpcode()) {
case Hexagon::V6_vL32b_ai:
case Hexagon::V6_vL32b_pi:
@@ -1518,7 +1635,7 @@ bool HexagonInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
// considered a scheduling hazard, which is wrong. It should be the actual
// instruction preceding the dbg_value instruction(s), just like it is
// when debug info is not present.
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
return false;
// Throwing call is a boundary.
@@ -1586,7 +1703,7 @@ HexagonInstrInfo::CreateTargetPostRAHazardRecognizer(
return TargetInstrInfo::CreateTargetPostRAHazardRecognizer(II, DAG);
}
-/// \brief For a comparison instruction, return the source registers in
+/// For a comparison instruction, return the source registers in
/// \p SrcReg and \p SrcReg2 if having two register operands, and the value it
/// compares against in CmpValue. Return true if the comparison instruction
/// can be analyzed.
@@ -1836,6 +1953,10 @@ bool HexagonInstrInfo::isAccumulator(const MachineInstr &MI) const {
return((F >> HexagonII::AccumulatorPos) & HexagonII::AccumulatorMask);
}
+bool HexagonInstrInfo::isBaseImmOffset(const MachineInstr &MI) const {
+ return getAddrMode(MI) == HexagonII::BaseImmOffset;
+}
+
bool HexagonInstrInfo::isComplex(const MachineInstr &MI) const {
return !isTC1(MI) && !isTC2Early(MI) && !MI.getDesc().mayLoad() &&
!MI.getDesc().mayStore() &&
@@ -2139,13 +2260,13 @@ bool HexagonInstrInfo::isLateInstrFeedsEarlyInstr(const MachineInstr &LRMI,
bool isLate = isLateResultInstr(LRMI);
bool isEarly = isEarlySourceInstr(ESMI);
- DEBUG(dbgs() << "V60" << (isLate ? "-LR " : " -- "));
- DEBUG(LRMI.dump());
- DEBUG(dbgs() << "V60" << (isEarly ? "-ES " : " -- "));
- DEBUG(ESMI.dump());
+ LLVM_DEBUG(dbgs() << "V60" << (isLate ? "-LR " : " -- "));
+ LLVM_DEBUG(LRMI.dump());
+ LLVM_DEBUG(dbgs() << "V60" << (isEarly ? "-ES " : " -- "));
+ LLVM_DEBUG(ESMI.dump());
if (isLate && isEarly) {
- DEBUG(dbgs() << "++Is Late Result feeding Early Source\n");
+ LLVM_DEBUG(dbgs() << "++Is Late Result feeding Early Source\n");
return true;
}
@@ -2472,6 +2593,13 @@ bool HexagonInstrInfo::isValidAutoIncImm(const EVT VT, int Offset) const {
case MVT::i16:
case MVT::i32:
case MVT::i64:
+ case MVT::f32:
+ case MVT::f64:
+ case MVT::v2i16:
+ case MVT::v2i32:
+ case MVT::v4i8:
+ case MVT::v4i16:
+ case MVT::v8i8:
return isInt<4>(Count);
// For HVX vectors the auto-inc is s3
case MVT::v64i8:
@@ -2599,8 +2727,8 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,
// any size. Later pass knows how to handle it.
case Hexagon::STriw_pred:
case Hexagon::LDriw_pred:
- case Hexagon::STriw_mod:
- case Hexagon::LDriw_mod:
+ case Hexagon::STriw_ctr:
+ case Hexagon::LDriw_ctr:
return true;
case Hexagon::PS_fi:
@@ -2754,7 +2882,7 @@ bool HexagonInstrInfo::addLatencyToSchedule(const MachineInstr &MI1,
return false;
}
-/// \brief Get the base register and byte offset of a load/store instr.
+/// Get the base register and byte offset of a load/store instr.
bool HexagonInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt,
unsigned &BaseReg, int64_t &Offset, const TargetRegisterInfo *TRI)
const {
@@ -2765,7 +2893,7 @@ bool HexagonInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt,
return BaseReg != 0;
}
-/// \brief Can these instructions execute at the same time in a bundle.
+/// Can these instructions execute at the same time in a bundle.
bool HexagonInstrInfo::canExecuteInBundle(const MachineInstr &First,
const MachineInstr &Second) const {
if (Second.mayStore() && First.getOpcode() == Hexagon::S2_allocframe) {
@@ -2860,11 +2988,14 @@ bool HexagonInstrInfo::hasUncondBranch(const MachineBasicBlock *B)
bool HexagonInstrInfo::mayBeCurLoad(const MachineInstr &MI) const {
const uint64_t F = MI.getDesc().TSFlags;
return ((F >> HexagonII::mayCVLoadPos) & HexagonII::mayCVLoadMask) &&
- Subtarget.hasV60TOps();
+ Subtarget.hasV60Ops();
}
// Returns true, if a ST insn can be promoted to a new-value store.
bool HexagonInstrInfo::mayBeNewStore(const MachineInstr &MI) const {
+ if (MI.mayStore() && !Subtarget.useNewValueStores())
+ return false;
+
const uint64_t F = MI.getDesc().TSFlags;
return (F >> HexagonII::mayNVStorePos) & HexagonII::mayNVStoreMask;
}
@@ -2917,10 +3048,29 @@ bool HexagonInstrInfo::predCanBeUsedAsDotNew(const MachineInstr &MI,
return false;
}
- // Hexagon Programmer's Reference says that decbin, memw_locked, and
- // memd_locked cannot be used as .new as well,
- // but we don't seem to have these instructions defined.
- return MI.getOpcode() != Hexagon::A4_tlbmatch;
+  // Instructions that produce a late predicate cannot be used as sources of
+  // dot-new.
+ switch (MI.getOpcode()) {
+ case Hexagon::A4_addp_c:
+ case Hexagon::A4_subp_c:
+ case Hexagon::A4_tlbmatch:
+ case Hexagon::A5_ACS:
+ case Hexagon::F2_sfinvsqrta:
+ case Hexagon::F2_sfrecipa:
+ case Hexagon::J2_endloop0:
+ case Hexagon::J2_endloop01:
+ case Hexagon::J2_ploop1si:
+ case Hexagon::J2_ploop1sr:
+ case Hexagon::J2_ploop2si:
+ case Hexagon::J2_ploop2sr:
+ case Hexagon::J2_ploop3si:
+ case Hexagon::J2_ploop3sr:
+ case Hexagon::S2_cabacdecbin:
+ case Hexagon::S2_storew_locked:
+ case Hexagon::S4_stored_locked:
+ return false;
+ }
+ return true;
}
bool HexagonInstrInfo::PredOpcodeHasJMP_c(unsigned Opcode) const {
@@ -3047,7 +3197,7 @@ SmallVector<MachineInstr*, 2> HexagonInstrInfo::getBranchingInstrs(
I = MBB.instr_end();
--I;
- while (I->isDebugValue()) {
+ while (I->isDebugInstr()) {
if (I == MBB.instr_begin())
return Jumpers;
--I;
@@ -3496,7 +3646,7 @@ int HexagonInstrInfo::getDotOldOp(const MachineInstr &MI) const {
assert(NewOp >= 0 && "Couldn't change new-value store to its old form.");
}
- if (Subtarget.hasV60TOps())
+ if (Subtarget.hasV60Ops())
return NewOp;
// Subtargets prior to V60 didn't support 'taken' forms of predicated jumps.
@@ -3893,9 +4043,9 @@ int HexagonInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
const HexagonRegisterInfo &HRI = *Subtarget.getRegisterInfo();
// Get DefIdx and UseIdx for super registers.
- MachineOperand DefMO = DefMI.getOperand(DefIdx);
+ const MachineOperand &DefMO = DefMI.getOperand(DefIdx);
- if (HRI.isPhysicalRegister(DefMO.getReg())) {
+ if (DefMO.isReg() && HRI.isPhysicalRegister(DefMO.getReg())) {
if (DefMO.isImplicit()) {
for (MCSuperRegIterator SR(DefMO.getReg(), &HRI); SR.isValid(); ++SR) {
int Idx = DefMI.findRegisterDefOperandIdx(*SR, false, false, &HRI);
@@ -3906,7 +4056,7 @@ int HexagonInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
}
}
- MachineOperand UseMO = UseMI.getOperand(UseIdx);
+ const MachineOperand &UseMO = UseMI.getOperand(UseIdx);
if (UseMO.isImplicit()) {
for (MCSuperRegIterator SR(UseMO.getReg(), &HRI); SR.isValid(); ++SR) {
int Idx = UseMI.findRegisterUseOperandIdx(*SR, false, &HRI);
@@ -4057,7 +4207,7 @@ bool HexagonInstrInfo::getPredReg(ArrayRef<MachineOperand> Cond,
return false;
assert(Cond.size() == 2);
if (isNewValueJump(Cond[0].getImm()) || Cond[1].isMBB()) {
- DEBUG(dbgs() << "No predregs for new-value jumps/endloop");
+ LLVM_DEBUG(dbgs() << "No predregs for new-value jumps/endloop");
return false;
}
PredReg = Cond[1].getReg();
@@ -4084,7 +4234,7 @@ short HexagonInstrInfo::getRegForm(const MachineInstr &MI) const {
// use a constant extender, which requires another 4 bytes.
// For debug instructions and prolog labels, return 0.
unsigned HexagonInstrInfo::getSize(const MachineInstr &MI) const {
- if (MI.isDebugValue() || MI.isPosition())
+ if (MI.isDebugInstr() || MI.isPosition())
return 0;
unsigned Size = MI.getDesc().getSize();
@@ -4159,9 +4309,9 @@ void HexagonInstrInfo::immediateExtend(MachineInstr &MI) const {
bool HexagonInstrInfo::invertAndChangeJumpTarget(
MachineInstr &MI, MachineBasicBlock *NewTarget) const {
- DEBUG(dbgs() << "\n[invertAndChangeJumpTarget] to "
- << printMBBReference(*NewTarget);
- MI.dump(););
+ LLVM_DEBUG(dbgs() << "\n[invertAndChangeJumpTarget] to "
+ << printMBBReference(*NewTarget);
+ MI.dump(););
assert(MI.isBranch());
unsigned NewOpcode = getInvertedPredicatedOpcode(MI.getOpcode());
int TargetPos = MI.getNumOperands() - 1;
@@ -4189,8 +4339,9 @@ void HexagonInstrInfo::genAllInsnTimingClasses(MachineFunction &MF) const {
for (unsigned insn = TargetOpcode::GENERIC_OP_END+1;
insn < Hexagon::INSTRUCTION_LIST_END; ++insn) {
NewMI = BuildMI(B, I, DL, get(insn));
- DEBUG(dbgs() << "\n" << getName(NewMI->getOpcode()) <<
- " Class: " << NewMI->getDesc().getSchedClass());
+ LLVM_DEBUG(dbgs() << "\n"
+ << getName(NewMI->getOpcode())
+ << " Class: " << NewMI->getDesc().getSchedClass());
NewMI->eraseFromParent();
}
/* --- The code above is used to generate complete set of Hexagon Insn --- */
@@ -4200,7 +4351,7 @@ void HexagonInstrInfo::genAllInsnTimingClasses(MachineFunction &MF) const {
// p -> NotP
// NotP -> P
bool HexagonInstrInfo::reversePredSense(MachineInstr &MI) const {
- DEBUG(dbgs() << "\nTrying to reverse pred. sense of:"; MI.dump());
+ LLVM_DEBUG(dbgs() << "\nTrying to reverse pred. sense of:"; MI.dump());
MI.setDesc(get(getInvertedPredicatedOpcode(MI.getOpcode())));
return true;
}
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h
index 4530d3b999cc..96b4ffaba02f 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -18,9 +18,9 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Support/MachineValueType.h"
#include <cstdint>
#include <vector>
@@ -66,6 +66,20 @@ public:
unsigned isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
+  /// Check if the instruction, or the bundle of instructions it heads, has a
+  /// load from a stack slot. If so, return the frame index and the machine
+  /// memory operand.
+ bool hasLoadFromStackSlot(const MachineInstr &MI,
+ const MachineMemOperand *&MMO,
+ int &FrameIndex) const override;
+
+  /// Check if the instruction, or the bundle of instructions it heads, has a
+  /// store to a stack slot. If so, return the frame index and the machine
+  /// memory operand.
+ bool hasStoreToStackSlot(const MachineInstr &MI,
+ const MachineMemOperand *&MMO,
+ int &FrameIndex) const override;
+
/// Analyze the branching code at the end of MBB, returning
/// true if it cannot be understood (e.g. it's a switch dispatch or isn't
/// implemented for a target). Upon success, this returns false and returns
@@ -122,8 +136,8 @@ public:
bool analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst,
MachineInstr *&CmpInst) const override;
- /// Generate code to reduce the loop iteration by one and check if the loop is
- /// finished. Return the value/register of the the new loop count. We need
+ /// Generate code to reduce the loop iteration by one and check if the loop
+ /// is finished. Return the value/register of the new loop count. We need
/// this function when peeling off one or more iterations of a loop. This
/// function assumes the nth iteration is peeled first.
unsigned reduceLoopCount(MachineBasicBlock &MBB,
@@ -201,7 +215,7 @@ public:
/// anything was changed.
bool expandPostRAPseudo(MachineInstr &MI) const override;
- /// \brief Get the base register and byte offset of a load/store instr.
+ /// Get the base register and byte offset of a load/store instr.
bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
int64_t &Offset,
const TargetRegisterInfo *TRI) const override;
@@ -332,7 +346,11 @@ public:
/// HexagonInstrInfo specifics.
unsigned createVR(MachineFunction *MF, MVT VT) const;
+ MachineInstr *findLoopInstr(MachineBasicBlock *BB, unsigned EndLoopOp,
+ MachineBasicBlock *TargetBB,
+ SmallPtrSet<MachineBasicBlock *, 8> &Visited) const;
+ bool isBaseImmOffset(const MachineInstr &MI) const;
bool isAbsoluteSet(const MachineInstr &MI) const;
bool isAccumulator(const MachineInstr &MI) const;
bool isAddrModeWithOffset(const MachineInstr &MI) const;
diff --git a/lib/Target/Hexagon/HexagonIntrinsics.td b/lib/Target/Hexagon/HexagonIntrinsics.td
index 1df143de6e80..b25e316709c5 100644
--- a/lib/Target/Hexagon/HexagonIntrinsics.td
+++ b/lib/Target/Hexagon/HexagonIntrinsics.td
@@ -807,7 +807,6 @@ def : T_QII_pat<C2_muxii, int_hexagon_C2_muxii, s32_0ImmPred, s8_0ImmPred>;
// Shift halfword
def : T_R_pat<A2_aslh, int_hexagon_A2_aslh>;
def : T_R_pat<A2_asrh, int_hexagon_A2_asrh>;
-def : T_R_pat<A2_asrh, int_hexagon_SI_to_SXTHI_asrh>;
// Sign/zero extend
def : T_R_pat<A2_sxth, int_hexagon_A2_sxth>;
@@ -1353,11 +1352,11 @@ class T_stb_pat <InstHexagon MI, Intrinsic IntID, PatLeaf Val>
: Pat<(IntID I32:$Rs, Val:$Rt, I32:$Ru),
(MI I32:$Rs, I32:$Ru, Val:$Rt)>;
-def : T_stb_pat <S2_storerh_pbr, int_hexagon_brev_sth, I32>;
-def : T_stb_pat <S2_storerb_pbr, int_hexagon_brev_stb, I32>;
-def : T_stb_pat <S2_storeri_pbr, int_hexagon_brev_stw, I32>;
-def : T_stb_pat <S2_storerf_pbr, int_hexagon_brev_sthhi, I32>;
-def : T_stb_pat <S2_storerd_pbr, int_hexagon_brev_std, I64>;
+def : T_stb_pat <S2_storerh_pbr, int_hexagon_S2_storerh_pbr, I32>;
+def : T_stb_pat <S2_storerb_pbr, int_hexagon_S2_storerb_pbr, I32>;
+def : T_stb_pat <S2_storeri_pbr, int_hexagon_S2_storeri_pbr, I32>;
+def : T_stb_pat <S2_storerf_pbr, int_hexagon_S2_storerf_pbr, I32>;
+def : T_stb_pat <S2_storerd_pbr, int_hexagon_S2_storerd_pbr, I64>;
class T_stc_pat <InstHexagon MI, Intrinsic IntID, PatLeaf Imm, PatLeaf Val>
: Pat<(IntID I32:$Rs, Val:$Rt, I32:$Ru, Imm:$s),
diff --git a/lib/Target/Hexagon/HexagonIntrinsicsV5.td b/lib/Target/Hexagon/HexagonIntrinsicsV5.td
index f27a63e20e61..29f67cffcf89 100644
--- a/lib/Target/Hexagon/HexagonIntrinsicsV5.td
+++ b/lib/Target/Hexagon/HexagonIntrinsicsV5.td
@@ -9,7 +9,7 @@
//Rdd[+]=vrmpybsu(Rss,Rtt)
//Rdd[+]=vrmpybuu(Rss,Rtt)
-let Predicates = [HasV5T] in {
+let Predicates = [HasV5] in {
def : T_PP_pat <M5_vrmpybsu, int_hexagon_M5_vrmpybsu>;
def : T_PP_pat <M5_vrmpybuu, int_hexagon_M5_vrmpybuu>;
diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index fd602257934a..f9ed03909233 100644
--- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -26,6 +26,7 @@
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
@@ -56,7 +57,7 @@
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils.h"
#include <algorithm>
#include <array>
#include <cassert>
@@ -243,8 +244,8 @@ namespace {
const Value *V;
};
- raw_ostream &operator<< (raw_ostream &OS, const PE &P) LLVM_ATTRIBUTE_USED;
- raw_ostream &operator<< (raw_ostream &OS, const PE &P) {
+ LLVM_ATTRIBUTE_USED
+ raw_ostream &operator<<(raw_ostream &OS, const PE &P) {
P.C.print(OS, P.V ? P.V : P.C.Root);
return OS;
}
@@ -608,9 +609,9 @@ namespace {
unsigned getInverseMxN(unsigned QP);
Value *generate(BasicBlock::iterator At, ParsedValues &PV);
- void setupSimplifier();
+ void setupPreSimplifier(Simplifier &S);
+ void setupPostSimplifier(Simplifier &S);
- Simplifier Simp;
Loop *CurLoop;
const DataLayout &DL;
const DominatorTree &DT;
@@ -985,6 +986,7 @@ bool PolynomialMultiplyRecognize::isPromotableTo(Value *Val,
case Instruction::Xor:
case Instruction::LShr: // Shift right is ok.
case Instruction::Select:
+ case Instruction::Trunc:
return true;
case Instruction::ICmp:
if (CmpInst *CI = cast<CmpInst>(In))
@@ -998,6 +1000,8 @@ bool PolynomialMultiplyRecognize::isPromotableTo(Value *Val,
void PolynomialMultiplyRecognize::promoteTo(Instruction *In,
IntegerType *DestTy, BasicBlock *LoopB) {
+ Type *OrigTy = In->getType();
+
// Leave boolean values alone.
if (!In->getType()->isIntegerTy(1))
In->mutateType(DestTy);
@@ -1028,6 +1032,14 @@ void PolynomialMultiplyRecognize::promoteTo(Instruction *In,
Z->eraseFromParent();
return;
}
+ if (TruncInst *T = dyn_cast<TruncInst>(In)) {
+ IntegerType *TruncTy = cast<IntegerType>(OrigTy);
+ Value *Mask = ConstantInt::get(DestTy, (1u << TruncTy->getBitWidth()) - 1);
+ Value *And = IRBuilder<>(In).CreateAnd(T->getOperand(0), Mask);
+ T->replaceAllUsesWith(And);
+ T->eraseFromParent();
+ return;
+ }
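
The new TruncInst case replaces a truncation with an and-mask once the whole expression has been promoted to the destination type: truncating to an N-bit value and then continuing the computation in the wider type is the same as masking with (1 << N) - 1. A small standalone check of that equivalence for N = 8 promoted into 32 bits (plain C++, independent of the LLVM classes used above):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X = 0; X < (1u << 16); ++X) {
    uint32_t ViaTrunc = static_cast<uint8_t>(X); // trunc to i8, then widen again
    uint32_t ViaMask  = X & ((1u << 8) - 1);     // and-mask form used after promotion
    assert(ViaTrunc == ViaMask);
  }
  return 0;
}
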
// Promote immediates.
for (unsigned i = 0, n = In->getNumOperands(); i != n; ++i) {
@@ -1050,14 +1062,11 @@ bool PolynomialMultiplyRecognize::promoteTypes(BasicBlock *LoopB,
// Check if the exit values have types that are no wider than the type
// that we want to promote to.
unsigned DestBW = DestTy->getBitWidth();
- for (Instruction &In : *ExitB) {
- PHINode *P = dyn_cast<PHINode>(&In);
- if (!P)
- break;
- if (P->getNumIncomingValues() != 1)
+ for (PHINode &P : ExitB->phis()) {
+ if (P.getNumIncomingValues() != 1)
return false;
- assert(P->getIncomingBlock(0) == LoopB);
- IntegerType *T = dyn_cast<IntegerType>(P->getType());
+ assert(P.getIncomingBlock(0) == LoopB);
+ IntegerType *T = dyn_cast<IntegerType>(P.getType());
if (!T || T->getBitWidth() > DestBW)
return false;
}
@@ -1572,8 +1581,8 @@ static bool hasZeroSignBit(const Value *V) {
return false;
}
-void PolynomialMultiplyRecognize::setupSimplifier() {
- Simp.addRule("sink-zext",
+void PolynomialMultiplyRecognize::setupPreSimplifier(Simplifier &S) {
+ S.addRule("sink-zext",
// Sink zext past bitwise operations.
[](Instruction *I, LLVMContext &Ctx) -> Value* {
if (I->getOpcode() != Instruction::ZExt)
@@ -1594,7 +1603,7 @@ void PolynomialMultiplyRecognize::setupSimplifier() {
B.CreateZExt(T->getOperand(0), I->getType()),
B.CreateZExt(T->getOperand(1), I->getType()));
});
- Simp.addRule("xor/and -> and/xor",
+ S.addRule("xor/and -> and/xor",
// (xor (and x a) (and y a)) -> (and (xor x y) a)
[](Instruction *I, LLVMContext &Ctx) -> Value* {
if (I->getOpcode() != Instruction::Xor)
@@ -1612,7 +1621,7 @@ void PolynomialMultiplyRecognize::setupSimplifier() {
return B.CreateAnd(B.CreateXor(And0->getOperand(0), And1->getOperand(0)),
And0->getOperand(1));
});
- Simp.addRule("sink binop into select",
+ S.addRule("sink binop into select",
// (Op (select c x y) z) -> (select c (Op x z) (Op y z))
// (Op x (select c y z)) -> (select c (Op x y) (Op x z))
[](Instruction *I, LLVMContext &Ctx) -> Value* {
@@ -1638,7 +1647,7 @@ void PolynomialMultiplyRecognize::setupSimplifier() {
}
return nullptr;
});
- Simp.addRule("fold select-select",
+ S.addRule("fold select-select",
// (select c (select c x y) z) -> (select c x z)
// (select c x (select c y z)) -> (select c x z)
[](Instruction *I, LLVMContext &Ctx) -> Value* {
@@ -1657,7 +1666,7 @@ void PolynomialMultiplyRecognize::setupSimplifier() {
}
return nullptr;
});
- Simp.addRule("or-signbit -> xor-signbit",
+ S.addRule("or-signbit -> xor-signbit",
// (or (lshr x 1) 0x800.0) -> (xor (lshr x 1) 0x800.0)
[](Instruction *I, LLVMContext &Ctx) -> Value* {
if (I->getOpcode() != Instruction::Or)
@@ -1669,7 +1678,7 @@ void PolynomialMultiplyRecognize::setupSimplifier() {
return nullptr;
return IRBuilder<>(Ctx).CreateXor(I->getOperand(0), Msb);
});
- Simp.addRule("sink lshr into binop",
+ S.addRule("sink lshr into binop",
// (lshr (BitOp x y) c) -> (BitOp (lshr x c) (lshr y c))
[](Instruction *I, LLVMContext &Ctx) -> Value* {
if (I->getOpcode() != Instruction::LShr)
@@ -1691,7 +1700,7 @@ void PolynomialMultiplyRecognize::setupSimplifier() {
B.CreateLShr(BitOp->getOperand(0), S),
B.CreateLShr(BitOp->getOperand(1), S));
});
- Simp.addRule("expose bitop-const",
+ S.addRule("expose bitop-const",
// (BitOp1 (BitOp2 x a) b) -> (BitOp2 x (BitOp1 a b))
[](Instruction *I, LLVMContext &Ctx) -> Value* {
auto IsBitOp = [](unsigned Op) -> bool {
@@ -1720,16 +1729,44 @@ void PolynomialMultiplyRecognize::setupSimplifier() {
});
}
+void PolynomialMultiplyRecognize::setupPostSimplifier(Simplifier &S) {
+ S.addRule("(and (xor (and x a) y) b) -> (and (xor x y) b), if b == b&a",
+ [](Instruction *I, LLVMContext &Ctx) -> Value* {
+ if (I->getOpcode() != Instruction::And)
+ return nullptr;
+ Instruction *Xor = dyn_cast<Instruction>(I->getOperand(0));
+ ConstantInt *C0 = dyn_cast<ConstantInt>(I->getOperand(1));
+ if (!Xor || !C0)
+ return nullptr;
+ if (Xor->getOpcode() != Instruction::Xor)
+ return nullptr;
+ Instruction *And0 = dyn_cast<Instruction>(Xor->getOperand(0));
+ Instruction *And1 = dyn_cast<Instruction>(Xor->getOperand(1));
+ // Pick the first non-null and.
+ if (!And0 || And0->getOpcode() != Instruction::And)
+ std::swap(And0, And1);
+ ConstantInt *C1 = dyn_cast<ConstantInt>(And0->getOperand(1));
+ if (!C1)
+ return nullptr;
+ uint32_t V0 = C0->getZExtValue();
+ uint32_t V1 = C1->getZExtValue();
+ if (V0 != (V0 & V1))
+ return nullptr;
+ IRBuilder<> B(Ctx);
+ return B.CreateAnd(B.CreateXor(And0->getOperand(0), And1), C0);
+ });
+}
+
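
The single post-promotion rule above rewrites (and (xor (and x a) y) b) into (and (xor x y) b) whenever b == (b & a), i.e. every bit kept by the outer mask b is also kept by the inner mask a, so the inner and is redundant under the outer one. A standalone check of that bitwise identity with made-up constants (plain C++, not tied to the Simplifier machinery):

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t A = 0xFFu; // hypothetical inner mask 'a'
  const uint32_t B = 0x0Fu; // hypothetical outer mask 'b'; note B == (B & A)
  for (uint32_t X = 0; X < 256; ++X)
    for (uint32_t Y = 0; Y < 256; ++Y)
      assert((((X & A) ^ Y) & B) == ((X ^ Y) & B));
  return 0;
}
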
bool PolynomialMultiplyRecognize::recognize() {
- DEBUG(dbgs() << "Starting PolynomialMultiplyRecognize on loop\n"
- << *CurLoop << '\n');
+ LLVM_DEBUG(dbgs() << "Starting PolynomialMultiplyRecognize on loop\n"
+ << *CurLoop << '\n');
// Restrictions:
// - The loop must consist of a single block.
// - The iteration count must be known at compile-time.
// - The loop must have an induction variable starting from 0, and
// incremented in each iteration of the loop.
BasicBlock *LoopB = CurLoop->getHeader();
- DEBUG(dbgs() << "Loop header:\n" << *LoopB);
+ LLVM_DEBUG(dbgs() << "Loop header:\n" << *LoopB);
if (LoopB != CurLoop->getLoopLatch())
return false;
@@ -1749,10 +1786,12 @@ bool PolynomialMultiplyRecognize::recognize() {
Value *CIV = getCountIV(LoopB);
ParsedValues PV;
+ Simplifier PreSimp;
PV.IterCount = IterCount;
- DEBUG(dbgs() << "Loop IV: " << *CIV << "\nIterCount: " << IterCount << '\n');
+ LLVM_DEBUG(dbgs() << "Loop IV: " << *CIV << "\nIterCount: " << IterCount
+ << '\n');
- setupSimplifier();
+ setupPreSimplifier(PreSimp);
// Perform a preliminary scan of select instructions to see if any of them
// looks like a generator of the polynomial multiply steps. Assume that a
@@ -1775,9 +1814,9 @@ bool PolynomialMultiplyRecognize::recognize() {
continue;
Simplifier::Context C(SI);
- Value *T = Simp.simplify(C);
+ Value *T = PreSimp.simplify(C);
SelectInst *SelI = (T && isa<SelectInst>(T)) ? cast<SelectInst>(T) : SI;
- DEBUG(dbgs() << "scanSelect(pre-scan): " << PE(C, SelI) << '\n');
+ LLVM_DEBUG(dbgs() << "scanSelect(pre-scan): " << PE(C, SelI) << '\n');
if (scanSelect(SelI, LoopB, EntryB, CIV, PV, true)) {
FoundPreScan = true;
if (SelI != SI) {
@@ -1790,7 +1829,7 @@ bool PolynomialMultiplyRecognize::recognize() {
}
if (!FoundPreScan) {
- DEBUG(dbgs() << "Have not found candidates for pmpy\n");
+ LLVM_DEBUG(dbgs() << "Have not found candidates for pmpy\n");
return false;
}
@@ -1801,6 +1840,24 @@ bool PolynomialMultiplyRecognize::recognize() {
// wide as the target's pmpy instruction.
if (!promoteTypes(LoopB, ExitB))
return false;
+ // Run post-promotion simplifications.
+ Simplifier PostSimp;
+ setupPostSimplifier(PostSimp);
+ for (Instruction &In : *LoopB) {
+ SelectInst *SI = dyn_cast<SelectInst>(&In);
+ if (!SI || !FeedsPHI(SI))
+ continue;
+ Simplifier::Context C(SI);
+ Value *T = PostSimp.simplify(C);
+ SelectInst *SelI = dyn_cast_or_null<SelectInst>(T);
+ if (SelI != SI) {
+ Value *NewSel = C.materialize(LoopB, SI->getIterator());
+ SI->replaceAllUsesWith(NewSel);
+ RecursivelyDeleteTriviallyDeadInstructions(SI, &TLI);
+ }
+ break;
+ }
+
if (!convertShiftsToLeft(LoopB, ExitB, IterCount))
return false;
cleanupLoopBody(LoopB);
@@ -1812,14 +1869,14 @@ bool PolynomialMultiplyRecognize::recognize() {
SelectInst *SelI = dyn_cast<SelectInst>(&In);
if (!SelI)
continue;
- DEBUG(dbgs() << "scanSelect: " << *SelI << '\n');
+ LLVM_DEBUG(dbgs() << "scanSelect: " << *SelI << '\n');
FoundScan = scanSelect(SelI, LoopB, EntryB, CIV, PV, false);
if (FoundScan)
break;
}
assert(FoundScan);
- DEBUG({
+ LLVM_DEBUG({
StringRef PP = (PV.M ? "(P+M)" : "P");
if (!PV.Inv)
dbgs() << "Found pmpy idiom: R = " << PP << ".Q\n";
@@ -1913,7 +1970,7 @@ mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
// Get the location that may be stored across the loop. Since the access
// is strided positively through memory, we say that the modified location
// starts at the pointer and has infinite size.
- uint64_t AccessSize = MemoryLocation::UnknownSize;
+ LocationSize AccessSize = MemoryLocation::UnknownSize;
// If the loop iterates a fixed number of times, we can refine the access
// size to be exactly the size of the memset, which is (BECount+1)*StoreSize
@@ -2083,7 +2140,6 @@ CleanupAndExit:
// pointer size if it isn't already.
LLVMContext &Ctx = SI->getContext();
BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy);
- unsigned Alignment = std::min(SI->getAlignment(), LI->getAlignment());
DebugLoc DLoc = SI->getDebugLoc();
const SCEV *NumBytesS =
@@ -2217,12 +2273,14 @@ CleanupAndExit:
: CondBuilder.CreateBitCast(LoadBasePtr, Int32PtrTy);
NewCall = CondBuilder.CreateCall(Fn, {Op0, Op1, NumWords});
} else {
- NewCall = CondBuilder.CreateMemMove(StoreBasePtr, LoadBasePtr,
- NumBytes, Alignment);
+ NewCall = CondBuilder.CreateMemMove(StoreBasePtr, SI->getAlignment(),
+ LoadBasePtr, LI->getAlignment(),
+ NumBytes);
}
} else {
- NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr,
- NumBytes, Alignment);
+ NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlignment(),
+ LoadBasePtr, LI->getAlignment(),
+ NumBytes);
// Okay, the memcpy has been formed. Zap the original store and
// anything that feeds into it.
RecursivelyDeleteTriviallyDeadInstructions(SI, TLI);
@@ -2230,15 +2288,16 @@ CleanupAndExit:
NewCall->setDebugLoc(DLoc);
- DEBUG(dbgs() << " Formed " << (Overlap ? "memmove: " : "memcpy: ")
- << *NewCall << "\n"
- << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
- << " from store ptr=" << *StoreEv << " at: " << *SI << "\n");
+ LLVM_DEBUG(dbgs() << " Formed " << (Overlap ? "memmove: " : "memcpy: ")
+ << *NewCall << "\n"
+ << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
+ << " from store ptr=" << *StoreEv << " at: " << *SI
+ << "\n");
return true;
}
-// \brief Check if the instructions in Insts, together with their dependencies
+// Check if the instructions in Insts, together with their dependencies
// cover the loop in the sense that the loop could be safely eliminated once
// the instructions in Insts are removed.
bool HexagonLoopIdiomRecognize::coverLoop(Loop *L,
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
index b1c549aa13fa..74c550ce8226 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.cpp
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -21,6 +21,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
@@ -47,26 +48,46 @@ using namespace llvm;
static cl::opt<bool> IgnoreBBRegPressure("ignore-bb-reg-pressure",
cl::Hidden, cl::ZeroOrMore, cl::init(false));
+static cl::opt<bool> UseNewerCandidate("use-newer-candidate",
+ cl::Hidden, cl::ZeroOrMore, cl::init(true));
+
static cl::opt<unsigned> SchedDebugVerboseLevel("misched-verbose-level",
cl::Hidden, cl::ZeroOrMore, cl::init(1));
-static cl::opt<bool> TopUseShorterTie("top-use-shorter-tie",
- cl::Hidden, cl::ZeroOrMore, cl::init(false));
-
-static cl::opt<bool> BotUseShorterTie("bot-use-shorter-tie",
- cl::Hidden, cl::ZeroOrMore, cl::init(false));
-
-static cl::opt<bool> DisableTCTie("disable-tc-tie",
- cl::Hidden, cl::ZeroOrMore, cl::init(false));
-
// Check if the scheduler should penalize instructions that are available to
// early due to a zero-latency dependence.
static cl::opt<bool> CheckEarlyAvail("check-early-avail", cl::Hidden,
cl::ZeroOrMore, cl::init(true));
-/// Save the last formed packet
-void VLIWResourceModel::savePacket() {
- OldPacket = Packet;
+// This value is used to determine if a register class is a high pressure set.
+// We compute the maximum number of registers needed and divide it by the total
+// number available. Then, we compare the result to this value.
+static cl::opt<float> RPThreshold("hexagon-reg-pressure", cl::Hidden,
+    cl::init(0.75f), cl::desc("High register pressure threshold."));
+
+/// Return true if there is a dependence between SUd and SUu.
+static bool hasDependence(const SUnit *SUd, const SUnit *SUu,
+ const HexagonInstrInfo &QII) {
+ if (SUd->Succs.size() == 0)
+ return false;
+
+ // Enable .cur formation.
+ if (QII.mayBeCurLoad(*SUd->getInstr()))
+ return false;
+
+ if (QII.canExecuteInBundle(*SUd->getInstr(), *SUu->getInstr()))
+ return false;
+
+ for (const auto &S : SUd->Succs) {
+ // Since we do not add pseudos to packets, might as well
+ // ignore order dependencies.
+ if (S.isCtrl())
+ continue;
+
+ if (S.getSUnit() == SUu && S.getLatency() > 0)
+ return true;
+ }
+ return false;
}
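
hasDependence above decides whether SUd must complete before SUu may issue in the same packet: order (control) edges are ignored, loads that may become .cur and pairs that can already execute in one bundle never conflict, and only a successor edge from SUd to SUu with nonzero latency counts. A standalone model of that decision with a toy Edge record in place of SUnit/SDep (nothing here is LLVM API):

#include <cassert>
#include <vector>

struct Edge {
  int Succ;         // id of the dependent unit
  bool IsCtrl;      // order dependence only
  unsigned Latency; // edge latency in cycles
};

static bool hasDependence(const std::vector<Edge> &SuccsOfSUd, int SUu,
                          bool SUdMayBeCurLoad, bool CanBundleTogether) {
  if (SuccsOfSUd.empty() || SUdMayBeCurLoad || CanBundleTogether)
    return false;
  for (const Edge &E : SuccsOfSUd)
    if (!E.IsCtrl && E.Succ == SUu && E.Latency > 0)
      return true;
  return false;
}

int main() {
  std::vector<Edge> Succs = {{7, true, 1}, {7, false, 2}};
  assert(hasDependence(Succs, 7, false, false));  // real data edge with latency
  assert(!hasDependence(Succs, 7, true, false));  // .cur-able load: no conflict
  assert(!hasDependence(Succs, 9, false, false)); // edge targets a different unit
  return 0;
}
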
/// Check if scheduling of this SU is possible
@@ -74,7 +95,7 @@ void VLIWResourceModel::savePacket() {
/// It is _not_ precise (statefull), it is more like
/// another heuristic. Many corner cases are figured
/// empirically.
-bool VLIWResourceModel::isResourceAvailable(SUnit *SU) {
+bool VLIWResourceModel::isResourceAvailable(SUnit *SU, bool IsTop) {
if (!SU || !SU->getInstr())
return false;
@@ -94,49 +115,39 @@ bool VLIWResourceModel::isResourceAvailable(SUnit *SU) {
break;
}
- MachineFunction &MF = *SU->getInstr()->getParent()->getParent();
- auto &QII = *MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
+ MachineBasicBlock *MBB = SU->getInstr()->getParent();
+ auto &QST = MBB->getParent()->getSubtarget<HexagonSubtarget>();
+ const auto &QII = *QST.getInstrInfo();
// Now see if there are no other dependencies to instructions already
// in the packet.
- for (unsigned i = 0, e = Packet.size(); i != e; ++i) {
- if (Packet[i]->Succs.size() == 0)
- continue;
-
- // Enable .cur formation.
- if (QII.mayBeCurLoad(*Packet[i]->getInstr()))
- continue;
-
- for (SUnit::const_succ_iterator I = Packet[i]->Succs.begin(),
- E = Packet[i]->Succs.end(); I != E; ++I) {
- // Since we do not add pseudos to packets, might as well
- // ignore order dependencies.
- if (I->isCtrl())
- continue;
-
- if (I->getSUnit() == SU)
+ if (IsTop) {
+ for (unsigned i = 0, e = Packet.size(); i != e; ++i)
+ if (hasDependence(Packet[i], SU, QII))
+ return false;
+ } else {
+ for (unsigned i = 0, e = Packet.size(); i != e; ++i)
+ if (hasDependence(SU, Packet[i], QII))
return false;
- }
}
return true;
}
/// Keep track of available resources.
-bool VLIWResourceModel::reserveResources(SUnit *SU) {
+bool VLIWResourceModel::reserveResources(SUnit *SU, bool IsTop) {
bool startNewCycle = false;
// Artificially reset state.
if (!SU) {
ResourcesModel->clearResources();
- savePacket();
Packet.clear();
TotalPackets++;
return false;
}
- // If this SU does not fit in the packet
+ // If this SU does not fit in the packet or the packet is now full
// start a new one.
- if (!isResourceAvailable(SU)) {
+ if (!isResourceAvailable(SU, IsTop) ||
+ Packet.size() >= SchedModel->getIssueWidth()) {
ResourcesModel->clearResources();
- savePacket();
Packet.clear();
TotalPackets++;
startNewCycle = true;
@@ -161,24 +172,14 @@ bool VLIWResourceModel::reserveResources(SUnit *SU) {
Packet.push_back(SU);
#ifndef NDEBUG
- DEBUG(dbgs() << "Packet[" << TotalPackets << "]:\n");
+ LLVM_DEBUG(dbgs() << "Packet[" << TotalPackets << "]:\n");
for (unsigned i = 0, e = Packet.size(); i != e; ++i) {
- DEBUG(dbgs() << "\t[" << i << "] SU(");
- DEBUG(dbgs() << Packet[i]->NodeNum << ")\t");
- DEBUG(Packet[i]->getInstr()->dump());
+ LLVM_DEBUG(dbgs() << "\t[" << i << "] SU(");
+ LLVM_DEBUG(dbgs() << Packet[i]->NodeNum << ")\t");
+ LLVM_DEBUG(Packet[i]->getInstr()->dump());
}
#endif
- // If packet is now full, reset the state so in the next cycle
- // we start fresh.
- if (Packet.size() >= SchedModel->getIssueWidth()) {
- ResourcesModel->clearResources();
- savePacket();
- Packet.clear();
- TotalPackets++;
- startNewCycle = true;
- }
-
return startNewCycle;
}
@@ -186,37 +187,43 @@ bool VLIWResourceModel::reserveResources(SUnit *SU) {
/// after setting up the current scheduling region. [RegionBegin, RegionEnd)
/// only includes instructions that have DAG nodes, not scheduling boundaries.
void VLIWMachineScheduler::schedule() {
- DEBUG(dbgs() << "********** MI Converging Scheduling VLIW "
- << printMBBReference(*BB) << " " << BB->getName() << " in_func "
- << BB->getParent()->getName() << " at loop depth "
- << MLI->getLoopDepth(BB) << " \n");
+ LLVM_DEBUG(dbgs() << "********** MI Converging Scheduling VLIW "
+ << printMBBReference(*BB) << " " << BB->getName()
+ << " in_func " << BB->getParent()->getName()
+ << " at loop depth " << MLI->getLoopDepth(BB) << " \n");
buildDAGWithRegPressure();
+ Topo.InitDAGTopologicalSorting();
+
+ // Postprocess the DAG to add platform-specific artificial dependencies.
+ postprocessDAG();
+
SmallVector<SUnit*, 8> TopRoots, BotRoots;
findRootsAndBiasEdges(TopRoots, BotRoots);
// Initialize the strategy before modifying the DAG.
SchedImpl->initialize(this);
- DEBUG(unsigned maxH = 0;
- for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
- if (SUnits[su].getHeight() > maxH)
- maxH = SUnits[su].getHeight();
- dbgs() << "Max Height " << maxH << "\n";);
- DEBUG(unsigned maxD = 0;
- for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
- if (SUnits[su].getDepth() > maxD)
- maxD = SUnits[su].getDepth();
- dbgs() << "Max Depth " << maxD << "\n";);
- DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
- SUnits[su].dumpAll(this));
+ LLVM_DEBUG(unsigned maxH = 0;
+ for (unsigned su = 0, e = SUnits.size(); su != e;
+ ++su) if (SUnits[su].getHeight() > maxH) maxH =
+ SUnits[su].getHeight();
+ dbgs() << "Max Height " << maxH << "\n";);
+ LLVM_DEBUG(unsigned maxD = 0;
+ for (unsigned su = 0, e = SUnits.size(); su != e;
+ ++su) if (SUnits[su].getDepth() > maxD) maxD =
+ SUnits[su].getDepth();
+ dbgs() << "Max Depth " << maxD << "\n";);
+ LLVM_DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su) SUnits[su]
+ .dumpAll(this));
initQueues(TopRoots, BotRoots);
bool IsTopNode = false;
while (true) {
- DEBUG(dbgs() << "** VLIWMachineScheduler::schedule picking next node\n");
+ LLVM_DEBUG(
+ dbgs() << "** VLIWMachineScheduler::schedule picking next node\n");
SUnit *SU = SchedImpl->pickNode(IsTopNode);
if (!SU) break;
@@ -225,16 +232,16 @@ void VLIWMachineScheduler::schedule() {
scheduleMI(SU, IsTopNode);
- updateQueues(SU, IsTopNode);
-
// Notify the scheduling strategy after updating the DAG.
SchedImpl->schedNode(SU, IsTopNode);
+
+ updateQueues(SU, IsTopNode);
}
assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
placeDebugValues();
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "*** Final schedule for "
<< printMBBReference(*begin()->getParent()) << " ***\n";
dumpSchedule();
@@ -264,6 +271,15 @@ void ConvergingVLIWScheduler::initialize(ScheduleDAGMI *dag) {
Top.ResourceModel = new VLIWResourceModel(STI, DAG->getSchedModel());
Bot.ResourceModel = new VLIWResourceModel(STI, DAG->getSchedModel());
+ const std::vector<unsigned> &MaxPressure =
+ DAG->getRegPressure().MaxSetPressure;
+ HighPressureSets.assign(MaxPressure.size(), 0);
+ for (unsigned i = 0, e = MaxPressure.size(); i < e; ++i) {
+ unsigned Limit = DAG->getRegClassInfo()->getRegPressureSetLimit(i);
+ HighPressureSets[i] =
+ ((float) MaxPressure[i] > ((float) Limit * RPThreshold));
+ }
+
assert((!ForceTopDown || !ForceBottomUp) &&
"-misched-topdown incompatible with -misched-bottomup");
}
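
The loop added to initialize() marks a register pressure set as high pressure when the maximum pressure recorded for the region exceeds RPThreshold (0.75 by default) of that set's limit. A worked example with made-up numbers, since the real limits come from RegisterClassInfo::getRegPressureSetLimit:

#include <cassert>
#include <vector>

int main() {
  const float RPThreshold = 0.75f;               // mirrors the default above
  std::vector<unsigned> MaxPressure = {30, 10};  // hypothetical per-set maxima
  std::vector<unsigned> Limit = {32, 32};        // hypothetical per-set limits
  std::vector<bool> HighPressureSets(MaxPressure.size());
  for (size_t i = 0; i != MaxPressure.size(); ++i)
    HighPressureSets[i] = (float)MaxPressure[i] > (float)Limit[i] * RPThreshold;
  assert(HighPressureSets[0]);   // 30 > 32 * 0.75 = 24
  assert(!HighPressureSets[1]);  // 10 <= 24
  return 0;
}
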
@@ -364,8 +380,8 @@ void ConvergingVLIWScheduler::VLIWSchedBoundary::bumpCycle() {
}
CheckPending = true;
- DEBUG(dbgs() << "*** Next cycle " << Available.getName() << " cycle "
- << CurrCycle << '\n');
+ LLVM_DEBUG(dbgs() << "*** Next cycle " << Available.getName() << " cycle "
+ << CurrCycle << '\n');
}
/// Move the boundary of scheduled code by one SUnit.
@@ -383,18 +399,18 @@ void ConvergingVLIWScheduler::VLIWSchedBoundary::bumpNode(SUnit *SU) {
}
// Update DFA model.
- startNewCycle = ResourceModel->reserveResources(SU);
+ startNewCycle = ResourceModel->reserveResources(SU, isTop());
// Check the instruction group dispatch limit.
// TODO: Check if this SU must end a dispatch group.
IssueCount += SchedModel->getNumMicroOps(SU->getInstr());
if (startNewCycle) {
- DEBUG(dbgs() << "*** Max instrs at cycle " << CurrCycle << '\n');
+ LLVM_DEBUG(dbgs() << "*** Max instrs at cycle " << CurrCycle << '\n');
bumpCycle();
}
else
- DEBUG(dbgs() << "*** IssueCount " << IssueCount
- << " at cycle " << CurrCycle << '\n');
+ LLVM_DEBUG(dbgs() << "*** IssueCount " << IssueCount << " at cycle "
+ << CurrCycle << '\n');
}
/// Release pending ready nodes in to the available queue. This makes them
@@ -443,10 +459,18 @@ SUnit *ConvergingVLIWScheduler::VLIWSchedBoundary::pickOnlyChoice() {
if (CheckPending)
releasePending();
- for (unsigned i = 0; Available.empty(); ++i) {
+ auto AdvanceCycle = [this]() {
+ if (Available.empty())
+ return true;
+ if (Available.size() == 1 && Pending.size() > 0)
+ return !ResourceModel->isResourceAvailable(*Available.begin(), isTop()) ||
+ getWeakLeft(*Available.begin(), isTop()) != 0;
+ return false;
+ };
+ for (unsigned i = 0; AdvanceCycle(); ++i) {
assert(i <= (HazardRec->getMaxLookAhead() + MaxMinLatency) &&
"permanent hazard"); (void)i;
- ResourceModel->reserveResources(nullptr);
+ ResourceModel->reserveResources(nullptr, isTop());
bumpCycle();
releasePending();
}
@@ -520,13 +544,31 @@ static inline bool isSingleUnscheduledSucc(SUnit *SU, SUnit *SU2) {
return true;
}
+/// Check if the instruction changes the register pressure of a register in a
+/// high pressure set. The function returns a negative value if the pressure
+/// decreases and a positive value if the pressure increases. If the instruction
+/// doesn't use a high pressure register or doesn't change the register
+/// pressure, then return 0.
+int ConvergingVLIWScheduler::pressureChange(const SUnit *SU, bool isBotUp) {
+ PressureDiff &PD = DAG->getPressureDiff(SU);
+ for (auto &P : PD) {
+ if (!P.isValid())
+ continue;
+    // The pressure differences are computed bottom-up, so the comparison for
+ // an increase is positive in the bottom direction, but negative in the
+ // top-down direction.
+ if (HighPressureSets[P.getPSet()])
+ return (isBotUp ? P.getUnitInc() : -P.getUnitInc());
+ }
+ return 0;
+}
+
// Constants used to denote relative importance of
// heuristic components for cost computation.
static const unsigned PriorityOne = 200;
static const unsigned PriorityTwo = 50;
static const unsigned PriorityThree = 75;
static const unsigned ScaleTwo = 10;
-static const unsigned FactorOne = 2;
/// Single point to compute overall scheduling cost.
/// TODO: More heuristics will be used soon.
@@ -541,20 +583,23 @@ int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
if (!SU || SU->isScheduled)
return ResCount;
- MachineInstr &Instr = *SU->getInstr();
-
- DEBUG(if (verbose) dbgs() << ((Q.getID() == TopQID) ? "(top|" : "(bot|"));
+ LLVM_DEBUG(if (verbose) dbgs()
+ << ((Q.getID() == TopQID) ? "(top|" : "(bot|"));
// Forced priority is high.
if (SU->isScheduleHigh) {
ResCount += PriorityOne;
- DEBUG(dbgs() << "H|");
+ LLVM_DEBUG(dbgs() << "H|");
}
+ unsigned IsAvailableAmt = 0;
// Critical path first.
if (Q.getID() == TopQID) {
- ResCount += (SU->getHeight() * ScaleTwo);
+ if (Top.isLatencyBound(SU)) {
+ LLVM_DEBUG(if (verbose) dbgs() << "LB|");
+ ResCount += (SU->getHeight() * ScaleTwo);
+ }
- DEBUG(if (verbose) {
+ LLVM_DEBUG(if (verbose) {
std::stringstream dbgstr;
dbgstr << "h" << std::setw(3) << SU->getHeight() << "|";
dbgs() << dbgstr.str();
@@ -562,16 +607,19 @@ int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
// If resources are available for it, multiply the
// chance of scheduling.
- if (Top.ResourceModel->isResourceAvailable(SU)) {
- ResCount <<= FactorOne;
- ResCount += PriorityThree;
- DEBUG(if (verbose) dbgs() << "A|");
+ if (Top.ResourceModel->isResourceAvailable(SU, true)) {
+ IsAvailableAmt = (PriorityTwo + PriorityThree);
+ ResCount += IsAvailableAmt;
+ LLVM_DEBUG(if (verbose) dbgs() << "A|");
} else
- DEBUG(if (verbose) dbgs() << " |");
+ LLVM_DEBUG(if (verbose) dbgs() << " |");
} else {
- ResCount += (SU->getDepth() * ScaleTwo);
+ if (Bot.isLatencyBound(SU)) {
+ LLVM_DEBUG(if (verbose) dbgs() << "LB|");
+ ResCount += (SU->getDepth() * ScaleTwo);
+ }
- DEBUG(if (verbose) {
+ LLVM_DEBUG(if (verbose) {
std::stringstream dbgstr;
dbgstr << "d" << std::setw(3) << SU->getDepth() << "|";
dbgs() << dbgstr.str();
@@ -579,12 +627,12 @@ int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
// If resources are available for it, multiply the
// chance of scheduling.
- if (Bot.ResourceModel->isResourceAvailable(SU)) {
- ResCount <<= FactorOne;
- ResCount += PriorityThree;
- DEBUG(if (verbose) dbgs() << "A|");
+ if (Bot.ResourceModel->isResourceAvailable(SU, false)) {
+ IsAvailableAmt = (PriorityTwo + PriorityThree);
+ ResCount += IsAvailableAmt;
+ LLVM_DEBUG(if (verbose) dbgs() << "A|");
} else
- DEBUG(if (verbose) dbgs() << " |");
+ LLVM_DEBUG(if (verbose) dbgs() << " |");
}
unsigned NumNodesBlocking = 0;
@@ -593,18 +641,20 @@ int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
// Look at all of the successors of this node.
// Count the number of nodes that
// this node is the sole unscheduled node for.
- for (const SDep &SI : SU->Succs)
- if (isSingleUnscheduledPred(SI.getSUnit(), SU))
- ++NumNodesBlocking;
+ if (Top.isLatencyBound(SU))
+ for (const SDep &SI : SU->Succs)
+ if (isSingleUnscheduledPred(SI.getSUnit(), SU))
+ ++NumNodesBlocking;
} else {
// How many unscheduled predecessors block this node?
- for (const SDep &PI : SU->Preds)
- if (isSingleUnscheduledSucc(PI.getSUnit(), SU))
- ++NumNodesBlocking;
+ if (Bot.isLatencyBound(SU))
+ for (const SDep &PI : SU->Preds)
+ if (isSingleUnscheduledSucc(PI.getSUnit(), SU))
+ ++NumNodesBlocking;
}
ResCount += (NumNodesBlocking * ScaleTwo);
- DEBUG(if (verbose) {
+ LLVM_DEBUG(if (verbose) {
std::stringstream dbgstr;
dbgstr << "blk " << std::setw(2) << NumNodesBlocking << ")|";
dbgs() << dbgstr.str();
@@ -619,10 +669,17 @@ int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
// Decrease priority slightly if register pressure would increase over the
// current maximum.
ResCount -= (Delta.CurrentMax.getUnitInc()*PriorityTwo);
- DEBUG(if (verbose) {
- dbgs() << "RP " << Delta.Excess.getUnitInc() << "/"
- << Delta.CriticalMax.getUnitInc() <<"/"
- << Delta.CurrentMax.getUnitInc() << ")|";
+ // If there are register pressure issues, then we remove the value added for
+ // the instruction being available. The rationale is that we really don't
+ // want to schedule an instruction that causes a spill.
+ if (IsAvailableAmt && pressureChange(SU, Q.getID() != TopQID) > 0 &&
+ (Delta.Excess.getUnitInc() || Delta.CriticalMax.getUnitInc() ||
+ Delta.CurrentMax.getUnitInc()))
+ ResCount -= IsAvailableAmt;
+ LLVM_DEBUG(if (verbose) {
+ dbgs() << "RP " << Delta.Excess.getUnitInc() << "/"
+ << Delta.CriticalMax.getUnitInc() << "/"
+ << Delta.CurrentMax.getUnitInc() << ")|";
});
}
@@ -631,53 +688,39 @@ int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
auto &QST = DAG->MF.getSubtarget<HexagonSubtarget>();
auto &QII = *QST.getInstrInfo();
if (SU->isInstr() && QII.mayBeCurLoad(*SU->getInstr())) {
- if (Q.getID() == TopQID && Top.ResourceModel->isResourceAvailable(SU)) {
+ if (Q.getID() == TopQID &&
+ Top.ResourceModel->isResourceAvailable(SU, true)) {
ResCount += PriorityTwo;
- DEBUG(if (verbose) dbgs() << "C|");
+ LLVM_DEBUG(if (verbose) dbgs() << "C|");
} else if (Q.getID() == BotQID &&
- Bot.ResourceModel->isResourceAvailable(SU)) {
+ Bot.ResourceModel->isResourceAvailable(SU, false)) {
ResCount += PriorityTwo;
- DEBUG(if (verbose) dbgs() << "C|");
+ LLVM_DEBUG(if (verbose) dbgs() << "C|");
}
}
// Give preference to a zero latency instruction if the dependent
// instruction is in the current packet.
- if (Q.getID() == TopQID) {
+ if (Q.getID() == TopQID && getWeakLeft(SU, true) == 0) {
for (const SDep &PI : SU->Preds) {
if (!PI.getSUnit()->getInstr()->isPseudo() && PI.isAssignedRegDep() &&
PI.getLatency() == 0 &&
Top.ResourceModel->isInPacket(PI.getSUnit())) {
ResCount += PriorityThree;
- DEBUG(if (verbose) dbgs() << "Z|");
+ LLVM_DEBUG(if (verbose) dbgs() << "Z|");
}
}
- } else {
+ } else if (Q.getID() == BotQID && getWeakLeft(SU, false) == 0) {
for (const SDep &SI : SU->Succs) {
if (!SI.getSUnit()->getInstr()->isPseudo() && SI.isAssignedRegDep() &&
SI.getLatency() == 0 &&
Bot.ResourceModel->isInPacket(SI.getSUnit())) {
ResCount += PriorityThree;
- DEBUG(if (verbose) dbgs() << "Z|");
+ LLVM_DEBUG(if (verbose) dbgs() << "Z|");
}
}
}
- // Give less preference to an instruction that will cause a stall with
- // an instruction in the previous packet.
- if (QII.isHVXVec(Instr)) {
- // Check for stalls in the previous packet.
- if (Q.getID() == TopQID) {
- for (auto J : Top.ResourceModel->OldPacket)
- if (QII.producesStall(*J->getInstr(), Instr))
- ResCount -= PriorityOne;
- } else {
- for (auto J : Bot.ResourceModel->OldPacket)
- if (QII.producesStall(Instr, *J->getInstr()))
- ResCount -= PriorityOne;
- }
- }
-
// If the instruction has a non-zero latency dependence with an instruction in
// the current packet, then it should not be scheduled yet. The case occurs
// when the dependent instruction is scheduled in a new packet, so the
@@ -689,7 +732,7 @@ int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
if (PI.getLatency() > 0 &&
Top.ResourceModel->isInPacket(PI.getSUnit())) {
ResCount -= PriorityOne;
- DEBUG(if (verbose) dbgs() << "D|");
+ LLVM_DEBUG(if (verbose) dbgs() << "D|");
}
}
} else {
@@ -697,13 +740,13 @@ int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
if (SI.getLatency() > 0 &&
Bot.ResourceModel->isInPacket(SI.getSUnit())) {
ResCount -= PriorityOne;
- DEBUG(if (verbose) dbgs() << "D|");
+ LLVM_DEBUG(if (verbose) dbgs() << "D|");
}
}
}
}
- DEBUG(if (verbose) {
+ LLVM_DEBUG(if (verbose) {
std::stringstream dbgstr;
dbgstr << "Total " << std::setw(4) << ResCount << ")";
dbgs() << dbgstr.str();
@@ -718,11 +761,12 @@ int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
/// DAG building. To adjust for the current scheduling location we need to
/// maintain the number of vreg uses remaining to be top-scheduled.
ConvergingVLIWScheduler::CandResult ConvergingVLIWScheduler::
-pickNodeFromQueue(ReadyQueue &Q, const RegPressureTracker &RPTracker,
+pickNodeFromQueue(VLIWSchedBoundary &Zone, const RegPressureTracker &RPTracker,
SchedCandidate &Candidate) {
- DEBUG(if (SchedDebugVerboseLevel > 1)
- readyQueueVerboseDump(RPTracker, Candidate, Q);
- else Q.dump(););
+ ReadyQueue &Q = Zone.Available;
+ LLVM_DEBUG(if (SchedDebugVerboseLevel > 1)
+ readyQueueVerboseDump(RPTracker, Candidate, Q);
+ else Q.dump(););
// getMaxPressureDelta temporarily modifies the tracker.
RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
@@ -739,7 +783,7 @@ pickNodeFromQueue(ReadyQueue &Q, const RegPressureTracker &RPTracker,
// Initialize the candidate if needed.
if (!Candidate.SU) {
- DEBUG(traceCandidate("DCAND", Q, *I, CurrentCost));
+ LLVM_DEBUG(traceCandidate("DCAND", Q, *I, CurrentCost));
Candidate.SU = *I;
Candidate.RPDelta = RPDelta;
Candidate.SCost = CurrentCost;
@@ -747,9 +791,23 @@ pickNodeFromQueue(ReadyQueue &Q, const RegPressureTracker &RPTracker,
continue;
}
+ // Choose node order for negative cost candidates. There is no good
+ // candidate in this case.
+ if (CurrentCost < 0 && Candidate.SCost < 0) {
+ if ((Q.getID() == TopQID && (*I)->NodeNum < Candidate.SU->NodeNum)
+ || (Q.getID() == BotQID && (*I)->NodeNum > Candidate.SU->NodeNum)) {
+ LLVM_DEBUG(traceCandidate("NCAND", Q, *I, CurrentCost));
+ Candidate.SU = *I;
+ Candidate.RPDelta = RPDelta;
+ Candidate.SCost = CurrentCost;
+ FoundCandidate = NodeOrder;
+ }
+ continue;
+ }
+
// Best cost.
if (CurrentCost > Candidate.SCost) {
- DEBUG(traceCandidate("CCAND", Q, *I, CurrentCost));
+ LLVM_DEBUG(traceCandidate("CCAND", Q, *I, CurrentCost));
Candidate.SU = *I;
Candidate.RPDelta = RPDelta;
Candidate.SCost = CurrentCost;
@@ -757,65 +815,53 @@ pickNodeFromQueue(ReadyQueue &Q, const RegPressureTracker &RPTracker,
continue;
}
- // Tie breaker using Timing Class.
- if (!DisableTCTie) {
- auto &QST = DAG->MF.getSubtarget<HexagonSubtarget>();
- auto &QII = *QST.getInstrInfo();
-
- const MachineInstr *MI = (*I)->getInstr();
- const MachineInstr *CandI = Candidate.SU->getInstr();
- const InstrItineraryData *InstrItins = QST.getInstrItineraryData();
-
- unsigned InstrLatency = QII.getInstrTimingClassLatency(InstrItins, *MI);
- unsigned CandLatency = QII.getInstrTimingClassLatency(InstrItins, *CandI);
- DEBUG(dbgs() << "TC Tie Breaker Cand: "
- << CandLatency << " Instr:" << InstrLatency << "\n"
- << *MI << *CandI << "\n");
- if (Q.getID() == TopQID && CurrentCost == Candidate.SCost) {
- if (InstrLatency < CandLatency && TopUseShorterTie) {
- Candidate.SU = *I;
- Candidate.RPDelta = RPDelta;
- Candidate.SCost = CurrentCost;
- FoundCandidate = BestCost;
- DEBUG(dbgs() << "Used top shorter tie breaker\n");
- continue;
- } else if (InstrLatency > CandLatency && !TopUseShorterTie) {
- Candidate.SU = *I;
- Candidate.RPDelta = RPDelta;
- Candidate.SCost = CurrentCost;
- FoundCandidate = BestCost;
- DEBUG(dbgs() << "Used top longer tie breaker\n");
- continue;
- }
- } else if (Q.getID() == BotQID && CurrentCost == Candidate.SCost) {
- if (InstrLatency < CandLatency && BotUseShorterTie) {
- Candidate.SU = *I;
- Candidate.RPDelta = RPDelta;
- Candidate.SCost = CurrentCost;
- FoundCandidate = BestCost;
- DEBUG(dbgs() << "Used Bot shorter tie breaker\n");
- continue;
- } else if (InstrLatency > CandLatency && !BotUseShorterTie) {
- Candidate.SU = *I;
- Candidate.RPDelta = RPDelta;
- Candidate.SCost = CurrentCost;
- FoundCandidate = BestCost;
- DEBUG(dbgs() << "Used Bot longer tie breaker\n");
- continue;
- }
+ // Choose an instruction that does not depend on an artificial edge.
+ unsigned CurrWeak = getWeakLeft(*I, (Q.getID() == TopQID));
+ unsigned CandWeak = getWeakLeft(Candidate.SU, (Q.getID() == TopQID));
+ if (CurrWeak != CandWeak) {
+ if (CurrWeak < CandWeak) {
+ LLVM_DEBUG(traceCandidate("WCAND", Q, *I, CurrentCost));
+ Candidate.SU = *I;
+ Candidate.RPDelta = RPDelta;
+ Candidate.SCost = CurrentCost;
+ FoundCandidate = Weak;
}
+ continue;
}
- if (CurrentCost == Candidate.SCost) {
- if ((Q.getID() == TopQID &&
- (*I)->Succs.size() > Candidate.SU->Succs.size()) ||
- (Q.getID() == BotQID &&
- (*I)->Preds.size() < Candidate.SU->Preds.size())) {
- DEBUG(traceCandidate("SPCAND", Q, *I, CurrentCost));
+ if (CurrentCost == Candidate.SCost && Zone.isLatencyBound(*I)) {
+ unsigned CurrSize, CandSize;
+ if (Q.getID() == TopQID) {
+ CurrSize = (*I)->Succs.size();
+ CandSize = Candidate.SU->Succs.size();
+ } else {
+ CurrSize = (*I)->Preds.size();
+ CandSize = Candidate.SU->Preds.size();
+ }
+ if (CurrSize > CandSize) {
+ LLVM_DEBUG(traceCandidate("SPCAND", Q, *I, CurrentCost));
Candidate.SU = *I;
Candidate.RPDelta = RPDelta;
Candidate.SCost = CurrentCost;
FoundCandidate = BestCost;
+ }
+      // Keep the old candidate if it is the better one; that is, don't fall
+      // through to the subsequent tie breaker.
+ if (CurrSize != CandSize)
+ continue;
+ }
+
+ // Tie breaker.
+ // To avoid scheduling indeterminism, we need a tie breaker
+ // for the case when cost is identical for two nodes.
+ if (UseNewerCandidate && CurrentCost == Candidate.SCost) {
+ if ((Q.getID() == TopQID && (*I)->NodeNum < Candidate.SU->NodeNum)
+ || (Q.getID() == BotQID && (*I)->NodeNum > Candidate.SU->NodeNum)) {
+ LLVM_DEBUG(traceCandidate("TCAND", Q, *I, CurrentCost));
+ Candidate.SU = *I;
+ Candidate.RPDelta = RPDelta;
+ Candidate.SCost = CurrentCost;
+ FoundCandidate = NodeOrder;
continue;
}
}
@@ -833,18 +879,18 @@ SUnit *ConvergingVLIWScheduler::pickNodeBidrectional(bool &IsTopNode) {
// Schedule as far as possible in the direction of no choice. This is most
// efficient, but also provides the best heuristics for CriticalPSets.
if (SUnit *SU = Bot.pickOnlyChoice()) {
- DEBUG(dbgs() << "Picked only Bottom\n");
+ LLVM_DEBUG(dbgs() << "Picked only Bottom\n");
IsTopNode = false;
return SU;
}
if (SUnit *SU = Top.pickOnlyChoice()) {
- DEBUG(dbgs() << "Picked only Top\n");
+ LLVM_DEBUG(dbgs() << "Picked only Top\n");
IsTopNode = true;
return SU;
}
SchedCandidate BotCand;
// Prefer bottom scheduling when heuristics are silent.
- CandResult BotResult = pickNodeFromQueue(Bot.Available,
+ CandResult BotResult = pickNodeFromQueue(Bot,
DAG->getBotRPTracker(), BotCand);
assert(BotResult != NoCand && "failed to find the first candidate");
@@ -856,40 +902,40 @@ SUnit *ConvergingVLIWScheduler::pickNodeBidrectional(bool &IsTopNode) {
// increase pressure for one of the excess PSets, then schedule in that
// direction first to provide more freedom in the other direction.
if (BotResult == SingleExcess || BotResult == SingleCritical) {
- DEBUG(dbgs() << "Prefered Bottom Node\n");
+ LLVM_DEBUG(dbgs() << "Prefered Bottom Node\n");
IsTopNode = false;
return BotCand.SU;
}
// Check if the top Q has a better candidate.
SchedCandidate TopCand;
- CandResult TopResult = pickNodeFromQueue(Top.Available,
+ CandResult TopResult = pickNodeFromQueue(Top,
DAG->getTopRPTracker(), TopCand);
assert(TopResult != NoCand && "failed to find the first candidate");
if (TopResult == SingleExcess || TopResult == SingleCritical) {
- DEBUG(dbgs() << "Prefered Top Node\n");
+ LLVM_DEBUG(dbgs() << "Prefered Top Node\n");
IsTopNode = true;
return TopCand.SU;
}
// If either Q has a single candidate that minimizes pressure above the
// original region's pressure pick it.
if (BotResult == SingleMax) {
- DEBUG(dbgs() << "Prefered Bottom Node SingleMax\n");
+ LLVM_DEBUG(dbgs() << "Prefered Bottom Node SingleMax\n");
IsTopNode = false;
return BotCand.SU;
}
if (TopResult == SingleMax) {
- DEBUG(dbgs() << "Prefered Top Node SingleMax\n");
+ LLVM_DEBUG(dbgs() << "Prefered Top Node SingleMax\n");
IsTopNode = true;
return TopCand.SU;
}
if (TopCand.SCost > BotCand.SCost) {
- DEBUG(dbgs() << "Prefered Top Node Cost\n");
+ LLVM_DEBUG(dbgs() << "Prefered Top Node Cost\n");
IsTopNode = true;
return TopCand.SU;
}
// Otherwise prefer the bottom candidate in node order.
- DEBUG(dbgs() << "Prefered Bottom in Node order\n");
+ LLVM_DEBUG(dbgs() << "Prefered Bottom in Node order\n");
IsTopNode = false;
return BotCand.SU;
}
@@ -907,7 +953,7 @@ SUnit *ConvergingVLIWScheduler::pickNode(bool &IsTopNode) {
if (!SU) {
SchedCandidate TopCand;
CandResult TopResult =
- pickNodeFromQueue(Top.Available, DAG->getTopRPTracker(), TopCand);
+ pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand);
assert(TopResult != NoCand && "failed to find the first candidate");
(void)TopResult;
SU = TopCand.SU;
@@ -918,7 +964,7 @@ SUnit *ConvergingVLIWScheduler::pickNode(bool &IsTopNode) {
if (!SU) {
SchedCandidate BotCand;
CandResult BotResult =
- pickNodeFromQueue(Bot.Available, DAG->getBotRPTracker(), BotCand);
+ pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand);
assert(BotResult != NoCand && "failed to find the first candidate");
(void)BotResult;
SU = BotCand.SU;
@@ -932,10 +978,11 @@ SUnit *ConvergingVLIWScheduler::pickNode(bool &IsTopNode) {
if (SU->isBottomReady())
Bot.removeReady(SU);
- DEBUG(dbgs() << "*** " << (IsTopNode ? "Top" : "Bottom")
- << " Scheduling Instruction in cycle "
- << (IsTopNode ? Top.CurrCycle : Bot.CurrCycle) << '\n';
- SU->dump(DAG));
+ LLVM_DEBUG(dbgs() << "*** " << (IsTopNode ? "Top" : "Bottom")
+ << " Scheduling instruction in cycle "
+ << (IsTopNode ? Top.CurrCycle : Bot.CurrCycle) << " ("
+ << reportPackets() << ")\n";
+ SU->dump(DAG));
return SU;
}
@@ -945,10 +992,10 @@ SUnit *ConvergingVLIWScheduler::pickNode(bool &IsTopNode) {
/// does.
void ConvergingVLIWScheduler::schedNode(SUnit *SU, bool IsTopNode) {
if (IsTopNode) {
- SU->TopReadyCycle = Top.CurrCycle;
Top.bumpNode(SU);
+ SU->TopReadyCycle = Top.CurrCycle;
} else {
- SU->BotReadyCycle = Bot.CurrCycle;
Bot.bumpNode(SU);
+ SU->BotReadyCycle = Bot.CurrCycle;
}
}
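
The NodeNum tie breaker added to pickNodeFromQueue above is what removes the scheduling indeterminism mentioned in its comment: when two ready nodes have identical cost, the top queue falls back to the lower NodeNum (earlier in original order) and the bottom queue to the higher one. A minimal standalone sketch of just that ordering rule follows; the function name and plain unsigned parameters are illustrative stand-ins, not the LLVM types.

    #include <cassert>

    // Illustrative only: mirrors the tie-break direction used in the patch.
    // In the top queue the candidate with the smaller NodeNum wins; in the
    // bottom queue the larger NodeNum wins.
    static bool newCandidateWinsTie(bool IsTopQueue, unsigned NewNodeNum,
                                    unsigned CandNodeNum) {
      return IsTopQueue ? NewNodeNum < CandNodeNum : NewNodeNum > CandNodeNum;
    }

    int main() {
      assert(newCandidateWinsTie(/*IsTopQueue=*/true, 3, 7));   // top: lower wins
      assert(!newCandidateWinsTie(/*IsTopQueue=*/true, 7, 3));
      assert(newCandidateWinsTie(/*IsTopQueue=*/false, 7, 3));  // bottom: higher wins
      return 0;
    }
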
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.h b/lib/Target/Hexagon/HexagonMachineScheduler.h
index bf7fe2d484a2..585a7858ad2b 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.h
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.h
@@ -49,9 +49,6 @@ class VLIWResourceModel {
unsigned TotalPackets = 0;
public:
- /// Save the last formed packet.
- std::vector<SUnit *> OldPacket;
-
VLIWResourceModel(const TargetSubtargetInfo &STI, const TargetSchedModel *SM)
: SchedModel(SM) {
ResourcesModel = STI.getInstrInfo()->CreateTargetScheduleState(STI);
@@ -62,8 +59,6 @@ public:
Packet.resize(SchedModel->getIssueWidth());
Packet.clear();
- OldPacket.resize(SchedModel->getIssueWidth());
- OldPacket.clear();
ResourcesModel->clearResources();
}
@@ -84,9 +79,8 @@ public:
ResourcesModel->clearResources();
}
- bool isResourceAvailable(SUnit *SU);
- bool reserveResources(SUnit *SU);
- void savePacket();
+ bool isResourceAvailable(SUnit *SU, bool IsTop);
+ bool reserveResources(SUnit *SU, bool IsTop);
unsigned getTotalPackets() const { return TotalPackets; }
bool isInPacket(SUnit *SU) const { return is_contained(Packet, SU); }
};
@@ -102,6 +96,9 @@ public:
/// Schedule - This is called back from ScheduleDAGInstrs::Run() when it's
/// time to do some work.
void schedule() override;
+
+ RegisterClassInfo *getRegClassInfo() { return RegClassInfo; }
+ int getBBSize() { return BB->size(); }
};
//===----------------------------------------------------------------------===//
@@ -129,7 +126,7 @@ class ConvergingVLIWScheduler : public MachineSchedStrategy {
/// Represent the type of SchedCandidate found within a single queue.
enum CandResult {
NoCand, NodeOrder, SingleExcess, SingleCritical, SingleMax, MultiPressure,
- BestCost};
+ BestCost, Weak};
/// Each Scheduling boundary is associated with ready queues. It tracks the
 /// current cycle in whichever direction it has moved, and maintains the state
@@ -147,6 +144,7 @@ class ConvergingVLIWScheduler : public MachineSchedStrategy {
unsigned CurrCycle = 0;
unsigned IssueCount = 0;
+ unsigned CriticalPathLength = 0;
/// MinReadyCycle - Cycle of the soonest available instruction.
unsigned MinReadyCycle = std::numeric_limits<unsigned>::max();
@@ -168,7 +166,27 @@ class ConvergingVLIWScheduler : public MachineSchedStrategy {
void init(VLIWMachineScheduler *dag, const TargetSchedModel *smodel) {
DAG = dag;
SchedModel = smodel;
+ CurrCycle = 0;
IssueCount = 0;
+    // Initialize the critical path length limit, which is used by the scheduling
+ // cost model to determine the value for scheduling an instruction. We use
+ // a slightly different heuristic for small and large functions. For small
+ // functions, it's important to use the height/depth of the instruction.
+ // For large functions, prioritizing by height or depth increases spills.
+ CriticalPathLength = DAG->getBBSize() / SchedModel->getIssueWidth();
+ if (DAG->getBBSize() < 50)
+ // We divide by two as a cheap and simple heuristic to reduce the
+ // critcal path length, which increases the priority of using the graph
+ // height/depth in the scheduler's cost computation.
+ CriticalPathLength >>= 1;
+ else {
+ // For large basic blocks, we prefer a larger critical path length to
+ // decrease the priority of using the graph height/depth.
+ unsigned MaxPath = 0;
+ for (auto &SU : DAG->SUnits)
+ MaxPath = std::max(MaxPath, isTop() ? SU.getHeight() : SU.getDepth());
+ CriticalPathLength = std::max(CriticalPathLength, MaxPath) + 1;
+ }
}
bool isTop() const {
@@ -188,6 +206,13 @@ class ConvergingVLIWScheduler : public MachineSchedStrategy {
void removeReady(SUnit *SU);
SUnit *pickOnlyChoice();
+
+ bool isLatencyBound(SUnit *SU) {
+ if (CurrCycle >= CriticalPathLength)
+ return true;
+ unsigned PathLength = isTop() ? SU->getHeight() : SU->getDepth();
+ return CriticalPathLength - CurrCycle <= PathLength;
+ }
};
VLIWMachineScheduler *DAG = nullptr;
@@ -197,6 +222,9 @@ class ConvergingVLIWScheduler : public MachineSchedStrategy {
VLIWSchedBoundary Top;
VLIWSchedBoundary Bot;
+ /// List of pressure sets that have a high pressure level in the region.
+ std::vector<bool> HighPressureSets;
+
public:
/// SUnit::NodeQueueId: 0 (none), 1 (top), 2 (bot), 3 (both)
enum {
@@ -217,7 +245,7 @@ public:
void releaseBottomNode(SUnit *SU) override;
- unsigned ReportPackets() {
+ unsigned reportPackets() {
return Top.ResourceModel->getTotalPackets() +
Bot.ResourceModel->getTotalPackets();
}
@@ -225,11 +253,13 @@ public:
protected:
SUnit *pickNodeBidrectional(bool &IsTopNode);
+ int pressureChange(const SUnit *SU, bool isBotUp);
+
int SchedulingCost(ReadyQueue &Q,
SUnit *SU, SchedCandidate &Candidate,
RegPressureDelta &Delta, bool verbose);
- CandResult pickNodeFromQueue(ReadyQueue &Q,
+ CandResult pickNodeFromQueue(VLIWSchedBoundary &Zone,
const RegPressureTracker &RPTracker,
SchedCandidate &Candidate);
#ifndef NDEBUG
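
The height/depth terms in SchedulingCost are now gated by the isLatencyBound predicate declared above: an instruction only gets latency-driven priority once the cycles remaining before the critical-path limit no longer cover its remaining height (top-down) or depth (bottom-up). Below is a minimal standalone sketch of that predicate, with a toy struct standing in for VLIWSchedBoundary; the field names mirror the patch but this is not the LLVM class.

    #include <cassert>

    struct ToyBoundary {
      unsigned CurrCycle = 0;           // cycles already issued in this zone
      unsigned CriticalPathLength = 0;  // limit computed in init() above
      bool TopDown = true;              // top queue uses height, bottom uses depth

      bool isLatencyBound(unsigned Height, unsigned Depth) const {
        if (CurrCycle >= CriticalPathLength)
          return true;
        unsigned PathLength = TopDown ? Height : Depth;
        return CriticalPathLength - CurrCycle <= PathLength;
      }
    };

    int main() {
      ToyBoundary B;
      B.CriticalPathLength = 10;
      B.CurrCycle = 3;                                      // 7 cycles left to the limit
      assert(!B.isLatencyBound(/*Height=*/4, /*Depth=*/0)); // 4 < 7: not latency bound yet
      assert(B.isLatencyBound(/*Height=*/8, /*Depth=*/0));  // 8 >= 7: latency bound
      return 0;
    }
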
diff --git a/lib/Target/Hexagon/HexagonMapAsm2IntrinV65.gen.td b/lib/Target/Hexagon/HexagonMapAsm2IntrinV65.gen.td
index 718d3ac7d45a..c29a75e6fe74 100644
--- a/lib/Target/Hexagon/HexagonMapAsm2IntrinV65.gen.td
+++ b/lib/Target/Hexagon/HexagonMapAsm2IntrinV65.gen.td
@@ -7,80 +7,80 @@
//
//===----------------------------------------------------------------------===//
-def: Pat<(int_hexagon_A6_vcmpbeq_notany DoubleRegs:$src1, DoubleRegs:$src2), (A6_vcmpbeq_notany DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV65T]>;
-def: Pat<(int_hexagon_V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vasruwuhsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vasruhubsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vasruhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasruhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vasruhubrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasruhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vabsb HvxVR:$src1), (V6_vabsb HvxVR:$src1)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vabsb_128B HvxVR:$src1), (V6_vabsb HvxVR:$src1)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vabsb_sat HvxVR:$src1), (V6_vabsb_sat HvxVR:$src1)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vabsb_sat_128B HvxVR:$src1), (V6_vabsb_sat HvxVR:$src1)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vaslh_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vasrh_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vavguw HvxVR:$src1, HvxVR:$src2), (V6_vavguw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vavguw_128B HvxVR:$src1, HvxVR:$src2), (V6_vavguw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vavguwrnd HvxVR:$src1, HvxVR:$src2), (V6_vavguwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vavguwrnd_128B HvxVR:$src1, HvxVR:$src2), (V6_vavguwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vavgb HvxVR:$src1, HvxVR:$src2), (V6_vavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vavgb_128B HvxVR:$src1, HvxVR:$src2), (V6_vavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vavgbrnd HvxVR:$src1, HvxVR:$src2), (V6_vavgbrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vavgbrnd_128B HvxVR:$src1, HvxVR:$src2), (V6_vavgbrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vlut4 HvxVR:$src1, DoubleRegs:$src2), (V6_vlut4 HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vlut4_128B HvxVR:$src1, DoubleRegs:$src2), (V6_vlut4 HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vnavgb HvxVR:$src1, HvxVR:$src2), (V6_vnavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vnavgb_128B HvxVR:$src1, HvxVR:$src2), (V6_vnavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpabuu HvxWR:$src1, IntRegs:$src2), (V6_vmpabuu HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpabuu_128B HvxWR:$src1, IntRegs:$src2), (V6_vmpabuu HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpabuu_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), (V6_vmpabuu_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpabuu_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), (V6_vmpabuu_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpahhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vmpahhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpahhsat_128B HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vmpahhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpauhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vmpauhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpauhuhsat_128B HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vmpauhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpsuhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vmpsuhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpsuhuhsat_128B HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vmpsuhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpyh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vmpyh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpyh_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vmpyh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpyuhe HvxVR:$src1, IntRegs:$src2), (V6_vmpyuhe HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpyuhe_128B HvxVR:$src1, IntRegs:$src2), (V6_vmpyuhe HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpyuhe_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vmpyuhe_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpyuhe_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vmpyuhe_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vprefixqb HvxQR:$src1), (V6_vprefixqb HvxQR:$src1)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vprefixqb_128B HvxQR:$src1), (V6_vprefixqb HvxQR:$src1)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vprefixqh HvxQR:$src1), (V6_vprefixqh HvxQR:$src1)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vprefixqh_128B HvxQR:$src1), (V6_vprefixqh HvxQR:$src1)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vprefixqw HvxQR:$src1), (V6_vprefixqw HvxQR:$src1)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vprefixqw_128B HvxQR:$src1), (V6_vprefixqw HvxQR:$src1)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vrmpyub_rtt HvxVR:$src1, DoubleRegs:$src2), (V6_vrmpyub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vrmpyub_rtt_128B HvxVR:$src1, DoubleRegs:$src2), (V6_vrmpyub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vrmpyub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vrmpyub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vrmpyub_rtt_acc_128B HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vrmpyub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vrmpybub_rtt HvxVR:$src1, DoubleRegs:$src2), (V6_vrmpybub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vrmpybub_rtt_128B HvxVR:$src1, DoubleRegs:$src2), (V6_vrmpybub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vrmpybub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vrmpybub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vrmpybub_rtt_acc_128B HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vrmpybub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5), (V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5), (V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4), (V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4), (V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5), (V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermw_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermh_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermw_add_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermh_add_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermwq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5), (V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5), (V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhw_128B IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4), (V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhw_add_128B IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4), (V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhwq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5), (V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vdd0), (V6_vdd0)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vdd0_128B), (V6_vdd0)>, Requires<[HasV65T, UseHVX]>;
+def: Pat<(int_hexagon_A6_vcmpbeq_notany DoubleRegs:$src1, DoubleRegs:$src2), (A6_vcmpbeq_notany DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV65]>;
+def: Pat<(int_hexagon_V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vasruwuhsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vasruhubsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vasruhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasruhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vasruhubrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasruhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vabsb HvxVR:$src1), (V6_vabsb HvxVR:$src1)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vabsb_128B HvxVR:$src1), (V6_vabsb HvxVR:$src1)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vabsb_sat HvxVR:$src1), (V6_vabsb_sat HvxVR:$src1)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vabsb_sat_128B HvxVR:$src1), (V6_vabsb_sat HvxVR:$src1)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vaslh_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vasrh_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vavguw HvxVR:$src1, HvxVR:$src2), (V6_vavguw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vavguw_128B HvxVR:$src1, HvxVR:$src2), (V6_vavguw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vavguwrnd HvxVR:$src1, HvxVR:$src2), (V6_vavguwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vavguwrnd_128B HvxVR:$src1, HvxVR:$src2), (V6_vavguwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vavgb HvxVR:$src1, HvxVR:$src2), (V6_vavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vavgb_128B HvxVR:$src1, HvxVR:$src2), (V6_vavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vavgbrnd HvxVR:$src1, HvxVR:$src2), (V6_vavgbrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vavgbrnd_128B HvxVR:$src1, HvxVR:$src2), (V6_vavgbrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vlut4 HvxVR:$src1, DoubleRegs:$src2), (V6_vlut4 HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vlut4_128B HvxVR:$src1, DoubleRegs:$src2), (V6_vlut4 HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vnavgb HvxVR:$src1, HvxVR:$src2), (V6_vnavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vnavgb_128B HvxVR:$src1, HvxVR:$src2), (V6_vnavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpabuu HvxWR:$src1, IntRegs:$src2), (V6_vmpabuu HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpabuu_128B HvxWR:$src1, IntRegs:$src2), (V6_vmpabuu HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpabuu_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), (V6_vmpabuu_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpabuu_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), (V6_vmpabuu_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpahhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vmpahhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpahhsat_128B HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vmpahhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpauhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vmpauhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpauhuhsat_128B HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vmpauhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpsuhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vmpsuhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpsuhuhsat_128B HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vmpsuhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpyh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vmpyh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpyh_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vmpyh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpyuhe HvxVR:$src1, IntRegs:$src2), (V6_vmpyuhe HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpyuhe_128B HvxVR:$src1, IntRegs:$src2), (V6_vmpyuhe HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpyuhe_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vmpyuhe_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpyuhe_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vmpyuhe_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vprefixqb HvxQR:$src1), (V6_vprefixqb HvxQR:$src1)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vprefixqb_128B HvxQR:$src1), (V6_vprefixqb HvxQR:$src1)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vprefixqh HvxQR:$src1), (V6_vprefixqh HvxQR:$src1)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vprefixqh_128B HvxQR:$src1), (V6_vprefixqh HvxQR:$src1)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vprefixqw HvxQR:$src1), (V6_vprefixqw HvxQR:$src1)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vprefixqw_128B HvxQR:$src1), (V6_vprefixqw HvxQR:$src1)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vrmpyub_rtt HvxVR:$src1, DoubleRegs:$src2), (V6_vrmpyub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vrmpyub_rtt_128B HvxVR:$src1, DoubleRegs:$src2), (V6_vrmpyub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vrmpyub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vrmpyub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vrmpyub_rtt_acc_128B HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vrmpyub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vrmpybub_rtt HvxVR:$src1, DoubleRegs:$src2), (V6_vrmpybub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vrmpybub_rtt_128B HvxVR:$src1, DoubleRegs:$src2), (V6_vrmpybub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vrmpybub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vrmpybub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vrmpybub_rtt_acc_128B HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vrmpybub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5), (V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5), (V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4), (V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4), (V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5), (V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermw_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermh_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermw_add_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermh_add_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermwq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5), (V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermhq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5), (V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermhw_128B IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4), (V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermhw_add_128B IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4), (V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermhwq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5), (V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vdd0), (V6_vdd0)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vdd0_128B), (V6_vdd0)>, Requires<[HasV65, UseHVX]>;
diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp
index ffa447cc1311..f2a6627c99be 100644
--- a/lib/Target/Hexagon/HexagonNewValueJump.cpp
+++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp
@@ -16,7 +16,7 @@
 // The basic approach looks for a sequence of predicated jump, the compare instruction
 // that generates the predicate, and the feeder to the predicate. Once it finds
-// all, it collapses compare and jump instruction into a new valu jump
+// all, it collapses compare and jump instruction into a new value jump
 // instructions.
//
//===----------------------------------------------------------------------===//
@@ -24,6 +24,7 @@
#include "Hexagon.h"
#include "HexagonInstrInfo.h"
#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
@@ -95,7 +96,7 @@ namespace {
const HexagonInstrInfo *QII;
const HexagonRegisterInfo *QRI;
- /// \brief A handle to the branch probability pass.
+ /// A handle to the branch probability pass.
const MachineBranchProbabilityInfo *MBPI;
bool isNewValueJumpCandidate(const MachineInstr &MI) const;
@@ -142,8 +143,24 @@ static bool canBeFeederToNewValueJump(const HexagonInstrInfo *QII,
if (QII->isSolo(*II))
return false;
- // Make sure there there is no 'def' or 'use' of any of the uses of
- // feeder insn between it's definition, this MI and jump, jmpInst
+ if (QII->isFloat(*II))
+ return false;
+
+ // Make sure that the (unique) def operand is a register from IntRegs.
+ bool HadDef = false;
+ for (const MachineOperand &Op : II->operands()) {
+ if (!Op.isReg() || !Op.isDef())
+ continue;
+ if (HadDef)
+ return false;
+ HadDef = true;
+ if (!Hexagon::IntRegsRegClass.contains(Op.getReg()))
+ return false;
+ }
+ assert(HadDef);
+
+ // Make sure there is no 'def' or 'use' of any of the uses of
+ // feeder insn between its definition, this MI and jump, jmpInst
// skipping compare, cmpInst.
// Here's the example.
// r21=memub(r22+r24<<#0)
@@ -270,8 +287,8 @@ static bool canCompareBeNewValueJump(const HexagonInstrInfo *QII,
if (cmpReg1 == cmpOp2)
return false;
- // Make sure that that second register is not from COPY
- // At machine code level, we don't need this, but if we decide
+  // Make sure that the second register is not from a COPY.
+  // At the machine code level we don't need this, but if we decide
+  // to move new value jump prior to RA, we would need it.
MachineRegisterInfo &MRI = MF.getRegInfo();
if (secondReg && !TargetRegisterInfo::isPhysicalRegister(cmpOp2)) {
@@ -285,7 +302,7 @@ static bool canCompareBeNewValueJump(const HexagonInstrInfo *QII,
// and satisfy the following conditions.
++II;
for (MachineBasicBlock::iterator localII = II; localII != end; ++localII) {
- if (localII->isDebugValue())
+ if (localII->isDebugInstr())
continue;
// Check 1.
@@ -431,8 +448,8 @@ bool HexagonNewValueJump::isNewValueJumpCandidate(
}
bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "********** Hexagon New Value Jump **********\n"
- << "********** Function: " << MF.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "********** Hexagon New Value Jump **********\n"
+ << "********** Function: " << MF.getName() << "\n");
if (skipFunction(MF.getFunction()))
return false;
@@ -445,9 +462,9 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
MF.getSubtarget().getRegisterInfo());
MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
- if (DisableNewValueJumps) {
+ if (DisableNewValueJumps ||
+ !MF.getSubtarget<HexagonSubtarget>().useNewValueJumps())
return false;
- }
int nvjCount = DbgNVJCount;
int nvjGenerated = 0;
@@ -457,9 +474,10 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
MBBb != MBBe; ++MBBb) {
MachineBasicBlock *MBB = &*MBBb;
- DEBUG(dbgs() << "** dumping bb ** " << MBB->getNumber() << "\n");
- DEBUG(MBB->dump());
- DEBUG(dbgs() << "\n" << "********** dumping instr bottom up **********\n");
+ LLVM_DEBUG(dbgs() << "** dumping bb ** " << MBB->getNumber() << "\n");
+ LLVM_DEBUG(MBB->dump());
+ LLVM_DEBUG(dbgs() << "\n"
+ << "********** dumping instr bottom up **********\n");
bool foundJump = false;
bool foundCompare = false;
bool invertPredicate = false;
@@ -477,14 +495,14 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock::iterator MII = MBB->end(), E = MBB->begin();
MII != E;) {
MachineInstr &MI = *--MII;
- if (MI.isDebugValue()) {
+ if (MI.isDebugInstr()) {
continue;
}
if ((nvjCount == 0) || (nvjCount > -1 && nvjCount <= nvjGenerated))
break;
- DEBUG(dbgs() << "Instr: "; MI.dump(); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Instr: "; MI.dump(); dbgs() << "\n");
if (!foundJump && (MI.getOpcode() == Hexagon::J2_jumpt ||
MI.getOpcode() == Hexagon::J2_jumptpt ||
@@ -505,7 +523,7 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
// operands, the following check on the kill flag would suffice.
// if(!jmpInstr->getOperand(0).isKill()) break;
- // This predicate register is live out out of BB
+ // This predicate register is live out of BB
// this would only work if we can actually use Live
// variable analysis on phy regs - but LLVM does not
// provide LV analysis on phys regs.
diff --git a/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/lib/Target/Hexagon/HexagonOptAddrMode.cpp
index 4738a4d32409..29c044b3b729 100644
--- a/lib/Target/Hexagon/HexagonOptAddrMode.cpp
+++ b/lib/Target/Hexagon/HexagonOptAddrMode.cpp
@@ -27,6 +27,7 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
@@ -78,7 +79,9 @@ private:
using MISetType = DenseSet<MachineInstr *>;
using InstrEvalMap = DenseMap<MachineInstr *, bool>;
+ MachineRegisterInfo *MRI = nullptr;
const HexagonInstrInfo *HII = nullptr;
+ const HexagonRegisterInfo *HRI = nullptr;
MachineDominatorTree *MDT = nullptr;
DataFlowGraph *DFG = nullptr;
DataFlowGraph::DefStackMap DefM;
@@ -88,11 +91,16 @@ private:
bool processBlock(NodeAddr<BlockNode *> BA);
bool xformUseMI(MachineInstr *TfrMI, MachineInstr *UseMI,
NodeAddr<UseNode *> UseN, unsigned UseMOnum);
+ bool processAddUses(NodeAddr<StmtNode *> AddSN, MachineInstr *AddMI,
+ const NodeList &UNodeList);
+ bool updateAddUses(MachineInstr *AddMI, MachineInstr *UseMI);
bool analyzeUses(unsigned DefR, const NodeList &UNodeList,
InstrEvalMap &InstrEvalResult, short &SizeInc);
bool hasRepForm(MachineInstr &MI, unsigned TfrDefR);
bool canRemoveAddasl(NodeAddr<StmtNode *> AddAslSN, MachineInstr &MI,
const NodeList &UNodeList);
+ bool isSafeToExtLR(NodeAddr<StmtNode *> SN, MachineInstr *MI,
+ unsigned LRExtReg, const NodeList &UNodeList);
void getAllRealUses(NodeAddr<StmtNode *> SN, NodeList &UNodeList);
bool allValidCandidates(NodeAddr<StmtNode *> SA, NodeList &UNodeList);
short getBaseWithLongOffset(const MachineInstr &MI) const;
@@ -101,6 +109,7 @@ private:
bool changeLoad(MachineInstr *OldMI, MachineOperand ImmOp, unsigned ImmOpNum);
bool changeAddAsl(NodeAddr<UseNode *> AddAslUN, MachineInstr *AddAslMI,
const MachineOperand &ImmOp, unsigned ImmOpNum);
+ bool isValidOffset(MachineInstr *MI, int Offset);
};
} // end anonymous namespace
@@ -208,7 +217,7 @@ bool HexagonOptAddrMode::allValidCandidates(NodeAddr<StmtNode *> SA,
NodeSet Visited, Defs;
const auto &P = LV->getAllReachingDefsRec(UR, UN, Visited, Defs);
if (!P.second) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "*** Unable to collect all reaching defs for use ***\n"
<< PrintNode<UseNode*>(UN, *DFG) << '\n'
<< "The program's complexity may exceed the limits.\n";
@@ -217,7 +226,7 @@ bool HexagonOptAddrMode::allValidCandidates(NodeAddr<StmtNode *> SA,
}
const auto &ReachingDefs = P.first;
if (ReachingDefs.size() > 1) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "*** Multiple Reaching Defs found!!! ***\n";
for (auto DI : ReachingDefs) {
NodeAddr<UseNode *> DA = DFG->addr<UseNode *>(DI);
@@ -235,15 +244,15 @@ bool HexagonOptAddrMode::allValidCandidates(NodeAddr<StmtNode *> SA,
void HexagonOptAddrMode::getAllRealUses(NodeAddr<StmtNode *> SA,
NodeList &UNodeList) {
for (NodeAddr<DefNode *> DA : SA.Addr->members_if(DFG->IsDef, *DFG)) {
- DEBUG(dbgs() << "\t\t[DefNode]: " << Print<NodeAddr<DefNode *>>(DA, *DFG)
- << "\n");
+ LLVM_DEBUG(dbgs() << "\t\t[DefNode]: "
+ << Print<NodeAddr<DefNode *>>(DA, *DFG) << "\n");
RegisterRef DR = DFG->getPRI().normalize(DA.Addr->getRegRef(*DFG));
auto UseSet = LV->getAllReachedUses(DR, DA);
for (auto UI : UseSet) {
NodeAddr<UseNode *> UA = DFG->addr<UseNode *>(UI);
- DEBUG({
+ LLVM_DEBUG({
NodeAddr<StmtNode *> TempIA = UA.Addr->getOwner(*DFG);
dbgs() << "\t\t\t[Reached Use]: "
<< Print<NodeAddr<InstrNode *>>(TempIA, *DFG) << "\n";
@@ -253,8 +262,8 @@ void HexagonOptAddrMode::getAllRealUses(NodeAddr<StmtNode *> SA,
NodeAddr<PhiNode *> PA = UA.Addr->getOwner(*DFG);
NodeId id = PA.Id;
const Liveness::RefMap &phiUse = LV->getRealUses(id);
- DEBUG(dbgs() << "\t\t\t\tphi real Uses"
- << Print<Liveness::RefMap>(phiUse, *DFG) << "\n");
+ LLVM_DEBUG(dbgs() << "\t\t\t\tphi real Uses"
+ << Print<Liveness::RefMap>(phiUse, *DFG) << "\n");
if (!phiUse.empty()) {
for (auto I : phiUse) {
if (!DFG->getPRI().alias(RegisterRef(I.first), DR))
@@ -272,6 +281,153 @@ void HexagonOptAddrMode::getAllRealUses(NodeAddr<StmtNode *> SA,
}
}
+bool HexagonOptAddrMode::isSafeToExtLR(NodeAddr<StmtNode *> SN,
+ MachineInstr *MI, unsigned LRExtReg,
+ const NodeList &UNodeList) {
+ RegisterRef LRExtRR;
+ NodeId LRExtRegRD = 0;
+ // Iterate through all the UseNodes in SN and find the reaching def
+ // for the LRExtReg.
+ for (NodeAddr<UseNode *> UA : SN.Addr->members_if(DFG->IsUse, *DFG)) {
+ RegisterRef RR = UA.Addr->getRegRef(*DFG);
+ if (LRExtReg == RR.Reg) {
+ LRExtRR = RR;
+ LRExtRegRD = UA.Addr->getReachingDef();
+ }
+ }
+
+ for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
+ NodeAddr<UseNode *> UA = *I;
+ NodeAddr<InstrNode *> IA = UA.Addr->getOwner(*DFG);
+    // The reaching def of LRExtRR at the load/store node should be the same
+    // as the one reaching SN.
+ if (UA.Addr->getFlags() & NodeAttrs::PhiRef)
+ return false;
+ NodeAddr<RefNode*> AA = LV->getNearestAliasedRef(LRExtRR, IA);
+ if ((DFG->IsDef(AA) && AA.Id != LRExtRegRD) ||
+ AA.Addr->getReachingDef() != LRExtRegRD) {
+ LLVM_DEBUG(
+ dbgs() << "isSafeToExtLR: Returning false; another reaching def\n");
+ return false;
+ }
+
+ MachineInstr *UseMI = NodeAddr<StmtNode *>(IA).Addr->getCode();
+ NodeAddr<DefNode *> LRExtRegDN = DFG->addr<DefNode *>(LRExtRegRD);
+ // Reaching Def to LRExtReg can't be a phi.
+ if ((LRExtRegDN.Addr->getFlags() & NodeAttrs::PhiRef) &&
+ MI->getParent() != UseMI->getParent())
+ return false;
+ }
+ return true;
+}
+
+bool HexagonOptAddrMode::isValidOffset(MachineInstr *MI, int Offset) {
+ unsigned AlignMask = 0;
+ switch (HII->getMemAccessSize(*MI)) {
+ case HexagonII::MemAccessSize::DoubleWordAccess:
+ AlignMask = 0x7;
+ break;
+ case HexagonII::MemAccessSize::WordAccess:
+ AlignMask = 0x3;
+ break;
+ case HexagonII::MemAccessSize::HalfWordAccess:
+ AlignMask = 0x1;
+ break;
+ case HexagonII::MemAccessSize::ByteAccess:
+ AlignMask = 0x0;
+ break;
+ default:
+ return false;
+ }
+
+ if ((AlignMask & Offset) != 0)
+ return false;
+ return HII->isValidOffset(MI->getOpcode(), Offset, HRI, false);
+}
+
+bool HexagonOptAddrMode::processAddUses(NodeAddr<StmtNode *> AddSN,
+ MachineInstr *AddMI,
+ const NodeList &UNodeList) {
+
+ unsigned AddDefR = AddMI->getOperand(0).getReg();
+ for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
+ NodeAddr<UseNode *> UN = *I;
+ NodeAddr<StmtNode *> SN = UN.Addr->getOwner(*DFG);
+ MachineInstr *MI = SN.Addr->getCode();
+ const MCInstrDesc &MID = MI->getDesc();
+ if ((!MID.mayLoad() && !MID.mayStore()) ||
+ HII->getAddrMode(*MI) != HexagonII::BaseImmOffset ||
+ HII->isHVXVec(*MI))
+ return false;
+
+ MachineOperand BaseOp = MID.mayLoad() ? MI->getOperand(1)
+ : MI->getOperand(0);
+
+ if (!BaseOp.isReg() || BaseOp.getReg() != AddDefR)
+ return false;
+
+ MachineOperand OffsetOp = MID.mayLoad() ? MI->getOperand(2)
+ : MI->getOperand(1);
+ if (!OffsetOp.isImm())
+ return false;
+
+ int64_t newOffset = OffsetOp.getImm() + AddMI->getOperand(2).getImm();
+ if (!isValidOffset(MI, newOffset))
+ return false;
+
+    // Since we'll be extending the live range of Rt in the following example,
+    // make sure that it is safe: another definition of Rt must not exist between
+    // the 'add' and the load/store instruction.
+ //
+ // Ex: Rx= add(Rt,#10)
+ // memw(Rx+#0) = Rs
+ // will be replaced with => memw(Rt+#10) = Rs
+ unsigned BaseReg = AddMI->getOperand(1).getReg();
+ if (!isSafeToExtLR(AddSN, AddMI, BaseReg, UNodeList))
+ return false;
+ }
+
+ // Update all the uses of 'add' with the appropriate base and offset
+ // values.
+ bool Changed = false;
+ for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
+ NodeAddr<UseNode *> UseN = *I;
+ assert(!(UseN.Addr->getFlags() & NodeAttrs::PhiRef) &&
+ "Found a PhiRef node as a real reached use!!");
+
+ NodeAddr<StmtNode *> OwnerN = UseN.Addr->getOwner(*DFG);
+ MachineInstr *UseMI = OwnerN.Addr->getCode();
+ LLVM_DEBUG(dbgs() << "\t\t[MI <BB#" << UseMI->getParent()->getNumber()
+ << ">]: " << *UseMI << "\n");
+ Changed |= updateAddUses(AddMI, UseMI);
+ }
+
+ if (Changed)
+ Deleted.insert(AddMI);
+
+ return Changed;
+}
+
+bool HexagonOptAddrMode::updateAddUses(MachineInstr *AddMI,
+ MachineInstr *UseMI) {
+ const MachineOperand ImmOp = AddMI->getOperand(2);
+ const MachineOperand AddRegOp = AddMI->getOperand(1);
+ unsigned newReg = AddRegOp.getReg();
+ const MCInstrDesc &MID = UseMI->getDesc();
+
+ MachineOperand &BaseOp = MID.mayLoad() ? UseMI->getOperand(1)
+ : UseMI->getOperand(0);
+ MachineOperand &OffsetOp = MID.mayLoad() ? UseMI->getOperand(2)
+ : UseMI->getOperand(1);
+ BaseOp.setReg(newReg);
+ BaseOp.setIsUndef(AddRegOp.isUndef());
+ BaseOp.setImplicit(AddRegOp.isImplicit());
+ OffsetOp.setImm(ImmOp.getImm() + OffsetOp.getImm());
+ MRI->clearKillFlags(newReg);
+
+ return true;
+}
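+// Illustrative effect of updateAddUses (hypothetical registers/offsets):
+//   r1 = add(r2, #10)
+//   memw(r1 + #4) = r3
+// becomes
+//   memw(r2 + #14) = r3
+// i.e. the base register is replaced and the two immediates are summed.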
+
bool HexagonOptAddrMode::analyzeUses(unsigned tfrDefR,
const NodeList &UNodeList,
InstrEvalMap &InstrEvalResult,
@@ -296,7 +452,7 @@ bool HexagonOptAddrMode::analyzeUses(unsigned tfrDefR,
} else if (MI.getOpcode() == Hexagon::S2_addasl_rrri) {
NodeList AddaslUseList;
- DEBUG(dbgs() << "\nGetting ReachedUses for === " << MI << "\n");
+ LLVM_DEBUG(dbgs() << "\nGetting ReachedUses for === " << MI << "\n");
getAllRealUses(SN, AddaslUseList);
// Process phi nodes.
if (allValidCandidates(SN, AddaslUseList) &&
@@ -360,8 +516,8 @@ bool HexagonOptAddrMode::changeLoad(MachineInstr *OldMI, MachineOperand ImmOp,
} else
Changed = false;
- DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
- DEBUG(dbgs() << "[TO]: " << *MIB << "\n");
+ LLVM_DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
+ LLVM_DEBUG(dbgs() << "[TO]: " << *MIB << "\n");
} else if (ImmOpNum == 2 && OldMI->getOperand(3).getImm() == 0) {
short NewOpCode = HII->changeAddrMode_rr_io(*OldMI);
assert(NewOpCode >= 0 && "Invalid New opcode\n");
@@ -371,8 +527,8 @@ bool HexagonOptAddrMode::changeLoad(MachineInstr *OldMI, MachineOperand ImmOp,
MIB.add(ImmOp);
OpStart = 4;
Changed = true;
- DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
- DEBUG(dbgs() << "[TO]: " << *MIB << "\n");
+ LLVM_DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
+ LLVM_DEBUG(dbgs() << "[TO]: " << *MIB << "\n");
}
if (Changed)
@@ -413,8 +569,8 @@ bool HexagonOptAddrMode::changeStore(MachineInstr *OldMI, MachineOperand ImmOp,
OpStart = 3;
}
Changed = true;
- DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
- DEBUG(dbgs() << "[TO]: " << *MIB << "\n");
+ LLVM_DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
+ LLVM_DEBUG(dbgs() << "[TO]: " << *MIB << "\n");
} else if (ImmOpNum == 1 && OldMI->getOperand(2).getImm() == 0) {
short NewOpCode = HII->changeAddrMode_rr_io(*OldMI);
assert(NewOpCode >= 0 && "Invalid New opcode\n");
@@ -423,8 +579,8 @@ bool HexagonOptAddrMode::changeStore(MachineInstr *OldMI, MachineOperand ImmOp,
MIB.add(ImmOp);
OpStart = 3;
Changed = true;
- DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
- DEBUG(dbgs() << "[TO]: " << *MIB << "\n");
+ LLVM_DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
+ LLVM_DEBUG(dbgs() << "[TO]: " << *MIB << "\n");
}
if (Changed)
for (unsigned i = OpStart; i < OpEnd; ++i)
@@ -447,7 +603,7 @@ bool HexagonOptAddrMode::changeAddAsl(NodeAddr<UseNode *> AddAslUN,
unsigned ImmOpNum) {
NodeAddr<StmtNode *> SA = AddAslUN.Addr->getOwner(*DFG);
- DEBUG(dbgs() << "Processing addasl :" << *AddAslMI << "\n");
+ LLVM_DEBUG(dbgs() << "Processing addasl :" << *AddAslMI << "\n");
NodeList UNodeList;
getAllRealUses(SA, UNodeList);
@@ -458,11 +614,11 @@ bool HexagonOptAddrMode::changeAddAsl(NodeAddr<UseNode *> AddAslUN,
"Can't transform this 'AddAsl' instruction!");
NodeAddr<StmtNode *> UseIA = UseUN.Addr->getOwner(*DFG);
- DEBUG(dbgs() << "[InstrNode]: " << Print<NodeAddr<InstrNode *>>(UseIA, *DFG)
- << "\n");
+ LLVM_DEBUG(dbgs() << "[InstrNode]: "
+ << Print<NodeAddr<InstrNode *>>(UseIA, *DFG) << "\n");
MachineInstr *UseMI = UseIA.Addr->getCode();
- DEBUG(dbgs() << "[MI <" << printMBBReference(*UseMI->getParent())
- << ">]: " << *UseMI << "\n");
+ LLVM_DEBUG(dbgs() << "[MI <" << printMBBReference(*UseMI->getParent())
+ << ">]: " << *UseMI << "\n");
const MCInstrDesc &UseMID = UseMI->getDesc();
assert(HII->getAddrMode(*UseMI) == HexagonII::BaseImmOffset);
@@ -534,13 +690,15 @@ bool HexagonOptAddrMode::processBlock(NodeAddr<BlockNode *> BA) {
NodeAddr<StmtNode *> SA = IA;
MachineInstr *MI = SA.Addr->getCode();
- if (MI->getOpcode() != Hexagon::A2_tfrsi ||
- !MI->getOperand(1).isGlobal())
- continue;
+ if ((MI->getOpcode() != Hexagon::A2_tfrsi ||
+ !MI->getOperand(1).isGlobal()) &&
+ (MI->getOpcode() != Hexagon::A2_addi ||
+ !MI->getOperand(2).isImm() || HII->isConstExtended(*MI)))
+ continue;
- DEBUG(dbgs() << "[Analyzing " << HII->getName(MI->getOpcode()) << "]: "
- << *MI << "\n\t[InstrNode]: "
- << Print<NodeAddr<InstrNode *>>(IA, *DFG) << '\n');
+ LLVM_DEBUG(dbgs() << "[Analyzing " << HII->getName(MI->getOpcode())
+ << "]: " << *MI << "\n\t[InstrNode]: "
+ << Print<NodeAddr<InstrNode *>>(IA, *DFG) << '\n');
NodeList UNodeList;
getAllRealUses(SA, UNodeList);
@@ -548,6 +706,21 @@ bool HexagonOptAddrMode::processBlock(NodeAddr<BlockNode *> BA) {
if (!allValidCandidates(SA, UNodeList))
continue;
+    // Analyze all uses of 'add'. If the output of 'add' is used as an address
+    // in base+immediate addressing-mode load/store instructions, see if they
+    // can be updated to use the immediate value as an offset, giving us the
+    // opportunity to eliminate the 'add'.
+ // Ex: Rx= add(Rt,#12)
+ // memw(Rx+#0) = Rs
+ // This can be replaced with memw(Rt+#12) = Rs
+ //
+ // This transformation is only performed if all uses can be updated and
+ // the offset isn't required to be constant extended.
+ if (MI->getOpcode() == Hexagon::A2_addi) {
+ Changed |= processAddUses(SA, MI, UNodeList);
+ continue;
+ }
+
short SizeInc = 0;
unsigned DefR = MI->getOperand(0).getReg();
InstrEvalMap InstrEvalResult;
@@ -561,8 +734,9 @@ bool HexagonOptAddrMode::processBlock(NodeAddr<BlockNode *> BA) {
bool KeepTfr = false;
- DEBUG(dbgs() << "\t[Total reached uses] : " << UNodeList.size() << "\n");
- DEBUG(dbgs() << "\t[Processing Reached Uses] ===\n");
+ LLVM_DEBUG(dbgs() << "\t[Total reached uses] : " << UNodeList.size()
+ << "\n");
+ LLVM_DEBUG(dbgs() << "\t[Processing Reached Uses] ===\n");
for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
NodeAddr<UseNode *> UseN = *I;
assert(!(UseN.Addr->getFlags() & NodeAttrs::PhiRef) &&
@@ -570,8 +744,8 @@ bool HexagonOptAddrMode::processBlock(NodeAddr<BlockNode *> BA) {
NodeAddr<StmtNode *> OwnerN = UseN.Addr->getOwner(*DFG);
MachineInstr *UseMI = OwnerN.Addr->getCode();
- DEBUG(dbgs() << "\t\t[MI <" << printMBBReference(*UseMI->getParent())
- << ">]: " << *UseMI << "\n");
+ LLVM_DEBUG(dbgs() << "\t\t[MI <" << printMBBReference(*UseMI->getParent())
+ << ">]: " << *UseMI << "\n");
int UseMOnum = -1;
unsigned NumOperands = UseMI->getNumOperands();
@@ -580,9 +754,11 @@ bool HexagonOptAddrMode::processBlock(NodeAddr<BlockNode *> BA) {
if (op.isReg() && op.isUse() && DefR == op.getReg())
UseMOnum = j;
}
- assert(UseMOnum >= 0 && "Invalid reached use!");
+ // It is possible that the register will not be found in any operand.
+ // This could happen, for example, when DefR = R4, but the used
+ // register is D2.
- if (InstrEvalResult[UseMI])
+ if (UseMOnum >= 0 && InstrEvalResult[UseMI])
// Change UseMI if replacement is possible.
Changed |= xformUseMI(MI, UseMI, UseN, UseMOnum);
else
@@ -600,27 +776,27 @@ bool HexagonOptAddrMode::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
auto &HST = MF.getSubtarget<HexagonSubtarget>();
- auto &MRI = MF.getRegInfo();
+ MRI = &MF.getRegInfo();
HII = HST.getInstrInfo();
+ HRI = HST.getRegisterInfo();
const auto &MDF = getAnalysis<MachineDominanceFrontier>();
MDT = &getAnalysis<MachineDominatorTree>();
- const auto &TRI = *MF.getSubtarget().getRegisterInfo();
const TargetOperandInfo TOI(*HII);
- DataFlowGraph G(MF, *HII, TRI, *MDT, MDF, TOI);
+ DataFlowGraph G(MF, *HII, *HRI, *MDT, MDF, TOI);
// Need to keep dead phis because we can propagate uses of registers into
// nodes dominated by those would-be phis.
G.build(BuildOptions::KeepDeadPhis);
DFG = &G;
- Liveness L(MRI, *DFG);
+ Liveness L(*MRI, *DFG);
L.computePhiInfo();
LV = &L;
Deleted.clear();
NodeAddr<FuncNode *> FA = DFG->getFunc();
- DEBUG(dbgs() << "==== [RefMap#]=====:\n "
- << Print<NodeAddr<FuncNode *>>(FA, *DFG) << "\n");
+ LLVM_DEBUG(dbgs() << "==== [RefMap#]=====:\n "
+ << Print<NodeAddr<FuncNode *>>(FA, *DFG) << "\n");
for (NodeAddr<BlockNode *> BA : FA.Addr->members(*DFG))
Changed |= processBlock(BA);
diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td
index cdc2085986a5..384fda4ce39a 100644
--- a/lib/Target/Hexagon/HexagonPatterns.td
+++ b/lib/Target/Hexagon/HexagonPatterns.td
@@ -100,6 +100,17 @@ def HWI8: PatLeaf<(VecPI8 HvxWR:$R)>;
def HWI16: PatLeaf<(VecPI16 HvxWR:$R)>;
def HWI32: PatLeaf<(VecPI32 HvxWR:$R)>;
+def SDTVecVecIntOp:
+ SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<1,2>,
+ SDTCisVT<3,i32>]>;
+
+def HexagonVALIGN: SDNode<"HexagonISD::VALIGN", SDTVecVecIntOp>;
+def HexagonVALIGNADDR: SDNode<"HexagonISD::VALIGNADDR", SDTIntUnaryOp>;
+
+def valign: PatFrag<(ops node:$Vt, node:$Vs, node:$Ru),
+ (HexagonVALIGN node:$Vt, node:$Vs, node:$Ru)>;
+def valignaddr: PatFrag<(ops node:$Addr), (HexagonVALIGNADDR node:$Addr)>;
+
// Pattern fragments to extract the low and high subregisters from a
// 64-bit value.
def LoReg: OutPatFrag<(ops node:$Rs), (EXTRACT_SUBREG (i64 $Rs), isub_lo)>;
@@ -109,16 +120,6 @@ def IsOrAdd: PatFrag<(ops node:$A, node:$B), (or node:$A, node:$B), [{
return isOrEquivalentToAdd(N);
}]>;
-def IsVecOff : PatLeaf<(i32 imm), [{
- int32_t V = N->getSExtValue();
- int32_t VecSize = HRI->getSpillSize(Hexagon::HvxVRRegClass);
- assert(isPowerOf2_32(VecSize));
- if ((uint32_t(V) & (uint32_t(VecSize)-1)) != 0)
- return false;
- int32_t L = Log2_32(VecSize);
- return isInt<4>(V >> L);
-}]>;
-
def IsPow2_32: PatLeaf<(i32 imm), [{
uint32_t V = N->getZExtValue();
return isPowerOf2_32(V);
@@ -214,7 +215,7 @@ def NegImm32: SDNodeXForm<imm, [{
// Helpers for type promotions/contractions.
def I1toI32: OutPatFrag<(ops node:$Rs), (C2_muxii (i1 $Rs), 1, 0)>;
-def I32toI1: OutPatFrag<(ops node:$Rs), (i1 (C2_tfrrp (i32 $Rs)))>;
+def I32toI1: OutPatFrag<(ops node:$Rs), (i1 (C2_cmpgtui (i32 $Rs), (i32 0)))>;
def ToZext64: OutPatFrag<(ops node:$Rs), (i64 (A4_combineir 0, (i32 $Rs)))>;
def ToSext64: OutPatFrag<(ops node:$Rs), (i64 (A2_sxtw (i32 $Rs)))>;
@@ -249,23 +250,6 @@ def: Pat<(IsOrAdd (i32 AddrFI:$Rs), s32_0ImmPred:$off),
(PS_fi (i32 AddrFI:$Rs), imm:$off)>;
-def alignedload: PatFrag<(ops node:$a), (load $a), [{
- return isAlignedMemNode(dyn_cast<MemSDNode>(N));
-}]>;
-
-def unalignedload: PatFrag<(ops node:$a), (load $a), [{
- return !isAlignedMemNode(dyn_cast<MemSDNode>(N));
-}]>;
-
-def alignedstore: PatFrag<(ops node:$v, node:$a), (store $v, $a), [{
- return isAlignedMemNode(dyn_cast<MemSDNode>(N));
-}]>;
-
-def unalignedstore: PatFrag<(ops node:$v, node:$a), (store $v, $a), [{
- return !isAlignedMemNode(dyn_cast<MemSDNode>(N));
-}]>;
-
-
// Converters from unary/binary SDNode to PatFrag.
class pf1<SDNode Op> : PatFrag<(ops node:$a), (Op node:$a)>;
class pf2<SDNode Op> : PatFrag<(ops node:$a, node:$b), (Op node:$a, node:$b)>;
@@ -274,7 +258,7 @@ class Not2<PatFrag P>
: PatFrag<(ops node:$A, node:$B), (P node:$A, (not node:$B))>;
class Su<PatFrag Op>
- : PatFrag<Op.Operands, Op.Fragment, [{ return hasOneUse(N); }],
+ : PatFrag<Op.Operands, !head(Op.Fragments), [{ return hasOneUse(N); }],
Op.OperandTransform>;
// Main selection macros.
@@ -298,9 +282,9 @@ class AccRRI_pat<InstHexagon MI, PatFrag AccOp, PatFrag Op,
(MI RegPred:$Rx, RegPred:$Rs, imm:$I)>;
class AccRRR_pat<InstHexagon MI, PatFrag AccOp, PatFrag Op,
- PatFrag RsPred, PatFrag RtPred>
- : Pat<(AccOp RsPred:$Rx, (Op RsPred:$Rs, RtPred:$Rt)),
- (MI RsPred:$Rx, RsPred:$Rs, RtPred:$Rt)>;
+ PatFrag RxPred, PatFrag RsPred, PatFrag RtPred>
+ : Pat<(AccOp RxPred:$Rx, (Op RsPred:$Rs, RtPred:$Rt)),
+ (MI RxPred:$Rx, RsPred:$Rs, RtPred:$Rt)>;
multiclass SelMinMax_pats<PatFrag CmpOp, PatFrag Val,
InstHexagon InstA, InstHexagon InstB> {
@@ -316,6 +300,7 @@ def Add: pf2<add>; def And: pf2<and>; def Sra: pf2<sra>;
def Sub: pf2<sub>; def Or: pf2<or>; def Srl: pf2<srl>;
def Mul: pf2<mul>; def Xor: pf2<xor>; def Shl: pf2<shl>;
+def Rol: pf2<rotl>;
// --(1) Immediate -------------------------------------------------------
//
@@ -363,7 +348,7 @@ def ToI32: OutPatFrag<(ops node:$V), (A2_tfrsi $V)>;
// --(2) Type cast -------------------------------------------------------
//
-let Predicates = [HasV5T] in {
+let Predicates = [HasV5] in {
def: OpR_R_pat<F2_conv_sf2df, pf1<fpextend>, f64, F32>;
def: OpR_R_pat<F2_conv_df2sf, pf1<fpround>, f32, F64>;
@@ -389,7 +374,7 @@ let Predicates = [HasV5T] in {
}
// Bitcast is different than [fp|sint|uint]_to_[sint|uint|fp].
-let Predicates = [HasV5T] in {
+let Predicates = [HasV5] in {
def: Pat<(i32 (bitconvert F32:$v)), (I32:$v)>;
def: Pat<(f32 (bitconvert I32:$v)), (F32:$v)>;
def: Pat<(i64 (bitconvert F64:$v)), (I64:$v)>;
@@ -422,9 +407,14 @@ def: Pat<(i64 (sext I1:$Pu)),
(Combinew (C2_muxii PredRegs:$Pu, -1, 0),
(C2_muxii PredRegs:$Pu, -1, 0))>;
-def: Pat<(i32 (sext I1:$Pu)), (C2_muxii I1:$Pu, -1, 0)>;
-def: Pat<(i32 (zext I1:$Pu)), (C2_muxii I1:$Pu, 1, 0)>;
-def: Pat<(i64 (zext I1:$Pu)), (ToZext64 (C2_muxii I1:$Pu, 1, 0))>;
+def: Pat<(i32 (sext I1:$Pu)), (C2_muxii I1:$Pu, -1, 0)>;
+def: Pat<(i32 (zext I1:$Pu)), (C2_muxii I1:$Pu, 1, 0)>;
+def: Pat<(i64 (zext I1:$Pu)), (ToZext64 (C2_muxii I1:$Pu, 1, 0))>;
+def: Pat<(v2i16 (sext V2I1:$Pu)), (S2_vtrunehb (C2_mask V2I1:$Pu))>;
+def: Pat<(v2i32 (sext V2I1:$Pu)), (C2_mask V2I1:$Pu)>;
+def: Pat<(v4i8 (sext V4I1:$Pu)), (S2_vtrunehb (C2_mask V4I1:$Pu))>;
+def: Pat<(v4i16 (sext V4I1:$Pu)), (C2_mask V4I1:$Pu)>;
+def: Pat<(v8i8 (sext V8I1:$Pu)), (C2_mask V8I1:$Pu)>;
def: Pat<(i64 (sext I32:$Rs)), (A2_sxtw I32:$Rs)>;
def: Pat<(Zext64 I32:$Rs), (ToZext64 $Rs)>;
@@ -441,6 +431,20 @@ let AddedComplexity = 20 in {
def: Pat<(i32 (anyext I1:$Pu)), (C2_muxii I1:$Pu, 1, 0)>;
def: Pat<(i64 (anyext I1:$Pu)), (ToZext64 (C2_muxii I1:$Pu, 1, 0))>;
+def Vsplatpi: OutPatFrag<(ops node:$V),
+ (Combinew (A2_tfrsi $V), (A2_tfrsi $V))>;
+def: Pat<(v8i8 (zext V8I1:$Pu)),
+ (A2_andp (C2_mask V8I1:$Pu), (Vsplatpi (i32 0x01010101)))>;
+def: Pat<(v4i16 (zext V4I1:$Pu)),
+ (A2_andp (C2_mask V4I1:$Pu), (Vsplatpi (i32 0x00010001)))>;
+def: Pat<(v2i32 (zext V2I1:$Pu)),
+ (A2_andp (C2_mask V2I1:$Pu), (A2_combineii (i32 1), (i32 1)))>;
+
+def: Pat<(v4i8 (zext V4I1:$Pu)),
+ (A2_andir (LoReg (C2_mask V4I1:$Pu)), (i32 0x01010101))>;
+def: Pat<(v2i16 (zext V2I1:$Pu)),
+ (A2_andir (LoReg (C2_mask V2I1:$Pu)), (i32 0x00010001))>;
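+// Note on the zext patterns above: C2_mask expands each true predicate lane
+// to an all-ones lane, so and-ing with a splat of 1 per lane (0x01010101 for
+// bytes, 0x00010001 for halfwords) produces the 0/1 results that zext
+// requires.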
+
def: Pat<(v4i16 (zext V4I8:$Rs)), (S2_vzxtbh V4I8:$Rs)>;
def: Pat<(v2i32 (zext V2I16:$Rs)), (S2_vzxthw V2I16:$Rs)>;
def: Pat<(v4i16 (anyext V4I8:$Rs)), (S2_vzxtbh V4I8:$Rs)>;
@@ -475,25 +479,40 @@ def: Pat<(v2i16 (trunc V2I32:$Rs)),
//
def: Pat<(not I1:$Ps), (C2_not I1:$Ps)>;
+def: Pat<(not V8I1:$Ps), (C2_not V8I1:$Ps)>;
def: Pat<(add I1:$Ps, -1), (C2_not I1:$Ps)>;
-def: OpR_RR_pat<C2_and, And, i1, I1>;
-def: OpR_RR_pat<C2_or, Or, i1, I1>;
-def: OpR_RR_pat<C2_xor, Xor, i1, I1>;
-def: OpR_RR_pat<C2_andn, Not2<And>, i1, I1>;
-def: OpR_RR_pat<C2_orn, Not2<Or>, i1, I1>;
+multiclass BoolOpR_RR_pat<InstHexagon MI, PatFrag Op> {
+ def: OpR_RR_pat<MI, Op, i1, I1>;
+ def: OpR_RR_pat<MI, Op, v2i1, V2I1>;
+ def: OpR_RR_pat<MI, Op, v4i1, V4I1>;
+ def: OpR_RR_pat<MI, Op, v8i1, V8I1>;
+}
+
+multiclass BoolAccRRR_pat<InstHexagon MI, PatFrag AccOp, PatFrag Op> {
+ def: AccRRR_pat<MI, AccOp, Op, I1, I1, I1>;
+ def: AccRRR_pat<MI, AccOp, Op, V2I1, V2I1, V2I1>;
+ def: AccRRR_pat<MI, AccOp, Op, V4I1, V4I1, V4I1>;
+ def: AccRRR_pat<MI, AccOp, Op, V8I1, V8I1, V8I1>;
+}
+
+defm: BoolOpR_RR_pat<C2_and, And>;
+defm: BoolOpR_RR_pat<C2_or, Or>;
+defm: BoolOpR_RR_pat<C2_xor, Xor>;
+defm: BoolOpR_RR_pat<C2_andn, Not2<And>>;
+defm: BoolOpR_RR_pat<C2_orn, Not2<Or>>;
// op(Ps, op(Pt, Pu))
-def: AccRRR_pat<C4_and_and, And, Su<And>, I1, I1>;
-def: AccRRR_pat<C4_and_or, And, Su<Or>, I1, I1>;
-def: AccRRR_pat<C4_or_and, Or, Su<And>, I1, I1>;
-def: AccRRR_pat<C4_or_or, Or, Su<Or>, I1, I1>;
+defm: BoolAccRRR_pat<C4_and_and, And, Su<And>>;
+defm: BoolAccRRR_pat<C4_and_or, And, Su<Or>>;
+defm: BoolAccRRR_pat<C4_or_and, Or, Su<And>>;
+defm: BoolAccRRR_pat<C4_or_or, Or, Su<Or>>;
// op(Ps, op(Pt, ~Pu))
-def: AccRRR_pat<C4_and_andn, And, Su<Not2<And>>, I1, I1>;
-def: AccRRR_pat<C4_and_orn, And, Su<Not2<Or>>, I1, I1>;
-def: AccRRR_pat<C4_or_andn, Or, Su<Not2<And>>, I1, I1>;
-def: AccRRR_pat<C4_or_orn, Or, Su<Not2<Or>>, I1, I1>;
+defm: BoolAccRRR_pat<C4_and_andn, And, Su<Not2<And>>>;
+defm: BoolAccRRR_pat<C4_and_orn, And, Su<Not2<Or>>>;
+defm: BoolAccRRR_pat<C4_or_andn, Or, Su<Not2<And>>>;
+defm: BoolAccRRR_pat<C4_or_orn, Or, Su<Not2<Or>>>;
// --(5) Compare ---------------------------------------------------------
@@ -519,7 +538,7 @@ def: Pat<(i1 (setult I32:$Rs, u32_0ImmPred:$u9)),
// Patfrag to convert the usual comparison patfrags (e.g. setlt) to ones
// that reverse the order of the operands.
class RevCmp<PatFrag F>
- : PatFrag<(ops node:$rhs, node:$lhs), F.Fragment, F.PredicateCode,
+ : PatFrag<(ops node:$rhs, node:$lhs), !head(F.Fragments), F.PredicateCode,
F.OperandTransform>;
def: OpR_RR_pat<C2_cmpeq, seteq, i1, I32>;
@@ -563,7 +582,7 @@ def: OpR_RR_pat<A2_vcmpwgtu, RevCmp<setult>, v2i1, V2I32>;
def: OpR_RR_pat<A2_vcmpwgtu, setugt, i1, V2I32>;
def: OpR_RR_pat<A2_vcmpwgtu, setugt, v2i1, V2I32>;
-let Predicates = [HasV5T] in {
+let Predicates = [HasV5] in {
def: OpR_RR_pat<F2_sfcmpeq, seteq, i1, F32>;
def: OpR_RR_pat<F2_sfcmpgt, setgt, i1, F32>;
def: OpR_RR_pat<F2_sfcmpge, setge, i1, F32>;
@@ -598,27 +617,40 @@ def: Pat<(i1 (setle I32:$Rs, anyimm:$u5)),
def: Pat<(i1 (setule I32:$Rs, anyimm:$u5)),
(C2_not (C2_cmpgtui I32:$Rs, imm:$u5))>;
-def: Pat<(i1 (setne I32:$Rs, I32:$Rt)),
- (C2_not (C2_cmpeq I32:$Rs, I32:$Rt))>;
-def: Pat<(i1 (setle I32:$Rs, I32:$Rt)),
- (C2_not (C2_cmpgt I32:$Rs, I32:$Rt))>;
-def: Pat<(i1 (setule I32:$Rs, I32:$Rt)),
- (C2_not (C2_cmpgtu I32:$Rs, I32:$Rt))>;
-def: Pat<(i1 (setge I32:$Rs, I32:$Rt)),
- (C2_not (C2_cmpgt I32:$Rt, I32:$Rs))>;
-def: Pat<(i1 (setuge I32:$Rs, I32:$Rt)),
- (C2_not (C2_cmpgtu I32:$Rt, I32:$Rs))>;
-
-def: Pat<(i1 (setle I64:$Rs, I64:$Rt)),
- (C2_not (C2_cmpgtp I64:$Rs, I64:$Rt))>;
-def: Pat<(i1 (setne I64:$Rs, I64:$Rt)),
- (C2_not (C2_cmpeqp I64:$Rs, I64:$Rt))>;
-def: Pat<(i1 (setge I64:$Rs, I64:$Rt)),
- (C2_not (C2_cmpgtp I64:$Rt, I64:$Rs))>;
-def: Pat<(i1 (setuge I64:$Rs, I64:$Rt)),
- (C2_not (C2_cmpgtup I64:$Rt, I64:$Rs))>;
-def: Pat<(i1 (setule I64:$Rs, I64:$Rt)),
- (C2_not (C2_cmpgtup I64:$Rs, I64:$Rt))>;
+class OpmR_RR_pat<PatFrag Output, PatFrag Op, ValueType ResType,
+ PatFrag RsPred, PatFrag RtPred = RsPred>
+ : Pat<(ResType (Op RsPred:$Rs, RtPred:$Rt)),
+ (Output RsPred:$Rs, RtPred:$Rt)>;
+
+class Outn<InstHexagon MI>
+ : OutPatFrag<(ops node:$Rs, node:$Rt),
+ (C2_not (MI $Rs, $Rt))>;
+
+def: OpmR_RR_pat<Outn<C2_cmpeq>, setne, i1, I32>;
+def: OpmR_RR_pat<Outn<C2_cmpgt>, setle, i1, I32>;
+def: OpmR_RR_pat<Outn<C2_cmpgtu>, setule, i1, I32>;
+def: OpmR_RR_pat<Outn<C2_cmpgt>, RevCmp<setge>, i1, I32>;
+def: OpmR_RR_pat<Outn<C2_cmpgtu>, RevCmp<setuge>, i1, I32>;
+def: OpmR_RR_pat<Outn<C2_cmpeqp>, setne, i1, I64>;
+def: OpmR_RR_pat<Outn<C2_cmpgtp>, setle, i1, I64>;
+def: OpmR_RR_pat<Outn<C2_cmpgtup>, setule, i1, I64>;
+def: OpmR_RR_pat<Outn<C2_cmpgtp>, RevCmp<setge>, i1, I64>;
+def: OpmR_RR_pat<Outn<C2_cmpgtup>, RevCmp<setuge>, i1, I64>;
+def: OpmR_RR_pat<Outn<A2_vcmpbeq>, setne, v8i1, V8I8>;
+def: OpmR_RR_pat<Outn<A4_vcmpbgt>, setle, v8i1, V8I8>;
+def: OpmR_RR_pat<Outn<A2_vcmpbgtu>, setule, v8i1, V8I8>;
+def: OpmR_RR_pat<Outn<A4_vcmpbgt>, RevCmp<setge>, v8i1, V8I8>;
+def: OpmR_RR_pat<Outn<A2_vcmpbgtu>, RevCmp<setuge>, v8i1, V8I8>;
+def: OpmR_RR_pat<Outn<A2_vcmpheq>, setne, v4i1, V4I16>;
+def: OpmR_RR_pat<Outn<A2_vcmphgt>, setle, v4i1, V4I16>;
+def: OpmR_RR_pat<Outn<A2_vcmphgtu>, setule, v4i1, V4I16>;
+def: OpmR_RR_pat<Outn<A2_vcmphgt>, RevCmp<setge>, v4i1, V4I16>;
+def: OpmR_RR_pat<Outn<A2_vcmphgtu>, RevCmp<setuge>, v4i1, V4I16>;
+def: OpmR_RR_pat<Outn<A2_vcmpweq>, setne, v2i1, V2I32>;
+def: OpmR_RR_pat<Outn<A2_vcmpwgt>, setle, v2i1, V2I32>;
+def: OpmR_RR_pat<Outn<A2_vcmpwgtu>, setule, v2i1, V2I32>;
+def: OpmR_RR_pat<Outn<A2_vcmpwgt>, RevCmp<setge>, v2i1, V2I32>;
+def: OpmR_RR_pat<Outn<A2_vcmpwgtu>, RevCmp<setuge>, v2i1, V2I32>;
let AddedComplexity = 100 in {
def: Pat<(i1 (seteq (and (xor I32:$Rs, I32:$Rt), 255), 0)),
@@ -680,25 +712,10 @@ def: Pat<(i32 (zext (i1 (seteq I32:$Rs, anyimm:$s8)))),
def: Pat<(i32 (zext (i1 (setne I32:$Rs, anyimm:$s8)))),
(A4_rcmpneqi I32:$Rs, imm:$s8)>;
-def: Pat<(i1 (setne I1:$Ps, I1:$Pt)),
- (C2_xor I1:$Ps, I1:$Pt)>;
-
-def: Pat<(i1 (seteq V4I8:$Rs, V4I8:$Rt)),
- (A2_vcmpbeq (ToZext64 $Rs), (ToZext64 $Rt))>;
-def: Pat<(i1 (setgt V4I8:$Rs, V4I8:$Rt)),
- (A4_vcmpbgt (ToZext64 $Rs), (ToZext64 $Rt))>;
-def: Pat<(i1 (setugt V4I8:$Rs, V4I8:$Rt)),
- (A2_vcmpbgtu (ToZext64 $Rs), (ToZext64 $Rt))>;
-
-def: Pat<(i1 (seteq V2I16:$Rs, V2I16:$Rt)),
- (A2_vcmpheq (ToZext64 $Rs), (ToZext64 $Rt))>;
-def: Pat<(i1 (setgt V2I16:$Rs, V2I16:$Rt)),
- (A2_vcmphgt (ToZext64 $Rs), (ToZext64 $Rt))>;
-def: Pat<(i1 (setugt V2I16:$Rs, V2I16:$Rt)),
- (A2_vcmphgtu (ToZext64 $Rs), (ToZext64 $Rt))>;
-
-def: Pat<(v2i1 (setne V2I32:$Rs, V2I32:$Rt)),
- (C2_not (v2i1 (A2_vcmpbeq V2I32:$Rs, V2I32:$Rt)))>;
+def: Pat<(i1 (seteq I1:$Ps, (i1 -1))), (I1:$Ps)>;
+def: Pat<(i1 (setne I1:$Ps, (i1 -1))), (C2_not I1:$Ps)>;
+def: Pat<(i1 (seteq I1:$Ps, I1:$Pt)), (C2_xor I1:$Ps, (C2_not I1:$Pt))>;
+def: Pat<(i1 (setne I1:$Ps, I1:$Pt)), (C2_xor I1:$Ps, I1:$Pt)>;
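+// The seteq pattern above uses the boolean identity (p == q) <=> (p ^ ~q);
+// setne on predicates is simply xor.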
// Floating-point comparisons with checks for ordered/unordered status.
@@ -706,18 +723,13 @@ class T3<InstHexagon MI1, InstHexagon MI2, InstHexagon MI3>
: OutPatFrag<(ops node:$Rs, node:$Rt),
(MI1 (MI2 $Rs, $Rt), (MI3 $Rs, $Rt))>;
-class OpmR_RR_pat<PatFrag Output, PatFrag Op, ValueType ResType,
- PatFrag RsPred, PatFrag RtPred = RsPred>
- : Pat<(ResType (Op RsPred:$Rs, RtPred:$Rt)),
- (Output RsPred:$Rs, RtPred:$Rt)>;
-
class Cmpuf<InstHexagon MI>: T3<C2_or, F2_sfcmpuo, MI>;
class Cmpud<InstHexagon MI>: T3<C2_or, F2_dfcmpuo, MI>;
class Cmpufn<InstHexagon MI>: T3<C2_orn, F2_sfcmpuo, MI>;
class Cmpudn<InstHexagon MI>: T3<C2_orn, F2_dfcmpuo, MI>;
-let Predicates = [HasV5T] in {
+let Predicates = [HasV5] in {
def: OpmR_RR_pat<Cmpuf<F2_sfcmpeq>, setueq, i1, F32>;
def: OpmR_RR_pat<Cmpuf<F2_sfcmpge>, setuge, i1, F32>;
def: OpmR_RR_pat<Cmpuf<F2_sfcmpgt>, setugt, i1, F32>;
@@ -733,11 +745,7 @@ let Predicates = [HasV5T] in {
def: OpmR_RR_pat<Cmpudn<F2_dfcmpeq>, setune, i1, F64>;
}
-class Outn<InstHexagon MI>
- : OutPatFrag<(ops node:$Rs, node:$Rt),
- (C2_not (MI $Rs, $Rt))>;
-
-let Predicates = [HasV5T] in {
+let Predicates = [HasV5] in {
def: OpmR_RR_pat<Outn<F2_sfcmpeq>, setone, i1, F32>;
def: OpmR_RR_pat<Outn<F2_sfcmpeq>, setne, i1, F32>;
@@ -776,7 +784,7 @@ def: Pat<(select I1:$Pu, I64:$Rs, I64:$Rt),
(Combinew (C2_mux I1:$Pu, (HiReg $Rs), (HiReg $Rt)),
(C2_mux I1:$Pu, (LoReg $Rs), (LoReg $Rt)))>;
-let Predicates = [HasV5T] in {
+let Predicates = [HasV5] in {
def: Pat<(select I1:$Pu, F32:$Rs, f32ImmPred:$I),
(C2_muxir I1:$Pu, F32:$Rs, (ftoi $I))>;
def: Pat<(select I1:$Pu, f32ImmPred:$I, F32:$Rt),
@@ -813,20 +821,6 @@ def: Pat<(vselect V4I1:$Pu, V4I16:$Rs, V4I16:$Rt),
def: Pat<(vselect V2I1:$Pu, V2I32:$Rs, V2I32:$Rt),
(C2_vmux V2I1:$Pu, V2I32:$Rs, V2I32:$Rt)>;
-
-class HvxSel_pat<InstHexagon MI, PatFrag RegPred>
- : Pat<(select I1:$Pu, RegPred:$Vs, RegPred:$Vt),
- (MI I1:$Pu, RegPred:$Vs, RegPred:$Vt)>;
-
-let Predicates = [HasV60T,UseHVX] in {
- def: HvxSel_pat<PS_vselect, HVI8>;
- def: HvxSel_pat<PS_vselect, HVI16>;
- def: HvxSel_pat<PS_vselect, HVI32>;
- def: HvxSel_pat<PS_wselect, HWI8>;
- def: HvxSel_pat<PS_wselect, HWI16>;
- def: HvxSel_pat<PS_wselect, HWI32>;
-}
-
// From LegalizeDAG.cpp: (Pu ? Pv : Pw) <=> (Pu & Pv) | (!Pu & Pw).
def: Pat<(select I1:$Pu, I1:$Pv, I1:$Pw),
(C2_or (C2_and I1:$Pu, I1:$Pv),
@@ -878,7 +872,7 @@ let AddedComplexity = 200 in {
defm: SelMinMax_pats<setult, I64, A2_minup, A2_maxup>;
}
-let AddedComplexity = 100, Predicates = [HasV5T] in {
+let AddedComplexity = 100, Predicates = [HasV5] in {
defm: SelMinMax_pats<setolt, F32, F2_sfmin, F2_sfmax>;
defm: SelMinMax_pats<setole, F32, F2_sfmin, F2_sfmax>;
defm: SelMinMax_pats<setogt, F32, F2_sfmax, F2_sfmin>;
@@ -892,40 +886,34 @@ let AddedComplexity = 100, Predicates = [HasV5T] in {
def SDTHexagonINSERT:
SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
SDTCisInt<0>, SDTCisVT<3, i32>, SDTCisVT<4, i32>]>;
-def SDTHexagonINSERTRP:
- SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
- SDTCisInt<0>, SDTCisVT<3, i64>]>;
-
def HexagonINSERT: SDNode<"HexagonISD::INSERT", SDTHexagonINSERT>;
-def HexagonINSERTRP: SDNode<"HexagonISD::INSERTRP", SDTHexagonINSERTRP>;
-def: Pat<(HexagonINSERT I32:$Rs, I32:$Rt, u5_0ImmPred:$u1, u5_0ImmPred:$u2),
- (S2_insert I32:$Rs, I32:$Rt, imm:$u1, imm:$u2)>;
-def: Pat<(HexagonINSERT I64:$Rs, I64:$Rt, u6_0ImmPred:$u1, u6_0ImmPred:$u2),
- (S2_insertp I64:$Rs, I64:$Rt, imm:$u1, imm:$u2)>;
-def: Pat<(HexagonINSERTRP I32:$Rs, I32:$Rt, I64:$Ru),
- (S2_insert_rp I32:$Rs, I32:$Rt, I64:$Ru)>;
-def: Pat<(HexagonINSERTRP I64:$Rs, I64:$Rt, I64:$Ru),
- (S2_insertp_rp I64:$Rs, I64:$Rt, I64:$Ru)>;
+let AddedComplexity = 10 in {
+ def: Pat<(HexagonINSERT I32:$Rs, I32:$Rt, u5_0ImmPred:$u1, u5_0ImmPred:$u2),
+ (S2_insert I32:$Rs, I32:$Rt, imm:$u1, imm:$u2)>;
+ def: Pat<(HexagonINSERT I64:$Rs, I64:$Rt, u6_0ImmPred:$u1, u6_0ImmPred:$u2),
+ (S2_insertp I64:$Rs, I64:$Rt, imm:$u1, imm:$u2)>;
+}
+def: Pat<(HexagonINSERT I32:$Rs, I32:$Rt, I32:$Width, I32:$Off),
+ (S2_insert_rp I32:$Rs, I32:$Rt, (Combinew $Width, $Off))>;
+def: Pat<(HexagonINSERT I64:$Rs, I64:$Rt, I32:$Width, I32:$Off),
+ (S2_insertp_rp I64:$Rs, I64:$Rt, (Combinew $Width, $Off))>;
def SDTHexagonEXTRACTU
: SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<1>,
SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
-def SDTHexagonEXTRACTURP
- : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<1>,
- SDTCisVT<2, i64>]>;
-
def HexagonEXTRACTU: SDNode<"HexagonISD::EXTRACTU", SDTHexagonEXTRACTU>;
-def HexagonEXTRACTURP: SDNode<"HexagonISD::EXTRACTURP", SDTHexagonEXTRACTURP>;
-def: Pat<(HexagonEXTRACTU I32:$Rs, u5_0ImmPred:$u5, u5_0ImmPred:$U5),
- (S2_extractu I32:$Rs, imm:$u5, imm:$U5)>;
-def: Pat<(HexagonEXTRACTU I64:$Rs, u6_0ImmPred:$u6, u6_0ImmPred:$U6),
- (S2_extractup I64:$Rs, imm:$u6, imm:$U6)>;
-def: Pat<(HexagonEXTRACTURP I32:$Rs, I64:$Rt),
- (S2_extractu_rp I32:$Rs, I64:$Rt)>;
-def: Pat<(HexagonEXTRACTURP I64:$Rs, I64:$Rt),
- (S2_extractup_rp I64:$Rs, I64:$Rt)>;
+let AddedComplexity = 10 in {
+ def: Pat<(HexagonEXTRACTU I32:$Rs, u5_0ImmPred:$u5, u5_0ImmPred:$U5),
+ (S2_extractu I32:$Rs, imm:$u5, imm:$U5)>;
+ def: Pat<(HexagonEXTRACTU I64:$Rs, u6_0ImmPred:$u6, u6_0ImmPred:$U6),
+ (S2_extractup I64:$Rs, imm:$u6, imm:$U6)>;
+}
+def: Pat<(HexagonEXTRACTU I32:$Rs, I32:$Width, I32:$Off),
+ (S2_extractu_rp I32:$Rs, (Combinew $Width, $Off))>;
+def: Pat<(HexagonEXTRACTU I64:$Rs, I32:$Width, I32:$Off),
+ (S2_extractup_rp I64:$Rs, (Combinew $Width, $Off))>;
def SDTHexagonVSPLAT:
SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>;
@@ -938,20 +926,20 @@ def: Pat<(v2i32 (HexagonVSPLAT s8_0ImmPred:$s8)),
(A2_combineii imm:$s8, imm:$s8)>;
def: Pat<(v2i32 (HexagonVSPLAT I32:$Rs)), (Combinew I32:$Rs, I32:$Rs)>;
+let AddedComplexity = 10 in
+def: Pat<(v8i8 (HexagonVSPLAT I32:$Rs)), (S6_vsplatrbp I32:$Rs)>,
+ Requires<[HasV62]>;
+def: Pat<(v8i8 (HexagonVSPLAT I32:$Rs)),
+ (Combinew (S2_vsplatrb I32:$Rs), (S2_vsplatrb I32:$Rs))>;
+
// --(8) Shift/permute ---------------------------------------------------
//
def SDTHexagonI64I32I32: SDTypeProfile<1, 2,
[SDTCisVT<0, i64>, SDTCisVT<1, i32>, SDTCisSameAs<1, 2>]>;
-def SDTHexagonVCOMBINE: SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>,
- SDTCisSubVecOfVec<1, 0>]>;
-def SDTHexagonVPACK: SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>, SDTCisVec<1>]>;
def HexagonCOMBINE: SDNode<"HexagonISD::COMBINE", SDTHexagonI64I32I32>;
-def HexagonVCOMBINE: SDNode<"HexagonISD::VCOMBINE", SDTHexagonVCOMBINE>;
-def HexagonVPACKE: SDNode<"HexagonISD::VPACKE", SDTHexagonVPACK>;
-def HexagonVPACKO: SDNode<"HexagonISD::VPACKO", SDTHexagonVPACK>;
def: Pat<(HexagonCOMBINE I32:$Rs, I32:$Rt), (Combinew $Rs, $Rt)>;
@@ -1001,11 +989,15 @@ def: OpR_RR_pat<S2_asr_r_p, Sra, i64, I64, I32>;
def: OpR_RR_pat<S2_lsr_r_p, Srl, i64, I64, I32>;
def: OpR_RR_pat<S2_asl_r_p, Shl, i64, I64, I32>;
+let Predicates = [HasV60] in {
+ def: OpR_RI_pat<S6_rol_i_r, Rol, i32, I32, u5_0ImmPred>;
+ def: OpR_RI_pat<S6_rol_i_p, Rol, i64, I64, u6_0ImmPred>;
+}
def: Pat<(sra (add (sra I32:$Rs, u5_0ImmPred:$u5), 1), (i32 1)),
(S2_asr_i_r_rnd I32:$Rs, imm:$u5)>;
def: Pat<(sra (add (sra I64:$Rs, u6_0ImmPred:$u6), 1), (i32 1)),
- (S2_asr_i_p_rnd I64:$Rs, imm:$u6)>, Requires<[HasV5T]>;
+ (S2_asr_i_p_rnd I64:$Rs, imm:$u6)>, Requires<[HasV5]>;
// Prefer S2_addasl_rrri over S2_asl_i_r_acc.
let AddedComplexity = 120 in
@@ -1046,41 +1038,55 @@ let AddedComplexity = 100 in {
def: AccRRI_pat<S2_asl_i_p_and, And, Su<Shl>, I64, u6_0ImmPred>;
def: AccRRI_pat<S2_asl_i_p_or, Or, Su<Shl>, I64, u6_0ImmPred>;
def: AccRRI_pat<S2_asl_i_p_xacc, Xor, Su<Shl>, I64, u6_0ImmPred>;
+
+ let Predicates = [HasV60] in {
+ def: AccRRI_pat<S6_rol_i_r_acc, Add, Su<Rol>, I32, u5_0ImmPred>;
+ def: AccRRI_pat<S6_rol_i_r_nac, Sub, Su<Rol>, I32, u5_0ImmPred>;
+ def: AccRRI_pat<S6_rol_i_r_and, And, Su<Rol>, I32, u5_0ImmPred>;
+ def: AccRRI_pat<S6_rol_i_r_or, Or, Su<Rol>, I32, u5_0ImmPred>;
+ def: AccRRI_pat<S6_rol_i_r_xacc, Xor, Su<Rol>, I32, u5_0ImmPred>;
+
+ def: AccRRI_pat<S6_rol_i_p_acc, Add, Su<Rol>, I64, u6_0ImmPred>;
+ def: AccRRI_pat<S6_rol_i_p_nac, Sub, Su<Rol>, I64, u6_0ImmPred>;
+ def: AccRRI_pat<S6_rol_i_p_and, And, Su<Rol>, I64, u6_0ImmPred>;
+ def: AccRRI_pat<S6_rol_i_p_or, Or, Su<Rol>, I64, u6_0ImmPred>;
+ def: AccRRI_pat<S6_rol_i_p_xacc, Xor, Su<Rol>, I64, u6_0ImmPred>;
+ }
}
let AddedComplexity = 100 in {
- def: AccRRR_pat<S2_asr_r_r_acc, Add, Su<Sra>, I32, I32>;
- def: AccRRR_pat<S2_asr_r_r_nac, Sub, Su<Sra>, I32, I32>;
- def: AccRRR_pat<S2_asr_r_r_and, And, Su<Sra>, I32, I32>;
- def: AccRRR_pat<S2_asr_r_r_or, Or, Su<Sra>, I32, I32>;
+ def: AccRRR_pat<S2_asr_r_r_acc, Add, Su<Sra>, I32, I32, I32>;
+ def: AccRRR_pat<S2_asr_r_r_nac, Sub, Su<Sra>, I32, I32, I32>;
+ def: AccRRR_pat<S2_asr_r_r_and, And, Su<Sra>, I32, I32, I32>;
+ def: AccRRR_pat<S2_asr_r_r_or, Or, Su<Sra>, I32, I32, I32>;
- def: AccRRR_pat<S2_asr_r_p_acc, Add, Su<Sra>, I64, I32>;
- def: AccRRR_pat<S2_asr_r_p_nac, Sub, Su<Sra>, I64, I32>;
- def: AccRRR_pat<S2_asr_r_p_and, And, Su<Sra>, I64, I32>;
- def: AccRRR_pat<S2_asr_r_p_or, Or, Su<Sra>, I64, I32>;
- def: AccRRR_pat<S2_asr_r_p_xor, Xor, Su<Sra>, I64, I32>;
+ def: AccRRR_pat<S2_asr_r_p_acc, Add, Su<Sra>, I64, I64, I32>;
+ def: AccRRR_pat<S2_asr_r_p_nac, Sub, Su<Sra>, I64, I64, I32>;
+ def: AccRRR_pat<S2_asr_r_p_and, And, Su<Sra>, I64, I64, I32>;
+ def: AccRRR_pat<S2_asr_r_p_or, Or, Su<Sra>, I64, I64, I32>;
+ def: AccRRR_pat<S2_asr_r_p_xor, Xor, Su<Sra>, I64, I64, I32>;
- def: AccRRR_pat<S2_lsr_r_r_acc, Add, Su<Srl>, I32, I32>;
- def: AccRRR_pat<S2_lsr_r_r_nac, Sub, Su<Srl>, I32, I32>;
- def: AccRRR_pat<S2_lsr_r_r_and, And, Su<Srl>, I32, I32>;
- def: AccRRR_pat<S2_lsr_r_r_or, Or, Su<Srl>, I32, I32>;
+ def: AccRRR_pat<S2_lsr_r_r_acc, Add, Su<Srl>, I32, I32, I32>;
+ def: AccRRR_pat<S2_lsr_r_r_nac, Sub, Su<Srl>, I32, I32, I32>;
+ def: AccRRR_pat<S2_lsr_r_r_and, And, Su<Srl>, I32, I32, I32>;
+ def: AccRRR_pat<S2_lsr_r_r_or, Or, Su<Srl>, I32, I32, I32>;
- def: AccRRR_pat<S2_lsr_r_p_acc, Add, Su<Srl>, I64, I32>;
- def: AccRRR_pat<S2_lsr_r_p_nac, Sub, Su<Srl>, I64, I32>;
- def: AccRRR_pat<S2_lsr_r_p_and, And, Su<Srl>, I64, I32>;
- def: AccRRR_pat<S2_lsr_r_p_or, Or, Su<Srl>, I64, I32>;
- def: AccRRR_pat<S2_lsr_r_p_xor, Xor, Su<Srl>, I64, I32>;
+ def: AccRRR_pat<S2_lsr_r_p_acc, Add, Su<Srl>, I64, I64, I32>;
+ def: AccRRR_pat<S2_lsr_r_p_nac, Sub, Su<Srl>, I64, I64, I32>;
+ def: AccRRR_pat<S2_lsr_r_p_and, And, Su<Srl>, I64, I64, I32>;
+ def: AccRRR_pat<S2_lsr_r_p_or, Or, Su<Srl>, I64, I64, I32>;
+ def: AccRRR_pat<S2_lsr_r_p_xor, Xor, Su<Srl>, I64, I64, I32>;
- def: AccRRR_pat<S2_asl_r_r_acc, Add, Su<Shl>, I32, I32>;
- def: AccRRR_pat<S2_asl_r_r_nac, Sub, Su<Shl>, I32, I32>;
- def: AccRRR_pat<S2_asl_r_r_and, And, Su<Shl>, I32, I32>;
- def: AccRRR_pat<S2_asl_r_r_or, Or, Su<Shl>, I32, I32>;
+ def: AccRRR_pat<S2_asl_r_r_acc, Add, Su<Shl>, I32, I32, I32>;
+ def: AccRRR_pat<S2_asl_r_r_nac, Sub, Su<Shl>, I32, I32, I32>;
+ def: AccRRR_pat<S2_asl_r_r_and, And, Su<Shl>, I32, I32, I32>;
+ def: AccRRR_pat<S2_asl_r_r_or, Or, Su<Shl>, I32, I32, I32>;
- def: AccRRR_pat<S2_asl_r_p_acc, Add, Su<Shl>, I64, I32>;
- def: AccRRR_pat<S2_asl_r_p_nac, Sub, Su<Shl>, I64, I32>;
- def: AccRRR_pat<S2_asl_r_p_and, And, Su<Shl>, I64, I32>;
- def: AccRRR_pat<S2_asl_r_p_or, Or, Su<Shl>, I64, I32>;
- def: AccRRR_pat<S2_asl_r_p_xor, Xor, Su<Shl>, I64, I32>;
+ def: AccRRR_pat<S2_asl_r_p_acc, Add, Su<Shl>, I64, I64, I32>;
+ def: AccRRR_pat<S2_asl_r_p_nac, Sub, Su<Shl>, I64, I64, I32>;
+ def: AccRRR_pat<S2_asl_r_p_and, And, Su<Shl>, I64, I64, I32>;
+ def: AccRRR_pat<S2_asl_r_p_or, Or, Su<Shl>, I64, I64, I32>;
+ def: AccRRR_pat<S2_asl_r_p_xor, Xor, Su<Shl>, I64, I64, I32>;
}
@@ -1170,11 +1176,13 @@ def: Pat<(shl V4I16:$b, (v4i16 (HexagonVSPLAT u4_0ImmPred:$c))),
// --(9) Arithmetic/bitwise ----------------------------------------------
//
-def: Pat<(abs I32:$Rs), (A2_abs I32:$Rs)>;
-def: Pat<(not I32:$Rs), (A2_subri -1, I32:$Rs)>;
-def: Pat<(not I64:$Rs), (A2_notp I64:$Rs)>;
+def: Pat<(abs I32:$Rs), (A2_abs I32:$Rs)>;
+def: Pat<(abs I64:$Rs), (A2_absp I64:$Rs)>;
+def: Pat<(not I32:$Rs), (A2_subri -1, I32:$Rs)>;
+def: Pat<(not I64:$Rs), (A2_notp I64:$Rs)>;
+def: Pat<(ineg I64:$Rs), (A2_negp I64:$Rs)>;
-let Predicates = [HasV5T] in {
+let Predicates = [HasV5] in {
def: Pat<(fabs F32:$Rs), (S2_clrbit_i F32:$Rs, 31)>;
def: Pat<(fneg F32:$Rs), (S2_togglebit_i F32:$Rs, 31)>;
@@ -1186,13 +1194,6 @@ let Predicates = [HasV5T] in {
(i32 (LoReg $Rs)))>;
}
-let AddedComplexity = 50 in
-def: Pat<(xor (add (sra I32:$Rs, (i32 31)),
- I32:$Rs),
- (sra I32:$Rs, (i32 31))),
- (A2_abs I32:$Rs)>;
-
-
def: Pat<(add I32:$Rs, anyimm:$s16), (A2_addi I32:$Rs, imm:$s16)>;
def: Pat<(or I32:$Rs, anyimm:$s10), (A2_orir I32:$Rs, imm:$s10)>;
def: Pat<(and I32:$Rs, anyimm:$s10), (A2_andir I32:$Rs, imm:$s10)>;
@@ -1221,18 +1222,20 @@ def: OpR_RR_pat<A2_vsubub, Sub, v8i8, V8I8>;
def: OpR_RR_pat<A2_vsubh, Sub, v4i16, V4I16>;
def: OpR_RR_pat<A2_vsubw, Sub, v2i32, V2I32>;
+def: OpR_RR_pat<A2_and, And, v4i8, V4I8>;
+def: OpR_RR_pat<A2_xor, Xor, v4i8, V4I8>;
+def: OpR_RR_pat<A2_or, Or, v4i8, V4I8>;
def: OpR_RR_pat<A2_and, And, v2i16, V2I16>;
def: OpR_RR_pat<A2_xor, Xor, v2i16, V2I16>;
def: OpR_RR_pat<A2_or, Or, v2i16, V2I16>;
-
def: OpR_RR_pat<A2_andp, And, v8i8, V8I8>;
-def: OpR_RR_pat<A2_andp, And, v4i16, V4I16>;
-def: OpR_RR_pat<A2_andp, And, v2i32, V2I32>;
def: OpR_RR_pat<A2_orp, Or, v8i8, V8I8>;
-def: OpR_RR_pat<A2_orp, Or, v4i16, V4I16>;
-def: OpR_RR_pat<A2_orp, Or, v2i32, V2I32>;
def: OpR_RR_pat<A2_xorp, Xor, v8i8, V8I8>;
+def: OpR_RR_pat<A2_andp, And, v4i16, V4I16>;
+def: OpR_RR_pat<A2_orp, Or, v4i16, V4I16>;
def: OpR_RR_pat<A2_xorp, Xor, v4i16, V4I16>;
+def: OpR_RR_pat<A2_andp, And, v2i32, V2I32>;
+def: OpR_RR_pat<A2_orp, Or, v2i32, V2I32>;
def: OpR_RR_pat<A2_xorp, Xor, v2i32, V2I32>;
def: OpR_RR_pat<M2_mpyi, Mul, i32, I32>;
@@ -1255,7 +1258,7 @@ def: OpR_RR_pat<C2_and, Mul, v2i1, V2I1>;
def: OpR_RR_pat<C2_and, Mul, v4i1, V4I1>;
def: OpR_RR_pat<C2_and, Mul, v8i1, V8I1>;
-let Predicates = [HasV5T] in {
+let Predicates = [HasV5] in {
def: OpR_RR_pat<F2_sfadd, pf2<fadd>, f32, F32>;
def: OpR_RR_pat<F2_sfsub, pf2<fsub>, f32, F32>;
def: OpR_RR_pat<F2_sfmpy, pf2<fmul>, f32, F32>;
@@ -1268,12 +1271,62 @@ let Predicates = [HasV5T] in {
let AddedComplexity = 10 in {
def: AccRRI_pat<M2_macsip, Add, Su<Mul>, I32, u32_0ImmPred>;
def: AccRRI_pat<M2_macsin, Sub, Su<Mul>, I32, u32_0ImmPred>;
- def: AccRRR_pat<M2_maci, Add, Su<Mul>, I32, I32>;
+ def: AccRRR_pat<M2_maci, Add, Su<Mul>, I32, I32, I32>;
}
def: AccRRI_pat<M2_naccii, Sub, Su<Add>, I32, s32_0ImmPred>;
def: AccRRI_pat<M2_accii, Add, Su<Add>, I32, s32_0ImmPred>;
-def: AccRRR_pat<M2_acci, Add, Su<Add>, I32, I32>;
+def: AccRRR_pat<M2_acci, Add, Su<Add>, I32, I32, I32>;
+
+// Mulh for vectors
+//
+def: Pat<(v2i32 (mulhu V2I32:$Rss, V2I32:$Rtt)),
+ (Combinew (M2_mpyu_up (HiReg $Rss), (HiReg $Rtt)),
+ (M2_mpyu_up (LoReg $Rss), (LoReg $Rtt)))>;
+
+def: Pat<(v2i32 (mulhs V2I32:$Rs, V2I32:$Rt)),
+ (Combinew (M2_mpy_up (HiReg $Rs), (HiReg $Rt)),
+                   (M2_mpy_up (LoReg $Rs), (LoReg $Rt)))>;
+
+def Mulhub:
+ OutPatFrag<(ops node:$Rss, node:$Rtt),
+ (Combinew (S2_vtrunohb (M5_vmpybuu (HiReg $Rss), (HiReg $Rtt))),
+ (S2_vtrunohb (M5_vmpybuu (LoReg $Rss), (LoReg $Rtt))))>;
+
+// Equivalent of byte-wise arithmetic shift right by 7 in v8i8.
+def Asr7:
+ OutPatFrag<(ops node:$Rss), (C2_mask (C2_not (A4_vcmpbgti $Rss, 0)))>;
+
+def: Pat<(v8i8 (mulhu V8I8:$Rss, V8I8:$Rtt)),
+ (Mulhub $Rss, $Rtt)>;
+
+def: Pat<(v8i8 (mulhs V8I8:$Rss, V8I8:$Rtt)),
+ (A2_vsubub
+ (Mulhub $Rss, $Rtt),
+ (A2_vaddub (A2_andp V8I8:$Rss, (Asr7 $Rtt)),
+ (A2_andp V8I8:$Rtt, (Asr7 $Rss))))>;
+
+def Mpysh:
+ OutPatFrag<(ops node:$Rs, node:$Rt), (M2_vmpy2s_s0 $Rs, $Rt)>;
+def Mpyshh:
+ OutPatFrag<(ops node:$Rss, node:$Rtt), (Mpysh (HiReg $Rss), (HiReg $Rtt))>;
+def Mpyshl:
+ OutPatFrag<(ops node:$Rss, node:$Rtt), (Mpysh (LoReg $Rss), (LoReg $Rtt))>;
+
+def Mulhsh:
+ OutPatFrag<(ops node:$Rss, node:$Rtt),
+ (Combinew (A2_combine_hh (HiReg (Mpyshh $Rss, $Rtt)),
+ (LoReg (Mpyshh $Rss, $Rtt))),
+ (A2_combine_hh (HiReg (Mpyshl $Rss, $Rtt)),
+ (LoReg (Mpyshl $Rss, $Rtt))))>;
+
+def: Pat<(v4i16 (mulhs V4I16:$Rss, V4I16:$Rtt)), (Mulhsh $Rss, $Rtt)>;
+
+def: Pat<(v4i16 (mulhu V4I16:$Rss, V4I16:$Rtt)),
+ (A2_vaddh
+ (Mulhsh $Rss, $Rtt),
+ (A2_vaddh (A2_andp V4I16:$Rss, (S2_asr_i_vh $Rtt, 15)),
+ (A2_andp V4I16:$Rtt, (S2_asr_i_vh $Rss, 15))))>;
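+// The mulhu/mulhs corrections above use the usual identity for N-bit lanes:
+// mulhu(a,b) = mulhs(a,b) + (a & M(b)) + (b & M(a))  (mod 2^N), where M(x)
+// is all-ones when x is negative and zero otherwise (and the reverse
+// direction subtracts the same terms).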
def: Pat<(ineg (mul I32:$Rs, u8_0ImmPred:$u8)),
@@ -1291,24 +1344,24 @@ def: Pat<(mul I32:$Rs, n8_0ImmPred:$n8),
def: Pat<(add Sext64:$Rs, I64:$Rt),
(A2_addsp (LoReg Sext64:$Rs), I64:$Rt)>;
-def: AccRRR_pat<M4_and_and, And, Su<And>, I32, I32>;
-def: AccRRR_pat<M4_and_or, And, Su<Or>, I32, I32>;
-def: AccRRR_pat<M4_and_xor, And, Su<Xor>, I32, I32>;
-def: AccRRR_pat<M4_or_and, Or, Su<And>, I32, I32>;
-def: AccRRR_pat<M4_or_or, Or, Su<Or>, I32, I32>;
-def: AccRRR_pat<M4_or_xor, Or, Su<Xor>, I32, I32>;
-def: AccRRR_pat<M4_xor_and, Xor, Su<And>, I32, I32>;
-def: AccRRR_pat<M4_xor_or, Xor, Su<Or>, I32, I32>;
-def: AccRRR_pat<M2_xor_xacc, Xor, Su<Xor>, I32, I32>;
-def: AccRRR_pat<M4_xor_xacc, Xor, Su<Xor>, I64, I64>;
+def: AccRRR_pat<M4_and_and, And, Su<And>, I32, I32, I32>;
+def: AccRRR_pat<M4_and_or, And, Su<Or>, I32, I32, I32>;
+def: AccRRR_pat<M4_and_xor, And, Su<Xor>, I32, I32, I32>;
+def: AccRRR_pat<M4_or_and, Or, Su<And>, I32, I32, I32>;
+def: AccRRR_pat<M4_or_or, Or, Su<Or>, I32, I32, I32>;
+def: AccRRR_pat<M4_or_xor, Or, Su<Xor>, I32, I32, I32>;
+def: AccRRR_pat<M4_xor_and, Xor, Su<And>, I32, I32, I32>;
+def: AccRRR_pat<M4_xor_or, Xor, Su<Or>, I32, I32, I32>;
+def: AccRRR_pat<M2_xor_xacc, Xor, Su<Xor>, I32, I32, I32>;
+def: AccRRR_pat<M4_xor_xacc, Xor, Su<Xor>, I64, I64, I64>;
// For dags like (or (and (not _), _), (shl _, _)) where the "or" with
// one argument matches the patterns below, and with the other argument
// matches S2_asl_r_r_or, etc, prefer the patterns below.
let AddedComplexity = 110 in { // greater than S2_asl_r_r_and/or/xor.
- def: AccRRR_pat<M4_and_andn, And, Su<Not2<And>>, I32, I32>;
- def: AccRRR_pat<M4_or_andn, Or, Su<Not2<And>>, I32, I32>;
- def: AccRRR_pat<M4_xor_andn, Xor, Su<Not2<And>>, I32, I32>;
+ def: AccRRR_pat<M4_and_andn, And, Su<Not2<And>>, I32, I32, I32>;
+ def: AccRRR_pat<M4_or_andn, Or, Su<Not2<And>>, I32, I32, I32>;
+ def: AccRRR_pat<M4_xor_andn, Xor, Su<Not2<And>>, I32, I32, I32>;
}
// S4_addaddi and S4_subaddi don't have tied operands, so give them
@@ -1444,7 +1497,7 @@ def: Pat<(add I32:$Ru, (Su<Mul> I32:$Ry, I32:$Rs)),
(M4_mpyrr_addr IntRegs:$Ru, IntRegs:$Ry, IntRegs:$Rs)>;
-let Predicates = [HasV5T] in {
+let Predicates = [HasV5] in {
def: Pat<(fma F32:$Rs, F32:$Rt, F32:$Rx),
(F2_sffma F32:$Rx, F32:$Rs, F32:$Rt)>;
def: Pat<(fma (fneg F32:$Rs), F32:$Rt, F32:$Rx),
@@ -1479,13 +1532,13 @@ def: Pat<(v4i16 (mul V4I16:$Rs, V4I16:$Rt)),
// Multiplies two v4i8 vectors.
def: Pat<(v4i8 (mul V4I8:$Rs, V4I8:$Rt)),
(S2_vtrunehb (M5_vmpybuu V4I8:$Rs, V4I8:$Rt))>,
- Requires<[HasV5T]>;
+ Requires<[HasV5]>;
// Multiplies two v8i8 vectors.
def: Pat<(v8i8 (mul V8I8:$Rs, V8I8:$Rt)),
(Combinew (S2_vtrunehb (M5_vmpybuu (HiReg $Rs), (HiReg $Rt))),
(S2_vtrunehb (M5_vmpybuu (LoReg $Rs), (LoReg $Rt))))>,
- Requires<[HasV5T]>;
+ Requires<[HasV5]>;
// --(10) Bit ------------------------------------------------------------
@@ -1519,7 +1572,6 @@ def: Pat<(i32 (ctpop I32:$Rs)), (S5_popcountp (A4_combineir 0, I32:$Rs))>;
def: Pat<(bitreverse I32:$Rs), (S2_brev I32:$Rs)>;
def: Pat<(bitreverse I64:$Rss), (S2_brevp I64:$Rss)>;
-
let AddedComplexity = 20 in { // Complexity greater than and/or/xor
def: Pat<(and I32:$Rs, IsNPow2_32:$V),
(S2_clrbit_i IntRegs:$Rs, (LogN2_32 $V))>;
@@ -1582,6 +1634,15 @@ let AddedComplexity = 10 in // Complexity greater than compare reg-reg.
def: Pat<(i1 (seteq (and I32:$Rs, I32:$Rt), IntRegs:$Rt)),
(C2_bitsset IntRegs:$Rs, IntRegs:$Rt)>;
+def SDTTestBit:
+ SDTypeProfile<1, 2, [SDTCisVT<0, i1>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>;
+def HexagonTSTBIT: SDNode<"HexagonISD::TSTBIT", SDTTestBit>;
+
+def: Pat<(HexagonTSTBIT I32:$Rs, u5_0ImmPred:$u5),
+ (S2_tstbit_i I32:$Rs, imm:$u5)>;
+def: Pat<(HexagonTSTBIT I32:$Rs, I32:$Rt),
+ (S2_tstbit_r I32:$Rs, I32:$Rt)>;
+
let AddedComplexity = 20 in { // Complexity greater than cmp reg-imm.
def: Pat<(i1 (seteq (and (shl 1, u5_0ImmPred:$u5), I32:$Rs), 0)),
(S4_ntstbit_i I32:$Rs, imm:$u5)>;
@@ -1790,7 +1851,12 @@ let AddedComplexity = 20 in {
defm: Loadxi_pat<zextloadv2i8, v2i16, anyimm1, L2_loadbzw2_io>;
defm: Loadxi_pat<zextloadv4i8, v4i16, anyimm2, L2_loadbzw4_io>;
defm: Loadxi_pat<load, i32, anyimm2, L2_loadri_io>;
+ defm: Loadxi_pat<load, v2i16, anyimm2, L2_loadri_io>;
+ defm: Loadxi_pat<load, v4i8, anyimm2, L2_loadri_io>;
defm: Loadxi_pat<load, i64, anyimm3, L2_loadrd_io>;
+ defm: Loadxi_pat<load, v2i32, anyimm3, L2_loadrd_io>;
+ defm: Loadxi_pat<load, v4i16, anyimm3, L2_loadrd_io>;
+ defm: Loadxi_pat<load, v8i8, anyimm3, L2_loadrd_io>;
defm: Loadxi_pat<load, f32, anyimm2, L2_loadri_io>;
defm: Loadxi_pat<load, f64, anyimm3, L2_loadrd_io>;
// No sextloadi1.
@@ -1828,10 +1894,15 @@ let AddedComplexity = 60 in {
def: Loadxu_pat<zextloadi16, i32, anyimm1, L4_loadruh_ur>;
def: Loadxu_pat<zextloadv2i8, v2i16, anyimm1, L4_loadbzw2_ur>;
def: Loadxu_pat<zextloadv4i8, v4i16, anyimm2, L4_loadbzw4_ur>;
- def: Loadxu_pat<load, f32, anyimm2, L4_loadri_ur>;
- def: Loadxu_pat<load, f64, anyimm3, L4_loadrd_ur>;
def: Loadxu_pat<load, i32, anyimm2, L4_loadri_ur>;
+ def: Loadxu_pat<load, v2i16, anyimm2, L4_loadri_ur>;
+ def: Loadxu_pat<load, v4i8, anyimm2, L4_loadri_ur>;
def: Loadxu_pat<load, i64, anyimm3, L4_loadrd_ur>;
+ def: Loadxu_pat<load, v2i32, anyimm3, L4_loadrd_ur>;
+ def: Loadxu_pat<load, v4i16, anyimm3, L4_loadrd_ur>;
+ def: Loadxu_pat<load, v8i8, anyimm3, L4_loadrd_ur>;
+ def: Loadxu_pat<load, f32, anyimm2, L4_loadri_ur>;
+ def: Loadxu_pat<load, f64, anyimm3, L4_loadrd_ur>;
def: Loadxum_pat<sextloadi8, i64, anyimm0, ToSext64, L4_loadrb_ur>;
def: Loadxum_pat<zextloadi8, i64, anyimm0, ToZext64, L4_loadrub_ur>;
@@ -1845,29 +1916,39 @@ let AddedComplexity = 60 in {
}
let AddedComplexity = 40 in {
- def: Loadxr_shl_pat<extloadi8, i32, L4_loadrub_rr>;
- def: Loadxr_shl_pat<zextloadi8, i32, L4_loadrub_rr>;
- def: Loadxr_shl_pat<sextloadi8, i32, L4_loadrb_rr>;
- def: Loadxr_shl_pat<extloadi16, i32, L4_loadruh_rr>;
- def: Loadxr_shl_pat<zextloadi16, i32, L4_loadruh_rr>;
- def: Loadxr_shl_pat<sextloadi16, i32, L4_loadrh_rr>;
- def: Loadxr_shl_pat<load, i32, L4_loadri_rr>;
- def: Loadxr_shl_pat<load, i64, L4_loadrd_rr>;
- def: Loadxr_shl_pat<load, f32, L4_loadri_rr>;
- def: Loadxr_shl_pat<load, f64, L4_loadrd_rr>;
+ def: Loadxr_shl_pat<extloadi8, i32, L4_loadrub_rr>;
+ def: Loadxr_shl_pat<zextloadi8, i32, L4_loadrub_rr>;
+ def: Loadxr_shl_pat<sextloadi8, i32, L4_loadrb_rr>;
+ def: Loadxr_shl_pat<extloadi16, i32, L4_loadruh_rr>;
+ def: Loadxr_shl_pat<zextloadi16, i32, L4_loadruh_rr>;
+ def: Loadxr_shl_pat<sextloadi16, i32, L4_loadrh_rr>;
+ def: Loadxr_shl_pat<load, i32, L4_loadri_rr>;
+ def: Loadxr_shl_pat<load, v2i16, L4_loadri_rr>;
+ def: Loadxr_shl_pat<load, v4i8, L4_loadri_rr>;
+ def: Loadxr_shl_pat<load, i64, L4_loadrd_rr>;
+ def: Loadxr_shl_pat<load, v2i32, L4_loadrd_rr>;
+ def: Loadxr_shl_pat<load, v4i16, L4_loadrd_rr>;
+ def: Loadxr_shl_pat<load, v8i8, L4_loadrd_rr>;
+ def: Loadxr_shl_pat<load, f32, L4_loadri_rr>;
+ def: Loadxr_shl_pat<load, f64, L4_loadrd_rr>;
}
let AddedComplexity = 20 in {
- def: Loadxr_add_pat<extloadi8, i32, L4_loadrub_rr>;
- def: Loadxr_add_pat<zextloadi8, i32, L4_loadrub_rr>;
- def: Loadxr_add_pat<sextloadi8, i32, L4_loadrb_rr>;
- def: Loadxr_add_pat<extloadi16, i32, L4_loadruh_rr>;
- def: Loadxr_add_pat<zextloadi16, i32, L4_loadruh_rr>;
- def: Loadxr_add_pat<sextloadi16, i32, L4_loadrh_rr>;
- def: Loadxr_add_pat<load, i32, L4_loadri_rr>;
- def: Loadxr_add_pat<load, i64, L4_loadrd_rr>;
- def: Loadxr_add_pat<load, f32, L4_loadri_rr>;
- def: Loadxr_add_pat<load, f64, L4_loadrd_rr>;
+ def: Loadxr_add_pat<extloadi8, i32, L4_loadrub_rr>;
+ def: Loadxr_add_pat<zextloadi8, i32, L4_loadrub_rr>;
+ def: Loadxr_add_pat<sextloadi8, i32, L4_loadrb_rr>;
+ def: Loadxr_add_pat<extloadi16, i32, L4_loadruh_rr>;
+ def: Loadxr_add_pat<zextloadi16, i32, L4_loadruh_rr>;
+ def: Loadxr_add_pat<sextloadi16, i32, L4_loadrh_rr>;
+ def: Loadxr_add_pat<load, i32, L4_loadri_rr>;
+ def: Loadxr_add_pat<load, v2i16, L4_loadri_rr>;
+ def: Loadxr_add_pat<load, v4i8, L4_loadri_rr>;
+ def: Loadxr_add_pat<load, i64, L4_loadrd_rr>;
+ def: Loadxr_add_pat<load, v2i32, L4_loadrd_rr>;
+ def: Loadxr_add_pat<load, v4i16, L4_loadrd_rr>;
+ def: Loadxr_add_pat<load, v8i8, L4_loadrd_rr>;
+ def: Loadxr_add_pat<load, f32, L4_loadri_rr>;
+ def: Loadxr_add_pat<load, f64, L4_loadrd_rr>;
}
let AddedComplexity = 40 in {
@@ -1897,17 +1978,22 @@ let AddedComplexity = 20 in {
// Absolute address
let AddedComplexity = 60 in {
- def: Loada_pat<zextloadi1, i32, anyimm0, PS_loadrubabs>;
- def: Loada_pat<sextloadi8, i32, anyimm0, PS_loadrbabs>;
- def: Loada_pat<extloadi8, i32, anyimm0, PS_loadrubabs>;
- def: Loada_pat<zextloadi8, i32, anyimm0, PS_loadrubabs>;
- def: Loada_pat<sextloadi16, i32, anyimm1, PS_loadrhabs>;
- def: Loada_pat<extloadi16, i32, anyimm1, PS_loadruhabs>;
- def: Loada_pat<zextloadi16, i32, anyimm1, PS_loadruhabs>;
- def: Loada_pat<load, i32, anyimm2, PS_loadriabs>;
- def: Loada_pat<load, i64, anyimm3, PS_loadrdabs>;
- def: Loada_pat<load, f32, anyimm2, PS_loadriabs>;
- def: Loada_pat<load, f64, anyimm3, PS_loadrdabs>;
+ def: Loada_pat<zextloadi1, i32, anyimm0, PS_loadrubabs>;
+ def: Loada_pat<sextloadi8, i32, anyimm0, PS_loadrbabs>;
+ def: Loada_pat<extloadi8, i32, anyimm0, PS_loadrubabs>;
+ def: Loada_pat<zextloadi8, i32, anyimm0, PS_loadrubabs>;
+ def: Loada_pat<sextloadi16, i32, anyimm1, PS_loadrhabs>;
+ def: Loada_pat<extloadi16, i32, anyimm1, PS_loadruhabs>;
+ def: Loada_pat<zextloadi16, i32, anyimm1, PS_loadruhabs>;
+ def: Loada_pat<load, i32, anyimm2, PS_loadriabs>;
+ def: Loada_pat<load, v2i16, anyimm2, PS_loadriabs>;
+ def: Loada_pat<load, v4i8, anyimm2, PS_loadriabs>;
+ def: Loada_pat<load, i64, anyimm3, PS_loadrdabs>;
+ def: Loada_pat<load, v2i32, anyimm3, PS_loadrdabs>;
+ def: Loada_pat<load, v4i16, anyimm3, PS_loadrdabs>;
+ def: Loada_pat<load, v8i8, anyimm3, PS_loadrdabs>;
+ def: Loada_pat<load, f32, anyimm2, PS_loadriabs>;
+ def: Loada_pat<load, f64, anyimm3, PS_loadrdabs>;
def: Loada_pat<atomic_load_8, i32, anyimm0, PS_loadrubabs>;
def: Loada_pat<atomic_load_16, i32, anyimm1, PS_loadruhabs>;
@@ -1933,18 +2019,23 @@ let AddedComplexity = 30 in {
// GP-relative address
let AddedComplexity = 100 in {
- def: Loada_pat<extloadi1, i32, addrgp, L2_loadrubgp>;
- def: Loada_pat<zextloadi1, i32, addrgp, L2_loadrubgp>;
- def: Loada_pat<extloadi8, i32, addrgp, L2_loadrubgp>;
- def: Loada_pat<sextloadi8, i32, addrgp, L2_loadrbgp>;
- def: Loada_pat<zextloadi8, i32, addrgp, L2_loadrubgp>;
- def: Loada_pat<extloadi16, i32, addrgp, L2_loadruhgp>;
- def: Loada_pat<sextloadi16, i32, addrgp, L2_loadrhgp>;
- def: Loada_pat<zextloadi16, i32, addrgp, L2_loadruhgp>;
- def: Loada_pat<load, i32, addrgp, L2_loadrigp>;
- def: Loada_pat<load, i64, addrgp, L2_loadrdgp>;
- def: Loada_pat<load, f32, addrgp, L2_loadrigp>;
- def: Loada_pat<load, f64, addrgp, L2_loadrdgp>;
+ def: Loada_pat<extloadi1, i32, addrgp, L2_loadrubgp>;
+ def: Loada_pat<zextloadi1, i32, addrgp, L2_loadrubgp>;
+ def: Loada_pat<extloadi8, i32, addrgp, L2_loadrubgp>;
+ def: Loada_pat<sextloadi8, i32, addrgp, L2_loadrbgp>;
+ def: Loada_pat<zextloadi8, i32, addrgp, L2_loadrubgp>;
+ def: Loada_pat<extloadi16, i32, addrgp, L2_loadruhgp>;
+ def: Loada_pat<sextloadi16, i32, addrgp, L2_loadrhgp>;
+ def: Loada_pat<zextloadi16, i32, addrgp, L2_loadruhgp>;
+ def: Loada_pat<load, i32, addrgp, L2_loadrigp>;
+ def: Loada_pat<load, v2i16, addrgp, L2_loadrigp>;
+ def: Loada_pat<load, v4i8, addrgp, L2_loadrigp>;
+ def: Loada_pat<load, i64, addrgp, L2_loadrdgp>;
+ def: Loada_pat<load, v2i32, addrgp, L2_loadrdgp>;
+ def: Loada_pat<load, v4i16, addrgp, L2_loadrdgp>;
+ def: Loada_pat<load, v8i8, addrgp, L2_loadrdgp>;
+ def: Loada_pat<load, f32, addrgp, L2_loadrigp>;
+ def: Loada_pat<load, f64, addrgp, L2_loadrdgp>;
def: Loada_pat<atomic_load_8, i32, addrgp, L2_loadrubgp>;
def: Loada_pat<atomic_load_16, i32, addrgp, L2_loadruhgp>;
@@ -1983,46 +2074,10 @@ def: Pat<(i1 (load (add I32:$Rs, anyimm0:$Off))),
def: Pat<(i1 (load I32:$Rs)),
(C2_tfrrp (L2_loadrub_io IntRegs:$Rs, 0))>;
-// HVX loads
-
-multiclass HvxLd_pat<InstHexagon MI, PatFrag Load, ValueType VT,
- PatFrag ImmPred> {
- def: Pat<(VT (Load I32:$Rt)), (MI I32:$Rt, 0)>;
- def: Pat<(VT (Load (add I32:$Rt, ImmPred:$s))), (MI I32:$Rt, imm:$s)>;
- // The HVX selection code for shuffles can generate vector constants.
- // Calling "Select" on the resulting loads from CP fails without these
- // patterns.
- def: Pat<(VT (Load (HexagonCP tconstpool:$A))), (MI (A2_tfrsi imm:$A), 0)>;
- def: Pat<(VT (Load (HexagonAtPcrel tconstpool:$A))),
- (MI (C4_addipc imm:$A), 0)>;
-}
-
-
-let Predicates = [UseHVX] in {
- multiclass HvxLdVs_pat<InstHexagon MI, PatFrag Load> {
- defm: HvxLd_pat<MI, Load, VecI8, IsVecOff>;
- defm: HvxLd_pat<MI, Load, VecI16, IsVecOff>;
- defm: HvxLd_pat<MI, Load, VecI32, IsVecOff>;
- }
- defm: HvxLdVs_pat<V6_vL32b_nt_ai, alignednontemporalload>;
- defm: HvxLdVs_pat<V6_vL32b_ai, alignedload>;
- defm: HvxLdVs_pat<V6_vL32Ub_ai, unalignedload>;
-
- multiclass HvxLdWs_pat<InstHexagon MI, PatFrag Load> {
- defm: HvxLd_pat<MI, Load, VecPI8, IsVecOff>;
- defm: HvxLd_pat<MI, Load, VecPI16, IsVecOff>;
- defm: HvxLd_pat<MI, Load, VecPI32, IsVecOff>;
- }
- defm: HvxLdWs_pat<PS_vloadrw_nt_ai, alignednontemporalload>;
- defm: HvxLdWs_pat<PS_vloadrw_ai, alignedload>;
- defm: HvxLdWs_pat<PS_vloadrwu_ai, unalignedload>;
-}
-
// --(13) Store ----------------------------------------------------------
//
-
class Storepi_pat<PatFrag Store, PatFrag Value, PatFrag Offset, InstHexagon MI>
: Pat<(Store Value:$Rt, I32:$Rx, Offset:$s4),
(MI I32:$Rx, imm:$s4, Value:$Rt)>;
@@ -2135,7 +2190,7 @@ class Stoream_pat<PatFrag Store, PatFrag Value, PatFrag Addr, PatFrag ValueMod,
// swapped. This relies on the knowledge that the F.Fragment uses names
// "ptr" and "val".
class AtomSt<PatFrag F>
- : PatFrag<(ops node:$val, node:$ptr), F.Fragment, F.PredicateCode,
+ : PatFrag<(ops node:$val, node:$ptr), !head(F.Fragments), F.PredicateCode,
F.OperandTransform> {
let IsAtomic = F.IsAtomic;
let MemoryVT = F.MemoryVT;
@@ -2459,36 +2514,6 @@ let AddedComplexity = 10 in {
def: Storexi_base_pat<AtomSt<atomic_store_64>, I64, S2_storerd_io>;
}
-// HVX stores
-
-multiclass HvxSt_pat<InstHexagon MI, PatFrag Store, PatFrag ImmPred,
- PatFrag Value> {
- def: Pat<(Store Value:$Vs, I32:$Rt),
- (MI I32:$Rt, 0, Value:$Vs)>;
- def: Pat<(Store Value:$Vs, (add I32:$Rt, ImmPred:$s)),
- (MI I32:$Rt, imm:$s, Value:$Vs)>;
-}
-
-let Predicates = [UseHVX] in {
- multiclass HvxStVs_pat<InstHexagon MI, PatFrag Store> {
- defm: HvxSt_pat<MI, Store, IsVecOff, HVI8>;
- defm: HvxSt_pat<MI, Store, IsVecOff, HVI16>;
- defm: HvxSt_pat<MI, Store, IsVecOff, HVI32>;
- }
- defm: HvxStVs_pat<V6_vS32b_nt_ai, alignednontemporalstore>;
- defm: HvxStVs_pat<V6_vS32b_ai, alignedstore>;
- defm: HvxStVs_pat<V6_vS32Ub_ai, unalignedstore>;
-
- multiclass HvxStWs_pat<InstHexagon MI, PatFrag Store> {
- defm: HvxSt_pat<MI, Store, IsVecOff, HWI8>;
- defm: HvxSt_pat<MI, Store, IsVecOff, HWI16>;
- defm: HvxSt_pat<MI, Store, IsVecOff, HWI32>;
- }
- defm: HvxStWs_pat<PS_vstorerw_nt_ai, alignednontemporalstore>;
- defm: HvxStWs_pat<PS_vstorerw_ai, alignedstore>;
- defm: HvxStWs_pat<PS_vstorerwu_ai, unalignedstore>;
-}
-
// --(14) Memop ----------------------------------------------------------
//
@@ -2570,8 +2595,10 @@ multiclass Memopxr_add_pat<PatFrag Load, PatFrag Store, PatFrag ImmPred,
multiclass Memopxr_pat<PatFrag Load, PatFrag Store, PatFrag ImmPred,
SDNode Oper, InstHexagon MI> {
- defm: Memopxr_base_pat <Load, Store, Oper, MI>;
- defm: Memopxr_add_pat <Load, Store, ImmPred, Oper, MI>;
+ let Predicates = [UseMEMOPS] in {
+ defm: Memopxr_base_pat <Load, Store, Oper, MI>;
+ defm: Memopxr_add_pat <Load, Store, ImmPred, Oper, MI>;
+ }
}
let AddedComplexity = 200 in {
@@ -2669,8 +2696,10 @@ multiclass Memopxi_add_pat<PatFrag Load, PatFrag Store, PatFrag ImmPred,
multiclass Memopxi_pat<PatFrag Load, PatFrag Store, PatFrag ImmPred,
SDNode Oper, PatFrag Arg, SDNodeXForm ArgMod,
InstHexagon MI> {
- defm: Memopxi_base_pat <Load, Store, Oper, Arg, ArgMod, MI>;
- defm: Memopxi_add_pat <Load, Store, ImmPred, Oper, Arg, ArgMod, MI>;
+ let Predicates = [UseMEMOPS] in {
+ defm: Memopxi_base_pat <Load, Store, Oper, Arg, ArgMod, MI>;
+ defm: Memopxi_add_pat <Load, Store, ImmPred, Oper, Arg, ArgMod, MI>;
+ }
}
let AddedComplexity = 220 in {
@@ -2829,6 +2858,8 @@ def: Pat<(brcond (not I1:$Pu), bb:$dst),
(J2_jumpf I1:$Pu, bb:$dst)>;
def: Pat<(brcond (i1 (setne I1:$Pu, -1)), bb:$dst),
(J2_jumpf I1:$Pu, bb:$dst)>;
+def: Pat<(brcond (i1 (seteq I1:$Pu, 0)), bb:$dst),
+ (J2_jumpf I1:$Pu, bb:$dst)>;
def: Pat<(brcond (i1 (setne I1:$Pu, 0)), bb:$dst),
(J2_jumpt I1:$Pu, bb:$dst)>;
@@ -2898,97 +2929,17 @@ def HexagonREADCYCLE: SDNode<"HexagonISD::READCYCLE", SDTInt64Leaf,
def: Pat<(HexagonREADCYCLE), (A4_tfrcpp UPCYCLE)>;
-
-def SDTVecLeaf: SDTypeProfile<1, 0, [SDTCisVec<0>]>;
-
-def SDTHexagonVEXTRACTW: SDTypeProfile<1, 2,
- [SDTCisVT<0, i32>, SDTCisVec<1>, SDTCisVT<2, i32>]>;
-def HexagonVEXTRACTW : SDNode<"HexagonISD::VEXTRACTW", SDTHexagonVEXTRACTW>;
-
-def SDTHexagonVINSERTW0: SDTypeProfile<1, 2,
- [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisVT<2, i32>]>;
-def HexagonVINSERTW0 : SDNode<"HexagonISD::VINSERTW0", SDTHexagonVINSERTW0>;
-
-def Combinev: OutPatFrag<(ops node:$Rs, node:$Rt),
- (REG_SEQUENCE HvxWR, $Rs, vsub_hi, $Rt, vsub_lo)>;
-
-def LoVec: OutPatFrag<(ops node:$Vs), (EXTRACT_SUBREG $Vs, vsub_lo)>;
-def HiVec: OutPatFrag<(ops node:$Vs), (EXTRACT_SUBREG $Vs, vsub_hi)>;
-
-let Predicates = [UseHVX] in {
- def: OpR_RR_pat<V6_vpackeb, pf2<HexagonVPACKE>, VecI8, HVI8>;
- def: OpR_RR_pat<V6_vpackob, pf2<HexagonVPACKO>, VecI8, HVI8>;
- def: OpR_RR_pat<V6_vpackeh, pf2<HexagonVPACKE>, VecI16, HVI16>;
- def: OpR_RR_pat<V6_vpackoh, pf2<HexagonVPACKO>, VecI16, HVI16>;
-}
-
-def HexagonVZERO: SDNode<"HexagonISD::VZERO", SDTVecLeaf>;
-def vzero: PatFrag<(ops), (HexagonVZERO)>;
-
-let Predicates = [UseHVX] in {
- def: Pat<(VecI8 vzero), (V6_vd0)>;
- def: Pat<(VecI16 vzero), (V6_vd0)>;
- def: Pat<(VecI32 vzero), (V6_vd0)>;
-
- def: Pat<(VecPI8 (concat_vectors HVI8:$Vs, HVI8:$Vt)),
- (Combinev HvxVR:$Vt, HvxVR:$Vs)>;
- def: Pat<(VecPI16 (concat_vectors HVI16:$Vs, HVI16:$Vt)),
- (Combinev HvxVR:$Vt, HvxVR:$Vs)>;
- def: Pat<(VecPI32 (concat_vectors HVI32:$Vs, HVI32:$Vt)),
- (Combinev HvxVR:$Vt, HvxVR:$Vs)>;
-
- def: Pat<(HexagonVEXTRACTW HVI8:$Vu, I32:$Rs),
- (V6_extractw HvxVR:$Vu, I32:$Rs)>;
- def: Pat<(HexagonVEXTRACTW HVI16:$Vu, I32:$Rs),
- (V6_extractw HvxVR:$Vu, I32:$Rs)>;
- def: Pat<(HexagonVEXTRACTW HVI32:$Vu, I32:$Rs),
- (V6_extractw HvxVR:$Vu, I32:$Rs)>;
-
- def: Pat<(HexagonVINSERTW0 HVI8:$Vu, I32:$Rt),
- (V6_vinsertwr HvxVR:$Vu, I32:$Rt)>;
- def: Pat<(HexagonVINSERTW0 HVI16:$Vu, I32:$Rt),
- (V6_vinsertwr HvxVR:$Vu, I32:$Rt)>;
- def: Pat<(HexagonVINSERTW0 HVI32:$Vu, I32:$Rt),
- (V6_vinsertwr HvxVR:$Vu, I32:$Rt)>;
-
- def: Pat<(add HVI8:$Vs, HVI8:$Vt), (V6_vaddb HvxVR:$Vs, HvxVR:$Vt)>;
- def: Pat<(add HVI16:$Vs, HVI16:$Vt), (V6_vaddh HvxVR:$Vs, HvxVR:$Vt)>;
- def: Pat<(add HVI32:$Vs, HVI32:$Vt), (V6_vaddw HvxVR:$Vs, HvxVR:$Vt)>;
-
- def: Pat<(sub HVI8:$Vs, HVI8:$Vt), (V6_vsubb HvxVR:$Vs, HvxVR:$Vt)>;
- def: Pat<(sub HVI16:$Vs, HVI16:$Vt), (V6_vsubh HvxVR:$Vs, HvxVR:$Vt)>;
- def: Pat<(sub HVI32:$Vs, HVI32:$Vt), (V6_vsubw HvxVR:$Vs, HvxVR:$Vt)>;
-
- def: Pat<(and HVI8:$Vs, HVI8:$Vt), (V6_vand HvxVR:$Vs, HvxVR:$Vt)>;
- def: Pat<(or HVI8:$Vs, HVI8:$Vt), (V6_vor HvxVR:$Vs, HvxVR:$Vt)>;
- def: Pat<(xor HVI8:$Vs, HVI8:$Vt), (V6_vxor HvxVR:$Vs, HvxVR:$Vt)>;
-
- def: Pat<(vselect HQ8:$Qu, HVI8:$Vs, HVI8:$Vt),
- (V6_vmux HvxQR:$Qu, HvxVR:$Vs, HvxVR:$Vt)>;
- def: Pat<(vselect HQ16:$Qu, HVI16:$Vs, HVI16:$Vt),
- (V6_vmux HvxQR:$Qu, HvxVR:$Vs, HvxVR:$Vt)>;
- def: Pat<(vselect HQ32:$Qu, HVI32:$Vs, HVI32:$Vt),
- (V6_vmux HvxQR:$Qu, HvxVR:$Vs, HvxVR:$Vt)>;
-
- def: Pat<(VecPI16 (sext HVI8:$Vs)), (V6_vsb HvxVR:$Vs)>;
- def: Pat<(VecPI32 (sext HVI16:$Vs)), (V6_vsh HvxVR:$Vs)>;
- def: Pat<(VecPI16 (zext HVI8:$Vs)), (V6_vzb HvxVR:$Vs)>;
- def: Pat<(VecPI32 (zext HVI16:$Vs)), (V6_vzh HvxVR:$Vs)>;
-
- def: Pat<(sext_inreg HVI32:$Vs, v16i16),
- (V6_vpackeb (LoVec (V6_vsh HvxVR:$Vs)),
- (HiVec (V6_vsh HvxVR:$Vs)))>;
- def: Pat<(sext_inreg HVI32:$Vs, v32i16),
- (V6_vpackeb (LoVec (V6_vsh HvxVR:$Vs)),
- (HiVec (V6_vsh HvxVR:$Vs)))>;
-
- def: Pat<(VecI16 (sext_invec HVI8:$Vs)), (LoVec (V6_vsb HvxVR:$Vs))>;
- def: Pat<(VecI32 (sext_invec HVI16:$Vs)), (LoVec (V6_vsh HvxVR:$Vs))>;
- def: Pat<(VecI32 (sext_invec HVI8:$Vs)),
- (LoVec (V6_vsh (LoVec (V6_vsb HvxVR:$Vs))))>;
-
- def: Pat<(VecI16 (zext_invec HVI8:$Vs)), (LoVec (V6_vzb HvxVR:$Vs))>;
- def: Pat<(VecI32 (zext_invec HVI16:$Vs)), (LoVec (V6_vzh HvxVR:$Vs))>;
- def: Pat<(VecI32 (zext_invec HVI8:$Vs)),
- (LoVec (V6_vzh (LoVec (V6_vzb HvxVR:$Vs))))>;
+// The declared return value of the store-locked intrinsics is i32, but
+// the instructions actually define i1. To avoid register copies from
+// IntRegs to PredRegs and back, fold the entire pattern that checks the
+// result against zero/non-zero.
+let AddedComplexity = 100 in {
+ def: Pat<(i1 (setne (int_hexagon_S2_storew_locked I32:$Rs, I32:$Rt), 0)),
+ (S2_storew_locked I32:$Rs, I32:$Rt)>;
+ def: Pat<(i1 (seteq (int_hexagon_S2_storew_locked I32:$Rs, I32:$Rt), 0)),
+ (C2_not (S2_storew_locked I32:$Rs, I32:$Rt))>;
+ def: Pat<(i1 (setne (int_hexagon_S4_stored_locked I32:$Rs, I64:$Rt), 0)),
+ (S4_stored_locked I32:$Rs, I64:$Rt)>;
+ def: Pat<(i1 (seteq (int_hexagon_S4_stored_locked I32:$Rs, I64:$Rt), 0)),
+ (C2_not (S4_stored_locked I32:$Rs, I64:$Rt))>;
}
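
Editor's note: the folding above relies on a simple boolean identity: for a producer that already yields a one-bit value, comparing it against zero adds no information. A minimal C++ sketch of that reasoning, using an illustrative model function (not a real intrinsic or builtin):

#include <cassert>

// Model of an operation that, like S2_storew_locked, conceptually produces a
// one-bit result even though its declared return type is a 32-bit integer.
static int store_locked_model(bool succeeded) { return succeeded ? 1 : 0; }

int main() {
  bool cases[] = {false, true};
  for (bool ok : cases) {
    int r = store_locked_model(ok);
    // (r != 0) is just the one-bit result itself, so the setne pattern can
    // select the store-locked instruction directly.
    assert((r != 0) == ok);
    // (r == 0) is its negation, which is why the seteq pattern wraps the
    // instruction in C2_not instead of materializing an i32 and comparing.
    assert((r == 0) == !ok);
  }
}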
diff --git a/lib/Target/Hexagon/HexagonPatternsHVX.td b/lib/Target/Hexagon/HexagonPatternsHVX.td
new file mode 100644
index 000000000000..a4cfca9ac7d7
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonPatternsHVX.td
@@ -0,0 +1,497 @@
+def SDTVecLeaf:
+ SDTypeProfile<1, 0, [SDTCisVec<0>]>;
+def SDTVecBinOp:
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<1,2>]>;
+
+def SDTHexagonVEXTRACTW: SDTypeProfile<1, 2,
+ [SDTCisVT<0, i32>, SDTCisVec<1>, SDTCisVT<2, i32>]>;
+def HexagonVEXTRACTW : SDNode<"HexagonISD::VEXTRACTW", SDTHexagonVEXTRACTW>;
+
+def SDTHexagonVINSERTW0: SDTypeProfile<1, 2,
+ [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisVT<2, i32>]>;
+def HexagonVINSERTW0: SDNode<"HexagonISD::VINSERTW0", SDTHexagonVINSERTW0>;
+
+def SDTHexagonVSPLATW: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>;
+def HexagonVSPLATW: SDNode<"HexagonISD::VSPLATW", SDTHexagonVSPLATW>;
+
+def HwLen2: SDNodeXForm<imm, [{
+ const auto &ST = static_cast<const HexagonSubtarget&>(CurDAG->getSubtarget());
+ return CurDAG->getTargetConstant(ST.getVectorLength()/2, SDLoc(N), MVT::i32);
+}]>;
+
+def Q2V: OutPatFrag<(ops node:$Qs), (V6_vandqrt $Qs, (A2_tfrsi -1))>;
+
+def Combinev: OutPatFrag<(ops node:$Vs, node:$Vt),
+ (REG_SEQUENCE HvxWR, $Vs, vsub_hi, $Vt, vsub_lo)>;
+
+def Combineq: OutPatFrag<(ops node:$Qs, node:$Qt),
+ (V6_vandvrt
+ (V6_vor
+ (V6_vror (V6_vpackeb (V6_vd0), (Q2V $Qs)),
+ (A2_tfrsi (HwLen2 (i32 0)))), // Half the vector length
+ (V6_vpackeb (V6_vd0), (Q2V $Qt))),
+ (A2_tfrsi -1))>;
+
+def LoVec: OutPatFrag<(ops node:$Vs), (EXTRACT_SUBREG $Vs, vsub_lo)>;
+def HiVec: OutPatFrag<(ops node:$Vs), (EXTRACT_SUBREG $Vs, vsub_hi)>;
+
+def HexagonVZERO: SDNode<"HexagonISD::VZERO", SDTVecLeaf>;
+def HexagonQCAT: SDNode<"HexagonISD::QCAT", SDTVecBinOp>;
+def HexagonQTRUE: SDNode<"HexagonISD::QTRUE", SDTVecLeaf>;
+def HexagonQFALSE: SDNode<"HexagonISD::QFALSE", SDTVecLeaf>;
+
+def vzero: PatFrag<(ops), (HexagonVZERO)>;
+def qtrue: PatFrag<(ops), (HexagonQTRUE)>;
+def qfalse: PatFrag<(ops), (HexagonQFALSE)>;
+def qcat: PatFrag<(ops node:$Qs, node:$Qt),
+ (HexagonQCAT node:$Qs, node:$Qt)>;
+
+def qnot: PatFrag<(ops node:$Qs), (xor node:$Qs, qtrue)>;
+
+def VSxtb: OutPatFrag<(ops node:$Vs), (V6_vunpackb $Vs)>;
+def VSxth: OutPatFrag<(ops node:$Vs), (V6_vunpackh $Vs)>;
+def VZxtb: OutPatFrag<(ops node:$Vs), (V6_vunpackub $Vs)>;
+def VZxth: OutPatFrag<(ops node:$Vs), (V6_vunpackuh $Vs)>;
+
+def SplatB: SDNodeXForm<imm, [{
+ uint32_t V = N->getZExtValue();
+ assert(isUInt<8>(V));
+ uint32_t S = V << 24 | V << 16 | V << 8 | V;
+ return CurDAG->getTargetConstant(S, SDLoc(N), MVT::i32);
+}]>;
+
+def SplatH: SDNodeXForm<imm, [{
+ uint32_t V = N->getZExtValue();
+ assert(isUInt<16>(V));
+ return CurDAG->getTargetConstant(V << 16 | V, SDLoc(N), MVT::i32);
+}]>;
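
Editor's note: a minimal sketch of the replication the two SDNodeXForms above perform, written as plain C++ so the bit layout can be checked directly (the function names are illustrative, not part of the patch):

#include <cassert>
#include <cstdint>

// Mirror of SplatB: replicate an 8-bit value into all four bytes of a word.
uint32_t splatB(uint32_t v) { return v << 24 | v << 16 | v << 8 | v; }

// Mirror of SplatH: replicate a 16-bit value into both halves of a word.
uint32_t splatH(uint32_t v) { return v << 16 | v; }

int main() {
  assert(splatB(0x2A) == 0x2A2A2A2Au);
  assert(splatH(0xBEEF) == 0xBEEFBEEFu);
}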
+
+def IsVecOff : PatLeaf<(i32 imm), [{
+ int32_t V = N->getSExtValue();
+ int32_t VecSize = HRI->getSpillSize(Hexagon::HvxVRRegClass);
+ assert(isPowerOf2_32(VecSize));
+ if ((uint32_t(V) & (uint32_t(VecSize)-1)) != 0)
+ return false;
+ int32_t L = Log2_32(VecSize);
+ return isInt<4>(V >> L);
+}]>;
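
Editor's note: IsVecOff only accepts immediates that are whole multiples of the HVX vector size and whose scaled value fits in a signed 4-bit field. A small stand-alone model of that check; the 64-byte vector size in the test is an assumption for the example, the real predicate queries the register class spill size:

#include <cassert>
#include <cstdint>

bool isVecOff(int32_t v, uint32_t vecSize) {
  // vecSize is assumed to be a power of two (64 or 128 bytes for HVX).
  if ((uint32_t(v) & (vecSize - 1)) != 0)
    return false;                        // not a multiple of the vector size
  int32_t scaled = v / int32_t(vecSize); // offset measured in whole vectors
  return scaled >= -8 && scaled <= 7;    // must fit in a signed 4-bit field
}

int main() {
  assert(isVecOff(0, 64));
  assert(isVecOff(-512, 64));  // -8 vectors
  assert(isVecOff(448, 64));   //  7 vectors
  assert(!isVecOff(32, 64));   // not vector-aligned
  assert(!isVecOff(512, 64));  // 8 vectors, outside the signed 4-bit range
}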
+
+
+def alignedload: PatFrag<(ops node:$a), (load $a), [{
+ return isAlignedMemNode(dyn_cast<MemSDNode>(N));
+}]>;
+
+def unalignedload: PatFrag<(ops node:$a), (load $a), [{
+ return !isAlignedMemNode(dyn_cast<MemSDNode>(N));
+}]>;
+
+def alignedstore: PatFrag<(ops node:$v, node:$a), (store $v, $a), [{
+ return isAlignedMemNode(dyn_cast<MemSDNode>(N));
+}]>;
+
+def unalignedstore: PatFrag<(ops node:$v, node:$a), (store $v, $a), [{
+ return !isAlignedMemNode(dyn_cast<MemSDNode>(N));
+}]>;
+
+
+// HVX loads
+
+multiclass HvxLd_pat<InstHexagon MI, PatFrag Load, ValueType ResType,
+ PatFrag ImmPred> {
+ def: Pat<(ResType (Load I32:$Rt)),
+ (MI I32:$Rt, 0)>;
+ def: Pat<(ResType (Load (add I32:$Rt, ImmPred:$s))),
+ (MI I32:$Rt, imm:$s)>;
+  // The HVX selection code for shuffles can generate vector constants.
+  // Calling "Select" on the resulting loads from the constant pool fails
+  // without these patterns.
+ def: Pat<(ResType (Load (HexagonCP tconstpool:$A))),
+ (MI (A2_tfrsi imm:$A), 0)>;
+ def: Pat<(ResType (Load (HexagonAtPcrel tconstpool:$A))),
+ (MI (C4_addipc imm:$A), 0)>;
+}
+
+multiclass HvxLda_pat<InstHexagon MI, PatFrag Load, ValueType ResType,
+ PatFrag ImmPred> {
+ let AddedComplexity = 50 in {
+ def: Pat<(ResType (Load (valignaddr I32:$Rt))),
+ (MI I32:$Rt, 0)>;
+ def: Pat<(ResType (Load (add (valignaddr I32:$Rt), ImmPred:$Off))),
+ (MI I32:$Rt, imm:$Off)>;
+ }
+ defm: HvxLd_pat<MI, Load, ResType, ImmPred>;
+}
+
+let Predicates = [UseHVX] in {
+ defm: HvxLda_pat<V6_vL32b_nt_ai, alignednontemporalload, VecI8, IsVecOff>;
+ defm: HvxLda_pat<V6_vL32b_nt_ai, alignednontemporalload, VecI16, IsVecOff>;
+ defm: HvxLda_pat<V6_vL32b_nt_ai, alignednontemporalload, VecI32, IsVecOff>;
+
+ defm: HvxLda_pat<V6_vL32b_ai, alignedload, VecI8, IsVecOff>;
+ defm: HvxLda_pat<V6_vL32b_ai, alignedload, VecI16, IsVecOff>;
+ defm: HvxLda_pat<V6_vL32b_ai, alignedload, VecI32, IsVecOff>;
+
+ defm: HvxLd_pat<V6_vL32Ub_ai, unalignedload, VecI8, IsVecOff>;
+ defm: HvxLd_pat<V6_vL32Ub_ai, unalignedload, VecI16, IsVecOff>;
+ defm: HvxLd_pat<V6_vL32Ub_ai, unalignedload, VecI32, IsVecOff>;
+}
+
+// HVX stores
+
+multiclass HvxSt_pat<InstHexagon MI, PatFrag Store, PatFrag ImmPred,
+ PatFrag Value> {
+ def: Pat<(Store Value:$Vs, I32:$Rt),
+ (MI I32:$Rt, 0, Value:$Vs)>;
+ def: Pat<(Store Value:$Vs, (add I32:$Rt, ImmPred:$s)),
+ (MI I32:$Rt, imm:$s, Value:$Vs)>;
+}
+
+let Predicates = [UseHVX] in {
+ defm: HvxSt_pat<V6_vS32b_nt_ai, alignednontemporalstore, IsVecOff, HVI8>;
+ defm: HvxSt_pat<V6_vS32b_nt_ai, alignednontemporalstore, IsVecOff, HVI16>;
+ defm: HvxSt_pat<V6_vS32b_nt_ai, alignednontemporalstore, IsVecOff, HVI32>;
+
+ defm: HvxSt_pat<V6_vS32b_ai, alignedstore, IsVecOff, HVI8>;
+ defm: HvxSt_pat<V6_vS32b_ai, alignedstore, IsVecOff, HVI16>;
+ defm: HvxSt_pat<V6_vS32b_ai, alignedstore, IsVecOff, HVI32>;
+
+ defm: HvxSt_pat<V6_vS32Ub_ai, unalignedstore, IsVecOff, HVI8>;
+ defm: HvxSt_pat<V6_vS32Ub_ai, unalignedstore, IsVecOff, HVI16>;
+ defm: HvxSt_pat<V6_vS32Ub_ai, unalignedstore, IsVecOff, HVI32>;
+}
+
+// Bitcasts between same-size vector types are no-ops: only the value type
+// changes, the register contents stay the same.
+class Bitcast<ValueType ResTy, ValueType InpTy, RegisterClass RC>
+ : Pat<(ResTy (bitconvert (InpTy RC:$Val))), (ResTy RC:$Val)>;
+
+let Predicates = [UseHVX] in {
+ def: Bitcast<VecI8, VecI16, HvxVR>;
+ def: Bitcast<VecI8, VecI32, HvxVR>;
+ def: Bitcast<VecI16, VecI8, HvxVR>;
+ def: Bitcast<VecI16, VecI32, HvxVR>;
+ def: Bitcast<VecI32, VecI8, HvxVR>;
+ def: Bitcast<VecI32, VecI16, HvxVR>;
+
+ def: Bitcast<VecPI8, VecPI16, HvxWR>;
+ def: Bitcast<VecPI8, VecPI32, HvxWR>;
+ def: Bitcast<VecPI16, VecPI8, HvxWR>;
+ def: Bitcast<VecPI16, VecPI32, HvxWR>;
+ def: Bitcast<VecPI32, VecPI8, HvxWR>;
+ def: Bitcast<VecPI32, VecPI16, HvxWR>;
+}
+
+let Predicates = [UseHVX] in {
+ def: Pat<(VecI8 vzero), (V6_vd0)>;
+ def: Pat<(VecI16 vzero), (V6_vd0)>;
+ def: Pat<(VecI32 vzero), (V6_vd0)>;
+ def: Pat<(VecPI8 vzero), (PS_vdd0)>;
+ def: Pat<(VecPI16 vzero), (PS_vdd0)>;
+ def: Pat<(VecPI32 vzero), (PS_vdd0)>;
+
+ def: Pat<(concat_vectors (VecI8 vzero), (VecI8 vzero)), (PS_vdd0)>;
+ def: Pat<(concat_vectors (VecI16 vzero), (VecI16 vzero)), (PS_vdd0)>;
+ def: Pat<(concat_vectors (VecI32 vzero), (VecI32 vzero)), (PS_vdd0)>;
+
+ def: Pat<(VecPI8 (concat_vectors HVI8:$Vs, HVI8:$Vt)),
+ (Combinev HvxVR:$Vt, HvxVR:$Vs)>;
+ def: Pat<(VecPI16 (concat_vectors HVI16:$Vs, HVI16:$Vt)),
+ (Combinev HvxVR:$Vt, HvxVR:$Vs)>;
+ def: Pat<(VecPI32 (concat_vectors HVI32:$Vs, HVI32:$Vt)),
+ (Combinev HvxVR:$Vt, HvxVR:$Vs)>;
+
+ def: Pat<(VecQ8 (qcat HQ16:$Qs, HQ16:$Qt)), (Combineq $Qt, $Qs)>;
+ def: Pat<(VecQ16 (qcat HQ32:$Qs, HQ32:$Qt)), (Combineq $Qt, $Qs)>;
+
+ def: Pat<(HexagonVEXTRACTW HVI8:$Vu, I32:$Rs),
+ (V6_extractw HvxVR:$Vu, I32:$Rs)>;
+ def: Pat<(HexagonVEXTRACTW HVI16:$Vu, I32:$Rs),
+ (V6_extractw HvxVR:$Vu, I32:$Rs)>;
+ def: Pat<(HexagonVEXTRACTW HVI32:$Vu, I32:$Rs),
+ (V6_extractw HvxVR:$Vu, I32:$Rs)>;
+
+ def: Pat<(HexagonVINSERTW0 HVI8:$Vu, I32:$Rt),
+ (V6_vinsertwr HvxVR:$Vu, I32:$Rt)>;
+ def: Pat<(HexagonVINSERTW0 HVI16:$Vu, I32:$Rt),
+ (V6_vinsertwr HvxVR:$Vu, I32:$Rt)>;
+ def: Pat<(HexagonVINSERTW0 HVI32:$Vu, I32:$Rt),
+ (V6_vinsertwr HvxVR:$Vu, I32:$Rt)>;
+}
+
+def Vsplatib: OutPatFrag<(ops node:$V), (V6_lvsplatw (ToI32 (SplatB $V)))>;
+def Vsplatih: OutPatFrag<(ops node:$V), (V6_lvsplatw (ToI32 (SplatH $V)))>;
+def Vsplatiw: OutPatFrag<(ops node:$V), (V6_lvsplatw (ToI32 $V))>;
+
+def Vsplatrb: OutPatFrag<(ops node:$Rs), (V6_lvsplatw (S2_vsplatrb $Rs))>;
+def Vsplatrh: OutPatFrag<(ops node:$Rs),
+ (V6_lvsplatw (A2_combine_ll $Rs, $Rs))>;
+def Vsplatrw: OutPatFrag<(ops node:$Rs), (V6_lvsplatw $Rs)>;
+
+def Rep: OutPatFrag<(ops node:$N), (Combinev $N, $N)>;
+
+let Predicates = [UseHVX] in {
+ let AddedComplexity = 10 in {
+ def: Pat<(VecI8 (HexagonVSPLAT u8_0ImmPred:$V)), (Vsplatib $V)>;
+ def: Pat<(VecI16 (HexagonVSPLAT u16_0ImmPred:$V)), (Vsplatih $V)>;
+ def: Pat<(VecI32 (HexagonVSPLAT anyimm:$V)), (Vsplatiw $V)>;
+ def: Pat<(VecPI8 (HexagonVSPLAT u8_0ImmPred:$V)), (Rep (Vsplatib $V))>;
+ def: Pat<(VecPI16 (HexagonVSPLAT u16_0ImmPred:$V)), (Rep (Vsplatih $V))>;
+ def: Pat<(VecPI32 (HexagonVSPLAT anyimm:$V)), (Rep (Vsplatiw $V))>;
+ }
+ def: Pat<(VecI8 (HexagonVSPLAT I32:$Rs)), (Vsplatrb $Rs)>;
+ def: Pat<(VecI16 (HexagonVSPLAT I32:$Rs)), (Vsplatrh $Rs)>;
+ def: Pat<(VecI32 (HexagonVSPLAT I32:$Rs)), (Vsplatrw $Rs)>;
+ def: Pat<(VecPI8 (HexagonVSPLAT I32:$Rs)), (Rep (Vsplatrb $Rs))>;
+ def: Pat<(VecPI16 (HexagonVSPLAT I32:$Rs)), (Rep (Vsplatrh $Rs))>;
+ def: Pat<(VecPI32 (HexagonVSPLAT I32:$Rs)), (Rep (Vsplatrw $Rs))>;
+
+ def: Pat<(VecI8 (HexagonVSPLATW I32:$Rs)), (Vsplatrw $Rs)>;
+ def: Pat<(VecI16 (HexagonVSPLATW I32:$Rs)), (Vsplatrw $Rs)>;
+ def: Pat<(VecI32 (HexagonVSPLATW I32:$Rs)), (Vsplatrw $Rs)>;
+ def: Pat<(VecPI8 (HexagonVSPLATW I32:$Rs)), (Rep (Vsplatrw $Rs))>;
+ def: Pat<(VecPI16 (HexagonVSPLATW I32:$Rs)), (Rep (Vsplatrw $Rs))>;
+ def: Pat<(VecPI32 (HexagonVSPLATW I32:$Rs)), (Rep (Vsplatrw $Rs))>;
+}
+
+class Vneg1<ValueType VecTy>
+ : PatFrag<(ops), (VecTy (HexagonVSPLATW (i32 -1)))>;
+
+class Vnot<ValueType VecTy>
+ : PatFrag<(ops node:$Vs), (xor $Vs, Vneg1<VecTy>)>;
+
+let Predicates = [UseHVX] in {
+ let AddedComplexity = 200 in {
+ def: Pat<(Vnot<VecI8> HVI8:$Vs), (V6_vnot HvxVR:$Vs)>;
+ def: Pat<(Vnot<VecI16> HVI16:$Vs), (V6_vnot HvxVR:$Vs)>;
+ def: Pat<(Vnot<VecI32> HVI32:$Vs), (V6_vnot HvxVR:$Vs)>;
+ }
+
+ def: OpR_RR_pat<V6_vaddb, Add, VecI8, HVI8>;
+ def: OpR_RR_pat<V6_vaddh, Add, VecI16, HVI16>;
+ def: OpR_RR_pat<V6_vaddw, Add, VecI32, HVI32>;
+ def: OpR_RR_pat<V6_vaddb_dv, Add, VecPI8, HWI8>;
+ def: OpR_RR_pat<V6_vaddh_dv, Add, VecPI16, HWI16>;
+ def: OpR_RR_pat<V6_vaddw_dv, Add, VecPI32, HWI32>;
+ def: OpR_RR_pat<V6_vsubb, Sub, VecI8, HVI8>;
+ def: OpR_RR_pat<V6_vsubh, Sub, VecI16, HVI16>;
+ def: OpR_RR_pat<V6_vsubw, Sub, VecI32, HVI32>;
+ def: OpR_RR_pat<V6_vsubb_dv, Sub, VecPI8, HWI8>;
+ def: OpR_RR_pat<V6_vsubh_dv, Sub, VecPI16, HWI16>;
+ def: OpR_RR_pat<V6_vsubw_dv, Sub, VecPI32, HWI32>;
+ def: OpR_RR_pat<V6_vand, And, VecI8, HVI8>;
+ def: OpR_RR_pat<V6_vand, And, VecI16, HVI16>;
+ def: OpR_RR_pat<V6_vand, And, VecI32, HVI32>;
+ def: OpR_RR_pat<V6_vor, Or, VecI8, HVI8>;
+ def: OpR_RR_pat<V6_vor, Or, VecI16, HVI16>;
+ def: OpR_RR_pat<V6_vor, Or, VecI32, HVI32>;
+ def: OpR_RR_pat<V6_vxor, Xor, VecI8, HVI8>;
+ def: OpR_RR_pat<V6_vxor, Xor, VecI16, HVI16>;
+ def: OpR_RR_pat<V6_vxor, Xor, VecI32, HVI32>;
+
+ def: Pat<(vselect HQ8:$Qu, HVI8:$Vs, HVI8:$Vt),
+ (V6_vmux HvxQR:$Qu, HvxVR:$Vs, HvxVR:$Vt)>;
+ def: Pat<(vselect HQ16:$Qu, HVI16:$Vs, HVI16:$Vt),
+ (V6_vmux HvxQR:$Qu, HvxVR:$Vs, HvxVR:$Vt)>;
+ def: Pat<(vselect HQ32:$Qu, HVI32:$Vs, HVI32:$Vt),
+ (V6_vmux HvxQR:$Qu, HvxVR:$Vs, HvxVR:$Vt)>;
+
+ def: Pat<(vselect (qnot HQ8:$Qu), HVI8:$Vs, HVI8:$Vt),
+ (V6_vmux HvxQR:$Qu, HvxVR:$Vt, HvxVR:$Vs)>;
+ def: Pat<(vselect (qnot HQ16:$Qu), HVI16:$Vs, HVI16:$Vt),
+ (V6_vmux HvxQR:$Qu, HvxVR:$Vt, HvxVR:$Vs)>;
+ def: Pat<(vselect (qnot HQ32:$Qu), HVI32:$Vs, HVI32:$Vt),
+ (V6_vmux HvxQR:$Qu, HvxVR:$Vt, HvxVR:$Vs)>;
+}
+
+let Predicates = [UseHVX] in {
+ def: Pat<(VecPI16 (sext HVI8:$Vs)), (VSxtb $Vs)>;
+ def: Pat<(VecPI32 (sext HVI16:$Vs)), (VSxth $Vs)>;
+ def: Pat<(VecPI16 (zext HVI8:$Vs)), (VZxtb $Vs)>;
+ def: Pat<(VecPI32 (zext HVI16:$Vs)), (VZxth $Vs)>;
+
+ def: Pat<(VecI16 (sext_invec HVI8:$Vs)), (LoVec (VSxtb $Vs))>;
+ def: Pat<(VecI32 (sext_invec HVI16:$Vs)), (LoVec (VSxth $Vs))>;
+ def: Pat<(VecI32 (sext_invec HVI8:$Vs)),
+ (LoVec (VSxth (LoVec (VSxtb $Vs))))>;
+ def: Pat<(VecPI16 (sext_invec HWI8:$Vss)), (VSxtb (LoVec $Vss))>;
+ def: Pat<(VecPI32 (sext_invec HWI16:$Vss)), (VSxth (LoVec $Vss))>;
+ def: Pat<(VecPI32 (sext_invec HWI8:$Vss)),
+ (VSxth (LoVec (VSxtb (LoVec $Vss))))>;
+
+ def: Pat<(VecI16 (zext_invec HVI8:$Vs)), (LoVec (VZxtb $Vs))>;
+ def: Pat<(VecI32 (zext_invec HVI16:$Vs)), (LoVec (VZxth $Vs))>;
+ def: Pat<(VecI32 (zext_invec HVI8:$Vs)),
+ (LoVec (VZxth (LoVec (VZxtb $Vs))))>;
+ def: Pat<(VecPI16 (zext_invec HWI8:$Vss)), (VZxtb (LoVec $Vss))>;
+ def: Pat<(VecPI32 (zext_invec HWI16:$Vss)), (VZxth (LoVec $Vss))>;
+ def: Pat<(VecPI32 (zext_invec HWI8:$Vss)),
+ (VZxth (LoVec (VZxtb (LoVec $Vss))))>;
+
+ def: Pat<(VecI8 (trunc HWI16:$Vss)),
+ (V6_vpackeb (HiVec $Vss), (LoVec $Vss))>;
+ def: Pat<(VecI16 (trunc HWI32:$Vss)),
+ (V6_vpackeh (HiVec $Vss), (LoVec $Vss))>;
+
+ def: Pat<(VecQ8 (trunc HVI8:$Vs)),
+ (V6_vandvrt HvxVR:$Vs, (A2_tfrsi 0x01010101))>;
+ def: Pat<(VecQ16 (trunc HVI16:$Vs)),
+ (V6_vandvrt HvxVR:$Vs, (A2_tfrsi 0x01010101))>;
+ def: Pat<(VecQ32 (trunc HVI32:$Vs)),
+ (V6_vandvrt HvxVR:$Vs, (A2_tfrsi 0x01010101))>;
+}
+
+let Predicates = [UseHVX] in {
+  // The "source" types are not legal and have no parameterized definitions,
+  // so these patterns are split out per HVX vector length.
+ let Predicates = [UseHVX,UseHVX64B] in {
+ def: Pat<(VecI16 (sext_inreg HVI16:$Vs, v32i8)),
+ (V6_vasrh (V6_vaslh HVI16:$Vs, (A2_tfrsi 8)), (A2_tfrsi 8))>;
+ def: Pat<(VecI32 (sext_inreg HVI32:$Vs, v16i8)),
+ (V6_vasrw (V6_vaslw HVI32:$Vs, (A2_tfrsi 24)), (A2_tfrsi 24))>;
+ def: Pat<(VecI32 (sext_inreg HVI32:$Vs, v16i16)),
+ (V6_vasrw (V6_vaslw HVI32:$Vs, (A2_tfrsi 16)), (A2_tfrsi 16))>;
+ }
+ let Predicates = [UseHVX,UseHVX128B] in {
+ def: Pat<(VecI16 (sext_inreg HVI16:$Vs, v64i8)),
+ (V6_vasrh (V6_vaslh HVI16:$Vs, (A2_tfrsi 8)), (A2_tfrsi 8))>;
+ def: Pat<(VecI32 (sext_inreg HVI32:$Vs, v32i8)),
+ (V6_vasrw (V6_vaslw HVI32:$Vs, (A2_tfrsi 24)), (A2_tfrsi 24))>;
+ def: Pat<(VecI32 (sext_inreg HVI32:$Vs, v32i16)),
+ (V6_vasrw (V6_vaslw HVI32:$Vs, (A2_tfrsi 16)), (A2_tfrsi 16))>;
+ }
+
+ def: Pat<(HexagonVASL HVI8:$Vs, I32:$Rt),
+ (V6_vpackeb (V6_vaslh (HiVec (VZxtb HvxVR:$Vs)), I32:$Rt),
+ (V6_vaslh (LoVec (VZxtb HvxVR:$Vs)), I32:$Rt))>;
+ def: Pat<(HexagonVASR HVI8:$Vs, I32:$Rt),
+ (V6_vpackeb (V6_vasrh (HiVec (VSxtb HvxVR:$Vs)), I32:$Rt),
+ (V6_vasrh (LoVec (VSxtb HvxVR:$Vs)), I32:$Rt))>;
+ def: Pat<(HexagonVLSR HVI8:$Vs, I32:$Rt),
+ (V6_vpackeb (V6_vlsrh (HiVec (VZxtb HvxVR:$Vs)), I32:$Rt),
+ (V6_vlsrh (LoVec (VZxtb HvxVR:$Vs)), I32:$Rt))>;
+
+ def: Pat<(HexagonVASL HVI16:$Vs, I32:$Rt), (V6_vaslh HvxVR:$Vs, I32:$Rt)>;
+ def: Pat<(HexagonVASL HVI32:$Vs, I32:$Rt), (V6_vaslw HvxVR:$Vs, I32:$Rt)>;
+ def: Pat<(HexagonVASR HVI16:$Vs, I32:$Rt), (V6_vasrh HvxVR:$Vs, I32:$Rt)>;
+ def: Pat<(HexagonVASR HVI32:$Vs, I32:$Rt), (V6_vasrw HvxVR:$Vs, I32:$Rt)>;
+ def: Pat<(HexagonVLSR HVI16:$Vs, I32:$Rt), (V6_vlsrh HvxVR:$Vs, I32:$Rt)>;
+ def: Pat<(HexagonVLSR HVI32:$Vs, I32:$Rt), (V6_vlsrw HvxVR:$Vs, I32:$Rt)>;
+
+ def: Pat<(add HVI32:$Vx, (HexagonVASL HVI32:$Vu, I32:$Rt)),
+ (V6_vaslw_acc HvxVR:$Vx, HvxVR:$Vu, I32:$Rt)>;
+ def: Pat<(add HVI32:$Vx, (HexagonVASR HVI32:$Vu, I32:$Rt)),
+ (V6_vasrw_acc HvxVR:$Vx, HvxVR:$Vu, I32:$Rt)>;
+
+ def: Pat<(shl HVI16:$Vs, HVI16:$Vt), (V6_vaslhv HvxVR:$Vs, HvxVR:$Vt)>;
+ def: Pat<(shl HVI32:$Vs, HVI32:$Vt), (V6_vaslwv HvxVR:$Vs, HvxVR:$Vt)>;
+ def: Pat<(sra HVI16:$Vs, HVI16:$Vt), (V6_vasrhv HvxVR:$Vs, HvxVR:$Vt)>;
+ def: Pat<(sra HVI32:$Vs, HVI32:$Vt), (V6_vasrwv HvxVR:$Vs, HvxVR:$Vt)>;
+ def: Pat<(srl HVI16:$Vs, HVI16:$Vt), (V6_vlsrhv HvxVR:$Vs, HvxVR:$Vt)>;
+ def: Pat<(srl HVI32:$Vs, HVI32:$Vt), (V6_vlsrwv HvxVR:$Vs, HvxVR:$Vt)>;
+
+ def: Pat<(VecI16 (bswap HVI16:$Vs)),
+ (V6_vdelta HvxVR:$Vs, (V6_lvsplatw (A2_tfrsi 0x01010101)))>;
+ def: Pat<(VecI32 (bswap HVI32:$Vs)),
+ (V6_vdelta HvxVR:$Vs, (V6_lvsplatw (A2_tfrsi 0x03030303)))>;
+
+ def: Pat<(VecI8 (ctpop HVI8:$Vs)),
+ (V6_vpackeb (V6_vpopcounth (HiVec (V6_vunpackub HvxVR:$Vs))),
+ (V6_vpopcounth (LoVec (V6_vunpackub HvxVR:$Vs))))>;
+ def: Pat<(VecI16 (ctpop HVI16:$Vs)), (V6_vpopcounth HvxVR:$Vs)>;
+ def: Pat<(VecI32 (ctpop HVI32:$Vs)),
+ (V6_vaddw (LoVec (V6_vzh (V6_vpopcounth HvxVR:$Vs))),
+ (HiVec (V6_vzh (V6_vpopcounth HvxVR:$Vs))))>;
+
+ def: Pat<(VecI8 (ctlz HVI8:$Vs)),
+ (V6_vsubb (V6_vpackeb (V6_vcl0h (HiVec (V6_vunpackub HvxVR:$Vs))),
+ (V6_vcl0h (LoVec (V6_vunpackub HvxVR:$Vs)))),
+ (V6_lvsplatw (A2_tfrsi 0x08080808)))>;
+ def: Pat<(VecI16 (ctlz HVI16:$Vs)), (V6_vcl0h HvxVR:$Vs)>;
+ def: Pat<(VecI32 (ctlz HVI32:$Vs)), (V6_vcl0w HvxVR:$Vs)>;
+}
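
Editor's note: the sext_inreg patterns above implement in-lane sign extension with a shift-left/arithmetic-shift-right pair (8/8 to extend the low byte of a 16-bit lane, 24/24 or 16/16 for 32-bit lanes). A scalar C++ sketch of the same trick, with an illustrative helper name:

#include <cassert>
#include <cstdint>

// Sign-extend the low `bits` bits of a 16-bit lane by shifting them to the
// top of the lane and arithmetically shifting back down, the same idea as
// the V6_vaslh/V6_vasrh and V6_vaslw/V6_vasrw pairs in the patterns above.
int16_t sextInReg16(int16_t v, unsigned bits) {
  unsigned sh = 16 - bits;
  return int16_t(int16_t(v << sh) >> sh);
}

int main() {
  assert(sextInReg16(0x00FF, 8) == -1);    // low byte 0xFF becomes -1
  assert(sextInReg16(0x007F, 8) == 0x7F);  // low byte 0x7F stays positive
  assert(sextInReg16(0x1234, 8) == 0x34);  // the upper byte is discarded
}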
+
+class HvxSel_pat<InstHexagon MI, PatFrag RegPred>
+ : Pat<(select I1:$Pu, RegPred:$Vs, RegPred:$Vt),
+ (MI I1:$Pu, RegPred:$Vs, RegPred:$Vt)>;
+
+let Predicates = [UseHVX] in {
+ def: HvxSel_pat<PS_vselect, HVI8>;
+ def: HvxSel_pat<PS_vselect, HVI16>;
+ def: HvxSel_pat<PS_vselect, HVI32>;
+ def: HvxSel_pat<PS_wselect, HWI8>;
+ def: HvxSel_pat<PS_wselect, HWI16>;
+ def: HvxSel_pat<PS_wselect, HWI32>;
+}
+
+let Predicates = [UseHVX] in {
+ def: Pat<(VecQ8 (qtrue)), (PS_qtrue)>;
+ def: Pat<(VecQ16 (qtrue)), (PS_qtrue)>;
+ def: Pat<(VecQ32 (qtrue)), (PS_qtrue)>;
+ def: Pat<(VecQ8 (qfalse)), (PS_qfalse)>;
+ def: Pat<(VecQ16 (qfalse)), (PS_qfalse)>;
+ def: Pat<(VecQ32 (qfalse)), (PS_qfalse)>;
+
+ def: Pat<(vnot HQ8:$Qs), (V6_pred_not HvxQR:$Qs)>;
+ def: Pat<(vnot HQ16:$Qs), (V6_pred_not HvxQR:$Qs)>;
+ def: Pat<(vnot HQ32:$Qs), (V6_pred_not HvxQR:$Qs)>;
+ def: Pat<(qnot HQ8:$Qs), (V6_pred_not HvxQR:$Qs)>;
+ def: Pat<(qnot HQ16:$Qs), (V6_pred_not HvxQR:$Qs)>;
+ def: Pat<(qnot HQ32:$Qs), (V6_pred_not HvxQR:$Qs)>;
+
+ def: OpR_RR_pat<V6_pred_and, And, VecQ8, HQ8>;
+ def: OpR_RR_pat<V6_pred_and, And, VecQ16, HQ16>;
+ def: OpR_RR_pat<V6_pred_and, And, VecQ32, HQ32>;
+ def: OpR_RR_pat<V6_pred_or, Or, VecQ8, HQ8>;
+ def: OpR_RR_pat<V6_pred_or, Or, VecQ16, HQ16>;
+ def: OpR_RR_pat<V6_pred_or, Or, VecQ32, HQ32>;
+ def: OpR_RR_pat<V6_pred_xor, Xor, VecQ8, HQ8>;
+ def: OpR_RR_pat<V6_pred_xor, Xor, VecQ16, HQ16>;
+ def: OpR_RR_pat<V6_pred_xor, Xor, VecQ32, HQ32>;
+
+ def: OpR_RR_pat<V6_pred_and_n, Not2<And>, VecQ8, HQ8>;
+ def: OpR_RR_pat<V6_pred_and_n, Not2<And>, VecQ16, HQ16>;
+ def: OpR_RR_pat<V6_pred_and_n, Not2<And>, VecQ32, HQ32>;
+ def: OpR_RR_pat<V6_pred_or_n, Not2<Or>, VecQ8, HQ8>;
+ def: OpR_RR_pat<V6_pred_or_n, Not2<Or>, VecQ16, HQ16>;
+ def: OpR_RR_pat<V6_pred_or_n, Not2<Or>, VecQ32, HQ32>;
+
+ def: OpR_RR_pat<V6_veqb, seteq, VecQ8, HVI8>;
+ def: OpR_RR_pat<V6_veqh, seteq, VecQ16, HVI16>;
+ def: OpR_RR_pat<V6_veqw, seteq, VecQ32, HVI32>;
+ def: OpR_RR_pat<V6_vgtb, setgt, VecQ8, HVI8>;
+ def: OpR_RR_pat<V6_vgth, setgt, VecQ16, HVI16>;
+ def: OpR_RR_pat<V6_vgtw, setgt, VecQ32, HVI32>;
+ def: OpR_RR_pat<V6_vgtub, setugt, VecQ8, HVI8>;
+ def: OpR_RR_pat<V6_vgtuh, setugt, VecQ16, HVI16>;
+ def: OpR_RR_pat<V6_vgtuw, setugt, VecQ32, HVI32>;
+
+ def: AccRRR_pat<V6_veqb_and, And, seteq, HQ8, HVI8, HVI8>;
+ def: AccRRR_pat<V6_veqb_or, Or, seteq, HQ8, HVI8, HVI8>;
+ def: AccRRR_pat<V6_veqb_xor, Xor, seteq, HQ8, HVI8, HVI8>;
+ def: AccRRR_pat<V6_veqh_and, And, seteq, HQ16, HVI16, HVI16>;
+ def: AccRRR_pat<V6_veqh_or, Or, seteq, HQ16, HVI16, HVI16>;
+ def: AccRRR_pat<V6_veqh_xor, Xor, seteq, HQ16, HVI16, HVI16>;
+ def: AccRRR_pat<V6_veqw_and, And, seteq, HQ32, HVI32, HVI32>;
+ def: AccRRR_pat<V6_veqw_or, Or, seteq, HQ32, HVI32, HVI32>;
+ def: AccRRR_pat<V6_veqw_xor, Xor, seteq, HQ32, HVI32, HVI32>;
+
+ def: AccRRR_pat<V6_vgtb_and, And, setgt, HQ8, HVI8, HVI8>;
+ def: AccRRR_pat<V6_vgtb_or, Or, setgt, HQ8, HVI8, HVI8>;
+ def: AccRRR_pat<V6_vgtb_xor, Xor, setgt, HQ8, HVI8, HVI8>;
+ def: AccRRR_pat<V6_vgth_and, And, setgt, HQ16, HVI16, HVI16>;
+ def: AccRRR_pat<V6_vgth_or, Or, setgt, HQ16, HVI16, HVI16>;
+ def: AccRRR_pat<V6_vgth_xor, Xor, setgt, HQ16, HVI16, HVI16>;
+ def: AccRRR_pat<V6_vgtw_and, And, setgt, HQ32, HVI32, HVI32>;
+ def: AccRRR_pat<V6_vgtw_or, Or, setgt, HQ32, HVI32, HVI32>;
+ def: AccRRR_pat<V6_vgtw_xor, Xor, setgt, HQ32, HVI32, HVI32>;
+
+ def: AccRRR_pat<V6_vgtub_and, And, setugt, HQ8, HVI8, HVI8>;
+ def: AccRRR_pat<V6_vgtub_or, Or, setugt, HQ8, HVI8, HVI8>;
+ def: AccRRR_pat<V6_vgtub_xor, Xor, setugt, HQ8, HVI8, HVI8>;
+ def: AccRRR_pat<V6_vgtuh_and, And, setugt, HQ16, HVI16, HVI16>;
+ def: AccRRR_pat<V6_vgtuh_or, Or, setugt, HQ16, HVI16, HVI16>;
+ def: AccRRR_pat<V6_vgtuh_xor, Xor, setugt, HQ16, HVI16, HVI16>;
+ def: AccRRR_pat<V6_vgtuw_and, And, setugt, HQ32, HVI32, HVI32>;
+ def: AccRRR_pat<V6_vgtuw_or, Or, setugt, HQ32, HVI32, HVI32>;
+ def: AccRRR_pat<V6_vgtuw_xor, Xor, setugt, HQ32, HVI32, HVI32>;
+}
diff --git a/lib/Target/Hexagon/HexagonPseudo.td b/lib/Target/Hexagon/HexagonPseudo.td
index b2d66317b66e..fd7466349ecd 100644
--- a/lib/Target/Hexagon/HexagonPseudo.td
+++ b/lib/Target/Hexagon/HexagonPseudo.td
@@ -24,7 +24,7 @@ let PrintMethod = "printGlobalOperand" in {
let isPseudo = 1 in {
let isCodeGenOnly = 0 in
def A2_iconst : Pseudo<(outs IntRegs:$Rd32),
- (ins s27_2Imm:$Ii), "${Rd32}=iconst(#${Ii})">;
+ (ins s27_2Imm:$Ii), "${Rd32} = iconst(#${Ii})">;
def DUPLEX_Pseudo : InstHexagon<(outs),
(ins s32_0Imm:$offset), "DUPLEX", [], "", DUPLEX, TypePSEUDO>;
@@ -34,7 +34,7 @@ let isExtendable = 1, opExtendable = 1, opExtentBits = 6,
isAsmParserOnly = 1 in
def TFRI64_V2_ext : InstHexagon<(outs DoubleRegs:$dst),
(ins s32_0Imm:$src1, s8_0Imm:$src2),
- "$dst=combine(#$src1,#$src2)", [], "",
+ "$dst = combine(#$src1,#$src2)", [], "",
A2_combineii.Itinerary, TypeALU32_2op>, OpcodeHexagon;
// HI/LO Instructions
@@ -44,7 +44,7 @@ class REG_IMMED<string RegHalf, bit Rs, bits<3> MajOp, bit MinOp,
InstHexagon rootInst>
: InstHexagon<(outs IntRegs:$dst),
(ins u16_0Imm:$imm_value),
- "$dst"#RegHalf#"=#$imm_value", [], "",
+ "$dst"#RegHalf#" = #$imm_value", [], "",
rootInst.Itinerary, rootInst.Type>, OpcodeHexagon {
bits<5> dst;
bits<32> imm_value;
@@ -102,6 +102,13 @@ def ENDLOOP1 : Endloop<(outs), (ins b30_2Imm:$offset),
[]>;
}
+let isBranch = 1, isTerminator = 1, hasSideEffects = 0,
+ Defs = [PC, LC0, LC1], Uses = [SA0, SA1, LC0, LC1] in {
+def ENDLOOP01 : Endloop<(outs), (ins b30_2Imm:$offset),
+ ":endloop01",
+ []>;
+}
+
let isExtendable = 1, isExtentSigned = 1, opExtentBits = 9, opExtentAlign = 2,
opExtendable = 0, hasSideEffects = 0 in
class LOOP_iBase<string mnemonic, InstHexagon rootInst>
@@ -316,7 +323,7 @@ def LDriw_pred : LDInst<(outs PredRegs:$dst),
// Load modifier.
let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
-def LDriw_mod : LDInst<(outs ModRegs:$dst),
+def LDriw_ctr : LDInst<(outs CtrRegs:$dst),
(ins IntRegs:$addr, s32_0Imm:$off),
".error \"should not emit\"", []>;
@@ -399,42 +406,42 @@ let isCall = 1, Uses = [R29, R31], isAsmParserOnly = 1 in {
}
// Vector store pseudos
-let Predicates = [HasV60T, UseHVX], isPseudo = 1, isCodeGenOnly = 1,
+let Predicates = [HasV60,UseHVX], isPseudo = 1, isCodeGenOnly = 1,
mayStore = 1, accessSize = HVXVectorAccess, hasSideEffects = 0 in
class STrivv_template<RegisterClass RC, InstHexagon rootInst>
: InstHexagon<(outs), (ins IntRegs:$addr, s32_0Imm:$off, RC:$src),
"", [], "", rootInst.Itinerary, rootInst.Type>;
def PS_vstorerw_ai: STrivv_template<HvxWR, V6_vS32b_ai>,
- Requires<[HasV60T,UseHVX]>;
+ Requires<[HasV60,UseHVX]>;
def PS_vstorerw_nt_ai: STrivv_template<HvxWR, V6_vS32b_nt_ai>,
- Requires<[HasV60T,UseHVX]>;
+ Requires<[HasV60,UseHVX]>;
def PS_vstorerwu_ai: STrivv_template<HvxWR, V6_vS32Ub_ai>,
- Requires<[HasV60T,UseHVX]>;
+ Requires<[HasV60,UseHVX]>;
let isPseudo = 1, isCodeGenOnly = 1, mayStore = 1, hasSideEffects = 0 in
def PS_vstorerq_ai: Pseudo<(outs),
(ins IntRegs:$Rs, s32_0Imm:$Off, HvxQR:$Qt), "", []>,
- Requires<[HasV60T,UseHVX]>;
+ Requires<[HasV60,UseHVX]>;
// Vector load pseudos
-let Predicates = [HasV60T, UseHVX], isPseudo = 1, isCodeGenOnly = 1,
+let Predicates = [HasV60, UseHVX], isPseudo = 1, isCodeGenOnly = 1,
mayLoad = 1, accessSize = HVXVectorAccess, hasSideEffects = 0 in
class LDrivv_template<RegisterClass RC, InstHexagon rootInst>
: InstHexagon<(outs RC:$dst), (ins IntRegs:$addr, s32_0Imm:$off),
"", [], "", rootInst.Itinerary, rootInst.Type>;
def PS_vloadrw_ai: LDrivv_template<HvxWR, V6_vL32b_ai>,
- Requires<[HasV60T,UseHVX]>;
+ Requires<[HasV60,UseHVX]>;
def PS_vloadrw_nt_ai: LDrivv_template<HvxWR, V6_vL32b_nt_ai>,
- Requires<[HasV60T,UseHVX]>;
+ Requires<[HasV60,UseHVX]>;
def PS_vloadrwu_ai: LDrivv_template<HvxWR, V6_vL32Ub_ai>,
- Requires<[HasV60T,UseHVX]>;
+ Requires<[HasV60,UseHVX]>;
let isPseudo = 1, isCodeGenOnly = 1, mayLoad = 1, hasSideEffects = 0 in
def PS_vloadrq_ai: Pseudo<(outs HvxQR:$Qd),
(ins IntRegs:$Rs, s32_0Imm:$Off), "", []>,
- Requires<[HasV60T,UseHVX]>;
+ Requires<[HasV60,UseHVX]>;
let isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
@@ -443,10 +450,20 @@ class VSELInst<dag outs, dag ins, InstHexagon rootInst>
def PS_vselect: VSELInst<(outs HvxVR:$dst),
(ins PredRegs:$src1, HvxVR:$src2, HvxVR:$src3), V6_vcmov>,
- Requires<[HasV60T,UseHVX]>;
+ Requires<[HasV60,UseHVX]>;
def PS_wselect: VSELInst<(outs HvxWR:$dst),
(ins PredRegs:$src1, HvxWR:$src2, HvxWR:$src3), V6_vccombine>,
- Requires<[HasV60T,UseHVX]>;
+ Requires<[HasV60,UseHVX]>;
+
+let hasSideEffects = 0, isReMaterializable = 1, isPseudo = 1,
+ isCodeGenOnly = 1 in {
+ def PS_qtrue: InstHexagon<(outs HvxQR:$Qd), (ins), "", [], "",
+ V6_veqw.Itinerary, TypeCVI_VA>;
+ def PS_qfalse: InstHexagon<(outs HvxQR:$Qd), (ins), "", [], "",
+ V6_vgtw.Itinerary, TypeCVI_VA>;
+ def PS_vdd0: InstHexagon<(outs HvxWR:$Vd), (ins), "", [], "",
+ V6_vsubw_dv.Itinerary, TypeCVI_VA_DV>;
+}
// Store predicate.
let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
@@ -457,8 +474,8 @@ def STriw_pred : STInst<(outs),
// Store modifier.
let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
-def STriw_mod : STInst<(outs),
- (ins IntRegs:$addr, s32_0Imm:$off, ModRegs:$src1),
+def STriw_ctr : STInst<(outs),
+ (ins IntRegs:$addr, s32_0Imm:$off, CtrRegs:$src1),
".error \"should not emit\"", []>;
let isExtendable = 1, opExtendable = 1, opExtentBits = 6,
@@ -499,3 +516,46 @@ def DuplexIClassC: InstDuplex < 0xC >;
def DuplexIClassD: InstDuplex < 0xD >;
def DuplexIClassE: InstDuplex < 0xE >;
def DuplexIClassF: InstDuplex < 0xF >;
+
+// Pseudos for circular buffer instructions. These are needed in order to
+// allocate the correct pair of CSx and Mx registers.
+multiclass NewCircularLoad<RegisterClass RC, MemAccessSize MS> {
+
+let isCodeGenOnly = 1, isPseudo = 1, Defs = [CS], Uses = [CS],
+ addrMode = PostInc, accessSize = MS, hasSideEffects = 0 in {
+ def NAME#_pci : LDInst<(outs RC:$Rd32, IntRegs:$Rx32),
+ (ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2, IntRegs:$Cs),
+ ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_4403ca65>;
+
+ def NAME#_pcr : LDInst<(outs RC:$Rd32, IntRegs:$Rx32),
+ (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Cs),
+ ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_2fc0c436>;
+}
+}
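
Editor's note: for context, circular ("pci"/"pcr") addressing post-increments a pointer and wraps it inside a fixed-length buffer, which is why the pseudos above carry both a ModRegs operand and an extra IntRegs start-address operand. A scalar C++ model of the wrap, under the assumption of the usual circular-buffer semantics (start address in the CS register, buffer length configured through the M register):

#include <cassert>
#include <cstdint>

// Post-increment with wrap-around: returns the next pointer within a
// circular buffer of `length` bytes starting at `start`.
uint32_t circAdd(uint32_t ptr, int32_t inc, uint32_t start, uint32_t length) {
  uint32_t off = ptr - start + uint32_t(inc);
  if (int32_t(off) < 0)
    off += length;    // wrapped below the start of the buffer
  else if (off >= length)
    off -= length;    // wrapped past the end of the buffer
  return start + off;
}

int main() {
  // 16-byte buffer at address 0x1000, stepping by 8 bytes in each direction.
  assert(circAdd(0x1008, 8, 0x1000, 16) == 0x1000);   // wraps to the start
  assert(circAdd(0x1000, -8, 0x1000, 16) == 0x1008);  // wraps to the end
}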
+
+defm PS_loadrub : NewCircularLoad<IntRegs, ByteAccess>;
+defm PS_loadrb : NewCircularLoad<IntRegs, ByteAccess>;
+defm PS_loadruh : NewCircularLoad<IntRegs, HalfWordAccess>;
+defm PS_loadrh : NewCircularLoad<IntRegs, HalfWordAccess>;
+defm PS_loadri : NewCircularLoad<IntRegs, WordAccess>;
+defm PS_loadrd : NewCircularLoad<DoubleRegs, DoubleWordAccess>;
+
+multiclass NewCircularStore<RegisterClass RC, MemAccessSize MS> {
+
+let isCodeGenOnly = 1, isPseudo = 1, Defs = [CS], Uses = [CS],
+ addrMode = PostInc, accessSize = MS, hasSideEffects = 0 in {
+ def NAME#_pci : STInst<(outs IntRegs:$Rx32),
+ (ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2, RC:$Rt32, IntRegs:$Cs),
+ ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_9fdb5406>;
+
+ def NAME#_pcr : STInst<(outs IntRegs:$Rx32),
+ (ins IntRegs:$Rx32in, ModRegs:$Mu2, RC:$Rt32, IntRegs:$Cs),
+ ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_f86c328a>;
+}
+}
+
+defm PS_storerb : NewCircularStore<IntRegs, ByteAccess>;
+defm PS_storerh : NewCircularStore<IntRegs, HalfWordAccess>;
+defm PS_storerf : NewCircularStore<IntRegs, HalfWordAccess>;
+defm PS_storeri : NewCircularStore<IntRegs, WordAccess>;
+defm PS_storerd : NewCircularStore<DoubleRegs, WordAccess>;
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index 85d6a6b4089e..2e11f875c0f9 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -19,6 +19,7 @@
#include "HexagonTargetMachine.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -145,6 +146,13 @@ BitVector HexagonRegisterInfo::getReservedRegs(const MachineFunction &MF)
Reserved.set(Hexagon::R30);
Reserved.set(Hexagon::R31);
Reserved.set(Hexagon::VTMP);
+
+ // Guest registers.
+ Reserved.set(Hexagon::GELR); // G0
+ Reserved.set(Hexagon::GSR); // G1
+ Reserved.set(Hexagon::GOSP); // G2
+ Reserved.set(Hexagon::G3); // G3
+
// Control registers.
Reserved.set(Hexagon::SA0); // C0
Reserved.set(Hexagon::LC0); // C1
@@ -171,6 +179,9 @@ BitVector HexagonRegisterInfo::getReservedRegs(const MachineFunction &MF)
Reserved.set(Hexagon::C8);
Reserved.set(Hexagon::USR_OVF);
+ if (MF.getSubtarget<HexagonSubtarget>().hasReservedR19())
+ Reserved.set(Hexagon::R19);
+
for (int x = Reserved.find_first(); x >= 0; x = Reserved.find_next(x))
markSuperRegs(Reserved, x);
@@ -233,6 +244,55 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
}
+bool HexagonRegisterInfo::shouldCoalesce(MachineInstr *MI,
+ const TargetRegisterClass *SrcRC, unsigned SubReg,
+ const TargetRegisterClass *DstRC, unsigned DstSubReg,
+ const TargetRegisterClass *NewRC, LiveIntervals &LIS) const {
+ // Coalescing will extend the live interval of the destination register.
+ // If the destination register is a vector pair, avoid introducing function
+  // calls into the interval, since that could force spilling a vector pair
+  // instead of a single vector.
+ MachineFunction &MF = *MI->getParent()->getParent();
+ const HexagonSubtarget &HST = MF.getSubtarget<HexagonSubtarget>();
+ if (!HST.useHVXOps() || NewRC->getID() != Hexagon::HvxWRRegClass.getID())
+ return true;
+ bool SmallSrc = SrcRC->getID() == Hexagon::HvxVRRegClass.getID();
+ bool SmallDst = DstRC->getID() == Hexagon::HvxVRRegClass.getID();
+ if (!SmallSrc && !SmallDst)
+ return true;
+
+ unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ const SlotIndexes &Indexes = *LIS.getSlotIndexes();
+ auto HasCall = [&Indexes] (const LiveInterval::Segment &S) {
+ for (SlotIndex I = S.start.getBaseIndex(), E = S.end.getBaseIndex();
+ I != E; I = I.getNextIndex()) {
+ if (const MachineInstr *MI = Indexes.getInstructionFromIndex(I))
+ if (MI->isCall())
+ return true;
+ }
+ return false;
+ };
+
+ if (SmallSrc == SmallDst) {
+ // Both must be true, because the case for both being false was
+ // checked earlier. Both registers will be coalesced into a register
+ // of a wider class (HvxWR), and we don't want its live range to
+ // span over calls.
+ return !any_of(LIS.getInterval(DstReg), HasCall) &&
+ !any_of(LIS.getInterval(SrcReg), HasCall);
+ }
+
+  // If one register is large (HvxWR) and the other is small (HvxVR), then
+  // coalescing is OK if the large one is already live across a function
+  // call, or if the small one is not.
+ unsigned SmallReg = SmallSrc ? SrcReg : DstReg;
+ unsigned LargeReg = SmallSrc ? DstReg : SrcReg;
+ return any_of(LIS.getInterval(LargeReg), HasCall) ||
+ !any_of(LIS.getInterval(SmallReg), HasCall);
+}
+
+
unsigned HexagonRegisterInfo::getRARegister() const {
return Hexagon::R31;
}
@@ -283,6 +343,11 @@ bool HexagonRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF)
return MF.getSubtarget<HexagonSubtarget>().getFrameLowering()->hasFP(MF);
}
+const TargetRegisterClass *
+HexagonRegisterInfo::getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind) const {
+ return &Hexagon::IntRegsRegClass;
+}
unsigned HexagonRegisterInfo::getFirstCallerSavedNonParamReg() const {
return Hexagon::R6;
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.h b/lib/Target/Hexagon/HexagonRegisterInfo.h
index 4ead57da8fa1..497dc45236b1 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.h
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.h
@@ -39,6 +39,8 @@ public:
BitVector getReservedRegs(const MachineFunction &MF) const override;
+ bool enableMultipleCopyHints() const override { return true; }
+
void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
unsigned FIOperandNum, RegScavenger *RS = nullptr) const override;
@@ -61,6 +63,10 @@ public:
return true;
}
+ bool shouldCoalesce(MachineInstr *MI, const TargetRegisterClass *SrcRC,
+ unsigned SubReg, const TargetRegisterClass *DstRC, unsigned DstSubReg,
+ const TargetRegisterClass *NewRC, LiveIntervals &LIS) const override;
+
// Debug information queries.
unsigned getRARegister() const;
unsigned getFrameRegister(const MachineFunction &MF) const override;
@@ -75,6 +81,10 @@ public:
unsigned getFirstCallerSavedNonParamReg() const;
+ const TargetRegisterClass *
+ getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind = 0) const override;
+
bool isEHReturnCalleeSaveReg(unsigned Reg) const;
};
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.td b/lib/Target/Hexagon/HexagonRegisterInfo.td
index 1d1e85e7ac7e..1fe1ef4ac572 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.td
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.td
@@ -67,6 +67,17 @@ let Namespace = "Hexagon" in {
let HWEncoding{0} = num;
}
+ // Rg - Guest/Hypervisor registers
+ class Rg<bits<5> num, string n,
+ list<string> alt = [], list<Register> alias = []> :
+ HexagonReg<num, n, alt, alias>;
+
+ // Rgg - 64-bit Guest/Hypervisor registers
+ class Rgg<bits<5> num, string n, list<Register> subregs> :
+ HexagonDoubleReg<num, n, subregs> {
+ let SubRegs = subregs;
+ }
+
def isub_lo : SubRegIndex<32>;
def isub_hi : SubRegIndex<32, 32>;
def vsub_lo : SubRegIndex<512>;
@@ -200,40 +211,87 @@ let Namespace = "Hexagon" in {
def Q1 : Rq<1, "q1">, DwarfRegNum<[132]>;
def Q2 : Rq<2, "q2">, DwarfRegNum<[133]>;
def Q3 : Rq<3, "q3">, DwarfRegNum<[134]>;
+
+ // Guest Registers
+ def GELR: Rg<0, "gelr", ["g0"]>, DwarfRegNum<[220]>;
+ def GSR: Rg<1, "gsr", ["g1"]>, DwarfRegNum<[221]>;
+ def GOSP: Rg<2, "gosp", ["g2"]>, DwarfRegNum<[222]>;
+ def G3: Rg<3, "gbadva", ["g3"]>, DwarfRegNum<[223]>;
+ def G4: Rg<4, "g4">, DwarfRegNum<[224]>;
+ def G5: Rg<5, "g5">, DwarfRegNum<[225]>;
+ def G6: Rg<6, "g6">, DwarfRegNum<[226]>;
+ def G7: Rg<7, "g7">, DwarfRegNum<[227]>;
+ def G8: Rg<8, "g8">, DwarfRegNum<[228]>;
+ def G9: Rg<9, "g9">, DwarfRegNum<[229]>;
+ def G10: Rg<10, "g10">, DwarfRegNum<[230]>;
+ def G11: Rg<11, "g11">, DwarfRegNum<[231]>;
+ def G12: Rg<12, "g12">, DwarfRegNum<[232]>;
+ def G13: Rg<13, "g13">, DwarfRegNum<[233]>;
+ def G14: Rg<14, "g14">, DwarfRegNum<[234]>;
+ def G15: Rg<15, "g15">, DwarfRegNum<[235]>;
+ def GPMUCNT4: Rg<16, "gpmucnt4", ["g16"]>, DwarfRegNum<[236]>;
+ def GPMUCNT5: Rg<17, "gpmucnt5", ["g17"]>, DwarfRegNum<[237]>;
+ def GPMUCNT6: Rg<18, "gpmucnt6", ["g18"]>, DwarfRegNum<[238]>;
+ def GPMUCNT7: Rg<19, "gpmucnt7", ["g19"]>, DwarfRegNum<[239]>;
+ def G20: Rg<20, "g20">, DwarfRegNum<[240]>;
+ def G21: Rg<21, "g21">, DwarfRegNum<[241]>;
+ def G22: Rg<22, "g22">, DwarfRegNum<[242]>;
+ def G23: Rg<23, "g23">, DwarfRegNum<[243]>;
+ def GPCYCLELO: Rg<24, "gpcyclelo", ["g24"]>, DwarfRegNum<[244]>;
+ def GPCYCLEHI: Rg<25, "gpcyclehi", ["g25"]>, DwarfRegNum<[245]>;
+ def GPMUCNT0: Rg<26, "gpmucnt0", ["g26"]>, DwarfRegNum<[246]>;
+ def GPMUCNT1: Rg<27, "gpmucnt1", ["g27"]>, DwarfRegNum<[247]>;
+ def GPMUCNT2: Rg<28, "gpmucnt2", ["g28"]>, DwarfRegNum<[248]>;
+ def GPMUCNT3: Rg<29, "gpmucnt3", ["g29"]>, DwarfRegNum<[249]>;
+ def G30: Rg<30, "g30">, DwarfRegNum<[250]>;
+ def G31: Rg<31, "g31">, DwarfRegNum<[251]>;
+
+ // Guest Register Pairs
+ let SubRegIndices = [isub_lo, isub_hi], CoveredBySubRegs = 1 in {
+ def G1_0 : Rgg<0, "g1:0", [GELR, GSR]>, DwarfRegNum<[220]>;
+ def G3_2 : Rgg<2, "g3:2", [GOSP, G3]>, DwarfRegNum<[222]>;
+ def G5_4 : Rgg<4, "g5:4", [G4, G5]>, DwarfRegNum<[224]>;
+ def G7_6 : Rgg<6, "g7:6", [G6, G7]>, DwarfRegNum<[226]>;
+ def G9_8 : Rgg<8, "g9:8", [G8, G9]>, DwarfRegNum<[228]>;
+ def G11_10 : Rgg<10, "g11:10", [G10, G11]>, DwarfRegNum<[230]>;
+ def G13_12 : Rgg<12, "g13:12", [G12, G13]>, DwarfRegNum<[232]>;
+ def G15_14 : Rgg<14, "g15:14", [G14, G15]>, DwarfRegNum<[234]>;
+ def G17_16 : Rgg<16, "g17:16", [GPMUCNT4, GPMUCNT5]>, DwarfRegNum<[236]>;
+ def G19_18 : Rgg<18, "g19:18", [GPMUCNT6, GPMUCNT7]>, DwarfRegNum<[238]>;
+ def G21_20 : Rgg<20, "g21:20", [G20, G21]>, DwarfRegNum<[240]>;
+ def G23_22 : Rgg<22, "g23:22", [G22, G23]>, DwarfRegNum<[242]>;
+ def G25_24 : Rgg<24, "g25:24", [GPCYCLELO, GPCYCLEHI]>, DwarfRegNum<[244]>;
+ def G27_26 : Rgg<26, "g27:26", [GPMUCNT0, GPMUCNT1]>, DwarfRegNum<[246]>;
+ def G29_28 : Rgg<28, "g29:28", [GPMUCNT2, GPMUCNT3]>, DwarfRegNum<[248]>;
+ def G31_30 : Rgg<30, "g31:30", [G30, G31]>, DwarfRegNum<[250]>;
+ }
+
}
// HVX types
-def VecI1
- : ValueTypeByHwMode<[Hvx64, Hvx64old, Hvx128, Hvx128old, DefaultMode],
- [v512i1, v512i1, v1024i1, v1024i1, v512i1]>;
-def VecI8
- : ValueTypeByHwMode<[Hvx64, Hvx64old, Hvx128, Hvx128old, DefaultMode],
- [v64i8, v64i8, v128i8, v128i8, v64i8]>;
-def VecI16
- : ValueTypeByHwMode<[Hvx64, Hvx64old, Hvx128, Hvx128old, DefaultMode],
- [v32i16, v32i16, v64i16, v64i16, v32i16]>;
-def VecI32
- : ValueTypeByHwMode<[Hvx64, Hvx64old, Hvx128, Hvx128old, DefaultMode],
- [v16i32, v16i32, v32i32, v32i32, v16i32]>;
-def VecPI8
- : ValueTypeByHwMode<[Hvx64, Hvx64old, Hvx128, Hvx128old, DefaultMode],
- [v128i8, v128i8, v256i8, v256i8, v128i8]>;
-def VecPI16
- : ValueTypeByHwMode<[Hvx64, Hvx64old, Hvx128, Hvx128old, DefaultMode],
- [v64i16, v64i16, v128i16, v128i16, v64i16]>;
-def VecPI32
- : ValueTypeByHwMode<[Hvx64, Hvx64old, Hvx128, Hvx128old, DefaultMode],
- [v32i32, v32i32, v64i32, v64i32, v32i32]>;
-def VecQ8
- : ValueTypeByHwMode<[Hvx64, Hvx64old, Hvx128, Hvx128old, DefaultMode],
- [v64i1, v64i1, v128i1, v128i1, v64i1]>;
-def VecQ16
- : ValueTypeByHwMode<[Hvx64, Hvx64old, Hvx128, Hvx128old, DefaultMode],
- [v32i1, v32i1, v64i1, v64i1, v32i1]>;
-def VecQ32
- : ValueTypeByHwMode<[Hvx64, Hvx64old, Hvx128, Hvx128old, DefaultMode],
- [v16i1, v16i1, v32i1, v32i1, v16i1]>;
+def VecI1: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
+ [v512i1, v1024i1, v512i1]>;
+def VecI8: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
+ [v64i8, v128i8, v64i8]>;
+def VecI16: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
+ [v32i16, v64i16, v32i16]>;
+def VecI32: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
+ [v16i32, v32i32, v16i32]>;
+
+def VecPI8: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
+ [v128i8, v256i8, v128i8]>;
+def VecPI16: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
+ [v64i16, v128i16, v64i16]>;
+def VecPI32: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
+ [v32i32, v64i32, v32i32]>;
+
+def VecQ8: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
+ [v64i1, v128i1, v64i1]>;
+def VecQ16: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
+ [v32i1, v64i1, v32i1]>;
+def VecQ32: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
+ [v16i1, v32i1, v16i1]>;
// HVX register classes
@@ -242,7 +300,7 @@ def VecQ32
// FIXME: the register order should be defined in terms of the preferred
// allocation order...
//
-def IntRegs : RegisterClass<"Hexagon", [i32, f32, v32i1, v4i8, v2i16], 32,
+def IntRegs : RegisterClass<"Hexagon", [i32, f32, v4i8, v2i16], 32,
(add (sequence "R%u", 0, 9), (sequence "R%u", 12, 28),
R10, R11, R29, R30, R31)>;
@@ -254,8 +312,7 @@ def GeneralSubRegs : RegisterClass<"Hexagon", [i32], 32,
def IntRegsLow8 : RegisterClass<"Hexagon", [i32], 32,
(add R7, R6, R5, R4, R3, R2, R1, R0)> ;
-def DoubleRegs : RegisterClass<"Hexagon",
- [i64, f64, v64i1, v8i8, v4i16, v2i32], 64,
+def DoubleRegs : RegisterClass<"Hexagon", [i64, f64, v8i8, v4i16, v2i32], 64,
(add (sequence "D%u", 0, 4), (sequence "D%u", 6, 13), D5, D14, D15)>;
def GeneralDoubleLow8Regs : RegisterClass<"Hexagon", [i64], 64,
@@ -301,6 +358,25 @@ def CtrRegs64 : RegisterClass<"Hexagon", [i64], 64,
(add C1_0, C3_2, C5_4, C7_6, C9_8, C11_10, CS, UPCYCLE, C17_16,
PKTCOUNT, UTIMER)>;
+let Size = 32, isAllocatable = 0 in
+def GuestRegs : RegisterClass<"Hexagon", [i32], 32,
+ (add GELR, GSR, GOSP,
+ (sequence "G%u", 3, 15),
+ GPMUCNT4, GPMUCNT5, GPMUCNT6, GPMUCNT7,
+ G20, G21, G22, G23,
+ GPCYCLELO, GPCYCLEHI, GPMUCNT0, GPMUCNT1,
+ GPMUCNT2, GPMUCNT3,
+ G30, G31)>;
+
+let Size = 64, isAllocatable = 0 in
+def GuestRegs64 : RegisterClass<"Hexagon", [i64], 64,
+ (add G1_0, G3_2,
+ G5_4, G7_6, G9_8, G11_10, G13_12, G15_14,
+ G17_16, G19_18,
+ G21_20, G23_22,
+ G25_24, G27_26, G29_28,
+ G31_30)>;
+
// These registers are new for v62 and onward.
// The function RegisterMatchesArch() uses this list for validation.
let isAllocatable = 0 in
@@ -313,7 +389,6 @@ let Size = 32, isAllocatable = 0 in
def V65Regs : RegisterClass<"Hexagon", [i32], 32, (add VTMP)>;
-
def HexagonCSR
: CalleeSavedRegs<(add R16, R17, R18, R19, R20, R21, R22, R23,
R24, R25, R26, R27)>;
diff --git a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
index 3fe4cc73d2f3..c41f0d3c085c 100644
--- a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
+++ b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
@@ -60,14 +60,14 @@ INITIALIZE_PASS(HexagonSplitConst32AndConst64, "split-const-for-sdata",
"Hexagon Split Const32s and Const64s", false, false)
bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) {
- const HexagonTargetObjectFile &TLOF =
- *static_cast<const HexagonTargetObjectFile *>(
- Fn.getTarget().getObjFileLowering());
- if (TLOF.isSmallDataEnabled())
- return true;
+ auto &HST = Fn.getSubtarget<HexagonSubtarget>();
+ auto &HTM = static_cast<const HexagonTargetMachine&>(Fn.getTarget());
+ auto &TLOF = *HTM.getObjFileLowering();
+ if (HST.useSmallData() && TLOF.isSmallDataEnabled())
+ return false;
- const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo();
- const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
+ const TargetInstrInfo *TII = HST.getInstrInfo();
+ const TargetRegisterInfo *TRI = HST.getRegisterInfo();
// Loop over all of the basic blocks
for (MachineBasicBlock &B : Fn) {
@@ -109,7 +109,6 @@ bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) {
//===----------------------------------------------------------------------===//
// Public Constructor Functions
//===----------------------------------------------------------------------===//
-
FunctionPass *llvm::createHexagonSplitConst32AndConst64() {
return new HexagonSplitConst32AndConst64();
}
diff --git a/lib/Target/Hexagon/HexagonSplitDouble.cpp b/lib/Target/Hexagon/HexagonSplitDouble.cpp
index c9f5400018e8..e018785f24d8 100644
--- a/lib/Target/Hexagon/HexagonSplitDouble.cpp
+++ b/lib/Target/Hexagon/HexagonSplitDouble.cpp
@@ -26,6 +26,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
@@ -55,6 +56,8 @@ static cl::opt<int> MaxHSDR("max-hsdr", cl::Hidden, cl::init(-1),
cl::desc("Maximum number of split partitions"));
static cl::opt<bool> MemRefsFixed("hsdr-no-mem", cl::Hidden, cl::init(true),
cl::desc("Do not split loads or stores"));
+ static cl::opt<bool> SplitAll("hsdr-split-all", cl::Hidden, cl::init(false),
+ cl::desc("Split all partitions"));
namespace {
@@ -62,9 +65,7 @@ namespace {
public:
static char ID;
- HexagonSplitDoubleRegs() : MachineFunctionPass(ID) {
- initializeHexagonSplitDoubleRegsPass(*PassRegistry::getPassRegistry());
- }
+ HexagonSplitDoubleRegs() : MachineFunctionPass(ID) {}
StringRef getPassName() const override {
return "Hexagon Split Double Registers";
@@ -97,6 +98,7 @@ namespace {
bool isFixedInstr(const MachineInstr *MI) const;
void partitionRegisters(UUSetMap &P2Rs);
int32_t profit(const MachineInstr *MI) const;
+ int32_t profit(unsigned Reg) const;
bool isProfitable(const USet &Part, LoopRegMap &IRM) const;
void collectIndRegsForLoop(const MachineLoop *L, USet &Rs);
@@ -161,7 +163,7 @@ bool HexagonSplitDoubleRegs::isFixedInstr(const MachineInstr *MI) const {
if (MI->mayLoad() || MI->mayStore())
if (MemRefsFixed || isVolatileInstr(MI))
return true;
- if (MI->isDebugValue())
+ if (MI->isDebugInstr())
return false;
unsigned Opc = MI->getOpcode();
@@ -244,7 +246,7 @@ void HexagonSplitDoubleRegs::partitionRegisters(UUSetMap &P2Rs) {
if (FixedRegs[x])
continue;
unsigned R = TargetRegisterInfo::index2VirtReg(x);
- DEBUG(dbgs() << printReg(R, TRI) << " ~~");
+ LLVM_DEBUG(dbgs() << printReg(R, TRI) << " ~~");
USet &Asc = AssocMap[R];
for (auto U = MRI->use_nodbg_begin(R), Z = MRI->use_nodbg_end();
U != Z; ++U) {
@@ -267,13 +269,13 @@ void HexagonSplitDoubleRegs::partitionRegisters(UUSetMap &P2Rs) {
unsigned u = TargetRegisterInfo::virtReg2Index(T);
if (FixedRegs[u])
continue;
- DEBUG(dbgs() << ' ' << printReg(T, TRI));
+ LLVM_DEBUG(dbgs() << ' ' << printReg(T, TRI));
Asc.insert(T);
// Make it symmetric.
AssocMap[T].insert(R);
}
}
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
}
UUMap R2P;
@@ -304,15 +306,10 @@ void HexagonSplitDoubleRegs::partitionRegisters(UUSetMap &P2Rs) {
P2Rs[I.second].insert(I.first);
}
-static inline int32_t profitImm(unsigned Lo, unsigned Hi) {
+static inline int32_t profitImm(unsigned Imm) {
int32_t P = 0;
- bool LoZ1 = false, HiZ1 = false;
- if (Lo == 0 || Lo == 0xFFFFFFFF)
- P += 10, LoZ1 = true;
- if (Hi == 0 || Hi == 0xFFFFFFFF)
- P += 10, HiZ1 = true;
- if (!LoZ1 && !HiZ1 && Lo == Hi)
- P += 3;
+ if (Imm == 0 || Imm == 0xFFFFFFFF)
+ P += 10;
return P;
}
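
Editor's note: with the change above, a 64-bit constant is now scored per 32-bit half instead of as a combined pair. A quick worked example of the new scoring (a trivial re-statement of the patched profitImm, not a separate implementation):

#include <cassert>
#include <cstdint>

// Same scoring as the patched profitImm: a half that is all-zeros or
// all-ones is worth 10, anything else is worth 0.
int32_t profitImm(uint32_t imm) {
  return (imm == 0 || imm == 0xFFFFFFFFu) ? 10 : 0;
}

int main() {
  uint64_t d = 0x00000000FFFFFFFFull;           // low half all-ones, high half zero
  uint32_t lo = uint32_t(d), hi = uint32_t(d >> 32);
  assert(profitImm(lo) + profitImm(hi) == 20);  // both halves are "easy" immediates
  assert(profitImm(0x12345678u) == 0);          // an arbitrary half scores nothing
}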
@@ -342,21 +339,28 @@ int32_t HexagonSplitDoubleRegs::profit(const MachineInstr *MI) const {
uint64_t D = MI->getOperand(1).getImm();
unsigned Lo = D & 0xFFFFFFFFULL;
unsigned Hi = D >> 32;
- return profitImm(Lo, Hi);
+ return profitImm(Lo) + profitImm(Hi);
}
case Hexagon::A2_combineii:
- case Hexagon::A4_combineii:
- return profitImm(MI->getOperand(1).getImm(),
- MI->getOperand(2).getImm());
+ case Hexagon::A4_combineii: {
+ const MachineOperand &Op1 = MI->getOperand(1);
+ const MachineOperand &Op2 = MI->getOperand(2);
+ int32_t Prof1 = Op1.isImm() ? profitImm(Op1.getImm()) : 0;
+ int32_t Prof2 = Op2.isImm() ? profitImm(Op2.getImm()) : 0;
+ return Prof1 + Prof2;
+ }
case Hexagon::A4_combineri:
ImmX++;
// Fall through into A4_combineir.
LLVM_FALLTHROUGH;
case Hexagon::A4_combineir: {
ImmX++;
- int64_t V = MI->getOperand(ImmX).getImm();
- if (V == 0 || V == -1)
- return 10;
+ const MachineOperand &OpX = MI->getOperand(ImmX);
+ if (OpX.isImm()) {
+ int64_t V = OpX.getImm();
+ if (V == 0 || V == -1)
+ return 10;
+ }
// Fall through into A2_combinew.
LLVM_FALLTHROUGH;
}
@@ -368,8 +372,11 @@ int32_t HexagonSplitDoubleRegs::profit(const MachineInstr *MI) const {
case Hexagon::A2_andp:
case Hexagon::A2_orp:
- case Hexagon::A2_xorp:
- return 1;
+ case Hexagon::A2_xorp: {
+ unsigned Rs = MI->getOperand(1).getReg();
+ unsigned Rt = MI->getOperand(2).getReg();
+ return profit(Rs) + profit(Rt);
+ }
case Hexagon::S2_asl_i_p_or: {
unsigned S = MI->getOperand(3).getImm();
@@ -393,6 +400,25 @@ int32_t HexagonSplitDoubleRegs::profit(const MachineInstr *MI) const {
return 0;
}
+int32_t HexagonSplitDoubleRegs::profit(unsigned Reg) const {
+ assert(TargetRegisterInfo::isVirtualRegister(Reg));
+
+ const MachineInstr *DefI = MRI->getVRegDef(Reg);
+ switch (DefI->getOpcode()) {
+ case Hexagon::A2_tfrpi:
+ case Hexagon::CONST64:
+ case Hexagon::A2_combineii:
+ case Hexagon::A4_combineii:
+ case Hexagon::A4_combineri:
+ case Hexagon::A4_combineir:
+ case Hexagon::A2_combinew:
+ return profit(DefI);
+ default:
+ break;
+ }
+ return 0;
+}
+
bool HexagonSplitDoubleRegs::isProfitable(const USet &Part, LoopRegMap &IRM)
const {
unsigned FixedNum = 0, LoopPhiNum = 0;
@@ -442,7 +468,9 @@ bool HexagonSplitDoubleRegs::isProfitable(const USet &Part, LoopRegMap &IRM)
if (FixedNum > 0 && LoopPhiNum > 0)
TotalP -= 20*LoopPhiNum;
- DEBUG(dbgs() << "Partition profit: " << TotalP << '\n');
+ LLVM_DEBUG(dbgs() << "Partition profit: " << TotalP << '\n');
+ if (SplitAll)
+ return true;
return TotalP > 0;
}
@@ -535,7 +563,7 @@ void HexagonSplitDoubleRegs::collectIndRegsForLoop(const MachineLoop *L,
Rs.insert(CmpR1);
Rs.insert(CmpR2);
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "For loop at " << printMBBReference(*HB) << " ind regs: ";
dump_partition(dbgs(), Rs, *TRI);
dbgs() << '\n';
@@ -710,23 +738,21 @@ void HexagonSplitDoubleRegs::splitCombine(MachineInstr *MI,
assert(F != PairMap.end());
const UUPair &P = F->second;
- if (Op1.isImm()) {
+ if (!Op1.isReg()) {
BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), P.second)
- .addImm(Op1.getImm());
- } else if (Op1.isReg()) {
+ .add(Op1);
+ } else {
BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), P.second)
.addReg(Op1.getReg(), getRegState(Op1), Op1.getSubReg());
- } else
- llvm_unreachable("Unexpected operand");
+ }
- if (Op2.isImm()) {
+ if (!Op2.isReg()) {
BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), P.first)
- .addImm(Op2.getImm());
- } else if (Op2.isReg()) {
+ .add(Op2);
+ } else {
BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), P.first)
.addReg(Op2.getReg(), getRegState(Op2), Op2.getSubReg());
- } else
- llvm_unreachable("Unexpected operand");
+ }
}
void HexagonSplitDoubleRegs::splitExt(MachineInstr *MI,
@@ -970,7 +996,7 @@ bool HexagonSplitDoubleRegs::splitInstr(MachineInstr *MI,
const UUPairMap &PairMap) {
using namespace Hexagon;
- DEBUG(dbgs() << "Splitting: " << *MI);
+ LLVM_DEBUG(dbgs() << "Splitting: " << *MI);
bool Split = false;
unsigned Opc = MI->getOpcode();
@@ -1104,8 +1130,8 @@ bool HexagonSplitDoubleRegs::splitPartition(const USet &Part) {
const TargetRegisterClass *IntRC = &Hexagon::IntRegsRegClass;
bool Changed = false;
- DEBUG(dbgs() << "Splitting partition: "; dump_partition(dbgs(), Part, *TRI);
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "Splitting partition: ";
+ dump_partition(dbgs(), Part, *TRI); dbgs() << '\n');
UUPairMap PairMap;
@@ -1122,8 +1148,9 @@ bool HexagonSplitDoubleRegs::splitPartition(const USet &Part) {
unsigned LoR = MRI->createVirtualRegister(IntRC);
unsigned HiR = MRI->createVirtualRegister(IntRC);
- DEBUG(dbgs() << "Created mapping: " << printReg(DR, TRI) << " -> "
- << printReg(HiR, TRI) << ':' << printReg(LoR, TRI) << '\n');
+ LLVM_DEBUG(dbgs() << "Created mapping: " << printReg(DR, TRI) << " -> "
+ << printReg(HiR, TRI) << ':' << printReg(LoR, TRI)
+ << '\n');
PairMap.insert(std::make_pair(DR, UUPair(LoR, HiR)));
}
@@ -1160,12 +1187,12 @@ bool HexagonSplitDoubleRegs::splitPartition(const USet &Part) {
}
bool HexagonSplitDoubleRegs::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "Splitting double registers in function: "
- << MF.getName() << '\n');
-
if (skipFunction(MF.getFunction()))
return false;
+ LLVM_DEBUG(dbgs() << "Splitting double registers in function: "
+ << MF.getName() << '\n');
+
auto &ST = MF.getSubtarget<HexagonSubtarget>();
TRI = ST.getRegisterInfo();
TII = ST.getInstrInfo();
@@ -1178,7 +1205,7 @@ bool HexagonSplitDoubleRegs::runOnMachineFunction(MachineFunction &MF) {
collectIndRegs(IRM);
partitionRegisters(P2Rs);
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Register partitioning: (partition #0 is fixed)\n";
for (UUSetMap::iterator I = P2Rs.begin(), E = P2Rs.end(); I != E; ++I) {
dbgs() << '#' << I->first << " -> ";
@@ -1196,7 +1223,8 @@ bool HexagonSplitDoubleRegs::runOnMachineFunction(MachineFunction &MF) {
if (Limit >= 0 && Counter >= Limit)
break;
USet &Part = I->second;
- DEBUG(dbgs() << "Calculating profit for partition #" << I->first << '\n');
+ LLVM_DEBUG(dbgs() << "Calculating profit for partition #" << I->first
+ << '\n');
if (!isProfitable(Part, IRM))
continue;
Counter++;
diff --git a/lib/Target/Hexagon/HexagonStoreWidening.cpp b/lib/Target/Hexagon/HexagonStoreWidening.cpp
index 300f6de33552..991af047387e 100644
--- a/lib/Target/Hexagon/HexagonStoreWidening.cpp
+++ b/lib/Target/Hexagon/HexagonStoreWidening.cpp
@@ -433,10 +433,11 @@ bool HexagonStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG,
const MCInstrDesc &StD = TII->get(WOpc);
MachineOperand &MR = FirstSt->getOperand(0);
int64_t Off = FirstSt->getOperand(1).getImm();
- MachineInstr *StI = BuildMI(*MF, DL, StD)
- .addReg(MR.getReg(), getKillRegState(MR.isKill()))
- .addImm(Off)
- .addImm(Val);
+ MachineInstr *StI =
+ BuildMI(*MF, DL, StD)
+ .addReg(MR.getReg(), getKillRegState(MR.isKill()), MR.getSubReg())
+ .addImm(Off)
+ .addImm(Val);
StI->addMemOperand(*MF, NewM);
NG.push_back(StI);
} else {
@@ -455,10 +456,11 @@ bool HexagonStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG,
const MCInstrDesc &StD = TII->get(WOpc);
MachineOperand &MR = FirstSt->getOperand(0);
int64_t Off = FirstSt->getOperand(1).getImm();
- MachineInstr *StI = BuildMI(*MF, DL, StD)
- .addReg(MR.getReg(), getKillRegState(MR.isKill()))
- .addImm(Off)
- .addReg(VReg, RegState::Kill);
+ MachineInstr *StI =
+ BuildMI(*MF, DL, StD)
+ .addReg(MR.getReg(), getKillRegState(MR.isKill()), MR.getSubReg())
+ .addImm(Off)
+ .addReg(VReg, RegState::Kill);
StI->addMemOperand(*MF, NewM);
NG.push_back(StI);
}
@@ -472,7 +474,7 @@ bool HexagonStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG,
// from OG was (in the order in which they appeared in the basic block).
// (The ordering in OG does not have to match the order in the basic block.)
bool HexagonStoreWidening::replaceStores(InstrGroup &OG, InstrGroup &NG) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Replacing:\n";
for (auto I : OG)
dbgs() << " " << *I;
@@ -576,7 +578,7 @@ bool HexagonStoreWidening::processBasicBlock(MachineBasicBlock &MBB) {
};
for (auto &G : SGs) {
assert(G.size() > 1 && "Store group with fewer than 2 elements");
- std::sort(G.begin(), G.end(), Less);
+ llvm::sort(G.begin(), G.end(), Less);
Changed |= processStoreGroup(G);
}
diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp
index 6f1f6c46a107..0686d6eb6118 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -15,13 +15,14 @@
#include "HexagonInstrInfo.h"
#include "HexagonRegisterInfo.h"
#include "HexagonSubtarget.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
-#include "MCTargetDesc/HexagonMCTargetDesc.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/Support/CommandLine.h"
@@ -38,17 +39,6 @@ using namespace llvm;
#define GET_SUBTARGETINFO_TARGET_DESC
#include "HexagonGenSubtargetInfo.inc"
-static cl::opt<bool> EnableMemOps("enable-hexagon-memops",
- cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed, cl::init(true),
- cl::desc("Generate V4 MEMOP in code generation for Hexagon target"));
-
-static cl::opt<bool> DisableMemOps("disable-hexagon-memops",
- cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed, cl::init(false),
- cl::desc("Do not generate V4 MEMOP in code generation for Hexagon target"));
-
-static cl::opt<bool> EnableIEEERndNear("enable-hexagon-ieee-rnd-near",
- cl::Hidden, cl::ZeroOrMore, cl::init(false),
- cl::desc("Generate non-chopped conversion from fp to int."));
static cl::opt<bool> EnableBSBSched("enable-bsb-sched",
cl::Hidden, cl::ZeroOrMore, cl::init(true));
@@ -60,9 +50,6 @@ static cl::opt<bool> EnableDotCurSched("enable-cur-sched",
cl::Hidden, cl::ZeroOrMore, cl::init(true),
cl::desc("Enable the scheduler to generate .cur"));
-static cl::opt<bool> EnableVecFrwdSched("enable-evec-frwd-sched",
- cl::Hidden, cl::ZeroOrMore, cl::init(true));
-
static cl::opt<bool> DisableHexagonMISched("disable-hexagon-misched",
cl::Hidden, cl::ZeroOrMore, cl::init(false),
cl::desc("Disable Hexagon MI Scheduling"));
@@ -105,6 +92,7 @@ HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU,
HexagonSubtarget &
HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
static std::map<StringRef, Hexagon::ArchEnum> CpuTable{
+ {"generic", Hexagon::ArchEnum::V60},
{"hexagonv4", Hexagon::ArchEnum::V4},
{"hexagonv5", Hexagon::ArchEnum::V5},
{"hexagonv55", Hexagon::ArchEnum::V55},
@@ -123,9 +111,7 @@ HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
UseHVX64BOps = false;
UseLongCalls = false;
- UseMemOps = DisableMemOps ? false : EnableMemOps;
- ModeIEEERndNear = EnableIEEERndNear;
- UseBSBScheduling = hasV60TOps() && EnableBSBSched;
+ UseBSBScheduling = hasV60Ops() && EnableBSBSched;
ParseSubtargetFeatures(CPUString, FS);
@@ -204,11 +190,14 @@ bool HexagonSubtarget::CallMutation::shouldTFRICallBind(
Type == HexagonII::TypeALU64 || Type == HexagonII::TypeM;
}
-void HexagonSubtarget::CallMutation::apply(ScheduleDAGInstrs *DAG) {
+void HexagonSubtarget::CallMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
+ ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
SUnit* LastSequentialCall = nullptr;
- unsigned VRegHoldingRet = 0;
- unsigned RetRegister;
- SUnit* LastUseOfRet = nullptr;
+ // Map from virtual register to physical register from the copy.
+ DenseMap<unsigned, unsigned> VRegHoldingReg;
+ // Map from the physical register to the instruction that uses the virtual
+ // register. This is used to create the barrier edge.
+ DenseMap<unsigned, SUnit *> LastVRegUse;
auto &TRI = *DAG->MF.getSubtarget().getRegisterInfo();
auto &HII = *DAG->MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
@@ -220,13 +209,15 @@ void HexagonSubtarget::CallMutation::apply(ScheduleDAGInstrs *DAG) {
LastSequentialCall = &DAG->SUnits[su];
// Look for a compare that defines a predicate.
else if (DAG->SUnits[su].getInstr()->isCompare() && LastSequentialCall)
- DAG->SUnits[su].addPred(SDep(LastSequentialCall, SDep::Barrier));
+ DAG->addEdge(&DAG->SUnits[su], SDep(LastSequentialCall, SDep::Barrier));
// Look for call and tfri* instructions.
else if (SchedPredsCloser && LastSequentialCall && su > 1 && su < e-1 &&
shouldTFRICallBind(HII, DAG->SUnits[su], DAG->SUnits[su+1]))
- DAG->SUnits[su].addPred(SDep(&DAG->SUnits[su-1], SDep::Barrier));
- // Prevent redundant register copies between two calls, which are caused by
- // both the return value and the argument for the next call being in %r0.
+ DAG->addEdge(&DAG->SUnits[su], SDep(&DAG->SUnits[su-1], SDep::Barrier));
+ // Prevent redundant register copies due to reads and writes of physical
+ // registers. The original motivation for this was the code generated
+ // between two calls, which is caused by both the return value and the
+ // argument for the next call being in %r0.
// Example:
// 1: <call1>
// 2: %vreg = COPY %r0
@@ -235,21 +226,37 @@ void HexagonSubtarget::CallMutation::apply(ScheduleDAGInstrs *DAG) {
// 5: <call2>
// The scheduler would often swap 3 and 4, so an additional register is
// needed. This code inserts a Barrier dependence between 3 & 4 to prevent
- // this. The same applies for %d0 and %v0/%w0, which are also handled.
+ // this.
+ // The code below checks for all the physical registers, not just R0/D0/V0.
else if (SchedRetvalOptimization) {
const MachineInstr *MI = DAG->SUnits[su].getInstr();
- if (MI->isCopy() && (MI->readsRegister(Hexagon::R0, &TRI) ||
- MI->readsRegister(Hexagon::V0, &TRI))) {
- // %vreg = COPY %r0
- VRegHoldingRet = MI->getOperand(0).getReg();
- RetRegister = MI->getOperand(1).getReg();
- LastUseOfRet = nullptr;
- } else if (VRegHoldingRet && MI->readsVirtualRegister(VRegHoldingRet))
- // <use of %X>
- LastUseOfRet = &DAG->SUnits[su];
- else if (LastUseOfRet && MI->definesRegister(RetRegister, &TRI))
- // %r0 = ...
- DAG->SUnits[su].addPred(SDep(LastUseOfRet, SDep::Barrier));
+ if (MI->isCopy() &&
+ TargetRegisterInfo::isPhysicalRegister(MI->getOperand(1).getReg())) {
+ // %vregX = COPY %r0
+ VRegHoldingReg[MI->getOperand(0).getReg()] = MI->getOperand(1).getReg();
+ LastVRegUse.erase(MI->getOperand(1).getReg());
+ } else {
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg())
+ continue;
+ if (MO.isUse() && !MI->isCopy() &&
+ VRegHoldingReg.count(MO.getReg())) {
+ // <use of %vregX>
+ LastVRegUse[VRegHoldingReg[MO.getReg()]] = &DAG->SUnits[su];
+ } else if (MO.isDef() &&
+ TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+ for (MCRegAliasIterator AI(MO.getReg(), &TRI, true); AI.isValid();
+ ++AI) {
+ if (LastVRegUse.count(*AI) &&
+ LastVRegUse[*AI] != &DAG->SUnits[su])
+ // %r0 = ...
+ DAG->addEdge(&DAG->SUnits[su], SDep(LastVRegUse[*AI], SDep::Barrier));
+ LastVRegUse.erase(*AI);
+ }
+ }
+ }
+ }
}
}
}
@@ -300,7 +307,7 @@ void HexagonSubtarget::BankConflictMutation::apply(ScheduleDAGInstrs *DAG) {
}
}
-/// \brief Enable use of alias analysis during code generation (during MI
+/// Enable use of alias analysis during code generation (during MI
/// scheduling, DAGCombine, etc.).
bool HexagonSubtarget::useAA() const {
if (OptLevel != CodeGenOpt::None)
@@ -308,7 +315,7 @@ bool HexagonSubtarget::useAA() const {
return false;
}
-/// \brief Perform target specific adjustments to the latency of a schedule
+/// Perform target specific adjustments to the latency of a schedule
/// dependency.
void HexagonSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
SDep &Dep) const {
@@ -328,25 +335,30 @@ void HexagonSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
return;
}
- if (!hasV60TOps())
+ if (!hasV60Ops())
return;
- // If it's a REG_SEQUENCE, use its destination instruction to determine
+ // Set the latency for a copy to zero since we hope that it will get removed.
+ if (DstInst->isCopy())
+ Dep.setLatency(0);
+
+ // If it's a REG_SEQUENCE/COPY, use its destination instruction to determine
// the correct latency.
- if (DstInst->isRegSequence() && Dst->NumSuccs == 1) {
- unsigned RSeqReg = DstInst->getOperand(0).getReg();
- MachineInstr *RSeqDst = Dst->Succs[0].getSUnit()->getInstr();
+ if ((DstInst->isRegSequence() || DstInst->isCopy()) && Dst->NumSuccs == 1) {
+ unsigned DReg = DstInst->getOperand(0).getReg();
+ MachineInstr *DDst = Dst->Succs[0].getSUnit()->getInstr();
unsigned UseIdx = -1;
- for (unsigned OpNum = 0; OpNum < RSeqDst->getNumOperands(); OpNum++) {
- const MachineOperand &MO = RSeqDst->getOperand(OpNum);
- if (MO.isReg() && MO.getReg() && MO.isUse() && MO.getReg() == RSeqReg) {
+ for (unsigned OpNum = 0; OpNum < DDst->getNumOperands(); OpNum++) {
+ const MachineOperand &MO = DDst->getOperand(OpNum);
+ if (MO.isReg() && MO.getReg() && MO.isUse() && MO.getReg() == DReg) {
UseIdx = OpNum;
break;
}
}
- unsigned RSeqLatency = (InstrInfo.getOperandLatency(&InstrItins, *SrcInst,
- 0, *RSeqDst, UseIdx));
- Dep.setLatency(RSeqLatency);
+ int DLatency = (InstrInfo.getOperandLatency(&InstrItins, *SrcInst,
+ 0, *DDst, UseIdx));
+ DLatency = std::max(DLatency, 0);
+ Dep.setLatency((unsigned)DLatency);
}
// Try to schedule uses near definitions to generate .cur.
@@ -394,7 +406,7 @@ void HexagonSubtarget::updateLatency(MachineInstr &SrcInst,
return;
}
- if (!hasV60TOps())
+ if (!hasV60Ops())
return;
auto &QII = static_cast<const HexagonInstrInfo&>(*getInstrInfo());
@@ -418,6 +430,7 @@ void HexagonSubtarget::restoreLatency(SUnit *Src, SUnit *Dst) const {
}
assert(DefIdx >= 0 && "Def Reg not found in Src MI");
MachineInstr *DstI = Dst->getInstr();
+ SDep T = I;
for (unsigned OpNum = 0; OpNum < DstI->getNumOperands(); OpNum++) {
const MachineOperand &MO = DstI->getOperand(OpNum);
if (MO.isReg() && MO.isUse() && MO.getReg() == DepR) {
@@ -426,8 +439,7 @@ void HexagonSubtarget::restoreLatency(SUnit *Src, SUnit *Dst) const {
// For some instructions (ex: COPY), we might end up with < 0 latency
// as they don't have any Itinerary class associated with them.
- if (Latency <= 0)
- Latency = 1;
+ Latency = std::max(Latency, 0);
I.setLatency(Latency);
updateLatency(*SrcI, *DstI, I);
@@ -435,11 +447,10 @@ void HexagonSubtarget::restoreLatency(SUnit *Src, SUnit *Dst) const {
}
// Update the latency of opposite edge too.
- for (auto &J : Dst->Preds) {
- if (J.getSUnit() != Src)
- continue;
- J.setLatency(I.getLatency());
- }
+ T.setSUnit(Src);
+ auto F = std::find(Dst->Preds.begin(), Dst->Preds.end(), T);
+ assert(F != Dst->Preds.end());
+ F->setLatency(I.getLatency());
}
}
@@ -447,7 +458,7 @@ void HexagonSubtarget::restoreLatency(SUnit *Src, SUnit *Dst) const {
void HexagonSubtarget::changeLatency(SUnit *Src, SUnit *Dst, unsigned Lat)
const {
for (auto &I : Src->Succs) {
- if (I.getSUnit() != Dst)
+ if (!I.isAssignedRegDep() || I.getSUnit() != Dst)
continue;
SDep T = I;
I.setLatency(Lat);
@@ -456,7 +467,7 @@ void HexagonSubtarget::changeLatency(SUnit *Src, SUnit *Dst, unsigned Lat)
T.setSUnit(Src);
auto F = std::find(Dst->Preds.begin(), Dst->Preds.end(), T);
assert(F != Dst->Preds.end());
- F->setLatency(I.getLatency());
+ F->setLatency(Lat);
}
}
@@ -519,13 +530,13 @@ bool HexagonSubtarget::isBestZeroLatency(SUnit *Src, SUnit *Dst,
// Reassign the latency for the previous bests, which requires setting
// the dependence edge in both directions.
if (SrcBest != nullptr) {
- if (!hasV60TOps())
+ if (!hasV60Ops())
changeLatency(SrcBest, Dst, 1);
else
restoreLatency(SrcBest, Dst);
}
if (DstBest != nullptr) {
- if (!hasV60TOps())
+ if (!hasV60Ops())
changeLatency(Src, DstBest, 1);
else
restoreLatency(Src, DstBest);
diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h
index af93f20d97fc..dc8d173a5057 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/lib/Target/Hexagon/HexagonSubtarget.h
@@ -32,9 +32,6 @@
#define GET_SUBTARGETINFO_HEADER
#include "HexagonGenSubtargetInfo.inc"
-#define Hexagon_SMALL_DATA_THRESHOLD 8
-#define Hexagon_SLOTS 4
-
namespace llvm {
class MachineInstr;
@@ -46,12 +43,20 @@ class Triple;
class HexagonSubtarget : public HexagonGenSubtargetInfo {
virtual void anchor();
- bool UseMemOps, UseHVX64BOps, UseHVX128BOps;
- bool UseLongCalls;
- bool ModeIEEERndNear;
+ bool UseHVX64BOps = false;
+ bool UseHVX128BOps = false;
+
+ bool UseLongCalls = false;
+ bool UseMemops = false;
+ bool UsePackets = false;
+ bool UseNewValueJumps = false;
+ bool UseNewValueStores = false;
+ bool UseSmallData = false;
bool HasMemNoShuf = false;
bool EnableDuplex = false;
+ bool ReservedR19 = false;
+
public:
Hexagon::ArchEnum HexagonArchVersion;
Hexagon::ArchEnum HexagonHVXVersion = Hexagon::ArchEnum::V4;
@@ -115,44 +120,50 @@ public:
/// subtarget options. Definition of function is auto generated by tblgen.
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
- bool useMemOps() const { return UseMemOps; }
- bool hasV5TOps() const {
+ bool hasV5Ops() const {
return getHexagonArchVersion() >= Hexagon::ArchEnum::V5;
}
- bool hasV5TOpsOnly() const {
+ bool hasV5OpsOnly() const {
return getHexagonArchVersion() == Hexagon::ArchEnum::V5;
}
- bool hasV55TOps() const {
+ bool hasV55Ops() const {
return getHexagonArchVersion() >= Hexagon::ArchEnum::V55;
}
- bool hasV55TOpsOnly() const {
+ bool hasV55OpsOnly() const {
return getHexagonArchVersion() == Hexagon::ArchEnum::V55;
}
- bool hasV60TOps() const {
+ bool hasV60Ops() const {
return getHexagonArchVersion() >= Hexagon::ArchEnum::V60;
}
- bool hasV60TOpsOnly() const {
+ bool hasV60OpsOnly() const {
return getHexagonArchVersion() == Hexagon::ArchEnum::V60;
}
- bool hasV62TOps() const {
+ bool hasV62Ops() const {
return getHexagonArchVersion() >= Hexagon::ArchEnum::V62;
}
- bool hasV62TOpsOnly() const {
+ bool hasV62OpsOnly() const {
return getHexagonArchVersion() == Hexagon::ArchEnum::V62;
}
- bool hasV65TOps() const {
+ bool hasV65Ops() const {
return getHexagonArchVersion() >= Hexagon::ArchEnum::V65;
}
- bool hasV65TOpsOnly() const {
+ bool hasV65OpsOnly() const {
return getHexagonArchVersion() == Hexagon::ArchEnum::V65;
}
- bool modeIEEERndNear() const { return ModeIEEERndNear; }
+ bool useLongCalls() const { return UseLongCalls; }
+ bool useMemops() const { return UseMemops; }
+ bool usePackets() const { return UsePackets; }
+ bool useNewValueJumps() const { return UseNewValueJumps; }
+ bool useNewValueStores() const { return UseNewValueStores; }
+ bool useSmallData() const { return UseSmallData; }
+
bool useHVXOps() const { return HexagonHVXVersion > Hexagon::ArchEnum::V4; }
bool useHVX128BOps() const { return useHVXOps() && UseHVX128BOps; }
bool useHVX64BOps() const { return useHVXOps() && UseHVX64BOps; }
+
bool hasMemNoShuf() const { return HasMemNoShuf; }
- bool useLongCalls() const { return UseLongCalls; }
+ bool hasReservedR19() const { return ReservedR19; }
bool usePredicatedCalls() const;
bool useBSBScheduling() const { return UseBSBScheduling; }
@@ -170,11 +181,6 @@ public:
const std::string &getCPUString () const { return CPUString; }
- // Threshold for small data section
- unsigned getSmallDataThreshold() const {
- return Hexagon_SMALL_DATA_THRESHOLD;
- }
-
const Hexagon::ArchEnum &getHexagonArchVersion() const {
return HexagonArchVersion;
}
@@ -187,11 +193,11 @@ public:
std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
const override;
- /// \brief Enable use of alias analysis during code generation (during MI
+ /// Enable use of alias analysis during code generation (during MI
/// scheduling, DAGCombine, etc.).
bool useAA() const override;
- /// \brief Perform target specific adjustments to the latency of a schedule
+ /// Perform target specific adjustments to the latency of a schedule
/// dependency.
void adjustSchedDependency(SUnit *def, SUnit *use, SDep& dep) const override;
@@ -238,6 +244,12 @@ public:
return llvm::any_of(ElemTypes, [ElemTy] (MVT T) { return ElemTy == T; });
}
+ unsigned getTypeAlignment(MVT Ty) const {
+ if (isHVXVectorType(Ty, true))
+ return getVectorLength();
+ return Ty.getSizeInBits() / 8;
+ }
+
unsigned getL1CacheLineSize() const;
unsigned getL1PrefetchDistance() const;
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 363b703fef28..2c75e9139ad7 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -94,9 +94,8 @@ static cl::opt<bool> EnableVectorPrint("enable-hexagon-vector-print",
cl::Hidden, cl::ZeroOrMore, cl::init(false),
cl::desc("Enable Hexagon Vector print instr pass"));
-static cl::opt<bool> EnableTrapUnreachable("hexagon-trap-unreachable",
- cl::Hidden, cl::ZeroOrMore, cl::init(false),
- cl::desc("Enable generating trap for unreachable"));
+static cl::opt<bool> EnableVExtractOpt("hexagon-opt-vextract", cl::Hidden,
+ cl::ZeroOrMore, cl::init(true), cl::desc("Enable vextract optimization"));
/// HexagonTargetMachineModule - Note that this is used on hosts that
/// cannot link in a library unless there are references into the
@@ -122,7 +121,9 @@ SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler",
namespace llvm {
extern char &HexagonExpandCondsetsID;
+ void initializeHexagonBitSimplifyPass(PassRegistry&);
void initializeHexagonConstExtendersPass(PassRegistry&);
+ void initializeHexagonConstPropagationPass(PassRegistry&);
void initializeHexagonEarlyIfConversionPass(PassRegistry&);
void initializeHexagonExpandCondsetsPass(PassRegistry&);
void initializeHexagonGenMuxPass(PassRegistry&);
@@ -133,6 +134,8 @@ namespace llvm {
void initializeHexagonOptAddrModePass(PassRegistry&);
void initializeHexagonPacketizerPass(PassRegistry&);
void initializeHexagonRDFOptPass(PassRegistry&);
+ void initializeHexagonSplitDoubleRegsPass(PassRegistry&);
+ void initializeHexagonVExtractPass(PassRegistry&);
Pass *createHexagonLoopIdiomPass();
Pass *createHexagonVectorLoopCarriedReusePass();
@@ -165,6 +168,7 @@ namespace llvm {
FunctionPass *createHexagonSplitDoubleRegs();
FunctionPass *createHexagonStoreWidening();
FunctionPass *createHexagonVectorPrint();
+ FunctionPass *createHexagonVExtract();
} // end namespace llvm;
static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
@@ -184,7 +188,9 @@ extern "C" void LLVMInitializeHexagonTarget() {
RegisterTargetMachine<HexagonTargetMachine> X(getTheHexagonTarget());
PassRegistry &PR = *PassRegistry::getPassRegistry();
+ initializeHexagonBitSimplifyPass(PR);
initializeHexagonConstExtendersPass(PR);
+ initializeHexagonConstPropagationPass(PR);
initializeHexagonEarlyIfConversionPass(PR);
initializeHexagonGenMuxPass(PR);
initializeHexagonHardwareLoopsPass(PR);
@@ -194,6 +200,8 @@ extern "C" void LLVMInitializeHexagonTarget() {
initializeHexagonOptAddrModePass(PR);
initializeHexagonPacketizerPass(PR);
initializeHexagonRDFOptPass(PR);
+ initializeHexagonSplitDoubleRegsPass(PR);
+ initializeHexagonVExtractPass(PR);
}
HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT,
@@ -213,8 +221,6 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT,
TT, CPU, FS, Options, getEffectiveRelocModel(RM),
getEffectiveCodeModel(CM), (HexagonNoOpt ? CodeGenOpt::None : OL)),
TLOF(make_unique<HexagonTargetObjectFile>()) {
- if (EnableTrapUnreachable)
- this->Options.TrapUnreachable = true;
initializeHexagonExpandCondsetsPass(*PassRegistry::getPassRegistry());
initAsmInfo();
}
@@ -299,6 +305,11 @@ void HexagonPassConfig::addIRPasses() {
TargetPassConfig::addIRPasses();
bool NoOpt = (getOptLevel() == CodeGenOpt::None);
+ if (!NoOpt) {
+ addPass(createConstantPropagationPass());
+ addPass(createDeadCodeEliminationPass());
+ }
+
addPass(createAtomicExpandPass());
if (!NoOpt) {
if (EnableLoopPrefetch)
@@ -321,6 +332,8 @@ bool HexagonPassConfig::addInstSelector() {
addPass(createHexagonISelDag(TM, getOptLevel()));
if (!NoOpt) {
+ if (EnableVExtractOpt)
+ addPass(createHexagonVExtract());
// Create logical operations on predicate registers.
if (EnableGenPred)
addPass(createHexagonGenPredicate());
diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
index ea86c9c42f47..e771f383dffa 100644
--- a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
+++ b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
@@ -74,7 +74,7 @@ static cl::opt<bool>
if (TraceGVPlacement) { \
TRACE_TO(errs(), X); \
} else { \
- DEBUG(TRACE_TO(dbgs(), X)); \
+ LLVM_DEBUG(TRACE_TO(dbgs(), X)); \
} \
} while (false)
#endif
@@ -200,11 +200,11 @@ MCSection *HexagonTargetObjectFile::getExplicitSectionGlobal(
bool HexagonTargetObjectFile::isGlobalInSmallSection(const GlobalObject *GO,
const TargetMachine &TM) const {
// Only global variables, not functions.
- DEBUG(dbgs() << "Checking if value is in small-data, -G"
- << SmallDataThreshold << ": \"" << GO->getName() << "\": ");
+ LLVM_DEBUG(dbgs() << "Checking if value is in small-data, -G"
+ << SmallDataThreshold << ": \"" << GO->getName() << "\": ");
const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GO);
if (!GVar) {
- DEBUG(dbgs() << "no, not a global variable\n");
+ LLVM_DEBUG(dbgs() << "no, not a global variable\n");
return false;
}
@@ -213,19 +213,19 @@ bool HexagonTargetObjectFile::isGlobalInSmallSection(const GlobalObject *GO,
// small data or not. This is how we can support mixing -G0/-G8 in LTO.
if (GVar->hasSection()) {
bool IsSmall = isSmallDataSection(GVar->getSection());
- DEBUG(dbgs() << (IsSmall ? "yes" : "no") << ", has section: "
- << GVar->getSection() << '\n');
+ LLVM_DEBUG(dbgs() << (IsSmall ? "yes" : "no")
+ << ", has section: " << GVar->getSection() << '\n');
return IsSmall;
}
if (GVar->isConstant()) {
- DEBUG(dbgs() << "no, is a constant\n");
+ LLVM_DEBUG(dbgs() << "no, is a constant\n");
return false;
}
bool IsLocal = GVar->hasLocalLinkage();
if (!StaticsInSData && IsLocal) {
- DEBUG(dbgs() << "no, is static\n");
+ LLVM_DEBUG(dbgs() << "no, is static\n");
return false;
}
@@ -234,7 +234,7 @@ bool HexagonTargetObjectFile::isGlobalInSmallSection(const GlobalObject *GO,
GType = PT->getElementType();
if (isa<ArrayType>(GType)) {
- DEBUG(dbgs() << "no, is an array\n");
+ LLVM_DEBUG(dbgs() << "no, is an array\n");
return false;
}
@@ -244,22 +244,22 @@ bool HexagonTargetObjectFile::isGlobalInSmallSection(const GlobalObject *GO,
// these objects end up in the sdata, the references will still be valid.
if (StructType *ST = dyn_cast<StructType>(GType)) {
if (ST->isOpaque()) {
- DEBUG(dbgs() << "no, has opaque type\n");
+ LLVM_DEBUG(dbgs() << "no, has opaque type\n");
return false;
}
}
unsigned Size = GVar->getParent()->getDataLayout().getTypeAllocSize(GType);
if (Size == 0) {
- DEBUG(dbgs() << "no, has size 0\n");
+ LLVM_DEBUG(dbgs() << "no, has size 0\n");
return false;
}
if (Size > SmallDataThreshold) {
- DEBUG(dbgs() << "no, size exceeds sdata threshold: " << Size << '\n');
+ LLVM_DEBUG(dbgs() << "no, size exceeds sdata threshold: " << Size << '\n');
return false;
}
- DEBUG(dbgs() << "yes\n");
+ LLVM_DEBUG(dbgs() << "yes\n");
return true;
}
diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index d638503990ad..a496a17788d5 100644
--- a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -16,23 +16,59 @@
#include "HexagonTargetTransformInfo.h"
#include "HexagonSubtarget.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/User.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
using namespace llvm;
#define DEBUG_TYPE "hexagontti"
+static cl::opt<bool> HexagonAutoHVX("hexagon-autohvx", cl::init(false),
+ cl::Hidden, cl::desc("Enable loop vectorizer for HVX"));
+
static cl::opt<bool> EmitLookupTables("hexagon-emit-lookup-tables",
cl::init(true), cl::Hidden,
cl::desc("Control lookup table emission on Hexagon target"));
+// Constant "cost factor" to make floating point operations more expensive
+// in terms of vectorization cost. This isn't the best way, but it should
+// do. Ultimately, the cost should use cycles.
+static const unsigned FloatFactor = 4;
+
+bool HexagonTTIImpl::useHVX() const {
+ return ST.useHVXOps() && HexagonAutoHVX;
+}
+
+bool HexagonTTIImpl::isTypeForHVX(Type *VecTy) const {
+ assert(VecTy->isVectorTy());
+ // Avoid types like <2 x i32*>.
+ if (!cast<VectorType>(VecTy)->getElementType()->isIntegerTy())
+ return false;
+ EVT VecVT = EVT::getEVT(VecTy);
+ if (!VecVT.isSimple() || VecVT.getSizeInBits() <= 64)
+ return false;
+ if (ST.isHVXVectorType(VecVT.getSimpleVT()))
+ return true;
+ auto Action = TLI.getPreferredVectorAction(VecVT);
+ return Action == TargetLoweringBase::TypeWidenVector;
+}
+
+unsigned HexagonTTIImpl::getTypeNumElements(Type *Ty) const {
+ if (Ty->isVectorTy())
+ return Ty->getVectorNumElements();
+ assert((Ty->isIntegerTy() || Ty->isFloatingPointTy()) &&
+ "Expecting scalar type");
+ return 1;
+}
+
TargetTransformInfo::PopcntSupportKind
HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const {
- // Return Fast Hardware support as every input < 64 bits will be promoted
+ // Return fast hardware support as every input < 64 bits will be promoted
// to 64 bits.
return TargetTransformInfo::PSK_FastHardware;
}
@@ -41,37 +77,223 @@ HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const {
void HexagonTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
UP.Runtime = UP.Partial = true;
+ // Only try to peel innermost loops with small runtime trip counts.
+ if (L && L->empty() && canPeel(L) &&
+ SE.getSmallConstantTripCount(L) == 0 &&
+ SE.getSmallConstantMaxTripCount(L) > 0 &&
+ SE.getSmallConstantMaxTripCount(L) <= 5) {
+ UP.PeelCount = 2;
+ }
+}
+
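The peeling heuristic above targets innermost loops whose exact trip count is unknown (getSmallConstantTripCount returns 0) but whose maximum trip count is provably small (at most 5), and it asks for two peeled iterations. As a rough illustration only — the function below is made up and not part of the patch, and whether SCEV actually proves the bound depends on the exact IR — a loop of this shape is the intended target:

// Trip count depends on N, but SCEV's range analysis can bound it by 3,
// so the hook above would request UP.PeelCount = 2 for the enclosing loop.
void scaleTail(float *A, unsigned N) {
  for (unsigned I = 0; I < (N & 3u); ++I) // runs 0..3 times
    A[I] *= 2.0f;
}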
+bool HexagonTTIImpl::shouldFavorPostInc() const {
+ return true;
+}
+
+/// --- Vector TTI begin ---
+
+unsigned HexagonTTIImpl::getNumberOfRegisters(bool Vector) const {
+ if (Vector)
+ return useHVX() ? 32 : 0;
+ return 32;
+}
+
+unsigned HexagonTTIImpl::getMaxInterleaveFactor(unsigned VF) {
+ return useHVX() ? 2 : 0;
+}
+
+unsigned HexagonTTIImpl::getRegisterBitWidth(bool Vector) const {
+ return Vector ? getMinVectorRegisterBitWidth() : 32;
+}
+
+unsigned HexagonTTIImpl::getMinVectorRegisterBitWidth() const {
+ return useHVX() ? ST.getVectorLength()*8 : 0;
+}
+
+unsigned HexagonTTIImpl::getMinimumVF(unsigned ElemWidth) const {
+ return (8 * ST.getVectorLength()) / ElemWidth;
+}
+
+unsigned HexagonTTIImpl::getScalarizationOverhead(Type *Ty, bool Insert,
+ bool Extract) {
+ return BaseT::getScalarizationOverhead(Ty, Insert, Extract);
+}
+
+unsigned HexagonTTIImpl::getOperandsScalarizationOverhead(
+ ArrayRef<const Value*> Args, unsigned VF) {
+ return BaseT::getOperandsScalarizationOverhead(Args, VF);
+}
+
+unsigned HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy,
+ ArrayRef<Type*> Tys) {
+ return BaseT::getCallInstrCost(F, RetTy, Tys);
+}
+
+unsigned HexagonTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+ ArrayRef<Value*> Args, FastMathFlags FMF, unsigned VF) {
+ return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
+}
+
+unsigned HexagonTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+ ArrayRef<Type*> Tys, FastMathFlags FMF,
+ unsigned ScalarizationCostPassed) {
+ if (ID == Intrinsic::bswap) {
+ std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, RetTy);
+ return LT.first + 2;
+ }
+ return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
+ ScalarizationCostPassed);
+}
+
+unsigned HexagonTTIImpl::getAddressComputationCost(Type *Tp,
+ ScalarEvolution *SE, const SCEV *S) {
+ return 0;
+}
+
+unsigned HexagonTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+ unsigned Alignment, unsigned AddressSpace, const Instruction *I) {
+ assert(Opcode == Instruction::Load || Opcode == Instruction::Store);
+ if (Opcode == Instruction::Store)
+ return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, I);
+
+ if (Src->isVectorTy()) {
+ VectorType *VecTy = cast<VectorType>(Src);
+ unsigned VecWidth = VecTy->getBitWidth();
+ if (useHVX() && isTypeForHVX(VecTy)) {
+ unsigned RegWidth = getRegisterBitWidth(true);
+ Alignment = std::min(Alignment, RegWidth/8);
+ // Cost of HVX loads.
+ if (VecWidth % RegWidth == 0)
+ return VecWidth / RegWidth;
+ // Cost of constructing HVX vector from scalar loads.
+ unsigned AlignWidth = 8 * std::max(1u, Alignment);
+ unsigned NumLoads = alignTo(VecWidth, AlignWidth) / AlignWidth;
+ return 3*NumLoads;
+ }
+
+ // Non-HVX vectors.
+ // Add extra cost for floating point types.
+ unsigned Cost = VecTy->getElementType()->isFloatingPointTy() ? FloatFactor
+ : 1;
+ Alignment = std::min(Alignment, 8u);
+ unsigned AlignWidth = 8 * std::max(1u, Alignment);
+ unsigned NumLoads = alignTo(VecWidth, AlignWidth) / AlignWidth;
+ if (Alignment == 4 || Alignment == 8)
+ return Cost * NumLoads;
+ // Loads of less than 32 bits will need extra inserts to compose a vector.
+ unsigned LogA = Log2_32(Alignment);
+ return (3 - LogA) * Cost * NumLoads;
+ }
+
+ return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, I);
+}
+
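To make the load-cost arithmetic above concrete, here is a standalone restatement of it with a few sample inputs. This is only an illustration: the helper names are invented, FloatFactor = 4 and the 128-byte HVX register width are the values used in this patch, and alignTo/Log2_32 are re-implemented locally so the sketch compiles on its own.

#include <algorithm>
#include <cstdio>

static unsigned alignToBits(unsigned V, unsigned A) { return (V + A - 1) / A * A; }
static unsigned log2u(unsigned V) { unsigned L = 0; while (V >>= 1) ++L; return L; }

// Mirrors the vector-load branch of getMemoryOpCost above.
static unsigned loadCost(unsigned VecWidthBits, unsigned Alignment, bool IsFP,
                         bool HvxEligible) {
  const unsigned FloatFactor = 4;        // FP penalty used in this patch
  const unsigned RegWidth = 128 * 8;     // 128-byte HVX registers assumed
  if (HvxEligible) {
    unsigned A = std::min(Alignment, RegWidth / 8);
    if (VecWidthBits % RegWidth == 0)
      return VecWidthBits / RegWidth;    // whole-register HVX loads
    unsigned AlignWidth = 8 * std::max(1u, A);
    return 3 * (alignToBits(VecWidthBits, AlignWidth) / AlignWidth);
  }
  unsigned Cost = IsFP ? FloatFactor : 1;
  unsigned A = std::min(Alignment, 8u);
  unsigned AlignWidth = 8 * std::max(1u, A);
  unsigned NumLoads = alignToBits(VecWidthBits, AlignWidth) / AlignWidth;
  if (A == 4 || A == 8)
    return Cost * NumLoads;
  return (3 - log2u(A)) * Cost * NumLoads; // under-aligned: extra inserts
}

int main() {
  std::printf("<64 x i32>, 128B aligned, HVX: %u\n", loadCost(2048, 128, false, true)); // 2
  std::printf("<4 x float>, 4B aligned:       %u\n", loadCost(128, 4, true, false));    // 16
  std::printf("<8 x i16>, 2B aligned:         %u\n", loadCost(128, 2, false, false));   // 16
}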
+unsigned HexagonTTIImpl::getMaskedMemoryOpCost(unsigned Opcode,
+ Type *Src, unsigned Alignment, unsigned AddressSpace) {
+ return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
+}
+
+unsigned HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
+ int Index, Type *SubTp) {
+ return 1;
+}
+
+unsigned HexagonTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
+ Value *Ptr, bool VariableMask, unsigned Alignment) {
+ return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
+ Alignment);
+}
+
+unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode,
+ Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+ unsigned Alignment, unsigned AddressSpace) {
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+}
+
+unsigned HexagonTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+ Type *CondTy, const Instruction *I) {
+ if (ValTy->isVectorTy()) {
+ std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, ValTy);
+ if (Opcode == Instruction::FCmp)
+ return LT.first + FloatFactor * getTypeNumElements(ValTy);
+ }
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}
-unsigned HexagonTTIImpl::getNumberOfRegisters(bool vector) const {
- return vector ? 0 : 32;
+unsigned HexagonTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+ TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
+ TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value*> Args) {
+ if (Ty->isVectorTy()) {
+ std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, Ty);
+ if (LT.second.isFloatingPoint())
+ return LT.first + FloatFactor * getTypeNumElements(Ty);
+ }
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ Opd1PropInfo, Opd2PropInfo, Args);
}
+unsigned HexagonTTIImpl::getCastInstrCost(unsigned Opcode, Type *DstTy,
+ Type *SrcTy, const Instruction *I) {
+ if (SrcTy->isFPOrFPVectorTy() || DstTy->isFPOrFPVectorTy()) {
+ unsigned SrcN = SrcTy->isFPOrFPVectorTy() ? getTypeNumElements(SrcTy) : 0;
+ unsigned DstN = DstTy->isFPOrFPVectorTy() ? getTypeNumElements(DstTy) : 0;
+
+ std::pair<int, MVT> SrcLT = TLI.getTypeLegalizationCost(DL, SrcTy);
+ std::pair<int, MVT> DstLT = TLI.getTypeLegalizationCost(DL, DstTy);
+ return std::max(SrcLT.first, DstLT.first) + FloatFactor * (SrcN + DstN);
+ }
+ return 1;
+}
+
+unsigned HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index) {
+ Type *ElemTy = Val->isVectorTy() ? cast<VectorType>(Val)->getElementType()
+ : Val;
+ if (Opcode == Instruction::InsertElement) {
+ // Need two rotations for non-zero index.
+ unsigned Cost = (Index != 0) ? 2 : 0;
+ if (ElemTy->isIntegerTy(32))
+ return Cost;
+ // If it's not a 32-bit value, there will need to be an extract.
+ return Cost + getVectorInstrCost(Instruction::ExtractElement, Val, Index);
+ }
+
+ if (Opcode == Instruction::ExtractElement)
+ return 2;
+
+ return 1;
+}
+
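For reference, a few values produced by the insert/extract cost model above, worked out by hand (the element types and indices are arbitrary examples):

  insertelement of i32 at index 0   -> 0 (no rotation needed)
  insertelement of i32 at index 5   -> 2 (two rotations)
  insertelement of i16 at index 5   -> 2 + 2 = 4 (rotations plus the extract needed for a non-32-bit element)
  extractelement at any index       -> 2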
+/// --- Vector TTI end ---
+
unsigned HexagonTTIImpl::getPrefetchDistance() const {
- return getST()->getL1PrefetchDistance();
+ return ST.getL1PrefetchDistance();
}
unsigned HexagonTTIImpl::getCacheLineSize() const {
- return getST()->getL1CacheLineSize();
+ return ST.getL1CacheLineSize();
}
int HexagonTTIImpl::getUserCost(const User *U,
ArrayRef<const Value *> Operands) {
- auto isCastFoldedIntoLoad = [](const CastInst *CI) -> bool {
+ auto isCastFoldedIntoLoad = [this](const CastInst *CI) -> bool {
if (!CI->isIntegerCast())
return false;
+ // Only extensions from an integer type shorter than 32-bit to i32
+ // can be folded into the load.
+ const DataLayout &DL = getDataLayout();
+ unsigned SBW = DL.getTypeSizeInBits(CI->getSrcTy());
+ unsigned DBW = DL.getTypeSizeInBits(CI->getDestTy());
+ if (DBW != 32 || SBW >= DBW)
+ return false;
+
const LoadInst *LI = dyn_cast<const LoadInst>(CI->getOperand(0));
// Technically, this code could allow multiple uses of the load, and
// check if all the uses are the same extension operation, but this
// should be sufficient for most cases.
- if (!LI || !LI->hasOneUse())
- return false;
-
- // Only extensions from an integer type shorter than 32-bit to i32
- // can be folded into the load.
- unsigned SBW = CI->getSrcTy()->getIntegerBitWidth();
- unsigned DBW = CI->getDestTy()->getIntegerBitWidth();
- return DBW == 32 && (SBW < DBW);
+ return LI && LI->hasOneUse();
};
if (const CastInst *CI = dyn_cast<const CastInst>(U))
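The reworked lambda above treats an integer cast as foldable only when it is a single-use extension from a sub-32-bit integer type to i32, since Hexagon can fold that extension into the load itself. A minimal C++ function that produces exactly this pattern, given only as an illustration (it is not part of the patch):

// The i16 load has one use, and the sign-extension of its result to i32 is
// expected to fold into the load, so isCastFoldedIntoLoad returns true for it.
int loadAndWiden(const short *P) {
  return *P;
}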
@@ -81,5 +303,5 @@ int HexagonTTIImpl::getUserCost(const User *U,
}
bool HexagonTTIImpl::shouldBuildLookupTables() const {
- return EmitLookupTables;
+ return EmitLookupTables;
}
diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index d2cd05012afa..a232f99fc407 100644
--- a/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -37,16 +37,24 @@ class HexagonTTIImpl : public BasicTTIImplBase<HexagonTTIImpl> {
friend BaseT;
- const HexagonSubtarget *ST;
- const HexagonTargetLowering *TLI;
+ const HexagonSubtarget &ST;
+ const HexagonTargetLowering &TLI;
- const HexagonSubtarget *getST() const { return ST; }
- const HexagonTargetLowering *getTLI() const { return TLI; }
+ const HexagonSubtarget *getST() const { return &ST; }
+ const HexagonTargetLowering *getTLI() const { return &TLI; }
+
+ bool useHVX() const;
+ bool isTypeForHVX(Type *VecTy) const;
+
+ // Returns the number of vector elements of Ty, if Ty is a vector type,
+ // or 1 if Ty is a scalar type. It is incorrect to call this function
+ // with any other type.
+ unsigned getTypeNumElements(Type *Ty) const;
public:
explicit HexagonTTIImpl(const HexagonTargetMachine *TM, const Function &F)
- : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
- TLI(ST->getTargetLowering()) {}
+ : BaseT(TM, F.getParent()->getDataLayout()),
+ ST(*TM->getSubtargetImpl(F)), TLI(*ST.getTargetLowering()) {}
/// \name Scalar TTI Implementations
/// @{
@@ -57,6 +65,9 @@ public:
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
+ /// Bias LSR towards creating post-increment opportunities.
+ bool shouldFavorPostInc() const;
+
// L1 cache prefetch.
unsigned getPrefetchDistance() const;
unsigned getCacheLineSize() const;
@@ -67,6 +78,64 @@ public:
/// @{
unsigned getNumberOfRegisters(bool vector) const;
+ unsigned getMaxInterleaveFactor(unsigned VF);
+ unsigned getRegisterBitWidth(bool Vector) const;
+ unsigned getMinVectorRegisterBitWidth() const;
+ unsigned getMinimumVF(unsigned ElemWidth) const;
+
+ bool shouldMaximizeVectorBandwidth(bool OptSize) const {
+ return true;
+ }
+ bool supportsEfficientVectorElementLoadStore() {
+ return false;
+ }
+ bool hasBranchDivergence() {
+ return false;
+ }
+ bool enableAggressiveInterleaving(bool LoopHasReductions) {
+ return false;
+ }
+ bool prefersVectorizedAddressing() {
+ return false;
+ }
+
+ unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
+ unsigned getOperandsScalarizationOverhead(ArrayRef<const Value*> Args,
+ unsigned VF);
+ unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type*> Tys);
+ unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+ ArrayRef<Value*> Args, FastMathFlags FMF, unsigned VF);
+ unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+ ArrayRef<Type*> Tys, FastMathFlags FMF,
+ unsigned ScalarizationCostPassed = UINT_MAX);
+ unsigned getAddressComputationCost(Type *Tp, ScalarEvolution *SE,
+ const SCEV *S);
+ unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace, const Instruction *I = nullptr);
+ unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace);
+ unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+ Type *SubTp);
+ unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
+ bool VariableMask, unsigned Alignment);
+ unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+ unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment,
+ unsigned AddressSpace);
+ unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ const Instruction *I);
+ unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+ TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+ TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>());
+ unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I = nullptr);
+ unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+
+ unsigned getCFInstrCost(unsigned Opcode) {
+ return 1;
+ }
/// @}
@@ -77,5 +146,4 @@ public:
};
} // end namespace llvm
-
#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONTARGETTRANSFORMINFO_H
diff --git a/lib/Target/Hexagon/HexagonVExtract.cpp b/lib/Target/Hexagon/HexagonVExtract.cpp
new file mode 100644
index 000000000000..929ac2bd0d93
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonVExtract.cpp
@@ -0,0 +1,166 @@
+//===- HexagonVExtract.cpp ------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass will replace multiple occurrences of V6_extractw from the same
+// vector register with a combination of a vector store and scalar loads.
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/PassSupport.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+
+#include <map>
+
+using namespace llvm;
+
+static cl::opt<unsigned> VExtractThreshold("hexagon-vextract-threshold",
+ cl::Hidden, cl::ZeroOrMore, cl::init(1),
+ cl::desc("Threshold for triggering vextract replacement"));
+
+namespace llvm {
+ void initializeHexagonVExtractPass(PassRegistry& Registry);
+ FunctionPass *createHexagonVExtract();
+}
+
+namespace {
+ class HexagonVExtract : public MachineFunctionPass {
+ public:
+ static char ID;
+ HexagonVExtract() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override {
+ return "Hexagon optimize vextract";
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ private:
+ const HexagonSubtarget *HST = nullptr;
+ const HexagonInstrInfo *HII = nullptr;
+
+ unsigned genElemLoad(MachineInstr *ExtI, unsigned BaseR,
+ MachineRegisterInfo &MRI);
+ };
+
+ char HexagonVExtract::ID = 0;
+}
+
+INITIALIZE_PASS(HexagonVExtract, "hexagon-vextract",
+ "Hexagon optimize vextract", false, false)
+
+unsigned HexagonVExtract::genElemLoad(MachineInstr *ExtI, unsigned BaseR,
+ MachineRegisterInfo &MRI) {
+ MachineBasicBlock &ExtB = *ExtI->getParent();
+ DebugLoc DL = ExtI->getDebugLoc();
+ unsigned ElemR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+
+ unsigned ExtIdxR = ExtI->getOperand(2).getReg();
+ unsigned ExtIdxS = ExtI->getOperand(2).getSubReg();
+
+ // Simplified check for a compile-time constant value of ExtIdxR.
+ if (ExtIdxS == 0) {
+ MachineInstr *DI = MRI.getVRegDef(ExtIdxR);
+ if (DI->getOpcode() == Hexagon::A2_tfrsi) {
+ unsigned V = DI->getOperand(1).getImm();
+ V &= (HST->getVectorLength()-1) & -4u;
+
+ BuildMI(ExtB, ExtI, DL, HII->get(Hexagon::L2_loadri_io), ElemR)
+ .addReg(BaseR)
+ .addImm(V);
+ return ElemR;
+ }
+ }
+
+ unsigned IdxR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ BuildMI(ExtB, ExtI, DL, HII->get(Hexagon::A2_andir), IdxR)
+ .add(ExtI->getOperand(2))
+ .addImm(-4);
+ BuildMI(ExtB, ExtI, DL, HII->get(Hexagon::L4_loadri_rr), ElemR)
+ .addReg(BaseR)
+ .addReg(IdxR)
+ .addImm(0);
+ return ElemR;
+}
+
+bool HexagonVExtract::runOnMachineFunction(MachineFunction &MF) {
+ HST = &MF.getSubtarget<HexagonSubtarget>();
+ HII = HST->getInstrInfo();
+ const auto &HRI = *HST->getRegisterInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ std::map<unsigned, SmallVector<MachineInstr*,4>> VExtractMap;
+ bool Changed = false;
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ unsigned Opc = MI.getOpcode();
+ if (Opc != Hexagon::V6_extractw)
+ continue;
+ unsigned VecR = MI.getOperand(1).getReg();
+ VExtractMap[VecR].push_back(&MI);
+ }
+ }
+
+ for (auto &P : VExtractMap) {
+ unsigned VecR = P.first;
+ if (P.second.size() <= VExtractThreshold)
+ continue;
+
+ const auto &VecRC = *MRI.getRegClass(VecR);
+ int FI = MFI.CreateSpillStackObject(HRI.getSpillSize(VecRC),
+ HRI.getSpillAlignment(VecRC));
+ MachineInstr *DefI = MRI.getVRegDef(VecR);
+ MachineBasicBlock::iterator At = std::next(DefI->getIterator());
+ MachineBasicBlock &DefB = *DefI->getParent();
+ unsigned StoreOpc = VecRC.getID() == Hexagon::HvxVRRegClassID
+ ? Hexagon::V6_vS32b_ai
+ : Hexagon::PS_vstorerw_ai;
+ BuildMI(DefB, At, DefI->getDebugLoc(), HII->get(StoreOpc))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addReg(VecR);
+
+ unsigned VecSize = HRI.getRegSizeInBits(VecRC) / 8;
+
+ for (MachineInstr *ExtI : P.second) {
+ assert(ExtI->getOpcode() == Hexagon::V6_extractw);
+ unsigned SR = ExtI->getOperand(1).getSubReg();
+ assert(ExtI->getOperand(1).getReg() == VecR);
+
+ MachineBasicBlock &ExtB = *ExtI->getParent();
+ DebugLoc DL = ExtI->getDebugLoc();
+ unsigned BaseR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ BuildMI(ExtB, ExtI, DL, HII->get(Hexagon::PS_fi), BaseR)
+ .addFrameIndex(FI)
+ .addImm(SR == 0 ? 0 : VecSize/2);
+
+ unsigned ElemR = genElemLoad(ExtI, BaseR, MRI);
+ unsigned ExtR = ExtI->getOperand(0).getReg();
+ MRI.replaceRegWith(ExtR, ElemR);
+ ExtB.erase(ExtI);
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
+
+FunctionPass *llvm::createHexagonVExtract() {
+ return new HexagonVExtract();
+}
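HexagonVExtract operates on machine IR, but the effect is easy to picture at the source level: once the same HVX vector register feeds more than VExtractThreshold word extracts, the pass stores the vector once into a stack slot and turns each V6_extractw into a plain 32-bit load from a word-aligned offset into that slot (genElemLoad above). A rough source-level analogy follows; the names and the 128-byte vector length are chosen for the illustration and are not taken from the patch.

#include <cstdint>
#include <cstring>

constexpr unsigned VecLenBytes = 128;           // 128B HVX mode assumed

struct HvxVec { uint8_t Bytes[VecLenBytes]; };  // stand-in for an HVX register value

// "After" shape: one vector store to a stack slot, then one cheap 32-bit
// scalar load per requested lane, instead of one vector-to-scalar transfer
// per lane.
void extractWords(const HvxVec &V, const unsigned *ByteOff, uint32_t *Out,
                  unsigned N) {
  alignas(VecLenBytes) uint8_t Slot[VecLenBytes];
  std::memcpy(Slot, V.Bytes, VecLenBytes);                // single vector store
  for (unsigned I = 0; I != N; ++I) {
    unsigned Off = ByteOff[I] & (VecLenBytes - 1) & ~3u;  // word-align, wrap to the vector
    std::memcpy(&Out[I], Slot + Off, sizeof(uint32_t));   // scalar load of one lane
  }
}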
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index c2404235091c..56ab69db9bd1 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -199,11 +199,12 @@ static MachineBasicBlock::iterator moveInstrOut(MachineInstr &MI,
}
bool HexagonPacketizer::runOnMachineFunction(MachineFunction &MF) {
- if (DisablePacketizer || skipFunction(MF.getFunction()))
+ auto &HST = MF.getSubtarget<HexagonSubtarget>();
+ if (DisablePacketizer || !HST.usePackets() || skipFunction(MF.getFunction()))
return false;
- HII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
- HRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+ HII = HST.getInstrInfo();
+ HRI = HST.getRegisterInfo();
auto &MLI = getAnalysis<MachineLoopInfo>();
auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
auto *MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
@@ -374,7 +375,7 @@ bool HexagonPacketizerList::promoteToDotCur(MachineInstr &MI,
void HexagonPacketizerList::cleanUpDotCur() {
MachineInstr *MI = nullptr;
for (auto BI : CurrentPacketMIs) {
- DEBUG(dbgs() << "Cleanup packet has "; BI->dump(););
+ LLVM_DEBUG(dbgs() << "Cleanup packet has "; BI->dump(););
if (HII->isDotCurInst(*BI)) {
MI = BI;
continue;
@@ -389,7 +390,7 @@ void HexagonPacketizerList::cleanUpDotCur() {
return;
// We did not find a use of the CUR, so de-cur it.
MI->setDesc(HII->get(HII->getNonDotCurOp(*MI)));
- DEBUG(dbgs() << "Demoted CUR "; MI->dump(););
+ LLVM_DEBUG(dbgs() << "Demoted CUR "; MI->dump(););
}
// Check to see if an instruction can be dot cur.
@@ -413,11 +414,10 @@ bool HexagonPacketizerList::canPromoteToDotCur(const MachineInstr &MI,
return false;
// Make sure candidate instruction uses cur.
- DEBUG(dbgs() << "Can we DOT Cur Vector MI\n";
- MI.dump();
- dbgs() << "in packet\n";);
+ LLVM_DEBUG(dbgs() << "Can we DOT Cur Vector MI\n"; MI.dump();
+ dbgs() << "in packet\n";);
MachineInstr &MJ = *MII;
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Checking CUR against ";
MJ.dump();
});
@@ -432,12 +432,12 @@ bool HexagonPacketizerList::canPromoteToDotCur(const MachineInstr &MI,
// Check for existing uses of a vector register within the packet which
// would be affected by converting a vector load into .cur format.
for (auto BI : CurrentPacketMIs) {
- DEBUG(dbgs() << "packet has "; BI->dump(););
+ LLVM_DEBUG(dbgs() << "packet has "; BI->dump(););
if (BI->readsRegister(DepReg, MF.getSubtarget().getRegisterInfo()))
return false;
}
- DEBUG(dbgs() << "Can Dot CUR MI\n"; MI.dump(););
+ LLVM_DEBUG(dbgs() << "Can Dot CUR MI\n"; MI.dump(););
// We can convert the opcode into a .cur.
return true;
}
@@ -529,6 +529,9 @@ bool HexagonPacketizerList::updateOffset(SUnit *SUI, SUnit *SUJ) {
return false;
int64_t Offset = MI.getOperand(OPI).getImm();
+ if (!HII->isValidOffset(MI.getOpcode(), Offset+Incr, HRI))
+ return false;
+
MI.getOperand(OPI).setImm(Offset + Incr);
ChangedOffset = Offset;
return true;
@@ -1033,7 +1036,7 @@ void HexagonPacketizerList::initPacketizerState() {
// Ignore bundling of pseudo instructions.
bool HexagonPacketizerList::ignorePseudoInstruction(const MachineInstr &MI,
const MachineBasicBlock *) {
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
return true;
if (MI.isCFIInstruction())
@@ -1095,7 +1098,7 @@ bool HexagonPacketizerList::isSoloInstruction(const MachineInstr &MI) {
static bool cannotCoexistAsymm(const MachineInstr &MI, const MachineInstr &MJ,
const HexagonInstrInfo &HII) {
const MachineFunction *MF = MI.getParent()->getParent();
- if (MF->getSubtarget<HexagonSubtarget>().hasV60TOpsOnly() &&
+ if (MF->getSubtarget<HexagonSubtarget>().hasV60OpsOnly() &&
HII.isHVXMemWithAIndirect(MI, MJ))
return true;
@@ -1112,6 +1115,10 @@ static bool cannotCoexistAsymm(const MachineInstr &MI, const MachineInstr &MJ,
case Hexagon::S4_stored_locked:
case Hexagon::L2_loadw_locked:
case Hexagon::L4_loadd_locked:
+ case Hexagon::Y2_dccleana:
+ case Hexagon::Y2_dccleaninva:
+ case Hexagon::Y2_dcinva:
+ case Hexagon::Y2_dczeroa:
case Hexagon::Y4_l2fetch:
case Hexagon::Y5_l2fetch: {
// These instructions can only be grouped with ALU32 or non-floating-point
@@ -1513,7 +1520,7 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
bool IsVecJ = HII->isHVXVec(J);
bool IsVecI = HII->isHVXVec(I);
- if (Slot1Store && MF.getSubtarget<HexagonSubtarget>().hasV65TOps() &&
+ if (Slot1Store && MF.getSubtarget<HexagonSubtarget>().hasV65Ops() &&
((LoadJ && StoreI && !NVStoreI) ||
(StoreJ && LoadI && !NVStoreJ)) &&
(J.getOpcode() != Hexagon::S2_allocframe &&
@@ -1683,8 +1690,12 @@ HexagonPacketizerList::addToPacket(MachineInstr &MI) {
PacketStalls = false;
PacketStalls |= producesStall(MI);
- if (MI.isImplicitDef())
+ if (MI.isImplicitDef()) {
+ // Add to the packet to allow subsequent instructions to be checked
+ // properly.
+ CurrentPacketMIs.push_back(&MI);
return MII;
+ }
assert(ResourceTracker->canReserveResources(MI));
bool ExtMI = HII->isExtended(MI) || HII->isConstExtended(MI);
@@ -1754,7 +1765,7 @@ void HexagonPacketizerList::endPacket(MachineBasicBlock *MBB,
bool memShufDisabled = getmemShufDisabled();
if (memShufDisabled && !foundLSInPacket()) {
setmemShufDisabled(false);
- DEBUG(dbgs() << " Not added to NoShufPacket\n");
+ LLVM_DEBUG(dbgs() << " Not added to NoShufPacket\n");
}
memShufDisabled = getmemShufDisabled();
@@ -1773,7 +1784,7 @@ void HexagonPacketizerList::endPacket(MachineBasicBlock *MBB,
CurrentPacketMIs.clear();
ResourceTracker->clearResources();
- DEBUG(dbgs() << "End packet\n");
+ LLVM_DEBUG(dbgs() << "End packet\n");
}
bool HexagonPacketizerList::shouldAddToPacket(const MachineInstr &MI) {
@@ -1803,17 +1814,18 @@ bool HexagonPacketizerList::producesStall(const MachineInstr &I) {
SUnit *SUI = MIToSUnit[const_cast<MachineInstr *>(&I)];
- // Check if the latency is 0 between this instruction and any instruction
- // in the current packet. If so, we disregard any potential stalls due to
- // the instructions in the previous packet. Most of the instruction pairs
- // that can go together in the same packet have 0 latency between them.
- // Only exceptions are newValueJumps as they're generated much later and
- // the latencies can't be changed at that point. Another is .cur
- // instructions if its consumer has a 0 latency successor (such as .new).
- // In this case, the latency between .cur and the consumer stays non-zero
- // even though we can have both .cur and .new in the same packet. Changing
- // the latency to 0 is not an option as it causes software pipeliner to
- // not pipeline in some cases.
+ // If the latency is 0 and there is a data dependence between this
+ // instruction and any instruction in the current packet, we disregard any
+ // potential stalls due to the instructions in the previous packet. Most of
+ // the instruction pairs that can go together in the same packet have 0
+ // latency between them. The exceptions are
+ // 1. NewValueJumps as they're generated much later and the latencies can't
+ // be changed at that point.
+ // 2. A .cur instruction, if its consumer has a 0 latency successor (such as
+ // .new). In this case, the latency between .cur and the consumer stays
+ // non-zero even though we can have both .cur and .new in the same packet.
+ // Changing the latency to 0 is not an option as it causes the software
+ // pipeliner to not pipeline in some cases.
// For Example:
// {
@@ -1826,19 +1838,10 @@ bool HexagonPacketizerList::producesStall(const MachineInstr &I) {
for (auto J : CurrentPacketMIs) {
SUnit *SUJ = MIToSUnit[J];
for (auto &Pred : SUI->Preds)
- if (Pred.getSUnit() == SUJ &&
- (Pred.getLatency() == 0 || HII->isNewValueJump(I) ||
- HII->isToBeScheduledASAP(*J, I)))
- return false;
- }
-
- // Check if the latency is greater than one between this instruction and any
- // instruction in the previous packet.
- for (auto J : OldPacketMIs) {
- SUnit *SUJ = MIToSUnit[J];
- for (auto &Pred : SUI->Preds)
- if (Pred.getSUnit() == SUJ && Pred.getLatency() > 1)
- return true;
+ if (Pred.getSUnit() == SUJ)
+ if ((Pred.getLatency() == 0 && Pred.isAssignedRegDep()) ||
+ HII->isNewValueJump(I) || HII->isToBeScheduledASAP(*J, I))
+ return false;
}
// Check if the latency is greater than one between this instruction and any
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.h b/lib/Target/Hexagon/HexagonVLIWPacketizer.h
index 764d9ae9059a..40dcee3441a2 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.h
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.h
@@ -59,7 +59,7 @@ class HexagonPacketizerList : public VLIWPacketizerList {
bool PacketStalls = false;
protected:
- /// \brief A handle to the branch probability pass.
+ /// A handle to the branch probability pass.
const MachineBranchProbabilityInfo *MBPI;
const MachineLoopInfo *MLI;
diff --git a/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp b/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
index 39395dbd3aec..9d1073346c72 100644
--- a/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
+++ b/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
@@ -138,6 +138,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
@@ -363,17 +364,18 @@ bool HexagonVectorLoopCarriedReuse::canReplace(Instruction *I) {
if (II &&
(II->getIntrinsicID() == Intrinsic::hexagon_V6_hi ||
II->getIntrinsicID() == Intrinsic::hexagon_V6_lo)) {
- DEBUG(dbgs() << "Not considering for reuse: " << *II << "\n");
+ LLVM_DEBUG(dbgs() << "Not considering for reuse: " << *II << "\n");
return false;
}
return true;
}
void HexagonVectorLoopCarriedReuse::findValueToReuse() {
for (auto *D : Dependences) {
- DEBUG(dbgs() << "Processing dependence " << *(D->front()) << "\n");
+ LLVM_DEBUG(dbgs() << "Processing dependence " << *(D->front()) << "\n");
if (D->iterations() > HexagonVLCRIterationLim) {
- DEBUG(dbgs() <<
- ".. Skipping because number of iterations > than the limit\n");
+ LLVM_DEBUG(
+ dbgs()
+ << ".. Skipping because number of iterations > than the limit\n");
continue;
}
@@ -381,7 +383,8 @@ void HexagonVectorLoopCarriedReuse::findValueToReuse() {
Instruction *BEInst = D->back();
int Iters = D->iterations();
BasicBlock *BB = PN->getParent();
- DEBUG(dbgs() << "Checking if any uses of " << *PN << " can be reused\n");
+ LLVM_DEBUG(dbgs() << "Checking if any uses of " << *PN
+ << " can be reused\n");
SmallVector<Instruction *, 4> PNUsers;
for (auto UI = PN->use_begin(), E = PN->use_end(); UI != E; ++UI) {
@@ -391,7 +394,8 @@ void HexagonVectorLoopCarriedReuse::findValueToReuse() {
if (User->getParent() != BB)
continue;
if (ReplacedInsts.count(User)) {
- DEBUG(dbgs() << *User << " has already been replaced. Skipping...\n");
+ LLVM_DEBUG(dbgs() << *User
+ << " has already been replaced. Skipping...\n");
continue;
}
if (isa<PHINode>(User))
@@ -403,7 +407,7 @@ void HexagonVectorLoopCarriedReuse::findValueToReuse() {
PNUsers.push_back(User);
}
- DEBUG(dbgs() << PNUsers.size() << " use(s) of the PHI in the block\n");
+ LLVM_DEBUG(dbgs() << PNUsers.size() << " use(s) of the PHI in the block\n");
// For each interesting use I of PN, find an Instruction BEUser that
// performs the same operation as I on BEInst and whose other operands,
@@ -439,7 +443,7 @@ void HexagonVectorLoopCarriedReuse::findValueToReuse() {
}
}
if (BEUser) {
- DEBUG(dbgs() << "Found Value for reuse.\n");
+ LLVM_DEBUG(dbgs() << "Found Value for reuse.\n");
ReuseCandidate.Inst2Replace = I;
ReuseCandidate.BackedgeInst = BEUser;
return;
@@ -460,7 +464,7 @@ Value *HexagonVectorLoopCarriedReuse::findValueInBlock(Value *Op,
}
void HexagonVectorLoopCarriedReuse::reuseValue() {
- DEBUG(dbgs() << ReuseCandidate);
+ LLVM_DEBUG(dbgs() << ReuseCandidate);
Instruction *Inst2Replace = ReuseCandidate.Inst2Replace;
Instruction *BEInst = ReuseCandidate.BackedgeInst;
int NumOperands = Inst2Replace->getNumOperands();
@@ -485,7 +489,7 @@ void HexagonVectorLoopCarriedReuse::reuseValue() {
}
}
- DEBUG(dbgs() << "reuseValue is making the following changes\n");
+ LLVM_DEBUG(dbgs() << "reuseValue is making the following changes\n");
SmallVector<Instruction *, 4> InstsInPreheader;
for (int i = 0; i < Iterations; ++i) {
@@ -506,8 +510,8 @@ void HexagonVectorLoopCarriedReuse::reuseValue() {
InstsInPreheader.push_back(InstInPreheader);
InstInPreheader->setName(Inst2Replace->getName() + ".hexagon.vlcr");
InstInPreheader->insertBefore(LoopPH->getTerminator());
- DEBUG(dbgs() << "Added " << *InstInPreheader << " to " << LoopPH->getName()
- << "\n");
+ LLVM_DEBUG(dbgs() << "Added " << *InstInPreheader << " to "
+ << LoopPH->getName() << "\n");
}
BasicBlock *BB = BEInst->getParent();
IRBuilder<> IRB(BB);
@@ -519,7 +523,8 @@ void HexagonVectorLoopCarriedReuse::reuseValue() {
NewPhi = IRB.CreatePHI(InstInPreheader->getType(), 2);
NewPhi->addIncoming(InstInPreheader, LoopPH);
NewPhi->addIncoming(BEVal, BB);
- DEBUG(dbgs() << "Adding " << *NewPhi << " to " << BB->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Adding " << *NewPhi << " to " << BB->getName()
+ << "\n");
BEVal = NewPhi;
}
// We are in LCSSA form. So, a value defined inside the Loop is used only
@@ -538,7 +543,7 @@ bool HexagonVectorLoopCarriedReuse::doVLCR() {
bool Changed = false;
bool Continue;
- DEBUG(dbgs() << "Working on Loop: " << *CurLoop->getHeader() << "\n");
+ LLVM_DEBUG(dbgs() << "Working on Loop: " << *CurLoop->getHeader() << "\n");
do {
// Reset datastructures.
Dependences.clear();
@@ -625,10 +630,9 @@ void HexagonVectorLoopCarriedReuse::findLoopCarriedDeps() {
else
delete D;
}
- DEBUG(dbgs() << "Found " << Dependences.size() << " dependences\n");
- DEBUG(for (size_t i = 0; i < Dependences.size(); ++i) {
- dbgs() << *Dependences[i] << "\n";
- });
+ LLVM_DEBUG(dbgs() << "Found " << Dependences.size() << " dependences\n");
+ LLVM_DEBUG(for (size_t i = 0; i < Dependences.size();
+ ++i) { dbgs() << *Dependences[i] << "\n"; });
}
Pass *llvm::createHexagonVectorLoopCarriedReusePass() {
diff --git a/lib/Target/Hexagon/HexagonVectorPrint.cpp b/lib/Target/Hexagon/HexagonVectorPrint.cpp
index ddd668b2cb1e..18d2f2f4acde 100644
--- a/lib/Target/Hexagon/HexagonVectorPrint.cpp
+++ b/lib/Target/Hexagon/HexagonVectorPrint.cpp
@@ -144,14 +144,15 @@ bool HexagonVectorPrint::runOnMachineFunction(MachineFunction &Fn) {
unsigned Reg = 0;
if (getInstrVecReg(*MII, Reg)) {
VecPrintList.push_back((&*MII));
- DEBUG(dbgs() << "Found vector reg inside bundle \n"; MII->dump());
+ LLVM_DEBUG(dbgs() << "Found vector reg inside bundle \n";
+ MII->dump());
}
}
} else {
unsigned Reg = 0;
if (getInstrVecReg(MI, Reg)) {
VecPrintList.push_back(&MI);
- DEBUG(dbgs() << "Found vector reg \n"; MI.dump());
+ LLVM_DEBUG(dbgs() << "Found vector reg \n"; MI.dump());
}
}
}
@@ -163,33 +164,33 @@ bool HexagonVectorPrint::runOnMachineFunction(MachineFunction &Fn) {
for (auto *I : VecPrintList) {
DebugLoc DL = I->getDebugLoc();
MachineBasicBlock *MBB = I->getParent();
- DEBUG(dbgs() << "Evaluating V MI\n"; I->dump());
+ LLVM_DEBUG(dbgs() << "Evaluating V MI\n"; I->dump());
unsigned Reg = 0;
if (!getInstrVecReg(*I, Reg))
llvm_unreachable("Need a vector reg");
MachineBasicBlock::instr_iterator MII = I->getIterator();
if (I->isInsideBundle()) {
- DEBUG(dbgs() << "add to end of bundle\n"; I->dump());
+ LLVM_DEBUG(dbgs() << "add to end of bundle\n"; I->dump());
while (MBB->instr_end() != MII && MII->isInsideBundle())
MII++;
} else {
- DEBUG(dbgs() << "add after instruction\n"; I->dump());
+ LLVM_DEBUG(dbgs() << "add after instruction\n"; I->dump());
MII++;
}
if (MBB->instr_end() == MII)
continue;
if (Reg >= Hexagon::V0 && Reg <= Hexagon::V31) {
- DEBUG(dbgs() << "adding dump for V" << Reg-Hexagon::V0 << '\n');
+ LLVM_DEBUG(dbgs() << "adding dump for V" << Reg - Hexagon::V0 << '\n');
addAsmInstr(MBB, Reg, MII, DL, QII, Fn);
} else if (Reg >= Hexagon::W0 && Reg <= Hexagon::W15) {
- DEBUG(dbgs() << "adding dump for W" << Reg-Hexagon::W0 << '\n');
+ LLVM_DEBUG(dbgs() << "adding dump for W" << Reg - Hexagon::W0 << '\n');
addAsmInstr(MBB, Hexagon::V0 + (Reg - Hexagon::W0) * 2 + 1,
MII, DL, QII, Fn);
addAsmInstr(MBB, Hexagon::V0 + (Reg - Hexagon::W0) * 2,
MII, DL, QII, Fn);
} else if (Reg >= Hexagon::Q0 && Reg <= Hexagon::Q3) {
- DEBUG(dbgs() << "adding dump for Q" << Reg-Hexagon::Q0 << '\n');
+ LLVM_DEBUG(dbgs() << "adding dump for Q" << Reg - Hexagon::Q0 << '\n');
addAsmInstr(MBB, Reg, MII, DL, QII, Fn);
} else
llvm_unreachable("Bad Vector reg");
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
index b3ab6763281c..af1e5429d0c2 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
@@ -51,7 +51,7 @@ class HexagonAsmBackend : public MCAsmBackend {
SmallVector<MCFixup, 4> Fixups;
SmallString<256> Code;
raw_svector_ostream VecOS(Code);
- E.encodeInstruction(HMB, VecOS, Fixups, RF.getSubtargetInfo());
+ E.encodeInstruction(HMB, VecOS, Fixups, *RF.getSubtargetInfo());
// Update the fragment.
RF.setInst(HMB);
@@ -61,13 +61,14 @@ class HexagonAsmBackend : public MCAsmBackend {
public:
HexagonAsmBackend(const Target &T, const Triple &TT, uint8_t OSABI,
- StringRef CPU) :
- OSABI(OSABI), CPU(CPU), MCII(T.createMCInstrInfo()),
- RelaxTarget(new MCInst *), Extender(nullptr) {}
-
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createHexagonELFObjectWriter(OS, OSABI, CPU);
+ StringRef CPU)
+ : MCAsmBackend(support::little), OSABI(OSABI), CPU(CPU),
+ MCII(T.createMCInstrInfo()), RelaxTarget(new MCInst *),
+ Extender(nullptr) {}
+
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createHexagonELFObjectWriter(OSABI, CPU);
}
void setExtender(MCContext &Context) const {
@@ -413,7 +414,8 @@ public:
/// fixup kind as appropriate.
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t FixupValue, bool IsResolved) const override {
+ uint64_t FixupValue, bool IsResolved,
+ const MCSubtargetInfo *STI) const override {
// When FixupValue is 0 the relocation is external and there
// is nothing for us to do.
@@ -510,17 +512,15 @@ public:
break;
}
- DEBUG(dbgs() << "Name=" << getFixupKindInfo(Kind).Name << "(" <<
- (unsigned)Kind << ")\n");
- DEBUG(uint32_t OldData = 0;
- for (unsigned i = 0; i < NumBytes; i++)
- OldData |= (InstAddr[i] << (i * 8)) & (0xff << (i * 8));
- dbgs() << "\tBValue=0x"; dbgs().write_hex(Value) <<
- ": AValue=0x"; dbgs().write_hex(FixupValue) <<
- ": Offset=" << Offset <<
- ": Size=" << Data.size() <<
- ": OInst=0x"; dbgs().write_hex(OldData) <<
- ": Reloc=0x"; dbgs().write_hex(Reloc););
+ LLVM_DEBUG(dbgs() << "Name=" << getFixupKindInfo(Kind).Name << "("
+ << (unsigned)Kind << ")\n");
+ LLVM_DEBUG(
+ uint32_t OldData = 0; for (unsigned i = 0; i < NumBytes; i++) OldData |=
+ (InstAddr[i] << (i * 8)) & (0xff << (i * 8));
+ dbgs() << "\tBValue=0x"; dbgs().write_hex(Value) << ": AValue=0x";
+ dbgs().write_hex(FixupValue)
+ << ": Offset=" << Offset << ": Size=" << Data.size() << ": OInst=0x";
+ dbgs().write_hex(OldData) << ": Reloc=0x"; dbgs().write_hex(Reloc););
// For each byte of the fragment that the fixup touches, mask in the
// bits from the fixup value. The Value has been "split up" into the
@@ -530,10 +530,10 @@ public:
InstAddr[i] |= uint8_t(Reloc >> (i * 8)) & 0xff; // Apply new reloc
}
- DEBUG(uint32_t NewData = 0;
- for (unsigned i = 0; i < NumBytes; i++)
- NewData |= (InstAddr[i] << (i * 8)) & (0xff << (i * 8));
- dbgs() << ": NInst=0x"; dbgs().write_hex(NewData) << "\n";);
+ LLVM_DEBUG(uint32_t NewData = 0;
+ for (unsigned i = 0; i < NumBytes; i++) NewData |=
+ (InstAddr[i] << (i * 8)) & (0xff << (i * 8));
+ dbgs() << ": NInst=0x"; dbgs().write_hex(NewData) << "\n";);
}
bool isInstRelaxable(MCInst const &HMI) const {
@@ -562,7 +562,8 @@ public:
/// relaxation.
///
/// \param Inst - The instruction to test.
- bool mayNeedRelaxation(MCInst const &Inst) const override {
+ bool mayNeedRelaxation(MCInst const &Inst,
+ const MCSubtargetInfo &STI) const override {
return true;
}
@@ -571,7 +572,8 @@ public:
bool fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, bool Resolved,
uint64_t Value,
const MCRelaxableFragment *DF,
- const MCAsmLayout &Layout) const override {
+ const MCAsmLayout &Layout,
+ const bool WasForced) const override {
MCInst const &MCB = DF->getInst();
assert(HexagonMCInstrInfo::isBundle(MCB));
@@ -682,17 +684,17 @@ public:
assert(Update && "Didn't find relaxation target");
}
- bool writeNopData(uint64_t Count,
- MCObjectWriter * OW) const override {
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override {
static const uint32_t Nopcode = 0x7f000000, // Hard-coded NOP.
ParseIn = 0x00004000, // In packet parse-bits.
ParseEnd = 0x0000c000; // End of packet parse-bits.
while(Count % HEXAGON_INSTR_SIZE) {
- DEBUG(dbgs() << "Alignment not a multiple of the instruction size:" <<
- Count % HEXAGON_INSTR_SIZE << "/" << HEXAGON_INSTR_SIZE << "\n");
+ LLVM_DEBUG(dbgs() << "Alignment not a multiple of the instruction size:"
+ << Count % HEXAGON_INSTR_SIZE << "/"
+ << HEXAGON_INSTR_SIZE << "\n");
--Count;
- OW->write8(0);
+ OS << '\0';
}
while(Count) {
@@ -700,7 +702,7 @@ public:
// Close the packet whenever a multiple of the maximum packet size remains
uint32_t ParseBits = (Count % (HEXAGON_PACKET_SIZE * HEXAGON_INSTR_SIZE))?
ParseIn: ParseEnd;
- OW->write32(Nopcode | ParseBits);
+ support::endian::write<uint32_t>(OS, Nopcode | ParseBits, Endian);
}
return true;
}
@@ -736,7 +738,7 @@ public:
Inst.addOperand(MCOperand::createInst(Nop));
Size -= 4;
if (!HexagonMCChecker(
- Context, *MCII, RF.getSubtargetInfo(), Inst,
+ Context, *MCII, *RF.getSubtargetInfo(), Inst,
*Context.getRegisterInfo(), false)
.check()) {
Inst.erase(Inst.end() - 1);
@@ -744,7 +746,7 @@ public:
}
}
bool Error = HexagonMCShuffle(Context, true, *MCII,
- RF.getSubtargetInfo(), Inst);
+ *RF.getSubtargetInfo(), Inst);
//assert(!Error);
(void)Error;
ReplaceInstruction(Asm.getEmitter(), RF, Inst);
@@ -765,11 +767,12 @@ public:
// MCAsmBackend
MCAsmBackend *llvm::createHexagonAsmBackend(Target const &T,
- MCRegisterInfo const & /*MRI*/,
- const Triple &TT, StringRef CPU,
- const MCTargetOptions &Options) {
+ const MCSubtargetInfo &STI,
+ MCRegisterInfo const & /*MRI*/,
+ const MCTargetOptions &Options) {
+ const Triple &TT = STI.getTargetTriple();
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS());
- StringRef CPUString = Hexagon_MC::selectHexagonCPU(CPU);
+ StringRef CPUString = Hexagon_MC::selectHexagonCPU(STI.getCPU());
return new HexagonAsmBackend(T, TT, OSABI, CPUString);
}
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
index f5a376033757..cb504b5c3d5d 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -25,7 +25,7 @@ namespace llvm {
/// HexagonII - This namespace holds all of the target specific flags that
/// instruction info tracks.
namespace HexagonII {
- unsigned const TypeCVI_FIRST = TypeCVI_HIST;
+ unsigned const TypeCVI_FIRST = TypeCVI_4SLOT_MPY;
unsigned const TypeCVI_LAST = TypeCVI_VX_LATE;
enum SubTarget {
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
index 12aa1bd9b2a0..e82e6b559f62 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
@@ -298,9 +298,7 @@ unsigned HexagonELFObjectWriter::getRelocType(MCContext &Ctx,
}
}
-std::unique_ptr<MCObjectWriter>
-llvm::createHexagonELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI,
- StringRef CPU) {
- auto MOTW = llvm::make_unique<HexagonELFObjectWriter>(OSABI, CPU);
- return createELFObjectWriter(std::move(MOTW), OS, /*IsLittleEndian*/ true);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createHexagonELFObjectWriter(uint8_t OSABI, StringRef CPU) {
+ return llvm::make_unique<HexagonELFObjectWriter>(OSABI, CPU);
}
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
index 1929152129fa..3b3a15b990f1 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
@@ -69,19 +69,12 @@ void HexagonInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
OS << "\n";
}
- auto Separator = "";
- if (HexagonMCInstrInfo::isInnerLoop(*MI)) {
- OS << Separator;
- Separator = " ";
- MCInst ME;
- ME.setOpcode(Hexagon::ENDLOOP0);
- printInstruction(&ME, OS);
- }
- if (HexagonMCInstrInfo::isOuterLoop(*MI)) {
- OS << Separator;
- MCInst ME;
- ME.setOpcode(Hexagon::ENDLOOP1);
- printInstruction(&ME, OS);
+ bool IsLoop0 = HexagonMCInstrInfo::isInnerLoop(*MI);
+ bool IsLoop1 = HexagonMCInstrInfo::isOuterLoop(*MI);
+ if (IsLoop0) {
+ OS << (IsLoop1 ? " :endloop01" : " :endloop0");
+ } else if (IsLoop1) {
+ OS << " :endloop1";
}
}
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
index 631c38c2734f..3382684803aa 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
@@ -33,7 +33,9 @@
#include <cassert>
#include <cstddef>
#include <cstdint>
+#include <map>
#include <string>
+#include <vector>
#define DEBUG_TYPE "mccodeemitter"
@@ -42,62 +44,350 @@ using namespace Hexagon;
STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
-HexagonMCCodeEmitter::HexagonMCCodeEmitter(MCInstrInfo const &aMII,
- MCContext &aMCT)
- : MCT(aMCT), MCII(aMII), Addend(new unsigned(0)),
- Extended(new bool(false)), CurrentBundle(new MCInst const *),
- CurrentIndex(new size_t(0)) {}
+static const unsigned fixup_Invalid = ~0u;
+
+#define _ fixup_Invalid
+#define P(x) Hexagon::fixup_Hexagon##x
+static const std::map<unsigned, std::vector<unsigned>> ExtFixups = {
+ { MCSymbolRefExpr::VK_DTPREL,
+ { _, _, _, _,
+ _, _, P(_DTPREL_16_X), P(_DTPREL_11_X),
+ P(_DTPREL_11_X), P(_9_X), _, P(_DTPREL_11_X),
+ P(_DTPREL_16_X), _, _, _,
+ P(_DTPREL_16_X), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_DTPREL_32_6_X) }},
+ { MCSymbolRefExpr::VK_GOT,
+ { _, _, _, _,
+ _, _, P(_GOT_11_X), _ /* [1] */,
+ _ /* [1] */, P(_9_X), _, P(_GOT_11_X),
+ P(_GOT_16_X), _, _, _,
+ P(_GOT_16_X), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_GOT_32_6_X) }},
+ { MCSymbolRefExpr::VK_GOTREL,
+ { _, _, _, _,
+ _, _, P(_GOTREL_11_X), P(_GOTREL_11_X),
+ P(_GOTREL_11_X), P(_9_X), _, P(_GOTREL_11_X),
+ P(_GOTREL_16_X), _, _, _,
+ P(_GOTREL_16_X), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_GOTREL_32_6_X) }},
+ { MCSymbolRefExpr::VK_TPREL,
+ { _, _, _, _,
+ _, _, P(_TPREL_16_X), P(_TPREL_11_X),
+ P(_TPREL_11_X), P(_9_X), _, P(_TPREL_11_X),
+ P(_TPREL_16_X), _, _, _,
+ P(_TPREL_16_X), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_TPREL_32_6_X) }},
+ { MCSymbolRefExpr::VK_Hexagon_GD_GOT,
+ { _, _, _, _,
+ _, _, P(_GD_GOT_16_X), P(_GD_GOT_11_X),
+ P(_GD_GOT_11_X), P(_9_X), _, P(_GD_GOT_11_X),
+ P(_GD_GOT_16_X), _, _, _,
+ P(_GD_GOT_16_X), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_GD_GOT_32_6_X) }},
+ { MCSymbolRefExpr::VK_Hexagon_GD_PLT,
+ { _, _, _, _,
+ _, _, _, _,
+ _, P(_9_X), _, P(_GD_PLT_B22_PCREL_X),
+ _, _, _, _,
+ _, _, _, _,
+ _, _, P(_GD_PLT_B22_PCREL_X), _,
+ _, _, _, _,
+ _, _, _, _,
+ _ }},
+ { MCSymbolRefExpr::VK_Hexagon_IE,
+ { _, _, _, _,
+ _, _, P(_IE_16_X), _,
+ _, P(_9_X), _, _,
+ P(_IE_16_X), _, _, _,
+ P(_IE_16_X), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_IE_32_6_X) }},
+ { MCSymbolRefExpr::VK_Hexagon_IE_GOT,
+ { _, _, _, _,
+ _, _, P(_IE_GOT_11_X), P(_IE_GOT_11_X),
+ P(_IE_GOT_11_X), P(_9_X), _, P(_IE_GOT_11_X),
+ P(_IE_GOT_16_X), _, _, _,
+ P(_IE_GOT_16_X), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_IE_GOT_32_6_X) }},
+ { MCSymbolRefExpr::VK_Hexagon_LD_GOT,
+ { _, _, _, _,
+ _, _, P(_LD_GOT_11_X), P(_LD_GOT_11_X),
+ P(_LD_GOT_11_X), P(_9_X), _, P(_LD_GOT_11_X),
+ P(_LD_GOT_16_X), _, _, _,
+ P(_LD_GOT_16_X), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_LD_GOT_32_6_X) }},
+ { MCSymbolRefExpr::VK_Hexagon_LD_PLT,
+ { _, _, _, _,
+ _, _, _, _,
+ _, P(_9_X), _, P(_LD_PLT_B22_PCREL_X),
+ _, _, _, _,
+ _, _, _, _,
+ _, _, P(_LD_PLT_B22_PCREL_X), _,
+ _, _, _, _,
+ _, _, _, _,
+ _ }},
+ { MCSymbolRefExpr::VK_Hexagon_PCREL,
+ { _, _, _, _,
+ _, _, P(_6_PCREL_X), _,
+ _, P(_9_X), _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_32_PCREL) }},
+ { MCSymbolRefExpr::VK_None,
+ { _, _, _, _,
+ _, _, P(_6_X), P(_8_X),
+ P(_8_X), P(_9_X), P(_10_X), P(_11_X),
+ P(_12_X), P(_B13_PCREL), _, P(_B15_PCREL_X),
+ P(_16_X), _, _, _,
+ _, _, P(_B22_PCREL_X), _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_32_6_X) }},
+};
+// [1] The fixup is GOT_16_X for signed values and GOT_11_X for unsigned.
+
+static const std::map<unsigned, std::vector<unsigned>> StdFixups = {
+ { MCSymbolRefExpr::VK_DTPREL,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_DTPREL_16), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_DTPREL_32) }},
+ { MCSymbolRefExpr::VK_GOT,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_GOT_32) }},
+ { MCSymbolRefExpr::VK_GOTREL,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _ /* [2] */, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_GOTREL_32) }},
+ { MCSymbolRefExpr::VK_PLT,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, P(_PLT_B22_PCREL), _,
+ _, _, _, _,
+ _, _, _, _,
+ _ }},
+ { MCSymbolRefExpr::VK_TPREL,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, P(_TPREL_11_X),
+ _, _, _, _,
+ P(_TPREL_16), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_TPREL_32) }},
+ { MCSymbolRefExpr::VK_Hexagon_GD_GOT,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_GD_GOT_16), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_GD_GOT_32) }},
+ { MCSymbolRefExpr::VK_Hexagon_GD_PLT,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, P(_GD_PLT_B22_PCREL), _,
+ _, _, _, _,
+ _, _, _, _,
+ _ }},
+ { MCSymbolRefExpr::VK_Hexagon_GPREL,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_GPREL16_0), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _ }},
+ { MCSymbolRefExpr::VK_Hexagon_HI16,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_HI16), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _ }},
+ { MCSymbolRefExpr::VK_Hexagon_IE,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_IE_32) }},
+ { MCSymbolRefExpr::VK_Hexagon_IE_GOT,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_IE_GOT_16), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_IE_GOT_32) }},
+ { MCSymbolRefExpr::VK_Hexagon_LD_GOT,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_LD_GOT_16), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_LD_GOT_32) }},
+ { MCSymbolRefExpr::VK_Hexagon_LD_PLT,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, P(_LD_PLT_B22_PCREL), _,
+ _, _, _, _,
+ _, _, _, _,
+ _ }},
+ { MCSymbolRefExpr::VK_Hexagon_LO16,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_LO16), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _ }},
+ { MCSymbolRefExpr::VK_Hexagon_PCREL,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_32_PCREL) }},
+ { MCSymbolRefExpr::VK_None,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, P(_B13_PCREL), _, P(_B15_PCREL),
+ _, _, _, _,
+ _, _, P(_B22_PCREL), _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_32) }},
+};
+//
+// [2] The actual fixup is LO16 or HI16, depending on the instruction.
+#undef P
+#undef _
-uint32_t HexagonMCCodeEmitter::parseBits(size_t Last,
- MCInst const &MCB,
+uint32_t HexagonMCCodeEmitter::parseBits(size_t Last, MCInst const &MCB,
MCInst const &MCI) const {
bool Duplex = HexagonMCInstrInfo::isDuplex(MCII, MCI);
- if (*CurrentIndex == 0) {
+ if (State.Index == 0) {
if (HexagonMCInstrInfo::isInnerLoop(MCB)) {
assert(!Duplex);
- assert(*CurrentIndex != Last);
+ assert(State.Index != Last);
return HexagonII::INST_PARSE_LOOP_END;
}
}
- if (*CurrentIndex == 1) {
+ if (State.Index == 1) {
if (HexagonMCInstrInfo::isOuterLoop(MCB)) {
assert(!Duplex);
- assert(*CurrentIndex != Last);
+ assert(State.Index != Last);
return HexagonII::INST_PARSE_LOOP_END;
}
}
if (Duplex) {
- assert(*CurrentIndex == Last);
+ assert(State.Index == Last);
return HexagonII::INST_PARSE_DUPLEX;
}
- if(*CurrentIndex == Last)
+ if (State.Index == Last)
return HexagonII::INST_PARSE_PACKET_END;
return HexagonII::INST_PARSE_NOT_END;
}
-/// EncodeInstruction - Emit the bundle
+/// Emit the bundle.
void HexagonMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
MCInst &HMB = const_cast<MCInst &>(MI);
assert(HexagonMCInstrInfo::isBundle(HMB));
- DEBUG(dbgs() << "Encoding bundle\n";);
- *Addend = 0;
- *Extended = false;
- *CurrentBundle = &MI;
- *CurrentIndex = 0;
+ LLVM_DEBUG(dbgs() << "Encoding bundle\n";);
+ State.Addend = 0;
+ State.Extended = false;
+ State.Bundle = &MI;
+ State.Index = 0;
size_t Last = HexagonMCInstrInfo::bundleSize(HMB) - 1;
+ uint64_t Features = computeAvailableFeatures(STI.getFeatureBits());
+
for (auto &I : HexagonMCInstrInfo::bundleInstructions(HMB)) {
MCInst &HMI = const_cast<MCInst &>(*I.getInst());
- verifyInstructionPredicates(HMI,
- computeAvailableFeatures(STI.getFeatureBits()));
-
- EncodeSingleInstruction(HMI, OS, Fixups, STI,
- parseBits(Last, HMB, HMI));
- *Extended = HexagonMCInstrInfo::isImmext(HMI);
- *Addend += HEXAGON_INSTR_SIZE;
- ++*CurrentIndex;
+ verifyInstructionPredicates(HMI, Features);
+
+ EncodeSingleInstruction(HMI, OS, Fixups, STI, parseBits(Last, HMB, HMI));
+ State.Extended = HexagonMCInstrInfo::isImmext(HMI);
+ State.Addend += HEXAGON_INSTR_SIZE;
+ ++State.Index;
}
}
@@ -115,9 +405,9 @@ static bool RegisterMatches(unsigned Consumer, unsigned Producer,
}
/// EncodeSingleInstruction - Emit a single
-void HexagonMCCodeEmitter::EncodeSingleInstruction(
- const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI, uint32_t Parse) const {
+void HexagonMCCodeEmitter::EncodeSingleInstruction(const MCInst &MI,
+ raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI, uint32_t Parse) const {
assert(!HexagonMCInstrInfo::isBundle(MI));
uint64_t Binary;
@@ -125,198 +415,150 @@ void HexagonMCCodeEmitter::EncodeSingleInstruction(
// in the first place!
assert(!HexagonMCInstrInfo::getDesc(MCII, MI).isPseudo() &&
"pseudo-instruction found");
- DEBUG(dbgs() << "Encoding insn"
- " `" << HexagonMCInstrInfo::getName(MCII, MI) << "'"
- "\n");
+ LLVM_DEBUG(dbgs() << "Encoding insn `"
+ << HexagonMCInstrInfo::getName(MCII, MI) << "'\n");
Binary = getBinaryCodeForInstr(MI, Fixups, STI);
+ unsigned Opc = MI.getOpcode();
+
// Check for unimplemented instructions. Immediate extenders
// are encoded as zero, so they need to be accounted for.
- if (!Binary &&
- MI.getOpcode() != DuplexIClass0 &&
- MI.getOpcode() != A4_ext) {
- DEBUG(dbgs() << "Unimplemented inst: "
- " `" << HexagonMCInstrInfo::getName(MCII, MI) << "'"
- "\n");
+ if (!Binary && Opc != DuplexIClass0 && Opc != A4_ext) {
+ LLVM_DEBUG(dbgs() << "Unimplemented inst `"
+ << HexagonMCInstrInfo::getName(MCII, MI) << "'\n");
llvm_unreachable("Unimplemented Instruction");
}
Binary |= Parse;
// if we need to emit a duplexed instruction
- if (MI.getOpcode() >= Hexagon::DuplexIClass0 &&
- MI.getOpcode() <= Hexagon::DuplexIClassF) {
+ if (Opc >= Hexagon::DuplexIClass0 && Opc <= Hexagon::DuplexIClassF) {
assert(Parse == HexagonII::INST_PARSE_DUPLEX &&
"Emitting duplex without duplex parse bits");
- unsigned dupIClass = MI.getOpcode() - Hexagon::DuplexIClass0;
+ unsigned DupIClass = MI.getOpcode() - Hexagon::DuplexIClass0;
// 29 is the bit position.
// 0b1110 =0xE bits are masked off and down shifted by 1 bit.
// Last bit is moved to bit position 13
- Binary = ((dupIClass & 0xE) << (29 - 1)) | ((dupIClass & 0x1) << 13);
+ Binary = ((DupIClass & 0xE) << (29 - 1)) | ((DupIClass & 0x1) << 13);
- const MCInst *subInst0 = MI.getOperand(0).getInst();
- const MCInst *subInst1 = MI.getOperand(1).getInst();
+ const MCInst *Sub0 = MI.getOperand(0).getInst();
+ const MCInst *Sub1 = MI.getOperand(1).getInst();
- // get subinstruction slot 0
- unsigned subInstSlot0Bits = getBinaryCodeForInstr(*subInst0, Fixups, STI);
- // get subinstruction slot 1
- unsigned subInstSlot1Bits = getBinaryCodeForInstr(*subInst1, Fixups, STI);
+ // Get subinstruction slot 0.
+ unsigned SubBits0 = getBinaryCodeForInstr(*Sub0, Fixups, STI);
+ // Get subinstruction slot 1.
+ State.SubInst1 = true;
+ unsigned SubBits1 = getBinaryCodeForInstr(*Sub1, Fixups, STI);
+ State.SubInst1 = false;
- Binary |= subInstSlot0Bits | (subInstSlot1Bits << 16);
+ Binary |= SubBits0 | (SubBits1 << 16);
}
- support::endian::Writer<support::little>(OS).write<uint32_t>(Binary);
+ support::endian::write<uint32_t>(OS, Binary, support::little);
++MCNumEmitted;
}
LLVM_ATTRIBUTE_NORETURN
-static void raise_relocation_error(unsigned bits, unsigned kind) {
+static void raise_relocation_error(unsigned Width, unsigned Kind) {
std::string Text;
- {
- raw_string_ostream Stream(Text);
- Stream << "Unrecognized relocation combination bits: " << bits
- << " kind: " << kind;
- }
- report_fatal_error(Text);
+ raw_string_ostream Stream(Text);
+ Stream << "Unrecognized relocation combination: width=" << Width
+ << " kind=" << Kind;
+ report_fatal_error(Stream.str());
}
-/// getFixupNoBits - Some insns are not extended and thus have no
-/// bits. These cases require a more brute force method for determining
-/// the correct relocation.
+/// Some insns are not extended and thus have no bits. These cases require
+/// a more brute force method for determining the correct relocation.
Hexagon::Fixups HexagonMCCodeEmitter::getFixupNoBits(
- MCInstrInfo const &MCII, const MCInst &MI, const MCOperand &MO,
- const MCSymbolRefExpr::VariantKind kind) const {
+ MCInstrInfo const &MCII, const MCInst &MI, const MCOperand &MO,
+ const MCSymbolRefExpr::VariantKind VarKind) const {
const MCInstrDesc &MCID = HexagonMCInstrInfo::getDesc(MCII, MI);
- unsigned insnType = HexagonMCInstrInfo::getType(MCII, MI);
-
- if (insnType == HexagonII::TypeEXTENDER) {
- switch (kind) {
- case MCSymbolRefExpr::VK_GOTREL:
- return Hexagon::fixup_Hexagon_GOTREL_32_6_X;
- case MCSymbolRefExpr::VK_GOT:
- return Hexagon::fixup_Hexagon_GOT_32_6_X;
- case MCSymbolRefExpr::VK_TPREL:
- return Hexagon::fixup_Hexagon_TPREL_32_6_X;
- case MCSymbolRefExpr::VK_DTPREL:
- return Hexagon::fixup_Hexagon_DTPREL_32_6_X;
- case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
- return Hexagon::fixup_Hexagon_GD_GOT_32_6_X;
- case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
- return Hexagon::fixup_Hexagon_LD_GOT_32_6_X;
- case MCSymbolRefExpr::VK_Hexagon_IE:
- return Hexagon::fixup_Hexagon_IE_32_6_X;
- case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
- return Hexagon::fixup_Hexagon_IE_GOT_32_6_X;
- case MCSymbolRefExpr::VK_Hexagon_PCREL:
- return Hexagon::fixup_Hexagon_B32_PCREL_X;
- case MCSymbolRefExpr::VK_Hexagon_GD_PLT:
- return Hexagon::fixup_Hexagon_GD_PLT_B32_PCREL_X;
- case MCSymbolRefExpr::VK_Hexagon_LD_PLT:
- return Hexagon::fixup_Hexagon_LD_PLT_B32_PCREL_X;
-
- case MCSymbolRefExpr::VK_None: {
- auto Insts = HexagonMCInstrInfo::bundleInstructions(**CurrentBundle);
- for (auto I = Insts.begin(), N = Insts.end(); I != N; ++I) {
- if (I->getInst() == &MI) {
- const MCInst &NextI = *(I+1)->getInst();
- const MCInstrDesc &D = HexagonMCInstrInfo::getDesc(MCII, NextI);
- if (D.isBranch() || D.isCall() ||
- HexagonMCInstrInfo::getType(MCII, NextI) == HexagonII::TypeCR)
- return Hexagon::fixup_Hexagon_B32_PCREL_X;
- return Hexagon::fixup_Hexagon_32_6_X;
- }
+ unsigned InsnType = HexagonMCInstrInfo::getType(MCII, MI);
+ using namespace Hexagon;
+
+ if (InsnType == HexagonII::TypeEXTENDER) {
+ if (VarKind == MCSymbolRefExpr::VK_None) {
+ auto Instrs = HexagonMCInstrInfo::bundleInstructions(*State.Bundle);
+ for (auto I = Instrs.begin(), N = Instrs.end(); I != N; ++I) {
+ if (I->getInst() != &MI)
+ continue;
+ assert(I+1 != N && "Extender cannot be last in packet");
+ const MCInst &NextI = *(I+1)->getInst();
+ const MCInstrDesc &NextD = HexagonMCInstrInfo::getDesc(MCII, NextI);
+ if (NextD.isBranch() || NextD.isCall() ||
+ HexagonMCInstrInfo::getType(MCII, NextI) == HexagonII::TypeCR)
+ return fixup_Hexagon_B32_PCREL_X;
+ return fixup_Hexagon_32_6_X;
}
- raise_relocation_error(0, kind);
- }
- default:
- raise_relocation_error(0, kind);
}
- } else if (MCID.isBranch())
- return Hexagon::fixup_Hexagon_B13_PCREL;
- switch (MCID.getOpcode()) {
- case Hexagon::HI:
- case Hexagon::A2_tfrih:
- switch (kind) {
- case MCSymbolRefExpr::VK_GOT:
- return Hexagon::fixup_Hexagon_GOT_HI16;
- case MCSymbolRefExpr::VK_GOTREL:
- return Hexagon::fixup_Hexagon_GOTREL_HI16;
- case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
- return Hexagon::fixup_Hexagon_GD_GOT_HI16;
- case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
- return Hexagon::fixup_Hexagon_LD_GOT_HI16;
- case MCSymbolRefExpr::VK_Hexagon_IE:
- return Hexagon::fixup_Hexagon_IE_HI16;
- case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
- return Hexagon::fixup_Hexagon_IE_GOT_HI16;
- case MCSymbolRefExpr::VK_TPREL:
- return Hexagon::fixup_Hexagon_TPREL_HI16;
- case MCSymbolRefExpr::VK_DTPREL:
- return Hexagon::fixup_Hexagon_DTPREL_HI16;
- case MCSymbolRefExpr::VK_None:
- return Hexagon::fixup_Hexagon_HI16;
- default:
- raise_relocation_error(0, kind);
- }
+ static const std::map<unsigned,unsigned> Relocs = {
+ { MCSymbolRefExpr::VK_GOTREL, fixup_Hexagon_GOTREL_32_6_X },
+ { MCSymbolRefExpr::VK_GOT, fixup_Hexagon_GOT_32_6_X },
+ { MCSymbolRefExpr::VK_TPREL, fixup_Hexagon_TPREL_32_6_X },
+ { MCSymbolRefExpr::VK_DTPREL, fixup_Hexagon_DTPREL_32_6_X },
+ { MCSymbolRefExpr::VK_Hexagon_GD_GOT, fixup_Hexagon_GD_GOT_32_6_X },
+ { MCSymbolRefExpr::VK_Hexagon_LD_GOT, fixup_Hexagon_LD_GOT_32_6_X },
+ { MCSymbolRefExpr::VK_Hexagon_IE, fixup_Hexagon_IE_32_6_X },
+ { MCSymbolRefExpr::VK_Hexagon_IE_GOT, fixup_Hexagon_IE_GOT_32_6_X },
+ { MCSymbolRefExpr::VK_Hexagon_PCREL, fixup_Hexagon_B32_PCREL_X },
+ { MCSymbolRefExpr::VK_Hexagon_GD_PLT, fixup_Hexagon_GD_PLT_B32_PCREL_X },
+ { MCSymbolRefExpr::VK_Hexagon_LD_PLT, fixup_Hexagon_LD_PLT_B32_PCREL_X },
+ };
+
+ auto F = Relocs.find(VarKind);
+ if (F != Relocs.end())
+ return Hexagon::Fixups(F->second);
+ raise_relocation_error(0, VarKind);
+ }
- case Hexagon::LO:
- case Hexagon::A2_tfril:
- switch (kind) {
- case MCSymbolRefExpr::VK_GOT:
- return Hexagon::fixup_Hexagon_GOT_LO16;
- case MCSymbolRefExpr::VK_GOTREL:
- return Hexagon::fixup_Hexagon_GOTREL_LO16;
- case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
- return Hexagon::fixup_Hexagon_GD_GOT_LO16;
- case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
- return Hexagon::fixup_Hexagon_LD_GOT_LO16;
- case MCSymbolRefExpr::VK_Hexagon_IE:
- return Hexagon::fixup_Hexagon_IE_LO16;
- case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
- return Hexagon::fixup_Hexagon_IE_GOT_LO16;
- case MCSymbolRefExpr::VK_TPREL:
- return Hexagon::fixup_Hexagon_TPREL_LO16;
- case MCSymbolRefExpr::VK_DTPREL:
- return Hexagon::fixup_Hexagon_DTPREL_LO16;
- case MCSymbolRefExpr::VK_None:
- return Hexagon::fixup_Hexagon_LO16;
- default:
- raise_relocation_error(0, kind);
- }
+ if (MCID.isBranch())
+ return fixup_Hexagon_B13_PCREL;
+
+ static const std::map<unsigned,unsigned> RelocsLo = {
+ { MCSymbolRefExpr::VK_GOT, fixup_Hexagon_GOT_LO16 },
+ { MCSymbolRefExpr::VK_GOTREL, fixup_Hexagon_GOTREL_LO16 },
+ { MCSymbolRefExpr::VK_Hexagon_GD_GOT, fixup_Hexagon_GD_GOT_LO16 },
+ { MCSymbolRefExpr::VK_Hexagon_LD_GOT, fixup_Hexagon_LD_GOT_LO16 },
+ { MCSymbolRefExpr::VK_Hexagon_IE, fixup_Hexagon_IE_LO16 },
+ { MCSymbolRefExpr::VK_Hexagon_IE_GOT, fixup_Hexagon_IE_GOT_LO16 },
+ { MCSymbolRefExpr::VK_TPREL, fixup_Hexagon_TPREL_LO16 },
+ { MCSymbolRefExpr::VK_DTPREL, fixup_Hexagon_DTPREL_LO16 },
+ { MCSymbolRefExpr::VK_None, fixup_Hexagon_LO16 },
+ };
+
+ static const std::map<unsigned,unsigned> RelocsHi = {
+ { MCSymbolRefExpr::VK_GOT, fixup_Hexagon_GOT_HI16 },
+ { MCSymbolRefExpr::VK_GOTREL, fixup_Hexagon_GOTREL_HI16 },
+ { MCSymbolRefExpr::VK_Hexagon_GD_GOT, fixup_Hexagon_GD_GOT_HI16 },
+ { MCSymbolRefExpr::VK_Hexagon_LD_GOT, fixup_Hexagon_LD_GOT_HI16 },
+ { MCSymbolRefExpr::VK_Hexagon_IE, fixup_Hexagon_IE_HI16 },
+ { MCSymbolRefExpr::VK_Hexagon_IE_GOT, fixup_Hexagon_IE_GOT_HI16 },
+ { MCSymbolRefExpr::VK_TPREL, fixup_Hexagon_TPREL_HI16 },
+ { MCSymbolRefExpr::VK_DTPREL, fixup_Hexagon_DTPREL_HI16 },
+ { MCSymbolRefExpr::VK_None, fixup_Hexagon_HI16 },
+ };
- // The only relocs left should be GP relative:
- default:
- if (MCID.mayStore() || MCID.mayLoad()) {
- for (const MCPhysReg *ImpUses = MCID.getImplicitUses(); *ImpUses;
- ++ImpUses) {
- if (*ImpUses != Hexagon::GP)
- continue;
- switch (HexagonMCInstrInfo::getMemAccessSize(MCII, MI)) {
- case 1:
- return fixup_Hexagon_GPREL16_0;
- case 2:
- return fixup_Hexagon_GPREL16_1;
- case 4:
- return fixup_Hexagon_GPREL16_2;
- case 8:
- return fixup_Hexagon_GPREL16_3;
- default:
- raise_relocation_error(0, kind);
- }
- }
+ switch (MCID.getOpcode()) {
+ case Hexagon::LO:
+ case Hexagon::A2_tfril: {
+ auto F = RelocsLo.find(VarKind);
+ if (F != RelocsLo.end())
+ return Hexagon::Fixups(F->second);
+ break;
+ }
+ case Hexagon::HI:
+ case Hexagon::A2_tfrih: {
+ auto F = RelocsHi.find(VarKind);
+ if (F != RelocsHi.end())
+ return Hexagon::Fixups(F->second);
+ break;
}
- raise_relocation_error(0, kind);
}
- llvm_unreachable("Relocation exit not taken");
-}
-
-namespace llvm {
-
-extern const MCInstrDesc HexagonInsts[];
-} // end namespace llvm
+ raise_relocation_error(0, VarKind);
+}
-static bool isPCRel (unsigned Kind) {
- switch(Kind){
+static bool isPCRel(unsigned Kind) {
+ switch (Kind){
case fixup_Hexagon_B22_PCREL:
case fixup_Hexagon_B15_PCREL:
case fixup_Hexagon_B7_PCREL:
@@ -342,16 +584,34 @@ static bool isPCRel (unsigned Kind) {
}
unsigned HexagonMCCodeEmitter::getExprOpValue(const MCInst &MI,
- const MCOperand &MO,
- const MCExpr *ME,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const
-{
+ const MCOperand &MO, const MCExpr *ME, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
if (isa<HexagonMCExpr>(ME))
ME = &HexagonMCInstrInfo::getExpr(*ME);
int64_t Value;
- if (ME->evaluateAsAbsolute(Value))
+ if (ME->evaluateAsAbsolute(Value)) {
+ bool InstExtendable = HexagonMCInstrInfo::isExtendable(MCII, MI) ||
+ HexagonMCInstrInfo::isExtended(MCII, MI);
+ // Only sub-instruction #1 can be extended in a duplex. If MI is a
+ // sub-instruction #0, it is not extended even if Extended is true
+ // (it can be true for the duplex as a whole).
+ bool IsSub0 = HexagonMCInstrInfo::isSubInstruction(MI) && !State.SubInst1;
+ if (State.Extended && InstExtendable && !IsSub0) {
+ unsigned OpIdx = ~0u;
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+ if (&MO != &MI.getOperand(I))
+ continue;
+ OpIdx = I;
+ break;
+ }
+ assert(OpIdx != ~0u);
+ if (OpIdx == HexagonMCInstrInfo::getExtendableOp(MCII, MI)) {
+ unsigned Shift = HexagonMCInstrInfo::getExtentAlignment(MCII, MI);
+ Value = (Value & 0x3f) << Shift;
+ }
+ }
return Value;
+ }
assert(ME->getKind() == MCExpr::SymbolRef ||
ME->getKind() == MCExpr::Binary);
if (ME->getKind() == MCExpr::Binary) {
@@ -360,366 +620,99 @@ unsigned HexagonMCCodeEmitter::getExprOpValue(const MCInst &MI,
getExprOpValue(MI, MO, Binary->getRHS(), Fixups, STI);
return 0;
}
- Hexagon::Fixups FixupKind =
- Hexagon::Fixups(Hexagon::fixup_Hexagon_TPREL_LO16);
+
+ unsigned FixupKind = fixup_Invalid;
const MCSymbolRefExpr *MCSRE = static_cast<const MCSymbolRefExpr *>(ME);
const MCInstrDesc &MCID = HexagonMCInstrInfo::getDesc(MCII, MI);
- unsigned bits = HexagonMCInstrInfo::getExtentBits(MCII, MI) -
- HexagonMCInstrInfo::getExtentAlignment(MCII, MI);
- const MCSymbolRefExpr::VariantKind kind = MCSRE->getKind();
-
- DEBUG(dbgs() << "----------------------------------------\n");
- DEBUG(dbgs() << "Opcode Name: " << HexagonMCInstrInfo::getName(MCII, MI)
- << "\n");
- DEBUG(dbgs() << "Opcode: " << MCID.getOpcode() << "\n");
- DEBUG(dbgs() << "Relocation bits: " << bits << "\n");
- DEBUG(dbgs() << "Addend: " << *Addend << "\n");
- DEBUG(dbgs() << "----------------------------------------\n");
-
- switch (bits) {
- default:
- raise_relocation_error(bits, kind);
- case 32:
- switch (kind) {
- case MCSymbolRefExpr::VK_DTPREL:
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_DTPREL_32_6_X
- : Hexagon::fixup_Hexagon_DTPREL_32;
- break;
- case MCSymbolRefExpr::VK_GOT:
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_GOT_32_6_X
- : Hexagon::fixup_Hexagon_GOT_32;
- break;
- case MCSymbolRefExpr::VK_GOTREL:
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_GOTREL_32_6_X
- : Hexagon::fixup_Hexagon_GOTREL_32;
- break;
- case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_GD_GOT_32_6_X
- : Hexagon::fixup_Hexagon_GD_GOT_32;
- break;
- case MCSymbolRefExpr::VK_Hexagon_IE:
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_IE_32_6_X
- : Hexagon::fixup_Hexagon_IE_32;
- break;
- case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_IE_GOT_32_6_X
- : Hexagon::fixup_Hexagon_IE_GOT_32;
- break;
- case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_LD_GOT_32_6_X
- : Hexagon::fixup_Hexagon_LD_GOT_32;
- break;
- case MCSymbolRefExpr::VK_Hexagon_PCREL:
- FixupKind = Hexagon::fixup_Hexagon_32_PCREL;
- break;
- case MCSymbolRefExpr::VK_None:
- FixupKind =
- *Extended ? Hexagon::fixup_Hexagon_32_6_X : Hexagon::fixup_Hexagon_32;
- break;
- case MCSymbolRefExpr::VK_TPREL:
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_TPREL_32_6_X
- : Hexagon::fixup_Hexagon_TPREL_32;
- break;
- default:
- raise_relocation_error(bits, kind);
- }
- break;
-
- case 22:
- switch (kind) {
- case MCSymbolRefExpr::VK_Hexagon_GD_PLT:
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_GD_PLT_B22_PCREL_X
- : Hexagon::fixup_Hexagon_GD_PLT_B22_PCREL;
- break;
- case MCSymbolRefExpr::VK_Hexagon_LD_PLT:
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_LD_PLT_B22_PCREL_X
- : Hexagon::fixup_Hexagon_LD_PLT_B22_PCREL;
- break;
- case MCSymbolRefExpr::VK_None:
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_B22_PCREL_X
- : Hexagon::fixup_Hexagon_B22_PCREL;
- break;
- case MCSymbolRefExpr::VK_PLT:
- FixupKind = Hexagon::fixup_Hexagon_PLT_B22_PCREL;
- break;
- default:
- raise_relocation_error(bits, kind);
- }
- break;
-
- case 16:
- if (*Extended) {
- switch (kind) {
- case MCSymbolRefExpr::VK_DTPREL:
- FixupKind = Hexagon::fixup_Hexagon_DTPREL_16_X;
- break;
- case MCSymbolRefExpr::VK_GOT:
- FixupKind = Hexagon::fixup_Hexagon_GOT_16_X;
- break;
- case MCSymbolRefExpr::VK_GOTREL:
- FixupKind = Hexagon::fixup_Hexagon_GOTREL_16_X;
- break;
- case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
- FixupKind = Hexagon::fixup_Hexagon_GD_GOT_16_X;
- break;
- case MCSymbolRefExpr::VK_Hexagon_IE:
- FixupKind = Hexagon::fixup_Hexagon_IE_16_X;
- break;
- case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
- FixupKind = Hexagon::fixup_Hexagon_IE_GOT_16_X;
- break;
- case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
- FixupKind = Hexagon::fixup_Hexagon_LD_GOT_16_X;
- break;
- case MCSymbolRefExpr::VK_None:
- FixupKind = Hexagon::fixup_Hexagon_16_X;
- break;
- case MCSymbolRefExpr::VK_TPREL:
- FixupKind = Hexagon::fixup_Hexagon_TPREL_16_X;
- break;
- default:
- raise_relocation_error(bits, kind);
- }
- } else
- switch (kind) {
- case MCSymbolRefExpr::VK_None:
- if (HexagonMCInstrInfo::s27_2_reloc(*MO.getExpr()))
- FixupKind = Hexagon::fixup_Hexagon_27_REG;
- else
- if (MCID.mayStore() || MCID.mayLoad()) {
- for (const MCPhysReg *ImpUses = MCID.getImplicitUses(); *ImpUses;
- ++ImpUses) {
- if (*ImpUses != Hexagon::GP)
- continue;
- switch (HexagonMCInstrInfo::getMemAccessSize(MCII, MI)) {
- case 1:
- FixupKind = fixup_Hexagon_GPREL16_0;
- break;
- case 2:
- FixupKind = fixup_Hexagon_GPREL16_1;
- break;
- case 4:
- FixupKind = fixup_Hexagon_GPREL16_2;
- break;
- case 8:
- FixupKind = fixup_Hexagon_GPREL16_3;
- break;
- default:
- raise_relocation_error(bits, kind);
- }
- }
- } else
- raise_relocation_error(bits, kind);
- break;
- case MCSymbolRefExpr::VK_DTPREL:
- FixupKind = Hexagon::fixup_Hexagon_DTPREL_16;
- break;
- case MCSymbolRefExpr::VK_GOTREL:
- if (MCID.getOpcode() == Hexagon::HI)
- FixupKind = Hexagon::fixup_Hexagon_GOTREL_HI16;
- else
- FixupKind = Hexagon::fixup_Hexagon_GOTREL_LO16;
- break;
- case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
- FixupKind = Hexagon::fixup_Hexagon_GD_GOT_16;
- break;
- case MCSymbolRefExpr::VK_Hexagon_GPREL:
- FixupKind = Hexagon::fixup_Hexagon_GPREL16_0;
- break;
- case MCSymbolRefExpr::VK_Hexagon_HI16:
- FixupKind = Hexagon::fixup_Hexagon_HI16;
- break;
- case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
- FixupKind = Hexagon::fixup_Hexagon_IE_GOT_16;
- break;
- case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
- FixupKind = Hexagon::fixup_Hexagon_LD_GOT_16;
- break;
- case MCSymbolRefExpr::VK_Hexagon_LO16:
- FixupKind = Hexagon::fixup_Hexagon_LO16;
- break;
- case MCSymbolRefExpr::VK_TPREL:
- FixupKind = Hexagon::fixup_Hexagon_TPREL_16;
- break;
- default:
- raise_relocation_error(bits, kind);
+ unsigned FixupWidth = HexagonMCInstrInfo::getExtentBits(MCII, MI) -
+ HexagonMCInstrInfo::getExtentAlignment(MCII, MI);
+ MCSymbolRefExpr::VariantKind VarKind = MCSRE->getKind();
+ unsigned Opc = MCID.getOpcode();
+ unsigned IType = HexagonMCInstrInfo::getType(MCII, MI);
+
+ LLVM_DEBUG(dbgs() << "----------------------------------------\n"
+ << "Opcode Name: " << HexagonMCInstrInfo::getName(MCII, MI)
+ << "\nOpcode: " << Opc << "\nRelocation bits: "
+ << FixupWidth << "\nAddend: " << State.Addend
+ << "\nVariant: " << unsigned(VarKind)
+ << "\n----------------------------------------\n");
+
+ // Pick the applicable fixup kind for the symbol.
+ // Handle special cases first, the rest will be looked up in the tables.
+
+ if (FixupWidth == 16 && !State.Extended) {
+ if (VarKind == MCSymbolRefExpr::VK_None) {
+ if (HexagonMCInstrInfo::s27_2_reloc(*MO.getExpr())) {
+ // A2_iconst.
+ FixupKind = Hexagon::fixup_Hexagon_27_REG;
+ } else {
+ // Look for GP-relative fixups.
+ unsigned Shift = HexagonMCInstrInfo::getExtentAlignment(MCII, MI);
+ static const Hexagon::Fixups GPRelFixups[] = {
+ Hexagon::fixup_Hexagon_GPREL16_0, Hexagon::fixup_Hexagon_GPREL16_1,
+ Hexagon::fixup_Hexagon_GPREL16_2, Hexagon::fixup_Hexagon_GPREL16_3
+ };
+ assert(Shift < array_lengthof(GPRelFixups));
+ auto UsesGP = [] (const MCInstrDesc &D) {
+ for (const MCPhysReg *U = D.getImplicitUses(); U && *U; ++U)
+ if (*U == Hexagon::GP)
+ return true;
+ return false;
+ };
+ if (UsesGP(MCID))
+ FixupKind = GPRelFixups[Shift];
}
- break;
-
- case 15:
- switch (kind) {
- case MCSymbolRefExpr::VK_None:
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_B15_PCREL_X
- : Hexagon::fixup_Hexagon_B15_PCREL;
- break;
- default:
- raise_relocation_error(bits, kind);
+ } else if (VarKind == MCSymbolRefExpr::VK_GOTREL) {
+ // Select between LO/HI.
+ if (Opc == Hexagon::LO)
+ FixupKind = Hexagon::fixup_Hexagon_GOTREL_LO16;
+ else if (Opc == Hexagon::HI)
+ FixupKind = Hexagon::fixup_Hexagon_GOTREL_HI16;
}
- break;
-
- case 13:
- switch (kind) {
- case MCSymbolRefExpr::VK_None:
- FixupKind = Hexagon::fixup_Hexagon_B13_PCREL;
- break;
- default:
- raise_relocation_error(bits, kind);
- }
- break;
-
- case 12:
- if (*Extended)
- switch (kind) {
- // There isn't a GOT_12_X, both 11_X and 16_X resolve to 6/26
- case MCSymbolRefExpr::VK_GOT:
- FixupKind = Hexagon::fixup_Hexagon_GOT_16_X;
- break;
- case MCSymbolRefExpr::VK_GOTREL:
- FixupKind = Hexagon::fixup_Hexagon_GOTREL_16_X;
- break;
- case MCSymbolRefExpr::VK_None:
- FixupKind = Hexagon::fixup_Hexagon_12_X;
- break;
- default:
- raise_relocation_error(bits, kind);
- }
- else
- raise_relocation_error(bits, kind);
- break;
-
- case 11:
- if (*Extended)
- switch (kind) {
- case MCSymbolRefExpr::VK_DTPREL:
- FixupKind = Hexagon::fixup_Hexagon_DTPREL_11_X;
- break;
- case MCSymbolRefExpr::VK_GOT:
- FixupKind = Hexagon::fixup_Hexagon_GOT_11_X;
- break;
- case MCSymbolRefExpr::VK_GOTREL:
- FixupKind = Hexagon::fixup_Hexagon_GOTREL_11_X;
- break;
- case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
- FixupKind = Hexagon::fixup_Hexagon_GD_GOT_11_X;
- break;
- case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
- FixupKind = Hexagon::fixup_Hexagon_IE_GOT_11_X;
+ } else {
+ bool BranchOrCR = MCID.isBranch() || IType == HexagonII::TypeCR;
+ switch (FixupWidth) {
+ case 9:
+ if (BranchOrCR)
+ FixupKind = State.Extended ? Hexagon::fixup_Hexagon_B9_PCREL_X
+ : Hexagon::fixup_Hexagon_B9_PCREL;
+ break;
+ case 8:
+ case 7:
+ if (State.Extended && VarKind == MCSymbolRefExpr::VK_GOT)
+ FixupKind = HexagonMCInstrInfo::isExtentSigned(MCII, MI)
+ ? Hexagon::fixup_Hexagon_GOT_16_X
+ : Hexagon::fixup_Hexagon_GOT_11_X;
+ else if (FixupWidth == 7 && BranchOrCR)
+ FixupKind = State.Extended ? Hexagon::fixup_Hexagon_B7_PCREL_X
+ : Hexagon::fixup_Hexagon_B7_PCREL;
+ break;
+ case 0:
+ FixupKind = getFixupNoBits(MCII, MI, MO, VarKind);
break;
- case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
- FixupKind = Hexagon::fixup_Hexagon_LD_GOT_11_X;
- break;
- case MCSymbolRefExpr::VK_Hexagon_GD_PLT:
- FixupKind = Hexagon::fixup_Hexagon_GD_PLT_B22_PCREL_X;
- break;
- case MCSymbolRefExpr::VK_Hexagon_LD_PLT:
- FixupKind = Hexagon::fixup_Hexagon_LD_PLT_B22_PCREL_X;
- break;
- case MCSymbolRefExpr::VK_None:
- FixupKind = Hexagon::fixup_Hexagon_11_X;
- break;
- case MCSymbolRefExpr::VK_TPREL:
- FixupKind = Hexagon::fixup_Hexagon_TPREL_11_X;
- break;
- default:
- raise_relocation_error(bits, kind);
- }
- else {
- switch (kind) {
- case MCSymbolRefExpr::VK_TPREL:
- FixupKind = Hexagon::fixup_Hexagon_TPREL_11_X;
- break;
- default:
- raise_relocation_error(bits, kind);
- }
}
- break;
+ }
- case 10:
- if (*Extended) {
- switch (kind) {
- case MCSymbolRefExpr::VK_None:
- FixupKind = Hexagon::fixup_Hexagon_10_X;
- break;
- default:
- raise_relocation_error(bits, kind);
- }
- } else
- raise_relocation_error(bits, kind);
- break;
-
- case 9:
- if (MCID.isBranch() ||
- (HexagonMCInstrInfo::getType(MCII, MI) == HexagonII::TypeCR))
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_B9_PCREL_X
- : Hexagon::fixup_Hexagon_B9_PCREL;
- else if (*Extended)
- FixupKind = Hexagon::fixup_Hexagon_9_X;
- else
- raise_relocation_error(bits, kind);
- break;
-
- case 8:
- if (*Extended)
- FixupKind = Hexagon::fixup_Hexagon_8_X;
- else
- raise_relocation_error(bits, kind);
- break;
-
- case 7:
- if (MCID.isBranch() ||
- (HexagonMCInstrInfo::getType(MCII, MI) == HexagonII::TypeCR))
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_B7_PCREL_X
- : Hexagon::fixup_Hexagon_B7_PCREL;
- else if (*Extended)
- FixupKind = Hexagon::fixup_Hexagon_7_X;
- else
- raise_relocation_error(bits, kind);
- break;
-
- case 6:
- if (*Extended) {
- switch (kind) {
- case MCSymbolRefExpr::VK_DTPREL:
- FixupKind = Hexagon::fixup_Hexagon_DTPREL_16_X;
- break;
- // This is part of an extender, GOT_11 is a
- // Word32_U6 unsigned/truncated reloc.
- case MCSymbolRefExpr::VK_GOT:
- FixupKind = Hexagon::fixup_Hexagon_GOT_11_X;
- break;
- case MCSymbolRefExpr::VK_GOTREL:
- FixupKind = Hexagon::fixup_Hexagon_GOTREL_11_X;
- break;
- case MCSymbolRefExpr::VK_Hexagon_PCREL:
- FixupKind = Hexagon::fixup_Hexagon_6_PCREL_X;
- break;
- case MCSymbolRefExpr::VK_TPREL:
- FixupKind = Hexagon::fixup_Hexagon_TPREL_16_X;
- break;
- case MCSymbolRefExpr::VK_None:
- FixupKind = Hexagon::fixup_Hexagon_6_X;
- break;
- default:
- raise_relocation_error(bits, kind);
- }
- } else
- raise_relocation_error(bits, kind);
- break;
+ if (FixupKind == fixup_Invalid) {
+ const auto &FixupTable = State.Extended ? ExtFixups : StdFixups;
- case 0:
- FixupKind = getFixupNoBits(MCII, MI, MO, kind);
- break;
+ auto FindVK = FixupTable.find(VarKind);
+ if (FindVK != FixupTable.end())
+ FixupKind = FindVK->second[FixupWidth];
}
- MCExpr const *FixupExpression =
- (*Addend > 0 && isPCRel(FixupKind))
- ? MCBinaryExpr::createAdd(MO.getExpr(),
- MCConstantExpr::create(*Addend, MCT), MCT)
- : MO.getExpr();
+ if (FixupKind == fixup_Invalid)
+ raise_relocation_error(FixupWidth, VarKind);
- MCFixup fixup = MCFixup::create(*Addend, FixupExpression,
+ const MCExpr *FixupExpr = MO.getExpr();
+ if (State.Addend != 0 && isPCRel(FixupKind)) {
+ const MCExpr *C = MCConstantExpr::create(State.Addend, MCT);
+ FixupExpr = MCBinaryExpr::createAdd(FixupExpr, C, MCT);
+ }
+
+ MCFixup Fixup = MCFixup::create(State.Addend, FixupExpr,
MCFixupKind(FixupKind), MI.getLoc());
- Fixups.push_back(fixup);
+ Fixups.push_back(Fixup);
// All of the information is in the fixup.
return 0;
}
@@ -739,55 +732,55 @@ HexagonMCCodeEmitter::getMachineOpValue(MCInst const &MI, MCOperand const &MO,
#endif
if (HexagonMCInstrInfo::isNewValue(MCII, MI) &&
- &MO == &MI.getOperand(HexagonMCInstrInfo::getNewValueOp(MCII, MI))) {
+ &MO == &HexagonMCInstrInfo::getNewValueOperand(MCII, MI)) {
// Calculate the new value distance to the associated producer
- MCOperand const &MCO =
- MI.getOperand(HexagonMCInstrInfo::getNewValueOp(MCII, MI));
unsigned SOffset = 0;
unsigned VOffset = 0;
- unsigned Register = MCO.getReg();
- unsigned Register1;
- unsigned Register2;
- auto Instructions = HexagonMCInstrInfo::bundleInstructions(**CurrentBundle);
- auto i = Instructions.begin() + *CurrentIndex - 1;
- for (;; --i) {
- assert(i != Instructions.begin() - 1 && "Couldn't find producer");
- MCInst const &Inst = *i->getInst();
+ unsigned UseReg = MO.getReg();
+ unsigned DefReg1, DefReg2;
+
+ auto Instrs = HexagonMCInstrInfo::bundleInstructions(*State.Bundle);
+ const MCOperand *I = Instrs.begin() + State.Index - 1;
+
+ for (;; --I) {
+ assert(I != Instrs.begin() - 1 && "Couldn't find producer");
+ MCInst const &Inst = *I->getInst();
if (HexagonMCInstrInfo::isImmext(Inst))
continue;
+
+ DefReg1 = DefReg2 = 0;
++SOffset;
- if (HexagonMCInstrInfo::isVector(MCII, Inst))
- // Vector instructions don't count scalars
+ if (HexagonMCInstrInfo::isVector(MCII, Inst)) {
+ // Vector instructions don't count scalars.
++VOffset;
- Register1 =
- HexagonMCInstrInfo::hasNewValue(MCII, Inst)
- ? HexagonMCInstrInfo::getNewValueOperand(MCII, Inst).getReg()
- : static_cast<unsigned>(Hexagon::NoRegister);
- Register2 =
- HexagonMCInstrInfo::hasNewValue2(MCII, Inst)
- ? HexagonMCInstrInfo::getNewValueOperand2(MCII, Inst).getReg()
- : static_cast<unsigned>(Hexagon::NoRegister);
- if (!RegisterMatches(Register, Register1, Register2))
+ }
+ if (HexagonMCInstrInfo::hasNewValue(MCII, Inst))
+ DefReg1 = HexagonMCInstrInfo::getNewValueOperand(MCII, Inst).getReg();
+ if (HexagonMCInstrInfo::hasNewValue2(MCII, Inst))
+ DefReg2 = HexagonMCInstrInfo::getNewValueOperand2(MCII, Inst).getReg();
+ if (!RegisterMatches(UseReg, DefReg1, DefReg2)) {
// This isn't the register we're looking for
continue;
- if (!HexagonMCInstrInfo::isPredicated(MCII, Inst))
+ }
+ if (!HexagonMCInstrInfo::isPredicated(MCII, Inst)) {
// Producer is unpredicated
break;
+ }
assert(HexagonMCInstrInfo::isPredicated(MCII, MI) &&
- "Unpredicated consumer depending on predicated producer");
+ "Unpredicated consumer depending on predicated producer");
if (HexagonMCInstrInfo::isPredicatedTrue(MCII, Inst) ==
- HexagonMCInstrInfo::isPredicatedTrue(MCII, MI))
- // Producer predicate sense matched ours
+ HexagonMCInstrInfo::isPredicatedTrue(MCII, MI))
+ // Producer predicate sense matched ours.
break;
}
// Hexagon PRM 10.11 Construct Nt from distance
- unsigned Offset =
- HexagonMCInstrInfo::isVector(MCII, MI) ? VOffset : SOffset;
+ unsigned Offset = HexagonMCInstrInfo::isVector(MCII, MI) ? VOffset
+ : SOffset;
Offset <<= 1;
- Offset |=
- HexagonMCInstrInfo::SubregisterBit(Register, Register1, Register2);
+ Offset |= HexagonMCInstrInfo::SubregisterBit(UseReg, DefReg1, DefReg2);
return Offset;
}
+
assert(!MO.isImm());
if (MO.isReg()) {
unsigned Reg = MO.getReg();
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
index 14cabf1534a5..fcea63db23a3 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief Definition for classes that emit Hexagon machine code from MCInsts
+/// Definition for classes that emit Hexagon machine code from MCInsts
///
//===----------------------------------------------------------------------===//
@@ -35,25 +35,20 @@ class raw_ostream;
class HexagonMCCodeEmitter : public MCCodeEmitter {
MCContext &MCT;
MCInstrInfo const &MCII;
- std::unique_ptr<unsigned> Addend;
- std::unique_ptr<bool> Extended;
- std::unique_ptr<MCInst const *> CurrentBundle;
- std::unique_ptr<size_t> CurrentIndex;
- // helper routine for getMachineOpValue()
- unsigned getExprOpValue(const MCInst &MI, const MCOperand &MO,
- const MCExpr *ME, SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
-
- Hexagon::Fixups getFixupNoBits(MCInstrInfo const &MCII, const MCInst &MI,
- const MCOperand &MO,
- const MCSymbolRefExpr::VariantKind kind) const;
+ // A mutable state of the emitter when encoding bundles and duplexes.
+ struct EmitterState {
+ unsigned Addend = 0;
+ bool Extended = false;
+ bool SubInst1 = false;
+ const MCInst *Bundle = nullptr;
+ size_t Index = 0;
+ };
+ mutable EmitterState State;
public:
- HexagonMCCodeEmitter(MCInstrInfo const &aMII, MCContext &aMCT);
-
- // Return parse bits for instruction `MCI' inside bundle `MCB'
- uint32_t parseBits(size_t Last, MCInst const &MCB, MCInst const &MCI) const;
+ HexagonMCCodeEmitter(MCInstrInfo const &MII, MCContext &MCT)
+ : MCT(MCT), MCII(MII) {}
void encodeInstruction(MCInst const &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
@@ -64,18 +59,30 @@ public:
const MCSubtargetInfo &STI,
uint32_t Parse) const;
- // \brief TableGen'erated function for getting the
+ // TableGen'erated function for getting the
// binary encoding for an instruction.
uint64_t getBinaryCodeForInstr(MCInst const &MI,
SmallVectorImpl<MCFixup> &Fixups,
MCSubtargetInfo const &STI) const;
- /// \brief Return binary encoding of operand.
+ /// Return binary encoding of operand.
unsigned getMachineOpValue(MCInst const &MI, MCOperand const &MO,
SmallVectorImpl<MCFixup> &Fixups,
MCSubtargetInfo const &STI) const;
private:
+ // helper routine for getMachineOpValue()
+ unsigned getExprOpValue(const MCInst &MI, const MCOperand &MO,
+ const MCExpr *ME, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ Hexagon::Fixups getFixupNoBits(MCInstrInfo const &MCII, const MCInst &MI,
+ const MCOperand &MO,
+ const MCSymbolRefExpr::VariantKind Kind) const;
+
+ // Return parse bits for instruction `MCI' inside bundle `MCB'
+ uint32_t parseBits(size_t Last, MCInst const &MCB, MCInst const &MCI) const;
+
uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
void verifyInstructionPredicates(const MCInst &MI,
uint64_t AvailableFeatures) const;
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
index 127c97e342dc..3eaef9ac7410 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
@@ -205,7 +205,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L,
switch (L.getOpcode()) {
default:
- DEBUG(dbgs() << "Possible compound ignored\n");
+ LLVM_DEBUG(dbgs() << "Possible compound ignored\n");
return CompoundInsn;
case Hexagon::A2_tfrsi:
@@ -233,7 +233,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L,
break;
case Hexagon::C2_cmpeq:
- DEBUG(dbgs() << "CX: C2_cmpeq\n");
+ LLVM_DEBUG(dbgs() << "CX: C2_cmpeq\n");
Rs = L.getOperand(1);
Rt = L.getOperand(2);
@@ -246,7 +246,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L,
break;
case Hexagon::C2_cmpgt:
- DEBUG(dbgs() << "CX: C2_cmpgt\n");
+ LLVM_DEBUG(dbgs() << "CX: C2_cmpgt\n");
Rs = L.getOperand(1);
Rt = L.getOperand(2);
@@ -259,7 +259,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L,
break;
case Hexagon::C2_cmpgtu:
- DEBUG(dbgs() << "CX: C2_cmpgtu\n");
+ LLVM_DEBUG(dbgs() << "CX: C2_cmpgtu\n");
Rs = L.getOperand(1);
Rt = L.getOperand(2);
@@ -272,7 +272,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L,
break;
case Hexagon::C2_cmpeqi:
- DEBUG(dbgs() << "CX: C2_cmpeqi\n");
+ LLVM_DEBUG(dbgs() << "CX: C2_cmpeqi\n");
Success = L.getOperand(2).getExpr()->evaluateAsAbsolute(Value);
(void)Success;
assert(Success);
@@ -290,7 +290,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L,
break;
case Hexagon::C2_cmpgti:
- DEBUG(dbgs() << "CX: C2_cmpgti\n");
+ LLVM_DEBUG(dbgs() << "CX: C2_cmpgti\n");
Success = L.getOperand(2).getExpr()->evaluateAsAbsolute(Value);
(void)Success;
assert(Success);
@@ -308,7 +308,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L,
break;
case Hexagon::C2_cmpgtui:
- DEBUG(dbgs() << "CX: C2_cmpgtui\n");
+ LLVM_DEBUG(dbgs() << "CX: C2_cmpgtui\n");
Rs = L.getOperand(1);
compoundOpcode = cmpgtuiBitOpcode[getCompoundOp(R)];
CompoundInsn = new (Context) MCInst;
@@ -319,7 +319,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L,
break;
case Hexagon::S2_tstbit_i:
- DEBUG(dbgs() << "CX: S2_tstbit_i\n");
+ LLVM_DEBUG(dbgs() << "CX: S2_tstbit_i\n");
Rs = L.getOperand(1);
compoundOpcode = tstBitOpcode[getCompoundOp(R)];
CompoundInsn = new (Context) MCInst;
@@ -372,14 +372,14 @@ static bool lookForCompound(MCInstrInfo const &MCII, MCContext &Context,
BExtended = true;
continue;
}
- DEBUG(dbgs() << "J,B: " << JumpInst->getOpcode() << ","
- << Inst->getOpcode() << "\n");
+ LLVM_DEBUG(dbgs() << "J,B: " << JumpInst->getOpcode() << ","
+ << Inst->getOpcode() << "\n");
if (isOrderedCompoundPair(*Inst, BExtended, *JumpInst, JExtended)) {
MCInst *CompoundInsn = getCompoundInsn(Context, *Inst, *JumpInst);
if (CompoundInsn) {
- DEBUG(dbgs() << "B: " << Inst->getOpcode() << ","
- << JumpInst->getOpcode() << " Compounds to "
- << CompoundInsn->getOpcode() << "\n");
+ LLVM_DEBUG(dbgs() << "B: " << Inst->getOpcode() << ","
+ << JumpInst->getOpcode() << " Compounds to "
+ << CompoundInsn->getOpcode() << "\n");
J->setInst(CompoundInsn);
MCI.erase(B);
return true;
@@ -422,7 +422,7 @@ void HexagonMCInstrInfo::tryCompound(MCInstrInfo const &MCII, MCSubtargetInfo co
if (StartedValid &&
!llvm::HexagonMCShuffle(Context, false, MCII, STI, MCI)) {
- DEBUG(dbgs() << "Found ERROR\n");
+ LLVM_DEBUG(dbgs() << "Found ERROR\n");
MCI = OriginalBundle;
}
}
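The bulk of this hunk, and of the MC-layer hunks that follow, is the mechanical rename of the DEBUG macro to LLVM_DEBUG. Both spell the same thing: the body is compiled out entirely in non-asserts builds and only prints when the tool runs with -debug or -debug-only=<DEBUG_TYPE>; the rename simply avoids colliding with other projects' DEBUG macros. A minimal sketch of the usage pattern, with a placeholder debug type that the Hexagon backend does not actually define:

#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

// LLVM_DEBUG requires DEBUG_TYPE to be defined; it names the
// -debug-only= category these messages belong to.
#define DEBUG_TYPE "example-pass"

using namespace llvm;

static void reportOpcode(unsigned Opcode) {
  // Expands to nothing unless assertions are enabled and the tool is run
  // with -debug or -debug-only=example-pass.
  LLVM_DEBUG(dbgs() << "visiting opcode " << Opcode << '\n');
}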
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
index 4c18af60efd1..b208a3668124 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
@@ -263,12 +263,10 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
break;
case Hexagon::L4_return:
-
case Hexagon::L2_deallocframe:
-
return HexagonII::HSIG_L2;
- case Hexagon::EH_RETURN_JMPR:
+ case Hexagon::EH_RETURN_JMPR:
case Hexagon::J2_jumpr:
case Hexagon::PS_jmpret:
// jumpr r31
@@ -789,12 +787,12 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) {
addOps(Result, Inst, 2);
break; // 1,3 SUBInst $Rdd = combine(#2, #$u2)
}
+ break;
case Hexagon::A4_combineir:
Result.setOpcode(Hexagon::SA1_combinezr);
addOps(Result, Inst, 0);
addOps(Result, Inst, 2);
break; // 1,3 SUBInst $Rdd = combine(#0, $Rs)
-
case Hexagon::A4_combineri:
Result.setOpcode(Hexagon::SA1_combinerz);
addOps(Result, Inst, 0);
@@ -901,6 +899,7 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) {
addOps(Result, Inst, 1);
break; // 2 1,2 SUBInst memb($Rs + #$u4_0)=#1
}
+ break;
case Hexagon::S2_storerb_io:
Result.setOpcode(Hexagon::SS1_storeb_io);
addOps(Result, Inst, 0);
@@ -937,6 +936,7 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) {
addOps(Result, Inst, 2);
break; // 1 2,3 SUBInst memw(r29 + #$u5_2) = $Rt
}
+ break;
case Hexagon::S2_storeri_io:
if (Inst.getOperand(0).getReg() == Hexagon::R29) {
Result.setOpcode(Hexagon::SS2_storew_sp);
@@ -1045,8 +1045,8 @@ HexagonMCInstrInfo::getDuplexPossibilties(MCInstrInfo const &MCII,
bool bisReversable = true;
if (isStoreInst(MCB.getOperand(j).getInst()->getOpcode()) &&
isStoreInst(MCB.getOperand(k).getInst()->getOpcode())) {
- DEBUG(dbgs() << "skip out of order write pair: " << k << "," << j
- << "\n");
+ LLVM_DEBUG(dbgs() << "skip out of order write pair: " << k << "," << j
+ << "\n");
bisReversable = false;
}
if (HexagonMCInstrInfo::isMemReorderDisabled(MCB)) // }:mem_noshuf
@@ -1066,14 +1066,14 @@ HexagonMCInstrInfo::getDuplexPossibilties(MCInstrInfo const &MCII,
// Save off pairs for duplex checking.
duplexToTry.push_back(DuplexCandidate(j, k, iClass));
- DEBUG(dbgs() << "adding pair: " << j << "," << k << ":"
- << MCB.getOperand(j).getInst()->getOpcode() << ","
- << MCB.getOperand(k).getInst()->getOpcode() << "\n");
+ LLVM_DEBUG(dbgs() << "adding pair: " << j << "," << k << ":"
+ << MCB.getOperand(j).getInst()->getOpcode() << ","
+ << MCB.getOperand(k).getInst()->getOpcode() << "\n");
continue;
} else {
- DEBUG(dbgs() << "skipping pair: " << j << "," << k << ":"
- << MCB.getOperand(j).getInst()->getOpcode() << ","
- << MCB.getOperand(k).getInst()->getOpcode() << "\n");
+ LLVM_DEBUG(dbgs() << "skipping pair: " << j << "," << k << ":"
+ << MCB.getOperand(j).getInst()->getOpcode() << ","
+ << MCB.getOperand(k).getInst()->getOpcode() << "\n");
}
// Try reverse.
@@ -1091,13 +1091,15 @@ HexagonMCInstrInfo::getDuplexPossibilties(MCInstrInfo const &MCII,
// Save off pairs for duplex checking.
duplexToTry.push_back(DuplexCandidate(k, j, iClass));
- DEBUG(dbgs() << "adding pair:" << k << "," << j << ":"
- << MCB.getOperand(j).getInst()->getOpcode() << ","
- << MCB.getOperand(k).getInst()->getOpcode() << "\n");
+ LLVM_DEBUG(dbgs()
+ << "adding pair:" << k << "," << j << ":"
+ << MCB.getOperand(j).getInst()->getOpcode() << ","
+ << MCB.getOperand(k).getInst()->getOpcode() << "\n");
} else {
- DEBUG(dbgs() << "skipping pair: " << k << "," << j << ":"
- << MCB.getOperand(j).getInst()->getOpcode() << ","
- << MCB.getOperand(k).getInst()->getOpcode() << "\n");
+ LLVM_DEBUG(dbgs()
+ << "skipping pair: " << k << "," << j << ":"
+ << MCB.getOperand(j).getInst()->getOpcode() << ","
+ << MCB.getOperand(k).getInst()->getOpcode() << "\n");
}
}
}
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
index 691e269cb91f..f304bc50530f 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
@@ -25,6 +25,7 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
@@ -47,15 +48,15 @@ static cl::opt<unsigned> GPSize
HexagonMCELFStreamer::HexagonMCELFStreamer(
MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS, std::unique_ptr<MCCodeEmitter> Emitter)
- : MCELFStreamer(Context, std::move(TAB), OS, std::move(Emitter)),
+ std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter)
+ : MCELFStreamer(Context, std::move(TAB), std::move(OW), std::move(Emitter)),
MCII(createHexagonMCInstrInfo()) {}
HexagonMCELFStreamer::HexagonMCELFStreamer(
MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS, std::unique_ptr<MCCodeEmitter> Emitter,
+ std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter,
MCAssembler *Assembler)
- : MCELFStreamer(Context, std::move(TAB), OS, std::move(Emitter)),
+ : MCELFStreamer(Context, std::move(TAB), std::move(OW), std::move(Emitter)),
MCII(createHexagonMCInstrInfo()) {}
void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCB,
@@ -63,21 +64,6 @@ void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCB,
assert(MCB.getOpcode() == Hexagon::BUNDLE);
assert(HexagonMCInstrInfo::bundleSize(MCB) <= HEXAGON_PACKET_SIZE);
assert(HexagonMCInstrInfo::bundleSize(MCB) > 0);
- bool Extended = false;
- for (auto &I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
- MCInst *MCI = const_cast<MCInst *>(I.getInst());
- if (Extended) {
- if (HexagonMCInstrInfo::isDuplex(*MCII, *MCI)) {
- MCInst *SubInst = const_cast<MCInst *>(MCI->getOperand(1).getInst());
- HexagonMCInstrInfo::clampExtended(*MCII, getContext(), *SubInst);
- } else {
- HexagonMCInstrInfo::clampExtended(*MCII, getContext(), *MCI);
- }
- Extended = false;
- } else {
- Extended = HexagonMCInstrInfo::isImmext(*MCI);
- }
- }
// At this point, MCB is a bundle
// Iterate through the bundle and assign addends for the instructions
@@ -124,7 +110,7 @@ void HexagonMCELFStreamer::HexagonMCEmitCommonSymbol(MCSymbol *Symbol,
MCSectionSubPair P = getCurrentSection();
SwitchSection(&Section);
- if (ELFSymbol->isUndefined(false)) {
+ if (ELFSymbol->isUndefined()) {
EmitValueToAlignment(ByteAlignment, 0, 1, 0);
EmitLabel(Symbol);
EmitZeros(Size);
@@ -166,9 +152,10 @@ void HexagonMCELFStreamer::HexagonMCEmitLocalCommonSymbol(MCSymbol *Symbol,
namespace llvm {
MCStreamer *createHexagonELFStreamer(Triple const &TT, MCContext &Context,
std::unique_ptr<MCAsmBackend> MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> CE) {
- return new HexagonMCELFStreamer(Context, std::move(MAB), OS, std::move(CE));
+ return new HexagonMCELFStreamer(Context, std::move(MAB), std::move(OW),
+ std::move(CE));
}
} // end namespace llvm
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h
index c6fa0021d86b..c02bef8f06f7 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h
@@ -23,11 +23,11 @@ class HexagonMCELFStreamer : public MCELFStreamer {
public:
HexagonMCELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter);
HexagonMCELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter,
MCAssembler *Assembler);
@@ -43,7 +43,7 @@ public:
MCStreamer *createHexagonELFStreamer(Triple const &TT, MCContext &Context,
std::unique_ptr<MCAsmBackend> MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> CE);
} // end namespace llvm
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
index 19308cd425e8..a11aa92ccbe1 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
@@ -158,23 +158,6 @@ bool HexagonMCInstrInfo::canonicalizePacket(MCInstrInfo const &MCII,
return true;
}
-void HexagonMCInstrInfo::clampExtended(MCInstrInfo const &MCII,
- MCContext &Context, MCInst &MCI) {
- assert(HexagonMCInstrInfo::isExtendable(MCII, MCI) ||
- HexagonMCInstrInfo::isExtended(MCII, MCI));
- MCOperand &exOp =
- MCI.getOperand(HexagonMCInstrInfo::getExtendableOp(MCII, MCI));
- // If the extended value is a constant, then use it for the extended and
- // for the extender instructions, masking off the lower 6 bits and
- // including the assumed bits.
- int64_t Value;
- if (exOp.getExpr()->evaluateAsAbsolute(Value)) {
- unsigned Shift = HexagonMCInstrInfo::getExtentAlignment(MCII, MCI);
- exOp.setExpr(HexagonMCExpr::create(
- MCConstantExpr::create((Value & 0x3f) << Shift, Context), Context));
- }
-}
-
MCInst HexagonMCInstrInfo::deriveExtender(MCInstrInfo const &MCII,
MCInst const &Inst,
MCOperand const &MO) {
@@ -330,16 +313,19 @@ unsigned HexagonMCInstrInfo::getExtentBits(MCInstrInfo const &MCII,
return ((F >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask);
}
+bool HexagonMCInstrInfo::isExtentSigned(MCInstrInfo const &MCII,
+ MCInst const &MCI) {
+ const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+ return (F >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask;
+}
+
/// Return the maximum value of an extendable operand.
int HexagonMCInstrInfo::getMaxValue(MCInstrInfo const &MCII,
MCInst const &MCI) {
- const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
- bool S = (F >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask;
-
assert(HexagonMCInstrInfo::isExtendable(MCII, MCI) ||
HexagonMCInstrInfo::isExtended(MCII, MCI));
- if (S) // if value is signed
+ if (HexagonMCInstrInfo::isExtentSigned(MCII, MCI)) // if value is signed
return (1 << (HexagonMCInstrInfo::getExtentBits(MCII, MCI) - 1)) - 1;
return (1 << HexagonMCInstrInfo::getExtentBits(MCII, MCI)) - 1;
}
@@ -347,13 +333,10 @@ int HexagonMCInstrInfo::getMaxValue(MCInstrInfo const &MCII,
/// Return the minimum value of an extendable operand.
int HexagonMCInstrInfo::getMinValue(MCInstrInfo const &MCII,
MCInst const &MCI) {
- const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
- bool S = (F >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask;
-
assert(HexagonMCInstrInfo::isExtendable(MCII, MCI) ||
HexagonMCInstrInfo::isExtended(MCII, MCI));
- if (S) // if value is signed
+ if (HexagonMCInstrInfo::isExtentSigned(MCII, MCI)) // if value is signed
return -(1 << (HexagonMCInstrInfo::getExtentBits(MCII, MCI) - 1));
return 0;
}
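The new isExtentSigned() helper factors the TSFlags probe out of getMaxValue() and getMinValue(), leaving those two to compute only the usual two's-complement range of the extent width. As a rough standalone sketch of that arithmetic (plain C++, with the TSFlags lookup replaced by explicit parameters):

#include <cassert>
#include <cstdint>

// Range of an extendable operand with Bits significant bits, before a
// constant extender becomes necessary. Mirrors getMaxValue()/getMinValue().
static int64_t maxExtentValue(unsigned Bits, bool Signed) {
  assert(Bits > 0 && Bits < 32);
  return Signed ? (1 << (Bits - 1)) - 1  // e.g. 8 signed bits -> 127
                : (1 << Bits) - 1;       // e.g. 8 unsigned bits -> 255
}

static int64_t minExtentValue(unsigned Bits, bool Signed) {
  assert(Bits > 0 && Bits < 32);
  return Signed ? -(1 << (Bits - 1))     // e.g. 8 signed bits -> -128
                : 0;                     // unsigned operands start at 0
}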
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
index 28d89429266b..d040bea23b6d 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
@@ -103,9 +103,6 @@ MCInst deriveExtender(MCInstrInfo const &MCII, MCInst const &Inst,
// Convert this instruction in to a duplex subinst
MCInst deriveSubInst(MCInst const &Inst);
-// Clamp off upper 26 bits of extendable operand for emission
-void clampExtended(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI);
-
// Return the extender for instruction at Index or nullptr if none
MCInst const *extenderForIndex(MCInst const &MCB, size_t Index);
void extendIfNeeded(MCContext &Context, MCInstrInfo const &MCII, MCInst &MCB,
@@ -143,6 +140,9 @@ unsigned getExtentAlignment(MCInstrInfo const &MCII, MCInst const &MCI);
// Return the number of logical bits of the extendable operand
unsigned getExtentBits(MCInstrInfo const &MCII, MCInst const &MCI);
+// Check if the extendable operand is signed.
+bool isExtentSigned(MCInstrInfo const &MCII, MCInst const &MCI);
+
// Return the max value that a constant extendable operand can have
// without being extended.
int getMaxValue(MCInstrInfo const &MCII, MCInst const &MCI);
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
index 7bd54fdfa3d5..4281144acaee 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
@@ -38,7 +38,8 @@ void HexagonMCShuffler::init(MCInst &MCB) {
// Copy the bundle for the shuffling.
for (const auto &I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
MCInst &MI = *const_cast<MCInst *>(I.getInst());
- DEBUG(dbgs() << "Shuffling: " << MCII.getName(MI.getOpcode()) << '\n');
+ LLVM_DEBUG(dbgs() << "Shuffling: " << MCII.getName(MI.getOpcode())
+ << '\n');
assert(!HexagonMCInstrInfo::getDesc(MCII, MI).isPseudo());
if (!HexagonMCInstrInfo::isImmext(MI)) {
@@ -98,7 +99,7 @@ bool HexagonMCShuffler::reshuffleTo(MCInst &MCB) {
copyTo(MCB);
return true;
}
- DEBUG(MCB.dump());
+ LLVM_DEBUG(MCB.dump());
return false;
}
@@ -119,10 +120,10 @@ bool llvm::HexagonMCShuffle(MCContext &Context, bool Fatal,
// * %d7 = IMPLICIT_DEF; flags:
// After the IMPLICIT_DEFs were removed by the asm printer, the bundle
// became empty.
- DEBUG(dbgs() << "Skipping empty bundle");
+ LLVM_DEBUG(dbgs() << "Skipping empty bundle");
return false;
} else if (!HexagonMCInstrInfo::isBundle(MCB)) {
- DEBUG(dbgs() << "Skipping stand-alone insn");
+ LLVM_DEBUG(dbgs() << "Skipping stand-alone insn");
return false;
}
@@ -144,10 +145,10 @@ llvm::HexagonMCShuffle(MCContext &Context, MCInstrInfo const &MCII,
// * %d7 = IMPLICIT_DEF; flags:
// After the IMPLICIT_DEFs were removed by the asm printer, the bundle
// became empty.
- DEBUG(dbgs() << "Skipping empty bundle");
+ LLVM_DEBUG(dbgs() << "Skipping empty bundle");
return false;
} else if (!HexagonMCInstrInfo::isBundle(MCB)) {
- DEBUG(dbgs() << "Skipping stand-alone insn");
+ LLVM_DEBUG(dbgs() << "Skipping stand-alone insn");
return false;
}
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 3fbe2197f937..b211a81524fb 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -29,6 +29,7 @@
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -147,7 +148,7 @@ public:
auto PacketBundle = Contents.rsplit('\n');
auto HeadTail = PacketBundle.first.split('\n');
StringRef Separator = "\n";
- StringRef Indent = "\t\t";
+ StringRef Indent = "\t";
OS << "\t{\n";
while (!HeadTail.first.empty()) {
StringRef InstTxt;
@@ -164,7 +165,7 @@ public:
}
if (HexagonMCInstrInfo::isMemReorderDisabled(Inst))
- OS << "\n\t}:mem_noshuf" << PacketBundle.second;
+ OS << "\n\t} :mem_noshuf" << PacketBundle.second;
else
OS << "\t}" << PacketBundle.second;
}
@@ -248,10 +249,10 @@ createMCAsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS,
static MCStreamer *createMCStreamer(Triple const &T, MCContext &Context,
std::unique_ptr<MCAsmBackend> &&MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&Emitter,
bool RelaxAll) {
- return createHexagonELFStreamer(T, Context, std::move(MAB), OS,
+ return createHexagonELFStreamer(T, Context, std::move(MAB), std::move(OW),
std::move(Emitter));
}
@@ -308,6 +309,7 @@ static bool isCPUValid(std::string CPU)
{
std::vector<std::string> table
{
+ "generic",
"hexagonv4",
"hexagonv5",
"hexagonv55",
@@ -342,8 +344,7 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) {
break;
}
bool UseHvx = false;
- for (unsigned F : {ExtensionHVX, ExtensionHVX64B, ExtensionHVX128B,
- ExtensionHVXDbl}) {
+ for (unsigned F : {ExtensionHVX, ExtensionHVX64B, ExtensionHVX128B}) {
if (!FB.test(F))
continue;
UseHvx = true;
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
index 05d17c368dcc..6cd1b3a4691f 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
@@ -27,7 +27,7 @@ class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
-class MCObjectWriter;
+class MCObjectTargetWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
class MCTargetOptions;
@@ -61,13 +61,12 @@ MCCodeEmitter *createHexagonMCCodeEmitter(const MCInstrInfo &MCII,
MCContext &MCT);
MCAsmBackend *createHexagonAsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
const MCTargetOptions &Options);
-std::unique_ptr<MCObjectWriter>
-createHexagonELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI,
- StringRef CPU);
+std::unique_ptr<MCObjectTargetWriter>
+createHexagonELFObjectWriter(uint8_t OSABI, StringRef CPU);
unsigned HexagonGetLastSlot();
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
index 7709a0f61624..59f3caa6af94 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
@@ -641,14 +641,14 @@ bool HexagonShuffler::shuffle() {
}
for (iterator ISJ = begin(); ISJ != end(); ++ISJ)
- DEBUG(dbgs().write_hex(ISJ->Core.getUnits()); if (ISJ->CVI.isValid()) {
+ LLVM_DEBUG(dbgs().write_hex(ISJ->Core.getUnits()); if (ISJ->CVI.isValid()) {
dbgs() << '/';
dbgs().write_hex(ISJ->CVI.getUnits()) << '|';
dbgs() << ISJ->CVI.getLanes();
} dbgs() << ':'
<< HexagonMCInstrInfo::getDesc(MCII, ISJ->getDesc()).getOpcode();
- dbgs() << '\n');
- DEBUG(dbgs() << '\n');
+ dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
return Ok;
}
diff --git a/lib/Target/Hexagon/RDFCopy.cpp b/lib/Target/Hexagon/RDFCopy.cpp
index f8c766ac972c..4339fa2089d9 100644
--- a/lib/Target/Hexagon/RDFCopy.cpp
+++ b/lib/Target/Hexagon/RDFCopy.cpp
@@ -18,6 +18,7 @@
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
@@ -103,7 +104,7 @@ bool CopyPropagation::run() {
if (trace()) {
dbgs() << "Copies:\n";
- for (auto I : Copies) {
+ for (NodeId I : Copies) {
dbgs() << "Instr: " << *DFG.addr<StmtNode*>(I).Addr->getCode();
dbgs() << " eq: {";
for (auto J : CopyMap[I])
@@ -130,7 +131,7 @@ bool CopyPropagation::run() {
return 0;
};
- for (auto C : Copies) {
+ for (NodeId C : Copies) {
#ifndef NDEBUG
if (HasLimit && CpCount >= CpLimit)
break;
diff --git a/lib/Target/Hexagon/RDFDeadCode.cpp b/lib/Target/Hexagon/RDFDeadCode.cpp
index 240d7c355bc7..da339bfd3ff4 100644
--- a/lib/Target/Hexagon/RDFDeadCode.cpp
+++ b/lib/Target/Hexagon/RDFDeadCode.cpp
@@ -214,7 +214,7 @@ bool DeadCodeElimination::erase(const SetVector<NodeId> &Nodes) {
return false;
return A.Id < B.Id;
};
- std::sort(DRNs.begin(), DRNs.end(), UsesFirst);
+ llvm::sort(DRNs.begin(), DRNs.end(), UsesFirst);
if (trace())
dbgs() << "Removing dead ref nodes:\n";
diff --git a/lib/Target/Hexagon/RDFGraph.cpp b/lib/Target/Hexagon/RDFGraph.cpp
index d1f6e5a4c8ef..3d1ec31dada7 100644
--- a/lib/Target/Hexagon/RDFGraph.cpp
+++ b/lib/Target/Hexagon/RDFGraph.cpp
@@ -893,7 +893,7 @@ void DataFlowGraph::build(unsigned Options) {
NodeAddr<BlockNode*> BA = newBlock(Func, &B);
BlockNodes.insert(std::make_pair(&B, BA));
for (MachineInstr &I : B) {
- if (I.isDebugValue())
+ if (I.isDebugInstr())
continue;
buildStmt(BA, I);
}
@@ -1471,7 +1471,7 @@ void DataFlowGraph::buildPhis(BlockRefsMap &PhiM, RegisterSet &AllRefs,
// and add a def for each S in the closure.
// Sort the refs so that the phis will be created in a deterministic order.
- std::sort(MaxRefs.begin(), MaxRefs.end());
+ llvm::sort(MaxRefs.begin(), MaxRefs.end());
// Remove duplicates.
auto NewEnd = std::unique(MaxRefs.begin(), MaxRefs.end());
MaxRefs.erase(NewEnd, MaxRefs.end());
diff --git a/lib/Target/Hexagon/RDFLiveness.cpp b/lib/Target/Hexagon/RDFLiveness.cpp
index 13d9a1741978..c257d754ddf9 100644
--- a/lib/Target/Hexagon/RDFLiveness.cpp
+++ b/lib/Target/Hexagon/RDFLiveness.cpp
@@ -207,7 +207,7 @@ NodeList Liveness::getAllReachingDefs(RegisterRef RefRR,
};
std::vector<NodeId> Tmp(Owners.begin(), Owners.end());
- std::sort(Tmp.begin(), Tmp.end(), Less);
+ llvm::sort(Tmp.begin(), Tmp.end(), Less);
// The vector is a list of instructions, so that defs coming from
// the same instruction don't need to be artificially ordered.
@@ -628,7 +628,7 @@ void Liveness::computePhiInfo() {
// Collect the set PropUp of uses that are reached by the current
// phi PA, and are not covered by any intervening def between the
- // currently visited use UA and the the upward phi P.
+ // currently visited use UA and the upward phi P.
if (MidDefs.hasCoverOf(UR))
continue;
@@ -813,7 +813,7 @@ void Liveness::computeLiveIns() {
std::vector<RegisterRef> LV;
for (auto I = B.livein_begin(), E = B.livein_end(); I != E; ++I)
LV.push_back(RegisterRef(I->PhysReg, I->LaneMask));
- std::sort(LV.begin(), LV.end());
+ llvm::sort(LV.begin(), LV.end());
dbgs() << printMBBReference(B) << "\t rec = {";
for (auto I : LV)
dbgs() << ' ' << Print<RegisterRef>(I, DFG);
@@ -824,7 +824,7 @@ void Liveness::computeLiveIns() {
const RegisterAggr &LG = LiveMap[&B];
for (auto I = LG.rr_begin(), E = LG.rr_end(); I != E; ++I)
LV.push_back(*I);
- std::sort(LV.begin(), LV.end());
+ llvm::sort(LV.begin(), LV.end());
dbgs() << "\tcomp = {";
for (auto I : LV)
dbgs() << ' ' << Print<RegisterRef>(I, DFG);
@@ -880,7 +880,7 @@ void Liveness::resetKills(MachineBasicBlock *B) {
for (auto I = B->rbegin(), E = B->rend(); I != E; ++I) {
MachineInstr *MI = &*I;
- if (MI->isDebugValue())
+ if (MI->isDebugInstr())
continue;
MI->clearKillInfo();
diff --git a/lib/Target/Hexagon/RDFLiveness.h b/lib/Target/Hexagon/RDFLiveness.h
index 8cfb6a1e9554..eaeb4ea115b3 100644
--- a/lib/Target/Hexagon/RDFLiveness.h
+++ b/lib/Target/Hexagon/RDFLiveness.h
@@ -53,8 +53,8 @@ namespace rdf {
using RefMap = std::map<RegisterId, NodeRefSet>;
Liveness(MachineRegisterInfo &mri, const DataFlowGraph &g)
- : DFG(g), TRI(g.getTRI()), PRI(g.getPRI()), MDT(g.getDT()),
- MDF(g.getDF()), LiveMap(g.getPRI()), NoRegs(g.getPRI()) {}
+ : DFG(g), TRI(g.getTRI()), PRI(g.getPRI()), MDT(g.getDT()),
+ MDF(g.getDF()), LiveMap(g.getPRI()), Empty(), NoRegs(g.getPRI()) {}
NodeList getAllReachingDefs(RegisterRef RefRR, NodeAddr<RefNode*> RefA,
bool TopShadows, bool FullChain, const RegisterAggr &DefRRs);
diff --git a/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp b/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp
index a330f27ed300..78e2f2b2ddb3 100644
--- a/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp
+++ b/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp
@@ -18,6 +18,6 @@ Target &llvm::getTheHexagonTarget() {
}
extern "C" void LLVMInitializeHexagonTargetInfo() {
- RegisterTarget<Triple::hexagon, /*HasJIT=*/false> X(
+ RegisterTarget<Triple::hexagon, /*HasJIT=*/true> X(
getTheHexagonTarget(), "hexagon", "Hexagon", "Hexagon");
}
diff --git a/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
index 3f24c3ef3902..a77b2b8f15ca 100644
--- a/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
+++ b/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
@@ -475,8 +475,8 @@ public:
} else if (isa<MCBinaryExpr>(getImm())) {
#ifndef NDEBUG
const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(getImm());
- assert(dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS()) &&
- dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS())->getKind() ==
+ assert(isa<LanaiMCExpr>(BinaryExpr->getLHS()) &&
+ cast<LanaiMCExpr>(BinaryExpr->getLHS())->getKind() ==
LanaiMCExpr::VK_Lanai_ABS_LO);
#endif
Inst.addOperand(MCOperand::createExpr(getImm()));
@@ -505,8 +505,8 @@ public:
} else if (isa<MCBinaryExpr>(getImm())) {
#ifndef NDEBUG
const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(getImm());
- assert(dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS()) &&
- dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS())->getKind() ==
+ assert(isa<LanaiMCExpr>(BinaryExpr->getLHS()) &&
+ cast<LanaiMCExpr>(BinaryExpr->getLHS())->getKind() ==
LanaiMCExpr::VK_Lanai_ABS_HI);
#endif
Inst.addOperand(MCOperand::createExpr(getImm()));
diff --git a/lib/Target/Lanai/CMakeLists.txt b/lib/Target/Lanai/CMakeLists.txt
index 867f6165c253..c3ad807b312b 100644
--- a/lib/Target/Lanai/CMakeLists.txt
+++ b/lib/Target/Lanai/CMakeLists.txt
@@ -9,6 +9,7 @@ tablegen(LLVM LanaiGenInstrInfo.inc -gen-instr-info)
tablegen(LLVM LanaiGenMCCodeEmitter.inc -gen-emitter)
tablegen(LLVM LanaiGenRegisterInfo.inc -gen-register-info)
tablegen(LLVM LanaiGenSubtargetInfo.inc -gen-subtarget)
+
add_public_tablegen_target(LanaiCommonTableGen)
add_llvm_target(LanaiCodeGen
@@ -29,7 +30,7 @@ add_llvm_target(LanaiCodeGen
)
add_subdirectory(AsmParser)
-add_subdirectory(TargetInfo)
-add_subdirectory(MCTargetDesc)
-add_subdirectory(InstPrinter)
add_subdirectory(Disassembler)
+add_subdirectory(InstPrinter)
+add_subdirectory(MCTargetDesc)
+add_subdirectory(TargetInfo)
diff --git a/lib/Target/Lanai/LanaiDelaySlotFiller.cpp b/lib/Target/Lanai/LanaiDelaySlotFiller.cpp
index 6b4fa7771783..ea76a1128373 100644
--- a/lib/Target/Lanai/LanaiDelaySlotFiller.cpp
+++ b/lib/Target/Lanai/LanaiDelaySlotFiller.cpp
@@ -156,7 +156,7 @@ bool Filler::findDelayInstr(MachineBasicBlock &MBB,
for (MachineBasicBlock::reverse_instr_iterator I = ++Slot.getReverse();
I != MBB.instr_rend(); ++I) {
// skip debug value
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
// Convert to forward iterator.
diff --git a/lib/Target/Lanai/LanaiISelDAGToDAG.cpp b/lib/Target/Lanai/LanaiISelDAGToDAG.cpp
index ed0c99a76ce4..5081cfbe4922 100644
--- a/lib/Target/Lanai/LanaiISelDAGToDAG.cpp
+++ b/lib/Target/Lanai/LanaiISelDAGToDAG.cpp
@@ -273,12 +273,9 @@ bool LanaiDAGToDAGISel::SelectInlineAsmMemoryOperand(
void LanaiDAGToDAGISel::Select(SDNode *Node) {
unsigned Opcode = Node->getOpcode();
- // Dump information about the Node being selected
- DEBUG(errs() << "Selecting: "; Node->dump(CurDAG); errs() << "\n");
-
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
- DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+ LLVM_DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
return;
}
@@ -319,7 +316,7 @@ void LanaiDAGToDAGISel::Select(SDNode *Node) {
void LanaiDAGToDAGISel::selectFrameIndex(SDNode *Node) {
SDLoc DL(Node);
SDValue Imm = CurDAG->getTargetConstant(0, DL, MVT::i32);
- int FI = dyn_cast<FrameIndexSDNode>(Node)->getIndex();
+ int FI = cast<FrameIndexSDNode>(Node)->getIndex();
EVT VT = Node->getValueType(0);
SDValue TFI = CurDAG->getTargetFrameIndex(FI, VT);
unsigned Opc = Lanai::ADD_I_LO;
diff --git a/lib/Target/Lanai/LanaiISelLowering.cpp b/lib/Target/Lanai/LanaiISelLowering.cpp
index 17567436384e..045a897c4126 100644
--- a/lib/Target/Lanai/LanaiISelLowering.cpp
+++ b/lib/Target/Lanai/LanaiISelLowering.cpp
@@ -28,7 +28,6 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -44,6 +43,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -87,7 +87,6 @@ LanaiTargetLowering::LanaiTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
setOperationAction(ISD::BRCOND, MVT::Other, Expand);
setOperationAction(ISD::SETCC, MVT::i32, Custom);
- setOperationAction(ISD::SETCCE, MVT::i32, Custom);
setOperationAction(ISD::SELECT, MVT::i32, Expand);
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
@@ -193,8 +192,6 @@ SDValue LanaiTargetLowering::LowerOperation(SDValue Op,
return LowerSELECT_CC(Op, DAG);
case ISD::SETCC:
return LowerSETCC(Op, DAG);
- case ISD::SETCCE:
- return LowerSETCCE(Op, DAG);
case ISD::SHL_PARTS:
return LowerSHL_PARTS(Op, DAG);
case ISD::SRL_PARTS:
@@ -484,8 +481,8 @@ SDValue LanaiTargetLowering::LowerCCCArguments(
break;
}
default:
- DEBUG(dbgs() << "LowerFormalArguments Unhandled argument type: "
- << RegVT.getEVTString() << "\n");
+ LLVM_DEBUG(dbgs() << "LowerFormalArguments Unhandled argument type: "
+ << RegVT.getEVTString() << "\n");
llvm_unreachable("unhandled argument type");
}
} else {
@@ -969,19 +966,6 @@ SDValue LanaiTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
return Res;
}
-SDValue LanaiTargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
- SDValue LHS = Op.getOperand(0);
- SDValue RHS = Op.getOperand(1);
- SDValue Carry = Op.getOperand(2);
- SDValue Cond = Op.getOperand(3);
- SDLoc DL(Op);
-
- LPCC::CondCode CC = IntCondCCodeToICC(Cond, DL, RHS, DAG);
- SDValue TargetCC = DAG.getConstant(CC, DL, MVT::i32);
- SDValue Flag = DAG.getNode(LanaiISD::SUBBF, DL, MVT::Glue, LHS, RHS, Carry);
- return DAG.getNode(LanaiISD::SETCC, DL, Op.getValueType(), TargetCC, Flag);
-}
-
SDValue LanaiTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
diff --git a/lib/Target/Lanai/LanaiISelLowering.h b/lib/Target/Lanai/LanaiISelLowering.h
index 46024e6fd508..0cde633cb41a 100644
--- a/lib/Target/Lanai/LanaiISelLowering.h
+++ b/lib/Target/Lanai/LanaiISelLowering.h
@@ -87,7 +87,6 @@ public:
SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/Lanai/LanaiInstrFormats.td b/lib/Target/Lanai/LanaiInstrFormats.td
index 30289ea4ac0b..1bb6b3d26a49 100644
--- a/lib/Target/Lanai/LanaiInstrFormats.td
+++ b/lib/Target/Lanai/LanaiInstrFormats.td
@@ -482,7 +482,7 @@ class InstSLI<dag outs, dag ins, string asmstr, list<dag> pattern>
// Memory(ea) <- (least significant half-word of Rr)
// If `YS' = 10 (bYte load): Rr <- Memory(ea)
// If `YS' = 00 (halfword load): Rr <- Memory(ea)
-// [Note: here ea is determined as in the the RM instruction. ]
+// [Note: here ea is determined as in the RM instruction. ]
// If `SE' = 01 then the value is zEro extended
// before being loaded into Rd.
// If `SE' = 00 then the value is sign extended
diff --git a/lib/Target/Lanai/LanaiInstrInfo.cpp b/lib/Target/Lanai/LanaiInstrInfo.cpp
index a7c9a7a7f280..493d02bef37c 100644
--- a/lib/Target/Lanai/LanaiInstrInfo.cpp
+++ b/lib/Target/Lanai/LanaiInstrInfo.cpp
@@ -573,8 +573,8 @@ bool LanaiInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
while (Instruction != MBB.begin()) {
--Instruction;
- // Skip over debug values.
- if (Instruction->isDebugValue())
+ // Skip over debug instructions.
+ if (Instruction->isDebugInstr())
continue;
// Working from the bottom, when we see a non-terminator
@@ -699,7 +699,7 @@ unsigned LanaiInstrInfo::removeBranch(MachineBasicBlock &MBB,
while (Instruction != MBB.begin()) {
--Instruction;
- if (Instruction->isDebugValue())
+ if (Instruction->isDebugInstr())
continue;
if (Instruction->getOpcode() != Lanai::BT &&
Instruction->getOpcode() != Lanai::BRCC) {
diff --git a/lib/Target/Lanai/LanaiInstrInfo.h b/lib/Target/Lanai/LanaiInstrInfo.h
index f07fede67a41..fe22fde2470b 100644
--- a/lib/Target/Lanai/LanaiInstrInfo.h
+++ b/lib/Target/Lanai/LanaiInstrInfo.h
@@ -15,6 +15,7 @@
#define LLVM_LIB_TARGET_LANAI_LANAIINSTRINFO_H
#include "LanaiRegisterInfo.h"
+#include "MCTargetDesc/LanaiMCTargetDesc.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#define GET_INSTRINFO_HEADER
diff --git a/lib/Target/Lanai/LanaiInstrInfo.td b/lib/Target/Lanai/LanaiInstrInfo.td
index 776fee101dfe..66192b4a4704 100644
--- a/lib/Target/Lanai/LanaiInstrInfo.td
+++ b/lib/Target/Lanai/LanaiInstrInfo.td
@@ -842,6 +842,10 @@ def : Pat<(Call texternalsym:$dst), (CALL texternalsym:$dst)>;
// Loads
def : Pat<(extloadi8 ADDRspls:$src), (i32 (LDBz_RI ADDRspls:$src))>;
def : Pat<(extloadi16 ADDRspls:$src), (i32 (LDHz_RI ADDRspls:$src))>;
+// Loads up to 32-bits are already atomic.
+// TODO: This is a workaround for a particular failing case and should be
+// handled more generally.
+def : Pat<(atomic_load_8 ADDRspls:$src), (i32 (LDBz_RI ADDRspls:$src))>;
// GlobalAddress, ExternalSymbol, Jumptable, ConstantPool
def : Pat<(LanaiHi tglobaladdr:$dst), (MOVHI tglobaladdr:$dst)>;
diff --git a/lib/Target/Lanai/LanaiMemAluCombiner.cpp b/lib/Target/Lanai/LanaiMemAluCombiner.cpp
index c29c933db747..35e2542dfb13 100644
--- a/lib/Target/Lanai/LanaiMemAluCombiner.cpp
+++ b/lib/Target/Lanai/LanaiMemAluCombiner.cpp
@@ -343,7 +343,7 @@ MbbIterator LanaiMemAluCombiner::findClosestSuitableAluInstr(
break;
// Skip over debug instructions
- if (First->isDebugValue())
+ if (First->isDebugInstr())
continue;
if (isSuitableAluInstr(IsSpls, First, *Base, *Offset)) {
diff --git a/lib/Target/Lanai/LanaiTargetObjectFile.cpp b/lib/Target/Lanai/LanaiTargetObjectFile.cpp
index 38e75108ba16..7d165e9c5f8c 100644
--- a/lib/Target/Lanai/LanaiTargetObjectFile.cpp
+++ b/lib/Target/Lanai/LanaiTargetObjectFile.cpp
@@ -50,8 +50,7 @@ static bool isInSmallSection(uint64_t Size) {
// section.
bool LanaiTargetObjectFile::isGlobalInSmallSection(
const GlobalObject *GO, const TargetMachine &TM) const {
- if (GO == nullptr)
- return false;
+ if (GO == nullptr) return TM.getCodeModel() == CodeModel::Small;
// We first check the case where global is a declaration, because finding
// section kind using getKindForGlobal() is only allowed for global
@@ -67,8 +66,7 @@ bool LanaiTargetObjectFile::isGlobalInSmallSection(
bool LanaiTargetObjectFile::isGlobalInSmallSection(const GlobalObject *GO,
const TargetMachine &TM,
SectionKind Kind) const {
- return (isGlobalInSmallSectionImpl(GO, TM) &&
- (Kind.isData() || Kind.isBSS() || Kind.isCommon()));
+ return isGlobalInSmallSectionImpl(GO, TM);
}
// Return true if this global address should be placed into small data/bss
@@ -76,10 +74,10 @@ bool LanaiTargetObjectFile::isGlobalInSmallSection(const GlobalObject *GO,
// kind.
bool LanaiTargetObjectFile::isGlobalInSmallSectionImpl(
const GlobalObject *GO, const TargetMachine &TM) const {
- // Only global variables, not functions.
const auto *GVA = dyn_cast<GlobalVariable>(GO);
- if (!GVA)
- return false;
+
+ // If not a GlobalVariable, only consider the code model.
+ if (!GVA) return TM.getCodeModel() == CodeModel::Small;
// Global values placed in sections starting with .ldata do not fit in
// 21-bits, so always use large memory access for them. FIXME: This is a
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
index c4935746f5ad..82fa93ea5e5e 100644
--- a/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
@@ -47,14 +47,15 @@ class LanaiAsmBackend : public MCAsmBackend {
public:
LanaiAsmBackend(const Target &T, Triple::OSType OST)
- : MCAsmBackend(), OSType(OST) {}
+ : MCAsmBackend(support::big), OSType(OST) {}
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsResolved) const override;
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const override;
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override;
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override;
// No instruction requires relaxation
bool fixupNeedsRelaxation(const MCFixup & /*Fixup*/, uint64_t /*Value*/,
@@ -69,7 +70,8 @@ public:
return Lanai::NumTargetFixupKinds;
}
- bool mayNeedRelaxation(const MCInst & /*Inst*/) const override {
+ bool mayNeedRelaxation(const MCInst & /*Inst*/,
+ const MCSubtargetInfo &STI) const override {
return false;
}
@@ -77,15 +79,15 @@ public:
const MCSubtargetInfo & /*STI*/,
MCInst & /*Res*/) const override {}
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
};
-bool LanaiAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+bool LanaiAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
if ((Count % 4) != 0)
return false;
for (uint64_t i = 0; i < Count; i += 4)
- OW->write32(0x15000000);
+ OS.write("\x15\0\0\0", 4);
return true;
}
@@ -93,7 +95,8 @@ bool LanaiAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
void LanaiAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target,
MutableArrayRef<char> Data, uint64_t Value,
- bool /*IsResolved*/) const {
+ bool /*IsResolved*/,
+ const MCSubtargetInfo * /*STI*/) const {
MCFixupKind Kind = Fixup.getKind();
Value = adjustFixupValue(static_cast<unsigned>(Kind), Value);
@@ -127,10 +130,9 @@ void LanaiAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
}
}
-std::unique_ptr<MCObjectWriter>
-LanaiAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
- return createLanaiELFObjectWriter(OS,
- MCELFObjectTargetWriter::getOSABI(OSType));
+std::unique_ptr<MCObjectTargetWriter>
+LanaiAsmBackend::createObjectTargetWriter() const {
+ return createLanaiELFObjectWriter(MCELFObjectTargetWriter::getOSABI(OSType));
}
const MCFixupKindInfo &
@@ -165,9 +167,10 @@ LanaiAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
} // namespace
MCAsmBackend *llvm::createLanaiAsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
const MCRegisterInfo & /*MRI*/,
- const Triple &TT, StringRef /*CPU*/,
const MCTargetOptions & /*Options*/) {
+ const Triple &TT = STI.getTargetTriple();
if (!TT.isOSBinFormatELF())
llvm_unreachable("OS not supported");
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp
index 3c40176d2f60..7676891ef981 100644
--- a/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp
@@ -87,8 +87,7 @@ bool LanaiELFObjectWriter::needsRelocateWithSymbol(const MCSymbol & /*SD*/,
}
}
-std::unique_ptr<MCObjectWriter>
-llvm::createLanaiELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI) {
- return createELFObjectWriter(llvm::make_unique<LanaiELFObjectWriter>(OSABI),
- OS, /*IsLittleEndian=*/false);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createLanaiELFObjectWriter(uint8_t OSABI) {
+ return llvm::make_unique<LanaiELFObjectWriter>(OSABI);
}
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp
index c3727416ecb9..21f4005aaf83 100644
--- a/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp
@@ -12,7 +12,6 @@
//===----------------------------------------------------------------------===//
#include "Lanai.h"
-#include "LanaiAluCode.h"
#include "MCTargetDesc/LanaiBaseInfo.h"
#include "MCTargetDesc/LanaiFixupKinds.h"
#include "MCTargetDesc/LanaiMCExpr.h"
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
index 74d929450ed2..ddb01cdd2d8f 100644
--- a/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
@@ -61,14 +61,14 @@ createLanaiMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
std::unique_ptr<MCAsmBackend> &&MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&Emitter,
bool RelaxAll) {
if (!T.isOSBinFormatELF())
llvm_unreachable("OS not supported");
- return createELFStreamer(Context, std::move(MAB), OS, std::move(Emitter),
- RelaxAll);
+ return createELFStreamer(Context, std::move(MAB), std::move(OW),
+ std::move(Emitter), RelaxAll);
}
static MCInstPrinter *createLanaiMCInstPrinter(const Triple & /*T*/,
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h b/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h
index 5bc84ad83870..2d8828ea4fa9 100644
--- a/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h
@@ -24,7 +24,7 @@ class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
class MCInstrAnalysis;
-class MCObjectWriter;
+class MCObjectTargetWriter;
class MCRelocationInfo;
class MCSubtargetInfo;
class Target;
@@ -38,12 +38,11 @@ MCCodeEmitter *createLanaiMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
MCContext &Ctx);
-MCAsmBackend *createLanaiAsmBackend(const Target &T, const MCRegisterInfo &MRI,
- const Triple &TheTriple, StringRef CPU,
+MCAsmBackend *createLanaiAsmBackend(const Target &T, const MCSubtargetInfo &STI,
+ const MCRegisterInfo &MRI,
const MCTargetOptions &Options);
-std::unique_ptr<MCObjectWriter>
-createLanaiELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI);
+std::unique_ptr<MCObjectTargetWriter> createLanaiELFObjectWriter(uint8_t OSABI);
} // namespace llvm
// Defines symbolic names for Lanai registers. This defines a mapping from
diff --git a/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp b/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp
index 5eed0cb28361..ccf47b08fcff 100644
--- a/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp
+++ b/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp
@@ -7,7 +7,6 @@
//
//===----------------------------------------------------------------------===//
-#include "Lanai.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/TargetRegistry.h"
diff --git a/lib/Target/MSP430/CMakeLists.txt b/lib/Target/MSP430/CMakeLists.txt
index 3f377631c016..3facfd526a53 100644
--- a/lib/Target/MSP430/CMakeLists.txt
+++ b/lib/Target/MSP430/CMakeLists.txt
@@ -1,11 +1,12 @@
set(LLVM_TARGET_DEFINITIONS MSP430.td)
-tablegen(LLVM MSP430GenRegisterInfo.inc -gen-register-info)
-tablegen(LLVM MSP430GenInstrInfo.inc -gen-instr-info)
tablegen(LLVM MSP430GenAsmWriter.inc -gen-asm-writer)
-tablegen(LLVM MSP430GenDAGISel.inc -gen-dag-isel)
tablegen(LLVM MSP430GenCallingConv.inc -gen-callingconv)
+tablegen(LLVM MSP430GenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM MSP430GenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM MSP430GenRegisterInfo.inc -gen-register-info)
tablegen(LLVM MSP430GenSubtargetInfo.inc -gen-subtarget)
+
add_public_tablegen_target(MSP430CommonTableGen)
add_llvm_target(MSP430CodeGen
@@ -23,5 +24,5 @@ add_llvm_target(MSP430CodeGen
)
add_subdirectory(InstPrinter)
-add_subdirectory(TargetInfo)
add_subdirectory(MCTargetDesc)
+add_subdirectory(TargetInfo)
diff --git a/lib/Target/MSP430/MSP430BranchSelector.cpp b/lib/Target/MSP430/MSP430BranchSelector.cpp
index 87c320aa76aa..2b3495405545 100644
--- a/lib/Target/MSP430/MSP430BranchSelector.cpp
+++ b/lib/Target/MSP430/MSP430BranchSelector.cpp
@@ -138,15 +138,15 @@ bool MSP430BSel::expandBranches(OffsetVector &BlockOffsets) {
continue;
}
- DEBUG(dbgs() << " Found a branch that needs expanding, "
- << printMBBReference(*DestBB) << ", Distance "
- << BranchDistance << "\n");
+ LLVM_DEBUG(dbgs() << " Found a branch that needs expanding, "
+ << printMBBReference(*DestBB) << ", Distance "
+ << BranchDistance << "\n");
// If JCC is not the last instruction we need to split the MBB.
if (MI->getOpcode() == MSP430::JCC && std::next(MI) != EE) {
- DEBUG(dbgs() << " Found a basic block that needs to be split, "
- << printMBBReference(*MBB) << "\n");
+ LLVM_DEBUG(dbgs() << " Found a basic block that needs to be split, "
+ << printMBBReference(*MBB) << "\n");
// Create a new basic block.
MachineBasicBlock *NewBB =
@@ -229,7 +229,7 @@ bool MSP430BSel::runOnMachineFunction(MachineFunction &mf) {
if (!BranchSelectEnabled)
return false;
- DEBUG(dbgs() << "\n********** " << getPassName() << " **********\n");
+ LLVM_DEBUG(dbgs() << "\n********** " << getPassName() << " **********\n");
// BlockOffsets - Contains the distance from the beginning of the function to
// the beginning of each basic block.
diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
index bebf7478bccf..005f5f44a635 100644
--- a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
+++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
@@ -20,6 +20,7 @@
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
@@ -179,7 +180,7 @@ bool MSP430DAGToDAGISel::MatchAddressBase(SDValue N, MSP430ISelAddressMode &AM)
}
bool MSP430DAGToDAGISel::MatchAddress(SDValue N, MSP430ISelAddressMode &AM) {
- DEBUG(errs() << "MatchAddress: "; AM.dump());
+ LLVM_DEBUG(errs() << "MatchAddress: "; AM.dump());
switch (N.getOpcode()) {
default: break;
@@ -381,16 +382,9 @@ bool MSP430DAGToDAGISel::tryIndexedBinOp(SDNode *Op, SDValue N1, SDValue N2,
void MSP430DAGToDAGISel::Select(SDNode *Node) {
SDLoc dl(Node);
- // Dump information about the Node being selected
- DEBUG(errs() << "Selecting: ");
- DEBUG(Node->dump(CurDAG));
- DEBUG(errs() << "\n");
-
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
- DEBUG(errs() << "== ";
- Node->dump(CurDAG);
- errs() << "\n");
+ LLVM_DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
Node->setNodeId(-1);
return;
}
diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp
index 6135ce080920..dd1b30a3e470 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.cpp
+++ b/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -113,7 +113,7 @@ unsigned MSP430InstrInfo::removeBranch(MachineBasicBlock &MBB,
while (I != MBB.begin()) {
--I;
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
if (I->getOpcode() != MSP430::JMP &&
I->getOpcode() != MSP430::JCC &&
@@ -183,7 +183,7 @@ bool MSP430InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I = MBB.end();
while (I != MBB.begin()) {
--I;
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
// Working from the bottom, when we see a non-terminator
diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 345b081500a4..2acf701b43cb 100644
--- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -146,10 +146,15 @@ class MipsAsmParser : public MCTargetAsmParser {
/// If true, then CpSaveLocation is a register, otherwise it's an offset.
bool CpSaveLocationIsRegister;
+ // Map of register aliases created via the .set directive.
+ StringMap<AsmToken> RegisterSets;
+
// Print a warning along with its fix-it message at the given range.
void printWarningWithFixIt(const Twine &Msg, const Twine &FixMsg,
SMRange Range, bool ShowColors = true);
+ void ConvertXWPOperands(MCInst &Inst, const OperandVector &Operands);
+
#define GET_ASSEMBLER_HEADER
#include "MipsGenAsmMatcher.inc"
@@ -182,12 +187,14 @@ class MipsAsmParser : public MCTargetAsmParser {
matchAnyRegisterNameWithoutDollar(OperandVector &Operands,
StringRef Identifier, SMLoc S);
OperandMatchResultTy matchAnyRegisterWithoutDollar(OperandVector &Operands,
+ const AsmToken &Token,
+ SMLoc S);
+ OperandMatchResultTy matchAnyRegisterWithoutDollar(OperandVector &Operands,
SMLoc S);
OperandMatchResultTy parseAnyRegister(OperandVector &Operands);
OperandMatchResultTy parseImm(OperandVector &Operands);
OperandMatchResultTy parseJumpTarget(OperandVector &Operands);
OperandMatchResultTy parseInvNum(OperandVector &Operands);
- OperandMatchResultTy parseRegisterPair(OperandVector &Operands);
OperandMatchResultTy parseMovePRegPair(OperandVector &Operands);
OperandMatchResultTy parseRegisterList(OperandVector &Operands);
@@ -235,13 +242,7 @@ class MipsAsmParser : public MCTargetAsmParser {
const MCSubtargetInfo *STI);
void expandMemInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
- const MCSubtargetInfo *STI, bool IsLoad, bool IsImmOpnd);
-
- void expandLoadInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
- const MCSubtargetInfo *STI, bool IsImmOpnd);
-
- void expandStoreInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
- const MCSubtargetInfo *STI, bool IsImmOpnd);
+ const MCSubtargetInfo *STI, bool IsLoad);
bool expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
@@ -255,9 +256,9 @@ class MipsAsmParser : public MCTargetAsmParser {
bool expandCondBranches(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
- bool expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
- const MCSubtargetInfo *STI, const bool IsMips64,
- const bool Signed);
+ bool expandDivRem(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI, const bool IsMips64,
+ const bool Signed);
bool expandTrunc(MCInst &Inst, bool IsDouble, bool Is64FPU, SMLoc IDLoc,
MCStreamer &Out, const MCSubtargetInfo *STI);
@@ -348,10 +349,12 @@ class MipsAsmParser : public MCTargetAsmParser {
bool parseSetHardFloatDirective();
bool parseSetMtDirective();
bool parseSetNoMtDirective();
+ bool parseSetNoCRCDirective();
+ bool parseSetNoVirtDirective();
+ bool parseSetNoGINVDirective();
bool parseSetAssignment();
- bool parseDataDirective(unsigned Size, SMLoc L);
bool parseDirectiveGpWord();
bool parseDirectiveGpDWord();
bool parseDirectiveDtpRelWord();
@@ -466,6 +469,7 @@ public:
Match_RequiresSameSrcAndDst,
Match_NoFCCRegisterForCurrentISA,
Match_NonZeroOperandForSync,
+ Match_NonZeroOperandForMTCX,
Match_RequiresPosSizeRange0_32,
Match_RequiresPosSizeRange33_64,
Match_RequiresPosSizeUImm6,
@@ -482,6 +486,9 @@ public:
MCAsmParserExtension::Initialize(parser);
parser.addAliasForDirective(".asciiz", ".asciz");
+ parser.addAliasForDirective(".hword", ".2byte");
+ parser.addAliasForDirective(".word", ".4byte");
+ parser.addAliasForDirective(".dword", ".8byte");
// Initialize the set of available features.
setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
@@ -507,14 +514,13 @@ public:
CpRestoreOffset = -1;
const Triple &TheTriple = sti.getTargetTriple();
- if ((TheTriple.getArch() == Triple::mips) ||
- (TheTriple.getArch() == Triple::mips64))
- IsLittleEndian = false;
- else
- IsLittleEndian = true;
+ IsLittleEndian = TheTriple.isLittleEndian();
if (getSTI().getCPU() == "mips64r6" && inMicroMipsMode())
report_fatal_error("microMIPS64R6 is not supported", false);
+
+ if (!isABI_O32() && inMicroMipsMode())
+ report_fatal_error("microMIPS64 is not supported", false);
}
/// True if all of $fcc0 - $fcc7 exist for the current ISA.
@@ -643,6 +649,18 @@ public:
return getSTI().getFeatureBits()[Mips::FeatureMT];
}
+ bool hasCRC() const {
+ return getSTI().getFeatureBits()[Mips::FeatureCRC];
+ }
+
+ bool hasVirt() const {
+ return getSTI().getFeatureBits()[Mips::FeatureVirt];
+ }
+
+ bool hasGINV() const {
+ return getSTI().getFeatureBits()[Mips::FeatureGINV];
+ }
+
/// Warn if RegIndex is the same as the current AT.
void warnIfRegIndexIsAT(unsigned RegIndex, SMLoc Loc);
@@ -1297,6 +1315,20 @@ public:
return IsReloc && isShiftedInt<Bits, ShiftAmount>(Res.getConstant());
}
+ bool isMemWithPtrSizeOffset() const {
+ if (!isMem())
+ return false;
+ if (!getMemBase()->isGPRAsmReg())
+ return false;
+ const unsigned PtrBits = AsmParser.getABI().ArePtrs64bit() ? 64 : 32;
+ if (isa<MCTargetExpr>(getMemOff()) ||
+ (isConstantMemOff() && isIntN(PtrBits, getConstantMemOff())))
+ return true;
+ MCValue Res;
+ bool IsReloc = getMemOff()->evaluateAsRelocatable(Res, nullptr, nullptr);
+ return IsReloc && isIntN(PtrBits, Res.getConstant());
+ }
+
bool isMemWithGRPMM16Base() const {
return isMem() && getMemBase()->isMM16AsmReg();
}
@@ -1326,9 +1358,11 @@ public:
template <unsigned Bits, unsigned ShiftLeftAmount>
bool isScaledSImm() const {
- if (isConstantImm() && isShiftedInt<Bits, ShiftLeftAmount>(getConstantImm()))
+ if (isConstantImm() &&
+ isShiftedInt<Bits, ShiftLeftAmount>(getConstantImm()))
return true;
- // Operand can also be a symbol or symbol plus offset in case of relocations.
+ // Operand can also be a symbol or symbol plus
+ // offset in case of relocations.
if (Kind != k_Immediate)
return false;
MCValue Res;
@@ -1405,10 +1439,6 @@ public:
return StringRef(Tok.Data, Tok.Length);
}
- bool isRegPair() const {
- return Kind == k_RegPair && RegIdx.Index <= 30;
- }
-
unsigned getReg() const override {
// As a special case until we sort out the definition of div/divu, accept
// $0/$zero here so that MCK_ZERO works correctly.
@@ -1471,7 +1501,7 @@ public:
static std::unique_ptr<MipsOperand>
createNumericReg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo,
SMLoc S, SMLoc E, MipsAsmParser &Parser) {
- DEBUG(dbgs() << "createNumericReg(" << Index << ", ...)\n");
+ LLVM_DEBUG(dbgs() << "createNumericReg(" << Index << ", ...)\n");
return CreateReg(Index, Str, RegKind_Numeric, RegInfo, S, E, Parser);
}
@@ -2034,7 +2064,8 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
// FIXME: Add support for forward-declared local symbols.
// FIXME: Add expansion for when the LargeGOT option is enabled.
if (JalSym->isInSection() || JalSym->isTemporary() ||
- (JalSym->isELF() && cast<MCSymbolELF>(JalSym)->getBinding() == ELF::STB_LOCAL)) {
+ (JalSym->isELF() &&
+ cast<MCSymbolELF>(JalSym)->getBinding() == ELF::STB_LOCAL)) {
if (isABI_O32()) {
// If it's a local symbol and the O32 ABI is being used, we expand to:
// lw $25, 0($gp)
@@ -2102,10 +2133,10 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
(OpInfo.OperandType == MCOI::OPERAND_UNKNOWN)) {
MCOperand &Op = Inst.getOperand(i);
if (Op.isImm()) {
- int MemOffset = Op.getImm();
+ int64_t MemOffset = Op.getImm();
if (MemOffset < -32768 || MemOffset > 32767) {
// Offset can't exceed 16bit value.
- expandMemInst(Inst, IDLoc, Out, STI, MCID.mayLoad(), true);
+ expandMemInst(Inst, IDLoc, Out, STI, MCID.mayLoad());
return getParser().hasPendingError();
}
} else if (Op.isExpr()) {
@@ -2115,11 +2146,11 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
static_cast<const MCSymbolRefExpr *>(Expr);
if (SR->getKind() == MCSymbolRefExpr::VK_None) {
// Expand symbol.
- expandMemInst(Inst, IDLoc, Out, STI, MCID.mayLoad(), false);
+ expandMemInst(Inst, IDLoc, Out, STI, MCID.mayLoad());
return getParser().hasPendingError();
}
} else if (!isEvaluated(Expr)) {
- expandMemInst(Inst, IDLoc, Out, STI, MCID.mayLoad(), false);
+ expandMemInst(Inst, IDLoc, Out, STI, MCID.mayLoad());
return getParser().hasPendingError();
}
}
@@ -2128,7 +2159,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
} // if load/store
if (inMicroMipsMode()) {
- if (MCID.mayLoad()) {
+ if (MCID.mayLoad() && Inst.getOpcode() != Mips::LWP_MM) {
// Try to create 16-bit GP relative load instruction.
for (unsigned i = 0; i < MCID.getNumOperands(); i++) {
const MCOperandInfo &OpInfo = MCID.OpInfo[i];
@@ -2245,13 +2276,18 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
return Error(IDLoc, "immediate operand value out of range");
break;
case Mips::ADDIUPC_MM:
- MCOperand Opnd = Inst.getOperand(1);
+ Opnd = Inst.getOperand(1);
if (!Opnd.isImm())
return Error(IDLoc, "expected immediate operand kind");
- int Imm = Opnd.getImm();
+ Imm = Opnd.getImm();
if ((Imm % 4 != 0) || !isInt<25>(Imm))
return Error(IDLoc, "immediate operand value out of range");
break;
+ case Mips::LWP_MM:
+ case Mips::SWP_MM:
+ if (Inst.getOperand(0).getReg() == Mips::RA)
+ return Error(IDLoc, "invalid operand for instruction");
+ break;
}
}
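
A hedged aside on the ADDIUPC_MM change above: turning the declarations of Opnd and Imm into plain assignments suggests they are now declared before the switch (not visible in this hunk), which matters because C++ forbids jumping over an initialised declaration into a later case label, such as the newly added LWP_MM/SWP_MM cases. The sketch below illustrates that language rule under this assumption; it is not the patch's stated rationale.

  #include <cassert>

  static int classify(int Kind) {
    int Imm = 0; // hoisted declaration, analogous to Opnd/Imm above
    switch (Kind) {
    case 0:
      Imm = 42;  // a plain assignment under the label is always fine
      return Imm;
    case 1:      // a later case is reachable without crossing an init
      return Imm;
    }
    return -1;
  }

  int main() {
    assert(classify(0) == 42);
    assert(classify(1) == 0);
    assert(classify(7) == -1);
    return 0;
  }
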
@@ -2392,20 +2428,28 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
return expandCondBranches(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::SDivMacro:
case Mips::SDivIMacro:
- return expandDiv(Inst, IDLoc, Out, STI, false, true) ? MER_Fail
- : MER_Success;
+ case Mips::SRemMacro:
+ case Mips::SRemIMacro:
+ return expandDivRem(Inst, IDLoc, Out, STI, false, true) ? MER_Fail
+ : MER_Success;
case Mips::DSDivMacro:
case Mips::DSDivIMacro:
- return expandDiv(Inst, IDLoc, Out, STI, true, true) ? MER_Fail
- : MER_Success;
+ case Mips::DSRemMacro:
+ case Mips::DSRemIMacro:
+ return expandDivRem(Inst, IDLoc, Out, STI, true, true) ? MER_Fail
+ : MER_Success;
case Mips::UDivMacro:
case Mips::UDivIMacro:
- return expandDiv(Inst, IDLoc, Out, STI, false, false) ? MER_Fail
- : MER_Success;
+ case Mips::URemMacro:
+ case Mips::URemIMacro:
+ return expandDivRem(Inst, IDLoc, Out, STI, false, false) ? MER_Fail
+ : MER_Success;
case Mips::DUDivMacro:
case Mips::DUDivIMacro:
- return expandDiv(Inst, IDLoc, Out, STI, true, false) ? MER_Fail
- : MER_Success;
+ case Mips::DURemMacro:
+ case Mips::DURemIMacro:
+ return expandDivRem(Inst, IDLoc, Out, STI, true, false) ? MER_Fail
+ : MER_Success;
case Mips::PseudoTRUNC_W_S:
return expandTrunc(Inst, false, false, IDLoc, Out, STI) ? MER_Fail
: MER_Success;
@@ -3522,21 +3566,17 @@ bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
}
void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
- const MCSubtargetInfo *STI, bool IsLoad,
- bool IsImmOpnd) {
- if (IsLoad) {
- expandLoadInst(Inst, IDLoc, Out, STI, IsImmOpnd);
- return;
- }
- expandStoreInst(Inst, IDLoc, Out, STI, IsImmOpnd);
-}
+ const MCSubtargetInfo *STI, bool IsLoad) {
+ const MCOperand &DstRegOp = Inst.getOperand(0);
+ assert(DstRegOp.isReg() && "expected register operand kind");
+ const MCOperand &BaseRegOp = Inst.getOperand(1);
+ assert(BaseRegOp.isReg() && "expected register operand kind");
+ const MCOperand &OffsetOp = Inst.getOperand(2);
-void MipsAsmParser::expandLoadInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
- const MCSubtargetInfo *STI, bool IsImmOpnd) {
MipsTargetStreamer &TOut = getTargetStreamer();
-
- unsigned DstReg = Inst.getOperand(0).getReg();
- unsigned BaseReg = Inst.getOperand(1).getReg();
+ unsigned DstReg = DstRegOp.getReg();
+ unsigned BaseReg = BaseRegOp.getReg();
+ unsigned TmpReg = DstReg;
const MCInstrDesc &Desc = getInstDesc(Inst.getOpcode());
int16_t DstRegClass = Desc.OpInfo[0].RegClass;
@@ -3545,75 +3585,51 @@ void MipsAsmParser::expandLoadInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
bool IsGPR = (DstRegClassID == Mips::GPR32RegClassID) ||
(DstRegClassID == Mips::GPR64RegClassID);
- if (IsImmOpnd) {
- // Try to use DstReg as the temporary.
- if (IsGPR && (BaseReg != DstReg)) {
- TOut.emitLoadWithImmOffset(Inst.getOpcode(), DstReg, BaseReg,
- Inst.getOperand(2).getImm(), DstReg, IDLoc,
- STI);
- return;
- }
-
- // At this point we need AT to perform the expansions and we exit if it is
- // not available.
- unsigned ATReg = getATReg(IDLoc);
- if (!ATReg)
+ if (!IsLoad || !IsGPR || (BaseReg == DstReg)) {
+ // At this point we need AT to perform the expansions
+ // and we exit if it is not available.
+ TmpReg = getATReg(IDLoc);
+ if (!TmpReg)
return;
-
- TOut.emitLoadWithImmOffset(Inst.getOpcode(), DstReg, BaseReg,
- Inst.getOperand(2).getImm(), ATReg, IDLoc, STI);
- return;
}
- const MCExpr *ExprOffset = Inst.getOperand(2).getExpr();
- MCOperand LoOperand = MCOperand::createExpr(
- MipsMCExpr::create(MipsMCExpr::MEK_LO, ExprOffset, getContext()));
- MCOperand HiOperand = MCOperand::createExpr(
- MipsMCExpr::create(MipsMCExpr::MEK_HI, ExprOffset, getContext()));
+ if (OffsetOp.isImm()) {
+ int64_t LoOffset = OffsetOp.getImm() & 0xffff;
+ int64_t HiOffset = OffsetOp.getImm() & ~0xffff;
- // Try to use DstReg as the temporary.
- if (IsGPR && (BaseReg != DstReg)) {
- TOut.emitLoadWithSymOffset(Inst.getOpcode(), DstReg, BaseReg, HiOperand,
- LoOperand, DstReg, IDLoc, STI);
- return;
- }
+ // If the MSB of LoOffset is 1 (i.e. the low part is negative), we must
+ // increment HiOffset to account for the sign-extension of the low part.
+ if (LoOffset & 0x8000)
+ HiOffset += 0x10000;
- // At this point we need AT to perform the expansions and we exit if it is
- // not available.
- unsigned ATReg = getATReg(IDLoc);
- if (!ATReg)
- return;
+ bool IsLargeOffset = HiOffset != 0;
- TOut.emitLoadWithSymOffset(Inst.getOpcode(), DstReg, BaseReg, HiOperand,
- LoOperand, ATReg, IDLoc, STI);
-}
-
-void MipsAsmParser::expandStoreInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
- const MCSubtargetInfo *STI,
- bool IsImmOpnd) {
- MipsTargetStreamer &TOut = getTargetStreamer();
-
- unsigned SrcReg = Inst.getOperand(0).getReg();
- unsigned BaseReg = Inst.getOperand(1).getReg();
+ if (IsLargeOffset) {
+ bool Is32BitImm = (HiOffset >> 32) == 0;
+ if (loadImmediate(HiOffset, TmpReg, Mips::NoRegister, Is32BitImm, true,
+ IDLoc, Out, STI))
+ return;
+ }
- if (IsImmOpnd) {
- TOut.emitStoreWithImmOffset(Inst.getOpcode(), SrcReg, BaseReg,
- Inst.getOperand(2).getImm(),
- [&]() { return getATReg(IDLoc); }, IDLoc, STI);
- return;
+ if (BaseReg != Mips::ZERO && BaseReg != Mips::ZERO_64)
+ TOut.emitRRR(isGP64bit() ? Mips::DADDu : Mips::ADDu, TmpReg, TmpReg,
+ BaseReg, IDLoc, STI);
+ TOut.emitRRI(Inst.getOpcode(), DstReg, TmpReg, LoOffset, IDLoc, STI);
+ } else {
+ assert(OffsetOp.isExpr() && "expected expression operand kind");
+ const MCExpr *ExprOffset = OffsetOp.getExpr();
+ MCOperand LoOperand = MCOperand::createExpr(
+ MipsMCExpr::create(MipsMCExpr::MEK_LO, ExprOffset, getContext()));
+ MCOperand HiOperand = MCOperand::createExpr(
+ MipsMCExpr::create(MipsMCExpr::MEK_HI, ExprOffset, getContext()));
+
+ if (IsLoad)
+ TOut.emitLoadWithSymOffset(Inst.getOpcode(), DstReg, BaseReg, HiOperand,
+ LoOperand, TmpReg, IDLoc, STI);
+ else
+ TOut.emitStoreWithSymOffset(Inst.getOpcode(), DstReg, BaseReg, HiOperand,
+ LoOperand, TmpReg, IDLoc, STI);
}
-
- unsigned ATReg = getATReg(IDLoc);
- if (!ATReg)
- return;
-
- const MCExpr *ExprOffset = Inst.getOperand(2).getExpr();
- MCOperand LoOperand = MCOperand::createExpr(
- MipsMCExpr::create(MipsMCExpr::MEK_LO, ExprOffset, getContext()));
- MCOperand HiOperand = MCOperand::createExpr(
- MipsMCExpr::create(MipsMCExpr::MEK_HI, ExprOffset, getContext()));
- TOut.emitStoreWithSymOffset(Inst.getOpcode(), SrcReg, BaseReg, HiOperand,
- LoOperand, ATReg, IDLoc, STI);
}
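
The immediate-offset path above splits the offset into a low 16-bit half and a high remainder, and bumps the high half by 0x10000 whenever bit 15 of the low half is set, because the hardware sign-extends the 16-bit offset at execution time. A small self-contained check of that arithmetic (standalone, using no LLVM helpers):

  #include <cassert>
  #include <cstdint>

  int main() {
    int64_t Offset = 0x12348000;   // bit 15 of the low half is set
    int64_t Lo = Offset & 0xffff;  // 0x8000
    int64_t Hi = Offset & ~0xffff; // 0x12340000
    if (Lo & 0x8000)               // compensate for sign-extension of Lo
      Hi += 0x10000;               // -> 0x12350000

    // The load/store sign-extends its 16-bit offset, so the high part plus
    // the sign-extended low part must reproduce the original offset.
    int64_t SExtLo = int16_t(Lo);  // -0x8000
    assert(Hi + SExtLo == Offset);
    return 0;
  }
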
bool MipsAsmParser::expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc,
@@ -3734,7 +3750,8 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
case Mips::BLTUL:
AcceptsEquality = false;
ReverseOrderSLT = false;
- IsUnsigned = ((PseudoOpcode == Mips::BLTU) || (PseudoOpcode == Mips::BLTUL));
+ IsUnsigned =
+ ((PseudoOpcode == Mips::BLTU) || (PseudoOpcode == Mips::BLTUL));
IsLikely = ((PseudoOpcode == Mips::BLTL) || (PseudoOpcode == Mips::BLTUL));
ZeroSrcOpcode = Mips::BGTZ;
ZeroTrgOpcode = Mips::BLTZ;
@@ -3745,7 +3762,8 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
case Mips::BLEUL:
AcceptsEquality = true;
ReverseOrderSLT = true;
- IsUnsigned = ((PseudoOpcode == Mips::BLEU) || (PseudoOpcode == Mips::BLEUL));
+ IsUnsigned =
+ ((PseudoOpcode == Mips::BLEU) || (PseudoOpcode == Mips::BLEUL));
IsLikely = ((PseudoOpcode == Mips::BLEL) || (PseudoOpcode == Mips::BLEUL));
ZeroSrcOpcode = Mips::BGEZ;
ZeroTrgOpcode = Mips::BLEZ;
@@ -3756,7 +3774,8 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
case Mips::BGEUL:
AcceptsEquality = true;
ReverseOrderSLT = false;
- IsUnsigned = ((PseudoOpcode == Mips::BGEU) || (PseudoOpcode == Mips::BGEUL));
+ IsUnsigned =
+ ((PseudoOpcode == Mips::BGEU) || (PseudoOpcode == Mips::BGEUL));
IsLikely = ((PseudoOpcode == Mips::BGEL) || (PseudoOpcode == Mips::BGEUL));
ZeroSrcOpcode = Mips::BLEZ;
ZeroTrgOpcode = Mips::BGEZ;
@@ -3767,7 +3786,8 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
case Mips::BGTUL:
AcceptsEquality = false;
ReverseOrderSLT = true;
- IsUnsigned = ((PseudoOpcode == Mips::BGTU) || (PseudoOpcode == Mips::BGTUL));
+ IsUnsigned =
+ ((PseudoOpcode == Mips::BGTU) || (PseudoOpcode == Mips::BGTUL));
IsLikely = ((PseudoOpcode == Mips::BGTL) || (PseudoOpcode == Mips::BGTUL));
ZeroSrcOpcode = Mips::BLTZ;
ZeroTrgOpcode = Mips::BGTZ;
@@ -3885,7 +3905,7 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
// This is accomplished by using a BNEZ with the result of the SLT.
//
// The other 2 pseudo-branches are opposites of the above 2 (BGE with BLT
- // and BLE with BGT), so we change the BNEZ into a a BEQZ.
+ // and BLE with BGT), so we change the BNEZ into a BEQZ.
// Because only BGE and BLE branch on equality, we can use the
// AcceptsEquality variable to decide when to emit the BEQZ.
// Note that the order of the SLT arguments doesn't change between
@@ -3912,9 +3932,9 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
// The destination register can only be $zero when expanding (S)DivIMacro or
// D(S)DivMacro.
-bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
- const MCSubtargetInfo *STI, const bool IsMips64,
- const bool Signed) {
+bool MipsAsmParser::expandDivRem(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI, const bool IsMips64,
+ const bool Signed) {
MipsTargetStreamer &TOut = getTargetStreamer();
warnIfNoMacro(IDLoc);
@@ -3954,6 +3974,17 @@ bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
bool UseTraps = useTraps();
+ unsigned Opcode = Inst.getOpcode();
+ bool isDiv = Opcode == Mips::SDivMacro || Opcode == Mips::SDivIMacro ||
+ Opcode == Mips::UDivMacro || Opcode == Mips::UDivIMacro ||
+ Opcode == Mips::DSDivMacro || Opcode == Mips::DSDivIMacro ||
+ Opcode == Mips::DUDivMacro || Opcode == Mips::DUDivIMacro;
+
+ bool isRem = Opcode == Mips::SRemMacro || Opcode == Mips::SRemIMacro ||
+ Opcode == Mips::URemMacro || Opcode == Mips::URemIMacro ||
+ Opcode == Mips::DSRemMacro || Opcode == Mips::DSRemIMacro ||
+ Opcode == Mips::DURemMacro || Opcode == Mips::DURemIMacro;
+
if (RtOp.isImm()) {
unsigned ATReg = getATReg(IDLoc);
if (!ATReg)
@@ -3967,10 +3998,13 @@ bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
return false;
}
- if (ImmValue == 1) {
+ if (isRem && (ImmValue == 1 || (Signed && (ImmValue == -1)))) {
+ TOut.emitRRR(Mips::OR, RdReg, ZeroReg, ZeroReg, IDLoc, STI);
+ return false;
+ } else if (isDiv && ImmValue == 1) {
TOut.emitRRR(Mips::OR, RdReg, RsReg, Mips::ZERO, IDLoc, STI);
return false;
- } else if (Signed && ImmValue == -1) {
+ } else if (isDiv && Signed && ImmValue == -1) {
TOut.emitRRR(SubOp, RdReg, ZeroReg, RsReg, IDLoc, STI);
return false;
} else {
@@ -3978,16 +4012,16 @@ bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
false, Inst.getLoc(), Out, STI))
return true;
TOut.emitRR(DivOp, RsReg, ATReg, IDLoc, STI);
- TOut.emitR(Mips::MFLO, RdReg, IDLoc, STI);
+ TOut.emitR(isDiv ? Mips::MFLO : Mips::MFHI, RdReg, IDLoc, STI);
return false;
}
return true;
}
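
The MFLO/MFHI selection above works because a MIPS div/divu writes the quotient to LO and the remainder to HI, so the rem macros reuse the whole divide expansion and only change the final move. A tiny emulation of that split, with purely local names, for illustration:

  #include <cassert>
  #include <cstdint>

  // LO holds the quotient and HI the remainder after a MIPS divide.
  struct HiLo { int32_t Lo; int32_t Hi; };

  static HiLo mipsDiv(int32_t Rs, int32_t Rt) {
    return {Rs / Rt, Rs % Rt};
  }

  int main() {
    HiLo R = mipsDiv(17, 5);
    assert(R.Lo == 3); // what "div $d, $s, $t" copies out with mflo
    assert(R.Hi == 2); // what "rem $d, $s, $t" copies out with mfhi
    return 0;
  }
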
- // If the macro expansion of (d)div(u) would always trap or break, insert
- // the trap/break and exit. This gives a different result to GAS. GAS has
- // an inconsistency/missed optimization in that not all cases are handled
- // equivalently. As the observed behaviour is the same, we're ok.
+ // If the macro expansion of (d)div(u) or (d)rem(u) would always trap or
+ // break, insert the trap/break and exit. This gives a different result to
+ // GAS. GAS has an inconsistency/missed optimization in that not all cases
+ // are handled equivalently. As the observed behaviour is the same, we're ok.
if (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64) {
if (UseTraps) {
TOut.emitRRI(Mips::TEQ, ZeroReg, ZeroReg, 0x7, IDLoc, STI);
@@ -3997,6 +4031,13 @@ bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
return false;
}
+ // (d)rem(u) $0, $X, $Y is a special case. Like div $zero, $X, $Y, it does
+ // not expand to a macro sequence.
+ if (isRem && (RdReg == Mips::ZERO || RdReg == Mips::ZERO_64)) {
+ TOut.emitRR(DivOp, RsReg, RtReg, IDLoc, STI);
+ return false;
+ }
+
+ // Temporary label for the first branch target
MCContext &Context = TOut.getStreamer().getContext();
MCSymbol *BrTarget;
@@ -4020,7 +4061,7 @@ bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
if (!UseTraps)
TOut.getStreamer().EmitLabel(BrTarget);
- TOut.emitR(Mips::MFLO, RdReg, IDLoc, STI);
+ TOut.emitR(isDiv ? Mips::MFLO : Mips::MFHI, RdReg, IDLoc, STI);
return false;
}
@@ -4043,7 +4084,7 @@ bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
if (IsMips64) {
TOut.emitRRI(Mips::ADDiu, ATReg, ZeroReg, 1, IDLoc, STI);
- TOut.emitRRI(Mips::DSLL32, ATReg, ATReg, 0x1f, IDLoc, STI);
+ TOut.emitDSLL(ATReg, ATReg, 63, IDLoc, STI);
} else {
TOut.emitRI(Mips::LUi, ATReg, (uint16_t)0x8000, IDLoc, STI);
}
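
For reference, emitDSLL(ATReg, ATReg, 63) produces the same value as the old DSLL32-by-0x1f form: shifting 1 left by 63 bits yields the INT64_MIN bit pattern, and the 32-bit branch's lui 0x8000 likewise yields INT32_MIN; reading this as the constant for the signed-overflow check is an inference from context. A quick standalone check of the arithmetic:

  #include <cassert>
  #include <cstdint>

  int main() {
    // addiu $at, $zero, 1 ; dsll $at, $at, 63   (64-bit path)
    uint64_t At = 1;
    At <<= 63;
    assert(At == 0x8000000000000000ULL);
    assert((int64_t)At == INT64_MIN);

    // lui $at, 0x8000                            (32-bit path)
    uint32_t At32 = 0x8000u << 16;
    assert((int32_t)At32 == INT32_MIN);
    return 0;
  }
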
@@ -4053,12 +4094,12 @@ bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
else {
// Branch to the mflo instruction.
TOut.emitRRX(Mips::BNE, RsReg, ATReg, LabelOpEnd, IDLoc, STI);
- TOut.emitRRI(Mips::SLL, ZeroReg, ZeroReg, 0, IDLoc, STI);
+ TOut.emitNop(IDLoc, STI);
TOut.emitII(Mips::BREAK, 0x6, 0, IDLoc, STI);
}
TOut.getStreamer().EmitLabel(BrTargetEnd);
- TOut.emitR(Mips::MFLO, RdReg, IDLoc, STI);
+ TOut.emitR(isDiv ? Mips::MFLO : Mips::MFHI, RdReg, IDLoc, STI);
return false;
}
@@ -4287,7 +4328,8 @@ bool MipsAsmParser::expandAliasImmediate(MCInst &Inst, SMLoc IDLoc,
DstReg = ATReg;
}
- if (!loadImmediate(ImmValue, DstReg, Mips::NoRegister, Is32Bit, false, Inst.getLoc(), Out, STI)) {
+ if (!loadImmediate(ImmValue, DstReg, Mips::NoRegister, Is32Bit, false,
+ Inst.getLoc(), Out, STI)) {
switch (FinalOpcode) {
default:
llvm_unreachable("unimplemented expansion");
@@ -4675,7 +4717,8 @@ bool MipsAsmParser::expandMulImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
if (!ATReg)
return true;
- loadImmediate(ImmValue, ATReg, Mips::NoRegister, true, false, IDLoc, Out, STI);
+ loadImmediate(ImmValue, ATReg, Mips::NoRegister, true, false, IDLoc, Out,
+ STI);
TOut.emitRR(Inst.getOpcode() == Mips::MULImmMacro ? Mips::MULT : Mips::DMULT,
SrcReg, ATReg, IDLoc, STI);
@@ -5136,13 +5179,13 @@ unsigned MipsAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
// It also applies for registers Rt and Rs of microMIPSr6 jalrc.hb instruction
// and registers Rd and Base for microMIPS lwp instruction
case Mips::JALR_HB:
+ case Mips::JALR_HB64:
case Mips::JALRC_HB_MMR6:
case Mips::JALRC_MMR6:
if (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg())
return Match_RequiresDifferentSrcAndDst;
return Match_Success;
case Mips::LWP_MM:
- case Mips::LWP_MMR6:
if (Inst.getOperand(0).getReg() == Inst.getOperand(2).getReg())
return Match_RequiresDifferentSrcAndDst;
return Match_Success;
@@ -5150,6 +5193,13 @@ unsigned MipsAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
if (Inst.getOperand(0).getImm() != 0 && !hasMips32())
return Match_NonZeroOperandForSync;
return Match_Success;
+ case Mips::MFC0:
+ case Mips::MTC0:
+ case Mips::MTC2:
+ case Mips::MFC2:
+ if (Inst.getOperand(2).getImm() != 0 && !hasMips32())
+ return Match_NonZeroOperandForMTCX;
+ return Match_Success;
// As described the MIPSR6 spec, the compact branches that compare registers
// must:
// a) Not use the zero register.
@@ -5237,6 +5287,13 @@ unsigned MipsAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
return Match_RequiresPosSizeRange33_64;
return Match_Success;
}
+ case Mips::CRC32B: case Mips::CRC32CB:
+ case Mips::CRC32H: case Mips::CRC32CH:
+ case Mips::CRC32W: case Mips::CRC32CW:
+ case Mips::CRC32D: case Mips::CRC32CD:
+ if (Inst.getOperand(0).getReg() != Inst.getOperand(2).getReg())
+ return Match_RequiresSameSrcAndDst;
+ return Match_Success;
}
uint64_t TSFlags = getInstDesc(Inst.getOpcode()).TSFlags;
@@ -5290,7 +5347,10 @@ bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return Error(ErrorLoc, "invalid operand for instruction");
}
case Match_NonZeroOperandForSync:
- return Error(IDLoc, "s-type must be zero or unspecified for pre-MIPS32 ISAs");
+ return Error(IDLoc,
+ "s-type must be zero or unspecified for pre-MIPS32 ISAs");
+ case Match_NonZeroOperandForMTCX:
+ return Error(IDLoc, "selector must be zero for pre-MIPS32 ISAs");
case Match_MnemonicFail:
return Error(IDLoc, "invalid instruction");
case Match_RequiresDifferentSrcAndDst:
@@ -5429,6 +5489,9 @@ bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
case Match_MemSImm16:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected memory with 16-bit signed offset");
+ case Match_MemSImmPtr:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected memory with 32-bit signed offset");
case Match_RequiresPosSizeRange0_32: {
SMLoc ErrorStart = Operands[3]->getStartLoc();
SMLoc ErrorEnd = Operands[4]->getEndLoc();
@@ -5463,6 +5526,17 @@ void MipsAsmParser::warnIfNoMacro(SMLoc Loc) {
Warning(Loc, "macro instruction expanded into multiple instructions");
}
+void MipsAsmParser::ConvertXWPOperands(MCInst &Inst,
+ const OperandVector &Operands) {
+ assert(
+ (Inst.getOpcode() == Mips::LWP_MM || Inst.getOpcode() == Mips::SWP_MM) &&
+ "Unexpected instruction!");
+ ((MipsOperand &)*Operands[1]).addGPR32ZeroAsmRegOperands(Inst, 1);
+ int NextReg = nextReg(((MipsOperand &)*Operands[1]).getGPR32Reg());
+ Inst.addOperand(MCOperand::createReg(NextReg));
+ ((MipsOperand &)*Operands[2]).addMemOperands(Inst, 2);
+}
+
void
MipsAsmParser::printWarningWithFixIt(const Twine &Msg, const Twine &FixMsg,
SMRange Range, bool ShowColors) {
@@ -5653,7 +5727,7 @@ unsigned MipsAsmParser::getReg(int RC, int RegNo) {
bool MipsAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
MCAsmParser &Parser = getParser();
- DEBUG(dbgs() << "parseOperand\n");
+ LLVM_DEBUG(dbgs() << "parseOperand\n");
// Check if the current operand has a custom associated parser, if so, try to
// custom parse the operand, or fallback to the general approach.
@@ -5666,7 +5740,7 @@ bool MipsAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
if (ResTy == MatchOperand_ParseFail)
return true;
- DEBUG(dbgs() << ".. Generic Parser\n");
+ LLVM_DEBUG(dbgs() << ".. Generic Parser\n");
switch (getLexer().getKind()) {
case AsmToken::Dollar: {
@@ -5696,7 +5770,7 @@ bool MipsAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
return false;
}
default: {
- DEBUG(dbgs() << ".. generic integer expression\n");
+ LLVM_DEBUG(dbgs() << ".. generic integer expression\n");
const MCExpr *Expr;
SMLoc S = Parser.getTok().getLoc(); // Start location of the operand.
@@ -5769,7 +5843,7 @@ bool MipsAsmParser::parseMemOffset(const MCExpr *&Res, bool isParenExpr) {
OperandMatchResultTy
MipsAsmParser::parseMemOperand(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
- DEBUG(dbgs() << "parseMemOperand\n");
+ LLVM_DEBUG(dbgs() << "parseMemOperand\n");
const MCExpr *IdVal = nullptr;
SMLoc S;
bool isParenExpr = false;
@@ -5905,13 +5979,12 @@ MipsAsmParser::parseMemOperand(OperandVector &Operands) {
bool MipsAsmParser::searchSymbolAlias(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
MCSymbol *Sym = getContext().lookupSymbol(Parser.getTok().getIdentifier());
- if (Sym) {
- SMLoc S = Parser.getTok().getLoc();
- const MCExpr *Expr;
- if (Sym->isVariable())
- Expr = Sym->getVariableValue();
- else
- return false;
+ if (!Sym)
+ return false;
+
+ SMLoc S = Parser.getTok().getLoc();
+ if (Sym->isVariable()) {
+ const MCExpr *Expr = Sym->getVariableValue();
if (Expr->getKind() == MCExpr::SymbolRef) {
const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr *>(Expr);
StringRef DefSymbol = Ref->getSymbol().getName();
@@ -5921,12 +5994,26 @@ bool MipsAsmParser::searchSymbolAlias(OperandVector &Operands) {
if (ResTy == MatchOperand_Success) {
Parser.Lex();
return true;
- } else if (ResTy == MatchOperand_ParseFail)
+ }
+ if (ResTy == MatchOperand_ParseFail)
llvm_unreachable("Should never ParseFail");
- return false;
+ }
+ }
+ } else if (Sym->isUnset()) {
+ // If symbol is unset, it might be created in the `parseSetAssignment`
+ // routine as an alias for a numeric register name.
+ // Look it up in the aliases list.
+ auto Entry = RegisterSets.find(Sym->getName());
+ if (Entry != RegisterSets.end()) {
+ OperandMatchResultTy ResTy =
+ matchAnyRegisterWithoutDollar(Operands, Entry->getValue(), S);
+ if (ResTy == MatchOperand_Success) {
+ Parser.Lex();
+ return true;
}
}
}
+
return false;
}
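
The new branch above resolves a bare identifier through the RegisterSets table filled in by ".set <name>,$<n>" before giving up on the operand. A minimal model of that alias table follows; the map type and names are illustrative assumptions, not the parser's actual data structures.

  #include <cassert>
  #include <map>
  #include <string>

  int main() {
    // ".set r1,$1" records the register number under the alias name...
    std::map<std::string, unsigned> RegisterAliases;
    RegisterAliases["r1"] = 1;

    // ...and a later bare use of "r1" as an operand resolves via the table.
    auto It = RegisterAliases.find("r1");
    assert(It != RegisterAliases.end());
    assert(It->second == 1);

    // Unknown identifiers fall through to ordinary symbol handling.
    assert(RegisterAliases.find("no_such_alias") == RegisterAliases.end());
    return 0;
  }
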
@@ -5994,48 +6081,59 @@ MipsAsmParser::matchAnyRegisterNameWithoutDollar(OperandVector &Operands,
}
OperandMatchResultTy
-MipsAsmParser::matchAnyRegisterWithoutDollar(OperandVector &Operands, SMLoc S) {
- MCAsmParser &Parser = getParser();
- auto Token = Parser.getLexer().peekTok(false);
-
+MipsAsmParser::matchAnyRegisterWithoutDollar(OperandVector &Operands,
+ const AsmToken &Token, SMLoc S) {
if (Token.is(AsmToken::Identifier)) {
- DEBUG(dbgs() << ".. identifier\n");
+ LLVM_DEBUG(dbgs() << ".. identifier\n");
StringRef Identifier = Token.getIdentifier();
OperandMatchResultTy ResTy =
matchAnyRegisterNameWithoutDollar(Operands, Identifier, S);
return ResTy;
} else if (Token.is(AsmToken::Integer)) {
- DEBUG(dbgs() << ".. integer\n");
+ LLVM_DEBUG(dbgs() << ".. integer\n");
+ int64_t RegNum = Token.getIntVal();
+ if (RegNum < 0 || RegNum > 31) {
+ // Show the error, but treat the invalid register number as a normal
+ // one so that parsing can continue and catch other possible errors.
+ Error(getLexer().getLoc(), "invalid register number");
+ }
Operands.push_back(MipsOperand::createNumericReg(
- Token.getIntVal(), Token.getString(), getContext().getRegisterInfo(), S,
+ RegNum, Token.getString(), getContext().getRegisterInfo(), S,
Token.getLoc(), *this));
return MatchOperand_Success;
}
- DEBUG(dbgs() << Parser.getTok().getKind() << "\n");
+ LLVM_DEBUG(dbgs() << Token.getKind() << "\n");
return MatchOperand_NoMatch;
}
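
The integer path above now validates the register number but deliberately keeps parsing after reporting the error, so later operands can still be diagnosed. The accepted range is simply the 32 general-purpose registers; a trivial sketch of that check, with a local helper name chosen for illustration:

  #include <cassert>
  #include <cstdint>

  // $0..$31 are the only valid numeric GPR names; anything else triggers
  // the "invalid register number" diagnostic while parsing continues.
  static bool isValidNumericGPR(int64_t N) { return N >= 0 && N <= 31; }

  int main() {
    assert(isValidNumericGPR(0));
    assert(isValidNumericGPR(31));
    assert(!isValidNumericGPR(32));
    assert(!isValidNumericGPR(-1));
    return 0;
  }
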
OperandMatchResultTy
+MipsAsmParser::matchAnyRegisterWithoutDollar(OperandVector &Operands, SMLoc S) {
+ auto Token = getLexer().peekTok(false);
+ return matchAnyRegisterWithoutDollar(Operands, Token, S);
+}
+
+OperandMatchResultTy
MipsAsmParser::parseAnyRegister(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
- DEBUG(dbgs() << "parseAnyRegister\n");
+ LLVM_DEBUG(dbgs() << "parseAnyRegister\n");
auto Token = Parser.getTok();
SMLoc S = Token.getLoc();
if (Token.isNot(AsmToken::Dollar)) {
- DEBUG(dbgs() << ".. !$ -> try sym aliasing\n");
+ LLVM_DEBUG(dbgs() << ".. !$ -> try sym aliasing\n");
if (Token.is(AsmToken::Identifier)) {
if (searchSymbolAlias(Operands))
return MatchOperand_Success;
}
- DEBUG(dbgs() << ".. !symalias -> NoMatch\n");
+ LLVM_DEBUG(dbgs() << ".. !symalias -> NoMatch\n");
return MatchOperand_NoMatch;
}
- DEBUG(dbgs() << ".. $\n");
+ LLVM_DEBUG(dbgs() << ".. $\n");
OperandMatchResultTy ResTy = matchAnyRegisterWithoutDollar(Operands, S);
if (ResTy == MatchOperand_Success) {
@@ -6048,7 +6146,7 @@ MipsAsmParser::parseAnyRegister(OperandVector &Operands) {
OperandMatchResultTy
MipsAsmParser::parseJumpTarget(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
- DEBUG(dbgs() << "parseJumpTarget\n");
+ LLVM_DEBUG(dbgs() << "parseJumpTarget\n");
SMLoc S = getLexer().getLoc();
@@ -6181,22 +6279,6 @@ MipsAsmParser::parseRegisterList(OperandVector &Operands) {
}
OperandMatchResultTy
-MipsAsmParser::parseRegisterPair(OperandVector &Operands) {
- MCAsmParser &Parser = getParser();
-
- SMLoc S = Parser.getTok().getLoc();
- if (parseAnyRegister(Operands) != MatchOperand_Success)
- return MatchOperand_ParseFail;
-
- SMLoc E = Parser.getTok().getLoc();
- MipsOperand Op = static_cast<MipsOperand &>(*Operands.back());
-
- Operands.pop_back();
- Operands.push_back(MipsOperand::CreateRegPair(Op, S, E, *this));
- return MatchOperand_Success;
-}
-
-OperandMatchResultTy
MipsAsmParser::parseMovePRegPair(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
SmallVector<std::unique_ptr<MCParsedAsmOperand>, 8> TmpOperands;
@@ -6292,7 +6374,7 @@ bool MipsAsmParser::parseBracketSuffix(StringRef Name,
bool MipsAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) {
MCAsmParser &Parser = getParser();
- DEBUG(dbgs() << "ParseInstruction\n");
+ LLVM_DEBUG(dbgs() << "ParseInstruction\n");
// We have reached the first instruction; module directives are now forbidden.
getTargetStreamer().forbidModuleDirective();
@@ -6654,6 +6736,57 @@ bool MipsAsmParser::parseSetNoMtDirective() {
return false;
}
+bool MipsAsmParser::parseSetNoCRCDirective() {
+ MCAsmParser &Parser = getParser();
+ Parser.Lex(); // Eat "nocrc".
+
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+
+ clearFeatureBits(Mips::FeatureCRC, "crc");
+
+ getTargetStreamer().emitDirectiveSetNoCRC();
+ Parser.Lex(); // Consume the EndOfStatement.
+ return false;
+}
+
+bool MipsAsmParser::parseSetNoVirtDirective() {
+ MCAsmParser &Parser = getParser();
+ Parser.Lex(); // Eat "novirt".
+
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+
+ clearFeatureBits(Mips::FeatureVirt, "virt");
+
+ getTargetStreamer().emitDirectiveSetNoVirt();
+ Parser.Lex(); // Consume the EndOfStatement.
+ return false;
+}
+
+bool MipsAsmParser::parseSetNoGINVDirective() {
+ MCAsmParser &Parser = getParser();
+ Parser.Lex(); // Eat "noginv".
+
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+
+ clearFeatureBits(Mips::FeatureGINV, "ginv");
+
+ getTargetStreamer().emitDirectiveSetNoGINV();
+ Parser.Lex(); // Consume the EndOfStatement.
+ return false;
+}
+
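
Each of these ".set no*" handlers follows the same shape: eat the keyword, require end of statement, clear one subtarget feature bit, and emit the matching streamer directive. The sketch below models only the feature-bit toggling; the bit positions are made up for illustration and are not the real values from Mips.td.

  #include <cassert>
  #include <cstdint>

  int main() {
    // Hypothetical bit assignments, purely for illustration.
    enum : uint64_t {
      FeatureCRC  = 1ull << 0,
      FeatureVirt = 1ull << 1,
      FeatureGINV = 1ull << 2
    };
    uint64_t Features = 0;

    Features |= FeatureCRC;            // ".set crc"
    assert(Features & FeatureCRC);

    Features &= ~FeatureCRC;           // ".set nocrc"
    assert(!(Features & FeatureCRC));
    assert(!(Features & FeatureVirt)); // untouched bits stay clear
    return 0;
  }
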
bool MipsAsmParser::parseSetPopDirective() {
MCAsmParser &Parser = getParser();
SMLoc Loc = getLexer().getLoc();
@@ -6719,17 +6852,30 @@ bool MipsAsmParser::parseSetAssignment() {
MCAsmParser &Parser = getParser();
if (Parser.parseIdentifier(Name))
- reportParseError("expected identifier after .set");
+ return reportParseError("expected identifier after .set");
if (getLexer().isNot(AsmToken::Comma))
return reportParseError("unexpected token, expected comma");
Lex(); // Eat comma
- if (Parser.parseExpression(Value))
+ if (getLexer().is(AsmToken::Dollar) &&
+ getLexer().peekTok().is(AsmToken::Integer)) {
+ // Parse assignment of a numeric register:
+ // .set r1,$1
+ Parser.Lex(); // Eat $.
+ RegisterSets[Name] = Parser.getTok();
+ Parser.Lex(); // Eat identifier.
+ getContext().getOrCreateSymbol(Name);
+ } else if (!Parser.parseExpression(Value)) {
+ // Parse assignment of an expression including
+ // symbolic registers:
+ // .set $tmp, $BB0-$BB1
+ // .set r2, $f2
+ MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
+ Sym->setVariableValue(Value);
+ } else {
return reportParseError("expected valid expression after comma");
-
- MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
- Sym->setVariableValue(Value);
+ }
return false;
}
@@ -6875,6 +7021,18 @@ bool MipsAsmParser::parseSetFeature(uint64_t Feature) {
selectArch("mips64r6");
getTargetStreamer().emitDirectiveSetMips64R6();
break;
+ case Mips::FeatureCRC:
+ setFeatureBits(Mips::FeatureCRC, "crc");
+ getTargetStreamer().emitDirectiveSetCRC();
+ break;
+ case Mips::FeatureVirt:
+ setFeatureBits(Mips::FeatureVirt, "virt");
+ getTargetStreamer().emitDirectiveSetVirt();
+ break;
+ case Mips::FeatureGINV:
+ setFeatureBits(Mips::FeatureGINV, "ginv");
+ getTargetStreamer().emitDirectiveSetGINV();
+ break;
}
return false;
}
@@ -7074,143 +7232,131 @@ bool MipsAsmParser::parseDirectiveNaN() {
}
bool MipsAsmParser::parseDirectiveSet() {
- MCAsmParser &Parser = getParser();
- // Get the next token.
- const AsmToken &Tok = Parser.getTok();
+ const AsmToken &Tok = getParser().getTok();
+ StringRef IdVal = Tok.getString();
+ SMLoc Loc = Tok.getLoc();
- if (Tok.getString() == "noat") {
+ if (IdVal == "noat")
return parseSetNoAtDirective();
- } else if (Tok.getString() == "at") {
+ if (IdVal == "at")
return parseSetAtDirective();
- } else if (Tok.getString() == "arch") {
+ if (IdVal == "arch")
return parseSetArchDirective();
- } else if (Tok.getString() == "bopt") {
- Warning(Tok.getLoc(), "'bopt' feature is unsupported");
+ if (IdVal == "bopt") {
+ Warning(Loc, "'bopt' feature is unsupported");
getParser().Lex();
return false;
- } else if (Tok.getString() == "nobopt") {
+ }
+ if (IdVal == "nobopt") {
// We're already running in nobopt mode, so nothing to do.
getParser().Lex();
return false;
- } else if (Tok.getString() == "fp") {
+ }
+ if (IdVal == "fp")
return parseSetFpDirective();
- } else if (Tok.getString() == "oddspreg") {
+ if (IdVal == "oddspreg")
return parseSetOddSPRegDirective();
- } else if (Tok.getString() == "nooddspreg") {
+ if (IdVal == "nooddspreg")
return parseSetNoOddSPRegDirective();
- } else if (Tok.getString() == "pop") {
+ if (IdVal == "pop")
return parseSetPopDirective();
- } else if (Tok.getString() == "push") {
+ if (IdVal == "push")
return parseSetPushDirective();
- } else if (Tok.getString() == "reorder") {
+ if (IdVal == "reorder")
return parseSetReorderDirective();
- } else if (Tok.getString() == "noreorder") {
+ if (IdVal == "noreorder")
return parseSetNoReorderDirective();
- } else if (Tok.getString() == "macro") {
+ if (IdVal == "macro")
return parseSetMacroDirective();
- } else if (Tok.getString() == "nomacro") {
+ if (IdVal == "nomacro")
return parseSetNoMacroDirective();
- } else if (Tok.getString() == "mips16") {
+ if (IdVal == "mips16")
return parseSetMips16Directive();
- } else if (Tok.getString() == "nomips16") {
+ if (IdVal == "nomips16")
return parseSetNoMips16Directive();
- } else if (Tok.getString() == "nomicromips") {
+ if (IdVal == "nomicromips") {
clearFeatureBits(Mips::FeatureMicroMips, "micromips");
getTargetStreamer().emitDirectiveSetNoMicroMips();
- Parser.eatToEndOfStatement();
+ getParser().eatToEndOfStatement();
return false;
- } else if (Tok.getString() == "micromips") {
+ }
+ if (IdVal == "micromips") {
if (hasMips64r6()) {
- Error(Tok.getLoc(), ".set micromips directive is not supported with MIPS64R6");
+ Error(Loc, ".set micromips directive is not supported with MIPS64R6");
return false;
}
return parseSetFeature(Mips::FeatureMicroMips);
- } else if (Tok.getString() == "mips0") {
+ }
+ if (IdVal == "mips0")
return parseSetMips0Directive();
- } else if (Tok.getString() == "mips1") {
+ if (IdVal == "mips1")
return parseSetFeature(Mips::FeatureMips1);
- } else if (Tok.getString() == "mips2") {
+ if (IdVal == "mips2")
return parseSetFeature(Mips::FeatureMips2);
- } else if (Tok.getString() == "mips3") {
+ if (IdVal == "mips3")
return parseSetFeature(Mips::FeatureMips3);
- } else if (Tok.getString() == "mips4") {
+ if (IdVal == "mips4")
return parseSetFeature(Mips::FeatureMips4);
- } else if (Tok.getString() == "mips5") {
+ if (IdVal == "mips5")
return parseSetFeature(Mips::FeatureMips5);
- } else if (Tok.getString() == "mips32") {
+ if (IdVal == "mips32")
return parseSetFeature(Mips::FeatureMips32);
- } else if (Tok.getString() == "mips32r2") {
+ if (IdVal == "mips32r2")
return parseSetFeature(Mips::FeatureMips32r2);
- } else if (Tok.getString() == "mips32r3") {
+ if (IdVal == "mips32r3")
return parseSetFeature(Mips::FeatureMips32r3);
- } else if (Tok.getString() == "mips32r5") {
+ if (IdVal == "mips32r5")
return parseSetFeature(Mips::FeatureMips32r5);
- } else if (Tok.getString() == "mips32r6") {
+ if (IdVal == "mips32r6")
return parseSetFeature(Mips::FeatureMips32r6);
- } else if (Tok.getString() == "mips64") {
+ if (IdVal == "mips64")
return parseSetFeature(Mips::FeatureMips64);
- } else if (Tok.getString() == "mips64r2") {
+ if (IdVal == "mips64r2")
return parseSetFeature(Mips::FeatureMips64r2);
- } else if (Tok.getString() == "mips64r3") {
+ if (IdVal == "mips64r3")
return parseSetFeature(Mips::FeatureMips64r3);
- } else if (Tok.getString() == "mips64r5") {
+ if (IdVal == "mips64r5")
return parseSetFeature(Mips::FeatureMips64r5);
- } else if (Tok.getString() == "mips64r6") {
+ if (IdVal == "mips64r6") {
if (inMicroMipsMode()) {
- Error(Tok.getLoc(), "MIPS64R6 is not supported with microMIPS");
+ Error(Loc, "MIPS64R6 is not supported with microMIPS");
return false;
}
return parseSetFeature(Mips::FeatureMips64r6);
- } else if (Tok.getString() == "dsp") {
+ }
+ if (IdVal == "dsp")
return parseSetFeature(Mips::FeatureDSP);
- } else if (Tok.getString() == "dspr2") {
+ if (IdVal == "dspr2")
return parseSetFeature(Mips::FeatureDSPR2);
- } else if (Tok.getString() == "nodsp") {
+ if (IdVal == "nodsp")
return parseSetNoDspDirective();
- } else if (Tok.getString() == "msa") {
+ if (IdVal == "msa")
return parseSetMsaDirective();
- } else if (Tok.getString() == "nomsa") {
+ if (IdVal == "nomsa")
return parseSetNoMsaDirective();
- } else if (Tok.getString() == "mt") {
+ if (IdVal == "mt")
return parseSetMtDirective();
- } else if (Tok.getString() == "nomt") {
+ if (IdVal == "nomt")
return parseSetNoMtDirective();
- } else if (Tok.getString() == "softfloat") {
+ if (IdVal == "softfloat")
return parseSetSoftFloatDirective();
- } else if (Tok.getString() == "hardfloat") {
+ if (IdVal == "hardfloat")
return parseSetHardFloatDirective();
- } else {
- // It is just an identifier, look for an assignment.
- parseSetAssignment();
- return false;
- }
-
- return true;
-}
-
-/// parseDataDirective
-/// ::= .word [ expression (, expression)* ]
-bool MipsAsmParser::parseDataDirective(unsigned Size, SMLoc L) {
- MCAsmParser &Parser = getParser();
- if (getLexer().isNot(AsmToken::EndOfStatement)) {
- while (true) {
- const MCExpr *Value;
- if (getParser().parseExpression(Value))
- return true;
-
- getParser().getStreamer().EmitValue(Value, Size);
-
- if (getLexer().is(AsmToken::EndOfStatement))
- break;
-
- if (getLexer().isNot(AsmToken::Comma))
- return Error(L, "unexpected token, expected comma");
- Parser.Lex();
- }
- }
-
- Parser.Lex();
- return false;
+ if (IdVal == "crc")
+ return parseSetFeature(Mips::FeatureCRC);
+ if (IdVal == "nocrc")
+ return parseSetNoCRCDirective();
+ if (IdVal == "virt")
+ return parseSetFeature(Mips::FeatureVirt);
+ if (IdVal == "novirt")
+ return parseSetNoVirtDirective();
+ if (IdVal == "ginv")
+ return parseSetFeature(Mips::FeatureGINV);
+ if (IdVal == "noginv")
+ return parseSetNoGINVDirective();
+
+ // It is just an identifier, look for an assignment.
+ return parseSetAssignment();
}
/// parseDirectiveGpWord
@@ -7425,6 +7571,12 @@ bool MipsAsmParser::parseSSectionDirective(StringRef Section, unsigned Type) {
/// ::= .module softfloat
/// ::= .module hardfloat
/// ::= .module mt
+/// ::= .module crc
+/// ::= .module nocrc
+/// ::= .module virt
+/// ::= .module novirt
+/// ::= .module ginv
+/// ::= .module noginv
bool MipsAsmParser::parseDirectiveModule() {
MCAsmParser &Parser = getParser();
MCAsmLexer &Lexer = getLexer();
@@ -7543,6 +7695,120 @@ bool MipsAsmParser::parseDirectiveModule() {
}
return false; // parseDirectiveModule has finished successfully.
+ } else if (Option == "crc") {
+ setModuleFeatureBits(Mips::FeatureCRC, "crc");
+
+ // Synchronize the ABI Flags information with the FeatureBits information we
+ // updated above.
+ getTargetStreamer().updateABIInfo(*this);
+
+ // If printing assembly, use the recently updated ABI Flags information.
+ // If generating ELF, don't do anything (the .MIPS.abiflags section gets
+ // emitted later).
+ getTargetStreamer().emitDirectiveModuleCRC();
+
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+
+ return false; // parseDirectiveModule has finished successfully.
+ } else if (Option == "nocrc") {
+ clearModuleFeatureBits(Mips::FeatureCRC, "crc");
+
+ // Synchronize the ABI Flags information with the FeatureBits information we
+ // updated above.
+ getTargetStreamer().updateABIInfo(*this);
+
+ // If printing assembly, use the recently updated ABI Flags information.
+ // If generating ELF, don't do anything (the .MIPS.abiflags section gets
+ // emitted later).
+ getTargetStreamer().emitDirectiveModuleNoCRC();
+
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+
+ return false; // parseDirectiveModule has finished successfully.
+ } else if (Option == "virt") {
+ setModuleFeatureBits(Mips::FeatureVirt, "virt");
+
+ // Synchronize the ABI Flags information with the FeatureBits information we
+ // updated above.
+ getTargetStreamer().updateABIInfo(*this);
+
+ // If printing assembly, use the recently updated ABI Flags information.
+ // If generating ELF, don't do anything (the .MIPS.abiflags section gets
+ // emitted later).
+ getTargetStreamer().emitDirectiveModuleVirt();
+
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+
+ return false; // parseDirectiveModule has finished successfully.
+ } else if (Option == "novirt") {
+ clearModuleFeatureBits(Mips::FeatureVirt, "virt");
+
+ // Synchronize the ABI Flags information with the FeatureBits information we
+ // updated above.
+ getTargetStreamer().updateABIInfo(*this);
+
+ // If printing assembly, use the recently updated ABI Flags information.
+ // If generating ELF, don't do anything (the .MIPS.abiflags section gets
+ // emitted later).
+ getTargetStreamer().emitDirectiveModuleNoVirt();
+
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+
+ return false; // parseDirectiveModule has finished successfully.
+ } else if (Option == "ginv") {
+ setModuleFeatureBits(Mips::FeatureGINV, "ginv");
+
+ // Synchronize the ABI Flags information with the FeatureBits information we
+ // updated above.
+ getTargetStreamer().updateABIInfo(*this);
+
+ // If printing assembly, use the recently updated ABI Flags information.
+ // If generating ELF, don't do anything (the .MIPS.abiflags section gets
+ // emitted later).
+ getTargetStreamer().emitDirectiveModuleGINV();
+
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+
+ return false; // parseDirectiveModule has finished successfully.
+ } else if (Option == "noginv") {
+ clearModuleFeatureBits(Mips::FeatureGINV, "ginv");
+
+ // Synchronize the ABI Flags information with the FeatureBits information we
+ // updated above.
+ getTargetStreamer().updateABIInfo(*this);
+
+ // If printing assembly, use the recently updated ABI Flags information.
+ // If generating ELF, don't do anything (the .MIPS.abiflags section gets
+ // emitted later).
+ getTargetStreamer().emitDirectiveModuleNoGINV();
+
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+
+ return false; // parseDirectiveModule has finished successfully.
} else {
return Error(L, "'" + Twine(Option) + "' is not a valid .module option.");
}
@@ -7672,10 +7938,6 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
parseDirectiveCpRestore(DirectiveID.getLoc());
return false;
}
- if (IDVal == ".dword") {
- parseDataDirective(8, DirectiveID.getLoc());
- return false;
- }
if (IDVal == ".ent") {
StringRef SymbolName;
@@ -7923,16 +8185,6 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
return false;
}
- if (IDVal == ".word") {
- parseDataDirective(4, DirectiveID.getLoc());
- return false;
- }
-
- if (IDVal == ".hword") {
- parseDataDirective(2, DirectiveID.getLoc());
- return false;
- }
-
if (IDVal == ".option") {
parseDirectiveOption();
return false;
diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt
index 40e337eb97ca..2cacc0a0870c 100644
--- a/lib/Target/Mips/CMakeLists.txt
+++ b/lib/Target/Mips/CMakeLists.txt
@@ -1,16 +1,19 @@
set(LLVM_TARGET_DEFINITIONS Mips.td)
-tablegen(LLVM MipsGenRegisterInfo.inc -gen-register-info)
-tablegen(LLVM MipsGenInstrInfo.inc -gen-instr-info)
-tablegen(LLVM MipsGenDisassemblerTables.inc -gen-disassembler)
-tablegen(LLVM MipsGenMCCodeEmitter.inc -gen-emitter)
+tablegen(LLVM MipsGenAsmMatcher.inc -gen-asm-matcher)
tablegen(LLVM MipsGenAsmWriter.inc -gen-asm-writer)
+tablegen(LLVM MipsGenCallingConv.inc -gen-callingconv)
tablegen(LLVM MipsGenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM MipsGenDisassemblerTables.inc -gen-disassembler)
tablegen(LLVM MipsGenFastISel.inc -gen-fast-isel)
-tablegen(LLVM MipsGenCallingConv.inc -gen-callingconv)
-tablegen(LLVM MipsGenSubtargetInfo.inc -gen-subtarget)
-tablegen(LLVM MipsGenAsmMatcher.inc -gen-asm-matcher)
+tablegen(LLVM MipsGenGlobalISel.inc -gen-global-isel)
+tablegen(LLVM MipsGenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM MipsGenMCCodeEmitter.inc -gen-emitter)
tablegen(LLVM MipsGenMCPseudoLowering.inc -gen-pseudo-lowering)
+tablegen(LLVM MipsGenRegisterBank.inc -gen-register-bank)
+tablegen(LLVM MipsGenRegisterInfo.inc -gen-register-info)
+tablegen(LLVM MipsGenSubtargetInfo.inc -gen-subtarget)
+
add_public_tablegen_target(MipsCommonTableGen)
add_llvm_target(MipsCodeGen
@@ -23,21 +26,25 @@ add_llvm_target(MipsCodeGen
Mips16RegisterInfo.cpp
MipsAnalyzeImmediate.cpp
MipsAsmPrinter.cpp
+ MipsCallLowering.cpp
MipsCCState.cpp
MipsConstantIslandPass.cpp
MipsDelaySlotFiller.cpp
+ MipsExpandPseudo.cpp
MipsFastISel.cpp
- MipsHazardSchedule.cpp
MipsInstrInfo.cpp
+ MipsInstructionSelector.cpp
MipsISelDAGToDAG.cpp
MipsISelLowering.cpp
MipsFrameLowering.cpp
- MipsLongBranch.cpp
+ MipsLegalizerInfo.cpp
+ MipsBranchExpansion.cpp
MipsMCInstLower.cpp
MipsMachineFunction.cpp
MipsModuleISelDAGToDAG.cpp
MipsOptimizePICCall.cpp
MipsOs16.cpp
+ MipsRegisterBankInfo.cpp
MipsRegisterInfo.cpp
MipsSEFrameLowering.cpp
MipsSEInstrInfo.cpp
@@ -50,9 +57,8 @@ add_llvm_target(MipsCodeGen
MicroMipsSizeReduction.cpp
)
-add_subdirectory(InstPrinter)
+add_subdirectory(AsmParser)
add_subdirectory(Disassembler)
-add_subdirectory(TargetInfo)
+add_subdirectory(InstPrinter)
add_subdirectory(MCTargetDesc)
-add_subdirectory(AsmParser)
-
+add_subdirectory(TargetInfo)
diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index ef0f08b49850..b94afb9520e3 100644
--- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -277,11 +277,6 @@ static DecodeStatus DecodeMemEVA(MCInst &Inst,
uint64_t Address,
const void *Decoder);
-static DecodeStatus DecodeLoadByte9(MCInst &Inst,
- unsigned Insn,
- uint64_t Address,
- const void *Decoder);
-
static DecodeStatus DecodeLoadByte15(MCInst &Inst,
unsigned Insn,
uint64_t Address,
@@ -300,11 +295,6 @@ static DecodeStatus DecodeCacheOpMM(MCInst &Inst,
uint64_t Address,
const void *Decoder);
-static DecodeStatus DecodeStoreEvaOpMM(MCInst &Inst,
- unsigned Insn,
- uint64_t Address,
- const void *Decoder);
-
static DecodeStatus DecodePrefeOpMM(MCInst &Inst,
unsigned Insn,
uint64_t Address,
@@ -315,6 +305,11 @@ static DecodeStatus DecodeSyncI(MCInst &Inst,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeSyncI_MM(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
static DecodeStatus DecodeSynciR6(MCInst &Inst,
unsigned Insn,
uint64_t Address,
@@ -527,6 +522,10 @@ template <typename InsnType>
static DecodeStatus DecodeDEXT(MCInst &MI, InsnType Insn, uint64_t Address,
const void *Decoder);
+template <typename InsnType>
+static DecodeStatus DecodeCRC(MCInst &MI, InsnType Insn, uint64_t Address,
+ const void *Decoder);
+
static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Insn,
uint64_t Address,
const void *Decoder);
@@ -1139,6 +1138,22 @@ static DecodeStatus DecodeDINS(MCInst &MI, InsnType Insn, uint64_t Address,
return MCDisassembler::Success;
}
+
+// The auto-generated decoder does not add the third operand for CRC32*.
+template <typename InsnType>
+static DecodeStatus DecodeCRC(MCInst &MI, InsnType Insn, uint64_t Address,
+ const void *Decoder) {
+ InsnType Rs = fieldFromInstruction(Insn, 21, 5);
+ InsnType Rt = fieldFromInstruction(Insn, 16, 5);
+ MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+ Rt)));
+ MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+ Rs)));
+ MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+ Rt)));
+ return MCDisassembler::Success;
+}
+
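
DecodeCRC reads rs and rt from fixed bit ranges of the 32-bit encoding and adds rt twice so the tied destination operand is present. A standalone sketch of the field extraction it relies on; field() is a local stand-in for fieldFromInstruction:

  #include <cassert>
  #include <cstdint>

  // Extract NumBits bits of Insn starting at StartBit (bit 0 = LSB).
  static uint32_t field(uint32_t Insn, unsigned StartBit, unsigned NumBits) {
    return (Insn >> StartBit) & ((1u << NumBits) - 1u);
  }

  int main() {
    // Hypothetical encoding with rs = 7 in bits [25:21] and rt = 3 in [20:16].
    uint32_t Insn = (7u << 21) | (3u << 16);
    assert(field(Insn, 21, 5) == 7); // rs
    assert(field(Insn, 16, 5) == 3); // rt, used as both source and destination
    return 0;
  }
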
/// Read two bytes from the ArrayRef and return 16 bit halfword sorted
/// according to the given endianness.
static DecodeStatus readInstruction16(ArrayRef<uint8_t> Bytes, uint64_t Address,
@@ -1210,7 +1225,8 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
return MCDisassembler::Fail;
if (hasMips32r6()) {
- DEBUG(dbgs() << "Trying MicroMipsR616 table (16-bit instructions):\n");
+ LLVM_DEBUG(
+ dbgs() << "Trying MicroMipsR616 table (16-bit instructions):\n");
// Calling the auto-generated decoder function for microMIPS32R6
// 16-bit instructions.
Result = decodeInstruction(DecoderTableMicroMipsR616, Instr, Insn,
@@ -1221,7 +1237,7 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
}
}
- DEBUG(dbgs() << "Trying MicroMips16 table (16-bit instructions):\n");
+ LLVM_DEBUG(dbgs() << "Trying MicroMips16 table (16-bit instructions):\n");
// Calling the auto-generated decoder function for microMIPS 16-bit
// instructions.
Result = decodeInstruction(DecoderTableMicroMips16, Instr, Insn, Address,
@@ -1236,7 +1252,8 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
return MCDisassembler::Fail;
if (hasMips32r6()) {
- DEBUG(dbgs() << "Trying MicroMips32r632 table (32-bit instructions):\n");
+ LLVM_DEBUG(
+ dbgs() << "Trying MicroMips32r632 table (32-bit instructions):\n");
// Calling the auto-generated decoder function.
Result = decodeInstruction(DecoderTableMicroMipsR632, Instr, Insn, Address,
this, STI);
@@ -1246,7 +1263,7 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
}
}
- DEBUG(dbgs() << "Trying MicroMips32 table (32-bit instructions):\n");
+ LLVM_DEBUG(dbgs() << "Trying MicroMips32 table (32-bit instructions):\n");
// Calling the auto-generated decoder function.
Result = decodeInstruction(DecoderTableMicroMips32, Instr, Insn, Address,
this, STI);
@@ -1256,7 +1273,7 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
}
if (isFP64()) {
- DEBUG(dbgs() << "Trying MicroMipsFP64 table (32-bit opcodes):\n");
+ LLVM_DEBUG(dbgs() << "Trying MicroMipsFP64 table (32-bit opcodes):\n");
Result = decodeInstruction(DecoderTableMicroMipsFP6432, Instr, Insn,
Address, this, STI);
if (Result != MCDisassembler::Fail) {
@@ -1285,7 +1302,7 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
Size = 4;
if (hasCOP3()) {
- DEBUG(dbgs() << "Trying COP3_ table (32-bit opcodes):\n");
+ LLVM_DEBUG(dbgs() << "Trying COP3_ table (32-bit opcodes):\n");
Result =
decodeInstruction(DecoderTableCOP3_32, Instr, Insn, Address, this, STI);
if (Result != MCDisassembler::Fail)
@@ -1293,7 +1310,8 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
}
if (hasMips32r6() && isGP64()) {
- DEBUG(dbgs() << "Trying Mips32r6_64r6 (GPR64) table (32-bit opcodes):\n");
+ LLVM_DEBUG(
+ dbgs() << "Trying Mips32r6_64r6 (GPR64) table (32-bit opcodes):\n");
Result = decodeInstruction(DecoderTableMips32r6_64r6_GP6432, Instr, Insn,
Address, this, STI);
if (Result != MCDisassembler::Fail)
@@ -1301,7 +1319,8 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
}
if (hasMips32r6() && isPTR64()) {
- DEBUG(dbgs() << "Trying Mips32r6_64r6 (PTR64) table (32-bit opcodes):\n");
+ LLVM_DEBUG(
+ dbgs() << "Trying Mips32r6_64r6 (PTR64) table (32-bit opcodes):\n");
Result = decodeInstruction(DecoderTableMips32r6_64r6_PTR6432, Instr, Insn,
Address, this, STI);
if (Result != MCDisassembler::Fail)
@@ -1309,7 +1328,7 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
}
if (hasMips32r6()) {
- DEBUG(dbgs() << "Trying Mips32r6_64r6 table (32-bit opcodes):\n");
+ LLVM_DEBUG(dbgs() << "Trying Mips32r6_64r6 table (32-bit opcodes):\n");
Result = decodeInstruction(DecoderTableMips32r6_64r632, Instr, Insn,
Address, this, STI);
if (Result != MCDisassembler::Fail)
@@ -1317,7 +1336,8 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
}
if (hasMips2() && isPTR64()) {
- DEBUG(dbgs() << "Trying Mips32r6_64r6 (PTR64) table (32-bit opcodes):\n");
+ LLVM_DEBUG(
+ dbgs() << "Trying Mips32r6_64r6 (PTR64) table (32-bit opcodes):\n");
Result = decodeInstruction(DecoderTableMips32_64_PTR6432, Instr, Insn,
Address, this, STI);
if (Result != MCDisassembler::Fail)
@@ -1325,7 +1345,7 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
}
if (hasCnMips()) {
- DEBUG(dbgs() << "Trying CnMips table (32-bit opcodes):\n");
+ LLVM_DEBUG(dbgs() << "Trying CnMips table (32-bit opcodes):\n");
Result = decodeInstruction(DecoderTableCnMips32, Instr, Insn,
Address, this, STI);
if (Result != MCDisassembler::Fail)
@@ -1333,7 +1353,7 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
}
if (isGP64()) {
- DEBUG(dbgs() << "Trying Mips64 (GPR64) table (32-bit opcodes):\n");
+ LLVM_DEBUG(dbgs() << "Trying Mips64 (GPR64) table (32-bit opcodes):\n");
Result = decodeInstruction(DecoderTableMips6432, Instr, Insn,
Address, this, STI);
if (Result != MCDisassembler::Fail)
@@ -1341,14 +1361,15 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
}
if (isFP64()) {
- DEBUG(dbgs() << "Trying MipsFP64 (64 bit FPU) table (32-bit opcodes):\n");
+ LLVM_DEBUG(
+ dbgs() << "Trying MipsFP64 (64 bit FPU) table (32-bit opcodes):\n");
Result = decodeInstruction(DecoderTableMipsFP6432, Instr, Insn,
Address, this, STI);
if (Result != MCDisassembler::Fail)
return Result;
}
- DEBUG(dbgs() << "Trying Mips table (32-bit opcodes):\n");
+ LLVM_DEBUG(dbgs() << "Trying Mips table (32-bit opcodes):\n");
// Calling the auto-generated decoder function.
Result =
decodeInstruction(DecoderTableMips32, Instr, Insn, Address, this, STI);
@@ -1538,24 +1559,6 @@ static DecodeStatus DecodeMemEVA(MCInst &Inst,
return MCDisassembler::Success;
}
-static DecodeStatus DecodeLoadByte9(MCInst &Inst,
- unsigned Insn,
- uint64_t Address,
- const void *Decoder) {
- int Offset = SignExtend32<9>(Insn & 0x1ff);
- unsigned Base = fieldFromInstruction(Insn, 16, 5);
- unsigned Reg = fieldFromInstruction(Insn, 21, 5);
-
- Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
- Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
-
- Inst.addOperand(MCOperand::createReg(Reg));
- Inst.addOperand(MCOperand::createReg(Base));
- Inst.addOperand(MCOperand::createImm(Offset));
-
- return MCDisassembler::Success;
-}
-
static DecodeStatus DecodeLoadByte15(MCInst &Inst,
unsigned Insn,
uint64_t Address,
@@ -1642,30 +1645,25 @@ static DecodeStatus DecodeCacheeOp_CacheOpR6(MCInst &Inst,
return MCDisassembler::Success;
}
-static DecodeStatus DecodeStoreEvaOpMM(MCInst &Inst,
- unsigned Insn,
- uint64_t Address,
- const void *Decoder) {
- int Offset = SignExtend32<9>(Insn & 0x1ff);
- unsigned Reg = fieldFromInstruction(Insn, 21, 5);
- unsigned Base = fieldFromInstruction(Insn, 16, 5);
+static DecodeStatus DecodeSyncI(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder) {
+ int Offset = SignExtend32<16>(Insn & 0xffff);
+ unsigned Base = fieldFromInstruction(Insn, 21, 5);
- Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
- Inst.addOperand(MCOperand::createReg(Reg));
Inst.addOperand(MCOperand::createReg(Base));
Inst.addOperand(MCOperand::createImm(Offset));
return MCDisassembler::Success;
}
-static DecodeStatus DecodeSyncI(MCInst &Inst,
- unsigned Insn,
- uint64_t Address,
- const void *Decoder) {
+static DecodeStatus DecodeSyncI_MM(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
int Offset = SignExtend32<16>(Insn & 0xffff);
- unsigned Base = fieldFromInstruction(Insn, 21, 5);
+ unsigned Base = fieldFromInstruction(Insn, 16, 5);
Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
@@ -1862,7 +1860,7 @@ static DecodeStatus DecodeMemMMImm9(MCInst &Inst,
Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
- if (Inst.getOpcode() == Mips::SCE_MM)
+ if (Inst.getOpcode() == Mips::SCE_MM || Inst.getOpcode() == Mips::SC_MMR6)
Inst.addOperand(MCOperand::createReg(Reg));
Inst.addOperand(MCOperand::createReg(Reg));
@@ -1897,8 +1895,7 @@ static DecodeStatus DecodeMemMMImm12(MCInst &Inst,
LLVM_FALLTHROUGH;
default:
Inst.addOperand(MCOperand::createReg(Reg));
- if (Inst.getOpcode() == Mips::LWP_MM || Inst.getOpcode() == Mips::SWP_MM ||
- Inst.getOpcode() == Mips::LWP_MMR6 || Inst.getOpcode() == Mips::SWP_MMR6)
+ if (Inst.getOpcode() == Mips::LWP_MM || Inst.getOpcode() == Mips::SWP_MM)
Inst.addOperand(MCOperand::createReg(Reg+1));
Inst.addOperand(MCOperand::createReg(Base));
diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
index 1d125d0dbae6..73732a40bb8a 100644
--- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
+++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
@@ -197,11 +197,6 @@ printFCCOperand(const MCInst *MI, int opNum, raw_ostream &O) {
}
void MipsInstPrinter::
-printRegisterPair(const MCInst *MI, int opNum, raw_ostream &O) {
- printRegName(O, MI->getOperand(opNum).getReg());
-}
-
-void MipsInstPrinter::
printSHFMask(const MCInst *MI, int opNum, raw_ostream &O) {
llvm_unreachable("TODO");
}
diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
index 4a76b5acac79..f02443ee21d3 100644
--- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
+++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
@@ -98,7 +98,6 @@ private:
void printMemOperand(const MCInst *MI, int opNum, raw_ostream &O);
void printMemOperandEA(const MCInst *MI, int opNum, raw_ostream &O);
void printFCCOperand(const MCInst *MI, int opNum, raw_ostream &O);
- void printRegisterPair(const MCInst *MI, int opNum, raw_ostream &O);
void printSHFMask(const MCInst *MI, int opNum, raw_ostream &O);
bool printAlias(const char *Str, const MCInst &MI, unsigned OpNo,
diff --git a/lib/Target/Mips/LLVMBuild.txt b/lib/Target/Mips/LLVMBuild.txt
index 06af8a10a4d2..4b6851f7af69 100644
--- a/lib/Target/Mips/LLVMBuild.txt
+++ b/lib/Target/Mips/LLVMBuild.txt
@@ -43,4 +43,5 @@ required_libraries =
SelectionDAG
Support
Target
+ GlobalISel
add_to_library_groups = Mips
diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
index 9abd4f1d6b08..68bf3829aab5 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
@@ -161,6 +161,12 @@ public:
ASESet |= Mips::AFL_ASE_MIPS16;
if (P.hasMT())
ASESet |= Mips::AFL_ASE_MT;
+ if (P.hasCRC())
+ ASESet |= Mips::AFL_ASE_CRC;
+ if (P.hasVirt())
+ ASESet |= Mips::AFL_ASE_VIRT;
+ if (P.hasGINV())
+ ASESet |= Mips::AFL_ASE_GINV;
}
template <class PredicateLibrary>
diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp
index 498ea6fda4b3..bf1390880281 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp
@@ -57,7 +57,7 @@ MipsABIInfo MipsABIInfo::computeTargetABI(const Triple &TT, StringRef CPU,
return MipsABIInfo::N64();
assert(Options.getABIName().empty() && "Unknown ABI option for MIPS");
- if (TT.getArch() == Triple::mips64 || TT.getArch() == Triple::mips64el)
+ if (TT.isMIPS64())
return MipsABIInfo::N64();
return MipsABIInfo::O32();
}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index 1ad524c06969..4397c971d080 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -16,6 +16,7 @@
#include "MCTargetDesc/MipsFixupKinds.h"
#include "MCTargetDesc/MipsMCExpr.h"
#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
@@ -53,6 +54,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
case Mips::fixup_Mips_GOT_DISP:
case Mips::fixup_Mips_GOT_LO16:
case Mips::fixup_Mips_CALL_LO16:
+ case Mips::fixup_MICROMIPS_GPOFF_HI:
+ case Mips::fixup_MICROMIPS_GPOFF_LO:
case Mips::fixup_MICROMIPS_LO16:
case Mips::fixup_MICROMIPS_GOT_PAGE:
case Mips::fixup_MICROMIPS_GOT_OFST:
@@ -107,10 +110,12 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
Value = ((Value + 0x8000) >> 16) & 0xffff;
break;
case Mips::fixup_Mips_HIGHER:
+ case Mips::fixup_MICROMIPS_HIGHER:
// Get the 3rd 16-bits.
Value = ((Value + 0x80008000LL) >> 32) & 0xffff;
break;
case Mips::fixup_Mips_HIGHEST:
+ case Mips::fixup_MICROMIPS_HIGHEST:
// Get the 4th 16-bits.
Value = ((Value + 0x800080008000LL) >> 48) & 0xffff;
break;
@@ -210,9 +215,9 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
return Value;
}
-std::unique_ptr<MCObjectWriter>
-MipsAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
- return createMipsELFObjectWriter(OS, TheTriple, IsN32);
+std::unique_ptr<MCObjectTargetWriter>
+MipsAsmBackend::createObjectTargetWriter() const {
+ return createMipsELFObjectWriter(TheTriple, IsN32);
}
// Little-endian fixup data byte ordering:
@@ -238,7 +243,8 @@ static unsigned calculateMMLEIndex(unsigned i) {
void MipsAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target,
MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) const {
+ bool IsResolved,
+ const MCSubtargetInfo *STI) const {
MCFixupKind Kind = Fixup.getKind();
MCContext &Ctx = Asm.getContext();
Value = adjustFixupValue(Fixup, Value, Ctx);
@@ -275,9 +281,9 @@ void MipsAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
bool microMipsLEByteOrder = needsMMLEByteOrder((unsigned) Kind);
for (unsigned i = 0; i != NumBytes; ++i) {
- unsigned Idx = IsLittle ? (microMipsLEByteOrder ? calculateMMLEIndex(i)
- : i)
- : (FullSize - 1 - i);
+ unsigned Idx = Endian == support::little
+ ? (microMipsLEByteOrder ? calculateMMLEIndex(i) : i)
+ : (FullSize - 1 - i);
CurVal |= (uint64_t)((uint8_t)Data[Offset + Idx]) << (i*8);
}
@@ -287,9 +293,9 @@ void MipsAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
// Write out the fixed up bytes back to the code/data bits.
for (unsigned i = 0; i != NumBytes; ++i) {
- unsigned Idx = IsLittle ? (microMipsLEByteOrder ? calculateMMLEIndex(i)
- : i)
- : (FullSize - 1 - i);
+ unsigned Idx = Endian == support::little
+ ? (microMipsLEByteOrder ? calculateMMLEIndex(i) : i)
+ : (FullSize - 1 - i);
Data[Offset + Idx] = (uint8_t)((CurVal >> (i*8)) & 0xff);
}
}
@@ -298,12 +304,46 @@ Optional<MCFixupKind> MipsAsmBackend::getFixupKind(StringRef Name) const {
return StringSwitch<Optional<MCFixupKind>>(Name)
.Case("R_MIPS_NONE", (MCFixupKind)Mips::fixup_Mips_NONE)
.Case("R_MIPS_32", FK_Data_4)
+ .Case("R_MIPS_GOT_PAGE", (MCFixupKind)Mips::fixup_Mips_GOT_PAGE)
+ .Case("R_MIPS_CALL_HI16", (MCFixupKind)Mips::fixup_Mips_CALL_HI16)
+ .Case("R_MIPS_CALL_LO16", (MCFixupKind)Mips::fixup_Mips_CALL_LO16)
+ .Case("R_MIPS_CALL16", (MCFixupKind)Mips::fixup_Mips_CALL16)
+ .Case("R_MIPS_GOT16", (MCFixupKind)Mips::fixup_Mips_GOT)
+ .Case("R_MIPS_GOT_PAGE", (MCFixupKind)Mips::fixup_Mips_GOT_PAGE)
+ .Case("R_MIPS_GOT_OFST", (MCFixupKind)Mips::fixup_Mips_GOT_OFST)
+ .Case("R_MIPS_GOT_DISP", (MCFixupKind)Mips::fixup_Mips_GOT_DISP)
+ .Case("R_MIPS_GOT_HI16", (MCFixupKind)Mips::fixup_Mips_GOT_HI16)
+ .Case("R_MIPS_GOT_LO16", (MCFixupKind)Mips::fixup_Mips_GOT_LO16)
+ .Case("R_MIPS_TLS_GOTTPREL", (MCFixupKind)Mips::fixup_Mips_GOTTPREL)
+ .Case("R_MIPS_TLS_DTPREL_HI16", (MCFixupKind)Mips::fixup_Mips_DTPREL_HI)
+ .Case("R_MIPS_TLS_DTPREL_LO16", (MCFixupKind)Mips::fixup_Mips_DTPREL_LO)
+ .Case("R_MIPS_TLS_GD", (MCFixupKind)Mips::fixup_Mips_TLSGD)
+ .Case("R_MIPS_TLS_LDM", (MCFixupKind)Mips::fixup_Mips_TLSLDM)
+ .Case("R_MIPS_TLS_TPREL_HI16", (MCFixupKind)Mips::fixup_Mips_TPREL_HI)
+ .Case("R_MIPS_TLS_TPREL_LO16", (MCFixupKind)Mips::fixup_Mips_TPREL_LO)
+ .Case("R_MICROMIPS_CALL16", (MCFixupKind)Mips::fixup_MICROMIPS_CALL16)
+ .Case("R_MICROMIPS_GOT_DISP", (MCFixupKind)Mips::fixup_MICROMIPS_GOT_DISP)
+ .Case("R_MICROMIPS_GOT_PAGE", (MCFixupKind)Mips::fixup_MICROMIPS_GOT_PAGE)
+ .Case("R_MICROMIPS_GOT_OFST", (MCFixupKind)Mips::fixup_MICROMIPS_GOT_OFST)
+ .Case("R_MICROMIPS_GOT16", (MCFixupKind)Mips::fixup_MICROMIPS_GOT16)
+ .Case("R_MICROMIPS_TLS_GOTTPREL",
+ (MCFixupKind)Mips::fixup_MICROMIPS_GOTTPREL)
+ .Case("R_MICROMIPS_TLS_DTPREL_HI16",
+ (MCFixupKind)Mips::fixup_MICROMIPS_TLS_DTPREL_HI16)
+ .Case("R_MICROMIPS_TLS_DTPREL_LO16",
+ (MCFixupKind)Mips::fixup_MICROMIPS_TLS_DTPREL_LO16)
+ .Case("R_MICROMIPS_TLS_GD", (MCFixupKind)Mips::fixup_MICROMIPS_TLS_GD)
+ .Case("R_MICROMIPS_TLS_LDM", (MCFixupKind)Mips::fixup_MICROMIPS_TLS_LDM)
+ .Case("R_MICROMIPS_TLS_TPREL_HI16",
+ (MCFixupKind)Mips::fixup_MICROMIPS_TLS_TPREL_HI16)
+ .Case("R_MICROMIPS_TLS_TPREL_LO16",
+ (MCFixupKind)Mips::fixup_MICROMIPS_TLS_TPREL_LO16)
.Default(MCAsmBackend::getFixupKind(Name));
}
const MCFixupKindInfo &MipsAsmBackend::
getFixupKindInfo(MCFixupKind Kind) const {
- const static MCFixupKindInfo LittleEndianInfos[Mips::NumTargetFixupKinds] = {
+ const static MCFixupKindInfo LittleEndianInfos[] = {
// This table *must* be in same the order of fixup_* kinds in
// MipsFixupKinds.h.
//
@@ -333,12 +373,16 @@ getFixupKindInfo(MCFixupKind Kind) const {
{ "fixup_Mips_DTPREL_LO", 0, 16, 0 },
{ "fixup_Mips_Branch_PCRel", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_Mips_GPOFF_HI", 0, 16, 0 },
+ { "fixup_MICROMIPS_GPOFF_HI",0, 16, 0 },
{ "fixup_Mips_GPOFF_LO", 0, 16, 0 },
+ { "fixup_MICROMIPS_GPOFF_LO",0, 16, 0 },
{ "fixup_Mips_GOT_PAGE", 0, 16, 0 },
{ "fixup_Mips_GOT_OFST", 0, 16, 0 },
{ "fixup_Mips_GOT_DISP", 0, 16, 0 },
{ "fixup_Mips_HIGHER", 0, 16, 0 },
+ { "fixup_MICROMIPS_HIGHER", 0, 16, 0 },
{ "fixup_Mips_HIGHEST", 0, 16, 0 },
+ { "fixup_MICROMIPS_HIGHEST", 0, 16, 0 },
{ "fixup_Mips_GOT_HI16", 0, 16, 0 },
{ "fixup_Mips_GOT_LO16", 0, 16, 0 },
{ "fixup_Mips_CALL_HI16", 0, 16, 0 },
@@ -374,8 +418,10 @@ getFixupKindInfo(MCFixupKind Kind) const {
{ "fixup_Mips_SUB", 0, 64, 0 },
{ "fixup_MICROMIPS_SUB", 0, 64, 0 }
};
+ static_assert(array_lengthof(LittleEndianInfos) == Mips::NumTargetFixupKinds,
+ "Not all MIPS little endian fixup kinds added!");
- const static MCFixupKindInfo BigEndianInfos[Mips::NumTargetFixupKinds] = {
+ const static MCFixupKindInfo BigEndianInfos[] = {
// This table *must* be in same the order of fixup_* kinds in
// MipsFixupKinds.h.
//
@@ -405,12 +451,16 @@ getFixupKindInfo(MCFixupKind Kind) const {
{ "fixup_Mips_DTPREL_LO", 16, 16, 0 },
{ "fixup_Mips_Branch_PCRel",16, 16, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_Mips_GPOFF_HI", 16, 16, 0 },
+ { "fixup_MICROMIPS_GPOFF_HI", 16, 16, 0 },
{ "fixup_Mips_GPOFF_LO", 16, 16, 0 },
+ { "fixup_MICROMIPS_GPOFF_LO", 16, 16, 0 },
{ "fixup_Mips_GOT_PAGE", 16, 16, 0 },
{ "fixup_Mips_GOT_OFST", 16, 16, 0 },
{ "fixup_Mips_GOT_DISP", 16, 16, 0 },
{ "fixup_Mips_HIGHER", 16, 16, 0 },
+ { "fixup_MICROMIPS_HIGHER", 16, 16, 0 },
{ "fixup_Mips_HIGHEST", 16, 16, 0 },
+ { "fixup_MICROMIPS_HIGHEST",16, 16, 0 },
{ "fixup_Mips_GOT_HI16", 16, 16, 0 },
{ "fixup_Mips_GOT_LO16", 16, 16, 0 },
{ "fixup_Mips_CALL_HI16", 16, 16, 0 },
@@ -446,6 +496,8 @@ getFixupKindInfo(MCFixupKind Kind) const {
{ "fixup_Mips_SUB", 0, 64, 0 },
{ "fixup_MICROMIPS_SUB", 0, 64, 0 }
};
+ static_assert(array_lengthof(BigEndianInfos) == Mips::NumTargetFixupKinds,
+ "Not all MIPS big endian fixup kinds added!");
if (Kind < FirstTargetFixupKind)
return MCAsmBackend::getFixupKindInfo(Kind);
@@ -453,7 +505,7 @@ getFixupKindInfo(MCFixupKind Kind) const {
assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
"Invalid kind!");
- if (IsLittle)
+ if (Endian == support::little)
return LittleEndianInfos[Kind - FirstTargetFixupKind];
return BigEndianInfos[Kind - FirstTargetFixupKind];
}
@@ -463,7 +515,7 @@ getFixupKindInfo(MCFixupKind Kind) const {
/// it should return an error.
///
/// \return - True on success.
-bool MipsAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+bool MipsAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
// Check for a less than instruction size number of bytes
// FIXME: 16 bit instructions are not handled yet here.
// We shouldn't be using a hard coded number for instruction size.
@@ -471,13 +523,55 @@ bool MipsAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
// If the count is not 4-byte aligned, we must be writing data into the text
// section (otherwise we have unaligned instructions, and thus have far
// bigger problems), so just write zeros instead.
- OW->WriteZeros(Count);
+ OS.write_zeros(Count);
return true;
}
+bool MipsAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
+ const MCFixup &Fixup,
+ const MCValue &Target) {
+ const unsigned FixupKind = Fixup.getKind();
+ switch (FixupKind) {
+ default:
+ return false;
+ // All these relocations require special processing
+ // at linking time. Delegate this work to a linker.
+ case Mips::fixup_Mips_CALL_HI16:
+ case Mips::fixup_Mips_CALL_LO16:
+ case Mips::fixup_Mips_CALL16:
+ case Mips::fixup_Mips_GOT:
+ case Mips::fixup_Mips_GOT_PAGE:
+ case Mips::fixup_Mips_GOT_OFST:
+ case Mips::fixup_Mips_GOT_DISP:
+ case Mips::fixup_Mips_GOT_HI16:
+ case Mips::fixup_Mips_GOT_LO16:
+ case Mips::fixup_Mips_GOTTPREL:
+ case Mips::fixup_Mips_DTPREL_HI:
+ case Mips::fixup_Mips_DTPREL_LO:
+ case Mips::fixup_Mips_TLSGD:
+ case Mips::fixup_Mips_TLSLDM:
+ case Mips::fixup_Mips_TPREL_HI:
+ case Mips::fixup_Mips_TPREL_LO:
+ case Mips::fixup_MICROMIPS_CALL16:
+ case Mips::fixup_MICROMIPS_GOT_DISP:
+ case Mips::fixup_MICROMIPS_GOT_PAGE:
+ case Mips::fixup_MICROMIPS_GOT_OFST:
+ case Mips::fixup_MICROMIPS_GOT16:
+ case Mips::fixup_MICROMIPS_GOTTPREL:
+ case Mips::fixup_MICROMIPS_TLS_DTPREL_HI16:
+ case Mips::fixup_MICROMIPS_TLS_DTPREL_LO16:
+ case Mips::fixup_MICROMIPS_TLS_GD:
+ case Mips::fixup_MICROMIPS_TLS_LDM:
+ case Mips::fixup_MICROMIPS_TLS_TPREL_HI16:
+ case Mips::fixup_MICROMIPS_TLS_TPREL_LO16:
+ return true;
+ }
+}
+
MCAsmBackend *llvm::createMipsAsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
const MCTargetOptions &Options) {
- return new MipsAsmBackend(T, MRI, TT, CPU, Options.ABIName == "n32");
+ return new MipsAsmBackend(T, MRI, STI.getTargetTriple(), STI.getCPU(),
+ Options.ABIName == "n32");
}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
index 406b820edae5..3d5e16fcf9b4 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
@@ -29,20 +29,21 @@ class Target;
class MipsAsmBackend : public MCAsmBackend {
Triple TheTriple;
- bool IsLittle; // Big or little endian
bool IsN32;
public:
MipsAsmBackend(const Target &T, const MCRegisterInfo &MRI, const Triple &TT,
StringRef CPU, bool N32)
- : TheTriple(TT), IsLittle(TT.isLittleEndian()), IsN32(N32) {}
+ : MCAsmBackend(TT.isLittleEndian() ? support::little : support::big),
+ TheTriple(TT), IsN32(N32) {}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override;
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override;
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsResolved) const override;
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const override;
Optional<MCFixupKind> getFixupKind(StringRef Name) const override;
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
@@ -58,7 +59,8 @@ public:
/// relaxation.
///
/// \param Inst - The instruction to test.
- bool mayNeedRelaxation(const MCInst &Inst) const override {
+ bool mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const override {
return false;
}
@@ -83,7 +85,10 @@ public:
/// @}
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
+
+ bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target) override;
}; // class MipsAsmBackend
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index 6d2f098a6b32..3dc753772e5f 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -56,8 +56,7 @@ raw_ostream &operator<<(raw_ostream &OS, const MipsRelocationEntry &RHS) {
class MipsELFObjectWriter : public MCELFObjectTargetWriter {
public:
- MipsELFObjectWriter(uint8_t OSABI, bool HasRelocationAddend, bool Is64,
- bool IsLittleEndian);
+ MipsELFObjectWriter(uint8_t OSABI, bool HasRelocationAddend, bool Is64);
~MipsELFObjectWriter() override = default;
@@ -116,15 +115,15 @@ static InputIt find_best(InputIt First, InputIt Last, UnaryPredicate Predicate,
for (InputIt I = First; I != Last; ++I) {
unsigned Matched = Predicate(*I);
if (Matched != FindBest_NoMatch) {
- DEBUG(dbgs() << std::distance(First, I) << " is a match (";
- I->print(dbgs()); dbgs() << ")\n");
+ LLVM_DEBUG(dbgs() << std::distance(First, I) << " is a match (";
+ I->print(dbgs()); dbgs() << ")\n");
if (Best == Last || BetterThan(*I, *Best)) {
- DEBUG(dbgs() << ".. and it beats the last one\n");
+ LLVM_DEBUG(dbgs() << ".. and it beats the last one\n");
Best = I;
}
}
if (Matched == FindBest_PerfectMatch) {
- DEBUG(dbgs() << ".. and it is unbeatable\n");
+ LLVM_DEBUG(dbgs() << ".. and it is unbeatable\n");
break;
}
}
@@ -148,7 +147,8 @@ static unsigned getMatchingLoType(const ELFRelocationEntry &Reloc) {
if (Type == ELF::R_MIPS16_HI16)
return ELF::R_MIPS16_LO16;
- if (Reloc.OriginalSymbol->getBinding() != ELF::STB_LOCAL)
+ if (Reloc.OriginalSymbol &&
+ Reloc.OriginalSymbol->getBinding() != ELF::STB_LOCAL)
return ELF::R_MIPS_NONE;
if (Type == ELF::R_MIPS_GOT16)
@@ -211,8 +211,7 @@ static void dumpRelocs(const char *Prefix, const Container &Relocs) {
#endif
MipsELFObjectWriter::MipsELFObjectWriter(uint8_t OSABI,
- bool HasRelocationAddend, bool Is64,
- bool IsLittleEndian)
+ bool HasRelocationAddend, bool Is64)
: MCELFObjectTargetWriter(Is64, OSABI, ELF::EM_MIPS, HasRelocationAddend) {}
unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx,
@@ -225,6 +224,8 @@ unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx,
switch (Kind) {
case Mips::fixup_Mips_NONE:
return ELF::R_MIPS_NONE;
+ case FK_Data_1:
+ report_fatal_error("MIPS does not support one byte relocations");
case Mips::fixup_Mips_16:
case FK_Data_2:
return IsPCRel ? ELF::R_MIPS_PC16 : ELF::R_MIPS_16;
@@ -329,6 +330,13 @@ unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx,
Type = setRType3((unsigned)ELF::R_MIPS_HI16, Type);
return Type;
}
+ case Mips::fixup_MICROMIPS_GPOFF_HI: {
+ unsigned Type = (unsigned)ELF::R_MIPS_NONE;
+ Type = setRType((unsigned)ELF::R_MICROMIPS_GPREL16, Type);
+ Type = setRType2((unsigned)ELF::R_MICROMIPS_SUB, Type);
+ Type = setRType3((unsigned)ELF::R_MICROMIPS_HI16, Type);
+ return Type;
+ }
case Mips::fixup_Mips_GPOFF_LO: {
unsigned Type = (unsigned)ELF::R_MIPS_NONE;
Type = setRType((unsigned)ELF::R_MIPS_GPREL16, Type);
@@ -336,6 +344,13 @@ unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx,
Type = setRType3((unsigned)ELF::R_MIPS_LO16, Type);
return Type;
}
+ case Mips::fixup_MICROMIPS_GPOFF_LO: {
+ unsigned Type = (unsigned)ELF::R_MIPS_NONE;
+ Type = setRType((unsigned)ELF::R_MICROMIPS_GPREL16, Type);
+ Type = setRType2((unsigned)ELF::R_MICROMIPS_SUB, Type);
+ Type = setRType3((unsigned)ELF::R_MICROMIPS_LO16, Type);
+ return Type;
+ }
case Mips::fixup_Mips_HIGHER:
return ELF::R_MIPS_HIGHER;
case Mips::fixup_Mips_HIGHEST:
@@ -382,6 +397,10 @@ unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx,
return ELF::R_MICROMIPS_TLS_TPREL_LO16;
case Mips::fixup_MICROMIPS_SUB:
return ELF::R_MICROMIPS_SUB;
+ case Mips::fixup_MICROMIPS_HIGHER:
+ return ELF::R_MICROMIPS_HIGHER;
+ case Mips::fixup_MICROMIPS_HIGHEST:
+ return ELF::R_MICROMIPS_HIGHEST;
}
llvm_unreachable("invalid fixup kind!");
@@ -434,15 +453,15 @@ void MipsELFObjectWriter::sortRelocs(const MCAssembler &Asm,
return;
// Sort relocations by the address they are applied to.
- std::sort(Relocs.begin(), Relocs.end(),
- [](const ELFRelocationEntry &A, const ELFRelocationEntry &B) {
- return A.Offset < B.Offset;
- });
+ llvm::sort(Relocs.begin(), Relocs.end(),
+ [](const ELFRelocationEntry &A, const ELFRelocationEntry &B) {
+ return A.Offset < B.Offset;
+ });
std::list<MipsRelocationEntry> Sorted;
std::list<ELFRelocationEntry> Remainder;
- DEBUG(dumpRelocs("R: ", Relocs));
+ LLVM_DEBUG(dumpRelocs("R: ", Relocs));
// Separate the movable relocations (AHL relocations using the high bits) from
// the immobile relocations (everything else). This does not preserve high/low
@@ -453,7 +472,7 @@ void MipsELFObjectWriter::sortRelocs(const MCAssembler &Asm,
});
for (auto &R : Remainder) {
- DEBUG(dbgs() << "Matching: " << R << "\n");
+ LLVM_DEBUG(dbgs() << "Matching: " << R << "\n");
unsigned MatchingType = getMatchingLoType(R);
assert(MatchingType != ELF::R_MIPS_NONE &&
@@ -488,7 +507,7 @@ void MipsELFObjectWriter::sortRelocs(const MCAssembler &Asm,
Sorted.insert(InsertionPoint, R)->Matched = true;
}
- DEBUG(dumpRelocs("S: ", Sorted));
+ LLVM_DEBUG(dumpRelocs("S: ", Sorted));
assert(Relocs.size() == Sorted.size() && "Some relocs were not consumed");
@@ -656,13 +675,11 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
}
}
-std::unique_ptr<MCObjectWriter>
-llvm::createMipsELFObjectWriter(raw_pwrite_stream &OS, const Triple &TT,
- bool IsN32) {
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createMipsELFObjectWriter(const Triple &TT, bool IsN32) {
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS());
bool IsN64 = TT.isArch64Bit() && !IsN32;
bool HasRelocationAddend = TT.isArch64Bit();
- auto MOTW = llvm::make_unique<MipsELFObjectWriter>(
- OSABI, HasRelocationAddend, IsN64, TT.isLittleEndian());
- return createELFObjectWriter(std::move(MOTW), OS, TT.isLittleEndian());
+ return llvm::make_unique<MipsELFObjectWriter>(OSABI, HasRelocationAddend,
+ IsN64);
}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
index 4b8f9c7a680c..7b9a02503ce2 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
@@ -16,6 +16,7 @@
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/Support/Casting.h"
@@ -23,9 +24,10 @@ using namespace llvm;
MipsELFStreamer::MipsELFStreamer(MCContext &Context,
std::unique_ptr<MCAsmBackend> MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter)
- : MCELFStreamer(Context, std::move(MAB), OS, std::move(Emitter)) {
+ : MCELFStreamer(Context, std::move(MAB), std::move(OW),
+ std::move(Emitter)) {
RegInfoRecord = new MipsRegInfoRecord(this, Context);
MipsOptionRecords.push_back(
std::unique_ptr<MipsRegInfoRecord>(RegInfoRecord));
@@ -84,6 +86,11 @@ void MipsELFStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
Labels.clear();
}
+void MipsELFStreamer::EmitIntValue(uint64_t Value, unsigned Size) {
+ MCELFStreamer::EmitIntValue(Value, Size);
+ Labels.clear();
+}
+
void MipsELFStreamer::EmitMipsOptionRecords() {
for (const auto &I : MipsOptionRecords)
I->EmitMipsOptionRecord();
@@ -91,7 +98,8 @@ void MipsELFStreamer::EmitMipsOptionRecords() {
MCELFStreamer *llvm::createMipsELFStreamer(
MCContext &Context, std::unique_ptr<MCAsmBackend> MAB,
- raw_pwrite_stream &OS, std::unique_ptr<MCCodeEmitter> Emitter,
+ std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter,
bool RelaxAll) {
- return new MipsELFStreamer(Context, std::move(MAB), OS, std::move(Emitter));
+ return new MipsELFStreamer(Context, std::move(MAB), std::move(OW),
+ std::move(Emitter));
}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
index 2fe9b08b645a..d141f5d77c61 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
@@ -34,7 +34,7 @@ class MipsELFStreamer : public MCELFStreamer {
public:
MipsELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter);
/// Overriding this function allows us to add arbitrary behaviour before the
@@ -54,9 +54,11 @@ public:
void SwitchSection(MCSection *Section,
const MCExpr *Subsection = nullptr) override;
- /// Overriding this function allows us to dismiss all labels that are
- /// candidates for marking as microMIPS when .word directive is emitted.
+ /// Overriding these functions allows us to dismiss all labels that are
+ /// candidates for marking as microMIPS when .word/.long/.4byte etc
+ /// directives are emitted.
void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override;
+ void EmitIntValue(uint64_t Value, unsigned Size) override;
/// Emits all the option records stored up until the point it's called.
void EmitMipsOptionRecords();
@@ -67,7 +69,7 @@ public:
MCELFStreamer *createMipsELFStreamer(MCContext &Context,
std::unique_ptr<MCAsmBackend> MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter,
bool RelaxAll);
} // end namespace llvm
diff --git a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
index 6148a1b622c8..fdb560f3c72f 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
@@ -96,10 +96,14 @@ namespace Mips {
fixup_Mips_Branch_PCRel,
// resulting in - R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16
+ // R_MICROMIPS_GPREL16/R_MICROMIPS_SUB/R_MICROMIPS_HI16
fixup_Mips_GPOFF_HI,
+ fixup_MICROMIPS_GPOFF_HI,
// resulting in - R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16
+ // R_MICROMIPS_GPREL16/R_MICROMIPS_SUB/R_MICROMIPS_LO16
fixup_Mips_GPOFF_LO,
+ fixup_MICROMIPS_GPOFF_LO,
// resulting in - R_MIPS_PAGE
fixup_Mips_GOT_PAGE,
@@ -110,11 +114,13 @@ namespace Mips {
// resulting in - R_MIPS_GOT_DISP
fixup_Mips_GOT_DISP,
- // resulting in - R_MIPS_GOT_HIGHER
+ // resulting in - R_MIPS_HIGHER/R_MICROMIPS_HIGHER
fixup_Mips_HIGHER,
+ fixup_MICROMIPS_HIGHER,
- // resulting in - R_MIPS_HIGHEST
+ // resulting in - R_MIPS_HIGHEST/R_MICROMIPS_HIGHEST
fixup_Mips_HIGHEST,
+ fixup_MICROMIPS_HIGHEST,
// resulting in - R_MIPS_GOT_HI16
fixup_Mips_GOT_HI16,
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
index e63304220ae5..f498d830c8f0 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -21,16 +21,14 @@ void MipsMCAsmInfo::anchor() { }
MipsMCAsmInfo::MipsMCAsmInfo(const Triple &TheTriple) {
IsLittleEndian = TheTriple.isLittleEndian();
- if ((TheTriple.getArch() == Triple::mips64el) ||
- (TheTriple.getArch() == Triple::mips64)) {
+ if (TheTriple.isMIPS64()) {
CodePointerSize = CalleeSaveStackSlotSize = 8;
}
// FIXME: This condition isn't quite right but it's the best we can do until
// this object can identify the ABI. It will misbehave when using O32
// on a mips64*-* triple.
- if ((TheTriple.getArch() == Triple::mipsel) ||
- (TheTriple.getArch() == Triple::mips)) {
+ if (TheTriple.isMIPS32()) {
PrivateGlobalPrefix = "$";
PrivateLabelPrefix = "$";
}
@@ -54,8 +52,7 @@ MipsMCAsmInfo::MipsMCAsmInfo(const Triple &TheTriple) {
HasMipsExpressions = true;
// Enable IAS by default for O32.
- if (TheTriple.getArch() == Triple::mips ||
- TheTriple.getArch() == Triple::mipsel)
+ if (TheTriple.isMIPS32())
UseIntegratedAssembler = true;
// Enable IAS by default for Debian mips64/mips64el.
@@ -65,4 +62,9 @@ MipsMCAsmInfo::MipsMCAsmInfo(const Triple &TheTriple) {
// Enable IAS by default for Android mips64el that uses N64 ABI.
if (TheTriple.getArch() == Triple::mips64el && TheTriple.isAndroid())
UseIntegratedAssembler = true;
+
+ // Enable IAS by default for FreeBSD / OpenBSD mips64/mips64el.
+ if (TheTriple.isOSFreeBSD() ||
+ TheTriple.isOSOpenBSD())
+ UseIntegratedAssembler = true;
}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index 2f6dd0035de3..cd34b0ab70b4 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -656,27 +656,29 @@ getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
break;
case MipsMCExpr::MEK_LO:
// Check for %lo(%neg(%gp_rel(X)))
- if (MipsExpr->isGpOff()) {
- FixupKind = Mips::fixup_Mips_GPOFF_LO;
- break;
- }
- FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_LO16
- : Mips::fixup_Mips_LO16;
+ if (MipsExpr->isGpOff())
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GPOFF_LO
+ : Mips::fixup_Mips_GPOFF_LO;
+ else
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_LO16
+ : Mips::fixup_Mips_LO16;
break;
case MipsMCExpr::MEK_HIGHEST:
- FixupKind = Mips::fixup_Mips_HIGHEST;
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_HIGHEST
+ : Mips::fixup_Mips_HIGHEST;
break;
case MipsMCExpr::MEK_HIGHER:
- FixupKind = Mips::fixup_Mips_HIGHER;
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_HIGHER
+ : Mips::fixup_Mips_HIGHER;
break;
case MipsMCExpr::MEK_HI:
// Check for %hi(%neg(%gp_rel(X)))
- if (MipsExpr->isGpOff()) {
- FixupKind = Mips::fixup_Mips_GPOFF_HI;
- break;
- }
- FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_HI16
- : Mips::fixup_Mips_HI16;
+ if (MipsExpr->isGpOff())
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GPOFF_HI
+ : Mips::fixup_Mips_GPOFF_HI;
+ else
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_HI16
+ : Mips::fixup_Mips_HI16;
break;
case MipsMCExpr::MEK_PCREL_HI16:
FixupKind = Mips::fixup_MIPS_PCHI16;
@@ -1058,13 +1060,6 @@ MipsMCCodeEmitter::getRegisterListOpValue16(const MCInst &MI, unsigned OpNo,
}
unsigned
-MipsMCCodeEmitter::getRegisterPairOpValue(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- return getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI);
-}
-
-unsigned
MipsMCCodeEmitter::getMovePRegPairOpValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
index 1e840114b2b3..09d50d4776ba 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
@@ -245,10 +245,6 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
- unsigned getRegisterPairOpValue(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
-
unsigned getMovePRegPairOpValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h b/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
index dfacf4354516..988629ed1bca 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
@@ -24,7 +24,7 @@ bool baseRegNeedsLoadStoreMask(unsigned Reg);
// This function creates an MCELFStreamer for Mips NaCl.
MCELFStreamer *createMipsNaClELFStreamer(MCContext &Context,
std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter,
bool RelaxAll);
}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
index 8fcd8aa4c19b..ce208b7f98bc 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -23,6 +23,7 @@
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
@@ -46,7 +47,7 @@ using namespace llvm;
/// FIXME: Merge with the copy in MipsSubtarget.cpp
StringRef MIPS_MC::selectMipsCPU(const Triple &TT, StringRef CPU) {
if (CPU.empty() || CPU == "generic") {
- if (TT.getArch() == Triple::mips || TT.getArch() == Triple::mipsel)
+ if (TT.isMIPS32())
CPU = "mips32";
else
CPU = "mips64";
@@ -93,15 +94,15 @@ static MCInstPrinter *createMipsMCInstPrinter(const Triple &T,
static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
std::unique_ptr<MCAsmBackend> &&MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&Emitter,
bool RelaxAll) {
MCStreamer *S;
if (!T.isOSNaCl())
- S = createMipsELFStreamer(Context, std::move(MAB), OS, std::move(Emitter),
- RelaxAll);
+ S = createMipsELFStreamer(Context, std::move(MAB), std::move(OW),
+ std::move(Emitter), RelaxAll);
else
- S = createMipsNaClELFStreamer(Context, std::move(MAB), OS,
+ S = createMipsNaClELFStreamer(Context, std::move(MAB), std::move(OW),
std::move(Emitter), RelaxAll);
return S;
}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
index abbf08ed212f..4fc174ab5871 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
@@ -23,7 +23,7 @@ class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
-class MCObjectWriter;
+class MCObjectTargetWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
class MCTargetOptions;
@@ -45,12 +45,12 @@ MCCodeEmitter *createMipsMCCodeEmitterEL(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
MCContext &Ctx);
-MCAsmBackend *createMipsAsmBackend(const Target &T, const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
+MCAsmBackend *createMipsAsmBackend(const Target &T, const MCSubtargetInfo &STI,
+ const MCRegisterInfo &MRI,
const MCTargetOptions &Options);
-std::unique_ptr<MCObjectWriter>
-createMipsELFObjectWriter(raw_pwrite_stream &OS, const Triple &TT, bool IsN32);
+std::unique_ptr<MCObjectTargetWriter>
+createMipsELFObjectWriter(const Triple &TT, bool IsN32);
namespace MIPS_MC {
StringRef selectMipsCPU(const Triple &TT, StringRef CPU);
diff --git a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
index d878cf82e26d..6bf62ea618b4 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
@@ -25,6 +25,7 @@
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/Support/ErrorHandling.h"
#include <cassert>
@@ -43,9 +44,10 @@ const unsigned LoadStoreStackMaskReg = Mips::T7;
class MipsNaClELFStreamer : public MipsELFStreamer {
public:
MipsNaClELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter)
- : MipsELFStreamer(Context, std::move(TAB), OS, std::move(Emitter)) {}
+ : MipsELFStreamer(Context, std::move(TAB), std::move(OW),
+ std::move(Emitter)) {}
~MipsNaClELFStreamer() override = default;
@@ -260,11 +262,11 @@ bool baseRegNeedsLoadStoreMask(unsigned Reg) {
MCELFStreamer *createMipsNaClELFStreamer(MCContext &Context,
std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter,
bool RelaxAll) {
- MipsNaClELFStreamer *S =
- new MipsNaClELFStreamer(Context, std::move(TAB), OS, std::move(Emitter));
+ MipsNaClELFStreamer *S = new MipsNaClELFStreamer(
+ Context, std::move(TAB), std::move(OW), std::move(Emitter));
if (RelaxAll)
S->getAssembler().setRelaxAll(true);
diff --git a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index fb4e1ba0ded9..1eb21b6cc826 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -52,6 +52,12 @@ void MipsTargetStreamer::emitDirectiveSetMsa() { forbidModuleDirective(); }
void MipsTargetStreamer::emitDirectiveSetNoMsa() { forbidModuleDirective(); }
void MipsTargetStreamer::emitDirectiveSetMt() {}
void MipsTargetStreamer::emitDirectiveSetNoMt() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetCRC() {}
+void MipsTargetStreamer::emitDirectiveSetNoCRC() {}
+void MipsTargetStreamer::emitDirectiveSetVirt() {}
+void MipsTargetStreamer::emitDirectiveSetNoVirt() {}
+void MipsTargetStreamer::emitDirectiveSetGINV() {}
+void MipsTargetStreamer::emitDirectiveSetNoGINV() {}
void MipsTargetStreamer::emitDirectiveSetAt() { forbidModuleDirective(); }
void MipsTargetStreamer::emitDirectiveSetAtWithArg(unsigned RegNo) {
forbidModuleDirective();
@@ -122,6 +128,12 @@ void MipsTargetStreamer::emitDirectiveModuleOddSPReg() {
void MipsTargetStreamer::emitDirectiveModuleSoftFloat() {}
void MipsTargetStreamer::emitDirectiveModuleHardFloat() {}
void MipsTargetStreamer::emitDirectiveModuleMT() {}
+void MipsTargetStreamer::emitDirectiveModuleCRC() {}
+void MipsTargetStreamer::emitDirectiveModuleNoCRC() {}
+void MipsTargetStreamer::emitDirectiveModuleVirt() {}
+void MipsTargetStreamer::emitDirectiveModuleNoVirt() {}
+void MipsTargetStreamer::emitDirectiveModuleGINV() {}
+void MipsTargetStreamer::emitDirectiveModuleNoGINV() {}
void MipsTargetStreamer::emitDirectiveSetFp(
MipsABIFlagsSection::FpABIKind Value) {
forbidModuleDirective();
@@ -421,6 +433,36 @@ void MipsTargetAsmStreamer::emitDirectiveSetNoMt() {
MipsTargetStreamer::emitDirectiveSetNoMt();
}
+void MipsTargetAsmStreamer::emitDirectiveSetCRC() {
+ OS << "\t.set\tcrc\n";
+ MipsTargetStreamer::emitDirectiveSetCRC();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetNoCRC() {
+ OS << "\t.set\tnocrc\n";
+ MipsTargetStreamer::emitDirectiveSetNoCRC();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetVirt() {
+ OS << "\t.set\tvirt\n";
+ MipsTargetStreamer::emitDirectiveSetVirt();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetNoVirt() {
+ OS << "\t.set\tnovirt\n";
+ MipsTargetStreamer::emitDirectiveSetNoVirt();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetGINV() {
+ OS << "\t.set\tginv\n";
+ MipsTargetStreamer::emitDirectiveSetGINV();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetNoGINV() {
+ OS << "\t.set\tnoginv\n";
+ MipsTargetStreamer::emitDirectiveSetNoGINV();
+}
+
void MipsTargetAsmStreamer::emitDirectiveSetAt() {
OS << "\t.set\tat\n";
MipsTargetStreamer::emitDirectiveSetAt();
@@ -694,6 +736,30 @@ void MipsTargetAsmStreamer::emitDirectiveModuleMT() {
OS << "\t.module\tmt\n";
}
+void MipsTargetAsmStreamer::emitDirectiveModuleCRC() {
+ OS << "\t.module\tcrc\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveModuleNoCRC() {
+ OS << "\t.module\tnocrc\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveModuleVirt() {
+ OS << "\t.module\tvirt\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveModuleNoVirt() {
+ OS << "\t.module\tnovirt\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveModuleGINV() {
+ OS << "\t.module\tginv\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveModuleNoGINV() {
+ OS << "\t.module\tnoginv\n";
+}
+
// This part is for ELF object output.
MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
const MCSubtargetInfo &STI)
diff --git a/lib/Target/Mips/MicroMips32r6InstrFormats.td b/lib/Target/Mips/MicroMips32r6InstrFormats.td
index 25048293714d..ed5b8dd71a51 100644
--- a/lib/Target/Mips/MicroMips32r6InstrFormats.td
+++ b/lib/Target/Mips/MicroMips32r6InstrFormats.td
@@ -17,12 +17,6 @@ class MMR6Arch<string opstr> {
string DecoderNamespace = "MicroMipsR6";
}
-// Class used for microMIPS32r6 instructions.
-class MicroMipsR6Inst16 : PredicateControl {
- string DecoderNamespace = "MicroMipsR6";
- let InsnPredicates = [HasMicroMips32r6];
-}
-
//===----------------------------------------------------------------------===//
//
// Disambiguators
@@ -50,7 +44,7 @@ class BC16_FM_MM16R6 {
let Inst{9-0} = offset;
}
-class BEQZC_BNEZC_FM_MM16R6<bits<6> op> : MicroMipsR6Inst16 {
+class BEQZC_BNEZC_FM_MM16R6<bits<6> op> {
bits<3> rs;
bits<7> offset;
@@ -174,22 +168,6 @@ class ADDI_FM_MMR6<string instr_asm, bits<6> op> : MMR6Arch<instr_asm> {
let Inst{15-0} = imm16;
}
-class POOL32C_ST_EVA_FM_MMR6<bits<6> op, bits<3> funct> : MipsR6Inst {
- bits<21> addr;
- bits<5> hint;
- bits<5> base = addr{20-16};
- bits<9> offset = addr{8-0};
-
- bits<32> Inst;
-
- let Inst{31-26} = op;
- let Inst{25-21} = hint;
- let Inst{20-16} = base;
- let Inst{15-12} = 0b1010;
- let Inst{11-9} = funct;
- let Inst{8-0} = offset;
-}
-
class LB32_FM_MMR6 : MipsR6Inst {
bits<21> addr;
bits<5> rt;
@@ -218,34 +196,6 @@ class LBU32_FM_MMR6 : MipsR6Inst {
let Inst{15-0} = offset;
}
-class POOL32C_LB_LBU_FM_MMR6<bits<3> funct> : MipsR6Inst {
- bits<21> addr;
- bits<5> rt;
-
- bits<32> Inst;
-
- let Inst{31-26} = 0b011000;
- let Inst{25-21} = rt;
- let Inst{20-16} = addr{20-16};
- let Inst{15-12} = 0b0110;
- let Inst{11-9} = funct;
- let Inst{8-0} = addr{8-0};
-}
-
-class SIGN_EXTEND_FM_MMR6<string instr_asm, bits<10> funct>
- : MMR6Arch<instr_asm> {
- bits<5> rd;
- bits<5> rt;
-
- bits<32> Inst;
-
- let Inst{31-26} = 0b000000;
- let Inst{25-21} = rd;
- let Inst{20-16} = rt;
- let Inst{15-6} = funct;
- let Inst{5-0} = 0b111100;
-}
-
class PCREL19_FM_MMR6<bits<2> funct> : MipsR6Inst {
bits<5> rt;
bits<19> imm;
@@ -436,38 +386,6 @@ class SB32_SH32_STORE_FM_MMR6<bits<6> op> {
let Inst{15-0} = offset;
}
-class POOL32C_STORE_EVA_FM_MMR6<bits<3> funct> {
- bits<5> rt;
- bits<21> addr;
- bits<5> base = addr{20-16};
- bits<9> offset = addr{8-0};
-
- bits<32> Inst;
-
- let Inst{31-26} = 0b011000;
- let Inst{25-21} = rt;
- let Inst{20-16} = base;
- let Inst{15-12} = 0b1010;
- let Inst{11-9} = funct;
- let Inst{8-0} = offset;
-}
-
-class LOAD_WORD_EVA_FM_MMR6<bits<3> funct> {
- bits<5> rt;
- bits<21> addr;
- bits<5> base = addr{20-16};
- bits<9> offset = addr{8-0};
-
- bits<32> Inst;
-
- let Inst{31-26} = 0b011000;
- let Inst{25-21} = rt;
- let Inst{20-16} = base;
- let Inst{15-12} = 0b0110;
- let Inst{11-9} = funct;
- let Inst{8-0} = offset;
-}
-
class LOAD_WORD_FM_MMR6 {
bits<5> rt;
bits<21> addr;
@@ -631,23 +549,6 @@ class SW32_FM_MMR6<string instr_asm, bits<6> op> : MMR6Arch<instr_asm> {
let Inst{15-0} = addr{15-0};
}
-class POOL32C_SWE_FM_MMR6<string instr_asm, bits<6> op, bits<4> fmt,
- bits<3> funct> : MMR6Arch<instr_asm> {
- bits<5> rt;
- bits<21> addr;
- bits<5> base = addr{20-16};
- bits<9> offset = addr{8-0};
-
- bits<32> Inst;
-
- let Inst{31-26} = op;
- let Inst{25-21} = rt;
- let Inst{20-16} = base;
- let Inst{15-12} = fmt;
- let Inst{11-9} = funct;
- let Inst{8-0} = offset;
-}
-
class POOL32F_ARITH_FM_MMR6<string instr_asm, bits<2> fmt, bits<8> funct>
: MMR6Arch<instr_asm>, MipsR6Inst {
bits<5> ft;
@@ -791,7 +692,7 @@ class POOL32F_MATH_FM_MMR6<string instr_asm, bits<1> fmt, bits<8> funct>
let Inst{5-0} = 0b111011;
}
-class POOL16A_ADDU16_FM_MMR6 : MicroMipsR6Inst16 {
+class POOL16A_ADDU16_FM_MMR6 {
bits<3> rs;
bits<3> rt;
bits<3> rd;
@@ -805,7 +706,7 @@ class POOL16A_ADDU16_FM_MMR6 : MicroMipsR6Inst16 {
let Inst{0} = 0;
}
-class POOL16C_AND16_FM_MMR6 : MicroMipsR6Inst16 {
+class POOL16C_AND16_FM_MMR6 {
bits<3> rt;
bits<3> rs;
@@ -817,7 +718,7 @@ class POOL16C_AND16_FM_MMR6 : MicroMipsR6Inst16 {
let Inst{3-0} = 0b0001;
}
-class POOL16C_NOT16_FM_MMR6 : MicroMipsR6Inst16 {
+class POOL16C_NOT16_FM_MMR6 {
bits<3> rt;
bits<3> rs;
@@ -829,7 +730,7 @@ class POOL16C_NOT16_FM_MMR6 : MicroMipsR6Inst16 {
let Inst{3-0} = 0b0000;
}
-class POOL16C_MOVEP16_FM_MMR6 : MicroMipsR6Inst16 {
+class POOL16C_MOVEP16_FM_MMR6 {
bits<3> dst_regs;
bits<3> rt;
bits<3> rs;
@@ -844,7 +745,7 @@ class POOL16C_MOVEP16_FM_MMR6 : MicroMipsR6Inst16 {
let Inst{1-0} = rs{1-0};
}
-class POOL16C_OR16_XOR16_FM_MMR6<bits<4> op> : MicroMipsR6Inst16 {
+class POOL16C_OR16_XOR16_FM_MMR6<bits<4> op> {
bits<3> rt;
bits<3> rs;
@@ -879,7 +780,8 @@ class POOL16A_SUBU16_FM_MMR6 {
let Inst{0} = 0b1;
}
-class POOL32A_WRPGPR_WSBH_FM_MMR6<bits<10> funct> : MipsR6Inst {
+class POOL32A_WRPGPR_WSBH_FM_MMR6<string instr_asm, bits<10> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
bits<5> rt;
bits<5> rs;
@@ -981,6 +883,23 @@ class POOL32A_MFTC0_FM_MMR6<string instr_asm, bits<5> funct, bits<6> opcode>
let Inst{5-0} = opcode;
}
+class POOL32A_GINV_FM_MMR6<string instr_asm, bits<2> ginv>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> rs;
+ bits<2> type;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x0;
+ let Inst{25-21} = 0x0;
+ let Inst{20-16} = rs;
+ let Inst{15-13} = 0b011;
+ let Inst{12-11} = ginv;
+ let Inst{10-9} = type;
+ let Inst{8-6} = 0b101;
+ let Inst{5-0} = 0b111100;
+}
+
class POOL32F_MFTC1_FM_MMR6<string instr_asm, bits<8> funct>
: MMR6Arch<instr_asm> {
bits<5> rt;
@@ -1037,21 +956,6 @@ class POOL32A_DVPEVP_FM_MMR6<string instr_asm, bits<10> funct>
let Inst{5-0} = 0b111100;
}
-class POOL32B_LWP_SWP_FM_MMR6<bits<4> funct> : MipsR6Inst {
- bits<5> rd;
- bits<21> addr;
- bits<5> base = addr{20-16};
- bits<12> offset = addr{11-0};
-
- bits<32> Inst;
-
- let Inst{31-26} = 0x8;
- let Inst{25-21} = rd;
- let Inst{20-16} = base;
- let Inst{15-12} = funct;
- let Inst{11-0} = offset;
-}
-
class CMP_BRANCH_OFF21_FM_MMR6<string opstr, bits<6> funct> : MipsR6Inst {
bits<5> rs;
bits<21> offset;
@@ -1107,3 +1011,21 @@ class POOL32B_LDWC2_SDWC2_FM_MMR6<string instr_asm, bits<4> funct>
let Inst{11} = 0;
let Inst{10-0} = offset;
}
+
+class POOL32C_LL_E_SC_E_FM_MMR6<string instr_asm, bits<4> majorFunc,
+ bits<3> minorFunc>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> rt;
+ bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<9> offset = addr{8-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b011000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = base;
+ let Inst{15-12} = majorFunc;
+ let Inst{11-9} = minorFunc;
+ let Inst{8-0} = offset;
+}
diff --git a/lib/Target/Mips/MicroMips32r6InstrInfo.td b/lib/Target/Mips/MicroMips32r6InstrInfo.td
index 3ff3f07654d9..f795112ae2b7 100644
--- a/lib/Target/Mips/MicroMips32r6InstrInfo.td
+++ b/lib/Target/Mips/MicroMips32r6InstrInfo.td
@@ -106,20 +106,20 @@ class DI_MMR6_ENC : POOL32A_EIDI_MMR6_ENC<"di", 0b0100011101>;
class ERET_MMR6_ENC : POOL32A_ERET_FM_MMR6<"eret", 0x3cd>;
class DERET_MMR6_ENC : POOL32A_ERET_FM_MMR6<"eret", 0b1110001101>;
class ERETNC_MMR6_ENC : ERETNC_FM_MMR6<"eretnc">;
+class GINVI_MMR6_ENC : POOL32A_GINV_FM_MMR6<"ginvi", 0b00>;
+class GINVT_MMR6_ENC : POOL32A_GINV_FM_MMR6<"ginvt", 0b10>;
class JALRC16_MMR6_ENC : POOL16C_JALRC_FM_MM16R6<0xb>;
class JIALC_MMR6_ENC : JMP_IDX_COMPACT_FM<0b100000>;
class JIC_MMR6_ENC : JMP_IDX_COMPACT_FM<0b101000>;
class JRC16_MMR6_ENC: POOL16C_JALRC_FM_MM16R6<0x3>;
class JRCADDIUSP_MMR6_ENC : POOL16C_JRCADDIUSP_FM_MM16R6<0x13>;
class LSA_MMR6_ENC : POOL32A_LSA_FM<0b001111>;
-class LWP_MMR6_ENC : POOL32B_LWP_SWP_FM_MMR6<0x1>;
class LWPC_MMR6_ENC : PCREL19_FM_MMR6<0b01>;
class LWM16_MMR6_ENC : POOL16C_LWM_SWM_FM_MM16R6<0x2>;
class MFC0_MMR6_ENC : POOL32A_MFTC0_FM_MMR6<"mfc0", 0b00011, 0b111100>;
class MFC1_MMR6_ENC : POOL32F_MFTC1_FM_MMR6<"mfc1", 0b10000000>;
class MFC2_MMR6_ENC : POOL32A_MFTC2_FM_MMR6<"mfc2", 0b0100110100>;
class MFHC0_MMR6_ENC : POOL32A_MFTC0_FM_MMR6<"mfhc0", 0b00011, 0b110100>;
-class MFHC1_MMR6_ENC : POOL32F_MFTC1_FM_MMR6<"mfhc1", 0b11000000>;
class MFHC2_MMR6_ENC : POOL32A_MFTC2_FM_MMR6<"mfhc2", 0b1000110100>;
class MOD_MMR6_ENC : ARITH_FM_MMR6<"mod", 0x158>;
class MODU_MMR6_ENC : ARITH_FM_MMR6<"modu", 0x1d8>;
@@ -131,15 +131,12 @@ class MTC0_MMR6_ENC : POOL32A_MFTC0_FM_MMR6<"mtc0", 0b01011, 0b111100>;
class MTC1_MMR6_ENC : POOL32F_MFTC1_FM_MMR6<"mtc1", 0b10100000>;
class MTC2_MMR6_ENC : POOL32A_MFTC2_FM_MMR6<"mtc2", 0b0101110100>;
class MTHC0_MMR6_ENC : POOL32A_MFTC0_FM_MMR6<"mthc0", 0b01011, 0b110100>;
-class MTHC1_MMR6_ENC : POOL32F_MFTC1_FM_MMR6<"mthc1", 0b11100000>;
class MTHC2_MMR6_ENC : POOL32A_MFTC2_FM_MMR6<"mthc2", 0b1001110100>;
class NOR_MMR6_ENC : ARITH_FM_MMR6<"nor", 0x2d0>;
class OR_MMR6_ENC : ARITH_FM_MMR6<"or", 0x290>;
class ORI_MMR6_ENC : ADDI_FM_MMR6<"ori", 0x14>;
class PREF_MMR6_ENC : CACHE_PREF_FM_MMR6<0b011000, 0b0010>;
class SB16_MMR6_ENC : LOAD_STORE_FM_MM16<0x22>;
-class SEB_MMR6_ENC : SIGN_EXTEND_FM_MMR6<"seb", 0b0010101100>;
-class SEH_MMR6_ENC : SIGN_EXTEND_FM_MMR6<"seh", 0b0011101100>;
class SELEQZ_MMR6_ENC : POOL32A_FM_MMR6<0b0101000000>;
class SELNEZ_MMR6_ENC : POOL32A_FM_MMR6<0b0110000000>;
class SH16_MMR6_ENC : LOAD_STORE_FM_MM16<0x2a>;
@@ -147,19 +144,13 @@ class SLL_MMR6_ENC : SHIFT_MMR6_ENC<"sll", 0x00, 0b0>;
class SUB_MMR6_ENC : ARITH_FM_MMR6<"sub", 0x190>;
class SUBU_MMR6_ENC : ARITH_FM_MMR6<"subu", 0x1d0>;
class SW_MMR6_ENC : SW32_FM_MMR6<"sw", 0x3e>;
-class SWE_MMR6_ENC : POOL32C_SWE_FM_MMR6<"swe", 0x18, 0xa, 0x7>;
class SW16_MMR6_ENC : LOAD_STORE_FM_MM16<0x3a>;
class SWM16_MMR6_ENC : POOL16C_LWM_SWM_FM_MM16R6<0xa>;
class SWSP_MMR6_ENC : LOAD_STORE_SP_FM_MM16<0x32>;
-class SWP_MMR6_ENC : POOL32B_LWP_SWP_FM_MMR6<0x9>;
-class PREFE_MMR6_ENC : POOL32C_ST_EVA_FM_MMR6<0b011000, 0b010>;
-class CACHEE_MMR6_ENC : POOL32C_ST_EVA_FM_MMR6<0b011000, 0b011>;
-class WRPGPR_MMR6_ENC : POOL32A_WRPGPR_WSBH_FM_MMR6<0x3c5>;
-class WSBH_MMR6_ENC : POOL32A_WRPGPR_WSBH_FM_MMR6<0x1ec>;
+class WRPGPR_MMR6_ENC : POOL32A_WRPGPR_WSBH_FM_MMR6<"wrpgpr", 0x3c5>;
+class WSBH_MMR6_ENC : POOL32A_WRPGPR_WSBH_FM_MMR6<"wsbh", 0x1ec>;
class LB_MMR6_ENC : LB32_FM_MMR6;
class LBU_MMR6_ENC : LBU32_FM_MMR6;
-class LBE_MMR6_ENC : POOL32C_LB_LBU_FM_MMR6<0b100>;
-class LBUE_MMR6_ENC : POOL32C_LB_LBU_FM_MMR6<0b000>;
class PAUSE_MMR6_ENC : POOL32A_PAUSE_FM_MMR6<"pause", 0b00101>;
class RDHWR_MMR6_ENC : POOL32A_RDHWR_FM_MMR6;
class WAIT_MMR6_ENC : WAIT_FM_MM, MMR6Arch<"wait">;
@@ -184,15 +175,8 @@ class TRUNC_L_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.l.s", 0, 0b10001100>;
class TRUNC_L_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.l.d", 1, 0b10001100>;
class TRUNC_W_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.w.s", 0, 0b10101100>;
class TRUNC_W_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.w.d", 1, 0b10101100>;
-class SQRT_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"sqrt.s", 0, 0b00101000>;
-class SQRT_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"sqrt.d", 1, 0b00101000>;
class SB_MMR6_ENC : SB32_SH32_STORE_FM_MMR6<0b000110>;
-class SBE_MMR6_ENC : POOL32C_STORE_EVA_FM_MMR6<0b100>;
-class SCE_MMR6_ENC : POOL32C_STORE_EVA_FM_MMR6<0b110>;
class SH_MMR6_ENC : SB32_SH32_STORE_FM_MMR6<0b001110>;
-class SHE_MMR6_ENC : POOL32C_STORE_EVA_FM_MMR6<0b101>;
-class LLE_MMR6_ENC : LOAD_WORD_EVA_FM_MMR6<0b110>;
-class LWE_MMR6_ENC : LOAD_WORD_EVA_FM_MMR6<0b111>;
class LW_MMR6_ENC : LOAD_WORD_FM_MMR6;
class LUI_MMR6_ENC : LOAD_UPPER_IMM_FM_MMR6;
class JALRC_HB_MMR6_ENC : POOL32A_JALRC_FM_MMR6<"jalrc.hb", 0b0001111100>;
@@ -221,11 +205,11 @@ class BOVC_MMR6_ENC : POP35_BOVC_FM_MMR6<"bovc">;
class BNVC_MMR6_ENC : POP37_BNVC_FM_MMR6<"bnvc">;
class ADDU16_MMR6_ENC : POOL16A_ADDU16_FM_MMR6;
class AND16_MMR6_ENC : POOL16C_AND16_FM_MMR6;
-class ANDI16_MMR6_ENC : ANDI_FM_MM16<0b001011>, MicroMipsR6Inst16;
+class ANDI16_MMR6_ENC : ANDI_FM_MM16<0b001011>;
class NOT16_MMR6_ENC : POOL16C_NOT16_FM_MMR6;
class OR16_MMR6_ENC : POOL16C_OR16_XOR16_FM_MMR6<0b1001>;
-class SLL16_MMR6_ENC : SHIFT_FM_MM16<0>, MicroMipsR6Inst16;
-class SRL16_MMR6_ENC : SHIFT_FM_MM16<1>, MicroMipsR6Inst16;
+class SLL16_MMR6_ENC : SHIFT_FM_MM16<0>;
+class SRL16_MMR6_ENC : SHIFT_FM_MM16<1>;
class BREAK16_MMR6_ENC : POOL16C_BREAKPOINT_FM_MMR6<0b011011>;
class LI16_MMR6_ENC : LI_FM_MM16;
class MOVE16_MMR6_ENC : MOVE_FM_MM16<0b000011>;
@@ -248,23 +232,20 @@ class SDC2_MMR6_ENC : POOL32B_LDWC2_SDWC2_FM_MMR6<"sdc2", 0b1010>;
class LWC2_MMR6_ENC : POOL32B_LDWC2_SDWC2_FM_MMR6<"lwc2", 0b0000>;
class SWC2_MMR6_ENC : POOL32B_LDWC2_SDWC2_FM_MMR6<"swc2", 0b1000>;
+class LL_MMR6_ENC : POOL32C_LL_E_SC_E_FM_MMR6<"ll", 0b0011, 0b000>;
+class SC_MMR6_ENC : POOL32C_LL_E_SC_E_FM_MMR6<"sc", 0b1011, 0b000>;
+
/// Floating Point Instructions
class FADD_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"add.s", 0, 0b00110000>;
-class FADD_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"add.d", 1, 0b00110000>;
class FSUB_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"sub.s", 0, 0b01110000>;
-class FSUB_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"sub.d", 1, 0b01110000>;
class FMUL_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"mul.s", 0, 0b10110000>;
-class FMUL_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"mul.d", 1, 0b10110000>;
class FDIV_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"div.s", 0, 0b11110000>;
-class FDIV_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"div.d", 1, 0b11110000>;
class MADDF_S_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"maddf.s", 0, 0b110111000>;
class MADDF_D_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"maddf.d", 1, 0b110111000>;
class MSUBF_S_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"msubf.s", 0, 0b111111000>;
class MSUBF_D_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"msubf.d", 1, 0b111111000>;
class FMOV_S_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"mov.s", 0, 0b0000001>;
-class FMOV_D_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"mov.d", 1, 0b0000001>;
class FNEG_S_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"neg.s", 0, 0b0101101>;
-class FNEG_D_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"neg.d", 1, 0b0101101>;
class MAX_S_MMR6_ENC : POOL32F_MINMAX_FM<"max.s", 0, 0b000001011>;
class MAX_D_MMR6_ENC : POOL32F_MINMAX_FM<"max.d", 1, 0b000001011>;
class MAXA_S_MMR6_ENC : POOL32F_MINMAX_FM<"maxa.s", 0, 0b000101011>;
@@ -277,11 +258,7 @@ class MINA_D_MMR6_ENC : POOL32F_MINMAX_FM<"mina.d", 1, 0b000100011>;
class CVT_L_S_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.l.s", 0, 0b00000100>;
class CVT_L_D_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.l.d", 1, 0b00000100>;
class CVT_W_S_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.w.s", 0, 0b00100100>;
-class CVT_W_D_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.w.d", 1, 0b00100100>;
-class CVT_D_S_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.d.s", 0, 0b1001101>;
-class CVT_D_W_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.d.w", 1, 0b1001101>;
class CVT_D_L_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.d.l", 2, 0b1001101>;
-class CVT_S_D_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.s.d", 0, 0b1101101>;
class CVT_S_W_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.s.w", 1, 0b1101101>;
class CVT_S_L_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.s.l", 2, 0b1101101>;
@@ -390,7 +367,7 @@ class BC_MMR6_DESC : BC_MMR6_DESC_BASE<"bc", brtarget26_mm, II_BC> {
class BC16_MMR6_DESC : MicroMipsInst16<(outs), (ins brtarget10_mm:$offset),
!strconcat("bc16", "\t$offset"), [],
II_BC, FrmI>,
- MMR6Arch<"bc16">, MicroMipsR6Inst16 {
+ MMR6Arch<"bc16"> {
let isBranch = 1;
let isTerminator = 1;
let isBarrier = 1;
@@ -400,7 +377,8 @@ class BC16_MMR6_DESC : MicroMipsInst16<(outs), (ins brtarget10_mm:$offset),
}
class BEQZC_BNEZC_MM16R6_DESC_BASE<string instr_asm>
- : CBranchZeroMM<instr_asm, brtarget7_mm, GPRMM16Opnd>, MMR6Arch<instr_asm> {
+ : CBranchZeroMM<instr_asm, brtarget7_mm, GPRMM16Opnd>,
+ MMR6Arch<instr_asm> {
let isBranch = 1;
let isTerminator = 1;
let hasDelaySlot = 0;
@@ -441,17 +419,6 @@ class CACHE_MMR6_DESC : CACHE_HINT_MMR6_DESC<"cache", mem_mm_12, GPR32Opnd,
class PREF_MMR6_DESC : CACHE_HINT_MMR6_DESC<"pref", mem_mm_12, GPR32Opnd,
II_PREF>;
-class PREFE_CACHEE_MMR6_DESC_BASE<string instr_asm, Operand MemOpnd,
- RegisterOperand GPROpnd, InstrItinClass Itin>
- : CACHE_HINT_MMR6_DESC<instr_asm, MemOpnd, GPROpnd, Itin> {
- string DecoderMethod = "DecodePrefeOpMM";
-}
-
-class PREFE_MMR6_DESC : PREFE_CACHEE_MMR6_DESC_BASE<"prefe", mem_mm_9,
- GPR32Opnd, II_PREFE>;
-class CACHEE_MMR6_DESC : PREFE_CACHEE_MMR6_DESC_BASE<"cachee", mem_mm_9,
- GPR32Opnd, II_CACHEE>;
-
class LB_LBU_MMR6_DESC_BASE<string instr_asm, Operand MemOpnd,
RegisterOperand GPROpnd, InstrItinClass Itin>
: MMR6Arch<instr_asm> {
@@ -466,16 +433,6 @@ class LB_MMR6_DESC : LB_LBU_MMR6_DESC_BASE<"lb", mem_mm_16, GPR32Opnd, II_LB>;
class LBU_MMR6_DESC : LB_LBU_MMR6_DESC_BASE<"lbu", mem_mm_16, GPR32Opnd,
II_LBU>;
-class LBE_LBUE_MMR6_DESC_BASE<string instr_asm, Operand MemOpnd,
- RegisterOperand GPROpnd, InstrItinClass Itin>
- : LB_LBU_MMR6_DESC_BASE<instr_asm, MemOpnd, GPROpnd, Itin> {
- let DecoderMethod = "DecodeLoadByte9";
-}
-class LBE_MMR6_DESC : LBE_LBUE_MMR6_DESC_BASE<"lbe", mem_mm_9, GPR32Opnd,
- II_LBE>;
-class LBUE_MMR6_DESC : LBE_LBUE_MMR6_DESC_BASE<"lbue", mem_mm_9, GPR32Opnd,
- II_LBUE>;
-
class CLO_CLZ_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
InstrItinClass Itin> : MMR6Arch<instr_asm> {
dag OutOperandList = (outs GPROpnd:$rt);
@@ -498,7 +455,7 @@ class ERETNC_MMR6_DESC : ER_FT<"eretnc", II_ERETNC>;
class JALRC16_MMR6_DESC_BASE<string opstr, RegisterOperand RO>
: MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"),
[(MipsJmpLink RO:$rs)], II_JALR, FrmR>,
- MMR6Arch<opstr>, MicroMipsR6Inst16 {
+ MMR6Arch<opstr> {
let isCall = 1;
let hasDelaySlot = 0;
let Defs = [RA];
@@ -532,7 +489,7 @@ class JIC_MMR6_DESC : JMP_MMR6_IDX_COMPACT_DESC_BASE<"jic", jmpoffset16,
class JRC16_MMR6_DESC_BASE<string opstr, RegisterOperand RO>
: MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"),
[], II_JR, FrmR>,
- MMR6Arch<opstr>, MicroMipsR6Inst16 {
+ MMR6Arch<opstr> {
let hasDelaySlot = 0;
let isBranch = 1;
let isIndirectBranch = 1;
@@ -542,7 +499,7 @@ class JRC16_MMR6_DESC : JRC16_MMR6_DESC_BASE<"jrc16", GPR32Opnd>;
class JRCADDIUSP_MMR6_DESC
: MicroMipsInst16<(outs), (ins uimm5_lsl2:$imm), "jrcaddiusp\t$imm",
[], II_JRADDIUSP, FrmR>,
- MMR6Arch<"jrcaddiusp">, MicroMipsR6Inst16 {
+ MMR6Arch<"jrcaddiusp"> {
let hasDelaySlot = 0;
let isTerminator = 1;
let isBarrier = 1;
@@ -574,8 +531,6 @@ class AUI_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
class AUI_MMR6_DESC : AUI_MMR6_DESC_BASE<"aui", GPR32Opnd, II_AUI>;
-class SEB_MMR6_DESC : SignExtInReg<"seb", i8, GPR32Opnd, II_SEB>;
-class SEH_MMR6_DESC : SignExtInReg<"seh", i16, GPR32Opnd, II_SEH>;
class ALUIPC_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
InstrItinClass Itin> : MMR6Arch<instr_asm> {
dag OutOperandList = (outs GPROpnd:$rt);
@@ -615,32 +570,6 @@ class ADDIUPC_MMR6_DESC : PCREL_MMR6_DESC_BASE<"addiupc", GPR32Opnd,
class LWPC_MMR6_DESC: PCREL_MMR6_DESC_BASE<"lwpc", GPR32Opnd, simm19_lsl2,
II_LWPC>;
-class LWP_MMR6_DESC : MMR6Arch<"lwp"> {
- dag OutOperandList = (outs regpair:$rd);
- dag InOperandList = (ins mem_simm12:$addr);
- string AsmString = !strconcat("lwp", "\t$rd, $addr");
- list<dag> Pattern = [];
- InstrItinClass Itinerary = II_LWP;
- ComplexPattern Addr = addr;
- Format f = FrmI;
- string BaseOpcode = "lwp";
- string DecoderMethod = "DecodeMemMMImm12";
- bit mayLoad = 1;
-}
-
-class SWP_MMR6_DESC : MMR6Arch<"swp"> {
- dag OutOperandList = (outs);
- dag InOperandList = (ins regpair:$rd, mem_simm12:$addr);
- string AsmString = !strconcat("swp", "\t$rd, $addr");
- list<dag> Pattern = [];
- InstrItinClass Itinerary = II_SWP;
- ComplexPattern Addr = addr;
- Format f = FrmI;
- string BaseOpcode = "swp";
- string DecoderMethod = "DecodeMemMMImm12";
- bit mayStore = 1;
-}
-
class SELEQNE_Z_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
InstrItinClass Itin> : MMR6Arch<instr_asm> {
dag OutOperandList = (outs GPROpnd:$rd);
@@ -704,23 +633,11 @@ class ORI_MMR6_DESC : ArithLogicI<"ori", uimm16, GPR32Opnd, II_ORI, immZExt16,
class XOR_MMR6_DESC : ArithLogicR<"xor", GPR32Opnd, 1, II_XOR, xor>;
class XORI_MMR6_DESC : ArithLogicI<"xori", uimm16, GPR32Opnd, II_XORI,
immZExt16, xor>;
-
-class SWE_MMR6_DESC_BASE<string opstr, DAGOperand RO, DAGOperand MO,
- InstrItinClass Itin = NoItinerary,
- SDPatternOperator OpNode = null_frag,
- ComplexPattern Addr = addr> :
- InstSE<(outs), (ins RO:$rt, MO:$addr), !strconcat(opstr, "\t$rt, $addr"),
- [(OpNode RO:$rt, Addr:$addr)], Itin, FrmI, opstr> {
- let DecoderMethod = "DecodeMem";
- let mayStore = 1;
-}
class SW_MMR6_DESC : Store<"sw", GPR32Opnd> {
InstrItinClass Itinerary = II_SW;
}
-class SWE_MMR6_DESC : SWE_MMR6_DESC_BASE<"swe", GPR32Opnd, mem_simm9, II_SWE>;
-
class WRPGPR_WSBH_MMR6_DESC_BASE<string instr_asm, RegisterOperand RO,
- InstrItinClass Itin> : MMR6Arch<instr_asm> {
+ InstrItinClass Itin> {
dag InOperandList = (ins RO:$rs);
dag OutOperandList = (outs RO:$rt);
string AsmString = !strconcat(instr_asm, "\t$rt, $rs");
@@ -789,12 +706,6 @@ class MTC2_MMR6_DESC : MTC2_MMR6_DESC_BASE<"mtc2", COP2Opnd, GPR32Opnd,
II_MTC2>;
class MTHC0_MMR6_DESC : MTC0_MMR6_DESC_BASE<"mthc0", COP0Opnd, GPR32Opnd,
II_MTHC0>;
-class MTHC1_D32_MMR6_DESC : MTC1_64_MMR6_DESC_BASE<"mthc1", AFGR64Opnd,
- GPR32Opnd, II_MTC1>,
- HARDFLOAT, FGR_32;
-class MTHC1_D64_MMR6_DESC : MTC1_64_MMR6_DESC_BASE<"mthc1", FGR64Opnd,
- GPR32Opnd, II_MTC1>,
- HARDFLOAT, FGR_64;
class MTHC2_MMR6_DESC : MTC2_MMR6_DESC_BASE<"mthc2", COP2Opnd, GPR32Opnd,
II_MTC2>;
@@ -838,10 +749,6 @@ class MFC2_MMR6_DESC : MFC2_MMR6_DESC_BASE<"mfc2", GPR32Opnd, COP2Opnd,
II_MFC2>;
class MFHC0_MMR6_DESC : MFC0_MMR6_DESC_BASE<"mfhc0", GPR32Opnd, COP0Opnd,
II_MFHC0>;
-class MFHC1_D32_MMR6_DESC : MFC1_MMR6_DESC_BASE<"mfhc1", GPR32Opnd, AFGR64Opnd,
- II_MFHC1>, HARDFLOAT, FGR_32;
-class MFHC1_D64_MMR6_DESC : MFC1_MMR6_DESC_BASE<"mfhc1", GPR32Opnd, FGR64Opnd,
- II_MFHC1>, HARDFLOAT, FGR_64;
class MFHC2_MMR6_DESC : MFC2_MMR6_DESC_BASE<"mfhc2", GPR32Opnd, COP2Opnd,
II_MFC2>;
@@ -897,6 +804,49 @@ class SDC2_SWC2_MMR6_DESC_BASE<string opstr, InstrItinClass itin> {
class SDC2_MMR6_DESC : SDC2_SWC2_MMR6_DESC_BASE<"sdc2", II_SDC2>;
class SWC2_MMR6_DESC : SDC2_SWC2_MMR6_DESC_BASE<"swc2", II_SWC2>;
+class GINV_MMR6_DESC_BASE<string opstr,
+ RegisterOperand SrcRC, InstrItinClass Itin> {
+ dag InOperandList = (ins SrcRC:$rs, uimm2:$type);
+ dag OutOperandList = (outs);
+ string AsmString = !strconcat(opstr, "\t$rs, $type");
+ list<dag> Pattern = [];
+ Format f = FrmFR;
+ string BaseOpcode = opstr;
+ InstrItinClass Itinerary = Itin;
+}
+
+class GINVI_MMR6_DESC : GINV_MMR6_DESC_BASE<"ginvi", GPR32Opnd,
+ II_GINVI> {
+ dag InOperandList = (ins GPR32Opnd:$rs);
+ string AsmString = "ginvi\t$rs";
+}
+class GINVT_MMR6_DESC : GINV_MMR6_DESC_BASE<"ginvt", GPR32Opnd,
+ II_GINVT>;
+
+class SC_MMR6_DESC_BASE<string opstr, InstrItinClass itin> {
+ dag OutOperandList = (outs GPR32Opnd:$dst);
+ dag InOperandList = (ins GPR32Opnd:$rt, mem_mm_9:$addr);
+ string AsmString = !strconcat(opstr, "\t$rt, $addr");
+ InstrItinClass Itinerary = itin;
+ string BaseOpcode = opstr;
+ bit mayStore = 1;
+ string Constraints = "$rt = $dst";
+ string DecoderMethod = "DecodeMemMMImm9";
+}
+
+class LL_MMR6_DESC_BASE<string opstr, InstrItinClass itin> {
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ dag InOperandList = (ins mem_mm_9:$addr);
+ string AsmString = !strconcat(opstr, "\t$rt, $addr");
+ InstrItinClass Itinerary = itin;
+ string BaseOpcode = opstr;
+ bit mayLoad = 1;
+ string DecoderMethod = "DecodeMemMMImm9";
+}
+
+class SC_MMR6_DESC : SC_MMR6_DESC_BASE<"sc", II_SC>;
+class LL_MMR6_DESC : LL_MMR6_DESC_BASE<"ll", II_LL>;
+
/// Floating Point Instructions
class FARITH_MMR6_DESC_BASE<string instr_asm, RegisterOperand RC,
InstrItinClass Itin, bit isComm,
@@ -910,20 +860,12 @@ class FARITH_MMR6_DESC_BASE<string instr_asm, RegisterOperand RC,
}
class FADD_S_MMR6_DESC
: FARITH_MMR6_DESC_BASE<"add.s", FGR32Opnd, II_ADD_S, 1, fadd>;
-class FADD_D_MMR6_DESC
- : FARITH_MMR6_DESC_BASE<"add.d", AFGR64Opnd, II_ADD_D, 1, fadd>;
class FSUB_S_MMR6_DESC
: FARITH_MMR6_DESC_BASE<"sub.s", FGR32Opnd, II_SUB_S, 0, fsub>;
-class FSUB_D_MMR6_DESC
- : FARITH_MMR6_DESC_BASE<"sub.d", AFGR64Opnd, II_SUB_D, 0, fsub>;
class FMUL_S_MMR6_DESC
: FARITH_MMR6_DESC_BASE<"mul.s", FGR32Opnd, II_MUL_S, 1, fmul>;
-class FMUL_D_MMR6_DESC
- : FARITH_MMR6_DESC_BASE<"mul.d", AFGR64Opnd, II_MUL_D, 1, fmul>;
class FDIV_S_MMR6_DESC
: FARITH_MMR6_DESC_BASE<"div.s", FGR32Opnd, II_DIV_S, 0, fdiv>;
-class FDIV_D_MMR6_DESC
- : FARITH_MMR6_DESC_BASE<"div.d", AFGR64Opnd, II_DIV_D, 0, fdiv>;
class MADDF_S_MMR6_DESC : COP1_4R_DESC_BASE<"maddf.s", FGR32Opnd,
II_MADDF_S>, HARDFLOAT;
class MADDF_D_MMR6_DESC : COP1_4R_DESC_BASE<"maddf.d", FGR64Opnd,
@@ -946,12 +888,8 @@ class FMOV_FNEG_MMR6_DESC_BASE<string instr_asm, RegisterOperand DstRC,
}
class FMOV_S_MMR6_DESC
: FMOV_FNEG_MMR6_DESC_BASE<"mov.s", FGR32Opnd, FGR32Opnd, II_MOV_S>;
-class FMOV_D_MMR6_DESC
- : FMOV_FNEG_MMR6_DESC_BASE<"mov.d", AFGR64Opnd, AFGR64Opnd, II_MOV_D>;
class FNEG_S_MMR6_DESC
: FMOV_FNEG_MMR6_DESC_BASE<"neg.s", FGR32Opnd, FGR32Opnd, II_NEG, fneg>;
-class FNEG_D_MMR6_DESC
- : FMOV_FNEG_MMR6_DESC_BASE<"neg.d", AFGR64Opnd, AFGR64Opnd, II_NEG, fneg>;
class MAX_S_MMR6_DESC : MAX_MIN_DESC_BASE<"max.s", FGR32Opnd, II_MAX_S>,
HARDFLOAT;
@@ -989,16 +927,8 @@ class CVT_L_D_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.l.d", FGR64Opnd, FGR64Opnd,
II_CVT>;
class CVT_W_S_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.w.s", FGR32Opnd, FGR32Opnd,
II_CVT>;
-class CVT_W_D_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.w.d", FGR32Opnd, AFGR64Opnd,
- II_CVT>;
-class CVT_D_S_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.d.s", FGR32Opnd, AFGR64Opnd,
- II_CVT>;
-class CVT_D_W_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.d.w", FGR32Opnd, AFGR64Opnd,
- II_CVT>;
class CVT_D_L_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.d.l", FGR64Opnd, FGR64Opnd,
II_CVT>, FGR_64;
-class CVT_S_D_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.s.d", AFGR64Opnd, FGR32Opnd,
- II_CVT>;
class CVT_S_W_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.s.w", FGR32Opnd, FGR32Opnd,
II_CVT>;
class CVT_S_L_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.s.l", FGR64Opnd, FGR32Opnd,
@@ -1085,10 +1015,6 @@ class ABSS_FT_MMR6_DESC_BASE<string instr_asm, RegisterOperand DstRC,
list<Predicate> EncodingPredicates = [HasStdEnc];
}
-class ABS_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"abs.s", FGR32Opnd, FGR32Opnd,
- II_ABS, fabs>;
-class ABS_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"abs.d", AFGR64Opnd, AFGR64Opnd,
- II_ABS, fabs>;
class FLOOR_L_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"floor.l.s", FGR64Opnd,
FGR32Opnd, II_FLOOR>;
class FLOOR_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"floor.l.d", FGR64Opnd,
@@ -1154,70 +1080,35 @@ class STORE_MMR6_DESC_BASE<string opstr, DAGOperand RO,
}
class SB_MMR6_DESC : STORE_MMR6_DESC_BASE<"sb", GPR32Opnd, II_SB>;
-class STORE_EVA_MMR6_DESC_BASE<string instr_asm, RegisterOperand RO,
- InstrItinClass Itin>
- : MMR6Arch<instr_asm>, MipsR6Inst {
- dag OutOperandList = (outs);
- dag InOperandList = (ins RO:$rt, mem_simm9:$addr);
- string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
- string DecoderMethod = "DecodeStoreEvaOpMM";
- bit mayStore = 1;
- InstrItinClass Itinerary = Itin;
-}
-class SBE_MMR6_DESC : STORE_EVA_MMR6_DESC_BASE<"sbe", GPR32Opnd, II_SBE>;
-class SCE_MMR6_DESC : STORE_EVA_MMR6_DESC_BASE<"sce", GPR32Opnd, II_SCE>;
class SH_MMR6_DESC : STORE_MMR6_DESC_BASE<"sh", GPR32Opnd, II_SH>;
-class SHE_MMR6_DESC : STORE_EVA_MMR6_DESC_BASE<"she", GPR32Opnd, II_SHE>;
-class LOAD_WORD_EVA_MMR6_DESC_BASE<string instr_asm, RegisterOperand RO,
- InstrItinClass Itin>
- : MMR6Arch<instr_asm>, MipsR6Inst {
- dag OutOperandList = (outs RO:$rt);
- dag InOperandList = (ins mem_simm9:$addr);
- string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
- string DecoderMethod = "DecodeMemMMImm9";
- bit mayLoad = 1;
- InstrItinClass Itinerary = Itin;
-}
-class LLE_MMR6_DESC : LOAD_WORD_EVA_MMR6_DESC_BASE<"lle", GPR32Opnd, II_LLE>;
-class LWE_MMR6_DESC : LOAD_WORD_EVA_MMR6_DESC_BASE<"lwe", GPR32Opnd, II_LWE>;
class ADDU16_MMR6_DESC : ArithRMM16<"addu16", GPRMM16Opnd, 1, II_ADDU, add>,
MMR6Arch<"addu16"> {
int AddedComplexity = 1;
}
-class AND16_MMR6_DESC : LogicRMM16<"and16", GPRMM16Opnd, II_AND, and>,
- MMR6Arch<"and16"> {
- int AddedComplexity = 1;
-}
+class AND16_MMR6_DESC : LogicRMM16<"and16", GPRMM16Opnd, II_AND>,
+ MMR6Arch<"and16">;
class ANDI16_MMR6_DESC : AndImmMM16<"andi16", GPRMM16Opnd, II_AND>,
MMR6Arch<"andi16">;
class NOT16_MMR6_DESC : NotMM16<"not16", GPRMM16Opnd>, MMR6Arch<"not16"> {
int AddedComplexity = 1;
}
-class OR16_MMR6_DESC : LogicRMM16<"or16", GPRMM16Opnd, II_OR, or>,
- MMR6Arch<"or16"> {
- int AddedComplexity = 1;
-}
+class OR16_MMR6_DESC : LogicRMM16<"or16", GPRMM16Opnd, II_OR>, MMR6Arch<"or16">;
class SLL16_MMR6_DESC : ShiftIMM16<"sll16", uimm3_shift, GPRMM16Opnd, II_SLL>,
MMR6Arch<"sll16">;
class SRL16_MMR6_DESC : ShiftIMM16<"srl16", uimm3_shift, GPRMM16Opnd, II_SRL>,
MMR6Arch<"srl16">;
-class BREAK16_MMR6_DESC : BrkSdbbp16MM<"break16", II_BREAK>, MMR6Arch<"break16">,
- MicroMipsR6Inst16;
+class BREAK16_MMR6_DESC : BrkSdbbp16MM<"break16", II_BREAK>, MMR6Arch<"break16">;
class LI16_MMR6_DESC : LoadImmMM16<"li16", li16_imm, GPRMM16Opnd>,
- MMR6Arch<"li16">, MicroMipsR6Inst16, IsAsCheapAsAMove;
-class MOVE16_MMR6_DESC : MoveMM16<"move16", GPR32Opnd>, MMR6Arch<"move16">,
- MicroMipsR6Inst16;
+ MMR6Arch<"li16">, IsAsCheapAsAMove;
+class MOVE16_MMR6_DESC : MoveMM16<"move16", GPR32Opnd>, MMR6Arch<"move16">;
class MOVEP_MMR6_DESC : MovePMM16<"movep", GPRMM16OpndMoveP>, MMR6Arch<"movep">;
-class SDBBP16_MMR6_DESC : BrkSdbbp16MM<"sdbbp16", II_SDBBP>, MMR6Arch<"sdbbp16">,
- MicroMipsR6Inst16;
+class SDBBP16_MMR6_DESC : BrkSdbbp16MM<"sdbbp16", II_SDBBP>, MMR6Arch<"sdbbp16">;
class SUBU16_MMR6_DESC : ArithRMM16<"subu16", GPRMM16Opnd, 0, II_SUBU, sub>,
- MMR6Arch<"subu16">, MicroMipsR6Inst16 {
- int AddedComplexity = 1;
-}
-class XOR16_MMR6_DESC : LogicRMM16<"xor16", GPRMM16Opnd, II_XOR, xor>,
- MMR6Arch<"xor16"> {
+ MMR6Arch<"subu16"> {
int AddedComplexity = 1;
}
+class XOR16_MMR6_DESC : LogicRMM16<"xor16", GPRMM16Opnd, II_XOR>,
+ MMR6Arch<"xor16">;
class LW_MMR6_DESC : MMR6Arch<"lw">, MipsR6Inst {
dag OutOperandList = (outs GPR32Opnd:$rt);
@@ -1250,7 +1141,7 @@ class SYNC_MMR6_DESC : MMR6Arch<"sync">, MipsR6Inst {
bit HasSideEffects = 1;
}
-class SYNCI_MMR6_DESC : SYNCI_FT<"synci"> {
+class SYNCI_MMR6_DESC : SYNCI_FT<"synci", mem_mm_16> {
let DecoderMethod = "DecodeSynciR6";
}
@@ -1273,7 +1164,7 @@ class LWM16_MMR6_DESC
: MicroMipsInst16<(outs reglist16:$rt), (ins mem_mm_4sp:$addr),
!strconcat("lwm16", "\t$rt, $addr"), [],
II_LWM, FrmI>,
- MMR6Arch<"lwm16">, MicroMipsR6Inst16 {
+ MMR6Arch<"lwm16"> {
let DecoderMethod = "DecodeMemMMReglistImm4Lsl2";
let mayLoad = 1;
ComplexPattern Addr = addr;
@@ -1283,7 +1174,7 @@ class SWM16_MMR6_DESC
: MicroMipsInst16<(outs), (ins reglist16:$rt, mem_mm_4sp:$addr),
!strconcat("swm16", "\t$rt, $addr"), [],
II_SWM, FrmI>,
- MMR6Arch<"swm16">, MicroMipsR6Inst16 {
+ MMR6Arch<"swm16"> {
let DecoderMethod = "DecodeMemMMReglistImm4Lsl2";
let mayStore = 1;
ComplexPattern Addr = addr;
@@ -1294,7 +1185,7 @@ class SB16_MMR6_DESC_BASE<string opstr, DAGOperand RTOpnd, DAGOperand RO,
Operand MemOpnd>
: MicroMipsInst16<(outs), (ins RTOpnd:$rt, MemOpnd:$addr),
!strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI>,
- MMR6Arch<opstr>, MicroMipsR6Inst16 {
+ MMR6Arch<opstr> {
let DecoderMethod = "DecodeMemMMImm4";
let mayStore = 1;
}
@@ -1308,7 +1199,7 @@ class SW16_MMR6_DESC : SB16_MMR6_DESC_BASE<"sw16", GPRMM16OpndZero, GPRMM16Opnd,
class SWSP_MMR6_DESC
: MicroMipsInst16<(outs), (ins GPR32Opnd:$rt, mem_mm_sp_imm5_lsl2:$offset),
!strconcat("sw", "\t$rt, $offset"), [], II_SW, FrmI>,
- MMR6Arch<"sw">, MicroMipsR6Inst16 {
+ MMR6Arch<"sw"> {
let DecoderMethod = "DecodeMemMMSPImm5Lsl2";
let mayStore = 1;
}
@@ -1473,6 +1364,11 @@ def ERET_MMR6 : StdMMR6Rel, ERET_MMR6_DESC, ERET_MMR6_ENC, ISA_MICROMIPS32R6;
def DERET_MMR6 : StdMMR6Rel, DERET_MMR6_DESC, DERET_MMR6_ENC, ISA_MICROMIPS32R6;
def ERETNC_MMR6 : R6MMR6Rel, ERETNC_MMR6_DESC, ERETNC_MMR6_ENC,
ISA_MICROMIPS32R6;
+def GINVI_MMR6 : R6MMR6Rel, GINVI_MMR6_ENC, GINVI_MMR6_DESC,
+ ISA_MICROMIPS32R6, ASE_GINV;
+def GINVT_MMR6 : R6MMR6Rel, GINVT_MMR6_ENC, GINVT_MMR6_DESC,
+ ISA_MICROMIPS32R6, ASE_GINV;
+let FastISelShouldIgnore = 1 in
def JALRC16_MMR6 : R6MMR6Rel, JALRC16_MMR6_DESC, JALRC16_MMR6_ENC,
ISA_MICROMIPS32R6;
def JIALC_MMR6 : R6MMR6Rel, JIALC_MMR6_ENC, JIALC_MMR6_DESC, ISA_MICROMIPS32R6;
@@ -1481,29 +1377,17 @@ def JRC16_MMR6 : R6MMR6Rel, JRC16_MMR6_DESC, JRC16_MMR6_ENC, ISA_MICROMIPS32R6;
def JRCADDIUSP_MMR6 : R6MMR6Rel, JRCADDIUSP_MMR6_DESC, JRCADDIUSP_MMR6_ENC,
ISA_MICROMIPS32R6;
def LSA_MMR6 : R6MMR6Rel, LSA_MMR6_ENC, LSA_MMR6_DESC, ISA_MICROMIPS32R6;
-def LWP_MMR6 : StdMMR6Rel, LWP_MMR6_ENC, LWP_MMR6_DESC, ISA_MICROMIPS32R6;
def LWPC_MMR6 : R6MMR6Rel, LWPC_MMR6_ENC, LWPC_MMR6_DESC, ISA_MICROMIPS32R6;
def LWM16_MMR6 : StdMMR6Rel, LWM16_MMR6_DESC, LWM16_MMR6_ENC, ISA_MICROMIPS32R6;
def MTC0_MMR6 : StdMMR6Rel, MTC0_MMR6_ENC, MTC0_MMR6_DESC, ISA_MICROMIPS32R6;
def MTC1_MMR6 : StdMMR6Rel, MTC1_MMR6_DESC, MTC1_MMR6_ENC, ISA_MICROMIPS32R6;
def MTC2_MMR6 : StdMMR6Rel, MTC2_MMR6_ENC, MTC2_MMR6_DESC, ISA_MICROMIPS32R6;
def MTHC0_MMR6 : R6MMR6Rel, MTHC0_MMR6_ENC, MTHC0_MMR6_DESC, ISA_MICROMIPS32R6;
-def MTHC1_D32_MMR6 : StdMMR6Rel, MTHC1_D32_MMR6_DESC, MTHC1_MMR6_ENC, ISA_MICROMIPS32R6;
-let DecoderNamespace = "MicroMipsFP64" in {
- def MTHC1_D64_MMR6 : R6MMR6Rel, MTHC1_D64_MMR6_DESC, MTHC1_MMR6_ENC,
- ISA_MICROMIPS32R6;
-}
def MTHC2_MMR6 : StdMMR6Rel, MTHC2_MMR6_ENC, MTHC2_MMR6_DESC, ISA_MICROMIPS32R6;
def MFC0_MMR6 : StdMMR6Rel, MFC0_MMR6_ENC, MFC0_MMR6_DESC, ISA_MICROMIPS32R6;
def MFC1_MMR6 : StdMMR6Rel, MFC1_MMR6_DESC, MFC1_MMR6_ENC, ISA_MICROMIPS32R6;
def MFC2_MMR6 : StdMMR6Rel, MFC2_MMR6_ENC, MFC2_MMR6_DESC, ISA_MICROMIPS32R6;
def MFHC0_MMR6 : R6MMR6Rel, MFHC0_MMR6_ENC, MFHC0_MMR6_DESC, ISA_MICROMIPS32R6;
-def MFHC1_D32_MMR6 : StdMMR6Rel, MFHC1_D32_MMR6_DESC, MFHC1_MMR6_ENC,
- ISA_MICROMIPS32R6;
-let DecoderNamespace = "MicroMipsFP64" in {
- def MFHC1_D64_MMR6 : StdMMR6Rel, MFHC1_D64_MMR6_DESC, MFHC1_MMR6_ENC,
- ISA_MICROMIPS32R6;
-}
def MFHC2_MMR6 : StdMMR6Rel, MFHC2_MMR6_ENC, MFHC2_MMR6_DESC, ISA_MICROMIPS32R6;
def MOD_MMR6 : R6MMR6Rel, MOD_MMR6_DESC, MOD_MMR6_ENC, ISA_MICROMIPS32R6;
def MODU_MMR6 : R6MMR6Rel, MODU_MMR6_DESC, MODU_MMR6_ENC, ISA_MICROMIPS32R6;
@@ -1516,8 +1400,6 @@ def OR_MMR6 : StdMMR6Rel, OR_MMR6_DESC, OR_MMR6_ENC, ISA_MICROMIPS32R6;
def ORI_MMR6 : StdMMR6Rel, ORI_MMR6_DESC, ORI_MMR6_ENC, ISA_MICROMIPS32R6;
def PREF_MMR6 : R6MMR6Rel, PREF_MMR6_ENC, PREF_MMR6_DESC, ISA_MICROMIPS32R6;
def SB16_MMR6 : StdMMR6Rel, SB16_MMR6_DESC, SB16_MMR6_ENC, ISA_MICROMIPS32R6;
-def SEB_MMR6 : StdMMR6Rel, SEB_MMR6_DESC, SEB_MMR6_ENC, ISA_MICROMIPS32R6;
-def SEH_MMR6 : StdMMR6Rel, SEH_MMR6_DESC, SEH_MMR6_ENC, ISA_MICROMIPS32R6;
def SELEQZ_MMR6 : R6MMR6Rel, SELEQZ_MMR6_ENC, SELEQZ_MMR6_DESC,
ISA_MICROMIPS32R6;
def SELNEZ_MMR6 : R6MMR6Rel, SELNEZ_MMR6_ENC, SELNEZ_MMR6_DESC,
@@ -1529,17 +1411,11 @@ def SUBU_MMR6 : StdMMR6Rel, SUBU_MMR6_DESC, SUBU_MMR6_ENC, ISA_MICROMIPS32R6;
def SW16_MMR6 : StdMMR6Rel, SW16_MMR6_DESC, SW16_MMR6_ENC, ISA_MICROMIPS32R6;
def SWM16_MMR6 : StdMMR6Rel, SWM16_MMR6_DESC, SWM16_MMR6_ENC, ISA_MICROMIPS32R6;
def SWSP_MMR6 : StdMMR6Rel, SWSP_MMR6_DESC, SWSP_MMR6_ENC, ISA_MICROMIPS32R6;
-def SWP_MMR6 : StdMMR6Rel, SWP_MMR6_ENC, SWP_MMR6_DESC, ISA_MICROMIPS32R6;
-def PREFE_MMR6 : StdMMR6Rel, PREFE_MMR6_ENC, PREFE_MMR6_DESC, ISA_MICROMIPS32R6;
-def CACHEE_MMR6 : StdMMR6Rel, CACHEE_MMR6_ENC, CACHEE_MMR6_DESC,
- ISA_MICROMIPS32R6;
def WRPGPR_MMR6 : StdMMR6Rel, WRPGPR_MMR6_ENC, WRPGPR_MMR6_DESC,
ISA_MICROMIPS32R6;
def WSBH_MMR6 : StdMMR6Rel, WSBH_MMR6_ENC, WSBH_MMR6_DESC, ISA_MICROMIPS32R6;
def LB_MMR6 : R6MMR6Rel, LB_MMR6_ENC, LB_MMR6_DESC, ISA_MICROMIPS32R6;
def LBU_MMR6 : R6MMR6Rel, LBU_MMR6_ENC, LBU_MMR6_DESC, ISA_MICROMIPS32R6;
-def LBE_MMR6 : R6MMR6Rel, LBE_MMR6_ENC, LBE_MMR6_DESC, ISA_MICROMIPS32R6;
-def LBUE_MMR6 : R6MMR6Rel, LBUE_MMR6_ENC, LBUE_MMR6_DESC, ISA_MICROMIPS32R6;
def PAUSE_MMR6 : StdMMR6Rel, PAUSE_MMR6_DESC, PAUSE_MMR6_ENC, ISA_MICROMIPS32R6;
def RDHWR_MMR6 : R6MMR6Rel, RDHWR_MMR6_DESC, RDHWR_MMR6_ENC, ISA_MICROMIPS32R6;
def WAIT_MMR6 : StdMMR6Rel, WAIT_MMR6_DESC, WAIT_MMR6_ENC, ISA_MICROMIPS32R6;
@@ -1554,26 +1430,15 @@ def XORI_MMR6 : StdMMR6Rel, XORI_MMR6_DESC, XORI_MMR6_ENC, ISA_MICROMIPS32R6;
let DecoderMethod = "DecodeMemMMImm16" in {
def SW_MMR6 : StdMMR6Rel, SW_MMR6_DESC, SW_MMR6_ENC, ISA_MICROMIPS32R6;
}
-let DecoderMethod = "DecodeMemMMImm9" in {
- def SWE_MMR6 : StdMMR6Rel, SWE_MMR6_DESC, SWE_MMR6_ENC, ISA_MICROMIPS32R6;
-}
/// Floating Point Instructions
def FADD_S_MMR6 : StdMMR6Rel, FADD_S_MMR6_ENC, FADD_S_MMR6_DESC,
ISA_MICROMIPS32R6;
-def FADD_D_MMR6 : StdMMR6Rel, FADD_D_MMR6_ENC, FADD_D_MMR6_DESC,
- ISA_MICROMIPS32R6;
def FSUB_S_MMR6 : StdMMR6Rel, FSUB_S_MMR6_ENC, FSUB_S_MMR6_DESC,
ISA_MICROMIPS32R6;
-def FSUB_D_MMR6 : StdMMR6Rel, FSUB_D_MMR6_ENC, FSUB_D_MMR6_DESC,
- ISA_MICROMIPS32R6;
def FMUL_S_MMR6 : StdMMR6Rel, FMUL_S_MMR6_ENC, FMUL_S_MMR6_DESC,
ISA_MICROMIPS32R6;
-def FMUL_D_MMR6 : StdMMR6Rel, FMUL_D_MMR6_ENC, FMUL_D_MMR6_DESC,
- ISA_MICROMIPS32R6;
def FDIV_S_MMR6 : StdMMR6Rel, FDIV_S_MMR6_ENC, FDIV_S_MMR6_DESC,
ISA_MICROMIPS32R6;
-def FDIV_D_MMR6 : StdMMR6Rel, FDIV_D_MMR6_ENC, FDIV_D_MMR6_DESC,
- ISA_MICROMIPS32R6;
def MADDF_S_MMR6 : R6MMR6Rel, MADDF_S_MMR6_ENC, MADDF_S_MMR6_DESC,
ISA_MICROMIPS32R6;
def MADDF_D_MMR6 : R6MMR6Rel, MADDF_D_MMR6_ENC, MADDF_D_MMR6_DESC,
@@ -1584,12 +1449,8 @@ def MSUBF_D_MMR6 : R6MMR6Rel, MSUBF_D_MMR6_ENC, MSUBF_D_MMR6_DESC,
ISA_MICROMIPS32R6;
def FMOV_S_MMR6 : StdMMR6Rel, FMOV_S_MMR6_ENC, FMOV_S_MMR6_DESC,
ISA_MICROMIPS32R6;
-def FMOV_D_MMR6 : StdMMR6Rel, FMOV_D_MMR6_ENC, FMOV_D_MMR6_DESC,
- ISA_MICROMIPS32R6;
def FNEG_S_MMR6 : StdMMR6Rel, FNEG_S_MMR6_ENC, FNEG_S_MMR6_DESC,
ISA_MICROMIPS32R6;
-def FNEG_D_MMR6 : StdMMR6Rel, FNEG_D_MMR6_ENC, FNEG_D_MMR6_DESC,
- ISA_MICROMIPS32R6;
def MAX_S_MMR6 : R6MMR6Rel, MAX_S_MMR6_ENC, MAX_S_MMR6_DESC, ISA_MICROMIPS32R6;
def MAX_D_MMR6 : R6MMR6Rel, MAX_D_MMR6_ENC, MAX_D_MMR6_DESC, ISA_MICROMIPS32R6;
def MIN_S_MMR6 : R6MMR6Rel, MIN_S_MMR6_ENC, MIN_S_MMR6_DESC, ISA_MICROMIPS32R6;
@@ -1608,24 +1469,14 @@ def CVT_L_D_MMR6 : StdMMR6Rel, CVT_L_D_MMR6_ENC, CVT_L_D_MMR6_DESC,
ISA_MICROMIPS32R6;
def CVT_W_S_MMR6 : StdMMR6Rel, CVT_W_S_MMR6_ENC, CVT_W_S_MMR6_DESC,
ISA_MICROMIPS32R6;
-def CVT_W_D_MMR6 : StdMMR6Rel, CVT_W_D_MMR6_ENC, CVT_W_D_MMR6_DESC,
- ISA_MICROMIPS32R6;
-def CVT_D_S_MMR6 : StdMMR6Rel, CVT_D_S_MMR6_ENC, CVT_D_S_MMR6_DESC,
- ISA_MICROMIPS32R6;
-def CVT_D_W_MMR6 : StdMMR6Rel, CVT_D_W_MMR6_ENC, CVT_D_W_MMR6_DESC,
- ISA_MICROMIPS32R6;
def CVT_D_L_MMR6 : StdMMR6Rel, CVT_D_L_MMR6_ENC, CVT_D_L_MMR6_DESC,
ISA_MICROMIPS32R6;
-def CVT_S_D_MMR6 : StdMMR6Rel, CVT_S_D_MMR6_ENC, CVT_S_D_MMR6_DESC,
- ISA_MICROMIPS32R6;
def CVT_S_W_MMR6 : StdMMR6Rel, CVT_S_W_MMR6_ENC, CVT_S_W_MMR6_DESC,
ISA_MICROMIPS32R6;
def CVT_S_L_MMR6 : StdMMR6Rel, CVT_S_L_MMR6_ENC, CVT_S_L_MMR6_DESC,
ISA_MICROMIPS32R6;
defm S_MMR6 : CMP_CC_MMR6<0b000101, "s", FGR32Opnd, II_CMP_CC_S>;
defm D_MMR6 : CMP_CC_MMR6<0b010101, "d", FGR64Opnd, II_CMP_CC_D>;
-def ABS_S_MMR6 : StdMMR6Rel, ABS_S_MMR6_ENC, ABS_S_MMR6_DESC, ISA_MICROMIPS32R6;
-def ABS_D_MMR6 : StdMMR6Rel, ABS_D_MMR6_ENC, ABS_D_MMR6_DESC, ISA_MICROMIPS32R6;
def FLOOR_L_S_MMR6 : StdMMR6Rel, FLOOR_L_S_MMR6_ENC, FLOOR_L_S_MMR6_DESC,
ISA_MICROMIPS32R6;
def FLOOR_L_D_MMR6 : StdMMR6Rel, FLOOR_L_D_MMR6_ENC, FLOOR_L_D_MMR6_DESC,
@@ -1650,17 +1501,8 @@ def TRUNC_W_S_MMR6 : StdMMR6Rel, TRUNC_W_S_MMR6_ENC, TRUNC_W_S_MMR6_DESC,
ISA_MICROMIPS32R6;
def TRUNC_W_D_MMR6 : StdMMR6Rel, TRUNC_W_D_MMR6_ENC, TRUNC_W_D_MMR6_DESC,
ISA_MICROMIPS32R6;
-def SQRT_S_MMR6 : StdMMR6Rel, SQRT_S_MMR6_ENC, SQRT_S_MMR6_DESC,
- ISA_MICROMIPS32R6;
-def SQRT_D_MMR6 : StdMMR6Rel, SQRT_D_MMR6_ENC, SQRT_D_MMR6_DESC,
- ISA_MICROMIPS32R6;
def SB_MMR6 : StdMMR6Rel, SB_MMR6_DESC, SB_MMR6_ENC, ISA_MICROMIPS32R6;
-def SBE_MMR6 : StdMMR6Rel, SBE_MMR6_DESC, SBE_MMR6_ENC, ISA_MICROMIPS32R6;
-def SCE_MMR6 : StdMMR6Rel, SCE_MMR6_DESC, SCE_MMR6_ENC, ISA_MICROMIPS32R6;
def SH_MMR6 : StdMMR6Rel, SH_MMR6_DESC, SH_MMR6_ENC, ISA_MICROMIPS32R6;
-def SHE_MMR6 : StdMMR6Rel, SHE_MMR6_DESC, SHE_MMR6_ENC, ISA_MICROMIPS32R6;
-def LLE_MMR6 : StdMMR6Rel, LLE_MMR6_DESC, LLE_MMR6_ENC, ISA_MICROMIPS32R6;
-def LWE_MMR6 : StdMMR6Rel, LWE_MMR6_DESC, LWE_MMR6_ENC, ISA_MICROMIPS32R6;
def LW_MMR6 : StdMMR6Rel, LW_MMR6_DESC, LW_MMR6_ENC, ISA_MICROMIPS32R6;
def LUI_MMR6 : R6MMR6Rel, LUI_MMR6_DESC, LUI_MMR6_ENC, ISA_MICROMIPS32R6;
def ADDU16_MMR6 : StdMMR6Rel, ADDU16_MMR6_DESC, ADDU16_MMR6_ENC,
@@ -1747,6 +1589,8 @@ def LDC2_MMR6 : StdMMR6Rel, LDC2_MMR6_ENC, LDC2_MMR6_DESC, ISA_MICROMIPS32R6;
def SDC2_MMR6 : StdMMR6Rel, SDC2_MMR6_ENC, SDC2_MMR6_DESC, ISA_MICROMIPS32R6;
def LWC2_MMR6 : StdMMR6Rel, LWC2_MMR6_ENC, LWC2_MMR6_DESC, ISA_MICROMIPS32R6;
def SWC2_MMR6 : StdMMR6Rel, SWC2_MMR6_ENC, SWC2_MMR6_DESC, ISA_MICROMIPS32R6;
+def LL_MMR6 : R6MMR6Rel, LL_MMR6_ENC, LL_MMR6_DESC, ISA_MICROMIPS32R6;
+def SC_MMR6 : R6MMR6Rel, SC_MMR6_ENC, SC_MMR6_DESC, ISA_MICROMIPS32R6;
}
def BOVC_MMR6 : R6MMR6Rel, BOVC_MMR6_ENC, BOVC_MMR6_DESC, ISA_MICROMIPS32R6,
@@ -1806,6 +1650,8 @@ def : MipsInstAlias<"mfhc0 $rt, $rs",
ISA_MICROMIPS32R6;
def : MipsInstAlias<"jalrc.hb $rs", (JALRC_HB_MMR6 RA, GPR32Opnd:$rs), 1>,
ISA_MICROMIPS32R6;
+def : MipsInstAlias<"jal $offset", (BALC_MMR6 brtarget26_mm:$offset), 0>,
+ ISA_MICROMIPS32R6;
def : MipsInstAlias<"dvp", (DVP_MMR6 ZERO), 0>, ISA_MICROMIPS32R6;
def : MipsInstAlias<"evp", (EVP_MMR6 ZERO), 0>, ISA_MICROMIPS32R6;
def : MipsInstAlias<"jalrc $rs", (JALRC_MMR6 RA, GPR32Opnd:$rs), 1>,
@@ -1831,13 +1677,32 @@ def : MipsInstAlias<"xor $rs, $imm",
def : MipsInstAlias<"not $rt, $rs",
(NOR_MMR6 GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>,
ISA_MICROMIPS32R6;
-def : MipsInstAlias<"seh $rd", (SEH_MMR6 GPR32Opnd:$rd, GPR32Opnd:$rd), 0>,
- ISA_MICROMIPS32R6;
-def : MipsInstAlias<"seb $rd", (SEB_MMR6 GPR32Opnd:$rd, GPR32Opnd:$rd), 0>,
+def : MipsInstAlias<"not $rt",
+ (NOR_MMR6 GPR32Opnd:$rt, GPR32Opnd:$rt, ZERO), 0>,
ISA_MICROMIPS32R6;
def : MipsInstAlias<"lapc $rd, $imm",
(ADDIUPC_MMR6 GPR32Opnd:$rd, simm19_lsl2:$imm)>,
ISA_MICROMIPS32R6;
+def : MipsInstAlias<"neg $rt, $rs",
+ (SUB_MMR6 GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>,
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"neg $rt",
+ (SUB_MMR6 GPR32Opnd:$rt, ZERO, GPR32Opnd:$rt), 1>,
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"negu $rt, $rs",
+ (SUBU_MMR6 GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>,
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"negu $rt",
+ (SUBU_MMR6 GPR32Opnd:$rt, ZERO, GPR32Opnd:$rt), 1>,
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"beqz16 $rs, $offset", (BEQZC16_MMR6 GPRMM16Opnd:$rs,
+ brtarget7_mm:$offset),
+ 0>, ISA_MICROMIPS32R6;
+def : MipsInstAlias<"bnez16 $rs, $offset", (BNEZC16_MMR6 GPRMM16Opnd:$rs,
+ brtarget7_mm:$offset),
+ 0>, ISA_MICROMIPS32R6;
+def : MipsInstAlias<"b16 $offset", (BC16_MMR6 brtarget10_mm:$offset), 0>,
+ ISA_MICROMIPS32R6;
//===----------------------------------------------------------------------===//
//
@@ -1867,6 +1732,11 @@ defm : SelectInt_Pats<i32, OR_MM, XORI_MMR6, SLTi_MM, SLTiu_MM, SELEQZ_MMR6,
defm S_MMR6 : Cmp_Pats<f32, NOR_MMR6, ZERO>, ISA_MICROMIPS32R6;
defm D_MMR6 : Cmp_Pats<f64, NOR_MMR6, ZERO>, ISA_MICROMIPS32R6;
+def : MipsPat<(f32 fpimm0), (MTC1_MMR6 ZERO)>, ISA_MICROMIPS32R6;
+def : MipsPat<(f32 fpimm0neg), (FNEG_S_MMR6 (MTC1 ZERO))>, ISA_MICROMIPS32R6;
+def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src),
+ (TRUNC_W_D_MMR6 FGR64Opnd:$src)>, ISA_MICROMIPS32R6;
+
def : MipsPat<(and GPRMM16:$src, immZExtAndi16:$imm),
(ANDI16_MMR6 GPRMM16:$src, immZExtAndi16:$imm)>,
ISA_MICROMIPS32R6;
@@ -1886,9 +1756,49 @@ let AddedComplexity = 41 in {
def TAILCALL_MMR6 : TailCall<BC_MMR6, brtarget26_mm>, ISA_MICROMIPS32R6;
+def TAILCALLREG_MMR6 : TailCallReg<JRC16_MM, GPR32Opnd>, ISA_MICROMIPS32R6;
+
+def PseudoIndirectBranch_MMR6 : PseudoIndirectBranchBase<JRC16_MMR6,
+ GPR32Opnd>,
+ ISA_MICROMIPS32R6;
+
def : MipsPat<(MipsTailCall (iPTR tglobaladdr:$dst)),
(TAILCALL_MMR6 tglobaladdr:$dst)>, ISA_MICROMIPS32R6;
def : MipsPat<(MipsTailCall (iPTR texternalsym:$dst)),
(TAILCALL_MMR6 texternalsym:$dst)>, ISA_MICROMIPS32R6;
+
+def : MipsPat<(brcond (i32 (setne GPR32:$lhs, 0)), bb:$dst),
+ (BNEZC_MMR6 GPR32:$lhs, bb:$dst)>, ISA_MICROMIPS32R6;
+def : MipsPat<(brcond (i32 (seteq GPR32:$lhs, 0)), bb:$dst),
+ (BEQZC_MMR6 GPR32:$lhs, bb:$dst)>, ISA_MICROMIPS32R6;
+
+def : MipsPat<(brcond (i32 (setge GPR32:$lhs, GPR32:$rhs)), bb:$dst),
+ (BEQZC_MMR6 (SLT_MM GPR32:$lhs, GPR32:$rhs), bb:$dst)>,
+ ISA_MICROMIPS32R6;
+def : MipsPat<(brcond (i32 (setuge GPR32:$lhs, GPR32:$rhs)), bb:$dst),
+ (BEQZC_MMR6 (SLTu_MM GPR32:$lhs, GPR32:$rhs), bb:$dst)>,
+ ISA_MICROMIPS32R6;
+def : MipsPat<(brcond (i32 (setge GPR32:$lhs, immSExt16:$rhs)), bb:$dst),
+ (BEQZC_MMR6 (SLTi_MM GPR32:$lhs, immSExt16:$rhs), bb:$dst)>,
+ ISA_MICROMIPS32R6;
+def : MipsPat<(brcond (i32 (setuge GPR32:$lhs, immSExt16:$rhs)), bb:$dst),
+ (BEQZC_MMR6 (SLTiu_MM GPR32:$lhs, immSExt16:$rhs), bb:$dst)>,
+ ISA_MICROMIPS32R6;
+def : MipsPat<(brcond (i32 (setgt GPR32:$lhs, immSExt16Plus1:$rhs)), bb:$dst),
+ (BEQZC_MMR6 (SLTi_MM GPR32:$lhs, (Plus1 imm:$rhs)), bb:$dst)>,
+ ISA_MICROMIPS32R6;
+def : MipsPat<(brcond (i32 (setugt GPR32:$lhs, immSExt16Plus1:$rhs)), bb:$dst),
+ (BEQZC_MMR6 (SLTiu_MM GPR32:$lhs, (Plus1 imm:$rhs)), bb:$dst)>,
+ ISA_MICROMIPS32R6;
+
+def : MipsPat<(brcond (i32 (setle GPR32:$lhs, GPR32:$rhs)), bb:$dst),
+ (BEQZC_MMR6 (SLT_MM GPR32:$rhs, GPR32:$lhs), bb:$dst)>,
+ ISA_MICROMIPS32R6;
+def : MipsPat<(brcond (i32 (setule GPR32:$lhs, GPR32:$rhs)), bb:$dst),
+ (BEQZC_MMR6 (SLTu_MM GPR32:$rhs, GPR32:$lhs), bb:$dst)>,
+ ISA_MICROMIPS32R6;
+
+def : MipsPat<(brcond GPR32:$cond, bb:$dst),
+ (BNEZC_MMR6 GPR32:$cond, bb:$dst)>, ISA_MICROMIPS32R6;
diff --git a/lib/Target/Mips/MicroMipsDSPInstrFormats.td b/lib/Target/Mips/MicroMipsDSPInstrFormats.td
index af6473c468d9..0d444dfc9fad 100644
--- a/lib/Target/Mips/MicroMipsDSPInstrFormats.td
+++ b/lib/Target/Mips/MicroMipsDSPInstrFormats.td
@@ -8,9 +8,9 @@
//===----------------------------------------------------------------------===//
class MMDSPInst<string opstr = "">
- : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>, PredicateControl {
- let InsnPredicates = [HasDSP];
- let AdditionalPredicates = [InMicroMips];
+ : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther> {
+ let ASEPredicate = [HasDSP];
+ let EncodingPredicates = [InMicroMips];
string BaseOpcode = opstr;
string Arch = "mmdsp";
let DecoderNamespace = "MicroMips";
@@ -18,7 +18,7 @@ class MMDSPInst<string opstr = "">
class MMDSPInstAlias<string Asm, dag Result, bit Emit = 0b1>
: InstAlias<Asm, Result, Emit>, PredicateControl {
- let InsnPredicates = [HasDSP];
+ let ASEPredicate = [HasDSP];
let AdditionalPredicates = [InMicroMips];
}
diff --git a/lib/Target/Mips/MicroMipsDSPInstrInfo.td b/lib/Target/Mips/MicroMipsDSPInstrInfo.td
index 20c1ab5a9998..132de6be750d 100644
--- a/lib/Target/Mips/MicroMipsDSPInstrInfo.td
+++ b/lib/Target/Mips/MicroMipsDSPInstrInfo.td
@@ -386,6 +386,7 @@ class WRDSP_MM_DESC {
string AsmString = !strconcat("wrdsp", "\t$rt, $mask");
list<dag> Pattern = [(int_mips_wrdsp GPR32Opnd:$rt, immZExt7:$mask)];
InstrItinClass Itinerary = NoItinerary;
+ bit isMoveReg = 1;
}
class BPOSGE32C_MMR3_DESC {
@@ -416,11 +417,11 @@ class BPOSGE32_MM_DESC : BPOSGE32_DESC_BASE<"bposge32", brtarget_mm,
NoItinerary>;
let DecoderNamespace = "MicroMipsDSP", Arch = "mmdsp",
- AdditionalPredicates = [HasDSP, InMicroMips] in {
- def LWDSP_MM : Load<"lw", DSPROpnd, null_frag, II_LW>, DspMMRel,
- LW_FM_MM<0x3f>;
- def SWDSP_MM : Store<"sw", DSPROpnd, null_frag, II_SW>, DspMMRel,
- LW_FM_MM<0x3e>;
+ EncodingPredicates = [InMicroMips], ASEPredicate = [HasDSP] in {
+ def LWDSP_MM : Load<"lw", DSPROpnd, null_frag, II_LW>, DspMMRel,
+ LW_FM_MM<0x3f>;
+ def SWDSP_MM : Store<"sw", DSPROpnd, null_frag, II_SW>, DspMMRel,
+ LW_FM_MM<0x3e>;
}
// Instruction defs.
// microMIPS DSP Rev 1
@@ -530,7 +531,7 @@ def MODSUB_MM : DspMMRel, MODSUB_MM_ENC, MODSUB_DESC;
def MULSAQ_S_W_PH_MM : DspMMRel, MULSAQ_S_W_PH_MM_ENC, MULSAQ_S_W_PH_DESC;
def BITREV_MM : DspMMRel, BITREV_MM_ENC, BITREV_MM_DESC;
def BPOSGE32_MM : DspMMRel, BPOSGE32_MM_ENC, BPOSGE32_MM_DESC,
- ISA_MIPS1_NOT_32R6_64R6;
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def CMP_EQ_PH_MM : DspMMRel, CMP_EQ_PH_MM_ENC, CMP_EQ_PH_DESC;
def CMP_LT_PH_MM : DspMMRel, CMP_LT_PH_MM_ENC, CMP_LT_PH_DESC;
def CMP_LE_PH_MM : DspMMRel, CMP_LE_PH_MM_ENC, CMP_LE_PH_DESC;
diff --git a/lib/Target/Mips/MicroMipsInstrFPU.td b/lib/Target/Mips/MicroMipsInstrFPU.td
index 49025cc1570a..84ae0eddf980 100644
--- a/lib/Target/Mips/MicroMipsInstrFPU.td
+++ b/lib/Target/Mips/MicroMipsInstrFPU.td
@@ -11,7 +11,18 @@
//
//===----------------------------------------------------------------------===//
-let isCodeGenOnly = 1 in {
+multiclass ADDS_MMM<string opstr, InstrItinClass Itin, bit IsComm,
+ SDPatternOperator OpNode = null_frag> {
+ def _D32_MM : MMRel, ADDS_FT<opstr, AFGR64Opnd, Itin, IsComm, OpNode>,
+ FGR_32 {
+ string DecoderNamespace = "MicroMips";
+ }
+ // FIXME: This needs to be part of the instruction mapping tables.
+ def _D64_MM : ADDS_FT<opstr, FGR64Opnd, Itin, IsComm, OpNode>, FGR_64 {
+ string DecoderNamespace = "MicroMipsFP64";
+ }
+}
+
def FADD_S_MM : MMRel, ADDS_FT<"add.s", FGR32Opnd, II_ADD_S, 1, fadd>,
ADDS_FM_MM<0, 0x30>, ISA_MICROMIPS;
def FDIV_S_MM : MMRel, ADDS_FT<"div.s", FGR32Opnd, II_DIV_S, 0, fdiv>,
@@ -21,27 +32,27 @@ def FMUL_S_MM : MMRel, ADDS_FT<"mul.s", FGR32Opnd, II_MUL_S, 1, fmul>,
def FSUB_S_MM : MMRel, ADDS_FT<"sub.s", FGR32Opnd, II_SUB_S, 0, fsub>,
ADDS_FM_MM<0, 0x70>, ISA_MICROMIPS;
-def FADD_MM : MMRel, ADDS_FT<"add.d", AFGR64Opnd, II_ADD_D, 1, fadd>,
- ADDS_FM_MM<1, 0x30>, ISA_MICROMIPS;
-def FDIV_MM : MMRel, ADDS_FT<"div.d", AFGR64Opnd, II_DIV_D, 0, fdiv>,
- ADDS_FM_MM<1, 0xf0>, ISA_MICROMIPS;
-def FMUL_MM : MMRel, ADDS_FT<"mul.d", AFGR64Opnd, II_MUL_D, 1, fmul>,
- ADDS_FM_MM<1, 0xb0>, ISA_MICROMIPS;
-def FSUB_MM : MMRel, ADDS_FT<"sub.d", AFGR64Opnd, II_SUB_D, 0, fsub>,
- ADDS_FM_MM<1, 0x70>, ISA_MICROMIPS;
-
-def LWXC1_MM : MMRel, LWXC1_FT<"lwxc1", FGR32Opnd, II_LWXC1, load>,
- LWXC1_FM_MM<0x48>, ISA_MICROMIPS32_NOT_MIPS32R6;
-def SWXC1_MM : MMRel, SWXC1_FT<"swxc1", FGR32Opnd, II_SWXC1, store>,
- SWXC1_FM_MM<0x88>, ISA_MICROMIPS32_NOT_MIPS32R6;
-
-// FIXME: These instruction definitions are incorrect. They should be 64-bit
-// FPU only.
-def LUXC1_MM : MMRel, LWXC1_FT<"luxc1", AFGR64Opnd, II_LUXC1>,
- LWXC1_FM_MM<0x148>, ISA_MICROMIPS32_NOT_MIPS32R6;
-def SUXC1_MM : MMRel, SWXC1_FT<"suxc1", AFGR64Opnd, II_SUXC1>,
- SWXC1_FM_MM<0x188>, ISA_MICROMIPS32_NOT_MIPS32R6;
+defm FADD : ADDS_MMM<"add.d", II_ADD_D, 1, fadd>,
+ ADDS_FM_MM<1, 0x30>, ISA_MICROMIPS;
+defm FDIV : ADDS_MMM<"div.d", II_DIV_D, 0, fdiv>,
+ ADDS_FM_MM<1, 0xf0>, ISA_MICROMIPS;
+defm FMUL : ADDS_MMM<"mul.d", II_MUL_D, 1, fmul>,
+ ADDS_FM_MM<1, 0xb0>, ISA_MICROMIPS;
+defm FSUB : ADDS_MMM<"sub.d", II_SUB_D, 0, fsub>,
+ ADDS_FM_MM<1, 0x70>, ISA_MICROMIPS;
+let DecoderNamespace = "MicroMips" in {
+ def LWXC1_MM : MMRel, LWXC1_FT<"lwxc1", FGR32Opnd, II_LWXC1, load>,
+ LWXC1_FM_MM<0x48>, ISA_MICROMIPS32_NOT_MIPS32R6;
+ def SWXC1_MM : MMRel, SWXC1_FT<"swxc1", FGR32Opnd, II_SWXC1, store>,
+ SWXC1_FM_MM<0x88>, ISA_MICROMIPS32_NOT_MIPS32R6;
+
+ def LUXC1_MM : MMRel, LWXC1_FT<"luxc1", FGR64Opnd, II_LUXC1>,
+ LWXC1_FM_MM<0x148>, FGR_64, ISA_MICROMIPS32_NOT_MIPS32R6;
+ def SUXC1_MM : MMRel, SWXC1_FT<"suxc1", FGR64Opnd, II_SUXC1>,
+ SWXC1_FM_MM<0x188>, FGR_64, ISA_MICROMIPS32_NOT_MIPS32R6;
+}
+let isCodeGenOnly = 1 in {
def FCMP_S32_MM : MMRel, CEQS_FT<"s", FGR32, II_C_CC_S, MipsFPCmp>,
CEQS_FM_MM<0>, ISA_MICROMIPS32_NOT_MIPS32R6 {
  // FIXME: This is required to work around the fact that these instructions
@@ -65,130 +76,174 @@ let DecoderNamespace = "MicroMips" in {
BC1F_FM_MM<0x1c>, ISA_MICROMIPS32_NOT_MIPS32R6;
def BC1T_MM : MMRel, BC1F_FT<"bc1t", brtarget_mm, II_BC1T, MIPS_BRANCH_T>,
BC1F_FM_MM<0x1d>, ISA_MICROMIPS32_NOT_MIPS32R6;
+ def CVT_W_S_MM : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>,
+ ROUND_W_FM_MM<0, 0x24>, ISA_MICROMIPS;
}
-let isCodeGenOnly = 1 in {
-def CVT_W_S_MM : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>,
- ROUND_W_FM_MM<0, 0x24>, ISA_MICROMIPS;
-def ROUND_W_S_MM : MMRel, StdMMR6Rel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd,
- II_ROUND>, ROUND_W_FM_MM<0, 0xec>,
- ISA_MICROMIPS;
-
-def CEIL_W_MM : MMRel, ABSS_FT<"ceil.w.d", FGR32Opnd, AFGR64Opnd, II_CEIL>,
- ROUND_W_FM_MM<1, 0x6c>, ISA_MICROMIPS, FGR_32;
-def CVT_W_MM : MMRel, ABSS_FT<"cvt.w.d", FGR32Opnd, AFGR64Opnd, II_CVT>,
- ROUND_W_FM_MM<1, 0x24>, ISA_MICROMIPS, FGR_32;
-def FLOOR_W_MM : MMRel, ABSS_FT<"floor.w.d", FGR32Opnd, AFGR64Opnd, II_FLOOR>,
- ROUND_W_FM_MM<1, 0x2c>, ISA_MICROMIPS, FGR_32;
-def ROUND_W_MM : MMRel, StdMMR6Rel, ABSS_FT<"round.w.d", FGR32Opnd, AFGR64Opnd,
- II_ROUND>, ROUND_W_FM_MM<1, 0xec>,
- ISA_MICROMIPS, FGR_32;
-def TRUNC_W_MM : MMRel, ABSS_FT<"trunc.w.d", FGR32Opnd, AFGR64Opnd, II_TRUNC>,
- ROUND_W_FM_MM<1, 0xac>, ISA_MICROMIPS, FGR_32;
-
-def FSQRT_MM : MMRel, ABSS_FT<"sqrt.d", AFGR64Opnd, AFGR64Opnd, II_SQRT_D,
- fsqrt>, ROUND_W_FM_MM<1, 0x28>,
- ISA_MICROMIPS, FGR_32;
-
-def CVT_L_S_MM : MMRel, ABSS_FT<"cvt.l.s", FGR64Opnd, FGR32Opnd, II_CVT>,
- ROUND_W_FM_MM<0, 0x4>, ISA_MICROMIPS, FGR_64;
-def CVT_L_D64_MM : MMRel, ABSS_FT<"cvt.l.d", FGR64Opnd, FGR64Opnd, II_CVT>,
- ROUND_W_FM_MM<1, 0x4>, ISA_MICROMIPS, FGR_64;
+let DecoderNamespace = "MicroMips" in {
+ def ROUND_W_S_MM : MMRel, StdMMR6Rel, ABSS_FT<"round.w.s", FGR32Opnd,
+ FGR32Opnd, II_ROUND>,
+ ROUND_W_FM_MM<0, 0xec>, ISA_MICROMIPS;
+
+ def CEIL_W_MM : MMRel, ABSS_FT<"ceil.w.d", FGR32Opnd, AFGR64Opnd, II_CEIL>,
+ ROUND_W_FM_MM<1, 0x6c>, ISA_MICROMIPS, FGR_32;
+ def FLOOR_W_MM : MMRel, ABSS_FT<"floor.w.d", FGR32Opnd, AFGR64Opnd, II_FLOOR>,
+ ROUND_W_FM_MM<1, 0x2c>, ISA_MICROMIPS, FGR_32;
+ def ROUND_W_MM : MMRel, StdMMR6Rel, ABSS_FT<"round.w.d", FGR32Opnd,
+ AFGR64Opnd, II_ROUND>,
+ ROUND_W_FM_MM<1, 0xec>, ISA_MICROMIPS, FGR_32;
+ def TRUNC_W_MM : MMRel, ABSS_FT<"trunc.w.d", FGR32Opnd, AFGR64Opnd, II_TRUNC>,
+ ROUND_W_FM_MM<1, 0xac>, ISA_MICROMIPS, FGR_32;
+
+ def CVT_L_S_MM : MMRel, ABSS_FT<"cvt.l.s", FGR64Opnd, FGR32Opnd, II_CVT>,
+ ROUND_W_FM_MM<0, 0x4>, ISA_MICROMIPS, FGR_64;
+ def CVT_L_D64_MM : MMRel, ABSS_FT<"cvt.l.d", FGR64Opnd, FGR64Opnd, II_CVT>,
+ ROUND_W_FM_MM<1, 0x4>, ISA_MICROMIPS, FGR_64;
+
+ def CVT_W_D32_MM : MMRel, ABSS_FT<"cvt.w.d", FGR32Opnd, AFGR64Opnd, II_CVT>,
+ ROUND_W_FM_MM<1, 0x24>, ISA_MICROMIPS, FGR_32;
+}
+let DecoderNamespace = "MicroMipsFP64" in {
+ def CVT_W_D64_MM : ABSS_FT<"cvt.w.d", FGR32Opnd, FGR64Opnd, II_CVT>,
+ ROUND_W_FM_MM<1, 0x24>, ISA_MICROMIPS, FGR_64;
+}
+multiclass ABSS_MMM<string opstr, InstrItinClass Itin,
+ SDPatternOperator OpNode = null_frag> {
+ def _D32_MM : MMRel, ABSS_FT<opstr, AFGR64Opnd, AFGR64Opnd, Itin, OpNode>,
+ ISA_MICROMIPS, FGR_32 {
+ string DecoderNamespace = "MicroMips";
+ }
+ // FIXME: This needs to be part of the instruction mapping tables.
+ def _D64_MM : ABSS_FT<opstr, FGR64Opnd, FGR64Opnd, Itin, OpNode>,
+ ISA_MICROMIPS, FGR_64 {
+ string DecoderNamespace = "MicroMipsFP64";
+ }
}
+defm FSQRT : ABSS_MMM<"sqrt.d", II_SQRT_D, fsqrt>, ROUND_W_FM_MM<1, 0x28>;
+defm FABS : ABSS_MMM<"abs.d", II_SQRT_D, fabs>, ABS_FM_MM<1, 0xd>;
+
let DecoderNamespace = "MicroMips" in {
def FABS_S_MM : MMRel, ABSS_FT<"abs.s", FGR32Opnd, FGR32Opnd, II_ABS, fabs>,
ABS_FM_MM<0, 0xd>, ISA_MICROMIPS;
- def FABS_MM : MMRel, ABSS_FT<"abs.d", AFGR64Opnd, AFGR64Opnd, II_ABS, fabs>,
- ABS_FM_MM<1, 0xd>, ISA_MICROMIPS, FGR_32;
}
-let isCodeGenOnly = 1 in {
def FMOV_S_MM : MMRel, ABSS_FT<"mov.s", FGR32Opnd, FGR32Opnd, II_MOV_S>,
- ABS_FM_MM<0, 0x1>, ISA_MICROMIPS;
+ ABS_FM_MM<0, 0x1>, ISA_MICROMIPS {
+ let isMoveReg = 1;
+}
def FNEG_S_MM : MMRel, ABSS_FT<"neg.s", FGR32Opnd, FGR32Opnd, II_NEG, fneg>,
ABS_FM_MM<0, 0x2d>, ISA_MICROMIPS;
-def CVT_D_S_MM : MMRel, ABSS_FT<"cvt.d.s", AFGR64Opnd, FGR32Opnd, II_CVT>,
- ABS_FM_MM<0, 0x4d>, ISA_MICROMIPS, FGR_32;
-def CVT_D32_W_MM : MMRel, ABSS_FT<"cvt.d.w", AFGR64Opnd, FGR32Opnd, II_CVT>,
- ABS_FM_MM<1, 0x4d>, ISA_MICROMIPS, FGR_32;
-def CVT_S_D32_MM : MMRel, ABSS_FT<"cvt.s.d", FGR32Opnd, AFGR64Opnd, II_CVT>,
- ABS_FM_MM<0, 0x6d>, ISA_MICROMIPS, FGR_32;
-def CVT_S_W_MM : MMRel, ABSS_FT<"cvt.s.w", FGR32Opnd, FGR32Opnd, II_CVT>,
- ABS_FM_MM<1, 0x6d>, ISA_MICROMIPS;
-
-def FNEG_MM : MMRel, ABSS_FT<"neg.d", AFGR64Opnd, AFGR64Opnd, II_NEG, fneg>,
- ABS_FM_MM<1, 0x2d>, ISA_MICROMIPS, FGR_32;
-
-def FMOV_D32_MM : MMRel, ABSS_FT<"mov.d", AFGR64Opnd, AFGR64Opnd, II_MOV_D>,
- ABS_FM_MM<1, 0x1>, ISA_MICROMIPS, FGR_32;
-
-def MOVZ_I_S_MM : MMRel, CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32Opnd,
- II_MOVZ_S>, CMov_I_F_FM_MM<0x78, 0>,
- ISA_MICROMIPS32_NOT_MIPS32R6;
-def MOVN_I_S_MM : MMRel, CMov_I_F_FT<"movn.s", GPR32Opnd, FGR32Opnd,
+
+let DecoderNamespace = "MicroMips" in {
+ def CVT_D32_S_MM : MMRel, ABSS_FT<"cvt.d.s", AFGR64Opnd, FGR32Opnd, II_CVT>,
+ ABS_FM_MM<0, 0x4d>, ISA_MICROMIPS, FGR_32;
+ def CVT_D32_W_MM : MMRel, ABSS_FT<"cvt.d.w", AFGR64Opnd, FGR32Opnd, II_CVT>,
+ ABS_FM_MM<1, 0x4d>, ISA_MICROMIPS, FGR_32;
+}
+
+let DecoderNamespace = "MicroMipsFP64" in {
+ def CVT_D64_S_MM : ABSS_FT<"cvt.d.s", FGR64Opnd, FGR32Opnd, II_CVT>,
+ ABS_FM_MM<0, 0x4d>, ISA_MICROMIPS, FGR_64;
+ def CVT_D64_W_MM : ABSS_FT<"cvt.d.w", FGR64Opnd, FGR32Opnd, II_CVT>,
+ ABS_FM_MM<1, 0x4d>, ISA_MICROMIPS, FGR_64;
+ def CVT_S_D64_MM : ABSS_FT<"cvt.s.d", FGR32Opnd, FGR64Opnd, II_CVT>,
+ ABS_FM_MM<0, 0x6d>, ISA_MICROMIPS, FGR_64;
+}
+
+let DecoderNamespace = "MicroMips" in {
+ def CVT_S_D32_MM : MMRel, ABSS_FT<"cvt.s.d", FGR32Opnd, AFGR64Opnd, II_CVT>,
+ ABS_FM_MM<0, 0x6d>, ISA_MICROMIPS, FGR_32;
+ def CVT_S_W_MM : MMRel, ABSS_FT<"cvt.s.w", FGR32Opnd, FGR32Opnd, II_CVT>,
+ ABS_FM_MM<1, 0x6d>, ISA_MICROMIPS;
+}
+
+
+defm FNEG : ABSS_MMM<"neg.d", II_NEG, fneg>, ABS_FM_MM<1, 0x2d>;
+defm FMOV : ABSS_MMM<"mov.d", II_MOV_D>, ABS_FM_MM<1, 0x1>;
+
+let DecoderNamespace = "MicroMips" in {
+ def MOVZ_I_S_MM : MMRel, CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32Opnd,
+ II_MOVZ_S>, CMov_I_F_FM_MM<0x78, 0>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def MOVN_I_S_MM : MMRel, CMov_I_F_FT<"movn.s", GPR32Opnd, FGR32Opnd,
II_MOVN_S>, CMov_I_F_FM_MM<0x38, 0>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def MOVZ_I_D32_MM : MMRel, CMov_I_F_FT<"movz.d", GPR32Opnd, AFGR64Opnd,
+ II_MOVZ_D>, CMov_I_F_FM_MM<0x78, 1>,
+ ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
+ def MOVN_I_D32_MM : MMRel, CMov_I_F_FT<"movn.d", GPR32Opnd, AFGR64Opnd,
+ II_MOVN_D>, CMov_I_F_FM_MM<0x38, 1>,
+ ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
+
+ def MOVT_S_MM : MMRel, CMov_F_F_FT<"movt.s", FGR32Opnd, II_MOVT_S,
+ MipsCMovFP_T>, CMov_F_F_FM_MM<0x60, 0>,
ISA_MICROMIPS32_NOT_MIPS32R6;
-def MOVZ_I_D32_MM : MMRel, CMov_I_F_FT<"movz.d", GPR32Opnd, AFGR64Opnd,
- II_MOVZ_D>, CMov_I_F_FM_MM<0x78, 1>,
- ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
-def MOVN_I_D32_MM : MMRel, CMov_I_F_FT<"movn.d", GPR32Opnd, AFGR64Opnd,
- II_MOVN_D>, CMov_I_F_FM_MM<0x38, 1>,
- ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
-
-def MOVT_S_MM : MMRel, CMov_F_F_FT<"movt.s", FGR32Opnd, II_MOVT_S,
- MipsCMovFP_T>, CMov_F_F_FM_MM<0x60, 0>,
- ISA_MICROMIPS32_NOT_MIPS32R6;
-def MOVF_S_MM : MMRel, CMov_F_F_FT<"movf.s", FGR32Opnd, II_MOVF_S,
- MipsCMovFP_F>, CMov_F_F_FM_MM<0x20, 0>,
- ISA_MICROMIPS32_NOT_MIPS32R6;
-def MOVT_D32_MM : MMRel, CMov_F_F_FT<"movt.d", AFGR64Opnd, II_MOVT_D,
+ def MOVF_S_MM : MMRel, CMov_F_F_FT<"movf.s", FGR32Opnd, II_MOVF_S,
+ MipsCMovFP_F>, CMov_F_F_FM_MM<0x20, 0>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def MOVT_D32_MM : MMRel, CMov_F_F_FT<"movt.d", AFGR64Opnd, II_MOVT_D,
MipsCMovFP_T>, CMov_F_F_FM_MM<0x60, 1>,
- ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
-def MOVF_D32_MM : MMRel, CMov_F_F_FT<"movf.d", AFGR64Opnd, II_MOVF_D,
- MipsCMovFP_F>, CMov_F_F_FM_MM<0x20, 1>,
ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
-def MFC1_MM : MMRel, MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd,
- II_MFC1, bitconvert>, MFC1_FM_MM<0x80>,
- ISA_MICROMIPS;
-def MTC1_MM : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd,
- II_MTC1, bitconvert>, MFC1_FM_MM<0xa0>,
- ISA_MICROMIPS;
-
-def MADD_S_MM : MMRel, MADDS_FT<"madd.s", FGR32Opnd, II_MADD_S, fadd>,
- MADDS_FM_MM<0x1>, ISA_MICROMIPS32_NOT_MIPS32R6;
-def MSUB_S_MM : MMRel, MADDS_FT<"msub.s", FGR32Opnd, II_MSUB_S, fsub>,
- MADDS_FM_MM<0x21>, ISA_MICROMIPS32_NOT_MIPS32R6;
-def NMADD_S_MM : MMRel, NMADDS_FT<"nmadd.s", FGR32Opnd, II_NMADD_S, fadd>,
- MADDS_FM_MM<0x2>, ISA_MICROMIPS32_NOT_MIPS32R6;
-def NMSUB_S_MM : MMRel, NMADDS_FT<"nmsub.s", FGR32Opnd, II_NMSUB_S, fsub>,
- MADDS_FM_MM<0x22>, ISA_MICROMIPS32_NOT_MIPS32R6;
-
-def MADD_D32_MM : MMRel, MADDS_FT<"madd.d", AFGR64Opnd, II_MADD_D, fadd>,
- MADDS_FM_MM<0x9>, ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
-def MSUB_D32_MM : MMRel, MADDS_FT<"msub.d", AFGR64Opnd, II_MSUB_D, fsub>,
- MADDS_FM_MM<0x29>, ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
-def NMADD_D32_MM : MMRel, NMADDS_FT<"nmadd.d", AFGR64Opnd, II_NMADD_D, fadd>,
- MADDS_FM_MM<0xa>, ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
-def NMSUB_D32_MM : MMRel, NMADDS_FT<"nmsub.d", AFGR64Opnd, II_NMSUB_D, fsub>,
- MADDS_FM_MM<0x2a>, ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
+ def MOVF_D32_MM : MMRel, CMov_F_F_FT<"movf.d", AFGR64Opnd, II_MOVF_D,
+ MipsCMovFP_F>, CMov_F_F_FM_MM<0x20, 1>,
+ ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
+
+ def MFC1_MM : MMRel, MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd,
+ II_MFC1, bitconvert>, MFC1_FM_MM<0x80>,
+ ISA_MICROMIPS;
+ def MTC1_MM : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd,
+ II_MTC1, bitconvert>, MFC1_FM_MM<0xa0>,
+ ISA_MICROMIPS;
+
+ def MADD_S_MM : MMRel, MADDS_FT<"madd.s", FGR32Opnd, II_MADD_S>,
+ MADDS_FM_MM<0x1>, ISA_MICROMIPS32_NOT_MIPS32R6, MADD4;
+ def MSUB_S_MM : MMRel, MADDS_FT<"msub.s", FGR32Opnd, II_MSUB_S>,
+ MADDS_FM_MM<0x21>, ISA_MICROMIPS32_NOT_MIPS32R6, MADD4;
+ let AdditionalPredicates = [NoNaNsFPMath, HasMadd4] in {
+ def NMADD_S_MM : MMRel, NMADDS_FT<"nmadd.s", FGR32Opnd, II_NMADD_S>,
+ MADDS_FM_MM<0x2>, ISA_MICROMIPS32_NOT_MIPS32R6;
+ def NMSUB_S_MM : MMRel, NMADDS_FT<"nmsub.s", FGR32Opnd, II_NMSUB_S>,
+ MADDS_FM_MM<0x22>, ISA_MICROMIPS32_NOT_MIPS32R6;
+ }
+ def MADD_D32_MM : MMRel, MADDS_FT<"madd.d", AFGR64Opnd, II_MADD_D>,
+ MADDS_FM_MM<0x9>, ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32,
+ MADD4;
+ def MSUB_D32_MM : MMRel, MADDS_FT<"msub.d", AFGR64Opnd, II_MSUB_D>,
+ MADDS_FM_MM<0x29>, ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32,
+ MADD4;
+ let AdditionalPredicates = [NoNaNsFPMath, HasMadd4] in {
+ def NMADD_D32_MM : MMRel, NMADDS_FT<"nmadd.d", AFGR64Opnd, II_NMADD_D>,
+ MADDS_FM_MM<0xa>, ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
+ def NMSUB_D32_MM : MMRel, NMADDS_FT<"nmsub.d", AFGR64Opnd, II_NMSUB_D>,
+ MADDS_FM_MM<0x2a>, ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
+ }
+
+ def FLOOR_W_S_MM : MMRel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd,
+ II_FLOOR>, ROUND_W_FM_MM<0, 0x2c>,
+ ISA_MICROMIPS;
+ def TRUNC_W_S_MM : MMRel, StdMMR6Rel, ABSS_FT<"trunc.w.s", FGR32Opnd,
+ FGR32Opnd, II_TRUNC>,
+ ROUND_W_FM_MM<0, 0xac>, ISA_MICROMIPS;
+ def CEIL_W_S_MM : MMRel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>,
+ ROUND_W_FM_MM<0, 0x6c>, ISA_MICROMIPS;
+
+ def FSQRT_S_MM : MMRel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, II_SQRT_S,
+ fsqrt>, ROUND_W_FM_MM<0, 0x28>, ISA_MICROMIPS;
+
+ def MTHC1_D32_MM : MMRel, MTC1_64_FT<"mthc1", AFGR64Opnd, GPR32Opnd, II_MTHC1>,
+ MFC1_FM_MM<0xe0>, ISA_MICROMIPS, FGR_32;
+ def MFHC1_D32_MM : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, AFGR64Opnd, II_MFHC1>,
+ MFC1_FM_MM<0xc0>, ISA_MICROMIPS, FGR_32;
}
-def FLOOR_W_S_MM : MMRel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd,
- II_FLOOR>, ROUND_W_FM_MM<0, 0x2c>,
- ISA_MICROMIPS;
-def TRUNC_W_S_MM : MMRel, StdMMR6Rel, ABSS_FT<"trunc.w.s", FGR32Opnd,
- FGR32Opnd, II_TRUNC>,
- ROUND_W_FM_MM<0, 0xac>, ISA_MICROMIPS;
-def CEIL_W_S_MM : MMRel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>,
- ROUND_W_FM_MM<0, 0x6c>, ISA_MICROMIPS;
-def FSQRT_S_MM : MMRel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, II_SQRT_S,
- fsqrt>, ROUND_W_FM_MM<0, 0x28>, ISA_MICROMIPS;
-def MTHC1_MM : MMRel, MTC1_64_FT<"mthc1", AFGR64Opnd, GPR32Opnd, II_MTHC1>,
- MFC1_FM_MM<0xe0>, ISA_MICROMIPS, FGR_32;
-def MFHC1_MM : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, AFGR64Opnd, II_MFHC1>,
- MFC1_FM_MM<0xc0>, ISA_MICROMIPS, FGR_32;
+let DecoderNamespace = "MicroMipsFP64" in {
+ def MTHC1_D64_MM : MTC1_64_FT<"mthc1", FGR64Opnd, GPR32Opnd, II_MTHC1>,
+ MFC1_FM_MM<0xe0>, ISA_MICROMIPS, FGR_64;
+ def MFHC1_D64_MM : MFC1_FT<"mfhc1", GPR32Opnd, FGR64Opnd, II_MFHC1>,
+ MFC1_FM_MM<0xc0>, ISA_MICROMIPS, FGR_64;
+}
let DecoderNamespace = "MicroMips" in {
def CFC1_MM : MMRel, MFC1_FT<"cfc1", GPR32Opnd, CCROpnd, II_CFC1>,
@@ -307,11 +362,13 @@ multiclass C_COND_MM<string TypeStr, RegisterOperand RC, bits<2> fmt,
let BaseOpcode = "c.ngt."#NAME;
}
}
+let DecoderNamespace = "MicroMips" in {
+ defm S : C_COND_MM<"s", FGR32Opnd, 0b00, II_C_CC_S>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ defm D32 : C_COND_MM<"d", AFGR64Opnd, 0b01, II_C_CC_D>,
+ ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
+}
-defm S : C_COND_MM<"s", FGR32Opnd, 0b00, II_C_CC_S>,
- ISA_MICROMIPS32_NOT_MIPS32R6;
-defm D32 : C_COND_MM<"d", AFGR64Opnd, 0b01, II_C_CC_D>,
- ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
let DecoderNamespace = "Mips64" in
defm D64 : C_COND_MM<"d", FGR64Opnd, 0b01, II_C_CC_D>,
ISA_MICROMIPS32_NOT_MIPS32R6, FGR_64;
@@ -347,3 +404,36 @@ let AddedComplexity = 40 in {
def : LoadRegImmPat<LWC1_MM, f32, load>, ISA_MICROMIPS;
def : StoreRegImmPat<SWC1_MM, f32>, ISA_MICROMIPS;
}
+
+def : MipsPat<(f32 fpimm0), (MTC1_MM ZERO)>, ISA_MICROMIPS32_NOT_MIPS32R6;
+def : MipsPat<(f32 fpimm0neg), (FNEG_S_MM (MTC1_MM ZERO))>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+
+def : MipsPat<(f32 (fpround FGR64Opnd:$src)),
+ (CVT_S_D64_MM FGR64Opnd:$src)>, ISA_MICROMIPS, FGR_64;
+def : MipsPat<(f64 (fpextend FGR32Opnd:$src)),
+ (CVT_D64_S_MM FGR32Opnd:$src)>, ISA_MICROMIPS, FGR_64;
+def : MipsPat<(f32 (fpround AFGR64Opnd:$src)),
+ (CVT_S_D32_MM AFGR64Opnd:$src)>, ISA_MICROMIPS, FGR_32;
+def : MipsPat<(f64 (fpextend FGR32Opnd:$src)),
+ (CVT_D32_S_MM FGR32Opnd:$src)>, ISA_MICROMIPS, FGR_32;
+def : MipsPat<(MipsTruncIntFP AFGR64Opnd:$src),
+ (TRUNC_W_MM AFGR64Opnd:$src)>, ISA_MICROMIPS32_NOT_MIPS32R6,
+ FGR_32;
+
+// Selects
+defm : MovzPats0<GPR32, FGR32, MOVZ_I_S_MM, SLT_MM, SLTu_MM, SLTi_MM, SLTiu_MM>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+defm : MovzPats1<GPR32, FGR32, MOVZ_I_S_MM, XOR_MM>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+
+defm : MovnPats<GPR32, FGR32, MOVN_I_S_MM, XOR_MM>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+
+defm : MovzPats0<GPR32, AFGR64, MOVZ_I_D32_MM, SLT_MM, SLTu_MM, SLTi_MM,
+ SLTiu_MM>,
+ ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
+defm : MovzPats1<GPR32, AFGR64, MOVZ_I_D32_MM, XOR_MM>,
+ ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
+defm : MovnPats<GPR32, AFGR64, MOVN_I_D32_MM, XOR_MM>,
+ ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
diff --git a/lib/Target/Mips/MicroMipsInstrFormats.td b/lib/Target/Mips/MicroMipsInstrFormats.td
index bc0045dad21e..a9c53e08b810 100644
--- a/lib/Target/Mips/MicroMipsInstrFormats.td
+++ b/lib/Target/Mips/MicroMipsInstrFormats.td
@@ -1,3 +1,16 @@
+//===-- MicroMipsInstrFormats.td - microMIPS Inst Formats -*- tablegen -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the formats of the microMIPS instruction set.
+//
+//===----------------------------------------------------------------------===//
+
//===----------------------------------------------------------------------===//
// MicroMIPS Base Classes
//===----------------------------------------------------------------------===//
@@ -7,8 +20,8 @@
// This class does not depend on the instruction size.
//
class MicroMipsInstBase<dag outs, dag ins, string asmstr, list<dag> pattern,
- InstrItinClass itin, Format f> : Instruction
-{
+ InstrItinClass itin, Format f> : Instruction,
+ PredicateControl {
let Namespace = "Mips";
let DecoderNamespace = "MicroMips";
@@ -19,7 +32,7 @@ class MicroMipsInstBase<dag outs, dag ins, string asmstr, list<dag> pattern,
let Pattern = pattern;
let Itinerary = itin;
- let Predicates = [InMicroMips];
+ let EncodingPredicates = [InMicroMips];
Format Form = f;
}
@@ -406,7 +419,7 @@ class POOL32C_LHUE_FM_MM<bits<6> op, bits<4> fmt, bits<3> funct> : MMArch {
let Inst{8-0} = offset;
}
-class LWL_FM_MM<bits<4> funct> {
+class LWL_FM_MM<bits<4> funct> : MMArch {
bits<5> rt;
bits<21> addr;
@@ -419,7 +432,7 @@ class LWL_FM_MM<bits<4> funct> {
let Inst{11-0} = addr{11-0};
}
-class POOL32C_STEVA_LDEVA_FM_MM<bits<4> type, bits<3> funct> {
+class POOL32C_STEVA_LDEVA_FM_MM<bits<4> type, bits<3> funct> : MMArch {
bits<5> rt;
bits<21> addr;
bits<5> base = addr{20-16};
@@ -600,8 +613,9 @@ class SYNC_FM_MM : MMArch {
}
class SYNCI_FM_MM : MMArch {
- bits<5> rs;
- bits<16> offset;
+ bits<21> addr;
+ bits<5> rs = addr{20-16};
+ bits<16> offset = addr{15-0};
bits<32> Inst;
let Inst{31-26} = 0b010000;
@@ -629,7 +643,7 @@ class SYS_FM_MM : MMArch {
let Inst{5-0} = 0x3c;
}
-class WAIT_FM_MM {
+class WAIT_FM_MM : MMArch {
bits<10> code_;
bits<32> Inst;
@@ -699,7 +713,7 @@ class LL_FM_MM<bits<4> funct> : MMArch {
let Inst{11-0} = addr{11-0};
}
-class LLE_FM_MM<bits<4> funct> {
+class LLE_FM_MM<bits<4> funct> : MMArch {
bits<5> rt;
bits<21> addr;
bits<5> base = addr{20-16};
@@ -730,7 +744,6 @@ class ADDS_FM_MM<bits<2> fmt, bits<8> funct> : MMArch {
let Inst{9-8} = fmt;
let Inst{7-0} = funct;
- list<dag> Pattern = [];
}
class LWXC1_FM_MM<bits<9> funct> : MMArch {
@@ -831,13 +844,13 @@ class ABS_FM_MM<bits<2> fmt, bits<7> funct> : MMArch {
class CMov_F_F_FM_MM<bits<9> func, bits<2> fmt> : MMArch {
bits<5> fd;
bits<5> fs;
-
+ bits<3> fcc;
bits<32> Inst;
let Inst{31-26} = 0x15;
let Inst{25-21} = fd;
let Inst{20-16} = fs;
- let Inst{15-13} = 0x0; //cc
+ let Inst{15-13} = fcc; //cc
let Inst{12-11} = 0x0;
let Inst{10-9} = fmt;
let Inst{8-0} = func;
@@ -961,7 +974,7 @@ class LWM_FM_MM<bits<4> funct> : MMArch {
let Inst{11-0} = addr{11-0};
}
-class LWM_FM_MM16<bits<4> funct> : MMArch, PredicateControl {
+class LWM_FM_MM16<bits<4> funct> : MMArch {
bits<2> rt;
bits<4> addr;
@@ -1053,3 +1066,39 @@ class POOL32A_CFTC2_FM_MM<bits<10> funct> : MMArch {
let Inst{15-6} = funct;
let Inst{5-0} = 0b111100;
}
+
+class POOL32A_TLBINV_FM_MM<bits<10> funct> : MMArch {
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x0;
+ let Inst{25-16} = 0x0;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0b111100;
+}
+
+class POOL32A_MFTC0_FM_MM<bits<5> funct, bits<6> opcode> : MMArch {
+ bits<5> rt;
+ bits<5> rs;
+ bits<3> sel;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-14} = 0;
+ let Inst{13-11} = sel;
+ let Inst{10-6} = funct;
+ let Inst{5-0} = opcode;
+}
+
+class POOL32A_HYPCALL_FM_MM : MMArch {
+ bits<32> Inst;
+
+ bits<10> code_;
+
+ let Inst{31-26} = 0x0;
+ let Inst{25-16} = code_;
+ let Inst{15-6} = 0b1100001101;
+ let Inst{5-0} = 0b111100;
+}
diff --git a/lib/Target/Mips/MicroMipsInstrInfo.td b/lib/Target/Mips/MicroMipsInstrInfo.td
index 64fe55e9776b..ebadb59a0432 100644
--- a/lib/Target/Mips/MicroMipsInstrInfo.td
+++ b/lib/Target/Mips/MicroMipsInstrInfo.td
@@ -1,3 +1,16 @@
+//===--- MicroMipsInstrInfo.td - microMIPS Inst Defs -*- tablegen -*-------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the definitions of the microMIPSr3 instructions.
+//
+//===----------------------------------------------------------------------===//
+
def addrimm11 : ComplexPattern<iPTR, 2, "selectIntAddr11MM", [frameindex]>;
def addrimm12 : ComplexPattern<iPTR, 2, "selectIntAddr12MM", [frameindex]>;
def addrimm16 : ComplexPattern<iPTR, 2, "selectIntAddr16MM", [frameindex]>;
@@ -128,6 +141,7 @@ def mem_mm_16 : Operand<i32> {
let PrintMethod = "printMemOperand";
let MIOperandInfo = (ops ptr_rc, simm16);
let EncoderMethod = "getMemEncodingMMImm16";
+ let DecoderMethod = "DecodeMemMMImm16";
let ParserMatchClass = MipsMemSimm16AsmOperand;
let OperandType = "OPERAND_MEMORY";
}
@@ -201,6 +215,9 @@ class LoadLeftRightMM<string opstr, SDNode OpNode, RegisterOperand RO,
Itin, FrmI> {
let DecoderMethod = "DecodeMemMMImm12";
string Constraints = "$src = $rt";
+ let BaseOpcode = opstr;
+ bit mayLoad = 1;
+ bit mayStore = 0;
}
class StoreLeftRightMM<string opstr, SDNode OpNode, RegisterOperand RO,
@@ -209,6 +226,9 @@ class StoreLeftRightMM<string opstr, SDNode OpNode, RegisterOperand RO,
!strconcat(opstr, "\t$rt, $addr"),
[(OpNode RO:$rt, addrimm12:$addr)], Itin, FrmI> {
let DecoderMethod = "DecodeMemMMImm12";
+ let BaseOpcode = opstr;
+ bit mayLoad = 0;
+ bit mayStore = 1;
}
/// A register pair used by movep instruction.
@@ -231,35 +251,23 @@ MicroMipsInst16<(outs movep_regpair:$dst_regs), (ins RO:$rs, RO:$rt),
!strconcat(opstr, "\t$dst_regs, $rs, $rt"), [],
NoItinerary, FrmR> {
let isReMaterializable = 1;
-}
-
-/// A register pair used by load/store pair instructions.
-def RegPairAsmOperand : AsmOperandClass {
- let Name = "RegPair";
- let ParserMethod = "parseRegisterPair";
- let PredicateMethod = "isRegPair";
-}
-
-def regpair : Operand<i32> {
- let EncoderMethod = "getRegisterPairOpValue";
- let ParserMatchClass = RegPairAsmOperand;
- let PrintMethod = "printRegisterPair";
- let DecoderMethod = "DecodeRegPairOperand";
- let MIOperandInfo = (ops ptr_rc, ptr_rc);
+ let isMoveReg = 1;
}
class StorePairMM<string opstr, ComplexPattern Addr = addr>
- : InstSE<(outs), (ins regpair:$rt, mem_simm12:$addr),
+ : InstSE<(outs), (ins GPR32Opnd:$rt, GPR32Opnd:$rt2, mem_simm12:$addr),
!strconcat(opstr, "\t$rt, $addr"), [], II_SWP, FrmI, opstr> {
let DecoderMethod = "DecodeMemMMImm12";
let mayStore = 1;
+ let AsmMatchConverter = "ConvertXWPOperands";
}
class LoadPairMM<string opstr, ComplexPattern Addr = addr>
- : InstSE<(outs regpair:$rt), (ins mem_simm12:$addr),
+ : InstSE<(outs GPR32Opnd:$rt, GPR32Opnd:$rt2), (ins mem_simm12:$addr),
!strconcat(opstr, "\t$rt, $addr"), [], II_LWP, FrmI, opstr> {
let DecoderMethod = "DecodeMemMMImm12";
let mayLoad = 1;
+ let AsmMatchConverter = "ConvertXWPOperands";
}
class LLBaseMM<string opstr, RegisterOperand RO> :
@@ -273,6 +281,7 @@ class LLEBaseMM<string opstr, RegisterOperand RO> :
InstSE<(outs RO:$rt), (ins mem_simm9:$addr),
!strconcat(opstr, "\t$rt, $addr"), [], II_LLE, FrmI> {
let DecoderMethod = "DecodeMemMMImm9";
+ string BaseOpcode = opstr;
let mayLoad = 1;
}
@@ -288,6 +297,7 @@ class SCEBaseMM<string opstr, RegisterOperand RO> :
InstSE<(outs RO:$dst), (ins RO:$rt, mem_simm9:$addr),
!strconcat(opstr, "\t$rt, $addr"), [], II_SCE, FrmI> {
let DecoderMethod = "DecodeMemMMImm9";
+ string BaseOpcode = opstr;
let mayStore = 1;
let Constraints = "$rt = $dst";
}
@@ -406,12 +416,14 @@ class MoveFromHILOMM<string opstr, RegisterOperand RO, Register UseReg> :
[], II_MFHI_MFLO, FrmR> {
let Uses = [UseReg];
let hasSideEffects = 0;
+ let isMoveReg = 1;
}
class MoveMM16<string opstr, RegisterOperand RO>
: MicroMipsInst16<(outs RO:$rd), (ins RO:$rs),
!strconcat(opstr, "\t$rd, $rs"), [], II_MOVE, FrmR> {
let isReMaterializable = 1;
+ let isMoveReg = 1;
}
class LoadImmMM16<string opstr, Operand Od, RegisterOperand RO> :
@@ -423,7 +435,7 @@ class LoadImmMM16<string opstr, Operand Od, RegisterOperand RO> :
// 16-bit Jump and Link (Call)
class JumpLinkRegMM16<string opstr, RegisterOperand RO> :
MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"),
- [(MipsJmpLink RO:$rs)], II_JALR, FrmR>, PredicateControl {
+ [(MipsJmpLink RO:$rs)], II_JALR, FrmR> {
let isCall = 1;
let hasDelaySlot = 1;
let Defs = [RA];
@@ -586,70 +598,113 @@ class UncondBranchMM16<string opstr> :
let Defs = [AT];
}
-def ADDU16_MM : ArithRMM16<"addu16", GPRMM16Opnd, 1, II_ADDU, add>,
- ARITH_FM_MM16<0>, ISA_MICROMIPS_NOT_32R6;
-def AND16_MM : LogicRMM16<"and16", GPRMM16Opnd, II_AND, and>,
- LOGIC_FM_MM16<0x2>, ISA_MICROMIPS_NOT_32R6;
+class HypcallMM<string opstr> :
+ InstSE<(outs), (ins uimm10:$code_),
+ !strconcat(opstr, "\t$code_"), [], II_HYPCALL, FrmOther> {
+ let BaseOpcode = opstr;
+}
+
+class TLBINVMM<string opstr, InstrItinClass Itin> :
+ InstSE<(outs), (ins), opstr, [], Itin, FrmOther> {
+ let BaseOpcode = opstr;
+}
+
+class MfCop0MM<string opstr, RegisterOperand DstRC,
+ RegisterOperand SrcRC, InstrItinClass Itin> :
+ InstSE<(outs DstRC:$rt), (ins SrcRC:$rs, uimm3:$sel),
+ !strconcat(opstr, "\t$rt, $rs, $sel"), [], Itin, FrmR> {
+ let BaseOpcode = opstr;
+}
+
+class MtCop0MM<string opstr, RegisterOperand DstRC,
+ RegisterOperand SrcRC, InstrItinClass Itin> :
+ InstSE<(outs DstRC:$rs), (ins SrcRC:$rt, uimm3:$sel),
+ !strconcat(opstr, "\t$rt, $rs, $sel"), [], Itin, FrmR> {
+ let BaseOpcode = opstr;
+}
+
+let FastISelShouldIgnore = 1 in {
+ def ADDU16_MM : ArithRMM16<"addu16", GPRMM16Opnd, 1, II_ADDU, add>,
+ ARITH_FM_MM16<0>, ISA_MICROMIPS32_NOT_MIPS32R6;
+ def AND16_MM : LogicRMM16<"and16", GPRMM16Opnd, II_AND, and>,
+ LOGIC_FM_MM16<0x2>, ISA_MICROMIPS32_NOT_MIPS32R6;
+}
+
def ANDI16_MM : AndImmMM16<"andi16", GPRMM16Opnd, II_AND>, ANDI_FM_MM16<0x0b>,
- ISA_MICROMIPS_NOT_32R6;
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def NOT16_MM : NotMM16<"not16", GPRMM16Opnd>, LOGIC_FM_MM16<0x0>,
- ISA_MICROMIPS_NOT_32R6;
-def OR16_MM : LogicRMM16<"or16", GPRMM16Opnd, II_OR, or>, LOGIC_FM_MM16<0x3>,
- ISA_MICROMIPS_NOT_32R6;
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+let FastISelShouldIgnore = 1 in
+ def OR16_MM : LogicRMM16<"or16", GPRMM16Opnd, II_OR, or>, LOGIC_FM_MM16<0x3>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def SLL16_MM : ShiftIMM16<"sll16", uimm3_shift, GPRMM16Opnd, II_SLL>,
- SHIFT_FM_MM16<0>, ISA_MICROMIPS_NOT_32R6;
+ SHIFT_FM_MM16<0>, ISA_MICROMIPS32_NOT_MIPS32R6;
def SRL16_MM : ShiftIMM16<"srl16", uimm3_shift, GPRMM16Opnd, II_SRL>,
- SHIFT_FM_MM16<1>, ISA_MICROMIPS_NOT_32R6;
+ SHIFT_FM_MM16<1>, ISA_MICROMIPS32_NOT_MIPS32R6;
-def SUBU16_MM : ArithRMM16<"subu16", GPRMM16Opnd, 0, II_SUBU, sub>,
- ARITH_FM_MM16<1>, ISA_MICROMIPS_NOT_32R6;
-def XOR16_MM : LogicRMM16<"xor16", GPRMM16Opnd, II_XOR, xor>,
- LOGIC_FM_MM16<0x1>, ISA_MICROMIPS_NOT_32R6;
+let FastISelShouldIgnore = 1 in {
+ def SUBU16_MM : ArithRMM16<"subu16", GPRMM16Opnd, 0, II_SUBU, sub>,
+ ARITH_FM_MM16<1>, ISA_MICROMIPS32_NOT_MIPS32R6;
+ def XOR16_MM : LogicRMM16<"xor16", GPRMM16Opnd, II_XOR, xor>,
+ LOGIC_FM_MM16<0x1>, ISA_MICROMIPS32_NOT_MIPS32R6;
+}
def LBU16_MM : LoadMM16<"lbu16", GPRMM16Opnd, zextloadi8, II_LBU,
- mem_mm_4>, LOAD_STORE_FM_MM16<0x02>;
+ mem_mm_4>, LOAD_STORE_FM_MM16<0x02>, ISA_MICROMIPS;
def LHU16_MM : LoadMM16<"lhu16", GPRMM16Opnd, zextloadi16, II_LHU,
- mem_mm_4_lsl1>, LOAD_STORE_FM_MM16<0x0a>;
+ mem_mm_4_lsl1>, LOAD_STORE_FM_MM16<0x0a>, ISA_MICROMIPS;
def LW16_MM : LoadMM16<"lw16", GPRMM16Opnd, load, II_LW, mem_mm_4_lsl2>,
- LOAD_STORE_FM_MM16<0x1a>;
+ LOAD_STORE_FM_MM16<0x1a>, ISA_MICROMIPS;
def SB16_MM : StoreMM16<"sb16", GPRMM16OpndZero, GPRMM16Opnd, truncstorei8,
- II_SB, mem_mm_4>, LOAD_STORE_FM_MM16<0x22>;
+ II_SB, mem_mm_4>, LOAD_STORE_FM_MM16<0x22>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def SH16_MM : StoreMM16<"sh16", GPRMM16OpndZero, GPRMM16Opnd, truncstorei16,
II_SH, mem_mm_4_lsl1>,
- LOAD_STORE_FM_MM16<0x2a>;
+ LOAD_STORE_FM_MM16<0x2a>, ISA_MICROMIPS32_NOT_MIPS32R6;
def SW16_MM : StoreMM16<"sw16", GPRMM16OpndZero, GPRMM16Opnd, store, II_SW,
- mem_mm_4_lsl2>, LOAD_STORE_FM_MM16<0x3a>;
+ mem_mm_4_lsl2>, LOAD_STORE_FM_MM16<0x3a>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def LWGP_MM : LoadGPMM16<"lw", GPRMM16Opnd, II_LW, mem_mm_gp_simm7_lsl2>,
- LOAD_GP_FM_MM16<0x19>;
+ LOAD_GP_FM_MM16<0x19>, ISA_MICROMIPS;
def LWSP_MM : LoadSPMM16<"lw", GPR32Opnd, II_LW, mem_mm_sp_imm5_lsl2>,
- LOAD_STORE_SP_FM_MM16<0x12>;
+ LOAD_STORE_SP_FM_MM16<0x12>, ISA_MICROMIPS;
def SWSP_MM : StoreSPMM16<"sw", GPR32Opnd, II_SW, mem_mm_sp_imm5_lsl2>,
- LOAD_STORE_SP_FM_MM16<0x32>;
-def ADDIUR1SP_MM : AddImmUR1SP<"addiur1sp", GPRMM16Opnd>, ADDIUR1SP_FM_MM16;
-def ADDIUR2_MM : AddImmUR2<"addiur2", GPRMM16Opnd>, ADDIUR2_FM_MM16;
-def ADDIUS5_MM : AddImmUS5<"addius5", GPR32Opnd>, ADDIUS5_FM_MM16;
-def ADDIUSP_MM : AddImmUSP<"addiusp">, ADDIUSP_FM_MM16;
-def MFHI16_MM : MoveFromHILOMM<"mfhi", GPR32Opnd, AC0>, MFHILO_FM_MM16<0x10>;
-def MFLO16_MM : MoveFromHILOMM<"mflo", GPR32Opnd, AC0>, MFHILO_FM_MM16<0x12>;
-def MOVE16_MM : MoveMM16<"move", GPR32Opnd>, MOVE_FM_MM16<0x03>;
+ LOAD_STORE_SP_FM_MM16<0x32>, ISA_MICROMIPS32_NOT_MIPS32R6;
+def ADDIUR1SP_MM : AddImmUR1SP<"addiur1sp", GPRMM16Opnd>, ADDIUR1SP_FM_MM16,
+ ISA_MICROMIPS;
+def ADDIUR2_MM : AddImmUR2<"addiur2", GPRMM16Opnd>, ADDIUR2_FM_MM16,
+ ISA_MICROMIPS;
+def ADDIUS5_MM : AddImmUS5<"addius5", GPR32Opnd>, ADDIUS5_FM_MM16,
+ ISA_MICROMIPS;
+def ADDIUSP_MM : AddImmUSP<"addiusp">, ADDIUSP_FM_MM16, ISA_MICROMIPS;
+def MFHI16_MM : MoveFromHILOMM<"mfhi16", GPR32Opnd, AC0>,
+ MFHILO_FM_MM16<0x10>, ISA_MICROMIPS32_NOT_MIPS32R6;
+def MFLO16_MM : MoveFromHILOMM<"mflo16", GPR32Opnd, AC0>,
+ MFHILO_FM_MM16<0x12>, ISA_MICROMIPS32_NOT_MIPS32R6;
+def MOVE16_MM : MoveMM16<"move", GPR32Opnd>, MOVE_FM_MM16<0x03>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def MOVEP_MM : MovePMM16<"movep", GPRMM16OpndMoveP>, MOVEP_FM_MM16,
- ISA_MICROMIPS_NOT_32R6;
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def LI16_MM : LoadImmMM16<"li16", li16_imm, GPRMM16Opnd>, LI_FM_MM16,
- IsAsCheapAsAMove;
+ IsAsCheapAsAMove, ISA_MICROMIPS32_NOT_MIPS32R6;
def JALR16_MM : JumpLinkRegMM16<"jalr", GPR32Opnd>, JALR_FM_MM16<0x0e>,
ISA_MICROMIPS32_NOT_MIPS32R6;
-def JALRS16_MM : JumpLinkRegSMM16<"jalrs16", GPR32Opnd>, JALR_FM_MM16<0x0f>;
-def JRC16_MM : JumpRegCMM16<"jrc", GPR32Opnd>, JALR_FM_MM16<0x0d>;
-def JRADDIUSP : JumpRAddiuStackMM16, JRADDIUSP_FM_MM16<0x18>;
-def JR16_MM : JumpRegMM16<"jr16", GPR32Opnd>, JALR_FM_MM16<0x0c>;
+def JALRS16_MM : JumpLinkRegSMM16<"jalrs16", GPR32Opnd>, JALR_FM_MM16<0x0f>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+def JRC16_MM : JumpRegCMM16<"jrc", GPR32Opnd>, JALR_FM_MM16<0x0d>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+def JRADDIUSP : JumpRAddiuStackMM16, JRADDIUSP_FM_MM16<0x18>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+def JR16_MM : JumpRegMM16<"jr16", GPR32Opnd>, JALR_FM_MM16<0x0c>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def BEQZ16_MM : CBranchZeroMM<"beqz16", brtarget7_mm, GPRMM16Opnd>,
- BEQNEZ_FM_MM16<0x23>;
+ BEQNEZ_FM_MM16<0x23>, ISA_MICROMIPS32_NOT_MIPS32R6;
def BNEZ16_MM : CBranchZeroMM<"bnez16", brtarget7_mm, GPRMM16Opnd>,
- BEQNEZ_FM_MM16<0x2b>;
-def B16_MM : UncondBranchMM16<"b16">, B16_FM;
+ BEQNEZ_FM_MM16<0x2b>, ISA_MICROMIPS32_NOT_MIPS32R6;
+def B16_MM : UncondBranchMM16<"b16">, B16_FM, ISA_MICROMIPS32_NOT_MIPS32R6;
def BREAK16_MM : BrkSdbbp16MM<"break16", II_BREAK>, BRKSDBBP16_FM_MM<0x28>,
- ISA_MICROMIPS_NOT_32R6;
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def SDBBP16_MM : BrkSdbbp16MM<"sdbbp16", II_SDBBP>, BRKSDBBP16_FM_MM<0x2C>,
- ISA_MICROMIPS_NOT_32R6;
+ ISA_MICROMIPS32_NOT_MIPS32R6;
let DecoderNamespace = "MicroMips" in {
/// Load and Store Instructions - multiple
@@ -657,175 +712,196 @@ let DecoderNamespace = "MicroMips" in {
ISA_MICROMIPS32_NOT_MIPS32R6;
def LWM16_MM : LoadMultMM16<"lwm16", II_LWM>, LWM_FM_MM16<0x4>,
ISA_MICROMIPS32_NOT_MIPS32R6;
- let AdditionalPredicates = [InMicroMips] in {
- def CFC2_MM : InstSE<(outs GPR32Opnd:$rt), (ins COP2Opnd:$impl),
- "cfc2\t$rt, $impl", [], II_CFC2, FrmFR, "cfc2">,
- POOL32A_CFTC2_FM_MM<0b1100110100>;
- def CTC2_MM : InstSE<(outs COP2Opnd:$impl), (ins GPR32Opnd:$rt),
- "ctc2\t$rt, $impl", [], II_CTC2, FrmFR, "ctc2">,
- POOL32A_CFTC2_FM_MM<0b1101110100>;
- }
+ def CFC2_MM : InstSE<(outs GPR32Opnd:$rt), (ins COP2Opnd:$impl),
+ "cfc2\t$rt, $impl", [], II_CFC2, FrmFR, "cfc2">,
+ POOL32A_CFTC2_FM_MM<0b1100110100>, ISA_MICROMIPS;
+ def CTC2_MM : InstSE<(outs COP2Opnd:$impl), (ins GPR32Opnd:$rt),
+ "ctc2\t$rt, $impl", [], II_CTC2, FrmFR, "ctc2">,
+ POOL32A_CFTC2_FM_MM<0b1101110100>, ISA_MICROMIPS;
}
class WaitMM<string opstr> :
InstSE<(outs), (ins uimm10:$code_), !strconcat(opstr, "\t$code_"), [],
II_WAIT, FrmOther, opstr>;
-let DecoderNamespace = "MicroMips", Predicates = [InMicroMips, NotMips32r6,
- NotMips64r6] in {
+let DecoderNamespace = "MicroMips" in {
/// Compact Branch Instructions
def BEQZC_MM : CompactBranchMM<"beqzc", brtarget_mm, seteq, GPR32Opnd>,
- COMPACT_BRANCH_FM_MM<0x7>;
+ COMPACT_BRANCH_FM_MM<0x7>, ISA_MICROMIPS32_NOT_MIPS32R6;
def BNEZC_MM : CompactBranchMM<"bnezc", brtarget_mm, setne, GPR32Opnd>,
- COMPACT_BRANCH_FM_MM<0x5>;
-}
-let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
+ COMPACT_BRANCH_FM_MM<0x5>, ISA_MICROMIPS32_NOT_MIPS32R6;
+
/// Arithmetic Instructions (ALU Immediate)
def ADDiu_MM : MMRel, ArithLogicI<"addiu", simm16, GPR32Opnd, II_ADDIU>,
- ADDI_FM_MM<0xc>;
+ ADDI_FM_MM<0xc>, ISA_MICROMIPS32_NOT_MIPS32R6;
def ADDi_MM : MMRel, ArithLogicI<"addi", simm16, GPR32Opnd, II_ADDI>,
- ADDI_FM_MM<0x4>;
+ ADDI_FM_MM<0x4>, ISA_MICROMIPS32_NOT_MIPS32R6;
def SLTi_MM : MMRel, SetCC_I<"slti", setlt, simm16, immSExt16, GPR32Opnd>,
- SLTI_FM_MM<0x24>;
+ SLTI_FM_MM<0x24>, ISA_MICROMIPS;
def SLTiu_MM : MMRel, SetCC_I<"sltiu", setult, simm16, immSExt16, GPR32Opnd>,
- SLTI_FM_MM<0x2c>;
+ SLTI_FM_MM<0x2c>, ISA_MICROMIPS;
def ANDi_MM : MMRel, ArithLogicI<"andi", uimm16, GPR32Opnd, II_ANDI>,
- ADDI_FM_MM<0x34>;
+ ADDI_FM_MM<0x34>, ISA_MICROMIPS32_NOT_MIPS32R6;
def ORi_MM : MMRel, ArithLogicI<"ori", uimm16, GPR32Opnd, II_ORI, immZExt16,
- or>, ADDI_FM_MM<0x14>;
+ or>, ADDI_FM_MM<0x14>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def XORi_MM : MMRel, ArithLogicI<"xori", uimm16, GPR32Opnd, II_XORI,
- immZExt16, xor>, ADDI_FM_MM<0x1c>;
- def LUi_MM : MMRel, LoadUpper<"lui", GPR32Opnd, uimm16_relaxed>, LUI_FM_MM;
+ immZExt16, xor>, ADDI_FM_MM<0x1c>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def LUi_MM : MMRel, LoadUpper<"lui", GPR32Opnd, uimm16_relaxed>, LUI_FM_MM,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def LEA_ADDiu_MM : MMRel, EffectiveAddress<"addiu", GPR32Opnd>,
- LW_FM_MM<0xc>;
+ LW_FM_MM<0xc>, ISA_MICROMIPS;
/// Arithmetic Instructions (3-Operand, R-Type)
def ADDu_MM : MMRel, ArithLogicR<"addu", GPR32Opnd, 1, II_ADDU, add>,
- ADD_FM_MM<0, 0x150>;
+ ADD_FM_MM<0, 0x150>, ISA_MICROMIPS32_NOT_MIPS32R6;
def SUBu_MM : MMRel, ArithLogicR<"subu", GPR32Opnd, 0, II_SUBU, sub>,
- ADD_FM_MM<0, 0x1d0>;
- def MUL_MM : MMRel, ArithLogicR<"mul", GPR32Opnd, 1, II_MUL>,
- ADD_FM_MM<0, 0x210>;
+ ADD_FM_MM<0, 0x1d0>, ISA_MICROMIPS32_NOT_MIPS32R6;
+ let Defs = [HI0, LO0] in
+ def MUL_MM : MMRel, ArithLogicR<"mul", GPR32Opnd, 1, II_MUL, mul>,
+ ADD_FM_MM<0, 0x210>, ISA_MICROMIPS32_NOT_MIPS32R6;
def ADD_MM : MMRel, ArithLogicR<"add", GPR32Opnd, 1, II_ADD>,
- ADD_FM_MM<0, 0x110>;
+ ADD_FM_MM<0, 0x110>, ISA_MICROMIPS32_NOT_MIPS32R6;
def SUB_MM : MMRel, ArithLogicR<"sub", GPR32Opnd, 0, II_SUB>,
- ADD_FM_MM<0, 0x190>;
- def SLT_MM : MMRel, SetCC_R<"slt", setlt, GPR32Opnd>, ADD_FM_MM<0, 0x350>;
+ ADD_FM_MM<0, 0x190>, ISA_MICROMIPS32_NOT_MIPS32R6;
+ def SLT_MM : MMRel, SetCC_R<"slt", setlt, GPR32Opnd>, ADD_FM_MM<0, 0x350>,
+ ISA_MICROMIPS;
def SLTu_MM : MMRel, SetCC_R<"sltu", setult, GPR32Opnd>,
- ADD_FM_MM<0, 0x390>;
+ ADD_FM_MM<0, 0x390>, ISA_MICROMIPS;
def AND_MM : MMRel, ArithLogicR<"and", GPR32Opnd, 1, II_AND, and>,
- ADD_FM_MM<0, 0x250>;
+ ADD_FM_MM<0, 0x250>, ISA_MICROMIPS32_NOT_MIPS32R6;
def OR_MM : MMRel, ArithLogicR<"or", GPR32Opnd, 1, II_OR, or>,
- ADD_FM_MM<0, 0x290>;
+ ADD_FM_MM<0, 0x290>, ISA_MICROMIPS32_NOT_MIPS32R6;
def XOR_MM : MMRel, ArithLogicR<"xor", GPR32Opnd, 1, II_XOR, xor>,
- ADD_FM_MM<0, 0x310>;
- def NOR_MM : MMRel, LogicNOR<"nor", GPR32Opnd>, ADD_FM_MM<0, 0x2d0>;
+ ADD_FM_MM<0, 0x310>, ISA_MICROMIPS32_NOT_MIPS32R6;
+ def NOR_MM : MMRel, LogicNOR<"nor", GPR32Opnd>, ADD_FM_MM<0, 0x2d0>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def MULT_MM : MMRel, Mult<"mult", II_MULT, GPR32Opnd, [HI0, LO0]>,
- MULT_FM_MM<0x22c>;
+ MULT_FM_MM<0x22c>, ISA_MICROMIPS32_NOT_MIPS32R6;
def MULTu_MM : MMRel, Mult<"multu", II_MULTU, GPR32Opnd, [HI0, LO0]>,
- MULT_FM_MM<0x26c>;
+ MULT_FM_MM<0x26c>, ISA_MICROMIPS32_NOT_MIPS32R6;
def SDIV_MM : MMRel, Div<"div", II_DIV, GPR32Opnd, [HI0, LO0]>,
- MULT_FM_MM<0x2ac>, ISA_MIPS1_NOT_32R6_64R6;
+ MULT_FM_MM<0x2ac>, ISA_MICROMIPS32_NOT_MIPS32R6;
def UDIV_MM : MMRel, Div<"divu", II_DIVU, GPR32Opnd, [HI0, LO0]>,
- MULT_FM_MM<0x2ec>, ISA_MIPS1_NOT_32R6_64R6;
+ MULT_FM_MM<0x2ec>, ISA_MICROMIPS32_NOT_MIPS32R6;
/// Arithmetic Instructions with PC and Immediate
- def ADDIUPC_MM : AddImmUPC<"addiupc", GPRMM16Opnd>, ADDIUPC_FM_MM;
+ def ADDIUPC_MM : AddImmUPC<"addiupc", GPRMM16Opnd>, ADDIUPC_FM_MM,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
/// Shift Instructions
def SLL_MM : MMRel, shift_rotate_imm<"sll", uimm5, GPR32Opnd, II_SLL>,
- SRA_FM_MM<0, 0>;
+ SRA_FM_MM<0, 0>, ISA_MICROMIPS;
def SRL_MM : MMRel, shift_rotate_imm<"srl", uimm5, GPR32Opnd, II_SRL>,
- SRA_FM_MM<0x40, 0>;
+ SRA_FM_MM<0x40, 0>, ISA_MICROMIPS;
def SRA_MM : MMRel, shift_rotate_imm<"sra", uimm5, GPR32Opnd, II_SRA>,
- SRA_FM_MM<0x80, 0>;
+ SRA_FM_MM<0x80, 0>, ISA_MICROMIPS;
def SLLV_MM : MMRel, shift_rotate_reg<"sllv", GPR32Opnd, II_SLLV>,
- SRLV_FM_MM<0x10, 0>;
+ SRLV_FM_MM<0x10, 0>, ISA_MICROMIPS;
def SRLV_MM : MMRel, shift_rotate_reg<"srlv", GPR32Opnd, II_SRLV>,
- SRLV_FM_MM<0x50, 0>;
+ SRLV_FM_MM<0x50, 0>, ISA_MICROMIPS;
def SRAV_MM : MMRel, shift_rotate_reg<"srav", GPR32Opnd, II_SRAV>,
- SRLV_FM_MM<0x90, 0>;
+ SRLV_FM_MM<0x90, 0>, ISA_MICROMIPS;
def ROTR_MM : MMRel, shift_rotate_imm<"rotr", uimm5, GPR32Opnd, II_ROTR>,
- SRA_FM_MM<0xc0, 0> {
+ SRA_FM_MM<0xc0, 0>, ISA_MICROMIPS {
list<dag> Pattern = [(set GPR32Opnd:$rd,
(rotr GPR32Opnd:$rt, immZExt5:$shamt))];
}
def ROTRV_MM : MMRel, shift_rotate_reg<"rotrv", GPR32Opnd, II_ROTRV>,
- SRLV_FM_MM<0xd0, 0> {
+ SRLV_FM_MM<0xd0, 0>, ISA_MICROMIPS {
list<dag> Pattern = [(set GPR32Opnd:$rd,
(rotr GPR32Opnd:$rt, GPR32Opnd:$rs))];
}
/// Load and Store Instructions - aligned
let DecoderMethod = "DecodeMemMMImm16" in {
- def LB_MM : LoadMemory<"lb", GPR32Opnd, mem_mm_16, null_frag, II_LB>,
- MMRel, LW_FM_MM<0x7>;
- def LBu_MM : LoadMemory<"lbu", GPR32Opnd, mem_mm_16, null_frag, II_LBU>,
- MMRel, LW_FM_MM<0x5>;
- def LH_MM : LoadMemory<"lh", GPR32Opnd, mem_simm16, sextloadi16, II_LH,
- addrDefault>, MMRel, LW_FM_MM<0xf>;
- def LHu_MM : LoadMemory<"lhu", GPR32Opnd, mem_simm16, zextloadi16, II_LHU>,
- MMRel, LW_FM_MM<0xd>;
- def LW_MM : Load<"lw", GPR32Opnd, null_frag, II_LW>, MMRel, LW_FM_MM<0x3f>;
- def SB_MM : Store<"sb", GPR32Opnd, null_frag, II_SB>, MMRel,
- LW_FM_MM<0x6>;
- def SH_MM : Store<"sh", GPR32Opnd, null_frag, II_SH>, MMRel,
- LW_FM_MM<0xe>;
+ def LB_MM : LoadMemory<"lb", GPR32Opnd, mem_mm_16, sextloadi8, II_LB>,
+ MMRel, LW_FM_MM<0x7>, ISA_MICROMIPS;
+ def LBu_MM : LoadMemory<"lbu", GPR32Opnd, mem_mm_16, zextloadi8, II_LBU>,
+ MMRel, LW_FM_MM<0x5>, ISA_MICROMIPS;
+ def LH_MM : LoadMemory<"lh", GPR32Opnd, mem_simmptr, sextloadi16, II_LH,
+ addrDefault>, MMRel, LW_FM_MM<0xf>, ISA_MICROMIPS;
+ def LHu_MM : LoadMemory<"lhu", GPR32Opnd, mem_simmptr, zextloadi16, II_LHU>,
+ MMRel, LW_FM_MM<0xd>, ISA_MICROMIPS;
+ def LW_MM : Load<"lw", GPR32Opnd, null_frag, II_LW>, MMRel, LW_FM_MM<0x3f>,
+ ISA_MICROMIPS;
+ def SB_MM : Store<"sb", GPR32Opnd, truncstorei8, II_SB>, MMRel,
+ LW_FM_MM<0x6>, ISA_MICROMIPS;
+ def SH_MM : Store<"sh", GPR32Opnd, truncstorei16, II_SH>, MMRel,
+ LW_FM_MM<0xe>, ISA_MICROMIPS;
def SW_MM : Store<"sw", GPR32Opnd, null_frag, II_SW>, MMRel,
- LW_FM_MM<0x3e>;
+ LW_FM_MM<0x3e>, ISA_MICROMIPS;
}
-
+}
+let DecoderNamespace = "MicroMips" in {
let DecoderMethod = "DecodeMemMMImm9" in {
- def LBE_MM : Load<"lbe", GPR32Opnd, null_frag, II_LBE>,
- POOL32C_LHUE_FM_MM<0x18, 0x6, 0x4>;
- def LBuE_MM : Load<"lbue", GPR32Opnd, null_frag, II_LBUE>,
- POOL32C_LHUE_FM_MM<0x18, 0x6, 0x0>;
- def LHE_MM : LoadMemory<"lhe", GPR32Opnd, mem_simm9, null_frag, II_LHE>,
- POOL32C_LHUE_FM_MM<0x18, 0x6, 0x5>;
- def LHuE_MM : LoadMemory<"lhue", GPR32Opnd, mem_simm9, null_frag, II_LHUE>,
- POOL32C_LHUE_FM_MM<0x18, 0x6, 0x1>;
- def LWE_MM : LoadMemory<"lwe", GPR32Opnd, mem_simm9, null_frag, II_LWE>,
- POOL32C_LHUE_FM_MM<0x18, 0x6, 0x7>;
- def SBE_MM : StoreMemory<"sbe", GPR32Opnd, mem_simm9, null_frag, II_SBE>,
- POOL32C_LHUE_FM_MM<0x18, 0xa, 0x4>;
- def SHE_MM : StoreMemory<"she", GPR32Opnd, mem_simm9, null_frag, II_SHE>,
- POOL32C_LHUE_FM_MM<0x18, 0xa, 0x5>;
- def SWE_MM : StoreMemory<"swe", GPR32Opnd, mem_simm9, null_frag, II_SWE>,
- POOL32C_LHUE_FM_MM<0x18, 0xa, 0x7>;
+ def LBE_MM : MMRel, Load<"lbe", GPR32Opnd, null_frag, II_LBE>,
+ POOL32C_LHUE_FM_MM<0x18, 0x6, 0x4>, ISA_MICROMIPS, ASE_EVA;
+ def LBuE_MM : MMRel, Load<"lbue", GPR32Opnd, null_frag, II_LBUE>,
+ POOL32C_LHUE_FM_MM<0x18, 0x6, 0x0>, ISA_MICROMIPS, ASE_EVA;
+ def LHE_MM : MMRel, LoadMemory<"lhe", GPR32Opnd, mem_simm9, null_frag,
+ II_LHE>,
+ POOL32C_LHUE_FM_MM<0x18, 0x6, 0x5>, ISA_MICROMIPS, ASE_EVA;
+ def LHuE_MM : MMRel, LoadMemory<"lhue", GPR32Opnd, mem_simm9, null_frag,
+ II_LHUE>,
+ POOL32C_LHUE_FM_MM<0x18, 0x6, 0x1>, ISA_MICROMIPS, ASE_EVA;
+ def LWE_MM : MMRel, LoadMemory<"lwe", GPR32Opnd, mem_simm9, null_frag,
+ II_LWE>,
+ POOL32C_LHUE_FM_MM<0x18, 0x6, 0x7>, ISA_MICROMIPS, ASE_EVA;
+ def SBE_MM : MMRel, StoreMemory<"sbe", GPR32Opnd, mem_simm9, null_frag,
+ II_SBE>,
+ POOL32C_LHUE_FM_MM<0x18, 0xa, 0x4>, ISA_MICROMIPS, ASE_EVA;
+ def SHE_MM : MMRel, StoreMemory<"she", GPR32Opnd, mem_simm9, null_frag,
+ II_SHE>,
+ POOL32C_LHUE_FM_MM<0x18, 0xa, 0x5>, ISA_MICROMIPS, ASE_EVA;
+ def SWE_MM : MMRel, StoreMemory<"swe", GPR32Opnd, mem_simm9, null_frag,
+ II_SWE>,
+ POOL32C_LHUE_FM_MM<0x18, 0xa, 0x7>, ISA_MICROMIPS, ASE_EVA;
+ def LWLE_MM : MMRel, LoadLeftRightMM<"lwle", MipsLWL, GPR32Opnd, mem_mm_9,
+ II_LWLE>,
+ POOL32C_STEVA_LDEVA_FM_MM<0x6, 0x2>,
+ ISA_MICROMIPS32_NOT_MIPS32R6, ASE_EVA;
+ def LWRE_MM : MMRel, LoadLeftRightMM<"lwre", MipsLWR, GPR32Opnd, mem_mm_9,
+ II_LWRE>,
+ POOL32C_STEVA_LDEVA_FM_MM<0x6, 0x3>,
+ ISA_MICROMIPS32_NOT_MIPS32R6, ASE_EVA;
+ def SWLE_MM : MMRel, StoreLeftRightMM<"swle", MipsSWL, GPR32Opnd, mem_mm_9,
+ II_SWLE>,
+ POOL32C_STEVA_LDEVA_FM_MM<0xa, 0x0>,
+ ISA_MICROMIPS32_NOT_MIPS32R6, ASE_EVA;
+ def SWRE_MM : MMRel, StoreLeftRightMM<"swre", MipsSWR, GPR32Opnd, mem_mm_9,
+ II_SWRE>,
+ POOL32C_STEVA_LDEVA_FM_MM<0xa, 0x1>,
+ ISA_MICROMIPS32_NOT_MIPS32R6, ASE_EVA;
}
- def LWXS_MM : LoadWordIndexedScaledMM<"lwxs", GPR32Opnd>, LWXS_FM_MM<0x118>;
+ def LWXS_MM : LoadWordIndexedScaledMM<"lwxs", GPR32Opnd>, LWXS_FM_MM<0x118>,
+ ISA_MICROMIPS;
/// Load and Store Instructions - unaligned
- def LWL_MM : LoadLeftRightMM<"lwl", MipsLWL, GPR32Opnd, mem_mm_12, II_LWL>,
- LWL_FM_MM<0x0>;
- def LWR_MM : LoadLeftRightMM<"lwr", MipsLWR, GPR32Opnd, mem_mm_12, II_LWR>,
- LWL_FM_MM<0x1>;
- def SWL_MM : StoreLeftRightMM<"swl", MipsSWL, GPR32Opnd, mem_mm_12, II_SWL>,
- LWL_FM_MM<0x8>;
- def SWR_MM : StoreLeftRightMM<"swr", MipsSWR, GPR32Opnd, mem_mm_12, II_SWR>,
- LWL_FM_MM<0x9>;
- let DecoderMethod = "DecodeMemMMImm9" in {
- def LWLE_MM : LoadLeftRightMM<"lwle", MipsLWL, GPR32Opnd, mem_mm_9,
- II_LWLE>, POOL32C_STEVA_LDEVA_FM_MM<0x6, 0x2>;
- def LWRE_MM : LoadLeftRightMM<"lwre", MipsLWR, GPR32Opnd, mem_mm_9,
- II_LWRE>, POOL32C_STEVA_LDEVA_FM_MM<0x6, 0x3>;
- def SWLE_MM : StoreLeftRightMM<"swle", MipsSWL, GPR32Opnd, mem_mm_9,
- II_SWLE>,
- POOL32C_STEVA_LDEVA_FM_MM<0xa, 0x0>;
- def SWRE_MM : StoreLeftRightMM<"swre", MipsSWR, GPR32Opnd, mem_mm_9,
- II_SWRE>,
- POOL32C_STEVA_LDEVA_FM_MM<0xa, 0x1>, ISA_MIPS1_NOT_32R6_64R6;
- }
-
+ def LWL_MM : MMRel, LoadLeftRightMM<"lwl", MipsLWL, GPR32Opnd, mem_mm_12,
+ II_LWL>, LWL_FM_MM<0x0>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def LWR_MM : MMRel, LoadLeftRightMM<"lwr", MipsLWR, GPR32Opnd, mem_mm_12,
+ II_LWR>, LWL_FM_MM<0x1>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def SWL_MM : MMRel, StoreLeftRightMM<"swl", MipsSWL, GPR32Opnd, mem_mm_12,
+ II_SWL>, LWL_FM_MM<0x8>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def SWR_MM : MMRel, StoreLeftRightMM<"swr", MipsSWR, GPR32Opnd, mem_mm_12,
+ II_SWR>, LWL_FM_MM<0x9>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+}
+let DecoderNamespace = "MicroMips" in {
/// Load and Store Instructions - multiple
- def SWM32_MM : StoreMultMM<"swm32", II_SWM>, LWM_FM_MM<0xd>;
- def LWM32_MM : LoadMultMM<"lwm32", II_LWM>, LWM_FM_MM<0x5>;
+ def SWM32_MM : StoreMultMM<"swm32", II_SWM>, LWM_FM_MM<0xd>, ISA_MICROMIPS;
+ def LWM32_MM : LoadMultMM<"lwm32", II_LWM>, LWM_FM_MM<0x5>, ISA_MICROMIPS;
/// Load and Store Pair Instructions
- def SWP_MM : StorePairMM<"swp">, LWM_FM_MM<0x9>;
- def LWP_MM : LoadPairMM<"lwp">, LWM_FM_MM<0x1>;
+ def SWP_MM : StorePairMM<"swp">, LWM_FM_MM<0x9>, ISA_MICROMIPS;
+ def LWP_MM : LoadPairMM<"lwp">, LWM_FM_MM<0x1>, ISA_MICROMIPS;
/// Load and Store multiple pseudo Instructions
class LoadWordMultMM<string instr_asm > :
@@ -837,172 +913,217 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
!strconcat(instr_asm, "\t$rt, $addr")> ;
- def SWM_MM : StoreWordMultMM<"swm">;
- def LWM_MM : LoadWordMultMM<"lwm">;
+ def SWM_MM : StoreWordMultMM<"swm">, ISA_MICROMIPS;
+ def LWM_MM : LoadWordMultMM<"lwm">, ISA_MICROMIPS;
/// Move Conditional
def MOVZ_I_MM : MMRel, CMov_I_I_FT<"movz", GPR32Opnd, GPR32Opnd,
- NoItinerary>, ADD_FM_MM<0, 0x58>;
+ II_MOVZ>, ADD_FM_MM<0, 0x58>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def MOVN_I_MM : MMRel, CMov_I_I_FT<"movn", GPR32Opnd, GPR32Opnd,
- NoItinerary>, ADD_FM_MM<0, 0x18>;
- def MOVT_I_MM : MMRel, CMov_F_I_FT<"movt", GPR32Opnd, II_MOVT>,
- CMov_F_I_FM_MM<0x25>;
- def MOVF_I_MM : MMRel, CMov_F_I_FT<"movf", GPR32Opnd, II_MOVF>,
- CMov_F_I_FM_MM<0x5>;
-
+ II_MOVN>, ADD_FM_MM<0, 0x18>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def MOVT_I_MM : MMRel, CMov_F_I_FT<"movt", GPR32Opnd, II_MOVT, MipsCMovFP_T>,
+ CMov_F_I_FM_MM<0x25>, ISA_MICROMIPS32_NOT_MIPS32R6;
+ def MOVF_I_MM : MMRel, CMov_F_I_FT<"movf", GPR32Opnd, II_MOVF, MipsCMovFP_F>,
+ CMov_F_I_FM_MM<0x5>, ISA_MICROMIPS32_NOT_MIPS32R6;
/// Move to/from HI/LO
def MTHI_MM : MMRel, MoveToLOHI<"mthi", GPR32Opnd, [HI0]>,
- MTLO_FM_MM<0x0b5>;
+ MTLO_FM_MM<0x0b5>, ISA_MICROMIPS32_NOT_MIPS32R6;
def MTLO_MM : MMRel, MoveToLOHI<"mtlo", GPR32Opnd, [LO0]>,
- MTLO_FM_MM<0x0f5>;
+ MTLO_FM_MM<0x0f5>, ISA_MICROMIPS32_NOT_MIPS32R6;
def MFHI_MM : MMRel, MoveFromLOHI<"mfhi", GPR32Opnd, AC0>,
- MFLO_FM_MM<0x035>;
+ MFLO_FM_MM<0x035>, ISA_MICROMIPS32_NOT_MIPS32R6;
def MFLO_MM : MMRel, MoveFromLOHI<"mflo", GPR32Opnd, AC0>,
- MFLO_FM_MM<0x075>;
+ MFLO_FM_MM<0x075>, ISA_MICROMIPS32_NOT_MIPS32R6;
/// Multiply Add/Sub Instructions
- def MADD_MM : MMRel, MArithR<"madd", II_MADD, 1>, MULT_FM_MM<0x32c>;
- def MADDU_MM : MMRel, MArithR<"maddu", II_MADDU, 1>, MULT_FM_MM<0x36c>;
- def MSUB_MM : MMRel, MArithR<"msub", II_MSUB>, MULT_FM_MM<0x3ac>;
- def MSUBU_MM : MMRel, MArithR<"msubu", II_MSUBU>, MULT_FM_MM<0x3ec>;
+ def MADD_MM : MMRel, MArithR<"madd", II_MADD, 1>, MULT_FM_MM<0x32c>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def MADDU_MM : MMRel, MArithR<"maddu", II_MADDU, 1>, MULT_FM_MM<0x36c>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def MSUB_MM : MMRel, MArithR<"msub", II_MSUB>, MULT_FM_MM<0x3ac>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def MSUBU_MM : MMRel, MArithR<"msubu", II_MSUBU>, MULT_FM_MM<0x3ec>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
/// Count Leading
def CLZ_MM : MMRel, CountLeading0<"clz", GPR32Opnd, II_CLZ>, CLO_FM_MM<0x16c>,
- ISA_MIPS32;
+ ISA_MICROMIPS;
def CLO_MM : MMRel, CountLeading1<"clo", GPR32Opnd, II_CLO>, CLO_FM_MM<0x12c>,
- ISA_MIPS32;
+ ISA_MICROMIPS;
/// Sign Ext In Register Instructions.
def SEB_MM : MMRel, SignExtInReg<"seb", i8, GPR32Opnd, II_SEB>,
- SEB_FM_MM<0x0ac>, ISA_MIPS32R2;
+ SEB_FM_MM<0x0ac>, ISA_MICROMIPS;
def SEH_MM : MMRel, SignExtInReg<"seh", i16, GPR32Opnd, II_SEH>,
- SEB_FM_MM<0x0ec>, ISA_MIPS32R2;
+ SEB_FM_MM<0x0ec>, ISA_MICROMIPS;
/// Word Swap Bytes Within Halfwords
def WSBH_MM : MMRel, SubwordSwap<"wsbh", GPR32Opnd, II_WSBH>,
- SEB_FM_MM<0x1ec>, ISA_MIPS32R2;
+ SEB_FM_MM<0x1ec>, ISA_MICROMIPS;
// TODO: Add '0 < pos+size <= 32' constraint check to ext instruction
def EXT_MM : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, uimm5_plus1, immZExt5,
- immZExt5Plus1, MipsExt>, EXT_FM_MM<0x2c>;
+ immZExt5Plus1, MipsExt>, EXT_FM_MM<0x2c>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def INS_MM : MMRel, InsBase<"ins", GPR32Opnd, uimm5, uimm5_inssize_plus1,
immZExt5, immZExt5Plus1>,
- EXT_FM_MM<0x0c>;
+ EXT_FM_MM<0x0c>, ISA_MICROMIPS32_NOT_MIPS32R6;
/// Jump Instructions
-}
-let DecoderNamespace = "MicroMips", DecoderMethod = "DecodeJumpTargetMM" in
- def J_MM : MMRel, JumpFJ<jmptarget_mm, "j", br, bb, "j">,
- J_FM_MM<0x35>, AdditionalRequires<[RelocNotPIC]>,
- IsBranch, ISA_MICROMIPS32_NOT_MIPS32R6;
+ let DecoderMethod = "DecodeJumpTargetMM" in
+ def J_MM : MMRel, JumpFJ<jmptarget_mm, "j", br, bb, "j">,
+ J_FM_MM<0x35>, AdditionalRequires<[RelocNotPIC]>,
+ IsBranch, ISA_MICROMIPS32_NOT_MIPS32R6;
-let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
let DecoderMethod = "DecodeJumpTargetMM" in {
- def JAL_MM : MMRel, JumpLink<"jal", calltarget_mm>, J_FM_MM<0x3d>;
- def JALX_MM : MMRel, JumpLink<"jalx", calltarget>, J_FM_MM<0x3c>;
+ def JAL_MM : MMRel, JumpLink<"jal", calltarget_mm>, J_FM_MM<0x3d>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def JALX_MM : MMRel, JumpLink<"jalx", calltarget>, J_FM_MM<0x3c>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
}
def JR_MM : MMRel, IndirectBranch<"jr", GPR32Opnd>, JR_FM_MM<0x3c>,
ISA_MICROMIPS32_NOT_MIPS32R6;
- def JALR_MM : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM_MM<0x03c>;
+ def JALR_MM : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM_MM<0x03c>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
/// Jump Instructions - Short Delay Slot
- def JALS_MM : JumpLinkMM<"jals", calltarget_mm>, J_FM_MM<0x1d>;
- def JALRS_MM : JumpLinkRegMM<"jalrs", GPR32Opnd>, JALR_FM_MM<0x13c>;
+ def JALS_MM : JumpLinkMM<"jals", calltarget_mm>, J_FM_MM<0x1d>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def JALRS_MM : JumpLinkRegMM<"jalrs", GPR32Opnd>, JALR_FM_MM<0x13c>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
/// Branch Instructions
def BEQ_MM : MMRel, CBranch<"beq", brtarget_mm, seteq, GPR32Opnd>,
- BEQ_FM_MM<0x25>;
+ BEQ_FM_MM<0x25>, ISA_MICROMIPS32_NOT_MIPS32R6;
def BNE_MM : MMRel, CBranch<"bne", brtarget_mm, setne, GPR32Opnd>,
- BEQ_FM_MM<0x2d>;
+ BEQ_FM_MM<0x2d>, ISA_MICROMIPS32_NOT_MIPS32R6;
def BGEZ_MM : MMRel, CBranchZero<"bgez", brtarget_mm, setge, GPR32Opnd>,
- BGEZ_FM_MM<0x2>;
+ BGEZ_FM_MM<0x2>, ISA_MICROMIPS32_NOT_MIPS32R6;
def BGTZ_MM : MMRel, CBranchZero<"bgtz", brtarget_mm, setgt, GPR32Opnd>,
- BGEZ_FM_MM<0x6>;
+ BGEZ_FM_MM<0x6>, ISA_MICROMIPS32_NOT_MIPS32R6;
def BLEZ_MM : MMRel, CBranchZero<"blez", brtarget_mm, setle, GPR32Opnd>,
- BGEZ_FM_MM<0x4>;
+ BGEZ_FM_MM<0x4>, ISA_MICROMIPS32_NOT_MIPS32R6;
def BLTZ_MM : MMRel, CBranchZero<"bltz", brtarget_mm, setlt, GPR32Opnd>,
- BGEZ_FM_MM<0x0>;
+ BGEZ_FM_MM<0x0>, ISA_MICROMIPS32_NOT_MIPS32R6;
def BGEZAL_MM : MMRel, BGEZAL_FT<"bgezal", brtarget_mm, GPR32Opnd>,
- BGEZAL_FM_MM<0x03>;
+ BGEZAL_FM_MM<0x03>, ISA_MICROMIPS32_NOT_MIPS32R6;
def BLTZAL_MM : MMRel, BGEZAL_FT<"bltzal", brtarget_mm, GPR32Opnd>,
- BGEZAL_FM_MM<0x01>;
+ BGEZAL_FM_MM<0x01>, ISA_MICROMIPS32_NOT_MIPS32R6;
+ def BAL_BR_MM : BAL_BR_Pseudo<BGEZAL_MM, brtarget_mm>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
/// Branch Instructions - Short Delay Slot
def BGEZALS_MM : BranchCompareToZeroLinkMM<"bgezals", brtarget_mm,
- GPR32Opnd>, BGEZAL_FM_MM<0x13>;
+ GPR32Opnd>, BGEZAL_FM_MM<0x13>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def BLTZALS_MM : BranchCompareToZeroLinkMM<"bltzals", brtarget_mm,
- GPR32Opnd>, BGEZAL_FM_MM<0x11>;
-}
-def B_MM : UncondBranch<BEQ_MM, brtarget_mm>, IsBranch, ISA_MICROMIPS;
-let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
+ GPR32Opnd>, BGEZAL_FM_MM<0x11>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def B_MM : UncondBranch<BEQ_MM, brtarget_mm>, IsBranch,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
/// Control Instructions
- def SYNC_MM : MMRel, SYNC_FT<"sync">, SYNC_FM_MM;
- def SYNCI_MM : MMRel, SYNCI_FT<"synci">, SYNCI_FM_MM;
- def BREAK_MM : MMRel, BRK_FT<"break">, BRK_FM_MM;
- def SYSCALL_MM : MMRel, SYS_FT<"syscall", uimm10, II_SYSCALL>, SYS_FM_MM;
- def WAIT_MM : WaitMM<"wait">, WAIT_FM_MM;
- def ERET_MM : MMRel, ER_FT<"eret", II_ERET>, ER_FM_MM<0x3cd>;
- def DERET_MM : MMRel, ER_FT<"deret", II_DERET>, ER_FM_MM<0x38d>;
+ def SYNC_MM : MMRel, SYNC_FT<"sync">, SYNC_FM_MM, ISA_MICROMIPS;
+ let DecoderMethod = "DecodeSyncI_MM" in
+ def SYNCI_MM : MMRel, SYNCI_FT<"synci", mem_mm_16>, SYNCI_FM_MM,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def BREAK_MM : MMRel, BRK_FT<"break">, BRK_FM_MM, ISA_MICROMIPS;
+ def SYSCALL_MM : MMRel, SYS_FT<"syscall", uimm10, II_SYSCALL>, SYS_FM_MM,
+ ISA_MICROMIPS;
+ def WAIT_MM : MMRel, WaitMM<"wait">, WAIT_FM_MM, ISA_MICROMIPS;
+ def ERET_MM : MMRel, ER_FT<"eret", II_ERET>, ER_FM_MM<0x3cd>,
+ ISA_MICROMIPS;
+ def DERET_MM : MMRel, ER_FT<"deret", II_DERET>, ER_FM_MM<0x38d>,
+ ISA_MICROMIPS;
def EI_MM : MMRel, DEI_FT<"ei", GPR32Opnd, II_EI>, EI_FM_MM<0x15d>,
- ISA_MIPS32R2;
+ ISA_MICROMIPS;
def DI_MM : MMRel, DEI_FT<"di", GPR32Opnd, II_DI>, EI_FM_MM<0x11d>,
- ISA_MIPS32R2;
+ ISA_MICROMIPS;
+ def TRAP_MM : TrapBase<BREAK_MM>, ISA_MICROMIPS;
/// Trap Instructions
- def TEQ_MM : MMRel, TEQ_FT<"teq", GPR32Opnd, uimm4, II_TEQ>, TEQ_FM_MM<0x0>;
- def TGE_MM : MMRel, TEQ_FT<"tge", GPR32Opnd, uimm4, II_TGE>, TEQ_FM_MM<0x08>;
+ def TEQ_MM : MMRel, TEQ_FT<"teq", GPR32Opnd, uimm4, II_TEQ>, TEQ_FM_MM<0x0>,
+ ISA_MICROMIPS;
+ def TGE_MM : MMRel, TEQ_FT<"tge", GPR32Opnd, uimm4, II_TGE>, TEQ_FM_MM<0x08>,
+ ISA_MICROMIPS;
def TGEU_MM : MMRel, TEQ_FT<"tgeu", GPR32Opnd, uimm4, II_TGEU>,
- TEQ_FM_MM<0x10>;
- def TLT_MM : MMRel, TEQ_FT<"tlt", GPR32Opnd, uimm4, II_TLT>, TEQ_FM_MM<0x20>;
+ TEQ_FM_MM<0x10>, ISA_MICROMIPS;
+ def TLT_MM : MMRel, TEQ_FT<"tlt", GPR32Opnd, uimm4, II_TLT>, TEQ_FM_MM<0x20>,
+ ISA_MICROMIPS;
def TLTU_MM : MMRel, TEQ_FT<"tltu", GPR32Opnd, uimm4, II_TLTU>,
- TEQ_FM_MM<0x28>;
- def TNE_MM : MMRel, TEQ_FT<"tne", GPR32Opnd, uimm4, II_TNE>, TEQ_FM_MM<0x30>;
+ TEQ_FM_MM<0x28>, ISA_MICROMIPS;
+ def TNE_MM : MMRel, TEQ_FT<"tne", GPR32Opnd, uimm4, II_TNE>, TEQ_FM_MM<0x30>,
+ ISA_MICROMIPS;
- def TEQI_MM : MMRel, TEQI_FT<"teqi", GPR32Opnd, II_TEQI>, TEQI_FM_MM<0x0e>;
- def TGEI_MM : MMRel, TEQI_FT<"tgei", GPR32Opnd, II_TGEI>, TEQI_FM_MM<0x09>;
+ def TEQI_MM : MMRel, TEQI_FT<"teqi", GPR32Opnd, II_TEQI>, TEQI_FM_MM<0x0e>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def TGEI_MM : MMRel, TEQI_FT<"tgei", GPR32Opnd, II_TGEI>, TEQI_FM_MM<0x09>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def TGEIU_MM : MMRel, TEQI_FT<"tgeiu", GPR32Opnd, II_TGEIU>,
- TEQI_FM_MM<0x0b>;
- def TLTI_MM : MMRel, TEQI_FT<"tlti", GPR32Opnd, II_TLTI>, TEQI_FM_MM<0x08>;
+ TEQI_FM_MM<0x0b>, ISA_MICROMIPS32_NOT_MIPS32R6;
+ def TLTI_MM : MMRel, TEQI_FT<"tlti", GPR32Opnd, II_TLTI>, TEQI_FM_MM<0x08>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def TLTIU_MM : MMRel, TEQI_FT<"tltiu", GPR32Opnd, II_TTLTIU>,
- TEQI_FM_MM<0x0a>;
- def TNEI_MM : MMRel, TEQI_FT<"tnei", GPR32Opnd, II_TNEI>, TEQI_FM_MM<0x0c>;
+ TEQI_FM_MM<0x0a>, ISA_MICROMIPS32_NOT_MIPS32R6;
+ def TNEI_MM : MMRel, TEQI_FT<"tnei", GPR32Opnd, II_TNEI>, TEQI_FM_MM<0x0c>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
/// Load-linked, Store-conditional
- def LL_MM : LLBaseMM<"ll", GPR32Opnd>, LL_FM_MM<0x3>;
- def SC_MM : SCBaseMM<"sc", GPR32Opnd>, LL_FM_MM<0xb>;
+ def LL_MM : LLBaseMM<"ll", GPR32Opnd>, LL_FM_MM<0x3>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def SC_MM : SCBaseMM<"sc", GPR32Opnd>, LL_FM_MM<0xb>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
- def LLE_MM : LLEBaseMM<"lle", GPR32Opnd>, LLE_FM_MM<0x6>;
- def SCE_MM : SCEBaseMM<"sce", GPR32Opnd>, LLE_FM_MM<0xA>;
+ def LLE_MM : MMRel, LLEBaseMM<"lle", GPR32Opnd>, LLE_FM_MM<0x6>,
+ ISA_MICROMIPS, ASE_EVA;
+ def SCE_MM : MMRel, SCEBaseMM<"sce", GPR32Opnd>, LLE_FM_MM<0xA>,
+ ISA_MICROMIPS, ASE_EVA;
let DecoderMethod = "DecodeCacheOpMM" in {
- def CACHE_MM : MMRel, CacheOp<"cache", mem_mm_12, II_CACHE>,
- CACHE_PREF_FM_MM<0x08, 0x6>;
- def PREF_MM : MMRel, CacheOp<"pref", mem_mm_12, II_PREF>,
- CACHE_PREF_FM_MM<0x18, 0x2>;
+ def CACHE_MM : MMRel, CacheOp<"cache", mem_mm_12, II_CACHE>,
+ CACHE_PREF_FM_MM<0x08, 0x6>, ISA_MICROMIPS32_NOT_MIPS32R6;
+ def PREF_MM : MMRel, CacheOp<"pref", mem_mm_12, II_PREF>,
+ CACHE_PREF_FM_MM<0x18, 0x2>, ISA_MICROMIPS32_NOT_MIPS32R6;
}
let DecoderMethod = "DecodePrefeOpMM" in {
def PREFE_MM : MMRel, CacheOp<"prefe", mem_mm_9, II_PREFE>,
- CACHE_PREFE_FM_MM<0x18, 0x2>;
+ CACHE_PREFE_FM_MM<0x18, 0x2>, ISA_MICROMIPS, ASE_EVA;
def CACHEE_MM : MMRel, CacheOp<"cachee", mem_mm_9, II_CACHEE>,
- CACHE_PREFE_FM_MM<0x18, 0x3>;
+ CACHE_PREFE_FM_MM<0x18, 0x3>, ISA_MICROMIPS, ASE_EVA;
}
- def SSNOP_MM : MMRel, Barrier<"ssnop", II_SSNOP>, BARRIER_FM_MM<0x1>;
- def EHB_MM : MMRel, Barrier<"ehb", II_EHB>, BARRIER_FM_MM<0x3>;
- def PAUSE_MM : MMRel, Barrier<"pause", II_PAUSE>, BARRIER_FM_MM<0x5>;
-
- def TLBP_MM : MMRel, TLB<"tlbp", II_TLBP>, COP0_TLB_FM_MM<0x0d>;
- def TLBR_MM : MMRel, TLB<"tlbr", II_TLBR>, COP0_TLB_FM_MM<0x4d>;
- def TLBWI_MM : MMRel, TLB<"tlbwi", II_TLBWI>, COP0_TLB_FM_MM<0x8d>;
- def TLBWR_MM : MMRel, TLB<"tlbwr", II_TLBWR>, COP0_TLB_FM_MM<0xcd>;
-
- def SDBBP_MM : MMRel, SYS_FT<"sdbbp", uimm10, II_SDBBP>, SDBBP_FM_MM;
-
- def PREFX_MM : PrefetchIndexed<"prefx">, POOL32F_PREFX_FM_MM<0x15, 0x1A0>;
+ def SSNOP_MM : MMRel, Barrier<"ssnop", II_SSNOP>, BARRIER_FM_MM<0x1>,
+ ISA_MICROMIPS;
+ def EHB_MM : MMRel, Barrier<"ehb", II_EHB>, BARRIER_FM_MM<0x3>,
+ ISA_MICROMIPS;
+ def PAUSE_MM : MMRel, Barrier<"pause", II_PAUSE>, BARRIER_FM_MM<0x5>,
+ ISA_MICROMIPS;
+
+ def TLBP_MM : MMRel, TLB<"tlbp", II_TLBP>, COP0_TLB_FM_MM<0x0d>,
+ ISA_MICROMIPS;
+ def TLBR_MM : MMRel, TLB<"tlbr", II_TLBR>, COP0_TLB_FM_MM<0x4d>,
+ ISA_MICROMIPS;
+ def TLBWI_MM : MMRel, TLB<"tlbwi", II_TLBWI>, COP0_TLB_FM_MM<0x8d>,
+ ISA_MICROMIPS;
+ def TLBWR_MM : MMRel, TLB<"tlbwr", II_TLBWR>, COP0_TLB_FM_MM<0xcd>,
+ ISA_MICROMIPS;
+
+ def SDBBP_MM : MMRel, SYS_FT<"sdbbp", uimm10, II_SDBBP>, SDBBP_FM_MM,
+ ISA_MICROMIPS;
+
+ def PREFX_MM : PrefetchIndexed<"prefx">, POOL32F_PREFX_FM_MM<0x15, 0x1A0>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
}
def TAILCALL_MM : TailCall<J_MM, jmptarget_mm>, ISA_MIPS1_NOT_32R6_64R6;
+def TAILCALLREG_MM : TailCallReg<JRC16_MM, GPR32Opnd>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+
+def PseudoIndirectBranch_MM : PseudoIndirectBranchBase<JR_MM, GPR32Opnd>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+
let DecoderNamespace = "MicroMips" in {
def RDHWR_MM : MMRel, R6MMR6Rel, ReadHardware<GPR32Opnd, HWRegsOpnd>,
RDHWR_FM_MM, ISA_MICROMIPS32_NOT_MIPS32R6;
@@ -1011,89 +1132,177 @@ let DecoderNamespace = "MicroMips" in {
ISA_MICROMIPS32_NOT_MIPS32R6;
}
+let DecoderNamespace = "MicroMips" in {
+ def MFGC0_MM : MMRel, MfCop0MM<"mfgc0", GPR32Opnd, COP0Opnd, II_MFGC0>,
+ POOL32A_MFTC0_FM_MM<0b10011, 0b111100>,
+ ISA_MICROMIPS32R5, ASE_VIRT;
+ def MFHGC0_MM : MMRel, MfCop0MM<"mfhgc0", GPR32Opnd, COP0Opnd, II_MFHGC0>,
+ POOL32A_MFTC0_FM_MM<0b10011, 0b110100>,
+ ISA_MICROMIPS32R5, ASE_VIRT;
+ def MTGC0_MM : MMRel, MtCop0MM<"mtgc0", COP0Opnd, GPR32Opnd, II_MTGC0>,
+ POOL32A_MFTC0_FM_MM<0b11011, 0b111100>,
+ ISA_MICROMIPS32R5, ASE_VIRT;
+ def MTHGC0_MM : MMRel, MtCop0MM<"mthgc0", COP0Opnd, GPR32Opnd, II_MTHGC0>,
+ POOL32A_MFTC0_FM_MM<0b11011, 0b110100>,
+ ISA_MICROMIPS32R5, ASE_VIRT;
+ def HYPCALL_MM : MMRel, HypcallMM<"hypcall">, POOL32A_HYPCALL_FM_MM,
+ ISA_MICROMIPS32R5, ASE_VIRT;
+ def TLBGINV_MM : MMRel, TLBINVMM<"tlbginv", II_TLBGINV>,
+ POOL32A_TLBINV_FM_MM<0x105>, ISA_MICROMIPS32R5, ASE_VIRT;
+ def TLBGINVF_MM : MMRel, TLBINVMM<"tlbginvf", II_TLBGINVF>,
+ POOL32A_TLBINV_FM_MM<0x145>, ISA_MICROMIPS32R5, ASE_VIRT;
+ def TLBGP_MM : MMRel, TLBINVMM<"tlbgp", II_TLBGP>,
+ POOL32A_TLBINV_FM_MM<0x5>, ISA_MICROMIPS32R5, ASE_VIRT;
+ def TLBGR_MM : MMRel, TLBINVMM<"tlbgr", II_TLBGR>,
+ POOL32A_TLBINV_FM_MM<0x45>, ISA_MICROMIPS32R5, ASE_VIRT;
+ def TLBGWI_MM : MMRel, TLBINVMM<"tlbgwi", II_TLBGWI>,
+ POOL32A_TLBINV_FM_MM<0x85>, ISA_MICROMIPS32R5, ASE_VIRT;
+ def TLBGWR_MM : MMRel, TLBINVMM<"tlbgwr", II_TLBGWR>,
+ POOL32A_TLBINV_FM_MM<0xc5>, ISA_MICROMIPS32R5, ASE_VIRT;
+}
+
//===----------------------------------------------------------------------===//
// MicroMips arbitrary patterns that map to one or more instructions
//===----------------------------------------------------------------------===//
-let AdditionalPredicates = [InMicroMips] in {
- def : MipsPat<(i32 immLi16:$imm),
- (LI16_MM immLi16:$imm)>;
-
- defm : MaterializeImms<i32, ZERO, ADDiu_MM, LUi_MM, ORi_MM>;
-}
-
-let Predicates = [InMicroMips] in {
- def : MipsPat<(not GPRMM16:$in),
- (NOT16_MM GPRMM16:$in)>;
- def : MipsPat<(not GPR32:$in),
- (NOR_MM GPR32Opnd:$in, ZERO)>;
-
- def : MipsPat<(add GPRMM16:$src, immSExtAddiur2:$imm),
- (ADDIUR2_MM GPRMM16:$src, immSExtAddiur2:$imm)>;
- def : MipsPat<(add GPR32:$src, immSExtAddius5:$imm),
- (ADDIUS5_MM GPR32:$src, immSExtAddius5:$imm)>;
- def : MipsPat<(add GPR32:$src, immSExt16:$imm),
- (ADDiu_MM GPR32:$src, immSExt16:$imm)>;
-
- def : MipsPat<(and GPRMM16:$src, immZExtAndi16:$imm),
- (ANDI16_MM GPRMM16:$src, immZExtAndi16:$imm)>;
- def : MipsPat<(and GPR32:$src, immZExt16:$imm),
- (ANDi_MM GPR32:$src, immZExt16:$imm)>;
-
- def : MipsPat<(shl GPRMM16:$src, immZExt2Shift:$imm),
- (SLL16_MM GPRMM16:$src, immZExt2Shift:$imm)>;
- def : MipsPat<(shl GPR32:$src, immZExt5:$imm),
- (SLL_MM GPR32:$src, immZExt5:$imm)>;
- def : MipsPat<(shl GPR32:$lhs, GPR32:$rhs),
- (SLLV_MM GPR32:$lhs, GPR32:$rhs)>;
-
- def : MipsPat<(srl GPRMM16:$src, immZExt2Shift:$imm),
- (SRL16_MM GPRMM16:$src, immZExt2Shift:$imm)>;
- def : MipsPat<(srl GPR32:$src, immZExt5:$imm),
- (SRL_MM GPR32:$src, immZExt5:$imm)>;
- def : MipsPat<(srl GPR32:$lhs, GPR32:$rhs),
- (SRLV_MM GPR32:$lhs, GPR32:$rhs)>;
-
- def : MipsPat<(sra GPR32:$src, immZExt5:$imm),
- (SRA_MM GPR32:$src, immZExt5:$imm)>;
- def : MipsPat<(sra GPR32:$lhs, GPR32:$rhs),
- (SRAV_MM GPR32:$lhs, GPR32:$rhs)>;
-
- def : MipsPat<(store GPRMM16:$src, addrimm4lsl2:$addr),
- (SW16_MM GPRMM16:$src, addrimm4lsl2:$addr)>;
- def : MipsPat<(store GPR32:$src, addr:$addr),
- (SW_MM GPR32:$src, addr:$addr)>;
-
- def : MipsPat<(load addrimm4lsl2:$addr),
- (LW16_MM addrimm4lsl2:$addr)>;
- def : MipsPat<(load addr:$addr),
- (LW_MM addr:$addr)>;
- def : MipsPat<(subc GPR32:$lhs, GPR32:$rhs),
- (SUBu_MM GPR32:$lhs, GPR32:$rhs)>;
-}
+defm : MipsHiLoRelocs<LUi_MM, ADDiu_MM, ZERO, GPR32Opnd>, ISA_MICROMIPS;
+
+def : MipsPat<(MipsGotHi tglobaladdr:$in), (LUi_MM tglobaladdr:$in)>,
+ ISA_MICROMIPS;
+def : MipsPat<(MipsGotHi texternalsym:$in), (LUi_MM texternalsym:$in)>,
+ ISA_MICROMIPS;
+
+def : MipsPat<(MipsTlsHi tglobaltlsaddr:$in), (LUi_MM tglobaltlsaddr:$in)>,
+ ISA_MICROMIPS;
+
+// gp_rel relocs
+def : MipsPat<(add GPR32:$gp, (MipsGPRel tglobaladdr:$in)),
+ (ADDiu_MM GPR32:$gp, tglobaladdr:$in)>, ISA_MICROMIPS;
+def : MipsPat<(add GPR32:$gp, (MipsGPRel tconstpool:$in)),
+ (ADDiu_MM GPR32:$gp, tconstpool:$in)>, ISA_MICROMIPS;
+
+def : WrapperPat<tglobaladdr, ADDiu_MM, GPR32>, ISA_MICROMIPS;
+def : WrapperPat<tconstpool, ADDiu_MM, GPR32>, ISA_MICROMIPS;
+def : WrapperPat<texternalsym, ADDiu_MM, GPR32>, ISA_MICROMIPS;
+def : WrapperPat<tblockaddress, ADDiu_MM, GPR32>, ISA_MICROMIPS;
+def : WrapperPat<tjumptable, ADDiu_MM, GPR32>, ISA_MICROMIPS;
+def : WrapperPat<tglobaltlsaddr, ADDiu_MM, GPR32>, ISA_MICROMIPS;
+
+def : MipsPat<(atomic_load_8 addr:$a), (LB_MM addr:$a)>, ISA_MICROMIPS;
+def : MipsPat<(atomic_load_16 addr:$a), (LH_MM addr:$a)>, ISA_MICROMIPS;
+def : MipsPat<(atomic_load_32 addr:$a), (LW_MM addr:$a)>, ISA_MICROMIPS;
+
+def : MipsPat<(i32 immLi16:$imm),
+ (LI16_MM immLi16:$imm)>, ISA_MICROMIPS;
+
+defm : MaterializeImms<i32, ZERO, ADDiu_MM, LUi_MM, ORi_MM>, ISA_MICROMIPS;
+
+def : MipsPat<(not GPRMM16:$in),
+ (NOT16_MM GPRMM16:$in)>, ISA_MICROMIPS;
+def : MipsPat<(not GPR32:$in),
+ (NOR_MM GPR32Opnd:$in, ZERO)>, ISA_MICROMIPS;
+
+def : MipsPat<(add GPRMM16:$src, immSExtAddiur2:$imm),
+ (ADDIUR2_MM GPRMM16:$src, immSExtAddiur2:$imm)>, ISA_MICROMIPS;
+def : MipsPat<(add GPR32:$src, immSExtAddius5:$imm),
+ (ADDIUS5_MM GPR32:$src, immSExtAddius5:$imm)>, ISA_MICROMIPS;
+def : MipsPat<(add GPR32:$src, immSExt16:$imm),
+ (ADDiu_MM GPR32:$src, immSExt16:$imm)>, ISA_MICROMIPS;
+
+def : MipsPat<(and GPRMM16:$src, immZExtAndi16:$imm),
+ (ANDI16_MM GPRMM16:$src, immZExtAndi16:$imm)>, ISA_MICROMIPS;
+def : MipsPat<(and GPR32:$src, immZExt16:$imm),
+ (ANDi_MM GPR32:$src, immZExt16:$imm)>, ISA_MICROMIPS;
+
+def : MipsPat<(shl GPRMM16:$src, immZExt2Shift:$imm),
+ (SLL16_MM GPRMM16:$src, immZExt2Shift:$imm)>, ISA_MICROMIPS;
+def : MipsPat<(shl GPR32:$src, immZExt5:$imm),
+ (SLL_MM GPR32:$src, immZExt5:$imm)>, ISA_MICROMIPS;
+def : MipsPat<(shl GPR32:$lhs, GPR32:$rhs),
+ (SLLV_MM GPR32:$lhs, GPR32:$rhs)>, ISA_MICROMIPS;
+
+def : MipsPat<(srl GPRMM16:$src, immZExt2Shift:$imm),
+ (SRL16_MM GPRMM16:$src, immZExt2Shift:$imm)>, ISA_MICROMIPS;
+def : MipsPat<(srl GPR32:$src, immZExt5:$imm),
+ (SRL_MM GPR32:$src, immZExt5:$imm)>, ISA_MICROMIPS;
+def : MipsPat<(srl GPR32:$lhs, GPR32:$rhs),
+ (SRLV_MM GPR32:$lhs, GPR32:$rhs)>, ISA_MICROMIPS;
+
+def : MipsPat<(sra GPR32:$src, immZExt5:$imm),
+ (SRA_MM GPR32:$src, immZExt5:$imm)>, ISA_MICROMIPS;
+def : MipsPat<(sra GPR32:$lhs, GPR32:$rhs),
+ (SRAV_MM GPR32:$lhs, GPR32:$rhs)>, ISA_MICROMIPS;
+
+def : MipsPat<(store GPRMM16:$src, addrimm4lsl2:$addr),
+ (SW16_MM GPRMM16:$src, addrimm4lsl2:$addr)>, ISA_MICROMIPS;
+def : MipsPat<(store GPR32:$src, addr:$addr),
+ (SW_MM GPR32:$src, addr:$addr)>, ISA_MICROMIPS;
+
+def : MipsPat<(load addrimm4lsl2:$addr),
+ (LW16_MM addrimm4lsl2:$addr)>, ISA_MICROMIPS;
+def : MipsPat<(load addr:$addr),
+ (LW_MM addr:$addr)>, ISA_MICROMIPS;
+def : MipsPat<(subc GPR32:$lhs, GPR32:$rhs),
+ (SUBu_MM GPR32:$lhs, GPR32:$rhs)>, ISA_MICROMIPS;
+
+def : MipsPat<(i32 (extloadi1 addr:$src)), (LBu_MM addr:$src)>,
+ ISA_MICROMIPS;
+
+def : MipsPat<(i32 (extloadi8 addr:$src)), (LBu_MM addr:$src)>,
+ ISA_MICROMIPS;
+
+def : MipsPat<(i32 (extloadi16 addr:$src)), (LHu_MM addr:$src)>,
+ ISA_MICROMIPS;
+
+let AddedComplexity = 40 in
+ def : MipsPat<(i32 (sextloadi16 addrRegImm:$a)),
+ (LH_MM addrRegImm:$a)>, ISA_MICROMIPS;
+
+
+def : MipsPat<(bswap GPR32:$rt), (ROTR_MM (WSBH_MM GPR32:$rt), 16)>,
+ ISA_MICROMIPS;
def : MipsPat<(MipsTailCall (iPTR tglobaladdr:$dst)),
(TAILCALL_MM tglobaladdr:$dst)>, ISA_MICROMIPS32_NOT_MIPS32R6;
def : MipsPat<(MipsTailCall (iPTR texternalsym:$dst)),
(TAILCALL_MM texternalsym:$dst)>, ISA_MICROMIPS32_NOT_MIPS32R6;
-let AddedComplexity = 40 in {
- def : MipsPat<(i32 (sextloadi16 addrRegImm:$a)),
- (LH_MM addrRegImm:$a)>;
-}
-def : MipsPat<(atomic_load_16 addr:$a),
- (LH_MM addr:$a)>;
-def : MipsPat<(i32 (extloadi16 addr:$src)),
- (LHu_MM addr:$src)>;
-
defm : BrcondPats<GPR32, BEQ_MM, BEQ_MM, BNE_MM, SLT_MM, SLTu_MM, SLTi_MM,
- SLTiu_MM, ZERO>;
+ SLTiu_MM, ZERO>, ISA_MICROMIPS32_NOT_MIPS32R6;
+
+def : MipsPat<(brcond (i32 (setlt i32:$lhs, 1)), bb:$dst),
+ (BLEZ_MM i32:$lhs, bb:$dst)>, ISA_MICROMIPS32_NOT_MIPS32R6;
+def : MipsPat<(brcond (i32 (setgt i32:$lhs, -1)), bb:$dst),
+ (BGEZ_MM i32:$lhs, bb:$dst)>, ISA_MICROMIPS32_NOT_MIPS32R6;
+
+defm : SeteqPats<GPR32, SLTiu_MM, XOR_MM, SLTu_MM, ZERO>, ISA_MICROMIPS;
+defm : SetlePats<GPR32, XORi_MM, SLT_MM, SLTu_MM>, ISA_MICROMIPS;
+defm : SetgtPats<GPR32, SLT_MM, SLTu_MM>, ISA_MICROMIPS;
+defm : SetgePats<GPR32, XORi_MM, SLT_MM, SLTu_MM>, ISA_MICROMIPS;
+defm : SetgeImmPats<GPR32, XORi_MM, SLTi_MM, SLTiu_MM>, ISA_MICROMIPS;
+
+// Select patterns
+
+// Instantiation of conditional move patterns.
+defm : MovzPats0<GPR32, GPR32, MOVZ_I_MM, SLT_MM, SLTu_MM, SLTi_MM, SLTiu_MM>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+defm : MovzPats1<GPR32, GPR32, MOVZ_I_MM, XOR_MM>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+defm : MovzPats2<GPR32, GPR32, MOVZ_I_MM, XORi_MM>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
-defm : SeteqPats<GPR32, SLTiu_MM, XOR_MM, SLTu_MM, ZERO>;
-defm : SetlePats<GPR32, XORi_MM, SLT_MM, SLTu_MM>;
-defm : SetgtPats<GPR32, SLT_MM, SLTu_MM>;
-defm : SetgePats<GPR32, XORi_MM, SLT_MM, SLTu_MM>;
-defm : SetgeImmPats<GPR32, XORi_MM, SLTi_MM, SLTiu_MM>;
+
+defm : MovnPats<GPR32, GPR32, MOVN_I_MM, XOR_MM>, ISA_MICROMIPS32_NOT_MIPS32R6;
//===----------------------------------------------------------------------===//
// MicroMips instruction aliases
@@ -1105,17 +1314,29 @@ class UncondBranchMMPseudo<string opstr> :
def B_MM_Pseudo : UncondBranchMMPseudo<"b">, ISA_MICROMIPS;
-let Predicates = [InMicroMips] in {
+let EncodingPredicates = [InMicroMips] in {
def SDIV_MM_Pseudo : MultDivPseudo<SDIV_MM, ACC64, GPR32Opnd, MipsDivRem,
II_DIV, 0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6;
def UDIV_MM_Pseudo : MultDivPseudo<UDIV_MM, ACC64, GPR32Opnd, MipsDivRemU,
II_DIVU, 0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6;
- def : MipsInstAlias<"wait", (WAIT_MM 0x0), 1>;
- def : MipsInstAlias<"nop", (SLL_MM ZERO, ZERO, 0), 1>;
- def : MipsInstAlias<"nop", (MOVE16_MM ZERO, ZERO), 1>;
- def : MipsInstAlias<"ei", (EI_MM ZERO), 1>, ISA_MIPS32R2;
- def : MipsInstAlias<"di", (DI_MM ZERO), 1>, ISA_MIPS32R2;
+ def : MipsInstAlias<"wait", (WAIT_MM 0x0), 1>, ISA_MICROMIPS;
+ def : MipsInstAlias<"nop", (SLL_MM ZERO, ZERO, 0), 1>, ISA_MICROMIPS;
+ def : MipsInstAlias<"nop", (MOVE16_MM ZERO, ZERO), 1>, ISA_MICROMIPS;
+ def : MipsInstAlias<"ei", (EI_MM ZERO), 1>, ISA_MICROMIPS;
+ def : MipsInstAlias<"di", (DI_MM ZERO), 1>, ISA_MICROMIPS;
+ def : MipsInstAlias<"neg $rt, $rs",
+ (SUB_MM GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def : MipsInstAlias<"neg $rt",
+ (SUB_MM GPR32Opnd:$rt, ZERO, GPR32Opnd:$rt), 1>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def : MipsInstAlias<"negu $rt, $rs",
+ (SUBu_MM GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def : MipsInstAlias<"negu $rt",
+ (SUBu_MM GPR32Opnd:$rt, ZERO, GPR32Opnd:$rt), 1>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def : MipsInstAlias<"teq $rs, $rt",
(TEQ_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
def : MipsInstAlias<"tge $rs, $rt",
@@ -1160,32 +1381,64 @@ let Predicates = [InMicroMips] in {
(SRL_MM GPR32Opnd:$rd, GPR32Opnd:$rd, uimm5:$shamt), 0>;
def : MipsInstAlias<"rotr $rt, $imm",
(ROTR_MM GPR32Opnd:$rt, GPR32Opnd:$rt, uimm5:$imm), 0>;
- def : MipsInstAlias<"syscall", (SYSCALL_MM 0), 1>;
+ def : MipsInstAlias<"syscall", (SYSCALL_MM 0), 1>, ISA_MICROMIPS;
- defm : OneOrTwoOperandMacroImmediateAlias<"add", ADDi_MM>;
+ def : MipsInstAlias<"sync", (SYNC_MM 0), 1>, ISA_MICROMIPS;
- defm : OneOrTwoOperandMacroImmediateAlias<"addu", ADDiu_MM>;
+ defm : OneOrTwoOperandMacroImmediateAlias<"add", ADDi_MM>, ISA_MICROMIPS;
- defm : OneOrTwoOperandMacroImmediateAlias<"and", ANDi_MM>;
+ defm : OneOrTwoOperandMacroImmediateAlias<"addu", ADDiu_MM>, ISA_MICROMIPS;
- defm : OneOrTwoOperandMacroImmediateAlias<"or", ORi_MM>;
+ defm : OneOrTwoOperandMacroImmediateAlias<"and", ANDi_MM>, ISA_MICROMIPS;
- defm : OneOrTwoOperandMacroImmediateAlias<"xor", XORi_MM>;
+ defm : OneOrTwoOperandMacroImmediateAlias<"or", ORi_MM>, ISA_MICROMIPS;
- defm : OneOrTwoOperandMacroImmediateAlias<"slt", SLTi_MM>;
+ defm : OneOrTwoOperandMacroImmediateAlias<"xor", XORi_MM>, ISA_MICROMIPS;
- defm : OneOrTwoOperandMacroImmediateAlias<"sltu", SLTiu_MM>;
+ defm : OneOrTwoOperandMacroImmediateAlias<"slt", SLTi_MM>, ISA_MICROMIPS;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"sltu", SLTiu_MM>, ISA_MICROMIPS;
def : MipsInstAlias<"not $rt, $rs",
- (NOR_MM GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>;
+ (NOR_MM GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def : MipsInstAlias<"not $rt",
- (NOR_MM GPR32Opnd:$rt, GPR32Opnd:$rt, ZERO), 0>;
+ (NOR_MM GPR32Opnd:$rt, GPR32Opnd:$rt, ZERO), 0>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def : MipsInstAlias<"bnez $rs,$offset",
- (BNE_MM GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
+ (BNE_MM GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>,
+ ISA_MICROMIPS;
def : MipsInstAlias<"beqz $rs,$offset",
- (BEQ_MM GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
+ (BEQ_MM GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>,
+ ISA_MICROMIPS;
def : MipsInstAlias<"seh $rd", (SEH_MM GPR32Opnd:$rd, GPR32Opnd:$rd), 0>,
- ISA_MIPS32R2_NOT_32R6_64R6;
+ ISA_MICROMIPS;
def : MipsInstAlias<"seb $rd", (SEB_MM GPR32Opnd:$rd, GPR32Opnd:$rd), 0>,
- ISA_MIPS32R2_NOT_32R6_64R6;
-}
+ ISA_MICROMIPS;
+ def : MipsInstAlias<"break", (BREAK_MM 0, 0), 1>, ISA_MICROMIPS;
+ def : MipsInstAlias<"break $imm", (BREAK_MM uimm10:$imm, 0), 1>,
+ ISA_MICROMIPS;
+ def : MipsInstAlias<"bal $offset", (BGEZAL_MM ZERO, brtarget_mm:$offset), 1>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+
+ def : MipsInstAlias<"j $rs", (JR_MM GPR32Opnd:$rs), 0>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+}
+def : MipsInstAlias<"rdhwr $rt, $rs",
+ (RDHWR_MM GPR32Opnd:$rt, HWRegsOpnd:$rs, 0), 1>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+
+def : MipsInstAlias<"hypcall", (HYPCALL_MM 0), 1>,
+ ISA_MICROMIPS32R5, ASE_VIRT;
+def : MipsInstAlias<"mfgc0 $rt, $rs",
+ (MFGC0_MM GPR32Opnd:$rt, COP0Opnd:$rs, 0), 0>,
+ ISA_MICROMIPS32R5, ASE_VIRT;
+def : MipsInstAlias<"mfhgc0 $rt, $rs",
+ (MFHGC0_MM GPR32Opnd:$rt, COP0Opnd:$rs, 0), 0>,
+ ISA_MICROMIPS32R5, ASE_VIRT;
+def : MipsInstAlias<"mtgc0 $rt, $rs",
+ (MTGC0_MM COP0Opnd:$rs, GPR32Opnd:$rt, 0), 0>,
+ ISA_MICROMIPS32R5, ASE_VIRT;
+def : MipsInstAlias<"mthgc0 $rt, $rs",
+ (MTHGC0_MM COP0Opnd:$rs, GPR32Opnd:$rt, 0), 0>,
+ ISA_MICROMIPS32R5, ASE_VIRT;
diff --git a/lib/Target/Mips/MicroMipsSizeReduction.cpp b/lib/Target/Mips/MicroMipsSizeReduction.cpp
index f2e014084e46..568cdfb5b110 100644
--- a/lib/Target/Mips/MicroMipsSizeReduction.cpp
+++ b/lib/Target/Mips/MicroMipsSizeReduction.cpp
@@ -10,7 +10,6 @@
/// This pass is used to reduce the size of instructions where applicable.
///
/// TODO: Implement microMIPS64 support.
-/// TODO: Implement support for reducing into lwp/swp instruction.
//===----------------------------------------------------------------------===//
#include "Mips.h"
#include "MipsInstrInfo.h"
@@ -22,8 +21,10 @@
using namespace llvm;
#define DEBUG_TYPE "micromips-reduce-size"
+#define MICROMIPS_SIZE_REDUCE_NAME "MicroMips instruction size reduce pass"
-STATISTIC(NumReduced, "Number of 32-bit instructions reduced to 16-bit ones");
+STATISTIC(NumReduced, "Number of instructions reduced (32-bit to 16-bit ones, "
+                      "or two instructions into one)");
namespace {
@@ -35,12 +36,15 @@ enum OperandTransfer {
OT_Operands02, ///< Transfer operands 0 and 2
OT_Operand2, ///< Transfer just operand 2
OT_OperandsXOR, ///< Transfer operands for XOR16
+ OT_OperandsLwp, ///< Transfer operands for LWP
+ OT_OperandsSwp, ///< Transfer operands for SWP
};
/// Reduction type
// TODO: Will be extended when additional optimizations are added
enum ReduceType {
- RT_OneInstr ///< Reduce one instruction into a smaller instruction
+ RT_TwoInstr, ///< Reduce two instructions into one instruction
+ RT_OneInstr ///< Reduce one instruction into a smaller instruction
};
// Information about immediate field restrictions
@@ -76,21 +80,22 @@ struct OpCodes {
unsigned NarrowOpc; ///< Narrow opcode
};
+typedef struct ReduceEntryFunArgs ReduceEntryFunArgs;
+
/// ReduceTable - A static table with information on mapping from wide
/// opcodes to narrow
struct ReduceEntry {
enum ReduceType eRType; ///< Reduction type
bool (*ReduceFunction)(
- MachineInstr *MI,
- const ReduceEntry &Entry); ///< Pointer to reduce function
- struct OpCodes Ops; ///< All relevant OpCodes
- struct OpInfo OpInf; ///< Characteristics of operands
- struct ImmField Imm; ///< Characteristics of immediate field
+ ReduceEntryFunArgs *Arguments); ///< Pointer to reduce function
+ struct OpCodes Ops; ///< All relevant OpCodes
+ struct OpInfo OpInf; ///< Characteristics of operands
+ struct ImmField Imm; ///< Characteristics of immediate field
ReduceEntry(enum ReduceType RType, struct OpCodes Op,
- bool (*F)(MachineInstr *MI, const ReduceEntry &Entry),
- struct OpInfo OpInf, struct ImmField Imm)
+ bool (*F)(ReduceEntryFunArgs *Arguments), struct OpInfo OpInf,
+ struct ImmField Imm)
: eRType(RType), ReduceFunction(F), Ops(Op), OpInf(OpInf), Imm(Imm) {}
unsigned NarrowOpc() const { return Ops.NarrowOpc; }
@@ -113,6 +118,20 @@ struct ReduceEntry {
}
};
+// Function arguments for ReduceFunction
+struct ReduceEntryFunArgs {
+ MachineInstr *MI; // Instruction
+ const ReduceEntry &Entry; // Entry field
+ MachineBasicBlock::instr_iterator
+ &NextMII; // Iterator to next instruction in block
+
+ ReduceEntryFunArgs(MachineInstr *argMI, const ReduceEntry &argEntry,
+ MachineBasicBlock::instr_iterator &argNextMII)
+ : MI(argMI), Entry(argEntry), NextMII(argNextMII) {}
+};
+
+typedef llvm::SmallVector<ReduceEntry, 32> ReduceEntryVector;
+
class MicroMipsSizeReduce : public MachineFunctionPass {
public:
static char ID;
@@ -132,42 +151,50 @@ private:
bool ReduceMBB(MachineBasicBlock &MBB);
/// Attempts to reduce MI, returns true on success.
- bool ReduceMI(const MachineBasicBlock::instr_iterator &MII);
+ bool ReduceMI(const MachineBasicBlock::instr_iterator &MII,
+ MachineBasicBlock::instr_iterator &NextMII);
// Attempts to reduce LW/SW instruction into LWSP/SWSP,
// returns true on success.
- static bool ReduceXWtoXWSP(MachineInstr *MI, const ReduceEntry &Entry);
+ static bool ReduceXWtoXWSP(ReduceEntryFunArgs *Arguments);
+
+ // Attempts to reduce two consecutive LW/SW instructions into a single
+ // LWP/SWP instruction, returns true on success.
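+ // For example (an illustrative sketch, not actual pass output):
+ //   lw $16, 8($sp); lw $17, 12($sp)  ->  lwp $16, 8($sp)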
+ static bool ReduceXWtoXWP(ReduceEntryFunArgs *Arguments);
// Attempts to reduce LBU/LHU instruction into LBU16/LHU16,
// returns true on success.
- static bool ReduceLXUtoLXU16(MachineInstr *MI, const ReduceEntry &Entry);
+ static bool ReduceLXUtoLXU16(ReduceEntryFunArgs *Arguments);
// Attempts to reduce SB/SH instruction into SB16/SH16,
// returns true on success.
- static bool ReduceSXtoSX16(MachineInstr *MI, const ReduceEntry &Entry);
+ static bool ReduceSXtoSX16(ReduceEntryFunArgs *Arguments);
// Attempts to reduce arithmetic instructions, returns true on success.
- static bool ReduceArithmeticInstructions(MachineInstr *MI,
- const ReduceEntry &Entry);
+ static bool ReduceArithmeticInstructions(ReduceEntryFunArgs *Arguments);
// Attempts to reduce ADDIU into ADDIUSP instruction,
// returns true on success.
- static bool ReduceADDIUToADDIUSP(MachineInstr *MI, const ReduceEntry &Entry);
+ static bool ReduceADDIUToADDIUSP(ReduceEntryFunArgs *Arguments);
// Attempts to reduce ADDIU into ADDIUR1SP instruction,
// returns true on success.
- static bool ReduceADDIUToADDIUR1SP(MachineInstr *MI,
- const ReduceEntry &Entry);
+ static bool ReduceADDIUToADDIUR1SP(ReduceEntryFunArgs *Arguments);
// Attempts to reduce XOR into XOR16 instruction,
// returns true on success.
- static bool ReduceXORtoXOR16(MachineInstr *MI, const ReduceEntry &Entry);
+ static bool ReduceXORtoXOR16(ReduceEntryFunArgs *Arguments);
- // Changes opcode of an instruction.
- static bool ReplaceInstruction(MachineInstr *MI, const ReduceEntry &Entry);
+ // Changes the opcode of an instruction, replaces an instruction with a
+ // new one, or replaces two instructions with a single new instruction,
+ // depending on whether the two are consecutive forward or consecutive
+ // backward in the block.
+ static bool ReplaceInstruction(MachineInstr *MI, const ReduceEntry &Entry,
+ MachineInstr *MI2 = nullptr,
+ bool ConsecutiveForward = true);
// Table with transformation rules for each instruction.
- static llvm::SmallVector<ReduceEntry, 16> ReduceTable;
+ static ReduceEntryVector ReduceTable;
};
char MicroMipsSizeReduce::ID = 0;
@@ -175,7 +202,7 @@ const MipsInstrInfo *MicroMipsSizeReduce::MipsII;
// This table must be sorted by WideOpc as a main criterion and
// ReduceType as a sub-criterion (when wide opcodes are the same).
-llvm::SmallVector<ReduceEntry, 16> MicroMipsSizeReduce::ReduceTable = {
+ReduceEntryVector MicroMipsSizeReduce::ReduceTable = {
// ReduceType, OpCodes, ReduceFunction,
// OpInfo(TransferOperands),
@@ -200,12 +227,20 @@ llvm::SmallVector<ReduceEntry, 16> MicroMipsSizeReduce::ReduceTable = {
OpInfo(OT_OperandsAll), ImmField(0, -1, 15, 2)},
{RT_OneInstr, OpCodes(Mips::LEA_ADDiu, Mips::ADDIUR1SP_MM),
ReduceADDIUToADDIUR1SP, OpInfo(OT_Operands02), ImmField(2, 0, 64, 2)},
+ {RT_OneInstr, OpCodes(Mips::LEA_ADDiu_MM, Mips::ADDIUR1SP_MM),
+ ReduceADDIUToADDIUR1SP, OpInfo(OT_Operands02), ImmField(2, 0, 64, 2)},
{RT_OneInstr, OpCodes(Mips::LHu, Mips::LHU16_MM), ReduceLXUtoLXU16,
OpInfo(OT_OperandsAll), ImmField(1, 0, 16, 2)},
{RT_OneInstr, OpCodes(Mips::LHu_MM, Mips::LHU16_MM), ReduceLXUtoLXU16,
OpInfo(OT_OperandsAll), ImmField(1, 0, 16, 2)},
+ {RT_TwoInstr, OpCodes(Mips::LW, Mips::LWP_MM), ReduceXWtoXWP,
+ OpInfo(OT_OperandsLwp), ImmField(0, -2048, 2048, 2)},
{RT_OneInstr, OpCodes(Mips::LW, Mips::LWSP_MM), ReduceXWtoXWSP,
OpInfo(OT_OperandsAll), ImmField(2, 0, 32, 2)},
+ {RT_TwoInstr, OpCodes(Mips::LW16_MM, Mips::LWP_MM), ReduceXWtoXWP,
+ OpInfo(OT_OperandsLwp), ImmField(0, -2048, 2048, 2)},
+ {RT_TwoInstr, OpCodes(Mips::LW_MM, Mips::LWP_MM), ReduceXWtoXWP,
+ OpInfo(OT_OperandsLwp), ImmField(0, -2048, 2048, 2)},
{RT_OneInstr, OpCodes(Mips::LW_MM, Mips::LWSP_MM), ReduceXWtoXWSP,
OpInfo(OT_OperandsAll), ImmField(2, 0, 32, 2)},
{RT_OneInstr, OpCodes(Mips::SB, Mips::SB16_MM), ReduceSXtoSX16,
@@ -222,15 +257,24 @@ llvm::SmallVector<ReduceEntry, 16> MicroMipsSizeReduce::ReduceTable = {
{RT_OneInstr, OpCodes(Mips::SUBu_MM, Mips::SUBU16_MM),
ReduceArithmeticInstructions, OpInfo(OT_OperandsAll),
ImmField(0, 0, 0, -1)},
+ {RT_TwoInstr, OpCodes(Mips::SW, Mips::SWP_MM), ReduceXWtoXWP,
+ OpInfo(OT_OperandsSwp), ImmField(0, -2048, 2048, 2)},
{RT_OneInstr, OpCodes(Mips::SW, Mips::SWSP_MM), ReduceXWtoXWSP,
OpInfo(OT_OperandsAll), ImmField(2, 0, 32, 2)},
+ {RT_TwoInstr, OpCodes(Mips::SW16_MM, Mips::SWP_MM), ReduceXWtoXWP,
+ OpInfo(OT_OperandsSwp), ImmField(0, -2048, 2048, 2)},
+ {RT_TwoInstr, OpCodes(Mips::SW_MM, Mips::SWP_MM), ReduceXWtoXWP,
+ OpInfo(OT_OperandsSwp), ImmField(0, -2048, 2048, 2)},
{RT_OneInstr, OpCodes(Mips::SW_MM, Mips::SWSP_MM), ReduceXWtoXWSP,
OpInfo(OT_OperandsAll), ImmField(2, 0, 32, 2)},
{RT_OneInstr, OpCodes(Mips::XOR, Mips::XOR16_MM), ReduceXORtoXOR16,
OpInfo(OT_OperandsXOR), ImmField(0, 0, 0, -1)},
{RT_OneInstr, OpCodes(Mips::XOR_MM, Mips::XOR16_MM), ReduceXORtoXOR16,
OpInfo(OT_OperandsXOR), ImmField(0, 0, 0, -1)}};
-} // namespace
+} // end anonymous namespace
+
+INITIALIZE_PASS(MicroMipsSizeReduce, DEBUG_TYPE, MICROMIPS_SIZE_REDUCE_NAME,
+ false, false)
// Returns true if the machine operand MO is register SP.
static bool IsSP(const MachineOperand &MO) {
@@ -297,37 +341,100 @@ static bool ImmInRange(MachineInstr *MI, const ReduceEntry &Entry) {
return true;
}
+// Returns true if MI can be reduced to an lwp/swp instruction
+static bool CheckXWPInstr(MachineInstr *MI, bool ReduceToLwp,
+ const ReduceEntry &Entry) {
+
+ if (ReduceToLwp &&
+ !(MI->getOpcode() == Mips::LW || MI->getOpcode() == Mips::LW_MM ||
+ MI->getOpcode() == Mips::LW16_MM))
+ return false;
+
+ if (!ReduceToLwp &&
+ !(MI->getOpcode() == Mips::SW || MI->getOpcode() == Mips::SW_MM ||
+ MI->getOpcode() == Mips::SW16_MM))
+ return false;
+
+ unsigned reg = MI->getOperand(0).getReg();
+ if (reg == Mips::RA)
+ return false;
+
+ if (!ImmInRange(MI, Entry))
+ return false;
+
+ if (ReduceToLwp && (MI->getOperand(0).getReg() == MI->getOperand(1).getReg()))
+ return false;
+
+ return true;
+}
+
+// Returns true if the registers Reg1 and Reg2 are consecutive in GPR order.
+static bool ConsecutiveRegisters(unsigned Reg1, unsigned Reg2) {
+ static SmallVector<unsigned, 31> Registers = {
+ Mips::AT, Mips::V0, Mips::V1, Mips::A0, Mips::A1, Mips::A2, Mips::A3,
+ Mips::T0, Mips::T1, Mips::T2, Mips::T3, Mips::T4, Mips::T5, Mips::T6,
+ Mips::T7, Mips::S0, Mips::S1, Mips::S2, Mips::S3, Mips::S4, Mips::S5,
+ Mips::S6, Mips::S7, Mips::T8, Mips::T9, Mips::K0, Mips::K1, Mips::GP,
+ Mips::SP, Mips::FP, Mips::RA};
+
+ for (uint8_t i = 0; i < Registers.size() - 1; i++) {
+ if (Registers[i] == Reg1) {
+ if (Registers[i + 1] == Reg2)
+ return true;
+ else
+ return false;
+ }
+ }
+ return false;
+}
+
+// Returns true if the destination registers and offsets of MI1 and MI2 are consecutive.
+static bool ConsecutiveInstr(MachineInstr *MI1, MachineInstr *MI2) {
+
+ int64_t Offset1, Offset2;
+ if (!GetImm(MI1, 2, Offset1))
+ return false;
+ if (!GetImm(MI2, 2, Offset2))
+ return false;
+
+ unsigned Reg1 = MI1->getOperand(0).getReg();
+ unsigned Reg2 = MI2->getOperand(0).getReg();
+
+ return ((Offset1 == (Offset2 - 4)) && (ConsecutiveRegisters(Reg1, Reg2)));
+}
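The two helpers above encode the whole pairing rule: the second access must sit exactly 4 bytes above the first and write the register that immediately follows the first one in the GPR list. A minimal standalone sketch of that rule, with plain integers standing in for MachineOperands and register numbers following the $1..$31 order of the list above (illustrative only, not the pass itself; the base-register check is folded in here although the pass performs it separately):

#include <cstdint>
#include <iostream>

struct MemOp {
  unsigned Reg;   // destination GPR number, 1..31
  unsigned Base;  // base register number
  int64_t Offset; // signed offset in bytes
};

// Same base, offsets 4 apart, destination registers adjacent in GPR order.
static bool pairsForward(const MemOp &First, const MemOp &Second) {
  return First.Base == Second.Base && Second.Offset == First.Offset + 4 &&
         Second.Reg == First.Reg + 1;
}

int main() {
  MemOp A{16, 29, 8};  // lw $16, 8($sp)
  MemOp B{17, 29, 12}; // lw $17, 12($sp)
  std::cout << pairsForward(A, B) << '\n'; // 1: mergeable into one lwp
  std::cout << pairsForward(B, A) << '\n'; // 0: only the backward form applies
}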
+
MicroMipsSizeReduce::MicroMipsSizeReduce() : MachineFunctionPass(ID) {}
-bool MicroMipsSizeReduce::ReduceMI(
- const MachineBasicBlock::instr_iterator &MII) {
+bool MicroMipsSizeReduce::ReduceMI(const MachineBasicBlock::instr_iterator &MII,
+ MachineBasicBlock::instr_iterator &NextMII) {
MachineInstr *MI = &*MII;
unsigned Opcode = MI->getOpcode();
// Search the table.
- llvm::SmallVector<ReduceEntry, 16>::const_iterator Start =
- std::begin(ReduceTable);
- llvm::SmallVector<ReduceEntry, 16>::const_iterator End =
- std::end(ReduceTable);
+ ReduceEntryVector::const_iterator Start = std::begin(ReduceTable);
+ ReduceEntryVector::const_iterator End = std::end(ReduceTable);
- std::pair<llvm::SmallVector<ReduceEntry, 16>::const_iterator,
- llvm::SmallVector<ReduceEntry, 16>::const_iterator>
+ std::pair<ReduceEntryVector::const_iterator,
+ ReduceEntryVector::const_iterator>
Range = std::equal_range(Start, End, Opcode);
if (Range.first == Range.second)
return false;
- for (llvm::SmallVector<ReduceEntry, 16>::const_iterator Entry = Range.first;
- Entry != Range.second; ++Entry)
- if (((*Entry).ReduceFunction)(&(*MII), *Entry))
+ for (ReduceEntryVector::const_iterator Entry = Range.first;
+ Entry != Range.second; ++Entry) {
+ ReduceEntryFunArgs Arguments(&(*MII), *Entry, NextMII);
+ if (((*Entry).ReduceFunction)(&Arguments))
return true;
-
+ }
return false;
}
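ReduceMI relies on std::equal_range, which is why the comment above the table insists on sorting by WideOpc first: binary search only works on sorted input, and entries sharing a wide opcode must be adjacent. A self-contained sketch of that lookup pattern, with a simplified Entry type and made-up opcode numbers standing in for ReduceEntry:

#include <algorithm>
#include <iostream>
#include <vector>

struct Entry {
  unsigned WideOpc;      // key the table is sorted on
  const char *NarrowOpc; // candidate narrow opcode
};

// Heterogeneous comparisons so equal_range can search by opcode directly.
bool operator<(const Entry &E, unsigned Opc) { return E.WideOpc < Opc; }
bool operator<(unsigned Opc, const Entry &E) { return Opc < E.WideOpc; }

int main() {
  // Already sorted by WideOpc; two entries may share the same wide opcode.
  std::vector<Entry> Table = {
      {10, "ADDIUR1SP_MM"}, {10, "ADDIUSP_MM"}, {42, "LWSP_MM"}};
  auto Range = std::equal_range(Table.begin(), Table.end(), 10u);
  for (auto I = Range.first; I != Range.second; ++I)
    std::cout << I->NarrowOpc << '\n'; // tries each candidate, like ReduceMI
}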
-bool MicroMipsSizeReduce::ReduceXWtoXWSP(MachineInstr *MI,
- const ReduceEntry &Entry) {
+bool MicroMipsSizeReduce::ReduceXWtoXWSP(ReduceEntryFunArgs *Arguments) {
+
+ MachineInstr *MI = Arguments->MI;
+ const ReduceEntry &Entry = Arguments->Entry;
if (!ImmInRange(MI, Entry))
return false;
@@ -338,8 +445,51 @@ bool MicroMipsSizeReduce::ReduceXWtoXWSP(MachineInstr *MI,
return ReplaceInstruction(MI, Entry);
}
+bool MicroMipsSizeReduce::ReduceXWtoXWP(ReduceEntryFunArgs *Arguments) {
+
+ const ReduceEntry &Entry = Arguments->Entry;
+ MachineBasicBlock::instr_iterator &NextMII = Arguments->NextMII;
+ const MachineBasicBlock::instr_iterator &E =
+ Arguments->MI->getParent()->instr_end();
+
+ if (NextMII == E)
+ return false;
+
+ MachineInstr *MI1 = Arguments->MI;
+ MachineInstr *MI2 = &*NextMII;
+
+  // ReduceToLwp is true when reducing to an LWP instruction, false for SWP.
+ bool ReduceToLwp = (MI1->getOpcode() == Mips::LW) ||
+ (MI1->getOpcode() == Mips::LW_MM) ||
+ (MI1->getOpcode() == Mips::LW16_MM);
+
+ if (!CheckXWPInstr(MI1, ReduceToLwp, Entry))
+ return false;
+
+ if (!CheckXWPInstr(MI2, ReduceToLwp, Entry))
+ return false;
+
+ unsigned Reg1 = MI1->getOperand(1).getReg();
+ unsigned Reg2 = MI2->getOperand(1).getReg();
+
+ if (Reg1 != Reg2)
+ return false;
+
+ bool ConsecutiveForward = ConsecutiveInstr(MI1, MI2);
+ bool ConsecutiveBackward = ConsecutiveInstr(MI2, MI1);
+
+ if (!(ConsecutiveForward || ConsecutiveBackward))
+ return false;
+
+ NextMII = std::next(NextMII);
+ return ReplaceInstruction(MI1, Entry, MI2, ConsecutiveForward);
+}
+
bool MicroMipsSizeReduce::ReduceArithmeticInstructions(
- MachineInstr *MI, const ReduceEntry &Entry) {
+ ReduceEntryFunArgs *Arguments) {
+
+ MachineInstr *MI = Arguments->MI;
+ const ReduceEntry &Entry = Arguments->Entry;
if (!isMMThreeBitGPRegister(MI->getOperand(0)) ||
!isMMThreeBitGPRegister(MI->getOperand(1)) ||
@@ -349,8 +499,11 @@ bool MicroMipsSizeReduce::ReduceArithmeticInstructions(
return ReplaceInstruction(MI, Entry);
}
-bool MicroMipsSizeReduce::ReduceADDIUToADDIUR1SP(MachineInstr *MI,
- const ReduceEntry &Entry) {
+bool MicroMipsSizeReduce::ReduceADDIUToADDIUR1SP(
+ ReduceEntryFunArgs *Arguments) {
+
+ MachineInstr *MI = Arguments->MI;
+ const ReduceEntry &Entry = Arguments->Entry;
if (!ImmInRange(MI, Entry))
return false;
@@ -361,8 +514,10 @@ bool MicroMipsSizeReduce::ReduceADDIUToADDIUR1SP(MachineInstr *MI,
return ReplaceInstruction(MI, Entry);
}
-bool MicroMipsSizeReduce::ReduceADDIUToADDIUSP(MachineInstr *MI,
- const ReduceEntry &Entry) {
+bool MicroMipsSizeReduce::ReduceADDIUToADDIUSP(ReduceEntryFunArgs *Arguments) {
+
+ MachineInstr *MI = Arguments->MI;
+ const ReduceEntry &Entry = Arguments->Entry;
int64_t ImmValue;
if (!GetImm(MI, Entry.ImmField(), ImmValue))
@@ -377,8 +532,10 @@ bool MicroMipsSizeReduce::ReduceADDIUToADDIUSP(MachineInstr *MI,
return ReplaceInstruction(MI, Entry);
}
-bool MicroMipsSizeReduce::ReduceLXUtoLXU16(MachineInstr *MI,
- const ReduceEntry &Entry) {
+bool MicroMipsSizeReduce::ReduceLXUtoLXU16(ReduceEntryFunArgs *Arguments) {
+
+ MachineInstr *MI = Arguments->MI;
+ const ReduceEntry &Entry = Arguments->Entry;
if (!ImmInRange(MI, Entry))
return false;
@@ -390,8 +547,10 @@ bool MicroMipsSizeReduce::ReduceLXUtoLXU16(MachineInstr *MI,
return ReplaceInstruction(MI, Entry);
}
-bool MicroMipsSizeReduce::ReduceSXtoSX16(MachineInstr *MI,
- const ReduceEntry &Entry) {
+bool MicroMipsSizeReduce::ReduceSXtoSX16(ReduceEntryFunArgs *Arguments) {
+
+ MachineInstr *MI = Arguments->MI;
+ const ReduceEntry &Entry = Arguments->Entry;
if (!ImmInRange(MI, Entry))
return false;
@@ -403,8 +562,11 @@ bool MicroMipsSizeReduce::ReduceSXtoSX16(MachineInstr *MI,
return ReplaceInstruction(MI, Entry);
}
-bool MicroMipsSizeReduce::ReduceXORtoXOR16(MachineInstr *MI,
- const ReduceEntry &Entry) {
+bool MicroMipsSizeReduce::ReduceXORtoXOR16(ReduceEntryFunArgs *Arguments) {
+
+ MachineInstr *MI = Arguments->MI;
+ const ReduceEntry &Entry = Arguments->Entry;
+
if (!isMMThreeBitGPRegister(MI->getOperand(0)) ||
!isMMThreeBitGPRegister(MI->getOperand(1)) ||
!isMMThreeBitGPRegister(MI->getOperand(2)))
@@ -433,23 +595,25 @@ bool MicroMipsSizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
continue;
// Try to reduce 32-bit instruction into 16-bit instruction
- Modified |= ReduceMI(MII);
+ Modified |= ReduceMI(MII, NextMII);
}
return Modified;
}
bool MicroMipsSizeReduce::ReplaceInstruction(MachineInstr *MI,
- const ReduceEntry &Entry) {
+ const ReduceEntry &Entry,
+ MachineInstr *MI2,
+ bool ConsecutiveForward) {
enum OperandTransfer OpTransfer = Entry.TransferOperands();
- DEBUG(dbgs() << "Converting 32-bit: " << *MI);
+ LLVM_DEBUG(dbgs() << "Converting 32-bit: " << *MI);
++NumReduced;
if (OpTransfer == OT_OperandsAll) {
MI->setDesc(MipsII->get(Entry.NarrowOpc()));
- DEBUG(dbgs() << " to 16-bit: " << *MI);
+ LLVM_DEBUG(dbgs() << " to 16-bit: " << *MI);
return true;
} else {
MachineBasicBlock &MBB = *MI->getParent();
@@ -477,6 +641,27 @@ bool MicroMipsSizeReduce::ReplaceInstruction(MachineInstr *MI,
}
break;
}
+ case OT_OperandsLwp:
+ case OT_OperandsSwp: {
+ if (ConsecutiveForward) {
+ MIB.add(MI->getOperand(0));
+ MIB.add(MI2->getOperand(0));
+ MIB.add(MI->getOperand(1));
+ MIB.add(MI->getOperand(2));
+ } else { // consecutive backward
+ MIB.add(MI2->getOperand(0));
+ MIB.add(MI->getOperand(0));
+ MIB.add(MI2->getOperand(1));
+ MIB.add(MI2->getOperand(2));
+ }
+
+ LLVM_DEBUG(dbgs() << "and converting 32-bit: " << *MI2
+ << " to: " << *MIB);
+
+ MBB.erase_instr(MI);
+ MBB.erase_instr(MI2);
+ return true;
+ }
default:
llvm_unreachable("Unknown operand transfer!");
}
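For the paired case the operand order of the new instruction is fixed by which of the two originals comes first in memory. A small illustration of the transfer performed above, with strings standing in for MachineOperands (the register names are made up for the example):

#include <iostream>
#include <string>
#include <vector>

int main() {
  // lw $16, 8($4) followed by lw $17, 12($4)
  std::vector<std::string> MI  = {"$16", "$4", "8"};  // rt, base, offset
  std::vector<std::string> MI2 = {"$17", "$4", "12"};
  bool ConsecutiveForward = true; // MI holds the lower offset and register

  // Forward: both destinations, then base and offset of the first instruction.
  // Backward: the roles swap and MI2 supplies the base and offset.
  std::vector<std::string> Paired =
      ConsecutiveForward
          ? std::vector<std::string>{MI[0], MI2[0], MI[1], MI[2]}
          : std::vector<std::string>{MI2[0], MI[0], MI2[1], MI2[2]};

  std::cout << "lwp";
  for (const auto &Op : Paired)
    std::cout << ' ' << Op;
  std::cout << '\n'; // lwp $16 $17 $4 8
}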
@@ -484,7 +669,7 @@ bool MicroMipsSizeReduce::ReplaceInstruction(MachineInstr *MI,
// Transfer MI flags.
MIB.setMIFlags(MI->getFlags());
- DEBUG(dbgs() << " to 16-bit: " << *MIB);
+ LLVM_DEBUG(dbgs() << " to 16-bit: " << *MIB);
MBB.erase_instr(MI);
return true;
}
@@ -511,6 +696,6 @@ bool MicroMipsSizeReduce::runOnMachineFunction(MachineFunction &MF) {
}
/// Returns an instance of the MicroMips size reduction pass.
-FunctionPass *llvm::createMicroMipsSizeReductionPass() {
+FunctionPass *llvm::createMicroMipsSizeReducePass() {
return new MicroMipsSizeReduce();
}
diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h
index 008b9505ee26..ef3a807c7648 100644
--- a/lib/Target/Mips/Mips.h
+++ b/lib/Target/Mips/Mips.h
@@ -22,6 +22,11 @@ namespace llvm {
class MipsTargetMachine;
class ModulePass;
class FunctionPass;
+ class MipsRegisterBankInfo;
+ class MipsSubtarget;
+ class MipsTargetMachine;
+ class InstructionSelector;
+ class PassRegistry;
ModulePass *createMipsOs16Pass();
ModulePass *createMips16HardFloatPass();
@@ -29,10 +34,18 @@ namespace llvm {
FunctionPass *createMipsModuleISelDagPass();
FunctionPass *createMipsOptimizePICCallPass();
FunctionPass *createMipsDelaySlotFillerPass();
- FunctionPass *createMipsHazardSchedule();
- FunctionPass *createMipsLongBranchPass();
+ FunctionPass *createMipsBranchExpansion();
FunctionPass *createMipsConstantIslandPass();
- FunctionPass *createMicroMipsSizeReductionPass();
+ FunctionPass *createMicroMipsSizeReducePass();
+ FunctionPass *createMipsExpandPseudoPass();
+
+ InstructionSelector *createMipsInstructionSelector(const MipsTargetMachine &,
+ MipsSubtarget &,
+ MipsRegisterBankInfo &);
+
+ void initializeMipsDelaySlotFillerPass(PassRegistry &);
+ void initializeMipsBranchExpansionPass(PassRegistry &);
+ void initializeMicroMipsSizeReducePass(PassRegistry &);
} // end namespace llvm;
#endif
diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td
index 6ceb05577538..2f3a1c399d3e 100644
--- a/lib/Target/Mips/Mips.td
+++ b/lib/Target/Mips/Mips.td
@@ -28,8 +28,10 @@ class PredicateControl {
list<Predicate> PTRPredicates = [];
// Predicates for the FGR size and layout such as IsFP64bit
list<Predicate> FGRPredicates = [];
- // Predicates for the instruction group membership such as ISA's and ASE's
+ // Predicates for the instruction group membership such as ISA's.
list<Predicate> InsnPredicates = [];
+ // Predicate for the ASE that an instruction belongs to.
+ list<Predicate> ASEPredicate = [];
// Predicate for marking the instruction as usable in hard-float mode only.
list<Predicate> HardFloatPredicate = [];
// Predicates for anything else
@@ -40,6 +42,7 @@ class PredicateControl {
FGRPredicates,
InsnPredicates,
HardFloatPredicate,
+ ASEPredicate,
AdditionalPredicates);
}
@@ -56,6 +59,7 @@ include "MipsRegisterInfo.td"
include "MipsSchedule.td"
include "MipsInstrInfo.td"
include "MipsCallingConv.td"
+include "MipsRegisterBanks.td"
// Avoid forward declaration issues.
include "MipsScheduleP5600.td"
@@ -173,6 +177,14 @@ def FeatureMSA : SubtargetFeature<"msa", "HasMSA", "true", "Mips MSA ASE">;
def FeatureEVA : SubtargetFeature<"eva", "HasEVA", "true", "Mips EVA ASE">;
+def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", "Mips R6 CRC ASE">;
+
+def FeatureVirt : SubtargetFeature<"virt", "HasVirt", "true",
+ "Mips Virtualization ASE">;
+
+def FeatureGINV : SubtargetFeature<"ginv", "HasGINV", "true",
+ "Mips Global Invalidate ASE">;
+
def FeatureMicroMips : SubtargetFeature<"micromips", "InMicroMipsMode", "true",
"microMips mode">;
@@ -193,6 +205,10 @@ def FeatureMT : SubtargetFeature<"mt", "HasMT", "true", "Mips MT ASE">;
def FeatureLongCalls : SubtargetFeature<"long-calls", "UseLongCalls", "true",
"Disable use of the jal instruction">;
+def FeatureUseIndirectJumpsHazard : SubtargetFeature<"use-indirect-jump-hazard",
+ "UseIndirectJumpsHazard",
+ "true", "Use indirect jump"
+ " guards to prevent certain speculation based attacks">;
//===----------------------------------------------------------------------===//
// Mips processors supported.
//===----------------------------------------------------------------------===//
@@ -238,4 +254,5 @@ def Mips : Target {
let InstructionSet = MipsInstrInfo;
let AssemblyParsers = [MipsAsmParser];
let AssemblyParserVariants = [MipsAsmParserVariant];
+ let AllowRegisterRenaming = 1;
}
diff --git a/lib/Target/Mips/Mips16FrameLowering.cpp b/lib/Target/Mips/Mips16FrameLowering.cpp
index cb59e2ddb1c6..122c1f5377b6 100644
--- a/lib/Target/Mips/Mips16FrameLowering.cpp
+++ b/lib/Target/Mips/Mips16FrameLowering.cpp
@@ -42,7 +42,6 @@ Mips16FrameLowering::Mips16FrameLowering(const MipsSubtarget &STI)
void Mips16FrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
MachineFrameInfo &MFI = MF.getFrameInfo();
const Mips16InstrInfo &TII =
*static_cast<const Mips16InstrInfo *>(STI.getInstrInfo());
@@ -92,11 +91,11 @@ void Mips16FrameLowering::emitPrologue(MachineFunction &MF,
void Mips16FrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+ MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
MachineFrameInfo &MFI = MF.getFrameInfo();
const Mips16InstrInfo &TII =
*static_cast<const Mips16InstrInfo *>(STI.getInstrInfo());
- DebugLoc dl = MBBI->getDebugLoc();
+ DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
uint64_t StackSize = MFI.getStackSize();
if (!StackSize)
@@ -117,7 +116,6 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
const std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo *TRI) const {
MachineFunction *MF = MBB.getParent();
- MachineBasicBlock *EntryBlock = &MF->front();
//
// Registers RA, S0,S1 are the callee saved registers and they
@@ -134,7 +132,7 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
bool IsRAAndRetAddrIsTaken = (Reg == Mips::RA)
&& MF->getFrameInfo().isReturnAddressTaken();
if (!IsRAAndRetAddrIsTaken)
- EntryBlock->addLiveIn(Reg);
+ MBB.addLiveIn(Reg);
}
return true;
diff --git a/lib/Target/Mips/Mips16HardFloat.cpp b/lib/Target/Mips/Mips16HardFloat.cpp
index 682ea5c4ed7f..c310d9491af8 100644
--- a/lib/Target/Mips/Mips16HardFloat.cpp
+++ b/lib/Target/Mips/Mips16HardFloat.cpp
@@ -482,11 +482,11 @@ static void createFPFnStub(Function *F, Module *M, FPParamVariant PV,
// remove the use-soft-float attribute
static void removeUseSoftFloat(Function &F) {
AttrBuilder B;
- DEBUG(errs() << "removing -use-soft-float\n");
+ LLVM_DEBUG(errs() << "removing -use-soft-float\n");
B.addAttribute("use-soft-float", "false");
F.removeAttributes(AttributeList::FunctionIndex, B);
if (F.hasFnAttribute("use-soft-float")) {
- DEBUG(errs() << "still has -use-soft-float\n");
+ LLVM_DEBUG(errs() << "still has -use-soft-float\n");
}
F.addAttributes(AttributeList::FunctionIndex, B);
}
@@ -510,7 +510,7 @@ static void removeUseSoftFloat(Function &F) {
bool Mips16HardFloat::runOnModule(Module &M) {
auto &TM = static_cast<const MipsTargetMachine &>(
getAnalysis<TargetPassConfig>().getTM<TargetMachine>());
- DEBUG(errs() << "Run on Module Mips16HardFloat\n");
+ LLVM_DEBUG(errs() << "Run on Module Mips16HardFloat\n");
bool Modified = false;
for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
if (F->hasFnAttribute("nomips16") &&
diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
index ce193b1734f3..a0d5bd9ef305 100644
--- a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
+++ b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
@@ -192,41 +192,6 @@ bool Mips16DAGToDAGISel::trySelect(SDNode *Node) {
default:
break;
- case ISD::SUBE:
- case ISD::ADDE: {
- SDValue InFlag = Node->getOperand(2), CmpLHS;
- unsigned Opc = InFlag.getOpcode();
- (void)Opc;
- assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) ||
- (Opc == ISD::SUBC || Opc == ISD::SUBE)) &&
- "(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn");
-
- unsigned MOp;
- if (Opcode == ISD::ADDE) {
- CmpLHS = InFlag.getValue(0);
- MOp = Mips::AdduRxRyRz16;
- } else {
- CmpLHS = InFlag.getOperand(0);
- MOp = Mips::SubuRxRyRz16;
- }
-
- SDValue Ops[] = {CmpLHS, InFlag.getOperand(1)};
-
- SDValue LHS = Node->getOperand(0);
- SDValue RHS = Node->getOperand(1);
-
- EVT VT = LHS.getValueType();
-
- unsigned Sltu_op = Mips::SltuRxRyRz16;
- SDNode *Carry = CurDAG->getMachineNode(Sltu_op, DL, VT, Ops);
- unsigned Addu_op = Mips::AdduRxRyRz16;
- SDNode *AddCarry =
- CurDAG->getMachineNode(Addu_op, DL, VT, SDValue(Carry, 0), RHS);
-
- CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS, SDValue(AddCarry, 0));
- return true;
- }
-
/// Mul with two results
case ISD::SMUL_LOHI:
case ISD::UMUL_LOHI: {
diff --git a/lib/Target/Mips/Mips16InstrInfo.cpp b/lib/Target/Mips/Mips16InstrInfo.cpp
index e11023b4d272..219f1ad33586 100644
--- a/lib/Target/Mips/Mips16InstrInfo.cpp
+++ b/lib/Target/Mips/Mips16InstrInfo.cpp
@@ -97,6 +97,17 @@ void Mips16InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MIB.addReg(SrcReg, getKillRegState(KillSrc));
}
+bool Mips16InstrInfo::isCopyInstr(const MachineInstr &MI,
+ const MachineOperand *&Src,
+ const MachineOperand *&Dest) const {
+ if (MI.isMoveReg()) {
+ Dest = &MI.getOperand(0);
+ Src = &MI.getOperand(1);
+ return true;
+ }
+ return false;
+}
+
void Mips16InstrInfo::storeRegToStack(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
unsigned SrcReg, bool isKill, int FI,
diff --git a/lib/Target/Mips/Mips16InstrInfo.h b/lib/Target/Mips/Mips16InstrInfo.h
index ffdd4728c8cb..8190be6187ea 100644
--- a/lib/Target/Mips/Mips16InstrInfo.h
+++ b/lib/Target/Mips/Mips16InstrInfo.h
@@ -53,6 +53,9 @@ public:
const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
+ bool isCopyInstr(const MachineInstr &MI, const MachineOperand *&Src,
+ const MachineOperand *&Dest) const override;
+
void storeRegToStack(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
unsigned SrcReg, bool isKill, int FrameIndex,
diff --git a/lib/Target/Mips/Mips16InstrInfo.td b/lib/Target/Mips/Mips16InstrInfo.td
index b91c94288582..b7a1b9ce41bf 100644
--- a/lib/Target/Mips/Mips16InstrInfo.td
+++ b/lib/Target/Mips/Mips16InstrInfo.td
@@ -869,7 +869,9 @@ def Move32R16: FI8_MOV32R16_ins<"move", IIM16Alu>;
//Purpose: Move
// To move the contents of a GPR to a GPR.
//
-def MoveR3216: FI8_MOVR3216_ins<"move", IIM16Alu>;
+def MoveR3216: FI8_MOVR3216_ins<"move", IIM16Alu> {
+ let isMoveReg = 1;
+}
//
// Format: MFHI rx MIPS16e
@@ -879,6 +881,7 @@ def MoveR3216: FI8_MOVR3216_ins<"move", IIM16Alu>;
def Mfhi16: FRR16_M_ins<0b10000, "mfhi", IIM16Alu> {
let Uses = [HI0];
let hasSideEffects = 0;
+ let isMoveReg = 1;
}
//
@@ -889,6 +892,7 @@ def Mfhi16: FRR16_M_ins<0b10000, "mfhi", IIM16Alu> {
def Mflo16: FRR16_M_ins<0b10010, "mflo", IIM16Alu> {
let Uses = [LO0];
let hasSideEffects = 0;
+ let isMoveReg = 0;
}
//
@@ -1403,14 +1407,6 @@ def: Mips16Pat<(i32 addr16sp:$addr), (AddiuRxRyOffMemX16 addr16sp:$addr)>;
// Large (>16 bit) immediate loads
def : Mips16Pat<(i32 imm:$imm), (LwConstant32 imm:$imm, -1)>;
-// Carry MipsPatterns
-def : Mips16Pat<(subc CPU16Regs:$lhs, CPU16Regs:$rhs),
- (SubuRxRyRz16 CPU16Regs:$lhs, CPU16Regs:$rhs)>;
-def : Mips16Pat<(addc CPU16Regs:$lhs, CPU16Regs:$rhs),
- (AdduRxRyRz16 CPU16Regs:$lhs, CPU16Regs:$rhs)>;
-def : Mips16Pat<(addc CPU16Regs:$src, immSExt16:$imm),
- (AddiuRxRxImmX16 CPU16Regs:$src, imm:$imm)>;
-
//
// Some branch conditional patterns are not generated by llvm at this time.
// Some are for seemingly arbitrary reasons not used: i.e. with signed number
@@ -1424,7 +1420,7 @@ def : Mips16Pat<(addc CPU16Regs:$src, immSExt16:$imm),
// setcc instead and earlier I had implemented setcc first so I may have masked
// the problem. The setcc variants are suboptimal for mips16 so I may want to
// figure out how to enable the brcond patterns or else possibly new
-// combinations of of brcond and setcc.
+// combinations of brcond and setcc.
//
//
// bcond-seteq
@@ -1862,11 +1858,12 @@ def : Mips16Pat<(MipsHi tglobaladdr:$in),
(SllX16 (LiRxImmX16 tglobaladdr:$in), 16)>;
def : Mips16Pat<(MipsHi tjumptable:$in),
(SllX16 (LiRxImmX16 tjumptable:$in), 16)>;
-def : Mips16Pat<(MipsHi tglobaltlsaddr:$in),
- (SllX16 (LiRxImmX16 tglobaltlsaddr:$in), 16)>;
def : Mips16Pat<(MipsLo tblockaddress:$in), (LiRxImmX16 tblockaddress:$in)>;
+def : Mips16Pat<(MipsTlsHi tglobaltlsaddr:$in),
+ (SllX16 (LiRxImmX16 tglobaltlsaddr:$in), 16)>;
+
// wrapper_pic
class Wrapper16Pat<SDNode node, Instruction ADDiuOp, RegisterClass RC>:
Mips16Pat<(MipsWrapper RC:$gp, node:$in),
@@ -1910,3 +1907,7 @@ def CONSTPOOL_ENTRY :
MipsPseudo16<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
i32imm:$size), "foo", []>;
+// Instruction Aliases
+
+let EncodingPredicates = [InMips16Mode] in
+def : MipsInstAlias<"nop", (Move32R16 ZERO, S0)>;
diff --git a/lib/Target/Mips/Mips16RegisterInfo.cpp b/lib/Target/Mips/Mips16RegisterInfo.cpp
index ff95f3c72282..751afd5ed369 100644
--- a/lib/Target/Mips/Mips16RegisterInfo.cpp
+++ b/lib/Target/Mips/Mips16RegisterInfo.cpp
@@ -127,8 +127,8 @@ void Mips16RegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
Offset = SPOffset + (int64_t)StackSize;
Offset += MI.getOperand(OpNo + 1).getImm();
-
- DEBUG(errs() << "Offset : " << Offset << "\n" << "<--------->\n");
+ LLVM_DEBUG(errs() << "Offset : " << Offset << "\n"
+ << "<--------->\n");
if (!MI.isDebugValue() &&
!Mips16InstrInfo::validImmediate(MI.getOpcode(), FrameReg, Offset)) {
diff --git a/lib/Target/Mips/Mips32r6InstrFormats.td b/lib/Target/Mips/Mips32r6InstrFormats.td
index 516caa34fbf2..e1d08cad88b7 100644
--- a/lib/Target/Mips/Mips32r6InstrFormats.td
+++ b/lib/Target/Mips/Mips32r6InstrFormats.td
@@ -30,8 +30,7 @@ class MipsR6Arch<string opstr> {
string BaseOpcode = opstr;
}
-class MipsR6Inst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>,
- PredicateControl {
+class MipsR6Inst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther> {
let DecoderNamespace = "Mips32r6_64r6";
let EncodingPredicates = [HasStdEnc];
}
@@ -576,3 +575,30 @@ class COP2LDST_FM<OPCODE5 Operation> : MipsR6Inst {
let Inst{15-11} = base;
let Inst{10-0} = offset;
}
+
+class SPECIAL3_2R_SZ_CRC<bits<2> sz, bits<3> direction> : MipsR6Inst {
+ bits<5> rs;
+ bits<5> rt;
+
+ let Inst{31-26} = OPGROUP_SPECIAL3.Value;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = 0b00000;
+ let Inst{10-8} = direction;
+ let Inst{7-6} = sz;
+ let Inst{5-0} = 0b001111;
+
+ string DecoderMethod = "DecodeCRC";
+}
+
+class SPECIAL3_GINV<bits<2> ginv> : MipsR6Inst {
+ bits<5> rs;
+ bits<2> type_;
+
+ let Inst{31-26} = OPGROUP_SPECIAL3.Value;
+ let Inst{25-21} = rs;
+ let Inst{20-10} = 0x0;
+ let Inst{9-8} = type_;
+ let Inst{7-6} = ginv;
+ let Inst{5-0} = 0b111101;
+}
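Both encoding classes just pack fixed fields around rs/rt. As a sanity check, here is a tiny standalone encoder that follows the bit layout of SPECIAL3_2R_SZ_CRC above; the register numbers and the assumption that OPGROUP_SPECIAL3 is the usual SPECIAL3 major opcode 0b011111 are illustrative:

#include <cstdint>
#include <cstdio>

// Pack the fields exactly as the TableGen class lays them out.
static uint32_t encodeCRC(unsigned Rs, unsigned Rt, unsigned Sz, unsigned Dir) {
  uint32_t Inst = 0;
  Inst |= 0x1Fu << 26;      // OPGROUP_SPECIAL3 (assumed 0b011111)
  Inst |= (Rs & 31u) << 21; // Inst{25-21}
  Inst |= (Rt & 31u) << 16; // Inst{20-16}
  /* Inst{15-11} = 0 */
  Inst |= (Dir & 7u) << 8;  // Inst{10-8}
  Inst |= (Sz & 3u) << 6;   // Inst{7-6}
  Inst |= 0x0Fu;            // Inst{5-0} = 0b001111
  return Inst;
}

int main() {
  // crc32w uses sz = 2, direction = 0 per the _ENC classes above.
  std::printf("%08x\n", encodeCRC(/*Rs=*/5, /*Rt=*/4, /*Sz=*/2, /*Dir=*/0));
  // prints 7ca4008f
}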
diff --git a/lib/Target/Mips/Mips32r6InstrInfo.td b/lib/Target/Mips/Mips32r6InstrInfo.td
index 62f045e77fdb..d86fc3f658ae 100644
--- a/lib/Target/Mips/Mips32r6InstrInfo.td
+++ b/lib/Target/Mips/Mips32r6InstrInfo.td
@@ -190,6 +190,16 @@ class CLZ_R6_ENC : SPECIAL_2R_FM<OPCODE6_CLZ>;
class SDBBP_R6_ENC : SPECIAL_SDBBP_FM;
+class CRC32B_ENC : SPECIAL3_2R_SZ_CRC<0,0>;
+class CRC32H_ENC : SPECIAL3_2R_SZ_CRC<1,0>;
+class CRC32W_ENC : SPECIAL3_2R_SZ_CRC<2,0>;
+class CRC32CB_ENC : SPECIAL3_2R_SZ_CRC<0,1>;
+class CRC32CH_ENC : SPECIAL3_2R_SZ_CRC<1,1>;
+class CRC32CW_ENC : SPECIAL3_2R_SZ_CRC<2,1>;
+
+class GINVI_ENC : SPECIAL3_GINV<0>;
+class GINVT_ENC : SPECIAL3_GINV<2>;
+
//===----------------------------------------------------------------------===//
//
// Instruction Multiclasses
@@ -804,6 +814,38 @@ class SDBBP_R6_DESC {
InstrItinClass Itinerary = II_SDBBP;
}
+class CRC_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin> : MipsR6Arch<instr_asm> {
+ dag OutOperandList = (outs GPROpnd:$rd);
+ dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = itin;
+}
+
+class CRC32B_DESC : CRC_DESC_BASE<"crc32b", GPR32Opnd, II_CRC32B>;
+class CRC32H_DESC : CRC_DESC_BASE<"crc32h", GPR32Opnd, II_CRC32H>;
+class CRC32W_DESC : CRC_DESC_BASE<"crc32w", GPR32Opnd, II_CRC32W>;
+class CRC32CB_DESC : CRC_DESC_BASE<"crc32cb", GPR32Opnd, II_CRC32CB>;
+class CRC32CH_DESC : CRC_DESC_BASE<"crc32ch", GPR32Opnd, II_CRC32CH>;
+class CRC32CW_DESC : CRC_DESC_BASE<"crc32cw", GPR32Opnd, II_CRC32CW>;
+
+class GINV_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin> : MipsR6Arch<instr_asm> {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins GPROpnd:$rs, uimm2:$type_);
+ string AsmString = !strconcat(instr_asm, "\t$rs, $type_");
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = itin;
+ bit hasSideEffects = 1;
+}
+
+class GINVI_DESC : GINV_DESC_BASE<"ginvi", GPR32Opnd, II_GINVI> {
+ dag InOperandList = (ins GPR32Opnd:$rs);
+ string AsmString = "ginvi\t$rs";
+}
+class GINVT_DESC : GINV_DESC_BASE<"ginvt", GPR32Opnd, II_GINVT>;
+
//===----------------------------------------------------------------------===//
//
// Instruction Definitions
@@ -846,9 +888,7 @@ let AdditionalPredicates = [NotInMicroMips] in {
def BNEZC : R6MMR6Rel, BNEZC_ENC, BNEZC_DESC, ISA_MIPS32R6;
def BNVC : R6MMR6Rel, BNVC_ENC, BNVC_DESC, ISA_MIPS32R6;
def BOVC : R6MMR6Rel, BOVC_ENC, BOVC_DESC, ISA_MIPS32R6;
-}
-def CACHE_R6 : R6MMR6Rel, CACHE_ENC, CACHE_DESC, ISA_MIPS32R6;
-let AdditionalPredicates = [NotInMicroMips] in {
+ def CACHE_R6 : R6MMR6Rel, CACHE_ENC, CACHE_DESC, ISA_MIPS32R6;
def CLASS_D : CLASS_D_ENC, CLASS_D_DESC, ISA_MIPS32R6, HARDFLOAT;
def CLASS_S : CLASS_S_ENC, CLASS_S_DESC, ISA_MIPS32R6, HARDFLOAT;
}
@@ -901,8 +941,8 @@ let AdditionalPredicates = [NotInMicroMips] in {
def MULU : R6MMR6Rel, MULU_ENC, MULU_DESC, ISA_MIPS32R6;
}
def NAL; // BAL with rd=0
-def PREF_R6 : R6MMR6Rel, PREF_ENC, PREF_DESC, ISA_MIPS32R6;
let AdditionalPredicates = [NotInMicroMips] in {
+ def PREF_R6 : R6MMR6Rel, PREF_ENC, PREF_DESC, ISA_MIPS32R6;
def RINT_D : RINT_D_ENC, RINT_D_DESC, ISA_MIPS32R6, HARDFLOAT;
def RINT_S : RINT_S_ENC, RINT_S_DESC, ISA_MIPS32R6, HARDFLOAT;
def SC_R6 : SC_R6_ENC, SC_R6_DESC, PTR_32, ISA_MIPS32R6;
@@ -923,6 +963,20 @@ let AdditionalPredicates = [NotInMicroMips] in {
def SWC2_R6 : SWC2_R6_ENC, SWC2_R6_DESC, ISA_MIPS32R6;
}
+let AdditionalPredicates = [NotInMicroMips] in {
+ def CRC32B : R6MMR6Rel, CRC32B_ENC, CRC32B_DESC, ISA_MIPS32R6, ASE_CRC;
+ def CRC32H : R6MMR6Rel, CRC32H_ENC, CRC32H_DESC, ISA_MIPS32R6, ASE_CRC;
+ def CRC32W : R6MMR6Rel, CRC32W_ENC, CRC32W_DESC, ISA_MIPS32R6, ASE_CRC;
+ def CRC32CB : R6MMR6Rel, CRC32CB_ENC, CRC32CB_DESC, ISA_MIPS32R6, ASE_CRC;
+ def CRC32CH : R6MMR6Rel, CRC32CH_ENC, CRC32CH_DESC, ISA_MIPS32R6, ASE_CRC;
+ def CRC32CW : R6MMR6Rel, CRC32CW_ENC, CRC32CW_DESC, ISA_MIPS32R6, ASE_CRC;
+}
+
+let AdditionalPredicates = [NotInMicroMips] in {
+ def GINVI : R6MMR6Rel, GINVI_ENC, GINVI_DESC, ISA_MIPS32R6, ASE_GINV;
+ def GINVT : R6MMR6Rel, GINVT_ENC, GINVT_DESC, ISA_MIPS32R6, ASE_GINV;
+}
+
//===----------------------------------------------------------------------===//
//
// Instruction Aliases
@@ -1036,3 +1090,42 @@ def : MipsPat<(select i32:$cond, immz, i32:$f),
(SELEQZ i32:$f, i32:$cond)>,
ISA_MIPS32R6;
}
+
+// Pseudo instructions
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, hasDelaySlot = 1,
+ hasExtraSrcRegAllocReq = 1, isCTI = 1, Defs = [AT] in {
+ class TailCallRegR6<Instruction JumpInst, Register RT, RegisterOperand RO> :
+ PseudoSE<(outs), (ins RO:$rs), [(MipsTailCall RO:$rs)], II_JR>,
+ PseudoInstExpansion<(JumpInst RT:$rt, RO:$rs)>;
+}
+
+class PseudoIndirectBranchBaseR6<Instruction JumpInst, Register RT,
+ RegisterOperand RO> :
+ MipsPseudo<(outs), (ins RO:$rs), [(brind RO:$rs)],
+ II_IndirectBranchPseudo>,
+ PseudoInstExpansion<(JumpInst RT:$rt, RO:$rs)> {
+ let isTerminator=1;
+ let isBarrier=1;
+ let hasDelaySlot = 1;
+ let isBranch = 1;
+ let isIndirectBranch = 1;
+ bit isCTI = 1;
+}
+
+
+let AdditionalPredicates = [NotInMips16Mode, NotInMicroMips,
+ NoIndirectJumpGuards] in {
+ def TAILCALLR6REG : TailCallRegR6<JALR, ZERO, GPR32Opnd>, ISA_MIPS32R6;
+ def PseudoIndirectBranchR6 : PseudoIndirectBranchBaseR6<JALR, ZERO,
+ GPR32Opnd>,
+ ISA_MIPS32R6;
+}
+
+let AdditionalPredicates = [NotInMips16Mode, NotInMicroMips,
+ UseIndirectJumpsHazard] in {
+ def TAILCALLHBR6REG : TailCallReg<JR_HB_R6, GPR32Opnd>, ISA_MIPS32R6;
+ def PseudoIndrectHazardBranchR6 : PseudoIndirectBranchBase<JR_HB_R6,
+ GPR32Opnd>,
+ ISA_MIPS32R6;
+}
+
diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
index e008aeafaa2b..878ec29b188d 100644
--- a/lib/Target/Mips/Mips64InstrInfo.td
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -85,6 +85,17 @@ let usesCustomInserter = 1 in {
def ATOMIC_CMP_SWAP_I64 : AtomicCmpSwap<atomic_cmp_swap_64, GPR64>;
}
+def ATOMIC_LOAD_ADD_I64_POSTRA : Atomic2OpsPostRA<GPR64>;
+def ATOMIC_LOAD_SUB_I64_POSTRA : Atomic2OpsPostRA<GPR64>;
+def ATOMIC_LOAD_AND_I64_POSTRA : Atomic2OpsPostRA<GPR64>;
+def ATOMIC_LOAD_OR_I64_POSTRA : Atomic2OpsPostRA<GPR64>;
+def ATOMIC_LOAD_XOR_I64_POSTRA : Atomic2OpsPostRA<GPR64>;
+def ATOMIC_LOAD_NAND_I64_POSTRA : Atomic2OpsPostRA<GPR64>;
+
+def ATOMIC_SWAP_I64_POSTRA : Atomic2OpsPostRA<GPR64>;
+
+def ATOMIC_CMP_SWAP_I64_POSTRA : AtomicCmpSwapPostRA<GPR64>;
+
/// Pseudo instructions for loading and storing accumulator registers.
let isPseudo = 1, isCodeGenOnly = 1, hasNoSchedulingInfo = 1 in {
def LOAD_ACC128 : Load<"", ACC128>;
@@ -106,16 +117,16 @@ let AdditionalPredicates = [NotInMicroMips] in {
let isCodeGenOnly = 1 in {
def SLTi64 : SetCC_I<"slti", setlt, simm16_64, immSExt16, GPR64Opnd>,
- SLTI_FM<0xa>;
+ SLTI_FM<0xa>, GPR_64;
def SLTiu64 : SetCC_I<"sltiu", setult, simm16_64, immSExt16, GPR64Opnd>,
- SLTI_FM<0xb>;
+ SLTI_FM<0xb>, GPR_64;
def ANDi64 : ArithLogicI<"andi", uimm16_64, GPR64Opnd, II_AND, immZExt16, and>,
- ADDI_FM<0xc>;
+ ADDI_FM<0xc>, GPR_64;
def ORi64 : ArithLogicI<"ori", uimm16_64, GPR64Opnd, II_OR, immZExt16, or>,
- ADDI_FM<0xd>;
+ ADDI_FM<0xd>, GPR_64;
def XORi64 : ArithLogicI<"xori", uimm16_64, GPR64Opnd, II_XOR, immZExt16, xor>,
- ADDI_FM<0xe>;
-def LUi64 : LoadUpper<"lui", GPR64Opnd, uimm16_64_relaxed>, LUI_FM;
+ ADDI_FM<0xe>, GPR_64;
+def LUi64 : LoadUpper<"lui", GPR64Opnd, uimm16_64_relaxed>, LUI_FM, GPR_64;
}
/// Arithmetic Instructions (3-Operand, R-Type)
@@ -131,12 +142,15 @@ let AdditionalPredicates = [NotInMicroMips] in {
}
let isCodeGenOnly = 1 in {
-def SLT64 : SetCC_R<"slt", setlt, GPR64Opnd>, ADD_FM<0, 0x2a>;
-def SLTu64 : SetCC_R<"sltu", setult, GPR64Opnd>, ADD_FM<0, 0x2b>;
-def AND64 : ArithLogicR<"and", GPR64Opnd, 1, II_AND, and>, ADD_FM<0, 0x24>;
-def OR64 : ArithLogicR<"or", GPR64Opnd, 1, II_OR, or>, ADD_FM<0, 0x25>;
-def XOR64 : ArithLogicR<"xor", GPR64Opnd, 1, II_XOR, xor>, ADD_FM<0, 0x26>;
-def NOR64 : LogicNOR<"nor", GPR64Opnd>, ADD_FM<0, 0x27>;
+def SLT64 : SetCC_R<"slt", setlt, GPR64Opnd>, ADD_FM<0, 0x2a>, GPR_64;
+def SLTu64 : SetCC_R<"sltu", setult, GPR64Opnd>, ADD_FM<0, 0x2b>, GPR_64;
+def AND64 : ArithLogicR<"and", GPR64Opnd, 1, II_AND, and>, ADD_FM<0, 0x24>,
+ GPR_64;
+def OR64 : ArithLogicR<"or", GPR64Opnd, 1, II_OR, or>, ADD_FM<0, 0x25>,
+ GPR_64;
+def XOR64 : ArithLogicR<"xor", GPR64Opnd, 1, II_XOR, xor>, ADD_FM<0, 0x26>,
+ GPR_64;
+def NOR64 : LogicNOR<"nor", GPR64Opnd>, ADD_FM<0, 0x27>, GPR_64;
}
/// Shift Instructions
@@ -176,22 +190,24 @@ let AdditionalPredicates = [NotInMicroMips] in {
/// Load and Store Instructions
/// aligned
let isCodeGenOnly = 1 in {
-def LB64 : Load<"lb", GPR64Opnd, sextloadi8, II_LB>, LW_FM<0x20>;
-def LBu64 : Load<"lbu", GPR64Opnd, zextloadi8, II_LBU>, LW_FM<0x24>;
-def LH64 : Load<"lh", GPR64Opnd, sextloadi16, II_LH>, LW_FM<0x21>;
-def LHu64 : Load<"lhu", GPR64Opnd, zextloadi16, II_LHU>, LW_FM<0x25>;
-def LW64 : Load<"lw", GPR64Opnd, sextloadi32, II_LW>, LW_FM<0x23>;
-def SB64 : Store<"sb", GPR64Opnd, truncstorei8, II_SB>, LW_FM<0x28>;
-def SH64 : Store<"sh", GPR64Opnd, truncstorei16, II_SH>, LW_FM<0x29>;
-def SW64 : Store<"sw", GPR64Opnd, truncstorei32, II_SW>, LW_FM<0x2b>;
+def LB64 : Load<"lb", GPR64Opnd, sextloadi8, II_LB>, LW_FM<0x20>, GPR_64;
+def LBu64 : Load<"lbu", GPR64Opnd, zextloadi8, II_LBU>, LW_FM<0x24>, GPR_64;
+def LH64 : Load<"lh", GPR64Opnd, sextloadi16, II_LH>, LW_FM<0x21>, GPR_64;
+def LHu64 : Load<"lhu", GPR64Opnd, zextloadi16, II_LHU>, LW_FM<0x25>, GPR_64;
+def LW64 : Load<"lw", GPR64Opnd, sextloadi32, II_LW>, LW_FM<0x23>, GPR_64;
+def SB64 : Store<"sb", GPR64Opnd, truncstorei8, II_SB>, LW_FM<0x28>, GPR_64;
+def SH64 : Store<"sh", GPR64Opnd, truncstorei16, II_SH>, LW_FM<0x29>,
+ GPR_64;
+def SW64 : Store<"sw", GPR64Opnd, truncstorei32, II_SW>, LW_FM<0x2b>,
+ GPR_64;
}
let AdditionalPredicates = [NotInMicroMips] in {
def LWu : MMRel, Load<"lwu", GPR64Opnd, zextloadi32, II_LWU>,
LW_FM<0x27>, ISA_MIPS3;
- def LD : LoadMemory<"ld", GPR64Opnd, mem_simm16, load, II_LD>,
+ def LD : LoadMemory<"ld", GPR64Opnd, mem_simmptr, load, II_LD>,
LW_FM<0x37>, ISA_MIPS3;
- def SD : StoreMemory<"sd", GPR64Opnd, mem_simm16, store, II_SD>,
+ def SD : StoreMemory<"sd", GPR64Opnd, mem_simmptr, store, II_SD>,
LW_FM<0x3f>, ISA_MIPS3;
}
@@ -199,10 +215,14 @@ let AdditionalPredicates = [NotInMicroMips] in {
/// load/store left/right
let isCodeGenOnly = 1 in {
-def LWL64 : LoadLeftRight<"lwl", MipsLWL, GPR64Opnd, II_LWL>, LW_FM<0x22>;
-def LWR64 : LoadLeftRight<"lwr", MipsLWR, GPR64Opnd, II_LWR>, LW_FM<0x26>;
-def SWL64 : StoreLeftRight<"swl", MipsSWL, GPR64Opnd, II_SWL>, LW_FM<0x2a>;
-def SWR64 : StoreLeftRight<"swr", MipsSWR, GPR64Opnd, II_SWR>, LW_FM<0x2e>;
+def LWL64 : LoadLeftRight<"lwl", MipsLWL, GPR64Opnd, II_LWL>, LW_FM<0x22>,
+ GPR_64;
+def LWR64 : LoadLeftRight<"lwr", MipsLWR, GPR64Opnd, II_LWR>, LW_FM<0x26>,
+ GPR_64;
+def SWL64 : StoreLeftRight<"swl", MipsSWL, GPR64Opnd, II_SWL>, LW_FM<0x2a>,
+ GPR_64;
+def SWR64 : StoreLeftRight<"swr", MipsSWR, GPR64Opnd, II_SWR>, LW_FM<0x2e>,
+ GPR_64;
}
def LDL : LoadLeftRight<"ldl", MipsLDL, GPR64Opnd, II_LDL>, LW_FM<0x1a>,
@@ -216,7 +236,7 @@ def SDR : StoreLeftRight<"sdr", MipsSDR, GPR64Opnd, II_SDR>, LW_FM<0x2d>,
/// Load-linked, Store-conditional
let AdditionalPredicates = [NotInMicroMips] in {
- def LLD : LLBase<"lld", GPR64Opnd, mem_simm16>, LW_FM<0x34>,
+ def LLD : LLBase<"lld", GPR64Opnd, mem_simmptr>, LW_FM<0x34>,
ISA_MIPS3_NOT_32R6_64R6;
}
def SCD : SCBase<"scd", GPR64Opnd>, LW_FM<0x3c>, ISA_MIPS3_NOT_32R6_64R6;
@@ -234,19 +254,44 @@ def JALR64 : JumpLinkReg<"jalr", GPR64Opnd>, JALR_FM;
/// Jump and Branch Instructions
let isCodeGenOnly = 1 in {
- def BEQ64 : CBranch<"beq", brtarget, seteq, GPR64Opnd>, BEQ_FM<4>;
- def BNE64 : CBranch<"bne", brtarget, setne, GPR64Opnd>, BEQ_FM<5>;
- def BGEZ64 : CBranchZero<"bgez", brtarget, setge, GPR64Opnd>, BGEZ_FM<1, 1>;
- def BGTZ64 : CBranchZero<"bgtz", brtarget, setgt, GPR64Opnd>, BGEZ_FM<7, 0>;
- def BLEZ64 : CBranchZero<"blez", brtarget, setle, GPR64Opnd>, BGEZ_FM<6, 0>;
- def BLTZ64 : CBranchZero<"bltz", brtarget, setlt, GPR64Opnd>, BGEZ_FM<1, 0>;
- def JALR64Pseudo : JumpLinkRegPseudo<GPR64Opnd, JALR, RA, GPR32Opnd>;
+ def BEQ64 : CBranch<"beq", brtarget, seteq, GPR64Opnd>, BEQ_FM<4>,
+ GPR_64;
+ def BNE64 : CBranch<"bne", brtarget, setne, GPR64Opnd>, BEQ_FM<5>,
+ GPR_64;
+ def BGEZ64 : CBranchZero<"bgez", brtarget, setge, GPR64Opnd>, BGEZ_FM<1, 1>,
+ GPR_64;
+ def BGTZ64 : CBranchZero<"bgtz", brtarget, setgt, GPR64Opnd>, BGEZ_FM<7, 0>,
+ GPR_64;
+ def BLEZ64 : CBranchZero<"blez", brtarget, setle, GPR64Opnd>, BGEZ_FM<6, 0>,
+ GPR_64;
+ def BLTZ64 : CBranchZero<"bltz", brtarget, setlt, GPR64Opnd>, BGEZ_FM<1, 0>,
+ GPR_64;
+ let AdditionalPredicates = [NoIndirectJumpGuards] in
+ def JALR64Pseudo : JumpLinkRegPseudo<GPR64Opnd, JALR, RA, GPR32Opnd>;
+}
+let AdditionalPredicates = [NotInMicroMips],
+ DecoderNamespace = "Mips64" in {
+ def JR_HB64 : JR_HB_DESC<GPR64Opnd>, JR_HB_ENC, ISA_MIPS32_NOT_32R6_64R6;
+ def JALR_HB64 : JALR_HB_DESC<GPR64Opnd>, JALR_HB_ENC, ISA_MIPS32R2;
}
+def PseudoReturn64 : PseudoReturnBase<GPR64Opnd>;
-def TAILCALLREG64 : TailCallReg<GPR64Opnd>;
+let AdditionalPredicates = [NotInMips16Mode, NotInMicroMips,
+ NoIndirectJumpGuards] in {
+ def TAILCALLREG64 : TailCallReg<JR64, GPR64Opnd>, ISA_MIPS3_NOT_32R6_64R6,
+ PTR_64;
+ def PseudoIndirectBranch64 : PseudoIndirectBranchBase<JR64, GPR64Opnd>,
+ ISA_MIPS3_NOT_32R6_64R6;
+}
-def PseudoReturn64 : PseudoReturnBase<GPR64Opnd>;
-def PseudoIndirectBranch64 : PseudoIndirectBranchBase<GPR64Opnd>;
+let AdditionalPredicates = [NotInMips16Mode, NotInMicroMips,
+ UseIndirectJumpsHazard] in {
+ def TAILCALLREGHB64 : TailCallReg<JR_HB64, GPR64Opnd>,
+ ISA_MIPS32R2_NOT_32R6_64R6, PTR_64;
+ def PseudoIndirectHazardBranch64 : PseudoIndirectBranchBase<JR_HB64,
+ GPR64Opnd>,
+ ISA_MIPS32R2_NOT_32R6_64R6;
+}
/// Multiply and Divide Instructions.
let AdditionalPredicates = [NotInMicroMips] in {
@@ -304,12 +349,13 @@ let AdditionalPredicates = [NotInMicroMips] in {
ISA_MIPS64R2;
def DSHD : SubwordSwap<"dshd", GPR64Opnd, II_DSHD>, SEB_FM<5, 0x24>,
ISA_MIPS64R2;
-}
-def LEA_ADDiu64 : EffectiveAddress<"daddiu", GPR64Opnd>, LW_FM<0x19>;
+ def LEA_ADDiu64 : EffectiveAddress<"daddiu", GPR64Opnd>, LW_FM<0x19>,
+ GPR_64;
+}
let isCodeGenOnly = 1 in
-def RDHWR64 : ReadHardware<GPR64Opnd, HWRegsOpnd>, RDHWR_FM;
+def RDHWR64 : ReadHardware<GPR64Opnd, HWRegsOpnd>, RDHWR_FM, GPR_64;
let AdditionalPredicates = [NotInMicroMips] in {
// The 'pos + size' constraints for code generation are enforced by the
@@ -357,11 +403,13 @@ let isCodeGenOnly = 1, AdditionalPredicates = [NotInMicroMips] in {
let isCodeGenOnly = 1, rs = 0, shamt = 0 in {
def DSLL64_32 : FR<0x00, 0x3c, (outs GPR64:$rd), (ins GPR32:$rt),
- "dsll\t$rd, $rt, 32", [], II_DSLL>;
- def SLL64_32 : FR<0x0, 0x00, (outs GPR64:$rd), (ins GPR32:$rt),
- "sll\t$rd, $rt, 0", [], II_SLL>;
- def SLL64_64 : FR<0x0, 0x00, (outs GPR64:$rd), (ins GPR64:$rt),
- "sll\t$rd, $rt, 0", [], II_SLL>;
+ "dsll\t$rd, $rt, 32", [], II_DSLL>, GPR_64;
+ let isMoveReg = 1 in {
+ def SLL64_32 : FR<0x0, 0x00, (outs GPR64:$rd), (ins GPR32:$rt),
+ "sll\t$rd, $rt, 0", [], II_SLL>, GPR_64;
+ def SLL64_64 : FR<0x0, 0x00, (outs GPR64:$rd), (ins GPR64:$rt),
+ "sll\t$rd, $rt, 0", [], II_SLL>, GPR_64;
+ }
}
// We need the following pseudo instruction to avoid offset calculation for
@@ -372,7 +420,7 @@ let isCodeGenOnly = 1, rs = 0, shamt = 0 in {
// where %PART may be %hi or %lo, depending on the relocation kind
// that $tgt is annotated with.
def LONG_BRANCH_DADDiu : PseudoSE<(outs GPR64Opnd:$dst),
- (ins GPR64Opnd:$src, brtarget:$tgt, brtarget:$baltgt), []>;
+ (ins GPR64Opnd:$src, brtarget:$tgt, brtarget:$baltgt), []>, GPR_64;
// Cavium Octeon cnMIPS instructions
let DecoderNamespace = "CnMips",
@@ -526,139 +574,156 @@ def DMTC2_OCTEON : MFC2OP<"dmtc2", GPR64Opnd, II_DMTC2>, MFC2OP_FM<0x12, 5>,
/// Move between CPU and coprocessor registers
let DecoderNamespace = "Mips64", Predicates = [HasMips64] in {
-def DMFC0 : MFC3OP<"dmfc0", GPR64Opnd, COP0Opnd, II_DMFC0>, MFC3OP_FM<0x10, 1>,
- ISA_MIPS3;
-def DMTC0 : MTC3OP<"dmtc0", COP0Opnd, GPR64Opnd, II_DMTC0>, MFC3OP_FM<0x10, 5>,
- ISA_MIPS3;
-def DMFC2 : MFC3OP<"dmfc2", GPR64Opnd, COP2Opnd, II_DMFC2>, MFC3OP_FM<0x12, 1>,
- ISA_MIPS3;
-def DMTC2 : MTC3OP<"dmtc2", COP2Opnd, GPR64Opnd, II_DMTC2>, MFC3OP_FM<0x12, 5>,
- ISA_MIPS3;
+def DMFC0 : MFC3OP<"dmfc0", GPR64Opnd, COP0Opnd, II_DMFC0>,
+ MFC3OP_FM<0x10, 1, 0>, ISA_MIPS3;
+def DMTC0 : MTC3OP<"dmtc0", COP0Opnd, GPR64Opnd, II_DMTC0>,
+ MFC3OP_FM<0x10, 5, 0>, ISA_MIPS3;
+def DMFC2 : MFC3OP<"dmfc2", GPR64Opnd, COP2Opnd, II_DMFC2>,
+ MFC3OP_FM<0x12, 1, 0>, ISA_MIPS3;
+def DMTC2 : MTC3OP<"dmtc2", COP2Opnd, GPR64Opnd, II_DMTC2>,
+ MFC3OP_FM<0x12, 5, 0>, ISA_MIPS3;
}
+/// Move between CPU and guest coprocessor registers (Virtualization ASE)
+let DecoderNamespace = "Mips64" in {
+ def DMFGC0 : MFC3OP<"dmfgc0", GPR64Opnd, COP0Opnd, II_DMFGC0>,
+ MFC3OP_FM<0x10, 3, 1>, ISA_MIPS64R5, ASE_VIRT;
+ def DMTGC0 : MTC3OP<"dmtgc0", COP0Opnd, GPR64Opnd, II_DMTGC0>,
+ MFC3OP_FM<0x10, 3, 3>, ISA_MIPS64R5, ASE_VIRT;
+}
+
+let AdditionalPredicates = [UseIndirectJumpsHazard] in
+ def JALRHB64Pseudo : JumpLinkRegPseudo<GPR64Opnd, JALR_HB64, RA_64>;
+
//===----------------------------------------------------------------------===//
// Arbitrary patterns that map to one or more instructions
//===----------------------------------------------------------------------===//
// Materialize i64 constants.
-defm : MaterializeImms<i64, ZERO_64, DADDiu, LUi64, ORi64>;
+defm : MaterializeImms<i64, ZERO_64, DADDiu, LUi64, ORi64>, ISA_MIPS3, GPR_64;
def : MipsPat<(i64 immZExt32Low16Zero:$imm),
- (DSLL (ORi64 ZERO_64, (HI16 imm:$imm)), 16)>;
+ (DSLL (ORi64 ZERO_64, (HI16 imm:$imm)), 16)>, ISA_MIPS3, GPR_64;
def : MipsPat<(i64 immZExt32:$imm),
(ORi64 (DSLL (ORi64 ZERO_64, (HI16 imm:$imm)), 16),
- (LO16 imm:$imm))>;
+ (LO16 imm:$imm))>, ISA_MIPS3, GPR_64;
// extended loads
-def : MipsPat<(i64 (extloadi1 addr:$src)), (LB64 addr:$src)>;
-def : MipsPat<(i64 (extloadi8 addr:$src)), (LB64 addr:$src)>;
-def : MipsPat<(i64 (extloadi16 addr:$src)), (LH64 addr:$src)>;
-def : MipsPat<(i64 (extloadi32 addr:$src)), (LW64 addr:$src)>;
+def : MipsPat<(i64 (extloadi1 addr:$src)), (LB64 addr:$src)>, ISA_MIPS3,
+ GPR_64;
+def : MipsPat<(i64 (extloadi8 addr:$src)), (LB64 addr:$src)>, ISA_MIPS3,
+ GPR_64;
+def : MipsPat<(i64 (extloadi16 addr:$src)), (LH64 addr:$src)>, ISA_MIPS3,
+ GPR_64;
+def : MipsPat<(i64 (extloadi32 addr:$src)), (LW64 addr:$src)>, ISA_MIPS3,
+ GPR_64;
// hi/lo relocs
let AdditionalPredicates = [NotInMicroMips] in
-defm : MipsHiLoRelocs<LUi64, DADDiu, ZERO_64, GPR64Opnd>, SYM_32;
+defm : MipsHiLoRelocs<LUi64, DADDiu, ZERO_64, GPR64Opnd>, ISA_MIPS3, GPR_64,
+ SYM_32;
-def : MipsPat<(MipsGotHi tglobaladdr:$in), (LUi64 tglobaladdr:$in)>;
-def : MipsPat<(MipsGotHi texternalsym:$in), (LUi64 texternalsym:$in)>;
+def : MipsPat<(MipsGotHi tglobaladdr:$in), (LUi64 tglobaladdr:$in)>, ISA_MIPS3,
+ GPR_64;
+def : MipsPat<(MipsGotHi texternalsym:$in), (LUi64 texternalsym:$in)>,
+ ISA_MIPS3, GPR_64;
+
+def : MipsPat<(MipsTlsHi tglobaltlsaddr:$in), (LUi64 tglobaltlsaddr:$in)>,
+ ISA_MIPS3, GPR_64;
// highest/higher/hi/lo relocs
let AdditionalPredicates = [NotInMicroMips] in {
def : MipsPat<(MipsJmpLink (i64 texternalsym:$dst)),
- (JAL texternalsym:$dst)>, SYM_64;
+ (JAL texternalsym:$dst)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(MipsHighest (i64 tglobaladdr:$in)),
- (LUi64 tglobaladdr:$in)>, SYM_64;
+ (LUi64 tglobaladdr:$in)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(MipsHighest (i64 tblockaddress:$in)),
- (LUi64 tblockaddress:$in)>, SYM_64;
+ (LUi64 tblockaddress:$in)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(MipsHighest (i64 tjumptable:$in)),
- (LUi64 tjumptable:$in)>, SYM_64;
+ (LUi64 tjumptable:$in)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(MipsHighest (i64 tconstpool:$in)),
- (LUi64 tconstpool:$in)>, SYM_64;
- def : MipsPat<(MipsHighest (i64 tglobaltlsaddr:$in)),
- (LUi64 tglobaltlsaddr:$in)>, SYM_64;
+ (LUi64 tconstpool:$in)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(MipsHighest (i64 texternalsym:$in)),
- (LUi64 texternalsym:$in)>, SYM_64;
+ (LUi64 texternalsym:$in)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(MipsHigher (i64 tglobaladdr:$in)),
- (DADDiu ZERO_64, tglobaladdr:$in)>, SYM_64;
+ (DADDiu ZERO_64, tglobaladdr:$in)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(MipsHigher (i64 tblockaddress:$in)),
- (DADDiu ZERO_64, tblockaddress:$in)>, SYM_64;
+ (DADDiu ZERO_64, tblockaddress:$in)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(MipsHigher (i64 tjumptable:$in)),
- (DADDiu ZERO_64, tjumptable:$in)>, SYM_64;
+ (DADDiu ZERO_64, tjumptable:$in)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(MipsHigher (i64 tconstpool:$in)),
- (DADDiu ZERO_64, tconstpool:$in)>, SYM_64;
- def : MipsPat<(MipsHigher (i64 tglobaltlsaddr:$in)),
- (DADDiu ZERO_64, tglobaltlsaddr:$in)>, SYM_64;
+ (DADDiu ZERO_64, tconstpool:$in)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(MipsHigher (i64 texternalsym:$in)),
- (DADDiu ZERO_64, texternalsym:$in)>, SYM_64;
+ (DADDiu ZERO_64, texternalsym:$in)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(add GPR64:$hi, (MipsHigher (i64 tglobaladdr:$lo))),
- (DADDiu GPR64:$hi, tglobaladdr:$lo)>, SYM_64;
+ (DADDiu GPR64:$hi, tglobaladdr:$lo)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(add GPR64:$hi, (MipsHigher (i64 tblockaddress:$lo))),
- (DADDiu GPR64:$hi, tblockaddress:$lo)>, SYM_64;
+ (DADDiu GPR64:$hi, tblockaddress:$lo)>, ISA_MIPS3, GPR_64,
+ SYM_64;
def : MipsPat<(add GPR64:$hi, (MipsHigher (i64 tjumptable:$lo))),
- (DADDiu GPR64:$hi, tjumptable:$lo)>, SYM_64;
+ (DADDiu GPR64:$hi, tjumptable:$lo)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(add GPR64:$hi, (MipsHigher (i64 tconstpool:$lo))),
- (DADDiu GPR64:$hi, tconstpool:$lo)>, SYM_64;
- def : MipsPat<(add GPR64:$hi, (MipsHigher (i64 tglobaltlsaddr:$lo))),
- (DADDiu GPR64:$hi, tglobaltlsaddr:$lo)>, SYM_64;
+ (DADDiu GPR64:$hi, tconstpool:$lo)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(add GPR64:$hi, (MipsHi (i64 tglobaladdr:$lo))),
- (DADDiu GPR64:$hi, tglobaladdr:$lo)>, SYM_64;
+ (DADDiu GPR64:$hi, tglobaladdr:$lo)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(add GPR64:$hi, (MipsHi (i64 tblockaddress:$lo))),
- (DADDiu GPR64:$hi, tblockaddress:$lo)>, SYM_64;
+ (DADDiu GPR64:$hi, tblockaddress:$lo)>, ISA_MIPS3, GPR_64,
+ SYM_64;
def : MipsPat<(add GPR64:$hi, (MipsHi (i64 tjumptable:$lo))),
- (DADDiu GPR64:$hi, tjumptable:$lo)>, SYM_64;
+ (DADDiu GPR64:$hi, tjumptable:$lo)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(add GPR64:$hi, (MipsHi (i64 tconstpool:$lo))),
- (DADDiu GPR64:$hi, tconstpool:$lo)>, SYM_64;
- def : MipsPat<(add GPR64:$hi, (MipsHi (i64 tglobaltlsaddr:$lo))),
- (DADDiu GPR64:$hi, tglobaltlsaddr:$lo)>, SYM_64;
+ (DADDiu GPR64:$hi, tconstpool:$lo)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(add GPR64:$hi, (MipsLo (i64 tglobaladdr:$lo))),
- (DADDiu GPR64:$hi, tglobaladdr:$lo)>, SYM_64;
+ (DADDiu GPR64:$hi, tglobaladdr:$lo)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(add GPR64:$hi, (MipsLo (i64 tblockaddress:$lo))),
- (DADDiu GPR64:$hi, tblockaddress:$lo)>, SYM_64;
+ (DADDiu GPR64:$hi, tblockaddress:$lo)>, ISA_MIPS3, GPR_64,
+ SYM_64;
def : MipsPat<(add GPR64:$hi, (MipsLo (i64 tjumptable:$lo))),
- (DADDiu GPR64:$hi, tjumptable:$lo)>, SYM_64;
+ (DADDiu GPR64:$hi, tjumptable:$lo)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(add GPR64:$hi, (MipsLo (i64 tconstpool:$lo))),
- (DADDiu GPR64:$hi, tconstpool:$lo)>, SYM_64;
+ (DADDiu GPR64:$hi, tconstpool:$lo)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(add GPR64:$hi, (MipsLo (i64 tglobaltlsaddr:$lo))),
- (DADDiu GPR64:$hi, tglobaltlsaddr:$lo)>, SYM_64;
+ (DADDiu GPR64:$hi, tglobaltlsaddr:$lo)>, ISA_MIPS3, GPR_64,
+ SYM_64;
}
// gp_rel relocs
def : MipsPat<(add GPR64:$gp, (MipsGPRel tglobaladdr:$in)),
- (DADDiu GPR64:$gp, tglobaladdr:$in)>, ABI_N64;
+ (DADDiu GPR64:$gp, tglobaladdr:$in)>, ISA_MIPS3, ABI_N64;
def : MipsPat<(add GPR64:$gp, (MipsGPRel tconstpool:$in)),
- (DADDiu GPR64:$gp, tconstpool:$in)>, ABI_N64;
+ (DADDiu GPR64:$gp, tconstpool:$in)>, ISA_MIPS3, ABI_N64;
-def : WrapperPat<tglobaladdr, DADDiu, GPR64>;
-def : WrapperPat<tconstpool, DADDiu, GPR64>;
-def : WrapperPat<texternalsym, DADDiu, GPR64>;
-def : WrapperPat<tblockaddress, DADDiu, GPR64>;
-def : WrapperPat<tjumptable, DADDiu, GPR64>;
-def : WrapperPat<tglobaltlsaddr, DADDiu, GPR64>;
+def : WrapperPat<tglobaladdr, DADDiu, GPR64>, ISA_MIPS3, GPR_64;
+def : WrapperPat<tconstpool, DADDiu, GPR64>, ISA_MIPS3, GPR_64;
+def : WrapperPat<texternalsym, DADDiu, GPR64>, ISA_MIPS3, GPR_64;
+def : WrapperPat<tblockaddress, DADDiu, GPR64>, ISA_MIPS3, GPR_64;
+def : WrapperPat<tjumptable, DADDiu, GPR64>, ISA_MIPS3, GPR_64;
+def : WrapperPat<tglobaltlsaddr, DADDiu, GPR64>, ISA_MIPS3, GPR_64;
defm : BrcondPats<GPR64, BEQ64, BEQ, BNE64, SLT64, SLTu64, SLTi64, SLTiu64,
- ZERO_64>;
+ ZERO_64>, ISA_MIPS3, GPR_64;
def : MipsPat<(brcond (i32 (setlt i64:$lhs, 1)), bb:$dst),
- (BLEZ64 i64:$lhs, bb:$dst)>;
+ (BLEZ64 i64:$lhs, bb:$dst)>, ISA_MIPS3, GPR_64;
def : MipsPat<(brcond (i32 (setgt i64:$lhs, -1)), bb:$dst),
- (BGEZ64 i64:$lhs, bb:$dst)>;
+ (BGEZ64 i64:$lhs, bb:$dst)>, ISA_MIPS3, GPR_64;
// setcc patterns
let AdditionalPredicates = [NotInMicroMips] in {
- defm : SeteqPats<GPR64, SLTiu64, XOR64, SLTu64, ZERO_64>;
- defm : SetlePats<GPR64, XORi, SLT64, SLTu64>;
- defm : SetgtPats<GPR64, SLT64, SLTu64>;
- defm : SetgePats<GPR64, XORi, SLT64, SLTu64>;
- defm : SetgeImmPats<GPR64, XORi, SLTi64, SLTiu64>;
+ defm : SeteqPats<GPR64, SLTiu64, XOR64, SLTu64, ZERO_64>, ISA_MIPS3, GPR_64;
+ defm : SetlePats<GPR64, XORi, SLT64, SLTu64>, ISA_MIPS3, GPR_64;
+ defm : SetgtPats<GPR64, SLT64, SLTu64>, ISA_MIPS3, GPR_64;
+ defm : SetgePats<GPR64, XORi, SLT64, SLTu64>, ISA_MIPS3, GPR_64;
+ defm : SetgeImmPats<GPR64, XORi, SLTi64, SLTiu64>, ISA_MIPS3, GPR_64;
}
// truncate
def : MipsPat<(trunc (assertsext GPR64:$src)),
- (EXTRACT_SUBREG GPR64:$src, sub_32)>;
+ (EXTRACT_SUBREG GPR64:$src, sub_32)>, ISA_MIPS3, GPR_64;
// The forward compatibility strategy employed by MIPS requires us to treat
// values as being sign extended to an infinite number of bits. This allows
// existing software to run without modification on any future MIPS
@@ -670,80 +735,134 @@ def : MipsPat<(trunc (assertsext GPR64:$src)),
// such as (trunc:i32 (assertzext:i64 X, i32)), because the sign-bit of the
// lower subreg would not be replicated into the upper half.
def : MipsPat<(trunc (assertzext_lt_i32 GPR64:$src)),
- (EXTRACT_SUBREG GPR64:$src, sub_32)>;
+ (EXTRACT_SUBREG GPR64:$src, sub_32)>, ISA_MIPS3, GPR_64;
def : MipsPat<(i32 (trunc GPR64:$src)),
- (SLL (EXTRACT_SUBREG GPR64:$src, sub_32), 0)>;
+ (SLL (EXTRACT_SUBREG GPR64:$src, sub_32), 0)>, ISA_MIPS3, GPR_64;
// variable shift instructions patterns
def : MipsPat<(shl GPR64:$rt, (i32 (trunc GPR64:$rs))),
- (DSLLV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>;
+ (DSLLV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>,
+ ISA_MIPS3, GPR_64;
def : MipsPat<(srl GPR64:$rt, (i32 (trunc GPR64:$rs))),
- (DSRLV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>;
+ (DSRLV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>,
+ ISA_MIPS3, GPR_64;
def : MipsPat<(sra GPR64:$rt, (i32 (trunc GPR64:$rs))),
- (DSRAV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>;
-let AdditionalPredicates = [NotInMicroMips] in {
- def : MipsPat<(rotr GPR64:$rt, (i32 (trunc GPR64:$rs))),
- (DROTRV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>;
-}
+ (DSRAV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>,
+ ISA_MIPS3, GPR_64;
+def : MipsPat<(rotr GPR64:$rt, (i32 (trunc GPR64:$rs))),
+ (DROTRV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>,
+ ISA_MIPS3, GPR_64;
// 32-to-64-bit extension
def : MipsPat<(i64 (anyext GPR32:$src)),
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>;
-def : MipsPat<(i64 (zext GPR32:$src)), (DSRL (DSLL64_32 GPR32:$src), 32)>;
-def : MipsPat<(i64 (sext GPR32:$src)), (SLL64_32 GPR32:$src)>;
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>,
+ ISA_MIPS3, GPR_64;
+def : MipsPat<(i64 (zext GPR32:$src)), (DSRL (DSLL64_32 GPR32:$src), 32)>,
+ ISA_MIPS3, GPR_64;
+def : MipsPat<(i64 (sext GPR32:$src)), (SLL64_32 GPR32:$src)>, ISA_MIPS3,
+ GPR_64;
let AdditionalPredicates = [NotInMicroMips] in {
def : MipsPat<(i64 (zext GPR32:$src)), (DEXT64_32 GPR32:$src, 0, 32)>,
- ISA_MIPS64R2;
+ ISA_MIPS64R2, GPR_64;
def : MipsPat<(i64 (zext (i32 (shl GPR32:$rt, immZExt5:$imm)))),
(CINS64_32 GPR32:$rt, imm:$imm, (immZExt5To31 imm:$imm))>,
- ASE_MIPS64_CNMIPS;
+ ISA_MIPS64R2, GPR_64, ASE_MIPS64_CNMIPS;
}
// Sign extend in register
def : MipsPat<(i64 (sext_inreg GPR64:$src, i32)),
- (SLL64_64 GPR64:$src)>;
+ (SLL64_64 GPR64:$src)>, ISA_MIPS3, GPR_64;
// bswap MipsPattern
-def : MipsPat<(bswap GPR64:$rt), (DSHD (DSBH GPR64:$rt))>;
+def : MipsPat<(bswap GPR64:$rt), (DSHD (DSBH GPR64:$rt))>, ISA_MIPS64R2;
// Carry pattern
let AdditionalPredicates = [NotInMicroMips] in {
def : MipsPat<(subc GPR64:$lhs, GPR64:$rhs),
- (DSUBu GPR64:$lhs, GPR64:$rhs)>;
+ (DSUBu GPR64:$lhs, GPR64:$rhs)>, ISA_MIPS3, GPR_64;
def : MipsPat<(addc GPR64:$lhs, GPR64:$rhs),
- (DADDu GPR64:$lhs, GPR64:$rhs)>, ASE_NOT_DSP;
+ (DADDu GPR64:$lhs, GPR64:$rhs)>, ISA_MIPS3, ASE_NOT_DSP, GPR_64;
def : MipsPat<(addc GPR64:$lhs, immSExt16:$imm),
- (DADDiu GPR64:$lhs, imm:$imm)>, ASE_NOT_DSP;
+ (DADDiu GPR64:$lhs, imm:$imm)>, ISA_MIPS3, ASE_NOT_DSP, GPR_64;
}
// Octeon bbit0/bbit1 MipsPattern
def : MipsPat<(brcond (i32 (seteq (and i64:$lhs, PowerOf2LO:$mask), 0)), bb:$dst),
- (BBIT0 i64:$lhs, (Log2LO PowerOf2LO:$mask), bb:$dst)>, ASE_MIPS64_CNMIPS;
+ (BBIT0 i64:$lhs, (Log2LO PowerOf2LO:$mask), bb:$dst)>,
+ ISA_MIPS64R2, ASE_MIPS64_CNMIPS;
def : MipsPat<(brcond (i32 (seteq (and i64:$lhs, PowerOf2HI:$mask), 0)), bb:$dst),
- (BBIT032 i64:$lhs, (Log2HI PowerOf2HI:$mask), bb:$dst)>, ASE_MIPS64_CNMIPS;
+ (BBIT032 i64:$lhs, (Log2HI PowerOf2HI:$mask), bb:$dst)>,
+ ISA_MIPS64R2, ASE_MIPS64_CNMIPS;
def : MipsPat<(brcond (i32 (setne (and i64:$lhs, PowerOf2LO:$mask), 0)), bb:$dst),
- (BBIT1 i64:$lhs, (Log2LO PowerOf2LO:$mask), bb:$dst)>, ASE_MIPS64_CNMIPS;
+ (BBIT1 i64:$lhs, (Log2LO PowerOf2LO:$mask), bb:$dst)>,
+ ISA_MIPS64R2, ASE_MIPS64_CNMIPS;
def : MipsPat<(brcond (i32 (setne (and i64:$lhs, PowerOf2HI:$mask), 0)), bb:$dst),
- (BBIT132 i64:$lhs, (Log2HI PowerOf2HI:$mask), bb:$dst)>, ASE_MIPS64_CNMIPS;
+ (BBIT132 i64:$lhs, (Log2HI PowerOf2HI:$mask), bb:$dst)>,
+ ISA_MIPS64R2, ASE_MIPS64_CNMIPS;
def : MipsPat<(brcond (i32 (seteq (and i32:$lhs, PowerOf2LO_i32:$mask), 0)), bb:$dst),
(BBIT0 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), i32:$lhs, sub_32),
- (Log2LO PowerOf2LO_i32:$mask), bb:$dst)>, ASE_MIPS64_CNMIPS;
+ (Log2LO PowerOf2LO_i32:$mask), bb:$dst)>, ISA_MIPS64R2,
+ ASE_MIPS64_CNMIPS;
def : MipsPat<(brcond (i32 (setne (and i32:$lhs, PowerOf2LO_i32:$mask), 0)), bb:$dst),
(BBIT1 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), i32:$lhs, sub_32),
- (Log2LO PowerOf2LO_i32:$mask), bb:$dst)>, ASE_MIPS64_CNMIPS;
+ (Log2LO PowerOf2LO_i32:$mask), bb:$dst)>, ISA_MIPS64R2,
+ ASE_MIPS64_CNMIPS;
// Atomic load patterns.
-def : MipsPat<(atomic_load_8 addr:$a), (LB64 addr:$a)>;
-def : MipsPat<(atomic_load_16 addr:$a), (LH64 addr:$a)>;
-def : MipsPat<(atomic_load_32 addr:$a), (LW64 addr:$a)>;
-def : MipsPat<(atomic_load_64 addr:$a), (LD addr:$a)>;
+def : MipsPat<(atomic_load_8 addr:$a), (LB64 addr:$a)>, ISA_MIPS3, GPR_64;
+def : MipsPat<(atomic_load_16 addr:$a), (LH64 addr:$a)>, ISA_MIPS3, GPR_64;
+def : MipsPat<(atomic_load_32 addr:$a), (LW64 addr:$a)>, ISA_MIPS3, GPR_64;
+def : MipsPat<(atomic_load_64 addr:$a), (LD addr:$a)>, ISA_MIPS3, GPR_64;
// Atomic store patterns.
-def : MipsPat<(atomic_store_8 addr:$a, GPR64:$v), (SB64 GPR64:$v, addr:$a)>;
-def : MipsPat<(atomic_store_16 addr:$a, GPR64:$v), (SH64 GPR64:$v, addr:$a)>;
-def : MipsPat<(atomic_store_32 addr:$a, GPR64:$v), (SW64 GPR64:$v, addr:$a)>;
-def : MipsPat<(atomic_store_64 addr:$a, GPR64:$v), (SD GPR64:$v, addr:$a)>;
+def : MipsPat<(atomic_store_8 addr:$a, GPR64:$v), (SB64 GPR64:$v, addr:$a)>,
+ ISA_MIPS3, GPR_64;
+def : MipsPat<(atomic_store_16 addr:$a, GPR64:$v), (SH64 GPR64:$v, addr:$a)>,
+ ISA_MIPS3, GPR_64;
+def : MipsPat<(atomic_store_32 addr:$a, GPR64:$v), (SW64 GPR64:$v, addr:$a)>,
+ ISA_MIPS3, GPR_64;
+def : MipsPat<(atomic_store_64 addr:$a, GPR64:$v), (SD GPR64:$v, addr:$a)>,
+ ISA_MIPS3, GPR_64;
+
+// Patterns used for matching away redundant sign extensions.
+// MIPS32 arithmetic instructions sign extend their result implicitly.
+def : MipsPat<(i64 (sext (i32 (add GPR32:$src, immSExt16:$imm16)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (ADDiu GPR32:$src, immSExt16:$imm16), sub_32)>;
+def : MipsPat<(i64 (sext (i32 (add GPR32:$src, GPR32:$src2)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (ADDu GPR32:$src, GPR32:$src2), sub_32)>;
+def : MipsPat<(i64 (sext (i32 (sub GPR32:$src, GPR32:$src2)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (SUBu GPR32:$src, GPR32:$src2), sub_32)>;
+def : MipsPat<(i64 (sext (i32 (mul GPR32:$src, GPR32:$src2)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (MUL GPR32:$src, GPR32:$src2), sub_32)>;
+def : MipsPat<(i64 (sext (i32 (MipsMFHI ACC64:$src)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (PseudoMFHI ACC64:$src), sub_32)>;
+def : MipsPat<(i64 (sext (i32 (MipsMFLO ACC64:$src)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (PseudoMFLO ACC64:$src), sub_32)>;
+def : MipsPat<(i64 (sext (i32 (shl GPR32:$src, immZExt5:$imm5)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (SLL GPR32:$src, immZExt5:$imm5), sub_32)>;
+def : MipsPat<(i64 (sext (i32 (shl GPR32:$src, GPR32:$src2)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (SLLV GPR32:$src, GPR32:$src2), sub_32)>;
+def : MipsPat<(i64 (sext (i32 (srl GPR32:$src, immZExt5:$imm5)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (SRL GPR32:$src, immZExt5:$imm5), sub_32)>;
+def : MipsPat<(i64 (sext (i32 (srl GPR32:$src, GPR32:$src2)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (SRLV GPR32:$src, GPR32:$src2), sub_32)>;
+def : MipsPat<(i64 (sext (i32 (sra GPR32:$src, immZExt5:$imm5)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (SRA GPR32:$src, immZExt5:$imm5), sub_32)>;
+def : MipsPat<(i64 (sext (i32 (sra GPR32:$src, GPR32:$src2)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (SRAV GPR32:$src, GPR32:$src2), sub_32)>;
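+
+// A minimal sketch of what these patterns buy (hypothetical example): for C
+// code such as
+//
+//   long f(int x, int y) { return (long)(x + y); }
+//
+// the 32-bit addu already leaves a correctly sign-extended 64-bit result on
+// MIPS64, so the sext is matched away and no separate `sll ..., 0`
+// sign-extension is emitted.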
//===----------------------------------------------------------------------===//
// Instruction aliases
@@ -769,13 +888,13 @@ let AdditionalPredicates = [NotInMicroMips] in {
0>, ISA_MIPS3;
defm : OneOrTwoOperandMacroImmediateAlias<"and", ANDi64, GPR64Opnd, imm64>,
- GPR_64;
+ ISA_MIPS3, GPR_64;
defm : OneOrTwoOperandMacroImmediateAlias<"or", ORi64, GPR64Opnd, imm64>,
- GPR_64;
+ ISA_MIPS3, GPR_64;
defm : OneOrTwoOperandMacroImmediateAlias<"xor", XORi64, GPR64Opnd, imm64>,
- GPR_64;
+ ISA_MIPS3, GPR_64;
}
let AdditionalPredicates = [NotInMicroMips] in {
def : MipsInstAlias<"dneg $rt, $rs",
@@ -843,12 +962,19 @@ let AdditionalPredicates = [NotInMicroMips] in {
def : MipsInstAlias<"dext $rt, $rs, $pos, $size",
(DEXTU GPR64Opnd:$rt, GPR64Opnd:$rs, uimm5_plus32:$pos,
uimm5_plus1:$size), 0>, ISA_MIPS64R2;
-
+ def : MipsInstAlias<"jalr.hb $rs", (JALR_HB64 RA_64, GPR64Opnd:$rs), 1>,
+ ISA_MIPS64;
// Two operand (implicit 0 selector) versions:
def : MipsInstAlias<"dmtc0 $rt, $rd",
(DMTC0 COP0Opnd:$rd, GPR64Opnd:$rt, 0), 0>;
def : MipsInstAlias<"dmfc0 $rt, $rd",
(DMFC0 GPR64Opnd:$rt, COP0Opnd:$rd, 0), 0>;
+ def : MipsInstAlias<"dmfgc0 $rt, $rd",
+ (DMFGC0 GPR64Opnd:$rt, COP0Opnd:$rd, 0), 0>,
+ ISA_MIPS64R5, ASE_VIRT;
+ def : MipsInstAlias<"dmtgc0 $rt, $rd",
+ (DMTGC0 COP0Opnd:$rd, GPR64Opnd:$rt, 0), 0>,
+ ISA_MIPS64R5, ASE_VIRT;
}
def : MipsInstAlias<"dmfc2 $rt, $rd", (DMFC2 GPR64Opnd:$rt, COP2Opnd:$rd, 0), 0>;
def : MipsInstAlias<"dmtc2 $rt, $rd", (DMTC2 COP2Opnd:$rd, GPR64Opnd:$rt, 0), 0>;
@@ -966,6 +1092,38 @@ let AdditionalPredicates = [NotInMicroMips] in {
GPR64Opnd:$rd,
imm64:$imm), 0>,
ISA_MIPS3_NOT_32R6_64R6;
+ def DSRemMacro : MipsAsmPseudoInst<(outs GPR64Opnd:$rd),
+ (ins GPR64Opnd:$rs, GPR64Opnd:$rt),
+ "drem\t$rd, $rs, $rt">,
+ ISA_MIPS3_NOT_32R6_64R6;
+ def DSRemIMacro : MipsAsmPseudoInst<(outs GPR64Opnd:$rd),
+ (ins GPR64Opnd:$rs, simm32_relaxed:$imm),
+ "drem\t$rd, $rs, $imm">,
+ ISA_MIPS3_NOT_32R6_64R6;
+ def DURemMacro : MipsAsmPseudoInst<(outs GPR64Opnd:$rd),
+ (ins GPR64Opnd:$rs, GPR64Opnd:$rt),
+ "dremu\t$rd, $rs, $rt">,
+ ISA_MIPS3_NOT_32R6_64R6;
+ def DURemIMacro : MipsAsmPseudoInst<(outs GPR64Opnd:$rd),
+ (ins GPR64Opnd:$rs, simm32_relaxed:$imm),
+ "dremu\t$rd, $rs, $imm">,
+ ISA_MIPS3_NOT_32R6_64R6;
+ def : MipsInstAlias<"drem $rt, $rs", (DSRemMacro GPR64Opnd:$rt,
+ GPR64Opnd:$rt,
+ GPR64Opnd:$rs), 0>,
+ ISA_MIPS3_NOT_32R6_64R6;
+ def : MipsInstAlias<"drem $rd, $imm", (DSRemIMacro GPR64Opnd:$rd,
+ GPR64Opnd:$rd,
+ simm32_relaxed:$imm), 0>,
+ ISA_MIPS3_NOT_32R6_64R6;
+ def : MipsInstAlias<"dremu $rt, $rs", (DURemMacro GPR64Opnd:$rt,
+ GPR64Opnd:$rt,
+ GPR64Opnd:$rs), 0>,
+ ISA_MIPS3_NOT_32R6_64R6;
+ def : MipsInstAlias<"dremu $rd, $imm", (DURemIMacro GPR64Opnd:$rd,
+ GPR64Opnd:$rd,
+ simm32_relaxed:$imm), 0>,
+ ISA_MIPS3_NOT_32R6_64R6;
}
def NORImm64 : NORIMM_DESC_BASE<GPR64Opnd, imm64>, GPR_64;
diff --git a/lib/Target/Mips/Mips64r6InstrInfo.td b/lib/Target/Mips/Mips64r6InstrInfo.td
index 1cd43ee6f1c3..9df802cc30b9 100644
--- a/lib/Target/Mips/Mips64r6InstrInfo.td
+++ b/lib/Target/Mips/Mips64r6InstrInfo.td
@@ -39,6 +39,8 @@ class DMULU_ENC : SPECIAL_3R_FM<0b00010, 0b011101>;
class LDPC_ENC : PCREL18_FM<OPCODE3_LDPC>;
class LLD_R6_ENC : SPECIAL3_LL_SC_FM<OPCODE6_LLD>;
class SCD_R6_ENC : SPECIAL3_LL_SC_FM<OPCODE6_SCD>;
+class CRC32D_ENC : SPECIAL3_2R_SZ_CRC<3,0>;
+class CRC32CD_ENC : SPECIAL3_2R_SZ_CRC<3,1>;
//===----------------------------------------------------------------------===//
//
@@ -71,7 +73,7 @@ class DMUHU_DESC : MUL_R6_DESC_BASE<"dmuhu", GPR64Opnd, II_DMUHU, mulhu>;
class DMUL_R6_DESC : MUL_R6_DESC_BASE<"dmul", GPR64Opnd, II_DMUL, mul>;
class DMULU_DESC : MUL_R6_DESC_BASE<"dmulu", GPR64Opnd, II_DMUL>;
class LDPC_DESC : PCREL_DESC_BASE<"ldpc", GPR64Opnd, simm18_lsl3, II_LDPC>;
-class LLD_R6_DESC : LL_R6_DESC_BASE<"lld", GPR64Opnd, mem_simm16, II_LLD>;
+class LLD_R6_DESC : LL_R6_DESC_BASE<"lld", GPR64Opnd, mem_simmptr, II_LLD>;
class SCD_R6_DESC : SC_R6_DESC_BASE<"scd", GPR64Opnd, II_SCD>;
class SELEQZ64_DESC : SELEQNE_Z_DESC_BASE<"seleqz", GPR64Opnd>;
class SELNEZ64_DESC : SELEQNE_Z_DESC_BASE<"selnez", GPR64Opnd>;
@@ -104,6 +106,20 @@ class JIC64_DESC : JMP_IDX_COMPACT_DESC_BASE<"jic", jmpoffset16, GPR64Opnd,
class LL64_R6_DESC : LL_R6_DESC_BASE<"ll", GPR32Opnd, mem_simm9, II_LL>;
class SC64_R6_DESC : SC_R6_DESC_BASE<"sc", GPR32Opnd, II_SC>;
+
+class JR_HB64_R6_DESC : JR_HB_DESC_BASE<"jr.hb", GPR64Opnd> {
+ bit isBranch = 1;
+ bit isIndirectBranch = 1;
+ bit hasDelaySlot = 1;
+ bit isTerminator=1;
+ bit isBarrier=1;
+ bit isCTI = 1;
+ InstrItinClass Itinerary = II_JR_HB;
+}
+
+class CRC32D_DESC : CRC_DESC_BASE<"crc32d", GPR32Opnd, II_CRC32D>;
+class CRC32CD_DESC : CRC_DESC_BASE<"crc32cd", GPR32Opnd, II_CRC32CD>;
+
//===----------------------------------------------------------------------===//
//
// Instruction Definitions
@@ -136,6 +152,7 @@ def SCD_R6 : SCD_R6_ENC, SCD_R6_DESC, ISA_MIPS32R6;
let DecoderNamespace = "Mips32r6_64r6_GP64" in {
def SELEQZ64 : SELEQZ_ENC, SELEQZ64_DESC, ISA_MIPS32R6, GPR_64;
def SELNEZ64 : SELNEZ_ENC, SELNEZ64_DESC, ISA_MIPS32R6, GPR_64;
+ def JR_HB64_R6 : JR_HB_R6_ENC, JR_HB64_R6_DESC, ISA_MIPS32R6;
}
let AdditionalPredicates = [NotInMicroMips],
DecoderNamespace = "Mips32r6_64r6_PTR64" in {
@@ -163,6 +180,10 @@ let DecoderNamespace = "Mips32r6_64r6_BranchZero" in {
def BLTZC64 : BLTZC_ENC, BLTZC64_DESC, ISA_MIPS64R6, GPR_64;
def BGEZC64 : BGEZC_ENC, BGEZC64_DESC, ISA_MIPS64R6, GPR_64;
}
+let AdditionalPredicates = [NotInMicroMips] in {
+ def CRC32D : R6MMR6Rel, CRC32D_ENC, CRC32D_DESC, ISA_MIPS64R6, ASE_CRC;
+ def CRC32CD : R6MMR6Rel, CRC32CD_ENC, CRC32CD_DESC, ISA_MIPS64R6, ASE_CRC;
+}
//===----------------------------------------------------------------------===//
//
@@ -277,3 +298,37 @@ def : MipsPat<(select (i32 (setne i32:$cond, immz)), immz, i64:$f),
def : MipsPat<(select (i32 (seteq i32:$cond, immz)), immz, i64:$f),
(SELNEZ64 i64:$f, (SLL64_32 i32:$cond))>,
ISA_MIPS64R6;
+
+// Patterns used for matching away redundant sign extensions.
+// MIPS32 arithmetic instructions sign extend their result implicitly.
+def : MipsPat<(i64 (sext (i32 (sdiv GPR32:$src, GPR32:$src2)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (DIV GPR32:$src, GPR32:$src2), sub_32)>, ISA_MIPS64R6;
+def : MipsPat<(i64 (sext (i32 (udiv GPR32:$src, GPR32:$src2)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (DIVU GPR32:$src, GPR32:$src2), sub_32)>, ISA_MIPS64R6;
+def : MipsPat<(i64 (sext (i32 (srem GPR32:$src, GPR32:$src2)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (MOD GPR32:$src, GPR32:$src2), sub_32)>, ISA_MIPS64R6;
+def : MipsPat<(i64 (sext (i32 (urem GPR32:$src, GPR32:$src2)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (MODU GPR32:$src, GPR32:$src2), sub_32)>, ISA_MIPS64R6;
+
+// Pseudo instructions
+
+let AdditionalPredicates = [NotInMips16Mode, NotInMicroMips,
+ NoIndirectJumpGuards] in {
+ def TAILCALL64R6REG : TailCallRegR6<JALR64, ZERO_64, GPR64Opnd>, ISA_MIPS64R6;
+ def PseudoIndirectBranch64R6 : PseudoIndirectBranchBaseR6<JALR64, ZERO_64,
+ GPR64Opnd>,
+ ISA_MIPS64R6;
+}
+
+let AdditionalPredicates = [NotInMips16Mode, NotInMicroMips,
+ UseIndirectJumpsHazard] in {
+ def TAILCALLHB64R6REG : TailCallReg<JR_HB64_R6, GPR64Opnd>,
+ ISA_MIPS64R6;
+ def PseudoIndrectHazardBranch64R6 : PseudoIndirectBranchBase<JR_HB64_R6,
+ GPR64Opnd>,
+ ISA_MIPS64R6;
+}
diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
index f9de78dc281f..8ffc0731abcb 100644
--- a/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -160,6 +160,8 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
PrintDebugValueComment(MI, OS);
return;
}
+ if (MI->isDebugLabel())
+ return;
// If we just ended a constant pool, mark it as such.
if (InConstantPool && Opc != Mips::CONSTPOOL_ENTRY) {
@@ -499,6 +501,13 @@ bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
return true;
O << MO.getImm() - 1;
return false;
+ case 'y': // exact log2
+ if ((MO.getType()) != MachineOperand::MO_Immediate)
+ return true;
+ if (!isPowerOf2_64(MO.getImm()))
+ return true;
+ O << Log2_64(MO.getImm());
+ return false;
case 'z':
// $0 if zero, regular printing otherwise
if (MO.getType() == MachineOperand::MO_Immediate && MO.getImm() == 0) {
@@ -576,17 +585,27 @@ bool MipsAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
assert(OffsetMO.isImm() && "Unexpected offset for inline asm memory operand.");
int Offset = OffsetMO.getImm();
- // Currently we are expecting either no ExtraCode or 'D'
+ // Currently we are expecting either no ExtraCode or 'D','M','L'.
if (ExtraCode) {
- if (ExtraCode[0] == 'D')
+ switch (ExtraCode[0]) {
+ case 'D':
Offset += 4;
- else
+ break;
+ case 'M':
+ if (Subtarget->isLittle())
+ Offset += 4;
+ break;
+ case 'L':
+ if (!Subtarget->isLittle())
+ Offset += 4;
+ break;
+ default:
return true; // Unknown modifier.
- // FIXME: M = high order bits
- // FIXME: L = low order bits
+ }
}
- O << Offset << "($" << MipsInstPrinter::getRegisterName(BaseMO.getReg()) << ")";
+ O << Offset << "($" << MipsInstPrinter::getRegisterName(BaseMO.getReg())
+ << ")";
return false;
}
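// A minimal usage sketch for the 'M' and 'L' memory-operand modifiers handled
// above (hypothetical example): with a 64-bit memory operand, '%M' prints the
// address of the high-order word and '%L' the low-order word, regardless of
// endianness, so both halves can be loaded explicitly:
//
//   long long x;
//   int lo, hi;
//   asm("lw %0, %L2\n\t"
//       "lw %1, %M2"
//       : "=r"(lo), "=r"(hi)
//       : "m"(x));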
diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsBranchExpansion.cpp
index bbf2050ce1eb..af936e6fc96b 100644
--- a/lib/Target/Mips/MipsLongBranch.cpp
+++ b/lib/Target/Mips/MipsBranchExpansion.cpp
@@ -1,4 +1,4 @@
-//===- MipsLongBranch.cpp - Emit long branches ----------------------------===//
+//===----------------------- MipsBranchExpansion.cpp ----------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -6,11 +6,70 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-// This pass expands a branch or jump instruction into a long branch if its
-// offset is too large to fit into its immediate field.
-//
-// FIXME: Fix pc-region jump instructions which cross 256MB segment boundaries.
+/// \file
+///
+/// This pass does two things:
+/// - it expands a branch or jump instruction into a long branch if its offset
+/// is too large to fit into its immediate field,
+/// - it inserts nops to prevent forbidden slot hazards.
+///
+/// This pass combines the two tasks because performing either one can
+/// invalidate the result of the other.
+///
+/// An example: at first no branch needs expansion, but after inserting at
+/// least one nop somewhere in the code to prevent a forbidden slot hazard,
+/// the offset of some branches may go out of range. In that case it is
+/// necessary to check again whether any branch needs expansion. Conversely,
+/// expanding a branch may cause a control transfer instruction to appear in
+/// a forbidden slot, which is a hazard that must be fixed. This pass
+/// alternates between these two tasks until no changes are made. Only then
+/// can we be sure that all branches are expanded properly and no hazard
+/// situations exist.
+///
+/// Regarding branch expanding:
+///
+/// When a branch instruction like beqzc or bnezc has an offset that is too
+/// large to fit into its immediate field, it has to be expanded into another
+/// instruction or a series of instructions.
+///
+/// FIXME: Fix pc-region jump instructions which cross 256MB segment boundaries.
+/// TODO: Handle out of range bc, b (pseudo) instructions.
+///
+/// Regarding compact branch hazard prevention:
+///
+/// Hazards handled: forbidden slots for MIPSR6.
+///
+/// A forbidden slot hazard occurs when a compact branch instruction is executed
+/// and the adjacent instruction in memory is a control transfer instruction
+/// such as a branch or jump, ERET, ERETNC, DERET, WAIT, or PAUSE.
+///
+/// For example:
+///
+/// 0x8004 bnec a1,v0,<P+0x18>
+/// 0x8008 beqc a1,a2,<P+0x54>
+///
+/// In such cases, the processor is required to signal a Reserved Instruction
+/// exception.
+///
+/// Here, if the instruction at 0x8004 is executed, the processor will raise an
+/// exception as there is a control transfer instruction at 0x8008.
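+///
+/// The fix this pass applies is to fill the forbidden slot with a nop
+/// (sketch):
+///
+///    bnec   a1,v0,<target1>
+///    nop
+///    beqc   a1,a2,<target2>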
+///
+/// There are two sources of forbidden slot hazards:
+///
+/// A) A previous pass has created a compact branch directly.
+/// B) Transforming a delay slot branch into a compact branch. This case can
+/// be difficult to process as lookahead for hazards is insufficient, as
+/// backwards delay slot filling can also produce hazards in previously
+/// processed instructions.
+///
+/// In the future this pass can be extended (or a new pass created) to handle
+/// other pipeline hazards, such as various MIPS1 hazards, processor errata that
+/// require instruction reorganization, etc.
+///
+/// This pass has to run after the delay slot filler, as that pass can
+/// introduce pipeline hazards such as the compact branch hazard; hence the
+/// existing hazard recognizer is not suitable.
+///
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/MipsABIInfo.h"
@@ -30,6 +89,7 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugLoc.h"
@@ -37,76 +97,126 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
+#include <utility>
using namespace llvm;
-#define DEBUG_TYPE "mips-long-branch"
+#define DEBUG_TYPE "mips-branch-expansion"
+STATISTIC(NumInsertedNops, "Number of nops inserted");
STATISTIC(LongBranches, "Number of long branches.");
-static cl::opt<bool> SkipLongBranch(
- "skip-mips-long-branch",
- cl::init(false),
- cl::desc("MIPS: Skip long branch pass."),
- cl::Hidden);
+static cl::opt<bool>
+ SkipLongBranch("skip-mips-long-branch", cl::init(false),
+ cl::desc("MIPS: Skip branch expansion pass."), cl::Hidden);
-static cl::opt<bool> ForceLongBranch(
- "force-mips-long-branch",
- cl::init(false),
- cl::desc("MIPS: Expand all branches to long format."),
- cl::Hidden);
+static cl::opt<bool>
+ ForceLongBranch("force-mips-long-branch", cl::init(false),
+ cl::desc("MIPS: Expand all branches to long format."),
+ cl::Hidden);
namespace {
- using Iter = MachineBasicBlock::iterator;
- using ReverseIter = MachineBasicBlock::reverse_iterator;
+using Iter = MachineBasicBlock::iterator;
+using ReverseIter = MachineBasicBlock::reverse_iterator;
- struct MBBInfo {
- uint64_t Size = 0;
- uint64_t Address;
- bool HasLongBranch = false;
- MachineInstr *Br = nullptr;
+struct MBBInfo {
+ uint64_t Size = 0;
+ bool HasLongBranch = false;
+ MachineInstr *Br = nullptr;
+ MBBInfo() = default;
+};
- MBBInfo() = default;
- };
+class MipsBranchExpansion : public MachineFunctionPass {
+public:
+ static char ID;
- class MipsLongBranch : public MachineFunctionPass {
- public:
- static char ID;
+ MipsBranchExpansion() : MachineFunctionPass(ID), ABI(MipsABIInfo::Unknown()) {
+ initializeMipsBranchExpansionPass(*PassRegistry::getPassRegistry());
+ }
- MipsLongBranch()
- : MachineFunctionPass(ID), ABI(MipsABIInfo::Unknown()) {}
+ StringRef getPassName() const override {
+ return "Mips Branch Expansion Pass";
+ }
- StringRef getPassName() const override { return "Mips Long Branch"; }
+ bool runOnMachineFunction(MachineFunction &F) override;
- bool runOnMachineFunction(MachineFunction &F) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
- MachineFunctionProperties getRequiredProperties() const override {
- return MachineFunctionProperties().set(
- MachineFunctionProperties::Property::NoVRegs);
- }
+private:
+ void splitMBB(MachineBasicBlock *MBB);
+ void initMBBInfo();
+ int64_t computeOffset(const MachineInstr *Br);
+ void replaceBranch(MachineBasicBlock &MBB, Iter Br, const DebugLoc &DL,
+ MachineBasicBlock *MBBOpnd);
+ void expandToLongBranch(MBBInfo &Info);
+ bool handleForbiddenSlot();
+ bool handlePossibleLongBranch();
+
+ const MipsSubtarget *STI;
+ const MipsInstrInfo *TII;
+
+ MachineFunction *MFp;
+ SmallVector<MBBInfo, 16> MBBInfos;
+ bool IsPIC;
+ MipsABIInfo ABI;
+ unsigned LongBranchSeqSize;
+ bool ForceLongBranchFirstPass = false;
+};
+
+} // end of anonymous namespace
+
+char MipsBranchExpansion::ID = 0;
+
+INITIALIZE_PASS(MipsBranchExpansion, DEBUG_TYPE,
+ "Expand out of range branch instructions and prevent forbidden"
+ " slot hazards",
+ false, false)
+
+/// Returns a pass that expands out-of-range branch instructions and clears
+/// pipeline (forbidden slot) hazards.
+FunctionPass *llvm::createMipsBranchExpansion() {
+ return new MipsBranchExpansion();
+}
- private:
- void splitMBB(MachineBasicBlock *MBB);
- void initMBBInfo();
- int64_t computeOffset(const MachineInstr *Br);
- void replaceBranch(MachineBasicBlock &MBB, Iter Br, const DebugLoc &DL,
- MachineBasicBlock *MBBOpnd);
- void expandToLongBranch(MBBInfo &Info);
+// Find the next real instruction from the current position in the current
+// basic block.
+static Iter getNextMachineInstrInBB(Iter Position) {
+ Iter I = Position, E = Position->getParent()->end();
+ I = std::find_if_not(I, E,
+ [](const Iter &Insn) { return Insn->isTransient(); });
- MachineFunction *MF;
- SmallVector<MBBInfo, 16> MBBInfos;
- bool IsPIC;
- MipsABIInfo ABI;
- unsigned LongBranchSeqSize;
- };
+ return I;
+}
-} // end anonymous namespace
+// Find the next real instruction from the current position, looking through
+// basic block boundaries.
+static std::pair<Iter, bool> getNextMachineInstr(Iter Position,
+ MachineBasicBlock *Parent) {
+ if (Position == Parent->end()) {
+ do {
+ MachineBasicBlock *Succ = Parent->getNextNode();
+ if (Succ != nullptr && Parent->isSuccessor(Succ)) {
+ Position = Succ->begin();
+ Parent = Succ;
+ } else {
+ return std::make_pair(Position, true);
+ }
+ } while (Parent->empty());
+ }
-char MipsLongBranch::ID = 0;
+ Iter Instr = getNextMachineInstrInBB(Position);
+ if (Instr == Parent->end()) {
+ return getNextMachineInstr(Instr, Parent);
+ }
+ return std::make_pair(Instr, false);
+}
/// Iterate over list of Br's operands and search for a MachineBasicBlock
/// operand.
@@ -125,14 +235,14 @@ static MachineBasicBlock *getTargetMBB(const MachineInstr &Br) {
// found or it reaches E.
static ReverseIter getNonDebugInstr(ReverseIter B, const ReverseIter &E) {
for (; B != E; ++B)
- if (!B->isDebugValue())
+ if (!B->isDebugInstr())
return B;
return E;
}
// Split MBB if it has two direct jumps/branches.
-void MipsLongBranch::splitMBB(MachineBasicBlock *MBB) {
+void MipsBranchExpansion::splitMBB(MachineBasicBlock *MBB) {
ReverseIter End = MBB->rend();
ReverseIter LastBr = getNonDebugInstr(MBB->rbegin(), End);
@@ -153,7 +263,7 @@ void MipsLongBranch::splitMBB(MachineBasicBlock *MBB) {
// Create a new MBB. Move instructions in MBB to the newly created MBB.
MachineBasicBlock *NewMBB =
- MF->CreateMachineBasicBlock(MBB->getBasicBlock());
+ MFp->CreateMachineBasicBlock(MBB->getBasicBlock());
// Insert NewMBB and fix control flow.
MachineBasicBlock *Tgt = getTargetMBB(*FirstBr);
@@ -161,26 +271,24 @@ void MipsLongBranch::splitMBB(MachineBasicBlock *MBB) {
NewMBB->removeSuccessor(Tgt, true);
MBB->addSuccessor(NewMBB);
MBB->addSuccessor(Tgt);
- MF->insert(std::next(MachineFunction::iterator(MBB)), NewMBB);
+ MFp->insert(std::next(MachineFunction::iterator(MBB)), NewMBB);
NewMBB->splice(NewMBB->end(), MBB, LastBr.getReverse(), MBB->end());
}
// Fill MBBInfos.
-void MipsLongBranch::initMBBInfo() {
+void MipsBranchExpansion::initMBBInfo() {
// Split the MBBs if they have two branches. Each basic block should have at
// most one branch after this loop is executed.
- for (auto &MBB : *MF)
+ for (auto &MBB : *MFp)
splitMBB(&MBB);
- MF->RenumberBlocks();
+ MFp->RenumberBlocks();
MBBInfos.clear();
- MBBInfos.resize(MF->size());
+ MBBInfos.resize(MFp->size());
- const MipsInstrInfo *TII =
- static_cast<const MipsInstrInfo *>(MF->getSubtarget().getInstrInfo());
for (unsigned I = 0, E = MBBInfos.size(); I < E; ++I) {
- MachineBasicBlock *MBB = MF->getBlockNumbered(I);
+ MachineBasicBlock *MBB = MFp->getBlockNumbered(I);
// Compute size of MBB.
for (MachineBasicBlock::instr_iterator MI = MBB->instr_begin();
@@ -198,7 +306,7 @@ void MipsLongBranch::initMBBInfo() {
}
// Compute offset of branch in number of bytes.
-int64_t MipsLongBranch::computeOffset(const MachineInstr *Br) {
+int64_t MipsBranchExpansion::computeOffset(const MachineInstr *Br) {
int64_t Offset = 0;
int ThisMBB = Br->getParent()->getNumber();
int TargetMBB = getTargetMBB(*Br)->getNumber();
@@ -220,11 +328,9 @@ int64_t MipsLongBranch::computeOffset(const MachineInstr *Br) {
// Replace Br with a branch which has the opposite condition code and a
// MachineBasicBlock operand MBBOpnd.
-void MipsLongBranch::replaceBranch(MachineBasicBlock &MBB, Iter Br,
- const DebugLoc &DL,
- MachineBasicBlock *MBBOpnd) {
- const MipsInstrInfo *TII = static_cast<const MipsInstrInfo *>(
- MBB.getParent()->getSubtarget().getInstrInfo());
+void MipsBranchExpansion::replaceBranch(MachineBasicBlock &MBB, Iter Br,
+ const DebugLoc &DL,
+ MachineBasicBlock *MBBOpnd) {
unsigned NewOpc = TII->getOppositeBranchOpc(Br->getOpcode());
const MCInstrDesc &NewDesc = TII->get(NewOpc);
@@ -258,24 +364,20 @@ void MipsLongBranch::replaceBranch(MachineBasicBlock &MBB, Iter Br,
// currently assumes that all branches have 16-bit offsets, and will produce
// wrong code if branches whose allowed offsets are [-128, -126, ..., 126]
// are present.
-void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
+void MipsBranchExpansion::expandToLongBranch(MBBInfo &I) {
MachineBasicBlock::iterator Pos;
MachineBasicBlock *MBB = I.Br->getParent(), *TgtMBB = getTargetMBB(*I.Br);
DebugLoc DL = I.Br->getDebugLoc();
const BasicBlock *BB = MBB->getBasicBlock();
MachineFunction::iterator FallThroughMBB = ++MachineFunction::iterator(MBB);
- MachineBasicBlock *LongBrMBB = MF->CreateMachineBasicBlock(BB);
- const MipsSubtarget &Subtarget =
- static_cast<const MipsSubtarget &>(MF->getSubtarget());
- const MipsInstrInfo *TII =
- static_cast<const MipsInstrInfo *>(Subtarget.getInstrInfo());
+ MachineBasicBlock *LongBrMBB = MFp->CreateMachineBasicBlock(BB);
- MF->insert(FallThroughMBB, LongBrMBB);
+ MFp->insert(FallThroughMBB, LongBrMBB);
MBB->replaceSuccessor(TgtMBB, LongBrMBB);
if (IsPIC) {
- MachineBasicBlock *BalTgtMBB = MF->CreateMachineBasicBlock(BB);
- MF->insert(FallThroughMBB, BalTgtMBB);
+ MachineBasicBlock *BalTgtMBB = MFp->CreateMachineBasicBlock(BB);
+ MFp->insert(FallThroughMBB, BalTgtMBB);
LongBrMBB->addSuccessor(BalTgtMBB);
BalTgtMBB->addSuccessor(TgtMBB);
@@ -283,9 +385,9 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
// instruction) and the pre-MIPS32r6/MIPS64r6 definition (which is an
// pseudo-instruction wrapping BGEZAL).
const unsigned BalOp =
- Subtarget.hasMips32r6()
- ? Subtarget.inMicroMipsMode() ? Mips::BALC_MMR6 : Mips::BALC
- : Mips::BAL_BR;
+ STI->hasMips32r6()
+ ? STI->inMicroMipsMode() ? Mips::BALC_MMR6 : Mips::BALC
+ : STI->inMicroMipsMode() ? Mips::BAL_BR_MM : Mips::BAL_BR;
if (!ABI.IsN64()) {
// Pre R6:
@@ -320,9 +422,12 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
Pos = LongBrMBB->begin();
BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP)
- .addReg(Mips::SP).addImm(-8);
- BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::SW)).addReg(Mips::RA)
- .addReg(Mips::SP).addImm(0);
+ .addReg(Mips::SP)
+ .addImm(-8);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::SW))
+ .addReg(Mips::RA)
+ .addReg(Mips::SP)
+ .addImm(0);
// LUi and ADDiu instructions create 32-bit offset of the target basic
// block from the target of BAL(C) instruction. We cannot use immediate
@@ -341,16 +446,17 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
// operands to lowered instructions.
BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_LUi), Mips::AT)
- .addMBB(TgtMBB).addMBB(BalTgtMBB);
+ .addMBB(TgtMBB, MipsII::MO_ABS_HI)
+ .addMBB(BalTgtMBB);
MachineInstrBuilder BalInstr =
- BuildMI(*MF, DL, TII->get(BalOp)).addMBB(BalTgtMBB);
+ BuildMI(*MFp, DL, TII->get(BalOp)).addMBB(BalTgtMBB);
MachineInstrBuilder ADDiuInstr =
- BuildMI(*MF, DL, TII->get(Mips::LONG_BRANCH_ADDiu), Mips::AT)
+ BuildMI(*MFp, DL, TII->get(Mips::LONG_BRANCH_ADDiu), Mips::AT)
.addReg(Mips::AT)
- .addMBB(TgtMBB)
+ .addMBB(TgtMBB, MipsII::MO_ABS_LO)
.addMBB(BalTgtMBB);
- if (Subtarget.hasMips32r6()) {
+ if (STI->hasMips32r6()) {
LongBrMBB->insert(Pos, ADDiuInstr);
LongBrMBB->insert(Pos, BalInstr);
} else {
@@ -362,30 +468,38 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
Pos = BalTgtMBB->begin();
BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDu), Mips::AT)
- .addReg(Mips::RA).addReg(Mips::AT);
+ .addReg(Mips::RA)
+ .addReg(Mips::AT);
BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LW), Mips::RA)
- .addReg(Mips::SP).addImm(0);
- if (Subtarget.isTargetNaCl())
+ .addReg(Mips::SP)
+ .addImm(0);
+ if (STI->isTargetNaCl())
// Bundle-align the target of indirect branch JR.
TgtMBB->setAlignment(MIPS_NACL_BUNDLE_ALIGN);
// In NaCl, modifying the sp is not allowed in branch delay slot.
// For MIPS32R6, we can skip using a delay slot branch.
- if (Subtarget.isTargetNaCl() || Subtarget.hasMips32r6())
+ if (STI->isTargetNaCl() ||
+ (STI->hasMips32r6() && !STI->useIndirectJumpsHazard()))
BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP)
- .addReg(Mips::SP).addImm(8);
+ .addReg(Mips::SP)
+ .addImm(8);
- if (Subtarget.hasMips32r6()) {
+ if (STI->hasMips32r6() && !STI->useIndirectJumpsHazard()) {
const unsigned JICOp =
- Subtarget.inMicroMipsMode() ? Mips::JIC_MMR6 : Mips::JIC;
+ STI->inMicroMipsMode() ? Mips::JIC_MMR6 : Mips::JIC;
BuildMI(*BalTgtMBB, Pos, DL, TII->get(JICOp))
.addReg(Mips::AT)
.addImm(0);
} else {
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JR)).addReg(Mips::AT);
+ unsigned JROp =
+ STI->useIndirectJumpsHazard()
+ ? (STI->hasMips32r6() ? Mips::JR_HB_R6 : Mips::JR_HB)
+ : Mips::JR;
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(JROp)).addReg(Mips::AT);
- if (Subtarget.isTargetNaCl()) {
+ if (STI->isTargetNaCl()) {
BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::NOP));
} else
BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP)
@@ -443,23 +557,29 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
Pos = LongBrMBB->begin();
BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::SP_64)
- .addReg(Mips::SP_64).addImm(-16);
- BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::SD)).addReg(Mips::RA_64)
- .addReg(Mips::SP_64).addImm(0);
+ .addReg(Mips::SP_64)
+ .addImm(-16);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::SD))
+ .addReg(Mips::RA_64)
+ .addReg(Mips::SP_64)
+ .addImm(0);
BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_DADDiu),
- Mips::AT_64).addReg(Mips::ZERO_64)
- .addMBB(TgtMBB, MipsII::MO_ABS_HI).addMBB(BalTgtMBB);
+ Mips::AT_64)
+ .addReg(Mips::ZERO_64)
+ .addMBB(TgtMBB, MipsII::MO_ABS_HI)
+ .addMBB(BalTgtMBB);
BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DSLL), Mips::AT_64)
- .addReg(Mips::AT_64).addImm(16);
+ .addReg(Mips::AT_64)
+ .addImm(16);
MachineInstrBuilder BalInstr =
- BuildMI(*MF, DL, TII->get(BalOp)).addMBB(BalTgtMBB);
+ BuildMI(*MFp, DL, TII->get(BalOp)).addMBB(BalTgtMBB);
MachineInstrBuilder DADDiuInstr =
- BuildMI(*MF, DL, TII->get(Mips::LONG_BRANCH_DADDiu), Mips::AT_64)
+ BuildMI(*MFp, DL, TII->get(Mips::LONG_BRANCH_DADDiu), Mips::AT_64)
.addReg(Mips::AT_64)
.addMBB(TgtMBB, MipsII::MO_ABS_LO)
.addMBB(BalTgtMBB);
- if (Subtarget.hasMips32r6()) {
+ if (STI->hasMips32r6()) {
LongBrMBB->insert(Pos, DADDiuInstr);
LongBrMBB->insert(Pos, BalInstr);
} else {
@@ -471,11 +591,13 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
Pos = BalTgtMBB->begin();
BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDu), Mips::AT_64)
- .addReg(Mips::RA_64).addReg(Mips::AT_64);
+ .addReg(Mips::RA_64)
+ .addReg(Mips::AT_64);
BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LD), Mips::RA_64)
- .addReg(Mips::SP_64).addImm(0);
+ .addReg(Mips::SP_64)
+ .addImm(0);
- if (Subtarget.hasMips64r6()) {
+ if (STI->hasMips64r6() && !STI->useIndirectJumpsHazard()) {
BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::SP_64)
.addReg(Mips::SP_64)
.addImm(16);
@@ -483,7 +605,11 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
.addReg(Mips::AT_64)
.addImm(0);
} else {
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JR64)).addReg(Mips::AT_64);
+ unsigned JROp =
+ STI->useIndirectJumpsHazard()
+ ? (STI->hasMips32r6() ? Mips::JR_HB64_R6 : Mips::JR_HB64)
+ : Mips::JR64;
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(JROp)).addReg(Mips::AT_64);
BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::SP_64)
.addReg(Mips::SP_64)
.addImm(16);
@@ -501,14 +627,14 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
//
Pos = LongBrMBB->begin();
LongBrMBB->addSuccessor(TgtMBB);
- if (Subtarget.hasMips32r6())
+ if (STI->hasMips32r6())
BuildMI(*LongBrMBB, Pos, DL,
- TII->get(Subtarget.inMicroMipsMode() ? Mips::BC_MMR6 : Mips::BC))
+ TII->get(STI->inMicroMipsMode() ? Mips::BC_MMR6 : Mips::BC))
.addMBB(TgtMBB);
else
MIBundleBuilder(*LongBrMBB, Pos)
- .append(BuildMI(*MF, DL, TII->get(Mips::J)).addMBB(TgtMBB))
- .append(BuildMI(*MF, DL, TII->get(Mips::NOP)));
+ .append(BuildMI(*MFp, DL, TII->get(Mips::J)).addMBB(TgtMBB))
+ .append(BuildMI(*MFp, DL, TII->get(Mips::NOP)));
assert(LongBrMBB->size() == LongBranchSeqSize);
}
@@ -528,35 +654,66 @@ static void emitGPDisp(MachineFunction &F, const MipsInstrInfo *TII) {
MachineBasicBlock::iterator I = MBB.begin();
DebugLoc DL = MBB.findDebugLoc(MBB.begin());
BuildMI(MBB, I, DL, TII->get(Mips::LUi), Mips::V0)
- .addExternalSymbol("_gp_disp", MipsII::MO_ABS_HI);
+ .addExternalSymbol("_gp_disp", MipsII::MO_ABS_HI);
BuildMI(MBB, I, DL, TII->get(Mips::ADDiu), Mips::V0)
- .addReg(Mips::V0).addExternalSymbol("_gp_disp", MipsII::MO_ABS_LO);
+ .addReg(Mips::V0)
+ .addExternalSymbol("_gp_disp", MipsII::MO_ABS_LO);
MBB.removeLiveIn(Mips::V0);
}
-bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
- const MipsSubtarget &STI =
- static_cast<const MipsSubtarget &>(F.getSubtarget());
- const MipsInstrInfo *TII =
- static_cast<const MipsInstrInfo *>(STI.getInstrInfo());
+bool MipsBranchExpansion::handleForbiddenSlot() {
+ // Forbidden slot hazards are only defined for MIPSR6 but not microMIPSR6.
+ if (!STI->hasMips32r6() || STI->inMicroMipsMode())
+ return false;
- const TargetMachine& TM = F.getTarget();
- IsPIC = TM.isPositionIndependent();
- ABI = static_cast<const MipsTargetMachine &>(TM).getABI();
+ const MipsInstrInfo *TII = STI->getInstrInfo();
+
+ bool Changed = false;
+
+ for (MachineFunction::iterator FI = MFp->begin(); FI != MFp->end(); ++FI) {
+ for (Iter I = FI->begin(); I != FI->end(); ++I) {
+
+ // Forbidden slot hazard handling. Use lookahead over state.
+ if (!TII->HasForbiddenSlot(*I))
+ continue;
+
+ Iter Inst;
+ bool LastInstInFunction =
+ std::next(I) == FI->end() && std::next(FI) == MFp->end();
+ if (!LastInstInFunction) {
+ std::pair<Iter, bool> Res = getNextMachineInstr(std::next(I), &*FI);
+ LastInstInFunction |= Res.second;
+ Inst = Res.first;
+ }
+
+ if (LastInstInFunction || !TII->SafeInForbiddenSlot(*Inst)) {
+
+ MachineBasicBlock::instr_iterator Iit = I->getIterator();
+ if (std::next(Iit) == FI->end() ||
+ std::next(Iit)->getOpcode() != Mips::NOP) {
+ Changed = true;
+ MIBundleBuilder(&*I).append(
+ BuildMI(*MFp, I->getDebugLoc(), TII->get(Mips::NOP)));
+ NumInsertedNops++;
+ }
+ }
+ }
+ }
- LongBranchSeqSize = IsPIC ? ((ABI.IsN64() || STI.isTargetNaCl()) ? 10 : 9)
- : (STI.hasMips32r6() ? 1 : 2);
+ return Changed;
+}
- if (STI.inMips16Mode() || !STI.enableLongBranchPass())
+bool MipsBranchExpansion::handlePossibleLongBranch() {
+
+ LongBranchSeqSize = IsPIC ? ((ABI.IsN64() || STI->isTargetNaCl()) ? 10 : 9)
+ : (STI->hasMips32r6() ? 1 : 2);
+
+ if (STI->inMips16Mode() || !STI->enableLongBranchPass())
return false;
- if (IsPIC && static_cast<const MipsTargetMachine &>(TM).getABI().IsO32() &&
- F.getInfo<MipsFunctionInfo>()->globalBaseRegSet())
- emitGPDisp(F, TII);
if (SkipLongBranch)
- return true;
+ return false;
- MF = &F;
initMBBInfo();
SmallVectorImpl<MBBInfo>::iterator I, E = MBBInfos.end();
@@ -571,10 +728,9 @@ bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
if (!I->Br || I->HasLongBranch)
continue;
- int ShVal = STI.inMicroMipsMode() ? 2 : 4;
- int64_t Offset = computeOffset(I->Br) / ShVal;
+ int64_t Offset = computeOffset(I->Br);
- if (STI.isTargetNaCl()) {
+ if (STI->isTargetNaCl()) {
// The offset calculation does not include sandboxing instructions
// that will be added later in the MC layer. Since at this point we
// don't know the exact amount of code that "sandboxing" will add, we
@@ -582,8 +738,9 @@ bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
Offset *= 2;
}
- // Check if offset fits into 16-bit immediate field of branches.
- if (!ForceLongBranch && isInt<16>(Offset))
+ // Check if offset fits into the immediate field of the branch.
+ if (!ForceLongBranchFirstPass &&
+ TII->isBranchOffsetInRange(I->Br->getOpcode(), Offset))
continue;
I->HasLongBranch = true;
@@ -593,27 +750,49 @@ bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
}
}
- if (!EverMadeChange)
- return true;
-
- // Compute basic block addresses.
- if (IsPIC) {
- uint64_t Address = 0;
+ ForceLongBranchFirstPass = false;
- for (I = MBBInfos.begin(); I != E; Address += I->Size, ++I)
- I->Address = Address;
- }
+ if (!EverMadeChange)
+ return false;
// Do the expansion.
for (I = MBBInfos.begin(); I != E; ++I)
- if (I->HasLongBranch)
+ if (I->HasLongBranch) {
expandToLongBranch(*I);
+ }
- MF->RenumberBlocks();
+ MFp->RenumberBlocks();
return true;
}
-/// createMipsLongBranchPass - Returns a pass that converts branches to long
-/// branches.
-FunctionPass *llvm::createMipsLongBranchPass() { return new MipsLongBranch(); }
+bool MipsBranchExpansion::runOnMachineFunction(MachineFunction &MF) {
+ const TargetMachine &TM = MF.getTarget();
+ IsPIC = TM.isPositionIndependent();
+ ABI = static_cast<const MipsTargetMachine &>(TM).getABI();
+ STI = &static_cast<const MipsSubtarget &>(MF.getSubtarget());
+ TII = static_cast<const MipsInstrInfo *>(STI->getInstrInfo());
+
+ if (IsPIC && ABI.IsO32() &&
+ MF.getInfo<MipsFunctionInfo>()->globalBaseRegSet())
+ emitGPDisp(MF, TII);
+
+ MFp = &MF;
+
+ ForceLongBranchFirstPass = ForceLongBranch;
+ // Run these two at least once
+ bool longBranchChanged = handlePossibleLongBranch();
+ bool forbiddenSlotChanged = handleForbiddenSlot();
+
+ bool Changed = longBranchChanged || forbiddenSlotChanged;
+
+  // Then run them alternately while there are changes.
+ while (forbiddenSlotChanged) {
+ longBranchChanged = handlePossibleLongBranch();
+ if (!longBranchChanged)
+ break;
+ forbiddenSlotChanged = handleForbiddenSlot();
+ }
+
+ return Changed;
+}
diff --git a/lib/Target/Mips/MipsCallLowering.cpp b/lib/Target/Mips/MipsCallLowering.cpp
new file mode 100644
index 000000000000..e82f62260b3f
--- /dev/null
+++ b/lib/Target/Mips/MipsCallLowering.cpp
@@ -0,0 +1,441 @@
+//===- MipsCallLowering.cpp -------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file implements the lowering of LLVM calls to machine code calls for
+/// GlobalISel.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsCallLowering.h"
+#include "MipsCCState.h"
+#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+
+using namespace llvm;
+
+MipsCallLowering::MipsCallLowering(const MipsTargetLowering &TLI)
+ : CallLowering(&TLI) {}
+
+bool MipsCallLowering::MipsHandler::assign(const CCValAssign &VA,
+ unsigned vreg) {
+ if (VA.isRegLoc()) {
+ assignValueToReg(vreg, VA.getLocReg());
+ } else if (VA.isMemLoc()) {
+ unsigned Size = alignTo(VA.getValVT().getSizeInBits(), 8) / 8;
+ unsigned Offset = VA.getLocMemOffset();
+ MachinePointerInfo MPO;
+ unsigned StackAddr = getStackAddress(Size, Offset, MPO);
+ assignValueToAddress(vreg, StackAddr, Size, MPO);
+ } else {
+ return false;
+ }
+ return true;
+}
+
+namespace {
+class IncomingValueHandler : public MipsCallLowering::MipsHandler {
+public:
+ IncomingValueHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI)
+ : MipsHandler(MIRBuilder, MRI) {}
+
+ bool handle(ArrayRef<CCValAssign> ArgLocs,
+ ArrayRef<CallLowering::ArgInfo> Args);
+
+private:
+ void assignValueToReg(unsigned ValVReg, unsigned PhysReg) override;
+
+ unsigned getStackAddress(uint64_t Size, int64_t Offset,
+ MachinePointerInfo &MPO) override;
+
+ void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
+ MachinePointerInfo &MPO) override;
+
+ virtual void markPhysRegUsed(unsigned PhysReg) {
+ MIRBuilder.getMBB().addLiveIn(PhysReg);
+ }
+
+ void buildLoad(unsigned Val, unsigned Addr, uint64_t Size, unsigned Alignment,
+ MachinePointerInfo &MPO) {
+ MachineMemOperand *MMO = MIRBuilder.getMF().getMachineMemOperand(
+ MPO, MachineMemOperand::MOLoad, Size, Alignment);
+ MIRBuilder.buildLoad(Val, Addr, *MMO);
+ }
+};
+
+class CallReturnHandler : public IncomingValueHandler {
+public:
+ CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ MachineInstrBuilder &MIB)
+ : IncomingValueHandler(MIRBuilder, MRI), MIB(MIB) {}
+
+private:
+ void markPhysRegUsed(unsigned PhysReg) override {
+ MIB.addDef(PhysReg, RegState::Implicit);
+ }
+
+ MachineInstrBuilder &MIB;
+};
+
+} // end anonymous namespace
+
+void IncomingValueHandler::assignValueToReg(unsigned ValVReg,
+ unsigned PhysReg) {
+ MIRBuilder.buildCopy(ValVReg, PhysReg);
+ markPhysRegUsed(PhysReg);
+}
+
+unsigned IncomingValueHandler::getStackAddress(uint64_t Size, int64_t Offset,
+ MachinePointerInfo &MPO) {
+ MachineFrameInfo &MFI = MIRBuilder.getMF().getFrameInfo();
+
+ int FI = MFI.CreateFixedObject(Size, Offset, true);
+ MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
+
+ unsigned AddrReg = MRI.createGenericVirtualRegister(LLT::pointer(0, 32));
+ MIRBuilder.buildFrameIndex(AddrReg, FI);
+
+ return AddrReg;
+}
+
+void IncomingValueHandler::assignValueToAddress(unsigned ValVReg, unsigned Addr,
+ uint64_t Size,
+ MachinePointerInfo &MPO) {
+ // If the value is not extended, a simple load will suffice.
+ buildLoad(ValVReg, Addr, Size, /* Alignment */ 0, MPO);
+}
+
+bool IncomingValueHandler::handle(ArrayRef<CCValAssign> ArgLocs,
+ ArrayRef<CallLowering::ArgInfo> Args) {
+ for (unsigned i = 0, ArgsSize = Args.size(); i < ArgsSize; ++i) {
+ if (!assign(ArgLocs[i], Args[i].Reg))
+ return false;
+ }
+ return true;
+}
+
+namespace {
+class OutgoingValueHandler : public MipsCallLowering::MipsHandler {
+public:
+ OutgoingValueHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ MachineInstrBuilder &MIB)
+ : MipsHandler(MIRBuilder, MRI), MIB(MIB) {}
+
+ bool handle(ArrayRef<CCValAssign> ArgLocs,
+ ArrayRef<CallLowering::ArgInfo> Args);
+
+private:
+ void assignValueToReg(unsigned ValVReg, unsigned PhysReg) override;
+
+ unsigned getStackAddress(uint64_t Size, int64_t Offset,
+ MachinePointerInfo &MPO) override;
+
+ void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
+ MachinePointerInfo &MPO) override;
+
+ MachineInstrBuilder &MIB;
+};
+} // end anonymous namespace
+
+void OutgoingValueHandler::assignValueToReg(unsigned ValVReg,
+ unsigned PhysReg) {
+ MIRBuilder.buildCopy(PhysReg, ValVReg);
+ MIB.addUse(PhysReg, RegState::Implicit);
+}
+
+unsigned OutgoingValueHandler::getStackAddress(uint64_t Size, int64_t Offset,
+ MachinePointerInfo &MPO) {
+ LLT p0 = LLT::pointer(0, 32);
+ LLT s32 = LLT::scalar(32);
+ unsigned SPReg = MRI.createGenericVirtualRegister(p0);
+ MIRBuilder.buildCopy(SPReg, Mips::SP);
+
+ unsigned OffsetReg = MRI.createGenericVirtualRegister(s32);
+ MIRBuilder.buildConstant(OffsetReg, Offset);
+
+ unsigned AddrReg = MRI.createGenericVirtualRegister(p0);
+ MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg);
+
+ MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset);
+ return AddrReg;
+}
+
+void OutgoingValueHandler::assignValueToAddress(unsigned ValVReg, unsigned Addr,
+ uint64_t Size,
+ MachinePointerInfo &MPO) {
+ MachineMemOperand *MMO = MIRBuilder.getMF().getMachineMemOperand(
+ MPO, MachineMemOperand::MOStore, Size, /* Alignment */ 0);
+ MIRBuilder.buildStore(ValVReg, Addr, *MMO);
+}
+
+bool OutgoingValueHandler::handle(ArrayRef<CCValAssign> ArgLocs,
+ ArrayRef<CallLowering::ArgInfo> Args) {
+ for (unsigned i = 0; i < Args.size(); ++i) {
+ if (!assign(ArgLocs[i], Args[i].Reg))
+ return false;
+ }
+ return true;
+}
+
+static bool isSupportedType(Type *T) {
+ if (T->isIntegerTy() && T->getScalarSizeInBits() == 32)
+ return true;
+ if (T->isPointerTy())
+ return true;
+ return false;
+}
+
+bool MipsCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
+ const Value *Val, unsigned VReg) const {
+
+ MachineInstrBuilder Ret = MIRBuilder.buildInstrNoInsert(Mips::RetRA);
+
+ if (Val != nullptr) {
+ if (!isSupportedType(Val->getType()))
+ return false;
+
+ MachineFunction &MF = MIRBuilder.getMF();
+ const Function &F = MF.getFunction();
+ const DataLayout &DL = MF.getDataLayout();
+ const MipsTargetLowering &TLI = *getTLI<MipsTargetLowering>();
+
+ SmallVector<ArgInfo, 8> RetInfos;
+ SmallVector<unsigned, 8> OrigArgIndices;
+
+ ArgInfo ArgRetInfo(VReg, Val->getType());
+ setArgFlags(ArgRetInfo, AttributeList::ReturnIndex, DL, F);
+ splitToValueTypes(ArgRetInfo, 0, RetInfos, OrigArgIndices);
+
+ SmallVector<ISD::OutputArg, 8> Outs;
+ subTargetRegTypeForCallingConv(
+ MIRBuilder, RetInfos, OrigArgIndices,
+ [&](ISD::ArgFlagsTy flags, EVT vt, EVT argvt, bool used,
+ unsigned origIdx, unsigned partOffs) {
+ Outs.emplace_back(flags, vt, argvt, used, origIdx, partOffs);
+ });
+
+ SmallVector<CCValAssign, 16> ArgLocs;
+ MipsCCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs,
+ F.getContext());
+ CCInfo.AnalyzeReturn(Outs, TLI.CCAssignFnForReturn());
+
+ OutgoingValueHandler RetHandler(MIRBuilder, MF.getRegInfo(), Ret);
+ if (!RetHandler.handle(ArgLocs, RetInfos)) {
+ return false;
+ }
+ }
+ MIRBuilder.insertInstr(Ret);
+ return true;
+}
+
+bool MipsCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
+ const Function &F,
+ ArrayRef<unsigned> VRegs) const {
+
+ // Quick exit if there aren't any args.
+ if (F.arg_empty())
+ return true;
+
+ if (F.isVarArg()) {
+ return false;
+ }
+
+ for (auto &Arg : F.args()) {
+ if (!isSupportedType(Arg.getType()))
+ return false;
+ }
+
+ MachineFunction &MF = MIRBuilder.getMF();
+ const DataLayout &DL = MF.getDataLayout();
+ const MipsTargetLowering &TLI = *getTLI<MipsTargetLowering>();
+
+ SmallVector<ArgInfo, 8> ArgInfos;
+ SmallVector<unsigned, 8> OrigArgIndices;
+ unsigned i = 0;
+ for (auto &Arg : F.args()) {
+ ArgInfo AInfo(VRegs[i], Arg.getType());
+ setArgFlags(AInfo, i + AttributeList::FirstArgIndex, DL, F);
+ splitToValueTypes(AInfo, i, ArgInfos, OrigArgIndices);
+ ++i;
+ }
+
+ SmallVector<ISD::InputArg, 8> Ins;
+ subTargetRegTypeForCallingConv(
+ MIRBuilder, ArgInfos, OrigArgIndices,
+ [&](ISD::ArgFlagsTy flags, EVT vt, EVT argvt, bool used, unsigned origIdx,
+ unsigned partOffs) {
+ Ins.emplace_back(flags, vt, argvt, used, origIdx, partOffs);
+ });
+
+ SmallVector<CCValAssign, 16> ArgLocs;
+ MipsCCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs,
+ F.getContext());
+
+ const MipsTargetMachine &TM =
+ static_cast<const MipsTargetMachine &>(MF.getTarget());
+ const MipsABIInfo &ABI = TM.getABI();
+ CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(F.getCallingConv()),
+ 1);
+ CCInfo.AnalyzeFormalArguments(Ins, TLI.CCAssignFnForCall());
+
+ IncomingValueHandler Handler(MIRBuilder, MF.getRegInfo());
+ if (!Handler.handle(ArgLocs, ArgInfos))
+ return false;
+
+ return true;
+}
+
+bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
+ CallingConv::ID CallConv,
+ const MachineOperand &Callee,
+ const ArgInfo &OrigRet,
+ ArrayRef<ArgInfo> OrigArgs) const {
+
+ if (CallConv != CallingConv::C)
+ return false;
+
+ for (auto &Arg : OrigArgs) {
+ if (!isSupportedType(Arg.Ty))
+ return false;
+ if (Arg.Flags.isByVal() || Arg.Flags.isSRet())
+ return false;
+ }
+ if (OrigRet.Reg && !isSupportedType(OrigRet.Ty))
+ return false;
+
+ MachineFunction &MF = MIRBuilder.getMF();
+ const Function &F = MF.getFunction();
+ const MipsTargetLowering &TLI = *getTLI<MipsTargetLowering>();
+ const MipsTargetMachine &TM =
+ static_cast<const MipsTargetMachine &>(MF.getTarget());
+ const MipsABIInfo &ABI = TM.getABI();
+
+ MachineInstrBuilder CallSeqStart =
+ MIRBuilder.buildInstr(Mips::ADJCALLSTACKDOWN);
+
+ // FIXME: Add support for pic calling sequences, long call sequences for O32,
+ // N32 and N64. First handle the case when Callee.isReg().
+ if (Callee.isReg())
+ return false;
+
+ MachineInstrBuilder MIB = MIRBuilder.buildInstrNoInsert(Mips::JAL);
+ MIB.addDef(Mips::SP, RegState::Implicit);
+ MIB.add(Callee);
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ MIB.addRegMask(TRI->getCallPreservedMask(MF, F.getCallingConv()));
+
+ TargetLowering::ArgListTy FuncOrigArgs;
+ FuncOrigArgs.reserve(OrigArgs.size());
+
+ SmallVector<ArgInfo, 8> ArgInfos;
+ SmallVector<unsigned, 8> OrigArgIndices;
+ unsigned i = 0;
+ for (auto &Arg : OrigArgs) {
+
+ TargetLowering::ArgListEntry Entry;
+ Entry.Ty = Arg.Ty;
+ FuncOrigArgs.push_back(Entry);
+
+ splitToValueTypes(Arg, i, ArgInfos, OrigArgIndices);
+ ++i;
+ }
+
+ SmallVector<ISD::OutputArg, 8> Outs;
+ subTargetRegTypeForCallingConv(
+ MIRBuilder, ArgInfos, OrigArgIndices,
+ [&](ISD::ArgFlagsTy flags, EVT vt, EVT argvt, bool used, unsigned origIdx,
+ unsigned partOffs) {
+ Outs.emplace_back(flags, vt, argvt, used, origIdx, partOffs);
+ });
+
+ SmallVector<CCValAssign, 8> ArgLocs;
+ MipsCCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs,
+ F.getContext());
+
+ CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(CallConv), 1);
+ const char *Call = Callee.isSymbol() ? Callee.getSymbolName() : nullptr;
+ CCInfo.AnalyzeCallOperands(Outs, TLI.CCAssignFnForCall(), FuncOrigArgs, Call);
+
+ OutgoingValueHandler RetHandler(MIRBuilder, MF.getRegInfo(), MIB);
+ if (!RetHandler.handle(ArgLocs, ArgInfos)) {
+ return false;
+ }
+
+ unsigned NextStackOffset = CCInfo.getNextStackOffset();
+ const TargetFrameLowering *TFL = MF.getSubtarget().getFrameLowering();
+ unsigned StackAlignment = TFL->getStackAlignment();
+ NextStackOffset = alignTo(NextStackOffset, StackAlignment);
+ CallSeqStart.addImm(NextStackOffset).addImm(0);
+
+ MIRBuilder.insertInstr(MIB);
+
+ if (OrigRet.Reg) {
+
+ ArgInfos.clear();
+ SmallVector<unsigned, 8> OrigRetIndices;
+
+ splitToValueTypes(OrigRet, 0, ArgInfos, OrigRetIndices);
+
+ SmallVector<ISD::InputArg, 8> Ins;
+ subTargetRegTypeForCallingConv(
+ MIRBuilder, ArgInfos, OrigRetIndices,
+ [&](ISD::ArgFlagsTy flags, EVT vt, EVT argvt, bool used,
+ unsigned origIdx, unsigned partOffs) {
+ Ins.emplace_back(flags, vt, argvt, used, origIdx, partOffs);
+ });
+
+ SmallVector<CCValAssign, 8> ArgLocs;
+ MipsCCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs,
+ F.getContext());
+
+ CCInfo.AnalyzeCallResult(Ins, TLI.CCAssignFnForReturn(), OrigRet.Ty, Call);
+
+ CallReturnHandler Handler(MIRBuilder, MF.getRegInfo(), MIB);
+ if (!Handler.handle(ArgLocs, ArgInfos))
+ return false;
+ }
+
+ MIRBuilder.buildInstr(Mips::ADJCALLSTACKUP).addImm(NextStackOffset).addImm(0);
+
+ return true;
+}
+
+void MipsCallLowering::subTargetRegTypeForCallingConv(
+ MachineIRBuilder &MIRBuilder, ArrayRef<ArgInfo> Args,
+ ArrayRef<unsigned> OrigArgIndices, const FunTy &PushBack) const {
+ MachineFunction &MF = MIRBuilder.getMF();
+ const Function &F = MF.getFunction();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const MipsTargetLowering &TLI = *getTLI<MipsTargetLowering>();
+
+ unsigned ArgNo = 0;
+ for (auto &Arg : Args) {
+
+ EVT VT = TLI.getValueType(DL, Arg.Ty);
+ MVT RegisterVT = TLI.getRegisterTypeForCallingConv(F.getContext(), VT);
+
+ ISD::ArgFlagsTy Flags = Arg.Flags;
+ Flags.setOrigAlign(TLI.getABIAlignmentForCallingConv(Arg.Ty, DL));
+
+ PushBack(Flags, RegisterVT, VT, true, OrigArgIndices[ArgNo], 0);
+
+ ++ArgNo;
+ }
+}
+
+void MipsCallLowering::splitToValueTypes(
+ const ArgInfo &OrigArg, unsigned OriginalIndex,
+ SmallVectorImpl<ArgInfo> &SplitArgs,
+ SmallVectorImpl<unsigned> &SplitArgsOrigIndices) const {
+
+  // TODO: Perform structure and array splitting. For now we only deal with
+  // types that pass the isSupportedType check.
+ SplitArgs.push_back(OrigArg);
+ SplitArgsOrigIndices.push_back(OriginalIndex);
+}
diff --git a/lib/Target/Mips/MipsCallLowering.h b/lib/Target/Mips/MipsCallLowering.h
new file mode 100644
index 000000000000..e23c10cec563
--- /dev/null
+++ b/lib/Target/Mips/MipsCallLowering.h
@@ -0,0 +1,86 @@
+//===- MipsCallLowering.h ---------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file describes how to lower LLVM calls to machine code calls.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSCALLLOWERING_H
+#define LLVM_LIB_TARGET_MIPS_MIPSCALLLOWERING_H
+
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+
+namespace llvm {
+
+class MipsTargetLowering;
+
+class MipsCallLowering : public CallLowering {
+
+public:
+ class MipsHandler {
+ public:
+ MipsHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI)
+ : MIRBuilder(MIRBuilder), MRI(MRI) {}
+
+ virtual ~MipsHandler() = default;
+
+ protected:
+ bool assign(const CCValAssign &VA, unsigned vreg);
+
+ MachineIRBuilder &MIRBuilder;
+ MachineRegisterInfo &MRI;
+
+ private:
+ virtual unsigned getStackAddress(uint64_t Size, int64_t Offset,
+ MachinePointerInfo &MPO) = 0;
+
+ virtual void assignValueToReg(unsigned ValVReg, unsigned PhysReg) = 0;
+
+ virtual void assignValueToAddress(unsigned ValVReg, unsigned Addr,
+ uint64_t Size,
+ MachinePointerInfo &MPO) = 0;
+ };
+
+ MipsCallLowering(const MipsTargetLowering &TLI);
+
+ bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val,
+ unsigned VReg) const override;
+
+ bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
+ ArrayRef<unsigned> VRegs) const override;
+
+ bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv,
+ const MachineOperand &Callee, const ArgInfo &OrigRet,
+ ArrayRef<ArgInfo> OrigArgs) const override;
+
+private:
+ using FunTy =
+ std::function<void(ISD::ArgFlagsTy flags, EVT vt, EVT argvt, bool used,
+ unsigned origIdx, unsigned partOffs)>;
+
+  /// Based on the registers available on the target machine, split or extend
+  /// the type if needed; also change pointer types to the appropriate integer
+  /// type. The lambda fills in info so we can tell MipsCCState to assign
+  /// physical registers.
+ void subTargetRegTypeForCallingConv(MachineIRBuilder &MIRBuilder,
+ ArrayRef<ArgInfo> Args,
+ ArrayRef<unsigned> OrigArgIndices,
+ const FunTy &PushBack) const;
+
+  /// Split structures and arrays; save the original argument indices since
+  /// the Mips calling convention needs info about the original argument type.
+ void splitToValueTypes(const ArgInfo &OrigArg, unsigned OriginalIndex,
+ SmallVectorImpl<ArgInfo> &SplitArgs,
+ SmallVectorImpl<unsigned> &SplitArgsOrigIndices) const;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_MIPS_MIPSCALLLOWERING_H
diff --git a/lib/Target/Mips/MipsCondMov.td b/lib/Target/Mips/MipsCondMov.td
index a0039d159248..39dc2654aa6a 100644
--- a/lib/Target/Mips/MipsCondMov.td
+++ b/lib/Target/Mips/MipsCondMov.td
@@ -104,163 +104,162 @@ multiclass MovnPats<RegisterClass CRC, RegisterClass DRC, Instruction MOVNInst,
}
// Instantiation of instructions.
-def MOVZ_I_I : MMRel, CMov_I_I_FT<"movz", GPR32Opnd, GPR32Opnd, II_MOVZ>,
- ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
-
-let isCodeGenOnly = 1 in {
- def MOVZ_I_I64 : CMov_I_I_FT<"movz", GPR32Opnd, GPR64Opnd, II_MOVZ>,
- ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
- def MOVZ_I64_I : CMov_I_I_FT<"movz", GPR64Opnd, GPR32Opnd, II_MOVZ>,
- ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
- def MOVZ_I64_I64 : CMov_I_I_FT<"movz", GPR64Opnd, GPR64Opnd, II_MOVZ>,
- ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
-}
+let AdditionalPredicates = [NotInMicroMips] in {
+ def MOVZ_I_I : MMRel, CMov_I_I_FT<"movz", GPR32Opnd, GPR32Opnd, II_MOVZ>,
+ ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+ let isCodeGenOnly = 1 in {
+ def MOVZ_I_I64 : CMov_I_I_FT<"movz", GPR32Opnd, GPR64Opnd, II_MOVZ>,
+ ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
+ def MOVZ_I64_I : CMov_I_I_FT<"movz", GPR64Opnd, GPR32Opnd, II_MOVZ>,
+ ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
+ def MOVZ_I64_I64 : CMov_I_I_FT<"movz", GPR64Opnd, GPR64Opnd, II_MOVZ>,
+ ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
+ }
-def MOVN_I_I : MMRel, CMov_I_I_FT<"movn", GPR32Opnd, GPR32Opnd, II_MOVN>,
- ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
+ def MOVN_I_I : MMRel, CMov_I_I_FT<"movn", GPR32Opnd, GPR32Opnd, II_MOVN>,
+ ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
-let isCodeGenOnly = 1 in {
- def MOVN_I_I64 : CMov_I_I_FT<"movn", GPR32Opnd, GPR64Opnd, II_MOVN>,
- ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
- def MOVN_I64_I : CMov_I_I_FT<"movn", GPR64Opnd, GPR32Opnd, II_MOVN>,
- ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
- def MOVN_I64_I64 : CMov_I_I_FT<"movn", GPR64Opnd, GPR64Opnd, II_MOVN>,
- ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
-}
+ let isCodeGenOnly = 1 in {
+ def MOVN_I_I64 : CMov_I_I_FT<"movn", GPR32Opnd, GPR64Opnd, II_MOVN>,
+ ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
+ def MOVN_I64_I : CMov_I_I_FT<"movn", GPR64Opnd, GPR32Opnd, II_MOVN>,
+ ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
+ def MOVN_I64_I64 : CMov_I_I_FT<"movn", GPR64Opnd, GPR64Opnd, II_MOVN>,
+ ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
+ }
+ def MOVZ_I_S : MMRel, CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32Opnd, II_MOVZ_S>,
+ CMov_I_F_FM<18, 16>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+ let isCodeGenOnly = 1 in
+ def MOVZ_I64_S : CMov_I_F_FT<"movz.s", GPR64Opnd, FGR32Opnd, II_MOVZ_S>,
+ CMov_I_F_FM<18, 16>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+
+ def MOVN_I_S : MMRel, CMov_I_F_FT<"movn.s", GPR32Opnd, FGR32Opnd, II_MOVN_S>,
+ CMov_I_F_FM<19, 16>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+ let isCodeGenOnly = 1 in
+ def MOVN_I64_S : CMov_I_F_FT<"movn.s", GPR64Opnd, FGR32Opnd, II_MOVN_S>,
+ CMov_I_F_FM<19, 16>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+
+ def MOVZ_I_D32 : MMRel, CMov_I_F_FT<"movz.d", GPR32Opnd, AFGR64Opnd,
+ II_MOVZ_D>, CMov_I_F_FM<18, 17>,
+ INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
+ def MOVN_I_D32 : MMRel, CMov_I_F_FT<"movn.d", GPR32Opnd, AFGR64Opnd,
+ II_MOVN_D>, CMov_I_F_FM<19, 17>,
+ INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
+
+ let DecoderNamespace = "MipsFP64" in {
+ def MOVZ_I_D64 : CMov_I_F_FT<"movz.d", GPR32Opnd, FGR64Opnd, II_MOVZ_D>,
+ CMov_I_F_FM<18, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+ def MOVN_I_D64 : CMov_I_F_FT<"movn.d", GPR32Opnd, FGR64Opnd, II_MOVN_D>,
+ CMov_I_F_FM<19, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+ let isCodeGenOnly = 1 in {
+ def MOVZ_I64_D64 : CMov_I_F_FT<"movz.d", GPR64Opnd, FGR64Opnd, II_MOVZ_D>,
+ CMov_I_F_FM<18, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+ def MOVN_I64_D64 : CMov_I_F_FT<"movn.d", GPR64Opnd, FGR64Opnd, II_MOVN_D>,
+ CMov_I_F_FM<19, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+ }
+ }
-def MOVZ_I_S : MMRel, CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32Opnd, II_MOVZ_S>,
- CMov_I_F_FM<18, 16>, INSN_MIPS4_32_NOT_32R6_64R6;
+ def MOVT_I : MMRel, CMov_F_I_FT<"movt", GPR32Opnd, II_MOVT, MipsCMovFP_T>,
+ CMov_F_I_FM<1>, INSN_MIPS4_32_NOT_32R6_64R6;
-let isCodeGenOnly = 1 in
-def MOVZ_I64_S : CMov_I_F_FT<"movz.s", GPR64Opnd, FGR32Opnd, II_MOVZ_S>,
- CMov_I_F_FM<18, 16>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+ let isCodeGenOnly = 1 in
+ def MOVT_I64 : CMov_F_I_FT<"movt", GPR64Opnd, II_MOVT, MipsCMovFP_T>,
+ CMov_F_I_FM<1>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
-def MOVN_I_S : MMRel, CMov_I_F_FT<"movn.s", GPR32Opnd, FGR32Opnd, II_MOVN_S>,
- CMov_I_F_FM<19, 16>, INSN_MIPS4_32_NOT_32R6_64R6;
+ def MOVF_I : MMRel, CMov_F_I_FT<"movf", GPR32Opnd, II_MOVF, MipsCMovFP_F>,
+ CMov_F_I_FM<0>, INSN_MIPS4_32_NOT_32R6_64R6;
-let isCodeGenOnly = 1 in
-def MOVN_I64_S : CMov_I_F_FT<"movn.s", GPR64Opnd, FGR32Opnd, II_MOVN_S>,
- CMov_I_F_FM<19, 16>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+ let isCodeGenOnly = 1 in
+ def MOVF_I64 : CMov_F_I_FT<"movf", GPR64Opnd, II_MOVF, MipsCMovFP_F>,
+ CMov_F_I_FM<0>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+ def MOVT_S : MMRel, CMov_F_F_FT<"movt.s", FGR32Opnd, II_MOVT_S, MipsCMovFP_T>,
+ CMov_F_F_FM<16, 1>, INSN_MIPS4_32_NOT_32R6_64R6;
+ def MOVF_S : MMRel, CMov_F_F_FT<"movf.s", FGR32Opnd, II_MOVF_S, MipsCMovFP_F>,
+ CMov_F_F_FM<16, 0>, INSN_MIPS4_32_NOT_32R6_64R6;
-def MOVZ_I_D32 : MMRel, CMov_I_F_FT<"movz.d", GPR32Opnd, AFGR64Opnd,
- II_MOVZ_D>, CMov_I_F_FM<18, 17>,
+ def MOVT_D32 : MMRel, CMov_F_F_FT<"movt.d", AFGR64Opnd, II_MOVT_D,
+ MipsCMovFP_T>, CMov_F_F_FM<17, 1>,
INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
-def MOVN_I_D32 : MMRel, CMov_I_F_FT<"movn.d", GPR32Opnd, AFGR64Opnd,
- II_MOVN_D>, CMov_I_F_FM<19, 17>,
+ def MOVF_D32 : MMRel, CMov_F_F_FT<"movf.d", AFGR64Opnd, II_MOVF_D,
+ MipsCMovFP_F>, CMov_F_F_FM<17, 0>,
INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
-let DecoderNamespace = "MipsFP64" in {
- def MOVZ_I_D64 : CMov_I_F_FT<"movz.d", GPR32Opnd, FGR64Opnd, II_MOVZ_D>,
- CMov_I_F_FM<18, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
- def MOVN_I_D64 : CMov_I_F_FT<"movn.d", GPR32Opnd, FGR64Opnd, II_MOVN_D>,
- CMov_I_F_FM<19, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
- let isCodeGenOnly = 1 in {
- def MOVZ_I64_D64 : CMov_I_F_FT<"movz.d", GPR64Opnd, FGR64Opnd, II_MOVZ_D>,
- CMov_I_F_FM<18, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
- def MOVN_I64_D64 : CMov_I_F_FT<"movn.d", GPR64Opnd, FGR64Opnd, II_MOVN_D>,
- CMov_I_F_FM<19, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+ let DecoderNamespace = "MipsFP64" in {
+ def MOVT_D64 : CMov_F_F_FT<"movt.d", FGR64Opnd, II_MOVT_D, MipsCMovFP_T>,
+ CMov_F_F_FM<17, 1>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+ def MOVF_D64 : CMov_F_F_FT<"movf.d", FGR64Opnd, II_MOVF_D, MipsCMovFP_F>,
+ CMov_F_F_FM<17, 0>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
}
-}
-def MOVT_I : MMRel, CMov_F_I_FT<"movt", GPR32Opnd, II_MOVT, MipsCMovFP_T>,
- CMov_F_I_FM<1>, INSN_MIPS4_32_NOT_32R6_64R6;
-
-let isCodeGenOnly = 1 in
-def MOVT_I64 : CMov_F_I_FT<"movt", GPR64Opnd, II_MOVT, MipsCMovFP_T>,
- CMov_F_I_FM<1>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
-
-def MOVF_I : MMRel, CMov_F_I_FT<"movf", GPR32Opnd, II_MOVF, MipsCMovFP_F>,
- CMov_F_I_FM<0>, INSN_MIPS4_32_NOT_32R6_64R6;
-
-let isCodeGenOnly = 1 in
-def MOVF_I64 : CMov_F_I_FT<"movf", GPR64Opnd, II_MOVF, MipsCMovFP_F>,
- CMov_F_I_FM<0>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
-
-def MOVT_S : MMRel, CMov_F_F_FT<"movt.s", FGR32Opnd, II_MOVT_S, MipsCMovFP_T>,
- CMov_F_F_FM<16, 1>, INSN_MIPS4_32_NOT_32R6_64R6;
-def MOVF_S : MMRel, CMov_F_F_FT<"movf.s", FGR32Opnd, II_MOVF_S, MipsCMovFP_F>,
- CMov_F_F_FM<16, 0>, INSN_MIPS4_32_NOT_32R6_64R6;
-
-def MOVT_D32 : MMRel, CMov_F_F_FT<"movt.d", AFGR64Opnd, II_MOVT_D,
- MipsCMovFP_T>, CMov_F_F_FM<17, 1>,
- INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
-def MOVF_D32 : MMRel, CMov_F_F_FT<"movf.d", AFGR64Opnd, II_MOVF_D,
- MipsCMovFP_F>, CMov_F_F_FM<17, 0>,
- INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
-
-let DecoderNamespace = "MipsFP64" in {
- def MOVT_D64 : CMov_F_F_FT<"movt.d", FGR64Opnd, II_MOVT_D, MipsCMovFP_T>,
- CMov_F_F_FM<17, 1>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
- def MOVF_D64 : CMov_F_F_FT<"movf.d", FGR64Opnd, II_MOVF_D, MipsCMovFP_F>,
- CMov_F_F_FM<17, 0>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+ // Instantiation of conditional move patterns.
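+ // As a rough illustration of what these patterns produce, a node like
+ // (select (seteq $c, 0), $a, $b) is selected into something like:
+ //   move $d, $b              # start with the "false" value
+ //   movz $d, $a, $c          # overwrite with $a when $c == 0
+ // and the MovnPats below use movn for the inverted conditions.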
+ defm : MovzPats0<GPR32, GPR32, MOVZ_I_I, SLT, SLTu, SLTi, SLTiu>,
+ INSN_MIPS4_32_NOT_32R6_64R6;
+ defm : MovzPats1<GPR32, GPR32, MOVZ_I_I, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
+ defm : MovzPats2<GPR32, GPR32, MOVZ_I_I, XORi>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+ defm : MovzPats0<GPR32, GPR64, MOVZ_I_I64, SLT, SLTu, SLTi, SLTiu>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+ defm : MovzPats0<GPR64, GPR32, MOVZ_I_I, SLT64, SLTu64, SLTi64, SLTiu64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+ defm : MovzPats0<GPR64, GPR64, MOVZ_I_I64, SLT64, SLTu64, SLTi64, SLTiu64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+ defm : MovzPats1<GPR32, GPR64, MOVZ_I_I64, XOR>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+ defm : MovzPats1<GPR64, GPR32, MOVZ_I64_I, XOR64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+ defm : MovzPats1<GPR64, GPR64, MOVZ_I64_I64, XOR64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+ defm : MovzPats2<GPR32, GPR64, MOVZ_I_I64, XORi>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+ defm : MovzPats2<GPR64, GPR32, MOVZ_I64_I, XORi64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+ defm : MovzPats2<GPR64, GPR64, MOVZ_I64_I64, XORi64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+
+ defm : MovnPats<GPR32, GPR32, MOVN_I_I, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+ defm : MovnPats<GPR32, GPR64, MOVN_I_I64, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
+ GPR_64;
+ defm : MovnPats<GPR64, GPR32, MOVN_I64_I, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
+ GPR_64;
+ defm : MovnPats<GPR64, GPR64, MOVN_I64_I64, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
+ GPR_64;
+
+ defm : MovzPats0<GPR32, FGR32, MOVZ_I_S, SLT, SLTu, SLTi, SLTiu>,
+ INSN_MIPS4_32_NOT_32R6_64R6;
+ defm : MovzPats1<GPR32, FGR32, MOVZ_I_S, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
+ defm : MovnPats<GPR32, FGR32, MOVN_I_S, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+ defm : MovzPats0<GPR64, FGR32, MOVZ_I_S, SLT64, SLTu64, SLTi64, SLTiu64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+ defm : MovzPats1<GPR64, FGR32, MOVZ_I64_S, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
+ GPR_64;
+ defm : MovnPats<GPR64, FGR32, MOVN_I64_S, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
+ GPR_64;
+
+ defm : MovzPats0<GPR32, AFGR64, MOVZ_I_D32, SLT, SLTu, SLTi, SLTiu>,
+ INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
+ defm : MovzPats1<GPR32, AFGR64, MOVZ_I_D32, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
+ FGR_32;
+ defm : MovnPats<GPR32, AFGR64, MOVN_I_D32, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
+ FGR_32;
+
+ defm : MovzPats0<GPR32, FGR64, MOVZ_I_D64, SLT, SLTu, SLTi, SLTiu>,
+ INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+ defm : MovzPats0<GPR64, FGR64, MOVZ_I_D64, SLT64, SLTu64, SLTi64, SLTiu64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+ defm : MovzPats1<GPR32, FGR64, MOVZ_I_D64, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
+ FGR_64;
+ defm : MovzPats1<GPR64, FGR64, MOVZ_I64_D64, XOR64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+ defm : MovnPats<GPR32, FGR64, MOVN_I_D64, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
+ FGR_64;
+ defm : MovnPats<GPR64, FGR64, MOVN_I64_D64, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
+ FGR_64;
}
-
-// Instantiation of conditional move patterns.
-defm : MovzPats0<GPR32, GPR32, MOVZ_I_I, SLT, SLTu, SLTi, SLTiu>,
- INSN_MIPS4_32_NOT_32R6_64R6;
-defm : MovzPats1<GPR32, GPR32, MOVZ_I_I, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
-defm : MovzPats2<GPR32, GPR32, MOVZ_I_I, XORi>, INSN_MIPS4_32_NOT_32R6_64R6;
-
-defm : MovzPats0<GPR32, GPR64, MOVZ_I_I64, SLT, SLTu, SLTi, SLTiu>,
- INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
-defm : MovzPats0<GPR64, GPR32, MOVZ_I_I, SLT64, SLTu64, SLTi64, SLTiu64>,
- INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
-defm : MovzPats0<GPR64, GPR64, MOVZ_I_I64, SLT64, SLTu64, SLTi64, SLTiu64>,
- INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
-defm : MovzPats1<GPR32, GPR64, MOVZ_I_I64, XOR>,
- INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
-defm : MovzPats1<GPR64, GPR32, MOVZ_I64_I, XOR64>,
- INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
-defm : MovzPats1<GPR64, GPR64, MOVZ_I64_I64, XOR64>,
- INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
-defm : MovzPats2<GPR32, GPR64, MOVZ_I_I64, XORi>,
- INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
-defm : MovzPats2<GPR64, GPR32, MOVZ_I64_I, XORi64>,
- INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
-defm : MovzPats2<GPR64, GPR64, MOVZ_I64_I64, XORi64>,
- INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
-
-defm : MovnPats<GPR32, GPR32, MOVN_I_I, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
-
-defm : MovnPats<GPR32, GPR64, MOVN_I_I64, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
- GPR_64;
-defm : MovnPats<GPR64, GPR32, MOVN_I64_I, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
- GPR_64;
-defm : MovnPats<GPR64, GPR64, MOVN_I64_I64, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
- GPR_64;
-
-defm : MovzPats0<GPR32, FGR32, MOVZ_I_S, SLT, SLTu, SLTi, SLTiu>,
- INSN_MIPS4_32_NOT_32R6_64R6;
-defm : MovzPats1<GPR32, FGR32, MOVZ_I_S, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
-defm : MovnPats<GPR32, FGR32, MOVN_I_S, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
-
-defm : MovzPats0<GPR64, FGR32, MOVZ_I_S, SLT64, SLTu64, SLTi64, SLTiu64>,
- INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
-defm : MovzPats1<GPR64, FGR32, MOVZ_I64_S, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
- GPR_64;
-defm : MovnPats<GPR64, FGR32, MOVN_I64_S, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
- GPR_64;
-
-defm : MovzPats0<GPR32, AFGR64, MOVZ_I_D32, SLT, SLTu, SLTi, SLTiu>,
- INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
-defm : MovzPats1<GPR32, AFGR64, MOVZ_I_D32, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
- FGR_32;
-defm : MovnPats<GPR32, AFGR64, MOVN_I_D32, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
- FGR_32;
-
-defm : MovzPats0<GPR32, FGR64, MOVZ_I_D64, SLT, SLTu, SLTi, SLTiu>,
- INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
-defm : MovzPats0<GPR64, FGR64, MOVZ_I_D64, SLT64, SLTu64, SLTi64, SLTiu64>,
- INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
-defm : MovzPats1<GPR32, FGR64, MOVZ_I_D64, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
- FGR_64;
-defm : MovzPats1<GPR64, FGR64, MOVZ_I64_D64, XOR64>,
- INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
-defm : MovnPats<GPR32, FGR64, MOVN_I_D64, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
- FGR_64;
-defm : MovnPats<GPR64, FGR64, MOVN_I64_D64, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
- FGR_64;
-
// For targets that don't have conditional-move instructions
// we have to match SELECT nodes with pseudo instructions.
let usesCustomInserter = 1 in {
diff --git a/lib/Target/Mips/MipsConstantIslandPass.cpp b/lib/Target/Mips/MipsConstantIslandPass.cpp
index a9abc171b423..9eb13a68e561 100644
--- a/lib/Target/Mips/MipsConstantIslandPass.cpp
+++ b/lib/Target/Mips/MipsConstantIslandPass.cpp
@@ -37,6 +37,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
@@ -442,13 +443,15 @@ bool MipsConstantIslands::runOnMachineFunction(MachineFunction &mf) {
MF = &mf;
MCP = mf.getConstantPool();
STI = &static_cast<const MipsSubtarget &>(mf.getSubtarget());
- DEBUG(dbgs() << "constant island machine function " << "\n");
+ LLVM_DEBUG(dbgs() << "constant island machine function "
+ << "\n");
if (!STI->inMips16Mode() || !MipsSubtarget::useConstantIslands()) {
return false;
}
TII = (const Mips16InstrInfo *)STI->getInstrInfo();
MFI = MF->getInfo<MipsFunctionInfo>();
- DEBUG(dbgs() << "constant island processing " << "\n");
+ LLVM_DEBUG(dbgs() << "constant island processing "
+ << "\n");
//
// will need to make a predetermination if there are any constants we need to
// put in constant islands. TBD.
@@ -479,7 +482,7 @@ bool MipsConstantIslands::runOnMachineFunction(MachineFunction &mf) {
// constant pool users.
initializeFunctionInfo(CPEMIs);
CPEMIs.clear();
- DEBUG(dumpBBs());
+ LLVM_DEBUG(dumpBBs());
/// Remove dead constant pool entries.
MadeChange |= removeUnusedCPEntries();
@@ -489,31 +492,31 @@ bool MipsConstantIslands::runOnMachineFunction(MachineFunction &mf) {
unsigned NoCPIters = 0, NoBRIters = 0;
(void)NoBRIters;
while (true) {
- DEBUG(dbgs() << "Beginning CP iteration #" << NoCPIters << '\n');
+ LLVM_DEBUG(dbgs() << "Beginning CP iteration #" << NoCPIters << '\n');
bool CPChange = false;
for (unsigned i = 0, e = CPUsers.size(); i != e; ++i)
CPChange |= handleConstantPoolUser(i);
if (CPChange && ++NoCPIters > 30)
report_fatal_error("Constant Island pass failed to converge!");
- DEBUG(dumpBBs());
+ LLVM_DEBUG(dumpBBs());
// Clear NewWaterList now. If we split a block for branches, it should
// appear as "new water" for the next iteration of constant pool placement.
NewWaterList.clear();
- DEBUG(dbgs() << "Beginning BR iteration #" << NoBRIters << '\n');
+ LLVM_DEBUG(dbgs() << "Beginning BR iteration #" << NoBRIters << '\n');
bool BRChange = false;
for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i)
BRChange |= fixupImmediateBr(ImmBranches[i]);
if (BRChange && ++NoBRIters > 30)
report_fatal_error("Branch Fix Up pass failed to converge!");
- DEBUG(dumpBBs());
+ LLVM_DEBUG(dumpBBs());
if (!CPChange && !BRChange)
break;
MadeChange = true;
}
- DEBUG(dbgs() << '\n'; dumpBBs());
+ LLVM_DEBUG(dbgs() << '\n'; dumpBBs());
BBInfo.clear();
WaterList.clear();
@@ -580,10 +583,10 @@ MipsConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) {
// Add a new CPEntry, but no corresponding CPUser yet.
CPEntries.emplace_back(1, CPEntry(CPEMI, i));
++NumCPEs;
- DEBUG(dbgs() << "Moved CPI#" << i << " to end of function, size = "
- << Size << ", align = " << Align <<'\n');
+ LLVM_DEBUG(dbgs() << "Moved CPI#" << i << " to end of function, size = "
+ << Size << ", align = " << Align << '\n');
}
- DEBUG(BB->dump());
+ LLVM_DEBUG(BB->dump());
}
/// BBHasFallthrough - Return true if the specified basic block can fallthrough
@@ -660,7 +663,7 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
if (!BBHasFallthrough(&MBB))
WaterList.push_back(&MBB);
for (MachineInstr &MI : MBB) {
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
int Opc = MI.getOpcode();
@@ -986,7 +989,7 @@ bool MipsConstantIslands::isCPEntryInRange
unsigned CPEOffset = getOffsetOf(CPEMI);
if (DoDump) {
- DEBUG({
+ LLVM_DEBUG({
unsigned Block = MI->getParent()->getNumber();
const BasicBlockInfo &BBI = BBInfo[Block];
dbgs() << "User of CPE#" << CPEMI->getOperand(0).getImm()
@@ -1059,7 +1062,7 @@ int MipsConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset)
// Check to see if the CPE is already in-range.
if (isCPEntryInRange(UserMI, UserOffset, CPEMI, U.getMaxDisp(), U.NegOk,
true)) {
- DEBUG(dbgs() << "In range\n");
+ LLVM_DEBUG(dbgs() << "In range\n");
return 1;
}
@@ -1075,8 +1078,8 @@ int MipsConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset)
continue;
if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.getMaxDisp(),
U.NegOk)) {
- DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#"
- << CPEs[i].CPI << "\n");
+ LLVM_DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#"
+ << CPEs[i].CPI << "\n");
// Point the CPUser node to the replacement
U.CPEMI = CPEs[i].CPEMI;
// Change the CPI in the instruction operand to refer to the clone.
@@ -1113,7 +1116,7 @@ int MipsConstantIslands::findLongFormInRangeCPEntry
if (isCPEntryInRange(UserMI, UserOffset, CPEMI,
U.getLongFormMaxDisp(), U.NegOk,
true)) {
- DEBUG(dbgs() << "In range\n");
+ LLVM_DEBUG(dbgs() << "In range\n");
UserMI->setDesc(TII->get(U.getLongFormOpcode()));
U.setMaxDisp(U.getLongFormMaxDisp());
return 2; // instruction is longer length now
@@ -1131,8 +1134,8 @@ int MipsConstantIslands::findLongFormInRangeCPEntry
continue;
if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI,
U.getLongFormMaxDisp(), U.NegOk)) {
- DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#"
- << CPEs[i].CPI << "\n");
+ LLVM_DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#"
+ << CPEs[i].CPI << "\n");
// Point the CPUser node to the replacement
U.CPEMI = CPEs[i].CPEMI;
// Change the CPI in the instruction operand to refer to the clone.
@@ -1197,8 +1200,8 @@ bool MipsConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset,
// This is the least amount of required padding seen so far.
BestGrowth = Growth;
WaterIter = IP;
- DEBUG(dbgs() << "Found water after " << printMBBReference(*WaterBB)
- << " Growth=" << Growth << '\n');
+ LLVM_DEBUG(dbgs() << "Found water after " << printMBBReference(*WaterBB)
+ << " Growth=" << Growth << '\n');
// Keep looking unless it is perfect.
if (BestGrowth == 0)
@@ -1236,8 +1239,8 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex,
unsigned CPEOffset = UserBBI.postOffset(CPELogAlign) + Delta;
if (isOffsetInRange(UserOffset, CPEOffset, U)) {
- DEBUG(dbgs() << "Split at end of " << printMBBReference(*UserMBB)
- << format(", expected CPE offset %#x\n", CPEOffset));
+ LLVM_DEBUG(dbgs() << "Split at end of " << printMBBReference(*UserMBB)
+ << format(", expected CPE offset %#x\n", CPEOffset));
NewMBB = &*++UserMBB->getIterator();
// Add an unconditional branch from UserMBB to fallthrough block. Record
// it for branch lengthening; this new branch will not get out of range,
@@ -1263,16 +1266,16 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex,
unsigned LogAlign = MF->getAlignment();
assert(LogAlign >= CPELogAlign && "Over-aligned constant pool entry");
unsigned BaseInsertOffset = UserOffset + U.getMaxDisp();
- DEBUG(dbgs() << format("Split in middle of big block before %#x",
- BaseInsertOffset));
+ LLVM_DEBUG(dbgs() << format("Split in middle of big block before %#x",
+ BaseInsertOffset));
// The 4 in the following is for the unconditional branch we'll be inserting.
// Alignment of the island is handled
// inside isOffsetInRange.
BaseInsertOffset -= 4;
- DEBUG(dbgs() << format(", adjusted to %#x", BaseInsertOffset)
- << " la=" << LogAlign << '\n');
+ LLVM_DEBUG(dbgs() << format(", adjusted to %#x", BaseInsertOffset)
+ << " la=" << LogAlign << '\n');
// This could point off the end of the block if we've already got constant
// pool entries following this block; only the last one is in the water list.
@@ -1280,7 +1283,7 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex,
// long unconditional).
if (BaseInsertOffset + 8 >= UserBBI.postOffset()) {
BaseInsertOffset = UserBBI.postOffset() - 8;
- DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset));
+ LLVM_DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset));
}
unsigned EndInsertOffset = BaseInsertOffset + 4 +
CPEMI->getOperand(2).getImm();
@@ -1336,7 +1339,7 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
MachineBasicBlock *NewMBB;
water_iterator IP;
if (findAvailableWater(U, UserOffset, IP)) {
- DEBUG(dbgs() << "Found water in range\n");
+ LLVM_DEBUG(dbgs() << "Found water in range\n");
MachineBasicBlock *WaterBB = *IP;
// If the original WaterList entry was "new water" on this iteration,
@@ -1355,7 +1358,7 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
result = findLongFormInRangeCPEntry(U, UserOffset);
if (result != 0) return true;
}
- DEBUG(dbgs() << "No water found\n");
+ LLVM_DEBUG(dbgs() << "No water found\n");
createNewWater(CPUserIndex, UserOffset, NewMBB);
// splitBlockBeforeInstr adds to WaterList, which is important when it is
@@ -1414,8 +1417,9 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
break;
}
- DEBUG(dbgs() << " Moved CPE to #" << ID << " CPI=" << CPI
- << format(" offset=%#x\n", BBInfo[NewIsland->getNumber()].Offset));
+ LLVM_DEBUG(
+ dbgs() << " Moved CPE to #" << ID << " CPI=" << CPI
+ << format(" offset=%#x\n", BBInfo[NewIsland->getNumber()].Offset));
return true;
}
@@ -1470,11 +1474,11 @@ bool MipsConstantIslands::isBBInRange
unsigned BrOffset = getOffsetOf(MI) + PCAdj;
unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset;
- DEBUG(dbgs() << "Branch of destination " << printMBBReference(*DestBB)
- << " from " << printMBBReference(*MI->getParent())
- << " max delta=" << MaxDisp << " from " << getOffsetOf(MI)
- << " to " << DestOffset << " offset "
- << int(DestOffset - BrOffset) << "\t" << *MI);
+ LLVM_DEBUG(dbgs() << "Branch of destination " << printMBBReference(*DestBB)
+ << " from " << printMBBReference(*MI->getParent())
+ << " max delta=" << MaxDisp << " from " << getOffsetOf(MI)
+ << " to " << DestOffset << " offset "
+ << int(DestOffset - BrOffset) << "\t" << *MI);
if (BrOffset <= DestOffset) {
// Branch before the Dest.
@@ -1539,7 +1543,7 @@ MipsConstantIslands::fixupUnconditionalBr(ImmBranch &Br) {
HasFarJump = true;
++NumUBrFixed;
- DEBUG(dbgs() << " Changed B to long jump " << *MI);
+ LLVM_DEBUG(dbgs() << " Changed B to long jump " << *MI);
return true;
}
@@ -1594,8 +1598,9 @@ MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) {
MachineBasicBlock *NewDest =
BMI->getOperand(BMITargetOperand).getMBB();
if (isBBInRange(MI, NewDest, Br.MaxDisp)) {
- DEBUG(dbgs() << " Invert Bcc condition and swap its destination with "
- << *BMI);
+ LLVM_DEBUG(
+ dbgs() << " Invert Bcc condition and swap its destination with "
+ << *BMI);
MI->setDesc(TII->get(OppositeBranchOpcode));
BMI->getOperand(BMITargetOperand).setMBB(DestBB);
MI->getOperand(TargetOperand).setMBB(NewDest);
@@ -1615,9 +1620,9 @@ MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) {
}
MachineBasicBlock *NextBB = &*++MBB->getIterator();
- DEBUG(dbgs() << " Insert B to " << printMBBReference(*DestBB)
- << " also invert condition and change dest. to "
- << printMBBReference(*NextBB) << "\n");
+ LLVM_DEBUG(dbgs() << " Insert B to " << printMBBReference(*DestBB)
+ << " also invert condition and change dest. to "
+ << printMBBReference(*NextBB) << "\n");
// Insert a new conditional branch and a new unconditional branch.
// Also update the ImmBranch as well as adding a new entry for the new branch.
@@ -1653,19 +1658,19 @@ void MipsConstantIslands::prescanForConstants() {
switch(I->getDesc().getOpcode()) {
case Mips::LwConstant32: {
PrescannedForConstants = true;
- DEBUG(dbgs() << "constant island constant " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "constant island constant " << *I << "\n");
J = I->getNumOperands();
- DEBUG(dbgs() << "num operands " << J << "\n");
+ LLVM_DEBUG(dbgs() << "num operands " << J << "\n");
MachineOperand& Literal = I->getOperand(1);
if (Literal.isImm()) {
int64_t V = Literal.getImm();
- DEBUG(dbgs() << "literal " << V << "\n");
+ LLVM_DEBUG(dbgs() << "literal " << V << "\n");
Type *Int32Ty =
Type::getInt32Ty(MF->getFunction().getContext());
const Constant *C = ConstantInt::get(Int32Ty, V);
unsigned index = MCP->getConstantPoolIndex(C, 4);
I->getOperand(2).ChangeToImmediate(index);
- DEBUG(dbgs() << "constant island constant " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "constant island constant " << *I << "\n");
I->setDesc(TII->get(Mips::LwRxPcTcp16));
I->RemoveOperand(1);
I->RemoveOperand(1);
diff --git a/lib/Target/Mips/MipsDSPInstrFormats.td b/lib/Target/Mips/MipsDSPInstrFormats.td
index 0ceb1858fb09..5f0763f5ea46 100644
--- a/lib/Target/Mips/MipsDSPInstrFormats.td
+++ b/lib/Target/Mips/MipsDSPInstrFormats.td
@@ -29,11 +29,11 @@ def HasDSPR3 : Predicate<"Subtarget->hasDSPR3()">,
AssemblerPredicate<"FeatureDSPR3">;
class ISA_DSPR2 {
- list<Predicate> InsnPredicates = [HasDSPR2];
+ list<Predicate> ASEPredicate = [HasDSPR2];
}
class ISA_DSPR3 {
- list<Predicate> InsnPredicates = [HasDSPR3];
+ list<Predicate> ASEPredicate = [HasDSPR3];
}
// Fields.
@@ -45,21 +45,21 @@ def SPECIAL3_OPCODE : Field6<0b011111>;
def REGIMM_OPCODE : Field6<0b000001>;
class DSPInst<string opstr = "">
- : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>, PredicateControl {
- let InsnPredicates = [HasDSP];
+ : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther> {
+ let ASEPredicate = [HasDSP];
string BaseOpcode = opstr;
string Arch = "dsp";
}
class PseudoDSP<dag outs, dag ins, list<dag> pattern,
InstrItinClass itin = IIPseudo>
- : MipsPseudo<outs, ins, pattern, itin>, PredicateControl {
- let InsnPredicates = [HasDSP];
+ : MipsPseudo<outs, ins, pattern, itin> {
+ let ASEPredicate = [HasDSP];
}
class DSPInstAlias<string Asm, dag Result, bit Emit = 0b1>
: InstAlias<Asm, Result, Emit>, PredicateControl {
- let InsnPredicates = [HasDSP];
+ let ASEPredicate = [HasDSP];
}
// ADDU.QB sub-class format.
diff --git a/lib/Target/Mips/MipsDSPInstrInfo.td b/lib/Target/Mips/MipsDSPInstrInfo.td
index 871135e3a22b..b9824220b558 100644
--- a/lib/Target/Mips/MipsDSPInstrInfo.td
+++ b/lib/Target/Mips/MipsDSPInstrInfo.td
@@ -447,6 +447,7 @@ class RDDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode immZExt10:$mask))];
InstrItinClass Itinerary = itin;
string BaseOpcode = instr_asm;
+ bit isMoveReg = 1;
}
class WRDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -457,6 +458,7 @@ class WRDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
list<dag> Pattern = [(OpNode GPR32Opnd:$rs, immZExt10:$mask)];
InstrItinClass Itinerary = itin;
string BaseOpcode = instr_asm;
+ bit isMoveReg = 1;
}
class DPA_W_PH_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
@@ -500,6 +502,7 @@ class MFHI_DESC_BASE<string instr_asm, RegisterOperand RO, SDNode OpNode,
list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode RO:$ac))];
InstrItinClass Itinerary = itin;
string BaseOpcode = instr_asm;
+ bit isMoveReg = 1;
}
class MTHI_DESC_BASE<string instr_asm, RegisterOperand RO, InstrItinClass itin> {
@@ -508,6 +511,7 @@ class MTHI_DESC_BASE<string instr_asm, RegisterOperand RO, InstrItinClass itin>
string AsmString = !strconcat(instr_asm, "\t$rs, $ac");
InstrItinClass Itinerary = itin;
string BaseOpcode = instr_asm;
+ bit isMoveReg = 1;
}
class BPOSGE32_PSEUDO_DESC_BASE<SDPatternOperator OpNode, InstrItinClass itin> :
@@ -1285,7 +1289,7 @@ let isPseudo = 1, isCodeGenOnly = 1, hasNoSchedulingInfo = 1 in {
}
let DecoderNamespace = "MipsDSP", Arch = "dsp",
- AdditionalPredicates = [HasDSP] in {
+ ASEPredicate = [HasDSP] in {
def LWDSP : Load<"lw", DSPROpnd, null_frag, II_LW>, DspMMRel, LW_FM<0x23>;
def SWDSP : Store<"sw", DSPROpnd, null_frag, II_SW>, DspMMRel, LW_FM<0x2b>;
}
diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp
index e06b57e41834..33f03b954a8c 100644
--- a/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -51,7 +51,7 @@
using namespace llvm;
-#define DEBUG_TYPE "delay-slot-filler"
+#define DEBUG_TYPE "mips-delay-slot-filler"
STATISTIC(FilledSlots, "Number of delay slots filled");
STATISTIC(UsefulSlots, "Number of delay slots filled with instructions that"
@@ -210,9 +210,11 @@ namespace {
bool SeenNoObjStore = false;
};
- class Filler : public MachineFunctionPass {
+ class MipsDelaySlotFiller : public MachineFunctionPass {
public:
- Filler() : MachineFunctionPass(ID) {}
+ MipsDelaySlotFiller() : MachineFunctionPass(ID) {
+ initializeMipsDelaySlotFillerPass(*PassRegistry::getPassRegistry());
+ }
StringRef getPassName() const override { return "Mips Delay Slot Filler"; }
@@ -242,6 +244,8 @@ namespace {
MachineFunctionPass::getAnalysisUsage(AU);
}
+ static char ID;
+
private:
bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
@@ -292,18 +296,19 @@ namespace {
bool terminateSearch(const MachineInstr &Candidate) const;
const TargetMachine *TM = nullptr;
-
- static char ID;
};
} // end anonymous namespace
-char Filler::ID = 0;
+char MipsDelaySlotFiller::ID = 0;
static bool hasUnoccupiedSlot(const MachineInstr *MI) {
return MI->hasDelaySlot() && !MI->isBundledWithSucc();
}
+INITIALIZE_PASS(MipsDelaySlotFiller, DEBUG_TYPE,
+ "Fill delay slot for MIPS", false, false)
+
/// This function inserts clones of Filler into predecessor blocks.
static void insertDelayFiller(Iter Filler, const BB2BrMap &BrMap) {
MachineFunction *MF = Filler->getParent()->getParent();
@@ -551,8 +556,9 @@ getUnderlyingObjects(const MachineInstr &MI,
}
// Replace Branch with the compact branch instruction.
-Iter Filler::replaceWithCompactBranch(MachineBasicBlock &MBB, Iter Branch,
- const DebugLoc &DL) {
+Iter MipsDelaySlotFiller::replaceWithCompactBranch(MachineBasicBlock &MBB,
+ Iter Branch,
+ const DebugLoc &DL) {
const MipsSubtarget &STI = MBB.getParent()->getSubtarget<MipsSubtarget>();
const MipsInstrInfo *TII = STI.getInstrInfo();
@@ -575,6 +581,7 @@ static int getEquivalentCallShort(int Opcode) {
case Mips::BLTZAL:
return Mips::BLTZALS_MM;
case Mips::JAL:
+ case Mips::JAL_MM:
return Mips::JALS_MM;
case Mips::JALR:
return Mips::JALRS_MM;
@@ -591,7 +598,7 @@ static int getEquivalentCallShort(int Opcode) {
/// runOnMachineBasicBlock - Fill in delay slots for the given basic block.
/// We assume there is only one delay slot per delayed instruction.
-bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
+bool MipsDelaySlotFiller::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
bool Changed = false;
const MipsSubtarget &STI = MBB.getParent()->getSubtarget<MipsSubtarget>();
bool InMicroMipsMode = STI.inMicroMipsMode();
@@ -632,7 +639,7 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
// TODO: Implement an instruction mapping table of 16bit opcodes to
// 32bit opcodes so that an instruction can be expanded. This would
// save 16 bits as a TAILCALL_MM pseudo requires a fullsized nop.
- // TODO: Permit b16 when branching backwards to the the same function
+ // TODO: Permit b16 when branching backwards to the same function
// if it is in range.
DSI->setDesc(TII->get(getEquivalentCallShort(DSI->getOpcode())));
}
@@ -669,16 +676,17 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
return Changed;
}
-template<typename IterTy>
-bool Filler::searchRange(MachineBasicBlock &MBB, IterTy Begin, IterTy End,
- RegDefsUses &RegDU, InspectMemInstr& IM, Iter Slot,
- IterTy &Filler) const {
+template <typename IterTy>
+bool MipsDelaySlotFiller::searchRange(MachineBasicBlock &MBB, IterTy Begin,
+ IterTy End, RegDefsUses &RegDU,
+ InspectMemInstr &IM, Iter Slot,
+ IterTy &Filler) const {
for (IterTy I = Begin; I != End;) {
IterTy CurrI = I;
++I;
// skip debug value
- if (CurrI->isDebugValue())
+ if (CurrI->isDebugInstr())
continue;
if (terminateSearch(*CurrI))
@@ -720,6 +728,10 @@ bool Filler::searchRange(MachineBasicBlock &MBB, IterTy Begin, IterTy End,
(Opcode == Mips::JR || Opcode == Mips::PseudoIndirectBranch ||
Opcode == Mips::PseudoReturn || Opcode == Mips::TAILCALL))
continue;
+ // Instructions LWP/SWP should not be in a delay slot as that
+ // results in unpredictable behaviour
+ if (InMicroMipsMode && (Opcode == Mips::LWP_MM || Opcode == Mips::SWP_MM))
+ continue;
Filler = CurrI;
return true;
@@ -728,7 +740,8 @@ bool Filler::searchRange(MachineBasicBlock &MBB, IterTy Begin, IterTy End,
return false;
}
-bool Filler::searchBackward(MachineBasicBlock &MBB, MachineInstr &Slot) const {
+bool MipsDelaySlotFiller::searchBackward(MachineBasicBlock &MBB,
+ MachineInstr &Slot) const {
if (DisableBackwardSearch)
return false;
@@ -750,7 +763,8 @@ bool Filler::searchBackward(MachineBasicBlock &MBB, MachineInstr &Slot) const {
return true;
}
-bool Filler::searchForward(MachineBasicBlock &MBB, Iter Slot) const {
+bool MipsDelaySlotFiller::searchForward(MachineBasicBlock &MBB,
+ Iter Slot) const {
// Can handle only calls.
if (DisableForwardSearch || !Slot->isCall())
return false;
@@ -770,7 +784,8 @@ bool Filler::searchForward(MachineBasicBlock &MBB, Iter Slot) const {
return true;
}
-bool Filler::searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const {
+bool MipsDelaySlotFiller::searchSuccBBs(MachineBasicBlock &MBB,
+ Iter Slot) const {
if (DisableSuccBBSearch)
return false;
@@ -816,7 +831,8 @@ bool Filler::searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const {
return true;
}
-MachineBasicBlock *Filler::selectSuccBB(MachineBasicBlock &B) const {
+MachineBasicBlock *
+MipsDelaySlotFiller::selectSuccBB(MachineBasicBlock &B) const {
if (B.succ_empty())
return nullptr;
@@ -832,7 +848,8 @@ MachineBasicBlock *Filler::selectSuccBB(MachineBasicBlock &B) const {
}
std::pair<MipsInstrInfo::BranchType, MachineInstr *>
-Filler::getBranch(MachineBasicBlock &MBB, const MachineBasicBlock &Dst) const {
+MipsDelaySlotFiller::getBranch(MachineBasicBlock &MBB,
+ const MachineBasicBlock &Dst) const {
const MipsInstrInfo *TII =
MBB.getParent()->getSubtarget<MipsSubtarget>().getInstrInfo();
MachineBasicBlock *TrueBB = nullptr, *FalseBB = nullptr;
@@ -867,11 +884,13 @@ Filler::getBranch(MachineBasicBlock &MBB, const MachineBasicBlock &Dst) const {
return std::make_pair(MipsInstrInfo::BT_None, nullptr);
}
-bool Filler::examinePred(MachineBasicBlock &Pred, const MachineBasicBlock &Succ,
- RegDefsUses &RegDU, bool &HasMultipleSuccs,
- BB2BrMap &BrMap) const {
+bool MipsDelaySlotFiller::examinePred(MachineBasicBlock &Pred,
+ const MachineBasicBlock &Succ,
+ RegDefsUses &RegDU,
+ bool &HasMultipleSuccs,
+ BB2BrMap &BrMap) const {
std::pair<MipsInstrInfo::BranchType, MachineInstr *> P =
- getBranch(Pred, Succ);
+ getBranch(Pred, Succ);
// Return if either getBranch wasn't able to analyze the branches or there
// were no branches with unoccupied slots.
@@ -888,8 +907,9 @@ bool Filler::examinePred(MachineBasicBlock &Pred, const MachineBasicBlock &Succ,
return true;
}
-bool Filler::delayHasHazard(const MachineInstr &Candidate, RegDefsUses &RegDU,
- InspectMemInstr &IM) const {
+bool MipsDelaySlotFiller::delayHasHazard(const MachineInstr &Candidate,
+ RegDefsUses &RegDU,
+ InspectMemInstr &IM) const {
assert(!Candidate.isKill() &&
"KILL instructions should have been eliminated at this point.");
@@ -901,7 +921,7 @@ bool Filler::delayHasHazard(const MachineInstr &Candidate, RegDefsUses &RegDU,
return HasHazard;
}
-bool Filler::terminateSearch(const MachineInstr &Candidate) const {
+bool MipsDelaySlotFiller::terminateSearch(const MachineInstr &Candidate) const {
return (Candidate.isTerminator() || Candidate.isCall() ||
Candidate.isPosition() || Candidate.isInlineAsm() ||
Candidate.hasUnmodeledSideEffects());
@@ -909,4 +929,4 @@ bool Filler::terminateSearch(const MachineInstr &Candidate) const {
/// createMipsDelaySlotFillerPass - Returns a pass that fills in delay
/// slots in Mips MachineFunctions
-FunctionPass *llvm::createMipsDelaySlotFillerPass() { return new Filler(); }
+FunctionPass *llvm::createMipsDelaySlotFillerPass() { return new MipsDelaySlotFiller(); }
diff --git a/lib/Target/Mips/MipsEVAInstrFormats.td b/lib/Target/Mips/MipsEVAInstrFormats.td
index 8c3024810d27..61785d0e891a 100644
--- a/lib/Target/Mips/MipsEVAInstrFormats.td
+++ b/lib/Target/Mips/MipsEVAInstrFormats.td
@@ -12,7 +12,7 @@
//===----------------------------------------------------------------------===//
class MipsEVAInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>,
- PredicateControl, StdArch {
+ StdArch {
let DecoderNamespace = "Mips";
let EncodingPredicates = [HasStdEnc];
}
diff --git a/lib/Target/Mips/MipsEVAInstrInfo.td b/lib/Target/Mips/MipsEVAInstrInfo.td
index 26df263d228b..ff54b1f17877 100644
--- a/lib/Target/Mips/MipsEVAInstrInfo.td
+++ b/lib/Target/Mips/MipsEVAInstrInfo.td
@@ -59,6 +59,7 @@ class LOAD_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
list<dag> Pattern = [];
string DecoderMethod = "DecodeMemEVA";
bit canFoldAsLoad = 1;
+ string BaseOpcode = instr_asm;
bit mayLoad = 1;
InstrItinClass Itinerary = itin;
}
@@ -77,6 +78,7 @@ class STORE_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
list<dag> Pattern = [];
string DecoderMethod = "DecodeMemEVA";
+ string BaseOpcode = instr_asm;
bit mayStore = 1;
InstrItinClass Itinerary = itin;
}
@@ -93,13 +95,16 @@ class LOAD_LEFT_RIGHT_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
list<dag> Pattern = [];
string DecoderMethod = "DecodeMemEVA";
+ string BaseOpcode = instr_asm;
string Constraints = "$src = $rt";
bit canFoldAsLoad = 1;
InstrItinClass Itinerary = itin;
+ bit mayLoad = 1;
+ bit mayStore = 0;
}
-class LWLE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"lwle", GPR32Opnd, II_LWLE>;
-class LWRE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"lwre", GPR32Opnd, II_LWRE>;
+class LWLE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"lwle", GPR32Opnd, II_LWLE>;
+class LWRE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"lwre", GPR32Opnd, II_LWRE>;
class STORE_LEFT_RIGHT_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
InstrItinClass itin = NoItinerary> {
@@ -108,11 +113,14 @@ class STORE_LEFT_RIGHT_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
list<dag> Pattern = [];
string DecoderMethod = "DecodeMemEVA";
+ string BaseOpcode = instr_asm;
InstrItinClass Itinerary = itin;
+ bit mayLoad = 0;
+ bit mayStore = 1;
}
-class SWLE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"swle", GPR32Opnd, II_SWLE>;
-class SWRE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"swre", GPR32Opnd, II_SWRE>;
+class SWLE_DESC : STORE_LEFT_RIGHT_EVA_DESC_BASE<"swle", GPR32Opnd, II_SWLE>;
+class SWRE_DESC : STORE_LEFT_RIGHT_EVA_DESC_BASE<"swre", GPR32Opnd, II_SWRE>;
// Load-linked EVA, Store-conditional EVA descriptions
class LLE_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
@@ -121,6 +129,7 @@ class LLE_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
dag InOperandList = (ins mem_simm9:$addr);
string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
list<dag> Pattern = [];
+ string BaseOpcode = instr_asm;
bit mayLoad = 1;
string DecoderMethod = "DecodeMemEVA";
InstrItinClass Itinerary = itin;
@@ -134,6 +143,7 @@ class SCE_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr);
string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
list<dag> Pattern = [];
+ string BaseOpcode = instr_asm;
bit mayStore = 1;
string Constraints = "$rt = $dst";
string DecoderMethod = "DecodeMemEVA";
@@ -159,6 +169,7 @@ class CACHEE_DESC_BASE<string instr_asm, Operand MemOpnd,
dag InOperandList = (ins MemOpnd:$addr, uimm5:$hint);
string AsmString = !strconcat(instr_asm, "\t$hint, $addr");
list<dag> Pattern = [];
+ string BaseOpcode = instr_asm;
string DecoderMethod = "DecodeCacheeOp_CacheOpR6";
InstrItinClass Itinerary = itin;
}
@@ -172,38 +183,32 @@ class PREFE_DESC : CACHEE_DESC_BASE<"prefe", mem_simm9, II_PREFE>;
//
//===----------------------------------------------------------------------===//
-/// Load and Store EVA Instructions
-def LBE : LBE_ENC, LBE_DESC, INSN_EVA;
-def LBuE : LBuE_ENC, LBuE_DESC, INSN_EVA;
-def LHE : LHE_ENC, LHE_DESC, INSN_EVA;
-def LHuE : LHuE_ENC, LHuE_DESC, INSN_EVA;
-let AdditionalPredicates = [NotInMicroMips] in {
-def LWE : LWE_ENC, LWE_DESC, INSN_EVA;
-}
-def SBE : SBE_ENC, SBE_DESC, INSN_EVA;
-def SHE : SHE_ENC, SHE_DESC, INSN_EVA;
-let AdditionalPredicates = [NotInMicroMips] in {
-def SWE : SWE_ENC, SWE_DESC, INSN_EVA;
-}
-
-/// load/store left/right EVA
let AdditionalPredicates = [NotInMicroMips] in {
-def LWLE : LWLE_ENC, LWLE_DESC, INSN_EVA_NOT_32R6_64R6;
-def LWRE : LWRE_ENC, LWRE_DESC, INSN_EVA_NOT_32R6_64R6;
-def SWLE : SWLE_ENC, SWLE_DESC, INSN_EVA_NOT_32R6_64R6;
-def SWRE : SWRE_ENC, SWRE_DESC, INSN_EVA_NOT_32R6_64R6;
+ /// Load and Store EVA Instructions
+ def LBE : MMRel, LBE_ENC, LBE_DESC, ISA_MIPS32R2, ASE_EVA;
+ def LBuE : MMRel, LBuE_ENC, LBuE_DESC, ISA_MIPS32R2, ASE_EVA;
+ def LHE : MMRel, LHE_ENC, LHE_DESC, ISA_MIPS32R2, ASE_EVA;
+ def LHuE : MMRel, LHuE_ENC, LHuE_DESC, ISA_MIPS32R2, ASE_EVA;
+ def LWE : MMRel, LWE_ENC, LWE_DESC, ISA_MIPS32R2, ASE_EVA;
+ def SBE : MMRel, SBE_ENC, SBE_DESC, ISA_MIPS32R2, ASE_EVA;
+ def SHE : MMRel, SHE_ENC, SHE_DESC, ISA_MIPS32R2, ASE_EVA;
+ def SWE : MMRel, SWE_ENC, SWE_DESC, ISA_MIPS32R2, ASE_EVA;
+
+ /// load/store left/right EVA
+ def LWLE : MMRel, LWLE_ENC, LWLE_DESC, ISA_MIPS32R2_NOT_32R6_64R6, ASE_EVA;
+ def LWRE : MMRel, LWRE_ENC, LWRE_DESC, ISA_MIPS32R2_NOT_32R6_64R6, ASE_EVA;
+ def SWLE : MMRel, SWLE_ENC, SWLE_DESC, ISA_MIPS32R2_NOT_32R6_64R6, ASE_EVA;
+ def SWRE : MMRel, SWRE_ENC, SWRE_DESC, ISA_MIPS32R2_NOT_32R6_64R6, ASE_EVA;
+
+ /// Load-linked EVA, Store-conditional EVA
+ def LLE : MMRel, LLE_ENC, LLE_DESC, ISA_MIPS32R2, ASE_EVA;
+ def SCE : MMRel, SCE_ENC, SCE_DESC, ISA_MIPS32R2, ASE_EVA;
+
+ /// TLB invalidate instructions
+ def TLBINV : TLBINV_ENC, TLBINV_DESC, ISA_MIPS32R2, ASE_EVA;
+ def TLBINVF : TLBINVF_ENC, TLBINVF_DESC, ISA_MIPS32R2, ASE_EVA;
+
+ /// EVA versions of cache and pref
+ def CACHEE : MMRel, CACHEE_ENC, CACHEE_DESC, ISA_MIPS32R2, ASE_EVA;
+ def PREFE : MMRel, PREFE_ENC, PREFE_DESC, ISA_MIPS32R2, ASE_EVA;
}
-
-/// Load-linked EVA, Store-conditional EVA
-let AdditionalPredicates = [NotInMicroMips] in {
-def LLE : LLE_ENC, LLE_DESC, INSN_EVA;
-def SCE : SCE_ENC, SCE_DESC, INSN_EVA;
-}
-
-let AdditionalPredicates = [NotInMicroMips] in {
- def TLBINV : TLBINV_ENC, TLBINV_DESC, INSN_EVA;
- def TLBINVF : TLBINVF_ENC, TLBINVF_DESC, INSN_EVA;
-}
-
-def CACHEE : CACHEE_ENC, CACHEE_DESC, INSN_EVA;
-def PREFE : PREFE_ENC, PREFE_DESC, INSN_EVA;
diff --git a/lib/Target/Mips/MipsExpandPseudo.cpp b/lib/Target/Mips/MipsExpandPseudo.cpp
new file mode 100644
index 000000000000..acf66d1fb1b2
--- /dev/null
+++ b/lib/Target/Mips/MipsExpandPseudo.cpp
@@ -0,0 +1,702 @@
+//===-- MipsExpandPseudoInsts.cpp - Expand pseudo instructions ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that expands pseudo instructions into target
+// instructions to allow proper scheduling, if-conversion, and other late
+// optimizations. This pass should be run after register allocation but before
+// the post-regalloc scheduling pass.
+//
+// This is currently only used for expanding atomic pseudos after register
+// allocation. We do this to avoid the fast register allocator introducing
+// spills between ll and sc. These stores cause some MIPS implementations to
+// abort the atomic RMW sequence.
+//
+//===----------------------------------------------------------------------===//
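+//
+// As a rough sketch, an expanded 32-bit atomic add becomes a retry loop of
+// the form:
+//
+//   $loop:
+//     ll    $old, 0($ptr)         # load-linked
+//     addu  $new, $old, $incr
+//     sc    $new, 0($ptr)         # store-conditional; writes 1/0 for success
+//     beq   $new, $zero, $loop    # retry until the sc succeeds
+//
+// Any spill or reload placed between the ll and the sc would introduce an
+// extra memory access inside that window, which is what this pass avoids by
+// expanding the pseudos only after register allocation.
+//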
+
+#include "Mips.h"
+#include "MipsInstrInfo.h"
+#include "MipsSubtarget.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-pseudo"
+
+namespace {
+ class MipsExpandPseudo : public MachineFunctionPass {
+ public:
+ static char ID;
+ MipsExpandPseudo() : MachineFunctionPass(ID) {}
+
+ const MipsInstrInfo *TII;
+ const MipsSubtarget *STI;
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ StringRef getPassName() const override {
+ return "Mips pseudo instruction expansion pass";
+ }
+
+ private:
+ bool expandAtomicCmpSwap(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI);
+ bool expandAtomicCmpSwapSubword(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI);
+
+ bool expandAtomicBinOp(MachineBasicBlock &BB,
+ MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator &NMBBI, unsigned Size);
+ bool expandAtomicBinOpSubword(MachineBasicBlock &BB,
+ MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator &NMBBI);
+
+ bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NMBB);
+ bool expandMBB(MachineBasicBlock &MBB);
+ };
+ char MipsExpandPseudo::ID = 0;
+}
+
+bool MipsExpandPseudo::expandAtomicCmpSwapSubword(
+ MachineBasicBlock &BB, MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator &NMBBI) {
+
+ MachineFunction *MF = BB.getParent();
+
+ const bool ArePtrs64bit = STI->getABI().ArePtrs64bit();
+ DebugLoc DL = I->getDebugLoc();
+ unsigned LL, SC;
+
+ unsigned ZERO = Mips::ZERO;
+ unsigned BNE = Mips::BNE;
+ unsigned BEQ = Mips::BEQ;
+ unsigned SEOp =
+ I->getOpcode() == Mips::ATOMIC_CMP_SWAP_I8_POSTRA ? Mips::SEB : Mips::SEH;
+
+ if (STI->inMicroMipsMode()) {
+ LL = STI->hasMips32r6() ? Mips::LL_MMR6 : Mips::LL_MM;
+ SC = STI->hasMips32r6() ? Mips::SC_MMR6 : Mips::SC_MM;
+ BNE = STI->hasMips32r6() ? Mips::BNEC_MMR6 : Mips::BNE_MM;
+ BEQ = STI->hasMips32r6() ? Mips::BEQC_MMR6 : Mips::BEQ_MM;
+ } else {
+ LL = STI->hasMips32r6() ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
+ : (ArePtrs64bit ? Mips::LL64 : Mips::LL);
+ SC = STI->hasMips32r6() ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
+ : (ArePtrs64bit ? Mips::SC64 : Mips::SC);
+ }
+
+ unsigned Dest = I->getOperand(0).getReg();
+ unsigned Ptr = I->getOperand(1).getReg();
+ unsigned Mask = I->getOperand(2).getReg();
+ unsigned ShiftCmpVal = I->getOperand(3).getReg();
+ unsigned Mask2 = I->getOperand(4).getReg();
+ unsigned ShiftNewVal = I->getOperand(5).getReg();
+ unsigned ShiftAmnt = I->getOperand(6).getReg();
+ unsigned Scratch = I->getOperand(7).getReg();
+ unsigned Scratch2 = I->getOperand(8).getReg();
+
+ // insert new blocks after the current block
+ const BasicBlock *LLVM_BB = BB.getBasicBlock();
+ MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineFunction::iterator It = ++BB.getIterator();
+ MF->insert(It, loop1MBB);
+ MF->insert(It, loop2MBB);
+ MF->insert(It, sinkMBB);
+ MF->insert(It, exitMBB);
+
+ // Transfer the remainder of BB and its successor edges to exitMBB.
+ exitMBB->splice(exitMBB->begin(), &BB,
+ std::next(MachineBasicBlock::iterator(I)), BB.end());
+ exitMBB->transferSuccessorsAndUpdatePHIs(&BB);
+
+ // thisMBB:
+ // ...
+ // fallthrough --> loop1MBB
+ BB.addSuccessor(loop1MBB, BranchProbability::getOne());
+ loop1MBB->addSuccessor(sinkMBB);
+ loop1MBB->addSuccessor(loop2MBB);
+ loop1MBB->normalizeSuccProbs();
+ loop2MBB->addSuccessor(loop1MBB);
+ loop2MBB->addSuccessor(sinkMBB);
+ loop2MBB->normalizeSuccProbs();
+ sinkMBB->addSuccessor(exitMBB, BranchProbability::getOne());
+
+ // loop1MBB:
+ // ll dest, 0(ptr)
+ // and Mask', dest, Mask
+ // bne Mask', ShiftCmpVal, sinkMBB
+ BuildMI(loop1MBB, DL, TII->get(LL), Scratch).addReg(Ptr).addImm(0);
+ BuildMI(loop1MBB, DL, TII->get(Mips::AND), Scratch2)
+ .addReg(Scratch)
+ .addReg(Mask);
+ BuildMI(loop1MBB, DL, TII->get(BNE))
+ .addReg(Scratch2).addReg(ShiftCmpVal).addMBB(sinkMBB);
+
+ // loop2MBB:
+ // and dest, dest, mask2
+ // or dest, dest, ShiftNewVal
+ // sc dest, dest, 0(ptr)
+ // beq dest, $0, loop1MBB
+ BuildMI(loop2MBB, DL, TII->get(Mips::AND), Scratch)
+ .addReg(Scratch, RegState::Kill)
+ .addReg(Mask2);
+ BuildMI(loop2MBB, DL, TII->get(Mips::OR), Scratch)
+ .addReg(Scratch, RegState::Kill)
+ .addReg(ShiftNewVal);
+ BuildMI(loop2MBB, DL, TII->get(SC), Scratch)
+ .addReg(Scratch, RegState::Kill)
+ .addReg(Ptr)
+ .addImm(0);
+ BuildMI(loop2MBB, DL, TII->get(BEQ))
+ .addReg(Scratch, RegState::Kill)
+ .addReg(ZERO)
+ .addMBB(loop1MBB);
+
+ // sinkMBB:
+ // srl srlres, Mask', shiftamt
+ // sign_extend dest,srlres
+ BuildMI(sinkMBB, DL, TII->get(Mips::SRLV), Dest)
+ .addReg(Scratch2)
+ .addReg(ShiftAmnt);
+ if (STI->hasMips32r2()) {
+ BuildMI(sinkMBB, DL, TII->get(SEOp), Dest).addReg(Dest);
+ } else {
+ const unsigned ShiftImm =
+ I->getOpcode() == Mips::ATOMIC_CMP_SWAP_I16_POSTRA ? 16 : 24;
+ BuildMI(sinkMBB, DL, TII->get(Mips::SLL), Dest)
+ .addReg(Dest, RegState::Kill)
+ .addImm(ShiftImm);
+ BuildMI(sinkMBB, DL, TII->get(Mips::SRA), Dest)
+ .addReg(Dest, RegState::Kill)
+ .addImm(ShiftImm);
+ }
+
+ LivePhysRegs LiveRegs;
+ computeAndAddLiveIns(LiveRegs, *loop1MBB);
+ computeAndAddLiveIns(LiveRegs, *loop2MBB);
+ computeAndAddLiveIns(LiveRegs, *sinkMBB);
+ computeAndAddLiveIns(LiveRegs, *exitMBB);
+
+ NMBBI = BB.end();
+ I->eraseFromParent();
+ return true;
+}
+
+bool MipsExpandPseudo::expandAtomicCmpSwap(MachineBasicBlock &BB,
+ MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator &NMBBI) {
+
+ const unsigned Size =
+ I->getOpcode() == Mips::ATOMIC_CMP_SWAP_I32_POSTRA ? 4 : 8;
+ MachineFunction *MF = BB.getParent();
+
+ const bool ArePtrs64bit = STI->getABI().ArePtrs64bit();
+ DebugLoc DL = I->getDebugLoc();
+
+ unsigned LL, SC, ZERO, BNE, BEQ, MOVE;
+
+ if (Size == 4) {
+ if (STI->inMicroMipsMode()) {
+ LL = STI->hasMips32r6() ? Mips::LL_MMR6 : Mips::LL_MM;
+ SC = STI->hasMips32r6() ? Mips::SC_MMR6 : Mips::SC_MM;
+ BNE = STI->hasMips32r6() ? Mips::BNEC_MMR6 : Mips::BNE_MM;
+ BEQ = STI->hasMips32r6() ? Mips::BEQC_MMR6 : Mips::BEQ_MM;
+ } else {
+ LL = STI->hasMips32r6()
+ ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
+ : (ArePtrs64bit ? Mips::LL64 : Mips::LL);
+ SC = STI->hasMips32r6()
+ ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
+ : (ArePtrs64bit ? Mips::SC64 : Mips::SC);
+ BNE = Mips::BNE;
+ BEQ = Mips::BEQ;
+ }
+
+ ZERO = Mips::ZERO;
+ MOVE = Mips::OR;
+ } else {
+ LL = STI->hasMips64r6() ? Mips::LLD_R6 : Mips::LLD;
+ SC = STI->hasMips64r6() ? Mips::SCD_R6 : Mips::SCD;
+ ZERO = Mips::ZERO_64;
+ BNE = Mips::BNE64;
+ BEQ = Mips::BEQ64;
+ MOVE = Mips::OR64;
+ }
+
+ unsigned Dest = I->getOperand(0).getReg();
+ unsigned Ptr = I->getOperand(1).getReg();
+ unsigned OldVal = I->getOperand(2).getReg();
+ unsigned NewVal = I->getOperand(3).getReg();
+ unsigned Scratch = I->getOperand(4).getReg();
+
+ // insert new blocks after the current block
+ const BasicBlock *LLVM_BB = BB.getBasicBlock();
+ MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineFunction::iterator It = ++BB.getIterator();
+ MF->insert(It, loop1MBB);
+ MF->insert(It, loop2MBB);
+ MF->insert(It, exitMBB);
+
+ // Transfer the remainder of BB and its successor edges to exitMBB.
+ exitMBB->splice(exitMBB->begin(), &BB,
+ std::next(MachineBasicBlock::iterator(I)), BB.end());
+ exitMBB->transferSuccessorsAndUpdatePHIs(&BB);
+
+ // thisMBB:
+ // ...
+ // fallthrough --> loop1MBB
+ BB.addSuccessor(loop1MBB, BranchProbability::getOne());
+ loop1MBB->addSuccessor(exitMBB);
+ loop1MBB->addSuccessor(loop2MBB);
+ loop1MBB->normalizeSuccProbs();
+ loop2MBB->addSuccessor(loop1MBB);
+ loop2MBB->addSuccessor(exitMBB);
+ loop2MBB->normalizeSuccProbs();
+
+ // loop1MBB:
+ // ll dest, 0(ptr)
+ // bne dest, oldval, exitMBB
+ BuildMI(loop1MBB, DL, TII->get(LL), Dest).addReg(Ptr).addImm(0);
+ BuildMI(loop1MBB, DL, TII->get(BNE))
+ .addReg(Dest, RegState::Kill).addReg(OldVal).addMBB(exitMBB);
+
+ // loop2MBB:
+ // move scratch, NewVal
+ // sc Scratch, Scratch, 0(ptr)
+ // beq Scratch, $0, loop1MBB
+ BuildMI(loop2MBB, DL, TII->get(MOVE), Scratch).addReg(NewVal).addReg(ZERO);
+ BuildMI(loop2MBB, DL, TII->get(SC), Scratch)
+ .addReg(Scratch).addReg(Ptr).addImm(0);
+ BuildMI(loop2MBB, DL, TII->get(BEQ))
+ .addReg(Scratch, RegState::Kill).addReg(ZERO).addMBB(loop1MBB);
+
+ LivePhysRegs LiveRegs;
+ computeAndAddLiveIns(LiveRegs, *loop1MBB);
+ computeAndAddLiveIns(LiveRegs, *loop2MBB);
+ computeAndAddLiveIns(LiveRegs, *exitMBB);
+
+ NMBBI = BB.end();
+ I->eraseFromParent();
+ return true;
+}
+
+bool MipsExpandPseudo::expandAtomicBinOpSubword(
+ MachineBasicBlock &BB, MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator &NMBBI) {
+
+ MachineFunction *MF = BB.getParent();
+
+ const bool ArePtrs64bit = STI->getABI().ArePtrs64bit();
+ DebugLoc DL = I->getDebugLoc();
+
+ unsigned LL, SC;
+ unsigned BEQ = Mips::BEQ;
+ unsigned SEOp = Mips::SEH;
+
+ if (STI->inMicroMipsMode()) {
+ LL = STI->hasMips32r6() ? Mips::LL_MMR6 : Mips::LL_MM;
+ SC = STI->hasMips32r6() ? Mips::SC_MMR6 : Mips::SC_MM;
+ BEQ = STI->hasMips32r6() ? Mips::BEQC_MMR6 : Mips::BEQ_MM;
+ } else {
+ LL = STI->hasMips32r6() ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
+ : (ArePtrs64bit ? Mips::LL64 : Mips::LL);
+ SC = STI->hasMips32r6() ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
+ : (ArePtrs64bit ? Mips::SC64 : Mips::SC);
+ }
+
+ bool IsSwap = false;
+ bool IsNand = false;
+
+ unsigned Opcode = 0;
+ switch (I->getOpcode()) {
+ case Mips::ATOMIC_LOAD_NAND_I8_POSTRA:
+ SEOp = Mips::SEB;
+ LLVM_FALLTHROUGH;
+ case Mips::ATOMIC_LOAD_NAND_I16_POSTRA:
+ IsNand = true;
+ break;
+ case Mips::ATOMIC_SWAP_I8_POSTRA:
+ SEOp = Mips::SEB;
+ LLVM_FALLTHROUGH;
+ case Mips::ATOMIC_SWAP_I16_POSTRA:
+ IsSwap = true;
+ break;
+ case Mips::ATOMIC_LOAD_ADD_I8_POSTRA:
+ SEOp = Mips::SEB;
+ LLVM_FALLTHROUGH;
+ case Mips::ATOMIC_LOAD_ADD_I16_POSTRA:
+ Opcode = Mips::ADDu;
+ break;
+ case Mips::ATOMIC_LOAD_SUB_I8_POSTRA:
+ SEOp = Mips::SEB;
+ LLVM_FALLTHROUGH;
+ case Mips::ATOMIC_LOAD_SUB_I16_POSTRA:
+ Opcode = Mips::SUBu;
+ break;
+ case Mips::ATOMIC_LOAD_AND_I8_POSTRA:
+ SEOp = Mips::SEB;
+ LLVM_FALLTHROUGH;
+ case Mips::ATOMIC_LOAD_AND_I16_POSTRA:
+ Opcode = Mips::AND;
+ break;
+ case Mips::ATOMIC_LOAD_OR_I8_POSTRA:
+ SEOp = Mips::SEB;
+ LLVM_FALLTHROUGH;
+ case Mips::ATOMIC_LOAD_OR_I16_POSTRA:
+ Opcode = Mips::OR;
+ break;
+ case Mips::ATOMIC_LOAD_XOR_I8_POSTRA:
+ SEOp = Mips::SEB;
+ LLVM_FALLTHROUGH;
+ case Mips::ATOMIC_LOAD_XOR_I16_POSTRA:
+ Opcode = Mips::XOR;
+ break;
+ default:
+ llvm_unreachable("Unknown subword atomic pseudo for expansion!");
+ }
+
+ unsigned Dest = I->getOperand(0).getReg();
+ unsigned Ptr = I->getOperand(1).getReg();
+ unsigned Incr = I->getOperand(2).getReg();
+ unsigned Mask = I->getOperand(3).getReg();
+ unsigned Mask2 = I->getOperand(4).getReg();
+ unsigned ShiftAmnt = I->getOperand(5).getReg();
+ unsigned OldVal = I->getOperand(6).getReg();
+ unsigned BinOpRes = I->getOperand(7).getReg();
+ unsigned StoreVal = I->getOperand(8).getReg();
+
+ const BasicBlock *LLVM_BB = BB.getBasicBlock();
+ MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineFunction::iterator It = ++BB.getIterator();
+ MF->insert(It, loopMBB);
+ MF->insert(It, sinkMBB);
+ MF->insert(It, exitMBB);
+
+ exitMBB->splice(exitMBB->begin(), &BB, std::next(I), BB.end());
+ exitMBB->transferSuccessorsAndUpdatePHIs(&BB);
+
+ BB.addSuccessor(loopMBB, BranchProbability::getOne());
+ loopMBB->addSuccessor(sinkMBB);
+ loopMBB->addSuccessor(loopMBB);
+ loopMBB->normalizeSuccProbs();
+
+ BuildMI(loopMBB, DL, TII->get(LL), OldVal).addReg(Ptr).addImm(0);
+ if (IsNand) {
+ // and andres, oldval, incr2
+ // nor binopres, $0, andres
+ // and newval, binopres, mask
+ BuildMI(loopMBB, DL, TII->get(Mips::AND), BinOpRes)
+ .addReg(OldVal)
+ .addReg(Incr);
+ BuildMI(loopMBB, DL, TII->get(Mips::NOR), BinOpRes)
+ .addReg(Mips::ZERO)
+ .addReg(BinOpRes);
+ BuildMI(loopMBB, DL, TII->get(Mips::AND), BinOpRes)
+ .addReg(BinOpRes)
+ .addReg(Mask);
+ } else if (!IsSwap) {
+ // <binop> binopres, oldval, incr2
+ // and newval, binopres, mask
+ BuildMI(loopMBB, DL, TII->get(Opcode), BinOpRes)
+ .addReg(OldVal)
+ .addReg(Incr);
+ BuildMI(loopMBB, DL, TII->get(Mips::AND), BinOpRes)
+ .addReg(BinOpRes)
+ .addReg(Mask);
+ } else { // atomic.swap
+ // and newval, incr2, mask
+ BuildMI(loopMBB, DL, TII->get(Mips::AND), BinOpRes)
+ .addReg(Incr)
+ .addReg(Mask);
+ }
+
+ // and StoreVal, OldVal, Mask2
+ // or StoreVal, StoreVal, BinOpRes
+ // StoreVal<tied1> = sc StoreVal, 0(Ptr)
+ // beq StoreVal, zero, loopMBB
+ BuildMI(loopMBB, DL, TII->get(Mips::AND), StoreVal)
+ .addReg(OldVal).addReg(Mask2);
+ BuildMI(loopMBB, DL, TII->get(Mips::OR), StoreVal)
+ .addReg(StoreVal).addReg(BinOpRes);
+ BuildMI(loopMBB, DL, TII->get(SC), StoreVal)
+ .addReg(StoreVal).addReg(Ptr).addImm(0);
+ BuildMI(loopMBB, DL, TII->get(BEQ))
+ .addReg(StoreVal).addReg(Mips::ZERO).addMBB(loopMBB);
+
+ // sinkMBB:
+ // and maskedoldval1,oldval,mask
+ // srl srlres,maskedoldval1,shiftamt
+ // sign_extend dest,srlres
+
+ sinkMBB->addSuccessor(exitMBB, BranchProbability::getOne());
+
+ BuildMI(sinkMBB, DL, TII->get(Mips::AND), Dest)
+ .addReg(OldVal).addReg(Mask);
+ BuildMI(sinkMBB, DL, TII->get(Mips::SRLV), Dest)
+ .addReg(Dest).addReg(ShiftAmnt);
+
+ if (STI->hasMips32r2()) {
+ BuildMI(sinkMBB, DL, TII->get(SEOp), Dest).addReg(Dest);
+ } else {
+ const unsigned ShiftImm = SEOp == Mips::SEH ? 16 : 24;
+ BuildMI(sinkMBB, DL, TII->get(Mips::SLL), Dest)
+ .addReg(Dest, RegState::Kill)
+ .addImm(ShiftImm);
+ BuildMI(sinkMBB, DL, TII->get(Mips::SRA), Dest)
+ .addReg(Dest, RegState::Kill)
+ .addImm(ShiftImm);
+ }
+
+ LivePhysRegs LiveRegs;
+ computeAndAddLiveIns(LiveRegs, *loopMBB);
+ computeAndAddLiveIns(LiveRegs, *sinkMBB);
+ computeAndAddLiveIns(LiveRegs, *exitMBB);
+
+ NMBBI = BB.end();
+ I->eraseFromParent();
+
+ return true;
+}
+
+bool MipsExpandPseudo::expandAtomicBinOp(MachineBasicBlock &BB,
+ MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator &NMBBI,
+ unsigned Size) {
+ MachineFunction *MF = BB.getParent();
+
+ const bool ArePtrs64bit = STI->getABI().ArePtrs64bit();
+ DebugLoc DL = I->getDebugLoc();
+
+ unsigned LL, SC, ZERO, BEQ;
+
+ if (Size == 4) {
+ if (STI->inMicroMipsMode()) {
+ LL = STI->hasMips32r6() ? Mips::LL_MMR6 : Mips::LL_MM;
+ SC = STI->hasMips32r6() ? Mips::SC_MMR6 : Mips::SC_MM;
+ BEQ = STI->hasMips32r6() ? Mips::BEQC_MMR6 : Mips::BEQ_MM;
+ } else {
+ LL = STI->hasMips32r6()
+ ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
+ : (ArePtrs64bit ? Mips::LL64 : Mips::LL);
+ SC = STI->hasMips32r6()
+ ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
+ : (ArePtrs64bit ? Mips::SC64 : Mips::SC);
+ BEQ = Mips::BEQ;
+ }
+
+ ZERO = Mips::ZERO;
+ } else {
+ LL = STI->hasMips64r6() ? Mips::LLD_R6 : Mips::LLD;
+ SC = STI->hasMips64r6() ? Mips::SCD_R6 : Mips::SCD;
+ ZERO = Mips::ZERO_64;
+ BEQ = Mips::BEQ64;
+ }
+
+ unsigned OldVal = I->getOperand(0).getReg();
+ unsigned Ptr = I->getOperand(1).getReg();
+ unsigned Incr = I->getOperand(2).getReg();
+ unsigned Scratch = I->getOperand(3).getReg();
+
+ unsigned Opcode = 0;
+ unsigned OR = 0;
+ unsigned AND = 0;
+ unsigned NOR = 0;
+ bool IsNand = false;
+ switch (I->getOpcode()) {
+ case Mips::ATOMIC_LOAD_ADD_I32_POSTRA:
+ Opcode = Mips::ADDu;
+ break;
+ case Mips::ATOMIC_LOAD_SUB_I32_POSTRA:
+ Opcode = Mips::SUBu;
+ break;
+ case Mips::ATOMIC_LOAD_AND_I32_POSTRA:
+ Opcode = Mips::AND;
+ break;
+ case Mips::ATOMIC_LOAD_OR_I32_POSTRA:
+ Opcode = Mips::OR;
+ break;
+ case Mips::ATOMIC_LOAD_XOR_I32_POSTRA:
+ Opcode = Mips::XOR;
+ break;
+ case Mips::ATOMIC_LOAD_NAND_I32_POSTRA:
+ IsNand = true;
+ AND = Mips::AND;
+ NOR = Mips::NOR;
+ break;
+ case Mips::ATOMIC_SWAP_I32_POSTRA:
+ OR = Mips::OR;
+ break;
+ case Mips::ATOMIC_LOAD_ADD_I64_POSTRA:
+ Opcode = Mips::DADDu;
+ break;
+ case Mips::ATOMIC_LOAD_SUB_I64_POSTRA:
+ Opcode = Mips::DSUBu;
+ break;
+ case Mips::ATOMIC_LOAD_AND_I64_POSTRA:
+ Opcode = Mips::AND64;
+ break;
+ case Mips::ATOMIC_LOAD_OR_I64_POSTRA:
+ Opcode = Mips::OR64;
+ break;
+ case Mips::ATOMIC_LOAD_XOR_I64_POSTRA:
+ Opcode = Mips::XOR64;
+ break;
+ case Mips::ATOMIC_LOAD_NAND_I64_POSTRA:
+ IsNand = true;
+ AND = Mips::AND64;
+ NOR = Mips::NOR64;
+ break;
+ case Mips::ATOMIC_SWAP_I64_POSTRA:
+ OR = Mips::OR64;
+ break;
+ default:
+ llvm_unreachable("Unknown pseudo atomic!");
+ }
+
+ const BasicBlock *LLVM_BB = BB.getBasicBlock();
+ MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineFunction::iterator It = ++BB.getIterator();
+ MF->insert(It, loopMBB);
+ MF->insert(It, exitMBB);
+
+ exitMBB->splice(exitMBB->begin(), &BB, std::next(I), BB.end());
+ exitMBB->transferSuccessorsAndUpdatePHIs(&BB);
+
+ BB.addSuccessor(loopMBB, BranchProbability::getOne());
+ loopMBB->addSuccessor(exitMBB);
+ loopMBB->addSuccessor(loopMBB);
+ loopMBB->normalizeSuccProbs();
+
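+ // loopMBB (retried until the sc succeeds):
+ //   ll oldval, 0(ptr)
+ //   <binop> scratch, oldval, incr   (and + nor for nand, a move for swap)
+ //   sc scratch, 0(ptr)
+ //   beq scratch, $0, loopMBB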
+ BuildMI(loopMBB, DL, TII->get(LL), OldVal).addReg(Ptr).addImm(0);
+ assert((OldVal != Ptr) && "Clobbered the wrong ptr reg!");
+ assert((OldVal != Incr) && "Clobbered the wrong reg!");
+ if (Opcode) {
+ BuildMI(loopMBB, DL, TII->get(Opcode), Scratch).addReg(OldVal).addReg(Incr);
+ } else if (IsNand) {
+ assert(AND && NOR &&
+ "Unknown nand instruction for atomic pseudo expansion");
+ BuildMI(loopMBB, DL, TII->get(AND), Scratch).addReg(OldVal).addReg(Incr);
+ BuildMI(loopMBB, DL, TII->get(NOR), Scratch).addReg(ZERO).addReg(Scratch);
+ } else {
+ assert(OR && "Unknown instruction for atomic pseudo expansion!");
+ BuildMI(loopMBB, DL, TII->get(OR), Scratch).addReg(Incr).addReg(ZERO);
+ }
+
+ BuildMI(loopMBB, DL, TII->get(SC), Scratch).addReg(Scratch).addReg(Ptr).addImm(0);
+ BuildMI(loopMBB, DL, TII->get(BEQ)).addReg(Scratch).addReg(ZERO).addMBB(loopMBB);
+
+ NMBBI = BB.end();
+ I->eraseFromParent();
+
+ LivePhysRegs LiveRegs;
+ computeAndAddLiveIns(LiveRegs, *loopMBB);
+ computeAndAddLiveIns(LiveRegs, *exitMBB);
+
+ return true;
+}
+
+bool MipsExpandPseudo::expandMI(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NMBB) {
+
+ bool Modified = false;
+
+ switch (MBBI->getOpcode()) {
+ case Mips::ATOMIC_CMP_SWAP_I32_POSTRA:
+ case Mips::ATOMIC_CMP_SWAP_I64_POSTRA:
+ return expandAtomicCmpSwap(MBB, MBBI, NMBB);
+ case Mips::ATOMIC_CMP_SWAP_I8_POSTRA:
+ case Mips::ATOMIC_CMP_SWAP_I16_POSTRA:
+ return expandAtomicCmpSwapSubword(MBB, MBBI, NMBB);
+ case Mips::ATOMIC_SWAP_I8_POSTRA:
+ case Mips::ATOMIC_SWAP_I16_POSTRA:
+ case Mips::ATOMIC_LOAD_NAND_I8_POSTRA:
+ case Mips::ATOMIC_LOAD_NAND_I16_POSTRA:
+ case Mips::ATOMIC_LOAD_ADD_I8_POSTRA:
+ case Mips::ATOMIC_LOAD_ADD_I16_POSTRA:
+ case Mips::ATOMIC_LOAD_SUB_I8_POSTRA:
+ case Mips::ATOMIC_LOAD_SUB_I16_POSTRA:
+ case Mips::ATOMIC_LOAD_AND_I8_POSTRA:
+ case Mips::ATOMIC_LOAD_AND_I16_POSTRA:
+ case Mips::ATOMIC_LOAD_OR_I8_POSTRA:
+ case Mips::ATOMIC_LOAD_OR_I16_POSTRA:
+ case Mips::ATOMIC_LOAD_XOR_I8_POSTRA:
+ case Mips::ATOMIC_LOAD_XOR_I16_POSTRA:
+ return expandAtomicBinOpSubword(MBB, MBBI, NMBB);
+ case Mips::ATOMIC_LOAD_ADD_I32_POSTRA:
+ case Mips::ATOMIC_LOAD_SUB_I32_POSTRA:
+ case Mips::ATOMIC_LOAD_AND_I32_POSTRA:
+ case Mips::ATOMIC_LOAD_OR_I32_POSTRA:
+ case Mips::ATOMIC_LOAD_XOR_I32_POSTRA:
+ case Mips::ATOMIC_LOAD_NAND_I32_POSTRA:
+ case Mips::ATOMIC_SWAP_I32_POSTRA:
+ return expandAtomicBinOp(MBB, MBBI, NMBB, 4);
+ case Mips::ATOMIC_LOAD_ADD_I64_POSTRA:
+ case Mips::ATOMIC_LOAD_SUB_I64_POSTRA:
+ case Mips::ATOMIC_LOAD_AND_I64_POSTRA:
+ case Mips::ATOMIC_LOAD_OR_I64_POSTRA:
+ case Mips::ATOMIC_LOAD_XOR_I64_POSTRA:
+ case Mips::ATOMIC_LOAD_NAND_I64_POSTRA:
+ case Mips::ATOMIC_SWAP_I64_POSTRA:
+ return expandAtomicBinOp(MBB, MBBI, NMBB, 8);
+ default:
+ return Modified;
+ }
+}
+
+bool MipsExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
+ bool Modified = false;
+
+ MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ while (MBBI != E) {
+ MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+ Modified |= expandMI(MBB, MBBI, NMBBI);
+ MBBI = NMBBI;
+ }
+
+ return Modified;
+}
+
+bool MipsExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
+ STI = &static_cast<const MipsSubtarget &>(MF.getSubtarget());
+ TII = STI->getInstrInfo();
+
+ bool Modified = false;
+ for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); MFI != E;
+ ++MFI)
+ Modified |= expandMBB(*MFI);
+
+ if (Modified)
+ MF.RenumberBlocks();
+
+ return Modified;
+}
+
+/// createMipsExpandPseudoPass - returns an instance of the pseudo instruction
+/// expansion pass.
+FunctionPass *llvm::createMipsExpandPseudoPass() {
+ return new MipsExpandPseudo();
+}
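The pass above only performs the expansion; it still has to be scheduled by the target. The sketch below is an illustration only and is not part of this patch: the hook name (addPreEmitPass) and its placement in MipsTargetMachine.cpp are assumptions, but it shows the general shape of wiring createMipsExpandPseudoPass into the post-register-allocation pipeline.

// Hypothetical sketch, not taken from this diff; the exact TargetPassConfig
// hook used by the Mips backend may differ. The only requirement is that the
// pass runs after register allocation, so spill/reload code can no longer be
// inserted between the ll and sc emitted by the expansion.
void MipsPassConfig::addPreEmitPass() {
  addPass(createMipsExpandPseudoPass());
}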
diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp
index 8bbac3ed7cfb..7b39507812ed 100644
--- a/lib/Target/Mips/MipsFastISel.cpp
+++ b/lib/Target/Mips/MipsFastISel.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file defines the MIPS-specific support for the FastISel class.
+/// This file defines the MIPS-specific support for the FastISel class.
/// Some of the target-specific code is generated by tablegen in the file
/// MipsGenFastISel.inc, which is #included here.
///
@@ -36,7 +36,6 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/ValueTypes.h"
@@ -64,9 +63,11 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
+#include <array>
#include <cassert>
#include <cstdint>
@@ -950,12 +951,9 @@ bool MipsFastISel::selectBranch(const Instruction *I) {
//
MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
- BI->getCondition();
// For now, just try the simplest case where it's fed by a compare.
if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
- unsigned CondReg = createResultReg(&Mips::GPR32RegClass);
- if (!emitCmp(CondReg, CI))
- return false;
+ unsigned CondReg = getRegForValue(CI);
BuildMI(*BrBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::BGTZ))
.addReg(CondReg)
.addMBB(TBB);
@@ -1000,11 +998,12 @@ bool MipsFastISel::selectFPExt(const Instruction *I) {
bool MipsFastISel::selectSelect(const Instruction *I) {
assert(isa<SelectInst>(I) && "Expected a select instruction.");
- DEBUG(dbgs() << "selectSelect\n");
+ LLVM_DEBUG(dbgs() << "selectSelect\n");
MVT VT;
if (!isTypeSupported(I->getType(), VT) || UnsupportedFPMode) {
- DEBUG(dbgs() << ".. .. gave up (!isTypeSupported || UnsupportedFPMode)\n");
+ LLVM_DEBUG(
+ dbgs() << ".. .. gave up (!isTypeSupported || UnsupportedFPMode)\n");
return false;
}
@@ -1287,32 +1286,32 @@ bool MipsFastISel::finishCall(CallLoweringInfo &CLI, MVT RetVT,
}
bool MipsFastISel::fastLowerArguments() {
- DEBUG(dbgs() << "fastLowerArguments\n");
+ LLVM_DEBUG(dbgs() << "fastLowerArguments\n");
if (!FuncInfo.CanLowerReturn) {
- DEBUG(dbgs() << ".. gave up (!CanLowerReturn)\n");
+ LLVM_DEBUG(dbgs() << ".. gave up (!CanLowerReturn)\n");
return false;
}
const Function *F = FuncInfo.Fn;
if (F->isVarArg()) {
- DEBUG(dbgs() << ".. gave up (varargs)\n");
+ LLVM_DEBUG(dbgs() << ".. gave up (varargs)\n");
return false;
}
CallingConv::ID CC = F->getCallingConv();
if (CC != CallingConv::C) {
- DEBUG(dbgs() << ".. gave up (calling convention is not C)\n");
+ LLVM_DEBUG(dbgs() << ".. gave up (calling convention is not C)\n");
return false;
}
- const ArrayRef<MCPhysReg> GPR32ArgRegs = {Mips::A0, Mips::A1, Mips::A2,
- Mips::A3};
- const ArrayRef<MCPhysReg> FGR32ArgRegs = {Mips::F12, Mips::F14};
- const ArrayRef<MCPhysReg> AFGR64ArgRegs = {Mips::D6, Mips::D7};
- ArrayRef<MCPhysReg>::iterator NextGPR32 = GPR32ArgRegs.begin();
- ArrayRef<MCPhysReg>::iterator NextFGR32 = FGR32ArgRegs.begin();
- ArrayRef<MCPhysReg>::iterator NextAFGR64 = AFGR64ArgRegs.begin();
+ std::array<MCPhysReg, 4> GPR32ArgRegs = {{Mips::A0, Mips::A1, Mips::A2,
+ Mips::A3}};
+ std::array<MCPhysReg, 2> FGR32ArgRegs = {{Mips::F12, Mips::F14}};
+ std::array<MCPhysReg, 2> AFGR64ArgRegs = {{Mips::D6, Mips::D7}};
+ auto NextGPR32 = GPR32ArgRegs.begin();
+ auto NextFGR32 = FGR32ArgRegs.begin();
+ auto NextAFGR64 = AFGR64ArgRegs.begin();
struct AllocatedReg {
const TargetRegisterClass *RC;
@@ -1328,21 +1327,21 @@ bool MipsFastISel::fastLowerArguments() {
if (FormalArg.hasAttribute(Attribute::InReg) ||
FormalArg.hasAttribute(Attribute::StructRet) ||
FormalArg.hasAttribute(Attribute::ByVal)) {
- DEBUG(dbgs() << ".. gave up (inreg, structret, byval)\n");
+ LLVM_DEBUG(dbgs() << ".. gave up (inreg, structret, byval)\n");
return false;
}
Type *ArgTy = FormalArg.getType();
if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy()) {
- DEBUG(dbgs() << ".. gave up (struct, array, or vector)\n");
+ LLVM_DEBUG(dbgs() << ".. gave up (struct, array, or vector)\n");
return false;
}
EVT ArgVT = TLI.getValueType(DL, ArgTy);
- DEBUG(dbgs() << ".. " << FormalArg.getArgNo() << ": "
- << ArgVT.getEVTString() << "\n");
+ LLVM_DEBUG(dbgs() << ".. " << FormalArg.getArgNo() << ": "
+ << ArgVT.getEVTString() << "\n");
if (!ArgVT.isSimple()) {
- DEBUG(dbgs() << ".. .. gave up (not a simple type)\n");
+ LLVM_DEBUG(dbgs() << ".. .. gave up (not a simple type)\n");
return false;
}
@@ -1354,16 +1353,16 @@ bool MipsFastISel::fastLowerArguments() {
!FormalArg.hasAttribute(Attribute::ZExt)) {
// It must be any extend, this shouldn't happen for clang-generated IR
// so just fall back on SelectionDAG.
- DEBUG(dbgs() << ".. .. gave up (i8/i16 arg is not extended)\n");
+ LLVM_DEBUG(dbgs() << ".. .. gave up (i8/i16 arg is not extended)\n");
return false;
}
if (NextGPR32 == GPR32ArgRegs.end()) {
- DEBUG(dbgs() << ".. .. gave up (ran out of GPR32 arguments)\n");
+ LLVM_DEBUG(dbgs() << ".. .. gave up (ran out of GPR32 arguments)\n");
return false;
}
- DEBUG(dbgs() << ".. .. GPR32(" << *NextGPR32 << ")\n");
+ LLVM_DEBUG(dbgs() << ".. .. GPR32(" << *NextGPR32 << ")\n");
Allocation.emplace_back(&Mips::GPR32RegClass, *NextGPR32++);
// Allocating any GPR32 prohibits further use of floating point arguments.
@@ -1374,16 +1373,16 @@ bool MipsFastISel::fastLowerArguments() {
case MVT::i32:
if (FormalArg.hasAttribute(Attribute::ZExt)) {
// The O32 ABI does not permit a zero-extended i32.
- DEBUG(dbgs() << ".. .. gave up (i32 arg is zero extended)\n");
+ LLVM_DEBUG(dbgs() << ".. .. gave up (i32 arg is zero extended)\n");
return false;
}
if (NextGPR32 == GPR32ArgRegs.end()) {
- DEBUG(dbgs() << ".. .. gave up (ran out of GPR32 arguments)\n");
+ LLVM_DEBUG(dbgs() << ".. .. gave up (ran out of GPR32 arguments)\n");
return false;
}
- DEBUG(dbgs() << ".. .. GPR32(" << *NextGPR32 << ")\n");
+ LLVM_DEBUG(dbgs() << ".. .. GPR32(" << *NextGPR32 << ")\n");
Allocation.emplace_back(&Mips::GPR32RegClass, *NextGPR32++);
// Allocating any GPR32 prohibits further use of floating point arguments.
@@ -1393,14 +1392,14 @@ bool MipsFastISel::fastLowerArguments() {
case MVT::f32:
if (UnsupportedFPMode) {
- DEBUG(dbgs() << ".. .. gave up (UnsupportedFPMode)\n");
+ LLVM_DEBUG(dbgs() << ".. .. gave up (UnsupportedFPMode)\n");
return false;
}
if (NextFGR32 == FGR32ArgRegs.end()) {
- DEBUG(dbgs() << ".. .. gave up (ran out of FGR32 arguments)\n");
+ LLVM_DEBUG(dbgs() << ".. .. gave up (ran out of FGR32 arguments)\n");
return false;
}
- DEBUG(dbgs() << ".. .. FGR32(" << *NextFGR32 << ")\n");
+ LLVM_DEBUG(dbgs() << ".. .. FGR32(" << *NextFGR32 << ")\n");
Allocation.emplace_back(&Mips::FGR32RegClass, *NextFGR32++);
// Allocating an FGR32 also allocates the super-register AFGR64, and
// ABI rules require us to skip the corresponding GPR32.
@@ -1412,14 +1411,14 @@ bool MipsFastISel::fastLowerArguments() {
case MVT::f64:
if (UnsupportedFPMode) {
- DEBUG(dbgs() << ".. .. gave up (UnsupportedFPMode)\n");
+ LLVM_DEBUG(dbgs() << ".. .. gave up (UnsupportedFPMode)\n");
return false;
}
if (NextAFGR64 == AFGR64ArgRegs.end()) {
- DEBUG(dbgs() << ".. .. gave up (ran out of AFGR64 arguments)\n");
+ LLVM_DEBUG(dbgs() << ".. .. gave up (ran out of AFGR64 arguments)\n");
return false;
}
- DEBUG(dbgs() << ".. .. AFGR64(" << *NextAFGR64 << ")\n");
+ LLVM_DEBUG(dbgs() << ".. .. AFGR64(" << *NextAFGR64 << ")\n");
Allocation.emplace_back(&Mips::AFGR64RegClass, *NextAFGR64++);
// Allocating an FGR32 also allocates the super-register AFGR64, and
// ABI rules require us to skip the corresponding GPR32 pair.
@@ -1432,7 +1431,7 @@ bool MipsFastISel::fastLowerArguments() {
break;
default:
- DEBUG(dbgs() << ".. .. gave up (unknown type)\n");
+ LLVM_DEBUG(dbgs() << ".. .. gave up (unknown type)\n");
return false;
}
}
@@ -1628,7 +1627,7 @@ bool MipsFastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
if (!MTI->getLength()->getType()->isIntegerTy(32))
return false;
const char *IntrMemName = isa<MemCpyInst>(II) ? "memcpy" : "memmove";
- return lowerCallTo(II, IntrMemName, II->getNumArgOperands() - 2);
+ return lowerCallTo(II, IntrMemName, II->getNumArgOperands() - 1);
}
case Intrinsic::memset: {
const MemSetInst *MSI = cast<MemSetInst>(II);
@@ -1637,7 +1636,7 @@ bool MipsFastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
return false;
if (!MSI->getLength()->getType()->isIntegerTy(32))
return false;
- return lowerCallTo(II, "memset", II->getNumArgOperands() - 2);
+ return lowerCallTo(II, "memset", II->getNumArgOperands() - 1);
}
}
return false;
@@ -1647,7 +1646,7 @@ bool MipsFastISel::selectRet(const Instruction *I) {
const Function &F = *I->getParent()->getParent();
const ReturnInst *Ret = cast<ReturnInst>(I);
- DEBUG(dbgs() << "selectRet\n");
+ LLVM_DEBUG(dbgs() << "selectRet\n");
if (!FuncInfo.CanLowerReturn)
return false;
@@ -1711,7 +1710,7 @@ bool MipsFastISel::selectRet(const Instruction *I) {
// Do not handle FGR64 returns for now.
if (RVVT == MVT::f64 && UnsupportedFPMode) {
- DEBUG(dbgs() << ".. .. gave up (UnsupportedFPMode\n");
+ LLVM_DEBUG(dbgs() << ".. .. gave up (UnsupportedFPMode\n");
return false;
}
@@ -2063,6 +2062,10 @@ unsigned MipsFastISel::getRegEnsuringSimpleIntegerWidening(const Value *V,
if (VReg == 0)
return 0;
MVT VMVT = TLI.getValueType(DL, V->getType(), true).getSimpleVT();
+
+ if (VMVT == MVT::i1)
+ return 0;
+
if ((VMVT == MVT::i8) || (VMVT == MVT::i16)) {
unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
if (!emitIntExt(VMVT, VReg, MVT::i32, TempReg, IsUnsigned))
diff --git a/lib/Target/Mips/MipsFrameLowering.h b/lib/Target/Mips/MipsFrameLowering.h
index 883c3267d51a..0ead56eddd2f 100644
--- a/lib/Target/Mips/MipsFrameLowering.h
+++ b/lib/Target/Mips/MipsFrameLowering.h
@@ -36,6 +36,10 @@ public:
bool isFPCloseToIncomingSP() const override { return false; }
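+ // Enable shrink-wrapping: allow the prologue/epilogue to be placed at the
+ // save/restore points computed by the ShrinkWrap analysis rather than at
+ // function entry/exit.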
+ bool enableShrinkWrapping(const MachineFunction &MF) const override {
+ return true;
+ }
+
MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF,
MachineBasicBlock &MBB,
diff --git a/lib/Target/Mips/MipsHazardSchedule.cpp b/lib/Target/Mips/MipsHazardSchedule.cpp
deleted file mode 100644
index da67c1bcea99..000000000000
--- a/lib/Target/Mips/MipsHazardSchedule.cpp
+++ /dev/null
@@ -1,163 +0,0 @@
-//===- MipsHazardSchedule.cpp - Workaround pipeline hazards ---------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-/// \file
-/// This pass is used to workaround certain pipeline hazards. For now, this
-/// covers compact branch hazards. In future this pass can be extended to other
-/// pipeline hazards, such as various MIPS1 hazards, processor errata that
-/// require instruction reorganization, etc.
-///
-/// This pass has to run after the delay slot filler as that pass can introduce
-/// pipeline hazards, hence the existing hazard recognizer is not suitable.
-///
-/// Hazards handled: forbidden slots for MIPSR6.
-///
-/// A forbidden slot hazard occurs when a compact branch instruction is executed
-/// and the adjacent instruction in memory is a control transfer instruction
-/// such as a branch or jump, ERET, ERETNC, DERET, WAIT and PAUSE.
-///
-/// For example:
-///
-/// 0x8004 bnec a1,v0,<P+0x18>
-/// 0x8008 beqc a1,a2,<P+0x54>
-///
-/// In such cases, the processor is required to signal a Reserved Instruction
-/// exception.
-///
-/// Here, if the instruction at 0x8004 is executed, the processor will raise an
-/// exception as there is a control transfer instruction at 0x8008.
-///
-/// There are two sources of forbidden slot hazards:
-///
-/// A) A previous pass has created a compact branch directly.
-/// B) Transforming a delay slot branch into compact branch. This case can be
-/// difficult to process as lookahead for hazards is insufficient, as
-/// backwards delay slot filling can also produce hazards in previously
-/// processed instructions.
-///
-//===----------------------------------------------------------------------===//
-
-#include "Mips.h"
-#include "MipsInstrInfo.h"
-#include "MipsSubtarget.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include <algorithm>
-#include <iterator>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "mips-hazard-schedule"
-
-STATISTIC(NumInsertedNops, "Number of nops inserted");
-
-namespace {
-
-using Iter = MachineBasicBlock::iterator;
-using ReverseIter = MachineBasicBlock::reverse_iterator;
-
-class MipsHazardSchedule : public MachineFunctionPass {
-public:
- MipsHazardSchedule() : MachineFunctionPass(ID) {}
-
- StringRef getPassName() const override { return "Mips Hazard Schedule"; }
-
- bool runOnMachineFunction(MachineFunction &F) override;
-
- MachineFunctionProperties getRequiredProperties() const override {
- return MachineFunctionProperties().set(
- MachineFunctionProperties::Property::NoVRegs);
- }
-
-private:
- static char ID;
-};
-
-} // end of anonymous namespace
-
-char MipsHazardSchedule::ID = 0;
-
-/// Returns a pass that clears pipeline hazards.
-FunctionPass *llvm::createMipsHazardSchedule() {
- return new MipsHazardSchedule();
-}
-
-// Find the next real instruction from the current position in current basic
-// block.
-static Iter getNextMachineInstrInBB(Iter Position) {
- Iter I = Position, E = Position->getParent()->end();
- I = std::find_if_not(I, E,
- [](const Iter &Insn) { return Insn->isTransient(); });
-
- return I;
-}
-
-// Find the next real instruction from the current position, looking through
-// basic block boundaries.
-static std::pair<Iter, bool> getNextMachineInstr(Iter Position, MachineBasicBlock * Parent) {
- if (Position == Parent->end()) {
- do {
- MachineBasicBlock *Succ = Parent->getNextNode();
- if (Succ != nullptr && Parent->isSuccessor(Succ)) {
- Position = Succ->begin();
- Parent = Succ;
- } else {
- return std::make_pair(Position, true);
- }
- } while (Parent->empty());
- }
-
- Iter Instr = getNextMachineInstrInBB(Position);
- if (Instr == Parent->end()) {
- return getNextMachineInstr(Instr, Parent);
- }
- return std::make_pair(Instr, false);
-}
-
-bool MipsHazardSchedule::runOnMachineFunction(MachineFunction &MF) {
-
- const MipsSubtarget *STI =
- &static_cast<const MipsSubtarget &>(MF.getSubtarget());
-
- // Forbidden slot hazards are only defined for MIPSR6 but not microMIPSR6.
- if (!STI->hasMips32r6() || STI->inMicroMipsMode())
- return false;
-
- bool Changed = false;
- const MipsInstrInfo *TII = STI->getInstrInfo();
-
- for (MachineFunction::iterator FI = MF.begin(); FI != MF.end(); ++FI) {
- for (Iter I = FI->begin(); I != FI->end(); ++I) {
-
- // Forbidden slot hazard handling. Use lookahead over state.
- if (!TII->HasForbiddenSlot(*I))
- continue;
-
- Iter Inst;
- bool LastInstInFunction =
- std::next(I) == FI->end() && std::next(FI) == MF.end();
- if (!LastInstInFunction) {
- std::pair<Iter, bool> Res = getNextMachineInstr(std::next(I), &*FI);
- LastInstInFunction |= Res.second;
- Inst = Res.first;
- }
-
- if (LastInstInFunction || !TII->SafeInForbiddenSlot(*Inst)) {
- Changed = true;
- MIBundleBuilder(&*I)
- .append(BuildMI(MF, I->getDebugLoc(), TII->get(Mips::NOP)));
- NumInsertedNops++;
- }
- }
- }
- return Changed;
-}
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp
index 0e1173f1c617..f99f3a1b3e0a 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/StackProtector.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
@@ -46,6 +47,13 @@ using namespace llvm;
// instructions for SelectionDAG operations.
//===----------------------------------------------------------------------===//
+void MipsDAGToDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
+ // There are multiple MipsDAGToDAGISel instances added to the pass pipeline.
+ // We need to preserve StackProtector for the next one.
+ AU.addPreserved<StackProtector>();
+ SelectionDAGISel::getAnalysisUsage(AU);
+}
+
bool MipsDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
Subtarget = &static_cast<const MipsSubtarget &>(MF.getSubtarget());
bool Ret = SelectionDAGISel::runOnMachineFunction(MF);
@@ -215,12 +223,9 @@ bool MipsDAGToDAGISel::selectVSplatMaskR(SDValue N, SDValue &Imm) const {
void MipsDAGToDAGISel::Select(SDNode *Node) {
unsigned Opcode = Node->getOpcode();
- // Dump information about the Node being selected
- DEBUG(errs() << "Selecting: "; Node->dump(CurDAG); errs() << "\n");
-
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
- DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+ LLVM_DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
Node->setNodeId(-1);
return;
}
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.h b/lib/Target/Mips/MipsISelDAGToDAG.h
index 20bdd4aa8f5f..09003459d180 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.h
+++ b/lib/Target/Mips/MipsISelDAGToDAG.h
@@ -41,6 +41,8 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
protected:
SDNode *getGlobalBaseReg();
@@ -93,34 +95,34 @@ private:
virtual bool selectAddr16(SDValue Addr, SDValue &Base, SDValue &Offset);
virtual bool selectAddr16SP(SDValue Addr, SDValue &Base, SDValue &Offset);
- /// \brief Select constant vector splats.
+ /// Select constant vector splats.
virtual bool selectVSplat(SDNode *N, APInt &Imm,
unsigned MinSizeInBits) const;
- /// \brief Select constant vector splats whose value fits in a uimm1.
+ /// Select constant vector splats whose value fits in a uimm1.
virtual bool selectVSplatUimm1(SDValue N, SDValue &Imm) const;
- /// \brief Select constant vector splats whose value fits in a uimm2.
+ /// Select constant vector splats whose value fits in a uimm2.
virtual bool selectVSplatUimm2(SDValue N, SDValue &Imm) const;
- /// \brief Select constant vector splats whose value fits in a uimm3.
+ /// Select constant vector splats whose value fits in a uimm3.
virtual bool selectVSplatUimm3(SDValue N, SDValue &Imm) const;
- /// \brief Select constant vector splats whose value fits in a uimm4.
+ /// Select constant vector splats whose value fits in a uimm4.
virtual bool selectVSplatUimm4(SDValue N, SDValue &Imm) const;
- /// \brief Select constant vector splats whose value fits in a uimm5.
+ /// Select constant vector splats whose value fits in a uimm5.
virtual bool selectVSplatUimm5(SDValue N, SDValue &Imm) const;
- /// \brief Select constant vector splats whose value fits in a uimm6.
+ /// Select constant vector splats whose value fits in a uimm6.
virtual bool selectVSplatUimm6(SDValue N, SDValue &Imm) const;
- /// \brief Select constant vector splats whose value fits in a uimm8.
+ /// Select constant vector splats whose value fits in a uimm8.
virtual bool selectVSplatUimm8(SDValue N, SDValue &Imm) const;
- /// \brief Select constant vector splats whose value fits in a simm5.
+ /// Select constant vector splats whose value fits in a simm5.
virtual bool selectVSplatSimm5(SDValue N, SDValue &Imm) const;
- /// \brief Select constant vector splats whose value is a power of 2.
+ /// Select constant vector splats whose value is a power of 2.
virtual bool selectVSplatUimmPow2(SDValue N, SDValue &Imm) const;
- /// \brief Select constant vector splats whose value is the inverse of a
+ /// Select constant vector splats whose value is the inverse of a
/// power of 2.
virtual bool selectVSplatUimmInvPow2(SDValue N, SDValue &Imm) const;
- /// \brief Select constant vector splats whose value is a run of set bits
+ /// Select constant vector splats whose value is a run of set bits
/// ending at the most significant bit
virtual bool selectVSplatMaskL(SDValue N, SDValue &Imm) const;
- /// \brief Select constant vector splats whose value is a run of set bits
+ /// Select constant vector splats whose value is a run of set bits
/// starting at bit zero.
virtual bool selectVSplatMaskR(SDValue N, SDValue &Imm) const;
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index 6448fd917560..9ffc38356b76 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -41,7 +41,6 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -64,6 +63,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
@@ -110,12 +110,6 @@ static bool isShiftedMask(uint64_t I, uint64_t &Pos, uint64_t &Size) {
// The MIPS MSA ABI passes vector arguments in the integer register set.
// The number of integer registers used is dependent on the ABI used.
-MVT MipsTargetLowering::getRegisterTypeForCallingConv(MVT VT) const {
- if (VT.isVector() && Subtarget.hasMSA())
- return Subtarget.isABI_O32() ? MVT::i32 : MVT::i64;
- return MipsTargetLowering::getRegisterType(VT);
-}
-
MVT MipsTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
EVT VT) const {
if (VT.isVector()) {
@@ -195,11 +189,13 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
case MipsISD::Hi: return "MipsISD::Hi";
case MipsISD::Lo: return "MipsISD::Lo";
case MipsISD::GotHi: return "MipsISD::GotHi";
+ case MipsISD::TlsHi: return "MipsISD::TlsHi";
case MipsISD::GPRel: return "MipsISD::GPRel";
case MipsISD::ThreadPointer: return "MipsISD::ThreadPointer";
case MipsISD::Ret: return "MipsISD::Ret";
case MipsISD::ERet: return "MipsISD::ERet";
case MipsISD::EH_RETURN: return "MipsISD::EH_RETURN";
+ case MipsISD::FMS: return "MipsISD::FMS";
case MipsISD::FPBrcond: return "MipsISD::FPBrcond";
case MipsISD::FPCmp: return "MipsISD::FPCmp";
case MipsISD::FSELECT: return "MipsISD::FSELECT";
@@ -286,10 +282,6 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
case MipsISD::VCLE_U: return "MipsISD::VCLE_U";
case MipsISD::VCLT_S: return "MipsISD::VCLT_S";
case MipsISD::VCLT_U: return "MipsISD::VCLT_U";
- case MipsISD::VSMAX: return "MipsISD::VSMAX";
- case MipsISD::VSMIN: return "MipsISD::VSMIN";
- case MipsISD::VUMAX: return "MipsISD::VUMAX";
- case MipsISD::VUMIN: return "MipsISD::VUMIN";
case MipsISD::VEXTRACT_SEXT_ELT: return "MipsISD::VEXTRACT_SEXT_ELT";
case MipsISD::VEXTRACT_ZEXT_ELT: return "MipsISD::VEXTRACT_ZEXT_ELT";
case MipsISD::VNOR: return "MipsISD::VNOR";
@@ -402,18 +394,6 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
setOperationAction(ISD::UDIV, MVT::i64, Expand);
setOperationAction(ISD::UREM, MVT::i64, Expand);
- if (!(Subtarget.hasDSP() && Subtarget.hasMips32r2())) {
- setOperationAction(ISD::ADDC, MVT::i32, Expand);
- setOperationAction(ISD::ADDE, MVT::i32, Expand);
- }
-
- setOperationAction(ISD::ADDC, MVT::i64, Expand);
- setOperationAction(ISD::ADDE, MVT::i64, Expand);
- setOperationAction(ISD::SUBC, MVT::i32, Expand);
- setOperationAction(ISD::SUBE, MVT::i32, Expand);
- setOperationAction(ISD::SUBC, MVT::i64, Expand);
- setOperationAction(ISD::SUBE, MVT::i64, Expand);
-
// Operations not directly supported by Mips.
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
setOperationAction(ISD::BR_CC, MVT::f64, Expand);
@@ -761,7 +741,7 @@ static SDValue performSELECTCombine(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::ADD, DL, SetCC.getValueType(), SetCC, True);
}
- // Couldn't optimize.
+ // Could not optimize.
return SDValue();
}
@@ -1301,76 +1281,76 @@ MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
default:
llvm_unreachable("Unexpected instr type to insert");
case Mips::ATOMIC_LOAD_ADD_I8:
- return emitAtomicBinaryPartword(MI, BB, 1, Mips::ADDu);
+ return emitAtomicBinaryPartword(MI, BB, 1);
case Mips::ATOMIC_LOAD_ADD_I16:
- return emitAtomicBinaryPartword(MI, BB, 2, Mips::ADDu);
+ return emitAtomicBinaryPartword(MI, BB, 2);
case Mips::ATOMIC_LOAD_ADD_I32:
- return emitAtomicBinary(MI, BB, 4, Mips::ADDu);
+ return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_ADD_I64:
- return emitAtomicBinary(MI, BB, 8, Mips::DADDu);
+ return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_AND_I8:
- return emitAtomicBinaryPartword(MI, BB, 1, Mips::AND);
+ return emitAtomicBinaryPartword(MI, BB, 1);
case Mips::ATOMIC_LOAD_AND_I16:
- return emitAtomicBinaryPartword(MI, BB, 2, Mips::AND);
+ return emitAtomicBinaryPartword(MI, BB, 2);
case Mips::ATOMIC_LOAD_AND_I32:
- return emitAtomicBinary(MI, BB, 4, Mips::AND);
+ return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_AND_I64:
- return emitAtomicBinary(MI, BB, 8, Mips::AND64);
+ return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_OR_I8:
- return emitAtomicBinaryPartword(MI, BB, 1, Mips::OR);
+ return emitAtomicBinaryPartword(MI, BB, 1);
case Mips::ATOMIC_LOAD_OR_I16:
- return emitAtomicBinaryPartword(MI, BB, 2, Mips::OR);
+ return emitAtomicBinaryPartword(MI, BB, 2);
case Mips::ATOMIC_LOAD_OR_I32:
- return emitAtomicBinary(MI, BB, 4, Mips::OR);
+ return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_OR_I64:
- return emitAtomicBinary(MI, BB, 8, Mips::OR64);
+ return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_XOR_I8:
- return emitAtomicBinaryPartword(MI, BB, 1, Mips::XOR);
+ return emitAtomicBinaryPartword(MI, BB, 1);
case Mips::ATOMIC_LOAD_XOR_I16:
- return emitAtomicBinaryPartword(MI, BB, 2, Mips::XOR);
+ return emitAtomicBinaryPartword(MI, BB, 2);
case Mips::ATOMIC_LOAD_XOR_I32:
- return emitAtomicBinary(MI, BB, 4, Mips::XOR);
+ return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_XOR_I64:
- return emitAtomicBinary(MI, BB, 8, Mips::XOR64);
+ return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_NAND_I8:
- return emitAtomicBinaryPartword(MI, BB, 1, 0, true);
+ return emitAtomicBinaryPartword(MI, BB, 1);
case Mips::ATOMIC_LOAD_NAND_I16:
- return emitAtomicBinaryPartword(MI, BB, 2, 0, true);
+ return emitAtomicBinaryPartword(MI, BB, 2);
case Mips::ATOMIC_LOAD_NAND_I32:
- return emitAtomicBinary(MI, BB, 4, 0, true);
+ return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_NAND_I64:
- return emitAtomicBinary(MI, BB, 8, 0, true);
+ return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_SUB_I8:
- return emitAtomicBinaryPartword(MI, BB, 1, Mips::SUBu);
+ return emitAtomicBinaryPartword(MI, BB, 1);
case Mips::ATOMIC_LOAD_SUB_I16:
- return emitAtomicBinaryPartword(MI, BB, 2, Mips::SUBu);
+ return emitAtomicBinaryPartword(MI, BB, 2);
case Mips::ATOMIC_LOAD_SUB_I32:
- return emitAtomicBinary(MI, BB, 4, Mips::SUBu);
+ return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_SUB_I64:
- return emitAtomicBinary(MI, BB, 8, Mips::DSUBu);
+ return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_SWAP_I8:
- return emitAtomicBinaryPartword(MI, BB, 1, 0);
+ return emitAtomicBinaryPartword(MI, BB, 1);
case Mips::ATOMIC_SWAP_I16:
- return emitAtomicBinaryPartword(MI, BB, 2, 0);
+ return emitAtomicBinaryPartword(MI, BB, 2);
case Mips::ATOMIC_SWAP_I32:
- return emitAtomicBinary(MI, BB, 4, 0);
+ return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_SWAP_I64:
- return emitAtomicBinary(MI, BB, 8, 0);
+ return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_CMP_SWAP_I8:
return emitAtomicCmpSwapPartword(MI, BB, 1);
case Mips::ATOMIC_CMP_SWAP_I16:
return emitAtomicCmpSwapPartword(MI, BB, 2);
case Mips::ATOMIC_CMP_SWAP_I32:
- return emitAtomicCmpSwap(MI, BB, 4);
+ return emitAtomicCmpSwap(MI, BB);
case Mips::ATOMIC_CMP_SWAP_I64:
- return emitAtomicCmpSwap(MI, BB, 8);
+ return emitAtomicCmpSwap(MI, BB);
case Mips::PseudoSDIV:
case Mips::PseudoUDIV:
case Mips::DIV:
@@ -1419,99 +1399,121 @@ MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
// This function also handles Mips::ATOMIC_SWAP_I32 (when BinOpcode == 0), and
// Mips::ATOMIC_LOAD_NAND_I32 (when Nand == true)
-MachineBasicBlock *MipsTargetLowering::emitAtomicBinary(MachineInstr &MI,
- MachineBasicBlock *BB,
- unsigned Size,
- unsigned BinOpcode,
- bool Nand) const {
- assert((Size == 4 || Size == 8) && "Unsupported size for EmitAtomicBinary.");
+MachineBasicBlock *
+MipsTargetLowering::emitAtomicBinary(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &RegInfo = MF->getRegInfo();
- const TargetRegisterClass *RC = getRegClassFor(MVT::getIntegerVT(Size * 8));
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- const bool ArePtrs64bit = ABI.ArePtrs64bit();
DebugLoc DL = MI.getDebugLoc();
- unsigned LL, SC, AND, NOR, ZERO, BEQ;
-
- if (Size == 4) {
- if (isMicroMips) {
- LL = Mips::LL_MM;
- SC = Mips::SC_MM;
- } else {
- LL = Subtarget.hasMips32r6()
- ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
- : (ArePtrs64bit ? Mips::LL64 : Mips::LL);
- SC = Subtarget.hasMips32r6()
- ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
- : (ArePtrs64bit ? Mips::SC64 : Mips::SC);
- }
- AND = Mips::AND;
- NOR = Mips::NOR;
- ZERO = Mips::ZERO;
- BEQ = Mips::BEQ;
- } else {
- LL = Subtarget.hasMips64r6() ? Mips::LLD_R6 : Mips::LLD;
- SC = Subtarget.hasMips64r6() ? Mips::SCD_R6 : Mips::SCD;
- AND = Mips::AND64;
- NOR = Mips::NOR64;
- ZERO = Mips::ZERO_64;
- BEQ = Mips::BEQ64;
+ unsigned AtomicOp;
+ switch (MI.getOpcode()) {
+ case Mips::ATOMIC_LOAD_ADD_I32:
+ AtomicOp = Mips::ATOMIC_LOAD_ADD_I32_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_SUB_I32:
+ AtomicOp = Mips::ATOMIC_LOAD_SUB_I32_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_AND_I32:
+ AtomicOp = Mips::ATOMIC_LOAD_AND_I32_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_OR_I32:
+ AtomicOp = Mips::ATOMIC_LOAD_OR_I32_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_XOR_I32:
+ AtomicOp = Mips::ATOMIC_LOAD_XOR_I32_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_NAND_I32:
+ AtomicOp = Mips::ATOMIC_LOAD_NAND_I32_POSTRA;
+ break;
+ case Mips::ATOMIC_SWAP_I32:
+ AtomicOp = Mips::ATOMIC_SWAP_I32_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_ADD_I64:
+ AtomicOp = Mips::ATOMIC_LOAD_ADD_I64_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_SUB_I64:
+ AtomicOp = Mips::ATOMIC_LOAD_SUB_I64_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_AND_I64:
+ AtomicOp = Mips::ATOMIC_LOAD_AND_I64_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_OR_I64:
+ AtomicOp = Mips::ATOMIC_LOAD_OR_I64_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_XOR_I64:
+ AtomicOp = Mips::ATOMIC_LOAD_XOR_I64_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_NAND_I64:
+ AtomicOp = Mips::ATOMIC_LOAD_NAND_I64_POSTRA;
+ break;
+ case Mips::ATOMIC_SWAP_I64:
+ AtomicOp = Mips::ATOMIC_SWAP_I64_POSTRA;
+ break;
+ default:
+ llvm_unreachable("Unknown pseudo atomic for replacement!");
}
unsigned OldVal = MI.getOperand(0).getReg();
unsigned Ptr = MI.getOperand(1).getReg();
unsigned Incr = MI.getOperand(2).getReg();
+ unsigned Scratch = RegInfo.createVirtualRegister(RegInfo.getRegClass(OldVal));
+
+ MachineBasicBlock::iterator II(MI);
+
+ // The scratch registers here with the EarlyClobber | Define | Implicit
+ // flags are used to persuade the register allocator and the machine
+ // verifier to accept the usage of this register. This has to be a real
+ // register which has an UNDEF value but is dead after the instruction,
+ // and it must be unique among the registers chosen for the instruction.
+
+ // The EarlyClobber flag has the semantic property that the operand it is
+ // attached to is clobbered before the rest of the inputs are read. Hence it
+ // must be unique among the operands to the instruction.
+ // The Define flag is needed to convince the machine verifier that an Undef
+ // value isn't a problem.
+ // The Dead flag is needed as the value in scratch isn't used by any other
+ // instruction. Kill isn't used as Dead is more precise.
+ // The implicit flag is here due to the interaction between the other flags
+ // and the machine verifier.
+
+ // For correctness purposes, a new pseudo is introduced here. We need this
+ // new pseudo so that FastRegisterAllocator does not see an ll/sc sequence
+ // that is spread over more than one basic block. A register allocator (or
+ // any codegen, in fact) that introduces a store can violate the
+ // expectations of the hardware.
+ //
+ // An atomic read-modify-write sequence starts with a linked load
+ // instruction and ends with a store conditional instruction. The atomic
+ // read-modify-write sequence fails if any of the following conditions
+ // occur between the execution of ll and sc:
+ // * A coherent store is completed by another process or coherent I/O
+ // module into the block of synchronizable physical memory containing
+ // the word. The size and alignment of the block is
+ // implementation-dependent.
+ // * A coherent store is executed between an LL and SC sequence on the
+ //     same processor to the block of synchronizable physical memory
+ // containing the word.
+ //
- unsigned StoreVal = RegInfo.createVirtualRegister(RC);
- unsigned AndRes = RegInfo.createVirtualRegister(RC);
- unsigned Success = RegInfo.createVirtualRegister(RC);
+ unsigned PtrCopy = RegInfo.createVirtualRegister(RegInfo.getRegClass(Ptr));
+ unsigned IncrCopy = RegInfo.createVirtualRegister(RegInfo.getRegClass(Incr));
- // insert new blocks after the current block
- const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineFunction::iterator It = ++BB->getIterator();
- MF->insert(It, loopMBB);
- MF->insert(It, exitMBB);
+ BuildMI(*BB, II, DL, TII->get(Mips::COPY), IncrCopy).addReg(Incr);
+ BuildMI(*BB, II, DL, TII->get(Mips::COPY), PtrCopy).addReg(Ptr);
- // Transfer the remainder of BB and its successor edges to exitMBB.
- exitMBB->splice(exitMBB->begin(), BB,
- std::next(MachineBasicBlock::iterator(MI)), BB->end());
- exitMBB->transferSuccessorsAndUpdatePHIs(BB);
-
- // thisMBB:
- // ...
- // fallthrough --> loopMBB
- BB->addSuccessor(loopMBB);
- loopMBB->addSuccessor(loopMBB);
- loopMBB->addSuccessor(exitMBB);
-
- // loopMBB:
- // ll oldval, 0(ptr)
- // <binop> storeval, oldval, incr
- // sc success, storeval, 0(ptr)
- // beq success, $0, loopMBB
- BB = loopMBB;
- BuildMI(BB, DL, TII->get(LL), OldVal).addReg(Ptr).addImm(0);
- if (Nand) {
- // and andres, oldval, incr
- // nor storeval, $0, andres
- BuildMI(BB, DL, TII->get(AND), AndRes).addReg(OldVal).addReg(Incr);
- BuildMI(BB, DL, TII->get(NOR), StoreVal).addReg(ZERO).addReg(AndRes);
- } else if (BinOpcode) {
- // <binop> storeval, oldval, incr
- BuildMI(BB, DL, TII->get(BinOpcode), StoreVal).addReg(OldVal).addReg(Incr);
- } else {
- StoreVal = Incr;
- }
- BuildMI(BB, DL, TII->get(SC), Success).addReg(StoreVal).addReg(Ptr).addImm(0);
- BuildMI(BB, DL, TII->get(BEQ)).addReg(Success).addReg(ZERO).addMBB(loopMBB);
+ BuildMI(*BB, II, DL, TII->get(AtomicOp))
+ .addReg(OldVal, RegState::Define | RegState::EarlyClobber)
+ .addReg(PtrCopy)
+ .addReg(IncrCopy)
+ .addReg(Scratch, RegState::Define | RegState::EarlyClobber |
+ RegState::Implicit | RegState::Dead);
- MI.eraseFromParent(); // The instruction is gone now.
+ MI.eraseFromParent();
- return exitMBB;
+ return BB;
}
MachineBasicBlock *MipsTargetLowering::emitSignExtendToI32InReg(
@@ -1545,8 +1547,7 @@ MachineBasicBlock *MipsTargetLowering::emitSignExtendToI32InReg(
}
MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword(
- MachineInstr &MI, MachineBasicBlock *BB, unsigned Size, unsigned BinOpcode,
- bool Nand) const {
+ MachineInstr &MI, MachineBasicBlock *BB, unsigned Size) const {
assert((Size == 1 || Size == 2) &&
"Unsupported size for EmitAtomicBinaryPartial.");
@@ -1567,39 +1568,66 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword(
unsigned ShiftAmt = RegInfo.createVirtualRegister(RC);
unsigned Mask = RegInfo.createVirtualRegister(RC);
unsigned Mask2 = RegInfo.createVirtualRegister(RC);
- unsigned NewVal = RegInfo.createVirtualRegister(RC);
- unsigned OldVal = RegInfo.createVirtualRegister(RC);
unsigned Incr2 = RegInfo.createVirtualRegister(RC);
unsigned MaskLSB2 = RegInfo.createVirtualRegister(RCp);
unsigned PtrLSB2 = RegInfo.createVirtualRegister(RC);
unsigned MaskUpper = RegInfo.createVirtualRegister(RC);
- unsigned AndRes = RegInfo.createVirtualRegister(RC);
- unsigned BinOpRes = RegInfo.createVirtualRegister(RC);
- unsigned MaskedOldVal0 = RegInfo.createVirtualRegister(RC);
- unsigned StoreVal = RegInfo.createVirtualRegister(RC);
- unsigned MaskedOldVal1 = RegInfo.createVirtualRegister(RC);
- unsigned SrlRes = RegInfo.createVirtualRegister(RC);
- unsigned Success = RegInfo.createVirtualRegister(RC);
-
- unsigned LL, SC;
- if (isMicroMips) {
- LL = Mips::LL_MM;
- SC = Mips::SC_MM;
- } else {
- LL = Subtarget.hasMips32r6() ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
- : (ArePtrs64bit ? Mips::LL64 : Mips::LL);
- SC = Subtarget.hasMips32r6() ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
- : (ArePtrs64bit ? Mips::SC64 : Mips::SC);
+ unsigned Scratch = RegInfo.createVirtualRegister(RC);
+ unsigned Scratch2 = RegInfo.createVirtualRegister(RC);
+ unsigned Scratch3 = RegInfo.createVirtualRegister(RC);
+
+ unsigned AtomicOp = 0;
+ switch (MI.getOpcode()) {
+ case Mips::ATOMIC_LOAD_NAND_I8:
+ AtomicOp = Mips::ATOMIC_LOAD_NAND_I8_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_NAND_I16:
+ AtomicOp = Mips::ATOMIC_LOAD_NAND_I16_POSTRA;
+ break;
+ case Mips::ATOMIC_SWAP_I8:
+ AtomicOp = Mips::ATOMIC_SWAP_I8_POSTRA;
+ break;
+ case Mips::ATOMIC_SWAP_I16:
+ AtomicOp = Mips::ATOMIC_SWAP_I16_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_ADD_I8:
+ AtomicOp = Mips::ATOMIC_LOAD_ADD_I8_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_ADD_I16:
+ AtomicOp = Mips::ATOMIC_LOAD_ADD_I16_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_SUB_I8:
+ AtomicOp = Mips::ATOMIC_LOAD_SUB_I8_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_SUB_I16:
+ AtomicOp = Mips::ATOMIC_LOAD_SUB_I16_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_AND_I8:
+ AtomicOp = Mips::ATOMIC_LOAD_AND_I8_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_AND_I16:
+ AtomicOp = Mips::ATOMIC_LOAD_AND_I16_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_OR_I8:
+ AtomicOp = Mips::ATOMIC_LOAD_OR_I8_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_OR_I16:
+ AtomicOp = Mips::ATOMIC_LOAD_OR_I16_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_XOR_I8:
+ AtomicOp = Mips::ATOMIC_LOAD_XOR_I8_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_XOR_I16:
+ AtomicOp = Mips::ATOMIC_LOAD_XOR_I16_POSTRA;
+ break;
+ default:
+ llvm_unreachable("Unknown subword atomic pseudo for expansion!");
}
// insert new blocks after the current block
const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator It = ++BB->getIterator();
- MF->insert(It, loopMBB);
- MF->insert(It, sinkMBB);
MF->insert(It, exitMBB);
// Transfer the remainder of BB and its successor edges to exitMBB.
@@ -1607,10 +1635,7 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword(
std::next(MachineBasicBlock::iterator(MI)), BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
- BB->addSuccessor(loopMBB);
- loopMBB->addSuccessor(loopMBB);
- loopMBB->addSuccessor(sinkMBB);
- sinkMBB->addSuccessor(exitMBB);
+ BB->addSuccessor(exitMBB, BranchProbability::getOne());
// thisMBB:
// addiu masklsb2,$0,-4 # 0xfffffffc
@@ -1644,159 +1669,92 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword(
BuildMI(BB, DL, TII->get(Mips::NOR), Mask2).addReg(Mips::ZERO).addReg(Mask);
BuildMI(BB, DL, TII->get(Mips::SLLV), Incr2).addReg(Incr).addReg(ShiftAmt);
- // atomic.load.binop
- // loopMBB:
- // ll oldval,0(alignedaddr)
- // binop binopres,oldval,incr2
- // and newval,binopres,mask
- // and maskedoldval0,oldval,mask2
- // or storeval,maskedoldval0,newval
- // sc success,storeval,0(alignedaddr)
- // beq success,$0,loopMBB
-
- // atomic.swap
- // loopMBB:
- // ll oldval,0(alignedaddr)
- // and newval,incr2,mask
- // and maskedoldval0,oldval,mask2
- // or storeval,maskedoldval0,newval
- // sc success,storeval,0(alignedaddr)
- // beq success,$0,loopMBB
-
- BB = loopMBB;
- BuildMI(BB, DL, TII->get(LL), OldVal).addReg(AlignedAddr).addImm(0);
- if (Nand) {
- // and andres, oldval, incr2
- // nor binopres, $0, andres
- // and newval, binopres, mask
- BuildMI(BB, DL, TII->get(Mips::AND), AndRes).addReg(OldVal).addReg(Incr2);
- BuildMI(BB, DL, TII->get(Mips::NOR), BinOpRes)
- .addReg(Mips::ZERO).addReg(AndRes);
- BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(BinOpRes).addReg(Mask);
- } else if (BinOpcode) {
- // <binop> binopres, oldval, incr2
- // and newval, binopres, mask
- BuildMI(BB, DL, TII->get(BinOpcode), BinOpRes).addReg(OldVal).addReg(Incr2);
- BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(BinOpRes).addReg(Mask);
- } else { // atomic.swap
- // and newval, incr2, mask
- BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(Incr2).addReg(Mask);
- }
-
- BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal0)
- .addReg(OldVal).addReg(Mask2);
- BuildMI(BB, DL, TII->get(Mips::OR), StoreVal)
- .addReg(MaskedOldVal0).addReg(NewVal);
- BuildMI(BB, DL, TII->get(SC), Success)
- .addReg(StoreVal).addReg(AlignedAddr).addImm(0);
- BuildMI(BB, DL, TII->get(Mips::BEQ))
- .addReg(Success).addReg(Mips::ZERO).addMBB(loopMBB);
- // sinkMBB:
- // and maskedoldval1,oldval,mask
- // srl srlres,maskedoldval1,shiftamt
- // sign_extend dest,srlres
- BB = sinkMBB;
-
- BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal1)
- .addReg(OldVal).addReg(Mask);
- BuildMI(BB, DL, TII->get(Mips::SRLV), SrlRes)
- .addReg(MaskedOldVal1).addReg(ShiftAmt);
- BB = emitSignExtendToI32InReg(MI, BB, Size, Dest, SrlRes);
+ // The purpose of the flags on the scratch registers is explained in
+ // emitAtomicBinary. In summary, we need a scratch register that is going to
+ // be undef and that is unique among the registers chosen for the instruction.
+
+ BuildMI(BB, DL, TII->get(AtomicOp))
+ .addReg(Dest, RegState::Define | RegState::EarlyClobber)
+ .addReg(AlignedAddr)
+ .addReg(Incr2)
+ .addReg(Mask)
+ .addReg(Mask2)
+ .addReg(ShiftAmt)
+ .addReg(Scratch, RegState::EarlyClobber | RegState::Define |
+ RegState::Dead | RegState::Implicit)
+ .addReg(Scratch2, RegState::EarlyClobber | RegState::Define |
+ RegState::Dead | RegState::Implicit)
+ .addReg(Scratch3, RegState::EarlyClobber | RegState::Define |
+ RegState::Dead | RegState::Implicit);
MI.eraseFromParent(); // The instruction is gone now.
return exitMBB;
}
-MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwap(MachineInstr &MI,
- MachineBasicBlock *BB,
- unsigned Size) const {
- assert((Size == 4 || Size == 8) && "Unsupported size for EmitAtomicCmpSwap.");
+// Lower atomic compare and swap to a pseudo instruction, taking care to
+// define a scratch register for the pseudo instruction's expansion. The
+// instruction is expanded after the register allocator so as to prevent
+// the insertion of stores between the linked load and the store conditional.
+
+MachineBasicBlock *
+MipsTargetLowering::emitAtomicCmpSwap(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+
+ assert((MI.getOpcode() == Mips::ATOMIC_CMP_SWAP_I32 ||
+ MI.getOpcode() == Mips::ATOMIC_CMP_SWAP_I64) &&
+         "Unsupported atomic pseudo for EmitAtomicCmpSwap.");
+
+ const unsigned Size = MI.getOpcode() == Mips::ATOMIC_CMP_SWAP_I32 ? 4 : 8;
MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &RegInfo = MF->getRegInfo();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
const TargetRegisterClass *RC = getRegClassFor(MVT::getIntegerVT(Size * 8));
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- const bool ArePtrs64bit = ABI.ArePtrs64bit();
DebugLoc DL = MI.getDebugLoc();
- unsigned LL, SC, ZERO, BNE, BEQ;
-
- if (Size == 4) {
- if (isMicroMips) {
- LL = Mips::LL_MM;
- SC = Mips::SC_MM;
- } else {
- LL = Subtarget.hasMips32r6()
- ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
- : (ArePtrs64bit ? Mips::LL64 : Mips::LL);
- SC = Subtarget.hasMips32r6()
- ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
- : (ArePtrs64bit ? Mips::SC64 : Mips::SC);
- }
-
- ZERO = Mips::ZERO;
- BNE = Mips::BNE;
- BEQ = Mips::BEQ;
- } else {
- LL = Subtarget.hasMips64r6() ? Mips::LLD_R6 : Mips::LLD;
- SC = Subtarget.hasMips64r6() ? Mips::SCD_R6 : Mips::SCD;
- ZERO = Mips::ZERO_64;
- BNE = Mips::BNE64;
- BEQ = Mips::BEQ64;
- }
+ unsigned AtomicOp = MI.getOpcode() == Mips::ATOMIC_CMP_SWAP_I32
+ ? Mips::ATOMIC_CMP_SWAP_I32_POSTRA
+ : Mips::ATOMIC_CMP_SWAP_I64_POSTRA;
unsigned Dest = MI.getOperand(0).getReg();
unsigned Ptr = MI.getOperand(1).getReg();
unsigned OldVal = MI.getOperand(2).getReg();
unsigned NewVal = MI.getOperand(3).getReg();
- unsigned Success = RegInfo.createVirtualRegister(RC);
+ unsigned Scratch = MRI.createVirtualRegister(RC);
+ MachineBasicBlock::iterator II(MI);
- // insert new blocks after the current block
- const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineFunction::iterator It = ++BB->getIterator();
- MF->insert(It, loop1MBB);
- MF->insert(It, loop2MBB);
- MF->insert(It, exitMBB);
+ // We need to create copies of the various registers and kill them at the
+ // atomic pseudo. If the copies are not made, when the atomic is expanded
+ // after fast register allocation, the spills will end up outside of the
+ // blocks that their values are defined in, causing livein errors.
- // Transfer the remainder of BB and its successor edges to exitMBB.
- exitMBB->splice(exitMBB->begin(), BB,
- std::next(MachineBasicBlock::iterator(MI)), BB->end());
- exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+ unsigned DestCopy = MRI.createVirtualRegister(MRI.getRegClass(Dest));
+ unsigned PtrCopy = MRI.createVirtualRegister(MRI.getRegClass(Ptr));
+ unsigned OldValCopy = MRI.createVirtualRegister(MRI.getRegClass(OldVal));
+ unsigned NewValCopy = MRI.createVirtualRegister(MRI.getRegClass(NewVal));
- // thisMBB:
- // ...
- // fallthrough --> loop1MBB
- BB->addSuccessor(loop1MBB);
- loop1MBB->addSuccessor(exitMBB);
- loop1MBB->addSuccessor(loop2MBB);
- loop2MBB->addSuccessor(loop1MBB);
- loop2MBB->addSuccessor(exitMBB);
-
- // loop1MBB:
- // ll dest, 0(ptr)
- // bne dest, oldval, exitMBB
- BB = loop1MBB;
- BuildMI(BB, DL, TII->get(LL), Dest).addReg(Ptr).addImm(0);
- BuildMI(BB, DL, TII->get(BNE))
- .addReg(Dest).addReg(OldVal).addMBB(exitMBB);
-
- // loop2MBB:
- // sc success, newval, 0(ptr)
- // beq success, $0, loop1MBB
- BB = loop2MBB;
- BuildMI(BB, DL, TII->get(SC), Success)
- .addReg(NewVal).addReg(Ptr).addImm(0);
- BuildMI(BB, DL, TII->get(BEQ))
- .addReg(Success).addReg(ZERO).addMBB(loop1MBB);
+ BuildMI(*BB, II, DL, TII->get(Mips::COPY), DestCopy).addReg(Dest);
+ BuildMI(*BB, II, DL, TII->get(Mips::COPY), PtrCopy).addReg(Ptr);
+ BuildMI(*BB, II, DL, TII->get(Mips::COPY), OldValCopy).addReg(OldVal);
+ BuildMI(*BB, II, DL, TII->get(Mips::COPY), NewValCopy).addReg(NewVal);
+
+ // The purpose of the flags on the scratch registers is explained in
+ // emitAtomicBinary. In summary, we need a scratch register that is going to
+ // be undef and that is unique among the registers chosen for the instruction.
+
+ BuildMI(*BB, II, DL, TII->get(AtomicOp))
+ .addReg(Dest, RegState::Define | RegState::EarlyClobber)
+ .addReg(PtrCopy, RegState::Kill)
+ .addReg(OldValCopy, RegState::Kill)
+ .addReg(NewValCopy, RegState::Kill)
+ .addReg(Scratch, RegState::EarlyClobber | RegState::Define |
+ RegState::Dead | RegState::Implicit);
MI.eraseFromParent(); // The instruction is gone now.
- return exitMBB;
+ return BB;
}
MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwapPartword(
@@ -1823,40 +1781,33 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwapPartword(
unsigned Mask = RegInfo.createVirtualRegister(RC);
unsigned Mask2 = RegInfo.createVirtualRegister(RC);
unsigned ShiftedCmpVal = RegInfo.createVirtualRegister(RC);
- unsigned OldVal = RegInfo.createVirtualRegister(RC);
- unsigned MaskedOldVal0 = RegInfo.createVirtualRegister(RC);
unsigned ShiftedNewVal = RegInfo.createVirtualRegister(RC);
unsigned MaskLSB2 = RegInfo.createVirtualRegister(RCp);
unsigned PtrLSB2 = RegInfo.createVirtualRegister(RC);
unsigned MaskUpper = RegInfo.createVirtualRegister(RC);
unsigned MaskedCmpVal = RegInfo.createVirtualRegister(RC);
unsigned MaskedNewVal = RegInfo.createVirtualRegister(RC);
- unsigned MaskedOldVal1 = RegInfo.createVirtualRegister(RC);
- unsigned StoreVal = RegInfo.createVirtualRegister(RC);
- unsigned SrlRes = RegInfo.createVirtualRegister(RC);
- unsigned Success = RegInfo.createVirtualRegister(RC);
- unsigned LL, SC;
-
- if (isMicroMips) {
- LL = Mips::LL_MM;
- SC = Mips::SC_MM;
- } else {
- LL = Subtarget.hasMips32r6() ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
- : (ArePtrs64bit ? Mips::LL64 : Mips::LL);
- SC = Subtarget.hasMips32r6() ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
- : (ArePtrs64bit ? Mips::SC64 : Mips::SC);
- }
+ unsigned AtomicOp = MI.getOpcode() == Mips::ATOMIC_CMP_SWAP_I8
+ ? Mips::ATOMIC_CMP_SWAP_I8_POSTRA
+ : Mips::ATOMIC_CMP_SWAP_I16_POSTRA;
+
+ // The scratch registers here with the EarlyClobber | Define | Dead | Implicit
+ // flags are used to coerce the register allocator and the machine verifier to
+ // accept the usage of these registers.
+ // The EarlyClobber flag has the semantic property that the operand it is
+ // attached to is clobbered before the rest of the inputs are read. Hence it
+ // must be unique among the operands to the instruction.
+ // The Define flag is needed to convince the machine verifier that an Undef
+ // value isn't a problem.
+ // The Dead flag is needed as the value in scratch isn't used by any other
+ // instruction. Kill isn't used as Dead is more precise.
+ unsigned Scratch = RegInfo.createVirtualRegister(RC);
+ unsigned Scratch2 = RegInfo.createVirtualRegister(RC);
// insert new blocks after the current block
const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator It = ++BB->getIterator();
- MF->insert(It, loop1MBB);
- MF->insert(It, loop2MBB);
- MF->insert(It, sinkMBB);
MF->insert(It, exitMBB);
// Transfer the remainder of BB and its successor edges to exitMBB.
@@ -1864,14 +1815,8 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwapPartword(
std::next(MachineBasicBlock::iterator(MI)), BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
- BB->addSuccessor(loop1MBB);
- loop1MBB->addSuccessor(sinkMBB);
- loop1MBB->addSuccessor(loop2MBB);
- loop2MBB->addSuccessor(loop1MBB);
- loop2MBB->addSuccessor(sinkMBB);
- sinkMBB->addSuccessor(exitMBB);
+ BB->addSuccessor(exitMBB, BranchProbability::getOne());
- // FIXME: computation of newval2 can be moved to loop2MBB.
// thisMBB:
// addiu masklsb2,$0,-4 # 0xfffffffc
// and alignedaddr,ptr,masklsb2
@@ -1914,40 +1859,22 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwapPartword(
BuildMI(BB, DL, TII->get(Mips::SLLV), ShiftedNewVal)
.addReg(MaskedNewVal).addReg(ShiftAmt);
- // loop1MBB:
- // ll oldval,0(alginedaddr)
- // and maskedoldval0,oldval,mask
- // bne maskedoldval0,shiftedcmpval,sinkMBB
- BB = loop1MBB;
- BuildMI(BB, DL, TII->get(LL), OldVal).addReg(AlignedAddr).addImm(0);
- BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal0)
- .addReg(OldVal).addReg(Mask);
- BuildMI(BB, DL, TII->get(Mips::BNE))
- .addReg(MaskedOldVal0).addReg(ShiftedCmpVal).addMBB(sinkMBB);
-
- // loop2MBB:
- // and maskedoldval1,oldval,mask2
- // or storeval,maskedoldval1,shiftednewval
- // sc success,storeval,0(alignedaddr)
- // beq success,$0,loop1MBB
- BB = loop2MBB;
- BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal1)
- .addReg(OldVal).addReg(Mask2);
- BuildMI(BB, DL, TII->get(Mips::OR), StoreVal)
- .addReg(MaskedOldVal1).addReg(ShiftedNewVal);
- BuildMI(BB, DL, TII->get(SC), Success)
- .addReg(StoreVal).addReg(AlignedAddr).addImm(0);
- BuildMI(BB, DL, TII->get(Mips::BEQ))
- .addReg(Success).addReg(Mips::ZERO).addMBB(loop1MBB);
-
- // sinkMBB:
- // srl srlres,maskedoldval0,shiftamt
- // sign_extend dest,srlres
- BB = sinkMBB;
-
- BuildMI(BB, DL, TII->get(Mips::SRLV), SrlRes)
- .addReg(MaskedOldVal0).addReg(ShiftAmt);
- BB = emitSignExtendToI32InReg(MI, BB, Size, Dest, SrlRes);
+  // The purpose of the flags on the scratch registers is explained in
+  // emitAtomicBinary. In summary, we need scratch registers that are going to
+  // be undef and that are unique among the registers chosen for the
+  // instruction.
+
+ BuildMI(BB, DL, TII->get(AtomicOp))
+ .addReg(Dest, RegState::Define | RegState::EarlyClobber)
+ .addReg(AlignedAddr)
+ .addReg(Mask)
+ .addReg(ShiftedCmpVal)
+ .addReg(Mask2)
+ .addReg(ShiftedNewVal)
+ .addReg(ShiftAmt)
+ .addReg(Scratch, RegState::EarlyClobber | RegState::Define |
+ RegState::Dead | RegState::Implicit)
+ .addReg(Scratch2, RegState::EarlyClobber | RegState::Define |
+ RegState::Dead | RegState::Implicit);
MI.eraseFromParent(); // The instruction is gone now.
@@ -2073,7 +2000,7 @@ lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
// Local Exec TLS Model.
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
- if (DAG.getTarget().Options.EmulatedTLS)
+ if (DAG.getTarget().useEmulatedTLS())
return LowerToTLSEmulatedModel(GA, DAG);
SDLoc DL(GA);
@@ -2114,7 +2041,7 @@ lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
SDValue TGAHi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
MipsII::MO_DTPREL_HI);
- SDValue Hi = DAG.getNode(MipsISD::Hi, DL, PtrVT, TGAHi);
+ SDValue Hi = DAG.getNode(MipsISD::TlsHi, DL, PtrVT, TGAHi);
SDValue TGALo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
MipsII::MO_DTPREL_LO);
SDValue Lo = DAG.getNode(MipsISD::Lo, DL, PtrVT, TGALo);
@@ -2138,7 +2065,7 @@ lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
MipsII::MO_TPREL_HI);
SDValue TGALo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
MipsII::MO_TPREL_LO);
- SDValue Hi = DAG.getNode(MipsISD::Hi, DL, PtrVT, TGAHi);
+ SDValue Hi = DAG.getNode(MipsISD::TlsHi, DL, PtrVT, TGAHi);
SDValue Lo = DAG.getNode(MipsISD::Lo, DL, PtrVT, TGALo);
Offset = DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo);
}
@@ -2837,6 +2764,13 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
#include "MipsGenCallingConv.inc"
+CCAssignFn *MipsTargetLowering::CCAssignFnForCall() const {
+  return CC_Mips;
+}
+
+CCAssignFn *MipsTargetLowering::CCAssignFnForReturn() const {
+  return RetCC_Mips;
+}
//===----------------------------------------------------------------------===//
// Call Calling Convention Implementation
//===----------------------------------------------------------------------===//
@@ -2953,12 +2887,44 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext(),
MipsCCState::getSpecialCallingConvForCallee(Callee.getNode(), Subtarget));
+ const ExternalSymbolSDNode *ES =
+ dyn_cast_or_null<const ExternalSymbolSDNode>(Callee.getNode());
+
+ // There is one case where CALLSEQ_START..CALLSEQ_END can be nested, which
+ // is during the lowering of a call with a byval argument which produces
+ // a call to memcpy. For the O32 case, this causes the caller to allocate
+ // stack space for the reserved argument area for the callee, then recursively
+ // again for the memcpy call. In the NEWABI case, this doesn't occur as those
+ // ABIs mandate that the callee allocates the reserved argument area. We do
+ // still produce nested CALLSEQ_START..CALLSEQ_END with zero space though.
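+  //
+  // A rough sketch of the byval case on O32 (hypothetical IR): lowering
+  //   call void @f(%struct.S* byval %s)
+  // first emits CALLSEQ_START for @f; the byval copy then lowers to a call
+  // to memcpy whose chain still begins at that CALLSEQ_START, so the memcpy
+  // call can reuse the argument area already reserved for @f.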
+ //
+ // If the callee has a byval argument and memcpy is used, we are mandated
+ // to already have produced a reserved argument area for the callee for O32.
+ // Therefore, the reserved argument area can be reused for both calls.
+ //
+ // Other cases of calling memcpy cannot have a chain with a CALLSEQ_START
+ // present, as we have yet to hook that node onto the chain.
+ //
+ // Hence, the CALLSEQ_START and CALLSEQ_END nodes can be eliminated in this
+ // case. GCC does a similar trick, in that wherever possible, it calculates
+  // the maximum outgoing argument area (including the reserved area), and
+  // preallocates the stack space on entry to the caller.
+  //
+  // FIXME: We should do the same for efficiency and space.
+
+ // Note: The check on the calling convention below must match
+ // MipsABIInfo::GetCalleeAllocdArgSizeInBytes().
+ bool MemcpyInByVal = ES &&
+ StringRef(ES->getSymbol()) == StringRef("memcpy") &&
+ CallConv != CallingConv::Fast &&
+ Chain.getOpcode() == ISD::CALLSEQ_START;
+
// Allocate the reserved argument area. It seems strange to do this from the
// caller side but removing it breaks the frame size calculation.
- CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(CallConv), 1);
+ unsigned ReservedArgArea =
+ MemcpyInByVal ? 0 : ABI.GetCalleeAllocdArgSizeInBytes(CallConv);
+ CCInfo.AllocateStack(ReservedArgArea, 1);
- const ExternalSymbolSDNode *ES =
- dyn_cast_or_null<const ExternalSymbolSDNode>(Callee.getNode());
CCInfo.AnalyzeCallOperands(Outs, CC_Mips, CLI.getArgs(),
ES ? ES->getSymbol() : nullptr);
@@ -2993,7 +2959,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
NextStackOffset = alignTo(NextStackOffset, StackAlignment);
SDValue NextStackOffsetVal = DAG.getIntPtrConstant(NextStackOffset, DL, true);
- if (!IsTailCall)
+ if (!(IsTailCall || MemcpyInByVal))
Chain = DAG.getCALLSEQ_START(Chain, NextStackOffset, 0, DL);
SDValue StackPtr =
@@ -3201,10 +3167,13 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Chain = DAG.getNode(MipsISD::JmpLink, DL, NodeTys, Ops);
SDValue InFlag = Chain.getValue(1);
- // Create the CALLSEQ_END node.
- Chain = DAG.getCALLSEQ_END(Chain, NextStackOffsetVal,
- DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
- InFlag = Chain.getValue(1);
+  // Create the CALLSEQ_END node in the case where it is not a call to
+  // memcpy.
+  if (!MemcpyInByVal) {
+ Chain = DAG.getCALLSEQ_END(Chain, NextStackOffsetVal,
+ DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
+ InFlag = Chain.getValue(1);
+ }
// Handle result values, copying them out of physregs into vregs that we
// return.
@@ -3507,10 +3476,9 @@ MipsTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
bool
MipsTargetLowering::shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const {
- if (Subtarget.hasMips3() && Subtarget.useSoftFloat()) {
- if (Type == MVT::i32)
+ if ((ABI.IsN32() || ABI.IsN64()) && Type == MVT::i32)
return true;
- }
+
return IsSigned;
}
@@ -3746,6 +3714,13 @@ static std::pair<bool, bool> parsePhysicalReg(StringRef C, StringRef &Prefix,
true);
}
+EVT MipsTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
+ ISD::NodeType) const {
+ bool Cond = !Subtarget.isABI_O32() && VT.getSizeInBits() == 32;
+ EVT MinVT = getRegisterType(Context, Cond ? MVT::i64 : MVT::i32);
+ return VT.bitsLT(MinVT) ? MinVT : VT;
+}
+
std::pair<unsigned, const TargetRegisterClass *> MipsTargetLowering::
parseRegForInlineAsmConstraint(StringRef C, MVT VT) const {
const TargetRegisterInfo *TRI =
@@ -3863,13 +3838,17 @@ MipsTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
case 'c': // register suitable for indirect jump
if (VT == MVT::i32)
return std::make_pair((unsigned)Mips::T9, &Mips::GPR32RegClass);
- assert(VT == MVT::i64 && "Unexpected type.");
- return std::make_pair((unsigned)Mips::T9_64, &Mips::GPR64RegClass);
- case 'l': // register suitable for indirect jump
- if (VT == MVT::i32)
+ if (VT == MVT::i64)
+ return std::make_pair((unsigned)Mips::T9_64, &Mips::GPR64RegClass);
+ // This will generate an error message
+ return std::make_pair(0U, nullptr);
+ case 'l': // use the `lo` register to store values
+ // that are no bigger than a word
+ if (VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8)
return std::make_pair((unsigned)Mips::LO0, &Mips::LO32RegClass);
return std::make_pair((unsigned)Mips::LO0_64, &Mips::LO64RegClass);
- case 'x': // register suitable for indirect jump
+ case 'x': // use the concatenated `hi` and `lo` registers
+ // to store doubleword values
// Fixme: Not triggering the use of both hi and low
// This will generate an error message
return std::make_pair(0U, nullptr);
@@ -4064,7 +4043,12 @@ void MipsTargetLowering::copyByValRegs(
// Create frame object.
EVT PtrTy = getPointerTy(DAG.getDataLayout());
- int FI = MFI.CreateFixedObject(FrameObjSize, FrameObjOffset, true);
+  // Make the fixed object being stored to mutable, so that the load
+  // instructions referencing it have their memory dependencies added.
+  // Mark the frame object as isAliased, which clears the underlying-objects
+  // vector in ScheduleDAGInstrs::buildSchedGraph(), resulting in all stores
+  // being added as dependencies for loads referencing this fixed object.
+ int FI = MFI.CreateFixedObject(FrameObjSize, FrameObjOffset, false, true);
SDValue FIN = DAG.getFrameIndex(FI, PtrTy);
InVals.push_back(FIN);
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index ce4f0376ca9b..b58d92c370d8 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -19,9 +19,9 @@
#include "MCTargetDesc/MipsBaseInfo.h"
#include "MCTargetDesc/MipsMCTargetDesc.h"
#include "Mips.h"
+#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetLowering.h"
@@ -29,6 +29,7 @@
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Type.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
@@ -83,12 +84,18 @@ class TargetRegisterClass;
// Get the High 16 bits from a 32 bit immediate for accessing the GOT.
GotHi,
+ // Get the High 16 bits from a 32-bit immediate for accessing TLS.
+ TlsHi,
+
// Handle gp_rel (small data/bss sections) relocation.
GPRel,
// Thread Pointer
ThreadPointer,
+ // Vector Floating Point Multiply and Subtract
+ FMS,
+
// Floating Point Branch Conditional
FPBrcond,
@@ -217,12 +224,6 @@ class TargetRegisterClass;
VCLT_S,
VCLT_U,
- // Element-wise vector max/min.
- VSMAX,
- VSMIN,
- VUMAX,
- VUMIN,
-
// Vector Shuffle with mask as an operand
VSHF, // Generic shuffle
SHF, // 4-element set shuffle.
@@ -279,15 +280,14 @@ class TargetRegisterClass;
return MVT::i32;
}
+ EVT getTypeForExtReturn(LLVMContext &Context, EVT VT,
+ ISD::NodeType) const override;
+
bool isCheapToSpeculateCttz() const override;
bool isCheapToSpeculateCtlz() const override;
/// Return the register type for a given MVT, ensuring vectors are treated
/// as a series of gpr sized integers.
- MVT getRegisterTypeForCallingConv(MVT VT) const override;
-
- /// Return the register type for a given MVT, ensuring vectors are treated
- /// as a series of gpr sized integers.
MVT getRegisterTypeForCallingConv(LLVMContext &Context,
EVT VT) const override;
@@ -371,6 +371,10 @@ class TargetRegisterClass;
return getTargetMachine().isPositionIndependent();
}
+ CCAssignFn *CCAssignFnForCall() const;
+
+ CCAssignFn *CCAssignFnForReturn() const;
+
protected:
SDValue getGlobalReg(SelectionDAG &DAG, EVT Ty) const;
@@ -681,17 +685,13 @@ class TargetRegisterClass;
unsigned Size, unsigned DstReg,
unsigned SrcRec) const;
- MachineBasicBlock *emitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB,
- unsigned Size, unsigned BinOpcode,
- bool Nand = false) const;
+ MachineBasicBlock *emitAtomicBinary(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
MachineBasicBlock *emitAtomicBinaryPartword(MachineInstr &MI,
MachineBasicBlock *BB,
- unsigned Size,
- unsigned BinOpcode,
- bool Nand = false) const;
+ unsigned Size) const;
MachineBasicBlock *emitAtomicCmpSwap(MachineInstr &MI,
- MachineBasicBlock *BB,
- unsigned Size) const;
+ MachineBasicBlock *BB) const;
MachineBasicBlock *emitAtomicCmpSwapPartword(MachineInstr &MI,
MachineBasicBlock *BB,
unsigned Size) const;
diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td
index c81739115373..dd30e20a743c 100644
--- a/lib/Target/Mips/MipsInstrFPU.td
+++ b/lib/Target/Mips/MipsInstrFPU.td
@@ -149,12 +149,16 @@ multiclass ROUND_M<string opstr, InstrItinClass Itin> {
class MFC1_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
InstrItinClass Itin, SDPatternOperator OpNode= null_frag> :
InstSE<(outs DstRC:$rt), (ins SrcRC:$fs), !strconcat(opstr, "\t$rt, $fs"),
- [(set DstRC:$rt, (OpNode SrcRC:$fs))], Itin, FrmFR, opstr>, HARDFLOAT;
+ [(set DstRC:$rt, (OpNode SrcRC:$fs))], Itin, FrmFR, opstr>, HARDFLOAT {
+ let isMoveReg = 1;
+}
class MTC1_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
InstrItinClass Itin, SDPatternOperator OpNode= null_frag> :
InstSE<(outs DstRC:$fs), (ins SrcRC:$rt), !strconcat(opstr, "\t$rt, $fs"),
- [(set DstRC:$fs, (OpNode SrcRC:$rt))], Itin, FrmFR, opstr>, HARDFLOAT;
+ [(set DstRC:$fs, (OpNode SrcRC:$rt))], Itin, FrmFR, opstr>, HARDFLOAT {
+ let isMoveReg = 1;
+}
class MTC1_64_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
InstrItinClass Itin> :
@@ -349,22 +353,24 @@ defm D64 : C_COND_M<"d", FGR64Opnd, 17, II_C_CC_D>, ISA_MIPS1_NOT_32R6_64R6,
//===----------------------------------------------------------------------===//
// Floating Point Instructions
//===----------------------------------------------------------------------===//
-def ROUND_W_S : MMRel, StdMMR6Rel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>,
- ABSS_FM<0xc, 16>, ISA_MIPS2;
-defm ROUND_W : ROUND_M<"round.w.d", II_ROUND>, ABSS_FM<0xc, 17>, ISA_MIPS2;
-def TRUNC_W_S : MMRel, StdMMR6Rel, ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, II_TRUNC>,
- ABSS_FM<0xd, 16>, ISA_MIPS2;
-def CEIL_W_S : MMRel, StdMMR6Rel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>,
- ABSS_FM<0xe, 16>, ISA_MIPS2;
-def FLOOR_W_S : MMRel, StdMMR6Rel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, II_FLOOR>,
- ABSS_FM<0xf, 16>, ISA_MIPS2;
-def CVT_W_S : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>,
- ABSS_FM<0x24, 16>;
-
-defm TRUNC_W : ROUND_M<"trunc.w.d", II_TRUNC>, ABSS_FM<0xd, 17>, ISA_MIPS2;
-defm CEIL_W : ROUND_M<"ceil.w.d", II_CEIL>, ABSS_FM<0xe, 17>, ISA_MIPS2;
-defm FLOOR_W : ROUND_M<"floor.w.d", II_FLOOR>, ABSS_FM<0xf, 17>, ISA_MIPS2;
-defm CVT_W : ROUND_M<"cvt.w.d", II_CVT>, ABSS_FM<0x24, 17>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def ROUND_W_S : MMRel, StdMMR6Rel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>,
+ ABSS_FM<0xc, 16>, ISA_MIPS2;
+ defm ROUND_W : ROUND_M<"round.w.d", II_ROUND>, ABSS_FM<0xc, 17>, ISA_MIPS2;
+ def TRUNC_W_S : MMRel, StdMMR6Rel, ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, II_TRUNC>,
+ ABSS_FM<0xd, 16>, ISA_MIPS2;
+ def CEIL_W_S : MMRel, StdMMR6Rel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>,
+ ABSS_FM<0xe, 16>, ISA_MIPS2;
+ def FLOOR_W_S : MMRel, StdMMR6Rel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, II_FLOOR>,
+ ABSS_FM<0xf, 16>, ISA_MIPS2;
+ def CVT_W_S : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>,
+ ABSS_FM<0x24, 16>, ISA_MIPS1;
+
+ defm TRUNC_W : ROUND_M<"trunc.w.d", II_TRUNC>, ABSS_FM<0xd, 17>, ISA_MIPS2;
+ defm CEIL_W : ROUND_M<"ceil.w.d", II_CEIL>, ABSS_FM<0xe, 17>, ISA_MIPS2;
+ defm FLOOR_W : ROUND_M<"floor.w.d", II_FLOOR>, ABSS_FM<0xf, 17>, ISA_MIPS2;
+ defm CVT_W : ROUND_M<"cvt.w.d", II_CVT>, ABSS_FM<0x24, 17>, ISA_MIPS1;
+}
let AdditionalPredicates = [NotInMicroMips] in {
def RECIP_S : MMRel, ABSS_FT<"recip.s", FGR32Opnd, FGR32Opnd, II_RECIP_S>,
@@ -391,53 +397,54 @@ let AdditionalPredicates = [NotInMicroMips] in {
let DecoderNamespace = "MipsFP64" in {
let AdditionalPredicates = [NotInMicroMips] in {
def ROUND_L_S : ABSS_FT<"round.l.s", FGR64Opnd, FGR32Opnd, II_ROUND>,
- ABSS_FM<0x8, 16>, FGR_64;
+ ABSS_FM<0x8, 16>, ISA_MIPS2, FGR_64;
def ROUND_L_D64 : ABSS_FT<"round.l.d", FGR64Opnd, FGR64Opnd, II_ROUND>,
- ABSS_FM<0x8, 17>, FGR_64;
+ ABSS_FM<0x8, 17>, INSN_MIPS3_32, FGR_64;
def TRUNC_L_S : ABSS_FT<"trunc.l.s", FGR64Opnd, FGR32Opnd, II_TRUNC>,
- ABSS_FM<0x9, 16>, FGR_64;
+ ABSS_FM<0x9, 16>, ISA_MIPS2, FGR_64;
def TRUNC_L_D64 : ABSS_FT<"trunc.l.d", FGR64Opnd, FGR64Opnd, II_TRUNC>,
- ABSS_FM<0x9, 17>, FGR_64;
+ ABSS_FM<0x9, 17>, INSN_MIPS3_32, FGR_64;
def CEIL_L_S : ABSS_FT<"ceil.l.s", FGR64Opnd, FGR32Opnd, II_CEIL>,
- ABSS_FM<0xa, 16>, FGR_64;
+ ABSS_FM<0xa, 16>, ISA_MIPS2, FGR_64;
def CEIL_L_D64 : ABSS_FT<"ceil.l.d", FGR64Opnd, FGR64Opnd, II_CEIL>,
- ABSS_FM<0xa, 17>, FGR_64;
+ ABSS_FM<0xa, 17>, INSN_MIPS3_32, FGR_64;
def FLOOR_L_S : ABSS_FT<"floor.l.s", FGR64Opnd, FGR32Opnd, II_FLOOR>,
- ABSS_FM<0xb, 16>, FGR_64;
+ ABSS_FM<0xb, 16>, ISA_MIPS2, FGR_64;
def FLOOR_L_D64 : ABSS_FT<"floor.l.d", FGR64Opnd, FGR64Opnd, II_FLOOR>,
- ABSS_FM<0xb, 17>, FGR_64;
+ ABSS_FM<0xb, 17>, INSN_MIPS3_32, FGR_64;
}
}
-def CVT_S_W : MMRel, ABSS_FT<"cvt.s.w", FGR32Opnd, FGR32Opnd, II_CVT>,
- ABSS_FM<0x20, 20>;
let AdditionalPredicates = [NotInMicroMips] in{
+ def CVT_S_W : MMRel, ABSS_FT<"cvt.s.w", FGR32Opnd, FGR32Opnd, II_CVT>,
+ ABSS_FM<0x20, 20>, ISA_MIPS1;
def CVT_L_S : MMRel, ABSS_FT<"cvt.l.s", FGR64Opnd, FGR32Opnd, II_CVT>,
ABSS_FM<0x25, 16>, INSN_MIPS3_32R2;
def CVT_L_D64: MMRel, ABSS_FT<"cvt.l.d", FGR64Opnd, FGR64Opnd, II_CVT>,
ABSS_FM<0x25, 17>, INSN_MIPS3_32R2;
}
-def CVT_S_D32 : MMRel, ABSS_FT<"cvt.s.d", FGR32Opnd, AFGR64Opnd, II_CVT>,
- ABSS_FM<0x20, 17>, FGR_32;
-def CVT_D32_W : MMRel, ABSS_FT<"cvt.d.w", AFGR64Opnd, FGR32Opnd, II_CVT>,
- ABSS_FM<0x21, 20>, FGR_32;
-def CVT_D32_S : MMRel, ABSS_FT<"cvt.d.s", AFGR64Opnd, FGR32Opnd, II_CVT>,
- ABSS_FM<0x21, 16>, FGR_32;
-
+let AdditionalPredicates = [NotInMicroMips] in {
+ def CVT_S_D32 : MMRel, ABSS_FT<"cvt.s.d", FGR32Opnd, AFGR64Opnd, II_CVT>,
+ ABSS_FM<0x20, 17>, ISA_MIPS1, FGR_32;
+ def CVT_D32_S : MMRel, ABSS_FT<"cvt.d.s", AFGR64Opnd, FGR32Opnd, II_CVT>,
+ ABSS_FM<0x21, 16>, ISA_MIPS1, FGR_32;
+ def CVT_D32_W : MMRel, ABSS_FT<"cvt.d.w", AFGR64Opnd, FGR32Opnd, II_CVT>,
+ ABSS_FM<0x21, 20>, ISA_MIPS1, FGR_32;
+}
let DecoderNamespace = "MipsFP64" in {
- def CVT_S_D64 : ABSS_FT<"cvt.s.d", FGR32Opnd, FGR64Opnd, II_CVT>,
- ABSS_FM<0x20, 17>, FGR_64;
- let AdditionalPredicates = [NotInMicroMips] in{
+ let AdditionalPredicates = [NotInMicroMips] in {
def CVT_S_L : ABSS_FT<"cvt.s.l", FGR32Opnd, FGR64Opnd, II_CVT>,
- ABSS_FM<0x20, 21>, FGR_64;
+ ABSS_FM<0x20, 21>, INSN_MIPS3_32R2, FGR_64;
+ def CVT_S_D64 : ABSS_FT<"cvt.s.d", FGR32Opnd, FGR64Opnd, II_CVT>,
+ ABSS_FM<0x20, 17>, ISA_MIPS1, FGR_64;
+ def CVT_D64_W : ABSS_FT<"cvt.d.w", FGR64Opnd, FGR32Opnd, II_CVT>,
+ ABSS_FM<0x21, 20>, ISA_MIPS1, FGR_64;
+ def CVT_D64_S : ABSS_FT<"cvt.d.s", FGR64Opnd, FGR32Opnd, II_CVT>,
+ ABSS_FM<0x21, 16>, ISA_MIPS1, FGR_64;
+ def CVT_D64_L : ABSS_FT<"cvt.d.l", FGR64Opnd, FGR64Opnd, II_CVT>,
+ ABSS_FM<0x21, 21>, INSN_MIPS3_32R2, FGR_64;
}
- def CVT_D64_W : ABSS_FT<"cvt.d.w", FGR64Opnd, FGR32Opnd, II_CVT>,
- ABSS_FM<0x21, 20>, FGR_64;
- def CVT_D64_S : ABSS_FT<"cvt.d.s", FGR64Opnd, FGR32Opnd, II_CVT>,
- ABSS_FM<0x21, 16>, FGR_64;
- def CVT_D64_L : ABSS_FT<"cvt.d.l", FGR64Opnd, FGR64Opnd, II_CVT>,
- ABSS_FM<0x21, 21>, FGR_64;
}
let isPseudo = 1, isCodeGenOnly = 1 in {
@@ -450,17 +457,21 @@ let isPseudo = 1, isCodeGenOnly = 1 in {
let AdditionalPredicates = [NotInMicroMips] in {
def FABS_S : MMRel, ABSS_FT<"abs.s", FGR32Opnd, FGR32Opnd, II_ABS, fabs>,
- ABSS_FM<0x5, 16>;
- defm FABS : ABSS_M<"abs.d", II_ABS, fabs>, ABSS_FM<0x5, 17>;
+ ABSS_FM<0x5, 16>, ISA_MIPS1;
+ defm FABS : ABSS_M<"abs.d", II_ABS, fabs>, ABSS_FM<0x5, 17>, ISA_MIPS1;
}
def FNEG_S : MMRel, ABSS_FT<"neg.s", FGR32Opnd, FGR32Opnd, II_NEG, fneg>,
- ABSS_FM<0x7, 16>;
-defm FNEG : ABSS_M<"neg.d", II_NEG, fneg>, ABSS_FM<0x7, 17>;
+ ABSS_FM<0x7, 16>, ISA_MIPS1;
+let AdditionalPredicates = [NotInMicroMips] in {
+ defm FNEG : ABSS_M<"neg.d", II_NEG, fneg>, ABSS_FM<0x7, 17>, ISA_MIPS1;
+}
-def FSQRT_S : MMRel, StdMMR6Rel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd,
- II_SQRT_S, fsqrt>, ABSS_FM<0x4, 16>, ISA_MIPS2;
-defm FSQRT : ABSS_M<"sqrt.d", II_SQRT_D, fsqrt>, ABSS_FM<0x4, 17>, ISA_MIPS2;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def FSQRT_S : MMRel, StdMMR6Rel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd,
+ II_SQRT_S, fsqrt>, ABSS_FM<0x4, 16>, ISA_MIPS2;
+ defm FSQRT : ABSS_M<"sqrt.d", II_SQRT_D, fsqrt>, ABSS_FM<0x4, 17>, ISA_MIPS2;
+}
// The odd-numbered registers are only referenced when doing loads,
// stores, and moves between floating-point and integer registers.
@@ -469,60 +480,60 @@ defm FSQRT : ABSS_M<"sqrt.d", II_SQRT_D, fsqrt>, ABSS_FM<0x4, 17>, ISA_MIPS2;
/// Move Control Registers From/To CPU Registers
let AdditionalPredicates = [NotInMicroMips] in {
- def CFC1 : MMRel, MFC1_FT<"cfc1", GPR32Opnd, CCROpnd, II_CFC1>, MFC1_FM<2>;
- def CTC1 : MMRel, MTC1_FT<"ctc1", CCROpnd, GPR32Opnd, II_CTC1>, MFC1_FM<6>;
-}
-def MFC1 : MMRel, MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd, II_MFC1,
- bitconvert>, MFC1_FM<0>;
-def MFC1_D64 : MFC1_FT<"mfc1", GPR32Opnd, FGR64Opnd, II_MFC1>, MFC1_FM<0>,
- FGR_64 {
- let DecoderNamespace = "MipsFP64";
-}
-def MTC1 : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd, II_MTC1,
- bitconvert>, MFC1_FM<4>;
-def MTC1_D64 : MTC1_FT<"mtc1", FGR64Opnd, GPR32Opnd, II_MTC1>, MFC1_FM<4>,
- FGR_64 {
- let DecoderNamespace = "MipsFP64";
-}
+ def CFC1 : MMRel, MFC1_FT<"cfc1", GPR32Opnd, CCROpnd, II_CFC1>, MFC1_FM<2>,
+ ISA_MIPS1;
+ def CTC1 : MMRel, MTC1_FT<"ctc1", CCROpnd, GPR32Opnd, II_CTC1>, MFC1_FM<6>,
+ ISA_MIPS1;
+
+ def MFC1 : MMRel, MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd, II_MFC1,
+ bitconvert>, MFC1_FM<0>, ISA_MIPS1;
+ def MFC1_D64 : MFC1_FT<"mfc1", GPR32Opnd, FGR64Opnd, II_MFC1>, MFC1_FM<0>,
+ ISA_MIPS1, FGR_64 {
+ let DecoderNamespace = "MipsFP64";
+ }
+ def MTC1 : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd, II_MTC1,
+ bitconvert>, MFC1_FM<4>, ISA_MIPS1;
+ def MTC1_D64 : MTC1_FT<"mtc1", FGR64Opnd, GPR32Opnd, II_MTC1>, MFC1_FM<4>,
+ ISA_MIPS1, FGR_64 {
+ let DecoderNamespace = "MipsFP64";
+ }
-let AdditionalPredicates = [NotInMicroMips] in {
def MFHC1_D32 : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, AFGR64Opnd, II_MFHC1>,
MFC1_FM<3>, ISA_MIPS32R2, FGR_32;
def MFHC1_D64 : MFC1_FT<"mfhc1", GPR32Opnd, FGR64Opnd, II_MFHC1>,
MFC1_FM<3>, ISA_MIPS32R2, FGR_64 {
let DecoderNamespace = "MipsFP64";
}
-}
-let AdditionalPredicates = [NotInMicroMips] in {
+
def MTHC1_D32 : MMRel, StdMMR6Rel, MTC1_64_FT<"mthc1", AFGR64Opnd, GPR32Opnd, II_MTHC1>,
MFC1_FM<7>, ISA_MIPS32R2, FGR_32;
def MTHC1_D64 : MTC1_64_FT<"mthc1", FGR64Opnd, GPR32Opnd, II_MTHC1>,
MFC1_FM<7>, ISA_MIPS32R2, FGR_64 {
let DecoderNamespace = "MipsFP64";
}
-}
-let AdditionalPredicates = [NotInMicroMips] in {
+
def DMTC1 : MTC1_FT<"dmtc1", FGR64Opnd, GPR64Opnd, II_DMTC1,
bitconvert>, MFC1_FM<5>, ISA_MIPS3;
def DMFC1 : MFC1_FT<"dmfc1", GPR64Opnd, FGR64Opnd, II_DMFC1,
bitconvert>, MFC1_FM<1>, ISA_MIPS3;
-}
-
-def FMOV_S : MMRel, ABSS_FT<"mov.s", FGR32Opnd, FGR32Opnd, II_MOV_S>,
- ABSS_FM<0x6, 16>;
-def FMOV_D32 : MMRel, ABSS_FT<"mov.d", AFGR64Opnd, AFGR64Opnd, II_MOV_D>,
- ABSS_FM<0x6, 17>, FGR_32;
-def FMOV_D64 : ABSS_FT<"mov.d", FGR64Opnd, FGR64Opnd, II_MOV_D>,
- ABSS_FM<0x6, 17>, FGR_64 {
- let DecoderNamespace = "MipsFP64";
+ let isMoveReg = 1 in {
+ def FMOV_S : MMRel, ABSS_FT<"mov.s", FGR32Opnd, FGR32Opnd, II_MOV_S>,
+ ABSS_FM<0x6, 16>, ISA_MIPS1;
+ def FMOV_D32 : MMRel, ABSS_FT<"mov.d", AFGR64Opnd, AFGR64Opnd, II_MOV_D>,
+ ABSS_FM<0x6, 17>, ISA_MIPS1, FGR_32;
+ def FMOV_D64 : ABSS_FT<"mov.d", FGR64Opnd, FGR64Opnd, II_MOV_D>,
+ ABSS_FM<0x6, 17>, ISA_MIPS1, FGR_64 {
+ let DecoderNamespace = "MipsFP64";
+ }
+ } // isMoveReg
}
/// Floating Point Memory Instructions
let AdditionalPredicates = [NotInMicroMips] in {
def LWC1 : MMRel, LW_FT<"lwc1", FGR32Opnd, mem_simm16, II_LWC1, load>,
- LW_FM<0x31>;
+ LW_FM<0x31>, ISA_MIPS1;
def SWC1 : MMRel, SW_FT<"swc1", FGR32Opnd, mem_simm16, II_SWC1, store>,
- LW_FM<0x39>;
+ LW_FM<0x39>, ISA_MIPS1;
}
let DecoderNamespace = "MipsFP64", AdditionalPredicates = [NotInMicroMips] in {
@@ -569,14 +580,15 @@ let DecoderNamespace="MipsFP64" in {
// Load/store doubleword indexed unaligned.
// FIXME: This instruction should not be defined for FGR_32.
-let AdditionalPredicates = [IsNotNaCl] in {
+let AdditionalPredicates = [IsNotNaCl, NotInMicroMips] in {
def LUXC1 : MMRel, LWXC1_FT<"luxc1", AFGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>,
INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_32;
def SUXC1 : MMRel, SWXC1_FT<"suxc1", AFGR64Opnd, II_SUXC1>, SWXC1_FM<0xd>,
INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_32;
}
-let DecoderNamespace="MipsFP64" in {
+let AdditionalPredicates = [IsNotNaCl, NotInMicroMips],
+ DecoderNamespace="MipsFP64" in {
def LUXC164 : LWXC1_FT<"luxc1", FGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>,
INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_64;
def SUXC164 : SWXC1_FT<"suxc1", FGR64Opnd, II_SUXC1>, SWXC1_FM<0xd>,
@@ -584,58 +596,62 @@ let DecoderNamespace="MipsFP64" in {
}
/// Floating-point Arithmetic
-def FADD_S : MMRel, ADDS_FT<"add.s", FGR32Opnd, II_ADD_S, 1, fadd>,
- ADDS_FM<0x00, 16>;
-defm FADD : ADDS_M<"add.d", II_ADD_D, 1, fadd>, ADDS_FM<0x00, 17>;
-def FDIV_S : MMRel, ADDS_FT<"div.s", FGR32Opnd, II_DIV_S, 0, fdiv>,
- ADDS_FM<0x03, 16>;
-defm FDIV : ADDS_M<"div.d", II_DIV_D, 0, fdiv>, ADDS_FM<0x03, 17>;
-def FMUL_S : MMRel, ADDS_FT<"mul.s", FGR32Opnd, II_MUL_S, 1, fmul>,
- ADDS_FM<0x02, 16>;
-defm FMUL : ADDS_M<"mul.d", II_MUL_D, 1, fmul>, ADDS_FM<0x02, 17>;
-def FSUB_S : MMRel, ADDS_FT<"sub.s", FGR32Opnd, II_SUB_S, 0, fsub>,
- ADDS_FM<0x01, 16>;
-defm FSUB : ADDS_M<"sub.d", II_SUB_D, 0, fsub>, ADDS_FM<0x01, 17>;
-
-def MADD_S : MMRel, MADDS_FT<"madd.s", FGR32Opnd, II_MADD_S, fadd>,
- MADDS_FM<4, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6, MADD4;
-def MSUB_S : MMRel, MADDS_FT<"msub.s", FGR32Opnd, II_MSUB_S, fsub>,
- MADDS_FM<5, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6, MADD4;
-
-let AdditionalPredicates = [NoNaNsFPMath, HasMadd4] in {
+let AdditionalPredicates = [NotInMicroMips] in {
+ def FADD_S : MMRel, ADDS_FT<"add.s", FGR32Opnd, II_ADD_S, 1, fadd>,
+ ADDS_FM<0x00, 16>, ISA_MIPS1;
+ defm FADD : ADDS_M<"add.d", II_ADD_D, 1, fadd>, ADDS_FM<0x00, 17>,
+ ISA_MIPS1;
+ def FDIV_S : MMRel, ADDS_FT<"div.s", FGR32Opnd, II_DIV_S, 0, fdiv>,
+ ADDS_FM<0x03, 16>, ISA_MIPS1;
+ defm FDIV : ADDS_M<"div.d", II_DIV_D, 0, fdiv>, ADDS_FM<0x03, 17>,
+ ISA_MIPS1;
+ def FMUL_S : MMRel, ADDS_FT<"mul.s", FGR32Opnd, II_MUL_S, 1, fmul>,
+ ADDS_FM<0x02, 16>, ISA_MIPS1;
+ defm FMUL : ADDS_M<"mul.d", II_MUL_D, 1, fmul>, ADDS_FM<0x02, 17>,
+ ISA_MIPS1;
+ def FSUB_S : MMRel, ADDS_FT<"sub.s", FGR32Opnd, II_SUB_S, 0, fsub>,
+ ADDS_FM<0x01, 16>, ISA_MIPS1;
+ defm FSUB : ADDS_M<"sub.d", II_SUB_D, 0, fsub>, ADDS_FM<0x01, 17>,
+ ISA_MIPS1;
+}
+
+let AdditionalPredicates = [NotInMicroMips, HasMadd4] in {
+ def MADD_S : MMRel, MADDS_FT<"madd.s", FGR32Opnd, II_MADD_S, fadd>,
+ MADDS_FM<4, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6;
+ def MSUB_S : MMRel, MADDS_FT<"msub.s", FGR32Opnd, II_MSUB_S, fsub>,
+ MADDS_FM<5, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6;
+
+ def MADD_D32 : MMRel, MADDS_FT<"madd.d", AFGR64Opnd, II_MADD_D, fadd>,
+ MADDS_FM<4, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
+ def MSUB_D32 : MMRel, MADDS_FT<"msub.d", AFGR64Opnd, II_MSUB_D, fsub>,
+ MADDS_FM<5, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
+
+ let DecoderNamespace = "MipsFP64" in {
+ def MADD_D64 : MADDS_FT<"madd.d", FGR64Opnd, II_MADD_D, fadd>,
+ MADDS_FM<4, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
+ def MSUB_D64 : MADDS_FT<"msub.d", FGR64Opnd, II_MSUB_D, fsub>,
+ MADDS_FM<5, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
+ }
+}
+
+let AdditionalPredicates = [NoNaNsFPMath, HasMadd4, NotInMicroMips] in {
def NMADD_S : MMRel, NMADDS_FT<"nmadd.s", FGR32Opnd, II_NMADD_S, fadd>,
MADDS_FM<6, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6;
def NMSUB_S : MMRel, NMADDS_FT<"nmsub.s", FGR32Opnd, II_NMSUB_S, fsub>,
MADDS_FM<7, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6;
-}
-def MADD_D32 : MMRel, MADDS_FT<"madd.d", AFGR64Opnd, II_MADD_D, fadd>,
- MADDS_FM<4, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32, MADD4;
-def MSUB_D32 : MMRel, MADDS_FT<"msub.d", AFGR64Opnd, II_MSUB_D, fsub>,
- MADDS_FM<5, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32, MADD4;
-
-let AdditionalPredicates = [NoNaNsFPMath, HasMadd4] in {
def NMADD_D32 : MMRel, NMADDS_FT<"nmadd.d", AFGR64Opnd, II_NMADD_D, fadd>,
MADDS_FM<6, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
def NMSUB_D32 : MMRel, NMADDS_FT<"nmsub.d", AFGR64Opnd, II_NMSUB_D, fsub>,
MADDS_FM<7, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
-}
-
-let DecoderNamespace = "MipsFP64" in {
- def MADD_D64 : MADDS_FT<"madd.d", FGR64Opnd, II_MADD_D, fadd>,
- MADDS_FM<4, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64, MADD4;
- def MSUB_D64 : MADDS_FT<"msub.d", FGR64Opnd, II_MSUB_D, fsub>,
- MADDS_FM<5, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64, MADD4;
-}
-let AdditionalPredicates = [NoNaNsFPMath, HasMadd4],
- DecoderNamespace = "MipsFP64" in {
- def NMADD_D64 : NMADDS_FT<"nmadd.d", FGR64Opnd, II_NMADD_D, fadd>,
- MADDS_FM<6, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
- def NMSUB_D64 : NMADDS_FT<"nmsub.d", FGR64Opnd, II_NMSUB_D, fsub>,
- MADDS_FM<7, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
+ let DecoderNamespace = "MipsFP64" in {
+ def NMADD_D64 : NMADDS_FT<"nmadd.d", FGR64Opnd, II_NMADD_D, fadd>,
+ MADDS_FM<6, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
+ def NMSUB_D64 : NMADDS_FT<"nmsub.d", FGR64Opnd, II_NMSUB_D, fsub>,
+ MADDS_FM<7, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
+ }
}
-
//===----------------------------------------------------------------------===//
// Floating Point Branch Codes
//===----------------------------------------------------------------------===//
@@ -844,28 +860,31 @@ let AdditionalPredicates = [NotInMicroMips] in {
//===----------------------------------------------------------------------===//
// Floating Point Patterns
//===----------------------------------------------------------------------===//
-def : MipsPat<(f32 fpimm0), (MTC1 ZERO)>;
-def : MipsPat<(f32 fpimm0neg), (FNEG_S (MTC1 ZERO))>;
+def : MipsPat<(f32 fpimm0), (MTC1 ZERO)>, ISA_MIPS1;
+def : MipsPat<(f32 fpimm0neg), (FNEG_S (MTC1 ZERO))>, ISA_MIPS1;
def : MipsPat<(f32 (sint_to_fp GPR32Opnd:$src)),
(PseudoCVT_S_W GPR32Opnd:$src)>;
def : MipsPat<(MipsTruncIntFP FGR32Opnd:$src),
- (TRUNC_W_S FGR32Opnd:$src)>;
+ (TRUNC_W_S FGR32Opnd:$src)>, ISA_MIPS1;
def : MipsPat<(MipsMTC1_D64 GPR32Opnd:$src),
- (MTC1_D64 GPR32Opnd:$src)>, FGR_64;
+ (MTC1_D64 GPR32Opnd:$src)>, ISA_MIPS1, FGR_64;
def : MipsPat<(f64 (sint_to_fp GPR32Opnd:$src)),
(PseudoCVT_D32_W GPR32Opnd:$src)>, FGR_32;
-def : MipsPat<(MipsTruncIntFP AFGR64Opnd:$src),
- (TRUNC_W_D32 AFGR64Opnd:$src)>, FGR_32;
-def : MipsPat<(f32 (fpround AFGR64Opnd:$src)),
- (CVT_S_D32 AFGR64Opnd:$src)>, FGR_32;
-def : MipsPat<(f64 (fpextend FGR32Opnd:$src)),
- (CVT_D32_S FGR32Opnd:$src)>, FGR_32;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsPat<(MipsTruncIntFP AFGR64Opnd:$src),
+ (TRUNC_W_D32 AFGR64Opnd:$src)>, ISA_MIPS2, FGR_32;
+ def : MipsPat<(f32 (fpround AFGR64Opnd:$src)),
+ (CVT_S_D32 AFGR64Opnd:$src)>, ISA_MIPS1, FGR_32;
+ def : MipsPat<(f64 (fpextend FGR32Opnd:$src)),
+ (CVT_D32_S FGR32Opnd:$src)>, ISA_MIPS1, FGR_32;
+}
-def : MipsPat<(f64 fpimm0), (DMTC1 ZERO_64)>, FGR_64;
-def : MipsPat<(f64 fpimm0neg), (FNEG_D64 (DMTC1 ZERO_64))>, FGR_64;
+def : MipsPat<(f64 fpimm0), (DMTC1 ZERO_64)>, ISA_MIPS3, GPR_64, FGR_64;
+def : MipsPat<(f64 fpimm0neg), (FNEG_D64 (DMTC1 ZERO_64))>, ISA_MIPS3, GPR_64,
+ FGR_64;
def : MipsPat<(f64 (sint_to_fp GPR32Opnd:$src)),
(PseudoCVT_D64_W GPR32Opnd:$src)>, FGR_64;
@@ -875,16 +894,18 @@ def : MipsPat<(f64 (sint_to_fp GPR64Opnd:$src)),
(PseudoCVT_D64_L GPR64Opnd:$src)>, FGR_64;
def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src),
- (TRUNC_W_D64 FGR64Opnd:$src)>, FGR_64;
+ (TRUNC_W_D64 FGR64Opnd:$src)>, ISA_MIPS2, FGR_64;
def : MipsPat<(MipsTruncIntFP FGR32Opnd:$src),
- (TRUNC_L_S FGR32Opnd:$src)>, FGR_64;
+ (TRUNC_L_S FGR32Opnd:$src)>, ISA_MIPS2, FGR_64;
def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src),
- (TRUNC_L_D64 FGR64Opnd:$src)>, FGR_64;
+ (TRUNC_L_D64 FGR64Opnd:$src)>, ISA_MIPS2, FGR_64;
-def : MipsPat<(f32 (fpround FGR64Opnd:$src)),
- (CVT_S_D64 FGR64Opnd:$src)>, FGR_64;
-def : MipsPat<(f64 (fpextend FGR32Opnd:$src)),
- (CVT_D64_S FGR32Opnd:$src)>, FGR_64;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsPat<(f32 (fpround FGR64Opnd:$src)),
+ (CVT_S_D64 FGR64Opnd:$src)>, ISA_MIPS1, FGR_64;
+ def : MipsPat<(f64 (fpextend FGR32Opnd:$src)),
+ (CVT_D64_S FGR32Opnd:$src)>, ISA_MIPS1, FGR_64;
+}
// To generate NMADD and NMSUB instructions when fneg node is present
multiclass NMADD_NMSUB<Instruction Nmadd, Instruction Nmsub, RegisterOperand RC> {
@@ -903,13 +924,13 @@ let AdditionalPredicates = [NoNaNsFPMath, HasMadd4, NotInMicroMips] in {
// Patterns for loads/stores with a reg+imm operand.
let AdditionalPredicates = [NotInMicroMips] in {
let AddedComplexity = 40 in {
- def : LoadRegImmPat<LWC1, f32, load>;
- def : StoreRegImmPat<SWC1, f32>;
+ def : LoadRegImmPat<LWC1, f32, load>, ISA_MIPS1;
+ def : StoreRegImmPat<SWC1, f32>, ISA_MIPS1;
- def : LoadRegImmPat<LDC164, f64, load>, FGR_64;
- def : StoreRegImmPat<SDC164, f64>, FGR_64;
+ def : LoadRegImmPat<LDC164, f64, load>, ISA_MIPS1, FGR_64;
+ def : StoreRegImmPat<SDC164, f64>, ISA_MIPS1, FGR_64;
- def : LoadRegImmPat<LDC1, f64, load>, FGR_32;
- def : StoreRegImmPat<SDC1, f64>, FGR_32;
+ def : LoadRegImmPat<LDC1, f64, load>, ISA_MIPS1, FGR_32;
+ def : StoreRegImmPat<SDC1, f64>, ISA_MIPS1, FGR_32;
}
}
diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td
index 817d9b44b9c2..ebbdcdf0df89 100644
--- a/lib/Target/Mips/MipsInstrFormats.td
+++ b/lib/Target/Mips/MipsInstrFormats.td
@@ -70,7 +70,7 @@ class StdArch {
// Generic Mips Format
class MipsInst<dag outs, dag ins, string asmstr, list<dag> pattern,
- InstrItinClass itin, Format f>: Instruction
+ InstrItinClass itin, Format f>: Instruction, PredicateControl
{
field bits<32> Inst;
Format Form = f;
@@ -119,8 +119,8 @@ class MipsInst<dag outs, dag ins, string asmstr, list<dag> pattern,
// Mips32/64 Instruction Format
class InstSE<dag outs, dag ins, string asmstr, list<dag> pattern,
InstrItinClass itin, Format f, string opstr = ""> :
- MipsInst<outs, ins, asmstr, pattern, itin, f>, PredicateControl {
- let EncodingPredicates = [HasStdEnc];
+ MipsInst<outs, ins, asmstr, pattern, itin, f> {
+ let EncodingPredicates = [NotInMips16Mode];
string BaseOpcode = opstr;
string Arch;
}
@@ -136,15 +136,15 @@ class MipsPseudo<dag outs, dag ins, list<dag> pattern,
// Mips32/64 Pseudo Instruction Format
class PseudoSE<dag outs, dag ins, list<dag> pattern,
InstrItinClass itin = IIPseudo> :
- MipsPseudo<outs, ins, pattern, itin>, PredicateControl {
- let EncodingPredicates = [HasStdEnc];
+ MipsPseudo<outs, ins, pattern, itin> {
+ let EncodingPredicates = [NotInMips16Mode];
}
// Pseudo-instructions for alternate assembly syntax (never used by codegen).
// These are aliases that require C++ handling to convert to the target
// instruction, while InstAliases can be handled directly by tblgen.
class MipsAsmPseudoInst<dag outs, dag ins, string asmstr>:
- MipsInst<outs, ins, asmstr, [], IIPseudo, Pseudo>, PredicateControl {
+ MipsInst<outs, ins, asmstr, [], IIPseudo, Pseudo> {
let isPseudo = 1;
let Pattern = [];
}
@@ -220,10 +220,9 @@ class FJ<bits<6> op> : StdArch
}
//===----------------------------------------------------------------------===//
-// MFC instruction class in Mips : <|op|mf|rt|rd|0000000|sel|>
+// MFC instruction class in Mips : <|op|mf|rt|rd|gst|0000|sel|>
//===----------------------------------------------------------------------===//
-class MFC3OP_FM<bits<6> op, bits<5> mfmt>
-{
+class MFC3OP_FM<bits<6> op, bits<5> mfmt, bits<3> guest> : StdArch {
bits<5> rt;
bits<5> rd;
bits<3> sel;
@@ -234,7 +233,8 @@ class MFC3OP_FM<bits<6> op, bits<5> mfmt>
let Inst{25-21} = mfmt;
let Inst{20-16} = rt;
let Inst{15-11} = rd;
- let Inst{10-3} = 0;
+ let Inst{10-8} = guest;
+ let Inst{7-3} = 0;
let Inst{2-0} = sel;
}
@@ -508,6 +508,7 @@ class EXT_FM<bits<6> funct> : StdArch {
class RDHWR_FM : StdArch {
bits<5> rt;
bits<5> rd;
+ bits<3> sel;
bits<32> Inst;
@@ -515,7 +516,8 @@ class RDHWR_FM : StdArch {
let Inst{25-21} = 0;
let Inst{20-16} = rt;
let Inst{15-11} = rd;
- let Inst{10-6} = 0;
+ let Inst{10-9} = 0b00;
+ let Inst{8-6} = sel;
let Inst{5-0} = 0x3b;
}
@@ -970,3 +972,14 @@ class CACHEOP_FM<bits<6> op> : StdArch {
let Inst{20-16} = hint;
let Inst{15-0} = offset;
}
+
+class HYPCALL_FM<bits<6> op> : StdArch {
+ bits<10> code_;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010000;
+ let Inst{25} = 1;
+ let Inst{20-11} = code_;
+ let Inst{5-0} = op;
+}
diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp
index 51ddc0d44c00..0e0e712dba19 100644
--- a/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/lib/Target/Mips/MipsInstrInfo.cpp
@@ -163,7 +163,7 @@ unsigned MipsInstrInfo::removeBranch(MachineBasicBlock &MBB,
// Note that indirect branches are not removed.
while (I != REnd && removed < 2) {
// Skip past debug instructions.
- if (I->isDebugValue()) {
+ if (I->isDebugInstr()) {
++I;
continue;
}
@@ -195,7 +195,7 @@ MipsInstrInfo::BranchType MipsInstrInfo::analyzeBranch(
MachineBasicBlock::reverse_iterator I = MBB.rbegin(), REnd = MBB.rend();
// Skip all the debug instructions.
- while (I != REnd && I->isDebugValue())
+ while (I != REnd && I->isDebugInstr())
++I;
if (I == REnd || !isUnpredicatedTerminator(*I)) {
@@ -220,7 +220,7 @@ MipsInstrInfo::BranchType MipsInstrInfo::analyzeBranch(
// Skip past any debug instruction to see if the second last actual
// is a branch.
++I;
- while (I != REnd && I->isDebugValue())
+ while (I != REnd && I->isDebugInstr())
++I;
if (I != REnd) {
@@ -276,6 +276,163 @@ MipsInstrInfo::BranchType MipsInstrInfo::analyzeBranch(
return BT_CondUncond;
}
+bool MipsInstrInfo::isBranchOffsetInRange(unsigned BranchOpc,
+                                          int64_t BrOffset) const {
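+  // Note on the ranges below: BrOffset is a byte offset. Classic MIPS
+  // branches encode a signed 16-bit word offset (scaled by 4), hence the
+  // isInt<18> checks; MIPSR6 BC/BALC carry a 26-bit offset (isInt<28>), and
+  // 21-bit compact forms such as BEQZC give isInt<23>.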
+ switch (BranchOpc) {
+ case Mips::B:
+ case Mips::BAL:
+ case Mips::BC1F:
+ case Mips::BC1FL:
+ case Mips::BC1T:
+ case Mips::BC1TL:
+ case Mips::BEQ: case Mips::BEQ64:
+ case Mips::BEQL:
+ case Mips::BGEZ: case Mips::BGEZ64:
+ case Mips::BGEZL:
+ case Mips::BGEZAL:
+ case Mips::BGEZALL:
+ case Mips::BGTZ: case Mips::BGTZ64:
+ case Mips::BGTZL:
+ case Mips::BLEZ: case Mips::BLEZ64:
+ case Mips::BLEZL:
+ case Mips::BLTZ: case Mips::BLTZ64:
+ case Mips::BLTZL:
+ case Mips::BLTZAL:
+ case Mips::BLTZALL:
+ case Mips::BNE: case Mips::BNE64:
+ case Mips::BNEL:
+ return isInt<18>(BrOffset);
+
+ // microMIPSr3 branches
+ case Mips::B_MM:
+ case Mips::BC1F_MM:
+ case Mips::BC1T_MM:
+ case Mips::BEQ_MM:
+ case Mips::BGEZ_MM:
+ case Mips::BGEZAL_MM:
+ case Mips::BGTZ_MM:
+ case Mips::BLEZ_MM:
+ case Mips::BLTZ_MM:
+ case Mips::BLTZAL_MM:
+ case Mips::BNE_MM:
+ case Mips::BEQZC_MM:
+ case Mips::BNEZC_MM:
+ return isInt<17>(BrOffset);
+
+ // microMIPSR3 short branches.
+ case Mips::B16_MM:
+ return isInt<11>(BrOffset);
+
+ case Mips::BEQZ16_MM:
+ case Mips::BNEZ16_MM:
+ return isInt<8>(BrOffset);
+
+ // MIPSR6 branches.
+ case Mips::BALC:
+ case Mips::BC:
+ return isInt<28>(BrOffset);
+
+ case Mips::BC1EQZ:
+ case Mips::BC1NEZ:
+ case Mips::BC2EQZ:
+ case Mips::BC2NEZ:
+ case Mips::BEQC: case Mips::BEQC64:
+ case Mips::BNEC: case Mips::BNEC64:
+ case Mips::BGEC: case Mips::BGEC64:
+ case Mips::BGEUC: case Mips::BGEUC64:
+ case Mips::BGEZC: case Mips::BGEZC64:
+ case Mips::BGTZC: case Mips::BGTZC64:
+ case Mips::BLEZC: case Mips::BLEZC64:
+ case Mips::BLTC: case Mips::BLTC64:
+ case Mips::BLTUC: case Mips::BLTUC64:
+ case Mips::BLTZC: case Mips::BLTZC64:
+ case Mips::BNVC:
+ case Mips::BOVC:
+ case Mips::BGEZALC:
+ case Mips::BEQZALC:
+ case Mips::BGTZALC:
+ case Mips::BLEZALC:
+ case Mips::BLTZALC:
+ case Mips::BNEZALC:
+ return isInt<18>(BrOffset);
+
+ case Mips::BEQZC: case Mips::BEQZC64:
+ case Mips::BNEZC: case Mips::BNEZC64:
+ return isInt<23>(BrOffset);
+
+ // microMIPSR6 branches
+ case Mips::BC16_MMR6:
+ return isInt<11>(BrOffset);
+
+ case Mips::BEQZC16_MMR6:
+ case Mips::BNEZC16_MMR6:
+ return isInt<8>(BrOffset);
+
+ case Mips::BALC_MMR6:
+ case Mips::BC_MMR6:
+ return isInt<27>(BrOffset);
+
+ case Mips::BC1EQZC_MMR6:
+ case Mips::BC1NEZC_MMR6:
+ case Mips::BC2EQZC_MMR6:
+ case Mips::BC2NEZC_MMR6:
+ case Mips::BGEZALC_MMR6:
+ case Mips::BEQZALC_MMR6:
+ case Mips::BGTZALC_MMR6:
+ case Mips::BLEZALC_MMR6:
+ case Mips::BLTZALC_MMR6:
+ case Mips::BNEZALC_MMR6:
+ case Mips::BNVC_MMR6:
+ case Mips::BOVC_MMR6:
+ return isInt<17>(BrOffset);
+
+ case Mips::BEQC_MMR6:
+ case Mips::BNEC_MMR6:
+ case Mips::BGEC_MMR6:
+ case Mips::BGEUC_MMR6:
+ case Mips::BGEZC_MMR6:
+ case Mips::BGTZC_MMR6:
+ case Mips::BLEZC_MMR6:
+ case Mips::BLTC_MMR6:
+ case Mips::BLTUC_MMR6:
+ case Mips::BLTZC_MMR6:
+ return isInt<18>(BrOffset);
+
+ case Mips::BEQZC_MMR6:
+ case Mips::BNEZC_MMR6:
+ return isInt<23>(BrOffset);
+
+ // DSP branches.
+ case Mips::BPOSGE32:
+ return isInt<18>(BrOffset);
+ case Mips::BPOSGE32_MM:
+ case Mips::BPOSGE32C_MMR3:
+ return isInt<17>(BrOffset);
+
+ // cnMIPS branches.
+ case Mips::BBIT0:
+ case Mips::BBIT032:
+ case Mips::BBIT1:
+ case Mips::BBIT132:
+ return isInt<18>(BrOffset);
+
+ // MSA branches.
+ case Mips::BZ_B:
+ case Mips::BZ_H:
+ case Mips::BZ_W:
+ case Mips::BZ_D:
+ case Mips::BZ_V:
+ case Mips::BNZ_B:
+ case Mips::BNZ_H:
+ case Mips::BNZ_W:
+ case Mips::BNZ_D:
+ case Mips::BNZ_V:
+ return isInt<18>(BrOffset);
+ }
+
+ llvm_unreachable("Unknown branch instruction!");
+}
+
/// Return the corresponding compact (no delay slot) form of a branch.
unsigned MipsInstrInfo::getEquivalentCompactForm(
const MachineBasicBlock::iterator I) const {
@@ -298,7 +455,6 @@ unsigned MipsInstrInfo::getEquivalentCompactForm(
case Mips::JR:
case Mips::PseudoReturn:
case Mips::PseudoIndirectBranch:
- case Mips::TAILCALLREG:
canUseShortMicroMipsCTI = true;
break;
}
@@ -377,18 +533,18 @@ unsigned MipsInstrInfo::getEquivalentCompactForm(
// For MIPSR6, the instruction 'jic' can be used for these cases. Some
// tools will accept 'jrc reg' as an alias for 'jic 0, $reg'.
case Mips::JR:
+ case Mips::PseudoIndirectBranchR6:
case Mips::PseudoReturn:
- case Mips::PseudoIndirectBranch:
- case Mips::TAILCALLREG:
+ case Mips::TAILCALLR6REG:
if (canUseShortMicroMipsCTI)
return Mips::JRC16_MM;
return Mips::JIC;
case Mips::JALRPseudo:
return Mips::JIALC;
case Mips::JR64:
+ case Mips::PseudoIndirectBranch64R6:
case Mips::PseudoReturn64:
- case Mips::PseudoIndirectBranch64:
- case Mips::TAILCALLREG64:
+ case Mips::TAILCALL64R6REG:
return Mips::JIC64;
case Mips::JALR64Pseudo:
return Mips::JIALC64;
@@ -599,7 +755,7 @@ bool MipsInstrInfo::verifyInstruction(const MachineInstr &MI,
case Mips::DINS:
return verifyInsExtInstruction(MI, ErrInfo, 0, 32, 0, 32, 0, 32);
case Mips::DINSM:
- // The ISA spec has a subtle difference difference between dinsm and dextm
+ // The ISA spec has a subtle difference between dinsm and dextm
// in that it says:
// 2 <= size <= 64 for 'dinsm' but 'dextm' has 32 < size <= 64.
// To make the bounds checks similar, the range 1 < size <= 64 is checked
@@ -617,6 +773,18 @@ bool MipsInstrInfo::verifyInstruction(const MachineInstr &MI,
return verifyInsExtInstruction(MI, ErrInfo, 0, 32, 32, 64, 32, 64);
case Mips::DEXTU:
return verifyInsExtInstruction(MI, ErrInfo, 32, 64, 0, 32, 32, 64);
+ case Mips::TAILCALLREG:
+ case Mips::PseudoIndirectBranch:
+ case Mips::JR:
+ case Mips::JR64:
+ case Mips::JALR:
+ case Mips::JALR64:
+ case Mips::JALRPseudo:
+ if (!Subtarget.useIndirectJumpsHazard())
+ return true;
+
+ ErrInfo = "invalid instruction when using jump guards!";
+ return false;
default:
return true;
}
diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h
index c18e395f9013..9d27b8f66211 100644
--- a/lib/Target/Mips/MipsInstrInfo.h
+++ b/lib/Target/Mips/MipsInstrInfo.h
@@ -86,6 +86,10 @@ public:
/// Determine the opcode of a non-delay slot form for a branch if one exists.
unsigned getEquivalentCompactForm(const MachineBasicBlock::iterator I) const;
+ /// Determine if the branch target is in range.
+ bool isBranchOffsetInRange(unsigned BranchOpc,
+ int64_t BrOffset) const override;
+
/// Predicate to determine if an instruction can go in a forbidden slot.
bool SafeInForbiddenSlot(const MachineInstr &MI) const;
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index e0d818b749df..0faa13d4d63f 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -73,12 +73,8 @@ def MipsGPRel : SDNode<"MipsISD::GPRel", SDTIntUnaryOp>;
// Hi node for accessing the GOT.
def MipsGotHi : SDNode<"MipsISD::GotHi", SDTIntUnaryOp>;
-// TlsGd node is used to handle General Dynamic TLS
-def MipsTlsGd : SDNode<"MipsISD::TlsGd", SDTIntUnaryOp>;
-
-// TprelHi and TprelLo nodes are used to handle Local Exec TLS
-def MipsTprelHi : SDNode<"MipsISD::TprelHi", SDTIntUnaryOp>;
-def MipsTprelLo : SDNode<"MipsISD::TprelLo", SDTIntUnaryOp>;
+// Hi node for handling TLS offsets
+def MipsTlsHi : SDNode<"MipsISD::TlsHi", SDTIntUnaryOp>;
// Thread pointer
def MipsThreadPointer: SDNode<"MipsISD::ThreadPointer", SDT_MipsThreadPointer>;
@@ -202,12 +198,12 @@ def NotMips64 : Predicate<"!Subtarget->hasMips64()">,
AssemblerPredicate<"!FeatureMips64">;
def HasMips64r2 : Predicate<"Subtarget->hasMips64r2()">,
AssemblerPredicate<"FeatureMips64r2">;
+def HasMips64r5 : Predicate<"Subtarget->hasMips64r5()">,
+ AssemblerPredicate<"FeatureMips64r5">;
def HasMips64r6 : Predicate<"Subtarget->hasMips64r6()">,
AssemblerPredicate<"FeatureMips64r6">;
def NotMips64r6 : Predicate<"!Subtarget->hasMips64r6()">,
AssemblerPredicate<"!FeatureMips64r6">;
-def HasMicroMips32r6 : Predicate<"Subtarget->inMicroMips32r6Mode()">,
- AssemblerPredicate<"FeatureMicroMips,FeatureMips32r6">;
def InMips16Mode : Predicate<"Subtarget->inMips16Mode()">,
AssemblerPredicate<"FeatureMips16">;
def NotInMips16Mode : Predicate<"!Subtarget->inMips16Mode()">,
@@ -237,14 +233,26 @@ def IsBE : Predicate<"!Subtarget->isLittle()">;
def IsNotNaCl : Predicate<"!Subtarget->isTargetNaCl()">;
def UseTCCInDIV : AssemblerPredicate<"FeatureUseTCCInDIV">;
def HasEVA : Predicate<"Subtarget->hasEVA()">,
- AssemblerPredicate<"FeatureEVA,FeatureMips32r2">;
+ AssemblerPredicate<"FeatureEVA">;
def HasMSA : Predicate<"Subtarget->hasMSA()">,
AssemblerPredicate<"FeatureMSA">;
def HasMadd4 : Predicate<"!Subtarget->disableMadd4()">,
AssemblerPredicate<"!FeatureMadd4">;
def HasMT : Predicate<"Subtarget->hasMT()">,
AssemblerPredicate<"FeatureMT">;
-
+def UseIndirectJumpsHazard : Predicate<"Subtarget->useIndirectJumpsHazard()">,
+ AssemblerPredicate<"FeatureUseIndirectJumpsHazard">;
+def NoIndirectJumpGuards : Predicate<"!Subtarget->useIndirectJumpsHazard()">,
+ AssemblerPredicate<"!FeatureUseIndirectJumpsHazard">;
+def HasCRC : Predicate<"Subtarget->hasCRC()">,
+ AssemblerPredicate<"FeatureCRC">;
+def HasVirt : Predicate<"Subtarget->hasVirt()">,
+ AssemblerPredicate<"FeatureVirt">;
+def HasGINV : Predicate<"Subtarget->hasGINV()">,
+ AssemblerPredicate<"FeatureGINV">;
+// TODO: Add support for FPOpFusion::Standard
+def AllowFPOpFusion : Predicate<"TM.Options.AllowFPOpFusion =="
+ " FPOpFusion::Fast">;
//===----------------------------------------------------------------------===//
// Mips GPR size adjectives.
// They are mutually exclusive.
@@ -274,126 +282,203 @@ class SYM_64 { list<Predicate> SYMPredicates = [IsSym64]; }
// subtractive predicate will hopefully keep us under the 32 predicate
// limit long enough to develop an alternative way to handle P1||P2
// predicates.
+class ISA_MIPS1 {
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+}
class ISA_MIPS1_NOT_MIPS3 {
list<Predicate> InsnPredicates = [NotMips3];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
}
class ISA_MIPS1_NOT_4_32 {
list<Predicate> InsnPredicates = [NotMips4_32];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
}
class ISA_MIPS1_NOT_32R6_64R6 {
list<Predicate> InsnPredicates = [NotMips32r6, NotMips64r6];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+}
+class ISA_MIPS2 {
+ list<Predicate> InsnPredicates = [HasMips2];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
}
-class ISA_MIPS2 { list<Predicate> InsnPredicates = [HasMips2]; }
class ISA_MIPS2_NOT_32R6_64R6 {
list<Predicate> InsnPredicates = [HasMips2, NotMips32r6, NotMips64r6];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+}
+class ISA_MIPS3 {
+ list<Predicate> InsnPredicates = [HasMips3];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
}
-class ISA_MIPS3 { list<Predicate> InsnPredicates = [HasMips3]; }
class ISA_MIPS3_NOT_32R6_64R6 {
list<Predicate> InsnPredicates = [HasMips3, NotMips32r6, NotMips64r6];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+}
+class ISA_MIPS32 {
+ list<Predicate> InsnPredicates = [HasMips32];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
}
-class ISA_MIPS32 { list<Predicate> InsnPredicates = [HasMips32]; }
class ISA_MIPS32_NOT_32R6_64R6 {
list<Predicate> InsnPredicates = [HasMips32, NotMips32r6, NotMips64r6];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+}
+class ISA_MIPS32R2 {
+ list<Predicate> InsnPredicates = [HasMips32r2];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
}
-class ISA_MIPS32R2 { list<Predicate> InsnPredicates = [HasMips32r2]; }
class ISA_MIPS32R2_NOT_32R6_64R6 {
list<Predicate> InsnPredicates = [HasMips32r2, NotMips32r6, NotMips64r6];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+}
+class ISA_MIPS32R5 {
+ list<Predicate> InsnPredicates = [HasMips32r5];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+}
+class ISA_MIPS64 {
+ list<Predicate> InsnPredicates = [HasMips64];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
}
-class ISA_MIPS32R5 { list<Predicate> InsnPredicates = [HasMips32r5]; }
-class ISA_MIPS64 { list<Predicate> InsnPredicates = [HasMips64]; }
class ISA_MIPS64_NOT_64R6 {
list<Predicate> InsnPredicates = [HasMips64, NotMips64r6];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+}
+class ISA_MIPS64R2 {
+ list<Predicate> InsnPredicates = [HasMips64r2];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+}
+class ISA_MIPS64R5 {
+ list<Predicate> InsnPredicates = [HasMips64r5];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+}
+class ISA_MIPS32R6 {
+ list<Predicate> InsnPredicates = [HasMips32r6];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+}
+class ISA_MIPS64R6 {
+ list<Predicate> InsnPredicates = [HasMips64r6];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+}
+class ISA_MICROMIPS {
+ list<Predicate> EncodingPredicates = [InMicroMips];
+}
+class ISA_MICROMIPS32R5 {
+ list<Predicate> InsnPredicates = [HasMips32r5];
+ list<Predicate> EncodingPredicates = [InMicroMips];
}
-class ISA_MIPS64R2 { list<Predicate> InsnPredicates = [HasMips64r2]; }
-class ISA_MIPS32R6 { list<Predicate> InsnPredicates = [HasMips32r6]; }
-class ISA_MIPS64R6 { list<Predicate> InsnPredicates = [HasMips64r6]; }
-class ISA_MICROMIPS { list<Predicate> InsnPredicates = [InMicroMips]; }
class ISA_MICROMIPS32R6 {
- list<Predicate> InsnPredicates = [HasMicroMips32r6];
+ list<Predicate> InsnPredicates = [HasMips32r6];
+ list<Predicate> EncodingPredicates = [InMicroMips];
}
-class ISA_MICROMIPS32_NOT_MIPS32R6 {
- list<Predicate> InsnPredicates = [InMicroMips, NotMips32r6];
+class ISA_MICROMIPS64R6 {
+ list<Predicate> InsnPredicates = [HasMips64r6];
+ list<Predicate> EncodingPredicates = [InMicroMips];
}
-
-class INSN_EVA { list<Predicate> InsnPredicates = [HasEVA]; }
-class INSN_EVA_NOT_32R6_64R6 {
- list<Predicate> InsnPredicates = [NotMips32r6, NotMips64r6, HasEVA];
+class ISA_MICROMIPS32_NOT_MIPS32R6 {
+ list<Predicate> InsnPredicates = [NotMips32r6];
+ list<Predicate> EncodingPredicates = [InMicroMips];
}
+class ASE_EVA { list<Predicate> ASEPredicate = [HasEVA]; }
// The portions of MIPS-III that were also added to MIPS32
-class INSN_MIPS3_32 { list<Predicate> InsnPredicates = [HasMips3_32]; }
+class INSN_MIPS3_32 {
+ list<Predicate> InsnPredicates = [HasMips3_32];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+}
// The portions of MIPS-III that were also added to MIPS32 but were removed in
// MIPS32r6 and MIPS64r6.
class INSN_MIPS3_32_NOT_32R6_64R6 {
list<Predicate> InsnPredicates = [HasMips3_32, NotMips32r6, NotMips64r6];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
}
// The portions of MIPS-III that were also added to MIPS32
-class INSN_MIPS3_32R2 { list<Predicate> InsnPredicates = [HasMips3_32r2]; }
+class INSN_MIPS3_32R2 {
+ list<Predicate> InsnPredicates = [HasMips3_32r2];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+}
// The portions of MIPS-IV that were also added to MIPS32.
-class INSN_MIPS4_32 { list <Predicate> InsnPredicates = [HasMips4_32]; }
+class INSN_MIPS4_32 {
+ list <Predicate> InsnPredicates = [HasMips4_32];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+}
// The portions of MIPS-IV that were also added to MIPS32 but were removed in
// MIPS32r6 and MIPS64r6.
class INSN_MIPS4_32_NOT_32R6_64R6 {
list<Predicate> InsnPredicates = [HasMips4_32, NotMips32r6, NotMips64r6];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
}
// The portions of MIPS-IV that were also added to MIPS32r2 but were removed in
// MIPS32r6 and MIPS64r6.
class INSN_MIPS4_32R2_NOT_32R6_64R6 {
list<Predicate> InsnPredicates = [HasMips4_32r2, NotMips32r6, NotMips64r6];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
}
// The portions of MIPS-IV that were also added to MIPS32r2.
class INSN_MIPS4_32R2 {
list<Predicate> InsnPredicates = [HasMips4_32r2];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
}
// The portions of MIPS-V that were also added to MIPS32r2 but were removed in
// MIPS32r6 and MIPS64r6.
class INSN_MIPS5_32R2_NOT_32R6_64R6 {
list<Predicate> InsnPredicates = [HasMips5_32r2, NotMips32r6, NotMips64r6];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
}
class ASE_CNMIPS {
- list<Predicate> InsnPredicates = [HasCnMips];
+ list<Predicate> ASEPredicate = [HasCnMips];
}
class NOT_ASE_CNMIPS {
- list<Predicate> InsnPredicates = [NotCnMips];
+ list<Predicate> ASEPredicate = [NotCnMips];
}
class ASE_MIPS64_CNMIPS {
- list<Predicate> InsnPredicates = [HasMips64, HasCnMips];
+ list<Predicate> ASEPredicate = [HasMips64, HasCnMips];
}
class ASE_MSA {
- list<Predicate> InsnPredicates = [HasMSA];
+ list<Predicate> ASEPredicate = [HasMSA];
}
class ASE_MSA_NOT_MSA64 {
- list<Predicate> InsnPredicates = [HasMSA, NotMips64];
+ list<Predicate> ASEPredicate = [HasMSA, NotMips64];
}
class ASE_MSA64 {
- list<Predicate> InsnPredicates = [HasMSA, HasMips64];
+ list<Predicate> ASEPredicate = [HasMSA, HasMips64];
}
class ASE_MT {
- list <Predicate> InsnPredicates = [HasMT];
+ list <Predicate> ASEPredicate = [HasMT];
+}
+
+class ASE_CRC {
+ list <Predicate> ASEPredicate = [HasCRC];
+}
+
+class ASE_VIRT {
+ list <Predicate> ASEPredicate = [HasVirt];
+}
+
+class ASE_GINV {
+ list <Predicate> ASEPredicate = [HasGINV];
}
// Class used for separating microMIPSr6 and microMIPS (r3) instructions.
// It can be used only on instructions that don't inherit PredicateControl.
class ISA_MICROMIPS_NOT_32R6 : PredicateControl {
- let InsnPredicates = [InMicroMips, NotMips32r6];
+ let InsnPredicates = [NotMips32r6];
+ let EncodingPredicates = [InMicroMips];
}
class ASE_NOT_DSP {
- list<Predicate> InsnPredicates = [NotDSP];
+ list<Predicate> ASEPredicate = [NotDSP];
}
class MADD4 {
@@ -410,11 +495,13 @@ class ABI_NOT_N64 {
list<Predicate> AdditionalPredicates = [IsNotN64];
}
+class FPOP_FUSION_FAST {
+ list <Predicate> AdditionalPredicates = [AllowFPOpFusion];
+}
+
//===----------------------------------------------------------------------===//
-class MipsPat<dag pattern, dag result> : Pat<pattern, result>, PredicateControl {
- let EncodingPredicates = [HasStdEnc];
-}
+class MipsPat<dag pattern, dag result> : Pat<pattern, result>, PredicateControl;
class MipsInstAlias<string Asm, dag Result, bit Emit = 0b1> :
InstAlias<Asm, Result, Emit>, PredicateControl;
@@ -1044,6 +1131,15 @@ def MipsMemSimm16AsmOperand : AsmOperandClass {
let DiagnosticType = "MemSImm16";
}
+def MipsMemSimmPtrAsmOperand : AsmOperandClass {
+ let Name = "MemOffsetSimmPtr";
+ let SuperClasses = [MipsMemAsmOperand];
+ let RenderMethod = "addMemOperands";
+ let ParserMethod = "parseMemOperand";
+ let PredicateMethod = "isMemWithPtrSizeOffset";
+ let DiagnosticType = "MemSImmPtr";
+}
+
def MipsInvertedImmoperand : AsmOperandClass {
let Name = "InvNum";
let RenderMethod = "addImmOperands";
@@ -1117,6 +1213,10 @@ def mem_simm16 : mem_generic {
let ParserMatchClass = MipsMemSimm16AsmOperand;
}
+def mem_simmptr : mem_generic {
+ let ParserMatchClass = MipsMemSimmPtrAsmOperand;
+}
+
def mem_ea : Operand<iPTR> {
let PrintMethod = "printMemOperandEA";
let MIOperandInfo = (ops ptr_rc, simm16);
@@ -1333,6 +1433,7 @@ class LoadMemory<string opstr, DAGOperand RO, DAGOperand MO,
[(set RO:$rt, (OpNode Addr:$addr))], Itin, FrmI, opstr> {
let DecoderMethod = "DecodeMem";
let canFoldAsLoad = 1;
+ string BaseOpcode = opstr;
let mayLoad = 1;
}
@@ -1346,6 +1447,7 @@ class StoreMemory<string opstr, DAGOperand RO, DAGOperand MO,
InstSE<(outs), (ins RO:$rt, MO:$addr), !strconcat(opstr, "\t$rt, $addr"),
[(OpNode RO:$rt, Addr:$addr)], Itin, FrmI, opstr> {
let DecoderMethod = "DecodeMem";
+ string BaseOpcode = opstr;
let mayStore = 1;
}
@@ -1363,6 +1465,7 @@ class LoadLeftRight<string opstr, SDNode OpNode, RegisterOperand RO,
[(set RO:$rt, (OpNode addr:$addr, RO:$src))], Itin, FrmI> {
let DecoderMethod = "DecodeMem";
string Constraints = "$src = $rt";
+ let BaseOpcode = opstr;
}
class StoreLeftRight<string opstr, SDNode OpNode, RegisterOperand RO,
@@ -1370,6 +1473,7 @@ class StoreLeftRight<string opstr, SDNode OpNode, RegisterOperand RO,
InstSE<(outs), (ins RO:$rt, mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
[(OpNode RO:$rt, addr:$addr)], Itin, FrmI> {
let DecoderMethod = "DecodeMem";
+ let BaseOpcode = opstr;
}
// COP2 Load/Store
@@ -1540,13 +1644,14 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, hasDelaySlot = 1,
PseudoSE<(outs), (ins calltarget:$target), [], II_J>,
PseudoInstExpansion<(JumpInst Opnd:$target)>;
- class TailCallReg<RegisterOperand RO> :
- PseudoSE<(outs), (ins RO:$rs), [(MipsTailCall RO:$rs)], II_JR>;
+ class TailCallReg<Instruction JumpInst, RegisterOperand RO> :
+ PseudoSE<(outs), (ins RO:$rs), [(MipsTailCall RO:$rs)], II_JR>,
+ PseudoInstExpansion<(JumpInst RO:$rs)>;
}
-class BAL_BR_Pseudo<Instruction RealInst> :
- PseudoSE<(outs), (ins brtarget:$offset), [], II_BCCZAL>,
- PseudoInstExpansion<(RealInst ZERO, brtarget:$offset)> {
+class BAL_BR_Pseudo<Instruction RealInst, DAGOperand opnd> :
+ PseudoSE<(outs), (ins opnd:$offset), [], II_BCCZAL>,
+ PseudoInstExpansion<(RealInst ZERO, opnd:$offset)> {
let isBranch = 1;
let isTerminator = 1;
let isBarrier = 1;
@@ -1588,8 +1693,8 @@ class SYNC_FT<string opstr> :
InstSE<(outs), (ins uimm5:$stype), "sync $stype",
[(MipsSync immZExt5:$stype)], II_SYNC, FrmOther, opstr>;
-class SYNCI_FT<string opstr> :
- InstSE<(outs), (ins mem_simm16:$addr), !strconcat(opstr, "\t$addr"), [],
+class SYNCI_FT<string opstr, DAGOperand MO> :
+ InstSE<(outs), (ins MO:$addr), !strconcat(opstr, "\t$addr"), [],
II_SYNCI, FrmOther, opstr> {
let hasSideEffects = 1;
let DecoderMethod = "DecodeSyncI";
@@ -1661,6 +1766,7 @@ class MoveFromLOHI<string opstr, RegisterOperand RO, Register UseReg>:
FrmR, opstr> {
let Uses = [UseReg];
let hasSideEffects = 0;
+ let isMoveReg = 1;
}
class PseudoMTLOHI<RegisterClass DstRC, RegisterClass SrcRC>
@@ -1673,6 +1779,7 @@ class MoveToLOHI<string opstr, RegisterOperand RO, list<Register> DefRegs>:
FrmR, opstr> {
let Defs = DefRegs;
let hasSideEffects = 0;
+ let isMoveReg = 1;
}
class EffectiveAddress<string opstr, RegisterOperand RO> :
@@ -1711,8 +1818,8 @@ class SubwordSwap<string opstr, RegisterOperand RO,
// Read Hardware
class ReadHardware<RegisterOperand CPURegOperand, RegisterOperand RO> :
- InstSE<(outs CPURegOperand:$rt), (ins RO:$rd), "rdhwr\t$rt, $rd", [],
- II_RDHWR, FrmR, "rdhwr">;
+ InstSE<(outs CPURegOperand:$rt), (ins RO:$rd, uimm8:$sel),
+ "rdhwr\t$rt, $rd, $sel", [], II_RDHWR, FrmR, "rdhwr">;
// Ext and Ins
class ExtBase<string opstr, RegisterOperand RO, Operand PosOpnd,
@@ -1721,7 +1828,7 @@ class ExtBase<string opstr, RegisterOperand RO, Operand PosOpnd,
InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, SizeOpnd:$size),
!strconcat(opstr, "\t$rt, $rs, $pos, $size"),
[(set RO:$rt, (Op RO:$rs, PosImm:$pos, SizeImm:$size))], II_EXT,
- FrmR, opstr>, ISA_MIPS32R2;
+ FrmR, opstr>;
// 'ins' and its 64-bit variants are matched by C++ code.
class InsBase<string opstr, RegisterOperand RO, Operand PosOpnd,
@@ -1730,7 +1837,7 @@ class InsBase<string opstr, RegisterOperand RO, Operand PosOpnd,
!strconcat(opstr, "\t$rt, $rs, $pos, $size"),
[(set RO:$rt, (null_frag RO:$rs, PosImm:$pos, SizeImm:$size,
RO:$src))],
- II_INS, FrmR, opstr>, ISA_MIPS32R2 {
+ II_INS, FrmR, opstr> {
let Constraints = "$src = $rt";
}
@@ -1739,11 +1846,37 @@ class Atomic2Ops<PatFrag Op, RegisterClass DRC> :
PseudoSE<(outs DRC:$dst), (ins PtrRC:$ptr, DRC:$incr),
[(set DRC:$dst, (Op iPTR:$ptr, DRC:$incr))]>;
+class Atomic2OpsPostRA<RegisterClass RC> :
+ PseudoSE<(outs RC:$dst), (ins PtrRC:$ptr, RC:$incr), []> {
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
+class Atomic2OpsSubwordPostRA<RegisterClass RC> :
+ PseudoSE<(outs RC:$dst), (ins PtrRC:$ptr, RC:$incr, RC:$mask, RC:$mask2,
+ RC:$shiftamnt), []>;
+
// Atomic Compare & Swap.
+// Atomic compare and swap is lowered into two stages. The first stage happens
+// during ISelLowering, which produces the PostRA version of this instruction.
class AtomicCmpSwap<PatFrag Op, RegisterClass DRC> :
PseudoSE<(outs DRC:$dst), (ins PtrRC:$ptr, DRC:$cmp, DRC:$swap),
[(set DRC:$dst, (Op iPTR:$ptr, DRC:$cmp, DRC:$swap))]>;
+class AtomicCmpSwapPostRA<RegisterClass RC> :
+ PseudoSE<(outs RC:$dst), (ins PtrRC:$ptr, RC:$cmp, RC:$swap), []> {
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
+class AtomicCmpSwapSubwordPostRA<RegisterClass RC> :
+ PseudoSE<(outs RC:$dst), (ins PtrRC:$ptr, RC:$mask, RC:$ShiftCmpVal,
+ RC:$mask2, RC:$ShiftNewVal, RC:$ShiftAmt), []> {
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
+
class LLBase<string opstr, RegisterOperand RO, DAGOperand MO = mem> :
InstSE<(outs RO:$rt), (ins MO:$addr), !strconcat(opstr, "\t$rt, $addr"),
[], II_LL, FrmI, opstr> {
@@ -1762,12 +1895,16 @@ class SCBase<string opstr, RegisterOperand RO> :
class MFC3OP<string asmstr, RegisterOperand RO, RegisterOperand RD,
InstrItinClass itin> :
InstSE<(outs RO:$rt), (ins RD:$rd, uimm3:$sel),
- !strconcat(asmstr, "\t$rt, $rd, $sel"), [], itin, FrmFR>;
+ !strconcat(asmstr, "\t$rt, $rd, $sel"), [], itin, FrmFR> {
+ let BaseOpcode = asmstr;
+}
class MTC3OP<string asmstr, RegisterOperand RO, RegisterOperand RD,
InstrItinClass itin> :
InstSE<(outs RO:$rd), (ins RD:$rt, uimm3:$sel),
- !strconcat(asmstr, "\t$rt, $rd, $sel"), [], itin, FrmFR>;
+ !strconcat(asmstr, "\t$rt, $rd, $sel"), [], itin, FrmFR> {
+ let BaseOpcode = asmstr;
+}
class TrapBase<Instruction RealInst>
: PseudoSE<(outs), (ins), [(trap)], II_TRAP>,
@@ -1825,8 +1962,36 @@ let usesCustomInserter = 1 in {
def ATOMIC_CMP_SWAP_I8 : AtomicCmpSwap<atomic_cmp_swap_8, GPR32>;
def ATOMIC_CMP_SWAP_I16 : AtomicCmpSwap<atomic_cmp_swap_16, GPR32>;
def ATOMIC_CMP_SWAP_I32 : AtomicCmpSwap<atomic_cmp_swap_32, GPR32>;
+
}
+def ATOMIC_LOAD_ADD_I8_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
+def ATOMIC_LOAD_ADD_I16_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
+def ATOMIC_LOAD_ADD_I32_POSTRA : Atomic2OpsPostRA<GPR32>;
+def ATOMIC_LOAD_SUB_I8_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
+def ATOMIC_LOAD_SUB_I16_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
+def ATOMIC_LOAD_SUB_I32_POSTRA : Atomic2OpsPostRA<GPR32>;
+def ATOMIC_LOAD_AND_I8_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
+def ATOMIC_LOAD_AND_I16_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
+def ATOMIC_LOAD_AND_I32_POSTRA : Atomic2OpsPostRA<GPR32>;
+def ATOMIC_LOAD_OR_I8_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
+def ATOMIC_LOAD_OR_I16_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
+def ATOMIC_LOAD_OR_I32_POSTRA : Atomic2OpsPostRA<GPR32>;
+def ATOMIC_LOAD_XOR_I8_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
+def ATOMIC_LOAD_XOR_I16_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
+def ATOMIC_LOAD_XOR_I32_POSTRA : Atomic2OpsPostRA<GPR32>;
+def ATOMIC_LOAD_NAND_I8_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
+def ATOMIC_LOAD_NAND_I16_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
+def ATOMIC_LOAD_NAND_I32_POSTRA : Atomic2OpsPostRA<GPR32>;
+
+def ATOMIC_SWAP_I8_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
+def ATOMIC_SWAP_I16_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
+def ATOMIC_SWAP_I32_POSTRA : Atomic2OpsPostRA<GPR32>;
+
+def ATOMIC_CMP_SWAP_I8_POSTRA : AtomicCmpSwapSubwordPostRA<GPR32>;
+def ATOMIC_CMP_SWAP_I16_POSTRA : AtomicCmpSwapSubwordPostRA<GPR32>;
+def ATOMIC_CMP_SWAP_I32_POSTRA : AtomicCmpSwapPostRA<GPR32>;
+
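As a side note on the two-stage lowering mentioned in the comment above: the *_POSTRA pseudos carry the observable semantics of an ordinary compare-and-swap, and only the post-RA expansion turns them into an ll/sc retry loop. A minimal C sketch of that behaviour, assuming C11 atomics; nothing below is LLVM code and the function name is illustrative:

    #include <stdatomic.h>
    #include <stdint.h>

    /* Behaviour implemented by ATOMIC_CMP_SWAP_I32 and its _POSTRA pseudo:
     * store `swap` into *ptr only if *ptr currently equals `cmp`, and return
     * the value that was in *ptr before the operation. */
    static uint32_t cmp_swap_i32(_Atomic uint32_t *ptr, uint32_t cmp,
                                 uint32_t swap) {
      uint32_t expected = cmp;
      atomic_compare_exchange_strong(ptr, &expected, swap);
      return expected; /* holds the old value on success and on failure alike */
    }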
/// Pseudo instructions for loading and storing accumulator registers.
let isPseudo = 1, isCodeGenOnly = 1, hasNoSchedulingInfo = 1 in {
def LOAD_ACC64 : Load<"", ACC64>;
@@ -1856,69 +2021,72 @@ def LONG_BRANCH_ADDiu : PseudoSE<(outs GPR32Opnd:$dst),
let AdditionalPredicates = [NotInMicroMips] in {
def ADDiu : MMRel, StdMMR6Rel, ArithLogicI<"addiu", simm16_relaxed, GPR32Opnd,
II_ADDIU, immSExt16, add>,
- ADDI_FM<0x9>, IsAsCheapAsAMove;
+ ADDI_FM<0x9>, IsAsCheapAsAMove, ISA_MIPS1;
def ANDi : MMRel, StdMMR6Rel,
ArithLogicI<"andi", uimm16, GPR32Opnd, II_ANDI, immZExt16, and>,
- ADDI_FM<0xc>;
+ ADDI_FM<0xc>, ISA_MIPS1;
def ORi : MMRel, StdMMR6Rel,
ArithLogicI<"ori", uimm16, GPR32Opnd, II_ORI, immZExt16, or>,
- ADDI_FM<0xd>;
+ ADDI_FM<0xd>, ISA_MIPS1;
def XORi : MMRel, StdMMR6Rel,
ArithLogicI<"xori", uimm16, GPR32Opnd, II_XORI, immZExt16, xor>,
- ADDI_FM<0xe>;
-}
-def ADDi : MMRel, ArithLogicI<"addi", simm16_relaxed, GPR32Opnd, II_ADDI>, ADDI_FM<0x8>,
- ISA_MIPS1_NOT_32R6_64R6;
-let AdditionalPredicates = [NotInMicroMips] in {
+ ADDI_FM<0xe>, ISA_MIPS1;
+ def ADDi : MMRel, ArithLogicI<"addi", simm16_relaxed, GPR32Opnd, II_ADDI>,
+ ADDI_FM<0x8>, ISA_MIPS1_NOT_32R6_64R6;
def SLTi : MMRel, SetCC_I<"slti", setlt, simm16, immSExt16, GPR32Opnd>,
- SLTI_FM<0xa>;
+ SLTI_FM<0xa>, ISA_MIPS1;
def SLTiu : MMRel, SetCC_I<"sltiu", setult, simm16, immSExt16, GPR32Opnd>,
- SLTI_FM<0xb>;
-}
-def LUi : MMRel, LoadUpper<"lui", GPR32Opnd, uimm16_relaxed>, LUI_FM;
-let AdditionalPredicates = [NotInMicroMips] in {
+ SLTI_FM<0xb>, ISA_MIPS1;
+
+ def LUi : MMRel, LoadUpper<"lui", GPR32Opnd, uimm16_relaxed>, LUI_FM,
+ ISA_MIPS1;
+
/// Arithmetic Instructions (3-Operand, R-Type)
def ADDu : MMRel, StdMMR6Rel, ArithLogicR<"addu", GPR32Opnd, 1, II_ADDU, add>,
- ADD_FM<0, 0x21>;
+ ADD_FM<0, 0x21>, ISA_MIPS1;
def SUBu : MMRel, StdMMR6Rel, ArithLogicR<"subu", GPR32Opnd, 0, II_SUBU, sub>,
- ADD_FM<0, 0x23>;
-}
-let Defs = [HI0, LO0] in
-def MUL : MMRel, ArithLogicR<"mul", GPR32Opnd, 1, II_MUL, mul>,
- ADD_FM<0x1c, 2>, ISA_MIPS32_NOT_32R6_64R6;
-def ADD : MMRel, StdMMR6Rel, ArithLogicR<"add", GPR32Opnd, 1, II_ADD>, ADD_FM<0, 0x20>;
-def SUB : MMRel, StdMMR6Rel, ArithLogicR<"sub", GPR32Opnd, 0, II_SUB>, ADD_FM<0, 0x22>;
-let AdditionalPredicates = [NotInMicroMips] in {
- def SLT : MMRel, SetCC_R<"slt", setlt, GPR32Opnd>, ADD_FM<0, 0x2a>;
- def SLTu : MMRel, SetCC_R<"sltu", setult, GPR32Opnd>, ADD_FM<0, 0x2b>;
+ ADD_FM<0, 0x23>, ISA_MIPS1;
+
+ let Defs = [HI0, LO0] in
+ def MUL : MMRel, ArithLogicR<"mul", GPR32Opnd, 1, II_MUL, mul>,
+ ADD_FM<0x1c, 2>, ISA_MIPS32_NOT_32R6_64R6;
+
+ def ADD : MMRel, StdMMR6Rel, ArithLogicR<"add", GPR32Opnd, 1, II_ADD>,
+ ADD_FM<0, 0x20>, ISA_MIPS1;
+ def SUB : MMRel, StdMMR6Rel, ArithLogicR<"sub", GPR32Opnd, 0, II_SUB>,
+ ADD_FM<0, 0x22>, ISA_MIPS1;
+
+ def SLT : MMRel, SetCC_R<"slt", setlt, GPR32Opnd>, ADD_FM<0, 0x2a>,
+ ISA_MIPS1;
+ def SLTu : MMRel, SetCC_R<"sltu", setult, GPR32Opnd>, ADD_FM<0, 0x2b>,
+ ISA_MIPS1;
def AND : MMRel, StdMMR6Rel, ArithLogicR<"and", GPR32Opnd, 1, II_AND, and>,
- ADD_FM<0, 0x24>;
+ ADD_FM<0, 0x24>, ISA_MIPS1;
def OR : MMRel, StdMMR6Rel, ArithLogicR<"or", GPR32Opnd, 1, II_OR, or>,
- ADD_FM<0, 0x25>;
+ ADD_FM<0, 0x25>, ISA_MIPS1;
def XOR : MMRel, StdMMR6Rel, ArithLogicR<"xor", GPR32Opnd, 1, II_XOR, xor>,
- ADD_FM<0, 0x26>;
- def NOR : MMRel, StdMMR6Rel, LogicNOR<"nor", GPR32Opnd>, ADD_FM<0, 0x27>;
+ ADD_FM<0, 0x26>, ISA_MIPS1;
+ def NOR : MMRel, StdMMR6Rel, LogicNOR<"nor", GPR32Opnd>, ADD_FM<0, 0x27>,
+ ISA_MIPS1;
}
-/// Shift Instructions
-let AdditionalPredicates = [NotInMicroMips] in {
-def SLL : MMRel, shift_rotate_imm<"sll", uimm5, GPR32Opnd, II_SLL, shl,
- immZExt5>, SRA_FM<0, 0>;
-def SRL : MMRel, shift_rotate_imm<"srl", uimm5, GPR32Opnd, II_SRL, srl,
- immZExt5>, SRA_FM<2, 0>;
-def SRA : MMRel, shift_rotate_imm<"sra", uimm5, GPR32Opnd, II_SRA, sra,
- immZExt5>, SRA_FM<3, 0>;
-def SLLV : MMRel, shift_rotate_reg<"sllv", GPR32Opnd, II_SLLV, shl>,
- SRLV_FM<4, 0>;
-def SRLV : MMRel, shift_rotate_reg<"srlv", GPR32Opnd, II_SRLV, srl>,
- SRLV_FM<6, 0>;
-def SRAV : MMRel, shift_rotate_reg<"srav", GPR32Opnd, II_SRAV, sra>,
- SRLV_FM<7, 0>;
-}
-
-// Rotate Instructions
let AdditionalPredicates = [NotInMicroMips] in {
+ /// Shift Instructions
+ def SLL : MMRel, shift_rotate_imm<"sll", uimm5, GPR32Opnd, II_SLL, shl,
+ immZExt5>, SRA_FM<0, 0>, ISA_MIPS1;
+ def SRL : MMRel, shift_rotate_imm<"srl", uimm5, GPR32Opnd, II_SRL, srl,
+ immZExt5>, SRA_FM<2, 0>, ISA_MIPS1;
+ def SRA : MMRel, shift_rotate_imm<"sra", uimm5, GPR32Opnd, II_SRA, sra,
+ immZExt5>, SRA_FM<3, 0>, ISA_MIPS1;
+ def SLLV : MMRel, shift_rotate_reg<"sllv", GPR32Opnd, II_SLLV, shl>,
+ SRLV_FM<4, 0>, ISA_MIPS1;
+ def SRLV : MMRel, shift_rotate_reg<"srlv", GPR32Opnd, II_SRLV, srl>,
+ SRLV_FM<6, 0>, ISA_MIPS1;
+ def SRAV : MMRel, shift_rotate_reg<"srav", GPR32Opnd, II_SRAV, sra>,
+ SRLV_FM<7, 0>, ISA_MIPS1;
+
+ // Rotate Instructions
def ROTR : MMRel, shift_rotate_imm<"rotr", uimm5, GPR32Opnd, II_ROTR, rotr,
immZExt5>,
SRA_FM<2, 1>, ISA_MIPS32R2;
@@ -1928,39 +2096,35 @@ let AdditionalPredicates = [NotInMicroMips] in {
/// Load and Store Instructions
/// aligned
-def LB : LoadMemory<"lb", GPR32Opnd, mem_simm16, sextloadi8, II_LB>, MMRel,
- LW_FM<0x20>;
-def LBu : LoadMemory<"lbu", GPR32Opnd, mem_simm16, zextloadi8, II_LBU,
- addrDefault>, MMRel, LW_FM<0x24>;
let AdditionalPredicates = [NotInMicroMips] in {
- def LH : LoadMemory<"lh", GPR32Opnd, mem_simm16, sextloadi16, II_LH,
- addrDefault>, MMRel, LW_FM<0x21>;
- def LHu : LoadMemory<"lhu", GPR32Opnd, mem_simm16, zextloadi16, II_LHU>,
- MMRel, LW_FM<0x25>;
+ def LB : LoadMemory<"lb", GPR32Opnd, mem_simmptr, sextloadi8, II_LB>, MMRel,
+ LW_FM<0x20>, ISA_MIPS1;
+ def LBu : LoadMemory<"lbu", GPR32Opnd, mem_simmptr, zextloadi8, II_LBU,
+ addrDefault>, MMRel, LW_FM<0x24>, ISA_MIPS1;
+ def LH : LoadMemory<"lh", GPR32Opnd, mem_simmptr, sextloadi16, II_LH,
+ addrDefault>, MMRel, LW_FM<0x21>, ISA_MIPS1;
+ def LHu : LoadMemory<"lhu", GPR32Opnd, mem_simmptr, zextloadi16, II_LHU>,
+ MMRel, LW_FM<0x25>, ISA_MIPS1;
def LW : StdMMR6Rel, Load<"lw", GPR32Opnd, load, II_LW, addrDefault>, MMRel,
- LW_FM<0x23>;
-}
-def SB : StdMMR6Rel, Store<"sb", GPR32Opnd, truncstorei8, II_SB>, MMRel,
- LW_FM<0x28>;
-def SH : Store<"sh", GPR32Opnd, truncstorei16, II_SH>, MMRel, LW_FM<0x29>;
-let AdditionalPredicates = [NotInMicroMips] in {
-def SW : Store<"sw", GPR32Opnd, store, II_SW>, MMRel, LW_FM<0x2b>;
+ LW_FM<0x23>, ISA_MIPS1;
+ def SB : StdMMR6Rel, Store<"sb", GPR32Opnd, truncstorei8, II_SB>, MMRel,
+ LW_FM<0x28>, ISA_MIPS1;
+ def SH : Store<"sh", GPR32Opnd, truncstorei16, II_SH>, MMRel, LW_FM<0x29>,
+ ISA_MIPS1;
+ def SW : Store<"sw", GPR32Opnd, store, II_SW>, MMRel, LW_FM<0x2b>, ISA_MIPS1;
}
/// load/store left/right
-let EncodingPredicates = []<Predicate>, // FIXME: Lack of HasStdEnc is probably a bug
- AdditionalPredicates = [NotInMicroMips] in {
-def LWL : LoadLeftRight<"lwl", MipsLWL, GPR32Opnd, II_LWL>, LW_FM<0x22>,
+let AdditionalPredicates = [NotInMicroMips] in {
+def LWL : MMRel, LoadLeftRight<"lwl", MipsLWL, GPR32Opnd, II_LWL>, LW_FM<0x22>,
ISA_MIPS1_NOT_32R6_64R6;
-def LWR : LoadLeftRight<"lwr", MipsLWR, GPR32Opnd, II_LWR>, LW_FM<0x26>,
+def LWR : MMRel, LoadLeftRight<"lwr", MipsLWR, GPR32Opnd, II_LWR>, LW_FM<0x26>,
ISA_MIPS1_NOT_32R6_64R6;
-def SWL : StoreLeftRight<"swl", MipsSWL, GPR32Opnd, II_SWL>, LW_FM<0x2a>,
+def SWL : MMRel, StoreLeftRight<"swl", MipsSWL, GPR32Opnd, II_SWL>, LW_FM<0x2a>,
ISA_MIPS1_NOT_32R6_64R6;
-def SWR : StoreLeftRight<"swr", MipsSWR, GPR32Opnd, II_SWR>, LW_FM<0x2e>,
+def SWR : MMRel, StoreLeftRight<"swr", MipsSWR, GPR32Opnd, II_SWR>, LW_FM<0x2e>,
ISA_MIPS1_NOT_32R6_64R6;
-}
-let AdditionalPredicates = [NotInMicroMips] in {
// COP2 Memory Instructions
def LWC2 : StdMMR6Rel, LW_FT2<"lwc2", COP2Opnd, II_LWC2, load>, LW_FM<0x32>,
ISA_MIPS1_NOT_32R6_64R6;
@@ -1973,63 +2137,68 @@ def SDC2 : StdMMR6Rel, SW_FT2<"sdc2", COP2Opnd, II_SDC2, store>,
// COP3 Memory Instructions
let DecoderNamespace = "COP3_" in {
- def LWC3 : LW_FT3<"lwc3", COP3Opnd, II_LWC3, load>, LW_FM<0x33>;
- def SWC3 : SW_FT3<"swc3", COP3Opnd, II_SWC3, store>, LW_FM<0x3b>;
+ def LWC3 : LW_FT3<"lwc3", COP3Opnd, II_LWC3, load>, LW_FM<0x33>,
+ ISA_MIPS1_NOT_32R6_64R6, NOT_ASE_CNMIPS;
+ def SWC3 : SW_FT3<"swc3", COP3Opnd, II_SWC3, store>, LW_FM<0x3b>,
+ ISA_MIPS1_NOT_32R6_64R6, NOT_ASE_CNMIPS;
def LDC3 : LW_FT3<"ldc3", COP3Opnd, II_LDC3, load>, LW_FM<0x37>,
- ISA_MIPS2;
+ ISA_MIPS2, NOT_ASE_CNMIPS;
def SDC3 : SW_FT3<"sdc3", COP3Opnd, II_SDC3, store>, LW_FM<0x3f>,
- ISA_MIPS2;
+ ISA_MIPS2, NOT_ASE_CNMIPS;
}
def SYNC : MMRel, StdMMR6Rel, SYNC_FT<"sync">, SYNC_FM, ISA_MIPS2;
- def SYNCI : MMRel, StdMMR6Rel, SYNCI_FT<"synci">, SYNCI_FM, ISA_MIPS32R2;
+ def SYNCI : MMRel, StdMMR6Rel, SYNCI_FT<"synci", mem_simm16>, SYNCI_FM,
+ ISA_MIPS32R2;
}
let AdditionalPredicates = [NotInMicroMips] in {
- def TEQ : MMRel, TEQ_FT<"teq", GPR32Opnd, uimm10, II_TEQ>, TEQ_FM<0x34>, ISA_MIPS2;
- def TGE : MMRel, TEQ_FT<"tge", GPR32Opnd, uimm10, II_TGE>, TEQ_FM<0x30>, ISA_MIPS2;
- def TGEU : MMRel, TEQ_FT<"tgeu", GPR32Opnd, uimm10, II_TGEU>, TEQ_FM<0x31>, ISA_MIPS2;
- def TLT : MMRel, TEQ_FT<"tlt", GPR32Opnd, uimm10, II_TLT>, TEQ_FM<0x32>, ISA_MIPS2;
- def TLTU : MMRel, TEQ_FT<"tltu", GPR32Opnd, uimm10, II_TLTU>, TEQ_FM<0x33>, ISA_MIPS2;
- def TNE : MMRel, TEQ_FT<"tne", GPR32Opnd, uimm10, II_TNE>, TEQ_FM<0x36>, ISA_MIPS2;
+ def TEQ : MMRel, TEQ_FT<"teq", GPR32Opnd, uimm10, II_TEQ>, TEQ_FM<0x34>,
+ ISA_MIPS2;
+ def TGE : MMRel, TEQ_FT<"tge", GPR32Opnd, uimm10, II_TGE>, TEQ_FM<0x30>,
+ ISA_MIPS2;
+ def TGEU : MMRel, TEQ_FT<"tgeu", GPR32Opnd, uimm10, II_TGEU>, TEQ_FM<0x31>,
+ ISA_MIPS2;
+ def TLT : MMRel, TEQ_FT<"tlt", GPR32Opnd, uimm10, II_TLT>, TEQ_FM<0x32>,
+ ISA_MIPS2;
+ def TLTU : MMRel, TEQ_FT<"tltu", GPR32Opnd, uimm10, II_TLTU>, TEQ_FM<0x33>,
+ ISA_MIPS2;
+ def TNE : MMRel, TEQ_FT<"tne", GPR32Opnd, uimm10, II_TNE>, TEQ_FM<0x36>,
+ ISA_MIPS2;
+
+ def TEQI : MMRel, TEQI_FT<"teqi", GPR32Opnd, II_TEQI>, TEQI_FM<0xc>,
+ ISA_MIPS2_NOT_32R6_64R6;
+ def TGEI : MMRel, TEQI_FT<"tgei", GPR32Opnd, II_TGEI>, TEQI_FM<0x8>,
+ ISA_MIPS2_NOT_32R6_64R6;
+ def TGEIU : MMRel, TEQI_FT<"tgeiu", GPR32Opnd, II_TGEIU>, TEQI_FM<0x9>,
+ ISA_MIPS2_NOT_32R6_64R6;
+ def TLTI : MMRel, TEQI_FT<"tlti", GPR32Opnd, II_TLTI>, TEQI_FM<0xa>,
+ ISA_MIPS2_NOT_32R6_64R6;
+ def TTLTIU : MMRel, TEQI_FT<"tltiu", GPR32Opnd, II_TTLTIU>, TEQI_FM<0xb>,
+ ISA_MIPS2_NOT_32R6_64R6;
+ def TNEI : MMRel, TEQI_FT<"tnei", GPR32Opnd, II_TNEI>, TEQI_FM<0xe>,
+ ISA_MIPS2_NOT_32R6_64R6;
}
-def TEQI : MMRel, TEQI_FT<"teqi", GPR32Opnd, II_TEQI>, TEQI_FM<0xc>,
- ISA_MIPS2_NOT_32R6_64R6;
-def TGEI : MMRel, TEQI_FT<"tgei", GPR32Opnd, II_TGEI>, TEQI_FM<0x8>,
- ISA_MIPS2_NOT_32R6_64R6;
-def TGEIU : MMRel, TEQI_FT<"tgeiu", GPR32Opnd, II_TGEIU>, TEQI_FM<0x9>,
- ISA_MIPS2_NOT_32R6_64R6;
-def TLTI : MMRel, TEQI_FT<"tlti", GPR32Opnd, II_TLTI>, TEQI_FM<0xa>,
- ISA_MIPS2_NOT_32R6_64R6;
-def TTLTIU : MMRel, TEQI_FT<"tltiu", GPR32Opnd, II_TTLTIU>, TEQI_FM<0xb>,
- ISA_MIPS2_NOT_32R6_64R6;
-def TNEI : MMRel, TEQI_FT<"tnei", GPR32Opnd, II_TNEI>, TEQI_FM<0xe>,
- ISA_MIPS2_NOT_32R6_64R6;
-
let AdditionalPredicates = [NotInMicroMips] in {
-def BREAK : MMRel, StdMMR6Rel, BRK_FT<"break">, BRK_FM<0xd>;
-def SYSCALL : MMRel, SYS_FT<"syscall", uimm20, II_SYSCALL>, SYS_FM<0xc>;
-}
-def TRAP : TrapBase<BREAK>;
-let AdditionalPredicates = [NotInMicroMips] in {
-def SDBBP : MMRel, SYS_FT<"sdbbp", uimm20, II_SDBBP>, SDBBP_FM, ISA_MIPS32_NOT_32R6_64R6;
-}
+ def BREAK : MMRel, StdMMR6Rel, BRK_FT<"break">, BRK_FM<0xd>, ISA_MIPS1;
+ def SYSCALL : MMRel, SYS_FT<"syscall", uimm20, II_SYSCALL>, SYS_FM<0xc>,
+ ISA_MIPS1;
+ def TRAP : TrapBase<BREAK>, ISA_MIPS1;
+ def SDBBP : MMRel, SYS_FT<"sdbbp", uimm20, II_SDBBP>, SDBBP_FM,
+ ISA_MIPS32_NOT_32R6_64R6;
-let AdditionalPredicates = [NotInMicroMips] in {
def ERET : MMRel, ER_FT<"eret", II_ERET>, ER_FM<0x18, 0x0>, INSN_MIPS3_32;
- def ERETNC : MMRel, ER_FT<"eretnc", II_ERETNC>, ER_FM<0x18, 0x1>, ISA_MIPS32R5;
+ def ERETNC : MMRel, ER_FT<"eretnc", II_ERETNC>, ER_FM<0x18, 0x1>,
+ ISA_MIPS32R5;
def DERET : MMRel, ER_FT<"deret", II_DERET>, ER_FM<0x1f, 0x0>, ISA_MIPS32;
-}
-let AdditionalPredicates = [NotInMicroMips] in {
- def EI : MMRel, StdMMR6Rel, DEI_FT<"ei", GPR32Opnd, II_EI>, EI_FM<1>, ISA_MIPS32R2;
- def DI : MMRel, StdMMR6Rel, DEI_FT<"di", GPR32Opnd, II_DI>, EI_FM<0>, ISA_MIPS32R2;
-}
+ def EI : MMRel, StdMMR6Rel, DEI_FT<"ei", GPR32Opnd, II_EI>, EI_FM<1>,
+ ISA_MIPS32R2;
+ def DI : MMRel, StdMMR6Rel, DEI_FT<"di", GPR32Opnd, II_DI>, EI_FM<0>,
+ ISA_MIPS32R2;
-let EncodingPredicates = []<Predicate>, // FIXME: Lack of HasStdEnc is probably a bug
- AdditionalPredicates = [NotInMicroMips] in {
-def WAIT : WAIT_FT<"wait">, WAIT_FM;
+ def WAIT : MMRel, StdMMR6Rel, WAIT_FT<"wait">, WAIT_FM, INSN_MIPS3_32;
}
let AdditionalPredicates = [NotInMicroMips] in {
@@ -2037,75 +2206,86 @@ let AdditionalPredicates = [NotInMicroMips] in {
def LL : LLBase<"ll", GPR32Opnd>, LW_FM<0x30>, PTR_32, ISA_MIPS2_NOT_32R6_64R6;
def SC : SCBase<"sc", GPR32Opnd>, LW_FM<0x38>, PTR_32, ISA_MIPS2_NOT_32R6_64R6;
}
-
/// Jump and Branch Instructions
+let AdditionalPredicates = [NotInMicroMips, RelocNotPIC] in
def J : MMRel, JumpFJ<jmptarget, "j", br, bb, "j">, FJ<2>,
- AdditionalRequires<[RelocNotPIC, NotInMicroMips]>, IsBranch;
-def JR : MMRel, IndirectBranch<"jr", GPR32Opnd>, MTLO_FM<8>, ISA_MIPS1_NOT_32R6_64R6;
-def BEQ : MMRel, CBranch<"beq", brtarget, seteq, GPR32Opnd>, BEQ_FM<4>;
+ IsBranch, ISA_MIPS1;
+
+let AdditionalPredicates = [NotInMicroMips] in {
+def JR : MMRel, IndirectBranch<"jr", GPR32Opnd>, MTLO_FM<8>, ISA_MIPS1_NOT_32R6_64R6;
+def BEQ : MMRel, CBranch<"beq", brtarget, seteq, GPR32Opnd>, BEQ_FM<4>,
+ ISA_MIPS1;
def BEQL : MMRel, CBranchLikely<"beql", brtarget, GPR32Opnd>,
BEQ_FM<20>, ISA_MIPS2_NOT_32R6_64R6;
-def BNE : MMRel, CBranch<"bne", brtarget, setne, GPR32Opnd>, BEQ_FM<5>;
+def BNE : MMRel, CBranch<"bne", brtarget, setne, GPR32Opnd>, BEQ_FM<5>,
+ ISA_MIPS1;
def BNEL : MMRel, CBranchLikely<"bnel", brtarget, GPR32Opnd>,
BEQ_FM<21>, ISA_MIPS2_NOT_32R6_64R6;
def BGEZ : MMRel, CBranchZero<"bgez", brtarget, setge, GPR32Opnd>,
- BGEZ_FM<1, 1>;
+ BGEZ_FM<1, 1>, ISA_MIPS1;
def BGEZL : MMRel, CBranchZeroLikely<"bgezl", brtarget, GPR32Opnd>,
BGEZ_FM<1, 3>, ISA_MIPS2_NOT_32R6_64R6;
def BGTZ : MMRel, CBranchZero<"bgtz", brtarget, setgt, GPR32Opnd>,
- BGEZ_FM<7, 0>;
+ BGEZ_FM<7, 0>, ISA_MIPS1;
def BGTZL : MMRel, CBranchZeroLikely<"bgtzl", brtarget, GPR32Opnd>,
BGEZ_FM<23, 0>, ISA_MIPS2_NOT_32R6_64R6;
def BLEZ : MMRel, CBranchZero<"blez", brtarget, setle, GPR32Opnd>,
- BGEZ_FM<6, 0>;
+ BGEZ_FM<6, 0>, ISA_MIPS1;
def BLEZL : MMRel, CBranchZeroLikely<"blezl", brtarget, GPR32Opnd>,
BGEZ_FM<22, 0>, ISA_MIPS2_NOT_32R6_64R6;
def BLTZ : MMRel, CBranchZero<"bltz", brtarget, setlt, GPR32Opnd>,
- BGEZ_FM<1, 0>;
+ BGEZ_FM<1, 0>, ISA_MIPS1;
def BLTZL : MMRel, CBranchZeroLikely<"bltzl", brtarget, GPR32Opnd>,
BGEZ_FM<1, 2>, ISA_MIPS2_NOT_32R6_64R6;
-def B : UncondBranch<BEQ, brtarget>,
- AdditionalRequires<[NotInMicroMips]>;
+def B : UncondBranch<BEQ, brtarget>, ISA_MIPS1;
+
+def JAL : MMRel, JumpLink<"jal", calltarget>, FJ<3>, ISA_MIPS1;
-def JAL : MMRel, JumpLink<"jal", calltarget>, FJ<3>;
-let AdditionalPredicates = [NotInMicroMips] in {
- def JALR : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM;
- def JALRPseudo : JumpLinkRegPseudo<GPR32Opnd, JALR, RA>;
}
-def JALX : MMRel, JumpLink<"jalx", calltarget>, FJ<0x1D>,
- ISA_MIPS32_NOT_32R6_64R6;
-def BGEZAL : MMRel, BGEZAL_FT<"bgezal", brtarget, GPR32Opnd>, BGEZAL_FM<0x11>,
- ISA_MIPS1_NOT_32R6_64R6;
-def BGEZALL : MMRel, BGEZAL_FT<"bgezall", brtarget, GPR32Opnd>,
- BGEZAL_FM<0x13>, ISA_MIPS2_NOT_32R6_64R6;
-def BLTZAL : MMRel, BGEZAL_FT<"bltzal", brtarget, GPR32Opnd>, BGEZAL_FM<0x10>,
- ISA_MIPS1_NOT_32R6_64R6;
-def BLTZALL : MMRel, BGEZAL_FT<"bltzall", brtarget, GPR32Opnd>,
- BGEZAL_FM<0x12>, ISA_MIPS2_NOT_32R6_64R6;
-def BAL_BR : BAL_BR_Pseudo<BGEZAL>;
+let AdditionalPredicates = [NotInMicroMips, NoIndirectJumpGuards] in {
+ def JALR : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM, ISA_MIPS1;
+ def JALRPseudo : JumpLinkRegPseudo<GPR32Opnd, JALR, RA>, ISA_MIPS1;
+}
+let AdditionalPredicates = [NotInMicroMips] in {
+ def JALX : MMRel, JumpLink<"jalx", calltarget>, FJ<0x1D>,
+ ISA_MIPS32_NOT_32R6_64R6;
+ def BGEZAL : MMRel, BGEZAL_FT<"bgezal", brtarget, GPR32Opnd>, BGEZAL_FM<0x11>,
+ ISA_MIPS1_NOT_32R6_64R6;
+ def BGEZALL : MMRel, BGEZAL_FT<"bgezall", brtarget, GPR32Opnd>,
+ BGEZAL_FM<0x13>, ISA_MIPS2_NOT_32R6_64R6;
+ def BLTZAL : MMRel, BGEZAL_FT<"bltzal", brtarget, GPR32Opnd>, BGEZAL_FM<0x10>,
+ ISA_MIPS1_NOT_32R6_64R6;
+ def BLTZALL : MMRel, BGEZAL_FT<"bltzall", brtarget, GPR32Opnd>,
+ BGEZAL_FM<0x12>, ISA_MIPS2_NOT_32R6_64R6;
+ def BAL_BR : BAL_BR_Pseudo<BGEZAL, brtarget>, ISA_MIPS1;
+}
let AdditionalPredicates = [NotInMips16Mode, NotInMicroMips] in {
- def TAILCALL : TailCall<J, jmptarget>;
+ def TAILCALL : TailCall<J, jmptarget>, ISA_MIPS1;
}
-
-def TAILCALLREG : TailCallReg<GPR32Opnd>;
+let AdditionalPredicates = [NotInMips16Mode, NotInMicroMips,
+ NoIndirectJumpGuards] in
+ def TAILCALLREG : TailCallReg<JR, GPR32Opnd>, ISA_MIPS1_NOT_32R6_64R6;
// Indirect branches are matched as PseudoIndirectBranch/PseudoIndirectBranch64
// then are expanded to JR, JR64, JALR, or JALR64 depending on the ISA.
-class PseudoIndirectBranchBase<RegisterOperand RO> :
+class PseudoIndirectBranchBase<Instruction JumpInst, RegisterOperand RO> :
MipsPseudo<(outs), (ins RO:$rs), [(brind RO:$rs)],
- II_IndirectBranchPseudo> {
+ II_IndirectBranchPseudo>,
+ PseudoInstExpansion<(JumpInst RO:$rs)> {
let isTerminator=1;
let isBarrier=1;
let hasDelaySlot = 1;
let isBranch = 1;
let isIndirectBranch = 1;
bit isCTI = 1;
- let Predicates = [NotInMips16Mode];
}
-def PseudoIndirectBranch : PseudoIndirectBranchBase<GPR32Opnd>;
+let AdditionalPredicates = [NotInMips16Mode, NotInMicroMips,
+ NoIndirectJumpGuards] in
+ def PseudoIndirectBranch : PseudoIndirectBranchBase<JR, GPR32Opnd>,
+ ISA_MIPS1_NOT_32R6_64R6;
// Return instructions are matched as a RetRA instruction, then are expanded
// into PseudoReturn/PseudoReturn64 after register allocation. Finally,
@@ -2147,64 +2327,61 @@ let Uses = [V0, V1], isTerminator = 1, isReturn = 1, isBarrier = 1, isCTI = 1 in
}
/// Multiply and Divide Instructions.
-def MULT : MMRel, Mult<"mult", II_MULT, GPR32Opnd, [HI0, LO0]>,
- MULT_FM<0, 0x18>, ISA_MIPS1_NOT_32R6_64R6;
-def MULTu : MMRel, Mult<"multu", II_MULTU, GPR32Opnd, [HI0, LO0]>,
- MULT_FM<0, 0x19>, ISA_MIPS1_NOT_32R6_64R6;
let AdditionalPredicates = [NotInMicroMips] in {
+ def MULT : MMRel, Mult<"mult", II_MULT, GPR32Opnd, [HI0, LO0]>,
+ MULT_FM<0, 0x18>, ISA_MIPS1_NOT_32R6_64R6;
+ def MULTu : MMRel, Mult<"multu", II_MULTU, GPR32Opnd, [HI0, LO0]>,
+ MULT_FM<0, 0x19>, ISA_MIPS1_NOT_32R6_64R6;
def SDIV : MMRel, Div<"div", II_DIV, GPR32Opnd, [HI0, LO0]>,
MULT_FM<0, 0x1a>, ISA_MIPS1_NOT_32R6_64R6;
def UDIV : MMRel, Div<"divu", II_DIVU, GPR32Opnd, [HI0, LO0]>,
MULT_FM<0, 0x1b>, ISA_MIPS1_NOT_32R6_64R6;
-}
-def MTHI : MMRel, MoveToLOHI<"mthi", GPR32Opnd, [HI0]>, MTLO_FM<0x11>,
- ISA_MIPS1_NOT_32R6_64R6;
-def MTLO : MMRel, MoveToLOHI<"mtlo", GPR32Opnd, [LO0]>, MTLO_FM<0x13>,
- ISA_MIPS1_NOT_32R6_64R6;
-let EncodingPredicates = []<Predicate>, // FIXME: Lack of HasStdEnc is probably a bug
- AdditionalPredicates = [NotInMicroMips] in {
-def MFHI : MMRel, MoveFromLOHI<"mfhi", GPR32Opnd, AC0>, MFLO_FM<0x10>,
- ISA_MIPS1_NOT_32R6_64R6;
-def MFLO : MMRel, MoveFromLOHI<"mflo", GPR32Opnd, AC0>, MFLO_FM<0x12>,
- ISA_MIPS1_NOT_32R6_64R6;
-}
+ def MTHI : MMRel, MoveToLOHI<"mthi", GPR32Opnd, [HI0]>, MTLO_FM<0x11>,
+ ISA_MIPS1_NOT_32R6_64R6;
+ def MTLO : MMRel, MoveToLOHI<"mtlo", GPR32Opnd, [LO0]>, MTLO_FM<0x13>,
+ ISA_MIPS1_NOT_32R6_64R6;
+ def MFHI : MMRel, MoveFromLOHI<"mfhi", GPR32Opnd, AC0>, MFLO_FM<0x10>,
+ ISA_MIPS1_NOT_32R6_64R6;
+ def MFLO : MMRel, MoveFromLOHI<"mflo", GPR32Opnd, AC0>, MFLO_FM<0x12>,
+ ISA_MIPS1_NOT_32R6_64R6;
-/// Sign Ext In Register Instructions.
-def SEB : MMRel, StdMMR6Rel, SignExtInReg<"seb", i8, GPR32Opnd, II_SEB>,
- SEB_FM<0x10, 0x20>, ISA_MIPS32R2;
-def SEH : MMRel, StdMMR6Rel, SignExtInReg<"seh", i16, GPR32Opnd, II_SEH>,
- SEB_FM<0x18, 0x20>, ISA_MIPS32R2;
+ /// Sign Ext In Register Instructions.
+ def SEB : MMRel, StdMMR6Rel, SignExtInReg<"seb", i8, GPR32Opnd, II_SEB>,
+ SEB_FM<0x10, 0x20>, ISA_MIPS32R2;
+ def SEH : MMRel, StdMMR6Rel, SignExtInReg<"seh", i16, GPR32Opnd, II_SEH>,
+ SEB_FM<0x18, 0x20>, ISA_MIPS32R2;
-/// Count Leading
-def CLZ : MMRel, CountLeading0<"clz", GPR32Opnd, II_CLZ>, CLO_FM<0x20>,
- ISA_MIPS32_NOT_32R6_64R6;
-def CLO : MMRel, CountLeading1<"clo", GPR32Opnd, II_CLO>, CLO_FM<0x21>,
- ISA_MIPS32_NOT_32R6_64R6;
+ /// Count Leading
+ def CLZ : MMRel, CountLeading0<"clz", GPR32Opnd, II_CLZ>, CLO_FM<0x20>,
+ ISA_MIPS32_NOT_32R6_64R6;
+ def CLO : MMRel, CountLeading1<"clo", GPR32Opnd, II_CLO>, CLO_FM<0x21>,
+ ISA_MIPS32_NOT_32R6_64R6;
-let AdditionalPredicates = [NotInMicroMips] in {
/// Word Swap Bytes Within Halfwords
def WSBH : MMRel, SubwordSwap<"wsbh", GPR32Opnd, II_WSBH>, SEB_FM<2, 0x20>,
ISA_MIPS32R2;
-}
-/// No operation.
-def NOP : PseudoSE<(outs), (ins), []>, PseudoInstExpansion<(SLL ZERO, ZERO, 0)>;
+ /// No operation.
+ def NOP : PseudoSE<(outs), (ins), []>,
+ PseudoInstExpansion<(SLL ZERO, ZERO, 0)>, ISA_MIPS1;
-// FrameIndexes are legalized when they are operands from load/store
-// instructions. The same not happens for stack address copies, so an
-// add op with mem ComplexPattern is used and the stack address copy
-// can be matched. It's similar to Sparc LEA_ADDRi
-def LEA_ADDiu : MMRel, EffectiveAddress<"addiu", GPR32Opnd>, LW_FM<9>;
+ // FrameIndexes are legalized when they are operands from load/store
+ // instructions. The same does not happen for stack address copies, so an
+ // add op with mem ComplexPattern is used and the stack address copy
+ // can be matched. It's similar to Sparc LEA_ADDRi.
+ let AdditionalPredicates = [NotInMicroMips] in
+ def LEA_ADDiu : MMRel, EffectiveAddress<"addiu", GPR32Opnd>, LW_FM<9>, ISA_MIPS1;
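For illustration of the "stack address copy" case that the comment above describes, here is a hedged C example (the `consume` function is a hypothetical sink, not part of this diff): taking the address of a stack object without loading or storing through it leaves a bare FrameIndex with no memory operand to legalize it, which is exactly what the addiu-based LEA_ADDiu pattern matches.

    extern void consume(int *p); /* hypothetical helper, for illustration only */

    void stack_address_copy(void) {
      int local[4] = {0, 1, 2, 3};
      /* No load/store operand legalizes this FrameIndex: the array decays to
       * a plain stack address that is copied into an argument register. */
      consume(local);
    }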
-// MADD*/MSUB*
-def MADD : MMRel, MArithR<"madd", II_MADD, 1>, MULT_FM<0x1c, 0>,
- ISA_MIPS32_NOT_32R6_64R6;
-def MADDU : MMRel, MArithR<"maddu", II_MADDU, 1>, MULT_FM<0x1c, 1>,
- ISA_MIPS32_NOT_32R6_64R6;
-def MSUB : MMRel, MArithR<"msub", II_MSUB>, MULT_FM<0x1c, 4>,
- ISA_MIPS32_NOT_32R6_64R6;
-def MSUBU : MMRel, MArithR<"msubu", II_MSUBU>, MULT_FM<0x1c, 5>,
- ISA_MIPS32_NOT_32R6_64R6;
+ // MADD*/MSUB*
+ def MADD : MMRel, MArithR<"madd", II_MADD, 1>, MULT_FM<0x1c, 0>,
+ ISA_MIPS32_NOT_32R6_64R6;
+ def MADDU : MMRel, MArithR<"maddu", II_MADDU, 1>, MULT_FM<0x1c, 1>,
+ ISA_MIPS32_NOT_32R6_64R6;
+ def MSUB : MMRel, MArithR<"msub", II_MSUB>, MULT_FM<0x1c, 4>,
+ ISA_MIPS32_NOT_32R6_64R6;
+ def MSUBU : MMRel, MArithR<"msubu", II_MSUBU>, MULT_FM<0x1c, 5>,
+ ISA_MIPS32_NOT_32R6_64R6;
+}
let AdditionalPredicates = [NotDSP] in {
def PseudoMULT : MultDivPseudo<MULT, ACC64, GPR32Opnd, MipsMult, II_MULT>,
@@ -2229,35 +2406,39 @@ let AdditionalPredicates = [NotInMicroMips] in {
0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6;
def PseudoUDIV : MultDivPseudo<UDIV, ACC64, GPR32Opnd, MipsDivRemU, II_DIVU,
0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6;
- def RDHWR : MMRel, ReadHardware<GPR32Opnd, HWRegsOpnd>, RDHWR_FM;
+ def RDHWR : MMRel, ReadHardware<GPR32Opnd, HWRegsOpnd>, RDHWR_FM, ISA_MIPS1;
// TODO: Add '0 < pos+size <= 32' constraint check to ext instruction
def EXT : MMRel, StdMMR6Rel, ExtBase<"ext", GPR32Opnd, uimm5, uimm5_plus1,
immZExt5, immZExt5Plus1, MipsExt>,
- EXT_FM<0>;
+ EXT_FM<0>, ISA_MIPS32R2;
def INS : MMRel, StdMMR6Rel, InsBase<"ins", GPR32Opnd, uimm5,
uimm5_inssize_plus1, immZExt5,
immZExt5Plus1>,
- EXT_FM<4>;
+ EXT_FM<4>, ISA_MIPS32R2;
}
/// Move Control Registers From/To CPU Registers
let AdditionalPredicates = [NotInMicroMips] in {
- def MTC0 : MTC3OP<"mtc0", COP0Opnd, GPR32Opnd, II_MTC0>, MFC3OP_FM<0x10, 4>,
- ISA_MIPS32;
- def MFC0 : MFC3OP<"mfc0", GPR32Opnd, COP0Opnd, II_MFC0>, MFC3OP_FM<0x10, 0>,
- ISA_MIPS32;
+ def MTC0 : MTC3OP<"mtc0", COP0Opnd, GPR32Opnd, II_MTC0>,
+ MFC3OP_FM<0x10, 4, 0>, ISA_MIPS1;
+ def MFC0 : MFC3OP<"mfc0", GPR32Opnd, COP0Opnd, II_MFC0>,
+ MFC3OP_FM<0x10, 0, 0>, ISA_MIPS1;
+ def MFC2 : MFC3OP<"mfc2", GPR32Opnd, COP2Opnd, II_MFC2>,
+ MFC3OP_FM<0x12, 0, 0>, ISA_MIPS1;
+ def MTC2 : MTC3OP<"mtc2", COP2Opnd, GPR32Opnd, II_MTC2>,
+ MFC3OP_FM<0x12, 4, 0>, ISA_MIPS1;
}
-def MFC2 : MFC3OP<"mfc2", GPR32Opnd, COP2Opnd, II_MFC2>, MFC3OP_FM<0x12, 0>;
-def MTC2 : MTC3OP<"mtc2", COP2Opnd, GPR32Opnd, II_MTC2>, MFC3OP_FM<0x12, 4>;
class Barrier<string asmstr, InstrItinClass itin = NoItinerary> :
InstSE<(outs), (ins), asmstr, [], itin, FrmOther, asmstr>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def SSNOP : MMRel, StdMMR6Rel, Barrier<"ssnop", II_SSNOP>, BARRIER_FM<1>,
+ ISA_MIPS1;
+ def EHB : MMRel, Barrier<"ehb", II_EHB>, BARRIER_FM<3>, ISA_MIPS1;
-def SSNOP : MMRel, StdMMR6Rel, Barrier<"ssnop", II_SSNOP>, BARRIER_FM<1>;
-def EHB : MMRel, Barrier<"ehb", II_EHB>, BARRIER_FM<3>;
-
-let isCTI = 1 in
-def PAUSE : MMRel, StdMMR6Rel, Barrier<"pause", II_PAUSE>, BARRIER_FM<5>,
- ISA_MIPS32R2;
+ let isCTI = 1 in
+ def PAUSE : MMRel, StdMMR6Rel, Barrier<"pause", II_PAUSE>, BARRIER_FM<5>,
+ ISA_MIPS32R2;
+}
// JR_HB and JALR_HB are defined here using the new style naming
// scheme because some of this code is shared with Mips32r6InstrInfo.td
@@ -2278,8 +2459,8 @@ class JALR_HB_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
list<dag> Pattern = [];
}
-class JR_HB_DESC : InstSE<(outs), (ins), "", [], II_JR_HB, FrmJ>,
- JR_HB_DESC_BASE<"jr.hb", GPR32Opnd> {
+class JR_HB_DESC<RegisterOperand RO> :
+ InstSE<(outs), (ins), "", [], II_JR_HB, FrmJ>, JR_HB_DESC_BASE<"jr.hb", RO> {
let isBranch=1;
let isIndirectBranch=1;
let hasDelaySlot=1;
@@ -2288,8 +2469,9 @@ class JR_HB_DESC : InstSE<(outs), (ins), "", [], II_JR_HB, FrmJ>,
bit isCTI = 1;
}
-class JALR_HB_DESC : InstSE<(outs), (ins), "", [], II_JALR_HB, FrmJ>,
- JALR_HB_DESC_BASE<"jalr.hb", GPR32Opnd> {
+class JALR_HB_DESC<RegisterOperand RO> :
+ InstSE<(outs), (ins), "", [], II_JALR_HB, FrmJ>, JALR_HB_DESC_BASE<"jalr.hb",
+ RO> {
let isIndirectBranch=1;
let hasDelaySlot=1;
bit isCTI = 1;
@@ -2298,16 +2480,27 @@ class JALR_HB_DESC : InstSE<(outs), (ins), "", [], II_JALR_HB, FrmJ>,
class JR_HB_ENC : JR_HB_FM<8>;
class JALR_HB_ENC : JALR_HB_FM<9>;
-def JR_HB : JR_HB_DESC, JR_HB_ENC, ISA_MIPS32_NOT_32R6_64R6;
-def JALR_HB : JALR_HB_DESC, JALR_HB_ENC, ISA_MIPS32;
+def JR_HB : JR_HB_DESC<GPR32Opnd>, JR_HB_ENC, ISA_MIPS32R2_NOT_32R6_64R6;
+def JALR_HB : JALR_HB_DESC<GPR32Opnd>, JALR_HB_ENC, ISA_MIPS32;
+
+let AdditionalPredicates = [NotInMicroMips, UseIndirectJumpsHazard] in
+ def JALRHBPseudo : JumpLinkRegPseudo<GPR32Opnd, JALR_HB, RA>;
+
+
+let AdditionalPredicates = [NotInMips16Mode, NotInMicroMips,
+ UseIndirectJumpsHazard] in {
+ def TAILCALLREGHB : TailCallReg<JR_HB, GPR32Opnd>, ISA_MIPS32_NOT_32R6_64R6;
+ def PseudoIndirectHazardBranch : PseudoIndirectBranchBase<JR_HB, GPR32Opnd>,
+ ISA_MIPS32R2_NOT_32R6_64R6;
+}
class TLB<string asmstr, InstrItinClass itin = NoItinerary> :
InstSE<(outs), (ins), asmstr, [], itin, FrmOther, asmstr>;
let AdditionalPredicates = [NotInMicroMips] in {
-def TLBP : MMRel, TLB<"tlbp", II_TLBP>, COP0_TLB_FM<0x08>;
-def TLBR : MMRel, TLB<"tlbr", II_TLBR>, COP0_TLB_FM<0x01>;
-def TLBWI : MMRel, TLB<"tlbwi", II_TLBWI>, COP0_TLB_FM<0x02>;
-def TLBWR : MMRel, TLB<"tlbwr", II_TLBWR>, COP0_TLB_FM<0x06>;
+ def TLBP : MMRel, TLB<"tlbp", II_TLBP>, COP0_TLB_FM<0x08>, ISA_MIPS1;
+ def TLBR : MMRel, TLB<"tlbr", II_TLBR>, COP0_TLB_FM<0x01>, ISA_MIPS1;
+ def TLBWI : MMRel, TLB<"tlbwi", II_TLBWI>, COP0_TLB_FM<0x02>, ISA_MIPS1;
+ def TLBWR : MMRel, TLB<"tlbwr", II_TLBWR>, COP0_TLB_FM<0x06>, ISA_MIPS1;
}
class CacheOp<string instr_asm, Operand MemOpnd,
InstrItinClass itin = NoItinerary> :
@@ -2317,11 +2510,13 @@ class CacheOp<string instr_asm, Operand MemOpnd,
let DecoderMethod = "DecodeCacheOp";
}
-def CACHE : MMRel, CacheOp<"cache", mem, II_CACHE>, CACHEOP_FM<0b101111>,
- INSN_MIPS3_32_NOT_32R6_64R6;
-def PREF : MMRel, CacheOp<"pref", mem, II_PREF>, CACHEOP_FM<0b110011>,
- INSN_MIPS3_32_NOT_32R6_64R6;
-
+let AdditionalPredicates = [NotInMicroMips] in {
+ def CACHE : MMRel, CacheOp<"cache", mem, II_CACHE>, CACHEOP_FM<0b101111>,
+ INSN_MIPS3_32_NOT_32R6_64R6;
+ def PREF : MMRel, CacheOp<"pref", mem, II_PREF>, CACHEOP_FM<0b110011>,
+ INSN_MIPS3_32_NOT_32R6_64R6;
+}
+// FIXME: We are missing the prefx instruction.
def ROL : MipsAsmPseudoInst<(outs),
(ins GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rd),
"rol\t$rs, $rt, $rd">;
@@ -2398,6 +2593,38 @@ def MULOUMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rd, GPR32Opnd:$rs,
"mulou\t$rd, $rs, $rt">,
ISA_MIPS1_NOT_32R6_64R6;
+// Virtualization ASE
+class HYPCALL_FT<string opstr> :
+ InstSE<(outs), (ins uimm10:$code_),
+ !strconcat(opstr, "\t$code_"), [], II_HYPCALL, FrmOther, opstr> {
+ let BaseOpcode = opstr;
+}
+
+let AdditionalPredicates = [NotInMicroMips] in {
+ def MFGC0 : MMRel, MFC3OP<"mfgc0", GPR32Opnd, COP0Opnd, II_MFGC0>,
+ MFC3OP_FM<0x10, 3, 0>, ISA_MIPS32R5, ASE_VIRT;
+ def MTGC0 : MMRel, MTC3OP<"mtgc0", COP0Opnd, GPR32Opnd, II_MTGC0>,
+ MFC3OP_FM<0x10, 3, 2>, ISA_MIPS32R5, ASE_VIRT;
+ def MFHGC0 : MMRel, MFC3OP<"mfhgc0", GPR32Opnd, COP0Opnd, II_MFHGC0>,
+ MFC3OP_FM<0x10, 3, 4>, ISA_MIPS32R5, ASE_VIRT;
+ def MTHGC0 : MMRel, MTC3OP<"mthgc0", COP0Opnd, GPR32Opnd, II_MTHGC0>,
+ MFC3OP_FM<0x10, 3, 6>, ISA_MIPS32R5, ASE_VIRT;
+ def TLBGINV : MMRel, TLB<"tlbginv", II_TLBGINV>, COP0_TLB_FM<0b001011>,
+ ISA_MIPS32R5, ASE_VIRT;
+ def TLBGINVF : MMRel, TLB<"tlbginvf", II_TLBGINVF>, COP0_TLB_FM<0b001100>,
+ ISA_MIPS32R5, ASE_VIRT;
+ def TLBGP : MMRel, TLB<"tlbgp", II_TLBGP>, COP0_TLB_FM<0b010000>,
+ ISA_MIPS32R5, ASE_VIRT;
+ def TLBGR : MMRel, TLB<"tlbgr", II_TLBGR>, COP0_TLB_FM<0b001001>,
+ ISA_MIPS32R5, ASE_VIRT;
+ def TLBGWI : MMRel, TLB<"tlbgwi", II_TLBGWI>, COP0_TLB_FM<0b001010>,
+ ISA_MIPS32R5, ASE_VIRT;
+ def TLBGWR : MMRel, TLB<"tlbgwr", II_TLBGWR>, COP0_TLB_FM<0b001110>,
+ ISA_MIPS32R5, ASE_VIRT;
+ def HYPCALL : MMRel, HYPCALL_FT<"hypcall">,
+ HYPCALL_FM<0b101000>, ISA_MIPS32R5, ASE_VIRT;
+}
+
//===----------------------------------------------------------------------===//
// Instruction aliases
//===----------------------------------------------------------------------===//
@@ -2416,93 +2643,111 @@ multiclass OneOrTwoOperandMacroImmediateAlias<string Memnomic,
Imm:$imm), 0>;
}
-def : MipsInstAlias<"move $dst, $src",
- (OR GPR32Opnd:$dst, GPR32Opnd:$src, ZERO), 1>,
- GPR_32 {
- let AdditionalPredicates = [NotInMicroMips];
-}
-def : MipsInstAlias<"move $dst, $src",
- (ADDu GPR32Opnd:$dst, GPR32Opnd:$src, ZERO), 1>,
- GPR_32 {
- let AdditionalPredicates = [NotInMicroMips];
-}
-def : MipsInstAlias<"bal $offset", (BGEZAL ZERO, brtarget:$offset), 0>,
- ISA_MIPS1_NOT_32R6_64R6;
-
-def : MipsInstAlias<"j $rs", (JR GPR32Opnd:$rs), 0>;
-let Predicates = [NotInMicroMips] in {
-def : MipsInstAlias<"jalr $rs", (JALR RA, GPR32Opnd:$rs), 0>;
-}
-def : MipsInstAlias<"jalr.hb $rs", (JALR_HB RA, GPR32Opnd:$rs), 1>, ISA_MIPS32;
-def : MipsInstAlias<"neg $rt, $rs",
- (SUB GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>;
-def : MipsInstAlias<"neg $rt",
- (SUB GPR32Opnd:$rt, ZERO, GPR32Opnd:$rt), 1>;
-def : MipsInstAlias<"negu $rt, $rs",
- (SUBu GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>;
-def : MipsInstAlias<"negu $rt",
- (SUBu GPR32Opnd:$rt, ZERO, GPR32Opnd:$rt), 1>;
let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsInstAlias<"move $dst, $src",
+ (OR GPR32Opnd:$dst, GPR32Opnd:$src, ZERO), 1>,
+ GPR_32, ISA_MIPS1;
+ def : MipsInstAlias<"move $dst, $src",
+ (ADDu GPR32Opnd:$dst, GPR32Opnd:$src, ZERO), 1>,
+ GPR_32, ISA_MIPS1;
+
+ def : MipsInstAlias<"bal $offset", (BGEZAL ZERO, brtarget:$offset), 1>,
+ ISA_MIPS1_NOT_32R6_64R6;
+
+ def : MipsInstAlias<"j $rs", (JR GPR32Opnd:$rs), 0>, ISA_MIPS1;
+
+ def : MipsInstAlias<"jalr $rs", (JALR RA, GPR32Opnd:$rs), 0>;
+
+ def : MipsInstAlias<"jalr.hb $rs", (JALR_HB RA, GPR32Opnd:$rs), 1>,
+ ISA_MIPS32;
+
+ def : MipsInstAlias<"neg $rt, $rs",
+ (SUB GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>, ISA_MIPS1;
+ def : MipsInstAlias<"neg $rt",
+ (SUB GPR32Opnd:$rt, ZERO, GPR32Opnd:$rt), 1>, ISA_MIPS1;
+ def : MipsInstAlias<"negu $rt, $rs",
+ (SUBu GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>, ISA_MIPS1;
+ def : MipsInstAlias<"negu $rt",
+ (SUBu GPR32Opnd:$rt, ZERO, GPR32Opnd:$rt), 1>, ISA_MIPS1;
def : MipsInstAlias<
"sgt $rd, $rs, $rt",
- (SLT GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+ (SLT GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>, ISA_MIPS1;
def : MipsInstAlias<
"sgt $rs, $rt",
- (SLT GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+ (SLT GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>, ISA_MIPS1;
def : MipsInstAlias<
"sgtu $rd, $rs, $rt",
- (SLTu GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+ (SLTu GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>, ISA_MIPS1;
def : MipsInstAlias<
"sgtu $$rs, $rt",
- (SLTu GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+ (SLTu GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>, ISA_MIPS1;
def : MipsInstAlias<
"not $rt, $rs",
- (NOR GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>;
+ (NOR GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>, ISA_MIPS1;
def : MipsInstAlias<
"not $rt",
- (NOR GPR32Opnd:$rt, GPR32Opnd:$rt, ZERO), 0>;
- def : MipsInstAlias<"nop", (SLL ZERO, ZERO, 0), 1>;
-
- defm : OneOrTwoOperandMacroImmediateAlias<"add", ADDi>, ISA_MIPS1_NOT_32R6_64R6;
-
- defm : OneOrTwoOperandMacroImmediateAlias<"addu", ADDiu>;
-
- defm : OneOrTwoOperandMacroImmediateAlias<"and", ANDi>, GPR_32;
+ (NOR GPR32Opnd:$rt, GPR32Opnd:$rt, ZERO), 0>, ISA_MIPS1;
- defm : OneOrTwoOperandMacroImmediateAlias<"or", ORi>, GPR_32;
+ def : MipsInstAlias<"nop", (SLL ZERO, ZERO, 0), 1>, ISA_MIPS1;
- defm : OneOrTwoOperandMacroImmediateAlias<"xor", XORi>, GPR_32;
-
- defm : OneOrTwoOperandMacroImmediateAlias<"slt", SLTi>, GPR_32;
-
- defm : OneOrTwoOperandMacroImmediateAlias<"sltu", SLTiu>, GPR_32;
-}
-def : MipsInstAlias<"mfc0 $rt, $rd", (MFC0 GPR32Opnd:$rt, COP0Opnd:$rd, 0), 0>;
-def : MipsInstAlias<"mtc0 $rt, $rd", (MTC0 COP0Opnd:$rd, GPR32Opnd:$rt, 0), 0>;
-def : MipsInstAlias<"mfc2 $rt, $rd", (MFC2 GPR32Opnd:$rt, COP2Opnd:$rd, 0), 0>;
-def : MipsInstAlias<"mtc2 $rt, $rd", (MTC2 COP2Opnd:$rd, GPR32Opnd:$rt, 0), 0>;
-let AdditionalPredicates = [NotInMicroMips] in {
-def : MipsInstAlias<"b $offset", (BEQ ZERO, ZERO, brtarget:$offset), 0>;
-}
-def : MipsInstAlias<"bnez $rs,$offset",
- (BNE GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
-def : MipsInstAlias<"bnezl $rs,$offset",
- (BNEL GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
-def : MipsInstAlias<"beqz $rs,$offset",
- (BEQ GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
-def : MipsInstAlias<"beqzl $rs,$offset",
- (BEQL GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
-let AdditionalPredicates = [NotInMicroMips] in {
- def : MipsInstAlias<"syscall", (SYSCALL 0), 1>;
-}
+ defm : OneOrTwoOperandMacroImmediateAlias<"add", ADDi>, ISA_MIPS1_NOT_32R6_64R6;
-def : MipsInstAlias<"break", (BREAK 0, 0), 1>;
-def : MipsInstAlias<"break $imm", (BREAK uimm10:$imm, 0), 1>;
-let AdditionalPredicates = [NotInMicroMips] in {
+ defm : OneOrTwoOperandMacroImmediateAlias<"addu", ADDiu>, ISA_MIPS1;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"and", ANDi>, ISA_MIPS1, GPR_32;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"or", ORi>, ISA_MIPS1, GPR_32;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"xor", XORi>, ISA_MIPS1, GPR_32;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"slt", SLTi>, ISA_MIPS1, GPR_32;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"sltu", SLTiu>, ISA_MIPS1, GPR_32;
+
+ def : MipsInstAlias<"mfgc0 $rt, $rd",
+ (MFGC0 GPR32Opnd:$rt, COP0Opnd:$rd, 0), 0>,
+ ISA_MIPS32R5, ASE_VIRT;
+ def : MipsInstAlias<"mtgc0 $rt, $rd",
+ (MTGC0 COP0Opnd:$rd, GPR32Opnd:$rt, 0), 0>,
+ ISA_MIPS32R5, ASE_VIRT;
+ def : MipsInstAlias<"mfhgc0 $rt, $rd",
+ (MFHGC0 GPR32Opnd:$rt, COP0Opnd:$rd, 0), 0>,
+ ISA_MIPS32R5, ASE_VIRT;
+ def : MipsInstAlias<"mthgc0 $rt, $rd",
+ (MTHGC0 COP0Opnd:$rd, GPR32Opnd:$rt, 0), 0>,
+ ISA_MIPS32R5, ASE_VIRT;
+ def : MipsInstAlias<"mfc0 $rt, $rd", (MFC0 GPR32Opnd:$rt, COP0Opnd:$rd, 0), 0>,
+ ISA_MIPS1;
+ def : MipsInstAlias<"mtc0 $rt, $rd", (MTC0 COP0Opnd:$rd, GPR32Opnd:$rt, 0), 0>,
+ ISA_MIPS1;
+ def : MipsInstAlias<"mfc2 $rt, $rd", (MFC2 GPR32Opnd:$rt, COP2Opnd:$rd, 0), 0>,
+ ISA_MIPS1;
+ def : MipsInstAlias<"mtc2 $rt, $rd", (MTC2 COP2Opnd:$rd, GPR32Opnd:$rt, 0), 0>,
+ ISA_MIPS1;
+
+ def : MipsInstAlias<"b $offset", (BEQ ZERO, ZERO, brtarget:$offset), 0>,
+ ISA_MIPS1;
+
+ def : MipsInstAlias<"bnez $rs,$offset",
+ (BNE GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>,
+ ISA_MIPS1;
+ def : MipsInstAlias<"bnezl $rs,$offset",
+ (BNEL GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>,
+ ISA_MIPS2;
+ def : MipsInstAlias<"beqz $rs,$offset",
+ (BEQ GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>,
+ ISA_MIPS1;
+ def : MipsInstAlias<"beqzl $rs,$offset",
+ (BEQL GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>,
+ ISA_MIPS2;
+
+ def : MipsInstAlias<"syscall", (SYSCALL 0), 1>, ISA_MIPS1;
+
+ def : MipsInstAlias<"break", (BREAK 0, 0), 1>, ISA_MIPS1;
+ def : MipsInstAlias<"break $imm", (BREAK uimm10:$imm, 0), 1>, ISA_MIPS1;
def : MipsInstAlias<"ei", (EI ZERO), 1>, ISA_MIPS32R2;
def : MipsInstAlias<"di", (DI ZERO), 1>, ISA_MIPS32R2;
-}
-let AdditionalPredicates = [NotInMicroMips] in {
+
def : MipsInstAlias<"teq $rs, $rt",
(TEQ GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
def : MipsInstAlias<"tge $rs, $rt",
@@ -2515,6 +2760,9 @@ let AdditionalPredicates = [NotInMicroMips] in {
(TLTU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
def : MipsInstAlias<"tne $rs, $rt",
(TNE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
+ def : MipsInstAlias<"rdhwr $rt, $rs",
+ (RDHWR GPR32Opnd:$rt, HWRegsOpnd:$rs, 0), 1>, ISA_MIPS1;
+
}
def : MipsInstAlias<"sub, $rd, $rs, $imm",
(ADDi GPR32Opnd:$rd, GPR32Opnd:$rs,
@@ -2546,8 +2794,8 @@ let AdditionalPredicates = [NotInMicroMips] in {
ISA_MIPS32R2;
}
def : MipsInstAlias<"sdbbp", (SDBBP 0)>, ISA_MIPS32_NOT_32R6_64R6;
-def : MipsInstAlias<"sync",
- (SYNC 0), 1>, ISA_MIPS2;
+let AdditionalPredicates = [NotInMicroMips] in
+ def : MipsInstAlias<"sync", (SYNC 0), 1>, ISA_MIPS2;
def : MipsInstAlias<"mulo $rs, $rt",
(MULOMacro GPR32Opnd:$rs, GPR32Opnd:$rs, GPR32Opnd:$rt), 0>,
@@ -2556,6 +2804,9 @@ def : MipsInstAlias<"mulou $rs, $rt",
(MULOUMacro GPR32Opnd:$rs, GPR32Opnd:$rs, GPR32Opnd:$rt), 0>,
ISA_MIPS1_NOT_32R6_64R6;
+let AdditionalPredicates = [NotInMicroMips] in
+ def : MipsInstAlias<"hypcall", (HYPCALL 0), 1>, ISA_MIPS32R5, ASE_VIRT;
+
//===----------------------------------------------------------------------===//
// Assembler Pseudo Instructions
//===----------------------------------------------------------------------===//
@@ -2693,6 +2944,36 @@ def : MipsInstAlias<"divu $rd, $imm", (UDivIMacro GPR32Opnd:$rd, GPR32Opnd:$rd,
simm32:$imm), 0>,
ISA_MIPS1_NOT_32R6_64R6;
+def SRemMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+ "rem\t$rd, $rs, $rt">,
+ ISA_MIPS1_NOT_32R6_64R6;
+def SRemIMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, simm32_relaxed:$imm),
+ "rem\t$rd, $rs, $imm">,
+ ISA_MIPS1_NOT_32R6_64R6;
+def URemMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+ "remu\t$rd, $rs, $rt">,
+ ISA_MIPS1_NOT_32R6_64R6;
+def URemIMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, simm32_relaxed:$imm),
+ "remu\t$rd, $rs, $imm">,
+ ISA_MIPS1_NOT_32R6_64R6;
+
+def : MipsInstAlias<"rem $rt, $rs", (SRemMacro GPR32Opnd:$rt, GPR32Opnd:$rt,
+ GPR32Opnd:$rs), 0>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"rem $rd, $imm", (SRemIMacro GPR32Opnd:$rd, GPR32Opnd:$rd,
+ simm32_relaxed:$imm), 0>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"remu $rt, $rs", (URemMacro GPR32Opnd:$rt, GPR32Opnd:$rt,
+ GPR32Opnd:$rs), 0>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"remu $rd, $imm", (URemIMacro GPR32Opnd:$rd, GPR32Opnd:$rd,
+ simm32_relaxed:$imm), 0>,
+ ISA_MIPS1_NOT_32R6_64R6;
+
def Ulh : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins mem:$addr),
"ulh\t$rt, $addr">; //, ISA_MIPS1_NOT_32R6_64R6;
@@ -2747,17 +3028,17 @@ def : MipsPat<(VT immSExt16:$imm), (ADDiuOp ZEROReg, imm:$imm)>;
}
let AdditionalPredicates = [NotInMicroMips] in
- defm : MaterializeImms<i32, ZERO, ADDiu, LUi, ORi>;
+ defm : MaterializeImms<i32, ZERO, ADDiu, LUi, ORi>, ISA_MIPS1;
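A worked example of the lui/ori split that MaterializeImms relies on for constants outside the signed 16-bit addiu range (a hedged sketch, helper names are illustrative): lui places the upper halfword and clears the lower one, and ori then fills in the lower halfword.

    #include <assert.h>
    #include <stdint.h>

    /* Reassembles a 32-bit constant the way "lui $d, hi16; ori $d, $d, lo16"
     * leaves it in $d. */
    static uint32_t materialize_hi_lo(uint32_t imm) {
      uint32_t hi = imm >> 16;      /* lui operand */
      uint32_t lo = imm & 0xFFFFu;  /* ori operand (ori zero-extends it) */
      return (hi << 16) | lo;
    }

    int main(void) {
      assert(materialize_hi_lo(0xDEADBEEFu) == 0xDEADBEEFu);
      return 0;
    }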
// Carry MipsPatterns
let AdditionalPredicates = [NotInMicroMips] in {
def : MipsPat<(subc GPR32:$lhs, GPR32:$rhs),
- (SUBu GPR32:$lhs, GPR32:$rhs)>;
+ (SUBu GPR32:$lhs, GPR32:$rhs)>, ISA_MIPS1;
}
def : MipsPat<(addc GPR32:$lhs, GPR32:$rhs),
- (ADDu GPR32:$lhs, GPR32:$rhs)>, ASE_NOT_DSP;
+ (ADDu GPR32:$lhs, GPR32:$rhs)>, ISA_MIPS1, ASE_NOT_DSP;
def : MipsPat<(addc GPR32:$src, immSExt16:$imm),
- (ADDiu GPR32:$src, imm:$imm)>, ASE_NOT_DSP;
+ (ADDiu GPR32:$src, imm:$imm)>, ISA_MIPS1, ASE_NOT_DSP;
// Support multiplication for pre-Mips32 targets that don't have
// the MUL instruction.
@@ -2771,16 +3052,16 @@ def : MipsPat<(MipsSync (i32 immz)),
// Call
def : MipsPat<(MipsJmpLink (i32 texternalsym:$dst)),
- (JAL texternalsym:$dst)>;
+ (JAL texternalsym:$dst)>, ISA_MIPS1;
//def : MipsPat<(MipsJmpLink GPR32:$dst),
// (JALR GPR32:$dst)>;
// Tail call
let AdditionalPredicates = [NotInMicroMips] in {
def : MipsPat<(MipsTailCall (iPTR tglobaladdr:$dst)),
- (TAILCALL tglobaladdr:$dst)>;
+ (TAILCALL tglobaladdr:$dst)>, ISA_MIPS1;
def : MipsPat<(MipsTailCall (iPTR texternalsym:$dst)),
- (TAILCALL texternalsym:$dst)>;
+ (TAILCALL texternalsym:$dst)>, ISA_MIPS1;
}
// hi/lo relocs
multiclass MipsHiLoRelocs<Instruction Lui, Instruction Addiu,
@@ -2789,7 +3070,6 @@ multiclass MipsHiLoRelocs<Instruction Lui, Instruction Addiu,
def : MipsPat<(MipsHi tblockaddress:$in), (Lui tblockaddress:$in)>;
def : MipsPat<(MipsHi tjumptable:$in), (Lui tjumptable:$in)>;
def : MipsPat<(MipsHi tconstpool:$in), (Lui tconstpool:$in)>;
- def : MipsPat<(MipsHi tglobaltlsaddr:$in), (Lui tglobaltlsaddr:$in)>;
def : MipsPat<(MipsHi texternalsym:$in), (Lui texternalsym:$in)>;
def : MipsPat<(MipsLo tglobaladdr:$in), (Addiu ZeroReg, tglobaladdr:$in)>;
@@ -2813,44 +3093,47 @@ multiclass MipsHiLoRelocs<Instruction Lui, Instruction Addiu,
(Addiu GPROpnd:$hi, tglobaltlsaddr:$lo)>;
}
-defm : MipsHiLoRelocs<LUi, ADDiu, ZERO, GPR32Opnd>;
+// wrapper_pic
+class WrapperPat<SDNode node, Instruction ADDiuOp, RegisterClass RC>:
+ MipsPat<(MipsWrapper RC:$gp, node:$in), (ADDiuOp RC:$gp, node:$in)>;
-def : MipsPat<(MipsGotHi tglobaladdr:$in), (LUi tglobaladdr:$in)>;
-def : MipsPat<(MipsGotHi texternalsym:$in), (LUi texternalsym:$in)>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ defm : MipsHiLoRelocs<LUi, ADDiu, ZERO, GPR32Opnd>, ISA_MIPS1;
-// gp_rel relocs
-def : MipsPat<(add GPR32:$gp, (MipsGPRel tglobaladdr:$in)),
- (ADDiu GPR32:$gp, tglobaladdr:$in)>, ABI_NOT_N64;
-def : MipsPat<(add GPR32:$gp, (MipsGPRel tconstpool:$in)),
- (ADDiu GPR32:$gp, tconstpool:$in)>, ABI_NOT_N64;
+ def : MipsPat<(MipsGotHi tglobaladdr:$in), (LUi tglobaladdr:$in)>, ISA_MIPS1;
+ def : MipsPat<(MipsGotHi texternalsym:$in), (LUi texternalsym:$in)>,
+ ISA_MIPS1;
-// wrapper_pic
-class WrapperPat<SDNode node, Instruction ADDiuOp, RegisterClass RC>:
- MipsPat<(MipsWrapper RC:$gp, node:$in),
- (ADDiuOp RC:$gp, node:$in)>;
+ def : MipsPat<(MipsTlsHi tglobaltlsaddr:$in), (LUi tglobaltlsaddr:$in)>,
+ ISA_MIPS1;
-def : WrapperPat<tglobaladdr, ADDiu, GPR32>;
-def : WrapperPat<tconstpool, ADDiu, GPR32>;
-def : WrapperPat<texternalsym, ADDiu, GPR32>;
-def : WrapperPat<tblockaddress, ADDiu, GPR32>;
-def : WrapperPat<tjumptable, ADDiu, GPR32>;
-def : WrapperPat<tglobaltlsaddr, ADDiu, GPR32>;
+ // gp_rel relocs
+ def : MipsPat<(add GPR32:$gp, (MipsGPRel tglobaladdr:$in)),
+ (ADDiu GPR32:$gp, tglobaladdr:$in)>, ISA_MIPS1, ABI_NOT_N64;
+ def : MipsPat<(add GPR32:$gp, (MipsGPRel tconstpool:$in)),
+ (ADDiu GPR32:$gp, tconstpool:$in)>, ISA_MIPS1, ABI_NOT_N64;
-let AdditionalPredicates = [NotInMicroMips] in {
-// Mips does not have "not", so we expand our way
-def : MipsPat<(not GPR32:$in),
- (NOR GPR32Opnd:$in, ZERO)>;
+ def : WrapperPat<tglobaladdr, ADDiu, GPR32>, ISA_MIPS1;
+ def : WrapperPat<tconstpool, ADDiu, GPR32>, ISA_MIPS1;
+ def : WrapperPat<texternalsym, ADDiu, GPR32>, ISA_MIPS1;
+ def : WrapperPat<tblockaddress, ADDiu, GPR32>, ISA_MIPS1;
+ def : WrapperPat<tjumptable, ADDiu, GPR32>, ISA_MIPS1;
+ def : WrapperPat<tglobaltlsaddr, ADDiu, GPR32>, ISA_MIPS1;
+
+ // Mips does not have "not", so we expand our way
+ def : MipsPat<(not GPR32:$in),
+ (NOR GPR32Opnd:$in, ZERO)>, ISA_MIPS1;
}
// extended loads
-def : MipsPat<(i32 (extloadi1 addr:$src)), (LBu addr:$src)>;
-def : MipsPat<(i32 (extloadi8 addr:$src)), (LBu addr:$src)>;
let AdditionalPredicates = [NotInMicroMips] in {
- def : MipsPat<(i32 (extloadi16 addr:$src)), (LHu addr:$src)>;
-}
+ def : MipsPat<(i32 (extloadi1 addr:$src)), (LBu addr:$src)>, ISA_MIPS1;
+ def : MipsPat<(i32 (extloadi8 addr:$src)), (LBu addr:$src)>, ISA_MIPS1;
+ def : MipsPat<(i32 (extloadi16 addr:$src)), (LHu addr:$src)>, ISA_MIPS1;
-// peepholes
-def : MipsPat<(store (i32 0), addr:$dst), (SW ZERO, addr:$dst)>;
+ // peepholes
+ def : MipsPat<(store (i32 0), addr:$dst), (SW ZERO, addr:$dst)>, ISA_MIPS1;
+}
// brcond patterns
multiclass BrcondPats<RegisterClass RC, Instruction BEQOp, Instruction BEQOp1,
@@ -2884,12 +3167,13 @@ def : MipsPat<(brcond RC:$cond, bb:$dst),
(BNEOp RC:$cond, ZEROReg, bb:$dst)>;
}
let AdditionalPredicates = [NotInMicroMips] in {
- defm : BrcondPats<GPR32, BEQ, BEQ, BNE, SLT, SLTu, SLTi, SLTiu, ZERO>;
+ defm : BrcondPats<GPR32, BEQ, BEQ, BNE, SLT, SLTu, SLTi, SLTiu, ZERO>,
+ ISA_MIPS1;
+ def : MipsPat<(brcond (i32 (setlt i32:$lhs, 1)), bb:$dst),
+ (BLEZ i32:$lhs, bb:$dst)>, ISA_MIPS1;
+ def : MipsPat<(brcond (i32 (setgt i32:$lhs, -1)), bb:$dst),
+ (BGEZ i32:$lhs, bb:$dst)>, ISA_MIPS1;
}
-def : MipsPat<(brcond (i32 (setlt i32:$lhs, 1)), bb:$dst),
- (BLEZ i32:$lhs, bb:$dst)>;
-def : MipsPat<(brcond (i32 (setgt i32:$lhs, -1)), bb:$dst),
- (BGEZ i32:$lhs, bb:$dst)>;
// setcc patterns
multiclass SeteqPats<RegisterClass RC, Instruction SLTiuOp, Instruction XOROp,
@@ -2936,36 +3220,39 @@ multiclass SetgeImmPats<RegisterClass RC, Instruction XORiOp,
}
let AdditionalPredicates = [NotInMicroMips] in {
- defm : SeteqPats<GPR32, SLTiu, XOR, SLTu, ZERO>;
- defm : SetlePats<GPR32, XORi, SLT, SLTu>;
- defm : SetgtPats<GPR32, SLT, SLTu>;
- defm : SetgePats<GPR32, XORi, SLT, SLTu>;
- defm : SetgeImmPats<GPR32, XORi, SLTi, SLTiu>;
-}
+ defm : SeteqPats<GPR32, SLTiu, XOR, SLTu, ZERO>, ISA_MIPS1;
+ defm : SetlePats<GPR32, XORi, SLT, SLTu>, ISA_MIPS1;
+ defm : SetgtPats<GPR32, SLT, SLTu>, ISA_MIPS1;
+ defm : SetgePats<GPR32, XORi, SLT, SLTu>, ISA_MIPS1;
+ defm : SetgeImmPats<GPR32, XORi, SLTi, SLTiu>, ISA_MIPS1;
-// bswap pattern
-def : MipsPat<(bswap GPR32:$rt), (ROTR (WSBH GPR32:$rt), 16)>;
+ // bswap pattern
+ def : MipsPat<(bswap GPR32:$rt), (ROTR (WSBH GPR32:$rt), 16)>, ISA_MIPS32R2;
+}
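The bswap pattern above is easy to verify by hand: wsbh swaps the bytes inside each halfword, and the 16-bit rotate then exchanges the two halfwords, which composes to a full 32-bit byte swap. A small C model of the two steps (illustrative only, not LLVM code):

    #include <assert.h>
    #include <stdint.h>

    static uint32_t wsbh(uint32_t x) {  /* swap bytes within each halfword */
      return ((x & 0x00FF00FFu) << 8) | ((x & 0xFF00FF00u) >> 8);
    }

    static uint32_t rotr32(uint32_t x, unsigned n) {  /* rotate right, 0 < n < 32 */
      return (x >> n) | (x << (32u - n));
    }

    int main(void) {
      assert(rotr32(wsbh(0x11223344u), 16) == 0x44332211u);
      return 0;
    }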
// Load halfword/word patterns.
-let AddedComplexity = 40 in {
- def : LoadRegImmPat<LBu, i32, zextloadi8>;
- let AdditionalPredicates = [NotInMicroMips] in {
- def : LoadRegImmPat<LH, i32, sextloadi16>;
- def : LoadRegImmPat<LW, i32, load>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ let AddedComplexity = 40 in {
+ def : LoadRegImmPat<LBu, i32, zextloadi8>, ISA_MIPS1;
+ def : LoadRegImmPat<LHu, i32, zextloadi16>, ISA_MIPS1;
+ def : LoadRegImmPat<LB, i32, sextloadi8>, ISA_MIPS1;
+ def : LoadRegImmPat<LH, i32, sextloadi16>, ISA_MIPS1;
+ def : LoadRegImmPat<LW, i32, load>, ISA_MIPS1;
}
-}
-// Atomic load patterns.
-def : MipsPat<(atomic_load_8 addr:$a), (LB addr:$a)>;
-let AdditionalPredicates = [NotInMicroMips] in {
- def : MipsPat<(atomic_load_16 addr:$a), (LH addr:$a)>;
-}
-def : MipsPat<(atomic_load_32 addr:$a), (LW addr:$a)>;
+ // Atomic load patterns.
+ def : MipsPat<(atomic_load_8 addr:$a), (LB addr:$a)>, ISA_MIPS1;
+ def : MipsPat<(atomic_load_16 addr:$a), (LH addr:$a)>, ISA_MIPS1;
+ def : MipsPat<(atomic_load_32 addr:$a), (LW addr:$a)>, ISA_MIPS1;
-// Atomic store patterns.
-def : MipsPat<(atomic_store_8 addr:$a, GPR32:$v), (SB GPR32:$v, addr:$a)>;
-def : MipsPat<(atomic_store_16 addr:$a, GPR32:$v), (SH GPR32:$v, addr:$a)>;
-def : MipsPat<(atomic_store_32 addr:$a, GPR32:$v), (SW GPR32:$v, addr:$a)>;
+ // Atomic store patterns.
+ def : MipsPat<(atomic_store_8 addr:$a, GPR32:$v), (SB GPR32:$v, addr:$a)>,
+ ISA_MIPS1;
+ def : MipsPat<(atomic_store_16 addr:$a, GPR32:$v), (SH GPR32:$v, addr:$a)>,
+ ISA_MIPS1;
+ def : MipsPat<(atomic_store_32 addr:$a, GPR32:$v), (SW GPR32:$v, addr:$a)>,
+ ISA_MIPS1;
+}
//===----------------------------------------------------------------------===//
// Floating Point Support
diff --git a/lib/Target/Mips/MipsInstructionSelector.cpp b/lib/Target/Mips/MipsInstructionSelector.cpp
new file mode 100644
index 000000000000..af0ac006bc9e
--- /dev/null
+++ b/lib/Target/Mips/MipsInstructionSelector.cpp
@@ -0,0 +1,184 @@
+//===- MipsInstructionSelector.cpp ------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the InstructionSelector class for
+/// Mips.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "MipsRegisterBankInfo.h"
+#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
+
+#define DEBUG_TYPE "mips-isel"
+
+using namespace llvm;
+
+namespace {
+
+#define GET_GLOBALISEL_PREDICATE_BITSET
+#include "MipsGenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATE_BITSET
+
+class MipsInstructionSelector : public InstructionSelector {
+public:
+ MipsInstructionSelector(const MipsTargetMachine &TM, const MipsSubtarget &STI,
+ const MipsRegisterBankInfo &RBI);
+
+ bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override;
+ static const char *getName() { return DEBUG_TYPE; }
+
+private:
+ bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
+
+ const MipsTargetMachine &TM;
+ const MipsSubtarget &STI;
+ const MipsInstrInfo &TII;
+ const MipsRegisterInfo &TRI;
+ const MipsRegisterBankInfo &RBI;
+
+#define GET_GLOBALISEL_PREDICATES_DECL
+#include "MipsGenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATES_DECL
+
+#define GET_GLOBALISEL_TEMPORARIES_DECL
+#include "MipsGenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_DECL
+};
+
+} // end anonymous namespace
+
+#define GET_GLOBALISEL_IMPL
+#include "MipsGenGlobalISel.inc"
+#undef GET_GLOBALISEL_IMPL
+
+MipsInstructionSelector::MipsInstructionSelector(
+ const MipsTargetMachine &TM, const MipsSubtarget &STI,
+ const MipsRegisterBankInfo &RBI)
+ : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()),
+ TRI(*STI.getRegisterInfo()), RBI(RBI),
+
+#define GET_GLOBALISEL_PREDICATES_INIT
+#include "MipsGenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATES_INIT
+#define GET_GLOBALISEL_TEMPORARIES_INIT
+#include "MipsGenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_INIT
+{
+}
+
+static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
+ MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
+ const RegisterBankInfo &RBI) {
+ unsigned DstReg = I.getOperand(0).getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+ return true;
+
+ const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+
+ if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) {
+ LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+ << " operand\n");
+ return false;
+ }
+ return true;
+}
+
+bool MipsInstructionSelector::select(MachineInstr &I,
+ CodeGenCoverage &CoverageInfo) const {
+
+ MachineBasicBlock &MBB = *I.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ if (!isPreISelGenericOpcode(I.getOpcode())) {
+ if (I.isCopy())
+ return selectCopy(I, TII, MRI, TRI, RBI);
+
+ return true;
+ }
+
+ if (selectImpl(I, CoverageInfo)) {
+ return true;
+ }
+
+ MachineInstr *MI = nullptr;
+ using namespace TargetOpcode;
+
+ switch (I.getOpcode()) {
+ case G_GEP: {
+ MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::ADDu))
+ .add(I.getOperand(0))
+ .add(I.getOperand(1))
+ .add(I.getOperand(2));
+ break;
+ }
+ case G_FRAME_INDEX: {
+ MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::ADDiu))
+ .add(I.getOperand(0))
+ .add(I.getOperand(1))
+ .addImm(0);
+ break;
+ }
+ case G_STORE:
+ case G_LOAD: {
+ const unsigned DestReg = I.getOperand(0).getReg();
+ const unsigned DestRegBank = RBI.getRegBank(DestReg, MRI, TRI)->getID();
+ const unsigned OpSize = MRI.getType(DestReg).getSizeInBits();
+
+ if (DestRegBank != Mips::GPRBRegBankID || OpSize != 32)
+ return false;
+
+ const unsigned NewOpc = I.getOpcode() == G_STORE ? Mips::SW : Mips::LW;
+
+ MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(NewOpc))
+ .add(I.getOperand(0))
+ .add(I.getOperand(1))
+ .addImm(0)
+ .addMemOperand(*I.memoperands_begin());
+ break;
+ }
+ case G_CONSTANT: {
+ int Imm = I.getOperand(1).getCImm()->getValue().getLimitedValue();
+ unsigned LUiReg = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ MachineInstr *LUi, *ORi;
+
+ LUi = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::LUi))
+ .addDef(LUiReg)
+ .addImm(Imm >> 16);
+
+ ORi = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::ORi))
+ .addDef(I.getOperand(0).getReg())
+ .addUse(LUiReg)
+ .addImm(Imm & 0xFFFF);
+
+ if (!constrainSelectedInstRegOperands(*LUi, TII, TRI, RBI))
+ return false;
+ if (!constrainSelectedInstRegOperands(*ORi, TII, TRI, RBI))
+ return false;
+
+ I.eraseFromParent();
+ return true;
+ }
+
+ default:
+ return false;
+ }
+
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
+}
+
+namespace llvm {
+InstructionSelector *createMipsInstructionSelector(const MipsTargetMachine &TM,
+ MipsSubtarget &Subtarget,
+ MipsRegisterBankInfo &RBI) {
+ return new MipsInstructionSelector(TM, Subtarget, RBI);
+}
+} // end namespace llvm
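The G_CONSTANT case above materializes a 32-bit immediate as a LUi/ORi pair: LUi places the upper halfword of the constant into the high 16 bits of a scratch virtual register and ORi merges in the low halfword. A minimal standalone sketch of that split, independent of the LLVM APIs (the helper name is made up for illustration):

    #include <cstdint>
    #include <utility>

    // Split a 32-bit immediate the way the G_CONSTANT case above does: the
    // first half feeds LUi (upper 16 bits), the second feeds ORi (lower 16).
    std::pair<uint16_t, uint16_t> splitImm32(uint32_t Imm) {
      uint16_t Hi = static_cast<uint16_t>(Imm >> 16);    // LUi operand
      uint16_t Lo = static_cast<uint16_t>(Imm & 0xFFFF); // ORi operand
      return {Hi, Lo};
    }

    // splitImm32(0x12345678) yields {0x1234, 0x5678}, i.e. the selector emits
    //   lui $tmp, 0x1234
    //   ori $dst, $tmp, 0x5678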
diff --git a/lib/Target/Mips/MipsLegalizerInfo.cpp b/lib/Target/Mips/MipsLegalizerInfo.cpp
new file mode 100644
index 000000000000..da6f9dabdaaf
--- /dev/null
+++ b/lib/Target/Mips/MipsLegalizerInfo.cpp
@@ -0,0 +1,41 @@
+//===- MipsLegalizerInfo.cpp ------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the LegalizerInfo class for Mips.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "MipsLegalizerInfo.h"
+#include "MipsTargetMachine.h"
+
+using namespace llvm;
+
+MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) {
+ using namespace TargetOpcode;
+
+ const LLT s32 = LLT::scalar(32);
+ const LLT p0 = LLT::pointer(0, 32);
+
+ getActionDefinitionsBuilder(G_ADD).legalFor({s32});
+
+ getActionDefinitionsBuilder({G_LOAD, G_STORE})
+ .legalForCartesianProduct({p0, s32}, {p0});
+
+ getActionDefinitionsBuilder(G_CONSTANT)
+ .legalFor({s32});
+
+ getActionDefinitionsBuilder(G_GEP)
+ .legalFor({{p0, s32}});
+
+ getActionDefinitionsBuilder(G_FRAME_INDEX)
+ .legalFor({p0});
+
+ computeTables();
+ verify(*ST.getInstrInfo());
+}
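The table above is the entire legality story for this initial GlobalISel port: only 32-bit scalars (s32) and 32-bit pointers (p0) are handled, and every listed opcode is simply marked Legal. A rough flattened sketch of the resulting legality set, assuming plain strings in place of the LLT and LegalizerInfo types:

    #include <set>
    #include <string>
    #include <utility>

    // Hypothetical flattened view of the rules built above; "s32" is a 32-bit
    // scalar, "p0" a 32-bit pointer in address space 0.
    const std::set<std::pair<std::string, std::string>> LegalOps = {
        {"G_ADD", "s32"},    {"G_CONSTANT", "s32"},
        {"G_LOAD", "s32"},   {"G_LOAD", "p0"},
        {"G_STORE", "s32"},  {"G_STORE", "p0"},
        {"G_GEP", "p0,s32"}, {"G_FRAME_INDEX", "p0"},
    };

    bool isLegal(const std::string &Op, const std::string &Ty) {
      return LegalOps.count({Op, Ty}) != 0;
    }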
diff --git a/lib/Target/Mips/MipsLegalizerInfo.h b/lib/Target/Mips/MipsLegalizerInfo.h
new file mode 100644
index 000000000000..36dd39c8c1c1
--- /dev/null
+++ b/lib/Target/Mips/MipsLegalizerInfo.h
@@ -0,0 +1,29 @@
+//===- MipsLegalizerInfo ----------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the LegalizerInfo class for Mips.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSMACHINELEGALIZER_H
+#define LLVM_LIB_TARGET_MIPS_MIPSMACHINELEGALIZER_H
+
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+
+namespace llvm {
+
+class MipsSubtarget;
+
+/// This class provides legalization strategies.
+class MipsLegalizerInfo : public LegalizerInfo {
+public:
+ MipsLegalizerInfo(const MipsSubtarget &ST);
+};
+} // end namespace llvm
+#endif
diff --git a/lib/Target/Mips/MipsMCInstLower.cpp b/lib/Target/Mips/MipsMCInstLower.cpp
index a4ab7d3a5780..2b7f64099923 100644
--- a/lib/Target/Mips/MipsMCInstLower.cpp
+++ b/lib/Target/Mips/MipsMCInstLower.cpp
@@ -219,26 +219,77 @@ lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI) const {
// Lower register operand.
OutMI.addOperand(LowerOperand(MI->getOperand(0)));
- // Create %hi($tgt-$baltgt).
- OutMI.addOperand(createSub(MI->getOperand(1).getMBB(),
- MI->getOperand(2).getMBB(),
- MipsMCExpr::MEK_HI));
+ MipsMCExpr::MipsExprKind Kind;
+ unsigned TargetFlags = MI->getOperand(1).getTargetFlags();
+ switch (TargetFlags) {
+ case MipsII::MO_HIGHEST:
+ Kind = MipsMCExpr::MEK_HIGHEST;
+ break;
+ case MipsII::MO_HIGHER:
+ Kind = MipsMCExpr::MEK_HIGHER;
+ break;
+ case MipsII::MO_ABS_HI:
+ Kind = MipsMCExpr::MEK_HI;
+ break;
+ case MipsII::MO_ABS_LO:
+ Kind = MipsMCExpr::MEK_LO;
+ break;
+ default:
+ report_fatal_error("Unexpected flags for lowerLongBranchLUi");
+ }
+
+ if (MI->getNumOperands() == 2) {
+ const MCExpr *Expr =
+ MCSymbolRefExpr::create(MI->getOperand(1).getMBB()->getSymbol(), *Ctx);
+ const MipsMCExpr *MipsExpr = MipsMCExpr::create(Kind, Expr, *Ctx);
+ OutMI.addOperand(MCOperand::createExpr(MipsExpr));
+ } else if (MI->getNumOperands() == 3) {
+ // Create %hi($tgt-$baltgt).
+ OutMI.addOperand(createSub(MI->getOperand(1).getMBB(),
+ MI->getOperand(2).getMBB(), Kind));
+ }
}
-void MipsMCInstLower::lowerLongBranchADDiu(
- const MachineInstr *MI, MCInst &OutMI, int Opcode,
- MipsMCExpr::MipsExprKind Kind) const {
+void MipsMCInstLower::lowerLongBranchADDiu(const MachineInstr *MI,
+ MCInst &OutMI, int Opcode) const {
OutMI.setOpcode(Opcode);
+ MipsMCExpr::MipsExprKind Kind;
+ unsigned TargetFlags = MI->getOperand(2).getTargetFlags();
+ switch (TargetFlags) {
+ case MipsII::MO_HIGHEST:
+ Kind = MipsMCExpr::MEK_HIGHEST;
+ break;
+ case MipsII::MO_HIGHER:
+ Kind = MipsMCExpr::MEK_HIGHER;
+ break;
+ case MipsII::MO_ABS_HI:
+ Kind = MipsMCExpr::MEK_HI;
+ break;
+ case MipsII::MO_ABS_LO:
+ Kind = MipsMCExpr::MEK_LO;
+ break;
+ default:
+ report_fatal_error("Unexpected flags for lowerLongBranchADDiu");
+ }
+
// Lower two register operands.
for (unsigned I = 0, E = 2; I != E; ++I) {
const MachineOperand &MO = MI->getOperand(I);
OutMI.addOperand(LowerOperand(MO));
}
- // Create %lo($tgt-$baltgt) or %hi($tgt-$baltgt).
- OutMI.addOperand(createSub(MI->getOperand(2).getMBB(),
- MI->getOperand(3).getMBB(), Kind));
+ if (MI->getNumOperands() == 3) {
+ // Create %lo($tgt), %hi($tgt), %higher($tgt) or %highest($tgt).
+ const MCExpr *Expr =
+ MCSymbolRefExpr::create(MI->getOperand(2).getMBB()->getSymbol(), *Ctx);
+ const MipsMCExpr *MipsExpr = MipsMCExpr::create(Kind, Expr, *Ctx);
+ OutMI.addOperand(MCOperand::createExpr(MipsExpr));
+ } else if (MI->getNumOperands() == 4) {
+ // Create %lo($tgt-$baltgt) or %hi($tgt-$baltgt).
+ OutMI.addOperand(createSub(MI->getOperand(2).getMBB(),
+ MI->getOperand(3).getMBB(), Kind));
+ }
}
bool MipsMCInstLower::lowerLongBranch(const MachineInstr *MI,
@@ -250,16 +301,10 @@ bool MipsMCInstLower::lowerLongBranch(const MachineInstr *MI,
lowerLongBranchLUi(MI, OutMI);
return true;
case Mips::LONG_BRANCH_ADDiu:
- lowerLongBranchADDiu(MI, OutMI, Mips::ADDiu, MipsMCExpr::MEK_LO);
+ lowerLongBranchADDiu(MI, OutMI, Mips::ADDiu);
return true;
case Mips::LONG_BRANCH_DADDiu:
- unsigned TargetFlags = MI->getOperand(2).getTargetFlags();
- if (TargetFlags == MipsII::MO_ABS_HI)
- lowerLongBranchADDiu(MI, OutMI, Mips::DADDiu, MipsMCExpr::MEK_HI);
- else if (TargetFlags == MipsII::MO_ABS_LO)
- lowerLongBranchADDiu(MI, OutMI, Mips::DADDiu, MipsMCExpr::MEK_LO);
- else
- report_fatal_error("Unexpected flags for LONG_BRANCH_DADDiu");
+ lowerLongBranchADDiu(MI, OutMI, Mips::DADDiu);
return true;
}
}
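For reference, the four expression kinds chosen in the long-branch lowering above are 16-bit slices of the branch target address, used to rebuild it with lui/ori-style sequences. A rough sketch of the slicing, treating the address as plain bit fields and ignoring the carry adjustment that %hi-style relocations apply to compensate for the sign-extended %lo part:

    #include <cstdint>

    // Approximate meaning of the MipsMCExpr kinds selected above, viewed as
    // plain 16-bit slices of a 64-bit target address (carry propagation
    // between slices is ignored in this sketch).
    struct AddrSlices {
      uint16_t Lo;      // MEK_LO      -> bits [15:0]
      uint16_t Hi;      // MEK_HI      -> bits [31:16]
      uint16_t Higher;  // MEK_HIGHER  -> bits [47:32]
      uint16_t Highest; // MEK_HIGHEST -> bits [63:48]
    };

    AddrSlices sliceAddress(uint64_t Addr) {
      return {static_cast<uint16_t>(Addr),
              static_cast<uint16_t>(Addr >> 16),
              static_cast<uint16_t>(Addr >> 32),
              static_cast<uint16_t>(Addr >> 48)};
    }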
diff --git a/lib/Target/Mips/MipsMCInstLower.h b/lib/Target/Mips/MipsMCInstLower.h
index fb5079643827..e19f21c98839 100644
--- a/lib/Target/Mips/MipsMCInstLower.h
+++ b/lib/Target/Mips/MipsMCInstLower.h
@@ -44,8 +44,8 @@ private:
MCOperand createSub(MachineBasicBlock *BB1, MachineBasicBlock *BB2,
MipsMCExpr::MipsExprKind Kind) const;
void lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI) const;
- void lowerLongBranchADDiu(const MachineInstr *MI, MCInst &OutMI, int Opcode,
- MipsMCExpr::MipsExprKind Kind) const;
+ void lowerLongBranchADDiu(const MachineInstr *MI, MCInst &OutMI,
+ int Opcode) const;
bool lowerLongBranch(const MachineInstr *MI, MCInst &OutMI) const;
};
diff --git a/lib/Target/Mips/MipsMSAInstrFormats.td b/lib/Target/Mips/MipsMSAInstrFormats.td
index 7d25ea56e3d5..d4e225678184 100644
--- a/lib/Target/Mips/MipsMSAInstrFormats.td
+++ b/lib/Target/Mips/MipsMSAInstrFormats.td
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
class MSAInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>,
- PredicateControl, ASE_MSA {
+ ASE_MSA {
let EncodingPredicates = [HasStdEnc];
let Inst{31-26} = 0b011110;
}
@@ -24,7 +24,8 @@ class MSASpecial : MSAInst {
class MSAPseudo<dag outs, dag ins, list<dag> pattern,
InstrItinClass itin = IIPseudo>:
MipsPseudo<outs, ins, pattern, itin> {
- let Predicates = [HasMSA];
+ let EncodingPredicates = [HasStdEnc];
+ let ASEPredicate = [HasMSA];
}
class MSA_BIT_B_FMT<bits<3> major, bits<6> minor>: MSAInst {
diff --git a/lib/Target/Mips/MipsMSAInstrInfo.td b/lib/Target/Mips/MipsMSAInstrInfo.td
index bf79f0f2ff82..d83f75ffa1c1 100644
--- a/lib/Target/Mips/MipsMSAInstrInfo.td
+++ b/lib/Target/Mips/MipsMSAInstrInfo.td
@@ -35,14 +35,6 @@ def MipsVAllNonZero : SDNode<"MipsISD::VALL_NONZERO", SDT_MipsVecCond>;
def MipsVAnyNonZero : SDNode<"MipsISD::VANY_NONZERO", SDT_MipsVecCond>;
def MipsVAllZero : SDNode<"MipsISD::VALL_ZERO", SDT_MipsVecCond>;
def MipsVAnyZero : SDNode<"MipsISD::VANY_ZERO", SDT_MipsVecCond>;
-def MipsVSMax : SDNode<"MipsISD::VSMAX", SDTIntBinOp,
- [SDNPCommutative, SDNPAssociative]>;
-def MipsVSMin : SDNode<"MipsISD::VSMIN", SDTIntBinOp,
- [SDNPCommutative, SDNPAssociative]>;
-def MipsVUMax : SDNode<"MipsISD::VUMAX", SDTIntBinOp,
- [SDNPCommutative, SDNPAssociative]>;
-def MipsVUMin : SDNode<"MipsISD::VUMIN", SDTIntBinOp,
- [SDNPCommutative, SDNPAssociative]>;
def MipsVNOR : SDNode<"MipsISD::VNOR", SDTIntBinOp,
[SDNPCommutative, SDNPAssociative]>;
def MipsVSHF : SDNode<"MipsISD::VSHF", SDT_VSHF>;
@@ -54,6 +46,7 @@ def MipsILVR : SDNode<"MipsISD::ILVR", SDT_ILV>;
def MipsPCKEV : SDNode<"MipsISD::PCKEV", SDT_ILV>;
def MipsPCKOD : SDNode<"MipsISD::PCKOD", SDT_ILV>;
def MipsINSVE : SDNode<"MipsISD::INSVE", SDT_INSVE>;
+def MipsFMS : SDNode<"MipsISD::FMS", SDTFPTernaryOp>;
def vsetcc : SDNode<"ISD::SETCC", SDT_VSetCC>;
def vfsetcc : SDNode<"ISD::SETCC", SDT_VFSetCC>;
@@ -188,8 +181,28 @@ def vsplati16 : PatFrag<(ops node:$e0),
def vsplati32 : PatFrag<(ops node:$e0),
(v4i32 (build_vector node:$e0, node:$e0,
node:$e0, node:$e0))>;
+
+def vsplati64_imm_eq_1 : PatLeaf<(bitconvert (v4i32 (build_vector))), [{
+ APInt Imm;
+ SDNode *BV = N->getOperand(0).getNode();
+ EVT EltTy = N->getValueType(0).getVectorElementType();
+
+ return selectVSplat(BV, Imm, EltTy.getSizeInBits()) &&
+ Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 1;
+}]>;
+
def vsplati64 : PatFrag<(ops node:$e0),
(v2i64 (build_vector node:$e0, node:$e0))>;
+
+def vsplati64_splat_d : PatFrag<(ops node:$e0),
+ (v2i64 (bitconvert
+ (v4i32 (and
+ (v4i32 (build_vector node:$e0,
+ node:$e0,
+ node:$e0,
+ node:$e0)),
+ vsplati64_imm_eq_1))))>;
+
def vsplatf32 : PatFrag<(ops node:$e0),
(v4f32 (build_vector node:$e0, node:$e0,
node:$e0, node:$e0))>;
@@ -203,7 +216,8 @@ def vsplati16_elt : PatFrag<(ops node:$v, node:$i),
def vsplati32_elt : PatFrag<(ops node:$v, node:$i),
(MipsVSHF (vsplati32 node:$i), node:$v, node:$v)>;
def vsplati64_elt : PatFrag<(ops node:$v, node:$i),
- (MipsVSHF (vsplati64 node:$i), node:$v, node:$v)>;
+ (MipsVSHF (vsplati64_splat_d node:$i),
+ node:$v, node:$v)>;
class SplatPatLeaf<Operand opclass, dag frag, code pred = [{}],
SDNodeXForm xform = NOOP_SDNodeXForm>
@@ -334,15 +348,6 @@ def vsplat_imm_eq_1 : PatLeaf<(build_vector), [{
Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 1;
}]>;
-def vsplati64_imm_eq_1 : PatLeaf<(bitconvert (v4i32 (build_vector))), [{
- APInt Imm;
- SDNode *BV = N->getOperand(0).getNode();
- EVT EltTy = N->getValueType(0).getVectorElementType();
-
- return selectVSplat(BV, Imm, EltTy.getSizeInBits()) &&
- Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 1;
-}]>;
-
def vbclr_b : PatFrag<(ops node:$ws, node:$wt),
(and node:$ws, (xor (shl vsplat_imm_eq_1, node:$wt),
immAllOnesV))>;
@@ -377,9 +382,6 @@ def vbset_d : PatFrag<(ops node:$ws, node:$wt),
(or node:$ws, (shl (v2i64 vsplati64_imm_eq_1),
node:$wt))>;
-def fms : PatFrag<(ops node:$wd, node:$ws, node:$wt),
- (fsub node:$wd, (fmul node:$ws, node:$wt))>;
-
def muladd : PatFrag<(ops node:$wd, node:$ws, node:$wt),
(add node:$wd, (mul node:$ws, node:$wt))>;
@@ -1788,6 +1790,7 @@ class CFCMSA_DESC {
string AsmString = "cfcmsa\t$rd, $cs";
InstrItinClass Itinerary = NoItinerary;
bit hasSideEffects = 1;
+ bit isMoveReg = 1;
}
class CLE_S_B_DESC : MSA_3R_DESC_BASE<"cle_s.b", vsetle_v16i8, MSA128BOpnd>;
@@ -1882,6 +1885,7 @@ class CTCMSA_DESC {
string AsmString = "ctcmsa\t$cd, $rs";
InstrItinClass Itinerary = NoItinerary;
bit hasSideEffects = 1;
+ bit isMoveReg = 1;
}
class DIV_S_B_DESC : MSA_3R_DESC_BASE<"div_s.b", sdiv, MSA128BOpnd>;
@@ -2099,8 +2103,8 @@ class FMIN_A_W_DESC : MSA_3RF_DESC_BASE<"fmin_a.w", int_mips_fmin_a_w,
class FMIN_A_D_DESC : MSA_3RF_DESC_BASE<"fmin_a.d", int_mips_fmin_a_d,
MSA128DOpnd>;
-class FMSUB_W_DESC : MSA_3RF_4RF_DESC_BASE<"fmsub.w", fms, MSA128WOpnd>;
-class FMSUB_D_DESC : MSA_3RF_4RF_DESC_BASE<"fmsub.d", fms, MSA128DOpnd>;
+class FMSUB_W_DESC : MSA_3RF_4RF_DESC_BASE<"fmsub.w", MipsFMS, MSA128WOpnd>;
+class FMSUB_D_DESC : MSA_3RF_4RF_DESC_BASE<"fmsub.d", MipsFMS, MSA128DOpnd>;
class FMUL_W_DESC : MSA_3RF_DESC_BASE<"fmul.w", fmul, MSA128WOpnd>;
class FMUL_D_DESC : MSA_3RF_DESC_BASE<"fmul.d", fmul, MSA128DOpnd>;
@@ -2350,32 +2354,32 @@ class MAX_A_H_DESC : MSA_3R_DESC_BASE<"max_a.h", int_mips_max_a_h, MSA128HOpnd>;
class MAX_A_W_DESC : MSA_3R_DESC_BASE<"max_a.w", int_mips_max_a_w, MSA128WOpnd>;
class MAX_A_D_DESC : MSA_3R_DESC_BASE<"max_a.d", int_mips_max_a_d, MSA128DOpnd>;
-class MAX_S_B_DESC : MSA_3R_DESC_BASE<"max_s.b", MipsVSMax, MSA128BOpnd>;
-class MAX_S_H_DESC : MSA_3R_DESC_BASE<"max_s.h", MipsVSMax, MSA128HOpnd>;
-class MAX_S_W_DESC : MSA_3R_DESC_BASE<"max_s.w", MipsVSMax, MSA128WOpnd>;
-class MAX_S_D_DESC : MSA_3R_DESC_BASE<"max_s.d", MipsVSMax, MSA128DOpnd>;
+class MAX_S_B_DESC : MSA_3R_DESC_BASE<"max_s.b", smax, MSA128BOpnd>;
+class MAX_S_H_DESC : MSA_3R_DESC_BASE<"max_s.h", smax, MSA128HOpnd>;
+class MAX_S_W_DESC : MSA_3R_DESC_BASE<"max_s.w", smax, MSA128WOpnd>;
+class MAX_S_D_DESC : MSA_3R_DESC_BASE<"max_s.d", smax, MSA128DOpnd>;
-class MAX_U_B_DESC : MSA_3R_DESC_BASE<"max_u.b", MipsVUMax, MSA128BOpnd>;
-class MAX_U_H_DESC : MSA_3R_DESC_BASE<"max_u.h", MipsVUMax, MSA128HOpnd>;
-class MAX_U_W_DESC : MSA_3R_DESC_BASE<"max_u.w", MipsVUMax, MSA128WOpnd>;
-class MAX_U_D_DESC : MSA_3R_DESC_BASE<"max_u.d", MipsVUMax, MSA128DOpnd>;
+class MAX_U_B_DESC : MSA_3R_DESC_BASE<"max_u.b", umax, MSA128BOpnd>;
+class MAX_U_H_DESC : MSA_3R_DESC_BASE<"max_u.h", umax, MSA128HOpnd>;
+class MAX_U_W_DESC : MSA_3R_DESC_BASE<"max_u.w", umax, MSA128WOpnd>;
+class MAX_U_D_DESC : MSA_3R_DESC_BASE<"max_u.d", umax, MSA128DOpnd>;
-class MAXI_S_B_DESC : MSA_I5_DESC_BASE<"maxi_s.b", MipsVSMax, vsplati8_simm5,
+class MAXI_S_B_DESC : MSA_I5_DESC_BASE<"maxi_s.b", smax, vsplati8_simm5,
MSA128BOpnd>;
-class MAXI_S_H_DESC : MSA_I5_DESC_BASE<"maxi_s.h", MipsVSMax, vsplati16_simm5,
+class MAXI_S_H_DESC : MSA_I5_DESC_BASE<"maxi_s.h", smax, vsplati16_simm5,
MSA128HOpnd>;
-class MAXI_S_W_DESC : MSA_I5_DESC_BASE<"maxi_s.w", MipsVSMax, vsplati32_simm5,
+class MAXI_S_W_DESC : MSA_I5_DESC_BASE<"maxi_s.w", smax, vsplati32_simm5,
MSA128WOpnd>;
-class MAXI_S_D_DESC : MSA_I5_DESC_BASE<"maxi_s.d", MipsVSMax, vsplati64_simm5,
+class MAXI_S_D_DESC : MSA_I5_DESC_BASE<"maxi_s.d", smax, vsplati64_simm5,
MSA128DOpnd>;
-class MAXI_U_B_DESC : MSA_I5_DESC_BASE<"maxi_u.b", MipsVUMax, vsplati8_uimm5,
+class MAXI_U_B_DESC : MSA_I5_DESC_BASE<"maxi_u.b", umax, vsplati8_uimm5,
MSA128BOpnd>;
-class MAXI_U_H_DESC : MSA_I5_DESC_BASE<"maxi_u.h", MipsVUMax, vsplati16_uimm5,
+class MAXI_U_H_DESC : MSA_I5_DESC_BASE<"maxi_u.h", umax, vsplati16_uimm5,
MSA128HOpnd>;
-class MAXI_U_W_DESC : MSA_I5_DESC_BASE<"maxi_u.w", MipsVUMax, vsplati32_uimm5,
+class MAXI_U_W_DESC : MSA_I5_DESC_BASE<"maxi_u.w", umax, vsplati32_uimm5,
MSA128WOpnd>;
-class MAXI_U_D_DESC : MSA_I5_DESC_BASE<"maxi_u.d", MipsVUMax, vsplati64_uimm5,
+class MAXI_U_D_DESC : MSA_I5_DESC_BASE<"maxi_u.d", umax, vsplati64_uimm5,
MSA128DOpnd>;
class MIN_A_B_DESC : MSA_3R_DESC_BASE<"min_a.b", int_mips_min_a_b, MSA128BOpnd>;
@@ -2383,32 +2387,32 @@ class MIN_A_H_DESC : MSA_3R_DESC_BASE<"min_a.h", int_mips_min_a_h, MSA128HOpnd>;
class MIN_A_W_DESC : MSA_3R_DESC_BASE<"min_a.w", int_mips_min_a_w, MSA128WOpnd>;
class MIN_A_D_DESC : MSA_3R_DESC_BASE<"min_a.d", int_mips_min_a_d, MSA128DOpnd>;
-class MIN_S_B_DESC : MSA_3R_DESC_BASE<"min_s.b", MipsVSMin, MSA128BOpnd>;
-class MIN_S_H_DESC : MSA_3R_DESC_BASE<"min_s.h", MipsVSMin, MSA128HOpnd>;
-class MIN_S_W_DESC : MSA_3R_DESC_BASE<"min_s.w", MipsVSMin, MSA128WOpnd>;
-class MIN_S_D_DESC : MSA_3R_DESC_BASE<"min_s.d", MipsVSMin, MSA128DOpnd>;
+class MIN_S_B_DESC : MSA_3R_DESC_BASE<"min_s.b", smin, MSA128BOpnd>;
+class MIN_S_H_DESC : MSA_3R_DESC_BASE<"min_s.h", smin, MSA128HOpnd>;
+class MIN_S_W_DESC : MSA_3R_DESC_BASE<"min_s.w", smin, MSA128WOpnd>;
+class MIN_S_D_DESC : MSA_3R_DESC_BASE<"min_s.d", smin, MSA128DOpnd>;
-class MIN_U_B_DESC : MSA_3R_DESC_BASE<"min_u.b", MipsVUMin, MSA128BOpnd>;
-class MIN_U_H_DESC : MSA_3R_DESC_BASE<"min_u.h", MipsVUMin, MSA128HOpnd>;
-class MIN_U_W_DESC : MSA_3R_DESC_BASE<"min_u.w", MipsVUMin, MSA128WOpnd>;
-class MIN_U_D_DESC : MSA_3R_DESC_BASE<"min_u.d", MipsVUMin, MSA128DOpnd>;
+class MIN_U_B_DESC : MSA_3R_DESC_BASE<"min_u.b", umin, MSA128BOpnd>;
+class MIN_U_H_DESC : MSA_3R_DESC_BASE<"min_u.h", umin, MSA128HOpnd>;
+class MIN_U_W_DESC : MSA_3R_DESC_BASE<"min_u.w", umin, MSA128WOpnd>;
+class MIN_U_D_DESC : MSA_3R_DESC_BASE<"min_u.d", umin, MSA128DOpnd>;
-class MINI_S_B_DESC : MSA_I5_DESC_BASE<"mini_s.b", MipsVSMin, vsplati8_simm5,
+class MINI_S_B_DESC : MSA_I5_DESC_BASE<"mini_s.b", smin, vsplati8_simm5,
MSA128BOpnd>;
-class MINI_S_H_DESC : MSA_I5_DESC_BASE<"mini_s.h", MipsVSMin, vsplati16_simm5,
+class MINI_S_H_DESC : MSA_I5_DESC_BASE<"mini_s.h", smin, vsplati16_simm5,
MSA128HOpnd>;
-class MINI_S_W_DESC : MSA_I5_DESC_BASE<"mini_s.w", MipsVSMin, vsplati32_simm5,
+class MINI_S_W_DESC : MSA_I5_DESC_BASE<"mini_s.w", smin, vsplati32_simm5,
MSA128WOpnd>;
-class MINI_S_D_DESC : MSA_I5_DESC_BASE<"mini_s.d", MipsVSMin, vsplati64_simm5,
+class MINI_S_D_DESC : MSA_I5_DESC_BASE<"mini_s.d", smin, vsplati64_simm5,
MSA128DOpnd>;
-class MINI_U_B_DESC : MSA_I5_DESC_BASE<"mini_u.b", MipsVUMin, vsplati8_uimm5,
+class MINI_U_B_DESC : MSA_I5_DESC_BASE<"mini_u.b", umin, vsplati8_uimm5,
MSA128BOpnd>;
-class MINI_U_H_DESC : MSA_I5_DESC_BASE<"mini_u.h", MipsVUMin, vsplati16_uimm5,
+class MINI_U_H_DESC : MSA_I5_DESC_BASE<"mini_u.h", umin, vsplati16_uimm5,
MSA128HOpnd>;
-class MINI_U_W_DESC : MSA_I5_DESC_BASE<"mini_u.w", MipsVUMin, vsplati32_uimm5,
+class MINI_U_W_DESC : MSA_I5_DESC_BASE<"mini_u.w", umin, vsplati32_uimm5,
MSA128WOpnd>;
-class MINI_U_D_DESC : MSA_I5_DESC_BASE<"mini_u.d", MipsVUMin, vsplati64_uimm5,
+class MINI_U_D_DESC : MSA_I5_DESC_BASE<"mini_u.d", umin, vsplati64_uimm5,
MSA128DOpnd>;
class MOD_S_B_DESC : MSA_3R_DESC_BASE<"mod_s.b", srem, MSA128BOpnd>;
@@ -2427,6 +2431,7 @@ class MOVE_V_DESC {
string AsmString = "move.v\t$wd, $ws";
list<dag> Pattern = [];
InstrItinClass Itinerary = NoItinerary;
+ bit isMoveReg = 1;
}
class MSUB_Q_H_DESC : MSA_3RF_4RF_DESC_BASE<"msub_q.h", int_mips_msub_q_h,
@@ -3143,6 +3148,20 @@ def FTRUNC_S_D : FTRUNC_S_D_ENC, FTRUNC_S_D_DESC;
def FTRUNC_U_W : FTRUNC_U_W_ENC, FTRUNC_U_W_DESC;
def FTRUNC_U_D : FTRUNC_U_D_ENC, FTRUNC_U_D_DESC;
+def : MipsPat<(fsub MSA128WOpnd:$wd, (fmul MSA128WOpnd:$ws, MSA128WOpnd:$wt)),
+ (FMSUB_W MSA128WOpnd:$wd, MSA128WOpnd:$ws, MSA128WOpnd:$wt)>,
+ ISA_MIPS1, ASE_MSA, FPOP_FUSION_FAST;
+def : MipsPat<(fsub MSA128DOpnd:$wd, (fmul MSA128DOpnd:$ws, MSA128DOpnd:$wt)),
+ (FMSUB_D MSA128DOpnd:$wd, MSA128DOpnd:$ws, MSA128DOpnd:$wt)>,
+ ISA_MIPS1, ASE_MSA, FPOP_FUSION_FAST;
+
+def : MipsPat<(fadd MSA128WOpnd:$wd, (fmul MSA128WOpnd:$ws, MSA128WOpnd:$wt)),
+ (FMADD_W MSA128WOpnd:$wd, MSA128WOpnd:$ws, MSA128WOpnd:$wt)>,
+ ISA_MIPS1, ASE_MSA, FPOP_FUSION_FAST;
+def : MipsPat<(fadd MSA128DOpnd:$wd, (fmul MSA128DOpnd:$ws, MSA128DOpnd:$wt)),
+ (FMADD_D MSA128DOpnd:$wd, MSA128DOpnd:$ws, MSA128DOpnd:$wt)>,
+ ISA_MIPS1, ASE_MSA, FPOP_FUSION_FAST;
+
def HADD_S_H : HADD_S_H_ENC, HADD_S_H_DESC;
def HADD_S_W : HADD_S_W_ENC, HADD_S_W_DESC;
def HADD_S_D : HADD_S_D_ENC, HADD_S_D_DESC;
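The four patterns added above map a separate multiply feeding an add or subtract onto the fused MSA ops when, per the FPOP_FUSION_FAST predicate, fast FP contraction is allowed; per element, fmsub.[wd] computes $wd - $ws * $wt and fmadd.[wd] computes $wd + $ws * $wt. A scalar sketch of that arithmetic (illustration only):

    // Element-wise meaning of the fused patterns above (scalar sketch only;
    // the real ops work on whole MSA vectors).
    float fmsubElem(float Wd, float Ws, float Wt) { return Wd - Ws * Wt; }
    float fmaddElem(float Wd, float Ws, float Wt) { return Wd + Ws * Wt; }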
@@ -3733,7 +3752,7 @@ def SZ_V_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAnyZero, v16i8,
// Pseudoes used to implement transparent fp16 support.
-let Predicates = [HasMSA] in {
+let ASEPredicate = [HasMSA] in {
def ST_F16 : MipsPseudo<(outs), (ins MSA128F16:$ws, mem_simm10:$addr),
[(store (f16 MSA128F16:$ws), (addrimm10:$addr))]> {
let usesCustomInserter = 1;
@@ -3773,12 +3792,13 @@ let Predicates = [HasMSA] in {
}
def : MipsPat<(MipsTruncIntFP MSA128F16:$ws),
- (TRUNC_W_D64 (MSA_FP_EXTEND_D_PSEUDO MSA128F16:$ws))>;
+ (TRUNC_W_D64 (MSA_FP_EXTEND_D_PSEUDO MSA128F16:$ws))>, ISA_MIPS1,
+ ASE_MSA;
def : MipsPat<(MipsFPCmp MSA128F16:$ws, MSA128F16:$wt, imm:$cond),
(FCMP_S32 (MSA_FP_EXTEND_W_PSEUDO MSA128F16:$ws),
(MSA_FP_EXTEND_W_PSEUDO MSA128F16:$wt), imm:$cond)>,
- ISA_MIPS1_NOT_32R6_64R6;
+ ISA_MIPS1_NOT_32R6_64R6, ASE_MSA;
}
def vsplati64_imm_eq_63 : PatLeaf<(bitconvert (v4i32 (build_vector))), [{
diff --git a/lib/Target/Mips/MipsMTInstrFormats.td b/lib/Target/Mips/MipsMTInstrFormats.td
index edc0981e6278..c2c22e2ad61c 100644
--- a/lib/Target/Mips/MipsMTInstrFormats.td
+++ b/lib/Target/Mips/MipsMTInstrFormats.td
@@ -15,8 +15,7 @@
//
//===----------------------------------------------------------------------===//
-class MipsMTInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>,
- PredicateControl {
+class MipsMTInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther> {
let DecoderNamespace = "Mips";
let EncodingPredicates = [HasStdEnc];
}
diff --git a/lib/Target/Mips/MipsMachineFunction.cpp b/lib/Target/Mips/MipsMachineFunction.cpp
index 1ee56d830090..81b4352670c0 100644
--- a/lib/Target/Mips/MipsMachineFunction.cpp
+++ b/lib/Target/Mips/MipsMachineFunction.cpp
@@ -29,25 +29,27 @@ bool MipsFunctionInfo::globalBaseRegSet() const {
return GlobalBaseReg;
}
+static const TargetRegisterClass &getGlobalBaseRegClass(MachineFunction &MF) {
+ auto &STI = static_cast<const MipsSubtarget &>(MF.getSubtarget());
+ auto &TM = static_cast<const MipsTargetMachine &>(MF.getTarget());
+
+ if (STI.inMips16Mode())
+ return Mips::CPU16RegsRegClass;
+
+ if (STI.inMicroMipsMode())
+ return Mips::GPRMM16RegClass;
+
+ if (TM.getABI().IsN64())
+ return Mips::GPR64RegClass;
+
+ return Mips::GPR32RegClass;
+}
+
unsigned MipsFunctionInfo::getGlobalBaseReg() {
- // Return if it has already been initialized.
- if (GlobalBaseReg)
- return GlobalBaseReg;
-
- MipsSubtarget const &STI =
- static_cast<const MipsSubtarget &>(MF.getSubtarget());
-
- const TargetRegisterClass *RC =
- STI.inMips16Mode()
- ? &Mips::CPU16RegsRegClass
- : STI.inMicroMipsMode()
- ? &Mips::GPRMM16RegClass
- : static_cast<const MipsTargetMachine &>(MF.getTarget())
- .getABI()
- .IsN64()
- ? &Mips::GPR64RegClass
- : &Mips::GPR32RegClass;
- return GlobalBaseReg = MF.getRegInfo().createVirtualRegister(RC);
+ if (!GlobalBaseReg)
+ GlobalBaseReg =
+ MF.getRegInfo().createVirtualRegister(&getGlobalBaseRegClass(MF));
+ return GlobalBaseReg;
}
void MipsFunctionInfo::createEhDataRegsFI() {
diff --git a/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp b/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp
index ceacaa498389..a2b55e8bddcd 100644
--- a/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp
@@ -11,6 +11,7 @@
#include "Mips.h"
#include "MipsTargetMachine.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/StackProtector.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -32,6 +33,7 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetPassConfig>();
+ AU.addPreserved<StackProtector>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -42,7 +44,7 @@ namespace {
}
bool MipsModuleDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(errs() << "In MipsModuleDAGToDAGISel::runMachineFunction\n");
+ LLVM_DEBUG(errs() << "In MipsModuleDAGToDAGISel::runMachineFunction\n");
auto &TPC = getAnalysis<TargetPassConfig>();
auto &TM = TPC.getTM<MipsTargetMachine>();
TM.resetSubtarget(&MF);
diff --git a/lib/Target/Mips/MipsOptimizePICCall.cpp b/lib/Target/Mips/MipsOptimizePICCall.cpp
index a9ca31a6d09f..27bc4843f410 100644
--- a/lib/Target/Mips/MipsOptimizePICCall.cpp
+++ b/lib/Target/Mips/MipsOptimizePICCall.cpp
@@ -27,7 +27,6 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -35,6 +34,7 @@
#include "llvm/Support/Allocator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/RecyclingAllocator.h"
#include <cassert>
#include <utility>
@@ -90,10 +90,10 @@ public:
}
private:
- /// \brief Visit MBB.
+ /// Visit MBB.
bool visitNode(MBBInfo &MBBI);
- /// \brief Test if MI jumps to a function via a register.
+ /// Test if MI jumps to a function via a register.
///
/// Also, return the virtual register containing the target function's address
/// and the underlying object in Reg and Val respectively, if the function's
@@ -101,15 +101,15 @@ private:
bool isCallViaRegister(MachineInstr &MI, unsigned &Reg,
ValueType &Val) const;
- /// \brief Return the number of instructions that dominate the current
+ /// Return the number of instructions that dominate the current
/// instruction and load the function address from object Entry.
unsigned getCount(ValueType Entry);
- /// \brief Return the destination virtual register of the last instruction
+ /// Return the destination virtual register of the last instruction
/// that loads from object Entry.
unsigned getReg(ValueType Entry);
- /// \brief Update ScopedHT.
+ /// Update ScopedHT.
void incCntAndSetReg(ValueType Entry, unsigned Reg);
ScopedHTType ScopedHT;
diff --git a/lib/Target/Mips/MipsOs16.cpp b/lib/Target/Mips/MipsOs16.cpp
index 7ee45c28a7d0..4edcb3132ada 100644
--- a/lib/Target/Mips/MipsOs16.cpp
+++ b/lib/Target/Mips/MipsOs16.cpp
@@ -96,7 +96,8 @@ static bool needsFP(Function &F) {
;
}
if (const CallInst *CI = dyn_cast<CallInst>(I)) {
- DEBUG(dbgs() << "Working on call" << "\n");
+ LLVM_DEBUG(dbgs() << "Working on call"
+ << "\n");
Function &F_ = *CI->getCalledFunction();
if (needsFPFromSig(F_))
return true;
@@ -110,9 +111,10 @@ bool MipsOs16::runOnModule(Module &M) {
bool usingMask = Mips32FunctionMask.length() > 0;
bool doneUsingMask = false; // this will make it stop repeating
- DEBUG(dbgs() << "Run on Module MipsOs16 \n" << Mips32FunctionMask << "\n");
+ LLVM_DEBUG(dbgs() << "Run on Module MipsOs16 \n"
+ << Mips32FunctionMask << "\n");
if (usingMask)
- DEBUG(dbgs() << "using mask \n" << Mips32FunctionMask << "\n");
+ LLVM_DEBUG(dbgs() << "using mask \n" << Mips32FunctionMask << "\n");
unsigned int functionIndex = 0;
bool modified = false;
@@ -121,14 +123,14 @@ bool MipsOs16::runOnModule(Module &M) {
if (F.isDeclaration())
continue;
- DEBUG(dbgs() << "Working on " << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Working on " << F.getName() << "\n");
if (usingMask) {
if (!doneUsingMask) {
if (functionIndex == Mips32FunctionMask.length())
functionIndex = 0;
switch (Mips32FunctionMask[functionIndex]) {
case '1':
- DEBUG(dbgs() << "mask forced mips32: " << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "mask forced mips32: " << F.getName() << "\n");
F.addFnAttr("nomips16");
break;
case '.':
@@ -142,11 +144,11 @@ bool MipsOs16::runOnModule(Module &M) {
}
else {
if (needsFP(F)) {
- DEBUG(dbgs() << "os16 forced mips32: " << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "os16 forced mips32: " << F.getName() << "\n");
F.addFnAttr("nomips16");
}
else {
- DEBUG(dbgs() << "os16 forced mips16: " << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "os16 forced mips16: " << F.getName() << "\n");
F.addFnAttr("mips16");
}
}
diff --git a/lib/Target/Mips/MipsRegisterBankInfo.cpp b/lib/Target/Mips/MipsRegisterBankInfo.cpp
new file mode 100644
index 000000000000..cef21f447205
--- /dev/null
+++ b/lib/Target/Mips/MipsRegisterBankInfo.cpp
@@ -0,0 +1,100 @@
+//===- MipsRegisterBankInfo.cpp ---------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the RegisterBankInfo class for Mips.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "MipsInstrInfo.h"
+#include "MipsRegisterBankInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+#define GET_TARGET_REGBANK_IMPL
+
+#define DEBUG_TYPE "registerbankinfo"
+
+#include "MipsGenRegisterBank.inc"
+
+namespace llvm {
+namespace Mips {
+enum PartialMappingIdx {
+ PMI_GPR,
+ PMI_Min = PMI_GPR,
+};
+
+RegisterBankInfo::PartialMapping PartMappings[]{
+ {0, 32, GPRBRegBank}
+};
+
+enum ValueMappingIdx { InvalidIdx = 0, GPRIdx = 1 };
+
+RegisterBankInfo::ValueMapping ValueMappings[] = {
+ // invalid
+ {nullptr, 0},
+ // 3 operands in GPRs
+ {&PartMappings[PMI_GPR - PMI_Min], 1},
+ {&PartMappings[PMI_GPR - PMI_Min], 1},
+ {&PartMappings[PMI_GPR - PMI_Min], 1}};
+
+} // end namespace Mips
+} // end namespace llvm
+
+using namespace llvm;
+
+MipsRegisterBankInfo::MipsRegisterBankInfo(const TargetRegisterInfo &TRI)
+ : MipsGenRegisterBankInfo() {}
+
+const RegisterBank &MipsRegisterBankInfo::getRegBankFromRegClass(
+ const TargetRegisterClass &RC) const {
+ using namespace Mips;
+
+ switch (RC.getID()) {
+ case Mips::GPR32RegClassID:
+ case Mips::CPU16Regs_and_GPRMM16ZeroRegClassID:
+ case Mips::GPRMM16MoveP_and_CPU16Regs_and_GPRMM16ZeroRegClassID:
+ case Mips::SP32RegClassID:
+ return getRegBank(Mips::GPRBRegBankID);
+ default:
+ llvm_unreachable("Register class not supported");
+ }
+}
+
+const RegisterBankInfo::InstructionMapping &
+MipsRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
+
+ unsigned Opc = MI.getOpcode();
+
+ const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
+ if (Mapping.isValid())
+ return Mapping;
+
+ using namespace TargetOpcode;
+
+ unsigned NumOperands = MI.getNumOperands();
+ const ValueMapping *OperandsMapping = &Mips::ValueMappings[Mips::GPRIdx];
+
+ switch (Opc) {
+ case G_ADD:
+ case G_LOAD:
+ case G_STORE:
+ case G_GEP:
+ OperandsMapping = &Mips::ValueMappings[Mips::GPRIdx];
+ break;
+ case G_CONSTANT:
+ case G_FRAME_INDEX:
+ OperandsMapping =
+ getOperandsMapping({&Mips::ValueMappings[Mips::GPRIdx], nullptr});
+ break;
+ default:
+ return getInvalidInstructionMapping();
+ }
+
+ return getInstructionMapping(DefaultMappingID, /*Cost=*/1, OperandsMapping,
+ NumOperands);
+}
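In the mapping above, every register operand of the handled generic opcodes lives in the single 32-bit GPRB bank, and the non-register operand of G_CONSTANT and G_FRAME_INDEX gets no bank mapping at all (nullptr). A toy model of that decision, with made-up structures standing in for PartialMapping/ValueMapping:

    #include <string>
    #include <vector>

    // Toy operand mapping: which bits of which bank each operand occupies.
    struct ToyOperandMapping {
      unsigned StartIdx;  // first bit covered
      unsigned Length;    // number of bits covered
      std::string Bank;   // empty string means "not a register operand"
    };

    std::vector<ToyOperandMapping> mapOperands(const std::string &Opcode) {
      const ToyOperandMapping GPR{0, 32, "GPRB"};
      const ToyOperandMapping None{0, 0, ""};
      if (Opcode == "G_ADD" || Opcode == "G_GEP")
        return {GPR, GPR, GPR};   // def and two uses, all in GPRB
      if (Opcode == "G_LOAD" || Opcode == "G_STORE")
        return {GPR, GPR};        // value and pointer, both in GPRB
      if (Opcode == "G_CONSTANT" || Opcode == "G_FRAME_INDEX")
        return {GPR, None};       // def in GPRB, immediate/index unmapped
      return {};                  // anything else: no valid mapping
    }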
diff --git a/lib/Target/Mips/MipsRegisterBankInfo.h b/lib/Target/Mips/MipsRegisterBankInfo.h
new file mode 100644
index 000000000000..64a79abaa74d
--- /dev/null
+++ b/lib/Target/Mips/MipsRegisterBankInfo.h
@@ -0,0 +1,43 @@
+//===- MipsRegisterBankInfo.h -----------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the RegisterBankInfo class for Mips.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSREGISTERBANKINFO_H
+#define LLVM_LIB_TARGET_MIPS_MIPSREGISTERBANKINFO_H
+
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+
+#define GET_REGBANK_DECLARATIONS
+#include "MipsGenRegisterBank.inc"
+
+namespace llvm {
+
+class TargetRegisterInfo;
+
+class MipsGenRegisterBankInfo : public RegisterBankInfo {
+#define GET_TARGET_REGBANK_CLASS
+#include "MipsGenRegisterBank.inc"
+};
+
+/// This class provides the information for the target register banks.
+class MipsRegisterBankInfo final : public MipsGenRegisterBankInfo {
+public:
+ MipsRegisterBankInfo(const TargetRegisterInfo &TRI);
+
+ const RegisterBank &
+ getRegBankFromRegClass(const TargetRegisterClass &RC) const override;
+
+ const InstructionMapping &
+ getInstrMapping(const MachineInstr &MI) const override;
+};
+} // end namespace llvm
+#endif
diff --git a/lib/Target/Hexagon/HexagonDepDecoders.h b/lib/Target/Mips/MipsRegisterBanks.td
index 020362a95909..5f1687048fac 100644
--- a/lib/Target/Hexagon/HexagonDepDecoders.h
+++ b/lib/Target/Mips/MipsRegisterBanks.td
@@ -1,4 +1,4 @@
-//===- HexagonDepDecoders.h -----------------------------------------------===//
+//===- MipsRegisterBanks.td --------------------------------*- tablegen -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -6,8 +6,8 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-// Automatically generated file, please consult code owner before editing.
+//
+//
//===----------------------------------------------------------------------===//
-
-
+def GPRBRegBank : RegisterBank<"GPRB", [GPR32]>;
diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp
index 0e0d82270c89..3c108c2ba9b7 100644
--- a/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -275,18 +275,20 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
MachineInstr &MI = *II;
MachineFunction &MF = *MI.getParent()->getParent();
- DEBUG(errs() << "\nFunction : " << MF.getName() << "\n";
- errs() << "<--------->\n" << MI);
+ LLVM_DEBUG(errs() << "\nFunction : " << MF.getName() << "\n";
+ errs() << "<--------->\n"
+ << MI);
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
uint64_t stackSize = MF.getFrameInfo().getStackSize();
int64_t spOffset = MF.getFrameInfo().getObjectOffset(FrameIndex);
- DEBUG(errs() << "FrameIndex : " << FrameIndex << "\n"
- << "spOffset : " << spOffset << "\n"
- << "stackSize : " << stackSize << "\n"
- << "alignment : "
- << MF.getFrameInfo().getObjectAlignment(FrameIndex) << "\n");
+ LLVM_DEBUG(errs() << "FrameIndex : " << FrameIndex << "\n"
+ << "spOffset : " << spOffset << "\n"
+ << "stackSize : " << stackSize << "\n"
+ << "alignment : "
+ << MF.getFrameInfo().getObjectAlignment(FrameIndex)
+ << "\n");
eliminateFI(MI, FIOperandNum, FrameIndex, stackSize, spOffset);
}
diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h
index fe8d7953ec8f..4cc50fb981ba 100644
--- a/lib/Target/Mips/MipsRegisterInfo.h
+++ b/lib/Target/Mips/MipsRegisterInfo.h
@@ -57,6 +57,8 @@ public:
BitVector getReservedRegs(const MachineFunction &MF) const override;
+ bool enableMultipleCopyHints() const override { return true; }
+
bool requiresRegisterScavenging(const MachineFunction &MF) const override;
bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
@@ -72,7 +74,7 @@ public:
/// Debug information queries.
unsigned getFrameRegister(const MachineFunction &MF) const override;
- /// \brief Return GPR register class.
+ /// Return GPR register class.
virtual const TargetRegisterClass *intRegClass(unsigned Size) const = 0;
private:
diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp
index eb1eea7925c0..687c9f676b34 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -394,7 +394,6 @@ MipsSEFrameLowering::MipsSEFrameLowering(const MipsSubtarget &STI)
void MipsSEFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
MachineFrameInfo &MFI = MF.getFrameInfo();
MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
@@ -682,7 +681,7 @@ void MipsSEFrameLowering::emitInterruptPrologueStub(
void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+ MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
MachineFrameInfo &MFI = MF.getFrameInfo();
MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
@@ -691,7 +690,7 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF,
const MipsRegisterInfo &RegInfo =
*static_cast<const MipsRegisterInfo *>(STI.getRegisterInfo());
- DebugLoc DL = MBBI->getDebugLoc();
+ DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
MipsABIInfo ABI = STI.getABI();
unsigned SP = ABI.GetStackPtr();
unsigned FP = ABI.GetFramePtr();
@@ -790,7 +789,6 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
const std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo *TRI) const {
MachineFunction *MF = MBB.getParent();
- MachineBasicBlock *EntryBlock = &MF->front();
const TargetInstrInfo &TII = *STI.getInstrInfo();
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
@@ -803,7 +801,7 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
bool IsRAAndRetAddrIsTaken = (Reg == Mips::RA || Reg == Mips::RA_64)
&& MF->getFrameInfo().isReturnAddressTaken();
if (!IsRAAndRetAddrIsTaken)
- EntryBlock->addLiveIn(Reg);
+ MBB.addLiveIn(Reg);
// ISRs require HI/LO to be spilled into kernel registers to be then
// spilled to the stack frame.
@@ -828,7 +826,7 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
// Insert the spill to the stack frame.
bool IsKill = !IsRAAndRetAddrIsTaken;
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- TII.storeRegToStackSlot(*EntryBlock, MI, Reg, IsKill,
+ TII.storeRegToStackSlot(MBB, MI, Reg, IsKill,
CSI[i].getFrameIdx(), RC, TRI);
}
@@ -882,9 +880,10 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF,
// Expand pseudo instructions which load, store or copy accumulators.
// Add an emergency spill slot if a pseudo was expanded.
if (ExpandPseudo(MF).expand()) {
- // The spill slot should be half the size of the accumulator. If target is
- // mips64, it should be 64-bit, otherwise it should be 32-bt.
- const TargetRegisterClass &RC = STI.hasMips64() ?
+ // The spill slot should be half the size of the accumulator. If the target
+ // has 64-bit general-purpose registers, it should be 64-bit; otherwise it
+ // should be 32-bit.
+ const TargetRegisterClass &RC = STI.isGP64bit() ?
Mips::GPR64RegClass : Mips::GPR32RegClass;
int FI = MF.getFrameInfo().CreateStackObject(TRI->getSpillSize(RC),
TRI->getSpillAlignment(RC),
diff --git a/lib/Target/Mips/MipsSEFrameLowering.h b/lib/Target/Mips/MipsSEFrameLowering.h
index de8e6eed31d7..cb2119d6880b 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.h
+++ b/lib/Target/Mips/MipsSEFrameLowering.h
@@ -40,7 +40,6 @@ public:
void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS) const override;
- unsigned ehDataReg(unsigned I) const;
private:
void emitInterruptEpilogueStub(MachineFunction &MF,
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index 893cae93e58f..599c1e913acf 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -288,7 +288,7 @@ void MipsSEDAGToDAGISel::selectAddE(SDNode *Node, const SDLoc &DL) const {
SDValue(Carry, 0)};
SDNode *DSPCFWithCarry = CurDAG->getMachineNode(Mips::INS, DL, MVT::i32, Ops);
- // My reading of the the MIPS DSP 3.01 specification isn't as clear as I
+ // My reading of the MIPS DSP 3.01 specification isn't as clear as I
// would like about whether bit 20 always gets overwritten by addwc.
// Hence take an extremely conservative view and presume it's sticky. We
// therefore need to clear it.
@@ -976,9 +976,9 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) {
}
SDNode *Rdhwr =
- CurDAG->getMachineNode(RdhwrOpc, DL,
- Node->getValueType(0),
- CurDAG->getRegister(Mips::HWR29, MVT::i32));
+ CurDAG->getMachineNode(RdhwrOpc, DL, Node->getValueType(0),
+ CurDAG->getRegister(Mips::HWR29, MVT::i32),
+ CurDAG->getTargetConstant(0, DL, MVT::i32));
SDValue Chain = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, DestReg,
SDValue(Rdhwr, 0));
SDValue ResNode = CurDAG->getCopyFromReg(Chain, DL, DestReg, PtrVT);
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.h b/lib/Target/Mips/MipsSEISelDAGToDAG.h
index 6f38289c5a45..eb3657aae050 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.h
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.h
@@ -93,37 +93,37 @@ private:
bool selectIntAddrSImm10Lsl3(SDValue Addr, SDValue &Base,
SDValue &Offset) const override;
- /// \brief Select constant vector splats.
+ /// Select constant vector splats.
bool selectVSplat(SDNode *N, APInt &Imm,
unsigned MinSizeInBits) const override;
- /// \brief Select constant vector splats whose value fits in a given integer.
+ /// Select constant vector splats whose value fits in a given integer.
bool selectVSplatCommon(SDValue N, SDValue &Imm, bool Signed,
unsigned ImmBitSize) const;
- /// \brief Select constant vector splats whose value fits in a uimm1.
+ /// Select constant vector splats whose value fits in a uimm1.
bool selectVSplatUimm1(SDValue N, SDValue &Imm) const override;
- /// \brief Select constant vector splats whose value fits in a uimm2.
+ /// Select constant vector splats whose value fits in a uimm2.
bool selectVSplatUimm2(SDValue N, SDValue &Imm) const override;
- /// \brief Select constant vector splats whose value fits in a uimm3.
+ /// Select constant vector splats whose value fits in a uimm3.
bool selectVSplatUimm3(SDValue N, SDValue &Imm) const override;
- /// \brief Select constant vector splats whose value fits in a uimm4.
+ /// Select constant vector splats whose value fits in a uimm4.
bool selectVSplatUimm4(SDValue N, SDValue &Imm) const override;
- /// \brief Select constant vector splats whose value fits in a uimm5.
+ /// Select constant vector splats whose value fits in a uimm5.
bool selectVSplatUimm5(SDValue N, SDValue &Imm) const override;
- /// \brief Select constant vector splats whose value fits in a uimm6.
+ /// Select constant vector splats whose value fits in a uimm6.
bool selectVSplatUimm6(SDValue N, SDValue &Imm) const override;
- /// \brief Select constant vector splats whose value fits in a uimm8.
+ /// Select constant vector splats whose value fits in a uimm8.
bool selectVSplatUimm8(SDValue N, SDValue &Imm) const override;
- /// \brief Select constant vector splats whose value fits in a simm5.
+ /// Select constant vector splats whose value fits in a simm5.
bool selectVSplatSimm5(SDValue N, SDValue &Imm) const override;
- /// \brief Select constant vector splats whose value is a power of 2.
+ /// Select constant vector splats whose value is a power of 2.
bool selectVSplatUimmPow2(SDValue N, SDValue &Imm) const override;
- /// \brief Select constant vector splats whose value is the inverse of a
+ /// Select constant vector splats whose value is the inverse of a
/// power of 2.
bool selectVSplatUimmInvPow2(SDValue N, SDValue &Imm) const override;
- /// \brief Select constant vector splats whose value is a run of set bits
+ /// Select constant vector splats whose value is a run of set bits
/// ending at the most significant bit
bool selectVSplatMaskL(SDValue N, SDValue &Imm) const override;
- /// \brief Select constant vector splats whose value is a run of set bits
+ /// Select constant vector splats whose value is a run of set bits
/// starting at bit zero.
bool selectVSplatMaskR(SDValue N, SDValue &Imm) const override;
diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp
index f7d7e2af85e4..f625a2903bd7 100644
--- a/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -28,7 +28,6 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
@@ -40,6 +39,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
@@ -104,6 +104,11 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM,
setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::VSELECT);
+
+ if (Subtarget.hasMips32r2()) {
+ setOperationAction(ISD::ADDC, MVT::i32, Legal);
+ setOperationAction(ISD::ADDE, MVT::i32, Legal);
+ }
}
if (Subtarget.hasDSPR2())
@@ -331,8 +336,12 @@ addMSAIntType(MVT::SimpleValueType Ty, const TargetRegisterClass *RC) {
setOperationAction(ISD::SRA, Ty, Legal);
setOperationAction(ISD::SRL, Ty, Legal);
setOperationAction(ISD::SUB, Ty, Legal);
+ setOperationAction(ISD::SMAX, Ty, Legal);
+ setOperationAction(ISD::SMIN, Ty, Legal);
setOperationAction(ISD::UDIV, Ty, Legal);
setOperationAction(ISD::UREM, Ty, Legal);
+ setOperationAction(ISD::UMAX, Ty, Legal);
+ setOperationAction(ISD::UMIN, Ty, Legal);
setOperationAction(ISD::VECTOR_SHUFFLE, Ty, Custom);
setOperationAction(ISD::VSELECT, Ty, Legal);
setOperationAction(ISD::XOR, Ty, Legal);
@@ -701,6 +710,77 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static bool shouldTransformMulToShiftsAddsSubs(APInt C, EVT VT,
+ SelectionDAG &DAG,
+ const MipsSubtarget &Subtarget) {
+ // Estimate the number of operations the transform below will turn a
+ // constant multiply into. The number is approximately how many powers
+ // of two, summed together, the constant can be broken down into.
+
+ SmallVector<APInt, 16> WorkStack(1, C);
+ unsigned Steps = 0;
+ unsigned BitWidth = C.getBitWidth();
+
+ while (!WorkStack.empty()) {
+ APInt Val = WorkStack.pop_back_val();
+
+ if (Val == 0 || Val == 1)
+ continue;
+
+ if (Val.isPowerOf2()) {
+ ++Steps;
+ continue;
+ }
+
+ APInt Floor = APInt(BitWidth, 1) << Val.logBase2();
+ APInt Ceil = Val.isNegative() ? APInt(BitWidth, 0)
+ : APInt(BitWidth, 1) << Val.ceilLogBase2();
+
+ if ((Val - Floor).ule(Ceil - Val)) {
+ WorkStack.push_back(Floor);
+ WorkStack.push_back(Val - Floor);
+ ++Steps;
+ continue;
+ }
+
+ WorkStack.push_back(Ceil);
+ WorkStack.push_back(Ceil - Val);
+ ++Steps;
+
+ // If we have taken more than 12[1] / 8[2] steps to attempt the
+ // optimization for a native-sized value, it is more than likely that this
+ // optimization will make things worse.
+ //
+ // [1] MIPS64 requires at most 6 instructions to materialize any constant;
+ // multiplication requires at least 4 cycles, plus another cycle (or two)
+ // to retrieve the result from the HI/LO registers.
+ //
+ // [2] For MIPS32, more than 8 steps is expensive as the constant can be
+ // materialized in 2 instructions; multiplication requires at least 4
+ // cycles, plus another cycle (or two) to retrieve the result from the
+ // HI/LO registers.
+
+ if (Steps > 12 && (Subtarget.isABI_N32() || Subtarget.isABI_N64()))
+ return false;
+
+ if (Steps > 8 && Subtarget.isABI_O32())
+ return false;
+ }
+
+ // If the value being multiplied is not supported natively, we have to pay
+ // an additional legalization cost; conservatively assume an increase in the
+ // cost of 3 instructions per step. The values for this heuristic were
+ // determined experimentally.
+ unsigned RegisterSize = DAG.getTargetLoweringInfo()
+ .getRegisterType(*DAG.getContext(), VT)
+ .getSizeInBits();
+ Steps *= (VT.getSizeInBits() != RegisterSize) * 3;
+ if (Steps > 27)
+ return false;
+
+ return true;
+}
+
static SDValue genConstMult(SDValue X, APInt C, const SDLoc &DL, EVT VT,
EVT ShiftTy, SelectionDAG &DAG) {
// Return 0.
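As a standalone illustration of the step counting in shouldTransformMulToShiftsAddsSubs above, the sketch below decomposes an unsigned constant into powers of two the same greedy way (nearest power of two below or above, whichever is closer) and counts one step per shift and per add/sub. It is only a scalar model of the heuristic, not the DAG transform, and it ignores the negative-value handling of the APInt version.

    #include <cstdint>
    #include <vector>

    // Count the shift/add/sub steps a greedy power-of-two decomposition of C
    // would take (an unsigned model of the heuristic above).
    unsigned countMulSteps(uint64_t C) {
      std::vector<uint64_t> Work{C};
      unsigned Steps = 0;
      while (!Work.empty()) {
        uint64_t Val = Work.back();
        Work.pop_back();
        if (Val == 0 || Val == 1)
          continue;
        if ((Val & (Val - 1)) == 0) { // power of two: one shift
          ++Steps;
          continue;
        }
        uint64_t Floor = 1;           // largest power of two <= Val
        while ((Floor << 1) != 0 && (Floor << 1) <= Val)
          Floor <<= 1;
        uint64_t Ceil = Floor << 1;   // smallest power of two > Val
        if (Val - Floor <= Ceil - Val) {
          Work.push_back(Floor);      // Val = Floor + (Val - Floor)
          Work.push_back(Val - Floor);
        } else {
          Work.push_back(Ceil);       // Val = Ceil - (Ceil - Val)
          Work.push_back(Ceil - Val);
        }
        ++Steps;                      // the add or sub itself
      }
      return Steps;
    }

    // countMulSteps(10) is 3: 10 = 8 + 2, i.e. (x << 3) + (x << 1), two
    // shifts plus one add.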
@@ -739,11 +819,13 @@ static SDValue genConstMult(SDValue X, APInt C, const SDLoc &DL, EVT VT,
static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG,
const TargetLowering::DAGCombinerInfo &DCI,
- const MipsSETargetLowering *TL) {
+ const MipsSETargetLowering *TL,
+ const MipsSubtarget &Subtarget) {
EVT VT = N->getValueType(0);
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)))
- if (!VT.isVector())
+ if (!VT.isVector() && shouldTransformMulToShiftsAddsSubs(
+ C->getAPIntValue(), VT, DAG, Subtarget))
return genConstMult(N->getOperand(0), C->getAPIntValue(), SDLoc(N), VT,
TL->getScalarShiftAmountTy(DAG.getDataLayout(), VT),
DAG);
@@ -890,46 +972,7 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) {
static SDValue performVSELECTCombine(SDNode *N, SelectionDAG &DAG) {
EVT Ty = N->getValueType(0);
- if (Ty.is128BitVector() && Ty.isInteger()) {
- // Try the following combines:
- // (vselect (setcc $a, $b, SETLT), $b, $a)) -> (vsmax $a, $b)
- // (vselect (setcc $a, $b, SETLE), $b, $a)) -> (vsmax $a, $b)
- // (vselect (setcc $a, $b, SETLT), $a, $b)) -> (vsmin $a, $b)
- // (vselect (setcc $a, $b, SETLE), $a, $b)) -> (vsmin $a, $b)
- // (vselect (setcc $a, $b, SETULT), $b, $a)) -> (vumax $a, $b)
- // (vselect (setcc $a, $b, SETULE), $b, $a)) -> (vumax $a, $b)
- // (vselect (setcc $a, $b, SETULT), $a, $b)) -> (vumin $a, $b)
- // (vselect (setcc $a, $b, SETULE), $a, $b)) -> (vumin $a, $b)
- // SETGT/SETGE/SETUGT/SETUGE variants of these will show up initially but
- // will be expanded to equivalent SETLT/SETLE/SETULT/SETULE versions by the
- // legalizer.
- SDValue Op0 = N->getOperand(0);
-
- if (Op0->getOpcode() != ISD::SETCC)
- return SDValue();
-
- ISD::CondCode CondCode = cast<CondCodeSDNode>(Op0->getOperand(2))->get();
- bool Signed;
-
- if (CondCode == ISD::SETLT || CondCode == ISD::SETLE)
- Signed = true;
- else if (CondCode == ISD::SETULT || CondCode == ISD::SETULE)
- Signed = false;
- else
- return SDValue();
-
- SDValue Op1 = N->getOperand(1);
- SDValue Op2 = N->getOperand(2);
- SDValue Op0Op0 = Op0->getOperand(0);
- SDValue Op0Op1 = Op0->getOperand(1);
-
- if (Op1 == Op0Op0 && Op2 == Op0Op1)
- return DAG.getNode(Signed ? MipsISD::VSMIN : MipsISD::VUMIN, SDLoc(N),
- Ty, Op1, Op2);
- else if (Op1 == Op0Op1 && Op2 == Op0Op0)
- return DAG.getNode(Signed ? MipsISD::VSMAX : MipsISD::VUMAX, SDLoc(N),
- Ty, Op1, Op2);
- } else if ((Ty == MVT::v2i16) || (Ty == MVT::v4i8)) {
+ if (Ty == MVT::v2i16 || Ty == MVT::v4i8) {
SDValue SetCC = N->getOperand(0);
if (SetCC.getOpcode() != MipsISD::SETCC_DSP)
@@ -983,7 +1026,7 @@ MipsSETargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const {
Val = performORCombine(N, DAG, DCI, Subtarget);
break;
case ISD::MUL:
- return performMULCombine(N, DAG, DCI, this);
+ return performMULCombine(N, DAG, DCI, this, Subtarget);
case ISD::SHL:
Val = performSHLCombine(N, DAG, DCI, Subtarget);
break;
@@ -1002,11 +1045,9 @@ MipsSETargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const {
}
if (Val.getNode()) {
- DEBUG(dbgs() << "\nMipsSE DAG Combine:\n";
- N->printrWithDepth(dbgs(), &DAG);
- dbgs() << "\n=> \n";
- Val.getNode()->printrWithDepth(dbgs(), &DAG);
- dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "\nMipsSE DAG Combine:\n";
+ N->printrWithDepth(dbgs(), &DAG); dbgs() << "\n=> \n";
+ Val.getNode()->printrWithDepth(dbgs(), &DAG); dbgs() << "\n");
return Val;
}
@@ -1305,7 +1346,16 @@ static SDValue lowerMSASplatZExt(SDValue Op, unsigned OpNr, SelectionDAG &DAG) {
SDValue LaneB;
if (ResVecTy == MVT::v2i64) {
- LaneB = DAG.getConstant(0, DL, MVT::i32);
+ // In case of the index being passed as an immediate value, set the upper
+ // lane to 0 so that the splati.d instruction can be matched.
+ if (isa<ConstantSDNode>(LaneA))
+ LaneB = DAG.getConstant(0, DL, MVT::i32);
+ // If the index is passed in a register, set the upper lane to the same
+ // value as the lower one - this keeps the BUILD_VECTOR node from being
+ // expanded through the stack. This way we are able to pattern match the
+ // set of nodes created here to splat.d.
+ else
+ LaneB = LaneA;
ViaVecTy = MVT::v4i32;
if(BigEndian)
std::swap(LaneA, LaneB);
@@ -1820,10 +1870,8 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::mips_fmsub_w:
case Intrinsic::mips_fmsub_d: {
// TODO: If intrinsics have fast-math-flags, propagate them.
- EVT ResTy = Op->getValueType(0);
- return DAG.getNode(ISD::FSUB, SDLoc(Op), ResTy, Op->getOperand(1),
- DAG.getNode(ISD::FMUL, SDLoc(Op), ResTy,
- Op->getOperand(2), Op->getOperand(3)));
+ return DAG.getNode(MipsISD::FMS, SDLoc(Op), Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2), Op->getOperand(3));
}
case Intrinsic::mips_frint_w:
case Intrinsic::mips_frint_d:
@@ -1919,49 +1967,49 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::mips_max_s_h:
case Intrinsic::mips_max_s_w:
case Intrinsic::mips_max_s_d:
- return DAG.getNode(MipsISD::VSMAX, DL, Op->getValueType(0),
+ return DAG.getNode(ISD::SMAX, DL, Op->getValueType(0),
Op->getOperand(1), Op->getOperand(2));
case Intrinsic::mips_max_u_b:
case Intrinsic::mips_max_u_h:
case Intrinsic::mips_max_u_w:
case Intrinsic::mips_max_u_d:
- return DAG.getNode(MipsISD::VUMAX, DL, Op->getValueType(0),
+ return DAG.getNode(ISD::UMAX, DL, Op->getValueType(0),
Op->getOperand(1), Op->getOperand(2));
case Intrinsic::mips_maxi_s_b:
case Intrinsic::mips_maxi_s_h:
case Intrinsic::mips_maxi_s_w:
case Intrinsic::mips_maxi_s_d:
- return DAG.getNode(MipsISD::VSMAX, DL, Op->getValueType(0),
+ return DAG.getNode(ISD::SMAX, DL, Op->getValueType(0),
Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG, true));
case Intrinsic::mips_maxi_u_b:
case Intrinsic::mips_maxi_u_h:
case Intrinsic::mips_maxi_u_w:
case Intrinsic::mips_maxi_u_d:
- return DAG.getNode(MipsISD::VUMAX, DL, Op->getValueType(0),
+ return DAG.getNode(ISD::UMAX, DL, Op->getValueType(0),
Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
case Intrinsic::mips_min_s_b:
case Intrinsic::mips_min_s_h:
case Intrinsic::mips_min_s_w:
case Intrinsic::mips_min_s_d:
- return DAG.getNode(MipsISD::VSMIN, DL, Op->getValueType(0),
+ return DAG.getNode(ISD::SMIN, DL, Op->getValueType(0),
Op->getOperand(1), Op->getOperand(2));
case Intrinsic::mips_min_u_b:
case Intrinsic::mips_min_u_h:
case Intrinsic::mips_min_u_w:
case Intrinsic::mips_min_u_d:
- return DAG.getNode(MipsISD::VUMIN, DL, Op->getValueType(0),
+ return DAG.getNode(ISD::UMIN, DL, Op->getValueType(0),
Op->getOperand(1), Op->getOperand(2));
case Intrinsic::mips_mini_s_b:
case Intrinsic::mips_mini_s_h:
case Intrinsic::mips_mini_s_w:
case Intrinsic::mips_mini_s_d:
- return DAG.getNode(MipsISD::VSMIN, DL, Op->getValueType(0),
+ return DAG.getNode(ISD::SMIN, DL, Op->getValueType(0),
Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG, true));
case Intrinsic::mips_mini_u_b:
case Intrinsic::mips_mini_u_h:
case Intrinsic::mips_mini_u_w:
case Intrinsic::mips_mini_u_d:
- return DAG.getNode(MipsISD::VUMIN, DL, Op->getValueType(0),
+ return DAG.getNode(ISD::UMIN, DL, Op->getValueType(0),
Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
case Intrinsic::mips_mod_s_b:
case Intrinsic::mips_mod_s_h:
@@ -2312,7 +2360,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_VOID(SDValue Op,
}
}
-/// \brief Check if the given BuildVectorSDNode is a splat.
+/// Check if the given BuildVectorSDNode is a splat.
/// This method currently relies on DAG nodes being reused when equivalent,
/// so it's possible for this to return false even when isConstantSplat returns
/// true.
diff --git a/lib/Target/Mips/MipsSEISelLowering.h b/lib/Target/Mips/MipsSEISelLowering.h
index 5976ecbcfc61..761ff3b1fa4d 100644
--- a/lib/Target/Mips/MipsSEISelLowering.h
+++ b/lib/Target/Mips/MipsSEISelLowering.h
@@ -15,8 +15,8 @@
#define LLVM_LIB_TARGET_MIPS_MIPSSEISELLOWERING_H
#include "MipsISelLowering.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/Support/MachineValueType.h"
namespace llvm {
@@ -32,11 +32,11 @@ class TargetRegisterClass;
explicit MipsSETargetLowering(const MipsTargetMachine &TM,
const MipsSubtarget &STI);
- /// \brief Enable MSA support for the given integer type and Register
+ /// Enable MSA support for the given integer type and Register
/// class.
void addMSAIntType(MVT::SimpleValueType Ty, const TargetRegisterClass *RC);
- /// \brief Enable MSA support for the given floating-point type and
+ /// Enable MSA support for the given floating-point type and
/// Register class.
void addMSAFloatType(MVT::SimpleValueType Ty,
const TargetRegisterClass *RC);
@@ -82,7 +82,7 @@ class TargetRegisterClass;
SDValue lowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
- /// \brief Lower VECTOR_SHUFFLE into one of a number of instructions
+ /// Lower VECTOR_SHUFFLE into one of a number of instructions
/// depending on the indices in the shuffle.
SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) const;
@@ -92,46 +92,46 @@ class TargetRegisterClass;
MachineBasicBlock *emitMSACBranchPseudo(MachineInstr &MI,
MachineBasicBlock *BB,
unsigned BranchOp) const;
- /// \brief Emit the COPY_FW pseudo instruction
+ /// Emit the COPY_FW pseudo instruction
MachineBasicBlock *emitCOPY_FW(MachineInstr &MI,
MachineBasicBlock *BB) const;
- /// \brief Emit the COPY_FD pseudo instruction
+ /// Emit the COPY_FD pseudo instruction
MachineBasicBlock *emitCOPY_FD(MachineInstr &MI,
MachineBasicBlock *BB) const;
- /// \brief Emit the INSERT_FW pseudo instruction
+ /// Emit the INSERT_FW pseudo instruction
MachineBasicBlock *emitINSERT_FW(MachineInstr &MI,
MachineBasicBlock *BB) const;
- /// \brief Emit the INSERT_FD pseudo instruction
+ /// Emit the INSERT_FD pseudo instruction
MachineBasicBlock *emitINSERT_FD(MachineInstr &MI,
MachineBasicBlock *BB) const;
- /// \brief Emit the INSERT_([BHWD]|F[WD])_VIDX pseudo instruction
+ /// Emit the INSERT_([BHWD]|F[WD])_VIDX pseudo instruction
MachineBasicBlock *emitINSERT_DF_VIDX(MachineInstr &MI,
MachineBasicBlock *BB,
unsigned EltSizeInBytes,
bool IsFP) const;
- /// \brief Emit the FILL_FW pseudo instruction
+ /// Emit the FILL_FW pseudo instruction
MachineBasicBlock *emitFILL_FW(MachineInstr &MI,
MachineBasicBlock *BB) const;
- /// \brief Emit the FILL_FD pseudo instruction
+ /// Emit the FILL_FD pseudo instruction
MachineBasicBlock *emitFILL_FD(MachineInstr &MI,
MachineBasicBlock *BB) const;
- /// \brief Emit the FEXP2_W_1 pseudo instructions.
+ /// Emit the FEXP2_W_1 pseudo instructions.
MachineBasicBlock *emitFEXP2_W_1(MachineInstr &MI,
MachineBasicBlock *BB) const;
- /// \brief Emit the FEXP2_D_1 pseudo instructions.
+ /// Emit the FEXP2_D_1 pseudo instructions.
MachineBasicBlock *emitFEXP2_D_1(MachineInstr &MI,
MachineBasicBlock *BB) const;
- /// \brief Emit the FILL_FW pseudo instruction
+ /// Emit the FILL_FW pseudo instruction
MachineBasicBlock *emitLD_F16_PSEUDO(MachineInstr &MI,
MachineBasicBlock *BB) const;
- /// \brief Emit the FILL_FD pseudo instruction
+ /// Emit the FILL_FD pseudo instruction
MachineBasicBlock *emitST_F16_PSEUDO(MachineInstr &MI,
MachineBasicBlock *BB) const;
- /// \brief Emit the FEXP2_W_1 pseudo instructions.
+ /// Emit the FEXP2_W_1 pseudo instructions.
MachineBasicBlock *emitFPEXTEND_PSEUDO(MachineInstr &MI,
MachineBasicBlock *BB,
bool IsFGR64) const;
- /// \brief Emit the FEXP2_D_1 pseudo instructions.
+ /// Emit the FEXP2_D_1 pseudo instructions.
MachineBasicBlock *emitFPROUND_PSEUDO(MachineInstr &MI,
MachineBasicBlock *BBi,
bool IsFGR64) const;
diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp
index 59b7679971cd..7ffe4aff474d 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.cpp
+++ b/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -179,6 +179,69 @@ void MipsSEInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MIB.addReg(ZeroReg);
}
+static bool isORCopyInst(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default:
+ break;
+ case Mips::OR_MM:
+ case Mips::OR:
+ if (MI.getOperand(2).getReg() == Mips::ZERO)
+ return true;
+ break;
+ case Mips::OR64:
+ if (MI.getOperand(2).getReg() == Mips::ZERO_64)
+ return true;
+ break;
+ }
+ return false;
+}
+
+/// If @MI is a WRDSP/RDDSP instruction, return true, with @isWrite set to
+/// true if it is a WRDSP instruction.
+static bool isReadOrWriteToDSPReg(const MachineInstr &MI, bool &isWrite) {
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case Mips::WRDSP:
+ case Mips::WRDSP_MM:
+ isWrite = true;
+ break;
+ case Mips::RDDSP:
+ case Mips::RDDSP_MM:
+ isWrite = false;
+ break;
+ }
+ return true;
+}
+
+/// We check for the common case of 'or', as it's MIPS' preferred instruction
+/// for GPR-to-GPR copies, but we have to check the operands to make sure that
+/// is actually the case. Other MIPS move instructions are directly
+/// identifiable.
+bool MipsSEInstrInfo::isCopyInstr(const MachineInstr &MI,
+ const MachineOperand *&Src,
+ const MachineOperand *&Dest) const {
+ bool isDSPControlWrite = false;
+ // The condition matches how the copyPhysReg function creates WRDSP/RDDSP
+ // copy instructions.
+ if (isReadOrWriteToDSPReg(MI, isDSPControlWrite)) {
+ if (!MI.getOperand(1).isImm() || MI.getOperand(1).getImm() != (1<<4))
+ return false;
+ else if (isDSPControlWrite) {
+ Src = &MI.getOperand(0);
+ Dest = &MI.getOperand(2);
+ } else {
+ Dest = &MI.getOperand(0);
+ Src = &MI.getOperand(2);
+ }
+ return true;
+ } else if (MI.isMoveReg() || isORCopyInst(MI)) {
+ Dest = &MI.getOperand(0);
+ Src = &MI.getOperand(1);
+ return true;
+ }
+ return false;
+}
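Editorial note: a rough illustration of how a consumer of this TargetInstrInfo hook might use it. The enclosing function and the debug printing are hypothetical and not part of the patch; only the isCopyInstr(MI, Src, Dest) call reflects the interface added here.

// Hypothetical caller: report every instruction the target recognises as a
// plain register copy, including 'or $d, $s, $zero'.
static void dumpTargetCopies(MachineFunction &MF) {
  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
  for (MachineBasicBlock &MBB : MF)
    for (MachineInstr &MI : MBB) {
      const MachineOperand *Src = nullptr, *Dest = nullptr;
      if (TII.isCopyInstr(MI, Src, Dest))
        dbgs() << "copy: " << Dest->getReg() << " <- " << Src->getReg() << "\n";
    }
}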
+
void MipsSEInstrInfo::
storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
unsigned SrcReg, bool isKill, int FI,
@@ -379,28 +442,30 @@ bool MipsSEInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
expandCvtFPInt(MBB, MI, Mips::CVT_S_W, Mips::MTC1, false);
break;
case Mips::PseudoCVT_D32_W:
- expandCvtFPInt(MBB, MI, Mips::CVT_D32_W, Mips::MTC1, false);
+ Opc = isMicroMips ? Mips::CVT_D32_W_MM : Mips::CVT_D32_W;
+ expandCvtFPInt(MBB, MI, Opc, Mips::MTC1, false);
break;
case Mips::PseudoCVT_S_L:
expandCvtFPInt(MBB, MI, Mips::CVT_S_L, Mips::DMTC1, true);
break;
case Mips::PseudoCVT_D64_W:
- expandCvtFPInt(MBB, MI, Mips::CVT_D64_W, Mips::MTC1, true);
+ Opc = isMicroMips ? Mips::CVT_D64_W_MM : Mips::CVT_D64_W;
+ expandCvtFPInt(MBB, MI, Opc, Mips::MTC1, true);
break;
case Mips::PseudoCVT_D64_L:
expandCvtFPInt(MBB, MI, Mips::CVT_D64_L, Mips::DMTC1, true);
break;
case Mips::BuildPairF64:
- expandBuildPairF64(MBB, MI, false);
+ expandBuildPairF64(MBB, MI, isMicroMips, false);
break;
case Mips::BuildPairF64_64:
- expandBuildPairF64(MBB, MI, true);
+ expandBuildPairF64(MBB, MI, isMicroMips, true);
break;
case Mips::ExtractElementF64:
- expandExtractElementF64(MBB, MI, false);
+ expandExtractElementF64(MBB, MI, isMicroMips, false);
break;
case Mips::ExtractElementF64_64:
- expandExtractElementF64(MBB, MI, true);
+ expandExtractElementF64(MBB, MI, isMicroMips, true);
break;
case Mips::MIPSeh_return32:
case Mips::MIPSeh_return64:
@@ -425,6 +490,10 @@ unsigned MipsSEInstrInfo::getOppositeBranchOpc(unsigned Opc) const {
case Mips::BGEZ: return Mips::BLTZ;
case Mips::BLTZ: return Mips::BGEZ;
case Mips::BLEZ: return Mips::BGTZ;
+ case Mips::BGTZ_MM: return Mips::BLEZ_MM;
+ case Mips::BGEZ_MM: return Mips::BLTZ_MM;
+ case Mips::BLTZ_MM: return Mips::BGEZ_MM;
+ case Mips::BLEZ_MM: return Mips::BGTZ_MM;
case Mips::BEQ64: return Mips::BNE64;
case Mips::BNE64: return Mips::BEQ64;
case Mips::BGTZ64: return Mips::BLEZ64;
@@ -433,16 +502,40 @@ unsigned MipsSEInstrInfo::getOppositeBranchOpc(unsigned Opc) const {
case Mips::BLEZ64: return Mips::BGTZ64;
case Mips::BC1T: return Mips::BC1F;
case Mips::BC1F: return Mips::BC1T;
- case Mips::BEQZC_MM: return Mips::BNEZC_MM;
- case Mips::BNEZC_MM: return Mips::BEQZC_MM;
+ case Mips::BC1T_MM: return Mips::BC1F_MM;
+ case Mips::BC1F_MM: return Mips::BC1T_MM;
+ case Mips::BEQZ16_MM: return Mips::BNEZ16_MM;
+ case Mips::BNEZ16_MM: return Mips::BEQZ16_MM;
+ case Mips::BEQZC_MM: return Mips::BNEZC_MM;
+ case Mips::BNEZC_MM: return Mips::BEQZC_MM;
case Mips::BEQZC: return Mips::BNEZC;
case Mips::BNEZC: return Mips::BEQZC;
- case Mips::BEQC: return Mips::BNEC;
- case Mips::BNEC: return Mips::BEQC;
- case Mips::BGTZC: return Mips::BLEZC;
+ case Mips::BLEZC: return Mips::BGTZC;
case Mips::BGEZC: return Mips::BLTZC;
+ case Mips::BGEC: return Mips::BLTC;
+ case Mips::BGTZC: return Mips::BLEZC;
case Mips::BLTZC: return Mips::BGEZC;
- case Mips::BLEZC: return Mips::BGTZC;
+ case Mips::BLTC: return Mips::BGEC;
+ case Mips::BGEUC: return Mips::BLTUC;
+ case Mips::BLTUC: return Mips::BGEUC;
+ case Mips::BEQC: return Mips::BNEC;
+ case Mips::BNEC: return Mips::BEQC;
+ case Mips::BC1EQZ: return Mips::BC1NEZ;
+ case Mips::BC1NEZ: return Mips::BC1EQZ;
+ case Mips::BEQZC_MMR6: return Mips::BNEZC_MMR6;
+ case Mips::BNEZC_MMR6: return Mips::BEQZC_MMR6;
+ case Mips::BLEZC_MMR6: return Mips::BGTZC_MMR6;
+ case Mips::BGEZC_MMR6: return Mips::BLTZC_MMR6;
+ case Mips::BGEC_MMR6: return Mips::BLTC_MMR6;
+ case Mips::BGTZC_MMR6: return Mips::BLEZC_MMR6;
+ case Mips::BLTZC_MMR6: return Mips::BGEZC_MMR6;
+ case Mips::BLTC_MMR6: return Mips::BGEC_MMR6;
+ case Mips::BGEUC_MMR6: return Mips::BLTUC_MMR6;
+ case Mips::BLTUC_MMR6: return Mips::BGEUC_MMR6;
+ case Mips::BEQC_MMR6: return Mips::BNEC_MMR6;
+ case Mips::BNEC_MMR6: return Mips::BEQC_MMR6;
+ case Mips::BC1EQZC_MMR6: return Mips::BC1NEZC_MMR6;
+ case Mips::BC1NEZC_MMR6: return Mips::BC1EQZC_MMR6;
case Mips::BEQZC64: return Mips::BNEZC64;
case Mips::BNEZC64: return Mips::BEQZC64;
case Mips::BEQC64: return Mips::BNEC64;
@@ -459,6 +552,16 @@ unsigned MipsSEInstrInfo::getOppositeBranchOpc(unsigned Opc) const {
case Mips::BBIT1: return Mips::BBIT0;
case Mips::BBIT032: return Mips::BBIT132;
case Mips::BBIT132: return Mips::BBIT032;
+ case Mips::BZ_B: return Mips::BNZ_B;
+ case Mips::BZ_H: return Mips::BNZ_H;
+ case Mips::BZ_W: return Mips::BNZ_W;
+ case Mips::BZ_D: return Mips::BNZ_D;
+ case Mips::BZ_V: return Mips::BNZ_V;
+ case Mips::BNZ_B: return Mips::BZ_B;
+ case Mips::BNZ_H: return Mips::BZ_H;
+ case Mips::BNZ_W: return Mips::BZ_W;
+ case Mips::BNZ_D: return Mips::BZ_D;
+ case Mips::BNZ_V: return Mips::BZ_V;
}
}
@@ -551,7 +654,13 @@ unsigned MipsSEInstrInfo::getAnalyzableBrOpc(unsigned Opc) const {
Opc == Mips::BGTZC64 || Opc == Mips::BGEZC64 ||
Opc == Mips::BLTZC64 || Opc == Mips::BLEZC64 || Opc == Mips::BC ||
Opc == Mips::BBIT0 || Opc == Mips::BBIT1 || Opc == Mips::BBIT032 ||
- Opc == Mips::BBIT132) ? Opc : 0;
+ Opc == Mips::BBIT132 || Opc == Mips::BC_MMR6 ||
+ Opc == Mips::BEQC_MMR6 || Opc == Mips::BNEC_MMR6 ||
+ Opc == Mips::BLTC_MMR6 || Opc == Mips::BGEC_MMR6 ||
+ Opc == Mips::BLTUC_MMR6 || Opc == Mips::BGEUC_MMR6 ||
+ Opc == Mips::BGTZC_MMR6 || Opc == Mips::BLEZC_MMR6 ||
+ Opc == Mips::BGEZC_MMR6 || Opc == Mips::BLTZC_MMR6 ||
+ Opc == Mips::BEQZC_MMR6 || Opc == Mips::BNEZC_MMR6) ? Opc : 0;
}
void MipsSEInstrInfo::expandRetRA(MachineBasicBlock &MBB,
@@ -651,6 +760,7 @@ void MipsSEInstrInfo::expandCvtFPInt(MachineBasicBlock &MBB,
void MipsSEInstrInfo::expandExtractElementF64(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
+ bool isMicroMips,
bool FP64) const {
unsigned DstReg = I->getOperand(0).getReg();
unsigned SrcReg = I->getOperand(1).getReg();
@@ -682,7 +792,10 @@ void MipsSEInstrInfo::expandExtractElementF64(MachineBasicBlock &MBB,
// We therefore pretend that it reads the bottom 32-bits to
// artificially create a dependency and prevent the scheduler
// changing the behaviour of the code.
- BuildMI(MBB, I, dl, get(FP64 ? Mips::MFHC1_D64 : Mips::MFHC1_D32), DstReg)
+ BuildMI(MBB, I, dl,
+ get(isMicroMips ? (FP64 ? Mips::MFHC1_D64_MM : Mips::MFHC1_D32_MM)
+ : (FP64 ? Mips::MFHC1_D64 : Mips::MFHC1_D32)),
+ DstReg)
.addReg(SrcReg);
} else
BuildMI(MBB, I, dl, get(Mips::MFC1), DstReg).addReg(SubReg);
@@ -690,7 +803,7 @@ void MipsSEInstrInfo::expandExtractElementF64(MachineBasicBlock &MBB,
void MipsSEInstrInfo::expandBuildPairF64(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- bool FP64) const {
+ bool isMicroMips, bool FP64) const {
unsigned DstReg = I->getOperand(0).getReg();
unsigned LoReg = I->getOperand(1).getReg(), HiReg = I->getOperand(2).getReg();
const MCInstrDesc& Mtc1Tdd = get(Mips::MTC1);
@@ -735,7 +848,10 @@ void MipsSEInstrInfo::expandBuildPairF64(MachineBasicBlock &MBB,
// We therefore pretend that it reads the bottom 32-bits to
// artificially create a dependency and prevent the scheduler
// changing the behaviour of the code.
- BuildMI(MBB, I, dl, get(FP64 ? Mips::MTHC1_D64 : Mips::MTHC1_D32), DstReg)
+ BuildMI(MBB, I, dl,
+ get(isMicroMips ? (FP64 ? Mips::MTHC1_D64_MM : Mips::MTHC1_D32_MM)
+ : (FP64 ? Mips::MTHC1_D64 : Mips::MTHC1_D32)),
+ DstReg)
.addReg(DstReg)
.addReg(HiReg);
} else if (Subtarget.isABI_FPXX())
diff --git a/lib/Target/Mips/MipsSEInstrInfo.h b/lib/Target/Mips/MipsSEInstrInfo.h
index b356909bf1cf..fc55716d598a 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.h
+++ b/lib/Target/Mips/MipsSEInstrInfo.h
@@ -47,6 +47,9 @@ public:
const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
+ bool isCopyInstr(const MachineInstr &MI, const MachineOperand *&Src,
+ const MachineOperand *&Dest) const override;
+
void storeRegToStack(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
unsigned SrcReg, bool isKill, int FrameIndex,
@@ -107,9 +110,11 @@ private:
unsigned CvtOpc, unsigned MovOpc, bool IsI64) const;
void expandExtractElementF64(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, bool FP64) const;
+ MachineBasicBlock::iterator I, bool isMicroMips,
+ bool FP64) const;
void expandBuildPairF64(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, bool FP64) const;
+ MachineBasicBlock::iterator I, bool isMicroMips,
+ bool FP64) const;
void expandEhReturn(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const;
};
diff --git a/lib/Target/Mips/MipsSERegisterInfo.cpp b/lib/Target/Mips/MipsSERegisterInfo.cpp
index 2ff6b99e78ff..e7d720a4b769 100644
--- a/lib/Target/Mips/MipsSERegisterInfo.cpp
+++ b/lib/Target/Mips/MipsSERegisterInfo.cpp
@@ -88,10 +88,8 @@ static inline unsigned getLoadStoreOffsetSizeInBits(const unsigned Opcode,
case Mips::SCE:
return 16;
case Mips::LLE_MM:
- case Mips::LLE_MMR6:
case Mips::LL_MM:
case Mips::SCE_MM:
- case Mips::SCE_MMR6:
case Mips::SC_MM:
return 12;
case Mips::LL64_R6:
@@ -100,6 +98,8 @@ static inline unsigned getLoadStoreOffsetSizeInBits(const unsigned Opcode,
case Mips::SC64_R6:
case Mips::SCD_R6:
case Mips::SC_R6:
+ case Mips::LL_MMR6:
+ case Mips::SC_MMR6:
return 9;
case Mips::INLINEASM: {
unsigned ConstraintID = InlineAsm::getMemoryConstraintID(MO.getImm());
@@ -204,7 +204,8 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
Offset = SPOffset + (int64_t)StackSize;
Offset += MI.getOperand(OpNo + 1).getImm();
- DEBUG(errs() << "Offset : " << Offset << "\n" << "<--------->\n");
+ LLVM_DEBUG(errs() << "Offset : " << Offset << "\n"
+ << "<--------->\n");
if (!MI.isDebugValue()) {
// Make sure Offset fits within the field available.
diff --git a/lib/Target/Mips/MipsSchedule.td b/lib/Target/Mips/MipsSchedule.td
index 8ec55ab6284d..64db815a0f4c 100644
--- a/lib/Target/Mips/MipsSchedule.td
+++ b/lib/Target/Mips/MipsSchedule.td
@@ -57,6 +57,14 @@ def II_CFC1 : InstrItinClass;
def II_CFC2 : InstrItinClass;
def II_CLO : InstrItinClass;
def II_CLZ : InstrItinClass;
+def II_CRC32B : InstrItinClass;
+def II_CRC32CB : InstrItinClass;
+def II_CRC32CD : InstrItinClass;
+def II_CRC32CH : InstrItinClass;
+def II_CRC32CW : InstrItinClass;
+def II_CRC32D : InstrItinClass;
+def II_CRC32H : InstrItinClass;
+def II_CRC32W : InstrItinClass;
def II_CTC1 : InstrItinClass;
def II_CTC2 : InstrItinClass;
def II_CVT : InstrItinClass;
@@ -84,8 +92,10 @@ def II_DIVU : InstrItinClass;
def II_DIV_D : InstrItinClass;
def II_DIV_S : InstrItinClass;
def II_DMFC0 : InstrItinClass;
+def II_DMFGC0 : InstrItinClass;
def II_DMT : InstrItinClass;
def II_DMTC0 : InstrItinClass;
+def II_DMTGC0 : InstrItinClass;
def II_DMFC1 : InstrItinClass;
def II_DMTC1 : InstrItinClass;
def II_DMOD : InstrItinClass;
@@ -120,6 +130,9 @@ def II_EVPE : InstrItinClass;
def II_EXT : InstrItinClass; // Any EXT instruction
def II_FLOOR : InstrItinClass;
def II_FORK : InstrItinClass;
+def II_GINVI : InstrItinClass;
+def II_GINVT : InstrItinClass;
+def II_HYPCALL : InstrItinClass;
def II_INS : InstrItinClass; // Any INS instruction
def II_IndirectBranchPseudo : InstrItinClass; // Indirect branch pseudo.
def II_J : InstrItinClass;
@@ -225,6 +238,8 @@ def II_MFHC0 : InstrItinClass;
def II_MFC1 : InstrItinClass;
def II_MFHC1 : InstrItinClass;
def II_MFC2 : InstrItinClass;
+def II_MFGC0 : InstrItinClass;
+def II_MFHGC0 : InstrItinClass;
def II_MFHI_MFLO : InstrItinClass; // mfhi and mflo
def II_MFTR : InstrItinClass;
def II_MOD : InstrItinClass;
@@ -255,6 +270,8 @@ def II_MTHC0 : InstrItinClass;
def II_MTC1 : InstrItinClass;
def II_MTHC1 : InstrItinClass;
def II_MTC2 : InstrItinClass;
+def II_MTGC0 : InstrItinClass;
+def II_MTHGC0 : InstrItinClass;
def II_MTHI_MTLO : InstrItinClass; // mthi and mtlo
def II_MTTR : InstrItinClass;
def II_MUL : InstrItinClass;
@@ -346,6 +363,12 @@ def II_CACHEE : InstrItinClass;
def II_PREFE : InstrItinClass;
def II_LLE : InstrItinClass;
def II_SCE : InstrItinClass;
+def II_TLBGINV : InstrItinClass;
+def II_TLBGINVF : InstrItinClass;
+def II_TLBGP : InstrItinClass;
+def II_TLBGR : InstrItinClass;
+def II_TLBGWI : InstrItinClass;
+def II_TLBGWR : InstrItinClass;
def II_TLBINV : InstrItinClass;
def II_TLBINVF : InstrItinClass;
def II_WRPGPR : InstrItinClass;
@@ -686,5 +709,28 @@ def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [
InstrItinData<II_RDPGPR , [InstrStage<1, [ALU]>]>,
InstrItinData<II_DVP , [InstrStage<1, [ALU]>]>,
InstrItinData<II_EVP , [InstrStage<1, [ALU]>]>,
- InstrItinData<II_YIELD , [InstrStage<5, [ALU]>]>
+ InstrItinData<II_YIELD , [InstrStage<5, [ALU]>]>,
+ InstrItinData<II_CRC32B , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_CRC32H , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_CRC32W , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_CRC32D , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_CRC32CB , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_CRC32CH , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_CRC32CW , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_CRC32CD , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_MFGC0 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MTGC0 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MFHGC0 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MTHGC0 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_HYPCALL , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_TLBGINV , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_TLBGINVF , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_TLBGP , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_TLBGR , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_TLBGWI , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_TLBGWR , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_DMFGC0 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_DMTGC0 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_GINVI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_GINVT , [InstrStage<1, [ALU]>]>
]>;
diff --git a/lib/Target/Mips/MipsScheduleGeneric.td b/lib/Target/Mips/MipsScheduleGeneric.td
index 744392c320ef..79c55dbb9e03 100644
--- a/lib/Target/Mips/MipsScheduleGeneric.td
+++ b/lib/Target/Mips/MipsScheduleGeneric.td
@@ -25,8 +25,11 @@ def MipsGenericModel : SchedMachineModel {
int HighLatency = 37;
list<Predicate> UnsupportedFeatures = [];
- let CompleteModel = 1;
+ let CompleteModel = 0;
let PostRAScheduler = 1;
+
+ // FIXME: Remove when all errors have been fixed.
+ let FullInstRWOverlapCheck = 0;
}
let SchedModel = MipsGenericModel in {
@@ -71,12 +74,12 @@ def : ItinRW<[GenericWriteMDUtoGPR], [II_MUL]>;
def GenericWriteDIV : SchedWriteRes<[GenericIssueDIV]> {
// Estimated worst case
let Latency = 33;
- let ResourceCycles = [1, 33];
+ let ResourceCycles = [33];
}
def GenericWriteDIVU : SchedWriteRes<[GenericIssueDIV]> {
// Estimated worst case
let Latency = 31;
- let ResourceCycles = [1, 31];
+ let ResourceCycles = [31];
}
def : ItinRW<[GenericWriteDIV], [II_DIV]>;
diff --git a/lib/Target/Mips/MipsScheduleP5600.td b/lib/Target/Mips/MipsScheduleP5600.td
index 440f93d5b7eb..846fa11494c7 100644
--- a/lib/Target/Mips/MipsScheduleP5600.td
+++ b/lib/Target/Mips/MipsScheduleP5600.td
@@ -13,14 +13,13 @@ def MipsP5600Model : SchedMachineModel {
int LoadLatency = 4;
int MispredictPenalty = 8; // TODO: Estimated
- let CompleteModel = 1;
+ let CompleteModel = 0;
+ let FullInstRWOverlapCheck = 1;
list<Predicate> UnsupportedFeatures = [HasMips32r6, HasMips64r6,
- HasMips64, HasMips64r2, HasCnMips,
+ HasMips3, HasMips64r2, HasCnMips,
InMicroMips, InMips16Mode,
- HasMicroMips32r6, HasDSP,
- HasDSPR2, HasMT];
-
+ HasDSP, HasDSPR2, HasMT, HasCRC];
}
let SchedModel = MipsP5600Model in {
@@ -37,9 +36,8 @@ def P5600IssueALU : ProcResource<1> { let Super = P5600ALQ; }
def P5600WriteALU : SchedWriteRes<[P5600IssueALU]>;
// and, lui, nor, or, slti, sltiu, sub, subu, xor
-def : ItinRW<[P5600WriteALU],
- [II_AND, II_LUI, II_NOR, II_OR, II_SLTI_SLTIU, II_SUB, II_SUBU,
- II_XOR]>;
+def : InstRW<[P5600WriteALU], (instrs AND, LUi, NOR, OR, SLTi, SLTiu, SUB,
+ SUBu, XOR)>;
// AGQ Pipelines
// =============
@@ -63,20 +61,35 @@ def P5600WriteJumpAndLink : SchedWriteRes<[P5600IssueCTISTD, P5600CTISTD]> {
// b, beq, beql, bg[et]z, bl[et]z, bne, bnel, j, syscall, jal, bltzal,
// jalr, jr.hb, jr
-def : ItinRW<[P5600WriteJump], [II_B, II_BCC, II_BCCZ, II_BCCZAL, II_J, II_JR,
- II_JR_HB, II_DERET, II_ERET, II_ERETNC,
- II_SYSCALL, II_BREAK, II_SDBBP, II_SSNOP,
- II_TEQ, II_TEQI, II_TGE, II_TGEI, II_TGEIU,
- II_TGEU, II_TLT, II_TLTI, II_TLTU, II_TNE,
- II_TNEI, II_TRAP, II_TTLTIU, II_WAIT,
- II_PAUSE]>;
-
-def : ItinRW<[P5600WriteJumpAndLink], [II_JAL, II_JALR, II_JALR_HB]>;
+def : InstRW<[P5600WriteJump], (instrs B, BAL, BAL_BR, BEQ, BEQL, BGEZ, BGEZAL,
+ BGEZALL, BGEZL, BGTZ, BGTZL, BLEZ, BLEZL, BLTZ,
+ BLTZAL, BLTZALL, BLTZL, BNE, BNEL, BREAK,
+ DERET, ERET, ERETNC, J, JR, JR_HB,
+ PseudoIndirectBranch,
+ PseudoIndirectHazardBranch, PseudoReturn,
+ SDBBP, SSNOP, SYSCALL, TAILCALL, TAILCALLREG,
+ TAILCALLREGHB, TEQ, TEQI, TGE, TGEI, TGEIU,
+ TGEU, TLT, TLTI, TLTU, TNE, TNEI, TRAP,
+ TTLTIU, WAIT, PAUSE)>;
+
+def : InstRW<[P5600WriteJumpAndLink], (instrs JAL, JALR, JALRHBPseudo,
+ JALRPseudo, JALR_HB)>;
+
+def : InstRW<[P5600WriteJumpAndLink], (instrs JALX)> {
+ let Unsupported = 1;
+}
def P5600COP0 : SchedWriteRes<[P5600IssueCTISTD, P5600CTISTD]>;
-def : ItinRW<[P5600COP0], [II_TLBINV, II_TLBINVF, II_TLBP, II_TLBR, II_TLBWI,
- II_TLBWR, II_MFC0, II_MTC0]>;
+def : InstRW<[P5600COP0], (instrs TLBINV, TLBINVF, TLBP, TLBR, TLBWI, TLBWR,
+ MFC0, MTC0)>;
+
+def P5600COP2 : SchedWriteRes<[P5600IssueCTISTD, P5600CTISTD]>;
+
+def : InstRW<[P5600COP2], (instrs MFC2, MTC2)> {
+ let Unsupported = 1;
+}
+
// LDST Pipeline
// -------------
@@ -106,21 +119,20 @@ def P5600WriteLoadToOtherUnits : SchedWriteRes<[P5600IssueLDST]> {
}
// l[bhw], l[bh]u, ll
-def : ItinRW<[P5600WriteLoad], [II_LB, II_LBE, II_LBU, II_LBUE, II_LH, II_LHE,
- II_LHU, II_LHUE, II_LW, II_LWE, II_LL, II_LLE,
- II_LWPC]>;
+def : InstRW<[P5600WriteLoad], (instrs LB, LBu, LH, LHu, LW, LL, LWC2, LWC3,
+ LDC2, LDC3, LBE, LBuE, LHE, LHuE, LWE, LLE,
+ LWPC)>;
// lw[lr]
-def : ItinRW<[P5600WriteLoadShifted], [II_LWL, II_LWLE, II_LWR, II_LWRE]>;
+def : InstRW<[P5600WriteLoadShifted], (instrs LWL, LWR, LWLE, LWRE)>;
// s[bhw], sw[lr]
-def : ItinRW<[P5600WriteStore], [II_SB, II_SBE, II_SH, II_SHE, II_SW, II_SWE,
- II_SWL, II_SWLE, II_SWR, II_SWRE, II_SC,
- II_SCE]>;
+def : InstRW<[P5600WriteStore], (instrs SB, SH, SW, SWC2, SWC3, SDC2, SDC3, SC,
+ SBE, SHE, SWE, SCE, SWL, SWR, SWLE, SWRE)>;
// pref, cache, sync, synci
-def : ItinRW<[P5600WriteCache], [II_PREF, II_PREFE, II_CACHE, II_CACHEE,
- II_SYNC, II_SYNCI]>;
+def : InstRW<[P5600WriteCache], (instrs PREF, PREFE, CACHE, CACHEE, SYNC,
+ SYNCI)>;
// LDST is also used in moves from general purpose registers to floating point
// and MSA.
@@ -154,28 +166,31 @@ def P5600WriteAL2MAdd: SchedWriteRes<[P5600IssueAL2, P5600CTISTD]> {
}
// clo, clz, di, ei, mfhi, mflo
-def : ItinRW<[P5600WriteAL2], [II_CLO, II_CLZ, II_DI, II_EI, II_MFHI_MFLO]>;
+def : InstRW<[P5600WriteAL2], (instrs CLO, CLZ, DI, EI, MFHI, MFLO,
+ PseudoMFHI, PseudoMFLO)>;
// ehb, rdhwr, rdpgpr, wrpgpr, wsbh
-def : ItinRW<[P5600WriteAL2ShadowMov], [II_EHB, II_RDHWR, II_WSBH]>;
+def : InstRW<[P5600WriteAL2ShadowMov], (instrs EHB, RDHWR, WSBH)>;
// mov[nz]
-def : ItinRW<[P5600WriteAL2CondMov], [II_MOVN, II_MOVZ]>;
+def : InstRW<[P5600WriteAL2CondMov], (instrs MOVN_I_I, MOVZ_I_I)>;
// divu?
-def : ItinRW<[P5600WriteAL2Div], [II_DIV]>;
-def : ItinRW<[P5600WriteAL2DivU], [II_DIVU]>;
+def : InstRW<[P5600WriteAL2Div], (instrs DIV, PseudoSDIV, SDIV)>;
+def : InstRW<[P5600WriteAL2DivU], (instrs DIVU, PseudoUDIV, UDIV)>;
// mul
-def : ItinRW<[P5600WriteAL2Mul], [II_MUL]>;
+def : InstRW<[P5600WriteAL2Mul], (instrs MUL)>;
// multu?, multu?
-def : ItinRW<[P5600WriteAL2Mult], [II_MULT, II_MULTU]>;
+def : InstRW<[P5600WriteAL2Mult], (instrs MULT, MULTu, PseudoMULT,
+ PseudoMULTu)>;
// maddu?, msubu?, mthi, mtlo
-def : ItinRW<[P5600WriteAL2MAdd],
- [II_MADD, II_MADDU, II_MSUB, II_MSUBU, II_MTHI_MTLO]>;
+def : InstRW<[P5600WriteAL2MAdd], (instrs MADD, MADDU, MSUB, MSUBU,
+ MTHI, MTLO, PseudoMADD, PseudoMADDU,
+ PseudoMSUB, PseudoMSUBU, PseudoMTLOHI)>;
// ext, ins
-def : ItinRW<[P5600WriteAL2BitExt], [II_EXT, II_INS]>;
+def : InstRW<[P5600WriteAL2BitExt], (instrs EXT, INS)>;
// Either ALU or AL2 Pipelines
// ---------------------------
@@ -193,11 +208,9 @@ def P5600WriteEitherALU : SchedWriteVariant<
// add, addi, addiu, addu, andi, ori, rotr, se[bh], sllv?, sr[al]v?, slt, sltu,
// xori
-def : ItinRW<[P5600WriteEitherALU],
- [II_ADD, II_ADDI, II_ADDIU, II_ANDI, II_ORI, II_ROTR, II_SEB, II_SEH,
- II_SLT_SLTU, II_SLL, II_SRA, II_SRL, II_XORI, II_ADDU, II_SLLV,
- II_SRAV, II_SRLV, II_LSA]>;
-def : InstRW<[], (instrs COPY)>;
+def : InstRW<[P5600WriteEitherALU], (instrs ADD, ADDi, ADDiu, ANDi, ORi, ROTR,
+ SEB, SEH, SLT, SLTu, SLL, SRA, SRL, XORi,
+ ADDu, SLLV, SRAV, SRLV, LSA, COPY)>;
// FPU Pipelines
// =============
@@ -300,8 +313,10 @@ def P5600WriteMoveFPULToOtherUnits : SchedWriteRes<[P5600IssueFPUL]>;
// abs.[ds], abs.ps, bc1[tf]l?, mov[tf].[ds], mov[tf], mov.[ds], [cm][ft]c1,
// m[ft]hc1, neg.[ds], neg.ps, nor.v, nori.b, or.v, ori.b, xor.v, xori.b,
// sdxc1, sdc1, st.[bhwd], swc1, swxc1
-def : ItinRW<[P5600WriteFPUS], [II_ABS, II_MOVF_D, II_MOVF_S, II_MOVT_D,
- II_MOVT_S, II_MOV_D, II_MOV_S, II_NEG]>;
+def : InstRW<[P5600WriteFPUS], (instrs FABS_S, FABS_D32, FABS_D64, MOVF_D32,
+ MOVF_D64, MOVF_S, MOVT_D32, MOVT_D64,
+ MOVT_S, FMOV_D32, FMOV_D64, FMOV_S, FNEG_S,
+ FNEG_D32, FNEG_D64)>;
// adds_a.[bhwd], adds_[asu].[bhwd], addvi?.[bhwd], asub_[us].[bhwd],
// aver?_[us].[bhwd], shf.[bhw], fill[bhwd], splat?.[bhwd]
@@ -321,23 +336,6 @@ def : InstRW<[P5600WriteMSAShortLogic], (instregex "^LDI_[BHWD]$")>;
def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(AND|OR|[XN]OR)_V$")>;
def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(AND|OR|[XN]OR)I_B$")>;
-// vshf.[bhwd], binsl.[bhwd], binsr.[bhwd], insert.[bhwd], sld?.[bhwd],
-// bset.[bhwd], bclr.[bhwd], bneg.[bhwd], bsel_v, bseli_b
-def : InstRW<[P5600WriteMSAShortInt], (instregex "^VSHF_[BHWD]$")>;
-def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BINSL|BINSLI)_[BHWD]$")>;
-def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BINSR|BINSRI)_[BHWD]$")>;
-def : InstRW<[P5600WriteMSAShortInt], (instregex "^INSERT_[BHWD]$")>;
-def : InstRW<[P5600WriteMSAShortInt], (instregex "^(SLD|SLDI)_[BHWD]$")>;
-def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BSET|BSETI)_[BHWD]$")>;
-def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BCLR|BCLRI)_[BHWD]$")>;
-def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BNEG|BNEGI)_[BHWD]$")>;
-def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BSEL_V|BSELI_B)$")>;
-def : InstRW<[P5600WriteMSAShortInt], (instregex "^BMN*Z.*$")>;
-
-// pcnt.[bhwd], sat_s.[bhwd], sat_u.bhwd]
-def : InstRW<[P5600WriteMSAOther3], (instregex "^PCNT_[BHWD]$")>;
-def : InstRW<[P5600WriteMSAOther3], (instregex "^SAT_(S|U)_[BHWD]$")>;
-
// fexp2_w, fexp2_d
def : InstRW<[P5600WriteFPUS], (instregex "^FEXP2_(W|D)$")>;
@@ -424,7 +422,6 @@ def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(SRLR|SRLRI)_[BHWD]$")>;
def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(SLL|SLLI)_[BHWD]$")>;
def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(PCKEV|PCKOD)_[BHWD]$")>;
def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(NLOC|NLZC)_[BHWD]$")>;
-def : InstRW<[P5600WriteMSAShortLogic], (instregex "^INSVE_[BHWD]$")>;
// Long Pipe
// ----------
@@ -432,24 +429,31 @@ def : InstRW<[P5600WriteMSAShortLogic], (instregex "^INSVE_[BHWD]$")>;
// add.[ds], add.ps, cvt.d.[sw], cvt.s.[dw], cvt.w.[sd], cvt.[sw].ps,
// cvt.ps.[sw], c.<cc>.[ds], c.<cc>.ps, mul.[ds], mul.ps, sub.[ds], sub.ps,
// trunc.w.[ds], trunc.w.ps
-def : ItinRW<[P5600WriteFPUL],
- [II_ADD_D, II_ADD_S, II_CVT, II_C_CC_D, II_C_CC_S, II_MUL_D,
- II_MUL_S, II_SUB_D, II_SUB_S, II_TRUNC]>;
+def : InstRW<[P5600WriteFPUL],
+ (instrs FADD_D32, FADD_D64, FADD_S, FMUL_D32, FMUL_D64, FMUL_S,
+ FSUB_D32, FSUB_D64, FSUB_S)>;
+def : InstRW<[P5600WriteFPUL], (instregex "^TRUNC_(L|W)_(S|D32|D64)$")>;
+def : InstRW<[P5600WriteFPUL],
+ (instregex "^CVT_(S|D32|D64|L|W)_(S|D32|D64|L|W)$")>;
+def : InstRW<[P5600WriteFPUL], (instregex "^C_[A-Z]+_(S|D32|D64)$")>;
+def : InstRW<[P5600WriteFPUL], (instregex "^FCMP_(S32|D32|D64)$")>;
+def : InstRW<[P5600WriteFPUL], (instregex "^PseudoCVT_(S|D32|D64)_(L|W)$")>;
// div.[ds], div.ps
-def : ItinRW<[P5600WriteFPUDivS], [II_DIV_S]>;
-def : ItinRW<[P5600WriteFPUDivD], [II_DIV_D]>;
+def : InstRW<[P5600WriteFPUDivS], (instrs FDIV_S)>;
+def : InstRW<[P5600WriteFPUDivD], (instrs FDIV_D32, FDIV_D64)>;
// sqrt.[ds], sqrt.ps
-def : ItinRW<[P5600WriteFPUSqrtS], [II_SQRT_S]>;
-def : ItinRW<[P5600WriteFPUSqrtD], [II_SQRT_D]>;
+def : InstRW<[P5600WriteFPUSqrtS], (instrs FSQRT_S)>;
+def : InstRW<[P5600WriteFPUSqrtD], (instrs FSQRT_D32, FSQRT_D64)>;
// frcp.[wd], frsqrt.[wd]
def : InstRW<[P5600WriteFPURsqrtD], (instregex "^FRCP_(W|D)$")>;
def : InstRW<[P5600WriteFPURsqrtD], (instregex "^FRSQRT_(W|D)$")>;
-def : ItinRW<[P5600WriteFPURsqrtD], [II_RECIP_D, II_RSQRT_D]>;
-def : ItinRW<[P5600WriteFPURsqrtS], [II_RECIP_S, II_RSQRT_S]>;
+def : InstRW<[P5600WriteFPURsqrtD], (instrs RECIP_D32, RECIP_D64, RSQRT_D32,
+ RSQRT_D64)>;
+def : InstRW<[P5600WriteFPURsqrtS], (instrs RECIP_S, RSQRT_S)>;
// fmadd.[wd], fmsubb.[wd], fdiv.[wd], fsqrt.[wd], fmul.[wd], fadd.[wd],
// fsub.[wd]
@@ -481,9 +485,9 @@ def : InstRW<[P5600WriteMSALongInt], (instregex "^MUL_Q_[HW]$")>;
// madd.[ds], msub.[ds], nmadd.[ds], nmsub.[ds],
// Operand 0 is read on cycle 5. All other operands are read on operand 0.
-def : ItinRW<[SchedReadAdvance<5>, P5600WriteFPUL_MADDSUB],
- [II_MADD_D, II_MADD_S, II_MSUB_D, II_MSUB_S, II_NMADD_D,
- II_NMADD_S, II_NMSUB_D, II_NMSUB_S]>;
+def : InstRW<[SchedReadAdvance<5>, P5600WriteFPUL_MADDSUB],
+ (instrs MADD_D32, MADD_D64, MADD_S, MSUB_D32, MSUB_D64, MSUB_S,
+ NMADD_D32, NMADD_D64, NMADD_S, NMSUB_D32, NMSUB_D64, NMSUB_S)>;
// madd.ps, msub.ps, nmadd.ps, nmsub.ps
// Operand 0 and 1 are read on cycle 5. All others are read on operand 0.
@@ -536,26 +540,30 @@ def P5600WriteLoadFPU : WriteSequence<[P5600WriteLoadToOtherUnits,
P5600WriteLoadOtherUnitsToFPU]>;
// ctc1, mtc1, mthc1
-def : ItinRW<[P5600WriteMoveGPRToFPU], [II_CTC1, II_MTC1, II_MTHC1]>;
+def : InstRW<[P5600WriteMoveGPRToFPU], (instrs CTC1, MTC1, MTC1_D64, MTHC1_D32,
+ MTHC1_D64, BuildPairF64,
+ BuildPairF64_64)>;
// copy.[su]_[bhwd]
def : InstRW<[P5600WriteMoveFPUToGPR], (instregex "^COPY_U_[BHW]$")>;
def : InstRW<[P5600WriteMoveFPUToGPR], (instregex "^COPY_S_[BHWD]$")>;
// bc1[ft], cfc1, mfc1, mfhc1, movf, movt
-def : ItinRW<[P5600WriteMoveFPUToGPR],
- [II_BC1F, II_BC1FL, II_BC1T, II_BC1TL, II_CFC1, II_MFC1, II_MFHC1, II_MOVF, II_MOVT]>;
+def : InstRW<[P5600WriteMoveFPUToGPR], (instrs BC1F, BC1FL, BC1T, BC1TL, CFC1,
+ MFC1, MFC1_D64, MFHC1_D32, MFHC1_D64,
+ MOVF_I, MOVT_I, ExtractElementF64,
+ ExtractElementF64_64)>;
// swc1, swxc1, st.[bhwd]
-def : ItinRW<[P5600WriteStoreFPUS], [II_SDC1, II_SDXC1, II_SUXC1, II_SWC1,
- II_SWXC1]>;
+def : InstRW<[P5600WriteStoreFPUS], (instrs SDC1, SDXC1, SUXC1, SWC1, SWXC1)>;
def : InstRW<[P5600WriteStoreFPUS], (instregex "^ST_[BHWD]$")>;
// movn.[ds], movz.[ds]
-def : ItinRW<[P5600WriteStoreFPUL], [II_MOVN_D, II_MOVN_S, II_MOVZ_D, II_MOVZ_S]>;
+def : InstRW<[P5600WriteStoreFPUL], (instrs MOVN_I_D32, MOVN_I_D64, MOVN_I_S,
+ MOVZ_I_D32, MOVZ_I_D64, MOVZ_I_S)>;
// l[dw]x?c1, ld.[bhwd]
-def : ItinRW<[P5600WriteLoadFPU], [II_LDC1, II_LDXC1, II_LWC1, II_LWXC1, II_LUXC1]>;
+def : InstRW<[P5600WriteLoadFPU], (instrs LDC1, LDXC1, LWC1, LWXC1, LUXC1)>;
def : InstRW<[P5600WriteLoadFPU], (instregex "LD_[BHWD]")>;
// Unsupported Instructions
@@ -577,10 +585,12 @@ def : InstRW<[P5600WriteLoadFPU], (instregex "LD_[BHWD]")>;
// ceil.[lw].[ds], floor.[lw].[ds]
// Reason behind guess: trunc.[lw].ds and the various cvt's are in FPUL
-def : ItinRW<[P5600WriteFPUL], [II_CEIL, II_FLOOR, II_ROUND]>;
+def : InstRW<[P5600WriteFPUL], (instregex "^CEIL_(L|W)_(S|D32|D64)$")>;
+def : InstRW<[P5600WriteFPUL], (instregex "^FLOOR_(L|W)_(S|D32|D64)$")>;
+def : InstRW<[P5600WriteFPUL], (instregex "^ROUND_(L|W)_(S|D32|D64)$")>;
// rotrv
// Reason behind guess: rotr is in the same category and the two register forms
// generally follow the immediate forms in this category
-def : ItinRW<[P5600WriteEitherALU], [II_ROTRV]>;
+def : InstRW<[P5600WriteEitherALU], (instrs ROTRV)>;
}
diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp
index f6af7e22e351..0c39a45467c4 100644
--- a/lib/Target/Mips/MipsSubtarget.cpp
+++ b/lib/Target/Mips/MipsSubtarget.cpp
@@ -16,6 +16,9 @@
#include "MipsMachineFunction.h"
#include "MipsRegisterInfo.h"
#include "MipsTargetMachine.h"
+#include "MipsCallLowering.h"
+#include "MipsLegalizerInfo.h"
+#include "MipsRegisterBankInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/CommandLine.h"
@@ -57,6 +60,12 @@ static cl::opt<bool>
GPOpt("mgpopt", cl::Hidden,
cl::desc("Enable gp-relative addressing of mips small data items"));
+bool MipsSubtarget::DspWarningPrinted = false;
+bool MipsSubtarget::MSAWarningPrinted = false;
+bool MipsSubtarget::VirtWarningPrinted = false;
+bool MipsSubtarget::CRCWarningPrinted = false;
+bool MipsSubtarget::GINVWarningPrinted = false;
+
void MipsSubtarget::anchor() {}
MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
@@ -71,10 +80,12 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
InMips16HardFloat(Mips16HardFloat), InMicroMipsMode(false), HasDSP(false),
HasDSPR2(false), HasDSPR3(false), AllowMixed16_32(Mixed16_32 | Mips_Os16),
Os16(Mips_Os16), HasMSA(false), UseTCCInDIV(false), HasSym32(false),
- HasEVA(false), DisableMadd4(false), HasMT(false),
- StackAlignOverride(StackAlignOverride), TM(TM), TargetTriple(TT),
- TSInfo(), InstrInfo(MipsInstrInfo::create(
- initializeSubtargetDependencies(CPU, FS, TM))),
+ HasEVA(false), DisableMadd4(false), HasMT(false), HasCRC(false),
+ HasVirt(false), HasGINV(false), UseIndirectJumpsHazard(false),
+ StackAlignOverride(StackAlignOverride),
+ TM(TM), TargetTriple(TT), TSInfo(),
+ InstrInfo(
+ MipsInstrInfo::create(initializeSubtargetDependencies(CPU, FS, TM))),
FrameLowering(MipsFrameLowering::create(*this)),
TLInfo(MipsTargetLowering::create(TM, *this)) {
@@ -107,6 +118,17 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
if (hasMips64r6() && InMicroMipsMode)
report_fatal_error("microMIPS64R6 is not supported", false);
+ if (!isABI_O32() && InMicroMipsMode)
+ report_fatal_error("microMIPS64 is not supported.", false);
+
+ if (UseIndirectJumpsHazard) {
+ if (InMicroMipsMode)
+ report_fatal_error(
+ "cannot combine indirect jumps with hazard barriers and microMIPS");
+ if (!hasMips32r2())
+ report_fatal_error(
+ "indirect jumps with hazard barriers requires MIPS32R2 or later");
+ }
if (hasMips32r6()) {
StringRef ISA = hasMips64r6() ? "MIPS64r6" : "MIPS32r6";
@@ -129,6 +151,59 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
<< "\n";
UseSmallSection = false;
}
+
+ if (hasDSPR2() && !DspWarningPrinted) {
+ if (hasMips64() && !hasMips64r2()) {
+ errs() << "warning: the 'dspr2' ASE requires MIPS64 revision 2 or "
+ << "greater\n";
+ DspWarningPrinted = true;
+ } else if (hasMips32() && !hasMips32r2()) {
+ errs() << "warning: the 'dspr2' ASE requires MIPS32 revision 2 or "
+ << "greater\n";
+ DspWarningPrinted = true;
+ }
+ } else if (hasDSP() && !DspWarningPrinted) {
+ if (hasMips64() && !hasMips64r2()) {
+ errs() << "warning: the 'dsp' ASE requires MIPS64 revision 2 or "
+ << "greater\n";
+ DspWarningPrinted = true;
+ } else if (hasMips32() && !hasMips32r2()) {
+ errs() << "warning: the 'dsp' ASE requires MIPS32 revision 2 or "
+ << "greater\n";
+ DspWarningPrinted = true;
+ }
+ }
+
+ StringRef ArchName = hasMips64() ? "MIPS64" : "MIPS32";
+
+ if (!hasMips32r5() && hasMSA() && !MSAWarningPrinted) {
+ errs() << "warning: the 'msa' ASE requires " << ArchName
+ << " revision 5 or greater\n";
+ MSAWarningPrinted = true;
+ }
+ if (!hasMips32r5() && hasVirt() && !VirtWarningPrinted) {
+ errs() << "warning: the 'virt' ASE requires " << ArchName
+ << " revision 5 or greater\n";
+ VirtWarningPrinted = true;
+ }
+ if (!hasMips32r6() && hasCRC() && !CRCWarningPrinted) {
+ errs() << "warning: the 'crc' ASE requires " << ArchName
+ << " revision 6 or greater\n";
+ CRCWarningPrinted = true;
+ }
+ if (!hasMips32r6() && hasGINV() && !GINVWarningPrinted) {
+ errs() << "warning: the 'ginv' ASE requires " << ArchName
+ << " revision 6 or greater\n";
+ GINVWarningPrinted = true;
+ }
+
+ CallLoweringInfo.reset(new MipsCallLowering(*getTargetLowering()));
+ Legalizer.reset(new MipsLegalizerInfo(*this));
+
+ auto *RBI = new MipsRegisterBankInfo(*getRegisterInfo());
+ RegBankInfo.reset(RBI);
+ InstSelector.reset(createMipsInstructionSelector(
+ *static_cast<const MipsTargetMachine *>(&TM), *this, *RBI));
}
bool MipsSubtarget::isPositionIndependent() const {
@@ -174,7 +249,8 @@ MipsSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS,
}
bool MipsSubtarget::useConstantIslands() {
- DEBUG(dbgs() << "use constant islands " << Mips16ConstantIslands << "\n");
+ LLVM_DEBUG(dbgs() << "use constant islands " << Mips16ConstantIslands
+ << "\n");
return Mips16ConstantIslands;
}
@@ -186,3 +262,19 @@ bool MipsSubtarget::isABI_N64() const { return getABI().IsN64(); }
bool MipsSubtarget::isABI_N32() const { return getABI().IsN32(); }
bool MipsSubtarget::isABI_O32() const { return getABI().IsO32(); }
const MipsABIInfo &MipsSubtarget::getABI() const { return TM.getABI(); }
+
+const CallLowering *MipsSubtarget::getCallLowering() const {
+ return CallLoweringInfo.get();
+}
+
+const LegalizerInfo *MipsSubtarget::getLegalizerInfo() const {
+ return Legalizer.get();
+}
+
+const RegisterBankInfo *MipsSubtarget::getRegBankInfo() const {
+ return RegBankInfo.get();
+}
+
+const InstructionSelector *MipsSubtarget::getInstructionSelector() const {
+ return InstSelector.get();
+}
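Editorial note: these accessors exist so that the target-independent GlobalISel passes can pull the per-target objects out of the subtarget. A minimal sketch (not part of the patch; MF stands for some MachineFunction):

// Roughly how the generic passes reach the objects created in the constructor.
const MipsSubtarget &ST = MF.getSubtarget<MipsSubtarget>();
const CallLowering *CL = ST.getCallLowering();                 // used by IRTranslator
const LegalizerInfo *LI = ST.getLegalizerInfo();               // used by Legalizer
const RegisterBankInfo *RBI = ST.getRegBankInfo();             // used by RegBankSelect
const InstructionSelector *ISel = ST.getInstructionSelector(); // used by InstructionSelect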
diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
index 8b10b0596e0e..676d702ba63e 100644
--- a/lib/Target/Mips/MipsSubtarget.h
+++ b/lib/Target/Mips/MipsSubtarget.h
@@ -20,6 +20,10 @@
#include "MipsInstrInfo.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Support/ErrorHandling.h"
@@ -44,6 +48,21 @@ class MipsSubtarget : public MipsGenSubtargetInfo {
enum class CPU { P5600 };
+ // Used to avoid printing dsp warnings multiple times.
+ static bool DspWarningPrinted;
+
+ // Used to avoid printing msa warnings multiple times.
+ static bool MSAWarningPrinted;
+
+ // Used to avoid printing crc warnings multiple times.
+ static bool CRCWarningPrinted;
+
+ // Used to avoid printing ginv warnings multiple times.
+ static bool GINVWarningPrinted;
+
+ // Used to avoid printing virt warnings multiple times.
+ static bool VirtWarningPrinted;
+
// Mips architecture version
MipsArchEnum MipsArchVersion;
@@ -152,6 +171,19 @@ class MipsSubtarget : public MipsGenSubtargetInfo {
// HasMT -- support MT ASE.
bool HasMT;
+ // HasCRC -- supports R6 CRC ASE
+ bool HasCRC;
+
+ // HasVirt -- supports Virtualization ASE
+ bool HasVirt;
+
+ // HasGINV -- supports R6 Global INValidate ASE
+ bool HasGINV;
+
+ // Use hazard variants of the jump register instructions for indirect
+ // function calls and jump tables.
+ bool UseIndirectJumpsHazard;
+
// Disable use of the `jal` instruction.
bool UseLongCalls = false;
@@ -272,6 +304,12 @@ public:
bool disableMadd4() const { return DisableMadd4; }
bool hasEVA() const { return HasEVA; }
bool hasMT() const { return HasMT; }
+ bool hasCRC() const { return HasCRC; }
+ bool hasVirt() const { return HasVirt; }
+ bool hasGINV() const { return HasGINV; }
+ bool useIndirectJumpsHazard() const {
+ return UseIndirectJumpsHazard && hasMips32r2();
+ }
bool useSmallSection() const { return UseSmallSection; }
bool hasStandardEncoding() const { return !inMips16Mode(); }
@@ -336,6 +374,19 @@ public:
const InstrItineraryData *getInstrItineraryData() const override {
return &InstrItins;
}
+
+protected:
+ // GlobalISel related APIs.
+ std::unique_ptr<CallLowering> CallLoweringInfo;
+ std::unique_ptr<LegalizerInfo> Legalizer;
+ std::unique_ptr<RegisterBankInfo> RegBankInfo;
+ std::unique_ptr<InstructionSelector> InstSelector;
+
+public:
+ const CallLowering *getCallLowering() const override;
+ const LegalizerInfo *getLegalizerInfo() const override;
+ const RegisterBankInfo *getRegBankInfo() const override;
+ const InstructionSelector *getInstructionSelector() const override;
};
} // End llvm namespace
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index fb79a4bf40c5..1e6fe2b9f7e7 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -23,6 +23,10 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
+#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/Passes.h"
@@ -46,6 +50,12 @@ extern "C" void LLVMInitializeMipsTarget() {
RegisterTargetMachine<MipselTargetMachine> Y(getTheMipselTarget());
RegisterTargetMachine<MipsebTargetMachine> A(getTheMips64Target());
RegisterTargetMachine<MipselTargetMachine> B(getTheMips64elTarget());
+
+ PassRegistry *PR = PassRegistry::getPassRegistry();
+ initializeGlobalISel(*PR);
+ initializeMipsDelaySlotFillerPass(*PR);
+ initializeMipsBranchExpansionPass(*PR);
+ initializeMicroMipsSizeReducePass(*PR);
}
static std::string computeDataLayout(const Triple &TT, StringRef CPU,
@@ -198,7 +208,7 @@ MipsTargetMachine::getSubtargetImpl(const Function &F) const {
}
void MipsTargetMachine::resetSubtarget(MachineFunction *MF) {
- DEBUG(dbgs() << "resetSubtarget\n");
+ LLVM_DEBUG(dbgs() << "resetSubtarget\n");
Subtarget = const_cast<MipsSubtarget *>(getSubtargetImpl(MF->getFunction()));
MF->setSubtarget(Subtarget);
@@ -230,6 +240,11 @@ public:
bool addInstSelector() override;
void addPreEmitPass() override;
void addPreRegAlloc() override;
+ void addPreEmit2();
+ bool addIRTranslator() override;
+ bool addLegalizeMachineIR() override;
+ bool addRegBankSelect() override;
+ bool addGlobalInstructionSelect() override;
};
} // end anonymous namespace
@@ -262,26 +277,62 @@ void MipsPassConfig::addPreRegAlloc() {
TargetTransformInfo
MipsTargetMachine::getTargetTransformInfo(const Function &F) {
if (Subtarget->allowMixed16_32()) {
- DEBUG(errs() << "No Target Transform Info Pass Added\n");
+ LLVM_DEBUG(errs() << "No Target Transform Info Pass Added\n");
// FIXME: This is no longer necessary as the TTI returned is per-function.
return TargetTransformInfo(F.getParent()->getDataLayout());
}
- DEBUG(errs() << "Target Transform Info Pass Added\n");
+ LLVM_DEBUG(errs() << "Target Transform Info Pass Added\n");
return TargetTransformInfo(BasicTTIImpl(this, F));
}
+void MipsPassConfig::addPreEmit2() {
+}
+
// Implemented by targets that want to run passes immediately before
// machine code is emitted. return true if -print-machineinstrs should
// print out the code after the passes.
void MipsPassConfig::addPreEmitPass() {
- addPass(createMicroMipsSizeReductionPass());
+ // Expand pseudo instructions that are sensitive to register allocation.
+ addPass(createMipsExpandPseudoPass());
- // The delay slot filler and the long branch passes can potientially create
- // forbidden slot/ hazards for MIPSR6 which the hazard schedule pass will
- // fix. Any new pass must come before the hazard schedule pass.
+ // The microMIPS size reduction pass performs instruction reselection for
+ // instructions which can be remapped to a 16 bit instruction.
+ addPass(createMicroMipsSizeReducePass());
+
+ // The delay slot filler pass can potentially create forbidden slot hazards
+ // for MIPSR6 and therefore it should go before the MipsBranchExpansion pass.
addPass(createMipsDelaySlotFillerPass());
- addPass(createMipsLongBranchPass());
- addPass(createMipsHazardSchedule());
+
+ // This pass expands branches and takes care of the forbidden slot hazards.
+ // Expanding branches may potentially create forbidden slot hazards for
+ // MIPSR6, and fixing such a hazard may potentially break a branch by
+ // extending its offset out of range. That's why this pass combines these
+ // two tasks and runs them alternately until one of them finishes without
+ // making any changes. Only then can we be sure that all branches are
+ // expanded properly and no hazards exist.
+ // Any new pass should go before this pass.
+ addPass(createMipsBranchExpansion());
+
addPass(createMipsConstantIslandPass());
}
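Editorial note: the alternating scheme described in the comment above amounts to a small fixed-point loop. A sketch of its shape follows; the helper names are hypothetical and not part of the patch.

// Hypothetical helpers standing in for the two interacting tasks.
static bool expandOutOfRangeBranches(MachineFunction &MF); // may grow the code
static bool fixForbiddenSlotHazards(MachineFunction &MF);  // may push a branch out of range

static void runBranchExpansionToFixedPoint(MachineFunction &MF) {
  bool Changed;
  do {
    bool Expanded = expandOutOfRangeBranches(MF);
    bool Fixed = fixForbiddenSlotHazards(MF);
    Changed = Expanded || Fixed;
  } while (Changed);
  // When neither task changes anything, every branch is in range and every
  // forbidden-slot hazard has been resolved.
}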
+
+bool MipsPassConfig::addIRTranslator() {
+ addPass(new IRTranslator());
+ return false;
+}
+
+bool MipsPassConfig::addLegalizeMachineIR() {
+ addPass(new Legalizer());
+ return false;
+}
+
+bool MipsPassConfig::addRegBankSelect() {
+ addPass(new RegBankSelect());
+ return false;
+}
+
+bool MipsPassConfig::addGlobalInstructionSelect() {
+ addPass(new InstructionSelect());
+ return false;
+}
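Editorial note: with these four hooks in place the GlobalISel pipeline can be exercised on MIPS in the usual way, for example (an illustrative invocation; the triple, input, and flags are only an example):

  llc -mtriple=mipsel-linux-gnu -O0 -global-isel test.ll -o -

Only a small subset of IR is expected to make it through the new selector at this point; unsupported constructs either fall back to SelectionDAG or abort, depending on the -global-isel-abort setting.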
diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h
index 56e6e5d8daa2..d9b73d151119 100644
--- a/lib/Target/Mips/MipsTargetMachine.h
+++ b/lib/Target/Mips/MipsTargetMachine.h
@@ -54,7 +54,7 @@ public:
const MipsSubtarget *getSubtargetImpl(const Function &F) const override;
- /// \brief Reset the subtarget for the Mips target.
+ /// Reset the subtarget for the Mips target.
void resetSubtarget(MachineFunction *MF);
// Pass Pipeline Configuration
diff --git a/lib/Target/Mips/MipsTargetObjectFile.cpp b/lib/Target/Mips/MipsTargetObjectFile.cpp
index 9db6b7b1bcd6..f767c8321988 100644
--- a/lib/Target/Mips/MipsTargetObjectFile.cpp
+++ b/lib/Target/Mips/MipsTargetObjectFile.cpp
@@ -136,6 +136,13 @@ IsGlobalInSmallSectionImpl(const GlobalObject *GO,
return false;
Type *Ty = GVA->getValueType();
+
+ // It is possible that the type of the global is unsized, i.e. a declaration
+ // of an extern struct. In this case, don't presume it is in the small data
+ // section. This happens e.g. when building the FreeBSD kernel.
+ if (!Ty->isSized())
+ return false;
+
return IsInSmallSection(
GVA->getParent()->getDataLayout().getTypeAllocSize(Ty));
}
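Editorial note: a minimal example of the situation the new check guards against; the names are hypothetical, and the snippet simply shows C++ source that produces a global with an unsized (opaque) IR type.

// A translation unit may reference a global whose type is never completed:
struct DeviceState;                  // forward declaration only
extern DeviceState ExternalState;    // emitted as an external global of opaque type
DeviceState *externalStateAddr() {   // taking the address requires no size
  return &ExternalState;
}
// For such a global, Ty->isSized() is false, so it must not be placed in the
// small data section (its allocation size cannot even be computed).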
diff --git a/lib/Target/Mips/MipsTargetStreamer.h b/lib/Target/Mips/MipsTargetStreamer.h
index 42473aac7288..a282366f6d40 100644
--- a/lib/Target/Mips/MipsTargetStreamer.h
+++ b/lib/Target/Mips/MipsTargetStreamer.h
@@ -42,6 +42,12 @@ public:
virtual void emitDirectiveSetNoMsa();
virtual void emitDirectiveSetMt();
virtual void emitDirectiveSetNoMt();
+ virtual void emitDirectiveSetCRC();
+ virtual void emitDirectiveSetNoCRC();
+ virtual void emitDirectiveSetVirt();
+ virtual void emitDirectiveSetNoVirt();
+ virtual void emitDirectiveSetGINV();
+ virtual void emitDirectiveSetNoGINV();
virtual void emitDirectiveSetAt();
virtual void emitDirectiveSetAtWithArg(unsigned RegNo);
virtual void emitDirectiveSetNoAt();
@@ -103,6 +109,12 @@ public:
virtual void emitDirectiveSetFp(MipsABIFlagsSection::FpABIKind Value);
virtual void emitDirectiveSetOddSPReg();
virtual void emitDirectiveSetNoOddSPReg();
+ virtual void emitDirectiveModuleCRC();
+ virtual void emitDirectiveModuleNoCRC();
+ virtual void emitDirectiveModuleVirt();
+ virtual void emitDirectiveModuleNoVirt();
+ virtual void emitDirectiveModuleGINV();
+ virtual void emitDirectiveModuleNoGINV();
void emitR(unsigned Opcode, unsigned Reg0, SMLoc IDLoc,
const MCSubtargetInfo *STI);
@@ -213,6 +225,12 @@ public:
void emitDirectiveSetNoMsa() override;
void emitDirectiveSetMt() override;
void emitDirectiveSetNoMt() override;
+ void emitDirectiveSetCRC() override;
+ void emitDirectiveSetNoCRC() override;
+ void emitDirectiveSetVirt() override;
+ void emitDirectiveSetNoVirt() override;
+ void emitDirectiveSetGINV() override;
+ void emitDirectiveSetNoGINV() override;
void emitDirectiveSetAt() override;
void emitDirectiveSetAtWithArg(unsigned RegNo) override;
void emitDirectiveSetNoAt() override;
@@ -278,6 +296,12 @@ public:
void emitDirectiveModuleSoftFloat() override;
void emitDirectiveModuleHardFloat() override;
void emitDirectiveModuleMT() override;
+ void emitDirectiveModuleCRC() override;
+ void emitDirectiveModuleNoCRC() override;
+ void emitDirectiveModuleVirt() override;
+ void emitDirectiveModuleNoVirt() override;
+ void emitDirectiveModuleGINV() override;
+ void emitDirectiveModuleNoGINV() override;
void emitDirectiveSetFp(MipsABIFlagsSection::FpABIKind Value) override;
void emitDirectiveSetOddSPReg() override;
void emitDirectiveSetNoOddSPReg() override;
diff --git a/lib/Target/NVPTX/CMakeLists.txt b/lib/Target/NVPTX/CMakeLists.txt
index a8eecfcc138c..4a64fe0961e5 100644
--- a/lib/Target/NVPTX/CMakeLists.txt
+++ b/lib/Target/NVPTX/CMakeLists.txt
@@ -1,11 +1,11 @@
set(LLVM_TARGET_DEFINITIONS NVPTX.td)
-
-tablegen(LLVM NVPTXGenRegisterInfo.inc -gen-register-info)
-tablegen(LLVM NVPTXGenInstrInfo.inc -gen-instr-info)
tablegen(LLVM NVPTXGenAsmWriter.inc -gen-asm-writer)
tablegen(LLVM NVPTXGenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM NVPTXGenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM NVPTXGenRegisterInfo.inc -gen-register-info)
tablegen(LLVM NVPTXGenSubtargetInfo.inc -gen-subtarget)
+
add_public_tablegen_target(NVPTXCommonTableGen)
set(NVPTXCodeGen_sources
@@ -36,6 +36,6 @@ set(NVPTXCodeGen_sources
add_llvm_target(NVPTXCodeGen ${NVPTXCodeGen_sources})
-add_subdirectory(TargetInfo)
add_subdirectory(InstPrinter)
add_subdirectory(MCTargetDesc)
+add_subdirectory(TargetInfo)
diff --git a/lib/Target/NVPTX/MCTargetDesc/CMakeLists.txt b/lib/Target/NVPTX/MCTargetDesc/CMakeLists.txt
index dbbf23554aa9..9b4eadd59a31 100644
--- a/lib/Target/NVPTX/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/NVPTX/MCTargetDesc/CMakeLists.txt
@@ -1,4 +1,5 @@
add_llvm_library(LLVMNVPTXDesc
NVPTXMCAsmInfo.cpp
NVPTXMCTargetDesc.cpp
+ NVPTXTargetStreamer.cpp
)
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
index bdd0f156c8af..f6cbd23f01c4 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
@@ -13,16 +13,9 @@
#include "NVPTXMCAsmInfo.h"
#include "llvm/ADT/Triple.h"
-#include "llvm/Support/CommandLine.h"
using namespace llvm;
-// -debug-compile - Command line option to inform opt and llc passes to
-// compile for debugging
-static cl::opt<bool> CompileForDebugging("debug-compile",
- cl::desc("Compile for debugging"),
- cl::Hidden, cl::init(false));
-
void NVPTXMCAsmInfo::anchor() {}
NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Triple &TheTriple) {
@@ -37,7 +30,7 @@ NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Triple &TheTriple) {
InlineAsmStart = " begin inline asm";
InlineAsmEnd = " end inline asm";
- SupportsDebugInformation = CompileForDebugging;
+ SupportsDebugInformation = true;
// PTX does not allow .align on functions.
HasFunctionAlignment = false;
HasDotTypeDotSizeDirective = false;
@@ -45,13 +38,16 @@ NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Triple &TheTriple) {
HiddenDeclarationVisibilityAttr = HiddenVisibilityAttr = MCSA_Invalid;
ProtectedVisibilityAttr = MCSA_Invalid;
- Data8bitsDirective = " .b8 ";
- Data16bitsDirective = " .b16 ";
- Data32bitsDirective = " .b32 ";
- Data64bitsDirective = " .b64 ";
- ZeroDirective = " .b8";
- AsciiDirective = " .b8";
- AscizDirective = " .b8";
+ // FIXME: remove comment once debug info is properly supported.
+ Data8bitsDirective = "// .b8 ";
+ Data16bitsDirective = nullptr; // not supported
+ Data32bitsDirective = "// .b32 ";
+ Data64bitsDirective = "// .b64 ";
+ ZeroDirective = "// .b8";
+ AsciiDirective = nullptr; // not supported
+ AscizDirective = nullptr; // not supported
+ SupportsQuotedNames = false;
+ SupportsExtendedDwarfLocDirective = false;
// @TODO: Can we just disable this?
WeakDirective = "\t// .weak\t";
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
index 9ac3c8850f75..9fd7600cf67f 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
@@ -25,6 +25,17 @@ class NVPTXMCAsmInfo : public MCAsmInfo {
public:
explicit NVPTXMCAsmInfo(const Triple &TheTriple);
+
+ /// Return true if the .section directive should be omitted when
+ /// emitting \p SectionName. For example:
+ ///
+ /// shouldOmitSectionDirective(".text")
+ ///
+ /// returns false => .section .text,#alloc,#execinstr
+ /// returns true => .text
+ bool shouldOmitSectionDirective(StringRef SectionName) const override {
+ return true;
+ }
};
} // namespace llvm
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
index 12f992749366..b1a77a17ec15 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
@@ -11,9 +11,10 @@
//
//===----------------------------------------------------------------------===//
-#include "NVPTXMCTargetDesc.h"
#include "InstPrinter/NVPTXInstPrinter.h"
#include "NVPTXMCAsmInfo.h"
+#include "NVPTXMCTargetDesc.h"
+#include "NVPTXTargetStreamer.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -58,6 +59,12 @@ static MCInstPrinter *createNVPTXMCInstPrinter(const Triple &T,
return nullptr;
}
+static MCTargetStreamer *createTargetAsmStreamer(MCStreamer &S,
+ formatted_raw_ostream &,
+ MCInstPrinter *, bool) {
+ return new NVPTXTargetStreamer(S);
+}
+
// Force static initialization.
extern "C" void LLVMInitializeNVPTXTargetMC() {
for (Target *T : {&getTheNVPTXTarget32(), &getTheNVPTXTarget64()}) {
@@ -75,5 +82,8 @@ extern "C" void LLVMInitializeNVPTXTargetMC() {
// Register the MCInstPrinter.
TargetRegistry::RegisterMCInstPrinter(*T, createNVPTXMCInstPrinter);
+
+ // Register the MCTargetStreamer.
+ TargetRegistry::RegisterAsmTargetStreamer(*T, createTargetAsmStreamer);
}
}
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
new file mode 100644
index 000000000000..aeb90eca3a05
--- /dev/null
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
@@ -0,0 +1,94 @@
+//=====- NVPTXTargetStreamer.cpp - NVPTXTargetStreamer class ------------=====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the NVPTXTargetStreamer class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXTargetStreamer.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+
+using namespace llvm;
+
+//
+// NVPTXTargetStreamer Implementation
+//
+NVPTXTargetStreamer::NVPTXTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+
+NVPTXTargetStreamer::~NVPTXTargetStreamer() = default;
+
+void NVPTXTargetStreamer::emitDwarfFileDirective(StringRef Directive) {
+ DwarfFiles.emplace_back(Directive);
+}
+
+static bool isDwarfSection(const MCObjectFileInfo *FI,
+ const MCSection *Section) {
+ // FIXME: the checks for the DWARF sections are very fragile and should be
+ // fixed up in a followup patch.
+ if (!Section || Section->getKind().isText() ||
+ Section->getKind().isWriteable())
+ return false;
+ return Section == FI->getDwarfAbbrevSection() ||
+ Section == FI->getDwarfInfoSection() ||
+ Section == FI->getDwarfMacinfoSection() ||
+ Section == FI->getDwarfFrameSection() ||
+ Section == FI->getDwarfAddrSection() ||
+ Section == FI->getDwarfRangesSection() ||
+ Section == FI->getDwarfARangesSection() ||
+ Section == FI->getDwarfLocSection() ||
+ Section == FI->getDwarfStrSection() ||
+ Section == FI->getDwarfLineSection() ||
+ Section == FI->getDwarfStrOffSection() ||
+ Section == FI->getDwarfLineStrSection() ||
+ Section == FI->getDwarfPubNamesSection() ||
+ Section == FI->getDwarfPubTypesSection() ||
+ Section == FI->getDwarfSwiftASTSection() ||
+ Section == FI->getDwarfTypesDWOSection() ||
+ Section == FI->getDwarfAbbrevDWOSection() ||
+ Section == FI->getDwarfAccelObjCSection() ||
+ Section == FI->getDwarfAccelNamesSection() ||
+ Section == FI->getDwarfAccelTypesSection() ||
+ Section == FI->getDwarfAccelNamespaceSection() ||
+ Section == FI->getDwarfLocDWOSection() ||
+ Section == FI->getDwarfStrDWOSection() ||
+ Section == FI->getDwarfCUIndexSection() ||
+ Section == FI->getDwarfInfoDWOSection() ||
+ Section == FI->getDwarfLineDWOSection() ||
+ Section == FI->getDwarfTUIndexSection() ||
+ Section == FI->getDwarfStrOffDWOSection() ||
+ Section == FI->getDwarfDebugNamesSection() ||
+ Section == FI->getDwarfDebugInlineSection() ||
+ Section == FI->getDwarfGnuPubNamesSection() ||
+ Section == FI->getDwarfGnuPubTypesSection();
+}
+
+void NVPTXTargetStreamer::changeSection(const MCSection *CurSection,
+ MCSection *Section,
+ const MCExpr *SubSection,
+ raw_ostream &OS) {
+ assert(!SubSection && "SubSection is not null!");
+ const MCObjectFileInfo *FI = getStreamer().getContext().getObjectFileInfo();
+ // FIXME: remove comment once debug info is properly supported.
+ // Emit closing brace for DWARF sections only.
+ if (isDwarfSection(FI, CurSection))
+ OS << "//\t}\n";
+ if (isDwarfSection(FI, Section)) {
+ // Emit DWARF .file directives in the outermost scope.
+ for (const std::string &S : DwarfFiles)
+ getStreamer().EmitRawText(S.data());
+ DwarfFiles.clear();
+ OS << "//\t.section";
+ Section->PrintSwitchToSection(*getStreamer().getContext().getAsmInfo(),
+ FI->getTargetTriple(), OS, SubSection);
+ // DWARF sections are enclosed into braces - emit the open one.
+ OS << "//\t{\n";
+ }
+}
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h
new file mode 100644
index 000000000000..30831ab8bbeb
--- /dev/null
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h
@@ -0,0 +1,46 @@
+//=====-- NVPTXTargetStreamer.h - NVPTX Target Streamer ------*- C++ -*--=====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_MCTARGETDESC_NVPTXTARGETSTREAMER_H
+#define LLVM_LIB_TARGET_NVPTX_MCTARGETDESC_NVPTXTARGETSTREAMER_H
+
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+class MCSection;
+
+/// Implements the NVPTX-specific streamer.
+class NVPTXTargetStreamer : public MCTargetStreamer {
+private:
+ SmallVector<std::string, 4> DwarfFiles;
+
+public:
+ NVPTXTargetStreamer(MCStreamer &S);
+ ~NVPTXTargetStreamer() override;
+
+ /// Record DWARF file directives for later output.
+ /// According to PTX ISA, CUDA Toolkit documentation, 11.5.3. Debugging
+ /// Directives: .file
+ /// (http://docs.nvidia.com/cuda/parallel-thread-execution/index.html#debugging-directives-file),
+ /// The .file directive is allowed only in the outermost scope, i.e., at the
+ /// same level as kernel and device function declarations. Also, the order of
+ /// the .loc and .file directive does not matter, .file directives may follow
+ /// the .loc directives where the file is referenced.
+ /// LLVM emits .file directives immediately the location debug info is
+ /// emitted, i.e. they may be emitted inside functions. We gather all these
+ /// directives and emit them outside of the sections and, thus, outside of the
+ /// functions.
+ void emitDwarfFileDirective(StringRef Directive) override;
+ void changeSection(const MCSection *CurSection, MCSection *Section,
+ const MCExpr *SubSection, raw_ostream &OS) override;
+};
+
+} // end namespace llvm
+
+#endif
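Aside (not from the patch): the header above documents a gather-and-flush scheme for .file directives, which must end up at the outermost scope even though LLVM produces them inside functions. A standalone C++ sketch of that pattern, with invented names, is:

    // All identifiers here are invented for illustration.
    #include <functional>
    #include <string>
    #include <vector>

    class FileDirectiveBuffer {
      std::vector<std::string> Pending; // plays the role of DwarfFiles
    public:
      // Called whenever a ".file" directive is produced, possibly inside a function.
      void record(std::string Directive) { Pending.push_back(std::move(Directive)); }

      // Called on a section change: emit everything recorded so far at the
      // outermost scope, then forget it.
      void flush(const std::function<void(const std::string &)> &Emit) {
        for (const std::string &D : Pending)
          Emit(D);
        Pending.clear();
      }
    };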
diff --git a/lib/Target/NVPTX/NVPTX.td b/lib/Target/NVPTX/NVPTX.td
index aba37d363591..6494c46f54ab 100644
--- a/lib/Target/NVPTX/NVPTX.td
+++ b/lib/Target/NVPTX/NVPTX.td
@@ -52,9 +52,8 @@ def SM62 : SubtargetFeature<"sm_62", "SmVersion", "62",
"Target SM 6.2">;
def SM70 : SubtargetFeature<"sm_70", "SmVersion", "70",
"Target SM 7.0">;
-
-def SATOM : SubtargetFeature<"satom", "HasAtomScope", "true",
- "Atomic operations with scope">;
+def SM72 : SubtargetFeature<"sm_72", "SmVersion", "72",
+ "Target SM 7.2">;
// PTX Versions
def PTX32 : SubtargetFeature<"ptx32", "PTXVersion", "32",
@@ -71,6 +70,8 @@ def PTX50 : SubtargetFeature<"ptx50", "PTXVersion", "50",
"Use PTX version 5.0">;
def PTX60 : SubtargetFeature<"ptx60", "PTXVersion", "60",
"Use PTX version 6.0">;
+def PTX61 : SubtargetFeature<"ptx61", "PTXVersion", "61",
+ "Use PTX version 6.1">;
//===----------------------------------------------------------------------===//
// NVPTX supported processors.
@@ -88,10 +89,11 @@ def : Proc<"sm_37", [SM37, PTX41]>;
def : Proc<"sm_50", [SM50, PTX40]>;
def : Proc<"sm_52", [SM52, PTX41]>;
def : Proc<"sm_53", [SM53, PTX42]>;
-def : Proc<"sm_60", [SM60, PTX50, SATOM]>;
-def : Proc<"sm_61", [SM61, PTX50, SATOM]>;
-def : Proc<"sm_62", [SM62, PTX50, SATOM]>;
-def : Proc<"sm_70", [SM70, PTX60, SATOM]>;
+def : Proc<"sm_60", [SM60, PTX50]>;
+def : Proc<"sm_61", [SM61, PTX50]>;
+def : Proc<"sm_62", [SM62, PTX50]>;
+def : Proc<"sm_70", [SM70, PTX60]>;
+def : Proc<"sm_72", [SM72, PTX61]>;
def NVPTXInstrInfo : InstrInfo {
}
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 753cfff4cdae..a966b9928400 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -44,9 +44,7 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/TargetLowering.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
@@ -75,16 +73,17 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
#include <cassert>
#include <cstdint>
#include <cstring>
#include <new>
-#include <sstream>
#include <string>
#include <utility>
#include <vector>
@@ -93,16 +92,6 @@ using namespace llvm;
#define DEPOTNAME "__local_depot"
-static cl::opt<bool>
-EmitLineNumbers("nvptx-emit-line-numbers", cl::Hidden,
- cl::desc("NVPTX Specific: Emit Line numbers even without -G"),
- cl::init(true));
-
-static cl::opt<bool>
-InterleaveSrc("nvptx-emit-src", cl::ZeroOrMore, cl::Hidden,
- cl::desc("NVPTX Specific: Emit source line in ptx file"),
- cl::init(false));
-
/// DiscoverDependentGlobals - Return a set of GlobalVariables on which \p V
/// depends.
static void
@@ -151,56 +140,7 @@ VisitGlobalVariableForEmission(const GlobalVariable *GV,
Visiting.erase(GV);
}
-void NVPTXAsmPrinter::emitLineNumberAsDotLoc(const MachineInstr &MI) {
- if (!EmitLineNumbers)
- return;
- if (ignoreLoc(MI))
- return;
-
- const DebugLoc &curLoc = MI.getDebugLoc();
-
- if (!prevDebugLoc && !curLoc)
- return;
-
- if (prevDebugLoc == curLoc)
- return;
-
- prevDebugLoc = curLoc;
-
- if (!curLoc)
- return;
-
- auto *Scope = cast_or_null<DIScope>(curLoc.getScope());
- if (!Scope)
- return;
-
- StringRef fileName(Scope->getFilename());
- StringRef dirName(Scope->getDirectory());
- SmallString<128> FullPathName = dirName;
- if (!dirName.empty() && !sys::path::is_absolute(fileName)) {
- sys::path::append(FullPathName, fileName);
- fileName = FullPathName;
- }
-
- if (filenameMap.find(fileName) == filenameMap.end())
- return;
-
- // Emit the line from the source file.
- if (InterleaveSrc)
- this->emitSrcInText(fileName, curLoc.getLine());
-
- std::stringstream temp;
- temp << "\t.loc " << filenameMap[fileName] << " " << curLoc.getLine()
- << " " << curLoc.getCol();
- OutStreamer->EmitRawText(temp.str());
-}
-
void NVPTXAsmPrinter::EmitInstruction(const MachineInstr *MI) {
- SmallString<128> Str;
- raw_svector_ostream OS(Str);
- if (static_cast<NVPTXTargetMachine &>(TM).getDrvInterface() == NVPTX::CUDA)
- emitLineNumberAsDotLoc(*MI);
-
MCInst Inst;
lowerToMCInst(MI, Inst);
EmitToStreamer(*OutStreamer, Inst);
@@ -505,7 +445,7 @@ void NVPTXAsmPrinter::EmitFunctionEntryLabel() {
emitGlobals(*MF->getFunction().getParent());
GlobalsEmitted = true;
}
-
+
// Set up
MRI = &MF->getRegInfo();
F = &MF->getFunction();
@@ -526,14 +466,25 @@ void NVPTXAsmPrinter::EmitFunctionEntryLabel() {
OutStreamer->EmitRawText(O.str());
- prevDebugLoc = DebugLoc();
-}
-
-void NVPTXAsmPrinter::EmitFunctionBodyStart() {
VRegMapping.clear();
+ // Emit the opening brace for the function body.
OutStreamer->EmitRawText(StringRef("{\n"));
setAndEmitFunctionVirtualRegisters(*MF);
+}
+bool NVPTXAsmPrinter::runOnMachineFunction(MachineFunction &F) {
+ nvptxSubtarget = &F.getSubtarget<NVPTXSubtarget>();
+ bool Result = AsmPrinter::runOnMachineFunction(F);
+ // Emit the closing brace for the body of function F.
+ // It must be emitted here because additional debug labels/data may have to
+ // be emitted after the last basic block, and there is no later hook that
+ // runs once the function body has been emitted.
+ OutStreamer->EmitRawText(StringRef("}\n"));
+ return Result;
+}
+
+void NVPTXAsmPrinter::EmitFunctionBodyStart() {
SmallString<128> Str;
raw_svector_ostream O(Str);
emitDemotedVars(&MF->getFunction(), O);
@@ -541,10 +492,15 @@ void NVPTXAsmPrinter::EmitFunctionBodyStart() {
}
void NVPTXAsmPrinter::EmitFunctionBodyEnd() {
- OutStreamer->EmitRawText(StringRef("}\n"));
VRegMapping.clear();
}
+const MCSymbol *NVPTXAsmPrinter::getFunctionFrameSymbol() const {
+ SmallString<128> Str;
+ raw_svector_ostream(Str) << DEPOTNAME << getFunctionNumber();
+ return OutContext.getOrCreateSymbol(Str);
+}
+
void NVPTXAsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
unsigned RegNo = MI->getOperand(0).getReg();
if (TargetRegisterInfo::isVirtualRegister(RegNo)) {
@@ -818,42 +774,6 @@ void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) {
}
}
-void NVPTXAsmPrinter::recordAndEmitFilenames(Module &M) {
- DebugInfoFinder DbgFinder;
- DbgFinder.processModule(M);
-
- unsigned i = 1;
- for (const DICompileUnit *DIUnit : DbgFinder.compile_units()) {
- StringRef Filename = DIUnit->getFilename();
- StringRef Dirname = DIUnit->getDirectory();
- SmallString<128> FullPathName = Dirname;
- if (!Dirname.empty() && !sys::path::is_absolute(Filename)) {
- sys::path::append(FullPathName, Filename);
- Filename = FullPathName;
- }
- if (filenameMap.find(Filename) != filenameMap.end())
- continue;
- filenameMap[Filename] = i;
- OutStreamer->EmitDwarfFileDirective(i, "", Filename);
- ++i;
- }
-
- for (DISubprogram *SP : DbgFinder.subprograms()) {
- StringRef Filename = SP->getFilename();
- StringRef Dirname = SP->getDirectory();
- SmallString<128> FullPathName = Dirname;
- if (!Dirname.empty() && !sys::path::is_absolute(Filename)) {
- sys::path::append(FullPathName, Filename);
- Filename = FullPathName;
- }
- if (filenameMap.find(Filename) != filenameMap.end())
- continue;
- filenameMap[Filename] = i;
- OutStreamer->EmitDwarfFileDirective(i, "", Filename);
- ++i;
- }
-}
-
static bool isEmptyXXStructor(GlobalVariable *GV) {
if (!GV) return true;
const ConstantArray *InitList = dyn_cast<ConstantArray>(GV->getInitializer());
@@ -889,24 +809,13 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) {
SmallString<128> Str1;
raw_svector_ostream OS1(Str1);
- MMI = getAnalysisIfAvailable<MachineModuleInfo>();
-
// We need to call the parent's one explicitly.
- //bool Result = AsmPrinter::doInitialization(M);
-
- // Initialize TargetLoweringObjectFile since we didn't do in
- // AsmPrinter::doInitialization either right above or where it's commented out
- // below.
- const_cast<TargetLoweringObjectFile &>(getObjFileLowering())
- .Initialize(OutContext, TM);
+ bool Result = AsmPrinter::doInitialization(M);
// Emit header before any dwarf directives are emitted below.
emitHeader(M, OS1, STI);
OutStreamer->EmitRawText(OS1.str());
- // Already commented out
- //bool Result = AsmPrinter::doInitialization(M);
-
// Emit module-level inline asm if it exists.
if (!M.getModuleInlineAsm().empty()) {
OutStreamer->AddComment("Start of file scope inline assembly");
@@ -917,13 +826,9 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) {
OutStreamer->AddBlankLine();
}
- // If we're not NVCL we're CUDA, go ahead and emit filenames.
- if (TM.getTargetTriple().getOS() != Triple::NVCL)
- recordAndEmitFilenames(M);
-
GlobalsEmitted = false;
-
- return false; // success
+
+ return Result;
}
void NVPTXAsmPrinter::emitGlobals(const Module &M) {
@@ -974,13 +879,10 @@ void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O,
const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
if (NTM.getDrvInterface() == NVPTX::NVCL)
O << ", texmode_independent";
- else {
- if (!STI.hasDouble())
- O << ", map_f64_to_f32";
- }
- if (MAI->doesSupportDebugInformation())
- O << ", debug";
+ // FIXME: remove comment once debug info is properly supported.
+ if (MMI && MMI->hasDebugInfo())
+ O << "//, debug";
O << "\n";
@@ -995,6 +897,8 @@ void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O,
}
bool NVPTXAsmPrinter::doFinalization(Module &M) {
+ bool HasDebugInfo = MMI && MMI->hasDebugInfo();
+
// If we did not emit any functions, then the global declarations have not
// yet been emitted.
if (!GlobalsEmitted) {
@@ -1029,6 +933,11 @@ bool NVPTXAsmPrinter::doFinalization(Module &M) {
clearAnnotationCache(&M);
delete[] gv_array;
+ // FIXME: remove comment once debug info is properly supported.
+ // Close the last emitted section
+ if (HasDebugInfo)
+ OutStreamer->EmitRawText("//\t}");
+
return ret;
//bool Result = AsmPrinter::doFinalization(M);
@@ -1365,7 +1274,8 @@ void NVPTXAsmPrinter::emitPTXAddressSpace(unsigned int AddressSpace,
O << "shared";
break;
default:
- report_fatal_error("Bad address space found while emitting PTX");
+ report_fatal_error("Bad address space found while emitting PTX: " +
+ llvm::Twine(AddressSpace));
break;
}
}
@@ -1433,7 +1343,7 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
return;
}
- if (ETy->isFloatingPointTy() || ETy->isIntegerTy() || ETy->isPointerTy()) {
+ if (ETy->isFloatingPointTy() || ETy->isIntOrPtrTy()) {
O << " .";
O << getPTXFundamentalTypeStr(ETy);
O << " ";
@@ -1948,11 +1858,17 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
llvm_unreachable("unsupported integer const type");
break;
}
+ case Type::HalfTyID:
case Type::FloatTyID:
case Type::DoubleTyID: {
const ConstantFP *CFP = dyn_cast<ConstantFP>(CPV);
Type *Ty = CFP->getType();
- if (Ty == Type::getFloatTy(CPV->getContext())) {
+ if (Ty == Type::getHalfTy(CPV->getContext())) {
+ APInt API = CFP->getValueAPF().bitcastToAPInt();
+ uint16_t float16 = API.getLoBits(16).getZExtValue();
+ ConvertIntToBytes<>(ptr, float16);
+ aggBuffer->addBytes(ptr, 2, Bytes);
+ } else if (Ty == Type::getFloatTy(CPV->getContext())) {
float float32 = (float) CFP->getValueAPF().convertToFloat();
ConvertFloatToBytes(ptr, float32);
aggBuffer->addBytes(ptr, 4, Bytes);
@@ -2049,65 +1965,6 @@ void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV,
llvm_unreachable("unsupported constant type in printAggregateConstant()");
}
-// buildTypeNameMap - Run through symbol table looking for type names.
-//
-
-bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) {
- switch (MI.getOpcode()) {
- default:
- return false;
- case NVPTX::CallArgBeginInst:
- case NVPTX::CallArgEndInst0:
- case NVPTX::CallArgEndInst1:
- case NVPTX::CallArgF32:
- case NVPTX::CallArgF64:
- case NVPTX::CallArgI16:
- case NVPTX::CallArgI32:
- case NVPTX::CallArgI32imm:
- case NVPTX::CallArgI64:
- case NVPTX::CallArgParam:
- case NVPTX::CallVoidInst:
- case NVPTX::CallVoidInstReg:
- case NVPTX::Callseq_End:
- case NVPTX::CallVoidInstReg64:
- case NVPTX::DeclareParamInst:
- case NVPTX::DeclareRetMemInst:
- case NVPTX::DeclareRetRegInst:
- case NVPTX::DeclareRetScalarInst:
- case NVPTX::DeclareScalarParamInst:
- case NVPTX::DeclareScalarRegInst:
- case NVPTX::StoreParamF32:
- case NVPTX::StoreParamF64:
- case NVPTX::StoreParamI16:
- case NVPTX::StoreParamI32:
- case NVPTX::StoreParamI64:
- case NVPTX::StoreParamI8:
- case NVPTX::StoreRetvalF32:
- case NVPTX::StoreRetvalF64:
- case NVPTX::StoreRetvalI16:
- case NVPTX::StoreRetvalI32:
- case NVPTX::StoreRetvalI64:
- case NVPTX::StoreRetvalI8:
- case NVPTX::LastCallArgF32:
- case NVPTX::LastCallArgF64:
- case NVPTX::LastCallArgI16:
- case NVPTX::LastCallArgI32:
- case NVPTX::LastCallArgI32imm:
- case NVPTX::LastCallArgI64:
- case NVPTX::LastCallArgParam:
- case NVPTX::LoadParamMemF32:
- case NVPTX::LoadParamMemF64:
- case NVPTX::LoadParamMemI16:
- case NVPTX::LoadParamMemI32:
- case NVPTX::LoadParamMemI64:
- case NVPTX::LoadParamMemI8:
- case NVPTX::PrototypeInst:
- case NVPTX::DBG_VALUE:
- return true;
- }
- return false;
-}
-
/// lowerConstantForGV - Return an MCExpr for the given Constant. This is mostly
/// a copy from AsmPrinter::lowerConstant, except customized to only handle
/// expressions that are representable in PTX and create
@@ -2408,44 +2265,6 @@ void NVPTXAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum,
}
}
-void NVPTXAsmPrinter::emitSrcInText(StringRef filename, unsigned line) {
- std::stringstream temp;
- LineReader *reader = this->getReader(filename);
- temp << "\n//";
- temp << filename.str();
- temp << ":";
- temp << line;
- temp << " ";
- temp << reader->readLine(line);
- temp << "\n";
- this->OutStreamer->EmitRawText(temp.str());
-}
-
-LineReader *NVPTXAsmPrinter::getReader(const std::string &filename) {
- if (!reader) {
- reader = new LineReader(filename);
- }
-
- if (reader->fileName() != filename) {
- delete reader;
- reader = new LineReader(filename);
- }
-
- return reader;
-}
-
-std::string LineReader::readLine(unsigned lineNum) {
- if (lineNum < theCurLine) {
- theCurLine = 0;
- fstr.seekg(0, std::ios::beg);
- }
- while (theCurLine < lineNum) {
- fstr.getline(buff, 500);
- theCurLine++;
- }
- return buff;
-}
-
// Force static initialization.
extern "C" void LLVMInitializeNVPTXAsmPrinter() {
RegisterAsmPrinter<NVPTXAsmPrinter> X(getTheNVPTXTarget32());
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h
index 8ec3476b8719..3b042c74b26c 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -41,7 +41,6 @@
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
-#include <fstream>
#include <map>
#include <memory>
#include <string>
@@ -60,27 +59,6 @@ namespace llvm {
class MCOperand;
-class LineReader {
-private:
- unsigned theCurLine;
- std::ifstream fstr;
- char buff[512];
- std::string theFileName;
- SmallVector<unsigned, 32> lineOffset;
-
-public:
- LineReader(std::string filename) {
- theCurLine = 0;
- fstr.open(filename.c_str());
- theFileName = filename;
- }
-
- ~LineReader() { fstr.close(); }
-
- std::string fileName() { return theFileName; }
- std::string readLine(unsigned line);
-};
-
class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
class AggBuffer {
@@ -217,8 +195,6 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
friend class AggBuffer;
- void emitSrcInText(StringRef filename, unsigned line);
-
private:
StringRef getPassName() const override { return "NVPTX Assembly Printer"; }
@@ -271,8 +247,6 @@ protected:
bool doFinalization(Module &M) override;
private:
- std::string CurrentBankselLabelInBasicBlock;
-
bool GlobalsEmitted;
// This is specific per MachineFunction.
@@ -287,17 +261,9 @@ private:
// Cache the subtarget here.
const NVPTXSubtarget *nvptxSubtarget;
- // Build the map between type name and ID based on module's type
- // symbol table.
- std::map<Type *, std::string> TypeNameMap;
-
// List of variables demoted to a function scope.
std::map<const Function *, std::vector<const GlobalVariable *>> localDecls;
- // To record filename to ID mapping
- std::map<std::string, unsigned> filenameMap;
- void recordAndEmitFilenames(Module &);
-
void emitPTXGlobalVariable(const GlobalVariable *GVar, raw_ostream &O);
void emitPTXAddressSpace(unsigned int AddressSpace, raw_ostream &O) const;
std::string getPTXFundamentalTypeStr(Type *Ty, bool = true) const;
@@ -317,10 +283,6 @@ private:
bool isLoopHeaderOfNoUnroll(const MachineBasicBlock &MBB) const;
- LineReader *reader = nullptr;
-
- LineReader *getReader(const std::string &);
-
// Used to control the need to emit .generic() in the initializer of
// module scope variables.
// Although ptx supports the hybrid mode like the following,
@@ -340,26 +302,16 @@ public:
EmitGeneric(static_cast<NVPTXTargetMachine &>(TM).getDrvInterface() ==
NVPTX::CUDA) {}
- ~NVPTXAsmPrinter() override {
- delete reader;
- }
-
- bool runOnMachineFunction(MachineFunction &F) override {
- nvptxSubtarget = &F.getSubtarget<NVPTXSubtarget>();
- return AsmPrinter::runOnMachineFunction(F);
- }
+ bool runOnMachineFunction(MachineFunction &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineLoopInfo>();
AsmPrinter::getAnalysisUsage(AU);
}
- bool ignoreLoc(const MachineInstr &);
-
std::string getVirtualRegisterName(unsigned) const;
- DebugLoc prevDebugLoc;
- void emitLineNumberAsDotLoc(const MachineInstr &);
+ const MCSymbol *getFunctionFrameSymbol() const override;
};
} // end namespace llvm
diff --git a/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp b/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
index f02c33f9249a..41e9ae827180 100644
--- a/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
+++ b/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
@@ -28,7 +28,7 @@
using namespace llvm;
namespace {
-/// \brief NVPTXAssignValidGlobalNames
+/// NVPTXAssignValidGlobalNames
class NVPTXAssignValidGlobalNames : public ModulePass {
public:
static char ID;
@@ -36,7 +36,7 @@ public:
bool runOnModule(Module &M) override;
- /// \brief Clean up the name to remove symbols invalid in PTX.
+ /// Clean up the name to remove symbols invalid in PTX.
std::string cleanUpName(StringRef Name);
};
}
diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
index 729f3ed7b79e..e5e6637967b2 100644
--- a/lib/Target/NVPTX/NVPTXFrameLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -64,6 +64,14 @@ void NVPTXFrameLowering::emitPrologue(MachineFunction &MF,
}
}
+int NVPTXFrameLowering::getFrameIndexReference(const MachineFunction &MF,
+ int FI,
+ unsigned &FrameReg) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ FrameReg = NVPTX::VRDepot;
+ return MFI.getObjectOffset(FI) - getOffsetOfLocalArea();
+}
+
void NVPTXFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {}
diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.h b/lib/Target/NVPTX/NVPTXFrameLowering.h
index a802cf85d2e0..0a7856b9d5de 100644
--- a/lib/Target/NVPTX/NVPTXFrameLowering.h
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.h
@@ -25,6 +25,8 @@ public:
bool hasFP(const MachineFunction &MF) const override;
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ int getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const override;
MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
diff --git a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
index 916b0e115664..fd63fdbaced6 100644
--- a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
+++ b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
@@ -45,8 +45,6 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {}
private:
- Value *getOrInsertCVTA(Module *M, Function *F, GlobalVariable *GV,
- IRBuilder<> &Builder);
Value *remapConstant(Module *M, Function *F, Constant *C,
IRBuilder<> &Builder);
Value *remapConstantVectorOrConstantAggregate(Module *M, Function *F,
@@ -156,46 +154,6 @@ bool GenericToNVVM::runOnModule(Module &M) {
return true;
}
-Value *GenericToNVVM::getOrInsertCVTA(Module *M, Function *F,
- GlobalVariable *GV,
- IRBuilder<> &Builder) {
- PointerType *GVType = GV->getType();
- Value *CVTA = nullptr;
-
- // See if the address space conversion requires the operand to be bitcast
- // to i8 addrspace(n)* first.
- EVT ExtendedGVType = EVT::getEVT(GV->getValueType(), true);
- if (!ExtendedGVType.isInteger() && !ExtendedGVType.isFloatingPoint()) {
- // A bitcast to i8 addrspace(n)* on the operand is needed.
- LLVMContext &Context = M->getContext();
- unsigned int AddrSpace = GVType->getAddressSpace();
- Type *DestTy = PointerType::get(Type::getInt8Ty(Context), AddrSpace);
- CVTA = Builder.CreateBitCast(GV, DestTy, "cvta");
- // Insert the address space conversion.
- Type *ResultType =
- PointerType::get(Type::getInt8Ty(Context), llvm::ADDRESS_SPACE_GENERIC);
- Function *CVTAFunction = Intrinsic::getDeclaration(
- M, Intrinsic::nvvm_ptr_global_to_gen, {ResultType, DestTy});
- CVTA = Builder.CreateCall(CVTAFunction, CVTA, "cvta");
- // Another bitcast from i8 * to <the element type of GVType> * is
- // required.
- DestTy =
- PointerType::get(GV->getValueType(), llvm::ADDRESS_SPACE_GENERIC);
- CVTA = Builder.CreateBitCast(CVTA, DestTy, "cvta");
- } else {
- // A simple CVTA is enough.
- SmallVector<Type *, 2> ParamTypes;
- ParamTypes.push_back(PointerType::get(GV->getValueType(),
- llvm::ADDRESS_SPACE_GENERIC));
- ParamTypes.push_back(GVType);
- Function *CVTAFunction = Intrinsic::getDeclaration(
- M, Intrinsic::nvvm_ptr_global_to_gen, ParamTypes);
- CVTA = Builder.CreateCall(CVTAFunction, GV, "cvta");
- }
-
- return CVTA;
-}
-
Value *GenericToNVVM::remapConstant(Module *M, Function *F, Constant *C,
IRBuilder<> &Builder) {
// If the constant C has been converted already in the given function F, just
@@ -207,17 +165,17 @@ Value *GenericToNVVM::remapConstant(Module *M, Function *F, Constant *C,
Value *NewValue = C;
if (isa<GlobalVariable>(C)) {
- // If the constant C is a global variable and is found in GVMap, generate a
- // set set of instructions that convert the clone of C with the global
- // address space specifier to a generic pointer.
- // The constant C cannot be used here, as it will be erased from the
- // module eventually. And the clone of C with the global address space
- // specifier cannot be used here either, as it will affect the types of
- // other instructions in the function. Hence, this address space conversion
- // is required.
+ // If the constant C is a global variable and is found in GVMap, substitute
+ //
+ // addrspacecast GVMap[C] to addrspace(0)
+ //
+ // for our use of C.
GVMapTy::iterator I = GVMap.find(cast<GlobalVariable>(C));
if (I != GVMap.end()) {
- NewValue = getOrInsertCVTA(M, F, I->second, Builder);
+ GlobalVariable *GV = I->second;
+ NewValue = Builder.CreateAddrSpaceCast(
+ GV,
+ PointerType::get(GV->getValueType(), llvm::ADDRESS_SPACE_GENERIC));
}
} else if (isa<ConstantAggregate>(C)) {
// If any element in the constant vector or aggregate C is or uses a global
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 57e2acc0d7e0..4dfa8477a362 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -66,6 +66,10 @@ bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
return TL->allowUnsafeFPMath(*MF);
}
+bool NVPTXDAGToDAGISel::useShortPointers() const {
+ return TM.useShortPointers();
+}
+
/// Select - Select instructions not customized! Used for
/// expanded, promoted and normal instructions.
void NVPTXDAGToDAGISel::Select(SDNode *N) {
@@ -496,325 +500,11 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
SelectCode(N);
}
-// Each instruction has four addressing variants. WMMA_VARIANTS() macro below
-// constructs an array indexed by WmmaVariant which getWmmaLdVariant() uses to
-// look up the intrinsic ID of particular variant.
-enum WmmaVariant {
- WMMA_VARIANT_ARI64,
- WMMA_VARIANT_ARI64_STRIDE,
- WMMA_VARIANT_AVAR,
- WMMA_VARIANT_AVAR_STRIDE,
-};
-
-// clang-format off
-#define WMMA_VARIANTS(base) \
- {{ base##_ari64, base##_ari64_stride, base##_avar, base##_avar_stride }}
-// clang-format on
-
-static unsigned getWmmaLdVariant(WmmaVariant Variant, bool Stride,
- const std::array<unsigned, 4> Variants) {
- if (Stride) {
- if (Variant == WMMA_VARIANT_ARI64)
- Variant = WMMA_VARIANT_ARI64_STRIDE;
- else if (Variant == WMMA_VARIANT_AVAR)
- Variant = WMMA_VARIANT_AVAR_STRIDE;
- }
- return Variants[Variant];
-}
-
-static Optional<unsigned>
-getWmmaLdStOpcode(unsigned IntrinsicID,
- WmmaVariant Variant = WMMA_VARIANT_ARI64) {
- switch (IntrinsicID) {
- default:
- return None;
- //
- // WMMA_LOAD_A f16
- //
- case Intrinsic::nvvm_wmma_load_a_f16_col:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_col));
- case Intrinsic::nvvm_wmma_load_a_f16_row:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_row));
- case Intrinsic::nvvm_wmma_load_a_f16_col_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_col));
- case Intrinsic::nvvm_wmma_load_a_f16_row_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_row));
- case Intrinsic::nvvm_wmma_load_a_f16_col_shared:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_col_shared));
- case Intrinsic::nvvm_wmma_load_a_f16_row_shared:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_row_shared));
- case Intrinsic::nvvm_wmma_load_a_f16_col_shared_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_col_shared));
- case Intrinsic::nvvm_wmma_load_a_f16_row_shared_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_row_shared));
- case Intrinsic::nvvm_wmma_load_a_f16_col_global:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_col_global));
- case Intrinsic::nvvm_wmma_load_a_f16_row_global:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_row_global));
- case Intrinsic::nvvm_wmma_load_a_f16_col_global_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_col_global));
- case Intrinsic::nvvm_wmma_load_a_f16_row_global_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_row_global));
-
- //
- // WMMA_LOAD_B f16
- //
- case Intrinsic::nvvm_wmma_load_b_f16_col:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_col));
- case Intrinsic::nvvm_wmma_load_b_f16_row:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_row));
- case Intrinsic::nvvm_wmma_load_b_f16_col_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_col));
- case Intrinsic::nvvm_wmma_load_b_f16_row_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_row));
- case Intrinsic::nvvm_wmma_load_b_f16_col_shared:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_col_shared));
- case Intrinsic::nvvm_wmma_load_b_f16_row_shared:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_row_shared));
- case Intrinsic::nvvm_wmma_load_b_f16_col_shared_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_col_shared));
- case Intrinsic::nvvm_wmma_load_b_f16_row_shared_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_row_shared));
- case Intrinsic::nvvm_wmma_load_b_f16_col_global:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_col_global));
- case Intrinsic::nvvm_wmma_load_b_f16_row_global:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_row_global));
- case Intrinsic::nvvm_wmma_load_b_f16_col_global_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_col_global));
- case Intrinsic::nvvm_wmma_load_b_f16_row_global_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_row_global));
-
- //
- // WMMA_LOAD_C f16
- //
- case Intrinsic::nvvm_wmma_load_c_f16_col:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_col));
- case Intrinsic::nvvm_wmma_load_c_f16_row:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_row));
- case Intrinsic::nvvm_wmma_load_c_f16_col_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_col));
- case Intrinsic::nvvm_wmma_load_c_f16_row_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_row));
- case Intrinsic::nvvm_wmma_load_c_f16_col_shared:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_col_shared));
- case Intrinsic::nvvm_wmma_load_c_f16_row_shared:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_row_shared));
- case Intrinsic::nvvm_wmma_load_c_f16_col_shared_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_col_shared));
- case Intrinsic::nvvm_wmma_load_c_f16_row_shared_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_row_shared));
- case Intrinsic::nvvm_wmma_load_c_f16_col_global:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_col_global));
- case Intrinsic::nvvm_wmma_load_c_f16_row_global:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_row_global));
- case Intrinsic::nvvm_wmma_load_c_f16_col_global_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_col_global));
- case Intrinsic::nvvm_wmma_load_c_f16_row_global_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_row_global));
-
- //
- // WMMA_LOAD_C f32
- //
- case Intrinsic::nvvm_wmma_load_c_f32_col:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_col));
- case Intrinsic::nvvm_wmma_load_c_f32_row:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_row));
- case Intrinsic::nvvm_wmma_load_c_f32_col_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_col));
- case Intrinsic::nvvm_wmma_load_c_f32_row_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_row));
- case Intrinsic::nvvm_wmma_load_c_f32_col_shared:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_col_shared));
- case Intrinsic::nvvm_wmma_load_c_f32_row_shared:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_row_shared));
- case Intrinsic::nvvm_wmma_load_c_f32_col_shared_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_col_shared));
- case Intrinsic::nvvm_wmma_load_c_f32_row_shared_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_row_shared));
- case Intrinsic::nvvm_wmma_load_c_f32_col_global:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_col_global));
- case Intrinsic::nvvm_wmma_load_c_f32_row_global:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_row_global));
- case Intrinsic::nvvm_wmma_load_c_f32_col_global_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_col_global));
- case Intrinsic::nvvm_wmma_load_c_f32_row_global_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_row_global));
-
- //
- // WMMA_STORE_D f16
- //
- case Intrinsic::nvvm_wmma_store_d_f16_col:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_col));
- case Intrinsic::nvvm_wmma_store_d_f16_row:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_row));
- case Intrinsic::nvvm_wmma_store_d_f16_col_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_col));
- case Intrinsic::nvvm_wmma_store_d_f16_row_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_row));
- case Intrinsic::nvvm_wmma_store_d_f16_col_shared:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_col_shared));
- case Intrinsic::nvvm_wmma_store_d_f16_row_shared:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_row_shared));
- case Intrinsic::nvvm_wmma_store_d_f16_col_shared_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_col_shared));
- case Intrinsic::nvvm_wmma_store_d_f16_row_shared_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_row_shared));
- case Intrinsic::nvvm_wmma_store_d_f16_col_global:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_col_global));
- case Intrinsic::nvvm_wmma_store_d_f16_row_global:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_row_global));
- case Intrinsic::nvvm_wmma_store_d_f16_col_global_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_col_global));
- case Intrinsic::nvvm_wmma_store_d_f16_row_global_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_row_global));
-
- //
- // WMMA_STORE_D f32
- //
- case Intrinsic::nvvm_wmma_store_d_f32_col:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_col));
- case Intrinsic::nvvm_wmma_store_d_f32_row:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_row));
- case Intrinsic::nvvm_wmma_store_d_f32_col_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_col));
- case Intrinsic::nvvm_wmma_store_d_f32_row_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_row));
- case Intrinsic::nvvm_wmma_store_d_f32_col_shared:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_col_shared));
- case Intrinsic::nvvm_wmma_store_d_f32_row_shared:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_row_shared));
- case Intrinsic::nvvm_wmma_store_d_f32_col_shared_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_col_shared));
- case Intrinsic::nvvm_wmma_store_d_f32_row_shared_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_row_shared));
- case Intrinsic::nvvm_wmma_store_d_f32_col_global:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_col_global));
- case Intrinsic::nvvm_wmma_store_d_f32_row_global:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_row_global));
- case Intrinsic::nvvm_wmma_store_d_f32_col_global_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_col_global));
- case Intrinsic::nvvm_wmma_store_d_f32_row_global_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_row_global));
- }
-}
-#undef WMMA_VARIANTS
-
bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
- if (getWmmaLdStOpcode(IID))
- return tryWMMA_LDST(N);
-
switch (IID) {
default:
return false;
- case Intrinsic::nvvm_match_all_sync_i32p:
- case Intrinsic::nvvm_match_all_sync_i64p:
- SelectMatchAll(N);
- return true;
case Intrinsic::nvvm_ldg_global_f:
case Intrinsic::nvvm_ldg_global_i:
case Intrinsic::nvvm_ldg_global_p:
@@ -987,8 +677,10 @@ static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
// We have two ways of identifying invariant loads: Loads may be explicitly
// marked as invariant, or we may infer them to be invariant.
//
- // We currently infer invariance only for kernel function pointer params that
- // are noalias (i.e. __restrict) and never written to.
+ // We currently infer invariance for loads from
+ // - constant global variables, and
+ // - kernel function pointer params that are noalias (i.e. __restrict) and
+ // never written to.
//
// TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
// not during the SelectionDAG phase).
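Aside (not from the patch): the comment above describes the newly inferable case. A hypothetical C++ translation unit in which the load's underlying object is a constant global variable, using invented names:

    // All identifiers here are invented for illustration.
    static const int LookupTable[4] = {2, 3, 5, 7}; // lowers to a constant GlobalVariable

    int readEntry(unsigned I) {
      // GetUnderlyingObjects() traces this load back to LookupTable;
      // GV->isConstant() holds, so the load can be inferred invariant and,
      // when the remaining checks in canLowerToLDG() pass, lowered to ld.global.nc.
      return LookupTable[I & 3];
    }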
@@ -1002,23 +694,22 @@ static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
if (N->isInvariant())
return true;
- // Load wasn't explicitly invariant. Attempt to infer invariance.
- if (!isKernelFunction(F->getFunction()))
- return false;
+ bool IsKernelFn = isKernelFunction(F->getFunction());
- // We use GetUnderlyingObjects() here instead of
- // GetUnderlyingObject() mainly because the former looks through phi
- // nodes while the latter does not. We need to look through phi
- // nodes to handle pointer induction variables.
+ // We use GetUnderlyingObjects() here instead of GetUnderlyingObject() mainly
+ // because the former looks through phi nodes while the latter does not. We
+ // need to look through phi nodes to handle pointer induction variables.
SmallVector<Value *, 8> Objs;
GetUnderlyingObjects(const_cast<Value *>(N->getMemOperand()->getValue()),
Objs, F->getDataLayout());
- for (Value *Obj : Objs) {
- auto *A = dyn_cast<const Argument>(Obj);
- if (!A || !A->onlyReadsMemory() || !A->hasNoAliasAttr()) return false;
- }
- return true;
+ return all_of(Objs, [&](Value *V) {
+ if (auto *A = dyn_cast<const Argument>(V))
+ return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
+ if (auto *GV = dyn_cast<const GlobalVariable>(V))
+ return GV->isConstant();
+ return false;
+ });
}
bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
@@ -1029,39 +720,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
case Intrinsic::nvvm_texsurf_handle_internal:
SelectTexSurfHandle(N);
return true;
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f16:
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f16_satfinite:
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f32:
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f32_satfinite:
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f16:
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f16_satfinite:
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f32:
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f32_satfinite:
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f16:
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f16_satfinite:
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f32:
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f32_satfinite:
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f16:
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f16_satfinite:
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f32:
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f32_satfinite:
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f16:
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f16_satfinite:
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f32:
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f32_satfinite:
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f16:
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f16_satfinite:
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f32:
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f32_satfinite:
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f16:
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f16_satfinite:
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f32:
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f32_satfinite:
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f16:
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f16_satfinite:
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f32:
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f32_satfinite:
- return tryWMMA_MMA(N);
}
}
@@ -1073,42 +731,11 @@ void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
MVT::i64, GlobalVal));
}
-void NVPTXDAGToDAGISel::SelectMatchAll(SDNode *N) {
- SDLoc DL(N);
- enum { IS_I64 = 4, HAS_CONST_VALUE = 2, HAS_CONST_MASK = 1 };
- unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
- unsigned OpcodeIndex =
- (IID == Intrinsic::nvvm_match_all_sync_i64p) ? IS_I64 : 0;
- SDValue MaskOp = N->getOperand(2);
- SDValue ValueOp = N->getOperand(3);
- if (ConstantSDNode *ValueConst = dyn_cast<ConstantSDNode>(ValueOp)) {
- OpcodeIndex |= HAS_CONST_VALUE;
- ValueOp = CurDAG->getTargetConstant(ValueConst->getZExtValue(), DL,
- ValueConst->getValueType(0));
- }
- if (ConstantSDNode *MaskConst = dyn_cast<ConstantSDNode>(MaskOp)) {
- OpcodeIndex |= HAS_CONST_MASK;
- MaskOp = CurDAG->getTargetConstant(MaskConst->getZExtValue(), DL,
- MaskConst->getValueType(0));
- }
- // Maps {IS_I64, HAS_CONST_VALUE, HAS_CONST_MASK} -> opcode
- unsigned Opcodes[8] = {
- NVPTX::MATCH_ALLP_SYNC_32rr, NVPTX::MATCH_ALLP_SYNC_32ri,
- NVPTX::MATCH_ALLP_SYNC_32ir, NVPTX::MATCH_ALLP_SYNC_32ii,
- NVPTX::MATCH_ALLP_SYNC_64rr, NVPTX::MATCH_ALLP_SYNC_64ri,
- NVPTX::MATCH_ALLP_SYNC_64ir, NVPTX::MATCH_ALLP_SYNC_64ii};
- SDNode *NewNode = CurDAG->getMachineNode(
- Opcodes[OpcodeIndex], DL, {ValueOp->getValueType(0), MVT::i1, MVT::Other},
- {MaskOp, ValueOp});
- ReplaceNode(N, NewNode);
-}
-
void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
SDValue Src = N->getOperand(0);
AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
unsigned DstAddrSpace = CastN->getDestAddressSpace();
-
assert(SrcAddrSpace != DstAddrSpace &&
"addrspacecast must be between different address spaces");
@@ -1121,13 +748,19 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
Opc = TM.is64Bit() ? NVPTX::cvta_global_yes_64 : NVPTX::cvta_global_yes;
break;
case ADDRESS_SPACE_SHARED:
- Opc = TM.is64Bit() ? NVPTX::cvta_shared_yes_64 : NVPTX::cvta_shared_yes;
+ Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_shared_yes_6432
+ : NVPTX::cvta_shared_yes_64)
+ : NVPTX::cvta_shared_yes;
break;
case ADDRESS_SPACE_CONST:
- Opc = TM.is64Bit() ? NVPTX::cvta_const_yes_64 : NVPTX::cvta_const_yes;
+ Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_const_yes_6432
+ : NVPTX::cvta_const_yes_64)
+ : NVPTX::cvta_const_yes;
break;
case ADDRESS_SPACE_LOCAL:
- Opc = TM.is64Bit() ? NVPTX::cvta_local_yes_64 : NVPTX::cvta_local_yes;
+ Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_local_yes_6432
+ : NVPTX::cvta_local_yes_64)
+ : NVPTX::cvta_local_yes;
break;
}
ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
@@ -1145,16 +778,19 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
: NVPTX::cvta_to_global_yes;
break;
case ADDRESS_SPACE_SHARED:
- Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_yes_64
+ Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_shared_yes_3264
+ : NVPTX::cvta_to_shared_yes_64)
: NVPTX::cvta_to_shared_yes;
break;
case ADDRESS_SPACE_CONST:
- Opc =
- TM.is64Bit() ? NVPTX::cvta_to_const_yes_64 : NVPTX::cvta_to_const_yes;
+ Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_const_yes_3264
+ : NVPTX::cvta_to_const_yes_64)
+ : NVPTX::cvta_to_const_yes;
break;
case ADDRESS_SPACE_LOCAL:
- Opc =
- TM.is64Bit() ? NVPTX::cvta_to_local_yes_64 : NVPTX::cvta_to_local_yes;
+ Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_local_yes_3264
+ : NVPTX::cvta_to_local_yes_64)
+ : NVPTX::cvta_to_local_yes;
break;
case ADDRESS_SPACE_PARAM:
Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
@@ -1210,18 +846,20 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
return false;
// Address Space Setting
- unsigned int codeAddrSpace = getCodeAddrSpace(LD);
-
- if (canLowerToLDG(LD, *Subtarget, codeAddrSpace, MF)) {
+ unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
+ if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
return tryLDGLDU(N);
}
+ unsigned int PointerSize =
+ CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());
+
// Volatile Setting
// - .volatile is only available for .global and .shared
bool isVolatile = LD->isVolatile();
- if (codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
- codeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
- codeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+ if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+ CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+ CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
isVolatile = false;
// Type Setting: fromType + fromTypeWidth
@@ -1268,27 +906,27 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
if (!Opcode)
return false;
- SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
+ SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl), getI32Imm(fromType, dl),
getI32Imm(fromTypeWidth, dl), Addr, Chain };
NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
MVT::Other, Ops);
- } else if (TM.is64Bit() ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
- : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
+ } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
+ : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
NVPTX::LD_f16_asi, NVPTX::LD_f16x2_asi,
NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
if (!Opcode)
return false;
- SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
+ SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl), getI32Imm(fromType, dl),
getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
MVT::Other, Ops);
- } else if (TM.is64Bit() ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
- : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
- if (TM.is64Bit())
+ } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
+ : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
+ if (PointerSize == 64)
Opcode = pickOpcodeForVT(
TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64, NVPTX::LD_f16_ari_64,
@@ -1300,13 +938,13 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
if (!Opcode)
return false;
- SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
+ SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl), getI32Imm(fromType, dl),
getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
MVT::Other, Ops);
} else {
- if (TM.is64Bit())
+ if (PointerSize == 64)
Opcode = pickOpcodeForVT(
TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64, NVPTX::LD_f16_areg_64,
@@ -1319,7 +957,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
if (!Opcode)
return false;
- SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
+ SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl), getI32Imm(fromType, dl),
getI32Imm(fromTypeWidth, dl), N1, Chain };
NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
@@ -1353,11 +991,13 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
// Address Space Setting
unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
-
if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
return tryLDGLDU(N);
}
+ unsigned int PointerSize =
+ CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
+
// Volatile Setting
  // - .volatile is only available for .global and .shared
bool IsVolatile = MemSD->isVolatile();
@@ -1440,8 +1080,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
getI32Imm(VecType, DL), getI32Imm(FromType, DL),
getI32Imm(FromTypeWidth, DL), Addr, Chain };
LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
- } else if (TM.is64Bit() ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
- : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
+ } else if (PointerSize == 64
+ ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
+ : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
switch (N->getOpcode()) {
default:
return false;
@@ -1466,9 +1107,10 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
getI32Imm(VecType, DL), getI32Imm(FromType, DL),
getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
- } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
- : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
- if (TM.is64Bit()) {
+ } else if (PointerSize == 64
+ ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
+ : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
+ if (PointerSize == 64) {
switch (N->getOpcode()) {
default:
return false;
@@ -1516,7 +1158,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
} else {
- if (TM.is64Bit()) {
+ if (PointerSize == 64) {
switch (N->getOpcode()) {
default:
return false;
@@ -1615,6 +1257,12 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
if (EltVT.isVector()) {
NumElts = EltVT.getVectorNumElements();
EltVT = EltVT.getVectorElementType();
+ // vectors of f16 are loaded/stored as multiples of v2f16 elements.
+ if (EltVT == MVT::f16 && N->getValueType(0) == MVT::v2f16) {
+ assert(NumElts % 2 == 0 && "Vector must have even number of elements");
+ EltVT = MVT::v2f16;
+ NumElts /= 2;
+ }
}
// Build the "promoted" result VTList for the load. If we are really loading
@@ -1632,6 +1280,7 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
switch (N->getOpcode()) {
default:
return false;
+ case ISD::LOAD:
case ISD::INTRINSIC_W_CHAIN:
if (IsLDG)
Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
@@ -1654,6 +1303,7 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
break;
+ case NVPTXISD::LoadV2:
case NVPTXISD::LDGV2:
Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
@@ -1676,6 +1326,7 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
break;
+ case NVPTXISD::LoadV4:
case NVPTXISD::LDGV4:
Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
@@ -2052,14 +1703,16 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
return false;
// Address Space Setting
- unsigned int codeAddrSpace = getCodeAddrSpace(ST);
+ unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
+ unsigned int PointerSize =
+ CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());
// Volatile Setting
  // - .volatile is only available for .global and .shared
bool isVolatile = ST->isVolatile();
- if (codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
- codeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
- codeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+ if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+ CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+ CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
isVolatile = false;
// Vector Setting
@@ -2102,12 +1755,12 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!Opcode)
return false;
SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
- getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
+ getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl),
getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Addr,
Chain };
NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
- } else if (TM.is64Bit() ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
- : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
+ } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
+ : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
NVPTX::ST_f16_asi, NVPTX::ST_f16x2_asi,
@@ -2115,13 +1768,13 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!Opcode)
return false;
SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
- getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
+ getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl),
getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Base,
Offset, Chain };
NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
- } else if (TM.is64Bit() ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
- : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
- if (TM.is64Bit())
+ } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
+ : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
+ if (PointerSize == 64)
Opcode = pickOpcodeForVT(
SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64, NVPTX::ST_f16_ari_64,
@@ -2135,12 +1788,12 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
return false;
SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
- getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
+ getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl),
getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Base,
Offset, Chain };
NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
} else {
- if (TM.is64Bit())
+ if (PointerSize == 64)
Opcode =
pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,
@@ -2154,7 +1807,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!Opcode)
return false;
SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
- getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
+ getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl),
getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), N2,
Chain };
NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
@@ -2183,11 +1836,12 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
// Address Space Setting
unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
-
if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
report_fatal_error("Cannot store to pointer that points to constant "
"memory space");
}
+ unsigned int PointerSize =
+ CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
// Volatile Setting
  // - .volatile is only available for .global and .shared
@@ -2268,8 +1922,8 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
break;
}
StOps.push_back(Addr);
- } else if (TM.is64Bit() ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
- : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
+ } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
+ : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
switch (N->getOpcode()) {
default:
return false;
@@ -2290,9 +1944,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
}
StOps.push_back(Base);
StOps.push_back(Offset);
- } else if (TM.is64Bit() ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
- : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
- if (TM.is64Bit()) {
+ } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
+ : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
+ if (PointerSize == 64) {
switch (N->getOpcode()) {
default:
return false;
@@ -2335,7 +1989,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
StOps.push_back(Base);
StOps.push_back(Offset);
} else {
- if (TM.is64Bit()) {
+ if (PointerSize == 64) {
switch (N->getOpcode()) {
default:
return false;
@@ -4068,172 +3722,3 @@ unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
}
}
}
-
-bool NVPTXDAGToDAGISel::tryWMMA_LDST(SDNode *N) {
- SDValue Chain = N->getOperand(0);
- unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
- SDValue Op1 = N->getOperand(2);
- SDValue Addr, Offset, Base;
- Optional<unsigned> Opcode;
- SDLoc DL(N);
- MemSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
- WmmaVariant Variant;
- SmallVector<SDValue, 12> Ops;
- bool isStore = N->getNumValues() == 1; // Store ops only return a chain.
-
- if (SelectDirectAddr(Op1, Addr)) {
- Variant = WMMA_VARIANT_AVAR;
- Ops.push_back(Addr);
- } else if (SelectADDRsi64(Op1.getNode(), Op1, Base, Offset) ||
- SelectADDRri64(Op1.getNode(), Op1, Base, Offset)) {
- Variant = WMMA_VARIANT_ARI64;
- Ops.push_back(Base);
- Ops.push_back(Offset);
- } else {
- Variant = WMMA_VARIANT_AVAR;
- Ops.push_back(Op1);
- }
- unsigned NumOps = N->getNumOperands();
- // Pass through the rest of the operands to the machine node.
- for (unsigned i = 3; i < NumOps; ++i)
- Ops.push_back(N->getOperand(i));
- Ops.push_back(Chain);
-
- Opcode = getWmmaLdStOpcode(IID, Variant);
- if (!Opcode) {
- llvm::errs() << "tryWMMALD - no Opcode.\n";
- return false;
- }
-
- EVT MemVT = MemSD->getMemoryVT();
- assert(MemVT.isVector() && "Expected vector return type.");
-
- SDNode *MN;
- if (isStore) {
- MN = CurDAG->getMachineNode(Opcode.getValue(), DL, MVT::Other, Ops);
- } else {
- SmallVector<EVT, 9> InstVTs(MemVT.getVectorNumElements(),
- MemSD->getValueType(0));
- InstVTs.push_back(MVT::Other);
- MN = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTs, Ops);
- }
-
- ReplaceNode(N, MN);
- return true;
-}
-
-bool NVPTXDAGToDAGISel::tryWMMA_MMA(SDNode *N) {
- unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
- SDLoc DL(N);
- unsigned Opc;
-
- switch (IID) {
- default:
- return false;
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f16:
- Opc = NVPTX::INT_WMMA_MMA_col_col_f16_f16;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f16_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_col_col_f16_f16_satfinite;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f32:
- Opc = NVPTX::INT_WMMA_MMA_col_col_f16_f32;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f32_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_col_col_f16_f32_satfinite;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f16:
- Opc = NVPTX::INT_WMMA_MMA_col_col_f32_f16;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f16_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_col_col_f32_f16_satfinite;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f32:
- Opc = NVPTX::INT_WMMA_MMA_col_col_f32_f32;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f32_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_col_col_f32_f32_satfinite;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f16:
- Opc = NVPTX::INT_WMMA_MMA_col_row_f16_f16;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f16_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_col_row_f16_f16_satfinite;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f32:
- Opc = NVPTX::INT_WMMA_MMA_col_row_f16_f32;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f32_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_col_row_f16_f32_satfinite;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f16:
- Opc = NVPTX::INT_WMMA_MMA_col_row_f32_f16;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f16_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_col_row_f32_f16_satfinite;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f32:
- Opc = NVPTX::INT_WMMA_MMA_col_row_f32_f32;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f32_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_col_row_f32_f32_satfinite;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f16:
- Opc = NVPTX::INT_WMMA_MMA_row_col_f16_f16;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f16_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_row_col_f16_f16_satfinite;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f32:
- Opc = NVPTX::INT_WMMA_MMA_row_col_f16_f32;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f32_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_row_col_f16_f32_satfinite;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f16:
- Opc = NVPTX::INT_WMMA_MMA_row_col_f32_f16;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f16_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_row_col_f32_f16_satfinite;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f32:
- Opc = NVPTX::INT_WMMA_MMA_row_col_f32_f32;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f32_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_row_col_f32_f32_satfinite;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f16:
- Opc = NVPTX::INT_WMMA_MMA_row_row_f16_f16;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f16_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_row_row_f16_f16_satfinite;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f32:
- Opc = NVPTX::INT_WMMA_MMA_row_row_f16_f32;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f32_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_row_row_f16_f32_satfinite;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f16:
- Opc = NVPTX::INT_WMMA_MMA_row_row_f32_f16;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f16_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_row_row_f32_f16_satfinite;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f32:
- Opc = NVPTX::INT_WMMA_MMA_row_row_f32_f32;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f32_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_row_row_f32_f32_satfinite;
- break;
- }
-
- SmallVector<SDValue, 24> Ops;
- // Pass through operands and return value types to the machine node.
- for (unsigned i = 1; i < N->getNumOperands(); ++i)
- Ops.push_back(N->getOperand(i));
- SmallVector<EVT, 8> InstVTs(N->getNumValues(), N->getValueType(0));
- SDNode *MN = CurDAG->getMachineNode(Opc, DL, InstVTs, Ops);
- ReplaceNode(N, MN);
- return true;
-}
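Hedged aside: the removed tryWMMA_LDST/tryWMMA_MMA routines mapped each nvvm_wmma intrinsic ID to a machine opcode by hand; judging from the new ADDRvar complex pattern added further down in NVPTXInstrInfo.td, that selection appears to move into generated patterns instead. Purely to illustrate the shape of such an ID-to-opcode mapping, a table-driven lookup with made-up IDs and opcodes (not LLVM definitions):

#include <optional>
#include <unordered_map>

std::optional<unsigned> lookupWmmaOpcode(unsigned IntrinsicID) {
  // Hypothetical entries standing in for the long per-intrinsic switch above.
  static const std::unordered_map<unsigned, unsigned> Map = {
      {1, 100}, // e.g. col_col_f16_f16      -> its machine opcode
      {2, 101}, // e.g. col_col_f16_f16_satf -> its machine opcode
  };
  auto It = Map.find(IntrinsicID);
  if (It == Map.end())
    return std::nullopt;
  return It->second;
}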
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index b23c27581a17..e911ba0c167d 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -35,6 +35,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
bool useF32FTZ() const;
bool allowFMA() const;
bool allowUnsafeFPMath() const;
+ bool useShortPointers() const;
public:
explicit NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
@@ -58,7 +59,6 @@ private:
bool tryIntrinsicNoChain(SDNode *N);
bool tryIntrinsicChain(SDNode *N);
void SelectTexSurfHandle(SDNode *N);
- void SelectMatchAll(SDNode *N);
bool tryLoad(SDNode *N);
bool tryLoadVector(SDNode *N);
bool tryLDGLDU(SDNode *N);
@@ -74,8 +74,6 @@ private:
bool tryConstantFP16(SDNode *N);
bool SelectSETP_F16X2(SDNode *N);
bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N);
- bool tryWMMA_LDST(SDNode *N);
- bool tryWMMA_MMA(SDNode *N);
inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
@@ -90,7 +88,6 @@ private:
SDValue &Offset);
bool SelectADDRri64(SDNode *OpNode, SDValue Addr, SDValue &Base,
SDValue &Offset);
-
bool SelectADDRsi_imp(SDNode *OpNode, SDValue Addr, SDValue &Base,
SDValue &Offset, MVT mvt);
bool SelectADDRsi(SDNode *OpNode, SDValue Addr, SDValue &Base,
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index f1e4251a44b5..2536623fb853 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -15,7 +15,6 @@
#include "NVPTXISelLowering.h"
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTX.h"
-#include "NVPTXSection.h"
#include "NVPTXSubtarget.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXTargetObjectFile.h"
@@ -26,7 +25,6 @@
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
@@ -49,6 +47,7 @@
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -376,29 +375,19 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::FP_TO_SINT, MVT::f16, Legal);
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);
setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
// Operations not directly supported by NVPTX.
- setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::v2f16, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i8, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
- setOperationAction(ISD::BR_CC, MVT::f16, Expand);
- setOperationAction(ISD::BR_CC, MVT::v2f16, Expand);
- setOperationAction(ISD::BR_CC, MVT::f32, Expand);
- setOperationAction(ISD::BR_CC, MVT::f64, Expand);
- setOperationAction(ISD::BR_CC, MVT::i1, Expand);
- setOperationAction(ISD::BR_CC, MVT::i8, Expand);
- setOperationAction(ISD::BR_CC, MVT::i16, Expand);
- setOperationAction(ISD::BR_CC, MVT::i32, Expand);
- setOperationAction(ISD::BR_CC, MVT::i64, Expand);
+ for (MVT VT : {MVT::f16, MVT::v2f16, MVT::f32, MVT::f64, MVT::i1, MVT::i8,
+ MVT::i16, MVT::i32, MVT::i64}) {
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
+ setOperationAction(ISD::BR_CC, VT, Expand);
+ }
+
// Some SIGN_EXTEND_INREG can be done using cvt instruction.
// For others we will expand to a SHL/SRA pair.
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
@@ -417,20 +406,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
- if (STI.hasROT64()) {
- setOperationAction(ISD::ROTL, MVT::i64, Legal);
- setOperationAction(ISD::ROTR, MVT::i64, Legal);
- } else {
- setOperationAction(ISD::ROTL, MVT::i64, Expand);
- setOperationAction(ISD::ROTR, MVT::i64, Expand);
- }
- if (STI.hasROT32()) {
- setOperationAction(ISD::ROTL, MVT::i32, Legal);
- setOperationAction(ISD::ROTR, MVT::i32, Legal);
- } else {
- setOperationAction(ISD::ROTL, MVT::i32, Expand);
- setOperationAction(ISD::ROTR, MVT::i32, Expand);
- }
+ // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs
+ // that don't have h/w rotation we lower them to multi-instruction assembly.
+ // See ROT*_sw in NVPTXIntrInfo.td
+ setOperationAction(ISD::ROTL, MVT::i64, Legal);
+ setOperationAction(ISD::ROTR, MVT::i64, Legal);
+ setOperationAction(ISD::ROTL, MVT::i32, Legal);
+ setOperationAction(ISD::ROTR, MVT::i32, Legal);
setOperationAction(ISD::ROTL, MVT::i16, Expand);
setOperationAction(ISD::ROTR, MVT::i16, Expand);
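Hedged aside: the TODO above keeps ROTL/ROTR marked Legal even on GPUs without hardware rotation, where the ROT*_sw definitions expand them to a short shift/or sequence. For reference, a minimal standalone sketch of that kind of software rotate in plain C++ (not the backend's code):

#include <cstdint>

// Software rotate-left: two shifts combined with OR, the usual fallback when
// no native rotate instruction is available. Masking keeps both shift
// amounts in range, so N == 0 is well defined.
static uint32_t rotl32(uint32_t X, unsigned N) {
  N &= 31;
  return (X << N) | (X >> ((32 - N) & 31));
}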
@@ -486,9 +468,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// TRAP can be lowered to PTX trap
setOperationAction(ISD::TRAP, MVT::Other, Legal);
- setOperationAction(ISD::ADDC, MVT::i64, Expand);
- setOperationAction(ISD::ADDE, MVT::i64, Expand);
-
// Register custom handling for vector loads/stores
for (MVT VT : MVT::vector_valuetypes()) {
if (IsPTXVectorType(VT)) {
@@ -1251,9 +1230,9 @@ SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
SDValue
NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
- const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
- auto PtrVT = getPointerTy(DAG.getDataLayout());
- Op = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
+ const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
+ auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
+ Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
}
@@ -3330,30 +3309,30 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
// Our result depends on both our and other thread's arguments.
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
return true;
- case Intrinsic::nvvm_wmma_load_a_f16_col:
- case Intrinsic::nvvm_wmma_load_a_f16_row:
- case Intrinsic::nvvm_wmma_load_a_f16_col_stride:
- case Intrinsic::nvvm_wmma_load_a_f16_row_stride:
- case Intrinsic::nvvm_wmma_load_a_f16_col_shared:
- case Intrinsic::nvvm_wmma_load_a_f16_row_shared:
- case Intrinsic::nvvm_wmma_load_a_f16_col_shared_stride:
- case Intrinsic::nvvm_wmma_load_a_f16_row_shared_stride:
- case Intrinsic::nvvm_wmma_load_a_f16_col_global:
- case Intrinsic::nvvm_wmma_load_a_f16_row_global:
- case Intrinsic::nvvm_wmma_load_a_f16_col_global_stride:
- case Intrinsic::nvvm_wmma_load_a_f16_row_global_stride:
- case Intrinsic::nvvm_wmma_load_b_f16_col:
- case Intrinsic::nvvm_wmma_load_b_f16_row:
- case Intrinsic::nvvm_wmma_load_b_f16_col_stride:
- case Intrinsic::nvvm_wmma_load_b_f16_row_stride:
- case Intrinsic::nvvm_wmma_load_b_f16_col_shared:
- case Intrinsic::nvvm_wmma_load_b_f16_row_shared:
- case Intrinsic::nvvm_wmma_load_b_f16_col_shared_stride:
- case Intrinsic::nvvm_wmma_load_b_f16_row_shared_stride:
- case Intrinsic::nvvm_wmma_load_b_f16_col_global:
- case Intrinsic::nvvm_wmma_load_b_f16_row_global:
- case Intrinsic::nvvm_wmma_load_b_f16_col_global_stride:
- case Intrinsic::nvvm_wmma_load_b_f16_row_global_stride: {
+ case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::v8f16;
Info.ptrVal = I.getArgOperand(0);
@@ -3363,18 +3342,18 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
return true;
}
- case Intrinsic::nvvm_wmma_load_c_f16_col:
- case Intrinsic::nvvm_wmma_load_c_f16_row:
- case Intrinsic::nvvm_wmma_load_c_f16_col_stride:
- case Intrinsic::nvvm_wmma_load_c_f16_row_stride:
- case Intrinsic::nvvm_wmma_load_c_f16_col_shared:
- case Intrinsic::nvvm_wmma_load_c_f16_row_shared:
- case Intrinsic::nvvm_wmma_load_c_f16_col_shared_stride:
- case Intrinsic::nvvm_wmma_load_c_f16_row_shared_stride:
- case Intrinsic::nvvm_wmma_load_c_f16_col_global:
- case Intrinsic::nvvm_wmma_load_c_f16_row_global:
- case Intrinsic::nvvm_wmma_load_c_f16_col_global_stride:
- case Intrinsic::nvvm_wmma_load_c_f16_row_global_stride: {
+ case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::v4f16;
Info.ptrVal = I.getArgOperand(0);
@@ -3384,18 +3363,18 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
return true;
}
- case Intrinsic::nvvm_wmma_load_c_f32_col:
- case Intrinsic::nvvm_wmma_load_c_f32_row:
- case Intrinsic::nvvm_wmma_load_c_f32_col_stride:
- case Intrinsic::nvvm_wmma_load_c_f32_row_stride:
- case Intrinsic::nvvm_wmma_load_c_f32_col_shared:
- case Intrinsic::nvvm_wmma_load_c_f32_row_shared:
- case Intrinsic::nvvm_wmma_load_c_f32_col_shared_stride:
- case Intrinsic::nvvm_wmma_load_c_f32_row_shared_stride:
- case Intrinsic::nvvm_wmma_load_c_f32_col_global:
- case Intrinsic::nvvm_wmma_load_c_f32_row_global:
- case Intrinsic::nvvm_wmma_load_c_f32_col_global_stride:
- case Intrinsic::nvvm_wmma_load_c_f32_row_global_stride: {
+ case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::v8f32;
Info.ptrVal = I.getArgOperand(0);
@@ -3405,19 +3384,19 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
return true;
}
- case Intrinsic::nvvm_wmma_store_d_f16_col:
- case Intrinsic::nvvm_wmma_store_d_f16_row:
- case Intrinsic::nvvm_wmma_store_d_f16_col_stride:
- case Intrinsic::nvvm_wmma_store_d_f16_row_stride:
- case Intrinsic::nvvm_wmma_store_d_f16_col_shared:
- case Intrinsic::nvvm_wmma_store_d_f16_row_shared:
- case Intrinsic::nvvm_wmma_store_d_f16_col_shared_stride:
- case Intrinsic::nvvm_wmma_store_d_f16_row_shared_stride:
- case Intrinsic::nvvm_wmma_store_d_f16_col_global:
- case Intrinsic::nvvm_wmma_store_d_f16_row_global:
- case Intrinsic::nvvm_wmma_store_d_f16_col_global_stride:
- case Intrinsic::nvvm_wmma_store_d_f16_row_global_stride: {
- Info.opc = ISD::INTRINSIC_W_CHAIN;
+ case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
+ case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
+ case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
+ case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
+ case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
+ case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
+ case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
+ Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = MVT::v4f16;
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
@@ -3426,19 +3405,19 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
return true;
}
- case Intrinsic::nvvm_wmma_store_d_f32_col:
- case Intrinsic::nvvm_wmma_store_d_f32_row:
- case Intrinsic::nvvm_wmma_store_d_f32_col_stride:
- case Intrinsic::nvvm_wmma_store_d_f32_row_stride:
- case Intrinsic::nvvm_wmma_store_d_f32_col_shared:
- case Intrinsic::nvvm_wmma_store_d_f32_row_shared:
- case Intrinsic::nvvm_wmma_store_d_f32_col_shared_stride:
- case Intrinsic::nvvm_wmma_store_d_f32_row_shared_stride:
- case Intrinsic::nvvm_wmma_store_d_f32_col_global:
- case Intrinsic::nvvm_wmma_store_d_f32_row_global:
- case Intrinsic::nvvm_wmma_store_d_f32_col_global_stride:
- case Intrinsic::nvvm_wmma_store_d_f32_row_global_stride: {
- Info.opc = ISD::INTRINSIC_W_CHAIN;
+ case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
+ case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
+ case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
+ case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
+ case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
+ case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
+ case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
+ case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride: {
+ Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = MVT::v8f32;
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
@@ -4756,31 +4735,8 @@ void NVPTXTargetLowering::ReplaceNodeResults(
}
}
-// Pin NVPTXSection's and NVPTXTargetObjectFile's vtables to this file.
-void NVPTXSection::anchor() {}
-
-NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {
- delete static_cast<NVPTXSection *>(TextSection);
- delete static_cast<NVPTXSection *>(DataSection);
- delete static_cast<NVPTXSection *>(BSSSection);
- delete static_cast<NVPTXSection *>(ReadOnlySection);
-
- delete static_cast<NVPTXSection *>(StaticCtorSection);
- delete static_cast<NVPTXSection *>(StaticDtorSection);
- delete static_cast<NVPTXSection *>(LSDASection);
- delete static_cast<NVPTXSection *>(EHFrameSection);
- delete static_cast<NVPTXSection *>(DwarfAbbrevSection);
- delete static_cast<NVPTXSection *>(DwarfInfoSection);
- delete static_cast<NVPTXSection *>(DwarfLineSection);
- delete static_cast<NVPTXSection *>(DwarfFrameSection);
- delete static_cast<NVPTXSection *>(DwarfPubTypesSection);
- delete static_cast<const NVPTXSection *>(DwarfDebugInlineSection);
- delete static_cast<NVPTXSection *>(DwarfStrSection);
- delete static_cast<NVPTXSection *>(DwarfLocSection);
- delete static_cast<NVPTXSection *>(DwarfARangesSection);
- delete static_cast<NVPTXSection *>(DwarfRangesSection);
- delete static_cast<NVPTXSection *>(DwarfMacinfoSection);
-}
+// Pin NVPTXTargetObjectFile's vtables to this file.
+NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {}
MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index da563f0531d4..50815bff6c67 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -70,51 +70,6 @@ void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
.addReg(SrcReg, getKillRegState(KillSrc));
}
-bool NVPTXInstrInfo::isMoveInstr(const MachineInstr &MI, unsigned &SrcReg,
- unsigned &DestReg) const {
- // Look for the appropriate part of TSFlags
- bool isMove = false;
-
- unsigned TSFlags =
- (MI.getDesc().TSFlags & NVPTX::SimpleMoveMask) >> NVPTX::SimpleMoveShift;
- isMove = (TSFlags == 1);
-
- if (isMove) {
- MachineOperand dest = MI.getOperand(0);
- MachineOperand src = MI.getOperand(1);
- assert(dest.isReg() && "dest of a movrr is not a reg");
- assert(src.isReg() && "src of a movrr is not a reg");
-
- SrcReg = src.getReg();
- DestReg = dest.getReg();
- return true;
- }
-
- return false;
-}
-
-bool NVPTXInstrInfo::isLoadInstr(const MachineInstr &MI,
- unsigned &AddrSpace) const {
- bool isLoad = false;
- unsigned TSFlags =
- (MI.getDesc().TSFlags & NVPTX::isLoadMask) >> NVPTX::isLoadShift;
- isLoad = (TSFlags == 1);
- if (isLoad)
- AddrSpace = getLdStCodeAddrSpace(MI);
- return isLoad;
-}
-
-bool NVPTXInstrInfo::isStoreInstr(const MachineInstr &MI,
- unsigned &AddrSpace) const {
- bool isStore = false;
- unsigned TSFlags =
- (MI.getDesc().TSFlags & NVPTX::isStoreMask) >> NVPTX::isStoreShift;
- isStore = (TSFlags == 1);
- if (isStore)
- AddrSpace = getLdStCodeAddrSpace(MI);
- return isStore;
-}
-
/// AnalyzeBranch - Analyze the branching code at the end of MBB, returning
/// true if it cannot be understood (e.g. it's a switch dispatch or isn't
/// implemented for a target). Upon success, this returns false and returns
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.h b/lib/Target/NVPTX/NVPTXInstrInfo.h
index 18ba7684ae51..4ab1bb481958 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.h
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.h
@@ -52,10 +52,6 @@ public:
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
- virtual bool isMoveInstr(const MachineInstr &MI, unsigned &SrcReg,
- unsigned &DestReg) const;
- bool isLoadInstr(const MachineInstr &MI, unsigned &AddrSpace) const;
- bool isStoreInstr(const MachineInstr &MI, unsigned &AddrSpace) const;
// Branch analysis.
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
@@ -68,10 +64,6 @@ public:
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
const DebugLoc &DL,
int *BytesAdded = nullptr) const override;
- unsigned getLdStCodeAddrSpace(const MachineInstr &MI) const {
- return MI.getOperand(2).getImm();
- }
-
};
} // namespace llvm
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td
index 92152a64e525..443b077184c7 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -111,28 +111,14 @@ def VecElement : Operand<i32> {
//===----------------------------------------------------------------------===//
-def hasAtomRedG32 : Predicate<"Subtarget->hasAtomRedG32()">;
-def hasAtomRedS32 : Predicate<"Subtarget->hasAtomRedS32()">;
-def hasAtomRedGen32 : Predicate<"Subtarget->hasAtomRedGen32()">;
-def useAtomRedG32forGen32 :
- Predicate<"!Subtarget->hasAtomRedGen32() && Subtarget->hasAtomRedG32()">;
-def hasBrkPt : Predicate<"Subtarget->hasBrkPt()">;
-def hasAtomRedG64 : Predicate<"Subtarget->hasAtomRedG64()">;
-def hasAtomRedS64 : Predicate<"Subtarget->hasAtomRedS64()">;
-def hasAtomRedGen64 : Predicate<"Subtarget->hasAtomRedGen64()">;
-def useAtomRedG64forGen64 :
- Predicate<"!Subtarget->hasAtomRedGen64() && Subtarget->hasAtomRedG64()">;
-def hasAtomAddF32 : Predicate<"Subtarget->hasAtomAddF32()">;
def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;
def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
def hasVote : Predicate<"Subtarget->hasVote()">;
def hasDouble : Predicate<"Subtarget->hasDouble()">;
-def reqPTX20 : Predicate<"Subtarget->reqPTX20()">;
def hasLDG : Predicate<"Subtarget->hasLDG()">;
def hasLDU : Predicate<"Subtarget->hasLDU()">;
-def hasGenericLdSt : Predicate<"Subtarget->hasGenericLdSt()">;
def doF32FTZ : Predicate<"useF32FTZ()">;
def doNoF32FTZ : Predicate<"!useF32FTZ()">;
@@ -156,10 +142,12 @@ def true : Predicate<"true">;
def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">;
def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">;
+def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">;
def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">;
def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">;
+def useShortPtr : Predicate<"useShortPointers()">;
def useFP16Math: Predicate<"Subtarget->allowFP16Math()">;
//===----------------------------------------------------------------------===//
@@ -961,13 +949,12 @@ def FDIV321r_prec_ftz :
(ins f32imm:$a, Float32Regs:$b),
"rcp.rn.ftz.f32 \t$dst, $b;",
[(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
- Requires<[reqPTX20, doF32FTZ]>;
+ Requires<[doF32FTZ]>;
def FDIV321r_prec :
NVPTXInst<(outs Float32Regs:$dst),
(ins f32imm:$a, Float32Regs:$b),
"rcp.rn.f32 \t$dst, $b;",
- [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
- Requires<[reqPTX20]>;
+ [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>;
//
// F32 Accurate division
//
@@ -976,25 +963,23 @@ def FDIV32rr_prec_ftz :
(ins Float32Regs:$a, Float32Regs:$b),
"div.rn.ftz.f32 \t$dst, $a, $b;",
[(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[doF32FTZ, reqPTX20]>;
+ Requires<[doF32FTZ]>;
def FDIV32ri_prec_ftz :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b),
"div.rn.ftz.f32 \t$dst, $a, $b;",
[(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
- Requires<[doF32FTZ, reqPTX20]>;
+ Requires<[doF32FTZ]>;
def FDIV32rr_prec :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b),
"div.rn.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[reqPTX20]>;
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>;
def FDIV32ri_prec :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b),
"div.rn.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
- Requires<[reqPTX20]>;
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>;
//
// FMA
@@ -1544,6 +1529,7 @@ def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex],
[SDNPWantRoot]>;
def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex],
[SDNPWantRoot]>;
+def ADDRvar : ComplexPattern<iPTR, 1, "SelectDirectAddr", [], []>;
def MEMri : Operand<i32> {
let PrintMethod = "printMemOperand";
diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td
index c932758bd0ae..47dcdcf6e0bd 100644
--- a/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -277,26 +277,22 @@ multiclass MATCH_ALLP_SYNC<NVPTXRegClass regclass, string ptxtype, Intrinsic Int
def ii : NVPTXInst<(outs regclass:$dest, Int1Regs:$pred),
(ins i32imm:$mask, ImmOp:$value),
"match.all.sync." # ptxtype # " \t$dest|$pred, $value, $mask;",
- // If would be nice if tablegen could match multiple return values,
- // but it does not seem to be the case. Thus we have an empty pattern and
- // lower intrinsic to instruction manually.
- // [(set regclass:$dest, Int1Regs:$pred, (IntOp imm:$value, imm:$mask))]>,
- []>,
+ [(set regclass:$dest, Int1Regs:$pred, (IntOp imm:$mask, imm:$value))]>,
Requires<[hasPTX60, hasSM70]>;
def ir : NVPTXInst<(outs regclass:$dest, Int1Regs:$pred),
(ins Int32Regs:$mask, ImmOp:$value),
"match.all.sync." # ptxtype # " \t$dest|$pred, $value, $mask;",
- []>,
+ [(set regclass:$dest, Int1Regs:$pred, (IntOp Int32Regs:$mask, imm:$value))]>,
Requires<[hasPTX60, hasSM70]>;
def ri : NVPTXInst<(outs regclass:$dest, Int1Regs:$pred),
(ins i32imm:$mask, regclass:$value),
"match.all.sync." # ptxtype # " \t$dest|$pred, $value, $mask;",
- []>,
+ [(set regclass:$dest, Int1Regs:$pred, (IntOp imm:$mask, regclass:$value))]>,
Requires<[hasPTX60, hasSM70]>;
def rr : NVPTXInst<(outs regclass:$dest, Int1Regs:$pred),
(ins Int32Regs:$mask, regclass:$value),
"match.all.sync." # ptxtype # " \t$dest|$pred, $value, $mask;",
- []>,
+ [(set regclass:$dest, Int1Regs:$pred, (IntOp Int32Regs:$mask, regclass:$value))]>,
Requires<[hasPTX60, hasSM70]>;
}
defm MATCH_ALLP_SYNC_32 : MATCH_ALLP_SYNC<Int32Regs, "b32", int_nvvm_match_all_sync_i32p,
@@ -1025,18 +1021,19 @@ class ATOMIC_GENERIC_CHK <dag ops, dag frag>
multiclass F_ATOMIC_2_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
- Operand IMMType, SDNode IMM, Predicate Pred> {
+ Operand IMMType, SDNode IMM, list<Predicate> Pred> {
def reg : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b),
!strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b;"),
[(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b))]>,
- Requires<[Pred]>;
+ Requires<Pred>;
def imm : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, IMMType:$b),
!strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b;", ""),
[(set regclass:$dst, (IntOp ptrclass:$addr, IMM:$b))]>,
- Requires<[Pred]>;
+ Requires<Pred>;
}
multiclass F_ATOMIC_2<NVPTXRegClass regclass, string SpaceStr, string TypeStr,
- string OpcStr, PatFrag IntOp, Operand IMMType, SDNode IMM, Predicate Pred> {
+ string OpcStr, PatFrag IntOp, Operand IMMType, SDNode IMM,
+ list<Predicate> Pred = []> {
defm p32 : F_ATOMIC_2_imp<Int32Regs, regclass, SpaceStr, TypeStr, OpcStr,
IntOp, IMMType, IMM, Pred>;
defm p64 : F_ATOMIC_2_imp<Int64Regs, regclass, SpaceStr, TypeStr, OpcStr,
@@ -1046,7 +1043,7 @@ multiclass F_ATOMIC_2<NVPTXRegClass regclass, string SpaceStr, string TypeStr,
// has 2 operands, neg the second one
multiclass F_ATOMIC_2_NEG_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
- Operand IMMType, Predicate Pred> {
+ Operand IMMType, list<Predicate> Pred> {
def reg : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b),
!strconcat(
"{{ \n\t",
@@ -1055,11 +1052,11 @@ multiclass F_ATOMIC_2_NEG_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
"atom", SpaceStr, OpcStr, ".u", TypeStr, " \t$dst, [$addr], temp; \n\t",
"}}"),
[(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b))]>,
- Requires<[Pred]>;
+ Requires<Pred>;
}
multiclass F_ATOMIC_2_NEG<NVPTXRegClass regclass, string SpaceStr,
string TypeStr, string OpcStr, PatFrag IntOp, Operand IMMType,
- Predicate Pred> {
+ list<Predicate> Pred = []> {
defm p32: F_ATOMIC_2_NEG_imp<Int32Regs, regclass, SpaceStr, TypeStr, OpcStr,
IntOp, IMMType, Pred> ;
defm p64: F_ATOMIC_2_NEG_imp<Int64Regs, regclass, SpaceStr, TypeStr, OpcStr,
@@ -1069,33 +1066,33 @@ multiclass F_ATOMIC_2_NEG<NVPTXRegClass regclass, string SpaceStr,
// has 3 operands
multiclass F_ATOMIC_3_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
- Operand IMMType, Predicate Pred> {
+ Operand IMMType, list<Predicate> Pred> {
def reg : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, regclass:$b, regclass:$c),
!strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
[(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b, regclass:$c))]>,
- Requires<[Pred]>;
+ Requires<Pred>;
def imm1 : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, IMMType:$b, regclass:$c),
!strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
[(set regclass:$dst, (IntOp ptrclass:$addr, imm:$b, regclass:$c))]>,
- Requires<[Pred]>;
+ Requires<Pred>;
def imm2 : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, regclass:$b, IMMType:$c),
!strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;", ""),
[(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b, imm:$c))]>,
- Requires<[Pred]>;
+ Requires<Pred>;
def imm3 : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, IMMType:$b, IMMType:$c),
!strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
[(set regclass:$dst, (IntOp ptrclass:$addr, imm:$b, imm:$c))]>,
- Requires<[Pred]>;
+ Requires<Pred>;
}
multiclass F_ATOMIC_3<NVPTXRegClass regclass, string SpaceStr, string TypeStr,
- string OpcStr, PatFrag IntOp, Operand IMMType, Predicate Pred> {
+ string OpcStr, PatFrag IntOp, Operand IMMType, list<Predicate> Pred = []> {
defm p32 : F_ATOMIC_3_imp<Int32Regs, regclass, SpaceStr, TypeStr, OpcStr,
IntOp, IMMType, Pred>;
defm p64 : F_ATOMIC_3_imp<Int64Regs, regclass, SpaceStr, TypeStr, OpcStr,
@@ -1130,36 +1127,36 @@ def atomic_load_add_f64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(int_nvvm_atomic_load_add_f64 node:$a, node:$b)>;
defm INT_PTX_ATOM_ADD_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".add",
- atomic_load_add_32_g, i32imm, imm, hasAtomRedG32>;
+ atomic_load_add_32_g, i32imm, imm>;
defm INT_PTX_ATOM_ADD_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".add",
- atomic_load_add_32_s, i32imm, imm, hasAtomRedS32>;
+ atomic_load_add_32_s, i32imm, imm>;
defm INT_PTX_ATOM_ADD_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".add",
- atomic_load_add_32_gen, i32imm, imm, hasAtomRedGen32>;
+ atomic_load_add_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_ADD_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32",
- ".add", atomic_load_add_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+ ".add", atomic_load_add_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_ADD_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".u64", ".add",
- atomic_load_add_64_g, i64imm, imm, hasAtomRedG64>;
+ atomic_load_add_64_g, i64imm, imm>;
defm INT_PTX_ATOM_ADD_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".u64", ".add",
- atomic_load_add_64_s, i64imm, imm, hasAtomRedS64>;
+ atomic_load_add_64_s, i64imm, imm>;
defm INT_PTX_ATOM_ADD_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".u64", ".add",
- atomic_load_add_64_gen, i64imm, imm, hasAtomRedGen64>;
+ atomic_load_add_64_gen, i64imm, imm>;
defm INT_PTX_ATOM_ADD_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".u64",
- ".add", atomic_load_add_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+ ".add", atomic_load_add_64_gen, i64imm, imm>;
defm INT_PTX_ATOM_ADD_G_F32 : F_ATOMIC_2<Float32Regs, ".global", ".f32", ".add",
- atomic_load_add_f32_g, f32imm, fpimm, hasAtomAddF32>;
+ atomic_load_add_f32_g, f32imm, fpimm>;
defm INT_PTX_ATOM_ADD_S_F32 : F_ATOMIC_2<Float32Regs, ".shared", ".f32", ".add",
- atomic_load_add_f32_s, f32imm, fpimm, hasAtomAddF32>;
+ atomic_load_add_f32_s, f32imm, fpimm>;
defm INT_PTX_ATOM_ADD_GEN_F32 : F_ATOMIC_2<Float32Regs, "", ".f32", ".add",
- atomic_load_add_f32_gen, f32imm, fpimm, hasAtomAddF32>;
+ atomic_load_add_f32_gen, f32imm, fpimm>;
defm INT_PTX_ATOM_ADD_G_F64 : F_ATOMIC_2<Float64Regs, ".global", ".f64", ".add",
- atomic_load_add_f64_g, f64imm, fpimm, hasAtomAddF64>;
+ atomic_load_add_f64_g, f64imm, fpimm, [hasAtomAddF64]>;
defm INT_PTX_ATOM_ADD_S_F64 : F_ATOMIC_2<Float64Regs, ".shared", ".f64", ".add",
- atomic_load_add_f64_s, f64imm, fpimm, hasAtomAddF64>;
+ atomic_load_add_f64_s, f64imm, fpimm, [hasAtomAddF64]>;
defm INT_PTX_ATOM_ADD_GEN_F64 : F_ATOMIC_2<Float64Regs, "", ".f64", ".add",
- atomic_load_add_f64_gen, f64imm, fpimm, hasAtomAddF64>;
+ atomic_load_add_f64_gen, f64imm, fpimm, [hasAtomAddF64]>;
// atom_sub
@@ -1177,21 +1174,21 @@ def atomic_load_sub_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(atomic_load_sub_64 node:$a, node:$b)>;
defm INT_PTX_ATOM_SUB_G_32 : F_ATOMIC_2_NEG<Int32Regs, ".global", "32", ".add",
- atomic_load_sub_32_g, i32imm, hasAtomRedG32>;
+ atomic_load_sub_32_g, i32imm>;
defm INT_PTX_ATOM_SUB_G_64 : F_ATOMIC_2_NEG<Int64Regs, ".global", "64", ".add",
- atomic_load_sub_64_g, i64imm, hasAtomRedG64>;
+ atomic_load_sub_64_g, i64imm>;
defm INT_PTX_ATOM_SUB_GEN_32 : F_ATOMIC_2_NEG<Int32Regs, "", "32", ".add",
- atomic_load_sub_32_gen, i32imm, hasAtomRedGen32>;
+ atomic_load_sub_32_gen, i32imm>;
defm INT_PTX_ATOM_SUB_GEN_32_USE_G : F_ATOMIC_2_NEG<Int32Regs, ".global", "32",
- ".add", atomic_load_sub_32_gen, i32imm, useAtomRedG32forGen32>;
+ ".add", atomic_load_sub_32_gen, i32imm>;
defm INT_PTX_ATOM_SUB_S_32 : F_ATOMIC_2_NEG<Int32Regs, ".shared", "32", ".add",
- atomic_load_sub_32_s, i32imm, hasAtomRedS32>;
+ atomic_load_sub_32_s, i32imm>;
defm INT_PTX_ATOM_SUB_S_64 : F_ATOMIC_2_NEG<Int64Regs, ".shared", "64", ".add",
- atomic_load_sub_64_s, i64imm, hasAtomRedS64>;
+ atomic_load_sub_64_s, i64imm>;
defm INT_PTX_ATOM_SUB_GEN_64 : F_ATOMIC_2_NEG<Int64Regs, "", "64", ".add",
- atomic_load_sub_64_gen, i64imm, hasAtomRedGen64>;
+ atomic_load_sub_64_gen, i64imm>;
defm INT_PTX_ATOM_SUB_GEN_64_USE_G : F_ATOMIC_2_NEG<Int64Regs, ".global", "64",
- ".add", atomic_load_sub_64_gen, i64imm, useAtomRedG64forGen64>;
+ ".add", atomic_load_sub_64_gen, i64imm>;
// atom_swap
@@ -1209,21 +1206,21 @@ def atomic_swap_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(atomic_swap_64 node:$a, node:$b)>;
defm INT_PTX_ATOM_SWAP_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".exch",
- atomic_swap_32_g, i32imm, imm, hasAtomRedG32>;
+ atomic_swap_32_g, i32imm, imm>;
defm INT_PTX_ATOM_SWAP_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".exch",
- atomic_swap_32_s, i32imm, imm, hasAtomRedS32>;
+ atomic_swap_32_s, i32imm, imm>;
defm INT_PTX_ATOM_SWAP_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".exch",
- atomic_swap_32_gen, i32imm, imm, hasAtomRedGen32>;
+ atomic_swap_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_SWAP_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
- ".exch", atomic_swap_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+ ".exch", atomic_swap_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_SWAP_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".exch",
- atomic_swap_64_g, i64imm, imm, hasAtomRedG64>;
+ atomic_swap_64_g, i64imm, imm>;
defm INT_PTX_ATOM_SWAP_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".exch",
- atomic_swap_64_s, i64imm, imm, hasAtomRedS64>;
+ atomic_swap_64_s, i64imm, imm>;
defm INT_PTX_ATOM_SWAP_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".exch",
- atomic_swap_64_gen, i64imm, imm, hasAtomRedGen64>;
+ atomic_swap_64_gen, i64imm, imm>;
defm INT_PTX_ATOM_SWAP_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64",
- ".exch", atomic_swap_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+ ".exch", atomic_swap_64_gen, i64imm, imm>;
// atom_max
@@ -1253,37 +1250,37 @@ def atomic_load_umax_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(atomic_load_umax_64 node:$a, node:$b)>;
defm INT_PTX_ATOM_LOAD_MAX_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".s32",
- ".max", atomic_load_max_32_g, i32imm, imm, hasAtomRedG32>;
+ ".max", atomic_load_max_32_g, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_MAX_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".s32",
- ".max", atomic_load_max_32_s, i32imm, imm, hasAtomRedS32>;
+ ".max", atomic_load_max_32_s, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_MAX_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".s32", ".max",
- atomic_load_max_32_gen, i32imm, imm, hasAtomRedGen32>;
+ atomic_load_max_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_MAX_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
- ".s32", ".max", atomic_load_max_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+ ".s32", ".max", atomic_load_max_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_MAX_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".s64",
- ".max", atomic_load_max_64_g, i64imm, imm, hasAtomRedG64>;
+ ".max", atomic_load_max_64_g, i64imm, imm>;
defm INT_PTX_ATOM_LOAD_MAX_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".s64",
- ".max", atomic_load_max_64_s, i64imm, imm, hasAtomRedS64>;
+ ".max", atomic_load_max_64_s, i64imm, imm>;
defm INT_PTX_ATOM_LOAD_MAX_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".s64", ".max",
- atomic_load_max_64_gen, i64imm, imm, hasAtomRedGen64>;
+ atomic_load_max_64_gen, i64imm, imm>;
defm INT_PTX_ATOM_LOAD_MAX_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global",
- ".s64", ".max", atomic_load_max_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+ ".s64", ".max", atomic_load_max_64_gen, i64imm, imm>;
defm INT_PTX_ATOM_LOAD_UMAX_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32",
- ".max", atomic_load_umax_32_g, i32imm, imm, hasAtomRedG32>;
+ ".max", atomic_load_umax_32_g, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_UMAX_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32",
- ".max", atomic_load_umax_32_s, i32imm, imm, hasAtomRedS32>;
+ ".max", atomic_load_umax_32_s, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_UMAX_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".max",
- atomic_load_umax_32_gen, i32imm, imm, hasAtomRedGen32>;
+ atomic_load_umax_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_UMAX_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
- ".u32", ".max", atomic_load_umax_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+ ".u32", ".max", atomic_load_umax_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_UMAX_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".u64",
- ".max", atomic_load_umax_64_g, i64imm, imm, hasAtomRedG64>;
+ ".max", atomic_load_umax_64_g, i64imm, imm>;
defm INT_PTX_ATOM_LOAD_UMAX_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".u64",
- ".max", atomic_load_umax_64_s, i64imm, imm, hasAtomRedS64>;
+ ".max", atomic_load_umax_64_s, i64imm, imm>;
defm INT_PTX_ATOM_LOAD_UMAX_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".u64", ".max",
- atomic_load_umax_64_gen, i64imm, imm, hasAtomRedGen64>;
+ atomic_load_umax_64_gen, i64imm, imm>;
defm INT_PTX_ATOM_LOAD_UMAX_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global",
- ".u64", ".max", atomic_load_umax_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+ ".u64", ".max", atomic_load_umax_64_gen, i64imm, imm>;
// atom_min
@@ -1313,37 +1310,37 @@ def atomic_load_umin_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(atomic_load_umin_64 node:$a, node:$b)>;
defm INT_PTX_ATOM_LOAD_MIN_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".s32",
- ".min", atomic_load_min_32_g, i32imm, imm, hasAtomRedG32>;
+ ".min", atomic_load_min_32_g, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_MIN_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".s32",
- ".min", atomic_load_min_32_s, i32imm, imm, hasAtomRedS32>;
+ ".min", atomic_load_min_32_s, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_MIN_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".s32", ".min",
- atomic_load_min_32_gen, i32imm, imm, hasAtomRedGen32>;
+ atomic_load_min_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_MIN_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
- ".s32", ".min", atomic_load_min_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+ ".s32", ".min", atomic_load_min_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_MIN_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".s64",
- ".min", atomic_load_min_64_g, i64imm, imm, hasAtomRedG64>;
+ ".min", atomic_load_min_64_g, i64imm, imm>;
defm INT_PTX_ATOM_LOAD_MIN_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".s64",
- ".min", atomic_load_min_64_s, i64imm, imm, hasAtomRedS64>;
+ ".min", atomic_load_min_64_s, i64imm, imm>;
defm INT_PTX_ATOM_LOAD_MIN_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".s64", ".min",
- atomic_load_min_64_gen, i64imm, imm, hasAtomRedGen64>;
+ atomic_load_min_64_gen, i64imm, imm>;
defm INT_PTX_ATOM_LOAD_MIN_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global",
- ".s64", ".min", atomic_load_min_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+ ".s64", ".min", atomic_load_min_64_gen, i64imm, imm>;
defm INT_PTX_ATOM_LOAD_UMIN_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32",
- ".min", atomic_load_umin_32_g, i32imm, imm, hasAtomRedG32>;
+ ".min", atomic_load_umin_32_g, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_UMIN_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32",
- ".min", atomic_load_umin_32_s, i32imm, imm, hasAtomRedS32>;
+ ".min", atomic_load_umin_32_s, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_UMIN_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".min",
- atomic_load_umin_32_gen, i32imm, imm, hasAtomRedGen32>;
+ atomic_load_umin_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_UMIN_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
- ".u32", ".min", atomic_load_umin_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+ ".u32", ".min", atomic_load_umin_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_UMIN_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".u64",
- ".min", atomic_load_umin_64_g, i64imm, imm, hasAtomRedG64>;
+ ".min", atomic_load_umin_64_g, i64imm, imm>;
defm INT_PTX_ATOM_LOAD_UMIN_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".u64",
- ".min", atomic_load_umin_64_s, i64imm, imm, hasAtomRedS64>;
+ ".min", atomic_load_umin_64_s, i64imm, imm>;
defm INT_PTX_ATOM_LOAD_UMIN_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".u64", ".min",
- atomic_load_umin_64_gen, i64imm, imm, hasAtomRedGen64>;
+ atomic_load_umin_64_gen, i64imm, imm>;
defm INT_PTX_ATOM_LOAD_UMIN_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global",
- ".u64", ".min", atomic_load_umin_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+ ".u64", ".min", atomic_load_umin_64_gen, i64imm, imm>;
// atom_inc atom_dec
@@ -1361,21 +1358,21 @@ def atomic_load_dec_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(int_nvvm_atomic_load_dec_32 node:$a, node:$b)>;
defm INT_PTX_ATOM_INC_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".inc",
- atomic_load_inc_32_g, i32imm, imm, hasAtomRedG32>;
+ atomic_load_inc_32_g, i32imm, imm>;
defm INT_PTX_ATOM_INC_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".inc",
- atomic_load_inc_32_s, i32imm, imm, hasAtomRedS32>;
+ atomic_load_inc_32_s, i32imm, imm>;
defm INT_PTX_ATOM_INC_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".inc",
- atomic_load_inc_32_gen, i32imm, imm, hasAtomRedGen32>;
+ atomic_load_inc_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_INC_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32",
- ".inc", atomic_load_inc_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+ ".inc", atomic_load_inc_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_DEC_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".dec",
- atomic_load_dec_32_g, i32imm, imm, hasAtomRedG32>;
+ atomic_load_dec_32_g, i32imm, imm>;
defm INT_PTX_ATOM_DEC_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".dec",
- atomic_load_dec_32_s, i32imm, imm, hasAtomRedS32>;
+ atomic_load_dec_32_s, i32imm, imm>;
defm INT_PTX_ATOM_DEC_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".dec",
- atomic_load_dec_32_gen, i32imm, imm, hasAtomRedGen32>;
+ atomic_load_dec_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_DEC_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32",
- ".dec", atomic_load_dec_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+ ".dec", atomic_load_dec_32_gen, i32imm, imm>;
// atom_and
@@ -1393,21 +1390,21 @@ def atomic_load_and_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(atomic_load_and_64 node:$a, node:$b)>;
defm INT_PTX_ATOM_AND_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".and",
- atomic_load_and_32_g, i32imm, imm, hasAtomRedG32>;
+ atomic_load_and_32_g, i32imm, imm>;
defm INT_PTX_ATOM_AND_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".and",
- atomic_load_and_32_s, i32imm, imm, hasAtomRedS32>;
+ atomic_load_and_32_s, i32imm, imm>;
defm INT_PTX_ATOM_AND_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".and",
- atomic_load_and_32_gen, i32imm, imm, hasAtomRedGen32>;
+ atomic_load_and_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_AND_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
- ".and", atomic_load_and_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+ ".and", atomic_load_and_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_AND_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".and",
- atomic_load_and_64_g, i64imm, imm, hasAtomRedG64>;
+ atomic_load_and_64_g, i64imm, imm>;
defm INT_PTX_ATOM_AND_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".and",
- atomic_load_and_64_s, i64imm, imm, hasAtomRedS64>;
+ atomic_load_and_64_s, i64imm, imm>;
defm INT_PTX_ATOM_AND_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".and",
- atomic_load_and_64_gen, i64imm, imm, hasAtomRedGen64>;
+ atomic_load_and_64_gen, i64imm, imm>;
defm INT_PTX_ATOM_AND_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64",
- ".and", atomic_load_and_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+ ".and", atomic_load_and_64_gen, i64imm, imm>;
// atom_or
@@ -1425,21 +1422,21 @@ def atomic_load_or_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(atomic_load_or_64 node:$a, node:$b)>;
defm INT_PTX_ATOM_OR_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".or",
- atomic_load_or_32_g, i32imm, imm, hasAtomRedG32>;
+ atomic_load_or_32_g, i32imm, imm>;
defm INT_PTX_ATOM_OR_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".or",
- atomic_load_or_32_gen, i32imm, imm, hasAtomRedGen32>;
+ atomic_load_or_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_OR_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
- ".or", atomic_load_or_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+ ".or", atomic_load_or_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_OR_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".or",
- atomic_load_or_32_s, i32imm, imm, hasAtomRedS32>;
+ atomic_load_or_32_s, i32imm, imm>;
defm INT_PTX_ATOM_OR_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".or",
- atomic_load_or_64_g, i64imm, imm, hasAtomRedG64>;
+ atomic_load_or_64_g, i64imm, imm>;
defm INT_PTX_ATOM_OR_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".or",
- atomic_load_or_64_gen, i64imm, imm, hasAtomRedGen64>;
+ atomic_load_or_64_gen, i64imm, imm>;
defm INT_PTX_ATOM_OR_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64",
- ".or", atomic_load_or_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+ ".or", atomic_load_or_64_gen, i64imm, imm>;
defm INT_PTX_ATOM_OR_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".or",
- atomic_load_or_64_s, i64imm, imm, hasAtomRedS64>;
+ atomic_load_or_64_s, i64imm, imm>;
// atom_xor
@@ -1457,21 +1454,21 @@ def atomic_load_xor_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(atomic_load_xor_64 node:$a, node:$b)>;
defm INT_PTX_ATOM_XOR_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".xor",
- atomic_load_xor_32_g, i32imm, imm, hasAtomRedG32>;
+ atomic_load_xor_32_g, i32imm, imm>;
defm INT_PTX_ATOM_XOR_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".xor",
- atomic_load_xor_32_s, i32imm, imm, hasAtomRedS32>;
+ atomic_load_xor_32_s, i32imm, imm>;
defm INT_PTX_ATOM_XOR_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".xor",
- atomic_load_xor_32_gen, i32imm, imm, hasAtomRedGen32>;
+ atomic_load_xor_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_XOR_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
- ".xor", atomic_load_xor_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+ ".xor", atomic_load_xor_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_XOR_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".xor",
- atomic_load_xor_64_g, i64imm, imm, hasAtomRedG64>;
+ atomic_load_xor_64_g, i64imm, imm>;
defm INT_PTX_ATOM_XOR_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".xor",
- atomic_load_xor_64_s, i64imm, imm, hasAtomRedS64>;
+ atomic_load_xor_64_s, i64imm, imm>;
defm INT_PTX_ATOM_XOR_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".xor",
- atomic_load_xor_64_gen, i64imm, imm, hasAtomRedGen64>;
+ atomic_load_xor_64_gen, i64imm, imm>;
defm INT_PTX_ATOM_XOR_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64",
- ".xor", atomic_load_xor_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+ ".xor", atomic_load_xor_64_gen, i64imm, imm>;
// atom_cas
@@ -1489,21 +1486,21 @@ def atomic_cmp_swap_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
(atomic_cmp_swap_64 node:$a, node:$b, node:$c)>;
defm INT_PTX_ATOM_CAS_G_32 : F_ATOMIC_3<Int32Regs, ".global", ".b32", ".cas",
- atomic_cmp_swap_32_g, i32imm, hasAtomRedG32>;
+ atomic_cmp_swap_32_g, i32imm>;
defm INT_PTX_ATOM_CAS_S_32 : F_ATOMIC_3<Int32Regs, ".shared", ".b32", ".cas",
- atomic_cmp_swap_32_s, i32imm, hasAtomRedS32>;
+ atomic_cmp_swap_32_s, i32imm>;
defm INT_PTX_ATOM_CAS_GEN_32 : F_ATOMIC_3<Int32Regs, "", ".b32", ".cas",
- atomic_cmp_swap_32_gen, i32imm, hasAtomRedGen32>;
+ atomic_cmp_swap_32_gen, i32imm>;
defm INT_PTX_ATOM_CAS_GEN_32_USE_G : F_ATOMIC_3<Int32Regs, ".global", ".b32",
- ".cas", atomic_cmp_swap_32_gen, i32imm, useAtomRedG32forGen32>;
+ ".cas", atomic_cmp_swap_32_gen, i32imm>;
defm INT_PTX_ATOM_CAS_G_64 : F_ATOMIC_3<Int64Regs, ".global", ".b64", ".cas",
- atomic_cmp_swap_64_g, i64imm, hasAtomRedG64>;
+ atomic_cmp_swap_64_g, i64imm>;
defm INT_PTX_ATOM_CAS_S_64 : F_ATOMIC_3<Int64Regs, ".shared", ".b64", ".cas",
- atomic_cmp_swap_64_s, i64imm, hasAtomRedS64>;
+ atomic_cmp_swap_64_s, i64imm>;
defm INT_PTX_ATOM_CAS_GEN_64 : F_ATOMIC_3<Int64Regs, "", ".b64", ".cas",
- atomic_cmp_swap_64_gen, i64imm, hasAtomRedGen64>;
+ atomic_cmp_swap_64_gen, i64imm>;
defm INT_PTX_ATOM_CAS_GEN_64_USE_G : F_ATOMIC_3<Int64Regs, ".global", ".b64",
- ".cas", atomic_cmp_swap_64_gen, i64imm, useAtomRedG64forGen64>;
+ ".cas", atomic_cmp_swap_64_gen, i64imm>;
// Support for scoped atomic operations. Matches
// int_nvvm_atomic_{op}_{space}_{type}_{scope}
@@ -1654,7 +1651,7 @@ multiclass ATOM2_add_impl<string OpStr> {
defm _u32 : ATOM2S_impl<OpStr, "i", "u32", Int32Regs, i32imm, imm, i32, []>;
defm _u64 : ATOM2S_impl<OpStr, "i", "u64", Int64Regs, i64imm, imm, i64, []>;
defm _f32 : ATOM2S_impl<OpStr, "f", "f32", Float32Regs, f32imm, fpimm, f32,
- [hasAtomAddF32]>;
+ []>;
defm _f64 : ATOM2S_impl<OpStr, "f", "f64", Float64Regs, f64imm, fpimm, f64,
[hasAtomAddF64]>;
}
@@ -1936,56 +1933,31 @@ defm INT_PTX_LDG_G_v4f32_ELE
multiclass NG_TO_G<string Str, Intrinsic Intrin> {
def _yes : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
!strconcat("cvta.", Str, ".u32 \t$result, $src;"),
- [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>,
- Requires<[hasGenericLdSt]>;
+ [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>;
def _yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
!strconcat("cvta.", Str, ".u64 \t$result, $src;"),
- [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>,
- Requires<[hasGenericLdSt]>;
-
-// @TODO: Are these actually needed? I believe global addresses will be copied
-// to register values anyway.
- /*def __addr_yes : NVPTXInst<(outs Int32Regs:$result), (ins imemAny:$src),
- !strconcat("cvta.", !strconcat(Str, ".u32 \t$result, $src;")),
- [(set Int32Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>,
- Requires<[hasGenericLdSt]>;
- def __addr_yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins imemAny:$src),
- !strconcat("cvta.", !strconcat(Str, ".u64 \t$result, $src;")),
- [(set Int64Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>,
- Requires<[hasGenericLdSt]>;*/
-
- def _no : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
- "mov.u32 \t$result, $src;",
- [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>;
- def _no_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
- "mov.u64 \t$result, $src;",
[(set Int64Regs:$result, (Intrin Int64Regs:$src))]>;
-
-// @TODO: Are these actually needed? I believe global addresses will be copied
-// to register values anyway.
- /*def _addr_no : NVPTXInst<(outs Int32Regs:$result), (ins imem:$src),
- "mov.u32 \t$result, $src;",
- [(set Int32Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>;
- def _addr_no_64 : NVPTXInst<(outs Int64Regs:$result), (ins imem:$src),
- "mov.u64 \t$result, $src;",
- [(set Int64Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>;*/
+ def _yes_6432 : NVPTXInst<(outs Int64Regs:$result), (ins Int32Regs:$src),
+ "{{ .reg .b64 %tmp;\n\t"
+ #" cvt.u64.u32 \t%tmp, $src;\n\t"
+ #" cvta." # Str # ".u64 \t$result, %tmp; }}",
+ [(set Int64Regs:$result, (Intrin Int32Regs:$src))]>,
+ Requires<[useShortPtr]>;
}
multiclass G_TO_NG<string Str, Intrinsic Intrin> {
def _yes : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
!strconcat("cvta.to.", Str, ".u32 \t$result, $src;"),
- [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>,
- Requires<[hasGenericLdSt]>;
+ [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>;
def _yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
!strconcat("cvta.to.", Str, ".u64 \t$result, $src;"),
- [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>,
- Requires<[hasGenericLdSt]>;
- def _no : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
- "mov.u32 \t$result, $src;",
- [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>;
- def _no_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
- "mov.u64 \t$result, $src;",
[(set Int64Regs:$result, (Intrin Int64Regs:$src))]>;
+ def _yes_3264 : NVPTXInst<(outs Int32Regs:$result), (ins Int64Regs:$src),
+ "{{ .reg .b64 %tmp;\n\t"
+ #" cvta.to." # Str # ".u64 \t%tmp, $src;\n\t"
+ #" cvt.u32.u64 \t$result, %tmp; }}",
+ [(set Int32Regs:$result, (Intrin Int64Regs:$src))]>,
+ Requires<[useShortPtr]>;
}
defm cvta_local : NG_TO_G<"local", int_nvvm_ptr_local_to_gen>;
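Note on the _yes_6432 and _yes_3264 variants added above: they cover the configuration where const/local/shared pointers are 32-bit while generic pointers remain 64-bit (the useShortPtr predicate). The source is widened with cvt.u64.u32 before cvta, or the cvta.to result is narrowed with cvt.u32.u64. As a rough sketch of the expansion (the virtual register names below are illustrative, not taken from the patch), the _yes_6432 variant of cvta_local would print PTX along these lines:

  { .reg .b64 %tmp;
    cvt.u64.u32    %tmp, %r1;
    cvta.local.u64 %rd1, %tmp; }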
@@ -7412,204 +7384,380 @@ def INT_PTX_SREG_WARPSIZE :
//
// wmma.load.[a|b|c].sync.[row|col].m16n16k16[|.global|.shared].[f16|f32]
//
-class WMMA_LOAD_ALSTOS<string Abc, string Layout, string Space,
- string Type, NVPTXRegClass regclass,
- Operand SrcOp, int WithOffset, int WithStride>
- : NVPTXInst<!if(!eq(Abc#Type,"cf16"),
- (outs regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3),
- (outs regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3,
- regclass:$r4, regclass:$r5, regclass:$r6, regclass:$r7)),
- !if(WithStride,
- !if(WithOffset,
- (ins SrcOp:$src, i32imm:$offset, Int32Regs:$ldm),
- (ins SrcOp:$src, Int32Regs:$ldm)),
- !if(WithOffset,
- (ins SrcOp:$src, i32imm:$offset),
- (ins SrcOp:$src))),
- "wmma.load."#Abc#".sync."#Layout#".m16n16k16"#Space#"." #Type# " \t"
- #!if(!eq(Abc#Type,"cf16"),
- "{{$r0, $r1, $r2, $r3}}",
- "{{$r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7}}")
- #", "
- #!if(WithOffset,"[$src+$offset]", "[$src]")
- #!if(WithStride, ", $ldm", "")
- #";",
- []>,
- Requires<[hasPTX60, hasSM70]>;
-
-multiclass WMMA_LOAD_ALSTO<string Abc, string Layout, string Space,
- string Type, NVPTXRegClass regclass,
- Operand SrcOp, int WithOffset = 0> {
- def _stride: WMMA_LOAD_ALSTOS<Abc, Layout, Space, Type, regclass, SrcOp,
- WithOffset, 1>;
- def NAME: WMMA_LOAD_ALSTOS<Abc, Layout, Space, Type, regclass, SrcOp,
- WithOffset, 0>;
+
+class EmptyNVPTXInst : NVPTXInst<(outs), (ins), "?", []>;
+
+class WMMA_LOAD_GALSTOS<string Geometry, string Abc, string Layout,
+ string Space, string Type, NVPTXRegClass regclass,
+ DAGOperand SrcOp, bit WithStride>
+ : EmptyNVPTXInst,
+ Requires<[!if(!eq(Geometry, "m16n16k16"),
+ hasPTX60,
+ hasPTX61),
+ hasSM70]> {
+ // Pattern (created by WMMA_LOAD_INTR_HELPER below) that matches the intrinsic
+ // for this function.
+ PatFrag IntrMatcher = !cast<PatFrag>("INT_WMMA_"
+ # Geometry # "_load_"
+ # !subst("c", "c_" # Type, Abc)
+ # "_" # Layout
+ # !subst(".", "_", Space)
+ # !if(WithStride,"_stride", "")
+ # "_Intr");
+ dag OutsR03 = (outs regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3);
+ dag OutsR47 = (outs regclass:$r4, regclass:$r5, regclass:$r6, regclass:$r7);
+ dag Outs = !if(!eq(Abc#Type,"cf16"), OutsR03, !con(OutsR03, OutsR47));
+
+ dag StrideArg = !if(WithStride, (ins Int32Regs:$ldm), (ins));
+ dag Ins = !con((ins SrcOp:$src), StrideArg);
+
+ // Build a dag pattern that matches the intrinsic call.
+ // We want a dag that looks like this:
+ // (set <output args>, (intrinsic <input arguments>)) where input and
+ // output arguments are named patterns that would match corresponding
+ // input/output arguments of the instruction.
+ //
+ // First we construct (set <output arguments>) from instruction's outs dag by
+ // replacing dag operator 'outs' with 'set'.
+ dag PatOuts = !foreach(tmp, Outs, !subst(outs, set, tmp));
+ // Similarly, construct (intrinsic <input arguments>) sub-dag from
+ // instruction's input arguments, only now we also need to replace operands
+ // with patterns that would match them and the operator 'ins' with the
+ // intrinsic.
+ dag PatArgs = !foreach(tmp, Ins,
+ !subst(imem, ADDRvar,
+ !subst(MEMri64, ADDRri64,
+ !subst(MEMri, ADDRri,
+ !subst(ins, IntrMatcher, tmp)))));
+  // Finally, concatenate both parts together. !con() requires both dags to have
+ // the same operator, so we wrap PatArgs in a (set ...) dag.
+ let Pattern = [!con(PatOuts, (set PatArgs))];
+ let OutOperandList = Outs;
+ let InOperandList = Ins;
+ let AsmString = "wmma.load."
+ # Abc
+ # ".sync"
+ # "." # Layout
+ # "." # Geometry
+ # Space
+ # "." # Type # " \t"
+ # !if(!eq(Abc#Type, "cf16"),
+ "{{$r0, $r1, $r2, $r3}}",
+ "{{$r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7}}")
+ # ", [$src]"
+ # !if(WithStride, ", $ldm", "")
+ # ";";
}
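To make the !foreach/!subst machinery above more concrete, here is roughly what one instantiation ends up with. The record below is the generic-space, row-major, with-stride, symbolic-address ("avar") load of the f16 "a" fragment; the dags are paraphrased from the definitions above rather than copied from TableGen output:

  // INT_WMMA_m16n16k16_load_a_row_stride_avar (sketch):
  //   Outs    = (outs Float16x2Regs:$r0, ..., Float16x2Regs:$r7)
  //   Ins     = (ins imem:$src, Int32Regs:$ldm)
  //   Pattern = [(set Float16x2Regs:$r0, ..., Float16x2Regs:$r7,
  //                (INT_WMMA_m16n16k16_load_a_row_stride_Intr
  //                   ADDRvar:$src, Int32Regs:$ldm))]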
-multiclass WMMA_LOAD_ALST<string Abc, string Layout, string Space,
+class WMMA_LOAD_INTR_HELPER<string Geometry, string Abc, string Layout,
+ string Space, string Type, bit WithStride>
+ : PatFrag <(ops),(ops)> {
+ // Intrinsic that matches this instruction.
+ Intrinsic Intr = !cast<Intrinsic>("int_nvvm_wmma"
+ # "_" # Geometry # "_load_"
+ # Abc # "_" # Type # "_" # Layout
+ # !if(WithStride,"_stride", ""));
+ code match_generic = [{
+ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GENERIC);
+ }];
+ code match_shared = [{
+ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED);
+ }];
+ code match_global = [{
+ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL);
+ }];
+
+ let Operands = !if(WithStride, (ops node:$src, node:$ldm), (ops node:$src));
+ let Fragments = [!foreach(tmp, Operands, !subst(ops, Intr, tmp))];
+ let PredicateCode = !if(!eq(Space, ".shared"), match_shared,
+ !if(!eq(Space, ".global"), match_global, match_generic));
+}
+
+multiclass WMMA_LOAD_GALSTS<string Geometry, string Abc, string Layout,
+ string Space, string Type, NVPTXRegClass regclass,
+ bit WithStride> {
+ def _avar: WMMA_LOAD_GALSTOS<Geometry, Abc, Layout, Space, Type, regclass,
+ imem, WithStride>;
+ def _areg: WMMA_LOAD_GALSTOS<Geometry, Abc, Layout, Space, Type, regclass,
+ Int32Regs, WithStride>;
+ def _areg64: WMMA_LOAD_GALSTOS<Geometry, Abc, Layout, Space, Type, regclass,
+ Int64Regs, WithStride>;
+ def _ari: WMMA_LOAD_GALSTOS<Geometry, Abc, Layout, Space, Type, regclass,
+ MEMri, WithStride>;
+ def _ari64: WMMA_LOAD_GALSTOS<Geometry, Abc, Layout, Space, Type, regclass,
+ MEMri64, WithStride>;
+}
+
+multiclass WMMA_LOAD_GALSTSh<string Geometry, string Abc, string Layout,
+ string Space, string Type, NVPTXRegClass regclass,
+ bit WithStride> {
+ // Define a PatFrag that matches appropriate intrinsic that loads from the
+ // given address space.
+ def _Intr: WMMA_LOAD_INTR_HELPER<Geometry, Abc, Layout, Space, Type,
+ WithStride>;
+ defm NAME: WMMA_LOAD_GALSTS<Geometry, Abc, Layout, Space, Type, regclass,
+ WithStride>;
+}
+
+multiclass WMMA_LOAD_GALST<string Geometry, string Abc, string Layout,
+ string Space, string Type, NVPTXRegClass regclass> {
+ defm _stride: WMMA_LOAD_GALSTSh<Geometry, Abc, Layout, Space, Type, regclass, 1>;
+ defm NAME: WMMA_LOAD_GALSTSh<Geometry, Abc, Layout, Space, Type, regclass, 0>;
+}
+
+multiclass WMMA_LOAD_GALT<string Geometry, string Abc, string Layout,
string Type, NVPTXRegClass regclass> {
- defm _avar: WMMA_LOAD_ALSTO<Abc, Layout, Space, Type, regclass, imemAny, 0>;
- defm _ari64: WMMA_LOAD_ALSTO<Abc, Layout, Space, Type, regclass, imemAny, 1>;
+ defm _global: WMMA_LOAD_GALST<Geometry, Abc, Layout, ".global",
+ Type, regclass>;
+ defm _shared: WMMA_LOAD_GALST<Geometry, Abc, Layout, ".shared",
+ Type, regclass>;
+ defm NAME: WMMA_LOAD_GALST<Geometry, Abc, Layout, "",
+ Type, regclass>;
}
-multiclass WMMA_LOAD_ALT<string Abc, string Layout,
- string Type, NVPTXRegClass regclass> {
- defm _global: WMMA_LOAD_ALST<Abc, Layout, ".global", Type, regclass>;
- defm _shared: WMMA_LOAD_ALST<Abc, Layout, ".shared", Type, regclass>;
- defm NAME: WMMA_LOAD_ALST<Abc, Layout, "", Type, regclass>;
+multiclass WMMA_LOAD_GAT<string Geometry, string Abc,
+ string Type, NVPTXRegClass regclass> {
+ defm _row: WMMA_LOAD_GALT<Geometry, Abc, "row", Type, regclass>;
+ defm _col: WMMA_LOAD_GALT<Geometry, Abc, "col", Type, regclass>;
}
-multiclass WMMA_LOAD_AT<string Abc, string Type, NVPTXRegClass regclass> {
- defm _row: WMMA_LOAD_ALT<Abc, "row", Type, regclass>;
- defm _col: WMMA_LOAD_ALT<Abc, "col", Type, regclass>;
+multiclass WMMA_LOAD_G<string Geometry> {
+ defm _load_a: WMMA_LOAD_GAT<Geometry, "a", "f16", Float16x2Regs>;
+ defm _load_b: WMMA_LOAD_GAT<Geometry, "b", "f16", Float16x2Regs>;
+ defm _load_c_f16: WMMA_LOAD_GAT<Geometry, "c", "f16", Float16x2Regs>;
+ defm _load_c_f32: WMMA_LOAD_GAT<Geometry, "c", "f32", Float32Regs>;
}
-defm INT_WMMA_LOAD_A: WMMA_LOAD_AT<"a", "f16", Float16x2Regs>;
-defm INT_WMMA_LOAD_B: WMMA_LOAD_AT<"b", "f16", Float16x2Regs>;
-defm INT_WMMA_LOAD_C_f16: WMMA_LOAD_AT<"c", "f16", Float16x2Regs>;
-defm INT_WMMA_LOAD_C_f32: WMMA_LOAD_AT<"c", "f32", Float32Regs>;
+defm INT_WMMA_m32n8k16: WMMA_LOAD_G<"m32n8k16">;
+defm INT_WMMA_m16n16k16: WMMA_LOAD_G<"m16n16k16">;
+defm INT_WMMA_m8n32k16: WMMA_LOAD_G<"m8n32k16">;
//
// wmma.store.d.sync.[row|col].m16n16k16[|.global|.shared].[f16|f32]
//
-class WMMA_STORE_D_LSTOS<string Layout, string Space,
- string Type, NVPTXRegClass regclass,
- Operand DstOp, int WithOffset, int WithStride>
- : NVPTXInst<(outs),
- !if(!eq(Type,"f16"),
- !if(WithStride,
- !if(WithOffset,
- (ins DstOp:$src, i32imm:$offset,
- regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3,
- Int32Regs:$ldm),
- (ins DstOp:$src,
- regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3,
- Int32Regs:$ldm)),
- !if(WithOffset,
- (ins DstOp:$src, i32imm:$offset,
- regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3),
- (ins DstOp:$src,
- regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3))),
- !if(WithStride,
- !if(WithOffset,
- (ins DstOp:$src, i32imm:$offset,
- regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3,
- regclass:$r4, regclass:$r5, regclass:$r6, regclass:$r7,
- Int32Regs:$ldm),
- (ins DstOp:$src,
- regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3,
- regclass:$r4, regclass:$r5, regclass:$r6, regclass:$r7,
- Int32Regs:$ldm)),
- !if(WithOffset,
- (ins DstOp:$src, i32imm:$offset,
- regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3,
- regclass:$r4, regclass:$r5, regclass:$r6, regclass:$r7),
- (ins DstOp:$src,
- regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3,
- regclass:$r4, regclass:$r5, regclass:$r6, regclass:$r7)))),
- "wmma.store.d.sync."#Layout#".m16n16k16"#Space#"." #Type# " \t"
- #!if(WithOffset,"[$src+$offset], ", "[$src], ")
- #!if(!eq(Type,"f16"),
- "{{$r0, $r1, $r2, $r3}}",
- "{{$r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7}}")
- #!if(WithStride, ", $ldm", "")
- #";",
- []>,
- Requires<[hasPTX60, hasSM70]>;
-
-multiclass WMMA_STORE_D_LSTO<string Layout, string Space,
- string Type, NVPTXRegClass regclass,
- Operand DstOp, int WithOffset = 0> {
- def _stride: WMMA_STORE_D_LSTOS<Layout, Space, Type, regclass, DstOp,
- WithOffset, 1>;
- def NAME: WMMA_STORE_D_LSTOS<Layout, Space, Type, regclass, DstOp,
- WithOffset, 0>;
+class WMMA_STORE_D_GLSTSO<string Geometry, string Layout, string Space,
+ string Type, NVPTXRegClass regclass,
+ bit WithStride, DAGOperand DstOp>
+ : EmptyNVPTXInst,
+ Requires<[!if(!eq(Geometry, "m16n16k16"),
+ hasPTX60,
+ hasPTX61),
+ hasSM70]> {
+ PatFrag IntrMatcher = !cast<PatFrag>("INT_WMMA"
+ # "_" # Geometry # "_store_d"
+ # "_" # Type
+ # "_" # Layout
+ # !subst(".", "_", Space)
+ # !if(WithStride,"_stride", "")
+ # "_Intr");
+ dag InsR03 = (ins DstOp:$src, regclass:$r0, regclass:$r1,
+ regclass:$r2, regclass:$r3);
+ dag InsR47 = (ins regclass:$r4, regclass:$r5,
+ regclass:$r6, regclass:$r7);
+ dag InsR = !if(!eq(Type,"f16"), InsR03, !con(InsR03, InsR47));
+ dag StrideArg = !if(WithStride, (ins Int32Regs:$ldm), (ins));
+ dag Ins = !con(InsR, StrideArg);
+
+  // Construct the pattern to match the corresponding intrinsic call. See the
+  // details in the comments in WMMA_LOAD_GALSTOS.
+ dag PatArgs = !foreach(tmp, Ins,
+ !subst(imem, ADDRvar,
+ !subst(MEMri64, ADDRri64,
+ !subst(MEMri, ADDRri,
+ !subst(ins, IntrMatcher, tmp)))));
+ let Pattern = [PatArgs];
+ let OutOperandList = (outs);
+ let InOperandList = Ins;
+ let AsmString = "wmma.store.d.sync."
+ # Layout
+ # "." # Geometry
+ # Space
+ # "." # Type
+ # " \t[$src],"
+ # !if(!eq(Type,"f16"),
+ "{{$r0, $r1, $r2, $r3}}",
+ "{{$r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7}}")
+ # !if(WithStride, ", $ldm", "")
+ # ";";
+
+}
+
+class WMMA_STORE_INTR_HELPER<string Geometry, string Layout, string Space,
+ string Type, bit WithStride>
+ : PatFrag <(ops),(ops)> {
+ // Intrinsic that matches this instruction.
+ Intrinsic Intr = !cast<Intrinsic>("int_nvvm_wmma_"
+ # Geometry
+ # "_store_d"
+ # "_" # Type
+ # "_" # Layout
+ # !if(WithStride, "_stride", ""));
+ code match_generic = [{
+ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GENERIC);
+ }];
+ code match_shared = [{
+ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED);
+ }];
+ code match_global = [{
+ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL);
+ }];
+
+ dag Args = !if(!eq(Type,"f16"),
+ (ops node:$dst, node:$r0, node:$r1, node:$r2, node:$r3),
+ (ops node:$dst, node:$r0, node:$r1, node:$r2, node:$r3,
+ node:$r4, node:$r5, node:$r6, node:$r7));
+ dag StrideArg = !if(WithStride, (ops node:$ldm), (ops));
+ let Operands = !con(Args, StrideArg);
+ let Fragments = [!foreach(tmp, Operands, !subst(ops, Intr, tmp))];
+ let PredicateCode = !if(!eq(Space, ".shared"), match_shared,
+ !if(!eq(Space, ".global"), match_global, match_generic));
+}
+
+multiclass WMMA_STORE_D_GLSTS<string Geometry, string Layout, string Space,
+ string Type, NVPTXRegClass regclass,
+ bit WithStride> {
+ def _avar: WMMA_STORE_D_GLSTSO<Geometry, Layout, Space, Type, regclass,
+ WithStride, imem>;
+ def _areg: WMMA_STORE_D_GLSTSO<Geometry, Layout, Space, Type, regclass,
+ WithStride, Int32Regs>;
+ def _areg64: WMMA_STORE_D_GLSTSO<Geometry, Layout, Space, Type, regclass,
+ WithStride, Int64Regs>;
+ def _ari: WMMA_STORE_D_GLSTSO<Geometry, Layout, Space, Type, regclass,
+ WithStride, MEMri>;
+ def _ari64: WMMA_STORE_D_GLSTSO<Geometry, Layout, Space, Type, regclass,
+ WithStride, MEMri64>;
+}
+
+multiclass WMMA_STORE_D_GLSTSh<string Geometry, string Layout, string Space,
+ string Type, NVPTXRegClass regclass,
+ bit WithStride> {
+  // Define a PatFrag that matches appropriate intrinsic that stores to the
+ // given address space.
+ def _Intr: WMMA_STORE_INTR_HELPER<Geometry, Layout, Space, Type,
+ WithStride>;
+ defm NAME: WMMA_STORE_D_GLSTS<Geometry, Layout, Space, Type, regclass,
+ WithStride>;
}
-multiclass WMMA_STORE_D_LST<string Layout, string Space,
- string Type, NVPTXRegClass regclass> {
- defm _avar: WMMA_STORE_D_LSTO<Layout, Space, Type, regclass, imemAny, 0>;
- defm _ari64: WMMA_STORE_D_LSTO<Layout, Space, Type, regclass, imemAny, 1>;
+multiclass WMMA_STORE_D_GLST<string Geometry, string Layout, string Space,
+ string Type, NVPTXRegClass regclass > {
+ defm _stride: WMMA_STORE_D_GLSTSh<Geometry, Layout, Space, Type, regclass, 1>;
+ defm NAME: WMMA_STORE_D_GLSTSh<Geometry, Layout, Space, Type, regclass, 0>;
}
-multiclass WMMA_STORE_D_LT<string Layout,
+multiclass WMMA_STORE_D_GLT<string Geometry, string Layout,
string Type, NVPTXRegClass regclass> {
- defm _global: WMMA_STORE_D_LST<Layout, ".global", Type, regclass>;
- defm _shared: WMMA_STORE_D_LST<Layout, ".shared", Type, regclass>;
- defm NAME: WMMA_STORE_D_LST<Layout, "", Type, regclass>;
+ defm _global: WMMA_STORE_D_GLST<Geometry, Layout, ".global", Type, regclass>;
+ defm _shared: WMMA_STORE_D_GLST<Geometry, Layout, ".shared", Type, regclass>;
+ defm NAME: WMMA_STORE_D_GLST<Geometry, Layout, "", Type, regclass>;
+}
+
+multiclass WMMA_STORE_D_GT<string Geometry, string Type,
+ NVPTXRegClass regclass> {
+ defm _row: WMMA_STORE_D_GLT<Geometry, "row", Type, regclass>;
+ defm _col: WMMA_STORE_D_GLT<Geometry, "col", Type, regclass>;
}
-multiclass WMMA_STORE_D_T<string Type, NVPTXRegClass regclass> {
- defm _row: WMMA_STORE_D_LT<"row", Type, regclass>;
- defm _col: WMMA_STORE_D_LT<"col", Type, regclass>;
+multiclass WMMA_STORE_D_G<string Geometry> {
+ defm _store_d_f16: WMMA_STORE_D_GT<Geometry, "f16", Float16x2Regs>;
+ defm _store_d_f32: WMMA_STORE_D_GT<Geometry, "f32", Float32Regs>;
}
-defm INT_WMMA_STORE_D_f16: WMMA_STORE_D_T<"f16", Float16x2Regs>;
-defm INT_WMMA_STORE_D_f32: WMMA_STORE_D_T<"f32", Float32Regs>;
+defm INT_WMMA_m32n8k16: WMMA_STORE_D_G<"m32n8k16">;
+defm INT_WMMA_m16n16k16: WMMA_STORE_D_G<"m16n16k16">;
+defm INT_WMMA_m8n32k16: WMMA_STORE_D_G<"m8n32k16">;
// WMMA.MMA
-class WMMA_MMA_ABDCS<string ALayout, string BLayout,
+class WMMA_MMA_GABDCS<string Geometry, string ALayout, string BLayout,
string DType, NVPTXRegClass d_reg,
string CType, NVPTXRegClass c_reg,
NVPTXRegClass ab_reg,
string Satfinite = "">
- : NVPTXInst<!if(!eq(DType,"f16"),
- (outs d_reg:$d0, d_reg:$d1, d_reg:$d2, d_reg:$d3),
- (outs d_reg:$d0, d_reg:$d1, d_reg:$d2, d_reg:$d3,
- d_reg:$d4, d_reg:$d5, d_reg:$d6, d_reg:$d7)),
- !if(!eq(CType,"f16"),
- (ins ab_reg:$a0, ab_reg:$a1, ab_reg:$a2, ab_reg:$a3,
- ab_reg:$a4, ab_reg:$a5, ab_reg:$a6, ab_reg:$a7,
- ab_reg:$b0, ab_reg:$b1, ab_reg:$b2, ab_reg:$b3,
- ab_reg:$b4, ab_reg:$b5, ab_reg:$b6, ab_reg:$b7,
- c_reg:$c0, c_reg:$c1, c_reg:$c2, c_reg:$c3),
- (ins ab_reg:$a0, ab_reg:$a1, ab_reg:$a2, ab_reg:$a3,
- ab_reg:$a4, ab_reg:$a5, ab_reg:$a6, ab_reg:$a7,
- ab_reg:$b0, ab_reg:$b1, ab_reg:$b2, ab_reg:$b3,
- ab_reg:$b4, ab_reg:$b5, ab_reg:$b6, ab_reg:$b7,
- c_reg:$c0, c_reg:$c1, c_reg:$c2, c_reg:$c3,
- c_reg:$c4, c_reg:$c5, c_reg:$c6, c_reg:$c7)),
- "wmma.mma.sync."#ALayout#"."#BLayout#".m16n16k16."#
- #DType#"."#CType#Satfinite
- #"\n\t\t"
- #!if(!eq(DType,"f16"),
- "{{$d0, $d1, $d2, $d3}}, \n\t\t",
- "{{$d0, $d1, $d2, $d3, $d4, $d5, $d6, $d7}},\n\t\t")
- #"{{$a0, $a1, $a2, $a3, $a4, $a5, $a6, $a7}},\n\t\t"
- #"{{$b0, $b1, $b2, $b3, $b4, $b5, $b6, $b7}},\n\t\t"
- #!if(!eq(CType,"f16"),
- "{{$c0, $c1, $c2, $c3}};",
- "{{$c0, $c1, $c2, $c3, $c4, $c5, $c6, $c7}};"),
- []>,
- Requires<[hasPTX60, hasSM70]>;
-
-multiclass WMMA_MMA_ABDC<string ALayout, string BLayout,
+ : EmptyNVPTXInst,
+ Requires<[!if(!eq(Geometry, "m16n16k16"),
+ hasPTX60,
+ hasPTX61),
+ hasSM70]> {
+ Intrinsic Intr = !cast<Intrinsic>("int_nvvm_wmma_"
+ # Geometry
+ # "_mma"
+ # "_" # ALayout
+ # "_" # BLayout
+ # "_" # DType
+ # "_" # CType
+ # !subst(".", "_", Satfinite));
+ dag Outs = !if(!eq(DType,"f16"),
+ (outs d_reg:$d0, d_reg:$d1, d_reg:$d2, d_reg:$d3),
+ (outs d_reg:$d0, d_reg:$d1, d_reg:$d2, d_reg:$d3,
+ d_reg:$d4, d_reg:$d5, d_reg:$d6, d_reg:$d7));
+ dag InsExtraCArgs = !if(!eq(CType,"f16"),
+ (ins),
+ (ins c_reg:$c4, c_reg:$c5, c_reg:$c6, c_reg:$c7));
+ dag Ins = !con((ins ab_reg:$a0, ab_reg:$a1, ab_reg:$a2, ab_reg:$a3,
+ ab_reg:$a4, ab_reg:$a5, ab_reg:$a6, ab_reg:$a7,
+ ab_reg:$b0, ab_reg:$b1, ab_reg:$b2, ab_reg:$b3,
+ ab_reg:$b4, ab_reg:$b5, ab_reg:$b6, ab_reg:$b7,
+ c_reg:$c0, c_reg:$c1, c_reg:$c2, c_reg:$c3),
+ InsExtraCArgs);
+
+  // Construct the pattern to match the corresponding intrinsic call. See the
+  // details in the comments in WMMA_LOAD_GALSTOS.
+ dag PatOuts = !foreach(tmp, Outs, !subst(outs, set, tmp));
+ dag PatArgs = !foreach(tmp, Ins, !subst(ins, Intr, tmp));
+ let Pattern = [!con(PatOuts, (set PatArgs))];
+ let OutOperandList = Outs;
+ let InOperandList = Ins;
+ let AsmString = "wmma.mma.sync."
+ # ALayout
+ # "." # BLayout
+ # "." # Geometry
+ # "." # DType
+ # "." # CType
+ # Satfinite # "\n\t\t"
+ # !if(!eq(DType,"f16"),
+ "{{$d0, $d1, $d2, $d3}}, \n\t\t",
+ "{{$d0, $d1, $d2, $d3, $d4, $d5, $d6, $d7}},\n\t\t")
+ # "{{$a0, $a1, $a2, $a3, $a4, $a5, $a6, $a7}},\n\t\t"
+ # "{{$b0, $b1, $b2, $b3, $b4, $b5, $b6, $b7}},\n\t\t"
+ # !if(!eq(CType,"f16"),
+ "{{$c0, $c1, $c2, $c3}};",
+ "{{$c0, $c1, $c2, $c3, $c4, $c5, $c6, $c7}};");
+}
+
+multiclass WMMA_MMA_GABDC<string Geometry, string ALayout, string BLayout,
string DType, NVPTXRegClass d_reg,
string CType, NVPTXRegClass c_reg> {
- def _satfinite: WMMA_MMA_ABDCS<ALayout, BLayout,
+ def _satfinite: WMMA_MMA_GABDCS<Geometry, ALayout, BLayout,
DType, d_reg, CType, c_reg,
Float16x2Regs, ".satfinite">;
- def NAME: WMMA_MMA_ABDCS<ALayout, BLayout,
+ def NAME: WMMA_MMA_GABDCS<Geometry, ALayout, BLayout,
DType, d_reg, CType, c_reg,
Float16x2Regs>;
}
-multiclass WMMA_MMA_ABD<string ALayout, string BLayout,
+multiclass WMMA_MMA_GABD<string Geometry, string ALayout, string BLayout,
string DType, NVPTXRegClass d_reg> {
- defm _f16: WMMA_MMA_ABDC<ALayout, BLayout, DType, d_reg, "f16", Float16x2Regs>;
- defm _f32: WMMA_MMA_ABDC<ALayout, BLayout, DType, d_reg, "f32", Float32Regs>;
+ defm _f16: WMMA_MMA_GABDC<Geometry, ALayout, BLayout, DType, d_reg,
+ "f16", Float16x2Regs>;
+ defm _f32: WMMA_MMA_GABDC<Geometry, ALayout, BLayout, DType, d_reg,
+ "f32", Float32Regs>;
}
-multiclass WMMA_MMA_AB<string ALayout, string BLayout> {
- defm _f16: WMMA_MMA_ABD<ALayout, BLayout, "f16", Float16x2Regs>;
- defm _f32: WMMA_MMA_ABD<ALayout, BLayout, "f32", Float32Regs>;
+multiclass WMMA_MMA_GAB<string Geometry, string ALayout, string BLayout> {
+ defm _f16: WMMA_MMA_GABD<Geometry, ALayout, BLayout, "f16", Float16x2Regs>;
+ defm _f32: WMMA_MMA_GABD<Geometry, ALayout, BLayout, "f32", Float32Regs>;
}
-multiclass WMMA_MMA_A<string ALayout> {
- defm _col: WMMA_MMA_AB<ALayout, "col">;
- defm _row: WMMA_MMA_AB<ALayout, "row">;
+multiclass WMMA_MMA_GA<string Geometry, string ALayout> {
+ defm _col: WMMA_MMA_GAB<Geometry, ALayout, "col">;
+ defm _row: WMMA_MMA_GAB<Geometry, ALayout, "row">;
}
-defm INT_WMMA_MMA_col: WMMA_MMA_A<"col">;
-defm INT_WMMA_MMA_row: WMMA_MMA_A<"row">;
+multiclass WMMA_MMA_G<string Geometry> {
+ defm _col: WMMA_MMA_GA<Geometry, "col">;
+ defm _row: WMMA_MMA_GA<Geometry, "row">;
+}
+defm INT_WMMA_MMA_m32n8k16 : WMMA_MMA_G<"m32n8k16">;
+defm INT_WMMA_MMA_m16n16k16 : WMMA_MMA_G<"m16n16k16">;
+defm INT_WMMA_MMA_m8n32k16 : WMMA_MMA_G<"m8n32k16">;
diff --git a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
index 1402033b9e60..5bb4fc3edd09 100644
--- a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
+++ b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
@@ -97,10 +97,12 @@ AdjustStackOffset(MachineFrameInfo &MFI, int FrameIdx,
Offset = (Offset + Align - 1) / Align * Align;
if (StackGrowsDown) {
- DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset << "]\n");
+ LLVM_DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset
+ << "]\n");
MFI.setObjectOffset(FrameIdx, -Offset); // Set the computed offset
} else {
- DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset << "]\n");
+ LLVM_DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset
+ << "]\n");
MFI.setObjectOffset(FrameIdx, Offset);
Offset += MFI.getObjectSize(FrameIdx);
}
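As a side note on the context line above, the expression (Offset + Align - 1) / Align * Align is the usual integer round-up to the next multiple of Align. A minimal stand-alone sketch, not part of the patch:

  // e.g. roundUp(13, 8) == 16 and roundUp(16, 8) == 16
  static int64_t roundUp(int64_t Offset, int64_t Align) {
    return (Offset + Align - 1) / Align * Align;
  }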
@@ -163,14 +165,14 @@ NVPTXPrologEpilogPass::calculateFrameObjectOffsets(MachineFunction &Fn) {
// Adjust to alignment boundary.
Offset = (Offset + Align - 1) / Align * Align;
- DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n");
+ LLVM_DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n");
// Resolve offsets for objects in the local block.
for (unsigned i = 0, e = MFI.getLocalFrameObjectCount(); i != e; ++i) {
std::pair<int, int64_t> Entry = MFI.getLocalFrameObjectMap(i);
int64_t FIOffset = (StackGrowsDown ? -Offset : Offset) + Entry.second;
- DEBUG(dbgs() << "alloc FI(" << Entry.first << ") at SP[" <<
- FIOffset << "]\n");
+ LLVM_DEBUG(dbgs() << "alloc FI(" << Entry.first << ") at SP[" << FIOffset
+ << "]\n");
MFI.setObjectOffset(Entry.first, FIOffset);
}
// Allocate the local block
diff --git a/lib/Target/NVPTX/NVPTXSection.h b/lib/Target/NVPTX/NVPTXSection.h
deleted file mode 100644
index d736eaa41301..000000000000
--- a/lib/Target/NVPTX/NVPTXSection.h
+++ /dev/null
@@ -1,45 +0,0 @@
-//===- NVPTXSection.h - NVPTX-specific section representation ---*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares the NVPTXSection class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSECTION_H
-#define LLVM_LIB_TARGET_NVPTX_NVPTXSECTION_H
-
-#include "llvm/MC/MCSection.h"
-#include "llvm/MC/SectionKind.h"
-
-namespace llvm {
-
-/// Represents a section in PTX PTX does not have sections. We create this class
-/// in order to use the ASMPrint interface.
-///
-class NVPTXSection final : public MCSection {
- virtual void anchor();
-
-public:
- NVPTXSection(SectionVariant V, SectionKind K) : MCSection(V, K, nullptr) {}
- ~NVPTXSection() = default;
-
- /// Override this as NVPTX has its own way of printing switching
- /// to a section.
- void PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
- raw_ostream &OS,
- const MCExpr *Subsection) const override {}
-
- /// Base address of PTX sections is zero.
- bool UseCodeAlign() const override { return false; }
- bool isVirtualSection() const override { return false; }
-};
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_TARGET_NVPTX_NVPTXSECTION_H
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.h b/lib/Target/NVPTX/NVPTXSubtarget.h
index 3a0bfd221b0b..b02822a099d9 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -48,10 +48,6 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
// FrameLowering class because TargetFrameLowering is abstract.
NVPTXFrameLowering FrameLowering;
-protected:
- // Processor supports scoped atomic operations.
- bool HasAtomScope;
-
public:
/// This constructor initializes the data members to match that
/// of the specified module.
@@ -73,37 +69,15 @@ public:
return &TSInfo;
}
- bool hasBrkPt() const { return SmVersion >= 11; }
- bool hasAtomRedG32() const { return SmVersion >= 11; }
- bool hasAtomRedS32() const { return SmVersion >= 12; }
- bool hasAtomRedG64() const { return SmVersion >= 12; }
- bool hasAtomRedS64() const { return SmVersion >= 20; }
- bool hasAtomRedGen32() const { return SmVersion >= 20; }
- bool hasAtomRedGen64() const { return SmVersion >= 20; }
- bool hasAtomAddF32() const { return SmVersion >= 20; }
bool hasAtomAddF64() const { return SmVersion >= 60; }
- bool hasAtomScope() const { return HasAtomScope; }
+ bool hasAtomScope() const { return SmVersion >= 60; }
bool hasAtomBitwise64() const { return SmVersion >= 32; }
bool hasAtomMinMax64() const { return SmVersion >= 32; }
- bool hasVote() const { return SmVersion >= 12; }
- bool hasDouble() const { return SmVersion >= 13; }
- bool reqPTX20() const { return SmVersion >= 20; }
- bool hasF32FTZ() const { return SmVersion >= 20; }
- bool hasFMAF32() const { return SmVersion >= 20; }
- bool hasFMAF64() const { return SmVersion >= 13; }
bool hasLDG() const { return SmVersion >= 32; }
- bool hasLDU() const { return ((SmVersion >= 20) && (SmVersion < 30)); }
- bool hasGenericLdSt() const { return SmVersion >= 20; }
inline bool hasHWROT32() const { return SmVersion >= 32; }
- inline bool hasSWROT32() const {
- return ((SmVersion >= 20) && (SmVersion < 32));
- }
- inline bool hasROT32() const { return hasHWROT32() || hasSWROT32(); }
- inline bool hasROT64() const { return SmVersion >= 20; }
bool hasImageHandles() const;
bool hasFP16Math() const { return SmVersion >= 53; }
bool allowFP16Math() const;
-
unsigned int getSmVersion() const { return SmVersion; }
std::string getTargetName() const { return TargetName; }
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index d31e1cb5047b..a1b160441df3 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -44,6 +44,20 @@ static cl::opt<bool>
cl::desc("Disable load/store vectorizer"),
cl::init(false), cl::Hidden);
+// TODO: Remove this flag when we are confident with no regressions.
+static cl::opt<bool> DisableRequireStructuredCFG(
+ "disable-nvptx-require-structured-cfg",
+ cl::desc("Transitional flag to turn off NVPTX's requirement on preserving "
+ "structured CFG. The requirement should be disabled only when "
+ "unexpected regressions happen."),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool> UseShortPointersOpt(
+ "nvptx-short-ptr",
+ cl::desc(
+ "Use 32-bit pointers for accessing const/local/shared address spaces."),
+ cl::init(false), cl::Hidden);
+
namespace llvm {
void initializeNVVMIntrRangePass(PassRegistry&);
@@ -75,11 +89,13 @@ extern "C" void LLVMInitializeNVPTXTarget() {
initializeNVPTXLowerAggrCopiesPass(PR);
}
-static std::string computeDataLayout(bool is64Bit) {
+static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
std::string Ret = "e";
if (!is64Bit)
Ret += "-p:32:32";
+ else if (UseShortPointers)
+ Ret += "-p3:32:32-p4:32:32-p5:32:32";
Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";
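For reference, a sketch of the data-layout strings this now yields, assembled by hand from the concatenations shown above (assuming nothing else is appended further down in the function):

  // 32-bit target:
  //   "e-p:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
  // 64-bit target, default pointers:
  //   "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
  // 64-bit target with -nvptx-short-ptr (32-bit const/local/shared pointers):
  //   "e-p3:32:32-p4:32:32-p5:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"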
@@ -100,14 +116,18 @@ NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
CodeGenOpt::Level OL, bool is64bit)
// The pic relocation model is used regardless of what the client has
// specified, as it is the only relocation model currently supported.
- : LLVMTargetMachine(T, computeDataLayout(is64bit), TT, CPU, FS, Options,
- Reloc::PIC_, getEffectiveCodeModel(CM), OL),
- is64bit(is64bit), TLOF(llvm::make_unique<NVPTXTargetObjectFile>()),
+ : LLVMTargetMachine(T, computeDataLayout(is64bit, UseShortPointersOpt), TT,
+ CPU, FS, Options, Reloc::PIC_,
+ getEffectiveCodeModel(CM), OL),
+ is64bit(is64bit), UseShortPointers(UseShortPointersOpt),
+ TLOF(llvm::make_unique<NVPTXTargetObjectFile>()),
Subtarget(TT, CPU, FS, *this) {
if (TT.getOS() == Triple::NVCL)
drvInterface = NVPTX::NVCL;
else
drvInterface = NVPTX::CUDA;
+ if (!DisableRequireStructuredCFG)
+ setRequiresStructuredCFG(true);
initAsmInfo();
}
@@ -228,9 +248,11 @@ void NVPTXPassConfig::addIRPasses() {
disablePass(&TailDuplicateID);
disablePass(&StackMapLivenessID);
disablePass(&LiveDebugValuesID);
+ disablePass(&PostRAMachineSinkingID);
disablePass(&PostRASchedulerID);
disablePass(&FuncletLayoutID);
disablePass(&PatchableFunctionID);
+ disablePass(&ShrinkWrapID);
// NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
// it here does nothing. But since we need it for correctness when lowering
@@ -323,7 +345,7 @@ void NVPTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
addPass(&StackSlotColoringID);
// FIXME: Needs physical registers
- //addPass(&PostRAMachineLICMID);
+ //addPass(&MachineLICMID);
printAndVerify("After StackSlotColoring");
}
@@ -358,7 +380,7 @@ void NVPTXPassConfig::addMachineSSAOptimization() {
if (addILPOpts())
printAndVerify("After ILP optimizations");
- addPass(&MachineLICMID);
+ addPass(&EarlyMachineLICMID);
addPass(&MachineCSEID);
addPass(&MachineSinkingID);
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h
index eeebf64d39c3..ca540b8e0389 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.h
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -26,6 +26,8 @@ namespace llvm {
///
class NVPTXTargetMachine : public LLVMTargetMachine {
bool is64bit;
+  // Use 32-bit pointers for accessing const/local/shared AS.
+ bool UseShortPointers;
std::unique_ptr<TargetLoweringObjectFile> TLOF;
NVPTX::DrvInterface drvInterface;
NVPTXSubtarget Subtarget;
@@ -45,6 +47,7 @@ public:
}
const NVPTXSubtarget *getSubtargetImpl() const { return &Subtarget; }
bool is64Bit() const { return is64bit; }
+ bool useShortPointers() const { return UseShortPointers; }
NVPTX::DrvInterface getDrvInterface() const { return drvInterface; }
ManagedStringPool *getManagedStrPool() const {
return const_cast<ManagedStringPool *>(&ManagedStrPool);
diff --git a/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/lib/Target/NVPTX/NVPTXTargetObjectFile.h
index d16269f6ebea..c706b053ab8f 100644
--- a/lib/Target/NVPTX/NVPTXTargetObjectFile.h
+++ b/lib/Target/NVPTX/NVPTXTargetObjectFile.h
@@ -10,77 +10,20 @@
#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXTARGETOBJECTFILE_H
#define LLVM_LIB_TARGET_NVPTX_NVPTXTARGETOBJECTFILE_H
-#include "NVPTXSection.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/SectionKind.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
namespace llvm {
class NVPTXTargetObjectFile : public TargetLoweringObjectFile {
public:
- NVPTXTargetObjectFile() {
- TextSection = nullptr;
- DataSection = nullptr;
- BSSSection = nullptr;
- ReadOnlySection = nullptr;
-
- StaticCtorSection = nullptr;
- StaticDtorSection = nullptr;
- LSDASection = nullptr;
- EHFrameSection = nullptr;
- DwarfAbbrevSection = nullptr;
- DwarfInfoSection = nullptr;
- DwarfLineSection = nullptr;
- DwarfFrameSection = nullptr;
- DwarfPubTypesSection = nullptr;
- DwarfDebugInlineSection = nullptr;
- DwarfStrSection = nullptr;
- DwarfLocSection = nullptr;
- DwarfARangesSection = nullptr;
- DwarfRangesSection = nullptr;
- DwarfMacinfoSection = nullptr;
- }
+ NVPTXTargetObjectFile() : TargetLoweringObjectFile() {}
~NVPTXTargetObjectFile() override;
void Initialize(MCContext &ctx, const TargetMachine &TM) override {
TargetLoweringObjectFile::Initialize(ctx, TM);
- TextSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getText());
- DataSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getData());
- BSSSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getBSS());
- ReadOnlySection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getReadOnly());
- StaticCtorSection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
- StaticDtorSection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
- LSDASection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
- EHFrameSection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
- DwarfAbbrevSection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
- DwarfInfoSection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
- DwarfLineSection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
- DwarfFrameSection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
- DwarfPubTypesSection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
- DwarfDebugInlineSection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
- DwarfStrSection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
- DwarfLocSection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
- DwarfARangesSection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
- DwarfRangesSection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
- DwarfMacinfoSection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
}
MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index d2414b72a009..a631055d36a0 100644
--- a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -49,6 +49,26 @@ public:
return AddressSpace::ADDRESS_SPACE_GENERIC;
}
+ // NVPTX has infinite registers of all kinds, but the actual machine doesn't.
+ // We conservatively return 1 here which is just enough to enable the
+ // vectorizers but disables heuristics based on the number of registers.
+ // FIXME: Return a more reasonable number, while keeping an eye on
+ // LoopVectorizer's unrolling heuristics.
+ unsigned getNumberOfRegisters(bool Vector) const { return 1; }
+
+ // Only <2 x half> should be vectorized, so always return 32 for the vector
+ // register size.
+ unsigned getRegisterBitWidth(bool Vector) const { return 32; }
+ unsigned getMinVectorRegisterBitWidth() const { return 32; }
+
+ // We don't want to prevent inlining because of target-cpu and -features
+ // attributes that were added to newer versions of LLVM/Clang: There are
+ // no incompatible functions in PTX, ptxas will throw errors in such cases.
+ bool areInlineCompatible(const Function *Caller,
+ const Function *Callee) const {
+ return true;
+ }
+
// Increase the inlining cost threshold by a factor of 5, reflecting that
// calls are particularly expensive in NVPTX.
unsigned getInliningThresholdMultiplier() { return 5; }
diff --git a/lib/Target/NVPTX/NVVMReflect.cpp b/lib/Target/NVPTX/NVVMReflect.cpp
index 152b665d0fdc..60971b48adfc 100644
--- a/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/lib/Target/NVPTX/NVVMReflect.cpp
@@ -153,7 +153,7 @@ bool NVVMReflect::runOnFunction(Function &F) {
StringRef ReflectArg = cast<ConstantDataSequential>(Operand)->getAsString();
ReflectArg = ReflectArg.substr(0, ReflectArg.size() - 1);
- DEBUG(dbgs() << "Arg of _reflect : " << ReflectArg << "\n");
+ LLVM_DEBUG(dbgs() << "Arg of _reflect : " << ReflectArg << "\n");
int ReflectVal = 0; // The default value is 0
if (ReflectArg == "__CUDA_FTZ") {
diff --git a/lib/Target/Nios2/CMakeLists.txt b/lib/Target/Nios2/CMakeLists.txt
index 7cad3c5ba9c1..6393cc5fcb92 100644
--- a/lib/Target/Nios2/CMakeLists.txt
+++ b/lib/Target/Nios2/CMakeLists.txt
@@ -1,17 +1,12 @@
set(LLVM_TARGET_DEFINITIONS Nios2.td)
-#Generate Nios2GenRegisterInfo.inc and Nios2GenInstrInfo.inc which included by
-#your hand code C++ files.
-#Nios2GenRegisterInfo.inc came from Nios2RegisterInfo.td, Nios2GenInstrInfo.inc
-#came from Nios2InstrInfo.td.
tablegen(LLVM Nios2GenAsmWriter.inc -gen-asm-writer)
-tablegen(LLVM Nios2GenDAGISel.inc -gen-dag-isel)
-tablegen(LLVM Nios2GenRegisterInfo.inc -gen-register-info)
tablegen(LLVM Nios2GenCallingConv.inc -gen-callingconv)
+tablegen(LLVM Nios2GenDAGISel.inc -gen-dag-isel)
tablegen(LLVM Nios2GenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM Nios2GenRegisterInfo.inc -gen-register-info)
tablegen(LLVM Nios2GenSubtargetInfo.inc -gen-subtarget)
-#Nios2CommonTableGen must be defined
add_public_tablegen_target(Nios2CommonTableGen)
#Nios2CodeGen should match with LLVMBuild.txt Nios2CodeGen
diff --git a/lib/Target/Nios2/MCTargetDesc/Nios2AsmBackend.cpp b/lib/Target/Nios2/MCTargetDesc/Nios2AsmBackend.cpp
index 3971630c6beb..8ac08c6837d9 100644
--- a/lib/Target/Nios2/MCTargetDesc/Nios2AsmBackend.cpp
+++ b/lib/Target/Nios2/MCTargetDesc/Nios2AsmBackend.cpp
@@ -19,6 +19,7 @@
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
using namespace llvm;
@@ -111,21 +112,19 @@ Nios2AsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
return Infos[Kind - FirstTargetFixupKind];
}
-std::unique_ptr<MCObjectWriter>
-Nios2AsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
- return createNios2ELFObjectWriter(OS,
- MCELFObjectTargetWriter::getOSABI(OSType));
+std::unique_ptr<MCObjectTargetWriter>
+Nios2AsmBackend::createObjectTargetWriter() const {
+ return createNios2ELFObjectWriter(MCELFObjectTargetWriter::getOSABI(OSType));
}
-bool Nios2AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+bool Nios2AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
return true;
}
// MCAsmBackend
MCAsmBackend *llvm::createNios2AsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
const MCTargetOptions &Options) {
-
- return new Nios2AsmBackend(T, TT.getOS());
+ return new Nios2AsmBackend(T, STI.getTargetTriple().getOS());
}
diff --git a/lib/Target/Nios2/MCTargetDesc/Nios2AsmBackend.h b/lib/Target/Nios2/MCTargetDesc/Nios2AsmBackend.h
index 0aa42043ee2a..1f114bd869b1 100644
--- a/lib/Target/Nios2/MCTargetDesc/Nios2AsmBackend.h
+++ b/lib/Target/Nios2/MCTargetDesc/Nios2AsmBackend.h
@@ -31,12 +31,12 @@ class Nios2AsmBackend : public MCAsmBackend {
public:
Nios2AsmBackend(const Target &T, Triple::OSType OSType)
- : MCAsmBackend(), OSType(OSType) {}
+ : MCAsmBackend(support::little), OSType(OSType) {}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override;
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override;
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
diff --git a/lib/Target/Nios2/MCTargetDesc/Nios2ELFObjectWriter.cpp b/lib/Target/Nios2/MCTargetDesc/Nios2ELFObjectWriter.cpp
index 04f727ad390c..db432d15120d 100644
--- a/lib/Target/Nios2/MCTargetDesc/Nios2ELFObjectWriter.cpp
+++ b/lib/Target/Nios2/MCTargetDesc/Nios2ELFObjectWriter.cpp
@@ -37,8 +37,7 @@ unsigned Nios2ELFObjectWriter::getRelocType(MCContext &Ctx,
return 0;
}
-std::unique_ptr<MCObjectWriter>
-llvm::createNios2ELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI) {
- auto MOTW = llvm::make_unique<Nios2ELFObjectWriter>(OSABI);
- return createELFObjectWriter(std::move(MOTW), OS, true);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createNios2ELFObjectWriter(uint8_t OSABI) {
+ return llvm::make_unique<Nios2ELFObjectWriter>(OSABI);
}
diff --git a/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.h b/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.h
index d918a066acae..a7c4b16c6a3b 100644
--- a/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.h
+++ b/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.h
@@ -18,8 +18,9 @@
namespace llvm {
class MCAsmBackend;
-class MCObjectWriter;
+class MCObjectTargetWriter;
class MCRegisterInfo;
+class MCSubtargetInfo;
class MCTargetOptions;
class Target;
class Triple;
@@ -28,12 +29,11 @@ class raw_pwrite_stream;
Target &getTheNios2Target();
-MCAsmBackend *createNios2AsmBackend(const Target &T, const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
+MCAsmBackend *createNios2AsmBackend(const Target &T, const MCSubtargetInfo &STI,
+ const MCRegisterInfo &MRI,
const MCTargetOptions &Options);
-std::unique_ptr<MCObjectWriter>
-createNios2ELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI);
+std::unique_ptr<MCObjectTargetWriter> createNios2ELFObjectWriter(uint8_t OSABI);
} // namespace llvm
diff --git a/lib/Target/Nios2/MCTargetDesc/Nios2TargetStreamer.cpp b/lib/Target/Nios2/MCTargetDesc/Nios2TargetStreamer.cpp
index b7e1bc36a6d3..795fd0084aa3 100644
--- a/lib/Target/Nios2/MCTargetDesc/Nios2TargetStreamer.cpp
+++ b/lib/Target/Nios2/MCTargetDesc/Nios2TargetStreamer.cpp
@@ -19,4 +19,4 @@ Nios2TargetStreamer::Nios2TargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
Nios2TargetAsmStreamer::Nios2TargetAsmStreamer(MCStreamer &S,
formatted_raw_ostream &OS)
- : Nios2TargetStreamer(S), OS(OS) {}
+ : Nios2TargetStreamer(S) {}
diff --git a/lib/Target/Nios2/Nios2ISelDAGToDAG.cpp b/lib/Target/Nios2/Nios2ISelDAGToDAG.cpp
index 31d04ebe447e..5f9679466115 100644
--- a/lib/Target/Nios2/Nios2ISelDAGToDAG.cpp
+++ b/lib/Target/Nios2/Nios2ISelDAGToDAG.cpp
@@ -59,12 +59,9 @@ public:
// expanded, promoted and normal instructions
void Nios2DAGToDAGISel::Select(SDNode *Node) {
- // Dump information about the Node being selected
- DEBUG(errs() << "Selecting: "; Node->dump(CurDAG); errs() << "\n");
-
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
- DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+ LLVM_DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
Node->setNodeId(-1);
return;
}
diff --git a/lib/Target/Nios2/Nios2ISelLowering.cpp b/lib/Target/Nios2/Nios2ISelLowering.cpp
index 99aa43f960c1..008ce1570722 100644
--- a/lib/Target/Nios2/Nios2ISelLowering.cpp
+++ b/lib/Target/Nios2/Nios2ISelLowering.cpp
@@ -32,9 +32,38 @@ Nios2TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &DL, SelectionDAG &DAG) const {
+ // CCValAssign - represent the assignment of
+ // the return value to a location
+ SmallVector<CCValAssign, 16> RVLocs;
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ // CCState - Info about the registers and stack slot.
+ CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext());
+ // Analyze return values.
+ CCInfo.CheckReturn(Outs, RetCC_Nios2EABI);
+ SDValue Flag;
SmallVector<SDValue, 4> RetOps(1, Chain);
+ // Copy the result values into the output registers.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ SDValue Val = OutVals[i];
+ CCValAssign &VA = RVLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+
+ if (RVLocs[i].getValVT() != RVLocs[i].getLocVT())
+ Val = DAG.getNode(ISD::BITCAST, DL, RVLocs[i].getLocVT(), Val);
+
+ Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Val, Flag);
+
+ // Guarantee that all emitted copies are stuck together with flags.
+ Flag = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ }
+
+ if (Flag.getNode())
+ RetOps.push_back(Flag);
+
return DAG.getNode(Nios2ISD::Ret, DL, MVT::Other, RetOps);
}
diff --git a/lib/Target/Nios2/Nios2InstrFormats.td b/lib/Target/Nios2/Nios2InstrFormats.td
index 58578501d804..f57bf03bba3c 100644
--- a/lib/Target/Nios2/Nios2InstrFormats.td
+++ b/lib/Target/Nios2/Nios2InstrFormats.td
@@ -20,14 +20,44 @@ class Format<bits<6> val> {
bits<6> Value = val;
}
-def Pseudo : Format<0>;
-def FrmI : Format<1>;
-def FrmR : Format<2>;
-def FrmJ : Format<3>;
-def FrmOther : Format<4>; // Instruction w/ a custom format
+def Pseudo : Format<0>;
+// Nios2 R1 instr formats:
+def FrmI : Format<1>;
+def FrmR : Format<2>;
+def FrmJ : Format<3>;
+def FrmOther : Format<4>; // Instruction w/ a custom format
+// Nios2 R2 instr 32-bit formats:
+def FrmL26 : Format<5>; // corresponds to J format in R1
+def FrmF2I16 : Format<6>; // corresponds to I format in R1
+def FrmF2X4I12 : Format<7>;
+def FrmF1X4I12 : Format<8>;
+def FrmF1X4L17 : Format<9>;
+def FrmF3X6L5 : Format<10>; // corresponds to R format in R1
+def FrmF2X6L10 : Format<11>;
+def FrmF3X6 : Format<12>; // corresponds to R format in R1
+def FrmF3X8 : Format<13>; // corresponds to custom format in R1
+// Nios2 R2 instr 16-bit formats:
+def FrmI10 : Format<14>;
+def FrmT1I7 : Format<15>;
+def FrmT2I4 : Format<16>;
+def FrmT1X1I6 : Format<17>;
+def FrmX1I7 : Format<18>;
+def FrmL5I4X1 : Format<19>;
+def FrmT2X1L3 : Format<20>;
+def FrmT2X1I3 : Format<21>;
+def FrmT3X1 : Format<22>;
+def FrmT2X3 : Format<23>;
+def FrmF1X1 : Format<24>;
+def FrmX2L5 : Format<25>;
+def FrmF1I5 : Format<26>;
+def FrmF2 : Format<27>;
-def isNios2r1 : Predicate<"Subtarget->isNios2r1()">;
-def isNios2r2 : Predicate<"Subtarget->isNios2r2()">;
+//===----------------------------------------------------------------------===//
+// Instruction Predicates:
+//===----------------------------------------------------------------------===//
+
+def isNios2r1 : Predicate<"Subtarget->isNios2r1()">;
+def isNios2r2 : Predicate<"Subtarget->isNios2r2()">;
class PredicateControl {
// Predicates related to specific target CPU features
@@ -151,6 +181,27 @@ class FJ<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern,
}
//===----------------------------------------------------------------------===//
+// Format F3X6 (R2) instruction : <|opx|RSV|C|B|A|opcode|>
+//===----------------------------------------------------------------------===//
+
+class F3X6<bits<6> opx, dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin>:
+ Nios2R2Inst32<outs, ins, asmstr, pattern, itin, FrmF3X6> {
+ bits<5> rC;
+ bits<5> rB;
+ bits<5> rA;
+ bits<5> rsv = 0;
+
+ let Opcode = 0x20; /* opcode is always 0x20 (OPX group) for F3X6 instr. */
+
+ let Inst{31-26} = opx; /* opx stands for opcode extension */
+ let Inst{25-21} = rsv;
+ let Inst{20-16} = rC;
+ let Inst{15-11} = rB;
+ let Inst{10-6} = rA;
+}
+
+//===----------------------------------------------------------------------===//
// Multiclasses for common instructions of both R1 and R2:
//===----------------------------------------------------------------------===//
@@ -160,6 +211,7 @@ multiclass CommonInstr_R_F3X6_opx<bits<6> opxR1, bits<6> opxR2, dag outs,
dag ins, string asmstr, list<dag> pattern,
InstrItinClass itin> {
def NAME#_R1 : FR<opxR1, outs, ins, asmstr, pattern, itin>;
+ def NAME#_R2 : F3X6<opxR2, outs, ins, asmstr, pattern, itin>;
}
// Multiclass for instructions that have R format in R1 and F3X6 format in R2
diff --git a/lib/Target/Nios2/Nios2InstrInfo.cpp b/lib/Target/Nios2/Nios2InstrInfo.cpp
index df435d2715d7..9700cba3595b 100644
--- a/lib/Target/Nios2/Nios2InstrInfo.cpp
+++ b/lib/Target/Nios2/Nios2InstrInfo.cpp
@@ -41,3 +41,14 @@ bool Nios2InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MBB.erase(MI);
return true;
}
+
+void Nios2InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
+ unsigned opc = Subtarget.hasNios2r2() ? Nios2::ADD_R2 : Nios2::ADD_R1;
+ BuildMI(MBB, I, DL, get(opc))
+ .addReg(DestReg, RegState::Define)
+ .addReg(Nios2::ZERO)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+}
diff --git a/lib/Target/Nios2/Nios2InstrInfo.h b/lib/Target/Nios2/Nios2InstrInfo.h
index a994d3662db2..52f6e7e9c7c8 100644
--- a/lib/Target/Nios2/Nios2InstrInfo.h
+++ b/lib/Target/Nios2/Nios2InstrInfo.h
@@ -39,6 +39,10 @@ public:
const Nios2RegisterInfo &getRegisterInfo() const { return RI; };
bool expandPostRAPseudo(MachineInstr &MI) const override;
+
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
};
} // namespace llvm
diff --git a/lib/Target/Nios2/Nios2InstrInfo.td b/lib/Target/Nios2/Nios2InstrInfo.td
index 7a39b31a25a8..dee84f74bcbe 100644
--- a/lib/Target/Nios2/Nios2InstrInfo.td
+++ b/lib/Target/Nios2/Nios2InstrInfo.td
@@ -30,6 +30,10 @@ def simm16 : Operand<i32> {
// e.g. addi, andi
def immSExt16 : PatLeaf<(imm), [{ return isInt<16>(N->getSExtValue()); }]>;
+// Custom return SDNode
+def Nios2Ret : SDNode<"Nios2ISD::Ret", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
//===----------------------------------------------------------------------===//
// Instructions specific format
//===----------------------------------------------------------------------===//
@@ -45,6 +49,16 @@ multiclass ArithLogicRegImm16<bits<6> op, string mnemonic, SDNode opNode,
(opNode CPURegs:$rA, immType:$imm))],
IIAlu>;
+// Arithmetic and logical instructions with 3 register operands.
+// Defines the R1 and R2 instructions at the same time.
+multiclass ArithLogicReg<bits<6> opx, string mnemonic,
+ SDNode opNode>:
+ CommonInstr_R_F3X6<opx, (outs CPURegs:$rC),
+ (ins CPURegs:$rA, CPURegs:$rB),
+ !strconcat(mnemonic, "\t$rC, $rA, $rB"),
+ [(set CPURegs:$rC, (opNode CPURegs:$rA, CPURegs:$rB))],
+ IIAlu>;
+
multiclass Return<bits<6> opx, dag outs, dag ins, string mnemonic> {
let rB = 0, rC = 0,
isReturn = 1,
@@ -55,14 +69,31 @@ multiclass Return<bits<6> opx, dag outs, dag ins, string mnemonic> {
}
}
-// Custom return SDNode
-def Nios2Ret : SDNode<"Nios2ISD::Ret", SDTNone,
- [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
-
//===----------------------------------------------------------------------===//
// Nios2 Instructions
//===----------------------------------------------------------------------===//
+/// Arithmetic instructions operating on registers.
+let isCommutable = 1 ,
+ isReMaterializable = 1 in {
+ defm ADD : ArithLogicReg<0x31, "add", add>;
+ defm AND : ArithLogicReg<0x0e, "and", and>;
+ defm OR : ArithLogicReg<0x16, "or", or>;
+ defm XOR : ArithLogicReg<0x1e, "xor", xor>;
+ defm MUL : ArithLogicReg<0x27, "mul", mul>;
+}
+
+let isReMaterializable = 1 in {
+ defm SUB : ArithLogicReg<0x39, "sub", sub>;
+}
+
+defm DIVU : ArithLogicReg<0x24, "divu", udiv>;
+defm DIV : ArithLogicReg<0x25, "div", sdiv>;
+
+defm SLL : ArithLogicReg<0x13, "sll", shl>;
+defm SRL : ArithLogicReg<0x1b, "srl", srl>;
+defm SRA : ArithLogicReg<0x3b, "sra", sra>;
+
/// Arithmetic Instructions (ALU Immediate)
defm ADDI : ArithLogicRegImm16<0x04, "addi", add, simm16, immSExt16>;
diff --git a/lib/Target/Nios2/Nios2TargetObjectFile.h b/lib/Target/Nios2/Nios2TargetObjectFile.h
index 28d7ff0ec668..e9ed6e31d937 100644
--- a/lib/Target/Nios2/Nios2TargetObjectFile.h
+++ b/lib/Target/Nios2/Nios2TargetObjectFile.h
@@ -16,8 +16,6 @@
namespace llvm {
class Nios2TargetObjectFile : public TargetLoweringObjectFileELF {
- const Nios2TargetMachine *TM;
-
public:
Nios2TargetObjectFile() : TargetLoweringObjectFileELF() {}
diff --git a/lib/Target/Nios2/Nios2TargetStreamer.h b/lib/Target/Nios2/Nios2TargetStreamer.h
index 63e4e3ccdc64..1520ac27e94f 100644
--- a/lib/Target/Nios2/Nios2TargetStreamer.h
+++ b/lib/Target/Nios2/Nios2TargetStreamer.h
@@ -22,8 +22,6 @@ public:
// This part is for ascii assembly output
class Nios2TargetAsmStreamer : public Nios2TargetStreamer {
- formatted_raw_ostream &OS;
-
public:
Nios2TargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
};
diff --git a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index d6db354e0215..56307a84f2e5 100644
--- a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -83,6 +83,16 @@ static const MCPhysReg FRegs[32] = {
PPC::F24, PPC::F25, PPC::F26, PPC::F27,
PPC::F28, PPC::F29, PPC::F30, PPC::F31
};
+static const MCPhysReg SPERegs[32] = {
+ PPC::S0, PPC::S1, PPC::S2, PPC::S3,
+ PPC::S4, PPC::S5, PPC::S6, PPC::S7,
+ PPC::S8, PPC::S9, PPC::S10, PPC::S11,
+ PPC::S12, PPC::S13, PPC::S14, PPC::S15,
+ PPC::S16, PPC::S17, PPC::S18, PPC::S19,
+ PPC::S20, PPC::S21, PPC::S22, PPC::S23,
+ PPC::S24, PPC::S25, PPC::S26, PPC::S27,
+ PPC::S28, PPC::S29, PPC::S30, PPC::S31
+};
static const MCPhysReg VFRegs[32] = {
PPC::VF0, PPC::VF1, PPC::VF2, PPC::VF3,
PPC::VF4, PPC::VF5, PPC::VF6, PPC::VF7,
@@ -648,6 +658,16 @@ public:
Inst.addOperand(MCOperand::createReg(QFRegs[getReg()]));
}
+ void addRegSPE4RCOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(RRegs[getReg()]));
+ }
+
+ void addRegSPERCOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(SPERegs[getReg()]));
+ }
+
void addRegCRBITRCOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(CRBITRegs[getCRBit()]));
@@ -1394,6 +1414,12 @@ ExtractModifierFromExpr(const MCExpr *E,
case MCSymbolRefExpr::VK_PPC_HA:
Variant = PPCMCExpr::VK_PPC_HA;
break;
+ case MCSymbolRefExpr::VK_PPC_HIGH:
+ Variant = PPCMCExpr::VK_PPC_HIGH;
+ break;
+ case MCSymbolRefExpr::VK_PPC_HIGHA:
+ Variant = PPCMCExpr::VK_PPC_HIGHA;
+ break;
case MCSymbolRefExpr::VK_PPC_HIGHER:
Variant = PPCMCExpr::VK_PPC_HIGHER;
break;
@@ -1973,6 +1999,10 @@ PPCAsmParser::applyModifierToExpr(const MCExpr *E,
return PPCMCExpr::create(PPCMCExpr::VK_PPC_HI, E, false, Ctx);
case MCSymbolRefExpr::VK_PPC_HA:
return PPCMCExpr::create(PPCMCExpr::VK_PPC_HA, E, false, Ctx);
+ case MCSymbolRefExpr::VK_PPC_HIGH:
+ return PPCMCExpr::create(PPCMCExpr::VK_PPC_HIGH, E, false, Ctx);
+ case MCSymbolRefExpr::VK_PPC_HIGHA:
+ return PPCMCExpr::create(PPCMCExpr::VK_PPC_HIGHA, E, false, Ctx);
case MCSymbolRefExpr::VK_PPC_HIGHER:
return PPCMCExpr::create(PPCMCExpr::VK_PPC_HIGHER, E, false, Ctx);
case MCSymbolRefExpr::VK_PPC_HIGHERA:
diff --git a/lib/Target/PowerPC/CMakeLists.txt b/lib/Target/PowerPC/CMakeLists.txt
index 3f173787114d..ff2776812845 100644
--- a/lib/Target/PowerPC/CMakeLists.txt
+++ b/lib/Target/PowerPC/CMakeLists.txt
@@ -1,15 +1,16 @@
set(LLVM_TARGET_DEFINITIONS PPC.td)
-tablegen(LLVM PPCGenAsmWriter.inc -gen-asm-writer)
tablegen(LLVM PPCGenAsmMatcher.inc -gen-asm-matcher)
+tablegen(LLVM PPCGenAsmWriter.inc -gen-asm-writer)
+tablegen(LLVM PPCGenCallingConv.inc -gen-callingconv)
+tablegen(LLVM PPCGenDAGISel.inc -gen-dag-isel)
tablegen(LLVM PPCGenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM PPCGenFastISel.inc -gen-fast-isel)
+tablegen(LLVM PPCGenInstrInfo.inc -gen-instr-info)
tablegen(LLVM PPCGenMCCodeEmitter.inc -gen-emitter)
tablegen(LLVM PPCGenRegisterInfo.inc -gen-register-info)
-tablegen(LLVM PPCGenInstrInfo.inc -gen-instr-info)
-tablegen(LLVM PPCGenDAGISel.inc -gen-dag-isel)
-tablegen(LLVM PPCGenFastISel.inc -gen-fast-isel)
-tablegen(LLVM PPCGenCallingConv.inc -gen-callingconv)
tablegen(LLVM PPCGenSubtargetInfo.inc -gen-subtarget)
+
add_public_tablegen_target(PowerPCCommonTableGen)
add_llvm_target(PowerPCCodeGen
@@ -49,5 +50,5 @@ add_llvm_target(PowerPCCodeGen
add_subdirectory(AsmParser)
add_subdirectory(Disassembler)
add_subdirectory(InstPrinter)
-add_subdirectory(TargetInfo)
add_subdirectory(MCTargetDesc)
+add_subdirectory(TargetInfo)
diff --git a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
index 11d22377611b..db01271b87e1 100644
--- a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
+++ b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
@@ -226,6 +226,17 @@ static const unsigned QFRegs[] = {
PPC::QF28, PPC::QF29, PPC::QF30, PPC::QF31
};
+static const unsigned SPERegs[] = {
+ PPC::S0, PPC::S1, PPC::S2, PPC::S3,
+ PPC::S4, PPC::S5, PPC::S6, PPC::S7,
+ PPC::S8, PPC::S9, PPC::S10, PPC::S11,
+ PPC::S12, PPC::S13, PPC::S14, PPC::S15,
+ PPC::S16, PPC::S17, PPC::S18, PPC::S19,
+ PPC::S20, PPC::S21, PPC::S22, PPC::S23,
+ PPC::S24, PPC::S25, PPC::S26, PPC::S27,
+ PPC::S28, PPC::S29, PPC::S30, PPC::S31
+};
+
template <std::size_t N>
static DecodeStatus decodeRegisterClass(MCInst &Inst, uint64_t RegNo,
const unsigned (&Regs)[N]) {
@@ -327,6 +338,18 @@ static DecodeStatus DecodeQFRCRegisterClass(MCInst &Inst, uint64_t RegNo,
return decodeRegisterClass(Inst, RegNo, QFRegs);
}
+static DecodeStatus DecodeSPE4RCRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, GPRegs);
+}
+
+static DecodeStatus DecodeSPERCRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, SPERegs);
+}
+
#define DecodeQSRCRegisterClass DecodeQFRCRegisterClass
#define DecodeQBRCRegisterClass DecodeQFRCRegisterClass
@@ -417,6 +440,51 @@ static DecodeStatus decodeMemRIX16Operands(MCInst &Inst, uint64_t Imm,
return MCDisassembler::Success;
}
+static DecodeStatus decodeSPE8Operands(MCInst &Inst, uint64_t Imm,
+ int64_t Address, const void *Decoder) {
+ // Decode the spe8disp field (imm, reg), which has the low 5 bits as the
+ // 8-byte aligned displacement and the next 5 bits as the base register #.
+
+ uint64_t Base = Imm >> 5;
+ uint64_t Disp = Imm & 0x1F;
+
+ assert(Base < 32 && "Invalid base register");
+
+ Inst.addOperand(MCOperand::createImm(Disp << 3));
+ Inst.addOperand(MCOperand::createReg(GP0Regs[Base]));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeSPE4Operands(MCInst &Inst, uint64_t Imm,
+ int64_t Address, const void *Decoder) {
+ // Decode the spe4disp field (imm, reg), which has the low 5 bits as the
+ // 4-byte aligned displacement and the next 5 bits as the base register #.
+
+ uint64_t Base = Imm >> 5;
+ uint64_t Disp = Imm & 0x1F;
+
+ assert(Base < 32 && "Invalid base register");
+
+ Inst.addOperand(MCOperand::createImm(Disp << 2));
+ Inst.addOperand(MCOperand::createReg(GP0Regs[Base]));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeSPE2Operands(MCInst &Inst, uint64_t Imm,
+ int64_t Address, const void *Decoder) {
+ // Decode the spe2disp field (imm, reg), which has the low 5 bits as the
+ // 2-byte aligned displacement and the next 5 bits as the base register #.
+
+ uint64_t Base = Imm >> 5;
+ uint64_t Disp = Imm & 0x1F;
+
+ assert(Base < 32 && "Invalid base register");
+
+ Inst.addOperand(MCOperand::createImm(Disp << 1));
+ Inst.addOperand(MCOperand::createReg(GP0Regs[Base]));
+ return MCDisassembler::Success;
+}
+
static DecodeStatus decodeCRBitMOperand(MCInst &Inst, uint64_t Imm,
int64_t Address, const void *Decoder) {
// The cr bit encoding is 0x80 >> cr_reg_num.
@@ -450,6 +518,11 @@ DecodeStatus PPCDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
decodeInstruction(DecoderTableQPX32, MI, Inst, Address, this, STI);
if (result != MCDisassembler::Fail)
return result;
+ } else if (STI.getFeatureBits()[PPC::FeatureSPE]) {
+ DecodeStatus result =
+ decodeInstruction(DecoderTableSPE32, MI, Inst, Address, this, STI);
+ if (result != MCDisassembler::Fail)
+ return result;
}
return decodeInstruction(DecoderTable32, MI, Inst, Address, this, STI);
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index 2a1de244da92..a405dd70c307 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -18,6 +18,7 @@
#include "llvm/MC/MCMachObjectWriter.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCValue.h"
#include "llvm/Support/ErrorHandling.h"
@@ -74,10 +75,9 @@ namespace {
class PPCAsmBackend : public MCAsmBackend {
const Target &TheTarget;
- bool IsLittleEndian;
public:
- PPCAsmBackend(const Target &T, bool isLittle) : MCAsmBackend(), TheTarget(T),
- IsLittleEndian(isLittle) {}
+ PPCAsmBackend(const Target &T, support::endianness Endian)
+ : MCAsmBackend(Endian), TheTarget(T) {}
unsigned getNumFixupKinds() const override {
return PPC::NumTargetFixupKinds;
@@ -110,12 +110,15 @@ public:
assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
"Invalid kind!");
- return (IsLittleEndian? InfosLE : InfosBE)[Kind - FirstTargetFixupKind];
+ return (Endian == support::little
+ ? InfosLE
+ : InfosBE)[Kind - FirstTargetFixupKind];
}
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsResolved) const override {
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const override {
Value = adjustFixupValue(Fixup.getKind(), Value);
if (!Value) return; // Doesn't change encoding.
@@ -126,7 +129,7 @@ public:
// from the fixup value. The Value has been "split up" into the appropriate
// bitfields above.
for (unsigned i = 0; i != NumBytes; ++i) {
- unsigned Idx = IsLittleEndian ? i : (NumBytes - 1 - i);
+ unsigned Idx = Endian == support::little ? i : (NumBytes - 1 - i);
Data[Offset + i] |= uint8_t((Value >> (Idx * 8)) & 0xff);
}
}
@@ -155,7 +158,8 @@ public:
}
}
- bool mayNeedRelaxation(const MCInst &Inst) const override {
+ bool mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const override {
// FIXME.
return false;
}
@@ -174,12 +178,12 @@ public:
llvm_unreachable("relaxInstruction() unimplemented");
}
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override {
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override {
uint64_t NumNops = Count / 4;
for (uint64_t i = 0; i != NumNops; ++i)
- OW->write32(0x60000000);
+ support::endian::write<uint32_t>(OS, 0x60000000, Endian);
- OW->WriteZeros(Count % 4);
+ OS.write_zeros(Count % 4);
return true;
}
@@ -190,10 +194,6 @@ public:
assert(Name == "ppc32" && "Unknown target name!");
return 4;
}
-
- bool isLittleEndian() const {
- return IsLittleEndian;
- }
};
} // end anonymous namespace
@@ -202,13 +202,12 @@ public:
namespace {
class DarwinPPCAsmBackend : public PPCAsmBackend {
public:
- DarwinPPCAsmBackend(const Target &T) : PPCAsmBackend(T, false) { }
+ DarwinPPCAsmBackend(const Target &T) : PPCAsmBackend(T, support::big) { }
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
bool is64 = getPointerSize() == 8;
return createPPCMachObjectWriter(
- OS,
/*Is64Bit=*/is64,
(is64 ? MachO::CPU_TYPE_POWERPC64 : MachO::CPU_TYPE_POWERPC),
MachO::CPU_SUBTYPE_POWERPC_ALL);
@@ -218,26 +217,29 @@ namespace {
class ELFPPCAsmBackend : public PPCAsmBackend {
uint8_t OSABI;
public:
- ELFPPCAsmBackend(const Target &T, bool IsLittleEndian, uint8_t OSABI) :
- PPCAsmBackend(T, IsLittleEndian), OSABI(OSABI) { }
+ ELFPPCAsmBackend(const Target &T, support::endianness Endian,
+ uint8_t OSABI)
+ : PPCAsmBackend(T, Endian), OSABI(OSABI) {}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
bool is64 = getPointerSize() == 8;
- return createPPCELFObjectWriter(OS, is64, isLittleEndian(), OSABI);
+ return createPPCELFObjectWriter(is64, OSABI);
}
};
} // end anonymous namespace
MCAsmBackend *llvm::createPPCAsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
const MCTargetOptions &Options) {
+ const Triple &TT = STI.getTargetTriple();
if (TT.isOSDarwin())
return new DarwinPPCAsmBackend(T);
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS());
bool IsLittleEndian = TT.getArch() == Triple::ppc64le;
- return new ELFPPCAsmBackend(T, IsLittleEndian, OSABI);
+ return new ELFPPCAsmBackend(
+ T, IsLittleEndian ? support::little : support::big, OSABI);
}
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
index 44ee9733b16e..a3caf9a7a5ee 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
@@ -55,6 +55,10 @@ static MCSymbolRefExpr::VariantKind getAccessVariant(const MCValue &Target,
return MCSymbolRefExpr::VK_PPC_HI;
case PPCMCExpr::VK_PPC_HA:
return MCSymbolRefExpr::VK_PPC_HA;
+ case PPCMCExpr::VK_PPC_HIGH:
+ return MCSymbolRefExpr::VK_PPC_HIGH;
+ case PPCMCExpr::VK_PPC_HIGHA:
+ return MCSymbolRefExpr::VK_PPC_HIGHA;
case PPCMCExpr::VK_PPC_HIGHERA:
return MCSymbolRefExpr::VK_PPC_HIGHERA;
case PPCMCExpr::VK_PPC_HIGHER:
@@ -151,6 +155,12 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
case MCSymbolRefExpr::VK_PPC_HA:
Type = ELF::R_PPC_ADDR16_HA;
break;
+ case MCSymbolRefExpr::VK_PPC_HIGH:
+ Type = ELF::R_PPC64_ADDR16_HIGH;
+ break;
+ case MCSymbolRefExpr::VK_PPC_HIGHA:
+ Type = ELF::R_PPC64_ADDR16_HIGHA;
+ break;
case MCSymbolRefExpr::VK_PPC_HIGHER:
Type = ELF::R_PPC64_ADDR16_HIGHER;
break;
@@ -199,6 +209,12 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
case MCSymbolRefExpr::VK_PPC_TPREL_HA:
Type = ELF::R_PPC_TPREL16_HA;
break;
+ case MCSymbolRefExpr::VK_PPC_TPREL_HIGH:
+ Type = ELF::R_PPC64_TPREL16_HIGH;
+ break;
+ case MCSymbolRefExpr::VK_PPC_TPREL_HIGHA:
+ Type = ELF::R_PPC64_TPREL16_HIGHA;
+ break;
case MCSymbolRefExpr::VK_PPC_TPREL_HIGHER:
Type = ELF::R_PPC64_TPREL16_HIGHER;
break;
@@ -223,6 +239,12 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
case MCSymbolRefExpr::VK_PPC_DTPREL_HA:
Type = ELF::R_PPC64_DTPREL16_HA;
break;
+ case MCSymbolRefExpr::VK_PPC_DTPREL_HIGH:
+ Type = ELF::R_PPC64_DTPREL16_HIGH;
+ break;
+ case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHA:
+ Type = ELF::R_PPC64_DTPREL16_HIGHA;
+ break;
case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHER:
Type = ELF::R_PPC64_DTPREL16_HIGHER;
break;
@@ -417,9 +439,7 @@ bool PPCELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
}
}
-std::unique_ptr<MCObjectWriter>
-llvm::createPPCELFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit,
- bool IsLittleEndian, uint8_t OSABI) {
- auto MOTW = llvm::make_unique<PPCELFObjectWriter>(Is64Bit, OSABI);
- return createELFObjectWriter(std::move(MOTW), OS, IsLittleEndian);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createPPCELFObjectWriter(bool Is64Bit, uint8_t OSABI) {
+ return llvm::make_unique<PPCELFObjectWriter>(Is64Bit, OSABI);
}
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
index 92c8c224b71b..2b948ca60028 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -122,25 +122,18 @@ public:
// Output the constant in big/little endian byte order.
unsigned Size = Desc.getSize();
+ support::endianness E = IsLittleEndian ? support::little : support::big;
switch (Size) {
case 0:
break;
case 4:
- if (IsLittleEndian) {
- support::endian::Writer<support::little>(OS).write<uint32_t>(Bits);
- } else {
- support::endian::Writer<support::big>(OS).write<uint32_t>(Bits);
- }
+ support::endian::write<uint32_t>(OS, Bits, E);
break;
case 8:
// If we emit a pair of instructions, the first one is
// always in the top 32 bits, even on little-endian.
- if (IsLittleEndian) {
- uint64_t Swapped = (Bits << 32) | (Bits >> 32);
- support::endian::Writer<support::little>(OS).write<uint64_t>(Swapped);
- } else {
- support::endian::Writer<support::big>(OS).write<uint64_t>(Bits);
- }
+ support::endian::write<uint32_t>(OS, Bits >> 32, E);
+ support::endian::write<uint32_t>(OS, Bits, E);
break;
default:
llvm_unreachable("Invalid instruction size");
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
index 54f664314578..32e6a0bdd65f 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
@@ -44,6 +44,8 @@ void PPCMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
case VK_PPC_LO: OS << "@l"; break;
case VK_PPC_HI: OS << "@h"; break;
case VK_PPC_HA: OS << "@ha"; break;
+ case VK_PPC_HIGH: OS << "@high"; break;
+ case VK_PPC_HIGHA: OS << "@higha"; break;
case VK_PPC_HIGHER: OS << "@higher"; break;
case VK_PPC_HIGHERA: OS << "@highera"; break;
case VK_PPC_HIGHEST: OS << "@highest"; break;
@@ -75,6 +77,10 @@ PPCMCExpr::evaluateAsInt64(int64_t Value) const {
return (Value >> 16) & 0xffff;
case VK_PPC_HA:
return ((Value + 0x8000) >> 16) & 0xffff;
+ case VK_PPC_HIGH:
+ return (Value >> 16) & 0xffff;
+ case VK_PPC_HIGHA:
+ return ((Value + 0x8000) >> 16) & 0xffff;
case VK_PPC_HIGHER:
return (Value >> 32) & 0xffff;
case VK_PPC_HIGHERA:
@@ -125,6 +131,12 @@ PPCMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
case VK_PPC_HA:
Modifier = MCSymbolRefExpr::VK_PPC_HA;
break;
+ case VK_PPC_HIGH:
+ Modifier = MCSymbolRefExpr::VK_PPC_HIGH;
+ break;
+ case VK_PPC_HIGHA:
+ Modifier = MCSymbolRefExpr::VK_PPC_HIGHA;
+ break;
case VK_PPC_HIGHERA:
Modifier = MCSymbolRefExpr::VK_PPC_HIGHERA;
break;
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
index d42a111cc43e..8bb4791d13dd 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
@@ -23,6 +23,8 @@ public:
VK_PPC_LO,
VK_PPC_HI,
VK_PPC_HA,
+ VK_PPC_HIGH,
+ VK_PPC_HIGHA,
VK_PPC_HIGHER,
VK_PPC_HIGHERA,
VK_PPC_HIGHEST,
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
index 80a74c09a598..316fd2ccf358 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
@@ -27,8 +27,9 @@ class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
-class MCObjectWriter;
+class MCObjectTargetWriter;
class MCRegisterInfo;
+class MCSubtargetInfo;
class MCTargetOptions;
class Target;
class Triple;
@@ -43,20 +44,16 @@ MCCodeEmitter *createPPCMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
MCContext &Ctx);
-MCAsmBackend *createPPCAsmBackend(const Target &T, const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
+MCAsmBackend *createPPCAsmBackend(const Target &T, const MCSubtargetInfo &STI,
+ const MCRegisterInfo &MRI,
const MCTargetOptions &Options);
/// Construct an PPC ELF object writer.
-std::unique_ptr<MCObjectWriter> createPPCELFObjectWriter(raw_pwrite_stream &OS,
- bool Is64Bit,
- bool IsLittleEndian,
- uint8_t OSABI);
+std::unique_ptr<MCObjectTargetWriter> createPPCELFObjectWriter(bool Is64Bit,
+ uint8_t OSABI);
/// Construct a PPC Mach-O object writer.
-std::unique_ptr<MCObjectWriter> createPPCMachObjectWriter(raw_pwrite_stream &OS,
- bool Is64Bit,
- uint32_t CPUType,
- uint32_t CPUSubtype);
+std::unique_ptr<MCObjectTargetWriter>
+createPPCMachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype);
/// Returns true iff Val consists of one contiguous run of 1s with any number of
/// 0s on either side. The 1s are allowed to wrap from LSB to MSB, so
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
index 4b9055ec7041..ff6cf584da23 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
@@ -374,10 +374,8 @@ void PPCMachObjectWriter::RecordPPCRelocation(
Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
}
-std::unique_ptr<MCObjectWriter>
-llvm::createPPCMachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit,
- uint32_t CPUType, uint32_t CPUSubtype) {
- return createMachObjectWriter(
- llvm::make_unique<PPCMachObjectWriter>(Is64Bit, CPUType, CPUSubtype), OS,
- /*IsLittleEndian=*/false);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createPPCMachObjectWriter(bool Is64Bit, uint32_t CPUType,
+ uint32_t CPUSubtype) {
+ return llvm::make_unique<PPCMachObjectWriter>(Is64Bit, CPUType, CPUSubtype);
}
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
index 603ac960133f..fe7e7aeeb182 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
@@ -50,6 +50,9 @@ namespace PPC {
PRED_UN_PLUS = (3 << 5) | 15,
PRED_NU_PLUS = (3 << 5) | 7,
+ // SPE scalar compare instructions always set the GT bit.
+ PRED_SPE = PRED_GT,
+
// When dealing with individual condition-register bits, we have simple set
// and unset predicates.
PRED_BIT_SET = 1024,
diff --git a/lib/Target/PowerPC/P9InstrResources.td b/lib/Target/PowerPC/P9InstrResources.td
index dc6ed16e53ce..34df8452fe16 100644
--- a/lib/Target/PowerPC/P9InstrResources.td
+++ b/lib/Target/PowerPC/P9InstrResources.td
@@ -7,10 +7,11 @@
//
//===----------------------------------------------------------------------===//
//
-// This file defines resources required by some of P9 instruction. This is part
-// P9 processor model used for instruction scheduling. Not every instruction
-// is listed here. Instructions in this file belong to itinerary classes that
-// have instructions with different resource requirements.
+// This file defines the resources required by P9 instructions. This is part
+// of the P9 processor model used for instruction scheduling. This file should
+// contain all of the instructions that may be used on Power 9. This includes
+// not just instructions that are new on Power 9, but also instructions that
+// were available on earlier architectures and are still used on Power 9.
//
// The makeup of the P9 CPU is modeled as follows:
// - Each CPU is made up of two superslices.
@@ -31,85 +32,37 @@
//===----------------------------------------------------------------------===//
// Two cycle ALU vector operation that uses an entire superslice.
-// Uses both ALU units (the even ALUE and odd ALUO units), two pipelines
-// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice.
+// Uses both ALU units (the even ALUE and odd ALUO units), two pipelines
+// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice.
def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C,
DISP_1C, DISP_1C, DISP_1C],
(instrs
- VADDCUW,
- VADDUBM,
- VADDUDM,
- VADDUHM,
- VADDUWM,
- VAND,
- VANDC,
- VCMPEQUB,
- VCMPEQUD,
- VCMPEQUH,
- VCMPEQUW,
- VCMPNEB,
- VCMPNEH,
- VCMPNEW,
- VCMPNEZB,
- VCMPNEZH,
- VCMPNEZW,
+ (instregex "VADDU(B|H|W|D)M$"),
+ (instregex "VAND(C)?$"),
+ (instregex "VEXTS(B|H|W)2(D|W)(s)?$"),
+ (instregex "V_SET0(B|H)?$"),
+ (instregex "VS(R|L)(B|H|W|D)$"),
+ (instregex "VSUBU(B|H|W|D)M$"),
+ (instregex "VPOPCNT(B|H)$"),
+ (instregex "VRL(B|H|W|D)$"),
+ (instregex "VSRA(B|H|W|D)$"),
+ (instregex "XV(N)?ABS(D|S)P$"),
+ (instregex "XVCPSGN(D|S)P$"),
+ (instregex "XV(I|X)EXP(D|S)P$"),
+ (instregex "VRL(D|W)(MI|NM)$"),
+ (instregex "VMRG(E|O)W$"),
+ MTVSRDD,
VEQV,
- VEXTSB2D,
- VEXTSB2W,
- VEXTSH2D,
- VEXTSH2W,
- VEXTSW2D,
- VRLB,
- VRLD,
- VRLDMI,
- VRLDNM,
- VRLH,
- VRLW,
- VRLWMI,
- VRLWNM,
- VSRAB,
- VSRAD,
- VSRAH,
- VSRAW,
- VSRB,
- VSRD,
- VSRH,
- VSRW,
- VSLB,
- VSLD,
- VSLH,
- VSLW,
- VMRGEW,
- VMRGOW,
VNAND,
VNEGD,
VNEGW,
VNOR,
VOR,
VORC,
- VPOPCNTB,
- VPOPCNTH,
VSEL,
- VSUBUBM,
- VSUBUDM,
- VSUBUHM,
- VSUBUWM,
VXOR,
- V_SET0B,
- V_SET0H,
- V_SET0,
- XVABSDP,
- XVABSSP,
- XVCPSGNDP,
- XVCPSGNSP,
- XVIEXPDP,
- XVNABSDP,
- XVNABSSP,
XVNEGDP,
XVNEGSP,
- XVXEXPDP,
- XVIEXPSP,
- XVXEXPSP,
XXLAND,
XXLANDC,
XXLEQV,
@@ -119,6 +72,9 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C,
XXLORf,
XXLORC,
XXLXOR,
+ XXLXORdpz,
+ XXLXORspz,
+ XXLXORz,
XXSEL,
XSABSQP,
XSCPSGNQP,
@@ -129,54 +85,89 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C,
)>;
// Restricted Dispatch ALU operation for 3 cycles. The operation runs on a
-// slingle slice. However, since it is Restricted it requires all 3 dispatches
-// (DISP) for that superslice.
+// single slice. However, since it is Restricted it requires all 3 dispatches
+// (DISP) for that superslice.
def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
- FCMPUS,
- FCMPUD,
- XSTSTDCDP,
- XSTSTDCSP
+ (instregex "TABORT(D|W)C(I)?$"),
+ (instregex "MTFSB(0|1)$"),
+ (instregex "MFFSC(D)?RN(I)?$"),
+ (instregex "CMPRB(8)?$"),
+ (instregex "TD(I)?$"),
+ (instregex "TW(I)?$"),
+ (instregex "FCMPU(S|D)$"),
+ (instregex "XSTSTDC(S|D)P$"),
+ FTDIV,
+ FTSQRT,
+ CMPEQB
)>;
// Standard Dispatch ALU operation for 3 cycles. Only one slice used.
def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C],
(instrs
- XSMAXCDP,
- XSMAXDP,
- XSMAXJDP,
- XSMINCDP,
- XSMINDP,
- XSMINJDP,
+ (instregex "XSMAX(C|J)?DP$"),
+ (instregex "XSMIN(C|J)?DP$"),
+ (instregex "XSCMP(EQ|EXP|GE|GT|O|U)DP$"),
+ (instregex "CNT(L|T)Z(D|W)(8)?(o)?$"),
+ (instregex "POPCNT(D|W)$"),
+ (instregex "CMPB(8)?$"),
XSTDIVDP,
XSTSQRTDP,
- XSCMPEQDP,
- XSCMPEXPDP,
- XSCMPGEDP,
- XSCMPGTDP,
- XSCMPODP,
- XSCMPUDP,
XSXSIGDP,
- XSCVSPDPN
+ XSCVSPDPN,
+ SETB,
+ BPERMD
)>;
// Standard Dispatch ALU operation for 2 cycles. Only one slice used.
def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C],
(instrs
- ADDIStocHA,
- ADDItocL,
+ (instregex "S(L|R)D$"),
+ (instregex "SRAD(I)?$"),
+ (instregex "EXTSWSLI$"),
+ (instregex "MFV(S)?RD$"),
+ (instregex "MTVSRD$"),
+ (instregex "MTVSRW(A|Z)$"),
+ (instregex "CMP(WI|LWI|W|LW)(8)?$"),
+ (instregex "CMP(L)?D(I)?$"),
+ (instregex "SUBF(I)?C(8)?$"),
+ (instregex "ANDI(S)?o(8)?$"),
+ (instregex "ADDC(8)?$"),
+ (instregex "ADDIC(8)?(o)?$"),
+ (instregex "ADD(8|4)(o)?$"),
+ (instregex "ADD(E|ME|ZE)(8)?(o)?$"),
+ (instregex "SUBF(E|ME|ZE)?(8)?(o)?$"),
+ (instregex "NEG(8)?(o)?$"),
+ (instregex "POPCNTB$"),
+ (instregex "ADD(I|IS)?(8)?$"),
+ (instregex "LI(S)?(8)?$"),
+ (instregex "(X)?OR(I|IS)?(8)?(o)?$"),
+ (instregex "NAND(8)?(o)?$"),
+ (instregex "AND(C)?(8)?(o)?$"),
+ (instregex "NOR(8)?(o)?$"),
+ (instregex "OR(C)?(8)?(o)?$"),
+ (instregex "EQV(8)?(o)?$"),
+ (instregex "EXTS(B|H|W)(8)?(_32)?(_64)?(o)?$"),
+ (instregex "ADD(4|8)(TLS)?(_)?$"),
+ (instregex "NEG(8)?$"),
+ (instregex "ADDI(S)?toc(HA|L)$"),
+ COPY,
MCRF,
MCRXRX,
- SLD,
- SRD,
- SRAD,
- SRADI,
- RLDIC,
XSNABSDP,
XSXEXPDP,
XSABSDP,
XSNEGDP,
- XSCPSGNDP
+ XSCPSGNDP,
+ MFVSRWZ,
+ SRADI_32,
+ RLDIC,
+ RFEBB,
+ LA,
+ TBEGIN,
+ TRECHKPT,
+ NOP,
+ WAIT
)>;
// Restricted Dispatch ALU operation for 2 cycles. The operation runs on a
@@ -184,80 +175,50 @@ def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C],
// (DISP) for that superslice.
def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
- RLDCL,
- RLDCR,
+ (instregex "RLDC(L|R)$"),
+ (instregex "RLWIMI(8)?$"),
+ (instregex "RLDIC(L|R)(_32)?(_64)?$"),
+ (instregex "M(F|T)OCRF(8)?$"),
+ (instregex "CR(6)?(UN)?SET$"),
+ (instregex "CR(N)?(OR|AND)(C)?$"),
+ (instregex "S(L|R)W(8)?$"),
+ (instregex "RLW(INM|NM)(8)?$"),
+ (instregex "F(N)?ABS(D|S)$"),
+ (instregex "FNEG(D|S)$"),
+ (instregex "FCPSGN(D|S)$"),
+ (instregex "SRAW(I)?$"),
+ (instregex "ISEL(8)?$"),
RLDIMI,
- RLDICL,
- RLDICR,
- RLDICL_32_64,
XSIEXPDP,
FMR,
- FABSD,
- FABSS,
- FNABSD,
- FNABSS,
- FNEGD,
- FNEGS,
- FCPSGND,
- FCPSGNS
+ CREQV,
+ CRXOR,
+ TRECLAIM,
+ TSR,
+ TABORT
)>;
// Three cycle ALU vector operation that uses an entire superslice.
-// Uses both ALU units (the even ALUE and odd ALUO units), two pipelines
-// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice.
+// Uses both ALU units (the even ALUE and odd ALUO units), two pipelines
+// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice.
def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C,
DISP_1C, DISP_1C, DISP_1C],
(instrs
+ (instregex "M(T|F)VSCR$"),
+ (instregex "VCMPNEZ(B|H|W)$"),
+ (instregex "VCMPEQU(B|H|W|D)$"),
+ (instregex "VCMPNE(B|H|W)$"),
+ (instregex "VABSDU(B|H|W)$"),
+ (instregex "VADDU(B|H|W)S$"),
+ (instregex "VAVG(S|U)(B|H|W)$"),
+ (instregex "VCMP(EQ|GE|GT)FP(o)?$"),
+ (instregex "VCMPBFP(o)?$"),
+ (instregex "VC(L|T)Z(B|H|W|D)$"),
+ (instregex "VADDS(B|H|W)S$"),
+ (instregex "V(MIN|MAX)FP$"),
+ (instregex "V(MIN|MAX)(S|U)(B|H|W|D)$"),
VBPERMD,
- VABSDUB,
- VABSDUH,
- VABSDUW,
- VADDUBS,
- VADDUHS,
- VADDUWS,
- VAVGSB,
- VAVGSH,
- VAVGSW,
- VAVGUB,
- VAVGUH,
- VAVGUW,
- VCMPEQFP,
- VCMPEQFPo,
- VCMPGEFP,
- VCMPGEFPo,
- VCMPBFP,
- VCMPBFPo,
- VCMPGTFP,
- VCMPGTFPo,
- VCLZB,
- VCLZD,
- VCLZH,
- VCLZW,
- VCTZB,
- VCTZD,
- VCTZH,
- VCTZW,
- VADDSBS,
- VADDSHS,
- VADDSWS,
- VMINFP,
- VMINSB,
- VMINSD,
- VMINSH,
- VMINSW,
- VMINUB,
- VMINUD,
- VMINUH,
- VMINUW,
- VMAXFP,
- VMAXSB,
- VMAXSD,
- VMAXSH,
- VMAXSW,
- VMAXUB,
- VMAXUD,
- VMAXUH,
- VMAXUW,
+ VADDCUW,
VPOPCNTW,
VPOPCNTD,
VPRTYBD,
@@ -434,47 +395,38 @@ def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C,
VSUMSWS
)>;
+
+// 5 cycle Restricted DP operation. One DP unit, one EXEC pipeline and all three
+// dispatch units for the superslice.
+def : InstRW<[P9_DP_5C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "MADD(HD|HDU|LD)$"),
+ (instregex "MUL(HD|HW|LD|LI|LI8|LW)(U)?$")
+)>;
+
// 7 cycle Restricted DP operation. One DP unit, one EXEC pipeline and all three
// dispatch units for the superslice.
def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
FRSP,
- FRIND,
- FRINS,
- FRIPD,
- FRIPS,
- FRIZD,
- FRIZS,
- FRIMD,
- FRIMS,
- FRE,
- FRES,
- FRSQRTE,
- FRSQRTES,
- FMADDS,
- FMADD,
- FMSUBS,
- FMSUB,
+ (instregex "FRI(N|P|Z|M)(D|S)$"),
+ (instregex "FRE(S)?$"),
+ (instregex "FADD(S)?$"),
+ (instregex "FMSUB(S)?$"),
+ (instregex "FMADD(S)?$"),
+ (instregex "FSUB(S)?$"),
+ (instregex "FCFID(U)?(S)?$"),
+ (instregex "FCTID(U)?(Z)?$"),
+ (instregex "FCTIW(U)?(Z)?$"),
+ (instregex "FRSQRTE(S)?$"),
FNMADDS,
FNMADD,
FNMSUBS,
FNMSUB,
FSELD,
FSELS,
- FADDS,
FMULS,
FMUL,
- FSUBS,
- FCFID,
- FCTID,
- FCTIDZ,
- FCFIDU,
- FCFIDS,
- FCFIDUS,
- FCTIDUZ,
- FCTIWUZ,
- FCTIW,
- FCTIWZ,
XSMADDADP,
XSMADDASP,
XSMADDMDP,
@@ -495,16 +447,40 @@ def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
XSNMSUBMSP
)>;
-// 7 cycle Restricted DP operation and one 2 cycle ALU operation.
+// 7 cycle Restricted DP operation and one 3 cycle ALU operation.
+// These operations can be done in parallel.
// The DP is restricted so we need a full 5 dispatches.
-def : InstRW<[P9_DPOpAndALUOp_9C, IP_EXEC_1C, IP_EXEC_1C,
+def : InstRW<[P9_DP_7C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
- FMULo,
- FMADDo,
- FMSUBo,
- FNMADDo,
- FNMSUBo
+ (instregex "FSEL(D|S)o$")
+)>;
+
+// 5 Cycle Restricted DP operation and one 2 cycle ALU operation.
+def : InstRW<[P9_DPOpAndALUOp_7C, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "MUL(H|L)(D|W)(U)?o$")
+)>;
+
+// 7 cycle Restricted DP operation and one 3 cycle ALU operation.
+// These operations must be done sequentially.
+// The DP is restricted so we need a full 5 dispatches.
+def : InstRW<[P9_DPOpAndALU2Op_10C, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "FRI(N|P|Z|M)(D|S)o$"),
+ (instregex "FRE(S)?o$"),
+ (instregex "FADD(S)?o$"),
+ (instregex "FSUB(S)?o$"),
+ (instregex "F(N)?MSUB(S)?o$"),
+ (instregex "F(N)?MADD(S)?o$"),
+ (instregex "FCFID(U)?(S)?o$"),
+ (instregex "FCTID(U)?(Z)?o$"),
+ (instregex "FCTIW(U)?(Z)?o$"),
+ (instregex "FMUL(S)?o$"),
+ (instregex "FRSQRTE(S)?o$"),
+ FRSPo
)>;
// 7 cycle DP operation. One DP unit, one EXEC pipeline and two dispatch units.
@@ -520,6 +496,8 @@ def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C],
XSCVDPUXDS,
XSCVDPUXDSs,
XSCVDPUXWS,
+ XSCVDPSXWSs,
+ XSCVDPUXWSs,
XSCVHPDP,
XSCVSPDP,
XSCVSXDDP,
@@ -533,12 +511,12 @@ def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C],
XSRDPIZ,
XSREDP,
XSRESP,
- //XSRSP,
XSRSQRTEDP,
XSRSQRTESP,
XSSUBDP,
XSSUBSP,
- XSCVDPSPN
+ XSCVDPSPN,
+ XSRSP
)>;
// Three Cycle PM operation. Only one PM unit per superslice so we use the whole
@@ -546,13 +524,18 @@ def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C],
// dispatches.
def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
+ (instregex "LVS(L|R)$"),
+ (instregex "VSPLTIS(W|H|B)$"),
+ (instregex "VSPLT(W|H|B)(s)?$"),
+ (instregex "V_SETALLONES(B|H)?$"),
+ (instregex "VEXTRACTU(B|H|W)$"),
+ (instregex "VINSERT(B|H|W|D)$"),
+ MFVSRLD,
+ MTVSRWS,
VBPERMQ,
VCLZLSBB,
VCTZLSBB,
VEXTRACTD,
- VEXTRACTUB,
- VEXTRACTUH,
- VEXTRACTUW,
VEXTUBLX,
VEXTUBRX,
VEXTUHLX,
@@ -560,10 +543,6 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C],
VEXTUWLX,
VEXTUWRX,
VGBBD,
- VINSERTB,
- VINSERTD,
- VINSERTH,
- VINSERTW,
VMRGHB,
VMRGHH,
VMRGHW,
@@ -591,14 +570,6 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C],
VSLDOI,
VSLO,
VSLV,
- VSPLTB,
- VSPLTBs,
- VSPLTH,
- VSPLTHs,
- VSPLTISB,
- VSPLTISH,
- VSPLTISW,
- VSPLTW,
VSR,
VSRO,
VSRV,
@@ -642,7 +613,17 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C],
XSCMPOQP,
XSCMPUQP,
XSTSTDCQP,
- XSXSIGQP
+ XSXSIGQP,
+ BCDCFNo,
+ BCDCFZo,
+ BCDCPSGNo,
+ BCDCTNo,
+ BCDCTZo,
+ BCDSETSGNo,
+ BCDSo,
+ BCDTRUNCo,
+ BCDUSo,
+ BCDUTRUNCo
)>;
// 12 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
@@ -650,6 +631,7 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C],
// dispatches.
def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
+ BCDSRo,
XSADDQP,
XSADDQPO,
XSCVDPQP,
@@ -662,11 +644,20 @@ def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
XSCVSDQP,
XSCVUDQP,
XSRQPI,
+ XSRQPIX,
XSRQPXP,
XSSUBQP,
XSSUBQPO
)>;
+// 23 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_DFU_23C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ BCDCTSQo
+)>;
+
// 24 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
// dispatches.
@@ -684,6 +675,14 @@ def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
XSNMSUBQPO
)>;
+// 37 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_DFU_37C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ BCDCFSQo
+)>;
+
// 58 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
// dispatches.
@@ -702,23 +701,58 @@ def : InstRW<[P9_DFU_76C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
XSSQRTQPO
)>;
-// 5 Cycle load uses a single slice.
+// 6 Cycle Load uses a single slice.
+def : InstRW<[P9_LS_6C, IP_AGEN_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "LXVL(L)?")
+)>;
+
+// 5 Cycle Load uses a single slice.
def : InstRW<[P9_LS_5C, IP_AGEN_1C, DISP_1C, DISP_1C],
(instrs
+ (instregex "LVE(B|H|W)X$"),
+ (instregex "LVX(L)?"),
+ (instregex "LXSI(B|H)ZX$"),
LXSDX,
+ LXVB16X,
LXVD2X,
+ LXVWSX,
LXSIWZX,
LXV,
LXVX,
LXSD,
DFLOADf64,
- XFLOADf64
+ XFLOADf64,
+ LIWZX
)>;
-// 4 Cycle load uses a single slice.
+// 4 Cycle Load uses a single slice.
def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C],
(instrs
- COPY
+ (instregex "DCB(F|T|ST)(EP)?$"),
+ (instregex "DCBZ(L)?(EP)?$"),
+ (instregex "DCBTST(EP)?$"),
+ (instregex "CP_COPY(8)?$"),
+ (instregex "CP_PASTE(8)?$"),
+ (instregex "ICBI(EP)?$"),
+ (instregex "ICBT(LS)?$"),
+ (instregex "LBARX(L)?$"),
+ (instregex "LBZ(CIX|8|X|X8|XTLS|XTLS_32)?(_)?$"),
+ (instregex "LD(ARX|ARXL|BRX|CIX|X|XTLS)?(_)?$"),
+ (instregex "LH(A|B)RX(L)?(8)?$"),
+ (instregex "LHZ(8|CIX|X|X8|XTLS|XTLS_32)?(_)?$"),
+ (instregex "LWARX(L)?$"),
+ (instregex "LWBRX(8)?$"),
+ (instregex "LWZ(8|CIX|X|X8|XTLS|XTLS_32)?(_)?$"),
+ CP_ABORT,
+ DARN,
+ EnforceIEIO,
+ ISYNC,
+ MSGSYNC,
+ TLBSYNC,
+ SYNC,
+ LMW,
+ LSWI
)>;
// 4 Cycle Restricted load uses a single slice but the dispatch for the whole
@@ -730,6 +764,58 @@ def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
LFD
)>;
+// Cracked Load Instructions.
+// Load instructions that can be done in parallel.
+def : InstRW<[P9_LS_4C, P9_LS_4C, IP_AGEN_1C, IP_AGEN_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ SLBIA,
+ SLBIE,
+ SLBMFEE,
+ SLBMFEV,
+ SLBMTE,
+ TLBIEL
+)>;
+
+// Cracked Load Instruction.
+// Requires Load and ALU pieces totaling 6 cycles. The Load and ALU
+// operations can be run in parallel.
+def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_EXEC_1C, IP_AGEN_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "L(W|H)ZU(X)?(8)?$"),
+ TEND
+)>;
+
+// Cracked Store Instruction
+// Consecutive Store and ALU instructions. The store is restricted and requires
+// three dispatches.
+def : InstRW<[P9_StoreAndALUOp_3C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "ST(B|H|W|D)CX$")
+)>;
+
+// Cracked Load Instruction.
+// Two consecutive load operations for a total of 8 cycles.
+def : InstRW<[P9_LoadAndLoadOp_8C, IP_AGEN_1C, IP_AGEN_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ LDMX
+)>;
+
+// Cracked Load instruction.
+// Requires consecutive Load and ALU pieces totaling 6 cycles. The Load and ALU
+// operations cannot be done at the same time and so their latencies are added.
+def : InstRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "LHA(X)?(8)?$"),
+ (instregex "CP_PASTE(8)?o$"),
+ (instregex "LWA(X)?(_32)?$"),
+ TCHECK
+)>;
+
// Cracked Restricted Load instruction.
// Requires consecutive Load and ALU pieces totaling 6 cycles. The Load and ALU
// operations cannot be done at the same time and so their latencies are added.
@@ -737,9 +823,7 @@ def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
def : InstRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
- LFIWAX,
- LFSX,
- LFS
+ LFIWAX
)>;
// Cracked Load instruction.
@@ -749,13 +833,42 @@ def : InstRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C,
def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
- LXSSPX,
LXSIWAX,
+ LIWAX
+)>;
+
+// Cracked Load instruction.
+// Requires consecutive Load (4 cycles) and ALU (3 cycles) pieces totaling 7
+// cycles. The Load and ALU operations cannot be done at the same time and so
+// their latencies are added.
+// Full 6 dispatches are required as this is a restricted instruction.
+def : InstRW<[P9_LoadAndALU2Op_7C, IP_AGEN_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ LFSX,
+ LFS
+)>;
+
+// Cracked Load instruction.
+// Requires consecutive Load and ALU pieces totaling 8 cycles. The Load and ALU
+// operations cannot be done at the same time and so their latencies are added.
+// Full 4 dispatches are required as this is a cracked instruction.
+def : InstRW<[P9_LoadAndALU2Op_8C, IP_AGEN_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
LXSSP,
- DFLOADf32,
+ LXSSPX,
XFLOADf32,
- LIWAX,
- LIWZX
+ DFLOADf32
+)>;
+
+// Cracked 3-Way Load Instruction
+// Load with two ALU operations that depend on each other
+def : InstRW<[P9_LoadAndALUOp_6C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "LHAU(X)?(8)?$"),
+ LWAUX
)>;
// Cracked Load that requires the PM resource.
@@ -767,8 +880,8 @@ def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C,
def : InstRW<[P9_LoadAndPMOp_8C, IP_AGEN_1C, IP_EXECE_1C, IP_EXECO_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
+ LXVH8X,
LXVDSX,
- LXVWSX,
LXVW4X
)>;
@@ -776,29 +889,52 @@ def : InstRW<[P9_LoadAndPMOp_8C, IP_AGEN_1C, IP_EXECE_1C, IP_EXECO_1C,
// all three dispatches for the superslice.
def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
- STFS,
- STFD,
- STFIWX,
- STFSX,
- STFDX,
- STXSDX,
- STXSSPX,
- STXSIWX,
- DFSTOREf32,
- DFSTOREf64,
- XFSTOREf32,
- XFSTOREf64,
- STIWX
-)>;
-
-// Store operation that requires the whole superslice.
+ (instregex "STF(S|D|IWX|SX|DX)$"),
+ (instregex "STXS(D|DX|SPX|IWX|IBX|IHX|SP)(v)?$"),
+ (instregex "STW(8)?$"),
+ (instregex "(D|X)FSTORE(f32|f64)$"),
+ (instregex "ST(W|H|D)BRX$"),
+ (instregex "ST(B|H|D)(8)?$"),
+ (instregex "ST(B|W|H|D)(CI)?X(TLS|TLS_32)?(8)?(_)?$"),
+ STIWX,
+ SLBIEG,
+ STMW,
+ STSWI,
+ TLBIE
+)>;
+
+// Vector Store Instruction
+// Requires the whole superslice and therefore requires all three dispatches
+// as well as both the Even and Odd exec pipelines.
def : InstRW<[P9_LS_1C, IP_EXECE_1C, IP_EXECO_1C, IP_AGEN_1C,
DISP_1C, DISP_1C, DISP_1C],
(instrs
- STXVD2X,
- STXVW4X
+ (instregex "STVE(B|H|W)X$"),
+ (instregex "STVX(L)?$"),
+ (instregex "STXV(B16X|H8X|W4X|D2X|L|LL|X)?$")
+)>;
+
+// 5 Cycle DIV operation. Only one DIV unit per superslice so we use the whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_DIV_5C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "MTCTR(8)?(loop)?$"),
+ (instregex "MTLR(8)?$")
)>;
+// 12 Cycle DIV operation. Only one DIV unit per superslice so we use the whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_DIV_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "M(T|F)VRSAVE(v)?$"),
+ (instregex "M(T|F)PMR$"),
+ (instregex "M(T|F)TB(8)?$"),
+ (instregex "MF(SPR|CTR|LR)(8)?$"),
+ (instregex "M(T|F)MSR(D)?$"),
+ (instregex "MTSPR(8)?$")
+)>;
// 16 Cycle DIV operation. Only one DIV unit per superslice so we use the whole
// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
@@ -839,6 +975,15 @@ def : InstRW<[P9_DIV_40C_8, IP_EXECO_1C, IP_EXECE_1C,
// Cracked DIV and ALU operation. Requires one full slice for the ALU operation
// and one full superslice for the DIV operation since there is only one DIV
// per superslice. Latency of DIV plus ALU is 26.
+def : InstRW<[P9_IntDivAndALUOp_18C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "DIVW(U)?(O)?o$")
+)>;
+
+// Cracked DIV and ALU operation. Requires one full slice for the ALU operation
+// and one full superslice for the DIV operation since there is only one DIV
+// per superslice. Latency of DIV plus ALU is 26.
def : InstRW<[P9_IntDivAndALUOp_26C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
@@ -868,16 +1013,40 @@ def : InstRW<[P9_IntDivAndALUOp_42C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C,
def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
- MTOCRF,
- MTOCRF8,
MTCRF,
MTCRF8
)>;
-// Cracked, restricted, ALU operations.
+// Cracked ALU operations.
// Here the two ALU ops can actually be done in parallel and therefore the
// latencies are not added together. Otherwise this is like having two
-// instructions running together on two pipelines and 6 dispatches.
+// instructions running together on two pipelines and 4 dispatches.
+// ALU ops are 2 cycles each.
+def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "ADDC(8)?o$"),
+ (instregex "SUBFC(8)?o$")
+)>;
+
+// Cracked ALU operations.
+// Two ALU ops can be done in parallel.
+// One is a three cycle ALU, the other is a two cycle ALU.
+// One of the ALU ops is restricted, the other is not, so we have a total of
+// 5 dispatches.
+def : InstRW<[P9_ALU_2C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "F(N)?ABS(D|S)o$"),
+ (instregex "FCPSGN(D|S)o$"),
+ (instregex "FNEG(D|S)o$"),
+ FMRo
+)>;
+
+// Cracked ALU operations.
+// Here the two ALU ops can actually be done in parallel and therefore the
+// latencies are not added together. Otherwise this is like having two
+// instructions running together on two pipelines and 4 dispatches.
// ALU ops are 3 cycles each.
def : InstRW<[P9_ALU_3C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C],
@@ -885,7 +1054,63 @@ def : InstRW<[P9_ALU_3C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C,
MCRFS
)>;
-// FP Div instructions in IIC_FPDivD and IIC_FPDivS.
+// Cracked Restricted ALU operations.
+// Here the two ALU ops can actually be done in parallel and therefore the
+// latencies are not added together. Otherwise this is like having two
+// instructions running together on two pipelines and 6 dispatches.
+// ALU ops are 3 cycles each.
+def : InstRW<[P9_ALU_3C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "MTFSF(b|o)?$"),
+ (instregex "MTFSFI(o)?$")
+)>;
+
+// Cracked instruction made of two ALU ops.
+// The two ops cannot be done in parallel.
+// One of the ALU ops is restricted and takes 3 dispatches.
+def : InstRW<[P9_ALUOpAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "RLD(I)?C(R|L)o$"),
+ (instregex "RLW(IMI|INM|NM)(8)?o$"),
+ (instregex "SLW(8)?o$"),
+ (instregex "SRAW(I)?o$"),
+ (instregex "SRW(8)?o$"),
+ RLDICL_32o,
+ RLDIMIo
+)>;
+
+// Cracked instruction made of two ALU ops.
+// The two ops cannot be done in parallel.
+// Both of the ALU ops are restricted and take 3 dispatches.
+def : InstRW<[P9_ALU2OpAndALU2Op_6C, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "MFFS(L|CE|o)?$")
+)>;
+
+// Cracked ALU instruction composed of three consecutive 2 cycle ALU ops for a
+// total of 6 cycles. All of the ALU operations are also restricted so each
+// takes 3 dispatches for a total of 9.
+def : InstRW<[P9_ALUOpAndALUOpAndALUOp_6C, IP_EXEC_1C, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C,
+ DISP_1C, DISP_1C],
+ (instrs
+ (instregex "MFCR(8)?$")
+)>;
+
+// Cracked instruction made of two ALU ops.
+// The two ops cannot be done in parallel.
+def : InstRW<[P9_ALUOpAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "EXTSWSLIo$"),
+ (instregex "SRAD(I)?o$"),
+ SLDo,
+ SRDo,
+ RLDICo
+)>;
// 33 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches.
def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
@@ -893,13 +1118,66 @@ def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
FDIV
)>;
-// 33 Cycle DP Instruction Restricted and Cracked with 2 Cycle ALU.
-def : InstRW<[P9_DPOpAndALUOp_35C_8, IP_EXEC_1C, IP_EXEC_1C,
+// 33 Cycle DP Instruction Restricted and Cracked with 3 Cycle ALU.
+def : InstRW<[P9_DPOpAndALU2Op_36C_8, IP_EXEC_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
FDIVo
)>;
+// 36 Cycle DP Instruction.
+// Instruction can be done on a single slice.
+def : InstRW<[P9_DP_36C_10, IP_EXEC_1C, DISP_1C, DISP_1C],
+ (instrs
+ XSSQRTDP
+)>;
+
+// 36 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches.
+def : InstRW<[P9_DP_36C_10, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ FSQRT
+)>;
+
+// 36 Cycle DP Vector Instruction.
+def : InstRW<[P9_DPE_36C_10, P9_DPO_36C_10, IP_EXECE_1C, IP_EXECO_1C,
+ DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ XVSQRTDP
+)>;
+
+// 27 Cycle DP Vector Instruction.
+def : InstRW<[P9_DPE_27C_10, P9_DPO_27C_10, IP_EXECE_1C, IP_EXECO_1C,
+ DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ XVSQRTSP
+)>;
+
+// 36 Cycle DP Instruction Restricted and Cracked with 3 Cycle ALU.
+def : InstRW<[P9_DPOpAndALU2Op_39C_10, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ FSQRTo
+)>;
+
+// 26 Cycle DP Instruction.
+def : InstRW<[P9_DP_26C_5, IP_EXEC_1C, DISP_1C, DISP_1C],
+ (instrs
+ XSSQRTSP
+)>;
+
+// 26 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches.
+def : InstRW<[P9_DP_26C_5, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ FSQRTS
+)>;
+
+// 26 Cycle DP Instruction Restricted and Cracked with 3 Cycle ALU.
+def : InstRW<[P9_DPOpAndALU2Op_29C_5, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ FSQRTSo
+)>;
+
// 33 Cycle DP Instruction. Takes one slice and 2 dispatches.
def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C, DISP_1C],
(instrs
@@ -913,7 +1191,7 @@ def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
)>;
// 22 Cycle DP Instruction Restricted and Cracked with 2 Cycle ALU.
-def : InstRW<[P9_DPOpAndALUOp_24C_5, IP_EXEC_1C, IP_EXEC_1C,
+def : InstRW<[P9_DPOpAndALU2Op_25C_5, IP_EXEC_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
FDIVSo
@@ -943,23 +1221,40 @@ def : InstRW<[P9_DPE_33C_8, P9_DPO_33C_8, IP_EXECE_1C, IP_EXECO_1C,
XVDIVDP
)>;
-// Load instructions in IIC_LdStLFDU and IIC_LdStLFDUX.
-
// Instruction cracked into three pieces. One Load and two ALU operations.
// The Load and one of the ALU ops cannot be run at the same time and so the
// latencies are added together for 7 cycles. The remaining ALU is 2 cycles.
// Both the load and the ALU that depends on it are restricted and so they take
// a total of 6 dispatches. The final 2 dispatches come from the second ALU op.
// The two EXEC pipelines are for the 2 ALUs while the AGEN is for the load.
-def : InstRW<[P9_LoadAndALUOp_6C, P9_ALU_2C,
+def : InstRW<[P9_LoadAndALU2Op_7C, P9_ALU_2C,
IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
- LFSU,
- LFSUX
+ (instregex "LF(SU|SUX)$")
+)>;
+
+// Cracked instruction made up of a Store and an ALU. The ALU does not depend on
+// the store and so it can be run at the same time as the store. The store is
+// also restricted.
+def : InstRW<[P9_LS_1C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "STF(S|D)U(X)?$"),
+ (instregex "ST(B|H|W|D)U(X)?(8)?$")
+)>;
+
+// Cracked instruction made up of a Load and an ALU. The ALU does not depend on
+// the load and so it can be run at the same time as the load.
+def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "LBZU(X)?(8)?$"),
+ (instregex "LDU(X)?$")
)>;
+
// Cracked instruction made up of a Load and an ALU. The ALU does not depend on
// the load and so it can be run at the same time as the load. The load is also
// restricted. 3 dispatches are from the restricted load while the other two
@@ -968,8 +1263,7 @@ def : InstRW<[P9_LoadAndALUOp_6C, P9_ALU_2C,
def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
- LFDU,
- LFDUX
+ (instregex "LF(DU|DUX)$")
)>;
// Crypto Instructions
@@ -979,13 +1273,147 @@ def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C,
// dispatches.
def : InstRW<[P9_CY_6C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
- VPMSUMB,
- VPMSUMD,
- VPMSUMH,
- VPMSUMW,
- VCIPHER,
- VCIPHERLAST,
- VNCIPHER,
- VNCIPHERLAST,
- VSBOX
+ (instregex "VPMSUM(B|H|W|D)$"),
+ (instregex "V(N)?CIPHER(LAST)?$"),
+ VSBOX
+)>;
+
+// Branch Instructions
+
+// Two Cycle Branch
+def : InstRW<[P9_BR_2C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "BCCCTR(L)?(8)?$"),
+ (instregex "BCCL(A|R|RL)?$"),
+ (instregex "BCCTR(L)?(8)?(n)?$"),
+ (instregex "BD(N)?Z(8|A|Am|Ap|m|p)?$"),
+ (instregex "BD(N)?ZL(A|Am|Ap|R|R8|RL|RLm|RLp|Rm|Rp|m|p)?$"),
+ (instregex "BL(_TLS)?$"),
+ (instregex "BL8(_TLS|_NOP|_NOP_TLS|_TLS_)?$"),
+ (instregex "BLA(8|8_NOP)?$"),
+ (instregex "BLR(8|L)?$"),
+ (instregex "TAILB(A)?(8)?$"),
+ (instregex "TAILBCTR(8)?$"),
+ (instregex "gBC(A|Aat|CTR|CTRL|L|LA|LAat|LR|LRL|Lat|at)?$"),
+ (instregex "BCLR(L)?(n)?$"),
+ (instregex "BCTR(L)?(8)?$"),
+ B,
+ BA,
+ BC,
+ BCC,
+ BCCA,
+ BCL,
+ BCLalways,
+ BCLn,
+ BCTRL8_LDinto_toc,
+ BCn,
+ CTRL_DEP
+)>;
+
+// Five Cycle Branch with a 2 Cycle ALU Op
+// Operations must be done consecutively and not in parallel.
+def : InstRW<[P9_BROpAndALUOp_7C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ ADDPCIS
+)>;
+
+// Special Extracted Instructions For Atomics
+
+// Atomic Load
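+// The def below models L(D|W)AT as five load/store operations (two 1-cycle,
+// three 4-cycle) spread over two execution pipes, five AGEN pipes and twelve
+// dispatches in total.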
+def : InstRW<[P9_LS_1C, P9_LS_1C, P9_LS_4C, P9_LS_4C, P9_LS_4C,
+ IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C, IP_AGEN_1C, IP_AGEN_1C,
+ IP_AGEN_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C,
+ DISP_1C],
+ (instrs
+ (instregex "L(D|W)AT$")
+)>;
+
+// Atomic Store
+def : InstRW<[P9_LS_1C, P9_LS_4C, P9_LS_4C, IP_EXEC_1C, IP_AGEN_1C, IP_AGEN_1C,
+ IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C,
+ DISP_1C],
+ (instrs
+ (instregex "ST(D|W)AT$")
)>;
+
+// Signal Processing Engine (SPE) Instructions
+// These instructions are not supported on Power 9
+def : InstRW<[],
+ (instrs
+ BRINC,
+ EVABS,
+ EVEQV,
+ EVMRA,
+ EVNAND,
+ EVNEG,
+ (instregex "EVADD(I)?W$"),
+ (instregex "EVADD(SM|SS|UM|US)IAAW$"),
+ (instregex "EVAND(C)?$"),
+ (instregex "EVCMP(EQ|GTS|GTU|LTS|LTU)$"),
+ (instregex "EVCNTL(S|Z)W$"),
+ (instregex "EVDIVW(S|U)$"),
+ (instregex "EVEXTS(B|H)$"),
+ (instregex "EVLD(H|W|D)(X)?$"),
+ (instregex "EVLHH(E|OS|OU)SPLAT(X)?$"),
+ (instregex "EVLWHE(X)?$"),
+ (instregex "EVLWHO(S|U)(X)?$"),
+ (instregex "EVLW(H|W)SPLAT(X)?$"),
+ (instregex "EVMERGE(HI|LO|HILO|LOHI)$"),
+ (instregex "EVMHEG(S|U)M(F|I)A(A|N)$"),
+ (instregex "EVMHES(M|S)(F|I)(A|AA|AAW|ANW)?$"),
+ (instregex "EVMHEU(M|S)I(A|AA|AAW|ANW)?$"),
+ (instregex "EVMHOG(U|S)M(F|I)A(A|N)$"),
+ (instregex "EVMHOS(M|S)(F|I)(A|AA|AAW|ANW)?$"),
+ (instregex "EVMHOU(M|S)I(A|AA|ANW|AAW)?$"),
+ (instregex "EVMWHS(M|S)(F|FA|I|IA)$"),
+ (instregex "EVMWHUMI(A)?$"),
+ (instregex "EVMWLS(M|S)IA(A|N)W$"),
+ (instregex "EVMWLU(M|S)I(A|AA|AAW|ANW)?$"),
+ (instregex "EVMWSM(F|I)(A|AA|AN)?$"),
+ (instregex "EVMWSSF(A|AA|AN)?$"),
+ (instregex "EVMWUMI(A|AA|AN)?$"),
+ (instregex "EV(N|X)?OR(C)?$"),
+ (instregex "EVR(LW|LWI|NDW)$"),
+ (instregex "EVSLW(I)?$"),
+ (instregex "EVSPLAT(F)?I$"),
+ (instregex "EVSRW(I)?(S|U)$"),
+ (instregex "EVST(DD|DH|DW|WHE|WHO|WWE|WWO)(X)?$"),
+ (instregex "EVSUBF(S|U)(M|S)IAAW$"),
+ (instregex "EVSUB(I)?FW$")
+)> { let Unsupported = 1; }
+
+// General Instructions without scheduling support.
+def : InstRW<[],
+ (instrs
+ (instregex "(H)?RFI(D)?$"),
+ (instregex "DSS(ALL)?$"),
+ (instregex "DST(ST)?(T)?(64)?$"),
+ (instregex "ICBL(C|Q)$"),
+ (instregex "L(W|H|B)EPX$"),
+ (instregex "ST(W|H|B)EPX$"),
+ (instregex "(L|ST)FDEPX$"),
+ (instregex "M(T|F)SR(IN)?$"),
+ (instregex "M(T|F)DCR$"),
+ (instregex "NOP_GT_PWR(6|7)$"),
+ (instregex "TLB(IA|IVAX|SX|SX2|SX2D|LD|LI|RE|RE2|WE|WE2)$"),
+ (instregex "WRTEE(I)?$"),
+ ATTN,
+ CLRBHRB,
+ MFBHRBE,
+ MBAR,
+ MSYNC,
+ SLBSYNC,
+ NAP,
+ STOP,
+ TRAP,
+ RFCI,
+ RFDI,
+ RFMCI,
+ SC,
+ DCBA,
+ DCBI,
+ DCCCI,
+ ICCCI
+)> { let Unsupported = 1; }
diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td
index 46502208b175..80ad4962a20f 100644
--- a/lib/Target/PowerPC/PPC.td
+++ b/lib/Target/PowerPC/PPC.td
@@ -35,6 +35,8 @@ def Directive970 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_970", "">;
def Directive32 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_32", "">;
def Directive64 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_64", "">;
def DirectiveA2 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_A2", "">;
+def DirectiveE500 : SubtargetFeature<"", "DarwinDirective",
+ "PPC::DIR_E500", "">;
def DirectiveE500mc : SubtargetFeature<"", "DarwinDirective",
"PPC::DIR_E500mc", "">;
def DirectiveE5500 : SubtargetFeature<"", "DarwinDirective",
@@ -59,9 +61,12 @@ def Feature64BitRegs : SubtargetFeature<"64bitregs","Use64BitRegs", "true",
"Enable 64-bit registers usage for ppc32 [beta]">;
def FeatureCRBits : SubtargetFeature<"crbits", "UseCRBits", "true",
"Use condition-register bits individually">;
+def FeatureFPU : SubtargetFeature<"fpu","HasFPU","true",
+ "Enable classic FPU instructions",
+ [FeatureHardFloat]>;
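+// FeatureFPU sits between FeatureHardFloat and the classic FP features below
+// (Altivec, fsqrt, fcpsgn, fre(s), frsqrte(s), stfiwx, lfiwax, fprnd, fpcvt,
+// QPX), which now imply FeatureFPU rather than FeatureHardFloat directly, so
+// an SPE target can be hard-float without pulling in the FPR-based
+// instructions.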
def FeatureAltivec : SubtargetFeature<"altivec","HasAltivec", "true",
"Enable Altivec instructions",
- [FeatureHardFloat]>;
+ [FeatureFPU]>;
def FeatureSPE : SubtargetFeature<"spe","HasSPE", "true",
"Enable SPE instructions",
[FeatureHardFloat]>;
@@ -69,36 +74,36 @@ def FeatureMFOCRF : SubtargetFeature<"mfocrf","HasMFOCRF", "true",
"Enable the MFOCRF instruction">;
def FeatureFSqrt : SubtargetFeature<"fsqrt","HasFSQRT", "true",
"Enable the fsqrt instruction",
- [FeatureHardFloat]>;
+ [FeatureFPU]>;
def FeatureFCPSGN : SubtargetFeature<"fcpsgn", "HasFCPSGN", "true",
"Enable the fcpsgn instruction",
- [FeatureHardFloat]>;
+ [FeatureFPU]>;
def FeatureFRE : SubtargetFeature<"fre", "HasFRE", "true",
"Enable the fre instruction",
- [FeatureHardFloat]>;
+ [FeatureFPU]>;
def FeatureFRES : SubtargetFeature<"fres", "HasFRES", "true",
"Enable the fres instruction",
- [FeatureHardFloat]>;
+ [FeatureFPU]>;
def FeatureFRSQRTE : SubtargetFeature<"frsqrte", "HasFRSQRTE", "true",
"Enable the frsqrte instruction",
- [FeatureHardFloat]>;
+ [FeatureFPU]>;
def FeatureFRSQRTES : SubtargetFeature<"frsqrtes", "HasFRSQRTES", "true",
"Enable the frsqrtes instruction",
- [FeatureHardFloat]>;
+ [FeatureFPU]>;
def FeatureRecipPrec : SubtargetFeature<"recipprec", "HasRecipPrec", "true",
"Assume higher precision reciprocal estimates">;
def FeatureSTFIWX : SubtargetFeature<"stfiwx","HasSTFIWX", "true",
"Enable the stfiwx instruction",
- [FeatureHardFloat]>;
+ [FeatureFPU]>;
def FeatureLFIWAX : SubtargetFeature<"lfiwax","HasLFIWAX", "true",
"Enable the lfiwax instruction",
- [FeatureHardFloat]>;
+ [FeatureFPU]>;
def FeatureFPRND : SubtargetFeature<"fprnd", "HasFPRND", "true",
"Enable the fri[mnpz] instructions",
- [FeatureHardFloat]>;
+ [FeatureFPU]>;
def FeatureFPCVT : SubtargetFeature<"fpcvt", "HasFPCVT", "true",
"Enable fc[ft]* (unsigned and single-precision) and lfiwzx instructions",
- [FeatureHardFloat]>;
+ [FeatureFPU]>;
def FeatureISEL : SubtargetFeature<"isel","HasISEL", "true",
"Enable the isel instruction">;
def FeatureBPERMD : SubtargetFeature<"bpermd", "HasBPERMD", "true",
@@ -119,13 +124,15 @@ def FeatureMSYNC : SubtargetFeature<"msync", "HasOnlyMSYNC", "true",
[FeatureBookE]>;
def FeatureE500 : SubtargetFeature<"e500", "IsE500", "true",
"Enable E500/E500mc instructions">;
+def FeatureSecurePlt : SubtargetFeature<"secure-plt","SecurePlt", "true",
+ "Enable secure plt mode">;
def FeaturePPC4xx : SubtargetFeature<"ppc4xx", "IsPPC4xx", "true",
"Enable PPC 4xx instructions">;
def FeaturePPC6xx : SubtargetFeature<"ppc6xx", "IsPPC6xx", "true",
"Enable PPC 6xx instructions">;
def FeatureQPX : SubtargetFeature<"qpx","HasQPX", "true",
"Enable QPX instructions",
- [FeatureHardFloat]>;
+ [FeatureFPU]>;
def FeatureVSX : SubtargetFeature<"vsx","HasVSX", "true",
"Enable VSX instructions",
[FeatureAltivec]>;
@@ -304,8 +311,8 @@ def : ProcessorModel<"450", PPC440Model, [Directive440, FeatureISEL,
FeatureFRES, FeatureFRSQRTE,
FeatureICBT, FeatureBookE,
FeatureMSYNC, FeatureMFTB]>;
-def : Processor<"601", G3Itineraries, [Directive601, FeatureHardFloat]>;
-def : Processor<"602", G3Itineraries, [Directive602, FeatureHardFloat,
+def : Processor<"601", G3Itineraries, [Directive601, FeatureFPU]>;
+def : Processor<"602", G3Itineraries, [Directive602, FeatureFPU,
FeatureMFTB]>;
def : Processor<"603", G3Itineraries, [Directive603,
FeatureFRES, FeatureFRSQRTE,
@@ -356,6 +363,10 @@ def : ProcessorModel<"g5", G5Model,
FeatureFRES, FeatureFRSQRTE,
Feature64Bit /*, Feature64BitRegs */,
FeatureMFTB, DeprecatedDST]>;
+def : ProcessorModel<"e500", PPCE500Model,
+ [DirectiveE500,
+ FeatureICBT, FeatureBookE,
+ FeatureISEL, FeatureMFTB]>;
def : ProcessorModel<"e500mc", PPCE500mcModel,
[DirectiveE500mc,
FeatureSTFIWX, FeatureICBT, FeatureBookE,
@@ -465,4 +476,5 @@ def PPC : Target {
let AssemblyParsers = [PPCAsmParser];
let AssemblyParserVariants = [PPCAsmParserVariant];
+ let AllowRegisterRenaming = 1;
}
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 17451900840a..a9da64cc216f 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -510,6 +510,32 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
const Module *M = MF->getFunction().getParent();
PICLevel::Level PL = M->getPICLevel();
+#ifndef NDEBUG
+ // Validate that SPE and FPU are mutually exclusive in codegen
+ if (!MI->isInlineAsm()) {
+ for (const MachineOperand &MO: MI->operands()) {
+ if (MO.isReg()) {
+ unsigned Reg = MO.getReg();
+ if (Subtarget->hasSPE()) {
+ if (PPC::F4RCRegClass.contains(Reg) ||
+ PPC::F8RCRegClass.contains(Reg) ||
+ PPC::QBRCRegClass.contains(Reg) ||
+ PPC::QFRCRegClass.contains(Reg) ||
+ PPC::QSRCRegClass.contains(Reg) ||
+ PPC::VFRCRegClass.contains(Reg) ||
+ PPC::VRRCRegClass.contains(Reg) ||
+ PPC::VSFRCRegClass.contains(Reg) ||
+ PPC::VSSRCRegClass.contains(Reg)
+ )
+ llvm_unreachable("SPE targets cannot have FPRegs!");
+ } else {
+ if (PPC::SPERCRegClass.contains(Reg))
+ llvm_unreachable("SPE register found in FPU-targeted code!");
+ }
+ }
+ }
+ }
+#endif
// Lower multi-instruction pseudo operations.
switch (MI->getOpcode()) {
default: break;
@@ -563,33 +589,63 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// Transform %rd = UpdateGBR(%rt, %ri)
// Into: lwz %rt, .L0$poff - .L0$pb(%ri)
// add %rd, %rt, %ri
+ // or into (if secure plt mode is on):
+ // addis r30, r30, .LTOC - .L0$pb@ha
+ // addi r30, r30, .LTOC - .L0$pb@l
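+    //       (in secure plt mode the GOT pointer is formed directly from
+    //        .LTOC, so no .L0$poff load from memory is needed)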
// Get the offset from the GOT Base Register to the GOT
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
- MCSymbol *PICOffset =
- MF->getInfo<PPCFunctionInfo>()->getPICOffsetSymbol();
- TmpInst.setOpcode(PPC::LWZ);
- const MCExpr *Exp =
- MCSymbolRefExpr::create(PICOffset, MCSymbolRefExpr::VK_None, OutContext);
- const MCExpr *PB =
- MCSymbolRefExpr::create(MF->getPICBaseSymbol(),
- MCSymbolRefExpr::VK_None,
- OutContext);
- const MCOperand TR = TmpInst.getOperand(1);
- const MCOperand PICR = TmpInst.getOperand(0);
-
- // Step 1: lwz %rt, .L$poff - .L$pb(%ri)
- TmpInst.getOperand(1) =
- MCOperand::createExpr(MCBinaryExpr::createSub(Exp, PB, OutContext));
- TmpInst.getOperand(0) = TR;
- TmpInst.getOperand(2) = PICR;
- EmitToStreamer(*OutStreamer, TmpInst);
+    if (Subtarget->isSecurePlt() && isPositionIndependent()) {
+ unsigned PICR = TmpInst.getOperand(0).getReg();
+ MCSymbol *LTOCSymbol = OutContext.getOrCreateSymbol(StringRef(".LTOC"));
+ const MCExpr *PB =
+ MCSymbolRefExpr::create(MF->getPICBaseSymbol(),
+ OutContext);
- TmpInst.setOpcode(PPC::ADD4);
- TmpInst.getOperand(0) = PICR;
- TmpInst.getOperand(1) = TR;
- TmpInst.getOperand(2) = PICR;
- EmitToStreamer(*OutStreamer, TmpInst);
- return;
+ const MCExpr *LTOCDeltaExpr =
+ MCBinaryExpr::createSub(MCSymbolRefExpr::create(LTOCSymbol, OutContext),
+ PB, OutContext);
+
+ const MCExpr *LTOCDeltaHi =
+ PPCMCExpr::createHa(LTOCDeltaExpr, false, OutContext);
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS)
+ .addReg(PICR)
+ .addReg(PICR)
+ .addExpr(LTOCDeltaHi));
+
+ const MCExpr *LTOCDeltaLo =
+ PPCMCExpr::createLo(LTOCDeltaExpr, false, OutContext);
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDI)
+ .addReg(PICR)
+ .addReg(PICR)
+ .addExpr(LTOCDeltaLo));
+ return;
+ } else {
+ MCSymbol *PICOffset =
+ MF->getInfo<PPCFunctionInfo>()->getPICOffsetSymbol();
+ TmpInst.setOpcode(PPC::LWZ);
+ const MCExpr *Exp =
+ MCSymbolRefExpr::create(PICOffset, MCSymbolRefExpr::VK_None, OutContext);
+ const MCExpr *PB =
+ MCSymbolRefExpr::create(MF->getPICBaseSymbol(),
+ MCSymbolRefExpr::VK_None,
+ OutContext);
+ const MCOperand TR = TmpInst.getOperand(1);
+ const MCOperand PICR = TmpInst.getOperand(0);
+
+ // Step 1: lwz %rt, .L$poff - .L$pb(%ri)
+ TmpInst.getOperand(1) =
+ MCOperand::createExpr(MCBinaryExpr::createSub(Exp, PB, OutContext));
+ TmpInst.getOperand(0) = TR;
+ TmpInst.getOperand(2) = PICR;
+ EmitToStreamer(*OutStreamer, TmpInst);
+
+ TmpInst.setOpcode(PPC::ADD4);
+ TmpInst.getOperand(0) = PICR;
+ TmpInst.getOperand(1) = TR;
+ TmpInst.getOperand(2) = PICR;
+ EmitToStreamer(*OutStreamer, TmpInst);
+ return;
+ }
}
case PPC::LWZtoc: {
// Transform %r3 = LWZtoc @min1, %r2
@@ -741,11 +797,11 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
else if (MO.isGlobal()) {
const GlobalValue *GV = MO.getGlobal();
MOSymbol = getSymbol(GV);
- DEBUG(
- unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);
- assert((GVFlags & PPCII::MO_NLP_FLAG) &&
- "LDtocL used on symbol that could be accessed directly is "
- "invalid. Must match ADDIStocHA."));
+ LLVM_DEBUG(
+ unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);
+ assert((GVFlags & PPCII::MO_NLP_FLAG) &&
+ "LDtocL used on symbol that could be accessed directly is "
+ "invalid. Must match ADDIStocHA."));
MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
}
@@ -770,11 +826,9 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
if (MO.isGlobal()) {
const GlobalValue *GV = MO.getGlobal();
- DEBUG(
- unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);
- assert (
- !(GVFlags & PPCII::MO_NLP_FLAG) &&
- "Interposable definitions must use indirect access."));
+ LLVM_DEBUG(unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);
+ assert(!(GVFlags & PPCII::MO_NLP_FLAG) &&
+ "Interposable definitions must use indirect access."));
MOSymbol = getSymbol(GV);
} else if (MO.isCPI()) {
MOSymbol = GetCPISymbol(MO.getIndex());
@@ -1233,7 +1287,7 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
if (!Subtarget->isPPC64()) {
const PPCFunctionInfo *PPCFI = MF->getInfo<PPCFunctionInfo>();
- if (PPCFI->usesPICBase()) {
+ if (PPCFI->usesPICBase() && !Subtarget->isSecurePlt()) {
MCSymbol *RelocSymbol = PPCFI->getPICOffsetSymbol();
MCSymbol *PICBase = MF->getPICBaseSymbol();
OutStreamer->EmitLabel(RelocSymbol);
@@ -1255,7 +1309,7 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
if (Subtarget->isELFv2ABI()) {
// In the Large code model, we allow arbitrary displacements between
// the text section and its associated TOC section. We place the
- // full 8-byte offset to the TOC in memory immediatedly preceding
+ // full 8-byte offset to the TOC in memory immediately preceding
// the function global entry point.
if (TM.getCodeModel() == CodeModel::Large
&& !MF->getRegInfo().use_empty(PPC::X2)) {
@@ -1458,6 +1512,7 @@ void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) {
"ppc750",
"ppc970",
"ppcA2",
+ "ppce500",
"ppce500mc",
"ppce5500",
"power3",
diff --git a/lib/Target/PowerPC/PPCBranchCoalescing.cpp b/lib/Target/PowerPC/PPCBranchCoalescing.cpp
index 32d801b13ded..bbb977f090c5 100644
--- a/lib/Target/PowerPC/PPCBranchCoalescing.cpp
+++ b/lib/Target/PowerPC/PPCBranchCoalescing.cpp
@@ -60,7 +60,7 @@ namespace llvm {
/// expands to the following machine code:
///
/// %bb.0: derived from LLVM BB %entry
-/// Live Ins: %f1 %f3 %x6
+/// liveins: %f1 %f3 %x6
/// <SNIP1>
/// %0 = COPY %f1; F8RC:%0
/// %5 = CMPLWI killed %4, 0; CRRC:%5 GPRC:%4
@@ -98,7 +98,7 @@ namespace llvm {
/// If all conditions are met, IR should collapse to:
///
/// %bb.0: derived from LLVM BB %entry
-/// Live Ins: %f1 %f3 %x6
+/// liveins: %f1 %f3 %x6
/// <SNIP1>
/// %0 = COPY %f1; F8RC:%0
/// %5 = CMPLWI killed %4, 0; CRRC:%5 GPRC:%4
@@ -236,18 +236,18 @@ void PPCBranchCoalescing::initialize(MachineFunction &MF) {
///\return true if and only if the branch can be coalesced, false otherwise
///
bool PPCBranchCoalescing::canCoalesceBranch(CoalescingCandidateInfo &Cand) {
- DEBUG(dbgs() << "Determine if branch block " << Cand.BranchBlock->getNumber()
- << " can be coalesced:");
+ LLVM_DEBUG(dbgs() << "Determine if branch block "
+ << Cand.BranchBlock->getNumber() << " can be coalesced:");
MachineBasicBlock *FalseMBB = nullptr;
if (TII->analyzeBranch(*Cand.BranchBlock, Cand.BranchTargetBlock, FalseMBB,
Cand.Cond)) {
- DEBUG(dbgs() << "TII unable to Analyze Branch - skip\n");
+ LLVM_DEBUG(dbgs() << "TII unable to Analyze Branch - skip\n");
return false;
}
for (auto &I : Cand.BranchBlock->terminators()) {
- DEBUG(dbgs() << "Looking at terminator : " << I << "\n");
+ LLVM_DEBUG(dbgs() << "Looking at terminator : " << I << "\n");
if (!I.isBranch())
continue;
@@ -265,14 +265,14 @@ bool PPCBranchCoalescing::canCoalesceBranch(CoalescingCandidateInfo &Cand) {
// must then be extended to prove that none of the implicit operands are
// changed in the blocks that are combined during coalescing.
if (I.getNumOperands() != I.getNumExplicitOperands()) {
- DEBUG(dbgs() << "Terminator contains implicit operands - skip : " << I
- << "\n");
+ LLVM_DEBUG(dbgs() << "Terminator contains implicit operands - skip : "
+ << I << "\n");
return false;
}
}
if (Cand.BranchBlock->isEHPad() || Cand.BranchBlock->hasEHPadSuccessor()) {
- DEBUG(dbgs() << "EH Pad - skip\n");
+ LLVM_DEBUG(dbgs() << "EH Pad - skip\n");
return false;
}
@@ -280,13 +280,13 @@ bool PPCBranchCoalescing::canCoalesceBranch(CoalescingCandidateInfo &Cand) {
// FalseMBB is null, and BranchTargetBlock is a successor to BranchBlock)
if (!Cand.BranchTargetBlock || FalseMBB ||
!Cand.BranchBlock->isSuccessor(Cand.BranchTargetBlock)) {
- DEBUG(dbgs() << "Does not form a triangle - skip\n");
+ LLVM_DEBUG(dbgs() << "Does not form a triangle - skip\n");
return false;
}
// Ensure there are only two successors
if (Cand.BranchBlock->succ_size() != 2) {
- DEBUG(dbgs() << "Does not have 2 successors - skip\n");
+ LLVM_DEBUG(dbgs() << "Does not have 2 successors - skip\n");
return false;
}
@@ -305,18 +305,19 @@ bool PPCBranchCoalescing::canCoalesceBranch(CoalescingCandidateInfo &Cand) {
assert(Succ && "Expecting a valid fall-through block\n");
if (!Succ->empty()) {
- DEBUG(dbgs() << "Fall-through block contains code -- skip\n");
- return false;
+ LLVM_DEBUG(dbgs() << "Fall-through block contains code -- skip\n");
+ return false;
}
if (!Succ->isSuccessor(Cand.BranchTargetBlock)) {
- DEBUG(dbgs()
- << "Successor of fall through block is not branch taken block\n");
- return false;
+ LLVM_DEBUG(
+ dbgs()
+ << "Successor of fall through block is not branch taken block\n");
+ return false;
}
Cand.FallThroughBlock = Succ;
- DEBUG(dbgs() << "Valid Candidate\n");
+ LLVM_DEBUG(dbgs() << "Valid Candidate\n");
return true;
}
@@ -331,7 +332,7 @@ bool PPCBranchCoalescing::identicalOperands(
ArrayRef<MachineOperand> OpList1, ArrayRef<MachineOperand> OpList2) const {
if (OpList1.size() != OpList2.size()) {
- DEBUG(dbgs() << "Operand list is different size\n");
+ LLVM_DEBUG(dbgs() << "Operand list is different size\n");
return false;
}
@@ -339,8 +340,8 @@ bool PPCBranchCoalescing::identicalOperands(
const MachineOperand &Op1 = OpList1[i];
const MachineOperand &Op2 = OpList2[i];
- DEBUG(dbgs() << "Op1: " << Op1 << "\n"
- << "Op2: " << Op2 << "\n");
+ LLVM_DEBUG(dbgs() << "Op1: " << Op1 << "\n"
+ << "Op2: " << Op2 << "\n");
if (Op1.isIdenticalTo(Op2)) {
// filter out instructions with physical-register uses
@@ -348,10 +349,10 @@ bool PPCBranchCoalescing::identicalOperands(
// If the physical register is constant then we can assume the value
// has not changed between uses.
&& !(Op1.isUse() && MRI->isConstantPhysReg(Op1.getReg()))) {
- DEBUG(dbgs() << "The operands are not provably identical.\n");
+ LLVM_DEBUG(dbgs() << "The operands are not provably identical.\n");
return false;
}
- DEBUG(dbgs() << "Op1 and Op2 are identical!\n");
+ LLVM_DEBUG(dbgs() << "Op1 and Op2 are identical!\n");
continue;
}
@@ -364,14 +365,14 @@ bool PPCBranchCoalescing::identicalOperands(
MachineInstr *Op1Def = MRI->getVRegDef(Op1.getReg());
MachineInstr *Op2Def = MRI->getVRegDef(Op2.getReg());
if (TII->produceSameValue(*Op1Def, *Op2Def, MRI)) {
- DEBUG(dbgs() << "Op1Def: " << *Op1Def << " and " << *Op2Def
- << " produce the same value!\n");
+ LLVM_DEBUG(dbgs() << "Op1Def: " << *Op1Def << " and " << *Op2Def
+ << " produce the same value!\n");
} else {
- DEBUG(dbgs() << "Operands produce different values\n");
+ LLVM_DEBUG(dbgs() << "Operands produce different values\n");
return false;
}
} else {
- DEBUG(dbgs() << "The operands are not provably identical.\n");
+ LLVM_DEBUG(dbgs() << "The operands are not provably identical.\n");
return false;
}
}
@@ -395,7 +396,7 @@ void PPCBranchCoalescing::moveAndUpdatePHIs(MachineBasicBlock *SourceMBB,
MachineBasicBlock::iterator ME = SourceMBB->getFirstNonPHI();
if (MI == ME) {
- DEBUG(dbgs() << "SourceMBB contains no PHI instructions.\n");
+ LLVM_DEBUG(dbgs() << "SourceMBB contains no PHI instructions.\n");
return;
}
@@ -425,19 +426,19 @@ bool PPCBranchCoalescing::canMoveToBeginning(const MachineInstr &MI,
const MachineBasicBlock &TargetMBB
) const {
- DEBUG(dbgs() << "Checking if " << MI << " can move to beginning of "
- << TargetMBB.getNumber() << "\n");
+ LLVM_DEBUG(dbgs() << "Checking if " << MI << " can move to beginning of "
+ << TargetMBB.getNumber() << "\n");
for (auto &Def : MI.defs()) { // Looking at Def
for (auto &Use : MRI->use_instructions(Def.getReg())) {
if (Use.isPHI() && Use.getParent() == &TargetMBB) {
- DEBUG(dbgs() << " *** used in a PHI -- cannot move ***\n");
- return false;
+ LLVM_DEBUG(dbgs() << " *** used in a PHI -- cannot move ***\n");
+ return false;
}
}
}
- DEBUG(dbgs() << " Safe to move to the beginning.\n");
+ LLVM_DEBUG(dbgs() << " Safe to move to the beginning.\n");
return true;
}
@@ -456,22 +457,23 @@ bool PPCBranchCoalescing::canMoveToEnd(const MachineInstr &MI,
const MachineBasicBlock &TargetMBB
) const {
- DEBUG(dbgs() << "Checking if " << MI << " can move to end of "
- << TargetMBB.getNumber() << "\n");
+ LLVM_DEBUG(dbgs() << "Checking if " << MI << " can move to end of "
+ << TargetMBB.getNumber() << "\n");
for (auto &Use : MI.uses()) {
if (Use.isReg() && TargetRegisterInfo::isVirtualRegister(Use.getReg())) {
MachineInstr *DefInst = MRI->getVRegDef(Use.getReg());
if (DefInst->isPHI() && DefInst->getParent() == MI.getParent()) {
- DEBUG(dbgs() << " *** Cannot move this instruction ***\n");
+ LLVM_DEBUG(dbgs() << " *** Cannot move this instruction ***\n");
return false;
} else {
- DEBUG(dbgs() << " *** def is in another block -- safe to move!\n");
+ LLVM_DEBUG(
+ dbgs() << " *** def is in another block -- safe to move!\n");
}
}
}
- DEBUG(dbgs() << " Safe to move to the end.\n");
+ LLVM_DEBUG(dbgs() << " Safe to move to the end.\n");
return true;
}
@@ -541,15 +543,17 @@ bool PPCBranchCoalescing::canMerge(CoalescingCandidateInfo &SourceRegion,
for (auto &Def : I->defs())
for (auto &Use : MRI->use_instructions(Def.getReg())) {
if (Use.isPHI() && Use.getParent() == SourceRegion.BranchTargetBlock) {
- DEBUG(dbgs() << "PHI " << *I << " defines register used in another "
- "PHI within branch target block -- can't merge\n");
+ LLVM_DEBUG(dbgs()
+ << "PHI " << *I
+ << " defines register used in another "
+ "PHI within branch target block -- can't merge\n");
NumPHINotMoved++;
return false;
}
if (Use.getParent() == SourceRegion.BranchBlock) {
- DEBUG(dbgs() << "PHI " << *I
- << " defines register used in this "
- "block -- all must move down\n");
+ LLVM_DEBUG(dbgs() << "PHI " << *I
+ << " defines register used in this "
+ "block -- all must move down\n");
SourceRegion.MustMoveDown = true;
}
}
@@ -562,13 +566,13 @@ bool PPCBranchCoalescing::canMerge(CoalescingCandidateInfo &SourceRegion,
E = SourceRegion.BranchBlock->end();
I != E; ++I) {
if (!canMoveToBeginning(*I, *SourceRegion.BranchTargetBlock)) {
- DEBUG(dbgs() << "Instruction " << *I
- << " cannot move down - must move up!\n");
+ LLVM_DEBUG(dbgs() << "Instruction " << *I
+ << " cannot move down - must move up!\n");
SourceRegion.MustMoveUp = true;
}
if (!canMoveToEnd(*I, *TargetRegion.BranchBlock)) {
- DEBUG(dbgs() << "Instruction " << *I
- << " cannot move up - must move down!\n");
+ LLVM_DEBUG(dbgs() << "Instruction " << *I
+ << " cannot move up - must move down!\n");
SourceRegion.MustMoveDown = true;
}
}
@@ -719,10 +723,10 @@ bool PPCBranchCoalescing::runOnMachineFunction(MachineFunction &MF) {
bool didSomething = false;
- DEBUG(dbgs() << "******** Branch Coalescing ********\n");
+ LLVM_DEBUG(dbgs() << "******** Branch Coalescing ********\n");
initialize(MF);
- DEBUG(dbgs() << "Function: "; MF.dump(); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Function: "; MF.dump(); dbgs() << "\n");
CoalescingCandidateInfo Cand1, Cand2;
// Walk over blocks and find candidates to merge
@@ -752,24 +756,27 @@ bool PPCBranchCoalescing::runOnMachineFunction(MachineFunction &MF) {
"Branch-taken block should post-dominate first candidate");
if (!identicalOperands(Cand1.Cond, Cand2.Cond)) {
- DEBUG(dbgs() << "Blocks " << Cand1.BranchBlock->getNumber() << " and "
- << Cand2.BranchBlock->getNumber()
- << " have different branches\n");
+ LLVM_DEBUG(dbgs() << "Blocks " << Cand1.BranchBlock->getNumber()
+ << " and " << Cand2.BranchBlock->getNumber()
+ << " have different branches\n");
break;
}
if (!canMerge(Cand2, Cand1)) {
- DEBUG(dbgs() << "Cannot merge blocks " << Cand1.BranchBlock->getNumber()
- << " and " << Cand2.BranchBlock->getNumber() << "\n");
+ LLVM_DEBUG(dbgs() << "Cannot merge blocks "
+ << Cand1.BranchBlock->getNumber() << " and "
+ << Cand2.BranchBlock->getNumber() << "\n");
NumBlocksNotCoalesced++;
continue;
}
- DEBUG(dbgs() << "Merging blocks " << Cand1.BranchBlock->getNumber()
- << " and " << Cand1.BranchTargetBlock->getNumber() << "\n");
+ LLVM_DEBUG(dbgs() << "Merging blocks " << Cand1.BranchBlock->getNumber()
+ << " and " << Cand1.BranchTargetBlock->getNumber()
+ << "\n");
MergedCandidates = mergeCandidates(Cand2, Cand1);
if (MergedCandidates)
didSomething = true;
- DEBUG(dbgs() << "Function after merging: "; MF.dump(); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Function after merging: "; MF.dump();
+ dbgs() << "\n");
} while (MergedCandidates);
}
@@ -779,6 +786,6 @@ bool PPCBranchCoalescing::runOnMachineFunction(MachineFunction &MF) {
MF.verify(nullptr, "Error in code produced by branch coalescing");
#endif // NDEBUG
- DEBUG(dbgs() << "Finished Branch Coalescing\n");
+ LLVM_DEBUG(dbgs() << "Finished Branch Coalescing\n");
return didSomething;
}
diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp
index fc638829378a..6b9e2383e36f 100644
--- a/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -30,11 +30,14 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/IR/Constants.h"
@@ -50,8 +53,8 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#ifndef NDEBUG
@@ -403,15 +406,16 @@ bool PPCCTRLoops::mightUseCTR(BasicBlock *BB) {
}
if (Opcode) {
- MVT VTy = TLI->getSimpleValueType(
- *DL, CI->getArgOperand(0)->getType(), true);
- if (VTy == MVT::Other)
+ EVT EVTy =
+ TLI->getValueType(*DL, CI->getArgOperand(0)->getType(), true);
+
+ if (EVTy == MVT::Other)
return true;
- if (TLI->isOperationLegalOrCustom(Opcode, VTy))
+ if (TLI->isOperationLegalOrCustom(Opcode, EVTy))
continue;
- else if (VTy.isVector() &&
- TLI->isOperationLegalOrCustom(Opcode, VTy.getScalarType()))
+ else if (EVTy.isVector() &&
+ TLI->isOperationLegalOrCustom(Opcode, EVTy.getScalarType()))
continue;
return true;
@@ -454,13 +458,16 @@ bool PPCCTRLoops::mightUseCTR(BasicBlock *BB) {
return true;
}
+ // FREM is always a call.
+ if (J->getOpcode() == Instruction::FRem)
+ return true;
+
if (STI->useSoftFloat()) {
switch(J->getOpcode()) {
case Instruction::FAdd:
case Instruction::FSub:
case Instruction::FMul:
case Instruction::FDiv:
- case Instruction::FRem:
case Instruction::FPTrunc:
case Instruction::FPExt:
case Instruction::FPToUI:
@@ -500,13 +507,19 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
// Process nested loops first.
for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
MadeChange |= convertToCTRLoop(*I);
- DEBUG(dbgs() << "Nested loop converted\n");
+ LLVM_DEBUG(dbgs() << "Nested loop converted\n");
}
// If a nested loop has been converted, then we can't convert this loop.
if (MadeChange)
return MadeChange;
+ // Bail out if the loop has irreducible control flow.
+ LoopBlocksRPO RPOT(L);
+ RPOT.perform(LI);
+ if (containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI))
+ return false;
+
#ifndef NDEBUG
// Stop trying after reaching the limit (if any).
int Limit = CTRLoopLimit;
@@ -527,14 +540,35 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
SmallVector<BasicBlock*, 4> ExitingBlocks;
L->getExitingBlocks(ExitingBlocks);
+ // If there is an exit edge known to be frequently taken,
+ // we should not transform this loop.
+ for (auto &BB : ExitingBlocks) {
+ Instruction *TI = BB->getTerminator();
+ if (!TI) continue;
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ uint64_t TrueWeight = 0, FalseWeight = 0;
+ if (!BI->isConditional() ||
+ !BI->extractProfMetadata(TrueWeight, FalseWeight))
+ continue;
+
+ // If the exit path is more frequent than the loop path,
+ // we return here without further analysis for this loop.
+ bool TrueIsExit = !L->contains(BI->getSuccessor(0));
+ if (( TrueIsExit && FalseWeight < TrueWeight) ||
+ (!TrueIsExit && FalseWeight > TrueWeight))
+ return MadeChange;
+ }
+ }
+
BasicBlock *CountedExitBlock = nullptr;
const SCEV *ExitCount = nullptr;
BranchInst *CountedExitBranch = nullptr;
for (SmallVectorImpl<BasicBlock *>::iterator I = ExitingBlocks.begin(),
IE = ExitingBlocks.end(); I != IE; ++I) {
const SCEV *EC = SE->getExitCount(L, *I);
- DEBUG(dbgs() << "Exit Count for " << *L << " from block " <<
- (*I)->getName() << ": " << *EC << "\n");
+ LLVM_DEBUG(dbgs() << "Exit Count for " << *L << " from block "
+ << (*I)->getName() << ": " << *EC << "\n");
if (isa<SCEVCouldNotCompute>(EC))
continue;
if (const SCEVConstant *ConstEC = dyn_cast<SCEVConstant>(EC)) {
@@ -546,9 +580,15 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
if (SE->getTypeSizeInBits(EC->getType()) > (TM->isPPC64() ? 64 : 32))
continue;
+ // If this exiting block is contained in a nested loop, it is not eligible
+ // for insertion of the branch-and-decrement since the inner loop would
+ // end up messing up the value in the CTR.
+ if (LI->getLoopFor(*I) != L)
+ continue;
+
// We now have a loop-invariant count of loop iterations (which is not the
// constant zero) for which we know that this loop will not exit via this
- // exisiting block.
+    // exiting block.
// We need to make sure that this block will run on every loop iteration.
// For this to be true, we must dominate all blocks with backedges. Such
@@ -602,7 +642,8 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
if (!Preheader)
return MadeChange;
- DEBUG(dbgs() << "Preheader for exit count: " << Preheader->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Preheader for exit count: " << Preheader->getName()
+ << "\n");
// Insert the count into the preheader and replace the condition used by the
// selected branch.
@@ -690,11 +731,12 @@ check_block:
}
if (I != BI && clobbersCTR(*I)) {
- DEBUG(dbgs() << printMBBReference(*MBB) << " (" << MBB->getFullName()
- << ") instruction " << *I << " clobbers CTR, invalidating "
- << printMBBReference(*BI->getParent()) << " ("
- << BI->getParent()->getFullName() << ") instruction " << *BI
- << "\n");
+ LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << " (" << MBB->getFullName()
+ << ") instruction " << *I
+ << " clobbers CTR, invalidating "
+ << printMBBReference(*BI->getParent()) << " ("
+ << BI->getParent()->getFullName() << ") instruction "
+ << *BI << "\n");
return false;
}
@@ -708,10 +750,10 @@ check_block:
if (CheckPreds) {
queue_preds:
if (MachineFunction::iterator(MBB) == MBB->getParent()->begin()) {
- DEBUG(dbgs() << "Unable to find a MTCTR instruction for "
- << printMBBReference(*BI->getParent()) << " ("
- << BI->getParent()->getFullName() << ") instruction " << *BI
- << "\n");
+ LLVM_DEBUG(dbgs() << "Unable to find a MTCTR instruction for "
+ << printMBBReference(*BI->getParent()) << " ("
+ << BI->getParent()->getFullName() << ") instruction "
+ << *BI << "\n");
return false;
}
diff --git a/lib/Target/PowerPC/PPCCallingConv.td b/lib/Target/PowerPC/PPCCallingConv.td
index a4f4c8688cc1..12c581023234 100644
--- a/lib/Target/PowerPC/PPCCallingConv.td
+++ b/lib/Target/PowerPC/PPCCallingConv.td
@@ -45,6 +45,30 @@ def RetCC_PPC64_AnyReg : CallingConv<[
CCCustom<"CC_PPC_AnyReg_Error">
]>;
+// Return-value convention for PowerPC coldcc.
+def RetCC_PPC_Cold : CallingConv<[
+ // Use the same return registers as RetCC_PPC, but limited to only
+ // one return value. The remaining return values will be saved to
+ // the stack.
+ CCIfType<[i32, i1], CCIfSubtarget<"isPPC64()", CCPromoteToType<i64>>>,
+ CCIfType<[i1], CCIfNotSubtarget<"isPPC64()", CCPromoteToType<i32>>>,
+
+ CCIfType<[i32], CCAssignToReg<[R3]>>,
+ CCIfType<[i64], CCAssignToReg<[X3]>>,
+ CCIfType<[i128], CCAssignToReg<[X3]>>,
+
+ CCIfType<[f32], CCAssignToReg<[F1]>>,
+ CCIfType<[f64], CCAssignToReg<[F1]>>,
+ CCIfType<[f128], CCIfSubtarget<"hasP9Vector()", CCAssignToReg<[V2]>>>,
+
+ CCIfType<[v4f64, v4f32, v4i1],
+ CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1]>>>,
+
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64],
+ CCIfSubtarget<"hasAltivec()",
+ CCAssignToReg<[V2]>>>
+]>;
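+// RetCC_PPC_Cold is used for calls with the coldcc calling convention (see,
+// e.g., the Flag == 4 case in PPCFastISel::usePPC32CCs further down).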
+
// Return-value convention for PowerPC
def RetCC_PPC : CallingConv<[
CCIfCC<"CallingConv::AnyReg", CCDelegateTo<RetCC_PPC64_AnyReg>>,
@@ -59,8 +83,19 @@ def RetCC_PPC : CallingConv<[
// Floating point types returned as "direct" go into F1 .. F8; note that
// only the ELFv2 ABI fully utilizes all these registers.
- CCIfType<[f32], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
- CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+ CCIfNotSubtarget<"hasSPE()",
+ CCIfType<[f32], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>>,
+ CCIfNotSubtarget<"hasSPE()",
+ CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>>,
+ CCIfSubtarget<"hasSPE()",
+ CCIfType<[f32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>>,
+ CCIfSubtarget<"hasSPE()",
+ CCIfType<[f64], CCAssignToReg<[S3, S4, S5, S6, S7, S8, S9, S10]>>>,
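+  // With SPE there are no FPRs: f32 values are returned in GPRs (R3-R10) and
+  // f64 values in the 64-bit SPE registers (S3-S10) that overlay the GPRs.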
+
+  // For P9, f128 values are returned in vector registers.
+ CCIfType<[f128],
+ CCIfSubtarget<"hasP9Vector()",
+ CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>,
// QPX vectors are returned in QF1 and QF2.
CCIfType<[v4f64, v4f32, v4i1],
@@ -117,6 +152,9 @@ def RetCC_PPC64_ELF_FIS : CallingConv<[
CCIfType<[i128], CCAssignToReg<[X3, X4, X5, X6]>>,
CCIfType<[f32], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+ CCIfType<[f128],
+ CCIfSubtarget<"hasP9Vector()",
+ CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>,
CCIfType<[v4f64, v4f32, v4i1],
CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1, QF2]>>>,
CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64],
@@ -156,7 +194,15 @@ def CC_PPC32_SVR4_Common : CallingConv<[
CCIfType<[f64], CCIfSplit<CCCustom<"CC_PPC32_SVR4_Custom_AlignFPArgRegs">>>,
// FP values are passed in F1 - F8.
- CCIfType<[f32, f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+ CCIfType<[f32, f64],
+ CCIfNotSubtarget<"hasSPE()",
+ CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>>,
+ CCIfType<[f64],
+ CCIfSubtarget<"hasSPE()",
+ CCAssignToReg<[S3, S4, S5, S6, S7, S8, S9, S10]>>>,
+ CCIfType<[f32],
+ CCIfSubtarget<"hasSPE()",
+ CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>>,
// Split arguments have an alignment of 8 bytes on the stack.
CCIfType<[i32], CCIfSplit<CCAssignToStack<4, 8>>>,
@@ -165,13 +211,18 @@ def CC_PPC32_SVR4_Common : CallingConv<[
// Floats are stored in double precision format, thus they have the same
// alignment and size as doubles.
- CCIfType<[f32,f64], CCAssignToStack<8, 8>>,
+  // With SPE, floats are stored as single precision, so they have the
+  // alignment and size of an int.
+ CCIfType<[f32,f64], CCIfNotSubtarget<"hasSPE()", CCAssignToStack<8, 8>>>,
+ CCIfType<[f32], CCIfSubtarget<"hasSPE()", CCAssignToStack<4, 4>>>,
+ CCIfType<[f64], CCIfSubtarget<"hasSPE()", CCAssignToStack<8, 8>>>,
// QPX vectors that are stored in double precision need 32-byte alignment.
CCIfType<[v4f64, v4i1], CCAssignToStack<32, 32>>,
- // Vectors get 16-byte stack slots that are 16-byte aligned.
- CCIfType<[v16i8, v8i16, v4i32, v4f32, v2f64, v2i64], CCAssignToStack<16, 16>>
+ // Vectors and float128 get 16-byte stack slots that are 16-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v4f32, v2f64, v2i64], CCAssignToStack<16, 16>>,
+ CCIfType<[f128], CCIfSubtarget<"hasP9Vector()", CCAssignToStack<16, 16>>>
]>;
// This calling convention puts vector arguments always on the stack. It is used
@@ -192,6 +243,11 @@ def CC_PPC32_SVR4 : CallingConv<[
CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64],
CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2, V3, V4, V5, V6, V7,
V8, V9, V10, V11, V12, V13]>>>,
+
+ // Float128 types treated as vector arguments.
+ CCIfType<[f128],
+ CCIfSubtarget<"hasP9Vector()", CCAssignToReg<[V2, V3, V4, V5, V6, V7,
+ V8, V9, V10, V11, V12, V13]>>>,
CCDelegateTo<CC_PPC32_SVR4_Common>
]>;
@@ -227,15 +283,23 @@ def CSR_Darwin32 : CalleeSavedRegs<(add R13, R14, R15, R16, R17, R18, R19, R20,
def CSR_Darwin32_Altivec : CalleeSavedRegs<(add CSR_Darwin32, CSR_Altivec)>;
-def CSR_SVR432 : CalleeSavedRegs<(add R14, R15, R16, R17, R18, R19, R20,
- R21, R22, R23, R24, R25, R26, R27, R28,
- R29, R30, R31, F14, F15, F16, F17, F18,
+// SPE does not use FPRs, so break out the common register set as base.
+def CSR_SVR432_COMM : CalleeSavedRegs<(add R14, R15, R16, R17, R18, R19, R20,
+ R21, R22, R23, R24, R25, R26, R27,
+ R28, R29, R30, R31, CR2, CR3, CR4
+ )>;
+def CSR_SVR432 : CalleeSavedRegs<(add CSR_SVR432_COMM, F14, F15, F16, F17, F18,
F19, F20, F21, F22, F23, F24, F25, F26,
- F27, F28, F29, F30, F31, CR2, CR3, CR4
+ F27, F28, F29, F30, F31
)>;
+def CSR_SPE : CalleeSavedRegs<(add S14, S15, S16, S17, S18, S19, S20, S21, S22,
+ S23, S24, S25, S26, S27, S28, S29, S30, S31
+ )>;
def CSR_SVR432_Altivec : CalleeSavedRegs<(add CSR_SVR432, CSR_Altivec)>;
+def CSR_SVR432_SPE : CalleeSavedRegs<(add CSR_SVR432_COMM, CSR_SPE)>;
+
def CSR_Darwin64 : CalleeSavedRegs<(add X13, X14, X15, X16, X17, X18, X19, X20,
X21, X22, X23, X24, X25, X26, X27, X28,
X29, X30, X31, F14, F15, F16, F17, F18,
@@ -271,6 +335,36 @@ def CSR_SVR464_R2_Altivec_ViaCopy : CalleeSavedRegs<(add CSR_SVR464_R2_Altivec)>
def CSR_NoRegs : CalleeSavedRegs<(add)>;
+// coldcc calling convention marks most registers as non-volatile.
+// Do not include r1 since the stack pointer is never considered a CSR.
+// Do not include r2, since it is the TOC register and is added depending
+// on whether or not the function uses the TOC and is a non-leaf.
+// Do not include r0, r11 or r13 as they have optional roles in function
+// linkage and their values may be altered by inter-library calls.
+// Do not include r12 as it is used as a scratch register.
+// Do not include return registers r3, f1, v2.
+def CSR_SVR32_ColdCC : CalleeSavedRegs<(add (sequence "R%u", 4, 10),
+ (sequence "R%u", 14, 31),
+ F0, (sequence "F%u", 2, 31),
+ (sequence "CR%u", 0, 7))>;
+
+def CSR_SVR32_ColdCC_Altivec : CalleeSavedRegs<(add CSR_SVR32_ColdCC,
+ (sequence "V%u", 0, 1),
+ (sequence "V%u", 3, 31))>;
+
+def CSR_SVR64_ColdCC : CalleeSavedRegs<(add (sequence "X%u", 4, 10),
+ (sequence "X%u", 14, 31),
+ F0, (sequence "F%u", 2, 31),
+ (sequence "CR%u", 0, 7))>;
+
+def CSR_SVR64_ColdCC_R2: CalleeSavedRegs<(add CSR_SVR64_ColdCC, X2)>;
+
+def CSR_SVR64_ColdCC_Altivec : CalleeSavedRegs<(add CSR_SVR64_ColdCC,
+ (sequence "V%u", 0, 1),
+ (sequence "V%u", 3, 31))>;
+
+def CSR_SVR64_ColdCC_R2_Altivec : CalleeSavedRegs<(add CSR_SVR64_ColdCC_Altivec, X2)>;
+
def CSR_64_AllRegs: CalleeSavedRegs<(add X0, (sequence "X%u", 3, 10),
(sequence "X%u", 14, 31),
(sequence "F%u", 0, 31),
diff --git a/lib/Target/PowerPC/PPCEarlyReturn.cpp b/lib/Target/PowerPC/PPCEarlyReturn.cpp
index 1699463c0a4b..ed5e496b32fd 100644
--- a/lib/Target/PowerPC/PPCEarlyReturn.cpp
+++ b/lib/Target/PowerPC/PPCEarlyReturn.cpp
@@ -128,7 +128,7 @@ protected:
if (J->getOperand(i).isMBB() &&
J->getOperand(i).getMBB() == &ReturnMBB)
OtherReference = true;
- } else if (!J->isTerminator() && !J->isDebugValue())
+ } else if (!J->isTerminator() && !J->isDebugInstr())
break;
if (J == (*PI)->begin())
diff --git a/lib/Target/PowerPC/PPCExpandISEL.cpp b/lib/Target/PowerPC/PPCExpandISEL.cpp
index b00e98b63e34..fe41e1b36a5d 100644
--- a/lib/Target/PowerPC/PPCExpandISEL.cpp
+++ b/lib/Target/PowerPC/PPCExpandISEL.cpp
@@ -117,7 +117,7 @@ public:
/// instruction is still generated by default on targets that support them.
///
/// \return true if ISEL should be expanded into if-then-else code sequence;
- /// false if ISEL instruction should be generated, i.e. not expaned.
+ /// false if ISEL instruction should be generated, i.e. not expanded.
///
static bool isExpandISELEnabled(const MachineFunction &MF);
@@ -126,11 +126,11 @@ public:
#endif
bool runOnMachineFunction(MachineFunction &MF) override {
- DEBUG(dbgs() << "Function: "; MF.dump(); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Function: "; MF.dump(); dbgs() << "\n");
initialize(MF);
if (!collectISELInstructions()) {
- DEBUG(dbgs() << "No ISEL instructions in this function\n");
+ LLVM_DEBUG(dbgs() << "No ISEL instructions in this function\n");
return false;
}
@@ -170,9 +170,10 @@ bool PPCExpandISEL::collectISELInstructions() {
#ifndef NDEBUG
void PPCExpandISEL::DumpISELInstructions() const {
for (const auto &I : ISELInstructions) {
- DEBUG(dbgs() << printMBBReference(*MF->getBlockNumbered(I.first)) << ":\n");
+ LLVM_DEBUG(dbgs() << printMBBReference(*MF->getBlockNumbered(I.first))
+ << ":\n");
for (const auto &VI : I.second)
- DEBUG(dbgs() << " "; VI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " "; VI->print(dbgs()));
}
}
#endif
@@ -192,9 +193,10 @@ void PPCExpandISEL::expandAndMergeISELs() {
bool ExpandISELEnabled = isExpandISELEnabled(*MF);
for (auto &BlockList : ISELInstructions) {
- DEBUG(dbgs() << "Expanding ISEL instructions in "
- << printMBBReference(*MF->getBlockNumbered(BlockList.first))
- << "\n");
+ LLVM_DEBUG(
+ dbgs() << "Expanding ISEL instructions in "
+ << printMBBReference(*MF->getBlockNumbered(BlockList.first))
+ << "\n");
BlockISELList &CurrentISELList = BlockList.second;
auto I = CurrentISELList.begin();
auto E = CurrentISELList.end();
@@ -210,7 +212,8 @@ void PPCExpandISEL::expandAndMergeISELs() {
// as it would be ISEL %R0, %ZERO, %R0, %CRN.
if (useSameRegister(Dest, TrueValue) &&
useSameRegister(Dest, FalseValue)) {
- DEBUG(dbgs() << "Remove redudant ISEL instruction: " << **I << "\n");
+ LLVM_DEBUG(dbgs() << "Remove redundant ISEL instruction: " << **I
+ << "\n");
// FIXME: if the CR field used has no other uses, we could eliminate the
// instruction that defines it. This would have to be done manually
// since this pass runs too late to run DCE after it.
@@ -223,8 +226,9 @@ void PPCExpandISEL::expandAndMergeISELs() {
// condition as it would be ISEL %RX, %ZERO, %R0, %CRN, which makes it
// safe to fold ISEL to MR(OR) instead of ADDI.
MachineBasicBlock *MBB = (*I)->getParent();
- DEBUG(dbgs() << "Fold the ISEL instruction to an unconditonal copy:\n");
- DEBUG(dbgs() << "ISEL: " << **I << "\n");
+ LLVM_DEBUG(
+ dbgs() << "Fold the ISEL instruction to an unconditional copy:\n");
+ LLVM_DEBUG(dbgs() << "ISEL: " << **I << "\n");
NumFolded++;
// Note: we're using both the TrueValue and FalseValue operands so as
// not to lose the kill flag if it is set on either of them.
@@ -235,8 +239,8 @@ void PPCExpandISEL::expandAndMergeISELs() {
(*I)->eraseFromParent();
I++;
} else if (ExpandISELEnabled) { // Normal cases expansion enabled
- DEBUG(dbgs() << "Expand ISEL instructions:\n");
- DEBUG(dbgs() << "ISEL: " << **I << "\n");
+ LLVM_DEBUG(dbgs() << "Expand ISEL instructions:\n");
+ LLVM_DEBUG(dbgs() << "ISEL: " << **I << "\n");
BlockISELList SubISELList;
SubISELList.push_back(*I++);
// Collect the ISELs that can be merged together.
@@ -244,7 +248,7 @@ void PPCExpandISEL::expandAndMergeISELs() {
// may be redundant or foldable to a register copy. So we still keep
// the handleSpecialCases() downstream to handle them.
while (I != E && canMerge(SubISELList.back(), *I)) {
- DEBUG(dbgs() << "ISEL: " << **I << "\n");
+ LLVM_DEBUG(dbgs() << "ISEL: " << **I << "\n");
SubISELList.push_back(*I++);
}
@@ -264,7 +268,7 @@ void PPCExpandISEL::handleSpecialCases(BlockISELList &BIL,
auto MI = BIL.begin();
while (MI != BIL.end()) {
assert(isISEL(**MI) && "Expecting an ISEL instruction");
- DEBUG(dbgs() << "ISEL: " << **MI << "\n");
+ LLVM_DEBUG(dbgs() << "ISEL: " << **MI << "\n");
MachineOperand &Dest = (*MI)->getOperand(0);
MachineOperand &TrueValue = (*MI)->getOperand(1);
@@ -281,7 +285,7 @@ void PPCExpandISEL::handleSpecialCases(BlockISELList &BIL,
// Special case 1, all registers used by ISEL are the same one.
if (!IsADDIInstRequired && !IsORIInstRequired) {
- DEBUG(dbgs() << "Remove redudant ISEL instruction.");
+ LLVM_DEBUG(dbgs() << "Remove redundant ISEL instruction.");
// FIXME: if the CR field used has no other uses, we could eliminate the
// instruction that defines it. This would have to be done manually
// since this pass runs too late to run DCE after it.
@@ -300,7 +304,8 @@ void PPCExpandISEL::handleSpecialCases(BlockISELList &BIL,
// be zero. In this case, the useSameRegister method will return false,
// thereby preventing this ISEL from being folded.
if (useSameRegister(TrueValue, FalseValue) && (BIL.size() == 1)) {
- DEBUG(dbgs() << "Fold the ISEL instruction to an unconditonal copy.");
+ LLVM_DEBUG(
+ dbgs() << "Fold the ISEL instruction to an unconditional copy.");
NumFolded++;
// Note: we're using both the TrueValue and FalseValue operands so as
// not to lose the kill flag if it is set on either of them.
@@ -439,11 +444,10 @@ void PPCExpandISEL::populateBlocks(BlockISELList &BIL) {
// condition is false
MachineOperand &ConditionRegister = MI->getOperand(3); // Condition
- DEBUG(dbgs() << "Dest: " << Dest << "\n");
- DEBUG(dbgs() << "TrueValue: " << TrueValue << "\n");
- DEBUG(dbgs() << "FalseValue: " << FalseValue << "\n");
- DEBUG(dbgs() << "ConditionRegister: " << ConditionRegister << "\n");
-
+ LLVM_DEBUG(dbgs() << "Dest: " << Dest << "\n");
+ LLVM_DEBUG(dbgs() << "TrueValue: " << TrueValue << "\n");
+ LLVM_DEBUG(dbgs() << "FalseValue: " << FalseValue << "\n");
+ LLVM_DEBUG(dbgs() << "ConditionRegister: " << ConditionRegister << "\n");
// If the Dest Register and True Value Register are not the same one, we
// need the True Block.
diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp
index 402e29cdff72..b00655b50229 100644
--- a/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/lib/Target/PowerPC/PPCFastISel.cpp
@@ -153,7 +153,8 @@ class PPCFastISel final : public FastISel {
return RC->getID() == PPC::VSSRCRegClassID;
}
bool PPCEmitCmp(const Value *Src1Value, const Value *Src2Value,
- bool isZExt, unsigned DestReg);
+ bool isZExt, unsigned DestReg,
+ const PPC::Predicate Pred);
bool PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
const TargetRegisterClass *RC, bool IsZExt = true,
unsigned FP64LoadOpc = PPC::LFD);
@@ -206,6 +207,8 @@ CCAssignFn *PPCFastISel::usePPC32CCs(unsigned Flag) {
return CC_PPC32_SVR4_ByVal;
else if (Flag == 3)
return CC_PPC32_SVR4_VarArg;
+ else if (Flag == 4)
+ return RetCC_PPC_Cold;
else
return RetCC_PPC;
}
@@ -219,7 +222,7 @@ static Optional<PPC::Predicate> getComparePred(CmpInst::Predicate Pred) {
// result consists of 4 bits, indicating lt, eq, gt and un (unordered),
// only one of which will be set. The result is generated by fcmpu
// instruction. However, bc instruction only inspects one of the first 3
- // bits, so when un is set, bc instruction may jump to to an undesired
+ // bits, so when un is set, bc instruction may jump to an undesired
// place.
//
// More specifically, if we expect an unordered comparison and un is set, we
@@ -464,6 +467,7 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
bool IsZExt, unsigned FP64LoadOpc) {
unsigned Opc;
bool UseOffset = true;
+ bool HasSPE = PPCSubTarget->hasSPE();
// If ResultReg is given, it determines the register class of the load.
// Otherwise, RC is the register class to use. If the result of the
@@ -475,8 +479,8 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
const TargetRegisterClass *UseRC =
(ResultReg ? MRI.getRegClass(ResultReg) :
(RC ? RC :
- (VT == MVT::f64 ? &PPC::F8RCRegClass :
- (VT == MVT::f32 ? &PPC::F4RCRegClass :
+ (VT == MVT::f64 ? (HasSPE ? &PPC::SPERCRegClass : &PPC::F8RCRegClass) :
+ (VT == MVT::f32 ? (HasSPE ? &PPC::SPE4RCRegClass : &PPC::F4RCRegClass) :
(VT == MVT::i64 ? &PPC::G8RC_and_G8RC_NOX0RegClass :
&PPC::GPRC_and_GPRC_NOR0RegClass)))));
@@ -505,7 +509,7 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
UseOffset = ((Addr.Offset & 3) == 0);
break;
case MVT::f32:
- Opc = PPC::LFS;
+ Opc = PPCSubTarget->hasSPE() ? PPC::SPELWZ : PPC::LFS;
break;
case MVT::f64:
Opc = FP64LoadOpc;
@@ -576,6 +580,8 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
case PPC::LD: Opc = PPC::LDX; break;
case PPC::LFS: Opc = IsVSSRC ? PPC::LXSSPX : PPC::LFSX; break;
case PPC::LFD: Opc = IsVSFRC ? PPC::LXSDX : PPC::LFDX; break;
+ case PPC::EVLDD: Opc = PPC::EVLDDX; break;
+ case PPC::SPELWZ: Opc = PPC::SPELWZX; break;
}
auto MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
@@ -618,7 +624,8 @@ bool PPCFastISel::SelectLoad(const Instruction *I) {
AssignedReg ? MRI.getRegClass(AssignedReg) : nullptr;
unsigned ResultReg = 0;
- if (!PPCEmitLoad(VT, ResultReg, Addr, RC))
+ if (!PPCEmitLoad(VT, ResultReg, Addr, RC, true,
+ PPCSubTarget->hasSPE() ? PPC::EVLDD : PPC::LFD))
return false;
updateValueMap(I, ResultReg);
return true;
@@ -651,10 +658,10 @@ bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) {
UseOffset = ((Addr.Offset & 3) == 0);
break;
case MVT::f32:
- Opc = PPC::STFS;
+ Opc = PPCSubTarget->hasSPE() ? PPC::SPESTW : PPC::STFS;
break;
case MVT::f64:
- Opc = PPC::STFD;
+ Opc = PPCSubTarget->hasSPE() ? PPC::EVSTDD : PPC::STFD;
break;
}
@@ -719,6 +726,8 @@ bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) {
case PPC::STD: Opc = PPC::STDX; break;
case PPC::STFS: Opc = IsVSSRC ? PPC::STXSSPX : PPC::STFSX; break;
case PPC::STFD: Opc = IsVSFRC ? PPC::STXSDX : PPC::STFDX; break;
+ case PPC::EVSTDD: Opc = PPC::EVSTDDX; break;
+ case PPC::SPESTW: Opc = PPC::SPESTWX; break;
}
auto MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
@@ -792,11 +801,12 @@ bool PPCFastISel::SelectBranch(const Instruction *I) {
unsigned CondReg = createResultReg(&PPC::CRRCRegClass);
if (!PPCEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned(),
- CondReg))
+ CondReg, PPCPred))
return false;
BuildMI(*BrBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::BCC))
- .addImm(PPCPred).addReg(CondReg).addMBB(TBB);
+ .addImm(PPCSubTarget->hasSPE() ? PPC::PRED_SPE : PPCPred)
+ .addReg(CondReg).addMBB(TBB);
finishCondBranch(BI->getParent(), TBB, FBB);
return true;
}
@@ -820,7 +830,8 @@ bool PPCFastISel::SelectBranch(const Instruction *I) {
// Attempt to emit a compare of the two source values. Signed and unsigned
// comparisons are supported. Return false if we can't handle it.
bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2,
- bool IsZExt, unsigned DestReg) {
+ bool IsZExt, unsigned DestReg,
+ const PPC::Predicate Pred) {
Type *Ty = SrcValue1->getType();
EVT SrcEVT = TLI.getValueType(DL, Ty, true);
if (!SrcEVT.isSimple())
@@ -836,6 +847,7 @@ bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2,
// similar to ARM in this regard.
long Imm = 0;
bool UseImm = false;
+ const bool HasSPE = PPCSubTarget->hasSPE();
// Only 16-bit integer constants can be represented in compares for
// PowerPC. Others will be materialized into a register.
@@ -854,10 +866,38 @@ bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2,
switch (SrcVT.SimpleTy) {
default: return false;
case MVT::f32:
- CmpOpc = PPC::FCMPUS;
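+      // SPE provides a separate compare opcode per predicate; only EQ, LT and
+      // GT are handled here and in the f64 case below, any other predicate
+      // returns false so the comparison falls back to SelectionDAG.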
+ if (HasSPE) {
+ switch (Pred) {
+ default: return false;
+ case PPC::PRED_EQ:
+ CmpOpc = PPC::EFSCMPEQ;
+ break;
+ case PPC::PRED_LT:
+ CmpOpc = PPC::EFSCMPLT;
+ break;
+ case PPC::PRED_GT:
+ CmpOpc = PPC::EFSCMPGT;
+ break;
+ }
+ } else
+ CmpOpc = PPC::FCMPUS;
break;
case MVT::f64:
- CmpOpc = PPC::FCMPUD;
+ if (HasSPE) {
+ switch (Pred) {
+ default: return false;
+ case PPC::PRED_EQ:
+ CmpOpc = PPC::EFDCMPEQ;
+ break;
+ case PPC::PRED_LT:
+ CmpOpc = PPC::EFDCMPLT;
+ break;
+ case PPC::PRED_GT:
+ CmpOpc = PPC::EFDCMPGT;
+ break;
+ }
+ } else
+ CmpOpc = PPC::FCMPUD;
break;
case MVT::i1:
case MVT::i8:
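(An aside on the new Pred parameter: the SPE efscmp*/efdcmp* family only implements the eq/lt/gt tests, so PPCEmitCmp has to know the desired predicate before it can pick an opcode. A minimal sketch of the same selection, with a hypothetical helper name, not part of the patch:)

  static unsigned selectSPEFPCmpOpc(MVT VT, PPC::Predicate Pred) {
    const bool IsSingle = (VT == MVT::f32);
    switch (Pred) {
    case PPC::PRED_EQ: return IsSingle ? PPC::EFSCMPEQ : PPC::EFDCMPEQ;
    case PPC::PRED_LT: return IsSingle ? PPC::EFSCMPLT : PPC::EFDCMPLT;
    case PPC::PRED_GT: return IsSingle ? PPC::EFSCMPGT : PPC::EFDCMPGT;
    default:           return 0; // anything else punts back to SelectionDAG
    }
  }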
@@ -945,9 +985,19 @@ bool PPCFastISel::SelectFPTrunc(const Instruction *I) {
return false;
// Round the result to single precision.
- unsigned DestReg = createResultReg(&PPC::F4RCRegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::FRSP), DestReg)
- .addReg(SrcReg);
+ unsigned DestReg;
+
+ if (PPCSubTarget->hasSPE()) {
+ DestReg = createResultReg(&PPC::SPE4RCRegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(PPC::EFSCFD), DestReg)
+ .addReg(SrcReg);
+ } else {
+ DestReg = createResultReg(&PPC::F4RCRegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(PPC::FRSP), DestReg)
+ .addReg(SrcReg);
+ }
updateValueMap(I, DestReg);
return true;
@@ -1029,6 +1079,22 @@ bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) {
if (SrcReg == 0)
return false;
+  // Shortcut for SPE. No store/load is needed; the value is already in GPRs.
+ if (PPCSubTarget->hasSPE()) {
+ unsigned Opc;
+ if (DstVT == MVT::f32)
+ Opc = IsSigned ? PPC::EFSCFSI : PPC::EFSCFUI;
+ else
+ Opc = IsSigned ? PPC::EFDCFSI : PPC::EFDCFUI;
+
+ unsigned DestReg = createResultReg(&PPC::SPERCRegClass);
+ // Generate the convert.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
+ .addReg(SrcReg);
+ updateValueMap(I, DestReg);
+ return true;
+ }
+
// We can only lower an unsigned convert if we have the newer
// floating-point conversion operations.
if (!IsSigned && !PPCSubTarget->hasFPCVT())
@@ -1123,8 +1189,9 @@ bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) {
if (DstVT != MVT::i32 && DstVT != MVT::i64)
return false;
- // If we don't have FCTIDUZ and we need it, punt to SelectionDAG.
- if (DstVT == MVT::i64 && !IsSigned && !PPCSubTarget->hasFPCVT())
+ // If we don't have FCTIDUZ, or SPE, and we need it, punt to SelectionDAG.
+ if (DstVT == MVT::i64 && !IsSigned &&
+ !PPCSubTarget->hasFPCVT() && !PPCSubTarget->hasSPE())
return false;
Value *Src = I->getOperand(0);
@@ -1152,23 +1219,34 @@ bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) {
// Determine the opcode for the conversion, which takes place
// entirely within FPRs.
- unsigned DestReg = createResultReg(&PPC::F8RCRegClass);
+ unsigned DestReg;
unsigned Opc;
- if (DstVT == MVT::i32)
+ if (PPCSubTarget->hasSPE()) {
+ DestReg = createResultReg(&PPC::GPRCRegClass);
if (IsSigned)
- Opc = PPC::FCTIWZ;
+ Opc = InRC == &PPC::SPE4RCRegClass ? PPC::EFSCTSIZ : PPC::EFDCTSIZ;
else
- Opc = PPCSubTarget->hasFPCVT() ? PPC::FCTIWUZ : PPC::FCTIDZ;
- else
- Opc = IsSigned ? PPC::FCTIDZ : PPC::FCTIDUZ;
+ Opc = InRC == &PPC::SPE4RCRegClass ? PPC::EFSCTUIZ : PPC::EFDCTUIZ;
+ } else {
+ DestReg = createResultReg(&PPC::F8RCRegClass);
+ if (DstVT == MVT::i32)
+ if (IsSigned)
+ Opc = PPC::FCTIWZ;
+ else
+ Opc = PPCSubTarget->hasFPCVT() ? PPC::FCTIWUZ : PPC::FCTIDZ;
+ else
+ Opc = IsSigned ? PPC::FCTIDZ : PPC::FCTIDUZ;
+ }
// Generate the convert.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
.addReg(SrcReg);
// Now move the integer value from a float register to an integer register.
- unsigned IntReg = PPCMoveToIntReg(I, DstVT, DestReg, IsSigned);
+ unsigned IntReg = PPCSubTarget->hasSPE() ? DestReg :
+ PPCMoveToIntReg(I, DstVT, DestReg, IsSigned);
+
if (IntReg == 0)
return false;
@@ -1916,8 +1994,13 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) {
unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
assert(Align > 0 && "Unexpectedly missing alignment information!");
unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align);
- const TargetRegisterClass *RC =
- (VT == MVT::f32) ? &PPC::F4RCRegClass : &PPC::F8RCRegClass;
+ const bool HasSPE = PPCSubTarget->hasSPE();
+ const TargetRegisterClass *RC;
+ if (HasSPE)
+ RC = ((VT == MVT::f32) ? &PPC::SPE4RCRegClass : &PPC::SPERCRegClass);
+ else
+ RC = ((VT == MVT::f32) ? &PPC::F4RCRegClass : &PPC::F8RCRegClass);
+
unsigned DestReg = createResultReg(RC);
CodeModel::Model CModel = TM.getCodeModel();
@@ -1925,7 +2008,13 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) {
MachinePointerInfo::getConstantPool(*FuncInfo.MF),
MachineMemOperand::MOLoad, (VT == MVT::f32) ? 4 : 8, Align);
- unsigned Opc = (VT == MVT::f32) ? PPC::LFS : PPC::LFD;
+ unsigned Opc;
+
+ if (HasSPE)
+ Opc = ((VT == MVT::f32) ? PPC::SPELWZ : PPC::EVLDD);
+ else
+ Opc = ((VT == MVT::f32) ? PPC::LFS : PPC::LFD);
+
unsigned TmpReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
PPCFuncInfo->setUsesTOCBasePtr();
@@ -2261,7 +2350,8 @@ bool PPCFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
unsigned ResultReg = MI->getOperand(0).getReg();
- if (!PPCEmitLoad(VT, ResultReg, Addr, nullptr, IsZExt))
+ if (!PPCEmitLoad(VT, ResultReg, Addr, nullptr, IsZExt,
+ PPCSubTarget->hasSPE() ? PPC::EVLDD : PPC::LFD))
return false;
MI->eraseFromParent();
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index 7902da20a010..f0000c5bafd7 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -173,7 +173,27 @@ const PPCFrameLowering::SpillSlot *PPCFrameLowering::getCalleeSavedSpillSlots(
{PPC::V23, -144},
{PPC::V22, -160},
{PPC::V21, -176},
- {PPC::V20, -192}};
+ {PPC::V20, -192},
+
+ // SPE register save area (overlaps Vector save area).
+ {PPC::S31, -8},
+ {PPC::S30, -16},
+ {PPC::S29, -24},
+ {PPC::S28, -32},
+ {PPC::S27, -40},
+ {PPC::S26, -48},
+ {PPC::S25, -56},
+ {PPC::S24, -64},
+ {PPC::S23, -72},
+ {PPC::S22, -80},
+ {PPC::S21, -88},
+ {PPC::S20, -96},
+ {PPC::S19, -104},
+ {PPC::S18, -112},
+ {PPC::S17, -120},
+ {PPC::S16, -128},
+ {PPC::S15, -136},
+ {PPC::S14, -144}};
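(A quick arithmetic check on the new entries, illustrative only: S14-S31 are 18 registers at 8 bytes apiece, so the slots run from -8 for S31 down to -18 * 8 = -144 for S14. That 144-byte range lies within the 192-byte region already claimed by the V20-V31 entries, which is why it can safely be described as an overlap: Altivec and SPE are never enabled together.)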
static const SpillSlot Offsets64[] = {
// Floating-point register save area offsets.
@@ -1615,7 +1635,7 @@ void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF,
}
// Make sure we don't explicitly spill r31, because, for example, we have
- // some inline asm which explicity clobbers it, when we otherwise have a
+ // some inline asm which explicitly clobbers it, when we otherwise have a
// frame pointer and are using r31's spill slot for the prologue/epilogue
// code. Same goes for the base pointer and the PIC base register.
if (needsFP(MF))
@@ -1676,7 +1696,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
unsigned MinGPR = PPC::R31;
unsigned MinG8R = PPC::X31;
unsigned MinFPR = PPC::F31;
- unsigned MinVR = PPC::V31;
+ unsigned MinVR = Subtarget.hasSPE() ? PPC::S31 : PPC::V31;
bool HasGPSaveArea = false;
bool HasG8SaveArea = false;
@@ -1691,7 +1711,8 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
unsigned Reg = CSI[i].getReg();
- if (PPC::GPRCRegClass.contains(Reg)) {
+ if (PPC::GPRCRegClass.contains(Reg) ||
+ PPC::SPE4RCRegClass.contains(Reg)) {
HasGPSaveArea = true;
GPRegs.push_back(CSI[i]);
@@ -1720,7 +1741,10 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
; // do nothing, as we already know whether CRs are spilled
} else if (PPC::VRSAVERCRegClass.contains(Reg)) {
HasVRSAVESaveArea = true;
- } else if (PPC::VRRCRegClass.contains(Reg)) {
+ } else if (PPC::VRRCRegClass.contains(Reg) ||
+ PPC::SPERCRegClass.contains(Reg)) {
+ // Altivec and SPE are mutually exclusive, but have the same stack
+ // alignment requirements, so overload the save area for both cases.
HasVRSaveArea = true;
VRegs.push_back(CSI[i]);
@@ -1863,8 +1887,10 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
LowerBound -= 4; // The VRSAVE save area is always 4 bytes long.
}
+ // Both Altivec and SPE have the same alignment and padding requirements
+ // within the stack frame.
if (HasVRSaveArea) {
- // Insert alignment padding, we need 16-byte alignment. Note: for postive
+ // Insert alignment padding, we need 16-byte alignment. Note: for positive
// number the alignment formula is : y = (x + (n-1)) & (~(n-1)). But since
// we are using negative number here (the stack grows downward). We should
// use formula : y = x & (~(n-1)). Where x is the size before aligning, n
@@ -1950,7 +1976,14 @@ PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
bool IsCRField = PPC::CR2 <= Reg && Reg <= PPC::CR4;
// Add the callee-saved register as live-in; it's killed at the spill.
- MBB.addLiveIn(Reg);
+    // Do not do this for callee-saved registers that are live-in to the
+    // function, because they will already have been marked live-in and
+    // doing so would add them a second time. It is an error to add the
+    // same register to the set more than once.
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ bool IsLiveIn = MRI.isLiveIn(Reg);
+ if (!IsLiveIn)
+ MBB.addLiveIn(Reg);
if (CRSpilled && IsCRField) {
CRMIB.addReg(Reg, RegState::ImplicitKill);
@@ -1980,7 +2013,10 @@ PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
}
} else {
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- TII.storeRegToStackSlot(MBB, MI, Reg, true,
+      // Use !IsLiveIn for the kill flag.
+      // We do not want to kill registers that are live into this function
+      // before their first use, because killing them here would leave them
+      // undefined.
+ TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn,
CSI[i].getFrameIdx(), RC, TRI);
}
}
@@ -2149,6 +2185,8 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
}
bool PPCFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
+ if (MF.getInfo<PPCFunctionInfo>()->shrinkWrapDisabled())
+ return false;
return (MF.getSubtarget<PPCSubtarget>().isSVR4ABI() &&
MF.getSubtarget<PPCSubtarget>().isPPC64());
}
diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h
index f845d5a9ac64..01c155594c44 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.h
+++ b/lib/Target/PowerPC/PPCFrameLowering.h
@@ -30,7 +30,7 @@ class PPCFrameLowering: public TargetFrameLowering {
const unsigned BasePointerSaveOffset;
/**
- * \brief Find register[s] that can be used in function prologue and epilogue
+ * Find register[s] that can be used in function prologue and epilogue
*
* Find register[s] that can be use as scratch register[s] in function
* prologue and epilogue to save various registers (Link Register, Base
@@ -67,7 +67,7 @@ class PPCFrameLowering: public TargetFrameLowering {
bool twoUniqueScratchRegsRequired(MachineBasicBlock *MBB) const;
/**
- * \brief Create branch instruction for PPC::TCRETURN* (tail call return)
+ * Create branch instruction for PPC::TCRETURN* (tail call return)
*
* \param[in] MBB that is terminated by PPC::TCRETURN*
*/
diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
index f327396370f6..551220466901 100644
--- a/lib/Target/PowerPC/PPCHazardRecognizers.cpp
+++ b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
@@ -180,9 +180,9 @@ void PPCDispatchGroupSBHazardRecognizer::EmitInstruction(SUnit *SU) {
CurGroup.clear();
CurSlots = CurBranches = 0;
} else {
- DEBUG(dbgs() << "**** Adding to dispatch group: SU(" <<
- SU->NodeNum << "): ");
- DEBUG(DAG->dumpNode(SU));
+ LLVM_DEBUG(dbgs() << "**** Adding to dispatch group: SU(" << SU->NodeNum
+ << "): ");
+ LLVM_DEBUG(DAG->dumpNode(SU));
unsigned NSlots;
bool MustBeFirst = mustComeFirst(MCID, NSlots);
@@ -268,7 +268,7 @@ PPCHazardRecognizer970::PPCHazardRecognizer970(const ScheduleDAG &DAG)
}
void PPCHazardRecognizer970::EndDispatchGroup() {
- DEBUG(errs() << "=== Start of dispatch group\n");
+ LLVM_DEBUG(errs() << "=== Start of dispatch group\n");
NumIssued = 0;
// Structural hazard info.
@@ -330,7 +330,7 @@ getHazardType(SUnit *SU, int Stalls) {
MachineInstr *MI = SU->getInstr();
- if (MI->isDebugValue())
+ if (MI->isDebugInstr())
return NoHazard;
unsigned Opcode = MI->getOpcode();
@@ -388,7 +388,7 @@ getHazardType(SUnit *SU, int Stalls) {
void PPCHazardRecognizer970::EmitInstruction(SUnit *SU) {
MachineInstr *MI = SU->getInstr();
- if (MI->isDebugValue())
+ if (MI->isDebugInstr())
return;
unsigned Opcode = MI->getOpcode();
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index d3a223fe03e0..6cec664d1e66 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -32,7 +32,6 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -53,6 +52,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
@@ -101,6 +101,11 @@ static cl::opt<bool> EnableBranchHint(
cl::desc("Enable static hinting of branches on ppc"),
cl::Hidden);
+static cl::opt<bool> EnableTLSOpt(
+ "ppc-tls-opt", cl::init(true),
+ cl::desc("Enable tls optimization peephole"),
+ cl::Hidden);
+
enum ICmpInGPRType { ICGPR_All, ICGPR_None, ICGPR_I32, ICGPR_I64,
ICGPR_NonExtIn, ICGPR_Zext, ICGPR_Sext, ICGPR_ZextI32,
ICGPR_SextI32, ICGPR_ZextI64, ICGPR_SextI64 };
@@ -199,6 +204,14 @@ namespace {
bool tryBitPermutation(SDNode *N);
bool tryIntCompareInGPR(SDNode *N);
+ // tryTLSXFormLoad - Convert an ISD::LOAD fed by a PPCISD::ADD_TLS into
+ // an X-Form load instruction with the offset being a relocation coming from
+ // the PPCISD::ADD_TLS.
+ bool tryTLSXFormLoad(LoadSDNode *N);
+ // tryTLSXFormStore - Convert an ISD::STORE fed by a PPCISD::ADD_TLS into
+ // an X-Form store instruction with the offset being a relocation coming from
+ // the PPCISD::ADD_TLS.
+ bool tryTLSXFormStore(StoreSDNode *N);
/// SelectCC - Select a comparison of the specified values with the
/// specified condition code, returning the CR# of the expression.
SDValue SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC,
@@ -314,6 +327,7 @@ private:
bool isOffsetMultipleOf(SDNode *N, unsigned Val) const;
void transferMemOperands(SDNode *N, SDNode *Result);
+ MachineSDNode *flipSignBit(const SDValue &N, SDNode **SignBit = nullptr);
};
} // end anonymous namespace
@@ -417,6 +431,16 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg);
}
} else {
+ // We must ensure that this sequence is dominated by the prologue.
+ // FIXME: This is a bit of a big hammer since we don't get the benefits
+ // of shrink-wrapping whenever we emit this instruction. Considering
+ // this is used in any function where we emit a jump table, this may be
+ // a significant limitation. We should consider inserting this in the
+ // block where it is used and then commoning this sequence up if it
+ // appears in multiple places.
+ // Note: on ISA 3.0 cores, we can use lnia (addpcis) instead of
+ // MovePCtoLR8.
+ MF->getInfo<PPCFunctionInfo>()->setShrinkWrapDisabled(true);
GlobalBaseReg = RegInfo->createVirtualRegister(&PPC::G8RC_and_G8RC_NOX0RegClass);
BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR8));
BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR8), GlobalBaseReg);
@@ -494,10 +518,10 @@ static unsigned getBranchHint(unsigned PCC, FunctionLoweringInfo *FuncInfo,
if (std::max(TProb, FProb) / Threshold < std::min(TProb, FProb))
return PPC::BR_NO_HINT;
- DEBUG(dbgs() << "Use branch hint for '" << FuncInfo->Fn->getName() << "::"
- << BB->getName() << "'\n"
- << " -> " << TBB->getName() << ": " << TProb << "\n"
- << " -> " << FBB->getName() << ": " << FProb << "\n");
+ LLVM_DEBUG(dbgs() << "Use branch hint for '" << FuncInfo->Fn->getName()
+ << "::" << BB->getName() << "'\n"
+ << " -> " << TBB->getName() << ": " << TProb << "\n"
+ << " -> " << FBB->getName() << ": " << FProb << "\n");
const BasicBlockSDNode *BBDN = cast<BasicBlockSDNode>(DestMBB);
@@ -572,6 +596,90 @@ bool PPCDAGToDAGISel::isRotateAndMask(SDNode *N, unsigned Mask,
return false;
}
+bool PPCDAGToDAGISel::tryTLSXFormStore(StoreSDNode *ST) {
+ SDValue Base = ST->getBasePtr();
+ if (Base.getOpcode() != PPCISD::ADD_TLS)
+ return false;
+ SDValue Offset = ST->getOffset();
+ if (!Offset.isUndef())
+ return false;
+
+ SDLoc dl(ST);
+ EVT MemVT = ST->getMemoryVT();
+ EVT RegVT = ST->getValue().getValueType();
+
+ unsigned Opcode;
+ switch (MemVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8: {
+ Opcode = (RegVT == MVT::i32) ? PPC::STBXTLS_32 : PPC::STBXTLS;
+ break;
+ }
+ case MVT::i16: {
+ Opcode = (RegVT == MVT::i32) ? PPC::STHXTLS_32 : PPC::STHXTLS;
+ break;
+ }
+ case MVT::i32: {
+ Opcode = (RegVT == MVT::i32) ? PPC::STWXTLS_32 : PPC::STWXTLS;
+ break;
+ }
+ case MVT::i64: {
+ Opcode = PPC::STDXTLS;
+ break;
+ }
+ }
+ SDValue Chain = ST->getChain();
+ SDVTList VTs = ST->getVTList();
+ SDValue Ops[] = {ST->getValue(), Base.getOperand(0), Base.getOperand(1),
+ Chain};
+ SDNode *MN = CurDAG->getMachineNode(Opcode, dl, VTs, Ops);
+ transferMemOperands(ST, MN);
+ ReplaceNode(ST, MN);
+ return true;
+}
+
+bool PPCDAGToDAGISel::tryTLSXFormLoad(LoadSDNode *LD) {
+ SDValue Base = LD->getBasePtr();
+ if (Base.getOpcode() != PPCISD::ADD_TLS)
+ return false;
+ SDValue Offset = LD->getOffset();
+ if (!Offset.isUndef())
+ return false;
+
+ SDLoc dl(LD);
+ EVT MemVT = LD->getMemoryVT();
+ EVT RegVT = LD->getValueType(0);
+ unsigned Opcode;
+ switch (MemVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8: {
+ Opcode = (RegVT == MVT::i32) ? PPC::LBZXTLS_32 : PPC::LBZXTLS;
+ break;
+ }
+ case MVT::i16: {
+ Opcode = (RegVT == MVT::i32) ? PPC::LHZXTLS_32 : PPC::LHZXTLS;
+ break;
+ }
+ case MVT::i32: {
+ Opcode = (RegVT == MVT::i32) ? PPC::LWZXTLS_32 : PPC::LWZXTLS;
+ break;
+ }
+ case MVT::i64: {
+ Opcode = PPC::LDXTLS;
+ break;
+ }
+ }
+ SDValue Chain = LD->getChain();
+ SDVTList VTs = LD->getVTList();
+ SDValue Ops[] = {Base.getOperand(0), Base.getOperand(1), Chain};
+ SDNode *MN = CurDAG->getMachineNode(Opcode, dl, VTs, Ops);
+ transferMemOperands(LD, MN);
+ ReplaceNode(LD, MN);
+ return true;
+}
+
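(A rough illustration of what the two helpers above do, in sketch DAG notation rather than real compiler output: a memory access whose address is a PPCISD::ADD_TLS is collapsed into one X-form node that takes both ADD_TLS operands, so the explicit add disappears.)

  t1 = PPCISD::ADD_TLS t0, TargetGlobalTLSAddress:sym
  t2 = load t1
      ==>
  t2 = LWZXTLS t0, TargetGlobalTLSAddress:sym    ; or the LBZ/LHZ/LD and STx variants by size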
/// Turn an or of two masked values into the rotate left word immediate then
/// mask insert (rlwimi) instruction.
bool PPCDAGToDAGISel::tryBitfieldInsert(SDNode *N) {
@@ -1023,8 +1131,8 @@ class BitPermutationSelector {
BitGroup(SDValue V, unsigned R, unsigned S, unsigned E)
: V(V), RLAmt(R), StartIdx(S), EndIdx(E), Repl32(false), Repl32CR(false),
Repl32Coalesced(false) {
- DEBUG(dbgs() << "\tbit group for " << V.getNode() << " RLAmt = " << R <<
- " [" << S << ", " << E << "]\n");
+ LLVM_DEBUG(dbgs() << "\tbit group for " << V.getNode() << " RLAmt = " << R
+ << " [" << S << ", " << E << "]\n");
}
};
@@ -1053,6 +1161,10 @@ class BitPermutationSelector {
return true;
else if (NumGroups < Other.NumGroups)
return false;
+ else if (RLAmt == 0 && Other.RLAmt != 0)
+ return true;
+ else if (RLAmt != 0 && Other.RLAmt == 0)
+ return false;
else if (FirstGroupStartIdx < Other.FirstGroupStartIdx)
return true;
return false;
@@ -1180,7 +1292,7 @@ class BitPermutationSelector {
Bits[i] = ValueBit(ValueBit::ConstZero);
return std::make_pair(Interesting, &Bits);
- }
+ }
}
for (unsigned i = 0; i < NumBits; ++i)
@@ -1258,7 +1370,7 @@ class BitPermutationSelector {
BitGroups[BitGroups.size()-1].EndIdx == Bits.size()-1 &&
BitGroups[0].V == BitGroups[BitGroups.size()-1].V &&
BitGroups[0].RLAmt == BitGroups[BitGroups.size()-1].RLAmt) {
- DEBUG(dbgs() << "\tcombining final bit group with initial one\n");
+ LLVM_DEBUG(dbgs() << "\tcombining final bit group with initial one\n");
BitGroups[BitGroups.size()-1].EndIdx = BitGroups[0].EndIdx;
BitGroups.erase(BitGroups.begin());
}
@@ -1266,7 +1378,9 @@ class BitPermutationSelector {
}
// Take all (SDValue, RLAmt) pairs and sort them by the number of groups
- // associated with each. If there is a degeneracy, pick the one that occurs
+  // associated with each. If the number of groups is the same, we prefer a
+  // group that does not require a rotate (i.e. RLAmt is 0), to avoid the
+  // first rotate instruction. If there is a degeneracy, pick the one that occurs
// first (in the final value).
void collectValueRotInfo() {
ValueRots.clear();
@@ -1287,7 +1401,7 @@ class BitPermutationSelector {
for (auto &I : ValueRots) {
ValueRotsVec.push_back(I.second);
}
- std::sort(ValueRotsVec.begin(), ValueRotsVec.end());
+ llvm::sort(ValueRotsVec.begin(), ValueRotsVec.end());
}
// In 64-bit mode, rlwinm and friends have a rotation operator that
@@ -1336,6 +1450,20 @@ class BitPermutationSelector {
};
for (auto &BG : BitGroups) {
+      // If this bit group has an RLAmt of 0 and will not be merged with
+      // another bit group, we don't benefit from Repl32. We don't mark
+      // such a group, to give more freedom to later instruction selection.
+ if (BG.RLAmt == 0) {
+ auto PotentiallyMerged = [this](BitGroup & BG) {
+ for (auto &BG2 : BitGroups)
+ if (&BG != &BG2 && BG.V == BG2.V &&
+ (BG2.RLAmt == 0 || BG2.RLAmt == 32))
+ return true;
+ return false;
+ };
+ if (!PotentiallyMerged(BG))
+ continue;
+ }
if (BG.StartIdx < 32 && BG.EndIdx < 32) {
if (IsAllLow32(BG)) {
if (BG.RLAmt >= 32) {
@@ -1345,9 +1473,9 @@ class BitPermutationSelector {
BG.Repl32 = true;
- DEBUG(dbgs() << "\t32-bit replicated bit group for " <<
- BG.V.getNode() << " RLAmt = " << BG.RLAmt <<
- " [" << BG.StartIdx << ", " << BG.EndIdx << "]\n");
+ LLVM_DEBUG(dbgs() << "\t32-bit replicated bit group for "
+ << BG.V.getNode() << " RLAmt = " << BG.RLAmt << " ["
+ << BG.StartIdx << ", " << BG.EndIdx << "]\n");
}
}
}
@@ -1361,11 +1489,11 @@ class BitPermutationSelector {
if (I->Repl32 && IP->Repl32 && I->V == IP->V && I->RLAmt == IP->RLAmt &&
I->StartIdx == (IP->EndIdx + 1) % 64 && I != IP) {
- DEBUG(dbgs() << "\tcombining 32-bit replicated bit group for " <<
- I->V.getNode() << " RLAmt = " << I->RLAmt <<
- " [" << I->StartIdx << ", " << I->EndIdx <<
- "] with group with range [" <<
- IP->StartIdx << ", " << IP->EndIdx << "]\n");
+ LLVM_DEBUG(dbgs() << "\tcombining 32-bit replicated bit group for "
+ << I->V.getNode() << " RLAmt = " << I->RLAmt << " ["
+ << I->StartIdx << ", " << I->EndIdx
+ << "] with group with range [" << IP->StartIdx << ", "
+ << IP->EndIdx << "]\n");
IP->EndIdx = I->EndIdx;
IP->Repl32CR = IP->Repl32CR || I->Repl32CR;
@@ -1389,12 +1517,12 @@ class BitPermutationSelector {
IP->EndIdx == 31 && IN->StartIdx == 0 && I != IP &&
IsAllLow32(*I)) {
- DEBUG(dbgs() << "\tcombining bit group for " <<
- I->V.getNode() << " RLAmt = " << I->RLAmt <<
- " [" << I->StartIdx << ", " << I->EndIdx <<
- "] with 32-bit replicated groups with ranges [" <<
- IP->StartIdx << ", " << IP->EndIdx << "] and [" <<
- IN->StartIdx << ", " << IN->EndIdx << "]\n");
+ LLVM_DEBUG(dbgs() << "\tcombining bit group for " << I->V.getNode()
+ << " RLAmt = " << I->RLAmt << " [" << I->StartIdx
+ << ", " << I->EndIdx
+ << "] with 32-bit replicated groups with ranges ["
+ << IP->StartIdx << ", " << IP->EndIdx << "] and ["
+ << IN->StartIdx << ", " << IN->EndIdx << "]\n");
if (IP == IN) {
// There is only one other group; change it to cover the whole
@@ -1503,15 +1631,15 @@ class BitPermutationSelector {
(unsigned) (ANDIMask != 0 && ANDISMask != 0) +
(unsigned) (bool) Res;
- DEBUG(dbgs() << "\t\trotation groups for " << VRI.V.getNode() <<
- " RL: " << VRI.RLAmt << ":" <<
- "\n\t\t\tisel using masking: " << NumAndInsts <<
- " using rotates: " << VRI.NumGroups << "\n");
+ LLVM_DEBUG(dbgs() << "\t\trotation groups for " << VRI.V.getNode()
+ << " RL: " << VRI.RLAmt << ":"
+ << "\n\t\t\tisel using masking: " << NumAndInsts
+ << " using rotates: " << VRI.NumGroups << "\n");
if (NumAndInsts >= VRI.NumGroups)
continue;
- DEBUG(dbgs() << "\t\t\t\tusing masking\n");
+ LLVM_DEBUG(dbgs() << "\t\t\t\tusing masking\n");
if (InstCnt) *InstCnt += NumAndInsts;
@@ -1859,10 +1987,10 @@ class BitPermutationSelector {
FirstBG = false;
}
- DEBUG(dbgs() << "\t\trotation groups for " << VRI.V.getNode() <<
- " RL: " << VRI.RLAmt << (VRI.Repl32 ? " (32):" : ":") <<
- "\n\t\t\tisel using masking: " << NumAndInsts <<
- " using rotates: " << NumRLInsts << "\n");
+ LLVM_DEBUG(dbgs() << "\t\trotation groups for " << VRI.V.getNode()
+ << " RL: " << VRI.RLAmt << (VRI.Repl32 ? " (32):" : ":")
+ << "\n\t\t\tisel using masking: " << NumAndInsts
+ << " using rotates: " << NumRLInsts << "\n");
// When we'd use andi/andis, we bias toward using the rotates (andi only
// has a record form, and is cracked on POWER cores). However, when using
@@ -1876,7 +2004,7 @@ class BitPermutationSelector {
if ((Use32BitInsts || MoreBG) && NumAndInsts == NumRLInsts)
continue;
- DEBUG(dbgs() << "\t\t\t\tusing masking\n");
+ LLVM_DEBUG(dbgs() << "\t\t\t\tusing masking\n");
if (InstCnt) *InstCnt += NumAndInsts;
@@ -2127,9 +2255,9 @@ public:
return nullptr;
Bits = std::move(*Result.second);
- DEBUG(dbgs() << "Considering bit-permutation-based instruction"
- " selection for: ");
- DEBUG(N->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "Considering bit-permutation-based instruction"
+ " selection for: ");
+ LLVM_DEBUG(N->dump(CurDAG));
// Fill in RLAmt and set HasZeros.
computeRotationAmounts();
@@ -2145,22 +2273,22 @@ public:
// set of bit groups, and then mask in the zeros at the end. With early
// masking, we only insert the non-zero parts of the result at every step.
- unsigned InstCnt, InstCntLateMask;
- DEBUG(dbgs() << "\tEarly masking:\n");
+ unsigned InstCnt = 0, InstCntLateMask = 0;
+ LLVM_DEBUG(dbgs() << "\tEarly masking:\n");
SDNode *RN = Select(N, false, &InstCnt);
- DEBUG(dbgs() << "\t\tisel would use " << InstCnt << " instructions\n");
+ LLVM_DEBUG(dbgs() << "\t\tisel would use " << InstCnt << " instructions\n");
- DEBUG(dbgs() << "\tLate masking:\n");
+ LLVM_DEBUG(dbgs() << "\tLate masking:\n");
SDNode *RNLM = Select(N, true, &InstCntLateMask);
- DEBUG(dbgs() << "\t\tisel would use " << InstCntLateMask <<
- " instructions\n");
+ LLVM_DEBUG(dbgs() << "\t\tisel would use " << InstCntLateMask
+ << " instructions\n");
if (InstCnt <= InstCntLateMask) {
- DEBUG(dbgs() << "\tUsing early-masking for isel\n");
+ LLVM_DEBUG(dbgs() << "\tUsing early-masking for isel\n");
return RN;
}
- DEBUG(dbgs() << "\tUsing late-masking for isel\n");
+ LLVM_DEBUG(dbgs() << "\tUsing late-masking for isel\n");
return RNLM;
}
};
@@ -3288,7 +3416,7 @@ static bool allUsesExtend(SDValue Compare, SelectionDAG *CurDAG) {
}
/// Returns an equivalent of a SETCC node but with the result the same width as
-/// the inputs. This can nalso be used for SELECT_CC if either the true or false
+/// the inputs. This can also be used for SELECT_CC if either the true or false
/// values is a power of two while the other is zero.
SDValue IntegerCompareEliminator::getSETCCInGPR(SDValue Compare,
SetccInGPROpts ConvOpts) {
@@ -3488,10 +3616,63 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC,
Opc = PPC::CMPD;
}
} else if (LHS.getValueType() == MVT::f32) {
- Opc = PPC::FCMPUS;
+ if (PPCSubTarget->hasSPE()) {
+ switch (CC) {
+ default:
+ case ISD::SETEQ:
+ case ISD::SETNE:
+ Opc = PPC::EFSCMPEQ;
+ break;
+ case ISD::SETLT:
+ case ISD::SETGE:
+ case ISD::SETOLT:
+ case ISD::SETOGE:
+ case ISD::SETULT:
+ case ISD::SETUGE:
+ Opc = PPC::EFSCMPLT;
+ break;
+ case ISD::SETGT:
+ case ISD::SETLE:
+ case ISD::SETOGT:
+ case ISD::SETOLE:
+ case ISD::SETUGT:
+ case ISD::SETULE:
+ Opc = PPC::EFSCMPGT;
+ break;
+ }
+ } else
+ Opc = PPC::FCMPUS;
+ } else if (LHS.getValueType() == MVT::f64) {
+ if (PPCSubTarget->hasSPE()) {
+ switch (CC) {
+ default:
+ case ISD::SETEQ:
+ case ISD::SETNE:
+ Opc = PPC::EFDCMPEQ;
+ break;
+ case ISD::SETLT:
+ case ISD::SETGE:
+ case ISD::SETOLT:
+ case ISD::SETOGE:
+ case ISD::SETULT:
+ case ISD::SETUGE:
+ Opc = PPC::EFDCMPLT;
+ break;
+ case ISD::SETGT:
+ case ISD::SETLE:
+ case ISD::SETOGT:
+ case ISD::SETOLE:
+ case ISD::SETUGT:
+ case ISD::SETULE:
+ Opc = PPC::EFDCMPGT;
+ break;
+ }
+ } else
+ Opc = PPCSubTarget->hasVSX() ? PPC::XSCMPUDP : PPC::FCMPUD;
} else {
- assert(LHS.getValueType() == MVT::f64 && "Unknown vt!");
- Opc = PPCSubTarget->hasVSX() ? PPC::XSCMPUDP : PPC::FCMPUD;
+ assert(LHS.getValueType() == MVT::f128 && "Unknown vt!");
+ assert(PPCSubTarget->hasVSX() && "__float128 requires VSX");
+ Opc = PPC::XSCMPUQP;
}
return SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i32, LHS, RHS), 0);
}
@@ -3765,7 +3946,7 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) {
// Altivec Vector compare instructions do not set any CR register by default and
// vector compare operations return the same type as the operands.
if (LHS.getValueType().isVector()) {
- if (PPCSubTarget->hasQPX())
+ if (PPCSubTarget->hasQPX() || PPCSubTarget->hasSPE())
return false;
EVT VecVT = LHS.getValueType();
@@ -3795,6 +3976,12 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) {
SDValue CCReg = SelectCC(LHS, RHS, CC, dl);
SDValue IntCR;
+  // SPE e*cmp* instructions only set the 'gt' bit, so hard-code that here.
+  // The correct compare instruction has already been chosen by SelectCC().
+ if (PPCSubTarget->hasSPE() && LHS.getValueType().isFloatingPoint()) {
+ Idx = 1;
+ }
+
// Force the ccreg into CR7.
SDValue CR7Reg = CurDAG->getRegister(PPC::CR7, MVT::i32);
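(For reference on the hard-coded Idx = 1 above, a sketch of the CR-field bit layout it relies on:

  bit 0 = LT, bit 1 = GT, bit 2 = EQ, bit 3 = SO/UN

so index 1 extracts exactly the 'gt' bit that the SPE compare chosen by SelectCC sets on a true comparison.)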
@@ -3830,20 +4017,28 @@ bool PPCDAGToDAGISel::isOffsetMultipleOf(SDNode *N, unsigned Val) const {
else if (STN)
AddrOp = STN->getOperand(2);
+  // If the address points to a frame object or a frame object with an offset,
+  // we need to check the object alignment.
short Imm = 0;
- if (AddrOp.getOpcode() == ISD::ADD) {
+ if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(
+ AddrOp.getOpcode() == ISD::ADD ? AddrOp.getOperand(0) :
+ AddrOp)) {
// If op0 is a frame index that is under aligned, we can't do it either,
// because it is translated to r31 or r1 + slot + offset. We won't know the
// slot number until the stack frame is finalized.
- if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(AddrOp.getOperand(0))) {
- const MachineFrameInfo &MFI = CurDAG->getMachineFunction().getFrameInfo();
- unsigned SlotAlign = MFI.getObjectAlignment(FI->getIndex());
- if ((SlotAlign % Val) != 0)
- return false;
- }
- return isIntS16Immediate(AddrOp.getOperand(1), Imm) && !(Imm % Val);
+ const MachineFrameInfo &MFI = CurDAG->getMachineFunction().getFrameInfo();
+ unsigned SlotAlign = MFI.getObjectAlignment(FI->getIndex());
+ if ((SlotAlign % Val) != 0)
+ return false;
+
+    // If we have an offset, we need a further check on the offset below.
+ if (AddrOp.getOpcode() != ISD::ADD)
+ return true;
}
+ if (AddrOp.getOpcode() == ISD::ADD)
+ return isIntS16Immediate(AddrOp.getOperand(1), Imm) && !(Imm % Val);
+
// If the address comes from the outside, the offset will be zero.
return AddrOp.getOpcode() == ISD::CopyFromReg;
}
@@ -3855,6 +4050,51 @@ void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) {
cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
}
+/// This method returns a node after flipping the MSB of each element
+/// of a vector integer type. Additionally, if SignBitVec is non-null,
+/// this method stores into SignBitVec a node whose elements have a one
+/// in the MSB and zeros in all other bits.
+MachineSDNode *
+PPCDAGToDAGISel::flipSignBit(const SDValue &N, SDNode **SignBitVec) {
+ SDLoc dl(N);
+ EVT VecVT = N.getValueType();
+ if (VecVT == MVT::v4i32) {
+ if (SignBitVec) {
+ SDNode *ZV = CurDAG->getMachineNode(PPC::V_SET0, dl, MVT::v4i32);
+ *SignBitVec = CurDAG->getMachineNode(PPC::XVNEGSP, dl, VecVT,
+ SDValue(ZV, 0));
+ }
+ return CurDAG->getMachineNode(PPC::XVNEGSP, dl, VecVT, N);
+ }
+ else if (VecVT == MVT::v8i16) {
+ SDNode *Hi = CurDAG->getMachineNode(PPC::LIS, dl, MVT::i32,
+ getI32Imm(0x8000, dl));
+ SDNode *ScaImm = CurDAG->getMachineNode(PPC::ORI, dl, MVT::i32,
+ SDValue(Hi, 0),
+ getI32Imm(0x8000, dl));
+ SDNode *VecImm = CurDAG->getMachineNode(PPC::MTVSRWS, dl, VecVT,
+ SDValue(ScaImm, 0));
+ /*
+     Alternatively, we can do this as follows to use the VRF instead of GPRs:
+       vspltish 5, 1
+       vspltish 6, 15
+       vslh 5, 5, 6
+ */
+ if (SignBitVec) *SignBitVec = VecImm;
+ return CurDAG->getMachineNode(PPC::VADDUHM, dl, VecVT, N,
+ SDValue(VecImm, 0));
+ }
+ else if (VecVT == MVT::v16i8) {
+ SDNode *VecImm = CurDAG->getMachineNode(PPC::XXSPLTIB, dl, MVT::i32,
+ getI32Imm(0x80, dl));
+ if (SignBitVec) *SignBitVec = VecImm;
+ return CurDAG->getMachineNode(PPC::VADDUBM, dl, VecVT, N,
+ SDValue(VecImm, 0));
+ }
+ else
+ llvm_unreachable("Unsupported vector data type for flipSignBit");
+}
+
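A quick check of the v8i16 constant built above (illustrative arithmetic only): LIS places 0x8000 in the upper halfword, giving 0x80000000; ORI with 0x8000 yields 0x80008000; MTVSRWS splats that word across the vector, so every i16 lane holds 0x8000; VADDUHM then adds 0x8000 to each element, which flips exactly the sign bit, the same effect XVNEGSP and XXSPLTIB(0x80) + VADDUBM achieve for the v4i32 and v16i8 cases.

  (0x8000 << 16) | 0x8000 = 0x80008000  -> splat ->  { 0x8000, 0x8000, ..., 0x8000 }   (8 x i16 sign-bit mask)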
// Select - Convert the specified operand from a target-independent to a
// target-specific node if it hasn't already been changed.
void PPCDAGToDAGISel::Select(SDNode *N) {
@@ -3894,6 +4134,27 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
return;
break;
+ case PPCISD::CALL: {
+ const Module *M = MF->getFunction().getParent();
+
+ if (PPCLowering->getPointerTy(CurDAG->getDataLayout()) != MVT::i32 ||
+ !PPCSubTarget->isSecurePlt() || !PPCSubTarget->isTargetELF() ||
+ M->getPICLevel() == PICLevel::SmallPIC)
+ break;
+
+ SDValue Op = N->getOperand(1);
+
+ if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
+ if (GA->getTargetFlags() == PPCII::MO_PLT)
+ getGlobalBaseReg();
+ }
+ else if (ExternalSymbolSDNode *ES = dyn_cast<ExternalSymbolSDNode>(Op)) {
+ if (ES->getTargetFlags() == PPCII::MO_PLT)
+ getGlobalBaseReg();
+ }
+ }
+ break;
+
case PPCISD::GlobalBaseReg:
ReplaceNode(N, getGlobalBaseReg());
return;
@@ -3939,14 +4200,28 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
}
}
+ case ISD::STORE: {
+ // Change TLS initial-exec D-form stores to X-form stores.
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ if (EnableTLSOpt && PPCSubTarget->isELFv2ABI() &&
+ ST->getAddressingMode() != ISD::PRE_INC)
+ if (tryTLSXFormStore(ST))
+ return;
+ break;
+ }
case ISD::LOAD: {
// Handle preincrement loads.
LoadSDNode *LD = cast<LoadSDNode>(N);
EVT LoadedVT = LD->getMemoryVT();
// Normal loads are handled by code generated from the .td file.
- if (LD->getAddressingMode() != ISD::PRE_INC)
+ if (LD->getAddressingMode() != ISD::PRE_INC) {
+ // Change TLS initial-exec D-form loads to X-form loads.
+ if (EnableTLSOpt && PPCSubTarget->isELFv2ABI())
+ if (tryTLSXFormLoad(LD))
+ return;
break;
+ }
SDValue Offset = LD->getOffset();
if (Offset.getOpcode() == ISD::TargetConstant ||
@@ -4338,16 +4613,24 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
SelectCCOp = PPC::SELECT_CC_I4;
else if (N->getValueType(0) == MVT::i64)
SelectCCOp = PPC::SELECT_CC_I8;
- else if (N->getValueType(0) == MVT::f32)
+ else if (N->getValueType(0) == MVT::f32) {
if (PPCSubTarget->hasP8Vector())
SelectCCOp = PPC::SELECT_CC_VSSRC;
+ else if (PPCSubTarget->hasSPE())
+ SelectCCOp = PPC::SELECT_CC_SPE4;
else
SelectCCOp = PPC::SELECT_CC_F4;
- else if (N->getValueType(0) == MVT::f64)
+ } else if (N->getValueType(0) == MVT::f64) {
if (PPCSubTarget->hasVSX())
SelectCCOp = PPC::SELECT_CC_VSFRC;
+ else if (PPCSubTarget->hasSPE())
+ SelectCCOp = PPC::SELECT_CC_SPE;
else
SelectCCOp = PPC::SELECT_CC_F8;
+ } else if (N->getValueType(0) == MVT::f128)
+ SelectCCOp = PPC::SELECT_CC_F16;
+ else if (PPCSubTarget->hasSPE())
+ SelectCCOp = PPC::SELECT_CC_SPE;
else if (PPCSubTarget->hasQPX() && N->getValueType(0) == MVT::v4f64)
SelectCCOp = PPC::SELECT_CC_QFRC;
else if (PPCSubTarget->hasQPX() && N->getValueType(0) == MVT::v4f32)
@@ -4633,6 +4916,55 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
return;
}
}
+ case ISD::ABS: {
+ assert(PPCSubTarget->hasP9Vector() && "ABS is supported with P9 Vector");
+
+  // For vector absolute difference, we use the VABSDU[BHW] instructions of
+  // POWER9. Since the VABSDU instructions operate on unsigned integers, we
+  // need an adjustment for signed integers.
+  // For abs(sub(a, b)), we generate VABSDUW(a+0x80000000, b+0x80000000);
+  // otherwise, abs(sub(-1, 0)) would return 0xFFFFFFFF(=-1) instead of 1.
+  // For abs(a), we generate VABSDUW(a+0x80000000, 0x80000000).
+ EVT VecVT = N->getOperand(0).getValueType();
+ SDNode *AbsOp = nullptr;
+ unsigned AbsOpcode;
+
+ if (VecVT == MVT::v4i32)
+ AbsOpcode = PPC::VABSDUW;
+ else if (VecVT == MVT::v8i16)
+ AbsOpcode = PPC::VABSDUH;
+ else if (VecVT == MVT::v16i8)
+ AbsOpcode = PPC::VABSDUB;
+ else
+ llvm_unreachable("Unsupported vector data type for ISD::ABS");
+
+  // Even for signed integers, we can skip the adjustment if all values are
+  // known to be positive (as signed integers) due to zero-extended inputs.
+ if (N->getOperand(0).getOpcode() == ISD::SUB &&
+ N->getOperand(0)->getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
+ N->getOperand(0)->getOperand(1).getOpcode() == ISD::ZERO_EXTEND) {
+ AbsOp = CurDAG->getMachineNode(AbsOpcode, dl, VecVT,
+ SDValue(N->getOperand(0)->getOperand(0)),
+ SDValue(N->getOperand(0)->getOperand(1)));
+ ReplaceNode(N, AbsOp);
+ return;
+ }
+ if (N->getOperand(0).getOpcode() == ISD::SUB) {
+ SDValue SubVal = N->getOperand(0);
+ SDNode *Op0 = flipSignBit(SubVal->getOperand(0));
+ SDNode *Op1 = flipSignBit(SubVal->getOperand(1));
+ AbsOp = CurDAG->getMachineNode(AbsOpcode, dl, VecVT,
+ SDValue(Op0, 0), SDValue(Op1, 0));
+ }
+ else {
+ SDNode *Op1 = nullptr;
+ SDNode *Op0 = flipSignBit(N->getOperand(0), &Op1);
+ AbsOp = CurDAG->getMachineNode(AbsOpcode, dl, VecVT, SDValue(Op0, 0),
+ SDValue(Op1, 0));
+ }
+ ReplaceNode(N, AbsOp);
+ return;
+ }
}
SelectCode(N);
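A worked example of the sign-bit bias used in the ISD::ABS case above (illustrative only, one v4i32 lane shown): for a = -1 and b = 0 the raw unsigned difference is wrong, while the bias-adjusted operands give the desired signed result, because flipping the MSB maps signed order onto unsigned order.

  a  = 0xFFFFFFFF (-1),  b  = 0x00000000 (0)    vabsduw(a , b )  = 0xFFFFFFFF    (not abs(a-b))
  a' = 0x7FFFFFFF,       b' = 0x80000000        vabsduw(a', b')  = 0x00000001    (= abs(a-b) = 1)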
@@ -4924,8 +5256,7 @@ void PPCDAGToDAGISel::foldBoolExts(SDValue &Res, SDNode *&N) {
}
void PPCDAGToDAGISel::PreprocessISelDAG() {
- SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
- ++Position;
+ SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
bool MadeChange = false;
while (Position != CurDAG->allnodes_begin()) {
@@ -4945,11 +5276,11 @@ void PPCDAGToDAGISel::PreprocessISelDAG() {
foldBoolExts(Res, N);
if (Res) {
- DEBUG(dbgs() << "PPC DAG preprocessing replacing:\nOld: ");
- DEBUG(N->dump(CurDAG));
- DEBUG(dbgs() << "\nNew: ");
- DEBUG(Res.getNode()->dump(CurDAG));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "PPC DAG preprocessing replacing:\nOld: ");
+ LLVM_DEBUG(N->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\nNew: ");
+ LLVM_DEBUG(Res.getNode()->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\n");
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
MadeChange = true;
@@ -5026,13 +5357,13 @@ void PPCDAGToDAGISel::SwapAllSelectUsers(SDNode *N) {
User->getOperand(2),
User->getOperand(1));
- DEBUG(dbgs() << "CR Peephole replacing:\nOld: ");
- DEBUG(User->dump(CurDAG));
- DEBUG(dbgs() << "\nNew: ");
- DEBUG(ResNode->dump(CurDAG));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "CR Peephole replacing:\nOld: ");
+ LLVM_DEBUG(User->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\nNew: ");
+ LLVM_DEBUG(ResNode->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\n");
- ReplaceUses(User, ResNode);
+ ReplaceUses(User, ResNode);
}
}
@@ -5083,6 +5414,8 @@ void PPCDAGToDAGISel::PeepholeCROps() {
case PPC::SELECT_QFRC:
case PPC::SELECT_QSRC:
case PPC::SELECT_QBRC:
+ case PPC::SELECT_SPE:
+ case PPC::SELECT_SPE4:
case PPC::SELECT_VRRC:
case PPC::SELECT_VSFRC:
case PPC::SELECT_VSSRC:
@@ -5402,6 +5735,8 @@ void PPCDAGToDAGISel::PeepholeCROps() {
case PPC::SELECT_QFRC:
case PPC::SELECT_QSRC:
case PPC::SELECT_QBRC:
+ case PPC::SELECT_SPE:
+ case PPC::SELECT_SPE4:
case PPC::SELECT_VRRC:
case PPC::SELECT_VSFRC:
case PPC::SELECT_VSSRC:
@@ -5440,11 +5775,11 @@ void PPCDAGToDAGISel::PeepholeCROps() {
SwapAllSelectUsers(MachineNode);
if (ResNode != MachineNode) {
- DEBUG(dbgs() << "CR Peephole replacing:\nOld: ");
- DEBUG(MachineNode->dump(CurDAG));
- DEBUG(dbgs() << "\nNew: ");
- DEBUG(ResNode->dump(CurDAG));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "CR Peephole replacing:\nOld: ");
+ LLVM_DEBUG(MachineNode->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\nNew: ");
+ LLVM_DEBUG(ResNode->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\n");
ReplaceUses(MachineNode, ResNode);
IsModified = true;
@@ -5613,8 +5948,7 @@ void PPCDAGToDAGISel::PeepholePPC64ZExt() {
// unnecessary. When that happens, we remove it here, and redefine the
// relevant 32-bit operation to be a 64-bit operation.
- SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
- ++Position;
+ SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
bool MadeChange = false;
while (Position != CurDAG->allnodes_begin()) {
@@ -5739,25 +6073,25 @@ void PPCDAGToDAGISel::PeepholePPC64ZExt() {
else
NewVTs.push_back(VTs.VTs[i]);
- DEBUG(dbgs() << "PPC64 ZExt Peephole morphing:\nOld: ");
- DEBUG(PN->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "PPC64 ZExt Peephole morphing:\nOld: ");
+ LLVM_DEBUG(PN->dump(CurDAG));
CurDAG->SelectNodeTo(PN, NewOpcode, CurDAG->getVTList(NewVTs), Ops);
- DEBUG(dbgs() << "\nNew: ");
- DEBUG(PN->dump(CurDAG));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "\nNew: ");
+ LLVM_DEBUG(PN->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\n");
}
// Now we replace the original zero extend and its associated INSERT_SUBREG
// with the value feeding the INSERT_SUBREG (which has now been promoted to
// return an i64).
- DEBUG(dbgs() << "PPC64 ZExt Peephole replacing:\nOld: ");
- DEBUG(N->dump(CurDAG));
- DEBUG(dbgs() << "\nNew: ");
- DEBUG(Op32.getNode()->dump(CurDAG));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "PPC64 ZExt Peephole replacing:\nOld: ");
+ LLVM_DEBUG(N->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\nNew: ");
+ LLVM_DEBUG(Op32.getNode()->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\n");
ReplaceUses(N, Op32.getNode());
}
@@ -5771,8 +6105,7 @@ void PPCDAGToDAGISel::PeepholePPC64() {
if (PPCSubTarget->isDarwin() || !PPCSubTarget->isPPC64())
return;
- SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
- ++Position;
+ SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
while (Position != CurDAG->allnodes_begin()) {
SDNode *N = &*--Position;
@@ -5782,28 +6115,37 @@ void PPCDAGToDAGISel::PeepholePPC64() {
unsigned FirstOp;
unsigned StorageOpcode = N->getMachineOpcode();
+ bool RequiresMod4Offset = false;
switch (StorageOpcode) {
default: continue;
+ case PPC::LWA:
+ case PPC::LD:
+ case PPC::DFLOADf64:
+ case PPC::DFLOADf32:
+ RequiresMod4Offset = true;
+ LLVM_FALLTHROUGH;
case PPC::LBZ:
case PPC::LBZ8:
- case PPC::LD:
case PPC::LFD:
case PPC::LFS:
case PPC::LHA:
case PPC::LHA8:
case PPC::LHZ:
case PPC::LHZ8:
- case PPC::LWA:
case PPC::LWZ:
case PPC::LWZ8:
FirstOp = 0;
break;
+ case PPC::STD:
+ case PPC::DFSTOREf64:
+ case PPC::DFSTOREf32:
+ RequiresMod4Offset = true;
+ LLVM_FALLTHROUGH;
case PPC::STB:
case PPC::STB8:
- case PPC::STD:
case PPC::STFD:
case PPC::STFS:
case PPC::STH:
@@ -5850,9 +6192,7 @@ void PPCDAGToDAGISel::PeepholePPC64() {
// For these cases, the immediate may not be divisible by 4, in
// which case the fold is illegal for DS-form instructions. (The
// other cases provide aligned addresses and are always safe.)
- if ((StorageOpcode == PPC::LWA ||
- StorageOpcode == PPC::LD ||
- StorageOpcode == PPC::STD) &&
+ if (RequiresMod4Offset &&
(!isa<ConstantSDNode>(Base.getOperand(1)) ||
Base.getConstantOperandVal(1) % 4 != 0))
continue;
@@ -5914,8 +6254,7 @@ void PPCDAGToDAGISel::PeepholePPC64() {
if (auto *C = dyn_cast<ConstantSDNode>(ImmOpnd)) {
Offset += C->getSExtValue();
- if ((StorageOpcode == PPC::LWA || StorageOpcode == PPC::LD ||
- StorageOpcode == PPC::STD) && (Offset % 4) != 0)
+ if (RequiresMod4Offset && (Offset % 4) != 0)
continue;
if (!isInt<16>(Offset))
@@ -5932,11 +6271,11 @@ void PPCDAGToDAGISel::PeepholePPC64() {
// immediate and substitute them into the load or store. If
// needed, update the target flags for the immediate operand to
// reflect the necessary relocation information.
- DEBUG(dbgs() << "Folding add-immediate into mem-op:\nBase: ");
- DEBUG(Base->dump(CurDAG));
- DEBUG(dbgs() << "\nN: ");
- DEBUG(N->dump(CurDAG));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Folding add-immediate into mem-op:\nBase: ");
+ LLVM_DEBUG(Base->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\nN: ");
+ LLVM_DEBUG(N->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\n");
// If the relocation information isn't already present on the
// immediate operand, add it now.
@@ -5947,9 +6286,8 @@ void PPCDAGToDAGISel::PeepholePPC64() {
// We can't perform this optimization for data whose alignment
// is insufficient for the instruction encoding.
if (GV->getAlignment() < 4 &&
- (StorageOpcode == PPC::LD || StorageOpcode == PPC::STD ||
- StorageOpcode == PPC::LWA || (Offset % 4) != 0)) {
- DEBUG(dbgs() << "Rejected this candidate for alignment.\n\n");
+ (RequiresMod4Offset || (Offset % 4) != 0)) {
+ LLVM_DEBUG(dbgs() << "Rejected this candidate for alignment.\n\n");
continue;
}
ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, Offset, Flags);
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index cea59de3e8a9..1e3e14c71144 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -47,7 +47,6 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -83,6 +82,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -111,6 +111,9 @@ cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);
static cl::opt<bool> DisableSCO("disable-ppc-sco",
cl::desc("disable sibling call optimization on ppc"), cl::Hidden);
+static cl::opt<bool> EnableQuadPrecision("enable-ppc-quad-precision",
+cl::desc("enable quad precision float support on ppc"), cl::Hidden);
+
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumSiblingCalls, "Number of sibling calls");
@@ -134,14 +137,22 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
// Set up the register classes.
addRegisterClass(MVT::i32, &PPC::GPRCRegClass);
if (!useSoftFloat()) {
- addRegisterClass(MVT::f32, &PPC::F4RCRegClass);
- addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
+ if (hasSPE()) {
+ addRegisterClass(MVT::f32, &PPC::SPE4RCRegClass);
+ addRegisterClass(MVT::f64, &PPC::SPERCRegClass);
+ } else {
+ addRegisterClass(MVT::f32, &PPC::F4RCRegClass);
+ addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
+ }
}
// Match BITREVERSE to customized fast code sequence in the td file.
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
+  // Sub-word ATOMIC_CMP_SWAP needs to ensure that the input is zero-extended.
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
+
// PowerPC has an i16 but no i8 (or i1) SEXTLOAD.
for (MVT VT : MVT::integer_valuetypes()) {
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
@@ -156,15 +167,26 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setIndexedLoadAction(ISD::PRE_INC, MVT::i16, Legal);
setIndexedLoadAction(ISD::PRE_INC, MVT::i32, Legal);
setIndexedLoadAction(ISD::PRE_INC, MVT::i64, Legal);
- setIndexedLoadAction(ISD::PRE_INC, MVT::f32, Legal);
- setIndexedLoadAction(ISD::PRE_INC, MVT::f64, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::i1, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::i8, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::i16, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal);
- setIndexedStoreAction(ISD::PRE_INC, MVT::f32, Legal);
- setIndexedStoreAction(ISD::PRE_INC, MVT::f64, Legal);
+ if (!Subtarget.hasSPE()) {
+ setIndexedLoadAction(ISD::PRE_INC, MVT::f32, Legal);
+ setIndexedLoadAction(ISD::PRE_INC, MVT::f64, Legal);
+ setIndexedStoreAction(ISD::PRE_INC, MVT::f32, Legal);
+ setIndexedStoreAction(ISD::PRE_INC, MVT::f64, Legal);
+ }
+
+ // PowerPC uses ADDC/ADDE/SUBC/SUBE to propagate carry.
+ const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
+ for (MVT VT : ScalarIntVTs) {
+ setOperationAction(ISD::ADDC, VT, Legal);
+ setOperationAction(ISD::ADDE, VT, Legal);
+ setOperationAction(ISD::SUBC, VT, Legal);
+ setOperationAction(ISD::SUBE, VT, Legal);
+ }
if (Subtarget.useCRBits()) {
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
@@ -198,9 +220,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
addRegisterClass(MVT::i1, &PPC::CRBITRCRegClass);
}
- // This is used in the ppcf128->int sequence. Note it has different semantics
- // from FP_ROUND: that rounds to nearest, this rounds to zero.
- setOperationAction(ISD::FP_ROUND_INREG, MVT::ppcf128, Custom);
+ // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
+ // PPC (the libcall is not available).
+ setOperationAction(ISD::FP_TO_SINT, MVT::ppcf128, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::ppcf128, Custom);
// We do not currently implement these libm ops for PowerPC.
setOperationAction(ISD::FFLOOR, MVT::ppcf128, Expand);
@@ -250,13 +273,18 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
setOperationAction(ISD::FREM , MVT::f64, Expand);
setOperationAction(ISD::FPOW , MVT::f64, Expand);
- setOperationAction(ISD::FMA , MVT::f64, Legal);
setOperationAction(ISD::FSIN , MVT::f32, Expand);
setOperationAction(ISD::FCOS , MVT::f32, Expand);
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
setOperationAction(ISD::FREM , MVT::f32, Expand);
setOperationAction(ISD::FPOW , MVT::f32, Expand);
- setOperationAction(ISD::FMA , MVT::f32, Legal);
+ if (Subtarget.hasSPE()) {
+ setOperationAction(ISD::FMA , MVT::f64, Expand);
+ setOperationAction(ISD::FMA , MVT::f32, Expand);
+ } else {
+ setOperationAction(ISD::FMA , MVT::f64, Legal);
+ setOperationAction(ISD::FMA , MVT::f32, Legal);
+ }
setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
@@ -293,7 +321,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
// PowerPC does not have BSWAP, but we can use vector BSWAP instruction xxbrd
// to speed up scalar BSWAP64.
- // CTPOP or CTTZ were introduced in P8/P9 respectivelly
+ // CTPOP or CTTZ were introduced in P8/P9 respectively
setOperationAction(ISD::BSWAP, MVT::i32 , Expand);
if (Subtarget.isISA3_0()) {
setOperationAction(ISD::BSWAP, MVT::i64 , Custom);
@@ -339,12 +367,19 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
- // PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.
- setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+ if (Subtarget.hasSPE()) {
+ // SPE has built-in conversions
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
+ } else {
+ // PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
- // PowerPC does not have [U|S]INT_TO_FP
- setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
- setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
+ // PowerPC does not have [U|S]INT_TO_FP
+ setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
+ }
if (Subtarget.hasDirectMove() && isPPC64) {
setOperationAction(ISD::BITCAST, MVT::f32, Legal);
@@ -442,6 +477,12 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
// Comparisons that require checking two conditions.
+ if (Subtarget.hasSPE()) {
+ setCondCodeAction(ISD::SETO, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETO, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETUO, MVT::f64, Expand);
+ }
setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
@@ -469,7 +510,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
} else {
// PowerPC does not have FP_TO_UINT on 32-bit implementations.
- setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
+ if (Subtarget.hasSPE())
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
+ else
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
}
// With the instructions enabled under FPCVT, we can do everything.
@@ -782,6 +826,46 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::SHL, MVT::v1i128, Legal);
setOperationAction(ISD::SRL, MVT::v1i128, Legal);
setOperationAction(ISD::SRA, MVT::v1i128, Expand);
+
+ if (EnableQuadPrecision) {
+ addRegisterClass(MVT::f128, &PPC::VRRCRegClass);
+ setOperationAction(ISD::FADD, MVT::f128, Legal);
+ setOperationAction(ISD::FSUB, MVT::f128, Legal);
+ setOperationAction(ISD::FDIV, MVT::f128, Legal);
+ setOperationAction(ISD::FMUL, MVT::f128, Legal);
+ setOperationAction(ISD::FP_EXTEND, MVT::f128, Legal);
+ // No extending loads to f128 on PPC.
+ for (MVT FPT : MVT::fp_valuetypes())
+ setLoadExtAction(ISD::EXTLOAD, MVT::f128, FPT, Expand);
+ setOperationAction(ISD::FMA, MVT::f128, Legal);
+ setCondCodeAction(ISD::SETULT, MVT::f128, Expand);
+ setCondCodeAction(ISD::SETUGT, MVT::f128, Expand);
+ setCondCodeAction(ISD::SETUEQ, MVT::f128, Expand);
+ setCondCodeAction(ISD::SETOGE, MVT::f128, Expand);
+ setCondCodeAction(ISD::SETOLE, MVT::f128, Expand);
+ setCondCodeAction(ISD::SETONE, MVT::f128, Expand);
+
+ setOperationAction(ISD::FTRUNC, MVT::f128, Legal);
+ setOperationAction(ISD::FRINT, MVT::f128, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::f128, Legal);
+ setOperationAction(ISD::FCEIL, MVT::f128, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::f128, Legal);
+ setOperationAction(ISD::FROUND, MVT::f128, Legal);
+
+ setOperationAction(ISD::SELECT, MVT::f128, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::f64, Legal);
+ setOperationAction(ISD::FP_ROUND, MVT::f32, Legal);
+ setTruncStoreAction(MVT::f128, MVT::f64, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f32, Expand);
+ setOperationAction(ISD::BITCAST, MVT::i128, Custom);
+ // No implementation for these ops for PowerPC.
+ setOperationAction(ISD::FSIN , MVT::f128, Expand);
+ setOperationAction(ISD::FCOS , MVT::f128, Expand);
+ setOperationAction(ISD::FPOW, MVT::f128, Expand);
+ setOperationAction(ISD::FPOWI, MVT::f128, Expand);
+ setOperationAction(ISD::FREM, MVT::f128, Expand);
+ }
+
}
if (Subtarget.hasP9Altivec()) {
@@ -1018,6 +1102,21 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setLibcallName(RTLIB::EXP2_PPCF128, "exp2l$LDBL128");
}
+ if (EnableQuadPrecision) {
+ setLibcallName(RTLIB::LOG_F128, "logf128");
+ setLibcallName(RTLIB::LOG2_F128, "log2f128");
+ setLibcallName(RTLIB::LOG10_F128, "log10f128");
+ setLibcallName(RTLIB::EXP_F128, "expf128");
+ setLibcallName(RTLIB::EXP2_F128, "exp2f128");
+ setLibcallName(RTLIB::SIN_F128, "sinf128");
+ setLibcallName(RTLIB::COS_F128, "cosf128");
+ setLibcallName(RTLIB::POW_F128, "powf128");
+ setLibcallName(RTLIB::FMIN_F128, "fminf128");
+ setLibcallName(RTLIB::FMAX_F128, "fmaxf128");
+ setLibcallName(RTLIB::POWI_F128, "__powikf2");
+ setLibcallName(RTLIB::REM_F128, "fmodf128");
+ }
+
// With 32 condition bits, we don't need to sink (and duplicate) compares
// aggressively in CodeGenPrep.
if (Subtarget.useCRBits()) {
@@ -1033,6 +1132,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
default: break;
case PPC::DIR_970:
case PPC::DIR_A2:
+ case PPC::DIR_E500:
case PPC::DIR_E500mc:
case PPC::DIR_E5500:
case PPC::DIR_PWR4:
@@ -1123,10 +1223,28 @@ unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty,
return Align;
}
+unsigned PPCTargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
+ EVT VT) const {
+ if (Subtarget.hasSPE() && VT == MVT::f64)
+ return 2;
+ return PPCTargetLowering::getNumRegisters(Context, VT);
+}
+
+MVT PPCTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
+ EVT VT) const {
+ if (Subtarget.hasSPE() && VT == MVT::f64)
+ return MVT::i32;
+ return PPCTargetLowering::getRegisterType(Context, VT);
+}
+
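A small sketch of what the two overrides above imply for argument lowering (values taken straight from the code; everything else keeps the default path):

  // With Subtarget.hasSPE():
  //   getNumRegistersForCallingConv(Ctx, MVT::f64)  -> 2
  //   getRegisterTypeForCallingConv(Ctx, MVT::f64)  -> MVT::i32
  // i.e. an f64 argument is described to the calling convention as a pair of
  // 32-bit pieces carried in GPRs; all other types fall back to
  // getNumRegisters / getRegisterType.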
bool PPCTargetLowering::useSoftFloat() const {
return Subtarget.useSoftFloat();
}
+bool PPCTargetLowering::hasSPE() const {
+ return Subtarget.hasSPE();
+}
+
const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((PPCISD::NodeType)Opcode) {
case PPCISD::FIRST_NUMBER: break;
@@ -1139,6 +1257,10 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::FCTIWZ: return "PPCISD::FCTIWZ";
case PPCISD::FCTIDUZ: return "PPCISD::FCTIDUZ";
case PPCISD::FCTIWUZ: return "PPCISD::FCTIWUZ";
+ case PPCISD::FP_TO_UINT_IN_VSR:
+    return "PPCISD::FP_TO_UINT_IN_VSR";
+ case PPCISD::FP_TO_SINT_IN_VSR:
+ return "PPCISD::FP_TO_SINT_IN_VSR";
case PPCISD::FRE: return "PPCISD::FRE";
case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE";
case PPCISD::STFIWX: return "PPCISD::STFIWX";
@@ -1154,6 +1276,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::Hi: return "PPCISD::Hi";
case PPCISD::Lo: return "PPCISD::Lo";
case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY";
+ case PPCISD::ATOMIC_CMP_SWAP_8: return "PPCISD::ATOMIC_CMP_SWAP_8";
+ case PPCISD::ATOMIC_CMP_SWAP_16: return "PPCISD::ATOMIC_CMP_SWAP_16";
case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC";
case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET";
case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg";
@@ -1190,6 +1314,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::SExtVElems: return "PPCISD::SExtVElems";
case PPCISD::LXVD2X: return "PPCISD::LXVD2X";
case PPCISD::STXVD2X: return "PPCISD::STXVD2X";
+ case PPCISD::ST_VSR_SCAL_INT:
+ return "PPCISD::ST_VSR_SCAL_INT";
case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH";
case PPCISD::BDNZ: return "PPCISD::BDNZ";
case PPCISD::BDZ: return "PPCISD::BDZ";
@@ -1226,6 +1352,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::QVESPLATI: return "PPCISD::QVESPLATI";
case PPCISD::QBFLT: return "PPCISD::QBFLT";
case PPCISD::QVLFSb: return "PPCISD::QVLFSb";
+ case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128";
}
return nullptr;
}
@@ -1456,7 +1583,7 @@ bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
}
/**
- * \brief Common function used to match vmrgew and vmrgow shuffles
+ * Common function used to match vmrgew and vmrgow shuffles
*
* The indexOffset determines whether to look for even or odd words in
* the shuffle mask. This is based on the endianness of the target
@@ -1513,7 +1640,7 @@ static bool isVMerge(ShuffleVectorSDNode *N, unsigned IndexOffset,
}
/**
- * \brief Determine if the specified shuffle mask is suitable for the vmrgew or
+ * Determine if the specified shuffle mask is suitable for the vmrgew or
* vmrgow instructions.
*
* \param[in] N The shuffle vector SD Node to analyze
@@ -2545,10 +2672,11 @@ SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
// 64-bit SVR4 ABI code is always position-independent.
// The actual BlockAddress is stored in the TOC.
- if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
- setUsesTOCBasePtr(DAG);
+ if (Subtarget.isSVR4ABI() && isPositionIndependent()) {
+ if (Subtarget.isPPC64())
+ setUsesTOCBasePtr(DAG);
SDValue GA = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset());
- return getTOCEntry(DAG, SDLoc(BASDN), true, GA);
+ return getTOCEntry(DAG, SDLoc(BASDN), Subtarget.isPPC64(), GA);
}
unsigned MOHiFlag, MOLoFlag;
@@ -2566,7 +2694,7 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
// large models could be added if users need it, at the cost of
// additional complexity.
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
- if (DAG.getTarget().Options.EmulatedTLS)
+ if (DAG.getTarget().useEmulatedTLS())
return LowerToTLSEmulatedModel(GA, DAG);
SDLoc dl(GA);
@@ -3111,7 +3239,7 @@ static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
- ArgVT == MVT::v1i128)
+ ArgVT == MVT::v1i128 || ArgVT == MVT::f128)
Align = 16;
// QPX vector types stored in double-precision are padded to a 32 byte
// boundary.
@@ -3191,7 +3319,7 @@ static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT,
if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
- ArgVT == MVT::v1i128)
+ ArgVT == MVT::v1i128 || ArgVT == MVT::f128)
if (AvailableVRs > 0) {
--AvailableVRs;
return false;
@@ -3280,7 +3408,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
// Reserve space for the linkage area on the stack.
unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
CCInfo.AllocateStack(LinkageSize, PtrByteSize);
- if (useSoftFloat())
+ if (useSoftFloat() || hasSPE())
CCInfo.PreAnalyzeFormalArguments(Ins);
CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4);
@@ -3304,12 +3432,16 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
case MVT::f32:
if (Subtarget.hasP8Vector())
RC = &PPC::VSSRCRegClass;
+ else if (Subtarget.hasSPE())
+ RC = &PPC::SPE4RCRegClass;
else
RC = &PPC::F4RCRegClass;
break;
case MVT::f64:
if (Subtarget.hasVSX())
RC = &PPC::VSFRCRegClass;
+ else if (Subtarget.hasSPE())
+ RC = &PPC::SPERCRegClass;
else
RC = &PPC::F8RCRegClass;
break;
@@ -3398,7 +3530,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
};
unsigned NumFPArgRegs = array_lengthof(FPArgRegs);
- if (useSoftFloat())
+ if (useSoftFloat() || hasSPE())
NumFPArgRegs = 0;
FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs));
@@ -3780,23 +3912,23 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
case MVT::v2f64:
case MVT::v2i64:
case MVT::v1i128:
+ case MVT::f128:
if (!Subtarget.hasQPX()) {
- // These can be scalar arguments or elements of a vector array type
- // passed directly. The latter are used to implement ELFv2 homogenous
- // vector aggregates.
- if (VR_idx != Num_VR_Regs) {
- unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
- ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
- ++VR_idx;
- } else {
- if (CallConv == CallingConv::Fast)
- ComputeArgOffset();
-
- needsLoad = true;
- }
- if (CallConv != CallingConv::Fast || needsLoad)
- ArgOffset += 16;
- break;
+ // These can be scalar arguments or elements of a vector array type
+ // passed directly. The latter are used to implement ELFv2 homogenous
+ // vector aggregates.
+ if (VR_idx != Num_VR_Regs) {
+ unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
+ ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
+ ++VR_idx;
+ } else {
+ if (CallConv == CallingConv::Fast)
+ ComputeArgOffset();
+ needsLoad = true;
+ }
+ if (CallConv != CallingConv::Fast || needsLoad)
+ ArgOffset += 16;
+ break;
} // not QPX
assert(ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 &&
@@ -4258,7 +4390,7 @@ static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
PPCFunctionInfo *FI = DAG.getMachineFunction().getInfo<PPCFunctionInfo>();
unsigned CallerMinReservedArea = FI->getMinReservedArea();
int SPDiff = (int)CallerMinReservedArea - (int)ParamSize;
- // Remember only if the new adjustement is bigger.
+ // Remember only if the new adjustment is bigger.
if (SPDiff < FI->getTailCallSPDelta())
FI->setTailCallSPDelta(SPDiff);
@@ -4397,13 +4529,18 @@ hasSameArgumentList(const Function *CallerFn, ImmutableCallSite CS) {
static bool
areCallingConvEligibleForTCO_64SVR4(CallingConv::ID CallerCC,
CallingConv::ID CalleeCC) {
- // Tail or Sibling call optimization (TCO/SCO) needs callee and caller to
- // have the same calling convention.
- if (CallerCC != CalleeCC)
+ // Tail calls are possible with fastcc and ccc.
+ auto isTailCallableCC = [] (CallingConv::ID CC){
+ return CC == CallingConv::C || CC == CallingConv::Fast;
+ };
+ if (!isTailCallableCC(CallerCC) || !isTailCallableCC(CalleeCC))
return false;
- // Tail or Sibling calls can be done with fastcc/ccc.
- return (CallerCC == CallingConv::Fast || CallerCC == CallingConv::C);
+  // We can safely tail call both fastcc and ccc callees from a C calling
+ // convention caller. If the caller is fastcc, we may have less stack space
+ // than a non-fastcc caller with the same signature so disable tail-calls in
+ // that case.
+ return CallerCC == CallingConv::C || CallerCC == CalleeCC;
}
bool
@@ -4434,10 +4571,28 @@ PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
  // A callee with any byval parameter is not supported either.
  // Note: This is a quick workaround, because in some cases, e.g.
// caller's stack size > callee's stack size, we are still able to apply
- // sibling call optimization. See: https://reviews.llvm.org/D23441#513574
+ // sibling call optimization. For example, gcc is able to do SCO for caller1
+ // in the following example, but not for caller2.
+ // struct test {
+ // long int a;
+ // char ary[56];
+ // } gTest;
+ // __attribute__((noinline)) int callee(struct test v, struct test *b) {
+ // b->a = v.a;
+ // return 0;
+ // }
+ // void caller1(struct test a, struct test c, struct test *b) {
+ // callee(gTest, b); }
+ // void caller2(struct test *b) { callee(gTest, b); }
if (any_of(Outs, [](const ISD::OutputArg& OA) { return OA.Flags.isByVal(); }))
return false;
+ // If callee and caller use different calling conventions, we cannot pass
+ // parameters on stack since offsets for the parameter area may be different.
+ if (Caller.getCallingConv() != CalleeCC &&
+ needStackSlotPassParameters(Subtarget, Outs))
+ return false;
+
  // No TCO/SCO on an indirect call because the caller has to restore its TOC
if (!isFunctionGlobalAddress(Callee) &&
!isa<ExternalSymbolSDNode>(Callee))
@@ -4911,7 +5066,11 @@ SDValue PPCTargetLowering::LowerCallResult(
SmallVector<CCValAssign, 16> RVLocs;
CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
- CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC);
+
+ CCRetInfo.AnalyzeCallResult(
+ Ins, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
+ ? RetCC_PPC_Cold
+ : RetCC_PPC);
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
@@ -5080,15 +5239,15 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
assert(isa<GlobalAddressSDNode>(Callee) &&
"Callee should be an llvm::Function object.");
- DEBUG(
- const GlobalValue *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
- const unsigned Width = 80 - strlen("TCO caller: ")
- - strlen(", callee linkage: 0, 0");
- dbgs() << "TCO caller: "
- << left_justify(DAG.getMachineFunction().getName(), Width)
- << ", callee linkage: "
- << GV->getVisibility() << ", " << GV->getLinkage() << "\n"
- );
+ LLVM_DEBUG(
+ const GlobalValue *GV =
+ cast<GlobalAddressSDNode>(Callee)->getGlobal();
+ const unsigned Width =
+ 80 - strlen("TCO caller: ") - strlen(", callee linkage: 0, 0");
+ dbgs() << "TCO caller: "
+ << left_justify(DAG.getMachineFunction().getName(), Width)
+ << ", callee linkage: " << GV->getVisibility() << ", "
+ << GV->getLinkage() << "\n");
}
}
@@ -5131,6 +5290,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
// of the 32-bit SVR4 ABI stack frame layout.
assert((CallConv == CallingConv::C ||
+ CallConv == CallingConv::Cold ||
CallConv == CallingConv::Fast) && "Unknown calling convention!");
unsigned PtrByteSize = 4;
@@ -5434,6 +5594,11 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
// arguments that will be in registers.
unsigned NumGPRsUsed = 0, NumFPRsUsed = 0, NumVRsUsed = 0;
+ // Avoid allocating parameter area for fastcc functions if all the arguments
+ // can be passed in the registers.
+ if (CallConv == CallingConv::Fast)
+ HasParameterArea = false;
+
// Add up all the space actually used.
for (unsigned i = 0; i != NumOps; ++i) {
ISD::ArgFlagsTy Flags = Outs[i].Flags;
@@ -5444,9 +5609,11 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
continue;
if (CallConv == CallingConv::Fast) {
- if (Flags.isByVal())
+ if (Flags.isByVal()) {
NumGPRsUsed += (Flags.getByValSize()+7)/8;
- else
+ if (NumGPRsUsed > NumGPRs)
+ HasParameterArea = true;
+ } else {
switch (ArgVT.getSimpleVT().SimpleTy) {
default: llvm_unreachable("Unexpected ValueType for argument!");
case MVT::i1:
@@ -5461,6 +5628,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
case MVT::v2f64:
case MVT::v2i64:
case MVT::v1i128:
+ case MVT::f128:
if (++NumVRsUsed <= NumVRs)
continue;
break;
@@ -5483,6 +5651,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
continue;
break;
}
+ HasParameterArea = true;
+ }
}
/* Respect alignment of argument on the stack. */
@@ -5839,6 +6009,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
case MVT::v2f64:
case MVT::v2i64:
case MVT::v1i128:
+ case MVT::f128:
if (!Subtarget.hasQPX()) {
// These can be scalar arguments or elements of a vector array type
// passed directly. The latter are used to implement ELFv2 homogenous
@@ -6392,7 +6563,10 @@ PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
LLVMContext &Context) const {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
- return CCInfo.CheckReturn(Outs, RetCC_PPC);
+ return CCInfo.CheckReturn(
+ Outs, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
+ ? RetCC_PPC_Cold
+ : RetCC_PPC);
}
SDValue
@@ -6404,7 +6578,10 @@ PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
- CCInfo.AnalyzeReturn(Outs, RetCC_PPC);
+ CCInfo.AnalyzeReturn(Outs,
+ (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
+ ? RetCC_PPC_Cold
+ : RetCC_PPC);
SDValue Flag;
SmallVector<SDValue, 4> RetOps(1, Chain);
@@ -6824,7 +7001,7 @@ void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
RLI.MPI = MPI;
}
-/// \brief Custom lowers floating point to integer conversions to use
+/// Custom lowers floating point to integer conversions to use
/// the direct move instructions available in ISA 2.07 to avoid the
/// need for load/store combinations.
SDValue PPCTargetLowering::LowerFP_TO_INTDirectMove(SDValue Op,
@@ -6861,6 +7038,51 @@ SDValue PPCTargetLowering::LowerFP_TO_INTDirectMove(SDValue Op,
SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
const SDLoc &dl) const {
+
+ // FP to INT conversions are legal for f128.
+ if (EnableQuadPrecision && (Op->getOperand(0).getValueType() == MVT::f128))
+ return Op;
+
+ // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
+ // PPC (the libcall is not available).
+ if (Op.getOperand(0).getValueType() == MVT::ppcf128) {
+ if (Op.getValueType() == MVT::i32) {
+ if (Op.getOpcode() == ISD::FP_TO_SINT) {
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
+ MVT::f64, Op.getOperand(0),
+ DAG.getIntPtrConstant(0, dl));
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
+ MVT::f64, Op.getOperand(0),
+ DAG.getIntPtrConstant(1, dl));
+
+ // Add the two halves of the long double in round-to-zero mode.
+ SDValue Res = DAG.getNode(PPCISD::FADDRTZ, dl, MVT::f64, Lo, Hi);
+
+ // Now use a smaller FP_TO_SINT.
+ return DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Res);
+ }
+ if (Op.getOpcode() == ISD::FP_TO_UINT) {
+ const uint64_t TwoE31[] = {0x41e0000000000000LL, 0};
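+        // TwoE31 encodes 2^31 as a ppc_fp128 (double-double) constant: the
+        // leading double is 0x41e0000000000000, the IEEE-754 encoding of
+        // 2^31, and the trailing double is zero.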
+ APFloat APF = APFloat(APFloat::PPCDoubleDouble(), APInt(128, TwoE31));
+ SDValue Tmp = DAG.getConstantFP(APF, dl, MVT::ppcf128);
+ // X>=2^31 ? (int)(X-2^31)+0x80000000 : (int)X
+ // FIXME: generated code sucks.
+ // TODO: Are there fast-math-flags to propagate to this FSUB?
+ SDValue True = DAG.getNode(ISD::FSUB, dl, MVT::ppcf128,
+ Op.getOperand(0), Tmp);
+ True = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, True);
+ True = DAG.getNode(ISD::ADD, dl, MVT::i32, True,
+ DAG.getConstant(0x80000000, dl, MVT::i32));
+ SDValue False = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32,
+ Op.getOperand(0));
+ return DAG.getSelectCC(dl, Op.getOperand(0), Tmp, True, False,
+ ISD::SETGE);
+ }
+ }
+
+ return SDValue();
+ }
+
if (Subtarget.hasDirectMove() && Subtarget.isPPC64())
return LowerFP_TO_INTDirectMove(Op, DAG, dl);
@@ -6942,7 +7164,7 @@ void PPCTargetLowering::spliceIntoChain(SDValue ResChain,
DAG.UpdateNodeOperands(TF.getNode(), ResChain, NewResChain);
}
-/// \brief Analyze profitability of direct move
+/// Analyze profitability of direct move
/// prefer float load to int load plus direct move
/// when there is no integer use of int load
bool PPCTargetLowering::directMoveIsProfitable(const SDValue &Op) const {
@@ -6972,7 +7194,7 @@ bool PPCTargetLowering::directMoveIsProfitable(const SDValue &Op) const {
return false;
}
-/// \brief Custom lowers integer to floating point conversions to use
+/// Custom lowers integer to floating point conversions to use
/// the direct move instructions available in ISA 2.07 to avoid the
/// need for load/store combinations.
SDValue PPCTargetLowering::LowerINT_TO_FPDirectMove(SDValue Op,
@@ -7008,6 +7230,10 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
+ // Conversions to f128 are legal.
+ if (EnableQuadPrecision && (Op.getValueType() == MVT::f128))
+ return Op;
+
if (Subtarget.hasQPX() && Op.getOperand(0).getValueType() == MVT::v4i1) {
if (Op.getValueType() != MVT::v4f32 && Op.getValueType() != MVT::v4f64)
return SDValue();
@@ -7524,6 +7750,23 @@ static bool haveEfficientBuildVectorPattern(BuildVectorSDNode *V,
return !(IsSplat && IsLoad);
}
+// Lower BITCAST(f128, (build_pair i64, i64)) to BUILD_FP128.
+SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
+
+ SDLoc dl(Op);
+ SDValue Op0 = Op->getOperand(0);
+
+ if (!EnableQuadPrecision ||
+      (Op.getValueType() != MVT::f128) ||
+ (Op0.getOpcode() != ISD::BUILD_PAIR) ||
+ (Op0.getOperand(0).getValueType() != MVT::i64) ||
+ (Op0.getOperand(1).getValueType() != MVT::i64))
+ return SDValue();
+
+ return DAG.getNode(PPCISD::BUILD_FP128, dl, MVT::f128, Op0.getOperand(0),
+ Op0.getOperand(1));
+}
+
// If this is a case we can't handle, return null and let the default
// expansion code take care of it. If we CAN select this case, and if it
// selects to a single instruction, return Op. Otherwise, if we can codegen
@@ -8811,6 +9054,42 @@ SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const {
return Op;
}
+// ATOMIC_CMP_SWAP for i8/i16 needs to zero-extend its input since it will be
+// compared to a value that is atomically loaded (atomic loads zero-extend).
+SDValue PPCTargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getOpcode() == ISD::ATOMIC_CMP_SWAP &&
+ "Expecting an atomic compare-and-swap here.");
+ SDLoc dl(Op);
+ auto *AtomicNode = cast<AtomicSDNode>(Op.getNode());
+ EVT MemVT = AtomicNode->getMemoryVT();
+ if (MemVT.getSizeInBits() >= 32)
+ return Op;
+
+ SDValue CmpOp = Op.getOperand(2);
+ // If this is already correctly zero-extended, leave it alone.
+ auto HighBits = APInt::getHighBitsSet(32, 32 - MemVT.getSizeInBits());
+ if (DAG.MaskedValueIsZero(CmpOp, HighBits))
+ return Op;
+
+ // Clear the high bits of the compare operand.
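+  // (The mask is 0xFF for an i8 operation and 0xFFFF for an i16 one.)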
+ unsigned MaskVal = (1 << MemVT.getSizeInBits()) - 1;
+ SDValue NewCmpOp =
+ DAG.getNode(ISD::AND, dl, MVT::i32, CmpOp,
+ DAG.getConstant(MaskVal, dl, MVT::i32));
+
+ // Replace the existing compare operand with the properly zero-extended one.
+ SmallVector<SDValue, 4> Ops;
+ for (int i = 0, e = AtomicNode->getNumOperands(); i < e; i++)
+ Ops.push_back(AtomicNode->getOperand(i));
+ Ops[2] = NewCmpOp;
+ MachineMemOperand *MMO = AtomicNode->getMemOperand();
+ SDVTList Tys = DAG.getVTList(MVT::i32, MVT::Other);
+ auto NodeTy =
+ (MemVT == MVT::i8) ? PPCISD::ATOMIC_CMP_SWAP_8 : PPCISD::ATOMIC_CMP_SWAP_16;
+ return DAG.getMemIntrinsicNode(NodeTy, dl, Tys, Ops, MemVT, MMO);
+}
+
SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
@@ -9238,27 +9517,19 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SETCC: return LowerSETCC(Op, DAG);
case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
- case ISD::VASTART:
- return LowerVASTART(Op, DAG);
-
- case ISD::VAARG:
- return LowerVAARG(Op, DAG);
-
- case ISD::VACOPY:
- return LowerVACOPY(Op, DAG);
- case ISD::STACKRESTORE:
- return LowerSTACKRESTORE(Op, DAG);
-
- case ISD::DYNAMIC_STACKALLOC:
- return LowerDYNAMIC_STACKALLOC(Op, DAG);
+ // Variable argument lowering.
+ case ISD::VASTART: return LowerVASTART(Op, DAG);
+ case ISD::VAARG: return LowerVAARG(Op, DAG);
+ case ISD::VACOPY: return LowerVACOPY(Op, DAG);
+ case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG);
+ case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
case ISD::GET_DYNAMIC_AREA_OFFSET:
return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG);
- case ISD::EH_DWARF_CFA:
- return LowerEH_DWARF_CFA(Op, DAG);
-
+ // Exception handling lowering.
+ case ISD::EH_DWARF_CFA: return LowerEH_DWARF_CFA(Op, DAG);
case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
@@ -9267,8 +9538,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
case ISD::FP_TO_UINT:
- case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG,
- SDLoc(Op));
+ case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, SDLoc(Op));
case ISD::UINT_TO_FP:
case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
@@ -9291,6 +9561,8 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
// For counter-based loop handling.
case ISD::INTRINSIC_W_CHAIN: return SDValue();
+ case ISD::BITCAST: return LowerBITCAST(Op, DAG);
+
// Frame & Return address.
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
@@ -9302,6 +9574,8 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerREM(Op, DAG);
case ISD::BSWAP:
return LowerBSWAP(Op, DAG);
+ case ISD::ATOMIC_CMP_SWAP:
+ return LowerATOMIC_CMP_SWAP(Op, DAG);
}
}
@@ -9334,7 +9608,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0),
N->getOperand(1));
- Results.push_back(NewInt);
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewInt));
Results.push_back(NewInt.getValue(1));
break;
}
@@ -9352,25 +9626,6 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
}
return;
}
- case ISD::FP_ROUND_INREG: {
- assert(N->getValueType(0) == MVT::ppcf128);
- assert(N->getOperand(0).getValueType() == MVT::ppcf128);
- SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
- MVT::f64, N->getOperand(0),
- DAG.getIntPtrConstant(0, dl));
- SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
- MVT::f64, N->getOperand(0),
- DAG.getIntPtrConstant(1, dl));
-
- // Add the two halves of the long double in round-to-zero mode.
- SDValue FPreg = DAG.getNode(PPCISD::FADDRTZ, dl, MVT::f64, Lo, Hi);
-
- // We know the low half is about to be thrown away, so just use something
- // convenient.
- Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::ppcf128,
- FPreg, FPreg));
- return;
- }
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
// LowerFP_TO_INT() can only handle f32 and f64.
@@ -10017,6 +10272,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.getOpcode() == PPC::SELECT_CC_I8 ||
MI.getOpcode() == PPC::SELECT_CC_F4 ||
MI.getOpcode() == PPC::SELECT_CC_F8 ||
+ MI.getOpcode() == PPC::SELECT_CC_F16 ||
MI.getOpcode() == PPC::SELECT_CC_QFRC ||
MI.getOpcode() == PPC::SELECT_CC_QSRC ||
MI.getOpcode() == PPC::SELECT_CC_QBRC ||
@@ -10024,13 +10280,18 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.getOpcode() == PPC::SELECT_CC_VSFRC ||
MI.getOpcode() == PPC::SELECT_CC_VSSRC ||
MI.getOpcode() == PPC::SELECT_CC_VSRC ||
+ MI.getOpcode() == PPC::SELECT_CC_SPE4 ||
+ MI.getOpcode() == PPC::SELECT_CC_SPE ||
MI.getOpcode() == PPC::SELECT_I4 ||
MI.getOpcode() == PPC::SELECT_I8 ||
MI.getOpcode() == PPC::SELECT_F4 ||
MI.getOpcode() == PPC::SELECT_F8 ||
+ MI.getOpcode() == PPC::SELECT_F16 ||
MI.getOpcode() == PPC::SELECT_QFRC ||
MI.getOpcode() == PPC::SELECT_QSRC ||
MI.getOpcode() == PPC::SELECT_QBRC ||
+ MI.getOpcode() == PPC::SELECT_SPE ||
+ MI.getOpcode() == PPC::SELECT_SPE4 ||
MI.getOpcode() == PPC::SELECT_VRRC ||
MI.getOpcode() == PPC::SELECT_VSFRC ||
MI.getOpcode() == PPC::SELECT_VSSRC ||
@@ -10063,6 +10324,9 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
if (MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8 ||
MI.getOpcode() == PPC::SELECT_F4 || MI.getOpcode() == PPC::SELECT_F8 ||
+ MI.getOpcode() == PPC::SELECT_F16 ||
+ MI.getOpcode() == PPC::SELECT_SPE4 ||
+ MI.getOpcode() == PPC::SELECT_SPE ||
MI.getOpcode() == PPC::SELECT_QFRC ||
MI.getOpcode() == PPC::SELECT_QSRC ||
MI.getOpcode() == PPC::SELECT_QBRC ||
@@ -10615,6 +10879,7 @@ unsigned PPCTargetLowering::combineRepeatedFPDivisors() const {
return 3;
case PPC::DIR_440:
case PPC::DIR_A2:
+ case PPC::DIR_E500:
case PPC::DIR_E500mc:
case PPC::DIR_E5500:
return 2;
@@ -10896,7 +11161,7 @@ SDValue PPCTargetLowering::ConvertSETCCToSubtract(SDNode *N,
// Size of integers being compared has a critical role in the following
// analysis, so we prefer to do this when all types are legal.
- if (!DCI.isAfterLegalizeVectorOps())
+ if (!DCI.isAfterLegalizeDAG())
return SDValue();
// If all users of SETCC extend its value to a legal integer type
@@ -11494,7 +11759,7 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
ShiftCst);
}
-/// \brief Reduces the number of fp-to-int conversion when building a vector.
+/// Reduces the number of fp-to-int conversions when building a vector.
///
/// If this vector is built out of floating to integer conversions,
/// transform it to a vector built out of floating point values followed by a
@@ -11574,7 +11839,7 @@ combineElementTruncationToVectorTruncation(SDNode *N,
return SDValue();
}
-/// \brief Reduce the number of loads when building a vector.
+/// Reduce the number of loads when building a vector.
///
/// Building a vector out of multiple loads can be converted to a load
/// of the vector type if the loads are consecutive. If the loads are
@@ -11882,10 +12147,12 @@ SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
SDLoc dl(N);
SDValue Op(N, 0);
- // Don't handle ppc_fp128 here or i1 conversions.
+  // Don't handle ppc_fp128 here, or integer source types that the hardware
+  // conversions cannot handle directly (i1, or anything wider than i64).
if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
return SDValue();
- if (Op.getOperand(0).getValueType() == MVT::i1)
+ if (Op.getOperand(0).getValueType().getSimpleVT() <= MVT(MVT::i1) ||
+ Op.getOperand(0).getValueType().getSimpleVT() > MVT(MVT::i64))
return SDValue();
SDValue FirstOperand(Op.getOperand(0));
@@ -12105,6 +12372,64 @@ SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N,
return Store;
}
+// Handle DAG combine for STORE (FP_TO_INT F).
+SDValue PPCTargetLowering::combineStoreFPToInt(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc dl(N);
+ unsigned Opcode = N->getOperand(1).getOpcode();
+
+ assert((Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT)
+ && "Not a FP_TO_INT Instruction!");
+
+ SDValue Val = N->getOperand(1).getOperand(0);
+ EVT Op1VT = N->getOperand(1).getValueType();
+ EVT ResVT = Val.getValueType();
+
+ // Floating point types smaller than 32 bits are not legal on Power.
+ if (ResVT.getScalarSizeInBits() < 32)
+ return SDValue();
+
+ // Only perform combine for conversion to i64/i32 or power9 i16/i8.
+ bool ValidTypeForStoreFltAsInt =
+ (Op1VT == MVT::i32 || Op1VT == MVT::i64 ||
+ (Subtarget.hasP9Vector() && (Op1VT == MVT::i16 || Op1VT == MVT::i8)));
+
+ if (ResVT == MVT::ppcf128 || !Subtarget.hasP8Altivec() ||
+ cast<StoreSDNode>(N)->isTruncatingStore() || !ValidTypeForStoreFltAsInt)
+ return SDValue();
+
+ // Extend f32 values to f64
+ if (ResVT.getScalarSizeInBits() == 32) {
+ Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val);
+ DCI.AddToWorklist(Val.getNode());
+ }
+
+ // Set signed or unsigned conversion opcode.
+ unsigned ConvOpcode = (Opcode == ISD::FP_TO_SINT) ?
+ PPCISD::FP_TO_SINT_IN_VSR :
+ PPCISD::FP_TO_UINT_IN_VSR;
+
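+  // The converted integer is produced in a VSX register, so the node keeps
+  // the FP container type: f128 for quad-precision sources, f64 otherwise.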
+ Val = DAG.getNode(ConvOpcode,
+ dl, ResVT == MVT::f128 ? MVT::f128 : MVT::f64, Val);
+ DCI.AddToWorklist(Val.getNode());
+
+ // Set number of bytes being converted.
+ unsigned ByteSize = Op1VT.getScalarSizeInBits() / 8;
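+  // Memory intrinsic operands: chain, converted value, store address, width
+  // of the store in bytes, and the original integer value type.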
+ SDValue Ops[] = { N->getOperand(0), Val, N->getOperand(2),
+ DAG.getIntPtrConstant(ByteSize, dl, false),
+ DAG.getValueType(Op1VT) };
+
+ Val = DAG.getMemIntrinsicNode(PPCISD::ST_VSR_SCAL_INT, dl,
+ DAG.getVTList(MVT::Other), Ops,
+ cast<StoreSDNode>(N)->getMemoryVT(),
+ cast<StoreSDNode>(N)->getMemOperand());
+
+ DCI.AddToWorklist(Val.getNode());
+ return Val;
+}
+
SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -12144,60 +12469,27 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::UINT_TO_FP:
return combineFPToIntToFP(N, DCI);
case ISD::STORE: {
+
EVT Op1VT = N->getOperand(1).getValueType();
- bool ValidTypeForStoreFltAsInt = (Op1VT == MVT::i32) ||
- (Subtarget.hasP9Vector() && (Op1VT == MVT::i8 || Op1VT == MVT::i16));
-
- // Turn STORE (FP_TO_SINT F) -> STFIWX(FCTIWZ(F)).
- if (Subtarget.hasSTFIWX() && !cast<StoreSDNode>(N)->isTruncatingStore() &&
- N->getOperand(1).getOpcode() == ISD::FP_TO_SINT &&
- ValidTypeForStoreFltAsInt &&
- N->getOperand(1).getOperand(0).getValueType() != MVT::ppcf128) {
- SDValue Val = N->getOperand(1).getOperand(0);
- if (Val.getValueType() == MVT::f32) {
- Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val);
- DCI.AddToWorklist(Val.getNode());
- }
- Val = DAG.getNode(PPCISD::FCTIWZ, dl, MVT::f64, Val);
- DCI.AddToWorklist(Val.getNode());
-
- if (Op1VT == MVT::i32) {
- SDValue Ops[] = {
- N->getOperand(0), Val, N->getOperand(2),
- DAG.getValueType(N->getOperand(1).getValueType())
- };
-
- Val = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl,
- DAG.getVTList(MVT::Other), Ops,
- cast<StoreSDNode>(N)->getMemoryVT(),
- cast<StoreSDNode>(N)->getMemOperand());
- } else {
- unsigned WidthInBytes =
- N->getOperand(1).getValueType() == MVT::i8 ? 1 : 2;
- SDValue WidthConst = DAG.getIntPtrConstant(WidthInBytes, dl, false);
-
- SDValue Ops[] = {
- N->getOperand(0), Val, N->getOperand(2), WidthConst,
- DAG.getValueType(N->getOperand(1).getValueType())
- };
- Val = DAG.getMemIntrinsicNode(PPCISD::STXSIX, dl,
- DAG.getVTList(MVT::Other), Ops,
- cast<StoreSDNode>(N)->getMemoryVT(),
- cast<StoreSDNode>(N)->getMemOperand());
- }
+ unsigned Opcode = N->getOperand(1).getOpcode();
- DCI.AddToWorklist(Val.getNode());
- return Val;
+ if (Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT) {
+      SDValue Val = combineStoreFPToInt(N, DCI);
+ if (Val)
+ return Val;
}
// Turn STORE (BSWAP) -> sthbrx/stwbrx.
- if (cast<StoreSDNode>(N)->isUnindexed() &&
- N->getOperand(1).getOpcode() == ISD::BSWAP &&
+ if (cast<StoreSDNode>(N)->isUnindexed() && Opcode == ISD::BSWAP &&
N->getOperand(1).getNode()->hasOneUse() &&
- (N->getOperand(1).getValueType() == MVT::i32 ||
- N->getOperand(1).getValueType() == MVT::i16 ||
- (Subtarget.hasLDBRX() && Subtarget.isPPC64() &&
- N->getOperand(1).getValueType() == MVT::i64))) {
+ (Op1VT == MVT::i32 || Op1VT == MVT::i16 ||
+ (Subtarget.hasLDBRX() && Subtarget.isPPC64() && Op1VT == MVT::i64))) {
+
+ // STBRX can only handle simple types.
+ EVT mVT = cast<StoreSDNode>(N)->getMemoryVT();
+ if (mVT.isExtended())
+ break;
+
SDValue BSwapOp = N->getOperand(1).getOperand(0);
// Do an any-extend to 32-bits if this is a half-word input.
if (BSwapOp.getValueType() == MVT::i16)
@@ -12205,7 +12497,6 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
      // If the type of the BSWAP operand is wider than the stored memory width
      // it needs to be shifted right before the STBRX.
- EVT mVT = cast<StoreSDNode>(N)->getMemoryVT();
if (Op1VT.bitsGT(mVT)) {
int Shift = Op1VT.getSizeInBits() - mVT.getSizeInBits();
BSwapOp = DAG.getNode(ISD::SRL, dl, Op1VT, BSwapOp,
@@ -12226,9 +12517,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
// STORE Constant:i32<0> -> STORE<trunc to i32> Constant:i64<0>
// So it can increase the chance of CSE constant construction.
- EVT VT = N->getOperand(1).getValueType();
if (Subtarget.isPPC64() && !DCI.isBeforeLegalize() &&
- isa<ConstantSDNode>(N->getOperand(1)) && VT == MVT::i32) {
+ isa<ConstantSDNode>(N->getOperand(1)) && Op1VT == MVT::i32) {
      // Need to sign-extend to 64 bits to handle negative values.
EVT MemVT = cast<StoreSDNode>(N)->getMemoryVT();
uint64_t Val64 = SignExtend64(N->getConstantOperandVal(1),
@@ -12246,8 +12536,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
// For little endian, VSX stores require generating xxswapd/lxvd2x.
// Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
- if (VT.isSimple()) {
- MVT StoreVT = VT.getSimpleVT();
+ if (Op1VT.isSimple()) {
+ MVT StoreVT = Op1VT.getSimpleVT();
if (Subtarget.needsSwapsForVSXMemOps() &&
(StoreVT == MVT::v2f64 || StoreVT == MVT::v2i64 ||
StoreVT == MVT::v4f32 || StoreVT == MVT::v4i32))
@@ -13030,14 +13320,21 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// really care overly much here so just give them all the same reg classes.
case 'd':
case 'f':
- if (VT == MVT::f32 || VT == MVT::i32)
- return std::make_pair(0U, &PPC::F4RCRegClass);
- if (VT == MVT::f64 || VT == MVT::i64)
- return std::make_pair(0U, &PPC::F8RCRegClass);
- if (VT == MVT::v4f64 && Subtarget.hasQPX())
- return std::make_pair(0U, &PPC::QFRCRegClass);
- if (VT == MVT::v4f32 && Subtarget.hasQPX())
- return std::make_pair(0U, &PPC::QSRCRegClass);
+ if (Subtarget.hasSPE()) {
+ if (VT == MVT::f32 || VT == MVT::i32)
+ return std::make_pair(0U, &PPC::SPE4RCRegClass);
+ if (VT == MVT::f64 || VT == MVT::i64)
+ return std::make_pair(0U, &PPC::SPERCRegClass);
+ } else {
+ if (VT == MVT::f32 || VT == MVT::i32)
+ return std::make_pair(0U, &PPC::F4RCRegClass);
+ if (VT == MVT::f64 || VT == MVT::i64)
+ return std::make_pair(0U, &PPC::F8RCRegClass);
+ if (VT == MVT::v4f64 && Subtarget.hasQPX())
+ return std::make_pair(0U, &PPC::QFRCRegClass);
+ if (VT == MVT::v4f32 && Subtarget.hasQPX())
+ return std::make_pair(0U, &PPC::QSRCRegClass);
+ }
break;
case 'v':
if (VT == MVT::v4f64 && Subtarget.hasQPX())
@@ -13520,7 +13817,7 @@ EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size,
return MVT::i32;
}
-/// \brief Returns true if it is beneficial to convert a load of a constant
+/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
@@ -13569,6 +13866,9 @@ bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
bool PPCTargetLowering::isFPExtFree(EVT DestVT, EVT SrcVT) const {
assert(DestVT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
"invalid fpext types");
+ // Extending to float128 is not free.
+ if (DestVT == MVT::f128)
+ return false;
return true;
}
@@ -13625,6 +13925,8 @@ bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
case MVT::f32:
case MVT::f64:
return true;
+ case MVT::f128:
+ return (EnableQuadPrecision && Subtarget.hasP9Vector());
default:
break;
}
@@ -13853,3 +14155,20 @@ bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
// If the function is local then we have a good chance at tail-calling it
return getTargetMachine().shouldAssumeDSOLocal(*Caller->getParent(), Callee);
}
+
+bool PPCTargetLowering::
+isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const {
+ const Value *Mask = AndI.getOperand(1);
+ // If the mask is suitable for andi. or andis. we should sink the and.
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(Mask)) {
+ // Can't handle constants wider than 64-bits.
+ if (CI->getBitWidth() > 64)
+ return false;
+ int64_t ConstVal = CI->getZExtValue();
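+    // andi. takes a 16-bit unsigned immediate that masks the low 16 bits;
+    // andis. applies the immediate to bits 16-31 instead, so the mask's low
+    // 16 bits must be zero for it to apply.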
+ return isUInt<16>(ConstVal) ||
+ (isUInt<16>(ConstVal >> 16) && !(ConstVal & 0xFFFF));
+ }
+
+ // For non-constant masks, we can always use the record-form and.
+ return true;
+}
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index b119e5b4a564..9b8d6435515b 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -20,7 +20,6 @@
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetLowering.h"
@@ -31,6 +30,7 @@
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Type.h"
+#include "llvm/Support/MachineValueType.h"
#include <utility>
namespace llvm {
@@ -71,6 +71,9 @@ namespace llvm {
/// unsigned integers with round toward zero.
FCTIDUZ, FCTIWUZ,
+    /// Floating-point-to-integer conversion instructions.
+ FP_TO_UINT_IN_VSR, FP_TO_SINT_IN_VSR,
+
/// VEXTS, ByteWidth - takes an input in VSFRC and produces an output in
/// VSFRC that is sign-extended from ByteWidth to a 64-byte integer.
VEXTS,
@@ -186,6 +189,9 @@ namespace llvm {
/// Direct move from a GPR to a VSX register (zero)
MTVSRZ,
+    /// Direct move of 2 consecutive GPRs to a VSX register.
+ BUILD_FP128,
+
/// Extract a subvector from signed integer vector and convert to FP.
/// It is primarily used to convert a (widened) illegal integer vector
/// type to a legal floating point vector type.
@@ -426,10 +432,18 @@ namespace llvm {
/// an xxswapd.
STXVD2X,
+ /// Store scalar integers from VSR.
+ ST_VSR_SCAL_INT,
+
/// QBRC, CHAIN = QVLFSb CHAIN, Ptr
/// The 4xf32 load used for v4i1 constants.
QVLFSb,
+ /// ATOMIC_CMP_SWAP - the exact same as the target-independent nodes
+ /// except they ensure that the compare input is zero-extended for
+ /// sub-word versions because the atomic loads zero-extend.
+ ATOMIC_CMP_SWAP_8, ATOMIC_CMP_SWAP_16,
+
/// GPRC = TOC_ENTRY GA, TOC
/// Loads the entry for GA from the TOC, where the TOC base is given by
/// the last operand.
@@ -560,6 +574,8 @@ namespace llvm {
bool useSoftFloat() const override;
+ bool hasSPE() const;
+
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
return MVT::i32;
}
@@ -760,7 +776,7 @@ namespace llvm {
bool isFPExtFree(EVT DestVT, EVT SrcVT) const override;
- /// \brief Returns true if it is beneficial to convert a load of a constant
+ /// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
@@ -817,7 +833,7 @@ namespace llvm {
FastISel *createFastISel(FunctionLoweringInfo &FuncInfo,
const TargetLibraryInfo *LibInfo) const override;
- /// \brief Returns true if an argument of type Ty needs to be passed in a
+ /// Returns true if an argument of type Ty needs to be passed in a
/// contiguous block of registers in calling convention CallConv.
bool functionArgumentNeedsConsecutiveRegisters(
Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override {
@@ -855,6 +871,12 @@ namespace llvm {
unsigned JTI,
MCContext &Ctx) const override;
+ unsigned getNumRegistersForCallingConv(LLVMContext &Context,
+ EVT VT) const override;
+
+ MVT getRegisterTypeForCallingConv(LLVMContext &Context,
+ EVT VT) const override;
+
private:
struct ReuseLoadInfo {
SDValue Ptr;
@@ -879,6 +901,11 @@ namespace llvm {
}
};
+ bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
+ // Addrspacecasts are always noops.
+ return true;
+ }
+
bool canReuseLoadAddress(SDValue Op, EVT MemVT, ReuseLoadInfo &RLI,
SelectionDAG &DAG,
ISD::LoadExtType ET = ISD::NON_EXTLOAD) const;
@@ -955,6 +982,7 @@ namespace llvm {
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerREM(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBSWAP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
@@ -1048,10 +1076,12 @@ namespace llvm {
SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
SDValue DAGCombineExtBoolTrunc(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue DAGCombineBuildVector(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineStoreFPToInt(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineFPToIntToFP(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSHL(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSRA(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -1090,6 +1120,7 @@ namespace llvm {
// tail call. This will cause the optimizers to attempt to move, or
// duplicate return instructions to help enable tail call optimizations.
bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
+ bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
}; // end class PPCTargetLowering
namespace PPC {
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index fdd28c2ff03f..cdd57c6a1118 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -244,8 +244,8 @@ let usesCustomInserter = 1 in {
// Instructions to support atomic operations
let mayLoad = 1, hasSideEffects = 0 in {
-def LDARX : XForm_1<31, 84, (outs g8rc:$rD), (ins memrr:$ptr),
- "ldarx $rD, $ptr", IIC_LdStLDARX, []>;
+def LDARX : XForm_1_memOp<31, 84, (outs g8rc:$rD), (ins memrr:$ptr),
+ "ldarx $rD, $ptr", IIC_LdStLDARX, []>;
// Instruction to support lock versions of atomics
// (EH=1 - see Power ISA 2.07 Book II 4.4.2)
@@ -259,8 +259,8 @@ def LDAT : X_RD5_RS5_IM5<31, 614, (outs g8rc:$rD), (ins g8rc:$rA, u5imm:$FC),
}
let Defs = [CR0], mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
-def STDCX : XForm_1<31, 214, (outs), (ins g8rc:$rS, memrr:$dst),
- "stdcx. $rS, $dst", IIC_LdStSTDCX, []>, isDOT;
+def STDCX : XForm_1_memOp<31, 214, (outs), (ins g8rc:$rS, memrr:$dst),
+ "stdcx. $rS, $dst", IIC_LdStSTDCX, []>, isDOT;
let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
def STDAT : X_RD5_RS5_IM5<31, 742, (outs), (ins g8rc:$rS, g8rc:$rA, u5imm:$FC),
@@ -499,7 +499,49 @@ defm ADD8 : XOForm_1r<31, 266, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
def ADD8TLS : XOForm_1<31, 266, 0, (outs g8rc:$rT), (ins g8rc_nox0:$rA, tlsreg:$rB),
"add $rT, $rA, $rB", IIC_IntSimple,
[(set i64:$rT, (add i64:$rA, tglobaltlsaddr:$rB))]>;
-
+let mayLoad = 1 in {
+def LBZXTLS : XForm_1<31, 87, (outs g8rc:$rD), (ins ptr_rc_nor0:$rA, tlsreg:$rB),
+ "lbzx $rD, $rA, $rB", IIC_LdStLoad, []>;
+def LHZXTLS : XForm_1<31, 279, (outs g8rc:$rD), (ins ptr_rc_nor0:$rA, tlsreg:$rB),
+ "lhzx $rD, $rA, $rB", IIC_LdStLoad, []>;
+def LWZXTLS : XForm_1<31, 23, (outs g8rc:$rD), (ins ptr_rc_nor0:$rA, tlsreg:$rB),
+ "lwzx $rD, $rA, $rB", IIC_LdStLoad, []>;
+def LDXTLS : XForm_1<31, 21, (outs g8rc:$rD), (ins ptr_rc_nor0:$rA, tlsreg:$rB),
+ "ldx $rD, $rA, $rB", IIC_LdStLD, []>, isPPC64;
+def LBZXTLS_32 : XForm_1<31, 87, (outs gprc:$rD), (ins ptr_rc_nor0:$rA, tlsreg:$rB),
+ "lbzx $rD, $rA, $rB", IIC_LdStLoad, []>;
+def LHZXTLS_32 : XForm_1<31, 279, (outs gprc:$rD), (ins ptr_rc_nor0:$rA, tlsreg:$rB),
+ "lhzx $rD, $rA, $rB", IIC_LdStLoad, []>;
+def LWZXTLS_32 : XForm_1<31, 23, (outs gprc:$rD), (ins ptr_rc_nor0:$rA, tlsreg:$rB),
+ "lwzx $rD, $rA, $rB", IIC_LdStLoad, []>;
+
+}
+
+let mayStore = 1 in {
+def STBXTLS : XForm_8<31, 215, (outs), (ins g8rc:$rS, ptr_rc_nor0:$rA, tlsreg:$rB),
+ "stbx $rS, $rA, $rB", IIC_LdStStore, []>,
+ PPC970_DGroup_Cracked;
+def STHXTLS : XForm_8<31, 407, (outs), (ins g8rc:$rS, ptr_rc_nor0:$rA, tlsreg:$rB),
+ "sthx $rS, $rA, $rB", IIC_LdStStore, []>,
+ PPC970_DGroup_Cracked;
+def STWXTLS : XForm_8<31, 151, (outs), (ins g8rc:$rS, ptr_rc_nor0:$rA, tlsreg:$rB),
+ "stwx $rS, $rA, $rB", IIC_LdStStore, []>,
+ PPC970_DGroup_Cracked;
+def STDXTLS : XForm_8<31, 149, (outs), (ins g8rc:$rS, ptr_rc_nor0:$rA, tlsreg:$rB),
+ "stdx $rS, $rA, $rB", IIC_LdStSTD, []>, isPPC64,
+ PPC970_DGroup_Cracked;
+def STBXTLS_32 : XForm_8<31, 215, (outs), (ins gprc:$rS, ptr_rc_nor0:$rA, tlsreg:$rB),
+ "stbx $rS, $rA, $rB", IIC_LdStStore, []>,
+ PPC970_DGroup_Cracked;
+def STHXTLS_32 : XForm_8<31, 407, (outs), (ins gprc:$rS, ptr_rc_nor0:$rA, tlsreg:$rB),
+ "sthx $rS, $rA, $rB", IIC_LdStStore, []>,
+ PPC970_DGroup_Cracked;
+def STWXTLS_32 : XForm_8<31, 151, (outs), (ins gprc:$rS, ptr_rc_nor0:$rA, tlsreg:$rB),
+ "stwx $rS, $rA, $rB", IIC_LdStStore, []>,
+ PPC970_DGroup_Cracked;
+
+}
+
let isCommutable = 1 in
defm ADDC8 : XOForm_1rc<31, 10, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
"addc", "$rT, $rA, $rB", IIC_IntGeneral,
@@ -558,10 +600,37 @@ defm SUBFZE8 : XOForm_3rc<31, 200, 0, (outs g8rc:$rT), (ins g8rc:$rA),
// FIXME: Duplicating this for the asm parser should be unnecessary, but the
// previous definition must be marked as CodeGen only to prevent decoding
// conflicts.
-let isAsmParserOnly = 1 in
+let isAsmParserOnly = 1 in {
def ADD8TLS_ : XOForm_1<31, 266, 0, (outs g8rc:$rT), (ins g8rc:$rA, tlsreg:$rB),
"add $rT, $rA, $rB", IIC_IntSimple, []>;
+let mayLoad = 1 in {
+def LBZXTLS_ : XForm_1<31, 87, (outs g8rc:$rD), (ins ptr_rc_nor0:$rA, tlsreg:$rB),
+ "lbzx $rD, $rA, $rB", IIC_LdStLoad, []>;
+def LHZXTLS_ : XForm_1<31, 279, (outs g8rc:$rD), (ins ptr_rc_nor0:$rA, tlsreg:$rB),
+ "lhzx $rD, $rA, $rB", IIC_LdStLoad, []>;
+def LWZXTLS_ : XForm_1<31, 23, (outs g8rc:$rD), (ins ptr_rc_nor0:$rA, tlsreg:$rB),
+ "lwzx $rD, $rA, $rB", IIC_LdStLoad, []>;
+def LDXTLS_ : XForm_1<31, 21, (outs g8rc:$rD), (ins ptr_rc_nor0:$rA, tlsreg:$rB),
+ "ldx $rD, $rA, $rB", IIC_LdStLD, []>, isPPC64;
+}
+
+let mayStore = 1 in {
+def STBXTLS_ : XForm_8<31, 215, (outs), (ins g8rc:$rS, ptr_rc_nor0:$rA, tlsreg:$rB),
+ "stbx $rS, $rA, $rB", IIC_LdStStore, []>,
+ PPC970_DGroup_Cracked;
+def STHXTLS_ : XForm_8<31, 407, (outs), (ins g8rc:$rS, ptr_rc_nor0:$rA, tlsreg:$rB),
+ "sthx $rS, $rA, $rB", IIC_LdStStore, []>,
+ PPC970_DGroup_Cracked;
+def STWXTLS_ : XForm_8<31, 151, (outs), (ins g8rc:$rS, ptr_rc_nor0:$rA, tlsreg:$rB),
+ "stwx $rS, $rA, $rB", IIC_LdStStore, []>,
+ PPC970_DGroup_Cracked;
+def STDXTLS_ : XForm_8<31, 149, (outs), (ins g8rc:$rS, ptr_rc_nor0:$rA, tlsreg:$rB),
+ "stdx $rS, $rA, $rB", IIC_LdStSTD, []>, isPPC64,
+ PPC970_DGroup_Cracked;
+}
+}
+
let isCommutable = 1 in {
defm MULHD : XOForm_1r<31, 73, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
"mulhd", "$rT, $rA, $rB", IIC_IntMulHW,
@@ -837,22 +906,22 @@ def LWA : DSForm_1<58, 2, (outs g8rc:$rD), (ins memrix:$src),
(aligned4sextloadi32 ixaddr:$src))]>, isPPC64,
PPC970_DGroup_Cracked;
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
-def LHAX8: XForm_1<31, 343, (outs g8rc:$rD), (ins memrr:$src),
- "lhax $rD, $src", IIC_LdStLHA,
- [(set i64:$rD, (sextloadi16 xaddr:$src))]>,
- PPC970_DGroup_Cracked;
-def LWAX : XForm_1<31, 341, (outs g8rc:$rD), (ins memrr:$src),
- "lwax $rD, $src", IIC_LdStLHA,
- [(set i64:$rD, (sextloadi32 xaddr:$src))]>, isPPC64,
- PPC970_DGroup_Cracked;
+def LHAX8: XForm_1_memOp<31, 343, (outs g8rc:$rD), (ins memrr:$src),
+ "lhax $rD, $src", IIC_LdStLHA,
+ [(set i64:$rD, (sextloadi16 xaddr:$src))]>,
+ PPC970_DGroup_Cracked;
+def LWAX : XForm_1_memOp<31, 341, (outs g8rc:$rD), (ins memrr:$src),
+ "lwax $rD, $src", IIC_LdStLHA,
+ [(set i64:$rD, (sextloadi32 xaddr:$src))]>, isPPC64,
+ PPC970_DGroup_Cracked;
// For fast-isel:
let isCodeGenOnly = 1, mayLoad = 1 in {
def LWA_32 : DSForm_1<58, 2, (outs gprc:$rD), (ins memrix:$src),
"lwa $rD, $src", IIC_LdStLWA, []>, isPPC64,
PPC970_DGroup_Cracked;
-def LWAX_32 : XForm_1<31, 341, (outs gprc:$rD), (ins memrr:$src),
- "lwax $rD, $src", IIC_LdStLHA, []>, isPPC64,
- PPC970_DGroup_Cracked;
+def LWAX_32 : XForm_1_memOp<31, 341, (outs gprc:$rD), (ins memrr:$src),
+ "lwax $rD, $src", IIC_LdStLHA, []>, isPPC64,
+ PPC970_DGroup_Cracked;
} // end fast-isel isCodeGenOnly
// Update forms.
@@ -866,16 +935,16 @@ def LHAU8 : DForm_1<43, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
// NO LWAU!
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
-def LHAUX8 : XForm_1<31, 375, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
- (ins memrr:$addr),
- "lhaux $rD, $addr", IIC_LdStLHAUX,
- []>, RegConstraint<"$addr.ptrreg = $ea_result">,
- NoEncode<"$ea_result">;
-def LWAUX : XForm_1<31, 373, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
- (ins memrr:$addr),
- "lwaux $rD, $addr", IIC_LdStLHAUX,
- []>, RegConstraint<"$addr.ptrreg = $ea_result">,
- NoEncode<"$ea_result">, isPPC64;
+def LHAUX8 : XForm_1_memOp<31, 375, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memrr:$addr),
+ "lhaux $rD, $addr", IIC_LdStLHAUX,
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+ NoEncode<"$ea_result">;
+def LWAUX : XForm_1_memOp<31, 373, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memrr:$addr),
+ "lwaux $rD, $addr", IIC_LdStLHAUX,
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+ NoEncode<"$ea_result">, isPPC64;
}
}
@@ -892,47 +961,50 @@ def LWZ8 : DForm_1<32, (outs g8rc:$rD), (ins memri:$src),
"lwz $rD, $src", IIC_LdStLoad,
[(set i64:$rD, (zextloadi32 iaddr:$src))]>, isPPC64;
-def LBZX8 : XForm_1<31, 87, (outs g8rc:$rD), (ins memrr:$src),
- "lbzx $rD, $src", IIC_LdStLoad,
- [(set i64:$rD, (zextloadi8 xaddr:$src))]>;
-def LHZX8 : XForm_1<31, 279, (outs g8rc:$rD), (ins memrr:$src),
- "lhzx $rD, $src", IIC_LdStLoad,
- [(set i64:$rD, (zextloadi16 xaddr:$src))]>;
-def LWZX8 : XForm_1<31, 23, (outs g8rc:$rD), (ins memrr:$src),
- "lwzx $rD, $src", IIC_LdStLoad,
- [(set i64:$rD, (zextloadi32 xaddr:$src))]>;
+def LBZX8 : XForm_1_memOp<31, 87, (outs g8rc:$rD), (ins memrr:$src),
+ "lbzx $rD, $src", IIC_LdStLoad,
+ [(set i64:$rD, (zextloadi8 xaddr:$src))]>;
+def LHZX8 : XForm_1_memOp<31, 279, (outs g8rc:$rD), (ins memrr:$src),
+ "lhzx $rD, $src", IIC_LdStLoad,
+ [(set i64:$rD, (zextloadi16 xaddr:$src))]>;
+def LWZX8 : XForm_1_memOp<31, 23, (outs g8rc:$rD), (ins memrr:$src),
+ "lwzx $rD, $src", IIC_LdStLoad,
+ [(set i64:$rD, (zextloadi32 xaddr:$src))]>;
// Update forms.
let mayLoad = 1, hasSideEffects = 0 in {
-def LBZU8 : DForm_1<35, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+def LBZU8 : DForm_1<35, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memri:$addr),
"lbzu $rD, $addr", IIC_LdStLoadUpd,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
-def LHZU8 : DForm_1<41, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+def LHZU8 : DForm_1<41, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memri:$addr),
"lhzu $rD, $addr", IIC_LdStLoadUpd,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
-def LWZU8 : DForm_1<33, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+def LWZU8 : DForm_1<33, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memri:$addr),
"lwzu $rD, $addr", IIC_LdStLoadUpd,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
-def LBZUX8 : XForm_1<31, 119, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
- (ins memrr:$addr),
- "lbzux $rD, $addr", IIC_LdStLoadUpdX,
- []>, RegConstraint<"$addr.ptrreg = $ea_result">,
- NoEncode<"$ea_result">;
-def LHZUX8 : XForm_1<31, 311, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
- (ins memrr:$addr),
- "lhzux $rD, $addr", IIC_LdStLoadUpdX,
- []>, RegConstraint<"$addr.ptrreg = $ea_result">,
- NoEncode<"$ea_result">;
-def LWZUX8 : XForm_1<31, 55, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
- (ins memrr:$addr),
- "lwzux $rD, $addr", IIC_LdStLoadUpdX,
- []>, RegConstraint<"$addr.ptrreg = $ea_result">,
- NoEncode<"$ea_result">;
+def LBZUX8 : XForm_1_memOp<31, 119, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memrr:$addr),
+ "lbzux $rD, $addr", IIC_LdStLoadUpdX,
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+ NoEncode<"$ea_result">;
+def LHZUX8 : XForm_1_memOp<31, 311, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memrr:$addr),
+ "lhzux $rD, $addr", IIC_LdStLoadUpdX,
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+ NoEncode<"$ea_result">;
+def LWZUX8 : XForm_1_memOp<31, 55, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memrr:$addr),
+ "lwzux $rD, $addr", IIC_LdStLoadUpdX,
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+ NoEncode<"$ea_result">;
}
}
} // Interpretation64Bit
@@ -963,35 +1035,36 @@ def LDtocBA: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
[(set i64:$rD,
(PPCtoc_entry tblockaddress:$disp, i64:$reg))]>, isPPC64;
-def LDX : XForm_1<31, 21, (outs g8rc:$rD), (ins memrr:$src),
- "ldx $rD, $src", IIC_LdStLD,
- [(set i64:$rD, (load xaddr:$src))]>, isPPC64;
-def LDBRX : XForm_1<31, 532, (outs g8rc:$rD), (ins memrr:$src),
- "ldbrx $rD, $src", IIC_LdStLoad,
- [(set i64:$rD, (PPClbrx xoaddr:$src, i64))]>, isPPC64;
+def LDX : XForm_1_memOp<31, 21, (outs g8rc:$rD), (ins memrr:$src),
+ "ldx $rD, $src", IIC_LdStLD,
+ [(set i64:$rD, (load xaddr:$src))]>, isPPC64;
+def LDBRX : XForm_1_memOp<31, 532, (outs g8rc:$rD), (ins memrr:$src),
+ "ldbrx $rD, $src", IIC_LdStLoad,
+ [(set i64:$rD, (PPClbrx xoaddr:$src, i64))]>, isPPC64;
let mayLoad = 1, hasSideEffects = 0, isCodeGenOnly = 1 in {
-def LHBRX8 : XForm_1<31, 790, (outs g8rc:$rD), (ins memrr:$src),
- "lhbrx $rD, $src", IIC_LdStLoad, []>;
-def LWBRX8 : XForm_1<31, 534, (outs g8rc:$rD), (ins memrr:$src),
- "lwbrx $rD, $src", IIC_LdStLoad, []>;
+def LHBRX8 : XForm_1_memOp<31, 790, (outs g8rc:$rD), (ins memrr:$src),
+ "lhbrx $rD, $src", IIC_LdStLoad, []>;
+def LWBRX8 : XForm_1_memOp<31, 534, (outs g8rc:$rD), (ins memrr:$src),
+ "lwbrx $rD, $src", IIC_LdStLoad, []>;
}
let mayLoad = 1, hasSideEffects = 0 in {
-def LDU : DSForm_1<58, 1, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memrix:$addr),
+def LDU : DSForm_1<58, 1, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memrix:$addr),
"ldu $rD, $addr", IIC_LdStLDU,
[]>, RegConstraint<"$addr.reg = $ea_result">, isPPC64,
NoEncode<"$ea_result">;
-def LDUX : XForm_1<31, 53, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
- (ins memrr:$addr),
- "ldux $rD, $addr", IIC_LdStLDUX,
- []>, RegConstraint<"$addr.ptrreg = $ea_result">,
- NoEncode<"$ea_result">, isPPC64;
+def LDUX : XForm_1_memOp<31, 53, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memrr:$addr),
+ "ldux $rD, $addr", IIC_LdStLDUX,
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+ NoEncode<"$ea_result">, isPPC64;
def LDMX : XForm_1<31, 309, (outs g8rc:$rD), (ins memrr:$src),
"ldmx $rD, $src", IIC_LdStLD, []>, isPPC64,
- Requires<[IsISA3_0]>;
+ Requires<[IsISA3_0]>;
}
}
@@ -1116,32 +1189,32 @@ def STH8 : DForm_1<44, (outs), (ins g8rc:$rS, memri:$src),
def STW8 : DForm_1<36, (outs), (ins g8rc:$rS, memri:$src),
"stw $rS, $src", IIC_LdStStore,
[(truncstorei32 i64:$rS, iaddr:$src)]>;
-def STBX8 : XForm_8<31, 215, (outs), (ins g8rc:$rS, memrr:$dst),
- "stbx $rS, $dst", IIC_LdStStore,
- [(truncstorei8 i64:$rS, xaddr:$dst)]>,
- PPC970_DGroup_Cracked;
-def STHX8 : XForm_8<31, 407, (outs), (ins g8rc:$rS, memrr:$dst),
- "sthx $rS, $dst", IIC_LdStStore,
- [(truncstorei16 i64:$rS, xaddr:$dst)]>,
- PPC970_DGroup_Cracked;
-def STWX8 : XForm_8<31, 151, (outs), (ins g8rc:$rS, memrr:$dst),
- "stwx $rS, $dst", IIC_LdStStore,
- [(truncstorei32 i64:$rS, xaddr:$dst)]>,
- PPC970_DGroup_Cracked;
+def STBX8 : XForm_8_memOp<31, 215, (outs), (ins g8rc:$rS, memrr:$dst),
+ "stbx $rS, $dst", IIC_LdStStore,
+ [(truncstorei8 i64:$rS, xaddr:$dst)]>,
+ PPC970_DGroup_Cracked;
+def STHX8 : XForm_8_memOp<31, 407, (outs), (ins g8rc:$rS, memrr:$dst),
+ "sthx $rS, $dst", IIC_LdStStore,
+ [(truncstorei16 i64:$rS, xaddr:$dst)]>,
+ PPC970_DGroup_Cracked;
+def STWX8 : XForm_8_memOp<31, 151, (outs), (ins g8rc:$rS, memrr:$dst),
+ "stwx $rS, $dst", IIC_LdStStore,
+ [(truncstorei32 i64:$rS, xaddr:$dst)]>,
+ PPC970_DGroup_Cracked;
} // Interpretation64Bit
// Normal 8-byte stores.
def STD : DSForm_1<62, 0, (outs), (ins g8rc:$rS, memrix:$dst),
"std $rS, $dst", IIC_LdStSTD,
[(aligned4store i64:$rS, ixaddr:$dst)]>, isPPC64;
-def STDX : XForm_8<31, 149, (outs), (ins g8rc:$rS, memrr:$dst),
- "stdx $rS, $dst", IIC_LdStSTD,
- [(store i64:$rS, xaddr:$dst)]>, isPPC64,
- PPC970_DGroup_Cracked;
-def STDBRX: XForm_8<31, 660, (outs), (ins g8rc:$rS, memrr:$dst),
- "stdbrx $rS, $dst", IIC_LdStStore,
- [(PPCstbrx i64:$rS, xoaddr:$dst, i64)]>, isPPC64,
- PPC970_DGroup_Cracked;
+def STDX : XForm_8_memOp<31, 149, (outs), (ins g8rc:$rS, memrr:$dst),
+ "stdx $rS, $dst", IIC_LdStSTD,
+ [(store i64:$rS, xaddr:$dst)]>, isPPC64,
+ PPC970_DGroup_Cracked;
+def STDBRX: XForm_8_memOp<31, 660, (outs), (ins g8rc:$rS, memrr:$dst),
+ "stdbrx $rS, $dst", IIC_LdStStore,
+ [(PPCstbrx i64:$rS, xoaddr:$dst, i64)]>, isPPC64,
+ PPC970_DGroup_Cracked;
}
// Stores with Update (pre-inc).
@@ -1157,29 +1230,38 @@ def STWU8 : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst),
"stwu $rS, $dst", IIC_LdStStoreUpd, []>,
RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
-def STBUX8: XForm_8<31, 247, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst),
- "stbux $rS, $dst", IIC_LdStStoreUpd, []>,
- RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
- PPC970_DGroup_Cracked;
-def STHUX8: XForm_8<31, 439, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst),
- "sthux $rS, $dst", IIC_LdStStoreUpd, []>,
- RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
- PPC970_DGroup_Cracked;
-def STWUX8: XForm_8<31, 183, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst),
- "stwux $rS, $dst", IIC_LdStStoreUpd, []>,
- RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
- PPC970_DGroup_Cracked;
+def STBUX8: XForm_8_memOp<31, 247, (outs ptr_rc_nor0:$ea_res),
+ (ins g8rc:$rS, memrr:$dst),
+ "stbux $rS, $dst", IIC_LdStStoreUpd, []>,
+ RegConstraint<"$dst.ptrreg = $ea_res">,
+ NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
+def STHUX8: XForm_8_memOp<31, 439, (outs ptr_rc_nor0:$ea_res),
+ (ins g8rc:$rS, memrr:$dst),
+ "sthux $rS, $dst", IIC_LdStStoreUpd, []>,
+ RegConstraint<"$dst.ptrreg = $ea_res">,
+ NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
+def STWUX8: XForm_8_memOp<31, 183, (outs ptr_rc_nor0:$ea_res),
+ (ins g8rc:$rS, memrr:$dst),
+ "stwux $rS, $dst", IIC_LdStStoreUpd, []>,
+ RegConstraint<"$dst.ptrreg = $ea_res">,
+ NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
} // Interpretation64Bit
-def STDU : DSForm_1<62, 1, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrix:$dst),
+def STDU : DSForm_1<62, 1, (outs ptr_rc_nor0:$ea_res),
+ (ins g8rc:$rS, memrix:$dst),
"stdu $rS, $dst", IIC_LdStSTDU, []>,
RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">,
isPPC64;
-def STDUX : XForm_8<31, 181, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst),
- "stdux $rS, $dst", IIC_LdStSTDUX, []>,
- RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
- PPC970_DGroup_Cracked, isPPC64;
+def STDUX : XForm_8_memOp<31, 181, (outs ptr_rc_nor0:$ea_res),
+ (ins g8rc:$rS, memrr:$dst),
+ "stdux $rS, $dst", IIC_LdStSTDUX, []>,
+ RegConstraint<"$dst.ptrreg = $ea_res">,
+ NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked, isPPC64;
}
// Patterns to match the pre-inc stores. We can't put the patterns on
diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td
index e751c149b0b3..24969d7ef853 100644
--- a/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -408,46 +408,46 @@ def MTVSCR : VXForm_5<1604, (outs), (ins vrrc:$vB),
[(int_ppc_altivec_mtvscr v4i32:$vB)]>;
let PPC970_Unit = 2, mayLoad = 1, mayStore = 0 in { // Loads.
-def LVEBX: XForm_1<31, 7, (outs vrrc:$vD), (ins memrr:$src),
+def LVEBX: XForm_1_memOp<31, 7, (outs vrrc:$vD), (ins memrr:$src),
"lvebx $vD, $src", IIC_LdStLoad,
[(set v16i8:$vD, (int_ppc_altivec_lvebx xoaddr:$src))]>;
-def LVEHX: XForm_1<31, 39, (outs vrrc:$vD), (ins memrr:$src),
+def LVEHX: XForm_1_memOp<31, 39, (outs vrrc:$vD), (ins memrr:$src),
"lvehx $vD, $src", IIC_LdStLoad,
[(set v8i16:$vD, (int_ppc_altivec_lvehx xoaddr:$src))]>;
-def LVEWX: XForm_1<31, 71, (outs vrrc:$vD), (ins memrr:$src),
+def LVEWX: XForm_1_memOp<31, 71, (outs vrrc:$vD), (ins memrr:$src),
"lvewx $vD, $src", IIC_LdStLoad,
[(set v4i32:$vD, (int_ppc_altivec_lvewx xoaddr:$src))]>;
-def LVX : XForm_1<31, 103, (outs vrrc:$vD), (ins memrr:$src),
+def LVX : XForm_1_memOp<31, 103, (outs vrrc:$vD), (ins memrr:$src),
"lvx $vD, $src", IIC_LdStLoad,
[(set v4i32:$vD, (int_ppc_altivec_lvx xoaddr:$src))]>;
-def LVXL : XForm_1<31, 359, (outs vrrc:$vD), (ins memrr:$src),
+def LVXL : XForm_1_memOp<31, 359, (outs vrrc:$vD), (ins memrr:$src),
"lvxl $vD, $src", IIC_LdStLoad,
[(set v4i32:$vD, (int_ppc_altivec_lvxl xoaddr:$src))]>;
}
-def LVSL : XForm_1<31, 6, (outs vrrc:$vD), (ins memrr:$src),
+def LVSL : XForm_1_memOp<31, 6, (outs vrrc:$vD), (ins memrr:$src),
"lvsl $vD, $src", IIC_LdStLoad,
[(set v16i8:$vD, (int_ppc_altivec_lvsl xoaddr:$src))]>,
PPC970_Unit_LSU;
-def LVSR : XForm_1<31, 38, (outs vrrc:$vD), (ins memrr:$src),
+def LVSR : XForm_1_memOp<31, 38, (outs vrrc:$vD), (ins memrr:$src),
"lvsr $vD, $src", IIC_LdStLoad,
[(set v16i8:$vD, (int_ppc_altivec_lvsr xoaddr:$src))]>,
PPC970_Unit_LSU;
let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in { // Stores.
-def STVEBX: XForm_8<31, 135, (outs), (ins vrrc:$rS, memrr:$dst),
+def STVEBX: XForm_8_memOp<31, 135, (outs), (ins vrrc:$rS, memrr:$dst),
"stvebx $rS, $dst", IIC_LdStStore,
[(int_ppc_altivec_stvebx v16i8:$rS, xoaddr:$dst)]>;
-def STVEHX: XForm_8<31, 167, (outs), (ins vrrc:$rS, memrr:$dst),
+def STVEHX: XForm_8_memOp<31, 167, (outs), (ins vrrc:$rS, memrr:$dst),
"stvehx $rS, $dst", IIC_LdStStore,
[(int_ppc_altivec_stvehx v8i16:$rS, xoaddr:$dst)]>;
-def STVEWX: XForm_8<31, 199, (outs), (ins vrrc:$rS, memrr:$dst),
+def STVEWX: XForm_8_memOp<31, 199, (outs), (ins vrrc:$rS, memrr:$dst),
"stvewx $rS, $dst", IIC_LdStStore,
[(int_ppc_altivec_stvewx v4i32:$rS, xoaddr:$dst)]>;
-def STVX : XForm_8<31, 231, (outs), (ins vrrc:$rS, memrr:$dst),
+def STVX : XForm_8_memOp<31, 231, (outs), (ins vrrc:$rS, memrr:$dst),
"stvx $rS, $dst", IIC_LdStStore,
[(int_ppc_altivec_stvx v4i32:$rS, xoaddr:$dst)]>;
-def STVXL : XForm_8<31, 487, (outs), (ins vrrc:$rS, memrr:$dst),
+def STVXL : XForm_8_memOp<31, 487, (outs), (ins vrrc:$rS, memrr:$dst),
"stvxl $rS, $dst", IIC_LdStStore,
[(int_ppc_altivec_stvxl v4i32:$rS, xoaddr:$dst)]>;
}
@@ -705,7 +705,7 @@ def VSPLTH : VXForm_1<588, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
(vsplth_shuffle:$UIMM v16i8:$vB, (undef)))]>;
def VSPLTW : VXForm_1<652, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
"vspltw $vD, $vB, $UIMM", IIC_VecPerm,
- [(set v16i8:$vD,
+ [(set v16i8:$vD,
(vspltw_shuffle:$UIMM v16i8:$vB, (undef)))]>;
let isCodeGenOnly = 1 in {
def VSPLTBs : VXForm_1<524, (outs vrrc:$vD), (ins u5imm:$UIMM, vfrc:$vB),
@@ -962,7 +962,7 @@ def : Pat<(and v4i32:$A, (vnot_ppc v4i32:$B)),
def : Pat<(fmul v4f32:$vA, v4f32:$vB),
(VMADDFP $vA, $vB,
- (v4i32 (VSLW (V_SETALLONES), (V_SETALLONES))))>;
+ (v4i32 (VSLW (v4i32 (V_SETALLONES)), (v4i32 (V_SETALLONES)))))>;
// Fused multiply add and multiply sub for packed float. These are represented
// separately from the real instructions above, for operations that must have
@@ -991,7 +991,7 @@ def : Pat<(v8i16 (shl v8i16:$vA, v8i16:$vB)),
def : Pat<(v4i32 (shl v4i32:$vA, v4i32:$vB)),
(v4i32 (VSLW $vA, $vB))>;
def : Pat<(v1i128 (shl v1i128:$vA, v1i128:$vB)),
- (v1i128 (VSL (VSLO $vA, $vB), (VSPLTB 15, $vB)))>;
+ (v1i128 (VSL (v16i8 (VSLO $vA, $vB)), (v16i8 (VSPLTB 15, $vB))))>;
def : Pat<(v16i8 (PPCshl v16i8:$vA, v16i8:$vB)),
(v16i8 (VSLB $vA, $vB))>;
def : Pat<(v8i16 (PPCshl v8i16:$vA, v8i16:$vB)),
@@ -999,7 +999,7 @@ def : Pat<(v8i16 (PPCshl v8i16:$vA, v8i16:$vB)),
def : Pat<(v4i32 (PPCshl v4i32:$vA, v4i32:$vB)),
(v4i32 (VSLW $vA, $vB))>;
def : Pat<(v1i128 (PPCshl v1i128:$vA, v1i128:$vB)),
- (v1i128 (VSL (VSLO $vA, $vB), (VSPLTB 15, $vB)))>;
+ (v1i128 (VSL (v16i8 (VSLO $vA, $vB)), (v16i8 (VSPLTB 15, $vB))))>;
def : Pat<(v16i8 (srl v16i8:$vA, v16i8:$vB)),
(v16i8 (VSRB $vA, $vB))>;
@@ -1008,7 +1008,7 @@ def : Pat<(v8i16 (srl v8i16:$vA, v8i16:$vB)),
def : Pat<(v4i32 (srl v4i32:$vA, v4i32:$vB)),
(v4i32 (VSRW $vA, $vB))>;
def : Pat<(v1i128 (srl v1i128:$vA, v1i128:$vB)),
- (v1i128 (VSR (VSRO $vA, $vB), (VSPLTB 15, $vB)))>;
+ (v1i128 (VSR (v16i8 (VSRO $vA, $vB)), (v16i8 (VSPLTB 15, $vB))))>;
def : Pat<(v16i8 (PPCsrl v16i8:$vA, v16i8:$vB)),
(v16i8 (VSRB $vA, $vB))>;
def : Pat<(v8i16 (PPCsrl v8i16:$vA, v8i16:$vB)),
@@ -1016,7 +1016,7 @@ def : Pat<(v8i16 (PPCsrl v8i16:$vA, v8i16:$vB)),
def : Pat<(v4i32 (PPCsrl v4i32:$vA, v4i32:$vB)),
(v4i32 (VSRW $vA, $vB))>;
def : Pat<(v1i128 (PPCsrl v1i128:$vA, v1i128:$vB)),
- (v1i128 (VSR (VSRO $vA, $vB), (VSPLTB 15, $vB)))>;
+ (v1i128 (VSR (v16i8 (VSRO $vA, $vB)), (v16i8 (VSPLTB 15, $vB))))>;
def : Pat<(v16i8 (sra v16i8:$vA, v16i8:$vB)),
(v16i8 (VSRAB $vA, $vB))>;
@@ -1078,10 +1078,12 @@ def VMINUD : VX1_Int_Ty<706, "vminud", int_ppc_altivec_vminud, v2i64>;
// Vector merge
def VMRGEW : VXForm_1<1932, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vmrgew $vD, $vA, $vB", IIC_VecFP,
- [(set v16i8:$vD, (vmrgew_shuffle v16i8:$vA, v16i8:$vB))]>;
+ [(set v16i8:$vD,
+ (v16i8 (vmrgew_shuffle v16i8:$vA, v16i8:$vB)))]>;
def VMRGOW : VXForm_1<1676, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vmrgow $vD, $vA, $vB", IIC_VecFP,
- [(set v16i8:$vD, (vmrgow_shuffle v16i8:$vA, v16i8:$vB))]>;
+ [(set v16i8:$vD,
+ (v16i8 (vmrgow_shuffle v16i8:$vA, v16i8:$vB)))]>;
// Match vmrgew(x,x) and vmrgow(x,x)
def:Pat<(vmrgew_unary_shuffle v16i8:$vA, undef),
@@ -1502,18 +1504,4 @@ def VABSDUW : VXForm_1<1155, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vabsduw $vD, $vA, $vB", IIC_VecGeneral,
[(set v4i32:$vD, (int_ppc_altivec_vabsduw v4i32:$vA, v4i32:$vB))]>;
-def : Pat<(v16i8:$vD (abs v16i8:$vA)),
- (v16i8 (VABSDUB $vA, (V_SET0B)))>;
-def : Pat<(v8i16:$vD (abs v8i16:$vA)),
- (v8i16 (VABSDUH $vA, (V_SET0H)))>;
-def : Pat<(v4i32:$vD (abs v4i32:$vA)),
- (v4i32 (VABSDUW $vA, (V_SET0)))>;
-
-def : Pat<(v16i8:$vD (abs (sub v16i8:$vA, v16i8:$vB))),
- (v16i8 (VABSDUB $vA, $vB))>;
-def : Pat<(v8i16:$vD (abs (sub v8i16:$vA, v8i16:$vB))),
- (v8i16 (VABSDUH $vA, $vB))>;
-def : Pat<(v4i32:$vD (abs (sub v4i32:$vA, v4i32:$vB))),
- (v4i32 (VABSDUW $vA, $vB))>;
-
} // end HasP9Altivec
diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td
index f2845415ecb5..f5f4b46344cf 100644
--- a/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/lib/Target/PowerPC/PPCInstrFormats.td
@@ -46,6 +46,10 @@ class I<bits<6> opcode, dag OOL, dag IOL, string asmstr, InstrItinClass itin>
bits<1> UseVSXReg = 0;
let TSFlags{6} = UseVSXReg;
+ // Indicate that this instruction is of type X-Form Load or Store
+ bits<1> XFormMemOp = 0;
+ let TSFlags{7} = XFormMemOp;
+
// Fields used for relation models.
string BaseName = "";
@@ -71,6 +75,7 @@ class PPC970_Unit_VPERM { bits<3> PPC970_Unit = 6; }
class PPC970_Unit_BRU { bits<3> PPC970_Unit = 7; }
class UseVSXReg { bits<1> UseVSXReg = 1; }
+class XFormMemOp { bits<1> XFormMemOp = 1; }
// Two joined instructions; used to emit two adjacent instructions as one.
// The itinerary from the first instruction is used for scheduling and
@@ -109,6 +114,11 @@ class I2<bits<6> opcode1, bits<6> opcode2, dag OOL, dag IOL, string asmstr,
bit Interpretation64Bit = 0;
}
+// Base class for all X-Form memory instructions
+class IXFormMemOp<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ :I<opcode, OOL, IOL, asmstr, itin>, XFormMemOp;
+
// 1.7.1 I-Form
class IForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
@@ -437,6 +447,11 @@ class XForm_base_r3xo<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asms
let Inst{31} = RC;
}
+class XForm_base_r3xo_memOp<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin,
+ list<dag> pattern>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern>, XFormMemOp;
+
class XForm_tlb<bits<10> xo, dag OOL, dag IOL, string asmstr,
InstrItinClass itin> : XForm_base_r3xo<31, xo, OOL, IOL, asmstr, itin, []> {
let RST = 0;
@@ -469,9 +484,13 @@ class XForm_base_r3xo_swapped
class XForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
- InstrItinClass itin, list<dag> pattern>
+ InstrItinClass itin, list<dag> pattern>
: XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern>;
+class XForm_1_memOp<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo_memOp<opcode, xo, OOL, IOL, asmstr, itin, pattern>;
+
class XForm_1a<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
: XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
@@ -511,6 +530,10 @@ class XForm_8<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
: XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern>;
+class XForm_8_memOp<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo_memOp<opcode, xo, OOL, IOL, asmstr, itin, pattern>;
+
class XForm_10<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
: XForm_base_r3xo_swapped<opcode, xo, OOL, IOL, asmstr, itin> {
@@ -692,24 +715,34 @@ class XForm_24_sync<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
}
class XForm_24_eieio<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
- string asmstr, InstrItinClass itin, list<dag> pattern>
+ string asmstr, InstrItinClass itin, list<dag> pattern>
: XForm_24_sync<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
let L = 0;
}
class XForm_25<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
- InstrItinClass itin, list<dag> pattern>
+ InstrItinClass itin, list<dag> pattern>
: XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
}
+class XForm_25_memOp<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo_memOp<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+}
+
class XForm_26<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
: XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
let A = 0;
}
+class XForm_28_memOp<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo_memOp<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+}
+
class XForm_28<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
- InstrItinClass itin, list<dag> pattern>
+ InstrItinClass itin, list<dag> pattern>
: XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
}
@@ -980,7 +1013,7 @@ class X_RD6_IMM8<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
// to specify an SDAG pattern for matching.
class X_RD5_RS5_IM5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
string asmstr, InstrItinClass itin>
- : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, []> {
+ : XForm_base_r3xo_memOp<opcode, xo, OOL, IOL, asmstr, itin, []> {
}
class X_BF3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
@@ -1018,6 +1051,10 @@ class XX1Form<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
let Inst{31} = XT{5};
}
+class XX1Form_memOp<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : XX1Form<opcode, xo, OOL, IOL, asmstr, itin, pattern>, XFormMemOp;
+
class XX1_RS6_RD5_XO<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
string asmstr, InstrItinClass itin, list<dag> pattern>
: XX1Form<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
@@ -2094,6 +2131,27 @@ class Z23Form_3<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
let Inst{31} = RC;
}
+class Z23Form_8<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> VRT;
+ bit R;
+ bits<5> VRB;
+ bits<2> idx;
+
+ let Pattern = pattern;
+
+ bit RC = 0; // set by isDOT
+
+ let Inst{6-10} = VRT;
+ let Inst{11-14} = 0;
+ let Inst{15} = R;
+ let Inst{16-20} = VRB;
+ let Inst{21-22} = idx;
+ let Inst{23-30} = xo;
+ let Inst{31} = RC;
+}
+
//===----------------------------------------------------------------------===//
class Pseudo<dag OOL, dag IOL, string asmstr, list<dag> pattern>
: I<0, OOL, IOL, asmstr, NoItinerary> {
@@ -2103,3 +2161,7 @@ class Pseudo<dag OOL, dag IOL, string asmstr, list<dag> pattern>
let Inst{31-0} = 0;
let hasNoSchedulingInfo = 1;
}
+
+class PseudoXFormMemOp<dag OOL, dag IOL, string asmstr, list<dag> pattern>
+ : Pseudo<OOL, IOL, asmstr, pattern>, XFormMemOp;
+
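The PPCInstrFormats.td hunks above introduce a TSFlags bit (bit 7) that marks X-Form loads and stores, plus the *_memOp wrapper classes instruction definitions use to opt in. A minimal sketch of the flag-and-query pattern follows; it assumes NewDef_Shift is 6 (consistent with TSFlags{6}/TSFlags{7} above, but not shown in this diff) and uses FakeInstrDesc as a stand-in for the generated MCInstrDesc:

#include <cstdint>

namespace PPCII {
enum {
  NewDef_Shift = 6,                        // assumed value, not part of this diff
  UseVSXReg  = 0x1 << NewDef_Shift,        // TSFlags{6}
  XFormMemOp = 0x1 << (NewDef_Shift + 1)   // TSFlags{7}, set by the XFormMemOp class
};
} // namespace PPCII

struct FakeInstrDesc { uint64_t TSFlags; }; // stand-in for llvm::MCInstrDesc

// Mirrors the isXFormMemOp() accessor added to PPCInstrInfo.h later in this
// patch: once TableGen bakes the bit into each description, the query is a
// single mask test.
static bool isXFormMemOp(const FakeInstrDesc &Desc) {
  return (Desc.TSFlags & PPCII::XFormMemOp) != 0;
}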
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index ffb5cc8757f2..4669719744bc 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -55,6 +55,8 @@ STATISTIC(CmpIselsConverted,
"Number of ISELs that depend on comparison of constants converted");
STATISTIC(MissedConvertibleImmediateInstrs,
"Number of compare-immediate instructions fed by constants");
+STATISTIC(NumRcRotatesConvertedToRcAnd,
+ "Number of record-form rotates converted to record-form andi");
static cl::
opt<bool> DisableCTRLoopAnal("disable-ppc-ctrloop-analysis", cl::Hidden,
@@ -71,6 +73,28 @@ static cl::opt<bool>
UseOldLatencyCalc("ppc-old-latency-calc", cl::Hidden,
cl::desc("Use the old (incorrect) instruction latency calculation"));
+// Index into the OpcodesForSpill array.
+enum SpillOpcodeKey {
+ SOK_Int4Spill,
+ SOK_Int8Spill,
+ SOK_Float8Spill,
+ SOK_Float4Spill,
+ SOK_CRSpill,
+ SOK_CRBitSpill,
+ SOK_VRVectorSpill,
+ SOK_VSXVectorSpill,
+ SOK_VectorFloat8Spill,
+ SOK_VectorFloat4Spill,
+ SOK_VRSaveSpill,
+ SOK_QuadFloat8Spill,
+ SOK_QuadFloat4Spill,
+ SOK_QuadBitSpill,
+ SOK_SpillToVSR,
+ SOK_SPESpill,
+ SOK_SPE4Spill,
+ SOK_LastOpcodeSpill // This must be last on the enum.
+};
+
// Pin the vtable to this file.
void PPCInstrInfo::anchor() {}
@@ -275,23 +299,11 @@ bool PPCInstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
- // Note: This list must be kept consistent with LoadRegFromStackSlot.
- switch (MI.getOpcode()) {
- default: break;
- case PPC::LD:
- case PPC::LWZ:
- case PPC::LFS:
- case PPC::LFD:
- case PPC::RESTORE_CR:
- case PPC::RESTORE_CRBIT:
- case PPC::LVX:
- case PPC::LXVD2X:
- case PPC::LXV:
- case PPC::QVLFDX:
- case PPC::QVLFSXs:
- case PPC::QVLFDXb:
- case PPC::RESTORE_VRSAVE:
- case PPC::SPILLTOVSR_LD:
+ unsigned Opcode = MI.getOpcode();
+ const unsigned *OpcodesForSpill = getLoadOpcodesForSpillArray();
+ const unsigned *End = OpcodesForSpill + SOK_LastOpcodeSpill;
+
+ if (End != std::find(OpcodesForSpill, End, Opcode)) {
// Check for the operands added by addFrameReference (the immediate is the
// offset which defaults to 0).
if (MI.getOperand(1).isImm() && !MI.getOperand(1).getImm() &&
@@ -299,7 +311,6 @@ unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
FrameIndex = MI.getOperand(2).getIndex();
return MI.getOperand(0).getReg();
}
- break;
}
return 0;
}
@@ -329,31 +340,16 @@ bool PPCInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
unsigned PPCInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
- // Note: This list must be kept consistent with StoreRegToStackSlot.
- switch (MI.getOpcode()) {
- default: break;
- case PPC::STD:
- case PPC::STW:
- case PPC::STFS:
- case PPC::STFD:
- case PPC::SPILL_CR:
- case PPC::SPILL_CRBIT:
- case PPC::STVX:
- case PPC::STXVD2X:
- case PPC::STXV:
- case PPC::QVSTFDX:
- case PPC::QVSTFSXs:
- case PPC::QVSTFDXb:
- case PPC::SPILL_VRSAVE:
- case PPC::SPILLTOVSR_ST:
- // Check for the operands added by addFrameReference (the immediate is the
- // offset which defaults to 0).
+ unsigned Opcode = MI.getOpcode();
+ const unsigned *OpcodesForSpill = getStoreOpcodesForSpillArray();
+ const unsigned *End = OpcodesForSpill + SOK_LastOpcodeSpill;
+
+ if (End != std::find(OpcodesForSpill, End, Opcode)) {
if (MI.getOperand(1).isImm() && !MI.getOperand(1).getImm() &&
MI.getOperand(2).isFI()) {
FrameIndex = MI.getOperand(2).getIndex();
return MI.getOperand(0).getReg();
}
- break;
}
return 0;
}
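After this change, isLoadFromStackSlot and isStoreToStackSlot test the opcode against the same per-subtarget spill table that is used to emit spills, instead of a hand-maintained switch. A sketch of that membership test (Table and NumEntries are illustrative parameters; in the patch the length is SOK_LastOpcodeSpill):

#include <algorithm>

static bool isSpillOpcode(unsigned Opcode, const unsigned *Table,
                          unsigned NumEntries) {
  const unsigned *End = Table + NumEntries;
  return std::find(Table, End, Opcode) != End; // linear scan over a small table
}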
@@ -955,8 +951,19 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
BuildMI(MBB, I, DL, get(PPC::MFVSRD), DestReg).addReg(SrcReg);
getKillRegState(KillSrc);
return;
+ } else if (PPC::SPERCRegClass.contains(SrcReg) &&
+ PPC::SPE4RCRegClass.contains(DestReg)) {
+ BuildMI(MBB, I, DL, get(PPC::EFSCFD), DestReg).addReg(SrcReg);
+ getKillRegState(KillSrc);
+ return;
+ } else if (PPC::SPE4RCRegClass.contains(SrcReg) &&
+ PPC::SPERCRegClass.contains(DestReg)) {
+ BuildMI(MBB, I, DL, get(PPC::EFDCFS), DestReg).addReg(SrcReg);
+ getKillRegState(KillSrc);
+ return;
}
+
unsigned Opc;
if (PPC::GPRCRegClass.contains(DestReg, SrcReg))
Opc = PPC::OR;
@@ -989,6 +996,8 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
Opc = PPC::QVFMRb;
else if (PPC::CRBITRCRegClass.contains(DestReg, SrcReg))
Opc = PPC::CROR;
+ else if (PPC::SPERCRegClass.contains(DestReg, SrcReg))
+ Opc = PPC::EVOR;
else
llvm_unreachable("Impossible reg-to-reg copy");
@@ -1000,129 +1009,212 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
BuildMI(MBB, I, DL, MCID, DestReg).addReg(SrcReg, getKillRegState(KillSrc));
}
-// This function returns true if a CR spill is necessary and false otherwise.
-bool
-PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
- unsigned SrcReg, bool isKill,
- int FrameIdx,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs,
- bool &NonRI, bool &SpillsVRS) const{
- // Note: If additional store instructions are added here,
- // update isStoreToStackSlot.
-
- DebugLoc DL;
- if (PPC::GPRCRegClass.hasSubClassEq(RC) ||
- PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STW))
- .addReg(SrcReg,
- getKillRegState(isKill)),
- FrameIdx));
- } else if (PPC::G8RCRegClass.hasSubClassEq(RC) ||
- PPC::G8RC_NOX0RegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STD))
- .addReg(SrcReg,
- getKillRegState(isKill)),
- FrameIdx));
- } else if (PPC::F8RCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STFD))
- .addReg(SrcReg,
- getKillRegState(isKill)),
- FrameIdx));
- } else if (PPC::F4RCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STFS))
- .addReg(SrcReg,
- getKillRegState(isKill)),
- FrameIdx));
- } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_CR))
- .addReg(SrcReg,
- getKillRegState(isKill)),
- FrameIdx));
- return true;
- } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_CRBIT))
- .addReg(SrcReg,
- getKillRegState(isKill)),
- FrameIdx));
- return true;
- } else if (PPC::VRRCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STVX))
- .addReg(SrcReg,
- getKillRegState(isKill)),
- FrameIdx));
- NonRI = true;
- } else if (PPC::VSRCRegClass.hasSubClassEq(RC)) {
- unsigned Op = Subtarget.hasP9Vector() ? PPC::STXV : PPC::STXVD2X;
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Op))
- .addReg(SrcReg,
- getKillRegState(isKill)),
- FrameIdx));
- NonRI = true;
- } else if (PPC::VSFRCRegClass.hasSubClassEq(RC)) {
- unsigned Opc = Subtarget.hasP9Vector() ? PPC::DFSTOREf64 : PPC::STXSDX;
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Opc))
- .addReg(SrcReg,
- getKillRegState(isKill)),
- FrameIdx));
- NonRI = true;
- } else if (PPC::VSSRCRegClass.hasSubClassEq(RC)) {
- unsigned Opc = Subtarget.hasP9Vector() ? PPC::DFSTOREf32 : PPC::STXSSPX;
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Opc))
- .addReg(SrcReg,
- getKillRegState(isKill)),
- FrameIdx));
- NonRI = true;
- } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) {
- assert(Subtarget.isDarwin() &&
- "VRSAVE only needs spill/restore on Darwin");
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_VRSAVE))
- .addReg(SrcReg,
- getKillRegState(isKill)),
- FrameIdx));
- SpillsVRS = true;
- } else if (PPC::QFRCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVSTFDX))
- .addReg(SrcReg,
- getKillRegState(isKill)),
- FrameIdx));
- NonRI = true;
- } else if (PPC::QSRCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVSTFSXs))
- .addReg(SrcReg,
- getKillRegState(isKill)),
- FrameIdx));
- NonRI = true;
- } else if (PPC::QBRCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVSTFDXb))
- .addReg(SrcReg,
- getKillRegState(isKill)),
- FrameIdx));
- NonRI = true;
- } else if (PPC::SPILLTOVSRRCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILLTOVSR_ST))
- .addReg(SrcReg,
- getKillRegState(isKill)),
- FrameIdx));
+unsigned PPCInstrInfo::getStoreOpcodeForSpill(unsigned Reg,
+ const TargetRegisterClass *RC)
+ const {
+ const unsigned *OpcodesForSpill = getStoreOpcodesForSpillArray();
+ int OpcodeIndex = 0;
+
+ if (RC != nullptr) {
+ if (PPC::GPRCRegClass.hasSubClassEq(RC) ||
+ PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_Int4Spill;
+ } else if (PPC::G8RCRegClass.hasSubClassEq(RC) ||
+ PPC::G8RC_NOX0RegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_Int8Spill;
+ } else if (PPC::F8RCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_Float8Spill;
+ } else if (PPC::F4RCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_Float4Spill;
+ } else if (PPC::SPERCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_SPESpill;
+ } else if (PPC::SPE4RCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_SPE4Spill;
+ } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_CRSpill;
+ } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_CRBitSpill;
+ } else if (PPC::VRRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_VRVectorSpill;
+ } else if (PPC::VSRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_VSXVectorSpill;
+ } else if (PPC::VSFRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_VectorFloat8Spill;
+ } else if (PPC::VSSRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_VectorFloat4Spill;
+ } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_VRSaveSpill;
+ } else if (PPC::QFRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_QuadFloat8Spill;
+ } else if (PPC::QSRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_QuadFloat4Spill;
+ } else if (PPC::QBRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_QuadBitSpill;
+ } else if (PPC::SPILLTOVSRRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_SpillToVSR;
+ } else {
+ llvm_unreachable("Unknown regclass!");
+ }
} else {
- llvm_unreachable("Unknown regclass!");
+ if (PPC::GPRCRegClass.contains(Reg) ||
+ PPC::GPRC_NOR0RegClass.contains(Reg)) {
+ OpcodeIndex = SOK_Int4Spill;
+ } else if (PPC::G8RCRegClass.contains(Reg) ||
+ PPC::G8RC_NOX0RegClass.contains(Reg)) {
+ OpcodeIndex = SOK_Int8Spill;
+ } else if (PPC::F8RCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_Float8Spill;
+ } else if (PPC::F4RCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_Float4Spill;
+ } else if (PPC::CRRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_CRSpill;
+ } else if (PPC::CRBITRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_CRBitSpill;
+ } else if (PPC::VRRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_VRVectorSpill;
+ } else if (PPC::VSRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_VSXVectorSpill;
+ } else if (PPC::VSFRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_VectorFloat8Spill;
+ } else if (PPC::VSSRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_VectorFloat4Spill;
+ } else if (PPC::VRSAVERCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_VRSaveSpill;
+ } else if (PPC::QFRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_QuadFloat8Spill;
+ } else if (PPC::QSRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_QuadFloat4Spill;
+ } else if (PPC::QBRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_QuadBitSpill;
+ } else if (PPC::SPILLTOVSRRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_SpillToVSR;
+ } else {
+ llvm_unreachable("Unknown regclass!");
+ }
}
+ return OpcodesForSpill[OpcodeIndex];
+}
- return false;
+unsigned
+PPCInstrInfo::getLoadOpcodeForSpill(unsigned Reg,
+ const TargetRegisterClass *RC) const {
+ const unsigned *OpcodesForSpill = getLoadOpcodesForSpillArray();
+ int OpcodeIndex = 0;
+
+ if (RC != nullptr) {
+ if (PPC::GPRCRegClass.hasSubClassEq(RC) ||
+ PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_Int4Spill;
+ } else if (PPC::G8RCRegClass.hasSubClassEq(RC) ||
+ PPC::G8RC_NOX0RegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_Int8Spill;
+ } else if (PPC::F8RCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_Float8Spill;
+ } else if (PPC::F4RCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_Float4Spill;
+ } else if (PPC::SPERCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_SPESpill;
+ } else if (PPC::SPE4RCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_SPE4Spill;
+ } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_CRSpill;
+ } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_CRBitSpill;
+ } else if (PPC::VRRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_VRVectorSpill;
+ } else if (PPC::VSRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_VSXVectorSpill;
+ } else if (PPC::VSFRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_VectorFloat8Spill;
+ } else if (PPC::VSSRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_VectorFloat4Spill;
+ } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_VRSaveSpill;
+ } else if (PPC::QFRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_QuadFloat8Spill;
+ } else if (PPC::QSRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_QuadFloat4Spill;
+ } else if (PPC::QBRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_QuadBitSpill;
+ } else if (PPC::SPILLTOVSRRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_SpillToVSR;
+ } else {
+ llvm_unreachable("Unknown regclass!");
+ }
+ } else {
+ if (PPC::GPRCRegClass.contains(Reg) ||
+ PPC::GPRC_NOR0RegClass.contains(Reg)) {
+ OpcodeIndex = SOK_Int4Spill;
+ } else if (PPC::G8RCRegClass.contains(Reg) ||
+ PPC::G8RC_NOX0RegClass.contains(Reg)) {
+ OpcodeIndex = SOK_Int8Spill;
+ } else if (PPC::F8RCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_Float8Spill;
+ } else if (PPC::F4RCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_Float4Spill;
+ } else if (PPC::CRRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_CRSpill;
+ } else if (PPC::CRBITRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_CRBitSpill;
+ } else if (PPC::VRRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_VRVectorSpill;
+ } else if (PPC::VSRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_VSXVectorSpill;
+ } else if (PPC::VSFRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_VectorFloat8Spill;
+ } else if (PPC::VSSRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_VectorFloat4Spill;
+ } else if (PPC::VRSAVERCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_VRSaveSpill;
+ } else if (PPC::QFRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_QuadFloat8Spill;
+ } else if (PPC::QSRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_QuadFloat4Spill;
+ } else if (PPC::QBRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_QuadBitSpill;
+ } else if (PPC::SPILLTOVSRRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_SpillToVSR;
+ } else {
+ llvm_unreachable("Unknown regclass!");
+ }
+ }
+ return OpcodesForSpill[OpcodeIndex];
}
-void
-PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- unsigned SrcReg, bool isKill, int FrameIdx,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const {
- MachineFunction &MF = *MBB.getParent();
- SmallVector<MachineInstr*, 4> NewMIs;
+void PPCInstrInfo::StoreRegToStackSlot(
+ MachineFunction &MF, unsigned SrcReg, bool isKill, int FrameIdx,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr *> &NewMIs) const {
+ unsigned Opcode = getStoreOpcodeForSpill(PPC::NoRegister, RC);
+ DebugLoc DL;
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
FuncInfo->setHasSpills();
+ NewMIs.push_back(addFrameReference(
+ BuildMI(MF, DL, get(Opcode)).addReg(SrcReg, getKillRegState(isKill)),
+ FrameIdx));
+
+ if (PPC::CRRCRegClass.hasSubClassEq(RC) ||
+ PPC::CRBITRCRegClass.hasSubClassEq(RC))
+ FuncInfo->setSpillsCR();
+
+ if (PPC::VRSAVERCRegClass.hasSubClassEq(RC))
+ FuncInfo->setSpillsVRSAVE();
+
+ if (isXFormMemOp(Opcode))
+ FuncInfo->setHasNonRISpills();
+}
+
+void PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill,
+ int FrameIdx,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ MachineFunction &MF = *MBB.getParent();
+ SmallVector<MachineInstr *, 4> NewMIs;
+
// We need to avoid a situation in which the value from a VRRC register is
// spilled using an Altivec instruction and reloaded into a VSRC register
// using a VSX instruction. The issue with this is that the VSX
@@ -1132,16 +1224,7 @@ PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
// VSX instruction.
RC = updatedRC(RC);
- bool NonRI = false, SpillsVRS = false;
- if (StoreRegToStackSlot(MF, SrcReg, isKill, FrameIdx, RC, NewMIs,
- NonRI, SpillsVRS))
- FuncInfo->setSpillsCR();
-
- if (SpillsVRS)
- FuncInfo->setSpillsVRSAVE();
-
- if (NonRI)
- FuncInfo->setHasNonRISpills();
+ StoreRegToStackSlot(MF, SrcReg, isKill, FrameIdx, RC, NewMIs);
for (unsigned i = 0, e = NewMIs.size(); i != e; ++i)
MBB.insert(MI, NewMIs[i]);
@@ -1154,85 +1237,25 @@ PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
NewMIs.back()->addMemOperand(MF, MMO);
}
-bool PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL,
+void PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL,
unsigned DestReg, int FrameIdx,
const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr *> &NewMIs,
- bool &NonRI, bool &SpillsVRS) const {
- // Note: If additional load instructions are added here,
- // update isLoadFromStackSlot.
-
- if (PPC::GPRCRegClass.hasSubClassEq(RC) ||
- PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LWZ),
- DestReg), FrameIdx));
- } else if (PPC::G8RCRegClass.hasSubClassEq(RC) ||
- PPC::G8RC_NOX0RegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LD), DestReg),
- FrameIdx));
- } else if (PPC::F8RCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LFD), DestReg),
- FrameIdx));
- } else if (PPC::F4RCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LFS), DestReg),
- FrameIdx));
- } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL,
- get(PPC::RESTORE_CR), DestReg),
- FrameIdx));
- return true;
- } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL,
- get(PPC::RESTORE_CRBIT), DestReg),
- FrameIdx));
- return true;
- } else if (PPC::VRRCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LVX), DestReg),
- FrameIdx));
- NonRI = true;
- } else if (PPC::VSRCRegClass.hasSubClassEq(RC)) {
- unsigned Op = Subtarget.hasP9Vector() ? PPC::LXV : PPC::LXVD2X;
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Op), DestReg),
- FrameIdx));
- NonRI = true;
- } else if (PPC::VSFRCRegClass.hasSubClassEq(RC)) {
- unsigned Opc = Subtarget.hasP9Vector() ? PPC::DFLOADf64 : PPC::LXSDX;
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Opc),
- DestReg), FrameIdx));
- NonRI = true;
- } else if (PPC::VSSRCRegClass.hasSubClassEq(RC)) {
- unsigned Opc = Subtarget.hasP9Vector() ? PPC::DFLOADf32 : PPC::LXSSPX;
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Opc),
- DestReg), FrameIdx));
- NonRI = true;
- } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) {
- assert(Subtarget.isDarwin() &&
- "VRSAVE only needs spill/restore on Darwin");
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL,
- get(PPC::RESTORE_VRSAVE),
- DestReg),
- FrameIdx));
- SpillsVRS = true;
- } else if (PPC::QFRCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVLFDX), DestReg),
- FrameIdx));
- NonRI = true;
- } else if (PPC::QSRCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVLFSXs), DestReg),
- FrameIdx));
- NonRI = true;
- } else if (PPC::QBRCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVLFDXb), DestReg),
- FrameIdx));
- NonRI = true;
- } else if (PPC::SPILLTOVSRRCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILLTOVSR_LD),
- DestReg), FrameIdx));
- } else {
- llvm_unreachable("Unknown regclass!");
- }
+ SmallVectorImpl<MachineInstr *> &NewMIs)
+ const {
+ unsigned Opcode = getLoadOpcodeForSpill(PPC::NoRegister, RC);
+ NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Opcode), DestReg),
+ FrameIdx));
+ PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
- return false;
+ if (PPC::CRRCRegClass.hasSubClassEq(RC) ||
+ PPC::CRBITRCRegClass.hasSubClassEq(RC))
+ FuncInfo->setSpillsCR();
+
+ if (PPC::VRSAVERCRegClass.hasSubClassEq(RC))
+ FuncInfo->setSpillsVRSAVE();
+
+ if (isXFormMemOp(Opcode))
+ FuncInfo->setHasNonRISpills();
}
void
@@ -1259,16 +1282,7 @@ PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
if (Subtarget.hasVSX() && RC == &PPC::VRRCRegClass)
RC = &PPC::VSRCRegClass;
- bool NonRI = false, SpillsVRS = false;
- if (LoadRegFromStackSlot(MF, DL, DestReg, FrameIdx, RC, NewMIs,
- NonRI, SpillsVRS))
- FuncInfo->setSpillsCR();
-
- if (SpillsVRS)
- FuncInfo->setSpillsVRSAVE();
-
- if (NonRI)
- FuncInfo->setHasNonRISpills();
+ LoadRegFromStackSlot(MF, DL, DestReg, FrameIdx, RC, NewMIs);
for (unsigned i = 0, e = NewMIs.size(); i != e; ++i)
MBB.insert(MI, NewMIs[i]);
@@ -1617,7 +1631,7 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
int OpC = CmpInstr.getOpcode();
unsigned CRReg = CmpInstr.getOperand(0).getReg();
- // FP record forms set CR1 based on the execption status bits, not a
+ // FP record forms set CR1 based on the exception status bits, not a
// comparison with zero.
if (OpC == PPC::FCMPUS || OpC == PPC::FCMPUD)
return false;
@@ -1740,7 +1754,7 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
unsigned PredHint = PPC::getPredicateHint(Pred);
int16_t Immed = (int16_t)Value;
- // When modyfing the condition in the predicate, we propagate hint bits
+ // When modifying the condition in the predicate, we propagate hint bits
// from the original predicate to the new one.
if (Immed == -1 && PredCond == PPC::PRED_GT)
// We convert "greater than -1" into "greater than or equal to 0",
@@ -1897,6 +1911,31 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
// specifically the case if this is the instruction directly after the
// compare).
+ // Rotates are expensive instructions. If we're emitting a record-form
+ // rotate that can just be an andi, we should just emit the andi.
+ if ((MIOpC == PPC::RLWINM || MIOpC == PPC::RLWINM8) &&
+ MI->getOperand(2).getImm() == 0) {
+ int64_t MB = MI->getOperand(3).getImm();
+ int64_t ME = MI->getOperand(4).getImm();
+ if (MB < ME && MB >= 16) {
+ uint64_t Mask = ((1LLU << (32 - MB)) - 1) & ~((1LLU << (31 - ME)) - 1);
+ NewOpC = MIOpC == PPC::RLWINM ? PPC::ANDIo : PPC::ANDIo8;
+ MI->RemoveOperand(4);
+ MI->RemoveOperand(3);
+ MI->getOperand(2).setImm(Mask);
+ NumRcRotatesConvertedToRcAnd++;
+ }
+ } else if (MIOpC == PPC::RLDICL && MI->getOperand(2).getImm() == 0) {
+ int64_t MB = MI->getOperand(3).getImm();
+ if (MB >= 48) {
+ uint64_t Mask = (1LLU << (63 - MB + 1)) - 1;
+ NewOpC = PPC::ANDIo8;
+ MI->RemoveOperand(3);
+ MI->getOperand(2).setImm(Mask);
+ NumRcRotatesConvertedToRcAnd++;
+ }
+ }
+
const MCInstrDesc &NewDesc = get(NewOpC);
MI->setDesc(NewDesc);
@@ -2049,6 +2088,12 @@ bool PPCInstrInfo::expandVSXMemPseudo(MachineInstr &MI) const {
return true;
}
+#ifndef NDEBUG
+static bool isAnImmediateOperand(const MachineOperand &MO) {
+ return MO.isCPI() || MO.isGlobal() || MO.isImm();
+}
+#endif
+
bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
auto &MBB = *MI.getParent();
auto DL = MI.getDebugLoc();
@@ -2071,7 +2116,8 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case PPC::DFSTOREf64: {
assert(Subtarget.hasP9Vector() &&
"Invalid D-Form Pseudo-ops on Pre-P9 target.");
- assert(MI.getOperand(2).isReg() && MI.getOperand(1).isImm() &&
+ assert(MI.getOperand(2).isReg() &&
+ isAnImmediateOperand(MI.getOperand(1)) &&
"D-form op must have register and immediate operands");
return expandVSXMemPseudo(MI);
}
@@ -2151,28 +2197,6 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return false;
}
-unsigned PPCInstrInfo::lookThruCopyLike(unsigned SrcReg,
- const MachineRegisterInfo *MRI) {
- while (true) {
- MachineInstr *MI = MRI->getVRegDef(SrcReg);
- if (!MI->isCopyLike())
- return SrcReg;
-
- unsigned CopySrcReg;
- if (MI->isCopy())
- CopySrcReg = MI->getOperand(1).getReg();
- else {
- assert(MI->isSubregToReg() && "Bad opcode for lookThruCopyLike");
- CopySrcReg = MI->getOperand(2).getReg();
- }
-
- if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg))
- return CopySrcReg;
-
- SrcReg = CopySrcReg;
- }
-}
-
// Essentially a compile-time implementation of a compare->isel sequence.
// It takes two constants to compare, along with the true/false registers
// and the comparison type (as a subreg to a CR field) and returns one
@@ -2238,7 +2262,8 @@ MachineInstr *PPCInstrInfo::getConstantDefMI(MachineInstr &MI,
ConstOp = ~0U;
MachineInstr *DefMI = nullptr;
MachineRegisterInfo *MRI = &MI.getParent()->getParent()->getRegInfo();
- // If we'ere in SSA, get the defs through the MRI. Otherwise, only look
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ // If we're in SSA, get the defs through the MRI. Otherwise, only look
// within the basic block to see if the register is defined using an LI/LI8.
if (MRI->isSSA()) {
for (int i = 1, e = MI.getNumOperands(); i < e; i++) {
@@ -2247,7 +2272,7 @@ MachineInstr *PPCInstrInfo::getConstantDefMI(MachineInstr &MI,
unsigned Reg = MI.getOperand(i).getReg();
if (!TargetRegisterInfo::isVirtualRegister(Reg))
continue;
- unsigned TrueReg = lookThruCopyLike(Reg, MRI);
+ unsigned TrueReg = TRI->lookThruCopyLike(Reg, MRI);
if (TargetRegisterInfo::isVirtualRegister(TrueReg)) {
DefMI = MRI->getVRegDef(TrueReg);
if (DefMI->getOpcode() == PPC::LI || DefMI->getOpcode() == PPC::LI8) {
@@ -2313,6 +2338,38 @@ MachineInstr *PPCInstrInfo::getConstantDefMI(MachineInstr &MI,
return ConstOp == ~0U ? nullptr : DefMI;
}
+const unsigned *PPCInstrInfo::getStoreOpcodesForSpillArray() const {
+ static const unsigned OpcodesForSpill[2][SOK_LastOpcodeSpill] = {
+ // Power 8
+ {PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR,
+ PPC::SPILL_CRBIT, PPC::STVX, PPC::STXVD2X, PPC::STXSDX, PPC::STXSSPX,
+ PPC::SPILL_VRSAVE, PPC::QVSTFDX, PPC::QVSTFSXs, PPC::QVSTFDXb,
+ PPC::SPILLTOVSR_ST, PPC::EVSTDD, PPC::SPESTW},
+ // Power 9
+ {PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR,
+ PPC::SPILL_CRBIT, PPC::STVX, PPC::STXV, PPC::DFSTOREf64, PPC::DFSTOREf32,
+ PPC::SPILL_VRSAVE, PPC::QVSTFDX, PPC::QVSTFSXs, PPC::QVSTFDXb,
+ PPC::SPILLTOVSR_ST}};
+
+ return OpcodesForSpill[(Subtarget.hasP9Vector()) ? 1 : 0];
+}
+
+const unsigned *PPCInstrInfo::getLoadOpcodesForSpillArray() const {
+ static const unsigned OpcodesForSpill[2][SOK_LastOpcodeSpill] = {
+ // Power 8
+ {PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR,
+ PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXVD2X, PPC::LXSDX, PPC::LXSSPX,
+ PPC::RESTORE_VRSAVE, PPC::QVLFDX, PPC::QVLFSXs, PPC::QVLFDXb,
+ PPC::SPILLTOVSR_LD, PPC::EVLDD, PPC::SPELWZ},
+ // Power 9
+ {PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR,
+ PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXV, PPC::DFLOADf64, PPC::DFLOADf32,
+ PPC::RESTORE_VRSAVE, PPC::QVLFDX, PPC::QVLFSXs, PPC::QVLFDXb,
+ PPC::SPILLTOVSR_LD}};
+
+ return OpcodesForSpill[(Subtarget.hasP9Vector()) ? 1 : 0];
+}
+
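getStoreOpcodesForSpillArray/getLoadOpcodesForSpillArray select one of two rows (pre-Power9 vs. Power9 vector opcodes), and the SpillOpcodeKey enum added near the top of the file supplies the column index, so each row must list opcodes in exactly that enum order. A reduced sketch of that contract; the opcode values and the two SOK entries here are placeholders, not the real PPC opcodes:

enum SpillOpcodeKey { SOK_Int4Spill, SOK_Int8Spill, SOK_LastOpcodeSpill };

static const unsigned StoreOpcodesForSpill[2][SOK_LastOpcodeSpill] = {
    {/*STW*/ 101, /*STD*/ 102},  // row 0: no P9 vector support
    {/*STW*/ 101, /*STD*/ 102}}; // row 1: hasP9Vector()

static unsigned storeOpcodeForSpill(SpillOpcodeKey Key, bool HasP9Vector) {
  return StoreOpcodesForSpill[HasP9Vector ? 1 : 0][Key];
}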
// If this instruction has an immediate form and one of its operands is a
// result of a load-immediate, convert it to the immediate form if the constant
// is in range.
@@ -2391,16 +2448,17 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
CompareUseMI.RemoveOperand(2);
continue;
}
- DEBUG(dbgs() << "Found LI -> CMPI -> ISEL, replacing with a copy.\n");
- DEBUG(DefMI->dump(); MI.dump(); CompareUseMI.dump());
- DEBUG(dbgs() << "Is converted to:\n");
+ LLVM_DEBUG(
+ dbgs() << "Found LI -> CMPI -> ISEL, replacing with a copy.\n");
+ LLVM_DEBUG(DefMI->dump(); MI.dump(); CompareUseMI.dump());
+ LLVM_DEBUG(dbgs() << "Is converted to:\n");
// Convert to copy and remove unneeded operands.
CompareUseMI.setDesc(get(PPC::COPY));
CompareUseMI.RemoveOperand(3);
CompareUseMI.RemoveOperand(RegToCopy == TrueReg ? 2 : 1);
CmpIselsConverted++;
Changed = true;
- DEBUG(CompareUseMI.dump());
+ LLVM_DEBUG(CompareUseMI.dump());
}
if (Changed)
return true;
@@ -2431,9 +2489,10 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
// Use APInt's rotate function.
int64_t SH = MI.getOperand(2).getImm();
int64_t MB = MI.getOperand(3).getImm();
- APInt InVal(Opc == PPC::RLDICL ? 64 : 32, SExtImm, true);
+ APInt InVal((Opc == PPC::RLDICL || Opc == PPC::RLDICLo) ?
+ 64 : 32, SExtImm, true);
InVal = InVal.rotl(SH);
- uint64_t Mask = (1LU << (63 - MB + 1)) - 1;
+ uint64_t Mask = (1LLU << (63 - MB + 1)) - 1;
InVal &= Mask;
// Can't replace negative values with an LI as that will sign-extend
// and not clear the left bits. If we're setting the CR bit, we will use
@@ -2457,8 +2516,8 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
int64_t ME = MI.getOperand(4).getImm();
APInt InVal(32, SExtImm, true);
InVal = InVal.rotl(SH);
- // Set the bits ( MB + 32 ) to ( ME + 32 ).
- uint64_t Mask = ((1 << (32 - MB)) - 1) & ~((1 << (31 - ME)) - 1);
+ // Set the bits ( MB + 32 ) to ( ME + 32 ).
+ uint64_t Mask = ((1LLU << (32 - MB)) - 1) & ~((1LLU << (31 - ME)) - 1);
InVal &= Mask;
// Can't replace negative values with an LI as that will sign-extend
// and not clear the left bits. If we're setting the CR bit, we will use
@@ -2496,10 +2555,37 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
}
if (ReplaceWithLI) {
- DEBUG(dbgs() << "Replacing instruction:\n");
- DEBUG(MI.dump());
- DEBUG(dbgs() << "Fed by:\n");
- DEBUG(DefMI->dump());
+ // We need to be careful with CR-setting instructions we're replacing.
+ if (SetCR) {
+ // We don't know anything about uses when we're out of SSA, so only
+ // replace if the new immediate will be reproduced.
+ bool ImmChanged = (SExtImm & NewImm) != NewImm;
+ if (PostRA && ImmChanged)
+ return false;
+
+ if (!PostRA) {
+ // If the defining load-immediate has no other uses, we can just replace
+ // the immediate with the new immediate.
+ if (MRI->hasOneUse(DefMI->getOperand(0).getReg()))
+ DefMI->getOperand(1).setImm(NewImm);
+
+ // If we're not using the GPR result of the CR-setting instruction, we
+ // just need to and with zero/non-zero depending on the new immediate.
+ else if (MRI->use_empty(MI.getOperand(0).getReg())) {
+ if (NewImm) {
+ assert(Immediate && "Transformation converted zero to non-zero?");
+ NewImm = Immediate;
+ }
+ }
+ else if (ImmChanged)
+ return false;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "Replacing instruction:\n");
+ LLVM_DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "Fed by:\n");
+ LLVM_DEBUG(DefMI->dump());
LoadImmediateInfo LII;
LII.Imm = NewImm;
LII.Is64Bit = Is64BitLI;
@@ -2509,8 +2595,8 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
if (KilledDef && SetCR)
*KilledDef = nullptr;
replaceInstrWithLI(MI, LII);
- DEBUG(dbgs() << "With:\n");
- DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "With:\n");
+ LLVM_DEBUG(MI.dump());
return true;
}
return false;
@@ -2527,6 +2613,7 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
III.ConstantOpNo = 2;
III.ImmWidth = 16;
III.ImmMustBeMultipleOf = 1;
+ III.TruncateImmTo = 0;
switch (Opc) {
default: return false;
case PPC::ADD4:
@@ -2600,10 +2687,6 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
case PPC::RLWNM8:
case PPC::RLWNMo:
case PPC::RLWNM8o:
- case PPC::RLDCL:
- case PPC::RLDCLo:
- case PPC::RLDCR:
- case PPC::RLDCRo:
case PPC::SLW:
case PPC::SLW8:
case PPC::SLWo:
@@ -2614,29 +2697,26 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
case PPC::SRW8o:
case PPC::SRAW:
case PPC::SRAWo:
- case PPC::SLD:
- case PPC::SLDo:
- case PPC::SRD:
- case PPC::SRDo:
- case PPC::SRAD:
- case PPC::SRADo:
III.SignedImm = false;
III.ZeroIsSpecialOrig = 0;
III.ZeroIsSpecialNew = 0;
III.IsCommutative = false;
// This isn't actually true, but the instructions ignore any of the
// upper bits, so any immediate loaded with an LI is acceptable.
+ // This does not apply to shift right algebraic because a value
+ // out of range will produce a -1/0.
III.ImmWidth = 16;
+ if (Opc == PPC::RLWNM || Opc == PPC::RLWNM8 ||
+ Opc == PPC::RLWNMo || Opc == PPC::RLWNM8o)
+ III.TruncateImmTo = 5;
+ else
+ III.TruncateImmTo = 6;
switch(Opc) {
default: llvm_unreachable("Unknown opcode");
case PPC::RLWNM: III.ImmOpcode = PPC::RLWINM; break;
case PPC::RLWNM8: III.ImmOpcode = PPC::RLWINM8; break;
case PPC::RLWNMo: III.ImmOpcode = PPC::RLWINMo; break;
case PPC::RLWNM8o: III.ImmOpcode = PPC::RLWINM8o; break;
- case PPC::RLDCL: III.ImmOpcode = PPC::RLDICL; break;
- case PPC::RLDCLo: III.ImmOpcode = PPC::RLDICLo; break;
- case PPC::RLDCR: III.ImmOpcode = PPC::RLDICR; break;
- case PPC::RLDCRo: III.ImmOpcode = PPC::RLDICRo; break;
case PPC::SLW: III.ImmOpcode = PPC::RLWINM; break;
case PPC::SLW8: III.ImmOpcode = PPC::RLWINM8; break;
case PPC::SLWo: III.ImmOpcode = PPC::RLWINMo; break;
@@ -2645,14 +2725,62 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
case PPC::SRW8: III.ImmOpcode = PPC::RLWINM8; break;
case PPC::SRWo: III.ImmOpcode = PPC::RLWINMo; break;
case PPC::SRW8o: III.ImmOpcode = PPC::RLWINM8o; break;
- case PPC::SRAW: III.ImmOpcode = PPC::SRAWI; break;
- case PPC::SRAWo: III.ImmOpcode = PPC::SRAWIo; break;
+ case PPC::SRAW:
+ III.ImmWidth = 5;
+ III.TruncateImmTo = 0;
+ III.ImmOpcode = PPC::SRAWI;
+ break;
+ case PPC::SRAWo:
+ III.ImmWidth = 5;
+ III.TruncateImmTo = 0;
+ III.ImmOpcode = PPC::SRAWIo;
+ break;
+ }
+ break;
+ case PPC::RLDCL:
+ case PPC::RLDCLo:
+ case PPC::RLDCR:
+ case PPC::RLDCRo:
+ case PPC::SLD:
+ case PPC::SLDo:
+ case PPC::SRD:
+ case PPC::SRDo:
+ case PPC::SRAD:
+ case PPC::SRADo:
+ III.SignedImm = false;
+ III.ZeroIsSpecialOrig = 0;
+ III.ZeroIsSpecialNew = 0;
+ III.IsCommutative = false;
+ // This isn't actually true, but the instructions ignore any of the
+ // upper bits, so any immediate loaded with an LI is acceptable.
+ // This does not apply to shift right algebraic because a value
+ // out of range will produce a -1/0.
+ III.ImmWidth = 16;
+ if (Opc == PPC::RLDCL || Opc == PPC::RLDCLo ||
+ Opc == PPC::RLDCR || Opc == PPC::RLDCRo)
+ III.TruncateImmTo = 6;
+ else
+ III.TruncateImmTo = 7;
+ switch(Opc) {
+ default: llvm_unreachable("Unknown opcode");
+ case PPC::RLDCL: III.ImmOpcode = PPC::RLDICL; break;
+ case PPC::RLDCLo: III.ImmOpcode = PPC::RLDICLo; break;
+ case PPC::RLDCR: III.ImmOpcode = PPC::RLDICR; break;
+ case PPC::RLDCRo: III.ImmOpcode = PPC::RLDICRo; break;
case PPC::SLD: III.ImmOpcode = PPC::RLDICR; break;
case PPC::SLDo: III.ImmOpcode = PPC::RLDICRo; break;
case PPC::SRD: III.ImmOpcode = PPC::RLDICL; break;
case PPC::SRDo: III.ImmOpcode = PPC::RLDICLo; break;
- case PPC::SRAD: III.ImmOpcode = PPC::SRADI; break;
- case PPC::SRADo: III.ImmOpcode = PPC::SRADIo; break;
+ case PPC::SRAD:
+ III.ImmWidth = 6;
+ III.TruncateImmTo = 0;
+ III.ImmOpcode = PPC::SRADI;
+ break;
+ case PPC::SRADo:
+ III.ImmWidth = 6;
+ III.TruncateImmTo = 0;
+ III.ImmOpcode = PPC::SRADIo;
+ break;
}
break;
// Loads and stores:
@@ -2866,6 +2994,8 @@ bool PPCInstrInfo::transformToImmForm(MachineInstr &MI, const ImmInstrInfo &III,
return false;
if (Imm % III.ImmMustBeMultipleOf)
return false;
+ if (III.TruncateImmTo)
+ Imm &= ((1 << III.TruncateImmTo) - 1);
if (III.SignedImm) {
APInt ActualValue(64, Imm, true);
if (!ActualValue.isSignedIntN(III.ImmWidth))
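With the TruncateImmTo field, transformToImmForm masks a known shift/rotate amount down to the bits the instruction actually reads before the width check: 5 bits for rlwnm, 6 for the 32-bit shifts and rldcl/rldcr, 7 for the 64-bit shifts, while sraw/srad keep TruncateImmTo = 0 because an out-of-range algebraic shift produces -1/0 rather than a masked shift. A sketch of that step, matching the Imm &= ... line above:

#include <cstdint>

static int64_t truncateRotateAmount(int64_t Imm, unsigned TruncateImmTo) {
  if (TruncateImmTo)
    Imm &= (int64_t(1) << TruncateImmTo) - 1; // keep only the low N bits
  return Imm;
}
// e.g. truncateRotateAmount(37, 5) == 5: an rlwnm by 37 rotates by 5.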
@@ -3108,7 +3238,7 @@ bool PPCInstrInfo::isTOCSaveMI(const MachineInstr &MI) const {
}
// We limit the max depth to track incoming values of PHIs or binary ops
-// (e.g. AND) to avoid exsessive cost.
+// (e.g. AND) to avoid excessive cost.
const unsigned MAX_DEPTH = 1;
bool
diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h
index 4271c50127a1..ba82f56a2464 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/lib/Target/PowerPC/PPCInstrInfo.h
@@ -68,7 +68,9 @@ enum {
/// The VSX instruction that uses VSX register (vs0-vs63), instead of VMX
/// register (v0-v31).
- UseVSXReg = 0x1 << NewDef_Shift
+ UseVSXReg = 0x1 << NewDef_Shift,
+ /// This instruction is an X-Form memory operation.
+ XFormMemOp = 0x1 << (NewDef_Shift+1)
};
} // end namespace PPCII
@@ -97,6 +99,8 @@ struct ImmInstrInfo {
uint64_t ImmOpcode : 16;
// The size of the immediate.
uint64_t ImmWidth : 5;
+ // The immediate should be truncated to N bits.
+ uint64_t TruncateImmTo : 5;
};
// Information required to convert an instruction to just a materialized
@@ -112,20 +116,19 @@ class PPCInstrInfo : public PPCGenInstrInfo {
PPCSubtarget &Subtarget;
const PPCRegisterInfo RI;
- bool StoreRegToStackSlot(MachineFunction &MF,
- unsigned SrcReg, bool isKill, int FrameIdx,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs,
- bool &NonRI, bool &SpillsVRS) const;
- bool LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL,
+ void StoreRegToStackSlot(MachineFunction &MF, unsigned SrcReg, bool isKill,
+ int FrameIdx, const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr *> &NewMIs) const;
+ void LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL,
unsigned DestReg, int FrameIdx,
const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr *> &NewMIs,
- bool &NonRI, bool &SpillsVRS) const;
+ SmallVectorImpl<MachineInstr *> &NewMIs) const;
bool transformToImmForm(MachineInstr &MI, const ImmInstrInfo &III,
unsigned ConstantOpNo, int64_t Imm) const;
MachineInstr *getConstantDefMI(MachineInstr &MI, unsigned &ConstOp,
bool &SeenIntermediateUse) const;
+ const unsigned *getStoreOpcodesForSpillArray() const;
+ const unsigned *getLoadOpcodesForSpillArray() const;
virtual void anchor();
protected:
@@ -152,6 +155,10 @@ public:
///
const PPCRegisterInfo &getRegisterInfo() const { return RI; }
+ bool isXFormMemOp(unsigned Opcode) const {
+ return get(Opcode).TSFlags & PPCII::XFormMemOp;
+ }
+
ScheduleHazardRecognizer *
CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
const ScheduleDAG *DAG) const override;
@@ -249,6 +256,12 @@ public:
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
+ unsigned getStoreOpcodeForSpill(unsigned Reg,
+ const TargetRegisterClass *RC = nullptr) const;
+
+ unsigned getLoadOpcodeForSpill(unsigned Reg,
+ const TargetRegisterClass *RC = nullptr) const;
+
bool
reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
@@ -357,13 +370,6 @@ public:
MachineInstr **KilledDef = nullptr) const;
void replaceInstrWithLI(MachineInstr &MI, const LoadImmediateInfo &LII) const;
- // This is used to find the "true" source register for n
- // Machine instruction. Returns the original SrcReg unless it is the target
- // of a copy-like operation, in which case we chain backwards through all
- // such operations to the ultimate source register. If a
- // physical register is encountered, we stop the search.
- static unsigned lookThruCopyLike(unsigned SrcReg,
- const MachineRegisterInfo *MRI);
bool instrHasImmForm(const MachineInstr &MI, ImmInstrInfo &III) const;
};
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index a932d05b24ee..1a43037e4a4b 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -29,6 +29,12 @@ def SDT_PPCLxsizx : SDTypeProfile<1, 2, [
def SDT_PPCstxsix : SDTypeProfile<0, 3, [
SDTCisVT<0, f64>, SDTCisPtrTy<1>, SDTCisPtrTy<2>
]>;
+def SDT_PPCcv_fp_to_int : SDTypeProfile<1, 1, [
+ SDTCisFP<0>, SDTCisFP<1>
+ ]>;
+def SDT_PPCstore_scal_int_from_vsr : SDTypeProfile<0, 3, [
+ SDTCisVT<0, f64>, SDTCisPtrTy<1>, SDTCisPtrTy<2>
+]>;
def SDT_PPCVexts : SDTypeProfile<1, 2, [
SDTCisVT<0, f64>, SDTCisVT<1, f64>, SDTCisPtrTy<2>
]>;
@@ -123,6 +129,14 @@ def PPCfctidz : SDNode<"PPCISD::FCTIDZ", SDTFPUnaryOp, []>;
def PPCfctiwz : SDNode<"PPCISD::FCTIWZ", SDTFPUnaryOp, []>;
def PPCfctiduz: SDNode<"PPCISD::FCTIDUZ",SDTFPUnaryOp, []>;
def PPCfctiwuz: SDNode<"PPCISD::FCTIWUZ",SDTFPUnaryOp, []>;
+
+def PPCcv_fp_to_uint_in_vsr:
+ SDNode<"PPCISD::FP_TO_UINT_IN_VSR", SDT_PPCcv_fp_to_int, []>;
+def PPCcv_fp_to_sint_in_vsr:
+ SDNode<"PPCISD::FP_TO_SINT_IN_VSR", SDT_PPCcv_fp_to_int, []>;
+def PPCstore_scal_int_from_vsr:
+ SDNode<"PPCISD::ST_VSR_SCAL_INT", SDT_PPCstore_scal_int_from_vsr,
+ [SDNPHasChain, SDNPMayStore]>;
def PPCstfiwx : SDNode<"PPCISD::STFIWX", SDT_PPCstfiwx,
[SDNPHasChain, SDNPMayStore]>;
def PPClfiwax : SDNode<"PPCISD::LFIWAX", SDT_PPClfiwx,
@@ -204,6 +218,13 @@ def PPCsrl : SDNode<"PPCISD::SRL" , SDTIntShiftOp>;
def PPCsra : SDNode<"PPCISD::SRA" , SDTIntShiftOp>;
def PPCshl : SDNode<"PPCISD::SHL" , SDTIntShiftOp>;
+// Move 2 i64 values into a VSX register
+def PPCbuild_fp128: SDNode<"PPCISD::BUILD_FP128",
+ SDTypeProfile<1, 2,
+ [SDTCisFP<0>, SDTCisSameSizeAs<1,2>,
+ SDTCisSameAs<1,2>]>,
+ []>;
+
// These are target-independent nodes, but have target-specific formats.
def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_PPCCallSeqStart,
[SDNPHasChain, SDNPOutGlue]>;
@@ -257,6 +278,13 @@ def PPCvcmp_o : SDNode<"PPCISD::VCMPo", SDT_PPCvcmp, [SDNPOutGlue]>;
def PPCcondbranch : SDNode<"PPCISD::COND_BRANCH", SDT_PPCcondbr,
[SDNPHasChain, SDNPOptInGlue]>;
+// PPC-specific atomic operations.
+def PPCatomicCmpSwap_8 :
+ SDNode<"PPCISD::ATOMIC_CMP_SWAP_8", SDTAtomic3,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
+def PPCatomicCmpSwap_16 :
+ SDNode<"PPCISD::ATOMIC_CMP_SWAP_16", SDTAtomic3,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
def PPClbrx : SDNode<"PPCISD::LBRX", SDT_PPClbrx,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def PPCstbrx : SDNode<"PPCISD::STBRX", SDT_PPCstbrx,
@@ -301,7 +329,7 @@ def HI16 : SDNodeXForm<imm, [{
def HA16 : SDNodeXForm<imm, [{
// Transformation function: shift the immediate value down into the low bits.
- int Val = N->getZExtValue();
+ long Val = N->getZExtValue();
return getI32Imm((Val - (signed short)Val) >> 16, SDLoc(N));
}]>;
def MB : SDNodeXForm<imm, [{
@@ -516,6 +544,19 @@ def crrc0 : RegisterOperand<CRRC0> {
let ParserMatchClass = PPCRegCRRCAsmOperand;
}
+def PPCRegSPERCAsmOperand : AsmOperandClass {
+ let Name = "RegSPERC"; let PredicateMethod = "isRegNumber";
+}
+def sperc : RegisterOperand<SPERC> {
+ let ParserMatchClass = PPCRegSPERCAsmOperand;
+}
+def PPCRegSPE4RCAsmOperand : AsmOperandClass {
+ let Name = "RegSPE4RC"; let PredicateMethod = "isRegNumber";
+}
+def spe4rc : RegisterOperand<SPE4RC> {
+ let ParserMatchClass = PPCRegSPE4RCAsmOperand;
+}
+
def PPCU1ImmAsmOperand : AsmOperandClass {
let Name = "U1Imm"; let PredicateMethod = "isU1Imm";
let RenderMethod = "addImmOperands";
@@ -791,16 +832,19 @@ def spe8dis : Operand<iPTR> { // SPE displacement where the imm is 8-aligned.
let PrintMethod = "printMemRegImm";
let MIOperandInfo = (ops dispSPE8:$imm, ptr_rc_nor0:$reg);
let EncoderMethod = "getSPE8DisEncoding";
+ let DecoderMethod = "decodeSPE8Operands";
}
def spe4dis : Operand<iPTR> { // SPE displacement where the imm is 4-aligned.
let PrintMethod = "printMemRegImm";
let MIOperandInfo = (ops dispSPE4:$imm, ptr_rc_nor0:$reg);
let EncoderMethod = "getSPE4DisEncoding";
+ let DecoderMethod = "decodeSPE4Operands";
}
def spe2dis : Operand<iPTR> { // SPE displacement where the imm is 2-aligned.
let PrintMethod = "printMemRegImm";
let MIOperandInfo = (ops dispSPE2:$imm, ptr_rc_nor0:$reg);
let EncoderMethod = "getSPE2DisEncoding";
+ let DecoderMethod = "decodeSPE2Operands";
}
// A single-register address. This is used with the SjLj
@@ -855,7 +899,7 @@ def HasSYNC : Predicate<"!PPCSubTarget->hasOnlyMSYNC()">;
def IsPPC4xx : Predicate<"PPCSubTarget->isPPC4xx()">;
def IsPPC6xx : Predicate<"PPCSubTarget->isPPC6xx()">;
def IsE500 : Predicate<"PPCSubTarget->isE500()">;
-def HasSPE : Predicate<"PPCSubTarget->HasSPE()">;
+def HasSPE : Predicate<"PPCSubTarget->hasSPE()">;
def HasICBT : Predicate<"PPCSubTarget->hasICBT()">;
def HasPartwordAtomics : Predicate<"PPCSubTarget->hasPartwordAtomics()">;
def NoNaNsFPMath : Predicate<"TM.Options.NoNaNsFPMath">;
@@ -863,6 +907,7 @@ def NaNsFPMath : Predicate<"!TM.Options.NoNaNsFPMath">;
def HasBPERMD : Predicate<"PPCSubTarget->hasBPERMD()">;
def HasExtDiv : Predicate<"PPCSubTarget->hasExtDiv()">;
def IsISA3_0 : Predicate<"PPCSubTarget->isISA3_0()">;
+def HasFPU : Predicate<"PPCSubTarget->hasFPU()">;
//===----------------------------------------------------------------------===//
// PowerPC Multiclass Definitions.
@@ -1188,6 +1233,9 @@ let usesCustomInserter = 1, // Expanded after instruction selection.
def SELECT_CC_F8 : Pseudo<(outs f8rc:$dst), (ins crrc:$cond, f8rc:$T, f8rc:$F,
i32imm:$BROPC), "#SELECT_CC_F8",
[]>;
+ def SELECT_CC_F16 : Pseudo<(outs vrrc:$dst), (ins crrc:$cond, vrrc:$T, vrrc:$F,
+ i32imm:$BROPC), "#SELECT_CC_F16",
+ []>;
def SELECT_CC_VRRC: Pseudo<(outs vrrc:$dst), (ins crrc:$cond, vrrc:$T, vrrc:$F,
i32imm:$BROPC), "#SELECT_CC_VRRC",
[]>;
@@ -1200,12 +1248,17 @@ let usesCustomInserter = 1, // Expanded after instruction selection.
def SELECT_I8 : Pseudo<(outs g8rc:$dst), (ins crbitrc:$cond,
g8rc_nox0:$T, g8rc_nox0:$F), "#SELECT_I8",
[(set i64:$dst, (select i1:$cond, i64:$T, i64:$F))]>;
+let Predicates = [HasFPU] in {
def SELECT_F4 : Pseudo<(outs f4rc:$dst), (ins crbitrc:$cond,
f4rc:$T, f4rc:$F), "#SELECT_F4",
[(set f32:$dst, (select i1:$cond, f32:$T, f32:$F))]>;
def SELECT_F8 : Pseudo<(outs f8rc:$dst), (ins crbitrc:$cond,
f8rc:$T, f8rc:$F), "#SELECT_F8",
[(set f64:$dst, (select i1:$cond, f64:$T, f64:$F))]>;
+ def SELECT_F16 : Pseudo<(outs vrrc:$dst), (ins crbitrc:$cond,
+ vrrc:$T, vrrc:$F), "#SELECT_F16",
+ [(set f128:$dst, (select i1:$cond, f128:$T, f128:$F))]>;
+}
def SELECT_VRRC: Pseudo<(outs vrrc:$dst), (ins crbitrc:$cond,
vrrc:$T, vrrc:$F), "#SELECT_VRRC",
[(set v4i32:$dst,
@@ -1710,30 +1763,35 @@ let usesCustomInserter = 1 in {
}
}
+def : Pat<(PPCatomicCmpSwap_8 xoaddr:$ptr, i32:$old, i32:$new),
+ (ATOMIC_CMP_SWAP_I8 xoaddr:$ptr, i32:$old, i32:$new)>;
+def : Pat<(PPCatomicCmpSwap_16 xoaddr:$ptr, i32:$old, i32:$new),
+ (ATOMIC_CMP_SWAP_I16 xoaddr:$ptr, i32:$old, i32:$new)>;
+
// Instructions to support atomic operations
let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in {
-def LBARX : XForm_1<31, 52, (outs gprc:$rD), (ins memrr:$src),
+def LBARX : XForm_1_memOp<31, 52, (outs gprc:$rD), (ins memrr:$src),
"lbarx $rD, $src", IIC_LdStLWARX, []>,
Requires<[HasPartwordAtomics]>;
-def LHARX : XForm_1<31, 116, (outs gprc:$rD), (ins memrr:$src),
+def LHARX : XForm_1_memOp<31, 116, (outs gprc:$rD), (ins memrr:$src),
"lharx $rD, $src", IIC_LdStLWARX, []>,
Requires<[HasPartwordAtomics]>;
-def LWARX : XForm_1<31, 20, (outs gprc:$rD), (ins memrr:$src),
+def LWARX : XForm_1_memOp<31, 20, (outs gprc:$rD), (ins memrr:$src),
"lwarx $rD, $src", IIC_LdStLWARX, []>;
// Instructions to support lock versions of atomics
// (EH=1 - see Power ISA 2.07 Book II 4.4.2)
-def LBARXL : XForm_1<31, 52, (outs gprc:$rD), (ins memrr:$src),
+def LBARXL : XForm_1_memOp<31, 52, (outs gprc:$rD), (ins memrr:$src),
"lbarx $rD, $src, 1", IIC_LdStLWARX, []>, isDOT,
Requires<[HasPartwordAtomics]>;
-def LHARXL : XForm_1<31, 116, (outs gprc:$rD), (ins memrr:$src),
+def LHARXL : XForm_1_memOp<31, 116, (outs gprc:$rD), (ins memrr:$src),
"lharx $rD, $src, 1", IIC_LdStLWARX, []>, isDOT,
Requires<[HasPartwordAtomics]>;
-def LWARXL : XForm_1<31, 20, (outs gprc:$rD), (ins memrr:$src),
+def LWARXL : XForm_1_memOp<31, 20, (outs gprc:$rD), (ins memrr:$src),
"lwarx $rD, $src, 1", IIC_LdStLWARX, []>, isDOT;
// The atomic instructions use the destination register as well as the next one
@@ -1745,15 +1803,15 @@ def LWAT : X_RD5_RS5_IM5<31, 582, (outs gprc:$rD), (ins gprc:$rA, u5imm:$FC),
}
let Defs = [CR0], mayStore = 1, mayLoad = 0, hasSideEffects = 0 in {
-def STBCX : XForm_1<31, 694, (outs), (ins gprc:$rS, memrr:$dst),
+def STBCX : XForm_1_memOp<31, 694, (outs), (ins gprc:$rS, memrr:$dst),
"stbcx. $rS, $dst", IIC_LdStSTWCX, []>,
isDOT, Requires<[HasPartwordAtomics]>;
-def STHCX : XForm_1<31, 726, (outs), (ins gprc:$rS, memrr:$dst),
+def STHCX : XForm_1_memOp<31, 726, (outs), (ins gprc:$rS, memrr:$dst),
"sthcx. $rS, $dst", IIC_LdStSTWCX, []>,
isDOT, Requires<[HasPartwordAtomics]>;
-def STWCX : XForm_1<31, 150, (outs), (ins gprc:$rS, memrr:$dst),
+def STWCX : XForm_1_memOp<31, 150, (outs), (ins gprc:$rS, memrr:$dst),
"stwcx. $rS, $dst", IIC_LdStSTWCX, []>, isDOT;
}
@@ -1794,12 +1852,14 @@ def LWZ : DForm_1<32, (outs gprc:$rD), (ins memri:$src),
"lwz $rD, $src", IIC_LdStLoad,
[(set i32:$rD, (load iaddr:$src))]>;
+let Predicates = [HasFPU] in {
def LFS : DForm_1<48, (outs f4rc:$rD), (ins memri:$src),
"lfs $rD, $src", IIC_LdStLFD,
[(set f32:$rD, (load iaddr:$src))]>;
def LFD : DForm_1<50, (outs f8rc:$rD), (ins memri:$src),
"lfd $rD, $src", IIC_LdStLFD,
[(set f64:$rD, (load iaddr:$src))]>;
+}
// Unindexed (r+i) Loads with Update (preinc).
@@ -1824,6 +1884,7 @@ def LWZU : DForm_1<33, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
+let Predicates = [HasFPU] in {
def LFSU : DForm_1<49, (outs f4rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
"lfsu $rD, $addr", IIC_LdStLFDU,
[]>, RegConstraint<"$addr.reg = $ea_result">,
@@ -1833,84 +1894,89 @@ def LFDU : DForm_1<51, (outs f8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr
"lfdu $rD, $addr", IIC_LdStLFDU,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
+}
// Indexed (r+r) Loads with Update (preinc).
-def LBZUX : XForm_1<31, 119, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
+def LBZUX : XForm_1_memOp<31, 119, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
(ins memrr:$addr),
"lbzux $rD, $addr", IIC_LdStLoadUpdX,
[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
NoEncode<"$ea_result">;
-def LHAUX : XForm_1<31, 375, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
+def LHAUX : XForm_1_memOp<31, 375, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
(ins memrr:$addr),
"lhaux $rD, $addr", IIC_LdStLHAUX,
[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
NoEncode<"$ea_result">;
-def LHZUX : XForm_1<31, 311, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
+def LHZUX : XForm_1_memOp<31, 311, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
(ins memrr:$addr),
"lhzux $rD, $addr", IIC_LdStLoadUpdX,
[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
NoEncode<"$ea_result">;
-def LWZUX : XForm_1<31, 55, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
+def LWZUX : XForm_1_memOp<31, 55, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
(ins memrr:$addr),
"lwzux $rD, $addr", IIC_LdStLoadUpdX,
[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
NoEncode<"$ea_result">;
-def LFSUX : XForm_1<31, 567, (outs f4rc:$rD, ptr_rc_nor0:$ea_result),
+let Predicates = [HasFPU] in {
+def LFSUX : XForm_1_memOp<31, 567, (outs f4rc:$rD, ptr_rc_nor0:$ea_result),
(ins memrr:$addr),
"lfsux $rD, $addr", IIC_LdStLFDUX,
[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
NoEncode<"$ea_result">;
-def LFDUX : XForm_1<31, 631, (outs f8rc:$rD, ptr_rc_nor0:$ea_result),
+def LFDUX : XForm_1_memOp<31, 631, (outs f8rc:$rD, ptr_rc_nor0:$ea_result),
(ins memrr:$addr),
"lfdux $rD, $addr", IIC_LdStLFDUX,
[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
NoEncode<"$ea_result">;
}
}
+}
// Indexed (r+r) Loads.
//
let PPC970_Unit = 2, mayLoad = 1, mayStore = 0 in {
-def LBZX : XForm_1<31, 87, (outs gprc:$rD), (ins memrr:$src),
+def LBZX : XForm_1_memOp<31, 87, (outs gprc:$rD), (ins memrr:$src),
"lbzx $rD, $src", IIC_LdStLoad,
[(set i32:$rD, (zextloadi8 xaddr:$src))]>;
-def LHAX : XForm_1<31, 343, (outs gprc:$rD), (ins memrr:$src),
+def LHAX : XForm_1_memOp<31, 343, (outs gprc:$rD), (ins memrr:$src),
"lhax $rD, $src", IIC_LdStLHA,
[(set i32:$rD, (sextloadi16 xaddr:$src))]>,
PPC970_DGroup_Cracked;
-def LHZX : XForm_1<31, 279, (outs gprc:$rD), (ins memrr:$src),
+def LHZX : XForm_1_memOp<31, 279, (outs gprc:$rD), (ins memrr:$src),
"lhzx $rD, $src", IIC_LdStLoad,
[(set i32:$rD, (zextloadi16 xaddr:$src))]>;
-def LWZX : XForm_1<31, 23, (outs gprc:$rD), (ins memrr:$src),
+def LWZX : XForm_1_memOp<31, 23, (outs gprc:$rD), (ins memrr:$src),
"lwzx $rD, $src", IIC_LdStLoad,
[(set i32:$rD, (load xaddr:$src))]>;
-def LHBRX : XForm_1<31, 790, (outs gprc:$rD), (ins memrr:$src),
+def LHBRX : XForm_1_memOp<31, 790, (outs gprc:$rD), (ins memrr:$src),
"lhbrx $rD, $src", IIC_LdStLoad,
[(set i32:$rD, (PPClbrx xoaddr:$src, i16))]>;
-def LWBRX : XForm_1<31, 534, (outs gprc:$rD), (ins memrr:$src),
+def LWBRX : XForm_1_memOp<31, 534, (outs gprc:$rD), (ins memrr:$src),
"lwbrx $rD, $src", IIC_LdStLoad,
[(set i32:$rD, (PPClbrx xoaddr:$src, i32))]>;
-def LFSX : XForm_25<31, 535, (outs f4rc:$frD), (ins memrr:$src),
+let Predicates = [HasFPU] in {
+def LFSX : XForm_25_memOp<31, 535, (outs f4rc:$frD), (ins memrr:$src),
"lfsx $frD, $src", IIC_LdStLFD,
[(set f32:$frD, (load xaddr:$src))]>;
-def LFDX : XForm_25<31, 599, (outs f8rc:$frD), (ins memrr:$src),
+def LFDX : XForm_25_memOp<31, 599, (outs f8rc:$frD), (ins memrr:$src),
"lfdx $frD, $src", IIC_LdStLFD,
[(set f64:$frD, (load xaddr:$src))]>;
-def LFIWAX : XForm_25<31, 855, (outs f8rc:$frD), (ins memrr:$src),
+def LFIWAX : XForm_25_memOp<31, 855, (outs f8rc:$frD), (ins memrr:$src),
"lfiwax $frD, $src", IIC_LdStLFD,
[(set f64:$frD, (PPClfiwax xoaddr:$src))]>;
-def LFIWZX : XForm_25<31, 887, (outs f8rc:$frD), (ins memrr:$src),
+def LFIWZX : XForm_25_memOp<31, 887, (outs f8rc:$frD), (ins memrr:$src),
"lfiwzx $frD, $src", IIC_LdStLFD,
[(set f64:$frD, (PPClfiwzx xoaddr:$src))]>;
}
+}
// Load Multiple
def LMW : DForm_1<46, (outs gprc:$rD), (ins memri:$src),
@@ -1931,6 +1997,7 @@ def STH : DForm_1<44, (outs), (ins gprc:$rS, memri:$src),
def STW : DForm_1<36, (outs), (ins gprc:$rS, memri:$src),
"stw $rS, $src", IIC_LdStStore,
[(store i32:$rS, iaddr:$src)]>;
+let Predicates = [HasFPU] in {
def STFS : DForm_1<52, (outs), (ins f4rc:$rS, memri:$dst),
"stfs $rS, $dst", IIC_LdStSTFD,
[(store f32:$rS, iaddr:$dst)]>;
@@ -1938,6 +2005,7 @@ def STFD : DForm_1<54, (outs), (ins f8rc:$rS, memri:$dst),
"stfd $rS, $dst", IIC_LdStSTFD,
[(store f64:$rS, iaddr:$dst)]>;
}
+}
// Unindexed (r+i) Stores with Update (preinc).
let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in {
@@ -1950,6 +2018,7 @@ def STHU : DForm_1<45, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst),
def STWU : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst),
"stwu $rS, $dst", IIC_LdStStoreUpd, []>,
RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+let Predicates = [HasFPU] in {
def STFSU : DForm_1<53, (outs ptr_rc_nor0:$ea_res), (ins f4rc:$rS, memri:$dst),
"stfsu $rS, $dst", IIC_LdStSTFDU, []>,
RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
@@ -1957,6 +2026,7 @@ def STFDU : DForm_1<55, (outs ptr_rc_nor0:$ea_res), (ins f8rc:$rS, memri:$dst),
"stfdu $rS, $dst", IIC_LdStSTFDU, []>,
RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
}
+}
// Patterns to match the pre-inc stores. We can't put the patterns on
// the instruction definitions directly as ISel wants the address base
@@ -1974,62 +2044,76 @@ def : Pat<(pre_store f64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
// Indexed (r+r) Stores.
let PPC970_Unit = 2 in {
-def STBX : XForm_8<31, 215, (outs), (ins gprc:$rS, memrr:$dst),
+def STBX : XForm_8_memOp<31, 215, (outs), (ins gprc:$rS, memrr:$dst),
"stbx $rS, $dst", IIC_LdStStore,
[(truncstorei8 i32:$rS, xaddr:$dst)]>,
PPC970_DGroup_Cracked;
-def STHX : XForm_8<31, 407, (outs), (ins gprc:$rS, memrr:$dst),
+def STHX : XForm_8_memOp<31, 407, (outs), (ins gprc:$rS, memrr:$dst),
"sthx $rS, $dst", IIC_LdStStore,
[(truncstorei16 i32:$rS, xaddr:$dst)]>,
PPC970_DGroup_Cracked;
-def STWX : XForm_8<31, 151, (outs), (ins gprc:$rS, memrr:$dst),
+def STWX : XForm_8_memOp<31, 151, (outs), (ins gprc:$rS, memrr:$dst),
"stwx $rS, $dst", IIC_LdStStore,
[(store i32:$rS, xaddr:$dst)]>,
PPC970_DGroup_Cracked;
-
-def STHBRX: XForm_8<31, 918, (outs), (ins gprc:$rS, memrr:$dst),
+
+def STHBRX: XForm_8_memOp<31, 918, (outs), (ins gprc:$rS, memrr:$dst),
"sthbrx $rS, $dst", IIC_LdStStore,
[(PPCstbrx i32:$rS, xoaddr:$dst, i16)]>,
PPC970_DGroup_Cracked;
-def STWBRX: XForm_8<31, 662, (outs), (ins gprc:$rS, memrr:$dst),
+def STWBRX: XForm_8_memOp<31, 662, (outs), (ins gprc:$rS, memrr:$dst),
"stwbrx $rS, $dst", IIC_LdStStore,
[(PPCstbrx i32:$rS, xoaddr:$dst, i32)]>,
PPC970_DGroup_Cracked;
-def STFIWX: XForm_28<31, 983, (outs), (ins f8rc:$frS, memrr:$dst),
+let Predicates = [HasFPU] in {
+def STFIWX: XForm_28_memOp<31, 983, (outs), (ins f8rc:$frS, memrr:$dst),
"stfiwx $frS, $dst", IIC_LdStSTFD,
[(PPCstfiwx f64:$frS, xoaddr:$dst)]>;
-
-def STFSX : XForm_28<31, 663, (outs), (ins f4rc:$frS, memrr:$dst),
+
+def STFSX : XForm_28_memOp<31, 663, (outs), (ins f4rc:$frS, memrr:$dst),
"stfsx $frS, $dst", IIC_LdStSTFD,
[(store f32:$frS, xaddr:$dst)]>;
-def STFDX : XForm_28<31, 727, (outs), (ins f8rc:$frS, memrr:$dst),
+def STFDX : XForm_28_memOp<31, 727, (outs), (ins f8rc:$frS, memrr:$dst),
"stfdx $frS, $dst", IIC_LdStSTFD,
[(store f64:$frS, xaddr:$dst)]>;
}
+}
// Indexed (r+r) Stores with Update (preinc).
let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in {
-def STBUX : XForm_8<31, 247, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst),
- "stbux $rS, $dst", IIC_LdStStoreUpd, []>,
- RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
- PPC970_DGroup_Cracked;
-def STHUX : XForm_8<31, 439, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst),
- "sthux $rS, $dst", IIC_LdStStoreUpd, []>,
- RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
- PPC970_DGroup_Cracked;
-def STWUX : XForm_8<31, 183, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst),
- "stwux $rS, $dst", IIC_LdStStoreUpd, []>,
- RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
- PPC970_DGroup_Cracked;
-def STFSUX: XForm_8<31, 695, (outs ptr_rc_nor0:$ea_res), (ins f4rc:$rS, memrr:$dst),
- "stfsux $rS, $dst", IIC_LdStSTFDU, []>,
- RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
- PPC970_DGroup_Cracked;
-def STFDUX: XForm_8<31, 759, (outs ptr_rc_nor0:$ea_res), (ins f8rc:$rS, memrr:$dst),
- "stfdux $rS, $dst", IIC_LdStSTFDU, []>,
- RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
- PPC970_DGroup_Cracked;
+def STBUX : XForm_8_memOp<31, 247, (outs ptr_rc_nor0:$ea_res),
+ (ins gprc:$rS, memrr:$dst),
+ "stbux $rS, $dst", IIC_LdStStoreUpd, []>,
+ RegConstraint<"$dst.ptrreg = $ea_res">,
+ NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
+def STHUX : XForm_8_memOp<31, 439, (outs ptr_rc_nor0:$ea_res),
+ (ins gprc:$rS, memrr:$dst),
+ "sthux $rS, $dst", IIC_LdStStoreUpd, []>,
+ RegConstraint<"$dst.ptrreg = $ea_res">,
+ NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
+def STWUX : XForm_8_memOp<31, 183, (outs ptr_rc_nor0:$ea_res),
+ (ins gprc:$rS, memrr:$dst),
+ "stwux $rS, $dst", IIC_LdStStoreUpd, []>,
+ RegConstraint<"$dst.ptrreg = $ea_res">,
+ NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
+let Predicates = [HasFPU] in {
+def STFSUX: XForm_8_memOp<31, 695, (outs ptr_rc_nor0:$ea_res),
+ (ins f4rc:$rS, memrr:$dst),
+ "stfsux $rS, $dst", IIC_LdStSTFDU, []>,
+ RegConstraint<"$dst.ptrreg = $ea_res">,
+ NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
+def STFDUX: XForm_8_memOp<31, 759, (outs ptr_rc_nor0:$ea_res),
+ (ins f8rc:$rS, memrr:$dst),
+ "stfdux $rS, $dst", IIC_LdStSTFDU, []>,
+ RegConstraint<"$dst.ptrreg = $ea_res">,
+ NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
+}
}
// Patterns to match the pre-inc stores. We can't put the patterns on
@@ -2041,10 +2125,12 @@ def : Pat<(pre_truncsti16 i32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
(STHUX $rS, $ptrreg, $ptroff)>;
def : Pat<(pre_store i32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
(STWUX $rS, $ptrreg, $ptroff)>;
+let Predicates = [HasFPU] in {
def : Pat<(pre_store f32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
(STFSUX $rS, $ptrreg, $ptroff)>;
def : Pat<(pre_store f64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
(STFDUX $rS, $ptrreg, $ptroff)>;
+}
// Store Multiple
def STMW : DForm_1<47, (outs), (ins gprc:$rS, memri:$dst),
@@ -2228,7 +2314,7 @@ let isCompare = 1, hasSideEffects = 0 in {
"cmplw $crD, $rA, $rB", IIC_IntCompare>;
}
}
-let PPC970_Unit = 3 in { // FPU Operations.
+let PPC970_Unit = 3, Predicates = [HasFPU] in { // FPU Operations.
//def FCMPO : XForm_17<63, 32, (outs CRRC:$crD), (ins FPRC:$fA, FPRC:$fB),
// "fcmpo $crD, $fA, $fB", IIC_FPCompare>;
let isCompare = 1, hasSideEffects = 0 in {
@@ -2306,13 +2392,13 @@ let Uses = [RM] in {
/// often coalesced away and we don't want the dispatch group builder to think
/// that they will fill slots (which could cause the load of a LSU reject to
/// sneak into a d-group with a store).
-let hasSideEffects = 0 in
+let hasSideEffects = 0, Predicates = [HasFPU] in
defm FMR : XForm_26r<63, 72, (outs f4rc:$frD), (ins f4rc:$frB),
"fmr", "$frD, $frB", IIC_FPGeneral,
[]>, // (set f32:$frD, f32:$frB)
PPC970_Unit_Pseudo;
-let PPC970_Unit = 3, hasSideEffects = 0 in { // FPU Operations.
+let PPC970_Unit = 3, hasSideEffects = 0, Predicates = [HasFPU] in { // FPU Operations.
// These are artificially split into two different forms, for 4/8 byte FP.
defm FABSS : XForm_26r<63, 264, (outs f4rc:$frD), (ins f4rc:$frB),
"fabs", "$frD, $frB", IIC_FPGeneral,
@@ -2561,6 +2647,7 @@ def MCRXRX : X_BF3<31, 576, (outs crrc:$BF), (ins),
"mcrxrx $BF", IIC_BrMCRX>, Requires<[IsISA3_0]>;
} // hasSideEffects = 0
+let Predicates = [HasFPU] in {
// Pseudo instruction to perform FADD in round-to-zero mode.
let usesCustomInserter = 1, Uses = [RM] in {
def FADDrtz: Pseudo<(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB), "",
@@ -2620,6 +2707,7 @@ let Uses = [RM] in {
"mffsl $rT", IIC_IntMFFS, []>,
PPC970_DGroup_Single, PPC970_Unit_FPU;
}
+}
let Predicates = [IsISA3_0] in {
def MODSW : XForm_8<31, 779, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
@@ -2717,7 +2805,7 @@ defm SUBFZE : XOForm_3rc<31, 200, 0, (outs gprc:$rT), (ins gprc:$rA),
// A-Form instructions. Most of the instructions executed in the FPU are of
// this type.
//
-let PPC970_Unit = 3, hasSideEffects = 0 in { // FPU Operations.
+let PPC970_Unit = 3, hasSideEffects = 0, Predicates = [HasFPU] in { // FPU Operations.
let Uses = [RM] in {
let isCommutable = 1 in {
defm FMADD : AForm_1r<63, 29,
@@ -3043,6 +3131,7 @@ def : Pat<(extloadi16 iaddr:$src),
(LHZ iaddr:$src)>;
def : Pat<(extloadi16 xaddr:$src),
(LHZX xaddr:$src)>;
+let Predicates = [HasFPU] in {
def : Pat<(f64 (extloadf32 iaddr:$src)),
(COPY_TO_REGCLASS (LFS iaddr:$src), F8RC)>;
def : Pat<(f64 (extloadf32 xaddr:$src)),
@@ -3050,6 +3139,7 @@ def : Pat<(f64 (extloadf32 xaddr:$src)),
def : Pat<(f64 (fpextend f32:$src)),
(COPY_TO_REGCLASS $src, F8RC)>;
+}
// Only seq_cst fences require the heavyweight sync (SYNC 0).
// All others can use the lightweight sync (SYNC 1).
@@ -3061,6 +3151,7 @@ def : Pat<(atomic_fence (i32 7), (imm)), (SYNC 0)>, Requires<[HasSYNC]>;
def : Pat<(atomic_fence (imm), (imm)), (SYNC 1)>, Requires<[HasSYNC]>;
def : Pat<(atomic_fence (imm), (imm)), (MSYNC)>, Requires<[HasOnlyMSYNC]>;
+let Predicates = [HasFPU] in {
// Additional FNMSUB patterns: -a*c + b == -(a*c - b)
def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B),
(FNMSUB $A, $C, $B)>;
@@ -3076,6 +3167,7 @@ def : Pat<(fcopysign f64:$frB, f32:$frA),
(FCPSGND (COPY_TO_REGCLASS $frA, F8RC), $frB)>;
def : Pat<(fcopysign f32:$frB, f64:$frA),
(FCPSGNS (COPY_TO_REGCLASS $frA, F4RC), $frB)>;
+}
include "PPCInstrAltivec.td"
include "PPCInstrSPE.td"
@@ -3518,6 +3610,7 @@ defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETNE)),
(EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>;
// SETCC for f32.
+let Predicates = [HasFPU] in {
def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOLT)),
(EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETLT)),
@@ -3579,6 +3672,96 @@ defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETNE)),
defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETO)),
(EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>;
+// SETCC for f128.
+def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETOLT)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETLT)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETOGT)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETGT)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETOEQ)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>;
+def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETEQ)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>;
+def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETUO)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_un)>;
+
+defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETUGE)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETGE)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETULE)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETLE)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETUNE)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>;
+defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETNE)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>;
+defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETO)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_un)>;
+
+}
+
+// This must be in this file because it relies on patterns defined in this file
+// after the inclusion of the instruction sets.
+let Predicates = [HasSPE] in {
+// SETCC for f32.
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOLT)),
+ (EXTRACT_SUBREG (EFSCMPLT $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETLT)),
+ (EXTRACT_SUBREG (EFSCMPLT $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOGT)),
+ (EXTRACT_SUBREG (EFSCMPGT $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETGT)),
+ (EXTRACT_SUBREG (EFSCMPGT $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOEQ)),
+ (EXTRACT_SUBREG (EFSCMPEQ $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETEQ)),
+ (EXTRACT_SUBREG (EFSCMPEQ $s1, $s2), sub_gt)>;
+
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUGE)),
+ (EXTRACT_SUBREG (EFSCMPLT $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETGE)),
+ (EXTRACT_SUBREG (EFSCMPLT $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETULE)),
+ (EXTRACT_SUBREG (EFSCMPGT $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETLE)),
+ (EXTRACT_SUBREG (EFSCMPGT $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUNE)),
+ (EXTRACT_SUBREG (EFSCMPEQ $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETNE)),
+ (EXTRACT_SUBREG (EFSCMPEQ $s1, $s2), sub_gt)>;
+
+// SETCC for f64.
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOLT)),
+ (EXTRACT_SUBREG (EFDCMPLT $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETLT)),
+ (EXTRACT_SUBREG (EFDCMPLT $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOGT)),
+ (EXTRACT_SUBREG (EFDCMPGT $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETGT)),
+ (EXTRACT_SUBREG (EFDCMPGT $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOEQ)),
+ (EXTRACT_SUBREG (EFDCMPEQ $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETEQ)),
+ (EXTRACT_SUBREG (EFDCMPEQ $s1, $s2), sub_gt)>;
+
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUGE)),
+ (EXTRACT_SUBREG (EFDCMPLT $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETGE)),
+ (EXTRACT_SUBREG (EFDCMPLT $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETULE)),
+ (EXTRACT_SUBREG (EFDCMPGT $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETLE)),
+ (EXTRACT_SUBREG (EFDCMPGT $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUNE)),
+ (EXTRACT_SUBREG (EFDCMPEQ $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETNE)),
+ (EXTRACT_SUBREG (EFDCMPEQ $s1, $s2), sub_gt)>;
+}
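+
+(Aside, not part of the imported patch: every SPE pattern above reads the compare result out of sub_gt, which is consistent with efscmp*/efdcmp* reporting their single true/false result in the gt position of the CR field; the unordered and negated conditions are therefore selected through CRNotPat as the complement of the opposite compare. The IEEE identity this relies on, e.g. SETULE(a,b) == !(a > b), can be checked with arbitrary values:)
+
+    #include <assert.h>
+    #include <cmath>
+
+    // "Unordered or <=" equals !(a > b): a > b is false whenever either
+    // operand is NaN, i.e. whenever the pair is unordered.
+    static bool setule(double a, double b) {
+      bool unordered = (a != a) || (b != b);
+      return unordered || a <= b;
+    }
+
+    int main() {
+      const double vals[] = {1.0, -2.5, 0.0, std::nan("")};
+      for (double a : vals)
+        for (double b : vals)
+          assert(setule(a, b) == !(a > b));
+      return 0;
+    }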
// match select on i1 variables:
def : Pat<(i1 (select i1:$cond, i1:$tval, i1:$fval)),
(CROR (CRAND $cond , $tval),
@@ -3661,6 +3844,7 @@ def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETUGT)),
def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETNE)),
(SELECT_I8 (CRXOR $lhs, $rhs), $tval, $fval)>;
+let Predicates = [HasFPU] in {
def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLT)),
(SELECT_F4 (CRANDC $lhs, $rhs), $tval, $fval)>;
def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETULT)),
@@ -3702,6 +3886,28 @@ def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETUGT)),
(SELECT_F8 (CRANDC $lhs, $rhs), $tval, $fval)>;
def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETNE)),
(SELECT_F8 (CRXOR $lhs, $rhs), $tval, $fval)>;
+}
+
+def : Pat<(f128 (selectcc i1:$lhs, i1:$rhs, f128:$tval, f128:$fval, SETLT)),
+ (SELECT_F16 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f128 (selectcc i1:$lhs, i1:$rhs, f128:$tval, f128:$fval, SETULT)),
+ (SELECT_F16 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f128 (selectcc i1:$lhs, i1:$rhs, f128:$tval, f128:$fval, SETLE)),
+ (SELECT_F16 (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f128 (selectcc i1:$lhs, i1:$rhs, f128:$tval, f128:$fval, SETULE)),
+ (SELECT_F16 (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f128 (selectcc i1:$lhs, i1:$rhs, f128:$tval, f128:$fval, SETEQ)),
+ (SELECT_F16 (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f128 (selectcc i1:$lhs, i1:$rhs, f128:$tval, f128:$fval, SETGE)),
+ (SELECT_F16 (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f128 (selectcc i1:$lhs, i1:$rhs, f128:$tval, f128:$fval, SETUGE)),
+ (SELECT_F16 (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f128 (selectcc i1:$lhs, i1:$rhs, f128:$tval, f128:$fval, SETGT)),
+ (SELECT_F16 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f128 (selectcc i1:$lhs, i1:$rhs, f128:$tval, f128:$fval, SETUGT)),
+ (SELECT_F16 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f128 (selectcc i1:$lhs, i1:$rhs, f128:$tval, f128:$fval, SETNE)),
+ (SELECT_F16 (CRXOR $lhs, $rhs), $tval, $fval)>;
def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETLT)),
(SELECT_VRRC (CRANDC $lhs, $rhs), $tval, $fval)>;
@@ -3751,13 +3957,15 @@ def : Pat<(i1 (not (trunc i64:$in))),
// FIXME: For B=0 or B > 8, the registers following RT are used.
// WARNING: Do not add patterns for this instruction without fixing this.
-def LSWI : XForm_base_r3xo<31, 597, (outs gprc:$RT), (ins gprc:$A, u5imm:$B),
- "lswi $RT, $A, $B", IIC_LdStLoad, []>;
+def LSWI : XForm_base_r3xo_memOp<31, 597, (outs gprc:$RT),
+ (ins gprc:$A, u5imm:$B),
+ "lswi $RT, $A, $B", IIC_LdStLoad, []>;
// FIXME: For B=0 or B > 8, the registers following RT are used.
// WARNING: Do not add patterns for this instruction without fixing this.
-def STSWI : XForm_base_r3xo<31, 725, (outs), (ins gprc:$RT, gprc:$A, u5imm:$B),
- "stswi $RT, $A, $B", IIC_LdStLoad, []>;
+def STSWI : XForm_base_r3xo_memOp<31, 725, (outs),
+ (ins gprc:$RT, gprc:$A, u5imm:$B),
+ "stswi $RT, $A, $B", IIC_LdStLoad, []>;
def ISYNC : XLForm_2_ext<19, 150, 0, 0, 0, (outs), (ins),
"isync", IIC_SprISYNC, []>;
@@ -3769,7 +3977,7 @@ def ICBI : XForm_1a<31, 982, (outs), (ins memrr:$src),
def EnforceIEIO : XForm_24_eieio<31, 854, (outs), (ins),
"eieio", IIC_LdStLoad, []>;
-def WAIT : XForm_24_sync<31, 62, (outs), (ins i32imm:$L),
+def WAIT : XForm_24_sync<31, 30, (outs), (ins i32imm:$L),
"wait $L", IIC_LdStLoad, []>;
def MBAR : XForm_mbar<31, 854, (outs), (ins u5imm:$MO),
@@ -3831,6 +4039,7 @@ def MTFSFIo : XLForm_4<63, 134, (outs crrc:$BF), (ins i32imm:$U, i32imm:$W),
def : InstAlias<"mtfsfi $BF, $U", (MTFSFI crrc:$BF, i32imm:$U, 0)>;
def : InstAlias<"mtfsfi. $BF, $U", (MTFSFIo crrc:$BF, i32imm:$U, 0)>;
+let Predicates = [HasFPU] in {
def MTFSF : XFLForm_1<63, 711, (outs),
(ins i32imm:$FLM, f8rc:$FRB, i32imm:$L, i32imm:$W),
"mtfsf $FLM, $FRB, $L, $W", IIC_IntMFFS, []>;
@@ -3840,6 +4049,7 @@ def MTFSFo : XFLForm_1<63, 711, (outs),
def : InstAlias<"mtfsf $FLM, $FRB", (MTFSF i32imm:$FLM, f8rc:$FRB, 0, 0)>;
def : InstAlias<"mtfsf. $FLM, $FRB", (MTFSFo i32imm:$FLM, f8rc:$FRB, 0, 0)>;
+}
def SLBIE : XForm_16b<31, 434, (outs), (ins gprc:$RB),
"slbie $RB", IIC_SprSLBIE, []>;
@@ -3920,23 +4130,31 @@ def NAP : XLForm_1_np<19, 434, (outs), (ins), "nap", IIC_BrB, []>;
def ATTN : XForm_attn<0, 256, (outs), (ins), "attn", IIC_BrB>;
-def LBZCIX : XForm_base_r3xo<31, 853, (outs gprc:$RST), (ins gprc:$A, gprc:$B),
- "lbzcix $RST, $A, $B", IIC_LdStLoad, []>;
-def LHZCIX : XForm_base_r3xo<31, 821, (outs gprc:$RST), (ins gprc:$A, gprc:$B),
- "lhzcix $RST, $A, $B", IIC_LdStLoad, []>;
-def LWZCIX : XForm_base_r3xo<31, 789, (outs gprc:$RST), (ins gprc:$A, gprc:$B),
- "lwzcix $RST, $A, $B", IIC_LdStLoad, []>;
-def LDCIX : XForm_base_r3xo<31, 885, (outs gprc:$RST), (ins gprc:$A, gprc:$B),
- "ldcix $RST, $A, $B", IIC_LdStLoad, []>;
-
-def STBCIX : XForm_base_r3xo<31, 981, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
- "stbcix $RST, $A, $B", IIC_LdStLoad, []>;
-def STHCIX : XForm_base_r3xo<31, 949, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
- "sthcix $RST, $A, $B", IIC_LdStLoad, []>;
-def STWCIX : XForm_base_r3xo<31, 917, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
- "stwcix $RST, $A, $B", IIC_LdStLoad, []>;
-def STDCIX : XForm_base_r3xo<31, 1013, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
- "stdcix $RST, $A, $B", IIC_LdStLoad, []>;
+def LBZCIX : XForm_base_r3xo_memOp<31, 853, (outs gprc:$RST),
+ (ins gprc:$A, gprc:$B),
+ "lbzcix $RST, $A, $B", IIC_LdStLoad, []>;
+def LHZCIX : XForm_base_r3xo_memOp<31, 821, (outs gprc:$RST),
+ (ins gprc:$A, gprc:$B),
+ "lhzcix $RST, $A, $B", IIC_LdStLoad, []>;
+def LWZCIX : XForm_base_r3xo_memOp<31, 789, (outs gprc:$RST),
+ (ins gprc:$A, gprc:$B),
+ "lwzcix $RST, $A, $B", IIC_LdStLoad, []>;
+def LDCIX : XForm_base_r3xo_memOp<31, 885, (outs gprc:$RST),
+ (ins gprc:$A, gprc:$B),
+ "ldcix $RST, $A, $B", IIC_LdStLoad, []>;
+
+def STBCIX : XForm_base_r3xo_memOp<31, 981, (outs),
+ (ins gprc:$RST, gprc:$A, gprc:$B),
+ "stbcix $RST, $A, $B", IIC_LdStLoad, []>;
+def STHCIX : XForm_base_r3xo_memOp<31, 949, (outs),
+ (ins gprc:$RST, gprc:$A, gprc:$B),
+ "sthcix $RST, $A, $B", IIC_LdStLoad, []>;
+def STWCIX : XForm_base_r3xo_memOp<31, 917, (outs),
+ (ins gprc:$RST, gprc:$A, gprc:$B),
+ "stwcix $RST, $A, $B", IIC_LdStLoad, []>;
+def STDCIX : XForm_base_r3xo_memOp<31, 1013, (outs),
+ (ins gprc:$RST, gprc:$A, gprc:$B),
+ "stdcix $RST, $A, $B", IIC_LdStLoad, []>;
// External PID Load Store Instructions
@@ -3960,7 +4178,7 @@ def STBEPX : XForm_8<31, 223, (outs), (ins gprc:$rS, memrr:$dst),
"stbepx $rS, $dst", IIC_LdStStore, []>,
Requires<[IsE500]>;
-def STFDEPX : XForm_28<31, 735, (outs), (ins f8rc:$frS, memrr:$dst),
+def STFDEPX : XForm_28_memOp<31, 735, (outs), (ins f8rc:$frS, memrr:$dst),
"stfdepx $frS, $dst", IIC_LdStSTFD, []>,
Requires<[IsE500]>;
@@ -4683,10 +4901,10 @@ def DWMaskValues {
def DWSwapInByte {
dag Swap1 = (OR8 (AND8 (RLDICL $A, 63, 1), DWMaskValues.Lo1),
(AND8 (RLDICR $A, 1, 62), DWMaskValues.Hi1));
- dag Swap2 = (OR8 (AND8 (RLDICL DWSwapInByte.Swap1, 62, 2), DWMaskValues.Lo2),
- (AND8 (RLDICR DWSwapInByte.Swap1, 2, 61), DWMaskValues.Hi2));
- dag Swap4 = (OR8 (AND8 (RLDICL DWSwapInByte.Swap2, 60, 4), DWMaskValues.Lo4),
- (AND8 (RLDICR DWSwapInByte.Swap2, 4, 59), DWMaskValues.Hi4));
+ dag Swap2 = (OR8 (AND8 (RLDICL Swap1, 62, 2), DWMaskValues.Lo2),
+ (AND8 (RLDICR Swap1, 2, 61), DWMaskValues.Hi2));
+ dag Swap4 = (OR8 (AND8 (RLDICL Swap2, 60, 4), DWMaskValues.Lo4),
+ (AND8 (RLDICR Swap2, 4, 59), DWMaskValues.Hi4));
}
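(Aside, not part of the imported patch: DWSwapInByte is the standard mask-and-shift bit reversal carried out at 1-, 2- and 4-bit granularity; since every swap stays within an 8-bit lane, it reverses the bits of each byte while leaving byte positions alone, and the "inter-byte swap" below then reverses the byte order. The same three steps in C++, assuming the usual 0x5555.../0x3333.../0x0F0F... constants behind the DWMaskValues pairs:)

    #include <assert.h>
    #include <stdint.h>

    // Reverse the bits inside each byte of x; byte positions are unchanged.
    // The three mask pairs correspond to Lo1/Hi1, Lo2/Hi2 and Lo4/Hi4.
    static uint64_t swap_in_byte(uint64_t x) {
      x = ((x >> 1) & 0x5555555555555555ULL) | ((x & 0x5555555555555555ULL) << 1);
      x = ((x >> 2) & 0x3333333333333333ULL) | ((x & 0x3333333333333333ULL) << 2);
      x = ((x >> 4) & 0x0F0F0F0F0F0F0F0FULL) | ((x & 0x0F0F0F0F0F0F0F0FULL) << 4);
      return x;
    }

    int main() {
      // 0x01 = 0b00000001 reversed within its byte is 0b10000000 = 0x80, etc.
      assert(swap_in_byte(0x0102030405060708ULL) == 0x8040C020A060E010ULL);
      return 0;
    }
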
// Intra-byte swap is done, now start inter-byte swap.
@@ -4706,7 +4924,7 @@ def DWBytes7656 {
def DWBytes7654 {
dag Word = (RLWIMI DWBytes7656.Word, DWBytes4567.Word, 8, 24, 31);
dag DWord =
- (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), DWBytes7654.Word, sub_32));
+ (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), Word, sub_32));
}
def DWBytes0123 {
@@ -4725,7 +4943,7 @@ def DWBytes3212 {
def DWBytes3210 {
dag Word = (RLWIMI DWBytes3212.Word, DWBytes0123.Word, 8, 24, 31);
dag DWord =
- (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), DWBytes3210.Word, sub_32));
+ (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), Word, sub_32));
}
// Now both high word and low word are reversed, next
diff --git a/lib/Target/PowerPC/PPCInstrQPX.td b/lib/Target/PowerPC/PPCInstrQPX.td
index 4940c77c7ae5..c4bb02695b36 100644
--- a/lib/Target/PowerPC/PPCInstrQPX.td
+++ b/lib/Target/PowerPC/PPCInstrQPX.td
@@ -502,14 +502,14 @@ let Uses = [RM] in {
// Load indexed instructions
let mayLoad = 1 in {
- def QVLFDX : XForm_1<31, 583,
- (outs qfrc:$FRT), (ins memrr:$src),
- "qvlfdx $FRT, $src", IIC_LdStLFD,
- [(set v4f64:$FRT, (load xoaddr:$src))]>;
+ def QVLFDX : XForm_1_memOp<31, 583,
+ (outs qfrc:$FRT), (ins memrr:$src),
+ "qvlfdx $FRT, $src", IIC_LdStLFD,
+ [(set v4f64:$FRT, (load xoaddr:$src))]>;
let isCodeGenOnly = 1 in
- def QVLFDXb : XForm_1<31, 583,
- (outs qbrc:$FRT), (ins memrr:$src),
- "qvlfdx $FRT, $src", IIC_LdStLFD, []>;
+ def QVLFDXb : XForm_1_memOp<31, 583,
+ (outs qbrc:$FRT), (ins memrr:$src),
+ "qvlfdx $FRT, $src", IIC_LdStLFD, []>;
let RC = 1 in
def QVLFDXA : XForm_1<31, 583,
@@ -527,10 +527,10 @@ let Uses = [RM] in {
(outs qfrc:$FRT), (ins memrr:$src),
"qvlfduxa $FRT, $src", IIC_LdStLFD, []>;
- def QVLFSX : XForm_1<31, 519,
- (outs qfrc:$FRT), (ins memrr:$src),
- "qvlfsx $FRT, $src", IIC_LdStLFD,
- [(set v4f64:$FRT, (extloadv4f32 xoaddr:$src))]>;
+ def QVLFSX : XForm_1_memOp<31, 519,
+ (outs qfrc:$FRT), (ins memrr:$src),
+ "qvlfsx $FRT, $src", IIC_LdStLFD,
+ [(set v4f64:$FRT, (extloadv4f32 xoaddr:$src))]>;
let isCodeGenOnly = 1 in
def QVLFSXb : XForm_1<31, 519,
@@ -538,10 +538,10 @@ let Uses = [RM] in {
"qvlfsx $FRT, $src", IIC_LdStLFD,
[(set v4i1:$FRT, (PPCqvlfsb xoaddr:$src))]>;
let isCodeGenOnly = 1 in
- def QVLFSXs : XForm_1<31, 519,
- (outs qsrc:$FRT), (ins memrr:$src),
- "qvlfsx $FRT, $src", IIC_LdStLFD,
- [(set v4f32:$FRT, (load xoaddr:$src))]>;
+ def QVLFSXs : XForm_1_memOp<31, 519,
+ (outs qsrc:$FRT), (ins memrr:$src),
+ "qvlfsx $FRT, $src", IIC_LdStLFD,
+ [(set v4f32:$FRT, (load xoaddr:$src))]>;
let RC = 1 in
def QVLFSXA : XForm_1<31, 519,
@@ -634,12 +634,12 @@ let Uses = [RM] in {
// Store indexed instructions
let mayStore = 1 in {
- def QVSTFDX : XForm_8<31, 711,
+ def QVSTFDX : XForm_8_memOp<31, 711,
(outs), (ins qfrc:$FRT, memrr:$dst),
"qvstfdx $FRT, $dst", IIC_LdStSTFD,
[(store qfrc:$FRT, xoaddr:$dst)]>;
let isCodeGenOnly = 1 in
- def QVSTFDXb : XForm_8<31, 711,
+ def QVSTFDXb : XForm_8_memOp<31, 711,
(outs), (ins qbrc:$FRT, memrr:$dst),
"qvstfdx $FRT, $dst", IIC_LdStSTFD, []>;
@@ -675,12 +675,12 @@ let Uses = [RM] in {
(outs), (ins qfrc:$FRT, memrr:$dst),
"qvstfduxia $FRT, $dst", IIC_LdStSTFD, []>;
- def QVSTFSX : XForm_8<31, 647,
+ def QVSTFSX : XForm_8_memOp<31, 647,
(outs), (ins qfrc:$FRT, memrr:$dst),
"qvstfsx $FRT, $dst", IIC_LdStSTFD,
[(truncstorev4f32 qfrc:$FRT, xoaddr:$dst)]>;
let isCodeGenOnly = 1 in
- def QVSTFSXs : XForm_8<31, 647,
+ def QVSTFSXs : XForm_8_memOp<31, 647,
(outs), (ins qsrc:$FRT, memrr:$dst),
"qvstfsx $FRT, $dst", IIC_LdStSTFD,
[(store qsrc:$FRT, xoaddr:$dst)]>;
diff --git a/lib/Target/PowerPC/PPCInstrSPE.td b/lib/Target/PowerPC/PPCInstrSPE.td
index cc3a4d20a9b2..96649efdc1bc 100644
--- a/lib/Target/PowerPC/PPCInstrSPE.td
+++ b/lib/Target/PowerPC/PPCInstrSPE.td
@@ -12,14 +12,56 @@
//
//===----------------------------------------------------------------------===//
+class EFXForm_1<bits<11> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern> :
+ I<4, OOL, IOL, asmstr, itin> {
+ bits<5> RT;
+ bits<5> RA;
+ bits<5> RB;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = RT;
+ let Inst{11-15} = RA;
+ let Inst{16-20} = RB;
+ let Inst{21-31} = xo;
+}
+
+class EFXForm_2<bits<11> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern> :
+ EFXForm_1<xo, OOL, IOL, asmstr, itin, pattern> {
+ let RB = 0;
+}
+
+class EFXForm_2a<bits<11> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern> :
+ EFXForm_1<xo, OOL, IOL, asmstr, itin, pattern> {
+ let RA = 0;
+}
+
+class EFXForm_3<bits<11> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin> :
+ I<4, OOL, IOL, asmstr, itin> {
+ bits<3> crD;
+ bits<5> RA;
+ bits<5> RB;
+
+ let Inst{6-8} = crD;
+ let Inst{9-10} = 0;
+ let Inst{11-15} = RA;
+ let Inst{16-20} = RB;
+ let Inst{21-31} = xo;
+}
+
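+
+(Aside, not part of the imported patch: the EFXForm classes above follow the Power ISA bit-numbering convention in which Inst{0} is the most significant bit, so the primary opcode 4 from the I<4, ...> base class lands in the top six bits and the 11-bit xo fills the low bits, with RT/RA/RB in between. A small C++ packing helper illustrating that layout; the field values are arbitrary:)
+
+    #include <assert.h>
+    #include <stdint.h>
+
+    // Pack an EFX-form word: opcode 4 in bits 0-5 (MSB side), RT in 6-10,
+    // RA in 11-15, RB in 16-20, xo in 21-31 (the least significant 11 bits).
+    static uint32_t efx_encode(uint32_t rt, uint32_t ra, uint32_t rb,
+                               uint32_t xo) {
+      return (4u << 26) | ((rt & 31) << 21) | ((ra & 31) << 16) |
+             ((rb & 31) << 11) | (xo & 0x7FF);
+    }
+
+    int main() {
+      // RT=3, RA=4, RB=5 with an arbitrary 11-bit minor opcode 0x2C7.
+      assert(efx_encode(3, 4, 5, 0x2C7) == 0x10642AC7u);
+      return 0;
+    }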
class EVXForm_1<bits<11> xo, dag OOL, dag IOL, string asmstr,
- InstrItinClass itin> : I<4, OOL, IOL, asmstr, itin> {
+ InstrItinClass itin, list<dag> pattern> :
+ I<4, OOL, IOL, asmstr, itin> {
bits<5> RT;
bits<5> RA;
bits<5> RB;
- let Pattern = [];
-
+ let Pattern = pattern;
+
let Inst{6-10} = RT;
let Inst{11-15} = RA;
let Inst{16-20} = RB;
@@ -27,18 +69,26 @@ class EVXForm_1<bits<11> xo, dag OOL, dag IOL, string asmstr,
}
class EVXForm_2<bits<11> xo, dag OOL, dag IOL, string asmstr,
- InstrItinClass itin> : EVXForm_1<xo, OOL, IOL, asmstr, itin> {
+ InstrItinClass itin, list<dag> pattern> :
+ EVXForm_1<xo, OOL, IOL, asmstr, itin, pattern> {
let RB = 0;
}
+class EVXForm_2a<bits<11> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern> :
+ EVXForm_1<xo, OOL, IOL, asmstr, itin, pattern> {
+ let RA = 0;
+}
+
class EVXForm_3<bits<11> xo, dag OOL, dag IOL, string asmstr,
- InstrItinClass itin> : I<4, OOL, IOL, asmstr, itin> {
+ InstrItinClass itin, list<dag> pattern> :
+ I<4, OOL, IOL, asmstr, itin> {
bits<3> crD;
bits<5> RA;
bits<5> RB;
- let Pattern = [];
-
+ let Pattern = pattern;
+
let Inst{6-8} = crD;
let Inst{9-10} = 0;
let Inst{11-15} = RA;
@@ -46,12 +96,30 @@ class EVXForm_3<bits<11> xo, dag OOL, dag IOL, string asmstr,
let Inst{21-31} = xo;
}
+class EVXForm_4<bits<8> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern> :
+ I<4, OOL, IOL, asmstr, itin> {
+ bits<3> crD;
+ bits<5> RA;
+ bits<5> RB;
+ bits<5> RT;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = RT;
+ let Inst{11-15} = RA;
+ let Inst{16-20} = RB;
+ let Inst{21-28} = xo;
+ let Inst{29-31} = crD;
+}
+
class EVXForm_D<bits<11> xo, dag OOL, dag IOL, string asmstr,
- InstrItinClass itin> : I<4, OOL, IOL, asmstr, itin> {
+ InstrItinClass itin, list<dag> pattern> :
+ I<4, OOL, IOL, asmstr, itin> {
bits<5> RT;
bits<21> D;
- let Pattern = [];
+ let Pattern = pattern;
let Inst{6-10} = RT;
let Inst{20} = D{0};
@@ -68,380 +136,757 @@ class EVXForm_D<bits<11> xo, dag OOL, dag IOL, string asmstr,
let Inst{21-31} = xo;
}
-let Predicates = [HasSPE], isAsmParserOnly = 1 in {
-
-def EVLDD : EVXForm_D<769, (outs gprc:$RT), (ins spe8dis:$dst),
- "evldd $RT, $dst", IIC_VecFP>;
-def EVLDW : EVXForm_D<771, (outs gprc:$RT), (ins spe8dis:$dst),
- "evldw $RT, $dst", IIC_VecFP>;
-def EVLDH : EVXForm_D<773, (outs gprc:$RT), (ins spe8dis:$dst),
- "evldh $RT, $dst", IIC_VecFP>;
-def EVLHHESPLAT : EVXForm_D<777, (outs gprc:$RT), (ins spe2dis:$dst),
- "evlhhesplat $RT, $dst", IIC_VecFP>;
-def EVLHHOUSPLAT : EVXForm_D<781, (outs gprc:$RT), (ins spe2dis:$dst),
- "evlhhousplat $RT, $dst", IIC_VecFP>;
-def EVLHHOSSPLAT : EVXForm_D<783, (outs gprc:$RT), (ins spe2dis:$dst),
- "evlhhossplat $RT, $dst", IIC_VecFP>;
-def EVLWHE : EVXForm_D<785, (outs gprc:$RT), (ins spe4dis:$dst),
- "evlwhe $RT, $dst", IIC_VecFP>;
-def EVLWHOU : EVXForm_D<789, (outs gprc:$RT), (ins spe4dis:$dst),
- "evlwhou $RT, $dst", IIC_VecFP>;
-def EVLWHOS : EVXForm_D<791, (outs gprc:$RT), (ins spe4dis:$dst),
- "evlwhos $RT, $dst", IIC_VecFP>;
-def EVLWWSPLAT : EVXForm_D<793, (outs gprc:$RT), (ins spe4dis:$dst),
- "evlwwsplat $RT, $dst", IIC_VecFP>;
-def EVLWHSPLAT : EVXForm_D<797, (outs gprc:$RT), (ins spe4dis:$dst),
- "evlwhsplat $RT, $dst", IIC_VecFP>;
-
-def EVSTDD : EVXForm_D<801, (outs), (ins gprc:$RT, spe8dis:$dst),
- "evstdd $RT, $dst", IIC_VecFP>;
-def EVSTDH : EVXForm_D<805, (outs), (ins gprc:$RT, spe8dis:$dst),
- "evstdh $RT, $dst", IIC_VecFP>;
-def EVSTDW : EVXForm_D<803, (outs), (ins gprc:$RT, spe8dis:$dst),
- "evstdw $RT, $dst", IIC_VecFP>;
-def EVSTWHE : EVXForm_D<817, (outs), (ins gprc:$RT, spe4dis:$dst),
- "evstwhe $RT, $dst", IIC_VecFP>;
-def EVSTWHO : EVXForm_D<821, (outs), (ins gprc:$RT, spe4dis:$dst),
- "evstwho $RT, $dst", IIC_VecFP>;
-def EVSTWWE : EVXForm_D<825, (outs), (ins gprc:$RT, spe4dis:$dst),
- "evstwwe $RT, $dst", IIC_VecFP>;
-def EVSTWWO : EVXForm_D<829, (outs), (ins gprc:$RT, spe4dis:$dst),
- "evstwwo $RT, $dst", IIC_VecFP>;
-
-def EVMRA : EVXForm_1<1220, (outs gprc:$RT), (ins gprc:$RA),
- "evmra $RT, $RA", IIC_VecFP> {
- let RB = 0;
-}
+let DecoderNamespace = "SPE", Predicates = [HasSPE] in {
def BRINC : EVXForm_1<527, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "brinc $RT, $RA, $RB", IIC_VecFP>;
-def EVABS : EVXForm_2<520, (outs gprc:$RT), (ins gprc:$RA),
- "evabs $RT, $RA", IIC_VecFP>;
-
-def EVADDIW : EVXForm_1<514, (outs gprc:$RT), (ins gprc:$RA, u5imm:$RB),
- "evaddiw $RT, $RB, $RA", IIC_VecFP>;
-def EVADDSMIAAW : EVXForm_2<1225, (outs gprc:$RT), (ins gprc:$RA),
- "evaddsmiaaw $RT, $RA", IIC_VecFP>;
-def EVADDSSIAAW : EVXForm_2<1217, (outs gprc:$RT), (ins gprc:$RA),
- "evaddssiaaw $RT, $RA", IIC_VecFP>;
-def EVADDUSIAAW : EVXForm_2<1216, (outs gprc:$RT), (ins gprc:$RA),
- "evaddusiaaw $RT, $RA", IIC_VecFP>;
-def EVADDUMIAAW : EVXForm_2<1224, (outs gprc:$RT), (ins gprc:$RA),
- "evaddumiaaw $RT, $RA", IIC_VecFP>;
-def EVADDW : EVXForm_1<512, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evaddw $RT, $RA, $RB", IIC_VecFP>;
-
-def EVAND : EVXForm_1<529, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evand $RT, $RA, $RB", IIC_VecFP>;
-def EVANDC : EVXForm_1<530, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evandc $RT, $RA, $RB", IIC_VecFP>;
-
-def EVCMPEQ : EVXForm_3<564, (outs crrc:$crD), (ins gprc:$RA, gprc:$RB),
- "evcmpeq $crD, $RA, $RB", IIC_VecFP>;
-def EVCMPGTS : EVXForm_3<561, (outs crrc:$crD), (ins gprc:$RA, gprc:$RB),
- "evcmpgts $crD, $RA, $RB", IIC_VecFP>;
-def EVCMPGTU : EVXForm_3<560, (outs crrc:$crD), (ins gprc:$RA, gprc:$RB),
- "evcmpgtu $crD, $RA, $RB", IIC_VecFP>;
-def EVCMPLTS : EVXForm_3<563, (outs crrc:$crD), (ins gprc:$RA, gprc:$RB),
- "evcmplts $crD, $RA, $RB", IIC_VecFP>;
-def EVCMPLTU : EVXForm_3<562, (outs crrc:$crD), (ins gprc:$RA, gprc:$RB),
- "evcmpltu $crD, $RA, $RB", IIC_VecFP>;
-
-def EVCNTLSW : EVXForm_2<526, (outs gprc:$RT), (ins gprc:$RA),
- "evcntlsw $RT, $RA", IIC_VecFP>;
-def EVCNTLZW : EVXForm_2<525, (outs gprc:$RT), (ins gprc:$RA),
- "evcntlzw $RT, $RA", IIC_VecFP>;
-
-def EVDIVWS : EVXForm_1<1222, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evdivws $RT, $RA, $RB", IIC_VecFP>;
-def EVDIVWU : EVXForm_1<1223, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evdivwu $RT, $RA, $RB", IIC_VecFP>;
-
-def EVEQV : EVXForm_1<537, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "eveqv $RT, $RA, $RB", IIC_VecFP>;
-
-def EVEXTSB : EVXForm_2<522, (outs gprc:$RT), (ins gprc:$RA),
- "evextsb $RT, $RA", IIC_VecFP>;
-def EVEXTSH : EVXForm_2<523, (outs gprc:$RT), (ins gprc:$RA),
- "evextsh $RT, $RA", IIC_VecFP>;
-
-def EVLDDX : EVXForm_1<768, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evlddx $RT, $RA, $RB", IIC_VecFP>;
-def EVLDWX : EVXForm_1<770, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evldwx $RT, $RA, $RB", IIC_VecFP>;
-def EVLDHX : EVXForm_1<772, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evldhx $RT, $RA, $RB", IIC_VecFP>;
-def EVLHHESPLATX : EVXForm_1<776, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evlhhesplatx $RT, $RA, $RB", IIC_VecFP>;
-def EVLHHOUSPLATX : EVXForm_1<780, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evlhhousplatx $RT, $RA, $RB", IIC_VecFP>;
-def EVLHHOSSPLATX : EVXForm_1<782, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evlhhossplatx $RT, $RA, $RB", IIC_VecFP>;
-def EVLWHEX : EVXForm_1<784, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evlwhex $RT, $RA, $RB", IIC_VecFP>;
-def EVLWHOUX : EVXForm_1<788, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evlwhoux $RT, $RA, $RB", IIC_VecFP>;
-def EVLWHOSX : EVXForm_1<790, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evlwhosx $RT, $RA, $RB", IIC_VecFP>;
-def EVLWWSPLATX : EVXForm_1<792, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evlwwsplatx $RT, $RA, $RB", IIC_VecFP>;
-def EVLWHSPLATX : EVXForm_1<796, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evlwhsplatx $RT, $RA, $RB", IIC_VecFP>;
-
-def EVMERGEHI : EVXForm_1<556, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmergehi $RT, $RA, $RB", IIC_VecFP>;
-def EVMERGELO : EVXForm_1<557, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmergelo $RT, $RA, $RB", IIC_VecFP>;
-def EVMERGEHILO : EVXForm_1<558, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmergehilo $RT, $RA, $RB", IIC_VecFP>;
-def EVMERGELOHI : EVXForm_1<559, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmergelohi $RT, $RA, $RB", IIC_VecFP>;
-
-def EVMHEGSMFAA : EVXForm_1<1323, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhegsmfaa $RT, $RA, $RB", IIC_VecFP>;
-def EVMHEGSMFAN : EVXForm_1<1451, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhegsmfan $RT, $RA, $RB", IIC_VecFP>;
-def EVMHEGSMIAA : EVXForm_1<1321, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhegsmiaa $RT, $RA, $RB", IIC_VecFP>;
-def EVMHEGSMIAN : EVXForm_1<1449, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhegsmian $RT, $RA, $RB", IIC_VecFP>;
-def EVMHEGUMIAA : EVXForm_1<1320, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhegumiaa $RT, $RA, $RB", IIC_VecFP>;
-def EVMHEGUMIAN : EVXForm_1<1448, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhegumian $RT, $RA, $RB", IIC_VecFP>;
-
-def EVMHESMF : EVXForm_1<1035, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhesmf $RT, $RA, $RB", IIC_VecFP>;
-def EVMHESMFA : EVXForm_1<1067, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhesmfa $RT, $RA, $RB", IIC_VecFP>;
-def EVMHESMFAAW : EVXForm_1<1291, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhesmfaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHESMFANW : EVXForm_1<1419, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhesmfanw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHESMI : EVXForm_1<1033, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhesmi $RT, $RA, $RB", IIC_VecFP>;
-def EVMHESMIA : EVXForm_1<1065, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhesmia $RT, $RA, $RB", IIC_VecFP>;
-def EVMHESMIAAW : EVXForm_1<1289, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhesmiaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHESMIANW : EVXForm_1<1417, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhesmianw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHESSF : EVXForm_1<1027, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhessf $RT, $RA, $RB", IIC_VecFP>;
-def EVMHESSFA : EVXForm_1<1059, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhessfa $RT, $RA, $RB", IIC_VecFP>;
-def EVMHESSFAAW : EVXForm_1<1283, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhessfaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHESSFANW : EVXForm_1<1411, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhessfanw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHESSIAAW : EVXForm_1<1281, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhessiaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHESSIANW : EVXForm_1<1409, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhessianw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHEUMI : EVXForm_1<1032, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmheumi $RT, $RA, $RB", IIC_VecFP>;
-def EVMHEUMIA : EVXForm_1<1064, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmheumia $RT, $RA, $RB", IIC_VecFP>;
-def EVMHEUMIAAW : EVXForm_1<1288, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmheumiaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHEUMIANW : EVXForm_1<1416, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmheumianw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHEUSIAAW : EVXForm_1<1280, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmheusiaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHEUSIANW : EVXForm_1<1408, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmheusianw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOGSMFAA : EVXForm_1<1327, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhogsmfaa $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOGSMFAN : EVXForm_1<1455, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhogsmfan $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOGSMIAA : EVXForm_1<1325, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhogsmiaa $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOGSMIAN : EVXForm_1<1453, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhogsmian $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOGUMIAA : EVXForm_1<1324, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhogumiaa $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOGUMIAN : EVXForm_1<1452, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhogumian $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOSMF : EVXForm_1<1039, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhosmf $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOSMFA : EVXForm_1<1071, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhosmfa $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOSMFAAW : EVXForm_1<1295, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhosmfaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOSMFANW : EVXForm_1<1423, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhosmfanw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOSMI : EVXForm_1<1037, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhosmi $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOSMIA : EVXForm_1<1069, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhosmia $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOSMIAAW : EVXForm_1<1293, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhosmiaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOSMIANW : EVXForm_1<1421, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhosmianw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOSSF : EVXForm_1<1031, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhossf $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOSSFA : EVXForm_1<1063, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhossfa $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOSSFAAW : EVXForm_1<1287, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhossfaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOSSFANW : EVXForm_1<1415, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhossfanw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOSSIAAW : EVXForm_1<1285, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhossiaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOSSIANW : EVXForm_1<1413, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhossianw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOUMI : EVXForm_1<1036, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhoumi $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOUMIA : EVXForm_1<1068, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhoumia $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOUMIAAW : EVXForm_1<1292, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhoumiaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOUMIANW : EVXForm_1<1420, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhoumianw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOUSIAAW : EVXForm_1<1284, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhousiaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOUSIANW : EVXForm_1<1412, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhousianw $RT, $RA, $RB", IIC_VecFP>;
-
-
-def EVMWHSMF : EVXForm_1<1103, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwhsmf $RT, $RA, $RB", IIC_VecFP>;
-def EVMWHSMFA : EVXForm_1<1135, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwhsmfa $RT, $RA, $RB", IIC_VecFP>;
-def EVMWHSMI : EVXForm_1<1101, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwhsmi $RT, $RA, $RB", IIC_VecFP>;
-def EVMWHSMIA : EVXForm_1<1133, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwhsmia $RT, $RA, $RB", IIC_VecFP>;
-def EVMWHSSF : EVXForm_1<1095, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwhssf $RT, $RA, $RB", IIC_VecFP>;
-def EVMWHSSFA : EVXForm_1<1127, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwhssfa $RT, $RA, $RB", IIC_VecFP>;
-def EVMWHUMI : EVXForm_1<1100, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwhumi $RT, $RA, $RB", IIC_VecFP>;
-def EVMWHUMIA : EVXForm_1<1132, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwhumia $RT, $RA, $RB", IIC_VecFP>;
-def EVMWLSMIAAW : EVXForm_1<1353, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwlsmiaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMWLSMIANW : EVXForm_1<1481, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwlsmianw $RT, $RA, $RB", IIC_VecFP>;
-def EVMWLSSIAAW : EVXForm_1<1345, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwlssiaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMWLSSIANW : EVXForm_1<1473, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwlssianw $RT, $RA, $RB", IIC_VecFP>;
-def EVMWLUMI : EVXForm_1<1096, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwlumi $RT, $RA, $RB", IIC_VecFP>;
-def EVMWLUMIA : EVXForm_1<1128, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwlumia $RT, $RA, $RB", IIC_VecFP>;
-def EVMWLUMIAAW : EVXForm_1<1352, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwlumiaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMWLUMIANW : EVXForm_1<1480, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwlumianw $RT, $RA, $RB", IIC_VecFP>;
-def EVMWLUSIAAW : EVXForm_1<1344, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwlusiaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMWLUSIANW : EVXForm_1<1472, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwlusianw $RT, $RA, $RB", IIC_VecFP>;
-def EVMWSMF : EVXForm_1<1115, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwsmf $RT, $RA, $RB", IIC_VecFP>;
-def EVMWSMFA : EVXForm_1<1147, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwsmfa $RT, $RA, $RB", IIC_VecFP>;
-def EVMWSMFAA : EVXForm_1<1371, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwsmfaa $RT, $RA, $RB", IIC_VecFP>;
-def EVMWSMFAN : EVXForm_1<1499, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwsmfan $RT, $RA, $RB", IIC_VecFP>;
-def EVMWSMI : EVXForm_1<1113, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwsmi $RT, $RA, $RB", IIC_VecFP>;
-def EVMWSMIA : EVXForm_1<1145, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwsmia $RT, $RA, $RB", IIC_VecFP>;
-def EVMWSMIAA : EVXForm_1<1369, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwsmiaa $RT, $RA, $RB", IIC_VecFP>;
-def EVMWSMIAN : EVXForm_1<1497, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwsmian $RT, $RA, $RB", IIC_VecFP>;
-def EVMWSSF : EVXForm_1<1107, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwssf $RT, $RA, $RB", IIC_VecFP>;
-def EVMWSSFA : EVXForm_1<1139, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwssfa $RT, $RA, $RB", IIC_VecFP>;
-def EVMWSSFAA : EVXForm_1<1363, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwssfaa $RT, $RA, $RB", IIC_VecFP>;
-def EVMWSSFAN : EVXForm_1<1491, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwssfan $RT, $RA, $RB", IIC_VecFP>;
-def EVMWUMI : EVXForm_1<1112, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwumi $RT, $RA, $RB", IIC_VecFP>;
-def EVMWUMIA : EVXForm_1<1144, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwumia $RT, $RA, $RB", IIC_VecFP>;
-def EVMWUMIAA : EVXForm_1<1368, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwumiaa $RT, $RA, $RB", IIC_VecFP>;
-def EVMWUMIAN : EVXForm_1<1496, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwumian $RT, $RA, $RB", IIC_VecFP>;
-
-
-def EVNAND : EVXForm_1<542, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evnand $RT, $RA, $RB", IIC_VecFP>;
-
-def EVNEG : EVXForm_2<521, (outs gprc:$RT), (ins gprc:$RA),
- "evneg $RT, $RA", IIC_VecFP>;
-
-def EVNOR : EVXForm_1<536, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evnor $RT, $RA, $RB", IIC_VecFP>;
-def EVOR : EVXForm_1<535, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evor $RT, $RA, $RB", IIC_VecFP>;
-def EVORC : EVXForm_1<539, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evorc $RT, $RA, $RB", IIC_VecFP>;
-
-def EVRLWI : EVXForm_1<554, (outs gprc:$RT), (ins gprc:$RA, u5imm:$RB),
- "evrlwi $RT, $RA, $RB", IIC_VecFP>;
-def EVRLW : EVXForm_1<552, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evrlw $RT, $RA, $RB", IIC_VecFP>;
-
-def EVRNDW : EVXForm_2<524, (outs gprc:$RT), (ins gprc:$RA),
- "evrndw $RT, $RA", IIC_VecFP>;
-
-def EVSLWI : EVXForm_1<550, (outs gprc:$RT), (ins gprc:$RA, u5imm:$RB),
- "evslwi $RT, $RA, $RB", IIC_VecFP>;
-def EVSLW : EVXForm_1<548, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evslw $RT, $RA, $RB", IIC_VecFP>;
-
-def EVSPLATFI : EVXForm_2<555, (outs gprc:$RT), (ins i32imm:$RA),
- "evsplatfi $RT, $RA", IIC_VecFP>;
-def EVSPLATI : EVXForm_2<553, (outs gprc:$RT), (ins i32imm:$RA),
- "evsplati $RT, $RA", IIC_VecFP>;
-
-def EVSRWIS : EVXForm_1<547, (outs gprc:$RT), (ins gprc:$RA, u5imm:$RB),
- "evsrwis $RT, $RA, $RB", IIC_VecFP>;
-def EVSRWIU : EVXForm_1<546, (outs gprc:$RT), (ins gprc:$RA, u5imm:$RB),
- "evsrwiu $RT, $RA, $RB", IIC_VecFP>;
-def EVSRWS : EVXForm_1<545, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evsrws $RT, $RA, $RB", IIC_VecFP>;
-def EVSRWU : EVXForm_1<544, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evsrwu $RT, $RA, $RB", IIC_VecFP>;
-
-def EVSTDDX : EVXForm_1<800, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
- "evstddx $RT, $RA, $RB", IIC_VecFP>;
-def EVSTDHX : EVXForm_1<804, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
- "evstdhx $RT, $RA, $RB", IIC_VecFP>;
-def EVSTDWX : EVXForm_1<802, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
- "evstdwx $RT, $RA, $RB", IIC_VecFP>;
-def EVSTWHEX : EVXForm_1<816, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
- "evstwhex $RT, $RA, $RB", IIC_VecFP>;
-def EVSTWHOX : EVXForm_1<820, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
- "evstwhox $RT, $RA, $RB", IIC_VecFP>;
-def EVSTWWEX : EVXForm_1<824, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
- "evstwwex $RT, $RA, $RB", IIC_VecFP>;
-def EVSTWWOX : EVXForm_1<828, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
- "evstwwox $RT, $RA, $RB", IIC_VecFP>;
-
-def EVSUBFSSIAAW : EVXForm_2<1219, (outs gprc:$RT), (ins gprc:$RA),
- "evsubfssiaaw $RT, $RA", IIC_VecFP>;
-def EVSUBFSMIAAW : EVXForm_2<1227, (outs gprc:$RT), (ins gprc:$RA),
- "evsubfsmiaaw $RT, $RA", IIC_VecFP>;
-def EVSUBFUMIAAW : EVXForm_2<1226, (outs gprc:$RT), (ins gprc:$RA),
- "evsubfumiaaw $RT, $RA", IIC_VecFP>;
-def EVSUBFUSIAAW : EVXForm_2<1218, (outs gprc:$RT), (ins gprc:$RA),
- "evsubfusiaaw $RT, $RA", IIC_VecFP>;
-def EVSUBFW : EVXForm_1<516, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evsubfw $RT, $RA, $RB", IIC_VecFP>;
-def EVSUBIFW : EVXForm_1<518, (outs gprc:$RT), (ins u5imm:$RA, gprc:$RB),
- "evsubifw $RT, $RA, $RB", IIC_VecFP>;
-def EVXOR : EVXForm_1<534, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evxor $RT, $RA, $RB", IIC_VecFP>;
+ "brinc $RT, $RA, $RB", IIC_IntSimple, []>;
+
+// Double-precision floating point
+def EFDABS : EFXForm_2<740, (outs sperc:$RT), (ins sperc:$RA),
+ "efdabs $RT, $RA", IIC_FPDGeneral,
+ [(set f64:$RT, (fabs f64:$RA))]>;
+
+def EFDADD : EFXForm_1<736, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "efdadd $RT, $RA, $RB", IIC_FPAddSub,
+ [(set f64:$RT, (fadd f64:$RA, f64:$RB))]>;
+
+def EFDCFS : EFXForm_2a<751, (outs sperc:$RT), (ins spe4rc:$RB),
+ "efdcfs $RT, $RB", IIC_FPDGeneral,
+ [(set f64:$RT, (fpextend f32:$RB))]>;
+
+def EFDCFSF : EFXForm_2a<755, (outs sperc:$RT), (ins spe4rc:$RB),
+ "efdcfsf $RT, $RB", IIC_FPDGeneral, []>;
+
+def EFDCFSI : EFXForm_2a<753, (outs sperc:$RT), (ins gprc:$RB),
+ "efdcfsi $RT, $RB", IIC_FPDGeneral,
+ [(set f64:$RT, (sint_to_fp i32:$RB))]>;
+
+def EFDCFSID : EFXForm_2a<739, (outs sperc:$RT), (ins gprc:$RB),
+ "efdcfsid $RT, $RB", IIC_FPDGeneral,
+ []>;
+
+def EFDCFUF : EFXForm_2a<754, (outs sperc:$RT), (ins spe4rc:$RB),
+ "efdcfuf $RT, $RB", IIC_FPDGeneral, []>;
+
+def EFDCFUI : EFXForm_2a<752, (outs sperc:$RT), (ins gprc:$RB),
+ "efdcfui $RT, $RB", IIC_FPDGeneral,
+ [(set f64:$RT, (uint_to_fp i32:$RB))]>;
+
+def EFDCFUID : EFXForm_2a<738, (outs sperc:$RT), (ins gprc:$RB),
+ "efdcfuid $RT, $RB", IIC_FPDGeneral,
+ []>;
+
+let isCompare = 1 in {
+def EFDCMPEQ : EFXForm_3<750, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "efdcmpeq $crD, $RA, $RB", IIC_FPDGeneral>;
+def EFDCMPGT : EFXForm_3<748, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "efdcmpgt $crD, $RA, $RB", IIC_FPDGeneral>;
+def EFDCMPLT : EFXForm_3<749, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "efdcmplt $crD, $RA, $RB", IIC_FPDGeneral>;
+}
+
+def EFDCTSF : EFXForm_2a<759, (outs sperc:$RT), (ins spe4rc:$RB),
+ "efdctsf $RT, $RB", IIC_FPDGeneral, []>;
+
+def EFDCTSI : EFXForm_2a<757, (outs gprc:$RT), (ins sperc:$RB),
+ "efdctsi $RT, $RB", IIC_FPDGeneral,
+ []>;
+
+def EFDCTSIDZ : EFXForm_2a<747, (outs gprc:$RT), (ins sperc:$RB),
+ "efdctsidz $RT, $RB", IIC_FPDGeneral,
+ []>;
+
+def EFDCTSIZ : EFXForm_2a<762, (outs gprc:$RT), (ins sperc:$RB),
+ "efdctsiz $RT, $RB", IIC_FPDGeneral,
+ [(set i32:$RT, (fp_to_sint f64:$RB))]>;
+
+def EFDCTUF : EFXForm_2a<758, (outs sperc:$RT), (ins spe4rc:$RB),
+ "efdctuf $RT, $RB", IIC_FPDGeneral, []>;
+
+def EFDCTUI : EFXForm_2a<756, (outs gprc:$RT), (ins sperc:$RB),
+ "efdctui $RT, $RB", IIC_FPDGeneral,
+ []>;
+
+def EFDCTUIDZ : EFXForm_2a<746, (outs gprc:$RT), (ins sperc:$RB),
+ "efdctuidz $RT, $RB", IIC_FPDGeneral,
+ []>;
+
+def EFDCTUIZ : EFXForm_2a<760, (outs gprc:$RT), (ins sperc:$RB),
+ "efdctuiz $RT, $RB", IIC_FPDGeneral,
+ [(set i32:$RT, (fp_to_uint f64:$RB))]>;
+
+def EFDDIV : EFXForm_1<745, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "efddiv $RT, $RA, $RB", IIC_FPDivD,
+ [(set f64:$RT, (fdiv f64:$RA, f64:$RB))]>;
+
+def EFDMUL : EFXForm_1<744, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "efdmul $RT, $RA, $RB", IIC_FPDGeneral,
+ [(set f64:$RT, (fmul f64:$RA, f64:$RB))]>;
+
+def EFDNABS : EFXForm_2<741, (outs sperc:$RT), (ins sperc:$RA),
+ "efdnabs $RT, $RA", IIC_FPDGeneral,
+ [(set f64:$RT, (fneg (fabs f64:$RA)))]>;
+
+def EFDNEG : EFXForm_2<742, (outs sperc:$RT), (ins sperc:$RA),
+ "efdneg $RT, $RA", IIC_FPDGeneral,
+ [(set f64:$RT, (fneg f64:$RA))]>;
+
+def EFDSUB : EFXForm_1<737, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "efdsub $RT, $RA, $RB", IIC_FPDGeneral,
+ [(set f64:$RT, (fsub f64:$RA, f64:$RB))]>;
+
+let isCompare = 1 in {
+def EFDTSTEQ : EFXForm_3<766, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "efdtsteq $crD, $RA, $RB", IIC_FPDGeneral>;
+def EFDTSTGT : EFXForm_3<764, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "efdtstgt $crD, $RA, $RB", IIC_FPDGeneral>;
+def EFDTSTLT : EFXForm_3<765, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "efdtstlt $crD, $RA, $RB", IIC_FPDGeneral>;
+}
+
+// Single-precision floating point
+def EFSABS : EFXForm_2<708, (outs spe4rc:$RT), (ins spe4rc:$RA),
+ "efsabs $RT, $RA", IIC_FPSGeneral,
+ [(set f32:$RT, (fabs f32:$RA))]>;
+
+def EFSADD : EFXForm_1<704, (outs spe4rc:$RT), (ins spe4rc:$RA, spe4rc:$RB),
+ "efsadd $RT, $RA, $RB", IIC_FPAddSub,
+ [(set f32:$RT, (fadd f32:$RA, f32:$RB))]>;
+
+def EFSCFD : EFXForm_2a<719, (outs spe4rc:$RT), (ins sperc:$RB),
+ "efscfd $RT, $RB", IIC_FPSGeneral,
+ [(set f32:$RT, (fpround f64:$RB))]>;
+
+def EFSCFSF : EFXForm_2a<723, (outs spe4rc:$RT), (ins spe4rc:$RB),
+ "efscfsf $RT, $RB", IIC_FPSGeneral, []>;
+
+def EFSCFSI : EFXForm_2a<721, (outs spe4rc:$RT), (ins gprc:$RB),
+ "efscfsi $RT, $RB", IIC_FPSGeneral,
+ [(set f32:$RT, (sint_to_fp i32:$RB))]>;
+
+def EFSCFUF : EFXForm_2a<722, (outs spe4rc:$RT), (ins spe4rc:$RB),
+ "efscfuf $RT, $RB", IIC_FPSGeneral, []>;
+
+def EFSCFUI : EFXForm_2a<720, (outs spe4rc:$RT), (ins gprc:$RB),
+ "efscfui $RT, $RB", IIC_FPSGeneral,
+ [(set f32:$RT, (uint_to_fp i32:$RB))]>;
+
+let isCompare = 1 in {
+def EFSCMPEQ : EFXForm_3<718, (outs crrc:$crD), (ins spe4rc:$RA, spe4rc:$RB),
+ "efscmpeq $crD, $RA, $RB", IIC_FPCompare>;
+def EFSCMPGT : EFXForm_3<716, (outs crrc:$crD), (ins spe4rc:$RA, spe4rc:$RB),
+ "efscmpgt $crD, $RA, $RB", IIC_FPCompare>;
+def EFSCMPLT : EFXForm_3<717, (outs crrc:$crD), (ins spe4rc:$RA, spe4rc:$RB),
+ "efscmplt $crD, $RA, $RB", IIC_FPCompare>;
+}
+
+def EFSCTSF : EFXForm_2a<727, (outs spe4rc:$RT), (ins spe4rc:$RB),
+ "efsctsf $RT, $RB", IIC_FPSGeneral, []>;
+
+def EFSCTSI : EFXForm_2a<725, (outs gprc:$RT), (ins spe4rc:$RB),
+ "efsctsi $RT, $RB", IIC_FPSGeneral,
+ []>;
+
+def EFSCTSIZ : EFXForm_2a<730, (outs gprc:$RT), (ins spe4rc:$RB),
+ "efsctsiz $RT, $RB", IIC_FPSGeneral,
+ [(set i32:$RT, (fp_to_sint f32:$RB))]>;
+
+def EFSCTUF : EFXForm_2a<726, (outs sperc:$RT), (ins spe4rc:$RB),
+ "efsctuf $RT, $RB", IIC_FPSGeneral, []>;
+
+def EFSCTUI : EFXForm_2a<724, (outs gprc:$RT), (ins spe4rc:$RB),
+ "efsctui $RT, $RB", IIC_FPSGeneral,
+ []>;
+
+def EFSCTUIZ : EFXForm_2a<728, (outs gprc:$RT), (ins spe4rc:$RB),
+ "efsctuiz $RT, $RB", IIC_FPSGeneral,
+ [(set i32:$RT, (fp_to_uint f32:$RB))]>;
+
+def EFSDIV : EFXForm_1<713, (outs spe4rc:$RT), (ins spe4rc:$RA, spe4rc:$RB),
+ "efsdiv $RT, $RA, $RB", IIC_FPDivD,
+ [(set f32:$RT, (fdiv f32:$RA, f32:$RB))]>;
+
+def EFSMUL : EFXForm_1<712, (outs spe4rc:$RT), (ins spe4rc:$RA, spe4rc:$RB),
+ "efsmul $RT, $RA, $RB", IIC_FPGeneral,
+ [(set f32:$RT, (fmul f32:$RA, f32:$RB))]>;
+
+def EFSNABS : EFXForm_2<709, (outs spe4rc:$RT), (ins spe4rc:$RA),
+ "efsnabs $RT, $RA", IIC_FPGeneral,
+ [(set f32:$RT, (fneg (fabs f32:$RA)))]>;
+
+def EFSNEG : EFXForm_2<710, (outs spe4rc:$RT), (ins spe4rc:$RA),
+ "efsneg $RT, $RA", IIC_FPGeneral,
+ [(set f32:$RT, (fneg f32:$RA))]>;
+
+def EFSSUB : EFXForm_1<705, (outs spe4rc:$RT), (ins spe4rc:$RA, spe4rc:$RB),
+ "efssub $RT, $RA, $RB", IIC_FPSGeneral,
+ [(set f32:$RT, (fsub f32:$RA, f32:$RB))]>;
+
+let isCompare = 1 in {
+def EFSTSTEQ : EFXForm_3<734, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "efststeq $crD, $RA, $RB", IIC_FPCompare>;
+def EFSTSTGT : EFXForm_3<732, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "efststgt $crD, $RA, $RB", IIC_FPCompare>;
+def EFSTSTLT : EFXForm_3<733, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "efststlt $crD, $RA, $RB", IIC_FPCompare>;
+}
+
+// SPE Vector operations
+
+def EVABS : EVXForm_2<520, (outs sperc:$RT), (ins sperc:$RA),
+ "evabs $RT, $RA", IIC_VecGeneral,
+ []>;
+
+def EVADDIW : EVXForm_1<514, (outs sperc:$RT), (ins sperc:$RA, u5imm:$RB),
+ "evaddiw $RT, $RB, $RA", IIC_VecGeneral, []>;
+def EVADDSMIAAW : EVXForm_2<1225, (outs sperc:$RT), (ins sperc:$RA),
+ "evaddsmiaaw $RT, $RA", IIC_VecComplex, []>;
+def EVADDSSIAAW : EVXForm_2<1217, (outs sperc:$RT), (ins sperc:$RA),
+ "evaddssiaaw $RT, $RA", IIC_VecComplex, []>;
+def EVADDUSIAAW : EVXForm_2<1216, (outs sperc:$RT), (ins sperc:$RA),
+ "evaddusiaaw $RT, $RA", IIC_VecComplex, []>;
+def EVADDUMIAAW : EVXForm_2<1224, (outs sperc:$RT), (ins sperc:$RA),
+ "evaddumiaaw $RT, $RA", IIC_VecComplex, []>;
+def EVADDW : EVXForm_1<512, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evaddw $RT, $RA, $RB", IIC_VecGeneral,
+ []>;
+
+def EVAND : EVXForm_1<529, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evand $RT, $RA, $RB", IIC_VecGeneral,
+ []>;
+def EVANDC : EVXForm_1<530, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evandc $RT, $RA, $RB", IIC_VecGeneral,
+ []>;
+
+let isCompare = 1 in {
+def EVCMPEQ : EVXForm_3<564, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "evcmpeq $crD, $RA, $RB", IIC_VecGeneral, []>;
+def EVCMPGTS : EVXForm_3<561, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "evcmpgts $crD, $RA, $RB", IIC_VecGeneral, []>;
+def EVCMPGTU : EVXForm_3<560, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "evcmpgtu $crD, $RA, $RB", IIC_VecGeneral, []>;
+def EVCMPLTS : EVXForm_3<563, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "evcmplts $crD, $RA, $RB", IIC_VecGeneral, []>;
+def EVCMPLTU : EVXForm_3<562, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "evcmpltu $crD, $RA, $RB", IIC_VecGeneral, []>;
+}
+
+def EVCNTLSW : EVXForm_2<526, (outs sperc:$RT), (ins sperc:$RA),
+ "evcntlsw $RT, $RA", IIC_VecGeneral, []>;
+def EVCNTLZW : EVXForm_2<525, (outs sperc:$RT), (ins sperc:$RA),
+ "evcntlzw $RT, $RA", IIC_VecGeneral,
+ []>;
+
+def EVDIVWS : EVXForm_1<1222, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evdivws $RT, $RA, $RB", IIC_VecComplex,
+ []>;
+def EVDIVWU : EVXForm_1<1223, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evdivwu $RT, $RA, $RB", IIC_VecComplex,
+ []>;
+
+def EVEQV : EVXForm_1<537, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "eveqv $RT, $RA, $RB", IIC_VecGeneral,
+ []>;
+
+def EVEXTSB : EVXForm_2<522, (outs sperc:$RT), (ins sperc:$RA),
+ "evextsb $RT, $RA", IIC_VecGeneral,
+ []>;
+def EVEXTSH : EVXForm_2<523, (outs sperc:$RT), (ins sperc:$RA),
+ "evextsh $RT, $RA", IIC_VecGeneral,
+ []>;
+
+def EVFSABS : EVXForm_2<644, (outs sperc:$RT), (ins sperc:$RA),
+ "evfsabs $RT, $RA", IIC_VecGeneral,
+ []>;
+def EVFSADD : EVXForm_1<640, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evfsadd $RT, $RA, $RB", IIC_VecComplex,
+ []>;
+def EVFSCFSF : EVXForm_2a<659, (outs sperc:$RT), (ins sperc:$RB),
+ "evfscfsf $RT, $RB", IIC_VecComplex, []>;
+def EVFSCFSI : EVXForm_2a<657, (outs sperc:$RT), (ins sperc:$RB),
+ "evfscfsi $RT, $RB", IIC_VecComplex,
+ []>;
+def EVFSCFUF : EVXForm_2a<658, (outs sperc:$RT), (ins sperc:$RB),
+ "evfscfuf $RT, $RB", IIC_VecComplex, []>;
+def EVFSCFUI : EVXForm_2a<650, (outs sperc:$RT), (ins sperc:$RB),
+ "evfscfui $RT, $RB", IIC_VecComplex,
+ []>;
+let isCompare = 1 in {
+def EVFSCMPEQ : EVXForm_3<654, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "evfscmpeq $crD, $RA, $RB", IIC_FPSGeneral, []>;
+def EVFSCMPGT : EVXForm_3<652, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "evfscmpgt $crD, $RA, $RB", IIC_FPSGeneral, []>;
+def EVFSCMPLT : EVXForm_3<653, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "evfscmplt $crD, $RA, $RB", IIC_FPSGeneral, []>;
+}
+
+def EVFSCTSF : EVXForm_2a<663, (outs sperc:$RT), (ins sperc:$RB),
+ "evfsctsf $RT, $RB", IIC_VecComplex, []>;
+def EVFSCTSI : EVXForm_2a<661, (outs sperc:$RT), (ins sperc:$RB),
+ "evfsctsi $RT, $RB", IIC_VecComplex,
+ []>;
+def EVFSCTSIZ : EVXForm_2a<666, (outs sperc:$RT), (ins sperc:$RB),
+ "evfsctsiz $RT, $RB", IIC_VecComplex,
+ []>;
+def EVFSCTUF : EVXForm_2a<662, (outs sperc:$RT), (ins sperc:$RB),
+ "evfsctsf $RT, $RB", IIC_VecComplex, []>;
+def EVFSCTUI : EVXForm_2a<660, (outs sperc:$RT), (ins sperc:$RB),
+ "evfsctui $RT, $RB", IIC_VecComplex,
+ []>;
+def EVFSCTUIZ : EVXForm_2a<664, (outs sperc:$RT), (ins sperc:$RB),
+ "evfsctsiz $RT, $RB", IIC_VecComplex,
+ []>;
+def EVFSDIV : EVXForm_1<649, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evfsdiv $RT, $RA, $RB", IIC_FPDivD,
+ []>;
+def EVFSMUL : EVXForm_1<648, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evfsmul $RT, $RA, $RB", IIC_VecComplex,
+ []>;
+def EVFSNABS : EVXForm_2<645, (outs sperc:$RT), (ins sperc:$RA),
+ "evfsnabs $RT, $RA", IIC_VecGeneral,
+ []>;
+def EVFSNEG : EVXForm_2<646, (outs sperc:$RT), (ins sperc:$RA),
+ "evfsneg $RT, $RA", IIC_VecGeneral,
+ []>;
+def EVFSSUB : EVXForm_1<641, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evfssub $RT, $RA, $RB", IIC_VecComplex,
+ []>;
+
+let isCompare = 1 in {
+def EVFSTSTEQ : EVXForm_3<670, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "evfststeq $crD, $RA, $RB", IIC_VecGeneral, []>;
+def EVFSTSTGT : EVXForm_3<668, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "evfststgt $crD, $RA, $RB", IIC_VecGeneral, []>;
+def EVFSTSTLT : EVXForm_3<669, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "evfststlt $crD, $RA, $RB", IIC_VecGeneral, []>;
+}
+
+def EVLDD : EVXForm_D<769, (outs sperc:$RT), (ins spe8dis:$dst),
+ "evldd $RT, $dst", IIC_LdStLoad,
+ [(set f64:$RT, (load iaddr:$dst))]>;
+def EVLDDX : EVXForm_1<768, (outs sperc:$RT), (ins memrr:$src),
+ "evlddx $RT, $src", IIC_LdStLoad,
+ [(set f64:$RT, (load xaddr:$src))]>;
+def EVLDH : EVXForm_D<773, (outs sperc:$RT), (ins spe8dis:$dst),
+ "evldh $RT, $dst", IIC_LdStLoad, []>;
+def EVLDHX : EVXForm_1<772, (outs sperc:$RT), (ins memrr:$src),
+ "evldhx $RT, $src", IIC_LdStLoad, []>;
+def EVLDW : EVXForm_D<771, (outs sperc:$RT), (ins spe8dis:$dst),
+ "evldw $RT, $dst", IIC_LdStLoad,
+ []>;
+def EVLDWX : EVXForm_1<770, (outs sperc:$RT), (ins memrr:$src),
+ "evldwx $RT, $src", IIC_LdStLoad,
+ []>;
+def EVLHHESPLAT : EVXForm_D<777, (outs sperc:$RT), (ins spe2dis:$dst),
+ "evlhhesplat $RT, $dst", IIC_LdStLoad, []>;
+def EVLHHESPLATX : EVXForm_1<776, (outs sperc:$RT), (ins memrr:$src),
+ "evlhhesplatx $RT, $src", IIC_LdStLoad, []>;
+def EVLHHOUSPLAT : EVXForm_D<781, (outs sperc:$RT), (ins spe2dis:$dst),
+ "evlhhousplat $RT, $dst", IIC_LdStLoad, []>;
+def EVLHHOUSPLATX : EVXForm_1<780, (outs sperc:$RT), (ins memrr:$src),
+ "evlhhousplatx $RT, $src", IIC_LdStLoad, []>;
+def EVLHHOSSPLAT : EVXForm_D<783, (outs sperc:$RT), (ins spe2dis:$dst),
+ "evlhhossplat $RT, $dst", IIC_LdStLoad, []>;
+def EVLHHOSSPLATX : EVXForm_1<782, (outs sperc:$RT), (ins memrr:$src),
+ "evlhhossplatx $RT, $src", IIC_LdStLoad, []>;
+def EVLWHE : EVXForm_D<785, (outs sperc:$RT), (ins spe4dis:$dst),
+ "evlwhe $RT, $dst", IIC_LdStLoad, []>;
+def EVLWHEX : EVXForm_1<784, (outs sperc:$RT), (ins memrr:$src),
+ "evlwhex $RT, $src", IIC_LdStLoad, []>;
+def EVLWHOS : EVXForm_D<791, (outs sperc:$RT), (ins spe4dis:$dst),
+ "evlwhos $RT, $dst", IIC_LdStLoad, []>;
+def EVLWHOSX : EVXForm_1<790, (outs sperc:$RT), (ins memrr:$src),
+ "evlwhosx $RT, $src", IIC_LdStLoad, []>;
+def EVLWHOU : EVXForm_D<789, (outs sperc:$RT), (ins spe4dis:$dst),
+ "evlwhou $RT, $dst", IIC_LdStLoad, []>;
+def EVLWHOUX : EVXForm_1<788, (outs sperc:$RT), (ins memrr:$src),
+ "evlwhoux $RT, $src", IIC_LdStLoad, []>;
+def EVLWHSPLAT : EVXForm_D<797, (outs sperc:$RT), (ins spe4dis:$dst),
+ "evlwhsplat $RT, $dst", IIC_LdStLoad, []>;
+def EVLWHSPLATX : EVXForm_1<796, (outs sperc:$RT), (ins memrr:$src),
+ "evlwhsplatx $RT, $src", IIC_LdStLoad, []>;
+def EVLWWSPLAT : EVXForm_D<793, (outs sperc:$RT), (ins spe4dis:$dst),
+ "evlwwsplat $RT, $dst", IIC_LdStLoad, []>;
+def EVLWWSPLATX : EVXForm_1<792, (outs sperc:$RT), (ins memrr:$src),
+ "evlwwsplatx $RT, $src", IIC_LdStLoad, []>;
+
+def EVMERGEHI : EVXForm_1<556, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmergehi $RT, $RA, $RB", IIC_VecGeneral, []>;
+def EVMERGELO : EVXForm_1<557, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmergelo $RT, $RA, $RB", IIC_VecGeneral, []>;
+def EVMERGEHILO : EVXForm_1<558, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmergehilo $RT, $RA, $RB", IIC_VecGeneral, []>;
+def EVMERGELOHI : EVXForm_1<559, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmergelohi $RT, $RA, $RB", IIC_VecGeneral, []>;
+
+def EVMHEGSMFAA : EVXForm_1<1323, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhegsmfaa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHEGSMFAN : EVXForm_1<1451, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhegsmfan $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHEGSMIAA : EVXForm_1<1321, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhegsmiaa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHEGSMIAN : EVXForm_1<1449, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhegsmian $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHEGUMIAA : EVXForm_1<1320, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhegumiaa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHEGUMIAN : EVXForm_1<1448, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhegumian $RT, $RA, $RB", IIC_VecComplex, []>;
+
+def EVMHESMF : EVXForm_1<1035, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhesmf $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHESMFA : EVXForm_1<1067, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhesmfa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHESMFAAW : EVXForm_1<1291, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhesmfaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHESMFANW : EVXForm_1<1419, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhesmfanw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHESMI : EVXForm_1<1033, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhesmi $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHESMIA : EVXForm_1<1065, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhesmia $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHESMIAAW : EVXForm_1<1289, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhesmiaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHESMIANW : EVXForm_1<1417, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhesmianw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHESSF : EVXForm_1<1027, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhessf $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHESSFA : EVXForm_1<1059, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhessfa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHESSFAAW : EVXForm_1<1283, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhessfaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHESSFANW : EVXForm_1<1411, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhessfanw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHESSIAAW : EVXForm_1<1281, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhessiaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHESSIANW : EVXForm_1<1409, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhessianw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHEUMI : EVXForm_1<1032, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmheumi $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHEUMIA : EVXForm_1<1064, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmheumia $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHEUMIAAW : EVXForm_1<1288, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmheumiaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHEUMIANW : EVXForm_1<1416, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmheumianw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHEUSIAAW : EVXForm_1<1280, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmheusiaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHEUSIANW : EVXForm_1<1408, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmheusianw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOGSMFAA : EVXForm_1<1327, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhogsmfaa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOGSMFAN : EVXForm_1<1455, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhogsmfan $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOGSMIAA : EVXForm_1<1325, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhogsmiaa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOGSMIAN : EVXForm_1<1453, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhogsmian $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOGUMIAA : EVXForm_1<1324, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhogumiaa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOGUMIAN : EVXForm_1<1452, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhogumian $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOSMF : EVXForm_1<1039, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhosmf $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOSMFA : EVXForm_1<1071, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhosmfa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOSMFAAW : EVXForm_1<1295, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhosmfaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOSMFANW : EVXForm_1<1423, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhosmfanw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOSMI : EVXForm_1<1037, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhosmi $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOSMIA : EVXForm_1<1069, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhosmia $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOSMIAAW : EVXForm_1<1293, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhosmiaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOSMIANW : EVXForm_1<1421, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhosmianw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOSSF : EVXForm_1<1031, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhossf $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOSSFA : EVXForm_1<1063, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhossfa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOSSFAAW : EVXForm_1<1287, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhossfaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOSSFANW : EVXForm_1<1415, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhossfanw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOSSIAAW : EVXForm_1<1285, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhossiaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOSSIANW : EVXForm_1<1413, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhossianw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOUMI : EVXForm_1<1036, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhoumi $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOUMIA : EVXForm_1<1068, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhoumia $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOUMIAAW : EVXForm_1<1292, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhoumiaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOUMIANW : EVXForm_1<1420, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhoumianw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOUSIAAW : EVXForm_1<1284, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhousiaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOUSIANW : EVXForm_1<1412, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhousianw $RT, $RA, $RB", IIC_VecComplex, []>;
+
+def EVMRA : EVXForm_2<1220, (outs sperc:$RT), (ins sperc:$RA),
+ "evmra $RT, $RA", IIC_VecComplex, []>;
+
+def EVMWHSMF : EVXForm_1<1103, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwhsmf $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWHSMFA : EVXForm_1<1135, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwhsmfa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWHSMI : EVXForm_1<1101, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwhsmi $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWHSMIA : EVXForm_1<1133, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwhsmia $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWHSSF : EVXForm_1<1095, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwhssf $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWHSSFA : EVXForm_1<1127, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwhssfa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWHUMI : EVXForm_1<1100, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwhumi $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWHUMIA : EVXForm_1<1132, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwhumia $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWLSMIAAW : EVXForm_1<1353, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwlsmiaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWLSMIANW : EVXForm_1<1481, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwlsmianw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWLSSIAAW : EVXForm_1<1345, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwlssiaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWLSSIANW : EVXForm_1<1473, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwlssianw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWLUMI : EVXForm_1<1096, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwlumi $RT, $RA, $RB", IIC_VecComplex,
+ []>;
+def EVMWLUMIA : EVXForm_1<1128, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwlumia $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWLUMIAAW : EVXForm_1<1352, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwlumiaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWLUMIANW : EVXForm_1<1480, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwlumianw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWLUSIAAW : EVXForm_1<1344, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwlusiaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWLUSIANW : EVXForm_1<1472, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwlusianw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWSMF : EVXForm_1<1115, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwsmf $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWSMFA : EVXForm_1<1147, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwsmfa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWSMFAA : EVXForm_1<1371, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwsmfaa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWSMFAN : EVXForm_1<1499, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwsmfan $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWSMI : EVXForm_1<1113, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwsmi $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWSMIA : EVXForm_1<1145, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwsmia $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWSMIAA : EVXForm_1<1369, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwsmiaa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWSMIAN : EVXForm_1<1497, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwsmian $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWSSF : EVXForm_1<1107, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwssf $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWSSFA : EVXForm_1<1139, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwssfa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWSSFAA : EVXForm_1<1363, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwssfaa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWSSFAN : EVXForm_1<1491, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwssfan $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWUMI : EVXForm_1<1112, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwumi $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWUMIA : EVXForm_1<1144, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwumia $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWUMIAA : EVXForm_1<1368, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwumiaa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWUMIAN : EVXForm_1<1496, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwumian $RT, $RA, $RB", IIC_VecComplex, []>;
+
+
+def EVNAND : EVXForm_1<542, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evnand $RT, $RA, $RB", IIC_VecGeneral,
+ []>;
+
+def EVNEG : EVXForm_2<521, (outs sperc:$RT), (ins sperc:$RA),
+ "evneg $RT, $RA", IIC_VecGeneral,
+ []>;
+
+def EVNOR : EVXForm_1<536, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evnor $RT, $RA, $RB", IIC_VecGeneral,
+ []>;
+def EVOR : EVXForm_1<535, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evor $RT, $RA, $RB", IIC_VecGeneral,
+ []>;
+def EVORC : EVXForm_1<539, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evorc $RT, $RA, $RB", IIC_VecGeneral,
+ []>;
+
+def EVRLWI : EVXForm_1<554, (outs sperc:$RT), (ins sperc:$RA, u5imm:$RB),
+ "evrlwi $RT, $RA, $RB", IIC_VecGeneral, []>;
+def EVRLW : EVXForm_1<552, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evrlw $RT, $RA, $RB", IIC_VecGeneral,
+ []>;
+
+def EVRNDW : EVXForm_2<524, (outs sperc:$RT), (ins sperc:$RA),
+ "evrndw $RT, $RA", IIC_VecGeneral, []>;
+
+def EVSEL : EVXForm_4<79, (outs sperc:$RT),
+ (ins sperc:$RA, sperc:$RB, crrc:$crD),
+ "evsel crD,$RT,$RA,$RB", IIC_VecGeneral, []>;
+
+def EVSLWI : EVXForm_1<550, (outs sperc:$RT), (ins sperc:$RA, u5imm:$RB),
+ "evslwi $RT, $RA, $RB", IIC_VecGeneral, []>;
+def EVSLW : EVXForm_1<548, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evslw $RT, $RA, $RB", IIC_VecGeneral,
+ []>;
+
+def EVSPLATFI : EVXForm_2<555, (outs sperc:$RT), (ins s5imm:$RA),
+ "evsplatfi $RT, $RA", IIC_VecGeneral, []>;
+def EVSPLATI : EVXForm_2<553, (outs sperc:$RT), (ins s5imm:$RA),
+ "evsplati $RT, $RA", IIC_VecGeneral, []>;
+
+def EVSRWIS : EVXForm_1<547, (outs sperc:$RT), (ins sperc:$RA, u5imm:$RB),
+ "evsrwis $RT, $RA, $RB", IIC_VecGeneral, []>;
+def EVSRWIU : EVXForm_1<546, (outs sperc:$RT), (ins sperc:$RA, u5imm:$RB),
+ "evsrwiu $RT, $RA, $RB", IIC_VecGeneral, []>;
+def EVSRWS : EVXForm_1<545, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evsrws $RT, $RA, $RB", IIC_VecGeneral,
+ []>;
+def EVSRWU : EVXForm_1<544, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evsrwu $RT, $RA, $RB", IIC_VecGeneral,
+ []>;
+
+def EVSTDD : EVXForm_D<801, (outs), (ins sperc:$RT, spe8dis:$dst),
+ "evstdd $RT, $dst", IIC_LdStStore,
+ [(store f64:$RT, iaddr:$dst)]>;
+def EVSTDDX : EVXForm_1<800, (outs), (ins sperc:$RT, memrr:$dst),
+ "evstddx $RT, $dst", IIC_LdStStore,
+ [(store f64:$RT, xaddr:$dst)]>;
+def EVSTDH : EVXForm_D<805, (outs), (ins sperc:$RT, spe8dis:$dst),
+ "evstdh $RT, $dst", IIC_LdStStore, []>;
+def EVSTDHX : EVXForm_1<804, (outs), (ins sperc:$RT, memrr:$dst),
+ "evstdhx $RT, $dst", IIC_LdStStore, []>;
+def EVSTDW : EVXForm_D<803, (outs), (ins sperc:$RT, spe8dis:$dst),
+ "evstdw $RT, $dst", IIC_LdStStore,
+ []>;
+def EVSTDWX : EVXForm_1<802, (outs), (ins sperc:$RT, memrr:$dst),
+ "evstdwx $RT, $dst", IIC_LdStStore,
+ []>;
+def EVSTWHE : EVXForm_D<817, (outs), (ins sperc:$RT, spe4dis:$dst),
+ "evstwhe $RT, $dst", IIC_LdStStore, []>;
+def EVSTWHEX : EVXForm_1<816, (outs), (ins sperc:$RT, memrr:$dst),
+ "evstwhex $RT, $dst", IIC_LdStStore, []>;
+def EVSTWHO : EVXForm_D<821, (outs), (ins sperc:$RT, spe4dis:$dst),
+ "evstwho $RT, $dst", IIC_LdStStore, []>;
+def EVSTWHOX : EVXForm_1<820, (outs), (ins sperc:$RT, memrr:$dst),
+ "evstwhox $RT, $dst", IIC_LdStStore, []>;
+def EVSTWWE : EVXForm_D<825, (outs), (ins sperc:$RT, spe4dis:$dst),
+ "evstwwe $RT, $dst", IIC_LdStStore, []>;
+def EVSTWWEX : EVXForm_1<824, (outs), (ins sperc:$RT, memrr:$dst),
+ "evstwwex $RT, $dst", IIC_LdStStore, []>;
+def EVSTWWO : EVXForm_D<829, (outs), (ins sperc:$RT, spe4dis:$dst),
+ "evstwwo $RT, $dst", IIC_LdStStore, []>;
+def EVSTWWOX : EVXForm_1<828, (outs), (ins sperc:$RT, memrr:$dst),
+ "evstwwox $RT, $dst", IIC_LdStStore, []>;
+
+def EVSUBFSSIAAW : EVXForm_2<1219, (outs sperc:$RT), (ins sperc:$RA),
+ "evsubfssiaaw $RT, $RA", IIC_VecComplex, []>;
+def EVSUBFSMIAAW : EVXForm_2<1227, (outs sperc:$RT), (ins sperc:$RA),
+ "evsubfsmiaaw $RT, $RA", IIC_VecComplex, []>;
+def EVSUBFUMIAAW : EVXForm_2<1226, (outs sperc:$RT), (ins sperc:$RA),
+ "evsubfumiaaw $RT, $RA", IIC_VecComplex, []>;
+def EVSUBFUSIAAW : EVXForm_2<1218, (outs sperc:$RT), (ins sperc:$RA),
+ "evsubfusiaaw $RT, $RA", IIC_VecComplex, []>;
+def EVSUBFW : EVXForm_1<516, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evsubfw $RT, $RA, $RB", IIC_VecGeneral,
+ []>;
+def EVSUBIFW : EVXForm_1<518, (outs sperc:$RT), (ins u5imm:$RA, sperc:$RB),
+ "evsubifw $RT, $RA, $RB", IIC_VecGeneral, []>;
+def EVXOR : EVXForm_1<534, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evxor $RT, $RA, $RB", IIC_VecGeneral,
+ []>;
+
+let isAsmParserOnly = 1 in {
+// Identical to the integer Load/Stores, but to handle floats
+def SPELWZ : DForm_1<32, (outs spe4rc:$rD), (ins memri:$src),
+ "lwz $rD, $src", IIC_LdStLoad,
+ [(set f32:$rD, (load iaddr:$src))]>;
+def SPELWZX : XForm_1<31, 23, (outs spe4rc:$rD), (ins memrr:$src),
+ "lwzx $rD, $src", IIC_LdStLoad,
+ [(set f32:$rD, (load xaddr:$src))]>;
+def SPESTW : DForm_1<36, (outs), (ins spe4rc:$rS, memri:$src),
+ "stw $rS, $src", IIC_LdStStore,
+ [(store f32:$rS, iaddr:$src)]>;
+def SPESTWX : XForm_8<31, 151, (outs), (ins spe4rc:$rS, memrr:$dst),
+ "stwx $rS, $dst", IIC_LdStStore,
+ [(store f32:$rS, xaddr:$dst)]>;
+}
} // HasSPE
+
+let Predicates = [HasSPE] in {
+def : Pat<(f64 (extloadf32 iaddr:$src)),
+ (COPY_TO_REGCLASS (SPELWZ iaddr:$src), SPERC)>;
+def : Pat<(f64 (extloadf32 xaddr:$src)),
+ (COPY_TO_REGCLASS (SPELWZX xaddr:$src), SPERC)>;
+
+def : Pat<(f64 (fpextend f32:$src)),
+ (COPY_TO_REGCLASS $src, SPERC)>;
+}
+
+let Predicates = [HasSPE] in {
+ let usesCustomInserter = 1 in {
+def SELECT_CC_SPE4 : Pseudo<(outs spe4rc:$dst),
+ (ins crrc:$cond, spe4rc:$T, spe4rc:$F,
+ i32imm:$BROPC), "#SELECT_CC_SPE4",
+ []>;
+def SELECT_CC_SPE : Pseudo<(outs sperc:$dst),
+ (ins crrc:$cond, sperc:$T, sperc:$F, i32imm:$BROPC),
+ "#SELECT_CC_SPE",
+ []>;
+def SELECT_SPE4 : Pseudo<(outs spe4rc:$dst), (ins crbitrc:$cond,
+ spe4rc:$T, spe4rc:$F), "#SELECT_SPE4",
+ [(set f32:$dst, (select i1:$cond, f32:$T, f32:$F))]>;
+def SELECT_SPE : Pseudo<(outs sperc:$dst), (ins crbitrc:$cond,
+ sperc:$T, sperc:$F), "#SELECT_SPE",
+ [(set f64:$dst, (select i1:$cond, f64:$T, f64:$F))]>;
+ }
+
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLT)),
+ (SELECT_SPE4 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETULT)),
+ (SELECT_SPE4 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLE)),
+ (SELECT_SPE4 (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETULE)),
+ (SELECT_SPE4 (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETEQ)),
+ (SELECT_SPE4 (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGE)),
+ (SELECT_SPE4 (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETUGE)),
+ (SELECT_SPE4 (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGT)),
+ (SELECT_SPE4 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETUGT)),
+ (SELECT_SPE4 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETNE)),
+ (SELECT_SPE4 (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLT)),
+ (SELECT_SPE (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETULT)),
+ (SELECT_SPE (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLE)),
+ (SELECT_SPE (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETULE)),
+ (SELECT_SPE (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETEQ)),
+ (SELECT_SPE (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGE)),
+ (SELECT_SPE (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETUGE)),
+ (SELECT_SPE (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGT)),
+ (SELECT_SPE (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETUGT)),
+ (SELECT_SPE (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETNE)),
+ (SELECT_SPE (CRXOR $lhs, $rhs), $tval, $fval)>;
+}
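
For readers not fluent in TableGen, every EFX/EVX definition added above follows the same shape. The commented sketch below is modeled on EFDADD and reuses only fields that already appear in this patch; it is illustrative rather than part of the change, and the name EFD_EXAMPLE is hypothetical, used purely to label the sketch.

// Illustrative sketch only, not part of the patch; mirrors EFDADD above.
def EFD_EXAMPLE : EFXForm_1<736,                    // minor opcode within the EFX group
                    (outs sperc:$RT),               // result: a 64-bit SPE register
                    (ins sperc:$RA, sperc:$RB),     // two 64-bit SPE register sources
                    "efdadd $RT, $RA, $RB",         // assembly syntax printed/parsed for this def
                    IIC_FPAddSub,                   // scheduling itinerary class
                    [(set f64:$RT,                  // ISel pattern: select an f64 fadd
                          (fadd f64:$RA, f64:$RB))]>; // of the two inputs to this instruction

The same decomposition applies to the EVX vector forms; definitions whose pattern list is empty ([]) remain available to the assembler and to explicit lowering code, but are not matched by generic instruction selection until a pattern is supplied.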
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index 6f719784eb7c..ffba0e5aadb5 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -126,29 +126,29 @@ let Uses = [RM] in {
// Load indexed instructions
let mayLoad = 1, mayStore = 0 in {
let CodeSize = 3 in
- def LXSDX : XX1Form<31, 588,
+ def LXSDX : XX1Form_memOp<31, 588,
(outs vsfrc:$XT), (ins memrr:$src),
"lxsdx $XT, $src", IIC_LdStLFD,
- [(set f64:$XT, (load xoaddr:$src))]>;
+ []>;
// Pseudo instruction XFLOADf64 will be expanded to LXSDX or LFDX later
let isPseudo = 1, CodeSize = 3 in
- def XFLOADf64 : Pseudo<(outs vsfrc:$XT), (ins memrr:$src),
+ def XFLOADf64 : PseudoXFormMemOp<(outs vsfrc:$XT), (ins memrr:$src),
"#XFLOADf64",
[(set f64:$XT, (load xoaddr:$src))]>;
let Predicates = [HasVSX, HasOnlySwappingMemOps] in
- def LXVD2X : XX1Form<31, 844,
+ def LXVD2X : XX1Form_memOp<31, 844,
(outs vsrc:$XT), (ins memrr:$src),
"lxvd2x $XT, $src", IIC_LdStLFD,
[(set v2f64:$XT, (int_ppc_vsx_lxvd2x xoaddr:$src))]>;
- def LXVDSX : XX1Form<31, 332,
+ def LXVDSX : XX1Form_memOp<31, 332,
(outs vsrc:$XT), (ins memrr:$src),
"lxvdsx $XT, $src", IIC_LdStLFD, []>;
let Predicates = [HasVSX, HasOnlySwappingMemOps] in
- def LXVW4X : XX1Form<31, 780,
+ def LXVW4X : XX1Form_memOp<31, 780,
(outs vsrc:$XT), (ins memrr:$src),
"lxvw4x $XT, $src", IIC_LdStLFD,
[]>;
@@ -157,26 +157,26 @@ let Uses = [RM] in {
// Store indexed instructions
let mayStore = 1, mayLoad = 0 in {
let CodeSize = 3 in
- def STXSDX : XX1Form<31, 716,
+ def STXSDX : XX1Form_memOp<31, 716,
(outs), (ins vsfrc:$XT, memrr:$dst),
"stxsdx $XT, $dst", IIC_LdStSTFD,
- [(store f64:$XT, xoaddr:$dst)]>;
+ []>;
// Pseudo instruction XFSTOREf64 will be expanded to STXSDX or STFDX later
let isPseudo = 1, CodeSize = 3 in
- def XFSTOREf64 : Pseudo<(outs), (ins vsfrc:$XT, memrr:$dst),
+ def XFSTOREf64 : PseudoXFormMemOp<(outs), (ins vsfrc:$XT, memrr:$dst),
"#XFSTOREf64",
[(store f64:$XT, xoaddr:$dst)]>;
let Predicates = [HasVSX, HasOnlySwappingMemOps] in {
// The behaviour of this instruction is endianness-specific so we provide no
// pattern to match it without considering endianness.
- def STXVD2X : XX1Form<31, 972,
+ def STXVD2X : XX1Form_memOp<31, 972,
(outs), (ins vsrc:$XT, memrr:$dst),
"stxvd2x $XT, $dst", IIC_LdStSTFD,
[]>;
- def STXVW4X : XX1Form<31, 908,
+ def STXVW4X : XX1Form_memOp<31, 908,
(outs), (ins vsrc:$XT, memrr:$dst),
"stxvw4x $XT, $dst", IIC_LdStSTFD,
[]>;
@@ -1200,6 +1200,7 @@ def ScalarLoads {
*/
def HasP8Vector : Predicate<"PPCSubTarget->hasP8Vector()">;
def HasDirectMove : Predicate<"PPCSubTarget->hasDirectMove()">;
+def NoP9Vector : Predicate<"!PPCSubTarget->hasP9Vector()">;
let Predicates = [HasP8Vector] in {
let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
let isCommutable = 1, UseVSXReg = 1 in {
@@ -1226,11 +1227,11 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
// VSX scalar loads introduced in ISA 2.07
let mayLoad = 1, mayStore = 0 in {
let CodeSize = 3 in
- def LXSSPX : XX1Form<31, 524, (outs vssrc:$XT), (ins memrr:$src),
+ def LXSSPX : XX1Form_memOp<31, 524, (outs vssrc:$XT), (ins memrr:$src),
"lxsspx $XT, $src", IIC_LdStLFD, []>;
- def LXSIWAX : XX1Form<31, 76, (outs vsfrc:$XT), (ins memrr:$src),
+ def LXSIWAX : XX1Form_memOp<31, 76, (outs vsfrc:$XT), (ins memrr:$src),
"lxsiwax $XT, $src", IIC_LdStLFD, []>;
- def LXSIWZX : XX1Form<31, 12, (outs vsfrc:$XT), (ins memrr:$src),
+ def LXSIWZX : XX1Form_memOp<31, 12, (outs vsfrc:$XT), (ins memrr:$src),
"lxsiwzx $XT, $src", IIC_LdStLFD, []>;
// Please note let isPseudo = 1 is not part of class Pseudo<>. Missing it
@@ -1238,15 +1239,15 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
let isPseudo = 1 in {
// Pseudo instruction XFLOADf32 will be expanded to LXSSPX or LFSX later
let CodeSize = 3 in
- def XFLOADf32 : Pseudo<(outs vssrc:$XT), (ins memrr:$src),
+ def XFLOADf32 : PseudoXFormMemOp<(outs vssrc:$XT), (ins memrr:$src),
"#XFLOADf32",
[(set f32:$XT, (load xoaddr:$src))]>;
// Pseudo instruction LIWAX will be expanded to LXSIWAX or LFIWAX later
- def LIWAX : Pseudo<(outs vsfrc:$XT), (ins memrr:$src),
+ def LIWAX : PseudoXFormMemOp<(outs vsfrc:$XT), (ins memrr:$src),
"#LIWAX",
[(set f64:$XT, (PPClfiwax xoaddr:$src))]>;
// Pseudo instruction LIWZX will be expanded to LXSIWZX or LFIWZX later
- def LIWZX : Pseudo<(outs vsfrc:$XT), (ins memrr:$src),
+ def LIWZX : PseudoXFormMemOp<(outs vsfrc:$XT), (ins memrr:$src),
"#LIWZX",
[(set f64:$XT, (PPClfiwzx xoaddr:$src))]>;
}
@@ -1255,9 +1256,9 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
// VSX scalar stores introduced in ISA 2.07
let mayStore = 1, mayLoad = 0 in {
let CodeSize = 3 in
- def STXSSPX : XX1Form<31, 652, (outs), (ins vssrc:$XT, memrr:$dst),
+ def STXSSPX : XX1Form_memOp<31, 652, (outs), (ins vssrc:$XT, memrr:$dst),
"stxsspx $XT, $dst", IIC_LdStSTFD, []>;
- def STXSIWX : XX1Form<31, 140, (outs), (ins vsfrc:$XT, memrr:$dst),
+ def STXSIWX : XX1Form_memOp<31, 140, (outs), (ins vsfrc:$XT, memrr:$dst),
"stxsiwx $XT, $dst", IIC_LdStSTFD, []>;
// Please note let isPseudo = 1 is not part of class Pseudo<>. Missing it
@@ -1265,11 +1266,11 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
let isPseudo = 1 in {
// Pseudo instruction XFSTOREf32 will be expanded to STXSSPX or STFSX later
let CodeSize = 3 in
- def XFSTOREf32 : Pseudo<(outs), (ins vssrc:$XT, memrr:$dst),
+ def XFSTOREf32 : PseudoXFormMemOp<(outs), (ins vssrc:$XT, memrr:$dst),
"#XFSTOREf32",
[(store f32:$XT, xoaddr:$dst)]>;
// Pseudo instruction STIWX will be expanded to STXSIWX or STFIWX later
- def STIWX : Pseudo<(outs), (ins vsfrc:$XT, memrr:$dst),
+ def STIWX : PseudoXFormMemOp<(outs), (ins vsfrc:$XT, memrr:$dst),
"#STIWX",
[(PPCstfiwx f64:$XT, xoaddr:$dst)]>;
}
@@ -1278,7 +1279,7 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
def : Pat<(f64 (extloadf32 xoaddr:$src)),
(COPY_TO_REGCLASS (XFLOADf32 xoaddr:$src), VSFRC)>;
- def : Pat<(f32 (fpround (extloadf32 xoaddr:$src))),
+ def : Pat<(f32 (fpround (f64 (extloadf32 xoaddr:$src)))),
(f32 (XFLOADf32 xoaddr:$src))>;
def : Pat<(f64 (fpextend f32:$src)),
(COPY_TO_REGCLASS $src, VSFRC)>;
@@ -1325,6 +1326,9 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
(outs vssrc:$XT), (ins vssrc:$XB),
"xsresp $XT, $XB", IIC_VecFP,
[(set f32:$XT, (PPCfre f32:$XB))]>;
+ def XSRSP : XX2Form<60, 281,
+ (outs vssrc:$XT), (ins vsfrc:$XB),
+ "xsrsp $XT, $XB", IIC_VecFP, []>;
def XSSQRTSP : XX2Form<60, 11,
(outs vssrc:$XT), (ins vssrc:$XB),
"xssqrtsp $XT, $XB", IIC_FPSqrtS,
@@ -1432,28 +1436,57 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
} // UseVSXReg = 1
let Predicates = [IsLittleEndian] in {
- def : Pat<(f32 (PPCfcfids (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+ def : Pat<(f32 (PPCfcfids
+ (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S, 0)))))),
(f32 (XSCVSXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
- def : Pat<(f32 (PPCfcfids (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
- (f32 (XSCVSXDSP (COPY_TO_REGCLASS (f64 (COPY_TO_REGCLASS $S, VSRC)), VSFRC)))>;
- def : Pat<(f32 (PPCfcfidus (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+ def : Pat<(f32 (PPCfcfids
+ (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S, 1)))))),
+ (f32 (XSCVSXDSP (COPY_TO_REGCLASS
+ (f64 (COPY_TO_REGCLASS $S, VSRC)), VSFRC)))>;
+ def : Pat<(f32 (PPCfcfidus
+ (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S, 0)))))),
(f32 (XSCVUXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
- def : Pat<(f32 (PPCfcfidus (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
- (f32 (XSCVUXDSP (COPY_TO_REGCLASS (f64 (COPY_TO_REGCLASS $S, VSRC)), VSFRC)))>;
+ def : Pat<(f32 (PPCfcfidus
+ (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S, 1)))))),
+ (f32 (XSCVUXDSP (COPY_TO_REGCLASS
+ (f64 (COPY_TO_REGCLASS $S, VSRC)), VSFRC)))>;
}
let Predicates = [IsBigEndian] in {
- def : Pat<(f32 (PPCfcfids (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+ def : Pat<(f32 (PPCfcfids
+ (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S, 0)))))),
(f32 (XSCVSXDSP (COPY_TO_REGCLASS $S, VSFRC)))>;
- def : Pat<(f32 (PPCfcfids (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+ def : Pat<(f32 (PPCfcfids
+ (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S, 1)))))),
(f32 (XSCVSXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
- def : Pat<(f32 (PPCfcfidus (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+ def : Pat<(f32 (PPCfcfidus
+ (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S, 0)))))),
(f32 (XSCVUXDSP (COPY_TO_REGCLASS $S, VSFRC)))>;
- def : Pat<(f32 (PPCfcfidus (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+ def : Pat<(f32 (PPCfcfidus
+ (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S, 1)))))),
(f32 (XSCVUXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
}
def : Pat<(v4i32 (scalar_to_vector ScalarLoads.Li32)),
(v4i32 (XXSPLTWs (LIWAX xoaddr:$src), 1))>;
+
+ // Instructions for converting float to i64 feeding a store.
+ let Predicates = [NoP9Vector] in {
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xoaddr:$dst, 8),
+ (STXSDX (XSCVDPSXDS f64:$src), xoaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xoaddr:$dst, 8),
+ (STXSDX (XSCVDPUXDS f64:$src), xoaddr:$dst)>;
+ }
+
+ // Instructions for converting float to i32 feeding a store.
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xoaddr:$dst, 4),
+ (STIWX (XSCVDPSXWS f64:$src), xoaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xoaddr:$dst, 4),
+ (STIWX (XSCVDPUXWS f64:$src), xoaddr:$dst)>;
+
} // AddedComplexity = 400
} // HasP8Vector
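
As a hedged aside on the store-conversion patterns added just above (all node and instruction names are taken verbatim from that hunk), each Pat<> pairs a source dag to match against the selection DAG with an output dag to emit; re-read with comments:

// Illustration only: duplicates the first pattern from the hunk above for annotation.
def : Pat<(PPCstore_scal_int_from_vsr                   // match: store of a scalar integer held in a VSR
              (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), // ...whose value is $src converted to signed int in a VSR
              xoaddr:$dst, 8),                          // ...to an x-form address, 8 bytes wide
          (STXSDX (XSCVDPSXDS f64:$src), xoaddr:$dst)>; // emit: convert, then store the converted doubleword

The i64-width patterns sit under the NoP9Vector guard, which restricts them to subtargets without Power9 vector support, presumably so that Power9 subtargets can select their own forms instead.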
@@ -1614,11 +1647,11 @@ def VectorExtractions {
This is accomplished by inverting the bits of the index and AND-ing
with 0x8 (i.e. clearing all bits of the index and inverting bit 60).
*/
- dag LE_VBYTE_PERM_VEC = (LVSL ZERO8, (ANDC8 (LI8 8), $Idx));
+ dag LE_VBYTE_PERM_VEC = (v16i8 (LVSL ZERO8, (ANDC8 (LI8 8), $Idx)));
// Number 2. above:
// - Now that we set up the shift amount, we shift in the VMX register
- dag LE_VBYTE_PERMUTE = (VPERM $S, $S, LE_VBYTE_PERM_VEC);
+ dag LE_VBYTE_PERMUTE = (v16i8 (VPERM $S, $S, LE_VBYTE_PERM_VEC));
// Number 3. above:
// - The doubleword containing our element is moved to a GPR
@@ -1646,11 +1679,12 @@ def VectorExtractions {
AND with 0x4 (i.e. clear all bits of the index and invert bit 61).
Of course, the shift is still by 8 bytes, so we must multiply by 2.
*/
- dag LE_VHALF_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDC8 (LI8 4), $Idx), 1, 62));
+ dag LE_VHALF_PERM_VEC =
+ (v16i8 (LVSL ZERO8, (RLDICR (ANDC8 (LI8 4), $Idx), 1, 62)));
// Number 2. above:
// - Now that we set up the shift amount, we shift in the VMX register
- dag LE_VHALF_PERMUTE = (VPERM $S, $S, LE_VHALF_PERM_VEC);
+ dag LE_VHALF_PERMUTE = (v16i8 (VPERM $S, $S, LE_VHALF_PERM_VEC));
// Number 3. above:
// - The doubleword containing our element is moved to a GPR
@@ -1675,11 +1709,12 @@ def VectorExtractions {
- For elements 0-1, we shift left by 8 since they're on the right
- For elements 2-3, we need not shift
*/
- dag LE_VWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDC8 (LI8 2), $Idx), 2, 61));
+ dag LE_VWORD_PERM_VEC = (v16i8 (LVSL ZERO8,
+ (RLDICR (ANDC8 (LI8 2), $Idx), 2, 61)));
// Number 2. above:
// - Now that we set up the shift amount, we shift in the VMX register
- dag LE_VWORD_PERMUTE = (VPERM $S, $S, LE_VWORD_PERM_VEC);
+ dag LE_VWORD_PERMUTE = (v16i8 (VPERM $S, $S, LE_VWORD_PERM_VEC));
// Number 3. above:
// - The doubleword containing our element is moved to a GPR
@@ -1704,11 +1739,12 @@ def VectorExtractions {
- For element 0, we shift left by 8 since it's on the right
- For element 1, we need not shift
*/
- dag LE_VDWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDC8 (LI8 1), $Idx), 3, 60));
+ dag LE_VDWORD_PERM_VEC = (v16i8 (LVSL ZERO8,
+ (RLDICR (ANDC8 (LI8 1), $Idx), 3, 60)));
// Number 2. above:
// - Now that we set up the shift amount, we shift in the VMX register
- dag LE_VDWORD_PERMUTE = (VPERM $S, $S, LE_VDWORD_PERM_VEC);
+ dag LE_VDWORD_PERMUTE = (v16i8 (VPERM $S, $S, LE_VDWORD_PERM_VEC));
// Number 3. above:
// - The doubleword containing our element is moved to a GPR
@@ -1722,16 +1758,17 @@ def VectorExtractions {
- Shift the vector to line up the desired element to BE Word 0
- Convert 32-bit float to a 64-bit single precision float
*/
- dag LE_VFLOAT_PERM_VEC = (LVSL ZERO8, (RLDICR (XOR8 (LI8 3), $Idx), 2, 61));
+ dag LE_VFLOAT_PERM_VEC = (v16i8 (LVSL ZERO8,
+ (RLDICR (XOR8 (LI8 3), $Idx), 2, 61)));
dag LE_VFLOAT_PERMUTE = (VPERM $S, $S, LE_VFLOAT_PERM_VEC);
dag LE_VARIABLE_FLOAT = (XSCVSPDPN LE_VFLOAT_PERMUTE);
/* LE variable double
Same as the LE doubleword except there is no move.
*/
- dag LE_VDOUBLE_PERMUTE = (VPERM (COPY_TO_REGCLASS $S, VRRC),
- (COPY_TO_REGCLASS $S, VRRC),
- LE_VDWORD_PERM_VEC);
+ dag LE_VDOUBLE_PERMUTE = (v16i8 (VPERM (v16i8 (COPY_TO_REGCLASS $S, VRRC)),
+ (v16i8 (COPY_TO_REGCLASS $S, VRRC)),
+ LE_VDWORD_PERM_VEC));
dag LE_VARIABLE_DOUBLE = (COPY_TO_REGCLASS LE_VDOUBLE_PERMUTE, VSRC);
/* BE variable byte
@@ -1741,8 +1778,8 @@ def VectorExtractions {
- The order of elements after the move to GPR is reversed, so we invert
the bits of the index prior to truncating to the range 0-7
*/
- dag BE_VBYTE_PERM_VEC = (LVSL ZERO8, (ANDIo8 $Idx, 8));
- dag BE_VBYTE_PERMUTE = (VPERM $S, $S, BE_VBYTE_PERM_VEC);
+ dag BE_VBYTE_PERM_VEC = (v16i8 (LVSL ZERO8, (ANDIo8 $Idx, 8)));
+ dag BE_VBYTE_PERMUTE = (v16i8 (VPERM $S, $S, BE_VBYTE_PERM_VEC));
dag BE_MV_VBYTE = (MFVSRD
(EXTRACT_SUBREG
(v2i64 (COPY_TO_REGCLASS BE_VBYTE_PERMUTE, VSRC)),
@@ -1759,8 +1796,9 @@ def VectorExtractions {
- The order of elements after the move to GPR is reversed, so we invert
the bits of the index prior to truncating to the range 0-3
*/
- dag BE_VHALF_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 4), 1, 62));
- dag BE_VHALF_PERMUTE = (VPERM $S, $S, BE_VHALF_PERM_VEC);
+ dag BE_VHALF_PERM_VEC = (v16i8 (LVSL ZERO8,
+ (RLDICR (ANDIo8 $Idx, 4), 1, 62)));
+ dag BE_VHALF_PERMUTE = (v16i8 (VPERM $S, $S, BE_VHALF_PERM_VEC));
dag BE_MV_VHALF = (MFVSRD
(EXTRACT_SUBREG
(v2i64 (COPY_TO_REGCLASS BE_VHALF_PERMUTE, VSRC)),
@@ -1776,8 +1814,9 @@ def VectorExtractions {
- The order of elements after the move to GPR is reversed, so we invert
the bits of the index prior to truncating to the range 0-1
*/
- dag BE_VWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 2), 2, 61));
- dag BE_VWORD_PERMUTE = (VPERM $S, $S, BE_VWORD_PERM_VEC);
+ dag BE_VWORD_PERM_VEC = (v16i8 (LVSL ZERO8,
+ (RLDICR (ANDIo8 $Idx, 2), 2, 61)));
+ dag BE_VWORD_PERMUTE = (v16i8 (VPERM $S, $S, BE_VWORD_PERM_VEC));
dag BE_MV_VWORD = (MFVSRD
(EXTRACT_SUBREG
(v2i64 (COPY_TO_REGCLASS BE_VWORD_PERMUTE, VSRC)),
@@ -1791,8 +1830,9 @@ def VectorExtractions {
Same as the LE doubleword except we shift in the VMX register for opposite
element indices.
*/
- dag BE_VDWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 1), 3, 60));
- dag BE_VDWORD_PERMUTE = (VPERM $S, $S, BE_VDWORD_PERM_VEC);
+ dag BE_VDWORD_PERM_VEC = (v16i8 (LVSL ZERO8,
+ (RLDICR (ANDIo8 $Idx, 1), 3, 60)));
+ dag BE_VDWORD_PERMUTE = (v16i8 (VPERM $S, $S, BE_VDWORD_PERM_VEC));
dag BE_VARIABLE_DWORD =
(MFVSRD (EXTRACT_SUBREG
(v2i64 (COPY_TO_REGCLASS BE_VDWORD_PERMUTE, VSRC)),
@@ -1802,16 +1842,16 @@ def VectorExtractions {
- Shift the vector to line up the desired element to BE Word 0
- Convert 32-bit float to a 64-bit single precision float
*/
- dag BE_VFLOAT_PERM_VEC = (LVSL ZERO8, (RLDICR $Idx, 2, 61));
+ dag BE_VFLOAT_PERM_VEC = (v16i8 (LVSL ZERO8, (RLDICR $Idx, 2, 61)));
dag BE_VFLOAT_PERMUTE = (VPERM $S, $S, BE_VFLOAT_PERM_VEC);
dag BE_VARIABLE_FLOAT = (XSCVSPDPN BE_VFLOAT_PERMUTE);
/* BE variable double
Same as the BE doubleword except there is no move.
*/
- dag BE_VDOUBLE_PERMUTE = (VPERM (COPY_TO_REGCLASS $S, VRRC),
- (COPY_TO_REGCLASS $S, VRRC),
- BE_VDWORD_PERM_VEC);
+ dag BE_VDOUBLE_PERMUTE = (v16i8 (VPERM (v16i8 (COPY_TO_REGCLASS $S, VRRC)),
+ (v16i8 (COPY_TO_REGCLASS $S, VRRC)),
+ BE_VDWORD_PERM_VEC));
dag BE_VARIABLE_DOUBLE = (COPY_TO_REGCLASS BE_VDOUBLE_PERMUTE, VSRC);
}
@@ -2282,7 +2322,7 @@ let Predicates = [HasDirectMove, HasVSX] in {
// (convert to 32-bit fp single, shift right 1 word, move to GPR)
def : Pat<(i32 (bitconvert f32:$S)),
(i32 (MFVSRWZ (EXTRACT_SUBREG
- (XXSLDWI (XSCVDPSPN $S),(XSCVDPSPN $S), 3),
+ (XXSLDWI (XSCVDPSPN $S), (XSCVDPSPN $S), 3),
sub_64)))>;
// bitconvert i32 -> f32
// (move to FPR, shift left 1 word, convert to 64-bit fp single)
@@ -2333,6 +2373,17 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
: X_RD5_XO5_RS5<opcode, xo2, xo, (outs vrrc:$vT), (ins vbtype:$vB),
!strconcat(opc, " $vT, $vB"), IIC_VecFP, pattern>;
+ // [PO VRT XO VRB XO /]
+ class X_VT5_XO5_VB5_VSFR<bits<6> opcode, bits<5> xo2, bits<10> xo, string opc,
+ list<dag> pattern>
+ : X_RD5_XO5_RS5<opcode, xo2, xo, (outs vfrc:$vT), (ins vrrc:$vB),
+ !strconcat(opc, " $vT, $vB"), IIC_VecFP, pattern>;
+
+ // [PO VRT XO VRB XO RO], Round to Odd version of [PO VRT XO VRB XO /]
+ class X_VT5_XO5_VB5_VSFR_Ro<bits<6> opcode, bits<5> xo2, bits<10> xo, string opc,
+ list<dag> pattern>
+ : X_VT5_XO5_VB5_VSFR<opcode, xo2, xo, opc, pattern>, isDOT;
+
let UseVSXReg = 1 in {
// [PO T XO B XO BX /]
class XX2_RT5_XO5_XB6<bits<6> opcode, bits<5> xo2, bits<9> xo, string opc,
@@ -2365,43 +2416,112 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
list<dag> pattern>
: X_VT5_VA5_VB5<opcode, xo, opc, pattern>, isDOT;
+ // [PO VRT VRA VRB XO /]
+ class X_VT5_VA5_VB5_FMA<bits<6> opcode, bits<10> xo, string opc,
+ list<dag> pattern>
+ : XForm_1<opcode, xo, (outs vrrc:$vT), (ins vrrc:$vTi, vrrc:$vA, vrrc:$vB),
+ !strconcat(opc, " $vT, $vA, $vB"), IIC_VecFP, pattern>,
+ RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">;
+
+ // [PO VRT VRA VRB XO RO], Round to Odd version of [PO VRT VRA VRB XO /]
+ class X_VT5_VA5_VB5_FMA_Ro<bits<6> opcode, bits<10> xo, string opc,
+ list<dag> pattern>
+ : X_VT5_VA5_VB5_FMA<opcode, xo, opc, pattern>, isDOT;
+
//===--------------------------------------------------------------------===//
// Quad-Precision Scalar Move Instructions:
// Copy Sign
- def XSCPSGNQP : X_VT5_VA5_VB5<63, 100, "xscpsgnqp", []>;
+ def XSCPSGNQP : X_VT5_VA5_VB5<63, 100, "xscpsgnqp",
+ [(set f128:$vT,
+ (fcopysign f128:$vB, f128:$vA))]>;
// Absolute/Negative-Absolute/Negate
- def XSABSQP : X_VT5_XO5_VB5<63, 0, 804, "xsabsqp" , []>;
- def XSNABSQP : X_VT5_XO5_VB5<63, 8, 804, "xsnabsqp", []>;
- def XSNEGQP : X_VT5_XO5_VB5<63, 16, 804, "xsnegqp" , []>;
+ def XSABSQP : X_VT5_XO5_VB5<63, 0, 804, "xsabsqp",
+ [(set f128:$vT, (fabs f128:$vB))]>;
+ def XSNABSQP : X_VT5_XO5_VB5<63, 8, 804, "xsnabsqp",
+ [(set f128:$vT, (fneg (fabs f128:$vB)))]>;
+ def XSNEGQP : X_VT5_XO5_VB5<63, 16, 804, "xsnegqp",
+ [(set f128:$vT, (fneg f128:$vB))]>;
//===--------------------------------------------------------------------===//
// Quad-Precision Scalar Floating-Point Arithmetic Instructions:
// Add/Divide/Multiply/Subtract
- def XSADDQP : X_VT5_VA5_VB5 <63, 4, "xsaddqp" , []>;
- def XSADDQPO : X_VT5_VA5_VB5_Ro<63, 4, "xsaddqpo", []>;
- def XSDIVQP : X_VT5_VA5_VB5 <63, 548, "xsdivqp" , []>;
- def XSDIVQPO : X_VT5_VA5_VB5_Ro<63, 548, "xsdivqpo", []>;
- def XSMULQP : X_VT5_VA5_VB5 <63, 36, "xsmulqp" , []>;
- def XSMULQPO : X_VT5_VA5_VB5_Ro<63, 36, "xsmulqpo", []>;
- def XSSUBQP : X_VT5_VA5_VB5 <63, 516, "xssubqp" , []>;
- def XSSUBQPO : X_VT5_VA5_VB5_Ro<63, 516, "xssubqpo", []>;
+ let isCommutable = 1 in {
+ def XSADDQP : X_VT5_VA5_VB5 <63, 4, "xsaddqp",
+ [(set f128:$vT, (fadd f128:$vA, f128:$vB))]>;
+ def XSADDQPO : X_VT5_VA5_VB5_Ro<63, 4, "xsaddqpo",
+ [(set f128:$vT,
+ (int_ppc_addf128_round_to_odd
+ f128:$vA, f128:$vB))]>;
+ def XSMULQP : X_VT5_VA5_VB5 <63, 36, "xsmulqp",
+ [(set f128:$vT, (fmul f128:$vA, f128:$vB))]>;
+ def XSMULQPO : X_VT5_VA5_VB5_Ro<63, 36, "xsmulqpo",
+ [(set f128:$vT,
+ (int_ppc_mulf128_round_to_odd
+ f128:$vA, f128:$vB))]>;
+ }
+
+ def XSSUBQP : X_VT5_VA5_VB5 <63, 516, "xssubqp" ,
+ [(set f128:$vT, (fsub f128:$vA, f128:$vB))]>;
+ def XSSUBQPO : X_VT5_VA5_VB5_Ro<63, 516, "xssubqpo",
+ [(set f128:$vT,
+ (int_ppc_subf128_round_to_odd
+ f128:$vA, f128:$vB))]>;
+ def XSDIVQP : X_VT5_VA5_VB5 <63, 548, "xsdivqp",
+ [(set f128:$vT, (fdiv f128:$vA, f128:$vB))]>;
+ def XSDIVQPO : X_VT5_VA5_VB5_Ro<63, 548, "xsdivqpo",
+ [(set f128:$vT,
+ (int_ppc_divf128_round_to_odd
+ f128:$vA, f128:$vB))]>;
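The patterns attached above make ordinary f128 arithmetic select the base instructions, while the round-to-odd "*O" forms stay reachable only through the dedicated intrinsics. As an illustrative example (assuming a toolchain that accepts __float128), plain C-level arithmetic now maps straight onto these definitions:

    // Illustrative only; __float128 availability depends on the toolchain.
    __float128 qpArith(__float128 A, __float128 B) {
      return (A + B) * (A - B) / B; // xsaddqp, xssubqp, xsmulqp, xsdivqp
    }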
// Square-Root
- def XSSQRTQP : X_VT5_XO5_VB5 <63, 27, 804, "xssqrtqp" , []>;
- def XSSQRTQPO : X_VT5_XO5_VB5_Ro<63, 27, 804, "xssqrtqpo", []>;
+ def XSSQRTQP : X_VT5_XO5_VB5 <63, 27, 804, "xssqrtqp",
+ [(set f128:$vT, (fsqrt f128:$vB))]>;
+ def XSSQRTQPO : X_VT5_XO5_VB5_Ro<63, 27, 804, "xssqrtqpo",
+ [(set f128:$vT,
+ (int_ppc_sqrtf128_round_to_odd f128:$vB))]>;
// (Negative) Multiply-{Add/Subtract}
- def XSMADDQP : X_VT5_VA5_VB5 <63, 388, "xsmaddqp" , []>;
- def XSMADDQPO : X_VT5_VA5_VB5_Ro<63, 388, "xsmaddqpo" , []>;
- def XSMSUBQP : X_VT5_VA5_VB5 <63, 420, "xsmsubqp" , []>;
- def XSMSUBQPO : X_VT5_VA5_VB5_Ro<63, 420, "xsmsubqpo" , []>;
- def XSNMADDQP : X_VT5_VA5_VB5 <63, 452, "xsnmaddqp" , []>;
- def XSNMADDQPO: X_VT5_VA5_VB5_Ro<63, 452, "xsnmaddqpo", []>;
- def XSNMSUBQP : X_VT5_VA5_VB5 <63, 484, "xsnmsubqp" , []>;
- def XSNMSUBQPO: X_VT5_VA5_VB5_Ro<63, 484, "xsnmsubqpo", []>;
+ def XSMADDQP : X_VT5_VA5_VB5_FMA <63, 388, "xsmaddqp",
+ [(set f128:$vT,
+ (fma f128:$vA, f128:$vB,
+ f128:$vTi))]>;
+
+ def XSMADDQPO : X_VT5_VA5_VB5_FMA_Ro<63, 388, "xsmaddqpo",
+ [(set f128:$vT,
+ (int_ppc_fmaf128_round_to_odd
+ f128:$vA,f128:$vB,f128:$vTi))]>;
+
+ def XSMSUBQP : X_VT5_VA5_VB5_FMA <63, 420, "xsmsubqp" ,
+ [(set f128:$vT,
+ (fma f128:$vA, f128:$vB,
+ (fneg f128:$vTi)))]>;
+ def XSMSUBQPO : X_VT5_VA5_VB5_FMA_Ro<63, 420, "xsmsubqpo" ,
+ [(set f128:$vT,
+ (int_ppc_fmaf128_round_to_odd
+ f128:$vA, f128:$vB, (fneg f128:$vTi)))]>;
+ def XSNMADDQP : X_VT5_VA5_VB5_FMA <63, 452, "xsnmaddqp",
+ [(set f128:$vT,
+ (fneg (fma f128:$vA, f128:$vB,
+ f128:$vTi)))]>;
+ def XSNMADDQPO: X_VT5_VA5_VB5_FMA_Ro<63, 452, "xsnmaddqpo",
+ [(set f128:$vT,
+ (fneg (int_ppc_fmaf128_round_to_odd
+ f128:$vA, f128:$vB, f128:$vTi)))]>;
+ def XSNMSUBQP : X_VT5_VA5_VB5_FMA <63, 484, "xsnmsubqp",
+ [(set f128:$vT,
+ (fneg (fma f128:$vA, f128:$vB,
+ (fneg f128:$vTi))))]>;
+ def XSNMSUBQPO: X_VT5_VA5_VB5_FMA_Ro<63, 484, "xsnmsubqpo",
+ [(set f128:$vT,
+ (fneg (int_ppc_fmaf128_round_to_odd
+ f128:$vA, f128:$vB, (fneg f128:$vTi))))]>;
+
+ // Additional fnmsub patterns: -a*c + b == -(a*c - b)
+ def : Pat<(fma (fneg f128:$A), f128:$C, f128:$B), (XSNMSUBQP $B, $C, $A)>;
+ def : Pat<(fma f128:$A, (fneg f128:$C), f128:$B), (XSNMSUBQP $B, $C, $A)>;
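These two extra patterns rely on the algebraic identity -a*c + b == -(a*c - b), which lets the negative multiply-subtract instruction cover both commuted fnmsub shapes. A tiny numeric sanity check of the identity (illustrative, using double rather than f128):

    #include <cassert>

    int main() {
      double a = 1.5, c = -2.25, b = 0.75;
      // -a*c + b and -(a*c - b) agree, which is what lets XSNMSUBQP match both.
      assert(-a * c + b == -(a * c - b));
      return 0;
    }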
//===--------------------------------------------------------------------===//
// Quad/Double-Precision Compare Instructions:
@@ -2434,37 +2554,20 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
IIC_FPCompare, []>;
def XSCMPGTDP : XX3_XT5_XA5_XB5<60, 11, "xscmpgtdp", vsrc, vsfrc, vsfrc,
IIC_FPCompare, []>;
- def XSCMPNEDP : XX3_XT5_XA5_XB5<60, 27, "xscmpnedp", vsrc, vsfrc, vsfrc,
- IIC_FPCompare, []>;
- let UseVSXReg = 1 in {
- // Vector Compare Not Equal
- def XVCMPNEDP : XX3Form_Rc<60, 123,
- (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
- "xvcmpnedp $XT, $XA, $XB", IIC_VecFPCompare, []>;
- let Defs = [CR6] in
- def XVCMPNEDPo : XX3Form_Rc<60, 123,
- (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
- "xvcmpnedp. $XT, $XA, $XB", IIC_VecFPCompare, []>,
- isDOT;
- def XVCMPNESP : XX3Form_Rc<60, 91,
- (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
- "xvcmpnesp $XT, $XA, $XB", IIC_VecFPCompare, []>;
- let Defs = [CR6] in
- def XVCMPNESPo : XX3Form_Rc<60, 91,
- (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
- "xvcmpnesp. $XT, $XA, $XB", IIC_VecFPCompare, []>,
- isDOT;
- } // UseVSXReg = 1
//===--------------------------------------------------------------------===//
// Quad-Precision Floating-Point Conversion Instructions:
// Convert DP -> QP
- def XSCVDPQP : X_VT5_XO5_VB5_TyVB<63, 22, 836, "xscvdpqp", vfrc, []>;
+ def XSCVDPQP : X_VT5_XO5_VB5_TyVB<63, 22, 836, "xscvdpqp", vfrc,
+ [(set f128:$vT, (fpextend f64:$vB))]>;
// Round & Convert QP -> DP (dword[1] is set to zero)
- def XSCVQPDP : X_VT5_XO5_VB5 <63, 20, 836, "xscvqpdp" , []>;
- def XSCVQPDPO : X_VT5_XO5_VB5_Ro<63, 20, 836, "xscvqpdpo", []>;
+ def XSCVQPDP : X_VT5_XO5_VB5_VSFR<63, 20, 836, "xscvqpdp" , []>;
+ def XSCVQPDPO : X_VT5_XO5_VB5_VSFR_Ro<63, 20, 836, "xscvqpdpo",
+ [(set f64:$vT,
+ (int_ppc_truncf128_round_to_odd
+ f128:$vB))]>;
// Truncate & Convert QP -> (Un)Signed (D)Word (dword[1] is set to zero)
def XSCVQPSDZ : X_VT5_XO5_VB5<63, 25, 836, "xscvqpsdz", []>;
@@ -2472,9 +2575,30 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
def XSCVQPUDZ : X_VT5_XO5_VB5<63, 17, 836, "xscvqpudz", []>;
def XSCVQPUWZ : X_VT5_XO5_VB5<63, 1, 836, "xscvqpuwz", []>;
- // Convert (Un)Signed DWord -> QP
+ // Convert (Un)Signed DWord -> QP.
def XSCVSDQP : X_VT5_XO5_VB5_TyVB<63, 10, 836, "xscvsdqp", vfrc, []>;
+ def : Pat<(f128 (sint_to_fp i64:$src)),
+ (f128 (XSCVSDQP (COPY_TO_REGCLASS $src, VFRC)))>;
+ def : Pat<(f128 (sint_to_fp (i64 (PPCmfvsr f64:$src)))),
+ (f128 (XSCVSDQP $src))>;
+ def : Pat<(f128 (sint_to_fp (i32 (PPCmfvsr f64:$src)))),
+ (f128 (XSCVSDQP (VEXTSW2Ds $src)))>;
+
def XSCVUDQP : X_VT5_XO5_VB5_TyVB<63, 2, 836, "xscvudqp", vfrc, []>;
+ def : Pat<(f128 (uint_to_fp i64:$src)),
+ (f128 (XSCVUDQP (COPY_TO_REGCLASS $src, VFRC)))>;
+ def : Pat<(f128 (uint_to_fp (i64 (PPCmfvsr f64:$src)))),
+ (f128 (XSCVUDQP $src))>;
+
+ // Convert (Un)Signed Word -> QP.
+ def : Pat<(f128 (sint_to_fp i32:$src)),
+ (f128 (XSCVSDQP (MTVSRWA $src)))>;
+ def : Pat<(f128 (sint_to_fp (i32 (load xoaddr:$src)))),
+ (f128 (XSCVSDQP (LIWAX xoaddr:$src)))>;
+ def : Pat<(f128 (uint_to_fp i32:$src)),
+ (f128 (XSCVUDQP (MTVSRWZ $src)))>;
+ def : Pat<(f128 (uint_to_fp (i32 (load xoaddr:$src)))),
+ (f128 (XSCVUDQP (LIWZX xoaddr:$src)))>;
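Together, the patterns above cover integer-to-quad-precision conversions from GPRs, from values already in VSX registers, and from memory. As an illustrative cross-check (not part of the patch, and assuming a toolchain that provides __float128), these are the C-level conversions they select for:

    #include <cstdint>

    __float128 fromS64(int64_t X)  { return (__float128)X; } // xscvsdqp
    __float128 fromU64(uint64_t X) { return (__float128)X; } // xscvudqp
    __float128 fromS32(int32_t X)  { return (__float128)X; } // mtvsrwa + xscvsdqp
    __float128 fromU32(uint32_t X) { return (__float128)X; } // mtvsrwz + xscvudqp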
let UseVSXReg = 1 in {
//===--------------------------------------------------------------------===//
@@ -2503,7 +2627,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
class Z23_VT5_R1_VB5_RMC2_EX1<bits<6> opcode, bits<8> xo, bit ex, string opc,
list<dag> pattern>
- : Z23Form_1<opcode, xo,
+ : Z23Form_8<opcode, xo,
(outs vrrc:$vT), (ins u1imm:$r, vrrc:$vB, u2imm:$rmc),
!strconcat(opc, " $r, $vT, $vB, $rmc"), IIC_VecFP, pattern> {
let RC = ex;
@@ -2513,6 +2637,20 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
def XSRQPI : Z23_VT5_R1_VB5_RMC2_EX1<63, 5, 0, "xsrqpi" , []>;
def XSRQPIX : Z23_VT5_R1_VB5_RMC2_EX1<63, 5, 1, "xsrqpix", []>;
+ // Use current rounding mode
+ def : Pat<(f128 (fnearbyint f128:$vB)), (f128 (XSRQPI 0, $vB, 3))>;
+ // Round to nearest, ties away from zero
+ def : Pat<(f128 (fround f128:$vB)), (f128 (XSRQPI 0, $vB, 0))>;
+ // Round towards Zero
+ def : Pat<(f128 (ftrunc f128:$vB)), (f128 (XSRQPI 1, $vB, 1))>;
+ // Round towards +Inf
+ def : Pat<(f128 (fceil f128:$vB)), (f128 (XSRQPI 1, $vB, 2))>;
+ // Round towards -Inf
+ def : Pat<(f128 (ffloor f128:$vB)), (f128 (XSRQPI 1, $vB, 3))>;
+
+ // Use current rounding mode, [with Inexact]
+ def : Pat<(f128 (frint f128:$vB)), (f128 (XSRQPIX 0, $vB, 3))>;
+
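The six patterns above exhaust the f128 rounding operations by picking different R/RMC operands for xsrqpi (and xsrqpix for the inexact-raising frint). A minimal C++ sketch of that mapping, illustrative only and derived from the comments above; F128Round and selectXSRQPIOperands are hypothetical names, not part of the backend:

    #include <utility>

    enum class F128Round { NearbyInt, Rint, Round, Trunc, Ceil, Floor };

    // Returns the {R, RMC} immediates used by the patterns above.
    std::pair<unsigned, unsigned> selectXSRQPIOperands(F128Round Kind) {
      switch (Kind) {
      case F128Round::NearbyInt: return {0, 3}; // current rounding mode
      case F128Round::Rint:      return {0, 3}; // current mode, via xsrqpix
      case F128Round::Round:     return {0, 0}; // ties away from zero
      case F128Round::Trunc:     return {1, 1}; // toward zero
      case F128Round::Ceil:      return {1, 2}; // toward +Inf
      case F128Round::Floor:     return {1, 3}; // toward -Inf
      }
      return {0, 3};
    }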
// Round Quad-Precision to Double-Extended Precision (fp80)
def XSRQPXP : Z23_VT5_R1_VB5_RMC2_EX1<63, 37, 0, "xsrqpxp", []>;
@@ -2670,7 +2808,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
// "out" and "in" dag
class X_XT6_RA5_RB5<bits<6> opcode, bits<10> xo, string opc,
RegisterOperand vtype, list<dag> pattern>
- : XX1Form<opcode, xo, (outs vtype:$XT), (ins memrr:$src),
+ : XX1Form_memOp<opcode, xo, (outs vtype:$XT), (ins memrr:$src),
!strconcat(opc, " $XT, $src"), IIC_LdStLFD, pattern>, UseVSXReg;
// Load as Integer Byte/Halfword & Zero Indexed
@@ -2687,11 +2825,11 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
def LXVX : X_XT6_RA5_RB5<31, 268, "lxvx" , vsrc,
[(set v2f64:$XT, (load xaddr:$src))]>;
// Load Vector (Left-justified) with Length
- def LXVL : XX1Form<31, 269, (outs vsrc:$XT), (ins memr:$src, g8rc:$rB),
+ def LXVL : XX1Form_memOp<31, 269, (outs vsrc:$XT), (ins memr:$src, g8rc:$rB),
"lxvl $XT, $src, $rB", IIC_LdStLoad,
[(set v4i32:$XT, (int_ppc_vsx_lxvl addr:$src, i64:$rB))]>,
UseVSXReg;
- def LXVLL : XX1Form<31,301, (outs vsrc:$XT), (ins memr:$src, g8rc:$rB),
+ def LXVLL : XX1Form_memOp<31,301, (outs vsrc:$XT), (ins memr:$src, g8rc:$rB),
"lxvll $XT, $src, $rB", IIC_LdStLoad,
[(set v4i32:$XT, (int_ppc_vsx_lxvll addr:$src, i64:$rB))]>,
UseVSXReg;
@@ -2716,7 +2854,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
// [PO S RA RB XO SX]
class X_XS6_RA5_RB5<bits<6> opcode, bits<10> xo, string opc,
RegisterOperand vtype, list<dag> pattern>
- : XX1Form<opcode, xo, (outs), (ins vtype:$XT, memrr:$dst),
+ : XX1Form_memOp<opcode, xo, (outs), (ins vtype:$XT, memrr:$dst),
!strconcat(opc, " $XT, $dst"), IIC_LdStSTFD, pattern>, UseVSXReg;
// Store as Integer Byte/Halfword Indexed
@@ -2738,51 +2876,55 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
[(store v2f64:$XT, xaddr:$dst)]>;
// Store Vector (Left-justified) with Length
- def STXVL : XX1Form<31, 397, (outs), (ins vsrc:$XT, memr:$dst, g8rc:$rB),
- "stxvl $XT, $dst, $rB", IIC_LdStLoad,
- [(int_ppc_vsx_stxvl v4i32:$XT, addr:$dst, i64:$rB)]>,
- UseVSXReg;
- def STXVLL : XX1Form<31, 429, (outs), (ins vsrc:$XT, memr:$dst, g8rc:$rB),
- "stxvll $XT, $dst, $rB", IIC_LdStLoad,
- [(int_ppc_vsx_stxvll v4i32:$XT, addr:$dst, i64:$rB)]>,
- UseVSXReg;
+ def STXVL : XX1Form_memOp<31, 397, (outs),
+ (ins vsrc:$XT, memr:$dst, g8rc:$rB),
+ "stxvl $XT, $dst, $rB", IIC_LdStLoad,
+ [(int_ppc_vsx_stxvl v4i32:$XT, addr:$dst,
+ i64:$rB)]>,
+ UseVSXReg;
+ def STXVLL : XX1Form_memOp<31, 429, (outs),
+ (ins vsrc:$XT, memr:$dst, g8rc:$rB),
+ "stxvll $XT, $dst, $rB", IIC_LdStLoad,
+ [(int_ppc_vsx_stxvll v4i32:$XT, addr:$dst,
+ i64:$rB)]>,
+ UseVSXReg;
} // mayStore
let Predicates = [IsLittleEndian] in {
- def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 0))))),
+ def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 0)))))),
(f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 3))))>;
- def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 1))))),
+ def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 1)))))),
(f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 2))))>;
- def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 2))))),
+ def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 2)))))),
(f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 1))))>;
- def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 3))))),
+ def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 3)))))),
(f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 0))))>;
- def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 0))))),
+ def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 0)))))),
(f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 3)), VSFRC))>;
- def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 1))))),
+ def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 1)))))),
(f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 2)), VSFRC))>;
- def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 2))))),
+ def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 2)))))),
(f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 1)), VSFRC))>;
- def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 3))))),
+ def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 3)))))),
(f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 0)), VSFRC))>;
}
let Predicates = [IsBigEndian] in {
- def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 0))))),
+ def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 0)))))),
(f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 0))))>;
- def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 1))))),
+ def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 1)))))),
(f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 1))))>;
- def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 2))))),
+ def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 2)))))),
(f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 2))))>;
- def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 3))))),
+ def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 3)))))),
(f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 3))))>;
- def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 0))))),
+ def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 0)))))),
(f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 0)), VSFRC))>;
- def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 1))))),
+ def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 1)))))),
(f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 1)), VSFRC))>;
- def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 2))))),
+ def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 2)))))),
(f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 2)), VSFRC))>;
- def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 3))))),
+ def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 3)))))),
(f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 3)), VSFRC))>;
}
@@ -2795,21 +2937,21 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
// Patterns for which instructions from ISA 3.0 are a better match
let Predicates = [IsLittleEndian, HasP9Vector] in {
- def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 0))))),
+ def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))),
(f32 (XSCVUXDSP (XXEXTRACTUW $A, 12)))>;
- def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 1))))),
+ def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 1)))))),
(f32 (XSCVUXDSP (XXEXTRACTUW $A, 8)))>;
- def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 2))))),
+ def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 2)))))),
(f32 (XSCVUXDSP (XXEXTRACTUW $A, 4)))>;
- def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 3))))),
+ def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 3)))))),
(f32 (XSCVUXDSP (XXEXTRACTUW $A, 0)))>;
- def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 0))))),
+ def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))),
(f64 (XSCVUXDDP (XXEXTRACTUW $A, 12)))>;
- def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 1))))),
+ def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 1)))))),
(f64 (XSCVUXDDP (XXEXTRACTUW $A, 8)))>;
- def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 2))))),
+ def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 2)))))),
(f64 (XSCVUXDDP (XXEXTRACTUW $A, 4)))>;
- def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 3))))),
+ def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 3)))))),
(f64 (XSCVUXDDP (XXEXTRACTUW $A, 0)))>;
def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 0)),
(v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 12))>;
@@ -2830,21 +2972,21 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
} // IsLittleEndian, HasP9Vector
let Predicates = [IsBigEndian, HasP9Vector] in {
- def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 0))))),
+ def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))),
(f32 (XSCVUXDSP (XXEXTRACTUW $A, 0)))>;
- def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 1))))),
+ def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 1)))))),
(f32 (XSCVUXDSP (XXEXTRACTUW $A, 4)))>;
- def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 2))))),
+ def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 2)))))),
(f32 (XSCVUXDSP (XXEXTRACTUW $A, 8)))>;
- def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 3))))),
+ def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 3)))))),
(f32 (XSCVUXDSP (XXEXTRACTUW $A, 12)))>;
- def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 0))))),
+ def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))),
(f64 (XSCVUXDDP (XXEXTRACTUW $A, 0)))>;
- def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 1))))),
+ def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 1)))))),
(f64 (XSCVUXDDP (XXEXTRACTUW $A, 4)))>;
- def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 2))))),
+ def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 2)))))),
(f64 (XSCVUXDDP (XXEXTRACTUW $A, 8)))>;
- def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 3))))),
+ def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 3)))))),
(f64 (XSCVUXDDP (XXEXTRACTUW $A, 12)))>;
def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 0)),
(v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 0))>;
@@ -2869,12 +3011,16 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
def : Pat<(v4f32 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>;
def : Pat<(v2i64 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>;
def : Pat<(v2f64 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>;
+ def : Pat<(f128 (quadwOffsetLoad iqaddr:$src)),
+ (COPY_TO_REGCLASS (LXV memrix16:$src), VRRC)>;
def : Pat<(v4i32 (int_ppc_vsx_lxvw4x iqaddr:$src)), (LXV memrix16:$src)>;
def : Pat<(v2f64 (int_ppc_vsx_lxvd2x iqaddr:$src)), (LXV memrix16:$src)>;
def : Pat<(quadwOffsetStore v4f32:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>;
def : Pat<(quadwOffsetStore v4i32:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>;
def : Pat<(quadwOffsetStore v2f64:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>;
+ def : Pat<(quadwOffsetStore f128:$rS, iqaddr:$dst),
+ (STXV (COPY_TO_REGCLASS $rS, VSRC), memrix16:$dst)>;
def : Pat<(quadwOffsetStore v2i64:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>;
def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, iqaddr:$dst),
(STXV $rS, memrix16:$dst)>;
@@ -2888,6 +3034,10 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
def : Pat<(v4i32 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>;
def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xoaddr:$src)), (LXVX xoaddr:$src)>;
def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xoaddr:$src)), (LXVX xoaddr:$src)>;
+ def : Pat<(f128 (nonQuadwOffsetLoad xoaddr:$src)),
+ (COPY_TO_REGCLASS (LXVX xoaddr:$src), VRRC)>;
+ def : Pat<(nonQuadwOffsetStore f128:$rS, xoaddr:$dst),
+ (STXVX (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$dst)>;
def : Pat<(nonQuadwOffsetStore v2f64:$rS, xoaddr:$dst),
(STXVX $rS, xoaddr:$dst)>;
def : Pat<(nonQuadwOffsetStore v2i64:$rS, xoaddr:$dst),
@@ -2904,7 +3054,8 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
(v4i32 (LXVWSX xoaddr:$src))>;
def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))),
(v4f32 (LXVWSX xoaddr:$src))>;
- def : Pat<(v4f32 (scalar_to_vector (f32 (fpround (extloadf32 xoaddr:$src))))),
+ def : Pat<(v4f32 (scalar_to_vector
+ (f32 (fpround (f64 (extloadf32 xoaddr:$src)))))),
(v4f32 (LXVWSX xoaddr:$src))>;
// Build vectors from i8 loads
@@ -2936,109 +3087,109 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
let Predicates = [IsBigEndian, HasP9Vector] in {
// Scalar stores of i8
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 0)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 9), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 9)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 1)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 10), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 10)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 2)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 11), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 11)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 3)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 12), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 12)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 4)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 13), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 13)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 5)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 14), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 14)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 6)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 15), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 15)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 7)), xoaddr:$dst),
(STXSIBXv $S, xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 8)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 1), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 1)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 9)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 2), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 2)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 10)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 3), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 3)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 11)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 4), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 4)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 12)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 5), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 5)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 13)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 6), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 6)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 14)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 7), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 7)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 15)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 8), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 8)), xoaddr:$dst)>;
// Scalar stores of i16
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 0)), xoaddr:$dst),
- (STXSIHXv (VSLDOI $S, $S, 10), xoaddr:$dst)>;
+ (STXSIHXv (v16i8 (VSLDOI $S, $S, 10)), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 1)), xoaddr:$dst),
- (STXSIHXv (VSLDOI $S, $S, 12), xoaddr:$dst)>;
+ (STXSIHXv (v16i8 (VSLDOI $S, $S, 12)), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 2)), xoaddr:$dst),
- (STXSIHXv (VSLDOI $S, $S, 14), xoaddr:$dst)>;
+ (STXSIHXv (v16i8 (VSLDOI $S, $S, 14)), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 3)), xoaddr:$dst),
(STXSIHXv $S, xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 4)), xoaddr:$dst),
- (STXSIHXv (VSLDOI $S, $S, 2), xoaddr:$dst)>;
+ (STXSIHXv (v16i8 (VSLDOI $S, $S, 2)), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 5)), xoaddr:$dst),
- (STXSIHXv (VSLDOI $S, $S, 4), xoaddr:$dst)>;
+ (STXSIHXv (v16i8 (VSLDOI $S, $S, 4)), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 6)), xoaddr:$dst),
- (STXSIHXv (VSLDOI $S, $S, 6), xoaddr:$dst)>;
+ (STXSIHXv (v16i8 (VSLDOI $S, $S, 6)), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 7)), xoaddr:$dst),
- (STXSIHXv (VSLDOI $S, $S, 8), xoaddr:$dst)>;
+ (STXSIHXv (v16i8 (VSLDOI $S, $S, 8)), xoaddr:$dst)>;
} // IsBigEndian, HasP9Vector
let Predicates = [IsLittleEndian, HasP9Vector] in {
// Scalar stores of i8
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 0)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 8), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 8)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 1)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 7), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 7)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 2)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 6), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 6)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 3)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 5), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 5)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 4)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 4), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 4)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 5)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 3), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 3)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 6)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 2), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 2)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 7)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 1), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 1)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 8)), xoaddr:$dst),
(STXSIBXv $S, xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 9)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 15), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 15)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 10)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 14), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 14)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 11)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 13), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 13)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 12)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 12), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 12)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 13)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 11), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 11)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 14)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 10), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 10)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 15)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 9), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 9)), xoaddr:$dst)>;
// Scalar stores of i16
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 0)), xoaddr:$dst),
- (STXSIHXv (VSLDOI $S, $S, 8), xoaddr:$dst)>;
+ (STXSIHXv (v16i8 (VSLDOI $S, $S, 8)), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 1)), xoaddr:$dst),
- (STXSIHXv (VSLDOI $S, $S, 6), xoaddr:$dst)>;
+ (STXSIHXv (v16i8 (VSLDOI $S, $S, 6)), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 2)), xoaddr:$dst),
- (STXSIHXv (VSLDOI $S, $S, 4), xoaddr:$dst)>;
+ (STXSIHXv (v16i8 (VSLDOI $S, $S, 4)), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 3)), xoaddr:$dst),
- (STXSIHXv (VSLDOI $S, $S, 2), xoaddr:$dst)>;
+ (STXSIHXv (v16i8 (VSLDOI $S, $S, 2)), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 4)), xoaddr:$dst),
(STXSIHXv $S, xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 5)), xoaddr:$dst),
- (STXSIHXv (VSLDOI $S, $S, 14), xoaddr:$dst)>;
+ (STXSIHXv (v16i8 (VSLDOI $S, $S, 14)), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 6)), xoaddr:$dst),
- (STXSIHXv (VSLDOI $S, $S, 12), xoaddr:$dst)>;
+ (STXSIHXv (v16i8 (VSLDOI $S, $S, 12)), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 7)), xoaddr:$dst),
- (STXSIHXv (VSLDOI $S, $S, 10), xoaddr:$dst)>;
+ (STXSIHXv (v16i8 (VSLDOI $S, $S, 10)), xoaddr:$dst)>;
} // IsLittleEndian, HasP9Vector
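Every pattern in the two blocks above rotates the source element into the fixed byte position that stxsibx/stxsihx store from, using vsldoi. A small illustrative helper (hypothetical, not part of the backend) that reproduces the rotate amounts used above for the byte stores; the halfword stores follow the same scheme with two-byte elements:

    // Returns the VSLDOI amount that moves byte element Idx into byte 7 of the
    // VR (the byte stxsibx stores from); 0 means no VSLDOI is needed.
    unsigned vsldoiShiftForByteStore(unsigned Idx, bool IsLittleEndian) {
      unsigned SrcByte = IsLittleEndian ? 15 - Idx : Idx; // position within the VR
      return (SrcByte + 16 - 7) % 16;                     // rotate it into byte 7
    }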
@@ -3064,21 +3215,264 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
}
def : Pat<(f64 (extloadf32 ixaddr:$src)),
(COPY_TO_REGCLASS (DFLOADf32 ixaddr:$src), VSFRC)>;
- def : Pat<(f32 (fpround (extloadf32 ixaddr:$src))),
+ def : Pat<(f32 (fpround (f64 (extloadf32 ixaddr:$src)))),
(f32 (DFLOADf32 ixaddr:$src))>;
+
+ let Predicates = [IsBigEndian, HasP9Vector] in {
+
+ // (Un)Signed DWord vector extract -> QP
+ def : Pat<(f128 (sint_to_fp (i64 (extractelt v2i64:$src, 0)))),
+ (f128 (XSCVSDQP (COPY_TO_REGCLASS $src, VFRC)))>;
+ def : Pat<(f128 (sint_to_fp (i64 (extractelt v2i64:$src, 1)))),
+ (f128 (XSCVSDQP
+ (EXTRACT_SUBREG (XXPERMDI $src, $src, 3), sub_64)))>;
+ def : Pat<(f128 (uint_to_fp (i64 (extractelt v2i64:$src, 0)))),
+ (f128 (XSCVUDQP (COPY_TO_REGCLASS $src, VFRC)))>;
+ def : Pat<(f128 (uint_to_fp (i64 (extractelt v2i64:$src, 1)))),
+ (f128 (XSCVUDQP
+ (EXTRACT_SUBREG (XXPERMDI $src, $src, 3), sub_64)))>;
+
+ // (Un)Signed Word vector extract -> QP
+ def : Pat<(f128 (sint_to_fp (i32 (extractelt v4i32:$src, 1)))),
+ (f128 (XSCVSDQP (EXTRACT_SUBREG (VEXTSW2D $src), sub_64)))>;
+ foreach Idx = [0,2,3] in {
+ def : Pat<(f128 (sint_to_fp (i32 (extractelt v4i32:$src, Idx)))),
+ (f128 (XSCVSDQP (EXTRACT_SUBREG
+ (VEXTSW2D (VSPLTW Idx, $src)), sub_64)))>;
+ }
+ foreach Idx = 0-3 in {
+ def : Pat<(f128 (uint_to_fp (i32 (extractelt v4i32:$src, Idx)))),
+ (f128 (XSCVUDQP (XXEXTRACTUW $src, !shl(Idx, 2))))>;
+ }
+
+ // (Un)Signed HWord vector extract -> QP
+ foreach Idx = 0-7 in {
+ def : Pat<(f128 (sint_to_fp
+ (i32 (sext_inreg
+ (vector_extract v8i16:$src, Idx), i16)))),
+ (f128 (XSCVSDQP (EXTRACT_SUBREG
+ (VEXTSH2D (VEXTRACTUH !add(Idx, Idx), $src)),
+ sub_64)))>;
+ // The SDAG adds the `and` since an `i16` is being extracted as an `i32`.
+ def : Pat<(f128 (uint_to_fp
+ (and (i32 (vector_extract v8i16:$src, Idx)), 65535))),
+ (f128 (XSCVUDQP (EXTRACT_SUBREG
+ (VEXTRACTUH !add(Idx, Idx), $src), sub_64)))>;
+ }
+
+ // (Un)Signed Byte vector extract -> QP
+ foreach Idx = 0-15 in {
+ def : Pat<(f128 (sint_to_fp
+ (i32 (sext_inreg (vector_extract v16i8:$src, Idx),
+ i8)))),
+ (f128 (XSCVSDQP (EXTRACT_SUBREG
+ (VEXTSB2D (VEXTRACTUB Idx, $src)), sub_64)))>;
+ def : Pat<(f128 (uint_to_fp
+ (and (i32 (vector_extract v16i8:$src, Idx)), 255))),
+ (f128 (XSCVUDQP
+ (EXTRACT_SUBREG (VEXTRACTUB Idx, $src), sub_64)))>;
+ }
+
+ // Unsigned int in VSX register -> QP
+ def : Pat<(f128 (uint_to_fp (i32 (PPCmfvsr f64:$src)))),
+ (f128 (XSCVUDQP
+ (XXEXTRACTUW (SUBREG_TO_REG (i64 1), $src, sub_64), 4)))>;
+ } // IsBigEndian, HasP9Vector
+
+ let Predicates = [IsLittleEndian, HasP9Vector] in {
+
+ // (Un)Signed DWord vector extract -> QP
+ def : Pat<(f128 (sint_to_fp (i64 (extractelt v2i64:$src, 0)))),
+ (f128 (XSCVSDQP
+ (EXTRACT_SUBREG (XXPERMDI $src, $src, 3), sub_64)))>;
+ def : Pat<(f128 (sint_to_fp (i64 (extractelt v2i64:$src, 1)))),
+ (f128 (XSCVSDQP (COPY_TO_REGCLASS $src, VFRC)))>;
+ def : Pat<(f128 (uint_to_fp (i64 (extractelt v2i64:$src, 0)))),
+ (f128 (XSCVUDQP
+ (EXTRACT_SUBREG (XXPERMDI $src, $src, 3), sub_64)))>;
+ def : Pat<(f128 (uint_to_fp (i64 (extractelt v2i64:$src, 1)))),
+ (f128 (XSCVUDQP (COPY_TO_REGCLASS $src, VFRC)))>;
+
+ // (Un)Signed Word vector extract -> QP
+ foreach Idx = [[0,3],[1,2],[3,0]] in {
+ def : Pat<(f128 (sint_to_fp (i32 (extractelt v4i32:$src, !head(Idx))))),
+ (f128 (XSCVSDQP (EXTRACT_SUBREG
+ (VEXTSW2D (VSPLTW !head(!tail(Idx)), $src)),
+ sub_64)))>;
+ }
+ def : Pat<(f128 (sint_to_fp (i32 (extractelt v4i32:$src, 2)))),
+ (f128 (XSCVSDQP (EXTRACT_SUBREG (VEXTSW2D $src), sub_64)))>;
+
+ foreach Idx = [[0,12],[1,8],[2,4],[3,0]] in {
+ def : Pat<(f128 (uint_to_fp (i32 (extractelt v4i32:$src, !head(Idx))))),
+ (f128 (XSCVUDQP (XXEXTRACTUW $src, !head(!tail(Idx)))))>;
+ }
+
+ // (Un)Signed HWord vector extract -> QP
+ // The nested foreach lists identify the vector element and the
+ // corresponding register byte location.
+ foreach Idx = [[0,14],[1,12],[2,10],[3,8],[4,6],[5,4],[6,2],[7,0]] in {
+ def : Pat<(f128 (sint_to_fp
+ (i32 (sext_inreg
+ (vector_extract v8i16:$src, !head(Idx)), i16)))),
+ (f128 (XSCVSDQP
+ (EXTRACT_SUBREG (VEXTSH2D
+ (VEXTRACTUH !head(!tail(Idx)), $src)),
+ sub_64)))>;
+ def : Pat<(f128 (uint_to_fp
+ (and (i32 (vector_extract v8i16:$src, !head(Idx))),
+ 65535))),
+ (f128 (XSCVUDQP (EXTRACT_SUBREG
+ (VEXTRACTUH !head(!tail(Idx)), $src), sub_64)))>;
+ }
+
+ // (Un)Signed Byte vector extract -> QP
+ foreach Idx = [[0,15],[1,14],[2,13],[3,12],[4,11],[5,10],[6,9],[7,8],[8,7],
+ [9,6],[10,5],[11,4],[12,3],[13,2],[14,1],[15,0]] in {
+ def : Pat<(f128 (sint_to_fp
+ (i32 (sext_inreg
+ (vector_extract v16i8:$src, !head(Idx)), i8)))),
+ (f128 (XSCVSDQP
+ (EXTRACT_SUBREG
+ (VEXTSB2D (VEXTRACTUB !head(!tail(Idx)), $src)),
+ sub_64)))>;
+ def : Pat<(f128 (uint_to_fp
+ (and (i32 (vector_extract v16i8:$src, !head(Idx))),
+ 255))),
+ (f128 (XSCVUDQP
+ (EXTRACT_SUBREG
+ (VEXTRACTUB !head(!tail(Idx)), $src), sub_64)))>;
+ }
+
+ // Unsigned int in VSX register -> QP
+ def : Pat<(f128 (uint_to_fp (i32 (PPCmfvsr f64:$src)))),
+ (f128 (XSCVUDQP
+ (XXEXTRACTUW (SUBREG_TO_REG (i64 1), $src, sub_64), 8)))>;
+ } // IsLittleEndian, HasP9Vector
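The two predicate blocks above let integer vector elements of every width be converted to quad precision without a round trip through a GPR. An illustrative source-level example of the word case (assuming __float128 and the GCC/Clang generic vector extension; v4si and wordElemToQP are made-up names):

    #include <cstdint>

    typedef int32_t v4si __attribute__((vector_size(16)));

    // Element extract feeding a widening convert; with the patterns above this
    // selects vextsw2d/xxextractuw straight into xscvsdqp/xscvudqp.
    __float128 wordElemToQP(v4si V, int Idx) { return (__float128)V[Idx]; }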
+
+ // Convert (Un)Signed DWord in memory -> QP
+ def : Pat<(f128 (sint_to_fp (i64 (load xaddr:$src)))),
+ (f128 (XSCVSDQP (LXSDX xaddr:$src)))>;
+ def : Pat<(f128 (sint_to_fp (i64 (load ixaddr:$src)))),
+ (f128 (XSCVSDQP (LXSD ixaddr:$src)))>;
+ def : Pat<(f128 (uint_to_fp (i64 (load xaddr:$src)))),
+ (f128 (XSCVUDQP (LXSDX xaddr:$src)))>;
+ def : Pat<(f128 (uint_to_fp (i64 (load ixaddr:$src)))),
+ (f128 (XSCVUDQP (LXSD ixaddr:$src)))>;
+
+ // Convert Unsigned HWord in memory -> QP
+ def : Pat<(f128 (uint_to_fp ScalarLoads.ZELi16)),
+ (f128 (XSCVUDQP (LXSIHZX xaddr:$src)))>;
+
+ // Convert Unsigned Byte in memory -> QP
+ def : Pat<(f128 (uint_to_fp ScalarLoads.ZELi8)),
+ (f128 (XSCVUDQP (LXSIBZX xoaddr:$src)))>;
+
+ // Truncate & Convert QP -> (Un)Signed (D)Word.
+ def : Pat<(i64 (fp_to_sint f128:$src)), (i64 (MFVRD (XSCVQPSDZ $src)))>;
+ def : Pat<(i64 (fp_to_uint f128:$src)), (i64 (MFVRD (XSCVQPUDZ $src)))>;
+ def : Pat<(i32 (fp_to_sint f128:$src)),
+ (i32 (MFVSRWZ (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC)))>;
+ def : Pat<(i32 (fp_to_uint f128:$src)),
+ (i32 (MFVSRWZ (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC)))>;
+
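The four patterns directly above handle in-register truncating conversions out of f128; the store patterns that follow fold the same conversions into integer stores of each width. For reference, the scalar C conversions the in-register patterns select for (illustrative, assuming __float128):

    #include <cstdint>

    int64_t  toS64(__float128 X) { return (int64_t)X; }  // xscvqpsdz + mfvrd
    uint64_t toU64(__float128 X) { return (uint64_t)X; } // xscvqpudz + mfvrd
    int32_t  toS32(__float128 X) { return (int32_t)X; }  // xscvqpswz + mfvsrwz
    uint32_t toU32(__float128 X) { return (uint32_t)X; } // xscvqpuwz + mfvsrwz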
+ // Instructions for store(fptosi).
+ // The 8-byte version is repeated here due to availability of D-Form STXSD.
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xaddr:$dst, 8),
+ (STXSDX (COPY_TO_REGCLASS (XSCVQPSDZ f128:$src), VFRC),
+ xaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), ixaddr:$dst, 8),
+ (STXSD (COPY_TO_REGCLASS (XSCVQPSDZ f128:$src), VFRC),
+ ixaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xoaddr:$dst, 4),
+ (STXSIWX (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC), xoaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xoaddr:$dst, 2),
+ (STXSIHX (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC), xoaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xoaddr:$dst, 1),
+ (STXSIBX (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC), xoaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xaddr:$dst, 8),
+ (STXSDX (XSCVDPSXDS f64:$src), xaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), ixaddr:$dst, 8),
+ (STXSD (XSCVDPSXDS f64:$src), ixaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xoaddr:$dst, 2),
+ (STXSIHX (XSCVDPSXWS f64:$src), xoaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xoaddr:$dst, 1),
+ (STXSIBX (XSCVDPSXWS f64:$src), xoaddr:$dst)>;
+
+ // Instructions for store(fptoui).
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xaddr:$dst, 8),
+ (STXSDX (COPY_TO_REGCLASS (XSCVQPUDZ f128:$src), VFRC),
+ xaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), ixaddr:$dst, 8),
+ (STXSD (COPY_TO_REGCLASS (XSCVQPUDZ f128:$src), VFRC),
+ ixaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xoaddr:$dst, 4),
+ (STXSIWX (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC), xoaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xoaddr:$dst, 2),
+ (STXSIHX (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC), xoaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xoaddr:$dst, 1),
+ (STXSIBX (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC), xoaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xaddr:$dst, 8),
+ (STXSDX (XSCVDPUXDS f64:$src), xaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), ixaddr:$dst, 8),
+ (STXSD (XSCVDPUXDS f64:$src), ixaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xoaddr:$dst, 2),
+ (STXSIHX (XSCVDPUXWS f64:$src), xoaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xoaddr:$dst, 1),
+ (STXSIBX (XSCVDPUXWS f64:$src), xoaddr:$dst)>;
+
+ // Round & Convert QP -> DP/SP
+ def : Pat<(f64 (fpround f128:$src)), (f64 (XSCVQPDP $src))>;
+ def : Pat<(f32 (fpround f128:$src)), (f32 (XSRSP (XSCVQPDPO $src)))>;
+
+ // Convert SP -> QP
+ def : Pat<(f128 (fpextend f32:$src)),
+ (f128 (XSCVDPQP (COPY_TO_REGCLASS $src, VFRC)))>;
+
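The last three patterns above round between quad precision and the narrower formats. Illustrative source-level equivalents (assuming __float128); the f128 -> f32 case first rounds to double with round-to-odd (xscvqpdpo) so that the final xsrsp cannot introduce a double-rounding error:

    double     toDouble (__float128 X) { return (double)X; }     // xscvqpdp
    float      toFloat  (__float128 X) { return (float)X; }      // xscvqpdpo + xsrsp
    __float128 fromFloat(float X)      { return (__float128)X; } // xscvdpqp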
} // end HasP9Vector, AddedComplexity
+let AddedComplexity = 400 in {
+ let Predicates = [IsISA3_0, HasP9Vector, HasDirectMove, IsBigEndian] in {
+ def : Pat<(f128 (PPCbuild_fp128 i64:$rB, i64:$rA)),
+ (f128 (COPY_TO_REGCLASS (MTVSRDD $rB, $rA), VRRC))>;
+ }
+ let Predicates = [IsISA3_0, HasP9Vector, HasDirectMove, IsLittleEndian] in {
+ def : Pat<(f128 (PPCbuild_fp128 i64:$rA, i64:$rB)),
+ (f128 (COPY_TO_REGCLASS (MTVSRDD $rB, $rA), VRRC))>;
+ }
+}
+
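PPCbuild_fp128 glues two 64-bit GPR values into one quad-precision register with a single mtvsrdd; the two predicate blocks only differ in which build_fp128 operand is treated as the high doubleword. A rough host-side analogue, purely illustrative and assuming a compiler that provides __int128 and __float128:

    #include <cstdint>
    #include <cstring>

    __float128 buildFP128(uint64_t HiDword, uint64_t LoDword) {
      unsigned __int128 Bits = ((unsigned __int128)HiDword << 64) | LoDword;
      __float128 F;
      std::memcpy(&F, &Bits, sizeof(F)); // pure bit movement, like mtvsrdd
      return F;
    }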
let Predicates = [HasP9Vector] in {
let isPseudo = 1 in {
let mayStore = 1 in {
- def SPILLTOVSR_STX : Pseudo<(outs), (ins spilltovsrrc:$XT, memrr:$dst),
- "#SPILLTOVSR_STX", []>;
+ def SPILLTOVSR_STX : PseudoXFormMemOp<(outs),
+ (ins spilltovsrrc:$XT, memrr:$dst),
+ "#SPILLTOVSR_STX", []>;
def SPILLTOVSR_ST : Pseudo<(outs), (ins spilltovsrrc:$XT, memrix:$dst),
"#SPILLTOVSR_ST", []>;
}
let mayLoad = 1 in {
- def SPILLTOVSR_LDX : Pseudo<(outs spilltovsrrc:$XT), (ins memrr:$src),
- "#SPILLTOVSR_LDX", []>;
+ def SPILLTOVSR_LDX : PseudoXFormMemOp<(outs spilltovsrrc:$XT),
+ (ins memrr:$src),
+ "#SPILLTOVSR_LDX", []>;
def SPILLTOVSR_LD : Pseudo<(outs spilltovsrrc:$XT), (ins memrix:$src),
"#SPILLTOVSR_LD", []>;
@@ -3170,10 +3564,10 @@ def FltToULongLoadP9 {
dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 ixaddr:$A)))));
}
def FltToLong {
- dag A = (i64 (PPCmfvsr (PPCfctidz (fpextend f32:$A))));
+ dag A = (i64 (PPCmfvsr (f64 (PPCfctidz (fpextend f32:$A)))));
}
def FltToULong {
- dag A = (i64 (PPCmfvsr (PPCfctiduz (fpextend f32:$A))));
+ dag A = (i64 (PPCmfvsr (f64 (PPCfctiduz (fpextend f32:$A)))));
}
def DblToInt {
dag A = (i32 (PPCmfvsr (f64 (PPCfctiwz f64:$A))));
@@ -3219,7 +3613,6 @@ def MrgFP {
}
// Patterns for BUILD_VECTOR nodes.
-def NoP9Vector : Predicate<"!PPCSubTarget->hasP9Vector()">;
let AddedComplexity = 400 in {
let Predicates = [HasVSX] in {
@@ -3389,8 +3782,10 @@ let AddedComplexity = 400 in {
def : Pat<(v2i64 (build_vector i64:$rB, i64:$rA)),
(v2i64 (MTVSRDD $rB, $rA))>;
def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
- (VMRGOW (COPY_TO_REGCLASS (MTVSRDD AnyExts.A, AnyExts.C), VSRC),
- (COPY_TO_REGCLASS (MTVSRDD AnyExts.B, AnyExts.D), VSRC))>;
+ (VMRGOW
+ (v4i32 (COPY_TO_REGCLASS (MTVSRDD AnyExts.A, AnyExts.C), VSRC)),
+ (v4i32
+ (COPY_TO_REGCLASS (MTVSRDD AnyExts.B, AnyExts.D), VSRC)))>;
}
let Predicates = [IsISA3_0, HasDirectMove, IsLittleEndian] in {
@@ -3400,8 +3795,10 @@ let AddedComplexity = 400 in {
def : Pat<(v2i64 (build_vector i64:$rA, i64:$rB)),
(v2i64 (MTVSRDD $rB, $rA))>;
def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
- (VMRGOW (COPY_TO_REGCLASS (MTVSRDD AnyExts.D, AnyExts.B), VSRC),
- (COPY_TO_REGCLASS (MTVSRDD AnyExts.C, AnyExts.A), VSRC))>;
+ (VMRGOW
+ (v4i32 (COPY_TO_REGCLASS (MTVSRDD AnyExts.D, AnyExts.B), VSRC)),
+ (v4i32
+ (COPY_TO_REGCLASS (MTVSRDD AnyExts.C, AnyExts.A), VSRC)))>;
}
// P9 Altivec instructions that can be used to build vectors.
// Adding them to PPCInstrVSX.td rather than PPCAltivecVSX.td to compete
diff --git a/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp b/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
index cdf544bdfac3..2217fa4693ce 100644
--- a/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
+++ b/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
@@ -33,6 +33,7 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Dominators.h"
@@ -47,8 +48,8 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <cassert>
#include <iterator>
@@ -246,15 +247,14 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) {
if (!L->empty())
return MadeChange;
- DEBUG(dbgs() << "PIP: Examining: " << *L << "\n");
+ LLVM_DEBUG(dbgs() << "PIP: Examining: " << *L << "\n");
BasicBlock *Header = L->getHeader();
const PPCSubtarget *ST =
TM ? TM->getSubtargetImpl(*Header->getParent()) : nullptr;
- unsigned HeaderLoopPredCount =
- std::distance(pred_begin(Header), pred_end(Header));
+ unsigned HeaderLoopPredCount = pred_size(Header);
// Collect buckets of comparable addresses used by loads and stores.
SmallVector<Bucket, 16> Buckets;
@@ -294,6 +294,19 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) {
if (const SCEVAddRecExpr *LARSCEV = dyn_cast<SCEVAddRecExpr>(LSCEV)) {
if (LARSCEV->getLoop() != L)
continue;
+ // See getPreIndexedAddressParts: the displacement for LDU/STDU must be a
+ // multiple of 4 (DS-form). For i64 loads/stores, if the displacement fits
+ // in a 16-bit signed field but is not a multiple of 4, pre-increment
+ // preparation is useless and may even break an otherwise well-formed
+ // addressing mode, so skip such cases.
+ if (PtrValue->getType()->getPointerElementType()->isIntegerTy(64)) {
+ if (const SCEVConstant *StepConst =
+ dyn_cast<SCEVConstant>(LARSCEV->getStepRecurrence(*SE))) {
+ const APInt &ConstInt = StepConst->getValue()->getValue();
+ if (ConstInt.isSignedIntN(16) && ConstInt.srem(4) != 0)
+ continue;
+ }
+ }
} else {
continue;
}
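The new check above skips pre-increment preparation for i64 accesses whose constant step fits in a signed 16-bit field but is not a multiple of 4, because the DS-form displacement used by ldu/stdu cannot encode such an offset. A standalone sketch of that encoding constraint (isDSFormDisplacement is a hypothetical helper, not part of the pass):

    #include <cstdint>

    // A displacement is DS-form encodable if it fits in a signed 16-bit field
    // and its low two bits are zero.
    bool isDSFormDisplacement(int64_t D) {
      return D >= -32768 && D <= 32767 && (D & 3) == 0;
    }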
@@ -332,7 +345,7 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) {
if (!LoopPredecessor)
return MadeChange;
- DEBUG(dbgs() << "PIP: Found " << Buckets.size() << " buckets\n");
+ LLVM_DEBUG(dbgs() << "PIP: Found " << Buckets.size() << " buckets\n");
SmallSet<BasicBlock *, 16> BBChanged;
for (unsigned i = 0, e = Buckets.size(); i != e; ++i) {
@@ -381,7 +394,7 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) {
if (!BasePtrSCEV->isAffine())
continue;
- DEBUG(dbgs() << "PIP: Transforming: " << *BasePtrSCEV << "\n");
+ LLVM_DEBUG(dbgs() << "PIP: Transforming: " << *BasePtrSCEV << "\n");
assert(BasePtrSCEV->getLoop() == L &&
"AddRec for the wrong loop?");
@@ -407,7 +420,7 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) {
if (!isSafeToExpand(BasePtrStartSCEV, *SE))
continue;
- DEBUG(dbgs() << "PIP: New start is: " << *BasePtrStartSCEV << "\n");
+ LLVM_DEBUG(dbgs() << "PIP: New start is: " << *BasePtrStartSCEV << "\n");
if (alreadyPrepared(L, MemI, BasePtrStartSCEV, BasePtrIncSCEV))
continue;
diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp
index 1e40711328ec..62a612feb55c 100644
--- a/lib/Target/PowerPC/PPCMCInstLower.cpp
+++ b/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -21,13 +21,13 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/TargetLowering.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
using namespace llvm;
static MachineModuleInfoMachO &getMachOMMI(AsmPrinter &AP) {
@@ -107,10 +107,20 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
break;
}
- if (MO.getTargetFlags() == PPCII::MO_PLT)
+ if (MO.getTargetFlags() == PPCII::MO_PLT)
RefKind = MCSymbolRefExpr::VK_PLT;
+ const MachineFunction *MF = MO.getParent()->getParent()->getParent();
+ const PPCSubtarget *Subtarget = &(MF->getSubtarget<PPCSubtarget>());
+ const TargetMachine &TM = Printer.TM;
const MCExpr *Expr = MCSymbolRefExpr::create(Symbol, RefKind, Ctx);
+ // The -msecure-plt option works only in PIC mode. If secure PLT mode
+ // is on, add 32768 to the symbol.
+ if (Subtarget->isSecurePlt() && TM.isPositionIndependent() &&
+ MO.getTargetFlags() == PPCII::MO_PLT)
+ Expr = MCBinaryExpr::createAdd(Expr,
+ MCConstantExpr::create(32768, Ctx),
+ Ctx);
if (!MO.isJTI() && MO.getOffset())
Expr = MCBinaryExpr::createAdd(Expr,
diff --git a/lib/Target/PowerPC/PPCMIPeephole.cpp b/lib/Target/PowerPC/PPCMIPeephole.cpp
index 474661aaaee8..dbe1fe37ddf8 100644
--- a/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -55,7 +55,7 @@ FixedPointRegToImm("ppc-reg-to-imm-fixed-point", cl::Hidden, cl::init(true),
"convert reg-reg instructions to reg-imm"));
static cl::opt<bool>
-ConvertRegReg("ppc-convert-rr-to-ri", cl::Hidden, cl::init(false),
+ConvertRegReg("ppc-convert-rr-to-ri", cl::Hidden, cl::init(true),
cl::desc("Convert eligible reg+reg instructions to reg+imm"));
static cl::opt<bool>
@@ -119,8 +119,8 @@ void PPCMIPeephole::initialize(MachineFunction &MFParm) {
MRI = &MF->getRegInfo();
MDT = &getAnalysis<MachineDominatorTree>();
TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo();
- DEBUG(dbgs() << "*** PowerPC MI peephole pass ***\n\n");
- DEBUG(MF->dump());
+ LLVM_DEBUG(dbgs() << "*** PowerPC MI peephole pass ***\n\n");
+ LLVM_DEBUG(MF->dump());
}
static MachineInstr *getVRegDefOrNull(MachineOperand *Op,
@@ -190,18 +190,18 @@ getKnownLeadingZeroCount(MachineInstr *MI, const PPCInstrInfo *TII) {
}
// This function maintains a map for the pairs <TOC Save Instr, Keep>
-// Each time a new TOC save is encountered, it checks if any of the exisiting
-// ones are dominated by the new one. If so, it marks the exisiting one as
+// Each time a new TOC save is encountered, it checks if any of the existing
+// ones are dominated by the new one. If so, it marks the existing one as
// redundant by setting it's entry in the map as false. It then adds the new
// instruction to the map with either true or false depending on if any
-// exisiting instructions dominated the new one.
+// existing instructions dominated the new one.
void PPCMIPeephole::UpdateTOCSaves(
std::map<MachineInstr *, bool> &TOCSaves, MachineInstr *MI) {
assert(TII->isTOCSaveMI(*MI) && "Expecting a TOC save instruction here");
bool Keep = true;
for (auto It = TOCSaves.begin(); It != TOCSaves.end(); It++ ) {
MachineInstr *CurrInst = It->first;
- // If new instruction dominates an exisiting one, mark exisiting one as
+ // If new instruction dominates an existing one, mark existing one as
// redundant.
if (It->second && MDT->dominates(MI, CurrInst))
It->second = false;
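As the comment above explains, UpdateTOCSaves keeps one map entry per TOC save and uses dominance to decide which saves are redundant. A condensed sketch of that bookkeeping (illustrative; the template parameters stand in for MachineInstr and MachineDominatorTree):

    #include <map>

    template <typename InstrT, typename DomTreeT>
    void updateTOCSaves(std::map<InstrT *, bool> &TOCSaves, DomTreeT &MDT,
                        InstrT *MI) {
      bool Keep = true;
      for (auto &Entry : TOCSaves) {
        // A new save dominating an existing one makes the existing one
        // redundant; an existing save dominating the new one makes the new
        // one redundant.
        if (Entry.second && MDT.dominates(MI, Entry.first))
          Entry.second = false;
        if (MDT.dominates(Entry.first, MI))
          Keep = false;
      }
      TOCSaves[MI] = Keep;
    }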
@@ -220,7 +220,7 @@ bool PPCMIPeephole::simplifyCode(void) {
bool Simplified = false;
MachineInstr* ToErase = nullptr;
std::map<MachineInstr *, bool> TOCSaves;
-
+ const TargetRegisterInfo *TRI = &TII->getRegisterInfo();
NumFunctionsEnteredInMIPeephole++;
if (ConvertRegReg) {
// Fixed-point conversion of reg/reg instructions fed by load-immediate
@@ -232,14 +232,14 @@ bool PPCMIPeephole::simplifyCode(void) {
SomethingChanged = false;
for (MachineBasicBlock &MBB : *MF) {
for (MachineInstr &MI : MBB) {
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
if (TII->convertToImmediateForm(MI)) {
// We don't erase anything in case the def has other uses. Let DCE
// remove it if it can be removed.
- DEBUG(dbgs() << "Converted instruction to imm form: ");
- DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "Converted instruction to imm form: ");
+ LLVM_DEBUG(MI.dump());
NumConvertedToImmediateForm++;
SomethingChanged = true;
Simplified = true;
@@ -261,7 +261,7 @@ bool PPCMIPeephole::simplifyCode(void) {
}
// Ignore debug instructions.
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
// Per-opcode peepholes.
@@ -276,7 +276,7 @@ bool PPCMIPeephole::simplifyCode(void) {
!MF->getSubtarget<PPCSubtarget>().isELFv2ABI())
break;
// When encountering a TOC save instruction, call UpdateTOCSaves
- // to add it to the TOCSaves map and mark any exisiting TOC saves
+ // to add it to the TOCSaves map and mark any existing TOC saves
// it dominates as redundant.
if (TII->isTOCSaveMI(MI))
UpdateTOCSaves(TOCSaves, &MI);
@@ -297,9 +297,9 @@ bool PPCMIPeephole::simplifyCode(void) {
// We have to look through chains of COPY and SUBREG_TO_REG
// to find the real source values for comparison.
unsigned TrueReg1 =
- TII->lookThruCopyLike(MI.getOperand(1).getReg(), MRI);
+ TRI->lookThruCopyLike(MI.getOperand(1).getReg(), MRI);
unsigned TrueReg2 =
- TII->lookThruCopyLike(MI.getOperand(2).getReg(), MRI);
+ TRI->lookThruCopyLike(MI.getOperand(2).getReg(), MRI);
if (TrueReg1 == TrueReg2
&& TargetRegisterInfo::isVirtualRegister(TrueReg1)) {
@@ -314,7 +314,7 @@ bool PPCMIPeephole::simplifyCode(void) {
if (DefOpc != PPC::XVCVDPSXDS && DefOpc != PPC::XVCVDPUXDS)
return false;
unsigned DefReg =
- TII->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI);
+ TRI->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI);
if (TargetRegisterInfo::isVirtualRegister(DefReg)) {
MachineInstr *LoadMI = MRI->getVRegDef(DefReg);
if (LoadMI && LoadMI->getOpcode() == PPC::LXVDSX)
@@ -324,10 +324,9 @@ bool PPCMIPeephole::simplifyCode(void) {
};
if (DefMI && (Immed == 0 || Immed == 3)) {
if (DefOpc == PPC::LXVDSX || isConversionOfLoadAndSplat()) {
- DEBUG(dbgs()
- << "Optimizing load-and-splat/splat "
- "to load-and-splat/copy: ");
- DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "Optimizing load-and-splat/splat "
+ "to load-and-splat/copy: ");
+ LLVM_DEBUG(MI.dump());
BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY),
MI.getOperand(0).getReg())
.add(MI.getOperand(1));
@@ -341,15 +340,14 @@ bool PPCMIPeephole::simplifyCode(void) {
if (DefOpc == PPC::XXPERMDI) {
unsigned FeedImmed = DefMI->getOperand(3).getImm();
unsigned FeedReg1 =
- TII->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI);
+ TRI->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI);
unsigned FeedReg2 =
- TII->lookThruCopyLike(DefMI->getOperand(2).getReg(), MRI);
+ TRI->lookThruCopyLike(DefMI->getOperand(2).getReg(), MRI);
if ((FeedImmed == 0 || FeedImmed == 3) && FeedReg1 == FeedReg2) {
- DEBUG(dbgs()
- << "Optimizing splat/swap or splat/splat "
- "to splat/copy: ");
- DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "Optimizing splat/swap or splat/splat "
+ "to splat/copy: ");
+ LLVM_DEBUG(MI.dump());
BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY),
MI.getOperand(0).getReg())
.add(MI.getOperand(1));
@@ -362,8 +360,8 @@ bool PPCMIPeephole::simplifyCode(void) {
// parameter.
else if ((Immed == 0 || Immed == 3)
&& FeedImmed == 2 && FeedReg1 == FeedReg2) {
- DEBUG(dbgs() << "Optimizing swap/splat => splat: ");
- DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "Optimizing swap/splat => splat: ");
+ LLVM_DEBUG(MI.dump());
MI.getOperand(1).setReg(DefMI->getOperand(1).getReg());
MI.getOperand(2).setReg(DefMI->getOperand(2).getReg());
MI.getOperand(3).setImm(3 - Immed);
@@ -373,8 +371,8 @@ bool PPCMIPeephole::simplifyCode(void) {
// If this is a swap fed by a swap, we can replace it
// with a copy from the first swap's input.
else if (Immed == 2 && FeedImmed == 2 && FeedReg1 == FeedReg2) {
- DEBUG(dbgs() << "Optimizing swap/swap => copy: ");
- DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "Optimizing swap/swap => copy: ");
+ LLVM_DEBUG(MI.dump());
BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY),
MI.getOperand(0).getReg())
.add(DefMI->getOperand(1));
@@ -389,8 +387,8 @@ bool PPCMIPeephole::simplifyCode(void) {
DefMI->getOperand(0).setReg(MI.getOperand(0).getReg());
ToErase = &MI;
Simplified = true;
- DEBUG(dbgs() << "Removing redundant splat: ");
- DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "Removing redundant splat: ");
+ LLVM_DEBUG(MI.dump());
}
}
}
@@ -402,7 +400,7 @@ bool PPCMIPeephole::simplifyCode(void) {
unsigned MyOpcode = MI.getOpcode();
unsigned OpNo = MyOpcode == PPC::XXSPLTW ? 1 : 2;
unsigned TrueReg =
- TII->lookThruCopyLike(MI.getOperand(OpNo).getReg(), MRI);
+ TRI->lookThruCopyLike(MI.getOperand(OpNo).getReg(), MRI);
if (!TargetRegisterInfo::isVirtualRegister(TrueReg))
break;
MachineInstr *DefMI = MRI->getVRegDef(TrueReg);
@@ -429,8 +427,8 @@ bool PPCMIPeephole::simplifyCode(void) {
// If the instruction[s] that feed this splat have already splat
// the value, this splat is redundant.
if (AlreadySplat) {
- DEBUG(dbgs() << "Changing redundant splat to a copy: ");
- DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "Changing redundant splat to a copy: ");
+ LLVM_DEBUG(MI.dump());
BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY),
MI.getOperand(0).getReg())
.add(MI.getOperand(OpNo));
@@ -448,14 +446,14 @@ bool PPCMIPeephole::simplifyCode(void) {
if (ShiftOp1 == ShiftOp2) {
unsigned NewElem = (SplatImm + ShiftImm) & 0x3;
if (MRI->hasOneNonDBGUse(ShiftRes)) {
- DEBUG(dbgs() << "Removing redundant shift: ");
- DEBUG(DefMI->dump());
+ LLVM_DEBUG(dbgs() << "Removing redundant shift: ");
+ LLVM_DEBUG(DefMI->dump());
ToErase = DefMI;
}
Simplified = true;
- DEBUG(dbgs() << "Changing splat immediate from " << SplatImm <<
- " to " << NewElem << " in instruction: ");
- DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "Changing splat immediate from " << SplatImm
+ << " to " << NewElem << " in instruction: ");
+ LLVM_DEBUG(MI.dump());
MI.getOperand(1).setReg(ShiftOp1);
MI.getOperand(2).setImm(NewElem);
}
@@ -465,7 +463,7 @@ bool PPCMIPeephole::simplifyCode(void) {
case PPC::XVCVDPSP: {
// If this is a DP->SP conversion fed by an FRSP, the FRSP is redundant.
unsigned TrueReg =
- TII->lookThruCopyLike(MI.getOperand(1).getReg(), MRI);
+ TRI->lookThruCopyLike(MI.getOperand(1).getReg(), MRI);
if (!TargetRegisterInfo::isVirtualRegister(TrueReg))
break;
MachineInstr *DefMI = MRI->getVRegDef(TrueReg);
@@ -474,9 +472,9 @@ bool PPCMIPeephole::simplifyCode(void) {
// values.
if (DefMI && DefMI->getOpcode() == PPC::XXPERMDI) {
unsigned DefsReg1 =
- TII->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI);
+ TRI->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI);
unsigned DefsReg2 =
- TII->lookThruCopyLike(DefMI->getOperand(2).getReg(), MRI);
+ TRI->lookThruCopyLike(DefMI->getOperand(2).getReg(), MRI);
if (!TargetRegisterInfo::isVirtualRegister(DefsReg1) ||
!TargetRegisterInfo::isVirtualRegister(DefsReg2))
break;
@@ -499,12 +497,12 @@ bool PPCMIPeephole::simplifyCode(void) {
if (Use.getOperand(i).isReg() &&
Use.getOperand(i).getReg() == FRSPDefines)
Use.getOperand(i).setReg(ConvReg1);
- DEBUG(dbgs() << "Removing redundant FRSP:\n");
- DEBUG(RoundInstr->dump());
- DEBUG(dbgs() << "As it feeds instruction:\n");
- DEBUG(MI.dump());
- DEBUG(dbgs() << "Through instruction:\n");
- DEBUG(DefMI->dump());
+ LLVM_DEBUG(dbgs() << "Removing redundant FRSP:\n");
+ LLVM_DEBUG(RoundInstr->dump());
+ LLVM_DEBUG(dbgs() << "As it feeds instruction:\n");
+ LLVM_DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "Through instruction:\n");
+ LLVM_DEBUG(DefMI->dump());
RoundInstr->eraseFromParent();
}
};
@@ -552,11 +550,11 @@ bool PPCMIPeephole::simplifyCode(void) {
};
unsigned Opc = getSextLoadOp(is64Bit(MI.getOpcode()),
isXForm(SrcMI->getOpcode()));
- DEBUG(dbgs() << "Zero-extending load\n");
- DEBUG(SrcMI->dump());
- DEBUG(dbgs() << "and sign-extension\n");
- DEBUG(MI.dump());
- DEBUG(dbgs() << "are merged into sign-extending load\n");
+ LLVM_DEBUG(dbgs() << "Zero-extending load\n");
+ LLVM_DEBUG(SrcMI->dump());
+ LLVM_DEBUG(dbgs() << "and sign-extension\n");
+ LLVM_DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "are merged into sign-extending load\n");
SrcMI->setDesc(TII->get(Opc));
SrcMI->getOperand(0).setReg(MI.getOperand(0).getReg());
ToErase = &MI;
@@ -596,11 +594,11 @@ bool PPCMIPeephole::simplifyCode(void) {
};
unsigned Opc = getSextLoadOp(is64Bit(MI.getOpcode()),
isXForm(SrcMI->getOpcode()));
- DEBUG(dbgs() << "Zero-extending load\n");
- DEBUG(SrcMI->dump());
- DEBUG(dbgs() << "and sign-extension\n");
- DEBUG(MI.dump());
- DEBUG(dbgs() << "are merged into sign-extending load\n");
+ LLVM_DEBUG(dbgs() << "Zero-extending load\n");
+ LLVM_DEBUG(SrcMI->dump());
+ LLVM_DEBUG(dbgs() << "and sign-extension\n");
+ LLVM_DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "are merged into sign-extending load\n");
SrcMI->setDesc(TII->get(Opc));
SrcMI->getOperand(0).setReg(MI.getOperand(0).getReg());
ToErase = &MI;
@@ -610,7 +608,7 @@ bool PPCMIPeephole::simplifyCode(void) {
TII->isSignExtended(*SrcMI)) {
// We can eliminate EXTSW if the input is known to be already
// sign-extended.
- DEBUG(dbgs() << "Removing redundant sign-extension\n");
+ LLVM_DEBUG(dbgs() << "Removing redundant sign-extension\n");
unsigned TmpReg =
MF->getRegInfo().createVirtualRegister(&PPC::G8RCRegClass);
BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::IMPLICIT_DEF),
@@ -661,7 +659,7 @@ bool PPCMIPeephole::simplifyCode(void) {
unsigned KnownZeroCount = getKnownLeadingZeroCount(SrcMI, TII);
if (MI.getOperand(3).getImm() <= KnownZeroCount) {
- DEBUG(dbgs() << "Removing redundant zero-extension\n");
+ LLVM_DEBUG(dbgs() << "Removing redundant zero-extension\n");
BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY),
MI.getOperand(0).getReg())
.addReg(SrcReg);
@@ -727,8 +725,8 @@ bool PPCMIPeephole::simplifyCode(void) {
MachineInstr *DefPhiMI = getVRegDefOrNull(&Op1, MRI);
for (unsigned i = 1; i < DefPhiMI->getNumOperands(); i += 2) {
MachineInstr *LiMI = getVRegDefOrNull(&DefPhiMI->getOperand(i), MRI);
- DEBUG(dbgs() << "Optimizing LI to ADDI: ");
- DEBUG(LiMI->dump());
+ LLVM_DEBUG(dbgs() << "Optimizing LI to ADDI: ");
+ LLVM_DEBUG(LiMI->dump());
// There could be repeated registers in the PHI, e.g: %1 =
// PHI %6, <%bb.2>, %8, <%bb.3>, %8, <%bb.6>; So if we've
@@ -746,12 +744,12 @@ bool PPCMIPeephole::simplifyCode(void) {
MachineInstrBuilder(*LiMI->getParent()->getParent(), *LiMI)
.addReg(DominatorReg)
.addImm(LiImm); // restore the imm of LI
- DEBUG(LiMI->dump());
+ LLVM_DEBUG(LiMI->dump());
}
// Replace ADD with COPY
- DEBUG(dbgs() << "Optimizing ADD to COPY: ");
- DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "Optimizing ADD to COPY: ");
+ LLVM_DEBUG(MI.dump());
BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY),
MI.getOperand(0).getReg())
.add(Op1);
@@ -849,7 +847,7 @@ static unsigned getPredicateToIncImm(MachineInstr *BI, MachineInstr *CMPI) {
return 0;
}
-// This takes a Phi node and returns a register value for the spefied BB.
+// This takes a Phi node and returns a register value for the specified BB.
static unsigned getIncomingRegForBlock(MachineInstr *Phi,
MachineBasicBlock *MBB) {
for (unsigned I = 2, E = Phi->getNumOperands() + 1; I != E; I += 2) {
@@ -979,9 +977,9 @@ static bool eligibleForCompareElimination(MachineBasicBlock &MBB,
}
// This function will iterate over the input map containing a pair of TOC save
-// instruction and a flag. The flag will be set to false if the TOC save is proven
-// redundant. This function will erase from the basic block all the TOC saves
-// marked as redundant.
+// instruction and a flag. The flag will be set to false if the TOC save is
+// proven redundant. This function will erase from the basic block all the TOC
+// saves marked as redundant.
bool PPCMIPeephole::eliminateRedundantTOCSaves(
std::map<MachineInstr *, bool> &TOCSaves) {
bool Simplified = false;
@@ -1192,16 +1190,16 @@ bool PPCMIPeephole::eliminateRedundantCompare(void) {
}
}
- // We cannnot merge two compares if the immediates are not same.
+ // We cannot merge two compares if the immediates are not same.
if (NewImm2 != NewImm1)
continue;
}
- DEBUG(dbgs() << "Optimize two pairs of compare and branch:\n");
- DEBUG(CMPI1->dump());
- DEBUG(BI1->dump());
- DEBUG(CMPI2->dump());
- DEBUG(BI2->dump());
+ LLVM_DEBUG(dbgs() << "Optimize two pairs of compare and branch:\n");
+ LLVM_DEBUG(CMPI1->dump());
+ LLVM_DEBUG(BI1->dump());
+ LLVM_DEBUG(CMPI2->dump());
+ LLVM_DEBUG(BI2->dump());
// We adjust opcode, predicates and immediate as we determined above.
if (NewOpCode != 0 && NewOpCode != CMPI1->getOpcode()) {
@@ -1260,15 +1258,15 @@ bool PPCMIPeephole::eliminateRedundantCompare(void) {
BI2->getOperand(1).setIsKill(true);
BI1->getOperand(1).setIsKill(false);
- DEBUG(dbgs() << "into a compare and two branches:\n");
- DEBUG(CMPI1->dump());
- DEBUG(BI1->dump());
- DEBUG(BI2->dump());
+ LLVM_DEBUG(dbgs() << "into a compare and two branches:\n");
+ LLVM_DEBUG(CMPI1->dump());
+ LLVM_DEBUG(BI1->dump());
+ LLVM_DEBUG(BI2->dump());
if (IsPartiallyRedundant) {
- DEBUG(dbgs() << "The following compare is moved into "
- << printMBBReference(*MBBtoMoveCmp)
- << " to handle partial redundancy.\n");
- DEBUG(CMPI2->dump());
+ LLVM_DEBUG(dbgs() << "The following compare is moved into "
+ << printMBBReference(*MBBtoMoveCmp)
+ << " to handle partial redundancy.\n");
+ LLVM_DEBUG(CMPI2->dump());
}
Simplified = true;
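The bulk of this PPCMIPeephole.cpp hunk is the mechanical rename of the old DEBUG macro to LLVM_DEBUG, plus the move of lookThruCopyLike from the TII (instruction info) object to the TRI (register info) object. A minimal sketch of the LLVM_DEBUG idiom, assuming a placeholder DEBUG_TYPE rather than the pass's real tag:

#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Placeholder tag; the real pass defines its own DEBUG_TYPE.
#define DEBUG_TYPE "my-ppc-peephole"

static void noteSimplification(unsigned Count) {
  // Compiled away in NDEBUG builds; otherwise printed only under
  // -debug or -debug-only=my-ppc-peephole.
  LLVM_DEBUG(dbgs() << "simplified " << Count << " instruction(s)\n");
}

Enabling the output at run time is then a matter of passing -debug-only=my-ppc-peephole to a compiler built with assertions enabled.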
diff --git a/lib/Target/PowerPC/PPCMachineBasicBlockUtils.h b/lib/Target/PowerPC/PPCMachineBasicBlockUtils.h
deleted file mode 100644
index 628ea2ab9fe6..000000000000
--- a/lib/Target/PowerPC/PPCMachineBasicBlockUtils.h
+++ /dev/null
@@ -1,198 +0,0 @@
-//==-- PPCMachineBasicBlockUtils.h - Functions for common MBB operations ---==//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines utility functions for commonly used operations on
-// MachineBasicBlock's.
-// NOTE: Include this file after defining DEBUG_TYPE so that the debug messages
-// can be emitted for the pass that is using this.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_PPC_MACHINE_BASIC_BLOCK_UTILS_H
-#define LLVM_LIB_TARGET_PPC_MACHINE_BASIC_BLOCK_UTILS_H
-
-#include "PPCInstrInfo.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-
-#ifndef DEBUG_TYPE
-#define DEBUG_TYPE "ppc-generic-mbb-utilities"
-#endif
-
-using namespace llvm;
-
-/// Given a basic block \p Successor that potentially contains PHIs, this
-/// function will look for any incoming values in the PHIs that are supposed to
-/// be coming from \p OrigMBB but whose definition is actually in \p NewMBB.
-/// Any such PHIs will be updated to reflect reality.
-static void updatePHIs(MachineBasicBlock *Successor, MachineBasicBlock *OrigMBB,
- MachineBasicBlock *NewMBB, MachineRegisterInfo *MRI) {
- for (auto &MI : Successor->instrs()) {
- if (!MI.isPHI())
- continue;
- // This is a really ugly-looking loop, but it was pillaged directly from
- // MachineBasicBlock::transferSuccessorsAndUpdatePHIs().
- for (unsigned i = 2, e = MI.getNumOperands()+1; i != e; i += 2) {
- MachineOperand &MO = MI.getOperand(i);
- if (MO.getMBB() == OrigMBB) {
- // Check if the instruction is actualy defined in NewMBB.
- if (MI.getOperand(i-1).isReg()) {
- MachineInstr *DefMI = MRI->getVRegDef(MI.getOperand(i-1).getReg());
- if (DefMI->getParent() == NewMBB || !OrigMBB->isSuccessor(Successor)) {
- MO.setMBB(NewMBB);
- break;
- }
- }
- }
- }
- }
-}
-
-/// Given a basic block \p Successor that potentially contains PHIs, this
-/// function will look for PHIs that have an incoming value from \p OrigMBB
-/// and will add the same incoming value from \p NewMBB.
-/// NOTE: This should only be used if \p NewMBB is an immediate dominator of
-/// \p OrigMBB.
-static void addIncomingValuesToPHIs(MachineBasicBlock *Successor,
- MachineBasicBlock *OrigMBB,
- MachineBasicBlock *NewMBB,
- MachineRegisterInfo *MRI) {
- assert(OrigMBB->isSuccessor(NewMBB) && "NewMBB must be a sucessor of OrigMBB");
- for (auto &MI : Successor->instrs()) {
- if (!MI.isPHI())
- continue;
- // This is a really ugly-looking loop, but it was pillaged directly from
- // MachineBasicBlock::transferSuccessorsAndUpdatePHIs().
- for (unsigned i = 2, e = MI.getNumOperands()+1; i != e; i += 2) {
- MachineOperand &MO = MI.getOperand(i);
- if (MO.getMBB() == OrigMBB) {
- MachineInstrBuilder MIB(*MI.getParent()->getParent(), &MI);
- MIB.addReg(MI.getOperand(i-1).getReg()).addMBB(NewMBB);
- break;
- }
- }
- }
-}
-
-struct BlockSplitInfo {
- MachineInstr *OrigBranch;
- MachineInstr *SplitBefore;
- MachineInstr *SplitCond;
- bool InvertNewBranch;
- bool InvertOrigBranch;
- bool BranchToFallThrough;
- const MachineBranchProbabilityInfo *MBPI;
- MachineInstr *MIToDelete;
- MachineInstr *NewCond;
- bool allInstrsInSameMBB() {
- if (!OrigBranch || !SplitBefore || !SplitCond)
- return false;
- MachineBasicBlock *MBB = OrigBranch->getParent();
- if (SplitBefore->getParent() != MBB ||
- SplitCond->getParent() != MBB)
- return false;
- if (MIToDelete && MIToDelete->getParent() != MBB)
- return false;
- if (NewCond && NewCond->getParent() != MBB)
- return false;
- return true;
- }
-};
-
-/// Splits a MachineBasicBlock to branch before \p SplitBefore. The original
-/// branch is \p OrigBranch. The target of the new branch can either be the same
-/// as the target of the original branch or the fallthrough successor of the
-/// original block as determined by \p BranchToFallThrough. The branch
-/// conditions will be inverted according to \p InvertNewBranch and
-/// \p InvertOrigBranch. If an instruction that previously fed the branch is to
-/// be deleted, it is provided in \p MIToDelete and \p NewCond will be used as
-/// the branch condition. The branch probabilities will be set if the
-/// MachineBranchProbabilityInfo isn't null.
-static bool splitMBB(BlockSplitInfo &BSI) {
- assert(BSI.allInstrsInSameMBB() &&
- "All instructions must be in the same block.");
-
- MachineBasicBlock *ThisMBB = BSI.OrigBranch->getParent();
- MachineFunction *MF = ThisMBB->getParent();
- MachineRegisterInfo *MRI = &MF->getRegInfo();
- assert(MRI->isSSA() && "Can only do this while the function is in SSA form.");
- if (ThisMBB->succ_size() != 2) {
- DEBUG(dbgs() << "Don't know how to handle blocks that don't have exactly"
- << " two succesors.\n");
- return false;
- }
-
- const PPCInstrInfo *TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo();
- unsigned OrigBROpcode = BSI.OrigBranch->getOpcode();
- unsigned InvertedOpcode =
- OrigBROpcode == PPC::BC ? PPC::BCn :
- OrigBROpcode == PPC::BCn ? PPC::BC :
- OrigBROpcode == PPC::BCLR ? PPC::BCLRn : PPC::BCLR;
- unsigned NewBROpcode = BSI.InvertNewBranch ? InvertedOpcode : OrigBROpcode;
- MachineBasicBlock *OrigTarget = BSI.OrigBranch->getOperand(1).getMBB();
- MachineBasicBlock *OrigFallThrough =
- OrigTarget == *ThisMBB->succ_begin() ? *ThisMBB->succ_rbegin() :
- *ThisMBB->succ_begin();
- MachineBasicBlock *NewBRTarget =
- BSI.BranchToFallThrough ? OrigFallThrough : OrigTarget;
- BranchProbability ProbToNewTarget =
- !BSI.MBPI ? BranchProbability::getUnknown() :
- BSI.MBPI->getEdgeProbability(ThisMBB, NewBRTarget);
-
- // Create a new basic block.
- MachineBasicBlock::iterator InsertPoint = BSI.SplitBefore;
- const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
- MachineFunction::iterator It = ThisMBB->getIterator();
- MachineBasicBlock *NewMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MF->insert(++It, NewMBB);
-
- // Move everything after SplitBefore into the new block.
- NewMBB->splice(NewMBB->end(), ThisMBB, InsertPoint, ThisMBB->end());
- NewMBB->transferSuccessors(ThisMBB);
-
- // Add the two successors to ThisMBB. The probabilities come from the
- // existing blocks if available.
- ThisMBB->addSuccessor(NewBRTarget, ProbToNewTarget);
- ThisMBB->addSuccessor(NewMBB, ProbToNewTarget.getCompl());
-
- // Add the branches to ThisMBB.
- BuildMI(*ThisMBB, ThisMBB->end(), BSI.SplitBefore->getDebugLoc(),
- TII->get(NewBROpcode)).addReg(BSI.SplitCond->getOperand(0).getReg())
- .addMBB(NewBRTarget);
- BuildMI(*ThisMBB, ThisMBB->end(), BSI.SplitBefore->getDebugLoc(),
- TII->get(PPC::B)).addMBB(NewMBB);
- if (BSI.MIToDelete)
- BSI.MIToDelete->eraseFromParent();
-
- // Change the condition on the original branch and invert it if requested.
- auto FirstTerminator = NewMBB->getFirstTerminator();
- if (BSI.NewCond) {
- assert(FirstTerminator->getOperand(0).isReg() &&
- "Can't update condition of unconditional branch.");
- FirstTerminator->getOperand(0).setReg(BSI.NewCond->getOperand(0).getReg());
- }
- if (BSI.InvertOrigBranch)
- FirstTerminator->setDesc(TII->get(InvertedOpcode));
-
- // If any of the PHIs in the successors of NewMBB reference values that
- // now come from NewMBB, they need to be updated.
- for (auto *Succ : NewMBB->successors()) {
- updatePHIs(Succ, ThisMBB, NewMBB, MRI);
- }
- addIncomingValuesToPHIs(NewBRTarget, ThisMBB, NewMBB, MRI);
-
- DEBUG(dbgs() << "After splitting, ThisMBB:\n"; ThisMBB->dump());
- DEBUG(dbgs() << "NewMBB:\n"; NewMBB->dump());
- DEBUG(dbgs() << "New branch-to block:\n"; NewBRTarget->dump());
- return true;
-}
-
-
-#endif
diff --git a/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/lib/Target/PowerPC/PPCMachineFunctionInfo.h
index a9b6073106ea..b14bbad2039a 100644
--- a/lib/Target/PowerPC/PPCMachineFunctionInfo.h
+++ b/lib/Target/PowerPC/PPCMachineFunctionInfo.h
@@ -45,6 +45,11 @@ class PPCFunctionInfo : public MachineFunctionInfo {
/// PEI.
bool MustSaveLR;
+ /// Do we have to disable shrink-wrapping? This has to be set if we emit any
+ /// instructions that clobber LR in the entry block because discovering this
+ /// in PEI is too late (happens after shrink-wrapping);
+ bool ShrinkWrapDisabled = false;
+
/// Does this function have any stack spills.
bool HasSpills = false;
@@ -147,6 +152,12 @@ public:
void setMustSaveLR(bool U) { MustSaveLR = U; }
bool mustSaveLR() const { return MustSaveLR; }
+ /// We certainly don't want to shrink wrap functions if we've emitted a
+ /// MovePCtoLR8 as that has to go into the entry, so the prologue definitely
+ /// has to go into the entry block.
+ void setShrinkWrapDisabled(bool U) { ShrinkWrapDisabled = U; }
+ bool shrinkWrapDisabled() const { return ShrinkWrapDisabled; }
+
void setHasSpills() { HasSpills = true; }
bool hasSpills() const { return HasSpills; }
@@ -185,11 +196,11 @@ public:
LiveInAttrs.push_back(std::make_pair(VReg, Flags));
}
- /// This function returns true if the spesified vreg is
+ /// This function returns true if the specified vreg is
/// a live-in register and sign-extended.
bool isLiveInSExt(unsigned VReg) const;
- /// This function returns true if the spesified vreg is
+ /// This function returns true if the specified vreg is
/// a live-in register and zero-extended.
bool isLiveInZExt(unsigned VReg) const;
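The new ShrinkWrapDisabled flag lets earlier passes record, before PEI runs, that the entry block already contains an LR-clobbering instruction such as MovePCtoLR8. A hedged sketch of a producer and a consumer of the flag; the consumer hook shown here is an assumption for illustration and is not part of this hunk:

#include "PPCMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
using namespace llvm;

// Producer sketch: mark the function when emitting an instruction that
// clobbers LR in the entry block.
static void markLRClobberInEntry(MachineFunction &MF) {
  MF.getInfo<PPCFunctionInfo>()->setShrinkWrapDisabled(true);
}

// Consumer sketch (assumed hook): frame lowering can then veto
// shrink-wrapping, since the prologue must stay in the entry block.
static bool mayShrinkWrap(const MachineFunction &MF) {
  return !MF.getInfo<PPCFunctionInfo>()->shrinkWrapDisabled();
}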
diff --git a/lib/Target/PowerPC/PPCPreEmitPeephole.cpp b/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
index 9501f0f89b81..1892d1e3dc26 100644
--- a/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
+++ b/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
@@ -35,7 +35,7 @@ STATISTIC(NumRemovedInPreEmit,
"Number of instructions deleted in pre-emit peephole");
static cl::opt<bool>
-RunPreEmitPeephole("ppc-late-peephole", cl::Hidden, cl::init(false),
+RunPreEmitPeephole("ppc-late-peephole", cl::Hidden, cl::init(true),
cl::desc("Run pre-emit peephole optimizations."));
namespace {
@@ -67,8 +67,8 @@ namespace {
if (TII->convertToImmediateForm(MI, &DefMIToErase)) {
Changed = true;
NumRRConvertedInPreEmit++;
- DEBUG(dbgs() << "Converted instruction to imm form: ");
- DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "Converted instruction to imm form: ");
+ LLVM_DEBUG(MI.dump());
if (DefMIToErase) {
InstrsToErase.push_back(DefMIToErase);
}
@@ -76,8 +76,8 @@ namespace {
}
}
for (MachineInstr *MI : InstrsToErase) {
- DEBUG(dbgs() << "PPC pre-emit peephole: erasing instruction: ");
- DEBUG(MI->dump());
+ LLVM_DEBUG(dbgs() << "PPC pre-emit peephole: erasing instruction: ");
+ LLVM_DEBUG(MI->dump());
MI->eraseFromParent();
NumRemovedInPreEmit++;
}
diff --git a/lib/Target/PowerPC/PPCReduceCRLogicals.cpp b/lib/Target/PowerPC/PPCReduceCRLogicals.cpp
index 5b2d7191683c..173fc18b9ebf 100644
--- a/lib/Target/PowerPC/PPCReduceCRLogicals.cpp
+++ b/lib/Target/PowerPC/PPCReduceCRLogicals.cpp
@@ -15,18 +15,21 @@
//
//===---------------------------------------------------------------------===//
-#include "PPCInstrInfo.h"
#include "PPC.h"
+#include "PPCInstrInfo.h"
#include "PPCTargetMachine.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Debug.h"
-#include "llvm/ADT/Statistic.h"
using namespace llvm;
#define DEBUG_TYPE "ppc-reduce-cr-ops"
-#include "PPCMachineBasicBlockUtils.h"
STATISTIC(NumContainedSingleUseBinOps,
"Number of single-use binary CR logical ops contained in a block");
@@ -50,7 +53,177 @@ namespace llvm {
void initializePPCReduceCRLogicalsPass(PassRegistry&);
}
-namespace {
+/// Given a basic block \p Successor that potentially contains PHIs, this
+/// function will look for any incoming values in the PHIs that are supposed to
+/// be coming from \p OrigMBB but whose definition is actually in \p NewMBB.
+/// Any such PHIs will be updated to reflect reality.
+static void updatePHIs(MachineBasicBlock *Successor, MachineBasicBlock *OrigMBB,
+ MachineBasicBlock *NewMBB, MachineRegisterInfo *MRI) {
+ for (auto &MI : Successor->instrs()) {
+ if (!MI.isPHI())
+ continue;
+ // This is a really ugly-looking loop, but it was pillaged directly from
+ // MachineBasicBlock::transferSuccessorsAndUpdatePHIs().
+ for (unsigned i = 2, e = MI.getNumOperands() + 1; i != e; i += 2) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (MO.getMBB() == OrigMBB) {
+ // Check if the instruction is actually defined in NewMBB.
+ if (MI.getOperand(i - 1).isReg()) {
+ MachineInstr *DefMI = MRI->getVRegDef(MI.getOperand(i - 1).getReg());
+ if (DefMI->getParent() == NewMBB ||
+ !OrigMBB->isSuccessor(Successor)) {
+ MO.setMBB(NewMBB);
+ break;
+ }
+ }
+ }
+ }
+ }
+}
+
+/// Given a basic block \p Successor that potentially contains PHIs, this
+/// function will look for PHIs that have an incoming value from \p OrigMBB
+/// and will add the same incoming value from \p NewMBB.
+/// NOTE: This should only be used if \p NewMBB is an immediate dominator of
+/// \p OrigMBB.
+static void addIncomingValuesToPHIs(MachineBasicBlock *Successor,
+ MachineBasicBlock *OrigMBB,
+ MachineBasicBlock *NewMBB,
+ MachineRegisterInfo *MRI) {
+ assert(OrigMBB->isSuccessor(NewMBB) &&
+ "NewMBB must be a successor of OrigMBB");
+ for (auto &MI : Successor->instrs()) {
+ if (!MI.isPHI())
+ continue;
+ // This is a really ugly-looking loop, but it was pillaged directly from
+ // MachineBasicBlock::transferSuccessorsAndUpdatePHIs().
+ for (unsigned i = 2, e = MI.getNumOperands() + 1; i != e; i += 2) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (MO.getMBB() == OrigMBB) {
+ MachineInstrBuilder MIB(*MI.getParent()->getParent(), &MI);
+ MIB.addReg(MI.getOperand(i - 1).getReg()).addMBB(NewMBB);
+ break;
+ }
+ }
+ }
+}
+
+struct BlockSplitInfo {
+ MachineInstr *OrigBranch;
+ MachineInstr *SplitBefore;
+ MachineInstr *SplitCond;
+ bool InvertNewBranch;
+ bool InvertOrigBranch;
+ bool BranchToFallThrough;
+ const MachineBranchProbabilityInfo *MBPI;
+ MachineInstr *MIToDelete;
+ MachineInstr *NewCond;
+ bool allInstrsInSameMBB() {
+ if (!OrigBranch || !SplitBefore || !SplitCond)
+ return false;
+ MachineBasicBlock *MBB = OrigBranch->getParent();
+ if (SplitBefore->getParent() != MBB || SplitCond->getParent() != MBB)
+ return false;
+ if (MIToDelete && MIToDelete->getParent() != MBB)
+ return false;
+ if (NewCond && NewCond->getParent() != MBB)
+ return false;
+ return true;
+ }
+};
+
+/// Splits a MachineBasicBlock to branch before \p SplitBefore. The original
+/// branch is \p OrigBranch. The target of the new branch can either be the same
+/// as the target of the original branch or the fallthrough successor of the
+/// original block as determined by \p BranchToFallThrough. The branch
+/// conditions will be inverted according to \p InvertNewBranch and
+/// \p InvertOrigBranch. If an instruction that previously fed the branch is to
+/// be deleted, it is provided in \p MIToDelete and \p NewCond will be used as
+/// the branch condition. The branch probabilities will be set if the
+/// MachineBranchProbabilityInfo isn't null.
+static bool splitMBB(BlockSplitInfo &BSI) {
+ assert(BSI.allInstrsInSameMBB() &&
+ "All instructions must be in the same block.");
+
+ MachineBasicBlock *ThisMBB = BSI.OrigBranch->getParent();
+ MachineFunction *MF = ThisMBB->getParent();
+ MachineRegisterInfo *MRI = &MF->getRegInfo();
+ assert(MRI->isSSA() && "Can only do this while the function is in SSA form.");
+ if (ThisMBB->succ_size() != 2) {
+ LLVM_DEBUG(
+ dbgs() << "Don't know how to handle blocks that don't have exactly"
+ << " two successors.\n");
+ return false;
+ }
+
+ const PPCInstrInfo *TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo();
+ unsigned OrigBROpcode = BSI.OrigBranch->getOpcode();
+ unsigned InvertedOpcode =
+ OrigBROpcode == PPC::BC
+ ? PPC::BCn
+ : OrigBROpcode == PPC::BCn
+ ? PPC::BC
+ : OrigBROpcode == PPC::BCLR ? PPC::BCLRn : PPC::BCLR;
+ unsigned NewBROpcode = BSI.InvertNewBranch ? InvertedOpcode : OrigBROpcode;
+ MachineBasicBlock *OrigTarget = BSI.OrigBranch->getOperand(1).getMBB();
+ MachineBasicBlock *OrigFallThrough = OrigTarget == *ThisMBB->succ_begin()
+ ? *ThisMBB->succ_rbegin()
+ : *ThisMBB->succ_begin();
+ MachineBasicBlock *NewBRTarget =
+ BSI.BranchToFallThrough ? OrigFallThrough : OrigTarget;
+ BranchProbability ProbToNewTarget =
+ !BSI.MBPI ? BranchProbability::getUnknown()
+ : BSI.MBPI->getEdgeProbability(ThisMBB, NewBRTarget);
+
+ // Create a new basic block.
+ MachineBasicBlock::iterator InsertPoint = BSI.SplitBefore;
+ const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
+ MachineFunction::iterator It = ThisMBB->getIterator();
+ MachineBasicBlock *NewMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MF->insert(++It, NewMBB);
+
+ // Move everything after SplitBefore into the new block.
+ NewMBB->splice(NewMBB->end(), ThisMBB, InsertPoint, ThisMBB->end());
+ NewMBB->transferSuccessors(ThisMBB);
+
+ // Add the two successors to ThisMBB. The probabilities come from the
+ // existing blocks if available.
+ ThisMBB->addSuccessor(NewBRTarget, ProbToNewTarget);
+ ThisMBB->addSuccessor(NewMBB, ProbToNewTarget.getCompl());
+
+ // Add the branches to ThisMBB.
+ BuildMI(*ThisMBB, ThisMBB->end(), BSI.SplitBefore->getDebugLoc(),
+ TII->get(NewBROpcode))
+ .addReg(BSI.SplitCond->getOperand(0).getReg())
+ .addMBB(NewBRTarget);
+ BuildMI(*ThisMBB, ThisMBB->end(), BSI.SplitBefore->getDebugLoc(),
+ TII->get(PPC::B))
+ .addMBB(NewMBB);
+ if (BSI.MIToDelete)
+ BSI.MIToDelete->eraseFromParent();
+
+ // Change the condition on the original branch and invert it if requested.
+ auto FirstTerminator = NewMBB->getFirstTerminator();
+ if (BSI.NewCond) {
+ assert(FirstTerminator->getOperand(0).isReg() &&
+ "Can't update condition of unconditional branch.");
+ FirstTerminator->getOperand(0).setReg(BSI.NewCond->getOperand(0).getReg());
+ }
+ if (BSI.InvertOrigBranch)
+ FirstTerminator->setDesc(TII->get(InvertedOpcode));
+
+ // If any of the PHIs in the successors of NewMBB reference values that
+ // now come from NewMBB, they need to be updated.
+ for (auto *Succ : NewMBB->successors()) {
+ updatePHIs(Succ, ThisMBB, NewMBB, MRI);
+ }
+ addIncomingValuesToPHIs(NewBRTarget, ThisMBB, NewMBB, MRI);
+
+ LLVM_DEBUG(dbgs() << "After splitting, ThisMBB:\n"; ThisMBB->dump());
+ LLVM_DEBUG(dbgs() << "NewMBB:\n"; NewMBB->dump());
+ LLVM_DEBUG(dbgs() << "New branch-to block:\n"; NewBRTarget->dump());
+ return true;
+}
static bool isBinary(MachineInstr &MI) {
return MI.getNumOperands() == 3;
@@ -149,6 +322,8 @@ computeBranchTargetAndInversion(unsigned CROp, unsigned BROp, bool UsingDef1,
llvm_unreachable("Don't know how to handle this branch.");
}
+namespace {
+
class PPCReduceCRLogicals : public MachineFunctionPass {
public:
@@ -317,7 +492,7 @@ PPCReduceCRLogicals::createCRLogicalOpInfo(MachineInstr &MIParam) {
Ret.ContainedInBlock &=
(MIParam.getParent() == Ret.TrueDefs.second->getParent());
}
- DEBUG(Ret.dump());
+ LLVM_DEBUG(Ret.dump());
if (Ret.IsBinary && Ret.ContainedInBlock && Ret.SingleUse) {
NumContainedSingleUseBinOps++;
if (Ret.FeedsBR && Ret.DefsSingleUse)
@@ -326,7 +501,7 @@ PPCReduceCRLogicals::createCRLogicalOpInfo(MachineInstr &MIParam) {
return Ret;
}
-/// Looks trhough a COPY instruction to the actual definition of the CR-bit
+/// Looks through a COPY instruction to the actual definition of the CR-bit
/// register and returns the instruction that defines it.
/// FIXME: This currently handles what is by-far the most common case:
/// an instruction that defines a CR field followed by a single copy of a bit
@@ -411,14 +586,15 @@ bool PPCReduceCRLogicals::handleCROp(CRLogicalOpInfo &CRI) {
/// BC %vr9<kill>, <BB#2>; CRBITRC:%vr9
bool PPCReduceCRLogicals::splitBlockOnBinaryCROp(CRLogicalOpInfo &CRI) {
if (CRI.CopyDefs.first == CRI.CopyDefs.second) {
- DEBUG(dbgs() << "Unable to split as the two operands are the same\n");
+ LLVM_DEBUG(dbgs() << "Unable to split as the two operands are the same\n");
NumNotSplitIdenticalOperands++;
return false;
}
if (CRI.TrueDefs.first->isCopy() || CRI.TrueDefs.second->isCopy() ||
CRI.TrueDefs.first->isPHI() || CRI.TrueDefs.second->isPHI()) {
- DEBUG(dbgs() << "Unable to split because one of the operands is a PHI or "
- "chain of copies.\n");
+ LLVM_DEBUG(
+ dbgs() << "Unable to split because one of the operands is a PHI or "
+ "chain of copies.\n");
NumNotSplitChainCopies++;
return false;
}
@@ -429,11 +605,11 @@ bool PPCReduceCRLogicals::splitBlockOnBinaryCROp(CRLogicalOpInfo &CRI) {
CRI.MI->getOpcode() != PPC::CRNAND &&
CRI.MI->getOpcode() != PPC::CRORC &&
CRI.MI->getOpcode() != PPC::CRANDC) {
- DEBUG(dbgs() << "Unable to split blocks on this opcode.\n");
+ LLVM_DEBUG(dbgs() << "Unable to split blocks on this opcode.\n");
NumNotSplitWrongOpcode++;
return false;
}
- DEBUG(dbgs() << "Splitting the following CR op:\n"; CRI.dump());
+ LLVM_DEBUG(dbgs() << "Splitting the following CR op:\n"; CRI.dump());
MachineBasicBlock::iterator Def1It = CRI.TrueDefs.first;
MachineBasicBlock::iterator Def2It = CRI.TrueDefs.second;
@@ -447,9 +623,9 @@ bool PPCReduceCRLogicals::splitBlockOnBinaryCROp(CRLogicalOpInfo &CRI) {
}
}
- DEBUG(dbgs() << "We will split the following block:\n";);
- DEBUG(CRI.MI->getParent()->dump());
- DEBUG(dbgs() << "Before instruction:\n"; SplitBefore->dump());
+ LLVM_DEBUG(dbgs() << "We will split the following block:\n";);
+ LLVM_DEBUG(CRI.MI->getParent()->dump());
+ LLVM_DEBUG(dbgs() << "Before instruction:\n"; SplitBefore->dump());
// Get the branch instruction.
MachineInstr *Branch =
@@ -482,10 +658,11 @@ bool PPCReduceCRLogicals::splitBlockOnBinaryCROp(CRLogicalOpInfo &CRI) {
TargetIsFallThrough);
MachineInstr *SplitCond =
UsingDef1 ? CRI.CopyDefs.second : CRI.CopyDefs.first;
- DEBUG(dbgs() << "We will " << (InvertNewBranch ? "invert" : "copy"));
- DEBUG(dbgs() << " the original branch and the target is the " <<
- (TargetIsFallThrough ? "fallthrough block\n" : "orig. target block\n"));
- DEBUG(dbgs() << "Original branch instruction: "; Branch->dump());
+ LLVM_DEBUG(dbgs() << "We will " << (InvertNewBranch ? "invert" : "copy"));
+ LLVM_DEBUG(dbgs() << " the original branch and the target is the "
+ << (TargetIsFallThrough ? "fallthrough block\n"
+ : "orig. target block\n"));
+ LLVM_DEBUG(dbgs() << "Original branch instruction: "; Branch->dump());
BlockSplitInfo BSI { Branch, SplitBefore, SplitCond, InvertNewBranch,
InvertOrigBranch, TargetIsFallThrough, MBPI, CRI.MI,
UsingDef1 ? CRI.CopyDefs.first : CRI.CopyDefs.second };
@@ -522,7 +699,7 @@ void PPCReduceCRLogicals::collectCRLogicals() {
}
}
-} // end annonymous namespace
+} // end anonymous namespace
INITIALIZE_PASS_BEGIN(PPCReduceCRLogicals, DEBUG_TYPE,
"PowerPC Reduce CR logical Operation", false, false)
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 6b62a82ef7bf..6647ceace5eb 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -65,6 +65,12 @@ static cl::opt<bool>
EnableGPRToVecSpills("ppc-enable-gpr-to-vsr-spills", cl::Hidden, cl::init(false),
cl::desc("Enable spills from gpr to vsr rather than stack"));
+static cl::opt<bool>
+StackPtrConst("ppc-stack-ptr-caller-preserved",
+ cl::desc("Consider R1 caller preserved so stack saves of "
+ "caller preserved registers can be LICM candidates"),
+ cl::init(true), cl::Hidden);
+
PPCRegisterInfo::PPCRegisterInfo(const PPCTargetMachine &TM)
: PPCGenRegisterInfo(TM.isPPC64() ? PPC::LR8 : PPC::LR,
TM.isPPC64() ? 0 : 1,
@@ -100,6 +106,12 @@ PPCRegisterInfo::PPCRegisterInfo(const PPCTargetMachine &TM)
ImmToIdxMap[PPC::STXV] = PPC::STXVX;
ImmToIdxMap[PPC::STXSD] = PPC::STXSDX;
ImmToIdxMap[PPC::STXSSP] = PPC::STXSSPX;
+
+ // SPE
+ ImmToIdxMap[PPC::EVLDD] = PPC::EVLDDX;
+ ImmToIdxMap[PPC::EVSTDD] = PPC::EVSTDDX;
+ ImmToIdxMap[PPC::SPESTW] = PPC::SPESTWX;
+ ImmToIdxMap[PPC::SPELWZ] = PPC::SPELWZX;
}
/// getPointerRegClass - Return the register class to use to hold pointers.
@@ -141,9 +153,23 @@ PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
if (TM.isPPC64() && MF->getInfo<PPCFunctionInfo>()->isSplitCSR())
return CSR_SRV464_TLS_PE_SaveList;
+ if (Subtarget.hasSPE())
+ return CSR_SVR432_SPE_SaveList;
+
// On PPC64, we might need to save r2 (but only if it is not reserved).
bool SaveR2 = MF->getRegInfo().isAllocatable(PPC::X2);
+ if (MF->getFunction().getCallingConv() == CallingConv::Cold) {
+ return TM.isPPC64()
+ ? (Subtarget.hasAltivec()
+ ? (SaveR2 ? CSR_SVR64_ColdCC_R2_Altivec_SaveList
+ : CSR_SVR64_ColdCC_Altivec_SaveList)
+ : (SaveR2 ? CSR_SVR64_ColdCC_R2_SaveList
+ : CSR_SVR64_ColdCC_SaveList))
+ : (Subtarget.hasAltivec() ? CSR_SVR32_ColdCC_Altivec_SaveList
+ : CSR_SVR32_ColdCC_SaveList);
+ }
+
return TM.isPPC64()
? (Subtarget.hasAltivec()
? (SaveR2 ? CSR_SVR464_R2_Altivec_SaveList
@@ -196,6 +222,13 @@ PPCRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
: (Subtarget.hasAltivec() ? CSR_Darwin32_Altivec_RegMask
: CSR_Darwin32_RegMask);
+ if (CC == CallingConv::Cold) {
+ return TM.isPPC64() ? (Subtarget.hasAltivec() ? CSR_SVR64_ColdCC_Altivec_RegMask
+ : CSR_SVR64_ColdCC_RegMask)
+ : (Subtarget.hasAltivec() ? CSR_SVR32_ColdCC_Altivec_RegMask
+ : CSR_SVR32_ColdCC_RegMask);
+ }
+
return TM.isPPC64() ? (Subtarget.hasAltivec() ? CSR_SVR464_Altivec_RegMask
: CSR_SVR464_RegMask)
: (Subtarget.hasAltivec() ? CSR_SVR432_Altivec_RegMask
@@ -286,15 +319,26 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
bool PPCRegisterInfo::isCallerPreservedPhysReg(unsigned PhysReg,
const MachineFunction &MF) const {
assert(TargetRegisterInfo::isPhysicalRegister(PhysReg));
- if (TM.isELFv2ABI() && PhysReg == PPC::X2) {
+ const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (!TM.isPPC64())
+ return false;
+
+ if (!Subtarget.isSVR4ABI())
+ return false;
+ if (PhysReg == PPC::X2)
// X2 is guaranteed to be preserved within a function if it is reserved.
// The reason it's reserved is that it's the TOC pointer (and the function
// uses the TOC). In functions where it isn't reserved (i.e. leaf functions
// with no TOC access), we can't claim that it is preserved.
return (getReservedRegs(MF).test(PPC::X2));
- } else {
- return false;
- }
+ if (StackPtrConst && (PhysReg == PPC::X1) && !MFI.hasVarSizedObjects()
+ && !MFI.hasOpaqueSPAdjustment())
+ // The value of the stack pointer does not change within a function after
+ // the prologue and before the epilogue if there are no dynamic allocations
+ // and no inline asm which clobbers X1.
+ return true;
+ return false;
}
unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
@@ -307,6 +351,8 @@ unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
return 0;
case PPC::G8RC_NOX0RegClassID:
case PPC::GPRC_NOR0RegClassID:
+ case PPC::SPERCRegClassID:
+ case PPC::SPE4RCRegClassID:
case PPC::G8RCRegClassID:
case PPC::GPRCRegClassID: {
unsigned FP = TFI->hasFP(MF) ? 1 : 0;
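Beyond the new cold calling-convention save lists and SPE additions, the notable change in PPCRegisterInfo.cpp is that isCallerPreservedPhysReg can now report X1 (the stack pointer) as caller preserved when the frame has no variable-sized objects and no opaque SP adjustments, gated by the new ppc-stack-ptr-caller-preserved option; per the option's description, this is what allows stack-based reloads to become MachineLICM candidates. A hedged sketch of querying the hook from generic code; the include paths and helper name are assumptions of this sketch:

#include "MCTargetDesc/PPCMCTargetDesc.h" // assumed path for PPC::X1
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
using namespace llvm;

// Returns true when the target promises that the stack pointer value is
// unchanged across calls in this function (X1 reported caller preserved).
static bool stackPointerIsCallInvariant(const MachineFunction &MF) {
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  return TRI->isCallerPreservedPhysReg(PPC::X1, MF);
}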
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h
index 0bbb71fdf9fb..91a98ee4efc7 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -85,6 +85,8 @@ public:
BitVector getReservedRegs(const MachineFunction &MF) const override;
bool isCallerPreservedPhysReg(unsigned PhysReg, const MachineFunction &MF) const override;
+ bool enableMultipleCopyHints() const override { return true; }
+
/// We require the register scavenger.
bool requiresRegisterScavenging(const MachineFunction &MF) const override {
return true;
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.td b/lib/Target/PowerPC/PPCRegisterInfo.td
index f7807907bd64..0e641cf9e00a 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.td
+++ b/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -38,6 +38,13 @@ class GP8<GPR SubReg, string n> : PPCReg<n> {
let SubRegIndices = [sub_32];
}
+// SPE - One of the 32 64-bit general-purpose registers (SPE)
+class SPE<GPR SubReg, string n> : PPCReg<n> {
+ let HWEncoding = SubReg.HWEncoding;
+ let SubRegs = [SubReg];
+ let SubRegIndices = [sub_32];
+}
+
// SPR - One of the 32-bit special-purpose registers
class SPR<bits<10> num, string n> : PPCReg<n> {
let HWEncoding{9-0} = num;
@@ -100,6 +107,12 @@ foreach Index = 0-31 in {
DwarfRegNum<[Index, -2]>;
}
+// SPE registers
+foreach Index = 0-31 in {
+ def S#Index : SPE<!cast<GPR>("R"#Index), "r"#Index>,
+ DwarfRegNum<[!add(Index, 1200), !add(Index, 1200)]>;
+}
+
// Floating-point registers
foreach Index = 0-31 in {
def F#Index : FPR<Index, "f"#Index>,
@@ -208,10 +221,20 @@ def CTR8 : SPR<9, "ctr">, DwarfRegNum<[66, -2]>;
// VRsave register
def VRSAVE: SPR<256, "vrsave">, DwarfRegNum<[109]>;
+// SPE extra registers
+// SPE Accumulator for multiply-accumulate SPE operations. Never directly
+// accessed, so there's no real encoding for it.
+def SPEACC: DwarfRegNum<[99, 111]>;
+def SPEFSCR: SPR<512, "spefscr">, DwarfRegNum<[612, 112]>;
+
+def XER: SPR<1, "xer">, DwarfRegNum<[76]>;
+
// Carry bit. In the architecture this is really bit 0 of the XER register
// (which really is SPR register 1); this is the only bit interesting to a
// compiler.
-def CARRY: SPR<1, "ca">, DwarfRegNum<[76]>;
+def CARRY: SPR<1, "xer">, DwarfRegNum<[76]> {
+ let Aliases = [XER];
+}
// FP rounding mode: bits 30 and 31 of the FP status and control register
// This is not allocated as a normal register; it appears only in
@@ -272,6 +295,12 @@ def G8RC_NOX0 : RegisterClass<"PPC", [i64], 64, (add (sub G8RC, X0), ZERO8)> {
}];
}
+def SPERC : RegisterClass<"PPC", [f64], 64, (add (sequence "S%u", 2, 12),
+ (sequence "S%u", 30, 13),
+ S31, S0, S1)>;
+
+def SPE4RC : RegisterClass<"PPC", [f32], 32, (add GPRC)>;
+
// Allocate volatiles first, then non-volatiles in reverse order. With the SVR4
// ABI the size of the Floating-point register save area is determined by the
// allocated non-volatile register with the lowest register number, as FP
@@ -283,7 +312,9 @@ def F8RC : RegisterClass<"PPC", [f64], 64, (add (sequence "F%u", 0, 13),
(sequence "F%u", 31, 14))>;
def F4RC : RegisterClass<"PPC", [f32], 32, (add F8RC)>;
-def VRRC : RegisterClass<"PPC", [v16i8,v8i16,v4i32,v2i64,v1i128,v4f32,v2f64], 128,
+def VRRC : RegisterClass<"PPC",
+ [v16i8,v8i16,v4i32,v2i64,v1i128,v4f32,v2f64, f128],
+ 128,
(add V2, V3, V4, V5, V0, V1, V6, V7, V8, V9, V10, V11,
V12, V13, V14, V15, V16, V17, V18, V19, V31, V30,
V29, V28, V27, V26, V25, V24, V23, V22, V21, V20)>;
@@ -351,7 +382,7 @@ def CTRRC8 : RegisterClass<"PPC", [i64], 64, (add CTR8)> {
}
def VRSAVERC : RegisterClass<"PPC", [i32], 32, (add VRSAVE)>;
-def CARRYRC : RegisterClass<"PPC", [i32], 32, (add CARRY)> {
+def CARRYRC : RegisterClass<"PPC", [i32], 32, (add CARRY, XER)> {
let CopyCost = -1;
}
diff --git a/lib/Target/PowerPC/PPCSchedule.td b/lib/Target/PowerPC/PPCSchedule.td
index d240529bc731..5ad0a517c117 100644
--- a/lib/Target/PowerPC/PPCSchedule.td
+++ b/lib/Target/PowerPC/PPCSchedule.td
@@ -87,6 +87,8 @@ def IIC_SprMTSRIN : InstrItinClass;
def IIC_SprRFI : InstrItinClass;
def IIC_SprSC : InstrItinClass;
def IIC_FPGeneral : InstrItinClass;
+def IIC_FPDGeneral : InstrItinClass;
+def IIC_FPSGeneral : InstrItinClass;
def IIC_FPAddSub : InstrItinClass;
def IIC_FPCompare : InstrItinClass;
def IIC_FPDivD : InstrItinClass;
@@ -133,5 +135,6 @@ include "PPCScheduleP7.td"
include "PPCScheduleP8.td"
include "PPCScheduleP9.td"
include "PPCScheduleA2.td"
+include "PPCScheduleE500.td"
include "PPCScheduleE500mc.td"
include "PPCScheduleE5500.td"
diff --git a/lib/Target/PowerPC/PPCScheduleE500.td b/lib/Target/PowerPC/PPCScheduleE500.td
new file mode 100644
index 000000000000..d7c2bd15a258
--- /dev/null
+++ b/lib/Target/PowerPC/PPCScheduleE500.td
@@ -0,0 +1,274 @@
+//===-- PPCScheduleE500.td - e500 Scheduling Defs ------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Freescale e500 32-bit
+// Power processor.
+//
+// All information is derived from the "e500 Core Reference Manual",
+// Freescale Document Number E500MCRM, Rev. 1, 03/2012.
+//
+//===----------------------------------------------------------------------===//
+// Relevant functional units in the Freescale e500 core:
+//
+// * Decode & Dispatch
+// Can dispatch up to 2 instructions per clock cycle to either the GPR Issue
+// queues (GIQx) or Branch issue queue (BIQ).
+def E500_DIS0 : FuncUnit; // Dispatch stage - insn 1
+def E500_DIS1 : FuncUnit; // Dispatch stage - insn 2
+
+// * Execute
+// 6 pipelined execution units: SU0, SU1, BU, LSU, MU.
+// Some instructions can only execute in SU0 but not SU1.
+def E500_SU0 : FuncUnit; // Simple unit 0
+def E500_SU1 : FuncUnit; // Simple unit 1
+def E500_BU : FuncUnit; // Branch unit
+def E500_MU : FuncUnit; // MU pipeline
+def E500_LSU_0 : FuncUnit; // LSU pipeline
+
+def E500_GPR_Bypass : Bypass;
+def E500_CR_Bypass : Bypass;
+def E500_DivBypass : Bypass;
+
+def PPCE500Itineraries : ProcessorItineraries<
+ [E500_DIS0, E500_DIS1, E500_SU0, E500_SU1, E500_BU,
+ E500_MU, E500_LSU_0],
+ [E500_CR_Bypass, E500_GPR_Bypass, E500_DivBypass], [
+ InstrItinData<IIC_IntSimple, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0, E500_SU1]>],
+ [4, 1, 1], // Latency = 1
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntGeneral, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0, E500_SU1]>],
+ [4, 1, 1], // Latency = 1
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntISEL, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0, E500_SU1]>],
+ [4, 1, 1, 1], // Latency = 1
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass,
+ E500_CR_Bypass]>,
+ InstrItinData<IIC_IntCompare, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0, E500_SU1]>],
+ [5, 1, 1], // Latency = 1 or 2
+ [E500_CR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntDivW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_MU], 0>,
+ InstrStage<14, [E500_MU]>],
+ [17, 1, 1], // Latency=4..35, Repeat= 4..35
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntMulHW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<4, [E500_MU]>],
+ [7, 1, 1], // Latency = 4, Repeat rate = 1
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntMulHWU, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<4, [E500_MU]>],
+ [7, 1, 1], // Latency = 4, Repeat rate = 1
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntMulLI, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<4, [E500_MU]>],
+ [7, 1, 1], // Latency = 4, Repeat rate = 1
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntRotate, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0, E500_SU1]>],
+ [4, 1, 1], // Latency = 1
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntShift, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0, E500_SU1]>],
+ [4, 1, 1], // Latency = 1
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntTrapW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<2, [E500_SU0]>],
+ [5, 1], // Latency = 2, Repeat rate = 2
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_BrB, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_BU]>],
+ [4, 1], // Latency = 1
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_BrCR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_BU]>],
+ [4, 1, 1], // Latency = 1
+ [E500_CR_Bypass,
+ E500_CR_Bypass, E500_CR_Bypass]>,
+ InstrItinData<IIC_BrMCR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_BU]>],
+ [4, 1], // Latency = 1
+ [E500_CR_Bypass, E500_CR_Bypass]>,
+ InstrItinData<IIC_BrMCRX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0, E500_SU1]>],
+ [4, 1, 1], // Latency = 1
+ [E500_CR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStDCBA, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3, Repeat rate = 1
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStDCBF, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStDCBI, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLoad, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLoadUpd, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0, E500_SU1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStLoadUpdX,[InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0, E500_SU1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStStore, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0, E500_SU1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [NoBypass, E500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStICBI, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLHA, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLHAU, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0, E500_SU1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLHAUX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0, E500_SU1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLMW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [7, 1], // Latency = r+3
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLWARX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<3, [E500_LSU_0]>],
+ [6, 1, 1], // Latency = 3, Repeat rate = 3
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTWCX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSync, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>]>,
+ InstrItinData<IIC_SprMFSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<4, [E500_SU0]>],
+ [7, 1],
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTMSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<2, [E500_SU0, E500_SU1]>],
+ [5, 1], // Latency = 2, Repeat rate = 4
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0]>],
+ [5, 1],
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_SprTLBSYNC, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0], 0>]>,
+ InstrItinData<IIC_SprMFCR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<5, [E500_SU0]>],
+ [8, 1],
+ [E500_GPR_Bypass, E500_CR_Bypass]>,
+ InstrItinData<IIC_SprMFCRF, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<5, [E500_SU0]>],
+ [8, 1],
+ [E500_GPR_Bypass, E500_CR_Bypass]>,
+ InstrItinData<IIC_SprMFPMR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<4, [E500_SU0]>],
+ [7, 1], // Latency = 4, Repeat rate = 4
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_SprMFMSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<4, [E500_SU0]>],
+ [7, 1], // Latency = 4, Repeat rate = 4
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_SprMFSPR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0, E500_SU1]>],
+ [4, 1], // Latency = 1, Repeat rate = 1
+ [E500_GPR_Bypass, E500_CR_Bypass]>,
+ InstrItinData<IIC_SprMTPMR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0]>],
+ [4, 1], // Latency = 1, Repeat rate = 1
+ [E500_CR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_SprMFTB, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<4, [E500_SU0]>],
+ [7, 1], // Latency = 4, Repeat rate = 4
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTSPR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0, E500_SU1]>],
+ [4, 1], // Latency = 1, Repeat rate = 1
+ [E500_CR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTSRIN, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0]>],
+ [4, 1],
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_FPDGeneral, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<6, [E500_MU]>],
+ [9, 1, 1], // Latency = 6, Repeat rate = 1
+ [NoBypass]>,
+ InstrItinData<IIC_FPSGeneral, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<4, [E500_MU]>],
+ [7, 1, 1], // Latency = 4, Repeat rate = 1
+ [NoBypass]>,
+ InstrItinData<IIC_FPDivD, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<32, [E500_MU]>],
+ [35, 1, 1], // Latency = 32, Repeat rate = 32
+ [E500_DivBypass]>,
+ InstrItinData<IIC_FPDivS, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<29, [E500_MU]>],
+ [32, 1, 1], // Latency = 29, Repeat rate = 29
+ [E500_DivBypass]>,
+ InstrItinData<IIC_VecGeneral, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0]>],
+ [4, 1, 1], // Latency = 1, Repeat rate = 1
+ [NoBypass]>,
+ InstrItinData<IIC_VecComplex, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<4, [E500_MU]>],
+ [7, 1, 1], // Latency = 4, Repeat rate = 1
+ [NoBypass]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// e500 machine model for scheduling and other instruction cost heuristics.
+
+def PPCE500Model : SchedMachineModel {
+ let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
+ let LoadLatency = 5; // Optimistic load latency assuming bypass.
+ // This is overriden by OperandCycles if the
+ // Itineraries are queried instead.
+
+ let CompleteModel = 0;
+
+ let Itineraries = PPCE500Itineraries;
+}
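PPCScheduleE500.td follows the classic itinerary scheme: each InstrItinData names the dispatch and execution FuncUnits an instruction class occupies, an operand-cycle list whose first entry is the def's read-available cycle (consistently three larger than the latency given in the trailing comments), and the bypass used per operand. A hedged sketch of reading those operand cycles back out of whichever itinerary model the subtarget selected; the helper name is an assumption:

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/MC/MCInstrItineraries.h"
using namespace llvm;

// Sketch: query the operand cycle of an instruction's def under the
// active itinerary model (PPCE500Itineraries when this model is in
// effect). Returns -1 when no itinerary data is available.
static int defOperandCycle(const MachineFunction &MF, const MachineInstr &MI) {
  const InstrItineraryData *Itin = MF.getSubtarget().getInstrItineraryData();
  if (!Itin || Itin->isEmpty())
    return -1;
  unsigned SchedClass = MI.getDesc().getSchedClass();
  return Itin->getOperandCycle(SchedClass, /*OperandIdx=*/0);
}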
diff --git a/lib/Target/PowerPC/PPCScheduleE500mc.td b/lib/Target/PowerPC/PPCScheduleE500mc.td
index 15d5991b938c..5f95f2a79f66 100644
--- a/lib/Target/PowerPC/PPCScheduleE500mc.td
+++ b/lib/Target/PowerPC/PPCScheduleE500mc.td
@@ -19,299 +19,299 @@
// * Decode & Dispatch
// Can dispatch up to 2 instructions per clock cycle to either the GPR Issue
// queues (GIQx), FP Issue Queue (FIQ), or Branch issue queue (BIQ).
-def E500_DIS0 : FuncUnit; // Dispatch stage - insn 1
-def E500_DIS1 : FuncUnit; // Dispatch stage - insn 2
+def E500mc_DIS0 : FuncUnit; // Dispatch stage - insn 1
+def E500mc_DIS1 : FuncUnit; // Dispatch stage - insn 2
// * Execute
// 6 pipelined execution units: SFX0, SFX1, BU, FPU, LSU, CFX.
// Some instructions can only execute in SFX0 but not SFX1.
// The CFX has a bypass path, allowing non-divide instructions to execute
// while a divide instruction is executed.
-def E500_SFX0 : FuncUnit; // Simple unit 0
-def E500_SFX1 : FuncUnit; // Simple unit 1
-def E500_BU : FuncUnit; // Branch unit
-def E500_CFX_DivBypass
+def E500mc_SFX0 : FuncUnit; // Simple unit 0
+def E500mc_SFX1 : FuncUnit; // Simple unit 1
+def E500mc_BU : FuncUnit; // Branch unit
+def E500mc_CFX_DivBypass
: FuncUnit; // CFX divide bypass path
-def E500_CFX_0 : FuncUnit; // CFX pipeline
-def E500_LSU_0 : FuncUnit; // LSU pipeline
-def E500_FPU_0 : FuncUnit; // FPU pipeline
+def E500mc_CFX_0 : FuncUnit; // CFX pipeline
+def E500mc_LSU_0 : FuncUnit; // LSU pipeline
+def E500mc_FPU_0 : FuncUnit; // FPU pipeline
-def E500_GPR_Bypass : Bypass;
-def E500_FPR_Bypass : Bypass;
-def E500_CR_Bypass : Bypass;
+def E500mc_GPR_Bypass : Bypass;
+def E500mc_FPR_Bypass : Bypass;
+def E500mc_CR_Bypass : Bypass;
def PPCE500mcItineraries : ProcessorItineraries<
- [E500_DIS0, E500_DIS1, E500_SFX0, E500_SFX1, E500_BU, E500_CFX_DivBypass,
- E500_CFX_0, E500_LSU_0, E500_FPU_0],
- [E500_CR_Bypass, E500_GPR_Bypass, E500_FPR_Bypass], [
- InstrItinData<IIC_IntSimple, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [E500mc_DIS0, E500mc_DIS1, E500mc_SFX0, E500mc_SFX1, E500mc_BU, E500mc_CFX_DivBypass,
+ E500mc_CFX_0, E500mc_LSU_0, E500mc_FPU_0],
+ [E500mc_CR_Bypass, E500mc_GPR_Bypass, E500mc_FPR_Bypass], [
+ InstrItinData<IIC_IntSimple, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1]>],
[4, 1, 1], // Latency = 1
- [E500_GPR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_IntGeneral, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [E500mc_GPR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_IntGeneral, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1]>],
[4, 1, 1], // Latency = 1
- [E500_GPR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_IntISEL, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [E500mc_GPR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_IntISEL, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1]>],
[4, 1, 1, 1], // Latency = 1
- [E500_GPR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass,
- E500_CR_Bypass]>,
- InstrItinData<IIC_IntCompare, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [E500mc_GPR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass,
+ E500mc_CR_Bypass]>,
+ InstrItinData<IIC_IntCompare, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1]>],
[5, 1, 1], // Latency = 1 or 2
- [E500_CR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_IntDivW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_CFX_0], 0>,
- InstrStage<14, [E500_CFX_DivBypass]>],
+ [E500mc_CR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_IntDivW, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_CFX_0], 0>,
+ InstrStage<14, [E500mc_CFX_DivBypass]>],
[17, 1, 1], // Latency=4..35, Repeat= 4..35
- [E500_GPR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_IntMFFS, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<8, [E500_FPU_0]>],
+ [E500mc_GPR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_IntMFFS, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<8, [E500mc_FPU_0]>],
[11], // Latency = 8
- [E500_FPR_Bypass]>,
- InstrItinData<IIC_IntMTFSB0, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<8, [E500_FPU_0]>],
+ [E500mc_FPR_Bypass]>,
+ InstrItinData<IIC_IntMTFSB0, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<8, [E500mc_FPU_0]>],
[11, 1, 1], // Latency = 8
[NoBypass, NoBypass, NoBypass]>,
- InstrItinData<IIC_IntMulHW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_CFX_0]>],
+ InstrItinData<IIC_IntMulHW, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_CFX_0]>],
[7, 1, 1], // Latency = 4, Repeat rate = 1
- [E500_GPR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_IntMulHWU, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_CFX_0]>],
+ [E500mc_GPR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_IntMulHWU, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_CFX_0]>],
[7, 1, 1], // Latency = 4, Repeat rate = 1
- [E500_GPR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_IntMulLI, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_CFX_0]>],
+ [E500mc_GPR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_IntMulLI, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_CFX_0]>],
[7, 1, 1], // Latency = 4, Repeat rate = 1
- [E500_GPR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_IntRotate, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [E500mc_GPR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_IntRotate, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1]>],
[4, 1, 1], // Latency = 1
- [E500_GPR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_IntShift, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [E500mc_GPR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_IntShift, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1]>],
[4, 1, 1], // Latency = 1
- [E500_GPR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_IntTrapW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<2, [E500_SFX0]>],
+ [E500mc_GPR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_IntTrapW, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<2, [E500mc_SFX0]>],
[5, 1], // Latency = 2, Repeat rate = 2
- [E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_BrB, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_BU]>],
+ [E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_BrB, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_BU]>],
[4, 1], // Latency = 1
- [NoBypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_BrCR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_BU]>],
+ [NoBypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_BrCR, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_BU]>],
[4, 1, 1], // Latency = 1
- [E500_CR_Bypass,
- E500_CR_Bypass, E500_CR_Bypass]>,
- InstrItinData<IIC_BrMCR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_BU]>],
+ [E500mc_CR_Bypass,
+ E500mc_CR_Bypass, E500mc_CR_Bypass]>,
+ InstrItinData<IIC_BrMCR, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_BU]>],
[4, 1], // Latency = 1
- [E500_CR_Bypass, E500_CR_Bypass]>,
- InstrItinData<IIC_BrMCRX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [E500mc_CR_Bypass, E500mc_CR_Bypass]>,
+ InstrItinData<IIC_BrMCRX, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1]>],
[4, 1, 1], // Latency = 1
- [E500_CR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_LdStDCBA, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ [E500mc_CR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_LdStDCBA, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[6, 1], // Latency = 3, Repeat rate = 1
- [E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_LdStDCBF, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ [E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_LdStDCBF, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[6, 1], // Latency = 3
- [E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_LdStDCBI, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ [E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_LdStDCBI, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[6, 1], // Latency = 3
- [E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_LdStLoad, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ [E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLoad, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[6, 1], // Latency = 3
- [E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_LdStLoadUpd, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ [E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLoadUpd, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[6, 1], // Latency = 3
- [E500_GPR_Bypass, E500_GPR_Bypass],
+ [E500mc_GPR_Bypass, E500mc_GPR_Bypass],
2>, // 2 micro-ops
- InstrItinData<IIC_LdStLoadUpdX,[InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ InstrItinData<IIC_LdStLoadUpdX,[InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[6, 1], // Latency = 3
- [E500_GPR_Bypass, E500_GPR_Bypass],
+ [E500mc_GPR_Bypass, E500mc_GPR_Bypass],
2>, // 2 micro-ops
- InstrItinData<IIC_LdStStore, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ InstrItinData<IIC_LdStStore, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[6, 1], // Latency = 3
- [NoBypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ [NoBypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[6, 1], // Latency = 3
- [NoBypass, E500_GPR_Bypass],
+ [NoBypass, E500mc_GPR_Bypass],
2>, // 2 micro-ops
- InstrItinData<IIC_LdStICBI, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ InstrItinData<IIC_LdStICBI, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[6, 1], // Latency = 3
- [NoBypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_LdStSTFD, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ [NoBypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTFD, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[6, 1, 1], // Latency = 3
- [E500_GPR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_LdStSTFDU, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ [E500mc_GPR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTFDU, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[6, 1, 1], // Latency = 3
- [E500_GPR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass],
+ [E500mc_GPR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass],
2>, // 2 micro-ops
- InstrItinData<IIC_LdStLFD, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ InstrItinData<IIC_LdStLFD, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[7, 1, 1], // Latency = 4
- [E500_FPR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_LdStLFDU, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ [E500mc_FPR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLFDU, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[7, 1, 1], // Latency = 4
- [E500_FPR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass],
+ [E500mc_FPR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass],
2>, // 2 micro-ops
- InstrItinData<IIC_LdStLFDUX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ InstrItinData<IIC_LdStLFDUX, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[7, 1, 1], // Latency = 4
- [E500_FPR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass],
+ [E500mc_FPR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass],
2>, // 2 micro-ops
- InstrItinData<IIC_LdStLHA, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ InstrItinData<IIC_LdStLHA, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[6, 1], // Latency = 3
- [E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_LdStLHAU, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ [E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLHAU, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[6, 1], // Latency = 3
- [E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_LdStLHAUX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ [E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLHAUX, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[6, 1], // Latency = 3
- [E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_LdStLMW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ [E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLMW, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[7, 1], // Latency = r+3
- [NoBypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_LdStLWARX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<3, [E500_LSU_0]>],
+ [NoBypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLWARX, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<3, [E500mc_LSU_0]>],
[6, 1, 1], // Latency = 3, Repeat rate = 3
- [E500_GPR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_LdStSTWCX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ [E500mc_GPR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTWCX, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[6, 1], // Latency = 3
- [NoBypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_LdStSync, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_LSU_0]>]>,
- InstrItinData<IIC_SprMFSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<4, [E500_SFX0]>],
+ [NoBypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSync, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>]>,
+ InstrItinData<IIC_SprMFSR, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<4, [E500mc_SFX0]>],
[7, 1],
- [E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_SprMTMSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<2, [E500_SFX0, E500_SFX1]>],
+ [E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTMSR, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<2, [E500mc_SFX0, E500mc_SFX1]>],
[5, 1], // Latency = 2, Repeat rate = 4
- [E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_SprMTSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0]>],
+ [E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTSR, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0]>],
[5, 1],
- [NoBypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_SprTLBSYNC, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_LSU_0], 0>]>,
- InstrItinData<IIC_SprMFCR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<5, [E500_SFX0]>],
+ [NoBypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_SprTLBSYNC, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_LSU_0], 0>]>,
+ InstrItinData<IIC_SprMFCR, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<5, [E500mc_SFX0]>],
[8, 1],
- [E500_GPR_Bypass, E500_CR_Bypass]>,
- InstrItinData<IIC_SprMFCRF, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<5, [E500_SFX0]>],
+ [E500mc_GPR_Bypass, E500mc_CR_Bypass]>,
+ InstrItinData<IIC_SprMFCRF, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<5, [E500mc_SFX0]>],
[8, 1],
- [E500_GPR_Bypass, E500_CR_Bypass]>,
- InstrItinData<IIC_SprMFPMR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<4, [E500_SFX0]>],
+ [E500mc_GPR_Bypass, E500mc_CR_Bypass]>,
+ InstrItinData<IIC_SprMFPMR, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<4, [E500mc_SFX0]>],
[7, 1], // Latency = 4, Repeat rate = 4
- [E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_SprMFMSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<4, [E500_SFX0]>],
+ [E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_SprMFMSR, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<4, [E500mc_SFX0]>],
[7, 1], // Latency = 4, Repeat rate = 4
- [E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_SprMFSPR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_SprMFSPR, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1]>],
[4, 1], // Latency = 1, Repeat rate = 1
- [E500_GPR_Bypass, E500_CR_Bypass]>,
- InstrItinData<IIC_SprMTPMR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0]>],
+ [E500mc_GPR_Bypass, E500mc_CR_Bypass]>,
+ InstrItinData<IIC_SprMTPMR, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0]>],
[4, 1], // Latency = 1, Repeat rate = 1
- [E500_CR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_SprMFTB, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<4, [E500_SFX0]>],
+ [E500mc_CR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_SprMFTB, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<4, [E500mc_SFX0]>],
[7, 1], // Latency = 4, Repeat rate = 4
- [NoBypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_SprMTSPR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [NoBypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTSPR, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1]>],
[4, 1], // Latency = 1, Repeat rate = 1
- [E500_CR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_SprMTSRIN, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0]>],
+ [E500mc_CR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTSRIN, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0]>],
[4, 1],
- [NoBypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_FPGeneral, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<2, [E500_FPU_0]>],
+ [NoBypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_FPGeneral, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<2, [E500mc_FPU_0]>],
[11, 1, 1], // Latency = 8, Repeat rate = 2
- [E500_FPR_Bypass,
- E500_FPR_Bypass, E500_FPR_Bypass]>,
- InstrItinData<IIC_FPAddSub, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<4, [E500_FPU_0]>],
+ [E500mc_FPR_Bypass,
+ E500mc_FPR_Bypass, E500mc_FPR_Bypass]>,
+ InstrItinData<IIC_FPAddSub, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<4, [E500mc_FPU_0]>],
[13, 1, 1], // Latency = 10, Repeat rate = 4
- [E500_FPR_Bypass,
- E500_FPR_Bypass, E500_FPR_Bypass]>,
- InstrItinData<IIC_FPCompare, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<2, [E500_FPU_0]>],
+ [E500mc_FPR_Bypass,
+ E500mc_FPR_Bypass, E500mc_FPR_Bypass]>,
+ InstrItinData<IIC_FPCompare, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<2, [E500mc_FPU_0]>],
[11, 1, 1], // Latency = 8, Repeat rate = 2
- [E500_CR_Bypass,
- E500_FPR_Bypass, E500_FPR_Bypass]>,
- InstrItinData<IIC_FPDivD, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<68, [E500_FPU_0]>],
+ [E500mc_CR_Bypass,
+ E500mc_FPR_Bypass, E500mc_FPR_Bypass]>,
+ InstrItinData<IIC_FPDivD, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<68, [E500mc_FPU_0]>],
[71, 1, 1], // Latency = 68, Repeat rate = 68
- [E500_FPR_Bypass,
- E500_FPR_Bypass, E500_FPR_Bypass]>,
- InstrItinData<IIC_FPDivS, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<38, [E500_FPU_0]>],
+ [E500mc_FPR_Bypass,
+ E500mc_FPR_Bypass, E500mc_FPR_Bypass]>,
+ InstrItinData<IIC_FPDivS, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<38, [E500mc_FPU_0]>],
[41, 1, 1], // Latency = 38, Repeat rate = 38
- [E500_FPR_Bypass,
- E500_FPR_Bypass, E500_FPR_Bypass]>,
- InstrItinData<IIC_FPFused, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<4, [E500_FPU_0]>],
+ [E500mc_FPR_Bypass,
+ E500mc_FPR_Bypass, E500mc_FPR_Bypass]>,
+ InstrItinData<IIC_FPFused, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<4, [E500mc_FPU_0]>],
[13, 1, 1, 1], // Latency = 10, Repeat rate = 4
- [E500_FPR_Bypass,
- E500_FPR_Bypass, E500_FPR_Bypass,
- E500_FPR_Bypass]>,
- InstrItinData<IIC_FPRes, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<38, [E500_FPU_0]>],
+ [E500mc_FPR_Bypass,
+ E500mc_FPR_Bypass, E500mc_FPR_Bypass,
+ E500mc_FPR_Bypass]>,
+ InstrItinData<IIC_FPRes, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<38, [E500mc_FPU_0]>],
[41, 1], // Latency = 38, Repeat rate = 38
- [E500_FPR_Bypass, E500_FPR_Bypass]>
+ [E500mc_FPR_Bypass, E500mc_FPR_Bypass]>
]>;
// ===---------------------------------------------------------------------===//
diff --git a/lib/Target/PowerPC/PPCScheduleP9.td b/lib/Target/PowerPC/PPCScheduleP9.td
index b24f4fc603a1..e1a480117315 100644
--- a/lib/Target/PowerPC/PPCScheduleP9.td
+++ b/lib/Target/PowerPC/PPCScheduleP9.td
@@ -13,18 +13,31 @@
include "PPCInstrInfo.td"
def P9Model : SchedMachineModel {
+ // The maximum number of instructions to be issued at the same time.
+ // A value of 8 is technically correct since 8 instructions can be
+ // fetched from the instruction cache; however, only 6 instructions may
+ // actually be dispatched at a time.
let IssueWidth = 8;
+ // Load latency is 4 or 5 cycles depending on the load. This latency assumes
+ // that we have a cache hit. For a cache miss the load latency will be more.
+ // There are two instructions (lxvl, lxvll) that have a latency of 6 cycles.
+ // However, it is not worth bumping this value up to 6 when the vast majority
+ // of instructions are 4 or 5 cycles.
let LoadLatency = 5;
+ // A total of 16 cycles to recover from a branch mispredict.
let MispredictPenalty = 16;
// Try to make sure we have at least 10 dispatch groups in a loop.
+ // A dispatch group is 6 instructions.
let LoopMicroOpBufferSize = 60;
let CompleteModel = 1;
- let UnsupportedFeatures = [HasQPX];
+ // Do not support QPX (Quad Processing eXtension) or SPE (Signal Processing
+ // Engine) on Power 9.
+ let UnsupportedFeatures = [HasQPX, HasSPE];
}
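The dispatch-group arithmetic behind LoopMicroOpBufferSize can be spelled out; a minimal standalone C++ sketch (names are illustrative, not LLVM API) restating the 10-groups-of-6 reasoning:

// Illustrative only, not patch content: the value 60 chosen for
// LoopMicroOpBufferSize follows from targeting at least 10 dispatch groups
// per loop, with 6 instructions per POWER9 dispatch group.
constexpr unsigned DispatchGroupSize = 6;   // instructions per dispatch group
constexpr unsigned TargetLoopGroups  = 10;  // desired groups per loop body
static_assert(DispatchGroupSize * TargetLoopGroups == 60,
              "matches LoopMicroOpBufferSize in P9Model");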
@@ -36,6 +49,12 @@ let SchedModel = P9Model in {
def DISPATCHER : ProcResource<12>;
// Issue Ports
+ // An instruction can go down one of two issue queues.
+ // Address Generation (AGEN) mainly for loads and stores.
+ // Execution (EXEC) for most other instructions.
+ // Some instructions cannot be run on just any issue queue and may require an
+ // Even or an Odd queue. The EXECE represents the even queues and the EXECO
+ // represents the odd queues.
def IP_AGEN : ProcResource<4>;
def IP_EXEC : ProcResource<4>;
def IP_EXECE : ProcResource<2> {
@@ -48,6 +67,7 @@ let SchedModel = P9Model in {
}
// Pipeline Groups
+ // Four ALU (Fixed Point Arithmetic) units in total. Two even, two Odd.
def ALU : ProcResource<4>;
def ALUE : ProcResource<2> {
//Even ALU pipelines
@@ -57,7 +77,11 @@ let SchedModel = P9Model in {
//Odd ALU pipelines
let Super = ALU;
}
+
+ // Two DIV (Fixed Point Divide) units.
def DIV : ProcResource<2>;
+
+ // Four DP (Floating Point) units in total. Two even, two Odd.
def DP : ProcResource<4>;
def DPE : ProcResource<2> {
//Even DP pipelines
@@ -67,15 +91,23 @@ let SchedModel = P9Model in {
//Odd DP pipelines
let Super = DP;
}
+
+ // Four LS (Load or Store) units.
def LS : ProcResource<4>;
+
+ // Two PM (Permute) units.
def PM : ProcResource<2>;
+
+ // Only one DFU (Decimal Floating Point and Quad Precision) unit.
def DFU : ProcResource<1>;
+
+ // Only one Branch unit.
def BR : ProcResource<1> {
let BufferSize = 16;
}
- def CY : ProcResource<1>;
- def TestGroup : ProcResGroup<[ALU, DP]>;
+ // Only one CY (Crypto) unit.
+ def CY : ProcResource<1>;
// ***************** SchedWriteRes Definitions *****************
@@ -107,6 +139,11 @@ let SchedModel = P9Model in {
}
//Pipeline Groups
+
+ // ALU Units
+ // An ALU may take either 2 or 3 cycles to complete the operation.
+ // However, the ALU unit is only ever busy for 1 cycle at a time and may
+ // receive new instructions each cycle.
def P9_ALU_2C : SchedWriteRes<[ALU]> {
let Latency = 2;
}
@@ -131,26 +168,13 @@ let SchedModel = P9Model in {
let Latency = 3;
}
- def P9_ALU_4C : SchedWriteRes<[ALU]> {
- let Latency = 4;
- }
-
- def P9_ALUE_4C : SchedWriteRes<[ALUE]> {
- let Latency = 4;
- }
-
- def P9_ALUO_4C : SchedWriteRes<[ALUO]> {
- let Latency = 4;
- }
-
- def P9_ALU_5C : SchedWriteRes<[ALU]> {
+ // DIV Unit
+ // A DIV unit may take from 5 to 40 cycles to complete.
+ // Some DIV operations may keep the unit busy for up to 8 cycles.
+ def P9_DIV_5C : SchedWriteRes<[DIV]> {
let Latency = 5;
}
- def P9_ALU_6C : SchedWriteRes<[ALU]> {
- let Latency = 6;
- }
-
def P9_DIV_12C : SchedWriteRes<[DIV]> {
let Latency = 12;
}
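The distinction these unit comments keep drawing, cycles until the result is available versus cycles the unit stays occupied, corresponds to the Latency and ResourceCycles fields used below. A minimal C++ sketch of that model (illustrative only; the 40/8 pairing is read off the P9_DIV_40C_8 name used later in this patch, assuming the trailing _8 denotes occupancy):

// Illustrative model of a SchedWriteRes-style entry: Latency is when a
// dependent instruction can consume the result; ResourceCycles is how long
// the functional unit remains busy (its effective repeat rate).
struct WriteRes {
  unsigned Latency;        // result ready after this many cycles
  unsigned ResourceCycles; // unit occupied for this many cycles
};

// e.g. the long divide: result after 40 cycles, but the DIV unit can start
// another divide after 8 cycles.
constexpr WriteRes P9_DIV_40C_8{40, 8};
static_assert(P9_DIV_40C_8.Latency > P9_DIV_40C_8.ResourceCycles,
              "latency and occupancy are independent quantities");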
@@ -170,6 +194,9 @@ let SchedModel = P9Model in {
let Latency = 40;
}
+ // DP Unit
+ // A DP unit may take from 2 to 36 cycles to complete.
+ // Some DP operations keep the unit busy for up to 10 cycles.
def P9_DP_2C : SchedWriteRes<[DP]> {
let Latency = 2;
}
@@ -220,6 +247,16 @@ let SchedModel = P9Model in {
let Latency = 27;
}
+ def P9_DPE_27C_10 : SchedWriteRes<[DP]> {
+ let ResourceCycles = [10];
+ let Latency = 27;
+ }
+
+ def P9_DPO_27C_10 : SchedWriteRes<[DP]> {
+ let ResourceCycles = [10];
+ let Latency = 27;
+ }
+
def P9_DP_33C_8 : SchedWriteRes<[DP]> {
let ResourceCycles = [8];
let Latency = 33;
@@ -240,14 +277,28 @@ let SchedModel = P9Model in {
let Latency = 36;
}
- def P9_PM_3C : SchedWriteRes<[PM]> {
- let Latency = 3;
+ def P9_DPE_36C_10 : SchedWriteRes<[DP]> {
+ let ResourceCycles = [10];
+ let Latency = 36;
}
- def P9_PM_7C : SchedWriteRes<[PM]> {
+ def P9_DPO_36C_10 : SchedWriteRes<[DP]> {
+ let ResourceCycles = [10];
+ let Latency = 36;
+ }
+
+ // PM Unit
+ // Three cycle permute operations.
+ def P9_PM_3C : SchedWriteRes<[PM]> {
let Latency = 3;
}
+ // Load and Store Units
+ // Loads can have 4, 5 or 6 cycles of latency.
+ // Stores are listed as having a single cycle of latency. This is not
+ // completely accurate since it takes more than 1 cycle to actually store
+ // the value. However, since the store does not produce a result it can be
+ // considered complete after one cycle.
def P9_LS_1C : SchedWriteRes<[LS]> {
let Latency = 1;
}
@@ -260,25 +311,44 @@ let SchedModel = P9Model in {
let Latency = 5;
}
+ def P9_LS_6C : SchedWriteRes<[LS]> {
+ let Latency = 6;
+ }
+
+ // DFU Unit
+ // Some of the most expensive ops use the DFU.
+ // Can take from 12 cycles to 76 cycles to obtain a result.
+ // The unit may be busy for up to 62 cycles.
def P9_DFU_12C : SchedWriteRes<[DFU]> {
let Latency = 12;
}
+ def P9_DFU_23C : SchedWriteRes<[DFU]> {
+ let Latency = 23;
+ let ResourceCycles = [11];
+ }
+
def P9_DFU_24C : SchedWriteRes<[DFU]> {
let Latency = 24;
let ResourceCycles = [12];
}
+ def P9_DFU_37C : SchedWriteRes<[DFU]> {
+ let Latency = 37;
+ let ResourceCycles = [25];
+ }
+
def P9_DFU_58C : SchedWriteRes<[DFU]> {
let Latency = 58;
let ResourceCycles = [44];
}
- def P9_DFU_76C : SchedWriteRes<[TestGroup, DFU]> {
+ def P9_DFU_76C : SchedWriteRes<[DFU]> {
let Latency = 76;
let ResourceCycles = [62];
}
+ // 2 or 5 cycle latencies for the branch unit.
def P9_BR_2C : SchedWriteRes<[BR]> {
let Latency = 2;
}
@@ -287,138 +357,43 @@ let SchedModel = P9Model in {
let Latency = 5;
}
+ // 6 cycle latency for the crypto unit
def P9_CY_6C : SchedWriteRes<[CY]> {
let Latency = 6;
}
// ***************** WriteSeq Definitions *****************
+ // These are combinations of the resources listed above.
+ // The idea is that some cracked instructions cannot be done in parallel and
+ // so the latencies for their resources must be added.
def P9_LoadAndALUOp_6C : WriteSequence<[P9_LS_4C, P9_ALU_2C]>;
def P9_LoadAndALUOp_7C : WriteSequence<[P9_LS_5C, P9_ALU_2C]>;
+ def P9_LoadAndALU2Op_7C : WriteSequence<[P9_LS_4C, P9_ALU_3C]>;
+ def P9_LoadAndALU2Op_8C : WriteSequence<[P9_LS_5C, P9_ALU_3C]>;
def P9_LoadAndPMOp_8C : WriteSequence<[P9_LS_5C, P9_PM_3C]>;
def P9_LoadAndLoadOp_8C : WriteSequence<[P9_LS_4C, P9_LS_4C]>;
+ def P9_IntDivAndALUOp_18C_8 : WriteSequence<[P9_DIV_16C_8, P9_ALU_2C]>;
def P9_IntDivAndALUOp_26C_8 : WriteSequence<[P9_DIV_24C_8, P9_ALU_2C]>;
def P9_IntDivAndALUOp_42C_8 : WriteSequence<[P9_DIV_40C_8, P9_ALU_2C]>;
+ def P9_StoreAndALUOp_3C : WriteSequence<[P9_LS_1C, P9_ALU_2C]>;
def P9_StoreAndALUOp_4C : WriteSequence<[P9_LS_1C, P9_ALU_3C]>;
def P9_ALUOpAndALUOp_4C : WriteSequence<[P9_ALU_2C, P9_ALU_2C]>;
+ def P9_ALU2OpAndALU2Op_6C : WriteSequence<[P9_ALU_3C, P9_ALU_3C]>;
+ def P9_ALUOpAndALUOpAndALUOp_6C :
+ WriteSequence<[P9_ALU_2C, P9_ALU_2C, P9_ALU_2C]>;
+ def P9_DPOpAndALUOp_7C : WriteSequence<[P9_DP_5C, P9_ALU_2C]>;
def P9_DPOpAndALUOp_9C : WriteSequence<[P9_DP_7C, P9_ALU_2C]>;
+ def P9_DPOpAndALU2Op_10C : WriteSequence<[P9_DP_7C, P9_ALU_3C]>;
def P9_DPOpAndALUOp_24C_5 : WriteSequence<[P9_DP_22C_5, P9_ALU_2C]>;
def P9_DPOpAndALUOp_35C_8 : WriteSequence<[P9_DP_33C_8, P9_ALU_2C]>;
+ def P9_DPOpAndALU2Op_25C_5 : WriteSequence<[P9_DP_22C_5, P9_ALU_3C]>;
+ def P9_DPOpAndALU2Op_29C_5 : WriteSequence<[P9_DP_26C_5, P9_ALU_3C]>;
+ def P9_DPOpAndALU2Op_36C_8 : WriteSequence<[P9_DP_33C_8, P9_ALU_3C]>;
+ def P9_DPOpAndALU2Op_39C_10 : WriteSequence<[P9_DP_36C_10, P9_ALU_3C]>;
+ def P9_BROpAndALUOp_7C : WriteSequence<[P9_BR_5C, P9_ALU_2C]>;
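The remark that cracked-instruction latencies must be added can be checked directly against the names; a small standalone C++ sketch (illustrative, not how TableGen evaluates WriteSequence):

// Illustrative only: the latency of a WriteSequence is the sum of the
// latencies of its component writes, e.g.
//   P9_LoadAndALUOp_6C  = P9_LS_4C (4) + P9_ALU_2C (2) = 6 cycles
//   P9_LoadAndALU2Op_8C = P9_LS_5C (5) + P9_ALU_3C (3) = 8 cycles
constexpr unsigned P9_LS_4C = 4, P9_LS_5C = 5, P9_ALU_2C = 2, P9_ALU_3C = 3;
static_assert(P9_LS_4C + P9_ALU_2C == 6, "P9_LoadAndALUOp_6C");
static_assert(P9_LS_5C + P9_ALU_3C == 8, "P9_LoadAndALU2Op_8C");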
- // ***************** Defining Itinerary Class Resources *****************
-
- // The following itineraries are fully covered by the InstRW definitions in
- // P9InstrResources.td so aren't listed here.
- // IIC_FPDivD, IIC_FPDivS, IIC_FPFused, IIC_IntDivD, IIC_LdStLFDU,
- // IIC_LdStLFDUX
-
- def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C],
- [IIC_IntSimple, IIC_IntGeneral, IIC_IntRFID,
- IIC_IntRotateD, IIC_IntRotateDI, IIC_IntTrapD,
- IIC_SprRFI]>;
-
- def : ItinRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C],
- [IIC_IntTrapW]>;
-
- def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
- [IIC_IntISEL, IIC_IntRotate, IIC_IntShift]>;
-
- def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C], [IIC_IntCompare]>;
-
- def : ItinRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C,
- DISP_1C, DISP_1C], [IIC_VecGeneral, IIC_FPCompare]>;
-
- def : ItinRW<[P9_DP_5C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
- [IIC_IntMulHW, IIC_IntMulHWU, IIC_IntMulLI, IIC_IntMulHD]>;
-
- def : ItinRW<[P9_LS_5C, IP_EXEC_1C, DISP_1C, DISP_1C],
- [IIC_LdStLoad, IIC_LdStLD, IIC_LdStLFD]>;
-
- def : ItinRW<[P9_LS_4C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
- [IIC_LdStLoadUpd, IIC_LdStLDU]>;
-
- def : ItinRW<[P9_LS_4C, P9_ALU_2C, IP_EXECE_1C, IP_EXECO_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
- [IIC_LdStLoadUpdX, IIC_LdStLDUX]>;
-
- def : ItinRW<[P9_LS_1C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
- [IIC_LdStSTFDU]>;
-
- def : ItinRW<[P9_LoadAndALUOp_6C,
- IP_AGEN_1C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
- [IIC_LdStLHA, IIC_LdStLWA]>;
-
- def : ItinRW<[P9_LoadAndALUOp_6C, P9_ALU_2C,
- IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
- [IIC_LdStLHAU, IIC_LdStLHAUX]>;
-
- // IIC_LdStLMW contains two microcoded insns. This is not accurate, but
- // those insns are not used that much, if at all.
- def : ItinRW<[P9_LS_4C, IP_EXEC_1C, DISP_1C, DISP_1C],
- [IIC_LdStLWARX, IIC_LdStLDARX, IIC_LdStLMW]>;
-
- def : ItinRW<[P9_LS_4C, IP_EXEC_1C, DISP_1C, DISP_1C],
- [IIC_LdStCOPY, IIC_SprABORT, IIC_LdStPASTE, IIC_LdStDCBF,
- IIC_LdStICBI, IIC_LdStSync, IIC_SprISYNC, IIC_SprMSGSYNC,
- IIC_SprSLBIA, IIC_SprSLBSYNC, IIC_SprTLBSYNC]>;
-
- def : ItinRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
- [IIC_LdStSTFD, IIC_LdStSTD, IIC_LdStStore]>;
-
- def : ItinRW<[P9_LS_1C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
- [IIC_LdStSTDU, IIC_LdStSTDUX, IIC_LdStStoreUpd, IIC_SprSLBIEG,
- IIC_SprTLBIA, IIC_SprTLBIE]>;
-
- def : ItinRW<[P9_StoreAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
- [IIC_LdStSTDCX, IIC_LdStSTWCX]>;
-
- def : ItinRW<[P9_ALU_5C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
- [IIC_BrCR, IIC_IntMTFSB0]>;
-
- def : ItinRW<[P9_ALUOpAndALUOp_4C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
- IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
- [IIC_SprMFCR, IIC_SprMFCRF, IIC_BrMCR, IIC_BrMCRX, IIC_IntMFFS]>;
-
- def : ItinRW<[P9_BR_2C, DISP_1C], [IIC_BrB]>;
- def : ItinRW<[P9_BR_5C, DISP_1C], [IIC_SprMFSPR]>;
-
- // This class should be broken down to instruction level, once some missing
- // info is obtained.
- def : ItinRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C,
- DISP_1C, DISP_1C, DISP_1C], [IIC_SprMTSPR]>;
-
- def : ItinRW<[P9_LoadAndLoadOp_8C, IP_EXEC_1C, DISP_1C, DISP_1C],
- [IIC_SprSLBIE, IIC_SprSLBMFEE, IIC_SprSLBMFEV, IIC_SprSLBMTE,
- IIC_SprTLBIEL]>;
-
- // IIC_VecFP is added here although many instructions with that itinerary
- // use very different resources. It would appear that instructions were
- // given that itinerary rather carelessly over time. Specific instructions
- // that use different resources are listed in various InstrRW classes.
- def : ItinRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
- [IIC_FPGeneral, IIC_FPAddSub, IIC_VecFP]>;
-
- def : ItinRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C,
- DISP_1C, DISP_1C], [IIC_VecFPCompare]>;
-
- def : ItinRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C],
- [IIC_VecPerm]>;
-
- def : ItinRW<[P9_DP_36C_10, IP_EXEC_1C], [IIC_FPSqrtD]>;
- def : ItinRW<[P9_DP_26C_5, P9_DP_26C_5, IP_EXEC_1C, IP_EXEC_1C], [IIC_FPSqrtS]>;
-
- def : ItinRW<[P9_DIV_12C, IP_EXECE_1C, DISP_1C, DISP_1C],
- [IIC_SprMFMSR, IIC_SprMFPMR, IIC_SprMFSR, IIC_SprMFTB,
- IIC_SprMTMSR, IIC_SprMTMSRD, IIC_SprMTPMR, IIC_SprMTSR]>;
-
- def : ItinRW<[], [IIC_SprSTOP]>;
-
+ // Include the resource requirements of individual instructions.
include "P9InstrResources.td"
}
diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp
index ccf0f80c336b..c0cbfd779cb9 100644
--- a/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -65,6 +65,7 @@ void PPCSubtarget::initializeEnvironment() {
HasHardFloat = false;
HasAltivec = false;
HasSPE = false;
+ HasFPU = false;
HasQPX = false;
HasVSX = false;
HasP8Vector = false;
@@ -106,6 +107,7 @@ void PPCSubtarget::initializeEnvironment() {
HasFloat128 = false;
IsISA3_0 = false;
UseLongCalls = false;
+ SecurePlt = false;
HasPOPCNTD = POPCNTD_Unavailable;
}
@@ -136,6 +138,16 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
if (isDarwin())
HasLazyResolverStubs = true;
+ if (HasSPE && IsPPC64)
+ report_fatal_error( "SPE is only supported for 32-bit targets.\n", false);
+ if (HasSPE && (HasAltivec || HasQPX || HasVSX || HasFPU))
+ report_fatal_error(
+ "SPE and traditional floating point cannot both be enabled.\n", false);
+
+ // If not SPE, set standard FPU
+ if (!HasSPE)
+ HasFPU = true;
+
// QPX requires a 32-byte aligned stack. Note that we need to do this if
// we're compiling for a BG/Q system regardless of whether or not QPX
// is enabled because external functions will assume this alignment.
@@ -163,27 +175,8 @@ bool PPCSubtarget::hasLazyResolverStub(const GlobalValue *GV) const {
return false;
}
-// Embedded cores need aggressive scheduling (and some others also benefit).
-static bool needsAggressiveScheduling(unsigned Directive) {
- switch (Directive) {
- default: return false;
- case PPC::DIR_440:
- case PPC::DIR_A2:
- case PPC::DIR_E500mc:
- case PPC::DIR_E5500:
- case PPC::DIR_PWR7:
- case PPC::DIR_PWR8:
- // FIXME: Same as P8 until POWER9 scheduling info is available
- case PPC::DIR_PWR9:
- return true;
- }
-}
-
bool PPCSubtarget::enableMachineScheduler() const {
- // Enable MI scheduling for the embedded cores.
- // FIXME: Enable this for all cores (some additional modeling
- // may be necessary).
- return needsAggressiveScheduling(DarwinDirective);
+ return true;
}
// This overrides the PostRAScheduler bit in the SchedModel for each CPU.
@@ -201,19 +194,19 @@ void PPCSubtarget::getCriticalPathRCs(RegClassVector &CriticalPathRCs) const {
void PPCSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
unsigned NumRegionInstrs) const {
- if (needsAggressiveScheduling(DarwinDirective)) {
- Policy.OnlyTopDown = false;
- Policy.OnlyBottomUp = false;
- }
-
+ // The GenericScheduler that we use defaults to scheduling bottom up only.
+ // We want to schedule from both the top and the bottom and so we set
+ // OnlyBottomUp to false.
+ // We want to do bi-directional scheduling since it provides a more balanced
+ // schedule leading to better performance.
+ Policy.OnlyBottomUp = false;
// Spilling is generally expensive on all PPC cores, so always enable
// register-pressure tracking.
Policy.ShouldTrackPressure = true;
}
bool PPCSubtarget::useAA() const {
- // Use AA during code generation for the embedded cores.
- return needsAggressiveScheduling(DarwinDirective);
+ return true;
}
bool PPCSubtarget::enableSubRegLiveness() const {
diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h
index c351b5c04a05..c56f254d6bec 100644
--- a/lib/Target/PowerPC/PPCSubtarget.h
+++ b/lib/Target/PowerPC/PPCSubtarget.h
@@ -46,6 +46,7 @@ namespace PPC {
DIR_750,
DIR_970,
DIR_A2,
+ DIR_E500,
DIR_E500mc,
DIR_E5500,
DIR_PWR3,
@@ -94,6 +95,7 @@ protected:
bool HasHardFloat;
bool IsPPC64;
bool HasAltivec;
+ bool HasFPU;
bool HasSPE;
bool HasQPX;
bool HasVSX;
@@ -133,6 +135,7 @@ protected:
bool HasFloat128;
bool IsISA3_0;
bool UseLongCalls;
+ bool SecurePlt;
POPCNTDKind HasPOPCNTD;
@@ -238,6 +241,7 @@ public:
bool hasFPCVT() const { return HasFPCVT; }
bool hasAltivec() const { return HasAltivec; }
bool hasSPE() const { return HasSPE; }
+ bool hasFPU() const { return HasFPU; }
bool hasQPX() const { return HasQPX; }
bool hasVSX() const { return HasVSX; }
bool hasP8Vector() const { return HasP8Vector; }
@@ -255,6 +259,7 @@ public:
bool hasOnlyMSYNC() const { return HasOnlyMSYNC; }
bool isPPC4xx() const { return IsPPC4xx; }
bool isPPC6xx() const { return IsPPC6xx; }
+ bool isSecurePlt() const {return SecurePlt; }
bool isE500() const { return IsE500; }
bool isFeatureMFTB() const { return FeatureMFTB; }
bool isDeprecatedDST() const { return DeprecatedDST; }
diff --git a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
index 49f2699ab082..ac36abbe8439 100644
--- a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
+++ b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
@@ -77,7 +77,7 @@ protected:
continue;
}
- DEBUG(dbgs() << "TLS Dynamic Call Fixup:\n " << MI);
+ LLVM_DEBUG(dbgs() << "TLS Dynamic Call Fixup:\n " << MI);
unsigned OutReg = MI.getOperand(0).getReg();
unsigned InReg = MI.getOperand(1).getReg();
@@ -108,7 +108,7 @@ protected:
}
// We create ADJCALLSTACKUP and ADJCALLSTACKDOWN around _tls_get_addr
- // as schduling fence to avoid it is scheduled before
+ // as a scheduling fence to avoid it being scheduled before
// mflr in the prologue and the address in LR is clobbered (PR25839).
// We don't really need to save data to the stack - the clobbered
// registers are already saved when the SDNode (e.g. PPCaddiTlsgdLAddr)
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index 20a83c973026..a8d7955ef548 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -23,8 +23,8 @@
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
@@ -32,6 +32,7 @@
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Scalar.h"
#include <cassert>
@@ -303,7 +304,12 @@ namespace {
class PPCPassConfig : public TargetPassConfig {
public:
PPCPassConfig(PPCTargetMachine &TM, PassManagerBase &PM)
- : TargetPassConfig(TM, PM) {}
+ : TargetPassConfig(TM, PM) {
+ // At any optimization level above -O0 we use the Machine Scheduler and not
+ // the default Post RA List Scheduler.
+ if (TM.getOptLevel() != CodeGenOpt::None)
+ substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
+ }
PPCTargetMachine &getPPCTargetMachine() const {
return getTM<PPCTargetMachine>();
@@ -343,7 +349,7 @@ void PPCPassConfig::addIRPasses() {
// Call SeparateConstOffsetFromGEP pass to extract constants within indices
// and lower a GEP with multiple indices to either arithmetic operations or
// multiple GEPs with single index.
- addPass(createSeparateConstOffsetFromGEPPass(TM, true));
+ addPass(createSeparateConstOffsetFromGEPPass(true));
// Call EarlyCSE pass to find and remove subexpressions in the lowered
// result.
addPass(createEarlyCSEPass());
diff --git a/lib/Target/PowerPC/PPCTargetObjectFile.h b/lib/Target/PowerPC/PPCTargetObjectFile.h
index 8343a90696d9..417b8ed0d612 100644
--- a/lib/Target/PowerPC/PPCTargetObjectFile.h
+++ b/lib/Target/PowerPC/PPCTargetObjectFile.h
@@ -10,8 +10,8 @@
#ifndef LLVM_LIB_TARGET_POWERPC_PPCTARGETOBJECTFILE_H
#define LLVM_LIB_TARGET_POWERPC_PPCTARGETOBJECTFILE_H
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
@@ -25,7 +25,7 @@ namespace llvm {
MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind,
const TargetMachine &TM) const override;
- /// \brief Describe a TLS variable address within debug info.
+ /// Describe a TLS variable address within debug info.
const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override;
};
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index aa4073f7ea02..226c75f704f4 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -27,6 +27,11 @@ static cl::opt<unsigned>
CacheLineSize("ppc-loop-prefetch-cache-line", cl::Hidden, cl::init(64),
cl::desc("The loop prefetch cache line size"));
+static cl::opt<bool>
+EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
+ cl::desc("Enable using coldcc calling conv for cold "
+ "internal functions"));
+
//===----------------------------------------------------------------------===//
//
// PPC cost model.
@@ -215,6 +220,14 @@ void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
BaseT::getUnrollingPreferences(L, SE, UP);
}
+// This function returns true to allow using coldcc calling convention.
+// Returning true results in coldcc being used for functions that are cold at
+// all call sites, provided their callers do not call any other non-coldcc
+// functions.
+bool PPCTTIImpl::useColdCCForColdCall(Function &F) {
+ return EnablePPCColdCC;
+}
+
bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
// On the A2, always unroll aggressively. For QPX unaligned loads, we depend
// on combining the loads generated for consecutive accesses, and failure to
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h
index b42dae4a0254..2ee2b3eb8084 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -61,7 +61,7 @@ public:
/// \name Vector TTI Implementations
/// @{
-
+ bool useColdCCForColdCall(Function &F);
bool enableAggressiveInterleaving(bool LoopHasReductions);
const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(
bool IsZeroCmp) const;
diff --git a/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
index f15af790de8f..6586f503a7b8 100644
--- a/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
+++ b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
@@ -241,7 +241,7 @@ protected:
assert(OldFMAReg == AddendMI->getOperand(0).getReg() &&
"Addend copy not tied to old FMA output!");
- DEBUG(dbgs() << "VSX FMA Mutation:\n " << MI);
+ LLVM_DEBUG(dbgs() << "VSX FMA Mutation:\n " << MI);
MI.getOperand(0).setReg(KilledProdReg);
MI.getOperand(1).setReg(KilledProdReg);
@@ -273,7 +273,7 @@ protected:
MI.getOperand(2).setIsUndef(OtherProdRegUndef);
}
- DEBUG(dbgs() << " -> " << MI);
+ LLVM_DEBUG(dbgs() << " -> " << MI);
// The killed product operand was killed here, so we can reuse it now
// for the result of the fma.
@@ -310,7 +310,7 @@ protected:
NewFMAInt.addSegment(LiveInterval::Segment(AI->start, AI->end,
NewFMAValNo));
}
- DEBUG(dbgs() << " extended: " << NewFMAInt << '\n');
+ LLVM_DEBUG(dbgs() << " extended: " << NewFMAInt << '\n');
// Extend the live interval of the addend source (it might end at the
// copy to be removed, or somewhere in between there and here). This
@@ -323,15 +323,15 @@ protected:
LiveRange &AddendSrcRange = LIS->getRegUnit(Unit);
AddendSrcRange.extendInBlock(LIS->getMBBStartIdx(&MBB),
FMAIdx.getRegSlot());
- DEBUG(dbgs() << " extended: " << AddendSrcRange << '\n');
+ LLVM_DEBUG(dbgs() << " extended: " << AddendSrcRange << '\n');
}
FMAInt.removeValNo(FMAValNo);
- DEBUG(dbgs() << " trimmed: " << FMAInt << '\n');
+ LLVM_DEBUG(dbgs() << " trimmed: " << FMAInt << '\n');
// Remove the (now unused) copy.
- DEBUG(dbgs() << " removing: " << *AddendMI << '\n');
+ LLVM_DEBUG(dbgs() << " removing: " << *AddendMI << '\n');
LIS->RemoveMachineInstrFromMaps(*AddendMI);
AddendMI->eraseFromParent();
diff --git a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
index 8a5fb9fdaef1..1e8a1750ec3b 100644
--- a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
+++ b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
@@ -51,6 +51,7 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
@@ -248,7 +249,7 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
for (MachineBasicBlock &MBB : *MF) {
for (MachineInstr &MI : MBB) {
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
bool RelevantInstr = false;
@@ -519,14 +520,16 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
// permute control vectors (for shift values 1, 2, 3). However,
// VPERM has a more restrictive register class.
case PPC::XXSLDWI:
+ case PPC::XSCVDPSPN:
+ case PPC::XSCVSPDPN:
break;
}
}
}
if (RelevantFunction) {
- DEBUG(dbgs() << "Swap vector when first built\n\n");
- DEBUG(dumpSwapVector());
+ LLVM_DEBUG(dbgs() << "Swap vector when first built\n\n");
+ LLVM_DEBUG(dumpSwapVector());
}
return RelevantFunction;
@@ -585,14 +588,14 @@ unsigned PPCVSXSwapRemoval::lookThruCopyLike(unsigned SrcReg,
// as such so their containing webs will not be optimized.
void PPCVSXSwapRemoval::formWebs() {
- DEBUG(dbgs() << "\n*** Forming webs for swap removal ***\n\n");
+ LLVM_DEBUG(dbgs() << "\n*** Forming webs for swap removal ***\n\n");
for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
- DEBUG(dbgs() << "\n" << SwapVector[EntryIdx].VSEId << " ");
- DEBUG(MI->dump());
+ LLVM_DEBUG(dbgs() << "\n" << SwapVector[EntryIdx].VSEId << " ");
+ LLVM_DEBUG(MI->dump());
// It's sufficient to walk vector uses and join them to their unique
// definitions. In addition, check full vector register operands
@@ -622,10 +625,11 @@ void PPCVSXSwapRemoval::formWebs() {
(void)EC->unionSets(SwapVector[DefIdx].VSEId,
SwapVector[EntryIdx].VSEId);
- DEBUG(dbgs() << format("Unioning %d with %d\n", SwapVector[DefIdx].VSEId,
- SwapVector[EntryIdx].VSEId));
- DEBUG(dbgs() << " Def: ");
- DEBUG(DefMI->dump());
+ LLVM_DEBUG(dbgs() << format("Unioning %d with %d\n",
+ SwapVector[DefIdx].VSEId,
+ SwapVector[EntryIdx].VSEId));
+ LLVM_DEBUG(dbgs() << " Def: ");
+ LLVM_DEBUG(DefMI->dump());
}
}
}
@@ -636,7 +640,7 @@ void PPCVSXSwapRemoval::formWebs() {
// as rejected.
void PPCVSXSwapRemoval::recordUnoptimizableWebs() {
- DEBUG(dbgs() << "\n*** Rejecting webs for swap removal ***\n\n");
+ LLVM_DEBUG(dbgs() << "\n*** Rejecting webs for swap removal ***\n\n");
for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId);
@@ -654,12 +658,13 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() {
SwapVector[Repr].WebRejected = 1;
- DEBUG(dbgs() <<
- format("Web %d rejected for physreg, partial reg, or not "
- "swap[pable]\n", Repr));
- DEBUG(dbgs() << " in " << EntryIdx << ": ");
- DEBUG(SwapVector[EntryIdx].VSEMI->dump());
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(
+ dbgs() << format("Web %d rejected for physreg, partial reg, or not "
+ "swap[pable]\n",
+ Repr));
+ LLVM_DEBUG(dbgs() << " in " << EntryIdx << ": ");
+ LLVM_DEBUG(SwapVector[EntryIdx].VSEMI->dump());
+ LLVM_DEBUG(dbgs() << "\n");
}
// Reject webs that contain swapping loads that feed something other
@@ -680,13 +685,13 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() {
SwapVector[Repr].WebRejected = 1;
- DEBUG(dbgs() <<
- format("Web %d rejected for load not feeding swap\n", Repr));
- DEBUG(dbgs() << " def " << EntryIdx << ": ");
- DEBUG(MI->dump());
- DEBUG(dbgs() << " use " << UseIdx << ": ");
- DEBUG(UseMI.dump());
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << format(
+ "Web %d rejected for load not feeding swap\n", Repr));
+ LLVM_DEBUG(dbgs() << " def " << EntryIdx << ": ");
+ LLVM_DEBUG(MI->dump());
+ LLVM_DEBUG(dbgs() << " use " << UseIdx << ": ");
+ LLVM_DEBUG(UseMI.dump());
+ LLVM_DEBUG(dbgs() << "\n");
}
}
@@ -704,13 +709,13 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() {
SwapVector[Repr].WebRejected = 1;
- DEBUG(dbgs() <<
- format("Web %d rejected for store not fed by swap\n", Repr));
- DEBUG(dbgs() << " def " << DefIdx << ": ");
- DEBUG(DefMI->dump());
- DEBUG(dbgs() << " use " << EntryIdx << ": ");
- DEBUG(MI->dump());
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << format(
+ "Web %d rejected for store not fed by swap\n", Repr));
+ LLVM_DEBUG(dbgs() << " def " << DefIdx << ": ");
+ LLVM_DEBUG(DefMI->dump());
+ LLVM_DEBUG(dbgs() << " use " << EntryIdx << ": ");
+ LLVM_DEBUG(MI->dump());
+ LLVM_DEBUG(dbgs() << "\n");
}
// Ensure all uses of the register defined by DefMI feed store
@@ -721,21 +726,22 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() {
if (SwapVector[UseIdx].VSEMI->getOpcode() != MI->getOpcode()) {
SwapVector[Repr].WebRejected = 1;
- DEBUG(dbgs() <<
- format("Web %d rejected for swap not feeding only stores\n",
- Repr));
- DEBUG(dbgs() << " def " << " : ");
- DEBUG(DefMI->dump());
- DEBUG(dbgs() << " use " << UseIdx << ": ");
- DEBUG(SwapVector[UseIdx].VSEMI->dump());
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(
+ dbgs() << format(
+ "Web %d rejected for swap not feeding only stores\n", Repr));
+ LLVM_DEBUG(dbgs() << " def "
+ << " : ");
+ LLVM_DEBUG(DefMI->dump());
+ LLVM_DEBUG(dbgs() << " use " << UseIdx << ": ");
+ LLVM_DEBUG(SwapVector[UseIdx].VSEMI->dump());
+ LLVM_DEBUG(dbgs() << "\n");
}
}
}
}
- DEBUG(dbgs() << "Swap vector after web analysis:\n\n");
- DEBUG(dumpSwapVector());
+ LLVM_DEBUG(dbgs() << "Swap vector after web analysis:\n\n");
+ LLVM_DEBUG(dumpSwapVector());
}
// Walk the swap vector entries looking for swaps fed by permuting loads
@@ -745,7 +751,7 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() {
// such that multiple loads feed the same swap, etc.)
void PPCVSXSwapRemoval::markSwapsForRemoval() {
- DEBUG(dbgs() << "\n*** Marking swaps for removal ***\n\n");
+ LLVM_DEBUG(dbgs() << "\n*** Marking swaps for removal ***\n\n");
for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
@@ -760,8 +766,8 @@ void PPCVSXSwapRemoval::markSwapsForRemoval() {
int UseIdx = SwapMap[&UseMI];
SwapVector[UseIdx].WillRemove = 1;
- DEBUG(dbgs() << "Marking swap fed by load for removal: ");
- DEBUG(UseMI.dump());
+ LLVM_DEBUG(dbgs() << "Marking swap fed by load for removal: ");
+ LLVM_DEBUG(UseMI.dump());
}
}
@@ -775,8 +781,8 @@ void PPCVSXSwapRemoval::markSwapsForRemoval() {
int DefIdx = SwapMap[DefMI];
SwapVector[DefIdx].WillRemove = 1;
- DEBUG(dbgs() << "Marking swap feeding store for removal: ");
- DEBUG(DefMI->dump());
+ LLVM_DEBUG(dbgs() << "Marking swap feeding store for removal: ");
+ LLVM_DEBUG(DefMI->dump());
}
} else if (SwapVector[EntryIdx].IsSwappable &&
@@ -821,8 +827,8 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
unsigned NElts;
- DEBUG(dbgs() << "Changing splat: ");
- DEBUG(MI->dump());
+ LLVM_DEBUG(dbgs() << "Changing splat: ");
+ LLVM_DEBUG(MI->dump());
switch (MI->getOpcode()) {
default:
@@ -845,8 +851,8 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
else
MI->getOperand(1).setImm(EltNo);
- DEBUG(dbgs() << " Into: ");
- DEBUG(MI->dump());
+ LLVM_DEBUG(dbgs() << " Into: ");
+ LLVM_DEBUG(MI->dump());
break;
}
@@ -859,8 +865,8 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
case SHValues::SH_XXPERMDI: {
MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
- DEBUG(dbgs() << "Changing XXPERMDI: ");
- DEBUG(MI->dump());
+ LLVM_DEBUG(dbgs() << "Changing XXPERMDI: ");
+ LLVM_DEBUG(MI->dump());
unsigned Selector = MI->getOperand(3).getImm();
if (Selector == 0 || Selector == 3)
@@ -872,8 +878,14 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
MI->getOperand(1).setReg(Reg2);
MI->getOperand(2).setReg(Reg1);
- DEBUG(dbgs() << " Into: ");
- DEBUG(MI->dump());
+ // We also need to swap the kill flags associated with the registers.
+ bool IsKill1 = MI->getOperand(1).isKill();
+ bool IsKill2 = MI->getOperand(2).isKill();
+ MI->getOperand(1).setIsKill(IsKill2);
+ MI->getOperand(2).setIsKill(IsKill1);
+
+ LLVM_DEBUG(dbgs() << " Into: ");
+ LLVM_DEBUG(MI->dump());
break;
}
@@ -883,16 +895,16 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
case SHValues::SH_COPYWIDEN: {
MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
- DEBUG(dbgs() << "Changing SUBREG_TO_REG: ");
- DEBUG(MI->dump());
+ LLVM_DEBUG(dbgs() << "Changing SUBREG_TO_REG: ");
+ LLVM_DEBUG(MI->dump());
unsigned DstReg = MI->getOperand(0).getReg();
const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
unsigned NewVReg = MRI->createVirtualRegister(DstRC);
MI->getOperand(0).setReg(NewVReg);
- DEBUG(dbgs() << " Into: ");
- DEBUG(MI->dump());
+ LLVM_DEBUG(dbgs() << " Into: ");
+ LLVM_DEBUG(MI->dump());
auto InsertPoint = ++MachineBasicBlock::iterator(MI);
@@ -908,19 +920,19 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
TII->get(PPC::COPY), VSRCTmp1)
.addReg(NewVReg);
- DEBUG(std::prev(InsertPoint)->dump());
+ LLVM_DEBUG(std::prev(InsertPoint)->dump());
insertSwap(MI, InsertPoint, VSRCTmp2, VSRCTmp1);
- DEBUG(std::prev(InsertPoint)->dump());
+ LLVM_DEBUG(std::prev(InsertPoint)->dump());
BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
TII->get(PPC::COPY), DstReg)
.addReg(VSRCTmp2);
- DEBUG(std::prev(InsertPoint)->dump());
+ LLVM_DEBUG(std::prev(InsertPoint)->dump());
} else {
insertSwap(MI, InsertPoint, DstReg, NewVReg);
- DEBUG(std::prev(InsertPoint)->dump());
+ LLVM_DEBUG(std::prev(InsertPoint)->dump());
}
break;
}
@@ -931,7 +943,7 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
// a copy operation.
bool PPCVSXSwapRemoval::removeSwaps() {
- DEBUG(dbgs() << "\n*** Removing swaps ***\n\n");
+ LLVM_DEBUG(dbgs() << "\n*** Removing swaps ***\n\n");
bool Changed = false;
@@ -944,9 +956,9 @@ bool PPCVSXSwapRemoval::removeSwaps() {
MI->getOperand(0).getReg())
.add(MI->getOperand(1));
- DEBUG(dbgs() << format("Replaced %d with copy: ",
- SwapVector[EntryIdx].VSEId));
- DEBUG(MI->dump());
+ LLVM_DEBUG(dbgs() << format("Replaced %d with copy: ",
+ SwapVector[EntryIdx].VSEId));
+ LLVM_DEBUG(MI->dump());
MI->eraseFromParent();
}
diff --git a/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 3299a53ff5ba..9a455c105482 100644
--- a/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -10,11 +10,13 @@
#include "MCTargetDesc/RISCVBaseInfo.h"
#include "MCTargetDesc/RISCVMCExpr.h"
#include "MCTargetDesc/RISCVMCTargetDesc.h"
+#include "MCTargetDesc/RISCVTargetStreamer.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
@@ -22,10 +24,17 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
+#include <limits>
+
using namespace llvm;
+// Include the auto-generated portion of the compress emitter.
+#define GEN_COMPRESS_INSTR
+#include "RISCVGenCompressInstEmitter.inc"
+
namespace {
struct RISCVOperand;
@@ -33,11 +42,16 @@ class RISCVAsmParser : public MCTargetAsmParser {
SMLoc getLoc() const { return getParser().getTok().getLoc(); }
bool isRV64() const { return getSTI().hasFeature(RISCV::Feature64Bit); }
+ RISCVTargetStreamer &getTargetStreamer() {
+ MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
+ return static_cast<RISCVTargetStreamer &>(TS);
+ }
+
unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
unsigned Kind) override;
bool generateImmOutOfRangeError(OperandVector &Operands, uint64_t ErrorInfo,
- int Lower, int Upper, Twine Msg);
+ int64_t Lower, int64_t Upper, Twine Msg);
bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands, MCStreamer &Out,
@@ -51,6 +65,20 @@ class RISCVAsmParser : public MCTargetAsmParser {
bool ParseDirective(AsmToken DirectiveID) override;
+ // Helper to actually emit an instruction to the MCStreamer. Also, when
+ // possible, compression of the instruction is performed.
+ void emitToStreamer(MCStreamer &S, const MCInst &Inst);
+
+ // Helper to emit a combination of LUI, ADDI(W), and SLLI instructions that
+ // synthesize the desired immediate value into the destination register.
+ void emitLoadImm(unsigned DestReg, int64_t Value, MCStreamer &Out);
+
+ /// Helper for processing MC instructions that have been successfully matched
+ /// by MatchAndEmitInstruction. Modifications to the emitted instructions,
+ /// like the expansion of pseudo instructions (e.g., "li"), can be performed
+ /// in this method.
+ bool processInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out);
+
// Auto-generated instruction matching functions
#define GET_ASSEMBLER_HEADER
#include "RISCVGenAsmMatcher.inc"
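The emitLoadImm helper declared above builds immediates out of LUI, ADDI(W) and SLLI. A standalone sketch of the simple 32-bit split (an illustration under assumptions, not the parser's actual code): the +0x800 adjustment compensates for ADDI sign-extending its 12-bit immediate.

// Illustrative only; not the RISCVAsmParser implementation. For a 32-bit
// value, LUI sets bits 31..12 and ADDI adds a sign-extended 12-bit constant.
#include <cstdint>
#include <cstdio>

void materialize32(int32_t Value) {
  int64_t Lo12 = Value & 0xFFF;
  if (Lo12 >= 0x800)
    Lo12 -= 0x1000;                                  // ADDI sign-extends
  uint32_t Hi20 = (static_cast<uint32_t>(Value) + 0x800u) >> 12;
  if (Hi20 != 0)
    std::printf("lui  a0, 0x%x\n", static_cast<unsigned>(Hi20));
  if (Lo12 != 0 || Hi20 == 0)
    std::printf("addi a0, %s, %lld\n", Hi20 ? "a0" : "zero",
                static_cast<long long>(Lo12));
}

int main() {
  materialize32(0x12345678);   // lui a0, 0x12345 ; addi a0, a0, 0x678
  materialize32(-1);           // addi a0, zero, -1
  return 0;
}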
@@ -61,8 +89,25 @@ class RISCVAsmParser : public MCTargetAsmParser {
OperandMatchResultTy parseMemOpBaseReg(OperandVector &Operands);
OperandMatchResultTy parseOperandWithModifier(OperandVector &Operands);
- bool parseOperand(OperandVector &Operands);
+ bool parseOperand(OperandVector &Operands, bool ForceImmediate);
+
+ bool parseDirectiveOption();
+
+ void setFeatureBits(uint64_t Feature, StringRef FeatureString) {
+ if (!(getSTI().getFeatureBits()[Feature])) {
+ MCSubtargetInfo &STI = copySTI();
+ setAvailableFeatures(
+ ComputeAvailableFeatures(STI.ToggleFeature(FeatureString)));
+ }
+ }
+ void clearFeatureBits(uint64_t Feature, StringRef FeatureString) {
+ if (getSTI().getFeatureBits()[Feature]) {
+ MCSubtargetInfo &STI = copySTI();
+ setAvailableFeatures(
+ ComputeAvailableFeatures(STI.ToggleFeature(FeatureString)));
+ }
+ }
public:
enum RISCVMatchResultTy {
Match_Dummy = FIRST_TARGET_MATCH_RESULT_TY,
@@ -78,6 +123,10 @@ public:
RISCVAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
const MCInstrInfo &MII, const MCTargetOptions &Options)
: MCTargetAsmParser(Options, STI, MII) {
+ Parser.addAliasForDirective(".half", ".2byte");
+ Parser.addAliasForDirective(".hword", ".2byte");
+ Parser.addAliasForDirective(".word", ".4byte");
+ Parser.addAliasForDirective(".dword", ".8byte");
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
}
};
@@ -167,6 +216,16 @@ public:
// Predicate methods for AsmOperands defined in RISCVInstrInfo.td
+ bool isBareSymbol() const {
+ int64_t Imm;
+ RISCVMCExpr::VariantKind VK;
+ // Must be of 'immediate' type but not a constant.
+ if (!isImm() || evaluateConstantImm(Imm, VK))
+ return false;
+ return RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm) &&
+ VK == RISCVMCExpr::VK_RISCV_None;
+ }
+
/// Return true if the operand is valid for the fence instruction, e.g.
/// ('iorw').
bool isFenceArg() const {
@@ -206,6 +265,18 @@ public:
return RISCVFPRndMode::stringToRoundingMode(Str) != RISCVFPRndMode::Invalid;
}
+ bool isImmXLen() const {
+ int64_t Imm;
+ RISCVMCExpr::VariantKind VK;
+ if (!isImm())
+ return false;
+ bool IsConstantImm = evaluateConstantImm(Imm, VK);
+ // Given only Imm, ensuring that the actually specified constant is either
+ // a signed or unsigned 64-bit number is unfortunately impossible.
+ bool IsInRange = isRV64() ? true : isInt<32>(Imm) || isUInt<32>(Imm);
+ return IsConstantImm && IsInRange && VK == RISCVMCExpr::VK_RISCV_None;
+ }
+
bool isUImmLog2XLen() const {
int64_t Imm;
RISCVMCExpr::VariantKind VK;
@@ -260,12 +331,26 @@ public:
(VK == RISCVMCExpr::VK_RISCV_None || VK == RISCVMCExpr::VK_RISCV_LO);
}
- bool isUImm6NonZero() const {
+ bool isSImm6NonZero() const {
+ RISCVMCExpr::VariantKind VK;
+ int64_t Imm;
+ bool IsValid;
+ bool IsConstantImm = evaluateConstantImm(Imm, VK);
+ if (!IsConstantImm)
+ IsValid = RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm);
+ else
+ IsValid = ((Imm != 0) && isInt<6>(Imm));
+ return IsValid &&
+ (VK == RISCVMCExpr::VK_RISCV_None || VK == RISCVMCExpr::VK_RISCV_LO);
+ }
+
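+ // c.lui encodes a non-zero, sign-extended 6-bit immediate. At the assembly
+ // level the operand is written as the equivalent 20-bit LUI immediate, so
+ // the encodable values are 1..31 and 0xfffe0..0xfffff.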
+ bool isCLUIImm() const {
int64_t Imm;
RISCVMCExpr::VariantKind VK;
bool IsConstantImm = evaluateConstantImm(Imm, VK);
- return IsConstantImm && isUInt<6>(Imm) && (Imm != 0) &&
- VK == RISCVMCExpr::VK_RISCV_None;
+ return IsConstantImm && (Imm != 0) &&
+ (isUInt<5>(Imm) || (Imm >= 0xfffe0 && Imm <= 0xfffff)) &&
+ VK == RISCVMCExpr::VK_RISCV_None;
}
bool isUImm7Lsb00() const {
@@ -321,8 +406,9 @@ public:
IsValid = RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm);
else
IsValid = isInt<12>(Imm);
- return IsValid &&
- (VK == RISCVMCExpr::VK_RISCV_None || VK == RISCVMCExpr::VK_RISCV_LO);
+ return IsValid && (VK == RISCVMCExpr::VK_RISCV_None ||
+ VK == RISCVMCExpr::VK_RISCV_LO ||
+ VK == RISCVMCExpr::VK_RISCV_PCREL_LO);
}
bool isSImm12Lsb0() const { return isBareSimmNLsb0<12>(); }
@@ -338,11 +424,11 @@ public:
bool isSImm13Lsb0() const { return isBareSimmNLsb0<13>(); }
- bool isSImm10Lsb0000() const {
+ bool isSImm10Lsb0000NonZero() const {
int64_t Imm;
RISCVMCExpr::VariantKind VK;
bool IsConstantImm = evaluateConstantImm(Imm, VK);
- return IsConstantImm && isShiftedInt<6, 4>(Imm) &&
+ return IsConstantImm && (Imm != 0) && isShiftedInt<6, 4>(Imm) &&
VK == RISCVMCExpr::VK_RISCV_None;
}
@@ -564,7 +650,7 @@ unsigned RISCVAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
}
bool RISCVAsmParser::generateImmOutOfRangeError(
- OperandVector &Operands, uint64_t ErrorInfo, int Lower, int Upper,
+ OperandVector &Operands, uint64_t ErrorInfo, int64_t Lower, int64_t Upper,
Twine Msg = "immediate must be an integer in the range") {
SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
return Error(ErrorLoc, Msg + " [" + Twine(Lower) + ", " + Twine(Upper) + "]");
@@ -581,9 +667,7 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
default:
break;
case Match_Success:
- Inst.setLoc(IDLoc);
- Out.EmitInstruction(Inst, getSTI());
- return false;
+ return processInstruction(Inst, IDLoc, Out);
case Match_MissingFeature:
return Error(IDLoc, "instruction use requires an option to be enabled");
case Match_MnemonicFail:
@@ -600,6 +684,14 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
}
return Error(ErrorLoc, "invalid operand for instruction");
}
+ case Match_InvalidImmXLen:
+ if (isRV64()) {
+ SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
+ return Error(ErrorLoc, "operand must be a constant 64-bit integer");
+ }
+ return generateImmOutOfRangeError(Operands, ErrorInfo,
+ std::numeric_limits<int32_t>::min(),
+ std::numeric_limits<uint32_t>::max());
case Match_InvalidUImmLog2XLen:
if (isRV64())
return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 6) - 1);
@@ -613,8 +705,14 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
case Match_InvalidSImm6:
return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 5),
(1 << 5) - 1);
- case Match_InvalidUImm6NonZero:
- return generateImmOutOfRangeError(Operands, ErrorInfo, 1, (1 << 6) - 1);
+ case Match_InvalidSImm6NonZero:
+ return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 5),
+ (1 << 5) - 1,
+ "immediate must be non-zero in the range");
+ case Match_InvalidCLUIImm:
+ return generateImmOutOfRangeError(
+ Operands, ErrorInfo, 1, (1 << 5) - 1,
+ "immediate must be in [0xfffe0, 0xfffff] or");
case Match_InvalidUImm7Lsb00:
return generateImmOutOfRangeError(
Operands, ErrorInfo, 0, (1 << 7) - 4,
@@ -639,10 +737,10 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return generateImmOutOfRangeError(
Operands, ErrorInfo, 4, (1 << 10) - 4,
"immediate must be a multiple of 4 bytes in the range");
- case Match_InvalidSImm10Lsb0000:
+ case Match_InvalidSImm10Lsb0000NonZero:
return generateImmOutOfRangeError(
Operands, ErrorInfo, -(1 << 9), (1 << 9) - 16,
- "immediate must be a multiple of 16 bytes in the range");
+ "immediate must be a multiple of 16 bytes and non-zero in the range");
case Match_InvalidSImm12:
return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 11),
(1 << 11) - 1);
@@ -674,6 +772,10 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
ErrorLoc,
"operand must be a valid floating point rounding mode mnemonic");
}
+ case Match_InvalidBareSymbol: {
+ SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
+ return Error(ErrorLoc, "operand must be a bare symbol name");
+ }
}
llvm_unreachable("Unknown match type detected!");
@@ -838,12 +940,15 @@ RISCVAsmParser::parseMemOpBaseReg(OperandVector &Operands) {
return MatchOperand_Success;
}
-/// Looks at a token type and creates the relevant operand
-/// from this information, adding to Operands.
-/// If operand was parsed, returns false, else true.
-bool RISCVAsmParser::parseOperand(OperandVector &Operands) {
- // Attempt to parse token as register
- if (parseRegister(Operands, true) == MatchOperand_Success)
+/// Looks at a token type and creates the relevant operand from this
+/// information, adding to Operands. If the operand was parsed, returns false,
+/// else true. If ForceImmediate is true, no attempt will be made to parse the
+/// operand as a register, which is needed for pseudoinstructions such as
+/// call.
+bool RISCVAsmParser::parseOperand(OperandVector &Operands,
+ bool ForceImmediate) {
+ // Attempt to parse token as register, unless ForceImmediate.
+ if (!ForceImmediate && parseRegister(Operands, true) == MatchOperand_Success)
return false;
// Attempt to parse token as an immediate
@@ -870,7 +975,8 @@ bool RISCVAsmParser::ParseInstruction(ParseInstructionInfo &Info,
return false;
// Parse first operand
- if (parseOperand(Operands))
+ bool ForceImmediate = (Name == "call" || Name == "tail");
+ if (parseOperand(Operands, ForceImmediate))
return true;
// Parse until end of statement, consuming commas between operands
@@ -879,7 +985,7 @@ bool RISCVAsmParser::ParseInstruction(ParseInstructionInfo &Info,
getLexer().Lex();
// Parse next operand
- if (parseOperand(Operands))
+ if (parseOperand(Operands, false))
return true;
}
@@ -924,7 +1030,7 @@ bool RISCVAsmParser::classifySymbolRef(const MCExpr *Expr,
isa<MCSymbolRefExpr>(BE->getRHS()))
return true;
- // See if the addend is is a constant, otherwise there's more going
+ // See if the addend is a constant, otherwise there's more going
// on here than we can deal with.
auto AddendExpr = dyn_cast<MCConstantExpr>(BE->getRHS());
if (!AddendExpr)
@@ -938,7 +1044,165 @@ bool RISCVAsmParser::classifySymbolRef(const MCExpr *Expr,
return Kind != RISCVMCExpr::VK_RISCV_Invalid;
}
-bool RISCVAsmParser::ParseDirective(AsmToken DirectiveID) { return true; }
+bool RISCVAsmParser::ParseDirective(AsmToken DirectiveID) {
+ // This returns false if this function recognizes the directive
+ // regardless of whether it is successfully handled or reports an
+ // error. Otherwise it returns true to give the generic parser a
+ // chance at recognizing it.
+ StringRef IDVal = DirectiveID.getString();
+
+ if (IDVal == ".option")
+ return parseDirectiveOption();
+
+ return true;
+}
+
+bool RISCVAsmParser::parseDirectiveOption() {
+ MCAsmParser &Parser = getParser();
+ // Get the option token.
+ AsmToken Tok = Parser.getTok();
+ // At the moment only identifiers are supported.
+ if (Tok.isNot(AsmToken::Identifier))
+ return Error(Parser.getTok().getLoc(),
+ "unexpected token, expected identifier");
+
+ StringRef Option = Tok.getIdentifier();
+
+ if (Option == "rvc") {
+ getTargetStreamer().emitDirectiveOptionRVC();
+
+ Parser.Lex();
+ if (Parser.getTok().isNot(AsmToken::EndOfStatement))
+ return Error(Parser.getTok().getLoc(),
+ "unexpected token, expected end of statement");
+
+ setFeatureBits(RISCV::FeatureStdExtC, "c");
+ return false;
+ }
+
+ if (Option == "norvc") {
+ getTargetStreamer().emitDirectiveOptionNoRVC();
+
+ Parser.Lex();
+ if (Parser.getTok().isNot(AsmToken::EndOfStatement))
+ return Error(Parser.getTok().getLoc(),
+ "unexpected token, expected end of statement");
+
+ clearFeatureBits(RISCV::FeatureStdExtC, "c");
+ return false;
+ }
+
+ // Unknown option.
+ Warning(Parser.getTok().getLoc(),
+ "unknown option, expected 'rvc' or 'norvc'");
+ Parser.eatToEndOfStatement();
+ return false;
+}
+
+void RISCVAsmParser::emitToStreamer(MCStreamer &S, const MCInst &Inst) {
+ MCInst CInst;
+ bool Res = compressInst(CInst, Inst, getSTI(), S.getContext());
+ CInst.setLoc(Inst.getLoc());
+ S.EmitInstruction((Res ? CInst : Inst), getSTI());
+}
+
+void RISCVAsmParser::emitLoadImm(unsigned DestReg, int64_t Value,
+ MCStreamer &Out) {
+ if (isInt<32>(Value)) {
+ // Emits the MC instructions for loading a 32-bit constant into a register.
+ //
+ // Depending on the active bits in the immediate Value v, the following
+ // instruction sequences are emitted:
+ //
+ // v == 0 : ADDI(W)
+ // v[0,12) != 0 && v[12,32) == 0 : ADDI(W)
+ // v[0,12) == 0 && v[12,32) != 0 : LUI
+ // v[0,32) != 0 : LUI+ADDI(W)
+ //
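+ // For example, Value = 0x12345678 yields Hi20 = 0x12345 and Lo12 = 0x678
+ // (LUI followed by ADDI(W)), while Value = 0x1000 yields Lo12 == 0 and is
+ // emitted as a single LUI.
+ //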
+ int64_t Hi20 = ((Value + 0x800) >> 12) & 0xFFFFF;
+ int64_t Lo12 = SignExtend64<12>(Value);
+ unsigned SrcReg = RISCV::X0;
+
+ if (Hi20) {
+ emitToStreamer(Out,
+ MCInstBuilder(RISCV::LUI).addReg(DestReg).addImm(Hi20));
+ SrcReg = DestReg;
+ }
+
+ if (Lo12 || Hi20 == 0) {
+ unsigned AddiOpcode =
+ STI->hasFeature(RISCV::Feature64Bit) ? RISCV::ADDIW : RISCV::ADDI;
+ emitToStreamer(Out, MCInstBuilder(AddiOpcode)
+ .addReg(DestReg)
+ .addReg(SrcReg)
+ .addImm(Lo12));
+ }
+ return;
+ }
+ assert(STI->hasFeature(RISCV::Feature64Bit) &&
+ "Target must be 64-bit to support a >32-bit constant");
+
+ // In the worst case, for a full 64-bit constant, a sequence of 8 instructions
+ // (i.e., LUI+ADDIW+SLLI+ADDI+SLLI+ADDI+SLLI+ADDI) has to be emitted. Note
+ // that the first two instructions (LUI+ADDIW) can contribute up to 32 bits
+ // while the following ADDI instructions contribute up to 12 bits each.
+ //
+ // At first glance, implementing this seems possible by simply emitting the
+ // most significant 32 bits (LUI+ADDIW) followed by as many left shifts (SLLI)
+ // and immediate additions (ADDI) as needed. However, because ADDI performs a
+ // sign-extended addition, that approach only works if at most 11 bits of each
+ // ADDI immediate are used. Using all 12 bits of the ADDI immediates, as GAS
+ // does, requires processing the constant starting from the least significant
+ // bit.
+ //
+ // In the following, constants are processed from LSB to MSB but instruction
+ // emission is performed from MSB to LSB by recursively calling
+ // emitLoadImm. In each recursion, first the lowest 12 bits are removed
+ // from the constant and the optimal shift amount, which can be greater than
+ // 12 bits if the constant is sparse, is determined. Then, the shifted
+ // remaining constant is processed recursively and gets emitted as soon as it
+ // fits into 32 bits. The emission of the shifts and additions is subsequently
+ // performed when the recursion returns.
+ //
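+ // As an example of the sparse-constant case, Value = 0x800000000 (1 << 35)
+ // yields Lo12 = 0, Hi52 = 0x800000 and ShiftAmount = 35, so the recursion
+ // materializes 1 with a single ADDI(W) and the result is shifted left by 35
+ // with one SLLI; no trailing ADDI is needed.
+ //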
+ int64_t Lo12 = SignExtend64<12>(Value);
+ int64_t Hi52 = (Value + 0x800) >> 12;
+ int ShiftAmount = 12 + findFirstSet((uint64_t)Hi52);
+ Hi52 = SignExtend64(Hi52 >> (ShiftAmount - 12), 64 - ShiftAmount);
+
+ emitLoadImm(DestReg, Hi52, Out);
+
+ emitToStreamer(Out, MCInstBuilder(RISCV::SLLI)
+ .addReg(DestReg)
+ .addReg(DestReg)
+ .addImm(ShiftAmount));
+
+ if (Lo12)
+ emitToStreamer(Out, MCInstBuilder(RISCV::ADDI)
+ .addReg(DestReg)
+ .addReg(DestReg)
+ .addImm(Lo12));
+}
+
+bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
+ MCStreamer &Out) {
+ Inst.setLoc(IDLoc);
+
+ if (Inst.getOpcode() == RISCV::PseudoLI) {
+ auto Reg = Inst.getOperand(0).getReg();
+ int64_t Imm = Inst.getOperand(1).getImm();
+ // On RV32 the immediate here can either be a signed or an unsigned
+ // 32-bit number. Sign extension has to be performed to ensure that Imm
+ // represents the expected signed 64-bit number.
+ if (!isRV64())
+ Imm = SignExtend64<32>(Imm);
+ emitLoadImm(Reg, Imm, Out);
+ return false;
+ }
+
+ emitToStreamer(Out, Inst);
+ return false;
+}
extern "C" void LLVMInitializeRISCVAsmParser() {
RegisterMCAsmParser<RISCVAsmParser> X(getTheRISCV32Target());
diff --git a/lib/Target/RISCV/CMakeLists.txt b/lib/Target/RISCV/CMakeLists.txt
index 66b50f8728e1..f8d4e2b9517d 100644
--- a/lib/Target/RISCV/CMakeLists.txt
+++ b/lib/Target/RISCV/CMakeLists.txt
@@ -1,14 +1,15 @@
set(LLVM_TARGET_DEFINITIONS RISCV.td)
-tablegen(LLVM RISCVGenRegisterInfo.inc -gen-register-info)
-tablegen(LLVM RISCVGenInstrInfo.inc -gen-instr-info)
-tablegen(LLVM RISCVGenMCCodeEmitter.inc -gen-emitter)
-tablegen(LLVM RISCVGenMCPseudoLowering.inc -gen-pseudo-lowering)
tablegen(LLVM RISCVGenAsmMatcher.inc -gen-asm-matcher)
tablegen(LLVM RISCVGenAsmWriter.inc -gen-asm-writer)
+tablegen(LLVM RISCVGenCompressInstEmitter.inc -gen-compress-inst-emitter)
tablegen(LLVM RISCVGenDAGISel.inc -gen-dag-isel)
-tablegen(LLVM RISCVGenSubtargetInfo.inc -gen-subtarget)
tablegen(LLVM RISCVGenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM RISCVGenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM RISCVGenMCCodeEmitter.inc -gen-emitter)
+tablegen(LLVM RISCVGenMCPseudoLowering.inc -gen-pseudo-lowering)
+tablegen(LLVM RISCVGenRegisterInfo.inc -gen-register-info)
+tablegen(LLVM RISCVGenSubtargetInfo.inc -gen-subtarget)
add_public_tablegen_target(RISCVCommonTableGen)
@@ -19,9 +20,11 @@ add_llvm_target(RISCVCodeGen
RISCVISelDAGToDAG.cpp
RISCVISelLowering.cpp
RISCVMCInstLower.cpp
+ RISCVMergeBaseOffset.cpp
RISCVRegisterInfo.cpp
RISCVSubtarget.cpp
RISCVTargetMachine.cpp
+ RISCVTargetObjectFile.cpp
)
add_subdirectory(AsmParser)
diff --git a/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index 563edc9e29d8..7bbb371a757f 100644
--- a/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -232,6 +232,17 @@ static DecodeStatus decodeSImmOperandAndLsl1(MCInst &Inst, uint64_t Imm,
return MCDisassembler::Success;
}
+static DecodeStatus decodeCLUIImmOperand(MCInst &Inst, uint64_t Imm,
+ int64_t Address,
+ const void *Decoder) {
+ assert(isUInt<6>(Imm) && "Invalid immediate");
+ if (Imm > 31) {
+ Imm = (SignExtend64<6>(Imm) & 0xfffff);
+ }
+ Inst.addOperand(MCOperand::createImm(Imm));
+ return MCDisassembler::Success;
+}
+
#include "RISCVGenDisassemblerTables.inc"
DecodeStatus RISCVDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
@@ -247,14 +258,15 @@ DecodeStatus RISCVDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
// It's a 32 bit instruction if bit 0 and 1 are 1.
if ((Bytes[0] & 0x3) == 0x3) {
Insn = support::endian::read32le(Bytes.data());
- DEBUG(dbgs() << "Trying RISCV32 table :\n");
+ LLVM_DEBUG(dbgs() << "Trying RISCV32 table :\n");
Result = decodeInstruction(DecoderTable32, MI, Insn, Address, this, STI);
Size = 4;
} else {
Insn = support::endian::read16le(Bytes.data());
if (!STI.getFeatureBits()[RISCV::Feature64Bit]) {
- DEBUG(dbgs() << "Trying RISCV32Only_16 table (16-bit Instruction):\n");
+ LLVM_DEBUG(
+ dbgs() << "Trying RISCV32Only_16 table (16-bit Instruction):\n");
// Calling the auto-generated decoder function.
Result = decodeInstruction(DecoderTableRISCV32Only_16, MI, Insn, Address,
this, STI);
@@ -264,7 +276,7 @@ DecodeStatus RISCVDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
}
- DEBUG(dbgs() << "Trying RISCV_C table (16-bit Instruction):\n");
+ LLVM_DEBUG(dbgs() << "Trying RISCV_C table (16-bit Instruction):\n");
// Calling the auto-generated decoder function.
Result = decodeInstruction(DecoderTable16, MI, Insn, Address, this, STI);
Size = 2;
diff --git a/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp b/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp
index ff56fc5d90ff..300e6fd9750a 100644
--- a/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp
+++ b/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp
@@ -13,10 +13,12 @@
#include "RISCVInstPrinter.h"
#include "MCTargetDesc/RISCVBaseInfo.h"
+#include "MCTargetDesc/RISCVMCExpr.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
@@ -29,6 +31,10 @@ using namespace llvm;
#define PRINT_ALIAS_INSTR
#include "RISCVGenAsmWriter.inc"
+// Include the auto-generated portion of the compress emitter.
+#define GEN_UNCOMPRESS_INSTR
+#include "RISCVGenCompressInstEmitter.inc"
+
static cl::opt<bool>
NoAliases("riscv-no-aliases",
cl::desc("Disable the emission of assembler pseudo instructions"),
@@ -37,8 +43,15 @@ NoAliases("riscv-no-aliases",
void RISCVInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
StringRef Annot, const MCSubtargetInfo &STI) {
- if (NoAliases || !printAliasInstr(MI, O))
- printInstruction(MI, O);
+ bool Res = false;
+ const MCInst *NewMI = MI;
+ MCInst UncompressedMI;
+ if (!NoAliases)
+ Res = uncompressInst(UncompressedMI, *MI, MRI, STI);
+ if (Res)
+ NewMI = const_cast<MCInst*>(&UncompressedMI);
+ if (NoAliases || !printAliasInstr(NewMI, STI, O))
+ printInstruction(NewMI, STI, O);
printAnnotation(O, Annot);
}
@@ -47,6 +60,7 @@ void RISCVInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const {
}
void RISCVInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O, const char *Modifier) {
assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported");
const MCOperand &MO = MI->getOperand(OpNo);
@@ -66,6 +80,7 @@ void RISCVInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
}
void RISCVInstPrinter::printFenceArg(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
unsigned FenceArg = MI->getOperand(OpNo).getImm();
if ((FenceArg & RISCVFenceField::I) != 0)
@@ -79,6 +94,7 @@ void RISCVInstPrinter::printFenceArg(const MCInst *MI, unsigned OpNo,
}
void RISCVInstPrinter::printFRMArg(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
auto FRMArg =
static_cast<RISCVFPRndMode::RoundingMode>(MI->getOperand(OpNo).getImm());
diff --git a/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.h b/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.h
index 58f3f8410159..241be8daf113 100644
--- a/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.h
+++ b/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.h
@@ -30,16 +30,21 @@ public:
const MCSubtargetInfo &STI) override;
void printRegName(raw_ostream &O, unsigned RegNo) const override;
- void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O,
- const char *Modifier = nullptr);
- void printFenceArg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printFRMArg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O, const char *Modifier = nullptr);
+ void printFenceArg(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printFRMArg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
// Autogenerated by tblgen.
- void printInstruction(const MCInst *MI, raw_ostream &O);
- bool printAliasInstr(const MCInst *MI, raw_ostream &O);
+ void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
- unsigned PrintMethodIdx, raw_ostream &O);
+ unsigned PrintMethodIdx,
+ const MCSubtargetInfo &STI, raw_ostream &O);
static const char *getRegisterName(unsigned RegNo,
unsigned AltIdx = RISCV::ABIRegAltName);
};
diff --git a/lib/Target/RISCV/MCTargetDesc/CMakeLists.txt b/lib/Target/RISCV/MCTargetDesc/CMakeLists.txt
index 60429647edd1..d9f4188aa75c 100644
--- a/lib/Target/RISCV/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/RISCV/MCTargetDesc/CMakeLists.txt
@@ -5,4 +5,6 @@ add_llvm_library(LLVMRISCVDesc
RISCVMCCodeEmitter.cpp
RISCVMCExpr.cpp
RISCVMCTargetDesc.cpp
+ RISCVTargetStreamer.cpp
+ RISCVELFStreamer.cpp
)
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index b91467fe1455..9ba7ebd0eb0f 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -27,46 +27,74 @@ using namespace llvm;
namespace {
class RISCVAsmBackend : public MCAsmBackend {
+ const MCSubtargetInfo &STI;
uint8_t OSABI;
bool Is64Bit;
public:
- RISCVAsmBackend(uint8_t OSABI, bool Is64Bit)
- : MCAsmBackend(), OSABI(OSABI), Is64Bit(Is64Bit) {}
+ RISCVAsmBackend(const MCSubtargetInfo &STI, uint8_t OSABI, bool Is64Bit)
+ : MCAsmBackend(support::little), STI(STI), OSABI(OSABI),
+ Is64Bit(Is64Bit) {}
~RISCVAsmBackend() override {}
+ // Generate diff expression relocations if the relax feature is enabled,
+ // otherwise it is safe for the assembler to calculate these internally.
+ bool requiresDiffExpressionRelocations() const override {
+ return STI.getFeatureBits()[RISCV::FeatureRelax];
+ }
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsResolved) const override;
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const override;
+
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override;
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override;
+ // If linker relaxation is enabled, always emit relocations even if the fixup
+ // can be resolved. This is necessary for correctness as offsets may change
+ // during relaxation.
+ bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target) override {
+ return STI.getFeatureBits()[RISCV::FeatureRelax];
+ }
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const override {
- return false;
+ llvm_unreachable("Handled by fixupNeedsRelaxationAdvanced");
}
+ bool fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, bool Resolved,
+ uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout,
+ const bool WasForced) const override;
+
unsigned getNumFixupKinds() const override {
return RISCV::NumTargetFixupKinds;
}
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
- const static MCFixupKindInfo Infos[RISCV::NumTargetFixupKinds] = {
+ const static MCFixupKindInfo Infos[] = {
// This table *must* be in the order that the fixup_* kinds are defined in
// RISCVFixupKinds.h.
//
- // name offset bits flags
- { "fixup_riscv_hi20", 12, 20, 0 },
- { "fixup_riscv_lo12_i", 20, 12, 0 },
- { "fixup_riscv_lo12_s", 0, 32, 0 },
- { "fixup_riscv_pcrel_hi20", 12, 20, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_riscv_jal", 12, 20, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_riscv_branch", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_riscv_rvc_jump", 2, 11, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_riscv_rvc_branch", 0, 16, MCFixupKindInfo::FKF_IsPCRel }
+ // name offset bits flags
+ { "fixup_riscv_hi20", 12, 20, 0 },
+ { "fixup_riscv_lo12_i", 20, 12, 0 },
+ { "fixup_riscv_lo12_s", 0, 32, 0 },
+ { "fixup_riscv_pcrel_hi20", 12, 20, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_riscv_pcrel_lo12_i", 20, 12, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_riscv_pcrel_lo12_s", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_riscv_jal", 12, 20, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_riscv_branch", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_riscv_rvc_jump", 2, 11, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_riscv_rvc_branch", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_riscv_call", 0, 64, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_riscv_relax", 0, 0, 0 }
};
+ static_assert((array_lengthof(Infos)) == RISCV::NumTargetFixupKinds,
+ "Not all fixup kinds added to Infos array");
if (Kind < FirstTargetFixupKind)
return MCAsmBackend::getFixupKindInfo(Kind);
@@ -76,26 +104,121 @@ public:
return Infos[Kind - FirstTargetFixupKind];
}
- bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
+ bool mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const override;
+ unsigned getRelaxedOpcode(unsigned Op) const;
void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
- MCInst &Res) const override {
+ MCInst &Res) const override;
- report_fatal_error("RISCVAsmBackend::relaxInstruction() unimplemented");
- }
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
};
-bool RISCVAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
- // Once support for the compressed instruction set is added, we will be able
- // to conditionally support 16-bit NOPs
- if ((Count % 4) != 0)
+
+bool RISCVAsmBackend::fixupNeedsRelaxationAdvanced(const MCFixup &Fixup,
+ bool Resolved,
+ uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout,
+ const bool WasForced) const {
+ // Return true if the symbol is actually unresolved.
+ // Resolved may always be false when shouldForceRelocation returns true.
+ // We use !WasForced to indicate that the symbol is unresolved and not forced
+ // by shouldForceRelocation.
+ if (!Resolved && !WasForced)
+ return true;
+
+ int64_t Offset = int64_t(Value);
+ switch ((unsigned)Fixup.getKind()) {
+ default:
+ return false;
+ case RISCV::fixup_riscv_rvc_branch:
+ // For compressed branch instructions the immediate must be
+ // in the range [-256, 254].
+ return Offset > 254 || Offset < -256;
+ case RISCV::fixup_riscv_rvc_jump:
+ // For compressed jump instructions the immediate must be
+ // in the range [-2048, 2046].
+ return Offset > 2046 || Offset < -2048;
+ }
+}
+
+void RISCVAsmBackend::relaxInstruction(const MCInst &Inst,
+ const MCSubtargetInfo &STI,
+ MCInst &Res) const {
+ // TODO: replace this with a call to the auto-generated uncompressInst() function.
+ switch (Inst.getOpcode()) {
+ default:
+ llvm_unreachable("Opcode not expected!");
+ case RISCV::C_BEQZ:
+ // c.beqz $rs1, $imm -> beq $rs1, X0, $imm.
+ Res.setOpcode(RISCV::BEQ);
+ Res.addOperand(Inst.getOperand(0));
+ Res.addOperand(MCOperand::createReg(RISCV::X0));
+ Res.addOperand(Inst.getOperand(1));
+ break;
+ case RISCV::C_BNEZ:
+ // c.bnez $rs1, $imm -> bne $rs1, X0, $imm.
+ Res.setOpcode(RISCV::BNE);
+ Res.addOperand(Inst.getOperand(0));
+ Res.addOperand(MCOperand::createReg(RISCV::X0));
+ Res.addOperand(Inst.getOperand(1));
+ break;
+ case RISCV::C_J:
+ // c.j $imm -> jal X0, $imm.
+ Res.setOpcode(RISCV::JAL);
+ Res.addOperand(MCOperand::createReg(RISCV::X0));
+ Res.addOperand(Inst.getOperand(0));
+ break;
+ case RISCV::C_JAL:
+ // c.jal $imm -> jal X1, $imm.
+ Res.setOpcode(RISCV::JAL);
+ Res.addOperand(MCOperand::createReg(RISCV::X1));
+ Res.addOperand(Inst.getOperand(0));
+ break;
+ }
+}
+
+// Given the opcode of a compressed control-flow instruction, this function
+// returns the opcode of the corresponding expanded instruction.
+unsigned RISCVAsmBackend::getRelaxedOpcode(unsigned Op) const {
+ switch (Op) {
+ default:
+ return Op;
+ case RISCV::C_BEQZ:
+ return RISCV::BEQ;
+ case RISCV::C_BNEZ:
+ return RISCV::BNE;
+ case RISCV::C_J:
+ case RISCV::C_JAL: // fall through.
+ return RISCV::JAL;
+ }
+}
+
+bool RISCVAsmBackend::mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const {
+ return getRelaxedOpcode(Inst.getOpcode()) != Inst.getOpcode();
+}
+
+bool RISCVAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
+ bool HasStdExtC = STI.getFeatureBits()[RISCV::FeatureStdExtC];
+ unsigned MinNopLen = HasStdExtC ? 2 : 4;
+
+ if ((Count % MinNopLen) != 0)
return false;
- // The canonical nop on RISC-V is addi x0, x0, 0
- for (uint64_t i = 0; i < Count; i += 4)
- OW->write32(0x13);
+ // The canonical nop on RISC-V is addi x0, x0, 0.
+ uint64_t Nop32Count = Count / 4;
+ for (uint64_t i = Nop32Count; i != 0; --i)
+ OS.write("\x13\0\0\0", 4);
+
+ // The canonical nop on RVC is c.nop.
+ if (HasStdExtC) {
+ uint64_t Nop16Count = (Count - Nop32Count * 4) / 2;
+ for (uint64_t i = Nop16Count; i != 0; --i)
+ OS.write("\x01\0", 2);
+ }
return true;
}
@@ -112,8 +235,10 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
case FK_Data_8:
return Value;
case RISCV::fixup_riscv_lo12_i:
+ case RISCV::fixup_riscv_pcrel_lo12_i:
return Value & 0xfff;
case RISCV::fixup_riscv_lo12_s:
+ case RISCV::fixup_riscv_pcrel_lo12_s:
return (((Value >> 5) & 0x7f) << 25) | ((Value & 0x1f) << 7);
case RISCV::fixup_riscv_hi20:
case RISCV::fixup_riscv_pcrel_hi20:
@@ -154,6 +279,14 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
Value = (Sbit << 31) | (Mid6 << 25) | (Lo4 << 8) | (Hi1 << 7);
return Value;
}
+ case RISCV::fixup_riscv_call: {
+ // JALR will add the sign-extended 12-bit LowerImm to UpperImm, so we need
+ // to add 0x800ULL before extracting the upper bits to account for the
+ // effect of the sign extension.
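+ // For example, Value = 0xFFF gives UpperImm = 0x1000 and LowerImm = 0xFFF;
+ // AUIPC adds 0x1000 and JALR adds sign_extend(0xFFF) = -1, for a net
+ // offset of 0xFFF.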
+ uint64_t UpperImm = (Value + 0x800ULL) & 0xfffff000ULL;
+ uint64_t LowerImm = Value & 0xfffULL;
+ return UpperImm | ((LowerImm << 20) << 32);
+ }
case RISCV::fixup_riscv_rvc_jump: {
// Need to produce offset[11|4|9:8|10|6|7|3:1|5] from the 11-bit Value.
unsigned Bit11 = (Value >> 11) & 0x1;
@@ -183,20 +316,11 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
}
}
-static unsigned getSize(unsigned Kind) {
- switch (Kind) {
- default:
- return 4;
- case RISCV::fixup_riscv_rvc_jump:
- case RISCV::fixup_riscv_rvc_branch:
- return 2;
- }
-}
-
void RISCVAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target,
MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) const {
+ bool IsResolved,
+ const MCSubtargetInfo *STI) const {
MCContext &Ctx = Asm.getContext();
MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
if (!Value)
@@ -208,31 +332,29 @@ void RISCVAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
Value <<= Info.TargetOffset;
unsigned Offset = Fixup.getOffset();
- unsigned FullSize = getSize(Fixup.getKind());
+ unsigned NumBytes = alignTo(Info.TargetSize + Info.TargetOffset, 8) / 8;
-#ifndef NDEBUG
- unsigned NumBytes = (Info.TargetSize + 7) / 8;
assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
-#endif
// For each byte of the fragment that the fixup touches, mask in the
// bits from the fixup value.
- for (unsigned i = 0; i != FullSize; ++i) {
+ for (unsigned i = 0; i != NumBytes; ++i) {
Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
}
}
-std::unique_ptr<MCObjectWriter>
-RISCVAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
- return createRISCVELFObjectWriter(OS, OSABI, Is64Bit);
+std::unique_ptr<MCObjectTargetWriter>
+RISCVAsmBackend::createObjectTargetWriter() const {
+ return createRISCVELFObjectWriter(OSABI, Is64Bit);
}
} // end anonymous namespace
MCAsmBackend *llvm::createRISCVAsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
const MCTargetOptions &Options) {
+ const Triple &TT = STI.getTargetTriple();
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS());
- return new RISCVAsmBackend(OSABI, TT.isArch64Bit());
+ return new RISCVAsmBackend(STI, OSABI, TT.isArch64Bit());
}
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
index ad53228c104a..9b88614aa693 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
@@ -23,6 +23,15 @@ public:
~RISCVELFObjectWriter() override;
+ // Return true if the given relocation must be emitted against a symbol
+ // rather than as a section plus offset.
+ bool needsRelocateWithSymbol(const MCSymbol &Sym,
+ unsigned Type) const override {
+ // TODO: this is very conservative, update once RISC-V psABI requirements
+ // are clarified.
+ return true;
+ }
+
protected:
unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
const MCFixup &Fixup, bool IsPCRel) const override;
@@ -47,6 +56,22 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx,
return ELF::R_RISCV_32;
case FK_Data_8:
return ELF::R_RISCV_64;
+ case FK_Data_Add_1:
+ return ELF::R_RISCV_ADD8;
+ case FK_Data_Add_2:
+ return ELF::R_RISCV_ADD16;
+ case FK_Data_Add_4:
+ return ELF::R_RISCV_ADD32;
+ case FK_Data_Add_8:
+ return ELF::R_RISCV_ADD64;
+ case FK_Data_Sub_1:
+ return ELF::R_RISCV_SUB8;
+ case FK_Data_Sub_2:
+ return ELF::R_RISCV_SUB16;
+ case FK_Data_Sub_4:
+ return ELF::R_RISCV_SUB32;
+ case FK_Data_Sub_8:
+ return ELF::R_RISCV_SUB64;
case RISCV::fixup_riscv_hi20:
return ELF::R_RISCV_HI20;
case RISCV::fixup_riscv_lo12_i:
@@ -55,6 +80,10 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx,
return ELF::R_RISCV_LO12_S;
case RISCV::fixup_riscv_pcrel_hi20:
return ELF::R_RISCV_PCREL_HI20;
+ case RISCV::fixup_riscv_pcrel_lo12_i:
+ return ELF::R_RISCV_PCREL_LO12_I;
+ case RISCV::fixup_riscv_pcrel_lo12_s:
+ return ELF::R_RISCV_PCREL_LO12_S;
case RISCV::fixup_riscv_jal:
return ELF::R_RISCV_JAL;
case RISCV::fixup_riscv_branch:
@@ -63,13 +92,14 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx,
return ELF::R_RISCV_RVC_JUMP;
case RISCV::fixup_riscv_rvc_branch:
return ELF::R_RISCV_RVC_BRANCH;
+ case RISCV::fixup_riscv_call:
+ return ELF::R_RISCV_CALL;
+ case RISCV::fixup_riscv_relax:
+ return ELF::R_RISCV_RELAX;
}
}
-std::unique_ptr<MCObjectWriter>
-llvm::createRISCVELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI,
- bool Is64Bit) {
- return createELFObjectWriter(
- llvm::make_unique<RISCVELFObjectWriter>(OSABI, Is64Bit), OS,
- /*IsLittleEndian=*/true);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createRISCVELFObjectWriter(uint8_t OSABI, bool Is64Bit) {
+ return llvm::make_unique<RISCVELFObjectWriter>(OSABI, Is64Bit);
}
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
new file mode 100644
index 000000000000..6428b11cfe9c
--- /dev/null
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
@@ -0,0 +1,42 @@
+//===-- RISCVELFStreamer.cpp - RISCV ELF Target Streamer Methods ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides RISCV specific target streamer methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCVELFStreamer.h"
+#include "RISCVMCTargetDesc.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
+using namespace llvm;
+
+// This part is for ELF object output.
+RISCVTargetELFStreamer::RISCVTargetELFStreamer(MCStreamer &S,
+ const MCSubtargetInfo &STI)
+ : RISCVTargetStreamer(S) {
+ MCAssembler &MCA = getStreamer().getAssembler();
+
+ const FeatureBitset &Features = STI.getFeatureBits();
+
+ unsigned EFlags = MCA.getELFHeaderEFlags();
+
+ if (Features[RISCV::FeatureStdExtC])
+ EFlags |= ELF::EF_RISCV_RVC;
+
+ MCA.setELFHeaderEFlags(EFlags);
+}
+
+MCELFStreamer &RISCVTargetELFStreamer::getStreamer() {
+ return static_cast<MCELFStreamer &>(Streamer);
+}
+
+void RISCVTargetELFStreamer::emitDirectiveOptionRVC() {}
+void RISCVTargetELFStreamer::emitDirectiveOptionNoRVC() {}
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h b/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
new file mode 100644
index 000000000000..daa7abfe1336
--- /dev/null
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
@@ -0,0 +1,27 @@
+//===-- RISCVELFStreamer.h - RISCV ELF Target Streamer ---------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_RISCV_RISCVELFSTREAMER_H
+#define LLVM_LIB_TARGET_RISCV_RISCVELFSTREAMER_H
+
+#include "RISCVTargetStreamer.h"
+#include "llvm/MC/MCELFStreamer.h"
+
+namespace llvm {
+
+class RISCVTargetELFStreamer : public RISCVTargetStreamer {
+public:
+ MCELFStreamer &getStreamer();
+ RISCVTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI);
+
+ virtual void emitDirectiveOptionRVC();
+ virtual void emitDirectiveOptionNoRVC();
+};
+}
+#endif
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h b/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
index cfb5d99e79f5..6a1224be774e 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
@@ -29,6 +29,12 @@ enum Fixups {
// fixup_riscv_pcrel_hi20 - 20-bit fixup corresponding to pcrel_hi(foo) for
// instructions like auipc
fixup_riscv_pcrel_hi20,
+ // fixup_riscv_pcrel_lo12_i - 12-bit fixup corresponding to pcrel_lo(foo) for
+ // instructions like addi
+ fixup_riscv_pcrel_lo12_i,
+ // fixup_riscv_pcrel_lo12_s - 12-bit fixup corresponding to pcrel_lo(foo) for
+ // the S-type store instructions
+ fixup_riscv_pcrel_lo12_s,
// fixup_riscv_jal - 20-bit fixup for symbol references in the jal
// instruction
fixup_riscv_jal,
@@ -41,6 +47,12 @@ enum Fixups {
// fixup_riscv_rvc_branch - 8-bit fixup for symbol references in the
// compressed branch instruction
fixup_riscv_rvc_branch,
+ // fixup_riscv_call - A fixup representing a call attached to the auipc
+ // instruction in a pair composed of adjacent auipc+jalr instructions.
+ fixup_riscv_call,
+ // fixup_riscv_relax - Used to generate an R_RISCV_RELAX relocation type,
+ // which indicates the linker may relax the instruction pair.
+ fixup_riscv_relax,
// fixup_riscv_invalid - used as a sentinel and a marker, must be last fixup
fixup_riscv_invalid,
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
index d622911e92c4..780dae410cd0 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
@@ -22,4 +22,6 @@ RISCVMCAsmInfo::RISCVMCAsmInfo(const Triple &TT) {
CommentString = "#";
AlignmentIsInBytes = false;
SupportsDebugInformation = true;
+ Data16bitsDirective = "\t.half\t";
+ Data32bitsDirective = "\t.word\t";
}
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index 641997e67e06..8a796a014b33 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -21,6 +21,7 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSymbol.h"
@@ -52,6 +53,10 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const override;
+ void expandFunctionCall(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
/// TableGen'erated function for getting the binary encoding for an
/// instruction.
uint64_t getBinaryCodeForInstr(const MCInst &MI,
@@ -80,6 +85,46 @@ MCCodeEmitter *llvm::createRISCVMCCodeEmitter(const MCInstrInfo &MCII,
return new RISCVMCCodeEmitter(Ctx, MCII);
}
+// Expand PseudoCALL and PseudoTAIL to AUIPC and JALR with relocation types.
+// We expand PseudoCALL and PseudoTAIL while encoding, meaning AUIPC and JALR
+// won't go through the RISCV MC to MC compressed instruction transformation.
+// This is acceptable because AUIPC has no 16-bit form and C_JALR has no
+// immediate operand field. We let linker relaxation deal with it: when linker
+// relaxation is enabled, AUIPC and JALR may be relaxed to JAL, and if the C
+// extension is enabled, JAL may in turn be relaxed to C_JAL.
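+// For illustration, the resulting sequences are:
+//   call foo -> auipc x1, foo ; jalr x1, x1, 0
+//   tail foo -> auipc x6, foo ; jalr x0, x6, 0
+// with a fixup_riscv_call (R_RISCV_CALL) attached to the AUIPC.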
+void RISCVMCCodeEmitter::expandFunctionCall(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ MCInst TmpInst;
+ MCOperand Func = MI.getOperand(0);
+ unsigned Ra = (MI.getOpcode() == RISCV::PseudoTAIL) ? RISCV::X6 : RISCV::X1;
+ uint32_t Binary;
+
+ assert(Func.isExpr() && "Expected expression");
+
+ const MCExpr *Expr = Func.getExpr();
+
+ // Create function call expression CallExpr for AUIPC.
+ const MCExpr *CallExpr =
+ RISCVMCExpr::create(Expr, RISCVMCExpr::VK_RISCV_CALL, Ctx);
+
+ // Emit AUIPC Ra, Func with R_RISCV_CALL relocation type.
+ TmpInst = MCInstBuilder(RISCV::AUIPC)
+ .addReg(Ra)
+ .addOperand(MCOperand::createExpr(CallExpr));
+ Binary = getBinaryCodeForInstr(TmpInst, Fixups, STI);
+ support::endian::write(OS, Binary, support::little);
+
+ if (MI.getOpcode() == RISCV::PseudoTAIL)
+ // Emit JALR X0, X6, 0
+ TmpInst = MCInstBuilder(RISCV::JALR).addReg(RISCV::X0).addReg(Ra).addImm(0);
+ else
+ // Emit JALR X1, X1, 0
+ TmpInst = MCInstBuilder(RISCV::JALR).addReg(Ra).addReg(Ra).addImm(0);
+ Binary = getBinaryCodeForInstr(TmpInst, Fixups, STI);
+ support::endian::write(OS, Binary, support::little);
+}
+
void RISCVMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
@@ -87,17 +132,24 @@ void RISCVMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
// Get byte count of instruction.
unsigned Size = Desc.getSize();
+ if (MI.getOpcode() == RISCV::PseudoCALL ||
+ MI.getOpcode() == RISCV::PseudoTAIL) {
+ expandFunctionCall(MI, OS, Fixups, STI);
+ MCNumEmitted += 2;
+ return;
+ }
+
switch (Size) {
default:
llvm_unreachable("Unhandled encodeInstruction length!");
case 2: {
uint16_t Bits = getBinaryCodeForInstr(MI, Fixups, STI);
- support::endian::Writer<support::little>(OS).write<uint16_t>(Bits);
+ support::endian::write<uint16_t>(OS, Bits, support::little);
break;
}
case 4: {
uint32_t Bits = getBinaryCodeForInstr(MI, Fixups, STI);
- support::endian::Writer<support::little>(OS).write(Bits);
+ support::endian::write(OS, Bits, support::little);
break;
}
}
@@ -138,7 +190,7 @@ RISCVMCCodeEmitter::getImmOpValueAsr1(const MCInst &MI, unsigned OpNo,
unsigned RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
+ bool EnableRelax = STI.getFeatureBits()[RISCV::FeatureRelax];
const MCOperand &MO = MI.getOperand(OpNo);
MCInstrDesc const &Desc = MCII.get(MI.getOpcode());
@@ -161,15 +213,31 @@ unsigned RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
case RISCVMCExpr::VK_RISCV_Invalid:
llvm_unreachable("Unhandled fixup kind!");
case RISCVMCExpr::VK_RISCV_LO:
- FixupKind = MIFrm == RISCVII::InstFormatI ? RISCV::fixup_riscv_lo12_i
- : RISCV::fixup_riscv_lo12_s;
+ if (MIFrm == RISCVII::InstFormatI)
+ FixupKind = RISCV::fixup_riscv_lo12_i;
+ else if (MIFrm == RISCVII::InstFormatS)
+ FixupKind = RISCV::fixup_riscv_lo12_s;
+ else
+ llvm_unreachable("VK_RISCV_LO used with unexpected instruction format");
break;
case RISCVMCExpr::VK_RISCV_HI:
FixupKind = RISCV::fixup_riscv_hi20;
break;
+ case RISCVMCExpr::VK_RISCV_PCREL_LO:
+ if (MIFrm == RISCVII::InstFormatI)
+ FixupKind = RISCV::fixup_riscv_pcrel_lo12_i;
+ else if (MIFrm == RISCVII::InstFormatS)
+ FixupKind = RISCV::fixup_riscv_pcrel_lo12_s;
+ else
+ llvm_unreachable(
+ "VK_RISCV_PCREL_LO used with unexpected instruction format");
+ break;
case RISCVMCExpr::VK_RISCV_PCREL_HI:
FixupKind = RISCV::fixup_riscv_pcrel_hi20;
break;
+ case RISCVMCExpr::VK_RISCV_CALL:
+ FixupKind = RISCV::fixup_riscv_call;
+ break;
}
} else if (Kind == MCExpr::SymbolRef &&
cast<MCSymbolRefExpr>(Expr)->getKind() == MCSymbolRefExpr::VK_None) {
@@ -190,6 +258,15 @@ unsigned RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
MCFixup::create(0, Expr, MCFixupKind(FixupKind), MI.getLoc()));
++MCNumFixups;
+ if (EnableRelax) {
+ if (FixupKind == RISCV::fixup_riscv_call) {
+ Fixups.push_back(
+ MCFixup::create(0, Expr, MCFixupKind(RISCV::fixup_riscv_relax),
+ MI.getLoc()));
+ ++MCNumFixups;
+ }
+ }
+
return 0;
}
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
index b36236ea155f..085dcd4e5f66 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
@@ -12,6 +12,7 @@
//
//===----------------------------------------------------------------------===//
+#include "RISCV.h"
#include "RISCVMCExpr.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
@@ -31,7 +32,8 @@ const RISCVMCExpr *RISCVMCExpr::create(const MCExpr *Expr, VariantKind Kind,
}
void RISCVMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
- bool HasVariant = getKind() != VK_RISCV_None;
+ bool HasVariant =
+ ((getKind() != VK_RISCV_None) && (getKind() != VK_RISCV_CALL));
if (HasVariant)
OS << '%' << getVariantKindName(getKind()) << '(';
Expr->print(OS, MAI);
@@ -42,7 +44,23 @@ void RISCVMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
bool RISCVMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
const MCAsmLayout *Layout,
const MCFixup *Fixup) const {
- return getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup);
+ if (!getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup))
+ return false;
+
+ // Some custom fixup types are not valid with symbol difference expressions
+ if (Res.getSymA() && Res.getSymB()) {
+ switch (getKind()) {
+ default:
+ return true;
+ case VK_RISCV_LO:
+ case VK_RISCV_HI:
+ case VK_RISCV_PCREL_LO:
+ case VK_RISCV_PCREL_HI:
+ return false;
+ }
+ }
+
+ return true;
}
void RISCVMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
@@ -53,6 +71,7 @@ RISCVMCExpr::VariantKind RISCVMCExpr::getVariantKindForName(StringRef name) {
return StringSwitch<RISCVMCExpr::VariantKind>(name)
.Case("lo", VK_RISCV_LO)
.Case("hi", VK_RISCV_HI)
+ .Case("pcrel_lo", VK_RISCV_PCREL_LO)
.Case("pcrel_hi", VK_RISCV_PCREL_HI)
.Default(VK_RISCV_Invalid);
}
@@ -65,6 +84,8 @@ StringRef RISCVMCExpr::getVariantKindName(VariantKind Kind) {
return "lo";
case VK_RISCV_HI:
return "hi";
+ case VK_RISCV_PCREL_LO:
+ return "pcrel_lo";
case VK_RISCV_PCREL_HI:
return "pcrel_hi";
}
@@ -73,7 +94,8 @@ StringRef RISCVMCExpr::getVariantKindName(VariantKind Kind) {
bool RISCVMCExpr::evaluateAsConstant(int64_t &Res) const {
MCValue Value;
- if (Kind == VK_RISCV_PCREL_HI)
+ if (Kind == VK_RISCV_PCREL_HI || Kind == VK_RISCV_PCREL_LO ||
+ Kind == VK_RISCV_CALL)
return false;
if (!getSubExpr()->evaluateAsRelocatable(Value, nullptr, nullptr))
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h b/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
index 69b55ca6f7cd..d2e0f6b6cdae 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
@@ -20,13 +20,16 @@
namespace llvm {
class StringRef;
+class MCOperand;
class RISCVMCExpr : public MCTargetExpr {
public:
enum VariantKind {
VK_RISCV_None,
VK_RISCV_LO,
VK_RISCV_HI,
+ VK_RISCV_PCREL_LO,
VK_RISCV_PCREL_HI,
+ VK_RISCV_CALL,
VK_RISCV_Invalid
};
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
index 45de976ec6c2..133f3cd3d39a 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
@@ -13,7 +13,9 @@
#include "RISCVMCTargetDesc.h"
#include "InstPrinter/RISCVInstPrinter.h"
+#include "RISCVELFStreamer.h"
#include "RISCVMCAsmInfo.h"
+#include "RISCVTargetStreamer.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInstrInfo.h"
@@ -67,6 +69,21 @@ static MCInstPrinter *createRISCVMCInstPrinter(const Triple &T,
return new RISCVInstPrinter(MAI, MII, MRI);
}
+static MCTargetStreamer *
+createRISCVObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
+ const Triple &TT = STI.getTargetTriple();
+ if (TT.isOSBinFormatELF())
+ return new RISCVTargetELFStreamer(S, STI);
+ return nullptr;
+}
+
+static MCTargetStreamer *createRISCVAsmTargetStreamer(MCStreamer &S,
+ formatted_raw_ostream &OS,
+ MCInstPrinter *InstPrint,
+ bool isVerboseAsm) {
+ return new RISCVTargetAsmStreamer(S, OS);
+}
+
extern "C" void LLVMInitializeRISCVTargetMC() {
for (Target *T : {&getTheRISCV32Target(), &getTheRISCV64Target()}) {
TargetRegistry::RegisterMCAsmInfo(*T, createRISCVMCAsmInfo);
@@ -76,5 +93,10 @@ extern "C" void LLVMInitializeRISCVTargetMC() {
TargetRegistry::RegisterMCCodeEmitter(*T, createRISCVMCCodeEmitter);
TargetRegistry::RegisterMCInstPrinter(*T, createRISCVMCInstPrinter);
TargetRegistry::RegisterMCSubtargetInfo(*T, createRISCVMCSubtargetInfo);
+ TargetRegistry::RegisterObjectTargetStreamer(
+ *T, createRISCVObjectTargetStreamer);
+
+ // Register the asm target streamer.
+ TargetRegistry::RegisterAsmTargetStreamer(*T, createRISCVAsmTargetStreamer);
}
}
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h b/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h
index bea2f8800fa6..0228253c08cb 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h
@@ -24,7 +24,7 @@ class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
-class MCObjectWriter;
+class MCObjectTargetWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
class StringRef;
@@ -40,12 +40,12 @@ MCCodeEmitter *createRISCVMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
MCContext &Ctx);
-MCAsmBackend *createRISCVAsmBackend(const Target &T, const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
+MCAsmBackend *createRISCVAsmBackend(const Target &T, const MCSubtargetInfo &STI,
+ const MCRegisterInfo &MRI,
const MCTargetOptions &Options);
-std::unique_ptr<MCObjectWriter>
-createRISCVELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI, bool Is64Bit);
+std::unique_ptr<MCObjectTargetWriter> createRISCVELFObjectWriter(uint8_t OSABI,
+ bool Is64Bit);
}
// Defines symbolic names for RISC-V registers.
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
new file mode 100644
index 000000000000..2d5205aa7ef7
--- /dev/null
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
@@ -0,0 +1,32 @@
+//===-- RISCVTargetStreamer.cpp - RISCV Target Streamer Methods -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides RISCV specific target streamer methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCVTargetStreamer.h"
+#include "llvm/Support/FormattedStream.h"
+
+using namespace llvm;
+
+RISCVTargetStreamer::RISCVTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+
+// This part is for ascii assembly output
+RISCVTargetAsmStreamer::RISCVTargetAsmStreamer(MCStreamer &S,
+ formatted_raw_ostream &OS)
+ : RISCVTargetStreamer(S), OS(OS) {}
+
+void RISCVTargetAsmStreamer::emitDirectiveOptionRVC() {
+ OS << "\t.option\trvc\n";
+}
+
+void RISCVTargetAsmStreamer::emitDirectiveOptionNoRVC() {
+ OS << "\t.option\tnorvc\n";
+}
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h b/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h
new file mode 100644
index 000000000000..525c20810f24
--- /dev/null
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h
@@ -0,0 +1,37 @@
+//===-- RISCVTargetStreamer.h - RISCV Target Streamer ----------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_RISCV_RISCVTARGETSTREAMER_H
+#define LLVM_LIB_TARGET_RISCV_RISCVTARGETSTREAMER_H
+
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+
+class RISCVTargetStreamer : public MCTargetStreamer {
+public:
+ RISCVTargetStreamer(MCStreamer &S);
+
+ virtual void emitDirectiveOptionRVC() = 0;
+ virtual void emitDirectiveOptionNoRVC() = 0;
+};
+
+// This part is for ascii assembly output
+class RISCVTargetAsmStreamer : public RISCVTargetStreamer {
+ formatted_raw_ostream &OS;
+
+public:
+ RISCVTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
+
+ void emitDirectiveOptionRVC() override;
+ void emitDirectiveOptionNoRVC() override;
+};
+
+}
+#endif
diff --git a/lib/Target/RISCV/RISCV.h b/lib/Target/RISCV/RISCV.h
index 884cb2e5014d..2e4f536aca35 100644
--- a/lib/Target/RISCV/RISCV.h
+++ b/lib/Target/RISCV/RISCV.h
@@ -25,6 +25,7 @@ class MCInst;
class MCOperand;
class MachineInstr;
class MachineOperand;
+class PassRegistry;
void LowerRISCVMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
const AsmPrinter &AP);
@@ -32,6 +33,9 @@ bool LowerRISCVMachineOperandToMCOperand(const MachineOperand &MO,
MCOperand &MCOp, const AsmPrinter &AP);
FunctionPass *createRISCVISelDag(RISCVTargetMachine &TM);
+
+FunctionPass *createRISCVMergeBaseOffsetOptPass();
+void initializeRISCVMergeBaseOffsetOptPass(PassRegistry &);
}
#endif
diff --git a/lib/Target/RISCV/RISCV.td b/lib/Target/RISCV/RISCV.td
index c74d560b2e03..281378cb2eee 100644
--- a/lib/Target/RISCV/RISCV.td
+++ b/lib/Target/RISCV/RISCV.td
@@ -55,6 +55,10 @@ def IsRV32 : Predicate<"!Subtarget->is64Bit()">,
def RV64 : HwMode<"+64bit">;
def RV32 : HwMode<"-64bit">;
+def FeatureRelax
+ : SubtargetFeature<"relax", "EnableLinkerRelax", "true",
+ "Enable Linker relaxation.">;
+
//===----------------------------------------------------------------------===//
// Registers, calling conventions, instruction descriptions.
//===----------------------------------------------------------------------===//
@@ -84,7 +88,13 @@ def RISCVAsmParser : AsmParser {
let AllowDuplicateRegisterNames = 1;
}
+def RISCVAsmWriter : AsmWriter {
+ int PassSubtarget = 1;
+}
+
def RISCV : Target {
let InstructionSet = RISCVInstrInfo;
let AssemblyParsers = [RISCVAsmParser];
+ let AssemblyWriters = [RISCVAsmWriter];
+ let AllowRegisterRenaming = 1;
}
diff --git a/lib/Target/RISCV/RISCVAsmPrinter.cpp b/lib/Target/RISCV/RISCVAsmPrinter.cpp
index 4808e6c73c50..bdf8e5d840b3 100644
--- a/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -14,6 +14,7 @@
#include "RISCV.h"
#include "InstPrinter/RISCVInstPrinter.h"
+#include "MCTargetDesc/RISCVMCExpr.h"
#include "RISCVTargetMachine.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineConstantPool.h"
@@ -41,6 +42,14 @@ public:
void EmitInstruction(const MachineInstr *MI) override;
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &OS) override;
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &OS) override;
+
+ void EmitToStreamer(MCStreamer &S, const MCInst &Inst);
bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
const MachineInstr *MI);
@@ -51,6 +60,15 @@ public:
};
}
+#define GEN_COMPRESS_INSTR
+#include "RISCVGenCompressInstEmitter.inc"
+void RISCVAsmPrinter::EmitToStreamer(MCStreamer &S, const MCInst &Inst) {
+ MCInst CInst;
+ bool Res = compressInst(CInst, Inst, *TM.getMCSubtargetInfo(),
+ OutStreamer->getContext());
+ AsmPrinter::EmitToStreamer(*OutStreamer, Res ? CInst : Inst);
+}
+
// Simple pseudo-instructions have their lowering (with expansion to real
// instructions) auto-generated.
#include "RISCVGenMCPseudoLowering.inc"
@@ -65,6 +83,54 @@ void RISCVAsmPrinter::EmitInstruction(const MachineInstr *MI) {
EmitToStreamer(*OutStreamer, TmpInst);
}
+bool RISCVAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode, raw_ostream &OS) {
+ if (AsmVariant != 0)
+ report_fatal_error("There are no defined alternate asm variants");
+
+ // First try the generic code, which knows about modifiers like 'c' and 'n'.
+ if (!AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, OS))
+ return false;
+
+ if (!ExtraCode) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ switch (MO.getType()) {
+ case MachineOperand::MO_Immediate:
+ OS << MO.getImm();
+ return false;
+ case MachineOperand::MO_Register:
+ OS << RISCVInstPrinter::getRegisterName(MO.getReg());
+ return false;
+ default:
+ break;
+ }
+ }
+
+ return true;
+}
+
+bool RISCVAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNo, unsigned AsmVariant,
+ const char *ExtraCode,
+ raw_ostream &OS) {
+ if (AsmVariant != 0)
+ report_fatal_error("There are no defined alternate asm variants");
+
+ if (!ExtraCode) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+    // For now, we only support memory operands that are plain base registers
+    // and assume there is no addend.
+ if (!MO.isReg())
+ return true;
+
+ OS << "0(" << RISCVInstPrinter::getRegisterName(MO.getReg()) << ")";
+ return false;
+ }
+
+ return AsmPrinter::PrintAsmMemoryOperand(MI, OpNo, AsmVariant, ExtraCode, OS);
+}
+
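A minimal sketch of how the memory-operand path above is exercised, assuming a GCC-style inline asm "m" constraint (the function and identifiers below are illustrative, not part of the patch): SelectInlineAsmMemoryOperand selects a single base register for the address, and PrintAsmMemoryOperand then renders it with the fixed zero addend.

    // Illustrative only: with the printing logic above, the %0 memory operand
    // is expected to appear in the output as "0(<reg>)" for whatever GPR
    // holds the address of *p.
    void store_answer(int *p) {
      asm volatile("sw %1, %0" : "=m"(*p) : "r"(42));
    }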
// Force static initialization.
extern "C" void LLVMInitializeRISCVAsmPrinter() {
RegisterAsmPrinter<RISCVAsmPrinter> X(getTheRISCV32Target());
diff --git a/lib/Target/RISCV/RISCVCallingConv.td b/lib/Target/RISCV/RISCVCallingConv.td
index d2b17c64c9c2..ef146258c383 100644
--- a/lib/Target/RISCV/RISCVCallingConv.td
+++ b/lib/Target/RISCV/RISCVCallingConv.td
@@ -18,3 +18,40 @@ def CSR : CalleeSavedRegs<(add X1, X3, X4, X8, X9, (sequence "X%u", 18, 27))>;
// Needed for implementation of RISCVRegisterInfo::getNoPreservedMask()
def CSR_NoRegs : CalleeSavedRegs<(add)>;
+
+// An interrupt handler must save and restore every register it uses, both
+// caller-saved and callee-saved registers.
+def CSR_Interrupt : CalleeSavedRegs<(add X1,
+ (sequence "X%u", 3, 9),
+ (sequence "X%u", 10, 11),
+ (sequence "X%u", 12, 17),
+ (sequence "X%u", 18, 27),
+ (sequence "X%u", 28, 31))>;
+
+// Same as CSR_Interrupt, but including all 32-bit FP registers.
+def CSR_XLEN_F32_Interrupt: CalleeSavedRegs<(add X1,
+ (sequence "X%u", 3, 9),
+ (sequence "X%u", 10, 11),
+ (sequence "X%u", 12, 17),
+ (sequence "X%u", 18, 27),
+ (sequence "X%u", 28, 31),
+ (sequence "F%u_32", 0, 7),
+ (sequence "F%u_32", 10, 11),
+ (sequence "F%u_32", 12, 17),
+ (sequence "F%u_32", 28, 31),
+ (sequence "F%u_32", 8, 9),
+ (sequence "F%u_32", 18, 27))>;
+
+// Same as CSR_Interrupt, but including all 64-bit FP registers.
+def CSR_XLEN_F64_Interrupt: CalleeSavedRegs<(add X1,
+ (sequence "X%u", 3, 9),
+ (sequence "X%u", 10, 11),
+ (sequence "X%u", 12, 17),
+ (sequence "X%u", 18, 27),
+ (sequence "X%u", 28, 31),
+ (sequence "F%u_64", 0, 7),
+ (sequence "F%u_64", 10, 11),
+ (sequence "F%u_64", 12, 17),
+ (sequence "F%u_64", 28, 31),
+ (sequence "F%u_64", 8, 9),
+ (sequence "F%u_64", 18, 27))>;
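These lists back the interrupt-handler support added elsewhere in this patch: a function carrying the "interrupt" attribute saves and restores everything it may clobber, including FP registers when the F or D extension is present. A minimal sketch of such a handler, assuming Clang's attribute spelling and the values accepted by the checks in LowerFormalArguments below ("user", "supervisor", "machine"):

    // Illustrative handler only. It must take no arguments and return void;
    // both restrictions are enforced in LowerFormalArguments in this patch.
    volatile int TickCount;

    __attribute__((interrupt("machine"))) void timer_handler(void) {
      ++TickCount; // registers clobbered here are saved/restored automatically
    }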
diff --git a/lib/Target/RISCV/RISCVFrameLowering.cpp b/lib/Target/RISCV/RISCVFrameLowering.cpp
index e9e003e63d59..a816028f9d8b 100644
--- a/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -12,15 +12,24 @@
//===----------------------------------------------------------------------===//
#include "RISCVFrameLowering.h"
+#include "RISCVMachineFunctionInfo.h"
#include "RISCVSubtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
using namespace llvm;
-bool RISCVFrameLowering::hasFP(const MachineFunction &MF) const { return true; }
+bool RISCVFrameLowering::hasFP(const MachineFunction &MF) const {
+ const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ return MF.getTarget().Options.DisableFramePointerElim(MF) ||
+ RegInfo->needsStackRealignment(MF) || MFI.hasVarSizedObjects() ||
+ MFI.isFrameAddressTaken();
+}
// Determines the size of the frame and maximum call frame size.
void RISCVFrameLowering::determineFrameLayout(MachineFunction &MF) const {
@@ -34,21 +43,6 @@ void RISCVFrameLowering::determineFrameLayout(MachineFunction &MF) const {
uint64_t StackAlign = RI->needsStackRealignment(MF) ? MFI.getMaxAlignment()
: getStackAlignment();
- // Get the maximum call frame size of all the calls.
- uint64_t MaxCallFrameSize = MFI.getMaxCallFrameSize();
-
- // If we have dynamic alloca then MaxCallFrameSize needs to be aligned so
- // that allocations will be aligned.
- if (MFI.hasVarSizedObjects())
- MaxCallFrameSize = alignTo(MaxCallFrameSize, StackAlign);
-
- // Update maximum call frame size.
- MFI.setMaxCallFrameSize(MaxCallFrameSize);
-
- // Include call frame size in total.
- if (!(hasReservedCallFrame(MF) && MFI.adjustsStack()))
- FrameSize += MaxCallFrameSize;
-
// Make sure the frame is aligned.
FrameSize = alignTo(FrameSize, StackAlign);
@@ -61,18 +55,34 @@ void RISCVFrameLowering::adjustReg(MachineBasicBlock &MBB,
const DebugLoc &DL, unsigned DestReg,
unsigned SrcReg, int64_t Val,
MachineInstr::MIFlag Flag) const {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
const RISCVInstrInfo *TII = STI.getInstrInfo();
if (DestReg == SrcReg && Val == 0)
return;
- if (!isInt<12>(Val))
- report_fatal_error("adjustReg cannot yet handle adjustments >12 bits");
-
- BuildMI(MBB, MBBI, DL, TII->get(RISCV::ADDI), DestReg)
- .addReg(SrcReg)
- .addImm(Val)
- .setMIFlag(Flag);
+ if (isInt<12>(Val)) {
+ BuildMI(MBB, MBBI, DL, TII->get(RISCV::ADDI), DestReg)
+ .addReg(SrcReg)
+ .addImm(Val)
+ .setMIFlag(Flag);
+ } else if (isInt<32>(Val)) {
+ unsigned Opc = RISCV::ADD;
+ bool isSub = Val < 0;
+ if (isSub) {
+ Val = -Val;
+ Opc = RISCV::SUB;
+ }
+
+ unsigned ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass);
+ TII->movImm32(MBB, MBBI, DL, ScratchReg, Val, Flag);
+ BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
+ .addReg(SrcReg)
+ .addReg(ScratchReg, RegState::Kill)
+ .setMIFlag(Flag);
+ } else {
+ report_fatal_error("adjustReg cannot yet handle adjustments >32 bits");
+ }
}
// Returns the register used to hold the frame pointer.
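A rough sketch of what the new >12-bit path means in practice, with register names chosen purely for illustration: an adjustment too large for ADDI's signed 12-bit immediate is now built in a scratch register, e.g. lui t0, 1; addi t0, t0, 16; sub sp, sp, t0 for a 4112-byte adjustment, where the old code would have hit report_fatal_error.

    // Illustrative only: a local buffer this large pushes the stack
    // adjustment past the 12-bit ADDI range, so the prologue must use the
    // scratch-register sequence from adjustReg above (exact frame size and
    // register choice will vary).
    int sum_big_buffer(void) {
      volatile char Buf[4096];
      int Total = 0;
      for (unsigned I = 0; I < sizeof(Buf); ++I) {
        Buf[I] = static_cast<char>(I);
        Total += Buf[I];
      }
      return Total;
    }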
@@ -85,12 +95,8 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
- if (!hasFP(MF)) {
- report_fatal_error(
- "emitPrologue doesn't support framepointer-less functions");
- }
-
MachineFrameInfo &MFI = MF.getFrameInfo();
+ auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
MachineBasicBlock::iterator MBBI = MBB.begin();
unsigned FPReg = getFPReg(STI);
@@ -124,19 +130,17 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
std::advance(MBBI, CSI.size());
// Generate new FP.
- adjustReg(MBB, MBBI, DL, FPReg, SPReg, StackSize, MachineInstr::FrameSetup);
+ if (hasFP(MF))
+ adjustReg(MBB, MBBI, DL, FPReg, SPReg,
+ StackSize - RVFI->getVarArgsSaveSize(), MachineInstr::FrameSetup);
}
void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- if (!hasFP(MF)) {
- report_fatal_error(
- "emitEpilogue doesn't support framepointer-less functions");
- }
-
MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
const RISCVRegisterInfo *RI = STI.getRegisterInfo();
MachineFrameInfo &MFI = MF.getFrameInfo();
+ auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
DebugLoc DL = MBBI->getDebugLoc();
unsigned FPReg = getFPReg(STI);
unsigned SPReg = getSPReg(STI);
@@ -153,7 +157,9 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
// necessary if the stack pointer was modified, meaning the stack size is
// unknown.
if (RI->needsStackRealignment(MF) || MFI.hasVarSizedObjects()) {
- adjustReg(MBB, LastFrameDestroy, DL, SPReg, FPReg, -StackSize,
+ assert(hasFP(MF) && "frame pointer should not have been eliminated");
+ adjustReg(MBB, LastFrameDestroy, DL, SPReg, FPReg,
+ -StackSize + RVFI->getVarArgsSaveSize(),
MachineInstr::FrameDestroy);
}
@@ -166,6 +172,7 @@ int RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF,
unsigned &FrameReg) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
+ const auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
// Callee-saved registers should be referenced relative to the stack
// pointer (positive offset), otherwise use the frame pointer (negative
@@ -182,10 +189,15 @@ int RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF,
MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
}
- FrameReg = RI->getFrameRegister(MF);
if (FI >= MinCSFI && FI <= MaxCSFI) {
FrameReg = RISCV::X2;
Offset += MF.getFrameInfo().getStackSize();
+ } else {
+ FrameReg = RI->getFrameRegister(MF);
+ if (hasFP(MF))
+ Offset += RVFI->getVarArgsSaveSize();
+ else
+ Offset += MF.getFrameInfo().getStackSize();
}
return Offset;
}
@@ -194,8 +206,94 @@ void RISCVFrameLowering::determineCalleeSaves(MachineFunction &MF,
BitVector &SavedRegs,
RegScavenger *RS) const {
TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
- // TODO: Once frame pointer elimination is implemented, don't
- // unconditionally spill the frame pointer and return address.
- SavedRegs.set(RISCV::X1);
- SavedRegs.set(RISCV::X8);
+ // Unconditionally spill RA and FP only if the function uses a frame
+ // pointer.
+ if (hasFP(MF)) {
+ SavedRegs.set(RISCV::X1);
+ SavedRegs.set(RISCV::X8);
+ }
+
+  // If the interrupt attribute is present and there are calls in the handler,
+  // unconditionally save all caller-saved registers and all FP registers,
+  // regardless of whether they are used.
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ if (MF.getFunction().hasFnAttribute("interrupt") && MFI.hasCalls()) {
+
+ static const MCPhysReg CSRegs[] = { RISCV::X1, /* ra */
+ RISCV::X5, RISCV::X6, RISCV::X7, /* t0-t2 */
+ RISCV::X10, RISCV::X11, /* a0-a1, a2-a7 */
+ RISCV::X12, RISCV::X13, RISCV::X14, RISCV::X15, RISCV::X16, RISCV::X17,
+ RISCV::X28, RISCV::X29, RISCV::X30, RISCV::X31, 0 /* t3-t6 */
+ };
+
+ for (unsigned i = 0; CSRegs[i]; ++i)
+ SavedRegs.set(CSRegs[i]);
+
+ if (MF.getSubtarget<RISCVSubtarget>().hasStdExtD() ||
+ MF.getSubtarget<RISCVSubtarget>().hasStdExtF()) {
+
+ // If interrupt is enabled, this list contains all FP registers.
+      const MCPhysReg *Regs = MF.getRegInfo().getCalleeSavedRegs();
+
+ for (unsigned i = 0; Regs[i]; ++i)
+ if (RISCV::FPR32RegClass.contains(Regs[i]) ||
+ RISCV::FPR64RegClass.contains(Regs[i]))
+ SavedRegs.set(Regs[i]);
+ }
+ }
+}
+
+void RISCVFrameLowering::processFunctionBeforeFrameFinalized(
+ MachineFunction &MF, RegScavenger *RS) const {
+ const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const TargetRegisterClass *RC = &RISCV::GPRRegClass;
+  // estimateStackSize has been observed to under-estimate the final stack
+  // size, so give ourselves wiggle-room by checking for a stack size
+  // representable in an 11-bit signed field rather than 12 bits.
+ // FIXME: It may be possible to craft a function with a small stack that
+ // still needs an emergency spill slot for branch relaxation. This case
+ // would currently be missed.
+ if (!isInt<11>(MFI.estimateStackSize(MF))) {
+ int RegScavFI = MFI.CreateStackObject(
+ RegInfo->getSpillSize(*RC), RegInfo->getSpillAlignment(*RC), false);
+ RS->addScavengingFrameIndex(RegScavFI);
+ }
+}
+
+// Do not reserve stack space within the prologue for outgoing variables when
+// the function contains variable-sized objects; instead, let
+// eliminateCallFramePseudoInstr reserve that space.
+bool RISCVFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+ return !MF.getFrameInfo().hasVarSizedObjects();
+}
+
+// Eliminate ADJCALLSTACKDOWN, ADJCALLSTACKUP pseudo instructions.
+MachineBasicBlock::iterator RISCVFrameLowering::eliminateCallFramePseudoInstr(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const {
+ unsigned SPReg = RISCV::X2;
+ DebugLoc DL = MI->getDebugLoc();
+
+ if (!hasReservedCallFrame(MF)) {
+ // If space has not been reserved for a call frame, ADJCALLSTACKDOWN and
+ // ADJCALLSTACKUP must be converted to instructions manipulating the stack
+ // pointer. This is necessary when there is a variable length stack
+ // allocation (e.g. alloca), which means it's not possible to allocate
+ // space for outgoing arguments from within the function prologue.
+ int64_t Amount = MI->getOperand(0).getImm();
+
+ if (Amount != 0) {
+ // Ensure the stack remains aligned after adjustment.
+ Amount = alignSPAdjust(Amount);
+
+ if (MI->getOpcode() == RISCV::ADJCALLSTACKDOWN)
+ Amount = -Amount;
+
+ adjustReg(MBB, MI, DL, SPReg, SPReg, Amount, MachineInstr::NoFlags);
+ }
+ }
+
+ return MBB.erase(MI);
}
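A hypothetical example of when this path fires: a variable-length alloca makes hasReservedCallFrame() return false, so the ADJCALLSTACKDOWN/ADJCALLSTACKUP pseudos around each call become explicit stack-pointer updates here rather than being folded into the prologue (the names below are illustrative).

    // Illustrative only: the alloca makes the frame size unknown when the
    // prologue is emitted, so any stack space a call needs for its arguments
    // is allocated and freed around the call site instead.
    extern void use_buffer(char *Buf, unsigned Len);

    void make_and_use(unsigned Len) {
      char *Buf = static_cast<char *>(__builtin_alloca(Len));
      use_buffer(Buf, Len);
    }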
diff --git a/lib/Target/RISCV/RISCVFrameLowering.h b/lib/Target/RISCV/RISCVFrameLowering.h
index d92bb70c76da..ca653c2b9f17 100644
--- a/lib/Target/RISCV/RISCVFrameLowering.h
+++ b/lib/Target/RISCV/RISCVFrameLowering.h
@@ -36,13 +36,15 @@ public:
void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS) const override;
+ void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+ RegScavenger *RS) const override;
+
bool hasFP(const MachineFunction &MF) const override;
+ bool hasReservedCallFrame(const MachineFunction &MF) const override;
MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI) const override {
- return MBB.erase(MI);
- }
+ MachineBasicBlock::iterator MI) const override;
protected:
const RISCVSubtarget &STI;
diff --git a/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 113a45ac7cc0..04441b9a9b15 100644
--- a/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -42,25 +42,36 @@ public:
return SelectionDAGISel::runOnMachineFunction(MF);
}
+ void PostprocessISelDAG() override;
+
void Select(SDNode *Node) override;
+ bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
+ std::vector<SDValue> &OutOps) override;
+
bool SelectAddrFI(SDValue Addr, SDValue &Base);
// Include the pieces autogenerated from the target description.
#include "RISCVGenDAGISel.inc"
+
+private:
+ void doPeepholeLoadStoreADDI();
+ void doPeepholeBuildPairF64SplitF64();
};
}
+void RISCVDAGToDAGISel::PostprocessISelDAG() {
+ doPeepholeLoadStoreADDI();
+ doPeepholeBuildPairF64SplitF64();
+}
+
void RISCVDAGToDAGISel::Select(SDNode *Node) {
unsigned Opcode = Node->getOpcode();
MVT XLenVT = Subtarget->getXLenVT();
- // Dump information about the Node being selected.
- DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << "\n");
-
// If we have a custom node, we have already selected
if (Node->isMachineOpcode()) {
- DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << "\n");
Node->setNodeId(-1);
return;
}
@@ -82,7 +93,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
if (Opcode == ISD::FrameIndex) {
SDLoc DL(Node);
SDValue Imm = CurDAG->getTargetConstant(0, DL, XLenVT);
- int FI = dyn_cast<FrameIndexSDNode>(Node)->getIndex();
+ int FI = cast<FrameIndexSDNode>(Node)->getIndex();
EVT VT = Node->getValueType(0);
SDValue TFI = CurDAG->getTargetFrameIndex(FI, VT);
ReplaceNode(Node, CurDAG->getMachineNode(RISCV::ADDI, DL, VT, TFI, Imm));
@@ -93,6 +104,22 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
SelectCode(Node);
}
+bool RISCVDAGToDAGISel::SelectInlineAsmMemoryOperand(
+ const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
+ switch (ConstraintID) {
+ case InlineAsm::Constraint_i:
+ case InlineAsm::Constraint_m:
+    // We only support simple memory operands that have a single address
+    // operand and need no special handling.
+ OutOps.push_back(Op);
+ return false;
+ default:
+ break;
+ }
+
+ return true;
+}
+
bool RISCVDAGToDAGISel::SelectAddrFI(SDValue Addr, SDValue &Base) {
if (auto FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), Subtarget->getXLenVT());
@@ -101,6 +128,131 @@ bool RISCVDAGToDAGISel::SelectAddrFI(SDValue Addr, SDValue &Base) {
return false;
}
+// Merge an ADDI into the offset of a load/store instruction where possible.
+// (load (add base, off), 0) -> (load base, off)
+// (store val, (add base, off)) -> (store val, base, off)
+void RISCVDAGToDAGISel::doPeepholeLoadStoreADDI() {
+ SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
+ ++Position;
+
+ while (Position != CurDAG->allnodes_begin()) {
+ SDNode *N = &*--Position;
+ // Skip dead nodes and any non-machine opcodes.
+ if (N->use_empty() || !N->isMachineOpcode())
+ continue;
+
+ int OffsetOpIdx;
+ int BaseOpIdx;
+
+ // Only attempt this optimisation for I-type loads and S-type stores.
+ switch (N->getMachineOpcode()) {
+ default:
+ continue;
+ case RISCV::LB:
+ case RISCV::LH:
+ case RISCV::LW:
+ case RISCV::LBU:
+ case RISCV::LHU:
+ case RISCV::LWU:
+ case RISCV::LD:
+ case RISCV::FLW:
+ case RISCV::FLD:
+ BaseOpIdx = 0;
+ OffsetOpIdx = 1;
+ break;
+ case RISCV::SB:
+ case RISCV::SH:
+ case RISCV::SW:
+ case RISCV::SD:
+ case RISCV::FSW:
+ case RISCV::FSD:
+ BaseOpIdx = 1;
+ OffsetOpIdx = 2;
+ break;
+ }
+
+ // Currently, the load/store offset must be 0 to be considered for this
+ // peephole optimisation.
+ if (!isa<ConstantSDNode>(N->getOperand(OffsetOpIdx)) ||
+ N->getConstantOperandVal(OffsetOpIdx) != 0)
+ continue;
+
+ SDValue Base = N->getOperand(BaseOpIdx);
+
+ // If the base is an ADDI, we can merge it in to the load/store.
+ if (!Base.isMachineOpcode() || Base.getMachineOpcode() != RISCV::ADDI)
+ continue;
+
+ SDValue ImmOperand = Base.getOperand(1);
+
+ if (auto Const = dyn_cast<ConstantSDNode>(ImmOperand)) {
+ ImmOperand = CurDAG->getTargetConstant(
+ Const->getSExtValue(), SDLoc(ImmOperand), ImmOperand.getValueType());
+ } else if (auto GA = dyn_cast<GlobalAddressSDNode>(ImmOperand)) {
+ ImmOperand = CurDAG->getTargetGlobalAddress(
+ GA->getGlobal(), SDLoc(ImmOperand), ImmOperand.getValueType(),
+ GA->getOffset(), GA->getTargetFlags());
+ } else {
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << "Folding add-immediate into mem-op:\nBase: ");
+ LLVM_DEBUG(Base->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\nN: ");
+ LLVM_DEBUG(N->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\n");
+
+ // Modify the offset operand of the load/store.
+ if (BaseOpIdx == 0) // Load
+ CurDAG->UpdateNodeOperands(N, Base.getOperand(0), ImmOperand,
+ N->getOperand(2));
+ else // Store
+ CurDAG->UpdateNodeOperands(N, N->getOperand(0), Base.getOperand(0),
+ ImmOperand, N->getOperand(3));
+
+ // The add-immediate may now be dead, in which case remove it.
+ if (Base.getNode()->use_empty())
+ CurDAG->RemoveDeadNode(Base.getNode());
+ }
+}
+
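Roughly, and with register names purely illustrative, the peephole above rewrites lui a0, %hi(g); addi a0, a0, %lo(g); lw a1, 0(a0) into lui a0, %hi(g); lw a1, %lo(g)(a0), folding the low-part addend into the load's offset once the ADDI has no other users. A tiny source that produces the pattern:

    // Illustrative only: loading a global goes through the LUI+ADDI lowering
    // and a zero-offset LW, exactly the shape doPeepholeLoadStoreADDI folds.
    int GlobalCounter;

    int read_counter() { return GlobalCounter; }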
+// Remove redundant BuildPairF64+SplitF64 pairs. i.e. cases where an f64 is
+// built of two i32 values, only to be split apart again. This must be done
+// here as a peephole optimisation as the DAG has not been fully legalized at
+// the point BuildPairF64/SplitF64 nodes are created in RISCVISelLowering, so
+// some nodes would not yet have been replaced with libcalls.
+void RISCVDAGToDAGISel::doPeepholeBuildPairF64SplitF64() {
+ SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
+ ++Position;
+
+ while (Position != CurDAG->allnodes_begin()) {
+ SDNode *N = &*--Position;
+ // Skip dead nodes and any nodes other than SplitF64Pseudo.
+ if (N->use_empty() || !N->isMachineOpcode() ||
+ !(N->getMachineOpcode() == RISCV::SplitF64Pseudo))
+ continue;
+
+ // If the operand to SplitF64 is a BuildPairF64, the split operation is
+ // redundant. Just use the operands to BuildPairF64 as the result.
+ SDValue F64Val = N->getOperand(0);
+ if (F64Val.isMachineOpcode() &&
+ F64Val.getMachineOpcode() == RISCV::BuildPairF64Pseudo) {
+ LLVM_DEBUG(
+ dbgs() << "Removing redundant SplitF64Pseudo and replacing uses "
+ "with BuildPairF64Pseudo operands:\n");
+ LLVM_DEBUG(dbgs() << "N: ");
+ LLVM_DEBUG(N->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "F64Val: ");
+ LLVM_DEBUG(F64Val->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\n");
+ SDValue From[] = {SDValue(N, 0), SDValue(N, 1)};
+ SDValue To[] = {F64Val.getOperand(0), F64Val.getOperand(1)};
+ CurDAG->ReplaceAllUsesOfValuesWith(From, To, 2);
+ }
+ }
+ CurDAG->RemoveDeadNodes();
+}
+
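One way the redundant pair arises (a hedged illustration, not spelled out in the patch): on RV32 with the D extension and the soft-float argument handling added in this change, an incoming double is reassembled from two GPRs with BuildPairF64 and, if it is only forwarded to another call, immediately split again with SplitF64; the peephole above forwards the original i32 halves instead.

    // Illustrative only: the incoming double is rebuilt and then re-split for
    // the outgoing call, producing the SplitF64Pseudo(BuildPairF64Pseudo)
    // shape that doPeepholeBuildPairF64SplitF64 collapses.
    extern double sink(double);

    double pass_through(double X) { return sink(X); }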
// This pass converts a legalized DAG into a RISCV-specific DAG, ready
// for instruction scheduling.
FunctionPass *llvm::createRISCVISelDag(RISCVTargetMachine &TM) {
diff --git a/lib/Target/RISCV/RISCVISelLowering.cpp b/lib/Target/RISCV/RISCVISelLowering.cpp
index 7d32954936be..87796e5b1097 100644
--- a/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -14,9 +14,11 @@
#include "RISCVISelLowering.h"
#include "RISCV.h"
+#include "RISCVMachineFunctionInfo.h"
#include "RISCVRegisterInfo.h"
#include "RISCVSubtarget.h"
#include "RISCVTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -35,6 +37,8 @@ using namespace llvm;
#define DEBUG_TYPE "riscv-lower"
+STATISTIC(NumTailCalls, "Number of tail calls");
+
RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
const RISCVSubtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
@@ -44,6 +48,11 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
// Set up the register classes.
addRegisterClass(XLenVT, &RISCV::GPRRegClass);
+ if (Subtarget.hasStdExtF())
+ addRegisterClass(MVT::f32, &RISCV::FPR32RegClass);
+ if (Subtarget.hasStdExtD())
+ addRegisterClass(MVT::f64, &RISCV::FPR64RegClass);
+
// Compute derived properties from the register classes.
computeRegisterProperties(STI.getRegisterInfo());
@@ -63,26 +72,28 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+ setOperationAction(ISD::VASTART, MVT::Other, Custom);
+ setOperationAction(ISD::VAARG, MVT::Other, Expand);
+ setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+ setOperationAction(ISD::VAEND, MVT::Other, Expand);
+
for (auto VT : {MVT::i1, MVT::i8, MVT::i16})
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
- setOperationAction(ISD::ADDC, XLenVT, Expand);
- setOperationAction(ISD::ADDE, XLenVT, Expand);
- setOperationAction(ISD::SUBC, XLenVT, Expand);
- setOperationAction(ISD::SUBE, XLenVT, Expand);
+ if (!Subtarget.hasStdExtM()) {
+ setOperationAction(ISD::MUL, XLenVT, Expand);
+ setOperationAction(ISD::MULHS, XLenVT, Expand);
+ setOperationAction(ISD::MULHU, XLenVT, Expand);
+ setOperationAction(ISD::SDIV, XLenVT, Expand);
+ setOperationAction(ISD::UDIV, XLenVT, Expand);
+ setOperationAction(ISD::SREM, XLenVT, Expand);
+ setOperationAction(ISD::UREM, XLenVT, Expand);
+ }
- setOperationAction(ISD::SREM, XLenVT, Expand);
setOperationAction(ISD::SDIVREM, XLenVT, Expand);
- setOperationAction(ISD::SDIV, XLenVT, Expand);
- setOperationAction(ISD::UREM, XLenVT, Expand);
setOperationAction(ISD::UDIVREM, XLenVT, Expand);
- setOperationAction(ISD::UDIV, XLenVT, Expand);
-
- setOperationAction(ISD::MUL, XLenVT, Expand);
setOperationAction(ISD::SMUL_LOHI, XLenVT, Expand);
setOperationAction(ISD::UMUL_LOHI, XLenVT, Expand);
- setOperationAction(ISD::MULHS, XLenVT, Expand);
- setOperationAction(ISD::MULHU, XLenVT, Expand);
setOperationAction(ISD::SHL_PARTS, XLenVT, Expand);
setOperationAction(ISD::SRL_PARTS, XLenVT, Expand);
@@ -95,19 +106,128 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTLZ, XLenVT, Expand);
setOperationAction(ISD::CTPOP, XLenVT, Expand);
+ ISD::CondCode FPCCToExtend[] = {
+ ISD::SETOGT, ISD::SETOGE, ISD::SETONE, ISD::SETO, ISD::SETUEQ,
+ ISD::SETUGT, ISD::SETUGE, ISD::SETULT, ISD::SETULE, ISD::SETUNE,
+ ISD::SETGT, ISD::SETGE, ISD::SETNE};
+
+ if (Subtarget.hasStdExtF()) {
+ setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
+ for (auto CC : FPCCToExtend)
+ setCondCodeAction(CC, MVT::f32, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+ setOperationAction(ISD::SELECT, MVT::f32, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+ }
+
+ if (Subtarget.hasStdExtD()) {
+ setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
+ for (auto CC : FPCCToExtend)
+ setCondCodeAction(CC, MVT::f64, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
+ setOperationAction(ISD::SELECT, MVT::f64, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f64, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+ }
+
setOperationAction(ISD::GlobalAddress, XLenVT, Custom);
setOperationAction(ISD::BlockAddress, XLenVT, Custom);
+ setOperationAction(ISD::ConstantPool, XLenVT, Custom);
+
+ if (Subtarget.hasStdExtA())
+ setMaxAtomicSizeInBitsSupported(Subtarget.getXLen());
+ else
+ setMaxAtomicSizeInBitsSupported(0);
setBooleanContents(ZeroOrOneBooleanContent);
// Function alignments (log2).
- setMinFunctionAlignment(3);
- setPrefFunctionAlignment(3);
+ unsigned FunctionAlignment = Subtarget.hasStdExtC() ? 1 : 2;
+ setMinFunctionAlignment(FunctionAlignment);
+ setPrefFunctionAlignment(FunctionAlignment);
// Effectively disable jump table generation.
setMinimumJumpTableEntries(INT_MAX);
}
+EVT RISCVTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
+ EVT VT) const {
+ if (!VT.isVector())
+ return getPointerTy(DL);
+ return VT.changeVectorElementTypeToInteger();
+}
+
+bool RISCVTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
+ unsigned AS,
+ Instruction *I) const {
+ // No global is ever allowed as a base.
+ if (AM.BaseGV)
+ return false;
+
+ // Require a 12-bit signed offset.
+ if (!isInt<12>(AM.BaseOffs))
+ return false;
+
+ switch (AM.Scale) {
+ case 0: // "r+i" or just "i", depending on HasBaseReg.
+ break;
+ case 1:
+ if (!AM.HasBaseReg) // allow "r+i".
+ break;
+ return false; // disallow "r+r" or "r+r+i".
+ default:
+ return false;
+ }
+
+ return true;
+}
+
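In concrete terms, the hook above accepts only "register + signed 12-bit immediate" (or a bare immediate) addresses, matching what RISC-V loads and stores can encode; scaled or register+register forms are rejected so passes such as LoopStrengthReduce do not form them. A small sketch of the consequences, with offsets chosen for illustration:

    // Illustrative only: which array accesses fold into a single lw offset.
    int pick(int *P, int I) {
      int A = P[10];   // base + 40: fits the signed 12-bit range, folds in
      int B = P[1000]; // base + 4000: too large, needs a separate address add
      int C = P[I];    // "r+r" is not a legal mode, so the index is added first
      return A + B + C;
    }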
+bool RISCVTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+ return isInt<12>(Imm);
+}
+
+bool RISCVTargetLowering::isLegalAddImmediate(int64_t Imm) const {
+ return isInt<12>(Imm);
+}
+
+// On RV32, 64-bit integers are split into their high and low parts and held
+// in two different registers, so the trunc is free since the low register can
+// just be used.
+bool RISCVTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
+ if (Subtarget.is64Bit() || !SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
+ return false;
+ unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
+ unsigned DestBits = DstTy->getPrimitiveSizeInBits();
+ return (SrcBits == 64 && DestBits == 32);
+}
+
+bool RISCVTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
+ if (Subtarget.is64Bit() || SrcVT.isVector() || DstVT.isVector() ||
+ !SrcVT.isInteger() || !DstVT.isInteger())
+ return false;
+ unsigned SrcBits = SrcVT.getSizeInBits();
+ unsigned DestBits = DstVT.getSizeInBits();
+ return (SrcBits == 64 && DestBits == 32);
+}
+
+bool RISCVTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+ // Zexts are free if they can be combined with a load.
+ if (auto *LD = dyn_cast<LoadSDNode>(Val)) {
+ EVT MemVT = LD->getMemoryVT();
+ if ((MemVT == MVT::i8 || MemVT == MVT::i16 ||
+ (Subtarget.is64Bit() && MemVT == MVT::i32)) &&
+ (LD->getExtensionType() == ISD::NON_EXTLOAD ||
+ LD->getExtensionType() == ISD::ZEXTLOAD))
+ return true;
+ }
+
+ return TargetLowering::isZExtFree(Val, VT2);
+}
+
// Changes the condition code and swaps operands if necessary, so the SetCC
// operation matches one of the comparisons supported directly in the RISC-V
// ISA.
@@ -156,8 +276,16 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return lowerGlobalAddress(Op, DAG);
case ISD::BlockAddress:
return lowerBlockAddress(Op, DAG);
+ case ISD::ConstantPool:
+ return lowerConstantPool(Op, DAG);
case ISD::SELECT:
return lowerSELECT(Op, DAG);
+ case ISD::VASTART:
+ return lowerVASTART(Op, DAG);
+ case ISD::FRAMEADDR:
+ return LowerFRAMEADDR(Op, DAG);
+ case ISD::RETURNADDR:
+ return LowerRETURNADDR(Op, DAG);
}
}
@@ -168,17 +296,22 @@ SDValue RISCVTargetLowering::lowerGlobalAddress(SDValue Op,
GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = N->getGlobal();
int64_t Offset = N->getOffset();
+ MVT XLenVT = Subtarget.getXLenVT();
if (isPositionIndependent() || Subtarget.is64Bit())
report_fatal_error("Unable to lowerGlobalAddress");
-
- SDValue GAHi =
- DAG.getTargetGlobalAddress(GV, DL, Ty, Offset, RISCVII::MO_HI);
- SDValue GALo =
- DAG.getTargetGlobalAddress(GV, DL, Ty, Offset, RISCVII::MO_LO);
+  // In order to maximise the opportunity for common subexpression elimination,
+  // emit a separate ADD node for the global address offset instead of folding
+  // it into the global address node. Later peephole optimisations may choose
+  // to fold it back in when profitable.
+ SDValue GAHi = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_HI);
+ SDValue GALo = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_LO);
SDValue MNHi = SDValue(DAG.getMachineNode(RISCV::LUI, DL, Ty, GAHi), 0);
SDValue MNLo =
SDValue(DAG.getMachineNode(RISCV::ADDI, DL, Ty, MNHi, GALo), 0);
+ if (Offset != 0)
+ return DAG.getNode(ISD::ADD, DL, Ty, MNLo,
+ DAG.getConstant(Offset, DL, XLenVT));
return MNLo;
}
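Hedging on exact register choice, the practical effect of keeping the offset in a separate ADD is that lui a0, %hi(g); addi a0, a0, %lo(g) now materialises the global's address on its own, with the constant offset added afterwards, so accesses to different offsets of the same global can share the address materialisation via CSE.

    // Illustrative only: both loads can reuse one materialisation of &G
    // because the field offsets are no longer folded into the %hi/%lo pair.
    struct PairTy { int First; int Second; };
    PairTy G;

    int sum_fields() { return G.First + G.Second; }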
@@ -201,6 +334,29 @@ SDValue RISCVTargetLowering::lowerBlockAddress(SDValue Op,
return MNLo;
}
+SDValue RISCVTargetLowering::lowerConstantPool(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT Ty = Op.getValueType();
+ ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
+ const Constant *CPA = N->getConstVal();
+ int64_t Offset = N->getOffset();
+ unsigned Alignment = N->getAlignment();
+
+ if (!isPositionIndependent()) {
+ SDValue CPAHi =
+ DAG.getTargetConstantPool(CPA, Ty, Alignment, Offset, RISCVII::MO_HI);
+ SDValue CPALo =
+ DAG.getTargetConstantPool(CPA, Ty, Alignment, Offset, RISCVII::MO_LO);
+ SDValue MNHi = SDValue(DAG.getMachineNode(RISCV::LUI, DL, Ty, CPAHi), 0);
+ SDValue MNLo =
+ SDValue(DAG.getMachineNode(RISCV::ADDI, DL, Ty, MNHi, CPALo), 0);
+ return MNLo;
+ } else {
+ report_fatal_error("Unable to lowerConstantPool");
+ }
+}
+
SDValue RISCVTargetLowering::lowerExternalSymbol(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
@@ -261,14 +417,153 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(RISCVISD::SELECT_CC, DL, VTs, Ops);
}
+SDValue RISCVTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ RISCVMachineFunctionInfo *FuncInfo = MF.getInfo<RISCVMachineFunctionInfo>();
+
+ SDLoc DL(Op);
+ SDValue FI = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
+ getPointerTy(MF.getDataLayout()));
+
+ // vastart just stores the address of the VarArgsFrameIndex slot into the
+ // memory location argument.
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ return DAG.getStore(Op.getOperand(0), DL, FI, Op.getOperand(1),
+ MachinePointerInfo(SV));
+}
+
+SDValue RISCVTargetLowering::LowerFRAMEADDR(SDValue Op,
+ SelectionDAG &DAG) const {
+ const RISCVRegisterInfo &RI = *Subtarget.getRegisterInfo();
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MFI.setFrameAddressIsTaken(true);
+ unsigned FrameReg = RI.getFrameRegister(MF);
+ int XLenInBytes = Subtarget.getXLen() / 8;
+
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, FrameReg, VT);
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ while (Depth--) {
+ int Offset = -(XLenInBytes * 2);
+ SDValue Ptr = DAG.getNode(ISD::ADD, DL, VT, FrameAddr,
+ DAG.getIntPtrConstant(Offset, DL));
+ FrameAddr =
+ DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo());
+ }
+ return FrameAddr;
+}
+
+SDValue RISCVTargetLowering::LowerRETURNADDR(SDValue Op,
+ SelectionDAG &DAG) const {
+ const RISCVRegisterInfo &RI = *Subtarget.getRegisterInfo();
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MFI.setReturnAddressIsTaken(true);
+ MVT XLenVT = Subtarget.getXLenVT();
+ int XLenInBytes = Subtarget.getXLen() / 8;
+
+ if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+ return SDValue();
+
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ if (Depth) {
+ int Off = -XLenInBytes;
+ SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+ SDValue Offset = DAG.getConstant(Off, DL, VT);
+ return DAG.getLoad(VT, DL, DAG.getEntryNode(),
+ DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
+ MachinePointerInfo());
+ }
+
+ // Return the value of the return address register, marking it an implicit
+ // live-in.
+ unsigned Reg = MF.addLiveIn(RI.getRARegister(), getRegClassFor(XLenVT));
+ return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, XLenVT);
+}
+
+static MachineBasicBlock *emitSplitF64Pseudo(MachineInstr &MI,
+ MachineBasicBlock *BB) {
+ assert(MI.getOpcode() == RISCV::SplitF64Pseudo && "Unexpected instruction");
+
+ MachineFunction &MF = *BB->getParent();
+ DebugLoc DL = MI.getDebugLoc();
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
+ unsigned LoReg = MI.getOperand(0).getReg();
+ unsigned HiReg = MI.getOperand(1).getReg();
+ unsigned SrcReg = MI.getOperand(2).getReg();
+ const TargetRegisterClass *SrcRC = &RISCV::FPR64RegClass;
+ int FI = MF.getInfo<RISCVMachineFunctionInfo>()->getMoveF64FrameIndex();
+
+ TII.storeRegToStackSlot(*BB, MI, SrcReg, MI.getOperand(2).isKill(), FI, SrcRC,
+ RI);
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, FI),
+ MachineMemOperand::MOLoad, 8, 8);
+ BuildMI(*BB, MI, DL, TII.get(RISCV::LW), LoReg)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO);
+ BuildMI(*BB, MI, DL, TII.get(RISCV::LW), HiReg)
+ .addFrameIndex(FI)
+ .addImm(4)
+ .addMemOperand(MMO);
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
+
+static MachineBasicBlock *emitBuildPairF64Pseudo(MachineInstr &MI,
+ MachineBasicBlock *BB) {
+ assert(MI.getOpcode() == RISCV::BuildPairF64Pseudo &&
+ "Unexpected instruction");
+
+ MachineFunction &MF = *BB->getParent();
+ DebugLoc DL = MI.getDebugLoc();
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
+ unsigned DstReg = MI.getOperand(0).getReg();
+ unsigned LoReg = MI.getOperand(1).getReg();
+ unsigned HiReg = MI.getOperand(2).getReg();
+ const TargetRegisterClass *DstRC = &RISCV::FPR64RegClass;
+ int FI = MF.getInfo<RISCVMachineFunctionInfo>()->getMoveF64FrameIndex();
+
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, FI),
+ MachineMemOperand::MOStore, 8, 8);
+ BuildMI(*BB, MI, DL, TII.get(RISCV::SW))
+ .addReg(LoReg, getKillRegState(MI.getOperand(1).isKill()))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO);
+ BuildMI(*BB, MI, DL, TII.get(RISCV::SW))
+ .addReg(HiReg, getKillRegState(MI.getOperand(2).isKill()))
+ .addFrameIndex(FI)
+ .addImm(4)
+ .addMemOperand(MMO);
+ TII.loadRegFromStackSlot(*BB, MI, DstReg, FI, DstRC, RI);
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
+
MachineBasicBlock *
RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
- const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
- DebugLoc DL = MI.getDebugLoc();
-
- assert(MI.getOpcode() == RISCV::Select_GPR_Using_CC_GPR &&
- "Unexpected instr type to insert");
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected instr type to insert");
+ case RISCV::Select_GPR_Using_CC_GPR:
+ case RISCV::Select_FPR32_Using_CC_GPR:
+ case RISCV::Select_FPR64_Using_CC_GPR:
+ break;
+ case RISCV::BuildPairF64Pseudo:
+ return emitBuildPairF64Pseudo(MI, BB);
+ case RISCV::SplitF64Pseudo:
+ return emitSplitF64Pseudo(MI, BB);
+ }
// To "insert" a SELECT instruction, we actually have to insert the triangle
// control-flow pattern. The incoming instruction knows the destination vreg
@@ -281,7 +576,9 @@ RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
// | IfFalseMBB
// | /
// TailMBB
+ const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ DebugLoc DL = MI.getDebugLoc();
MachineFunction::iterator I = ++BB->getIterator();
MachineBasicBlock *HeadMBB = BB;
@@ -398,19 +695,36 @@ static bool CC_RISCVAssign2XLen(unsigned XLen, CCState &State, CCValAssign VA1,
// Implements the RISC-V calling convention. Returns true upon failure.
static bool CC_RISCV(const DataLayout &DL, unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State, bool IsFixed, bool IsRet) {
+ CCState &State, bool IsFixed, bool IsRet, Type *OrigTy) {
unsigned XLen = DL.getLargestLegalIntTypeSizeInBits();
assert(XLen == 32 || XLen == 64);
MVT XLenVT = XLen == 32 ? MVT::i32 : MVT::i64;
- assert(ValVT == XLenVT && "Unexpected ValVT");
- assert(LocVT == XLenVT && "Unexpected LocVT");
- assert(IsFixed && "Vararg support not yet implemented");
+ if (ValVT == MVT::f32) {
+ LocVT = MVT::i32;
+ LocInfo = CCValAssign::BCvt;
+ }
// Any return value split in to more than two values can't be returned
// directly.
if (IsRet && ValNo > 1)
return true;
+ // If this is a variadic argument, the RISC-V calling convention requires
+ // that it is assigned an 'even' or 'aligned' register if it has 8-byte
+ // alignment (RV32) or 16-byte alignment (RV64). An aligned register should
+ // be used regardless of whether the original argument was split during
+ // legalisation or not. The argument will not be passed by registers if the
+ // original type is larger than 2*XLEN, so the register alignment rule does
+ // not apply.
+ unsigned TwoXLenInBytes = (2 * XLen) / 8;
+ if (!IsFixed && ArgFlags.getOrigAlign() == TwoXLenInBytes &&
+ DL.getTypeAllocSize(OrigTy) == TwoXLenInBytes) {
+ unsigned RegIdx = State.getFirstUnallocated(ArgGPRs);
+ // Skip 'odd' register if necessary.
+ if (RegIdx != array_lengthof(ArgGPRs) && RegIdx % 2 == 1)
+ State.AllocateReg(ArgGPRs);
+ }
+
SmallVectorImpl<CCValAssign> &PendingLocs = State.getPendingLocs();
SmallVectorImpl<ISD::ArgFlagsTy> &PendingArgFlags =
State.getPendingArgFlags();
@@ -418,6 +732,28 @@ static bool CC_RISCV(const DataLayout &DL, unsigned ValNo, MVT ValVT, MVT LocVT,
assert(PendingLocs.size() == PendingArgFlags.size() &&
"PendingLocs and PendingArgFlags out of sync");
+ // Handle passing f64 on RV32D with a soft float ABI.
+ if (XLen == 32 && ValVT == MVT::f64) {
+ assert(!ArgFlags.isSplit() && PendingLocs.empty() &&
+ "Can't lower f64 if it is split");
+    // Depending on available argument GPRs, f64 may be passed in a pair of
+ // GPRs, split between a GPR and the stack, or passed completely on the
+ // stack. LowerCall/LowerFormalArguments/LowerReturn must recognise these
+ // cases.
+ unsigned Reg = State.AllocateReg(ArgGPRs);
+ LocVT = MVT::i32;
+ if (!Reg) {
+ unsigned StackOffset = State.AllocateStack(8, 8);
+ State.addLoc(
+ CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo));
+ return false;
+ }
+ if (!State.AllocateReg(ArgGPRs))
+ State.AllocateStack(4, 4);
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return false;
+ }
+
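A worked illustration of the assignments above, using the standard a0-a7 integer argument registers (the exact mapping is an assumption here, not spelled out in the patch): while GPRs remain, an f64 occupies two of them with the low word first; if only one GPR is left, the low word takes it and the high word goes to the stack; with none left, the whole value goes to an 8-byte-aligned stack slot.

    // Illustrative declarations only (assumed RV32, soft-float ABI with D):
    //   f: A -> a0/a1, B -> a2/a3, C -> a4/a5, D -> a6/a7, E -> stack
    //   g: X -> a0, D low word -> a1, D high word -> a2
    extern "C" void f(double A, double B, double C, double D, int E);
    extern "C" void g(int X, double D);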
// Split arguments might be passed indirectly, so keep track of the pending
// values.
if (ArgFlags.isSplit() || !PendingLocs.empty()) {
@@ -482,15 +818,22 @@ void RISCVTargetLowering::analyzeInputArgs(
MachineFunction &MF, CCState &CCInfo,
const SmallVectorImpl<ISD::InputArg> &Ins, bool IsRet) const {
unsigned NumArgs = Ins.size();
+ FunctionType *FType = MF.getFunction().getFunctionType();
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ArgVT = Ins[i].VT;
ISD::ArgFlagsTy ArgFlags = Ins[i].Flags;
+ Type *ArgTy = nullptr;
+ if (IsRet)
+ ArgTy = FType->getReturnType();
+ else if (Ins[i].isOrigArg())
+ ArgTy = FType->getParamType(Ins[i].getOrigArgIndex());
+
if (CC_RISCV(MF.getDataLayout(), i, ArgVT, ArgVT, CCValAssign::Full,
- ArgFlags, CCInfo, /*IsRet=*/true, IsRet)) {
- DEBUG(dbgs() << "InputArg #" << i << " has unhandled type "
- << EVT(ArgVT).getEVTString() << '\n');
+ ArgFlags, CCInfo, /*IsRet=*/true, IsRet, ArgTy)) {
+ LLVM_DEBUG(dbgs() << "InputArg #" << i << " has unhandled type "
+ << EVT(ArgVT).getEVTString() << '\n');
llvm_unreachable(nullptr);
}
}
@@ -498,17 +841,19 @@ void RISCVTargetLowering::analyzeInputArgs(
void RISCVTargetLowering::analyzeOutputArgs(
MachineFunction &MF, CCState &CCInfo,
- const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsRet) const {
+ const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsRet,
+ CallLoweringInfo *CLI) const {
unsigned NumArgs = Outs.size();
for (unsigned i = 0; i != NumArgs; i++) {
MVT ArgVT = Outs[i].VT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+ Type *OrigTy = CLI ? CLI->getArgs()[Outs[i].OrigArgIndex].Ty : nullptr;
if (CC_RISCV(MF.getDataLayout(), i, ArgVT, ArgVT, CCValAssign::Full,
- ArgFlags, CCInfo, Outs[i].IsFixed, IsRet)) {
- DEBUG(dbgs() << "OutputArg #" << i << " has unhandled type "
- << EVT(ArgVT).getEVTString() << "\n");
+ ArgFlags, CCInfo, Outs[i].IsFixed, IsRet, OrigTy)) {
+ LLVM_DEBUG(dbgs() << "OutputArg #" << i << " has unhandled type "
+ << EVT(ArgVT).getEVTString() << "\n");
llvm_unreachable(nullptr);
}
}
@@ -521,6 +866,7 @@ static SDValue unpackFromRegLoc(SelectionDAG &DAG, SDValue Chain,
MachineFunction &MF = DAG.getMachineFunction();
MachineRegisterInfo &RegInfo = MF.getRegInfo();
EVT LocVT = VA.getLocVT();
+ EVT ValVT = VA.getValVT();
SDValue Val;
unsigned VReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass);
@@ -532,8 +878,12 @@ static SDValue unpackFromRegLoc(SelectionDAG &DAG, SDValue Chain,
llvm_unreachable("Unexpected CCValAssign::LocInfo");
case CCValAssign::Full:
case CCValAssign::Indirect:
- return Val;
+ break;
+ case CCValAssign::BCvt:
+ Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
+ break;
}
+ return Val;
}
// The caller is responsible for loading the full value if the argument is
@@ -565,6 +915,43 @@ static SDValue unpackFromMemLoc(SelectionDAG &DAG, SDValue Chain,
return Val;
}
+static SDValue unpackF64OnRV32DSoftABI(SelectionDAG &DAG, SDValue Chain,
+ const CCValAssign &VA, const SDLoc &DL) {
+ assert(VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64 &&
+ "Unexpected VA");
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+
+ if (VA.isMemLoc()) {
+ // f64 is passed on the stack.
+ int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), /*Immutable=*/true);
+ SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+ return DAG.getLoad(MVT::f64, DL, Chain, FIN,
+ MachinePointerInfo::getFixedStack(MF, FI));
+ }
+
+ assert(VA.isRegLoc() && "Expected register VA assignment");
+
+ unsigned LoVReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass);
+ RegInfo.addLiveIn(VA.getLocReg(), LoVReg);
+ SDValue Lo = DAG.getCopyFromReg(Chain, DL, LoVReg, MVT::i32);
+ SDValue Hi;
+ if (VA.getLocReg() == RISCV::X17) {
+ // Second half of f64 is passed on the stack.
+ int FI = MFI.CreateFixedObject(4, 0, /*Immutable=*/true);
+ SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+ Hi = DAG.getLoad(MVT::i32, DL, Chain, FIN,
+ MachinePointerInfo::getFixedStack(MF, FI));
+ } else {
+ // Second half of f64 is passed in another GPR.
+ unsigned HiVReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass);
+ RegInfo.addLiveIn(VA.getLocReg() + 1, HiVReg);
+ Hi = DAG.getCopyFromReg(Chain, DL, HiVReg, MVT::i32);
+ }
+ return DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, Lo, Hi);
+}
+
// Transform physical registers into virtual registers.
SDValue RISCVTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
@@ -580,11 +967,26 @@ SDValue RISCVTargetLowering::LowerFormalArguments(
}
MachineFunction &MF = DAG.getMachineFunction();
- MVT XLenVT = Subtarget.getXLenVT();
- EVT PtrVT = getPointerTy(DAG.getDataLayout());
- if (IsVarArg)
- report_fatal_error("VarArg not supported");
+ const Function &Func = MF.getFunction();
+ if (Func.hasFnAttribute("interrupt")) {
+ if (!Func.arg_empty())
+ report_fatal_error(
+ "Functions with the interrupt attribute cannot have arguments!");
+
+ StringRef Kind =
+ MF.getFunction().getFnAttribute("interrupt").getValueAsString();
+
+ if (!(Kind == "user" || Kind == "supervisor" || Kind == "machine"))
+ report_fatal_error(
+ "Function interrupt attribute argument not supported!");
+ }
+
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ MVT XLenVT = Subtarget.getXLenVT();
+ unsigned XLenInBytes = Subtarget.getXLen() / 8;
+  // Used with varargs to accumulate store chains.
+ std::vector<SDValue> OutChains;
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
@@ -595,7 +997,11 @@ SDValue RISCVTargetLowering::LowerFormalArguments(
CCValAssign &VA = ArgLocs[i];
assert(VA.getLocVT() == XLenVT && "Unhandled argument type");
SDValue ArgValue;
- if (VA.isRegLoc())
+ // Passing f64 on RV32D with a soft float ABI must be handled as a special
+ // case.
+ if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64)
+ ArgValue = unpackF64OnRV32DSoftABI(DAG, Chain, VA, DL);
+ else if (VA.isRegLoc())
ArgValue = unpackFromRegLoc(DAG, Chain, VA, DL);
else
ArgValue = unpackFromMemLoc(DAG, Chain, VA, DL);
@@ -621,9 +1027,155 @@ SDValue RISCVTargetLowering::LowerFormalArguments(
}
InVals.push_back(ArgValue);
}
+
+ if (IsVarArg) {
+ ArrayRef<MCPhysReg> ArgRegs = makeArrayRef(ArgGPRs);
+ unsigned Idx = CCInfo.getFirstUnallocated(ArgRegs);
+ const TargetRegisterClass *RC = &RISCV::GPRRegClass;
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ RISCVMachineFunctionInfo *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
+
+    // Offset of the first variable argument from the stack pointer, and size
+    // of the vararg save area. For now, the varargs save area is either zero
+    // or large enough to hold a0-a7.
+ int VaArgOffset, VarArgsSaveSize;
+
+ // If all registers are allocated, then all varargs must be passed on the
+ // stack and we don't need to save any argregs.
+ if (ArgRegs.size() == Idx) {
+ VaArgOffset = CCInfo.getNextStackOffset();
+ VarArgsSaveSize = 0;
+ } else {
+ VarArgsSaveSize = XLenInBytes * (ArgRegs.size() - Idx);
+ VaArgOffset = -VarArgsSaveSize;
+ }
+
+    // Record the frame index of the first variable argument, which is the
+    // value needed by VASTART.
+ int FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset, true);
+ RVFI->setVarArgsFrameIndex(FI);
+
+    // If saving an odd number of registers, create an extra stack slot to
+    // ensure that the frame pointer is 2*XLEN-aligned, which in turn ensures
+    // offsets to even-numbered registers remain 2*XLEN-aligned.
+ if (Idx % 2) {
+ FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset - (int)XLenInBytes,
+ true);
+ VarArgsSaveSize += XLenInBytes;
+ }
+
+ // Copy the integer registers that may have been used for passing varargs
+ // to the vararg save area.
+ for (unsigned I = Idx; I < ArgRegs.size();
+ ++I, VaArgOffset += XLenInBytes) {
+ const unsigned Reg = RegInfo.createVirtualRegister(RC);
+ RegInfo.addLiveIn(ArgRegs[I], Reg);
+ SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, XLenVT);
+ FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset, true);
+ SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+ SDValue Store = DAG.getStore(Chain, DL, ArgValue, PtrOff,
+ MachinePointerInfo::getFixedStack(MF, FI));
+ cast<StoreSDNode>(Store.getNode())
+ ->getMemOperand()
+ ->setValue((Value *)nullptr);
+ OutChains.push_back(Store);
+ }
+ RVFI->setVarArgsSaveSize(VarArgsSaveSize);
+ }
+
+ // All stores are grouped in one node to allow the matching between
+ // the size of Ins and InVals. This only happens for vararg functions.
+ if (!OutChains.empty()) {
+ OutChains.push_back(Chain);
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
+ }
+
return Chain;
}
+/// IsEligibleForTailCallOptimization - Check whether the call is eligible
+/// for tail call optimization.
+/// Note: This is modelled after ARM's IsEligibleForTailCallOptimization.
+bool RISCVTargetLowering::IsEligibleForTailCallOptimization(
+ CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF,
+ const SmallVector<CCValAssign, 16> &ArgLocs) const {
+
+ auto &Callee = CLI.Callee;
+ auto CalleeCC = CLI.CallConv;
+ auto IsVarArg = CLI.IsVarArg;
+ auto &Outs = CLI.Outs;
+ auto &Caller = MF.getFunction();
+ auto CallerCC = Caller.getCallingConv();
+
+ // Do not tail call opt functions with "disable-tail-calls" attribute.
+ if (Caller.getFnAttribute("disable-tail-calls").getValueAsString() == "true")
+ return false;
+
+ // Exception-handling functions need a special set of instructions to
+ // indicate a return to the hardware. Tail-calling another function would
+ // probably break this.
+ // TODO: The "interrupt" attribute isn't currently defined by RISC-V. This
+ // should be expanded as new function attributes are introduced.
+ if (Caller.hasFnAttribute("interrupt"))
+ return false;
+
+ // Do not tail call opt functions with varargs.
+ if (IsVarArg)
+ return false;
+
+ // Do not tail call opt if the stack is used to pass parameters.
+ if (CCInfo.getNextStackOffset() != 0)
+ return false;
+
+  // Do not tail call opt if any parameters need to be passed indirectly.
+  // Since long doubles (fp128) and i128 are larger than 2*XLEN, they are
+  // passed indirectly: the address of the value is passed in a register, or
+  // put on the stack if no register is available. Passing indirectly often
+  // requires allocating stack space to store the value, so the
+  // CCInfo.getNextStackOffset() != 0 check is not enough; we also need to
+  // check whether any entries in ArgLocs are CCValAssign::Indirect.
+ for (auto &VA : ArgLocs)
+ if (VA.getLocInfo() == CCValAssign::Indirect)
+ return false;
+
+ // Do not tail call opt if either caller or callee uses struct return
+ // semantics.
+ auto IsCallerStructRet = Caller.hasStructRetAttr();
+ auto IsCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet();
+ if (IsCallerStructRet || IsCalleeStructRet)
+ return false;
+
+ // Externally-defined functions with weak linkage should not be
+ // tail-called. The behaviour of branch instructions in this situation (as
+ // used for tail calls) is implementation-defined, so we cannot rely on the
+ // linker replacing the tail call with a return.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ const GlobalValue *GV = G->getGlobal();
+ if (GV->hasExternalWeakLinkage())
+ return false;
+ }
+
+ // The callee has to preserve all registers the caller needs to preserve.
+ const RISCVRegisterInfo *TRI = Subtarget.getRegisterInfo();
+ const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
+ if (CalleeCC != CallerCC) {
+ const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
+ if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
+ return false;
+ }
+
+ // Byval parameters hand the function a pointer directly into the stack area
+ // we want to reuse during a tail call. Working around this *is* possible
+ // but less efficient and uglier in LowerCall.
+ for (auto &Arg : Outs)
+ if (Arg.Flags.isByVal())
+ return false;
+
+ return true;
+}
+
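A minimal case that passes every check above, assuming default attributes, matching calling conventions, no varargs, and arguments that all fit in registers:

    // Illustrative only: the call should now be emitted as a tail call
    // (RISCVISD::TAIL) instead of a call followed by a return.
    extern "C" int callee(int);

    extern "C" int caller(int X) { return callee(X + 1); }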
// Lower a call to a callseq_start + CALL + callseq_end chain, and add input
// and output parameter nodes.
SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
@@ -635,22 +1187,29 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
- CLI.IsTailCall = false;
+ bool &IsTailCall = CLI.IsTailCall;
CallingConv::ID CallConv = CLI.CallConv;
bool IsVarArg = CLI.IsVarArg;
EVT PtrVT = getPointerTy(DAG.getDataLayout());
MVT XLenVT = Subtarget.getXLenVT();
- if (IsVarArg) {
- report_fatal_error("LowerCall with varargs not implemented");
- }
-
MachineFunction &MF = DAG.getMachineFunction();
// Analyze the operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
- analyzeOutputArgs(MF, ArgCCInfo, Outs, /*IsRet=*/false);
+ analyzeOutputArgs(MF, ArgCCInfo, Outs, /*IsRet=*/false, &CLI);
+
+ // Check if it's really possible to do a tail call.
+ if (IsTailCall)
+ IsTailCall = IsEligibleForTailCallOptimization(ArgCCInfo, CLI, MF,
+ ArgLocs);
+
+ if (IsTailCall)
+ ++NumTailCalls;
+ else if (CLI.CS && CLI.CS.isMustTailCall())
+ report_fatal_error("failed to perform tail call elimination on a call "
+ "site marked musttail");
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = ArgCCInfo.getNextStackOffset();
@@ -673,12 +1232,13 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
Chain = DAG.getMemcpy(Chain, DL, FIPtr, Arg, SizeNode, Align,
/*IsVolatile=*/false,
/*AlwaysInline=*/false,
- /*isTailCall=*/false, MachinePointerInfo(),
+ IsTailCall, MachinePointerInfo(),
MachinePointerInfo());
ByValArgs.push_back(FIPtr);
}
- Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL);
+ if (!IsTailCall)
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL);
// Copy argument values to their designated locations.
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
@@ -689,11 +1249,45 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
SDValue ArgValue = OutVals[i];
ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ // Handle passing f64 on RV32D with a soft float ABI as a special case.
+ bool IsF64OnRV32DSoftABI =
+ VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64;
+ if (IsF64OnRV32DSoftABI && VA.isRegLoc()) {
+ SDValue SplitF64 = DAG.getNode(
+ RISCVISD::SplitF64, DL, DAG.getVTList(MVT::i32, MVT::i32), ArgValue);
+ SDValue Lo = SplitF64.getValue(0);
+ SDValue Hi = SplitF64.getValue(1);
+
+ unsigned RegLo = VA.getLocReg();
+ RegsToPass.push_back(std::make_pair(RegLo, Lo));
+
+ if (RegLo == RISCV::X17) {
+ // Second half of f64 is passed on the stack.
+ // Work out the address of the stack slot.
+ if (!StackPtr.getNode())
+ StackPtr = DAG.getCopyFromReg(Chain, DL, RISCV::X2, PtrVT);
+ // Emit the store.
+ MemOpChains.push_back(
+ DAG.getStore(Chain, DL, Hi, StackPtr, MachinePointerInfo()));
+ } else {
+ // Second half of f64 is passed in another GPR.
+ unsigned RegHigh = RegLo + 1;
+ RegsToPass.push_back(std::make_pair(RegHigh, Hi));
+ }
+ continue;
+ }
+
+ // IsF64OnRV32DSoftABI && VA.isMemLoc() is handled below in the same way
+ // as any other MemLoc.
+
// Promote the value if needed.
// For now, only handle fully promoted and indirect arguments.
switch (VA.getLocInfo()) {
case CCValAssign::Full:
break;
+ case CCValAssign::BCvt:
+ ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), ArgValue);
+ break;
case CCValAssign::Indirect: {
// Store the argument in a stack slot and pass its address.
SDValue SpillSlot = DAG.CreateStackTemporary(Outs[i].ArgVT);
@@ -731,6 +1325,8 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue));
} else {
assert(VA.isMemLoc() && "Argument not register or memory");
+ assert(!IsTailCall && "Tail call not allowed if stack is used "
+ "for passing parameters");
// Work out the address of the stack slot.
if (!StackPtr.getNode())
@@ -757,10 +1353,13 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
Glue = Chain.getValue(1);
}
- if (isa<GlobalAddressSDNode>(Callee)) {
- Callee = lowerGlobalAddress(Callee, DAG);
- } else if (isa<ExternalSymbolSDNode>(Callee)) {
- Callee = lowerExternalSymbol(Callee, DAG);
+  // If the callee is a GlobalAddress/ExternalSymbol node, turn it into a
+  // TargetGlobalAddress/TargetExternalSymbol node so that legalize won't
+  // split it, and so the direct call can be matched by PseudoCALL.
+ if (GlobalAddressSDNode *S = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ Callee = DAG.getTargetGlobalAddress(S->getGlobal(), DL, PtrVT, 0, 0);
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ Callee = DAG.getTargetExternalSymbol(S->getSymbol(), PtrVT, 0);
}
// The first call operand is the chain and the second is the target address.
@@ -773,11 +1372,13 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
for (auto &Reg : RegsToPass)
Ops.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
- // Add a register mask operand representing the call-preserved registers.
- const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
- const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
- assert(Mask && "Missing call preserved mask for calling convention");
- Ops.push_back(DAG.getRegisterMask(Mask));
+ if (!IsTailCall) {
+ // Add a register mask operand representing the call-preserved registers.
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+ const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+ }
// Glue the call to the argument copies, if any.
if (Glue.getNode())
@@ -785,6 +1386,12 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
// Emit the call.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+ if (IsTailCall) {
+ MF.getFrameInfo().setHasTailCall();
+ return DAG.getNode(RISCVISD::TAIL, DL, NodeTys, Ops);
+ }
+
Chain = DAG.getNode(RISCVISD::CALL, DL, NodeTys, Ops);
Glue = Chain.getValue(1);
@@ -802,13 +1409,32 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
// Copy all of the result registers out of their specified physreg.
for (auto &VA : RVLocs) {
- // Copy the value out, gluing the copy to the end of the call sequence.
- SDValue RetValue = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(),
- VA.getLocVT(), Glue);
+ // Copy the value out
+ SDValue RetValue =
+ DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), Glue);
+ // Glue the RetValue to the end of the call sequence
Chain = RetValue.getValue(1);
Glue = RetValue.getValue(2);
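+    // With the RV32D soft-float ABI, an f64 return value comes back split
+    // across a GPR pair (low half in a0, high half in a1); reassemble it with
+    // BuildPairF64.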
+ if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64) {
+ assert(VA.getLocReg() == ArgGPRs[0] && "Unexpected reg assignment");
+ SDValue RetValue2 =
+ DAG.getCopyFromReg(Chain, DL, ArgGPRs[1], MVT::i32, Glue);
+ Chain = RetValue2.getValue(1);
+ Glue = RetValue2.getValue(2);
+ RetValue = DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, RetValue,
+ RetValue2);
+ }
+
+ switch (VA.getLocInfo()) {
+ default:
+ llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::BCvt:
+ RetValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), RetValue);
+ break;
+ }
- assert(VA.getLocInfo() == CCValAssign::Full && "Unknown loc info!");
InVals.push_back(RetValue);
}
@@ -824,22 +1450,34 @@ bool RISCVTargetLowering::CanLowerReturn(
MVT VT = Outs[i].VT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
if (CC_RISCV(MF.getDataLayout(), i, VT, VT, CCValAssign::Full, ArgFlags,
- CCInfo, /*IsFixed=*/true, /*IsRet=*/true))
+ CCInfo, /*IsFixed=*/true, /*IsRet=*/true, nullptr))
return false;
}
return true;
}
+static SDValue packIntoRegLoc(SelectionDAG &DAG, SDValue Val,
+ const CCValAssign &VA, const SDLoc &DL) {
+ EVT LocVT = VA.getLocVT();
+
+ switch (VA.getLocInfo()) {
+ default:
+ llvm_unreachable("Unexpected CCValAssign::LocInfo");
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::BCvt:
+ Val = DAG.getNode(ISD::BITCAST, DL, LocVT, Val);
+ break;
+ }
+ return Val;
+}
+
SDValue
RISCVTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &DL, SelectionDAG &DAG) const {
- if (IsVarArg) {
- report_fatal_error("VarArg not supported");
- }
-
// Stores the assignment of the return value to a location.
SmallVector<CCValAssign, 16> RVLocs;
@@ -847,9 +1485,10 @@ RISCVTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
- analyzeOutputArgs(DAG.getMachineFunction(), CCInfo, Outs, /*IsRet=*/true);
+ analyzeOutputArgs(DAG.getMachineFunction(), CCInfo, Outs, /*IsRet=*/true,
+ nullptr);
- SDValue Flag;
+ SDValue Glue;
SmallVector<SDValue, 4> RetOps(1, Chain);
// Copy the result values into the output registers.
@@ -857,21 +1496,60 @@ RISCVTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SDValue Val = OutVals[i];
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
- assert(VA.getLocInfo() == CCValAssign::Full &&
- "Unexpected CCValAssign::LocInfo");
- Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Val, Flag);
+ if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64) {
+ // Handle returning f64 on RV32D with a soft float ABI.
+ assert(VA.isRegLoc() && "Expected return via registers");
+ SDValue SplitF64 = DAG.getNode(RISCVISD::SplitF64, DL,
+ DAG.getVTList(MVT::i32, MVT::i32), Val);
+ SDValue Lo = SplitF64.getValue(0);
+ SDValue Hi = SplitF64.getValue(1);
+ unsigned RegLo = VA.getLocReg();
+ unsigned RegHi = RegLo + 1;
+ Chain = DAG.getCopyToReg(Chain, DL, RegLo, Lo, Glue);
+ Glue = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(RegLo, MVT::i32));
+ Chain = DAG.getCopyToReg(Chain, DL, RegHi, Hi, Glue);
+ Glue = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(RegHi, MVT::i32));
+ } else {
+ // Handle a 'normal' return.
+ Val = packIntoRegLoc(DAG, Val, VA, DL);
+ Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Val, Glue);
- // Guarantee that all emitted copies are stuck together.
- Flag = Chain.getValue(1);
- RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ // Guarantee that all emitted copies are stuck together.
+ Glue = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ }
}
RetOps[0] = Chain; // Update chain.
- // Add the flag if we have it.
- if (Flag.getNode()) {
- RetOps.push_back(Flag);
+ // Add the glue node if we have it.
+ if (Glue.getNode()) {
+ RetOps.push_back(Glue);
+ }
+
+ // Interrupt service routines use different return instructions.
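+  // For example, a handler declared with __attribute__((interrupt("machine")))
+  // is returned from with mret rather than ret.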
+ const Function &Func = DAG.getMachineFunction().getFunction();
+ if (Func.hasFnAttribute("interrupt")) {
+ if (!Func.getReturnType()->isVoidTy())
+ report_fatal_error(
+ "Functions with the interrupt attribute must have void return type!");
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ StringRef Kind =
+ MF.getFunction().getFnAttribute("interrupt").getValueAsString();
+
+ unsigned RetOpc;
+ if (Kind == "user")
+ RetOpc = RISCVISD::URET_FLAG;
+ else if (Kind == "supervisor")
+ RetOpc = RISCVISD::SRET_FLAG;
+ else
+ RetOpc = RISCVISD::MRET_FLAG;
+
+ return DAG.getNode(RetOpc, DL, MVT::Other, RetOps);
}
return DAG.getNode(RISCVISD::RET_FLAG, DL, MVT::Other, RetOps);
@@ -883,10 +1561,58 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
break;
case RISCVISD::RET_FLAG:
return "RISCVISD::RET_FLAG";
+ case RISCVISD::URET_FLAG:
+ return "RISCVISD::URET_FLAG";
+ case RISCVISD::SRET_FLAG:
+ return "RISCVISD::SRET_FLAG";
+ case RISCVISD::MRET_FLAG:
+ return "RISCVISD::MRET_FLAG";
case RISCVISD::CALL:
return "RISCVISD::CALL";
case RISCVISD::SELECT_CC:
return "RISCVISD::SELECT_CC";
+ case RISCVISD::BuildPairF64:
+ return "RISCVISD::BuildPairF64";
+ case RISCVISD::SplitF64:
+ return "RISCVISD::SplitF64";
+ case RISCVISD::TAIL:
+ return "RISCVISD::TAIL";
}
return nullptr;
}
+
+std::pair<unsigned, const TargetRegisterClass *>
+RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint,
+ MVT VT) const {
+ // First, see if this is a constraint that directly corresponds to a
+ // RISCV register class.
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ case 'r':
+ return std::make_pair(0U, &RISCV::GPRRegClass);
+ default:
+ break;
+ }
+ }
+
+ return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+}
+
+Instruction *RISCVTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
+ Instruction *Inst,
+ AtomicOrdering Ord) const {
+ if (isa<LoadInst>(Inst) && Ord == AtomicOrdering::SequentiallyConsistent)
+ return Builder.CreateFence(Ord);
+ if (isa<StoreInst>(Inst) && isReleaseOrStronger(Ord))
+ return Builder.CreateFence(AtomicOrdering::Release);
+ return nullptr;
+}
+
+Instruction *RISCVTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
+ Instruction *Inst,
+ AtomicOrdering Ord) const {
+ if (isa<LoadInst>(Inst) && isAcquireOrStronger(Ord))
+ return Builder.CreateFence(AtomicOrdering::Acquire);
+ return nullptr;
+}
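+
+// Taken together with the FENCE selection patterns in RISCVInstrInfo.td and
+// the atomic load/store patterns in RISCVInstrInfoA.td, these hooks lower a
+// sequentially consistent load to `fence rw, rw; lw; fence r, rw` and a
+// sequentially consistent store to `fence rw, w; sw`.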
diff --git a/lib/Target/RISCV/RISCVISelLowering.h b/lib/Target/RISCV/RISCVISelLowering.h
index 9c5c7ca008c0..280adb29fd02 100644
--- a/lib/Target/RISCV/RISCVISelLowering.h
+++ b/lib/Target/RISCV/RISCVISelLowering.h
@@ -25,8 +25,14 @@ namespace RISCVISD {
enum NodeType : unsigned {
FIRST_NUMBER = ISD::BUILTIN_OP_END,
RET_FLAG,
+ URET_FLAG,
+ SRET_FLAG,
+ MRET_FLAG,
CALL,
- SELECT_CC
+ SELECT_CC,
+ BuildPairF64,
+ SplitF64,
+ TAIL
};
}
@@ -37,23 +43,47 @@ public:
explicit RISCVTargetLowering(const TargetMachine &TM,
const RISCVSubtarget &STI);
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+ unsigned AS,
+ Instruction *I = nullptr) const override;
+ bool isLegalICmpImmediate(int64_t Imm) const override;
+ bool isLegalAddImmediate(int64_t Imm) const override;
+ bool isTruncateFree(Type *SrcTy, Type *DstTy) const override;
+ bool isTruncateFree(EVT SrcVT, EVT DstVT) const override;
+ bool isZExtFree(SDValue Val, EVT VT2) const override;
+
// Provide custom lowering hooks for some operations.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
// This method returns the name of a target specific DAG node.
const char *getTargetNodeName(unsigned Opcode) const override;
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint, MVT VT) const override;
+
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const override;
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+ EVT VT) const override;
+
+ bool shouldInsertFencesForAtomic(const Instruction *I) const override {
+ return isa<LoadInst>(I) || isa<StoreInst>(I);
+ }
+ Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst,
+ AtomicOrdering Ord) const override;
+ Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst,
+ AtomicOrdering Ord) const override;
+
private:
void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo,
const SmallVectorImpl<ISD::InputArg> &Ins,
bool IsRet) const;
void analyzeOutputArgs(MachineFunction &MF, CCState &CCInfo,
const SmallVectorImpl<ISD::OutputArg> &Outs,
- bool IsRet) const;
+ bool IsRet, CallLoweringInfo *CLI) const;
// Lower incoming arguments, copy physregs into vregs
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
bool IsVarArg,
@@ -76,8 +106,16 @@ private:
}
SDValue lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+
+ bool IsEligibleForTailCallOptimization(CCState &CCInfo,
+ CallLoweringInfo &CLI, MachineFunction &MF,
+ const SmallVector<CCValAssign, 16> &ArgLocs) const;
};
}
diff --git a/lib/Target/RISCV/RISCVInstrFormats.td b/lib/Target/RISCV/RISCVInstrFormats.td
index 7479ffbc9532..529e048045c6 100644
--- a/lib/Target/RISCV/RISCVInstrFormats.td
+++ b/lib/Target/RISCV/RISCVInstrFormats.td
@@ -102,8 +102,8 @@ class RVInst<dag outs, dag ins, string opcodestr, string argstr,
}
// Pseudo instructions
-class Pseudo<dag outs, dag ins, list<dag> pattern>
- : RVInst<outs, ins, "", "", pattern, InstFormatPseudo> {
+class Pseudo<dag outs, dag ins, list<dag> pattern, string opcodestr = "",
+             string argstr = "">
+ : RVInst<outs, ins, opcodestr, argstr, pattern, InstFormatPseudo> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
diff --git a/lib/Target/RISCV/RISCVInstrInfo.cpp b/lib/Target/RISCV/RISCVInstrInfo.cpp
index 186fe363edd9..327e4a7d615f 100644
--- a/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -20,6 +20,7 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
@@ -31,16 +32,78 @@ using namespace llvm;
RISCVInstrInfo::RISCVInstrInfo()
: RISCVGenInstrInfo(RISCV::ADJCALLSTACKDOWN, RISCV::ADJCALLSTACKUP) {}
+unsigned RISCVInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const {
+ switch (MI.getOpcode()) {
+ default:
+ return 0;
+ case RISCV::LB:
+ case RISCV::LBU:
+ case RISCV::LH:
+ case RISCV::LHU:
+ case RISCV::LW:
+ case RISCV::FLW:
+ case RISCV::LWU:
+ case RISCV::LD:
+ case RISCV::FLD:
+ break;
+ }
+
+ if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
+ MI.getOperand(2).getImm() == 0) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
+ }
+
+ return 0;
+}
+
+unsigned RISCVInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const {
+ switch (MI.getOpcode()) {
+ default:
+ return 0;
+ case RISCV::SB:
+ case RISCV::SH:
+ case RISCV::SW:
+ case RISCV::FSW:
+ case RISCV::SD:
+ case RISCV::FSD:
+ break;
+ }
+
+ if (MI.getOperand(0).isFI() && MI.getOperand(1).isImm() &&
+ MI.getOperand(1).getImm() == 0) {
+ FrameIndex = MI.getOperand(0).getIndex();
+ return MI.getOperand(2).getReg();
+ }
+
+ return 0;
+}
+
void RISCVInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, unsigned DstReg,
unsigned SrcReg, bool KillSrc) const {
- assert(RISCV::GPRRegClass.contains(DstReg, SrcReg) &&
- "Impossible reg-to-reg copy");
+ if (RISCV::GPRRegClass.contains(DstReg, SrcReg)) {
+ BuildMI(MBB, MBBI, DL, get(RISCV::ADDI), DstReg)
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .addImm(0);
+ return;
+ }
- BuildMI(MBB, MBBI, DL, get(RISCV::ADDI), DstReg)
+ // FPR->FPR copies
+ unsigned Opc;
+ if (RISCV::FPR32RegClass.contains(DstReg, SrcReg))
+ Opc = RISCV::FSGNJ_S;
+ else if (RISCV::FPR64RegClass.contains(DstReg, SrcReg))
+ Opc = RISCV::FSGNJ_D;
+ else
+ llvm_unreachable("Impossible reg-to-reg copy");
+
+ BuildMI(MBB, MBBI, DL, get(Opc), DstReg)
.addReg(SrcReg, getKillRegState(KillSrc))
- .addImm(0);
+ .addReg(SrcReg, getKillRegState(KillSrc));
}
void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
@@ -52,13 +115,22 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
if (I != MBB.end())
DL = I->getDebugLoc();
+ unsigned Opcode;
+
if (RISCV::GPRRegClass.hasSubClassEq(RC))
- BuildMI(MBB, I, DL, get(RISCV::SW))
- .addReg(SrcReg, getKillRegState(IsKill))
- .addFrameIndex(FI)
- .addImm(0);
+ Opcode = TRI->getRegSizeInBits(RISCV::GPRRegClass) == 32 ?
+ RISCV::SW : RISCV::SD;
+ else if (RISCV::FPR32RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::FSW;
+ else if (RISCV::FPR64RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::FSD;
else
llvm_unreachable("Can't store this register to stack slot");
+
+ BuildMI(MBB, I, DL, get(Opcode))
+ .addReg(SrcReg, getKillRegState(IsKill))
+ .addFrameIndex(FI)
+ .addImm(0);
}
void RISCVInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
@@ -70,8 +142,310 @@ void RISCVInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
if (I != MBB.end())
DL = I->getDebugLoc();
+ unsigned Opcode;
+
if (RISCV::GPRRegClass.hasSubClassEq(RC))
- BuildMI(MBB, I, DL, get(RISCV::LW), DstReg).addFrameIndex(FI).addImm(0);
+ Opcode = TRI->getRegSizeInBits(RISCV::GPRRegClass) == 32 ?
+ RISCV::LW : RISCV::LD;
+ else if (RISCV::FPR32RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::FLW;
+ else if (RISCV::FPR64RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::FLD;
else
llvm_unreachable("Can't load this register from stack slot");
+
+ BuildMI(MBB, I, DL, get(Opcode), DstReg).addFrameIndex(FI).addImm(0);
+}
+
+void RISCVInstrInfo::movImm32(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, unsigned DstReg, uint64_t Val,
+ MachineInstr::MIFlag Flag) const {
+ assert(isInt<32>(Val) && "Can only materialize 32-bit constants");
+
+ // TODO: If the value can be materialized using only one instruction, only
+ // insert a single instruction.
+
+ uint64_t Hi20 = ((Val + 0x800) >> 12) & 0xfffff;
+ uint64_t Lo12 = SignExtend64<12>(Val);
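+  // ADDI sign-extends Lo12, so Hi20 is rounded up by 0x800 to compensate.
+  // For example, Val = 0xfff gives Lo12 = -1 and Hi20 = 1, and the resulting
+  // `lui dst, 1; addi dst, dst, -1` materializes 0xfff.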
+ BuildMI(MBB, MBBI, DL, get(RISCV::LUI), DstReg)
+ .addImm(Hi20)
+ .setMIFlag(Flag);
+ BuildMI(MBB, MBBI, DL, get(RISCV::ADDI), DstReg)
+ .addReg(DstReg, RegState::Kill)
+ .addImm(Lo12)
+ .setMIFlag(Flag);
+}
+
+// The contents of values added to Cond are not examined outside of
+// RISCVInstrInfo, giving us flexibility in what to push to it. For RISCV, we
+// push BranchOpcode, Reg1, Reg2.
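+// For example, `beq a0, a1, .LBB0_1` produces Cond = {BEQ, a0, a1} and sets
+// Target to .LBB0_1.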
+static void parseCondBranch(MachineInstr &LastInst, MachineBasicBlock *&Target,
+ SmallVectorImpl<MachineOperand> &Cond) {
+ // Block ends with fall-through condbranch.
+ assert(LastInst.getDesc().isConditionalBranch() &&
+ "Unknown conditional branch");
+ Target = LastInst.getOperand(2).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(LastInst.getOpcode()));
+ Cond.push_back(LastInst.getOperand(0));
+ Cond.push_back(LastInst.getOperand(1));
+}
+
+static unsigned getOppositeBranchOpcode(int Opc) {
+ switch (Opc) {
+ default:
+ llvm_unreachable("Unrecognized conditional branch");
+ case RISCV::BEQ:
+ return RISCV::BNE;
+ case RISCV::BNE:
+ return RISCV::BEQ;
+ case RISCV::BLT:
+ return RISCV::BGE;
+ case RISCV::BGE:
+ return RISCV::BLT;
+ case RISCV::BLTU:
+ return RISCV::BGEU;
+ case RISCV::BGEU:
+ return RISCV::BLTU;
+ }
+}
+
+bool RISCVInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ TBB = FBB = nullptr;
+ Cond.clear();
+
+ // If the block has no terminators, it just falls into the block after it.
+ MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
+ if (I == MBB.end() || !isUnpredicatedTerminator(*I))
+ return false;
+
+ // Count the number of terminators and find the first unconditional or
+ // indirect branch.
+ MachineBasicBlock::iterator FirstUncondOrIndirectBr = MBB.end();
+ int NumTerminators = 0;
+ for (auto J = I.getReverse(); J != MBB.rend() && isUnpredicatedTerminator(*J);
+ J++) {
+ NumTerminators++;
+ if (J->getDesc().isUnconditionalBranch() ||
+ J->getDesc().isIndirectBranch()) {
+ FirstUncondOrIndirectBr = J.getReverse();
+ }
+ }
+
+ // If AllowModify is true, we can erase any terminators after
+  // FirstUncondOrIndirectBr.
+ if (AllowModify && FirstUncondOrIndirectBr != MBB.end()) {
+ while (std::next(FirstUncondOrIndirectBr) != MBB.end()) {
+ std::next(FirstUncondOrIndirectBr)->eraseFromParent();
+ NumTerminators--;
+ }
+ I = FirstUncondOrIndirectBr;
+ }
+
+ // We can't handle blocks that end in an indirect branch.
+ if (I->getDesc().isIndirectBranch())
+ return true;
+
+ // We can't handle blocks with more than 2 terminators.
+ if (NumTerminators > 2)
+ return true;
+
+ // Handle a single unconditional branch.
+ if (NumTerminators == 1 && I->getDesc().isUnconditionalBranch()) {
+ TBB = I->getOperand(0).getMBB();
+ return false;
+ }
+
+ // Handle a single conditional branch.
+ if (NumTerminators == 1 && I->getDesc().isConditionalBranch()) {
+ parseCondBranch(*I, TBB, Cond);
+ return false;
+ }
+
+ // Handle a conditional branch followed by an unconditional branch.
+ if (NumTerminators == 2 && std::prev(I)->getDesc().isConditionalBranch() &&
+ I->getDesc().isUnconditionalBranch()) {
+ parseCondBranch(*std::prev(I), TBB, Cond);
+ FBB = I->getOperand(0).getMBB();
+ return false;
+ }
+
+ // Otherwise, we can't handle this.
+ return true;
+}
+
+unsigned RISCVInstrInfo::removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved) const {
+ if (BytesRemoved)
+ *BytesRemoved = 0;
+ MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
+ if (I == MBB.end())
+ return 0;
+
+ if (!I->getDesc().isUnconditionalBranch() &&
+ !I->getDesc().isConditionalBranch())
+ return 0;
+
+  // Record the size of the branch before erasing it, then remove it.
+  if (BytesRemoved)
+    *BytesRemoved += getInstSizeInBytes(*I);
+  I->eraseFromParent();
+
+ I = MBB.end();
+
+ if (I == MBB.begin())
+ return 1;
+ --I;
+ if (!I->getDesc().isConditionalBranch())
+ return 1;
+
+  // Record the size of the branch before erasing it, then remove it.
+  if (BytesRemoved)
+    *BytesRemoved += getInstSizeInBytes(*I);
+  I->eraseFromParent();
+ return 2;
+}
+
+// Inserts a branch into the end of the specific MachineBasicBlock, returning
+// the number of instructions inserted.
+unsigned RISCVInstrInfo::insertBranch(
+ MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
+ ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
+ if (BytesAdded)
+ *BytesAdded = 0;
+
+ // Shouldn't be a fall through.
+ assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+ assert((Cond.size() == 3 || Cond.size() == 0) &&
+ "RISCV branch conditions have two components!");
+
+ // Unconditional branch.
+ if (Cond.empty()) {
+ MachineInstr &MI = *BuildMI(&MBB, DL, get(RISCV::PseudoBR)).addMBB(TBB);
+ if (BytesAdded)
+ *BytesAdded += getInstSizeInBytes(MI);
+ return 1;
+ }
+
+ // Either a one or two-way conditional branch.
+ unsigned Opc = Cond[0].getImm();
+ MachineInstr &CondMI =
+ *BuildMI(&MBB, DL, get(Opc)).add(Cond[1]).add(Cond[2]).addMBB(TBB);
+ if (BytesAdded)
+ *BytesAdded += getInstSizeInBytes(CondMI);
+
+ // One-way conditional branch.
+ if (!FBB)
+ return 1;
+
+ // Two-way conditional branch.
+ MachineInstr &MI = *BuildMI(&MBB, DL, get(RISCV::PseudoBR)).addMBB(FBB);
+ if (BytesAdded)
+ *BytesAdded += getInstSizeInBytes(MI);
+ return 2;
+}
+
+unsigned RISCVInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock &DestBB,
+ const DebugLoc &DL,
+ int64_t BrOffset,
+ RegScavenger *RS) const {
+ assert(RS && "RegScavenger required for long branching");
+ assert(MBB.empty() &&
+ "new block should be inserted for expanding unconditional branch");
+ assert(MBB.pred_size() == 1);
+
+ MachineFunction *MF = MBB.getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const auto &TM = static_cast<const RISCVTargetMachine &>(MF->getTarget());
+ const auto &STI = MF->getSubtarget<RISCVSubtarget>();
+
+ if (TM.isPositionIndependent() || STI.is64Bit())
+ report_fatal_error("Unable to insert indirect branch");
+
+ if (!isInt<32>(BrOffset))
+ report_fatal_error(
+ "Branch offsets outside of the signed 32-bit range not supported");
+
+ // FIXME: A virtual register must be used initially, as the register
+ // scavenger won't work with empty blocks (SIInstrInfo::insertIndirectBranch
+ // uses the same workaround).
+ unsigned ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass);
+ auto II = MBB.end();
+
+ MachineInstr &LuiMI = *BuildMI(MBB, II, DL, get(RISCV::LUI), ScratchReg)
+ .addMBB(&DestBB, RISCVII::MO_HI);
+ BuildMI(MBB, II, DL, get(RISCV::PseudoBRIND))
+ .addReg(ScratchReg, RegState::Kill)
+ .addMBB(&DestBB, RISCVII::MO_LO);
+
+ RS->enterBasicBlockEnd(MBB);
+ unsigned Scav = RS->scavengeRegisterBackwards(
+ RISCV::GPRRegClass, MachineBasicBlock::iterator(LuiMI), false, 0);
+ MRI.replaceRegWith(ScratchReg, Scav);
+ MRI.clearVirtRegs();
+ RS->setRegUsed(Scav);
+ return 8;
+}
+
+bool RISCVInstrInfo::reverseBranchCondition(
+ SmallVectorImpl<MachineOperand> &Cond) const {
+ assert((Cond.size() == 3) && "Invalid branch condition!");
+ Cond[0].setImm(getOppositeBranchOpcode(Cond[0].getImm()));
+ return false;
+}
+
+MachineBasicBlock *
+RISCVInstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
+ assert(MI.getDesc().isBranch() && "Unexpected opcode!");
+ // The branch target is always the last operand.
+ int NumOp = MI.getNumExplicitOperands();
+ return MI.getOperand(NumOp - 1).getMBB();
+}
+
+bool RISCVInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
+ int64_t BrOffset) const {
+ // Ideally we could determine the supported branch offset from the
+ // RISCVII::FormMask, but this can't be used for Pseudo instructions like
+ // PseudoBR.
+ switch (BranchOp) {
+ default:
+ llvm_unreachable("Unexpected opcode!");
+ case RISCV::BEQ:
+ case RISCV::BNE:
+ case RISCV::BLT:
+ case RISCV::BGE:
+ case RISCV::BLTU:
+ case RISCV::BGEU:
+ return isIntN(13, BrOffset);
+ case RISCV::JAL:
+ case RISCV::PseudoBR:
+ return isIntN(21, BrOffset);
+ }
+}
+
+unsigned RISCVInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
+ unsigned Opcode = MI.getOpcode();
+
+ switch (Opcode) {
+ default: { return get(Opcode).getSize(); }
+ case TargetOpcode::EH_LABEL:
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::KILL:
+ case TargetOpcode::DBG_VALUE:
+ return 0;
+ case RISCV::PseudoCALL:
+ case RISCV::PseudoTAIL:
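+    // Both expand to an AUIPC and JALR pair, i.e. two 4-byte instructions.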
+ return 8;
+ case TargetOpcode::INLINEASM: {
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const auto &TM = static_cast<const RISCVTargetMachine &>(MF.getTarget());
+ return getInlineAsmLength(MI.getOperand(0).getSymbolName(),
+ *TM.getMCAsmInfo());
+ }
+ }
}
diff --git a/lib/Target/RISCV/RISCVInstrInfo.h b/lib/Target/RISCV/RISCVInstrInfo.h
index 05c8378445cf..1d3279c3d31e 100644
--- a/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/lib/Target/RISCV/RISCVInstrInfo.h
@@ -27,6 +27,11 @@ class RISCVInstrInfo : public RISCVGenInstrInfo {
public:
RISCVInstrInfo();
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, unsigned DstReg, unsigned SrcReg,
bool KillSrc) const override;
@@ -41,6 +46,39 @@ public:
MachineBasicBlock::iterator MBBI, unsigned DstReg,
int FrameIndex, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
+
+ // Materializes the given int32 Val into DstReg.
+ void movImm32(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, unsigned DstReg, uint64_t Val,
+ MachineInstr::MIFlag Flag = MachineInstr::NoFlags) const;
+
+ unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
+
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const override;
+
+ unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+ const DebugLoc &dl,
+ int *BytesAdded = nullptr) const override;
+
+ unsigned insertIndirectBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock &NewDestBB,
+ const DebugLoc &DL, int64_t BrOffset,
+ RegScavenger *RS = nullptr) const override;
+
+ unsigned removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved = nullptr) const override;
+
+ bool
+ reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+
+ MachineBasicBlock *getBranchDestBlock(const MachineInstr &MI) const override;
+
+ bool isBranchOffsetInRange(unsigned BranchOpc,
+ int64_t BrOffset) const override;
};
}
#endif
diff --git a/lib/Target/RISCV/RISCVInstrInfo.td b/lib/Target/RISCV/RISCVInstrInfo.td
index 1aae2f39dbdd..b51e4e70330d 100644
--- a/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/lib/Target/RISCV/RISCVInstrInfo.td
@@ -36,13 +36,28 @@ def CallSeqEnd : SDNode<"ISD::CALLSEQ_END", SDT_RISCVCallSeqEnd,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
def RetFlag : SDNode<"RISCVISD::RET_FLAG", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def URetFlag : SDNode<"RISCVISD::URET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue]>;
+def SRetFlag : SDNode<"RISCVISD::SRET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue]>;
+def MRetFlag : SDNode<"RISCVISD::MRET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue]>;
def SelectCC : SDNode<"RISCVISD::SELECT_CC", SDT_RISCVSelectCC,
[SDNPInGlue]>;
+def Tail : SDNode<"RISCVISD::TAIL", SDT_RISCVCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
//===----------------------------------------------------------------------===//
// Operand and SDNode transformation definitions.
//===----------------------------------------------------------------------===//
+class ImmXLenAsmOperand<string prefix, string suffix = ""> : AsmOperandClass {
+ let Name = prefix # "ImmXLen" # suffix;
+ let RenderMethod = "addImmOperands";
+ let DiagnosticType = !strconcat("Invalid", Name);
+}
+
class ImmAsmOperand<string prefix, int width, string suffix> : AsmOperandClass {
let Name = prefix # "Imm" # width # suffix;
let RenderMethod = "addImmOperands";
@@ -83,6 +98,14 @@ def uimmlog2xlen : Operand<XLenVT>, ImmLeaf<XLenVT, [{
let ParserMatchClass = UImmLog2XLenAsmOperand;
// TODO: should ensure invalid shamt is rejected when decoding.
let DecoderMethod = "decodeUImmOperand<6>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (!MCOp.evaluateAsConstantImm(Imm))
+ return false;
+ if (STI.getTargetTriple().isArch64Bit())
+ return isUInt<6>(Imm);
+ return isUInt<5>(Imm);
+ }];
}
def uimm5 : Operand<XLenVT>, ImmLeaf<XLenVT, [{return isUInt<5>(Imm);}]> {
@@ -94,6 +117,12 @@ def simm12 : Operand<XLenVT>, ImmLeaf<XLenVT, [{return isInt<12>(Imm);}]> {
let ParserMatchClass = SImmAsmOperand<12>;
let EncoderMethod = "getImmOpValue";
let DecoderMethod = "decodeSImmOperand<12>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (MCOp.evaluateAsConstantImm(Imm))
+ return isInt<12>(Imm);
+ return MCOp.isBareSymbolRef();
+ }];
}
def uimm12 : Operand<XLenVT> {
@@ -106,12 +135,24 @@ def simm13_lsb0 : Operand<OtherVT> {
let ParserMatchClass = SImmAsmOperand<13, "Lsb0">;
let EncoderMethod = "getImmOpValueAsr1";
let DecoderMethod = "decodeSImmOperandAndLsl1<13>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (MCOp.evaluateAsConstantImm(Imm))
+ return isShiftedInt<12, 1>(Imm);
+ return MCOp.isBareSymbolRef();
+ }];
}
def uimm20 : Operand<XLenVT> {
let ParserMatchClass = UImmAsmOperand<20>;
let EncoderMethod = "getImmOpValue";
let DecoderMethod = "decodeUImmOperand<20>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (MCOp.evaluateAsConstantImm(Imm))
+ return isUInt<20>(Imm);
+ return MCOp.isBareSymbolRef();
+ }];
}
// A 21-bit signed immediate where the least significant bit is zero.
@@ -119,13 +160,36 @@ def simm21_lsb0 : Operand<OtherVT> {
let ParserMatchClass = SImmAsmOperand<21, "Lsb0">;
let EncoderMethod = "getImmOpValueAsr1";
let DecoderMethod = "decodeSImmOperandAndLsl1<21>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (MCOp.evaluateAsConstantImm(Imm))
+ return isShiftedInt<20, 1>(Imm);
+ return MCOp.isBareSymbolRef();
+ }];
+}
+
+def BareSymbol : AsmOperandClass {
+ let Name = "BareSymbol";
+ let RenderMethod = "addImmOperands";
+ let DiagnosticType = "InvalidBareSymbol";
+}
+
+// A bare symbol.
+def bare_symbol : Operand<XLenVT> {
+ let ParserMatchClass = BareSymbol;
+ let MCOperandPredicate = [{
+ return MCOp.isBareSymbolRef();
+ }];
}
// A parameterized register class alternative to i32imm/i64imm from Target.td.
-def ixlenimm : Operand<XLenVT>;
+def ixlenimm : Operand<XLenVT> {
+ let ParserMatchClass = ImmXLenAsmOperand<"">;
+}
// Standalone (codegen-only) immleaf patterns.
-def simm32 : ImmLeaf<XLenVT, [{return isInt<32>(Imm);}]>;
+def simm32 : ImmLeaf<XLenVT, [{return isInt<32>(Imm);}]>;
+def simm32hi20 : ImmLeaf<XLenVT, [{return isShiftedInt<20, 12>(Imm);}]>;
// Addressing modes.
// Necessary because a frameindex can't be matched directly in a pattern.
@@ -220,7 +284,7 @@ class Priv<string opcodestr, bits<7> funct7>
// Instructions
//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
+let hasSideEffects = 0, isReMaterializable = 1, mayLoad = 0, mayStore = 0 in {
def LUI : RVInstU<OPC_LUI, (outs GPR:$rd), (ins uimm20:$imm20),
"lui", "$rd, $imm20">;
@@ -254,7 +318,11 @@ def SB : Store_rri<0b000, "sb">;
def SH : Store_rri<0b001, "sh">;
def SW : Store_rri<0b010, "sw">;
+// ADDI isn't always rematerializable, but isReMaterializable will be used as
+// a hint which is verified in isReallyTriviallyReMaterializable.
+let isReMaterializable = 1 in
def ADDI : ALU_ri<0b000, "addi">;
+
def SLTI : ALU_ri<0b010, "slti">;
def SLTIU : ALU_ri<0b011, "sltiu">;
def XORI : ALU_ri<0b100, "xori">;
@@ -288,6 +356,12 @@ def FENCE : RVInstI<0b000, OPC_MISC_MEM, (outs),
let imm12 = {0b0000,pred,succ};
}
+def FENCE_TSO : RVInstI<0b000, OPC_MISC_MEM, (outs), (ins), "fence.tso", ""> {
+ let rs1 = 0;
+ let rd = 0;
+ let imm12 = {0b1000,0b0011,0b0011};
+}
+
def FENCE_I : RVInstI<0b001, OPC_MISC_MEM, (outs), (ins), "fence.i", ""> {
let rs1 = 0;
let rd = 0;
@@ -386,7 +460,16 @@ def SFENCE_VMA : RVInstR<0b0001001, 0b000, OPC_SYSTEM, (outs),
// TODO RV64I: sd
def : InstAlias<"nop", (ADDI X0, X0, 0)>;
-// TODO li
+
+// Note that the size is 32 because up to 8 32-bit instructions are needed to
+// generate an arbitrary 64-bit immediate. However, the size does not really
+// matter since PseudoLI is currently only used in the AsmParser where it gets
+// expanded to real instructions immediately.
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 32,
+ isCodeGenOnly = 0, isAsmParserOnly = 1 in
+def PseudoLI : Pseudo<(outs GPR:$rd), (ins ixlenimm:$imm), [],
+ "li", "$rd, $imm">;
+
def : InstAlias<"mv $rd, $rs", (ADDI GPR:$rd, GPR:$rs, 0)>;
def : InstAlias<"not $rd, $rs", (XORI GPR:$rd, GPR:$rs, -1)>;
def : InstAlias<"neg $rd, $rs", (SUB GPR:$rd, X0, GPR:$rs)>;
@@ -401,6 +484,11 @@ def : InstAlias<"snez $rd, $rs", (SLTU GPR:$rd, X0, GPR:$rs)>;
def : InstAlias<"sltz $rd, $rs", (SLT GPR:$rd, GPR:$rs, X0)>;
def : InstAlias<"sgtz $rd, $rs", (SLT GPR:$rd, X0, GPR:$rs)>;
+// sgt/sgtu are recognised by the GNU assembler but the canonical slt/sltu
+// form will always be printed. Therefore, set a zero weight.
+def : InstAlias<"sgt $rd, $rs, $rt", (SLT GPR:$rd, GPR:$rt, GPR:$rs), 0>;
+def : InstAlias<"sgtu $rd, $rs, $rt", (SLTU GPR:$rd, GPR:$rt, GPR:$rs), 0>;
+
def : InstAlias<"beqz $rs, $offset",
(BEQ GPR:$rs, X0, simm13_lsb0:$offset)>;
def : InstAlias<"bnez $rs, $offset",
@@ -489,7 +577,7 @@ def IsOrAdd: PatFrag<(ops node:$A, node:$B), (or node:$A, node:$B), [{
/// Immediates
def : Pat<(simm12:$imm), (ADDI X0, simm12:$imm)>;
-// TODO: Add a pattern for immediates with all zeroes in the lower 12 bits.
+def : Pat<(simm32hi20:$imm), (LUI (HI20 imm:$imm))>;
def : Pat<(simm32:$imm), (ADDI (LUI (HI20 imm:$imm)), (LO12Sext imm:$imm))>;
/// Simple arithmetic operations
@@ -536,11 +624,14 @@ def : Pat<(setge GPR:$rs1, GPR:$rs2), (XORI (SLT GPR:$rs1, GPR:$rs2), 1)>;
def : Pat<(setle GPR:$rs1, GPR:$rs2), (XORI (SLT GPR:$rs2, GPR:$rs1), 1)>;
let usesCustomInserter = 1 in
-def Select_GPR_Using_CC_GPR
- : Pseudo<(outs GPR:$dst),
- (ins GPR:$lhs, GPR:$rhs, ixlenimm:$imm, GPR:$src, GPR:$src2),
- [(set XLenVT:$dst, (SelectCC GPR:$lhs, GPR:$rhs,
- (XLenVT imm:$imm), GPR:$src, GPR:$src2))]>;
+class SelectCC_rrirr<RegisterClass valty, RegisterClass cmpty>
+ : Pseudo<(outs valty:$dst),
+ (ins cmpty:$lhs, cmpty:$rhs, ixlenimm:$imm,
+ valty:$truev, valty:$falsev),
+ [(set valty:$dst, (SelectCC cmpty:$lhs, cmpty:$rhs,
+ (XLenVT imm:$imm), valty:$truev, valty:$falsev))]>;
+
+def Select_GPR_Using_CC_GPR : SelectCC_rrirr<GPR, GPR>;
/// Branches and jumps
@@ -585,14 +676,50 @@ def : Pat<(brind GPR:$rs1), (PseudoBRIND GPR:$rs1, 0)>;
def : Pat<(brind (add GPR:$rs1, simm12:$imm12)),
(PseudoBRIND GPR:$rs1, simm12:$imm12)>;
+// PseudoCALL is a pseudo instruction which will eventually expand to auipc
+// and jalr while encoding. This is desirable, as an auipc+jalr pair with
+// R_RISCV_CALL and R_RISCV_RELAX relocations can be relaxed by the linker
+// if the offset fits in a signed 21-bit immediate.
+// Define AsmString to print "call" when compiling with the -S flag.
+// Define isCodeGenOnly = 0 to support parsing the assembly "call" instruction.
+let isCall = 1, Defs = [X1], isCodeGenOnly = 0 in
+def PseudoCALL : Pseudo<(outs), (ins bare_symbol:$func),
+ [(Call tglobaladdr:$func)]> {
+ let AsmString = "call\t$func";
+}
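+// For example, `call foo` assembles to `auipc ra, 0` carrying R_RISCV_CALL
+// (and, when relaxation is enabled, R_RISCV_RELAX) relocations against foo,
+// followed by `jalr ra, 0(ra)`.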
+
+def : Pat<(Call texternalsym:$func), (PseudoCALL texternalsym:$func)>;
+
+def : Pat<(URetFlag), (URET X0, X0)>;
+def : Pat<(SRetFlag), (SRET X0, X0)>;
+def : Pat<(MRetFlag), (MRET X0, X0)>;
+
let isCall = 1, Defs = [X1] in
-def PseudoCALL : Pseudo<(outs), (ins GPR:$rs1), [(Call GPR:$rs1)]>,
- PseudoInstExpansion<(JALR X1, GPR:$rs1, 0)>;
+def PseudoCALLIndirect : Pseudo<(outs), (ins GPR:$rs1), [(Call GPR:$rs1)]>,
+ PseudoInstExpansion<(JALR X1, GPR:$rs1, 0)>;
let isBarrier = 1, isReturn = 1, isTerminator = 1 in
def PseudoRET : Pseudo<(outs), (ins), [(RetFlag)]>,
PseudoInstExpansion<(JALR X0, X1, 0)>;
+// PseudoTAIL is a pseudo instruction similar to PseudoCALL and will eventually
+// expand to auipc and jalr while encoding.
+// Define AsmString to print "tail" when compiling with the -S flag.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [X2],
+ isCodeGenOnly = 0 in
+def PseudoTAIL : Pseudo<(outs), (ins bare_symbol:$dst), []> {
+ let AsmString = "tail\t$dst";
+}
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [X2] in
+def PseudoTAILIndirect : Pseudo<(outs), (ins GPRTC:$rs1), [(Tail GPRTC:$rs1)]>,
+ PseudoInstExpansion<(JALR X0, GPR:$rs1, 0)>;
+
+def : Pat<(Tail (iPTR tglobaladdr:$dst)),
+ (PseudoTAIL texternalsym:$dst)>;
+def : Pat<(Tail (iPTR texternalsym:$dst)),
+ (PseudoTAIL texternalsym:$dst)>;
+
/// Loads
multiclass LdPat<PatFrag LoadOp, RVInst Inst> {
@@ -616,20 +743,40 @@ defm : LdPat<zextloadi16, LHU>;
/// Stores
-multiclass StPat<PatFrag StoreOp, RVInst Inst> {
- def : Pat<(StoreOp GPR:$rs2, GPR:$rs1), (Inst GPR:$rs2, GPR:$rs1, 0)>;
- def : Pat<(StoreOp GPR:$rs2, AddrFI:$rs1), (Inst GPR:$rs2, AddrFI:$rs1, 0)>;
- def : Pat<(StoreOp GPR:$rs2, (add GPR:$rs1, simm12:$imm12)),
- (Inst GPR:$rs2, GPR:$rs1, simm12:$imm12)>;
- def : Pat<(StoreOp GPR:$rs2, (add AddrFI:$rs1, simm12:$imm12)),
- (Inst GPR:$rs2, AddrFI:$rs1, simm12:$imm12)>;
- def : Pat<(StoreOp GPR:$rs2, (IsOrAdd AddrFI:$rs1, simm12:$imm12)),
- (Inst GPR:$rs2, AddrFI:$rs1, simm12:$imm12)>;
+multiclass StPat<PatFrag StoreOp, RVInst Inst, RegisterClass StTy> {
+ def : Pat<(StoreOp StTy:$rs2, GPR:$rs1), (Inst StTy:$rs2, GPR:$rs1, 0)>;
+ def : Pat<(StoreOp StTy:$rs2, AddrFI:$rs1), (Inst StTy:$rs2, AddrFI:$rs1, 0)>;
+ def : Pat<(StoreOp StTy:$rs2, (add GPR:$rs1, simm12:$imm12)),
+ (Inst StTy:$rs2, GPR:$rs1, simm12:$imm12)>;
+ def : Pat<(StoreOp StTy:$rs2, (add AddrFI:$rs1, simm12:$imm12)),
+ (Inst StTy:$rs2, AddrFI:$rs1, simm12:$imm12)>;
+ def : Pat<(StoreOp StTy:$rs2, (IsOrAdd AddrFI:$rs1, simm12:$imm12)),
+ (Inst StTy:$rs2, AddrFI:$rs1, simm12:$imm12)>;
}
-defm : StPat<truncstorei8, SB>;
-defm : StPat<truncstorei16, SH>;
-defm : StPat<store, SW>;
+defm : StPat<truncstorei8, SB, GPR>;
+defm : StPat<truncstorei16, SH, GPR>;
+defm : StPat<store, SW, GPR>;
+
+/// Fences
+
+// Refer to Table A.6 in the version 2.3 draft of the RISC-V Instruction Set
+// Manual: Volume I.
+
+// fence acquire -> fence r, rw
+def : Pat<(atomic_fence (i32 4), (imm)), (FENCE 0b10, 0b11)>;
+// fence release -> fence rw, w
+def : Pat<(atomic_fence (i32 5), (imm)), (FENCE 0b11, 0b1)>;
+// fence acq_rel -> fence.tso
+def : Pat<(atomic_fence (i32 6), (imm)), (FENCE_TSO)>;
+// fence seq_cst -> fence rw, rw
+def : Pat<(atomic_fence (i32 7), (imm)), (FENCE 0b11, 0b11)>;
+
+// Lowering for atomic load and store is defined in RISCVInstrInfoA.td.
+// Although these are lowered to fence+load/store instructions defined in the
+// base RV32I/RV64I ISA, this lowering is only used when the A extension is
+// present. This is necessary as it isn't valid to mix __atomic_* libcalls
+// with inline atomic operations for the same object.
/// Other pseudo-instructions
diff --git a/lib/Target/RISCV/RISCVInstrInfoA.td b/lib/Target/RISCV/RISCVInstrInfoA.td
index 33e863ba6a10..379322060438 100644
--- a/lib/Target/RISCV/RISCVInstrInfoA.td
+++ b/lib/Target/RISCV/RISCVInstrInfoA.td
@@ -75,3 +75,23 @@ defm AMOMAX_D : AMO_rr_aq_rl<0b10100, 0b011, "amomax.d">;
defm AMOMINU_D : AMO_rr_aq_rl<0b11000, 0b011, "amominu.d">;
defm AMOMAXU_D : AMO_rr_aq_rl<0b11100, 0b011, "amomaxu.d">;
} // Predicates = [HasStdExtA, IsRV64]
+
+//===----------------------------------------------------------------------===//
+// Pseudo-instructions and codegen patterns
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasStdExtA] in {
+
+/// Atomic loads and stores
+
+// Fences will be inserted for atomic load/stores according to the logic in
+// RISCVTargetLowering::{emitLeadingFence,emitTrailingFence}.
+
+defm : LdPat<atomic_load_8, LB>;
+defm : LdPat<atomic_load_16, LH>;
+defm : LdPat<atomic_load_32, LW>;
+
+defm : StPat<atomic_store_8, SB, GPR>;
+defm : StPat<atomic_store_16, SH, GPR>;
+defm : StPat<atomic_store_32, SW, GPR>;
+} // Predicates = [HasStdExtA]
diff --git a/lib/Target/RISCV/RISCVInstrInfoC.td b/lib/Target/RISCV/RISCVInstrInfoC.td
index 4ca52652086b..5d1c62c0b653 100644
--- a/lib/Target/RISCV/RISCVInstrInfoC.td
+++ b/lib/Target/RISCV/RISCVInstrInfoC.td
@@ -27,18 +27,67 @@ def uimmlog2xlennonzero : Operand<XLenVT>, ImmLeaf<XLenVT, [{
let ParserMatchClass = UImmLog2XLenNonZeroAsmOperand;
// TODO: should ensure invalid shamt is rejected when decoding.
let DecoderMethod = "decodeUImmOperand<6>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (!MCOp.evaluateAsConstantImm(Imm))
+ return false;
+ if (STI.getTargetTriple().isArch64Bit())
+ return isUInt<6>(Imm) && (Imm != 0);
+ return isUInt<5>(Imm) && (Imm != 0);
+ }];
}
def simm6 : Operand<XLenVT>, ImmLeaf<XLenVT, [{return isInt<6>(Imm);}]> {
let ParserMatchClass = SImmAsmOperand<6>;
let EncoderMethod = "getImmOpValue";
let DecoderMethod = "decodeSImmOperand<6>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (MCOp.evaluateAsConstantImm(Imm))
+ return isInt<6>(Imm);
+ return MCOp.isBareSymbolRef();
+ }];
}
-def uimm6nonzero : Operand<XLenVT>,
- ImmLeaf<XLenVT, [{return isUInt<6>(Imm) && (Imm != 0);}]> {
- let ParserMatchClass = UImmAsmOperand<6, "NonZero">;
- let DecoderMethod = "decodeUImmOperand<6>";
+def simm6nonzero : Operand<XLenVT>,
+ ImmLeaf<XLenVT, [{return (Imm != 0) && isInt<6>(Imm);}]> {
+ let ParserMatchClass = SImmAsmOperand<6, "NonZero">;
+ let EncoderMethod = "getImmOpValue";
+ let DecoderMethod = "decodeSImmOperand<6>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (MCOp.evaluateAsConstantImm(Imm))
+ return (Imm != 0) && isInt<6>(Imm);
+ return MCOp.isBareSymbolRef();
+ }];
+}
+
+def CLUIImmAsmOperand : AsmOperandClass {
+ let Name = "CLUIImm";
+ let RenderMethod = "addImmOperands";
+ let DiagnosticType = !strconcat("Invalid", Name);
+}
+
+// c_lui_imm checks that the immediate is in the range [1, 31] or
+// [0xfffe0, 0xfffff]. The RISC-V ISA describes the constraint as [1, 63],
+// with that value being loaded into bits 17-12 of the destination register
+// and sign-extended from bit 17. Therefore, this 6-bit immediate can
+// represent values in the ranges [1, 31] and [0xfffe0, 0xfffff].
+def c_lui_imm : Operand<XLenVT>,
+ ImmLeaf<XLenVT, [{return (Imm != 0) &&
+ (isUInt<5>(Imm) ||
+ (Imm >= 0xfffe0 && Imm <= 0xfffff));}]> {
+ let ParserMatchClass = CLUIImmAsmOperand;
+ let EncoderMethod = "getImmOpValue";
+ let DecoderMethod = "decodeCLUIImmOperand";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (MCOp.evaluateAsConstantImm(Imm))
+ return (Imm != 0) && (isUInt<5>(Imm) ||
+ (Imm >= 0xfffe0 && Imm <= 0xfffff));
+ return MCOp.isBareSymbolRef();
+ }];
}
// A 7-bit unsigned immediate where the least significant two bits are zero.
@@ -47,6 +96,12 @@ def uimm7_lsb00 : Operand<XLenVT>,
let ParserMatchClass = UImmAsmOperand<7, "Lsb00">;
let EncoderMethod = "getImmOpValue";
let DecoderMethod = "decodeUImmOperand<7>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (!MCOp.evaluateAsConstantImm(Imm))
+ return false;
+ return isShiftedUInt<5, 2>(Imm);
+ }];
}
// A 8-bit unsigned immediate where the least significant two bits are zero.
@@ -55,6 +110,12 @@ def uimm8_lsb00 : Operand<XLenVT>,
let ParserMatchClass = UImmAsmOperand<8, "Lsb00">;
let EncoderMethod = "getImmOpValue";
let DecoderMethod = "decodeUImmOperand<8>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (!MCOp.evaluateAsConstantImm(Imm))
+ return false;
+ return isShiftedUInt<6, 2>(Imm);
+ }];
}
// A 8-bit unsigned immediate where the least significant three bits are zero.
@@ -63,6 +124,12 @@ def uimm8_lsb000 : Operand<XLenVT>,
let ParserMatchClass = UImmAsmOperand<8, "Lsb000">;
let EncoderMethod = "getImmOpValue";
let DecoderMethod = "decodeUImmOperand<8>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (!MCOp.evaluateAsConstantImm(Imm))
+ return false;
+ return isShiftedUInt<5, 3>(Imm);
+ }];
}
// A 9-bit signed immediate where the least significant bit is zero.
@@ -70,6 +137,13 @@ def simm9_lsb0 : Operand<OtherVT> {
let ParserMatchClass = SImmAsmOperand<9, "Lsb0">;
let EncoderMethod = "getImmOpValueAsr1";
let DecoderMethod = "decodeSImmOperandAndLsl1<9>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (MCOp.evaluateAsConstantImm(Imm))
+ return isShiftedInt<8, 1>(Imm);
+ return MCOp.isBareSymbolRef();
+  }];
}
// A 9-bit unsigned immediate where the least significant three bits are zero.
@@ -78,6 +152,12 @@ def uimm9_lsb000 : Operand<XLenVT>,
let ParserMatchClass = UImmAsmOperand<9, "Lsb000">;
let EncoderMethod = "getImmOpValue";
let DecoderMethod = "decodeUImmOperand<9>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (!MCOp.evaluateAsConstantImm(Imm))
+ return false;
+ return isShiftedUInt<6, 3>(Imm);
+ }];
}
// A 10-bit unsigned immediate where the least significant two bits are zero
@@ -88,21 +168,40 @@ def uimm10_lsb00nonzero : Operand<XLenVT>,
let ParserMatchClass = UImmAsmOperand<10, "Lsb00NonZero">;
let EncoderMethod = "getImmOpValue";
let DecoderMethod = "decodeUImmOperand<10>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (!MCOp.evaluateAsConstantImm(Imm))
+ return false;
+ return isShiftedUInt<8, 2>(Imm) && (Imm != 0);
+ }];
}
// A 10-bit signed immediate where the least significant four bits are zero.
-def simm10_lsb0000 : Operand<XLenVT>,
- ImmLeaf<XLenVT, [{return isShiftedInt<6, 4>(Imm);}]> {
- let ParserMatchClass = SImmAsmOperand<10, "Lsb0000">;
+def simm10_lsb0000nonzero : Operand<XLenVT>,
+ ImmLeaf<XLenVT,
+ [{return (Imm != 0) && isShiftedInt<6, 4>(Imm);}]> {
+ let ParserMatchClass = SImmAsmOperand<10, "Lsb0000NonZero">;
let EncoderMethod = "getImmOpValue";
let DecoderMethod = "decodeSImmOperand<10>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (!MCOp.evaluateAsConstantImm(Imm))
+ return false;
+ return isShiftedInt<6, 4>(Imm);
+ }];
}
// A 12-bit signed immediate where the least significant bit is zero.
-def simm12_lsb0 : Operand<OtherVT> {
+def simm12_lsb0 : Operand<XLenVT> {
let ParserMatchClass = SImmAsmOperand<12, "Lsb0">;
let EncoderMethod = "getImmOpValueAsr1";
let DecoderMethod = "decodeSImmOperandAndLsl1<12>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (MCOp.evaluateAsConstantImm(Imm))
+ return isShiftedInt<11, 1>(Imm);
+ return MCOp.isBareSymbolRef();
+ }];
}
//===----------------------------------------------------------------------===//
@@ -177,7 +276,7 @@ class CS_ALU<bits<2> funct2, string OpcodeStr, RegisterClass cls,
let Predicates = [HasStdExtC] in {
-let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [X2] in
def C_ADDI4SPN : RVInst16CIW<0b000, 0b00, (outs GPRC:$rd),
(ins SP:$rs1, uimm10_lsb00nonzero:$imm),
"c.addi4spn", "$rd, $rs1, $imm"> {
@@ -188,8 +287,8 @@ def C_ADDI4SPN : RVInst16CIW<0b000, 0b00, (outs GPRC:$rd),
let Inst{5} = imm{3};
}
-def C_FLD : CLoad_ri<0b001, "c.fld", FPR64C, uimm8_lsb000>,
- Requires<[HasStdExtD]> {
+let Predicates = [HasStdExtC, HasStdExtD] in
+def C_FLD : CLoad_ri<0b001, "c.fld", FPR64C, uimm8_lsb000> {
bits<8> imm;
let Inst{12-10} = imm{5-3};
let Inst{6-5} = imm{7-6};
@@ -202,24 +301,24 @@ def C_LW : CLoad_ri<0b010, "c.lw", GPRC, uimm7_lsb00> {
let Inst{5} = imm{6};
}
-let DecoderNamespace = "RISCV32Only_" in
-def C_FLW : CLoad_ri<0b011, "c.flw", FPR32C, uimm7_lsb00>,
- Requires<[HasStdExtF, IsRV32]> {
+let DecoderNamespace = "RISCV32Only_",
+ Predicates = [HasStdExtC, HasStdExtF, IsRV32] in
+def C_FLW : CLoad_ri<0b011, "c.flw", FPR32C, uimm7_lsb00> {
bits<7> imm;
let Inst{12-10} = imm{5-3};
let Inst{6} = imm{2};
let Inst{5} = imm{6};
}
-def C_LD : CLoad_ri<0b011, "c.ld", GPRC, uimm8_lsb000>,
- Requires<[IsRV64]> {
+let Predicates = [HasStdExtC, IsRV64] in
+def C_LD : CLoad_ri<0b011, "c.ld", GPRC, uimm8_lsb000> {
bits<8> imm;
let Inst{12-10} = imm{5-3};
let Inst{6-5} = imm{7-6};
}
-def C_FSD : CStore_rri<0b101, "c.fsd", FPR64C, uimm8_lsb000>,
- Requires<[HasStdExtD]> {
+let Predicates = [HasStdExtC, HasStdExtD] in
+def C_FSD : CStore_rri<0b101, "c.fsd", FPR64C, uimm8_lsb000> {
bits<8> imm;
let Inst{12-10} = imm{5-3};
let Inst{6-5} = imm{7-6};
@@ -232,17 +331,17 @@ def C_SW : CStore_rri<0b110, "c.sw", GPRC, uimm7_lsb00> {
let Inst{5} = imm{6};
}
-let DecoderNamespace = "RISCV32Only_" in
-def C_FSW : CStore_rri<0b111, "c.fsw", FPR32C, uimm7_lsb00>,
- Requires<[HasStdExtF, IsRV32]> {
+let DecoderNamespace = "RISCV32Only_",
+ Predicates = [HasStdExtC, HasStdExtF, IsRV32] in
+def C_FSW : CStore_rri<0b111, "c.fsw", FPR32C, uimm7_lsb00> {
bits<7> imm;
let Inst{12-10} = imm{5-3};
let Inst{6} = imm{2};
let Inst{5} = imm{6};
}
-def C_SD : CStore_rri<0b111, "c.sd", GPRC, uimm8_lsb000>,
- Requires<[IsRV64]> {
+let Predicates = [HasStdExtC, IsRV64] in
+def C_SD : CStore_rri<0b111, "c.sd", GPRC, uimm8_lsb000> {
bits<8> imm;
let Inst{12-10} = imm{5-3};
let Inst{6-5} = imm{7-6};
@@ -253,23 +352,23 @@ def C_NOP : RVInst16CI<0b000, 0b01, (outs), (ins), "c.nop", "">;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def C_ADDI : RVInst16CI<0b000, 0b01, (outs GPRNoX0:$rd_wb),
- (ins GPRNoX0:$rd, simm6:$imm),
+ (ins GPRNoX0:$rd, simm6nonzero:$imm),
"c.addi", "$rd, $imm"> {
let Constraints = "$rd = $rd_wb";
let Inst{6-2} = imm{4-0};
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCall = 1,
- DecoderNamespace = "RISCV32Only_" in
+ DecoderNamespace = "RISCV32Only_", Defs = [X1],
+ Predicates = [HasStdExtC, IsRV32] in
def C_JAL : RVInst16CJ<0b001, 0b01, (outs), (ins simm12_lsb0:$offset),
- "c.jal", "$offset">,
- Requires<[IsRV32]>;
+ "c.jal", "$offset">;
-let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0,
+ Predicates = [HasStdExtC, IsRV64] in
def C_ADDIW : RVInst16CI<0b001, 0b01, (outs GPRNoX0:$rd_wb),
(ins GPRNoX0:$rd, simm6:$imm),
- "c.addiw", "$rd, $imm">,
- Requires<[IsRV64]> {
+ "c.addiw", "$rd, $imm"> {
let Constraints = "$rd = $rd_wb";
let Inst{6-2} = imm{4-0};
}
@@ -282,7 +381,7 @@ def C_LI : RVInst16CI<0b010, 0b01, (outs GPRNoX0:$rd), (ins simm6:$imm),
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def C_ADDI16SP : RVInst16CI<0b011, 0b01, (outs SP:$rd_wb),
- (ins SP:$rd, simm10_lsb0000:$imm),
+ (ins SP:$rd, simm10_lsb0000nonzero:$imm),
"c.addi16sp", "$rd, $imm"> {
let Constraints = "$rd = $rd_wb";
let Inst{12} = imm{9};
@@ -295,7 +394,7 @@ def C_ADDI16SP : RVInst16CI<0b011, 0b01, (outs SP:$rd_wb),
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def C_LUI : RVInst16CI<0b011, 0b01, (outs GPRNoX0X2:$rd),
- (ins uimm6nonzero:$imm),
+ (ins c_lui_imm:$imm),
"c.lui", "$rd, $imm"> {
let Inst{6-2} = imm{4-0};
}
@@ -317,8 +416,10 @@ def C_XOR : CS_ALU<0b01, "c.xor", GPRC, 0>;
def C_OR : CS_ALU<0b10, "c.or" , GPRC, 0>;
def C_AND : CS_ALU<0b11, "c.and", GPRC, 0>;
-def C_SUBW : CS_ALU<0b00, "c.subw", GPRC, 1>, Requires<[IsRV64]>;
-def C_ADDW : CS_ALU<0b01, "c.addw", GPRC, 1>, Requires<[IsRV64]>;
+let Predicates = [HasStdExtC, IsRV64] in {
+def C_SUBW : CS_ALU<0b00, "c.subw", GPRC, 1>;
+def C_ADDW : CS_ALU<0b01, "c.addw", GPRC, 1>;
+}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def C_J : RVInst16CJ<0b101, 0b01, (outs), (ins simm12_lsb0:$offset),
@@ -339,8 +440,8 @@ def C_SLLI : RVInst16CI<0b000, 0b10, (outs GPRNoX0:$rd_wb),
let Inst{6-2} = imm{4-0};
}
-def C_FLDSP : CStackLoad<0b001, "c.fldsp", FPR64, uimm9_lsb000>,
- Requires<[HasStdExtD]> {
+let Predicates = [HasStdExtC, HasStdExtD] in
+def C_FLDSP : CStackLoad<0b001, "c.fldsp", FPR64, uimm9_lsb000> {
let Inst{6-5} = imm{4-3};
let Inst{4-2} = imm{8-6};
}
@@ -350,15 +451,15 @@ def C_LWSP : CStackLoad<0b010, "c.lwsp", GPRNoX0, uimm8_lsb00> {
let Inst{3-2} = imm{7-6};
}
-let DecoderNamespace = "RISCV32Only_" in
-def C_FLWSP : CStackLoad<0b011, "c.flwsp", FPR32, uimm8_lsb00>,
- Requires<[HasStdExtF, IsRV32]> {
+let DecoderNamespace = "RISCV32Only_",
+ Predicates = [HasStdExtC, HasStdExtF, IsRV32] in
+def C_FLWSP : CStackLoad<0b011, "c.flwsp", FPR32, uimm8_lsb00> {
let Inst{6-4} = imm{4-2};
let Inst{3-2} = imm{7-6};
}
-def C_LDSP : CStackLoad<0b011, "c.ldsp", GPRNoX0, uimm9_lsb000>,
- Requires<[IsRV64]> {
+let Predicates = [HasStdExtC, IsRV64] in
+def C_LDSP : CStackLoad<0b011, "c.ldsp", GPRNoX0, uimm9_lsb000> {
let Inst{6-5} = imm{4-3};
let Inst{4-2} = imm{8-6};
}
@@ -392,8 +493,8 @@ def C_ADD : RVInst16CR<0b1001, 0b10, (outs GPRNoX0:$rs1_wb),
let Constraints = "$rs1 = $rs1_wb";
}
-def C_FSDSP : CStackStore<0b101, "c.fsdsp", FPR64, uimm9_lsb000>,
- Requires<[HasStdExtD]> {
+let Predicates = [HasStdExtC, HasStdExtD] in
+def C_FSDSP : CStackStore<0b101, "c.fsdsp", FPR64, uimm9_lsb000> {
let Inst{12-10} = imm{5-3};
let Inst{9-7} = imm{8-6};
}
@@ -403,17 +504,204 @@ def C_SWSP : CStackStore<0b110, "c.swsp", GPR, uimm8_lsb00> {
let Inst{8-7} = imm{7-6};
}
-let DecoderNamespace = "RISCV32Only_" in
-def C_FSWSP : CStackStore<0b111, "c.fswsp", FPR32, uimm8_lsb00>,
- Requires<[HasStdExtF, IsRV32]> {
+let DecoderNamespace = "RISCV32Only_",
+ Predicates = [HasStdExtC, HasStdExtF, IsRV32] in
+def C_FSWSP : CStackStore<0b111, "c.fswsp", FPR32, uimm8_lsb00> {
let Inst{12-9} = imm{5-2};
let Inst{8-7} = imm{7-6};
}
-def C_SDSP : CStackStore<0b111, "c.sdsp", GPR, uimm9_lsb000>,
- Requires<[IsRV64]> {
+let Predicates = [HasStdExtC, IsRV64] in
+def C_SDSP : CStackStore<0b111, "c.sdsp", GPR, uimm9_lsb000> {
let Inst{12-10} = imm{5-3};
let Inst{9-7} = imm{8-6};
}
} // Predicates = [HasStdExtC]
+
+//===----------------------------------------------------------------------===//
+// Compress Instruction tablegen backend.
+//===----------------------------------------------------------------------===//
+
+class CompressPat<dag input, dag output> {
+ dag Input = input;
+ dag Output = output;
+ list<Predicate> Predicates = [];
+}
+
+// Patterns are defined in the same order the compressed instructions appear
+// on page 82 of the ISA manual.
+
+// Quadrant 0
+let Predicates = [HasStdExtC] in {
+def : CompressPat<(ADDI GPRC:$rd, SP:$rs1, uimm10_lsb00nonzero:$imm),
+ (C_ADDI4SPN GPRC:$rd, SP:$rs1, uimm10_lsb00nonzero:$imm)>;
+} // Predicates = [HasStdExtC]
+
+let Predicates = [HasStdExtC, HasStdExtD] in {
+def : CompressPat<(FLD FPR64C:$rd, GPRC:$rs1, uimm8_lsb000:$imm),
+ (C_FLD FPR64C:$rd, GPRC:$rs1, uimm8_lsb000:$imm)>;
+} // Predicates = [HasStdExtC, HasStdExtD]
+
+let Predicates = [HasStdExtC] in {
+def : CompressPat<(LW GPRC:$rd, GPRC:$rs1, uimm7_lsb00:$imm),
+ (C_LW GPRC:$rd, GPRC:$rs1, uimm7_lsb00:$imm)>;
+} // Predicates = [HasStdExtC]
+
+let Predicates = [HasStdExtC, HasStdExtF, IsRV32] in {
+def : CompressPat<(FLW FPR32C:$rd, GPRC:$rs1, uimm7_lsb00:$imm),
+ (C_FLW FPR32C:$rd, GPRC:$rs1, uimm7_lsb00:$imm)>;
+} // Predicates = [HasStdExtC, HasStdExtF, IsRV32]
+
+let Predicates = [HasStdExtC, IsRV64] in {
+def : CompressPat<(LD GPRC:$rd, GPRC:$rs1, uimm8_lsb000:$imm),
+ (C_LD GPRC:$rd, GPRC:$rs1, uimm8_lsb000:$imm)>;
+} // Predicates = [HasStdExtC, IsRV64]
+
+let Predicates = [HasStdExtC, HasStdExtD] in {
+def : CompressPat<(FSD FPR64C:$rs2, GPRC:$rs1, uimm8_lsb000:$imm),
+ (C_FSD FPR64C:$rs2, GPRC:$rs1, uimm8_lsb000:$imm)>;
+} // Predicates = [HasStdExtC, HasStdExtD]
+
+let Predicates = [HasStdExtC] in {
+def : CompressPat<(SW GPRC:$rs2, GPRC:$rs1, uimm7_lsb00:$imm),
+ (C_SW GPRC:$rs2, GPRC:$rs1, uimm7_lsb00:$imm)>;
+} // Predicates = [HasStdExtC]
+
+let Predicates = [HasStdExtC, HasStdExtF, IsRV32] in {
+def : CompressPat<(FSW FPR32C:$rs2, GPRC:$rs1, uimm7_lsb00:$imm),
+                  (C_FSW FPR32C:$rs2, GPRC:$rs1, uimm7_lsb00:$imm)>;
+} // Predicates = [HasStdExtC, HasStdExtF, IsRV32]
+
+let Predicates = [HasStdExtC, IsRV64] in {
+def : CompressPat<(SD GPRC:$rs2, GPRC:$rs1, uimm8_lsb000:$imm),
+ (C_SD GPRC:$rs2, GPRC:$rs1, uimm8_lsb000:$imm)>;
+} // Predicates = [HasStdExtC, IsRV64]
+
+// Quadrant 1
+let Predicates = [HasStdExtC] in {
+def : CompressPat<(ADDI X0, X0, 0), (C_NOP)>;
+def : CompressPat<(ADDI GPRNoX0:$rs1, GPRNoX0:$rs1, simm6nonzero:$imm),
+ (C_ADDI GPRNoX0:$rs1, simm6nonzero:$imm)>;
+} // Predicates = [HasStdExtC]
+
+let Predicates = [HasStdExtC, IsRV32] in {
+def : CompressPat<(JAL X1, simm12_lsb0:$offset),
+ (C_JAL simm12_lsb0:$offset)>;
+} // Predicates = [HasStdExtC, IsRV32]
+
+let Predicates = [HasStdExtC, IsRV64] in {
+def : CompressPat<(ADDIW GPRNoX0:$rs1, GPRNoX0:$rs1, simm6:$imm),
+ (C_ADDIW GPRNoX0:$rs1, simm6:$imm)>;
+} // Predicates = [HasStdExtC, IsRV64]
+
+let Predicates = [HasStdExtC] in {
+def : CompressPat<(ADDI GPRNoX0:$rd, X0, simm6:$imm),
+ (C_LI GPRNoX0:$rd, simm6:$imm)>;
+def : CompressPat<(ADDI X2, X2, simm10_lsb0000nonzero:$imm),
+ (C_ADDI16SP X2, simm10_lsb0000nonzero:$imm)>;
+def : CompressPat<(LUI GPRNoX0X2:$rd, c_lui_imm:$imm),
+ (C_LUI GPRNoX0X2:$rd, c_lui_imm:$imm)>;
+def : CompressPat<(SRLI GPRC:$rs1, GPRC:$rs1, uimmlog2xlennonzero:$imm),
+ (C_SRLI GPRC:$rs1, uimmlog2xlennonzero:$imm)>;
+def : CompressPat<(SRAI GPRC:$rs1, GPRC:$rs1, uimmlog2xlennonzero:$imm),
+ (C_SRAI GPRC:$rs1, uimmlog2xlennonzero:$imm)>;
+def : CompressPat<(ANDI GPRC:$rs1, GPRC:$rs1, simm6:$imm),
+ (C_ANDI GPRC:$rs1, simm6:$imm)>;
+def : CompressPat<(SUB GPRC:$rs1, GPRC:$rs1, GPRC:$rs2),
+ (C_SUB GPRC:$rs1, GPRC:$rs2)>;
+def : CompressPat<(XOR GPRC:$rs1, GPRC:$rs1, GPRC:$rs2),
+ (C_XOR GPRC:$rs1, GPRC:$rs2)>;
+def : CompressPat<(XOR GPRC:$rs1, GPRC:$rs2, GPRC:$rs1),
+ (C_XOR GPRC:$rs1, GPRC:$rs2)>;
+def : CompressPat<(OR GPRC:$rs1, GPRC:$rs1, GPRC:$rs2),
+ (C_OR GPRC:$rs1, GPRC:$rs2)>;
+def : CompressPat<(OR GPRC:$rs1, GPRC:$rs2, GPRC:$rs1),
+ (C_OR GPRC:$rs1, GPRC:$rs2)>;
+def : CompressPat<(AND GPRC:$rs1, GPRC:$rs1, GPRC:$rs2),
+ (C_AND GPRC:$rs1, GPRC:$rs2)>;
+def : CompressPat<(AND GPRC:$rs1, GPRC:$rs2, GPRC:$rs1),
+ (C_AND GPRC:$rs1, GPRC:$rs2)>;
+} // Predicates = [HasStdExtC]
+
+let Predicates = [HasStdExtC, IsRV64] in {
+def : CompressPat<(SUBW GPRC:$rs1, GPRC:$rs1, GPRC:$rs2),
+ (C_SUBW GPRC:$rs1, GPRC:$rs2)>;
+def : CompressPat<(ADDW GPRC:$rs1, GPRC:$rs1, GPRC:$rs2),
+ (C_ADDW GPRC:$rs1, GPRC:$rs2)>;
+def : CompressPat<(ADDW GPRC:$rs1, GPRC:$rs2, GPRC:$rs1),
+ (C_ADDW GPRC:$rs1, GPRC:$rs2)>;
+} // Predicates = [HasStdExtC, IsRV64]
+
+let Predicates = [HasStdExtC] in {
+def : CompressPat<(JAL X0, simm12_lsb0:$offset),
+ (C_J simm12_lsb0:$offset)>;
+def : CompressPat<(BEQ GPRC:$rs1, X0, simm9_lsb0:$imm),
+ (C_BEQZ GPRC:$rs1, simm9_lsb0:$imm)>;
+def : CompressPat<(BNE GPRC:$rs1, X0, simm9_lsb0:$imm),
+ (C_BNEZ GPRC:$rs1, simm9_lsb0:$imm)>;
+} // Predicates = [HasStdExtC]
+
+// Quadrant 2
+let Predicates = [HasStdExtC] in {
+def : CompressPat<(SLLI GPRNoX0:$rs1, GPRNoX0:$rs1, uimmlog2xlennonzero:$imm),
+ (C_SLLI GPRNoX0:$rs1, uimmlog2xlennonzero:$imm)>;
+} // Predicates = [HasStdExtC]
+
+let Predicates = [HasStdExtC, HasStdExtD] in {
+def : CompressPat<(FLD FPR64:$rd, SP:$rs1, uimm9_lsb000:$imm),
+ (C_FLDSP FPR64:$rd, SP:$rs1, uimm9_lsb000:$imm)>;
+} // Predicates = [HasStdExtC, HasStdExtD]
+
+let Predicates = [HasStdExtC] in {
+def : CompressPat<(LW GPRNoX0:$rd, SP:$rs1, uimm8_lsb00:$imm),
+ (C_LWSP GPRNoX0:$rd, SP:$rs1, uimm8_lsb00:$imm)>;
+} // Predicates = [HasStdExtC]
+
+let Predicates = [HasStdExtC, HasStdExtF, IsRV32] in {
+def : CompressPat<(FLW FPR32:$rd, SP:$rs1, uimm8_lsb00:$imm),
+ (C_FLWSP FPR32:$rd, SP:$rs1, uimm8_lsb00:$imm)>;
+} // Predicates = [HasStdExtC, HasStdExtF, IsRV32]
+
+let Predicates = [HasStdExtC, IsRV64] in {
+def : CompressPat<(LD GPRNoX0:$rd, SP:$rs1, uimm9_lsb000:$imm),
+ (C_LDSP GPRNoX0:$rd, SP:$rs1, uimm9_lsb000:$imm)>;
+} // Predicates = [HasStdExtC, IsRV64]
+
+let Predicates = [HasStdExtC] in {
+def : CompressPat<(JALR X0, GPRNoX0:$rs1, 0),
+ (C_JR GPRNoX0:$rs1)>;
+def : CompressPat<(ADD GPRNoX0:$rs1, X0, GPRNoX0:$rs2),
+ (C_MV GPRNoX0:$rs1, GPRNoX0:$rs2)>;
+def : CompressPat<(ADD GPRNoX0:$rs1, GPRNoX0:$rs2, X0),
+ (C_MV GPRNoX0:$rs1, GPRNoX0:$rs2)>;
+def : CompressPat<(ADDI GPRNoX0:$rs1, GPRNoX0:$rs2, 0),
+ (C_MV GPRNoX0:$rs1, GPRNoX0:$rs2)>;
+def : CompressPat<(EBREAK), (C_EBREAK)>;
+def : CompressPat<(JALR X1, GPRNoX0:$rs1, 0),
+ (C_JALR GPRNoX0:$rs1)>;
+def : CompressPat<(ADD GPRNoX0:$rs1, GPRNoX0:$rs1, GPRNoX0:$rs2),
+ (C_ADD GPRNoX0:$rs1, GPRNoX0:$rs2)>;
+def : CompressPat<(ADD GPRNoX0:$rs1, GPRNoX0:$rs2, GPRNoX0:$rs1),
+ (C_ADD GPRNoX0:$rs1, GPRNoX0:$rs2)>;
+} // Predicates = [HasStdExtC]
+
+let Predicates = [HasStdExtC, HasStdExtD] in {
+def : CompressPat<(FSD FPR64:$rs2, SP:$rs1, uimm9_lsb000:$imm),
+ (C_FSDSP FPR64:$rs2, SP:$rs1, uimm9_lsb000:$imm)>;
+} // Predicates = [HasStdExtC, HasStdExtD]
+
+let Predicates = [HasStdExtC] in {
+def : CompressPat<(SW GPR:$rs2, SP:$rs1, uimm8_lsb00:$imm),
+ (C_SWSP GPR:$rs2, SP:$rs1, uimm8_lsb00:$imm)>;
+} // Predicates = [HasStdExtC]
+
+let Predicates = [HasStdExtC, HasStdExtF, IsRV32] in {
+def : CompressPat<(FSW FPR32:$rs2, SP:$rs1, uimm8_lsb00:$imm),
+ (C_FSWSP FPR32:$rs2, SP:$rs1, uimm8_lsb00:$imm)>;
+} // Predicates = [HasStdExtC, HasStdExtF, IsRV32]
+
+let Predicates = [HasStdExtC, IsRV64] in {
+def : CompressPat<(SD GPR:$rs2, SP:$rs1, uimm9_lsb000:$imm),
+ (C_SDSP GPR:$rs2, SP:$rs1, uimm9_lsb000:$imm)>;
+} // Predicates = [HasStdExtC, IsRV64]
diff --git a/lib/Target/RISCV/RISCVInstrInfoD.td b/lib/Target/RISCV/RISCVInstrInfoD.td
index 48d91c0054d3..06b834d55ade 100644
--- a/lib/Target/RISCV/RISCVInstrInfoD.td
+++ b/lib/Target/RISCV/RISCVInstrInfoD.td
@@ -13,6 +13,20 @@
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
+// RISC-V specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDT_RISCVBuildPairF64 : SDTypeProfile<1, 2, [SDTCisVT<0, f64>,
+ SDTCisVT<1, i32>,
+ SDTCisSameAs<1, 2>]>;
+def SDT_RISCVSplitF64 : SDTypeProfile<2, 1, [SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>,
+ SDTCisVT<2, f64>]>;
+
+def RISCVBuildPairF64 : SDNode<"RISCVISD::BuildPairF64", SDT_RISCVBuildPairF64>;
+def RISCVSplitF64 : SDNode<"RISCVISD::SplitF64", SDT_RISCVSplitF64>;
+
+//===----------------------------------------------------------------------===//
// Instruction Class Templates
//===----------------------------------------------------------------------===//
@@ -171,4 +185,105 @@ let Predicates = [HasStdExtD] in {
def : InstAlias<"fmv.d $rd, $rs", (FSGNJ_D FPR64:$rd, FPR64:$rs, FPR64:$rs)>;
def : InstAlias<"fabs.d $rd, $rs", (FSGNJX_D FPR64:$rd, FPR64:$rs, FPR64:$rs)>;
def : InstAlias<"fneg.d $rd, $rs", (FSGNJN_D FPR64:$rd, FPR64:$rs, FPR64:$rs)>;
+
+// fgt.d/fge.d are recognised by the GNU assembler but the canonical
+// flt.d/fle.d forms will always be printed. Therefore, set a zero weight.
+def : InstAlias<"fgt.d $rd, $rs, $rt",
+ (FLT_D GPR:$rd, FPR64:$rt, FPR64:$rs), 0>;
+def : InstAlias<"fge.d $rd, $rs, $rt",
+ (FLE_D GPR:$rd, FPR64:$rt, FPR64:$rs), 0>;
+} // Predicates = [HasStdExtD]
+
+//===----------------------------------------------------------------------===//
+// Pseudo-instructions and codegen patterns
+//===----------------------------------------------------------------------===//
+
+class PatFpr64Fpr64<SDPatternOperator OpNode, RVInstR Inst>
+ : Pat<(OpNode FPR64:$rs1, FPR64:$rs2), (Inst $rs1, $rs2)>;
+
+class PatFpr64Fpr64DynFrm<SDPatternOperator OpNode, RVInstRFrm Inst>
+ : Pat<(OpNode FPR64:$rs1, FPR64:$rs2), (Inst $rs1, $rs2, 0b111)>;
+
+let Predicates = [HasStdExtD] in {
+
+/// Float conversion operations
+
+// f64 -> f32, f32 -> f64
+def : Pat<(fpround FPR64:$rs1), (FCVT_S_D FPR64:$rs1, 0b111)>;
+def : Pat<(fpextend FPR32:$rs1), (FCVT_D_S FPR32:$rs1)>;
+
+// FP->[u]int. Round-to-zero must be used
+def : Pat<(fp_to_sint FPR64:$rs1), (FCVT_W_D FPR64:$rs1, 0b001)>;
+def : Pat<(fp_to_uint FPR64:$rs1), (FCVT_WU_D FPR64:$rs1, 0b001)>;
+
+// [u]int->fp
+def : Pat<(sint_to_fp GPR:$rs1), (FCVT_D_W GPR:$rs1)>;
+def : Pat<(uint_to_fp GPR:$rs1), (FCVT_D_WU GPR:$rs1)>;
+
+/// Float arithmetic operations
+
+def : PatFpr64Fpr64DynFrm<fadd, FADD_D>;
+def : PatFpr64Fpr64DynFrm<fsub, FSUB_D>;
+def : PatFpr64Fpr64DynFrm<fmul, FMUL_D>;
+def : PatFpr64Fpr64DynFrm<fdiv, FDIV_D>;
+
+def : Pat<(fsqrt FPR64:$rs1), (FSQRT_D FPR64:$rs1, 0b111)>;
+
+def : Pat<(fneg FPR64:$rs1), (FSGNJN_D $rs1, $rs1)>;
+def : Pat<(fabs FPR64:$rs1), (FSGNJX_D $rs1, $rs1)>;
+
+def : PatFpr64Fpr64<fcopysign, FSGNJ_D>;
+def : Pat<(fcopysign FPR64:$rs1, (fneg FPR64:$rs2)), (FSGNJN_D $rs1, $rs2)>;
+
+// The RISC-V 2.2 user-level ISA spec defines fmin and fmax as returning the
+// canonical NaN when given a signaling NaN. This doesn't match the LLVM
+// behaviour (see https://bugs.llvm.org/show_bug.cgi?id=27363). However, the
+// draft 2.3 ISA spec changes the definition of fmin and fmax in a way that
+// matches LLVM's fminnum and fmaxnum
+// <https://github.com/riscv/riscv-isa-manual/commit/cd20cee7efd9bac7c5aa127ec3b451749d2b3cce>.
+def : PatFpr64Fpr64<fminnum, FMIN_D>;
+def : PatFpr64Fpr64<fmaxnum, FMAX_D>;
+
+/// Setcc
+
+def : PatFpr64Fpr64<seteq, FEQ_D>;
+def : PatFpr64Fpr64<setoeq, FEQ_D>;
+def : PatFpr64Fpr64<setlt, FLT_D>;
+def : PatFpr64Fpr64<setolt, FLT_D>;
+def : PatFpr64Fpr64<setle, FLE_D>;
+def : PatFpr64Fpr64<setole, FLE_D>;
+
+// Define pattern expansions for setcc operations which aren't directly
+// handled by a RISC-V instruction and aren't expanded in the SelectionDAG
+// Legalizer.
+
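+// Descriptive note on the expansion: FEQ.D rd, rs, rs writes 1 when rs is not
+// a NaN and 0 when it is, so the AND of the two self-comparisons is 1 only
+// when both operands are ordered; SLTIU ..., 1 then tests for zero, producing
+// 1 exactly when at least one operand is NaN (the unordered case).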
+def : Pat<(setuo FPR64:$rs1, FPR64:$rs2),
+ (SLTIU (AND (FEQ_D FPR64:$rs1, FPR64:$rs1),
+ (FEQ_D FPR64:$rs2, FPR64:$rs2)),
+ 1)>;
+
+def Select_FPR64_Using_CC_GPR : SelectCC_rrirr<FPR64, GPR>;
+
+/// Loads
+
+defm : LdPat<load, FLD>;
+
+/// Stores
+
+defm : StPat<store, FSD, FPR64>;
+
+/// Pseudo-instructions needed for the soft-float ABI with RV32D
+
+// Moves two GPRs to an FPR.
+let usesCustomInserter = 1 in
+def BuildPairF64Pseudo
+ : Pseudo<(outs FPR64:$dst), (ins GPR:$src1, GPR:$src2),
+ [(set FPR64:$dst, (RISCVBuildPairF64 GPR:$src1, GPR:$src2))]>;
+
+// Moves an FPR to two GPRs.
+let usesCustomInserter = 1 in
+def SplitF64Pseudo
+ : Pseudo<(outs GPR:$dst1, GPR:$dst2), (ins FPR64:$src),
+ [(set GPR:$dst1, GPR:$dst2, (RISCVSplitF64 FPR64:$src))]>;
+
} // Predicates = [HasStdExtD]
diff --git a/lib/Target/RISCV/RISCVInstrInfoF.td b/lib/Target/RISCV/RISCVInstrInfoF.td
index 07722d2cbf34..6d7c59becf24 100644
--- a/lib/Target/RISCV/RISCVInstrInfoF.td
+++ b/lib/Target/RISCV/RISCVInstrInfoF.td
@@ -200,6 +200,13 @@ def : InstAlias<"fmv.s $rd, $rs", (FSGNJ_S FPR32:$rd, FPR32:$rs, FPR32:$rs)>;
def : InstAlias<"fabs.s $rd, $rs", (FSGNJX_S FPR32:$rd, FPR32:$rs, FPR32:$rs)>;
def : InstAlias<"fneg.s $rd, $rs", (FSGNJN_S FPR32:$rd, FPR32:$rs, FPR32:$rs)>;
+// fgt.s/fge.s are recognised by the GNU assembler but the canonical
+// flt.s/fle.s forms will always be printed. Therefore, set a zero weight.
+def : InstAlias<"fgt.s $rd, $rs, $rt",
+ (FLT_S GPR:$rd, FPR32:$rt, FPR32:$rs), 0>;
+def : InstAlias<"fge.s $rd, $rs, $rt",
+ (FLE_S GPR:$rd, FPR32:$rt, FPR32:$rs), 0>;
+
// The following csr instructions actually alias instructions from the base ISA.
// However, it only makes sense to support them when the F extension is enabled.
// CSR Addresses: 0x003 == fcsr, 0x002 == frm, 0x001 == fflags
@@ -219,4 +226,90 @@ def : InstAlias<"fsflags $rd, $rs", (CSRRW GPR:$rd, 0x001, GPR:$rs)>;
def : InstAlias<"fsflags $rs", (CSRRW X0, 0x001, GPR:$rs), 2>;
def : InstAlias<"fsflagsi $rd, $imm", (CSRRWI GPR:$rd, 0x001, uimm5:$imm)>;
def : InstAlias<"fsflagsi $imm", (CSRRWI X0, 0x001, uimm5:$imm), 2>;
+
+// fmv.w.x and fmv.x.w were previously known as fmv.s.x and fmv.x.s. Both
+// spellings should be supported by standard tools.
+def : MnemonicAlias<"fmv.s.x", "fmv.w.x">;
+def : MnemonicAlias<"fmv.x.s", "fmv.x.w">;
+} // Predicates = [HasStdExtF]
+
+//===----------------------------------------------------------------------===//
+// Pseudo-instructions and codegen patterns
+//===----------------------------------------------------------------------===//
+
+/// Generic pattern classes
+class PatFpr32Fpr32<SDPatternOperator OpNode, RVInstR Inst>
+ : Pat<(OpNode FPR32:$rs1, FPR32:$rs2), (Inst $rs1, $rs2)>;
+
+class PatFpr32Fpr32DynFrm<SDPatternOperator OpNode, RVInstRFrm Inst>
+ : Pat<(OpNode FPR32:$rs1, FPR32:$rs2), (Inst $rs1, $rs2, 0b111)>;
+
+let Predicates = [HasStdExtF] in {
+
+/// Float conversion operations
+
+// Moves (no conversion)
+def : Pat<(bitconvert GPR:$rs1), (FMV_W_X GPR:$rs1)>;
+def : Pat<(bitconvert FPR32:$rs1), (FMV_X_W FPR32:$rs1)>;
+
+// FP->[u]int. Round-to-zero must be used
+def : Pat<(fp_to_sint FPR32:$rs1), (FCVT_W_S $rs1, 0b001)>;
+def : Pat<(fp_to_uint FPR32:$rs1), (FCVT_WU_S $rs1, 0b001)>;
+
+// [u]int->fp. Match GCC and default to using dynamic rounding mode.
+def : Pat<(sint_to_fp GPR:$rs1), (FCVT_S_W $rs1, 0b111)>;
+def : Pat<(uint_to_fp GPR:$rs1), (FCVT_S_WU $rs1, 0b111)>;
+
+/// Float arithmetic operations
+
+def : PatFpr32Fpr32DynFrm<fadd, FADD_S>;
+def : PatFpr32Fpr32DynFrm<fsub, FSUB_S>;
+def : PatFpr32Fpr32DynFrm<fmul, FMUL_S>;
+def : PatFpr32Fpr32DynFrm<fdiv, FDIV_S>;
+
+def : Pat<(fsqrt FPR32:$rs1), (FSQRT_S FPR32:$rs1, 0b111)>;
+
+def : Pat<(fneg FPR32:$rs1), (FSGNJN_S $rs1, $rs1)>;
+def : Pat<(fabs FPR32:$rs1), (FSGNJX_S $rs1, $rs1)>;
+
+def : PatFpr32Fpr32<fcopysign, FSGNJ_S>;
+def : Pat<(fcopysign FPR32:$rs1, (fneg FPR32:$rs2)), (FSGNJN_S $rs1, $rs2)>;
+
+// The RISC-V 2.2 user-level ISA spec defines fmin and fmax as returning the
+// canonical NaN when given a signaling NaN. This doesn't match the LLVM
+// behaviour (see https://bugs.llvm.org/show_bug.cgi?id=27363). However, the
+// draft 2.3 ISA spec changes the definition of fmin and fmax in a way that
+// matches LLVM's fminnum and fmaxnum
+// <https://github.com/riscv/riscv-isa-manual/commit/cd20cee7efd9bac7c5aa127ec3b451749d2b3cce>.
+def : PatFpr32Fpr32<fminnum, FMIN_S>;
+def : PatFpr32Fpr32<fmaxnum, FMAX_S>;
+
+/// Setcc
+
+def : PatFpr32Fpr32<seteq, FEQ_S>;
+def : PatFpr32Fpr32<setoeq, FEQ_S>;
+def : PatFpr32Fpr32<setlt, FLT_S>;
+def : PatFpr32Fpr32<setolt, FLT_S>;
+def : PatFpr32Fpr32<setle, FLE_S>;
+def : PatFpr32Fpr32<setole, FLE_S>;
+
+// Define pattern expansions for setcc operations which aren't directly
+// handled by a RISC-V instruction and aren't expanded in the SelectionDAG
+// Legalizer.
+
+def : Pat<(setuo FPR32:$rs1, FPR32:$rs2),
+ (SLTIU (AND (FEQ_S FPR32:$rs1, FPR32:$rs1),
+ (FEQ_S FPR32:$rs2, FPR32:$rs2)),
+ 1)>;
+
+def Select_FPR32_Using_CC_GPR : SelectCC_rrirr<FPR32, GPR>;
+
+/// Loads
+
+defm : LdPat<load, FLW>;
+
+/// Stores
+
+defm : StPat<store, FSW, FPR32>;
+
} // Predicates = [HasStdExtF]
diff --git a/lib/Target/RISCV/RISCVInstrInfoM.td b/lib/Target/RISCV/RISCVInstrInfoM.td
index fec9c1f93997..2dd10ada4003 100644
--- a/lib/Target/RISCV/RISCVInstrInfoM.td
+++ b/lib/Target/RISCV/RISCVInstrInfoM.td
@@ -34,3 +34,18 @@ def DIVUW : ALUW_rr<0b0000001, 0b101, "divuw">;
def REMW : ALUW_rr<0b0000001, 0b110, "remw">;
def REMUW : ALUW_rr<0b0000001, 0b111, "remuw">;
} // Predicates = [HasStdExtM, IsRV64]
+
+//===----------------------------------------------------------------------===//
+// Pseudo-instructions and codegen patterns
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasStdExtM] in {
+def : PatGprGpr<mul, MUL>;
+def : PatGprGpr<mulhs, MULH>;
+def : PatGprGpr<mulhu, MULHU>;
+// No ISDOpcode for mulhsu
+def : PatGprGpr<sdiv, DIV>;
+def : PatGprGpr<udiv, DIVU>;
+def : PatGprGpr<srem, REM>;
+def : PatGprGpr<urem, REMU>;
+} // Predicates = [HasStdExtM]
diff --git a/lib/Target/RISCV/RISCVMCInstLower.cpp b/lib/Target/RISCV/RISCVMCInstLower.cpp
index d8ae11f2bd90..e0100b1679be 100644
--- a/lib/Target/RISCV/RISCVMCInstLower.cpp
+++ b/lib/Target/RISCV/RISCVMCInstLower.cpp
@@ -48,11 +48,12 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym,
const MCExpr *ME =
MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Ctx);
- if (!MO.isJTI() && MO.getOffset())
+ if (!MO.isJTI() && !MO.isMBB() && MO.getOffset())
ME = MCBinaryExpr::createAdd(
ME, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
- ME = RISCVMCExpr::create(ME, Kind, Ctx);
+ if (Kind != RISCVMCExpr::VK_RISCV_None)
+ ME = RISCVMCExpr::create(ME, Kind, Ctx);
return MCOperand::createExpr(ME);
}
@@ -75,8 +76,7 @@ bool llvm::LowerRISCVMachineOperandToMCOperand(const MachineOperand &MO,
MCOp = MCOperand::createImm(MO.getImm());
break;
case MachineOperand::MO_MachineBasicBlock:
- MCOp = MCOperand::createExpr(
- MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), AP.OutContext));
+ MCOp = lowerSymbolOperand(MO, MO.getMBB()->getSymbol(), AP);
break;
case MachineOperand::MO_GlobalAddress:
MCOp = lowerSymbolOperand(MO, AP.getSymbol(MO.getGlobal()), AP);
@@ -89,6 +89,9 @@ bool llvm::LowerRISCVMachineOperandToMCOperand(const MachineOperand &MO,
MCOp = lowerSymbolOperand(
MO, AP.GetExternalSymbolSymbol(MO.getSymbolName()), AP);
break;
+ case MachineOperand::MO_ConstantPoolIndex:
+ MCOp = lowerSymbolOperand(MO, AP.GetCPISymbol(MO.getIndex()), AP);
+ break;
}
return true;
}
diff --git a/lib/Target/RISCV/RISCVMachineFunctionInfo.h b/lib/Target/RISCV/RISCVMachineFunctionInfo.h
new file mode 100644
index 000000000000..2fea3a1bdd2f
--- /dev/null
+++ b/lib/Target/RISCV/RISCVMachineFunctionInfo.h
@@ -0,0 +1,55 @@
+//=- RISCVMachineFunctionInfo.h - RISCV machine function info -----*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares RISCV-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_RISCV_RISCVMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_RISCV_RISCVMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// RISCVMachineFunctionInfo - This class is derived from MachineFunctionInfo
+/// and contains private RISCV-specific information for each MachineFunction.
+class RISCVMachineFunctionInfo : public MachineFunctionInfo {
+private:
+ MachineFunction &MF;
+ /// FrameIndex for start of varargs area
+ int VarArgsFrameIndex = 0;
+ /// Size of the save area used for varargs
+ int VarArgsSaveSize = 0;
+ /// FrameIndex used for transferring values between 64-bit FPRs and a pair
+ /// of 32-bit GPRs via the stack.
+ int MoveF64FrameIndex = -1;
+
+public:
+ // RISCVMachineFunctionInfo() = default;
+
+ RISCVMachineFunctionInfo(MachineFunction &MF) : MF(MF) {}
+
+ int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+ void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
+
+ unsigned getVarArgsSaveSize() const { return VarArgsSaveSize; }
+ void setVarArgsSaveSize(int Size) { VarArgsSaveSize = Size; }
+
+ int getMoveF64FrameIndex() {
+ if (MoveF64FrameIndex == -1)
+ MoveF64FrameIndex = MF.getFrameInfo().CreateStackObject(8, 8, false);
+ return MoveF64FrameIndex;
+ }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_RISCV_RISCVMACHINEFUNCTIONINFO_H
diff --git a/lib/Target/RISCV/RISCVMergeBaseOffset.cpp b/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
new file mode 100644
index 000000000000..b8fa8a97d41a
--- /dev/null
+++ b/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
@@ -0,0 +1,286 @@
+//===----- RISCVMergeBaseOffset.cpp - Optimise address calculations ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Merge the offset of address calculation into the offset field
+// of instructions in a global address lowering sequence. This pass transforms:
+// lui vreg1, %hi(s)
+// addi vreg2, vreg1, %lo(s)
+// addi vreg3, vreg2, Offset
+//
+// Into:
+// lui vreg1, %hi(s+Offset)
+// addi vreg2, vreg1, %lo(s+Offset)
+//
+// The transformation is carried out under certain conditions:
+// 1) The offset field in the base of the global address lowering sequence is zero.
+// 2) The lowered global address has only one use.
+//
+// The offset can take several different forms; this pass handles all of them.
+//===----------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "RISCVTargetMachine.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+#include <set>
+using namespace llvm;
+
+#define DEBUG_TYPE "riscv-merge-base-offset"
+#define RISCV_MERGE_BASE_OFFSET_NAME "RISCV Merge Base Offset"
+namespace {
+
+struct RISCVMergeBaseOffsetOpt : public MachineFunctionPass {
+ static char ID;
+ const MachineFunction *MF;
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+ bool detectLuiAddiGlobal(MachineInstr &LUI, MachineInstr *&ADDI);
+
+ bool detectAndFoldOffset(MachineInstr &HiLUI, MachineInstr &LoADDI);
+ void foldOffset(MachineInstr &HiLUI, MachineInstr &LoADDI, MachineInstr &Tail,
+ int64_t Offset);
+ bool matchLargeOffset(MachineInstr &TailAdd, unsigned GAReg, int64_t &Offset);
+ RISCVMergeBaseOffsetOpt() : MachineFunctionPass(ID) {}
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::IsSSA);
+ }
+
+ StringRef getPassName() const override {
+ return RISCV_MERGE_BASE_OFFSET_NAME;
+ }
+
+private:
+ MachineRegisterInfo *MRI;
+ std::set<MachineInstr *> DeadInstrs;
+};
+} // end anonymous namespace
+
+char RISCVMergeBaseOffsetOpt::ID = 0;
+INITIALIZE_PASS(RISCVMergeBaseOffsetOpt, "riscv-merge-base-offset",
+ RISCV_MERGE_BASE_OFFSET_NAME, false, false)
+
+// Detect the pattern:
+// lui vreg1, %hi(s)
+// addi vreg2, vreg1, %lo(s)
+//
+// Pattern only accepted if:
+// 1) ADDI has only one use.
+// 2) LUI has only one use; which is the ADDI.
+// 3) Both ADDI and LUI have a GlobalAddress-type operand, which indicates that these
+// are generated from global address lowering.
+// 4) Offset value in the Global Address is 0.
+bool RISCVMergeBaseOffsetOpt::detectLuiAddiGlobal(MachineInstr &HiLUI,
+ MachineInstr *&LoADDI) {
+ if (HiLUI.getOpcode() != RISCV::LUI ||
+ HiLUI.getOperand(1).getTargetFlags() != RISCVII::MO_HI ||
+ HiLUI.getOperand(1).getType() != MachineOperand::MO_GlobalAddress ||
+ HiLUI.getOperand(1).getOffset() != 0 ||
+ !MRI->hasOneUse(HiLUI.getOperand(0).getReg()))
+ return false;
+ unsigned HiLuiDestReg = HiLUI.getOperand(0).getReg();
+ LoADDI = MRI->use_begin(HiLuiDestReg)->getParent();
+ if (LoADDI->getOpcode() != RISCV::ADDI ||
+ LoADDI->getOperand(2).getTargetFlags() != RISCVII::MO_LO ||
+ LoADDI->getOperand(2).getType() != MachineOperand::MO_GlobalAddress ||
+ LoADDI->getOperand(2).getOffset() != 0 ||
+ !MRI->hasOneUse(LoADDI->getOperand(0).getReg()))
+ return false;
+ return true;
+}
+
+// Update the offset in HiLUI and LoADDI instructions.
+// Delete the tail instruction and update all the uses to use the
+// output from LoADDI.
+void RISCVMergeBaseOffsetOpt::foldOffset(MachineInstr &HiLUI,
+ MachineInstr &LoADDI,
+ MachineInstr &Tail, int64_t Offset) {
+ // Put the offset back into HiLUI and LoADDI.
+ HiLUI.getOperand(1).setOffset(Offset);
+ LoADDI.getOperand(2).setOffset(Offset);
+ // Delete the tail instruction.
+ DeadInstrs.insert(&Tail);
+ MRI->replaceRegWith(Tail.getOperand(0).getReg(),
+ LoADDI.getOperand(0).getReg());
+ LLVM_DEBUG(dbgs() << " Merged offset " << Offset << " into base.\n"
+ << " " << HiLUI << " " << LoADDI;);
+}
+
+// Detect patterns for large offsets that are passed into an ADD instruction.
+//
+// Base address lowering is of the form:
+// HiLUI: lui vreg1, %hi(s)
+// LoADDI: addi vreg2, vreg1, %lo(s)
+// / \
+// / \
+// / \
+// / The large offset can be of two forms: \
+// 1) Offset that has non zero bits in lower 2) Offset that has non zero
+// 12 bits and upper 20 bits bits in upper 20 bits only
+// OffseLUI: lui vreg3, 4
+// OffsetTail: addi voff, vreg3, 188 OffsetTail: lui voff, 128
+// \ /
+// \ /
+// \ /
+// \ /
+// TailAdd: add vreg4, vreg2, voff
+bool RISCVMergeBaseOffsetOpt::matchLargeOffset(MachineInstr &TailAdd,
+ unsigned GAReg,
+ int64_t &Offset) {
+ assert((TailAdd.getOpcode() == RISCV::ADD) && "Expected ADD instruction!");
+ unsigned Rs = TailAdd.getOperand(1).getReg();
+ unsigned Rt = TailAdd.getOperand(2).getReg();
+ unsigned Reg = Rs == GAReg ? Rt : Rs;
+
+ // Can't fold if the register has more than one use.
+ if (!MRI->hasOneUse(Reg))
+ return false;
+ // This can point to an ADDI or a LUI:
+ MachineInstr &OffsetTail = *MRI->getVRegDef(Reg);
+ if (OffsetTail.getOpcode() == RISCV::ADDI) {
+ // The offset value has non zero bits in both %hi and %lo parts.
+ // Detect an ADDI that feeds from a LUI instruction.
+ MachineOperand &AddiImmOp = OffsetTail.getOperand(2);
+ if (AddiImmOp.getTargetFlags() != RISCVII::MO_None)
+ return false;
+ int64_t OffLo = AddiImmOp.getImm();
+ MachineInstr &OffsetLui =
+ *MRI->getVRegDef(OffsetTail.getOperand(1).getReg());
+ MachineOperand &LuiImmOp = OffsetLui.getOperand(1);
+ if (OffsetLui.getOpcode() != RISCV::LUI ||
+ LuiImmOp.getTargetFlags() != RISCVII::MO_None ||
+ !MRI->hasOneUse(OffsetLui.getOperand(0).getReg()))
+ return false;
+ int64_t OffHi = OffsetLui.getOperand(1).getImm();
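+ // Worked example matching the diagram above: lui vreg3, 4 followed by
+ // addi voff, vreg3, 188 materialises (4 << 12) + 188 = 16572, which is the
+ // value recovered here; because ADDI's immediate is sign-extended, the sum
+ // is also correct for negative low parts.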
+ Offset = (OffHi << 12) + OffLo;
+ LLVM_DEBUG(dbgs() << " Offset Instrs: " << OffsetTail
+ << " " << OffsetLui);
+ DeadInstrs.insert(&OffsetTail);
+ DeadInstrs.insert(&OffsetLui);
+ return true;
+ } else if (OffsetTail.getOpcode() == RISCV::LUI) {
+ // The offset value has all zero bits in the lower 12 bits. Only LUI
+ // exists.
+ LLVM_DEBUG(dbgs() << " Offset Instr: " << OffsetTail);
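+ // For the diagram's second form, lui voff, 128 gives Offset = 128 << 12 = 524288.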
+ Offset = OffsetTail.getOperand(1).getImm() << 12;
+ DeadInstrs.insert(&OffsetTail);
+ return true;
+ }
+ return false;
+}
+
+bool RISCVMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &HiLUI,
+ MachineInstr &LoADDI) {
+ unsigned DestReg = LoADDI.getOperand(0).getReg();
+ assert(MRI->hasOneUse(DestReg) && "expected one use for LoADDI");
+ // LoADDI has only one use.
+ MachineInstr &Tail = *MRI->use_begin(DestReg)->getParent();
+ switch (Tail.getOpcode()) {
+ default:
+ LLVM_DEBUG(dbgs() << "Don't know how to get offset from this instr:"
+ << Tail);
+ return false;
+ case RISCV::ADDI: {
+ // Offset is simply an immediate operand.
+ int64_t Offset = Tail.getOperand(2).getImm();
+ LLVM_DEBUG(dbgs() << " Offset Instr: " << Tail);
+ foldOffset(HiLUI, LoADDI, Tail, Offset);
+ return true;
+ } break;
+ case RISCV::ADD: {
+ // The offset is too large to fit in the immediate field of ADDI.
+ // This can be in two forms:
+ // 1) LUI hi_Offset followed by:
+ // ADDI lo_offset
+ // This happens in case the offset has non zero bits in
+ // both hi 20 and lo 12 bits.
+ // 2) LUI (offset20)
+ // This happens in case the lower 12 bits of the offset are zeros.
+ int64_t Offset;
+ if (!matchLargeOffset(Tail, DestReg, Offset))
+ return false;
+ foldOffset(HiLUI, LoADDI, Tail, Offset);
+ return true;
+ } break;
+ case RISCV::LB:
+ case RISCV::LH:
+ case RISCV::LW:
+ case RISCV::LBU:
+ case RISCV::LHU:
+ case RISCV::LWU:
+ case RISCV::LD:
+ case RISCV::FLW:
+ case RISCV::FLD:
+ case RISCV::SB:
+ case RISCV::SH:
+ case RISCV::SW:
+ case RISCV::SD:
+ case RISCV::FSW:
+ case RISCV::FSD: {
+ // Transforms the sequence: Into:
+ // HiLUI: lui vreg1, %hi(foo) ---> lui vreg1, %hi(foo+8)
+ // LoADDI: addi vreg2, vreg1, %lo(foo) ---> lw vreg3, %lo(foo+8)(vreg1)
+ // Tail: lw vreg3, 8(vreg2)
+ if (Tail.getOperand(1).isFI())
+ return false;
+ // The register defined by LoADDI should be used as the base of the
+ // load/store instruction. Otherwise, no folding is possible.
+ unsigned BaseAddrReg = Tail.getOperand(1).getReg();
+ if (DestReg != BaseAddrReg)
+ return false;
+ MachineOperand &TailImmOp = Tail.getOperand(2);
+ int64_t Offset = TailImmOp.getImm();
+ // Update the offsets in global address lowering.
+ HiLUI.getOperand(1).setOffset(Offset);
+ // Update the immediate in the Tail instruction to add the offset.
+ Tail.RemoveOperand(2);
+ MachineOperand &ImmOp = LoADDI.getOperand(2);
+ ImmOp.setOffset(Offset);
+ Tail.addOperand(ImmOp);
+ // Update the base reg in the Tail instruction to feed from LUI.
+ // Output of HiLUI is only used in LoADDI, no need to use
+ // MRI->replaceRegWith().
+ Tail.getOperand(1).setReg(HiLUI.getOperand(0).getReg());
+ DeadInstrs.insert(&LoADDI);
+ return true;
+ } break;
+ }
+ return false;
+}
+
+bool RISCVMergeBaseOffsetOpt::runOnMachineFunction(MachineFunction &Fn) {
+ if (skipFunction(Fn.getFunction()))
+ return false;
+
+ DeadInstrs.clear();
+ MRI = &Fn.getRegInfo();
+ for (MachineBasicBlock &MBB : Fn) {
+ LLVM_DEBUG(dbgs() << "MBB: " << MBB.getName() << "\n");
+ for (MachineInstr &HiLUI : MBB) {
+ MachineInstr *LoADDI = nullptr;
+ if (!detectLuiAddiGlobal(HiLUI, LoADDI))
+ continue;
+ LLVM_DEBUG(dbgs() << " Found lowered global address with one use: "
+ << *LoADDI->getOperand(2).getGlobal() << "\n");
+ // If the use count is only one, merge the offset
+ detectAndFoldOffset(HiLUI, *LoADDI);
+ }
+ }
+ // Delete dead instructions.
+ for (auto *MI : DeadInstrs)
+ MI->eraseFromParent();
+ return true;
+}
+
+/// Returns an instance of the Merge Base Offset Optimization pass.
+FunctionPass *llvm::createRISCVMergeBaseOffsetOptPass() {
+ return new RISCVMergeBaseOffsetOpt();
+}
diff --git a/lib/Target/RISCV/RISCVRegisterInfo.cpp b/lib/Target/RISCV/RISCVRegisterInfo.cpp
index 5776a92cab91..3ed1dec434ce 100644
--- a/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -33,6 +33,13 @@ RISCVRegisterInfo::RISCVRegisterInfo(unsigned HwMode)
const MCPhysReg *
RISCVRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ if (MF->getFunction().hasFnAttribute("interrupt")) {
+ if (MF->getSubtarget<RISCVSubtarget>().hasStdExtD())
+ return CSR_XLEN_F64_Interrupt_SaveList;
+ if (MF->getSubtarget<RISCVSubtarget>().hasStdExtF())
+ return CSR_XLEN_F32_Interrupt_SaveList;
+ return CSR_Interrupt_SaveList;
+ }
return CSR_SaveList;
}
@@ -50,6 +57,10 @@ BitVector RISCVRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
return Reserved;
}
+bool RISCVRegisterInfo::isConstantPhysReg(unsigned PhysReg) const {
+ return PhysReg == RISCV::X0;
+}
+
const uint32_t *RISCVRegisterInfo::getNoPreservedMask() const {
return CSR_NoRegs_RegMask;
}
@@ -61,6 +72,8 @@ void RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MachineInstr &MI = *II;
MachineFunction &MF = *MI.getParent()->getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const RISCVInstrInfo *TII = MF.getSubtarget<RISCVSubtarget>().getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
@@ -69,25 +82,47 @@ void RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
getFrameLowering(MF)->getFrameIndexReference(MF, FrameIndex, FrameReg) +
MI.getOperand(FIOperandNum + 1).getImm();
- assert(MF.getSubtarget().getFrameLowering()->hasFP(MF) &&
- "eliminateFrameIndex currently requires hasFP");
+ if (!isInt<32>(Offset)) {
+ report_fatal_error(
+ "Frame offsets outside of the signed 32-bit range not supported");
+ }
+
+ MachineBasicBlock &MBB = *MI.getParent();
+ bool FrameRegIsKill = false;
- // Offsets must be directly encoded in a 12-bit immediate field
if (!isInt<12>(Offset)) {
- report_fatal_error(
- "Frame offsets outside of the signed 12-bit range not supported");
+ assert(isInt<32>(Offset) && "Int32 expected");
+ // The offset won't fit in an immediate, so use a scratch register instead
+ // Modify Offset and FrameReg appropriately
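+ // A sketch of the rewrite, assuming movImm32 materialises the 32-bit constant
+ // in ScratchReg: ScratchReg = Offset; ScratchReg += FrameReg; the frame-index
+ // operand is then rewritten to use ScratchReg with a zero offset.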
+ unsigned ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass);
+ TII->movImm32(MBB, II, DL, ScratchReg, Offset);
+ BuildMI(MBB, II, DL, TII->get(RISCV::ADD), ScratchReg)
+ .addReg(FrameReg)
+ .addReg(ScratchReg, RegState::Kill);
+ Offset = 0;
+ FrameReg = ScratchReg;
+ FrameRegIsKill = true;
}
- MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false);
+ MI.getOperand(FIOperandNum)
+ .ChangeToRegister(FrameReg, false, false, FrameRegIsKill);
MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
}
unsigned RISCVRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
- return RISCV::X8;
+ const TargetFrameLowering *TFI = getFrameLowering(MF);
+ return TFI->hasFP(MF) ? RISCV::X8 : RISCV::X2;
}
const uint32_t *
-RISCVRegisterInfo::getCallPreservedMask(const MachineFunction & /*MF*/,
+RISCVRegisterInfo::getCallPreservedMask(const MachineFunction & MF,
CallingConv::ID /*CC*/) const {
+ if (MF.getFunction().hasFnAttribute("interrupt")) {
+ if (MF.getSubtarget<RISCVSubtarget>().hasStdExtD())
+ return CSR_XLEN_F64_Interrupt_RegMask;
+ if (MF.getSubtarget<RISCVSubtarget>().hasStdExtF())
+ return CSR_XLEN_F32_Interrupt_RegMask;
+ return CSR_Interrupt_RegMask;
+ }
return CSR_RegMask;
}
diff --git a/lib/Target/RISCV/RISCVRegisterInfo.h b/lib/Target/RISCV/RISCVRegisterInfo.h
index 0b2bc3776fc6..cbbb70079dd1 100644
--- a/lib/Target/RISCV/RISCVRegisterInfo.h
+++ b/lib/Target/RISCV/RISCVRegisterInfo.h
@@ -32,6 +32,8 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
BitVector getReservedRegs(const MachineFunction &MF) const override;
+ bool isConstantPhysReg(unsigned PhysReg) const override;
+
const uint32_t *getNoPreservedMask() const override;
void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
@@ -39,6 +41,18 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
RegScavenger *RS = nullptr) const override;
unsigned getFrameRegister(const MachineFunction &MF) const override;
+
+ bool requiresRegisterScavenging(const MachineFunction &MF) const override {
+ return true;
+ }
+
+ bool requiresFrameIndexScavenging(const MachineFunction &MF) const override {
+ return true;
+ }
+
+ bool trackLivenessAfterRegAlloc(const MachineFunction &) const override {
+ return true;
+ }
};
}
diff --git a/lib/Target/RISCV/RISCVRegisterInfo.td b/lib/Target/RISCV/RISCVRegisterInfo.td
index 21be2e332e59..4be8ff9200e9 100644
--- a/lib/Target/RISCV/RISCVRegisterInfo.td
+++ b/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -38,8 +38,16 @@ def ABIRegAltName : RegAltNameIndex;
} // Namespace = "RISCV"
// Integer registers
+// CostPerUse is set higher for registers that may not be compressible as they
+// are not part of GPRC, the most restrictive register class used by the
+// compressed instruction set. This will influence the greedy register
+// allocator to reduce the use of registers that can't be encoded in 16-bit
+// instructions. This affects register allocation even when the compressed
+// instruction set isn't targeted, but we see no major negative codegen impact.
+
let RegAltNameIndices = [ABIRegAltName] in {
def X0 : RISCVReg<0, "x0", ["zero"]>, DwarfRegNum<[0]>;
+ let CostPerUse = 1 in {
def X1 : RISCVReg<1, "x1", ["ra"]>, DwarfRegNum<[1]>;
def X2 : RISCVReg<2, "x2", ["sp"]>, DwarfRegNum<[2]>;
def X3 : RISCVReg<3, "x3", ["gp"]>, DwarfRegNum<[3]>;
@@ -47,6 +55,7 @@ let RegAltNameIndices = [ABIRegAltName] in {
def X5 : RISCVReg<5, "x5", ["t0"]>, DwarfRegNum<[5]>;
def X6 : RISCVReg<6, "x6", ["t1"]>, DwarfRegNum<[6]>;
def X7 : RISCVReg<7, "x7", ["t2"]>, DwarfRegNum<[7]>;
+ }
def X8 : RISCVReg<8, "x8", ["s0"]>, DwarfRegNum<[8]>;
def X9 : RISCVReg<9, "x9", ["s1"]>, DwarfRegNum<[9]>;
def X10 : RISCVReg<10,"x10", ["a0"]>, DwarfRegNum<[10]>;
@@ -55,6 +64,7 @@ let RegAltNameIndices = [ABIRegAltName] in {
def X13 : RISCVReg<13,"x13", ["a3"]>, DwarfRegNum<[13]>;
def X14 : RISCVReg<14,"x14", ["a4"]>, DwarfRegNum<[14]>;
def X15 : RISCVReg<15,"x15", ["a5"]>, DwarfRegNum<[15]>;
+ let CostPerUse = 1 in {
def X16 : RISCVReg<16,"x16", ["a6"]>, DwarfRegNum<[16]>;
def X17 : RISCVReg<17,"x17", ["a7"]>, DwarfRegNum<[17]>;
def X18 : RISCVReg<18,"x18", ["s2"]>, DwarfRegNum<[18]>;
@@ -71,6 +81,7 @@ let RegAltNameIndices = [ABIRegAltName] in {
def X29 : RISCVReg<29,"x29", ["t4"]>, DwarfRegNum<[29]>;
def X30 : RISCVReg<30,"x30", ["t5"]>, DwarfRegNum<[30]>;
def X31 : RISCVReg<31,"x31", ["t6"]>, DwarfRegNum<[31]>;
+ }
}
def XLenVT : ValueTypeByHwMode<[RV32, RV64, DefaultMode],
@@ -128,6 +139,19 @@ def GPRC : RegisterClass<"RISCV", [XLenVT], 32, (add
[RegInfo<32,32,32>, RegInfo<64,64,64>, RegInfo<32,32,32>]>;
}
+// For indirect tail calls, we can't use callee-saved registers, as they are
+// restored to the saved value before the tail call, which would clobber a call
+// address.
+def GPRTC : RegisterClass<"RISCV", [XLenVT], 32, (add
+ (sequence "X%u", 5, 7),
+ (sequence "X%u", 10, 17),
+ (sequence "X%u", 28, 31)
+ )> {
+ let RegInfos = RegInfoByHwMode<
+ [RV32, RV64, DefaultMode],
+ [RegInfo<32,32,32>, RegInfo<64,64,64>, RegInfo<32,32,32>]>;
+}
+
def SP : RegisterClass<"RISCV", [XLenVT], 32, (add X2)> {
let RegInfos = RegInfoByHwMode<
[RV32, RV64, DefaultMode],
diff --git a/lib/Target/RISCV/RISCVSubtarget.h b/lib/Target/RISCV/RISCVSubtarget.h
index 928ba5815a22..0e09391e7829 100644
--- a/lib/Target/RISCV/RISCVSubtarget.h
+++ b/lib/Target/RISCV/RISCVSubtarget.h
@@ -36,6 +36,7 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
bool HasStdExtD = false;
bool HasStdExtC = false;
bool HasRV64 = false;
+ bool EnableLinkerRelax = false;
unsigned XLen = 32;
MVT XLenVT = MVT::i32;
RISCVFrameLowering FrameLowering;
@@ -77,6 +78,7 @@ public:
bool hasStdExtD() const { return HasStdExtD; }
bool hasStdExtC() const { return HasStdExtC; }
bool is64Bit() const { return HasRV64; }
+ bool enableLinkerRelax() const { return EnableLinkerRelax; }
MVT getXLenVT() const { return XLenVT; }
unsigned getXLen() const { return XLen; }
};
diff --git a/lib/Target/RISCV/RISCVTargetMachine.cpp b/lib/Target/RISCV/RISCVTargetMachine.cpp
index e12168b73999..a2ebf5bf3e6b 100644
--- a/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -13,6 +13,7 @@
#include "RISCV.h"
#include "RISCVTargetMachine.h"
+#include "RISCVTargetObjectFile.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
@@ -59,7 +60,7 @@ RISCVTargetMachine::RISCVTargetMachine(const Target &T, const Triple &TT,
: LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options,
getEffectiveRelocModel(TT, RM),
getEffectiveCodeModel(CM), OL),
- TLOF(make_unique<TargetLoweringObjectFileELF>()),
+ TLOF(make_unique<RISCVELFTargetObjectFile>()),
Subtarget(TT, CPU, FS, *this) {
initAsmInfo();
}
@@ -74,7 +75,10 @@ public:
return getTM<RISCVTargetMachine>();
}
+ void addIRPasses() override;
bool addInstSelector() override;
+ void addPreEmitPass() override;
+ void addPreRegAlloc() override;
};
}
@@ -82,8 +86,19 @@ TargetPassConfig *RISCVTargetMachine::createPassConfig(PassManagerBase &PM) {
return new RISCVPassConfig(*this, PM);
}
+void RISCVPassConfig::addIRPasses() {
+ addPass(createAtomicExpandPass());
+ TargetPassConfig::addIRPasses();
+}
+
bool RISCVPassConfig::addInstSelector() {
addPass(createRISCVISelDag(getRISCVTargetMachine()));
return false;
}
+
+void RISCVPassConfig::addPreEmitPass() { addPass(&BranchRelaxationPassID); }
+
+void RISCVPassConfig::addPreRegAlloc() {
+ addPass(createRISCVMergeBaseOffsetOptPass());
+}
diff --git a/lib/Target/RISCV/RISCVTargetObjectFile.cpp b/lib/Target/RISCV/RISCVTargetObjectFile.cpp
new file mode 100644
index 000000000000..46e81b628b65
--- /dev/null
+++ b/lib/Target/RISCV/RISCVTargetObjectFile.cpp
@@ -0,0 +1,19 @@
+//===-- RISCVTargetObjectFile.cpp - RISCV Object Info -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCVTargetObjectFile.h"
+#include "RISCVTargetMachine.h"
+
+using namespace llvm;
+
+void RISCVELFTargetObjectFile::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ InitializeELF(TM.Options.UseInitArray);
+}
diff --git a/lib/Target/RISCV/RISCVTargetObjectFile.h b/lib/Target/RISCV/RISCVTargetObjectFile.h
new file mode 100644
index 000000000000..5467220301c1
--- /dev/null
+++ b/lib/Target/RISCV/RISCVTargetObjectFile.h
@@ -0,0 +1,25 @@
+//===-- RISCVTargetObjectFile.h - RISCV Object Info -*- C++ ---------*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_RISCV_RISCVTARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_RISCV_RISCVTARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+namespace llvm {
+class RISCVTargetMachine;
+
+/// This implementation is used for RISCV ELF targets.
+class RISCVELFTargetObjectFile : public TargetLoweringObjectFileELF {
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index 05f78a48badf..c7a5a1e8e6ee 100644
--- a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -95,7 +95,6 @@ class SparcAsmParser : public MCTargetAsmParser {
unsigned &RegKind);
bool matchSparcAsmModifiers(const MCExpr *&EVal, SMLoc &EndLoc);
- bool parseDirectiveWord(unsigned Size, SMLoc L);
bool is64Bit() const {
return getSTI().getTargetTriple().getArch() == Triple::sparcv9;
@@ -109,6 +108,14 @@ public:
const MCInstrInfo &MII,
const MCTargetOptions &Options)
: MCTargetAsmParser(Options, sti, MII), Parser(parser) {
+ Parser.addAliasForDirective(".half", ".2byte");
+ Parser.addAliasForDirective(".uahalf", ".2byte");
+ Parser.addAliasForDirective(".word", ".4byte");
+ Parser.addAliasForDirective(".uaword", ".4byte");
+ Parser.addAliasForDirective(".nword", is64Bit() ? ".8byte" : ".4byte");
+ if (is64Bit())
+ Parser.addAliasForDirective(".xword", ".8byte");
+
// Initialize the set of available features.
setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
}
@@ -682,21 +689,6 @@ ParseDirective(AsmToken DirectiveID)
{
StringRef IDVal = DirectiveID.getString();
- if (IDVal == ".byte")
- return parseDirectiveWord(1, DirectiveID.getLoc());
-
- if (IDVal == ".half")
- return parseDirectiveWord(2, DirectiveID.getLoc());
-
- if (IDVal == ".word")
- return parseDirectiveWord(4, DirectiveID.getLoc());
-
- if (IDVal == ".nword")
- return parseDirectiveWord(is64Bit() ? 8 : 4, DirectiveID.getLoc());
-
- if (is64Bit() && IDVal == ".xword")
- return parseDirectiveWord(8, DirectiveID.getLoc());
-
if (IDVal == ".register") {
// For now, ignore .register directive.
Parser.eatToEndOfStatement();
@@ -713,28 +705,6 @@ ParseDirective(AsmToken DirectiveID)
return true;
}
-bool SparcAsmParser:: parseDirectiveWord(unsigned Size, SMLoc L) {
- if (getLexer().isNot(AsmToken::EndOfStatement)) {
- while (true) {
- const MCExpr *Value;
- if (getParser().parseExpression(Value))
- return true;
-
- getParser().getStreamer().EmitValue(Value, Size);
-
- if (getLexer().is(AsmToken::EndOfStatement))
- break;
-
- // FIXME: Improve diagnostic.
- if (getLexer().isNot(AsmToken::Comma))
- return Error(L, "unexpected token in directive");
- Parser.Lex();
- }
- }
- Parser.Lex();
- return false;
-}
-
OperandMatchResultTy
SparcAsmParser::parseMEMOperand(OperandVector &Operands) {
SMLoc S, E;
@@ -915,9 +885,17 @@ SparcAsmParser::parseSparcAsmOperand(std::unique_ptr<SparcOperand> &Op,
const MCExpr *Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None,
getContext());
- if (isCall && getContext().getObjectFileInfo()->isPositionIndependent())
- Res = SparcMCExpr::create(SparcMCExpr::VK_Sparc_WPLT30, Res,
- getContext());
+ SparcMCExpr::VariantKind Kind = SparcMCExpr::VK_Sparc_13;
+
+ if (getContext().getObjectFileInfo()->isPositionIndependent()) {
+ if (isCall)
+ Kind = SparcMCExpr::VK_Sparc_WPLT30;
+ else
+ Kind = SparcMCExpr::VK_Sparc_GOT13;
+ }
+
+ Res = SparcMCExpr::create(Kind, Res, getContext());
+
Op = SparcOperand::CreateImm(Res, S, E);
}
break;
diff --git a/lib/Target/Sparc/CMakeLists.txt b/lib/Target/Sparc/CMakeLists.txt
index 312215cf6cde..e60fd4a86121 100644
--- a/lib/Target/Sparc/CMakeLists.txt
+++ b/lib/Target/Sparc/CMakeLists.txt
@@ -1,14 +1,15 @@
set(LLVM_TARGET_DEFINITIONS Sparc.td)
-tablegen(LLVM SparcGenRegisterInfo.inc -gen-register-info)
-tablegen(LLVM SparcGenInstrInfo.inc -gen-instr-info)
-tablegen(LLVM SparcGenDisassemblerTables.inc -gen-disassembler)
-tablegen(LLVM SparcGenMCCodeEmitter.inc -gen-emitter)
-tablegen(LLVM SparcGenAsmWriter.inc -gen-asm-writer)
tablegen(LLVM SparcGenAsmMatcher.inc -gen-asm-matcher)
+tablegen(LLVM SparcGenAsmWriter.inc -gen-asm-writer)
+tablegen(LLVM SparcGenCallingConv.inc -gen-callingconv)
tablegen(LLVM SparcGenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM SparcGenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM SparcGenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM SparcGenMCCodeEmitter.inc -gen-emitter)
+tablegen(LLVM SparcGenRegisterInfo.inc -gen-register-info)
tablegen(LLVM SparcGenSubtargetInfo.inc -gen-subtarget)
-tablegen(LLVM SparcGenCallingConv.inc -gen-callingconv)
+
add_public_tablegen_target(SparcCommonTableGen)
add_llvm_target(SparcCodeGen
@@ -27,8 +28,8 @@ add_llvm_target(SparcCodeGen
SparcTargetObjectFile.cpp
)
-add_subdirectory(TargetInfo)
-add_subdirectory(MCTargetDesc)
-add_subdirectory(InstPrinter)
add_subdirectory(AsmParser)
add_subdirectory(Disassembler)
+add_subdirectory(InstPrinter)
+add_subdirectory(MCTargetDesc)
+add_subdirectory(TargetInfo)
diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp
index 9b1d0f5bf3c9..6290e5a15a8b 100644
--- a/lib/Target/Sparc/DelaySlotFiller.cpp
+++ b/lib/Target/Sparc/DelaySlotFiller.cpp
@@ -207,8 +207,8 @@ Filler::findDelayInstr(MachineBasicBlock &MBB,
if (!done)
--I;
- // skip debug value
- if (I->isDebugValue())
+ // skip debug instruction
+ if (I->isDebugInstr())
continue;
if (I->hasUnmodeledSideEffects() || I->isInlineAsm() || I->isPosition() ||
diff --git a/lib/Target/Sparc/LeonFeatures.td b/lib/Target/Sparc/LeonFeatures.td
index d9efe094d078..a7dea068cb11 100755
--- a/lib/Target/Sparc/LeonFeatures.td
+++ b/lib/Target/Sparc/LeonFeatures.td
@@ -37,14 +37,6 @@ def LeonCASA : SubtargetFeature<
"Enable CASA instruction for LEON3 and LEON4 processors"
>;
-
-def ReplaceSDIV : SubtargetFeature<
- "replacesdiv",
- "PerformSDIVReplace",
- "true",
- "AT697E erratum fix: Do not emit SDIV, emit SDIVCC instead"
->;
-
def InsertNOPLoad: SubtargetFeature<
"insertnopload",
"InsertNOPLoad",
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
index a38545ecf430..5f5e2ef7d45a 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
@@ -14,6 +14,7 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCValue.h"
#include "llvm/Support/TargetRegistry.h"
@@ -53,6 +54,10 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
case Sparc::fixup_sparc_hi22:
return (Value >> 10) & 0x3fffff;
+ case Sparc::fixup_sparc_got13:
+ case Sparc::fixup_sparc_13:
+ return Value & 0x1fff;
+
case Sparc::fixup_sparc_pc10:
case Sparc::fixup_sparc_got10:
case Sparc::fixup_sparc_tls_gd_lo10:
@@ -99,14 +104,13 @@ namespace {
class SparcAsmBackend : public MCAsmBackend {
protected:
const Target &TheTarget;
- bool IsLittleEndian;
bool Is64Bit;
public:
SparcAsmBackend(const Target &T)
- : MCAsmBackend(), TheTarget(T),
- IsLittleEndian(StringRef(TheTarget.getName()) == "sparcel"),
- Is64Bit(StringRef(TheTarget.getName()) == "sparcv9") {}
+ : MCAsmBackend(StringRef(T.getName()) == "sparcel" ? support::little
+ : support::big),
+ TheTarget(T), Is64Bit(StringRef(TheTarget.getName()) == "sparcv9") {}
unsigned getNumFixupKinds() const override {
return Sparc::NumTargetFixupKinds;
@@ -120,6 +124,7 @@ namespace {
{ "fixup_sparc_br19", 13, 19, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_sparc_br16_2", 10, 2, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_sparc_br16_14", 18, 14, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_sparc_13", 19, 13, 0 },
{ "fixup_sparc_hi22", 10, 22, 0 },
{ "fixup_sparc_lo10", 22, 10, 0 },
{ "fixup_sparc_h44", 10, 22, 0 },
@@ -131,6 +136,7 @@ namespace {
{ "fixup_sparc_pc10", 22, 10, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_sparc_got22", 10, 22, 0 },
{ "fixup_sparc_got10", 22, 10, 0 },
+ { "fixup_sparc_got13", 19, 13, 0 },
{ "fixup_sparc_wplt30", 2, 30, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_sparc_tls_gd_hi22", 10, 22, 0 },
{ "fixup_sparc_tls_gd_lo10", 22, 10, 0 },
@@ -159,6 +165,7 @@ namespace {
{ "fixup_sparc_br19", 0, 19, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_sparc_br16_2", 20, 2, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_sparc_br16_14", 0, 14, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_sparc_13", 0, 13, 0 },
{ "fixup_sparc_hi22", 0, 22, 0 },
{ "fixup_sparc_lo10", 0, 10, 0 },
{ "fixup_sparc_h44", 0, 22, 0 },
@@ -170,6 +177,7 @@ namespace {
{ "fixup_sparc_pc10", 0, 10, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_sparc_got22", 0, 22, 0 },
{ "fixup_sparc_got10", 0, 10, 0 },
+ { "fixup_sparc_got13", 0, 13, 0 },
{ "fixup_sparc_wplt30", 0, 30, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_sparc_tls_gd_hi22", 0, 22, 0 },
{ "fixup_sparc_tls_gd_lo10", 0, 10, 0 },
@@ -196,7 +204,7 @@ namespace {
assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
"Invalid kind!");
- if (IsLittleEndian)
+ if (Endian == support::little)
return InfosLE[Kind - FirstTargetFixupKind];
return InfosBE[Kind - FirstTargetFixupKind];
@@ -233,7 +241,8 @@ namespace {
}
}
- bool mayNeedRelaxation(const MCInst &Inst) const override {
+ bool mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const override {
// FIXME.
return false;
}
@@ -254,14 +263,14 @@ namespace {
llvm_unreachable("relaxInstruction() unimplemented");
}
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override {
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override {
// Cannot emit NOP with size not multiple of 32 bits.
if (Count % 4 != 0)
return false;
uint64_t NumNops = Count / 4;
for (uint64_t i = 0; i != NumNops; ++i)
- OW->write32(0x01000000);
+ support::endian::write<uint32_t>(OS, 0x01000000, Endian);
return true;
}
@@ -275,7 +284,8 @@ namespace {
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsResolved) const override {
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const override {
Value = adjustFixupValue(Fixup.getKind(), Value);
if (!Value) return; // Doesn't change encoding.
@@ -286,23 +296,23 @@ namespace {
// from the fixup value. The Value has been "split up" into the
// appropriate bitfields above.
for (unsigned i = 0; i != 4; ++i) {
- unsigned Idx = IsLittleEndian ? i : 3 - i;
+ unsigned Idx = Endian == support::little ? i : 3 - i;
Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff);
}
}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(OSType);
- return createSparcELFObjectWriter(OS, Is64Bit, IsLittleEndian, OSABI);
+ return createSparcELFObjectWriter(Is64Bit, OSABI);
}
};
} // end anonymous namespace
MCAsmBackend *llvm::createSparcAsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
const MCTargetOptions &Options) {
- return new ELFSparcAsmBackend(T, TT.getOS());
+ return new ELFSparcAsmBackend(T, STI.getTargetTriple().getOS());
}
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
index a204036a0975..5a730947796e 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
@@ -79,6 +79,7 @@ unsigned SparcELFObjectWriter::getRelocType(MCContext &Ctx,
case FK_Data_8: return ((Fixup.getOffset() % 8)
? ELF::R_SPARC_UA64
: ELF::R_SPARC_64);
+ case Sparc::fixup_sparc_13: return ELF::R_SPARC_13;
case Sparc::fixup_sparc_hi22: return ELF::R_SPARC_HI22;
case Sparc::fixup_sparc_lo10: return ELF::R_SPARC_LO10;
case Sparc::fixup_sparc_h44: return ELF::R_SPARC_H44;
@@ -88,6 +89,7 @@ unsigned SparcELFObjectWriter::getRelocType(MCContext &Ctx,
case Sparc::fixup_sparc_hm: return ELF::R_SPARC_HM10;
case Sparc::fixup_sparc_got22: return ELF::R_SPARC_GOT22;
case Sparc::fixup_sparc_got10: return ELF::R_SPARC_GOT10;
+ case Sparc::fixup_sparc_got13: return ELF::R_SPARC_GOT13;
case Sparc::fixup_sparc_tls_gd_hi22: return ELF::R_SPARC_TLS_GD_HI22;
case Sparc::fixup_sparc_tls_gd_lo10: return ELF::R_SPARC_TLS_GD_LO10;
case Sparc::fixup_sparc_tls_gd_add: return ELF::R_SPARC_TLS_GD_ADD;
@@ -132,9 +134,7 @@ bool SparcELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
}
}
-std::unique_ptr<MCObjectWriter>
-llvm::createSparcELFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit,
- bool IsLittleEndian, uint8_t OSABI) {
- auto MOTW = llvm::make_unique<SparcELFObjectWriter>(Is64Bit, OSABI);
- return createELFObjectWriter(std::move(MOTW), OS, IsLittleEndian);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createSparcELFObjectWriter(bool Is64Bit, uint8_t OSABI) {
+ return llvm::make_unique<SparcELFObjectWriter>(Is64Bit, OSABI);
}
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h b/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h
index 8d79396d936e..99aa63fe2290 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h
+++ b/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h
@@ -30,6 +30,9 @@ namespace llvm {
fixup_sparc_br16_2,
fixup_sparc_br16_14,
+ /// fixup_sparc_13 - 13-bit fixup
+ fixup_sparc_13,
+
/// fixup_sparc_hi22 - 22-bit fixup corresponding to %hi(foo)
/// for sethi
fixup_sparc_hi22,
@@ -64,6 +67,9 @@ namespace llvm {
/// fixup_sparc_got10 - 10-bit fixup corresponding to %got10(foo)
fixup_sparc_got10,
+ /// fixup_sparc_got13 - 13-bit fixup corresponding to %got13(foo)
+ fixup_sparc_got13,
+
/// fixup_sparc_wplt30
fixup_sparc_wplt30,
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
index 684f66970dbe..647be159a151 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
@@ -98,14 +98,9 @@ void SparcMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
computeAvailableFeatures(STI.getFeatureBits()));
unsigned Bits = getBinaryCodeForInstr(MI, Fixups, STI);
-
- if (Ctx.getAsmInfo()->isLittleEndian()) {
- // Output the bits in little-endian byte order.
- support::endian::Writer<support::little>(OS).write<uint32_t>(Bits);
- } else {
- // Output the bits in big-endian byte order.
- support::endian::Writer<support::big>(OS).write<uint32_t>(Bits);
- }
+ support::endian::write(OS, Bits,
+ Ctx.getAsmInfo()->isLittleEndian() ? support::little
+ : support::big);
unsigned tlsOpNo = 0;
switch (MI.getOpcode()) {
default: break;
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
index a77f760d9eff..f736a37a266c 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
@@ -58,6 +58,8 @@ bool SparcMCExpr::printVariantKind(raw_ostream &OS, VariantKind Kind)
// FIXME: use %got22/%got10, if system assembler supports them.
case VK_Sparc_GOT22: OS << "%hi("; break;
case VK_Sparc_GOT10: OS << "%lo("; break;
+ case VK_Sparc_GOT13: closeParen = false; break;
+ case VK_Sparc_13: closeParen = false; break;
case VK_Sparc_WPLT30: closeParen = false; break;
case VK_Sparc_R_DISP32: OS << "%r_disp32("; break;
case VK_Sparc_TLS_GD_HI22: OS << "%tgd_hi22("; break;
@@ -96,6 +98,7 @@ SparcMCExpr::VariantKind SparcMCExpr::parseVariantKind(StringRef name)
.Case("pc10", VK_Sparc_PC10)
.Case("got22", VK_Sparc_GOT22)
.Case("got10", VK_Sparc_GOT10)
+ .Case("got13", VK_Sparc_GOT13)
.Case("r_disp32", VK_Sparc_R_DISP32)
.Case("tgd_hi22", VK_Sparc_TLS_GD_HI22)
.Case("tgd_lo10", VK_Sparc_TLS_GD_LO10)
@@ -132,6 +135,8 @@ Sparc::Fixups SparcMCExpr::getFixupKind(SparcMCExpr::VariantKind Kind) {
case VK_Sparc_PC10: return Sparc::fixup_sparc_pc10;
case VK_Sparc_GOT22: return Sparc::fixup_sparc_got22;
case VK_Sparc_GOT10: return Sparc::fixup_sparc_got10;
+ case VK_Sparc_GOT13: return Sparc::fixup_sparc_got13;
+ case VK_Sparc_13: return Sparc::fixup_sparc_13;
case VK_Sparc_WPLT30: return Sparc::fixup_sparc_wplt30;
case VK_Sparc_TLS_GD_HI22: return Sparc::fixup_sparc_tls_gd_hi22;
case VK_Sparc_TLS_GD_LO10: return Sparc::fixup_sparc_tls_gd_lo10;
@@ -193,14 +198,26 @@ static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
void SparcMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
switch(getKind()) {
default: return;
+ case VK_Sparc_TLS_GD_CALL:
+ case VK_Sparc_TLS_LDM_CALL: {
+ // The corresponding relocations reference __tls_get_addr, as they call it,
+ // but this is only implicit; we must explicitly add it to our symbol table
+ // to bind it for these uses.
+ MCSymbol *Symbol = Asm.getContext().getOrCreateSymbol("__tls_get_addr");
+ Asm.registerSymbol(*Symbol);
+ auto ELFSymbol = cast<MCSymbolELF>(Symbol);
+ if (!ELFSymbol->isBindingSet()) {
+ ELFSymbol->setBinding(ELF::STB_GLOBAL);
+ ELFSymbol->setExternal(true);
+ }
+ LLVM_FALLTHROUGH;
+ }
case VK_Sparc_TLS_GD_HI22:
case VK_Sparc_TLS_GD_LO10:
case VK_Sparc_TLS_GD_ADD:
- case VK_Sparc_TLS_GD_CALL:
case VK_Sparc_TLS_LDM_HI22:
case VK_Sparc_TLS_LDM_LO10:
case VK_Sparc_TLS_LDM_ADD:
- case VK_Sparc_TLS_LDM_CALL:
case VK_Sparc_TLS_LDO_HIX22:
case VK_Sparc_TLS_LDO_LOX10:
case VK_Sparc_TLS_LDO_ADD:
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
index 13f08195c764..cf2db067749c 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
@@ -36,6 +36,8 @@ public:
VK_Sparc_PC10,
VK_Sparc_GOT22,
VK_Sparc_GOT10,
+ VK_Sparc_GOT13,
+ VK_Sparc_13,
VK_Sparc_WPLT30,
VK_Sparc_R_DISP32,
VK_Sparc_TLS_GD_HI22,
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
index 563e6f4efbe6..3cd24104c443 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
@@ -23,7 +23,7 @@ class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
-class MCObjectWriter;
+class MCObjectTargetWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
class MCTargetOptions;
@@ -40,12 +40,11 @@ Target &getTheSparcelTarget();
MCCodeEmitter *createSparcMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
MCContext &Ctx);
-MCAsmBackend *createSparcAsmBackend(const Target &T, const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
+MCAsmBackend *createSparcAsmBackend(const Target &T, const MCSubtargetInfo &STI,
+ const MCRegisterInfo &MRI,
const MCTargetOptions &Options);
-std::unique_ptr<MCObjectWriter>
-createSparcELFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit,
- bool IsLIttleEndian, uint8_t OSABI);
+std::unique_ptr<MCObjectTargetWriter> createSparcELFObjectWriter(bool Is64Bit,
+ uint8_t OSABI);
} // End llvm namespace
// Defines symbolic names for Sparc registers. This defines a mapping from
diff --git a/lib/Target/Sparc/Sparc.td b/lib/Target/Sparc/Sparc.td
index 9e0a297c8812..2f9b57f76041 100644
--- a/lib/Target/Sparc/Sparc.td
+++ b/lib/Target/Sparc/Sparc.td
@@ -130,7 +130,7 @@ def : Processor<"leon2", LEON2Itineraries,
// LEON 2 FT (AT697E)
// TO DO: Place-holder: Processor specific features will be added *very* soon here.
def : Processor<"at697e", LEON2Itineraries,
- [FeatureLeon, ReplaceSDIV, InsertNOPLoad]>;
+ [FeatureLeon, InsertNOPLoad]>;
// LEON 2 FT (AT697F)
// TO DO: Place-holder: Processor specific features will be added *very* soon here.
@@ -176,4 +176,5 @@ def Sparc : Target {
let InstructionSet = SparcInstrInfo;
let AssemblyParsers = [SparcAsmParser];
let AssemblyWriters = [SparcAsmWriter];
+ let AllowRegisterRenaming = 1;
}
diff --git a/lib/Target/Sparc/SparcFrameLowering.cpp b/lib/Target/Sparc/SparcFrameLowering.cpp
index 9864aa372354..9f6c7d65592d 100644
--- a/lib/Target/Sparc/SparcFrameLowering.cpp
+++ b/lib/Target/Sparc/SparcFrameLowering.cpp
@@ -88,10 +88,11 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF,
assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
MachineFrameInfo &MFI = MF.getFrameInfo();
+ const SparcSubtarget &Subtarget = MF.getSubtarget<SparcSubtarget>();
const SparcInstrInfo &TII =
- *static_cast<const SparcInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ *static_cast<const SparcInstrInfo *>(Subtarget.getInstrInfo());
const SparcRegisterInfo &RegInfo =
- *static_cast<const SparcRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
+ *static_cast<const SparcRegisterInfo *>(Subtarget.getRegisterInfo());
MachineBasicBlock::iterator MBBI = MBB.begin();
// Debug location must be unknown since the first debug location is used
// to determine the end of the prologue.
@@ -141,7 +142,7 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF,
// Adds the SPARC subtarget-specific spill area to the stack
// size. Also ensures target-required alignment.
- NumBytes = MF.getSubtarget<SparcSubtarget>().getAdjustedFrameSize(NumBytes);
+ NumBytes = Subtarget.getAdjustedFrameSize(NumBytes);
// Finally, ensure that the size is sufficiently aligned for the
// data on the stack.
@@ -176,9 +177,27 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF,
.addCFIIndex(CFIIndex);
if (NeedsStackRealignment) {
- // andn %o6, MaxAlign-1, %o6
+ int64_t Bias = Subtarget.getStackPointerBias();
+ unsigned regUnbiased;
+ if (Bias) {
+ // This clobbers G1 which we always know is available here.
+ regUnbiased = SP::G1;
+ // add %o6, BIAS, %g1
+ BuildMI(MBB, MBBI, dl, TII.get(SP::ADDri), regUnbiased)
+ .addReg(SP::O6).addImm(Bias);
+ } else
+ regUnbiased = SP::O6;
+
+ // andn %regUnbiased, MaxAlign-1, %regUnbiased
int MaxAlign = MFI.getMaxAlignment();
- BuildMI(MBB, MBBI, dl, TII.get(SP::ANDNri), SP::O6).addReg(SP::O6).addImm(MaxAlign - 1);
+ BuildMI(MBB, MBBI, dl, TII.get(SP::ANDNri), regUnbiased)
+ .addReg(regUnbiased).addImm(MaxAlign - 1);
+
+ if (Bias) {
+ // add %g1, -BIAS, %o6
+ BuildMI(MBB, MBBI, dl, TII.get(SP::ADDri), SP::O6)
+ .addReg(regUnbiased).addImm(-Bias);
+ }
}
}
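
A minimal standalone sketch of the unbias/align/rebias arithmetic the realignment sequence above emits when the stack pointer carries the 64-bit V9 bias; the starting value is illustrative:

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // Mirrors: add %o6, BIAS, %g1 ; andn %g1, MaxAlign-1, %g1 ; add %g1, -BIAS, %o6
    static uint64_t realignSP(uint64_t SP, uint64_t Bias, uint64_t MaxAlign) {
      assert((MaxAlign & (MaxAlign - 1)) == 0 && "alignment must be a power of two");
      uint64_t Unbiased = SP + Bias;     // remove the bias
      Unbiased &= ~(MaxAlign - 1);       // round down to the alignment
      return Unbiased - Bias;            // re-apply the bias
    }

    int main() {
      uint64_t SP = 0x7ffffffee000ull - 2047 + 40;   // misaligned, biased %sp
      uint64_t NewSP = realignSP(SP, 2047, 64);
      std::printf("new unbiased sp: %#llx\n", (unsigned long long)(NewSP + 2047));
      return (NewSP + 2047) % 64 == 0 ? 0 : 1;
    }
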
diff --git a/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
index c36e75d1b076..f845c41ede45 100644
--- a/lib/Target/Sparc/SparcISelDAGToDAG.cpp
+++ b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
@@ -311,6 +311,8 @@ bool SparcDAGToDAGISel::tryInlineAsm(SDNode *N){
if (!Changed)
return false;
+ SelectInlineAsmMemoryOperands(AsmNodeOperands, SDLoc(N));
+
SDValue New = CurDAG->getNode(ISD::INLINEASM, SDLoc(N),
CurDAG->getVTList(MVT::Other, MVT::Glue), AsmNodeOperands);
New->setNodeId(-1);
@@ -360,12 +362,6 @@ void SparcDAGToDAGISel::Select(SDNode *N) {
// FIXME: Handle div by immediate.
unsigned Opcode = N->getOpcode() == ISD::SDIV ? SP::SDIVrr : SP::UDIVrr;
- // SDIV is a hardware erratum on some LEON2 processors. Replace it with SDIVcc here.
- if (((SparcTargetMachine&)TM).getSubtargetImpl()->performSDIVReplace()
- &&
- Opcode == SP::SDIVrr) {
- Opcode = SP::SDIVCCrr;
- }
CurDAG->SelectNodeTo(N, Opcode, MVT::i32, DivLHS, DivRHS, TopPart);
return;
}
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index d9548ff90d7f..b04c6b112682 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -1450,7 +1450,7 @@ static SPCC::CondCodes FPCondCCodeToFCC(ISD::CondCode CC) {
SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
const SparcSubtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
- MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
+ MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize(0));
// Instructions which use registers as conditionals examine all the
// bits (as does the pseudo SELECT_CC expansion). I don't think it
@@ -1590,6 +1590,11 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
+ setOperationAction(ISD::ADDC, MVT::i32, Custom);
+ setOperationAction(ISD::ADDE, MVT::i32, Custom);
+ setOperationAction(ISD::SUBC, MVT::i32, Custom);
+ setOperationAction(ISD::SUBE, MVT::i32, Custom);
+
if (Subtarget->is64Bit()) {
setOperationAction(ISD::ADDC, MVT::i64, Custom);
setOperationAction(ISD::ADDE, MVT::i64, Custom);
@@ -1700,6 +1705,9 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UDIV, MVT::i32, Expand);
setLibcallName(RTLIB::UDIV_I32, ".udiv");
+
+ setLibcallName(RTLIB::SREM_I32, ".rem");
+ setLibcallName(RTLIB::UREM_I32, ".urem");
}
if (Subtarget->is64Bit()) {
@@ -1722,6 +1730,7 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VAARG , MVT::Other, Custom);
setOperationAction(ISD::TRAP , MVT::Other, Legal);
+ setOperationAction(ISD::DEBUGTRAP , MVT::Other, Legal);
// Use the default implementation.
setOperationAction(ISD::VACOPY , MVT::Other, Expand);
@@ -1975,11 +1984,22 @@ SDValue SparcTargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
// Handle PIC mode first. SPARC needs a got load for every variable!
if (isPositionIndependent()) {
- // This is the pic32 code model, the GOT is known to be smaller than 4GB.
- SDValue HiLo = makeHiLoPair(Op, SparcMCExpr::VK_Sparc_GOT22,
- SparcMCExpr::VK_Sparc_GOT10, DAG);
+ const Module *M = DAG.getMachineFunction().getFunction().getParent();
+ PICLevel::Level picLevel = M->getPICLevel();
+ SDValue Idx;
+
+ if (picLevel == PICLevel::SmallPIC) {
+ // This is the pic13 code model, the GOT is known to be smaller than 8KiB.
+ Idx = DAG.getNode(SPISD::Lo, DL, Op.getValueType(),
+ withTargetFlags(Op, SparcMCExpr::VK_Sparc_GOT13, DAG));
+ } else {
+ // This is the pic32 code model, the GOT is known to be smaller than 4GB.
+ Idx = makeHiLoPair(Op, SparcMCExpr::VK_Sparc_GOT22,
+ SparcMCExpr::VK_Sparc_GOT10, DAG);
+ }
+
SDValue GlobalBase = DAG.getNode(SPISD::GLOBAL_BASE_REG, DL, VT);
- SDValue AbsAddr = DAG.getNode(ISD::ADD, DL, VT, GlobalBase, HiLo);
+ SDValue AbsAddr = DAG.getNode(ISD::ADD, DL, VT, GlobalBase, Idx);
// GLOBAL_BASE_REG codegen'ed with call. Inform MFI that this
// function has calls.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
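
A minimal standalone sketch contrasting the two GOT addressing forms chosen above: small PIC keeps the whole offset in one signed 13-bit immediate, while the 32-bit model splits it into the %hi22/%lo10 pair that sethi+or reassemble; the offset is illustrative:

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // pic13: the GOT offset must fit a signed 13-bit immediate (GOT < 8KiB).
    static bool fitsSimm13(int32_t Off) { return Off >= -4096 && Off < 4096; }

    // pic32: split into the upper 22 and lower 10 bits.
    static void splitHiLo(uint32_t Off, uint32_t &Hi22, uint32_t &Lo10) {
      Hi22 = Off >> 10;
      Lo10 = Off & 0x3ff;
    }

    int main() {
      uint32_t Off = 0x12345;            // hypothetical GOT offset
      uint32_t Hi, Lo;
      splitHiLo(Off, Hi, Lo);
      assert(((Hi << 10) | Lo) == Off);  // sethi/or puts it back together
      std::printf("simm13? %d  hi22=%#x lo10=%#x\n", fitsSimm13((int32_t)Off), Hi, Lo);
    }
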
@@ -2036,7 +2056,7 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
- if (DAG.getTarget().Options.EmulatedTLS)
+ if (DAG.getTarget().useEmulatedTLS())
return LowerToTLSEmulatedModel(GA, DAG);
SDLoc DL(GA);
@@ -3513,6 +3533,22 @@ SparcTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return TargetLowering::getRegForInlineAsmConstraint(TRI, newConstraint,
VT);
}
+ if (name.substr(0, 1).equals("f") &&
+ !name.substr(1).getAsInteger(10, intVal) && intVal <= 63) {
+ std::string newConstraint;
+
+ if (VT == MVT::f32 || VT == MVT::Other) {
+ newConstraint = "{f" + utostr(intVal) + "}";
+ } else if (VT == MVT::f64 && (intVal % 2 == 0)) {
+ newConstraint = "{d" + utostr(intVal / 2) + "}";
+ } else if (VT == MVT::f128 && (intVal % 4 == 0)) {
+ newConstraint = "{q" + utostr(intVal / 4) + "}";
+ } else {
+ return std::make_pair(0U, nullptr);
+ }
+ return TargetLowering::getRegForInlineAsmConstraint(TRI, newConstraint,
+ VT);
+ }
}
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
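
A minimal standalone sketch of the register-name mapping performed by the new "f<N>" constraint handling above: single-precision keeps the f name, doubles require an even index halved to a d name, quads a multiple of four quartered to a q name; the enum and helper are assumptions used only for illustration:

    #include <iostream>
    #include <string>

    enum class FPType { F32, F64, F128 };

    // Map an "fN" inline-asm constraint to the register actually addressed.
    static std::string mapFPConstraint(unsigned N, FPType T) {
      if (N > 63)
        return {};
      switch (T) {
      case FPType::F32:  return "{f" + std::to_string(N) + "}";
      case FPType::F64:  return N % 2 == 0 ? "{d" + std::to_string(N / 2) + "}" : "";
      case FPType::F128: return N % 4 == 0 ? "{q" + std::to_string(N / 4) + "}" : "";
      }
      return {};
    }

    int main() {
      std::cout << mapFPConstraint(20, FPType::F32) << "\n"   // {f20}
                << mapFPConstraint(20, FPType::F64) << "\n"   // {d10}
                << mapFPConstraint(20, FPType::F128) << "\n"; // {q5}
    }
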
diff --git a/lib/Target/Sparc/SparcInstrAliases.td b/lib/Target/Sparc/SparcInstrAliases.td
index df570cea8da8..352090ed92c1 100644
--- a/lib/Target/Sparc/SparcInstrAliases.td
+++ b/lib/Target/Sparc/SparcInstrAliases.td
@@ -474,6 +474,19 @@ def : InstAlias<"wr $simm13, %tbr", (WRTBRri G0, i32imm:$simm13), 0>;
// flush -> flush %g0
def : InstAlias<"flush", (FLUSH), 0>;
+def : MnemonicAlias<"iflush", "flush">;
+
+def : MnemonicAlias<"stub", "stb">;
+def : MnemonicAlias<"stsb", "stb">;
+
+def : MnemonicAlias<"stuba", "stba">;
+def : MnemonicAlias<"stsba", "stba">;
+
+def : MnemonicAlias<"stuh", "sth">;
+def : MnemonicAlias<"stsh", "sth">;
+
+def : MnemonicAlias<"stuha", "stha">;
+def : MnemonicAlias<"stsha", "stha">;
def : MnemonicAlias<"lduw", "ld">, Requires<[HasV9]>;
def : MnemonicAlias<"lduwa", "lda">, Requires<[HasV9]>;
diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp
index ea8ed830bafc..6750763d8ee5 100644
--- a/lib/Target/Sparc/SparcInstrInfo.cpp
+++ b/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -280,7 +280,7 @@ unsigned SparcInstrInfo::removeBranch(MachineBasicBlock &MBB,
while (I != MBB.begin()) {
--I;
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
if (I->getOpcode() != SP::BA
diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td
index 08bccbde0bd6..5b7fb3c485e8 100644
--- a/lib/Target/Sparc/SparcInstrInfo.td
+++ b/lib/Target/Sparc/SparcInstrInfo.td
@@ -421,7 +421,7 @@ let hasSideEffects = 1, mayStore = 1 in {
def FLUSHW : F3_1<0b10, 0b101011, (outs), (ins),
"flushw",
[(flushw)]>, Requires<[HasV9]>;
- let rd = 0, rs1 = 1, simm13 = 3 in
+ let rd = 8, rs1 = 0, simm13 = 3 in
def TA3 : F3_2<0b10, 0b111010, (outs), (ins),
"ta 3",
[(flushw)]>;
@@ -1009,6 +1009,9 @@ let DecoderNamespace = "SparcV9", DecoderMethod = "DecodeTRAP", Predicates = [Ha
let isBarrier = 1, isTerminator = 1, rd = 0b01000, rs1 = 0, simm13 = 5 in
def TA5 : F3_2<0b10, 0b111010, (outs), (ins), "ta 5", [(trap)]>;
+let hasSideEffects = 1, rd = 0b01000, rs1 = 0, simm13 = 1 in
+ def TA1 : F3_2<0b10, 0b111010, (outs), (ins), "ta 1", [(debugtrap)]>;
+
// Section B.28 - Read State Register Instructions
let rs2 = 0 in
def RDASR : F3_1<2, 0b101000,
@@ -1599,6 +1602,9 @@ let Predicates = [HasV9] in {
// Non-Instruction Patterns
//===----------------------------------------------------------------------===//
+// Zero immediate.
+def : Pat<(i32 0),
+ (ORrr (i32 G0), (i32 G0))>;
// Small immediates.
def : Pat<(i32 simm13:$val),
(ORri (i32 G0), imm:$val)>;
diff --git a/lib/Target/Sparc/SparcRegisterInfo.h b/lib/Target/Sparc/SparcRegisterInfo.h
index 8dd2569d10de..2a279dad5ae2 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.h
+++ b/lib/Target/Sparc/SparcRegisterInfo.h
@@ -35,6 +35,8 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo {
const TargetRegisterClass *getPointerRegClass(const MachineFunction &MF,
unsigned Kind) const override;
+ bool enableMultipleCopyHints() const override { return true; }
+
void eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS = nullptr) const override;
diff --git a/lib/Target/Sparc/SparcSubtarget.cpp b/lib/Target/Sparc/SparcSubtarget.cpp
index 01545b8d20a0..40c5683f8495 100644
--- a/lib/Target/Sparc/SparcSubtarget.cpp
+++ b/lib/Target/Sparc/SparcSubtarget.cpp
@@ -44,7 +44,6 @@ SparcSubtarget &SparcSubtarget::initializeSubtargetDependencies(StringRef CPU,
// Leon features
HasLeonCasa = false;
HasUmacSmac = false;
- PerformSDIVReplace = false;
InsertNOPLoad = false;
FixAllFDIVSQRT = false;
DetectRoundChange = false;
diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h
index bcdc96e68103..588a6765bcdf 100644
--- a/lib/Target/Sparc/SparcSubtarget.h
+++ b/lib/Target/Sparc/SparcSubtarget.h
@@ -50,7 +50,6 @@ class SparcSubtarget : public SparcGenSubtargetInfo {
bool InsertNOPLoad;
bool FixAllFDIVSQRT;
bool DetectRoundChange;
- bool PerformSDIVReplace;
SparcInstrInfo InstrInfo;
SparcTargetLowering TLInfo;
@@ -92,7 +91,6 @@ public:
// Leon options
bool hasUmacSmac() const { return HasUmacSmac; }
- bool performSDIVReplace() const { return PerformSDIVReplace; }
bool hasLeonCasa() const { return HasLeonCasa; }
bool insertNOPLoad() const { return InsertNOPLoad; }
bool fixAllFDIVSQRT() const { return FixAllFDIVSQRT; }
diff --git a/lib/Target/SystemZ/CMakeLists.txt b/lib/Target/SystemZ/CMakeLists.txt
index 138e14a25b70..f83b4242fb42 100644
--- a/lib/Target/SystemZ/CMakeLists.txt
+++ b/lib/Target/SystemZ/CMakeLists.txt
@@ -5,10 +5,11 @@ tablegen(LLVM SystemZGenAsmWriter.inc -gen-asm-writer)
tablegen(LLVM SystemZGenCallingConv.inc -gen-callingconv)
tablegen(LLVM SystemZGenDAGISel.inc -gen-dag-isel)
tablegen(LLVM SystemZGenDisassemblerTables.inc -gen-disassembler)
-tablegen(LLVM SystemZGenMCCodeEmitter.inc -gen-emitter)
tablegen(LLVM SystemZGenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM SystemZGenMCCodeEmitter.inc -gen-emitter)
tablegen(LLVM SystemZGenRegisterInfo.inc -gen-register-info)
tablegen(LLVM SystemZGenSubtargetInfo.inc -gen-subtarget)
+
add_public_tablegen_target(SystemZCommonTableGen)
add_llvm_target(SystemZCodeGen
@@ -39,5 +40,5 @@ add_llvm_target(SystemZCodeGen
add_subdirectory(AsmParser)
add_subdirectory(Disassembler)
add_subdirectory(InstPrinter)
-add_subdirectory(TargetInfo)
add_subdirectory(MCTargetDesc)
+add_subdirectory(TargetInfo)
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
index e035c3b87a40..2146832f7794 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
@@ -14,6 +14,7 @@
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
using namespace llvm;
@@ -43,7 +44,7 @@ class SystemZMCAsmBackend : public MCAsmBackend {
uint8_t OSABI;
public:
SystemZMCAsmBackend(uint8_t osABI)
- : OSABI(osABI) {}
+ : MCAsmBackend(support::big), OSABI(osABI) {}
// Override MCAsmBackend
unsigned getNumFixupKinds() const override {
@@ -52,8 +53,10 @@ public:
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsResolved) const override;
- bool mayNeedRelaxation(const MCInst &Inst) const override {
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const override;
+ bool mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const override {
return false;
}
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
@@ -65,10 +68,10 @@ public:
MCInst &Res) const override {
llvm_unreachable("SystemZ does do not have assembler relaxation");
}
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createSystemZObjectWriter(OS, OSABI);
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createSystemZObjectWriter(OSABI);
}
};
} // end anonymous namespace
@@ -95,7 +98,8 @@ void SystemZMCAsmBackend::applyFixup(const MCAssembler &Asm,
const MCFixup &Fixup,
const MCValue &Target,
MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) const {
+ bool IsResolved,
+ const MCSubtargetInfo *STI) const {
MCFixupKind Kind = Fixup.getKind();
unsigned Offset = Fixup.getOffset();
unsigned BitSize = getFixupKindInfo(Kind).TargetSize;
@@ -114,17 +118,17 @@ void SystemZMCAsmBackend::applyFixup(const MCAssembler &Asm,
}
}
-bool SystemZMCAsmBackend::writeNopData(uint64_t Count,
- MCObjectWriter *OW) const {
+bool SystemZMCAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
for (uint64_t I = 0; I != Count; ++I)
- OW->write8(7);
+ OS << '\x7';
return true;
}
MCAsmBackend *llvm::createSystemZMCAsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
const MCTargetOptions &Options) {
- uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS());
+ uint8_t OSABI =
+ MCELFObjectTargetWriter::getOSABI(STI.getTargetTriple().getOS());
return new SystemZMCAsmBackend(OSABI);
}
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
index 238926d6c8e0..888be519fb16 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
@@ -161,8 +161,7 @@ unsigned SystemZObjectWriter::getRelocType(MCContext &Ctx,
}
}
-std::unique_ptr<MCObjectWriter>
-llvm::createSystemZObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI) {
- return createELFObjectWriter(llvm::make_unique<SystemZObjectWriter>(OSABI),
- OS, /*IsLittleEndian=*/false);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createSystemZObjectWriter(uint8_t OSABI) {
+ return llvm::make_unique<SystemZObjectWriter>(OSABI);
}
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
index 99b157e37275..1617a807e65a 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
@@ -20,7 +20,7 @@ class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
-class MCObjectWriter;
+class MCObjectTargetWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
class MCTargetOptions;
@@ -89,12 +89,11 @@ MCCodeEmitter *createSystemZMCCodeEmitter(const MCInstrInfo &MCII,
MCContext &Ctx);
MCAsmBackend *createSystemZMCAsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
const MCTargetOptions &Options);
-std::unique_ptr<MCObjectWriter> createSystemZObjectWriter(raw_pwrite_stream &OS,
- uint8_t OSABI);
+std::unique_ptr<MCObjectTargetWriter> createSystemZObjectWriter(uint8_t OSABI);
} // end namespace llvm
// Defines symbolic names for SystemZ registers.
diff --git a/lib/Target/SystemZ/SystemZ.h b/lib/Target/SystemZ/SystemZ.h
index 9a8e508e4119..fdbde3d8dbc3 100644
--- a/lib/Target/SystemZ/SystemZ.h
+++ b/lib/Target/SystemZ/SystemZ.h
@@ -47,6 +47,22 @@ const unsigned CCMASK_CMP_O = CCMASK_ANY ^ CCMASK_CMP_UO;
const unsigned CCMASK_ICMP = CCMASK_0 | CCMASK_1 | CCMASK_2;
const unsigned CCMASK_FCMP = CCMASK_0 | CCMASK_1 | CCMASK_2 | CCMASK_3;
+// Condition-code mask assignments for arithmetical operations.
+const unsigned CCMASK_ARITH_EQ = CCMASK_0;
+const unsigned CCMASK_ARITH_LT = CCMASK_1;
+const unsigned CCMASK_ARITH_GT = CCMASK_2;
+const unsigned CCMASK_ARITH_OVERFLOW = CCMASK_3;
+const unsigned CCMASK_ARITH = CCMASK_ANY;
+
+// Condition-code mask assignments for logical operations.
+const unsigned CCMASK_LOGICAL_ZERO = CCMASK_0 | CCMASK_2;
+const unsigned CCMASK_LOGICAL_NONZERO = CCMASK_1 | CCMASK_2;
+const unsigned CCMASK_LOGICAL_CARRY = CCMASK_2 | CCMASK_3;
+const unsigned CCMASK_LOGICAL_NOCARRY = CCMASK_0 | CCMASK_1;
+const unsigned CCMASK_LOGICAL_BORROW = CCMASK_LOGICAL_NOCARRY;
+const unsigned CCMASK_LOGICAL_NOBORROW = CCMASK_LOGICAL_CARRY;
+const unsigned CCMASK_LOGICAL = CCMASK_ANY;
+
// Condition-code mask assignments for CS.
const unsigned CCMASK_CS_EQ = CCMASK_0;
const unsigned CCMASK_CS_NE = CCMASK_1;
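
A minimal standalone sketch of how the new logical-operation masks are consumed, assuming the usual SystemZ encoding in which CC value c owns bit (8 >> c) of a 4-bit mask; the add-logical CC assignment follows the architecture definition (0/1 no carry, 2/3 carry; even values mean a zero result):

    #include <cstdint>
    #include <cstdio>

    constexpr unsigned CCMASK_0 = 1 << 3, CCMASK_1 = 1 << 2,
                       CCMASK_2 = 1 << 1, CCMASK_3 = 1 << 0;
    constexpr unsigned CCMASK_LOGICAL_CARRY = CCMASK_2 | CCMASK_3;

    // CC produced by a 32-bit add logical.
    static unsigned ccOfAddLogical(uint32_t A, uint32_t B) {
      uint64_t Sum = (uint64_t)A + B;
      bool Carry = (Sum >> 32) != 0;
      bool Zero = (uint32_t)Sum == 0;
      return (Carry ? 2u : 0u) + (Zero ? 0u : 1u);
    }

    int main() {
      unsigned CC = ccOfAddLogical(0xffffffffu, 1);           // wraps to zero: CC 2
      bool Taken = ((8u >> CC) & CCMASK_LOGICAL_CARRY) != 0;  // branch on carry
      std::printf("cc=%u branch-on-carry taken=%d\n", CC, Taken);
    }
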
diff --git a/lib/Target/SystemZ/SystemZ.td b/lib/Target/SystemZ/SystemZ.td
index 06905fb41e44..3800f7a26b79 100644
--- a/lib/Target/SystemZ/SystemZ.td
+++ b/lib/Target/SystemZ/SystemZ.td
@@ -75,4 +75,5 @@ def SystemZAsmParser : AsmParser {
def SystemZ : Target {
let InstructionSet = SystemZInstrInfo;
let AssemblyParsers = [SystemZAsmParser];
+ let AllowRegisterRenaming = 1;
}
diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index b39245b20b3c..bd99fabb48c9 100644
--- a/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -460,6 +460,14 @@ void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
break;
+ case TargetOpcode::STACKMAP:
+ LowerSTACKMAP(*MI);
+ return;
+
+ case TargetOpcode::PATCHPOINT:
+ LowerPATCHPOINT(*MI, Lower);
+ return;
+
default:
Lower.lower(MI, LoweredMI);
break;
@@ -467,6 +475,123 @@ void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
EmitToStreamer(*OutStreamer, LoweredMI);
}
+
+// Emit the largest nop instruction smaller than or equal to NumBytes
+// bytes. Return the size of nop emitted.
+static unsigned EmitNop(MCContext &OutContext, MCStreamer &OutStreamer,
+ unsigned NumBytes, const MCSubtargetInfo &STI) {
+ if (NumBytes < 2) {
+ llvm_unreachable("Zero nops?");
+ return 0;
+ }
+ else if (NumBytes < 4) {
+ OutStreamer.EmitInstruction(MCInstBuilder(SystemZ::BCRAsm)
+ .addImm(0).addReg(SystemZ::R0D), STI);
+ return 2;
+ }
+ else if (NumBytes < 6) {
+ OutStreamer.EmitInstruction(MCInstBuilder(SystemZ::BCAsm)
+ .addImm(0).addReg(0).addImm(0).addReg(0),
+ STI);
+ return 4;
+ }
+ else {
+ MCSymbol *DotSym = OutContext.createTempSymbol();
+ const MCSymbolRefExpr *Dot = MCSymbolRefExpr::create(DotSym, OutContext);
+ OutStreamer.EmitInstruction(MCInstBuilder(SystemZ::BRCLAsm)
+ .addImm(0).addExpr(Dot), STI);
+ OutStreamer.EmitLabel(DotSym);
+ return 6;
+ }
+}
+
+void SystemZAsmPrinter::LowerSTACKMAP(const MachineInstr &MI) {
+ const SystemZInstrInfo *TII =
+ static_cast<const SystemZInstrInfo *>(MF->getSubtarget().getInstrInfo());
+
+ unsigned NumNOPBytes = MI.getOperand(1).getImm();
+
+ SM.recordStackMap(MI);
+ assert(NumNOPBytes % 2 == 0 && "Invalid number of NOP bytes requested!");
+
+ // Scan ahead to trim the shadow.
+ unsigned ShadowBytes = 0;
+ const MachineBasicBlock &MBB = *MI.getParent();
+ MachineBasicBlock::const_iterator MII(MI);
+ ++MII;
+ while (ShadowBytes < NumNOPBytes) {
+ if (MII == MBB.end() ||
+ MII->getOpcode() == TargetOpcode::PATCHPOINT ||
+ MII->getOpcode() == TargetOpcode::STACKMAP)
+ break;
+ ShadowBytes += TII->getInstSizeInBytes(*MII);
+ if (MII->isCall())
+ break;
+ ++MII;
+ }
+
+ // Emit nops.
+ while (ShadowBytes < NumNOPBytes)
+ ShadowBytes += EmitNop(OutContext, *OutStreamer, NumNOPBytes - ShadowBytes,
+ getSubtargetInfo());
+}
+
+// Lower a patchpoint of the form:
+// [<def>], <id>, <numBytes>, <target>, <numArgs>
+void SystemZAsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
+ SystemZMCInstLower &Lower) {
+ SM.recordPatchPoint(MI);
+ PatchPointOpers Opers(&MI);
+
+ unsigned EncodedBytes = 0;
+ const MachineOperand &CalleeMO = Opers.getCallTarget();
+
+ if (CalleeMO.isImm()) {
+ uint64_t CallTarget = CalleeMO.getImm();
+ if (CallTarget) {
+ unsigned ScratchIdx = -1;
+ unsigned ScratchReg = 0;
+ do {
+ ScratchIdx = Opers.getNextScratchIdx(ScratchIdx + 1);
+ ScratchReg = MI.getOperand(ScratchIdx).getReg();
+ } while (ScratchReg == SystemZ::R0D);
+
+ // Materialize the call target address
+ EmitToStreamer(*OutStreamer, MCInstBuilder(SystemZ::LLILF)
+ .addReg(ScratchReg)
+ .addImm(CallTarget & 0xFFFFFFFF));
+ EncodedBytes += 6;
+ if (CallTarget >> 32) {
+ EmitToStreamer(*OutStreamer, MCInstBuilder(SystemZ::IIHF)
+ .addReg(ScratchReg)
+ .addImm(CallTarget >> 32));
+ EncodedBytes += 6;
+ }
+
+ EmitToStreamer(*OutStreamer, MCInstBuilder(SystemZ::BASR)
+ .addReg(SystemZ::R14D)
+ .addReg(ScratchReg));
+ EncodedBytes += 2;
+ }
+ } else if (CalleeMO.isGlobal()) {
+ const MCExpr *Expr = Lower.getExpr(CalleeMO, MCSymbolRefExpr::VK_PLT);
+ EmitToStreamer(*OutStreamer, MCInstBuilder(SystemZ::BRASL)
+ .addReg(SystemZ::R14D)
+ .addExpr(Expr));
+ EncodedBytes += 6;
+ }
+
+ // Emit padding.
+ unsigned NumBytes = Opers.getNumPatchBytes();
+ assert(NumBytes >= EncodedBytes &&
+ "Patchpoint can't request size less than the length of a call.");
+ assert((NumBytes - EncodedBytes) % 2 == 0 &&
+ "Invalid number of NOP bytes requested!");
+ while (EncodedBytes < NumBytes)
+ EncodedBytes += EmitNop(OutContext, *OutStreamer, NumBytes - EncodedBytes,
+ getSubtargetInfo());
+}
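
A minimal standalone sketch of the padding loop both lowerings rely on: pick the largest SystemZ nop (2-byte BCR, 4-byte BC, 6-byte BRCL) that still fits the remaining even-sized gap; the byte count is illustrative:

    #include <cassert>
    #include <cstdio>
    #include <vector>

    // Largest nop not exceeding the remaining padding.
    static unsigned nopSize(unsigned Remaining) {
      assert(Remaining >= 2 && Remaining % 2 == 0 && "padding is in halfwords");
      return Remaining >= 6 ? 6 : Remaining >= 4 ? 4 : 2;
    }

    int main() {
      unsigned NumBytes = 14;                  // hypothetical patchpoint shadow
      std::vector<unsigned> Nops;
      for (unsigned Emitted = 0; Emitted < NumBytes;) {
        unsigned S = nopSize(NumBytes - Emitted);
        Nops.push_back(S);
        Emitted += S;
      }
      for (unsigned S : Nops)
        std::printf("%u-byte nop\n", S);       // 6, 6, 2
    }
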
+
// Convert a SystemZ-specific constant pool modifier into the associated
// MCSymbolRefExpr variant kind.
static MCSymbolRefExpr::VariantKind
@@ -521,6 +646,10 @@ bool SystemZAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
return false;
}
+void SystemZAsmPrinter::EmitEndOfAsmFile(Module &M) {
+ SM.serializeToStackMapSection();
+}
+
// Force static initialization.
extern "C" void LLVMInitializeSystemZAsmPrinter() {
RegisterAsmPrinter<SystemZAsmPrinter> X(getTheSystemZTarget());
diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.h b/lib/Target/SystemZ/SystemZAsmPrinter.h
index fe8c88fe23e3..cb88ec32f83a 100644
--- a/lib/Target/SystemZ/SystemZAsmPrinter.h
+++ b/lib/Target/SystemZ/SystemZAsmPrinter.h
@@ -11,7 +11,9 @@
#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZASMPRINTER_H
#include "SystemZTargetMachine.h"
+#include "SystemZMCInstLower.h"
#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/StackMaps.h"
#include "llvm/Support/Compiler.h"
namespace llvm {
@@ -22,20 +24,33 @@ class Module;
class raw_ostream;
class LLVM_LIBRARY_VISIBILITY SystemZAsmPrinter : public AsmPrinter {
+private:
+ StackMaps SM;
+
public:
SystemZAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
- : AsmPrinter(TM, std::move(Streamer)) {}
+ : AsmPrinter(TM, std::move(Streamer)), SM(*this) {}
// Override AsmPrinter.
StringRef getPassName() const override { return "SystemZ Assembly Printer"; }
void EmitInstruction(const MachineInstr *MI) override;
void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) override;
+ void EmitEndOfAsmFile(Module &M) override;
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
unsigned AsmVariant, const char *ExtraCode,
raw_ostream &OS) override;
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
unsigned AsmVariant, const char *ExtraCode,
raw_ostream &OS) override;
+
+ bool doInitialization(Module &M) override {
+ SM.reset();
+ return AsmPrinter::doInitialization(M);
+ }
+
+private:
+ void LowerSTACKMAP(const MachineInstr &MI);
+ void LowerPATCHPOINT(const MachineInstr &MI, SystemZMCInstLower &Lower);
};
} // end namespace llvm
diff --git a/lib/Target/SystemZ/SystemZCallingConv.td b/lib/Target/SystemZ/SystemZCallingConv.td
index 2bf5ac29865f..deba27fee7fe 100644
--- a/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/lib/Target/SystemZ/SystemZCallingConv.td
@@ -120,3 +120,12 @@ def CSR_SystemZ : CalleeSavedRegs<(add (sequence "R%dD", 6, 15),
// R9 is used to return SwiftError; remove it from CSR.
def CSR_SystemZ_SwiftError : CalleeSavedRegs<(sub CSR_SystemZ, R9D)>;
+
+// "All registers" as used by the AnyReg calling convention.
+// Note that registers 0 and 1 are still defined as intra-call scratch
+// registers that may be clobbered e.g. by PLT stubs.
+def CSR_SystemZ_AllRegs : CalleeSavedRegs<(add (sequence "R%dD", 2, 15),
+ (sequence "F%dD", 0, 15))>;
+def CSR_SystemZ_AllRegs_Vector : CalleeSavedRegs<(add (sequence "R%dD", 2, 15),
+ (sequence "V%d", 0, 31))>;
+
diff --git a/lib/Target/SystemZ/SystemZElimCompare.cpp b/lib/Target/SystemZ/SystemZElimCompare.cpp
index 55f7a7b8d0d1..9edd1fc36406 100644
--- a/lib/Target/SystemZ/SystemZElimCompare.cpp
+++ b/lib/Target/SystemZ/SystemZElimCompare.cpp
@@ -86,9 +86,11 @@ private:
SmallVectorImpl<MachineInstr *> &CCUsers);
bool convertToLoadAndTrap(MachineInstr &MI, MachineInstr &Compare,
SmallVectorImpl<MachineInstr *> &CCUsers);
- bool convertToLoadAndTest(MachineInstr &MI);
+ bool convertToLoadAndTest(MachineInstr &MI, MachineInstr &Compare,
+ SmallVectorImpl<MachineInstr *> &CCUsers);
bool adjustCCMasksForInstr(MachineInstr &MI, MachineInstr &Compare,
- SmallVectorImpl<MachineInstr *> &CCUsers);
+ SmallVectorImpl<MachineInstr *> &CCUsers,
+ unsigned ConvOpc = 0);
bool optimizeCompareZero(MachineInstr &Compare,
SmallVectorImpl<MachineInstr *> &CCUsers);
bool fuseCompareOperations(MachineInstr &Compare,
@@ -282,26 +284,37 @@ bool SystemZElimCompare::convertToLoadAndTrap(
// If MI is a load instruction, try to convert it into a LOAD AND TEST.
// Return true on success.
-bool SystemZElimCompare::convertToLoadAndTest(MachineInstr &MI) {
+bool SystemZElimCompare::convertToLoadAndTest(
+ MachineInstr &MI, MachineInstr &Compare,
+ SmallVectorImpl<MachineInstr *> &CCUsers) {
+
+ // Try to adjust CC masks for the LOAD AND TEST opcode that could replace MI.
unsigned Opcode = TII->getLoadAndTest(MI.getOpcode());
- if (!Opcode)
+ if (!Opcode || !adjustCCMasksForInstr(MI, Compare, CCUsers, Opcode))
return false;
- MI.setDesc(TII->get(Opcode));
- MachineInstrBuilder(*MI.getParent()->getParent(), MI)
- .addReg(SystemZ::CC, RegState::ImplicitDefine);
+ // Rebuild to get the CC operand in the right place.
+ MachineInstr *BuiltMI =
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opcode));
+ for (const auto &MO : MI.operands())
+ BuiltMI->addOperand(MO);
+ BuiltMI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ MI.eraseFromParent();
+
return true;
}
// The CC users in CCUsers are testing the result of a comparison of some
-// value X against zero and we know that any CC value produced by MI
-// would also reflect the value of X. Try to adjust CCUsers so that
-// they test the result of MI directly, returning true on success.
-// Leave everything unchanged on failure.
+// value X against zero and we know that any CC value produced by MI would
+// also reflect the value of X. ConvOpc may be used to pass the transformed
+// opcode MI will have if this succeeds. Try to adjust CCUsers so that they
+// test the result of MI directly, returning true on success. Leave
+// everything unchanged on failure.
bool SystemZElimCompare::adjustCCMasksForInstr(
MachineInstr &MI, MachineInstr &Compare,
- SmallVectorImpl<MachineInstr *> &CCUsers) {
- int Opcode = MI.getOpcode();
+ SmallVectorImpl<MachineInstr *> &CCUsers,
+ unsigned ConvOpc) {
+ int Opcode = (ConvOpc ? ConvOpc : MI.getOpcode());
const MCInstrDesc &Desc = TII->get(Opcode);
unsigned MIFlags = Desc.TSFlags;
@@ -319,53 +332,72 @@ bool SystemZElimCompare::adjustCCMasksForInstr(
unsigned CCValues = SystemZII::getCCValues(MIFlags);
assert((ReusableCCMask & ~CCValues) == 0 && "Invalid CCValues");
- // Now check whether these flags are enough for all users.
- SmallVector<MachineOperand *, 4> AlterMasks;
- for (unsigned int I = 0, E = CCUsers.size(); I != E; ++I) {
- MachineInstr *MI = CCUsers[I];
-
- // Fail if this isn't a use of CC that we understand.
- unsigned Flags = MI->getDesc().TSFlags;
- unsigned FirstOpNum;
- if (Flags & SystemZII::CCMaskFirst)
- FirstOpNum = 0;
- else if (Flags & SystemZII::CCMaskLast)
- FirstOpNum = MI->getNumExplicitOperands() - 2;
- else
- return false;
-
- // Check whether the instruction predicate treats all CC values
- // outside of ReusableCCMask in the same way. In that case it
- // doesn't matter what those CC values mean.
- unsigned CCValid = MI->getOperand(FirstOpNum).getImm();
- unsigned CCMask = MI->getOperand(FirstOpNum + 1).getImm();
- unsigned OutValid = ~ReusableCCMask & CCValid;
- unsigned OutMask = ~ReusableCCMask & CCMask;
- if (OutMask != 0 && OutMask != OutValid)
- return false;
+ bool MIEquivalentToCmp =
+ (ReusableCCMask == CCValues &&
+ CCValues == SystemZII::getCCValues(CompareFlags));
+
+ if (!MIEquivalentToCmp) {
+ // Now check whether these flags are enough for all users.
+ SmallVector<MachineOperand *, 4> AlterMasks;
+ for (unsigned int I = 0, E = CCUsers.size(); I != E; ++I) {
+ MachineInstr *MI = CCUsers[I];
+
+ // Fail if this isn't a use of CC that we understand.
+ unsigned Flags = MI->getDesc().TSFlags;
+ unsigned FirstOpNum;
+ if (Flags & SystemZII::CCMaskFirst)
+ FirstOpNum = 0;
+ else if (Flags & SystemZII::CCMaskLast)
+ FirstOpNum = MI->getNumExplicitOperands() - 2;
+ else
+ return false;
+
+ // Check whether the instruction predicate treats all CC values
+ // outside of ReusableCCMask in the same way. In that case it
+ // doesn't matter what those CC values mean.
+ unsigned CCValid = MI->getOperand(FirstOpNum).getImm();
+ unsigned CCMask = MI->getOperand(FirstOpNum + 1).getImm();
+ unsigned OutValid = ~ReusableCCMask & CCValid;
+ unsigned OutMask = ~ReusableCCMask & CCMask;
+ if (OutMask != 0 && OutMask != OutValid)
+ return false;
+
+ AlterMasks.push_back(&MI->getOperand(FirstOpNum));
+ AlterMasks.push_back(&MI->getOperand(FirstOpNum + 1));
+ }
- AlterMasks.push_back(&MI->getOperand(FirstOpNum));
- AlterMasks.push_back(&MI->getOperand(FirstOpNum + 1));
+ // All users are OK. Adjust the masks for MI.
+ for (unsigned I = 0, E = AlterMasks.size(); I != E; I += 2) {
+ AlterMasks[I]->setImm(CCValues);
+ unsigned CCMask = AlterMasks[I + 1]->getImm();
+ if (CCMask & ~ReusableCCMask)
+ AlterMasks[I + 1]->setImm((CCMask & ReusableCCMask) |
+ (CCValues & ~ReusableCCMask));
+ }
}
- // All users are OK. Adjust the masks for MI.
- for (unsigned I = 0, E = AlterMasks.size(); I != E; I += 2) {
- AlterMasks[I]->setImm(CCValues);
- unsigned CCMask = AlterMasks[I + 1]->getImm();
- if (CCMask & ~ReusableCCMask)
- AlterMasks[I + 1]->setImm((CCMask & ReusableCCMask) |
- (CCValues & ~ReusableCCMask));
+ // CC is now live after MI.
+ if (!ConvOpc) {
+ int CCDef = MI.findRegisterDefOperandIdx(SystemZ::CC, false, true, TRI);
+ assert(CCDef >= 0 && "Couldn't find CC set");
+ MI.getOperand(CCDef).setIsDead(false);
}
- // CC is now live after MI.
- int CCDef = MI.findRegisterDefOperandIdx(SystemZ::CC, false, true, TRI);
- assert(CCDef >= 0 && "Couldn't find CC set");
- MI.getOperand(CCDef).setIsDead(false);
+ // Check if MI lies before Compare.
+ bool BeforeCmp = false;
+ MachineBasicBlock::iterator MBBI = MI, MBBE = MI.getParent()->end();
+ for (++MBBI; MBBI != MBBE; ++MBBI)
+ if (MBBI == Compare) {
+ BeforeCmp = true;
+ break;
+ }
// Clear any intervening kills of CC.
- MachineBasicBlock::iterator MBBI = MI, MBBE = Compare;
- for (++MBBI; MBBI != MBBE; ++MBBI)
- MBBI->clearRegisterKills(SystemZ::CC, TRI);
+ if (BeforeCmp) {
+ MachineBasicBlock::iterator MBBI = MI, MBBE = Compare;
+ for (++MBBI; MBBI != MBBE; ++MBBI)
+ MBBI->clearRegisterKills(SystemZ::CC, TRI);
+ }
return true;
}
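
A minimal standalone sketch of the per-user check and rewrite kept inside the new !MIEquivalentToCmp path above: a CC user may only distinguish values inside ReusableCCMask, after which its valid/mask immediates are retargeted at the producer's CC values; the sample masks are illustrative:

    #include <cstdio>
    #include <vector>

    struct CCUse { unsigned Valid, Mask; };   // per-user CC-valid and CC-mask imms

    static bool adjustMasks(std::vector<CCUse> &Users, unsigned ReusableCCMask,
                            unsigned CCValues) {
      for (const CCUse &U : Users) {
        unsigned OutValid = ~ReusableCCMask & U.Valid;
        unsigned OutMask = ~ReusableCCMask & U.Mask;
        if (OutMask != 0 && OutMask != OutValid)
          return false;                       // user cares about unavailable values
      }
      for (CCUse &U : Users) {
        unsigned NewMask = U.Mask & ReusableCCMask;
        if (U.Mask & ~ReusableCCMask)
          NewMask |= CCValues & ~ReusableCCMask;
        U.Valid = CCValues;
        U.Mask = NewMask;
      }
      return true;
    }

    int main() {
      std::vector<CCUse> Users = {{0xE, 0x8}};   // hypothetical single branch user
      bool OK = adjustMasks(Users, /*ReusableCCMask=*/0xC, /*CCValues=*/0xE);
      std::printf("adjusted=%d valid=%#x mask=%#x\n", OK, Users[0].Valid, Users[0].Mask);
    }
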
@@ -398,12 +430,12 @@ bool SystemZElimCompare::optimizeCompareZero(
// Search back for CC results that are based on the first operand.
unsigned SrcReg = getCompareSourceReg(Compare);
MachineBasicBlock &MBB = *Compare.getParent();
- MachineBasicBlock::iterator MBBI = Compare, MBBE = MBB.begin();
Reference CCRefs;
Reference SrcRefs;
- while (MBBI != MBBE) {
- --MBBI;
- MachineInstr &MI = *MBBI;
+ for (MachineBasicBlock::reverse_iterator MBBI =
+ std::next(MachineBasicBlock::reverse_iterator(&Compare)),
+ MBBE = MBB.rend(); MBBI != MBBE;) {
+ MachineInstr &MI = *MBBI++;
if (resultTests(MI, SrcReg)) {
// Try to remove both MI and Compare by converting a branch to BRCT(G).
// or a load-and-trap instruction. We don't care in this case whether
@@ -419,7 +451,7 @@ bool SystemZElimCompare::optimizeCompareZero(
}
}
// Try to eliminate Compare by reusing a CC result from MI.
- if ((!CCRefs && convertToLoadAndTest(MI)) ||
+ if ((!CCRefs && convertToLoadAndTest(MI, Compare, CCUsers)) ||
(!CCRefs.Def && adjustCCMasksForInstr(MI, Compare, CCUsers))) {
EliminatedComparisons += 1;
return true;
@@ -434,17 +466,15 @@ bool SystemZElimCompare::optimizeCompareZero(
}
// Also do a forward search to handle cases where an instruction after the
- // compare can be converted like
- //
- // LTEBRCompare %f0s, %f0s, implicit-def %cc LTEBRCompare %f0s, %f0s,
- // implicit-def %cc %f2s = LER %f0s
- //
- MBBI = Compare, MBBE = MBB.end();
- while (++MBBI != MBBE) {
- MachineInstr &MI = *MBBI;
+ // compare can be converted, like
+ // LTEBRCompare %f0s, %f0s; %f2s = LER %f0s => LTEBRCompare %f2s, %f0s
+ for (MachineBasicBlock::iterator MBBI =
+ std::next(MachineBasicBlock::iterator(&Compare)), MBBE = MBB.end();
+ MBBI != MBBE;) {
+ MachineInstr &MI = *MBBI++;
if (preservesValueOf(MI, SrcReg)) {
// Try to eliminate Compare by reusing a CC result from MI.
- if (convertToLoadAndTest(MI)) {
+ if (convertToLoadAndTest(MI, Compare, CCUsers)) {
EliminatedComparisons += 1;
return true;
}
diff --git a/lib/Target/SystemZ/SystemZExpandPseudo.cpp b/lib/Target/SystemZ/SystemZExpandPseudo.cpp
index d02db9a617a3..67c80899d491 100644
--- a/lib/Target/SystemZ/SystemZExpandPseudo.cpp
+++ b/lib/Target/SystemZ/SystemZExpandPseudo.cpp
@@ -55,7 +55,7 @@ char SystemZExpandPseudo::ID = 0;
INITIALIZE_PASS(SystemZExpandPseudo, "systemz-expand-pseudo",
SYSTEMZ_EXPAND_PSEUDO_NAME, false, false)
-/// \brief Returns an instance of the pseudo instruction expansion pass.
+/// Returns an instance of the pseudo instruction expansion pass.
FunctionPass *llvm::createSystemZExpandPseudoPass(SystemZTargetMachine &TM) {
return new SystemZExpandPseudo();
}
@@ -112,7 +112,7 @@ bool SystemZExpandPseudo::expandLOCRMux(MachineBasicBlock &MBB,
return true;
}
-/// \brief If MBBI references a pseudo instruction that should be expanded here,
+/// If MBBI references a pseudo instruction that should be expanded here,
/// do the expansion and return true. Otherwise return false.
bool SystemZExpandPseudo::expandMI(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
@@ -127,7 +127,7 @@ bool SystemZExpandPseudo::expandMI(MachineBasicBlock &MBB,
return false;
}
-/// \brief Iterate over the instructions in basic block MBB and expand any
+/// Iterate over the instructions in basic block MBB and expand any
/// pseudo instructions. Return true if anything was modified.
bool SystemZExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
bool Modified = false;
diff --git a/lib/Target/SystemZ/SystemZFeatures.td b/lib/Target/SystemZ/SystemZFeatures.td
index fda9c30fe3fc..beff45dba81d 100644
--- a/lib/Target/SystemZ/SystemZFeatures.td
+++ b/lib/Target/SystemZ/SystemZFeatures.td
@@ -62,6 +62,7 @@ def FeatureLoadStoreOnCond : SystemZFeature<
"load-store-on-cond", "LoadStoreOnCond",
"Assume that the load/store-on-condition facility is installed"
>;
+def FeatureNoLoadStoreOnCond : SystemZMissingFeature<"LoadStoreOnCond">;
def FeaturePopulationCount : SystemZFeature<
"population-count", "PopulationCount",
diff --git a/lib/Target/SystemZ/SystemZFrameLowering.cpp b/lib/Target/SystemZ/SystemZFrameLowering.cpp
index b600aa61cd0b..565299c90139 100644
--- a/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -204,7 +204,7 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
addSavedGPR(MBB, MIB, SystemZ::ArgGPRs[I], true);
}
- // Save FPRs in the normal TargetInstrInfo way.
+ // Save FPRs/VRs in the normal TargetInstrInfo way.
for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
unsigned Reg = CSI[I].getReg();
if (SystemZ::FP64BitRegClass.contains(Reg)) {
@@ -212,6 +212,11 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
TII->storeRegToStackSlot(MBB, MBBI, Reg, true, CSI[I].getFrameIdx(),
&SystemZ::FP64BitRegClass, TRI);
}
+ if (SystemZ::VR128BitRegClass.contains(Reg)) {
+ MBB.addLiveIn(Reg);
+ TII->storeRegToStackSlot(MBB, MBBI, Reg, true, CSI[I].getFrameIdx(),
+ &SystemZ::VR128BitRegClass, TRI);
+ }
}
return true;
@@ -231,12 +236,15 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
bool HasFP = hasFP(MF);
DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
- // Restore FPRs in the normal TargetInstrInfo way.
+ // Restore FPRs/VRs in the normal TargetInstrInfo way.
for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
unsigned Reg = CSI[I].getReg();
if (SystemZ::FP64BitRegClass.contains(Reg))
TII->loadRegFromStackSlot(MBB, MBBI, Reg, CSI[I].getFrameIdx(),
&SystemZ::FP64BitRegClass, TRI);
+ if (SystemZ::VR128BitRegClass.contains(Reg))
+ TII->loadRegFromStackSlot(MBB, MBBI, Reg, CSI[I].getFrameIdx(),
+ &SystemZ::VR128BitRegClass, TRI);
}
// Restore call-saved GPRs (but not call-clobbered varargs, which at
@@ -371,7 +379,15 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF,
}
}
- uint64_t StackSize = getAllocatedStackSize(MF);
+ uint64_t StackSize = MFFrame.getStackSize();
+ // We need to allocate the ABI-defined 160-byte base area whenever
+ // we allocate stack space for our own use and whenever we call another
+ // function.
+ if (StackSize || MFFrame.hasVarSizedObjects() || MFFrame.hasCalls()) {
+ StackSize += SystemZMC::CallFrameSize;
+ MFFrame.setStackSize(StackSize);
+ }
+
if (StackSize) {
// Determine if we want to store a backchain.
bool StoreBackchain = MF.getFunction().hasFnAttribute("backchain");
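
A minimal standalone sketch of the sizing rule applied just above: the ABI-defined 160-byte base area is added whenever the function has locals or spills, dynamic allocas, or makes a call; the sample sizes are illustrative:

    #include <cstdint>
    #include <cstdio>

    constexpr uint64_t CallFrameSize = 160;   // ABI-defined base area

    static uint64_t adjustedStackSize(uint64_t Locals, bool HasVarSized,
                                      bool HasCalls) {
      if (Locals || HasVarSized || HasCalls)
        return Locals + CallFrameSize;
      return 0;
    }

    int main() {
      std::printf("%llu\n", (unsigned long long)adjustedStackSize(0, false, false));  // 0
      std::printf("%llu\n", (unsigned long long)adjustedStackSize(40, false, false)); // 200
      std::printf("%llu\n", (unsigned long long)adjustedStackSize(0, false, true));   // 160
    }
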
@@ -417,7 +433,7 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF,
I->addLiveIn(SystemZ::R11D);
}
- // Skip over the FPR saves.
+ // Skip over the FPR/VR saves.
SmallVector<unsigned, 8> CFIIndexes;
for (auto &Save : CSI) {
unsigned Reg = Save.getReg();
@@ -428,19 +444,26 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF,
++MBBI;
else
llvm_unreachable("Couldn't skip over FPR save");
+ } else if (SystemZ::VR128BitRegClass.contains(Reg)) {
+ if (MBBI != MBB.end() &&
+ MBBI->getOpcode() == SystemZ::VST)
+ ++MBBI;
+ else
+ llvm_unreachable("Couldn't skip over VR save");
+ } else
+ continue;
- // Add CFI for the this save.
- unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
- unsigned IgnoredFrameReg;
- int64_t Offset =
- getFrameIndexReference(MF, Save.getFrameIdx(), IgnoredFrameReg);
+ // Add CFI for this save.
+ unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
+ unsigned IgnoredFrameReg;
+ int64_t Offset =
+ getFrameIndexReference(MF, Save.getFrameIdx(), IgnoredFrameReg);
- unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
nullptr, DwarfReg, SPOffsetFromCFA + Offset));
- CFIIndexes.push_back(CFIIndex);
- }
+ CFIIndexes.push_back(CFIIndex);
}
- // Complete the CFI for the FPR saves, modelling them as taking effect
+ // Complete the CFI for the FPR/VR saves, modelling them as taking effect
// after the last save.
for (auto CFIIndex : CFIIndexes) {
BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
@@ -454,11 +477,12 @@ void SystemZFrameLowering::emitEpilogue(MachineFunction &MF,
auto *ZII =
static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
+ MachineFrameInfo &MFFrame = MF.getFrameInfo();
// Skip the return instruction.
assert(MBBI->isReturn() && "Can only insert epilogue into returning blocks");
- uint64_t StackSize = getAllocatedStackSize(MF);
+ uint64_t StackSize = MFFrame.getStackSize();
if (ZFI->getLowSavedGPR()) {
--MBBI;
unsigned Opcode = MBBI->getOpcode();
@@ -495,46 +519,6 @@ bool SystemZFrameLowering::hasFP(const MachineFunction &MF) const {
MF.getInfo<SystemZMachineFunctionInfo>()->getManipulatesSP());
}
-int SystemZFrameLowering::getFrameIndexReference(const MachineFunction &MF,
- int FI,
- unsigned &FrameReg) const {
- const MachineFrameInfo &MFFrame = MF.getFrameInfo();
- const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
-
- // Fill in FrameReg output argument.
- FrameReg = RI->getFrameRegister(MF);
-
- // Start with the offset of FI from the top of the caller-allocated frame
- // (i.e. the top of the 160 bytes allocated by the caller). This initial
- // offset is therefore negative.
- int64_t Offset = (MFFrame.getObjectOffset(FI) +
- MFFrame.getOffsetAdjustment());
-
- // Make the offset relative to the incoming stack pointer.
- Offset -= getOffsetOfLocalArea();
-
- // Make the offset relative to the bottom of the frame.
- Offset += getAllocatedStackSize(MF);
-
- return Offset;
-}
-
-uint64_t SystemZFrameLowering::
-getAllocatedStackSize(const MachineFunction &MF) const {
- const MachineFrameInfo &MFFrame = MF.getFrameInfo();
-
- // Start with the size of the local variables and spill slots.
- uint64_t StackSize = MFFrame.getStackSize();
-
- // We need to allocate the ABI-defined 160-byte base area whenever
- // we allocate stack space for our own use and whenever we call another
- // function.
- if (StackSize || MFFrame.hasVarSizedObjects() || MFFrame.hasCalls())
- StackSize += SystemZMC::CallFrameSize;
-
- return StackSize;
-}
-
bool
SystemZFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
// The ABI requires us to allocate 160 bytes of stack space for the callee,
diff --git a/lib/Target/SystemZ/SystemZFrameLowering.h b/lib/Target/SystemZ/SystemZFrameLowering.h
index a75d111b0294..08c84c785cc0 100644
--- a/lib/Target/SystemZ/SystemZFrameLowering.h
+++ b/lib/Target/SystemZ/SystemZFrameLowering.h
@@ -43,16 +43,11 @@ public:
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
bool hasFP(const MachineFunction &MF) const override;
- int getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;
MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const override;
- // Return the number of bytes in the callee-allocated part of the frame.
- uint64_t getAllocatedStackSize(const MachineFunction &MF) const;
-
// Return the byte offset from the incoming stack pointer of Reg's
// ABI-defined save slot. Return 0 if no slot is defined for Reg.
unsigned getRegSpillOffset(unsigned Reg) const {
diff --git a/lib/Target/SystemZ/SystemZHazardRecognizer.cpp b/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
index f37216022762..d300d1d88abc 100644
--- a/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
+++ b/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
@@ -59,10 +59,18 @@ getNumDecoderSlots(SUnit *SU) const {
return 1; // Normal instruction
}
-unsigned SystemZHazardRecognizer::getCurrCycleIdx() {
+unsigned SystemZHazardRecognizer::getCurrCycleIdx(SUnit *SU) const {
unsigned Idx = CurrGroupSize;
if (GrpCount % 2)
Idx += 3;
+
+ if (SU != nullptr && !fitsIntoCurrentGroup(SU)) {
+ if (Idx == 1 || Idx == 2)
+ Idx = 3;
+ else if (Idx == 4 || Idx == 5)
+ Idx = 0;
+ }
+
return Idx;
}
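
A minimal standalone sketch of the slot-index computation above: z13 decodes two three-slot groups per cycle (slots 0-2 and 3-5), and a candidate that cannot join the current group is charged to the first slot of the next group:

    #include <cstdio>

    static unsigned currCycleIdx(unsigned CurrGroupSize, unsigned GrpCount,
                                 bool FitsCurrentGroup) {
      unsigned Idx = CurrGroupSize;
      if (GrpCount % 2)
        Idx += 3;                 // second group of the cycle
      if (!FitsCurrentGroup) {
        if (Idx == 1 || Idx == 2)
          Idx = 3;                // snap to the next group in this cycle
        else if (Idx == 4 || Idx == 5)
          Idx = 0;                // wrap into the next cycle
      }
      return Idx;
    }

    int main() {
      std::printf("%u\n", currCycleIdx(2, 0, true));   // 2: fits the current group
      std::printf("%u\n", currCycleIdx(2, 0, false));  // 3: starts the second group
      std::printf("%u\n", currCycleIdx(2, 1, false));  // 0: wraps to the next cycle
    }
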
@@ -77,7 +85,7 @@ void SystemZHazardRecognizer::Reset() {
GrpCount = 0;
LastFPdOpCycleIdx = UINT_MAX;
LastEmittedMI = nullptr;
- DEBUG(CurGroupDbg = "";);
+ LLVM_DEBUG(CurGroupDbg = "";);
}
bool
@@ -100,30 +108,30 @@ SystemZHazardRecognizer::fitsIntoCurrentGroup(SUnit *SU) const {
return true;
}
-void SystemZHazardRecognizer::nextGroup(bool DbgOutput) {
- if (CurrGroupSize > 0) {
- DEBUG(dumpCurrGroup("Completed decode group"));
- DEBUG(CurGroupDbg = "";);
+void SystemZHazardRecognizer::nextGroup() {
+ if (CurrGroupSize == 0)
+ return;
+
+ LLVM_DEBUG(dumpCurrGroup("Completed decode group"));
+ LLVM_DEBUG(CurGroupDbg = "";);
- GrpCount++;
+ GrpCount++;
- // Reset counter for next group.
- CurrGroupSize = 0;
+ // Reset counter for next group.
+ CurrGroupSize = 0;
- // Decrease counters for execution units by one.
- for (unsigned i = 0; i < SchedModel->getNumProcResourceKinds(); ++i)
- if (ProcResourceCounters[i] > 0)
- ProcResourceCounters[i]--;
+ // Decrease counters for execution units by one.
+ for (unsigned i = 0; i < SchedModel->getNumProcResourceKinds(); ++i)
+ if (ProcResourceCounters[i] > 0)
+ ProcResourceCounters[i]--;
- // Clear CriticalResourceIdx if it is now below the threshold.
- if (CriticalResourceIdx != UINT_MAX &&
- (ProcResourceCounters[CriticalResourceIdx] <=
- ProcResCostLim))
- CriticalResourceIdx = UINT_MAX;
- }
+ // Clear CriticalResourceIdx if it is now below the threshold.
+ if (CriticalResourceIdx != UINT_MAX &&
+ (ProcResourceCounters[CriticalResourceIdx] <=
+ ProcResCostLim))
+ CriticalResourceIdx = UINT_MAX;
- DEBUG(if (DbgOutput)
- dumpProcResourceCounters(););
+ LLVM_DEBUG(dumpState(););
}
#ifndef NDEBUG // Debug output
@@ -143,7 +151,11 @@ void SystemZHazardRecognizer::dumpSU(SUnit *SU, raw_ostream &OS) const {
std::string FU(PRD.Name);
// trim e.g. Z13_FXaUnit -> FXa
FU = FU.substr(FU.find("_") + 1);
- FU.resize(FU.find("Unit"));
+ size_t Pos = FU.find("Unit");
+ if (Pos != std::string::npos)
+ FU.resize(Pos);
+ if (FU == "LS") // LSUnit -> LSU
+ FU = "LSU";
OS << "/" << FU;
if (PI->Cycles > 1)
@@ -163,7 +175,7 @@ void SystemZHazardRecognizer::dumpSU(SUnit *SU, raw_ostream &OS) const {
}
void SystemZHazardRecognizer::dumpCurrGroup(std::string Msg) const {
- dbgs() << "+++ " << Msg;
+ dbgs() << "++ " << Msg;
dbgs() << ": ";
if (CurGroupDbg.empty())
@@ -188,15 +200,28 @@ void SystemZHazardRecognizer::dumpProcResourceCounters() const {
if (!any)
return;
- dbgs() << "+++ Resource counters:\n";
+ dbgs() << "++ | Resource counters: ";
for (unsigned i = 0; i < SchedModel->getNumProcResourceKinds(); ++i)
- if (ProcResourceCounters[i] > 0) {
- dbgs() << "+++ Extra schedule for execution unit "
- << SchedModel->getProcResource(i)->Name
- << ": " << ProcResourceCounters[i] << "\n";
- any = true;
- }
+ if (ProcResourceCounters[i] > 0)
+ dbgs() << SchedModel->getProcResource(i)->Name
+ << ":" << ProcResourceCounters[i] << " ";
+ dbgs() << "\n";
+
+ if (CriticalResourceIdx != UINT_MAX)
+ dbgs() << "++ | Critical resource: "
+ << SchedModel->getProcResource(CriticalResourceIdx)->Name
+ << "\n";
+}
+
+void SystemZHazardRecognizer::dumpState() const {
+ dumpCurrGroup("| Current decoder group");
+ dbgs() << "++ | Current cycle index: "
+ << getCurrCycleIdx() << "\n";
+ dumpProcResourceCounters();
+ if (LastFPdOpCycleIdx != UINT_MAX)
+ dbgs() << "++ | Last FPd cycle index: " << LastFPdOpCycleIdx << "\n";
}
+
#endif //NDEBUG
void SystemZHazardRecognizer::clearProcResCounters() {
@@ -213,30 +238,25 @@ static inline bool isBranchRetTrap(MachineInstr *MI) {
void SystemZHazardRecognizer::
EmitInstruction(SUnit *SU) {
const MCSchedClassDesc *SC = getSchedClass(SU);
- DEBUG( dumpCurrGroup("Decode group before emission"););
+ LLVM_DEBUG(dbgs() << "++ HazardRecognizer emitting "; dumpSU(SU, dbgs());
+ dbgs() << "\n";);
+ LLVM_DEBUG(dumpCurrGroup("Decode group before emission"););
// If scheduling an SU that must begin a new decoder group, move on
// to next group.
if (!fitsIntoCurrentGroup(SU))
nextGroup();
- DEBUG( dbgs() << "+++ HazardRecognizer emitting "; dumpSU(SU, dbgs());
- dbgs() << "\n";
- raw_string_ostream cgd(CurGroupDbg);
- if (CurGroupDbg.length())
- cgd << ", ";
- dumpSU(SU, cgd););
+ LLVM_DEBUG(raw_string_ostream cgd(CurGroupDbg);
+ if (CurGroupDbg.length()) cgd << ", "; dumpSU(SU, cgd););
LastEmittedMI = SU->getInstr();
// After returning from a call, we don't know much about the state.
if (SU->isCall) {
- DEBUG (dbgs() << "+++ Clearing state after call.\n";);
- clearProcResCounters();
- LastFPdOpCycleIdx = UINT_MAX;
- CurrGroupSize += getNumDecoderSlots(SU);
- assert (CurrGroupSize <= 3);
- nextGroup();
+ LLVM_DEBUG(dbgs() << "++ Clearing state after call.\n";);
+ Reset();
+ LastEmittedMI = SU->getInstr();
return;
}
@@ -256,23 +276,21 @@ EmitInstruction(SUnit *SU) {
(PI->ProcResourceIdx != CriticalResourceIdx &&
CurrCounter >
ProcResourceCounters[CriticalResourceIdx]))) {
- DEBUG( dbgs() << "+++ New critical resource: "
- << SchedModel->getProcResource(PI->ProcResourceIdx)->Name
- << "\n";);
+ LLVM_DEBUG(
+ dbgs() << "++ New critical resource: "
+ << SchedModel->getProcResource(PI->ProcResourceIdx)->Name
+ << "\n";);
CriticalResourceIdx = PI->ProcResourceIdx;
}
}
// Make note of an instruction that uses a blocking resource (FPd).
if (SU->isUnbuffered) {
- LastFPdOpCycleIdx = getCurrCycleIdx();
- DEBUG (dbgs() << "+++ Last FPd cycle index: "
- << LastFPdOpCycleIdx << "\n";);
+ LastFPdOpCycleIdx = getCurrCycleIdx(SU);
+ LLVM_DEBUG(dbgs() << "++ Last FPd cycle index: " << LastFPdOpCycleIdx
+ << "\n";);
}
- bool GroupEndingBranch =
- (CurrGroupSize >= 1 && isBranchRetTrap(SU->getInstr()));
-
// Insert SU into current group by increasing number of slots used
// in current group.
CurrGroupSize += getNumDecoderSlots(SU);
@@ -280,7 +298,7 @@ EmitInstruction(SUnit *SU) {
// Check if current group is now full/ended. If so, move on to next
// group to be ready to evaluate more candidates.
- if (CurrGroupSize == 3 || SC->EndGroup || GroupEndingBranch)
+ if (CurrGroupSize == 3 || SC->EndGroup)
nextGroup();
}
@@ -311,7 +329,7 @@ int SystemZHazardRecognizer::groupingCost(SUnit *SU) const {
return 0;
}
-bool SystemZHazardRecognizer::isFPdOpPreferred_distance(const SUnit *SU) {
+bool SystemZHazardRecognizer::isFPdOpPreferred_distance(SUnit *SU) const {
assert (SU->isUnbuffered);
// If this is the first FPd op, it should be scheduled high.
if (LastFPdOpCycleIdx == UINT_MAX)
@@ -320,9 +338,10 @@ bool SystemZHazardRecognizer::isFPdOpPreferred_distance(const SUnit *SU) {
// of the processor to use the other FPd unit there. This should
// generally happen if two FPd ops are placed with 2 other
// instructions between them (modulo 6).
- if (LastFPdOpCycleIdx > getCurrCycleIdx())
- return ((LastFPdOpCycleIdx - getCurrCycleIdx()) == 3);
- return ((getCurrCycleIdx() - LastFPdOpCycleIdx) == 3);
+ unsigned SUCycleIdx = getCurrCycleIdx(SU);
+ if (LastFPdOpCycleIdx > SUCycleIdx)
+ return ((LastFPdOpCycleIdx - SUCycleIdx) == 3);
+ return ((SUCycleIdx - LastFPdOpCycleIdx) == 3);
}
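
A minimal standalone sketch of the distance test above: an FPd op is preferred when it lands exactly three decoder slots away from the previous one, i.e. in the other group of the 6-slot cycle, so the two processor sides alternate:

    #include <cstdio>

    static bool fpdPreferred(unsigned LastIdx, unsigned CurIdx) {
      unsigned Dist = LastIdx > CurIdx ? LastIdx - CurIdx : CurIdx - LastIdx;
      return Dist == 3;
    }

    int main() {
      std::printf("%d\n", fpdPreferred(1, 4));  // 1: opposite group, preferred
      std::printf("%d\n", fpdPreferred(1, 2));  // 0: same side, would stall
      std::printf("%d\n", fpdPreferred(5, 2));  // 1: three slots apart, wrapped
    }
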
int SystemZHazardRecognizer::
@@ -373,10 +392,17 @@ void SystemZHazardRecognizer::emitInstruction(MachineInstr *MI,
}
}
+ unsigned GroupSizeBeforeEmit = CurrGroupSize;
EmitInstruction(&SU);
+ if (!TakenBranch && isBranchRetTrap(MI)) {
+ // NT Branch on second slot ends group.
+ if (GroupSizeBeforeEmit == 1)
+ nextGroup();
+ }
+
if (TakenBranch && CurrGroupSize > 0)
- nextGroup(false /*DbgOutput*/);
+ nextGroup();
assert ((!MI->isTerminator() || isBranchRetTrap(MI)) &&
"Scheduler: unhandled terminator!");
@@ -386,7 +412,7 @@ void SystemZHazardRecognizer::
copyState(SystemZHazardRecognizer *Incoming) {
// Current decoder group
CurrGroupSize = Incoming->CurrGroupSize;
- DEBUG (CurGroupDbg = Incoming->CurGroupDbg;);
+ LLVM_DEBUG(CurGroupDbg = Incoming->CurGroupDbg;);
// Processor resources
ProcResourceCounters = Incoming->ProcResourceCounters;
diff --git a/lib/Target/SystemZ/SystemZHazardRecognizer.h b/lib/Target/SystemZ/SystemZHazardRecognizer.h
index 7e1b5fb2e4fe..40cb3acc7009 100644
--- a/lib/Target/SystemZ/SystemZHazardRecognizer.h
+++ b/lib/Target/SystemZ/SystemZHazardRecognizer.h
@@ -75,9 +75,11 @@ class SystemZHazardRecognizer : public ScheduleHazardRecognizer {
/// Two decoder groups per cycle are formed (for z13), meaning 2x3
/// instructions. This function returns a number between 0 and 5,
- /// representing the current decoder slot of the current cycle.
- unsigned getCurrCycleIdx();
-
+ /// representing the current decoder slot of the current cycle. If an SU
+ /// is passed which will begin a new decoder group, the returned value is
+ /// the cycle index of the next group.
+ unsigned getCurrCycleIdx(SUnit *SU = nullptr) const;
+
/// LastFPdOpCycleIdx stores the number returned by getCurrCycleIdx()
/// when a stalling operation is scheduled (which uses the FPd resource).
unsigned LastFPdOpCycleIdx;
@@ -88,14 +90,14 @@ class SystemZHazardRecognizer : public ScheduleHazardRecognizer {
unsigned getCurrGroupSize() {return CurrGroupSize;};
/// Start next decoder group.
- void nextGroup(bool DbgOutput = true);
+ void nextGroup();
/// Clear all counters for processor resources.
void clearProcResCounters();
/// With the goal of alternating processor sides for stalling (FPd)
/// ops, return true if it seems good to schedule an FPd op next.
- bool isFPdOpPreferred_distance(const SUnit *SU);
+ bool isFPdOpPreferred_distance(SUnit *SU) const;
/// Last emitted instruction or nullptr.
MachineInstr *LastEmittedMI;
@@ -145,6 +147,7 @@ public:
void dumpSU(SUnit *SU, raw_ostream &OS) const;
void dumpCurrGroup(std::string Msg = "") const;
void dumpProcResourceCounters() const;
+ void dumpState() const;
#endif
MachineBasicBlock::iterator getLastEmittedMI() { return LastEmittedMI; }
diff --git a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index ce6f3d37f5c9..5425f1d16e5e 100644
--- a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -310,6 +310,11 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
// Try to use scatter instruction Opcode to implement store Store.
bool tryScatter(StoreSDNode *Store, unsigned Opcode);
+ // Change a chain of {load; op; store} of the same value into a simple op
+ // through memory of that value, if the uses of the modified value and its
+ // address are suitable.
+ bool tryFoldLoadStoreIntoMemOperand(SDNode *Node);
+
// Return true if Load and Store are loads and stores of the same size
// and are guaranteed not to overlap. Such operations can be implemented
// using block (SS-format) instructions.
@@ -330,6 +335,9 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
// to X.
bool storeLoadCanUseBlockBinary(SDNode *N, unsigned I) const;
+ // Try to expand a boolean SELECT_CCMASK using an IPM sequence.
+ SDValue expandSelectBoolean(SDNode *Node);
+
public:
SystemZDAGToDAGISel(SystemZTargetMachine &TM, CodeGenOpt::Level OptLevel)
: SelectionDAGISel(TM, OptLevel) {}
@@ -348,6 +356,8 @@ public:
void Select(SDNode *Node) override;
bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
std::vector<SDValue> &OutOps) override;
+ bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
+ void PreprocessISelDAG() override;
// Include the pieces autogenerated from the target description.
#include "SystemZGenDAGISel.inc"
@@ -579,7 +589,7 @@ bool SystemZDAGToDAGISel::selectAddress(SDValue Addr,
if (AM.isDynAlloc() && !AM.IncludesDynAlloc)
return false;
- DEBUG(AM.dump());
+ LLVM_DEBUG(AM.dump());
return true;
}
@@ -589,10 +599,16 @@ bool SystemZDAGToDAGISel::selectAddress(SDValue Addr,
// The selection DAG must no longer depend on their uniqueness when this
// function is used.
static void insertDAGNode(SelectionDAG *DAG, SDNode *Pos, SDValue N) {
- if (N.getNode()->getNodeId() == -1 ||
- N.getNode()->getNodeId() > Pos->getNodeId()) {
+ if (N->getNodeId() == -1 ||
+ (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
+ SelectionDAGISel::getUninvalidatedNodeId(Pos))) {
DAG->RepositionNode(Pos->getIterator(), N.getNode());
- N.getNode()->setNodeId(Pos->getNodeId());
+ // Mark N as invalid for pruning, since after this it may be a successor of a
+ // selected node but otherwise occupy the same position as Pos.
+ // Conservatively mark it with the same -abs(Id) to ensure the node id
+ // invariant is preserved.
+ N->setNodeId(Pos->getNodeId());
+ SelectionDAGISel::InvalidateNodeId(N.getNode());
}
}
@@ -989,7 +1005,8 @@ bool SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) {
N = New.getNode();
}
// Now, select the machine opcode to implement this operation.
- SelectCode(N);
+ if (!N->isMachineOpcode())
+ SelectCode(N);
return true;
}
}
@@ -1022,8 +1039,7 @@ bool SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) {
};
SDValue New = convertTo(
DL, VT, SDValue(CurDAG->getMachineNode(Opcode, DL, OpcodeVT, Ops), 0));
- ReplaceUses(N, New.getNode());
- CurDAG->RemoveDeadNode(N);
+ ReplaceNode(N, New.getNode());
return true;
}
@@ -1114,8 +1130,7 @@ void SystemZDAGToDAGISel::splitLargeImmediate(unsigned Opcode, SDNode *Node,
SDValue Lower = CurDAG->getConstant(LowerVal, DL, VT);
SDValue Or = CurDAG->getNode(Opcode, DL, VT, Upper, Lower);
- ReplaceUses(Node, Or.getNode());
- CurDAG->RemoveDeadNode(Node);
+ ReplaceNode(Node, Or.getNode());
SelectCode(Or.getNode());
}
@@ -1186,6 +1201,171 @@ bool SystemZDAGToDAGISel::tryScatter(StoreSDNode *Store, unsigned Opcode) {
return true;
}
+// Check whether or not the chain ending in StoreNode is suitable for the
+// {load; op; store} read-modify-write transformation.
+static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
+ SDValue StoredVal, SelectionDAG *CurDAG,
+ LoadSDNode *&LoadNode,
+ SDValue &InputChain) {
+ // Is the stored value result 0 of the operation?
+ if (StoredVal.getResNo() != 0)
+ return false;
+
+ // Are there other uses of the loaded value than the operation?
+ if (!StoredVal.getNode()->hasNUsesOfValue(1, 0))
+ return false;
+
+ // Is the store non-extending and non-indexed?
+ if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
+ return false;
+
+ SDValue Load = StoredVal->getOperand(0);
+ // Is the stored value a non-extending and non-indexed load?
+ if (!ISD::isNormalLoad(Load.getNode()))
+ return false;
+
+ // Return LoadNode by reference.
+ LoadNode = cast<LoadSDNode>(Load);
+
+ // Is store the only read of the loaded value?
+ if (!Load.hasOneUse())
+ return false;
+
+ // Is the address of the store the same as the load?
+ if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
+ LoadNode->getOffset() != StoreNode->getOffset())
+ return false;
+
+ // Check if the chain is produced by the load or is a TokenFactor with
+ // the load output chain as an operand. Return InputChain by reference.
+ SDValue Chain = StoreNode->getChain();
+
+ bool ChainCheck = false;
+ if (Chain == Load.getValue(1)) {
+ ChainCheck = true;
+ InputChain = LoadNode->getChain();
+ } else if (Chain.getOpcode() == ISD::TokenFactor) {
+ SmallVector<SDValue, 4> ChainOps;
+ for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
+ SDValue Op = Chain.getOperand(i);
+ if (Op == Load.getValue(1)) {
+ ChainCheck = true;
+ // Drop Load, but keep its chain. No cycle check necessary.
+ ChainOps.push_back(Load.getOperand(0));
+ continue;
+ }
+
+ // Make sure using Op as part of the chain would not cause a cycle here.
+ // In theory, we could check whether the chain node is a predecessor of
+ // the load. But that can be very expensive. Instead visit the uses and
+ // make sure they all have smaller node id than the load.
+ int LoadId = LoadNode->getNodeId();
+ for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+ UE = UI->use_end(); UI != UE; ++UI) {
+ if (UI.getUse().getResNo() != 0)
+ continue;
+ if (UI->getNodeId() > LoadId)
+ return false;
+ }
+
+ ChainOps.push_back(Op);
+ }
+
+ if (ChainCheck)
+ // Make a new TokenFactor with all the other input chains except
+ // for the load.
+ InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain),
+ MVT::Other, ChainOps);
+ }
+ if (!ChainCheck)
+ return false;
+
+ return true;
+}
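+// As a rough sketch, the shape accepted above is
+//   (store (op (load Ptr), ...), Ptr)
+// where the loaded value and the operation result each have a single use and
+// the store's chain is either the load's output chain or a TokenFactor that
+// includes it. On success, LoadNode and InputChain (with the load peeled out
+// of any TokenFactor) are returned for the caller to fold the whole chain
+// into one memory operation.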
+
+// Change a chain of {load; op; store} of the same value into a simple op
+// through memory of that value, if the uses of the modified value and its
+// address are suitable.
+//
+// The tablegen memory operand pattern is currently not able to match the
+// case where the CC result of the original operation is used.
+//
+// See the equivalent routine in X86ISelDAGToDAG for further comments.
+bool SystemZDAGToDAGISel::tryFoldLoadStoreIntoMemOperand(SDNode *Node) {
+ StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
+ SDValue StoredVal = StoreNode->getOperand(1);
+ unsigned Opc = StoredVal->getOpcode();
+ SDLoc DL(StoreNode);
+
+ // Before we try to select anything, make sure this is memory operand size
+ // and opcode we can handle. Note that this must match the code below that
+ // actually lowers the opcodes.
+ EVT MemVT = StoreNode->getMemoryVT();
+ unsigned NewOpc = 0;
+ bool NegateOperand = false;
+ switch (Opc) {
+ default:
+ return false;
+ case SystemZISD::SSUBO:
+ NegateOperand = true;
+ /* fall through */
+ case SystemZISD::SADDO:
+ if (MemVT == MVT::i32)
+ NewOpc = SystemZ::ASI;
+ else if (MemVT == MVT::i64)
+ NewOpc = SystemZ::AGSI;
+ else
+ return false;
+ break;
+ case SystemZISD::USUBO:
+ NegateOperand = true;
+ /* fall through */
+ case SystemZISD::UADDO:
+ if (MemVT == MVT::i32)
+ NewOpc = SystemZ::ALSI;
+ else if (MemVT == MVT::i64)
+ NewOpc = SystemZ::ALGSI;
+ else
+ return false;
+ break;
+ }
+
+ LoadSDNode *LoadNode = nullptr;
+ SDValue InputChain;
+ if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadNode,
+ InputChain))
+ return false;
+
+ SDValue Operand = StoredVal.getOperand(1);
+ auto *OperandC = dyn_cast<ConstantSDNode>(Operand);
+ if (!OperandC)
+ return false;
+ auto OperandV = OperandC->getAPIntValue();
+ if (NegateOperand)
+ OperandV = -OperandV;
+ if (OperandV.getMinSignedBits() > 8)
+ return false;
+ Operand = CurDAG->getTargetConstant(OperandV, DL, MemVT);
+
+ SDValue Base, Disp;
+ if (!selectBDAddr20Only(StoreNode->getBasePtr(), Base, Disp))
+ return false;
+
+ SDValue Ops[] = { Base, Disp, Operand, InputChain };
+ MachineSDNode *Result =
+ CurDAG->getMachineNode(NewOpc, DL, MVT::i32, MVT::Other, Ops);
+
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(2);
+ MemOp[0] = StoreNode->getMemOperand();
+ MemOp[1] = LoadNode->getMemOperand();
+ Result->setMemRefs(MemOp, MemOp + 2);
+
+ ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
+ ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
+ CurDAG->RemoveDeadNode(Node);
+ return true;
+}
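+// Illustrative example of what the routine above selects: when a value loaded
+// from %ptr is passed to @llvm.sadd.with.overflow.i32 with constant 1 and the
+// sum is stored back to %ptr, the load/add/store chain becomes a single
+//   ASI 0(%reg), 1
+// and any use of the overflow flag is rewired to the CC result of the new
+// machine node (Result value 0). For the subtract variants the immediate is
+// negated first so the same add-immediate instructions (ASI/AGSI/ALSI/ALGSI)
+// can be reused, which is why only immediates that fit a signed 8-bit field
+// are accepted.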
+
bool SystemZDAGToDAGISel::canUseBlockOperation(StoreSDNode *Store,
LoadSDNode *Load) const {
// Check that the two memory operands have the same size.
@@ -1245,12 +1425,9 @@ bool SystemZDAGToDAGISel::storeLoadCanUseBlockBinary(SDNode *N,
}
void SystemZDAGToDAGISel::Select(SDNode *Node) {
- // Dump information about the Node being selected
- DEBUG(errs() << "Selecting: "; Node->dump(CurDAG); errs() << "\n");
-
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
- DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+ LLVM_DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
Node->setNodeId(-1);
return;
}
@@ -1332,7 +1509,13 @@ void SystemZDAGToDAGISel::Select(SDNode *Node) {
CCMask = CurDAG->getConstant(ConstCCValid ^ ConstCCMask, SDLoc(Node),
CCMask.getValueType());
SDValue Op4 = Node->getOperand(4);
- Node = CurDAG->UpdateNodeOperands(Node, Op1, Op0, CCValid, CCMask, Op4);
+ SDNode *UpdatedNode =
+ CurDAG->UpdateNodeOperands(Node, Op1, Op0, CCValid, CCMask, Op4);
+ if (UpdatedNode != Node) {
+ // In case this node already exists then replace Node with it.
+ ReplaceNode(Node, UpdatedNode);
+ Node = UpdatedNode;
+ }
}
break;
}
@@ -1351,6 +1534,8 @@ void SystemZDAGToDAGISel::Select(SDNode *Node) {
}
case ISD::STORE: {
+ if (tryFoldLoadStoreIntoMemOperand(Node))
+ return;
auto *Store = cast<StoreSDNode>(Node);
unsigned ElemBitSize = Store->getValue().getValueSizeInBits();
if (ElemBitSize == 32) {
@@ -1438,3 +1623,227 @@ SelectInlineAsmMemoryOperand(const SDValue &Op,
return true;
}
+
+// IsProfitableToFold - Returns true if it is profitable to fold the specific
+// operand node N of U during instruction selection that starts at Root.
+bool
+SystemZDAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U,
+ SDNode *Root) const {
+ // We want to avoid folding a LOAD into an ICMP node if as a result
+ // we would be forced to spill the condition code into a GPR.
+ if (N.getOpcode() == ISD::LOAD && U->getOpcode() == SystemZISD::ICMP) {
+ if (!N.hasOneUse() || !U->hasOneUse())
+ return false;
+
+ // The user of the CC value will usually be a CopyToReg into the
+ // physical CC register, which in turn is glued and chained to the
+ // actual instruction that uses the CC value. Bail out if we have
+ // anything else than that.
+ SDNode *CCUser = *U->use_begin();
+ SDNode *CCRegUser = nullptr;
+ if (CCUser->getOpcode() == ISD::CopyToReg ||
+ cast<RegisterSDNode>(CCUser->getOperand(1))->getReg() == SystemZ::CC) {
+ for (auto *U : CCUser->uses()) {
+ if (CCRegUser == nullptr)
+ CCRegUser = U;
+ else if (CCRegUser != U)
+ return false;
+ }
+ }
+ if (CCRegUser == nullptr)
+ return false;
+
+ // If the actual instruction is a branch, the only thing that remains to be
+ // checked is whether the CCUser chain is a predecessor of the load.
+ if (CCRegUser->isMachineOpcode() &&
+ CCRegUser->getMachineOpcode() == SystemZ::BRC)
+ return !N->isPredecessorOf(CCUser->getOperand(0).getNode());
+
+ // Otherwise, the instruction may have multiple operands, and we need to
+ // verify that none of them are a predecessor of the load. This is exactly
+ // the same check that would be done by common code if the CC setter were
+ // glued to the CC user, so simply invoke that check here.
+ if (!IsLegalToFold(N, U, CCRegUser, OptLevel, false))
+ return false;
+ }
+
+ return true;
+}
+
+namespace {
+// Represents a sequence for extracting a 0/1 value from an IPM result:
+// (((X ^ XORValue) + AddValue) >> Bit)
+struct IPMConversion {
+ IPMConversion(unsigned xorValue, int64_t addValue, unsigned bit)
+ : XORValue(xorValue), AddValue(addValue), Bit(bit) {}
+
+ int64_t XORValue;
+ int64_t AddValue;
+ unsigned Bit;
+};
+} // end anonymous namespace
+
+// Return a sequence for getting a 1 from an IPM result when CC has a
+// value in CCMask and a 0 when CC has a value in CCValid & ~CCMask.
+// The handling of CC values outside CCValid doesn't matter.
+static IPMConversion getIPMConversion(unsigned CCValid, unsigned CCMask) {
+ // Deal with cases where the result can be taken directly from a bit
+ // of the IPM result.
+ if (CCMask == (CCValid & (SystemZ::CCMASK_1 | SystemZ::CCMASK_3)))
+ return IPMConversion(0, 0, SystemZ::IPM_CC);
+ if (CCMask == (CCValid & (SystemZ::CCMASK_2 | SystemZ::CCMASK_3)))
+ return IPMConversion(0, 0, SystemZ::IPM_CC + 1);
+
+ // Deal with cases where we can add a value to force the sign bit
+ // to contain the right value. Putting the bit in 31 means we can
+ // use SRL rather than RISBG(L), and also makes it easier to get a
+ // 0/-1 value, so it has priority over the other tests below.
+ //
+ // These sequences rely on the fact that the upper two bits of the
+ // IPM result are zero.
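+ // For example, for CCMask == CCMASK_0 the entry below returns
+ // (X + -(1 << IPM_CC)) >> 31: with IPM_CC == 28, CC occupies bits 29:28,
+ // so when CC == 0 the value X is below 1 << 28 and the addition wraps and
+ // sets bit 31, giving 1 after the shift, while any nonzero CC leaves bit 31
+ // clear and gives 0.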
+ uint64_t TopBit = uint64_t(1) << 31;
+ if (CCMask == (CCValid & SystemZ::CCMASK_0))
+ return IPMConversion(0, -(1 << SystemZ::IPM_CC), 31);
+ if (CCMask == (CCValid & (SystemZ::CCMASK_0 | SystemZ::CCMASK_1)))
+ return IPMConversion(0, -(2 << SystemZ::IPM_CC), 31);
+ if (CCMask == (CCValid & (SystemZ::CCMASK_0
+ | SystemZ::CCMASK_1
+ | SystemZ::CCMASK_2)))
+ return IPMConversion(0, -(3 << SystemZ::IPM_CC), 31);
+ if (CCMask == (CCValid & SystemZ::CCMASK_3))
+ return IPMConversion(0, TopBit - (3 << SystemZ::IPM_CC), 31);
+ if (CCMask == (CCValid & (SystemZ::CCMASK_1
+ | SystemZ::CCMASK_2
+ | SystemZ::CCMASK_3)))
+ return IPMConversion(0, TopBit - (1 << SystemZ::IPM_CC), 31);
+
+ // Next try inverting the value and testing a bit. 0/1 could be
+ // handled this way too, but we dealt with that case above.
+ if (CCMask == (CCValid & (SystemZ::CCMASK_0 | SystemZ::CCMASK_2)))
+ return IPMConversion(-1, 0, SystemZ::IPM_CC);
+
+ // Handle cases where adding a value forces a non-sign bit to contain
+ // the right value.
+ if (CCMask == (CCValid & (SystemZ::CCMASK_1 | SystemZ::CCMASK_2)))
+ return IPMConversion(0, 1 << SystemZ::IPM_CC, SystemZ::IPM_CC + 1);
+ if (CCMask == (CCValid & (SystemZ::CCMASK_0 | SystemZ::CCMASK_3)))
+ return IPMConversion(0, -(1 << SystemZ::IPM_CC), SystemZ::IPM_CC + 1);
+
+ // The remaining cases are 1, 2, 0/1/3 and 0/2/3. All of these can be
+ // handled by inverting the low CC bit and applying one of the
+ // sign-based extractions above.
+ if (CCMask == (CCValid & SystemZ::CCMASK_1))
+ return IPMConversion(1 << SystemZ::IPM_CC, -(1 << SystemZ::IPM_CC), 31);
+ if (CCMask == (CCValid & SystemZ::CCMASK_2))
+ return IPMConversion(1 << SystemZ::IPM_CC,
+ TopBit - (3 << SystemZ::IPM_CC), 31);
+ if (CCMask == (CCValid & (SystemZ::CCMASK_0
+ | SystemZ::CCMASK_1
+ | SystemZ::CCMASK_3)))
+ return IPMConversion(1 << SystemZ::IPM_CC, -(3 << SystemZ::IPM_CC), 31);
+ if (CCMask == (CCValid & (SystemZ::CCMASK_0
+ | SystemZ::CCMASK_2
+ | SystemZ::CCMASK_3)))
+ return IPMConversion(1 << SystemZ::IPM_CC,
+ TopBit - (1 << SystemZ::IPM_CC), 31);
+
+ llvm_unreachable("Unexpected CC combination");
+}
+
+SDValue SystemZDAGToDAGISel::expandSelectBoolean(SDNode *Node) {
+ auto *TrueOp = dyn_cast<ConstantSDNode>(Node->getOperand(0));
+ auto *FalseOp = dyn_cast<ConstantSDNode>(Node->getOperand(1));
+ if (!TrueOp || !FalseOp)
+ return SDValue();
+ if (FalseOp->getZExtValue() != 0)
+ return SDValue();
+ if (TrueOp->getSExtValue() != 1 && TrueOp->getSExtValue() != -1)
+ return SDValue();
+
+ auto *CCValidOp = dyn_cast<ConstantSDNode>(Node->getOperand(2));
+ auto *CCMaskOp = dyn_cast<ConstantSDNode>(Node->getOperand(3));
+ if (!CCValidOp || !CCMaskOp)
+ return SDValue();
+ int CCValid = CCValidOp->getZExtValue();
+ int CCMask = CCMaskOp->getZExtValue();
+
+ SDLoc DL(Node);
+ SDValue CCReg = Node->getOperand(4);
+ IPMConversion IPM = getIPMConversion(CCValid, CCMask);
+ SDValue Result = CurDAG->getNode(SystemZISD::IPM, DL, MVT::i32, CCReg);
+
+ if (IPM.XORValue)
+ Result = CurDAG->getNode(ISD::XOR, DL, MVT::i32, Result,
+ CurDAG->getConstant(IPM.XORValue, DL, MVT::i32));
+
+ if (IPM.AddValue)
+ Result = CurDAG->getNode(ISD::ADD, DL, MVT::i32, Result,
+ CurDAG->getConstant(IPM.AddValue, DL, MVT::i32));
+
+ EVT VT = Node->getValueType(0);
+ if (VT == MVT::i32 && IPM.Bit == 31) {
+ unsigned ShiftOp = TrueOp->getSExtValue() == 1 ? ISD::SRL : ISD::SRA;
+ Result = CurDAG->getNode(ShiftOp, DL, MVT::i32, Result,
+ CurDAG->getConstant(IPM.Bit, DL, MVT::i32));
+ } else {
+ if (VT != MVT::i32)
+ Result = CurDAG->getNode(ISD::ANY_EXTEND, DL, VT, Result);
+
+ if (TrueOp->getSExtValue() == 1) {
+ // The SHR/AND sequence should get optimized to an RISBG.
+ Result = CurDAG->getNode(ISD::SRL, DL, VT, Result,
+ CurDAG->getConstant(IPM.Bit, DL, MVT::i32));
+ Result = CurDAG->getNode(ISD::AND, DL, VT, Result,
+ CurDAG->getConstant(1, DL, VT));
+ } else {
+ // Sign-extend from IPM.Bit using a pair of shifts.
+ int ShlAmt = VT.getSizeInBits() - 1 - IPM.Bit;
+ int SraAmt = VT.getSizeInBits() - 1;
+ Result = CurDAG->getNode(ISD::SHL, DL, VT, Result,
+ CurDAG->getConstant(ShlAmt, DL, MVT::i32));
+ Result = CurDAG->getNode(ISD::SRA, DL, VT, Result,
+ CurDAG->getConstant(SraAmt, DL, MVT::i32));
+ }
+ }
+
+ return Result;
+}
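+// For instance, a boolean SELECT_CCMASK producing -1/0 with an i32 result and
+// a conversion whose Bit is 31 becomes IPM plus the optional XOR/ADD fixups
+// followed by a single SRA by 31, so the sign bit yields 0 or -1 directly;
+// the 0/1 case uses SRL instead, and other bit positions fall back to the
+// SRL+AND or SHL+SRA sequences above.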
+
+void SystemZDAGToDAGISel::PreprocessISelDAG() {
+ // If we have conditional immediate loads, we always prefer
+ // using those over an IPM sequence.
+ if (Subtarget->hasLoadStoreOnCond2())
+ return;
+
+ bool MadeChange = false;
+
+ for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+ E = CurDAG->allnodes_end();
+ I != E;) {
+ SDNode *N = &*I++;
+ if (N->use_empty())
+ continue;
+
+ SDValue Res;
+ switch (N->getOpcode()) {
+ default: break;
+ case SystemZISD::SELECT_CCMASK:
+ Res = expandSelectBoolean(N);
+ break;
+ }
+
+ if (Res) {
+ LLVM_DEBUG(dbgs() << "SystemZ DAG preprocessing replacing:\nOld: ");
+ LLVM_DEBUG(N->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\nNew: ");
+ LLVM_DEBUG(Res.getNode()->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\n");
+
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
+ MadeChange = true;
+ }
+ }
+
+ if (MadeChange)
+ CurDAG->RemoveDeadNodes();
+}
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index adf368319dc3..302c7883f97b 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -31,17 +31,6 @@ using namespace llvm;
#define DEBUG_TYPE "systemz-lower"
namespace {
-// Represents a sequence for extracting a 0/1 value from an IPM result:
-// (((X ^ XORValue) + AddValue) >> Bit)
-struct IPMConversion {
- IPMConversion(unsigned xorValue, int64_t addValue, unsigned bit)
- : XORValue(xorValue), AddValue(addValue), Bit(bit) {}
-
- int64_t XORValue;
- int64_t AddValue;
- unsigned Bit;
-};
-
// Represents information about a comparison.
struct Comparison {
Comparison(SDValue Op0In, SDValue Op1In)
@@ -87,7 +76,7 @@ static MachineOperand earlyUseOperand(MachineOperand Op) {
SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
const SystemZSubtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
- MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
+ MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize(0));
// Set up the register classes.
if (Subtarget.hasHighWord())
@@ -133,6 +122,8 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
// Instructions are strings of 2-byte aligned 2-byte values.
setMinFunctionAlignment(2);
+ // For performance reasons we prefer 16-byte alignment.
+ setPrefFunctionAlignment(4);
// Handle operations that are handled in a similar way for all types.
for (unsigned I = MVT::FIRST_INTEGER_VALUETYPE;
@@ -173,6 +164,18 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SDIVREM, VT, Custom);
setOperationAction(ISD::UDIVREM, VT, Custom);
+ // Support addition/subtraction with overflow.
+ setOperationAction(ISD::SADDO, VT, Custom);
+ setOperationAction(ISD::SSUBO, VT, Custom);
+
+ // Support addition/subtraction with carry.
+ setOperationAction(ISD::UADDO, VT, Custom);
+ setOperationAction(ISD::USUBO, VT, Custom);
+
+ // Support carry in as value rather than glue.
+ setOperationAction(ISD::ADDCARRY, VT, Custom);
+ setOperationAction(ISD::SUBCARRY, VT, Custom);
+
// Lower ATOMIC_LOAD and ATOMIC_STORE into normal volatile loads and
// stores, putting a serialization instruction after the stores.
setOperationAction(ISD::ATOMIC_LOAD, VT, Custom);
@@ -517,7 +520,9 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VAEND, MVT::Other, Expand);
// Codes for which we want to perform some z-specific combinations.
+ setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
+ setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::FP_ROUND);
@@ -643,7 +648,8 @@ supportedAddressingMode(Instruction *I, bool HasVector) {
if (SingleUser->getParent() == I->getParent()) {
if (isa<ICmpInst>(SingleUser)) {
if (auto *C = dyn_cast<ConstantInt>(SingleUser->getOperand(1)))
- if (isInt<16>(C->getSExtValue()) || isUInt<16>(C->getZExtValue()))
+ if (C->getBitWidth() <= 64 &&
+ (isInt<16>(C->getSExtValue()) || isUInt<16>(C->getZExtValue())))
// Comparison of memory with 16 bit signed / unsigned immediate
return AddressingMode(false/*LongDispl*/, false/*IdxReg*/);
} else if (isa<StoreInst>(SingleUser))
@@ -748,6 +754,7 @@ SystemZTargetLowering::getConstraintType(StringRef Constraint) const {
case 'f': // Floating-point register
case 'h': // High-part register
case 'r': // General-purpose register
+ case 'v': // Vector register
return C_RegisterClass;
case 'Q': // Memory with base and unsigned 12-bit displacement
@@ -800,6 +807,12 @@ getSingleConstraintMatchWeight(AsmOperandInfo &info,
weight = CW_Register;
break;
+ case 'v': // Vector register
+ if ((type->isVectorTy() || type->isFloatingPointTy()) &&
+ Subtarget.hasVector())
+ weight = CW_Register;
+ break;
+
case 'I': // Unsigned 8-bit constant
if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
if (isUInt<8>(C->getZExtValue()))
@@ -838,13 +851,13 @@ getSingleConstraintMatchWeight(AsmOperandInfo &info,
// Map maps 0-based register numbers to LLVM register numbers.
static std::pair<unsigned, const TargetRegisterClass *>
parseRegisterNumber(StringRef Constraint, const TargetRegisterClass *RC,
- const unsigned *Map) {
+ const unsigned *Map, unsigned Size) {
assert(*(Constraint.end()-1) == '}' && "Missing '}'");
if (isdigit(Constraint[2])) {
unsigned Index;
bool Failed =
Constraint.slice(2, Constraint.size() - 1).getAsInteger(10, Index);
- if (!Failed && Index < 16 && Map[Index])
+ if (!Failed && Index < Size && Map[Index])
return std::make_pair(Map[Index], RC);
}
return std::make_pair(0U, nullptr);
@@ -881,6 +894,16 @@ SystemZTargetLowering::getRegForInlineAsmConstraint(
else if (VT == MVT::f128)
return std::make_pair(0U, &SystemZ::FP128BitRegClass);
return std::make_pair(0U, &SystemZ::FP32BitRegClass);
+
+ case 'v': // Vector register
+ if (Subtarget.hasVector()) {
+ if (VT == MVT::f32)
+ return std::make_pair(0U, &SystemZ::VR32BitRegClass);
+ if (VT == MVT::f64)
+ return std::make_pair(0U, &SystemZ::VR64BitRegClass);
+ return std::make_pair(0U, &SystemZ::VR128BitRegClass);
+ }
+ break;
}
}
if (Constraint.size() > 0 && Constraint[0] == '{') {
@@ -891,22 +914,32 @@ SystemZTargetLowering::getRegForInlineAsmConstraint(
if (Constraint[1] == 'r') {
if (VT == MVT::i32)
return parseRegisterNumber(Constraint, &SystemZ::GR32BitRegClass,
- SystemZMC::GR32Regs);
+ SystemZMC::GR32Regs, 16);
if (VT == MVT::i128)
return parseRegisterNumber(Constraint, &SystemZ::GR128BitRegClass,
- SystemZMC::GR128Regs);
+ SystemZMC::GR128Regs, 16);
return parseRegisterNumber(Constraint, &SystemZ::GR64BitRegClass,
- SystemZMC::GR64Regs);
+ SystemZMC::GR64Regs, 16);
}
if (Constraint[1] == 'f') {
if (VT == MVT::f32)
return parseRegisterNumber(Constraint, &SystemZ::FP32BitRegClass,
- SystemZMC::FP32Regs);
+ SystemZMC::FP32Regs, 16);
if (VT == MVT::f128)
return parseRegisterNumber(Constraint, &SystemZ::FP128BitRegClass,
- SystemZMC::FP128Regs);
+ SystemZMC::FP128Regs, 16);
return parseRegisterNumber(Constraint, &SystemZ::FP64BitRegClass,
- SystemZMC::FP64Regs);
+ SystemZMC::FP64Regs, 16);
+ }
+ if (Constraint[1] == 'v') {
+ if (VT == MVT::f32)
+ return parseRegisterNumber(Constraint, &SystemZ::VR32BitRegClass,
+ SystemZMC::VR32Regs, 32);
+ if (VT == MVT::f64)
+ return parseRegisterNumber(Constraint, &SystemZ::VR64BitRegClass,
+ SystemZMC::VR64Regs, 32);
+ return parseRegisterNumber(Constraint, &SystemZ::VR128BitRegClass,
+ SystemZMC::VR128Regs, 32);
}
}
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
@@ -964,6 +997,13 @@ LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
#include "SystemZGenCallingConv.inc"
+const MCPhysReg *SystemZTargetLowering::getScratchRegisters(
+ CallingConv::ID) const {
+ static const MCPhysReg ScratchRegs[] = { SystemZ::R0D, SystemZ::R1D,
+ SystemZ::R14D, 0 };
+ return ScratchRegs;
+}
+
bool SystemZTargetLowering::allowTruncateForTailCall(Type *FromType,
Type *ToType) const {
return isTruncateFree(FromType, ToType);
@@ -1634,9 +1674,9 @@ static bool isIntrinsicWithCC(SDValue Op, unsigned &Opcode, unsigned &CCValid) {
}
}
-// Emit an intrinsic with chain with a glued value instead of its CC result.
-static SDValue emitIntrinsicWithChainAndGlue(SelectionDAG &DAG, SDValue Op,
- unsigned Opcode) {
+// Emit an intrinsic with chain and an explicit CC register result.
+static SDNode *emitIntrinsicWithCCAndChain(SelectionDAG &DAG, SDValue Op,
+ unsigned Opcode) {
// Copy all operands except the intrinsic ID.
unsigned NumOps = Op.getNumOperands();
SmallVector<SDValue, 6> Ops;
@@ -1646,17 +1686,17 @@ static SDValue emitIntrinsicWithChainAndGlue(SelectionDAG &DAG, SDValue Op,
Ops.push_back(Op.getOperand(I));
assert(Op->getNumValues() == 2 && "Expected only CC result and chain");
- SDVTList RawVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDVTList RawVTs = DAG.getVTList(MVT::i32, MVT::Other);
SDValue Intr = DAG.getNode(Opcode, SDLoc(Op), RawVTs, Ops);
SDValue OldChain = SDValue(Op.getNode(), 1);
- SDValue NewChain = SDValue(Intr.getNode(), 0);
+ SDValue NewChain = SDValue(Intr.getNode(), 1);
DAG.ReplaceAllUsesOfValueWith(OldChain, NewChain);
- return Intr;
+ return Intr.getNode();
}
-// Emit an intrinsic with a glued value instead of its CC result.
-static SDValue emitIntrinsicWithGlue(SelectionDAG &DAG, SDValue Op,
- unsigned Opcode) {
+// Emit an intrinsic with an explicit CC register result.
+static SDNode *emitIntrinsicWithCC(SelectionDAG &DAG, SDValue Op,
+ unsigned Opcode) {
// Copy all operands except the intrinsic ID.
unsigned NumOps = Op.getNumOperands();
SmallVector<SDValue, 6> Ops;
@@ -1664,11 +1704,8 @@ static SDValue emitIntrinsicWithGlue(SelectionDAG &DAG, SDValue Op,
for (unsigned I = 1; I < NumOps; ++I)
Ops.push_back(Op.getOperand(I));
- if (Op->getNumValues() == 1)
- return DAG.getNode(Opcode, SDLoc(Op), MVT::Glue, Ops);
- assert(Op->getNumValues() == 2 && "Expected exactly one non-CC result");
- SDVTList RawVTs = DAG.getVTList(Op->getValueType(0), MVT::Glue);
- return DAG.getNode(Opcode, SDLoc(Op), RawVTs, Ops);
+ SDValue Intr = DAG.getNode(Opcode, SDLoc(Op), Op->getVTList(), Ops);
+ return Intr.getNode();
}
// CC is a comparison that will be implemented using an integer or
@@ -1699,73 +1736,6 @@ static unsigned CCMaskForCondCode(ISD::CondCode CC) {
#undef CONV
}
-// Return a sequence for getting a 1 from an IPM result when CC has a
-// value in CCMask and a 0 when CC has a value in CCValid & ~CCMask.
-// The handling of CC values outside CCValid doesn't matter.
-static IPMConversion getIPMConversion(unsigned CCValid, unsigned CCMask) {
- // Deal with cases where the result can be taken directly from a bit
- // of the IPM result.
- if (CCMask == (CCValid & (SystemZ::CCMASK_1 | SystemZ::CCMASK_3)))
- return IPMConversion(0, 0, SystemZ::IPM_CC);
- if (CCMask == (CCValid & (SystemZ::CCMASK_2 | SystemZ::CCMASK_3)))
- return IPMConversion(0, 0, SystemZ::IPM_CC + 1);
-
- // Deal with cases where we can add a value to force the sign bit
- // to contain the right value. Putting the bit in 31 means we can
- // use SRL rather than RISBG(L), and also makes it easier to get a
- // 0/-1 value, so it has priority over the other tests below.
- //
- // These sequences rely on the fact that the upper two bits of the
- // IPM result are zero.
- uint64_t TopBit = uint64_t(1) << 31;
- if (CCMask == (CCValid & SystemZ::CCMASK_0))
- return IPMConversion(0, -(1 << SystemZ::IPM_CC), 31);
- if (CCMask == (CCValid & (SystemZ::CCMASK_0 | SystemZ::CCMASK_1)))
- return IPMConversion(0, -(2 << SystemZ::IPM_CC), 31);
- if (CCMask == (CCValid & (SystemZ::CCMASK_0
- | SystemZ::CCMASK_1
- | SystemZ::CCMASK_2)))
- return IPMConversion(0, -(3 << SystemZ::IPM_CC), 31);
- if (CCMask == (CCValid & SystemZ::CCMASK_3))
- return IPMConversion(0, TopBit - (3 << SystemZ::IPM_CC), 31);
- if (CCMask == (CCValid & (SystemZ::CCMASK_1
- | SystemZ::CCMASK_2
- | SystemZ::CCMASK_3)))
- return IPMConversion(0, TopBit - (1 << SystemZ::IPM_CC), 31);
-
- // Next try inverting the value and testing a bit. 0/1 could be
- // handled this way too, but we dealt with that case above.
- if (CCMask == (CCValid & (SystemZ::CCMASK_0 | SystemZ::CCMASK_2)))
- return IPMConversion(-1, 0, SystemZ::IPM_CC);
-
- // Handle cases where adding a value forces a non-sign bit to contain
- // the right value.
- if (CCMask == (CCValid & (SystemZ::CCMASK_1 | SystemZ::CCMASK_2)))
- return IPMConversion(0, 1 << SystemZ::IPM_CC, SystemZ::IPM_CC + 1);
- if (CCMask == (CCValid & (SystemZ::CCMASK_0 | SystemZ::CCMASK_3)))
- return IPMConversion(0, -(1 << SystemZ::IPM_CC), SystemZ::IPM_CC + 1);
-
- // The remaining cases are 1, 2, 0/1/3 and 0/2/3. All these are
- // can be done by inverting the low CC bit and applying one of the
- // sign-based extractions above.
- if (CCMask == (CCValid & SystemZ::CCMASK_1))
- return IPMConversion(1 << SystemZ::IPM_CC, -(1 << SystemZ::IPM_CC), 31);
- if (CCMask == (CCValid & SystemZ::CCMASK_2))
- return IPMConversion(1 << SystemZ::IPM_CC,
- TopBit - (3 << SystemZ::IPM_CC), 31);
- if (CCMask == (CCValid & (SystemZ::CCMASK_0
- | SystemZ::CCMASK_1
- | SystemZ::CCMASK_3)))
- return IPMConversion(1 << SystemZ::IPM_CC, -(3 << SystemZ::IPM_CC), 31);
- if (CCMask == (CCValid & (SystemZ::CCMASK_0
- | SystemZ::CCMASK_2
- | SystemZ::CCMASK_3)))
- return IPMConversion(1 << SystemZ::IPM_CC,
- TopBit - (1 << SystemZ::IPM_CC), 31);
-
- llvm_unreachable("Unexpected CC combination");
-}
-
// If C can be converted to a comparison against zero, adjust the operands
// as necessary.
static void adjustZeroCmp(SelectionDAG &DAG, const SDLoc &DL, Comparison &C) {
@@ -2237,6 +2207,24 @@ static void adjustForTestUnderMask(SelectionDAG &DAG, const SDLoc &DL,
C.CCMask = NewCCMask;
}
+// See whether the comparison argument contains a redundant AND
+// and remove it if so. This sometimes happens due to the generic
+// BRCOND expansion.
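+// For instance, that expansion can leave a comparison of (and (setcc ...), 1)
+// against zero; since the known bits of the AND's first operand lie entirely
+// within the mask, the AND is dropped and the setcc value is compared
+// directly.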
+static void adjustForRedundantAnd(SelectionDAG &DAG, const SDLoc &DL,
+ Comparison &C) {
+ if (C.Op0.getOpcode() != ISD::AND)
+ return;
+ auto *Mask = dyn_cast<ConstantSDNode>(C.Op0.getOperand(1));
+ if (!Mask)
+ return;
+ KnownBits Known;
+ DAG.computeKnownBits(C.Op0.getOperand(0), Known);
+ if ((~Known.Zero).getZExtValue() & ~Mask->getZExtValue())
+ return;
+
+ C.Op0 = C.Op0.getOperand(0);
+}
+
// Return a Comparison that tests the condition-code result of intrinsic
// node Call against constant integer CC using comparison code Cond.
// Opcode is the opcode of the SystemZISD operation for the intrinsic
@@ -2311,6 +2299,7 @@ static Comparison getCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1,
else
C.ICmpType = SystemZICMP::SignedOnly;
C.CCMask &= ~SystemZ::CCMASK_CMP_UO;
+ adjustForRedundantAnd(DAG, DL, C);
adjustZeroCmp(DAG, DL, C);
adjustSubwordCmp(DAG, DL, C);
adjustForSubtraction(DAG, DL, C);
@@ -2330,29 +2319,28 @@ static Comparison getCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1,
// Emit the comparison instruction described by C.
static SDValue emitCmp(SelectionDAG &DAG, const SDLoc &DL, Comparison &C) {
if (!C.Op1.getNode()) {
- SDValue Op;
+ SDNode *Node;
switch (C.Op0.getOpcode()) {
case ISD::INTRINSIC_W_CHAIN:
- Op = emitIntrinsicWithChainAndGlue(DAG, C.Op0, C.Opcode);
- break;
+ Node = emitIntrinsicWithCCAndChain(DAG, C.Op0, C.Opcode);
+ return SDValue(Node, 0);
case ISD::INTRINSIC_WO_CHAIN:
- Op = emitIntrinsicWithGlue(DAG, C.Op0, C.Opcode);
- break;
+ Node = emitIntrinsicWithCC(DAG, C.Op0, C.Opcode);
+ return SDValue(Node, Node->getNumValues() - 1);
default:
llvm_unreachable("Invalid comparison operands");
}
- return SDValue(Op.getNode(), Op->getNumValues() - 1);
}
if (C.Opcode == SystemZISD::ICMP)
- return DAG.getNode(SystemZISD::ICMP, DL, MVT::Glue, C.Op0, C.Op1,
+ return DAG.getNode(SystemZISD::ICMP, DL, MVT::i32, C.Op0, C.Op1,
DAG.getConstant(C.ICmpType, DL, MVT::i32));
if (C.Opcode == SystemZISD::TM) {
bool RegisterOnly = (bool(C.CCMask & SystemZ::CCMASK_TM_MIXED_MSB_0) !=
bool(C.CCMask & SystemZ::CCMASK_TM_MIXED_MSB_1));
- return DAG.getNode(SystemZISD::TM, DL, MVT::Glue, C.Op0, C.Op1,
+ return DAG.getNode(SystemZISD::TM, DL, MVT::i32, C.Op0, C.Op1,
DAG.getConstant(RegisterOnly, DL, MVT::i32));
}
- return DAG.getNode(C.Opcode, DL, MVT::Glue, C.Op0, C.Op1);
+ return DAG.getNode(C.Opcode, DL, MVT::i32, C.Op0, C.Op1);
}
// Implement a 32-bit *MUL_LOHI operation by extending both operands to
@@ -2383,29 +2371,16 @@ static void lowerGR128Binary(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
Odd = DAG.getTargetExtractSubreg(SystemZ::odd128(Is32Bit), DL, VT, Result);
}
-// Return an i32 value that is 1 if the CC value produced by Glue is
+// Return an i32 value that is 1 if the CC value produced by CCReg is
// in the mask CCMask and 0 otherwise. CC is known to have a value
// in CCValid, so other values can be ignored.
-static SDValue emitSETCC(SelectionDAG &DAG, const SDLoc &DL, SDValue Glue,
+static SDValue emitSETCC(SelectionDAG &DAG, const SDLoc &DL, SDValue CCReg,
unsigned CCValid, unsigned CCMask) {
- IPMConversion Conversion = getIPMConversion(CCValid, CCMask);
- SDValue Result = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, Glue);
-
- if (Conversion.XORValue)
- Result = DAG.getNode(ISD::XOR, DL, MVT::i32, Result,
- DAG.getConstant(Conversion.XORValue, DL, MVT::i32));
-
- if (Conversion.AddValue)
- Result = DAG.getNode(ISD::ADD, DL, MVT::i32, Result,
- DAG.getConstant(Conversion.AddValue, DL, MVT::i32));
-
- // The SHR/AND sequence should get optimized to an RISBG.
- Result = DAG.getNode(ISD::SRL, DL, MVT::i32, Result,
- DAG.getConstant(Conversion.Bit, DL, MVT::i32));
- if (Conversion.Bit != 31)
- Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
- DAG.getConstant(1, DL, MVT::i32));
- return Result;
+ SDValue Ops[] = { DAG.getConstant(1, DL, MVT::i32),
+ DAG.getConstant(0, DL, MVT::i32),
+ DAG.getConstant(CCValid, DL, MVT::i32),
+ DAG.getConstant(CCMask, DL, MVT::i32), CCReg };
+ return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, MVT::i32, Ops);
}
// Return the SystemISD vector comparison operation for CC, or 0 if it cannot
@@ -2554,8 +2529,8 @@ SDValue SystemZTargetLowering::lowerSETCC(SDValue Op,
return lowerVectorSETCC(DAG, DL, VT, CC, CmpOp0, CmpOp1);
Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC, DL));
- SDValue Glue = emitCmp(DAG, DL, C);
- return emitSETCC(DAG, DL, Glue, C.CCValid, C.CCMask);
+ SDValue CCReg = emitCmp(DAG, DL, C);
+ return emitSETCC(DAG, DL, CCReg, C.CCValid, C.CCMask);
}
SDValue SystemZTargetLowering::lowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
@@ -2566,10 +2541,10 @@ SDValue SystemZTargetLowering::lowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC, DL));
- SDValue Glue = emitCmp(DAG, DL, C);
+ SDValue CCReg = emitCmp(DAG, DL, C);
return DAG.getNode(SystemZISD::BR_CCMASK, DL, Op.getValueType(),
Op.getOperand(0), DAG.getConstant(C.CCValid, DL, MVT::i32),
- DAG.getConstant(C.CCMask, DL, MVT::i32), Dest, Glue);
+ DAG.getConstant(C.CCMask, DL, MVT::i32), Dest, CCReg);
}
// Return true if Pos is CmpOp and Neg is the negative of CmpOp,
@@ -2619,36 +2594,11 @@ SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op,
return getAbsolute(DAG, DL, FalseOp, C.CCMask & SystemZ::CCMASK_CMP_GT);
}
- SDValue Glue = emitCmp(DAG, DL, C);
-
- // Special case for handling -1/0 results. The shifts we use here
- // should get optimized with the IPM conversion sequence.
- auto *TrueC = dyn_cast<ConstantSDNode>(TrueOp);
- auto *FalseC = dyn_cast<ConstantSDNode>(FalseOp);
- if (TrueC && FalseC) {
- int64_t TrueVal = TrueC->getSExtValue();
- int64_t FalseVal = FalseC->getSExtValue();
- if ((TrueVal == -1 && FalseVal == 0) || (TrueVal == 0 && FalseVal == -1)) {
- // Invert the condition if we want -1 on false.
- if (TrueVal == 0)
- C.CCMask ^= C.CCValid;
- SDValue Result = emitSETCC(DAG, DL, Glue, C.CCValid, C.CCMask);
- EVT VT = Op.getValueType();
- // Extend the result to VT. Upper bits are ignored.
- if (!is32Bit(VT))
- Result = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Result);
- // Sign-extend from the low bit.
- SDValue ShAmt = DAG.getConstant(VT.getSizeInBits() - 1, DL, MVT::i32);
- SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Result, ShAmt);
- return DAG.getNode(ISD::SRA, DL, VT, Shl, ShAmt);
- }
- }
-
+ SDValue CCReg = emitCmp(DAG, DL, C);
SDValue Ops[] = {TrueOp, FalseOp, DAG.getConstant(C.CCValid, DL, MVT::i32),
- DAG.getConstant(C.CCMask, DL, MVT::i32), Glue};
+ DAG.getConstant(C.CCMask, DL, MVT::i32), CCReg};
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
- return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, VTs, Ops);
+ return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, Op.getValueType(), Ops);
}
SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node,
@@ -2757,7 +2707,7 @@ SDValue SystemZTargetLowering::lowerThreadPointer(const SDLoc &DL,
SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
SelectionDAG &DAG) const {
- if (DAG.getTarget().Options.EmulatedTLS)
+ if (DAG.getTarget().useEmulatedTLS())
return LowerToTLSEmulatedModel(Node, DAG);
SDLoc DL(Node);
const GlobalValue *GV = Node->getGlobal();
@@ -3266,6 +3216,99 @@ SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const {
MVT::i64, HighOp, Low32);
}
+// Lower SADDO/SSUBO/UADDO/USUBO nodes.
+SDValue SystemZTargetLowering::lowerXALUO(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDNode *N = Op.getNode();
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDLoc DL(N);
+ unsigned BaseOp = 0;
+ unsigned CCValid = 0;
+ unsigned CCMask = 0;
+
+ switch (Op.getOpcode()) {
+ default: llvm_unreachable("Unknown instruction!");
+ case ISD::SADDO:
+ BaseOp = SystemZISD::SADDO;
+ CCValid = SystemZ::CCMASK_ARITH;
+ CCMask = SystemZ::CCMASK_ARITH_OVERFLOW;
+ break;
+ case ISD::SSUBO:
+ BaseOp = SystemZISD::SSUBO;
+ CCValid = SystemZ::CCMASK_ARITH;
+ CCMask = SystemZ::CCMASK_ARITH_OVERFLOW;
+ break;
+ case ISD::UADDO:
+ BaseOp = SystemZISD::UADDO;
+ CCValid = SystemZ::CCMASK_LOGICAL;
+ CCMask = SystemZ::CCMASK_LOGICAL_CARRY;
+ break;
+ case ISD::USUBO:
+ BaseOp = SystemZISD::USUBO;
+ CCValid = SystemZ::CCMASK_LOGICAL;
+ CCMask = SystemZ::CCMASK_LOGICAL_BORROW;
+ break;
+ }
+
+ SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
+ SDValue Result = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
+
+ SDValue SetCC = emitSETCC(DAG, DL, Result.getValue(1), CCValid, CCMask);
+ if (N->getValueType(1) == MVT::i1)
+ SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
+
+ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, SetCC);
+}
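+// Sketch of the result for an i64 unsigned add with overflow: the UADDO node
+// built above produces (sum : i64, CC : i32), and the overflow flag is
+// rebuilt by emitSETCC as a SELECT_CCMASK choosing 1/0 under CCMASK_LOGICAL /
+// CCMASK_LOGICAL_CARRY, truncated back to i1 when the original node's second
+// result is i1.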
+
+// Lower ADDCARRY/SUBCARRY nodes.
+SDValue SystemZTargetLowering::lowerADDSUBCARRY(SDValue Op,
+ SelectionDAG &DAG) const {
+
+ SDNode *N = Op.getNode();
+ MVT VT = N->getSimpleValueType(0);
+
+ // Let legalize expand this if it isn't a legal type yet.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDValue Carry = Op.getOperand(2);
+ SDLoc DL(N);
+ unsigned BaseOp = 0;
+ unsigned CCValid = 0;
+ unsigned CCMask = 0;
+
+ switch (Op.getOpcode()) {
+ default: llvm_unreachable("Unknown instruction!");
+ case ISD::ADDCARRY:
+ BaseOp = SystemZISD::ADDCARRY;
+ CCValid = SystemZ::CCMASK_LOGICAL;
+ CCMask = SystemZ::CCMASK_LOGICAL_CARRY;
+ break;
+ case ISD::SUBCARRY:
+ BaseOp = SystemZISD::SUBCARRY;
+ CCValid = SystemZ::CCMASK_LOGICAL;
+ CCMask = SystemZ::CCMASK_LOGICAL_BORROW;
+ break;
+ }
+
+ // Set the condition code from the carry flag.
+ Carry = DAG.getNode(SystemZISD::GET_CCMASK, DL, MVT::i32, Carry,
+ DAG.getConstant(CCValid, DL, MVT::i32),
+ DAG.getConstant(CCMask, DL, MVT::i32));
+
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+ SDValue Result = DAG.getNode(BaseOp, DL, VTs, LHS, RHS, Carry);
+
+ SDValue SetCC = emitSETCC(DAG, DL, Result.getValue(1), CCValid, CCMask);
+ if (N->getValueType(1) == MVT::i1)
+ SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
+
+ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, SetCC);
+}
+
SDValue SystemZTargetLowering::lowerCTPOP(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
@@ -3512,16 +3555,16 @@ SDValue SystemZTargetLowering::lowerATOMIC_CMP_SWAP(SDValue Op,
EVT NarrowVT = Node->getMemoryVT();
EVT WideVT = NarrowVT == MVT::i64 ? MVT::i64 : MVT::i32;
if (NarrowVT == WideVT) {
- SDVTList Tys = DAG.getVTList(WideVT, MVT::Other, MVT::Glue);
+ SDVTList Tys = DAG.getVTList(WideVT, MVT::i32, MVT::Other);
SDValue Ops[] = { ChainIn, Addr, CmpVal, SwapVal };
SDValue AtomicOp = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_CMP_SWAP,
DL, Tys, Ops, NarrowVT, MMO);
- SDValue Success = emitSETCC(DAG, DL, AtomicOp.getValue(2),
+ SDValue Success = emitSETCC(DAG, DL, AtomicOp.getValue(1),
SystemZ::CCMASK_CS, SystemZ::CCMASK_CS_EQ);
DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), AtomicOp.getValue(0));
DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
- DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), AtomicOp.getValue(1));
+ DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), AtomicOp.getValue(2));
return SDValue();
}
@@ -3546,17 +3589,17 @@ SDValue SystemZTargetLowering::lowerATOMIC_CMP_SWAP(SDValue Op,
DAG.getConstant(0, DL, WideVT), BitShift);
// Construct the ATOMIC_CMP_SWAPW node.
- SDVTList VTList = DAG.getVTList(WideVT, MVT::Other, MVT::Glue);
+ SDVTList VTList = DAG.getVTList(WideVT, MVT::i32, MVT::Other);
SDValue Ops[] = { ChainIn, AlignedAddr, CmpVal, SwapVal, BitShift,
NegBitShift, DAG.getConstant(BitSize, DL, WideVT) };
SDValue AtomicOp = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_CMP_SWAPW, DL,
VTList, Ops, NarrowVT, MMO);
- SDValue Success = emitSETCC(DAG, DL, AtomicOp.getValue(2),
+ SDValue Success = emitSETCC(DAG, DL, AtomicOp.getValue(1),
SystemZ::CCMASK_ICMP, SystemZ::CCMASK_CMP_EQ);
DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), AtomicOp.getValue(0));
DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
- DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), AtomicOp.getValue(1));
+ DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), AtomicOp.getValue(2));
return SDValue();
}
@@ -3613,12 +3656,10 @@ SDValue SystemZTargetLowering::lowerPREFETCH(SDValue Op,
Node->getMemoryVT(), Node->getMemOperand());
}
-// Return an i32 that contains the value of CC immediately after After,
-// whose final operand must be MVT::Glue.
-static SDValue getCCResult(SelectionDAG &DAG, SDNode *After) {
- SDLoc DL(After);
- SDValue Glue = SDValue(After, After->getNumValues() - 1);
- SDValue IPM = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, Glue);
+// Convert condition code in CCReg to an i32 value.
+static SDValue getCCResult(SelectionDAG &DAG, SDValue CCReg) {
+ SDLoc DL(CCReg);
+ SDValue IPM = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, CCReg);
return DAG.getNode(ISD::SRL, DL, MVT::i32, IPM,
DAG.getConstant(SystemZ::IPM_CC, DL, MVT::i32));
}
@@ -3629,8 +3670,8 @@ SystemZTargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
unsigned Opcode, CCValid;
if (isIntrinsicWithCCAndChain(Op, Opcode, CCValid)) {
assert(Op->getNumValues() == 2 && "Expected only CC result and chain");
- SDValue Glued = emitIntrinsicWithChainAndGlue(DAG, Op, Opcode);
- SDValue CC = getCCResult(DAG, Glued.getNode());
+ SDNode *Node = emitIntrinsicWithCCAndChain(DAG, Op, Opcode);
+ SDValue CC = getCCResult(DAG, SDValue(Node, 0));
DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), CC);
return SDValue();
}
@@ -3643,13 +3684,12 @@ SystemZTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned Opcode, CCValid;
if (isIntrinsicWithCC(Op, Opcode, CCValid)) {
- SDValue Glued = emitIntrinsicWithGlue(DAG, Op, Opcode);
- SDValue CC = getCCResult(DAG, Glued.getNode());
+ SDNode *Node = emitIntrinsicWithCC(DAG, Op, Opcode);
if (Op->getNumValues() == 1)
- return CC;
+ return getCCResult(DAG, SDValue(Node, 0));
assert(Op->getNumValues() == 2 && "Expected a CC and non-CC result");
- return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), Op->getVTList(), Glued,
- CC);
+ return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), Op->getVTList(),
+ SDValue(Node, 0), getCCResult(DAG, SDValue(Node, 1)));
}
unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
@@ -3853,20 +3893,34 @@ static const Permute *matchDoublePermute(const SmallVectorImpl<int> &Bytes,
return nullptr;
}
-// Convert the mask of the given VECTOR_SHUFFLE into a byte-level mask,
+// Convert the mask of the given shuffle op into a byte-level mask,
// as if it had type vNi8.
-static void getVPermMask(ShuffleVectorSDNode *VSN,
+static bool getVPermMask(SDValue ShuffleOp,
SmallVectorImpl<int> &Bytes) {
- EVT VT = VSN->getValueType(0);
+ EVT VT = ShuffleOp.getValueType();
unsigned NumElements = VT.getVectorNumElements();
unsigned BytesPerElement = VT.getVectorElementType().getStoreSize();
- Bytes.resize(NumElements * BytesPerElement, -1);
- for (unsigned I = 0; I < NumElements; ++I) {
- int Index = VSN->getMaskElt(I);
- if (Index >= 0)
+
+ if (auto *VSN = dyn_cast<ShuffleVectorSDNode>(ShuffleOp)) {
+ Bytes.resize(NumElements * BytesPerElement, -1);
+ for (unsigned I = 0; I < NumElements; ++I) {
+ int Index = VSN->getMaskElt(I);
+ if (Index >= 0)
+ for (unsigned J = 0; J < BytesPerElement; ++J)
+ Bytes[I * BytesPerElement + J] = Index * BytesPerElement + J;
+ }
+ return true;
+ }
+ if (SystemZISD::SPLAT == ShuffleOp.getOpcode() &&
+ isa<ConstantSDNode>(ShuffleOp.getOperand(1))) {
+ unsigned Index = ShuffleOp.getConstantOperandVal(1);
+ Bytes.resize(NumElements * BytesPerElement, -1);
+ for (unsigned I = 0; I < NumElements; ++I)
for (unsigned J = 0; J < BytesPerElement; ++J)
Bytes[I * BytesPerElement + J] = Index * BytesPerElement + J;
+ return true;
}
+ return false;
}
// Bytes is a VPERM-like permute vector, except that -1 is used for
@@ -4035,7 +4089,8 @@ bool GeneralShuffle::add(SDValue Op, unsigned Elem) {
// See whether the bytes we need come from a contiguous part of one
// operand.
SmallVector<int, SystemZ::VectorBytes> OpBytes;
- getVPermMask(cast<ShuffleVectorSDNode>(Op), OpBytes);
+ if (!getVPermMask(Op, OpBytes))
+ break;
int NewByte;
if (!getShuffleInput(OpBytes, Byte, BytesPerElement, NewByte))
break;
@@ -4217,9 +4272,9 @@ static bool tryBuildVectorByteMask(BuildVectorSDNode *BVN, uint64_t &Mask) {
if (!Op.isUndef()) {
uint64_t Value;
if (Op.getOpcode() == ISD::Constant)
- Value = dyn_cast<ConstantSDNode>(Op)->getZExtValue();
+ Value = cast<ConstantSDNode>(Op)->getZExtValue();
else if (Op.getOpcode() == ISD::ConstantFP)
- Value = (dyn_cast<ConstantFPSDNode>(Op)->getValueAPF().bitcastToAPInt()
+ Value = (cast<ConstantFPSDNode>(Op)->getValueAPF().bitcastToAPInt()
.getZExtValue());
else
return false;
@@ -4245,12 +4300,15 @@ static SDValue tryBuildVectorReplicate(SelectionDAG &DAG,
const SDLoc &DL, EVT VT, uint64_t Value,
unsigned BitsPerElement) {
// Signed 16-bit values can be replicated using VREPI.
+ // Mark the constants as opaque or DAGCombiner will convert back to
+ // BUILD_VECTOR.
int64_t SignedValue = SignExtend64(Value, BitsPerElement);
if (isInt<16>(SignedValue)) {
MVT VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement),
SystemZ::VectorBits / BitsPerElement);
- SDValue Op = DAG.getNode(SystemZISD::REPLICATE, DL, VecVT,
- DAG.getConstant(SignedValue, DL, MVT::i32));
+ SDValue Op = DAG.getNode(
+ SystemZISD::REPLICATE, DL, VecVT,
+ DAG.getConstant(SignedValue, DL, MVT::i32, false, true /*isOpaque*/));
return DAG.getNode(ISD::BITCAST, DL, VT, Op);
}
// See whether rotating the constant left some N places gives a value that
@@ -4266,9 +4324,10 @@ static SDValue tryBuildVectorReplicate(SelectionDAG &DAG,
End -= 64 - BitsPerElement;
MVT VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement),
SystemZ::VectorBits / BitsPerElement);
- SDValue Op = DAG.getNode(SystemZISD::ROTATE_MASK, DL, VecVT,
- DAG.getConstant(Start, DL, MVT::i32),
- DAG.getConstant(End, DL, MVT::i32));
+ SDValue Op = DAG.getNode(
+ SystemZISD::ROTATE_MASK, DL, VecVT,
+ DAG.getConstant(Start, DL, MVT::i32, false, true /*isOpaque*/),
+ DAG.getConstant(End, DL, MVT::i32, false, true /*isOpaque*/));
return DAG.getNode(ISD::BITCAST, DL, VT, Op);
}
return SDValue();
@@ -4481,8 +4540,9 @@ SDValue SystemZTargetLowering::lowerBUILD_VECTOR(SDValue Op,
// priority over other methods below.
uint64_t Mask = 0;
if (tryBuildVectorByteMask(BVN, Mask)) {
- SDValue Op = DAG.getNode(SystemZISD::BYTE_MASK, DL, MVT::v16i8,
- DAG.getConstant(Mask, DL, MVT::i32));
+ SDValue Op = DAG.getNode(
+ SystemZISD::BYTE_MASK, DL, MVT::v16i8,
+ DAG.getConstant(Mask, DL, MVT::i32, false, true /*isOpaque*/));
return DAG.getNode(ISD::BITCAST, DL, VT, Op);
}
@@ -4597,7 +4657,7 @@ SDValue SystemZTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
Op1.getOpcode() != ISD::BITCAST &&
Op1.getOpcode() != ISD::ConstantFP &&
Op2.getOpcode() == ISD::Constant) {
- uint64_t Index = dyn_cast<ConstantSDNode>(Op2)->getZExtValue();
+ uint64_t Index = cast<ConstantSDNode>(Op2)->getZExtValue();
unsigned Mask = VT.getVectorNumElements() - 1;
if (Index <= Mask)
return Op;
@@ -4753,6 +4813,14 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
return lowerSDIVREM(Op, DAG);
case ISD::UDIVREM:
return lowerUDIVREM(Op, DAG);
+ case ISD::SADDO:
+ case ISD::SSUBO:
+ case ISD::UADDO:
+ case ISD::USUBO:
+ return lowerXALUO(Op, DAG);
+ case ISD::ADDCARRY:
+ case ISD::SUBCARRY:
+ return lowerADDSUBCARRY(Op, DAG);
case ISD::OR:
return lowerOR(Op, DAG);
case ISD::CTPOP:
@@ -4881,19 +4949,19 @@ SystemZTargetLowering::LowerOperationWrapper(SDNode *N,
}
case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
SDLoc DL(N);
- SDVTList Tys = DAG.getVTList(MVT::Untyped, MVT::Other, MVT::Glue);
+ SDVTList Tys = DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other);
SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
lowerI128ToGR128(DAG, N->getOperand(2)),
lowerI128ToGR128(DAG, N->getOperand(3)) };
MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
SDValue Res = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_CMP_SWAP_128,
DL, Tys, Ops, MVT::i128, MMO);
- SDValue Success = emitSETCC(DAG, DL, Res.getValue(2),
+ SDValue Success = emitSETCC(DAG, DL, Res.getValue(1),
SystemZ::CCMASK_CS, SystemZ::CCMASK_CS_EQ);
Success = DAG.getZExtOrTrunc(Success, DL, N->getValueType(1));
Results.push_back(lowerGR128ToI128(DAG, Res));
Results.push_back(Success);
- Results.push_back(Res.getValue(1));
+ Results.push_back(Res.getValue(2));
break;
}
default:
@@ -4931,6 +4999,13 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
OPCODE(UMUL_LOHI);
OPCODE(SDIVREM);
OPCODE(UDIVREM);
+ OPCODE(SADDO);
+ OPCODE(SSUBO);
+ OPCODE(UADDO);
+ OPCODE(USUBO);
+ OPCODE(ADDCARRY);
+ OPCODE(SUBCARRY);
+ OPCODE(GET_CCMASK);
OPCODE(MVC);
OPCODE(MVC_LOOP);
OPCODE(NC);
@@ -5049,13 +5124,14 @@ SDValue SystemZTargetLowering::combineExtract(const SDLoc &DL, EVT ResVT,
if (Opcode == ISD::BITCAST)
// Look through bitcasts.
Op = Op.getOperand(0);
- else if (Opcode == ISD::VECTOR_SHUFFLE &&
+ else if ((Opcode == ISD::VECTOR_SHUFFLE || Opcode == SystemZISD::SPLAT) &&
canTreatAsByteVector(Op.getValueType())) {
// Get a VPERM-like permute mask and see whether the bytes covered
// by the extracted element are a contiguous sequence from one
// source operand.
SmallVector<int, SystemZ::VectorBytes> Bytes;
- getVPermMask(cast<ShuffleVectorSDNode>(Op), Bytes);
+ if (!getVPermMask(Op, Bytes))
+ break;
int First;
if (!getShuffleInput(Bytes, Index * BytesPerElement,
BytesPerElement, First))
@@ -5174,6 +5250,54 @@ SDValue SystemZTargetLowering::combineTruncateExtract(
return SDValue();
}
+SDValue SystemZTargetLowering::combineZERO_EXTEND(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ // Convert (zext (select_ccmask C1, C2)) into (select_ccmask C1', C2')
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ if (N0.getOpcode() == SystemZISD::SELECT_CCMASK) {
+ auto *TrueOp = dyn_cast<ConstantSDNode>(N0.getOperand(0));
+ auto *FalseOp = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (TrueOp && FalseOp) {
+ SDLoc DL(N0);
+ SDValue Ops[] = { DAG.getConstant(TrueOp->getZExtValue(), DL, VT),
+ DAG.getConstant(FalseOp->getZExtValue(), DL, VT),
+ N0.getOperand(2), N0.getOperand(3), N0.getOperand(4) };
+ SDValue NewSelect = DAG.getNode(SystemZISD::SELECT_CCMASK, DL, VT, Ops);
+ // If N0 has multiple uses, change other uses as well.
+ if (!N0.hasOneUse()) {
+ SDValue TruncSelect =
+ DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), NewSelect);
+ DCI.CombineTo(N0.getNode(), TruncSelect);
+ }
+ return NewSelect;
+ }
+ }
+ return SDValue();
+}
+
+SDValue SystemZTargetLowering::combineSIGN_EXTEND_INREG(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ // Convert (sext_in_reg (setcc LHS, RHS, COND), i1)
+ // and (sext_in_reg (any_extend (setcc LHS, RHS, COND)), i1)
+ // into (select_cc LHS, RHS, -1, 0, COND)
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ EVT EVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+ if (N0.hasOneUse() && N0.getOpcode() == ISD::ANY_EXTEND)
+ N0 = N0.getOperand(0);
+ if (EVT == MVT::i1 && N0.hasOneUse() && N0.getOpcode() == ISD::SETCC) {
+ SDLoc DL(N0);
+ SDValue Ops[] = { N0.getOperand(0), N0.getOperand(1),
+ DAG.getConstant(-1, DL, VT), DAG.getConstant(0, DL, VT),
+ N0.getOperand(2) };
+ return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops);
+ }
+ return SDValue();
+}
+
SDValue SystemZTargetLowering::combineSIGN_EXTEND(
SDNode *N, DAGCombinerInfo &DCI) const {
// Convert (sext (ashr (shl X, C1), C2)) to
@@ -5249,7 +5373,7 @@ SDValue SystemZTargetLowering::combineSTORE(
// for the extraction to be done on a vMiN value, so that we can use VSTE.
// If X has wider elements then convert it to:
// (truncstoreiN (extract_vector_elt (bitcast X), Y2), Z).
- if (MemVT.isInteger()) {
+ if (MemVT.isInteger() && SN->isTruncatingStore()) {
if (SDValue Value =
combineTruncateExtract(SDLoc(N), MemVT, SN->getValue(), DCI)) {
DCI.AddToWorklist(Value.getNode());
@@ -5261,9 +5385,7 @@ SDValue SystemZTargetLowering::combineSTORE(
}
}
// Combine STORE (BSWAP) into STRVH/STRV/STRVG
- // See comment in combineBSWAP about volatile accesses.
if (!SN->isTruncatingStore() &&
- !SN->isVolatile() &&
Op1.getOpcode() == ISD::BSWAP &&
Op1.getNode()->hasOneUse() &&
(Op1.getValueType() == MVT::i16 ||
@@ -5364,13 +5486,10 @@ SDValue SystemZTargetLowering::combineBSWAP(
SDNode *N, DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
// Combine BSWAP (LOAD) into LRVH/LRV/LRVG
- // These loads are allowed to access memory multiple times, and so we must check
- // that the loads are not volatile before performing the combine.
if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
N->getOperand(0).hasOneUse() &&
(N->getValueType(0) == MVT::i16 || N->getValueType(0) == MVT::i32 ||
- N->getValueType(0) == MVT::i64) &&
- !cast<LoadSDNode>(N->getOperand(0))->isVolatile()) {
+ N->getValueType(0) == MVT::i64)) {
SDValue Load = N->getOperand(0);
LoadSDNode *LD = cast<LoadSDNode>(Load);
@@ -5475,11 +5594,157 @@ SDValue SystemZTargetLowering::combineSHIFTROT(
return SDValue();
}
+static bool combineCCMask(SDValue &CCReg, int &CCValid, int &CCMask) {
+ // We have a SELECT_CCMASK or BR_CCMASK comparing the condition code
+  // set by the CCReg instruction using the CCValid / CCMask masks.
+  // If the CCReg instruction is itself an (ICMP (SELECT_CCMASK)) testing
+ // the condition code set by some other instruction, see whether we
+ // can directly use that condition code.
+ bool Invert = false;
+
+  // Verify that we have an appropriate mask for an EQ or NE comparison.
+ if (CCValid != SystemZ::CCMASK_ICMP)
+ return false;
+ if (CCMask == SystemZ::CCMASK_CMP_NE)
+ Invert = !Invert;
+ else if (CCMask != SystemZ::CCMASK_CMP_EQ)
+ return false;
+
+ // Verify that we have an ICMP that is the user of a SELECT_CCMASK.
+ SDNode *ICmp = CCReg.getNode();
+ if (ICmp->getOpcode() != SystemZISD::ICMP)
+ return false;
+ SDNode *Select = ICmp->getOperand(0).getNode();
+ if (Select->getOpcode() != SystemZISD::SELECT_CCMASK)
+ return false;
+
+  // Verify that the ICMP compares against one of the select values.
+ auto *CompareVal = dyn_cast<ConstantSDNode>(ICmp->getOperand(1));
+ if (!CompareVal)
+ return false;
+ auto *TrueVal = dyn_cast<ConstantSDNode>(Select->getOperand(0));
+ if (!TrueVal)
+ return false;
+ auto *FalseVal = dyn_cast<ConstantSDNode>(Select->getOperand(1));
+ if (!FalseVal)
+ return false;
+ if (CompareVal->getZExtValue() == FalseVal->getZExtValue())
+ Invert = !Invert;
+ else if (CompareVal->getZExtValue() != TrueVal->getZExtValue())
+ return false;
+
+ // Compute the effective CC mask for the new branch or select.
+ auto *NewCCValid = dyn_cast<ConstantSDNode>(Select->getOperand(2));
+ auto *NewCCMask = dyn_cast<ConstantSDNode>(Select->getOperand(3));
+ if (!NewCCValid || !NewCCMask)
+ return false;
+ CCValid = NewCCValid->getZExtValue();
+ CCMask = NewCCMask->getZExtValue();
+ if (Invert)
+ CCMask ^= CCValid;
+
+ // Return the updated CCReg link.
+ CCReg = Select->getOperand(4);
+ return true;
+}
+
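The inversion step in combineCCMask is the only non-obvious arithmetic: a CC test is a pair (CCValid, CCMask), and negating the test while staying inside the valid set is an XOR. A standalone sketch (plain C++ with made-up mask values, not the SystemZ encodings):

#include <cassert>

int main() {
  // Bit i set means "CC value i is possible" (CCValid) or "test succeeds for
  // CC value i" (CCMask). The values here are illustrative only.
  unsigned CCValid = 0xC;                 // CC values 2 and 3 can occur
  unsigned CCMask  = 0x4;                 // test succeeds for CC == 2
  unsigned Inverted = CCMask ^ CCValid;   // the "CCMask ^= CCValid" step above
  assert(Inverted == 0x8);                // now succeeds for CC == 3 instead
  return 0;
}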
+SDValue SystemZTargetLowering::combineBR_CCMASK(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+
+ // Combine BR_CCMASK (ICMP (SELECT_CCMASK)) into a single BR_CCMASK.
+ auto *CCValid = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ auto *CCMask = dyn_cast<ConstantSDNode>(N->getOperand(2));
+ if (!CCValid || !CCMask)
+ return SDValue();
+
+ int CCValidVal = CCValid->getZExtValue();
+ int CCMaskVal = CCMask->getZExtValue();
+ SDValue Chain = N->getOperand(0);
+ SDValue CCReg = N->getOperand(4);
+
+ if (combineCCMask(CCReg, CCValidVal, CCMaskVal))
+ return DAG.getNode(SystemZISD::BR_CCMASK, SDLoc(N), N->getValueType(0),
+ Chain,
+ DAG.getConstant(CCValidVal, SDLoc(N), MVT::i32),
+ DAG.getConstant(CCMaskVal, SDLoc(N), MVT::i32),
+ N->getOperand(3), CCReg);
+ return SDValue();
+}
+
+SDValue SystemZTargetLowering::combineSELECT_CCMASK(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+
+ // Combine SELECT_CCMASK (ICMP (SELECT_CCMASK)) into a single SELECT_CCMASK.
+ auto *CCValid = dyn_cast<ConstantSDNode>(N->getOperand(2));
+ auto *CCMask = dyn_cast<ConstantSDNode>(N->getOperand(3));
+ if (!CCValid || !CCMask)
+ return SDValue();
+
+ int CCValidVal = CCValid->getZExtValue();
+ int CCMaskVal = CCMask->getZExtValue();
+ SDValue CCReg = N->getOperand(4);
+
+ if (combineCCMask(CCReg, CCValidVal, CCMaskVal))
+ return DAG.getNode(SystemZISD::SELECT_CCMASK, SDLoc(N), N->getValueType(0),
+ N->getOperand(0),
+ N->getOperand(1),
+ DAG.getConstant(CCValidVal, SDLoc(N), MVT::i32),
+ DAG.getConstant(CCMaskVal, SDLoc(N), MVT::i32),
+ CCReg);
+ return SDValue();
+}
+
+
+SDValue SystemZTargetLowering::combineGET_CCMASK(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+
+ // Optimize away GET_CCMASK (SELECT_CCMASK) if the CC masks are compatible
+ auto *CCValid = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ auto *CCMask = dyn_cast<ConstantSDNode>(N->getOperand(2));
+ if (!CCValid || !CCMask)
+ return SDValue();
+ int CCValidVal = CCValid->getZExtValue();
+ int CCMaskVal = CCMask->getZExtValue();
+
+ SDValue Select = N->getOperand(0);
+ if (Select->getOpcode() != SystemZISD::SELECT_CCMASK)
+ return SDValue();
+
+ auto *SelectCCValid = dyn_cast<ConstantSDNode>(Select->getOperand(2));
+ auto *SelectCCMask = dyn_cast<ConstantSDNode>(Select->getOperand(3));
+ if (!SelectCCValid || !SelectCCMask)
+ return SDValue();
+ int SelectCCValidVal = SelectCCValid->getZExtValue();
+ int SelectCCMaskVal = SelectCCMask->getZExtValue();
+
+ auto *TrueVal = dyn_cast<ConstantSDNode>(Select->getOperand(0));
+ auto *FalseVal = dyn_cast<ConstantSDNode>(Select->getOperand(1));
+ if (!TrueVal || !FalseVal)
+ return SDValue();
+ if (TrueVal->getZExtValue() != 0 && FalseVal->getZExtValue() == 0)
+ ;
+ else if (TrueVal->getZExtValue() == 0 && FalseVal->getZExtValue() != 0)
+ SelectCCMaskVal ^= SelectCCValidVal;
+ else
+ return SDValue();
+
+ if (SelectCCValidVal & ~CCValidVal)
+ return SDValue();
+ if (SelectCCMaskVal != (CCMaskVal & SelectCCValidVal))
+ return SDValue();
+
+ return Select->getOperand(4);
+}
+
SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
switch(N->getOpcode()) {
default: break;
+ case ISD::ZERO_EXTEND: return combineZERO_EXTEND(N, DCI);
case ISD::SIGN_EXTEND: return combineSIGN_EXTEND(N, DCI);
+ case ISD::SIGN_EXTEND_INREG: return combineSIGN_EXTEND_INREG(N, DCI);
case SystemZISD::MERGE_HIGH:
case SystemZISD::MERGE_LOW: return combineMERGE(N, DCI);
case ISD::STORE: return combineSTORE(N, DCI);
@@ -5491,11 +5756,303 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SRA:
case ISD::SRL:
case ISD::ROTL: return combineSHIFTROT(N, DCI);
+ case SystemZISD::BR_CCMASK: return combineBR_CCMASK(N, DCI);
+ case SystemZISD::SELECT_CCMASK: return combineSELECT_CCMASK(N, DCI);
+ case SystemZISD::GET_CCMASK: return combineGET_CCMASK(N, DCI);
}
return SDValue();
}
+// Return the demanded elements for the OpNo source operand of Op. DemandedElts
+// are the demanded elements of Op itself.
+static APInt getDemandedSrcElements(SDValue Op, const APInt &DemandedElts,
+ unsigned OpNo) {
+ EVT VT = Op.getValueType();
+ unsigned NumElts = (VT.isVector() ? VT.getVectorNumElements() : 1);
+ APInt SrcDemE;
+ unsigned Opcode = Op.getOpcode();
+ if (Opcode == ISD::INTRINSIC_WO_CHAIN) {
+ unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ switch (Id) {
+ case Intrinsic::s390_vpksh: // PACKS
+ case Intrinsic::s390_vpksf:
+ case Intrinsic::s390_vpksg:
+ case Intrinsic::s390_vpkshs: // PACKS_CC
+ case Intrinsic::s390_vpksfs:
+ case Intrinsic::s390_vpksgs:
+ case Intrinsic::s390_vpklsh: // PACKLS
+ case Intrinsic::s390_vpklsf:
+ case Intrinsic::s390_vpklsg:
+ case Intrinsic::s390_vpklshs: // PACKLS_CC
+ case Intrinsic::s390_vpklsfs:
+ case Intrinsic::s390_vpklsgs:
+ // VECTOR PACK truncates the elements of two source vectors into one.
+ SrcDemE = DemandedElts;
+ if (OpNo == 2)
+ SrcDemE.lshrInPlace(NumElts / 2);
+ SrcDemE = SrcDemE.trunc(NumElts / 2);
+ break;
+ // VECTOR UNPACK extends half the elements of the source vector.
+ case Intrinsic::s390_vuphb: // VECTOR UNPACK HIGH
+ case Intrinsic::s390_vuphh:
+ case Intrinsic::s390_vuphf:
+ case Intrinsic::s390_vuplhb: // VECTOR UNPACK LOGICAL HIGH
+ case Intrinsic::s390_vuplhh:
+ case Intrinsic::s390_vuplhf:
+ SrcDemE = APInt(NumElts * 2, 0);
+ SrcDemE.insertBits(DemandedElts, 0);
+ break;
+ case Intrinsic::s390_vuplb: // VECTOR UNPACK LOW
+ case Intrinsic::s390_vuplhw:
+ case Intrinsic::s390_vuplf:
+ case Intrinsic::s390_vupllb: // VECTOR UNPACK LOGICAL LOW
+ case Intrinsic::s390_vupllh:
+ case Intrinsic::s390_vupllf:
+ SrcDemE = APInt(NumElts * 2, 0);
+ SrcDemE.insertBits(DemandedElts, NumElts);
+ break;
+ case Intrinsic::s390_vpdi: {
+ // VECTOR PERMUTE DWORD IMMEDIATE selects one element from each source.
+ SrcDemE = APInt(NumElts, 0);
+ if (!DemandedElts[OpNo - 1])
+ break;
+ unsigned Mask = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+ unsigned MaskBit = ((OpNo - 1) ? 1 : 4);
+ // Demand input element 0 or 1, given by the mask bit value.
+ SrcDemE.setBit((Mask & MaskBit)? 1 : 0);
+ break;
+ }
+ case Intrinsic::s390_vsldb: {
+ // VECTOR SHIFT LEFT DOUBLE BY BYTE
+ assert(VT == MVT::v16i8 && "Unexpected type.");
+ unsigned FirstIdx = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+ assert (FirstIdx > 0 && FirstIdx < 16 && "Unused operand.");
+ unsigned NumSrc0Els = 16 - FirstIdx;
+ SrcDemE = APInt(NumElts, 0);
+ if (OpNo == 1) {
+ APInt DemEls = DemandedElts.trunc(NumSrc0Els);
+ SrcDemE.insertBits(DemEls, FirstIdx);
+ } else {
+ APInt DemEls = DemandedElts.lshr(NumSrc0Els);
+ SrcDemE.insertBits(DemEls, 0);
+ }
+ break;
+ }
+ case Intrinsic::s390_vperm:
+ SrcDemE = APInt(NumElts, 1);
+ break;
+ default:
+ llvm_unreachable("Unhandled intrinsic.");
+ break;
+ }
+ } else {
+ switch (Opcode) {
+ case SystemZISD::JOIN_DWORDS:
+ // Scalar operand.
+ SrcDemE = APInt(1, 1);
+ break;
+ case SystemZISD::SELECT_CCMASK:
+ SrcDemE = DemandedElts;
+ break;
+ default:
+ llvm_unreachable("Unhandled opcode.");
+ break;
+ }
+ }
+ return SrcDemE;
+}
+
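The PACK case above maps demanded result elements back onto the two (wider-element) source operands: operand 1 feeds the low-numbered result elements and operand 2, after the shift, the high-numbered ones. A standalone sketch (plain C++ with a uint64_t standing in for APInt, not LLVM code) mirroring that shift/truncate:

#include <cassert>
#include <cstdint>

// NumElts result elements; operand 1 feeds elements [0, NumElts/2),
// operand 2 feeds elements [NumElts/2, NumElts), per the logic above.
uint64_t packSrcDemanded(uint64_t DemandedElts, unsigned NumElts, unsigned OpNo) {
  uint64_t Src = DemandedElts;
  if (OpNo == 2)
    Src >>= NumElts / 2;                       // drop the low half of the mask
  return Src & ((1ULL << (NumElts / 2)) - 1);  // truncate to NumElts/2 elements
}

int main() {
  // A v8i16 result packed from two v4i32 sources; only result element 5 is demanded.
  assert(packSrcDemanded(1u << 5, 8, 1) == 0);          // nothing needed from operand 1
  assert(packSrcDemanded(1u << 5, 8, 2) == (1u << 1));  // element 1 of operand 2
  return 0;
}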
+static void computeKnownBitsBinOp(const SDValue Op, KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG, unsigned Depth,
+ unsigned OpNo) {
+ APInt Src0DemE = getDemandedSrcElements(Op, DemandedElts, OpNo);
+ APInt Src1DemE = getDemandedSrcElements(Op, DemandedElts, OpNo + 1);
+ unsigned SrcBitWidth = Op.getOperand(OpNo).getScalarValueSizeInBits();
+ KnownBits LHSKnown(SrcBitWidth), RHSKnown(SrcBitWidth);
+ DAG.computeKnownBits(Op.getOperand(OpNo), LHSKnown, Src0DemE, Depth + 1);
+ DAG.computeKnownBits(Op.getOperand(OpNo + 1), RHSKnown, Src1DemE, Depth + 1);
+ Known.Zero = LHSKnown.Zero & RHSKnown.Zero;
+ Known.One = LHSKnown.One & RHSKnown.One;
+}
+
+void
+SystemZTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+ KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth) const {
+ Known.resetAll();
+
+ // Intrinsic CC result is returned in the two low bits.
+ unsigned tmp0, tmp1; // not used
+ if (Op.getResNo() == 1 && isIntrinsicWithCC(Op, tmp0, tmp1)) {
+ Known.Zero.setBitsFrom(2);
+ return;
+ }
+ EVT VT = Op.getValueType();
+ if (Op.getResNo() != 0 || VT == MVT::Untyped)
+ return;
+ assert (Known.getBitWidth() == VT.getScalarSizeInBits() &&
+ "KnownBits does not match VT in bitwidth");
+ assert ((!VT.isVector() ||
+ (DemandedElts.getBitWidth() == VT.getVectorNumElements())) &&
+ "DemandedElts does not match VT number of elements");
+ unsigned BitWidth = Known.getBitWidth();
+ unsigned Opcode = Op.getOpcode();
+ if (Opcode == ISD::INTRINSIC_WO_CHAIN) {
+ bool IsLogical = false;
+ unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ switch (Id) {
+ case Intrinsic::s390_vpksh: // PACKS
+ case Intrinsic::s390_vpksf:
+ case Intrinsic::s390_vpksg:
+ case Intrinsic::s390_vpkshs: // PACKS_CC
+ case Intrinsic::s390_vpksfs:
+ case Intrinsic::s390_vpksgs:
+ case Intrinsic::s390_vpklsh: // PACKLS
+ case Intrinsic::s390_vpklsf:
+ case Intrinsic::s390_vpklsg:
+ case Intrinsic::s390_vpklshs: // PACKLS_CC
+ case Intrinsic::s390_vpklsfs:
+ case Intrinsic::s390_vpklsgs:
+ case Intrinsic::s390_vpdi:
+ case Intrinsic::s390_vsldb:
+ case Intrinsic::s390_vperm:
+ computeKnownBitsBinOp(Op, Known, DemandedElts, DAG, Depth, 1);
+ break;
+ case Intrinsic::s390_vuplhb: // VECTOR UNPACK LOGICAL HIGH
+ case Intrinsic::s390_vuplhh:
+ case Intrinsic::s390_vuplhf:
+ case Intrinsic::s390_vupllb: // VECTOR UNPACK LOGICAL LOW
+ case Intrinsic::s390_vupllh:
+ case Intrinsic::s390_vupllf:
+ IsLogical = true;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::s390_vuphb: // VECTOR UNPACK HIGH
+ case Intrinsic::s390_vuphh:
+ case Intrinsic::s390_vuphf:
+ case Intrinsic::s390_vuplb: // VECTOR UNPACK LOW
+ case Intrinsic::s390_vuplhw:
+ case Intrinsic::s390_vuplf: {
+ SDValue SrcOp = Op.getOperand(1);
+ unsigned SrcBitWidth = SrcOp.getScalarValueSizeInBits();
+ Known = KnownBits(SrcBitWidth);
+ APInt SrcDemE = getDemandedSrcElements(Op, DemandedElts, 0);
+ DAG.computeKnownBits(SrcOp, Known, SrcDemE, Depth + 1);
+ if (IsLogical) {
+ Known = Known.zext(BitWidth);
+ Known.Zero.setBitsFrom(SrcBitWidth);
+ } else
+ Known = Known.sext(BitWidth);
+ break;
+ }
+ default:
+ break;
+ }
+ } else {
+ switch (Opcode) {
+ case SystemZISD::JOIN_DWORDS:
+ case SystemZISD::SELECT_CCMASK:
+ computeKnownBitsBinOp(Op, Known, DemandedElts, DAG, Depth, 0);
+ break;
+ case SystemZISD::REPLICATE: {
+ SDValue SrcOp = Op.getOperand(0);
+ DAG.computeKnownBits(SrcOp, Known, Depth + 1);
+ if (Known.getBitWidth() < BitWidth && isa<ConstantSDNode>(SrcOp))
+      Known = Known.sext(BitWidth); // VREPI sign extends the immediate.
+ break;
+ }
+ default:
+ break;
+ }
+ }
+
+ // Known has the width of the source operand(s). Adjust if needed to match
+ // the passed bitwidth.
+ if (Known.getBitWidth() != BitWidth)
+ Known = Known.zextOrTrunc(BitWidth);
+}
+
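The logical/arithmetic split in the UNPACK handling above comes down to how the narrow known bits are widened: a logical unpack pins every bit above the source width to zero, while an arithmetic unpack replicates the sign. A standalone sketch (plain C++, not LLVM code) of the two extensions on a concrete value:

#include <cassert>
#include <cstdint>

int main() {
  int16_t Narrow = -2;                  // 0xFFFE: sign bit known to be set
  uint32_t Logical = uint16_t(Narrow);  // logical unpack: upper 16 bits known zero
  int32_t Arith = Narrow;               // arithmetic unpack: upper 16 bits copy the sign
  assert(Logical == 0x0000FFFEu);
  assert(uint32_t(Arith) == 0xFFFFFFFEu);
  return 0;
}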
+static unsigned computeNumSignBitsBinOp(SDValue Op, const APInt &DemandedElts,
+ const SelectionDAG &DAG, unsigned Depth,
+ unsigned OpNo) {
+ APInt Src0DemE = getDemandedSrcElements(Op, DemandedElts, OpNo);
+ unsigned LHS = DAG.ComputeNumSignBits(Op.getOperand(OpNo), Src0DemE, Depth + 1);
+ if (LHS == 1) return 1; // Early out.
+ APInt Src1DemE = getDemandedSrcElements(Op, DemandedElts, OpNo + 1);
+ unsigned RHS = DAG.ComputeNumSignBits(Op.getOperand(OpNo + 1), Src1DemE, Depth + 1);
+ if (RHS == 1) return 1; // Early out.
+ unsigned Common = std::min(LHS, RHS);
+ unsigned SrcBitWidth = Op.getOperand(OpNo).getScalarValueSizeInBits();
+ EVT VT = Op.getValueType();
+ unsigned VTBits = VT.getScalarSizeInBits();
+ if (SrcBitWidth > VTBits) { // PACK
+ unsigned SrcExtraBits = SrcBitWidth - VTBits;
+ if (Common > SrcExtraBits)
+ return (Common - SrcExtraBits);
+ return 1;
+ }
+ assert (SrcBitWidth == VTBits && "Expected operands of same bitwidth.");
+ return Common;
+}
+
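The PACK adjustment above ("Common - SrcExtraBits") accounts for the truncation from the wider source elements to the narrower result elements. A standalone sketch (plain C++, not LLVM code) with a worked example:

#include <cassert>

// Sign bits surviving a truncation from SrcBits-wide elements to DstBits-wide
// elements, given Common known sign bits in the sources; never below 1.
unsigned signBitsAfterPack(unsigned Common, unsigned SrcBits, unsigned DstBits) {
  unsigned Extra = SrcBits - DstBits;   // bits dropped by the truncation
  return Common > Extra ? Common - Extra : 1;
}

int main() {
  // 20 sign bits in a 32-bit source element leave 4 in a 16-bit result element.
  assert(signBitsAfterPack(20, 32, 16) == 4);
  // If the truncation eats all of them, only the minimum of 1 can be claimed.
  assert(signBitsAfterPack(10, 32, 16) == 1);
  return 0;
}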
+unsigned
+SystemZTargetLowering::ComputeNumSignBitsForTargetNode(
+ SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
+ unsigned Depth) const {
+ if (Op.getResNo() != 0)
+ return 1;
+ unsigned Opcode = Op.getOpcode();
+ if (Opcode == ISD::INTRINSIC_WO_CHAIN) {
+ unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ switch (Id) {
+ case Intrinsic::s390_vpksh: // PACKS
+ case Intrinsic::s390_vpksf:
+ case Intrinsic::s390_vpksg:
+ case Intrinsic::s390_vpkshs: // PACKS_CC
+ case Intrinsic::s390_vpksfs:
+ case Intrinsic::s390_vpksgs:
+ case Intrinsic::s390_vpklsh: // PACKLS
+ case Intrinsic::s390_vpklsf:
+ case Intrinsic::s390_vpklsg:
+ case Intrinsic::s390_vpklshs: // PACKLS_CC
+ case Intrinsic::s390_vpklsfs:
+ case Intrinsic::s390_vpklsgs:
+ case Intrinsic::s390_vpdi:
+ case Intrinsic::s390_vsldb:
+ case Intrinsic::s390_vperm:
+ return computeNumSignBitsBinOp(Op, DemandedElts, DAG, Depth, 1);
+ case Intrinsic::s390_vuphb: // VECTOR UNPACK HIGH
+ case Intrinsic::s390_vuphh:
+ case Intrinsic::s390_vuphf:
+ case Intrinsic::s390_vuplb: // VECTOR UNPACK LOW
+ case Intrinsic::s390_vuplhw:
+ case Intrinsic::s390_vuplf: {
+ SDValue PackedOp = Op.getOperand(1);
+ APInt SrcDemE = getDemandedSrcElements(Op, DemandedElts, 1);
+ unsigned Tmp = DAG.ComputeNumSignBits(PackedOp, SrcDemE, Depth + 1);
+ EVT VT = Op.getValueType();
+ unsigned VTBits = VT.getScalarSizeInBits();
+ Tmp += VTBits - PackedOp.getScalarValueSizeInBits();
+ return Tmp;
+ }
+ default:
+ break;
+ }
+ } else {
+ switch (Opcode) {
+ case SystemZISD::SELECT_CCMASK:
+ return computeNumSignBitsBinOp(Op, DemandedElts, DAG, Depth, 0);
+ default:
+ break;
+ }
+ }
+
+ return 1;
+}
+
//===----------------------------------------------------------------------===//
// Custom insertion
//===----------------------------------------------------------------------===//
@@ -5546,34 +6103,141 @@ static unsigned forceReg(MachineInstr &MI, MachineOperand &Base,
return Reg;
}
+// The CC operand of MI might be missing a kill marker because there
+// were multiple uses of CC, and ISel didn't know which to mark.
+// Figure out whether MI should have had a kill marker.
+static bool checkCCKill(MachineInstr &MI, MachineBasicBlock *MBB) {
+ // Scan forward through BB for a use/def of CC.
+ MachineBasicBlock::iterator miI(std::next(MachineBasicBlock::iterator(MI)));
+ for (MachineBasicBlock::iterator miE = MBB->end(); miI != miE; ++miI) {
+ const MachineInstr& mi = *miI;
+ if (mi.readsRegister(SystemZ::CC))
+ return false;
+ if (mi.definesRegister(SystemZ::CC))
+ break; // Should have kill-flag - update below.
+ }
+
+ // If we hit the end of the block, check whether CC is live into a
+ // successor.
+ if (miI == MBB->end()) {
+ for (auto SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI)
+ if ((*SI)->isLiveIn(SystemZ::CC))
+ return false;
+ }
+
+ return true;
+}
+
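checkCCKill's forward scan is a standard liveness argument: CC dies at MI unless some later instruction in the block reads it first, or it is live into a successor. A standalone sketch (plain C++ with a toy instruction type, not LLVM code) of the same decision:

#include <cassert>
#include <vector>

struct Instr { bool ReadsCC, DefsCC; };

// Rest is the block contents after MI; LiveIntoSuccessor models the live-in check.
bool ccKilledAt(const std::vector<Instr> &Rest, bool LiveIntoSuccessor) {
  for (const Instr &I : Rest) {
    if (I.ReadsCC) return false;  // a later reader: MI did not kill CC
    if (I.DefsCC)  return true;   // redefined before any read: the value dies at MI
  }
  return !LiveIntoSuccessor;      // fell off the block: killed unless CC is live-out
}

int main() {
  assert(ccKilledAt({{false, true}}, false));
  assert(!ccKilledAt({{true, false}}, false));
  assert(!ccKilledAt({}, true));
  return 0;
}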
+// Return true if it is OK for this Select pseudo-opcode to be cascaded
+// together with other Select pseudo-opcodes into a single basic-block with
+// a conditional jump around it.
+static bool isSelectPseudo(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case SystemZ::Select32:
+ case SystemZ::Select64:
+ case SystemZ::SelectF32:
+ case SystemZ::SelectF64:
+ case SystemZ::SelectF128:
+ case SystemZ::SelectVR32:
+ case SystemZ::SelectVR64:
+ case SystemZ::SelectVR128:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+// Helper function, which inserts PHI functions into SinkMBB:
+// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
+// where %FalseValue(i) and %TrueValue(i) are taken from the consecutive Selects
+// in the [MIItBegin, MIItEnd) range.
+static void createPHIsForSelects(MachineBasicBlock::iterator MIItBegin,
+ MachineBasicBlock::iterator MIItEnd,
+ MachineBasicBlock *TrueMBB,
+ MachineBasicBlock *FalseMBB,
+ MachineBasicBlock *SinkMBB) {
+ MachineFunction *MF = TrueMBB->getParent();
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+
+ unsigned CCValid = MIItBegin->getOperand(3).getImm();
+ unsigned CCMask = MIItBegin->getOperand(4).getImm();
+ DebugLoc DL = MIItBegin->getDebugLoc();
+
+ MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
+
+ // As we are creating the PHIs, we have to be careful if there is more than
+ // one. Later Selects may reference the results of earlier Selects, but later
+ // PHIs have to reference the individual true/false inputs from earlier PHIs.
+  // That also means that PHI construction must work forward from earlier to
+  // later, and that the code must maintain a mapping from each earlier PHI's
+  // destination register to the registers that went into the PHI.
+ DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
+
+ for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
+ unsigned DestReg = MIIt->getOperand(0).getReg();
+ unsigned TrueReg = MIIt->getOperand(1).getReg();
+ unsigned FalseReg = MIIt->getOperand(2).getReg();
+
+    // If this Select uses the condition opposite to the one of the jump we
+    // generated, then we have to swap the operands for the PHI that is going
+    // to be generated.
+ if (MIIt->getOperand(4).getImm() == (CCValid ^ CCMask))
+ std::swap(TrueReg, FalseReg);
+
+ if (RegRewriteTable.find(TrueReg) != RegRewriteTable.end())
+ TrueReg = RegRewriteTable[TrueReg].first;
+
+ if (RegRewriteTable.find(FalseReg) != RegRewriteTable.end())
+ FalseReg = RegRewriteTable[FalseReg].second;
+
+ BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(SystemZ::PHI), DestReg)
+ .addReg(TrueReg).addMBB(TrueMBB)
+ .addReg(FalseReg).addMBB(FalseMBB);
+
+ // Add this PHI to the rewrite table.
+ RegRewriteTable[DestReg] = std::make_pair(TrueReg, FalseReg);
+ }
+}
+
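The rewrite table above is what keeps cascaded selects correct once they all share one diamond: a later select that consumed an earlier select's result must have its PHI read the earlier select's raw true/false input for the corresponding edge, not the earlier PHI's result. A standalone sketch (plain C++ with ints standing in for virtual registers, not LLVM code):

#include <cassert>
#include <map>
#include <utility>

int main() {
  // Select A: %10 = select %1, %2.  Select B: %11 = select %10, %3, where B's
  // "true" input is A's result.
  std::map<int, std::pair<int, int>> RewriteTable;  // dest -> (true input, false input)
  RewriteTable[10] = std::make_pair(1, 2);

  int TrueReg = 10, FalseReg = 3;                    // B's inputs before rewriting
  if (RewriteTable.count(TrueReg))
    TrueReg = RewriteTable[TrueReg].first;           // on the "true" edge, %10 is really %1
  if (RewriteTable.count(FalseReg))
    FalseReg = RewriteTable[FalseReg].second;
  assert(TrueReg == 1 && FalseReg == 3);
  RewriteTable[11] = std::make_pair(TrueReg, FalseReg);
  return 0;
}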
// Implement EmitInstrWithCustomInserter for pseudo Select* instruction MI.
MachineBasicBlock *
SystemZTargetLowering::emitSelect(MachineInstr &MI,
- MachineBasicBlock *MBB,
- unsigned LOCROpcode) const {
+ MachineBasicBlock *MBB) const {
const SystemZInstrInfo *TII =
static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
- unsigned DestReg = MI.getOperand(0).getReg();
- unsigned TrueReg = MI.getOperand(1).getReg();
- unsigned FalseReg = MI.getOperand(2).getReg();
unsigned CCValid = MI.getOperand(3).getImm();
unsigned CCMask = MI.getOperand(4).getImm();
DebugLoc DL = MI.getDebugLoc();
- // Use LOCROpcode if possible.
- if (LOCROpcode && Subtarget.hasLoadStoreOnCond()) {
- BuildMI(*MBB, MI, DL, TII->get(LOCROpcode), DestReg)
- .addReg(FalseReg).addReg(TrueReg)
- .addImm(CCValid).addImm(CCMask);
- MI.eraseFromParent();
- return MBB;
- }
+ // If we have a sequence of Select* pseudo instructions using the
+ // same condition code value, we want to expand all of them into
+ // a single pair of basic blocks using the same condition.
+ MachineInstr *LastMI = &MI;
+ MachineBasicBlock::iterator NextMIIt =
+ std::next(MachineBasicBlock::iterator(MI));
+
+ if (isSelectPseudo(MI))
+ while (NextMIIt != MBB->end() && isSelectPseudo(*NextMIIt) &&
+ NextMIIt->getOperand(3).getImm() == CCValid &&
+ (NextMIIt->getOperand(4).getImm() == CCMask ||
+ NextMIIt->getOperand(4).getImm() == (CCValid ^ CCMask))) {
+ LastMI = &*NextMIIt;
+ ++NextMIIt;
+ }
MachineBasicBlock *StartMBB = MBB;
MachineBasicBlock *JoinMBB = splitBlockBefore(MI, MBB);
MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB);
+ // Unless CC was killed in the last Select instruction, mark it as
+ // live-in to both FalseMBB and JoinMBB.
+ if (!LastMI->killsRegister(SystemZ::CC) && !checkCCKill(*LastMI, JoinMBB)) {
+ FalseMBB->addLiveIn(SystemZ::CC);
+ JoinMBB->addLiveIn(SystemZ::CC);
+ }
+
// StartMBB:
// BRC CCMask, JoinMBB
// # fallthrough to FalseMBB
@@ -5592,11 +6256,12 @@ SystemZTargetLowering::emitSelect(MachineInstr &MI,
// %Result = phi [ %FalseReg, FalseMBB ], [ %TrueReg, StartMBB ]
// ...
MBB = JoinMBB;
- BuildMI(*MBB, MI, DL, TII->get(SystemZ::PHI), DestReg)
- .addReg(TrueReg).addMBB(StartMBB)
- .addReg(FalseReg).addMBB(FalseMBB);
+ MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
+ MachineBasicBlock::iterator MIItEnd =
+ std::next(MachineBasicBlock::iterator(LastMI));
+ createPHIsForSelects(MIItBegin, MIItEnd, StartMBB, FalseMBB, MBB);
- MI.eraseFromParent();
+ StartMBB->erase(MIItBegin, MIItEnd);
return JoinMBB;
}
@@ -5658,6 +6323,13 @@ MachineBasicBlock *SystemZTargetLowering::emitCondStore(MachineInstr &MI,
MachineBasicBlock *JoinMBB = splitBlockBefore(MI, MBB);
MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB);
+ // Unless CC was killed in the CondStore instruction, mark it as
+ // live-in to both FalseMBB and JoinMBB.
+ if (!MI.killsRegister(SystemZ::CC) && !checkCCKill(MI, JoinMBB)) {
+ FalseMBB->addLiveIn(SystemZ::CC);
+ JoinMBB->addLiveIn(SystemZ::CC);
+ }
+
// StartMBB:
// BRC CCMask, JoinMBB
// # fallthrough to FalseMBB
@@ -6223,6 +6895,10 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
DestBase = MachineOperand::CreateReg(NextDestReg, false);
SrcBase = MachineOperand::CreateReg(NextSrcReg, false);
Length &= 255;
+ if (EndMBB && !Length)
+ // If the loop handled the whole CLC range, DoneMBB will be empty with
+ // CC live-through into EndMBB, so add it as live-in.
+ DoneMBB->addLiveIn(SystemZ::CC);
MBB = DoneMBB;
}
// Handle any remaining bytes with straight-line code.
@@ -6415,18 +7091,15 @@ MachineBasicBlock *SystemZTargetLowering::emitLoadAndTestCmp0(
MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *MBB) const {
switch (MI.getOpcode()) {
- case SystemZ::Select32Mux:
- return emitSelect(MI, MBB,
- Subtarget.hasLoadStoreOnCond2()? SystemZ::LOCRMux : 0);
case SystemZ::Select32:
- return emitSelect(MI, MBB, SystemZ::LOCR);
case SystemZ::Select64:
- return emitSelect(MI, MBB, SystemZ::LOCGR);
case SystemZ::SelectF32:
case SystemZ::SelectF64:
case SystemZ::SelectF128:
+ case SystemZ::SelectVR32:
+ case SystemZ::SelectVR64:
case SystemZ::SelectVR128:
- return emitSelect(MI, MBB, 0);
+ return emitSelect(MI, MBB);
case SystemZ::CondStore8Mux:
return emitCondStore(MI, MBB, SystemZ::STCMux, 0, false);
@@ -6675,6 +7348,10 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter(
case SystemZ::LTXBRCompare_VecPseudo:
return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTXBR);
+ case TargetOpcode::STACKMAP:
+ case TargetOpcode::PATCHPOINT:
+ return emitPatchPoint(MI, MBB);
+
default:
llvm_unreachable("Unexpected instr type to insert");
}
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index 2cdc88db5a4d..0ca93a38a016 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -93,6 +93,19 @@ enum NodeType : unsigned {
SDIVREM,
UDIVREM,
+ // Add/subtract with overflow/carry. These have the same operands as
+ // the corresponding standard operations, except with the carry flag
+ // replaced by a condition code value.
+ SADDO, SSUBO, UADDO, USUBO, ADDCARRY, SUBCARRY,
+
+ // Set the condition code from a boolean value in operand 0.
+  // Operand 1 is a mask of all condition-code values that may result from this
+  // operation; operand 2 is a mask of condition-code values that may result
+  // if the boolean is true.
+  // Note that this operation is always optimized away; we will never
+ // generate any code for it.
+ GET_CCMASK,
+
// Use a series of MVCs to copy bytes from one memory location to another.
// The operands are:
// - the target address
@@ -142,11 +155,11 @@ enum NodeType : unsigned {
// Transaction begin. The first operand is the chain, the second
// the TDB pointer, and the third the immediate control field.
- // Returns chain and glue.
+ // Returns CC value and chain.
TBEGIN,
TBEGIN_NOFLOAT,
- // Transaction end. Just the chain operand. Returns chain and glue.
+ // Transaction end. Just the chain operand. Returns CC value and chain.
TEND,
// Create a vector constant by filling byte N of the result with bit
@@ -308,8 +321,8 @@ enum NodeType : unsigned {
// Operand 5: the width of the field in bits (8 or 16)
ATOMIC_CMP_SWAPW,
- // Atomic compare-and-swap returning glue (condition code).
- // Val, OUTCHAIN, glue = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap)
+ // Atomic compare-and-swap returning CC value.
+ // Val, CC, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap)
ATOMIC_CMP_SWAP,
// 128-bit atomic load.
@@ -321,7 +334,7 @@ enum NodeType : unsigned {
ATOMIC_STORE_128,
// 128-bit atomic compare-and-swap.
- // Val, OUTCHAIN, glue = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap)
+ // Val, CC, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap)
ATOMIC_CMP_SWAP_128,
// Byte swapping load.
@@ -470,6 +483,7 @@ public:
SelectionDAG &DAG) const override;
void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
SelectionDAG &DAG) const override;
+ const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
bool allowTruncateForTailCall(Type *, Type *) const override;
bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
@@ -490,6 +504,20 @@ public:
SelectionDAG &DAG) const override;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+ /// Determine which of the bits specified in Mask are known to be either
+ /// zero or one and return them in the KnownZero/KnownOne bitsets.
+ void computeKnownBitsForTargetNode(const SDValue Op,
+ KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const override;
+
+ /// Determine the number of bits in the operation that are sign bits.
+ unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth) const override;
+
ISD::NodeType getExtendForAtomicOps() const override {
return ISD::ANY_EXTEND;
}
@@ -533,6 +561,8 @@ private:
SDValue lowerUMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerXALUO(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
@@ -563,7 +593,9 @@ private:
bool Force) const;
SDValue combineTruncateExtract(const SDLoc &DL, EVT TruncVT, SDValue Op,
DAGCombinerInfo &DCI) const;
+ SDValue combineZERO_EXTEND(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSIGN_EXTEND(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineSIGN_EXTEND_INREG(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineMERGE(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSTORE(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineEXTRACT_VECTOR_ELT(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -571,6 +603,9 @@ private:
SDValue combineFP_ROUND(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineBSWAP(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSHIFTROT(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineBR_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineSELECT_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineGET_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
// If the last instruction before MBBI in MBB was some form of COMPARE,
// try to replace it with a COMPARE AND BRANCH just before MBBI.
@@ -582,8 +617,7 @@ private:
MachineBasicBlock *Target) const;
// Implement EmitInstrWithCustomInserter for individual operation types.
- MachineBasicBlock *emitSelect(MachineInstr &MI, MachineBasicBlock *BB,
- unsigned LOCROpcode) const;
+ MachineBasicBlock *emitSelect(MachineInstr &MI, MachineBasicBlock *BB) const;
MachineBasicBlock *emitCondStore(MachineInstr &MI, MachineBasicBlock *BB,
unsigned StoreOpcode, unsigned STOCOpcode,
bool Invert) const;
diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td
index 16edbea87cda..4e47752ed122 100644
--- a/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/lib/Target/SystemZ/SystemZInstrFP.td
@@ -15,6 +15,10 @@
//===----------------------------------------------------------------------===//
// C's ?: operator for floating-point operands.
+let Predicates = [FeatureVector] in {
+ def SelectVR32 : SelectWrapper<f32, VR32>;
+ def SelectVR64 : SelectWrapper<f64, VR64>;
+}
def SelectF32 : SelectWrapper<f32, FP32>;
def SelectF64 : SelectWrapper<f64, FP64>;
let Predicates = [FeatureNoVectorEnhancements1] in
@@ -65,7 +69,7 @@ let Predicates = [FeatureNoVector] in {
// Use a normal load-and-test for compare against zero in case of
// vector support (via a pseudo to simplify instruction selection).
-let Defs = [CC], usesCustomInserter = 1 in {
+let Defs = [CC], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
def LTEBRCompare_VecPseudo : Pseudo<(outs), (ins FP32:$R1, FP32:$R2), []>;
def LTDBRCompare_VecPseudo : Pseudo<(outs), (ins FP64:$R1, FP64:$R2), []>;
def LTXBRCompare_VecPseudo : Pseudo<(outs), (ins FP128:$R1, FP128:$R2), []>;
diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td
index 06da66ad8764..e3f9a9645d13 100644
--- a/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -2469,7 +2469,7 @@ class StoreVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr, bits<5> bytes, bits<4> type = 0>
: InstVRX<opcode, (outs), (ins tr.op:$V1, bdxaddr12only:$XBD2),
mnemonic#"\t$V1, $XBD2",
- [(set tr.op:$V1, (tr.vt (operator bdxaddr12only:$XBD2)))]> {
+ [(set (tr.vt tr.op:$V1), (operator bdxaddr12only:$XBD2))]> {
let M3 = type;
let mayStore = 1;
let AccessBytes = bytes;
@@ -2844,7 +2844,7 @@ class UnaryVRIa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr, Immediate imm, bits<4> type = 0>
: InstVRIa<opcode, (outs tr.op:$V1), (ins imm:$I2),
mnemonic#"\t$V1, $I2",
- [(set tr.op:$V1, (tr.vt (operator imm:$I2)))]> {
+ [(set (tr.vt tr.op:$V1), (operator imm:$I2))]> {
let M3 = type;
}
@@ -2857,7 +2857,7 @@ class UnaryVRRa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
bits<4> m5 = 0>
: InstVRRa<opcode, (outs tr1.op:$V1), (ins tr2.op:$V2),
mnemonic#"\t$V1, $V2",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2))))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2)))]> {
let M3 = type;
let M4 = m4;
let M5 = m5;
@@ -2913,7 +2913,7 @@ class UnaryVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr, bits<5> bytes, bits<4> type = 0>
: InstVRX<opcode, (outs tr.op:$V1), (ins bdxaddr12only:$XBD2),
mnemonic#"\t$V1, $XBD2",
- [(set tr.op:$V1, (tr.vt (operator bdxaddr12only:$XBD2)))]> {
+ [(set (tr.vt tr.op:$V1), (operator bdxaddr12only:$XBD2))]> {
let M3 = type;
let mayLoad = 1;
let AccessBytes = bytes;
@@ -3132,7 +3132,9 @@ class CondBinaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
RegisterOperand cls2>
: InstRRFc<opcode, (outs cls1:$R1),
(ins cls1:$R1src, cls2:$R2, cond4:$valid, cond4:$M3),
- mnemonic#"$M3\t$R1, $R2", []> {
+ mnemonic#"$M3\t$R1, $R2",
+ [(set cls1:$R1, (z_select_ccmask cls2:$R2, cls1:$R1src,
+ cond4:$valid, cond4:$M3))]> {
let Constraints = "$R1 = $R1src";
let DisableEncoding = "$R1src";
let CCMaskLast = 1;
@@ -3385,7 +3387,7 @@ class BinaryVRIb<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr, bits<4> type>
: InstVRIb<opcode, (outs tr.op:$V1), (ins imm32zx8:$I2, imm32zx8:$I3),
mnemonic#"\t$V1, $I2, $I3",
- [(set tr.op:$V1, (tr.vt (operator imm32zx8:$I2, imm32zx8:$I3)))]> {
+ [(set (tr.vt tr.op:$V1), (operator imm32zx8:$I2, imm32zx8:$I3))]> {
let M4 = type;
}
@@ -3398,8 +3400,8 @@ class BinaryVRIc<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr1, TypedReg tr2, bits<4> type>
: InstVRIc<opcode, (outs tr1.op:$V1), (ins tr2.op:$V3, imm32zx16:$I2),
mnemonic#"\t$V1, $V3, $I2",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V3),
- imm32zx16:$I2)))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V3),
+ imm32zx16:$I2))]> {
let M4 = type;
}
@@ -3412,8 +3414,8 @@ class BinaryVRIe<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr1, TypedReg tr2, bits<4> type, bits<4> m5>
: InstVRIe<opcode, (outs tr1.op:$V1), (ins tr2.op:$V2, imm32zx12:$I3),
mnemonic#"\t$V1, $V2, $I3",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
- imm32zx12:$I3)))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2),
+ imm32zx12:$I3))]> {
let M4 = type;
let M5 = m5;
}
@@ -3432,8 +3434,8 @@ class BinaryVRRa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr1, TypedReg tr2, bits<4> type = 0, bits<4> m4 = 0>
: InstVRRa<opcode, (outs tr1.op:$V1), (ins tr2.op:$V2, imm32zx4:$M5),
mnemonic#"\t$V1, $V2, $M5",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
- imm32zx12:$M5)))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2),
+ imm32zx12:$M5))]> {
let M3 = type;
let M4 = m4;
}
@@ -3448,8 +3450,8 @@ class BinaryVRRb<string mnemonic, bits<16> opcode, SDPatternOperator operator,
bits<4> modifier = 0>
: InstVRRb<opcode, (outs tr1.op:$V1), (ins tr2.op:$V2, tr2.op:$V3),
mnemonic#"\t$V1, $V2, $V3",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
- (tr2.vt tr2.op:$V3))))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2),
+ (tr2.vt tr2.op:$V3)))]> {
let M4 = type;
let M5 = modifier;
}
@@ -3507,8 +3509,8 @@ class BinaryVRRc<string mnemonic, bits<16> opcode, SDPatternOperator operator,
bits<4> m6 = 0>
: InstVRRc<opcode, (outs tr1.op:$V1), (ins tr2.op:$V2, tr2.op:$V3),
mnemonic#"\t$V1, $V2, $V3",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
- (tr2.vt tr2.op:$V3))))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2),
+ (tr2.vt tr2.op:$V3)))]> {
let M4 = type;
let M5 = m5;
let M6 = m6;
@@ -3554,7 +3556,7 @@ class BinaryVRRf<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr>
: InstVRRf<opcode, (outs tr.op:$V1), (ins GR64:$R2, GR64:$R3),
mnemonic#"\t$V1, $R2, $R3",
- [(set tr.op:$V1, (tr.vt (operator GR64:$R2, GR64:$R3)))]>;
+ [(set (tr.vt tr.op:$V1), (operator GR64:$R2, GR64:$R3))]>;
class BinaryVRRi<string mnemonic, bits<16> opcode, RegisterOperand cls>
: InstVRRi<opcode, (outs cls:$R1), (ins VR128:$V2, imm32zx4:$M3),
@@ -3564,8 +3566,8 @@ class BinaryVRSa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr1, TypedReg tr2, bits<4> type>
: InstVRSa<opcode, (outs tr1.op:$V1), (ins tr2.op:$V3, shift12only:$BD2),
mnemonic#"\t$V1, $V3, $BD2",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V3),
- shift12only:$BD2)))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V3),
+ shift12only:$BD2))]> {
let M4 = type;
}
@@ -3610,8 +3612,8 @@ class BinaryVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr, bits<5> bytes>
: InstVRX<opcode, (outs VR128:$V1), (ins bdxaddr12only:$XBD2, imm32zx4:$M3),
mnemonic#"\t$V1, $XBD2, $M3",
- [(set tr.op:$V1, (tr.vt (operator bdxaddr12only:$XBD2,
- imm32zx4:$M3)))]> {
+ [(set (tr.vt tr.op:$V1), (operator bdxaddr12only:$XBD2,
+ imm32zx4:$M3))]> {
let mayLoad = 1;
let AccessBytes = bytes;
}
@@ -3688,7 +3690,7 @@ class CompareRR<string mnemonic, bits<8> opcode, SDPatternOperator operator,
RegisterOperand cls1, RegisterOperand cls2>
: InstRR<opcode, (outs), (ins cls1:$R1, cls2:$R2),
mnemonic#"\t$R1, $R2",
- [(operator cls1:$R1, cls2:$R2)]> {
+ [(set CC, (operator cls1:$R1, cls2:$R2))]> {
let OpKey = mnemonic#cls1;
let OpType = "reg";
let isCompare = 1;
@@ -3698,7 +3700,7 @@ class CompareRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
RegisterOperand cls1, RegisterOperand cls2>
: InstRRE<opcode, (outs), (ins cls1:$R1, cls2:$R2),
mnemonic#"\t$R1, $R2",
- [(operator cls1:$R1, cls2:$R2)]> {
+ [(set CC, (operator cls1:$R1, cls2:$R2))]> {
let OpKey = mnemonic#cls1;
let OpType = "reg";
let isCompare = 1;
@@ -3708,7 +3710,7 @@ class CompareRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
RegisterOperand cls, Immediate imm>
: InstRIa<opcode, (outs), (ins cls:$R1, imm:$I2),
mnemonic#"\t$R1, $I2",
- [(operator cls:$R1, imm:$I2)]> {
+ [(set CC, (operator cls:$R1, imm:$I2))]> {
let isCompare = 1;
}
@@ -3716,7 +3718,7 @@ class CompareRIL<string mnemonic, bits<12> opcode, SDPatternOperator operator,
RegisterOperand cls, Immediate imm>
: InstRILa<opcode, (outs), (ins cls:$R1, imm:$I2),
mnemonic#"\t$R1, $I2",
- [(operator cls:$R1, imm:$I2)]> {
+ [(set CC, (operator cls:$R1, imm:$I2))]> {
let isCompare = 1;
}
@@ -3724,7 +3726,7 @@ class CompareRILPC<string mnemonic, bits<12> opcode, SDPatternOperator operator,
RegisterOperand cls, SDPatternOperator load>
: InstRILb<opcode, (outs), (ins cls:$R1, pcrel32:$RI2),
mnemonic#"\t$R1, $RI2",
- [(operator cls:$R1, (load pcrel32:$RI2))]> {
+ [(set CC, (operator cls:$R1, (load pcrel32:$RI2)))]> {
let isCompare = 1;
let mayLoad = 1;
// We want PC-relative addresses to be tried ahead of BD and BDX addresses.
@@ -3738,7 +3740,7 @@ class CompareRX<string mnemonic, bits<8> opcode, SDPatternOperator operator,
AddressingMode mode = bdxaddr12only>
: InstRXa<opcode, (outs), (ins cls:$R1, mode:$XBD2),
mnemonic#"\t$R1, $XBD2",
- [(operator cls:$R1, (load mode:$XBD2))]> {
+ [(set CC, (operator cls:$R1, (load mode:$XBD2)))]> {
let OpKey = mnemonic#"r"#cls;
let OpType = "mem";
let isCompare = 1;
@@ -3750,7 +3752,7 @@ class CompareRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
RegisterOperand cls, SDPatternOperator load, bits<5> bytes>
: InstRXE<opcode, (outs), (ins cls:$R1, bdxaddr12only:$XBD2),
mnemonic#"\t$R1, $XBD2",
- [(operator cls:$R1, (load bdxaddr12only:$XBD2))]> {
+ [(set CC, (operator cls:$R1, (load bdxaddr12only:$XBD2)))]> {
let OpKey = mnemonic#"r"#cls;
let OpType = "mem";
let isCompare = 1;
@@ -3764,7 +3766,7 @@ class CompareRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
AddressingMode mode = bdxaddr20only>
: InstRXYa<opcode, (outs), (ins cls:$R1, mode:$XBD2),
mnemonic#"\t$R1, $XBD2",
- [(operator cls:$R1, (load mode:$XBD2))]> {
+ [(set CC, (operator cls:$R1, (load mode:$XBD2)))]> {
let OpKey = mnemonic#"r"#cls;
let OpType = "mem";
let isCompare = 1;
@@ -3824,7 +3826,7 @@ class CompareSI<string mnemonic, bits<8> opcode, SDPatternOperator operator,
AddressingMode mode = bdaddr12only>
: InstSI<opcode, (outs), (ins mode:$BD1, imm:$I2),
mnemonic#"\t$BD1, $I2",
- [(operator (load mode:$BD1), imm:$I2)]> {
+ [(set CC, (operator (load mode:$BD1), imm:$I2))]> {
let isCompare = 1;
let mayLoad = 1;
}
@@ -3833,7 +3835,7 @@ class CompareSIL<string mnemonic, bits<16> opcode, SDPatternOperator operator,
SDPatternOperator load, Immediate imm>
: InstSIL<opcode, (outs), (ins bdaddr12only:$BD1, imm:$I2),
mnemonic#"\t$BD1, $I2",
- [(operator (load bdaddr12only:$BD1), imm:$I2)]> {
+ [(set CC, (operator (load bdaddr12only:$BD1), imm:$I2))]> {
let isCompare = 1;
let mayLoad = 1;
}
@@ -3843,7 +3845,7 @@ class CompareSIY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
AddressingMode mode = bdaddr20only>
: InstSIY<opcode, (outs), (ins mode:$BD1, imm:$I2),
mnemonic#"\t$BD1, $I2",
- [(operator (load mode:$BD1), imm:$I2)]> {
+ [(set CC, (operator (load mode:$BD1), imm:$I2))]> {
let isCompare = 1;
let mayLoad = 1;
}
@@ -3864,7 +3866,7 @@ class CompareVRRa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr, bits<4> type>
: InstVRRa<opcode, (outs), (ins tr.op:$V1, tr.op:$V2),
mnemonic#"\t$V1, $V2",
- [(operator (tr.vt tr.op:$V1), (tr.vt tr.op:$V2))]> {
+ [(set CC, (operator (tr.vt tr.op:$V1), (tr.vt tr.op:$V2)))]> {
let isCompare = 1;
let M3 = type;
let M4 = 0;
@@ -3893,14 +3895,26 @@ class CompareVRRh<string mnemonic, bits<16> opcode>
let isCompare = 1;
}
+class TestInherentS<string mnemonic, bits<16> opcode,
+ SDPatternOperator operator>
+ : InstS<opcode, (outs), (ins), mnemonic, [(set CC, (operator))]> {
+ let BD2 = 0;
+}
+
class TestRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
RegisterOperand cls>
: InstRXE<opcode, (outs), (ins cls:$R1, bdxaddr12only:$XBD2),
mnemonic#"\t$R1, $XBD2",
- [(operator cls:$R1, bdxaddr12only:$XBD2)]> {
+ [(set CC, (operator cls:$R1, bdxaddr12only:$XBD2))]> {
let M3 = 0;
}
+class TestBinarySIL<string mnemonic, bits<16> opcode,
+ SDPatternOperator operator, Immediate imm>
+ : InstSIL<opcode, (outs), (ins bdaddr12only:$BD1, imm:$I2),
+ mnemonic#"\t$BD1, $I2",
+ [(set CC, (operator bdaddr12only:$BD1, imm:$I2))]>;
+
class TestRSL<string mnemonic, bits<16> opcode>
: InstRSLa<opcode, (outs), (ins bdladdr12onlylen4:$BDL1),
mnemonic#"\t$BDL1", []> {
@@ -4097,8 +4111,8 @@ class TernaryVRIa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr1, TypedReg tr2, Immediate imm, Immediate index>
: InstVRIa<opcode, (outs tr1.op:$V1), (ins tr2.op:$V1src, imm:$I2, index:$M3),
mnemonic#"\t$V1, $I2, $M3",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V1src),
- imm:$I2, index:$M3)))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V1src),
+ imm:$I2, index:$M3))]> {
let Constraints = "$V1 = $V1src";
let DisableEncoding = "$V1src";
}
@@ -4108,9 +4122,9 @@ class TernaryVRId<string mnemonic, bits<16> opcode, SDPatternOperator operator,
: InstVRId<opcode, (outs tr1.op:$V1),
(ins tr2.op:$V2, tr2.op:$V3, imm32zx8:$I4),
mnemonic#"\t$V1, $V2, $V3, $I4",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
- (tr2.vt tr2.op:$V3),
- imm32zx8:$I4)))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2),
+ (tr2.vt tr2.op:$V3),
+ imm32zx8:$I4))]> {
let M5 = type;
}
@@ -4124,9 +4138,9 @@ class TernaryVRRa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
: InstVRRa<opcode, (outs tr1.op:$V1),
(ins tr2.op:$V2, imm32zx4:$M4, imm32zx4:$M5),
mnemonic#"\t$V1, $V2, $M4, $M5",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
- imm32zx4:$M4,
- imm32zx4:$M5)))],
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2),
+ imm32zx4:$M4,
+ imm32zx4:$M5))],
m4or> {
let M3 = type;
}
@@ -4142,9 +4156,9 @@ class TernaryVRRb<string mnemonic, bits<16> opcode, SDPatternOperator operator,
: InstVRRb<opcode, (outs tr1.op:$V1),
(ins tr2.op:$V2, tr2.op:$V3, m5mask:$M5),
mnemonic#"\t$V1, $V2, $V3, $M5",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
- (tr2.vt tr2.op:$V3),
- m5mask:$M5)))],
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2),
+ (tr2.vt tr2.op:$V3),
+ m5mask:$M5))],
m5or> {
let M4 = type;
}
@@ -4184,9 +4198,9 @@ class TernaryVRRc<string mnemonic, bits<16> opcode, SDPatternOperator operator,
: InstVRRc<opcode, (outs tr1.op:$V1),
(ins tr2.op:$V2, tr2.op:$V3, imm32zx4:$M4),
mnemonic#"\t$V1, $V2, $V3, $M4",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
- (tr2.vt tr2.op:$V3),
- imm32zx4:$M4)))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2),
+ (tr2.vt tr2.op:$V3),
+ imm32zx4:$M4))]> {
let M5 = 0;
let M6 = 0;
}
@@ -4197,9 +4211,9 @@ class TernaryVRRcFloat<string mnemonic, bits<16> opcode,
: InstVRRc<opcode, (outs tr1.op:$V1),
(ins tr2.op:$V2, tr2.op:$V3, imm32zx4:$M6),
mnemonic#"\t$V1, $V2, $V3, $M6",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
- (tr2.vt tr2.op:$V3),
- imm32zx4:$M6)))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2),
+ (tr2.vt tr2.op:$V3),
+ imm32zx4:$M6))]> {
let M4 = type;
let M5 = m5;
}
@@ -4215,9 +4229,9 @@ class TernaryVRRd<string mnemonic, bits<16> opcode, SDPatternOperator operator,
: InstVRRd<opcode, (outs tr1.op:$V1),
(ins tr2.op:$V2, tr2.op:$V3, tr1.op:$V4),
mnemonic#"\t$V1, $V2, $V3, $V4",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
- (tr2.vt tr2.op:$V3),
- (tr1.vt tr1.op:$V4))))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2),
+ (tr2.vt tr2.op:$V3),
+ (tr1.vt tr1.op:$V4)))]> {
let M5 = type;
let M6 = 0;
}
@@ -4234,9 +4248,9 @@ class TernaryVRRe<string mnemonic, bits<16> opcode, SDPatternOperator operator,
: InstVRRe<opcode, (outs tr1.op:$V1),
(ins tr2.op:$V2, tr2.op:$V3, tr1.op:$V4),
mnemonic#"\t$V1, $V2, $V3, $V4",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
- (tr2.vt tr2.op:$V3),
- (tr1.vt tr1.op:$V4))))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2),
+ (tr2.vt tr2.op:$V3),
+ (tr1.vt tr1.op:$V4)))]> {
let M5 = m5;
let M6 = type;
}
@@ -4251,9 +4265,9 @@ class TernaryVRSb<string mnemonic, bits<16> opcode, SDPatternOperator operator,
: InstVRSb<opcode, (outs tr1.op:$V1),
(ins tr2.op:$V1src, cls:$R3, shift12only:$BD2),
mnemonic#"\t$V1, $R3, $BD2",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V1src),
- cls:$R3,
- shift12only:$BD2)))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V1src),
+ cls:$R3,
+ shift12only:$BD2))]> {
let Constraints = "$V1 = $V1src";
let DisableEncoding = "$V1src";
let M4 = type;
@@ -4283,9 +4297,9 @@ class TernaryVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator,
: InstVRX<opcode, (outs tr1.op:$V1),
(ins tr2.op:$V1src, bdxaddr12only:$XBD2, index:$M3),
mnemonic#"\t$V1, $XBD2, $M3",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V1src),
- bdxaddr12only:$XBD2,
- index:$M3)))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V1src),
+ bdxaddr12only:$XBD2,
+ index:$M3))]> {
let Constraints = "$V1 = $V1src";
let DisableEncoding = "$V1src";
let mayLoad = 1;
@@ -4297,10 +4311,10 @@ class QuaternaryVRId<string mnemonic, bits<16> opcode, SDPatternOperator operato
: InstVRId<opcode, (outs tr1.op:$V1),
(ins tr2.op:$V1src, tr2.op:$V2, tr2.op:$V3, imm32zx8:$I4),
mnemonic#"\t$V1, $V2, $V3, $I4",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V1src),
- (tr2.vt tr2.op:$V2),
- (tr2.vt tr2.op:$V3),
- imm32zx8:$I4)))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V1src),
+ (tr2.vt tr2.op:$V2),
+ (tr2.vt tr2.op:$V3),
+ imm32zx8:$I4))]> {
let Constraints = "$V1 = $V1src";
let DisableEncoding = "$V1src";
let M5 = type;
@@ -4334,10 +4348,10 @@ class QuaternaryVRRd<string mnemonic, bits<16> opcode,
: InstVRRd<opcode, (outs tr1.op:$V1),
(ins tr2.op:$V2, tr3.op:$V3, tr4.op:$V4, m6mask:$M6),
mnemonic#"\t$V1, $V2, $V3, $V4, $M6",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
- (tr3.vt tr3.op:$V3),
- (tr4.vt tr4.op:$V4),
- m6mask:$M6)))],
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2),
+ (tr3.vt tr3.op:$V3),
+ (tr4.vt tr4.op:$V4),
+ m6mask:$M6))],
m6or> {
let M5 = type;
}
@@ -4527,11 +4541,6 @@ class Pseudo<dag outs, dag ins, list<dag> pattern>
let isCodeGenOnly = 1;
}
-// Like SideEffectBinarySIL, but expanded later.
-class SideEffectBinarySILPseudo<SDPatternOperator operator, Immediate imm>
- : Pseudo<(outs), (ins bdaddr12only:$BD1, imm:$I2),
- [(operator bdaddr12only:$BD1, imm:$I2)]>;
-
// Like UnaryRI, but expanded after RA depending on the choice of register.
class UnaryRIPseudo<SDPatternOperator operator, RegisterOperand cls,
Immediate imm>
@@ -4591,7 +4600,8 @@ multiclass BinaryRIAndKPseudo<string key, SDPatternOperator operator,
// Like CompareRI, but expanded after RA depending on the choice of register.
class CompareRIPseudo<SDPatternOperator operator, RegisterOperand cls,
Immediate imm>
- : Pseudo<(outs), (ins cls:$R1, imm:$I2), [(operator cls:$R1, imm:$I2)]> {
+ : Pseudo<(outs), (ins cls:$R1, imm:$I2),
+ [(set CC, (operator cls:$R1, imm:$I2))]> {
let isCompare = 1;
}
@@ -4600,18 +4610,25 @@ class CompareRXYPseudo<SDPatternOperator operator, RegisterOperand cls,
SDPatternOperator load, bits<5> bytes,
AddressingMode mode = bdxaddr20only>
: Pseudo<(outs), (ins cls:$R1, mode:$XBD2),
- [(operator cls:$R1, (load mode:$XBD2))]> {
+ [(set CC, (operator cls:$R1, (load mode:$XBD2)))]> {
let mayLoad = 1;
let Has20BitOffset = 1;
let HasIndex = 1;
let AccessBytes = bytes;
}
+// Like TestBinarySIL, but expanded later.
+class TestBinarySILPseudo<SDPatternOperator operator, Immediate imm>
+ : Pseudo<(outs), (ins bdaddr12only:$BD1, imm:$I2),
+ [(set CC, (operator bdaddr12only:$BD1, imm:$I2))]>;
+
// Like CondBinaryRRF, but expanded after RA depending on the choice of
// register.
class CondBinaryRRFPseudo<RegisterOperand cls1, RegisterOperand cls2>
: Pseudo<(outs cls1:$R1),
- (ins cls1:$R1src, cls2:$R2, cond4:$valid, cond4:$M3), []> {
+ (ins cls1:$R1src, cls2:$R2, cond4:$valid, cond4:$M3),
+ [(set cls1:$R1, (z_select_ccmask cls2:$R2, cls1:$R1src,
+ cond4:$valid, cond4:$M3))]> {
let Constraints = "$R1 = $R1src";
let DisableEncoding = "$R1src";
let CCMaskLast = 1;
@@ -4685,17 +4702,14 @@ class SelectWrapper<ValueType vt, RegisterOperand cls>
[(set (vt cls:$dst), (z_select_ccmask cls:$src1, cls:$src2,
imm32zx4:$valid, imm32zx4:$cc))]> {
let usesCustomInserter = 1;
- // Although the instructions used by these nodes do not in themselves
- // change CC, the insertion requires new blocks, and CC cannot be live
- // across them.
- let Defs = [CC];
+ let hasNoSchedulingInfo = 1;
let Uses = [CC];
}
// Stores $new to $addr if $cc is true ("" case) or false (Inv case).
multiclass CondStores<RegisterOperand cls, SDPatternOperator store,
SDPatternOperator load, AddressingMode mode> {
- let Defs = [CC], Uses = [CC], usesCustomInserter = 1,
+ let Uses = [CC], usesCustomInserter = 1, hasNoSchedulingInfo = 1,
mayLoad = 1, mayStore = 1 in {
def "" : Pseudo<(outs),
(ins cls:$new, mode:$addr, imm32zx4:$valid, imm32zx4:$cc),
@@ -4765,7 +4779,7 @@ class AtomicLoadWBinaryImm<SDPatternOperator operator, Immediate imm>
multiclass MemorySS<string mnemonic, bits<8> opcode,
SDPatternOperator sequence, SDPatternOperator loop> {
def "" : SideEffectBinarySSa<mnemonic, opcode>;
- let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
+ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Defs = [CC] in {
def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
imm64:$length),
[(sequence bdaddr12only:$dest, bdaddr12only:$src,
@@ -4777,6 +4791,22 @@ multiclass MemorySS<string mnemonic, bits<8> opcode,
}
}
+// The same, but setting a CC result as a comparison operator.
+multiclass CompareMemorySS<string mnemonic, bits<8> opcode,
+ SDPatternOperator sequence, SDPatternOperator loop> {
+ def "" : SideEffectBinarySSa<mnemonic, opcode>;
+ let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
+ def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length),
+ [(set CC, (sequence bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length))]>;
+ def Loop : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length, GR64:$count256),
+ [(set CC, (loop bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length, GR64:$count256))]>;
+ }
+}
+
// Define an instruction that operates on two strings, both terminated
// by the character in R0. The instruction processes a CPU-determined
// number of bytes at a time and sets CC to 3 if the instruction needs
@@ -4809,13 +4839,13 @@ class UnaryAliasVRS<RegisterOperand cls1, RegisterOperand cls2>
// An alias of a UnaryVRR*, but with different register sizes.
class UnaryAliasVRR<SDPatternOperator operator, TypedReg tr1, TypedReg tr2>
: Alias<6, (outs tr1.op:$V1), (ins tr2.op:$V2),
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2))))]>;
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2)))]>;
// An alias of a UnaryVRX, but with different register sizes.
class UnaryAliasVRX<SDPatternOperator operator, TypedReg tr,
AddressingMode mode = bdxaddr12only>
: Alias<6, (outs tr.op:$V1), (ins mode:$XBD2),
- [(set tr.op:$V1, (tr.vt (operator mode:$XBD2)))]>;
+ [(set (tr.vt tr.op:$V1), (operator mode:$XBD2))]>;
// An alias of a StoreVRX, but with different register sizes.
class StoreAliasVRX<SDPatternOperator operator, TypedReg tr,
@@ -4846,7 +4876,8 @@ class BinaryAliasVRRf<RegisterOperand cls>
// An alias of a CompareRI, but with different register sizes.
class CompareAliasRI<SDPatternOperator operator, RegisterOperand cls,
Immediate imm>
- : Alias<4, (outs), (ins cls:$R1, imm:$I2), [(operator cls:$R1, imm:$I2)]> {
+ : Alias<4, (outs), (ins cls:$R1, imm:$I2),
+ [(set CC, (operator cls:$R1, imm:$I2))]> {
let isCompare = 1;
}
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 572446c1aa12..f0f9211efd5d 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -389,7 +389,7 @@ bool SystemZInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I = MBB.end();
while (I != MBB.begin()) {
--I;
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
// Working from the bottom, when we see a non-terminator instruction, we're
@@ -479,7 +479,7 @@ unsigned SystemZInstrInfo::removeBranch(MachineBasicBlock &MBB,
while (I != MBB.begin()) {
--I;
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
if (!I->isBranch())
break;
@@ -906,6 +906,23 @@ void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
+ // Move CC value from/to a GR32.
+ if (SrcReg == SystemZ::CC) {
+ auto MIB = BuildMI(MBB, MBBI, DL, get(SystemZ::IPM), DestReg);
+ if (KillSrc) {
+ const MachineFunction *MF = MBB.getParent();
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+ MIB->addRegisterKilled(SrcReg, TRI);
+ }
+ return;
+ }
+ if (DestReg == SystemZ::CC) {
+ BuildMI(MBB, MBBI, DL, get(SystemZ::TMLH))
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .addImm(3 << (SystemZ::IPM_CC - 16));
+ return;
+ }
+
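The TMLH immediate in the DestReg == CC case is worth unpacking. Assuming, as the SrcReg == CC case implies, that IPM deposits the CC value at bit position SystemZ::IPM_CC (28) of the GR32, the immediate 3 << (IPM_CC - 16) selects those same two bits within the high halfword that TMLH tests; testing exactly two adjacent bits then reproduces their value as a CC of 0-3 (this last property of TEST UNDER MASK is asserted here, not shown by the patch). A standalone sketch (plain C++, not LLVM code) of the shift arithmetic only:

#include <cassert>

int main() {
  const unsigned IPM_CC = 28;          // assumed bit position of CC after IPM
  unsigned Imm = 3u << (IPM_CC - 16);  // mask within the high halfword tested by TMLH
  assert(Imm == 0x3000u);              // selects bits 28-29 of the GR32
  return 0;
}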
// Everything else needs only one instruction.
unsigned Opcode;
if (SystemZ::GR64BitRegClass.contains(DestReg, SrcReg))
@@ -1174,6 +1191,36 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
return BuiltMI;
}
+ if ((Opcode == SystemZ::ALFI && OpNum == 0 &&
+ isInt<8>((int32_t)MI.getOperand(2).getImm())) ||
+ (Opcode == SystemZ::ALGFI && OpNum == 0 &&
+ isInt<8>((int64_t)MI.getOperand(2).getImm()))) {
+ // AL(G)FI %reg, CONST -> AL(G)SI %mem, CONST
+ Opcode = (Opcode == SystemZ::ALFI ? SystemZ::ALSI : SystemZ::ALGSI);
+ MachineInstr *BuiltMI =
+ BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(), get(Opcode))
+ .addFrameIndex(FrameIndex)
+ .addImm(0)
+ .addImm((int8_t)MI.getOperand(2).getImm());
+ transferDeadCC(&MI, BuiltMI);
+ return BuiltMI;
+ }
+
+ if ((Opcode == SystemZ::SLFI && OpNum == 0 &&
+ isInt<8>((int32_t)-MI.getOperand(2).getImm())) ||
+ (Opcode == SystemZ::SLGFI && OpNum == 0 &&
+ isInt<8>((int64_t)-MI.getOperand(2).getImm()))) {
+ // SL(G)FI %reg, CONST -> AL(G)SI %mem, -CONST
+ Opcode = (Opcode == SystemZ::SLFI ? SystemZ::ALSI : SystemZ::ALGSI);
+ MachineInstr *BuiltMI =
+ BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(), get(Opcode))
+ .addFrameIndex(FrameIndex)
+ .addImm(0)
+ .addImm((int8_t)-MI.getOperand(2).getImm());
+ transferDeadCC(&MI, BuiltMI);
+ return BuiltMI;
+ }
+
if (Opcode == SystemZ::LGDR || Opcode == SystemZ::LDGR) {
bool Op0IsGPR = (Opcode == SystemZ::LGDR);
bool Op1IsGPR = (Opcode == SystemZ::LDGR);
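
To make the AL(G)FI/SL(G)FI folding above concrete: the fold is only legal when the (negated, for the subtract forms) immediate fits the signed 8-bit field of AL(G)SI, which is what the isInt<8> checks test. A minimal stand-alone sketch of that range check, with illustrative values (this is not the actual foldMemoryOperandImpl code):

    #include <cstdint>
    #include <cstdio>

    // AL(G)SI adds a signed 8-bit immediate directly to memory, so an
    // AL(G)FI/SL(G)FI against a spilled operand can be folded only if the
    // immediate (negated for the subtract forms) lies in [-128, 127].
    static bool fitsSigned8(int64_t V) { return V >= -128 && V <= 127; }

    static bool canFoldToALSI(bool IsSubtract, int64_t Imm) {
      int64_t Folded = IsSubtract ? -Imm : Imm;
      return fitsSigned8(Folded);
    }

    int main() {
      std::printf("%d\n", canFoldToALSI(false, 1));   // ALFI ...,1   -> ALSI ...,1
      std::printf("%d\n", canFoldToALSI(true, 128));  // SLFI ...,128 -> ALSI ...,-128
      std::printf("%d\n", canFoldToALSI(false, 300)); // out of range, no fold
      return 0;
    }
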
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td
index abb804597f4e..9d7312269957 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -325,9 +325,10 @@ let isReturn = 1, isTerminator = 1, hasCtrlDep = 1 in {
// Select instructions
//===----------------------------------------------------------------------===//
-def Select32Mux : SelectWrapper<i32, GRX32>, Requires<[FeatureHighWord]>;
-def Select32 : SelectWrapper<i32, GR32>;
-def Select64 : SelectWrapper<i64, GR64>;
+def Select32 : SelectWrapper<i32, GR32>,
+ Requires<[FeatureNoLoadStoreOnCond]>;
+def Select64 : SelectWrapper<i64, GR64>,
+ Requires<[FeatureNoLoadStoreOnCond]>;
// We don't define 32-bit Mux stores if we don't have STOCFH, because the
// low-only STOC should then always be used if possible.
@@ -495,7 +496,7 @@ let Predicates = [FeatureLoadStoreOnCond2], Uses = [CC] in {
defm LOCHI : CondBinaryRIEPair<"lochi", 0xEC42, GR32, imm32sx16>;
defm LOCGHI : CondBinaryRIEPair<"locghi", 0xEC46, GR64, imm64sx16>;
- // Move register on condition. Expanded from Select* pseudos and
+ // Move register on condition. Matched via DAG pattern and
// created by early if-conversion.
let isCommutable = 1 in {
// Expands to LOCR or LOCFHR or a branch-and-move sequence,
@@ -530,7 +531,7 @@ let Predicates = [FeatureLoadStoreOnCond2], Uses = [CC] in {
}
let Predicates = [FeatureLoadStoreOnCond], Uses = [CC] in {
- // Move register on condition. Expanded from Select* pseudos and
+ // Move register on condition. Matched via DAG pattern and
// created by early if-conversion.
let isCommutable = 1 in {
defm LOCR : CondBinaryRRFPair<"locr", 0xB9F2, GR32, GR32>;
@@ -681,7 +682,7 @@ let Predicates = [FeatureLoadAndTrap], hasSideEffects = 1 in {
}
// Extend GR64s to GR128s.
-let usesCustomInserter = 1 in
+let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in
def ZEXT128 : Pseudo<(outs GR128:$dst), (ins GR64:$src), []>;
//===----------------------------------------------------------------------===//
@@ -693,7 +694,7 @@ def : Pat<(i64 (anyext GR32:$src)),
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_l32)>;
// Extend GR64s to GR128s.
-let usesCustomInserter = 1 in
+let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in
def AEXT128 : Pseudo<(outs GR128:$dst), (ins GR64:$src), []>;
//===----------------------------------------------------------------------===//
@@ -890,12 +891,12 @@ def : Pat<(or (zext32 GR32:$src), imm64hf32:$imm),
// Addition
//===----------------------------------------------------------------------===//
-// Plain addition.
+// Addition producing a signed overflow flag.
let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in {
// Addition of a register.
let isCommutable = 1 in {
- defm AR : BinaryRRAndK<"ar", 0x1A, 0xB9F8, add, GR32, GR32>;
- defm AGR : BinaryRREAndK<"agr", 0xB908, 0xB9E8, add, GR64, GR64>;
+ defm AR : BinaryRRAndK<"ar", 0x1A, 0xB9F8, z_sadd, GR32, GR32>;
+ defm AGR : BinaryRREAndK<"agr", 0xB908, 0xB9E8, z_sadd, GR64, GR64>;
}
def AGFR : BinaryRRE<"agfr", 0xB918, null_frag, GR64, GR32>;
@@ -906,38 +907,38 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in {
Requires<[FeatureHighWord]>;
// Addition of signed 16-bit immediates.
- defm AHIMux : BinaryRIAndKPseudo<"ahimux", add, GRX32, imm32sx16>;
- defm AHI : BinaryRIAndK<"ahi", 0xA7A, 0xECD8, add, GR32, imm32sx16>;
- defm AGHI : BinaryRIAndK<"aghi", 0xA7B, 0xECD9, add, GR64, imm64sx16>;
+ defm AHIMux : BinaryRIAndKPseudo<"ahimux", z_sadd, GRX32, imm32sx16>;
+ defm AHI : BinaryRIAndK<"ahi", 0xA7A, 0xECD8, z_sadd, GR32, imm32sx16>;
+ defm AGHI : BinaryRIAndK<"aghi", 0xA7B, 0xECD9, z_sadd, GR64, imm64sx16>;
// Addition of signed 32-bit immediates.
- def AFIMux : BinaryRIPseudo<add, GRX32, simm32>,
+ def AFIMux : BinaryRIPseudo<z_sadd, GRX32, simm32>,
Requires<[FeatureHighWord]>;
- def AFI : BinaryRIL<"afi", 0xC29, add, GR32, simm32>;
- def AIH : BinaryRIL<"aih", 0xCC8, add, GRH32, simm32>,
+ def AFI : BinaryRIL<"afi", 0xC29, z_sadd, GR32, simm32>;
+ def AIH : BinaryRIL<"aih", 0xCC8, z_sadd, GRH32, simm32>,
Requires<[FeatureHighWord]>;
- def AGFI : BinaryRIL<"agfi", 0xC28, add, GR64, imm64sx32>;
+ def AGFI : BinaryRIL<"agfi", 0xC28, z_sadd, GR64, imm64sx32>;
// Addition of memory.
- defm AH : BinaryRXPair<"ah", 0x4A, 0xE37A, add, GR32, asextloadi16, 2>;
- defm A : BinaryRXPair<"a", 0x5A, 0xE35A, add, GR32, load, 4>;
- def AGH : BinaryRXY<"agh", 0xE338, add, GR64, asextloadi16, 2>,
+ defm AH : BinaryRXPair<"ah", 0x4A, 0xE37A, z_sadd, GR32, asextloadi16, 2>;
+ defm A : BinaryRXPair<"a", 0x5A, 0xE35A, z_sadd, GR32, load, 4>;
+ def AGH : BinaryRXY<"agh", 0xE338, z_sadd, GR64, asextloadi16, 2>,
Requires<[FeatureMiscellaneousExtensions2]>;
- def AGF : BinaryRXY<"agf", 0xE318, add, GR64, asextloadi32, 4>;
- def AG : BinaryRXY<"ag", 0xE308, add, GR64, load, 8>;
+ def AGF : BinaryRXY<"agf", 0xE318, z_sadd, GR64, asextloadi32, 4>;
+ def AG : BinaryRXY<"ag", 0xE308, z_sadd, GR64, load, 8>;
// Addition to memory.
def ASI : BinarySIY<"asi", 0xEB6A, add, imm32sx8>;
def AGSI : BinarySIY<"agsi", 0xEB7A, add, imm64sx8>;
}
-defm : SXB<add, GR64, AGFR>;
+defm : SXB<z_sadd, GR64, AGFR>;
// Addition producing a carry.
let Defs = [CC] in {
// Addition of a register.
let isCommutable = 1 in {
- defm ALR : BinaryRRAndK<"alr", 0x1E, 0xB9FA, addc, GR32, GR32>;
- defm ALGR : BinaryRREAndK<"algr", 0xB90A, 0xB9EA, addc, GR64, GR64>;
+ defm ALR : BinaryRRAndK<"alr", 0x1E, 0xB9FA, z_uadd, GR32, GR32>;
+ defm ALGR : BinaryRREAndK<"algr", 0xB90A, 0xB9EA, z_uadd, GR64, GR64>;
}
def ALGFR : BinaryRRE<"algfr", 0xB91A, null_frag, GR64, GR32>;
@@ -948,56 +949,56 @@ let Defs = [CC] in {
Requires<[FeatureHighWord]>;
// Addition of signed 16-bit immediates.
- def ALHSIK : BinaryRIE<"alhsik", 0xECDA, addc, GR32, imm32sx16>,
+ def ALHSIK : BinaryRIE<"alhsik", 0xECDA, z_uadd, GR32, imm32sx16>,
Requires<[FeatureDistinctOps]>;
- def ALGHSIK : BinaryRIE<"alghsik", 0xECDB, addc, GR64, imm64sx16>,
+ def ALGHSIK : BinaryRIE<"alghsik", 0xECDB, z_uadd, GR64, imm64sx16>,
Requires<[FeatureDistinctOps]>;
// Addition of unsigned 32-bit immediates.
- def ALFI : BinaryRIL<"alfi", 0xC2B, addc, GR32, uimm32>;
- def ALGFI : BinaryRIL<"algfi", 0xC2A, addc, GR64, imm64zx32>;
+ def ALFI : BinaryRIL<"alfi", 0xC2B, z_uadd, GR32, uimm32>;
+ def ALGFI : BinaryRIL<"algfi", 0xC2A, z_uadd, GR64, imm64zx32>;
// Addition of signed 32-bit immediates.
def ALSIH : BinaryRIL<"alsih", 0xCCA, null_frag, GRH32, simm32>,
Requires<[FeatureHighWord]>;
// Addition of memory.
- defm AL : BinaryRXPair<"al", 0x5E, 0xE35E, addc, GR32, load, 4>;
- def ALGF : BinaryRXY<"algf", 0xE31A, addc, GR64, azextloadi32, 4>;
- def ALG : BinaryRXY<"alg", 0xE30A, addc, GR64, load, 8>;
+ defm AL : BinaryRXPair<"al", 0x5E, 0xE35E, z_uadd, GR32, load, 4>;
+ def ALGF : BinaryRXY<"algf", 0xE31A, z_uadd, GR64, azextloadi32, 4>;
+ def ALG : BinaryRXY<"alg", 0xE30A, z_uadd, GR64, load, 8>;
// Addition to memory.
def ALSI : BinarySIY<"alsi", 0xEB6E, null_frag, imm32sx8>;
def ALGSI : BinarySIY<"algsi", 0xEB7E, null_frag, imm64sx8>;
}
-defm : ZXB<addc, GR64, ALGFR>;
+defm : ZXB<z_uadd, GR64, ALGFR>;
// Addition producing and using a carry.
let Defs = [CC], Uses = [CC] in {
// Addition of a register.
- def ALCR : BinaryRRE<"alcr", 0xB998, adde, GR32, GR32>;
- def ALCGR : BinaryRRE<"alcgr", 0xB988, adde, GR64, GR64>;
+ def ALCR : BinaryRRE<"alcr", 0xB998, z_addcarry, GR32, GR32>;
+ def ALCGR : BinaryRRE<"alcgr", 0xB988, z_addcarry, GR64, GR64>;
// Addition of memory.
- def ALC : BinaryRXY<"alc", 0xE398, adde, GR32, load, 4>;
- def ALCG : BinaryRXY<"alcg", 0xE388, adde, GR64, load, 8>;
+ def ALC : BinaryRXY<"alc", 0xE398, z_addcarry, GR32, load, 4>;
+ def ALCG : BinaryRXY<"alcg", 0xE388, z_addcarry, GR64, load, 8>;
}
// Addition that does not modify the condition code.
def ALSIHN : BinaryRIL<"alsihn", 0xCCB, null_frag, GRH32, simm32>,
Requires<[FeatureHighWord]>;
+
//===----------------------------------------------------------------------===//
// Subtraction
//===----------------------------------------------------------------------===//
-// Plain subtraction. Although immediate forms exist, we use the
-// add-immediate instruction instead.
+// Subtraction producing a signed overflow flag.
let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in {
// Subtraction of a register.
- defm SR : BinaryRRAndK<"sr", 0x1B, 0xB9F9, sub, GR32, GR32>;
+ defm SR : BinaryRRAndK<"sr", 0x1B, 0xB9F9, z_ssub, GR32, GR32>;
def SGFR : BinaryRRE<"sgfr", 0xB919, null_frag, GR64, GR32>;
- defm SGR : BinaryRREAndK<"sgr", 0xB909, 0xB9E9, sub, GR64, GR64>;
+ defm SGR : BinaryRREAndK<"sgr", 0xB909, 0xB9E9, z_ssub, GR64, GR64>;
// Subtraction from a high register.
def SHHHR : BinaryRRFa<"shhhr", 0xB9C9, null_frag, GRH32, GRH32, GRH32>,
@@ -1006,21 +1007,39 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in {
Requires<[FeatureHighWord]>;
// Subtraction of memory.
- defm SH : BinaryRXPair<"sh", 0x4B, 0xE37B, sub, GR32, asextloadi16, 2>;
- defm S : BinaryRXPair<"s", 0x5B, 0xE35B, sub, GR32, load, 4>;
- def SGH : BinaryRXY<"sgh", 0xE339, sub, GR64, asextloadi16, 2>,
+ defm SH : BinaryRXPair<"sh", 0x4B, 0xE37B, z_ssub, GR32, asextloadi16, 2>;
+ defm S : BinaryRXPair<"s", 0x5B, 0xE35B, z_ssub, GR32, load, 4>;
+ def SGH : BinaryRXY<"sgh", 0xE339, z_ssub, GR64, asextloadi16, 2>,
Requires<[FeatureMiscellaneousExtensions2]>;
- def SGF : BinaryRXY<"sgf", 0xE319, sub, GR64, asextloadi32, 4>;
- def SG : BinaryRXY<"sg", 0xE309, sub, GR64, load, 8>;
+ def SGF : BinaryRXY<"sgf", 0xE319, z_ssub, GR64, asextloadi32, 4>;
+ def SG : BinaryRXY<"sg", 0xE309, z_ssub, GR64, load, 8>;
+}
+defm : SXB<z_ssub, GR64, SGFR>;
+
+// Subtracting an immediate is the same as adding the negated immediate.
+let AddedComplexity = 1 in {
+ def : Pat<(z_ssub GR32:$src1, imm32sx16n:$src2),
+ (AHIMux GR32:$src1, imm32sx16n:$src2)>,
+ Requires<[FeatureHighWord]>;
+ def : Pat<(z_ssub GR32:$src1, simm32n:$src2),
+ (AFIMux GR32:$src1, simm32n:$src2)>,
+ Requires<[FeatureHighWord]>;
+ def : Pat<(z_ssub GR32:$src1, imm32sx16n:$src2),
+ (AHI GR32:$src1, imm32sx16n:$src2)>;
+ def : Pat<(z_ssub GR32:$src1, simm32n:$src2),
+ (AFI GR32:$src1, simm32n:$src2)>;
+ def : Pat<(z_ssub GR64:$src1, imm64sx16n:$src2),
+ (AGHI GR64:$src1, imm64sx16n:$src2)>;
+ def : Pat<(z_ssub GR64:$src1, imm64sx32n:$src2),
+ (AGFI GR64:$src1, imm64sx32n:$src2)>;
}
-defm : SXB<sub, GR64, SGFR>;
// Subtraction producing a carry.
let Defs = [CC] in {
// Subtraction of a register.
- defm SLR : BinaryRRAndK<"slr", 0x1F, 0xB9FB, subc, GR32, GR32>;
+ defm SLR : BinaryRRAndK<"slr", 0x1F, 0xB9FB, z_usub, GR32, GR32>;
def SLGFR : BinaryRRE<"slgfr", 0xB91B, null_frag, GR64, GR32>;
- defm SLGR : BinaryRREAndK<"slgr", 0xB90B, 0xB9EB, subc, GR64, GR64>;
+ defm SLGR : BinaryRREAndK<"slgr", 0xB90B, 0xB9EB, z_usub, GR64, GR64>;
// Subtraction from a high register.
def SLHHHR : BinaryRRFa<"slhhhr", 0xB9CB, null_frag, GRH32, GRH32, GRH32>,
@@ -1028,29 +1047,43 @@ let Defs = [CC] in {
def SLHHLR : BinaryRRFa<"slhhlr", 0xB9DB, null_frag, GRH32, GRH32, GR32>,
Requires<[FeatureHighWord]>;
- // Subtraction of unsigned 32-bit immediates. These don't match
- // subc because we prefer addc for constants.
- def SLFI : BinaryRIL<"slfi", 0xC25, null_frag, GR32, uimm32>;
- def SLGFI : BinaryRIL<"slgfi", 0xC24, null_frag, GR64, imm64zx32>;
+ // Subtraction of unsigned 32-bit immediates.
+ def SLFI : BinaryRIL<"slfi", 0xC25, z_usub, GR32, uimm32>;
+ def SLGFI : BinaryRIL<"slgfi", 0xC24, z_usub, GR64, imm64zx32>;
// Subtraction of memory.
- defm SL : BinaryRXPair<"sl", 0x5F, 0xE35F, subc, GR32, load, 4>;
- def SLGF : BinaryRXY<"slgf", 0xE31B, subc, GR64, azextloadi32, 4>;
- def SLG : BinaryRXY<"slg", 0xE30B, subc, GR64, load, 8>;
+ defm SL : BinaryRXPair<"sl", 0x5F, 0xE35F, z_usub, GR32, load, 4>;
+ def SLGF : BinaryRXY<"slgf", 0xE31B, z_usub, GR64, azextloadi32, 4>;
+ def SLG : BinaryRXY<"slg", 0xE30B, z_usub, GR64, load, 8>;
+}
+defm : ZXB<z_usub, GR64, SLGFR>;
+
+// Subtracting an immediate is the same as adding the negated immediate.
+let AddedComplexity = 1 in {
+ def : Pat<(z_usub GR32:$src1, imm32sx16n:$src2),
+ (ALHSIK GR32:$src1, imm32sx16n:$src2)>,
+ Requires<[FeatureDistinctOps]>;
+ def : Pat<(z_usub GR64:$src1, imm64sx16n:$src2),
+ (ALGHSIK GR64:$src1, imm64sx16n:$src2)>,
+ Requires<[FeatureDistinctOps]>;
}
-defm : ZXB<subc, GR64, SLGFR>;
+
+// And vice versa in one special case (but we prefer addition).
+def : Pat<(add GR64:$src1, imm64zx32n:$src2),
+ (SLGFI GR64:$src1, imm64zx32n:$src2)>;
// Subtraction producing and using a carry.
let Defs = [CC], Uses = [CC] in {
// Subtraction of a register.
- def SLBR : BinaryRRE<"slbr", 0xB999, sube, GR32, GR32>;
- def SLBGR : BinaryRRE<"slbgr", 0xB989, sube, GR64, GR64>;
+ def SLBR : BinaryRRE<"slbr", 0xB999, z_subcarry, GR32, GR32>;
+ def SLBGR : BinaryRRE<"slbgr", 0xB989, z_subcarry, GR64, GR64>;
// Subtraction of memory.
- def SLB : BinaryRXY<"slb", 0xE399, sube, GR32, load, 4>;
- def SLBG : BinaryRXY<"slbg", 0xE389, sube, GR64, load, 8>;
+ def SLB : BinaryRXY<"slb", 0xE399, z_subcarry, GR32, load, 4>;
+ def SLBG : BinaryRXY<"slbg", 0xE389, z_subcarry, GR64, load, 8>;
}
+
//===----------------------------------------------------------------------===//
// AND
//===----------------------------------------------------------------------===//
@@ -1492,7 +1525,7 @@ defm : ZXB<z_ucmp, GR64, CLGFR>;
// Memory-to-memory comparison.
let mayLoad = 1, Defs = [CC] in {
- defm CLC : MemorySS<"clc", 0xD5, z_clc, z_clc_loop>;
+ defm CLC : CompareMemorySS<"clc", 0xD5, z_clc, z_clc_loop>;
def CLCL : SideEffectBinaryMemMemRR<"clcl", 0x0F, GR128, GR128>;
def CLCLE : SideEffectTernaryMemMemRS<"clcle", 0xA9, GR128, GR128>;
def CLCLU : SideEffectTernaryMemMemRSY<"clclu", 0xEB8F, GR128, GR128>;
@@ -1933,15 +1966,16 @@ let isCall = 1, Defs = [CC] in
let hasSideEffects = 1, Predicates = [FeatureTransactionalExecution] in {
// Transaction Begin
let mayStore = 1, usesCustomInserter = 1, Defs = [CC] in {
- def TBEGIN : SideEffectBinarySIL<"tbegin", 0xE560, z_tbegin, imm32zx16>;
- def TBEGIN_nofloat : SideEffectBinarySILPseudo<z_tbegin_nofloat, imm32zx16>;
+ def TBEGIN : TestBinarySIL<"tbegin", 0xE560, z_tbegin, imm32zx16>;
+ let hasNoSchedulingInfo = 1 in
+ def TBEGIN_nofloat : TestBinarySILPseudo<z_tbegin_nofloat, imm32zx16>;
def TBEGINC : SideEffectBinarySIL<"tbeginc", 0xE561,
int_s390_tbeginc, imm32zx16>;
}
// Transaction End
let Defs = [CC] in
- def TEND : SideEffectInherentS<"tend", 0xB2F8, z_tend>;
+ def TEND : TestInherentS<"tend", 0xB2F8, z_tend>;
// Transaction Abort
let isTerminator = 1, isBarrier = 1, mayStore = 1,
@@ -2117,32 +2151,6 @@ let isCodeGenOnly = 1, hasSideEffects = 1 in {
// Peepholes.
//===----------------------------------------------------------------------===//
-// Use AL* for GR64 additions of unsigned 32-bit values.
-defm : ZXB<add, GR64, ALGFR>;
-def : Pat<(add GR64:$src1, imm64zx32:$src2),
- (ALGFI GR64:$src1, imm64zx32:$src2)>;
-def : Pat<(add GR64:$src1, (azextloadi32 bdxaddr20only:$addr)),
- (ALGF GR64:$src1, bdxaddr20only:$addr)>;
-
-// Use SL* for GR64 subtractions of unsigned 32-bit values.
-defm : ZXB<sub, GR64, SLGFR>;
-def : Pat<(add GR64:$src1, imm64zx32n:$src2),
- (SLGFI GR64:$src1, imm64zx32n:$src2)>;
-def : Pat<(sub GR64:$src1, (azextloadi32 bdxaddr20only:$addr)),
- (SLGF GR64:$src1, bdxaddr20only:$addr)>;
-
-// Optimize sign-extended 1/0 selects to -1/0 selects. This is important
-// for vector legalization.
-def : Pat<(sra (shl (i32 (z_select_ccmask 1, 0, imm32zx4:$valid, imm32zx4:$cc)),
- (i32 31)),
- (i32 31)),
- (Select32 (LHI -1), (LHI 0), imm32zx4:$valid, imm32zx4:$cc)>;
-def : Pat<(sra (shl (i64 (anyext (i32 (z_select_ccmask 1, 0, imm32zx4:$valid,
- imm32zx4:$cc)))),
- (i32 63)),
- (i32 63)),
- (Select64 (LGHI -1), (LGHI 0), imm32zx4:$valid, imm32zx4:$cc)>;
-
// Avoid generating 2 XOR instructions. (xor (and x, y), y) is
// equivalent to (and (xor x, -1), y)
def : Pat<(and (xor GR64:$x, (i64 -1)), GR64:$y),
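
For context on the Select32/Select64 requirement added above: when load/store-on-condition is available, a scalar select is matched directly to LOC(G)R from the SELECT_CCMASK pattern, so the custom-inserted pseudo is only needed on older subtargets. A hedged source-level illustration of the kind of code affected (names are made up; actual codegen depends on subtarget and optimization level):

    #include <cstdint>

    // With load/store-on-condition the compiler can keep these branch-free
    // (compare + LOCR/LOCGR); without it, the Select32/Select64 pseudos
    // expand to a compare-branch-move sequence instead.
    int32_t clampToLimit(int32_t value, int32_t limit) {
      return value > limit ? limit : value;
    }

    int64_t pickNonZero(int64_t a, int64_t b) {
      return a != 0 ? a : b;
    }
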
diff --git a/lib/Target/SystemZ/SystemZLongBranch.cpp b/lib/Target/SystemZ/SystemZLongBranch.cpp
index 791f0334e0f1..802962bd4db0 100644
--- a/lib/Target/SystemZ/SystemZLongBranch.cpp
+++ b/lib/Target/SystemZ/SystemZLongBranch.cpp
@@ -295,7 +295,7 @@ uint64_t SystemZLongBranch::initMBBInfo() {
// Add the terminators.
while (MI != End) {
- if (!MI->isDebugValue()) {
+ if (!MI->isDebugInstr()) {
assert(MI->isTerminator() && "Terminator followed by non-terminator");
Terminators.push_back(describeTerminator(*MI));
skipTerminator(Position, Terminators.back(), false);
@@ -312,7 +312,7 @@ uint64_t SystemZLongBranch::initMBBInfo() {
// relaxed if it were placed at address Address.
bool SystemZLongBranch::mustRelaxBranch(const TerminatorInfo &Terminator,
uint64_t Address) {
- if (!Terminator.Branch)
+ if (!Terminator.Branch || Terminator.ExtraRelaxSize == 0)
return false;
const MBBInfo &Target = MBBs[Terminator.TargetBlock];
diff --git a/lib/Target/SystemZ/SystemZMachineScheduler.cpp b/lib/Target/SystemZ/SystemZMachineScheduler.cpp
index 08eb73fc362e..fcbf4c4b5fe4 100644
--- a/lib/Target/SystemZ/SystemZMachineScheduler.cpp
+++ b/lib/Target/SystemZ/SystemZMachineScheduler.cpp
@@ -65,25 +65,29 @@ advanceTo(MachineBasicBlock::iterator NextBegin) {
std::next(LastEmittedMI) : MBB->begin());
for (; I != NextBegin; ++I) {
- if (I->isPosition() || I->isDebugValue())
+ if (I->isPosition() || I->isDebugInstr())
continue;
HazardRec->emitInstruction(&*I);
}
}
+void SystemZPostRASchedStrategy::initialize(ScheduleDAGMI *dag) {
+ LLVM_DEBUG(HazardRec->dumpState(););
+}
+
void SystemZPostRASchedStrategy::enterMBB(MachineBasicBlock *NextMBB) {
assert ((SchedStates.find(NextMBB) == SchedStates.end()) &&
"Entering MBB twice?");
- DEBUG(dbgs() << "+++ Entering " << printMBBReference(*NextMBB));
+ LLVM_DEBUG(dbgs() << "** Entering " << printMBBReference(*NextMBB));
MBB = NextMBB;
+
/// Create a HazardRec for MBB, save it in SchedStates and set HazardRec to
/// point to it.
HazardRec = SchedStates[MBB] = new SystemZHazardRecognizer(TII, &SchedModel);
- DEBUG (const MachineLoop *Loop = MLI->getLoopFor(MBB);
- if(Loop && Loop->getHeader() == MBB)
- dbgs() << " (Loop header)";
- dbgs() << ":\n";);
+ LLVM_DEBUG(const MachineLoop *Loop = MLI->getLoopFor(MBB);
+ if (Loop && Loop->getHeader() == MBB) dbgs() << " (Loop header)";
+ dbgs() << ":\n";);
// Try to take over the state from a single predecessor, if it has been
// scheduled. If this is not possible, we are done.
@@ -93,16 +97,17 @@ void SystemZPostRASchedStrategy::enterMBB(MachineBasicBlock *NextMBB) {
SchedStates.find(SinglePredMBB) == SchedStates.end())
return;
- DEBUG(dbgs() << "+++ Continued scheduling from "
- << printMBBReference(*SinglePredMBB) << "\n";);
+ LLVM_DEBUG(dbgs() << "** Continued scheduling from "
+ << printMBBReference(*SinglePredMBB) << "\n";);
HazardRec->copyState(SchedStates[SinglePredMBB]);
+ LLVM_DEBUG(HazardRec->dumpState(););
// Emit incoming terminator(s). Be optimistic and assume that branch
// prediction will generally do "the right thing".
for (MachineBasicBlock::iterator I = SinglePredMBB->getFirstTerminator();
I != SinglePredMBB->end(); I++) {
- DEBUG (dbgs() << "+++ Emitting incoming branch: "; I->dump(););
+ LLVM_DEBUG(dbgs() << "** Emitting incoming branch: "; I->dump(););
bool TakenBranch = (I->isBranch() &&
(TII->getBranchInfo(*I).Target->isReg() || // Relative branch
TII->getBranchInfo(*I).Target->getMBB() == MBB));
@@ -113,7 +118,7 @@ void SystemZPostRASchedStrategy::enterMBB(MachineBasicBlock *NextMBB) {
}
void SystemZPostRASchedStrategy::leaveMBB() {
- DEBUG(dbgs() << "+++ Leaving " << printMBBReference(*MBB) << "\n";);
+ LLVM_DEBUG(dbgs() << "** Leaving " << printMBBReference(*MBB) << "\n";);
// Advance to first terminator. The successor block will handle terminators
// dependent on CFG layout (T/NT branch etc).
@@ -127,7 +132,7 @@ SystemZPostRASchedStrategy(const MachineSchedContext *C)
(C->MF->getSubtarget().getInstrInfo())),
MBB(nullptr), HazardRec(nullptr) {
const TargetSubtargetInfo *ST = &C->MF->getSubtarget();
- SchedModel.init(ST->getSchedModel(), ST, TII);
+ SchedModel.init(ST);
}
SystemZPostRASchedStrategy::~SystemZPostRASchedStrategy() {
@@ -159,14 +164,14 @@ SUnit *SystemZPostRASchedStrategy::pickNode(bool &IsTopNode) {
// If only one choice, return it.
if (Available.size() == 1) {
- DEBUG (dbgs() << "+++ Only one: ";
- HazardRec->dumpSU(*Available.begin(), dbgs()); dbgs() << "\n";);
+ LLVM_DEBUG(dbgs() << "** Only one: ";
+ HazardRec->dumpSU(*Available.begin(), dbgs()); dbgs() << "\n";);
return *Available.begin();
}
// All nodes that are possible to schedule are stored in the
// Available set.
- DEBUG(dbgs() << "+++ Available: "; Available.dump(*HazardRec););
+ LLVM_DEBUG(dbgs() << "** Available: "; Available.dump(*HazardRec););
Candidate Best;
for (auto *SU : Available) {
@@ -177,15 +182,11 @@ SUnit *SystemZPostRASchedStrategy::pickNode(bool &IsTopNode) {
// Remember which SU is the best candidate.
if (Best.SU == nullptr || c < Best) {
Best = c;
- DEBUG(dbgs() << "+++ Best sofar: ";
- HazardRec->dumpSU(Best.SU, dbgs());
- if (Best.GroupingCost != 0)
- dbgs() << "\tGrouping cost:" << Best.GroupingCost;
- if (Best.ResourcesCost != 0)
- dbgs() << " Resource cost:" << Best.ResourcesCost;
- dbgs() << " Height:" << Best.SU->getHeight();
- dbgs() << "\n";);
- }
+ LLVM_DEBUG(dbgs() << "** Best so far: ";);
+ } else
+ LLVM_DEBUG(dbgs() << "** Tried : ";);
+ LLVM_DEBUG(HazardRec->dumpSU(c.SU, dbgs()); c.dumpCosts();
+ dbgs() << " Height:" << c.SU->getHeight(); dbgs() << "\n";);
// Once we know we have seen all SUs that affect grouping or use unbuffered
// resources, we can stop iterating if Best looks good.
@@ -206,7 +207,7 @@ Candidate(SUnit *SU_, SystemZHazardRecognizer &HazardRec) : Candidate() {
// if it would fit naturally into the schedule.
GroupingCost = HazardRec.groupingCost(SU);
- // Check the resources cost for this SU.
+ // Check the resources cost for this SU.
ResourcesCost = HazardRec.resourcesCost(SU);
}
@@ -239,7 +240,9 @@ operator<(const Candidate &other) {
}
void SystemZPostRASchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
- DEBUG(dbgs() << "+++ Scheduling SU(" << SU->NodeNum << ")\n";);
+ LLVM_DEBUG(dbgs() << "** Scheduling SU(" << SU->NodeNum << ") ";
+ if (Available.size() == 1) dbgs() << "(only one) ";
+ Candidate c(SU, *HazardRec); c.dumpCosts(); dbgs() << "\n";);
// Remove SU from Available set and update HazardRec.
Available.erase(SU);
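
The DEBUG -> LLVM_DEBUG renames in this hunk follow the LLVM-wide switch to the prefixed macro. A minimal sketch of how such output is typically wired up (the DEBUG_TYPE string below is illustrative, not necessarily what this pass registers):

    // LLVM_DEBUG comes from llvm/Support/Debug.h and expands to
    // DEBUG_WITH_TYPE(DEBUG_TYPE, ...); the output only appears in asserts
    // builds when -debug or -debug-only=<DEBUG_TYPE> is given.
    #include "llvm/Support/Debug.h"
    #include "llvm/Support/raw_ostream.h"

    #define DEBUG_TYPE "systemz-postra-sched"

    static void reportPick(unsigned NodeNum) {
      LLVM_DEBUG(llvm::dbgs() << "** Scheduling SU(" << NodeNum << ")\n");
    }
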
diff --git a/lib/Target/SystemZ/SystemZMachineScheduler.h b/lib/Target/SystemZ/SystemZMachineScheduler.h
index de1bf4655c54..cb0304825966 100644
--- a/lib/Target/SystemZ/SystemZMachineScheduler.h
+++ b/lib/Target/SystemZ/SystemZMachineScheduler.h
@@ -58,6 +58,15 @@ class SystemZPostRASchedStrategy : public MachineSchedStrategy {
bool noCost() const {
return (GroupingCost <= 0 && !ResourcesCost);
}
+
+#ifndef NDEBUG
+ void dumpCosts() {
+ if (GroupingCost != 0)
+ dbgs() << " Grouping cost:" << GroupingCost;
+ if (ResourcesCost != 0)
+ dbgs() << " Resource cost:" << ResourcesCost;
+ }
+#endif
};
// A sorter for the Available set that makes sure that SUs are considered
@@ -119,7 +128,7 @@ public:
// transferred over scheduling boundaries.
bool doMBBSchedRegionsTopDown() const override { return true; }
- void initialize(ScheduleDAGMI *dag) override {}
+ void initialize(ScheduleDAGMI *dag) override;
/// Tell the strategy that MBB is about to be processed.
void enterMBB(MachineBasicBlock *NextMBB) override;
diff --git a/lib/Target/SystemZ/SystemZOperands.td b/lib/Target/SystemZ/SystemZOperands.td
index 713612129d90..da682cb4e5ab 100644
--- a/lib/Target/SystemZ/SystemZOperands.td
+++ b/lib/Target/SystemZ/SystemZOperands.td
@@ -115,13 +115,13 @@ class AddressingMode<string seltype, string bitsize, string dispsize,
class BDMode<string type, string bitsize, string dispsize, string suffix>
: AddressingMode<type, bitsize, dispsize, suffix, "", 2, "BDAddr",
(ops !cast<RegisterOperand>("ADDR"##bitsize),
- !cast<Immediate>("disp"##dispsize##"imm"##bitsize))>;
+ !cast<Operand>("disp"##dispsize##"imm"##bitsize))>;
// An addressing mode with a base, displacement and index.
class BDXMode<string type, string bitsize, string dispsize, string suffix>
: AddressingMode<type, bitsize, dispsize, suffix, "", 3, "BDXAddr",
(ops !cast<RegisterOperand>("ADDR"##bitsize),
- !cast<Immediate>("disp"##dispsize##"imm"##bitsize),
+ !cast<Operand>("disp"##dispsize##"imm"##bitsize),
!cast<RegisterOperand>("ADDR"##bitsize))>;
// A BDMode paired with an immediate length operand of LENSIZE bits.
@@ -130,21 +130,21 @@ class BDLMode<string type, string bitsize, string dispsize, string suffix,
: AddressingMode<type, bitsize, dispsize, suffix, "Len"##lensize, 3,
"BDLAddr",
(ops !cast<RegisterOperand>("ADDR"##bitsize),
- !cast<Immediate>("disp"##dispsize##"imm"##bitsize),
- !cast<Immediate>("imm"##bitsize))>;
+ !cast<Operand>("disp"##dispsize##"imm"##bitsize),
+ !cast<Operand>("imm"##bitsize))>;
// A BDMode paired with a register length operand.
class BDRMode<string type, string bitsize, string dispsize, string suffix>
: AddressingMode<type, bitsize, dispsize, suffix, "", 3, "BDRAddr",
(ops !cast<RegisterOperand>("ADDR"##bitsize),
- !cast<Immediate>("disp"##dispsize##"imm"##bitsize),
+ !cast<Operand>("disp"##dispsize##"imm"##bitsize),
!cast<RegisterOperand>("GR"##bitsize))>;
// An addressing mode with a base, displacement and a vector index.
class BDVMode<string bitsize, string dispsize>
: AddressOperand<bitsize, dispsize, "", "BDVAddr",
(ops !cast<RegisterOperand>("ADDR"##bitsize),
- !cast<Immediate>("disp"##dispsize##"imm"##bitsize),
+ !cast<Operand>("disp"##dispsize##"imm"##bitsize),
!cast<RegisterOperand>("VR128"))>;
//===----------------------------------------------------------------------===//
@@ -219,6 +219,12 @@ def SIMM16 : SDNodeXForm<imm, [{
MVT::i64);
}]>;
+// Negate and then truncate an immediate to a 16-bit signed quantity.
+def NEGSIMM16 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(int16_t(-N->getZExtValue()), SDLoc(N),
+ MVT::i64);
+}]>;
+
// Truncate an immediate to a 16-bit unsigned quantity.
def UIMM16 : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(uint16_t(N->getZExtValue()), SDLoc(N),
@@ -231,24 +237,30 @@ def SIMM32 : SDNodeXForm<imm, [{
MVT::i64);
}]>;
+// Negate and then truncate an immediate to a 32-bit signed quantity.
+def NEGSIMM32 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(int32_t(-N->getZExtValue()), SDLoc(N),
+ MVT::i64);
+}]>;
+
// Truncate an immediate to a 32-bit unsigned quantity.
def UIMM32 : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(uint32_t(N->getZExtValue()), SDLoc(N),
MVT::i64);
}]>;
+// Negate and then truncate an immediate to a 32-bit unsigned quantity.
+def NEGUIMM32 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(uint32_t(-N->getZExtValue()), SDLoc(N),
+ MVT::i64);
+}]>;
+
// Truncate an immediate to a 48-bit unsigned quantity.
def UIMM48 : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(uint64_t(N->getZExtValue()) & 0xffffffffffff,
SDLoc(N), MVT::i64);
}]>;
-// Negate and then truncate an immediate to a 32-bit unsigned quantity.
-def NEGIMM32 : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(uint32_t(-N->getZExtValue()), SDLoc(N),
- MVT::i64);
-}]>;
-
//===----------------------------------------------------------------------===//
// Immediate asm operands.
//===----------------------------------------------------------------------===//
@@ -336,6 +348,10 @@ def imm32sx16 : Immediate<i32, [{
return isInt<16>(N->getSExtValue());
}], SIMM16, "S16Imm">;
+def imm32sx16n : Immediate<i32, [{
+ return isInt<16>(-N->getSExtValue());
+}], NEGSIMM16, "S16Imm">;
+
def imm32zx16 : Immediate<i32, [{
return isUInt<16>(N->getZExtValue());
}], UIMM16, "U16Imm">;
@@ -348,6 +364,10 @@ def imm32sx16trunc : Immediate<i32, [{}], SIMM16, "S16Imm">;
def simm32 : Immediate<i32, [{}], SIMM32, "S32Imm">;
def uimm32 : Immediate<i32, [{}], UIMM32, "U32Imm">;
+def simm32n : Immediate<i32, [{
+ return isInt<32>(-N->getSExtValue());
+}], NEGSIMM32, "S32Imm">;
+
def imm32 : ImmLeaf<i32, [{}]>;
//===----------------------------------------------------------------------===//
@@ -423,6 +443,10 @@ def imm64sx16 : Immediate<i64, [{
return isInt<16>(N->getSExtValue());
}], SIMM16, "S16Imm">;
+def imm64sx16n : Immediate<i64, [{
+ return isInt<16>(-N->getSExtValue());
+}], NEGSIMM16, "S16Imm">;
+
def imm64zx16 : Immediate<i64, [{
return isUInt<16>(N->getZExtValue());
}], UIMM16, "U16Imm">;
@@ -431,13 +455,17 @@ def imm64sx32 : Immediate<i64, [{
return isInt<32>(N->getSExtValue());
}], SIMM32, "S32Imm">;
+def imm64sx32n : Immediate<i64, [{
+ return isInt<32>(-N->getSExtValue());
+}], NEGSIMM32, "S32Imm">;
+
def imm64zx32 : Immediate<i64, [{
return isUInt<32>(N->getZExtValue());
}], UIMM32, "U32Imm">;
def imm64zx32n : Immediate<i64, [{
return isUInt<32>(-N->getSExtValue());
-}], NEGIMM32, "U32Imm">;
+}], NEGUIMM32, "U32Imm">;
def imm64zx48 : Immediate<i64, [{
return isUInt<64>(N->getZExtValue());
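
To make the new negated-immediate operands above concrete: each imm*n predicate accepts a value whose negation fits the target field, and the paired NEG*IMM transform emits that negation truncated to the field's type. A small worked example in plain C++ (values chosen for illustration):

    #include <cstdint>
    #include <cstdio>

    int main() {
      // imm32sx16n / NEGSIMM16: "sub %r1, 32768" can become "ahi %r1, -32768",
      // since -32768 still fits a signed 16-bit immediate.
      int64_t subImm = 32768;
      std::printf("ahi imm   = %d\n", (int16_t)(-subImm));   // -32768

      // imm64zx32n / NEGUIMM32: adding a negative value whose magnitude fits
      // in 32 bits can use SLGFI with the unsigned negation instead.
      int64_t addImm = -0x80000000LL;
      std::printf("slgfi imm = %u\n", (uint32_t)(-addImm));  // 2147483648
      return 0;
    }
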
diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td
index d067f331f677..3cfe23aec417 100644
--- a/lib/Target/SystemZ/SystemZOperators.td
+++ b/lib/Target/SystemZ/SystemZOperators.td
@@ -15,19 +15,24 @@ def SDT_CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i64>,
def SDT_CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i64>,
SDTCisVT<1, i64>]>;
def SDT_ZCall : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
-def SDT_ZCmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
-def SDT_ZICmp : SDTypeProfile<0, 3,
- [SDTCisSameAs<0, 1>,
- SDTCisVT<2, i32>]>;
-def SDT_ZBRCCMask : SDTypeProfile<0, 3,
+def SDT_ZCmp : SDTypeProfile<1, 2,
+ [SDTCisVT<0, i32>,
+ SDTCisSameAs<1, 2>]>;
+def SDT_ZICmp : SDTypeProfile<1, 3,
+ [SDTCisVT<0, i32>,
+ SDTCisSameAs<1, 2>,
+ SDTCisVT<3, i32>]>;
+def SDT_ZBRCCMask : SDTypeProfile<0, 4,
[SDTCisVT<0, i32>,
SDTCisVT<1, i32>,
- SDTCisVT<2, OtherVT>]>;
-def SDT_ZSelectCCMask : SDTypeProfile<1, 4,
+ SDTCisVT<2, OtherVT>,
+ SDTCisVT<3, i32>]>;
+def SDT_ZSelectCCMask : SDTypeProfile<1, 5,
[SDTCisSameAs<0, 1>,
SDTCisSameAs<1, 2>,
SDTCisVT<3, i32>,
- SDTCisVT<4, i32>]>;
+ SDTCisVT<4, i32>,
+ SDTCisVT<5, i32>]>;
def SDT_ZWrapPtr : SDTypeProfile<1, 1,
[SDTCisSameAs<0, 1>,
SDTCisPtrTy<0>]>;
@@ -40,6 +45,17 @@ def SDT_ZGR128Binary : SDTypeProfile<1, 2,
[SDTCisVT<0, untyped>,
SDTCisInt<1>,
SDTCisInt<2>]>;
+def SDT_ZBinaryWithFlags : SDTypeProfile<2, 2,
+ [SDTCisInt<0>,
+ SDTCisVT<1, i32>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>]>;
+def SDT_ZBinaryWithCarry : SDTypeProfile<2, 3,
+ [SDTCisInt<0>,
+ SDTCisVT<1, i32>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisVT<1, i32>]>;
def SDT_ZAtomicLoadBinaryW : SDTypeProfile<1, 5,
[SDTCisVT<0, i32>,
SDTCisPtrTy<1>,
@@ -47,45 +63,67 @@ def SDT_ZAtomicLoadBinaryW : SDTypeProfile<1, 5,
SDTCisVT<3, i32>,
SDTCisVT<4, i32>,
SDTCisVT<5, i32>]>;
-def SDT_ZAtomicCmpSwapW : SDTypeProfile<1, 6,
+def SDT_ZAtomicCmpSwapW : SDTypeProfile<2, 6,
[SDTCisVT<0, i32>,
- SDTCisPtrTy<1>,
- SDTCisVT<2, i32>,
+ SDTCisVT<1, i32>,
+ SDTCisPtrTy<2>,
SDTCisVT<3, i32>,
SDTCisVT<4, i32>,
SDTCisVT<5, i32>,
- SDTCisVT<6, i32>]>;
-def SDT_ZAtomicCmpSwap : SDTypeProfile<1, 3,
+ SDTCisVT<6, i32>,
+ SDTCisVT<7, i32>]>;
+def SDT_ZAtomicCmpSwap : SDTypeProfile<2, 3,
[SDTCisInt<0>,
- SDTCisPtrTy<1>,
- SDTCisSameAs<0, 2>,
- SDTCisSameAs<0, 3>]>;
+ SDTCisVT<1, i32>,
+ SDTCisPtrTy<2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisSameAs<0, 4>]>;
def SDT_ZAtomicLoad128 : SDTypeProfile<1, 1,
[SDTCisVT<0, untyped>,
SDTCisPtrTy<1>]>;
def SDT_ZAtomicStore128 : SDTypeProfile<0, 2,
[SDTCisVT<0, untyped>,
SDTCisPtrTy<1>]>;
-def SDT_ZAtomicCmpSwap128 : SDTypeProfile<1, 3,
+def SDT_ZAtomicCmpSwap128 : SDTypeProfile<2, 3,
[SDTCisVT<0, untyped>,
- SDTCisPtrTy<1>,
- SDTCisVT<2, untyped>,
- SDTCisVT<3, untyped>]>;
+ SDTCisVT<1, i32>,
+ SDTCisPtrTy<2>,
+ SDTCisVT<3, untyped>,
+ SDTCisVT<4, untyped>]>;
def SDT_ZMemMemLength : SDTypeProfile<0, 3,
[SDTCisPtrTy<0>,
SDTCisPtrTy<1>,
SDTCisVT<2, i64>]>;
+def SDT_ZMemMemLengthCC : SDTypeProfile<1, 3,
+ [SDTCisVT<0, i32>,
+ SDTCisPtrTy<1>,
+ SDTCisPtrTy<2>,
+ SDTCisVT<3, i64>]>;
def SDT_ZMemMemLoop : SDTypeProfile<0, 4,
[SDTCisPtrTy<0>,
SDTCisPtrTy<1>,
SDTCisVT<2, i64>,
SDTCisVT<3, i64>]>;
+def SDT_ZMemMemLoopCC : SDTypeProfile<1, 4,
+ [SDTCisVT<0, i32>,
+ SDTCisPtrTy<1>,
+ SDTCisPtrTy<2>,
+ SDTCisVT<3, i64>,
+ SDTCisVT<4, i64>]>;
def SDT_ZString : SDTypeProfile<1, 3,
[SDTCisPtrTy<0>,
SDTCisPtrTy<1>,
SDTCisPtrTy<2>,
SDTCisVT<3, i32>]>;
-def SDT_ZI32Intrinsic : SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>;
+def SDT_ZStringCC : SDTypeProfile<2, 3,
+ [SDTCisPtrTy<0>,
+ SDTCisVT<1, i32>,
+ SDTCisPtrTy<2>,
+ SDTCisPtrTy<3>,
+ SDTCisVT<4, i32>]>;
+def SDT_ZIPM : SDTypeProfile<1, 1,
+ [SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
def SDT_ZPrefetch : SDTypeProfile<0, 2,
[SDTCisVT<0, i32>,
SDTCisPtrTy<1>]>;
@@ -97,9 +135,12 @@ def SDT_ZStoreBSwap : SDTypeProfile<0, 3,
[SDTCisInt<0>,
SDTCisPtrTy<1>,
SDTCisVT<2, OtherVT>]>;
-def SDT_ZTBegin : SDTypeProfile<0, 2,
- [SDTCisPtrTy<0>,
- SDTCisVT<1, i32>]>;
+def SDT_ZTBegin : SDTypeProfile<1, 2,
+ [SDTCisVT<0, i32>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, i32>]>;
+def SDT_ZTEnd : SDTypeProfile<1, 0,
+ [SDTCisVT<0, i32>]>;
def SDT_ZInsertVectorElt : SDTypeProfile<1, 3,
[SDTCisVec<0>,
SDTCisSameAs<0, 1>,
@@ -115,10 +156,19 @@ def SDT_ZVecUnaryConv : SDTypeProfile<1, 1,
def SDT_ZVecUnary : SDTypeProfile<1, 1,
[SDTCisVec<0>,
SDTCisSameAs<0, 1>]>;
+def SDT_ZVecUnaryCC : SDTypeProfile<2, 1,
+ [SDTCisVec<0>,
+ SDTCisVT<1, i32>,
+ SDTCisSameAs<0, 2>]>;
def SDT_ZVecBinary : SDTypeProfile<1, 2,
[SDTCisVec<0>,
SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>]>;
+def SDT_ZVecBinaryCC : SDTypeProfile<2, 2,
+ [SDTCisVec<0>,
+ SDTCisVT<1, i32>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 2>]>;
def SDT_ZVecBinaryInt : SDTypeProfile<1, 2,
[SDTCisVec<0>,
SDTCisSameAs<0, 1>,
@@ -127,10 +177,16 @@ def SDT_ZVecBinaryConv : SDTypeProfile<1, 2,
[SDTCisVec<0>,
SDTCisVec<1>,
SDTCisSameAs<1, 2>]>;
-def SDT_ZVecBinaryConvInt : SDTypeProfile<1, 2,
+def SDT_ZVecBinaryConvCC : SDTypeProfile<2, 2,
[SDTCisVec<0>,
- SDTCisVec<1>,
- SDTCisVT<2, i32>]>;
+ SDTCisVT<1, i32>,
+ SDTCisVec<2>,
+ SDTCisSameAs<2, 3>]>;
+def SDT_ZVecBinaryConvIntCC : SDTypeProfile<2, 2,
+ [SDTCisVec<0>,
+ SDTCisVT<1, i32>,
+ SDTCisVec<2>,
+ SDTCisVT<3, i32>]>;
def SDT_ZRotateMask : SDTypeProfile<1, 2,
[SDTCisVec<0>,
SDTCisVT<1, i32>,
@@ -149,13 +205,28 @@ def SDT_ZVecTernaryInt : SDTypeProfile<1, 3,
SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>,
SDTCisVT<3, i32>]>;
+def SDT_ZVecTernaryIntCC : SDTypeProfile<2, 3,
+ [SDTCisVec<0>,
+ SDTCisVT<1, i32>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisVT<4, i32>]>;
def SDT_ZVecQuaternaryInt : SDTypeProfile<1, 4,
[SDTCisVec<0>,
SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>,
SDTCisSameAs<0, 3>,
SDTCisVT<4, i32>]>;
-def SDT_ZTest : SDTypeProfile<0, 2, [SDTCisVT<1, i64>]>;
+def SDT_ZVecQuaternaryIntCC : SDTypeProfile<2, 4,
+ [SDTCisVec<0>,
+ SDTCisVT<1, i32>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisSameAs<0, 4>,
+ SDTCisVT<5, i32>]>;
+def SDT_ZTest : SDTypeProfile<1, 2,
+ [SDTCisVT<0, i32>,
+ SDTCisVT<2, i64>]>;
//===----------------------------------------------------------------------===//
// Node definitions
@@ -188,19 +259,26 @@ def z_pcrel_wrapper : SDNode<"SystemZISD::PCREL_WRAPPER", SDT_ZWrapPtr, []>;
def z_pcrel_offset : SDNode<"SystemZISD::PCREL_OFFSET",
SDT_ZWrapOffset, []>;
def z_iabs : SDNode<"SystemZISD::IABS", SDTIntUnaryOp, []>;
-def z_icmp : SDNode<"SystemZISD::ICMP", SDT_ZICmp, [SDNPOutGlue]>;
-def z_fcmp : SDNode<"SystemZISD::FCMP", SDT_ZCmp, [SDNPOutGlue]>;
-def z_tm : SDNode<"SystemZISD::TM", SDT_ZICmp, [SDNPOutGlue]>;
-def z_br_ccmask : SDNode<"SystemZISD::BR_CCMASK", SDT_ZBRCCMask,
- [SDNPHasChain, SDNPInGlue]>;
-def z_select_ccmask : SDNode<"SystemZISD::SELECT_CCMASK", SDT_ZSelectCCMask,
- [SDNPInGlue]>;
+def z_icmp : SDNode<"SystemZISD::ICMP", SDT_ZICmp>;
+def z_fcmp : SDNode<"SystemZISD::FCMP", SDT_ZCmp>;
+def z_tm : SDNode<"SystemZISD::TM", SDT_ZICmp>;
+def z_br_ccmask_1 : SDNode<"SystemZISD::BR_CCMASK", SDT_ZBRCCMask,
+ [SDNPHasChain]>;
+def z_select_ccmask_1 : SDNode<"SystemZISD::SELECT_CCMASK",
+ SDT_ZSelectCCMask>;
+def z_ipm_1 : SDNode<"SystemZISD::IPM", SDT_ZIPM>;
def z_adjdynalloc : SDNode<"SystemZISD::ADJDYNALLOC", SDT_ZAdjDynAlloc>;
def z_popcnt : SDNode<"SystemZISD::POPCNT", SDTIntUnaryOp>;
def z_smul_lohi : SDNode<"SystemZISD::SMUL_LOHI", SDT_ZGR128Binary>;
def z_umul_lohi : SDNode<"SystemZISD::UMUL_LOHI", SDT_ZGR128Binary>;
def z_sdivrem : SDNode<"SystemZISD::SDIVREM", SDT_ZGR128Binary>;
def z_udivrem : SDNode<"SystemZISD::UDIVREM", SDT_ZGR128Binary>;
+def z_saddo : SDNode<"SystemZISD::SADDO", SDT_ZBinaryWithFlags>;
+def z_ssubo : SDNode<"SystemZISD::SSUBO", SDT_ZBinaryWithFlags>;
+def z_uaddo : SDNode<"SystemZISD::UADDO", SDT_ZBinaryWithFlags>;
+def z_usubo : SDNode<"SystemZISD::USUBO", SDT_ZBinaryWithFlags>;
+def z_addcarry_1 : SDNode<"SystemZISD::ADDCARRY", SDT_ZBinaryWithCarry>;
+def z_subcarry_1 : SDNode<"SystemZISD::SUBCARRY", SDT_ZBinaryWithCarry>;
def z_membarrier : SDNode<"SystemZISD::MEMBARRIER", SDTNone,
[SDNPHasChain, SDNPSideEffect]>;
@@ -210,7 +288,7 @@ def z_loadbswap : SDNode<"SystemZISD::LRV", SDT_ZLoadBSwap,
def z_storebswap : SDNode<"SystemZISD::STRV", SDT_ZStoreBSwap,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
-def z_tdc : SDNode<"SystemZISD::TDC", SDT_ZTest, [SDNPOutGlue]>;
+def z_tdc : SDNode<"SystemZISD::TDC", SDT_ZTest>;
// Defined because the index is an i32 rather than a pointer.
def z_vector_insert : SDNode<"ISD::INSERT_VECTOR_ELT",
@@ -229,10 +307,8 @@ def z_permute_dwords : SDNode<"SystemZISD::PERMUTE_DWORDS",
SDT_ZVecTernaryInt>;
def z_permute : SDNode<"SystemZISD::PERMUTE", SDT_ZVecTernary>;
def z_pack : SDNode<"SystemZISD::PACK", SDT_ZVecBinaryConv>;
-def z_packs_cc : SDNode<"SystemZISD::PACKS_CC", SDT_ZVecBinaryConv,
- [SDNPOutGlue]>;
-def z_packls_cc : SDNode<"SystemZISD::PACKLS_CC", SDT_ZVecBinaryConv,
- [SDNPOutGlue]>;
+def z_packs_cc : SDNode<"SystemZISD::PACKS_CC", SDT_ZVecBinaryConvCC>;
+def z_packls_cc : SDNode<"SystemZISD::PACKLS_CC", SDT_ZVecBinaryConvCC>;
def z_unpack_high : SDNode<"SystemZISD::UNPACK_HIGH", SDT_ZVecUnaryConv>;
def z_unpackl_high : SDNode<"SystemZISD::UNPACKL_HIGH", SDT_ZVecUnaryConv>;
def z_unpack_low : SDNode<"SystemZISD::UNPACK_LOW", SDT_ZVecUnaryConv>;
@@ -247,44 +323,30 @@ def z_vsum : SDNode<"SystemZISD::VSUM", SDT_ZVecBinaryConv>;
def z_vicmpe : SDNode<"SystemZISD::VICMPE", SDT_ZVecBinary>;
def z_vicmph : SDNode<"SystemZISD::VICMPH", SDT_ZVecBinary>;
def z_vicmphl : SDNode<"SystemZISD::VICMPHL", SDT_ZVecBinary>;
-def z_vicmpes : SDNode<"SystemZISD::VICMPES", SDT_ZVecBinary,
- [SDNPOutGlue]>;
-def z_vicmphs : SDNode<"SystemZISD::VICMPHS", SDT_ZVecBinary,
- [SDNPOutGlue]>;
-def z_vicmphls : SDNode<"SystemZISD::VICMPHLS", SDT_ZVecBinary,
- [SDNPOutGlue]>;
+def z_vicmpes : SDNode<"SystemZISD::VICMPES", SDT_ZVecBinaryCC>;
+def z_vicmphs : SDNode<"SystemZISD::VICMPHS", SDT_ZVecBinaryCC>;
+def z_vicmphls : SDNode<"SystemZISD::VICMPHLS", SDT_ZVecBinaryCC>;
def z_vfcmpe : SDNode<"SystemZISD::VFCMPE", SDT_ZVecBinaryConv>;
def z_vfcmph : SDNode<"SystemZISD::VFCMPH", SDT_ZVecBinaryConv>;
def z_vfcmphe : SDNode<"SystemZISD::VFCMPHE", SDT_ZVecBinaryConv>;
-def z_vfcmpes : SDNode<"SystemZISD::VFCMPES", SDT_ZVecBinaryConv,
- [SDNPOutGlue]>;
-def z_vfcmphs : SDNode<"SystemZISD::VFCMPHS", SDT_ZVecBinaryConv,
- [SDNPOutGlue]>;
-def z_vfcmphes : SDNode<"SystemZISD::VFCMPHES", SDT_ZVecBinaryConv,
- [SDNPOutGlue]>;
+def z_vfcmpes : SDNode<"SystemZISD::VFCMPES", SDT_ZVecBinaryConvCC>;
+def z_vfcmphs : SDNode<"SystemZISD::VFCMPHS", SDT_ZVecBinaryConvCC>;
+def z_vfcmphes : SDNode<"SystemZISD::VFCMPHES", SDT_ZVecBinaryConvCC>;
def z_vextend : SDNode<"SystemZISD::VEXTEND", SDT_ZVecUnaryConv>;
def z_vround : SDNode<"SystemZISD::VROUND", SDT_ZVecUnaryConv>;
-def z_vtm : SDNode<"SystemZISD::VTM", SDT_ZCmp, [SDNPOutGlue]>;
-def z_vfae_cc : SDNode<"SystemZISD::VFAE_CC", SDT_ZVecTernaryInt,
- [SDNPOutGlue]>;
-def z_vfaez_cc : SDNode<"SystemZISD::VFAEZ_CC", SDT_ZVecTernaryInt,
- [SDNPOutGlue]>;
-def z_vfee_cc : SDNode<"SystemZISD::VFEE_CC", SDT_ZVecBinary,
- [SDNPOutGlue]>;
-def z_vfeez_cc : SDNode<"SystemZISD::VFEEZ_CC", SDT_ZVecBinary,
- [SDNPOutGlue]>;
-def z_vfene_cc : SDNode<"SystemZISD::VFENE_CC", SDT_ZVecBinary,
- [SDNPOutGlue]>;
-def z_vfenez_cc : SDNode<"SystemZISD::VFENEZ_CC", SDT_ZVecBinary,
- [SDNPOutGlue]>;
-def z_vistr_cc : SDNode<"SystemZISD::VISTR_CC", SDT_ZVecUnary,
- [SDNPOutGlue]>;
-def z_vstrc_cc : SDNode<"SystemZISD::VSTRC_CC", SDT_ZVecQuaternaryInt,
- [SDNPOutGlue]>;
+def z_vtm : SDNode<"SystemZISD::VTM", SDT_ZCmp>;
+def z_vfae_cc : SDNode<"SystemZISD::VFAE_CC", SDT_ZVecTernaryIntCC>;
+def z_vfaez_cc : SDNode<"SystemZISD::VFAEZ_CC", SDT_ZVecTernaryIntCC>;
+def z_vfee_cc : SDNode<"SystemZISD::VFEE_CC", SDT_ZVecBinaryCC>;
+def z_vfeez_cc : SDNode<"SystemZISD::VFEEZ_CC", SDT_ZVecBinaryCC>;
+def z_vfene_cc : SDNode<"SystemZISD::VFENE_CC", SDT_ZVecBinaryCC>;
+def z_vfenez_cc : SDNode<"SystemZISD::VFENEZ_CC", SDT_ZVecBinaryCC>;
+def z_vistr_cc : SDNode<"SystemZISD::VISTR_CC", SDT_ZVecUnaryCC>;
+def z_vstrc_cc : SDNode<"SystemZISD::VSTRC_CC",
+ SDT_ZVecQuaternaryIntCC>;
def z_vstrcz_cc : SDNode<"SystemZISD::VSTRCZ_CC",
- SDT_ZVecQuaternaryInt, [SDNPOutGlue]>;
-def z_vftci : SDNode<"SystemZISD::VFTCI", SDT_ZVecBinaryConvInt,
- [SDNPOutGlue]>;
+ SDT_ZVecQuaternaryIntCC>;
+def z_vftci : SDNode<"SystemZISD::VFTCI", SDT_ZVecBinaryConvIntCC>;
class AtomicWOp<string name, SDTypeProfile profile = SDT_ZAtomicLoadBinaryW>
: SDNode<"SystemZISD::"##name, profile,
@@ -305,11 +367,11 @@ def z_atomic_loadw_umax : AtomicWOp<"ATOMIC_LOADW_UMAX">;
def z_atomic_cmp_swap : SDNode<"SystemZISD::ATOMIC_CMP_SWAP",
SDT_ZAtomicCmpSwap,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad,
- SDNPOutGlue, SDNPMemOperand]>;
+ SDNPMemOperand]>;
def z_atomic_cmp_swapw : SDNode<"SystemZISD::ATOMIC_CMP_SWAPW",
SDT_ZAtomicCmpSwapW,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad,
- SDNPOutGlue, SDNPMemOperand]>;
+ SDNPMemOperand]>;
def z_atomic_load_128 : SDNode<"SystemZISD::ATOMIC_LOAD_128",
SDT_ZAtomicLoad128,
@@ -320,7 +382,7 @@ def z_atomic_store_128 : SDNode<"SystemZISD::ATOMIC_STORE_128",
def z_atomic_cmp_swap_128 : SDNode<"SystemZISD::ATOMIC_CMP_SWAP_128",
SDT_ZAtomicCmpSwap128,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad,
- SDNPOutGlue, SDNPMemOperand]>;
+ SDNPMemOperand]>;
def z_mvc : SDNode<"SystemZISD::MVC", SDT_ZMemMemLength,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
@@ -338,30 +400,26 @@ def z_xc : SDNode<"SystemZISD::XC", SDT_ZMemMemLength,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
def z_xc_loop : SDNode<"SystemZISD::XC_LOOP", SDT_ZMemMemLoop,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
-def z_clc : SDNode<"SystemZISD::CLC", SDT_ZMemMemLength,
- [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
-def z_clc_loop : SDNode<"SystemZISD::CLC_LOOP", SDT_ZMemMemLoop,
- [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
-def z_strcmp : SDNode<"SystemZISD::STRCMP", SDT_ZString,
- [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
+def z_clc : SDNode<"SystemZISD::CLC", SDT_ZMemMemLengthCC,
+ [SDNPHasChain, SDNPMayLoad]>;
+def z_clc_loop : SDNode<"SystemZISD::CLC_LOOP", SDT_ZMemMemLoopCC,
+ [SDNPHasChain, SDNPMayLoad]>;
+def z_strcmp : SDNode<"SystemZISD::STRCMP", SDT_ZStringCC,
+ [SDNPHasChain, SDNPMayLoad]>;
def z_stpcpy : SDNode<"SystemZISD::STPCPY", SDT_ZString,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
-def z_search_string : SDNode<"SystemZISD::SEARCH_STRING", SDT_ZString,
- [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
-def z_ipm : SDNode<"SystemZISD::IPM", SDT_ZI32Intrinsic,
- [SDNPInGlue]>;
+def z_search_string : SDNode<"SystemZISD::SEARCH_STRING", SDT_ZStringCC,
+ [SDNPHasChain, SDNPMayLoad]>;
def z_prefetch : SDNode<"SystemZISD::PREFETCH", SDT_ZPrefetch,
[SDNPHasChain, SDNPMayLoad, SDNPMayStore,
SDNPMemOperand]>;
def z_tbegin : SDNode<"SystemZISD::TBEGIN", SDT_ZTBegin,
- [SDNPHasChain, SDNPOutGlue, SDNPMayStore,
- SDNPSideEffect]>;
+ [SDNPHasChain, SDNPMayStore, SDNPSideEffect]>;
def z_tbegin_nofloat : SDNode<"SystemZISD::TBEGIN_NOFLOAT", SDT_ZTBegin,
- [SDNPHasChain, SDNPOutGlue, SDNPMayStore,
- SDNPSideEffect]>;
-def z_tend : SDNode<"SystemZISD::TEND", SDTNone,
- [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+ [SDNPHasChain, SDNPMayStore, SDNPSideEffect]>;
+def z_tend : SDNode<"SystemZISD::TEND", SDT_ZTEnd,
+ [SDNPHasChain, SDNPSideEffect]>;
def z_vshl : SDNode<"ISD::SHL", SDT_ZVecBinary>;
def z_vsra : SDNode<"ISD::SRA", SDT_ZVecBinary>;
@@ -382,6 +440,20 @@ def z_strv : PatFrag<(ops node:$src, node:$addr),
def z_strvg : PatFrag<(ops node:$src, node:$addr),
(z_storebswap node:$src, node:$addr, i64)>;
+// Fragments including CC as an implicit source.
+def z_br_ccmask
+ : PatFrag<(ops node:$valid, node:$mask, node:$bb),
+ (z_br_ccmask_1 node:$valid, node:$mask, node:$bb, CC)>;
+def z_select_ccmask
+ : PatFrag<(ops node:$true, node:$false, node:$valid, node:$mask),
+ (z_select_ccmask_1 node:$true, node:$false,
+ node:$valid, node:$mask, CC)>;
+def z_ipm : PatFrag<(ops), (z_ipm_1 CC)>;
+def z_addcarry : PatFrag<(ops node:$lhs, node:$rhs),
+ (z_addcarry_1 node:$lhs, node:$rhs, CC)>;
+def z_subcarry : PatFrag<(ops node:$lhs, node:$rhs),
+ (z_subcarry_1 node:$lhs, node:$rhs, CC)>;
+
// Signed and unsigned comparisons.
def z_scmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, imm), [{
unsigned Type = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
@@ -574,6 +646,20 @@ def z_inegabs64 : PatFrag<(ops node:$src), (ineg (z_iabs64 node:$src))>;
def z_muladd : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(add (mul node:$src1, node:$src2), node:$src3)>;
+// Alternatives to match operations with or without an overflow CC result.
+def z_sadd : PatFrags<(ops node:$src1, node:$src2),
+ [(z_saddo node:$src1, node:$src2),
+ (add node:$src1, node:$src2)]>;
+def z_uadd : PatFrags<(ops node:$src1, node:$src2),
+ [(z_uaddo node:$src1, node:$src2),
+ (add node:$src1, node:$src2)]>;
+def z_ssub : PatFrags<(ops node:$src1, node:$src2),
+ [(z_ssubo node:$src1, node:$src2),
+ (sub node:$src1, node:$src2)]>;
+def z_usub : PatFrags<(ops node:$src1, node:$src2),
+ [(z_usubo node:$src1, node:$src2),
+ (sub node:$src1, node:$src2)]>;
+
// Fused multiply-subtract, using the natural operand order.
def fms : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(fma node:$src1, node:$src2, (fneg node:$src3))>;
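
The z_sadd/z_uadd/z_ssub/z_usub fragments above are what allow one instruction pattern to cover both the plain operation and its overflow-flag-producing form, so code like the following can select a single AR/ALR-style instruction whose CC result feeds the overflow branch. An illustrative sketch (the builtin is the generic Clang/GCC overflow intrinsic, nothing SystemZ-specific):

    #include <cstdint>
    #include <cstdio>

    // Both functions perform one 32-bit addition; with the PatFrags above the
    // plain add and the overflow-checked add map onto the same instruction,
    // with the overflow test coming straight from CC.
    int32_t plainAdd(int32_t a, int32_t b) {
      return a + b;
    }

    bool checkedAdd(int32_t a, int32_t b, int32_t *sum) {
      return __builtin_add_overflow(a, b, sum);  // true on signed overflow
    }

    int main() {
      int32_t s;
      std::printf("%d %d\n", plainAdd(3, 4), (int)checkedAdd(INT32_MAX, 1, &s));
      return 0;
    }
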
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index 856505e00a10..76ed6f80ba55 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -108,6 +108,10 @@ SystemZRegisterInfo::getRegAllocationHints(unsigned VirtReg,
const MCPhysReg *
SystemZRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ const SystemZSubtarget &Subtarget = MF->getSubtarget<SystemZSubtarget>();
+ if (MF->getFunction().getCallingConv() == CallingConv::AnyReg)
+ return Subtarget.hasVector()? CSR_SystemZ_AllRegs_Vector_SaveList
+ : CSR_SystemZ_AllRegs_SaveList;
if (MF->getSubtarget().getTargetLowering()->supportSwiftError() &&
MF->getFunction().getAttributes().hasAttrSomewhere(
Attribute::SwiftError))
@@ -118,6 +122,10 @@ SystemZRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
const uint32_t *
SystemZRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
+ const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>();
+ if (CC == CallingConv::AnyReg)
+ return Subtarget.hasVector()? CSR_SystemZ_AllRegs_Vector_RegMask
+ : CSR_SystemZ_AllRegs_RegMask;
if (MF.getSubtarget().getTargetLowering()->supportSwiftError() &&
MF.getFunction().getAttributes().hasAttrSomewhere(
Attribute::SwiftError))
@@ -307,3 +315,11 @@ SystemZRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const SystemZFrameLowering *TFI = getFrameLowering(MF);
return TFI->hasFP(MF) ? SystemZ::R11D : SystemZ::R15D;
}
+
+const TargetRegisterClass *
+SystemZRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
+ if (RC == &SystemZ::CCRRegClass)
+ return &SystemZ::GR32BitRegClass;
+ return RC;
+}
+
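
getCrossCopyRegClass above tells the allocator that a value live in CC has to be shuttled through a GR32; the copyPhysReg hunk earlier in this patch provides both directions of that shuttle (IPM to read CC, TMLH to reload it). A rough sketch of the mask computation involved, assuming the usual SystemZ::IPM_CC value of 28 (i.e. CC lands in bits 29:28 of the IPM result):

    #include <cstdio>

    // CC  -> GR32:  IPM  %rX            ; CC appears in bits 29:28
    // GR32 -> CC:   TMLH %rX, 3<<12     ; test those two bits to set CC again
    int main() {
      const unsigned IPM_CC = 28;               // assumed bit position
      unsigned tmlhMask = 3u << (IPM_CC - 16);  // mask within the high halfword
      std::printf("TMLH mask = 0x%x\n", tmlhMask); // prints 0x3000
      return 0;
    }
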
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.h b/lib/Target/SystemZ/SystemZRegisterInfo.h
index 8787a90b1e25..94781659a50a 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.h
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -44,6 +44,12 @@ public:
return &SystemZ::ADDR64BitRegClass;
}
+ /// getCrossCopyRegClass - Returns a legal register class to copy a register
+ /// in the specified class to or from. Returns NULL if it is possible to copy
+ /// between two registers of the specified class.
+ const TargetRegisterClass *
+ getCrossCopyRegClass(const TargetRegisterClass *RC) const override;
+
bool getRegAllocationHints(unsigned VirtReg,
ArrayRef<MCPhysReg> Order,
SmallVectorImpl<MCPhysReg> &Hints,
@@ -71,7 +77,7 @@ public:
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const override;
- /// \brief SrcRC and DstRC will be morphed into NewRC if this returns true.
+ /// SrcRC and DstRC will be morphed into NewRC if this returns true.
bool shouldCoalesce(MachineInstr *MI,
const TargetRegisterClass *SrcRC,
unsigned SubReg,
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.td b/lib/Target/SystemZ/SystemZRegisterInfo.td
index a1cfaf699401..79ba7534f92c 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.td
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -130,7 +130,7 @@ defm AnyReg : SystemZRegClass<"AnyReg",
[i64, f64, v8i8, v4i16, v2i32, v2f32], 64,
(add (sequence "R%uD", 0, 15),
(sequence "F%uD", 0, 15),
- (sequence "V%u", 0, 15))>;
+ (sequence "V%u", 0, 15)), 0/*allocatable*/>;
//===----------------------------------------------------------------------===//
// Floating-point registers
@@ -263,7 +263,7 @@ defm VF128 : SystemZRegClass<"VF128",
// All vector registers.
defm VR128 : SystemZRegClass<"VR128",
- [f128, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128],
128, (add (sequence "V%u", 0, 7),
(sequence "V%u", 16, 31),
(sequence "V%u", 8, 15))>;
@@ -296,8 +296,8 @@ def v128any : TypedReg<untyped, VR128>;
// The 2-bit condition code field of the PSW. Every register named in an
// inline asm needs a class associated with it.
def CC : SystemZReg<"cc">;
-let isAllocatable = 0 in
- def CCRegs : RegisterClass<"SystemZ", [i32], 32, (add CC)>;
+let isAllocatable = 0, CopyCost = -1 in
+ def CCR : RegisterClass<"SystemZ", [i32], 32, (add CC)>;
// Access registers.
class ACR32<bits<16> num, string n> : SystemZReg<n> {
diff --git a/lib/Target/SystemZ/SystemZSchedule.td b/lib/Target/SystemZ/SystemZSchedule.td
index 8dba89f70a42..385a94b5d6a9 100644
--- a/lib/Target/SystemZ/SystemZSchedule.td
+++ b/lib/Target/SystemZ/SystemZSchedule.td
@@ -8,75 +8,57 @@
//===----------------------------------------------------------------------===//
// Scheduler resources
-// Resources ending with a '2' use that resource for 2 cycles. An instruction
-// using two such resources use the mapped unit for 4 cycles, and 2 is added
-// to the total number of uops of the sched class.
-// These three resources are used to express decoder grouping rules.
-// The number of decoder slots needed by an instructions is normally
-// one. For a cracked instruction (BeginGroup && !EndGroup) it is
-// two. Expanded instructions (BeginGroup && EndGroup) group alone.
+// These resources are used to express decoder grouping rules. The number of
+// decoder slots needed by an instruction is normally one, but there are
+// exceptions.
+def NormalGr : SchedWrite;
+def Cracked : SchedWrite;
def GroupAlone : SchedWrite;
def BeginGroup : SchedWrite;
def EndGroup : SchedWrite;
-// Latencies, to make code a bit neater. If more than one resource is
-// used for an instruction, the greatest latency (not the sum) will be
-// output by Tablegen. Therefore, in such cases one of these resources
-// is needed.
-def Lat2 : SchedWrite;
-def Lat3 : SchedWrite;
-def Lat4 : SchedWrite;
-def Lat5 : SchedWrite;
-def Lat6 : SchedWrite;
-def Lat7 : SchedWrite;
-def Lat8 : SchedWrite;
-def Lat9 : SchedWrite;
-def Lat10 : SchedWrite;
-def Lat11 : SchedWrite;
-def Lat12 : SchedWrite;
-def Lat15 : SchedWrite;
-def Lat20 : SchedWrite;
-def Lat30 : SchedWrite;
+// A SchedWrite added to other SchedWrites to make LSU latency parameterizable.
+def LSULatency : SchedWrite;
-// Fixed-point
-def FXa : SchedWrite;
-def FXa2 : SchedWrite;
-def FXb : SchedWrite;
-def FXU : SchedWrite;
+// Operand WriteLatencies.
+foreach L = 1 - 30 in def "WLat"#L : SchedWrite;
-// Load/store unit
-def LSU : SchedWrite;
+foreach L = 1 - 16 in
+ def "WLat"#L#"LSU" : WriteSequence<[!cast<SchedWrite>("WLat"#L),
+ LSULatency]>;
-// Model a return without latency, otherwise if-converter will model
-// extra cost and abort (currently there is an assert that checks that
-// all instructions have at least one uop).
-def LSU_lat1 : SchedWrite;
+// ReadAdvances, used for the register operand next to a memory operand,
+// modelling that the register operand is needed later than the address
+// operands.
+def RegReadAdv : SchedRead;
-// Floating point unit (zEC12 and earlier)
-def FPU : SchedWrite;
-def FPU2 : SchedWrite;
-def DFU : SchedWrite;
-def DFU2 : SchedWrite;
+foreach Num = ["", "2", "3", "4", "5", "6"] in {
+ // Fixed-point units
+ def "FXa"#Num : SchedWrite;
+ def "FXb"#Num : SchedWrite;
+ def "FXU"#Num : SchedWrite;
+ // Load/store unit
+ def "LSU"#Num : SchedWrite;
+ // Vector sub units (z13 and later)
+ def "VecBF"#Num : SchedWrite;
+ def "VecDF"#Num : SchedWrite;
+ def "VecDFX"#Num : SchedWrite;
+ def "VecMul"#Num : SchedWrite;
+ def "VecStr"#Num : SchedWrite;
+ def "VecXsPm"#Num : SchedWrite;
+ // Floating point unit (zEC12 and earlier)
+ def "FPU"#Num : SchedWrite;
+ def "DFU"#Num : SchedWrite;
+}
-// Vector sub units (z13 and later)
-def VecBF : SchedWrite;
-def VecBF2 : SchedWrite;
-def VecDF : SchedWrite;
-def VecDF2 : SchedWrite;
-def VecDFX : SchedWrite;
-def VecDFX2 : SchedWrite;
-def VecFPd : SchedWrite; // Blocking BFP div/sqrt unit.
-def VecMul : SchedWrite;
-def VecStr : SchedWrite;
-def VecXsPm : SchedWrite;
+def VecFPd : SchedWrite; // Blocking BFP div/sqrt unit.
-// Virtual branching unit
-def VBU : SchedWrite;
+def VBU : SchedWrite; // Virtual branching unit
+def MCD : SchedWrite; // Millicode
include "SystemZScheduleZ14.td"
include "SystemZScheduleZ13.td"
include "SystemZScheduleZEC12.td"
include "SystemZScheduleZ196.td"
-
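
The foreach loops above replace the fixed Lat2..Lat30 writes with a generated family of per-operand latencies. If it helps to see the names they expand to, this small stand-alone sketch prints the same set (pure name generation, no TableGen semantics implied):

    #include <cstdio>

    // WLat1..WLat30 model plain operand latencies; WLat1LSU..WLat16LSU are
    // WriteSequences of WLat<L> followed by LSULatency, used for results that
    // additionally pay the load/store-unit latency.
    int main() {
      for (int L = 1; L <= 30; ++L)
        std::printf("WLat%d\n", L);
      for (int L = 1; L <= 16; ++L)
        std::printf("WLat%dLSU = WLat%d + LSULatency\n", L, L);
      return 0;
    }
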
diff --git a/lib/Target/SystemZ/SystemZScheduleZ13.td b/lib/Target/SystemZ/SystemZScheduleZ13.td
index 72543c1eaee2..5d32232107af 100644
--- a/lib/Target/SystemZ/SystemZScheduleZ13.td
+++ b/lib/Target/SystemZ/SystemZScheduleZ13.td
@@ -10,13 +10,15 @@
// This file defines the machine model for Z13 to support instruction
// scheduling and other instruction cost heuristics.
//
+// Pseudos expanded right after isel do not need to be modelled here.
+//
//===----------------------------------------------------------------------===//
def Z13Model : SchedMachineModel {
let UnsupportedFeatures = Arch11UnsupportedFeatures.List;
- let IssueWidth = 8;
+ let IssueWidth = 6; // Number of instructions decoded per cycle.
let MicroOpBufferSize = 60; // Issue queues
let LoadLatency = 1; // Optimistic load latency.
@@ -27,37 +29,39 @@ def Z13Model : SchedMachineModel {
}
let SchedModel = Z13Model in {
-
-// These definitions could be put in a subtarget common include file,
-// but it seems the include system in Tablegen currently rejects
-// multiple includes of same file.
-def : WriteRes<GroupAlone, []> {
- let NumMicroOps = 0;
- let BeginGroup = 1;
- let EndGroup = 1;
+// These definitions need the SchedModel value. They could be put in a
+// subtarget common include file, but it seems the include system in Tablegen
+// currently (2016) rejects multiple includes of the same file.
+
+// Decoder grouping rules
+let NumMicroOps = 1 in {
+ def : WriteRes<NormalGr, []>;
+ def : WriteRes<BeginGroup, []> { let BeginGroup = 1; }
+ def : WriteRes<EndGroup, []> { let EndGroup = 1; }
}
-def : WriteRes<BeginGroup, []> {
- let NumMicroOps = 0;
+def : WriteRes<Cracked, []> {
+ let NumMicroOps = 2;
let BeginGroup = 1;
}
-def : WriteRes<EndGroup, []> {
- let NumMicroOps = 0;
+def : WriteRes<GroupAlone, []> {
+ let NumMicroOps = 3;
+ let BeginGroup = 1;
let EndGroup = 1;
}
-def : WriteRes<Lat2, []> { let Latency = 2; let NumMicroOps = 0;}
-def : WriteRes<Lat3, []> { let Latency = 3; let NumMicroOps = 0;}
-def : WriteRes<Lat4, []> { let Latency = 4; let NumMicroOps = 0;}
-def : WriteRes<Lat5, []> { let Latency = 5; let NumMicroOps = 0;}
-def : WriteRes<Lat6, []> { let Latency = 6; let NumMicroOps = 0;}
-def : WriteRes<Lat7, []> { let Latency = 7; let NumMicroOps = 0;}
-def : WriteRes<Lat8, []> { let Latency = 8; let NumMicroOps = 0;}
-def : WriteRes<Lat9, []> { let Latency = 9; let NumMicroOps = 0;}
-def : WriteRes<Lat10, []> { let Latency = 10; let NumMicroOps = 0;}
-def : WriteRes<Lat11, []> { let Latency = 11; let NumMicroOps = 0;}
-def : WriteRes<Lat12, []> { let Latency = 12; let NumMicroOps = 0;}
-def : WriteRes<Lat15, []> { let Latency = 15; let NumMicroOps = 0;}
-def : WriteRes<Lat20, []> { let Latency = 20; let NumMicroOps = 0;}
-def : WriteRes<Lat30, []> { let Latency = 30; let NumMicroOps = 0;}
+
+// Incoming latency removed from the register operand which is used together
+// with a memory operand by the instruction.
+def : ReadAdvance<RegReadAdv, 4>;
+
+// LoadLatency (above) is not used for instructions in this file. This is
+// instead the role of LSULatency, which is the latency value added to the
+// result of loads and instructions with folded memory operands.
+def : WriteRes<LSULatency, []> { let Latency = 4; let NumMicroOps = 0; }
+
+let NumMicroOps = 0 in {
+ foreach L = 1-30 in
+ def : WriteRes<!cast<SchedWrite>("WLat"#L), []> { let Latency = L; }
+}
// Execution units.
def Z13_FXaUnit : ProcResource<2>;
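The WLat1-WLat30 classes referenced by the instruction mappings below are generated by the foreach above; the WLat*LSU variants used for instructions with a folded memory operand are presumably composed elsewhere in the common SystemZSchedule.td by chaining a fixed latency with LSULatency. A plausible sketch of that composition (assumed, not shown in this diff):

// Assumed sketch: total latency = fixed ALU latency + LSU load latency.
foreach L = 1-16 in
  def "WLat"#L#"LSU" : WriteSequence<[!cast<SchedWrite>("WLat"#L), LSULatency]>;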
@@ -66,33 +70,39 @@ def Z13_LSUnit : ProcResource<2>;
def Z13_VecUnit : ProcResource<2>;
def Z13_VecFPdUnit : ProcResource<2> { let BufferSize = 1; /* blocking */ }
def Z13_VBUnit : ProcResource<2>;
+def Z13_MCD : ProcResource<1>;
// Subtarget specific definitions of scheduling resources.
-def : WriteRes<FXa, [Z13_FXaUnit]> { let Latency = 1; }
-def : WriteRes<FXa2, [Z13_FXaUnit, Z13_FXaUnit]> { let Latency = 2; }
-def : WriteRes<FXb, [Z13_FXbUnit]> { let Latency = 1; }
-def : WriteRes<LSU, [Z13_LSUnit]> { let Latency = 4; }
-def : WriteRes<VecBF, [Z13_VecUnit]> { let Latency = 8; }
-def : WriteRes<VecBF2, [Z13_VecUnit, Z13_VecUnit]> { let Latency = 9; }
-def : WriteRes<VecDF, [Z13_VecUnit]> { let Latency = 8; }
-def : WriteRes<VecDF2, [Z13_VecUnit, Z13_VecUnit]> { let Latency = 9; }
-def : WriteRes<VecDFX, [Z13_VecUnit]> { let Latency = 1; }
-def : WriteRes<VecDFX2, [Z13_VecUnit, Z13_VecUnit]> { let Latency = 2; }
-def : WriteRes<VecFPd, [Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
- Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
- Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
- Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
- Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
- Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
- Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
- Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
- Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
- Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit]>
- { let Latency = 30; }
-def : WriteRes<VecMul, [Z13_VecUnit]> { let Latency = 5; }
-def : WriteRes<VecStr, [Z13_VecUnit]> { let Latency = 4; }
-def : WriteRes<VecXsPm, [Z13_VecUnit]> { let Latency = 3; }
-def : WriteRes<VBU, [Z13_VBUnit]>; // Virtual Branching Unit
+let NumMicroOps = 0 in {
+ def : WriteRes<FXa, [Z13_FXaUnit]>;
+ def : WriteRes<FXb, [Z13_FXbUnit]>;
+ def : WriteRes<LSU, [Z13_LSUnit]>;
+ def : WriteRes<VecBF, [Z13_VecUnit]>;
+ def : WriteRes<VecDF, [Z13_VecUnit]>;
+ def : WriteRes<VecDFX, [Z13_VecUnit]>;
+ def : WriteRes<VecMul, [Z13_VecUnit]>;
+ def : WriteRes<VecStr, [Z13_VecUnit]>;
+ def : WriteRes<VecXsPm, [Z13_VecUnit]>;
+ foreach Num = 2-5 in { let ResourceCycles = [Num] in {
+ def : WriteRes<!cast<SchedWrite>("FXa"#Num), [Z13_FXaUnit]>;
+ def : WriteRes<!cast<SchedWrite>("FXb"#Num), [Z13_FXbUnit]>;
+ def : WriteRes<!cast<SchedWrite>("LSU"#Num), [Z13_LSUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecBF"#Num), [Z13_VecUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecDF"#Num), [Z13_VecUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecDFX"#Num), [Z13_VecUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecMul"#Num), [Z13_VecUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecStr"#Num), [Z13_VecUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecXsPm"#Num), [Z13_VecUnit]>;
+ }}
+
+ def : WriteRes<VecFPd, [Z13_VecFPdUnit]> { let ResourceCycles = [30]; }
+
+ def : WriteRes<VBU, [Z13_VBUnit]>; // Virtual Branching Unit
+}
+
+def : WriteRes<MCD, [Z13_MCD]> { let NumMicroOps = 3;
+ let BeginGroup = 1;
+ let EndGroup = 1; }
// -------------------------- INSTRUCTIONS ---------------------------------- //
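In the instruction mappings that follow, each InstRW operand list is read roughly left to right: the leading WLat* entries give the latency of each register result (instructions that also set CC typically carry a second one), a RegReadAdv entry marks a register use that is needed later than the address operands, the unit writes (FXa, FXb, LSU, Vec*) consume the processor resources defined above, and the final entry (NormalGr, Cracked, GroupAlone, EndGroup or MCD) selects the decoder-grouping behaviour. A hypothetical mapping, not taken from this patch, would read:

// Illustration only: "FOOBAR" is a made-up reg+memory instruction.
// One result with ALU-plus-load latency, the register source read late,
// one FXa pipe and one LSU slot used, decoded as a normal single group.
def : InstRW<[WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
             (instregex "FOOBAR$")>;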
@@ -106,26 +116,27 @@ def : WriteRes<VBU, [Z13_VBUnit]>; // Virtual Branching Unit
// Stack allocation
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa], (instregex "ADJDYNALLOC$")>; // Pseudo -> LA / LAY
+// Pseudo -> LA / LAY
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ADJDYNALLOC$")>;
//===----------------------------------------------------------------------===//
// Branch instructions
//===----------------------------------------------------------------------===//
// Branch
-def : InstRW<[VBU], (instregex "(Call)?BRC(L)?(Asm.*)?$")>;
-def : InstRW<[VBU], (instregex "(Call)?J(G)?(Asm.*)?$")>;
-def : InstRW<[FXb], (instregex "(Call)?BC(R)?(Asm.*)?$")>;
-def : InstRW<[FXb], (instregex "(Call)?B(R)?(Asm.*)?$")>;
-def : InstRW<[FXa, EndGroup], (instregex "BRCT(G)?$")>;
-def : InstRW<[FXb, FXa, Lat2, GroupAlone], (instregex "BRCTH$")>;
-def : InstRW<[FXb, FXa, Lat2, GroupAlone], (instregex "BCT(G)?(R)?$")>;
-def : InstRW<[FXa, FXa, FXb, FXb, Lat4, GroupAlone],
+def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Call)?BRC(L)?(Asm.*)?$")>;
+def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Call)?J(G)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "(Call)?BC(R)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "(Call)?B(R)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXa, EndGroup], (instregex "BRCT(G)?$")>;
+def : InstRW<[WLat1, FXa, FXb, GroupAlone], (instregex "BRCTH$")>;
+def : InstRW<[WLat1, FXa, FXb, GroupAlone], (instregex "BCT(G)?(R)?$")>;
+def : InstRW<[WLat1, FXa2, FXb2, GroupAlone],
(instregex "B(R)?X(H|L).*$")>;
// Compare and branch
-def : InstRW<[FXb], (instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>;
-def : InstRW<[FXb, FXb, Lat2, GroupAlone],
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb2, GroupAlone],
(instregex "C(L)?(G)?(I|R)B(Call|Return|Asm.*)?$")>;
//===----------------------------------------------------------------------===//
@@ -133,593 +144,609 @@ def : InstRW<[FXb, FXb, Lat2, GroupAlone],
//===----------------------------------------------------------------------===//
// Trap
-def : InstRW<[VBU], (instregex "(Cond)?Trap$")>;
+def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Cond)?Trap$")>;
// Compare and trap
-def : InstRW<[FXb], (instregex "C(G)?(I|R)T(Asm.*)?$")>;
-def : InstRW<[FXb], (instregex "CL(G)?RT(Asm.*)?$")>;
-def : InstRW<[FXb], (instregex "CL(F|G)IT(Asm.*)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CL(G)?T(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(G)?(I|R)T(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CL(G)?RT(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CL(F|G)IT(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "CL(G)?T(Asm.*)?$")>;
//===----------------------------------------------------------------------===//
// Call and return instructions
//===----------------------------------------------------------------------===//
// Call
-def : InstRW<[VBU, FXa, FXa, Lat3, GroupAlone], (instregex "(Call)?BRAS$")>;
-def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "(Call)?BRASL$")>;
-def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "(Call)?BAS(R)?$")>;
-def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
+def : InstRW<[WLat1, VBU, FXa2, GroupAlone], (instregex "(Call)?BRAS$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BRASL$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BAS(R)?$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
// Return
-def : InstRW<[FXb, EndGroup], (instregex "Return$")>;
-def : InstRW<[FXb], (instregex "CondReturn$")>;
-
-//===----------------------------------------------------------------------===//
-// Select instructions
-//===----------------------------------------------------------------------===//
-
-// Select pseudo
-def : InstRW<[FXa], (instregex "Select(32|64|32Mux)$")>;
-
-// CondStore pseudos
-def : InstRW<[FXa], (instregex "CondStore16(Inv)?$")>;
-def : InstRW<[FXa], (instregex "CondStore16Mux(Inv)?$")>;
-def : InstRW<[FXa], (instregex "CondStore32(Inv)?$")>;
-def : InstRW<[FXa], (instregex "CondStore32Mux(Inv)?$")>;
-def : InstRW<[FXa], (instregex "CondStore64(Inv)?$")>;
-def : InstRW<[FXa], (instregex "CondStore8(Inv)?$")>;
-def : InstRW<[FXa], (instregex "CondStore8Mux(Inv)?$")>;
+def : InstRW<[WLat1, FXb, EndGroup], (instregex "Return$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CondReturn$")>;
//===----------------------------------------------------------------------===//
// Move instructions
//===----------------------------------------------------------------------===//
// Moves
-def : InstRW<[FXb, LSU, Lat5], (instregex "MV(G|H)?HI$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "MVI(Y)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "MV(G|H)?HI$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "MVI(Y)?$")>;
// Move character
-def : InstRW<[FXb, LSU, LSU, LSU, Lat8, GroupAlone], (instregex "MVC$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>;
+def : InstRW<[WLat1, FXb, LSU3, GroupAlone], (instregex "MVC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "MVCL(E|U)?$")>;
// Pseudo -> reg move
-def : InstRW<[FXa], (instregex "COPY(_TO_REGCLASS)?$")>;
-def : InstRW<[FXa], (instregex "EXTRACT_SUBREG$")>;
-def : InstRW<[FXa], (instregex "INSERT_SUBREG$")>;
-def : InstRW<[FXa], (instregex "REG_SEQUENCE$")>;
-def : InstRW<[FXa], (instregex "SUBREG_TO_REG$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "COPY(_TO_REGCLASS)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "EXTRACT_SUBREG$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "INSERT_SUBREG$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "REG_SEQUENCE$")>;
// Loads
-def : InstRW<[LSU], (instregex "L(Y|FH|RL|Mux|CBB)?$")>;
-def : InstRW<[LSU], (instregex "LG(RL)?$")>;
-def : InstRW<[LSU], (instregex "L128$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L(Y|FH|RL|Mux)?$")>;
+def : InstRW<[LSULatency, LSULatency, LSU, NormalGr], (instregex "LCBB$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LG(RL)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L128$")>;
-def : InstRW<[FXa], (instregex "LLIH(F|H|L)$")>;
-def : InstRW<[FXa], (instregex "LLIL(F|H|L)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLIH(F|H|L)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLIL(F|H|L)$")>;
-def : InstRW<[FXa], (instregex "LG(F|H)I$")>;
-def : InstRW<[FXa], (instregex "LHI(Mux)?$")>;
-def : InstRW<[FXa], (instregex "LR(Mux)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LG(F|H)I$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LHI(Mux)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LR(Mux)?$")>;
// Load and zero rightmost byte
-def : InstRW<[LSU], (instregex "LZR(F|G)$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LZR(F|G)$")>;
// Load and trap
-def : InstRW<[FXb, LSU, Lat5], (instregex "L(FH|G)?AT$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "L(FH|G)?AT$")>;
// Load and test
-def : InstRW<[FXa, LSU, Lat5], (instregex "LT(G)?$")>;
-def : InstRW<[FXa], (instregex "LT(G)?R$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, LSU, FXa, NormalGr], (instregex "LT(G)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LT(G)?R$")>;
// Stores
-def : InstRW<[FXb, LSU, Lat5], (instregex "STG(RL)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "ST128$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "ST(Y|FH|RL|Mux)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STG(RL)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST128$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(Y|FH|RL|Mux)?$")>;
// String moves.
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVST$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "MVST$")>;
//===----------------------------------------------------------------------===//
// Conditional move instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, Lat2], (instregex "LOCRMux$")>;
-def : InstRW<[FXa, Lat2], (instregex "LOC(G|FH)?R(Asm.*)?$")>;
-def : InstRW<[FXa, Lat2], (instregex "LOC(G|H)?HI(Mux|(Asm.*))?$")>;
-def : InstRW<[FXa, LSU, Lat6], (instregex "LOC(G|FH|Mux)?(Asm.*)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "STOC(G|FH|Mux)?(Asm.*)?$")>;
+def : InstRW<[WLat2, FXa, NormalGr], (instregex "LOCRMux$")>;
+def : InstRW<[WLat2, FXa, NormalGr], (instregex "LOC(G|FH)?R(Asm.*)?$")>;
+def : InstRW<[WLat2, FXa, NormalGr], (instregex "LOC(G|H)?HI(Mux|(Asm.*))?$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "LOC(G|FH|Mux)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr],
+ (instregex "STOC(G|FH|Mux)?(Asm.*)?$")>;
//===----------------------------------------------------------------------===//
// Sign extensions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa], (instregex "L(B|H|G)R$")>;
-def : InstRW<[FXa], (instregex "LG(B|H|F)R$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "L(B|H|G)R$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LG(B|H|F)R$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LTGF$")>;
-def : InstRW<[FXa], (instregex "LTGFR$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, FXa, LSU, NormalGr], (instregex "LTGF$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LTGFR$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LB(H|Mux)?$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LH(Y)?$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LH(H|Mux|RL)$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LG(B|H|F)$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LG(H|F)RL$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LB(H|Mux)?$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LH(Y)?$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LH(H|Mux|RL)$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LG(B|H|F)$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LG(H|F)RL$")>;
//===----------------------------------------------------------------------===//
// Zero extensions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa], (instregex "LLCR(Mux)?$")>;
-def : InstRW<[FXa], (instregex "LLHR(Mux)?$")>;
-def : InstRW<[FXa], (instregex "LLG(C|H|F|T)R$")>;
-def : InstRW<[LSU], (instregex "LLC(Mux)?$")>;
-def : InstRW<[LSU], (instregex "LLH(Mux)?$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LL(C|H)H$")>;
-def : InstRW<[LSU], (instregex "LLHRL$")>;
-def : InstRW<[LSU], (instregex "LLG(C|H|F|T|HRL|FRL)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLCR(Mux)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLHR(Mux)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLG(C|H|F|T)R$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLC(Mux)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLH(Mux)?$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LL(C|H)H$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLHRL$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLG(C|H|F|T|HRL|FRL)$")>;
// Load and zero rightmost byte
-def : InstRW<[LSU], (instregex "LLZRGF$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLZRGF$")>;
// Load and trap
-def : InstRW<[FXb, LSU, Lat5], (instregex "LLG(F|T)?AT$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "LLG(F|T)?AT$")>;
//===----------------------------------------------------------------------===//
// Truncations
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "STCM(H|Y)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STC(H|Y|Mux)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STH(H|Y|RL|Mux)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STCM(H|Y)?$")>;
//===----------------------------------------------------------------------===//
// Multi-register moves
//===----------------------------------------------------------------------===//
// Load multiple (estimated average of 5 ops)
-def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
- (instregex "LM(H|Y|G)?$")>;
+def : InstRW<[WLat10, WLat10, LSU5, GroupAlone], (instregex "LM(H|Y|G)?$")>;
// Load multiple disjoint
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "LMD$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "LMD$")>;
-// Store multiple (estimated average of ceil(5/2) FXb ops)
-def : InstRW<[LSU, LSU, FXb, FXb, FXb, Lat10,
- GroupAlone], (instregex "STM(G|H|Y)?$")>;
+// Store multiple
+def : InstRW<[WLat1, LSU2, FXb3, GroupAlone], (instregex "STM(G|H|Y)?$")>;
//===----------------------------------------------------------------------===//
// Byte swaps
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa], (instregex "LRV(G)?R$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LRV(G|H)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "STRV(G|H)?$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LRV(G)?R$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LRV(G|H)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STRV(G|H)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVCIN$")>;
//===----------------------------------------------------------------------===//
// Load address instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa], (instregex "LA(Y|RL)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LA(Y|RL)?$")>;
// Load the Global Offset Table address ( -> larl )
-def : InstRW<[FXa], (instregex "GOT$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "GOT$")>;
//===----------------------------------------------------------------------===//
// Absolute and Negation
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, Lat2], (instregex "LP(G)?R$")>;
-def : InstRW<[FXa, FXa, Lat3, BeginGroup], (instregex "L(N|P)GFR$")>;
-def : InstRW<[FXa, Lat2], (instregex "LN(R|GR)$")>;
-def : InstRW<[FXa], (instregex "LC(R|GR)$")>;
-def : InstRW<[FXa, FXa, Lat2, BeginGroup], (instregex "LCGFR$")>;
+def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "LP(G)?R$")>;
+def : InstRW<[WLat3, WLat3, FXa2, Cracked], (instregex "L(N|P)GFR$")>;
+def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "LN(R|GR)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LC(R|GR)$")>;
+def : InstRW<[WLat2, WLat2, FXa2, Cracked], (instregex "LCGFR$")>;
//===----------------------------------------------------------------------===//
// Insertion
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat5], (instregex "IC(Y)?$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "IC32(Y)?$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "ICM(H|Y)?$")>;
-def : InstRW<[FXa], (instregex "II(F|H|L)Mux$")>;
-def : InstRW<[FXa], (instregex "IIHF(64)?$")>;
-def : InstRW<[FXa], (instregex "IIHH(64)?$")>;
-def : InstRW<[FXa], (instregex "IIHL(64)?$")>;
-def : InstRW<[FXa], (instregex "IILF(64)?$")>;
-def : InstRW<[FXa], (instregex "IILH(64)?$")>;
-def : InstRW<[FXa], (instregex "IILL(64)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], (instregex "IC(Y)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "IC32(Y)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, WLat1LSU, FXa, LSU, NormalGr],
+ (instregex "ICM(H|Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "II(F|H|L)Mux$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IIHF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IIHH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IIHL(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IILF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IILH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IILL(64)?$")>;
//===----------------------------------------------------------------------===//
// Addition
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat5], (instregex "A(Y)?$")>;
-def : InstRW<[FXa, LSU, Lat6], (instregex "AH(Y)?$")>;
-def : InstRW<[FXa], (instregex "AIH$")>;
-def : InstRW<[FXa], (instregex "AFI(Mux)?$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "AG$")>;
-def : InstRW<[FXa], (instregex "AGFI$")>;
-def : InstRW<[FXa], (instregex "AGHI(K)?$")>;
-def : InstRW<[FXa], (instregex "AGR(K)?$")>;
-def : InstRW<[FXa], (instregex "AHI(K)?$")>;
-def : InstRW<[FXa], (instregex "AHIMux(K)?$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "AL(Y)?$")>;
-def : InstRW<[FXa], (instregex "AL(FI|HSIK)$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "ALG(F)?$")>;
-def : InstRW<[FXa], (instregex "ALGHSIK$")>;
-def : InstRW<[FXa], (instregex "ALGF(I|R)$")>;
-def : InstRW<[FXa], (instregex "ALGR(K)?$")>;
-def : InstRW<[FXa], (instregex "ALR(K)?$")>;
-def : InstRW<[FXa], (instregex "AR(K)?$")>;
-def : InstRW<[FXa], (instregex "A(L)?HHHR$")>;
-def : InstRW<[FXa, Lat2], (instregex "A(L)?HHLR$")>;
-def : InstRW<[FXa], (instregex "ALSIH(N)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "A(L)?(G)?SI$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "A(Y)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "AH(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AIH$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AFI(Mux)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "AG$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AGFI$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AGHI(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AGR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AHI(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AHIMux(K)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "AL(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AL(FI|HSIK)$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "ALG(F)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALGHSIK$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALGF(I|R)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALGR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "A(L)?HHHR$")>;
+def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "A(L)?HHLR$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALSIH(N)?$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "A(L)?(G)?SI$")>;
// Logical addition with carry
-def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "ALC(G)?$")>;
-def : InstRW<[FXa, Lat2, GroupAlone], (instregex "ALC(G)?R$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, GroupAlone],
+ (instregex "ALC(G)?$")>;
+def : InstRW<[WLat2, WLat2, FXa, GroupAlone], (instregex "ALC(G)?R$")>;
// Add with sign extension (32 -> 64)
-def : InstRW<[FXa, LSU, Lat6], (instregex "AGF$")>;
-def : InstRW<[FXa, Lat2], (instregex "AGFR$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "AGF$")>;
+def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "AGFR$")>;
//===----------------------------------------------------------------------===//
// Subtraction
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat5], (instregex "S(G|Y)?$")>;
-def : InstRW<[FXa, LSU, Lat6], (instregex "SH(Y)?$")>;
-def : InstRW<[FXa], (instregex "SGR(K)?$")>;
-def : InstRW<[FXa], (instregex "SLFI$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "SL(G|GF|Y)?$")>;
-def : InstRW<[FXa], (instregex "SLGF(I|R)$")>;
-def : InstRW<[FXa], (instregex "SLGR(K)?$")>;
-def : InstRW<[FXa], (instregex "SLR(K)?$")>;
-def : InstRW<[FXa], (instregex "SR(K)?$")>;
-def : InstRW<[FXa], (instregex "S(L)?HHHR$")>;
-def : InstRW<[FXa, Lat2], (instregex "S(L)?HHLR$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "S(G|Y)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "SH(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SGR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLFI$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "SL(G|GF|Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLGF(I|R)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLGR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "S(L)?HHHR$")>;
+def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "S(L)?HHLR$")>;
// Subtraction with borrow
-def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "SLB(G)?$")>;
-def : InstRW<[FXa, Lat2, GroupAlone], (instregex "SLB(G)?R$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, GroupAlone],
+ (instregex "SLB(G)?$")>;
+def : InstRW<[WLat2, WLat2, FXa, GroupAlone], (instregex "SLB(G)?R$")>;
// Subtraction with sign extension (32 -> 64)
-def : InstRW<[FXa, LSU, Lat6], (instregex "SGF$")>;
-def : InstRW<[FXa, Lat2], (instregex "SGFR$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "SGF$")>;
+def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "SGFR$")>;
//===----------------------------------------------------------------------===//
// AND
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat5], (instregex "N(G|Y)?$")>;
-def : InstRW<[FXa], (instregex "NGR(K)?$")>;
-def : InstRW<[FXa], (instregex "NI(FMux|HMux|LMux)$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "NI(Y)?$")>;
-def : InstRW<[FXa], (instregex "NIHF(64)?$")>;
-def : InstRW<[FXa], (instregex "NIHH(64)?$")>;
-def : InstRW<[FXa], (instregex "NIHL(64)?$")>;
-def : InstRW<[FXa], (instregex "NILF(64)?$")>;
-def : InstRW<[FXa], (instregex "NILH(64)?$")>;
-def : InstRW<[FXa], (instregex "NILL(64)?$")>;
-def : InstRW<[FXa], (instregex "NR(K)?$")>;
-def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "NC$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "N(G|Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NGR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NI(FMux|HMux|LMux)$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "NI(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NIHF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NIHH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NIHL(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NILF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NILH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NILL(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NR(K)?$")>;
+def : InstRW<[WLat3LSU, LSU2, FXb, Cracked], (instregex "NC$")>;
//===----------------------------------------------------------------------===//
// OR
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat5], (instregex "O(G|Y)?$")>;
-def : InstRW<[FXa], (instregex "OGR(K)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "OI(Y)?$")>;
-def : InstRW<[FXa], (instregex "OI(FMux|HMux|LMux)$")>;
-def : InstRW<[FXa], (instregex "OIHF(64)?$")>;
-def : InstRW<[FXa], (instregex "OIHH(64)?$")>;
-def : InstRW<[FXa], (instregex "OIHL(64)?$")>;
-def : InstRW<[FXa], (instregex "OILF(64)?$")>;
-def : InstRW<[FXa], (instregex "OILH(64)?$")>;
-def : InstRW<[FXa], (instregex "OILL(64)?$")>;
-def : InstRW<[FXa], (instregex "OR(K)?$")>;
-def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "OC$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "O(G|Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OGR(K)?$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "OI(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OI(FMux|HMux|LMux)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OIHF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OIHH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OIHL(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OILF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OILH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OILL(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OR(K)?$")>;
+def : InstRW<[WLat3LSU, LSU2, FXb, Cracked], (instregex "OC$")>;
//===----------------------------------------------------------------------===//
// XOR
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat5], (instregex "X(G|Y)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "XI(Y)?$")>;
-def : InstRW<[FXa], (instregex "XIFMux$")>;
-def : InstRW<[FXa], (instregex "XGR(K)?$")>;
-def : InstRW<[FXa], (instregex "XIHF(64)?$")>;
-def : InstRW<[FXa], (instregex "XILF(64)?$")>;
-def : InstRW<[FXa], (instregex "XR(K)?$")>;
-def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "XC$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "X(G|Y)?$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "XI(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "XIFMux$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "XGR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "XIHF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "XILF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "XR(K)?$")>;
+def : InstRW<[WLat3LSU, LSU2, FXb, Cracked], (instregex "XC$")>;
//===----------------------------------------------------------------------===//
// Multiplication
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat10], (instregex "MS(GF|Y)?$")>;
-def : InstRW<[FXa, Lat6], (instregex "MS(R|FI)$")>;
-def : InstRW<[FXa, LSU, Lat12], (instregex "MSG$")>;
-def : InstRW<[FXa, Lat8], (instregex "MSGR$")>;
-def : InstRW<[FXa, Lat6], (instregex "MSGF(I|R)$")>;
-def : InstRW<[FXa2, LSU, Lat15, GroupAlone], (instregex "MLG$")>;
-def : InstRW<[FXa2, Lat9, GroupAlone], (instregex "MLGR$")>;
-def : InstRW<[FXa, Lat5], (instregex "MGHI$")>;
-def : InstRW<[FXa, Lat5], (instregex "MHI$")>;
-def : InstRW<[FXa, LSU, Lat9], (instregex "MH(Y)?$")>;
-def : InstRW<[FXa2, Lat7, GroupAlone], (instregex "M(L)?R$")>;
-def : InstRW<[FXa2, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>;
+def : InstRW<[WLat6LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "MS(GF|Y)?$")>;
+def : InstRW<[WLat6, FXa, NormalGr], (instregex "MS(R|FI)$")>;
+def : InstRW<[WLat8LSU, RegReadAdv, FXa, LSU, NormalGr], (instregex "MSG$")>;
+def : InstRW<[WLat8, FXa, NormalGr], (instregex "MSGR$")>;
+def : InstRW<[WLat6, FXa, NormalGr], (instregex "MSGF(I|R)$")>;
+def : InstRW<[WLat11LSU, RegReadAdv, FXa2, LSU, GroupAlone],
+ (instregex "MLG$")>;
+def : InstRW<[WLat9, FXa2, GroupAlone], (instregex "MLGR$")>;
+def : InstRW<[WLat5, FXa, NormalGr], (instregex "MGHI$")>;
+def : InstRW<[WLat5, FXa, NormalGr], (instregex "MHI$")>;
+def : InstRW<[WLat5LSU, RegReadAdv, FXa, LSU, NormalGr], (instregex "MH(Y)?$")>;
+def : InstRW<[WLat7, FXa2, GroupAlone], (instregex "M(L)?R$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, FXa2, LSU, GroupAlone],
+ (instregex "M(FY|L)?$")>;
//===----------------------------------------------------------------------===//
// Division and remainder
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DR$")>;
-def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "D$")>;
-def : InstRW<[FXa2, Lat30, GroupAlone], (instregex "DSG(F)?R$")>;
-def : InstRW<[LSU, FXa2, Lat30, GroupAlone], (instregex "DSG(F)?$")>;
-def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DLR$")>;
-def : InstRW<[FXa2, FXa2, Lat30, GroupAlone], (instregex "DLGR$")>;
-def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "DL(G)?$")>;
+def : InstRW<[WLat20, FXa4, GroupAlone], (instregex "DR$")>;
+def : InstRW<[WLat30, RegReadAdv, FXa4, LSU, GroupAlone], (instregex "D$")>;
+def : InstRW<[WLat30, FXa2, GroupAlone], (instregex "DSG(F)?R$")>;
+def : InstRW<[WLat30, RegReadAdv, FXa2, LSU, GroupAlone],
+ (instregex "DSG(F)?$")>;
+def : InstRW<[WLat20, FXa4, GroupAlone], (instregex "DLR$")>;
+def : InstRW<[WLat30, FXa4, GroupAlone], (instregex "DLGR$")>;
+def : InstRW<[WLat30, RegReadAdv, FXa4, LSU, GroupAlone], (instregex "DL(G)?$")>;
//===----------------------------------------------------------------------===//
// Shifts
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa], (instregex "SLL(G|K)?$")>;
-def : InstRW<[FXa], (instregex "SRL(G|K)?$")>;
-def : InstRW<[FXa], (instregex "SRA(G|K)?$")>;
-def : InstRW<[FXa], (instregex "SLA(G|K)?$")>;
-def : InstRW<[FXa, FXa, FXa, FXa, LSU, Lat8, GroupAlone],
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLL(G|K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SRL(G|K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SRA(G|K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLA(G|K)?$")>;
+def : InstRW<[WLat5LSU, WLat5LSU, FXa4, LSU, GroupAlone],
(instregex "S(L|R)D(A|L)$")>;
// Rotate
-def : InstRW<[FXa, LSU, Lat6], (instregex "RLL(G)?$")>;
+def : InstRW<[WLat2LSU, FXa, LSU, NormalGr], (instregex "RLL(G)?$")>;
// Rotate and insert
-def : InstRW<[FXa], (instregex "RISBG(N|32)?$")>;
-def : InstRW<[FXa], (instregex "RISBH(G|H|L)$")>;
-def : InstRW<[FXa], (instregex "RISBL(G|H|L)$")>;
-def : InstRW<[FXa], (instregex "RISBMux$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBG(N|32)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBH(G|H|L)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBL(G|H|L)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBMux$")>;
// Rotate and Select
-def : InstRW<[FXa, FXa, Lat3, BeginGroup], (instregex "R(N|O|X)SBG$")>;
+def : InstRW<[WLat3, WLat3, FXa2, Cracked], (instregex "R(N|O|X)SBG$")>;
//===----------------------------------------------------------------------===//
// Comparison
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat5], (instregex "C(G|Y|Mux|RL)?$")>;
-def : InstRW<[FXb], (instregex "C(F|H)I(Mux)?$")>;
-def : InstRW<[FXb], (instregex "CG(F|H)I$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CG(HSI|RL)$")>;
-def : InstRW<[FXb], (instregex "C(G)?R$")>;
-def : InstRW<[FXb], (instregex "CIH$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CH(F|SI)$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CL(Y|Mux|FHSI)?$")>;
-def : InstRW<[FXb], (instregex "CLFI(Mux)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CLG(HRL|HSI)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CLGF(RL)?$")>;
-def : InstRW<[FXb], (instregex "CLGF(I|R)$")>;
-def : InstRW<[FXb], (instregex "CLGR$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CLGRL$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CLH(F|RL|HSI)$")>;
-def : InstRW<[FXb], (instregex "CLIH$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CLI(Y)?$")>;
-def : InstRW<[FXb], (instregex "CLR$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CLRL$")>;
-def : InstRW<[FXb], (instregex "C(L)?HHR$")>;
-def : InstRW<[FXb, Lat2], (instregex "C(L)?HLR$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr],
+ (instregex "C(G|Y|Mux)?$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CRL$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(F|H)I(Mux)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CG(F|H)I$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CG(HSI|RL)$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(G)?R$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CIH$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CHF$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CHSI$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr],
+ (instregex "CL(Y|Mux)?$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLFHSI$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLFI(Mux)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CLG$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLG(HRL|HSI)$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CLGF$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLGFRL$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLGF(I|R)$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLGR$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLGRL$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CLHF$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLH(RL|HSI)$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLIH$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLI(Y)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLR$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLRL$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(L)?HHR$")>;
+def : InstRW<[WLat2, FXb, NormalGr], (instregex "C(L)?HLR$")>;
// Compare halfword
-def : InstRW<[FXb, LSU, Lat6], (instregex "CH(Y|RL)?$")>;
-def : InstRW<[FXb, LSU, Lat6], (instregex "CGH(RL)?$")>;
-def : InstRW<[FXa, FXb, LSU, Lat6, BeginGroup], (instregex "CHHSI$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CH(Y)?$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "CHRL$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CGH$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "CGHRL$")>;
+def : InstRW<[WLat2LSU, FXa, FXb, LSU, Cracked], (instregex "CHHSI$")>;
// Compare with sign extension (32 -> 64)
-def : InstRW<[FXb, LSU, Lat6], (instregex "CGF(RL)?$")>;
-def : InstRW<[FXb, Lat2], (instregex "CGFR$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CGF$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "CGFRL$")>;
+def : InstRW<[WLat2, FXb, NormalGr], (instregex "CGFR$")>;
// Compare logical character
-def : InstRW<[FXb, LSU, LSU, Lat9, BeginGroup], (instregex "CLC$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>;
+def : InstRW<[WLat6, FXb, LSU2, Cracked], (instregex "CLC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CLCL(E|U)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CLST$")>;
// Test under mask
-def : InstRW<[FXb, LSU, Lat5], (instregex "TM(Y)?$")>;
-def : InstRW<[FXb], (instregex "TM(H|L)Mux$")>;
-def : InstRW<[FXb], (instregex "TMHH(64)?$")>;
-def : InstRW<[FXb], (instregex "TMHL(64)?$")>;
-def : InstRW<[FXb], (instregex "TMLH(64)?$")>;
-def : InstRW<[FXb], (instregex "TMLL(64)?$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "TM(Y)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TM(H|L)Mux$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMHH(64)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMHL(64)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMLH(64)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMLL(64)?$")>;
// Compare logical characters under mask
-def : InstRW<[FXb, LSU, Lat6], (instregex "CLM(H|Y)?$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr],
+ (instregex "CLM(H|Y)?$")>;
//===----------------------------------------------------------------------===//
// Prefetch and execution hint
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU], (instregex "PFD(RL)?$")>;
-def : InstRW<[FXb, Lat2], (instregex "BPP$")>;
+def : InstRW<[WLat1, LSU, NormalGr], (instregex "PFD(RL)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "BPP$")>;
def : InstRW<[FXb, EndGroup], (instregex "BPRP$")>;
-def : InstRW<[FXb], (instregex "NIAI$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "NIAI$")>;
//===----------------------------------------------------------------------===//
// Atomic operations
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, EndGroup], (instregex "Serialize$")>;
+def : InstRW<[WLat1, FXb, EndGroup], (instregex "Serialize$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "LAA(G)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "LAAL(G)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "LAN(G)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "LAO(G)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "LAX(G)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAA(G)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAAL(G)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAN(G)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAO(G)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAX(G)?$")>;
// Test and set
-def : InstRW<[FXb, LSU, Lat5, EndGroup], (instregex "TS$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, EndGroup], (instregex "TS$")>;
// Compare and swap
-def : InstRW<[FXa, FXb, LSU, Lat6, GroupAlone], (instregex "CS(G|Y)?$")>;
+def : InstRW<[WLat3LSU, WLat3LSU, FXa, FXb, LSU, GroupAlone],
+ (instregex "CS(G|Y)?$")>;
// Compare double and swap
-def : InstRW<[FXa, FXa, FXb, FXb, FXa, LSU, Lat10, GroupAlone],
+def : InstRW<[WLat6LSU, WLat6LSU, FXa3, FXb2, LSU, GroupAlone],
(instregex "CDS(Y)?$")>;
-def : InstRW<[FXa, FXa, FXb, FXb, LSU, FXb, FXb, LSU, LSU, Lat20, GroupAlone],
+def : InstRW<[WLat15, WLat15, FXa2, FXb4, LSU3, GroupAlone],
(instregex "CDSG$")>;
// Compare and swap and store
-def : InstRW<[FXa, LSU, Lat30], (instregex "CSST$")>;
+def : InstRW<[WLat30, MCD], (instregex "CSST$")>;
// Perform locked operation
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>;
+def : InstRW<[WLat30, MCD], (instregex "PLO$")>;
// Load/store pair from/to quadword
-def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPQ$")>;
-def : InstRW<[FXb, FXb, LSU, Lat6, GroupAlone], (instregex "STPQ$")>;
+def : InstRW<[WLat4LSU, LSU2, GroupAlone], (instregex "LPQ$")>;
+def : InstRW<[WLat1, FXb2, LSU, GroupAlone], (instregex "STPQ$")>;
// Load pair disjoint
-def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, LSU2, GroupAlone], (instregex "LPD(G)?$")>;
//===----------------------------------------------------------------------===//
// Translate and convert
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TR$")>;
-def : InstRW<[FXa, FXa, FXa, LSU, LSU, Lat30, GroupAlone], (instregex "TRT$")>;
-def : InstRW<[FXa, LSU, Lat30], (instregex "TRTR$")>;
-def : InstRW<[FXa, Lat30], (instregex "TR(TR)?(T)?(E|EOpt)?$")>;
-def : InstRW<[LSU, Lat30], (instregex "TR(T|O)(T|O)(Opt)?$")>;
-def : InstRW<[FXa, Lat30], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
-def : InstRW<[FXa, Lat30], (instregex "(CUUTF|CUTFU)(Opt)?$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "TR$")>;
+def : InstRW<[WLat30, WLat30, WLat30, FXa3, LSU2, GroupAlone],
+ (instregex "TRT$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TRTR$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "TRE$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TRT(R)?E(Opt)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TR(T|O)(T|O)(Opt)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD],
+ (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "(CUUTF|CUTFU)(Opt)?$")>;
//===----------------------------------------------------------------------===//
// Message-security assist
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, Lat30], (instregex "KM(C|F|O|CTR)?$")>;
-def : InstRW<[FXa, Lat30], (instregex "(KIMD|KLMD|KMAC|PCC|PPNO)$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD],
+ (instregex "KM(C|F|O|CTR)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD],
+ (instregex "(KIMD|KLMD|KMAC|PCC|PPNO)$")>;
//===----------------------------------------------------------------------===//
// Decimal arithmetic
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, VecDF, VecDF, LSU, LSU, Lat30, GroupAlone],
+def : InstRW<[WLat30, RegReadAdv, FXb, VecDF2, LSU2, GroupAlone],
(instregex "CVBG$")>;
-def : InstRW<[FXb, VecDF, LSU, Lat30, GroupAlone], (instregex "CVB(Y)?$")>;
-def : InstRW<[FXb, FXb, FXb, VecDF2, VecDF2, LSU, Lat30, GroupAlone],
- (instregex "CVDG$")>;
-def : InstRW<[FXb, VecDF, FXb, LSU, Lat30, GroupAlone], (instregex "CVD(Y)?$")>;
-def : InstRW<[LSU, Lat10, GroupAlone], (instregex "MVO$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z)$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
-def : InstRW<[LSU, Lat12, GroupAlone], (instregex "UNPK(A|U)$")>;
-def : InstRW<[FXb, LSU, LSU, Lat9, BeginGroup], (instregex "UNPK$")>;
-
-def : InstRW<[FXb, VecDFX, LSU, LSU, LSU, Lat9, GroupAlone],
+def : InstRW<[WLat30, RegReadAdv, FXb, VecDF, LSU, GroupAlone],
+ (instregex "CVB(Y)?$")>;
+def : InstRW<[WLat1, FXb3, VecDF4, LSU, GroupAlone], (instregex "CVDG$")>;
+def : InstRW<[WLat1, FXb2, VecDF, LSU, GroupAlone], (instregex "CVD(Y)?$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "MV(N|O|Z)$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
+def : InstRW<[WLat12, LSU5, GroupAlone], (instregex "UNPK(A|U)$")>;
+def : InstRW<[WLat1, FXb, LSU2, Cracked], (instregex "UNPK$")>;
+
+def : InstRW<[WLat5LSU, FXb, VecDFX, LSU3, GroupAlone],
(instregex "(A|S|ZA)P$")>;
-def : InstRW<[FXb, VecDFX2, VecDFX2, LSU, LSU, LSU, Lat30, GroupAlone],
- (instregex "(M|D)P$")>;
-def : InstRW<[FXb, VecDFX, VecDFX, LSU, LSU, Lat15, GroupAlone],
- (instregex "SRP$")>;
-def : InstRW<[VecDFX, LSU, LSU, Lat5, GroupAlone], (instregex "CP$")>;
-def : InstRW<[VecDFX, LSU, Lat4, BeginGroup], (instregex "TP$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>;
+def : InstRW<[WLat1, FXb, VecDFX4, LSU3, GroupAlone], (instregex "(M|D)P$")>;
+def : InstRW<[WLat15, FXb, VecDFX2, LSU2, GroupAlone], (instregex "SRP$")>;
+def : InstRW<[WLat8, VecDFX, LSU, LSU, GroupAlone], (instregex "CP$")>;
+def : InstRW<[WLat3LSU, VecDFX, LSU, Cracked], (instregex "TP$")>;
+def : InstRW<[WLat30, MCD], (instregex "ED(MK)?$")>;
//===----------------------------------------------------------------------===//
// Access registers
//===----------------------------------------------------------------------===//
// Extract/set/copy access register
-def : InstRW<[LSU], (instregex "(EAR|SAR|CPYA)$")>;
+def : InstRW<[WLat3, LSU, NormalGr], (instregex "(EAR|SAR|CPYA)$")>;
// Load address extended
-def : InstRW<[LSU, FXa, Lat5, BeginGroup], (instregex "LAE(Y)?$")>;
+def : InstRW<[WLat5, LSU, FXa, Cracked], (instregex "LAE(Y)?$")>;
// Load/store access multiple (not modeled precisely)
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(L|ST)AM(Y)?$")>;
+def : InstRW<[WLat20, WLat20, LSU5, GroupAlone], (instregex "LAM(Y)?$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "STAM(Y)?$")>;
//===----------------------------------------------------------------------===//
// Program mask and addressing mode
//===----------------------------------------------------------------------===//
// Insert Program Mask
-def : InstRW<[FXa, Lat3, EndGroup], (instregex "IPM$")>;
+def : InstRW<[WLat3, FXa, EndGroup], (instregex "IPM$")>;
// Set Program Mask
-def : InstRW<[LSU, EndGroup], (instregex "SPM$")>;
+def : InstRW<[WLat3, LSU, EndGroup], (instregex "SPM$")>;
// Branch and link
-def : InstRW<[FXa, FXa, FXb, Lat5, GroupAlone], (instregex "BAL(R)?$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "BAL(R)?$")>;
// Test addressing mode
-def : InstRW<[FXb], (instregex "TAM$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TAM$")>;
// Set addressing mode
-def : InstRW<[FXb, Lat2, EndGroup], (instregex "SAM(24|31|64)$")>;
+def : InstRW<[WLat1, FXb, EndGroup], (instregex "SAM(24|31|64)$")>;
// Branch (and save) and set mode.
-def : InstRW<[FXa, FXb, Lat2, GroupAlone], (instregex "BSM$")>;
-def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "BASSM$")>;
+def : InstRW<[WLat1, FXa, FXb, GroupAlone], (instregex "BSM$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "BASSM$")>;
//===----------------------------------------------------------------------===//
// Transactional execution
//===----------------------------------------------------------------------===//
// Transaction begin
-def : InstRW<[LSU, LSU, FXb, FXb, FXb, FXb, FXb, Lat15, GroupAlone],
- (instregex "TBEGIN(C|_nofloat)?$")>;
+def : InstRW<[WLat9, LSU2, FXb5, GroupAlone], (instregex "TBEGIN(C)?$")>;
// Transaction end
-def : InstRW<[FXb, GroupAlone], (instregex "TEND$")>;
+def : InstRW<[WLat1, FXb, GroupAlone], (instregex "TEND$")>;
// Transaction abort
-def : InstRW<[LSU, GroupAlone], (instregex "TABORT$")>;
+def : InstRW<[WLat30, MCD], (instregex "TABORT$")>;
// Extract Transaction Nesting Depth
-def : InstRW<[FXa], (instregex "ETND$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ETND$")>;
// Nontransactional store
-def : InstRW<[FXb, LSU, Lat5], (instregex "NTSTG$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "NTSTG$")>;
//===----------------------------------------------------------------------===//
// Processor assist
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb], (instregex "PPA$")>;
+def : InstRW<[WLat30, MCD], (instregex "PPA$")>;
//===----------------------------------------------------------------------===//
// Miscellaneous Instructions.
//===----------------------------------------------------------------------===//
// Find leftmost one
-def : InstRW<[FXa, FXa, Lat6, GroupAlone], (instregex "FLOGR$")>;
+def : InstRW<[WLat7, WLat7, FXa2, GroupAlone], (instregex "FLOGR$")>;
// Population count
-def : InstRW<[FXa, Lat3], (instregex "POPCNT$")>;
-
-// Extend
-def : InstRW<[FXa], (instregex "AEXT128$")>;
-def : InstRW<[FXa], (instregex "ZEXT128$")>;
+def : InstRW<[WLat3, WLat3, FXa, NormalGr], (instregex "POPCNT$")>;
// String instructions
-def : InstRW<[FXa, LSU, Lat30], (instregex "SRST$")>;
-def : InstRW<[FXa, Lat30], (instregex "SRSTU$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "SRST(U)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CUSE$")>;
// Various complex instructions
-def : InstRW<[LSU, Lat30], (instregex "CFC$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "UPT$")>;
-def : InstRW<[LSU, Lat30], (instregex "CKSM$")>;
-def : InstRW<[FXa, Lat30], (instregex "CMPSC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "CFC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, WLat30, WLat30, MCD],
+ (instregex "UPT$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CKSM$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "CMPSC$")>;
// Execute
-def : InstRW<[FXb, GroupAlone], (instregex "EX(RL)?$")>;
+def : InstRW<[WLat1, FXb, GroupAlone], (instregex "EX(RL)?$")>;
//===----------------------------------------------------------------------===//
// .insn directive instructions
@@ -733,168 +760,158 @@ def : InstRW<[], (instregex "Insn.*")>;
// ----------------------------- Floating point ----------------------------- //
//===----------------------------------------------------------------------===//
-// FP: Select instructions
-//===----------------------------------------------------------------------===//
-
-def : InstRW<[FXa], (instregex "SelectF(32|64|128)$")>;
-def : InstRW<[FXa], (instregex "CondStoreF32(Inv)?$")>;
-def : InstRW<[FXa], (instregex "CondStoreF64(Inv)?$")>;
-
-//===----------------------------------------------------------------------===//
// FP: Move instructions
//===----------------------------------------------------------------------===//
// Load zero
-def : InstRW<[FXb], (instregex "LZ(DR|ER)$")>;
-def : InstRW<[FXb, FXb, Lat2, BeginGroup], (instregex "LZXR$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER)$")>;
+def : InstRW<[WLat2, FXb2, Cracked], (instregex "LZXR$")>;
// Load
-def : InstRW<[VecXsPm], (instregex "LER$")>;
-def : InstRW<[FXb], (instregex "LD(R|R32|GR)$")>;
-def : InstRW<[FXb, Lat3], (instregex "LGDR$")>;
-def : InstRW<[FXb, FXb, Lat2, GroupAlone], (instregex "LXR$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[WLat3, FXb, NormalGr], (instregex "LGDR$")>;
+def : InstRW<[WLat2, FXb2, GroupAlone], (instregex "LXR$")>;
// Load and Test
-def : InstRW<[VecXsPm, Lat4], (instregex "LT(D|E)BR$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "LTEBRCompare(_VecPseudo)?$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "LTDBRCompare(_VecPseudo)?$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXBR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone],
- (instregex "LTXBRCompare(_VecPseudo)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)BR$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)BRCompare$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone],
+ (instregex "LTXBR(Compare)?$")>;
// Copy sign
-def : InstRW<[VecXsPm], (instregex "CPSDRd(d|s)$")>;
-def : InstRW<[VecXsPm], (instregex "CPSDRs(d|s)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "CPSDR(d|s)(d|s)$")>;
//===----------------------------------------------------------------------===//
// FP: Load instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[VecXsPm, LSU, Lat7], (instregex "LE(Y)?$")>;
-def : InstRW<[LSU], (instregex "LD(Y|E32)?$")>;
-def : InstRW<[LSU], (instregex "LX$")>;
+def : InstRW<[WLat2LSU, VecXsPm, LSU, NormalGr], (instregex "LE(Y)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LD(Y|E32)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>;
//===----------------------------------------------------------------------===//
// FP: Store instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat7], (instregex "STD(Y)?$")>;
-def : InstRW<[FXb, LSU, Lat7], (instregex "STE(Y)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "STX$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(E|D)(Y)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STX$")>;
//===----------------------------------------------------------------------===//
// FP: Conversion instructions
//===----------------------------------------------------------------------===//
// Load rounded
-def : InstRW<[VecBF], (instregex "LEDBR(A)?$")>;
-def : InstRW<[VecDF, VecDF, Lat20], (instregex "LEXBR(A)?$")>;
-def : InstRW<[VecDF, VecDF, Lat20], (instregex "LDXBR(A)?$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "LEDBR(A)?$")>;
+def : InstRW<[WLat9, VecDF2, NormalGr], (instregex "L(E|D)XBR(A)?$")>;
// Load lengthened
-def : InstRW<[VecBF, LSU, Lat12], (instregex "LDEB$")>;
-def : InstRW<[VecBF], (instregex "LDEBR$")>;
-def : InstRW<[VecBF2, VecBF2, LSU, Lat12 , GroupAlone], (instregex "LX(D|E)B$")>;
-def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "LX(D|E)BR$")>;
+def : InstRW<[WLat7LSU, VecBF, LSU, NormalGr], (instregex "LDEB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "LDEBR$")>;
+def : InstRW<[WLat8LSU, VecBF4, LSU, GroupAlone], (instregex "LX(E|D)B$")>;
+def : InstRW<[WLat8, VecBF4, GroupAlone], (instregex "LX(E|D)BR$")>;
// Convert from fixed / logical
-def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CE(F|G)BR(A)?$")>;
-def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CD(F|G)BR(A)?$")>;
-def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CX(F|G)BR(A)?$")>;
-def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CEL(F|G)BR$")>;
-def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CDL(F|G)BR$")>;
-def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CXL(F|G)BR$")>;
+def : InstRW<[WLat8, FXb, VecBF, Cracked], (instregex "C(E|D)(F|G)BR(A)?$")>;
+def : InstRW<[WLat11, FXb, VecDF4, GroupAlone], (instregex "CX(F|G)BR(A)?$")>;
+def : InstRW<[WLat8, FXb, VecBF, Cracked], (instregex "C(E|D)L(F|G)BR$")>;
+def : InstRW<[WLat11, FXb, VecDF4, GroupAlone], (instregex "CXL(F|G)BR$")>;
// Convert to fixed / logical
-def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CF(E|D)BR(A)?$")>;
-def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CG(E|D)BR(A)?$")>;
-def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "C(F|G)XBR(A)?$")>;
-def : InstRW<[FXb, VecBF, Lat11, GroupAlone], (instregex "CLFEBR$")>;
-def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CLFDBR$")>;
-def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CLG(E|D)BR$")>;
-def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "CL(F|G)XBR$")>;
+def : InstRW<[WLat10, WLat10, FXb, VecBF, Cracked],
+ (instregex "C(F|G)(E|D)BR(A)?$")>;
+def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked],
+ (instregex "C(F|G)XBR(A)?$")>;
+def : InstRW<[WLat10, WLat10, FXb, VecBF, GroupAlone], (instregex "CLFEBR$")>;
+def : InstRW<[WLat10, WLat10, FXb, VecBF, Cracked], (instregex "CLFDBR$")>;
+def : InstRW<[WLat10, WLat10, FXb, VecBF, Cracked], (instregex "CLG(E|D)BR$")>;
+def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "CL(F|G)XBR$")>;
//===----------------------------------------------------------------------===//
// FP: Unary arithmetic
//===----------------------------------------------------------------------===//
// Load Complement / Negative / Positive
-def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)DBR$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)EBR$")>;
-def : InstRW<[FXb], (instregex "LCDFR(_32)?$")>;
-def : InstRW<[FXb], (instregex "LNDFR(_32)?$")>;
-def : InstRW<[FXb], (instregex "LPDFR(_32)?$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "L(C|N|P)XBR$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XBR$")>;
// Square root
-def : InstRW<[VecFPd, LSU], (instregex "SQ(E|D)B$")>;
-def : InstRW<[VecFPd], (instregex "SQ(E|D)BR$")>;
-def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "SQXBR$")>;
+def : InstRW<[WLat30, VecFPd, LSU, NormalGr], (instregex "SQ(E|D)B$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "SQ(E|D)BR$")>;
+def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "SQXBR$")>;
// Load FP integer
-def : InstRW<[VecBF], (instregex "FIEBR(A)?$")>;
-def : InstRW<[VecBF], (instregex "FIDBR(A)?$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXBR(A)?$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "FI(E|D)BR(A)?$")>;
+def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "FIXBR(A)?$")>;
//===----------------------------------------------------------------------===//
// FP: Binary arithmetic
//===----------------------------------------------------------------------===//
// Addition
-def : InstRW<[VecBF, LSU, Lat12], (instregex "A(E|D)B$")>;
-def : InstRW<[VecBF], (instregex "A(E|D)BR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXBR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "A(E|D)B$")>;
+def : InstRW<[WLat7, WLat7, VecBF, NormalGr], (instregex "A(E|D)BR$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "AXBR$")>;
// Subtraction
-def : InstRW<[VecBF, LSU, Lat12], (instregex "S(E|D)B$")>;
-def : InstRW<[VecBF], (instregex "S(E|D)BR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXBR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "S(E|D)B$")>;
+def : InstRW<[WLat7, WLat7, VecBF, NormalGr], (instregex "S(E|D)BR$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "SXBR$")>;
// Multiply
-def : InstRW<[VecBF, LSU, Lat12], (instregex "M(D|DE|EE)B$")>;
-def : InstRW<[VecBF], (instregex "M(D|DE|EE)BR$")>;
-def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MXDB$")>;
-def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MXDBR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat20, GroupAlone], (instregex "MXBR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "M(D|DE|EE)B$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "M(D|DE|EE)BR$")>;
+def : InstRW<[WLat8LSU, RegReadAdv, VecBF4, LSU, GroupAlone],
+ (instregex "MXDB$")>;
+def : InstRW<[WLat8, VecBF4, GroupAlone], (instregex "MXDBR$")>;
+def : InstRW<[WLat20, VecDF4, GroupAlone], (instregex "MXBR$")>;
// Multiply and add / subtract
-def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>;
-def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)EBR$")>;
-def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>;
-def : InstRW<[VecBF], (instregex "M(A|S)DBR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone],
+ (instregex "M(A|S)EB$")>;
+def : InstRW<[WLat7, VecBF, GroupAlone], (instregex "M(A|S)EBR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone],
+ (instregex "M(A|S)DB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "M(A|S)DBR$")>;
// Division
-def : InstRW<[VecFPd, LSU], (instregex "D(E|D)B$")>;
-def : InstRW<[VecFPd], (instregex "D(E|D)BR$")>;
-def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXBR$")>;
+def : InstRW<[WLat30, RegReadAdv, VecFPd, LSU, NormalGr],
+ (instregex "D(E|D)B$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "D(E|D)BR$")>;
+def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "DXBR$")>;
// Divide to integer
-def : InstRW<[VecFPd, Lat30], (instregex "DI(E|D)BR$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "DI(E|D)BR$")>;
//===----------------------------------------------------------------------===//
// FP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[VecXsPm, LSU, Lat8], (instregex "(K|C)(E|D)B$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "(K|C)(E|D)BR?$")>;
-def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "(K|C)XBR$")>;
+def : InstRW<[WLat3LSU, RegReadAdv, VecXsPm, LSU, NormalGr],
+ (instregex "(K|C)(E|D)B$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "(K|C)(E|D)BR$")>;
+def : InstRW<[WLat9, VecDF2, GroupAlone], (instregex "(K|C)XBR$")>;
// Test Data Class
-def : InstRW<[LSU, VecXsPm, Lat9], (instregex "TC(E|D)B$")>;
-def : InstRW<[LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "TCXB$")>;
+def : InstRW<[WLat5, LSU, VecXsPm, NormalGr], (instregex "TC(E|D)B$")>;
+def : InstRW<[WLat10, LSU2, VecDF4, GroupAlone], (instregex "TCXB$")>;
//===----------------------------------------------------------------------===//
// FP: Floating-point control register instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat4, GroupAlone], (instregex "EFPC$")>;
-def : InstRW<[FXb, LSU, Lat5, GroupAlone], (instregex "STFPC$")>;
-def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>;
-def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>;
-def : InstRW<[FXa, Lat30], (instregex "SFASR$")>;
-def : InstRW<[FXa, LSU, Lat30], (instregex "LFAS$")>;
-def : InstRW<[FXb, Lat3, GroupAlone], (instregex "SRNM(B|T)?$")>;
+def : InstRW<[WLat4, FXa, LSU, GroupAlone], (instregex "EFPC$")>;
+def : InstRW<[WLat1, FXb, LSU, GroupAlone], (instregex "STFPC$")>;
+def : InstRW<[WLat3, LSU, GroupAlone], (instregex "SFPC$")>;
+def : InstRW<[WLat3LSU, LSU2, GroupAlone], (instregex "LFPC$")>;
+def : InstRW<[WLat30, MCD], (instregex "SFASR$")>;
+def : InstRW<[WLat30, MCD], (instregex "LFAS$")>;
+def : InstRW<[WLat3, FXb, GroupAlone], (instregex "SRNM(B|T)?$")>;
// --------------------- Hexadecimal floating point ------------------------- //
@@ -904,108 +921,113 @@ def : InstRW<[FXb, Lat3, GroupAlone], (instregex "SRNM(B|T)?$")>;
//===----------------------------------------------------------------------===//
// Load and Test
-def : InstRW<[VecXsPm, Lat4], (instregex "LT(D|E)R$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXR$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)R$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "LTXR$")>;
//===----------------------------------------------------------------------===//
// HFP: Conversion instructions
//===----------------------------------------------------------------------===//
// Load rounded
-def : InstRW<[VecBF], (instregex "(LEDR|LRER)$")>;
-def : InstRW<[VecBF], (instregex "LEXR$")>;
-def : InstRW<[VecDF2], (instregex "(LDXR|LRDR)$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "(LEDR|LRER)$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "LEXR$")>;
+def : InstRW<[WLat9, VecDF2, NormalGr], (instregex "(LDXR|LRDR)$")>;
// Load lengthened
-def : InstRW<[LSU], (instregex "LDE$")>;
-def : InstRW<[FXb], (instregex "LDER$")>;
-def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "LX(D|E)$")>;
-def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "LX(D|E)R$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LDE$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LDER$")>;
+def : InstRW<[WLat8LSU, VecBF4, LSU, GroupAlone], (instregex "LX(E|D)$")>;
+def : InstRW<[WLat8, VecBF4, GroupAlone], (instregex "LX(E|D)R$")>;
// Convert from fixed
-def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CE(F|G)R$")>;
-def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CD(F|G)R$")>;
-def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CX(F|G)R$")>;
+def : InstRW<[WLat8, FXb, VecBF, Cracked], (instregex "C(E|D)(F|G)R$")>;
+def : InstRW<[WLat11, FXb, VecDF4, GroupAlone], (instregex "CX(F|G)R$")>;
// Convert to fixed
-def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CF(E|D)R$")>;
-def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CG(E|D)R$")>;
-def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "C(F|G)XR$")>;
+def : InstRW<[WLat10, WLat10, FXb, VecBF, Cracked], (instregex "C(F|G)(E|D)R$")>;
+def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "C(F|G)XR$")>;
// Convert BFP to HFP / HFP to BFP.
-def : InstRW<[VecBF], (instregex "THD(E)?R$")>;
-def : InstRW<[VecBF], (instregex "TB(E)?DR$")>;
+def : InstRW<[WLat7, WLat7, VecBF, NormalGr], (instregex "THD(E)?R$")>;
+def : InstRW<[WLat7, WLat7, VecBF, NormalGr], (instregex "TB(E)?DR$")>;
//===----------------------------------------------------------------------===//
// HFP: Unary arithmetic
//===----------------------------------------------------------------------===//
// Load Complement / Negative / Positive
-def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)DR$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)ER$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "L(C|N|P)XR$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)R$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XR$")>;
// Halve
-def : InstRW<[VecBF], (instregex "H(E|D)R$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "H(E|D)R$")>;
// Square root
-def : InstRW<[VecFPd, LSU], (instregex "SQ(E|D)$")>;
-def : InstRW<[VecFPd], (instregex "SQ(E|D)R$")>;
-def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "SQXR$")>;
+def : InstRW<[WLat30, VecFPd, LSU, NormalGr], (instregex "SQ(E|D)$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "SQ(E|D)R$")>;
+def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "SQXR$")>;
// Load FP integer
-def : InstRW<[VecBF], (instregex "FIER$")>;
-def : InstRW<[VecBF], (instregex "FIDR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXR$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "FI(E|D)R$")>;
+def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "FIXR$")>;
//===----------------------------------------------------------------------===//
// HFP: Binary arithmetic
//===----------------------------------------------------------------------===//
// Addition
-def : InstRW<[VecBF, LSU, Lat12], (instregex "A(E|D|U|W)$")>;
-def : InstRW<[VecBF], (instregex "A(E|D|U|W)R$")>;
-def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "A(E|D|U|W)$")>;
+def : InstRW<[WLat7, WLat7, VecBF, NormalGr], (instregex "A(E|D|U|W)R$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "AXR$")>;
// Subtraction
-def : InstRW<[VecBF, LSU, Lat12], (instregex "S(E|D|U|W)$")>;
-def : InstRW<[VecBF], (instregex "S(E|D|U|W)R$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "S(E|D|U|W)$")>;
+def : InstRW<[WLat7, WLat7, VecBF, NormalGr], (instregex "S(E|D|U|W)R$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "SXR$")>;
// Multiply
-def : InstRW<[VecBF, LSU, Lat12], (instregex "M(D|DE|E|EE)$")>;
-def : InstRW<[VecBF], (instregex "M(D|DE|E|EE)R$")>;
-def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MXD$")>;
-def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MXDR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat20, GroupAlone], (instregex "MXR$")>;
-def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MY$")>;
-def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "MY(H|L)$")>;
-def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MYR$")>;
-def : InstRW<[VecBF, GroupAlone], (instregex "MY(H|L)R$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "M(D|DE|E|EE)$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "M(D|DE|E|EE)R$")>;
+def : InstRW<[WLat8LSU, RegReadAdv, VecBF4, LSU, GroupAlone],
+ (instregex "MXD$")>;
+def : InstRW<[WLat8, VecBF4, GroupAlone], (instregex "MXDR$")>;
+def : InstRW<[WLat30, VecDF4, GroupAlone], (instregex "MXR$")>;
+def : InstRW<[WLat8LSU, RegReadAdv, VecBF4, LSU, GroupAlone],
+ (instregex "MY$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, VecBF2, LSU, GroupAlone],
+ (instregex "MY(H|L)$")>;
+def : InstRW<[WLat8, VecBF4, GroupAlone], (instregex "MYR$")>;
+def : InstRW<[WLat7, VecBF, GroupAlone], (instregex "MY(H|L)R$")>;
// Multiply and add / subtract
-def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>;
-def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)ER$")>;
-def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>;
-def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)DR$")>;
-def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)$")>;
-def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MAY$")>;
-def : InstRW<[VecBF, GroupAlone], (instregex "MAY(H|L)R$")>;
-def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MAYR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone],
+ (instregex "M(A|S)(E|D)$")>;
+def : InstRW<[WLat7, VecBF, GroupAlone], (instregex "M(A|S)(E|D)R$")>;
+def : InstRW<[WLat8LSU, RegReadAdv, RegReadAdv, VecBF4, LSU, GroupAlone],
+ (instregex "MAY$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone],
+ (instregex "MAY(H|L)$")>;
+def : InstRW<[WLat8, VecBF4, GroupAlone], (instregex "MAYR$")>;
+def : InstRW<[WLat7, VecBF, GroupAlone], (instregex "MAY(H|L)R$")>;
// Division
-def : InstRW<[VecFPd, LSU], (instregex "D(E|D)$")>;
-def : InstRW<[VecFPd], (instregex "D(E|D)R$")>;
-def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXR$")>;
+def : InstRW<[WLat30, RegReadAdv, VecFPd, LSU, NormalGr],
+ (instregex "D(E|D)$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "D(E|D)R$")>;
+def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "DXR$")>;
//===----------------------------------------------------------------------===//
// HFP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[VecBF, LSU, Lat12], (instregex "C(E|D)$")>;
-def : InstRW<[VecBF], (instregex "C(E|D)R$")>;
-def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "CXR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "C(E|D)$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "C(E|D)R$")>;
+def : InstRW<[WLat10, VecDF2, GroupAlone], (instregex "CXR$")>;
// ------------------------ Decimal floating point -------------------------- //
@@ -1015,121 +1037,123 @@ def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "CXR$")>;
//===----------------------------------------------------------------------===//
// Load and Test
-def : InstRW<[VecDF], (instregex "LTDTR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXTR$")>;
+def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "LTDTR$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "LTXTR$")>;
//===----------------------------------------------------------------------===//
// DFP: Conversion instructions
//===----------------------------------------------------------------------===//
// Load rounded
-def : InstRW<[VecDF, Lat15], (instregex "LEDTR$")>;
-def : InstRW<[VecDF, VecDF, Lat20], (instregex "LDXTR$")>;
+def : InstRW<[WLat15, VecDF, NormalGr], (instregex "LEDTR$")>;
+def : InstRW<[WLat15, VecDF2, NormalGr], (instregex "LDXTR$")>;
// Load lengthened
-def : InstRW<[VecDF], (instregex "LDETR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LXDTR$")>;
+def : InstRW<[WLat8, VecDF, NormalGr], (instregex "LDETR$")>;
+def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "LXDTR$")>;
// Convert from fixed / logical
-def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CD(F|G)TR(A)?$")>;
-def : InstRW<[FXb, VecDF2, VecDF2, Lat30, GroupAlone], (instregex "CX(F|G)TR(A)?$")>;
-def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CDL(F|G)TR$")>;
-def : InstRW<[FXb, VecDF2, VecDF2, Lat30, GroupAlone], (instregex "CXL(F|G)TR$")>;
+def : InstRW<[WLat30, FXb, VecDF, Cracked], (instregex "CD(F|G)TR(A)?$")>;
+def : InstRW<[WLat30, FXb, VecDF4, GroupAlone], (instregex "CX(F|G)TR(A)?$")>;
+def : InstRW<[WLat30, FXb, VecDF, Cracked], (instregex "CDL(F|G)TR$")>;
+def : InstRW<[WLat30, FXb, VecDF4, GroupAlone], (instregex "CXL(F|G)TR$")>;
// Convert to fixed / logical
-def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "C(F|G)DTR(A)?$")>;
-def : InstRW<[FXb, VecDF, VecDF, Lat30, BeginGroup], (instregex "C(F|G)XTR(A)?$")>;
-def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CL(F|G)DTR$")>;
-def : InstRW<[FXb, VecDF, VecDF, Lat30, BeginGroup], (instregex "CL(F|G)XTR$")>;
+def : InstRW<[WLat30, WLat30, FXb, VecDF, Cracked],
+ (instregex "C(F|G)DTR(A)?$")>;
+def : InstRW<[WLat30, WLat30, FXb, VecDF2, Cracked],
+ (instregex "C(F|G)XTR(A)?$")>;
+def : InstRW<[WLat30, WLat30, FXb, VecDF, Cracked], (instregex "CL(F|G)DTR$")>;
+def : InstRW<[WLat30, WLat30, FXb, VecDF2, Cracked], (instregex "CL(F|G)XTR$")>;
// Convert from / to signed / unsigned packed
-def : InstRW<[FXb, VecDF, Lat9, BeginGroup], (instregex "CD(S|U)TR$")>;
-def : InstRW<[FXb, FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CX(S|U)TR$")>;
-def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "C(S|U)DTR$")>;
-def : InstRW<[FXb, FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "C(S|U)XTR$")>;
+def : InstRW<[WLat9, FXb, VecDF, Cracked], (instregex "CD(S|U)TR$")>;
+def : InstRW<[WLat12, FXb2, VecDF4, GroupAlone], (instregex "CX(S|U)TR$")>;
+def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "C(S|U)DTR$")>;
+def : InstRW<[WLat15, FXb2, VecDF4, GroupAlone], (instregex "C(S|U)XTR$")>;
// Convert from / to zoned
-def : InstRW<[LSU, VecDF, Lat11, BeginGroup], (instregex "CDZT$")>;
-def : InstRW<[LSU, LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CXZT$")>;
-def : InstRW<[FXb, LSU, VecDF, Lat11, BeginGroup], (instregex "CZDT$")>;
-def : InstRW<[FXb, LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "CZXT$")>;
+def : InstRW<[WLat8LSU, LSU, VecDF, Cracked], (instregex "CDZT$")>;
+def : InstRW<[WLat16LSU, LSU2, VecDF4, GroupAlone], (instregex "CXZT$")>;
+def : InstRW<[WLat1, FXb, LSU, VecDF, Cracked], (instregex "CZDT$")>;
+def : InstRW<[WLat1, FXb, LSU, VecDF2, GroupAlone], (instregex "CZXT$")>;
// Convert from / to packed
-def : InstRW<[LSU, VecDF, Lat11, BeginGroup], (instregex "CDPT$")>;
-def : InstRW<[LSU, LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CXPT$")>;
-def : InstRW<[FXb, LSU, VecDF, Lat11, BeginGroup], (instregex "CPDT$")>;
-def : InstRW<[FXb, LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "CPXT$")>;
+def : InstRW<[WLat8LSU, LSU, VecDF, Cracked], (instregex "CDPT$")>;
+def : InstRW<[WLat16LSU, LSU2, VecDF4, GroupAlone], (instregex "CXPT$")>;
+def : InstRW<[WLat1, FXb, LSU, VecDF, Cracked], (instregex "CPDT$")>;
+def : InstRW<[WLat1, FXb, LSU, VecDF2, GroupAlone], (instregex "CPXT$")>;
// Perform floating-point operation
-def : InstRW<[FXb, Lat30], (instregex "PFPO$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "PFPO$")>;
//===----------------------------------------------------------------------===//
// DFP: Unary arithmetic
//===----------------------------------------------------------------------===//
// Load FP integer
-def : InstRW<[VecDF], (instregex "FIDTR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXTR$")>;
+def : InstRW<[WLat8, VecDF, NormalGr], (instregex "FIDTR$")>;
+def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "FIXTR$")>;
// Extract biased exponent
-def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "EEDTR$")>;
-def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "EEXTR$")>;
+def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "EEDTR$")>;
+def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "EEXTR$")>;
// Extract significance
-def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "ESDTR$")>;
-def : InstRW<[FXb, VecDF, VecDF, Lat15, BeginGroup], (instregex "ESXTR$")>;
+def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "ESDTR$")>;
+def : InstRW<[WLat12, FXb, VecDF2, Cracked], (instregex "ESXTR$")>;
//===----------------------------------------------------------------------===//
// DFP: Binary arithmetic
//===----------------------------------------------------------------------===//
// Addition
-def : InstRW<[VecDF], (instregex "ADTR(A)?$")>;
-def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXTR(A)?$")>;
+def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "ADTR(A)?$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "AXTR(A)?$")>;
// Subtraction
-def : InstRW<[VecDF], (instregex "SDTR(A)?$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXTR(A)?$")>;
+def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "SDTR(A)?$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "SXTR(A)?$")>;
// Multiply
-def : InstRW<[VecDF, Lat30], (instregex "MDTR(A)?$")>;
-def : InstRW<[VecDF2, VecDF2, Lat30, GroupAlone], (instregex "MXTR(A)?$")>;
+def : InstRW<[WLat30, VecDF, NormalGr], (instregex "MDTR(A)?$")>;
+def : InstRW<[WLat30, VecDF4, GroupAlone], (instregex "MXTR(A)?$")>;
// Division
-def : InstRW<[VecDF, Lat30], (instregex "DDTR(A)?$")>;
-def : InstRW<[VecDF2, VecDF2, Lat30, GroupAlone], (instregex "DXTR(A)?$")>;
+def : InstRW<[WLat30, VecDF, NormalGr], (instregex "DDTR(A)?$")>;
+def : InstRW<[WLat30, VecDF4, GroupAlone], (instregex "DXTR(A)?$")>;
// Quantize
-def : InstRW<[VecDF], (instregex "QADTR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "QAXTR$")>;
+def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "QADTR$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "QAXTR$")>;
// Reround
-def : InstRW<[FXb, VecDF, Lat11, BeginGroup], (instregex "RRDTR$")>;
-def : InstRW<[FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "RRXTR$")>;
+def : InstRW<[WLat9, WLat9, FXb, VecDF, Cracked], (instregex "RRDTR$")>;
+def : InstRW<[WLat11, WLat11, FXb, VecDF4, GroupAlone], (instregex "RRXTR$")>;
// Shift significand left/right
-def : InstRW<[LSU, VecDF, Lat11, GroupAlone], (instregex "S(L|R)DT$")>;
-def : InstRW<[LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "S(L|R)XT$")>;
+def : InstRW<[WLat11LSU, LSU, VecDF, GroupAlone], (instregex "S(L|R)DT$")>;
+def : InstRW<[WLat11LSU, LSU, VecDF4, GroupAlone], (instregex "S(L|R)XT$")>;
// Insert biased exponent
-def : InstRW<[FXb, VecDF, Lat11, BeginGroup], (instregex "IEDTR$")>;
-def : InstRW<[FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "IEXTR$")>;
+def : InstRW<[WLat9, FXb, VecDF, Cracked], (instregex "IEDTR$")>;
+def : InstRW<[WLat11, FXb, VecDF4, GroupAlone], (instregex "IEXTR$")>;
//===----------------------------------------------------------------------===//
// DFP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[VecDF], (instregex "(K|C)DTR$")>;
-def : InstRW<[VecDF, VecDF, Lat11, GroupAlone], (instregex "(K|C)XTR$")>;
+def : InstRW<[WLat8, VecDF, NormalGr], (instregex "(K|C)DTR$")>;
+def : InstRW<[WLat9, VecDF2, GroupAlone], (instregex "(K|C)XTR$")>;
// Compare biased exponent
-def : InstRW<[VecDF], (instregex "CEDTR$")>;
-def : InstRW<[VecDF], (instregex "CEXTR$")>;
+def : InstRW<[WLat8, VecDF, NormalGr], (instregex "CEDTR$")>;
+def : InstRW<[WLat8, VecDF, NormalGr], (instregex "CEXTR$")>;
// Test Data Class/Group
-def : InstRW<[LSU, VecDF, Lat11], (instregex "TD(C|G)(E|D)T$")>;
-def : InstRW<[LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>;
+def : InstRW<[WLat15, LSU, VecDF, NormalGr], (instregex "TD(C|G)(E|D)T$")>;
+def : InstRW<[WLat15, LSU, VecDF2, GroupAlone], (instregex "TD(C|G)XT$")>;
// --------------------------------- Vector --------------------------------- //
@@ -1138,234 +1162,236 @@ def : InstRW<[LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>;
// Vector: Move instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb], (instregex "VLR(32|64)?$")>;
-def : InstRW<[FXb, Lat4], (instregex "VLGV(B|F|G|H)?$")>;
-def : InstRW<[FXb], (instregex "VLVG(B|F|G|H)?$")>;
-def : InstRW<[FXb, Lat2], (instregex "VLVGP(32)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "VLR(32|64)?$")>;
+def : InstRW<[WLat4, FXb, NormalGr], (instregex "VLGV(B|F|G|H)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "VLVG(B|F|G|H)?$")>;
+def : InstRW<[WLat3, FXb, NormalGr], (instregex "VLVGP(32)?$")>;
//===----------------------------------------------------------------------===//
// Vector: Immediate instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[VecXsPm], (instregex "VZERO$")>;
-def : InstRW<[VecXsPm], (instregex "VONE$")>;
-def : InstRW<[VecXsPm], (instregex "VGBM$")>;
-def : InstRW<[VecXsPm], (instregex "VGM(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VREPI(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VLEI(B|F|G|H)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VZERO$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VONE$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VGBM$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VGM(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VREPI(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLEI(B|F|G|H)$")>;
//===----------------------------------------------------------------------===//
// Vector: Loads
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU], (instregex "VL(L|BB)?$")>;
-def : InstRW<[LSU], (instregex "VL(32|64)$")>;
-def : InstRW<[LSU], (instregex "VLLEZ(B|F|G|H)?$")>;
-def : InstRW<[LSU], (instregex "VLREP(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm, LSU, Lat7], (instregex "VLE(B|F|G|H)$")>;
-def : InstRW<[FXb, LSU, VecXsPm, Lat11, BeginGroup], (instregex "VGE(F|G)$")>;
-def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
- (instregex "VLM$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(BB)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLL$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(32|64)$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEZ(B|F|G|H)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLREP(B|F|G|H)?$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr],
+ (instregex "VLE(B|F|G|H)$")>;
+def : InstRW<[WLat6LSU, RegReadAdv, FXb, LSU, VecXsPm, Cracked],
+ (instregex "VGE(F|G)$")>;
+def : InstRW<[WLat4LSU, WLat4LSU, LSU5, GroupAlone], (instregex "VLM$")>;
//===----------------------------------------------------------------------===//
// Vector: Stores
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat8], (instregex "VST(L|32|64)?$")>;
-def : InstRW<[FXb, LSU, Lat8], (instregex "VSTE(F|G)$")>;
-def : InstRW<[FXb, LSU, VecXsPm, Lat11, BeginGroup], (instregex "VSTE(B|H)$")>;
-def : InstRW<[LSU, LSU, FXb, FXb, FXb, FXb, FXb, Lat20, GroupAlone],
- (instregex "VSTM$")>;
-def : InstRW<[FXb, FXb, LSU, Lat12, BeginGroup], (instregex "VSCE(F|G)$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(L|32|64)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTE(F|G)$")>;
+def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTE(B|H)$")>;
+def : InstRW<[WLat1, LSU2, FXb3, GroupAlone], (instregex "VSTM$")>;
+def : InstRW<[WLat1, FXb2, LSU, Cracked], (instregex "VSCE(F|G)$")>;
//===----------------------------------------------------------------------===//
// Vector: Selects and permutes
//===----------------------------------------------------------------------===//
-def : InstRW<[VecXsPm], (instregex "VMRH(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VMRL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VPERM$")>;
-def : InstRW<[VecXsPm], (instregex "VPDI$")>;
-def : InstRW<[VecXsPm], (instregex "VREP(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VSEL$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMRH(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMRL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPERM$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPDI$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VREP(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSEL$")>;
//===----------------------------------------------------------------------===//
// Vector: Widening and narrowing
//===----------------------------------------------------------------------===//
-def : InstRW<[VecXsPm], (instregex "VPK(F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VPKS(F|G|H)?$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "VPKS(F|G|H)S$")>;
-def : InstRW<[VecXsPm], (instregex "VPKLS(F|G|H)?$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "VPKLS(F|G|H)S$")>;
-def : InstRW<[VecXsPm], (instregex "VSEG(B|F|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VUPH(B|F|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VUPL(B|F)?$")>;
-def : InstRW<[VecXsPm], (instregex "VUPLH(B|F|H|W)?$")>;
-def : InstRW<[VecXsPm], (instregex "VUPLL(B|F|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPK(F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPKS(F|G|H)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VPKS(F|G|H)S$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPKLS(F|G|H)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VPKLS(F|G|H)S$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSEG(B|F|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPH(B|F|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPL(B|F)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPLH(B|F|H|W)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPLL(B|F|H)?$")>;
//===----------------------------------------------------------------------===//
// Vector: Integer arithmetic
//===----------------------------------------------------------------------===//
-def : InstRW<[VecXsPm], (instregex "VA(B|F|G|H|Q|C|CQ)?$")>;
-def : InstRW<[VecXsPm], (instregex "VACC(B|F|G|H|Q|C|CQ)?$")>;
-def : InstRW<[VecXsPm], (instregex "VAVG(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VAVGL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VN(C|O)?$")>;
-def : InstRW<[VecXsPm], (instregex "VO$")>;
-def : InstRW<[VecMul], (instregex "VCKSM$")>;
-def : InstRW<[VecXsPm], (instregex "VCLZ(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VCTZ(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VX$")>;
-def : InstRW<[VecMul], (instregex "VGFM?$")>;
-def : InstRW<[VecMul], (instregex "VGFMA(B|F|G|H)?$")>;
-def : InstRW<[VecMul], (instregex "VGFM(B|F|G|H)$")>;
-def : InstRW<[VecXsPm], (instregex "VLC(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VLP(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VMX(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VMXL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VMN(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VMNL(B|F|G|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMAL(B|F)?$")>;
-def : InstRW<[VecMul], (instregex "VMALE(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMALH(B|F|H|W)?$")>;
-def : InstRW<[VecMul], (instregex "VMALO(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMAO(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMAE(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMAH(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VME(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMH(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VML(B|F)?$")>;
-def : InstRW<[VecMul], (instregex "VMLE(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMLH(B|F|H|W)?$")>;
-def : InstRW<[VecMul], (instregex "VMLO(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMO(B|F|H)?$")>;
-
-def : InstRW<[VecXsPm], (instregex "VPOPCT$")>;
-
-def : InstRW<[VecXsPm], (instregex "VERLL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VERLLV(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VERIM(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VESL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VESLV(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VESRA(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VESRAV(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VESRL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VESRLV(B|F|G|H)?$")>;
-
-def : InstRW<[VecXsPm], (instregex "VSL(DB)?$")>;
-def : InstRW<[VecXsPm, VecXsPm, Lat8], (instregex "VSLB$")>;
-def : InstRW<[VecXsPm], (instregex "VSR(A|L)$")>;
-def : InstRW<[VecXsPm, VecXsPm, Lat8], (instregex "VSR(A|L)B$")>;
-
-def : InstRW<[VecXsPm], (instregex "VSB(I|IQ|CBI|CBIQ)?$")>;
-def : InstRW<[VecXsPm], (instregex "VSCBI(B|F|G|H|Q)?$")>;
-def : InstRW<[VecXsPm], (instregex "VS(F|G|H|Q)?$")>;
-
-def : InstRW<[VecMul], (instregex "VSUM(B|H)?$")>;
-def : InstRW<[VecMul], (instregex "VSUMG(F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VSUMQ(F|G)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VA(B|F|G|H|Q|C|CQ)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VACC(B|F|G|H|Q|C|CQ)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VAVG(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VAVGL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VN(C|O)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VO$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VCKSM$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCLZ(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCTZ(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VX$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VGFM?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VGFMA(B|F|G|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VGFM(B|F|G|H)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLC(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLP(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMX(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMXL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMN(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMNL(B|F|G|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAL(B|F)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMALE(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMALH(B|F|H|W)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMALO(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAO(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAE(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAH(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VME(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMH(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VML(B|F)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMLE(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMLH(B|F|H|W)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMLO(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMO(B|F|H)?$")>;
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPOPCT$")>;
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VERLL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VERLLV(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VERIM(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESLV(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRA(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRAV(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRLV(B|F|G|H)?$")>;
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSL(DB)?$")>;
+def : InstRW<[WLat3, VecXsPm2, NormalGr], (instregex "VSLB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSR(A|L)$")>;
+def : InstRW<[WLat3, VecXsPm2, NormalGr], (instregex "VSR(A|L)B$")>;
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSB(I|IQ|CBI|CBIQ)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSCBI(B|F|G|H|Q)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VS(F|G|H|Q)?$")>;
+
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VSUM(B|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VSUMG(F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VSUMQ(F|G)?$")>;
//===----------------------------------------------------------------------===//
// Vector: Integer comparison
//===----------------------------------------------------------------------===//
-def : InstRW<[VecXsPm, Lat4], (instregex "VEC(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "VECL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VCEQ(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "VCEQ(B|F|G|H)S$")>;
-def : InstRW<[VecXsPm], (instregex "VCH(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "VCH(B|F|G|H)S$")>;
-def : InstRW<[VecXsPm], (instregex "VCHL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "VCHL(B|F|G|H)S$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VTM$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "VEC(B|F|G|H)?$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "VECL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCEQ(B|F|G|H)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VCEQ(B|F|G|H)S$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCH(B|F|G|H)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VCH(B|F|G|H)S$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCHL(B|F|G|H)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VCHL(B|F|G|H)S$")>;
+def : InstRW<[WLat4, VecStr, NormalGr], (instregex "VTM$")>;
//===----------------------------------------------------------------------===//
// Vector: Floating-point arithmetic
//===----------------------------------------------------------------------===//
// Conversion and rounding
-def : InstRW<[VecBF2], (instregex "VCD(L)?G$")>;
-def : InstRW<[VecBF2], (instregex "VCD(L)?GB$")>;
-def : InstRW<[VecBF], (instregex "WCD(L)?GB$")>;
-def : InstRW<[VecBF2], (instregex "VC(L)?GD$")>;
-def : InstRW<[VecBF2], (instregex "VC(L)?GDB$")>;
-def : InstRW<[VecBF], (instregex "WC(L)?GDB$")>;
-def : InstRW<[VecBF2], (instregex "VL(DE|ED)$")>;
-def : InstRW<[VecBF2], (instregex "VL(DE|ED)B$")>;
-def : InstRW<[VecBF], (instregex "WL(DE|ED)B$")>;
-def : InstRW<[VecBF2], (instregex "VFI$")>;
-def : InstRW<[VecBF2], (instregex "VFIDB$")>;
-def : InstRW<[VecBF], (instregex "WFIDB$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VCD(L)?G$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VCD(L)?GB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WCD(L)?GB$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VC(L)?GD$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VC(L)?GDB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WC(L)?GDB$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VL(DE|ED)$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VL(DE|ED)B$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WL(DE|ED)B$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFI$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFIDB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFIDB$")>;
// Sign operations
-def : InstRW<[VecXsPm], (instregex "VFPSO$")>;
-def : InstRW<[VecXsPm], (instregex "(V|W)FPSODB$")>;
-def : InstRW<[VecXsPm], (instregex "(V|W)FL(C|N|P)DB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VFPSO$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FPSODB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FL(C|N|P)DB$")>;
// Test data class
-def : InstRW<[VecXsPm, Lat4], (instregex "VFTCI$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "(V|W)FTCIDB$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VFTCI$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "(V|W)FTCIDB$")>;
// Add / subtract
-def : InstRW<[VecBF2], (instregex "VF(A|S)$")>;
-def : InstRW<[VecBF2], (instregex "VF(A|S)DB$")>;
-def : InstRW<[VecBF], (instregex "WF(A|S)DB$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(A|S)$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(A|S)DB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)DB$")>;
// Multiply / multiply-and-add/subtract
-def : InstRW<[VecBF2], (instregex "VFM$")>;
-def : InstRW<[VecBF2], (instregex "VFMDB$")>;
-def : InstRW<[VecBF], (instregex "WFMDB$")>;
-def : InstRW<[VecBF2], (instregex "VFM(A|S)$")>;
-def : InstRW<[VecBF2], (instregex "VFM(A|S)DB$")>;
-def : InstRW<[VecBF], (instregex "WFM(A|S)DB$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFM$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFMDB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFMDB$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFM(A|S)$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFM(A|S)DB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFM(A|S)DB$")>;
// Divide / square root
-def : InstRW<[VecFPd], (instregex "VFD$")>;
-def : InstRW<[VecFPd], (instregex "(V|W)FDDB$")>;
-def : InstRW<[VecFPd], (instregex "VFSQ$")>;
-def : InstRW<[VecFPd], (instregex "(V|W)FSQDB$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "VFD$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FDDB$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "VFSQ$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FSQDB$")>;
//===----------------------------------------------------------------------===//
// Vector: Floating-point comparison
//===----------------------------------------------------------------------===//
-def : InstRW<[VecXsPm], (instregex "VFC(E|H|HE)$")>;
-def : InstRW<[VecXsPm], (instregex "VFC(E|H|HE)DB$")>;
-def : InstRW<[VecXsPm], (instregex "WFC(E|H|HE)DB$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "VFC(E|H|HE)DBS$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "WFC(E|H|HE)DBS$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)DB$")>;
+def : InstRW<[WLat2, WLat2, VecXsPm, NormalGr], (instregex "VFC(E|H|HE)$")>;
+def : InstRW<[WLat2, WLat2, VecXsPm, NormalGr], (instregex "VFC(E|H|HE)DB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFC(E|H|HE)DB$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VFC(E|H|HE)DBS$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "WFC(E|H|HE)DBS$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "WF(C|K)$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "WF(C|K)DB$")>;
//===----------------------------------------------------------------------===//
// Vector: Floating-point insertion and extraction
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb], (instregex "LEFR$")>;
-def : InstRW<[FXb, Lat4], (instregex "LFER$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LEFR$")>;
+def : InstRW<[WLat4, FXb, NormalGr], (instregex "LFER$")>;
//===----------------------------------------------------------------------===//
// Vector: String instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[VecStr], (instregex "VFAE(B)?$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VFAEBS$")>;
-def : InstRW<[VecStr], (instregex "VFAE(F|H)$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VFAE(F|H)S$")>;
-def : InstRW<[VecStr], (instregex "VFAEZ(B|F|H)$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VFAEZ(B|F|H)S$")>;
-def : InstRW<[VecStr], (instregex "VFEE(B|F|H|ZB|ZF|ZH)?$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VFEE(B|F|H|ZB|ZF|ZH)S$")>;
-def : InstRW<[VecStr], (instregex "VFENE(B|F|H|ZB|ZF|ZH)?$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VFENE(B|F|H|ZB|ZF|ZH)S$")>;
-def : InstRW<[VecStr], (instregex "VISTR(B|F|H)?$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VISTR(B|F|H)S$")>;
-def : InstRW<[VecStr], (instregex "VSTRC(B|F|H)?$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VSTRC(B|F|H)S$")>;
-def : InstRW<[VecStr], (instregex "VSTRCZ(B|F|H)$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VSTRCZ(B|F|H)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFAE(B)?$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFAE(F|H)$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VFAE(B|F|H)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFAEZ(B|F|H)$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VFAEZ(B|F|H)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFEE(B|F|H|ZB|ZF|ZH)?$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr],
+ (instregex "VFEE(B|F|H|ZB|ZF|ZH)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFENE(B|F|H|ZB|ZF|ZH)?$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr],
+ (instregex "VFENE(B|F|H|ZB|ZF|ZH)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VISTR(B|F|H)?$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VISTR(B|F|H)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VSTRC(B|F|H)?$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRC(B|F|H)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VSTRCZ(B|F|H)$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRCZ(B|F|H)S$")>;
// -------------------------------- System ---------------------------------- //
@@ -1374,156 +1400,150 @@ def : InstRW<[VecStr, Lat5], (instregex "VSTRCZ(B|F|H)S$")>;
// System: Program-Status Word Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, Lat30], (instregex "EPSW$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "LPSW(E)?$")>;
-def : InstRW<[FXa, Lat3, GroupAlone], (instregex "IPK$")>;
-def : InstRW<[LSU, EndGroup], (instregex "SPKA$")>;
-def : InstRW<[LSU, EndGroup], (instregex "SSM$")>;
-def : InstRW<[FXb, LSU, GroupAlone], (instregex "ST(N|O)SM$")>;
-def : InstRW<[FXa, Lat3], (instregex "IAC$")>;
-def : InstRW<[LSU, EndGroup], (instregex "SAC(F)?$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "EPSW$")>;
+def : InstRW<[WLat30, MCD], (instregex "LPSW(E)?$")>;
+def : InstRW<[WLat3, FXa, GroupAlone], (instregex "IPK$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SPKA$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SSM$")>;
+def : InstRW<[WLat1, FXb, LSU, GroupAlone], (instregex "ST(N|O)SM$")>;
+def : InstRW<[WLat3, FXa, NormalGr], (instregex "IAC$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SAC(F)?$")>;
//===----------------------------------------------------------------------===//
// System: Control Register Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat30], (instregex "LCTL(G)?$")>;
-def : InstRW<[LSU, Lat30], (instregex "STCT(L|G)$")>;
-def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>;
-def : InstRW<[FXb, Lat30], (instregex "SSA(I)?R$")>;
-def : InstRW<[FXb, Lat30], (instregex "ESEA$")>;
+def : InstRW<[WLat4LSU, WLat4LSU, LSU2, GroupAlone], (instregex "LCTL(G)?$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "STCT(L|G)$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "E(P|S)A(I)?R$")>;
+def : InstRW<[WLat30, MCD], (instregex "SSA(I)?R$")>;
+def : InstRW<[WLat30, MCD], (instregex "ESEA$")>;
//===----------------------------------------------------------------------===//
// System: Prefix-Register Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat30], (instregex "SPX$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "STPX$")>;
+def : InstRW<[WLat30, MCD], (instregex "S(T)?PX$")>;
//===----------------------------------------------------------------------===//
// System: Storage-Key and Real Memory Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, Lat30], (instregex "ISKE$")>;
-def : InstRW<[FXb, Lat30], (instregex "IVSK$")>;
-def : InstRW<[FXb, Lat30], (instregex "SSKE(Opt)?$")>;
-def : InstRW<[FXb, Lat30], (instregex "RRB(E|M)$")>;
-def : InstRW<[FXb, Lat30], (instregex "PFMF$")>;
-def : InstRW<[FXb, Lat30], (instregex "TB$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "PGIN$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "PGOUT$")>;
+def : InstRW<[WLat30, MCD], (instregex "ISKE$")>;
+def : InstRW<[WLat30, MCD], (instregex "IVSK$")>;
+def : InstRW<[WLat30, MCD], (instregex "SSKE(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "RRB(E|M)$")>;
+def : InstRW<[WLat30, MCD], (instregex "PFMF$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "TB$")>;
+def : InstRW<[WLat30, MCD], (instregex "PGIN$")>;
+def : InstRW<[WLat30, MCD], (instregex "PGOUT$")>;
//===----------------------------------------------------------------------===//
// System: Dynamic-Address-Translation Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat30], (instregex "IPTE(Opt)?(Opt)?$")>;
-def : InstRW<[FXb, Lat30], (instregex "IDTE(Opt)?$")>;
-def : InstRW<[FXb, Lat30], (instregex "CRDTE(Opt)?$")>;
-def : InstRW<[FXb, Lat30], (instregex "PTLB$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "CSP(G)?$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "LPTEA$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "LRA(Y|G)?$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "STRAG$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "LURA(G)?$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "STUR(A|G)$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "TPROT$")>;
+def : InstRW<[WLat30, MCD], (instregex "IPTE(Opt)?(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "IDTE(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "CRDTE(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "PTLB$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "CSP(G)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "LPTEA$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "LRA(Y|G)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "STRAG$")>;
+def : InstRW<[WLat30, MCD], (instregex "LURA(G)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "STUR(A|G)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TPROT$")>;
//===----------------------------------------------------------------------===//
// System: Memory-move Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, FXa, FXb, LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>;
-def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "MVC(S|D)K$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "MVCOS$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVPG$")>;
+def : InstRW<[WLat4LSU, FXa2, FXb, LSU5, GroupAlone], (instregex "MVC(K|P|S)$")>;
+def : InstRW<[WLat1, FXa, LSU5, GroupAlone], (instregex "MVC(S|D)K$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVCOS$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVPG$")>;
//===----------------------------------------------------------------------===//
// System: Address-Space Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat30], (instregex "LASP$")>;
-def : InstRW<[LSU, GroupAlone], (instregex "PALB$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "PC$")>;
-def : InstRW<[FXb, Lat30], (instregex "PR$")>;
-def : InstRW<[FXb, Lat30], (instregex "PT(I)?$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "RP$")>;
-def : InstRW<[FXb, Lat30], (instregex "BS(G|A)$")>;
-def : InstRW<[FXb, Lat20], (instregex "TAR$")>;
+def : InstRW<[WLat30, MCD], (instregex "LASP$")>;
+def : InstRW<[WLat1, LSU, GroupAlone], (instregex "PALB$")>;
+def : InstRW<[WLat30, MCD], (instregex "PC$")>;
+def : InstRW<[WLat30, MCD], (instregex "PR$")>;
+def : InstRW<[WLat30, MCD], (instregex "PT(I)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "RP$")>;
+def : InstRW<[WLat30, MCD], (instregex "BS(G|A)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TAR$")>;
//===----------------------------------------------------------------------===//
// System: Linkage-Stack Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, Lat30, EndGroup], (instregex "BAKR$")>;
-def : InstRW<[FXb, Lat30], (instregex "EREG(G)?$")>;
-def : InstRW<[FXb, Lat30], (instregex "(E|M)STA$")>;
+def : InstRW<[WLat30, MCD], (instregex "BAKR$")>;
+def : InstRW<[WLat30, MCD], (instregex "EREG(G)?$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "(E|M)STA$")>;
//===----------------------------------------------------------------------===//
// System: Time-Related Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, Lat30], (instregex "PTFF$")>;
-def : InstRW<[FXb, LSU, Lat20], (instregex "SCK$")>;
-def : InstRW<[FXb, Lat30], (instregex "SCKPF$")>;
-def : InstRW<[FXb, LSU, Lat20], (instregex "SCKC$")>;
-def : InstRW<[LSU, LSU, GroupAlone], (instregex "SPT$")>;
-def : InstRW<[LSU, LSU, LSU, FXa, FXa, FXb, Lat9, GroupAlone],
- (instregex "STCK(F)?$")>;
-def : InstRW<[LSU, LSU, LSU, LSU, FXa, FXa, FXb, FXb, Lat11, GroupAlone],
- (instregex "STCKE$")>;
-def : InstRW<[FXb, LSU, Lat9], (instregex "STCKC$")>;
-def : InstRW<[LSU, LSU, FXb, Lat5, BeginGroup], (instregex "STPT$")>;
+def : InstRW<[WLat30, MCD], (instregex "PTFF$")>;
+def : InstRW<[WLat30, MCD], (instregex "SCK(PF|C)?$")>;
+def : InstRW<[WLat1, LSU2, GroupAlone], (instregex "SPT$")>;
+def : InstRW<[WLat15, LSU3, FXa2, FXb, GroupAlone], (instregex "STCK(F)?$")>;
+def : InstRW<[WLat20, LSU4, FXa2, FXb2, GroupAlone], (instregex "STCKE$")>;
+def : InstRW<[WLat30, MCD], (instregex "STCKC$")>;
+def : InstRW<[WLat1, LSU2, FXb, Cracked], (instregex "STPT$")>;
//===----------------------------------------------------------------------===//
// System: CPU-Related Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat30], (instregex "STAP$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "STIDP$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "STSI$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "STFL(E)?$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "ECAG$")>;
-def : InstRW<[FXa, LSU, Lat30], (instregex "ECTG$")>;
-def : InstRW<[FXb, Lat30], (instregex "PTF$")>;
-def : InstRW<[FXb, Lat30], (instregex "PCKMO$")>;
+def : InstRW<[WLat30, MCD], (instregex "STAP$")>;
+def : InstRW<[WLat30, MCD], (instregex "STIDP$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "STSI$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "STFL(E)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "ECAG$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "ECTG$")>;
+def : InstRW<[WLat30, MCD], (instregex "PTF$")>;
+def : InstRW<[WLat30, MCD], (instregex "PCKMO$")>;
//===----------------------------------------------------------------------===//
// System: Miscellaneous Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, Lat30], (instregex "SVC$")>;
-def : InstRW<[FXb, GroupAlone], (instregex "MC$")>;
-def : InstRW<[FXb, Lat30], (instregex "DIAG$")>;
-def : InstRW<[FXb], (instregex "TRAC(E|G)$")>;
-def : InstRW<[FXb, Lat30], (instregex "TRAP(2|4)$")>;
-def : InstRW<[FXb, Lat30], (instregex "SIGP$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "SIGA$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "SIE$")>;
+def : InstRW<[WLat30, MCD], (instregex "SVC$")>;
+def : InstRW<[WLat1, FXb, GroupAlone], (instregex "MC$")>;
+def : InstRW<[WLat30, MCD], (instregex "DIAG$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TRAC(E|G)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TRAP(2|4)$")>;
+def : InstRW<[WLat30, MCD], (instregex "SIG(P|A)$")>;
+def : InstRW<[WLat30, MCD], (instregex "SIE$")>;
//===----------------------------------------------------------------------===//
// System: CPU-Measurement Facility Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb], (instregex "LPP$")>;
-def : InstRW<[FXb, Lat30], (instregex "ECPGA$")>;
-def : InstRW<[FXb, Lat30], (instregex "E(C|P)CTR$")>;
-def : InstRW<[FXb, Lat30], (instregex "LCCTL$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "L(P|S)CTL$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "Q(S|CTR)I$")>;
-def : InstRW<[FXb, Lat30], (instregex "S(C|P)CTR$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LPP$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "ECPGA$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "E(C|P)CTR$")>;
+def : InstRW<[WLat30, MCD], (instregex "LCCTL$")>;
+def : InstRW<[WLat30, MCD], (instregex "L(P|S)CTL$")>;
+def : InstRW<[WLat30, MCD], (instregex "Q(S|CTR)I$")>;
+def : InstRW<[WLat30, MCD], (instregex "S(C|P)CTR$")>;
//===----------------------------------------------------------------------===//
// System: I/O Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, Lat30], (instregex "(C|H|R|X)SCH$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "(M|S|ST|T)SCH$")>;
-def : InstRW<[FXb, Lat30], (instregex "RCHP$")>;
-def : InstRW<[FXb, Lat30], (instregex "SCHM$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "STC(PS|RW)$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "TPI$")>;
-def : InstRW<[FXb, Lat30], (instregex "SAL$")>;
+def : InstRW<[WLat30, MCD], (instregex "(C|H|R|X)SCH$")>;
+def : InstRW<[WLat30, MCD], (instregex "(M|S|ST|T)SCH$")>;
+def : InstRW<[WLat30, MCD], (instregex "RCHP$")>;
+def : InstRW<[WLat30, MCD], (instregex "SCHM$")>;
+def : InstRW<[WLat30, MCD], (instregex "STC(PS|RW)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TPI$")>;
+def : InstRW<[WLat30, MCD], (instregex "SAL$")>;
}
diff --git a/lib/Target/SystemZ/SystemZScheduleZ14.td b/lib/Target/SystemZ/SystemZScheduleZ14.td
index 698eb5627d19..515f968e5091 100644
--- a/lib/Target/SystemZ/SystemZScheduleZ14.td
+++ b/lib/Target/SystemZ/SystemZScheduleZ14.td
@@ -10,13 +10,15 @@
// This file defines the machine model for Z14 to support instruction
// scheduling and other instruction cost heuristics.
//
+// Pseudos expanded right after isel do not need to be modelled here.
+//
//===----------------------------------------------------------------------===//
def Z14Model : SchedMachineModel {
let UnsupportedFeatures = Arch12UnsupportedFeatures.List;
- let IssueWidth = 8;
+ let IssueWidth = 6; // Number of instructions decoded per cycle.
let MicroOpBufferSize = 60; // Issue queues
let LoadLatency = 1; // Optimistic load latency.
@@ -27,37 +29,39 @@ def Z14Model : SchedMachineModel {
}
let SchedModel = Z14Model in {
-
-// These definitions could be put in a subtarget common include file,
-// but it seems the include system in Tablegen currently rejects
-// multiple includes of same file.
-def : WriteRes<GroupAlone, []> {
- let NumMicroOps = 0;
- let BeginGroup = 1;
- let EndGroup = 1;
+// These definitions need the SchedModel value. They could be put in a
+// subtarget common include file, but it seems the include system in Tablegen
+// currently (2016) rejects multiple includes of same file.
+
+// Decoder grouping rules
+let NumMicroOps = 1 in {
+ def : WriteRes<NormalGr, []>;
+ def : WriteRes<BeginGroup, []> { let BeginGroup = 1; }
+ def : WriteRes<EndGroup, []> { let EndGroup = 1; }
}
-def : WriteRes<BeginGroup, []> {
- let NumMicroOps = 0;
+def : WriteRes<Cracked, []> {
+ let NumMicroOps = 2;
let BeginGroup = 1;
}
-def : WriteRes<EndGroup, []> {
- let NumMicroOps = 0;
+def : WriteRes<GroupAlone, []> {
+ let NumMicroOps = 3;
+ let BeginGroup = 1;
let EndGroup = 1;
}
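
The grouping writes above carry no latency of their own; they only describe how an instruction is packed into the decoder. Assuming the usual three-slot decode groups on these cores (an assumption, this file does not state the group size), the micro-op counts read naturally: NormalGr, BeginGroup and EndGroup are one micro-op each with optional group boundaries, Cracked is two micro-ops that open a new group, and GroupAlone occupies a whole group by itself. Roughly:

    [ NormalGr | NormalGr | NormalGr ]    three simple ops can share one group
    [ Cracked (two slots)  | NormalGr ]   a cracked op still leaves one slot
    [ GroupAlone (all three slots)    ]   e.g. FLOGR or MLGR further down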
-def : WriteRes<Lat2, []> { let Latency = 2; let NumMicroOps = 0;}
-def : WriteRes<Lat3, []> { let Latency = 3; let NumMicroOps = 0;}
-def : WriteRes<Lat4, []> { let Latency = 4; let NumMicroOps = 0;}
-def : WriteRes<Lat5, []> { let Latency = 5; let NumMicroOps = 0;}
-def : WriteRes<Lat6, []> { let Latency = 6; let NumMicroOps = 0;}
-def : WriteRes<Lat7, []> { let Latency = 7; let NumMicroOps = 0;}
-def : WriteRes<Lat8, []> { let Latency = 8; let NumMicroOps = 0;}
-def : WriteRes<Lat9, []> { let Latency = 9; let NumMicroOps = 0;}
-def : WriteRes<Lat10, []> { let Latency = 10; let NumMicroOps = 0;}
-def : WriteRes<Lat11, []> { let Latency = 11; let NumMicroOps = 0;}
-def : WriteRes<Lat12, []> { let Latency = 12; let NumMicroOps = 0;}
-def : WriteRes<Lat15, []> { let Latency = 15; let NumMicroOps = 0;}
-def : WriteRes<Lat20, []> { let Latency = 20; let NumMicroOps = 0;}
-def : WriteRes<Lat30, []> { let Latency = 30; let NumMicroOps = 0;}
+
+// Incoming latency removed from the register operand which is used together
+// with a memory operand by the instruction.
+def : ReadAdvance<RegReadAdv, 4>;
+
+// LoadLatency (above) is not used for instructions in this file. This is
+// instead the role of LSULatency, which is the latency value added to the
+// result of loads and instructions with folded memory operands.
+def : WriteRes<LSULatency, []> { let Latency = 4; let NumMicroOps = 0; }
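
A short sketch of how these two definitions are meant to interact (an illustration, not taken from the file): LSULatency adds the 4-cycle LSU access to the result of loads and of instructions with a folded memory operand, while the ReadAdvance of 4 on RegReadAdv means the register operand that is combined with the memory operand is not needed until that access has completed, so up to 4 cycles of the producer's latency are hidden. For example:

    lgr   %r1, %r2          result available after 1 cycle (WLat1)
    ag    %r1, 0(%r15)      the folded load takes LSULatency = 4 cycles, and the
                            RegReadAdv read of %r1 hides up to 4 cycles of the
                            producer's latency, so this pair does not stall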
+
+let NumMicroOps = 0 in {
+ foreach L = 1-30 in
+ def : WriteRes<!cast<SchedWrite>("WLat"#L), []> { let Latency = L; }
+}
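
Written out, the loop above is only a compact way of producing thirty latency-only writes; the expanded records would look roughly like:

    def : WriteRes<WLat1,  []> { let Latency = 1;  let NumMicroOps = 0; }
    def : WriteRes<WLat2,  []> { let Latency = 2;  let NumMicroOps = 0; }
    ...
    def : WriteRes<WLat30, []> { let Latency = 30; let NumMicroOps = 0; }

The WLat1-WLat30 SchedWrite names themselves are presumably declared once in the shared SystemZSchedule.td; these bindings give them a latency and nothing else, leaving micro-ops, execution units and decoder grouping to the other writes in each InstRW list.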
// Execution units.
def Z14_FXaUnit : ProcResource<2>;
@@ -66,33 +70,39 @@ def Z14_LSUnit : ProcResource<2>;
def Z14_VecUnit : ProcResource<2>;
def Z14_VecFPdUnit : ProcResource<2> { let BufferSize = 1; /* blocking */ }
def Z14_VBUnit : ProcResource<2>;
+def Z14_MCD : ProcResource<1>;
// Subtarget specific definitions of scheduling resources.
-def : WriteRes<FXa, [Z14_FXaUnit]> { let Latency = 1; }
-def : WriteRes<FXa2, [Z14_FXaUnit, Z14_FXaUnit]> { let Latency = 2; }
-def : WriteRes<FXb, [Z14_FXbUnit]> { let Latency = 1; }
-def : WriteRes<LSU, [Z14_LSUnit]> { let Latency = 4; }
-def : WriteRes<VecBF, [Z14_VecUnit]> { let Latency = 8; }
-def : WriteRes<VecBF2, [Z14_VecUnit, Z14_VecUnit]> { let Latency = 9; }
-def : WriteRes<VecDF, [Z14_VecUnit]> { let Latency = 8; }
-def : WriteRes<VecDF2, [Z14_VecUnit, Z14_VecUnit]> { let Latency = 9; }
-def : WriteRes<VecDFX, [Z14_VecUnit]> { let Latency = 1; }
-def : WriteRes<VecDFX2, [Z14_VecUnit, Z14_VecUnit]> { let Latency = 2; }
-def : WriteRes<VecFPd, [Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit,
- Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit,
- Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit,
- Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit,
- Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit,
- Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit,
- Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit,
- Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit,
- Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit,
- Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit]>
- { let Latency = 30; }
-def : WriteRes<VecMul, [Z14_VecUnit]> { let Latency = 5; }
-def : WriteRes<VecStr, [Z14_VecUnit]> { let Latency = 4; }
-def : WriteRes<VecXsPm, [Z14_VecUnit]> { let Latency = 3; }
-def : WriteRes<VBU, [Z14_VBUnit]>; // Virtual Branching Unit
+let NumMicroOps = 0 in {
+ def : WriteRes<FXa, [Z14_FXaUnit]>;
+ def : WriteRes<FXb, [Z14_FXbUnit]>;
+ def : WriteRes<LSU, [Z14_LSUnit]>;
+ def : WriteRes<VecBF, [Z14_VecUnit]>;
+ def : WriteRes<VecDF, [Z14_VecUnit]>;
+ def : WriteRes<VecDFX, [Z14_VecUnit]>;
+ def : WriteRes<VecMul, [Z14_VecUnit]>;
+ def : WriteRes<VecStr, [Z14_VecUnit]>;
+ def : WriteRes<VecXsPm, [Z14_VecUnit]>;
+ foreach Num = 2-5 in { let ResourceCycles = [Num] in {
+ def : WriteRes<!cast<SchedWrite>("FXa"#Num), [Z14_FXaUnit]>;
+ def : WriteRes<!cast<SchedWrite>("FXb"#Num), [Z14_FXbUnit]>;
+ def : WriteRes<!cast<SchedWrite>("LSU"#Num), [Z14_LSUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecBF"#Num), [Z14_VecUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecDF"#Num), [Z14_VecUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecDFX"#Num), [Z14_VecUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecMul"#Num), [Z14_VecUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecStr"#Num), [Z14_VecUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecXsPm"#Num), [Z14_VecUnit]>;
+ }}
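
The numbered variants produced by this loop model occupancy rather than extra micro-ops: FXa2-FXa5, FXb2-FXb5, LSU2-LSU5 and the Vec* counterparts keep one unit of the given kind busy for that many cycles. For example, the LSU3 used by MVC further down expands to roughly:

    def : WriteRes<LSU3, [Z14_LSUnit]> {
      let ResourceCycles = [3]; // one LSU pipeline occupied for three cycles
      let NumMicroOps = 0;      // the micro-op count comes from the grouping write
    }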
+
+ def : WriteRes<VecFPd, [Z14_VecFPdUnit]> { let ResourceCycles = [30]; }
+
+ def : WriteRes<VBU, [Z14_VBUnit]>; // Virtual Branching Unit
+}
+
+def : WriteRes<MCD, [Z14_MCD]> { let NumMicroOps = 3;
+ let BeginGroup = 1;
+ let EndGroup = 1; }
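
With Z14_MCD bound this way, an instruction whose list ends in MCD issues as three micro-ops that own a whole decode group; it is used for the remaining hard-to-model operations (presumably those handled by millicode), normally together with a pessimistic WLat30. More generally, the InstRW entries that follow share a common shape: typically one latency write per defined value (WLat* or LSULatency), an optional RegReadAdv read when a register input is folded with a memory operand, the execution-unit writes, and a final grouping write. Annotating one entry from the addition section below with that key, and assuming WLatNLSU is WLatN plus LSULatency as defined in the shared SystemZSchedule.td:

    def : InstRW<[WLat2LSU,    // GR result: 2 cycles plus the 4-cycle LSU access
                  WLat2LSU,    // CC result: same latency
                  RegReadAdv,  // the GR input is not needed until the load is done
                  FXa, LSU,    // execution units used
                  NormalGr],   // no special decoder-group constraint
                 (instregex "AG(F|H)$")>;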
// -------------------------- INSTRUCTIONS ---------------------------------- //
@@ -106,27 +116,28 @@ def : WriteRes<VBU, [Z14_VBUnit]>; // Virtual Branching Unit
// Stack allocation
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa], (instregex "ADJDYNALLOC$")>; // Pseudo -> LA / LAY
+// Pseudo -> LA / LAY
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ADJDYNALLOC$")>;
//===----------------------------------------------------------------------===//
// Branch instructions
//===----------------------------------------------------------------------===//
// Branch
-def : InstRW<[VBU], (instregex "(Call)?BRC(L)?(Asm.*)?$")>;
-def : InstRW<[VBU], (instregex "(Call)?J(G)?(Asm.*)?$")>;
-def : InstRW<[FXb], (instregex "(Call)?BC(R)?(Asm.*)?$")>;
-def : InstRW<[FXb], (instregex "(Call)?B(R)?(Asm.*)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "BI(C)?(Asm.*)?$")>;
-def : InstRW<[FXa, EndGroup], (instregex "BRCT(G)?$")>;
-def : InstRW<[FXb, FXa, Lat2, GroupAlone], (instregex "BRCTH$")>;
-def : InstRW<[FXb, FXa, Lat2, GroupAlone], (instregex "BCT(G)?(R)?$")>;
-def : InstRW<[FXa, FXa, FXb, FXb, Lat4, GroupAlone],
+def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Call)?BRC(L)?(Asm.*)?$")>;
+def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Call)?J(G)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "(Call)?BC(R)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "(Call)?B(R)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "BI(C)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXa, EndGroup], (instregex "BRCT(G)?$")>;
+def : InstRW<[WLat1, FXa, FXb, GroupAlone], (instregex "BRCTH$")>;
+def : InstRW<[WLat1, FXa, FXb, GroupAlone], (instregex "BCT(G)?(R)?$")>;
+def : InstRW<[WLat1, FXa2, FXb2, GroupAlone],
(instregex "B(R)?X(H|L).*$")>;
// Compare and branch
-def : InstRW<[FXb], (instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>;
-def : InstRW<[FXb, FXb, Lat2, GroupAlone],
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb2, GroupAlone],
(instregex "C(L)?(G)?(I|R)B(Call|Return|Asm.*)?$")>;
//===----------------------------------------------------------------------===//
@@ -134,609 +145,627 @@ def : InstRW<[FXb, FXb, Lat2, GroupAlone],
//===----------------------------------------------------------------------===//
// Trap
-def : InstRW<[VBU], (instregex "(Cond)?Trap$")>;
+def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Cond)?Trap$")>;
// Compare and trap
-def : InstRW<[FXb], (instregex "C(G)?(I|R)T(Asm.*)?$")>;
-def : InstRW<[FXb], (instregex "CL(G)?RT(Asm.*)?$")>;
-def : InstRW<[FXb], (instregex "CL(F|G)IT(Asm.*)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CL(G)?T(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(G)?(I|R)T(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CL(G)?RT(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CL(F|G)IT(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "CL(G)?T(Asm.*)?$")>;
//===----------------------------------------------------------------------===//
// Call and return instructions
//===----------------------------------------------------------------------===//
// Call
-def : InstRW<[VBU, FXa, FXa, Lat3, GroupAlone], (instregex "(Call)?BRAS$")>;
-def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "(Call)?BRASL$")>;
-def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "(Call)?BAS(R)?$")>;
-def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
+def : InstRW<[WLat1, VBU, FXa2, GroupAlone], (instregex "(Call)?BRAS$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BRASL$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BAS(R)?$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
// Return
-def : InstRW<[FXb, EndGroup], (instregex "Return$")>;
-def : InstRW<[FXb], (instregex "CondReturn$")>;
-
-//===----------------------------------------------------------------------===//
-// Select instructions
-//===----------------------------------------------------------------------===//
-
-// Select pseudo
-def : InstRW<[FXa], (instregex "Select(32|64|32Mux)$")>;
-
-// CondStore pseudos
-def : InstRW<[FXa], (instregex "CondStore16(Inv)?$")>;
-def : InstRW<[FXa], (instregex "CondStore16Mux(Inv)?$")>;
-def : InstRW<[FXa], (instregex "CondStore32(Inv)?$")>;
-def : InstRW<[FXa], (instregex "CondStore32Mux(Inv)?$")>;
-def : InstRW<[FXa], (instregex "CondStore64(Inv)?$")>;
-def : InstRW<[FXa], (instregex "CondStore8(Inv)?$")>;
-def : InstRW<[FXa], (instregex "CondStore8Mux(Inv)?$")>;
+def : InstRW<[WLat1, FXb, EndGroup], (instregex "Return$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CondReturn$")>;
//===----------------------------------------------------------------------===//
// Move instructions
//===----------------------------------------------------------------------===//
// Moves
-def : InstRW<[FXb, LSU, Lat5], (instregex "MV(G|H)?HI$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "MVI(Y)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "MV(G|H)?HI$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "MVI(Y)?$")>;
// Move character
-def : InstRW<[FXb, LSU, LSU, LSU, Lat8, GroupAlone], (instregex "MVC$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>;
+def : InstRW<[WLat1, FXb, LSU3, GroupAlone], (instregex "MVC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "MVCL(E|U)?$")>;
// Pseudo -> reg move
-def : InstRW<[FXa], (instregex "COPY(_TO_REGCLASS)?$")>;
-def : InstRW<[FXa], (instregex "EXTRACT_SUBREG$")>;
-def : InstRW<[FXa], (instregex "INSERT_SUBREG$")>;
-def : InstRW<[FXa], (instregex "REG_SEQUENCE$")>;
-def : InstRW<[FXa], (instregex "SUBREG_TO_REG$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "COPY(_TO_REGCLASS)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "EXTRACT_SUBREG$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "INSERT_SUBREG$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "REG_SEQUENCE$")>;
// Loads
-def : InstRW<[LSU], (instregex "L(Y|FH|RL|Mux|CBB)?$")>;
-def : InstRW<[LSU], (instregex "LG(RL)?$")>;
-def : InstRW<[LSU], (instregex "L128$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L(Y|FH|RL|Mux)?$")>;
+def : InstRW<[LSULatency, LSULatency, LSU, NormalGr], (instregex "LCBB$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LG(RL)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L128$")>;
-def : InstRW<[FXa], (instregex "LLIH(F|H|L)$")>;
-def : InstRW<[FXa], (instregex "LLIL(F|H|L)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLIH(F|H|L)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLIL(F|H|L)$")>;
-def : InstRW<[FXa], (instregex "LG(F|H)I$")>;
-def : InstRW<[FXa], (instregex "LHI(Mux)?$")>;
-def : InstRW<[FXa], (instregex "LR(Mux)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LG(F|H)I$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LHI(Mux)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LR(Mux)?$")>;
// Load and zero rightmost byte
-def : InstRW<[LSU], (instregex "LZR(F|G)$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LZR(F|G)$")>;
// Load and trap
-def : InstRW<[FXb, LSU, Lat5], (instregex "L(FH|G)?AT$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "L(FH|G)?AT$")>;
// Load and test
-def : InstRW<[FXa, LSU, Lat5], (instregex "LT(G)?$")>;
-def : InstRW<[FXa], (instregex "LT(G)?R$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, LSU, FXa, NormalGr], (instregex "LT(G)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LT(G)?R$")>;
// Stores
-def : InstRW<[FXb, LSU, Lat5], (instregex "STG(RL)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "ST128$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "ST(Y|FH|RL|Mux)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STG(RL)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST128$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(Y|FH|RL|Mux)?$")>;
// String moves.
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVST$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "MVST$")>;
//===----------------------------------------------------------------------===//
// Conditional move instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, Lat2], (instregex "LOCRMux$")>;
-def : InstRW<[FXa, Lat2], (instregex "LOC(G|FH)?R(Asm.*)?$")>;
-def : InstRW<[FXa, Lat2], (instregex "LOC(G|H)?HI(Mux|(Asm.*))?$")>;
-def : InstRW<[FXa, LSU, Lat6], (instregex "LOC(G|FH|Mux)?(Asm.*)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "STOC(G|FH|Mux)?(Asm.*)?$")>;
+def : InstRW<[WLat2, FXa, NormalGr], (instregex "LOCRMux$")>;
+def : InstRW<[WLat2, FXa, NormalGr], (instregex "LOC(G|FH)?R(Asm.*)?$")>;
+def : InstRW<[WLat2, FXa, NormalGr], (instregex "LOC(G|H)?HI(Mux|(Asm.*))?$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "LOC(G|FH|Mux)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr],
+ (instregex "STOC(G|FH|Mux)?(Asm.*)?$")>;
//===----------------------------------------------------------------------===//
// Sign extensions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa], (instregex "L(B|H|G)R$")>;
-def : InstRW<[FXa], (instregex "LG(B|H|F)R$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "L(B|H|G)R$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LG(B|H|F)R$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LTGF$")>;
-def : InstRW<[FXa], (instregex "LTGFR$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, FXa, LSU, NormalGr], (instregex "LTGF$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LTGFR$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LB(H|Mux)?$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LH(Y)?$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LH(H|Mux|RL)$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LG(B|H|F)$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LG(H|F)RL$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LB(H|Mux)?$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LH(Y)?$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LH(H|Mux|RL)$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LG(B|H|F)$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LG(H|F)RL$")>;
//===----------------------------------------------------------------------===//
// Zero extensions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa], (instregex "LLCR(Mux)?$")>;
-def : InstRW<[FXa], (instregex "LLHR(Mux)?$")>;
-def : InstRW<[FXa], (instregex "LLG(C|H|F|T)R$")>;
-def : InstRW<[LSU], (instregex "LLC(Mux)?$")>;
-def : InstRW<[LSU], (instregex "LLH(Mux)?$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LL(C|H)H$")>;
-def : InstRW<[LSU], (instregex "LLHRL$")>;
-def : InstRW<[LSU], (instregex "LLG(C|H|F|T|HRL|FRL)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLCR(Mux)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLHR(Mux)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLG(C|H|F|T)R$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLC(Mux)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLH(Mux)?$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LL(C|H)H$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLHRL$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLG(C|H|F|T|HRL|FRL)$")>;
// Load and zero rightmost byte
-def : InstRW<[LSU], (instregex "LLZRGF$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLZRGF$")>;
// Load and trap
-def : InstRW<[FXb, LSU, Lat5], (instregex "LLG(F|T)?AT$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "LLG(F|T)?AT$")>;
//===----------------------------------------------------------------------===//
// Truncations
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "STCM(H|Y)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STC(H|Y|Mux)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STH(H|Y|RL|Mux)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STCM(H|Y)?$")>;
//===----------------------------------------------------------------------===//
// Multi-register moves
//===----------------------------------------------------------------------===//
// Load multiple (estimated average of 5 ops)
-def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
- (instregex "LM(H|Y|G)?$")>;
+def : InstRW<[WLat10, WLat10, LSU5, GroupAlone], (instregex "LM(H|Y|G)?$")>;
// Load multiple disjoint
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "LMD$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "LMD$")>;
-// Store multiple (estimated average of ceil(5/2) FXb ops)
-def : InstRW<[LSU, LSU, FXb, FXb, FXb, Lat10,
- GroupAlone], (instregex "STM(G|H|Y)?$")>;
+// Store multiple
+def : InstRW<[WLat1, LSU2, FXb3, GroupAlone], (instregex "STM(G|H|Y)?$")>;
//===----------------------------------------------------------------------===//
// Byte swaps
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa], (instregex "LRV(G)?R$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LRV(G|H)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "STRV(G|H)?$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LRV(G)?R$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LRV(G|H)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STRV(G|H)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVCIN$")>;
//===----------------------------------------------------------------------===//
// Load address instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa], (instregex "LA(Y|RL)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LA(Y|RL)?$")>;
// Load the Global Offset Table address ( -> larl )
-def : InstRW<[FXa], (instregex "GOT$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "GOT$")>;
//===----------------------------------------------------------------------===//
// Absolute and Negation
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa], (instregex "LP(G)?R$")>;
-def : InstRW<[FXa, FXa, Lat2, BeginGroup], (instregex "L(N|P)GFR$")>;
-def : InstRW<[FXa], (instregex "LN(R|GR)$")>;
-def : InstRW<[FXa], (instregex "LC(R|GR)$")>;
-def : InstRW<[FXa, FXa, Lat2, BeginGroup], (instregex "LCGFR$")>;
+def : InstRW<[WLat1, WLat1, FXa, NormalGr], (instregex "LP(G)?R$")>;
+def : InstRW<[WLat2, WLat2, FXa2, Cracked], (instregex "L(N|P)GFR$")>;
+def : InstRW<[WLat1, WLat1, FXa, NormalGr], (instregex "LN(R|GR)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LC(R|GR)$")>;
+def : InstRW<[WLat2, WLat2, FXa2, Cracked], (instregex "LCGFR$")>;
//===----------------------------------------------------------------------===//
// Insertion
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat5], (instregex "IC(Y)?$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "IC32(Y)?$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "ICM(H|Y)?$")>;
-def : InstRW<[FXa], (instregex "II(F|H|L)Mux$")>;
-def : InstRW<[FXa], (instregex "IIHF(64)?$")>;
-def : InstRW<[FXa], (instregex "IIHH(64)?$")>;
-def : InstRW<[FXa], (instregex "IIHL(64)?$")>;
-def : InstRW<[FXa], (instregex "IILF(64)?$")>;
-def : InstRW<[FXa], (instregex "IILH(64)?$")>;
-def : InstRW<[FXa], (instregex "IILL(64)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], (instregex "IC(Y)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "IC32(Y)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, WLat1LSU, FXa, LSU, NormalGr],
+ (instregex "ICM(H|Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "II(F|H|L)Mux$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IIHF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IIHH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IIHL(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IILF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IILH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IILL(64)?$")>;
//===----------------------------------------------------------------------===//
// Addition
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat5], (instregex "A(Y)?$")>;
-def : InstRW<[FXa, LSU, Lat6], (instregex "AH(Y)?$")>;
-def : InstRW<[FXa], (instregex "AIH$")>;
-def : InstRW<[FXa], (instregex "AFI(Mux)?$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "AG$")>;
-def : InstRW<[FXa], (instregex "AGFI$")>;
-def : InstRW<[FXa], (instregex "AGHI(K)?$")>;
-def : InstRW<[FXa], (instregex "AGR(K)?$")>;
-def : InstRW<[FXa], (instregex "AHI(K)?$")>;
-def : InstRW<[FXa], (instregex "AHIMux(K)?$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "AL(Y)?$")>;
-def : InstRW<[FXa], (instregex "AL(FI|HSIK)$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "ALG(F)?$")>;
-def : InstRW<[FXa], (instregex "ALGHSIK$")>;
-def : InstRW<[FXa], (instregex "ALGF(I|R)$")>;
-def : InstRW<[FXa], (instregex "ALGR(K)?$")>;
-def : InstRW<[FXa], (instregex "ALR(K)?$")>;
-def : InstRW<[FXa], (instregex "AR(K)?$")>;
-def : InstRW<[FXa], (instregex "A(L)?HHHR$")>;
-def : InstRW<[FXa, Lat2], (instregex "A(L)?HHLR$")>;
-def : InstRW<[FXa], (instregex "ALSIH(N)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "A(L)?(G)?SI$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "A(Y)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "AH(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AIH$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AFI(Mux)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "AG$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AGFI$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AGHI(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AGR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AHI(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AHIMux(K)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "AL(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AL(FI|HSIK)$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "ALG(F)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALGHSIK$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALGF(I|R)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALGR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "A(L)?HHHR$")>;
+def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "A(L)?HHLR$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALSIH(N)?$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "A(L)?(G)?SI$")>;
// Logical addition with carry
-def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "ALC(G)?$")>;
-def : InstRW<[FXa, Lat2, GroupAlone], (instregex "ALC(G)?R$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, GroupAlone],
+ (instregex "ALC(G)?$")>;
+def : InstRW<[WLat2, WLat2, FXa, GroupAlone], (instregex "ALC(G)?R$")>;
// Add with sign extension (16/32 -> 64)
-def : InstRW<[FXa, LSU, Lat6], (instregex "AG(F|H)$")>;
-def : InstRW<[FXa, Lat2], (instregex "AGFR$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "AG(F|H)$")>;
+def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "AGFR$")>;
//===----------------------------------------------------------------------===//
// Subtraction
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat5], (instregex "S(G|Y)?$")>;
-def : InstRW<[FXa, LSU, Lat6], (instregex "SH(Y)?$")>;
-def : InstRW<[FXa], (instregex "SGR(K)?$")>;
-def : InstRW<[FXa], (instregex "SLFI$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "SL(G|GF|Y)?$")>;
-def : InstRW<[FXa], (instregex "SLGF(I|R)$")>;
-def : InstRW<[FXa], (instregex "SLGR(K)?$")>;
-def : InstRW<[FXa], (instregex "SLR(K)?$")>;
-def : InstRW<[FXa], (instregex "SR(K)?$")>;
-def : InstRW<[FXa], (instregex "S(L)?HHHR$")>;
-def : InstRW<[FXa, Lat2], (instregex "S(L)?HHLR$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "S(G|Y)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "SH(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SGR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLFI$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "SL(G|GF|Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLGF(I|R)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLGR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "S(L)?HHHR$")>;
+def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "S(L)?HHLR$")>;
// Subtraction with borrow
-def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "SLB(G)?$")>;
-def : InstRW<[FXa, Lat2, GroupAlone], (instregex "SLB(G)?R$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, GroupAlone],
+ (instregex "SLB(G)?$")>;
+def : InstRW<[WLat2, WLat2, FXa, GroupAlone], (instregex "SLB(G)?R$")>;
// Subtraction with sign extension (16/32 -> 64)
-def : InstRW<[FXa, LSU, Lat6], (instregex "SG(F|H)$")>;
-def : InstRW<[FXa, Lat2], (instregex "SGFR$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "SG(F|H)$")>;
+def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "SGFR$")>;
//===----------------------------------------------------------------------===//
// AND
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat5], (instregex "N(G|Y)?$")>;
-def : InstRW<[FXa], (instregex "NGR(K)?$")>;
-def : InstRW<[FXa], (instregex "NI(FMux|HMux|LMux)$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "NI(Y)?$")>;
-def : InstRW<[FXa], (instregex "NIHF(64)?$")>;
-def : InstRW<[FXa], (instregex "NIHH(64)?$")>;
-def : InstRW<[FXa], (instregex "NIHL(64)?$")>;
-def : InstRW<[FXa], (instregex "NILF(64)?$")>;
-def : InstRW<[FXa], (instregex "NILH(64)?$")>;
-def : InstRW<[FXa], (instregex "NILL(64)?$")>;
-def : InstRW<[FXa], (instregex "NR(K)?$")>;
-def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "NC$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "N(G|Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NGR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NI(FMux|HMux|LMux)$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "NI(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NIHF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NIHH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NIHL(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NILF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NILH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NILL(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NR(K)?$")>;
+def : InstRW<[WLat3LSU, LSU2, FXb, Cracked], (instregex "NC$")>;
//===----------------------------------------------------------------------===//
// OR
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat5], (instregex "O(G|Y)?$")>;
-def : InstRW<[FXa], (instregex "OGR(K)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "OI(Y)?$")>;
-def : InstRW<[FXa], (instregex "OI(FMux|HMux|LMux)$")>;
-def : InstRW<[FXa], (instregex "OIHF(64)?$")>;
-def : InstRW<[FXa], (instregex "OIHH(64)?$")>;
-def : InstRW<[FXa], (instregex "OIHL(64)?$")>;
-def : InstRW<[FXa], (instregex "OILF(64)?$")>;
-def : InstRW<[FXa], (instregex "OILH(64)?$")>;
-def : InstRW<[FXa], (instregex "OILL(64)?$")>;
-def : InstRW<[FXa], (instregex "OR(K)?$")>;
-def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "OC$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "O(G|Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OGR(K)?$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "OI(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OI(FMux|HMux|LMux)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OIHF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OIHH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OIHL(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OILF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OILH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OILL(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OR(K)?$")>;
+def : InstRW<[WLat3LSU, LSU2, FXb, Cracked], (instregex "OC$")>;
//===----------------------------------------------------------------------===//
// XOR
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat5], (instregex "X(G|Y)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "XI(Y)?$")>;
-def : InstRW<[FXa], (instregex "XIFMux$")>;
-def : InstRW<[FXa], (instregex "XGR(K)?$")>;
-def : InstRW<[FXa], (instregex "XIHF(64)?$")>;
-def : InstRW<[FXa], (instregex "XILF(64)?$")>;
-def : InstRW<[FXa], (instregex "XR(K)?$")>;
-def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "XC$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "X(G|Y)?$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "XI(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "XIFMux$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "XGR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "XIHF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "XILF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "XR(K)?$")>;
+def : InstRW<[WLat3LSU, LSU2, FXb, Cracked], (instregex "XC$")>;
//===----------------------------------------------------------------------===//
// Multiplication
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat9], (instregex "MS(GF|Y)?$")>;
-def : InstRW<[FXa, Lat5], (instregex "MS(R|FI)$")>;
-def : InstRW<[FXa, LSU, Lat11], (instregex "MSG$")>;
-def : InstRW<[FXa, Lat7], (instregex "MSGR$")>;
-def : InstRW<[FXa, Lat5], (instregex "MSGF(I|R)$")>;
-def : InstRW<[FXa2, LSU, Lat12, GroupAlone], (instregex "MLG$")>;
-def : InstRW<[FXa2, Lat8, GroupAlone], (instregex "MLGR$")>;
-def : InstRW<[FXa, Lat4], (instregex "MGHI$")>;
-def : InstRW<[FXa, Lat4], (instregex "MHI$")>;
-def : InstRW<[FXa, LSU, Lat8], (instregex "MH(Y)?$")>;
-def : InstRW<[FXa2, Lat6, GroupAlone], (instregex "M(L)?R$")>;
-def : InstRW<[FXa2, LSU, Lat10, GroupAlone], (instregex "M(FY|L)?$")>;
-def : InstRW<[FXa, LSU, Lat8], (instregex "MGH$")>;
-def : InstRW<[FXa, FXa, LSU, Lat12, GroupAlone], (instregex "MG$")>;
-def : InstRW<[FXa, FXa, Lat8, GroupAlone], (instregex "MGRK$")>;
-def : InstRW<[FXa, LSU, Lat9], (instregex "MSC$")>;
-def : InstRW<[FXa, LSU, Lat11], (instregex "MSGC$")>;
-def : InstRW<[FXa, Lat5], (instregex "MSRKC$")>;
-def : InstRW<[FXa, Lat7], (instregex "MSGRKC$")>;
+def : InstRW<[WLat5LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "MS(GF|Y)?$")>;
+def : InstRW<[WLat5, FXa, NormalGr], (instregex "MS(R|FI)$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, FXa, LSU, NormalGr], (instregex "MSG$")>;
+def : InstRW<[WLat7, FXa, NormalGr], (instregex "MSGR$")>;
+def : InstRW<[WLat5, FXa, NormalGr], (instregex "MSGF(I|R)$")>;
+def : InstRW<[WLat8LSU, RegReadAdv, FXa2, LSU, GroupAlone], (instregex "MLG$")>;
+def : InstRW<[WLat8, FXa2, GroupAlone], (instregex "MLGR$")>;
+def : InstRW<[WLat4, FXa, NormalGr], (instregex "MGHI$")>;
+def : InstRW<[WLat4, FXa, NormalGr], (instregex "MHI$")>;
+def : InstRW<[WLat4LSU, RegReadAdv, FXa, LSU, NormalGr], (instregex "MH(Y)?$")>;
+def : InstRW<[WLat6, FXa2, GroupAlone], (instregex "M(L)?R$")>;
+def : InstRW<[WLat6LSU, RegReadAdv, FXa2, LSU, GroupAlone],
+ (instregex "M(FY|L)?$")>;
+def : InstRW<[WLat8, RegReadAdv, FXa, LSU, NormalGr], (instregex "MGH$")>;
+def : InstRW<[WLat12, RegReadAdv, FXa2, LSU, GroupAlone], (instregex "MG$")>;
+def : InstRW<[WLat8, FXa2, GroupAlone], (instregex "MGRK$")>;
+def : InstRW<[WLat6LSU, WLat6LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "MSC$")>;
+def : InstRW<[WLat8LSU, WLat8LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "MSGC$")>;
+def : InstRW<[WLat6, WLat6, FXa, NormalGr], (instregex "MSRKC$")>;
+def : InstRW<[WLat8, WLat8, FXa, NormalGr], (instregex "MSGRKC$")>;
//===----------------------------------------------------------------------===//
// Division and remainder
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DR$")>;
-def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "D$")>;
-def : InstRW<[FXa2, Lat30, GroupAlone], (instregex "DSG(F)?R$")>;
-def : InstRW<[LSU, FXa2, Lat30, GroupAlone], (instregex "DSG(F)?$")>;
-def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DLR$")>;
-def : InstRW<[FXa2, FXa2, Lat30, GroupAlone], (instregex "DLGR$")>;
-def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "DL(G)?$")>;
+def : InstRW<[WLat20, FXa4, GroupAlone], (instregex "DR$")>;
+def : InstRW<[WLat30, RegReadAdv, FXa4, LSU, GroupAlone], (instregex "D$")>;
+def : InstRW<[WLat30, FXa2, GroupAlone], (instregex "DSG(F)?R$")>;
+def : InstRW<[WLat30, RegReadAdv, FXa2, LSU, GroupAlone],
+ (instregex "DSG(F)?$")>;
+def : InstRW<[WLat20, FXa4, GroupAlone], (instregex "DLR$")>;
+def : InstRW<[WLat30, FXa4, GroupAlone], (instregex "DLGR$")>;
+def : InstRW<[WLat30, RegReadAdv, FXa4, LSU, GroupAlone], (instregex "DL(G)?$")>;
//===----------------------------------------------------------------------===//
// Shifts
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa], (instregex "SLL(G|K)?$")>;
-def : InstRW<[FXa], (instregex "SRL(G|K)?$")>;
-def : InstRW<[FXa], (instregex "SRA(G|K)?$")>;
-def : InstRW<[FXa], (instregex "SLA(G|K)?$")>;
-def : InstRW<[FXa, FXa, FXa, FXa, LSU, Lat8, GroupAlone],
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLL(G|K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SRL(G|K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SRA(G|K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLA(G|K)?$")>;
+def : InstRW<[WLat5LSU, WLat5LSU, FXa4, LSU, GroupAlone],
(instregex "S(L|R)D(A|L)$")>;
// Rotate
-def : InstRW<[FXa, LSU, Lat6], (instregex "RLL(G)?$")>;
+def : InstRW<[WLat2LSU, FXa, LSU, NormalGr], (instregex "RLL(G)?$")>;
// Rotate and insert
-def : InstRW<[FXa], (instregex "RISBG(N|32)?$")>;
-def : InstRW<[FXa], (instregex "RISBH(G|H|L)$")>;
-def : InstRW<[FXa], (instregex "RISBL(G|H|L)$")>;
-def : InstRW<[FXa], (instregex "RISBMux$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBG(N|32)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBH(G|H|L)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBL(G|H|L)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBMux$")>;
// Rotate and Select
-def : InstRW<[FXa, FXa, Lat2, BeginGroup], (instregex "R(N|O|X)SBG$")>;
+def : InstRW<[WLat2, WLat2, FXa2, Cracked], (instregex "R(N|O|X)SBG$")>;
//===----------------------------------------------------------------------===//
// Comparison
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat5], (instregex "C(G|Y|Mux|RL)?$")>;
-def : InstRW<[FXb], (instregex "C(F|H)I(Mux)?$")>;
-def : InstRW<[FXb], (instregex "CG(F|H)I$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CG(HSI|RL)$")>;
-def : InstRW<[FXb], (instregex "C(G)?R$")>;
-def : InstRW<[FXb], (instregex "CIH$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CH(F|SI)$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CL(Y|Mux|FHSI)?$")>;
-def : InstRW<[FXb], (instregex "CLFI(Mux)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CLG(HRL|HSI)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CLGF(RL)?$")>;
-def : InstRW<[FXb], (instregex "CLGF(I|R)$")>;
-def : InstRW<[FXb], (instregex "CLGR$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CLGRL$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CLH(F|RL|HSI)$")>;
-def : InstRW<[FXb], (instregex "CLIH$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CLI(Y)?$")>;
-def : InstRW<[FXb], (instregex "CLR$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CLRL$")>;
-def : InstRW<[FXb], (instregex "C(L)?HHR$")>;
-def : InstRW<[FXb, Lat2], (instregex "C(L)?HLR$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr],
+ (instregex "C(G|Y|Mux)?$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CRL$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(F|H)I(Mux)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CG(F|H)I$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CG(HSI|RL)$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(G)?R$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CIH$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CHF$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CHSI$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr],
+ (instregex "CL(Y|Mux)?$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLFHSI$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLFI(Mux)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CLG$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLG(HRL|HSI)$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CLGF$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLGFRL$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLGF(I|R)$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLGR$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLGRL$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CLHF$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLH(RL|HSI)$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLIH$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLI(Y)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLR$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLRL$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(L)?HHR$")>;
+def : InstRW<[WLat2, FXb, NormalGr], (instregex "C(L)?HLR$")>;
// Compare halfword
-def : InstRW<[FXb, LSU, Lat6], (instregex "CH(Y|RL)?$")>;
-def : InstRW<[FXb, LSU, Lat6], (instregex "CGH(RL)?$")>;
-def : InstRW<[FXa, FXb, LSU, Lat6, BeginGroup], (instregex "CHHSI$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CH(Y)?$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "CHRL$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CGH$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "CGHRL$")>;
+def : InstRW<[WLat2LSU, FXa, FXb, LSU, Cracked], (instregex "CHHSI$")>;
// Compare with sign extension (32 -> 64)
-def : InstRW<[FXb, LSU, Lat6], (instregex "CGF(RL)?$")>;
-def : InstRW<[FXb, Lat2], (instregex "CGFR$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CGF$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "CGFRL$")>;
+def : InstRW<[WLat2, FXb, NormalGr], (instregex "CGFR$")>;
// Compare logical character
-def : InstRW<[FXb, LSU, LSU, Lat9, BeginGroup], (instregex "CLC$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>;
+def : InstRW<[WLat6, FXb, LSU2, Cracked], (instregex "CLC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CLCL(E|U)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CLST$")>;
// Test under mask
-def : InstRW<[FXb, LSU, Lat5], (instregex "TM(Y)?$")>;
-def : InstRW<[FXb], (instregex "TM(H|L)Mux$")>;
-def : InstRW<[FXb], (instregex "TMHH(64)?$")>;
-def : InstRW<[FXb], (instregex "TMHL(64)?$")>;
-def : InstRW<[FXb], (instregex "TMLH(64)?$")>;
-def : InstRW<[FXb], (instregex "TMLL(64)?$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "TM(Y)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TM(H|L)Mux$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMHH(64)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMHL(64)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMLH(64)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMLL(64)?$")>;
// Compare logical characters under mask
-def : InstRW<[FXb, LSU, Lat6], (instregex "CLM(H|Y)?$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr],
+ (instregex "CLM(H|Y)?$")>;
//===----------------------------------------------------------------------===//
// Prefetch and execution hint
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU], (instregex "PFD(RL)?$")>;
-def : InstRW<[FXb, Lat2], (instregex "BPP$")>;
+def : InstRW<[WLat1, LSU, NormalGr], (instregex "PFD(RL)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "BPP$")>;
def : InstRW<[FXb, EndGroup], (instregex "BPRP$")>;
-def : InstRW<[FXb], (instregex "NIAI$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "NIAI$")>;
//===----------------------------------------------------------------------===//
// Atomic operations
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, EndGroup], (instregex "Serialize$")>;
+def : InstRW<[WLat1, FXb, EndGroup], (instregex "Serialize$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "LAA(G)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "LAAL(G)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "LAN(G)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "LAO(G)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "LAX(G)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAA(G)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAAL(G)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAN(G)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAO(G)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAX(G)?$")>;
// Test and set
-def : InstRW<[FXb, LSU, Lat5, EndGroup], (instregex "TS$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, EndGroup], (instregex "TS$")>;
// Compare and swap
-def : InstRW<[FXa, FXb, LSU, Lat6, GroupAlone], (instregex "CS(G|Y)?$")>;
+def : InstRW<[WLat3LSU, WLat3LSU, FXa, FXb, LSU, GroupAlone],
+ (instregex "CS(G|Y)?$")>;
// Compare double and swap
-def : InstRW<[FXa, FXa, FXb, FXb, FXa, LSU, Lat10, GroupAlone],
+def : InstRW<[WLat6LSU, WLat6LSU, FXa3, FXb2, LSU, GroupAlone],
(instregex "CDS(Y)?$")>;
-def : InstRW<[FXa, FXa, FXb, FXb, LSU, FXb, FXb, LSU, LSU, Lat20, GroupAlone],
- (instregex "CDSG$")>;
+def : InstRW<[WLat15, WLat15, FXa2, FXb4, LSU3,
+ GroupAlone], (instregex "CDSG$")>;
// Compare and swap and store
-def : InstRW<[FXa, LSU, Lat30], (instregex "CSST$")>;
+def : InstRW<[WLat30, MCD], (instregex "CSST$")>;
// Perform locked operation
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>;
+def : InstRW<[WLat30, MCD], (instregex "PLO$")>;
// Load/store pair from/to quadword
-def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPQ$")>;
-def : InstRW<[FXb, FXb, LSU, Lat6, GroupAlone], (instregex "STPQ$")>;
+def : InstRW<[WLat4LSU, LSU2, GroupAlone], (instregex "LPQ$")>;
+def : InstRW<[WLat1, FXb2, LSU, GroupAlone], (instregex "STPQ$")>;
// Load pair disjoint
-def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, LSU2, GroupAlone], (instregex "LPD(G)?$")>;
//===----------------------------------------------------------------------===//
// Translate and convert
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TR$")>;
-def : InstRW<[FXa, FXa, FXa, LSU, LSU, Lat30, GroupAlone], (instregex "TRT$")>;
-def : InstRW<[FXa, LSU, Lat30], (instregex "TRTR$")>;
-def : InstRW<[FXa, Lat30], (instregex "TR(TR)?(T)?(E|EOpt)?$")>;
-def : InstRW<[LSU, Lat30], (instregex "TR(T|O)(T|O)(Opt)?$")>;
-def : InstRW<[FXa, Lat30], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
-def : InstRW<[FXa, Lat30], (instregex "(CUUTF|CUTFU)(Opt)?$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "TR$")>;
+def : InstRW<[WLat30, WLat30, WLat30, FXa3, LSU2, GroupAlone],
+ (instregex "TRT$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TRTR$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "TRE$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TRT(R)?E(Opt)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TR(T|O)(T|O)(Opt)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD],
+ (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "(CUUTF|CUTFU)(Opt)?$")>;
//===----------------------------------------------------------------------===//
// Message-security assist
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, Lat30], (instregex "KM(C|F|O|CTR|A)?$")>;
-def : InstRW<[FXa, Lat30], (instregex "(KIMD|KLMD|KMAC)$")>;
-def : InstRW<[FXa, Lat30], (instregex "(PCC|PPNO|PRNO)$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD],
+ (instregex "KM(C|F|O|CTR|A)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD],
+ (instregex "(KIMD|KLMD|KMAC)$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD],
+ (instregex "(PCC|PPNO|PRNO)$")>;
//===----------------------------------------------------------------------===//
// Guarded storage
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU], (instregex "LGG$")>;
-def : InstRW<[LSU, Lat5], (instregex "LLGFSG$")>;
-def : InstRW<[LSU, Lat30], (instregex "(L|ST)GSC$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LGG$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLGFSG$")>;
+def : InstRW<[WLat30, MCD], (instregex "(L|ST)GSC$")>;
//===----------------------------------------------------------------------===//
// Decimal arithmetic
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, VecDF, VecDF, LSU, LSU, Lat30, GroupAlone],
+def : InstRW<[WLat30, RegReadAdv, FXb, VecDF2, LSU2, GroupAlone],
(instregex "CVBG$")>;
-def : InstRW<[FXb, VecDF, LSU, Lat30, GroupAlone], (instregex "CVB(Y)?$")>;
-def : InstRW<[FXb, FXb, FXb, VecDF2, VecDF2, LSU, Lat30, GroupAlone],
- (instregex "CVDG$")>;
-def : InstRW<[FXb, VecDF, FXb, LSU, Lat30, GroupAlone], (instregex "CVD(Y)?$")>;
-def : InstRW<[LSU, Lat10, GroupAlone], (instregex "MVO$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z)$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
-def : InstRW<[LSU, Lat12, GroupAlone], (instregex "UNPK(A|U)$")>;
-def : InstRW<[FXb, LSU, LSU, Lat9, BeginGroup], (instregex "UNPK$")>;
-
-def : InstRW<[FXb, VecDFX, LSU, LSU, LSU, Lat9, GroupAlone],
+def : InstRW<[WLat30, RegReadAdv, FXb, VecDF, LSU, GroupAlone],
+ (instregex "CVB(Y)?$")>;
+def : InstRW<[WLat1, FXb3, VecDF4, LSU, GroupAlone], (instregex "CVDG$")>;
+def : InstRW<[WLat1, FXb2, VecDF, LSU, GroupAlone], (instregex "CVD(Y)?$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "MV(N|O|Z)$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
+def : InstRW<[WLat12, LSU5, GroupAlone], (instregex "UNPK(A|U)$")>;
+def : InstRW<[WLat1, FXb, LSU2, Cracked], (instregex "UNPK$")>;
+
+def : InstRW<[WLat5LSU, FXb, VecDFX, LSU3, GroupAlone],
(instregex "(A|S|ZA)P$")>;
-def : InstRW<[FXb, VecDFX2, VecDFX2, LSU, LSU, LSU, Lat30, GroupAlone],
- (instregex "(M|D)P$")>;
-def : InstRW<[FXb, VecDFX, VecDFX, LSU, LSU, Lat15, GroupAlone],
- (instregex "SRP$")>;
-def : InstRW<[VecDFX, LSU, LSU, Lat5, GroupAlone], (instregex "CP$")>;
-def : InstRW<[VecDFX, LSU, Lat4, BeginGroup], (instregex "TP$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>;
+def : InstRW<[WLat1, FXb, VecDFX4, LSU3, GroupAlone], (instregex "(M|D)P$")>;
+def : InstRW<[WLat15, FXb, VecDFX2, LSU2, GroupAlone], (instregex "SRP$")>;
+def : InstRW<[WLat8, VecDFX, LSU, LSU, GroupAlone], (instregex "CP$")>;
+def : InstRW<[WLat3LSU, VecDFX, LSU, Cracked], (instregex "TP$")>;
+def : InstRW<[WLat30, MCD], (instregex "ED(MK)?$")>;
//===----------------------------------------------------------------------===//
// Access registers
//===----------------------------------------------------------------------===//
// Extract/set/copy access register
-def : InstRW<[LSU], (instregex "(EAR|SAR|CPYA)$")>;
+def : InstRW<[WLat3, LSU, NormalGr], (instregex "(EAR|SAR|CPYA)$")>;
// Load address extended
-def : InstRW<[LSU, FXa, Lat5, BeginGroup], (instregex "LAE(Y)?$")>;
+def : InstRW<[WLat5, LSU, FXa, Cracked], (instregex "LAE(Y)?$")>;
// Load/store access multiple (not modeled precisely)
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(L|ST)AM(Y)?$")>;
+def : InstRW<[WLat20, WLat20, LSU5, GroupAlone], (instregex "LAM(Y)?$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "STAM(Y)?$")>;
//===----------------------------------------------------------------------===//
// Program mask and addressing mode
//===----------------------------------------------------------------------===//
// Insert Program Mask
-def : InstRW<[FXa, Lat3, EndGroup], (instregex "IPM$")>;
+def : InstRW<[WLat3, FXa, EndGroup], (instregex "IPM$")>;
// Set Program Mask
-def : InstRW<[LSU, EndGroup], (instregex "SPM$")>;
+def : InstRW<[WLat3, LSU, EndGroup], (instregex "SPM$")>;
// Branch and link
-def : InstRW<[FXa, FXa, FXb, Lat5, GroupAlone], (instregex "BAL(R)?$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "BAL(R)?$")>;
// Test addressing mode
-def : InstRW<[FXb], (instregex "TAM$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TAM$")>;
// Set addressing mode
-def : InstRW<[FXb, Lat2, EndGroup], (instregex "SAM(24|31|64)$")>;
+def : InstRW<[WLat1, FXb, EndGroup], (instregex "SAM(24|31|64)$")>;
// Branch (and save) and set mode.
-def : InstRW<[FXa, FXb, Lat2, GroupAlone], (instregex "BSM$")>;
-def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "BASSM$")>;
+def : InstRW<[WLat1, FXa, FXb, GroupAlone], (instregex "BSM$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "BASSM$")>;
//===----------------------------------------------------------------------===//
// Transactional execution
//===----------------------------------------------------------------------===//
// Transaction begin
-def : InstRW<[LSU, LSU, FXb, FXb, FXb, FXb, FXb, Lat15, GroupAlone],
- (instregex "TBEGIN(C|_nofloat)?$")>;
+def : InstRW<[WLat9, LSU2, FXb5, GroupAlone], (instregex "TBEGIN(C)?$")>;
// Transaction end
-def : InstRW<[FXb, GroupAlone], (instregex "TEND$")>;
+def : InstRW<[WLat1, FXb, GroupAlone], (instregex "TEND$")>;
// Transaction abort
-def : InstRW<[LSU, GroupAlone], (instregex "TABORT$")>;
+def : InstRW<[WLat30, MCD], (instregex "TABORT$")>;
// Extract Transaction Nesting Depth
-def : InstRW<[FXa], (instregex "ETND$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ETND$")>;
// Nontransactional store
-def : InstRW<[FXb, LSU, Lat5], (instregex "NTSTG$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "NTSTG$")>;
//===----------------------------------------------------------------------===//
// Processor assist
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, GroupAlone], (instregex "PPA$")>;
+def : InstRW<[WLat1, FXb, GroupAlone], (instregex "PPA$")>;
//===----------------------------------------------------------------------===//
// Miscellaneous Instructions.
//===----------------------------------------------------------------------===//
// Find leftmost one
-def : InstRW<[FXa, FXa, Lat4, GroupAlone], (instregex "FLOGR$")>;
+def : InstRW<[WLat5, WLat5, FXa2, GroupAlone], (instregex "FLOGR$")>;
// Population count
-def : InstRW<[FXa, Lat3], (instregex "POPCNT$")>;
-
-// Extend
-def : InstRW<[FXa], (instregex "AEXT128$")>;
-def : InstRW<[FXa], (instregex "ZEXT128$")>;
+def : InstRW<[WLat3, WLat3, FXa, NormalGr], (instregex "POPCNT$")>;
// String instructions
-def : InstRW<[FXa, LSU, Lat30], (instregex "SRST$")>;
-def : InstRW<[FXa, Lat30], (instregex "SRSTU$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "SRST(U)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CUSE$")>;
// Various complex instructions
-def : InstRW<[LSU, Lat30], (instregex "CFC$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "UPT$")>;
-def : InstRW<[LSU, Lat30], (instregex "CKSM$")>;
-def : InstRW<[FXa, Lat30], (instregex "CMPSC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "CFC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, WLat30, WLat30, MCD],
+ (instregex "UPT$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CKSM$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "CMPSC$")>;
// Execute
-def : InstRW<[FXb, GroupAlone], (instregex "EX(RL)?$")>;
+def : InstRW<[WLat1, FXb, GroupAlone], (instregex "EX(RL)?$")>;
//===----------------------------------------------------------------------===//
// .insn directive instructions
@@ -750,168 +779,158 @@ def : InstRW<[], (instregex "Insn.*")>;
// ----------------------------- Floating point ----------------------------- //
//===----------------------------------------------------------------------===//
-// FP: Select instructions
-//===----------------------------------------------------------------------===//
-
-def : InstRW<[FXa], (instregex "Select(F32|F64|F128|VR128)$")>;
-def : InstRW<[FXa], (instregex "CondStoreF32(Inv)?$")>;
-def : InstRW<[FXa], (instregex "CondStoreF64(Inv)?$")>;
-
-//===----------------------------------------------------------------------===//
// FP: Move instructions
//===----------------------------------------------------------------------===//
// Load zero
-def : InstRW<[FXb], (instregex "LZ(DR|ER)$")>;
-def : InstRW<[FXb, FXb, Lat2, BeginGroup], (instregex "LZXR$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER)$")>;
+def : InstRW<[WLat2, FXb2, Cracked], (instregex "LZXR$")>;
// Load
-def : InstRW<[VecXsPm], (instregex "LER$")>;
-def : InstRW<[FXb], (instregex "LD(R|R32|GR)$")>;
-def : InstRW<[FXb, Lat3], (instregex "LGDR$")>;
-def : InstRW<[FXb, FXb, Lat2, GroupAlone], (instregex "LXR$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[WLat3, FXb, NormalGr], (instregex "LGDR$")>;
+def : InstRW<[WLat2, FXb2, GroupAlone], (instregex "LXR$")>;
// Load and Test
-def : InstRW<[VecXsPm, Lat4], (instregex "LT(D|E)BR$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "LTEBRCompare(_VecPseudo)?$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "LTDBRCompare(_VecPseudo)?$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXBR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone],
- (instregex "LTXBRCompare(_VecPseudo)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)BR$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)BRCompare$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone],
+ (instregex "LTXBR(Compare)?$")>;
// Copy sign
-def : InstRW<[VecXsPm], (instregex "CPSDRd(d|s)$")>;
-def : InstRW<[VecXsPm], (instregex "CPSDRs(d|s)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "CPSDR(d|s)(d|s)$")>;
//===----------------------------------------------------------------------===//
// FP: Load instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[VecXsPm, LSU, Lat7], (instregex "LE(Y)?$")>;
-def : InstRW<[LSU], (instregex "LD(Y|E32)?$")>;
-def : InstRW<[LSU], (instregex "LX$")>;
+def : InstRW<[WLat2LSU, VecXsPm, LSU, NormalGr], (instregex "LE(Y)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LD(Y|E32)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>;
//===----------------------------------------------------------------------===//
// FP: Store instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat7], (instregex "STD(Y)?$")>;
-def : InstRW<[FXb, LSU, Lat7], (instregex "STE(Y)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "STX$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(E|D)(Y)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STX$")>;
//===----------------------------------------------------------------------===//
// FP: Conversion instructions
//===----------------------------------------------------------------------===//
// Load rounded
-def : InstRW<[VecBF], (instregex "LEDBR(A)?$")>;
-def : InstRW<[VecDF, VecDF, Lat20], (instregex "LEXBR(A)?$")>;
-def : InstRW<[VecDF, VecDF, Lat20], (instregex "LDXBR(A)?$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "LEDBR(A)?$")>;
+def : InstRW<[WLat9, VecDF2, NormalGr], (instregex "L(E|D)XBR(A)?$")>;
// Load lengthened
-def : InstRW<[VecBF, LSU, Lat12], (instregex "LDEB$")>;
-def : InstRW<[VecBF], (instregex "LDEBR$")>;
-def : InstRW<[VecBF2, VecBF2, LSU, Lat12 , GroupAlone], (instregex "LX(D|E)B$")>;
-def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "LX(D|E)BR$")>;
+def : InstRW<[WLat7LSU, VecBF, LSU, NormalGr], (instregex "LDEB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "LDEBR$")>;
+def : InstRW<[WLat8LSU, VecBF4, LSU, GroupAlone], (instregex "LX(E|D)B$")>;
+def : InstRW<[WLat8, VecBF4, GroupAlone], (instregex "LX(E|D)BR$")>;
// Convert from fixed / logical
-def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CE(F|G)BR(A)?$")>;
-def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CD(F|G)BR(A)?$")>;
-def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CX(F|G)BR(A)?$")>;
-def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CEL(F|G)BR$")>;
-def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CDL(F|G)BR$")>;
-def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CXL(F|G)BR$")>;
+def : InstRW<[WLat8, FXb, VecBF, Cracked], (instregex "C(E|D)(F|G)BR(A)?$")>;
+def : InstRW<[WLat11, FXb, VecDF4, GroupAlone], (instregex "CX(F|G)BR(A)?$")>;
+def : InstRW<[WLat8, FXb, VecBF, Cracked], (instregex "C(E|D)L(F|G)BR$")>;
+def : InstRW<[WLat11, FXb, VecDF4, GroupAlone], (instregex "CXL(F|G)BR$")>;
// Convert to fixed / logical
-def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CF(E|D)BR(A)?$")>;
-def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CG(E|D)BR(A)?$")>;
-def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "C(F|G)XBR(A)?$")>;
-def : InstRW<[FXb, VecBF, Lat11, GroupAlone], (instregex "CLFEBR$")>;
-def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CLFDBR$")>;
-def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CLG(E|D)BR$")>;
-def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "CL(F|G)XBR$")>;
+def : InstRW<[WLat10, WLat10, FXb, VecBF, Cracked],
+ (instregex "C(F|G)(E|D)BR(A)?$")>;
+def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked],
+ (instregex "C(F|G)XBR(A)?$")>;
+def : InstRW<[WLat10, WLat10, FXb, VecBF, GroupAlone], (instregex "CLFEBR$")>;
+def : InstRW<[WLat10, WLat10, FXb, VecBF, Cracked], (instregex "CLFDBR$")>;
+def : InstRW<[WLat10, WLat10, FXb, VecBF, Cracked], (instregex "CLG(E|D)BR$")>;
+def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "CL(F|G)XBR$")>;
//===----------------------------------------------------------------------===//
// FP: Unary arithmetic
//===----------------------------------------------------------------------===//
// Load Complement / Negative / Positive
-def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)DBR$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)EBR$")>;
-def : InstRW<[FXb], (instregex "LCDFR(_32)?$")>;
-def : InstRW<[FXb], (instregex "LNDFR(_32)?$")>;
-def : InstRW<[FXb], (instregex "LPDFR(_32)?$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "L(C|N|P)XBR$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XBR$")>;
// Square root
-def : InstRW<[VecFPd, LSU], (instregex "SQ(E|D)B$")>;
-def : InstRW<[VecFPd], (instregex "SQ(E|D)BR$")>;
-def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "SQXBR$")>;
+def : InstRW<[WLat30, VecFPd, LSU, NormalGr], (instregex "SQ(E|D)B$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "SQ(E|D)BR$")>;
+def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "SQXBR$")>;
// Load FP integer
-def : InstRW<[VecBF], (instregex "FIEBR(A)?$")>;
-def : InstRW<[VecBF], (instregex "FIDBR(A)?$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXBR(A)?$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "FI(E|D)BR(A)?$")>;
+def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "FIXBR(A)?$")>;
//===----------------------------------------------------------------------===//
// FP: Binary arithmetic
//===----------------------------------------------------------------------===//
// Addition
-def : InstRW<[VecBF, LSU, Lat12], (instregex "A(E|D)B$")>;
-def : InstRW<[VecBF], (instregex "A(E|D)BR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXBR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "A(E|D)B$")>;
+def : InstRW<[WLat7, WLat7, VecBF, NormalGr], (instregex "A(E|D)BR$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "AXBR$")>;
// Subtraction
-def : InstRW<[VecBF, LSU, Lat12], (instregex "S(E|D)B$")>;
-def : InstRW<[VecBF], (instregex "S(E|D)BR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXBR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "S(E|D)B$")>;
+def : InstRW<[WLat7, WLat7, VecBF, NormalGr], (instregex "S(E|D)BR$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "SXBR$")>;
// Multiply
-def : InstRW<[VecBF, LSU, Lat12], (instregex "M(D|DE|EE)B$")>;
-def : InstRW<[VecBF], (instregex "M(D|DE|EE)BR$")>;
-def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MXDB$")>;
-def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MXDBR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat20, GroupAlone], (instregex "MXBR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "M(D|DE|EE)B$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "M(D|DE|EE)BR$")>;
+def : InstRW<[WLat8LSU, RegReadAdv, VecBF4, LSU, GroupAlone],
+ (instregex "MXDB$")>;
+def : InstRW<[WLat8, VecBF4, GroupAlone], (instregex "MXDBR$")>;
+def : InstRW<[WLat20, VecDF4, GroupAlone], (instregex "MXBR$")>;
// Multiply and add / subtract
-def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>;
-def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)EBR$")>;
-def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>;
-def : InstRW<[VecBF], (instregex "M(A|S)DBR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone],
+ (instregex "M(A|S)EB$")>;
+def : InstRW<[WLat7, VecBF, GroupAlone], (instregex "M(A|S)EBR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone],
+ (instregex "M(A|S)DB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "M(A|S)DBR$")>;
// Division
-def : InstRW<[VecFPd, LSU], (instregex "D(E|D)B$")>;
-def : InstRW<[VecFPd], (instregex "D(E|D)BR$")>;
-def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXBR$")>;
+def : InstRW<[WLat30, RegReadAdv, VecFPd, LSU, NormalGr],
+ (instregex "D(E|D)B$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "D(E|D)BR$")>;
+def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "DXBR$")>;
// Divide to integer
-def : InstRW<[VecFPd, Lat30], (instregex "DI(E|D)BR$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "DI(E|D)BR$")>;
//===----------------------------------------------------------------------===//
// FP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[VecXsPm, LSU, Lat8], (instregex "(K|C)(E|D)B$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "(K|C)(E|D)BR?$")>;
-def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "(K|C)XBR$")>;
+def : InstRW<[WLat3LSU, RegReadAdv, VecXsPm, LSU, NormalGr],
+ (instregex "(K|C)(E|D)B$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "(K|C)(E|D)BR$")>;
+def : InstRW<[WLat9, VecDF2, GroupAlone], (instregex "(K|C)XBR$")>;
// Test Data Class
-def : InstRW<[LSU, VecXsPm, Lat9], (instregex "TC(E|D)B$")>;
-def : InstRW<[LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "TCXB$")>;
+def : InstRW<[WLat5, LSU, VecXsPm, NormalGr], (instregex "TC(E|D)B$")>;
+def : InstRW<[WLat10, LSU2, VecDF4, GroupAlone], (instregex "TCXB$")>;
//===----------------------------------------------------------------------===//
// FP: Floating-point control register instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat4, GroupAlone], (instregex "EFPC$")>;
-def : InstRW<[FXb, LSU, Lat5, GroupAlone], (instregex "STFPC$")>;
-def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>;
-def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>;
-def : InstRW<[FXa, Lat30], (instregex "SFASR$")>;
-def : InstRW<[FXa, LSU, Lat30], (instregex "LFAS$")>;
-def : InstRW<[FXb, Lat3, GroupAlone], (instregex "SRNM(B|T)?$")>;
+def : InstRW<[WLat4, FXa, LSU, GroupAlone], (instregex "EFPC$")>;
+def : InstRW<[WLat1, FXb, LSU, GroupAlone], (instregex "STFPC$")>;
+def : InstRW<[WLat3, LSU, GroupAlone], (instregex "SFPC$")>;
+def : InstRW<[WLat3LSU, LSU2, GroupAlone], (instregex "LFPC$")>;
+def : InstRW<[WLat30, MCD], (instregex "SFASR$")>;
+def : InstRW<[WLat30, MCD], (instregex "LFAS$")>;
+def : InstRW<[WLat3, FXb, GroupAlone], (instregex "SRNM(B|T)?$")>;
// --------------------- Hexadecimal floating point ------------------------- //
@@ -921,108 +940,111 @@ def : InstRW<[FXb, Lat3, GroupAlone], (instregex "SRNM(B|T)?$")>;
//===----------------------------------------------------------------------===//
// Load and Test
-def : InstRW<[VecXsPm, Lat4], (instregex "LT(D|E)R$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXR$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)R$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "LTXR$")>;
//===----------------------------------------------------------------------===//
// HFP: Conversion instructions
//===----------------------------------------------------------------------===//
// Load rounded
-def : InstRW<[VecBF], (instregex "(LEDR|LRER)$")>;
-def : InstRW<[VecBF], (instregex "LEXR$")>;
-def : InstRW<[VecDF2], (instregex "(LDXR|LRDR)$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "(LEDR|LRER)$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "LEXR$")>;
+def : InstRW<[WLat9, VecDF2, NormalGr], (instregex "(LDXR|LRDR)$")>;
// Load lengthened
-def : InstRW<[LSU], (instregex "LDE$")>;
-def : InstRW<[FXb], (instregex "LDER$")>;
-def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "LX(D|E)$")>;
-def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "LX(D|E)R$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LDE$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LDER$")>;
+def : InstRW<[WLat8LSU, VecBF4, LSU, GroupAlone], (instregex "LX(E|D)$")>;
+def : InstRW<[WLat8, VecBF4, GroupAlone], (instregex "LX(E|D)R$")>;
// Convert from fixed
-def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CE(F|G)R$")>;
-def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CD(F|G)R$")>;
-def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CX(F|G)R$")>;
+def : InstRW<[WLat8, FXb, VecBF, Cracked], (instregex "C(E|D)(F|G)R$")>;
+def : InstRW<[WLat11, FXb, VecDF4, GroupAlone], (instregex "CX(F|G)R$")>;
// Convert to fixed
-def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CF(E|D)R$")>;
-def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CG(E|D)R$")>;
-def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "C(F|G)XR$")>;
+def : InstRW<[WLat10, WLat10, FXb, VecBF, Cracked], (instregex "C(F|G)(E|D)R$")>;
+def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "C(F|G)XR$")>;
// Convert BFP to HFP / HFP to BFP.
-def : InstRW<[VecBF], (instregex "THD(E)?R$")>;
-def : InstRW<[VecBF], (instregex "TB(E)?DR$")>;
+def : InstRW<[WLat7, WLat7, VecBF, NormalGr], (instregex "THD(E)?R$")>;
+def : InstRW<[WLat7, WLat7, VecBF, NormalGr], (instregex "TB(E)?DR$")>;
//===----------------------------------------------------------------------===//
// HFP: Unary arithmetic
//===----------------------------------------------------------------------===//
// Load Complement / Negative / Positive
-def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)DR$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)ER$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "L(C|N|P)XR$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)R$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XR$")>;
// Halve
-def : InstRW<[VecBF], (instregex "H(E|D)R$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "H(E|D)R$")>;
// Square root
-def : InstRW<[VecFPd, LSU], (instregex "SQ(E|D)$")>;
-def : InstRW<[VecFPd], (instregex "SQ(E|D)R$")>;
-def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "SQXR$")>;
+def : InstRW<[WLat30, VecFPd, LSU, NormalGr], (instregex "SQ(E|D)$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "SQ(E|D)R$")>;
+def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "SQXR$")>;
// Load FP integer
-def : InstRW<[VecBF], (instregex "FIER$")>;
-def : InstRW<[VecBF], (instregex "FIDR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXR$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "FI(E|D)R$")>;
+def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "FIXR$")>;
//===----------------------------------------------------------------------===//
// HFP: Binary arithmetic
//===----------------------------------------------------------------------===//
// Addition
-def : InstRW<[VecBF, LSU, Lat12], (instregex "A(E|D|U|W)$")>;
-def : InstRW<[VecBF], (instregex "A(E|D|U|W)R$")>;
-def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "A(E|D|U|W)$")>;
+def : InstRW<[WLat7, WLat7, VecBF, NormalGr], (instregex "A(E|D|U|W)R$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "AXR$")>;
// Subtraction
-def : InstRW<[VecBF, LSU, Lat12], (instregex "S(E|D|U|W)$")>;
-def : InstRW<[VecBF], (instregex "S(E|D|U|W)R$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "S(E|D|U|W)$")>;
+def : InstRW<[WLat7, WLat7, VecBF, NormalGr], (instregex "S(E|D|U|W)R$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "SXR$")>;
// Multiply
-def : InstRW<[VecBF, LSU, Lat12], (instregex "M(D|DE|E|EE)$")>;
-def : InstRW<[VecBF], (instregex "M(D|DE|E|EE)R$")>;
-def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MXD$")>;
-def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MXDR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat20, GroupAlone], (instregex "MXR$")>;
-def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MY$")>;
-def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "MY(H|L)$")>;
-def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MYR$")>;
-def : InstRW<[VecBF, GroupAlone], (instregex "MY(H|L)R$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "M(D|DE|E|EE)$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "M(D|DE|E|EE)R$")>;
+def : InstRW<[WLat8LSU, RegReadAdv, VecBF4, LSU, GroupAlone],
+ (instregex "MXD$")>;
+def : InstRW<[WLat8, VecBF4, GroupAlone], (instregex "MXDR$")>;
+def : InstRW<[WLat30, VecDF4, GroupAlone], (instregex "MXR$")>;
+def : InstRW<[WLat8LSU, RegReadAdv, VecBF4, LSU, GroupAlone], (instregex "MY$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, VecBF2, LSU, GroupAlone],
+ (instregex "MY(H|L)$")>;
+def : InstRW<[WLat8, VecBF4, GroupAlone], (instregex "MYR$")>;
+def : InstRW<[WLat7, VecBF, GroupAlone], (instregex "MY(H|L)R$")>;
// Multiply and add / subtract
-def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>;
-def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)ER$")>;
-def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>;
-def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)DR$")>;
-def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)$")>;
-def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MAY$")>;
-def : InstRW<[VecBF, GroupAlone], (instregex "MAY(H|L)R$")>;
-def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MAYR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone],
+ (instregex "M(A|S)(E|D)$")>;
+def : InstRW<[WLat7, VecBF, GroupAlone], (instregex "M(A|S)(E|D)R$")>;
+def : InstRW<[WLat8LSU, RegReadAdv, RegReadAdv, VecBF4, LSU, GroupAlone],
+ (instregex "MAY$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone],
+ (instregex "MAY(H|L)$")>;
+def : InstRW<[WLat8, VecBF4, GroupAlone], (instregex "MAYR$")>;
+def : InstRW<[WLat7, VecBF, GroupAlone], (instregex "MAY(H|L)R$")>;
// Division
-def : InstRW<[VecFPd, LSU], (instregex "D(E|D)$")>;
-def : InstRW<[VecFPd], (instregex "D(E|D)R$")>;
-def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXR$")>;
+def : InstRW<[WLat30, RegReadAdv, VecFPd, LSU, NormalGr], (instregex "D(E|D)$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "D(E|D)R$")>;
+def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "DXR$")>;
//===----------------------------------------------------------------------===//
// HFP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[VecBF, LSU, Lat12], (instregex "C(E|D)$")>;
-def : InstRW<[VecBF], (instregex "C(E|D)R$")>;
-def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "CXR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "C(E|D)$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "C(E|D)R$")>;
+def : InstRW<[WLat10, VecDF2, GroupAlone], (instregex "CXR$")>;
// ------------------------ Decimal floating point -------------------------- //
@@ -1032,121 +1054,123 @@ def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "CXR$")>;
//===----------------------------------------------------------------------===//
// Load and Test
-def : InstRW<[VecDF], (instregex "LTDTR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXTR$")>;
+def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "LTDTR$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "LTXTR$")>;
//===----------------------------------------------------------------------===//
// DFP: Conversion instructions
//===----------------------------------------------------------------------===//
// Load rounded
-def : InstRW<[VecDF, Lat15], (instregex "LEDTR$")>;
-def : InstRW<[VecDF, VecDF, Lat20], (instregex "LDXTR$")>;
+def : InstRW<[WLat15, VecDF, NormalGr], (instregex "LEDTR$")>;
+def : InstRW<[WLat15, VecDF2, NormalGr], (instregex "LDXTR$")>;
// Load lengthened
-def : InstRW<[VecDF], (instregex "LDETR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LXDTR$")>;
+def : InstRW<[WLat8, VecDF, NormalGr], (instregex "LDETR$")>;
+def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "LXDTR$")>;
// Convert from fixed / logical
-def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CD(F|G)TR(A)?$")>;
-def : InstRW<[FXb, VecDF2, VecDF2, Lat30, GroupAlone], (instregex "CX(F|G)TR(A)?$")>;
-def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CDL(F|G)TR$")>;
-def : InstRW<[FXb, VecDF2, VecDF2, Lat30, GroupAlone], (instregex "CXL(F|G)TR$")>;
+def : InstRW<[WLat30, FXb, VecDF, Cracked], (instregex "CD(F|G)TR(A)?$")>;
+def : InstRW<[WLat30, FXb, VecDF4, GroupAlone], (instregex "CX(F|G)TR(A)?$")>;
+def : InstRW<[WLat30, FXb, VecDF, Cracked], (instregex "CDL(F|G)TR$")>;
+def : InstRW<[WLat30, FXb, VecDF4, GroupAlone], (instregex "CXL(F|G)TR$")>;
// Convert to fixed / logical
-def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "C(F|G)DTR(A)?$")>;
-def : InstRW<[FXb, VecDF, VecDF, Lat30, BeginGroup], (instregex "C(F|G)XTR(A)?$")>;
-def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CL(F|G)DTR$")>;
-def : InstRW<[FXb, VecDF, VecDF, Lat30, BeginGroup], (instregex "CL(F|G)XTR$")>;
+def : InstRW<[WLat30, WLat30, FXb, VecDF, Cracked],
+ (instregex "C(F|G)DTR(A)?$")>;
+def : InstRW<[WLat30, WLat30, FXb, VecDF2, Cracked],
+ (instregex "C(F|G)XTR(A)?$")>;
+def : InstRW<[WLat30, WLat30, FXb, VecDF, Cracked], (instregex "CL(F|G)DTR$")>;
+def : InstRW<[WLat30, WLat30, FXb, VecDF2, Cracked], (instregex "CL(F|G)XTR$")>;
// Convert from / to signed / unsigned packed
-def : InstRW<[FXb, VecDF, Lat9, BeginGroup], (instregex "CD(S|U)TR$")>;
-def : InstRW<[FXb, FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CX(S|U)TR$")>;
-def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "C(S|U)DTR$")>;
-def : InstRW<[FXb, FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "C(S|U)XTR$")>;
+def : InstRW<[WLat9, FXb, VecDF, Cracked], (instregex "CD(S|U)TR$")>;
+def : InstRW<[WLat12, FXb2, VecDF4, GroupAlone], (instregex "CX(S|U)TR$")>;
+def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "C(S|U)DTR$")>;
+def : InstRW<[WLat15, FXb2, VecDF4, GroupAlone], (instregex "C(S|U)XTR$")>;
// Convert from / to zoned
-def : InstRW<[LSU, VecDF, Lat11, BeginGroup], (instregex "CDZT$")>;
-def : InstRW<[LSU, LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CXZT$")>;
-def : InstRW<[FXb, LSU, VecDF, Lat11, BeginGroup], (instregex "CZDT$")>;
-def : InstRW<[FXb, LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "CZXT$")>;
+def : InstRW<[WLat8LSU, LSU, VecDF, Cracked], (instregex "CDZT$")>;
+def : InstRW<[WLat16LSU, LSU2, VecDF4, GroupAlone], (instregex "CXZT$")>;
+def : InstRW<[WLat1, FXb, LSU, VecDF, Cracked], (instregex "CZDT$")>;
+def : InstRW<[WLat1, FXb, LSU, VecDF2, GroupAlone], (instregex "CZXT$")>;
// Convert from / to packed
-def : InstRW<[LSU, VecDF, Lat11, BeginGroup], (instregex "CDPT$")>;
-def : InstRW<[LSU, LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CXPT$")>;
-def : InstRW<[FXb, LSU, VecDF, Lat11, BeginGroup], (instregex "CPDT$")>;
-def : InstRW<[FXb, LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "CPXT$")>;
+def : InstRW<[WLat8LSU, LSU, VecDF, Cracked], (instregex "CDPT$")>;
+def : InstRW<[WLat16LSU, LSU2, VecDF4, GroupAlone], (instregex "CXPT$")>;
+def : InstRW<[WLat1, FXb, LSU, VecDF, Cracked], (instregex "CPDT$")>;
+def : InstRW<[WLat1, FXb, LSU, VecDF2, GroupAlone], (instregex "CPXT$")>;
// Perform floating-point operation
-def : InstRW<[FXb, Lat30], (instregex "PFPO$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "PFPO$")>;
//===----------------------------------------------------------------------===//
// DFP: Unary arithmetic
//===----------------------------------------------------------------------===//
// Load FP integer
-def : InstRW<[VecDF], (instregex "FIDTR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXTR$")>;
+def : InstRW<[WLat8, VecDF, NormalGr], (instregex "FIDTR$")>;
+def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "FIXTR$")>;
// Extract biased exponent
-def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "EEDTR$")>;
-def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "EEXTR$")>;
+def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "EEDTR$")>;
+def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "EEXTR$")>;
// Extract significance
-def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "ESDTR$")>;
-def : InstRW<[FXb, VecDF, VecDF, Lat15, BeginGroup], (instregex "ESXTR$")>;
+def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "ESDTR$")>;
+def : InstRW<[WLat12, FXb, VecDF2, Cracked], (instregex "ESXTR$")>;
//===----------------------------------------------------------------------===//
// DFP: Binary arithmetic
//===----------------------------------------------------------------------===//
// Addition
-def : InstRW<[VecDF], (instregex "ADTR(A)?$")>;
-def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXTR(A)?$")>;
+def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "ADTR(A)?$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "AXTR(A)?$")>;
// Subtraction
-def : InstRW<[VecDF], (instregex "SDTR(A)?$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXTR(A)?$")>;
+def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "SDTR(A)?$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "SXTR(A)?$")>;
// Multiply
-def : InstRW<[VecDF, Lat30], (instregex "MDTR(A)?$")>;
-def : InstRW<[VecDF2, VecDF2, Lat30, GroupAlone], (instregex "MXTR(A)?$")>;
+def : InstRW<[WLat30, VecDF, NormalGr], (instregex "MDTR(A)?$")>;
+def : InstRW<[WLat30, VecDF4, GroupAlone], (instregex "MXTR(A)?$")>;
// Division
-def : InstRW<[VecDF, Lat30], (instregex "DDTR(A)?$")>;
-def : InstRW<[VecDF2, VecDF2, Lat30, GroupAlone], (instregex "DXTR(A)?$")>;
+def : InstRW<[WLat30, VecDF, NormalGr], (instregex "DDTR(A)?$")>;
+def : InstRW<[WLat30, VecDF4, GroupAlone], (instregex "DXTR(A)?$")>;
// Quantize
-def : InstRW<[VecDF], (instregex "QADTR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "QAXTR$")>;
+def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "QADTR$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "QAXTR$")>;
// Reround
-def : InstRW<[FXb, VecDF, Lat11, BeginGroup], (instregex "RRDTR$")>;
-def : InstRW<[FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "RRXTR$")>;
+def : InstRW<[WLat9, WLat9, FXb, VecDF, Cracked], (instregex "RRDTR$")>;
+def : InstRW<[WLat11, WLat11, FXb, VecDF4, GroupAlone], (instregex "RRXTR$")>;
// Shift significand left/right
-def : InstRW<[LSU, VecDF, Lat11, GroupAlone], (instregex "S(L|R)DT$")>;
-def : InstRW<[LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "S(L|R)XT$")>;
+def : InstRW<[WLat11LSU, LSU, VecDF, GroupAlone], (instregex "S(L|R)DT$")>;
+def : InstRW<[WLat11LSU, LSU, VecDF4, GroupAlone], (instregex "S(L|R)XT$")>;
// Insert biased exponent
-def : InstRW<[FXb, VecDF, Lat11, BeginGroup], (instregex "IEDTR$")>;
-def : InstRW<[FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "IEXTR$")>;
+def : InstRW<[WLat9, FXb, VecDF, Cracked], (instregex "IEDTR$")>;
+def : InstRW<[WLat11, FXb, VecDF4, GroupAlone], (instregex "IEXTR$")>;
//===----------------------------------------------------------------------===//
// DFP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[VecDF], (instregex "(K|C)DTR$")>;
-def : InstRW<[VecDF, VecDF, Lat11, GroupAlone], (instregex "(K|C)XTR$")>;
+def : InstRW<[WLat8, VecDF, NormalGr], (instregex "(K|C)DTR$")>;
+def : InstRW<[WLat9, VecDF2, GroupAlone], (instregex "(K|C)XTR$")>;
// Compare biased exponent
-def : InstRW<[VecDF], (instregex "CEDTR$")>;
-def : InstRW<[VecDF], (instregex "CEXTR$")>;
+def : InstRW<[WLat8, VecDF, NormalGr], (instregex "CEDTR$")>;
+def : InstRW<[WLat8, VecDF, NormalGr], (instregex "CEXTR$")>;
// Test Data Class/Group
-def : InstRW<[LSU, VecDF, Lat11], (instregex "TD(C|G)(E|D)T$")>;
-def : InstRW<[LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>;
+def : InstRW<[WLat15, LSU, VecDF, NormalGr], (instregex "TD(C|G)(E|D)T$")>;
+def : InstRW<[WLat15, LSU, VecDF2, GroupAlone], (instregex "TD(C|G)XT$")>;
// --------------------------------- Vector --------------------------------- //
@@ -1155,298 +1179,307 @@ def : InstRW<[LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>;
// Vector: Move instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb], (instregex "VLR(32|64)?$")>;
-def : InstRW<[FXb, Lat4], (instregex "VLGV(B|F|G|H)?$")>;
-def : InstRW<[FXb], (instregex "VLVG(B|F|G|H)?$")>;
-def : InstRW<[FXb, Lat2], (instregex "VLVGP(32)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "VLR(32|64)?$")>;
+def : InstRW<[WLat3, FXb, NormalGr], (instregex "VLGV(B|F|G|H)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "VLVG(B|F|G|H)?$")>;
+def : InstRW<[WLat3, FXb, NormalGr], (instregex "VLVGP(32)?$")>;
//===----------------------------------------------------------------------===//
// Vector: Immediate instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[VecXsPm], (instregex "VZERO$")>;
-def : InstRW<[VecXsPm], (instregex "VONE$")>;
-def : InstRW<[VecXsPm], (instregex "VGBM$")>;
-def : InstRW<[VecXsPm], (instregex "VGM(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VREPI(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VLEI(B|F|G|H)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VZERO$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VONE$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VGBM$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VGM(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VREPI(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLEI(B|F|G|H)$")>;
//===----------------------------------------------------------------------===//
// Vector: Loads
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU], (instregex "VL(L|BB)?$")>;
-def : InstRW<[LSU], (instregex "VL(32|64)$")>;
-def : InstRW<[LSU], (instregex "VLLEZ(B|F|G|H|LF)?$")>;
-def : InstRW<[LSU], (instregex "VLREP(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm, LSU, Lat7], (instregex "VLE(B|F|G|H)$")>;
-def : InstRW<[FXb, LSU, VecXsPm, Lat11, BeginGroup], (instregex "VGE(F|G)$")>;
-def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
- (instregex "VLM$")>;
-def : InstRW<[LSU, Lat5], (instregex "VLRL(R)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(BB)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLL$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(32|64)$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEZ(B|F|G|H|LF)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLREP(B|F|G|H)?$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr],
+ (instregex "VLE(B|F|G|H)$")>;
+def : InstRW<[WLat5LSU, RegReadAdv, FXb, LSU, VecXsPm, Cracked],
+ (instregex "VGE(F|G)$")>;
+def : InstRW<[WLat4LSU, WLat4LSU, LSU5, GroupAlone], (instregex "VLM$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLRL(R)?$")>;
//===----------------------------------------------------------------------===//
// Vector: Stores
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat8], (instregex "VST(L|32|64)?$")>;
-def : InstRW<[FXb, LSU, Lat8], (instregex "VSTE(F|G)$")>;
-def : InstRW<[FXb, LSU, VecXsPm, Lat11, BeginGroup], (instregex "VSTE(B|H)$")>;
-def : InstRW<[LSU, LSU, FXb, FXb, FXb, FXb, FXb, Lat20, GroupAlone],
- (instregex "VSTM$")>;
-def : InstRW<[FXb, FXb, LSU, Lat12, BeginGroup], (instregex "VSCE(F|G)$")>;
-def : InstRW<[FXb, LSU, Lat8], (instregex "VSTRL(R)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(L|32|64)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTE(F|G)$")>;
+def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTE(B|H)$")>;
+def : InstRW<[WLat1, LSU2, FXb3, GroupAlone], (instregex "VSTM$")>;
+def : InstRW<[WLat1, FXb2, LSU, Cracked], (instregex "VSCE(F|G)$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTRL(R)?$")>;
//===----------------------------------------------------------------------===//
// Vector: Selects and permutes
//===----------------------------------------------------------------------===//
-def : InstRW<[VecXsPm], (instregex "VMRH(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VMRL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VPERM$")>;
-def : InstRW<[VecXsPm], (instregex "VPDI$")>;
-def : InstRW<[VecXsPm], (instregex "VBPERM$")>;
-def : InstRW<[VecXsPm], (instregex "VREP(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VSEL$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMRH(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMRL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPERM$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPDI$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VBPERM$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VREP(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSEL$")>;
//===----------------------------------------------------------------------===//
// Vector: Widening and narrowing
//===----------------------------------------------------------------------===//
-def : InstRW<[VecXsPm], (instregex "VPK(F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VPKS(F|G|H)?$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "VPKS(F|G|H)S$")>;
-def : InstRW<[VecXsPm], (instregex "VPKLS(F|G|H)?$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "VPKLS(F|G|H)S$")>;
-def : InstRW<[VecXsPm], (instregex "VSEG(B|F|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VUPH(B|F|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VUPL(B|F)?$")>;
-def : InstRW<[VecXsPm], (instregex "VUPLH(B|F|H|W)?$")>;
-def : InstRW<[VecXsPm], (instregex "VUPLL(B|F|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPK(F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPKS(F|G|H)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VPKS(F|G|H)S$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPKLS(F|G|H)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VPKLS(F|G|H)S$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSEG(B|F|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPH(B|F|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPL(B|F)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPLH(B|F|H|W)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPLL(B|F|H)?$")>;
//===----------------------------------------------------------------------===//
// Vector: Integer arithmetic
//===----------------------------------------------------------------------===//
-def : InstRW<[VecXsPm], (instregex "VA(B|F|G|H|Q|C|CQ)?$")>;
-def : InstRW<[VecXsPm], (instregex "VACC(B|F|G|H|Q|C|CQ)?$")>;
-def : InstRW<[VecXsPm], (instregex "VAVG(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VAVGL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VN(C|O|N|X)?$")>;
-def : InstRW<[VecXsPm], (instregex "VO(C)?$")>;
-def : InstRW<[VecMul], (instregex "VCKSM$")>;
-def : InstRW<[VecXsPm], (instregex "VCLZ(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VCTZ(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VX$")>;
-def : InstRW<[VecMul], (instregex "VGFM?$")>;
-def : InstRW<[VecMul], (instregex "VGFMA(B|F|G|H)?$")>;
-def : InstRW<[VecMul], (instregex "VGFM(B|F|G|H)$")>;
-def : InstRW<[VecXsPm], (instregex "VLC(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VLP(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VMX(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VMXL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VMN(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VMNL(B|F|G|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMAL(B|F)?$")>;
-def : InstRW<[VecMul], (instregex "VMALE(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMALH(B|F|H|W)?$")>;
-def : InstRW<[VecMul], (instregex "VMALO(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMAO(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMAE(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMAH(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VME(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMH(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VML(B|F)?$")>;
-def : InstRW<[VecMul], (instregex "VMLE(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMLH(B|F|H|W)?$")>;
-def : InstRW<[VecMul], (instregex "VMLO(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMO(B|F|H)?$")>;
-def : InstRW<[VecBF2], (instregex "VMSL(G)?$")>;
-
-def : InstRW<[VecXsPm], (instregex "VPOPCT(B|F|G|H)?$")>;
-
-def : InstRW<[VecXsPm], (instregex "VERLL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VERLLV(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VERIM(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VESL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VESLV(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VESRA(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VESRAV(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VESRL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VESRLV(B|F|G|H)?$")>;
-
-def : InstRW<[VecXsPm], (instregex "VSL(DB)?$")>;
-def : InstRW<[VecXsPm], (instregex "VSLB$")>;
-def : InstRW<[VecXsPm], (instregex "VSR(A|L)$")>;
-def : InstRW<[VecXsPm], (instregex "VSR(A|L)B$")>;
-
-def : InstRW<[VecXsPm], (instregex "VSB(I|IQ|CBI|CBIQ)?$")>;
-def : InstRW<[VecXsPm], (instregex "VSCBI(B|F|G|H|Q)?$")>;
-def : InstRW<[VecXsPm], (instregex "VS(F|G|H|Q)?$")>;
-
-def : InstRW<[VecMul], (instregex "VSUM(B|H)?$")>;
-def : InstRW<[VecMul], (instregex "VSUMG(F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VSUMQ(F|G)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VA(B|F|G|H|Q|C|CQ)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VACC(B|F|G|H|Q|C|CQ)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VAVG(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VAVGL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VN(C|O|N|X)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VO(C)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VCKSM$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCLZ(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCTZ(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VX$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VGFM?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VGFMA(B|F|G|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VGFM(B|F|G|H)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLC(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLP(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMX(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMXL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMN(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMNL(B|F|G|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAL(B|F)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMALE(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMALH(B|F|H|W)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMALO(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAO(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAE(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAH(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VME(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMH(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VML(B|F)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMLE(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMLH(B|F|H|W)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMLO(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMO(B|F|H)?$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VMSL(G)?$")>;
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPOPCT(B|F|G|H)?$")>;
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VERLL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VERLLV(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VERIM(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESLV(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRA(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRAV(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRLV(B|F|G|H)?$")>;
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSL(DB)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSLB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSR(A|L)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSR(A|L)B$")>;
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSB(I|IQ|CBI|CBIQ)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSCBI(B|F|G|H|Q)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VS(F|G|H|Q)?$")>;
+
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VSUM(B|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VSUMG(F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VSUMQ(F|G)?$")>;
//===----------------------------------------------------------------------===//
// Vector: Integer comparison
//===----------------------------------------------------------------------===//
-def : InstRW<[VecXsPm, Lat4], (instregex "VEC(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "VECL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VCEQ(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "VCEQ(B|F|G|H)S$")>;
-def : InstRW<[VecXsPm], (instregex "VCH(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "VCH(B|F|G|H)S$")>;
-def : InstRW<[VecXsPm], (instregex "VCHL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "VCHL(B|F|G|H)S$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VTM$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "VEC(B|F|G|H)?$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "VECL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCEQ(B|F|G|H)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VCEQ(B|F|G|H)S$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCH(B|F|G|H)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VCH(B|F|G|H)S$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCHL(B|F|G|H)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VCHL(B|F|G|H)S$")>;
+def : InstRW<[WLat4, VecStr, NormalGr], (instregex "VTM$")>;
//===----------------------------------------------------------------------===//
// Vector: Floating-point arithmetic
//===----------------------------------------------------------------------===//
// Conversion and rounding
-def : InstRW<[VecBF], (instregex "VCD(L)?G$")>;
-def : InstRW<[VecBF], (instregex "VCD(L)?GB$")>;
-def : InstRW<[VecBF], (instregex "WCD(L)?GB$")>;
-def : InstRW<[VecBF], (instregex "VC(L)?GD$")>;
-def : InstRW<[VecBF], (instregex "VC(L)?GDB$")>;
-def : InstRW<[VecBF], (instregex "WC(L)?GDB$")>;
-def : InstRW<[VecBF], (instregex "VL(DE|ED)$")>;
-def : InstRW<[VecBF], (instregex "VL(DE|ED)B$")>;
-def : InstRW<[VecBF], (instregex "WL(DE|ED)B$")>;
-def : InstRW<[VecBF], (instregex "VFL(L|R)$")>;
-def : InstRW<[VecBF], (instregex "VFL(LS|RD)$")>;
-def : InstRW<[VecBF], (instregex "WFL(LS|RD)$")>;
-def : InstRW<[VecBF2], (instregex "WFLLD$")>;
-def : InstRW<[VecDF2, Lat10], (instregex "WFLRX$")>;
-def : InstRW<[VecBF2], (instregex "VFI$")>;
-def : InstRW<[VecBF], (instregex "VFIDB$")>;
-def : InstRW<[VecBF], (instregex "WFIDB$")>;
-def : InstRW<[VecBF2], (instregex "VFISB$")>;
-def : InstRW<[VecBF], (instregex "WFISB$")>;
-def : InstRW<[VecDF2, Lat10], (instregex "WFIXB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VCD(L)?G$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VCD(L)?GB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WCD(L)?GB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VC(L)?GD$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VC(L)?GDB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WC(L)?GDB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VL(DE|ED)$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VL(DE|ED)B$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WL(DE|ED)B$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VFL(L|R)$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VFL(LS|RD)$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFL(LS|RD)$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "WFLLD$")>;
+def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WFLRX$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFI$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VFIDB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFIDB$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFISB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFISB$")>;
+def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WFIXB$")>;
// Sign operations
-def : InstRW<[VecXsPm], (instregex "VFPSO$")>;
-def : InstRW<[VecXsPm], (instregex "(V|W)FPSODB$")>;
-def : InstRW<[VecXsPm], (instregex "(V|W)FPSOSB$")>;
-def : InstRW<[VecXsPm], (instregex "WFPSOXB$")>;
-def : InstRW<[VecXsPm], (instregex "(V|W)FL(C|N|P)DB$")>;
-def : InstRW<[VecXsPm], (instregex "(V|W)FL(C|N|P)SB$")>;
-def : InstRW<[VecXsPm], (instregex "WFL(C|N|P)XB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VFPSO$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FPSODB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FPSOSB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFPSOXB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FL(C|N|P)DB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FL(C|N|P)SB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFL(C|N|P)XB$")>;
// Minimum / maximum
-def : InstRW<[VecXsPm], (instregex "VF(MAX|MIN)$")>;
-def : InstRW<[VecXsPm], (instregex "VF(MAX|MIN)DB$")>;
-def : InstRW<[VecXsPm], (instregex "WF(MAX|MIN)DB$")>;
-def : InstRW<[VecXsPm], (instregex "VF(MAX|MIN)SB$")>;
-def : InstRW<[VecXsPm], (instregex "WF(MAX|MIN)SB$")>;
-def : InstRW<[VecDFX], (instregex "WF(MAX|MIN)XB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(MAX|MIN)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(MAX|MIN)DB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WF(MAX|MIN)DB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(MAX|MIN)SB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WF(MAX|MIN)SB$")>;
+def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "WF(MAX|MIN)XB$")>;
// Test data class
-def : InstRW<[VecXsPm, Lat4], (instregex "VFTCI$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "(V|W)FTCIDB$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "(V|W)FTCISB$")>;
-def : InstRW<[VecDFX, Lat4], (instregex "WFTCIXB$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VFTCI$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "(V|W)FTCIDB$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "(V|W)FTCISB$")>;
+def : InstRW<[WLat3, WLat3, VecDFX, NormalGr], (instregex "WFTCIXB$")>;
// Add / subtract
-def : InstRW<[VecBF2], (instregex "VF(A|S)$")>;
-def : InstRW<[VecBF], (instregex "VF(A|S)DB$")>;
-def : InstRW<[VecBF], (instregex "WF(A|S)DB$")>;
-def : InstRW<[VecBF2], (instregex "VF(A|S)SB$")>;
-def : InstRW<[VecBF], (instregex "WF(A|S)SB$")>;
-def : InstRW<[VecDF2, Lat10], (instregex "WF(A|S)XB$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(A|S)$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VF(A|S)DB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)DB$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(A|S)SB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)SB$")>;
+def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WF(A|S)XB$")>;
// Multiply / multiply-and-add/subtract
-def : InstRW<[VecBF2], (instregex "VFM$")>;
-def : InstRW<[VecBF], (instregex "VFMDB$")>;
-def : InstRW<[VecBF], (instregex "WFMDB$")>;
-def : InstRW<[VecBF2], (instregex "VFMSB$")>;
-def : InstRW<[VecBF], (instregex "WFMSB$")>;
-def : InstRW<[VecDF2, Lat20], (instregex "WFMXB$")>;
-def : InstRW<[VecBF2], (instregex "VF(N)?M(A|S)$")>;
-def : InstRW<[VecBF], (instregex "VF(N)?M(A|S)DB$")>;
-def : InstRW<[VecBF], (instregex "WF(N)?M(A|S)DB$")>;
-def : InstRW<[VecBF2], (instregex "VF(N)?M(A|S)SB$")>;
-def : InstRW<[VecBF], (instregex "WF(N)?M(A|S)SB$")>;
-def : InstRW<[VecDF2, Lat20], (instregex "WF(N)?M(A|S)XB$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFM$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VFMDB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFM(D|S)B$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFMSB$")>;
+def : InstRW<[WLat20, VecDF2, NormalGr], (instregex "WFMXB$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(N)?M(A|S)$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VF(N)?M(A|S)DB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(N)?M(A|S)DB$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(N)?M(A|S)SB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(N)?M(A|S)SB$")>;
+def : InstRW<[WLat30, VecDF2, NormalGr], (instregex "WF(N)?M(A|S)XB$")>;
// Divide / square root
-def : InstRW<[VecFPd], (instregex "VFD$")>;
-def : InstRW<[VecFPd], (instregex "(V|W)FDDB$")>;
-def : InstRW<[VecFPd], (instregex "(V|W)FDSB$")>;
-def : InstRW<[VecFPd], (instregex "WFDXB$")>;
-def : InstRW<[VecFPd], (instregex "VFSQ$")>;
-def : InstRW<[VecFPd], (instregex "(V|W)FSQDB$")>;
-def : InstRW<[VecFPd], (instregex "(V|W)FSQSB$")>;
-def : InstRW<[VecFPd], (instregex "WFSQXB$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "VFD$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FDDB$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FDSB$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "WFDXB$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "VFSQ$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FSQDB$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FSQSB$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "WFSQXB$")>;
//===----------------------------------------------------------------------===//
// Vector: Floating-point comparison
//===----------------------------------------------------------------------===//
-def : InstRW<[VecXsPm], (instregex "VF(C|K)(E|H|HE)$")>;
-def : InstRW<[VecXsPm], (instregex "VF(C|K)(E|H|HE)DB$")>;
-def : InstRW<[VecXsPm], (instregex "WF(C|K)(E|H|HE)DB$")>;
-def : InstRW<[VecXsPm], (instregex "VF(C|K)(E|H|HE)SB$")>;
-def : InstRW<[VecXsPm], (instregex "WF(C|K)(E|H|HE)SB$")>;
-def : InstRW<[VecDFX], (instregex "WF(C|K)(E|H|HE)XB$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "VF(C|K)(E|H|HE)DBS$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)(E|H|HE)DBS$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "VF(C|K)(E|H|HE)SBS$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)(E|H|HE)SBS$")>;
-def : InstRW<[VecDFX, Lat4], (instregex "WF(C|K)(E|H|HE)XBS$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)DB$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)SB$")>;
-def : InstRW<[VecDFX, Lat4], (instregex "WF(C|K)XB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(C|K)(E|H|HE)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(C|K)(E|H|HE)DB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFC(E|H|HE)DB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFK(E|H|HE)DB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(C|K)(E|H|HE)SB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFC(E|H|HE)SB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFK(E|H|HE)SB$")>;
+def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "WFC(E|H|HE)XB$")>;
+def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "WFK(E|H|HE)XB$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VFC(E|H|HE)DBS$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VFK(E|H|HE)DBS$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr],
+ (instregex "WF(C|K)(E|H|HE)DBS$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr],
+ (instregex "VF(C|K)(E|H|HE)SBS$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "WFC(E|H|HE)SBS$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "WFK(E|H|HE)SBS$")>;
+def : InstRW<[WLat3, WLat3, VecDFX, NormalGr], (instregex "WFC(E|H|HE)XBS$")>;
+def : InstRW<[WLat3, WLat3, VecDFX, NormalGr], (instregex "WFK(E|H|HE)XBS$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "WF(C|K)$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "WF(C|K)DB$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "WF(C|K)SB$")>;
+def : InstRW<[WLat3, VecDFX, NormalGr], (instregex "WF(C|K)XB$")>;
//===----------------------------------------------------------------------===//
// Vector: Floating-point insertion and extraction
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb], (instregex "LEFR$")>;
-def : InstRW<[FXb, Lat4], (instregex "LFER$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LEFR$")>;
+def : InstRW<[WLat3, FXb, NormalGr], (instregex "LFER$")>;
//===----------------------------------------------------------------------===//
// Vector: String instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[VecStr], (instregex "VFAE(B)?$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VFAEBS$")>;
-def : InstRW<[VecStr], (instregex "VFAE(F|H)$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VFAE(F|H)S$")>;
-def : InstRW<[VecStr], (instregex "VFAEZ(B|F|H)$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VFAEZ(B|F|H)S$")>;
-def : InstRW<[VecStr], (instregex "VFEE(B|F|H|ZB|ZF|ZH)?$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VFEE(B|F|H|ZB|ZF|ZH)S$")>;
-def : InstRW<[VecStr], (instregex "VFENE(B|F|H|ZB|ZF|ZH)?$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VFENE(B|F|H|ZB|ZF|ZH)S$")>;
-def : InstRW<[VecStr], (instregex "VISTR(B|F|H)?$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VISTR(B|F|H)S$")>;
-def : InstRW<[VecStr], (instregex "VSTRC(B|F|H)?$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VSTRC(B|F|H)S$")>;
-def : InstRW<[VecStr], (instregex "VSTRCZ(B|F|H)$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VSTRCZ(B|F|H)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFAE(B)?$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFAE(F|H)$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VFAE(B|F|H)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFAEZ(B|F|H)$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VFAEZ(B|F|H)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFEE(B|F|H|ZB|ZF|ZH)?$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr],
+ (instregex "VFEE(B|F|H|ZB|ZF|ZH)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFENE(B|F|H|ZB|ZF|ZH)?$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr],
+ (instregex "VFENE(B|F|H|ZB|ZF|ZH)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VISTR(B|F|H)?$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VISTR(B|F|H)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VSTRC(B|F|H)?$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRC(B|F|H)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VSTRCZ(B|F|H)$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRCZ(B|F|H)S$")>;
//===----------------------------------------------------------------------===//
// Vector: Packed-decimal instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[VecDF, VecDF, Lat10], (instregex "VLIP$")>;
-def : InstRW<[VecDFX, LSU, GroupAlone], (instregex "VPKZ$")>;
-def : InstRW<[VecDFX, FXb, LSU, Lat12, BeginGroup], (instregex "VUPKZ$")>;
-def : InstRW<[VecDF, VecDF, FXb, Lat20, GroupAlone], (instregex "VCVB(G)?$")>;
-def : InstRW<[VecDF, VecDF, FXb, Lat20, GroupAlone], (instregex "VCVD(G)?$")>;
-def : InstRW<[VecDFX], (instregex "V(A|S)P$")>;
-def : InstRW<[VecDF, VecDF, Lat30, GroupAlone], (instregex "VM(S)?P$")>;
-def : InstRW<[VecDF, VecDF, Lat30, GroupAlone], (instregex "V(D|R)P$")>;
-def : InstRW<[VecDFX, Lat30, GroupAlone], (instregex "VSDP$")>;
-def : InstRW<[VecDF, VecDF, Lat11], (instregex "VSRP$")>;
-def : InstRW<[VecDFX], (instregex "VPSOP$")>;
-def : InstRW<[VecDFX], (instregex "V(T|C)P$")>;
+def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "VLIP$")>;
+def : InstRW<[WLat6, VecDFX, LSU, GroupAlone], (instregex "VPKZ$")>;
+def : InstRW<[WLat1, VecDFX, FXb, LSU, Cracked], (instregex "VUPKZ$")>;
+def : InstRW<[WLat20, WLat20, VecDF2, FXb, GroupAlone], (instregex "VCVB(G)?$")>;
+def : InstRW<[WLat20, WLat20, VecDF2, FXb, GroupAlone], (instregex "VCVD(G)?$")>;
+def : InstRW<[WLat4, WLat4, VecDFX, NormalGr], (instregex "V(A|S)P$")>;
+def : InstRW<[WLat30, WLat30, VecDF2, GroupAlone], (instregex "VM(S)?P$")>;
+def : InstRW<[WLat30, WLat30, VecDF2, GroupAlone], (instregex "V(D|R)P$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "VSDP$")>;
+def : InstRW<[WLat10, WLat10, VecDF2, NormalGr], (instregex "VSRP$")>;
+def : InstRW<[WLat4, WLat4, VecDFX, NormalGr], (instregex "VPSOP$")>;
+def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "V(T|C)P$")>;
// -------------------------------- System ---------------------------------- //
@@ -1455,157 +1488,151 @@ def : InstRW<[VecDFX], (instregex "V(T|C)P$")>;
// System: Program-Status Word Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, Lat30], (instregex "EPSW$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "LPSW(E)?$")>;
-def : InstRW<[FXa, Lat3, GroupAlone], (instregex "IPK$")>;
-def : InstRW<[LSU, EndGroup], (instregex "SPKA$")>;
-def : InstRW<[LSU, EndGroup], (instregex "SSM$")>;
-def : InstRW<[FXb, LSU, GroupAlone], (instregex "ST(N|O)SM$")>;
-def : InstRW<[FXa, Lat3], (instregex "IAC$")>;
-def : InstRW<[LSU, EndGroup], (instregex "SAC(F)?$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "EPSW$")>;
+def : InstRW<[WLat20, GroupAlone], (instregex "LPSW(E)?$")>;
+def : InstRW<[WLat3, FXa, GroupAlone], (instregex "IPK$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SPKA$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SSM$")>;
+def : InstRW<[WLat1, FXb, LSU, GroupAlone], (instregex "ST(N|O)SM$")>;
+def : InstRW<[WLat3, FXa, NormalGr], (instregex "IAC$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SAC(F)?$")>;
//===----------------------------------------------------------------------===//
// System: Control Register Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat30], (instregex "LCTL(G)?$")>;
-def : InstRW<[LSU, Lat30], (instregex "STCT(L|G)$")>;
-def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>;
-def : InstRW<[FXb, Lat30], (instregex "SSA(I)?R$")>;
-def : InstRW<[FXb, Lat30], (instregex "ESEA$")>;
+def : InstRW<[WLat4LSU, WLat4LSU, LSU2, GroupAlone], (instregex "LCTL(G)?$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "STCT(L|G)$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "E(P|S)A(I)?R$")>;
+def : InstRW<[WLat30, MCD], (instregex "SSA(I)?R$")>;
+def : InstRW<[WLat30, MCD], (instregex "ESEA$")>;
//===----------------------------------------------------------------------===//
// System: Prefix-Register Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat30], (instregex "SPX$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "STPX$")>;
+def : InstRW<[WLat30, MCD], (instregex "S(T)?PX$")>;
//===----------------------------------------------------------------------===//
// System: Storage-Key and Real Memory Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, Lat30], (instregex "ISKE$")>;
-def : InstRW<[FXb, Lat30], (instregex "IVSK$")>;
-def : InstRW<[FXb, Lat30], (instregex "SSKE(Opt)?$")>;
-def : InstRW<[FXb, Lat30], (instregex "RRB(E|M)$")>;
-def : InstRW<[FXb, Lat30], (instregex "IRBM$")>;
-def : InstRW<[FXb, Lat30], (instregex "PFMF$")>;
-def : InstRW<[FXb, Lat30], (instregex "TB$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "PGIN$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "PGOUT$")>;
+def : InstRW<[WLat30, MCD], (instregex "ISKE$")>;
+def : InstRW<[WLat30, MCD], (instregex "IVSK$")>;
+def : InstRW<[WLat30, MCD], (instregex "SSKE(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "RRB(E|M)$")>;
+def : InstRW<[WLat30, MCD], (instregex "IRBM$")>;
+def : InstRW<[WLat30, MCD], (instregex "PFMF$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "TB$")>;
+def : InstRW<[WLat30, MCD], (instregex "PGIN$")>;
+def : InstRW<[WLat30, MCD], (instregex "PGOUT$")>;
//===----------------------------------------------------------------------===//
// System: Dynamic-Address-Translation Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat30], (instregex "IPTE(Opt)?(Opt)?$")>;
-def : InstRW<[FXb, Lat30], (instregex "IDTE(Opt)?$")>;
-def : InstRW<[FXb, Lat30], (instregex "CRDTE(Opt)?$")>;
-def : InstRW<[FXb, Lat30], (instregex "PTLB$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "CSP(G)?$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "LPTEA$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "LRA(Y|G)?$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "STRAG$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "LURA(G)?$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "STUR(A|G)$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "TPROT$")>;
+def : InstRW<[WLat30, MCD], (instregex "IPTE(Opt)?(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "IDTE(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "CRDTE(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "PTLB$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "CSP(G)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "LPTEA$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "LRA(Y|G)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "STRAG$")>;
+def : InstRW<[WLat30, MCD], (instregex "LURA(G)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "STUR(A|G)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TPROT$")>;
//===----------------------------------------------------------------------===//
// System: Memory-move Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, FXa, FXb, LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>;
-def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "MVC(S|D)K$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "MVCOS$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVPG$")>;
+def : InstRW<[WLat4LSU, FXa2, FXb, LSU5, GroupAlone], (instregex "MVC(K|P|S)$")>;
+def : InstRW<[WLat1, FXa, LSU5, GroupAlone], (instregex "MVC(S|D)K$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVCOS$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVPG$")>;
//===----------------------------------------------------------------------===//
// System: Address-Space Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat30], (instregex "LASP$")>;
-def : InstRW<[LSU, GroupAlone], (instregex "PALB$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "PC$")>;
-def : InstRW<[FXb, Lat30], (instregex "PR$")>;
-def : InstRW<[FXb, Lat30], (instregex "PT(I)?$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "RP$")>;
-def : InstRW<[FXb, Lat30], (instregex "BS(G|A)$")>;
-def : InstRW<[FXb, Lat20], (instregex "TAR$")>;
+def : InstRW<[WLat30, MCD], (instregex "LASP$")>;
+def : InstRW<[WLat1, LSU, GroupAlone], (instregex "PALB$")>;
+def : InstRW<[WLat30, MCD], (instregex "PC$")>;
+def : InstRW<[WLat30, MCD], (instregex "PR$")>;
+def : InstRW<[WLat30, MCD], (instregex "PT(I)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "RP$")>;
+def : InstRW<[WLat30, MCD], (instregex "BS(G|A)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TAR$")>;
//===----------------------------------------------------------------------===//
// System: Linkage-Stack Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, Lat30, EndGroup], (instregex "BAKR$")>;
-def : InstRW<[FXb, Lat30], (instregex "EREG(G)?$")>;
-def : InstRW<[FXb, Lat30], (instregex "(E|M)STA$")>;
+def : InstRW<[WLat30, MCD], (instregex "BAKR$")>;
+def : InstRW<[WLat30, MCD], (instregex "EREG(G)?$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "(E|M)STA$")>;
//===----------------------------------------------------------------------===//
// System: Time-Related Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, Lat30], (instregex "PTFF$")>;
-def : InstRW<[FXb, LSU, Lat20], (instregex "SCK$")>;
-def : InstRW<[FXb, Lat30], (instregex "SCKPF$")>;
-def : InstRW<[FXb, LSU, Lat20], (instregex "SCKC$")>;
-def : InstRW<[LSU, LSU, GroupAlone], (instregex "SPT$")>;
-def : InstRW<[LSU, LSU, LSU, FXa, FXa, FXb, Lat9, GroupAlone],
- (instregex "STCK(F)?$")>;
-def : InstRW<[LSU, LSU, LSU, LSU, FXa, FXa, FXb, FXb, Lat11, GroupAlone],
- (instregex "STCKE$")>;
-def : InstRW<[FXb, LSU, Lat9], (instregex "STCKC$")>;
-def : InstRW<[LSU, LSU, FXb, Lat5, BeginGroup], (instregex "STPT$")>;
+def : InstRW<[WLat30, MCD], (instregex "PTFF$")>;
+def : InstRW<[WLat30, MCD], (instregex "SCK(PF|C)?$")>;
+def : InstRW<[WLat1, LSU2, GroupAlone], (instregex "SPT$")>;
+def : InstRW<[WLat15, LSU3, FXa2, FXb, GroupAlone], (instregex "STCK(F)?$")>;
+def : InstRW<[WLat20, LSU4, FXa2, FXb2, GroupAlone], (instregex "STCKE$")>;
+def : InstRW<[WLat30, MCD], (instregex "STCKC$")>;
+def : InstRW<[WLat1, LSU2, FXb, Cracked], (instregex "STPT$")>;
//===----------------------------------------------------------------------===//
// System: CPU-Related Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat30], (instregex "STAP$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "STIDP$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "STSI$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "STFL(E)?$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "ECAG$")>;
-def : InstRW<[FXa, LSU, Lat30], (instregex "ECTG$")>;
-def : InstRW<[FXb, Lat30], (instregex "PTF$")>;
-def : InstRW<[FXb, Lat30], (instregex "PCKMO$")>;
+def : InstRW<[WLat30, MCD], (instregex "STAP$")>;
+def : InstRW<[WLat30, MCD], (instregex "STIDP$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "STSI$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "STFL(E)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "ECAG$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "ECTG$")>;
+def : InstRW<[WLat30, MCD], (instregex "PTF$")>;
+def : InstRW<[WLat30, MCD], (instregex "PCKMO$")>;
//===----------------------------------------------------------------------===//
// System: Miscellaneous Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, Lat30], (instregex "SVC$")>;
-def : InstRW<[FXb, GroupAlone], (instregex "MC$")>;
-def : InstRW<[FXb, Lat30], (instregex "DIAG$")>;
-def : InstRW<[FXb], (instregex "TRAC(E|G)$")>;
-def : InstRW<[FXb, Lat30], (instregex "TRAP(2|4)$")>;
-def : InstRW<[FXb, Lat30], (instregex "SIGP$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "SIGA$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "SIE$")>;
+def : InstRW<[WLat30, MCD], (instregex "SVC$")>;
+def : InstRW<[WLat1, FXb, GroupAlone], (instregex "MC$")>;
+def : InstRW<[WLat30, MCD], (instregex "DIAG$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TRAC(E|G)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TRAP(2|4)$")>;
+def : InstRW<[WLat30, MCD], (instregex "SIG(P|A)$")>;
+def : InstRW<[WLat30, MCD], (instregex "SIE$")>;
//===----------------------------------------------------------------------===//
// System: CPU-Measurement Facility Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb], (instregex "LPP$")>;
-def : InstRW<[FXb, Lat30], (instregex "ECPGA$")>;
-def : InstRW<[FXb, Lat30], (instregex "E(C|P)CTR$")>;
-def : InstRW<[FXb, Lat30], (instregex "LCCTL$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "L(P|S)CTL$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "Q(S|CTR)I$")>;
-def : InstRW<[FXb, Lat30], (instregex "S(C|P)CTR$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LPP$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "ECPGA$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "E(C|P)CTR$")>;
+def : InstRW<[WLat30, MCD], (instregex "LCCTL$")>;
+def : InstRW<[WLat30, MCD], (instregex "L(P|S)CTL$")>;
+def : InstRW<[WLat30, MCD], (instregex "Q(S|CTR)I$")>;
+def : InstRW<[WLat30, MCD], (instregex "S(C|P)CTR$")>;
//===----------------------------------------------------------------------===//
// System: I/O Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, Lat30], (instregex "(C|H|R|X)SCH$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "(M|S|ST|T)SCH$")>;
-def : InstRW<[FXb, Lat30], (instregex "RCHP$")>;
-def : InstRW<[FXb, Lat30], (instregex "SCHM$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "STC(PS|RW)$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "TPI$")>;
-def : InstRW<[FXb, Lat30], (instregex "SAL$")>;
+def : InstRW<[WLat30, MCD], (instregex "(C|H|R|X)SCH$")>;
+def : InstRW<[WLat30, MCD], (instregex "(M|S|ST|T)SCH$")>;
+def : InstRW<[WLat30, MCD], (instregex "RCHP$")>;
+def : InstRW<[WLat30, MCD], (instregex "SCHM$")>;
+def : InstRW<[WLat30, MCD], (instregex "STC(PS|RW)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TPI$")>;
+def : InstRW<[WLat30, MCD], (instregex "SAL$")>;
}
diff --git a/lib/Target/SystemZ/SystemZScheduleZ196.td b/lib/Target/SystemZ/SystemZScheduleZ196.td
index 4d986e8391cf..3012b565d5ef 100644
--- a/lib/Target/SystemZ/SystemZScheduleZ196.td
+++ b/lib/Target/SystemZ/SystemZScheduleZ196.td
@@ -10,13 +10,15 @@
// This file defines the machine model for Z196 to support instruction
// scheduling and other instruction cost heuristics.
//
+// Pseudos expanded right after isel do not need to be modelled here.
+//
//===----------------------------------------------------------------------===//
def Z196Model : SchedMachineModel {
let UnsupportedFeatures = Arch9UnsupportedFeatures.List;
- let IssueWidth = 5;
+ let IssueWidth = 3;
let MicroOpBufferSize = 40; // Issue queues
let LoadLatency = 1; // Optimistic load latency.
@@ -27,48 +29,65 @@ def Z196Model : SchedMachineModel {
}
let SchedModel = Z196Model in {
-
-// These definitions could be put in a subtarget common include file,
-// but it seems the include system in Tablegen currently rejects
-// multiple includes of same file.
+// These definitions need the SchedModel value. They could be put in a
+// subtarget common include file, but it seems the include system in TableGen
+// currently (2016) rejects multiple includes of the same file.
+
+// Decoder grouping rules
+let NumMicroOps = 1 in {
+ def : WriteRes<NormalGr, []>;
+ def : WriteRes<BeginGroup, []> { let BeginGroup = 1; }
+ def : WriteRes<EndGroup, []> { let EndGroup = 1; }
+}
+def : WriteRes<Cracked, []> {
+ let NumMicroOps = 2;
+ let BeginGroup = 1;
+}
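+
+// For illustration only (both lines are repeated verbatim from further down
+// in this file): a single micro-op instruction decodes in a normal group,
+// while a multi-micro-op sequence is kept alone in its decode group.
+//   def : InstRW<[WLat1, FXU, NormalGr], (instregex "LA(Y|RL)?$")>;
+//   def : InstRW<[WLat1, LSU2, FXU5, GroupAlone], (instregex "STM(H|Y|G)?$")>;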
def : WriteRes<GroupAlone, []> {
- let NumMicroOps = 0;
+ let NumMicroOps = 3;
let BeginGroup = 1;
let EndGroup = 1;
}
-def : WriteRes<EndGroup, []> {
- let NumMicroOps = 0;
- let EndGroup = 1;
+
+// Incoming latency is removed from a register operand that is used together
+// with a memory operand by the instruction.
+def : ReadAdvance<RegReadAdv, 4>;
+
+// LoadLatency (above) is not used for instructions in this file. This is
+// instead the role of LSULatency, which is the latency value added to the
+// result of loads and instructions with folded memory operands.
+def : WriteRes<LSULatency, []> { let Latency = 4; let NumMicroOps = 0; }
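+
+// For illustration only (both mappings appear verbatim later in this file):
+// a plain load gets LSULatency as its result latency, while a reg-mem
+// instruction adds RegReadAdv, which hides up to four cycles of incoming
+// latency on the register operand while the folded memory operand is loaded.
+//   def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L(Y|FH|RL|Mux)?$")>;
+//   def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+//                (instregex "A(L)?(Y)?$")>;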
+
+let NumMicroOps = 0 in {
+ foreach L = 1-30 in {
+ def : WriteRes<!cast<SchedWrite>("WLat"#L), []> { let Latency = L; }
+ }
}
-def : WriteRes<Lat2, []> { let Latency = 2; let NumMicroOps = 0;}
-def : WriteRes<Lat3, []> { let Latency = 3; let NumMicroOps = 0;}
-def : WriteRes<Lat4, []> { let Latency = 4; let NumMicroOps = 0;}
-def : WriteRes<Lat5, []> { let Latency = 5; let NumMicroOps = 0;}
-def : WriteRes<Lat6, []> { let Latency = 6; let NumMicroOps = 0;}
-def : WriteRes<Lat7, []> { let Latency = 7; let NumMicroOps = 0;}
-def : WriteRes<Lat8, []> { let Latency = 8; let NumMicroOps = 0;}
-def : WriteRes<Lat9, []> { let Latency = 9; let NumMicroOps = 0;}
-def : WriteRes<Lat10, []> { let Latency = 10; let NumMicroOps = 0;}
-def : WriteRes<Lat11, []> { let Latency = 11; let NumMicroOps = 0;}
-def : WriteRes<Lat12, []> { let Latency = 12; let NumMicroOps = 0;}
-def : WriteRes<Lat15, []> { let Latency = 15; let NumMicroOps = 0;}
-def : WriteRes<Lat20, []> { let Latency = 20; let NumMicroOps = 0;}
-def : WriteRes<Lat30, []> { let Latency = 30; let NumMicroOps = 0;}
// Execution units.
def Z196_FXUnit : ProcResource<2>;
def Z196_LSUnit : ProcResource<2>;
def Z196_FPUnit : ProcResource<1>;
def Z196_DFUnit : ProcResource<1>;
+def Z196_MCD : ProcResource<1>;
// Subtarget specific definitions of scheduling resources.
-def : WriteRes<FXU, [Z196_FXUnit]> { let Latency = 1; }
-def : WriteRes<LSU, [Z196_LSUnit]> { let Latency = 4; }
-def : WriteRes<LSU_lat1, [Z196_LSUnit]> { let Latency = 1; }
-def : WriteRes<FPU, [Z196_FPUnit]> { let Latency = 8; }
-def : WriteRes<FPU2, [Z196_FPUnit, Z196_FPUnit]> { let Latency = 9; }
-def : WriteRes<DFU, [Z196_DFUnit]> { let Latency = 2; }
-def : WriteRes<DFU2, [Z196_DFUnit, Z196_DFUnit]> { let Latency = 3; }
+let NumMicroOps = 0 in {
+ def : WriteRes<FXU, [Z196_FXUnit]>;
+ def : WriteRes<LSU, [Z196_LSUnit]>;
+ def : WriteRes<FPU, [Z196_FPUnit]>;
+ def : WriteRes<DFU, [Z196_DFUnit]>;
+ foreach Num = 2-6 in { let ResourceCycles = [Num] in {
+ def : WriteRes<!cast<SchedWrite>("FXU"#Num), [Z196_FXUnit]>;
+ def : WriteRes<!cast<SchedWrite>("LSU"#Num), [Z196_LSUnit]>;
+ def : WriteRes<!cast<SchedWrite>("FPU"#Num), [Z196_FPUnit]>;
+ def : WriteRes<!cast<SchedWrite>("DFU"#Num), [Z196_DFUnit]>;
+ }}
+}
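+
+// For illustration only: the numbered variants above mean the same unit is
+// occupied for that many cycles, e.g. (verbatim from further down)
+//   def : InstRW<[WLat10, WLat10, LSU5, GroupAlone], (instregex "LM(H|Y|G)?$")>;
+// models load-multiple as roughly five cycles on the load/store unit.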
+
+def : WriteRes<MCD, [Z196_MCD]> { let NumMicroOps = 3;
+ let BeginGroup = 1;
+ let EndGroup = 1; }
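+
+// For illustration only: WLat30 together with MCD is the pattern used below
+// for instructions that are only modelled very conservatively, e.g.
+// (verbatim from further down)
+//   def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "MVST$")>;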
// -------------------------- INSTRUCTIONS ---------------------------------- //
@@ -82,26 +101,26 @@ def : WriteRes<DFU2, [Z196_DFUnit, Z196_DFUnit]> { let Latency = 3; }
// Stack allocation
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU], (instregex "ADJDYNALLOC$")>; // Pseudo -> LA / LAY
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "ADJDYNALLOC$")>; // Pseudo -> LA / LAY
//===----------------------------------------------------------------------===//
// Branch instructions
//===----------------------------------------------------------------------===//
// Branch
-def : InstRW<[LSU, EndGroup], (instregex "(Call)?BRC(L)?(Asm.*)?$")>;
-def : InstRW<[LSU, EndGroup], (instregex "(Call)?J(G)?(Asm.*)?$")>;
-def : InstRW<[LSU, EndGroup], (instregex "(Call)?BC(R)?(Asm.*)?$")>;
-def : InstRW<[LSU, EndGroup], (instregex "(Call)?B(R)?(Asm.*)?$")>;
-def : InstRW<[FXU, LSU, Lat5, GroupAlone], (instregex "BRCT(G|H)?$")>;
-def : InstRW<[FXU, LSU, Lat5, GroupAlone], (instregex "BCT(G)?(R)?$")>;
-def : InstRW<[FXU, FXU, FXU, LSU, Lat7, GroupAlone],
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "(Call)?BRC(L)?(Asm.*)?$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "(Call)?J(G)?(Asm.*)?$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "(Call)?BC(R)?(Asm.*)?$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "(Call)?B(R)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXU, LSU, GroupAlone], (instregex "BRCT(G|H)?$")>;
+def : InstRW<[WLat1, FXU, LSU, GroupAlone], (instregex "BCT(G)?(R)?$")>;
+def : InstRW<[WLat1, FXU3, LSU, GroupAlone],
(instregex "B(R)?X(H|L).*$")>;
// Compare and branch
-def : InstRW<[FXU, LSU, Lat5, GroupAlone],
+def : InstRW<[WLat1, FXU, LSU, GroupAlone],
(instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>;
-def : InstRW<[FXU, LSU, Lat5, GroupAlone],
+def : InstRW<[WLat1, FXU, LSU, GroupAlone],
(instregex "C(L)?(G)?(I|R)B(Call|Return|Asm.*)?$")>;
//===----------------------------------------------------------------------===//
@@ -109,546 +128,558 @@ def : InstRW<[FXU, LSU, Lat5, GroupAlone],
//===----------------------------------------------------------------------===//
// Trap
-def : InstRW<[LSU, EndGroup], (instregex "(Cond)?Trap$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "(Cond)?Trap$")>;
// Compare and trap
-def : InstRW<[FXU], (instregex "C(G)?(I|R)T(Asm.*)?$")>;
-def : InstRW<[FXU], (instregex "CL(G)?RT(Asm.*)?$")>;
-def : InstRW<[FXU], (instregex "CL(F|G)IT(Asm.*)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "C(G)?(I|R)T(Asm.*)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CL(G)?RT(Asm.*)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CL(F|G)IT(Asm.*)?$")>;
//===----------------------------------------------------------------------===//
// Call and return instructions
//===----------------------------------------------------------------------===//
// Call
-def : InstRW<[LSU, FXU, FXU, Lat6, GroupAlone], (instregex "(Call)?BRAS$")>;
-def : InstRW<[LSU, FXU, FXU, Lat6, GroupAlone], (instregex "(Call)?BRASL$")>;
-def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "(Call)?BAS(R)?$")>;
-def : InstRW<[LSU, FXU, FXU, Lat6, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
+def : InstRW<[WLat1, LSU, FXU2, GroupAlone], (instregex "(Call)?BRAS$")>;
+def : InstRW<[WLat1, LSU, FXU2, GroupAlone], (instregex "(Call)?BRASL$")>;
+def : InstRW<[WLat1, LSU, FXU2, GroupAlone], (instregex "(Call)?BAS(R)?$")>;
+def : InstRW<[WLat1, LSU, FXU2, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
// Return
-def : InstRW<[LSU_lat1, EndGroup], (instregex "Return$")>;
-def : InstRW<[LSU_lat1, EndGroup], (instregex "CondReturn$")>;
-
-//===----------------------------------------------------------------------===//
-// Select instructions
-//===----------------------------------------------------------------------===//
-
-// Select pseudo
-def : InstRW<[FXU], (instregex "Select(32|64|32Mux)$")>;
-
-// CondStore pseudos
-def : InstRW<[FXU], (instregex "CondStore16(Inv)?$")>;
-def : InstRW<[FXU], (instregex "CondStore16Mux(Inv)?$")>;
-def : InstRW<[FXU], (instregex "CondStore32(Inv)?$")>;
-def : InstRW<[FXU], (instregex "CondStore64(Inv)?$")>;
-def : InstRW<[FXU], (instregex "CondStore8(Inv)?$")>;
-def : InstRW<[FXU], (instregex "CondStore8Mux(Inv)?$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "Return$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "CondReturn$")>;
//===----------------------------------------------------------------------===//
// Move instructions
//===----------------------------------------------------------------------===//
// Moves
-def : InstRW<[FXU, LSU, Lat5], (instregex "MV(G|H)?HI$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "MVI(Y)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "MV(G|H)?HI$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "MVI(Y)?$")>;
// Move character
-def : InstRW<[LSU, LSU, LSU, FXU, Lat8, GroupAlone], (instregex "MVC$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>;
+def : InstRW<[WLat1, FXU, LSU3, GroupAlone], (instregex "MVC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "MVCL(E|U)?$")>;
// Pseudo -> reg move
-def : InstRW<[FXU], (instregex "COPY(_TO_REGCLASS)?$")>;
-def : InstRW<[FXU], (instregex "EXTRACT_SUBREG$")>;
-def : InstRW<[FXU], (instregex "INSERT_SUBREG$")>;
-def : InstRW<[FXU], (instregex "REG_SEQUENCE$")>;
-def : InstRW<[FXU], (instregex "SUBREG_TO_REG$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "COPY(_TO_REGCLASS)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "EXTRACT_SUBREG$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "INSERT_SUBREG$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "REG_SEQUENCE$")>;
// Loads
-def : InstRW<[LSU], (instregex "L(Y|FH|RL|Mux)?$")>;
-def : InstRW<[LSU], (instregex "LG(RL)?$")>;
-def : InstRW<[LSU], (instregex "L128$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L(Y|FH|RL|Mux)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LG(RL)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L128$")>;
-def : InstRW<[FXU], (instregex "LLIH(F|H|L)$")>;
-def : InstRW<[FXU], (instregex "LLIL(F|H|L)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LLIH(F|H|L)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LLIL(F|H|L)$")>;
-def : InstRW<[FXU], (instregex "LG(F|H)I$")>;
-def : InstRW<[FXU], (instregex "LHI(Mux)?$")>;
-def : InstRW<[FXU], (instregex "LR(Mux)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LG(F|H)I$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LHI(Mux)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LR(Mux)?$")>;
// Load and test
-def : InstRW<[FXU, LSU, Lat5], (instregex "LT(G)?$")>;
-def : InstRW<[FXU], (instregex "LT(G)?R$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, LSU, FXU, NormalGr], (instregex "LT(G)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LT(G)?R$")>;
// Stores
-def : InstRW<[FXU, LSU, Lat5], (instregex "STG(RL)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "ST128$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "ST(Y|FH|RL|Mux)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "STG(RL)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "ST128$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "ST(Y|FH|RL|Mux)?$")>;
// String moves.
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVST$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "MVST$")>;
//===----------------------------------------------------------------------===//
// Conditional move instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat2, EndGroup], (instregex "LOC(G)?R(Asm.*)?$")>;
-def : InstRW<[FXU, LSU, Lat6, EndGroup], (instregex "LOC(G)?(Asm.*)?$")>;
-def : InstRW<[FXU, LSU, Lat5, EndGroup], (instregex "STOC(G)?(Asm.*)?$")>;
+def : InstRW<[WLat2, FXU, EndGroup], (instregex "LOC(G)?R(Asm.*)?$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXU, LSU, EndGroup],
+ (instregex "LOC(G)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXU, LSU, EndGroup], (instregex "STOC(G)?(Asm.*)?$")>;
//===----------------------------------------------------------------------===//
// Sign extensions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU], (instregex "L(B|H|G)R$")>;
-def : InstRW<[FXU], (instregex "LG(B|H|F)R$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LTGF$")>;
-def : InstRW<[FXU], (instregex "LTGFR$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "L(B|H|G)R$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LG(B|H|F)R$")>;
+
+def : InstRW<[WLat1LSU, WLat1LSU, FXU, LSU, NormalGr], (instregex "LTGF$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LTGFR$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LB(H|Mux)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LH(Y)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LH(H|Mux|RL)$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LG(B|H|F)$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LG(H|F)RL$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "LB(H|Mux)?$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "LH(Y)?$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "LH(H|Mux|RL)$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "LG(B|H|F)$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "LG(H|F)RL$")>;
//===----------------------------------------------------------------------===//
// Zero extensions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU], (instregex "LLCR(Mux)?$")>;
-def : InstRW<[FXU], (instregex "LLHR(Mux)?$")>;
-def : InstRW<[FXU], (instregex "LLG(C|F|H|T)R$")>;
-def : InstRW<[LSU], (instregex "LLC(Mux)?$")>;
-def : InstRW<[LSU], (instregex "LLH(Mux)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LL(C|H)H$")>;
-def : InstRW<[LSU], (instregex "LLHRL$")>;
-def : InstRW<[LSU], (instregex "LLG(C|F|H|T|FRL|HRL)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LLCR(Mux)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LLHR(Mux)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LLG(C|H|F|T)R$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLC(Mux)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLH(Mux)?$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "LL(C|H)H$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLHRL$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLG(C|H|F|T|HRL|FRL)$")>;
//===----------------------------------------------------------------------===//
// Truncations
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "STCM(H|Y)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "STC(H|Y|Mux)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "STH(H|Y|RL|Mux)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "STCM(H|Y)?$")>;
//===----------------------------------------------------------------------===//
// Multi-register moves
//===----------------------------------------------------------------------===//
// Load multiple (estimated average of 5 ops)
-def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
- (instregex "LM(H|Y|G)?$")>;
+def : InstRW<[WLat10, WLat10, LSU5, GroupAlone], (instregex "LM(H|Y|G)?$")>;
// Load multiple disjoint
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "LMD$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "LMD$")>;
// Store multiple (estimated average of 3 ops)
-def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone],
- (instregex "STM(H|Y|G)?$")>;
+def : InstRW<[WLat1, LSU2, FXU5, GroupAlone], (instregex "STM(H|Y|G)?$")>;
//===----------------------------------------------------------------------===//
// Byte swaps
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU], (instregex "LRV(G)?R$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LRV(G|H)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "STRV(G|H)?$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LRV(G)?R$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "LRV(G|H)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "STRV(G|H)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVCIN$")>;
//===----------------------------------------------------------------------===//
// Load address instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU], (instregex "LA(Y|RL)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LA(Y|RL)?$")>;
// Load the Global Offset Table address
-def : InstRW<[FXU], (instregex "GOT$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "GOT$")>;
//===----------------------------------------------------------------------===//
// Absolute and Negation
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat2], (instregex "LP(G)?R$")>;
-def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "L(N|P)GFR$")>;
-def : InstRW<[FXU, Lat2], (instregex "LN(R|GR)$")>;
-def : InstRW<[FXU], (instregex "LC(R|GR)$")>;
-def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "LCGFR$")>;
+def : InstRW<[WLat2, WLat2, FXU, NormalGr], (instregex "LP(G)?R$")>;
+def : InstRW<[WLat3, WLat3, FXU2, GroupAlone], (instregex "L(N|P)GFR$")>;
+def : InstRW<[WLat2, WLat2, FXU, NormalGr], (instregex "LN(R|GR)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LC(R|GR)$")>;
+def : InstRW<[WLat2, WLat2, FXU2, GroupAlone], (instregex "LCGFR$")>;
//===----------------------------------------------------------------------===//
// Insertion
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "IC(Y)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "IC32(Y)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "ICM(H|Y)?$")>;
-def : InstRW<[FXU], (instregex "II(F|H|L)Mux$")>;
-def : InstRW<[FXU], (instregex "IIHF(64)?$")>;
-def : InstRW<[FXU], (instregex "IIHH(64)?$")>;
-def : InstRW<[FXU], (instregex "IIHL(64)?$")>;
-def : InstRW<[FXU], (instregex "IILF(64)?$")>;
-def : InstRW<[FXU], (instregex "IILH(64)?$")>;
-def : InstRW<[FXU], (instregex "IILL(64)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "IC(Y)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "IC32(Y)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "ICM(H|Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "II(F|H|L)Mux$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "IIHF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "IIHH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "IIHL(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "IILF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "IILH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "IILL(64)?$")>;
//===----------------------------------------------------------------------===//
// Addition
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?(Y|SI)?$")>;
-def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "AH(Y)?$")>;
-def : InstRW<[FXU], (instregex "AIH$")>;
-def : InstRW<[FXU], (instregex "AFI(Mux)?$")>;
-def : InstRW<[FXU], (instregex "AGFI$")>;
-def : InstRW<[FXU], (instregex "AGHI(K)?$")>;
-def : InstRW<[FXU], (instregex "AGR(K)?$")>;
-def : InstRW<[FXU], (instregex "AHI(K)?$")>;
-def : InstRW<[FXU], (instregex "AHIMux(K)?$")>;
-def : InstRW<[FXU], (instregex "AL(FI|HSIK)$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "ALGF$")>;
-def : InstRW<[FXU], (instregex "ALGHSIK$")>;
-def : InstRW<[FXU], (instregex "ALGF(I|R)$")>;
-def : InstRW<[FXU], (instregex "ALGR(K)?$")>;
-def : InstRW<[FXU], (instregex "ALR(K)?$")>;
-def : InstRW<[FXU], (instregex "AR(K)?$")>;
-def : InstRW<[FXU], (instregex "A(L)?HHHR$")>;
-def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "A(L)?HHLR$")>;
-def : InstRW<[FXU], (instregex "ALSIH(N)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?G(SI)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "A(L)?(Y)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, FXU, LSU, NormalGr], (instregex "A(L)?SI$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXU2, LSU, GroupAlone],
+ (instregex "AH(Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AIH$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AFI(Mux)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AGFI$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AGHI(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AGR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AHI(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AHIMux(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AL(FI|HSIK)$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "ALGF$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "ALGHSIK$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "ALGF(I|R)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "ALGR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "ALR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "A(L)?HHHR$")>;
+def : InstRW<[WLat2, WLat2, FXU2, GroupAlone], (instregex "A(L)?HHLR$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "ALSIH(N)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "A(L)?G$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "A(L)?GSI$")>;
// Logical addition with carry
-def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "ALC(G)?$")>;
-def : InstRW<[FXU, Lat3, GroupAlone], (instregex "ALC(G)?R$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXU, LSU, GroupAlone],
+ (instregex "ALC(G)?$")>;
+def : InstRW<[WLat2, WLat2, FXU, GroupAlone], (instregex "ALC(G)?R$")>;
// Add with sign extension (32 -> 64)
-def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "AGF$")>;
-def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "AGFR$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXU2, LSU, GroupAlone],
+ (instregex "AGF$")>;
+def : InstRW<[WLat2, WLat2, FXU2, GroupAlone], (instregex "AGFR$")>;
//===----------------------------------------------------------------------===//
// Subtraction
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "S(G|Y)?$")>;
-def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "SH(Y)?$")>;
-def : InstRW<[FXU], (instregex "SGR(K)?$")>;
-def : InstRW<[FXU], (instregex "SLFI$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "SL(G|GF|Y)?$")>;
-def : InstRW<[FXU], (instregex "SLGF(I|R)$")>;
-def : InstRW<[FXU], (instregex "SLGR(K)?$")>;
-def : InstRW<[FXU], (instregex "SLR(K)?$")>;
-def : InstRW<[FXU], (instregex "SR(K)?$")>;
-def : InstRW<[FXU], (instregex "S(L)?HHHR$")>;
-def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "S(L)?HHLR$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "S(G|Y)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXU2, LSU, GroupAlone],
+ (instregex "SH(Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SGR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SLFI$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "SL(G|GF|Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SLGF(I|R)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SLGR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SLR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "S(L)?HHHR$")>;
+def : InstRW<[WLat2, WLat2, FXU2, GroupAlone], (instregex "S(L)?HHLR$")>;
// Subtraction with borrow
-def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "SLB(G)?$")>;
-def : InstRW<[FXU, Lat3, GroupAlone], (instregex "SLB(G)?R$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXU, LSU, GroupAlone],
+ (instregex "SLB(G)?$")>;
+def : InstRW<[WLat2, WLat2, FXU, GroupAlone], (instregex "SLB(G)?R$")>;
// Subtraction with sign extension (32 -> 64)
-def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "SGF$")>;
-def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "SGFR$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXU2, LSU, GroupAlone],
+ (instregex "SGF$")>;
+def : InstRW<[WLat2, WLat2, FXU2, GroupAlone], (instregex "SGFR$")>;
//===----------------------------------------------------------------------===//
// AND
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "N(G|Y)?$")>;
-def : InstRW<[FXU], (instregex "NGR(K)?$")>;
-def : InstRW<[FXU], (instregex "NI(FMux|HMux|LMux)$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "NI(Y)?$")>;
-def : InstRW<[FXU], (instregex "NIHF(64)?$")>;
-def : InstRW<[FXU], (instregex "NIHH(64)?$")>;
-def : InstRW<[FXU], (instregex "NIHL(64)?$")>;
-def : InstRW<[FXU], (instregex "NILF(64)?$")>;
-def : InstRW<[FXU], (instregex "NILH(64)?$")>;
-def : InstRW<[FXU], (instregex "NILL(64)?$")>;
-def : InstRW<[FXU], (instregex "NR(K)?$")>;
-def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "NC$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "N(G|Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NGR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NI(FMux|HMux|LMux)$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "NI(Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NIHF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NIHH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NIHL(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NILF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NILH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NILL(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NR(K)?$")>;
+def : InstRW<[WLat5LSU, LSU2, FXU, GroupAlone], (instregex "NC$")>;
//===----------------------------------------------------------------------===//
// OR
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "O(G|Y)?$")>;
-def : InstRW<[FXU], (instregex "OGR(K)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "OI(Y)?$")>;
-def : InstRW<[FXU], (instregex "OI(FMux|HMux|LMux)$")>;
-def : InstRW<[FXU], (instregex "OIHF(64)?$")>;
-def : InstRW<[FXU], (instregex "OIHH(64)?$")>;
-def : InstRW<[FXU], (instregex "OIHL(64)?$")>;
-def : InstRW<[FXU], (instregex "OILF(64)?$")>;
-def : InstRW<[FXU], (instregex "OILH(64)?$")>;
-def : InstRW<[FXU], (instregex "OILL(64)?$")>;
-def : InstRW<[FXU], (instregex "OR(K)?$")>;
-def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "OC$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "O(G|Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OGR(K)?$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "OI(Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OI(FMux|HMux|LMux)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OIHF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OIHH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OIHL(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OILF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OILH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OILL(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OR(K)?$")>;
+def : InstRW<[WLat5LSU, LSU2, FXU, GroupAlone], (instregex "OC$")>;
//===----------------------------------------------------------------------===//
// XOR
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "X(G|Y)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "XI(Y)?$")>;
-def : InstRW<[FXU], (instregex "XIFMux$")>;
-def : InstRW<[FXU], (instregex "XGR(K)?$")>;
-def : InstRW<[FXU], (instregex "XIHF(64)?$")>;
-def : InstRW<[FXU], (instregex "XILF(64)?$")>;
-def : InstRW<[FXU], (instregex "XR(K)?$")>;
-def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "XC$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "X(G|Y)?$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "XI(Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "XIFMux$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "XGR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "XIHF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "XILF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "XR(K)?$")>;
+def : InstRW<[WLat5LSU, LSU2, FXU, GroupAlone], (instregex "XC$")>;
//===----------------------------------------------------------------------===//
// Multiplication
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat10], (instregex "MS(GF|Y)?$")>;
-def : InstRW<[FXU, Lat6], (instregex "MS(R|FI)$")>;
-def : InstRW<[FXU, LSU, Lat12], (instregex "MSG$")>;
-def : InstRW<[FXU, Lat8], (instregex "MSGR$")>;
-def : InstRW<[FXU, Lat6], (instregex "MSGF(I|R)$")>;
-def : InstRW<[FXU, FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>;
-def : InstRW<[FXU, FXU, Lat9, GroupAlone], (instregex "MLGR$")>;
-def : InstRW<[FXU, Lat5], (instregex "MGHI$")>;
-def : InstRW<[FXU, Lat5], (instregex "MHI$")>;
-def : InstRW<[FXU, LSU, Lat9], (instregex "MH(Y)?$")>;
-def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>;
-def : InstRW<[FXU, FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>;
+def : InstRW<[WLat6LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "MS(GF|Y)?$")>;
+def : InstRW<[WLat6, FXU, NormalGr], (instregex "MS(R|FI)$")>;
+def : InstRW<[WLat8LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "MSG$")>;
+def : InstRW<[WLat8, FXU, NormalGr], (instregex "MSGR$")>;
+def : InstRW<[WLat6, FXU, NormalGr], (instregex "MSGF(I|R)$")>;
+def : InstRW<[WLat11LSU, RegReadAdv, FXU2, LSU, GroupAlone],
+ (instregex "MLG$")>;
+def : InstRW<[WLat9, FXU2, GroupAlone], (instregex "MLGR$")>;
+def : InstRW<[WLat5, FXU, NormalGr], (instregex "MGHI$")>;
+def : InstRW<[WLat5, FXU, NormalGr], (instregex "MHI$")>;
+def : InstRW<[WLat5LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "MH(Y)?$")>;
+def : InstRW<[WLat7, FXU2, GroupAlone], (instregex "M(L)?R$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, FXU2, LSU, GroupAlone],
+ (instregex "M(FY|L)?$")>;
//===----------------------------------------------------------------------===//
// Division and remainder
//===----------------------------------------------------------------------===//
-def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
- (instregex "DR$")>;
-def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
- (instregex "D$")>;
-def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
- (instregex "DSG(F)?R$")>;
-def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, Lat30, GroupAlone],
- (instregex "DSG(F)?$")>;
-def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
- (instregex "DL(G)?R$")>;
-def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
- (instregex "DL(G)?$")>;
+def : InstRW<[WLat30, FPU4, FXU5, GroupAlone], (instregex "DR$")>;
+def : InstRW<[WLat30, RegReadAdv, FPU4, LSU, FXU4, GroupAlone],
+ (instregex "D$")>;
+def : InstRW<[WLat30, FPU4, FXU4, GroupAlone], (instregex "DSG(F)?R$")>;
+def : InstRW<[WLat30, RegReadAdv, FPU4, LSU, FXU3, GroupAlone],
+ (instregex "DSG(F)?$")>;
+def : InstRW<[WLat30, FPU4, FXU5, GroupAlone], (instregex "DL(G)?R$")>;
+def : InstRW<[WLat30, RegReadAdv, FPU4, LSU, FXU4, GroupAlone],
+ (instregex "DL(G)?$")>;
//===----------------------------------------------------------------------===//
// Shifts
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU], (instregex "SLL(G|K)?$")>;
-def : InstRW<[FXU], (instregex "SRL(G|K)?$")>;
-def : InstRW<[FXU], (instregex "SRA(G|K)?$")>;
-def : InstRW<[FXU, Lat2], (instregex "SLA(G|K)?$")>;
-def : InstRW<[FXU, FXU, FXU, FXU, LSU, Lat8, GroupAlone],
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SLL(G|K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SRL(G|K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SRA(G|K)?$")>;
+def : InstRW<[WLat2, WLat2, FXU, NormalGr], (instregex "SLA(G|K)?$")>;
+def : InstRW<[WLat5LSU, WLat5LSU, FXU4, LSU, GroupAlone],
(instregex "S(L|R)D(A|L)$")>;
// Rotate
-def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>;
+def : InstRW<[WLat2LSU, FXU, LSU, NormalGr], (instregex "RLL(G)?$")>;
// Rotate and insert
-def : InstRW<[FXU], (instregex "RISBG(32)?$")>;
-def : InstRW<[FXU], (instregex "RISBH(G|H|L)$")>;
-def : InstRW<[FXU], (instregex "RISBL(G|H|L)$")>;
-def : InstRW<[FXU], (instregex "RISBMux$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "RISBG(32)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "RISBH(G|H|L)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "RISBL(G|H|L)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "RISBMux$")>;
// Rotate and Select
-def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "R(N|O|X)SBG$")>;
+def : InstRW<[WLat3, WLat3, FXU2, GroupAlone], (instregex "R(N|O|X)SBG$")>;
//===----------------------------------------------------------------------===//
// Comparison
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "C(G|Y|Mux|RL)?$")>;
-def : InstRW<[FXU], (instregex "C(F|H)I(Mux)?$")>;
-def : InstRW<[FXU], (instregex "CG(F|H)I$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CG(HSI|RL)$")>;
-def : InstRW<[FXU], (instregex "C(G)?R$")>;
-def : InstRW<[FXU], (instregex "CIH$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CH(F|SI)$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CL(Y|Mux|FHSI)?$")>;
-def : InstRW<[FXU], (instregex "CLFI(Mux)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CLG(HRL|HSI)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CLGF(RL)?$")>;
-def : InstRW<[FXU], (instregex "CLGF(I|R)$")>;
-def : InstRW<[FXU], (instregex "CLGR$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CLGRL$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CLH(F|RL|HSI)$")>;
-def : InstRW<[FXU], (instregex "CLIH$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CLI(Y)?$")>;
-def : InstRW<[FXU], (instregex "CLR$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CLRL$")>;
-def : InstRW<[FXU], (instregex "C(L)?HHR$")>;
-def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "C(L)?HLR$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "C(G|Y|Mux|RL)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "C(F|H)I(Mux)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CG(F|H)I$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CG(HSI|RL)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "C(G)?R$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CIH$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "CHF$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CHSI$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "CL(Y|Mux)?$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CLFHSI$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CLFI(Mux)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "CLG$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CLG(HRL|HSI)$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "CLGF$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CLGFRL$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CLGF(I|R)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CLGR$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CLGRL$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "CLHF$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CLH(RL|HSI)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CLIH$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CLI(Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CLR$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CLRL$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "C(L)?HHR$")>;
+def : InstRW<[WLat2, FXU2, GroupAlone], (instregex "C(L)?HLR$")>;
// Compare halfword
-def : InstRW<[FXU, LSU, FXU, Lat6, GroupAlone], (instregex "CH(Y|RL)?$")>;
-def : InstRW<[FXU, LSU, FXU, Lat6, GroupAlone], (instregex "CGH(RL)?$")>;
-def : InstRW<[FXU, LSU, FXU, Lat6, GroupAlone], (instregex "CHHSI$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXU2, LSU, GroupAlone],
+ (instregex "CH(Y)?$")>;
+def : InstRW<[WLat2LSU, FXU2, LSU, GroupAlone], (instregex "CHRL$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXU2, LSU, GroupAlone], (instregex "CGH$")>;
+def : InstRW<[WLat2LSU, FXU2, LSU, GroupAlone], (instregex "CGHRL$")>;
+def : InstRW<[WLat2LSU, FXU2, LSU, GroupAlone], (instregex "CHHSI$")>;
// Compare with sign extension (32 -> 64)
-def : InstRW<[FXU, FXU, LSU, Lat6, Lat2, GroupAlone], (instregex "CGF(RL)?$")>;
-def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "CGFR$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXU2, LSU, GroupAlone], (instregex "CGF$")>;
+def : InstRW<[WLat2LSU, FXU2, LSU, GroupAlone], (instregex "CGFRL$")>;
+def : InstRW<[WLat2, FXU2, GroupAlone], (instregex "CGFR$")>;
// Compare logical character
-def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "CLC$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>;
+def : InstRW<[WLat9, FXU, LSU2, GroupAlone], (instregex "CLC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CLCL(E|U)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CLST$")>;
// Test under mask
-def : InstRW<[FXU, LSU, Lat5], (instregex "TM(Y)?$")>;
-def : InstRW<[FXU], (instregex "TM(H|L)Mux$")>;
-def : InstRW<[FXU], (instregex "TMHH(64)?$")>;
-def : InstRW<[FXU], (instregex "TMHL(64)?$")>;
-def : InstRW<[FXU], (instregex "TMLH(64)?$")>;
-def : InstRW<[FXU], (instregex "TMLL(64)?$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "TM(Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "TM(H|L)Mux$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "TMHH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "TMHL(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "TMLH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "TMLL(64)?$")>;
// Compare logical characters under mask
-def : InstRW<[FXU, FXU, LSU, Lat5, GroupAlone], (instregex "CLM(H|Y)?$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXU2, LSU, GroupAlone],
+ (instregex "CLM(H|Y)?$")>;
//===----------------------------------------------------------------------===//
// Prefetch
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU, GroupAlone], (instregex "PFD(RL)?$")>;
+def : InstRW<[WLat1, LSU, GroupAlone], (instregex "PFD(RL)?$")>;
//===----------------------------------------------------------------------===//
// Atomic operations
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU, EndGroup], (instregex "Serialize$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "Serialize$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LAA(G)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LAAL(G)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LAN(G)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LAO(G)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LAX(G)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, FXU, LSU, NormalGr], (instregex "LAA(G)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, FXU, LSU, NormalGr], (instregex "LAAL(G)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, FXU, LSU, NormalGr], (instregex "LAN(G)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, FXU, LSU, NormalGr], (instregex "LAO(G)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, FXU, LSU, NormalGr], (instregex "LAX(G)?$")>;
// Test and set
-def : InstRW<[FXU, LSU, Lat5, EndGroup], (instregex "TS$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, EndGroup], (instregex "TS$")>;
// Compare and swap
-def : InstRW<[FXU, LSU, FXU, Lat6, GroupAlone], (instregex "CS(G|Y)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, FXU2, LSU, GroupAlone],
+ (instregex "CS(G|Y)?$")>;
// Compare double and swap
-def : InstRW<[FXU, FXU, FXU, FXU, FXU, LSU, Lat10, GroupAlone],
+def : InstRW<[WLat5LSU, WLat5LSU, FXU5, LSU, GroupAlone],
(instregex "CDS(Y)?$")>;
-def : InstRW<[FXU, FXU, FXU, FXU, FXU, FXU, LSU, LSU, Lat12, GroupAlone],
+def : InstRW<[WLat12, WLat12, FXU6, LSU2, GroupAlone],
(instregex "CDSG$")>;
// Compare and swap and store
-def : InstRW<[FXU, LSU, Lat30], (instregex "CSST$")>;
+def : InstRW<[WLat30, MCD], (instregex "CSST$")>;
// Perform locked operation
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>;
+def : InstRW<[WLat30, MCD], (instregex "PLO$")>;
// Load/store pair from/to quadword
-def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPQ$")>;
-def : InstRW<[FXU, FXU, LSU, LSU, Lat6, GroupAlone], (instregex "STPQ$")>;
+def : InstRW<[WLat4LSU, LSU2, GroupAlone], (instregex "LPQ$")>;
+def : InstRW<[WLat1, FXU2, LSU2, GroupAlone], (instregex "STPQ$")>;
// Load pair disjoint
-def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, LSU2, GroupAlone], (instregex "LPD(G)?$")>;
//===----------------------------------------------------------------------===//
// Translate and convert
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TR$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TRT$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "TRTR$")>;
-def : InstRW<[FXU, Lat30], (instregex "TR(TR)?(T)?(E|EOpt)?$")>;
-def : InstRW<[LSU, Lat30], (instregex "TR(T|O)(T|O)(Opt)?$")>;
-def : InstRW<[FXU, Lat30], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
-def : InstRW<[FXU, Lat30], (instregex "(CUUTF|CUTFU)(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "TR$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TRT$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TRTR$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "TRE$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TRT(R)?E(Opt)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TR(T|O)(T|O)(Opt)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD],
+ (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "(CUUTF|CUTFU)(Opt)?$")>;
//===----------------------------------------------------------------------===//
// Message-security assist
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat30], (instregex "KM(C|F|O|CTR)?$")>;
-def : InstRW<[FXU, Lat30], (instregex "(KIMD|KLMD|KMAC|PCC)$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD],
+ (instregex "KM(C|F|O|CTR)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "(KIMD|KLMD|KMAC|PCC)$")>;
//===----------------------------------------------------------------------===//
// Decimal arithmetic
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, DFU2, LSU, LSU, Lat30, GroupAlone], (instregex "CVBG$")>;
-def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y)?$")>;
-def : InstRW<[FXU, FXU, FXU, DFU2, DFU2, LSU, Lat30, GroupAlone],
- (instregex "CVDG$")>;
-def : InstRW<[FXU, FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVD(Y)?$")>;
-def : InstRW<[LSU, Lat10, GroupAlone], (instregex "MVO$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z)$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK$")>;
-def : InstRW<[LSU, Lat12, GroupAlone], (instregex "UNPK(A|U)$")>;
+def : InstRW<[WLat30, RegReadAdv, FXU, DFU2, LSU2, GroupAlone],
+ (instregex "CVBG$")>;
+def : InstRW<[WLat20, RegReadAdv, FXU, DFU, LSU, GroupAlone],
+ (instregex "CVB(Y)?$")>;
+def : InstRW<[WLat1, FXU3, DFU4, LSU, GroupAlone], (instregex "CVDG$")>;
+def : InstRW<[WLat1, FXU2, DFU, LSU, GroupAlone], (instregex "CVD(Y)?$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "MV(N|O|Z)$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
+def : InstRW<[WLat10, LSU5, GroupAlone], (instregex "UNPK(A|U)$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "UNPK$")>;
-def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat15, GroupAlone],
+def : InstRW<[WLat11LSU, FXU, DFU4, LSU2, GroupAlone],
(instregex "(A|S|ZA)P$")>;
-def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat30, GroupAlone],
- (instregex "(M|D)P$")>;
-def : InstRW<[FXU, FXU, DFU2, DFU2, LSU, LSU, LSU, Lat15, GroupAlone],
- (instregex "SRP$")>;
-def : InstRW<[DFU2, DFU2, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>;
-def : InstRW<[DFU2, LSU, LSU, GroupAlone], (instregex "TP$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>;
+def : InstRW<[WLat1, FXU, DFU4, LSU2, GroupAlone], (instregex "(M|D)P$")>;
+def : InstRW<[WLat15, FXU2, DFU4, LSU3, GroupAlone], (instregex "SRP$")>;
+def : InstRW<[WLat11, DFU4, LSU2, GroupAlone], (instregex "CP$")>;
+def : InstRW<[WLat5LSU, DFU2, LSU2, GroupAlone], (instregex "TP$")>;
+def : InstRW<[WLat30, MCD], (instregex "ED(MK)?$")>;
//===----------------------------------------------------------------------===//
// Access registers
//===----------------------------------------------------------------------===//
// Extract/set/copy access register
-def : InstRW<[LSU], (instregex "(EAR|SAR|CPYA)$")>;
+def : InstRW<[WLat3, LSU, NormalGr], (instregex "(EAR|SAR|CPYA)$")>;
// Load address extended
-def : InstRW<[LSU, FXU, Lat5, GroupAlone], (instregex "LAE(Y)?$")>;
+def : InstRW<[WLat5, LSU, FXU, GroupAlone], (instregex "LAE(Y)?$")>;
// Load/store access multiple (not modeled precisely)
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(L|ST)AM(Y)?$")>;
+def : InstRW<[WLat10, WLat10, LSU5, GroupAlone], (instregex "LAM(Y)?$")>;
+def : InstRW<[WLat1, FXU5, LSU5, GroupAlone], (instregex "STAM(Y)?$")>;
//===----------------------------------------------------------------------===//
// Program mask and addressing mode
//===----------------------------------------------------------------------===//
// Insert Program Mask
-def : InstRW<[FXU, Lat3, EndGroup], (instregex "IPM$")>;
+def : InstRW<[WLat3, FXU, EndGroup], (instregex "IPM$")>;
// Set Program Mask
-def : InstRW<[LSU, EndGroup], (instregex "SPM$")>;
+def : InstRW<[WLat3, LSU, EndGroup], (instregex "SPM$")>;
// Branch and link
-def : InstRW<[FXU, FXU, LSU, Lat8, GroupAlone], (instregex "BAL(R)?$")>;
+def : InstRW<[WLat1, FXU2, LSU, GroupAlone], (instregex "BAL(R)?$")>;
// Test addressing mode
-def : InstRW<[FXU], (instregex "TAM$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "TAM$")>;
// Set addressing mode
-def : InstRW<[LSU, EndGroup], (instregex "SAM(24|31|64)$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SAM(24|31|64)$")>;
// Branch (and save) and set mode.
-def : InstRW<[FXU, LSU, Lat5, GroupAlone], (instregex "BSM$")>;
-def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "BASSM$")>;
+def : InstRW<[WLat1, FXU, LSU, GroupAlone], (instregex "BSM$")>;
+def : InstRW<[WLat1, FXU2, LSU, GroupAlone], (instregex "BASSM$")>;
//===----------------------------------------------------------------------===//
// Miscellaneous Instructions.
//===----------------------------------------------------------------------===//
// Find leftmost one
-def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "FLOGR$")>;
+def : InstRW<[WLat7, WLat7, FXU2, GroupAlone], (instregex "FLOGR$")>;
// Population count
-def : InstRW<[FXU, Lat3], (instregex "POPCNT$")>;
-
-// Extend
-def : InstRW<[FXU], (instregex "AEXT128$")>;
-def : InstRW<[FXU], (instregex "ZEXT128$")>;
+def : InstRW<[WLat3, WLat3, FXU, NormalGr], (instregex "POPCNT$")>;
// String instructions
-def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>;
-def : InstRW<[FXU, Lat30], (instregex "SRSTU$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "SRST(U)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CUSE$")>;
// Various complex instructions
-def : InstRW<[LSU, Lat30], (instregex "CFC$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "UPT$")>;
-def : InstRW<[LSU, Lat30], (instregex "CKSM$")>;
-def : InstRW<[FXU, Lat30], (instregex "CMPSC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "CFC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, WLat30, WLat30, MCD],
+ (instregex "UPT$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CKSM$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "CMPSC$")>;
// Execute
def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>;
@@ -665,167 +696,155 @@ def : InstRW<[], (instregex "Insn.*")>;
// ----------------------------- Floating point ----------------------------- //
//===----------------------------------------------------------------------===//
-// FP: Select instructions
-//===----------------------------------------------------------------------===//
-
-def : InstRW<[FXU], (instregex "SelectF(32|64|128)$")>;
-def : InstRW<[FXU], (instregex "CondStoreF32(Inv)?$")>;
-def : InstRW<[FXU], (instregex "CondStoreF64(Inv)?$")>;
-
-//===----------------------------------------------------------------------===//
// FP: Move instructions
//===----------------------------------------------------------------------===//
// Load zero
-def : InstRW<[FXU], (instregex "LZ(DR|ER)$")>;
-def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "LZXR$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LZ(DR|ER)$")>;
+def : InstRW<[WLat2, FXU2, GroupAlone], (instregex "LZXR$")>;
// Load
-def : InstRW<[FXU], (instregex "LER$")>;
-def : InstRW<[FXU], (instregex "LD(R|R32|GR)$")>;
-def : InstRW<[FXU, Lat3], (instregex "LGDR$")>;
-def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "LXR$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LER$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[WLat3, FXU, NormalGr], (instregex "LGDR$")>;
+def : InstRW<[WLat2, FXU2, GroupAlone], (instregex "LXR$")>;
// Load and Test
-def : InstRW<[FPU], (instregex "LT(D|E)BR$")>;
-def : InstRW<[FPU], (instregex "LTEBRCompare(_VecPseudo)?$")>;
-def : InstRW<[FPU], (instregex "LTDBRCompare(_VecPseudo)?$")>;
-def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "LTXBR$")>;
-def : InstRW<[FPU2, FPU2, Lat9, GroupAlone],
- (instregex "LTXBRCompare(_VecPseudo)?$")>;
+def : InstRW<[WLat9, WLat9, FPU, NormalGr], (instregex "LT(E|D)BR$")>;
+def : InstRW<[WLat9, FPU, NormalGr], (instregex "LT(E|D)BRCompare$")>;
+def : InstRW<[WLat10, WLat10, FPU4, GroupAlone], (instregex "LTXBR(Compare)?$")>;
// Copy sign
-def : InstRW<[FXU, FXU, Lat5, GroupAlone], (instregex "CPSDRd(d|s)$")>;
-def : InstRW<[FXU, FXU, Lat5, GroupAlone], (instregex "CPSDRs(d|s)$")>;
+def : InstRW<[WLat5, FXU2, GroupAlone], (instregex "CPSDR(d|s)(d|s)$")>;
//===----------------------------------------------------------------------===//
// FP: Load instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU], (instregex "LE(Y)?$")>;
-def : InstRW<[LSU], (instregex "LD(Y|E32)?$")>;
-def : InstRW<[LSU], (instregex "LX$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L(E|D)(Y|E32)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>;
//===----------------------------------------------------------------------===//
// FP: Store instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat7], (instregex "STD(Y)?$")>;
-def : InstRW<[FXU, LSU, Lat7], (instregex "STE(Y)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "STX$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "ST(E|D)(Y)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "STX$")>;
//===----------------------------------------------------------------------===//
// FP: Conversion instructions
//===----------------------------------------------------------------------===//
// Load rounded
-def : InstRW<[FPU], (instregex "LEDBR(A)?$")>;
-def : InstRW<[FPU, FPU, Lat20], (instregex "LEXBR(A)?$")>;
-def : InstRW<[FPU, FPU, Lat20], (instregex "LDXBR(A)?$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "LEDBR(A)?$")>;
+def : InstRW<[WLat9, FPU2, NormalGr], (instregex "L(E|D)XBR(A)?$")>;
// Load lengthened
-def : InstRW<[FPU, LSU, Lat12], (instregex "LDEB$")>;
-def : InstRW<[FPU], (instregex "LDEBR$")>;
-def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "LX(D|E)B$")>;
-def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "LX(D|E)BR$")>;
+def : InstRW<[WLat7LSU, FPU, LSU, NormalGr], (instregex "LDEB$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "LDEBR$")>;
+def : InstRW<[WLat11LSU, FPU4, LSU, GroupAlone], (instregex "LX(E|D)B$")>;
+def : InstRW<[WLat10, FPU4, GroupAlone], (instregex "LX(E|D)BR$")>;
// Convert from fixed / logical
-def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CE(F|G)BR(A)?$")>;
-def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CD(F|G)BR(A)?$")>;
-def : InstRW<[FXU, FPU2, FPU2, Lat11, GroupAlone], (instregex "CX(F|G)BR(A)?$")>;
-def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CEL(F|G)BR$")>;
-def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CDL(F|G)BR$")>;
-def : InstRW<[FXU, FPU2, FPU2, Lat11, GroupAlone], (instregex "CXL(F|G)BR$")>;
+def : InstRW<[WLat8, FXU, FPU, GroupAlone], (instregex "C(E|D)(F|G)BR(A)?$")>;
+def : InstRW<[WLat11, FXU, FPU4, GroupAlone], (instregex "CX(F|G)BR(A?)$")>;
+def : InstRW<[WLat8, FXU, FPU, GroupAlone], (instregex "CEL(F|G)BR$")>;
+def : InstRW<[WLat8, FXU, FPU, GroupAlone], (instregex "CDL(F|G)BR$")>;
+def : InstRW<[WLat11, FXU, FPU4, GroupAlone], (instregex "CXL(F|G)BR$")>;
// Convert to fixed / logical
-def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CF(E|D)BR(A)?$")>;
-def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CG(E|D)BR(A)?$")>;
-def : InstRW<[FXU, FPU, FPU, Lat20, GroupAlone], (instregex "C(F|G)XBR(A)?$")>;
-def : InstRW<[FXU, FPU, Lat11, GroupAlone], (instregex "CLF(E|D)BR$")>;
-def : InstRW<[FXU, FPU, Lat11, GroupAlone], (instregex "CLG(E|D)BR$")>;
-def : InstRW<[FXU, FPU, FPU, Lat20, GroupAlone], (instregex "CL(F|G)XBR$")>;
+def : InstRW<[WLat12, WLat12, FXU, FPU, GroupAlone],
+ (instregex "C(F|G)(E|D)BR(A?)$")>;
+def : InstRW<[WLat12, WLat12, FXU, FPU2, GroupAlone],
+ (instregex "C(F|G)XBR(A?)$")>;
+def : InstRW<[WLat12, WLat12, FXU, FPU, GroupAlone],
+ (instregex "CL(F|G)(E|D)BR$")>;
+def : InstRW<[WLat12, WLat12, FXU, FPU2, GroupAlone], (instregex "CL(F|G)XBR$")>;
//===----------------------------------------------------------------------===//
// FP: Unary arithmetic
//===----------------------------------------------------------------------===//
// Load Complement / Negative / Positive
-def : InstRW<[FPU], (instregex "L(C|N|P)DBR$")>;
-def : InstRW<[FPU], (instregex "L(C|N|P)EBR$")>;
-def : InstRW<[FXU], (instregex "LCDFR(_32)?$")>;
-def : InstRW<[FXU], (instregex "LNDFR(_32)?$")>;
-def : InstRW<[FXU], (instregex "LPDFR(_32)?$")>;
-def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "L(C|N|P)XBR$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>;
+def : InstRW<[WLat10, WLat10, FPU4, GroupAlone], (instregex "L(C|N|P)XBR$")>;
// Square root
-def : InstRW<[FPU, LSU, Lat30], (instregex "SQ(E|D)B$")>;
-def : InstRW<[FPU, Lat30], (instregex "SQ(E|D)BR$")>;
-def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "SQXBR$")>;
+def : InstRW<[WLat30, FPU, LSU, NormalGr], (instregex "SQ(E|D)B$")>;
+def : InstRW<[WLat30, FPU, NormalGr], (instregex "SQ(E|D)BR$")>;
+def : InstRW<[WLat30, FPU4, GroupAlone], (instregex "SQXBR$")>;
// Load FP integer
-def : InstRW<[FPU], (instregex "FIEBR(A)?$")>;
-def : InstRW<[FPU], (instregex "FIDBR(A)?$")>;
-def : InstRW<[FPU2, FPU2, Lat15, GroupAlone], (instregex "FIXBR(A)?$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "FI(E|D)BR(A)?$")>;
+def : InstRW<[WLat15, FPU4, GroupAlone], (instregex "FIXBR(A)?$")>;
//===----------------------------------------------------------------------===//
// FP: Binary arithmetic
//===----------------------------------------------------------------------===//
// Addition
-def : InstRW<[FPU, LSU, Lat12], (instregex "A(E|D)B$")>;
-def : InstRW<[FPU], (instregex "A(E|D)BR$")>;
-def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "AXBR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, FPU, LSU, NormalGr],
+ (instregex "A(E|D)B$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "A(E|D)BR$")>;
+def : InstRW<[WLat20, WLat20, FPU4, GroupAlone], (instregex "AXBR$")>;
// Subtraction
-def : InstRW<[FPU, LSU, Lat12], (instregex "S(E|D)B$")>;
-def : InstRW<[FPU], (instregex "S(E|D)BR$")>;
-def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "SXBR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, FPU, LSU, NormalGr],
+ (instregex "S(E|D)B$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "S(E|D)BR$")>;
+def : InstRW<[WLat20, WLat20, FPU4, GroupAlone], (instregex "SXBR$")>;
// Multiply
-def : InstRW<[FPU, LSU, Lat12], (instregex "M(D|DE|EE)B$")>;
-def : InstRW<[FPU], (instregex "M(D|DE|EE)BR$")>;
-def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MXDB$")>;
-def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDBR$")>;
-def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXBR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, FPU, LSU, NormalGr],
+ (instregex "M(D|DE|EE)B$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "M(D|DE|EE)BR$")>;
+def : InstRW<[WLat11LSU, RegReadAdv, FPU4, LSU, GroupAlone],
+ (instregex "MXDB$")>;
+def : InstRW<[WLat10, FPU4, GroupAlone], (instregex "MXDBR$")>;
+def : InstRW<[WLat30, FPU4, GroupAlone], (instregex "MXBR$")>;
// Multiply and add / subtract
-def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>;
-def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)EBR$")>;
-def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>;
-def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DBR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, FPU2, LSU, GroupAlone],
+ (instregex "M(A|S)EB$")>;
+def : InstRW<[WLat7, FPU, GroupAlone], (instregex "M(A|S)EBR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, FPU2, LSU, GroupAlone],
+ (instregex "M(A|S)DB$")>;
+def : InstRW<[WLat7, FPU, GroupAlone], (instregex "M(A|S)DBR$")>;
// Division
-def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)B$")>;
-def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>;
-def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>;
+def : InstRW<[WLat30, RegReadAdv, FPU, LSU, NormalGr], (instregex "D(E|D)B$")>;
+def : InstRW<[WLat30, FPU, NormalGr], (instregex "D(E|D)BR$")>;
+def : InstRW<[WLat30, FPU4, GroupAlone], (instregex "DXBR$")>;
// Divide to integer
-def : InstRW<[FPU, Lat30], (instregex "DI(E|D)BR$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "DI(E|D)BR$")>;
//===----------------------------------------------------------------------===//
// FP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[FPU, LSU, Lat12], (instregex "(K|C)(E|D)B$")>;
-def : InstRW<[FPU], (instregex "(K|C)(E|D)BR$")>;
-def : InstRW<[FPU, FPU, Lat30], (instregex "(K|C)XBR$")>;
+def : InstRW<[WLat11LSU, RegReadAdv, FPU, LSU, NormalGr],
+ (instregex "(K|C)(E|D)B$")>;
+def : InstRW<[WLat9, FPU, NormalGr], (instregex "(K|C)(E|D)BR$")>;
+def : InstRW<[WLat30, FPU2, NormalGr], (instregex "(K|C)XBR$")>;
// Test Data Class
-def : InstRW<[FPU, LSU, Lat15], (instregex "TC(E|D)B$")>;
-def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "TCXB$")>;
+def : InstRW<[WLat15, FPU, LSU, NormalGr], (instregex "TC(E|D)B$")>;
+def : InstRW<[WLat15, FPU4, LSU, GroupAlone], (instregex "TCXB$")>;
//===----------------------------------------------------------------------===//
// FP: Floating-point control register instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat4, GroupAlone], (instregex "EFPC$")>;
-def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>;
-def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>;
-def : InstRW<[FXU, LSU, Lat3, GroupAlone], (instregex "STFPC$")>;
-def : InstRW<[FXU, Lat30], (instregex "SFASR$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "LFAS$")>;
-def : InstRW<[FXU, Lat2, GroupAlone], (instregex "SRNM(B|T)?$")>;
+def : InstRW<[WLat4, FXU, LSU, GroupAlone], (instregex "EFPC$")>;
+def : InstRW<[WLat1, FXU, LSU, GroupAlone], (instregex "STFPC$")>;
+def : InstRW<[WLat1, LSU, GroupAlone], (instregex "SFPC$")>;
+def : InstRW<[WLat1, LSU2, GroupAlone], (instregex "LFPC$")>;
+def : InstRW<[WLat30, MCD], (instregex "SFASR$")>;
+def : InstRW<[WLat30, MCD], (instregex "LFAS$")>;
+def : InstRW<[WLat2, FXU, GroupAlone], (instregex "SRNM(B|T)?$")>;
// --------------------- Hexadecimal floating point ------------------------- //
@@ -835,108 +854,111 @@ def : InstRW<[FXU, Lat2, GroupAlone], (instregex "SRNM(B|T)?$")>;
//===----------------------------------------------------------------------===//
// Load and Test
-def : InstRW<[FPU], (instregex "LT(D|E)R$")>;
-def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "LTXR$")>;
+def : InstRW<[WLat9, WLat9, FPU, NormalGr], (instregex "LT(E|D)R$")>;
+def : InstRW<[WLat9, WLat9, FPU4, GroupAlone], (instregex "LTXR$")>;
//===----------------------------------------------------------------------===//
// HFP: Conversion instructions
//===----------------------------------------------------------------------===//
// Load rounded
-def : InstRW<[FPU], (instregex "(LEDR|LRER)$")>;
-def : InstRW<[FPU], (instregex "LEXR$")>;
-def : InstRW<[FPU], (instregex "(LDXR|LRDR)$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "(LEDR|LRER)$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "LEXR$")>;
+def : InstRW<[WLat9, FPU, NormalGr], (instregex "(LDXR|LRDR)$")>;
// Load lengthened
-def : InstRW<[LSU], (instregex "LDE$")>;
-def : InstRW<[FXU], (instregex "LDER$")>;
-def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "LX(D|E)$")>;
-def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "LX(D|E)R$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LDE$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LDER$")>;
+def : InstRW<[WLat11LSU, FPU4, LSU, GroupAlone], (instregex "LX(E|D)$")>;
+def : InstRW<[WLat9, FPU4, GroupAlone], (instregex "LX(E|D)R$")>;
// Convert from fixed
-def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CE(F|G)R$")>;
-def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CD(F|G)R$")>;
-def : InstRW<[FXU, FPU2, FPU2, Lat11, GroupAlone], (instregex "CX(F|G)R$")>;
+def : InstRW<[WLat8, FXU, FPU, GroupAlone], (instregex "C(E|D)(F|G)R$")>;
+def : InstRW<[WLat10, FXU, FPU4, GroupAlone], (instregex "CX(F|G)R$")>;
// Convert to fixed
-def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CF(E|D)R$")>;
-def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CG(E|D)R$")>;
-def : InstRW<[FXU, FPU, FPU, Lat20, GroupAlone], (instregex "C(F|G)XR$")>;
+def : InstRW<[WLat12, WLat12, FXU, FPU, GroupAlone],
+ (instregex "C(F|G)(E|D)R$")>;
+def : InstRW<[WLat30, WLat30, FXU, FPU2, GroupAlone], (instregex "C(F|G)XR$")>;
// Convert BFP to HFP / HFP to BFP.
-def : InstRW<[FPU], (instregex "THD(E)?R$")>;
-def : InstRW<[FPU], (instregex "TB(E)?DR$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "THD(E)?R$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "TB(E)?DR$")>;
//===----------------------------------------------------------------------===//
// HFP: Unary arithmetic
//===----------------------------------------------------------------------===//
// Load Complement / Negative / Positive
-def : InstRW<[FPU], (instregex "L(C|N|P)DR$")>;
-def : InstRW<[FPU], (instregex "L(C|N|P)ER$")>;
-def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "L(C|N|P)XR$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "L(C|N|P)(E|D)R$")>;
+def : InstRW<[WLat9, WLat9, FPU4, GroupAlone], (instregex "L(C|N|P)XR$")>;
// Halve
-def : InstRW<[FPU], (instregex "H(E|D)R$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "H(E|D)R$")>;
// Square root
-def : InstRW<[FPU, LSU, Lat30], (instregex "SQ(E|D)$")>;
-def : InstRW<[FPU, Lat30], (instregex "SQ(E|D)R$")>;
-def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "SQXR$")>;
+def : InstRW<[WLat30, FPU, LSU, NormalGr], (instregex "SQ(E|D)$")>;
+def : InstRW<[WLat30, FPU, NormalGr], (instregex "SQ(E|D)R$")>;
+def : InstRW<[WLat30, FPU4, GroupAlone], (instregex "SQXR$")>;
// Load FP integer
-def : InstRW<[FPU], (instregex "FIER$")>;
-def : InstRW<[FPU], (instregex "FIDR$")>;
-def : InstRW<[FPU2, FPU2, Lat15, GroupAlone], (instregex "FIXR$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "FI(E|D)R$")>;
+def : InstRW<[WLat15, FPU4, GroupAlone], (instregex "FIXR$")>;
//===----------------------------------------------------------------------===//
// HFP: Binary arithmetic
//===----------------------------------------------------------------------===//
// Addition
-def : InstRW<[FPU, LSU, Lat12], (instregex "A(E|D|U|W)$")>;
-def : InstRW<[FPU], (instregex "A(E|D|U|W)R$")>;
-def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "AXR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, FPU, LSU, NormalGr],
+ (instregex "A(E|D|U|W)$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "A(E|D|U|W)R$")>;
+def : InstRW<[WLat15, WLat15, FPU4, GroupAlone], (instregex "AXR$")>;
// Subtraction
-def : InstRW<[FPU, LSU, Lat12], (instregex "S(E|D|U|W)$")>;
-def : InstRW<[FPU], (instregex "S(E|D|U|W)R$")>;
-def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "SXR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, FPU, LSU, NormalGr],
+ (instregex "S(E|D|U|W)$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "S(E|D|U|W)R$")>;
+def : InstRW<[WLat15, WLat15, FPU4, GroupAlone], (instregex "SXR$")>;
// Multiply
-def : InstRW<[FPU, LSU, Lat12], (instregex "M(D|DE|E|EE)$")>;
-def : InstRW<[FPU], (instregex "M(D|DE|E|EE)R$")>;
-def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MXD$")>;
-def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDR$")>;
-def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXR$")>;
-def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MY$")>;
-def : InstRW<[FPU, FPU, LSU, Lat15, GroupAlone], (instregex "MY(H|L)$")>;
-def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MYR$")>;
-def : InstRW<[FPU, Lat10, GroupAlone], (instregex "MY(H|L)R$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, FPU, LSU, NormalGr], (instregex "M(D|EE)$")>;
+def : InstRW<[WLat8LSU, RegReadAdv, FPU, LSU, NormalGr], (instregex "M(DE|E)$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "M(D|EE)R$")>;
+def : InstRW<[WLat8, FPU, NormalGr], (instregex "M(DE|E)R$")>;
+def : InstRW<[WLat11LSU, RegReadAdv, FPU4, LSU, GroupAlone], (instregex "MXD$")>;
+def : InstRW<[WLat10, FPU4, GroupAlone], (instregex "MXDR$")>;
+def : InstRW<[WLat30, FPU4, GroupAlone], (instregex "MXR$")>;
+def : InstRW<[WLat11LSU, RegReadAdv, FPU4, LSU, GroupAlone], (instregex "MY$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, FPU2, LSU, GroupAlone],
+ (instregex "MY(H|L)$")>;
+def : InstRW<[WLat10, FPU4, GroupAlone], (instregex "MYR$")>;
+def : InstRW<[WLat7, FPU, GroupAlone], (instregex "MY(H|L)R$")>;
// Multiply and add / subtract
-def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>;
-def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)ER$")>;
-def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>;
-def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DR$")>;
-def : InstRW<[FPU2, FPU2, LSU, GroupAlone], (instregex "MAY$")>;
-def : InstRW<[FPU2, FPU2, GroupAlone], (instregex "MAYR$")>;
-def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)$")>;
-def : InstRW<[FPU, GroupAlone], (instregex "MAY(H|L)R$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, FPU2, LSU, GroupAlone],
+ (instregex "M(A|S)(E|D)$")>;
+def : InstRW<[WLat7, FPU, GroupAlone], (instregex "M(A|S)(E|D)R$")>;
+def : InstRW<[WLat11LSU, RegReadAdv, RegReadAdv, FPU4, LSU, GroupAlone],
+ (instregex "MAY$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, FPU2, LSU, GroupAlone],
+ (instregex "MAY(H|L)$")>;
+def : InstRW<[WLat10, FPU4, GroupAlone], (instregex "MAYR$")>;
+def : InstRW<[WLat7, FPU, GroupAlone], (instregex "MAY(H|L)R$")>;
// Division
-def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)$")>;
-def : InstRW<[FPU, Lat30], (instregex "D(E|D)R$")>;
-def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXR$")>;
+def : InstRW<[WLat30, RegReadAdv, FPU, LSU, NormalGr], (instregex "D(E|D)$")>;
+def : InstRW<[WLat30, FPU, NormalGr], (instregex "D(E|D)R$")>;
+def : InstRW<[WLat30, FPU4, GroupAlone], (instregex "DXR$")>;
//===----------------------------------------------------------------------===//
// HFP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[FPU, LSU, Lat12], (instregex "C(E|D)$")>;
-def : InstRW<[FPU], (instregex "C(E|D)R$")>;
-def : InstRW<[FPU, FPU, Lat15], (instregex "CXR$")>;
+def : InstRW<[WLat11LSU, RegReadAdv, FPU, LSU, NormalGr], (instregex "C(E|D)$")>;
+def : InstRW<[WLat9, FPU, NormalGr], (instregex "C(E|D)R$")>;
+def : InstRW<[WLat15, FPU2, NormalGr], (instregex "CXR$")>;
// ------------------------ Decimal floating point -------------------------- //
@@ -946,114 +968,115 @@ def : InstRW<[FPU, FPU, Lat15], (instregex "CXR$")>;
//===----------------------------------------------------------------------===//
// Load and Test
-def : InstRW<[DFU, Lat20], (instregex "LTDTR$")>;
-def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "LTXTR$")>;
+def : InstRW<[WLat4, WLat4, DFU, NormalGr], (instregex "LTDTR$")>;
+def : InstRW<[WLat6, WLat6, DFU4, GroupAlone], (instregex "LTXTR$")>;
//===----------------------------------------------------------------------===//
// DFP: Conversion instructions
//===----------------------------------------------------------------------===//
// Load rounded
-def : InstRW<[DFU, Lat30], (instregex "LEDTR$")>;
-def : InstRW<[DFU, DFU, Lat30], (instregex "LDXTR$")>;
+def : InstRW<[WLat30, DFU, NormalGr], (instregex "LEDTR$")>;
+def : InstRW<[WLat30, DFU2, NormalGr], (instregex "LDXTR$")>;
// Load lengthened
-def : InstRW<[DFU, Lat20], (instregex "LDETR$")>;
-def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "LXDTR$")>;
+def : InstRW<[WLat7, DFU, NormalGr], (instregex "LDETR$")>;
+def : InstRW<[WLat6, DFU4, GroupAlone], (instregex "LXDTR$")>;
// Convert from fixed / logical
-def : InstRW<[FXU, DFU, Lat9, GroupAlone], (instregex "CDFTR$")>;
-def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CDGTR(A)?$")>;
-def : InstRW<[FXU, DFU2, DFU2, GroupAlone], (instregex "CXFTR$")>;
-def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "CXGTR(A)?$")>;
-def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CDL(F|G)TR$")>;
-def : InstRW<[FXU, DFU2, DFU2, Lat11, GroupAlone], (instregex "CXLFTR$")>;
-def : InstRW<[FXU, DFU2, DFU2, Lat6, GroupAlone], (instregex "CXLGTR$")>;
+def : InstRW<[WLat9, FXU, DFU, GroupAlone], (instregex "CDFTR$")>;
+def : InstRW<[WLat30, FXU, DFU, GroupAlone], (instregex "CDGTR(A)?$")>;
+def : InstRW<[WLat5, FXU, DFU4, GroupAlone], (instregex "CXFTR(A)?$")>;
+def : InstRW<[WLat30, FXU, DFU4, GroupAlone], (instregex "CXGTR(A)?$")>;
+def : InstRW<[WLat9, FXU, DFU, GroupAlone], (instregex "CDL(F|G)TR$")>;
+def : InstRW<[WLat9, FXU, DFU4, GroupAlone], (instregex "CXLFTR$")>;
+def : InstRW<[WLat5, FXU, DFU4, GroupAlone], (instregex "CXLGTR$")>;
// Convert to fixed / logical
-def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CFDTR(A)?$")>;
-def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CGDTR(A)?$")>;
-def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CFXTR$")>;
-def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "CGXTR(A)?$")>;
-def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)DTR$")>;
-def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)XTR$")>;
+def : InstRW<[WLat11, WLat11, FXU, DFU, GroupAlone], (instregex "CFDTR(A)?$")>;
+def : InstRW<[WLat30, WLat30, FXU, DFU, GroupAlone], (instregex "CGDTR(A)?$")>;
+def : InstRW<[WLat7, WLat7, FXU, DFU2, GroupAlone], (instregex "CFXTR$")>;
+def : InstRW<[WLat30, WLat30, FXU, DFU2, GroupAlone], (instregex "CGXTR(A)?$")>;
+def : InstRW<[WLat11, WLat11, FXU, DFU, GroupAlone], (instregex "CL(F|G)DTR$")>;
+def : InstRW<[WLat7, WLat7, FXU, DFU2, GroupAlone], (instregex "CL(F|G)XTR$")>;
// Convert from / to signed / unsigned packed
-def : InstRW<[FXU, DFU, Lat12, GroupAlone], (instregex "CD(S|U)TR$")>;
-def : InstRW<[FXU, FXU, DFU2, DFU2, Lat20, GroupAlone], (instregex "CX(S|U)TR$")>;
-def : InstRW<[FXU, DFU, Lat12, GroupAlone], (instregex "C(S|U)DTR$")>;
-def : InstRW<[FXU, FXU, DFU2, DFU2, Lat20, GroupAlone], (instregex "C(S|U)XTR$")>;
+def : InstRW<[WLat5, FXU, DFU, GroupAlone], (instregex "CD(S|U)TR$")>;
+def : InstRW<[WLat8, FXU2, DFU4, GroupAlone], (instregex "CX(S|U)TR$")>;
+def : InstRW<[WLat7, FXU, DFU, GroupAlone], (instregex "C(S|U)DTR$")>;
+def : InstRW<[WLat12, FXU2, DFU4, GroupAlone], (instregex "C(S|U)XTR$")>;
// Perform floating-point operation
-def : InstRW<[FXU, Lat30], (instregex "PFPO$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "PFPO$")>;
//===----------------------------------------------------------------------===//
// DFP: Unary arithmetic
//===----------------------------------------------------------------------===//
// Load FP integer
-def : InstRW<[DFU, Lat20], (instregex "FIDTR$")>;
-def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "FIXTR$")>;
+def : InstRW<[WLat8, DFU, NormalGr], (instregex "FIDTR$")>;
+def : InstRW<[WLat10, DFU4, GroupAlone], (instregex "FIXTR$")>;
// Extract biased exponent
-def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "EEDTR$")>;
-def : InstRW<[FXU, DFU2, Lat15, GroupAlone], (instregex "EEXTR$")>;
+def : InstRW<[WLat7, FXU, DFU, GroupAlone], (instregex "EEDTR$")>;
+def : InstRW<[WLat8, FXU, DFU2, GroupAlone], (instregex "EEXTR$")>;
// Extract significance
-def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "ESDTR$")>;
-def : InstRW<[FXU, DFU, DFU, Lat20, GroupAlone], (instregex "ESXTR$")>;
+def : InstRW<[WLat7, FXU, DFU, GroupAlone], (instregex "ESDTR$")>;
+def : InstRW<[WLat8, FXU, DFU2, GroupAlone], (instregex "ESXTR$")>;
//===----------------------------------------------------------------------===//
// DFP: Binary arithmetic
//===----------------------------------------------------------------------===//
// Addition
-def : InstRW<[DFU, Lat30], (instregex "ADTR(A)?$")>;
-def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "AXTR(A)?$")>;
+def : InstRW<[WLat9, WLat9, DFU, NormalGr], (instregex "ADTR(A)?$")>;
+def : InstRW<[WLat30, WLat30, DFU4, GroupAlone], (instregex "AXTR(A)?$")>;
// Subtraction
-def : InstRW<[DFU, Lat30], (instregex "SDTR(A)?$")>;
-def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "SXTR(A)?$")>;
+def : InstRW<[WLat9, WLat9, DFU, NormalGr], (instregex "SDTR(A)?$")>;
+def : InstRW<[WLat30, WLat30, DFU4, GroupAlone], (instregex "SXTR(A)?$")>;
// Multiply
-def : InstRW<[DFU, Lat30], (instregex "MDTR(A)?$")>;
-def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "MXTR(A)?$")>;
+def : InstRW<[WLat30, DFU, NormalGr], (instregex "MDTR(A)?$")>;
+def : InstRW<[WLat30, DFU4, GroupAlone], (instregex "MXTR(A)?$")>;
// Division
-def : InstRW<[DFU, Lat30], (instregex "DDTR(A)?$")>;
-def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "DXTR(A)?$")>;
+def : InstRW<[WLat30, DFU, NormalGr], (instregex "DDTR(A)?$")>;
+def : InstRW<[WLat30, DFU4, GroupAlone], (instregex "DXTR(A)?$")>;
// Quantize
-def : InstRW<[DFU, Lat30], (instregex "QADTR$")>;
-def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "QAXTR$")>;
+def : InstRW<[WLat8, WLat8, DFU, NormalGr], (instregex "QADTR$")>;
+def : InstRW<[WLat10, WLat10, DFU4, GroupAlone], (instregex "QAXTR$")>;
// Reround
-def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "RRDTR$")>;
-def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "RRXTR$")>;
+def : InstRW<[WLat11, WLat11, FXU, DFU, GroupAlone], (instregex "RRDTR$")>;
+def : InstRW<[WLat30, WLat30, FXU, DFU4, GroupAlone], (instregex "RRXTR$")>;
// Shift significand left/right
-def : InstRW<[LSU, DFU, Lat11, GroupAlone], (instregex "S(L|R)DT$")>;
-def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "S(L|R)XT$")>;
+def : InstRW<[WLat7LSU, LSU, DFU, GroupAlone], (instregex "S(L|R)DT$")>;
+def : InstRW<[WLat11LSU, LSU, DFU4, GroupAlone], (instregex "S(L|R)XT$")>;
// Insert biased exponent
-def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "IEDTR$")>;
-def : InstRW<[FXU, DFU2, DFU2, Lat15, GroupAlone], (instregex "IEXTR$")>;
+def : InstRW<[WLat5, FXU, DFU, GroupAlone], (instregex "IEDTR$")>;
+def : InstRW<[WLat7, FXU, DFU4, GroupAlone], (instregex "IEXTR$")>;
//===----------------------------------------------------------------------===//
// DFP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[DFU, Lat11], (instregex "(K|C)DTR$")>;
-def : InstRW<[DFU, DFU, Lat15], (instregex "(K|C)XTR$")>;
+def : InstRW<[WLat9, DFU, NormalGr], (instregex "(K|C)DTR$")>;
+def : InstRW<[WLat10, DFU2, NormalGr], (instregex "(K|C)XTR$")>;
// Compare biased exponent
-def : InstRW<[DFU, Lat8], (instregex "CEDTR$")>;
-def : InstRW<[DFU2, Lat9], (instregex "CEXTR$")>;
+def : InstRW<[WLat4, DFU, NormalGr], (instregex "CEDTR$")>;
+def : InstRW<[WLat5, DFU2, NormalGr], (instregex "CEXTR$")>;
// Test Data Class/Group
-def : InstRW<[LSU, DFU, Lat15], (instregex "TD(C|G)(E|D)T$")>;
-def : InstRW<[LSU, DFU2, Lat15], (instregex "TD(C|G)XT$")>;
+def : InstRW<[WLat9, LSU, DFU, NormalGr], (instregex "TD(C|G)DT$")>;
+def : InstRW<[WLat10, LSU, DFU, NormalGr], (instregex "TD(C|G)ET$")>;
+def : InstRW<[WLat10, LSU, DFU2, NormalGr], (instregex "TD(C|G)XT$")>;
// -------------------------------- System ---------------------------------- //
@@ -1062,156 +1085,151 @@ def : InstRW<[LSU, DFU2, Lat15], (instregex "TD(C|G)XT$")>;
// System: Program-Status Word Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat30], (instregex "EPSW$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "LPSW(E)?$")>;
-def : InstRW<[FXU, Lat3, GroupAlone], (instregex "IPK$")>;
-def : InstRW<[LSU, EndGroup], (instregex "SPKA$")>;
-def : InstRW<[LSU, EndGroup], (instregex "SSM$")>;
-def : InstRW<[FXU, LSU, GroupAlone], (instregex "ST(N|O)SM$")>;
-def : InstRW<[FXU, Lat3], (instregex "IAC$")>;
-def : InstRW<[LSU, EndGroup], (instregex "SAC(F)?$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "EPSW$")>;
+def : InstRW<[WLat30, MCD], (instregex "LPSW(E)?$")>;
+def : InstRW<[WLat3, FXU, GroupAlone], (instregex "IPK$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SPKA$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SSM$")>;
+def : InstRW<[WLat1, FXU, LSU, GroupAlone], (instregex "ST(N|O)SM$")>;
+def : InstRW<[WLat3, FXU, NormalGr], (instregex "IAC$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SAC(F)?$")>;
//===----------------------------------------------------------------------===//
// System: Control Register Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat30], (instregex "LCTL(G)?$")>;
-def : InstRW<[FXU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
- (instregex "STCT(L|G)$")>;
-def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>;
-def : InstRW<[FXU, Lat30], (instregex "SSA(I)?R$")>;
-def : InstRW<[FXU, Lat30], (instregex "ESEA$")>;
+def : InstRW<[WLat10, WLat10, LSU2, GroupAlone], (instregex "LCTL(G)?$")>;
+def : InstRW<[WLat1, FXU5, LSU5, GroupAlone], (instregex "STCT(L|G)$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "E(P|S)A(I)?R$")>;
+def : InstRW<[WLat30, MCD], (instregex "SSA(I)?R$")>;
+def : InstRW<[WLat30, MCD], (instregex "ESEA$")>;
//===----------------------------------------------------------------------===//
// System: Prefix-Register Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat30], (instregex "SPX$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "STPX$")>;
+def : InstRW<[WLat30, MCD], (instregex "S(T)?PX$")>;
//===----------------------------------------------------------------------===//
// System: Storage-Key and Real Memory Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat30], (instregex "ISKE$")>;
-def : InstRW<[FXU, Lat30], (instregex "IVSK$")>;
-def : InstRW<[FXU, Lat30], (instregex "SSKE(Opt)?$")>;
-def : InstRW<[FXU, Lat30], (instregex "RRB(E|M)$")>;
-def : InstRW<[FXU, Lat30], (instregex "PFMF$")>;
-def : InstRW<[FXU, Lat30], (instregex "TB$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "PGIN$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "PGOUT$")>;
+def : InstRW<[WLat30, MCD], (instregex "ISKE$")>;
+def : InstRW<[WLat30, MCD], (instregex "IVSK$")>;
+def : InstRW<[WLat30, MCD], (instregex "SSKE(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "RRB(E|M)$")>;
+def : InstRW<[WLat30, MCD], (instregex "PFMF$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "TB$")>;
+def : InstRW<[WLat30, MCD], (instregex "PGIN$")>;
+def : InstRW<[WLat30, MCD], (instregex "PGOUT$")>;
//===----------------------------------------------------------------------===//
// System: Dynamic-Address-Translation Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat30], (instregex "IPTE(Opt)?(Opt)?$")>;
-def : InstRW<[FXU, Lat30], (instregex "IDTE(Opt)?$")>;
-def : InstRW<[FXU, Lat30], (instregex "PTLB$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "CSP(G)?$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "LPTEA$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "LRA(Y|G)?$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "STRAG$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "LURA(G)?$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "STUR(A|G)$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "TPROT$")>;
+def : InstRW<[WLat30, MCD], (instregex "IPTE(Opt)?(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "IDTE(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "PTLB$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "CSP(G)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "LPTEA$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "LRA(Y|G)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "STRAG$")>;
+def : InstRW<[WLat30, MCD], (instregex "LURA(G)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "STUR(A|G)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TPROT$")>;
//===----------------------------------------------------------------------===//
// System: Memory-move Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCSK$")>;
-def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVCDK$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "MVCOS$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVPG$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVC(K|P|S)$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVC(S|D)K$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVCOS$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVPG$")>;
//===----------------------------------------------------------------------===//
// System: Address-Space Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat30], (instregex "LASP$")>;
-def : InstRW<[LSU, GroupAlone], (instregex "PALB$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "PC$")>;
-def : InstRW<[FXU, Lat30], (instregex "PR$")>;
-def : InstRW<[FXU, Lat30], (instregex "PT(I)?$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "RP$")>;
-def : InstRW<[FXU, Lat30], (instregex "BS(G|A)$")>;
-def : InstRW<[FXU, Lat20], (instregex "TAR$")>;
+def : InstRW<[WLat30, MCD], (instregex "LASP$")>;
+def : InstRW<[WLat1, LSU, GroupAlone], (instregex "PALB$")>;
+def : InstRW<[WLat30, MCD], (instregex "PC$")>;
+def : InstRW<[WLat30, MCD], (instregex "PR$")>;
+def : InstRW<[WLat30, MCD], (instregex "PT(I)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "RP$")>;
+def : InstRW<[WLat30, MCD], (instregex "BS(G|A)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TAR$")>;
//===----------------------------------------------------------------------===//
// System: Linkage-Stack Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat30, EndGroup], (instregex "BAKR$")>;
-def : InstRW<[FXU, Lat30], (instregex "EREG(G)?$")>;
-def : InstRW<[FXU, Lat30], (instregex "(E|M)STA$")>;
+def : InstRW<[WLat30, MCD], (instregex "BAKR$")>;
+def : InstRW<[WLat30, MCD], (instregex "EREG(G)?$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "(E|M)STA$")>;
//===----------------------------------------------------------------------===//
// System: Time-Related Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat30], (instregex "PTFF$")>;
-def : InstRW<[FXU, LSU, Lat20], (instregex "SCK$")>;
-def : InstRW<[FXU, Lat30], (instregex "SCKPF$")>;
-def : InstRW<[FXU, LSU, Lat20], (instregex "SCKC$")>;
-def : InstRW<[FXU, LSU, Lat20], (instregex "SPT$")>;
-def : InstRW<[FXU, LSU, Lat15], (instregex "STCK$")>;
-def : InstRW<[FXU, LSU, Lat12], (instregex "STCKF$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "STCKE$")>;
-def : InstRW<[FXU, LSU, Lat9], (instregex "STCKC$")>;
-def : InstRW<[FXU, LSU, Lat8], (instregex "STPT$")>;
+def : InstRW<[WLat30, MCD], (instregex "PTFF$")>;
+def : InstRW<[WLat30, MCD], (instregex "SCK$")>;
+def : InstRW<[WLat30, MCD], (instregex "SCKPF$")>;
+def : InstRW<[WLat30, MCD], (instregex "SCKC$")>;
+def : InstRW<[WLat30, MCD], (instregex "SPT$")>;
+def : InstRW<[WLat30, MCD], (instregex "STCK(F)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "STCKE$")>;
+def : InstRW<[WLat30, MCD], (instregex "STCKC$")>;
+def : InstRW<[WLat30, MCD], (instregex "STPT$")>;
//===----------------------------------------------------------------------===//
// System: CPU-Related Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat30], (instregex "STAP$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "STIDP$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "STSI$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "STFL(E)?$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "ECAG$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "ECTG$")>;
-def : InstRW<[FXU, Lat30], (instregex "PTF$")>;
-def : InstRW<[FXU, Lat30], (instregex "PCKMO$")>;
+def : InstRW<[WLat30, MCD], (instregex "STAP$")>;
+def : InstRW<[WLat30, MCD], (instregex "STIDP$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "STSI$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "STFL(E)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "ECAG$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "ECTG$")>;
+def : InstRW<[WLat30, MCD], (instregex "PTF$")>;
+def : InstRW<[WLat30, MCD], (instregex "PCKMO$")>;
//===----------------------------------------------------------------------===//
// System: Miscellaneous Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat30], (instregex "SVC$")>;
-def : InstRW<[FXU, GroupAlone], (instregex "MC$")>;
-def : InstRW<[FXU, Lat30], (instregex "DIAG$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "TRAC(E|G)$")>;
-def : InstRW<[FXU, Lat30], (instregex "TRAP(2|4)$")>;
-def : InstRW<[FXU, Lat30], (instregex "SIGP$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "SIGA$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "SIE$")>;
+def : InstRW<[WLat30, MCD], (instregex "SVC$")>;
+def : InstRW<[WLat1, FXU, GroupAlone], (instregex "MC$")>;
+def : InstRW<[WLat30, MCD], (instregex "DIAG$")>;
+def : InstRW<[WLat30, MCD], (instregex "TRAC(E|G)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TRAP(2|4)$")>;
+def : InstRW<[WLat30, MCD], (instregex "SIG(P|A)$")>;
+def : InstRW<[WLat30, MCD], (instregex "SIE$")>;
//===----------------------------------------------------------------------===//
// System: CPU-Measurement Facility Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU], (instregex "LPP$")>;
-def : InstRW<[FXU, Lat30], (instregex "ECPGA$")>;
-def : InstRW<[FXU, Lat30], (instregex "E(C|P)CTR$")>;
-def : InstRW<[FXU, Lat30], (instregex "LCCTL$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "L(P|S)CTL$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "Q(S|CTR)I$")>;
-def : InstRW<[FXU, Lat30], (instregex "S(C|P)CTR$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LPP$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "ECPGA$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "E(C|P)CTR$")>;
+def : InstRW<[WLat30, MCD], (instregex "LCCTL$")>;
+def : InstRW<[WLat30, MCD], (instregex "L(P|S)CTL$")>;
+def : InstRW<[WLat30, MCD], (instregex "Q(S|CTR)I$")>;
+def : InstRW<[WLat30, MCD], (instregex "S(C|P)CTR$")>;
//===----------------------------------------------------------------------===//
// System: I/O Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat30], (instregex "(C|H|R|X)SCH$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "(M|S|ST|T)SCH$")>;
-def : InstRW<[FXU, Lat30], (instregex "RCHP$")>;
-def : InstRW<[FXU, Lat30], (instregex "SCHM$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "STC(PS|RW)$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "TPI$")>;
-def : InstRW<[FXU, Lat30], (instregex "SAL$")>;
+def : InstRW<[WLat30, MCD], (instregex "(C|H|R|X)SCH$")>;
+def : InstRW<[WLat30, MCD], (instregex "(M|S|ST|T)SCH$")>;
+def : InstRW<[WLat30, MCD], (instregex "RCHP$")>;
+def : InstRW<[WLat30, MCD], (instregex "SCHM$")>;
+def : InstRW<[WLat30, MCD], (instregex "STC(PS|RW)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TPI$")>;
+def : InstRW<[WLat30, MCD], (instregex "SAL$")>;
}
diff --git a/lib/Target/SystemZ/SystemZScheduleZEC12.td b/lib/Target/SystemZ/SystemZScheduleZEC12.td
index a0f2115eb9d7..892f493570d1 100644
--- a/lib/Target/SystemZ/SystemZScheduleZEC12.td
+++ b/lib/Target/SystemZ/SystemZScheduleZEC12.td
@@ -10,13 +10,15 @@
// This file defines the machine model for ZEC12 to support instruction
// scheduling and other instruction cost heuristics.
//
+// Pseudos expanded right after isel do not need to be modelled here.
+//
//===----------------------------------------------------------------------===//
def ZEC12Model : SchedMachineModel {
let UnsupportedFeatures = Arch10UnsupportedFeatures.List;
- let IssueWidth = 5;
+ let IssueWidth = 3;
let MicroOpBufferSize = 40; // Issue queues
let LoadLatency = 1; // Optimistic load latency.
@@ -26,34 +28,41 @@ def ZEC12Model : SchedMachineModel {
let MispredictPenalty = 16;
}
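// [Editor's sketch, not part of this patch] A SchedMachineModel like the one
// above is attached to a CPU name elsewhere in the target via the generic
// ProcessorModel class from Target.td; the CPU string and feature-list name
// below are assumptions for illustration. Note that the IssueWidth of 3
// appears to match the three-uop decoder groups modeled further down.
def : ProcessorModel<"zEC12", ZEC12Model, Arch10SupportedFeatures.List>;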
-let SchedModel = ZEC12Model in {
+let SchedModel = ZEC12Model in {
+// These definitions need the SchedModel value. They could be put in a
+// subtarget common include file, but it seems the include system in Tablegen
+// currently (2016) rejects multiple includes of the same file.
-// These definitions could be put in a subtarget common include file,
-// but it seems the include system in Tablegen currently rejects
-// multiple includes of same file.
+// Decoder grouping rules
+let NumMicroOps = 1 in {
+ def : WriteRes<NormalGr, []>;
+ def : WriteRes<BeginGroup, []> { let BeginGroup = 1; }
+ def : WriteRes<EndGroup, []> { let EndGroup = 1; }
+}
+def : WriteRes<Cracked, []> {
+ let NumMicroOps = 2;
+ let BeginGroup = 1;
+}
def : WriteRes<GroupAlone, []> {
- let NumMicroOps = 0;
+ let NumMicroOps = 3;
let BeginGroup = 1;
let EndGroup = 1;
}
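// [Editor's sketch, not part of this patch] Most InstRW mappings below end
// with one of these grouping writes. For example, a plain register add issues
// as a normal single-uop group member, while MVC must have a decoder group to
// itself (both mappings are copied verbatim from later in this file):
def : InstRW<[WLat1, FXU, NormalGr], (instregex "AR(K)?$")>;
def : InstRW<[WLat1, FXU, LSU3, GroupAlone], (instregex "MVC$")>;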
-def : WriteRes<EndGroup, []> {
- let NumMicroOps = 0;
- let EndGroup = 1;
+
+// Incoming latency removed from the register operand which is used together
+// with a memory operand by the instruction.
+def : ReadAdvance<RegReadAdv, 4>;
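// [Editor's sketch, not part of this patch] RegReadAdv is listed in the
// mappings below for instructions that combine a register operand with a
// folded memory operand, so a producer of that register with latency up to 4
// cycles does not delay the instruction. For example (copied verbatim from
// the addition section further down):
def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
             (instregex "A(L)?(Y)?$")>;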
+
+// LoadLatency (above) is not used for instructions in this file. This is
+// instead the role of LSULatency, which is the latency value added to the
+// result of loads and instructions with folded memory operands.
+def : WriteRes<LSULatency, []> { let Latency = 4; let NumMicroOps = 0; }
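// [Editor's sketch, not part of this patch] Plain loads below therefore carry
// LSULatency as their only result latency. For example (copied verbatim from
// the load section further down):
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LG(RL)?$")>;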
+
+let NumMicroOps = 0 in {
+ foreach L = 1-30 in {
+ def : WriteRes<!cast<SchedWrite>("WLat"#L), []> { let Latency = L; }
+ }
}
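// [Editor's note, not part of this patch] The foreach above expands into one
// zero-uop, fixed-latency write per WLatN SchedWrite (the WLatN records are
// presumably declared in the target-common SystemZSchedule.td); e.g. the
// L = 3 iteration is equivalent to:
def : WriteRes<WLat3, []> { let Latency = 3; let NumMicroOps = 0; }
// The WLat<N>LSU variants used in the mappings below presumably add the
// 4-cycle LSULatency on top of N; their definitions fall outside this hunk.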
-def : WriteRes<Lat2, []> { let Latency = 2; let NumMicroOps = 0;}
-def : WriteRes<Lat3, []> { let Latency = 3; let NumMicroOps = 0;}
-def : WriteRes<Lat4, []> { let Latency = 4; let NumMicroOps = 0;}
-def : WriteRes<Lat5, []> { let Latency = 5; let NumMicroOps = 0;}
-def : WriteRes<Lat6, []> { let Latency = 6; let NumMicroOps = 0;}
-def : WriteRes<Lat7, []> { let Latency = 7; let NumMicroOps = 0;}
-def : WriteRes<Lat8, []> { let Latency = 8; let NumMicroOps = 0;}
-def : WriteRes<Lat9, []> { let Latency = 9; let NumMicroOps = 0;}
-def : WriteRes<Lat10, []> { let Latency = 10; let NumMicroOps = 0;}
-def : WriteRes<Lat11, []> { let Latency = 11; let NumMicroOps = 0;}
-def : WriteRes<Lat12, []> { let Latency = 12; let NumMicroOps = 0;}
-def : WriteRes<Lat15, []> { let Latency = 15; let NumMicroOps = 0;}
-def : WriteRes<Lat20, []> { let Latency = 20; let NumMicroOps = 0;}
-def : WriteRes<Lat30, []> { let Latency = 30; let NumMicroOps = 0;}
// Execution units.
def ZEC12_FXUnit : ProcResource<2>;
@@ -61,16 +70,27 @@ def ZEC12_LSUnit : ProcResource<2>;
def ZEC12_FPUnit : ProcResource<1>;
def ZEC12_DFUnit : ProcResource<1>;
def ZEC12_VBUnit : ProcResource<1>;
+def ZEC12_MCD : ProcResource<1>;
// Subtarget specific definitions of scheduling resources.
-def : WriteRes<FXU, [ZEC12_FXUnit]> { let Latency = 1; }
-def : WriteRes<LSU, [ZEC12_LSUnit]> { let Latency = 4; }
-def : WriteRes<LSU_lat1, [ZEC12_LSUnit]> { let Latency = 1; }
-def : WriteRes<FPU, [ZEC12_FPUnit]> { let Latency = 8; }
-def : WriteRes<FPU2, [ZEC12_FPUnit, ZEC12_FPUnit]> { let Latency = 9; }
-def : WriteRes<DFU, [ZEC12_DFUnit]> { let Latency = 2; }
-def : WriteRes<DFU2, [ZEC12_DFUnit, ZEC12_DFUnit]> { let Latency = 3; }
-def : WriteRes<VBU, [ZEC12_VBUnit]>; // Virtual Branching Unit
+let NumMicroOps = 0 in {
+ def : WriteRes<FXU, [ZEC12_FXUnit]>;
+ def : WriteRes<LSU, [ZEC12_LSUnit]>;
+ def : WriteRes<FPU, [ZEC12_FPUnit]>;
+ def : WriteRes<DFU, [ZEC12_DFUnit]>;
+ foreach Num = 2-6 in { let ResourceCycles = [Num] in {
+ def : WriteRes<!cast<SchedWrite>("FXU"#Num), [ZEC12_FXUnit]>;
+ def : WriteRes<!cast<SchedWrite>("LSU"#Num), [ZEC12_LSUnit]>;
+ def : WriteRes<!cast<SchedWrite>("FPU"#Num), [ZEC12_FPUnit]>;
+ def : WriteRes<!cast<SchedWrite>("DFU"#Num), [ZEC12_DFUnit]>;
+ }}
+
+ def : WriteRes<VBU, [ZEC12_VBUnit]>; // Virtual Branching Unit
+}
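// [Editor's note, not part of this patch] The FXU2-FXU6, LSU2-LSU6, FPU2-FPU6
// and DFU2-DFU6 variants created by the foreach above occupy the respective
// pipeline for that many cycles via ResourceCycles; e.g. the Num = 3
// iteration for the FX unit is equivalent to:
def : WriteRes<FXU3, [ZEC12_FXUnit]> {
  let ResourceCycles = [3];
  let NumMicroOps = 0;
}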
+
+def : WriteRes<MCD, [ZEC12_MCD]> { let NumMicroOps = 3;
+ let BeginGroup = 1;
+ let EndGroup = 1; }
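// [Editor's note, not part of this patch] MCD takes a full three-uop decoder
// group to itself and serves below as the catch-all write for complex
// instructions that are not modeled in detail, typically paired with one
// pessimistic WLat30 per register result. For example (copied verbatim from
// later in this file):
def : InstRW<[WLat30, MCD], (instregex "PLO$")>;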
// -------------------------- INSTRUCTIONS ---------------------------------- //
@@ -84,26 +104,27 @@ def : WriteRes<VBU, [ZEC12_VBUnit]>; // Virtual Branching Unit
// Stack allocation
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU], (instregex "ADJDYNALLOC$")>; // Pseudo -> LA / LAY
+// Pseudo -> LA / LAY
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "ADJDYNALLOC$")>;
//===----------------------------------------------------------------------===//
// Branch instructions
//===----------------------------------------------------------------------===//
// Branch
-def : InstRW<[VBU], (instregex "(Call)?BRC(L)?(Asm.*)?$")>;
-def : InstRW<[VBU], (instregex "(Call)?J(G)?(Asm.*)?$")>;
-def : InstRW<[LSU, Lat4], (instregex "(Call)?BC(R)?(Asm.*)?$")>;
-def : InstRW<[LSU, Lat4], (instregex "(Call)?B(R)?(Asm.*)?$")>;
-def : InstRW<[FXU, EndGroup], (instregex "BRCT(G)?$")>;
-def : InstRW<[FXU, LSU, Lat5, GroupAlone], (instregex "BRCTH$")>;
-def : InstRW<[FXU, LSU, Lat5, GroupAlone], (instregex "BCT(G)?(R)?$")>;
-def : InstRW<[FXU, FXU, FXU, LSU, Lat7, GroupAlone],
+def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Call)?BRC(L)?(Asm.*)?$")>;
+def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Call)?J(G)?(Asm.*)?$")>;
+def : InstRW<[WLat1, LSU, NormalGr], (instregex "(Call)?BC(R)?(Asm.*)?$")>;
+def : InstRW<[WLat1, LSU, NormalGr], (instregex "(Call)?B(R)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXU, EndGroup], (instregex "BRCT(G)?$")>;
+def : InstRW<[WLat1, FXU, LSU, GroupAlone], (instregex "BRCTH$")>;
+def : InstRW<[WLat1, FXU, LSU, GroupAlone], (instregex "BCT(G)?(R)?$")>;
+def : InstRW<[WLat1, FXU3, LSU, GroupAlone],
(instregex "B(R)?X(H|L).*$")>;
// Compare and branch
-def : InstRW<[FXU], (instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>;
-def : InstRW<[FXU, LSU, Lat5, GroupAlone],
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>;
+def : InstRW<[WLat1, FXU, LSU, GroupAlone],
(instregex "C(L)?(G)?(I|R)B(Call|Return|Asm.*)?$")>;
//===----------------------------------------------------------------------===//
@@ -111,582 +132,592 @@ def : InstRW<[FXU, LSU, Lat5, GroupAlone],
//===----------------------------------------------------------------------===//
// Trap
-def : InstRW<[VBU], (instregex "(Cond)?Trap$")>;
+def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Cond)?Trap$")>;
// Compare and trap
-def : InstRW<[FXU], (instregex "C(G)?(I|R)T(Asm.*)?$")>;
-def : InstRW<[FXU], (instregex "CL(G)?RT(Asm.*)?$")>;
-def : InstRW<[FXU], (instregex "CL(F|G)IT(Asm.*)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CL(G)?T(Asm.*)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "C(G)?(I|R)T(Asm.*)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CL(G)?RT(Asm.*)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CL(F|G)IT(Asm.*)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "CL(G)?T(Asm.*)?$")>;
//===----------------------------------------------------------------------===//
// Call and return instructions
//===----------------------------------------------------------------------===//
// Call
-def : InstRW<[VBU, FXU, FXU, Lat3, GroupAlone], (instregex "(Call)?BRAS$")>;
-def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "(Call)?BRASL$")>;
-def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "(Call)?BAS(R)?$")>;
-def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
+def : InstRW<[WLat1, FXU2, VBU, GroupAlone], (instregex "(Call)?BRAS$")>;
+def : InstRW<[WLat1, FXU2, LSU, GroupAlone], (instregex "(Call)?BRASL$")>;
+def : InstRW<[WLat1, FXU2, LSU, GroupAlone], (instregex "(Call)?BAS(R)?$")>;
+def : InstRW<[WLat1, FXU2, LSU, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
// Return
-def : InstRW<[LSU_lat1, EndGroup], (instregex "Return$")>;
-def : InstRW<[LSU_lat1], (instregex "CondReturn$")>;
-
-//===----------------------------------------------------------------------===//
-// Select instructions
-//===----------------------------------------------------------------------===//
-
-// Select pseudo
-def : InstRW<[FXU], (instregex "Select(32|64|32Mux)$")>;
-
-// CondStore pseudos
-def : InstRW<[FXU], (instregex "CondStore16(Inv)?$")>;
-def : InstRW<[FXU], (instregex "CondStore16Mux(Inv)?$")>;
-def : InstRW<[FXU], (instregex "CondStore32(Inv)?$")>;
-def : InstRW<[FXU], (instregex "CondStore64(Inv)?$")>;
-def : InstRW<[FXU], (instregex "CondStore8(Inv)?$")>;
-def : InstRW<[FXU], (instregex "CondStore8Mux(Inv)?$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "Return$")>;
+def : InstRW<[WLat1, LSU, NormalGr], (instregex "CondReturn$")>;
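// [Editor's note, not part of this patch] The Select*/CondStore* pseudo
// mappings removed above have no replacement: per the note added at the top
// of this file, pseudos that are expanded right after isel no longer need
// entries in the scheduling model.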
//===----------------------------------------------------------------------===//
// Move instructions
//===----------------------------------------------------------------------===//
// Moves
-def : InstRW<[FXU, LSU, Lat5], (instregex "MV(G|H)?HI$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "MVI(Y)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "MV(G|H)?HI$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "MVI(Y)?$")>;
// Move character
-def : InstRW<[LSU, LSU, LSU, FXU, Lat8, GroupAlone], (instregex "MVC$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>;
+def : InstRW<[WLat1, FXU, LSU3, GroupAlone], (instregex "MVC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "MVCL(E|U)?$")>;
// Pseudo -> reg move
-def : InstRW<[FXU], (instregex "COPY(_TO_REGCLASS)?$")>;
-def : InstRW<[FXU], (instregex "EXTRACT_SUBREG$")>;
-def : InstRW<[FXU], (instregex "INSERT_SUBREG$")>;
-def : InstRW<[FXU], (instregex "REG_SEQUENCE$")>;
-def : InstRW<[FXU], (instregex "SUBREG_TO_REG$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "COPY(_TO_REGCLASS)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "EXTRACT_SUBREG$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "INSERT_SUBREG$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "REG_SEQUENCE$")>;
// Loads
-def : InstRW<[LSU], (instregex "L(Y|FH|RL|Mux)?$")>;
-def : InstRW<[LSU], (instregex "LG(RL)?$")>;
-def : InstRW<[LSU], (instregex "L128$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L(Y|FH|RL|Mux)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LG(RL)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L128$")>;
-def : InstRW<[FXU], (instregex "LLIH(F|H|L)$")>;
-def : InstRW<[FXU], (instregex "LLIL(F|H|L)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LLIH(F|H|L)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LLIL(F|H|L)$")>;
-def : InstRW<[FXU], (instregex "LG(F|H)I$")>;
-def : InstRW<[FXU], (instregex "LHI(Mux)?$")>;
-def : InstRW<[FXU], (instregex "LR(Mux)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LG(F|H)I$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LHI(Mux)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LR(Mux)?$")>;
// Load and trap
-def : InstRW<[FXU, LSU, Lat5], (instregex "L(FH|G)?AT$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "L(FH|G)?AT$")>;
// Load and test
-def : InstRW<[FXU, LSU, Lat5], (instregex "LT(G)?$")>;
-def : InstRW<[FXU], (instregex "LT(G)?R$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, LSU, FXU, NormalGr], (instregex "LT(G)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LT(G)?R$")>;
// Stores
-def : InstRW<[FXU, LSU, Lat5], (instregex "STG(RL)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "ST128$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "ST(Y|FH|RL|Mux)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "STG(RL)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "ST128$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "ST(Y|FH|RL|Mux)?$")>;
// String moves.
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVST$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "MVST$")>;
//===----------------------------------------------------------------------===//
// Conditional move instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat2], (instregex "LOC(G)?R(Asm.*)?$")>;
-def : InstRW<[FXU, LSU, Lat6], (instregex "LOC(G)?(Asm.*)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "STOC(G)?(Asm.*)?$")>;
+def : InstRW<[WLat2, FXU, NormalGr], (instregex "LOC(G)?R(Asm.*)?$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "LOC(G)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "STOC(G)?(Asm.*)?$")>;
//===----------------------------------------------------------------------===//
// Sign extensions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU], (instregex "L(B|H|G)R$")>;
-def : InstRW<[FXU], (instregex "LG(B|H|F)R$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "L(B|H|G)R$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LG(B|H|F)R$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LTGF$")>;
-def : InstRW<[FXU], (instregex "LTGFR$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, FXU, LSU, NormalGr], (instregex "LTGF$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LTGFR$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LB(H|Mux)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LH(Y)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LH(H|Mux|RL)$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LG(B|H|F)$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LG(H|F)RL$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "LB(H|Mux)?$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "LH(Y)?$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "LH(H|Mux|RL)$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "LG(B|H|F)$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "LG(H|F)RL$")>;
//===----------------------------------------------------------------------===//
// Zero extensions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU], (instregex "LLCR(Mux)?$")>;
-def : InstRW<[FXU], (instregex "LLHR(Mux)?$")>;
-def : InstRW<[FXU], (instregex "LLG(C|H|F|T)R$")>;
-def : InstRW<[LSU], (instregex "LLC(Mux)?$")>;
-def : InstRW<[LSU], (instregex "LLH(Mux)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LL(C|H)H$")>;
-def : InstRW<[LSU], (instregex "LLHRL$")>;
-def : InstRW<[LSU], (instregex "LLG(C|H|F|T|HRL|FRL)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LLCR(Mux)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LLHR(Mux)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LLG(C|H|F|T)R$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLC(Mux)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLH(Mux)?$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "LL(C|H)H$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLHRL$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLG(C|H|F|T|HRL|FRL)$")>;
// Load and trap
-def : InstRW<[FXU, LSU, Lat5], (instregex "LLG(F|T)?AT$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "LLG(F|T)?AT$")>;
//===----------------------------------------------------------------------===//
// Truncations
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "STCM(H|Y)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "STC(H|Y|Mux)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "STH(H|Y|RL|Mux)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "STCM(H|Y)?$")>;
//===----------------------------------------------------------------------===//
// Multi-register moves
//===----------------------------------------------------------------------===//
// Load multiple (estimated average of 5 ops)
-def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
- (instregex "LM(H|Y|G)?$")>;
+def : InstRW<[WLat10, WLat10, LSU5, GroupAlone], (instregex "LM(H|Y|G)?$")>;
// Load multiple disjoint
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "LMD$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "LMD$")>;
// Store multiple (estimated average of 3 ops)
-def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone],
- (instregex "STM(H|Y|G)?$")>;
+def : InstRW<[WLat1, LSU2, FXU5, GroupAlone], (instregex "STM(H|Y|G)?$")>;
//===----------------------------------------------------------------------===//
// Byte swaps
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU], (instregex "LRV(G)?R$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LRV(G|H)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "STRV(G|H)?$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LRV(G)?R$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "LRV(G|H)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "STRV(G|H)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVCIN$")>;
//===----------------------------------------------------------------------===//
// Load address instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU], (instregex "LA(Y|RL)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LA(Y|RL)?$")>;
// Load the Global Offset Table address
-def : InstRW<[FXU], (instregex "GOT$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "GOT$")>;
//===----------------------------------------------------------------------===//
// Absolute and Negation
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat2], (instregex "LP(G)?R$")>;
-def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "L(N|P)GFR$")>;
-def : InstRW<[FXU, Lat2], (instregex "LN(R|GR)$")>;
-def : InstRW<[FXU], (instregex "LC(R|GR)$")>;
-def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "LCGFR$")>;
+def : InstRW<[WLat2, WLat2, FXU, NormalGr], (instregex "LP(G)?R$")>;
+def : InstRW<[WLat3, WLat3, FXU2, GroupAlone], (instregex "L(N|P)GFR$")>;
+def : InstRW<[WLat2, WLat2, FXU, NormalGr], (instregex "LN(R|GR)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LC(R|GR)$")>;
+def : InstRW<[WLat2, WLat2, FXU2, GroupAlone], (instregex "LCGFR$")>;
//===----------------------------------------------------------------------===//
// Insertion
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "IC(Y)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "IC32(Y)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "ICM(H|Y)?$")>;
-def : InstRW<[FXU], (instregex "II(F|H|L)Mux$")>;
-def : InstRW<[FXU], (instregex "IIHF(64)?$")>;
-def : InstRW<[FXU], (instregex "IIHH(64)?$")>;
-def : InstRW<[FXU], (instregex "IIHL(64)?$")>;
-def : InstRW<[FXU], (instregex "IILF(64)?$")>;
-def : InstRW<[FXU], (instregex "IILH(64)?$")>;
-def : InstRW<[FXU], (instregex "IILL(64)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "IC(Y)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "IC32(Y)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "ICM(H|Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "II(F|H|L)Mux$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "IIHF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "IIHH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "IIHL(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "IILF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "IILH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "IILL(64)?$")>;
//===----------------------------------------------------------------------===//
// Addition
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?(Y|SI)?$")>;
-def : InstRW<[FXU, LSU, Lat6], (instregex "AH(Y)?$")>;
-def : InstRW<[FXU], (instregex "AIH$")>;
-def : InstRW<[FXU], (instregex "AFI(Mux)?$")>;
-def : InstRW<[FXU], (instregex "AGFI$")>;
-def : InstRW<[FXU], (instregex "AGHI(K)?$")>;
-def : InstRW<[FXU], (instregex "AGR(K)?$")>;
-def : InstRW<[FXU], (instregex "AHI(K)?$")>;
-def : InstRW<[FXU], (instregex "AHIMux(K)?$")>;
-def : InstRW<[FXU], (instregex "AL(FI|HSIK)$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "ALGF$")>;
-def : InstRW<[FXU], (instregex "ALGHSIK$")>;
-def : InstRW<[FXU], (instregex "ALGF(I|R)$")>;
-def : InstRW<[FXU], (instregex "ALGR(K)?$")>;
-def : InstRW<[FXU], (instregex "ALR(K)?$")>;
-def : InstRW<[FXU], (instregex "AR(K)?$")>;
-def : InstRW<[FXU], (instregex "A(L)?HHHR$")>;
-def : InstRW<[FXU, Lat2], (instregex "A(L)?HHLR$")>;
-def : InstRW<[FXU], (instregex "ALSIH(N)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?G(SI)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "A(L)?(Y)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, FXU, LSU, NormalGr], (instregex "A(L)?SI$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "AH(Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AIH$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AFI(Mux)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AGFI$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AGHI(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AGR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AHI(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AHIMux(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AL(FI|HSIK)$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "ALGF$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "ALGHSIK$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "ALGF(I|R)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "ALGR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "ALR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "A(L)?HHHR$")>;
+def : InstRW<[WLat2, WLat2, FXU, NormalGr], (instregex "A(L)?HHLR$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "ALSIH(N)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "A(L)?G$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "A(L)?GSI$")>;
// Logical addition with carry
-def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "ALC(G)?$")>;
-def : InstRW<[FXU, Lat3, GroupAlone], (instregex "ALC(G)?R$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXU, LSU, GroupAlone],
+ (instregex "ALC(G)?$")>;
+def : InstRW<[WLat2, WLat2, FXU, GroupAlone], (instregex "ALC(G)?R$")>;
// Add with sign extension (32 -> 64)
-def : InstRW<[FXU, LSU, Lat6], (instregex "AGF$")>;
-def : InstRW<[FXU, Lat2], (instregex "AGFR$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "AGF$")>;
+def : InstRW<[WLat2, WLat2, FXU, NormalGr], (instregex "AGFR$")>;
//===----------------------------------------------------------------------===//
// Subtraction
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "S(G|Y)?$")>;
-def : InstRW<[FXU, LSU, Lat6], (instregex "SH(Y)?$")>;
-def : InstRW<[FXU], (instregex "SGR(K)?$")>;
-def : InstRW<[FXU], (instregex "SLFI$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "SL(G|GF|Y)?$")>;
-def : InstRW<[FXU], (instregex "SLGF(I|R)$")>;
-def : InstRW<[FXU], (instregex "SLGR(K)?$")>;
-def : InstRW<[FXU], (instregex "SLR(K)?$")>;
-def : InstRW<[FXU], (instregex "SR(K)?$")>;
-def : InstRW<[FXU], (instregex "S(L)?HHHR$")>;
-def : InstRW<[FXU, Lat2], (instregex "S(L)?HHLR$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "S(G|Y)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "SH(Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SGR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SLFI$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "SL(G|GF|Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SLGF(I|R)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SLGR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SLR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "S(L)?HHHR$")>;
+def : InstRW<[WLat2, WLat2, FXU, NormalGr], (instregex "S(L)?HHLR$")>;
// Subtraction with borrow
-def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "SLB(G)?$")>;
-def : InstRW<[FXU, Lat3, GroupAlone], (instregex "SLB(G)?R$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXU, LSU, GroupAlone],
+ (instregex "SLB(G)?$")>;
+def : InstRW<[WLat2, WLat2, FXU, GroupAlone], (instregex "SLB(G)?R$")>;
// Subtraction with sign extension (32 -> 64)
-def : InstRW<[FXU, LSU, Lat6], (instregex "SGF$")>;
-def : InstRW<[FXU, Lat2], (instregex "SGFR$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "SGF$")>;
+def : InstRW<[WLat2, WLat2, FXU, NormalGr], (instregex "SGFR$")>;
//===----------------------------------------------------------------------===//
// AND
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "N(G|Y)?$")>;
-def : InstRW<[FXU], (instregex "NGR(K)?$")>;
-def : InstRW<[FXU], (instregex "NI(FMux|HMux|LMux)$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "NI(Y)?$")>;
-def : InstRW<[FXU], (instregex "NIHF(64)?$")>;
-def : InstRW<[FXU], (instregex "NIHH(64)?$")>;
-def : InstRW<[FXU], (instregex "NIHL(64)?$")>;
-def : InstRW<[FXU], (instregex "NILF(64)?$")>;
-def : InstRW<[FXU], (instregex "NILH(64)?$")>;
-def : InstRW<[FXU], (instregex "NILL(64)?$")>;
-def : InstRW<[FXU], (instregex "NR(K)?$")>;
-def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "NC$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "N(G|Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NGR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NI(FMux|HMux|LMux)$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "NI(Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NIHF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NIHH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NIHL(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NILF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NILH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NILL(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NR(K)?$")>;
+def : InstRW<[WLat5LSU, LSU2, FXU, GroupAlone], (instregex "NC$")>;
//===----------------------------------------------------------------------===//
// OR
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "O(G|Y)?$")>;
-def : InstRW<[FXU], (instregex "OGR(K)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "OI(Y)?$")>;
-def : InstRW<[FXU], (instregex "OI(FMux|HMux|LMux)$")>;
-def : InstRW<[FXU], (instregex "OIHF(64)?$")>;
-def : InstRW<[FXU], (instregex "OIHH(64)?$")>;
-def : InstRW<[FXU], (instregex "OIHL(64)?$")>;
-def : InstRW<[FXU], (instregex "OILF(64)?$")>;
-def : InstRW<[FXU], (instregex "OILH(64)?$")>;
-def : InstRW<[FXU], (instregex "OILL(64)?$")>;
-def : InstRW<[FXU], (instregex "OR(K)?$")>;
-def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "OC$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "O(G|Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OGR(K)?$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "OI(Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OI(FMux|HMux|LMux)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OIHF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OIHH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OIHL(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OILF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OILH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OILL(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OR(K)?$")>;
+def : InstRW<[WLat5LSU, LSU2, FXU, GroupAlone], (instregex "OC$")>;
//===----------------------------------------------------------------------===//
// XOR
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "X(G|Y)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "XI(Y)?$")>;
-def : InstRW<[FXU], (instregex "XIFMux$")>;
-def : InstRW<[FXU], (instregex "XGR(K)?$")>;
-def : InstRW<[FXU], (instregex "XIHF(64)?$")>;
-def : InstRW<[FXU], (instregex "XILF(64)?$")>;
-def : InstRW<[FXU], (instregex "XR(K)?$")>;
-def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "XC$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "X(G|Y)?$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "XI(Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "XIFMux$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "XGR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "XIHF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "XILF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "XR(K)?$")>;
+def : InstRW<[WLat5LSU, LSU2, FXU, GroupAlone], (instregex "XC$")>;
//===----------------------------------------------------------------------===//
// Multiplication
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat10], (instregex "MS(GF|Y)?$")>;
-def : InstRW<[FXU, Lat6], (instregex "MS(R|FI)$")>;
-def : InstRW<[FXU, LSU, Lat12], (instregex "MSG$")>;
-def : InstRW<[FXU, Lat8], (instregex "MSGR$")>;
-def : InstRW<[FXU, Lat6], (instregex "MSGF(I|R)$")>;
-def : InstRW<[FXU, FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>;
-def : InstRW<[FXU, FXU, Lat9, GroupAlone], (instregex "MLGR$")>;
-def : InstRW<[FXU, Lat5], (instregex "MGHI$")>;
-def : InstRW<[FXU, Lat5], (instregex "MHI$")>;
-def : InstRW<[FXU, LSU, Lat9], (instregex "MH(Y)?$")>;
-def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>;
-def : InstRW<[FXU, FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>;
+def : InstRW<[WLat6LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "MS(GF|Y)?$")>;
+def : InstRW<[WLat6, FXU, NormalGr], (instregex "MS(R|FI)$")>;
+def : InstRW<[WLat8LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "MSG$")>;
+def : InstRW<[WLat8, FXU, NormalGr], (instregex "MSGR$")>;
+def : InstRW<[WLat6, FXU, NormalGr], (instregex "MSGF(I|R)$")>;
+def : InstRW<[WLat11LSU, RegReadAdv, FXU2, LSU, GroupAlone],
+ (instregex "MLG$")>;
+def : InstRW<[WLat9, FXU2, GroupAlone], (instregex "MLGR$")>;
+def : InstRW<[WLat5, FXU, NormalGr], (instregex "MGHI$")>;
+def : InstRW<[WLat5, FXU, NormalGr], (instregex "MHI$")>;
+def : InstRW<[WLat5LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "MH(Y)?$")>;
+def : InstRW<[WLat7, FXU2, GroupAlone], (instregex "M(L)?R$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, FXU2, LSU, GroupAlone],
+ (instregex "M(FY|L)?$")>;
//===----------------------------------------------------------------------===//
// Division and remainder
//===----------------------------------------------------------------------===//
-def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
- (instregex "DR$")>;
-def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
- (instregex "D$")>;
-def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
- (instregex "DSG(F)?R$")>;
-def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, Lat30, GroupAlone],
- (instregex "DSG(F)?$")>;
-def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
- (instregex "DL(G)?R$")>;
-def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
- (instregex "DL(G)?$")>;
+def : InstRW<[WLat30, FPU4, FXU5, GroupAlone], (instregex "DR$")>;
+def : InstRW<[WLat30, RegReadAdv, FPU4, LSU, FXU4, GroupAlone],
+ (instregex "D$")>;
+def : InstRW<[WLat30, FPU4, FXU4, GroupAlone], (instregex "DSG(F)?R$")>;
+def : InstRW<[WLat30, RegReadAdv, FPU4, LSU, FXU3, GroupAlone],
+ (instregex "DSG(F)?$")>;
+def : InstRW<[WLat30, FPU4, FXU5, GroupAlone], (instregex "DL(G)?R$")>;
+def : InstRW<[WLat30, RegReadAdv, FPU4, LSU, FXU4, GroupAlone],
+ (instregex "DL(G)?$")>;
//===----------------------------------------------------------------------===//
// Shifts
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU], (instregex "SLL(G|K)?$")>;
-def : InstRW<[FXU], (instregex "SRL(G|K)?$")>;
-def : InstRW<[FXU], (instregex "SRA(G|K)?$")>;
-def : InstRW<[FXU], (instregex "SLA(G|K)?$")>;
-def : InstRW<[FXU, FXU, FXU, FXU, LSU, Lat8, GroupAlone],
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SLL(G|K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SRL(G|K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SRA(G|K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SLA(G|K)?$")>;
+def : InstRW<[WLat5LSU, WLat5LSU, FXU4, LSU, GroupAlone],
(instregex "S(L|R)D(A|L)$")>;
// Rotate
-def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>;
+def : InstRW<[WLat2LSU, FXU, LSU, NormalGr], (instregex "RLL(G)?$")>;
// Rotate and insert
-def : InstRW<[FXU], (instregex "RISBG(N|32)?$")>;
-def : InstRW<[FXU], (instregex "RISBH(G|H|L)$")>;
-def : InstRW<[FXU], (instregex "RISBL(G|H|L)$")>;
-def : InstRW<[FXU], (instregex "RISBMux$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "RISBG(N|32)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "RISBH(G|H|L)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "RISBL(G|H|L)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "RISBMux$")>;
// Rotate and Select
-def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "R(N|O|X)SBG$")>;
+def : InstRW<[WLat3, WLat3, FXU2, GroupAlone], (instregex "R(N|O|X)SBG$")>;
//===----------------------------------------------------------------------===//
// Comparison
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "C(G|Y|Mux|RL)?$")>;
-def : InstRW<[FXU], (instregex "C(F|H)I(Mux)?$")>;
-def : InstRW<[FXU], (instregex "CG(F|H)I$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CG(HSI|RL)$")>;
-def : InstRW<[FXU], (instregex "C(G)?R$")>;
-def : InstRW<[FXU], (instregex "CIH$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CH(F|SI)$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CL(Y|Mux|FHSI)?$")>;
-def : InstRW<[FXU], (instregex "CLFI(Mux)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CLG(HRL|HSI)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CLGF(RL)?$")>;
-def : InstRW<[FXU], (instregex "CLGF(I|R)$")>;
-def : InstRW<[FXU], (instregex "CLGR$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CLGRL$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CLH(F|RL|HSI)$")>;
-def : InstRW<[FXU], (instregex "CLIH$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CLI(Y)?$")>;
-def : InstRW<[FXU], (instregex "CLR$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CLRL$")>;
-def : InstRW<[FXU], (instregex "C(L)?HHR$")>;
-def : InstRW<[FXU, Lat2], (instregex "C(L)?HLR$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "C(G|Y|Mux|RL)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "C(F|H)I(Mux)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CG(F|H)I$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CG(HSI|RL)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "C(G)?R$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CIH$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "CHF$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CHSI$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "CL(Y|Mux)?$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CLFHSI$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CLFI(Mux)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "CLG$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CLG(HRL|HSI)$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "CLGF$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CLGFRL$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CLGF(I|R)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CLGR$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CLGRL$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "CLHF$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CLH(RL|HSI)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CLIH$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CLI(Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CLR$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CLRL$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "C(L)?HHR$")>;
+def : InstRW<[WLat2, FXU, NormalGr], (instregex "C(L)?HLR$")>;
// Compare halfword
-def : InstRW<[FXU, LSU, Lat6], (instregex "CH(Y|RL)?$")>;
-def : InstRW<[FXU, LSU, Lat6], (instregex "CGH(RL)?$")>;
-def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "CHHSI$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "CH(Y)?$")>;
+def : InstRW<[WLat2LSU, FXU, LSU, NormalGr], (instregex "CHRL$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "CGH$")>;
+def : InstRW<[WLat2LSU, FXU, LSU, NormalGr], (instregex "CGHRL$")>;
+def : InstRW<[WLat2LSU, FXU2, LSU, GroupAlone], (instregex "CHHSI$")>;
// Compare with sign extension (32 -> 64)
-def : InstRW<[FXU, LSU, Lat6], (instregex "CGF(RL)?$")>;
-def : InstRW<[FXU, Lat2], (instregex "CGFR$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "CGF$")>;
+def : InstRW<[WLat2LSU, FXU, LSU, NormalGr], (instregex "CGFRL$")>;
+def : InstRW<[WLat2, FXU, NormalGr], (instregex "CGFR$")>;
// Compare logical character
-def : InstRW<[FXU, LSU, LSU, Lat9, GroupAlone], (instregex "CLC$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>;
+def : InstRW<[WLat9, FXU, LSU2, GroupAlone], (instregex "CLC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CLCL(E|U)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CLST$")>;
// Test under mask
-def : InstRW<[FXU, LSU, Lat5], (instregex "TM(Y)?$")>;
-def : InstRW<[FXU], (instregex "TM(H|L)Mux$")>;
-def : InstRW<[FXU], (instregex "TMHH(64)?$")>;
-def : InstRW<[FXU], (instregex "TMHL(64)?$")>;
-def : InstRW<[FXU], (instregex "TMLH(64)?$")>;
-def : InstRW<[FXU], (instregex "TMLL(64)?$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "TM(Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "TM(H|L)Mux$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "TMHH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "TMHL(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "TMLH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "TMLL(64)?$")>;
// Compare logical characters under mask
-def : InstRW<[FXU, LSU, Lat5], (instregex "CLM(H|Y)?$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "CLM(H|Y)?$")>;
//===----------------------------------------------------------------------===//
// Prefetch and execution hint
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU], (instregex "PFD(RL)?$")>;
-def : InstRW<[LSU], (instregex "BP(R)?P$")>;
-def : InstRW<[FXU], (instregex "NIAI$")>;
+def : InstRW<[WLat1, LSU, NormalGr], (instregex "PFD(RL)?$")>;
+def : InstRW<[WLat1, LSU, NormalGr], (instregex "BP(R)?P$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NIAI$")>;
//===----------------------------------------------------------------------===//
// Atomic operations
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU, EndGroup], (instregex "Serialize$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "Serialize$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LAA(G)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LAAL(G)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LAN(G)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LAO(G)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LAX(G)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, FXU, LSU, NormalGr], (instregex "LAA(G)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, FXU, LSU, NormalGr], (instregex "LAAL(G)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, FXU, LSU, NormalGr], (instregex "LAN(G)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, FXU, LSU, NormalGr], (instregex "LAO(G)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, FXU, LSU, NormalGr], (instregex "LAX(G)?$")>;
// Test and set
-def : InstRW<[FXU, LSU, Lat5, EndGroup], (instregex "TS$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, EndGroup], (instregex "TS$")>;
// Compare and swap
-def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "CS(G|Y)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, FXU2, LSU, GroupAlone],
+ (instregex "CS(G|Y)?$")>;
// Compare double and swap
-def : InstRW<[FXU, FXU, FXU, FXU, FXU, LSU, Lat10, GroupAlone],
+def : InstRW<[WLat5LSU, WLat5LSU, FXU5, LSU, GroupAlone],
(instregex "CDS(Y)?$")>;
-def : InstRW<[FXU, FXU, FXU, FXU, FXU, FXU, LSU, LSU, Lat12, GroupAlone],
+def : InstRW<[WLat12, WLat12, FXU6, LSU2, GroupAlone],
(instregex "CDSG$")>;
// Compare and swap and store
-def : InstRW<[FXU, LSU, Lat30], (instregex "CSST$")>;
+def : InstRW<[WLat30, MCD], (instregex "CSST$")>;
// Perform locked operation
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>;
+def : InstRW<[WLat30, MCD], (instregex "PLO$")>;
// Load/store pair from/to quadword
-def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPQ$")>;
-def : InstRW<[FXU, FXU, LSU, LSU, Lat6, GroupAlone], (instregex "STPQ$")>;
+def : InstRW<[WLat4LSU, LSU2, GroupAlone], (instregex "LPQ$")>;
+def : InstRW<[WLat1, FXU2, LSU2, GroupAlone], (instregex "STPQ$")>;
// Load pair disjoint
-def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, LSU2, GroupAlone], (instregex "LPD(G)?$")>;
//===----------------------------------------------------------------------===//
// Translate and convert
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TR$")>;
-def : InstRW<[FXU, FXU, FXU, LSU, LSU, Lat30, GroupAlone], (instregex "TRT$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "TRTR$")>;
-def : InstRW<[FXU, Lat30], (instregex "TR(TR)?(T)?(E|EOpt)?$")>;
-def : InstRW<[LSU, Lat30], (instregex "TR(T|O)(T|O)(Opt)?$")>;
-def : InstRW<[FXU, Lat30], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
-def : InstRW<[FXU, Lat30], (instregex "(CUUTF|CUTFU)(Opt)?$")>;
+def : InstRW<[WLat1, LSU, GroupAlone], (instregex "TR$")>;
+def : InstRW<[WLat30, WLat30, WLat30, FXU3, LSU2, GroupAlone],
+ (instregex "TRT$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TRTR$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "TRE$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TRT(R)?E(Opt)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TR(T|O)(T|O)(Opt)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD],
+ (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "(CUUTF|CUTFU)(Opt)?$")>;
//===----------------------------------------------------------------------===//
// Message-security assist
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat30], (instregex "KM(C|F|O|CTR)?$")>;
-def : InstRW<[FXU, Lat30], (instregex "(KIMD|KLMD|KMAC|PCC)$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD],
+ (instregex "KM(C|F|O|CTR)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "(KIMD|KLMD|KMAC|PCC)$")>;
//===----------------------------------------------------------------------===//
// Decimal arithmetic
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, DFU2, LSU, LSU, Lat30, GroupAlone], (instregex "CVBG$")>;
-def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y)?$")>;
-def : InstRW<[FXU, FXU, FXU, DFU2, DFU2, LSU, Lat30, GroupAlone],
- (instregex "CVDG$")>;
-def : InstRW<[FXU, FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVD(Y)?$")>;
-def : InstRW<[LSU, Lat10, GroupAlone], (instregex "MVO$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z)$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK$")>;
-def : InstRW<[LSU, Lat12, GroupAlone], (instregex "UNPK(A|U)$")>;
+def : InstRW<[WLat30, RegReadAdv, FXU, DFU2, LSU2, GroupAlone],
+ (instregex "CVBG$")>;
+def : InstRW<[WLat20, RegReadAdv, FXU, DFU, LSU, GroupAlone],
+ (instregex "CVB(Y)?$")>;
+def : InstRW<[WLat1, FXU3, DFU4, LSU, GroupAlone], (instregex "CVDG$")>;
+def : InstRW<[WLat1, FXU2, DFU, LSU, GroupAlone], (instregex "CVD(Y)?$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "MV(N|O|Z)$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
+def : InstRW<[WLat10, LSU5, GroupAlone], (instregex "UNPK(A|U)$")>;
+def : InstRW<[WLat1, FXU, LSU2, GroupAlone], (instregex "UNPK$")>;
-def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat15, GroupAlone],
+def : InstRW<[WLat11LSU, FXU, DFU4, LSU2, GroupAlone],
(instregex "(A|S|ZA)P$")>;
-def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat30, GroupAlone],
- (instregex "(M|D)P$")>;
-def : InstRW<[FXU, FXU, DFU2, DFU2, LSU, LSU, LSU, Lat15, GroupAlone],
- (instregex "SRP$")>;
-def : InstRW<[DFU2, DFU2, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>;
-def : InstRW<[DFU2, LSU, LSU, Lat5, GroupAlone], (instregex "TP$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>;
+def : InstRW<[WLat1, FXU, DFU4, LSU2, GroupAlone], (instregex "(M|D)P$")>;
+def : InstRW<[WLat15, FXU2, DFU4, LSU3, GroupAlone], (instregex "SRP$")>;
+def : InstRW<[WLat11, DFU4, LSU2, GroupAlone], (instregex "CP$")>;
+def : InstRW<[WLat5LSU, DFU2, LSU2, GroupAlone], (instregex "TP$")>;
+def : InstRW<[WLat30, MCD], (instregex "ED(MK)?$")>;
//===----------------------------------------------------------------------===//
// Access registers
//===----------------------------------------------------------------------===//
// Extract/set/copy access register
-def : InstRW<[LSU], (instregex "(EAR|SAR|CPYA)$")>;
+def : InstRW<[WLat3, LSU, NormalGr], (instregex "(EAR|SAR|CPYA)$")>;
// Load address extended
-def : InstRW<[LSU, FXU, Lat5, GroupAlone], (instregex "LAE(Y)?$")>;
+def : InstRW<[WLat5, LSU, FXU, GroupAlone], (instregex "LAE(Y)?$")>;
// Load/store access multiple (not modeled precisely)
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(L|ST)AM(Y)?$")>;
+def : InstRW<[WLat10, WLat10, LSU5, GroupAlone], (instregex "LAM(Y)?$")>;
+def : InstRW<[WLat1, FXU5, LSU5, GroupAlone], (instregex "STAM(Y)?$")>;
//===----------------------------------------------------------------------===//
// Program mask and addressing mode
//===----------------------------------------------------------------------===//
// Insert Program Mask
-def : InstRW<[FXU, Lat3, EndGroup], (instregex "IPM$")>;
+def : InstRW<[WLat3, FXU, EndGroup], (instregex "IPM$")>;
// Set Program Mask
-def : InstRW<[LSU, EndGroup], (instregex "SPM$")>;
+def : InstRW<[WLat3, LSU, EndGroup], (instregex "SPM$")>;
// Branch and link
-def : InstRW<[FXU, FXU, LSU, Lat8, GroupAlone], (instregex "BAL(R)?$")>;
+def : InstRW<[WLat1, FXU2, LSU, GroupAlone], (instregex "BAL(R)?$")>;
// Test addressing mode
-def : InstRW<[FXU], (instregex "TAM$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "TAM$")>;
// Set addressing mode
-def : InstRW<[LSU, EndGroup], (instregex "SAM(24|31|64)$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SAM(24|31|64)$")>;
// Branch (and save) and set mode.
-def : InstRW<[FXU, LSU, Lat5, GroupAlone], (instregex "BSM$")>;
-def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "BASSM$")>;
+def : InstRW<[WLat1, FXU, LSU, GroupAlone], (instregex "BSM$")>;
+def : InstRW<[WLat1, FXU2, LSU, GroupAlone], (instregex "BASSM$")>;
//===----------------------------------------------------------------------===//
// Transactional execution
//===----------------------------------------------------------------------===//
// Transaction begin
-def : InstRW<[LSU, LSU, FXU, FXU, FXU, FXU, FXU, Lat15, GroupAlone],
- (instregex "TBEGIN(C|_nofloat)?$")>;
+def : InstRW<[WLat9, LSU2, FXU5, GroupAlone], (instregex "TBEGIN(C)?$")>;
// Transaction end
-def : InstRW<[LSU, GroupAlone], (instregex "TEND$")>;
+def : InstRW<[WLat4, LSU, GroupAlone], (instregex "TEND$")>;
// Transaction abort
-def : InstRW<[LSU, GroupAlone], (instregex "TABORT$")>;
+def : InstRW<[WLat30, MCD], (instregex "TABORT$")>;
// Extract Transaction Nesting Depth
-def : InstRW<[FXU], (instregex "ETND$")>;
+def : InstRW<[WLat30, MCD], (instregex "ETND$")>;
// Nontransactional store
-def : InstRW<[FXU, LSU, Lat5], (instregex "NTSTG$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "NTSTG$")>;
//===----------------------------------------------------------------------===//
// Processor assist
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU], (instregex "PPA$")>;
+def : InstRW<[WLat30, MCD], (instregex "PPA$")>;
//===----------------------------------------------------------------------===//
// Miscellaneous Instructions.
//===----------------------------------------------------------------------===//
// Find leftmost one
-def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "FLOGR$")>;
+def : InstRW<[WLat7, WLat7, FXU2, GroupAlone], (instregex "FLOGR$")>;
// Population count
-def : InstRW<[FXU, Lat3], (instregex "POPCNT$")>;
-
-// Extend
-def : InstRW<[FXU], (instregex "AEXT128$")>;
-def : InstRW<[FXU], (instregex "ZEXT128$")>;
+def : InstRW<[WLat3, WLat3, FXU, NormalGr], (instregex "POPCNT$")>;
// String instructions
-def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>;
-def : InstRW<[FXU, Lat30], (instregex "SRSTU$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "SRST(U)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CUSE$")>;
// Various complex instructions
-def : InstRW<[LSU, Lat30], (instregex "CFC$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "UPT$")>;
-def : InstRW<[LSU, Lat30], (instregex "CKSM$")>;
-def : InstRW<[FXU, Lat30], (instregex "CMPSC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "CFC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, WLat30, WLat30, MCD],
+ (instregex "UPT$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CKSM$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "CMPSC$")>;
// Execute
def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>;
@@ -703,167 +734,155 @@ def : InstRW<[], (instregex "Insn.*")>;
// ----------------------------- Floating point ----------------------------- //
//===----------------------------------------------------------------------===//
-// FP: Select instructions
-//===----------------------------------------------------------------------===//
-
-def : InstRW<[FXU], (instregex "SelectF(32|64|128)$")>;
-def : InstRW<[FXU], (instregex "CondStoreF32(Inv)?$")>;
-def : InstRW<[FXU], (instregex "CondStoreF64(Inv)?$")>;
-
-//===----------------------------------------------------------------------===//
// FP: Move instructions
//===----------------------------------------------------------------------===//
// Load zero
-def : InstRW<[FXU], (instregex "LZ(DR|ER)$")>;
-def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "LZXR$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LZ(DR|ER)$")>;
+def : InstRW<[WLat2, FXU2, GroupAlone], (instregex "LZXR$")>;
// Load
-def : InstRW<[FXU], (instregex "LER$")>;
-def : InstRW<[FXU], (instregex "LD(R|R32|GR)$")>;
-def : InstRW<[FXU, Lat3], (instregex "LGDR$")>;
-def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "LXR$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LER$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[WLat3, FXU, NormalGr], (instregex "LGDR$")>;
+def : InstRW<[WLat2, FXU2, GroupAlone], (instregex "LXR$")>;
// Load and Test
-def : InstRW<[FPU], (instregex "LT(D|E)BR$")>;
-def : InstRW<[FPU], (instregex "LTEBRCompare(_VecPseudo)?$")>;
-def : InstRW<[FPU], (instregex "LTDBRCompare(_VecPseudo)?$")>;
-def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "LTXBR$")>;
-def : InstRW<[FPU2, FPU2, Lat9, GroupAlone],
- (instregex "LTXBRCompare(_VecPseudo)?$")>;
+def : InstRW<[WLat9, WLat9, FPU, NormalGr], (instregex "LT(E|D)BR$")>;
+def : InstRW<[WLat9, FPU, NormalGr], (instregex "LT(E|D)BRCompare$")>;
+def : InstRW<[WLat10, WLat10, FPU4, GroupAlone], (instregex "LTXBR(Compare)?$")>;
// Copy sign
-def : InstRW<[FXU, FXU, Lat5, GroupAlone], (instregex "CPSDRd(d|s)$")>;
-def : InstRW<[FXU, FXU, Lat5, GroupAlone], (instregex "CPSDRs(d|s)$")>;
+def : InstRW<[WLat5, FXU2, GroupAlone], (instregex "CPSDR(d|s)(d|s)$")>;
//===----------------------------------------------------------------------===//
// FP: Load instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU], (instregex "LE(Y)?$")>;
-def : InstRW<[LSU], (instregex "LD(Y|E32)?$")>;
-def : InstRW<[LSU], (instregex "LX$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L(E|D)(Y|E32)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>;
//===----------------------------------------------------------------------===//
// FP: Store instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat7], (instregex "STD(Y)?$")>;
-def : InstRW<[FXU, LSU, Lat7], (instregex "STE(Y)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "STX$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "ST(E|D)(Y)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "STX$")>;
//===----------------------------------------------------------------------===//
// FP: Conversion instructions
//===----------------------------------------------------------------------===//
// Load rounded
-def : InstRW<[FPU], (instregex "LEDBR(A)?$")>;
-def : InstRW<[FPU, FPU, Lat20], (instregex "LEXBR(A)?$")>;
-def : InstRW<[FPU, FPU, Lat20], (instregex "LDXBR(A)?$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "LEDBR(A)?$")>;
+def : InstRW<[WLat9, FPU2, NormalGr], (instregex "L(E|D)XBR(A)?$")>;
// Load lengthened
-def : InstRW<[FPU, LSU, Lat12], (instregex "LDEB$")>;
-def : InstRW<[FPU], (instregex "LDEBR$")>;
-def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "LX(D|E)B$")>;
-def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "LX(D|E)BR$")>;
+def : InstRW<[WLat7LSU, FPU, LSU, NormalGr], (instregex "LDEB$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "LDEBR$")>;
+def : InstRW<[WLat11LSU, FPU4, LSU, GroupAlone], (instregex "LX(E|D)B$")>;
+def : InstRW<[WLat10, FPU4, GroupAlone], (instregex "LX(E|D)BR$")>;
// Convert from fixed / logical
-def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CE(F|G)BR(A?)$")>;
-def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CD(F|G)BR(A?)$")>;
-def : InstRW<[FXU, FPU2, FPU2, Lat11, GroupAlone], (instregex "CX(F|G)BR(A?)$")>;
-def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CEL(F|G)BR$")>;
-def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CDL(F|G)BR$")>;
-def : InstRW<[FXU, FPU2, FPU2, Lat11, GroupAlone], (instregex "CXL(F|G)BR$")>;
+def : InstRW<[WLat8, FXU, FPU, GroupAlone], (instregex "C(E|D)(F|G)BR(A)?$")>;
+def : InstRW<[WLat11, FXU, FPU4, GroupAlone], (instregex "CX(F|G)BR(A?)$")>;
+def : InstRW<[WLat8, FXU, FPU, GroupAlone], (instregex "CEL(F|G)BR$")>;
+def : InstRW<[WLat8, FXU, FPU, GroupAlone], (instregex "CDL(F|G)BR$")>;
+def : InstRW<[WLat11, FXU, FPU4, GroupAlone], (instregex "CXL(F|G)BR$")>;
// Convert to fixed / logical
-def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CF(E|D)BR(A?)$")>;
-def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CG(E|D)BR(A?)$")>;
-def : InstRW<[FXU, FPU, FPU, Lat20, GroupAlone], (instregex "C(F|G)XBR(A?)$")>;
-def : InstRW<[FXU, FPU, Lat11, GroupAlone], (instregex "CLF(E|D)BR$")>;
-def : InstRW<[FXU, FPU, Lat11, GroupAlone], (instregex "CLG(E|D)BR$")>;
-def : InstRW<[FXU, FPU, FPU, Lat20, GroupAlone], (instregex "CL(F|G)XBR$")>;
+def : InstRW<[WLat12, WLat12, FXU, FPU, GroupAlone],
+ (instregex "C(F|G)(E|D)BR(A?)$")>;
+def : InstRW<[WLat12, WLat12, FXU, FPU2, GroupAlone],
+ (instregex "C(F|G)XBR(A?)$")>;
+def : InstRW<[WLat12, WLat12, FXU, FPU, GroupAlone],
+ (instregex "CL(F|G)(E|D)BR$")>;
+def : InstRW<[WLat12, WLat12, FXU, FPU2, GroupAlone], (instregex "CL(F|G)XBR$")>;
//===----------------------------------------------------------------------===//
// FP: Unary arithmetic
//===----------------------------------------------------------------------===//
// Load Complement / Negative / Positive
-def : InstRW<[FPU], (instregex "L(C|N|P)DBR$")>;
-def : InstRW<[FPU], (instregex "L(C|N|P)EBR$")>;
-def : InstRW<[FXU], (instregex "LCDFR(_32)?$")>;
-def : InstRW<[FXU], (instregex "LNDFR(_32)?$")>;
-def : InstRW<[FXU], (instregex "LPDFR(_32)?$")>;
-def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "L(C|N|P)XBR$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>;
+def : InstRW<[WLat10, WLat10, FPU4, GroupAlone], (instregex "L(C|N|P)XBR$")>;
// Square root
-def : InstRW<[FPU, LSU, Lat30], (instregex "SQ(E|D)B$")>;
-def : InstRW<[FPU, Lat30], (instregex "SQ(E|D)BR$")>;
-def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "SQXBR$")>;
+def : InstRW<[WLat30, FPU, LSU, NormalGr], (instregex "SQ(E|D)B$")>;
+def : InstRW<[WLat30, FPU, NormalGr], (instregex "SQ(E|D)BR$")>;
+def : InstRW<[WLat30, FPU4, GroupAlone], (instregex "SQXBR$")>;
// Load FP integer
-def : InstRW<[FPU], (instregex "FIEBR(A)?$")>;
-def : InstRW<[FPU], (instregex "FIDBR(A)?$")>;
-def : InstRW<[FPU2, FPU2, Lat15, GroupAlone], (instregex "FIXBR(A)?$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "FI(E|D)BR(A)?$")>;
+def : InstRW<[WLat15, FPU4, GroupAlone], (instregex "FIXBR(A)?$")>;
//===----------------------------------------------------------------------===//
// FP: Binary arithmetic
//===----------------------------------------------------------------------===//
// Addition
-def : InstRW<[FPU, LSU, Lat12], (instregex "A(E|D)B$")>;
-def : InstRW<[FPU], (instregex "A(E|D)BR$")>;
-def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "AXBR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, FPU, LSU, NormalGr],
+ (instregex "A(E|D)B$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "A(E|D)BR$")>;
+def : InstRW<[WLat20, WLat20, FPU4, GroupAlone], (instregex "AXBR$")>;
// Subtraction
-def : InstRW<[FPU, LSU, Lat12], (instregex "S(E|D)B$")>;
-def : InstRW<[FPU], (instregex "S(E|D)BR$")>;
-def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "SXBR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, FPU, LSU, NormalGr],
+ (instregex "S(E|D)B$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "S(E|D)BR$")>;
+def : InstRW<[WLat20, WLat20, FPU4, GroupAlone], (instregex "SXBR$")>;
// Multiply
-def : InstRW<[FPU, LSU, Lat12], (instregex "M(D|DE|EE)B$")>;
-def : InstRW<[FPU], (instregex "M(D|DE|EE)BR$")>;
-def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MXDB$")>;
-def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDBR$")>;
-def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXBR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, FPU, LSU, NormalGr],
+ (instregex "M(D|DE|EE)B$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "M(D|DE|EE)BR$")>;
+def : InstRW<[WLat11LSU, RegReadAdv, FPU4, LSU, GroupAlone],
+ (instregex "MXDB$")>;
+def : InstRW<[WLat10, FPU4, GroupAlone], (instregex "MXDBR$")>;
+def : InstRW<[WLat30, FPU4, GroupAlone], (instregex "MXBR$")>;
// Multiply and add / subtract
-def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>;
-def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)EBR$")>;
-def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>;
-def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DBR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, FPU2, LSU, GroupAlone],
+ (instregex "M(A|S)EB$")>;
+def : InstRW<[WLat7, FPU, GroupAlone], (instregex "M(A|S)EBR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, FPU2, LSU, GroupAlone],
+ (instregex "M(A|S)DB$")>;
+def : InstRW<[WLat7, FPU, GroupAlone], (instregex "M(A|S)DBR$")>;
// Division
-def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)B$")>;
-def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>;
-def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>;
+def : InstRW<[WLat30, RegReadAdv, FPU, LSU, NormalGr], (instregex "D(E|D)B$")>;
+def : InstRW<[WLat30, FPU, NormalGr], (instregex "D(E|D)BR$")>;
+def : InstRW<[WLat30, FPU4, GroupAlone], (instregex "DXBR$")>;
// Divide to integer
-def : InstRW<[FPU, Lat30], (instregex "DI(E|D)BR$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "DI(E|D)BR$")>;
//===----------------------------------------------------------------------===//
// FP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[FPU, LSU, Lat12], (instregex "(K|C)(E|D)B$")>;
-def : InstRW<[FPU], (instregex "(K|C)(E|D)BR$")>;
-def : InstRW<[FPU, FPU, Lat30], (instregex "(K|C)XBR$")>;
+def : InstRW<[WLat11LSU, RegReadAdv, FPU, LSU, NormalGr],
+ (instregex "(K|C)(E|D)B$")>;
+def : InstRW<[WLat9, FPU, NormalGr], (instregex "(K|C)(E|D)BR$")>;
+def : InstRW<[WLat30, FPU2, NormalGr], (instregex "(K|C)XBR$")>;
// Test Data Class
-def : InstRW<[FPU, LSU, Lat15], (instregex "TC(E|D)B$")>;
-def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "TCXB$")>;
+def : InstRW<[WLat15, FPU, LSU, NormalGr], (instregex "TC(E|D)B$")>;
+def : InstRW<[WLat15, FPU4, LSU, GroupAlone], (instregex "TCXB$")>;
//===----------------------------------------------------------------------===//
// FP: Floating-point control register instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat4, GroupAlone], (instregex "EFPC$")>;
-def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>;
-def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>;
-def : InstRW<[FXU, LSU, Lat3, GroupAlone], (instregex "STFPC$")>;
-def : InstRW<[FXU, Lat30], (instregex "SFASR$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "LFAS$")>;
-def : InstRW<[FXU, GroupAlone], (instregex "SRNM(B|T)?$")>;
+def : InstRW<[WLat4, FXU, LSU, GroupAlone], (instregex "EFPC$")>;
+def : InstRW<[WLat1, FXU, LSU, GroupAlone], (instregex "STFPC$")>;
+def : InstRW<[WLat1, LSU, GroupAlone], (instregex "SFPC$")>;
+def : InstRW<[WLat1, LSU2, GroupAlone], (instregex "LFPC$")>;
+def : InstRW<[WLat30, MCD], (instregex "SFASR$")>;
+def : InstRW<[WLat30, MCD], (instregex "LFAS$")>;
+def : InstRW<[WLat2, FXU, GroupAlone], (instregex "SRNM(B|T)?$")>;
// --------------------- Hexadecimal floating point ------------------------- //
@@ -873,108 +892,111 @@ def : InstRW<[FXU, GroupAlone], (instregex "SRNM(B|T)?$")>;
//===----------------------------------------------------------------------===//
// Load and Test
-def : InstRW<[FPU], (instregex "LT(D|E)R$")>;
-def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "LTXR$")>;
+def : InstRW<[WLat9, WLat9, FPU, NormalGr], (instregex "LT(E|D)R$")>;
+def : InstRW<[WLat9, WLat9, FPU4, GroupAlone], (instregex "LTXR$")>;
//===----------------------------------------------------------------------===//
// HFP: Conversion instructions
//===----------------------------------------------------------------------===//
// Load rounded
-def : InstRW<[FPU], (instregex "(LEDR|LRER)$")>;
-def : InstRW<[FPU], (instregex "LEXR$")>;
-def : InstRW<[FPU], (instregex "(LDXR|LRDR)$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "(LEDR|LRER)$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "LEXR$")>;
+def : InstRW<[WLat9, FPU, NormalGr], (instregex "(LDXR|LRDR)$")>;
// Load lengthened
-def : InstRW<[LSU], (instregex "LDE$")>;
-def : InstRW<[FXU], (instregex "LDER$")>;
-def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "LX(D|E)$")>;
-def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "LX(D|E)R$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LDE$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LDER$")>;
+def : InstRW<[WLat11LSU, FPU4, LSU, GroupAlone], (instregex "LX(E|D)$")>;
+def : InstRW<[WLat9, FPU4, GroupAlone], (instregex "LX(E|D)R$")>;
// Convert from fixed
-def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CE(F|G)R$")>;
-def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CD(F|G)R$")>;
-def : InstRW<[FXU, FPU2, FPU2, Lat11, GroupAlone], (instregex "CX(F|G)R$")>;
+def : InstRW<[WLat8, FXU, FPU, GroupAlone], (instregex "C(E|D)(F|G)R$")>;
+def : InstRW<[WLat10, FXU, FPU4, GroupAlone], (instregex "CX(F|G)R$")>;
// Convert to fixed
-def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CF(E|D)R$")>;
-def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CG(E|D)R$")>;
-def : InstRW<[FXU, FPU, FPU, Lat20, GroupAlone], (instregex "C(F|G)XR$")>;
+def : InstRW<[WLat12, WLat12, FXU, FPU, GroupAlone],
+ (instregex "C(F|G)(E|D)R$")>;
+def : InstRW<[WLat30, WLat30, FXU, FPU2, GroupAlone], (instregex "C(F|G)XR$")>;
// Convert BFP to HFP / HFP to BFP.
-def : InstRW<[FPU], (instregex "THD(E)?R$")>;
-def : InstRW<[FPU], (instregex "TB(E)?DR$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "THD(E)?R$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "TB(E)?DR$")>;
//===----------------------------------------------------------------------===//
// HFP: Unary arithmetic
//===----------------------------------------------------------------------===//
// Load Complement / Negative / Positive
-def : InstRW<[FPU], (instregex "L(C|N|P)DR$")>;
-def : InstRW<[FPU], (instregex "L(C|N|P)ER$")>;
-def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "L(C|N|P)XR$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "L(C|N|P)(E|D)R$")>;
+def : InstRW<[WLat9, WLat9, FPU4, GroupAlone], (instregex "L(C|N|P)XR$")>;
// Halve
-def : InstRW<[FPU], (instregex "H(E|D)R$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "H(E|D)R$")>;
// Square root
-def : InstRW<[FPU, LSU, Lat30], (instregex "SQ(E|D)$")>;
-def : InstRW<[FPU, Lat30], (instregex "SQ(E|D)R$")>;
-def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "SQXR$")>;
+def : InstRW<[WLat30, FPU, LSU, NormalGr], (instregex "SQ(E|D)$")>;
+def : InstRW<[WLat30, FPU, NormalGr], (instregex "SQ(E|D)R$")>;
+def : InstRW<[WLat30, FPU4, GroupAlone], (instregex "SQXR$")>;
// Load FP integer
-def : InstRW<[FPU], (instregex "FIER$")>;
-def : InstRW<[FPU], (instregex "FIDR$")>;
-def : InstRW<[FPU2, FPU2, Lat15, GroupAlone], (instregex "FIXR$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "FI(E|D)R$")>;
+def : InstRW<[WLat15, FPU4, GroupAlone], (instregex "FIXR$")>;
//===----------------------------------------------------------------------===//
// HFP: Binary arithmetic
//===----------------------------------------------------------------------===//
// Addition
-def : InstRW<[FPU, LSU, Lat12], (instregex "A(E|D|U|W)$")>;
-def : InstRW<[FPU], (instregex "A(E|D|U|W)R$")>;
-def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "AXR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, FPU, LSU, NormalGr],
+ (instregex "A(E|D|U|W)$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "A(E|D|U|W)R$")>;
+def : InstRW<[WLat15, WLat15, FPU4, GroupAlone], (instregex "AXR$")>;
// Subtraction
-def : InstRW<[FPU, LSU, Lat12], (instregex "S(E|D|U|W)$")>;
-def : InstRW<[FPU], (instregex "S(E|D|U|W)R$")>;
-def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "SXR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, FPU, LSU, NormalGr],
+ (instregex "S(E|D|U|W)$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "S(E|D|U|W)R$")>;
+def : InstRW<[WLat15, WLat15, FPU4, GroupAlone], (instregex "SXR$")>;
// Multiply
-def : InstRW<[FPU, LSU, Lat12], (instregex "M(D|DE|E|EE)$")>;
-def : InstRW<[FPU], (instregex "M(D|DE|E|EE)R$")>;
-def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MXD$")>;
-def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDR$")>;
-def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXR$")>;
-def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MY$")>;
-def : InstRW<[FPU, FPU, LSU, Lat15, GroupAlone], (instregex "MY(H|L)$")>;
-def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MYR$")>;
-def : InstRW<[FPU, Lat10, GroupAlone], (instregex "MY(H|L)R$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, FPU, LSU, NormalGr], (instregex "M(D|EE)$")>;
+def : InstRW<[WLat8LSU, RegReadAdv, FPU, LSU, NormalGr], (instregex "M(DE|E)$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "M(D|EE)R$")>;
+def : InstRW<[WLat8, FPU, NormalGr], (instregex "M(DE|E)R$")>;
+def : InstRW<[WLat11LSU, RegReadAdv, FPU4, LSU, GroupAlone], (instregex "MXD$")>;
+def : InstRW<[WLat10, FPU4, GroupAlone], (instregex "MXDR$")>;
+def : InstRW<[WLat30, FPU4, GroupAlone], (instregex "MXR$")>;
+def : InstRW<[WLat11LSU, RegReadAdv, FPU4, LSU, GroupAlone], (instregex "MY$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, FPU2, LSU, GroupAlone],
+ (instregex "MY(H|L)$")>;
+def : InstRW<[WLat10, FPU4, GroupAlone], (instregex "MYR$")>;
+def : InstRW<[WLat7, FPU, GroupAlone], (instregex "MY(H|L)R$")>;
// Multiply and add / subtract
-def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>;
-def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)ER$")>;
-def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>;
-def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DR$")>;
-def : InstRW<[FPU2, FPU2, LSU, GroupAlone], (instregex "MAY$")>;
-def : InstRW<[FPU2, FPU2, GroupAlone], (instregex "MAYR$")>;
-def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)$")>;
-def : InstRW<[FPU, GroupAlone], (instregex "MAY(H|L)R$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, FPU2, LSU, GroupAlone],
+ (instregex "M(A|S)(E|D)$")>;
+def : InstRW<[WLat7, FPU, GroupAlone], (instregex "M(A|S)(E|D)R$")>;
+def : InstRW<[WLat11LSU, RegReadAdv, RegReadAdv, FPU4, LSU, GroupAlone],
+ (instregex "MAY$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, FPU2, LSU, GroupAlone],
+ (instregex "MAY(H|L)$")>;
+def : InstRW<[WLat10, FPU4, GroupAlone], (instregex "MAYR$")>;
+def : InstRW<[WLat7, FPU, GroupAlone], (instregex "MAY(H|L)R$")>;
// Division
-def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)$")>;
-def : InstRW<[FPU, Lat30], (instregex "D(E|D)R$")>;
-def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXR$")>;
+def : InstRW<[WLat30, RegReadAdv, FPU, LSU, NormalGr], (instregex "D(E|D)$")>;
+def : InstRW<[WLat30, FPU, NormalGr], (instregex "D(E|D)R$")>;
+def : InstRW<[WLat30, FPU4, GroupAlone], (instregex "DXR$")>;
//===----------------------------------------------------------------------===//
// HFP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[FPU, LSU, Lat12], (instregex "C(E|D)$")>;
-def : InstRW<[FPU], (instregex "C(E|D)R$")>;
-def : InstRW<[FPU, FPU, Lat15], (instregex "CXR$")>;
+def : InstRW<[WLat11LSU, RegReadAdv, FPU, LSU, NormalGr], (instregex "C(E|D)$")>;
+def : InstRW<[WLat9, FPU, NormalGr], (instregex "C(E|D)R$")>;
+def : InstRW<[WLat15, FPU2, NormalGr], (instregex "CXR$")>;
// ------------------------ Decimal floating point -------------------------- //
@@ -984,120 +1006,121 @@ def : InstRW<[FPU, FPU, Lat15], (instregex "CXR$")>;
//===----------------------------------------------------------------------===//
// Load and Test
-def : InstRW<[DFU, Lat20], (instregex "LTDTR$")>;
-def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "LTXTR$")>;
+def : InstRW<[WLat4, WLat4, DFU, NormalGr], (instregex "LTDTR$")>;
+def : InstRW<[WLat6, WLat6, DFU4, GroupAlone], (instregex "LTXTR$")>;
//===----------------------------------------------------------------------===//
// DFP: Conversion instructions
//===----------------------------------------------------------------------===//
// Load rounded
-def : InstRW<[DFU, Lat30], (instregex "LEDTR$")>;
-def : InstRW<[DFU, DFU, Lat30], (instregex "LDXTR$")>;
+def : InstRW<[WLat30, DFU, NormalGr], (instregex "LEDTR$")>;
+def : InstRW<[WLat30, DFU2, NormalGr], (instregex "LDXTR$")>;
// Load lengthened
-def : InstRW<[DFU, Lat20], (instregex "LDETR$")>;
-def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "LXDTR$")>;
+def : InstRW<[WLat7, DFU, NormalGr], (instregex "LDETR$")>;
+def : InstRW<[WLat6, DFU4, GroupAlone], (instregex "LXDTR$")>;
// Convert from fixed / logical
-def : InstRW<[FXU, DFU, Lat9, GroupAlone], (instregex "CDFTR$")>;
-def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CDGTR(A)?$")>;
-def : InstRW<[FXU, DFU2, DFU2, GroupAlone], (instregex "CXFTR$")>;
-def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "CXGTR(A)?$")>;
-def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CDL(F|G)TR$")>;
-def : InstRW<[FXU, DFU2, DFU2, Lat11, GroupAlone], (instregex "CXLFTR$")>;
-def : InstRW<[FXU, DFU2, DFU2, Lat6, GroupAlone], (instregex "CXLGTR$")>;
+def : InstRW<[WLat9, FXU, DFU, GroupAlone], (instregex "CDFTR$")>;
+def : InstRW<[WLat30, FXU, DFU, GroupAlone], (instregex "CDGTR(A)?$")>;
+def : InstRW<[WLat5, FXU, DFU4, GroupAlone], (instregex "CXFTR(A)?$")>;
+def : InstRW<[WLat30, FXU, DFU4, GroupAlone], (instregex "CXGTR(A)?$")>;
+def : InstRW<[WLat9, FXU, DFU, GroupAlone], (instregex "CDL(F|G)TR$")>;
+def : InstRW<[WLat9, FXU, DFU4, GroupAlone], (instregex "CXLFTR$")>;
+def : InstRW<[WLat5, FXU, DFU4, GroupAlone], (instregex "CXLGTR$")>;
// Convert to fixed / logical
-def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CFDTR(A)?$")>;
-def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CGDTR(A)?$")>;
-def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CFXTR$")>;
-def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "CGXTR(A)?$")>;
-def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)DTR$")>;
-def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)XTR$")>;
+def : InstRW<[WLat11, WLat11, FXU, DFU, GroupAlone], (instregex "CFDTR(A)?$")>;
+def : InstRW<[WLat30, WLat30, FXU, DFU, GroupAlone], (instregex "CGDTR(A)?$")>;
+def : InstRW<[WLat7, WLat7, FXU, DFU2, GroupAlone], (instregex "CFXTR$")>;
+def : InstRW<[WLat30, WLat30, FXU, DFU2, GroupAlone], (instregex "CGXTR(A)?$")>;
+def : InstRW<[WLat11, WLat11, FXU, DFU, GroupAlone], (instregex "CL(F|G)DTR$")>;
+def : InstRW<[WLat7, WLat7, FXU, DFU2, GroupAlone], (instregex "CL(F|G)XTR$")>;
// Convert from / to signed / unsigned packed
-def : InstRW<[FXU, DFU, Lat12, GroupAlone], (instregex "CD(S|U)TR$")>;
-def : InstRW<[FXU, FXU, DFU2, DFU2, Lat20, GroupAlone], (instregex "CX(S|U)TR$")>;
-def : InstRW<[FXU, DFU, Lat12, GroupAlone], (instregex "C(S|U)DTR$")>;
-def : InstRW<[FXU, FXU, DFU2, DFU2, Lat20, GroupAlone], (instregex "C(S|U)XTR$")>;
+def : InstRW<[WLat5, FXU, DFU, GroupAlone], (instregex "CD(S|U)TR$")>;
+def : InstRW<[WLat8, FXU2, DFU4, GroupAlone], (instregex "CX(S|U)TR$")>;
+def : InstRW<[WLat7, FXU, DFU, GroupAlone], (instregex "C(S|U)DTR$")>;
+def : InstRW<[WLat12, FXU2, DFU4, GroupAlone], (instregex "C(S|U)XTR$")>;
// Convert from / to zoned
-def : InstRW<[LSU, DFU2, Lat7, GroupAlone], (instregex "CDZT$")>;
-def : InstRW<[LSU, LSU, DFU2, DFU2, Lat10, GroupAlone], (instregex "CXZT$")>;
-def : InstRW<[FXU, LSU, DFU, DFU, Lat11, GroupAlone], (instregex "CZDT$")>;
-def : InstRW<[FXU, LSU, DFU, DFU, Lat15, GroupAlone], (instregex "CZXT$")>;
+def : InstRW<[WLat4LSU, LSU, DFU2, GroupAlone], (instregex "CDZT$")>;
+def : InstRW<[WLat11LSU, LSU2, DFU4, GroupAlone], (instregex "CXZT$")>;
+def : InstRW<[WLat1, FXU, LSU, DFU2, GroupAlone], (instregex "CZDT$")>;
+def : InstRW<[WLat1, FXU, LSU, DFU2, GroupAlone], (instregex "CZXT$")>;
// Perform floating-point operation
-def : InstRW<[FXU, Lat30], (instregex "PFPO$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "PFPO$")>;
//===----------------------------------------------------------------------===//
// DFP: Unary arithmetic
//===----------------------------------------------------------------------===//
// Load FP integer
-def : InstRW<[DFU, Lat20], (instregex "FIDTR$")>;
-def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "FIXTR$")>;
+def : InstRW<[WLat8, DFU, NormalGr], (instregex "FIDTR$")>;
+def : InstRW<[WLat10, DFU4, GroupAlone], (instregex "FIXTR$")>;
// Extract biased exponent
-def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "EEDTR$")>;
-def : InstRW<[FXU, DFU2, Lat15, GroupAlone], (instregex "EEXTR$")>;
+def : InstRW<[WLat7, FXU, DFU, GroupAlone], (instregex "EEDTR$")>;
+def : InstRW<[WLat8, FXU, DFU2, GroupAlone], (instregex "EEXTR$")>;
// Extract significance
-def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "ESDTR$")>;
-def : InstRW<[FXU, DFU, DFU, Lat20, GroupAlone], (instregex "ESXTR$")>;
+def : InstRW<[WLat7, FXU, DFU, GroupAlone], (instregex "ESDTR$")>;
+def : InstRW<[WLat8, FXU, DFU2, GroupAlone], (instregex "ESXTR$")>;
//===----------------------------------------------------------------------===//
// DFP: Binary arithmetic
//===----------------------------------------------------------------------===//
// Addition
-def : InstRW<[DFU, Lat30], (instregex "ADTR(A)?$")>;
-def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "AXTR(A)?$")>;
+def : InstRW<[WLat9, WLat9, DFU, NormalGr], (instregex "ADTR(A)?$")>;
+def : InstRW<[WLat30, WLat30, DFU4, GroupAlone], (instregex "AXTR(A)?$")>;
// Subtraction
-def : InstRW<[DFU, Lat30], (instregex "SDTR(A)?$")>;
-def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "SXTR(A)?$")>;
+def : InstRW<[WLat9, WLat9, DFU, NormalGr], (instregex "SDTR(A)?$")>;
+def : InstRW<[WLat30, WLat30, DFU4, GroupAlone], (instregex "SXTR(A)?$")>;
// Multiply
-def : InstRW<[DFU, Lat30], (instregex "MDTR(A)?$")>;
-def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "MXTR(A)?$")>;
+def : InstRW<[WLat30, DFU, NormalGr], (instregex "MDTR(A)?$")>;
+def : InstRW<[WLat30, DFU4, GroupAlone], (instregex "MXTR(A)?$")>;
// Division
-def : InstRW<[DFU, Lat30], (instregex "DDTR(A)?$")>;
-def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "DXTR(A)?$")>;
+def : InstRW<[WLat30, DFU, NormalGr], (instregex "DDTR(A)?$")>;
+def : InstRW<[WLat30, DFU4, GroupAlone], (instregex "DXTR(A)?$")>;
// Quantize
-def : InstRW<[DFU, Lat30], (instregex "QADTR$")>;
-def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "QAXTR$")>;
+def : InstRW<[WLat8, WLat8, DFU, NormalGr], (instregex "QADTR$")>;
+def : InstRW<[WLat10, WLat10, DFU4, GroupAlone], (instregex "QAXTR$")>;
// Reround
-def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "RRDTR$")>;
-def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "RRXTR$")>;
+def : InstRW<[WLat11, WLat11, FXU, DFU, GroupAlone], (instregex "RRDTR$")>;
+def : InstRW<[WLat30, WLat30, FXU, DFU4, GroupAlone], (instregex "RRXTR$")>;
// Shift significand left/right
-def : InstRW<[LSU, DFU, Lat11, GroupAlone], (instregex "S(L|R)DT$")>;
-def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "S(L|R)XT$")>;
+def : InstRW<[WLat7LSU, LSU, DFU, GroupAlone], (instregex "S(L|R)DT$")>;
+def : InstRW<[WLat11LSU, LSU, DFU4, GroupAlone], (instregex "S(L|R)XT$")>;
// Insert biased exponent
-def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "IEDTR$")>;
-def : InstRW<[FXU, DFU2, DFU2, Lat15, GroupAlone], (instregex "IEXTR$")>;
+def : InstRW<[WLat5, FXU, DFU, GroupAlone], (instregex "IEDTR$")>;
+def : InstRW<[WLat7, FXU, DFU4, GroupAlone], (instregex "IEXTR$")>;
//===----------------------------------------------------------------------===//
// DFP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[DFU, Lat11], (instregex "(K|C)DTR$")>;
-def : InstRW<[DFU, DFU, Lat15], (instregex "(K|C)XTR$")>;
+def : InstRW<[WLat9, DFU, NormalGr], (instregex "(K|C)DTR$")>;
+def : InstRW<[WLat10, DFU2, NormalGr], (instregex "(K|C)XTR$")>;
// Compare biased exponent
-def : InstRW<[DFU, Lat8], (instregex "CEDTR$")>;
-def : InstRW<[DFU, DFU, Lat9], (instregex "CEXTR$")>;
+def : InstRW<[WLat4, DFU, NormalGr], (instregex "CEDTR$")>;
+def : InstRW<[WLat5, DFU2, NormalGr], (instregex "CEXTR$")>;
// Test Data Class/Group
-def : InstRW<[LSU, DFU, Lat15], (instregex "TD(C|G)(E|D)T$")>;
-def : InstRW<[LSU, DFU2, Lat15], (instregex "TD(C|G)XT$")>;
+def : InstRW<[WLat9, LSU, DFU, NormalGr], (instregex "TD(C|G)DT$")>;
+def : InstRW<[WLat10, LSU, DFU, NormalGr], (instregex "TD(C|G)ET$")>;
+def : InstRW<[WLat10, LSU, DFU2, NormalGr], (instregex "TD(C|G)XT$")>;
// -------------------------------- System ---------------------------------- //
@@ -1106,157 +1129,152 @@ def : InstRW<[LSU, DFU2, Lat15], (instregex "TD(C|G)XT$")>;
// System: Program-Status Word Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat30], (instregex "EPSW$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "LPSW(E)?$")>;
-def : InstRW<[FXU, Lat3, GroupAlone], (instregex "IPK$")>;
-def : InstRW<[LSU, EndGroup], (instregex "SPKA$")>;
-def : InstRW<[LSU, EndGroup], (instregex "SSM$")>;
-def : InstRW<[FXU, LSU, GroupAlone], (instregex "ST(N|O)SM$")>;
-def : InstRW<[FXU, Lat3], (instregex "IAC$")>;
-def : InstRW<[LSU, EndGroup], (instregex "SAC(F)?$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "EPSW$")>;
+def : InstRW<[WLat30, MCD], (instregex "LPSW(E)?$")>;
+def : InstRW<[WLat3, FXU, GroupAlone], (instregex "IPK$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SPKA$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SSM$")>;
+def : InstRW<[WLat1, FXU, LSU, GroupAlone], (instregex "ST(N|O)SM$")>;
+def : InstRW<[WLat3, FXU, NormalGr], (instregex "IAC$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SAC(F)?$")>;
//===----------------------------------------------------------------------===//
// System: Control Register Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat30], (instregex "LCTL(G)?$")>;
-def : InstRW<[FXU, LSU, LSU, LSU, LSU, Lat30, GroupAlone],
- (instregex "STCT(L|G)$")>;
-def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>;
-def : InstRW<[FXU, Lat30], (instregex "SSA(I)?R$")>;
-def : InstRW<[FXU, Lat30], (instregex "ESEA$")>;
+def : InstRW<[WLat10, WLat10, FXU, LSU, NormalGr], (instregex "LCTL(G)?$")>;
+def : InstRW<[WLat1, FXU5, LSU5, GroupAlone], (instregex "STCT(L|G)$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "E(P|S)A(I)?R$")>;
+def : InstRW<[WLat30, MCD], (instregex "SSA(I)?R$")>;
+def : InstRW<[WLat30, MCD], (instregex "ESEA$")>;
//===----------------------------------------------------------------------===//
// System: Prefix-Register Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat30], (instregex "SPX$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "STPX$")>;
+def : InstRW<[WLat30, MCD], (instregex "S(T)?PX$")>;
//===----------------------------------------------------------------------===//
// System: Storage-Key and Real Memory Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat30], (instregex "ISKE$")>;
-def : InstRW<[FXU, Lat30], (instregex "IVSK$")>;
-def : InstRW<[FXU, Lat30], (instregex "SSKE(Opt)?$")>;
-def : InstRW<[FXU, Lat30], (instregex "RRB(E|M)$")>;
-def : InstRW<[FXU, Lat30], (instregex "PFMF$")>;
-def : InstRW<[FXU, Lat30], (instregex "TB$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "PGIN$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "PGOUT$")>;
+def : InstRW<[WLat30, MCD], (instregex "ISKE$")>;
+def : InstRW<[WLat30, MCD], (instregex "IVSK$")>;
+def : InstRW<[WLat30, MCD], (instregex "SSKE(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "RRB(E|M)$")>;
+def : InstRW<[WLat30, MCD], (instregex "PFMF$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "TB$")>;
+def : InstRW<[WLat30, MCD], (instregex "PGIN$")>;
+def : InstRW<[WLat30, MCD], (instregex "PGOUT$")>;
//===----------------------------------------------------------------------===//
// System: Dynamic-Address-Translation Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat30], (instregex "IPTE(Opt)?(Opt)?$")>;
-def : InstRW<[FXU, Lat30], (instregex "IDTE(Opt)?$")>;
-def : InstRW<[FXU, Lat30], (instregex "CRDTE(Opt)?$")>;
-def : InstRW<[FXU, Lat30], (instregex "PTLB$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "CSP(G)?$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "LPTEA$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "LRA(Y|G)?$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "STRAG$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "LURA(G)?$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "STUR(A|G)$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "TPROT$")>;
+def : InstRW<[WLat30, MCD], (instregex "IPTE(Opt)?(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "IDTE(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "CRDTE(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "PTLB$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "CSP(G)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "LPTEA$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "LRA(Y|G)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "STRAG$")>;
+def : InstRW<[WLat30, MCD], (instregex "LURA(G)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "STUR(A|G)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TPROT$")>;
//===----------------------------------------------------------------------===//
// System: Memory-move Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>;
-def : InstRW<[LSU, Lat6, Lat30, GroupAlone], (instregex "MVCSK$")>;
-def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVCDK$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "MVCOS$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVPG$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVC(K|P|S)$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVC(S|D)K$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVCOS$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVPG$")>;
//===----------------------------------------------------------------------===//
// System: Address-Space Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat30], (instregex "LASP$")>;
-def : InstRW<[LSU, GroupAlone], (instregex "PALB$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "PC$")>;
-def : InstRW<[FXU, Lat30], (instregex "PR$")>;
-def : InstRW<[FXU, Lat30], (instregex "PT(I)?$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "RP$")>;
-def : InstRW<[FXU, Lat30], (instregex "BS(G|A)$")>;
-def : InstRW<[FXU, Lat20], (instregex "TAR$")>;
+def : InstRW<[WLat30, MCD], (instregex "LASP$")>;
+def : InstRW<[WLat1, LSU, GroupAlone], (instregex "PALB$")>;
+def : InstRW<[WLat30, MCD], (instregex "PC$")>;
+def : InstRW<[WLat30, MCD], (instregex "PR$")>;
+def : InstRW<[WLat30, MCD], (instregex "PT(I)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "RP$")>;
+def : InstRW<[WLat30, MCD], (instregex "BS(G|A)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TAR$")>;
//===----------------------------------------------------------------------===//
// System: Linkage-Stack Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat30, EndGroup], (instregex "BAKR$")>;
-def : InstRW<[FXU, Lat30], (instregex "EREG(G)?$")>;
-def : InstRW<[FXU, Lat30], (instregex "(E|M)STA$")>;
+def : InstRW<[WLat30, MCD], (instregex "BAKR$")>;
+def : InstRW<[WLat30, MCD], (instregex "EREG(G)?$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "(E|M)STA$")>;
//===----------------------------------------------------------------------===//
// System: Time-Related Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat30], (instregex "PTFF$")>;
-def : InstRW<[FXU, LSU, Lat20], (instregex "SCK$")>;
-def : InstRW<[FXU, Lat30], (instregex "SCKPF$")>;
-def : InstRW<[FXU, LSU, Lat20], (instregex "SCKC$")>;
-def : InstRW<[FXU, LSU, Lat20], (instregex "SPT$")>;
-def : InstRW<[FXU, LSU, LSU, Lat9, GroupAlone], (instregex "STCK(F)?$")>;
-def : InstRW<[LSU, LSU, LSU, LSU, FXU, FXU, Lat20, GroupAlone],
- (instregex "STCKE$")>;
-def : InstRW<[FXU, LSU, Lat9], (instregex "STCKC$")>;
-def : InstRW<[FXU, LSU, Lat8], (instregex "STPT$")>;
+def : InstRW<[WLat30, MCD], (instregex "PTFF$")>;
+def : InstRW<[WLat30, MCD], (instregex "SCK$")>;
+def : InstRW<[WLat30, MCD], (instregex "SCKPF$")>;
+def : InstRW<[WLat30, MCD], (instregex "SCKC$")>;
+def : InstRW<[WLat30, MCD], (instregex "SPT$")>;
+def : InstRW<[WLat9, FXU, LSU2, GroupAlone], (instregex "STCK(F)?$")>;
+def : InstRW<[WLat20, LSU4, FXU2, GroupAlone], (instregex "STCKE$")>;
+def : InstRW<[WLat30, MCD], (instregex "STCKC$")>;
+def : InstRW<[WLat30, MCD], (instregex "STPT$")>;
//===----------------------------------------------------------------------===//
// System: CPU-Related Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat30], (instregex "STAP$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "STIDP$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "STSI$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "STFL(E)?$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "ECAG$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "ECTG$")>;
-def : InstRW<[FXU, Lat30], (instregex "PTF$")>;
-def : InstRW<[FXU, Lat30], (instregex "PCKMO$")>;
+def : InstRW<[WLat30, MCD], (instregex "STAP$")>;
+def : InstRW<[WLat30, MCD], (instregex "STIDP$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "STSI$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "STFL(E)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "ECAG$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "ECTG$")>;
+def : InstRW<[WLat30, MCD], (instregex "PTF$")>;
+def : InstRW<[WLat30, MCD], (instregex "PCKMO$")>;
//===----------------------------------------------------------------------===//
// System: Miscellaneous Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat30], (instregex "SVC$")>;
-def : InstRW<[FXU, GroupAlone], (instregex "MC$")>;
-def : InstRW<[FXU, Lat30], (instregex "DIAG$")>;
-def : InstRW<[FXU], (instregex "TRAC(E|G)$")>;
-def : InstRW<[FXU, Lat30], (instregex "TRAP(2|4)$")>;
-def : InstRW<[FXU, Lat30], (instregex "SIGP$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "SIGA$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "SIE$")>;
+def : InstRW<[WLat30, MCD], (instregex "SVC$")>;
+def : InstRW<[WLat1, FXU, GroupAlone], (instregex "MC$")>;
+def : InstRW<[WLat30, MCD], (instregex "DIAG$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "TRAC(E|G)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TRAP(2|4)$")>;
+def : InstRW<[WLat30, MCD], (instregex "SIG(P|A)$")>;
+def : InstRW<[WLat30, MCD], (instregex "SIE$")>;
//===----------------------------------------------------------------------===//
// System: CPU-Measurement Facility Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU], (instregex "LPP$")>;
-def : InstRW<[FXU, Lat30], (instregex "ECPGA$")>;
-def : InstRW<[FXU, Lat30], (instregex "E(C|P)CTR$")>;
-def : InstRW<[FXU, Lat30], (instregex "LCCTL$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "L(P|S)CTL$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "Q(S|CTR)I$")>;
-def : InstRW<[FXU, Lat30], (instregex "S(C|P)CTR$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LPP$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "ECPGA$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "E(C|P)CTR$")>;
+def : InstRW<[WLat30, MCD], (instregex "LCCTL$")>;
+def : InstRW<[WLat30, MCD], (instregex "L(P|S)CTL$")>;
+def : InstRW<[WLat30, MCD], (instregex "Q(S|CTR)I$")>;
+def : InstRW<[WLat30, MCD], (instregex "S(C|P)CTR$")>;
//===----------------------------------------------------------------------===//
// System: I/O Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat30], (instregex "(C|H|R|X)SCH$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "(M|S|ST|T)SCH$")>;
-def : InstRW<[FXU, Lat30], (instregex "RCHP$")>;
-def : InstRW<[FXU, Lat30], (instregex "SCHM$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "STC(PS|RW)$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "TPI$")>;
-def : InstRW<[FXU, Lat30], (instregex "SAL$")>;
+def : InstRW<[WLat30, MCD], (instregex "(C|H|R|X)SCH$")>;
+def : InstRW<[WLat30, MCD], (instregex "(M|S|ST|T)SCH$")>;
+def : InstRW<[WLat30, MCD], (instregex "RCHP$")>;
+def : InstRW<[WLat30, MCD], (instregex "SCHM$")>;
+def : InstRW<[WLat30, MCD], (instregex "STC(PS|RW)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TPI$")>;
+def : InstRW<[WLat30, MCD], (instregex "SAL$")>;
}
diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
index 657482504045..e0d7bca9a94b 100644
--- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
+++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -145,7 +145,7 @@ SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset(
// deciding whether to use a loop or straight-line code.
static SDValue emitCLC(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
SDValue Src1, SDValue Src2, uint64_t Size) {
- SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
EVT PtrVT = Src1.getValueType();
// A two-CLC sequence is a clear win over a loop, not least because it
// needs only one branch. A three-CLC sequence needs the same number
@@ -167,9 +167,9 @@ static SDValue emitCLC(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
// less than zero if CC == 1 and greater than zero if CC >= 2.
// The sequence starts with IPM, which puts CC into bits 29 and 28
// of an integer and clears bits 30 and 31.
-static SDValue addIPMSequence(const SDLoc &DL, SDValue Glue,
+static SDValue addIPMSequence(const SDLoc &DL, SDValue CCReg,
SelectionDAG &DAG) {
- SDValue IPM = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, Glue);
+ SDValue IPM = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, CCReg);
SDValue SRL = DAG.getNode(ISD::SRL, DL, MVT::i32, IPM,
DAG.getConstant(SystemZ::IPM_CC, DL, MVT::i32));
SDValue ROTL = DAG.getNode(ISD::ROTL, DL, MVT::i32, SRL,
@@ -184,9 +184,9 @@ std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForMemcmp(
if (auto *CSize = dyn_cast<ConstantSDNode>(Size)) {
uint64_t Bytes = CSize->getZExtValue();
assert(Bytes > 0 && "Caller should have handled 0-size case");
- Chain = emitCLC(DAG, DL, Chain, Src1, Src2, Bytes);
- SDValue Glue = Chain.getValue(1);
- return std::make_pair(addIPMSequence(DL, Glue, DAG), Chain);
+ SDValue CCReg = emitCLC(DAG, DL, Chain, Src1, Src2, Bytes);
+ Chain = CCReg.getValue(1);
+ return std::make_pair(addIPMSequence(DL, CCReg, DAG), Chain);
}
return std::make_pair(SDValue(), SDValue());
}
@@ -196,7 +196,7 @@ std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForMemchr(
SDValue Char, SDValue Length, MachinePointerInfo SrcPtrInfo) const {
// Use SRST to find the character. End is its address on success.
EVT PtrVT = Src.getValueType();
- SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other, MVT::Glue);
+ SDVTList VTs = DAG.getVTList(PtrVT, MVT::i32, MVT::Other);
Length = DAG.getZExtOrTrunc(Length, DL, PtrVT);
Char = DAG.getZExtOrTrunc(Char, DL, MVT::i32);
Char = DAG.getNode(ISD::AND, DL, MVT::i32, Char,
@@ -204,17 +204,16 @@ std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForMemchr(
SDValue Limit = DAG.getNode(ISD::ADD, DL, PtrVT, Src, Length);
SDValue End = DAG.getNode(SystemZISD::SEARCH_STRING, DL, VTs, Chain,
Limit, Src, Char);
- Chain = End.getValue(1);
- SDValue Glue = End.getValue(2);
+ SDValue CCReg = End.getValue(1);
+ Chain = End.getValue(2);
// Now select between End and null, depending on whether the character
// was found.
SDValue Ops[] = {End, DAG.getConstant(0, DL, PtrVT),
DAG.getConstant(SystemZ::CCMASK_SRST, DL, MVT::i32),
DAG.getConstant(SystemZ::CCMASK_SRST_FOUND, DL, MVT::i32),
- Glue};
- VTs = DAG.getVTList(PtrVT, MVT::Glue);
- End = DAG.getNode(SystemZISD::SELECT_CCMASK, DL, VTs, Ops);
+ CCReg};
+ End = DAG.getNode(SystemZISD::SELECT_CCMASK, DL, PtrVT, Ops);
return std::make_pair(End, Chain);
}
@@ -232,12 +231,12 @@ std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForStrcmp(
SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Src1,
SDValue Src2, MachinePointerInfo Op1PtrInfo,
MachinePointerInfo Op2PtrInfo) const {
- SDVTList VTs = DAG.getVTList(Src1.getValueType(), MVT::Other, MVT::Glue);
+ SDVTList VTs = DAG.getVTList(Src1.getValueType(), MVT::i32, MVT::Other);
SDValue Unused = DAG.getNode(SystemZISD::STRCMP, DL, VTs, Chain, Src1, Src2,
DAG.getConstant(0, DL, MVT::i32));
- Chain = Unused.getValue(1);
- SDValue Glue = Chain.getValue(2);
- return std::make_pair(addIPMSequence(DL, Glue, DAG), Chain);
+ SDValue CCReg = Unused.getValue(1);
+ Chain = Unused.getValue(2);
+ return std::make_pair(addIPMSequence(DL, CCReg, DAG), Chain);
}
// Search from Src for a null character, stopping once Src reaches Limit.
@@ -250,10 +249,10 @@ static std::pair<SDValue, SDValue> getBoundedStrlen(SelectionDAG &DAG,
SDValue Chain, SDValue Src,
SDValue Limit) {
EVT PtrVT = Src.getValueType();
- SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other, MVT::Glue);
+ SDVTList VTs = DAG.getVTList(PtrVT, MVT::i32, MVT::Other);
SDValue End = DAG.getNode(SystemZISD::SEARCH_STRING, DL, VTs, Chain,
Limit, Src, DAG.getConstant(0, DL, MVT::i32));
- Chain = End.getValue(1);
+ Chain = End.getValue(2);
SDValue Len = DAG.getNode(ISD::SUB, DL, PtrVT, End, Src);
return std::make_pair(Len, Chain);
}
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp
index 3a167a6d452a..f3620dcf3b92 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -18,12 +18,12 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Transforms/Scalar.h"
#include <string>
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 37c55c4e3889..e2a3efda5c5e 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -737,7 +737,7 @@ int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondT
unsigned PredicateExtraCost = 0;
if (I != nullptr) {
// Some predicates cost one or two extra instructions.
- switch (dyn_cast<CmpInst>(I)->getPredicate()) {
+ switch (cast<CmpInst>(I)->getPredicate()) {
case CmpInst::Predicate::ICMP_NE:
case CmpInst::Predicate::ICMP_UGE:
case CmpInst::Predicate::ICMP_ULE:
diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp
index 72baf5985eac..907ecf46e8ff 100644
--- a/lib/Target/TargetLoweringObjectFile.cpp
+++ b/lib/Target/TargetLoweringObjectFile.cpp
@@ -12,9 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/BinaryFormat/Dwarf.h"
-#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
@@ -52,11 +51,24 @@ TargetLoweringObjectFile::~TargetLoweringObjectFile() {
delete Mang;
}
-static bool isSuitableForBSS(const GlobalVariable *GV, bool NoZerosInBSS) {
+static bool isNullOrUndef(const Constant *C) {
+ // Return true if the constant consists entirely of zeros and/or undefs,
+ // recursing into aggregate constants.
+ if (C->isNullValue() || isa<UndefValue>(C))
+ return true;
+ if (!isa<ConstantAggregate>(C))
+ return false;
+ for (auto Operand : C->operand_values()) {
+ if (!isNullOrUndef(cast<Constant>(Operand)))
+ return false;
+ }
+ return true;
+}
+
+static bool isSuitableForBSS(const GlobalVariable *GV) {
const Constant *C = GV->getInitializer();
// Must have zero initializer.
- if (!C->isNullValue())
+ if (!isNullOrUndef(C))
return false;
// Leave constant zeros in readonly constant sections, so they can be shared.
@@ -67,10 +79,6 @@ static bool isSuitableForBSS(const GlobalVariable *GV, bool NoZerosInBSS) {
if (GV->hasSection())
return false;
- // If -nozero-initialized-in-bss is specified, don't ever use BSS.
- if (NoZerosInBSS)
- return false;
-
// Otherwise, put it in BSS!
return true;
}
@@ -126,25 +134,24 @@ void TargetLoweringObjectFile::emitPersonalityValue(MCStreamer &Streamer,
/// getKindForGlobal - This is a top-level target-independent classifier for
-/// a global variable. Given an global variable and information from TM, it
-/// classifies the global in a variety of ways that make various target
-/// implementations simpler. The target implementation is free to ignore this
-/// extra info of course.
+/// a global object. Given a global variable and information from the TM, this
+/// function classifies the global in a target-independent manner. This function
+/// may be overridden by the target implementation.
SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalObject *GO,
const TargetMachine &TM){
assert(!GO->isDeclaration() && !GO->hasAvailableExternallyLinkage() &&
"Can only be used for global definitions");
- Reloc::Model ReloModel = TM.getRelocationModel();
-
- // Early exit - functions should be always in text sections.
- const auto *GVar = dyn_cast<GlobalVariable>(GO);
- if (!GVar)
+ // Functions are classified as text sections.
+ if (isa<Function>(GO))
return SectionKind::getText();
+ // Global variables require more detailed analysis.
+ const auto *GVar = cast<GlobalVariable>(GO);
+
// Handle thread-local data first.
if (GVar->isThreadLocal()) {
- if (isSuitableForBSS(GVar, TM.Options.NoZerosInBSS))
+ if (isSuitableForBSS(GVar) && !TM.Options.NoZerosInBSS)
return SectionKind::getThreadBSS();
return SectionKind::getThreadData();
}
@@ -153,8 +160,9 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalObject *GO,
if (GVar->hasCommonLinkage())
return SectionKind::getCommon();
- // Variable can be easily put to BSS section.
- if (isSuitableForBSS(GVar, TM.Options.NoZerosInBSS)) {
+ // Most non-mergeable zero data can be put in the BSS section unless otherwise
+ // specified.
+ if (isSuitableForBSS(GVar) && !TM.Options.NoZerosInBSS) {
if (GVar->hasLocalLinkage())
return SectionKind::getBSSLocal();
else if (GVar->hasExternalLinkage())
@@ -162,14 +170,13 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalObject *GO,
return SectionKind::getBSS();
}
- const Constant *C = GVar->getInitializer();
-
// If the global is marked constant, we can put it into a mergable section,
// a mergable string section, or general .data if it contains relocations.
if (GVar->isConstant()) {
// If the initializer for the global contains something that requires a
// relocation, then we may have to drop this into a writable data section
// even though it is marked const.
+ const Constant *C = GVar->getInitializer();
if (!C->needsRelocation()) {
// If the global is required to have a unique address, it can't be put
// into a mergable section: just drop it into the general read-only
@@ -215,6 +222,7 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalObject *GO,
// the time the app starts up. However, we can't put this into a
// mergable section, because the linker doesn't take relocations into
// consideration when it tries to merge entries in the section.
+ Reloc::Model ReloModel = TM.getRelocationModel();
if (ReloModel == Reloc::Static || ReloModel == Reloc::ROPI ||
ReloModel == Reloc::RWPI || ReloModel == Reloc::ROPI_RWPI)
return SectionKind::getReadOnly();
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
index c4c0dd22ee0c..092f5ea4104b 100644
--- a/lib/Target/TargetMachine.cpp
+++ b/lib/Target/TargetMachine.cpp
@@ -13,8 +13,6 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
-#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
@@ -27,6 +25,7 @@
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/MC/SectionKind.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
using namespace llvm;
//---------------------------------------------------------------------------
@@ -52,7 +51,7 @@ bool TargetMachine::isPositionIndependent() const {
return getRelocationModel() == Reloc::PIC_;
}
-/// \brief Reset the target options based on the function's attributes.
+/// Reset the target options based on the function's attributes.
// FIXME: This function needs to go away for a number of reasons:
// a) global state on the TargetMachine is terrible in general,
// b) these target options should be passed only on the function
@@ -116,12 +115,24 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
if (GV && GV->isDSOLocal())
return true;
- // According to the llvm language reference, we should be able to just return
- // false in here if we have a GV, as we know it is dso_preemptable.
- // At this point in time, the various IR producers have not been transitioned
- // to always produce a dso_local when it is possible to do so. As a result we
- // still have some pre-dso_local logic in here to improve the quality of the
- // generated code:
+ // If we are not supposed to use a PLT, we cannot assume that intrinsics are
+ // local, since the linker can convert some direct accesses into accesses via
+ // the PLT.
+ if (M.getRtLibUseGOT() && !GV)
+ return false;
+
+ // According to the LLVM language reference, we should be able to
+ // just return false here if we have a GV, as we know it is
+ // dso_preemptable. At this point in time, the various IR producers
+ // have not been transitioned to always produce a dso_local when it
+ // is possible to do so.
+ // In the case of intrinsics, GV is null and there is nowhere to put
+ // dso_local. Returning false for those will produce worse code on some
+ // architectures. For example, on x86 the caller has to set ebx before
+ // calling through the PLT.
+ // As a result, we still have some logic here to improve the quality of the
+ // generated code.
+ // FIXME: Add module-level metadata for whether intrinsics should be assumed
+ // local.
Reloc::Model RM = getRelocationModel();
const Triple &TT = getTargetTriple();
@@ -131,7 +142,7 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
return false;
// Every other GV is local on COFF.
- // Make an exception for windows OS in the triple: Some firmwares builds use
+ // Make an exception for Windows OS in the triple: some firmware builds use
// *-win32-macho triples. This (accidentally?) produced windows relocations
// without GOT tables in older clang versions; Keep this behaviour.
if (TT.isOSBinFormatCOFF() || (TT.isOSWindows() && TT.isOSBinFormatMachO()))
@@ -141,12 +152,10 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
// produce a 0 if it turns out the symbol is undefined. While this
// is ABI and relocation depended, it seems worth it to handle it
// here.
- // FIXME: this is probably not ELF specific.
- if (GV && isPositionIndependent() && TT.isOSBinFormatELF() &&
- GV->hasExternalWeakLinkage())
+ if (GV && isPositionIndependent() && GV->hasExternalWeakLinkage())
return false;
- if (GV && (GV->hasLocalLinkage() || !GV->hasDefaultVisibility()))
+ if (GV && !GV->hasDefaultVisibility())
return true;
if (TT.isOSBinFormatMachO()) {
@@ -174,7 +183,7 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
bool IsTLS = GV && GV->isThreadLocal();
bool IsAccessViaCopyRelocs =
- Options.MCOptions.MCPIECopyRelocations && GV && isa<GlobalVariable>(GV);
+ GV && Options.MCOptions.MCPIECopyRelocations && isa<GlobalVariable>(GV);
Triple::ArchType Arch = TT.getArch();
bool IsPPC =
Arch == Triple::ppc || Arch == Triple::ppc64 || Arch == Triple::ppc64le;
@@ -187,6 +196,14 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
return false;
}
+bool TargetMachine::useEmulatedTLS() const {
+ // Returns Options.EmulatedTLS if -emulated-tls or -no-emulated-tls was
+ // specified explicitly; otherwise uses the target triple to decide the default.
+ if (Options.ExplicitEmulatedTLS)
+ return Options.EmulatedTLS;
+ return getTargetTriple().hasDefaultEmulatedTLS();
+}
+
TLSModel::Model TargetMachine::getTLSModel(const GlobalValue *GV) const {
bool IsPIE = GV->getParent()->getPIELevel() != PIELevel::Default;
Reloc::Model RM = getRelocationModel();
diff --git a/lib/Target/TargetMachineC.cpp b/lib/Target/TargetMachineC.cpp
index 74fe7c5d3cde..37d398d580f8 100644
--- a/lib/Target/TargetMachineC.cpp
+++ b/lib/Target/TargetMachineC.cpp
@@ -18,12 +18,13 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
-#include "llvm/Support/CodeGenCWrappers.h"
+#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/Host.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/CodeGenCWrappers.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
#include <cstdlib>
@@ -195,7 +196,7 @@ static LLVMBool LLVMTargetMachineEmit(LLVMTargetMachineRef T, LLVMModuleRef M,
ft = TargetMachine::CGFT_ObjectFile;
break;
}
- if (TM->addPassesToEmitFile(pass, OS, ft)) {
+ if (TM->addPassesToEmitFile(pass, OS, nullptr, ft)) {
error = "TargetMachine can't emit a file of this type";
*ErrorMessage = strdup(error.c_str());
return true;
@@ -237,6 +238,25 @@ char *LLVMGetDefaultTargetTriple(void) {
return strdup(sys::getDefaultTargetTriple().c_str());
}
+char *LLVMNormalizeTargetTriple(const char* triple) {
+ return strdup(Triple::normalize(StringRef(triple)).c_str());
+}
+
+char *LLVMGetHostCPUName(void) {
+ return strdup(sys::getHostCPUName().data());
+}
+
+char *LLVMGetHostCPUFeatures(void) {
+ SubtargetFeatures Features;
+ StringMap<bool> HostFeatures;
+
+ if (sys::getHostCPUFeatures(HostFeatures))
+ for (auto &F : HostFeatures)
+ Features.AddFeature(F.first(), F.second);
+
+ return strdup(Features.getString().c_str());
+}
+
void LLVMAddAnalysisPasses(LLVMTargetMachineRef T, LLVMPassManagerRef PM) {
unwrap(PM)->add(
createTargetTransformInfoWrapperPass(unwrap(T)->getTargetIRAnalysis()));
diff --git a/lib/Target/WebAssembly/AsmParser/CMakeLists.txt b/lib/Target/WebAssembly/AsmParser/CMakeLists.txt
new file mode 100644
index 000000000000..bd4741d13984
--- /dev/null
+++ b/lib/Target/WebAssembly/AsmParser/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_llvm_library(LLVMWebAssemblyAsmParser
+ WebAssemblyAsmParser.cpp
+ )
diff --git a/lib/Target/WebAssembly/AsmParser/LLVMBuild.txt b/lib/Target/WebAssembly/AsmParser/LLVMBuild.txt
new file mode 100644
index 000000000000..4c0652617bb8
--- /dev/null
+++ b/lib/Target/WebAssembly/AsmParser/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===-- ./lib/Target/WebAssembly/AsmParser/LLVMBuild.txt --------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = WebAssemblyAsmParser
+parent = WebAssembly
+required_libraries = MC MCParser WebAssemblyInfo Support
+add_to_library_groups = WebAssembly
diff --git a/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
new file mode 100644
index 000000000000..2d92b93ca704
--- /dev/null
+++ b/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
@@ -0,0 +1,561 @@
+//===- WebAssemblyAsmParser.cpp - Assembler for WebAssembly ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file is part of the WebAssembly Assembler.
+///
+/// It contains code to translate a parsed .s file into MCInsts.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "MCTargetDesc/WebAssemblyTargetStreamer.h"
+#include "WebAssembly.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-asm-parser"
+
+namespace {
+
+// We store register types as SimpleValueType to retain SIMD layout
+// information, but must also be able to supply them as the (unnamed)
+// register enum from WebAssemblyRegisterInfo.td/.inc.
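+// For example (illustrative), all 128-bit vector layouts map to the same
+// V128_0 placeholder register below; only the SimpleValueType keeps the
+// layout distinction.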
+static unsigned MVTToWasmReg(MVT::SimpleValueType Type) {
+ switch(Type) {
+ case MVT::i32: return WebAssembly::I32_0;
+ case MVT::i64: return WebAssembly::I64_0;
+ case MVT::f32: return WebAssembly::F32_0;
+ case MVT::f64: return WebAssembly::F64_0;
+ case MVT::v16i8: return WebAssembly::V128_0;
+ case MVT::v8i16: return WebAssembly::V128_0;
+ case MVT::v4i32: return WebAssembly::V128_0;
+ case MVT::v4f32: return WebAssembly::V128_0;
+ default: return MVT::INVALID_SIMPLE_VALUE_TYPE;
+ }
+}
+
+/// WebAssemblyOperand - Instances of this class represent the operands in a
+/// parsed WASM machine instruction.
+struct WebAssemblyOperand : public MCParsedAsmOperand {
+ enum KindTy { Token, Local, Stack, Integer, Float, Symbol } Kind;
+
+ SMLoc StartLoc, EndLoc;
+
+ struct TokOp {
+ StringRef Tok;
+ };
+
+ struct RegOp {
+    // This is a (virtual) local or stack register, numbered from 0 upwards.
+ unsigned RegNo;
+ // In most targets, the register number also encodes the type, but for
+    // wasm we have to track that separately since we have an unbounded
+    // number of registers.
+    // This has the unfortunate side effect that we supply a different value
+    // to the table-gen matcher at different times in the process (when it
+    // calls getReg() or addRegOperands()).
+    // TODO: While this works, it feels brittle; it would be nice to clean up.
+ MVT::SimpleValueType Type;
+ };
+
+ struct IntOp {
+ int64_t Val;
+ };
+
+ struct FltOp {
+ double Val;
+ };
+
+ struct SymOp {
+ const MCExpr *Exp;
+ };
+
+ union {
+ struct TokOp Tok;
+ struct RegOp Reg;
+ struct IntOp Int;
+ struct FltOp Flt;
+ struct SymOp Sym;
+ };
+
+ WebAssemblyOperand(KindTy K, SMLoc Start, SMLoc End, TokOp T)
+ : Kind(K), StartLoc(Start), EndLoc(End), Tok(T) {}
+ WebAssemblyOperand(KindTy K, SMLoc Start, SMLoc End, RegOp R)
+ : Kind(K), StartLoc(Start), EndLoc(End), Reg(R) {}
+ WebAssemblyOperand(KindTy K, SMLoc Start, SMLoc End, IntOp I)
+ : Kind(K), StartLoc(Start), EndLoc(End), Int(I) {}
+ WebAssemblyOperand(KindTy K, SMLoc Start, SMLoc End, FltOp F)
+ : Kind(K), StartLoc(Start), EndLoc(End), Flt(F) {}
+ WebAssemblyOperand(KindTy K, SMLoc Start, SMLoc End, SymOp S)
+ : Kind(K), StartLoc(Start), EndLoc(End), Sym(S) {}
+
+ bool isToken() const override { return Kind == Token; }
+ bool isImm() const override { return Kind == Integer ||
+ Kind == Float ||
+ Kind == Symbol; }
+ bool isReg() const override { return Kind == Local || Kind == Stack; }
+ bool isMem() const override { return false; }
+
+ unsigned getReg() const override {
+ assert(isReg());
+ // This is called from the tablegen matcher (MatchInstructionImpl)
+ // where it expects to match the type of register, see RegOp above.
+ return MVTToWasmReg(Reg.Type);
+ }
+
+ StringRef getToken() const {
+ assert(isToken());
+ return Tok.Tok;
+ }
+
+ SMLoc getStartLoc() const override { return StartLoc; }
+ SMLoc getEndLoc() const override { return EndLoc; }
+
+ void addRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ assert(isReg() && "Not a register operand!");
+ // This is called from the tablegen matcher (MatchInstructionImpl)
+ // where it expects to output the actual register index, see RegOp above.
+ unsigned R = Reg.RegNo;
+ if (Kind == Stack) {
+ // A stack register is represented as a large negative number.
+ // See WebAssemblyRegNumbering::runOnMachineFunction and
+ // getWARegStackId for why this | is needed.
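+      // For example (illustrative), stack register 3 becomes 0x80000003 here,
+      // i.e. 3 | INT32_MIN.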
+ R |= INT32_MIN;
+ }
+ Inst.addOperand(MCOperand::createReg(R));
+ }
+
+ void addImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ if (Kind == Integer)
+ Inst.addOperand(MCOperand::createImm(Int.Val));
+ else if (Kind == Float)
+ Inst.addOperand(MCOperand::createFPImm(Flt.Val));
+ else if (Kind == Symbol)
+ Inst.addOperand(MCOperand::createExpr(Sym.Exp));
+ else
+ llvm_unreachable("Should be immediate or symbol!");
+ }
+
+ void print(raw_ostream &OS) const override {
+ switch (Kind) {
+ case Token:
+ OS << "Tok:" << Tok.Tok;
+ break;
+ case Local:
+ OS << "Loc:" << Reg.RegNo << ":" << static_cast<int>(Reg.Type);
+ break;
+ case Stack:
+ OS << "Stk:" << Reg.RegNo << ":" << static_cast<int>(Reg.Type);
+ break;
+ case Integer:
+ OS << "Int:" << Int.Val;
+ break;
+ case Float:
+ OS << "Flt:" << Flt.Val;
+ break;
+ case Symbol:
+ OS << "Sym:" << Sym.Exp;
+ break;
+ }
+ }
+};
+
+class WebAssemblyAsmParser final : public MCTargetAsmParser {
+ MCAsmParser &Parser;
+ MCAsmLexer &Lexer;
+ // These are for the current function being parsed:
+ // These are vectors since register assignments are so far non-sparse.
+  // Replace with a map if this ever becomes necessary.
+ std::vector<MVT::SimpleValueType> LocalTypes;
+ std::vector<MVT::SimpleValueType> StackTypes;
+ MCSymbol *LastLabel;
+
+public:
+ WebAssemblyAsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser,
+ const MCInstrInfo &mii, const MCTargetOptions &Options)
+ : MCTargetAsmParser(Options, sti, mii), Parser(Parser),
+ Lexer(Parser.getLexer()), LastLabel(nullptr) {
+ }
+
+#define GET_ASSEMBLER_HEADER
+#include "WebAssemblyGenAsmMatcher.inc"
+
+ // TODO: This is required to be implemented, but appears unused.
+ bool ParseRegister(unsigned &/*RegNo*/, SMLoc &/*StartLoc*/,
+ SMLoc &/*EndLoc*/) override {
+ llvm_unreachable("ParseRegister is not implemented.");
+ }
+
+ bool Error(const StringRef &msg, const AsmToken &tok) {
+ return Parser.Error(tok.getLoc(), msg + tok.getString());
+ }
+
+ bool IsNext(AsmToken::TokenKind Kind) {
+ auto ok = Lexer.is(Kind);
+ if (ok) Parser.Lex();
+ return ok;
+ }
+
+ bool Expect(AsmToken::TokenKind Kind, const char *KindName) {
+ if (!IsNext(Kind))
+ return Error(std::string("Expected ") + KindName + ", instead got: ",
+ Lexer.getTok());
+ return false;
+ }
+
+ MVT::SimpleValueType ParseRegType(const StringRef &RegType) {
+ // Derive type from .param .local decls, or the instruction itself.
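+    // For example (illustrative), the "i32" prefix of "i32.add", or a type
+    // listed in a .param/.local directive.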
+ return StringSwitch<MVT::SimpleValueType>(RegType)
+ .Case("i32", MVT::i32)
+ .Case("i64", MVT::i64)
+ .Case("f32", MVT::f32)
+ .Case("f64", MVT::f64)
+ .Case("i8x16", MVT::v16i8)
+ .Case("i16x8", MVT::v8i16)
+ .Case("i32x4", MVT::v4i32)
+ .Case("f32x4", MVT::v4f32)
+ .Default(MVT::INVALID_SIMPLE_VALUE_TYPE);
+ }
+
+ MVT::SimpleValueType &GetType(
+ std::vector<MVT::SimpleValueType> &Types, size_t i) {
+ Types.resize(std::max(i + 1, Types.size()), MVT::INVALID_SIMPLE_VALUE_TYPE);
+ return Types[i];
+ }
+
+ bool ParseReg(OperandVector &Operands, StringRef TypePrefix) {
+ if (Lexer.is(AsmToken::Integer)) {
+ auto &Local = Lexer.getTok();
+ // This is a reference to a local, turn it into a virtual register.
+ auto LocalNo = static_cast<unsigned>(Local.getIntVal());
+ Operands.push_back(make_unique<WebAssemblyOperand>(
+ WebAssemblyOperand::Local, Local.getLoc(),
+ Local.getEndLoc(),
+ WebAssemblyOperand::RegOp{LocalNo,
+ GetType(LocalTypes, LocalNo)}));
+ Parser.Lex();
+ } else if (Lexer.is(AsmToken::Identifier)) {
+ auto &StackRegTok = Lexer.getTok();
+ // These are push/pop/drop pseudo stack registers, which we turn
+ // into virtual registers also. The stackify pass will later turn them
+ // back into implicit stack references if possible.
+ auto StackReg = StackRegTok.getString();
+ auto StackOp = StackReg.take_while([](char c) { return isalpha(c); });
+ auto Reg = StackReg.drop_front(StackOp.size());
+ unsigned long long ParsedRegNo = 0;
+ if (!Reg.empty() && getAsUnsignedInteger(Reg, 10, ParsedRegNo))
+ return Error("Cannot parse stack register index: ", StackRegTok);
+ unsigned RegNo = static_cast<unsigned>(ParsedRegNo);
+ if (StackOp == "push") {
+ // This defines a result, record register type.
+ auto RegType = ParseRegType(TypePrefix);
+ GetType(StackTypes, RegNo) = RegType;
+ Operands.push_back(make_unique<WebAssemblyOperand>(
+ WebAssemblyOperand::Stack,
+ StackRegTok.getLoc(),
+ StackRegTok.getEndLoc(),
+ WebAssemblyOperand::RegOp{RegNo, RegType}));
+ } else if (StackOp == "pop") {
+ // This uses a previously defined stack value.
+ auto RegType = GetType(StackTypes, RegNo);
+ Operands.push_back(make_unique<WebAssemblyOperand>(
+ WebAssemblyOperand::Stack,
+ StackRegTok.getLoc(),
+ StackRegTok.getEndLoc(),
+ WebAssemblyOperand::RegOp{RegNo, RegType}));
+ } else if (StackOp == "drop") {
+ // This operand will be dropped, since it is part of an instruction
+ // whose result is void.
+ } else {
+ return Error("Unknown stack register prefix: ", StackRegTok);
+ }
+ Parser.Lex();
+ } else {
+ return Error(
+ "Expected identifier/integer following $, instead got: ",
+ Lexer.getTok());
+ }
+ IsNext(AsmToken::Equal);
+ return false;
+ }
+
+ void ParseSingleInteger(bool IsNegative, OperandVector &Operands) {
+ auto &Int = Lexer.getTok();
+ int64_t Val = Int.getIntVal();
+ if (IsNegative) Val = -Val;
+ Operands.push_back(make_unique<WebAssemblyOperand>(
+ WebAssemblyOperand::Integer, Int.getLoc(),
+ Int.getEndLoc(), WebAssemblyOperand::IntOp{Val}));
+ Parser.Lex();
+ }
+
+ bool ParseOperandStartingWithInteger(bool IsNegative,
+ OperandVector &Operands,
+ StringRef InstType) {
+ ParseSingleInteger(IsNegative, Operands);
+ if (Lexer.is(AsmToken::LParen)) {
+ // Parse load/store operands of the form: offset($reg)align
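+      // For example (illustrative), "16($pop1)2" yields an offset of 16, stack
+      // register 1, and an explicit alignment operand of 2.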
+ auto &LParen = Lexer.getTok();
+ Operands.push_back(
+ make_unique<WebAssemblyOperand>(WebAssemblyOperand::Token,
+ LParen.getLoc(),
+ LParen.getEndLoc(),
+ WebAssemblyOperand::TokOp{
+ LParen.getString()}));
+ Parser.Lex();
+ if (Expect(AsmToken::Dollar, "register")) return true;
+ if (ParseReg(Operands, InstType)) return true;
+ auto &RParen = Lexer.getTok();
+ Operands.push_back(
+ make_unique<WebAssemblyOperand>(WebAssemblyOperand::Token,
+ RParen.getLoc(),
+ RParen.getEndLoc(),
+ WebAssemblyOperand::TokOp{
+ RParen.getString()}));
+ if (Expect(AsmToken::RParen, ")")) return true;
+ if (Lexer.is(AsmToken::Integer)) {
+ ParseSingleInteger(false, Operands);
+ } else {
+ // Alignment not specified.
+ // FIXME: correctly derive a default from the instruction.
+ Operands.push_back(make_unique<WebAssemblyOperand>(
+ WebAssemblyOperand::Integer, RParen.getLoc(),
+ RParen.getEndLoc(), WebAssemblyOperand::IntOp{0}));
+ }
+ }
+ return false;
+ }
+
+ bool ParseInstruction(ParseInstructionInfo &/*Info*/, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) override {
+ Operands.push_back(
+ make_unique<WebAssemblyOperand>(WebAssemblyOperand::Token, NameLoc,
+ SMLoc::getFromPointer(
+ NameLoc.getPointer() + Name.size()),
+ WebAssemblyOperand::TokOp{
+ StringRef(NameLoc.getPointer(),
+ Name.size())}));
+ auto NamePair = Name.split('.');
+ // If no '.', there is no type prefix.
+ if (NamePair.second.empty()) std::swap(NamePair.first, NamePair.second);
+ while (Lexer.isNot(AsmToken::EndOfStatement)) {
+ auto &Tok = Lexer.getTok();
+ switch (Tok.getKind()) {
+ case AsmToken::Dollar: {
+ Parser.Lex();
+ if (ParseReg(Operands, NamePair.first)) return true;
+ break;
+ }
+ case AsmToken::Identifier: {
+ auto &Id = Lexer.getTok();
+ const MCExpr *Val;
+ SMLoc End;
+ if (Parser.parsePrimaryExpr(Val, End))
+ return Error("Cannot parse symbol: ", Lexer.getTok());
+ Operands.push_back(make_unique<WebAssemblyOperand>(
+ WebAssemblyOperand::Symbol, Id.getLoc(),
+ Id.getEndLoc(), WebAssemblyOperand::SymOp{Val}));
+ break;
+ }
+ case AsmToken::Minus:
+ Parser.Lex();
+ if (Lexer.isNot(AsmToken::Integer))
+ return Error("Expected integer instead got: ", Lexer.getTok());
+ if (ParseOperandStartingWithInteger(true, Operands, NamePair.first))
+ return true;
+ break;
+ case AsmToken::Integer:
+ if (ParseOperandStartingWithInteger(false, Operands, NamePair.first))
+ return true;
+ break;
+ case AsmToken::Real: {
+ double Val;
+ if (Tok.getString().getAsDouble(Val, false))
+ return Error("Cannot parse real: ", Tok);
+ Operands.push_back(make_unique<WebAssemblyOperand>(
+ WebAssemblyOperand::Float, Tok.getLoc(),
+ Tok.getEndLoc(), WebAssemblyOperand::FltOp{Val}));
+ Parser.Lex();
+ break;
+ }
+ default:
+ return Error("Unexpected token in operand: ", Tok);
+ }
+ if (Lexer.isNot(AsmToken::EndOfStatement)) {
+ if (Expect(AsmToken::Comma, ",")) return true;
+ }
+ }
+ Parser.Lex();
+ // Call instructions are vararg, but the tablegen matcher doesn't seem to
+ // support that, so for now we strip these extra operands.
+ // This is problematic if these arguments are not simple $pop stack
+ // registers, since e.g. a local register would get lost, so we check for
+ // this. This can be the case when using -disable-wasm-explicit-locals
+ // which currently s2wasm requires.
+ // TODO: Instead, we can move this code to MatchAndEmitInstruction below and
+ // actually generate get_local instructions on the fly.
+ // Or even better, improve the matcher to support vararg?
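+    // For example (illustrative), "call foo, $pop0, $pop1" keeps only the name
+    // token and the callee symbol here and drops the two $pop arguments.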
+ auto IsIndirect = NamePair.second == "call_indirect";
+ if (IsIndirect || NamePair.second == "call") {
+ // Figure out number of fixed operands from the instruction.
+ size_t CallOperands = 1; // The name token.
+ if (!IsIndirect) CallOperands++; // The function index.
+ if (!NamePair.first.empty()) CallOperands++; // The result register.
+ if (Operands.size() > CallOperands) {
+ // Ensure operands we drop are all $pop.
+ for (size_t I = CallOperands; I < Operands.size(); I++) {
+ auto Operand =
+ reinterpret_cast<WebAssemblyOperand *>(Operands[I].get());
+ if (Operand->Kind != WebAssemblyOperand::Stack)
+ Parser.Error(NameLoc,
+ "Call instruction has non-stack arguments, if this code was "
+ "generated with -disable-wasm-explicit-locals please remove it");
+ }
+ // Drop unneeded operands.
+ Operands.resize(CallOperands);
+ }
+ }
+ // Block instructions require a signature index, but these are missing in
+ // assembly, so we add a dummy one explicitly (since we have no control
+ // over signature tables here, we assume these will be regenerated when
+ // the wasm module is generated).
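+    // For example (illustrative), a bare "block" line gets an extra integer
+    // operand of -1 appended below as a placeholder signature.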
+ if (NamePair.second == "block" || NamePair.second == "loop") {
+ Operands.push_back(make_unique<WebAssemblyOperand>(
+ WebAssemblyOperand::Integer, NameLoc,
+ NameLoc, WebAssemblyOperand::IntOp{-1}));
+ }
+    // These don't specify the type, which must be derived from the local index.
+ if (NamePair.second == "get_local" || NamePair.second == "tee_local") {
+ if (Operands.size() >= 3 && Operands[1]->isReg() &&
+ Operands[2]->isImm()) {
+ auto Op1 = reinterpret_cast<WebAssemblyOperand *>(Operands[1].get());
+ auto Op2 = reinterpret_cast<WebAssemblyOperand *>(Operands[2].get());
+ auto Type = GetType(LocalTypes, static_cast<size_t>(Op2->Int.Val));
+ Op1->Reg.Type = Type;
+ GetType(StackTypes, Op1->Reg.RegNo) = Type;
+ }
+ }
+ return false;
+ }
+
+ void onLabelParsed(MCSymbol *Symbol) override {
+ LastLabel = Symbol;
+ }
+
+ bool ParseDirective(AsmToken DirectiveID) override {
+ assert(DirectiveID.getKind() == AsmToken::Identifier);
+ auto &Out = getStreamer();
+ auto &TOut = reinterpret_cast<WebAssemblyTargetStreamer &>(
+ *Out.getTargetStreamer());
+ // TODO: we're just parsing the subset of directives we're interested in,
+    // and ignoring ones we don't recognize. We should ideally verify
+ // all directives here.
+ if (DirectiveID.getString() == ".type") {
+ // This could be the start of a function, check if followed by
+ // "label,@function"
+ if (!(IsNext(AsmToken::Identifier) &&
+ IsNext(AsmToken::Comma) &&
+ IsNext(AsmToken::At) &&
+ Lexer.is(AsmToken::Identifier)))
+ return Error("Expected label,@type declaration, got: ", Lexer.getTok());
+ if (Lexer.getTok().getString() == "function") {
+ // Track locals from start of function.
+ LocalTypes.clear();
+ StackTypes.clear();
+ }
+ Parser.Lex();
+ //Out.EmitSymbolAttribute(??, MCSA_ELF_TypeFunction);
+ } else if (DirectiveID.getString() == ".param" ||
+ DirectiveID.getString() == ".local") {
+ // Track the number of locals, needed for correct virtual register
+ // assignment elsewhere.
+ // Also output a directive to the streamer.
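+      // For example (illustrative), ".param i32, i64" records both types in
+      // the local-type table and also forwards them to emitParam below.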
+ std::vector<MVT> Params;
+ std::vector<MVT> Locals;
+ while (Lexer.is(AsmToken::Identifier)) {
+ auto RegType = ParseRegType(Lexer.getTok().getString());
+ if (RegType == MVT::INVALID_SIMPLE_VALUE_TYPE) return true;
+ LocalTypes.push_back(RegType);
+ if (DirectiveID.getString() == ".param") {
+ Params.push_back(RegType);
+ } else {
+ Locals.push_back(RegType);
+ }
+ Parser.Lex();
+ if (!IsNext(AsmToken::Comma)) break;
+ }
+ assert(LastLabel);
+ TOut.emitParam(LastLabel, Params);
+ TOut.emitLocal(Locals);
+ } else {
+      // For now, ignore any directive we don't recognize:
+ while (Lexer.isNot(AsmToken::EndOfStatement)) Parser.Lex();
+ }
+ return Expect(AsmToken::EndOfStatement, "EOL");
+ }
+
+ bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &/*Opcode*/,
+ OperandVector &Operands,
+ MCStreamer &Out, uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) override {
+ MCInst Inst;
+ unsigned MatchResult =
+ MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm);
+ switch (MatchResult) {
+ case Match_Success: {
+ Out.EmitInstruction(Inst, getSTI());
+ return false;
+ }
+ case Match_MissingFeature:
+ return Parser.Error(IDLoc,
+ "instruction requires a WASM feature not currently enabled");
+ case Match_MnemonicFail:
+ return Parser.Error(IDLoc, "invalid instruction");
+ case Match_NearMisses:
+ return Parser.Error(IDLoc, "ambiguous instruction");
+ case Match_InvalidTiedOperand:
+ case Match_InvalidOperand: {
+ SMLoc ErrorLoc = IDLoc;
+ if (ErrorInfo != ~0ULL) {
+ if (ErrorInfo >= Operands.size())
+ return Parser.Error(IDLoc, "too few operands for instruction");
+ ErrorLoc = Operands[ErrorInfo]->getStartLoc();
+ if (ErrorLoc == SMLoc())
+ ErrorLoc = IDLoc;
+ }
+ return Parser.Error(ErrorLoc, "invalid operand for instruction");
+ }
+ }
+ llvm_unreachable("Implement any new match types added!");
+ }
+};
+} // end anonymous namespace
+
+// Force static initialization.
+extern "C" void LLVMInitializeWebAssemblyAsmParser() {
+ RegisterMCAsmParser<WebAssemblyAsmParser> X(getTheWebAssemblyTarget32());
+ RegisterMCAsmParser<WebAssemblyAsmParser> Y(getTheWebAssemblyTarget64());
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "WebAssemblyGenAsmMatcher.inc"
diff --git a/lib/Target/WebAssembly/CMakeLists.txt b/lib/Target/WebAssembly/CMakeLists.txt
index 68b68bd797b5..a928f110efe0 100644
--- a/lib/Target/WebAssembly/CMakeLists.txt
+++ b/lib/Target/WebAssembly/CMakeLists.txt
@@ -1,20 +1,26 @@
set(LLVM_TARGET_DEFINITIONS WebAssembly.td)
+tablegen(LLVM WebAssemblyGenAsmMatcher.inc -gen-asm-matcher)
tablegen(LLVM WebAssemblyGenAsmWriter.inc -gen-asm-writer)
tablegen(LLVM WebAssemblyGenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM WebAssemblyGenDisassemblerTables.inc -gen-disassembler)
tablegen(LLVM WebAssemblyGenFastISel.inc -gen-fast-isel)
tablegen(LLVM WebAssemblyGenInstrInfo.inc -gen-instr-info)
tablegen(LLVM WebAssemblyGenMCCodeEmitter.inc -gen-emitter)
tablegen(LLVM WebAssemblyGenRegisterInfo.inc -gen-register-info)
tablegen(LLVM WebAssemblyGenSubtargetInfo.inc -gen-subtarget)
+
add_public_tablegen_target(WebAssemblyCommonTableGen)
add_llvm_target(WebAssemblyCodeGen
+ WebAssemblyAddMissingPrototypes.cpp
WebAssemblyArgumentMove.cpp
WebAssemblyAsmPrinter.cpp
WebAssemblyCallIndirectFixup.cpp
WebAssemblyCFGStackify.cpp
WebAssemblyCFGSort.cpp
+ WebAssemblyLateEHPrepare.cpp
+ WebAssemblyExceptionInfo.cpp
WebAssemblyExplicitLocals.cpp
WebAssemblyFastISel.cpp
WebAssemblyFixIrreducibleControlFlow.cpp
@@ -51,6 +57,7 @@ add_llvm_target(WebAssemblyCodeGen
intrinsics_gen
)
+add_subdirectory(AsmParser)
add_subdirectory(Disassembler)
add_subdirectory(InstPrinter)
add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
index 9be11da9afac..2f0960271e30 100644
--- a/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
+++ b/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file is part of the WebAssembly Disassembler.
+/// This file is part of the WebAssembly Disassembler.
///
/// It contains code to translate the data produced by the decoder into
/// MCInsts.
@@ -19,16 +19,23 @@
#include "WebAssembly.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/Endian.h"
+#include "llvm/Support/LEB128.h"
#include "llvm/Support/TargetRegistry.h"
+
using namespace llvm;
#define DEBUG_TYPE "wasm-disassembler"
+using DecodeStatus = MCDisassembler::DecodeStatus;
+
+#include "WebAssemblyGenDisassemblerTables.inc"
+
namespace {
class WebAssemblyDisassembler final : public MCDisassembler {
std::unique_ptr<const MCInstrInfo> MCII;
@@ -60,11 +67,120 @@ extern "C" void LLVMInitializeWebAssemblyDisassembler() {
createWebAssemblyDisassembler);
}
-MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction(
- MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t /*Address*/,
- raw_ostream &OS, raw_ostream &CS) const {
+static int nextByte(ArrayRef<uint8_t> Bytes, uint64_t &Size) {
+ if (Size >= Bytes.size())
+ return -1;
+ auto V = Bytes[Size];
+ Size++;
+ return V;
+}
- // TODO: Implement disassembly.
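+// Decode one LEB128-encoded immediate operand. As an illustrative example, the
+// unsigned byte sequence 0xE5 0x8E 0x26 decodes to 624485.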
+static bool parseLEBImmediate(MCInst &MI, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes, bool Signed) {
+ unsigned N = 0;
+ const char *Error = nullptr;
+ auto Val = Signed ? decodeSLEB128(Bytes.data() + Size, &N,
+ Bytes.data() + Bytes.size(), &Error)
+ : static_cast<int64_t>(
+ decodeULEB128(Bytes.data() + Size, &N,
+ Bytes.data() + Bytes.size(), &Error));
+ if (Error)
+ return false;
+ Size += N;
+ MI.addOperand(MCOperand::createImm(Val));
+ return true;
+}
+
+template <typename T>
+bool parseFPImmediate(MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes) {
+ if (Size + sizeof(T) > Bytes.size())
+ return false;
+ T Val;
+ memcpy(&Val, Bytes.data() + Size, sizeof(T));
+ support::endian::byte_swap<T, support::endianness::little>(Val);
+ Size += sizeof(T);
+ MI.addOperand(MCOperand::createFPImm(static_cast<double>(Val)));
+ return true;
+}
- return MCDisassembler::Fail;
+MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction(
+ MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t /*Address*/,
+ raw_ostream & /*OS*/, raw_ostream &CS) const {
+ CommentStream = &CS;
+ Size = 0;
+ auto Opc = nextByte(Bytes, Size);
+ if (Opc < 0)
+ return MCDisassembler::Fail;
+ const auto *WasmInst = &InstructionTable0[Opc];
+ // If this is a prefix byte, indirect to another table.
+ if (WasmInst->ET == ET_Prefix) {
+ WasmInst = nullptr;
+ // Linear search, so far only 2 entries.
+ for (auto PT = PrefixTable; PT->Table; PT++) {
+ if (PT->Prefix == Opc) {
+ WasmInst = PT->Table;
+ break;
+ }
+ }
+ if (!WasmInst)
+ return MCDisassembler::Fail;
+ Opc = nextByte(Bytes, Size);
+ if (Opc < 0)
+ return MCDisassembler::Fail;
+ WasmInst += Opc;
+ }
+ if (WasmInst->ET == ET_Unused)
+ return MCDisassembler::Fail;
+ // At this point we must have a valid instruction to decode.
+ assert(WasmInst->ET == ET_Instruction);
+ MI.setOpcode(WasmInst->Opcode);
+ // Parse any operands.
+ for (uint8_t OPI = 0; OPI < WasmInst->NumOperands; OPI++) {
+ switch (WasmInst->Operands[OPI]) {
+ // ULEB operands:
+ case WebAssembly::OPERAND_BASIC_BLOCK:
+ case WebAssembly::OPERAND_LOCAL:
+ case WebAssembly::OPERAND_GLOBAL:
+ case WebAssembly::OPERAND_FUNCTION32:
+ case WebAssembly::OPERAND_OFFSET32:
+ case WebAssembly::OPERAND_P2ALIGN:
+ case WebAssembly::OPERAND_TYPEINDEX:
+ case MCOI::OPERAND_IMMEDIATE: {
+ if (!parseLEBImmediate(MI, Size, Bytes, false))
+ return MCDisassembler::Fail;
+ break;
+ }
+ // SLEB operands:
+ case WebAssembly::OPERAND_I32IMM:
+ case WebAssembly::OPERAND_I64IMM:
+ case WebAssembly::OPERAND_SIGNATURE: {
+ if (!parseLEBImmediate(MI, Size, Bytes, true))
+ return MCDisassembler::Fail;
+ break;
+ }
+ // FP operands.
+ case WebAssembly::OPERAND_F32IMM: {
+ if (!parseFPImmediate<float>(MI, Size, Bytes))
+ return MCDisassembler::Fail;
+ break;
+ }
+ case WebAssembly::OPERAND_F64IMM: {
+ if (!parseFPImmediate<double>(MI, Size, Bytes))
+ return MCDisassembler::Fail;
+ break;
+ }
+ case MCOI::OPERAND_REGISTER: {
+ // These are NOT actually in the instruction stream, but MC is going to
+ // expect operands to be present for them!
+ // FIXME: can MC re-generate register assignments or do we have to
+ // do this? Since this function decodes a single instruction, we don't
+ // have the proper context for tracking an operand stack here.
+ MI.addOperand(MCOperand::createReg(0));
+ break;
+ }
+ default:
+ llvm_unreachable("Unknown operand type in WebAssemblyDisassembler");
+ }
+ }
+ return MCDisassembler::Success;
}
diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
index c3f0f2787146..10fa798ac8d7 100644
--- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
+++ b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief Print MCInst instructions to wasm format.
+/// Print MCInst instructions to wasm format.
///
//===----------------------------------------------------------------------===//
@@ -46,7 +46,7 @@ void WebAssemblyInstPrinter::printRegName(raw_ostream &OS,
void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
StringRef Annot,
- const MCSubtargetInfo & /*STI*/) {
+ const MCSubtargetInfo &STI) {
// Print the instruction (this uses the AsmStrings from the .td files).
printInstruction(MI, OS);
@@ -82,10 +82,12 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
ControlFlowStack.push_back(std::make_pair(ControlFlowCounter++, false));
break;
case WebAssembly::END_LOOP:
- ControlFlowStack.pop_back();
+ // Have to guard against an empty stack, in case of mismatched pairs
+ // in assembly parsing.
+ if (!ControlFlowStack.empty()) ControlFlowStack.pop_back();
break;
case WebAssembly::END_BLOCK:
- printAnnotation(
+ if (!ControlFlowStack.empty()) printAnnotation(
OS, "label" + utostr(ControlFlowStack.pop_back_val().first) + ':');
break;
}
@@ -176,10 +178,10 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
if (Info.OperandType == WebAssembly::OPERAND_F32IMM) {
// TODO: MC converts all floating point immediate operands to double.
// This is fine for numeric values, but may cause NaNs to change bits.
- O << toString(APFloat(float(Op.getFPImm())));
+ O << ::toString(APFloat(float(Op.getFPImm())));
} else {
assert(Info.OperandType == WebAssembly::OPERAND_F64IMM);
- O << toString(APFloat(Op.getFPImm()));
+ O << ::toString(APFloat(Op.getFPImm()));
}
} else {
assert((OpNo < MII.get(MI->getOpcode()).getNumOperands() ||
@@ -192,20 +194,16 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
}
}
-void
-WebAssemblyInstPrinter::printWebAssemblyP2AlignOperand(const MCInst *MI,
- unsigned OpNo,
- raw_ostream &O) {
+void WebAssemblyInstPrinter::printWebAssemblyP2AlignOperand(
+ const MCInst *MI, unsigned OpNo, raw_ostream &O) {
int64_t Imm = MI->getOperand(OpNo).getImm();
if (Imm == WebAssembly::GetDefaultP2Align(MI->getOpcode()))
return;
O << ":p2align=" << Imm;
}
-void
-WebAssemblyInstPrinter::printWebAssemblySignatureOperand(const MCInst *MI,
- unsigned OpNo,
- raw_ostream &O) {
+void WebAssemblyInstPrinter::printWebAssemblySignatureOperand(
+ const MCInst *MI, unsigned OpNo, raw_ostream &O) {
int64_t Imm = MI->getOperand(OpNo).getImm();
switch (WebAssembly::ExprType(Imm)) {
case WebAssembly::ExprType::Void: break;
@@ -220,6 +218,7 @@ WebAssemblyInstPrinter::printWebAssemblySignatureOperand(const MCInst *MI,
case WebAssembly::ExprType::B8x16: O << "b8x16"; break;
case WebAssembly::ExprType::B16x8: O << "b16x8"; break;
case WebAssembly::ExprType::B32x4: O << "b32x4"; break;
+ case WebAssembly::ExprType::ExceptRef: O << "except_ref"; break;
}
}
@@ -238,6 +237,8 @@ const char *llvm::WebAssembly::TypeToString(MVT Ty) {
case MVT::v4i32:
case MVT::v4f32:
return "v128";
+ case MVT::ExceptRef:
+ return "except_ref";
default:
llvm_unreachable("unsupported type");
}
@@ -253,6 +254,8 @@ const char *llvm::WebAssembly::TypeToString(wasm::ValType Type) {
return "f32";
case wasm::ValType::F64:
return "f64";
+ case wasm::ValType::EXCEPT_REF:
+ return "except_ref";
}
llvm_unreachable("unsupported type");
}
diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
index b1de84d7e8e6..f5b890a7615e 100644
--- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
+++ b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This class prints an WebAssembly MCInst to wasm file syntax.
+/// This class prints an WebAssembly MCInst to wasm file syntax.
///
//===----------------------------------------------------------------------===//
@@ -17,8 +17,8 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/BinaryFormat/Wasm.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/Support/MachineValueType.h"
namespace llvm {
diff --git a/lib/Target/WebAssembly/LLVMBuild.txt b/lib/Target/WebAssembly/LLVMBuild.txt
index 69b03fe19f0e..055c32bf0cbf 100644
--- a/lib/Target/WebAssembly/LLVMBuild.txt
+++ b/lib/Target/WebAssembly/LLVMBuild.txt
@@ -16,12 +16,13 @@
;===------------------------------------------------------------------------===;
[common]
-subdirectories = Disassembler InstPrinter MCTargetDesc TargetInfo
+subdirectories = AsmParser Disassembler InstPrinter MCTargetDesc TargetInfo
[component_0]
type = TargetGroup
name = WebAssembly
parent = Target
+has_asmparser = 1
has_asmprinter = 1
has_disassembler = 1
diff --git a/lib/Target/WebAssembly/MCTargetDesc/CMakeLists.txt b/lib/Target/WebAssembly/MCTargetDesc/CMakeLists.txt
index 13c0fe915908..0032a43db87f 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/WebAssembly/MCTargetDesc/CMakeLists.txt
@@ -1,6 +1,5 @@
add_llvm_library(LLVMWebAssemblyDesc
WebAssemblyAsmBackend.cpp
- WebAssemblyELFObjectWriter.cpp
WebAssemblyMCAsmInfo.cpp
WebAssemblyMCCodeEmitter.cpp
WebAssemblyMCTargetDesc.cpp
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
index 226a3b35f2cf..244c2189b455 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements the WebAssemblyAsmBackend class.
+/// This file implements the WebAssemblyAsmBackend class.
///
//===----------------------------------------------------------------------===//
@@ -17,7 +17,6 @@
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCDirectives.h"
-#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCObjectWriter.h"
@@ -26,51 +25,17 @@
#include "llvm/MC/MCWasmObjectWriter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+
using namespace llvm;
namespace {
-class WebAssemblyAsmBackendELF final : public MCAsmBackend {
- bool Is64Bit;
-
-public:
- explicit WebAssemblyAsmBackendELF(bool Is64Bit)
- : MCAsmBackend(), Is64Bit(Is64Bit) {}
- ~WebAssemblyAsmBackendELF() override {}
-
- void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
- const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsPCRel) const override;
-
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override;
-
- // No instruction requires relaxation
- bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
- const MCRelaxableFragment *DF,
- const MCAsmLayout &Layout) const override {
- return false;
- }
-
- unsigned getNumFixupKinds() const override {
- // We currently just use the generic fixups in MCFixup.h and don't have any
- // target-specific fixups.
- return 0;
- }
-
- bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
-
- void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
- MCInst &Res) const override {}
-
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
-};
class WebAssemblyAsmBackend final : public MCAsmBackend {
bool Is64Bit;
public:
explicit WebAssemblyAsmBackend(bool Is64Bit)
- : MCAsmBackend(), Is64Bit(Is64Bit) {}
+ : MCAsmBackend(support::little), Is64Bit(Is64Bit) {}
~WebAssemblyAsmBackend() override {}
unsigned getNumFixupKinds() const override {
@@ -81,10 +46,11 @@ public:
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsPCRel) const override;
+ uint64_t Value, bool IsPCRel,
+ const MCSubtargetInfo *STI) const override;
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override;
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override;
// No instruction requires relaxation
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
@@ -93,51 +59,17 @@ public:
return false;
}
- bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
+ bool mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const override {
+ return false;
+ }
void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
MCInst &Res) const override {}
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
};
-bool WebAssemblyAsmBackendELF::writeNopData(uint64_t Count,
- MCObjectWriter *OW) const {
- for (uint64_t i = 0; i < Count; ++i)
- OW->write8(WebAssembly::Nop);
-
- return true;
-}
-
-void WebAssemblyAsmBackendELF::applyFixup(const MCAssembler &Asm,
- const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data,
- uint64_t Value, bool IsPCRel) const {
- const MCFixupKindInfo &Info = getFixupKindInfo(Fixup.getKind());
- assert(Info.Flags == 0 && "WebAssembly does not use MCFixupKindInfo flags");
-
- unsigned NumBytes = alignTo(Info.TargetSize, 8) / 8;
- if (Value == 0)
- return; // Doesn't change encoding.
-
- // Shift the value into position.
- Value <<= Info.TargetOffset;
-
- unsigned Offset = Fixup.getOffset();
- assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
-
- // For each byte of the fragment that the fixup touches, mask in the
- // bits from the fixup value.
- for (unsigned i = 0; i != NumBytes; ++i)
- Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
-}
-
-std::unique_ptr<MCObjectWriter>
-WebAssemblyAsmBackendELF::createObjectWriter(raw_pwrite_stream &OS) const {
- return createWebAssemblyELFObjectWriter(OS, Is64Bit, 0);
-}
-
const MCFixupKindInfo &
WebAssemblyAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
const static MCFixupKindInfo Infos[WebAssembly::NumTargetFixupKinds] = {
@@ -158,13 +90,10 @@ WebAssemblyAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
return Infos[Kind - FirstTargetFixupKind];
}
-bool WebAssemblyAsmBackend::writeNopData(uint64_t Count,
- MCObjectWriter *OW) const {
- if (Count == 0)
- return true;
-
+bool WebAssemblyAsmBackend::writeNopData(raw_ostream &OS,
+ uint64_t Count) const {
for (uint64_t i = 0; i < Count; ++i)
- OW->write8(WebAssembly::Nop);
+ OS << char(WebAssembly::Nop);
return true;
}
@@ -173,7 +102,8 @@ void WebAssemblyAsmBackend::applyFixup(const MCAssembler &Asm,
const MCFixup &Fixup,
const MCValue &Target,
MutableArrayRef<char> Data,
- uint64_t Value, bool IsPCRel) const {
+ uint64_t Value, bool IsPCRel,
+ const MCSubtargetInfo *STI) const {
const MCFixupKindInfo &Info = getFixupKindInfo(Fixup.getKind());
assert(Info.Flags == 0 && "WebAssembly does not use MCFixupKindInfo flags");
@@ -193,14 +123,13 @@ void WebAssemblyAsmBackend::applyFixup(const MCAssembler &Asm,
Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
}
-std::unique_ptr<MCObjectWriter>
-WebAssemblyAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
- return createWebAssemblyWasmObjectWriter(OS, Is64Bit);
+std::unique_ptr<MCObjectTargetWriter>
+WebAssemblyAsmBackend::createObjectTargetWriter() const {
+ return createWebAssemblyWasmObjectWriter(Is64Bit);
}
+
} // end anonymous namespace
MCAsmBackend *llvm::createWebAssemblyAsmBackend(const Triple &TT) {
- if (TT.isOSBinFormatELF())
- return new WebAssemblyAsmBackendELF(TT.isArch64Bit());
return new WebAssemblyAsmBackend(TT.isArch64Bit());
}
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp
deleted file mode 100644
index b67ecfa455b3..000000000000
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp
+++ /dev/null
@@ -1,68 +0,0 @@
-//===-- WebAssemblyELFObjectWriter.cpp - WebAssembly ELF Writer -----------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// \brief This file handles ELF-specific object emission, converting LLVM's
-/// internal fixups into the appropriate relocations.
-///
-//===----------------------------------------------------------------------===//
-
-#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
-#include "llvm/MC/MCELFObjectWriter.h"
-#include "llvm/MC/MCFixup.h"
-#include "llvm/MC/MCObjectWriter.h"
-#include "llvm/Support/ErrorHandling.h"
-using namespace llvm;
-
-namespace {
-class WebAssemblyELFObjectWriter final : public MCELFObjectTargetWriter {
-public:
- WebAssemblyELFObjectWriter(bool Is64Bit, uint8_t OSABI);
-
-protected:
- unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
- const MCFixup &Fixup, bool IsPCRel) const override;
-};
-} // end anonymous namespace
-
-WebAssemblyELFObjectWriter::WebAssemblyELFObjectWriter(bool Is64Bit,
- uint8_t OSABI)
- : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_WEBASSEMBLY,
- /*HasRelocationAddend=*/false) {}
-
-unsigned WebAssemblyELFObjectWriter::getRelocType(MCContext &Ctx,
- const MCValue &Target,
- const MCFixup &Fixup,
- bool IsPCRel) const {
- // WebAssembly functions are not allocated in the address space. To resolve a
- // pointer to a function, we must use a special relocation type.
- if (const MCSymbolRefExpr *SyExp =
- dyn_cast<MCSymbolRefExpr>(Fixup.getValue()))
- if (SyExp->getKind() == MCSymbolRefExpr::VK_WebAssembly_FUNCTION)
- return ELF::R_WEBASSEMBLY_FUNCTION;
-
- switch (Fixup.getKind()) {
- case FK_Data_4:
- assert(!is64Bit() && "4-byte relocations only supported on wasm32");
- return ELF::R_WEBASSEMBLY_DATA;
- case FK_Data_8:
- assert(is64Bit() && "8-byte relocations only supported on wasm64");
- return ELF::R_WEBASSEMBLY_DATA;
- default:
- llvm_unreachable("unimplemented fixup kind");
- }
-}
-
-std::unique_ptr<MCObjectWriter>
-llvm::createWebAssemblyELFObjectWriter(raw_pwrite_stream &OS,
- bool Is64Bit,
- uint8_t OSABI) {
- auto MOTW = llvm::make_unique<WebAssemblyELFObjectWriter>(Is64Bit, OSABI);
- return createELFObjectWriter(std::move(MOTW), OS, /*IsLittleEndian=*/true);
-}
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp
index 5f8c78ed1683..44fcc129c39e 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp
@@ -8,50 +8,18 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains the declarations of the WebAssemblyMCAsmInfo
+/// This file contains the declarations of the WebAssemblyMCAsmInfo
/// properties.
///
//===----------------------------------------------------------------------===//
#include "WebAssemblyMCAsmInfo.h"
#include "llvm/ADT/Triple.h"
+
using namespace llvm;
#define DEBUG_TYPE "wasm-mc-asm-info"
-WebAssemblyMCAsmInfoELF::~WebAssemblyMCAsmInfoELF() {}
-
-WebAssemblyMCAsmInfoELF::WebAssemblyMCAsmInfoELF(const Triple &T) {
- CodePointerSize = CalleeSaveStackSlotSize = T.isArch64Bit() ? 8 : 4;
-
- // TODO: What should MaxInstLength be?
-
- UseDataRegionDirectives = true;
-
- // Use .skip instead of .zero because .zero is confusing when used with two
- // arguments (it doesn't actually zero things out).
- ZeroDirective = "\t.skip\t";
-
- Data8bitsDirective = "\t.int8\t";
- Data16bitsDirective = "\t.int16\t";
- Data32bitsDirective = "\t.int32\t";
- Data64bitsDirective = "\t.int64\t";
-
- AlignmentIsInBytes = false;
- COMMDirectiveAlignmentIsInBytes = false;
- LCOMMDirectiveAlignmentType = LCOMM::Log2Alignment;
-
- SupportsDebugInformation = true;
-
- // For now, WebAssembly does not support exceptions.
- ExceptionsType = ExceptionHandling::None;
-
- // TODO: UseIntegratedAssembler?
-
- // WebAssembly's stack is never executable.
- UsesNonexecutableStackSection = false;
-}
-
WebAssemblyMCAsmInfo::~WebAssemblyMCAsmInfo() {}
WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T) {
@@ -76,8 +44,5 @@ WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T) {
SupportsDebugInformation = true;
- // For now, WebAssembly does not support exceptions.
- ExceptionsType = ExceptionHandling::None;
-
// TODO: UseIntegratedAssembler?
}
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h
index d9547096190e..8627a6e40c6a 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h
@@ -8,26 +8,19 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains the declaration of the WebAssemblyMCAsmInfo class.
+/// This file contains the declaration of the WebAssemblyMCAsmInfo class.
///
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCASMINFO_H
#define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCASMINFO_H
-#include "llvm/MC/MCAsmInfoELF.h"
#include "llvm/MC/MCAsmInfoWasm.h"
namespace llvm {
class Triple;
-class WebAssemblyMCAsmInfoELF final : public MCAsmInfoELF {
-public:
- explicit WebAssemblyMCAsmInfoELF(const Triple &T);
- ~WebAssemblyMCAsmInfoELF() override;
-};
-
class WebAssemblyMCAsmInfo final : public MCAsmInfoWasm {
public:
explicit WebAssemblyMCAsmInfo(const Triple &T);
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
index 77744e53d62f..94ca94e1e18c 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements the WebAssemblyMCCodeEmitter class.
+/// This file implements the WebAssemblyMCCodeEmitter class.
///
//===----------------------------------------------------------------------===//
@@ -23,9 +23,11 @@
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/EndianStream.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/raw_ostream.h"
+
using namespace llvm;
#define DEBUG_TYPE "mccodeemitter"
@@ -86,14 +88,18 @@ void WebAssemblyMCCodeEmitter::encodeInstruction(
assert(Desc.TSFlags == 0 &&
"WebAssembly non-variable_ops don't use TSFlags");
const MCOperandInfo &Info = Desc.OpInfo[i];
+ LLVM_DEBUG(dbgs() << "Encoding immediate: type="
+ << int(Info.OperandType) << "\n");
if (Info.OperandType == WebAssembly::OPERAND_I32IMM) {
encodeSLEB128(int32_t(MO.getImm()), OS);
+ } else if (Info.OperandType == WebAssembly::OPERAND_OFFSET32) {
+ encodeULEB128(uint32_t(MO.getImm()), OS);
} else if (Info.OperandType == WebAssembly::OPERAND_I64IMM) {
encodeSLEB128(int64_t(MO.getImm()), OS);
} else if (Info.OperandType == WebAssembly::OPERAND_GLOBAL) {
llvm_unreachable("wasm globals should only be accessed symbolicly");
} else if (Info.OperandType == WebAssembly::OPERAND_SIGNATURE) {
- encodeSLEB128(int64_t(MO.getImm()), OS);
+ OS << uint8_t(MO.getImm());
} else {
encodeULEB128(uint64_t(MO.getImm()), OS);
}
@@ -112,11 +118,11 @@ void WebAssemblyMCCodeEmitter::encodeInstruction(
// TODO: MC converts all floating point immediate operands to double.
// This is fine for numeric values, but may cause NaNs to change bits.
float f = float(MO.getFPImm());
- support::endian::Writer<support::little>(OS).write<float>(f);
+ support::endian::write<float>(OS, f, support::little);
} else {
assert(Info.OperandType == WebAssembly::OPERAND_F64IMM);
double d = MO.getFPImm();
- support::endian::Writer<support::little>(OS).write<double>(d);
+ support::endian::write<double>(OS, d, support::little);
}
} else if (MO.isExpr()) {
const MCOperandInfo &Info = Desc.OpInfo[i];
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
index 18de4273d1d0..baf8a0c96c0a 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file provides WebAssembly-specific target descriptions.
+/// This file provides WebAssembly-specific target descriptions.
///
//===----------------------------------------------------------------------===//
@@ -36,8 +36,6 @@ using namespace llvm;
static MCAsmInfo *createMCAsmInfo(const MCRegisterInfo & /*MRI*/,
const Triple &TT) {
- if (TT.isOSBinFormatELF())
- return new WebAssemblyMCAsmInfoELF(TT);
return new WebAssemblyMCAsmInfo(TT);
}
@@ -69,10 +67,10 @@ static MCCodeEmitter *createCodeEmitter(const MCInstrInfo &MCII,
}
static MCAsmBackend *createAsmBackend(const Target & /*T*/,
+ const MCSubtargetInfo &STI,
const MCRegisterInfo & /*MRI*/,
- const Triple &TT, StringRef /*CPU*/,
const MCTargetOptions & /*Options*/) {
- return createWebAssemblyAsmBackend(TT);
+ return createWebAssemblyAsmBackend(STI.getTargetTriple());
}
static MCSubtargetInfo *createMCSubtargetInfo(const Triple &TT, StringRef CPU,
@@ -82,10 +80,6 @@ static MCSubtargetInfo *createMCSubtargetInfo(const Triple &TT, StringRef CPU,
static MCTargetStreamer *
createObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
- const Triple &TT = STI.getTargetTriple();
- if (TT.isOSBinFormatELF())
- return new WebAssemblyTargetELFStreamer(S);
-
return new WebAssemblyTargetWasmStreamer(S);
}
@@ -135,6 +129,7 @@ wasm::ValType WebAssembly::toValType(const MVT &Ty) {
case MVT::i64: return wasm::ValType::I64;
case MVT::f32: return wasm::ValType::F32;
case MVT::f64: return wasm::ValType::F64;
+ case MVT::ExceptRef: return wasm::ValType::EXCEPT_REF;
default: llvm_unreachable("unexpected type");
}
}
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
index 7dca89ab822d..c1c8d243e920 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file provides WebAssembly-specific target descriptions.
+/// This file provides WebAssembly-specific target descriptions.
///
//===----------------------------------------------------------------------===//
@@ -26,7 +26,7 @@ class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
-class MCObjectWriter;
+class MCObjectTargetWriter;
class MCSubtargetInfo;
class MVT;
class Target;
@@ -40,13 +40,8 @@ MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII);
MCAsmBackend *createWebAssemblyAsmBackend(const Triple &TT);
-std::unique_ptr<MCObjectWriter>
-createWebAssemblyELFObjectWriter(raw_pwrite_stream &OS,
- bool Is64Bit, uint8_t OSABI);
-
-std::unique_ptr<MCObjectWriter>
-createWebAssemblyWasmObjectWriter(raw_pwrite_stream &OS,
- bool Is64Bit);
+std::unique_ptr<MCObjectTargetWriter>
+createWebAssemblyWasmObjectWriter(bool Is64Bit);
namespace WebAssembly {
enum OperandType {
@@ -111,38 +106,166 @@ namespace WebAssembly {
inline unsigned GetDefaultP2Align(unsigned Opcode) {
switch (Opcode) {
case WebAssembly::LOAD8_S_I32:
+ case WebAssembly::LOAD8_S_I32_S:
case WebAssembly::LOAD8_U_I32:
+ case WebAssembly::LOAD8_U_I32_S:
case WebAssembly::LOAD8_S_I64:
+ case WebAssembly::LOAD8_S_I64_S:
case WebAssembly::LOAD8_U_I64:
+ case WebAssembly::LOAD8_U_I64_S:
case WebAssembly::ATOMIC_LOAD8_U_I32:
+ case WebAssembly::ATOMIC_LOAD8_U_I32_S:
case WebAssembly::ATOMIC_LOAD8_U_I64:
+ case WebAssembly::ATOMIC_LOAD8_U_I64_S:
case WebAssembly::STORE8_I32:
+ case WebAssembly::STORE8_I32_S:
case WebAssembly::STORE8_I64:
+ case WebAssembly::STORE8_I64_S:
+ case WebAssembly::ATOMIC_STORE8_I32:
+ case WebAssembly::ATOMIC_STORE8_I32_S:
+ case WebAssembly::ATOMIC_STORE8_I64:
+ case WebAssembly::ATOMIC_STORE8_I64_S:
+ case WebAssembly::ATOMIC_RMW8_U_ADD_I32:
+ case WebAssembly::ATOMIC_RMW8_U_ADD_I32_S:
+ case WebAssembly::ATOMIC_RMW8_U_ADD_I64:
+ case WebAssembly::ATOMIC_RMW8_U_ADD_I64_S:
+ case WebAssembly::ATOMIC_RMW8_U_SUB_I32:
+ case WebAssembly::ATOMIC_RMW8_U_SUB_I32_S:
+ case WebAssembly::ATOMIC_RMW8_U_SUB_I64:
+ case WebAssembly::ATOMIC_RMW8_U_SUB_I64_S:
+ case WebAssembly::ATOMIC_RMW8_U_AND_I32:
+ case WebAssembly::ATOMIC_RMW8_U_AND_I32_S:
+ case WebAssembly::ATOMIC_RMW8_U_AND_I64:
+ case WebAssembly::ATOMIC_RMW8_U_AND_I64_S:
+ case WebAssembly::ATOMIC_RMW8_U_OR_I32:
+ case WebAssembly::ATOMIC_RMW8_U_OR_I32_S:
+ case WebAssembly::ATOMIC_RMW8_U_OR_I64:
+ case WebAssembly::ATOMIC_RMW8_U_OR_I64_S:
+ case WebAssembly::ATOMIC_RMW8_U_XOR_I32:
+ case WebAssembly::ATOMIC_RMW8_U_XOR_I32_S:
+ case WebAssembly::ATOMIC_RMW8_U_XOR_I64:
+ case WebAssembly::ATOMIC_RMW8_U_XOR_I64_S:
+ case WebAssembly::ATOMIC_RMW8_U_XCHG_I32:
+ case WebAssembly::ATOMIC_RMW8_U_XCHG_I32_S:
+ case WebAssembly::ATOMIC_RMW8_U_XCHG_I64:
+ case WebAssembly::ATOMIC_RMW8_U_XCHG_I64_S:
return 0;
case WebAssembly::LOAD16_S_I32:
+ case WebAssembly::LOAD16_S_I32_S:
case WebAssembly::LOAD16_U_I32:
+ case WebAssembly::LOAD16_U_I32_S:
case WebAssembly::LOAD16_S_I64:
+ case WebAssembly::LOAD16_S_I64_S:
case WebAssembly::LOAD16_U_I64:
+ case WebAssembly::LOAD16_U_I64_S:
case WebAssembly::ATOMIC_LOAD16_U_I32:
+ case WebAssembly::ATOMIC_LOAD16_U_I32_S:
case WebAssembly::ATOMIC_LOAD16_U_I64:
+ case WebAssembly::ATOMIC_LOAD16_U_I64_S:
case WebAssembly::STORE16_I32:
+ case WebAssembly::STORE16_I32_S:
case WebAssembly::STORE16_I64:
+ case WebAssembly::STORE16_I64_S:
+ case WebAssembly::ATOMIC_STORE16_I32:
+ case WebAssembly::ATOMIC_STORE16_I32_S:
+ case WebAssembly::ATOMIC_STORE16_I64:
+ case WebAssembly::ATOMIC_STORE16_I64_S:
+ case WebAssembly::ATOMIC_RMW16_U_ADD_I32:
+ case WebAssembly::ATOMIC_RMW16_U_ADD_I32_S:
+ case WebAssembly::ATOMIC_RMW16_U_ADD_I64:
+ case WebAssembly::ATOMIC_RMW16_U_ADD_I64_S:
+ case WebAssembly::ATOMIC_RMW16_U_SUB_I32:
+ case WebAssembly::ATOMIC_RMW16_U_SUB_I32_S:
+ case WebAssembly::ATOMIC_RMW16_U_SUB_I64:
+ case WebAssembly::ATOMIC_RMW16_U_SUB_I64_S:
+ case WebAssembly::ATOMIC_RMW16_U_AND_I32:
+ case WebAssembly::ATOMIC_RMW16_U_AND_I32_S:
+ case WebAssembly::ATOMIC_RMW16_U_AND_I64:
+ case WebAssembly::ATOMIC_RMW16_U_AND_I64_S:
+ case WebAssembly::ATOMIC_RMW16_U_OR_I32:
+ case WebAssembly::ATOMIC_RMW16_U_OR_I32_S:
+ case WebAssembly::ATOMIC_RMW16_U_OR_I64:
+ case WebAssembly::ATOMIC_RMW16_U_OR_I64_S:
+ case WebAssembly::ATOMIC_RMW16_U_XOR_I32:
+ case WebAssembly::ATOMIC_RMW16_U_XOR_I32_S:
+ case WebAssembly::ATOMIC_RMW16_U_XOR_I64:
+ case WebAssembly::ATOMIC_RMW16_U_XOR_I64_S:
+ case WebAssembly::ATOMIC_RMW16_U_XCHG_I32:
+ case WebAssembly::ATOMIC_RMW16_U_XCHG_I32_S:
+ case WebAssembly::ATOMIC_RMW16_U_XCHG_I64:
+ case WebAssembly::ATOMIC_RMW16_U_XCHG_I64_S:
return 1;
case WebAssembly::LOAD_I32:
+ case WebAssembly::LOAD_I32_S:
case WebAssembly::LOAD_F32:
+ case WebAssembly::LOAD_F32_S:
case WebAssembly::STORE_I32:
+ case WebAssembly::STORE_I32_S:
case WebAssembly::STORE_F32:
+ case WebAssembly::STORE_F32_S:
case WebAssembly::LOAD32_S_I64:
+ case WebAssembly::LOAD32_S_I64_S:
case WebAssembly::LOAD32_U_I64:
+ case WebAssembly::LOAD32_U_I64_S:
case WebAssembly::STORE32_I64:
+ case WebAssembly::STORE32_I64_S:
case WebAssembly::ATOMIC_LOAD_I32:
+ case WebAssembly::ATOMIC_LOAD_I32_S:
case WebAssembly::ATOMIC_LOAD32_U_I64:
+ case WebAssembly::ATOMIC_LOAD32_U_I64_S:
+ case WebAssembly::ATOMIC_STORE_I32:
+ case WebAssembly::ATOMIC_STORE_I32_S:
+ case WebAssembly::ATOMIC_STORE32_I64:
+ case WebAssembly::ATOMIC_STORE32_I64_S:
+ case WebAssembly::ATOMIC_RMW_ADD_I32:
+ case WebAssembly::ATOMIC_RMW_ADD_I32_S:
+ case WebAssembly::ATOMIC_RMW32_U_ADD_I64:
+ case WebAssembly::ATOMIC_RMW32_U_ADD_I64_S:
+ case WebAssembly::ATOMIC_RMW_SUB_I32:
+ case WebAssembly::ATOMIC_RMW_SUB_I32_S:
+ case WebAssembly::ATOMIC_RMW32_U_SUB_I64:
+ case WebAssembly::ATOMIC_RMW32_U_SUB_I64_S:
+ case WebAssembly::ATOMIC_RMW_AND_I32:
+ case WebAssembly::ATOMIC_RMW_AND_I32_S:
+ case WebAssembly::ATOMIC_RMW32_U_AND_I64:
+ case WebAssembly::ATOMIC_RMW32_U_AND_I64_S:
+ case WebAssembly::ATOMIC_RMW_OR_I32:
+ case WebAssembly::ATOMIC_RMW_OR_I32_S:
+ case WebAssembly::ATOMIC_RMW32_U_OR_I64:
+ case WebAssembly::ATOMIC_RMW32_U_OR_I64_S:
+ case WebAssembly::ATOMIC_RMW_XOR_I32:
+ case WebAssembly::ATOMIC_RMW_XOR_I32_S:
+ case WebAssembly::ATOMIC_RMW32_U_XOR_I64:
+ case WebAssembly::ATOMIC_RMW32_U_XOR_I64_S:
+ case WebAssembly::ATOMIC_RMW_XCHG_I32:
+ case WebAssembly::ATOMIC_RMW_XCHG_I32_S:
+ case WebAssembly::ATOMIC_RMW32_U_XCHG_I64:
+ case WebAssembly::ATOMIC_RMW32_U_XCHG_I64_S:
return 2;
case WebAssembly::LOAD_I64:
+ case WebAssembly::LOAD_I64_S:
case WebAssembly::LOAD_F64:
+ case WebAssembly::LOAD_F64_S:
case WebAssembly::STORE_I64:
+ case WebAssembly::STORE_I64_S:
case WebAssembly::STORE_F64:
+ case WebAssembly::STORE_F64_S:
case WebAssembly::ATOMIC_LOAD_I64:
+ case WebAssembly::ATOMIC_LOAD_I64_S:
+ case WebAssembly::ATOMIC_STORE_I64:
+ case WebAssembly::ATOMIC_STORE_I64_S:
+ case WebAssembly::ATOMIC_RMW_ADD_I64:
+ case WebAssembly::ATOMIC_RMW_ADD_I64_S:
+ case WebAssembly::ATOMIC_RMW_SUB_I64:
+ case WebAssembly::ATOMIC_RMW_SUB_I64_S:
+ case WebAssembly::ATOMIC_RMW_AND_I64:
+ case WebAssembly::ATOMIC_RMW_AND_I64_S:
+ case WebAssembly::ATOMIC_RMW_OR_I64:
+ case WebAssembly::ATOMIC_RMW_OR_I64_S:
+ case WebAssembly::ATOMIC_RMW_XOR_I64:
+ case WebAssembly::ATOMIC_RMW_XOR_I64_S:
+ case WebAssembly::ATOMIC_RMW_XCHG_I64:
+ case WebAssembly::ATOMIC_RMW_XCHG_I64_S:
return 3;
default:
llvm_unreachable("Only loads and stores have p2align values");
@@ -158,19 +281,20 @@ static const unsigned LoadP2AlignOperandNo = 1;
static const unsigned StoreP2AlignOperandNo = 0;
/// This is used to indicate block signatures.
-enum class ExprType {
- Void = -0x40,
- I32 = -0x01,
- I64 = -0x02,
- F32 = -0x03,
- F64 = -0x04,
- I8x16 = -0x05,
- I16x8 = -0x06,
- I32x4 = -0x07,
- F32x4 = -0x08,
- B8x16 = -0x09,
- B16x8 = -0x0a,
- B32x4 = -0x0b
+enum class ExprType : unsigned {
+ Void = 0x40,
+ I32 = 0x7F,
+ I64 = 0x7E,
+ F32 = 0x7D,
+ F64 = 0x7C,
+ I8x16 = 0x7B,
+ I16x8 = 0x7A,
+ I32x4 = 0x79,
+ F32x4 = 0x78,
+ B8x16 = 0x77,
+ B16x8 = 0x76,
+ B32x4 = 0x75,
+ ExceptRef = 0x68
};
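
One way to read the new values (an observation, not stated in the patch): each is
the single-byte SLEB128 encoding of the corresponding old signed value (-0x01
encodes as 0x7F, -0x40 as 0x40, and so on), which is why the streamer change later
in this patch can emit the type as one raw byte instead of calling
EmitSLEB128IntValue. A minimal sketch under that reading; the helper name
sleb128SingleByte is illustrative, not part of LLVM:

  #include <cassert>
  #include <cstdint>

  // Single-byte SLEB128 encoding, valid only for values in [-64, 63].
  static uint8_t sleb128SingleByte(int8_t V) {
    assert(V >= -64 && V < 64 && "value needs exactly one SLEB128 byte");
    return uint8_t(V) & 0x7f; // low 7 bits, continuation bit clear
  }

  // sleb128SingleByte(-0x01) == 0x7F (I32)
  // sleb128SingleByte(-0x02) == 0x7E (I64)
  // sleb128SingleByte(-0x40) == 0x40 (Void)
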
/// Instruction opcodes emitted via means other than CodeGen.
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
index 0ca52ad651b5..5272e188e1d0 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file defines WebAssembly-specific target streamer classes.
+/// This file defines WebAssembly-specific target streamer classes.
/// These are for implementing support for target-specific assembly directives.
///
//===----------------------------------------------------------------------===//
@@ -17,10 +17,8 @@
#include "InstPrinter/WebAssemblyInstPrinter.h"
#include "WebAssemblyMCTargetDesc.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionWasm.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCSymbolWasm.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
@@ -31,16 +29,13 @@ WebAssemblyTargetStreamer::WebAssemblyTargetStreamer(MCStreamer &S)
: MCTargetStreamer(S) {}
void WebAssemblyTargetStreamer::emitValueType(wasm::ValType Type) {
- Streamer.EmitSLEB128IntValue(int32_t(Type));
+ Streamer.EmitIntValue(uint8_t(Type), 1);
}
WebAssemblyTargetAsmStreamer::WebAssemblyTargetAsmStreamer(
MCStreamer &S, formatted_raw_ostream &OS)
: WebAssemblyTargetStreamer(S), OS(OS) {}
-WebAssemblyTargetELFStreamer::WebAssemblyTargetELFStreamer(MCStreamer &S)
- : WebAssemblyTargetStreamer(S) {}
-
WebAssemblyTargetWasmStreamer::WebAssemblyTargetWasmStreamer(MCStreamer &S)
: WebAssemblyTargetStreamer(S) {}
@@ -87,27 +82,6 @@ void WebAssemblyTargetAsmStreamer::emitLocal(ArrayRef<MVT> Types) {
}
}
-void WebAssemblyTargetAsmStreamer::emitGlobal(
- ArrayRef<wasm::Global> Globals) {
- if (!Globals.empty()) {
- OS << "\t.globalvar \t";
-
- bool First = true;
- for (const wasm::Global &G : Globals) {
- if (First)
- First = false;
- else
- OS << ", ";
- OS << WebAssembly::TypeToString(G.Type);
- if (!G.InitialModule.empty())
- OS << '=' << G.InitialModule << ':' << G.InitialName;
- else
- OS << '=' << G.InitialValue;
- }
- OS << '\n';
- }
-}
-
void WebAssemblyTargetAsmStreamer::emitEndFunc() { OS << "\t.endfunc\n"; }
void WebAssemblyTargetAsmStreamer::emitIndirectFunctionType(
@@ -128,46 +102,13 @@ void WebAssemblyTargetAsmStreamer::emitGlobalImport(StringRef name) {
OS << "\t.import_global\t" << name << '\n';
}
-void WebAssemblyTargetAsmStreamer::emitIndIdx(const MCExpr *Value) {
- OS << "\t.indidx \t" << *Value << '\n';
-}
-
-void WebAssemblyTargetELFStreamer::emitParam(MCSymbol *Symbol,
- ArrayRef<MVT> Types) {
- // Nothing to emit; params are declared as part of the function signature.
-}
-
-void WebAssemblyTargetELFStreamer::emitResult(MCSymbol *Symbol,
- ArrayRef<MVT> Types) {
- // Nothing to emit; results are declared as part of the function signature.
-}
-
-void WebAssemblyTargetELFStreamer::emitLocal(ArrayRef<MVT> Types) {
- Streamer.EmitULEB128IntValue(Types.size());
- for (MVT Type : Types)
- emitValueType(WebAssembly::toValType(Type));
-}
-
-void WebAssemblyTargetELFStreamer::emitGlobal(
- ArrayRef<wasm::Global> Globals) {
- llvm_unreachable(".globalvar encoding not yet implemented");
-}
-
-void WebAssemblyTargetELFStreamer::emitEndFunc() {
- Streamer.EmitIntValue(WebAssembly::End, 1);
-}
-
-void WebAssemblyTargetELFStreamer::emitIndIdx(const MCExpr *Value) {
- llvm_unreachable(".indidx encoding not yet implemented");
-}
-
-void WebAssemblyTargetELFStreamer::emitIndirectFunctionType(
- MCSymbol *Symbol, SmallVectorImpl<MVT> &Params, SmallVectorImpl<MVT> &Results) {
- // Nothing to emit here. TODO: Re-design how linking works and re-evaluate
- // whether it's necessary for .o files to declare indirect function types.
+void WebAssemblyTargetAsmStreamer::emitImportModule(MCSymbolWasm *Sym,
+ StringRef ModuleName) {
+ OS << "\t.import_module\t" << Sym->getName() << ", " << ModuleName << '\n';
}
-void WebAssemblyTargetELFStreamer::emitGlobalImport(StringRef name) {
+void WebAssemblyTargetAsmStreamer::emitIndIdx(const MCExpr *Value) {
+ OS << "\t.indidx \t" << *Value << '\n';
}
void WebAssemblyTargetWasmStreamer::emitParam(MCSymbol *Symbol,
@@ -204,31 +145,6 @@ void WebAssemblyTargetWasmStreamer::emitLocal(ArrayRef<MVT> Types) {
}
}
-void WebAssemblyTargetWasmStreamer::emitGlobal(
- ArrayRef<wasm::Global> Globals) {
- // Encode the globals use by the funciton into the special .global_variables
- // section. This will later be decoded and turned into contents for the
- // Globals Section.
- Streamer.PushSection();
- Streamer.SwitchSection(Streamer.getContext().getWasmSection(
- ".global_variables", SectionKind::getMetadata()));
- for (const wasm::Global &G : Globals) {
- Streamer.EmitIntValue(int32_t(G.Type), 1);
- Streamer.EmitIntValue(G.Mutable, 1);
- if (G.InitialModule.empty()) {
- Streamer.EmitIntValue(0, 1); // indicate that we have an int value
- Streamer.EmitSLEB128IntValue(0);
- } else {
- Streamer.EmitIntValue(1, 1); // indicate that we have a module import
- Streamer.EmitBytes(G.InitialModule);
- Streamer.EmitIntValue(0, 1); // nul-terminate
- Streamer.EmitBytes(G.InitialName);
- Streamer.EmitIntValue(0, 1); // nul-terminate
- }
- }
- Streamer.PopSection();
-}
-
void WebAssemblyTargetWasmStreamer::emitEndFunc() {
llvm_unreachable(".end_func is not needed for direct wasm output");
}
@@ -256,9 +172,14 @@ void WebAssemblyTargetWasmStreamer::emitIndirectFunctionType(
WasmSym->setParams(std::move(ValParams));
WasmSym->setReturns(std::move(ValResults));
- WasmSym->setIsFunction(true);
+ WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
}
void WebAssemblyTargetWasmStreamer::emitGlobalImport(StringRef name) {
llvm_unreachable(".global_import is not needed for direct wasm output");
}
+
+void WebAssemblyTargetWasmStreamer::emitImportModule(MCSymbolWasm *Sym,
+ StringRef ModuleName) {
+ Sym->setModuleName(ModuleName);
+}
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
index 2cb21a20580b..cafcb04ccd11 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file declares WebAssembly-specific target streamer classes.
+/// This file declares WebAssembly-specific target streamer classes.
/// These are for implementing support for target-specific assembly directives.
///
//===----------------------------------------------------------------------===//
@@ -17,13 +17,13 @@
#define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYTARGETSTREAMER_H
#include "llvm/BinaryFormat/Wasm.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/MachineValueType.h"
namespace llvm {
-class MCELFStreamer;
class MCWasmStreamer;
+class MCSymbolWasm;
/// WebAssembly-specific streamer interface, to implement support
/// WebAssembly-specific assembly directives.
@@ -37,8 +37,6 @@ public:
virtual void emitResult(MCSymbol *Symbol, ArrayRef<MVT> Types) = 0;
/// .local
virtual void emitLocal(ArrayRef<MVT> Types) = 0;
- /// .globalvar
- virtual void emitGlobal(ArrayRef<wasm::Global> Globals) = 0;
/// .endfunc
virtual void emitEndFunc() = 0;
/// .functype
@@ -49,6 +47,8 @@ public:
virtual void emitIndIdx(const MCExpr *Value) = 0;
/// .import_global
virtual void emitGlobalImport(StringRef name) = 0;
+ /// .import_module
+ virtual void emitImportModule(MCSymbolWasm *Sym, StringRef ModuleName) = 0;
protected:
void emitValueType(wasm::ValType Type);
@@ -64,30 +64,13 @@ public:
void emitParam(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
void emitResult(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
void emitLocal(ArrayRef<MVT> Types) override;
- void emitGlobal(ArrayRef<wasm::Global> Globals) override;
- void emitEndFunc() override;
- void emitIndirectFunctionType(MCSymbol *Symbol,
- SmallVectorImpl<MVT> &Params,
- SmallVectorImpl<MVT> &Results) override;
- void emitIndIdx(const MCExpr *Value) override;
- void emitGlobalImport(StringRef name) override;
-};
-
-/// This part is for ELF object output
-class WebAssemblyTargetELFStreamer final : public WebAssemblyTargetStreamer {
-public:
- explicit WebAssemblyTargetELFStreamer(MCStreamer &S);
-
- void emitParam(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
- void emitResult(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
- void emitLocal(ArrayRef<MVT> Types) override;
- void emitGlobal(ArrayRef<wasm::Global> Globals) override;
void emitEndFunc() override;
void emitIndirectFunctionType(MCSymbol *Symbol,
SmallVectorImpl<MVT> &Params,
SmallVectorImpl<MVT> &Results) override;
void emitIndIdx(const MCExpr *Value) override;
void emitGlobalImport(StringRef name) override;
+ void emitImportModule(MCSymbolWasm *Sym, StringRef ModuleName) override;
};
/// This part is for Wasm object output
@@ -98,13 +81,13 @@ public:
void emitParam(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
void emitResult(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
void emitLocal(ArrayRef<MVT> Types) override;
- void emitGlobal(ArrayRef<wasm::Global> Globals) override;
void emitEndFunc() override;
void emitIndirectFunctionType(MCSymbol *Symbol,
SmallVectorImpl<MVT> &Params,
SmallVectorImpl<MVT> &Results) override;
void emitIndIdx(const MCExpr *Value) override;
void emitGlobalImport(StringRef name) override;
+ void emitImportModule(MCSymbolWasm *Sym, StringRef ModuleName) override;
};
} // end namespace llvm
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
index 39abde26df7f..4fb12d40b01b 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file handles Wasm-specific object emission, converting LLVM's
+/// This file handles Wasm-specific object emission, converting LLVM's
/// internal fixups into the appropriate relocations.
///
//===----------------------------------------------------------------------===//
@@ -20,9 +20,10 @@
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionWasm.h"
#include "llvm/MC/MCSymbolWasm.h"
-#include "llvm/MC/MCWasmObjectWriter.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCWasmObjectWriter.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
@@ -61,6 +62,25 @@ static bool IsFunctionType(const MCValue &Target) {
return RefA && RefA->getKind() == MCSymbolRefExpr::VK_WebAssembly_TYPEINDEX;
}
+static const MCSection *GetFixupSection(const MCExpr *Expr) {
+ if (auto SyExp = dyn_cast<MCSymbolRefExpr>(Expr)) {
+ if (SyExp->getSymbol().isInSection())
+ return &SyExp->getSymbol().getSection();
+ return nullptr;
+ }
+
+ if (auto BinOp = dyn_cast<MCBinaryExpr>(Expr)) {
+ auto SectionLHS = GetFixupSection(BinOp->getLHS());
+ auto SectionRHS = GetFixupSection(BinOp->getRHS());
+ return SectionLHS == SectionRHS ? nullptr : SectionLHS;
+ }
+
+ if (auto UnOp = dyn_cast<MCUnaryExpr>(Expr))
+ return GetFixupSection(UnOp->getSubExpr());
+
+ return nullptr;
+}
+
unsigned
WebAssemblyWasmObjectWriter::getRelocType(const MCValue &Target,
const MCFixup &Fixup) const {
@@ -86,6 +106,13 @@ WebAssemblyWasmObjectWriter::getRelocType(const MCValue &Target,
case FK_Data_4:
if (IsFunction)
return wasm::R_WEBASSEMBLY_TABLE_INDEX_I32;
+ if (auto Section = static_cast<const MCSectionWasm *>(
+ GetFixupSection(Fixup.getValue()))) {
+ if (Section->getKind().isText())
+ return wasm::R_WEBASSEMBLY_FUNCTION_OFFSET_I32;
+ else if (!Section->isWasmData())
+ return wasm::R_WEBASSEMBLY_SECTION_OFFSET_I32;
+ }
return wasm::R_WEBASSEMBLY_MEMORY_ADDR_I32;
case FK_Data_8:
llvm_unreachable("FK_Data_8 not implemented yet");
@@ -94,9 +121,7 @@ WebAssemblyWasmObjectWriter::getRelocType(const MCValue &Target,
}
}
-std::unique_ptr<MCObjectWriter>
-llvm::createWebAssemblyWasmObjectWriter(raw_pwrite_stream &OS,
- bool Is64Bit) {
- auto MOTW = llvm::make_unique<WebAssemblyWasmObjectWriter>(Is64Bit);
- return createWasmObjectWriter(std::move(MOTW), OS);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createWebAssemblyWasmObjectWriter(bool Is64Bit) {
+ return llvm::make_unique<WebAssemblyWasmObjectWriter>(Is64Bit);
}
diff --git a/lib/Target/WebAssembly/README.txt b/lib/Target/WebAssembly/README.txt
index 3433b1553e8c..ef0099f07efb 100644
--- a/lib/Target/WebAssembly/README.txt
+++ b/lib/Target/WebAssembly/README.txt
@@ -2,15 +2,42 @@
This WebAssembly backend is presently under development.
-Currently the easiest way to use it is through Emscripten, which provides a
-compilation environment that includes standard libraries, tools, and packaging
-for producing WebAssembly applications that can run in browsers and other
-environments. For more information, see the Emscripten documentation in
-general, and this page in particular:
+The most notable feature that is not yet stable is the ".o" file format.
+".o" file support is needed for many common ways of using LLVM, such as
+using it through "clang -c", so this backend is not yet considered widely
+usable. However, it can already be used through some language toolchain
+packages:
+
+Emscripten provides a C/C++ compilation environment that includes standard
+libraries, tools, and packaging for producing WebAssembly applications that
+can run in browsers and other environments. For more information, see the
+Emscripten documentation in general, and this page in particular:
+
* https://github.com/kripken/emscripten/wiki/New-WebAssembly-Backend
+
+Rust provides WebAssembly support integrated into Cargo. There are two
+main options:
+ - wasm32-unknown-unknown, which provides a relatively minimal environment
+ that has an emphasis on being "native"
+ - wasm32-unknown-emscripten, which uses Emscripten internally and
+ provides standard C/C++ libraries, filesystem emulation, GL and SDL
+ bindings
+For more information, see:
+ * https://www.hellorust.com/
+
+
+This backend does not yet support debug info. Full DWARF support needs a
+design for how DWARF should be represented in WebAssembly. Sourcemap support
+has an existing design and some corresponding browser implementations, so it
+just needs implementing in LLVM.
-Other ways of using this backend, such as via a standalone "clang", are also
-under development, though they are not generally usable yet.
+Work-in-progress documentation for the ".o" file format is here:
+
+ * https://github.com/WebAssembly/tool-conventions/blob/master/Linking.md
+
+A corresponding linker implementation is also under development:
+
+ * https://lld.llvm.org/WebAssembly.html
For more information on WebAssembly itself, see the home page:
* https://webassembly.github.io/
@@ -30,6 +57,8 @@ turn red if not. Once most of these pass, further testing will use LLVM's own
test suite. The tests can be run locally using:
https://github.com/WebAssembly/waterfall/blob/master/src/compile_torture_tests.py
+Some notes on ways that the generated code could be improved follow:
+
//===---------------------------------------------------------------------===//
Br, br_if, and br_table instructions can support having a value on the value
@@ -127,7 +156,7 @@ However, if moving the binary operator to its user moves it to a place where
its operands can't be moved to, it would be better to leave it in place, or
perhaps move it up, so that it can stackify its operands. A binary operator
has two operands and one result, so in such cases there could be a net win by
-prefering the operands.
+preferring the operands.
//===---------------------------------------------------------------------===//
@@ -138,11 +167,10 @@ instructions advantageously for this purpose.
//===---------------------------------------------------------------------===//
-WebAssembly is now officially a stack machine, rather than an AST, and this
-comes with additional opportunities for WebAssemblyRegStackify. Specifically,
-the stack doesn't need to be empty after an instruction with no return values.
-WebAssemblyRegStackify could be extended, or possibly rewritten, to take
-advantage of the new opportunities.
+WebAssemblyRegStackify currently assumes that the stack must be empty after
+an instruction with no return values; however, wasm doesn't actually require
+this. WebAssemblyRegStackify could be extended, or possibly rewritten, to take
+full advantage of what WebAssembly permits.
//===---------------------------------------------------------------------===//
diff --git a/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp b/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp
index a2c03b1a0400..f7a417c0ed49 100644
--- a/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp
+++ b/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file registers the WebAssembly target.
+/// This file registers the WebAssembly target.
///
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/WebAssembly/WebAssembly.h b/lib/Target/WebAssembly/WebAssembly.h
index 7ac6c3991531..05b7b21fb597 100644
--- a/lib/Target/WebAssembly/WebAssembly.h
+++ b/lib/Target/WebAssembly/WebAssembly.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains the entry points for global functions defined in
+/// This file contains the entry points for global functions defined in
/// the LLVM WebAssembly back-end.
///
//===----------------------------------------------------------------------===//
@@ -27,8 +27,8 @@ class FunctionPass;
// LLVM IR passes.
ModulePass *createWebAssemblyLowerEmscriptenEHSjLj(bool DoEH, bool DoSjLj);
-void initializeWebAssemblyLowerEmscriptenEHSjLjPass(PassRegistry &);
ModulePass *createWebAssemblyLowerGlobalDtors();
+ModulePass *createWebAssemblyAddMissingPrototypes();
ModulePass *createWebAssemblyFixFunctionBitcasts();
FunctionPass *createWebAssemblyOptimizeReturned();
@@ -47,6 +47,7 @@ FunctionPass *createWebAssemblyRegStackify();
FunctionPass *createWebAssemblyRegColoring();
FunctionPass *createWebAssemblyExplicitLocals();
FunctionPass *createWebAssemblyFixIrreducibleControlFlow();
+FunctionPass *createWebAssemblyLateEHPrepare();
FunctionPass *createWebAssemblyCFGSort();
FunctionPass *createWebAssemblyCFGStackify();
FunctionPass *createWebAssemblyLowerBrUnless();
@@ -54,6 +55,31 @@ FunctionPass *createWebAssemblyRegNumbering();
FunctionPass *createWebAssemblyPeephole();
FunctionPass *createWebAssemblyCallIndirectFixup();
+// PassRegistry initialization declarations.
+void initializeWebAssemblyAddMissingPrototypesPass(PassRegistry &);
+void initializeWebAssemblyLowerEmscriptenEHSjLjPass(PassRegistry &);
+void initializeLowerGlobalDtorsPass(PassRegistry &);
+void initializeFixFunctionBitcastsPass(PassRegistry &);
+void initializeOptimizeReturnedPass(PassRegistry &);
+void initializeWebAssemblyArgumentMovePass(PassRegistry &);
+void initializeWebAssemblySetP2AlignOperandsPass(PassRegistry &);
+void initializeWebAssemblyReplacePhysRegsPass(PassRegistry &);
+void initializeWebAssemblyPrepareForLiveIntervalsPass(PassRegistry &);
+void initializeWebAssemblyOptimizeLiveIntervalsPass(PassRegistry &);
+void initializeWebAssemblyStoreResultsPass(PassRegistry &);
+void initializeWebAssemblyRegStackifyPass(PassRegistry &);
+void initializeWebAssemblyRegColoringPass(PassRegistry &);
+void initializeWebAssemblyExplicitLocalsPass(PassRegistry &);
+void initializeWebAssemblyFixIrreducibleControlFlowPass(PassRegistry &);
+void initializeWebAssemblyLateEHPreparePass(PassRegistry &);
+void initializeWebAssemblyExceptionInfoPass(PassRegistry &);
+void initializeWebAssemblyCFGSortPass(PassRegistry &);
+void initializeWebAssemblyCFGStackifyPass(PassRegistry &);
+void initializeWebAssemblyLowerBrUnlessPass(PassRegistry &);
+void initializeWebAssemblyRegNumberingPass(PassRegistry &);
+void initializeWebAssemblyPeepholePass(PassRegistry &);
+void initializeWebAssemblyCallIndirectFixupPass(PassRegistry &);
+
} // end namespace llvm
#endif
diff --git a/lib/Target/WebAssembly/WebAssembly.td b/lib/Target/WebAssembly/WebAssembly.td
index 99cf1f119a20..2f301da8e422 100644
--- a/lib/Target/WebAssembly/WebAssembly.td
+++ b/lib/Target/WebAssembly/WebAssembly.td
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This is a target description file for the WebAssembly architecture,
+/// This is a target description file for the WebAssembly architecture,
/// which is also known as "wasm".
///
//===----------------------------------------------------------------------===//
@@ -32,6 +32,15 @@ def FeatureNontrappingFPToInt :
"HasNontrappingFPToInt", "true",
"Enable non-trapping float-to-int conversion operators">;
+def FeatureSignExt :
+ SubtargetFeature<"sign-ext",
+ "HasSignExt", "true",
+ "Enable sign extension operators">;
+
+def FeatureExceptionHandling :
+ SubtargetFeature<"exception-handling", "HasExceptionHandling", "true",
+ "Enable Wasm exception handling">;
+
//===----------------------------------------------------------------------===//
// Architectures.
//===----------------------------------------------------------------------===//
@@ -68,6 +77,20 @@ def : ProcessorModel<"bleeding-edge", NoSchedModel,
// Target Declaration
//===----------------------------------------------------------------------===//
+def WebAssemblyAsmParser : AsmParser {
+ // The physical register names are not in the binary format or asm text
+ let ShouldEmitMatchRegisterName = 0;
+}
+
+def WebAssemblyAsmWriter : AsmWriter {
+ string AsmWriterClassName = "InstPrinter";
+ int PassSubtarget = 0;
+ int Variant = 0;
+ bit isMCAsmWriter = 1;
+}
+
def WebAssembly : Target {
let InstructionSet = WebAssemblyInstrInfo;
+ let AssemblyParsers = [WebAssemblyAsmParser];
+ let AssemblyWriters = [WebAssemblyAsmWriter];
}
diff --git a/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp b/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp
new file mode 100644
index 000000000000..4af9cd150bf7
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp
@@ -0,0 +1,144 @@
+//===-- WebAssemblyAddMissingPrototypes.cpp - Fix prototypeless functions -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Add prototypes to prototype-less functions.
+///
+/// WebAssembly has strict function prototype checking, so we need function
+/// declarations to match their call sites. Clang treats prototype-less
+/// functions as varargs (foo(...)), which happens to work on existing
+/// platforms but doesn't under WebAssembly. This pass finds all the call
+/// sites of each prototype-less function, ensures they agree, and then sets
+/// the signature on the function declaration accordingly.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-add-missing-prototypes"
+
+namespace {
+class WebAssemblyAddMissingPrototypes final : public ModulePass {
+ StringRef getPassName() const override {
+ return "Add prototypes to prototypes-less functions";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ ModulePass::getAnalysisUsage(AU);
+ }
+
+ bool runOnModule(Module &M) override;
+
+public:
+ static char ID;
+ WebAssemblyAddMissingPrototypes() : ModulePass(ID) {}
+};
+} // End anonymous namespace
+
+char WebAssemblyAddMissingPrototypes::ID = 0;
+INITIALIZE_PASS(WebAssemblyAddMissingPrototypes, DEBUG_TYPE,
+ "Add prototypes to prototypes-less functions", false, false)
+
+ModulePass *llvm::createWebAssemblyAddMissingPrototypes() {
+ return new WebAssemblyAddMissingPrototypes();
+}
+
+bool WebAssemblyAddMissingPrototypes::runOnModule(Module &M) {
+ LLVM_DEBUG(dbgs() << "runnning AddMissingPrototypes\n");
+
+ std::vector<std::pair<Function*, Function*>> Replacements;
+
+ // Find all the prototype-less function declarations
+ for (Function &F : M) {
+ if (!F.isDeclaration() || !F.hasFnAttribute("no-prototype"))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Found no-prototype function: " << F.getName() << "\n");
+
+    // When clang emits prototype-less C functions it uses (...), i.e. a
+    // varargs function that takes no arguments (there is no sentinel). When
+    // we see a no-prototype attribute we expect the function to have these
+    // properties.
+ if (!F.isVarArg())
+ report_fatal_error(
+ "Functions with 'no-prototype' attribute must take varargs: " +
+ F.getName());
+ if (F.getFunctionType()->getNumParams() != 0)
+ report_fatal_error(
+ "Functions with 'no-prototype' attribute should not have params: " +
+ F.getName());
+
+ // Create a function prototype based on the first call site (first bitcast)
+ // that we find.
+ FunctionType *NewType = nullptr;
+ Function* NewF = nullptr;
+ for (Use &U : F.uses()) {
+ LLVM_DEBUG(dbgs() << "prototype-less use: " << F.getName() << "\n");
+ if (BitCastOperator *BC = dyn_cast<BitCastOperator>(U.getUser())) {
+ FunctionType *DestType =
+ cast<FunctionType>(BC->getDestTy()->getPointerElementType());
+
+ // Create a new function with the correct type
+ NewType = DestType;
+ NewF = Function::Create(NewType, F.getLinkage(), F.getName());
+ NewF->setAttributes(F.getAttributes());
+ NewF->removeFnAttr("no-prototype");
+ break;
+ }
+ }
+
+ if (!NewType) {
+ LLVM_DEBUG(
+ dbgs() << "could not derive a function prototype from usage: " +
+ F.getName() + "\n");
+ continue;
+ }
+
+ for (Use &U : F.uses()) {
+ if (BitCastOperator *BC = dyn_cast<BitCastOperator>(U.getUser())) {
+ FunctionType *DestType =
+ cast<FunctionType>(BC->getDestTy()->getPointerElementType());
+ if (NewType != DestType) {
+ report_fatal_error(
+ "Prototypeless function used with conflicting signatures: " +
+ F.getName());
+ }
+ BC->replaceAllUsesWith(NewF);
+ Replacements.emplace_back(&F, NewF);
+ } else {
+ dbgs() << *U.getUser()->getType() << "\n";
+#ifndef NDEBUG
+ U.getUser()->dump();
+#endif
+ report_fatal_error(
+ "unexpected use of prototypeless function: " + F.getName() + "\n");
+ }
+ }
+ }
+
+ // Finally replace the old function declarations with the new ones
+ for (auto &Pair : Replacements) {
+ Function* Old = Pair.first;
+ Function* New = Pair.second;
+ Old->eraseFromParent();
+ M.getFunctionList().push_back(New);
+ }
+
+ return !Replacements.empty();
+}
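
The signature-recovery step above boils down to scanning the declaration's uses
for a bitcast to a concrete function pointer type. A distilled sketch of just that
step, not a drop-in replacement for the pass; deriveProto is an illustrative name:

  #include "llvm/IR/Function.h"
  #include "llvm/IR/Operator.h"
  using namespace llvm;

  // Return the function type implied by the first bitcast use of a
  // prototype-less declaration, or null if there is no such call site.
  static FunctionType *deriveProto(Function &F) {
    for (Use &U : F.uses())
      if (auto *BC = dyn_cast<BitCastOperator>(U.getUser()))
        return cast<FunctionType>(BC->getDestTy()->getPointerElementType());
    return nullptr;
  }
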
diff --git a/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp b/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
index 5fadca38b820..7c8a631cde8a 100644
--- a/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file moves ARGUMENT instructions after ScheduleDAG scheduling.
+/// This file moves ARGUMENT instructions after ScheduleDAG scheduling.
///
/// Arguments are really live-in registers, however, since we use virtual
/// registers and LLVM doesn't support live-in virtual registers, we're
@@ -60,12 +60,15 @@ public:
} // end anonymous namespace
char WebAssemblyArgumentMove::ID = 0;
+INITIALIZE_PASS(WebAssemblyArgumentMove, DEBUG_TYPE,
+ "Move ARGUMENT instructions for WebAssembly", false, false)
+
FunctionPass *llvm::createWebAssemblyArgumentMove() {
return new WebAssemblyArgumentMove();
}
bool WebAssemblyArgumentMove::runOnMachineFunction(MachineFunction &MF) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "********** Argument Move **********\n"
<< "********** Function: " << MF.getName() << '\n';
});
diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index 204d97cbdd44..1f280e1d13fc 100644
--- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains a printer that converts from our internal
+/// This file contains a printer that converts from our internal
/// representation of machine-dependent LLVM code to the WebAssembly assembly
/// language.
///
@@ -31,10 +31,10 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionWasm.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolWasm.h"
-#include "llvm/MC/MCSymbolELF.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
@@ -53,7 +53,7 @@ MVT WebAssemblyAsmPrinter::getRegType(unsigned RegNo) const {
MVT::v4i32, MVT::v4f32})
if (TRI->isTypeLegalForClass(*TRC, T))
return T;
- DEBUG(errs() << "Unknown type for register number: " << RegNo);
+ LLVM_DEBUG(errs() << "Unknown type for register number: " << RegNo);
llvm_unreachable("Unknown register type");
return MVT::Other;
}
@@ -84,21 +84,47 @@ void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) {
SmallVector<MVT, 4> Results;
SmallVector<MVT, 4> Params;
ComputeSignatureVTs(F, TM, Params, Results);
- getTargetStreamer()->emitIndirectFunctionType(getSymbol(&F), Params,
- Results);
+ MCSymbol *Sym = getSymbol(&F);
+ getTargetStreamer()->emitIndirectFunctionType(Sym, Params, Results);
+
+ if (TM.getTargetTriple().isOSBinFormatWasm() &&
+ F.hasFnAttribute("wasm-import-module")) {
+ MCSymbolWasm *WasmSym = cast<MCSymbolWasm>(Sym);
+ StringRef Name = F.getFnAttribute("wasm-import-module")
+ .getValueAsString();
+ getTargetStreamer()->emitImportModule(WasmSym, Name);
+ }
}
}
for (const auto &G : M.globals()) {
if (!G.hasInitializer() && G.hasExternalLinkage()) {
if (G.getValueType()->isSized()) {
uint16_t Size = M.getDataLayout().getTypeAllocSize(G.getValueType());
- if (TM.getTargetTriple().isOSBinFormatELF())
- getTargetStreamer()->emitGlobalImport(G.getGlobalIdentifier());
OutStreamer->emitELFSize(getSymbol(&G),
MCConstantExpr::create(Size, OutContext));
}
}
}
+
+ if (const NamedMDNode *Named = M.getNamedMetadata("wasm.custom_sections")) {
+ for (const Metadata *MD : Named->operands()) {
+ const MDTuple *Tuple = dyn_cast<MDTuple>(MD);
+ if (!Tuple || Tuple->getNumOperands() != 2)
+ continue;
+ const MDString *Name = dyn_cast<MDString>(Tuple->getOperand(0));
+ const MDString *Contents = dyn_cast<MDString>(Tuple->getOperand(1));
+ if (!Name || !Contents)
+ continue;
+
+ OutStreamer->PushSection();
+ std::string SectionName = (".custom_section." + Name->getString()).str();
+      MCSectionWasm *Sec =
+          OutContext.getWasmSection(SectionName, SectionKind::getMetadata());
+      OutStreamer->SwitchSection(Sec);
+ OutStreamer->EmitBytes(Contents->getString());
+ OutStreamer->PopSection();
+ }
+ }
}
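
The wasm.custom_sections loop above consumes module-level metadata holding
(name, contents) string pairs. A hedged sketch of how a producer might attach such
metadata; addCustomSection and its callers are illustrative, not part of this
patch:

  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Metadata.h"
  #include "llvm/IR/Module.h"
  using namespace llvm;

  // Record a (name, contents) pair that EmitEndOfAsmFile will emit into a
  // ".custom_section.<name>" wasm section.
  static void addCustomSection(Module &M, StringRef Name, StringRef Contents) {
    LLVMContext &C = M.getContext();
    NamedMDNode *NMD = M.getOrInsertNamedMetadata("wasm.custom_sections");
    NMD->addOperand(
        MDTuple::get(C, {MDString::get(C, Name), MDString::get(C, Contents)}));
  }
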
void WebAssemblyAsmPrinter::EmitConstantPool() {
@@ -133,36 +159,13 @@ void WebAssemblyAsmPrinter::EmitFunctionBodyStart() {
else
getTargetStreamer()->emitResult(CurrentFnSym, ArrayRef<MVT>());
- if (TM.getTargetTriple().isOSBinFormatELF()) {
- assert(MFI->getLocals().empty());
- for (unsigned Idx = 0, IdxE = MRI->getNumVirtRegs(); Idx != IdxE; ++Idx) {
- unsigned VReg = TargetRegisterInfo::index2VirtReg(Idx);
- unsigned WAReg = MFI->getWAReg(VReg);
- // Don't declare unused registers.
- if (WAReg == WebAssemblyFunctionInfo::UnusedReg)
- continue;
- // Don't redeclare parameters.
- if (WAReg < MFI->getParams().size())
- continue;
- // Don't declare stackified registers.
- if (int(WAReg) < 0)
- continue;
- MFI->addLocal(getRegType(VReg));
- }
- }
-
getTargetStreamer()->emitLocal(MFI->getLocals());
AsmPrinter::EmitFunctionBodyStart();
}
-void WebAssemblyAsmPrinter::EmitFunctionBodyEnd() {
- if (TM.getTargetTriple().isOSBinFormatELF())
- getTargetStreamer()->emitEndFunc();
-}
-
void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) {
- DEBUG(dbgs() << "EmitInstruction: " << *MI << '\n');
+ LLVM_DEBUG(dbgs() << "EmitInstruction: " << *MI << '\n');
switch (MI->getOpcode()) {
case WebAssembly::ARGUMENT_I32:
diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h
index a37f8bcf6ba5..23817b4e5126 100644
--- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h
+++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h
@@ -57,7 +57,6 @@ public:
void EmitJumpTableInfo() override;
void EmitConstantPool() override;
void EmitFunctionBodyStart() override;
- void EmitFunctionBodyEnd() override;
void EmitInstruction(const MachineInstr *MI) override;
const MCExpr *lowerConstant(const Constant *CV) override;
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
diff --git a/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp b/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
index 700111743ee8..267a51433cd1 100644
--- a/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements a CFG sorting pass.
+/// This file implements a CFG sorting pass.
///
/// This pass reorders the blocks in a function to put them into topological
/// order, ignoring loop backedges, and without any loop being interrupted
@@ -56,6 +56,9 @@ public:
} // end anonymous namespace
char WebAssemblyCFGSort::ID = 0;
+INITIALIZE_PASS(WebAssemblyCFGSort, DEBUG_TYPE,
+ "Reorders blocks in topological order", false, false)
+
FunctionPass *llvm::createWebAssemblyCFGSort() {
return new WebAssemblyCFGSort();
}
@@ -250,7 +253,7 @@ static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
assert(OnStack.count(MLI.getLoopFor(&MBB)) &&
"Blocks must be nested in their loops");
}
- while (OnStack.size() > 1 && &MBB == LoopBottom(OnStack.back()))
+ while (OnStack.size() > 1 && &MBB == WebAssembly::getBottom(OnStack.back()))
OnStack.pop_back();
}
assert(OnStack.pop_back_val() == nullptr &&
@@ -261,9 +264,9 @@ static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
}
bool WebAssemblyCFGSort::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "********** CFG Sorting **********\n"
- "********** Function: "
- << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** CFG Sorting **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
const auto &MLI = getAnalysis<MachineLoopInfo>();
auto &MDT = getAnalysis<MachineDominatorTree>();
diff --git a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
index 21e0f6b23777..70ce40cefed7 100644
--- a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements a CFG stacking pass.
+/// This file implements a CFG stacking pass.
///
/// This pass inserts BLOCK and LOOP markers to mark the start of scopes, since
/// scope boundaries serve as the labels for WebAssembly's control transfers.
@@ -57,6 +57,10 @@ public:
} // end anonymous namespace
char WebAssemblyCFGStackify::ID = 0;
+INITIALIZE_PASS(WebAssemblyCFGStackify, DEBUG_TYPE,
+ "Insert BLOCK and LOOP markers for WebAssembly scopes",
+ false, false)
+
FunctionPass *llvm::createWebAssemblyCFGStackify() {
return new WebAssemblyCFGStackify();
}
@@ -123,7 +127,8 @@ static void PlaceBlockMarker(
// Decide where in Header to put the BLOCK.
MachineBasicBlock::iterator InsertPos;
MachineLoop *HeaderLoop = MLI.getLoopFor(Header);
- if (HeaderLoop && MBB.getNumber() > LoopBottom(HeaderLoop)->getNumber()) {
+ if (HeaderLoop &&
+ MBB.getNumber() > WebAssembly::getBottom(HeaderLoop)->getNumber()) {
// Header is the header of a loop that does not lexically contain MBB, so
// the BLOCK needs to be above the LOOP, after any END constructs.
InsertPos = Header->begin();
@@ -143,9 +148,10 @@ static void PlaceBlockMarker(
}
// Add the BLOCK.
- MachineInstr *Begin = BuildMI(*Header, InsertPos, DebugLoc(),
- TII.get(WebAssembly::BLOCK))
- .addImm(int64_t(WebAssembly::ExprType::Void));
+ MachineInstr *Begin =
+ BuildMI(*Header, InsertPos, Header->findDebugLoc(InsertPos),
+ TII.get(WebAssembly::BLOCK))
+ .addImm(int64_t(WebAssembly::ExprType::Void));
// Mark the end of the block.
InsertPos = MBB.begin();
@@ -153,7 +159,7 @@ static void PlaceBlockMarker(
InsertPos->getOpcode() == WebAssembly::END_LOOP &&
LoopTops[&*InsertPos]->getParent()->getNumber() >= Header->getNumber())
++InsertPos;
- MachineInstr *End = BuildMI(MBB, InsertPos, DebugLoc(),
+ MachineInstr *End = BuildMI(MBB, InsertPos, MBB.findPrevDebugLoc(InsertPos),
TII.get(WebAssembly::END_BLOCK));
BlockTops[End] = Begin;
@@ -176,7 +182,7 @@ static void PlaceLoopMarker(
// The operand of a LOOP is the first block after the loop. If the loop is the
// bottom of the function, insert a dummy block at the end.
- MachineBasicBlock *Bottom = LoopBottom(Loop);
+ MachineBasicBlock *Bottom = WebAssembly::getBottom(Loop);
auto Iter = std::next(MachineFunction::iterator(Bottom));
if (Iter == MF.end()) {
MachineBasicBlock *Label = MF.CreateMachineBasicBlock();
@@ -193,12 +199,14 @@ static void PlaceLoopMarker(
while (InsertPos != MBB.end() &&
InsertPos->getOpcode() == WebAssembly::END_LOOP)
++InsertPos;
- MachineInstr *Begin = BuildMI(MBB, InsertPos, DebugLoc(),
+ MachineInstr *Begin = BuildMI(MBB, InsertPos, MBB.findDebugLoc(InsertPos),
TII.get(WebAssembly::LOOP))
- .addImm(int64_t(WebAssembly::ExprType::Void));
+ .addImm(int64_t(WebAssembly::ExprType::Void));
- // Mark the end of the loop.
- MachineInstr *End = BuildMI(*AfterLoop, AfterLoop->begin(), DebugLoc(),
+  // Mark the end of the loop (using the debug location of an arbitrary
+  // predecessor branch to the loop end as its location).
+ DebugLoc EndDL = (*AfterLoop->pred_rbegin())->findBranchDebugLoc();
+ MachineInstr *End = BuildMI(*AfterLoop, AfterLoop->begin(), EndDL,
TII.get(WebAssembly::END_LOOP));
LoopTops[End] = Begin;
@@ -249,12 +257,13 @@ static void FixEndsAtEndOfFunction(
case MVT::v8i16: retType = WebAssembly::ExprType::I16x8; break;
case MVT::v4i32: retType = WebAssembly::ExprType::I32x4; break;
case MVT::v4f32: retType = WebAssembly::ExprType::F32x4; break;
+ case MVT::ExceptRef: retType = WebAssembly::ExprType::ExceptRef; break;
default: llvm_unreachable("unexpected return type");
}
for (MachineBasicBlock &MBB : reverse(MF)) {
for (MachineInstr &MI : reverse(MBB)) {
- if (MI.isPosition() || MI.isDebugValue())
+ if (MI.isPosition() || MI.isDebugInstr())
continue;
if (MI.getOpcode() == WebAssembly::END_BLOCK) {
BlockTops[&MI]->getOperand(0).setImm(int32_t(retType));
@@ -275,7 +284,8 @@ static void FixEndsAtEndOfFunction(
static void AppendEndToFunction(
MachineFunction &MF,
const WebAssemblyInstrInfo &TII) {
- BuildMI(MF.back(), MF.back().end(), DebugLoc(),
+ BuildMI(MF.back(), MF.back().end(),
+ MF.back().findPrevDebugLoc(MF.back().end()),
TII.get(WebAssembly::END_FUNCTION));
}
@@ -348,15 +358,13 @@ static void PlaceMarkers(MachineFunction &MF, const MachineLoopInfo &MLI,
FixEndsAtEndOfFunction(MF, MFI, BlockTops, LoopTops);
// Add an end instruction at the end of the function body.
- if (!MF.getSubtarget<WebAssemblySubtarget>()
- .getTargetTriple().isOSBinFormatELF())
- AppendEndToFunction(MF, TII);
+ AppendEndToFunction(MF, TII);
}
bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "********** CFG Stackifying **********\n"
- "********** Function: "
- << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** CFG Stackifying **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
const auto &MLI = getAnalysis<MachineLoopInfo>();
auto &MDT = getAnalysis<MachineDominatorTree>();
diff --git a/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp b/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp
index 1af92f02d8e0..c1820bf66bc0 100644
--- a/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file converts pseudo call_indirect instructions into real
+/// This file converts pseudo call_indirect instructions into real
/// call_indirects.
///
/// The order of arguments for a call_indirect is the arguments to the function
@@ -54,6 +54,9 @@ public:
} // end anonymous namespace
char WebAssemblyCallIndirectFixup::ID = 0;
+INITIALIZE_PASS(WebAssemblyCallIndirectFixup, DEBUG_TYPE,
+ "Rewrite call_indirect argument orderings", false, false)
+
FunctionPass *llvm::createWebAssemblyCallIndirectFixup() {
return new WebAssemblyCallIndirectFixup();
}
@@ -80,8 +83,8 @@ static bool IsPseudoCallIndirect(const MachineInstr &MI) {
}
bool WebAssemblyCallIndirectFixup::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "********** Fixing up CALL_INDIRECTs **********\n"
- << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** Fixing up CALL_INDIRECTs **********\n"
+ << MF.getName() << '\n');
bool Changed = false;
const WebAssemblyInstrInfo *TII =
@@ -90,7 +93,7 @@ bool WebAssemblyCallIndirectFixup::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
if (IsPseudoCallIndirect(MI)) {
- DEBUG(dbgs() << "Found call_indirect: " << MI << '\n');
+ LLVM_DEBUG(dbgs() << "Found call_indirect: " << MI << '\n');
// Rewrite pseudo to non-pseudo
const MCInstrDesc &Desc = TII->get(GetNonPseudoCallIndirectOpcode(MI));
@@ -120,13 +123,13 @@ bool WebAssemblyCallIndirectFixup::runOnMachineFunction(MachineFunction &MF) {
for (const MachineOperand &MO : Ops)
MI.addOperand(MO);
- DEBUG(dbgs() << " After transform: " << MI);
+ LLVM_DEBUG(dbgs() << " After transform: " << MI);
Changed = true;
}
}
}
- DEBUG(dbgs() << "\nDone fixing up CALL_INDIRECTs\n\n");
+ LLVM_DEBUG(dbgs() << "\nDone fixing up CALL_INDIRECTs\n\n");
return Changed;
}
diff --git a/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp b/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp
new file mode 100644
index 000000000000..84683d48a90a
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp
@@ -0,0 +1,197 @@
+//===--- WebAssemblyExceptionInfo.cpp - Exception Information -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements WebAssemblyException information analysis.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyExceptionInfo.h"
+#include "WebAssemblyUtilities.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/MachineDominanceFrontier.h"
+#include "llvm/CodeGen/MachineDominators.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-exception-info"
+
+char WebAssemblyExceptionInfo::ID = 0;
+
+INITIALIZE_PASS_BEGIN(WebAssemblyExceptionInfo, DEBUG_TYPE,
+ "WebAssembly Exception Information", true, true)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier)
+INITIALIZE_PASS_END(WebAssemblyExceptionInfo, DEBUG_TYPE,
+ "WebAssembly Exception Information", true, true)
+
+bool WebAssemblyExceptionInfo::runOnMachineFunction(MachineFunction &F) {
+ releaseMemory();
+ auto &MDT = getAnalysis<MachineDominatorTree>();
+ auto &MDF = getAnalysis<MachineDominanceFrontier>();
+ recalculate(MDT, MDF);
+ return false;
+}
+
+void WebAssemblyExceptionInfo::recalculate(
+ MachineDominatorTree &MDT, const MachineDominanceFrontier &MDF) {
+ // Postorder traversal of the dominator tree.
+ SmallVector<WebAssemblyException *, 8> Exceptions;
+ for (auto DomNode : post_order(&MDT)) {
+ MachineBasicBlock *EHPad = DomNode->getBlock();
+ if (!EHPad->isEHPad())
+ continue;
+ // We group catch & catch-all terminate pads together, so skip the second
+ // one
+ if (WebAssembly::isCatchAllTerminatePad(*EHPad))
+ continue;
+ auto *WE = new WebAssemblyException(EHPad);
+ discoverAndMapException(WE, MDT, MDF);
+ Exceptions.push_back(WE);
+ }
+
+ // Add BBs to exceptions
+ for (auto DomNode : post_order(&MDT)) {
+ MachineBasicBlock *MBB = DomNode->getBlock();
+ WebAssemblyException *WE = getExceptionFor(MBB);
+ for (; WE; WE = WE->getParentException())
+ WE->addBlock(MBB);
+ }
+
+ // Add subexceptions to exceptions
+ for (auto *WE : Exceptions) {
+ if (WE->getParentException())
+ WE->getParentException()->getSubExceptions().push_back(WE);
+ else
+ addTopLevelException(WE);
+ }
+
+ // For convenience, Blocks and SubExceptions are inserted in postorder.
+ // Reverse the lists.
+ for (auto *WE : Exceptions) {
+ WE->reverseBlock();
+ std::reverse(WE->getSubExceptions().begin(), WE->getSubExceptions().end());
+ }
+}
+
+void WebAssemblyExceptionInfo::releaseMemory() {
+ BBMap.clear();
+ DeleteContainerPointers(TopLevelExceptions);
+ TopLevelExceptions.clear();
+}
+
+void WebAssemblyExceptionInfo::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineDominanceFrontier>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+void WebAssemblyExceptionInfo::discoverAndMapException(
+ WebAssemblyException *WE, const MachineDominatorTree &MDT,
+ const MachineDominanceFrontier &MDF) {
+ unsigned NumBlocks = 0;
+ unsigned NumSubExceptions = 0;
+
+ // Map blocks that belong to a catchpad / cleanuppad
+ MachineBasicBlock *EHPad = WE->getEHPad();
+
+ // We group catch & catch-all terminate pads together within an exception
+ if (WebAssembly::isCatchTerminatePad(*EHPad)) {
+ assert(EHPad->succ_size() == 1 &&
+ "Catch terminate pad has more than one successors");
+ changeExceptionFor(EHPad, WE);
+ changeExceptionFor(*(EHPad->succ_begin()), WE);
+ return;
+ }
+
+ SmallVector<MachineBasicBlock *, 8> WL;
+ WL.push_back(EHPad);
+ while (!WL.empty()) {
+ MachineBasicBlock *MBB = WL.pop_back_val();
+
+ // Find its outermost discovered exception. If this is a discovered block,
+ // check if it is already discovered to be a subexception of this exception.
+ WebAssemblyException *SubE = getOutermostException(MBB);
+ if (SubE) {
+ if (SubE != WE) {
+ // Discover a subexception of this exception.
+ SubE->setParentException(WE);
+ ++NumSubExceptions;
+ NumBlocks += SubE->getBlocksVector().capacity();
+ // All blocks that belong to this subexception have been already
+ // discovered. Skip all of them. Add the subexception's landing pad's
+ // dominance frontier to the worklist.
+ for (auto &Frontier : MDF.find(SubE->getEHPad())->second)
+ if (MDT.dominates(EHPad, Frontier))
+ WL.push_back(Frontier);
+ }
+ continue;
+ }
+
+ // This is an undiscovered block. Map it to the current exception.
+ changeExceptionFor(MBB, WE);
+ ++NumBlocks;
+
+ // Add successors dominated by the current BB to the worklist.
+ for (auto *Succ : MBB->successors())
+ if (MDT.dominates(EHPad, Succ))
+ WL.push_back(Succ);
+ }
+
+ WE->getSubExceptions().reserve(NumSubExceptions);
+ WE->reserveBlocks(NumBlocks);
+}
+
+WebAssemblyException *
+WebAssemblyExceptionInfo::getOutermostException(MachineBasicBlock *MBB) const {
+ WebAssemblyException *WE = getExceptionFor(MBB);
+ if (WE) {
+ while (WebAssemblyException *Parent = WE->getParentException())
+ WE = Parent;
+ }
+ return WE;
+}
+
+void WebAssemblyException::print(raw_ostream &OS, unsigned Depth) const {
+ OS.indent(Depth * 2) << "Exception at depth " << getExceptionDepth()
+ << " containing: ";
+
+ for (unsigned I = 0; I < getBlocks().size(); ++I) {
+ MachineBasicBlock *MBB = getBlocks()[I];
+ if (I)
+ OS << ", ";
+ OS << "%bb." << MBB->getNumber();
+ if (const auto *BB = MBB->getBasicBlock())
+ if (BB->hasName())
+ OS << "." << BB->getName();
+
+ if (getEHPad() == MBB)
+ OS << " (landing-pad)";
+ }
+ OS << "\n";
+
+ for (auto &SubE : SubExceptions)
+ SubE->print(OS, Depth + 2);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void WebAssemblyException::dump() const { print(dbgs()); }
+#endif
+
+raw_ostream &llvm::operator<<(raw_ostream &OS,
+                              const WebAssemblyException &WE) {
+ WE.print(OS);
+ return OS;
+}
+
+void WebAssemblyExceptionInfo::print(raw_ostream &OS, const Module *) const {
+ for (auto *WE : TopLevelExceptions)
+ WE->print(OS);
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h b/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h
new file mode 100644
index 000000000000..fcd7e2366e03
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h
@@ -0,0 +1,170 @@
+//===-- WebAssemblyExceptionInfo.h - WebAssembly Exception Info -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements WebAssemblyException information analysis.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYEXCEPTIONINFO_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYEXCEPTIONINFO_H
+
+#include "WebAssembly.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+namespace llvm {
+
+class MachineDominatorTree;
+class MachineDominanceFrontier;
+
+// WebAssembly instructions for exception handling are structured as follows:
+// try
+// instructions*
+// catch ----|
+// instructions* | -> A WebAssemblyException consists of this region
+// end ----|
+//
+// A WebAssemblyException object contains BBs that belong to a 'catch' part of
+// the try-catch-end structure to be created later. 'try' and 'end' markers
+// are not present at this stage and will be generated in the CFGStackify
+// pass. Because CFGSort requires all the BBs within a catch part to be
+// sorted together, as it does for loops, this pass calculates the nesting
+// structure of the catch parts of exceptions in a function.
+//
+// An exception catch part is defined as a BB with a catch instruction and
+// all other BBs dominated by that BB.
+class WebAssemblyException {
+ MachineBasicBlock *EHPad = nullptr;
+
+ WebAssemblyException *ParentException = nullptr;
+ std::vector<WebAssemblyException *> SubExceptions;
+ std::vector<MachineBasicBlock *> Blocks;
+ SmallPtrSet<const MachineBasicBlock *, 8> BlockSet;
+
+public:
+ WebAssemblyException(MachineBasicBlock *EHPad) : EHPad(EHPad) {}
+ ~WebAssemblyException() { DeleteContainerPointers(SubExceptions); }
+ WebAssemblyException(const WebAssemblyException &) = delete;
+ const WebAssemblyException &operator=(const WebAssemblyException &) = delete;
+
+ MachineBasicBlock *getEHPad() const { return EHPad; }
+ MachineBasicBlock *getHeader() const { return EHPad; }
+ WebAssemblyException *getParentException() const { return ParentException; }
+ void setParentException(WebAssemblyException *WE) { ParentException = WE; }
+
+ bool contains(const WebAssemblyException *WE) const {
+ if (WE == this)
+ return true;
+ if (!WE)
+ return false;
+ return contains(WE->getParentException());
+ }
+ bool contains(const MachineBasicBlock *MBB) const {
+ return BlockSet.count(MBB);
+ }
+
+ void addBlock(MachineBasicBlock *MBB) {
+ Blocks.push_back(MBB);
+ BlockSet.insert(MBB);
+ }
+ ArrayRef<MachineBasicBlock *> getBlocks() const { return Blocks; }
+ using block_iterator = typename ArrayRef<MachineBasicBlock *>::const_iterator;
+ block_iterator block_begin() const { return getBlocks().begin(); }
+ block_iterator block_end() const { return getBlocks().end(); }
+ inline iterator_range<block_iterator> blocks() const {
+ return make_range(block_begin(), block_end());
+ }
+ unsigned getNumBlocks() const { return Blocks.size(); }
+ std::vector<MachineBasicBlock *> &getBlocksVector() { return Blocks; }
+
+ const std::vector<WebAssemblyException *> &getSubExceptions() const {
+ return SubExceptions;
+ }
+ std::vector<WebAssemblyException *> &getSubExceptions() {
+ return SubExceptions;
+ }
+ void addSubException(WebAssemblyException *E) { SubExceptions.push_back(E); }
+ using iterator = typename std::vector<WebAssemblyException *>::const_iterator;
+ iterator begin() const { return SubExceptions.begin(); }
+ iterator end() const { return SubExceptions.end(); }
+
+ void reserveBlocks(unsigned Size) { Blocks.reserve(Size); }
+ void reverseBlock(unsigned From = 0) {
+ std::reverse(Blocks.begin() + From, Blocks.end());
+ }
+
+ // Return the nesting level. An outermost one has depth 1.
+ unsigned getExceptionDepth() const {
+ unsigned D = 1;
+ for (const WebAssemblyException *CurException = ParentException;
+ CurException; CurException = CurException->ParentException)
+ ++D;
+ return D;
+ }
+
+ void print(raw_ostream &OS, unsigned Depth = 0) const;
+ void dump() const;
+};
+
+raw_ostream &operator<<(raw_ostream &OS, const WebAssemblyException &WE);
+
+class WebAssemblyExceptionInfo final : public MachineFunctionPass {
+ // Mapping of basic blocks to the innermost exception they occur in
+ DenseMap<const MachineBasicBlock *, WebAssemblyException *> BBMap;
+ std::vector<WebAssemblyException *> TopLevelExceptions;
+
+ void discoverAndMapException(WebAssemblyException *WE,
+ const MachineDominatorTree &MDT,
+ const MachineDominanceFrontier &MDF);
+ WebAssemblyException *getOutermostException(MachineBasicBlock *MBB) const;
+
+public:
+ static char ID;
+ WebAssemblyExceptionInfo() : MachineFunctionPass(ID) {
+ initializeWebAssemblyExceptionInfoPass(*PassRegistry::getPassRegistry());
+ }
+ ~WebAssemblyExceptionInfo() override { releaseMemory(); }
+ WebAssemblyExceptionInfo(const WebAssemblyExceptionInfo &) = delete;
+ WebAssemblyExceptionInfo &
+ operator=(const WebAssemblyExceptionInfo &) = delete;
+
+ bool runOnMachineFunction(MachineFunction &) override;
+ void releaseMemory() override;
+ void recalculate(MachineDominatorTree &MDT,
+ const MachineDominanceFrontier &MDF);
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ bool empty() const { return TopLevelExceptions.empty(); }
+
+ // Return the innermost exception that MBB lives in. If the block is not in an
+ // exception, null is returned.
+ WebAssemblyException *getExceptionFor(const MachineBasicBlock *MBB) const {
+ return BBMap.lookup(MBB);
+ }
+
+ void changeExceptionFor(MachineBasicBlock *MBB, WebAssemblyException *WE) {
+ if (!WE) {
+ BBMap.erase(MBB);
+ return;
+ }
+ BBMap[MBB] = WE;
+ }
+
+ void addTopLevelException(WebAssemblyException *WE) {
+ assert(!WE->getParentException() && "Not a top level exception!");
+ TopLevelExceptions.push_back(WE);
+ }
+
+ void print(raw_ostream &OS, const Module *M = nullptr) const override;
+};
+
+} // end namespace llvm
+
+#endif
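
For context, a hedged sketch of how a later machine pass could consume this
analysis; the ShowExceptionDepths pass below is hypothetical and its registration
boilerplate is omitted:

  #include "WebAssemblyExceptionInfo.h"
  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/CodeGen/MachineFunctionPass.h"
  #include "llvm/Support/Debug.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  namespace {
  struct ShowExceptionDepths : MachineFunctionPass {
    static char ID;
    ShowExceptionDepths() : MachineFunctionPass(ID) {}

    void getAnalysisUsage(AnalysisUsage &AU) const override {
      AU.addRequired<WebAssemblyExceptionInfo>(); // run the analysis first
      AU.setPreservesAll();
      MachineFunctionPass::getAnalysisUsage(AU);
    }

    bool runOnMachineFunction(MachineFunction &MF) override {
      auto &WEI = getAnalysis<WebAssemblyExceptionInfo>();
      for (MachineBasicBlock &MBB : MF)
        if (WebAssemblyException *WE = WEI.getExceptionFor(&MBB))
          dbgs() << "bb." << MBB.getNumber() << " depth "
                 << WE->getExceptionDepth() << "\n";
      return false;
    }
  };
  } // end anonymous namespace
  char ShowExceptionDepths::ID = 0;
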
diff --git a/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp b/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
index e2edb924d4d2..8619cbdcb5ee 100644
--- a/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file converts any remaining registers into WebAssembly locals.
+/// This file converts any remaining registers into WebAssembly locals.
///
/// After register stackification and register coloring, convert non-stackified
/// registers into locals, inserting explicit get_local and set_local
@@ -60,6 +60,9 @@ public:
} // end anonymous namespace
char WebAssemblyExplicitLocals::ID = 0;
+INITIALIZE_PASS(WebAssemblyExplicitLocals, DEBUG_TYPE,
+ "Convert registers to WebAssembly locals", false, false)
+
FunctionPass *llvm::createWebAssemblyExplicitLocals() {
return new WebAssemblyExplicitLocals();
}
@@ -86,6 +89,8 @@ static unsigned getDropOpcode(const TargetRegisterClass *RC) {
return WebAssembly::DROP_F64;
if (RC == &WebAssembly::V128RegClass)
return WebAssembly::DROP_V128;
+ if (RC == &WebAssembly::EXCEPT_REFRegClass)
+ return WebAssembly::DROP_EXCEPT_REF;
llvm_unreachable("Unexpected register class");
}
@@ -101,6 +106,8 @@ static unsigned getGetLocalOpcode(const TargetRegisterClass *RC) {
return WebAssembly::GET_LOCAL_F64;
if (RC == &WebAssembly::V128RegClass)
return WebAssembly::GET_LOCAL_V128;
+ if (RC == &WebAssembly::EXCEPT_REFRegClass)
+ return WebAssembly::GET_LOCAL_EXCEPT_REF;
llvm_unreachable("Unexpected register class");
}
@@ -116,6 +123,8 @@ static unsigned getSetLocalOpcode(const TargetRegisterClass *RC) {
return WebAssembly::SET_LOCAL_F64;
if (RC == &WebAssembly::V128RegClass)
return WebAssembly::SET_LOCAL_V128;
+ if (RC == &WebAssembly::EXCEPT_REFRegClass)
+ return WebAssembly::SET_LOCAL_EXCEPT_REF;
llvm_unreachable("Unexpected register class");
}
@@ -131,6 +140,8 @@ static unsigned getTeeLocalOpcode(const TargetRegisterClass *RC) {
return WebAssembly::TEE_LOCAL_F64;
if (RC == &WebAssembly::V128RegClass)
return WebAssembly::TEE_LOCAL_V128;
+ if (RC == &WebAssembly::EXCEPT_REFRegClass)
+ return WebAssembly::TEE_LOCAL_EXCEPT_REF;
llvm_unreachable("Unexpected register class");
}
@@ -144,6 +155,8 @@ static MVT typeForRegClass(const TargetRegisterClass *RC) {
return MVT::f32;
if (RC == &WebAssembly::F64RegClass)
return MVT::f64;
+ if (RC == &WebAssembly::EXCEPT_REFRegClass)
+ return MVT::ExceptRef;
llvm_unreachable("unrecognized register class");
}
@@ -168,19 +181,14 @@ static MachineInstr *FindStartOfTree(MachineOperand &MO,
}
bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "********** Make Locals Explicit **********\n"
- "********** Function: "
- << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** Make Locals Explicit **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
// Disable this pass if directed to do so.
if (DisableWebAssemblyExplicitLocals)
return false;
- // Disable this pass if we aren't doing direct wasm object emission.
- if (MF.getSubtarget<WebAssemblySubtarget>()
- .getTargetTriple().isOSBinFormatELF())
- return false;
-
bool Changed = false;
MachineRegisterInfo &MRI = MF.getRegInfo();
WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
@@ -218,7 +226,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
MachineInstr &MI = *I++;
assert(!WebAssembly::isArgument(MI));
- if (MI.isDebugValue() || MI.isLabel())
+ if (MI.isDebugInstr() || MI.isLabel())
continue;
// Replace tee instructions with tee_local. The difference is that tee
@@ -271,8 +279,11 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
}
if (UseEmpty[TargetRegisterInfo::virtReg2Index(OldReg)]) {
unsigned Opc = getDropOpcode(RC);
- BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII->get(Opc))
- .addReg(NewReg);
+ MachineInstr *Drop =
+ BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII->get(Opc))
+ .addReg(NewReg);
+ // After the drop instruction, this register operand will not be used.
+ Drop->getOperand(0).setIsKill();
} else {
unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
unsigned Opc = getSetLocalOpcode(RC);
@@ -281,6 +292,9 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
.addReg(NewReg);
}
MI.getOperand(0).setReg(NewReg);
+ // This register operand is now being used by the inserted drop
+ // instruction, so make it undead.
+ MI.getOperand(0).setIsDead(false);
MFI.stackifyVReg(NewReg);
Changed = true;
}
@@ -362,7 +376,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
// Assert that all registers have been stackified at this point.
for (const MachineBasicBlock &MBB : MF) {
for (const MachineInstr &MI : MBB) {
- if (MI.isDebugValue() || MI.isLabel())
+ if (MI.isDebugInstr() || MI.isLabel())
continue;
for (const MachineOperand &MO : MI.explicit_operands()) {
assert(
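
The conversion in the hunks above keys each non-stackified virtual register to a wasm local through the Reg2Local map. A plausible sketch of the getLocalId helper they call (the real definition lives elsewhere in this file and may differ in detail):

  #include "llvm/ADT/DenseMap.h"
  #include <utility>

  // Return the local id for Reg, allocating the next unused id (CurLocal)
  // the first time Reg is seen.
  static unsigned getLocalId(llvm::DenseMap<unsigned, unsigned> &Reg2Local,
                             unsigned &CurLocal, unsigned Reg) {
    auto P = Reg2Local.insert(std::make_pair(Reg, CurLocal));
    if (P.second) // Reg was not mapped yet; a new local was just assigned.
      ++CurLocal;
    return P.first->second;
  }
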
diff --git a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index 7e284ea950fd..566ef68c027d 100644
--- a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file defines the WebAssembly-specific support for the FastISel
+/// This file defines the WebAssembly-specific support for the FastISel
/// class. Some of the target-specific code is generated by tablegen in the file
/// WebAssemblyGenFastISel.inc, which is #included here.
///
@@ -127,6 +127,7 @@ private:
case MVT::i64:
case MVT::f32:
case MVT::f64:
+ case MVT::ExceptRef:
return VT;
case MVT::f16:
return MVT::f32;
@@ -418,7 +419,7 @@ unsigned WebAssemblyFastISel::getRegForI1Value(const Value *V, bool &Not) {
return getRegForValue(ICmp->getOperand(0));
}
- if (BinaryOperator::isNot(V)) {
+ if (BinaryOperator::isNot(V) && V->getType()->isIntegerTy(32)) {
Not = true;
return getRegForValue(BinaryOperator::getNotArgument(V));
}
@@ -681,6 +682,10 @@ bool WebAssemblyFastISel::fastLowerArguments() {
Opc = WebAssembly::ARGUMENT_v4f32;
RC = &WebAssembly::V128RegClass;
break;
+ case MVT::ExceptRef:
+ Opc = WebAssembly::ARGUMENT_EXCEPT_REF;
+ RC = &WebAssembly::EXCEPT_REFRegClass;
+ break;
default:
return false;
}
@@ -695,11 +700,23 @@ bool WebAssemblyFastISel::fastLowerArguments() {
MRI.addLiveIn(WebAssembly::ARGUMENTS);
auto *MFI = MF->getInfo<WebAssemblyFunctionInfo>();
- for (auto const &Arg : F->args())
- MFI->addParam(getLegalType(getSimpleType(Arg.getType())));
+ for (auto const &Arg : F->args()) {
+ MVT::SimpleValueType ArgTy = getLegalType(getSimpleType(Arg.getType()));
+ if (ArgTy == MVT::INVALID_SIMPLE_VALUE_TYPE) {
+ MFI->clearParamsAndResults();
+ return false;
+ }
+ MFI->addParam(ArgTy);
+ }
- if (!F->getReturnType()->isVoidTy())
- MFI->addResult(getLegalType(getSimpleType(F->getReturnType())));
+ if (!F->getReturnType()->isVoidTy()) {
+ MVT::SimpleValueType RetTy = getLegalType(getSimpleType(F->getReturnType()));
+ if (RetTy == MVT::INVALID_SIMPLE_VALUE_TYPE) {
+ MFI->clearParamsAndResults();
+ return false;
+ }
+ MFI->addResult(RetTy);
+ }
return true;
}
@@ -770,6 +787,11 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) {
IsDirect ? WebAssembly::CALL_v4f32 : WebAssembly::PCALL_INDIRECT_v4f32;
ResultReg = createResultReg(&WebAssembly::V128RegClass);
break;
+ case MVT::ExceptRef:
+ Opc = IsDirect ? WebAssembly::CALL_EXCEPT_REF
+ : WebAssembly::PCALL_INDIRECT_EXCEPT_REF;
+ ResultReg = createResultReg(&WebAssembly::EXCEPT_REFRegClass);
+ break;
default:
return false;
}
@@ -868,6 +890,10 @@ bool WebAssemblyFastISel::selectSelect(const Instruction *I) {
Opc = WebAssembly::SELECT_F64;
RC = &WebAssembly::F64RegClass;
break;
+ case MVT::ExceptRef:
+ Opc = WebAssembly::SELECT_EXCEPT_REF;
+ RC = &WebAssembly::EXCEPT_REFRegClass;
+ break;
default:
return false;
}
@@ -1165,6 +1191,7 @@ bool WebAssemblyFastISel::selectStore(const Instruction *I) {
switch (getSimpleType(Store->getValueOperand()->getType())) {
case MVT::i1:
VTIsi1 = true;
+ LLVM_FALLTHROUGH;
case MVT::i8:
Opc = WebAssembly::STORE8_I32;
break;
@@ -1273,6 +1300,9 @@ bool WebAssemblyFastISel::selectRet(const Instruction *I) {
case MVT::v4f32:
Opc = WebAssembly::RETURN_v4f32;
break;
+ case MVT::ExceptRef:
+ Opc = WebAssembly::RETURN_EXCEPT_REF;
+ break;
default: return false;
}
diff --git a/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
index 666337acccce..d5e47ee82513 100644
--- a/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief Fix bitcasted functions.
+/// Fix bitcasted functions.
///
/// WebAssembly requires caller and callee signatures to match, however in LLVM,
/// some amount of slop is vaguely permitted. Detect mismatch by looking for
@@ -61,6 +61,9 @@ public:
} // End anonymous namespace
char FixFunctionBitcasts::ID = 0;
+INITIALIZE_PASS(FixFunctionBitcasts, DEBUG_TYPE,
+ "Fix mismatching bitcasts for WebAssembly", false, false)
+
ModulePass *llvm::createWebAssemblyFixFunctionBitcasts() {
return new FixFunctionBitcasts();
}
diff --git a/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp b/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp
index 88daea7e3681..bea027be7711 100644
--- a/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements a pass that transforms irreducible control flow
+/// This file implements a pass that transforms irreducible control flow
/// into reducible control flow. Irreducible control flow means multiple-entry
/// loops; they appear as CFG cycles that are not recorded in MachineLoopInfo
/// due to being unnatural.
@@ -71,6 +71,9 @@ public:
} // end anonymous namespace
char WebAssemblyFixIrreducibleControlFlow::ID = 0;
+INITIALIZE_PASS(WebAssemblyFixIrreducibleControlFlow, DEBUG_TYPE,
+ "Removes irreducible control flow", false, false)
+
FunctionPass *llvm::createWebAssemblyFixIrreducibleControlFlow() {
return new WebAssemblyFixIrreducibleControlFlow();
}
@@ -136,7 +139,7 @@ bool WebAssemblyFixIrreducibleControlFlow::VisitLoop(MachineFunction &MF,
MachineBasicBlock *Header = Loop ? Loop->getHeader() : &*MF.begin();
SetVector<MachineBasicBlock *> RewriteSuccs;
- // DFS through Loop's body, looking for for irreducible control flow. Loop is
+ // DFS through Loop's body, looking for irreducible control flow. Loop is
// natural, and we stay in its body, and we treat any nested loops
// monolithically, so any cycles we encounter indicate irreducibility.
SmallPtrSet<MachineBasicBlock *, 8> OnStack;
@@ -174,7 +177,7 @@ bool WebAssemblyFixIrreducibleControlFlow::VisitLoop(MachineFunction &MF,
if (LLVM_LIKELY(RewriteSuccs.empty()))
return false;
- DEBUG(dbgs() << "Irreducible control flow detected!\n");
+ LLVM_DEBUG(dbgs() << "Irreducible control flow detected!\n");
// Ok. We have irreducible control flow! Create a dispatch block which will
// contains a jump table to any block in the problematic set of blocks.
@@ -205,7 +208,8 @@ bool WebAssemblyFixIrreducibleControlFlow::VisitLoop(MachineFunction &MF,
continue;
unsigned Index = MIB.getInstr()->getNumExplicitOperands() - 1;
- DEBUG(dbgs() << printMBBReference(*MBB) << " has index " << Index << "\n");
+ LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << " has index " << Index
+ << "\n");
Pair.first->second = Index;
for (auto Pred : MBB->predecessors())
@@ -264,9 +268,9 @@ bool WebAssemblyFixIrreducibleControlFlow::VisitLoop(MachineFunction &MF,
bool WebAssemblyFixIrreducibleControlFlow::runOnMachineFunction(
MachineFunction &MF) {
- DEBUG(dbgs() << "********** Fixing Irreducible Control Flow **********\n"
- "********** Function: "
- << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** Fixing Irreducible Control Flow **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
bool Changed = false;
auto &MLI = getAnalysis<MachineLoopInfo>();
@@ -284,7 +288,7 @@ bool WebAssemblyFixIrreducibleControlFlow::runOnMachineFunction(
// If we made any changes, completely recompute everything.
if (LLVM_UNLIKELY(Changed)) {
- DEBUG(dbgs() << "Recomputing dominators and loops.\n");
+ LLVM_DEBUG(dbgs() << "Recomputing dominators and loops.\n");
MF.getRegInfo().invalidateLiveness();
MF.RenumberBlocks();
getAnalysis<MachineDominatorTree>().runOnMachineFunction(MF);
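
For context on what this pass rewrites, a minimal source-level example of irreducible control flow (a cycle with two entry points, so it is not a natural loop); illustrative only:

  // Neither label dominates the other, yet together they form a cycle,
  // so MachineLoopInfo records no loop here and the CFG is irreducible.
  int irreducible(int x) {
    int n = 0;
    if (x & 1)
      goto a;
    goto b;
  a:
    ++n;
  b:
    ++n;
    if (n < x)
      goto a; // back edge into the middle of the cycle
    return n;
  }
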
diff --git a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
index 84246052f601..052c94e9d6a9 100644
--- a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains the WebAssembly implementation of
+/// This file contains the WebAssembly implementation of
/// TargetFrameLowering class.
///
/// On WebAssembly, there aren't a lot of things to do here. There are no
@@ -106,29 +106,9 @@ static void writeSPToMemory(unsigned SrcReg, MachineFunction &MF,
const char *ES = "__stack_pointer";
auto *SPSymbol = MF.createExternalSymbolName(ES);
- if (MF.getSubtarget<WebAssemblySubtarget>()
- .getTargetTriple().isOSBinFormatELF()) {
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const TargetRegisterClass *PtrRC =
- MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
- unsigned Zero = MRI.createVirtualRegister(PtrRC);
-
- BuildMI(MBB, InsertAddr, DL, TII->get(WebAssembly::CONST_I32), Zero)
- .addImm(0);
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(MF.getPSVManager().getExternalSymbolCallEntry(ES)),
- MachineMemOperand::MOStore, 4, 4);
- BuildMI(MBB, InsertStore, DL, TII->get(WebAssembly::STORE_I32))
- .addImm(2) // p2align
- .addExternalSymbol(SPSymbol)
- .addReg(Zero)
- .addReg(SrcReg)
- .addMemOperand(MMO);
- } else {
- BuildMI(MBB, InsertStore, DL, TII->get(WebAssembly::SET_GLOBAL_I32))
- .addExternalSymbol(SPSymbol)
- .addReg(SrcReg);
- }
+ BuildMI(MBB, InsertStore, DL, TII->get(WebAssembly::SET_GLOBAL_I32))
+ .addExternalSymbol(SPSymbol)
+ .addReg(SrcReg);
}
MachineBasicBlock::iterator
@@ -172,25 +152,8 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF,
const char *ES = "__stack_pointer";
auto *SPSymbol = MF.createExternalSymbolName(ES);
- if (MF.getSubtarget<WebAssemblySubtarget>()
- .getTargetTriple().isOSBinFormatELF()) {
- unsigned Zero = MRI.createVirtualRegister(PtrRC);
-
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), Zero)
- .addImm(0);
- MachineMemOperand *LoadMMO = MF.getMachineMemOperand(
- MachinePointerInfo(MF.getPSVManager().getExternalSymbolCallEntry(ES)),
- MachineMemOperand::MOLoad, 4, 4);
- // Load the SP value.
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::LOAD_I32), SPReg)
- .addImm(2) // p2align
- .addExternalSymbol(SPSymbol)
- .addReg(Zero) // addr
- .addMemOperand(LoadMMO);
- } else {
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::GET_GLOBAL_I32), SPReg)
- .addExternalSymbol(SPSymbol);
- }
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::GET_GLOBAL_I32), SPReg)
+ .addExternalSymbol(SPSymbol);
bool HasBP = hasBP(MF);
if (HasBP) {
diff --git a/lib/Target/WebAssembly/WebAssemblyFrameLowering.h b/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
index 4cc7f5ae058a..fe23e418a3f1 100644
--- a/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
+++ b/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This class implements WebAssembly-specific bits of
+/// This class implements WebAssembly-specific bits of
/// TargetFrameLowering class.
///
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/WebAssembly/WebAssemblyISD.def b/lib/Target/WebAssembly/WebAssemblyISD.def
index 2f0f106ef5b7..c12550feabbb 100644
--- a/lib/Target/WebAssembly/WebAssemblyISD.def
+++ b/lib/Target/WebAssembly/WebAssemblyISD.def
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file describes the various WebAssembly ISD node types.
+/// This file describes the various WebAssembly ISD node types.
///
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
index 9f40d35689a5..fdf3a30a5c0e 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file defines an instruction selector for the WebAssembly target.
+/// This file defines an instruction selector for the WebAssembly target.
///
//===----------------------------------------------------------------------===//
@@ -68,27 +68,21 @@ private:
} // end anonymous namespace
void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
- // Dump information about the Node being selected.
- DEBUG(errs() << "Selecting: ");
- DEBUG(Node->dump(CurDAG));
- DEBUG(errs() << "\n");
-
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
- DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+ LLVM_DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
Node->setNodeId(-1);
return;
}
- // Few custom selection stuff.
- EVT VT = Node->getValueType(0);
-
+ // A few custom selection cases. If we need WebAssembly-specific selection,
+ // uncomment this block and add corresponding case statements.
+ /*
switch (Node->getOpcode()) {
default:
break;
- // If we need WebAssembly-specific selection, it would go here.
- (void)VT;
}
+ */
// Select the default instruction.
SelectCode(Node);
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 299009fa6674..283e703e1f6c 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements the WebAssemblyTargetLowering class.
+/// This file implements the WebAssemblyTargetLowering class.
///
//===----------------------------------------------------------------------===//
@@ -117,8 +117,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
// As a special case, these operators use the type to mean the type to
// sign-extend from.
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
- if (!Subtarget->hasAtomics()) {
- // The Atomics feature includes signext intructions.
+ if (!Subtarget->hasSignExt()) {
for (auto T : {MVT::i8, MVT::i16, MVT::i32})
setOperationAction(ISD::SIGN_EXTEND_INREG, T, Expand);
}
@@ -152,6 +151,9 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
// Trap lowers to wasm unreachable
setOperationAction(ISD::TRAP, MVT::Other, Legal);
+ // Exception handling intrinsics
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
setMaxAtomicSizeInBitsSupported(64);
}
@@ -427,6 +429,15 @@ bool WebAssemblyTargetLowering::isIntDivCheap(EVT VT,
return true;
}
+EVT WebAssemblyTargetLowering::getSetCCResultType(const DataLayout &DL,
+ LLVMContext &C,
+ EVT VT) const {
+ if (VT.isVector())
+ return VT.changeVectorElementTypeToInteger();
+
+ return TargetLowering::getSetCCResultType(DL, C, VT);
+}
+
//===----------------------------------------------------------------------===//
// WebAssembly Lowering private implementation.
//===----------------------------------------------------------------------===//
@@ -485,6 +496,7 @@ SDValue WebAssemblyTargetLowering::LowerCall(
SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+ unsigned NumFixedArgs = 0;
for (unsigned i = 0; i < Outs.size(); ++i) {
const ISD::OutputArg &Out = Outs[i];
SDValue &OutVal = OutVals[i];
@@ -510,11 +522,11 @@ SDValue WebAssemblyTargetLowering::LowerCall(
/*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
OutVal = FINode;
}
+ // Count the number of fixed args *after* legalization.
+ NumFixedArgs += Out.IsFixed;
}
bool IsVarArg = CLI.IsVarArg;
- unsigned NumFixedArgs = CLI.NumFixedArgs;
-
auto PtrVT = getPointerTy(Layout);
// Analyze operands of the call, assigning locations to each operand.
@@ -738,6 +750,8 @@ SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op,
return LowerFRAMEADDR(Op, DAG);
case ISD::CopyToReg:
return LowerCopyToReg(Op, DAG);
+ case ISD::INTRINSIC_WO_CHAIN:
+ return LowerINTRINSIC_WO_CHAIN(Op, DAG);
}
}
@@ -870,6 +884,21 @@ SDValue WebAssemblyTargetLowering::LowerVASTART(SDValue Op,
MachinePointerInfo(SV), 0);
}
+SDValue
+WebAssemblyTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ SDLoc DL(Op);
+ switch (IntNo) {
+ default:
+ return {}; // Don't custom lower most intrinsics.
+
+ case Intrinsic::wasm_lsda:
+ // TODO For now, just return 0 to avoid crashing
+ return DAG.getConstant(0, DL, Op.getValueType());
+ }
+}
+
//===----------------------------------------------------------------------===//
// WebAssembly Optimization Hooks
//===----------------------------------------------------------------------===//
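
To make the fixed-argument counting above concrete: Outs[i].IsFixed is true only for outgoing values that come from the callee's declared (non-vararg) parameters, so counting after legalization stays correct even when one IR argument is split into several outgoing values. A hypothetical call this affects:

  // Hypothetical varargs callee: only 'fmt' is a fixed parameter.
  extern "C" int wasm_printf(const char *fmt, ...);

  int demo(int a, double b) {
    // In LowerCall, the Outs entry for 'fmt' has IsFixed == true, while the
    // entries produced for 'a' and 'b' have IsFixed == false, so
    // NumFixedArgs == 1 for this call.
    return wasm_printf("%d %f\n", a, b);
  }
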
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index 7bb8e71ab974..79819493ac6a 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file defines the interfaces that WebAssembly uses to lower LLVM
+/// This file defines the interfaces that WebAssembly uses to lower LLVM
/// code into a selection DAG.
///
//===----------------------------------------------------------------------===//
@@ -64,6 +64,9 @@ class WebAssemblyTargetLowering final : public TargetLowering {
bool *Fast) const override;
bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+ EVT VT) const override;
+
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
@@ -90,6 +93,7 @@ class WebAssemblyTargetLowering final : public TargetLowering {
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCopyToReg(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
};
namespace WebAssembly {
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
index a49172df158f..d879932b3232 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief WebAssembly Atomic operand code-gen constructs.
+/// WebAssembly Atomic operand code-gen constructs.
///
//===----------------------------------------------------------------------===//
@@ -17,8 +17,8 @@
//===----------------------------------------------------------------------===//
let Defs = [ARGUMENTS] in {
-def ATOMIC_LOAD_I32 : WebAssemblyLoad<I32, "i32.atomic.load", 0xfe10>;
-def ATOMIC_LOAD_I64 : WebAssemblyLoad<I64, "i64.atomic.load", 0xfe11>;
+defm ATOMIC_LOAD_I32 : WebAssemblyLoad<I32, "i32.atomic.load", 0xfe10>;
+defm ATOMIC_LOAD_I64 : WebAssemblyLoad<I64, "i64.atomic.load", 0xfe11>;
} // Defs = [ARGUMENTS]
// Select loads with no constant offset.
@@ -40,7 +40,6 @@ def : LoadPatGlobalAddr<i64, atomic_load_64, ATOMIC_LOAD_I64>;
def : LoadPatExternalSym<i32, atomic_load_32, ATOMIC_LOAD_I32>;
def : LoadPatExternalSym<i64, atomic_load_64, ATOMIC_LOAD_I64>;
-
// Select loads with just a constant offset.
def : LoadPatOffsetOnly<i32, atomic_load_32, ATOMIC_LOAD_I32>;
def : LoadPatOffsetOnly<i64, atomic_load_64, ATOMIC_LOAD_I64>;
@@ -56,14 +55,14 @@ def : LoadPatExternSymOffOnly<i64, atomic_load_64, ATOMIC_LOAD_I64>;
// Extending loads. Note that there are only zero-extending atomic loads, no
// sign-extending loads.
let Defs = [ARGUMENTS] in {
-def ATOMIC_LOAD8_U_I32 : WebAssemblyLoad<I32, "i32.atomic.load8_u", 0xfe12>;
-def ATOMIC_LOAD16_U_I32 : WebAssemblyLoad<I32, "i32.atomic.load16_u", 0xfe13>;
-def ATOMIC_LOAD8_U_I64 : WebAssemblyLoad<I64, "i64.atomic.load8_u", 0xfe14>;
-def ATOMIC_LOAD16_U_I64 : WebAssemblyLoad<I64, "i64.atomic.load16_u", 0xfe15>;
-def ATOMIC_LOAD32_U_I64 : WebAssemblyLoad<I64, "i64.atomic.load32_u", 0xfe16>;
+defm ATOMIC_LOAD8_U_I32 : WebAssemblyLoad<I32, "i32.atomic.load8_u", 0xfe12>;
+defm ATOMIC_LOAD16_U_I32 : WebAssemblyLoad<I32, "i32.atomic.load16_u", 0xfe13>;
+defm ATOMIC_LOAD8_U_I64 : WebAssemblyLoad<I64, "i64.atomic.load8_u", 0xfe14>;
+defm ATOMIC_LOAD16_U_I64 : WebAssemblyLoad<I64, "i64.atomic.load16_u", 0xfe15>;
+defm ATOMIC_LOAD32_U_I64 : WebAssemblyLoad<I64, "i64.atomic.load32_u", 0xfe16>;
} // Defs = [ARGUMENTS]
-// Fragments for exending loads. These are different from regular loads because
+// Fragments for extending loads. These are different from regular loads because
// the SDNodes are derived from AtomicSDNode rather than LoadSDNode and
// therefore don't have the extension type field. So instead of matching that,
// we match the patterns that the type legalizer expands them to.
@@ -72,10 +71,10 @@ def ATOMIC_LOAD32_U_I64 : WebAssemblyLoad<I64, "i64.atomic.load32_u", 0xfe16>;
// i32 (zext (i8 (atomic_load_8))) gets legalized to
// i32 (and (i32 (atomic_load_8)), 255)
// These can be selected to a single zero-extending atomic load instruction.
-def zext_aload_8 : PatFrag<(ops node:$addr),
- (and (i32 (atomic_load_8 node:$addr)), 255)>;
-def zext_aload_16 : PatFrag<(ops node:$addr),
- (and (i32 (atomic_load_16 node:$addr)), 65535)>;
+def zext_aload_8_32 :
+ PatFrag<(ops node:$addr), (and (i32 (atomic_load_8 node:$addr)), 255)>;
+def zext_aload_16_32 :
+ PatFrag<(ops node:$addr), (and (i32 (atomic_load_16 node:$addr)), 65535)>;
// Unlike regular loads, extension to i64 is handled differently than i32.
// i64 (zext (i8 (atomic_load_8))) gets legalized to
// i64 (and (i64 (anyext (i32 (atomic_load_8)))), 255)
@@ -93,15 +92,15 @@ def zext_aload_32_64 :
// match bare subword loads (for 32-bit results) and anyext loads (for 64-bit
// results) and select a zext load; the next instruction will be sext_inreg
// which is selected by itself.
-def anyext_aload_8_64 :
+def sext_aload_8_64 :
PatFrag<(ops node:$addr), (anyext (i32 (atomic_load_8 node:$addr)))>;
-def anyext_aload_16_64 :
+def sext_aload_16_64 :
PatFrag<(ops node:$addr), (anyext (i32 (atomic_load_16 node:$addr)))>;
let Predicates = [HasAtomics] in {
// Select zero-extending loads with no constant offset.
-def : LoadPatNoOffset<i32, zext_aload_8, ATOMIC_LOAD8_U_I32>;
-def : LoadPatNoOffset<i32, zext_aload_16, ATOMIC_LOAD16_U_I32>;
+def : LoadPatNoOffset<i32, zext_aload_8_32, ATOMIC_LOAD8_U_I32>;
+def : LoadPatNoOffset<i32, zext_aload_16_32, ATOMIC_LOAD16_U_I32>;
def : LoadPatNoOffset<i64, zext_aload_8_64, ATOMIC_LOAD8_U_I64>;
def : LoadPatNoOffset<i64, zext_aload_16_64, ATOMIC_LOAD16_U_I64>;
def : LoadPatNoOffset<i64, zext_aload_32_64, ATOMIC_LOAD32_U_I64>;
@@ -109,16 +108,15 @@ def : LoadPatNoOffset<i64, zext_aload_32_64, ATOMIC_LOAD32_U_I64>;
// Select sign-extending loads with no constant offset
def : LoadPatNoOffset<i32, atomic_load_8, ATOMIC_LOAD8_U_I32>;
def : LoadPatNoOffset<i32, atomic_load_16, ATOMIC_LOAD16_U_I32>;
-def : LoadPatNoOffset<i64, anyext_aload_8_64, ATOMIC_LOAD8_U_I64>;
-def : LoadPatNoOffset<i64, anyext_aload_16_64, ATOMIC_LOAD16_U_I64>;
-// 32->64 sext load gets selected as i32.atomic.load, i64.extend_s/i64
-
+def : LoadPatNoOffset<i64, sext_aload_8_64, ATOMIC_LOAD8_U_I64>;
+def : LoadPatNoOffset<i64, sext_aload_16_64, ATOMIC_LOAD16_U_I64>;
+// 32->64 sext load gets selected as i32.atomic.load, i64.extend_s/i32
// Zero-extending loads with constant offset
-def : LoadPatImmOff<i32, zext_aload_8, regPlusImm, ATOMIC_LOAD8_U_I32>;
-def : LoadPatImmOff<i32, zext_aload_16, regPlusImm, ATOMIC_LOAD16_U_I32>;
-def : LoadPatImmOff<i32, zext_aload_8, or_is_add, ATOMIC_LOAD8_U_I32>;
-def : LoadPatImmOff<i32, zext_aload_16, or_is_add, ATOMIC_LOAD16_U_I32>;
+def : LoadPatImmOff<i32, zext_aload_8_32, regPlusImm, ATOMIC_LOAD8_U_I32>;
+def : LoadPatImmOff<i32, zext_aload_16_32, regPlusImm, ATOMIC_LOAD16_U_I32>;
+def : LoadPatImmOff<i32, zext_aload_8_32, or_is_add, ATOMIC_LOAD8_U_I32>;
+def : LoadPatImmOff<i32, zext_aload_16_32, or_is_add, ATOMIC_LOAD16_U_I32>;
def : LoadPatImmOff<i64, zext_aload_8_64, regPlusImm, ATOMIC_LOAD8_U_I64>;
def : LoadPatImmOff<i64, zext_aload_16_64, regPlusImm, ATOMIC_LOAD16_U_I64>;
def : LoadPatImmOff<i64, zext_aload_32_64, regPlusImm, ATOMIC_LOAD32_U_I64>;
@@ -131,64 +129,62 @@ def : LoadPatImmOff<i32, atomic_load_8, regPlusImm, ATOMIC_LOAD8_U_I32>;
def : LoadPatImmOff<i32, atomic_load_16, regPlusImm, ATOMIC_LOAD16_U_I32>;
def : LoadPatImmOff<i32, atomic_load_8, or_is_add, ATOMIC_LOAD8_U_I32>;
def : LoadPatImmOff<i32, atomic_load_16, or_is_add, ATOMIC_LOAD16_U_I32>;
-def : LoadPatImmOff<i64, anyext_aload_8_64, regPlusImm, ATOMIC_LOAD8_U_I64>;
-def : LoadPatImmOff<i64, anyext_aload_16_64, regPlusImm, ATOMIC_LOAD16_U_I64>;
-def : LoadPatImmOff<i64, anyext_aload_8_64, or_is_add, ATOMIC_LOAD8_U_I64>;
-def : LoadPatImmOff<i64, anyext_aload_16_64, or_is_add, ATOMIC_LOAD16_U_I64>;
+def : LoadPatImmOff<i64, sext_aload_8_64, regPlusImm, ATOMIC_LOAD8_U_I64>;
+def : LoadPatImmOff<i64, sext_aload_16_64, regPlusImm, ATOMIC_LOAD16_U_I64>;
+def : LoadPatImmOff<i64, sext_aload_8_64, or_is_add, ATOMIC_LOAD8_U_I64>;
+def : LoadPatImmOff<i64, sext_aload_16_64, or_is_add, ATOMIC_LOAD16_U_I64>;
// No 32->64 patterns, just use i32.atomic.load and i64.extend_s/i64
-def : LoadPatGlobalAddr<i32, zext_aload_8, ATOMIC_LOAD8_U_I32>;
-def : LoadPatGlobalAddr<i32, zext_aload_16, ATOMIC_LOAD16_U_I32>;
+def : LoadPatGlobalAddr<i32, zext_aload_8_32, ATOMIC_LOAD8_U_I32>;
+def : LoadPatGlobalAddr<i32, zext_aload_16_32, ATOMIC_LOAD16_U_I32>;
def : LoadPatGlobalAddr<i64, zext_aload_8_64, ATOMIC_LOAD8_U_I64>;
def : LoadPatGlobalAddr<i64, zext_aload_16_64, ATOMIC_LOAD16_U_I64>;
def : LoadPatGlobalAddr<i64, zext_aload_32_64, ATOMIC_LOAD32_U_I64>;
def : LoadPatGlobalAddr<i32, atomic_load_8, ATOMIC_LOAD8_U_I32>;
def : LoadPatGlobalAddr<i32, atomic_load_16, ATOMIC_LOAD16_U_I32>;
-def : LoadPatGlobalAddr<i64, anyext_aload_8_64, ATOMIC_LOAD8_U_I64>;
-def : LoadPatGlobalAddr<i64, anyext_aload_16_64, ATOMIC_LOAD16_U_I64>;
+def : LoadPatGlobalAddr<i64, sext_aload_8_64, ATOMIC_LOAD8_U_I64>;
+def : LoadPatGlobalAddr<i64, sext_aload_16_64, ATOMIC_LOAD16_U_I64>;
-def : LoadPatExternalSym<i32, zext_aload_8, ATOMIC_LOAD8_U_I32>;
-def : LoadPatExternalSym<i32, zext_aload_16, ATOMIC_LOAD16_U_I32>;
+def : LoadPatExternalSym<i32, zext_aload_8_32, ATOMIC_LOAD8_U_I32>;
+def : LoadPatExternalSym<i32, zext_aload_16_32, ATOMIC_LOAD16_U_I32>;
def : LoadPatExternalSym<i64, zext_aload_8_64, ATOMIC_LOAD8_U_I64>;
def : LoadPatExternalSym<i64, zext_aload_16_64, ATOMIC_LOAD16_U_I64>;
def : LoadPatExternalSym<i64, zext_aload_32_64, ATOMIC_LOAD32_U_I64>;
def : LoadPatExternalSym<i32, atomic_load_8, ATOMIC_LOAD8_U_I32>;
def : LoadPatExternalSym<i32, atomic_load_16, ATOMIC_LOAD16_U_I32>;
-def : LoadPatExternalSym<i64, anyext_aload_8_64, ATOMIC_LOAD8_U_I64>;
-def : LoadPatExternalSym<i64, anyext_aload_16_64, ATOMIC_LOAD16_U_I64>;
-
+def : LoadPatExternalSym<i64, sext_aload_8_64, ATOMIC_LOAD8_U_I64>;
+def : LoadPatExternalSym<i64, sext_aload_16_64, ATOMIC_LOAD16_U_I64>;
// Extending loads with just a constant offset
-def : LoadPatOffsetOnly<i32, zext_aload_8, ATOMIC_LOAD8_U_I32>;
-def : LoadPatOffsetOnly<i32, zext_aload_16, ATOMIC_LOAD16_U_I32>;
+def : LoadPatOffsetOnly<i32, zext_aload_8_32, ATOMIC_LOAD8_U_I32>;
+def : LoadPatOffsetOnly<i32, zext_aload_16_32, ATOMIC_LOAD16_U_I32>;
def : LoadPatOffsetOnly<i64, zext_aload_8_64, ATOMIC_LOAD8_U_I64>;
def : LoadPatOffsetOnly<i64, zext_aload_16_64, ATOMIC_LOAD16_U_I64>;
def : LoadPatOffsetOnly<i64, zext_aload_32_64, ATOMIC_LOAD32_U_I64>;
def : LoadPatOffsetOnly<i32, atomic_load_8, ATOMIC_LOAD8_U_I32>;
def : LoadPatOffsetOnly<i32, atomic_load_16, ATOMIC_LOAD16_U_I32>;
-def : LoadPatOffsetOnly<i64, anyext_aload_8_64, ATOMIC_LOAD8_U_I64>;
-def : LoadPatOffsetOnly<i64, anyext_aload_16_64, ATOMIC_LOAD16_U_I64>;
+def : LoadPatOffsetOnly<i64, sext_aload_8_64, ATOMIC_LOAD8_U_I64>;
+def : LoadPatOffsetOnly<i64, sext_aload_16_64, ATOMIC_LOAD16_U_I64>;
-def : LoadPatGlobalAddrOffOnly<i32, zext_aload_8, ATOMIC_LOAD8_U_I32>;
-def : LoadPatGlobalAddrOffOnly<i32, zext_aload_16, ATOMIC_LOAD16_U_I32>;
+def : LoadPatGlobalAddrOffOnly<i32, zext_aload_8_32, ATOMIC_LOAD8_U_I32>;
+def : LoadPatGlobalAddrOffOnly<i32, zext_aload_16_32, ATOMIC_LOAD16_U_I32>;
def : LoadPatGlobalAddrOffOnly<i64, zext_aload_8_64, ATOMIC_LOAD8_U_I64>;
def : LoadPatGlobalAddrOffOnly<i64, zext_aload_16_64, ATOMIC_LOAD16_U_I64>;
def : LoadPatGlobalAddrOffOnly<i64, zext_aload_32_64, ATOMIC_LOAD32_U_I64>;
def : LoadPatGlobalAddrOffOnly<i32, atomic_load_8, ATOMIC_LOAD8_U_I32>;
def : LoadPatGlobalAddrOffOnly<i32, atomic_load_16, ATOMIC_LOAD16_U_I32>;
-def : LoadPatGlobalAddrOffOnly<i64, anyext_aload_8_64, ATOMIC_LOAD8_U_I64>;
-def : LoadPatGlobalAddrOffOnly<i64, anyext_aload_16_64, ATOMIC_LOAD16_U_I64>;
+def : LoadPatGlobalAddrOffOnly<i64, sext_aload_8_64, ATOMIC_LOAD8_U_I64>;
+def : LoadPatGlobalAddrOffOnly<i64, sext_aload_16_64, ATOMIC_LOAD16_U_I64>;
-def : LoadPatExternSymOffOnly<i32, zext_aload_8, ATOMIC_LOAD8_U_I32>;
-def : LoadPatExternSymOffOnly<i32, zext_aload_16, ATOMIC_LOAD16_U_I32>;
+def : LoadPatExternSymOffOnly<i32, zext_aload_8_32, ATOMIC_LOAD8_U_I32>;
+def : LoadPatExternSymOffOnly<i32, zext_aload_16_32, ATOMIC_LOAD16_U_I32>;
def : LoadPatExternSymOffOnly<i64, zext_aload_8_64, ATOMIC_LOAD8_U_I64>;
def : LoadPatExternSymOffOnly<i64, zext_aload_16_64, ATOMIC_LOAD16_U_I64>;
def : LoadPatExternSymOffOnly<i64, zext_aload_32_64, ATOMIC_LOAD32_U_I64>;
def : LoadPatExternSymOffOnly<i32, atomic_load_8, ATOMIC_LOAD8_U_I32>;
def : LoadPatExternSymOffOnly<i32, atomic_load_16, ATOMIC_LOAD16_U_I32>;
-def : LoadPatExternSymOffOnly<i64, anyext_aload_8_64, ATOMIC_LOAD8_U_I64>;
-def : LoadPatExternSymOffOnly<i64, anyext_aload_16_64, ATOMIC_LOAD16_U_I64>;
-
+def : LoadPatExternSymOffOnly<i64, sext_aload_8_64, ATOMIC_LOAD8_U_I64>;
+def : LoadPatExternSymOffOnly<i64, sext_aload_16_64, ATOMIC_LOAD16_U_I64>;
} // Predicates = [HasAtomics]
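
A source-level case for the zero-extending fragments above (a sketch, assuming compilation for wasm32 with atomics enabled): widening a relaxed atomic byte load produces the zext that type legalization turns into the 'and ..., 255' shape matched by zext_aload_8_32 and selected to i32.atomic.load8_u.

  #include <atomic>
  #include <cstdint>

  // i32 (zext (i8 (atomic_load_8))) -> i32 (and (i32 (atomic_load_8)), 255)
  uint32_t load_u8(std::atomic<uint8_t> &a) {
    return a.load(std::memory_order_relaxed);
  }
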
@@ -196,19 +192,466 @@ def : LoadPatExternSymOffOnly<i64, anyext_aload_16_64, ATOMIC_LOAD16_U_I64>;
// Atomic stores
//===----------------------------------------------------------------------===//
-// TODO: add atomic stores here...
+let Defs = [ARGUMENTS] in {
+defm ATOMIC_STORE_I32 : WebAssemblyStore<I32, "i32.atomic.store", 0xfe17>;
+defm ATOMIC_STORE_I64 : WebAssemblyStore<I64, "i64.atomic.store", 0xfe18>;
+} // Defs = [ARGUMENTS]
+
+// We need an 'atomic' version of store patterns because store and atomic_store
+// nodes have different operand orders:
+// store: (store $val, $ptr)
+// atomic_store: (store $ptr, $val)
+
+let Predicates = [HasAtomics] in {
+
+// Select stores with no constant offset.
+class AStorePatNoOffset<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(kind I32:$addr, ty:$val), (inst 0, 0, I32:$addr, ty:$val)>;
+def : AStorePatNoOffset<i32, atomic_store_32, ATOMIC_STORE_I32>;
+def : AStorePatNoOffset<i64, atomic_store_64, ATOMIC_STORE_I64>;
+
+// Select stores with a constant offset.
+
+// Pattern with address + immediate offset
+class AStorePatImmOff<ValueType ty, PatFrag kind, PatFrag operand, NI inst> :
+ Pat<(kind (operand I32:$addr, imm:$off), ty:$val),
+ (inst 0, imm:$off, I32:$addr, ty:$val)>;
+def : AStorePatImmOff<i32, atomic_store_32, regPlusImm, ATOMIC_STORE_I32>;
+def : AStorePatImmOff<i64, atomic_store_64, regPlusImm, ATOMIC_STORE_I64>;
+def : AStorePatImmOff<i32, atomic_store_32, or_is_add, ATOMIC_STORE_I32>;
+def : AStorePatImmOff<i64, atomic_store_64, or_is_add, ATOMIC_STORE_I64>;
+
+class AStorePatGlobalAddr<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(kind (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off)),
+ ty:$val),
+ (inst 0, tglobaladdr:$off, I32:$addr, ty:$val)>;
+def : AStorePatGlobalAddr<i32, atomic_store_32, ATOMIC_STORE_I32>;
+def : AStorePatGlobalAddr<i64, atomic_store_64, ATOMIC_STORE_I64>;
+
+class AStorePatExternalSym<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(kind (add I32:$addr, (WebAssemblywrapper texternalsym:$off)), ty:$val),
+ (inst 0, texternalsym:$off, I32:$addr, ty:$val)>;
+def : AStorePatExternalSym<i32, atomic_store_32, ATOMIC_STORE_I32>;
+def : AStorePatExternalSym<i64, atomic_store_64, ATOMIC_STORE_I64>;
+
+// Select stores with just a constant offset.
+class AStorePatOffsetOnly<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(kind imm:$off, ty:$val), (inst 0, imm:$off, (CONST_I32 0), ty:$val)>;
+def : AStorePatOffsetOnly<i32, atomic_store_32, ATOMIC_STORE_I32>;
+def : AStorePatOffsetOnly<i64, atomic_store_64, ATOMIC_STORE_I64>;
+
+class AStorePatGlobalAddrOffOnly<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(kind (WebAssemblywrapper tglobaladdr:$off), ty:$val),
+ (inst 0, tglobaladdr:$off, (CONST_I32 0), ty:$val)>;
+def : AStorePatGlobalAddrOffOnly<i32, atomic_store_32, ATOMIC_STORE_I32>;
+def : AStorePatGlobalAddrOffOnly<i64, atomic_store_64, ATOMIC_STORE_I64>;
+
+class AStorePatExternSymOffOnly<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(kind (WebAssemblywrapper texternalsym:$off), ty:$val),
+ (inst 0, texternalsym:$off, (CONST_I32 0), ty:$val)>;
+def : AStorePatExternSymOffOnly<i32, atomic_store_32, ATOMIC_STORE_I32>;
+def : AStorePatExternSymOffOnly<i64, atomic_store_64, ATOMIC_STORE_I64>;
+
+} // Predicates = [HasAtomics]
+
+// Truncating stores.
+let Defs = [ARGUMENTS] in {
+defm ATOMIC_STORE8_I32 : WebAssemblyStore<I32, "i32.atomic.store8", 0xfe19>;
+defm ATOMIC_STORE16_I32 : WebAssemblyStore<I32, "i32.atomic.store16", 0xfe1a>;
+defm ATOMIC_STORE8_I64 : WebAssemblyStore<I64, "i64.atomic.store8", 0xfe1b>;
+defm ATOMIC_STORE16_I64 : WebAssemblyStore<I64, "i64.atomic.store16", 0xfe1c>;
+defm ATOMIC_STORE32_I64 : WebAssemblyStore<I64, "i64.atomic.store32", 0xfe1d>;
+} // Defs = [ARGUMENTS]
+
+// Fragments for truncating stores.
+
+// We don't have single truncating atomic store instructions. For 32-bit
+// instructions, we just need to match bare atomic stores. On the other hand,
+// truncating stores from i64 values are first truncated to i32.
+class trunc_astore_64<PatFrag kind> :
+ PatFrag<(ops node:$addr, node:$val),
+ (kind node:$addr, (i32 (trunc (i64 node:$val))))>;
+def trunc_astore_8_64 : trunc_astore_64<atomic_store_8>;
+def trunc_astore_16_64 : trunc_astore_64<atomic_store_16>;
+def trunc_astore_32_64 : trunc_astore_64<atomic_store_32>;
+
+let Predicates = [HasAtomics] in {
+
+// Truncating stores with no constant offset
+def : AStorePatNoOffset<i32, atomic_store_8, ATOMIC_STORE8_I32>;
+def : AStorePatNoOffset<i32, atomic_store_16, ATOMIC_STORE16_I32>;
+def : AStorePatNoOffset<i64, trunc_astore_8_64, ATOMIC_STORE8_I64>;
+def : AStorePatNoOffset<i64, trunc_astore_16_64, ATOMIC_STORE16_I64>;
+def : AStorePatNoOffset<i64, trunc_astore_32_64, ATOMIC_STORE32_I64>;
+
+// Truncating stores with a constant offset
+def : AStorePatImmOff<i32, atomic_store_8, regPlusImm, ATOMIC_STORE8_I32>;
+def : AStorePatImmOff<i32, atomic_store_16, regPlusImm, ATOMIC_STORE16_I32>;
+def : AStorePatImmOff<i64, trunc_astore_8_64, regPlusImm, ATOMIC_STORE8_I64>;
+def : AStorePatImmOff<i64, trunc_astore_16_64, regPlusImm, ATOMIC_STORE16_I64>;
+def : AStorePatImmOff<i64, trunc_astore_32_64, regPlusImm, ATOMIC_STORE32_I64>;
+def : AStorePatImmOff<i32, atomic_store_8, or_is_add, ATOMIC_STORE8_I32>;
+def : AStorePatImmOff<i32, atomic_store_16, or_is_add, ATOMIC_STORE16_I32>;
+def : AStorePatImmOff<i64, trunc_astore_8_64, or_is_add, ATOMIC_STORE8_I64>;
+def : AStorePatImmOff<i64, trunc_astore_16_64, or_is_add, ATOMIC_STORE16_I64>;
+def : AStorePatImmOff<i64, trunc_astore_32_64, or_is_add, ATOMIC_STORE32_I64>;
+
+def : AStorePatGlobalAddr<i32, atomic_store_8, ATOMIC_STORE8_I32>;
+def : AStorePatGlobalAddr<i32, atomic_store_16, ATOMIC_STORE16_I32>;
+def : AStorePatGlobalAddr<i64, trunc_astore_8_64, ATOMIC_STORE8_I64>;
+def : AStorePatGlobalAddr<i64, trunc_astore_16_64, ATOMIC_STORE16_I64>;
+def : AStorePatGlobalAddr<i64, trunc_astore_32_64, ATOMIC_STORE32_I64>;
+
+def : AStorePatExternalSym<i32, atomic_store_8, ATOMIC_STORE8_I32>;
+def : AStorePatExternalSym<i32, atomic_store_16, ATOMIC_STORE16_I32>;
+def : AStorePatExternalSym<i64, trunc_astore_8_64, ATOMIC_STORE8_I64>;
+def : AStorePatExternalSym<i64, trunc_astore_16_64, ATOMIC_STORE16_I64>;
+def : AStorePatExternalSym<i64, trunc_astore_32_64, ATOMIC_STORE32_I64>;
+
+// Truncating stores with just a constant offset
+def : AStorePatOffsetOnly<i32, atomic_store_8, ATOMIC_STORE8_I32>;
+def : AStorePatOffsetOnly<i32, atomic_store_16, ATOMIC_STORE16_I32>;
+def : AStorePatOffsetOnly<i64, trunc_astore_8_64, ATOMIC_STORE8_I64>;
+def : AStorePatOffsetOnly<i64, trunc_astore_16_64, ATOMIC_STORE16_I64>;
+def : AStorePatOffsetOnly<i64, trunc_astore_32_64, ATOMIC_STORE32_I64>;
+
+def : AStorePatGlobalAddrOffOnly<i32, atomic_store_8, ATOMIC_STORE8_I32>;
+def : AStorePatGlobalAddrOffOnly<i32, atomic_store_16, ATOMIC_STORE16_I32>;
+def : AStorePatGlobalAddrOffOnly<i64, trunc_astore_8_64, ATOMIC_STORE8_I64>;
+def : AStorePatGlobalAddrOffOnly<i64, trunc_astore_16_64, ATOMIC_STORE16_I64>;
+def : AStorePatGlobalAddrOffOnly<i64, trunc_astore_32_64, ATOMIC_STORE32_I64>;
+
+def : AStorePatExternSymOffOnly<i32, atomic_store_8, ATOMIC_STORE8_I32>;
+def : AStorePatExternSymOffOnly<i32, atomic_store_16, ATOMIC_STORE16_I32>;
+def : AStorePatExternSymOffOnly<i64, trunc_astore_8_64, ATOMIC_STORE8_I64>;
+def : AStorePatExternSymOffOnly<i64, trunc_astore_16_64, ATOMIC_STORE16_I64>;
+def : AStorePatExternSymOffOnly<i64, trunc_astore_32_64, ATOMIC_STORE32_I64>;
+
+} // Predicates = [HasAtomics]
//===----------------------------------------------------------------------===//
-// Low-level exclusive operations
+// Atomic binary read-modify-writes
//===----------------------------------------------------------------------===//
-// TODO: add exclusive operations here...
+let Defs = [ARGUMENTS] in {
+
+multiclass WebAssemblyBinRMW<WebAssemblyRegClass rc, string Name, int Opcode> {
+ defm "" : I<(outs rc:$dst),
+ (ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$val),
+ (outs), (ins P2Align:$p2align, offset32_op:$off), [],
+ !strconcat(Name, "\t$dst, ${off}(${addr})${p2align}, $val"),
+ !strconcat(Name, "\t${off}, ${p2align}"), Opcode>;
+}
+
+defm ATOMIC_RMW_ADD_I32 : WebAssemblyBinRMW<I32, "i32.atomic.rmw.add", 0xfe1e>;
+defm ATOMIC_RMW_ADD_I64 : WebAssemblyBinRMW<I64, "i64.atomic.rmw.add", 0xfe1f>;
+defm ATOMIC_RMW8_U_ADD_I32 :
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw8_u.add", 0xfe20>;
+defm ATOMIC_RMW16_U_ADD_I32 :
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw16_u.add", 0xfe21>;
+defm ATOMIC_RMW8_U_ADD_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw8_u.add", 0xfe22>;
+defm ATOMIC_RMW16_U_ADD_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw16_u.add", 0xfe23>;
+defm ATOMIC_RMW32_U_ADD_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw32_u.add", 0xfe24>;
+
+defm ATOMIC_RMW_SUB_I32 : WebAssemblyBinRMW<I32, "i32.atomic.rmw.sub", 0xfe25>;
+defm ATOMIC_RMW_SUB_I64 : WebAssemblyBinRMW<I64, "i64.atomic.rmw.sub", 0xfe26>;
+defm ATOMIC_RMW8_U_SUB_I32 :
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw8_u.sub", 0xfe27>;
+defm ATOMIC_RMW16_U_SUB_I32 :
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw16_u.sub", 0xfe28>;
+defm ATOMIC_RMW8_U_SUB_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw8_u.sub", 0xfe29>;
+defm ATOMIC_RMW16_U_SUB_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw16_u.sub", 0xfe2a>;
+defm ATOMIC_RMW32_U_SUB_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw32_u.sub", 0xfe2b>;
+
+defm ATOMIC_RMW_AND_I32 : WebAssemblyBinRMW<I32, "i32.atomic.rmw.and", 0xfe2c>;
+defm ATOMIC_RMW_AND_I64 : WebAssemblyBinRMW<I64, "i64.atomic.rmw.and", 0xfe2d>;
+defm ATOMIC_RMW8_U_AND_I32 :
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw8_u.and", 0xfe2e>;
+defm ATOMIC_RMW16_U_AND_I32 :
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw16_u.and", 0xfe2f>;
+defm ATOMIC_RMW8_U_AND_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw8_u.and", 0xfe30>;
+defm ATOMIC_RMW16_U_AND_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw16_u.and", 0xfe31>;
+defm ATOMIC_RMW32_U_AND_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw32_u.and", 0xfe32>;
+
+defm ATOMIC_RMW_OR_I32 : WebAssemblyBinRMW<I32, "i32.atomic.rmw.or", 0xfe33>;
+defm ATOMIC_RMW_OR_I64 : WebAssemblyBinRMW<I64, "i64.atomic.rmw.or", 0xfe34>;
+defm ATOMIC_RMW8_U_OR_I32 :
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw8_u.or", 0xfe35>;
+defm ATOMIC_RMW16_U_OR_I32 :
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw16_u.or", 0xfe36>;
+defm ATOMIC_RMW8_U_OR_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw8_u.or", 0xfe37>;
+defm ATOMIC_RMW16_U_OR_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw16_u.or", 0xfe38>;
+defm ATOMIC_RMW32_U_OR_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw32_u.or", 0xfe39>;
+
+defm ATOMIC_RMW_XOR_I32 : WebAssemblyBinRMW<I32, "i32.atomic.rmw.xor", 0xfe3a>;
+defm ATOMIC_RMW_XOR_I64 : WebAssemblyBinRMW<I64, "i64.atomic.rmw.xor", 0xfe3b>;
+defm ATOMIC_RMW8_U_XOR_I32 :
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw8_u.xor", 0xfe3c>;
+defm ATOMIC_RMW16_U_XOR_I32 :
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw16_u.xor", 0xfe3d>;
+defm ATOMIC_RMW8_U_XOR_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw8_u.xor", 0xfe3e>;
+defm ATOMIC_RMW16_U_XOR_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw16_u.xor", 0xfe3f>;
+defm ATOMIC_RMW32_U_XOR_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw32_u.xor", 0xfe40>;
+
+defm ATOMIC_RMW_XCHG_I32 :
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw.xchg", 0xfe41>;
+defm ATOMIC_RMW_XCHG_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw.xchg", 0xfe42>;
+defm ATOMIC_RMW8_U_XCHG_I32 :
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw8_u.xchg", 0xfe43>;
+defm ATOMIC_RMW16_U_XCHG_I32 :
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw16_u.xchg", 0xfe44>;
+defm ATOMIC_RMW8_U_XCHG_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw8_u.xchg", 0xfe45>;
+defm ATOMIC_RMW16_U_XCHG_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw16_u.xchg", 0xfe46>;
+defm ATOMIC_RMW32_U_XCHG_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw32_u.xchg", 0xfe47>;
+}
+
+// Select binary RMWs with no constant offset.
+class BinRMWPatNoOffset<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(ty (kind I32:$addr, ty:$val)), (inst 0, 0, I32:$addr, ty:$val)>;
+
+// Select binary RMWs with a constant offset.
+
+// Pattern with address + immediate offset
+class BinRMWPatImmOff<ValueType ty, PatFrag kind, PatFrag operand, NI inst> :
+ Pat<(ty (kind (operand I32:$addr, imm:$off), ty:$val)),
+ (inst 0, imm:$off, I32:$addr, ty:$val)>;
+
+class BinRMWPatGlobalAddr<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(ty (kind (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off)),
+ ty:$val)),
+ (inst 0, tglobaladdr:$off, I32:$addr, ty:$val)>;
+
+class BinRMWPatExternalSym<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(ty (kind (add I32:$addr, (WebAssemblywrapper texternalsym:$off)),
+ ty:$val)),
+ (inst 0, texternalsym:$off, I32:$addr, ty:$val)>;
+
+// Select binary RMWs with just a constant offset.
+class BinRMWPatOffsetOnly<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(ty (kind imm:$off, ty:$val)),
+ (inst 0, imm:$off, (CONST_I32 0), ty:$val)>;
+
+class BinRMWPatGlobalAddrOffOnly<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(ty (kind (WebAssemblywrapper tglobaladdr:$off), ty:$val)),
+ (inst 0, tglobaladdr:$off, (CONST_I32 0), ty:$val)>;
-// Load-exclusives.
+class BinRMWPatExternSymOffOnly<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(ty (kind (WebAssemblywrapper texternalsym:$off), ty:$val)),
+ (inst 0, texternalsym:$off, (CONST_I32 0), ty:$val)>;
-// Store-exclusives.
+// Patterns for various addressing modes.
+multiclass BinRMWPattern<PatFrag rmw_32, PatFrag rmw_64, NI inst_32,
+ NI inst_64> {
+ def : BinRMWPatNoOffset<i32, rmw_32, inst_32>;
+ def : BinRMWPatNoOffset<i64, rmw_64, inst_64>;
-// Store-release-exclusives.
+ def : BinRMWPatImmOff<i32, rmw_32, regPlusImm, inst_32>;
+ def : BinRMWPatImmOff<i64, rmw_64, regPlusImm, inst_64>;
+ def : BinRMWPatImmOff<i32, rmw_32, or_is_add, inst_32>;
+ def : BinRMWPatImmOff<i64, rmw_64, or_is_add, inst_64>;
-// And clear exclusive.
+ def : BinRMWPatGlobalAddr<i32, rmw_32, inst_32>;
+ def : BinRMWPatGlobalAddr<i64, rmw_64, inst_64>;
+ def : BinRMWPatExternalSym<i32, rmw_32, inst_32>;
+ def : BinRMWPatExternalSym<i64, rmw_64, inst_64>;
+
+ def : BinRMWPatOffsetOnly<i32, rmw_32, inst_32>;
+ def : BinRMWPatOffsetOnly<i64, rmw_64, inst_64>;
+
+ def : BinRMWPatGlobalAddrOffOnly<i32, rmw_32, inst_32>;
+ def : BinRMWPatGlobalAddrOffOnly<i64, rmw_64, inst_64>;
+
+ def : BinRMWPatExternSymOffOnly<i32, rmw_32, inst_32>;
+ def : BinRMWPatExternSymOffOnly<i64, rmw_64, inst_64>;
+}
+
+let Predicates = [HasAtomics] in {
+defm : BinRMWPattern<atomic_load_add_32, atomic_load_add_64, ATOMIC_RMW_ADD_I32,
+ ATOMIC_RMW_ADD_I64>;
+defm : BinRMWPattern<atomic_load_sub_32, atomic_load_sub_64, ATOMIC_RMW_SUB_I32,
+ ATOMIC_RMW_SUB_I64>;
+defm : BinRMWPattern<atomic_load_and_32, atomic_load_and_64, ATOMIC_RMW_AND_I32,
+ ATOMIC_RMW_AND_I64>;
+defm : BinRMWPattern<atomic_load_or_32, atomic_load_or_64, ATOMIC_RMW_OR_I32,
+ ATOMIC_RMW_OR_I64>;
+defm : BinRMWPattern<atomic_load_xor_32, atomic_load_xor_64, ATOMIC_RMW_XOR_I32,
+ ATOMIC_RMW_XOR_I64>;
+defm : BinRMWPattern<atomic_swap_32, atomic_swap_64, ATOMIC_RMW_XCHG_I32,
+ ATOMIC_RMW_XCHG_I64>;
+} // Predicates = [HasAtomics]
+
+// Truncating & zero-extending binary RMW patterns.
+// These are combined patterns of truncating store patterns and zero-extending
+// load patterns above.
+class zext_bin_rmw_8_32<PatFrag kind> :
+ PatFrag<(ops node:$addr, node:$val),
+ (and (i32 (kind node:$addr, node:$val)), 255)>;
+class zext_bin_rmw_16_32<PatFrag kind> :
+ PatFrag<(ops node:$addr, node:$val),
+ (and (i32 (kind node:$addr, node:$val)), 65535)>;
+class zext_bin_rmw_8_64<PatFrag kind> :
+ PatFrag<(ops node:$addr, node:$val),
+ (and (i64 (anyext (i32 (kind node:$addr,
+ (i32 (trunc (i64 node:$val))))))), 255)>;
+class zext_bin_rmw_16_64<PatFrag kind> :
+ PatFrag<(ops node:$addr, node:$val),
+ (and (i64 (anyext (i32 (kind node:$addr,
+ (i32 (trunc (i64 node:$val))))))), 65535)>;
+class zext_bin_rmw_32_64<PatFrag kind> :
+ PatFrag<(ops node:$addr, node:$val),
+ (zext (i32 (kind node:$addr, (i32 (trunc (i64 node:$val))))))>;
+
+// Truncating & sign-extending binary RMW patterns.
+// These are combined patterns of truncating store patterns and sign-extending
+// load patterns above. We match subword RMWs (for 32-bit) and anyext RMWs (for
+// 64-bit) and select a zext RMW; the next instruction will be sext_inreg which
+// is selected by itself.
+class sext_bin_rmw_8_32<PatFrag kind> :
+ PatFrag<(ops node:$addr, node:$val), (kind node:$addr, node:$val)>;
+class sext_bin_rmw_16_32<PatFrag kind> : sext_bin_rmw_8_32<kind>;
+class sext_bin_rmw_8_64<PatFrag kind> :
+ PatFrag<(ops node:$addr, node:$val),
+ (anyext (i32 (kind node:$addr, (i32 (trunc (i64 node:$val))))))>;
+class sext_bin_rmw_16_64<PatFrag kind> : sext_bin_rmw_8_64<kind>;
+// 32->64 sext RMW gets selected as i32.atomic.rmw.***, i64.extend_s/i32
+
+// Patterns for various addressing modes for truncating-extending binary RMWs.
+multiclass BinRMWTruncExtPattern<
+ PatFrag rmw_8, PatFrag rmw_16, PatFrag rmw_32, PatFrag rmw_64,
+ NI inst8_32, NI inst16_32, NI inst8_64, NI inst16_64, NI inst32_64> {
+ // Truncating-extending binary RMWs with no constant offset
+ def : BinRMWPatNoOffset<i32, zext_bin_rmw_8_32<rmw_8>, inst8_32>;
+ def : BinRMWPatNoOffset<i32, zext_bin_rmw_16_32<rmw_16>, inst16_32>;
+ def : BinRMWPatNoOffset<i64, zext_bin_rmw_8_64<rmw_8>, inst8_64>;
+ def : BinRMWPatNoOffset<i64, zext_bin_rmw_16_64<rmw_16>, inst16_64>;
+ def : BinRMWPatNoOffset<i64, zext_bin_rmw_32_64<rmw_32>, inst32_64>;
+
+ def : BinRMWPatNoOffset<i32, sext_bin_rmw_8_32<rmw_8>, inst8_32>;
+ def : BinRMWPatNoOffset<i32, sext_bin_rmw_16_32<rmw_16>, inst16_32>;
+ def : BinRMWPatNoOffset<i64, sext_bin_rmw_8_64<rmw_8>, inst8_64>;
+ def : BinRMWPatNoOffset<i64, sext_bin_rmw_16_64<rmw_16>, inst16_64>;
+
+ // Truncating-extending binary RMWs with a constant offset
+ def : BinRMWPatImmOff<i32, zext_bin_rmw_8_32<rmw_8>, regPlusImm, inst8_32>;
+ def : BinRMWPatImmOff<i32, zext_bin_rmw_16_32<rmw_16>, regPlusImm, inst16_32>;
+ def : BinRMWPatImmOff<i64, zext_bin_rmw_8_64<rmw_8>, regPlusImm, inst8_64>;
+ def : BinRMWPatImmOff<i64, zext_bin_rmw_16_64<rmw_16>, regPlusImm, inst16_64>;
+ def : BinRMWPatImmOff<i64, zext_bin_rmw_32_64<rmw_32>, regPlusImm, inst32_64>;
+ def : BinRMWPatImmOff<i32, zext_bin_rmw_8_32<rmw_8>, or_is_add, inst8_32>;
+ def : BinRMWPatImmOff<i32, zext_bin_rmw_16_32<rmw_16>, or_is_add, inst16_32>;
+ def : BinRMWPatImmOff<i64, zext_bin_rmw_8_64<rmw_8>, or_is_add, inst8_64>;
+ def : BinRMWPatImmOff<i64, zext_bin_rmw_16_64<rmw_16>, or_is_add, inst16_64>;
+ def : BinRMWPatImmOff<i64, zext_bin_rmw_32_64<rmw_32>, or_is_add, inst32_64>;
+
+ def : BinRMWPatImmOff<i32, sext_bin_rmw_8_32<rmw_8>, regPlusImm, inst8_32>;
+ def : BinRMWPatImmOff<i32, sext_bin_rmw_16_32<rmw_16>, regPlusImm, inst16_32>;
+ def : BinRMWPatImmOff<i64, sext_bin_rmw_8_64<rmw_8>, regPlusImm, inst8_64>;
+ def : BinRMWPatImmOff<i64, sext_bin_rmw_16_64<rmw_16>, regPlusImm, inst16_64>;
+ def : BinRMWPatImmOff<i32, sext_bin_rmw_8_32<rmw_8>, or_is_add, inst8_32>;
+ def : BinRMWPatImmOff<i32, sext_bin_rmw_16_32<rmw_16>, or_is_add, inst16_32>;
+ def : BinRMWPatImmOff<i64, sext_bin_rmw_8_64<rmw_8>, or_is_add, inst8_64>;
+ def : BinRMWPatImmOff<i64, sext_bin_rmw_16_64<rmw_16>, or_is_add, inst16_64>;
+
+ def : BinRMWPatGlobalAddr<i32, zext_bin_rmw_8_32<rmw_8>, inst8_32>;
+ def : BinRMWPatGlobalAddr<i32, zext_bin_rmw_16_32<rmw_16>, inst16_32>;
+ def : BinRMWPatGlobalAddr<i64, zext_bin_rmw_8_64<rmw_8>, inst8_64>;
+ def : BinRMWPatGlobalAddr<i64, zext_bin_rmw_16_64<rmw_16>, inst16_64>;
+ def : BinRMWPatGlobalAddr<i64, zext_bin_rmw_32_64<rmw_32>, inst32_64>;
+
+ def : BinRMWPatGlobalAddr<i32, sext_bin_rmw_8_32<rmw_8>, inst8_32>;
+ def : BinRMWPatGlobalAddr<i32, sext_bin_rmw_16_32<rmw_16>, inst16_32>;
+ def : BinRMWPatGlobalAddr<i64, sext_bin_rmw_8_64<rmw_8>, inst8_64>;
+ def : BinRMWPatGlobalAddr<i64, sext_bin_rmw_16_64<rmw_16>, inst16_64>;
+
+ def : BinRMWPatExternalSym<i32, zext_bin_rmw_8_32<rmw_8>, inst8_32>;
+ def : BinRMWPatExternalSym<i32, zext_bin_rmw_16_32<rmw_16>, inst16_32>;
+ def : BinRMWPatExternalSym<i64, zext_bin_rmw_8_64<rmw_8>, inst8_64>;
+ def : BinRMWPatExternalSym<i64, zext_bin_rmw_16_64<rmw_16>, inst16_64>;
+ def : BinRMWPatExternalSym<i64, zext_bin_rmw_32_64<rmw_32>, inst32_64>;
+
+ def : BinRMWPatExternalSym<i32, sext_bin_rmw_8_32<rmw_8>, inst8_32>;
+ def : BinRMWPatExternalSym<i32, sext_bin_rmw_16_32<rmw_16>, inst16_32>;
+ def : BinRMWPatExternalSym<i64, sext_bin_rmw_8_64<rmw_8>, inst8_64>;
+ def : BinRMWPatExternalSym<i64, sext_bin_rmw_16_64<rmw_16>, inst16_64>;
+
+ // Truncating-extending binary RMWs with just a constant offset
+ def : BinRMWPatOffsetOnly<i32, zext_bin_rmw_8_32<rmw_8>, inst8_32>;
+ def : BinRMWPatOffsetOnly<i32, zext_bin_rmw_16_32<rmw_16>, inst16_32>;
+ def : BinRMWPatOffsetOnly<i64, zext_bin_rmw_8_64<rmw_8>, inst8_64>;
+ def : BinRMWPatOffsetOnly<i64, zext_bin_rmw_16_64<rmw_16>, inst16_64>;
+ def : BinRMWPatOffsetOnly<i64, zext_bin_rmw_32_64<rmw_32>, inst32_64>;
+
+ def : BinRMWPatOffsetOnly<i32, sext_bin_rmw_8_32<rmw_8>, inst8_32>;
+ def : BinRMWPatOffsetOnly<i32, sext_bin_rmw_16_32<rmw_16>, inst16_32>;
+ def : BinRMWPatOffsetOnly<i64, sext_bin_rmw_8_64<rmw_8>, inst8_64>;
+ def : BinRMWPatOffsetOnly<i64, sext_bin_rmw_16_64<rmw_16>, inst16_64>;
+
+ def : BinRMWPatGlobalAddrOffOnly<i32, zext_bin_rmw_8_32<rmw_8>, inst8_32>;
+ def : BinRMWPatGlobalAddrOffOnly<i32, zext_bin_rmw_16_32<rmw_16>, inst16_32>;
+ def : BinRMWPatGlobalAddrOffOnly<i64, zext_bin_rmw_8_64<rmw_8>, inst8_64>;
+ def : BinRMWPatGlobalAddrOffOnly<i64, zext_bin_rmw_16_64<rmw_16>, inst16_64>;
+ def : BinRMWPatGlobalAddrOffOnly<i64, zext_bin_rmw_32_64<rmw_32>, inst32_64>;
+
+ def : BinRMWPatGlobalAddrOffOnly<i32, sext_bin_rmw_8_32<rmw_8>, inst8_32>;
+ def : BinRMWPatGlobalAddrOffOnly<i32, sext_bin_rmw_16_32<rmw_16>, inst16_32>;
+ def : BinRMWPatGlobalAddrOffOnly<i64, sext_bin_rmw_8_64<rmw_8>, inst8_64>;
+ def : BinRMWPatGlobalAddrOffOnly<i64, sext_bin_rmw_16_64<rmw_16>, inst16_64>;
+
+ def : BinRMWPatExternSymOffOnly<i32, zext_bin_rmw_8_32<rmw_8>, inst8_32>;
+ def : BinRMWPatExternSymOffOnly<i32, zext_bin_rmw_16_32<rmw_16>, inst16_32>;
+ def : BinRMWPatExternSymOffOnly<i64, zext_bin_rmw_8_64<rmw_8>, inst8_64>;
+ def : BinRMWPatExternSymOffOnly<i64, zext_bin_rmw_16_64<rmw_16>, inst16_64>;
+ def : BinRMWPatExternSymOffOnly<i64, zext_bin_rmw_32_64<rmw_32>, inst32_64>;
+
+ def : BinRMWPatExternSymOffOnly<i32, sext_bin_rmw_8_32<rmw_8>, inst8_32>;
+ def : BinRMWPatExternSymOffOnly<i32, sext_bin_rmw_16_32<rmw_16>, inst16_32>;
+ def : BinRMWPatExternSymOffOnly<i64, sext_bin_rmw_8_64<rmw_8>, inst8_64>;
+ def : BinRMWPatExternSymOffOnly<i64, sext_bin_rmw_16_64<rmw_16>, inst16_64>;
+}
+
+let Predicates = [HasAtomics] in {
+defm : BinRMWTruncExtPattern<
+ atomic_load_add_8, atomic_load_add_16, atomic_load_add_32, atomic_load_add_64,
+ ATOMIC_RMW8_U_ADD_I32, ATOMIC_RMW16_U_ADD_I32,
+ ATOMIC_RMW8_U_ADD_I64, ATOMIC_RMW16_U_ADD_I64, ATOMIC_RMW32_U_ADD_I64>;
+defm : BinRMWTruncExtPattern<
+ atomic_load_sub_8, atomic_load_sub_16, atomic_load_sub_32, atomic_load_sub_64,
+ ATOMIC_RMW8_U_SUB_I32, ATOMIC_RMW16_U_SUB_I32,
+ ATOMIC_RMW8_U_SUB_I64, ATOMIC_RMW16_U_SUB_I64, ATOMIC_RMW32_U_SUB_I64>;
+defm : BinRMWTruncExtPattern<
+ atomic_load_and_8, atomic_load_and_16, atomic_load_and_32, atomic_load_and_64,
+ ATOMIC_RMW8_U_AND_I32, ATOMIC_RMW16_U_AND_I32,
+ ATOMIC_RMW8_U_AND_I64, ATOMIC_RMW16_U_AND_I64, ATOMIC_RMW32_U_AND_I64>;
+defm : BinRMWTruncExtPattern<
+ atomic_load_or_8, atomic_load_or_16, atomic_load_or_32, atomic_load_or_64,
+ ATOMIC_RMW8_U_OR_I32, ATOMIC_RMW16_U_OR_I32,
+ ATOMIC_RMW8_U_OR_I64, ATOMIC_RMW16_U_OR_I64, ATOMIC_RMW32_U_OR_I64>;
+defm : BinRMWTruncExtPattern<
+ atomic_load_xor_8, atomic_load_xor_16, atomic_load_xor_32, atomic_load_xor_64,
+ ATOMIC_RMW8_U_XOR_I32, ATOMIC_RMW16_U_XOR_I32,
+ ATOMIC_RMW8_U_XOR_I64, ATOMIC_RMW16_U_XOR_I64, ATOMIC_RMW32_U_XOR_I64>;
+defm : BinRMWTruncExtPattern<
+ atomic_swap_8, atomic_swap_16, atomic_swap_32, atomic_swap_64,
+ ATOMIC_RMW8_U_XCHG_I32, ATOMIC_RMW16_U_XCHG_I32,
+ ATOMIC_RMW8_U_XCHG_I64, ATOMIC_RMW16_U_XCHG_I64, ATOMIC_RMW32_U_XCHG_I64>;
+} // Predicates = [HasAtomics]
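For orientation, a sketch (not lines from the patch) of what one of the instantiations above amounts to: in the first BinRMWTruncExtPattern use, rmw_8 is atomic_load_add_8 and inst8_32 is ATOMIC_RMW8_U_ADD_I32, so the BinRMWPatGlobalAddr line for the zero-extending 8-to-32-bit case becomes, after substitution,

  def : BinRMWPatGlobalAddr<i32, zext_bin_rmw_8_32<atomic_load_add_8>,
                            ATOMIC_RMW8_U_ADD_I32>;

that is, roughly, an i8 atomic add addressed through a global whose result is consumed zero-extended to i32 is selected to the unsigned 8-bit RMW-add instruction; the remaining lines do the same for the other widths, extensions, and addressing modes.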
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/lib/Target/WebAssembly/WebAssemblyInstrCall.td
index 6b45839c14b0..34262752430c 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrCall.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrCall.td
@@ -8,89 +8,111 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief WebAssembly Call operand code-gen constructs.
+/// WebAssembly Call operand code-gen constructs.
///
//===----------------------------------------------------------------------===//
// TODO: addr64: These currently assume the callee address is 32-bit.
+// FIXME: add $type to first call_indirect asmstr (and maybe $flags)
let Defs = [ARGUMENTS] in {
// Call sequence markers. These have an immediate which represents the amount of
// stack space to allocate or free, which is used for varargs lowering.
let Uses = [SP32, SP64], Defs = [SP32, SP64], isCodeGenOnly = 1 in {
-def ADJCALLSTACKDOWN : I<(outs), (ins i32imm:$amt, i32imm:$amt2),
- [(WebAssemblycallseq_start timm:$amt, timm:$amt2)]>;
-def ADJCALLSTACKUP : I<(outs), (ins i32imm:$amt, i32imm:$amt2),
- [(WebAssemblycallseq_end timm:$amt, timm:$amt2)]>;
+defm ADJCALLSTACKDOWN : NRI<(outs), (ins i32imm:$amt, i32imm:$amt2),
+ [(WebAssemblycallseq_start timm:$amt, timm:$amt2)]>;
+defm ADJCALLSTACKUP : NRI<(outs), (ins i32imm:$amt, i32imm:$amt2),
+ [(WebAssemblycallseq_end timm:$amt, timm:$amt2)]>;
} // isCodeGenOnly = 1
multiclass CALL<WebAssemblyRegClass vt, string prefix> {
- def CALL_#vt : I<(outs vt:$dst), (ins function32_op:$callee, variable_ops),
- [(set vt:$dst, (WebAssemblycall1 (i32 imm:$callee)))],
- !strconcat(prefix, "call\t$dst, $callee"),
- 0x10>;
+ defm CALL_#vt : I<(outs vt:$dst), (ins function32_op:$callee, variable_ops),
+ (outs), (ins function32_op:$callee),
+ [(set vt:$dst, (WebAssemblycall1 (i32 imm:$callee)))],
+ !strconcat(prefix, "call\t$dst, $callee"),
+ !strconcat(prefix, "call\t$callee"),
+ 0x10>;
let isCodeGenOnly = 1 in {
- def PCALL_INDIRECT_#vt : I<(outs vt:$dst), (ins I32:$callee, variable_ops),
- [(set vt:$dst, (WebAssemblycall1 I32:$callee))],
- "PSEUDO CALL INDIRECT\t$callee">;
+ defm PCALL_INDIRECT_#vt : I<(outs vt:$dst), (ins I32:$callee, variable_ops),
+ (outs), (ins I32:$callee),
+ [(set vt:$dst, (WebAssemblycall1 I32:$callee))],
+ "PSEUDO CALL INDIRECT\t$callee",
+ "PSEUDO CALL INDIRECT\t$callee">;
} // isCodeGenOnly = 1
- def CALL_INDIRECT_#vt : I<(outs vt:$dst),
- (ins TypeIndex:$type, i32imm:$flags, variable_ops),
- [],
- !strconcat(prefix, "call_indirect\t$dst"),
- 0x11>;
+ defm CALL_INDIRECT_#vt : I<(outs vt:$dst),
+ (ins TypeIndex:$type, i32imm:$flags, variable_ops),
+ (outs), (ins TypeIndex:$type, i32imm:$flags),
+ [],
+ !strconcat(prefix, "call_indirect\t$dst"),
+ !strconcat(prefix, "call_indirect\t$type"),
+ 0x11>;
}
multiclass SIMD_CALL<ValueType vt, string prefix> {
- def CALL_#vt : SIMD_I<(outs V128:$dst), (ins function32_op:$callee, variable_ops),
+ defm CALL_#vt : SIMD_I<(outs V128:$dst), (ins function32_op:$callee,
+ variable_ops),
+ (outs), (ins function32_op:$callee),
[(set (vt V128:$dst),
- (WebAssemblycall1 (i32 imm:$callee)))],
+ (WebAssemblycall1 (i32 imm:$callee)))],
!strconcat(prefix, "call\t$dst, $callee"),
+ !strconcat(prefix, "call\t$callee"),
0x10>;
let isCodeGenOnly = 1 in {
- def PCALL_INDIRECT_#vt : SIMD_I<(outs V128:$dst),
- (ins I32:$callee, variable_ops),
- [(set (vt V128:$dst),
- (WebAssemblycall1 I32:$callee))],
- "PSEUDO CALL INDIRECT\t$callee">;
+ defm PCALL_INDIRECT_#vt : SIMD_I<(outs V128:$dst),
+ (ins I32:$callee, variable_ops),
+ (outs), (ins I32:$callee),
+ [(set (vt V128:$dst),
+ (WebAssemblycall1 I32:$callee))],
+ "PSEUDO CALL INDIRECT\t$callee",
+ "PSEUDO CALL INDIRECT\t$callee">;
} // isCodeGenOnly = 1
- def CALL_INDIRECT_#vt : SIMD_I<(outs V128:$dst),
+ defm CALL_INDIRECT_#vt : SIMD_I<(outs V128:$dst),
(ins TypeIndex:$type, i32imm:$flags,
- variable_ops),
+ variable_ops),
+ (outs), (ins TypeIndex:$type, i32imm:$flags),
[],
- !strconcat(prefix, "call_indirect\t$dst"),
+ !strconcat(prefix,
+ "call_indirect\t$dst"),
+ !strconcat(prefix, "call_indirect\t$type"),
0x11>;
}
let Uses = [SP32, SP64], isCall = 1 in {
- defm : CALL<I32, "i32.">;
- defm : CALL<I64, "i64.">;
- defm : CALL<F32, "f32.">;
- defm : CALL<F64, "f64.">;
- defm : SIMD_CALL<v16i8, "i8x16.">;
- defm : SIMD_CALL<v8i16, "i16x8.">;
- defm : SIMD_CALL<v4i32, "i32x4.">;
- defm : SIMD_CALL<v4f32, "f32x4.">;
-
- def CALL_VOID : I<(outs), (ins function32_op:$callee, variable_ops),
- [(WebAssemblycall0 (i32 imm:$callee))],
- "call \t$callee", 0x10>;
+ defm "" : CALL<I32, "i32.">;
+ defm "" : CALL<I64, "i64.">;
+ defm "" : CALL<F32, "f32.">;
+ defm "" : CALL<F64, "f64.">;
+ defm "" : CALL<EXCEPT_REF, "except_ref.">;
+ defm "" : SIMD_CALL<v16i8, "i8x16.">;
+ defm "" : SIMD_CALL<v8i16, "i16x8.">;
+ defm "" : SIMD_CALL<v4i32, "i32x4.">;
+ defm "" : SIMD_CALL<v4f32, "f32x4.">;
+
+ defm CALL_VOID : I<(outs), (ins function32_op:$callee, variable_ops),
+ (outs), (ins function32_op:$callee),
+ [(WebAssemblycall0 (i32 imm:$callee))],
+ "call \t$callee", "call\t$callee", 0x10>;
let isCodeGenOnly = 1 in {
- def PCALL_INDIRECT_VOID : I<(outs), (ins I32:$callee, variable_ops),
- [(WebAssemblycall0 I32:$callee)],
- "PSEUDO CALL INDIRECT\t$callee">;
+ defm PCALL_INDIRECT_VOID : I<(outs), (ins I32:$callee, variable_ops),
+ (outs), (ins I32:$callee),
+ [(WebAssemblycall0 I32:$callee)],
+ "PSEUDO CALL INDIRECT\t$callee",
+ "PSEUDO CALL INDIRECT\t$callee">;
} // isCodeGenOnly = 1
- def CALL_INDIRECT_VOID : I<(outs),
- (ins TypeIndex:$type, i32imm:$flags, variable_ops),
- [],
- "call_indirect\t", 0x11>;
+ defm CALL_INDIRECT_VOID : I<(outs),
+ (ins TypeIndex:$type, i32imm:$flags,
+ variable_ops),
+ (outs), (ins TypeIndex:$type, i32imm:$flags),
+ [],
+ "call_indirect\t", "call_indirect\t$type",
+ 0x11>;
} // Uses = [SP32,SP64], isCall = 1
} // Defs = [ARGUMENTS]
@@ -112,6 +134,9 @@ def : Pat<(v4i32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
(CALL_v4i32 tglobaladdr:$callee)>, Requires<[HasSIMD128]>;
def : Pat<(v4f32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
(CALL_v4f32 tglobaladdr:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(ExceptRef
+ (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
+ (CALL_EXCEPT_REF tglobaladdr:$callee)>;
def : Pat<(WebAssemblycall0 (WebAssemblywrapper tglobaladdr:$callee)),
(CALL_VOID tglobaladdr:$callee)>;
@@ -132,5 +157,8 @@ def : Pat<(v4i32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
(CALL_v4i32 texternalsym:$callee)>, Requires<[HasSIMD128]>;
def : Pat<(v4f32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
(CALL_v4f32 texternalsym:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(ExceptRef
+ (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
+ (CALL_EXCEPT_REF texternalsym:$callee)>;
def : Pat<(WebAssemblycall0 (WebAssemblywrapper texternalsym:$callee)),
(CALL_VOID texternalsym:$callee)>;
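For orientation, a sketch derived mechanically from the CALL multiclass above (not lines from the patch): the new defm "" : CALL<EXCEPT_REF, "except_ref."> instantiation produces a CALL_EXCEPT_REF register/stack pair along the lines of

  defm CALL_EXCEPT_REF : I<(outs EXCEPT_REF:$dst),
                           (ins function32_op:$callee, variable_ops),
                           (outs), (ins function32_op:$callee),
                           [(set EXCEPT_REF:$dst,
                                 (WebAssemblycall1 (i32 imm:$callee)))],
                           "except_ref.call\t$dst, $callee",
                           "except_ref.call\t$callee", 0x10>;

which is what the CALL_EXCEPT_REF patterns above select for direct calls returning an except_ref.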
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
index 129794171464..d90244b90662 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrControl.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief WebAssembly control-flow code-gen constructs.
+/// WebAssembly control-flow code-gen constructs.
///
//===----------------------------------------------------------------------===//
@@ -16,15 +16,17 @@ let Defs = [ARGUMENTS] in {
let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in {
// The condition operand is a boolean value which WebAssembly represents as i32.
-def BR_IF : I<(outs), (ins bb_op:$dst, I32:$cond),
- [(brcond I32:$cond, bb:$dst)],
- "br_if \t$dst, $cond", 0x0d>;
+defm BR_IF : I<(outs), (ins bb_op:$dst, I32:$cond),
+ (outs), (ins bb_op:$dst),
+ [(brcond I32:$cond, bb:$dst)],
+ "br_if \t$dst, $cond", "br_if \t$dst", 0x0d>;
let isCodeGenOnly = 1 in
-def BR_UNLESS : I<(outs), (ins bb_op:$dst, I32:$cond), []>;
+defm BR_UNLESS : I<(outs), (ins bb_op:$dst, I32:$cond),
+ (outs), (ins bb_op:$dst), []>;
let isBarrier = 1 in {
-def BR : I<(outs), (ins bb_op:$dst),
- [(br bb:$dst)],
- "br \t$dst", 0x0c>;
+defm BR : NRI<(outs), (ins bb_op:$dst),
+ [(br bb:$dst)],
+ "br \t$dst", 0x0c>;
} // isBarrier = 1
} // isBranch = 1, isTerminator = 1, hasCtrlDep = 1
@@ -42,92 +44,151 @@ let Defs = [ARGUMENTS] in {
// currently.
// Set TSFlags{0} to 1 to indicate that the variable_ops are immediates.
// Set TSFlags{1} to 1 to indicate that the immediates represent labels.
+// FIXME: this can't inherit from I<> since there is no way to inherit from a
+// multiclass and still have the let statements.
let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
-def BR_TABLE_I32 : I<(outs), (ins I32:$index, variable_ops),
- [(WebAssemblybr_table I32:$index)],
- "br_table \t$index", 0x0e> {
+def BR_TABLE_I32 : NI<(outs), (ins I32:$index, variable_ops),
+ [(WebAssemblybr_table I32:$index)], 0,
+ "br_table \t$index", 0x0e> {
let TSFlags{0} = 1;
let TSFlags{1} = 1;
}
-def BR_TABLE_I64 : I<(outs), (ins I64:$index, variable_ops),
- [(WebAssemblybr_table I64:$index)],
- "br_table \t$index"> {
+def BR_TABLE_I32_S : NI<(outs), (ins I32:$index),
+ [], 1,
+ "br_table \t$index", 0x0e> {
+ let TSFlags{0} = 1;
+ let TSFlags{1} = 1;
+}
+def BR_TABLE_I64 : NI<(outs), (ins I64:$index, variable_ops),
+ [(WebAssemblybr_table I64:$index)], 0,
+ "br_table \t$index"> {
+ let TSFlags{0} = 1;
+ let TSFlags{1} = 1;
+}
+def BR_TABLE_I64_S : NI<(outs), (ins I64:$index),
+ [], 1,
+ "br_table \t$index"> {
let TSFlags{0} = 1;
let TSFlags{1} = 1;
}
} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
-// Placemarkers to indicate the start or end of a block, loop, or try scope.
+// This is technically a control-flow instruction, since all it affects is the
+// IP.
+defm NOP : NRI<(outs), (ins), [], "nop", 0x01>;
+
+// Placemarkers to indicate the start or end of a block or loop scope.
// These use/clobber VALUE_STACK to prevent them from being moved into the
// middle of an expression tree.
let Uses = [VALUE_STACK], Defs = [VALUE_STACK] in {
-def BLOCK : I<(outs), (ins Signature:$sig), [], "block \t$sig", 0x02>;
-def LOOP : I<(outs), (ins Signature:$sig), [], "loop \t$sig", 0x03>;
-def TRY : I<(outs), (ins Signature:$sig), [], "try \t$sig", 0x06>;
-
-// END_BLOCK, END_LOOP, END_TRY, and END_FUNCTION are represented with the same
-// opcode in wasm.
-def END_BLOCK : I<(outs), (ins), [], "end_block", 0x0b>;
-def END_LOOP : I<(outs), (ins), [], "end_loop", 0x0b>;
-def END_TRY : I<(outs), (ins), [], "end_try", 0x0b>;
+defm BLOCK : NRI<(outs), (ins Signature:$sig), [], "block \t$sig", 0x02>;
+defm LOOP : NRI<(outs), (ins Signature:$sig), [], "loop \t$sig", 0x03>;
+
+// END_BLOCK, END_LOOP, and END_FUNCTION are represented with the same opcode in
+// wasm.
+defm END_BLOCK : NRI<(outs), (ins), [], "end_block", 0x0b>;
+defm END_LOOP : NRI<(outs), (ins), [], "end_loop", 0x0b>;
let isTerminator = 1, isBarrier = 1 in
-def END_FUNCTION : I<(outs), (ins), [], "end_function", 0x0b>;
+defm END_FUNCTION : NRI<(outs), (ins), [], "end_function", 0x0b>;
} // Uses = [VALUE_STACK], Defs = [VALUE_STACK]
multiclass RETURN<WebAssemblyRegClass vt> {
- def RETURN_#vt : I<(outs), (ins vt:$val), [(WebAssemblyreturn vt:$val)],
- "return \t$val", 0x0f>;
+ defm RETURN_#vt : I<(outs), (ins vt:$val), (outs), (ins),
+ [(WebAssemblyreturn vt:$val)],
+ "return \t$val", "return", 0x0f>;
// Equivalent to RETURN_#vt, for use at the end of a function when wasm
// semantics return by falling off the end of the block.
let isCodeGenOnly = 1 in
- def FALLTHROUGH_RETURN_#vt : I<(outs), (ins vt:$val), []>;
+ defm FALLTHROUGH_RETURN_#vt : I<(outs), (ins vt:$val), (outs), (ins), []>;
}
multiclass SIMD_RETURN<ValueType vt> {
- def RETURN_#vt : SIMD_I<(outs), (ins V128:$val),
- [(WebAssemblyreturn (vt V128:$val))],
- "return \t$val", 0x0f>;
+ defm RETURN_#vt : SIMD_I<(outs), (ins V128:$val), (outs), (ins),
+ [(WebAssemblyreturn (vt V128:$val))],
+ "return \t$val", "return", 0x0f>;
// Equivalent to RETURN_#vt, for use at the end of a function when wasm
// semantics return by falling off the end of the block.
let isCodeGenOnly = 1 in
- def FALLTHROUGH_RETURN_#vt : SIMD_I<(outs), (ins V128:$val), []>;
+ defm FALLTHROUGH_RETURN_#vt : SIMD_I<(outs), (ins V128:$val), (outs), (ins),
+ []>;
}
let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
let isReturn = 1 in {
- defm : RETURN<I32>;
- defm : RETURN<I64>;
- defm : RETURN<F32>;
- defm : RETURN<F64>;
- defm : SIMD_RETURN<v16i8>;
- defm : SIMD_RETURN<v8i16>;
- defm : SIMD_RETURN<v4i32>;
- defm : SIMD_RETURN<v4f32>;
-
- def RETURN_VOID : I<(outs), (ins), [(WebAssemblyreturn)], "return", 0x0f>;
+ defm "": RETURN<I32>;
+ defm "": RETURN<I64>;
+ defm "": RETURN<F32>;
+ defm "": RETURN<F64>;
+ defm "": RETURN<EXCEPT_REF>;
+ defm "": SIMD_RETURN<v16i8>;
+ defm "": SIMD_RETURN<v8i16>;
+ defm "": SIMD_RETURN<v4i32>;
+ defm "": SIMD_RETURN<v4f32>;
+
+ defm RETURN_VOID : NRI<(outs), (ins), [(WebAssemblyreturn)], "return", 0x0f>;
// This is to RETURN_VOID what FALLTHROUGH_RETURN_#vt is to RETURN_#vt.
let isCodeGenOnly = 1 in
- def FALLTHROUGH_RETURN_VOID : I<(outs), (ins), []>;
+ defm FALLTHROUGH_RETURN_VOID : NRI<(outs), (ins), []>;
} // isReturn = 1
-def UNREACHABLE : I<(outs), (ins), [(trap)], "unreachable", 0x00>;
+defm UNREACHABLE : NRI<(outs), (ins), [(trap)], "unreachable", 0x00>;
+} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
+
+//===----------------------------------------------------------------------===//
+// Exception handling instructions
+//===----------------------------------------------------------------------===//
-def THROW_I32 : I<(outs), (ins i32imm:$tag, I32:$obj),
- [(int_wasm_throw imm:$tag, I32:$obj)], "throw \t$tag, $obj",
- 0x08>;
-def THROW_I64 : I<(outs), (ins i32imm:$tag, I64:$obj),
- [(int_wasm_throw imm:$tag, I64:$obj)], "throw \t$tag, $obj",
- 0x08>;
-def RETHROW : I<(outs), (ins i32imm:$rel_depth), [], "rethrow \t$rel_depth",
- 0x09>;
+let Predicates = [HasExceptionHandling] in {
+// Throwing an exception: throw / rethrow
+let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
+defm THROW_I32 : I<(outs), (ins i32imm:$tag, I32:$val),
+ (outs), (ins i32imm:$tag),
+ [(int_wasm_throw imm:$tag, I32:$val)],
+ "throw \t$tag, $val", "throw \t$tag",
+ 0x08>;
+defm THROW_I64 : I<(outs), (ins i32imm:$tag, I64:$val),
+ (outs), (ins i32imm:$tag),
+ [(int_wasm_throw imm:$tag, I64:$val)],
+ "throw \t$tag, $val", "throw \t$tag",
+ 0x08>;
+defm RETHROW : NRI<(outs), (ins bb_op:$dst), [], "rethrow \t$dst", 0x09>;
+let isCodeGenOnly = 1 in
+// This is used when the destination for rethrow is the caller function. This
+// will be converted to a rethrow in CFGStackify.
+defm RETHROW_TO_CALLER : NRI<(outs), (ins), [], "rethrow">;
} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
-} // Defs = [ARGUMENTS]
+// Region within which an exception is caught: try / end_try
+let Uses = [VALUE_STACK], Defs = [VALUE_STACK] in {
+defm TRY : NRI<(outs), (ins Signature:$sig), [], "try \t$sig", 0x06>;
+defm END_TRY : NRI<(outs), (ins), [], "end_try", 0x0b>;
+} // Uses = [VALUE_STACK], Defs = [VALUE_STACK]
-// rethrow takes a relative depth as an argument, for which currently only 0 is
-// possible for C++. Once other languages need depths other than 0, depths will
-// be computed in CFGStackify.
-def : Pat<(int_wasm_rethrow), (RETHROW 0)>;
+// Catching an exception: catch / catch_all
+let hasCtrlDep = 1 in {
+defm CATCH_I32 : I<(outs I32:$dst), (ins i32imm:$tag),
+ (outs), (ins i32imm:$tag),
+ [(set I32:$dst, (int_wasm_catch imm:$tag))],
+ "i32.catch \t$dst, $tag", "i32.catch \t$tag", 0x07>;
+defm CATCH_I64 : I<(outs I64:$dst), (ins i32imm:$tag),
+ (outs), (ins i32imm:$tag),
+ [(set I64:$dst, (int_wasm_catch imm:$tag))],
+ "i64.catch \t$dst, $tag", "i64.catch \t$tag", 0x07>;
+defm CATCH_ALL : NRI<(outs), (ins), [], "catch_all", 0x05>;
+}
+
+// Pseudo instructions: cleanupret / catchret
+// They are not return instructions in wasm, but setting 'isReturn' to true as
+// in X86 is necessary for computing EH scope membership.
+let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
+ isCodeGenOnly = 1, isReturn = 1 in {
+ defm CLEANUPRET : NRI<(outs), (ins), [(cleanupret)], "", 0>;
+ defm CATCHRET : NRI<(outs), (ins bb_op:$dst, bb_op:$from),
+ [(catchret bb:$dst, bb:$from)], "", 0>;
+}
+}
+
+} // Defs = [ARGUMENTS]
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrConv.td b/lib/Target/WebAssembly/WebAssemblyInstrConv.td
index 426c2c802172..c89c1b549816 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrConv.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrConv.td
@@ -8,41 +8,48 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief WebAssembly datatype conversions, truncations, reinterpretations,
+/// WebAssembly datatype conversions, truncations, reinterpretations,
/// promotions, and demotions operand code-gen constructs.
///
//===----------------------------------------------------------------------===//
let Defs = [ARGUMENTS] in {
-def I32_WRAP_I64 : I<(outs I32:$dst), (ins I64:$src),
+defm I32_WRAP_I64 : I<(outs I32:$dst), (ins I64:$src), (outs), (ins),
[(set I32:$dst, (trunc I64:$src))],
- "i32.wrap/i64\t$dst, $src", 0xa7>;
+ "i32.wrap/i64\t$dst, $src", "i32.wrap/i64", 0xa7>;
-def I64_EXTEND_S_I32 : I<(outs I64:$dst), (ins I32:$src),
+defm I64_EXTEND_S_I32 : I<(outs I64:$dst), (ins I32:$src), (outs), (ins),
[(set I64:$dst, (sext I32:$src))],
- "i64.extend_s/i32\t$dst, $src", 0xac>;
-def I64_EXTEND_U_I32 : I<(outs I64:$dst), (ins I32:$src),
- [(set I64:$dst, (zext I32:$src))],
- "i64.extend_u/i32\t$dst, $src", 0xad>;
+ "i64.extend_s/i32\t$dst, $src", "i64.extend_s/i32",
+ 0xac>;
+defm I64_EXTEND_U_I32 : I<(outs I64:$dst), (ins I32:$src), (outs), (ins),
+ [(set I64:$dst, (zext I32:$src))],
+ "i64.extend_u/i32\t$dst, $src", "i64.extend_u/i32",
+ 0xad>;
-let Predicates = [HasAtomics] in {
-def I32_EXTEND8_S_I32 : I<(outs I32:$dst), (ins I32:$src),
- [(set I32:$dst, (sext_inreg I32:$src, i8))],
- "i32.extend8_s\t$dst, $src", 0xc0>;
-def I32_EXTEND16_S_I32 : I<(outs I32:$dst), (ins I32:$src),
- [(set I32:$dst, (sext_inreg I32:$src, i16))],
- "i32.extend16_s\t$dst, $src", 0xc1>;
-def I64_EXTEND8_S_I64 : I<(outs I64:$dst), (ins I64:$src),
- [(set I64:$dst, (sext_inreg I64:$src, i8))],
- "i64.extend8_s\t$dst, $src", 0xc2>;
-def I64_EXTEND16_S_I64 : I<(outs I64:$dst), (ins I64:$src),
- [(set I64:$dst, (sext_inreg I64:$src, i16))],
- "i64.extend16_s\t$dst, $src", 0xc3>;
-def I64_EXTEND32_S_I64 : I<(outs I64:$dst), (ins I64:$src),
- [(set I64:$dst, (sext_inreg I64:$src, i32))],
- "i64.extend32_s\t$dst, $src", 0xc4>;
-} // Predicates = [HasAtomics]
+let Predicates = [HasSignExt] in {
+defm I32_EXTEND8_S_I32 : I<(outs I32:$dst), (ins I32:$src), (outs), (ins),
+ [(set I32:$dst, (sext_inreg I32:$src, i8))],
+ "i32.extend8_s\t$dst, $src", "i32.extend8_s",
+ 0xc0>;
+defm I32_EXTEND16_S_I32 : I<(outs I32:$dst), (ins I32:$src), (outs), (ins),
+ [(set I32:$dst, (sext_inreg I32:$src, i16))],
+ "i32.extend16_s\t$dst, $src", "i32.extend16_s",
+ 0xc1>;
+defm I64_EXTEND8_S_I64 : I<(outs I64:$dst), (ins I64:$src), (outs), (ins),
+ [(set I64:$dst, (sext_inreg I64:$src, i8))],
+ "i64.extend8_s\t$dst, $src", "i64.extend8_s",
+ 0xc2>;
+defm I64_EXTEND16_S_I64 : I<(outs I64:$dst), (ins I64:$src), (outs), (ins),
+ [(set I64:$dst, (sext_inreg I64:$src, i16))],
+ "i64.extend16_s\t$dst, $src", "i64.extend16_s",
+ 0xc3>;
+defm I64_EXTEND32_S_I64 : I<(outs I64:$dst), (ins I64:$src), (outs), (ins),
+ [(set I64:$dst, (sext_inreg I64:$src, i32))],
+ "i64.extend32_s\t$dst, $src", "i64.extend32_s",
+ 0xc4>;
+} // Predicates = [HasSignExt]
} // defs = [ARGUMENTS]
@@ -55,131 +62,161 @@ let Defs = [ARGUMENTS] in {
// Conversion from floating point to integer instructions which don't trap on
// overflow or invalid.
-def I32_TRUNC_S_SAT_F32 : I<(outs I32:$dst), (ins F32:$src),
- [(set I32:$dst, (fp_to_sint F32:$src))],
- "i32.trunc_s:sat/f32\t$dst, $src", 0xfc00>,
- Requires<[HasNontrappingFPToInt]>;
-def I32_TRUNC_U_SAT_F32 : I<(outs I32:$dst), (ins F32:$src),
- [(set I32:$dst, (fp_to_uint F32:$src))],
- "i32.trunc_u:sat/f32\t$dst, $src", 0xfc01>,
- Requires<[HasNontrappingFPToInt]>;
-def I64_TRUNC_S_SAT_F32 : I<(outs I64:$dst), (ins F32:$src),
- [(set I64:$dst, (fp_to_sint F32:$src))],
- "i64.trunc_s:sat/f32\t$dst, $src", 0xfc04>,
- Requires<[HasNontrappingFPToInt]>;
-def I64_TRUNC_U_SAT_F32 : I<(outs I64:$dst), (ins F32:$src),
- [(set I64:$dst, (fp_to_uint F32:$src))],
- "i64.trunc_u:sat/f32\t$dst, $src", 0xfc05>,
- Requires<[HasNontrappingFPToInt]>;
-def I32_TRUNC_S_SAT_F64 : I<(outs I32:$dst), (ins F64:$src),
- [(set I32:$dst, (fp_to_sint F64:$src))],
- "i32.trunc_s:sat/f64\t$dst, $src", 0xfc02>,
- Requires<[HasNontrappingFPToInt]>;
-def I32_TRUNC_U_SAT_F64 : I<(outs I32:$dst), (ins F64:$src),
- [(set I32:$dst, (fp_to_uint F64:$src))],
- "i32.trunc_u:sat/f64\t$dst, $src", 0xfc03>,
- Requires<[HasNontrappingFPToInt]>;
-def I64_TRUNC_S_SAT_F64 : I<(outs I64:$dst), (ins F64:$src),
- [(set I64:$dst, (fp_to_sint F64:$src))],
- "i64.trunc_s:sat/f64\t$dst, $src", 0xfc06>,
- Requires<[HasNontrappingFPToInt]>;
-def I64_TRUNC_U_SAT_F64 : I<(outs I64:$dst), (ins F64:$src),
- [(set I64:$dst, (fp_to_uint F64:$src))],
- "i64.trunc_u:sat/f64\t$dst, $src", 0xfc07>,
- Requires<[HasNontrappingFPToInt]>;
+defm I32_TRUNC_S_SAT_F32 : I<(outs I32:$dst), (ins F32:$src), (outs), (ins),
+ [(set I32:$dst, (fp_to_sint F32:$src))],
+ "i32.trunc_s:sat/f32\t$dst, $src",
+ "i32.trunc_s:sat/f32", 0xfc00>,
+ Requires<[HasNontrappingFPToInt]>;
+defm I32_TRUNC_U_SAT_F32 : I<(outs I32:$dst), (ins F32:$src), (outs), (ins),
+ [(set I32:$dst, (fp_to_uint F32:$src))],
+ "i32.trunc_u:sat/f32\t$dst, $src",
+ "i32.trunc_u:sat/f32", 0xfc01>,
+ Requires<[HasNontrappingFPToInt]>;
+defm I64_TRUNC_S_SAT_F32 : I<(outs I64:$dst), (ins F32:$src), (outs), (ins),
+ [(set I64:$dst, (fp_to_sint F32:$src))],
+ "i64.trunc_s:sat/f32\t$dst, $src",
+ "i64.trunc_s:sat/f32", 0xfc04>,
+ Requires<[HasNontrappingFPToInt]>;
+defm I64_TRUNC_U_SAT_F32 : I<(outs I64:$dst), (ins F32:$src), (outs), (ins),
+ [(set I64:$dst, (fp_to_uint F32:$src))],
+ "i64.trunc_u:sat/f32\t$dst, $src",
+ "i64.trunc_u:sat/f32", 0xfc05>,
+ Requires<[HasNontrappingFPToInt]>;
+defm I32_TRUNC_S_SAT_F64 : I<(outs I32:$dst), (ins F64:$src), (outs), (ins),
+ [(set I32:$dst, (fp_to_sint F64:$src))],
+ "i32.trunc_s:sat/f64\t$dst, $src",
+ "i32.trunc_s:sat/f64", 0xfc02>,
+ Requires<[HasNontrappingFPToInt]>;
+defm I32_TRUNC_U_SAT_F64 : I<(outs I32:$dst), (ins F64:$src), (outs), (ins),
+ [(set I32:$dst, (fp_to_uint F64:$src))],
+ "i32.trunc_u:sat/f64\t$dst, $src",
+ "i32.trunc_u:sat/f64", 0xfc03>,
+ Requires<[HasNontrappingFPToInt]>;
+defm I64_TRUNC_S_SAT_F64 : I<(outs I64:$dst), (ins F64:$src), (outs), (ins),
+ [(set I64:$dst, (fp_to_sint F64:$src))],
+ "i64.trunc_s:sat/f64\t$dst, $src",
+ "i64.trunc_s:sat/f64", 0xfc06>,
+ Requires<[HasNontrappingFPToInt]>;
+defm I64_TRUNC_U_SAT_F64 : I<(outs I64:$dst), (ins F64:$src), (outs), (ins),
+ [(set I64:$dst, (fp_to_uint F64:$src))],
+ "i64.trunc_u:sat/f64\t$dst, $src",
+ "i64.trunc_u:sat/f64", 0xfc07>,
+ Requires<[HasNontrappingFPToInt]>;
// Conversion from floating point to integer pseudo-instructions which don't
// trap on overflow or invalid.
let usesCustomInserter = 1, isCodeGenOnly = 1 in {
-def FP_TO_SINT_I32_F32 : I<(outs I32:$dst), (ins F32:$src),
- [(set I32:$dst, (fp_to_sint F32:$src))], "", 0>,
- Requires<[NotHasNontrappingFPToInt]>;
-def FP_TO_UINT_I32_F32 : I<(outs I32:$dst), (ins F32:$src),
- [(set I32:$dst, (fp_to_uint F32:$src))], "", 0>,
- Requires<[NotHasNontrappingFPToInt]>;
-def FP_TO_SINT_I64_F32 : I<(outs I64:$dst), (ins F32:$src),
- [(set I64:$dst, (fp_to_sint F32:$src))], "", 0>,
- Requires<[NotHasNontrappingFPToInt]>;
-def FP_TO_UINT_I64_F32 : I<(outs I64:$dst), (ins F32:$src),
- [(set I64:$dst, (fp_to_uint F32:$src))], "", 0>,
- Requires<[NotHasNontrappingFPToInt]>;
-def FP_TO_SINT_I32_F64 : I<(outs I32:$dst), (ins F64:$src),
- [(set I32:$dst, (fp_to_sint F64:$src))], "", 0>,
- Requires<[NotHasNontrappingFPToInt]>;
-def FP_TO_UINT_I32_F64 : I<(outs I32:$dst), (ins F64:$src),
- [(set I32:$dst, (fp_to_uint F64:$src))], "", 0>,
- Requires<[NotHasNontrappingFPToInt]>;
-def FP_TO_SINT_I64_F64 : I<(outs I64:$dst), (ins F64:$src),
- [(set I64:$dst, (fp_to_sint F64:$src))], "", 0>,
- Requires<[NotHasNontrappingFPToInt]>;
-def FP_TO_UINT_I64_F64 : I<(outs I64:$dst), (ins F64:$src),
- [(set I64:$dst, (fp_to_uint F64:$src))], "", 0>,
- Requires<[NotHasNontrappingFPToInt]>;
+defm FP_TO_SINT_I32_F32 : I<(outs I32:$dst), (ins F32:$src), (outs), (ins),
+ [(set I32:$dst, (fp_to_sint F32:$src))], "", "", 0>,
+ Requires<[NotHasNontrappingFPToInt]>;
+defm FP_TO_UINT_I32_F32 : I<(outs I32:$dst), (ins F32:$src), (outs), (ins),
+ [(set I32:$dst, (fp_to_uint F32:$src))], "", "", 0>,
+ Requires<[NotHasNontrappingFPToInt]>;
+defm FP_TO_SINT_I64_F32 : I<(outs I64:$dst), (ins F32:$src), (outs), (ins),
+ [(set I64:$dst, (fp_to_sint F32:$src))], "", "", 0>,
+ Requires<[NotHasNontrappingFPToInt]>;
+defm FP_TO_UINT_I64_F32 : I<(outs I64:$dst), (ins F32:$src), (outs), (ins),
+ [(set I64:$dst, (fp_to_uint F32:$src))], "", "", 0>,
+ Requires<[NotHasNontrappingFPToInt]>;
+defm FP_TO_SINT_I32_F64 : I<(outs I32:$dst), (ins F64:$src), (outs), (ins),
+ [(set I32:$dst, (fp_to_sint F64:$src))], "", "", 0>,
+ Requires<[NotHasNontrappingFPToInt]>;
+defm FP_TO_UINT_I32_F64 : I<(outs I32:$dst), (ins F64:$src), (outs), (ins),
+ [(set I32:$dst, (fp_to_uint F64:$src))], "", "", 0>,
+ Requires<[NotHasNontrappingFPToInt]>;
+defm FP_TO_SINT_I64_F64 : I<(outs I64:$dst), (ins F64:$src), (outs), (ins),
+ [(set I64:$dst, (fp_to_sint F64:$src))], "", "", 0>,
+ Requires<[NotHasNontrappingFPToInt]>;
+defm FP_TO_UINT_I64_F64 : I<(outs I64:$dst), (ins F64:$src), (outs), (ins),
+ [(set I64:$dst, (fp_to_uint F64:$src))], "", "", 0>,
+ Requires<[NotHasNontrappingFPToInt]>;
} // usesCustomInserter, isCodeGenOnly = 1
// Conversion from floating point to integer traps on overflow and invalid.
let hasSideEffects = 1 in {
-def I32_TRUNC_S_F32 : I<(outs I32:$dst), (ins F32:$src),
- [], "i32.trunc_s/f32\t$dst, $src", 0xa8>;
-def I32_TRUNC_U_F32 : I<(outs I32:$dst), (ins F32:$src),
- [], "i32.trunc_u/f32\t$dst, $src", 0xa9>;
-def I64_TRUNC_S_F32 : I<(outs I64:$dst), (ins F32:$src),
- [], "i64.trunc_s/f32\t$dst, $src", 0xae>;
-def I64_TRUNC_U_F32 : I<(outs I64:$dst), (ins F32:$src),
- [], "i64.trunc_u/f32\t$dst, $src", 0xaf>;
-def I32_TRUNC_S_F64 : I<(outs I32:$dst), (ins F64:$src),
- [], "i32.trunc_s/f64\t$dst, $src", 0xaa>;
-def I32_TRUNC_U_F64 : I<(outs I32:$dst), (ins F64:$src),
- [], "i32.trunc_u/f64\t$dst, $src", 0xab>;
-def I64_TRUNC_S_F64 : I<(outs I64:$dst), (ins F64:$src),
- [], "i64.trunc_s/f64\t$dst, $src", 0xb0>;
-def I64_TRUNC_U_F64 : I<(outs I64:$dst), (ins F64:$src),
- [], "i64.trunc_u/f64\t$dst, $src", 0xb1>;
+defm I32_TRUNC_S_F32 : I<(outs I32:$dst), (ins F32:$src), (outs), (ins),
+ [], "i32.trunc_s/f32\t$dst, $src", "i32.trunc_s/f32",
+ 0xa8>;
+defm I32_TRUNC_U_F32 : I<(outs I32:$dst), (ins F32:$src), (outs), (ins),
+ [], "i32.trunc_u/f32\t$dst, $src", "i32.trunc_u/f32",
+ 0xa9>;
+defm I64_TRUNC_S_F32 : I<(outs I64:$dst), (ins F32:$src), (outs), (ins),
+ [], "i64.trunc_s/f32\t$dst, $src", "i64.trunc_s/f32",
+ 0xae>;
+defm I64_TRUNC_U_F32 : I<(outs I64:$dst), (ins F32:$src), (outs), (ins),
+ [], "i64.trunc_u/f32\t$dst, $src", "i64.trunc_u/f32",
+ 0xaf>;
+defm I32_TRUNC_S_F64 : I<(outs I32:$dst), (ins F64:$src), (outs), (ins),
+ [], "i32.trunc_s/f64\t$dst, $src", "i32.trunc_s/f64",
+ 0xaa>;
+defm I32_TRUNC_U_F64 : I<(outs I32:$dst), (ins F64:$src), (outs), (ins),
+ [], "i32.trunc_u/f64\t$dst, $src", "i32.trunc_u/f64",
+ 0xab>;
+defm I64_TRUNC_S_F64 : I<(outs I64:$dst), (ins F64:$src), (outs), (ins),
+ [], "i64.trunc_s/f64\t$dst, $src", "i64.trunc_s/f64",
+ 0xb0>;
+defm I64_TRUNC_U_F64 : I<(outs I64:$dst), (ins F64:$src), (outs), (ins),
+ [], "i64.trunc_u/f64\t$dst, $src", "i64.trunc_u/f64",
+ 0xb1>;
} // hasSideEffects = 1
-def F32_CONVERT_S_I32 : I<(outs F32:$dst), (ins I32:$src),
- [(set F32:$dst, (sint_to_fp I32:$src))],
- "f32.convert_s/i32\t$dst, $src", 0xb2>;
-def F32_CONVERT_U_I32 : I<(outs F32:$dst), (ins I32:$src),
- [(set F32:$dst, (uint_to_fp I32:$src))],
- "f32.convert_u/i32\t$dst, $src", 0xb3>;
-def F64_CONVERT_S_I32 : I<(outs F64:$dst), (ins I32:$src),
- [(set F64:$dst, (sint_to_fp I32:$src))],
- "f64.convert_s/i32\t$dst, $src", 0xb7>;
-def F64_CONVERT_U_I32 : I<(outs F64:$dst), (ins I32:$src),
- [(set F64:$dst, (uint_to_fp I32:$src))],
- "f64.convert_u/i32\t$dst, $src", 0xb8>;
-def F32_CONVERT_S_I64 : I<(outs F32:$dst), (ins I64:$src),
- [(set F32:$dst, (sint_to_fp I64:$src))],
- "f32.convert_s/i64\t$dst, $src", 0xb4>;
-def F32_CONVERT_U_I64 : I<(outs F32:$dst), (ins I64:$src),
- [(set F32:$dst, (uint_to_fp I64:$src))],
- "f32.convert_u/i64\t$dst, $src", 0xb5>;
-def F64_CONVERT_S_I64 : I<(outs F64:$dst), (ins I64:$src),
- [(set F64:$dst, (sint_to_fp I64:$src))],
- "f64.convert_s/i64\t$dst, $src", 0xb9>;
-def F64_CONVERT_U_I64 : I<(outs F64:$dst), (ins I64:$src),
- [(set F64:$dst, (uint_to_fp I64:$src))],
- "f64.convert_u/i64\t$dst, $src", 0xba>;
+defm F32_CONVERT_S_I32 : I<(outs F32:$dst), (ins I32:$src), (outs), (ins),
+ [(set F32:$dst, (sint_to_fp I32:$src))],
+ "f32.convert_s/i32\t$dst, $src", "f32.convert_s/i32",
+ 0xb2>;
+defm F32_CONVERT_U_I32 : I<(outs F32:$dst), (ins I32:$src), (outs), (ins),
+ [(set F32:$dst, (uint_to_fp I32:$src))],
+ "f32.convert_u/i32\t$dst, $src", "f32.convert_u/i32",
+ 0xb3>;
+defm F64_CONVERT_S_I32 : I<(outs F64:$dst), (ins I32:$src), (outs), (ins),
+ [(set F64:$dst, (sint_to_fp I32:$src))],
+ "f64.convert_s/i32\t$dst, $src", "f64.convert_s/i32",
+ 0xb7>;
+defm F64_CONVERT_U_I32 : I<(outs F64:$dst), (ins I32:$src), (outs), (ins),
+ [(set F64:$dst, (uint_to_fp I32:$src))],
+ "f64.convert_u/i32\t$dst, $src", "f64.convert_u/i32",
+ 0xb8>;
+defm F32_CONVERT_S_I64 : I<(outs F32:$dst), (ins I64:$src), (outs), (ins),
+ [(set F32:$dst, (sint_to_fp I64:$src))],
+ "f32.convert_s/i64\t$dst, $src", "f32.convert_s/i64",
+ 0xb4>;
+defm F32_CONVERT_U_I64 : I<(outs F32:$dst), (ins I64:$src), (outs), (ins),
+ [(set F32:$dst, (uint_to_fp I64:$src))],
+ "f32.convert_u/i64\t$dst, $src", "f32.convert_u/i64",
+ 0xb5>;
+defm F64_CONVERT_S_I64 : I<(outs F64:$dst), (ins I64:$src), (outs), (ins),
+ [(set F64:$dst, (sint_to_fp I64:$src))],
+ "f64.convert_s/i64\t$dst, $src", "f64.convert_s/i64",
+ 0xb9>;
+defm F64_CONVERT_U_I64 : I<(outs F64:$dst), (ins I64:$src), (outs), (ins),
+ [(set F64:$dst, (uint_to_fp I64:$src))],
+ "f64.convert_u/i64\t$dst, $src", "f64.convert_u/i64",
+ 0xba>;
-def F64_PROMOTE_F32 : I<(outs F64:$dst), (ins F32:$src),
- [(set F64:$dst, (fpextend F32:$src))],
- "f64.promote/f32\t$dst, $src", 0xbb>;
-def F32_DEMOTE_F64 : I<(outs F32:$dst), (ins F64:$src),
- [(set F32:$dst, (fpround F64:$src))],
- "f32.demote/f64\t$dst, $src", 0xb6>;
+defm F64_PROMOTE_F32 : I<(outs F64:$dst), (ins F32:$src), (outs), (ins),
+ [(set F64:$dst, (fpextend F32:$src))],
+ "f64.promote/f32\t$dst, $src", "f64.promote/f32",
+ 0xbb>;
+defm F32_DEMOTE_F64 : I<(outs F32:$dst), (ins F64:$src), (outs), (ins),
+ [(set F32:$dst, (fpround F64:$src))],
+ "f32.demote/f64\t$dst, $src", "f32.demote/f64",
+ 0xb6>;
-def I32_REINTERPRET_F32 : I<(outs I32:$dst), (ins F32:$src),
- [(set I32:$dst, (bitconvert F32:$src))],
- "i32.reinterpret/f32\t$dst, $src", 0xbc>;
-def F32_REINTERPRET_I32 : I<(outs F32:$dst), (ins I32:$src),
- [(set F32:$dst, (bitconvert I32:$src))],
- "f32.reinterpret/i32\t$dst, $src", 0xbe>;
-def I64_REINTERPRET_F64 : I<(outs I64:$dst), (ins F64:$src),
- [(set I64:$dst, (bitconvert F64:$src))],
- "i64.reinterpret/f64\t$dst, $src", 0xbd>;
-def F64_REINTERPRET_I64 : I<(outs F64:$dst), (ins I64:$src),
- [(set F64:$dst, (bitconvert I64:$src))],
- "f64.reinterpret/i64\t$dst, $src", 0xbf>;
+defm I32_REINTERPRET_F32 : I<(outs I32:$dst), (ins F32:$src), (outs), (ins),
+ [(set I32:$dst, (bitconvert F32:$src))],
+ "i32.reinterpret/f32\t$dst, $src",
+ "i32.reinterpret/f32", 0xbc>;
+defm F32_REINTERPRET_I32 : I<(outs F32:$dst), (ins I32:$src), (outs), (ins),
+ [(set F32:$dst, (bitconvert I32:$src))],
+ "f32.reinterpret/i32\t$dst, $src",
+ "f32.reinterpret/i32", 0xbe>;
+defm I64_REINTERPRET_F64 : I<(outs I64:$dst), (ins F64:$src), (outs), (ins),
+ [(set I64:$dst, (bitconvert F64:$src))],
+ "i64.reinterpret/f64\t$dst, $src",
+ "i64.reinterpret/f64", 0xbd>;
+defm F64_REINTERPRET_I64 : I<(outs F64:$dst), (ins I64:$src), (outs), (ins),
+ [(set F64:$dst, (bitconvert I64:$src))],
+ "f64.reinterpret/i64\t$dst, $src",
+ "f64.reinterpret/i64", 0xbf>;
} // Defs = [ARGUMENTS]
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td b/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td
new file mode 100644
index 000000000000..41b39f69e51c
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td
@@ -0,0 +1,31 @@
+// WebAssemblyInstrExceptRef.td-WebAssembly except_ref codegen --*- tablegen -*-
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// WebAssembly except_ref operand code-gen constructs.
+///
+//===----------------------------------------------------------------------===//
+
+let Defs = [ARGUMENTS] in {
+
+defm SELECT_EXCEPT_REF : I<(outs EXCEPT_REF:$dst),
+ (ins EXCEPT_REF:$lhs, EXCEPT_REF:$rhs, I32:$cond),
+ (outs), (ins),
+ [(set EXCEPT_REF:$dst,
+ (select I32:$cond, EXCEPT_REF:$lhs,
+ EXCEPT_REF:$rhs))],
+ "except_ref.select\t$dst, $lhs, $rhs, $cond",
+ "except_ref.select", 0x1b>;
+
+} // Defs = [ARGUMENTS]
+
+def : Pat<(select (i32 (setne I32:$cond, 0)), EXCEPT_REF:$lhs, EXCEPT_REF:$rhs),
+ (SELECT_EXCEPT_REF EXCEPT_REF:$lhs, EXCEPT_REF:$rhs, I32:$cond)>;
+def : Pat<(select (i32 (seteq I32:$cond, 0)), EXCEPT_REF:$lhs, EXCEPT_REF:$rhs),
+ (SELECT_EXCEPT_REF EXCEPT_REF:$rhs, EXCEPT_REF:$lhs, I32:$cond)>;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
index 03c9c1f8d5c0..8db75d38942b 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief WebAssembly Floating-point operand code-gen constructs.
+/// WebAssembly Floating-point operand code-gen constructs.
///
//===----------------------------------------------------------------------===//
@@ -77,12 +77,14 @@ def : Pat<(setge f64:$lhs, f64:$rhs), (GE_F64 f64:$lhs, f64:$rhs)>;
let Defs = [ARGUMENTS] in {
-def SELECT_F32 : I<(outs F32:$dst), (ins F32:$lhs, F32:$rhs, I32:$cond),
- [(set F32:$dst, (select I32:$cond, F32:$lhs, F32:$rhs))],
- "f32.select\t$dst, $lhs, $rhs, $cond", 0x1b>;
-def SELECT_F64 : I<(outs F64:$dst), (ins F64:$lhs, F64:$rhs, I32:$cond),
- [(set F64:$dst, (select I32:$cond, F64:$lhs, F64:$rhs))],
- "f64.select\t$dst, $lhs, $rhs, $cond", 0x1b>;
+defm SELECT_F32 : I<(outs F32:$dst), (ins F32:$lhs, F32:$rhs, I32:$cond),
+ (outs), (ins),
+ [(set F32:$dst, (select I32:$cond, F32:$lhs, F32:$rhs))],
+ "f32.select\t$dst, $lhs, $rhs, $cond", "f32.select", 0x1b>;
+defm SELECT_F64 : I<(outs F64:$dst), (ins F64:$lhs, F64:$rhs, I32:$cond),
+ (outs), (ins),
+ [(set F64:$dst, (select I32:$cond, F64:$lhs, F64:$rhs))],
+ "f64.select\t$dst, $lhs, $rhs, $cond", "f64.select", 0x1b>;
} // Defs = [ARGUMENTS]
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
index 4f41fcc232e9..403152c80660 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
@@ -8,99 +8,160 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief WebAssembly instruction format definitions.
+/// WebAssembly instruction format definitions.
///
//===----------------------------------------------------------------------===//
// WebAssembly Instruction Format.
-class WebAssemblyInst<bits<32> inst, string asmstr> : Instruction {
+// We instantiate 2 of these for every actual instruction (register based
+// and stack based), see below.
+class WebAssemblyInst<bits<32> inst, string asmstr, bit stack> : Instruction {
field bits<32> Inst = inst; // Instruction encoding.
+ field bit StackBased = stack;
let Namespace = "WebAssembly";
let Pattern = [];
let AsmString = asmstr;
}
-// Normal instructions.
-class I<dag oops, dag iops, list<dag> pattern, string asmstr = "", bits<32> inst = -1>
- : WebAssemblyInst<inst, asmstr> {
+// Normal instructions. Default instantiation of a WebAssemblyInst.
+class NI<dag oops, dag iops, list<dag> pattern, bit stack, string asmstr = "",
+ bits<32> inst = -1>
+ : WebAssemblyInst<inst, asmstr, stack> {
dag OutOperandList = oops;
dag InOperandList = iops;
let Pattern = pattern;
}
-class SIMD_I<dag oops, dag iops, list<dag> pattern,
- string asmstr = "", bits<32> inst = -1>
- : I<oops, iops, pattern, asmstr, inst>, Requires<[HasSIMD128]>;
+// Generates both register and stack based versions of one actual instruction.
+// We have 2 sets of operands (oops & iops) for the register and stack
+// based version of this instruction, as well as the corresponding asmstr.
+// The register versions have virtual-register operands which correspond to wasm
+// locals or stack locations. Each use and def of the register corresponds to an
+// implicit get_local / set_local or access of stack operands in wasm. These
+// instructions are used for ISel and all MI passes. The stack versions of the
+// instructions do not have register operands (they implicitly operate on the
+// stack), and get_locals and set_locals are explicit. The register instructions
+// are converted to their corresponding stack instructions before lowering to
+// MC.
+// Every instruction should want to be based on this multi-class to guarantee
+// there is always an equivalent pair of instructions.
+multiclass I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
+ list<dag> pattern_r, string asmstr_r = "", string asmstr_s = "",
+ bits<32> inst = -1> {
+ def "" : NI<oops_r, iops_r, pattern_r, 0, asmstr_r, inst>;
+ def _S : NI<oops_s, iops_s, [], 1, asmstr_s, inst>;
+}
+
+// For instructions that have no register ops, so both sets are the same.
+multiclass NRI<dag oops, dag iops, list<dag> pattern, string asmstr = "",
+ bits<32> inst = -1> {
+ defm "": I<oops, iops, oops, iops, pattern, asmstr, asmstr, inst>;
+}
-class ATOMIC_I<dag oops, dag iops, list<dag> pattern,
- string asmstr = "", bits<32> inst = -1>
- : I<oops, iops, pattern, asmstr, inst>, Requires<[HasAtomics]>;
+multiclass SIMD_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
+ list<dag> pattern_r, string asmstr_r = "",
+ string asmstr_s = "", bits<32> inst = -1> {
+ defm "" : I<oops_r, iops_r, oops_s, iops_s, pattern_r, asmstr_r, asmstr_s,
+ inst>,
+ Requires<[HasSIMD128]>;
+}
+
+multiclass ATOMIC_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
+ list<dag> pattern_r, string asmstr_r = "",
+ string asmstr_s = "", bits<32> inst = -1> {
+ defm "" : I<oops_r, iops_r, oops_s, iops_s, pattern_r, asmstr_r, asmstr_s,
+ inst>,
+ Requires<[HasAtomics]>;
+}
// Unary and binary instructions, for the local types that WebAssembly supports.
-multiclass UnaryInt<SDNode node, string name, bits<32> i32Inst, bits<32> i64Inst> {
- def _I32 : I<(outs I32:$dst), (ins I32:$src),
- [(set I32:$dst, (node I32:$src))],
- !strconcat("i32.", !strconcat(name, "\t$dst, $src")), i32Inst>;
- def _I64 : I<(outs I64:$dst), (ins I64:$src),
- [(set I64:$dst, (node I64:$src))],
- !strconcat("i64.", !strconcat(name, "\t$dst, $src")), i64Inst>;
+multiclass UnaryInt<SDNode node, string name, bits<32> i32Inst,
+ bits<32> i64Inst> {
+ defm _I32 : I<(outs I32:$dst), (ins I32:$src), (outs), (ins),
+ [(set I32:$dst, (node I32:$src))],
+ !strconcat("i32.", !strconcat(name, "\t$dst, $src")),
+ !strconcat("i32.", name), i32Inst>;
+ defm _I64 : I<(outs I64:$dst), (ins I64:$src), (outs), (ins),
+ [(set I64:$dst, (node I64:$src))],
+ !strconcat("i64.", !strconcat(name, "\t$dst, $src")),
+ !strconcat("i64.", name), i64Inst>;
}
-multiclass BinaryInt<SDNode node, string name, bits<32> i32Inst, bits<32> i64Inst> {
- def _I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs),
- [(set I32:$dst, (node I32:$lhs, I32:$rhs))],
- !strconcat("i32.", !strconcat(name, "\t$dst, $lhs, $rhs")), i32Inst>;
- def _I64 : I<(outs I64:$dst), (ins I64:$lhs, I64:$rhs),
- [(set I64:$dst, (node I64:$lhs, I64:$rhs))],
- !strconcat("i64.", !strconcat(name, "\t$dst, $lhs, $rhs")), i64Inst>;
+multiclass BinaryInt<SDNode node, string name, bits<32> i32Inst,
+ bits<32> i64Inst> {
+ defm _I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs), (outs), (ins),
+ [(set I32:$dst, (node I32:$lhs, I32:$rhs))],
+ !strconcat("i32.", !strconcat(name, "\t$dst, $lhs, $rhs")),
+ !strconcat("i32.", name), i32Inst>;
+ defm _I64 : I<(outs I64:$dst), (ins I64:$lhs, I64:$rhs), (outs), (ins),
+ [(set I64:$dst, (node I64:$lhs, I64:$rhs))],
+ !strconcat("i64.", !strconcat(name, "\t$dst, $lhs, $rhs")),
+ !strconcat("i64.", name), i64Inst>;
}
-multiclass UnaryFP<SDNode node, string name, bits<32> f32Inst, bits<32> f64Inst> {
- def _F32 : I<(outs F32:$dst), (ins F32:$src),
- [(set F32:$dst, (node F32:$src))],
- !strconcat("f32.", !strconcat(name, "\t$dst, $src")), f32Inst>;
- def _F64 : I<(outs F64:$dst), (ins F64:$src),
- [(set F64:$dst, (node F64:$src))],
- !strconcat("f64.", !strconcat(name, "\t$dst, $src")), f64Inst>;
+multiclass UnaryFP<SDNode node, string name, bits<32> f32Inst,
+ bits<32> f64Inst> {
+ defm _F32 : I<(outs F32:$dst), (ins F32:$src), (outs), (ins),
+ [(set F32:$dst, (node F32:$src))],
+ !strconcat("f32.", !strconcat(name, "\t$dst, $src")),
+ !strconcat("f32.", name), f32Inst>;
+ defm _F64 : I<(outs F64:$dst), (ins F64:$src), (outs), (ins),
+ [(set F64:$dst, (node F64:$src))],
+ !strconcat("f64.", !strconcat(name, "\t$dst, $src")),
+ !strconcat("f64.", name), f64Inst>;
}
-multiclass BinaryFP<SDNode node, string name, bits<32> f32Inst, bits<32> f64Inst> {
- def _F32 : I<(outs F32:$dst), (ins F32:$lhs, F32:$rhs),
- [(set F32:$dst, (node F32:$lhs, F32:$rhs))],
- !strconcat("f32.", !strconcat(name, "\t$dst, $lhs, $rhs")), f32Inst>;
- def _F64 : I<(outs F64:$dst), (ins F64:$lhs, F64:$rhs),
- [(set F64:$dst, (node F64:$lhs, F64:$rhs))],
- !strconcat("f64.", !strconcat(name, "\t$dst, $lhs, $rhs")), f64Inst>;
+multiclass BinaryFP<SDNode node, string name, bits<32> f32Inst,
+ bits<32> f64Inst> {
+ defm _F32 : I<(outs F32:$dst), (ins F32:$lhs, F32:$rhs), (outs), (ins),
+ [(set F32:$dst, (node F32:$lhs, F32:$rhs))],
+ !strconcat("f32.", !strconcat(name, "\t$dst, $lhs, $rhs")),
+ !strconcat("f32.", name), f32Inst>;
+ defm _F64 : I<(outs F64:$dst), (ins F64:$lhs, F64:$rhs), (outs), (ins),
+ [(set F64:$dst, (node F64:$lhs, F64:$rhs))],
+ !strconcat("f64.", !strconcat(name, "\t$dst, $lhs, $rhs")),
+ !strconcat("f64.", name), f64Inst>;
}
multiclass SIMDBinary<SDNode node, SDNode fnode, string name> {
- def _I8x16 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
- [(set (v16i8 V128:$dst), (node V128:$lhs, V128:$rhs))],
- !strconcat("i8x16.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
- def _I16x8 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
- [(set (v8i16 V128:$dst), (node V128:$lhs, V128:$rhs))],
- !strconcat("i16x8.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
- def _I32x4 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
- [(set (v4i32 V128:$dst), (node V128:$lhs, V128:$rhs))],
- !strconcat("i32x4.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
- def _F32x4 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
- [(set (v4f32 V128:$dst), (fnode V128:$lhs, V128:$rhs))],
- !strconcat("f32x4.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
-
+ defm _I8x16 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
+ (outs), (ins),
+ [(set (v16i8 V128:$dst), (node V128:$lhs, V128:$rhs))],
+ !strconcat("i8x16.",
+ !strconcat(name, "\t$dst, $lhs, $rhs")),
+ !strconcat("i8x16.", name)>;
+ defm _I16x8 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
+ (outs), (ins),
+ [(set (v8i16 V128:$dst), (node V128:$lhs, V128:$rhs))],
+ !strconcat("i16x8.",
+ !strconcat(name, "\t$dst, $lhs, $rhs")),
+ !strconcat("i16x8.", name)>;
+ defm _I32x4 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
+ (outs), (ins),
+ [(set (v4i32 V128:$dst), (node V128:$lhs, V128:$rhs))],
+ !strconcat("i32x4.",
+ !strconcat(name, "\t$dst, $lhs, $rhs")),
+ !strconcat("i32x4.", name)>;
+ defm _F32x4 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
+ (outs), (ins),
+ [(set (v4f32 V128:$dst), (fnode V128:$lhs, V128:$rhs))],
+ !strconcat("f32x4.",
+ !strconcat(name, "\t$dst, $lhs, $rhs")),
+ !strconcat("f32x4.", name)>;
}
multiclass ComparisonInt<CondCode cond, string name, bits<32> i32Inst, bits<32> i64Inst> {
- def _I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs),
- [(set I32:$dst, (setcc I32:$lhs, I32:$rhs, cond))],
- !strconcat("i32.", !strconcat(name, "\t$dst, $lhs, $rhs")),
- i32Inst>;
- def _I64 : I<(outs I32:$dst), (ins I64:$lhs, I64:$rhs),
- [(set I32:$dst, (setcc I64:$lhs, I64:$rhs, cond))],
- !strconcat("i64.", !strconcat(name, "\t$dst, $lhs, $rhs")),
- i64Inst>;
+ defm _I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs), (outs), (ins),
+ [(set I32:$dst, (setcc I32:$lhs, I32:$rhs, cond))],
+ !strconcat("i32.", !strconcat(name, "\t$dst, $lhs, $rhs")),
+ !strconcat("i32.", name), i32Inst>;
+ defm _I64 : I<(outs I32:$dst), (ins I64:$lhs, I64:$rhs), (outs), (ins),
+ [(set I32:$dst, (setcc I64:$lhs, I64:$rhs, cond))],
+ !strconcat("i64.", !strconcat(name, "\t$dst, $lhs, $rhs")),
+ !strconcat("i64.", name), i64Inst>;
}
multiclass ComparisonFP<CondCode cond, string name, bits<32> f32Inst, bits<32> f64Inst> {
- def _F32 : I<(outs I32:$dst), (ins F32:$lhs, F32:$rhs),
- [(set I32:$dst, (setcc F32:$lhs, F32:$rhs, cond))],
- !strconcat("f32.", !strconcat(name, "\t$dst, $lhs, $rhs")),
- f32Inst>;
- def _F64 : I<(outs I32:$dst), (ins F64:$lhs, F64:$rhs),
- [(set I32:$dst, (setcc F64:$lhs, F64:$rhs, cond))],
- !strconcat("f64.", !strconcat(name, "\t$dst, $lhs, $rhs")),
- f64Inst>;
+ defm _F32 : I<(outs I32:$dst), (ins F32:$lhs, F32:$rhs), (outs), (ins),
+ [(set I32:$dst, (setcc F32:$lhs, F32:$rhs, cond))],
+ !strconcat("f32.", !strconcat(name, "\t$dst, $lhs, $rhs")),
+ !strconcat("f32.", name), f32Inst>;
+ defm _F64 : I<(outs I32:$dst), (ins F64:$lhs, F64:$rhs), (outs), (ins),
+ [(set I32:$dst, (setcc F64:$lhs, F64:$rhs, cond))],
+ !strconcat("f64.", !strconcat(name, "\t$dst, $lhs, $rhs")),
+ !strconcat("f64.", name), f64Inst>;
}
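To make the new two-record format concrete, here is roughly what a single use of the I multiclass expands to under the NI class above (a sketch written for this note, not lines from the patch), using the i32.eqz definition added later in this patch in WebAssemblyInstrInteger.td as the example:

  defm EQZ_I32 : I<(outs I32:$dst), (ins I32:$src), (outs), (ins),
                   [(set I32:$dst, (setcc I32:$src, 0, SETEQ))],
                   "i32.eqz \t$dst, $src", "i32.eqz", 0x45>;

  // ...expands to a register-based record, used by ISel and the MI passes:
  def EQZ_I32   : NI<(outs I32:$dst), (ins I32:$src),
                     [(set I32:$dst, (setcc I32:$src, 0, SETEQ))], 0,
                     "i32.eqz \t$dst, $src", 0x45>;
  // ...and a stack-based record with no register operands and no selection
  // pattern, sharing the same opcode; the register form is rewritten to this
  // before lowering to MC:
  def EQZ_I32_S : NI<(outs), (ins), [], 1, "i32.eqz", 0x45>;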
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
index 8846952e5af4..cd49bd1682ad 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains the WebAssembly implementation of the
+/// This file contains the WebAssembly implementation of the
/// TargetInstrInfo class.
///
//===----------------------------------------------------------------------===//
@@ -30,7 +30,8 @@ using namespace llvm;
WebAssemblyInstrInfo::WebAssemblyInstrInfo(const WebAssemblySubtarget &STI)
: WebAssemblyGenInstrInfo(WebAssembly::ADJCALLSTACKDOWN,
- WebAssembly::ADJCALLSTACKUP),
+ WebAssembly::ADJCALLSTACKUP,
+ WebAssembly::CATCHRET),
RI(STI.getTargetTriple()) {}
bool WebAssemblyInstrInfo::isReallyTriviallyReMaterializable(
@@ -151,7 +152,7 @@ unsigned WebAssemblyInstrInfo::removeBranch(MachineBasicBlock &MBB,
while (I != MBB.instr_begin()) {
--I;
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
if (!I->isTerminator())
break;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.h b/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
index eb74106336ed..4a3763c345b0 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains the WebAssembly implementation of the
+/// This file contains the WebAssembly implementation of the
/// TargetInstrInfo class.
///
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index f8d311ac3b00..aeb282a7febb 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief WebAssembly Instruction definitions.
+/// WebAssembly Instruction definitions.
///
//===----------------------------------------------------------------------===//
@@ -30,6 +30,24 @@ def NotHasNontrappingFPToInt :
Predicate<"!Subtarget->hasNontrappingFPToInt()">,
AssemblerPredicate<"!FeatureNontrappingFPToInt",
"nontrapping-fptoint">;
+def HasSignExt :
+ Predicate<"Subtarget->hasSignExt()">,
+ AssemblerPredicate<"FeatureSignExt",
+ "sign-ext">;
+def NotHasSignExt :
+ Predicate<"!Subtarget->hasSignExt()">,
+ AssemblerPredicate<"!FeatureSignExt",
+ "sign-ext">;
+
+def HasExceptionHandling :
+ Predicate<"Subtarget->hasExceptionHandling()">,
+ AssemblerPredicate<"FeatureExceptionHandling",
+ "exception-handling">;
+
+def NotHasExceptionHandling :
+ Predicate<"!Subtarget->hasExceptionHandling()">,
+ AssemblerPredicate<"!FeatureExceptionHandling",
+ "exception-handling">;
//===----------------------------------------------------------------------===//
// WebAssembly-specific DAG Node Types.
@@ -135,23 +153,26 @@ include "WebAssemblyInstrFormats.td"
multiclass ARGUMENT<WebAssemblyRegClass vt> {
let hasSideEffects = 1, Uses = [ARGUMENTS], isCodeGenOnly = 1 in
- def ARGUMENT_#vt : I<(outs vt:$res), (ins i32imm:$argno),
- [(set vt:$res, (WebAssemblyargument timm:$argno))]>;
+ defm ARGUMENT_#vt : I<(outs vt:$res), (ins i32imm:$argno),
+ (outs), (ins i32imm:$argno),
+ [(set vt:$res, (WebAssemblyargument timm:$argno))]>;
}
multiclass SIMD_ARGUMENT<ValueType vt> {
let hasSideEffects = 1, Uses = [ARGUMENTS], isCodeGenOnly = 1 in
- def ARGUMENT_#vt : SIMD_I<(outs V128:$res), (ins i32imm:$argno),
- [(set (vt V128:$res),
+ defm ARGUMENT_#vt : SIMD_I<(outs V128:$res), (ins i32imm:$argno),
+ (outs), (ins i32imm:$argno),
+ [(set (vt V128:$res),
(WebAssemblyargument timm:$argno))]>;
}
-defm : ARGUMENT<I32>;
-defm : ARGUMENT<I64>;
-defm : ARGUMENT<F32>;
-defm : ARGUMENT<F64>;
-defm : SIMD_ARGUMENT<v16i8>;
-defm : SIMD_ARGUMENT<v8i16>;
-defm : SIMD_ARGUMENT<v4i32>;
-defm : SIMD_ARGUMENT<v4f32>;
+defm "": ARGUMENT<I32>;
+defm "": ARGUMENT<I64>;
+defm "": ARGUMENT<F32>;
+defm "": ARGUMENT<F64>;
+defm "": ARGUMENT<EXCEPT_REF>;
+defm "": SIMD_ARGUMENT<v16i8>;
+defm "": SIMD_ARGUMENT<v8i16>;
+defm "": SIMD_ARGUMENT<v4i32>;
+defm "": SIMD_ARGUMENT<v4f32>;
let Defs = [ARGUMENTS] in {
@@ -165,69 +186,83 @@ let hasSideEffects = 0 in {
// and set_local. COPYs are eliminated (and replaced with
// get_local/set_local) in the ExplicitLocals pass.
let isAsCheapAsAMove = 1, isCodeGenOnly = 1 in
- def COPY_#vt : I<(outs vt:$res), (ins vt:$src), [], "copy_local\t$res, $src">;
+ defm COPY_#vt : I<(outs vt:$res), (ins vt:$src), (outs), (ins), [],
+ "copy_local\t$res, $src", "copy_local">;
// TEE is similar to COPY, but writes two copies of its result. Typically
// this would be used to stackify one result and write the other result to a
// local.
let isAsCheapAsAMove = 1, isCodeGenOnly = 1 in
- def TEE_#vt : I<(outs vt:$res, vt:$also), (ins vt:$src), [],
- "tee_local\t$res, $also, $src">;
+ defm TEE_#vt : I<(outs vt:$res, vt:$also), (ins vt:$src), (outs), (ins), [],
+ "tee_local\t$res, $also, $src", "tee_local">;
// This is the actual get_local instruction in wasm. These are made explicit
// by the ExplicitLocals pass. It has mayLoad because it reads from a wasm
// local, which is a side effect not otherwise modeled in LLVM.
let mayLoad = 1, isAsCheapAsAMove = 1 in
- def GET_LOCAL_#vt : I<(outs vt:$res), (ins local_op:$local), [],
- "get_local\t$res, $local", 0x20>;
+ defm GET_LOCAL_#vt : I<(outs vt:$res), (ins local_op:$local),
+ (outs), (ins local_op:$local), [],
+ "get_local\t$res, $local", "get_local\t$local", 0x20>;
// This is the actual set_local instruction in wasm. These are made explicit
// by the ExplicitLocals pass. It has mayStore because it writes to a wasm
// local, which is a side effect not otherwise modeled in LLVM.
let mayStore = 1, isAsCheapAsAMove = 1 in
- def SET_LOCAL_#vt : I<(outs), (ins local_op:$local, vt:$src), [],
- "set_local\t$local, $src", 0x21>;
+ defm SET_LOCAL_#vt : I<(outs), (ins local_op:$local, vt:$src),
+ (outs), (ins local_op:$local), [],
+ "set_local\t$local, $src", "set_local\t$local", 0x21>;
// This is the actual tee_local instruction in wasm. TEEs are turned into
// TEE_LOCALs by the ExplicitLocals pass. It has mayStore for the same reason
// as SET_LOCAL.
let mayStore = 1, isAsCheapAsAMove = 1 in
- def TEE_LOCAL_#vt : I<(outs vt:$res), (ins local_op:$local, vt:$src), [],
- "tee_local\t$res, $local, $src", 0x22>;
+ defm TEE_LOCAL_#vt : I<(outs vt:$res), (ins local_op:$local, vt:$src),
+ (outs), (ins local_op:$local), [],
+ "tee_local\t$res, $local, $src", "tee_local\t$local",
+ 0x22>;
// Unused values must be dropped in some contexts.
- def DROP_#vt : I<(outs), (ins vt:$src), [],
- "drop\t$src", 0x1a>;
+ defm DROP_#vt : I<(outs), (ins vt:$src), (outs), (ins), [],
+ "drop\t$src", "drop", 0x1a>;
let mayLoad = 1 in
- def GET_GLOBAL_#vt : I<(outs vt:$res), (ins global_op:$local), [],
- "get_global\t$res, $local", 0x23>;
+ defm GET_GLOBAL_#vt : I<(outs vt:$res), (ins global_op:$local),
+ (outs), (ins global_op:$local), [],
+ "get_global\t$res, $local", "get_global\t$local",
+ 0x23>;
let mayStore = 1 in
- def SET_GLOBAL_#vt : I<(outs), (ins global_op:$local, vt:$src), [],
- "set_global\t$local, $src", 0x24>;
+ defm SET_GLOBAL_#vt : I<(outs), (ins global_op:$local, vt:$src),
+ (outs), (ins global_op:$local), [],
+ "set_global\t$local, $src", "set_global\t$local",
+ 0x24>;
} // hasSideEffects = 0
}
-defm : LOCAL<I32>;
-defm : LOCAL<I64>;
-defm : LOCAL<F32>;
-defm : LOCAL<F64>;
-defm : LOCAL<V128>, Requires<[HasSIMD128]>;
+defm "" : LOCAL<I32>;
+defm "" : LOCAL<I64>;
+defm "" : LOCAL<F32>;
+defm "" : LOCAL<F64>;
+defm "" : LOCAL<V128>, Requires<[HasSIMD128]>;
+defm "" : LOCAL<EXCEPT_REF>, Requires<[HasExceptionHandling]>;
let isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1 in {
-def CONST_I32 : I<(outs I32:$res), (ins i32imm_op:$imm),
- [(set I32:$res, imm:$imm)],
- "i32.const\t$res, $imm", 0x41>;
-def CONST_I64 : I<(outs I64:$res), (ins i64imm_op:$imm),
- [(set I64:$res, imm:$imm)],
- "i64.const\t$res, $imm", 0x42>;
-def CONST_F32 : I<(outs F32:$res), (ins f32imm_op:$imm),
- [(set F32:$res, fpimm:$imm)],
- "f32.const\t$res, $imm", 0x43>;
-def CONST_F64 : I<(outs F64:$res), (ins f64imm_op:$imm),
- [(set F64:$res, fpimm:$imm)],
- "f64.const\t$res, $imm", 0x44>;
+defm CONST_I32 : I<(outs I32:$res), (ins i32imm_op:$imm),
+ (outs), (ins i32imm_op:$imm),
+ [(set I32:$res, imm:$imm)],
+ "i32.const\t$res, $imm", "i32.const\t$imm", 0x41>;
+defm CONST_I64 : I<(outs I64:$res), (ins i64imm_op:$imm),
+ (outs), (ins i64imm_op:$imm),
+ [(set I64:$res, imm:$imm)],
+ "i64.const\t$res, $imm", "i64.const\t$imm", 0x42>;
+defm CONST_F32 : I<(outs F32:$res), (ins f32imm_op:$imm),
+ (outs), (ins f32imm_op:$imm),
+ [(set F32:$res, fpimm:$imm)],
+ "f32.const\t$res, $imm", "f32.const\t$imm", 0x43>;
+defm CONST_F64 : I<(outs F64:$res), (ins f64imm_op:$imm),
+ (outs), (ins f64imm_op:$imm),
+ [(set F64:$res, fpimm:$imm)],
+ "f64.const\t$res, $imm", "f64.const\t$imm", 0x44>;
} // isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1
} // Defs = [ARGUMENTS]
@@ -249,3 +284,4 @@ include "WebAssemblyInstrConv.td"
include "WebAssemblyInstrFloat.td"
include "WebAssemblyInstrAtomics.td"
include "WebAssemblyInstrSIMD.td"
+include "WebAssemblyInstrExceptRef.td"
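The defm conversions above presuppose a two-variant instruction multiclass: each defm is expected to yield a register-based record used by CodeGen plus a stack-based record (suffix "_S") used when printing or parsing real stack-machine assembly, which is why every definition gains a second (outs)/(ins) pair and a second asm string. A minimal TableGen sketch of that assumed multiclass follows; the real definition lives in WebAssemblyInstrFormats.td, which is not part of this excerpt, so the parameter names and the NI base-class signature here are assumptions.

// Illustrative sketch only; see WebAssemblyInstrFormats.td for the real form.
multiclass I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
             list<dag> pattern_r, string asmstr_r = "",
             string asmstr_s = "", bits<32> inst = -1> {
  // Register-based variant: carries the selection pattern and is used by
  // CodeGen while locals are still implicit.
  def "" : NI<oops_r, iops_r, pattern_r, asmstr_r, inst>;
  // Stack-based variant (suffix "_S"): operands that live on the wasm value
  // stack are dropped from the operand lists and from the asm string.
  def _S : NI<oops_s, iops_s, [], asmstr_s, inst>;
}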
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
index e872dc219846..f9f21fd1d754 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief WebAssembly Integer operand code-gen constructs.
+/// WebAssembly Integer operand code-gen constructs.
///
//===----------------------------------------------------------------------===//
@@ -56,12 +56,12 @@ defm CLZ : UnaryInt<ctlz, "clz ", 0x67, 0x79>;
defm CTZ : UnaryInt<cttz, "ctz ", 0x68, 0x7a>;
defm POPCNT : UnaryInt<ctpop, "popcnt", 0x69, 0x7b>;
-def EQZ_I32 : I<(outs I32:$dst), (ins I32:$src),
- [(set I32:$dst, (setcc I32:$src, 0, SETEQ))],
- "i32.eqz \t$dst, $src", 0x45>;
-def EQZ_I64 : I<(outs I32:$dst), (ins I64:$src),
- [(set I32:$dst, (setcc I64:$src, 0, SETEQ))],
- "i64.eqz \t$dst, $src", 0x50>;
+defm EQZ_I32 : I<(outs I32:$dst), (ins I32:$src), (outs), (ins),
+ [(set I32:$dst, (setcc I32:$src, 0, SETEQ))],
+ "i32.eqz \t$dst, $src", "i32.eqz", 0x45>;
+defm EQZ_I64 : I<(outs I32:$dst), (ins I64:$src), (outs), (ins),
+ [(set I32:$dst, (setcc I64:$src, 0, SETEQ))],
+ "i64.eqz \t$dst, $src", "i64.eqz", 0x50>;
} // Defs = [ARGUMENTS]
@@ -73,12 +73,14 @@ def : Pat<(rotr I64:$lhs, (and I64:$rhs, 63)), (ROTR_I64 I64:$lhs, I64:$rhs)>;
let Defs = [ARGUMENTS] in {
-def SELECT_I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs, I32:$cond),
- [(set I32:$dst, (select I32:$cond, I32:$lhs, I32:$rhs))],
- "i32.select\t$dst, $lhs, $rhs, $cond", 0x1b>;
-def SELECT_I64 : I<(outs I64:$dst), (ins I64:$lhs, I64:$rhs, I32:$cond),
- [(set I64:$dst, (select I32:$cond, I64:$lhs, I64:$rhs))],
- "i64.select\t$dst, $lhs, $rhs, $cond", 0x1b>;
+defm SELECT_I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs, I32:$cond),
+ (outs), (ins),
+ [(set I32:$dst, (select I32:$cond, I32:$lhs, I32:$rhs))],
+ "i32.select\t$dst, $lhs, $rhs, $cond", "i32.select", 0x1b>;
+defm SELECT_I64 : I<(outs I64:$dst), (ins I64:$lhs, I64:$rhs, I32:$cond),
+ (outs), (ins),
+ [(set I64:$dst, (select I32:$cond, I64:$lhs, I64:$rhs))],
+ "i64.select\t$dst, $lhs, $rhs, $cond", "i64.select", 0x1b>;
} // Defs = [ARGUMENTS]
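As with the other defm conversions, EQZ_* and SELECT_* above are expected to expand into paired records. A rough sketch of the assumed expansion for EQZ_I32 is shown below; the record naming follows the usual "_S" convention, and the exact NI signature is an assumption since WebAssemblyInstrFormats.td is not shown in this diff.

// Assumed expansion of "defm EQZ_I32 : I<...>" above (illustrative only).
def EQZ_I32   : NI<(outs I32:$dst), (ins I32:$src),
                   [(set I32:$dst, (setcc I32:$src, 0, SETEQ))],
                   "i32.eqz \t$dst, $src", 0x45>; // register form, for CodeGen
def EQZ_I32_S : NI<(outs), (ins), [],
                   "i32.eqz", 0x45>;              // stack form, for MC printing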
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
index 9d58895ca5a6..8a49325af2bd 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief WebAssembly Memory operand code-gen constructs.
+/// WebAssembly Memory operand code-gen constructs.
///
//===----------------------------------------------------------------------===//
@@ -56,24 +56,27 @@ def regPlusGA : PatFrag<(ops node:$addr, node:$off),
let Defs = [ARGUMENTS] in {
// Defines atomic and non-atomic loads, regular and extending.
-class WebAssemblyLoad<WebAssemblyRegClass rc, string Name, int Opcode> :
- I<(outs rc:$dst),
- (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
- [], !strconcat(Name, "\t$dst, ${off}(${addr})${p2align}"), Opcode>;
+multiclass WebAssemblyLoad<WebAssemblyRegClass rc, string Name, int Opcode> {
+ defm "": I<(outs rc:$dst),
+ (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+ (outs), (ins P2Align:$p2align, offset32_op:$off),
+ [], !strconcat(Name, "\t$dst, ${off}(${addr})${p2align}"),
+ !strconcat(Name, "\t${off}, ${p2align}"), Opcode>;
+}
// Basic load.
// FIXME: When we can break syntax compatibility, reorder the fields in the
// asmstrings to match the binary encoding.
-def LOAD_I32 : WebAssemblyLoad<I32, "i32.load", 0x28>;
-def LOAD_I64 : WebAssemblyLoad<I64, "i64.load", 0x29>;
-def LOAD_F32 : WebAssemblyLoad<F32, "f32.load", 0x2a>;
-def LOAD_F64 : WebAssemblyLoad<F64, "f64.load", 0x2b>;
+defm LOAD_I32 : WebAssemblyLoad<I32, "i32.load", 0x28>;
+defm LOAD_I64 : WebAssemblyLoad<I64, "i64.load", 0x29>;
+defm LOAD_F32 : WebAssemblyLoad<F32, "f32.load", 0x2a>;
+defm LOAD_F64 : WebAssemblyLoad<F64, "f64.load", 0x2b>;
} // Defs = [ARGUMENTS]
// Select loads with no constant offset.
-class LoadPatNoOffset<ValueType ty, PatFrag node, I inst> :
- Pat<(ty (node I32:$addr)), (inst 0, 0, $addr)>;
+class LoadPatNoOffset<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(ty (kind I32:$addr)), (inst 0, 0, I32:$addr)>;
def : LoadPatNoOffset<i32, load, LOAD_I32>;
def : LoadPatNoOffset<i64, load, LOAD_I64>;
@@ -84,9 +87,8 @@ def : LoadPatNoOffset<f64, load, LOAD_F64>;
// Select loads with a constant offset.
// Pattern with address + immediate offset
-class LoadPatImmOff<ValueType ty, PatFrag loadkind, PatFrag operand, I inst> :
- Pat<(ty (loadkind (operand I32:$addr, imm:$off))),
- (inst 0, imm:$off, $addr)>;
+class LoadPatImmOff<ValueType ty, PatFrag kind, PatFrag operand, NI inst> :
+ Pat<(ty (kind (operand I32:$addr, imm:$off))), (inst 0, imm:$off, I32:$addr)>;
def : LoadPatImmOff<i32, load, regPlusImm, LOAD_I32>;
def : LoadPatImmOff<i64, load, regPlusImm, LOAD_I64>;
@@ -97,18 +99,18 @@ def : LoadPatImmOff<i64, load, or_is_add, LOAD_I64>;
def : LoadPatImmOff<f32, load, or_is_add, LOAD_F32>;
def : LoadPatImmOff<f64, load, or_is_add, LOAD_F64>;
-class LoadPatGlobalAddr<ValueType ty, PatFrag loadkind, I inst> :
- Pat<(ty (loadkind (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off)))),
- (inst 0, tglobaladdr:$off, $addr)>;
+class LoadPatGlobalAddr<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(ty (kind (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off)))),
+ (inst 0, tglobaladdr:$off, I32:$addr)>;
def : LoadPatGlobalAddr<i32, load, LOAD_I32>;
def : LoadPatGlobalAddr<i64, load, LOAD_I64>;
def : LoadPatGlobalAddr<f32, load, LOAD_F32>;
def : LoadPatGlobalAddr<f64, load, LOAD_F64>;
-class LoadPatExternalSym<ValueType ty, PatFrag loadkind, I inst> :
- Pat<(ty (loadkind (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))),
- (inst 0, texternalsym:$off, $addr)>;
+class LoadPatExternalSym<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(ty (kind (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))),
+ (inst 0, texternalsym:$off, I32:$addr)>;
def : LoadPatExternalSym<i32, load, LOAD_I32>;
def : LoadPatExternalSym<i64, load, LOAD_I64>;
def : LoadPatExternalSym<f32, load, LOAD_F32>;
@@ -116,16 +118,16 @@ def : LoadPatExternalSym<f64, load, LOAD_F64>;
// Select loads with just a constant offset.
-class LoadPatOffsetOnly<ValueType ty, PatFrag loadkind, I inst> :
- Pat<(ty (loadkind imm:$off)), (inst 0, imm:$off, (CONST_I32 0))>;
+class LoadPatOffsetOnly<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(ty (kind imm:$off)), (inst 0, imm:$off, (CONST_I32 0))>;
def : LoadPatOffsetOnly<i32, load, LOAD_I32>;
def : LoadPatOffsetOnly<i64, load, LOAD_I64>;
def : LoadPatOffsetOnly<f32, load, LOAD_F32>;
def : LoadPatOffsetOnly<f64, load, LOAD_F64>;
-class LoadPatGlobalAddrOffOnly<ValueType ty, PatFrag loadkind, I inst> :
- Pat<(ty (loadkind (WebAssemblywrapper tglobaladdr:$off))),
+class LoadPatGlobalAddrOffOnly<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(ty (kind (WebAssemblywrapper tglobaladdr:$off))),
(inst 0, tglobaladdr:$off, (CONST_I32 0))>;
def : LoadPatGlobalAddrOffOnly<i32, load, LOAD_I32>;
@@ -133,8 +135,8 @@ def : LoadPatGlobalAddrOffOnly<i64, load, LOAD_I64>;
def : LoadPatGlobalAddrOffOnly<f32, load, LOAD_F32>;
def : LoadPatGlobalAddrOffOnly<f64, load, LOAD_F64>;
-class LoadPatExternSymOffOnly<ValueType ty, PatFrag loadkind, I inst> :
- Pat<(ty (loadkind (WebAssemblywrapper texternalsym:$off))),
+class LoadPatExternSymOffOnly<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(ty (kind (WebAssemblywrapper texternalsym:$off))),
(inst 0, texternalsym:$off, (CONST_I32 0))>;
def : LoadPatExternSymOffOnly<i32, load, LOAD_I32>;
def : LoadPatExternSymOffOnly<i64, load, LOAD_I64>;
@@ -144,16 +146,16 @@ def : LoadPatExternSymOffOnly<f64, load, LOAD_F64>;
let Defs = [ARGUMENTS] in {
// Extending load.
-def LOAD8_S_I32 : WebAssemblyLoad<I32, "i32.load8_s", 0x2c>;
-def LOAD8_U_I32 : WebAssemblyLoad<I32, "i32.load8_u", 0x2d>;
-def LOAD16_S_I32 : WebAssemblyLoad<I32, "i32.load16_s", 0x2e>;
-def LOAD16_U_I32 : WebAssemblyLoad<I32, "i32.load16_u", 0x2f>;
-def LOAD8_S_I64 : WebAssemblyLoad<I64, "i64.load8_s", 0x30>;
-def LOAD8_U_I64 : WebAssemblyLoad<I64, "i64.load8_u", 0x31>;
-def LOAD16_S_I64 : WebAssemblyLoad<I64, "i64.load16_s", 0x32>;
-def LOAD16_U_I64 : WebAssemblyLoad<I64, "i64.load16_u", 0x32>;
-def LOAD32_S_I64 : WebAssemblyLoad<I64, "i64.load32_s", 0x34>;
-def LOAD32_U_I64 : WebAssemblyLoad<I64, "i64.load32_u", 0x35>;
+defm LOAD8_S_I32 : WebAssemblyLoad<I32, "i32.load8_s", 0x2c>;
+defm LOAD8_U_I32 : WebAssemblyLoad<I32, "i32.load8_u", 0x2d>;
+defm LOAD16_S_I32 : WebAssemblyLoad<I32, "i32.load16_s", 0x2e>;
+defm LOAD16_U_I32 : WebAssemblyLoad<I32, "i32.load16_u", 0x2f>;
+defm LOAD8_S_I64 : WebAssemblyLoad<I64, "i64.load8_s", 0x30>;
+defm LOAD8_U_I64 : WebAssemblyLoad<I64, "i64.load8_u", 0x31>;
+defm LOAD16_S_I64 : WebAssemblyLoad<I64, "i64.load16_s", 0x32>;
+defm LOAD16_U_I64 : WebAssemblyLoad<I64, "i64.load16_u", 0x33>;
+defm LOAD32_S_I64 : WebAssemblyLoad<I64, "i64.load32_s", 0x34>;
+defm LOAD32_U_I64 : WebAssemblyLoad<I64, "i64.load32_u", 0x35>;
} // Defs = [ARGUMENTS]
@@ -303,236 +305,191 @@ def : LoadPatExternSymOffOnly<i64, extloadi32, LOAD32_U_I64>;
let Defs = [ARGUMENTS] in {
+// Defines atomic and non-atomic stores, regular and truncating.
+multiclass WebAssemblyStore<WebAssemblyRegClass rc, string Name, int Opcode> {
+ defm "" : I<(outs),
+ (ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$val),
+ (outs),
+ (ins P2Align:$p2align, offset32_op:$off), [],
+ !strconcat(Name, "\t${off}(${addr})${p2align}, $val"),
+ !strconcat(Name, "\t${off}, ${p2align}"), Opcode>;
+}
// Basic store.
// Note: WebAssembly inverts SelectionDAG's usual operand order.
-def STORE_I32 : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
- I32:$val), [],
- "i32.store\t${off}(${addr})${p2align}, $val", 0x36>;
-def STORE_I64 : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
- I64:$val), [],
- "i64.store\t${off}(${addr})${p2align}, $val", 0x37>;
-def STORE_F32 : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
- F32:$val), [],
- "f32.store\t${off}(${addr})${p2align}, $val", 0x38>;
-def STORE_F64 : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
- F64:$val), [],
- "f64.store\t${off}(${addr})${p2align}, $val", 0x39>;
+defm STORE_I32 : WebAssemblyStore<I32, "i32.store", 0x36>;
+defm STORE_I64 : WebAssemblyStore<I64, "i64.store", 0x37>;
+defm STORE_F32 : WebAssemblyStore<F32, "f32.store", 0x38>;
+defm STORE_F64 : WebAssemblyStore<F64, "f64.store", 0x39>;
} // Defs = [ARGUMENTS]
// Select stores with no constant offset.
-def : Pat<(store I32:$val, I32:$addr), (STORE_I32 0, 0, I32:$addr, I32:$val)>;
-def : Pat<(store I64:$val, I32:$addr), (STORE_I64 0, 0, I32:$addr, I64:$val)>;
-def : Pat<(store F32:$val, I32:$addr), (STORE_F32 0, 0, I32:$addr, F32:$val)>;
-def : Pat<(store F64:$val, I32:$addr), (STORE_F64 0, 0, I32:$addr, F64:$val)>;
+class StorePatNoOffset<ValueType ty, PatFrag node, NI inst> :
+ Pat<(node ty:$val, I32:$addr), (inst 0, 0, I32:$addr, ty:$val)>;
+
+def : StorePatNoOffset<i32, store, STORE_I32>;
+def : StorePatNoOffset<i64, store, STORE_I64>;
+def : StorePatNoOffset<f32, store, STORE_F32>;
+def : StorePatNoOffset<f64, store, STORE_F64>;
// Select stores with a constant offset.
-def : Pat<(store I32:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE_I32 0, imm:$off, I32:$addr, I32:$val)>;
-def : Pat<(store I64:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE_I64 0, imm:$off, I32:$addr, I64:$val)>;
-def : Pat<(store F32:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE_F32 0, imm:$off, I32:$addr, F32:$val)>;
-def : Pat<(store F64:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE_F64 0, imm:$off, I32:$addr, F64:$val)>;
-def : Pat<(store I32:$val, (or_is_add I32:$addr, imm:$off)),
- (STORE_I32 0, imm:$off, I32:$addr, I32:$val)>;
-def : Pat<(store I64:$val, (or_is_add I32:$addr, imm:$off)),
- (STORE_I64 0, imm:$off, I32:$addr, I64:$val)>;
-def : Pat<(store F32:$val, (or_is_add I32:$addr, imm:$off)),
- (STORE_F32 0, imm:$off, I32:$addr, F32:$val)>;
-def : Pat<(store F64:$val, (or_is_add I32:$addr, imm:$off)),
- (STORE_F64 0, imm:$off, I32:$addr, F64:$val)>;
-def : Pat<(store I32:$val, (regPlusGA I32:$addr,
- (WebAssemblywrapper tglobaladdr:$off))),
- (STORE_I32 0, tglobaladdr:$off, I32:$addr, I32:$val)>;
-def : Pat<(store I64:$val, (regPlusGA I32:$addr,
- (WebAssemblywrapper tglobaladdr:$off))),
- (STORE_I64 0, tglobaladdr:$off, I32:$addr, I64:$val)>;
-def : Pat<(store F32:$val, (regPlusGA I32:$addr,
- (WebAssemblywrapper tglobaladdr:$off))),
- (STORE_F32 0, tglobaladdr:$off, I32:$addr, F32:$val)>;
-def : Pat<(store F64:$val, (regPlusGA I32:$addr,
- (WebAssemblywrapper tglobaladdr:$off))),
- (STORE_F64 0, tglobaladdr:$off, I32:$addr, F64:$val)>;
-def : Pat<(store I32:$val, (add I32:$addr,
- (WebAssemblywrapper texternalsym:$off))),
- (STORE_I32 0, texternalsym:$off, I32:$addr, I32:$val)>;
-def : Pat<(store I64:$val, (add I32:$addr,
- (WebAssemblywrapper texternalsym:$off))),
- (STORE_I64 0, texternalsym:$off, I32:$addr, I64:$val)>;
-def : Pat<(store F32:$val, (add I32:$addr,
- (WebAssemblywrapper texternalsym:$off))),
- (STORE_F32 0, texternalsym:$off, I32:$addr, F32:$val)>;
-def : Pat<(store F64:$val, (add I32:$addr,
- (WebAssemblywrapper texternalsym:$off))),
- (STORE_F64 0, texternalsym:$off, I32:$addr, F64:$val)>;
+class StorePatImmOff<ValueType ty, PatFrag kind, PatFrag operand, NI inst> :
+ Pat<(kind ty:$val, (operand I32:$addr, imm:$off)),
+ (inst 0, imm:$off, I32:$addr, ty:$val)>;
+
+def : StorePatImmOff<i32, store, regPlusImm, STORE_I32>;
+def : StorePatImmOff<i64, store, regPlusImm, STORE_I64>;
+def : StorePatImmOff<f32, store, regPlusImm, STORE_F32>;
+def : StorePatImmOff<f64, store, regPlusImm, STORE_F64>;
+def : StorePatImmOff<i32, store, or_is_add, STORE_I32>;
+def : StorePatImmOff<i64, store, or_is_add, STORE_I64>;
+def : StorePatImmOff<f32, store, or_is_add, STORE_F32>;
+def : StorePatImmOff<f64, store, or_is_add, STORE_F64>;
+
+class StorePatGlobalAddr<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(kind ty:$val,
+ (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off))),
+ (inst 0, tglobaladdr:$off, I32:$addr, ty:$val)>;
+def : StorePatGlobalAddr<i32, store, STORE_I32>;
+def : StorePatGlobalAddr<i64, store, STORE_I64>;
+def : StorePatGlobalAddr<f32, store, STORE_F32>;
+def : StorePatGlobalAddr<f64, store, STORE_F64>;
+
+class StorePatExternalSym<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(kind ty:$val, (add I32:$addr, (WebAssemblywrapper texternalsym:$off))),
+ (inst 0, texternalsym:$off, I32:$addr, ty:$val)>;
+def : StorePatExternalSym<i32, store, STORE_I32>;
+def : StorePatExternalSym<i64, store, STORE_I64>;
+def : StorePatExternalSym<f32, store, STORE_F32>;
+def : StorePatExternalSym<f64, store, STORE_F64>;
// Select stores with just a constant offset.
-def : Pat<(store I32:$val, imm:$off),
- (STORE_I32 0, imm:$off, (CONST_I32 0), I32:$val)>;
-def : Pat<(store I64:$val, imm:$off),
- (STORE_I64 0, imm:$off, (CONST_I32 0), I64:$val)>;
-def : Pat<(store F32:$val, imm:$off),
- (STORE_F32 0, imm:$off, (CONST_I32 0), F32:$val)>;
-def : Pat<(store F64:$val, imm:$off),
- (STORE_F64 0, imm:$off, (CONST_I32 0), F64:$val)>;
-def : Pat<(store I32:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE_I32 0, tglobaladdr:$off, (CONST_I32 0), I32:$val)>;
-def : Pat<(store I64:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE_I64 0, tglobaladdr:$off, (CONST_I32 0), I64:$val)>;
-def : Pat<(store F32:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE_F32 0, tglobaladdr:$off, (CONST_I32 0), F32:$val)>;
-def : Pat<(store F64:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE_F64 0, tglobaladdr:$off, (CONST_I32 0), F64:$val)>;
-def : Pat<(store I32:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE_I32 0, texternalsym:$off, (CONST_I32 0), I32:$val)>;
-def : Pat<(store I64:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE_I64 0, texternalsym:$off, (CONST_I32 0), I64:$val)>;
-def : Pat<(store F32:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE_F32 0, texternalsym:$off, (CONST_I32 0), F32:$val)>;
-def : Pat<(store F64:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE_F64 0, texternalsym:$off, (CONST_I32 0), F64:$val)>;
+class StorePatOffsetOnly<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(kind ty:$val, imm:$off), (inst 0, imm:$off, (CONST_I32 0), ty:$val)>;
+def : StorePatOffsetOnly<i32, store, STORE_I32>;
+def : StorePatOffsetOnly<i64, store, STORE_I64>;
+def : StorePatOffsetOnly<f32, store, STORE_F32>;
+def : StorePatOffsetOnly<f64, store, STORE_F64>;
+
+class StorePatGlobalAddrOffOnly<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(kind ty:$val, (WebAssemblywrapper tglobaladdr:$off)),
+ (inst 0, tglobaladdr:$off, (CONST_I32 0), ty:$val)>;
+def : StorePatGlobalAddrOffOnly<i32, store, STORE_I32>;
+def : StorePatGlobalAddrOffOnly<i64, store, STORE_I64>;
+def : StorePatGlobalAddrOffOnly<f32, store, STORE_F32>;
+def : StorePatGlobalAddrOffOnly<f64, store, STORE_F64>;
+
+class StorePatExternSymOffOnly<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(kind ty:$val, (WebAssemblywrapper texternalsym:$off)),
+ (inst 0, texternalsym:$off, (CONST_I32 0), ty:$val)>;
+def : StorePatExternSymOffOnly<i32, store, STORE_I32>;
+def : StorePatExternSymOffOnly<i64, store, STORE_I64>;
+def : StorePatExternSymOffOnly<f32, store, STORE_F32>;
+def : StorePatExternSymOffOnly<f64, store, STORE_F64>;
+
let Defs = [ARGUMENTS] in {
// Truncating store.
-def STORE8_I32 : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
- I32:$val), [],
- "i32.store8\t${off}(${addr})${p2align}, $val", 0x3a>;
-def STORE16_I32 : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
- I32:$val), [],
- "i32.store16\t${off}(${addr})${p2align}, $val", 0x3b>;
-def STORE8_I64 : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
- I64:$val), [],
- "i64.store8\t${off}(${addr})${p2align}, $val", 0x3c>;
-def STORE16_I64 : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
- I64:$val), [],
- "i64.store16\t${off}(${addr})${p2align}, $val", 0x3d>;
-def STORE32_I64 : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
- I64:$val), [],
- "i64.store32\t${off}(${addr})${p2align}, $val", 0x3e>;
+defm STORE8_I32 : WebAssemblyStore<I32, "i32.store8", 0x3a>;
+defm STORE16_I32 : WebAssemblyStore<I32, "i32.store16", 0x3b>;
+defm STORE8_I64 : WebAssemblyStore<I64, "i64.store8", 0x3c>;
+defm STORE16_I64 : WebAssemblyStore<I64, "i64.store16", 0x3d>;
+defm STORE32_I64 : WebAssemblyStore<I64, "i64.store32", 0x3e>;
} // Defs = [ARGUMENTS]
// Select truncating stores with no constant offset.
-def : Pat<(truncstorei8 I32:$val, I32:$addr),
- (STORE8_I32 0, 0, I32:$addr, I32:$val)>;
-def : Pat<(truncstorei16 I32:$val, I32:$addr),
- (STORE16_I32 0, 0, I32:$addr, I32:$val)>;
-def : Pat<(truncstorei8 I64:$val, I32:$addr),
- (STORE8_I64 0, 0, I32:$addr, I64:$val)>;
-def : Pat<(truncstorei16 I64:$val, I32:$addr),
- (STORE16_I64 0, 0, I32:$addr, I64:$val)>;
-def : Pat<(truncstorei32 I64:$val, I32:$addr),
- (STORE32_I64 0, 0, I32:$addr, I64:$val)>;
+def : StorePatNoOffset<i32, truncstorei8, STORE8_I32>;
+def : StorePatNoOffset<i32, truncstorei16, STORE16_I32>;
+def : StorePatNoOffset<i64, truncstorei8, STORE8_I64>;
+def : StorePatNoOffset<i64, truncstorei16, STORE16_I64>;
+def : StorePatNoOffset<i64, truncstorei32, STORE32_I64>;
// Select truncating stores with a constant offset.
-def : Pat<(truncstorei8 I32:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE8_I32 0, imm:$off, I32:$addr, I32:$val)>;
-def : Pat<(truncstorei16 I32:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE16_I32 0, imm:$off, I32:$addr, I32:$val)>;
-def : Pat<(truncstorei8 I64:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE8_I64 0, imm:$off, I32:$addr, I64:$val)>;
-def : Pat<(truncstorei16 I64:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE16_I64 0, imm:$off, I32:$addr, I64:$val)>;
-def : Pat<(truncstorei32 I64:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE32_I64 0, imm:$off, I32:$addr, I64:$val)>;
-def : Pat<(truncstorei8 I32:$val, (or_is_add I32:$addr, imm:$off)),
- (STORE8_I32 0, imm:$off, I32:$addr, I32:$val)>;
-def : Pat<(truncstorei16 I32:$val, (or_is_add I32:$addr, imm:$off)),
- (STORE16_I32 0, imm:$off, I32:$addr, I32:$val)>;
-def : Pat<(truncstorei8 I64:$val, (or_is_add I32:$addr, imm:$off)),
- (STORE8_I64 0, imm:$off, I32:$addr, I64:$val)>;
-def : Pat<(truncstorei16 I64:$val, (or_is_add I32:$addr, imm:$off)),
- (STORE16_I64 0, imm:$off, I32:$addr, I64:$val)>;
-def : Pat<(truncstorei32 I64:$val, (or_is_add I32:$addr, imm:$off)),
- (STORE32_I64 0, imm:$off, I32:$addr, I64:$val)>;
-def : Pat<(truncstorei8 I32:$val,
- (regPlusGA I32:$addr,
- (WebAssemblywrapper tglobaladdr:$off))),
- (STORE8_I32 0, tglobaladdr:$off, I32:$addr, I32:$val)>;
-def : Pat<(truncstorei16 I32:$val,
- (regPlusGA I32:$addr,
- (WebAssemblywrapper tglobaladdr:$off))),
- (STORE16_I32 0, tglobaladdr:$off, I32:$addr, I32:$val)>;
-def : Pat<(truncstorei8 I64:$val,
- (regPlusGA I32:$addr,
- (WebAssemblywrapper tglobaladdr:$off))),
- (STORE8_I64 0, tglobaladdr:$off, I32:$addr, I64:$val)>;
-def : Pat<(truncstorei16 I64:$val,
- (regPlusGA I32:$addr,
- (WebAssemblywrapper tglobaladdr:$off))),
- (STORE16_I64 0, tglobaladdr:$off, I32:$addr, I64:$val)>;
-def : Pat<(truncstorei32 I64:$val,
- (regPlusGA I32:$addr,
- (WebAssemblywrapper tglobaladdr:$off))),
- (STORE32_I64 0, tglobaladdr:$off, I32:$addr, I64:$val)>;
-def : Pat<(truncstorei8 I32:$val, (add I32:$addr,
- (WebAssemblywrapper texternalsym:$off))),
- (STORE8_I32 0, texternalsym:$off, I32:$addr, I32:$val)>;
-def : Pat<(truncstorei16 I32:$val,
- (add I32:$addr,
- (WebAssemblywrapper texternalsym:$off))),
- (STORE16_I32 0, texternalsym:$off, I32:$addr, I32:$val)>;
-def : Pat<(truncstorei8 I64:$val,
- (add I32:$addr,
- (WebAssemblywrapper texternalsym:$off))),
- (STORE8_I64 0, texternalsym:$off, I32:$addr, I64:$val)>;
-def : Pat<(truncstorei16 I64:$val,
- (add I32:$addr,
- (WebAssemblywrapper texternalsym:$off))),
- (STORE16_I64 0, texternalsym:$off, I32:$addr, I64:$val)>;
-def : Pat<(truncstorei32 I64:$val,
- (add I32:$addr,
- (WebAssemblywrapper texternalsym:$off))),
- (STORE32_I64 0, texternalsym:$off, I32:$addr, I64:$val)>;
+def : StorePatImmOff<i32, truncstorei8, regPlusImm, STORE8_I32>;
+def : StorePatImmOff<i32, truncstorei16, regPlusImm, STORE16_I32>;
+def : StorePatImmOff<i64, truncstorei8, regPlusImm, STORE8_I64>;
+def : StorePatImmOff<i64, truncstorei16, regPlusImm, STORE16_I64>;
+def : StorePatImmOff<i64, truncstorei32, regPlusImm, STORE32_I64>;
+def : StorePatImmOff<i32, truncstorei8, or_is_add, STORE8_I32>;
+def : StorePatImmOff<i32, truncstorei16, or_is_add, STORE16_I32>;
+def : StorePatImmOff<i64, truncstorei8, or_is_add, STORE8_I64>;
+def : StorePatImmOff<i64, truncstorei16, or_is_add, STORE16_I64>;
+def : StorePatImmOff<i64, truncstorei32, or_is_add, STORE32_I64>;
+
+def : StorePatGlobalAddr<i32, truncstorei8, STORE8_I32>;
+def : StorePatGlobalAddr<i32, truncstorei16, STORE16_I32>;
+def : StorePatGlobalAddr<i64, truncstorei8, STORE8_I64>;
+def : StorePatGlobalAddr<i64, truncstorei16, STORE16_I64>;
+def : StorePatGlobalAddr<i64, truncstorei32, STORE32_I64>;
+def : StorePatExternalSym<i32, truncstorei8, STORE8_I32>;
+def : StorePatExternalSym<i32, truncstorei16, STORE16_I32>;
+def : StorePatExternalSym<i64, truncstorei8, STORE8_I64>;
+def : StorePatExternalSym<i64, truncstorei16, STORE16_I64>;
+def : StorePatExternalSym<i64, truncstorei32, STORE32_I64>;
// Select truncating stores with just a constant offset.
-def : Pat<(truncstorei8 I32:$val, imm:$off),
- (STORE8_I32 0, imm:$off, (CONST_I32 0), I32:$val)>;
-def : Pat<(truncstorei16 I32:$val, imm:$off),
- (STORE16_I32 0, imm:$off, (CONST_I32 0), I32:$val)>;
-def : Pat<(truncstorei8 I64:$val, imm:$off),
- (STORE8_I64 0, imm:$off, (CONST_I32 0), I64:$val)>;
-def : Pat<(truncstorei16 I64:$val, imm:$off),
- (STORE16_I64 0, imm:$off, (CONST_I32 0), I64:$val)>;
-def : Pat<(truncstorei32 I64:$val, imm:$off),
- (STORE32_I64 0, imm:$off, (CONST_I32 0), I64:$val)>;
-def : Pat<(truncstorei8 I32:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE8_I32 0, tglobaladdr:$off, (CONST_I32 0), I32:$val)>;
-def : Pat<(truncstorei16 I32:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE16_I32 0, tglobaladdr:$off, (CONST_I32 0), I32:$val)>;
-def : Pat<(truncstorei8 I64:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE8_I64 0, tglobaladdr:$off, (CONST_I32 0), I64:$val)>;
-def : Pat<(truncstorei16 I64:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE16_I64 0, tglobaladdr:$off, (CONST_I32 0), I64:$val)>;
-def : Pat<(truncstorei32 I64:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE32_I64 0, tglobaladdr:$off, (CONST_I32 0), I64:$val)>;
-def : Pat<(truncstorei8 I32:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE8_I32 0, texternalsym:$off, (CONST_I32 0), I32:$val)>;
-def : Pat<(truncstorei16 I32:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE16_I32 0, texternalsym:$off, (CONST_I32 0), I32:$val)>;
-def : Pat<(truncstorei8 I64:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE8_I64 0, texternalsym:$off, (CONST_I32 0), I64:$val)>;
-def : Pat<(truncstorei16 I64:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE16_I64 0, texternalsym:$off, (CONST_I32 0), I64:$val)>;
-def : Pat<(truncstorei32 I64:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE32_I64 0, texternalsym:$off, (CONST_I32 0), I64:$val)>;
+def : StorePatOffsetOnly<i32, truncstorei8, STORE8_I32>;
+def : StorePatOffsetOnly<i32, truncstorei16, STORE16_I32>;
+def : StorePatOffsetOnly<i64, truncstorei8, STORE8_I64>;
+def : StorePatOffsetOnly<i64, truncstorei16, STORE16_I64>;
+def : StorePatOffsetOnly<i64, truncstorei32, STORE32_I64>;
+def : StorePatGlobalAddrOffOnly<i32, truncstorei8, STORE8_I32>;
+def : StorePatGlobalAddrOffOnly<i32, truncstorei16, STORE16_I32>;
+def : StorePatGlobalAddrOffOnly<i64, truncstorei8, STORE8_I64>;
+def : StorePatGlobalAddrOffOnly<i64, truncstorei16, STORE16_I64>;
+def : StorePatGlobalAddrOffOnly<i64, truncstorei32, STORE32_I64>;
+def : StorePatExternSymOffOnly<i32, truncstorei8, STORE8_I32>;
+def : StorePatExternSymOffOnly<i32, truncstorei16, STORE16_I32>;
+def : StorePatExternSymOffOnly<i64, truncstorei8, STORE8_I64>;
+def : StorePatExternSymOffOnly<i64, truncstorei16, STORE16_I64>;
+def : StorePatExternSymOffOnly<i64, truncstorei32, STORE32_I64>;
let Defs = [ARGUMENTS] in {
// Current memory size.
-def CURRENT_MEMORY_I32 : I<(outs I32:$dst), (ins i32imm:$flags),
- [],
- "current_memory\t$dst", 0x3f>,
- Requires<[HasAddr32]>;
+defm MEMORY_SIZE_I32 : I<(outs I32:$dst), (ins i32imm:$flags),
+ (outs), (ins i32imm:$flags),
+ [(set I32:$dst,
+ (int_wasm_memory_size (i32 imm:$flags)))],
+ "memory.size\t$dst, $flags", "memory.size\t$flags",
+ 0x3f>,
+ Requires<[HasAddr32]>;
+defm MEM_SIZE_I32 : I<(outs I32:$dst), (ins i32imm:$flags),
+ (outs), (ins i32imm:$flags),
+ [(set I32:$dst, (int_wasm_mem_size (i32 imm:$flags)))],
+ "mem.size\t$dst, $flags", "mem.size\t$flags", 0x3f>,
+ Requires<[HasAddr32]>;
+defm CURRENT_MEMORY_I32 : I<(outs I32:$dst), (ins i32imm:$flags),
+ (outs), (ins i32imm:$flags),
+ [],
+ "current_memory\t$dst",
+ "current_memory\t$flags", 0x3f>,
+ Requires<[HasAddr32]>;
// Grow memory.
-def GROW_MEMORY_I32 : I<(outs I32:$dst), (ins i32imm:$flags, I32:$delta),
- [],
- "grow_memory\t$dst, $delta", 0x40>,
- Requires<[HasAddr32]>;
+defm MEMORY_GROW_I32 : I<(outs I32:$dst), (ins i32imm:$flags, I32:$delta),
+ (outs), (ins i32imm:$flags, I32:$delta),
+ [(set I32:$dst,
+ (int_wasm_memory_grow (i32 imm:$flags),
+ I32:$delta))],
+ "memory.grow\t$dst, $flags, $delta",
+ "memory.grow\t$flags, $delta", 0x3f>,
+ Requires<[HasAddr32]>;
+defm MEM_GROW_I32 : I<(outs I32:$dst), (ins i32imm:$flags, I32:$delta),
+ (outs), (ins i32imm:$flags),
+ [(set I32:$dst,
+ (int_wasm_mem_grow (i32 imm:$flags), I32:$delta))],
+ "mem.grow\t$dst, $flags, $delta", "mem.grow\t$flags",
+ 0x3f>,
+ Requires<[HasAddr32]>;
+defm GROW_MEMORY_I32 : I<(outs I32:$dst), (ins i32imm:$flags, I32:$delta),
+ (outs), (ins i32imm:$flags),
+ [],
+ "grow_memory\t$dst, $delta", "grow_memory\t$flags",
+ 0x40>,
+ Requires<[HasAddr32]>;
} // Defs = [ARGUMENTS]
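The store refactor above trades the long per-type Pat<> lists for one WebAssemblyStore multiclass plus small reusable StorePat* pattern classes. A hypothetical usage sketch follows: the instruction name and opcode below are invented for illustration and do not exist in this commit.

// Hypothetical only: NEW_STORE8_I32 and opcode 0xff are not real definitions.
defm NEW_STORE8_I32 : WebAssemblyStore<I32, "i32.new_store8", 0xff>;
def : StorePatNoOffset<i32, truncstorei8, NEW_STORE8_I32>;
def : StorePatImmOff<i32, truncstorei8, regPlusImm, NEW_STORE8_I32>;
def : StorePatImmOff<i32, truncstorei8, or_is_add, NEW_STORE8_I32>;
def : StorePatGlobalAddr<i32, truncstorei8, NEW_STORE8_I32>;
def : StorePatOffsetOnly<i32, truncstorei8, NEW_STORE8_I32>;
def : StorePatGlobalAddrOffOnly<i32, truncstorei8, NEW_STORE8_I32>;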
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index e403534d580a..7d1edccdeb3c 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief WebAssembly SIMD operand code-gen constructs.
+/// WebAssembly SIMD operand code-gen constructs.
///
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp b/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
new file mode 100644
index 000000000000..e42dcbc0a8ac
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
@@ -0,0 +1,383 @@
+//=== WebAssemblyLateEHPrepare.cpp - WebAssembly Exception Preparation -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Does various transformations for exception handling.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssembly.h"
+#include "WebAssemblySubtarget.h"
+#include "WebAssemblyUtilities.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/WasmEHFuncInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-exception-prepare"
+
+namespace {
+class WebAssemblyLateEHPrepare final : public MachineFunctionPass {
+ StringRef getPassName() const override {
+ return "WebAssembly Prepare Exception";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ bool replaceFuncletReturns(MachineFunction &MF);
+ bool hoistCatches(MachineFunction &MF);
+ bool addCatchAlls(MachineFunction &MF);
+ bool addRethrows(MachineFunction &MF);
+ bool ensureSingleBBTermPads(MachineFunction &MF);
+ bool mergeTerminatePads(MachineFunction &MF);
+ bool addCatchAllTerminatePads(MachineFunction &MF);
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyLateEHPrepare() : MachineFunctionPass(ID) {}
+};
+} // end anonymous namespace
+
+char WebAssemblyLateEHPrepare::ID = 0;
+INITIALIZE_PASS(WebAssemblyLateEHPrepare, DEBUG_TYPE,
+ "WebAssembly Exception Preparation", false, false)
+
+FunctionPass *llvm::createWebAssemblyLateEHPrepare() {
+ return new WebAssemblyLateEHPrepare();
+}
+
+// Returns the nearest EH pad that dominates this instruction. This does not use
+// dominator analysis; it just walks back through the predecessors until it
+// arrives at an EH pad. This assumes valid EH scopes, so the first EH pad it
+// reaches along all possible search paths should be the same.
+// Returns nullptr in case it does not find any EH pad in the search, or finds
+// multiple different EH pads.
+MachineBasicBlock *GetMatchingEHPad(MachineInstr *MI) {
+ MachineFunction *MF = MI->getParent()->getParent();
+ SmallVector<MachineBasicBlock *, 2> WL;
+ SmallPtrSet<MachineBasicBlock *, 2> Visited;
+ WL.push_back(MI->getParent());
+ MachineBasicBlock *EHPad = nullptr;
+ while (!WL.empty()) {
+ MachineBasicBlock *MBB = WL.pop_back_val();
+ if (Visited.count(MBB))
+ continue;
+ Visited.insert(MBB);
+ if (MBB->isEHPad()) {
+ if (EHPad && EHPad != MBB)
+ return nullptr;
+ EHPad = MBB;
+ continue;
+ }
+ if (MBB == &MF->front())
+ return nullptr;
+ WL.append(MBB->pred_begin(), MBB->pred_end());
+ }
+ return EHPad;
+}
+
+// Erases the given BB and all its children from the function. If other BBs have
+// this BB as a successor, the successor relationships will be deleted as well.
+static void EraseBBAndChildren(MachineBasicBlock *MBB) {
+ SmallVector<MachineBasicBlock *, 8> WL;
+ WL.push_back(MBB);
+ while (!WL.empty()) {
+ MachineBasicBlock *MBB = WL.pop_back_val();
+ for (auto *Pred : MBB->predecessors())
+ Pred->removeSuccessor(MBB);
+ for (auto *Succ : MBB->successors()) {
+ WL.push_back(Succ);
+ MBB->removeSuccessor(Succ);
+ }
+ MBB->eraseFromParent();
+ }
+}
+
+bool WebAssemblyLateEHPrepare::runOnMachineFunction(MachineFunction &MF) {
+ if (MF.getTarget().getMCAsmInfo()->getExceptionHandlingType() !=
+ ExceptionHandling::Wasm)
+ return false;
+
+ bool Changed = false;
+ Changed |= addRethrows(MF);
+ if (!MF.getFunction().hasPersonalityFn())
+ return Changed;
+ Changed |= replaceFuncletReturns(MF);
+ Changed |= hoistCatches(MF);
+ Changed |= addCatchAlls(MF);
+ Changed |= ensureSingleBBTermPads(MF);
+ Changed |= mergeTerminatePads(MF);
+ Changed |= addCatchAllTerminatePads(MF);
+ return Changed;
+}
+
+bool WebAssemblyLateEHPrepare::replaceFuncletReturns(MachineFunction &MF) {
+ bool Changed = false;
+ const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ auto *EHInfo = MF.getWasmEHFuncInfo();
+
+ for (auto &MBB : MF) {
+ auto Pos = MBB.getFirstTerminator();
+ if (Pos == MBB.end())
+ continue;
+ MachineInstr *TI = &*Pos;
+
+ switch (TI->getOpcode()) {
+ case WebAssembly::CATCHRET: {
+ // Replace a catchret with a branch
+ MachineBasicBlock *TBB = TI->getOperand(0).getMBB();
+ if (!MBB.isLayoutSuccessor(TBB))
+ BuildMI(MBB, TI, TI->getDebugLoc(), TII.get(WebAssembly::BR))
+ .addMBB(TBB);
+ TI->eraseFromParent();
+ Changed = true;
+ break;
+ }
+ case WebAssembly::CLEANUPRET: {
+ // Replace a cleanupret with a rethrow
+ if (EHInfo->hasThrowUnwindDest(&MBB))
+ BuildMI(MBB, TI, TI->getDebugLoc(), TII.get(WebAssembly::RETHROW))
+ .addMBB(EHInfo->getThrowUnwindDest(&MBB));
+ else
+ BuildMI(MBB, TI, TI->getDebugLoc(),
+ TII.get(WebAssembly::RETHROW_TO_CALLER));
+
+ TI->eraseFromParent();
+ Changed = true;
+ break;
+ }
+ }
+ }
+ return Changed;
+}
+
+// Hoist catch instructions to the beginning of their matching EH pad BBs in
+// the following cases:
+// (1) The catch instruction is not the first instruction in its EH pad.
+// ehpad:
+// some_other_instruction
+// ...
+// %exn = catch 0
+// (2) The catch instruction is in a non-EH-pad BB. For example,
+// ehpad:
+// br bb0
+// bb0:
+// %exn = catch 0
+bool WebAssemblyLateEHPrepare::hoistCatches(MachineFunction &MF) {
+ bool Changed = false;
+ SmallVector<MachineInstr *, 16> Catches;
+ for (auto &MBB : MF)
+ for (auto &MI : MBB)
+ if (WebAssembly::isCatch(MI))
+ Catches.push_back(&MI);
+
+ for (auto *Catch : Catches) {
+ MachineBasicBlock *EHPad = GetMatchingEHPad(Catch);
+ assert(EHPad && "No matching EH pad for catch");
+ if (EHPad->begin() == Catch)
+ continue;
+ Changed = true;
+ EHPad->insert(EHPad->begin(), Catch->removeFromParent());
+ }
+ return Changed;
+}
+
+// Add a catch_all to the beginning of each cleanup pad.
+bool WebAssemblyLateEHPrepare::addCatchAlls(MachineFunction &MF) {
+ bool Changed = false;
+ const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+
+ for (auto &MBB : MF) {
+ if (!MBB.isEHPad())
+ continue;
+    // This runs after hoistCatches(), so we assume that if an EH pad contains
+    // a catch, it is the first instruction in that pad.
+ if (!WebAssembly::isCatch(*MBB.begin())) {
+ Changed = true;
+ BuildMI(MBB, MBB.begin(), MBB.begin()->getDebugLoc(),
+ TII.get(WebAssembly::CATCH_ALL));
+ }
+ }
+ return Changed;
+}
+
+// Add a 'rethrow' instruction after each __cxa_rethrow() call.
+bool WebAssemblyLateEHPrepare::addRethrows(MachineFunction &MF) {
+ bool Changed = false;
+ const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ auto *EHInfo = MF.getWasmEHFuncInfo();
+
+ for (auto &MBB : MF)
+ for (auto &MI : MBB) {
+ // Check if it is a call to __cxa_rethrow()
+ if (!MI.isCall())
+ continue;
+ MachineOperand &CalleeOp = MI.getOperand(0);
+ if (!CalleeOp.isGlobal() ||
+ CalleeOp.getGlobal()->getName() != WebAssembly::CxaRethrowFn)
+ continue;
+
+      // Now we have a __cxa_rethrow() call.
+ Changed = true;
+ auto InsertPt = std::next(MachineBasicBlock::iterator(MI));
+ while (InsertPt != MBB.end() && InsertPt->isLabel()) // Skip EH_LABELs
+ ++InsertPt;
+ MachineInstr *Rethrow = nullptr;
+ if (EHInfo->hasThrowUnwindDest(&MBB))
+ Rethrow = BuildMI(MBB, InsertPt, MI.getDebugLoc(),
+ TII.get(WebAssembly::RETHROW))
+ .addMBB(EHInfo->getThrowUnwindDest(&MBB));
+ else
+ Rethrow = BuildMI(MBB, InsertPt, MI.getDebugLoc(),
+ TII.get(WebAssembly::RETHROW_TO_CALLER));
+
+      // Because __cxa_rethrow does not return, the instruction after the
+ // rethrow should be an unreachable or a branch to another BB that should
+ // eventually lead to an unreachable. Delete it because rethrow itself is
+ // a terminator, and also delete non-EH pad successors if any.
+ MBB.erase(std::next(MachineBasicBlock::iterator(Rethrow)), MBB.end());
+ for (auto *Succ : MBB.successors())
+ if (!Succ->isEHPad())
+ EraseBBAndChildren(Succ);
+ }
+ return Changed;
+}
+
+// Terminate pads are single-BB EH pads of the form
+// termpad:
+// %exn = catch 0
+// call @__clang_call_terminate(%exn)
+// unreachable
+// (There can be set_local and get_locals before the call if we didn't run
+// RegStackify)
+// But code transformations can change or add more control flow, so the call to
+// __clang_call_terminate() function may not be in the original EH pad anymore.
+// This ensures every terminate pad is a single BB in the form illustrated
+// above.
+bool WebAssemblyLateEHPrepare::ensureSingleBBTermPads(MachineFunction &MF) {
+ const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+
+ // Find calls to __clang_call_terminate()
+ SmallVector<MachineInstr *, 8> ClangCallTerminateCalls;
+ for (auto &MBB : MF)
+ for (auto &MI : MBB)
+ if (MI.isCall()) {
+ const MachineOperand &CalleeOp = MI.getOperand(0);
+ if (CalleeOp.isGlobal() && CalleeOp.getGlobal()->getName() ==
+ WebAssembly::ClangCallTerminateFn)
+ ClangCallTerminateCalls.push_back(&MI);
+ }
+
+ bool Changed = false;
+ for (auto *Call : ClangCallTerminateCalls) {
+ MachineBasicBlock *EHPad = GetMatchingEHPad(Call);
+ assert(EHPad && "No matching EH pad for catch");
+
+ // If it is already the form we want, skip it
+ if (Call->getParent() == EHPad &&
+ Call->getNextNode()->getOpcode() == WebAssembly::UNREACHABLE)
+ continue;
+
+ // In case the __clang_call_terminate() call is not in its matching EH pad,
+    // move the call to the end of the EH pad, and add an unreachable
+    // instruction after it. Delete all successors and their children, if any,
+    // because the program terminates here.
+ Changed = true;
+ MachineInstr *Catch = &*EHPad->begin();
+    // This runs after hoistCatches(), so the catch instruction should be at
+    // the top.
+ assert(WebAssembly::isCatch(*Catch));
+    // The call takes the result register of the catch instruction as its
+    // argument. There may have been other set_local/get_locals in between,
+    // but at this point we don't care.
+ Call->getOperand(1).setReg(Catch->getOperand(0).getReg());
+ auto InsertPos = std::next(MachineBasicBlock::iterator(Catch));
+ EHPad->insert(InsertPos, Call->removeFromParent());
+ BuildMI(*EHPad, InsertPos, Call->getDebugLoc(),
+ TII.get(WebAssembly::UNREACHABLE));
+ EHPad->erase(InsertPos, EHPad->end());
+ for (auto *Succ : EHPad->successors())
+ EraseBBAndChildren(Succ);
+ }
+ return Changed;
+}
+
+// In case there are multiple terminate pads, merge them into one for code size.
+// This runs after ensureSingleBBTermPads() and assumes every terminate pad is a
+// single BB.
+// In principle this violates the EH scope relationship, because it can merge
+// multiple inner EH scopes, each of which is in a different outer EH scope.
+// But the getEHScopeMembership() function will not be called after this, so
+// it is fine.
+bool WebAssemblyLateEHPrepare::mergeTerminatePads(MachineFunction &MF) {
+ SmallVector<MachineBasicBlock *, 8> TermPads;
+ for (auto &MBB : MF)
+ if (WebAssembly::isCatchTerminatePad(MBB))
+ TermPads.push_back(&MBB);
+ if (TermPads.empty())
+ return false;
+
+ MachineBasicBlock *UniqueTermPad = TermPads.front();
+ for (auto *TermPad :
+ llvm::make_range(std::next(TermPads.begin()), TermPads.end())) {
+ SmallVector<MachineBasicBlock *, 2> Preds(TermPad->pred_begin(),
+ TermPad->pred_end());
+ for (auto *Pred : Preds)
+ Pred->replaceSuccessor(TermPad, UniqueTermPad);
+ TermPad->eraseFromParent();
+ }
+ return true;
+}
+
+// Terminate pads are cleanup pads, so they should start with a 'catch_all'
+// instruction. But in the Itanium model, when we have a C++ exception object,
+// we pass it to the __clang_call_terminate function, which calls
+// __cxa_end_catch with the passed exception pointer and then calls
+// std::terminate. This is why terminate pads are generated with a catch
+// instruction rather than a catch_all in clang and earlier llvm passes. Here
+// we append a terminate pad with a catch_all after each existing terminate
+// pad so we can also catch foreign exceptions. For every terminate pad:
+// %exn = catch 0
+// call @__clang_call_terminate(%exn)
+// unreachable
+// We append this BB right after that:
+// catch_all
+// call @std::terminate()
+// unreachable
+bool WebAssemblyLateEHPrepare::addCatchAllTerminatePads(MachineFunction &MF) {
+ const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ SmallVector<MachineBasicBlock *, 8> TermPads;
+ for (auto &MBB : MF)
+ if (WebAssembly::isCatchTerminatePad(MBB))
+ TermPads.push_back(&MBB);
+ if (TermPads.empty())
+ return false;
+
+ Function *StdTerminateFn =
+ MF.getFunction().getParent()->getFunction(WebAssembly::StdTerminateFn);
+ assert(StdTerminateFn && "There is no std::terminate() function");
+ for (auto *CatchTermPad : TermPads) {
+ DebugLoc DL = CatchTermPad->findDebugLoc(CatchTermPad->begin());
+ auto *CatchAllTermPad = MF.CreateMachineBasicBlock();
+ MF.insert(std::next(MachineFunction::iterator(CatchTermPad)),
+ CatchAllTermPad);
+ CatchAllTermPad->setIsEHPad();
+ BuildMI(CatchAllTermPad, DL, TII.get(WebAssembly::CATCH_ALL));
+ BuildMI(CatchAllTermPad, DL, TII.get(WebAssembly::CALL_VOID))
+ .addGlobalAddress(StdTerminateFn);
+ BuildMI(CatchAllTermPad, DL, TII.get(WebAssembly::UNREACHABLE));
+
+    // Actually this CatchAllTermPad (the new terminate pad with a catch_all)
+    // is not a successor of the existing terminate pad; instead, it should
+    // have all the predecessors that CatchTermPad has. This is a hack to force
+    // CatchAllTermPad to always be sorted right after CatchTermPad; the
+    // correct predecessor-successor relationships will be restored in the
+    // CFGStackify pass.
+ CatchTermPad->addSuccessor(CatchAllTermPad);
+ }
+ return true;
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp b/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
index 5b867aa763a1..5fb97e38939a 100644
--- a/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file lowers br_unless into br_if with an inverted condition.
+/// This file lowers br_unless into br_if with an inverted condition.
///
/// br_unless is not currently in the spec, but it's very convenient for LLVM
/// to use. This pass allows LLVM to use it, for now.
@@ -47,14 +47,17 @@ public:
} // end anonymous namespace
char WebAssemblyLowerBrUnless::ID = 0;
+INITIALIZE_PASS(WebAssemblyLowerBrUnless, DEBUG_TYPE,
+ "Lowers br_unless into inverted br_if", false, false)
+
FunctionPass *llvm::createWebAssemblyLowerBrUnless() {
return new WebAssemblyLowerBrUnless();
}
bool WebAssemblyLowerBrUnless::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "********** Lowering br_unless **********\n"
- "********** Function: "
- << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** Lowering br_unless **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
auto &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
diff --git a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
index f0b6a3e35dba..e9cb7c10113b 100644
--- a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file lowers exception-related instructions and setjmp/longjmp
+/// This file lowers exception-related instructions and setjmp/longjmp
/// function calls in order to use Emscripten's JavaScript try and catch
/// mechanism.
///
@@ -225,13 +225,8 @@ static cl::list<std::string>
namespace {
class WebAssemblyLowerEmscriptenEHSjLj final : public ModulePass {
- static const char *ThrewGVName;
- static const char *ThrewValueGVName;
- static const char *TempRet0GVName;
static const char *ResumeFName;
static const char *EHTypeIDFName;
- static const char *SetThrewFName;
- static const char *SetTempRet0FName;
static const char *EmLongjmpFName;
static const char *EmLongjmpJmpbufFName;
static const char *SaveSetjmpFName;
@@ -300,14 +295,9 @@ public:
};
} // End anonymous namespace
-const char *WebAssemblyLowerEmscriptenEHSjLj::ThrewGVName = "__THREW__";
-const char *WebAssemblyLowerEmscriptenEHSjLj::ThrewValueGVName = "__threwValue";
-const char *WebAssemblyLowerEmscriptenEHSjLj::TempRet0GVName = "__tempRet0";
const char *WebAssemblyLowerEmscriptenEHSjLj::ResumeFName = "__resumeException";
const char *WebAssemblyLowerEmscriptenEHSjLj::EHTypeIDFName =
"llvm_eh_typeid_for";
-const char *WebAssemblyLowerEmscriptenEHSjLj::SetThrewFName = "setThrew";
-const char *WebAssemblyLowerEmscriptenEHSjLj::SetTempRet0FName = "setTempRet0";
const char *WebAssemblyLowerEmscriptenEHSjLj::EmLongjmpFName =
"emscripten_longjmp";
const char *WebAssemblyLowerEmscriptenEHSjLj::EmLongjmpJmpbufFName =
@@ -343,15 +333,13 @@ static bool canThrow(const Value *V) {
return true;
}
-// Returns an available name for a global value.
-// If the proposed name already exists in the module, adds '_' at the end of
-// the name until the name is available.
-static inline std::string createGlobalValueName(const Module &M,
- const std::string &Propose) {
- std::string Name = Propose;
- while (M.getNamedGlobal(Name))
- Name += "_";
- return Name;
+static GlobalVariable *createGlobalVariableI32(Module &M, IRBuilder<> &IRB,
+ const char *Name) {
+ if (M.getNamedGlobal(Name))
+ report_fatal_error(Twine("variable name is reserved: ") + Name);
+
+ return new GlobalVariable(M, IRB.getInt32Ty(), false,
+ GlobalValue::WeakODRLinkage, IRB.getInt32(0), Name);
}
// Simple function name mangler.
@@ -613,11 +601,13 @@ void WebAssemblyLowerEmscriptenEHSjLj::createSetThrewFunction(Module &M) {
LLVMContext &C = M.getContext();
IRBuilder<> IRB(C);
- assert(!M.getNamedGlobal(SetThrewFName) && "setThrew already exists");
+ if (M.getNamedGlobal("setThrew"))
+ report_fatal_error("setThrew already exists");
+
Type *Params[] = {IRB.getInt32Ty(), IRB.getInt32Ty()};
FunctionType *FTy = FunctionType::get(IRB.getVoidTy(), Params, false);
Function *F =
- Function::Create(FTy, GlobalValue::ExternalLinkage, SetThrewFName, &M);
+ Function::Create(FTy, GlobalValue::WeakODRLinkage, "setThrew", &M);
Argument *Arg1 = &*(F->arg_begin());
Argument *Arg2 = &*std::next(F->arg_begin());
Arg1->setName("threw");
@@ -648,11 +638,12 @@ void WebAssemblyLowerEmscriptenEHSjLj::createSetTempRet0Function(Module &M) {
LLVMContext &C = M.getContext();
IRBuilder<> IRB(C);
- assert(!M.getNamedGlobal(SetTempRet0FName) && "setTempRet0 already exists");
+ if (M.getNamedGlobal("setTempRet0"))
+ report_fatal_error("setTempRet0 already exists");
Type *Params[] = {IRB.getInt32Ty()};
FunctionType *FTy = FunctionType::get(IRB.getVoidTy(), Params, false);
Function *F =
- Function::Create(FTy, GlobalValue::ExternalLinkage, SetTempRet0FName, &M);
+ Function::Create(FTy, GlobalValue::WeakODRLinkage, "setTempRet0", &M);
F->arg_begin()->setName("value");
BasicBlock *EntryBB = BasicBlock::Create(C, "entry", F);
IRB.SetInsertPoint(EntryBB);
@@ -699,15 +690,9 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) {
// Create global variables __THREW__, threwValue, and __tempRet0, which are
// used in common for both exception handling and setjmp/longjmp handling
- ThrewGV = new GlobalVariable(M, IRB.getInt32Ty(), false,
- GlobalValue::ExternalLinkage, IRB.getInt32(0),
- createGlobalValueName(M, ThrewGVName));
- ThrewValueGV = new GlobalVariable(
- M, IRB.getInt32Ty(), false, GlobalValue::ExternalLinkage, IRB.getInt32(0),
- createGlobalValueName(M, ThrewValueGVName));
- TempRet0GV = new GlobalVariable(M, IRB.getInt32Ty(), false,
- GlobalValue::ExternalLinkage, IRB.getInt32(0),
- createGlobalValueName(M, TempRet0GVName));
+ ThrewGV = createGlobalVariableI32(M, IRB, "__THREW__");
+ ThrewValueGV = createGlobalVariableI32(M, IRB, "__threwValue");
+ TempRet0GV = createGlobalVariableI32(M, IRB, "__tempRet0");
bool Changed = false;
@@ -736,12 +721,6 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) {
if (DoSjLj) {
Changed = true; // We have setjmp or longjmp somewhere
- Function *MallocF = M.getFunction("malloc");
- Function *FreeF = M.getFunction("free");
- if (!MallocF || !FreeF)
- report_fatal_error(
- "malloc and free must be linked into the module if setjmp is used");
-
// Register saveSetjmp function
FunctionType *SetjmpFTy = SetjmpF->getFunctionType();
SmallVector<Type *, 4> Params = {SetjmpFTy->getParamType(0),
diff --git a/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp b/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp
index 0020817aee41..ee708d637b25 100644
--- a/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief Lower @llvm.global_dtors.
+/// Lower @llvm.global_dtors.
///
/// WebAssembly doesn't have a builtin way to invoke static destructors.
/// Implement @llvm.global_dtors by creating wrapper functions that are
@@ -51,6 +51,9 @@ public:
} // End anonymous namespace
char LowerGlobalDtors::ID = 0;
+INITIALIZE_PASS(LowerGlobalDtors, DEBUG_TYPE,
+ "Lower @llvm.global_dtors for WebAssembly", false, false)
+
ModulePass *llvm::createWebAssemblyLowerGlobalDtors() {
return new LowerGlobalDtors();
}
diff --git a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
index 4a93d4810c7d..d85db14fc679 100644
--- a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains code to lower WebAssembly MachineInstrs to their
+/// This file contains code to lower WebAssembly MachineInstrs to their
/// corresponding MCInst records.
///
//===----------------------------------------------------------------------===//
@@ -25,7 +25,6 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCSymbolWasm.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
@@ -34,11 +33,7 @@ using namespace llvm;
MCSymbol *
WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
const GlobalValue *Global = MO.getGlobal();
- MCSymbol *Sym = Printer.getSymbol(Global);
- if (isa<MCSymbolELF>(Sym))
- return Sym;
-
- MCSymbolWasm *WasmSym = cast<MCSymbolWasm>(Sym);
+ MCSymbolWasm *WasmSym = cast<MCSymbolWasm>(Printer.getSymbol(Global));
if (const auto *FuncTy = dyn_cast<FunctionType>(Global->getValueType())) {
const MachineFunction &MF = *MO.getParent()->getParent()->getParent();
@@ -74,7 +69,7 @@ WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
WasmSym->setReturns(std::move(Returns));
WasmSym->setParams(std::move(Params));
- WasmSym->setIsFunction(true);
+ WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
}
return WasmSym;
@@ -83,17 +78,22 @@ WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol(
const MachineOperand &MO) const {
const char *Name = MO.getSymbolName();
- MCSymbol *Sym = Printer.GetExternalSymbolSymbol(Name);
- if (isa<MCSymbolELF>(Sym))
- return Sym;
-
- MCSymbolWasm *WasmSym = cast<MCSymbolWasm>(Sym);
+ MCSymbolWasm *WasmSym =
+ cast<MCSymbolWasm>(Printer.GetExternalSymbolSymbol(Name));
const WebAssemblySubtarget &Subtarget = Printer.getSubtarget();
// __stack_pointer is a global variable; all other external symbols used by
- // CodeGen are functions.
- if (strcmp(Name, "__stack_pointer") == 0)
+ // CodeGen are functions. It's OK to hardcode knowledge of specific symbols
+ // here; this method is precisely there for fetching the signatures of known
+ // Clang-provided symbols.
+ if (strcmp(Name, "__stack_pointer") == 0) {
+ WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL);
+ WasmSym->setGlobalType(wasm::WasmGlobalType{
+ uint8_t(Subtarget.hasAddr64() ? wasm::WASM_TYPE_I64
+ : wasm::WASM_TYPE_I32),
+ true});
return WasmSym;
+ }
SmallVector<wasm::ValType, 4> Returns;
SmallVector<wasm::ValType, 4> Params;
@@ -101,7 +101,7 @@ MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol(
WasmSym->setReturns(std::move(Returns));
WasmSym->setParams(std::move(Params));
- WasmSym->setIsFunction(true);
+ WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
return WasmSym;
}
@@ -169,35 +169,32 @@ void WebAssemblyMCInstLower::Lower(const MachineInstr *MI,
const MCOperandInfo &Info = Desc.OpInfo[i];
if (Info.OperandType == WebAssembly::OPERAND_TYPEINDEX) {
MCSymbol *Sym = Printer.createTempSymbol("typeindex");
- if (!isa<MCSymbolELF>(Sym)) {
- SmallVector<wasm::ValType, 4> Returns;
- SmallVector<wasm::ValType, 4> Params;
-
- const MachineRegisterInfo &MRI =
- MI->getParent()->getParent()->getRegInfo();
- for (const MachineOperand &MO : MI->defs())
- Returns.push_back(getType(MRI.getRegClass(MO.getReg())));
- for (const MachineOperand &MO : MI->explicit_uses())
- if (MO.isReg())
- Params.push_back(getType(MRI.getRegClass(MO.getReg())));
-
- // call_indirect instructions have a callee operand at the end which
- // doesn't count as a param.
- if (WebAssembly::isCallIndirect(*MI))
- Params.pop_back();
-
- MCSymbolWasm *WasmSym = cast<MCSymbolWasm>(Sym);
- WasmSym->setReturns(std::move(Returns));
- WasmSym->setParams(std::move(Params));
- WasmSym->setIsFunction(true);
-
- const MCExpr *Expr =
- MCSymbolRefExpr::create(WasmSym,
- MCSymbolRefExpr::VK_WebAssembly_TYPEINDEX,
- Ctx);
- MCOp = MCOperand::createExpr(Expr);
- break;
- }
+
+ SmallVector<wasm::ValType, 4> Returns;
+ SmallVector<wasm::ValType, 4> Params;
+
+ const MachineRegisterInfo &MRI =
+ MI->getParent()->getParent()->getRegInfo();
+ for (const MachineOperand &MO : MI->defs())
+ Returns.push_back(getType(MRI.getRegClass(MO.getReg())));
+ for (const MachineOperand &MO : MI->explicit_uses())
+ if (MO.isReg())
+ Params.push_back(getType(MRI.getRegClass(MO.getReg())));
+
+ // call_indirect instructions have a callee operand at the end which
+ // doesn't count as a param.
+ if (WebAssembly::isCallIndirect(*MI))
+ Params.pop_back();
+
+ MCSymbolWasm *WasmSym = cast<MCSymbolWasm>(Sym);
+ WasmSym->setReturns(std::move(Returns));
+ WasmSym->setParams(std::move(Params));
+ WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
+
+ const MCExpr *Expr = MCSymbolRefExpr::create(
+ WasmSym, MCSymbolRefExpr::VK_WebAssembly_TYPEINDEX, Ctx);
+ MCOp = MCOperand::createExpr(Expr);
+ break;
}
}
MCOp = MCOperand::createImm(MO.getImm());
diff --git a/lib/Target/WebAssembly/WebAssemblyMCInstLower.h b/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
index d1d2794c3b8f..41b4313bb38c 100644
--- a/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
+++ b/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file declares the class to lower WebAssembly MachineInstrs to
+/// This file declares the class to lower WebAssembly MachineInstrs to
/// their corresponding MCInst records.
///
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
index ccf6a18b32ea..e511e574050f 100644
--- a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements WebAssembly-specific per-machine-function
+/// This file implements WebAssembly-specific per-machine-function
/// information.
///
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
index 1fcbb7791d4e..a60b10fc5309 100644
--- a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file declares WebAssembly-specific per-machine-function
+/// This file declares WebAssembly-specific per-machine-function
/// information.
///
//===----------------------------------------------------------------------===//
@@ -60,6 +60,8 @@ class WebAssemblyFunctionInfo final : public MachineFunctionInfo {
void addResult(MVT VT) { Results.push_back(VT); }
const std::vector<MVT> &getResults() const { return Results; }
+ void clearParamsAndResults() { Params.clear(); Results.clear(); }
+
void setNumLocals(size_t NumLocals) { Locals.resize(NumLocals, MVT::i32); }
void setLocal(size_t i, MVT VT) { Locals[i] = VT; }
void addLocal(MVT VT) { Locals.push_back(VT); }
@@ -81,25 +83,29 @@ class WebAssemblyFunctionInfo final : public MachineFunctionInfo {
void stackifyVReg(unsigned VReg) {
assert(MF.getRegInfo().getUniqueVRegDef(VReg));
- if (TargetRegisterInfo::virtReg2Index(VReg) >= VRegStackified.size())
- VRegStackified.resize(TargetRegisterInfo::virtReg2Index(VReg) + 1);
- VRegStackified.set(TargetRegisterInfo::virtReg2Index(VReg));
+ auto I = TargetRegisterInfo::virtReg2Index(VReg);
+ if (I >= VRegStackified.size())
+ VRegStackified.resize(I + 1);
+ VRegStackified.set(I);
}
bool isVRegStackified(unsigned VReg) const {
- if (TargetRegisterInfo::virtReg2Index(VReg) >= VRegStackified.size())
+ auto I = TargetRegisterInfo::virtReg2Index(VReg);
+ if (I >= VRegStackified.size())
return false;
- return VRegStackified.test(TargetRegisterInfo::virtReg2Index(VReg));
+ return VRegStackified.test(I);
}
void initWARegs();
void setWAReg(unsigned VReg, unsigned WAReg) {
assert(WAReg != UnusedReg);
- assert(TargetRegisterInfo::virtReg2Index(VReg) < WARegs.size());
- WARegs[TargetRegisterInfo::virtReg2Index(VReg)] = WAReg;
+ auto I = TargetRegisterInfo::virtReg2Index(VReg);
+ assert(I < WARegs.size());
+ WARegs[I] = WAReg;
}
- unsigned getWAReg(unsigned Reg) const {
- assert(TargetRegisterInfo::virtReg2Index(Reg) < WARegs.size());
- return WARegs[TargetRegisterInfo::virtReg2Index(Reg)];
+ unsigned getWAReg(unsigned VReg) const {
+ auto I = TargetRegisterInfo::virtReg2Index(VReg);
+ assert(I < WARegs.size());
+ return WARegs[I];
}
// For a given stackified WAReg, return the id number to print with push/pop.
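
A self-contained sketch of the grow-on-demand pattern used by stackifyVReg() and isVRegStackified() above, with std::vector<bool> standing in for llvm::BitVector and the caller responsible for passing the index obtained from TargetRegisterInfo::virtReg2Index(VReg):

    #include <vector>

    // Sketch only: a flag set keyed by virtual-register index that grows on
    // first use, mirroring stackifyVReg()/isVRegStackified().
    class StackifiedFlags {
      std::vector<bool> Bits; // stands in for llvm::BitVector
    public:
      void set(unsigned Index) {
        if (Index >= Bits.size())
          Bits.resize(Index + 1); // new entries default to false
        Bits[Index] = true;
      }
      bool test(unsigned Index) const {
        return Index < Bits.size() && Bits[Index];
      }
    };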
diff --git a/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp b/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
index ebe97848d461..04ac22a589ea 100644
--- a/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
@@ -8,11 +8,11 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief Optimize LiveIntervals for use in a post-RA context.
+/// Optimize LiveIntervals for use in a post-RA context.
//
/// LiveIntervals normally runs before register allocation when the code is
/// only recently lowered out of SSA form, so it's uncommon for registers to
-/// have multiple defs, and then they do, the defs are usually closely related.
+/// have multiple defs, and when they do, the defs are usually closely related.
/// Later, after coalescing, tail duplication, and other optimizations, it's
/// more common to see registers with multiple unrelated defs. This pass
/// updates LiveIntervals to distribute the value numbers across separate
@@ -58,14 +58,17 @@ public:
} // end anonymous namespace
char WebAssemblyOptimizeLiveIntervals::ID = 0;
+INITIALIZE_PASS(WebAssemblyOptimizeLiveIntervals, DEBUG_TYPE,
+ "Optimize LiveIntervals for WebAssembly", false, false)
+
FunctionPass *llvm::createWebAssemblyOptimizeLiveIntervals() {
return new WebAssemblyOptimizeLiveIntervals();
}
bool WebAssemblyOptimizeLiveIntervals::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "********** Optimize LiveIntervals **********\n"
- "********** Function: "
- << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** Optimize LiveIntervals **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
MachineRegisterInfo &MRI = MF.getRegInfo();
LiveIntervals &LIS = getAnalysis<LiveIntervals>();
diff --git a/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp b/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
index 559165e4c86b..113ee2532bce 100644
--- a/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief Optimize calls with "returned" attributes for WebAssembly.
+/// Optimize calls with "returned" attributes for WebAssembly.
///
//===----------------------------------------------------------------------===//
@@ -48,6 +48,10 @@ public:
} // End anonymous namespace
char OptimizeReturned::ID = 0;
+INITIALIZE_PASS(OptimizeReturned, DEBUG_TYPE,
+ "Optimize calls with \"returned\" attributes for WebAssembly",
+ false, false)
+
FunctionPass *llvm::createWebAssemblyOptimizeReturned() {
return new OptimizeReturned();
}
diff --git a/lib/Target/WebAssembly/WebAssemblyPeephole.cpp b/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
index d2fbc5a22308..a54484407805 100644
--- a/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief Late peephole optimizations for WebAssembly.
+/// Late peephole optimizations for WebAssembly.
///
//===----------------------------------------------------------------------===//
@@ -50,6 +50,9 @@ public:
} // end anonymous namespace
char WebAssemblyPeephole::ID = 0;
+INITIALIZE_PASS(WebAssemblyPeephole, DEBUG_TYPE,
+ "WebAssembly peephole optimizations", false, false)
+
FunctionPass *llvm::createWebAssemblyPeephole() {
return new WebAssemblyPeephole();
}
@@ -80,18 +83,13 @@ static bool MaybeRewriteToFallthrough(MachineInstr &MI, MachineBasicBlock &MBB,
return false;
if (&MBB != &MF.back())
return false;
- if (MF.getSubtarget<WebAssemblySubtarget>()
- .getTargetTriple().isOSBinFormatELF()) {
- if (&MI != &MBB.back())
- return false;
- } else {
- MachineBasicBlock::iterator End = MBB.end();
- --End;
- assert(End->getOpcode() == WebAssembly::END_FUNCTION);
- --End;
- if (&MI != &*End)
- return false;
- }
+
+ MachineBasicBlock::iterator End = MBB.end();
+ --End;
+ assert(End->getOpcode() == WebAssembly::END_FUNCTION);
+ --End;
+ if (&MI != &*End)
+ return false;
if (FallthroughOpc != WebAssembly::FALLTHROUGH_RETURN_VOID) {
// If the operand isn't stackified, insert a COPY to read the operand and
@@ -113,7 +111,7 @@ static bool MaybeRewriteToFallthrough(MachineInstr &MI, MachineBasicBlock &MBB,
}
bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "********** Peephole **********\n"
<< "********** Function: " << MF.getName() << '\n';
});
diff --git a/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp b/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
index 3a2876bfcde2..e44e7057e233 100644
--- a/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief Fix up code to meet LiveInterval's requirements.
+/// Fix up code to meet LiveInterval's requirements.
///
/// Some CodeGen passes don't preserve LiveInterval's requirements, because
/// they run after register allocation and it isn't important. However,
@@ -55,6 +55,9 @@ private:
} // end anonymous namespace
char WebAssemblyPrepareForLiveIntervals::ID = 0;
+INITIALIZE_PASS(WebAssemblyPrepareForLiveIntervals, DEBUG_TYPE,
+ "Fix up code for LiveIntervals", false, false)
+
FunctionPass *llvm::createWebAssemblyPrepareForLiveIntervals() {
return new WebAssemblyPrepareForLiveIntervals();
}
@@ -68,7 +71,7 @@ static bool HasArgumentDef(unsigned Reg, const MachineRegisterInfo &MRI) {
}
bool WebAssemblyPrepareForLiveIntervals::runOnMachineFunction(MachineFunction &MF) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "********** Prepare For LiveIntervals **********\n"
<< "********** Function: " << MF.getName() << '\n';
});
diff --git a/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp b/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
index 2ac3a839c3c8..d69a27937105 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements a virtual register coloring pass.
+/// This file implements a virtual register coloring pass.
///
/// WebAssembly doesn't have a fixed number of registers, but it is still
/// desirable to minimize the total number of registers used in each function.
@@ -55,6 +55,9 @@ private:
} // end anonymous namespace
char WebAssemblyRegColoring::ID = 0;
+INITIALIZE_PASS(WebAssemblyRegColoring, DEBUG_TYPE,
+ "Minimize number of registers used", false, false)
+
FunctionPass *llvm::createWebAssemblyRegColoring() {
return new WebAssemblyRegColoring();
}
@@ -71,7 +74,7 @@ static float computeWeight(const MachineRegisterInfo *MRI,
}
bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "********** Register Coloring **********\n"
<< "********** Function: " << MF.getName() << '\n';
});
@@ -94,7 +97,7 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
SmallVector<LiveInterval *, 0> SortedIntervals;
SortedIntervals.reserve(NumVRegs);
- DEBUG(dbgs() << "Interesting register intervals:\n");
+ LLVM_DEBUG(dbgs() << "Interesting register intervals:\n");
for (unsigned i = 0; i < NumVRegs; ++i) {
unsigned VReg = TargetRegisterInfo::index2VirtReg(i);
if (MFI.isVRegStackified(VReg))
@@ -106,27 +109,27 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
LiveInterval *LI = &Liveness->getInterval(VReg);
assert(LI->weight == 0.0f);
LI->weight = computeWeight(MRI, MBFI, VReg);
- DEBUG(LI->dump());
+ LLVM_DEBUG(LI->dump());
SortedIntervals.push_back(LI);
}
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
// Sort them to put arguments first (since we don't want to rename live-in
// registers), by weight next, and then by position.
// TODO: Investigate more intelligent sorting heuristics. For starters, we
// should try to coalesce adjacent live intervals before non-adjacent ones.
- std::sort(SortedIntervals.begin(), SortedIntervals.end(),
- [MRI](LiveInterval *LHS, LiveInterval *RHS) {
- if (MRI->isLiveIn(LHS->reg) != MRI->isLiveIn(RHS->reg))
- return MRI->isLiveIn(LHS->reg);
- if (LHS->weight != RHS->weight)
- return LHS->weight > RHS->weight;
- if (LHS->empty() || RHS->empty())
- return !LHS->empty() && RHS->empty();
- return *LHS < *RHS;
- });
-
- DEBUG(dbgs() << "Coloring register intervals:\n");
+ llvm::sort(SortedIntervals.begin(), SortedIntervals.end(),
+ [MRI](LiveInterval *LHS, LiveInterval *RHS) {
+ if (MRI->isLiveIn(LHS->reg) != MRI->isLiveIn(RHS->reg))
+ return MRI->isLiveIn(LHS->reg);
+ if (LHS->weight != RHS->weight)
+ return LHS->weight > RHS->weight;
+ if (LHS->empty() || RHS->empty())
+ return !LHS->empty() && RHS->empty();
+ return *LHS < *RHS;
+ });
+
+ LLVM_DEBUG(dbgs() << "Coloring register intervals:\n");
SmallVector<unsigned, 16> SlotMapping(SortedIntervals.size(), -1u);
SmallVector<SmallVector<LiveInterval *, 4>, 16> Assignments(
SortedIntervals.size());
@@ -156,9 +159,9 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
Changed |= Old != New;
UsedColors.set(Color);
Assignments[Color].push_back(LI);
- DEBUG(dbgs() << "Assigning vreg"
- << TargetRegisterInfo::virtReg2Index(LI->reg) << " to vreg"
- << TargetRegisterInfo::virtReg2Index(New) << "\n");
+ LLVM_DEBUG(
+ dbgs() << "Assigning vreg" << TargetRegisterInfo::virtReg2Index(LI->reg)
+ << " to vreg" << TargetRegisterInfo::virtReg2Index(New) << "\n");
}
if (!Changed)
return false;
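
The comparator above only orders the intervals; the loop that follows (UsedColors, Assignments) then assigns colors so that no two overlapping intervals share one. A standalone greedy sketch of that idea, with intervals reduced to hypothetical [Start, End) pairs instead of real LiveIntervals:

    #include <algorithm>
    #include <cstddef>
    #include <utility>
    #include <vector>

    // Sketch only: greedy coloring of already-sorted intervals. Each interval
    // gets the first color whose previously assigned intervals it doesn't overlap.
    using Interval = std::pair<unsigned, unsigned>; // [Start, End)

    static bool overlaps(const Interval &A, const Interval &B) {
      return A.first < B.second && B.first < A.second;
    }

    std::vector<std::size_t> colorIntervals(const std::vector<Interval> &Sorted) {
      std::vector<std::vector<Interval>> Assignments; // intervals sharing a color
      std::vector<std::size_t> Colors(Sorted.size());
      for (std::size_t I = 0; I != Sorted.size(); ++I) {
        std::size_t Color = 0;
        for (; Color != Assignments.size(); ++Color)
          if (std::none_of(Assignments[Color].begin(), Assignments[Color].end(),
                           [&](const Interval &Other) {
                             return overlaps(Sorted[I], Other);
                           }))
            break;
        if (Color == Assignments.size())
          Assignments.emplace_back();   // open a fresh color
        Assignments[Color].push_back(Sorted[I]);
        Colors[I] = Color;
      }
      return Colors;
    }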
diff --git a/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp b/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
index 766ab456a8e6..1e2a248f097e 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements a pass which assigns WebAssembly register
+/// This file implements a pass which assigns WebAssembly register
/// numbers for CodeGen virtual registers.
///
//===----------------------------------------------------------------------===//
@@ -51,14 +51,18 @@ public:
} // end anonymous namespace
char WebAssemblyRegNumbering::ID = 0;
+INITIALIZE_PASS(WebAssemblyRegNumbering, DEBUG_TYPE,
+ "Assigns WebAssembly register numbers for virtual registers",
+ false, false)
+
FunctionPass *llvm::createWebAssemblyRegNumbering() {
return new WebAssemblyRegNumbering();
}
bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "********** Register Numbering **********\n"
- "********** Function: "
- << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** Register Numbering **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -73,8 +77,8 @@ bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) {
break;
int64_t Imm = MI.getOperand(1).getImm();
- DEBUG(dbgs() << "Arg VReg " << MI.getOperand(0).getReg() << " -> WAReg "
- << Imm << "\n");
+ LLVM_DEBUG(dbgs() << "Arg VReg " << MI.getOperand(0).getReg()
+ << " -> WAReg " << Imm << "\n");
MFI.setWAReg(MI.getOperand(0).getReg(), Imm);
}
@@ -92,13 +96,13 @@ bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) {
continue;
// Handle stackified registers.
if (MFI.isVRegStackified(VReg)) {
- DEBUG(dbgs() << "VReg " << VReg << " -> WAReg "
- << (INT32_MIN | NumStackRegs) << "\n");
+ LLVM_DEBUG(dbgs() << "VReg " << VReg << " -> WAReg "
+ << (INT32_MIN | NumStackRegs) << "\n");
MFI.setWAReg(VReg, INT32_MIN | NumStackRegs++);
continue;
}
if (MFI.getWAReg(VReg) == WebAssemblyFunctionInfo::UnusedReg) {
- DEBUG(dbgs() << "VReg " << VReg << " -> WAReg " << CurReg << "\n");
+ LLVM_DEBUG(dbgs() << "VReg " << VReg << " -> WAReg " << CurReg << "\n");
MFI.setWAReg(VReg, CurReg++);
}
}
diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index a4bb967f36f6..9f5d5bd87831 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements a register stacking pass.
+/// This file implements a register stacking pass.
///
/// This pass reorders instructions to put register uses and defs in an order
/// such that they form single-use expression trees. Registers fitting this form
@@ -67,6 +67,10 @@ public:
} // end anonymous namespace
char WebAssemblyRegStackify::ID = 0;
+INITIALIZE_PASS(WebAssemblyRegStackify, DEBUG_TYPE,
+ "Reorder instructions to use the WebAssembly value stack",
+ false, false)
+
FunctionPass *llvm::createWebAssemblyRegStackify() {
return new WebAssemblyRegStackify();
}
@@ -156,10 +160,9 @@ static void QueryCallee(const MachineInstr &MI, unsigned CalleeOpNo, bool &Read,
// and/or uses the stack pointer value.
static void Query(const MachineInstr &MI, AliasAnalysis &AA, bool &Read,
bool &Write, bool &Effects, bool &StackPointer) {
- assert(!MI.isPosition());
assert(!MI.isTerminator());
- if (MI.isDebugValue())
+ if (MI.isDebugInstr() || MI.isPosition())
return;
// Check for loads.
@@ -469,7 +472,7 @@ static MachineInstr *MoveForSingleUse(unsigned Reg, MachineOperand& Op,
MachineInstr *Insert, LiveIntervals &LIS,
WebAssemblyFunctionInfo &MFI,
MachineRegisterInfo &MRI) {
- DEBUG(dbgs() << "Move for single use: "; Def->dump());
+ LLVM_DEBUG(dbgs() << "Move for single use: "; Def->dump());
MBB.splice(Insert, &MBB, Def);
LIS.handleMove(*Def);
@@ -496,7 +499,7 @@ static MachineInstr *MoveForSingleUse(unsigned Reg, MachineOperand& Op,
MFI.stackifyVReg(NewReg);
- DEBUG(dbgs() << " - Replaced register: "; Def->dump());
+ LLVM_DEBUG(dbgs() << " - Replaced register: "; Def->dump());
}
ImposeStackOrdering(Def);
@@ -510,8 +513,8 @@ static MachineInstr *RematerializeCheapDef(
MachineBasicBlock::instr_iterator Insert, LiveIntervals &LIS,
WebAssemblyFunctionInfo &MFI, MachineRegisterInfo &MRI,
const WebAssemblyInstrInfo *TII, const WebAssemblyRegisterInfo *TRI) {
- DEBUG(dbgs() << "Rematerializing cheap def: "; Def.dump());
- DEBUG(dbgs() << " - for use in "; Op.getParent()->dump());
+ LLVM_DEBUG(dbgs() << "Rematerializing cheap def: "; Def.dump());
+ LLVM_DEBUG(dbgs() << " - for use in "; Op.getParent()->dump());
unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
TII->reMaterialize(MBB, Insert, NewReg, 0, Def, *TRI);
@@ -522,7 +525,7 @@ static MachineInstr *RematerializeCheapDef(
MFI.stackifyVReg(NewReg);
ImposeStackOrdering(Clone);
- DEBUG(dbgs() << " - Cloned to "; Clone->dump());
+ LLVM_DEBUG(dbgs() << " - Cloned to "; Clone->dump());
// Shrink the interval.
bool IsDead = MRI.use_empty(Reg);
@@ -534,7 +537,7 @@ static MachineInstr *RematerializeCheapDef(
// If that was the last use of the original, delete the original.
if (IsDead) {
- DEBUG(dbgs() << " - Deleting original\n");
+ LLVM_DEBUG(dbgs() << " - Deleting original\n");
SlotIndex Idx = LIS.getInstructionIndex(Def).getRegSlot();
LIS.removePhysRegDefAt(WebAssembly::ARGUMENTS, Idx);
LIS.removeInterval(Reg);
@@ -569,7 +572,7 @@ static MachineInstr *MoveAndTeeForMultiUse(
unsigned Reg, MachineOperand &Op, MachineInstr *Def, MachineBasicBlock &MBB,
MachineInstr *Insert, LiveIntervals &LIS, WebAssemblyFunctionInfo &MFI,
MachineRegisterInfo &MRI, const WebAssemblyInstrInfo *TII) {
- DEBUG(dbgs() << "Move and tee for multi-use:"; Def->dump());
+ LLVM_DEBUG(dbgs() << "Move and tee for multi-use:"; Def->dump());
// Move Def into place.
MBB.splice(Insert, &MBB, Def);
@@ -605,8 +608,8 @@ static MachineInstr *MoveAndTeeForMultiUse(
ImposeStackOrdering(Def);
ImposeStackOrdering(Tee);
- DEBUG(dbgs() << " - Replaced register: "; Def->dump());
- DEBUG(dbgs() << " - Tee instruction: "; Tee->dump());
+ LLVM_DEBUG(dbgs() << " - Replaced register: "; Def->dump());
+ LLVM_DEBUG(dbgs() << " - Tee instruction: "; Tee->dump());
return Def;
}
@@ -733,9 +736,9 @@ public:
} // end anonymous namespace
bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "********** Register Stackifying **********\n"
- "********** Function: "
- << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** Register Stackifying **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
bool Changed = false;
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -746,14 +749,6 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
LiveIntervals &LIS = getAnalysis<LiveIntervals>();
- // Disable the TEE optimization if we aren't doing direct wasm object
- // emission, because lowering TEE to TEE_LOCAL is done in the ExplicitLocals
- // pass, which is also disabled.
- bool UseTee = true;
- if (MF.getSubtarget<WebAssemblySubtarget>()
- .getTargetTriple().isOSBinFormatELF())
- UseTee = false;
-
// Walk the instructions from the bottom up. Currently we don't look past
// block boundaries, and the blocks aren't ordered so the block visitation
// order isn't significant, but we may want to change this in the future.
@@ -819,7 +814,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
Insert =
RematerializeCheapDef(Reg, Op, *Def, MBB, Insert->getIterator(),
LIS, MFI, MRI, TII, TRI);
- } else if (UseTee && CanMove &&
+ } else if (CanMove &&
OneUseDominatesOtherUses(Reg, Op, MBB, MRI, MDT, LIS, MFI)) {
Insert = MoveAndTeeForMultiUse(Reg, Op, Def, MBB, Insert, LIS, MFI,
MRI, TII);
@@ -867,7 +862,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
SmallVector<unsigned, 0> Stack;
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
for (MachineOperand &MO : reverse(MI.explicit_operands())) {
if (!MO.isReg())
diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
index 5e7ebd19fac7..b6481ac2d4ae 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains the WebAssembly implementation of the
+/// This file contains the WebAssembly implementation of the
/// TargetRegisterInfo class.
///
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h
index ad1d71eebf22..2a73dfd4b065 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains the WebAssembly implementation of the
+/// This file contains the WebAssembly implementation of the
/// WebAssemblyRegisterInfo class.
///
//===----------------------------------------------------------------------===//
@@ -45,6 +45,8 @@ public:
const TargetRegisterClass *
getPointerRegClass(const MachineFunction &MF,
unsigned Kind = 0) const override;
+ // This does not apply to wasm.
+ const uint32_t *getNoPreservedMask() const override { return nullptr; }
};
} // end namespace llvm
diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
index 90888100be17..29f42b96b249 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
+++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file describes the WebAssembly register classes and some nominal
+/// This file describes the WebAssembly register classes and some nominal
/// physical registers.
///
//===----------------------------------------------------------------------===//
@@ -34,13 +34,18 @@ def SP32 : WebAssemblyReg<"%SP32">;
def SP64 : WebAssemblyReg<"%SP64">;
// The register allocation framework requires register classes have at least
-// one register, so we define a few for the floating point register classes
-// since we otherwise don't need a physical register in those classes.
+// one register, so we define a few for the integer / floating point register
+// classes since we otherwise don't need a physical register in those classes.
+// These are also used as "types" in the generated assembly matcher.
+def I32_0 : WebAssemblyReg<"%i32.0">;
+def I64_0 : WebAssemblyReg<"%i64.0">;
def F32_0 : WebAssemblyReg<"%f32.0">;
def F64_0 : WebAssemblyReg<"%f64.0">;
def V128_0: WebAssemblyReg<"%v128">;
+def EXCEPT_REF_0 : WebAssemblyReg<"%except_ref.0">;
+
// The value stack "register". This is an opaque entity which serves to order
// uses and defs that must remain in LIFO order.
def VALUE_STACK : WebAssemblyReg<"STACK">;
@@ -54,9 +59,10 @@ def ARGUMENTS : WebAssemblyReg<"ARGUMENTS">;
// Register classes
//===----------------------------------------------------------------------===//
-def I32 : WebAssemblyRegClass<[i32], 32, (add FP32, SP32)>;
-def I64 : WebAssemblyRegClass<[i64], 64, (add FP64, SP64)>;
+def I32 : WebAssemblyRegClass<[i32], 32, (add FP32, SP32, I32_0)>;
+def I64 : WebAssemblyRegClass<[i64], 64, (add FP64, SP64, I64_0)>;
def F32 : WebAssemblyRegClass<[f32], 32, (add F32_0)>;
def F64 : WebAssemblyRegClass<[f64], 64, (add F64_0)>;
def V128 : WebAssemblyRegClass<[v4f32, v4i32, v16i8, v8i16], 128, (add V128_0)>;
+def EXCEPT_REF : WebAssemblyRegClass<[ExceptRef], 0, (add EXCEPT_REF_0)>;
diff --git a/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp b/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp
index 878ffd08d228..f432b367d156 100644
--- a/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements a pass that replaces physical registers with
+/// This file implements a pass that replaces physical registers with
/// virtual registers.
///
/// LLVM expects certain physical registers, such as a stack pointer. However,
@@ -53,12 +53,16 @@ private:
} // end anonymous namespace
char WebAssemblyReplacePhysRegs::ID = 0;
+INITIALIZE_PASS(WebAssemblyReplacePhysRegs, DEBUG_TYPE,
+ "Replace physical registers with virtual registers",
+ false, false)
+
FunctionPass *llvm::createWebAssemblyReplacePhysRegs() {
return new WebAssemblyReplacePhysRegs();
}
bool WebAssemblyReplacePhysRegs::runOnMachineFunction(MachineFunction &MF) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "********** Replace Physical Registers **********\n"
<< "********** Function: " << MF.getName() << '\n';
});
diff --git a/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
index f808c063d7e4..fe8a5e4c06f1 100644
--- a/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains signature information for runtime libcalls.
+/// This file contains signature information for runtime libcalls.
///
/// CodeGen uses external symbols, which it refers to by name. The WebAssembly
/// target needs type information for all functions. This file contains a big
@@ -22,6 +22,7 @@
#include "WebAssemblyRuntimeLibcallSignatures.h"
#include "WebAssemblySubtarget.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/Support/ManagedStatic.h"
using namespace llvm;
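
The rewrite that follows replaces the large positional signature array with a table indexed by the RTLIB enum, with every entry defaulting to unsupported; the ManagedStatic.h include added above presumably lets that table be constructed lazily, once. A minimal sketch of the default-to-unsupported pattern, with hypothetical enumerators standing in for RTLIB::Libcall and RuntimeLibcallSignature:

    #include <vector>

    // Sketch only: an enum-indexed signature table whose entries default to
    // 'unsupported'. Enumerator names here are hypothetical.
    enum Libcall { SHL_I32, SHL_I64, MUL_I32, UNKNOWN_LIBCALL /* count */ };
    enum Signature { unsupported, i32_func_i32_i32, i64_func_i64_i64 };

    struct SignatureTable {
      std::vector<Signature> Table;
      SignatureTable() : Table(UNKNOWN_LIBCALL, unsupported) {
        // Anything not explicitly listed stays 'unsupported'.
        Table[SHL_I32] = i32_func_i32_i32;
        Table[SHL_I64] = i64_func_i64_i64;
        Table[MUL_I32] = i32_func_i32_i32;
      }
    };

    Signature lookup(const SignatureTable &T, Libcall LC) { return T.Table[LC]; }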
@@ -58,13 +59,16 @@ enum RuntimeLibcallSignature {
i32_func_f32_f32,
i32_func_f64_f64,
i32_func_i32_i32,
+ i32_func_i32_i32_iPTR,
i64_func_i64_i64,
+ i64_func_i64_i64_iPTR,
i64_i64_func_f32,
i64_i64_func_f64,
i16_i16_func_i16_i16,
i32_i32_func_i32_i32,
i64_i64_func_i64_i64,
i64_i64_func_i64_i64_i64_i64,
+ i64_i64_func_i64_i64_i64_i64_iPTR,
i64_i64_i64_i64_func_i64_i64_i64_i64,
i64_i64_func_i64_i64_i32,
iPTR_func_iPTR_i32_iPTR,
@@ -84,918 +88,405 @@ enum RuntimeLibcallSignature {
unsupported
};
-} // end anonymous namespace
-
-static const RuntimeLibcallSignature
-RuntimeLibcallSignatures[RTLIB::UNKNOWN_LIBCALL] = {
-// Integer
-/* SHL_I16 */ i16_func_i16_i16,
-/* SHL_I32 */ i32_func_i32_i32,
-/* SHL_I64 */ i64_func_i64_i64,
-/* SHL_I128 */ i64_i64_func_i64_i64_i32,
-/* SRL_I16 */ i16_func_i16_i16,
-/* SRL_I32 */ i32_func_i32_i32,
-/* SRL_I64 */ i64_func_i64_i64,
-/* SRL_I128 */ i64_i64_func_i64_i64_i32,
-/* SRA_I16 */ i16_func_i16_i16,
-/* SRA_I32 */ i32_func_i32_i32,
-/* SRA_I64 */ i64_func_i64_i64,
-/* SRA_I128 */ i64_i64_func_i64_i64_i32,
-/* MUL_I8 */ i8_func_i8_i8,
-/* MUL_I16 */ i16_func_i16_i16,
-/* MUL_I32 */ i32_func_i32_i32,
-/* MUL_I64 */ i64_func_i64_i64,
-/* MUL_I128 */ i64_i64_func_i64_i64_i64_i64,
-/* MULO_I32 */ i32_func_i32_i32,
-/* MULO_I64 */ i64_func_i64_i64,
-/* MULO_I128 */ i64_i64_func_i64_i64_i64_i64,
-/* SDIV_I8 */ i8_func_i8_i8,
-/* SDIV_I16 */ i16_func_i16_i16,
-/* SDIV_I32 */ i32_func_i32_i32,
-/* SDIV_I64 */ i64_func_i64_i64,
-/* SDIV_I128 */ i64_i64_func_i64_i64_i64_i64,
-/* UDIV_I8 */ i8_func_i8_i8,
-/* UDIV_I16 */ i16_func_i16_i16,
-/* UDIV_I32 */ i32_func_i32_i32,
-/* UDIV_I64 */ i64_func_i64_i64,
-/* UDIV_I128 */ i64_i64_func_i64_i64_i64_i64,
-/* SREM_I8 */ i8_func_i8_i8,
-/* SREM_I16 */ i16_func_i16_i16,
-/* SREM_I32 */ i32_func_i32_i32,
-/* SREM_I64 */ i64_func_i64_i64,
-/* SREM_I128 */ i64_i64_func_i64_i64_i64_i64,
-/* UREM_I8 */ i8_func_i8_i8,
-/* UREM_I16 */ i16_func_i16_i16,
-/* UREM_I32 */ i32_func_i32_i32,
-/* UREM_I64 */ i64_func_i64_i64,
-/* UREM_I128 */ i64_i64_func_i64_i64_i64_i64,
-/* SDIVREM_I8 */ i8_func_i8_i8,
-/* SDIVREM_I16 */ i16_i16_func_i16_i16,
-/* SDIVREM_I32 */ i32_i32_func_i32_i32,
-/* SDIVREM_I64 */ i64_func_i64_i64,
-/* SDIVREM_I128 */ i64_i64_i64_i64_func_i64_i64_i64_i64,
-/* UDIVREM_I8 */ i8_func_i8_i8,
-/* UDIVREM_I16 */ i16_i16_func_i16_i16,
-/* UDIVREM_I32 */ i32_i32_func_i32_i32,
-/* UDIVREM_I64 */ i64_i64_func_i64_i64,
-/* UDIVREM_I128 */ i64_i64_i64_i64_func_i64_i64_i64_i64,
-/* NEG_I32 */ i32_func_i32,
-/* NEG_I64 */ i64_func_i64,
-
-// FLOATING POINT
-/* ADD_F32 */ f32_func_f32_f32,
-/* ADD_F64 */ f64_func_f64_f64,
-/* ADD_F80 */ unsupported,
-/* ADD_F128 */ func_iPTR_i64_i64_i64_i64,
-/* ADD_PPCF128 */ unsupported,
-/* SUB_F32 */ f32_func_f32_f32,
-/* SUB_F64 */ f64_func_f64_f64,
-/* SUB_F80 */ unsupported,
-/* SUB_F128 */ func_iPTR_i64_i64_i64_i64,
-/* SUB_PPCF128 */ unsupported,
-/* MUL_F32 */ f32_func_f32_f32,
-/* MUL_F64 */ f64_func_f64_f64,
-/* MUL_F80 */ unsupported,
-/* MUL_F128 */ func_iPTR_i64_i64_i64_i64,
-/* MUL_PPCF128 */ unsupported,
-/* DIV_F32 */ f32_func_f32_f32,
-/* DIV_F64 */ f64_func_f64_f64,
-/* DIV_F80 */ unsupported,
-/* DIV_F128 */ func_iPTR_i64_i64_i64_i64,
-/* DIV_PPCF128 */ unsupported,
-/* REM_F32 */ f32_func_f32_f32,
-/* REM_F64 */ f64_func_f64_f64,
-/* REM_F80 */ unsupported,
-/* REM_F128 */ func_iPTR_i64_i64_i64_i64,
-/* REM_PPCF128 */ unsupported,
-/* FMA_F32 */ f32_func_f32_f32_f32,
-/* FMA_F64 */ f64_func_f64_f64_f64,
-/* FMA_F80 */ unsupported,
-/* FMA_F128 */ func_iPTR_i64_i64_i64_i64_i64_i64,
-/* FMA_PPCF128 */ unsupported,
-/* POWI_F32 */ f32_func_f32_i32,
-/* POWI_F64 */ f64_func_f64_i32,
-/* POWI_F80 */ unsupported,
-/* POWI_F128 */ func_iPTR_i64_i64_i64_i64,
-/* POWI_PPCF128 */ unsupported,
-/* SQRT_F32 */ f32_func_f32,
-/* SQRT_F64 */ f64_func_f64,
-/* SQRT_F80 */ unsupported,
-/* SQRT_F128 */ func_iPTR_i64_i64,
-/* SQRT_PPCF128 */ unsupported,
-/* LOG_F32 */ f32_func_f32,
-/* LOG_F64 */ f64_func_f64,
-/* LOG_F80 */ unsupported,
-/* LOG_F128 */ func_iPTR_i64_i64,
-/* LOG_PPCF128 */ unsupported,
-/* LOG2_F32 */ f32_func_f32,
-/* LOG2_F64 */ f64_func_f64,
-/* LOG2_F80 */ unsupported,
-/* LOG2_F128 */ func_iPTR_i64_i64,
-/* LOG2_PPCF128 */ unsupported,
-/* LOG10_F32 */ f32_func_f32,
-/* LOG10_F64 */ f64_func_f64,
-/* LOG10_F80 */ unsupported,
-/* LOG10_F128 */ func_iPTR_i64_i64,
-/* LOG10_PPCF128 */ unsupported,
-/* EXP_F32 */ f32_func_f32,
-/* EXP_F64 */ f64_func_f64,
-/* EXP_F80 */ unsupported,
-/* EXP_F128 */ func_iPTR_i64_i64,
-/* EXP_PPCF128 */ unsupported,
-/* EXP2_F32 */ f32_func_f32,
-/* EXP2_F64 */ f64_func_f64,
-/* EXP2_F80 */ unsupported,
-/* EXP2_F128 */ func_iPTR_i64_i64,
-/* EXP2_PPCF128 */ unsupported,
-/* SIN_F32 */ f32_func_f32,
-/* SIN_F64 */ f64_func_f64,
-/* SIN_F80 */ unsupported,
-/* SIN_F128 */ func_iPTR_i64_i64,
-/* SIN_PPCF128 */ unsupported,
-/* COS_F32 */ f32_func_f32,
-/* COS_F64 */ f64_func_f64,
-/* COS_F80 */ unsupported,
-/* COS_F128 */ func_iPTR_i64_i64,
-/* COS_PPCF128 */ unsupported,
-/* SINCOS_F32 */ func_f32_iPTR_iPTR,
-/* SINCOS_F64 */ func_f64_iPTR_iPTR,
-/* SINCOS_F80 */ unsupported,
-/* SINCOS_F128 */ func_i64_i64_iPTR_iPTR,
-/* SINCOS_PPCF128 */ unsupported,
-/* SINCOS_STRET_F32 */ unsupported,
-/* SINCOS_STRET_F64 */ unsupported,
-/* POW_F32 */ f32_func_f32_f32,
-/* POW_F64 */ f64_func_f64_f64,
-/* POW_F80 */ unsupported,
-/* POW_F128 */ func_iPTR_i64_i64_i64_i64,
-/* POW_PPCF128 */ unsupported,
-/* CEIL_F32 */ f32_func_f32,
-/* CEIL_F64 */ f64_func_f64,
-/* CEIL_F80 */ unsupported,
-/* CEIL_F128 */ func_iPTR_i64_i64,
-/* CEIL_PPCF128 */ unsupported,
-/* TRUNC_F32 */ f32_func_f32,
-/* TRUNC_F64 */ f64_func_f64,
-/* TRUNC_F80 */ unsupported,
-/* TRUNC_F128 */ func_iPTR_i64_i64,
-/* TRUNC_PPCF128 */ unsupported,
-/* RINT_F32 */ f32_func_f32,
-/* RINT_F64 */ f64_func_f64,
-/* RINT_F80 */ unsupported,
-/* RINT_F128 */ func_iPTR_i64_i64,
-/* RINT_PPCF128 */ unsupported,
-/* NEARBYINT_F32 */ f32_func_f32,
-/* NEARBYINT_F64 */ f64_func_f64,
-/* NEARBYINT_F80 */ unsupported,
-/* NEARBYINT_F128 */ func_iPTR_i64_i64,
-/* NEARBYINT_PPCF128 */ unsupported,
-/* ROUND_F32 */ f32_func_f32,
-/* ROUND_F64 */ f64_func_f64,
-/* ROUND_F80 */ unsupported,
-/* ROUND_F128 */ func_iPTR_i64_i64,
-/* ROUND_PPCF128 */ unsupported,
-/* FLOOR_F32 */ f32_func_f32,
-/* FLOOR_F64 */ f64_func_f64,
-/* FLOOR_F80 */ unsupported,
-/* FLOOR_F128 */ func_iPTR_i64_i64,
-/* FLOOR_PPCF128 */ unsupported,
-/* COPYSIGN_F32 */ f32_func_f32_f32,
-/* COPYSIGN_F64 */ f64_func_f64_f64,
-/* COPYSIGN_F80 */ unsupported,
-/* COPYSIGN_F128 */ func_iPTR_i64_i64_i64_i64,
-/* COPYSIGN_PPCF128 */ unsupported,
-/* FMIN_F32 */ f32_func_f32_f32,
-/* FMIN_F64 */ f64_func_f64_f64,
-/* FMIN_F80 */ unsupported,
-/* FMIN_F128 */ func_iPTR_i64_i64_i64_i64,
-/* FMIN_PPCF128 */ unsupported,
-/* FMAX_F32 */ f32_func_f32_f32,
-/* FMAX_F64 */ f64_func_f64_f64,
-/* FMAX_F80 */ unsupported,
-/* FMAX_F128 */ func_iPTR_i64_i64_i64_i64,
-/* FMAX_PPCF128 */ unsupported,
-
-// CONVERSION
-/* FPEXT_F32_PPCF128 */ unsupported,
-/* FPEXT_F64_PPCF128 */ unsupported,
-/* FPEXT_F64_F128 */ func_iPTR_f64,
-/* FPEXT_F32_F128 */ func_iPTR_f32,
-/* FPEXT_F32_F64 */ f64_func_f32,
-/* FPEXT_F16_F32 */ f32_func_i16,
-/* FPROUND_F32_F16 */ i16_func_f32,
-/* FPROUND_F64_F16 */ unsupported,
-/* FPROUND_F80_F16 */ unsupported,
-/* FPROUND_F128_F16 */ unsupported,
-/* FPROUND_PPCF128_F16 */ unsupported,
-/* FPROUND_F64_F32 */ f32_func_f64,
-/* FPROUND_F80_F32 */ unsupported,
-/* FPROUND_F128_F32 */ f32_func_i64_i64,
-/* FPROUND_PPCF128_F32 */ unsupported,
-/* FPROUND_F80_F64 */ unsupported,
-/* FPROUND_F128_F64 */ f64_func_i64_i64,
-/* FPROUND_PPCF128_F64 */ unsupported,
-/* FPTOSINT_F32_I32 */ i32_func_f32,
-/* FPTOSINT_F32_I64 */ i64_func_f32,
-/* FPTOSINT_F32_I128 */ i64_i64_func_f32,
-/* FPTOSINT_F64_I32 */ i32_func_f64,
-/* FPTOSINT_F64_I64 */ i64_func_f64,
-/* FPTOSINT_F64_I128 */ i64_i64_func_f64,
-/* FPTOSINT_F80_I32 */ unsupported,
-/* FPTOSINT_F80_I64 */ unsupported,
-/* FPTOSINT_F80_I128 */ unsupported,
-/* FPTOSINT_F128_I32 */ i32_func_i64_i64,
-/* FPTOSINT_F128_I64 */ i64_func_i64_i64,
-/* FPTOSINT_F128_I128 */ i64_i64_func_i64_i64,
-/* FPTOSINT_PPCF128_I32 */ unsupported,
-/* FPTOSINT_PPCF128_I64 */ unsupported,
-/* FPTOSINT_PPCF128_I128 */ unsupported,
-/* FPTOUINT_F32_I32 */ i32_func_f32,
-/* FPTOUINT_F32_I64 */ i64_func_f32,
-/* FPTOUINT_F32_I128 */ i64_i64_func_f32,
-/* FPTOUINT_F64_I32 */ i32_func_f64,
-/* FPTOUINT_F64_I64 */ i64_func_f64,
-/* FPTOUINT_F64_I128 */ i64_i64_func_f64,
-/* FPTOUINT_F80_I32 */ unsupported,
-/* FPTOUINT_F80_I64 */ unsupported,
-/* FPTOUINT_F80_I128 */ unsupported,
-/* FPTOUINT_F128_I32 */ i32_func_i64_i64,
-/* FPTOUINT_F128_I64 */ i64_func_i64_i64,
-/* FPTOUINT_F128_I128 */ i64_i64_func_i64_i64,
-/* FPTOUINT_PPCF128_I32 */ unsupported,
-/* FPTOUINT_PPCF128_I64 */ unsupported,
-/* FPTOUINT_PPCF128_I128 */ unsupported,
-/* SINTTOFP_I32_F32 */ f32_func_i32,
-/* SINTTOFP_I32_F64 */ f64_func_i32,
-/* SINTTOFP_I32_F80 */ unsupported,
-/* SINTTOFP_I32_F128 */ func_iPTR_i32,
-/* SINTTOFP_I32_PPCF128 */ unsupported,
-/* SINTTOFP_I64_F32 */ f32_func_i64,
-/* SINTTOFP_I64_F64 */ f64_func_i64,
-/* SINTTOFP_I64_F80 */ unsupported,
-/* SINTTOFP_I64_F128 */ func_iPTR_i64,
-/* SINTTOFP_I64_PPCF128 */ unsupported,
-/* SINTTOFP_I128_F32 */ f32_func_i64_i64,
-/* SINTTOFP_I128_F64 */ f64_func_i64_i64,
-/* SINTTOFP_I128_F80 */ unsupported,
-/* SINTTOFP_I128_F128 */ func_iPTR_i64_i64,
-/* SINTTOFP_I128_PPCF128 */ unsupported,
-/* UINTTOFP_I32_F32 */ f32_func_i32,
-/* UINTTOFP_I32_F64 */ f64_func_i64,
-/* UINTTOFP_I32_F80 */ unsupported,
-/* UINTTOFP_I32_F128 */ func_iPTR_i32,
-/* UINTTOFP_I32_PPCF128 */ unsupported,
-/* UINTTOFP_I64_F32 */ f32_func_i64,
-/* UINTTOFP_I64_F64 */ f64_func_i64,
-/* UINTTOFP_I64_F80 */ unsupported,
-/* UINTTOFP_I64_F128 */ func_iPTR_i64,
-/* UINTTOFP_I64_PPCF128 */ unsupported,
-/* UINTTOFP_I128_F32 */ f32_func_i64_i64,
-/* UINTTOFP_I128_F64 */ f64_func_i64_i64,
-/* UINTTOFP_I128_F80 */ unsupported,
-/* UINTTOFP_I128_F128 */ func_iPTR_i64_i64,
-/* UINTTOFP_I128_PPCF128 */ unsupported,
-
-// COMPARISON
-/* OEQ_F32 */ i32_func_f32_f32,
-/* OEQ_F64 */ i32_func_f64_f64,
-/* OEQ_F128 */ i32_func_i64_i64_i64_i64,
-/* OEQ_PPCF128 */ unsupported,
-/* UNE_F32 */ i32_func_f32_f32,
-/* UNE_F64 */ i32_func_f64_f64,
-/* UNE_F128 */ i32_func_i64_i64_i64_i64,
-/* UNE_PPCF128 */ unsupported,
-/* OGE_F32 */ i32_func_f32_f32,
-/* OGE_F64 */ i32_func_f64_f64,
-/* OGE_F128 */ i32_func_i64_i64_i64_i64,
-/* OGE_PPCF128 */ unsupported,
-/* OLT_F32 */ i32_func_f32_f32,
-/* OLT_F64 */ i32_func_f64_f64,
-/* OLT_F128 */ i32_func_i64_i64_i64_i64,
-/* OLT_PPCF128 */ unsupported,
-/* OLE_F32 */ i32_func_f32_f32,
-/* OLE_F64 */ i32_func_f64_f64,
-/* OLE_F128 */ i32_func_i64_i64_i64_i64,
-/* OLE_PPCF128 */ unsupported,
-/* OGT_F32 */ i32_func_f32_f32,
-/* OGT_F64 */ i32_func_f64_f64,
-/* OGT_F128 */ i32_func_i64_i64_i64_i64,
-/* OGT_PPCF128 */ unsupported,
-/* UO_F32 */ i32_func_f32_f32,
-/* UO_F64 */ i32_func_f64_f64,
-/* UO_F128 */ i32_func_i64_i64_i64_i64,
-/* UO_PPCF128 */ unsupported,
-/* O_F32 */ i32_func_f32_f32,
-/* O_F64 */ i32_func_f64_f64,
-/* O_F128 */ i32_func_i64_i64_i64_i64,
-/* O_PPCF128 */ unsupported,
-
-// MEMORY
-/* MEMCPY */ iPTR_func_iPTR_iPTR_iPTR,
-/* MEMMOVE */ iPTR_func_iPTR_iPTR_iPTR,
-/* MEMSET */ iPTR_func_iPTR_i32_iPTR,
-/* BZERO */ unsupported,
-
-// ELEMENT-WISE ATOMIC MEMORY
-/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_1 */ unsupported,
-/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_2 */ unsupported,
-/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_4 */ unsupported,
-/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_8 */ unsupported,
-/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_16 */ unsupported,
-/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_1 */ unsupported,
-/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_2 */ unsupported,
-/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_4 */ unsupported,
-/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_8 */ unsupported,
-/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_16 */ unsupported,
-
-/* MEMSET_ELEMENT_UNORDERED_ATOMIC_1 */ unsupported,
-/* MEMSET_ELEMENT_UNORDERED_ATOMIC_2 */ unsupported,
-/* MEMSET_ELEMENT_UNORDERED_ATOMIC_4 */ unsupported,
-/* MEMSET_ELEMENT_UNORDERED_ATOMIC_8 */ unsupported,
-/* MEMSET_ELEMENT_UNORDERED_ATOMIC_16 */ unsupported,
-
-// EXCEPTION HANDLING
-/* UNWIND_RESUME */ unsupported,
-
-// Note: there's two sets of atomics libcalls; see
-// <http://llvm.org/docs/Atomics.html> for more info on the
-// difference between them.
-
-// Atomic '__sync_*' libcalls.
-/* SYNC_VAL_COMPARE_AND_SWAP_1 */ unsupported,
-/* SYNC_VAL_COMPARE_AND_SWAP_2 */ unsupported,
-/* SYNC_VAL_COMPARE_AND_SWAP_4 */ unsupported,
-/* SYNC_VAL_COMPARE_AND_SWAP_8 */ unsupported,
-/* SYNC_VAL_COMPARE_AND_SWAP_16 */ unsupported,
-/* SYNC_LOCK_TEST_AND_SET_1 */ unsupported,
-/* SYNC_LOCK_TEST_AND_SET_2 */ unsupported,
-/* SYNC_LOCK_TEST_AND_SET_4 */ unsupported,
-/* SYNC_LOCK_TEST_AND_SET_8 */ unsupported,
-/* SYNC_LOCK_TEST_AND_SET_16 */ unsupported,
-/* SYNC_FETCH_AND_ADD_1 */ unsupported,
-/* SYNC_FETCH_AND_ADD_2 */ unsupported,
-/* SYNC_FETCH_AND_ADD_4 */ unsupported,
-/* SYNC_FETCH_AND_ADD_8 */ unsupported,
-/* SYNC_FETCH_AND_ADD_16 */ unsupported,
-/* SYNC_FETCH_AND_SUB_1 */ unsupported,
-/* SYNC_FETCH_AND_SUB_2 */ unsupported,
-/* SYNC_FETCH_AND_SUB_4 */ unsupported,
-/* SYNC_FETCH_AND_SUB_8 */ unsupported,
-/* SYNC_FETCH_AND_SUB_16 */ unsupported,
-/* SYNC_FETCH_AND_AND_1 */ unsupported,
-/* SYNC_FETCH_AND_AND_2 */ unsupported,
-/* SYNC_FETCH_AND_AND_4 */ unsupported,
-/* SYNC_FETCH_AND_AND_8 */ unsupported,
-/* SYNC_FETCH_AND_AND_16 */ unsupported,
-/* SYNC_FETCH_AND_OR_1 */ unsupported,
-/* SYNC_FETCH_AND_OR_2 */ unsupported,
-/* SYNC_FETCH_AND_OR_4 */ unsupported,
-/* SYNC_FETCH_AND_OR_8 */ unsupported,
-/* SYNC_FETCH_AND_OR_16 */ unsupported,
-/* SYNC_FETCH_AND_XOR_1 */ unsupported,
-/* SYNC_FETCH_AND_XOR_2 */ unsupported,
-/* SYNC_FETCH_AND_XOR_4 */ unsupported,
-/* SYNC_FETCH_AND_XOR_8 */ unsupported,
-/* SYNC_FETCH_AND_XOR_16 */ unsupported,
-/* SYNC_FETCH_AND_NAND_1 */ unsupported,
-/* SYNC_FETCH_AND_NAND_2 */ unsupported,
-/* SYNC_FETCH_AND_NAND_4 */ unsupported,
-/* SYNC_FETCH_AND_NAND_8 */ unsupported,
-/* SYNC_FETCH_AND_NAND_16 */ unsupported,
-/* SYNC_FETCH_AND_MAX_1 */ unsupported,
-/* SYNC_FETCH_AND_MAX_2 */ unsupported,
-/* SYNC_FETCH_AND_MAX_4 */ unsupported,
-/* SYNC_FETCH_AND_MAX_8 */ unsupported,
-/* SYNC_FETCH_AND_MAX_16 */ unsupported,
-/* SYNC_FETCH_AND_UMAX_1 */ unsupported,
-/* SYNC_FETCH_AND_UMAX_2 */ unsupported,
-/* SYNC_FETCH_AND_UMAX_4 */ unsupported,
-/* SYNC_FETCH_AND_UMAX_8 */ unsupported,
-/* SYNC_FETCH_AND_UMAX_16 */ unsupported,
-/* SYNC_FETCH_AND_MIN_1 */ unsupported,
-/* SYNC_FETCH_AND_MIN_2 */ unsupported,
-/* SYNC_FETCH_AND_MIN_4 */ unsupported,
-/* SYNC_FETCH_AND_MIN_8 */ unsupported,
-/* SYNC_FETCH_AND_MIN_16 */ unsupported,
-/* SYNC_FETCH_AND_UMIN_1 */ unsupported,
-/* SYNC_FETCH_AND_UMIN_2 */ unsupported,
-/* SYNC_FETCH_AND_UMIN_4 */ unsupported,
-/* SYNC_FETCH_AND_UMIN_8 */ unsupported,
-/* SYNC_FETCH_AND_UMIN_16 */ unsupported,
-
-// Atomic '__atomic_*' libcalls.
-/* ATOMIC_LOAD */ unsupported,
-/* ATOMIC_LOAD_1 */ unsupported,
-/* ATOMIC_LOAD_2 */ unsupported,
-/* ATOMIC_LOAD_4 */ unsupported,
-/* ATOMIC_LOAD_8 */ unsupported,
-/* ATOMIC_LOAD_16 */ unsupported,
-
-/* ATOMIC_STORE */ unsupported,
-/* ATOMIC_STORE_1 */ unsupported,
-/* ATOMIC_STORE_2 */ unsupported,
-/* ATOMIC_STORE_4 */ unsupported,
-/* ATOMIC_STORE_8 */ unsupported,
-/* ATOMIC_STORE_16 */ unsupported,
-
-/* ATOMIC_EXCHANGE */ unsupported,
-/* ATOMIC_EXCHANGE_1 */ unsupported,
-/* ATOMIC_EXCHANGE_2 */ unsupported,
-/* ATOMIC_EXCHANGE_4 */ unsupported,
-/* ATOMIC_EXCHANGE_8 */ unsupported,
-/* ATOMIC_EXCHANGE_16 */ unsupported,
-
-/* ATOMIC_COMPARE_EXCHANGE */ unsupported,
-/* ATOMIC_COMPARE_EXCHANGE_1 */ unsupported,
-/* ATOMIC_COMPARE_EXCHANGE_2 */ unsupported,
-/* ATOMIC_COMPARE_EXCHANGE_4 */ unsupported,
-/* ATOMIC_COMPARE_EXCHANGE_8 */ unsupported,
-/* ATOMIC_COMPARE_EXCHANGE_16 */ unsupported,
-
-/* ATOMIC_FETCH_ADD_1 */ unsupported,
-/* ATOMIC_FETCH_ADD_2 */ unsupported,
-/* ATOMIC_FETCH_ADD_4 */ unsupported,
-/* ATOMIC_FETCH_ADD_8 */ unsupported,
-/* ATOMIC_FETCH_ADD_16 */ unsupported,
-
-/* ATOMIC_FETCH_SUB_1 */ unsupported,
-/* ATOMIC_FETCH_SUB_2 */ unsupported,
-/* ATOMIC_FETCH_SUB_4 */ unsupported,
-/* ATOMIC_FETCH_SUB_8 */ unsupported,
-/* ATOMIC_FETCH_SUB_16 */ unsupported,
-
-/* ATOMIC_FETCH_AND_1 */ unsupported,
-/* ATOMIC_FETCH_AND_2 */ unsupported,
-/* ATOMIC_FETCH_AND_4 */ unsupported,
-/* ATOMIC_FETCH_AND_8 */ unsupported,
-/* ATOMIC_FETCH_AND_16 */ unsupported,
-
-/* ATOMIC_FETCH_OR_1 */ unsupported,
-/* ATOMIC_FETCH_OR_2 */ unsupported,
-/* ATOMIC_FETCH_OR_4 */ unsupported,
-/* ATOMIC_FETCH_OR_8 */ unsupported,
-/* ATOMIC_FETCH_OR_16 */ unsupported,
-
-/* ATOMIC_FETCH_XOR_1 */ unsupported,
-/* ATOMIC_FETCH_XOR_2 */ unsupported,
-/* ATOMIC_FETCH_XOR_4 */ unsupported,
-/* ATOMIC_FETCH_XOR_8 */ unsupported,
-/* ATOMIC_FETCH_XOR_16 */ unsupported,
-
-/* ATOMIC_FETCH_NAND_1 */ unsupported,
-/* ATOMIC_FETCH_NAND_2 */ unsupported,
-/* ATOMIC_FETCH_NAND_4 */ unsupported,
-/* ATOMIC_FETCH_NAND_8 */ unsupported,
-/* ATOMIC_FETCH_NAND_16 */ unsupported,
-
-// Stack Protector Fail.
-/* STACKPROTECTOR_CHECK_FAIL */ func,
-
-// Deoptimization.
-/* DEOPTIMIZE */ unsupported,
+struct RuntimeLibcallSignatureTable {
+ std::vector<RuntimeLibcallSignature> Table;
+
+ // Any newly-added libcalls will be unsupported by default.
+ RuntimeLibcallSignatureTable() : Table(RTLIB::UNKNOWN_LIBCALL, unsupported) {
+ // Integer
+ Table[RTLIB::SHL_I16] = i16_func_i16_i16;
+ Table[RTLIB::SHL_I32] = i32_func_i32_i32;
+ Table[RTLIB::SHL_I64] = i64_func_i64_i64;
+ Table[RTLIB::SHL_I128] = i64_i64_func_i64_i64_i32;
+ Table[RTLIB::SRL_I16] = i16_func_i16_i16;
+ Table[RTLIB::SRL_I32] = i32_func_i32_i32;
+ Table[RTLIB::SRL_I64] = i64_func_i64_i64;
+ Table[RTLIB::SRL_I128] = i64_i64_func_i64_i64_i32;
+ Table[RTLIB::SRA_I16] = i16_func_i16_i16;
+ Table[RTLIB::SRA_I32] = i32_func_i32_i32;
+ Table[RTLIB::SRA_I64] = i64_func_i64_i64;
+ Table[RTLIB::SRA_I128] = i64_i64_func_i64_i64_i32;
+ Table[RTLIB::MUL_I8] = i8_func_i8_i8;
+ Table[RTLIB::MUL_I16] = i16_func_i16_i16;
+ Table[RTLIB::MUL_I32] = i32_func_i32_i32;
+ Table[RTLIB::MUL_I64] = i64_func_i64_i64;
+ Table[RTLIB::MUL_I128] = i64_i64_func_i64_i64_i64_i64;
+ Table[RTLIB::MULO_I32] = i32_func_i32_i32_iPTR;
+ Table[RTLIB::MULO_I64] = i64_func_i64_i64_iPTR;
+ Table[RTLIB::MULO_I128] = i64_i64_func_i64_i64_i64_i64_iPTR;
+ Table[RTLIB::SDIV_I8] = i8_func_i8_i8;
+ Table[RTLIB::SDIV_I16] = i16_func_i16_i16;
+ Table[RTLIB::SDIV_I32] = i32_func_i32_i32;
+ Table[RTLIB::SDIV_I64] = i64_func_i64_i64;
+ Table[RTLIB::SDIV_I128] = i64_i64_func_i64_i64_i64_i64;
+ Table[RTLIB::UDIV_I8] = i8_func_i8_i8;
+ Table[RTLIB::UDIV_I16] = i16_func_i16_i16;
+ Table[RTLIB::UDIV_I32] = i32_func_i32_i32;
+ Table[RTLIB::UDIV_I64] = i64_func_i64_i64;
+ Table[RTLIB::UDIV_I128] = i64_i64_func_i64_i64_i64_i64;
+ Table[RTLIB::SREM_I8] = i8_func_i8_i8;
+ Table[RTLIB::SREM_I16] = i16_func_i16_i16;
+ Table[RTLIB::SREM_I32] = i32_func_i32_i32;
+ Table[RTLIB::SREM_I64] = i64_func_i64_i64;
+ Table[RTLIB::SREM_I128] = i64_i64_func_i64_i64_i64_i64;
+ Table[RTLIB::UREM_I8] = i8_func_i8_i8;
+ Table[RTLIB::UREM_I16] = i16_func_i16_i16;
+ Table[RTLIB::UREM_I32] = i32_func_i32_i32;
+ Table[RTLIB::UREM_I64] = i64_func_i64_i64;
+ Table[RTLIB::UREM_I128] = i64_i64_func_i64_i64_i64_i64;
+ Table[RTLIB::SDIVREM_I8] = i8_func_i8_i8;
+ Table[RTLIB::SDIVREM_I16] = i16_i16_func_i16_i16;
+ Table[RTLIB::SDIVREM_I32] = i32_i32_func_i32_i32;
+ Table[RTLIB::SDIVREM_I64] = i64_func_i64_i64;
+ Table[RTLIB::SDIVREM_I128] = i64_i64_i64_i64_func_i64_i64_i64_i64;
+ Table[RTLIB::UDIVREM_I8] = i8_func_i8_i8;
+ Table[RTLIB::UDIVREM_I16] = i16_i16_func_i16_i16;
+ Table[RTLIB::UDIVREM_I32] = i32_i32_func_i32_i32;
+ Table[RTLIB::UDIVREM_I64] = i64_i64_func_i64_i64;
+ Table[RTLIB::UDIVREM_I128] = i64_i64_i64_i64_func_i64_i64_i64_i64;
+ Table[RTLIB::NEG_I32] = i32_func_i32;
+ Table[RTLIB::NEG_I64] = i64_func_i64;
+
+ // Floating-point.
+ // All F80 and PPCF128 routines are unsupported.
+ Table[RTLIB::ADD_F32] = f32_func_f32_f32;
+ Table[RTLIB::ADD_F64] = f64_func_f64_f64;
+ Table[RTLIB::ADD_F128] = func_iPTR_i64_i64_i64_i64;
+ Table[RTLIB::SUB_F32] = f32_func_f32_f32;
+ Table[RTLIB::SUB_F64] = f64_func_f64_f64;
+ Table[RTLIB::SUB_F128] = func_iPTR_i64_i64_i64_i64;
+ Table[RTLIB::MUL_F32] = f32_func_f32_f32;
+ Table[RTLIB::MUL_F64] = f64_func_f64_f64;
+ Table[RTLIB::MUL_F128] = func_iPTR_i64_i64_i64_i64;
+ Table[RTLIB::DIV_F32] = f32_func_f32_f32;
+ Table[RTLIB::DIV_F64] = f64_func_f64_f64;
+ Table[RTLIB::DIV_F128] = func_iPTR_i64_i64_i64_i64;
+ Table[RTLIB::REM_F32] = f32_func_f32_f32;
+ Table[RTLIB::REM_F64] = f64_func_f64_f64;
+ Table[RTLIB::REM_F128] = func_iPTR_i64_i64_i64_i64;
+ Table[RTLIB::FMA_F32] = f32_func_f32_f32_f32;
+ Table[RTLIB::FMA_F64] = f64_func_f64_f64_f64;
+ Table[RTLIB::FMA_F128] = func_iPTR_i64_i64_i64_i64_i64_i64;
+ Table[RTLIB::POWI_F32] = f32_func_f32_i32;
+ Table[RTLIB::POWI_F64] = f64_func_f64_i32;
+ Table[RTLIB::POWI_F128] = func_iPTR_i64_i64_i64_i64;
+ Table[RTLIB::SQRT_F32] = f32_func_f32;
+ Table[RTLIB::SQRT_F64] = f64_func_f64;
+ Table[RTLIB::SQRT_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::LOG_F32] = f32_func_f32;
+ Table[RTLIB::LOG_F64] = f64_func_f64;
+ Table[RTLIB::LOG_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::LOG2_F32] = f32_func_f32;
+ Table[RTLIB::LOG2_F64] = f64_func_f64;
+ Table[RTLIB::LOG2_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::LOG10_F32] = f32_func_f32;
+ Table[RTLIB::LOG10_F64] = f64_func_f64;
+ Table[RTLIB::LOG10_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::EXP_F32] = f32_func_f32;
+ Table[RTLIB::EXP_F64] = f64_func_f64;
+ Table[RTLIB::EXP_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::EXP2_F32] = f32_func_f32;
+ Table[RTLIB::EXP2_F64] = f64_func_f64;
+ Table[RTLIB::EXP2_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::SIN_F32] = f32_func_f32;
+ Table[RTLIB::SIN_F64] = f64_func_f64;
+ Table[RTLIB::SIN_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::COS_F32] = f32_func_f32;
+ Table[RTLIB::COS_F64] = f64_func_f64;
+ Table[RTLIB::COS_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::SINCOS_F32] = func_f32_iPTR_iPTR;
+ Table[RTLIB::SINCOS_F64] = func_f64_iPTR_iPTR;
+ Table[RTLIB::SINCOS_F128] = func_i64_i64_iPTR_iPTR;
+ Table[RTLIB::POW_F32] = f32_func_f32_f32;
+ Table[RTLIB::POW_F64] = f64_func_f64_f64;
+ Table[RTLIB::POW_F128] = func_iPTR_i64_i64_i64_i64;
+ Table[RTLIB::CEIL_F32] = f32_func_f32;
+ Table[RTLIB::CEIL_F64] = f64_func_f64;
+ Table[RTLIB::CEIL_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::TRUNC_F32] = f32_func_f32;
+ Table[RTLIB::TRUNC_F64] = f64_func_f64;
+ Table[RTLIB::TRUNC_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::RINT_F32] = f32_func_f32;
+ Table[RTLIB::RINT_F64] = f64_func_f64;
+ Table[RTLIB::RINT_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::NEARBYINT_F32] = f32_func_f32;
+ Table[RTLIB::NEARBYINT_F64] = f64_func_f64;
+ Table[RTLIB::NEARBYINT_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::ROUND_F32] = f32_func_f32;
+ Table[RTLIB::ROUND_F64] = f64_func_f64;
+ Table[RTLIB::ROUND_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::FLOOR_F32] = f32_func_f32;
+ Table[RTLIB::FLOOR_F64] = f64_func_f64;
+ Table[RTLIB::FLOOR_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::COPYSIGN_F32] = f32_func_f32_f32;
+ Table[RTLIB::COPYSIGN_F64] = f64_func_f64_f64;
+ Table[RTLIB::COPYSIGN_F128] = func_iPTR_i64_i64_i64_i64;
+ Table[RTLIB::FMIN_F32] = f32_func_f32_f32;
+ Table[RTLIB::FMIN_F64] = f64_func_f64_f64;
+ Table[RTLIB::FMIN_F128] = func_iPTR_i64_i64_i64_i64;
+ Table[RTLIB::FMAX_F32] = f32_func_f32_f32;
+ Table[RTLIB::FMAX_F64] = f64_func_f64_f64;
+ Table[RTLIB::FMAX_F128] = func_iPTR_i64_i64_i64_i64;
+
+ // Conversion
+    // All F80 and PPCF128 routines are unsupported.
+ Table[RTLIB::FPEXT_F64_F128] = func_iPTR_f64;
+ Table[RTLIB::FPEXT_F32_F128] = func_iPTR_f32;
+ Table[RTLIB::FPEXT_F32_F64] = f64_func_f32;
+ Table[RTLIB::FPEXT_F16_F32] = f32_func_i16;
+ Table[RTLIB::FPROUND_F32_F16] = i16_func_f32;
+ Table[RTLIB::FPROUND_F64_F32] = f32_func_f64;
+ Table[RTLIB::FPROUND_F128_F32] = f32_func_i64_i64;
+ Table[RTLIB::FPROUND_F128_F64] = f64_func_i64_i64;
+ Table[RTLIB::FPTOSINT_F32_I32] = i32_func_f32;
+ Table[RTLIB::FPTOSINT_F32_I64] = i64_func_f32;
+ Table[RTLIB::FPTOSINT_F32_I128] = i64_i64_func_f32;
+ Table[RTLIB::FPTOSINT_F64_I32] = i32_func_f64;
+ Table[RTLIB::FPTOSINT_F64_I64] = i64_func_f64;
+ Table[RTLIB::FPTOSINT_F64_I128] = i64_i64_func_f64;
+ Table[RTLIB::FPTOSINT_F128_I32] = i32_func_i64_i64;
+ Table[RTLIB::FPTOSINT_F128_I64] = i64_func_i64_i64;
+ Table[RTLIB::FPTOSINT_F128_I128] = i64_i64_func_i64_i64;
+ Table[RTLIB::FPTOUINT_F32_I32] = i32_func_f32;
+ Table[RTLIB::FPTOUINT_F32_I64] = i64_func_f32;
+ Table[RTLIB::FPTOUINT_F32_I128] = i64_i64_func_f32;
+ Table[RTLIB::FPTOUINT_F64_I32] = i32_func_f64;
+ Table[RTLIB::FPTOUINT_F64_I64] = i64_func_f64;
+ Table[RTLIB::FPTOUINT_F64_I128] = i64_i64_func_f64;
+ Table[RTLIB::FPTOUINT_F128_I32] = i32_func_i64_i64;
+ Table[RTLIB::FPTOUINT_F128_I64] = i64_func_i64_i64;
+ Table[RTLIB::FPTOUINT_F128_I128] = i64_i64_func_i64_i64;
+ Table[RTLIB::SINTTOFP_I32_F32] = f32_func_i32;
+ Table[RTLIB::SINTTOFP_I32_F64] = f64_func_i32;
+ Table[RTLIB::SINTTOFP_I32_F128] = func_iPTR_i32;
+ Table[RTLIB::SINTTOFP_I64_F32] = f32_func_i64;
+ Table[RTLIB::SINTTOFP_I64_F64] = f64_func_i64;
+ Table[RTLIB::SINTTOFP_I64_F128] = func_iPTR_i64;
+ Table[RTLIB::SINTTOFP_I128_F32] = f32_func_i64_i64;
+ Table[RTLIB::SINTTOFP_I128_F64] = f64_func_i64_i64;
+ Table[RTLIB::SINTTOFP_I128_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::UINTTOFP_I32_F32] = f32_func_i32;
+ Table[RTLIB::UINTTOFP_I32_F64] = f64_func_i64;
+ Table[RTLIB::UINTTOFP_I32_F128] = func_iPTR_i32;
+ Table[RTLIB::UINTTOFP_I64_F32] = f32_func_i64;
+ Table[RTLIB::UINTTOFP_I64_F64] = f64_func_i64;
+ Table[RTLIB::UINTTOFP_I64_F128] = func_iPTR_i64;
+ Table[RTLIB::UINTTOFP_I128_F32] = f32_func_i64_i64;
+ Table[RTLIB::UINTTOFP_I128_F64] = f64_func_i64_i64;
+ Table[RTLIB::UINTTOFP_I128_F128] = func_iPTR_i64_i64;
+
+ // Comparison
+    // All F80 and PPCF128 routines are unsupported.
+ Table[RTLIB::OEQ_F32] = i32_func_f32_f32;
+ Table[RTLIB::OEQ_F64] = i32_func_f64_f64;
+ Table[RTLIB::OEQ_F128] = i32_func_i64_i64_i64_i64;
+ Table[RTLIB::UNE_F32] = i32_func_f32_f32;
+ Table[RTLIB::UNE_F64] = i32_func_f64_f64;
+ Table[RTLIB::UNE_F128] = i32_func_i64_i64_i64_i64;
+ Table[RTLIB::OGE_F32] = i32_func_f32_f32;
+ Table[RTLIB::OGE_F64] = i32_func_f64_f64;
+ Table[RTLIB::OGE_F128] = i32_func_i64_i64_i64_i64;
+ Table[RTLIB::OLT_F32] = i32_func_f32_f32;
+ Table[RTLIB::OLT_F64] = i32_func_f64_f64;
+ Table[RTLIB::OLT_F128] = i32_func_i64_i64_i64_i64;
+ Table[RTLIB::OLE_F32] = i32_func_f32_f32;
+ Table[RTLIB::OLE_F64] = i32_func_f64_f64;
+ Table[RTLIB::OLE_F128] = i32_func_i64_i64_i64_i64;
+ Table[RTLIB::OGT_F32] = i32_func_f32_f32;
+ Table[RTLIB::OGT_F64] = i32_func_f64_f64;
+ Table[RTLIB::OGT_F128] = i32_func_i64_i64_i64_i64;
+ Table[RTLIB::UO_F32] = i32_func_f32_f32;
+ Table[RTLIB::UO_F64] = i32_func_f64_f64;
+ Table[RTLIB::UO_F128] = i32_func_i64_i64_i64_i64;
+    // O_FXX has the weird property that it uses the same libcall name as UO_FXX,
+    // which breaks our name-based lookup. Fortunately, only the UO family of
+    // libcalls appears to be actually used.
+ Table[RTLIB::O_F32] = unsupported;
+ Table[RTLIB::O_F64] = unsupported;
+ Table[RTLIB::O_F128] = unsupported;
+
+ // Memory
+ Table[RTLIB::MEMCPY] = iPTR_func_iPTR_iPTR_iPTR;
+ Table[RTLIB::MEMSET] = iPTR_func_iPTR_i32_iPTR;
+ Table[RTLIB::MEMMOVE] = iPTR_func_iPTR_iPTR_iPTR;
+
+ // Element-wise Atomic memory
+ // TODO: Fix these when we implement atomic support
+ Table[RTLIB::MEMCPY_ELEMENT_UNORDERED_ATOMIC_1] = unsupported;
+ Table[RTLIB::MEMCPY_ELEMENT_UNORDERED_ATOMIC_2] = unsupported;
+ Table[RTLIB::MEMCPY_ELEMENT_UNORDERED_ATOMIC_4] = unsupported;
+ Table[RTLIB::MEMCPY_ELEMENT_UNORDERED_ATOMIC_8] = unsupported;
+ Table[RTLIB::MEMCPY_ELEMENT_UNORDERED_ATOMIC_16] = unsupported;
+ Table[RTLIB::MEMMOVE_ELEMENT_UNORDERED_ATOMIC_1] = unsupported;
+ Table[RTLIB::MEMMOVE_ELEMENT_UNORDERED_ATOMIC_2] = unsupported;
+ Table[RTLIB::MEMMOVE_ELEMENT_UNORDERED_ATOMIC_4] = unsupported;
+ Table[RTLIB::MEMMOVE_ELEMENT_UNORDERED_ATOMIC_8] = unsupported;
+ Table[RTLIB::MEMMOVE_ELEMENT_UNORDERED_ATOMIC_16] = unsupported;
+
+ Table[RTLIB::MEMSET_ELEMENT_UNORDERED_ATOMIC_1] = unsupported;
+ Table[RTLIB::MEMSET_ELEMENT_UNORDERED_ATOMIC_2] = unsupported;
+ Table[RTLIB::MEMSET_ELEMENT_UNORDERED_ATOMIC_4] = unsupported;
+ Table[RTLIB::MEMSET_ELEMENT_UNORDERED_ATOMIC_8] = unsupported;
+ Table[RTLIB::MEMSET_ELEMENT_UNORDERED_ATOMIC_16] = unsupported;
+
+ // Atomic '__sync_*' libcalls.
+ // TODO: Fix these when we implement atomic support
+ Table[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_1] = unsupported;
+ Table[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2] = unsupported;
+ Table[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_4] = unsupported;
+ Table[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_8] = unsupported;
+ Table[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_16] = unsupported;
+ Table[RTLIB::SYNC_LOCK_TEST_AND_SET_1] = unsupported;
+ Table[RTLIB::SYNC_LOCK_TEST_AND_SET_2] = unsupported;
+ Table[RTLIB::SYNC_LOCK_TEST_AND_SET_4] = unsupported;
+ Table[RTLIB::SYNC_LOCK_TEST_AND_SET_8] = unsupported;
+ Table[RTLIB::SYNC_LOCK_TEST_AND_SET_16] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_ADD_1] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_ADD_2] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_ADD_4] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_ADD_8] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_ADD_16] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_SUB_1] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_SUB_2] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_SUB_4] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_SUB_8] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_SUB_16] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_AND_1] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_AND_2] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_AND_4] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_AND_8] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_AND_16] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_OR_1] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_OR_2] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_OR_4] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_OR_8] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_OR_16] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_XOR_1] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_XOR_2] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_XOR_4] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_XOR_8] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_XOR_16] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_NAND_1] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_NAND_2] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_NAND_4] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_NAND_8] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_NAND_16] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_MAX_1] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_MAX_2] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_MAX_4] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_MAX_8] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_MAX_16] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_UMAX_1] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_UMAX_2] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_UMAX_4] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_UMAX_8] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_UMAX_16] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_MIN_1] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_MIN_2] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_MIN_4] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_MIN_8] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_MIN_16] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_UMIN_1] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_UMIN_2] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_UMIN_4] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_UMIN_8] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_UMIN_16] = unsupported;
+
+ // Atomic '__atomic_*' libcalls.
+ // TODO: Fix these when we implement atomic support
+ Table[RTLIB::ATOMIC_LOAD] = unsupported;
+ Table[RTLIB::ATOMIC_LOAD_1] = unsupported;
+ Table[RTLIB::ATOMIC_LOAD_2] = unsupported;
+ Table[RTLIB::ATOMIC_LOAD_4] = unsupported;
+ Table[RTLIB::ATOMIC_LOAD_8] = unsupported;
+ Table[RTLIB::ATOMIC_LOAD_16] = unsupported;
+
+ Table[RTLIB::ATOMIC_STORE] = unsupported;
+ Table[RTLIB::ATOMIC_STORE_1] = unsupported;
+ Table[RTLIB::ATOMIC_STORE_2] = unsupported;
+ Table[RTLIB::ATOMIC_STORE_4] = unsupported;
+ Table[RTLIB::ATOMIC_STORE_8] = unsupported;
+ Table[RTLIB::ATOMIC_STORE_16] = unsupported;
+
+ Table[RTLIB::ATOMIC_EXCHANGE] = unsupported;
+ Table[RTLIB::ATOMIC_EXCHANGE_1] = unsupported;
+ Table[RTLIB::ATOMIC_EXCHANGE_2] = unsupported;
+ Table[RTLIB::ATOMIC_EXCHANGE_4] = unsupported;
+ Table[RTLIB::ATOMIC_EXCHANGE_8] = unsupported;
+ Table[RTLIB::ATOMIC_EXCHANGE_16] = unsupported;
+
+ Table[RTLIB::ATOMIC_COMPARE_EXCHANGE] = unsupported;
+ Table[RTLIB::ATOMIC_COMPARE_EXCHANGE_1] = unsupported;
+ Table[RTLIB::ATOMIC_COMPARE_EXCHANGE_2] = unsupported;
+ Table[RTLIB::ATOMIC_COMPARE_EXCHANGE_4] = unsupported;
+ Table[RTLIB::ATOMIC_COMPARE_EXCHANGE_8] = unsupported;
+ Table[RTLIB::ATOMIC_COMPARE_EXCHANGE_16] = unsupported;
+
+ Table[RTLIB::ATOMIC_FETCH_ADD_1] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_ADD_2] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_ADD_4] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_ADD_8] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_ADD_16] = unsupported;
+
+ Table[RTLIB::ATOMIC_FETCH_SUB_1] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_SUB_2] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_SUB_4] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_SUB_8] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_SUB_16] = unsupported;
+
+ Table[RTLIB::ATOMIC_FETCH_AND_1] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_AND_2] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_AND_4] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_AND_8] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_AND_16] = unsupported;
+
+ Table[RTLIB::ATOMIC_FETCH_OR_1] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_OR_2] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_OR_4] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_OR_8] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_OR_16] = unsupported;
+
+ Table[RTLIB::ATOMIC_FETCH_XOR_1] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_XOR_2] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_XOR_4] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_XOR_8] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_XOR_16] = unsupported;
+
+ Table[RTLIB::ATOMIC_FETCH_NAND_1] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_NAND_2] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_NAND_4] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_NAND_8] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_NAND_16] = unsupported;
+ }
};
-static const char *
-RuntimeLibcallNames[RTLIB::UNKNOWN_LIBCALL] = {
-/* SHL_I16 */ "__ashlhi3",
-/* SHL_I32 */ "__ashlsi3",
-/* SHL_I64 */ "__ashldi3",
-/* SHL_I128 */ "__ashlti3",
-/* SRL_I16 */ "__lshrhi3",
-/* SRL_I32 */ "__lshrsi3",
-/* SRL_I64 */ "__lshrdi3",
-/* SRL_I128 */ "__lshrti3",
-/* SRA_I16 */ "__ashrhi3",
-/* SRA_I32 */ "__ashrsi3",
-/* SRA_I64 */ "__ashrdi3",
-/* SRA_I128 */ "__ashrti3",
-/* MUL_I8 */ "__mulqi3",
-/* MUL_I16 */ "__mulhi3",
-/* MUL_I32 */ "__mulsi3",
-/* MUL_I64 */ "__muldi3",
-/* MUL_I128 */ "__multi3",
-/* MULO_I32 */ "__mulosi4",
-/* MULO_I64 */ "__mulodi4",
-/* MULO_I128 */ "__muloti4",
-/* SDIV_I8 */ "__divqi3",
-/* SDIV_I16 */ "__divhi3",
-/* SDIV_I32 */ "__divsi3",
-/* SDIV_I64 */ "__divdi3",
-/* SDIV_I128 */ "__divti3",
-/* UDIV_I8 */ "__udivqi3",
-/* UDIV_I16 */ "__udivhi3",
-/* UDIV_I32 */ "__udivsi3",
-/* UDIV_I64 */ "__udivdi3",
-/* UDIV_I128 */ "__udivti3",
-/* SREM_I8 */ "__modqi3",
-/* SREM_I16 */ "__modhi3",
-/* SREM_I32 */ "__modsi3",
-/* SREM_I64 */ "__moddi3",
-/* SREM_I128 */ "__modti3",
-/* UREM_I8 */ "__umodqi3",
-/* UREM_I16 */ "__umodhi3",
-/* UREM_I32 */ "__umodsi3",
-/* UREM_I64 */ "__umoddi3",
-/* UREM_I128 */ "__umodti3",
-/* SDIVREM_I8 */ nullptr,
-/* SDIVREM_I16 */ nullptr,
-/* SDIVREM_I32 */ nullptr,
-/* SDIVREM_I64 */ nullptr,
-/* SDIVREM_I128 */ nullptr,
-/* UDIVREM_I8 */ nullptr,
-/* UDIVREM_I16 */ nullptr,
-/* UDIVREM_I32 */ nullptr,
-/* UDIVREM_I64 */ nullptr,
-/* UDIVREM_I128 */ nullptr,
-/* NEG_I32 */ "__negsi2",
-/* NEG_I64 */ "__negdi2",
-/* ADD_F32 */ "__addsf3",
-/* ADD_F64 */ "__adddf3",
-/* ADD_F80 */ nullptr,
-/* ADD_F128 */ "__addtf3",
-/* ADD_PPCF128 */ nullptr,
-/* SUB_F32 */ "__subsf3",
-/* SUB_F64 */ "__subdf3",
-/* SUB_F80 */ nullptr,
-/* SUB_F128 */ "__subtf3",
-/* SUB_PPCF128 */ nullptr,
-/* MUL_F32 */ "__mulsf3",
-/* MUL_F64 */ "__muldf3",
-/* MUL_F80 */ nullptr,
-/* MUL_F128 */ "__multf3",
-/* MUL_PPCF128 */ nullptr,
-/* DIV_F32 */ "__divsf3",
-/* DIV_F64 */ "__divdf3",
-/* DIV_F80 */ nullptr,
-/* DIV_F128 */ "__divtf3",
-/* DIV_PPCF128 */ nullptr,
-/* REM_F32 */ "fmodf",
-/* REM_F64 */ "fmod",
-/* REM_F80 */ nullptr,
-/* REM_F128 */ "fmodl",
-/* REM_PPCF128 */ nullptr,
-/* FMA_F32 */ "fmaf",
-/* FMA_F64 */ "fma",
-/* FMA_F80 */ nullptr,
-/* FMA_F128 */ "fmal",
-/* FMA_PPCF128 */ nullptr,
-/* POWI_F32 */ "__powisf2",
-/* POWI_F64 */ "__powidf2",
-/* POWI_F80 */ nullptr,
-/* POWI_F128 */ "__powitf2",
-/* POWI_PPCF128 */ nullptr,
-/* SQRT_F32 */ "sqrtf",
-/* SQRT_F64 */ "sqrt",
-/* SQRT_F80 */ nullptr,
-/* SQRT_F128 */ "sqrtl",
-/* SQRT_PPCF128 */ nullptr,
-/* LOG_F32 */ "logf",
-/* LOG_F64 */ "log",
-/* LOG_F80 */ nullptr,
-/* LOG_F128 */ "logl",
-/* LOG_PPCF128 */ nullptr,
-/* LOG2_F32 */ "log2f",
-/* LOG2_F64 */ "log2",
-/* LOG2_F80 */ nullptr,
-/* LOG2_F128 */ "log2l",
-/* LOG2_PPCF128 */ nullptr,
-/* LOG10_F32 */ "log10f",
-/* LOG10_F64 */ "log10",
-/* LOG10_F80 */ nullptr,
-/* LOG10_F128 */ "log10l",
-/* LOG10_PPCF128 */ nullptr,
-/* EXP_F32 */ "expf",
-/* EXP_F64 */ "exp",
-/* EXP_F80 */ nullptr,
-/* EXP_F128 */ "expl",
-/* EXP_PPCF128 */ nullptr,
-/* EXP2_F32 */ "exp2f",
-/* EXP2_F64 */ "exp2",
-/* EXP2_F80 */ nullptr,
-/* EXP2_F128 */ "exp2l",
-/* EXP2_PPCF128 */ nullptr,
-/* SIN_F32 */ "sinf",
-/* SIN_F64 */ "sin",
-/* SIN_F80 */ nullptr,
-/* SIN_F128 */ "sinl",
-/* SIN_PPCF128 */ nullptr,
-/* COS_F32 */ "cosf",
-/* COS_F64 */ "cos",
-/* COS_F80 */ nullptr,
-/* COS_F128 */ "cosl",
-/* COS_PPCF128 */ nullptr,
-/* SINCOS_F32 */ "sincosf",
-/* SINCOS_F64 */ "sincos",
-/* SINCOS_F80 */ nullptr,
-/* SINCOS_F128 */ "sincosl",
-/* SINCOS_PPCF128 */ nullptr,
-/* SINCOS_STRET_F32 */ nullptr,
-/* SINCOS_STRET_F64 */ nullptr,
-/* POW_F32 */ "powf",
-/* POW_F64 */ "pow",
-/* POW_F80 */ nullptr,
-/* POW_F128 */ "powl",
-/* POW_PPCF128 */ nullptr,
-/* CEIL_F32 */ "ceilf",
-/* CEIL_F64 */ "ceil",
-/* CEIL_F80 */ nullptr,
-/* CEIL_F128 */ "ceill",
-/* CEIL_PPCF128 */ nullptr,
-/* TRUNC_F32 */ "truncf",
-/* TRUNC_F64 */ "trunc",
-/* TRUNC_F80 */ nullptr,
-/* TRUNC_F128 */ "truncl",
-/* TRUNC_PPCF128 */ nullptr,
-/* RINT_F32 */ "rintf",
-/* RINT_F64 */ "rint",
-/* RINT_F80 */ nullptr,
-/* RINT_F128 */ "rintl",
-/* RINT_PPCF128 */ nullptr,
-/* NEARBYINT_F32 */ "nearbyintf",
-/* NEARBYINT_F64 */ "nearbyint",
-/* NEARBYINT_F80 */ nullptr,
-/* NEARBYINT_F128 */ "nearbyintl",
-/* NEARBYINT_PPCF128 */ nullptr,
-/* ROUND_F32 */ "roundf",
-/* ROUND_F64 */ "round",
-/* ROUND_F80 */ nullptr,
-/* ROUND_F128 */ "roundl",
-/* ROUND_PPCF128 */ nullptr,
-/* FLOOR_F32 */ "floorf",
-/* FLOOR_F64 */ "floor",
-/* FLOOR_F80 */ nullptr,
-/* FLOOR_F128 */ "floorl",
-/* FLOOR_PPCF128 */ nullptr,
-/* COPYSIGN_F32 */ "copysignf",
-/* COPYSIGN_F64 */ "copysign",
-/* COPYSIGN_F80 */ nullptr,
-/* COPYSIGN_F128 */ "copysignl",
-/* COPYSIGN_PPCF128 */ nullptr,
-/* FMIN_F32 */ "fminf",
-/* FMIN_F64 */ "fmin",
-/* FMIN_F80 */ nullptr,
-/* FMIN_F128 */ "fminl",
-/* FMIN_PPCF128 */ nullptr,
-/* FMAX_F32 */ "fmaxf",
-/* FMAX_F64 */ "fmax",
-/* FMAX_F80 */ nullptr,
-/* FMAX_F128 */ "fmaxl",
-/* FMAX_PPCF128 */ nullptr,
-/* FPEXT_F32_PPCF128 */ nullptr,
-/* FPEXT_F64_PPCF128 */ nullptr,
-/* FPEXT_F64_F128 */ "__extenddftf2",
-/* FPEXT_F32_F128 */ "__extendsftf2",
-/* FPEXT_F32_F64 */ "__extendsfdf2",
-/* FPEXT_F16_F32 */ "__gnu_h2f_ieee",
-/* FPROUND_F32_F16 */ "__gnu_f2h_ieee",
-/* FPROUND_F64_F16 */ nullptr,
-/* FPROUND_F80_F16 */ nullptr,
-/* FPROUND_F128_F16 */ nullptr,
-/* FPROUND_PPCF128_F16 */ nullptr,
-/* FPROUND_F64_F32 */ "__truncdfsf2",
-/* FPROUND_F80_F32 */ "__truncxfsf2",
-/* FPROUND_F128_F32 */ "__trunctfsf2",
-/* FPROUND_PPCF128_F32 */ nullptr,
-/* FPROUND_F80_F64 */ "__truncxfdf2",
-/* FPROUND_F128_F64 */ "__trunctfdf2",
-/* FPROUND_PPCF128_F64 */ nullptr,
-/* FPTOSINT_F32_I32 */ "__fixsfsi",
-/* FPTOSINT_F32_I64 */ "__fixsfdi",
-/* FPTOSINT_F32_I128 */ "__fixsfti",
-/* FPTOSINT_F64_I32 */ "__fixdfsi",
-/* FPTOSINT_F64_I64 */ "__fixdfdi",
-/* FPTOSINT_F64_I128 */ "__fixdfti",
-/* FPTOSINT_F80_I32 */ "__fixxfsi",
-/* FPTOSINT_F80_I64 */ "__fixxfdi",
-/* FPTOSINT_F80_I128 */ "__fixxfti",
-/* FPTOSINT_F128_I32 */ "__fixtfsi",
-/* FPTOSINT_F128_I64 */ "__fixtfdi",
-/* FPTOSINT_F128_I128 */ "__fixtfti",
-/* FPTOSINT_PPCF128_I32 */ nullptr,
-/* FPTOSINT_PPCF128_I64 */ nullptr,
-/* FPTOSINT_PPCF128_I128 */ nullptr,
-/* FPTOUINT_F32_I32 */ "__fixunssfsi",
-/* FPTOUINT_F32_I64 */ "__fixunssfdi",
-/* FPTOUINT_F32_I128 */ "__fixunssfti",
-/* FPTOUINT_F64_I32 */ "__fixunsdfsi",
-/* FPTOUINT_F64_I64 */ "__fixunsdfdi",
-/* FPTOUINT_F64_I128 */ "__fixunsdfti",
-/* FPTOUINT_F80_I32 */ "__fixunsxfsi",
-/* FPTOUINT_F80_I64 */ "__fixunsxfdi",
-/* FPTOUINT_F80_I128 */ "__fixunsxfti",
-/* FPTOUINT_F128_I32 */ "__fixunstfsi",
-/* FPTOUINT_F128_I64 */ "__fixunstfdi",
-/* FPTOUINT_F128_I128 */ "__fixunstfti",
-/* FPTOUINT_PPCF128_I32 */ nullptr,
-/* FPTOUINT_PPCF128_I64 */ nullptr,
-/* FPTOUINT_PPCF128_I128 */ nullptr,
-/* SINTTOFP_I32_F32 */ "__floatsisf",
-/* SINTTOFP_I32_F64 */ "__floatsidf",
-/* SINTTOFP_I32_F80 */ nullptr,
-/* SINTTOFP_I32_F128 */ "__floatsitf",
-/* SINTTOFP_I32_PPCF128 */ nullptr,
-/* SINTTOFP_I64_F32 */ "__floatdisf",
-/* SINTTOFP_I64_F64 */ "__floatdidf",
-/* SINTTOFP_I64_F80 */ nullptr,
-/* SINTTOFP_I64_F128 */ "__floatditf",
-/* SINTTOFP_I64_PPCF128 */ nullptr,
-/* SINTTOFP_I128_F32 */ "__floattisf",
-/* SINTTOFP_I128_F64 */ "__floattidf",
-/* SINTTOFP_I128_F80 */ nullptr,
-/* SINTTOFP_I128_F128 */ "__floattitf",
-/* SINTTOFP_I128_PPCF128 */ nullptr,
-/* UINTTOFP_I32_F32 */ "__floatunsisf",
-/* UINTTOFP_I32_F64 */ "__floatunsidf",
-/* UINTTOFP_I32_F80 */ nullptr,
-/* UINTTOFP_I32_F128 */ "__floatunsitf",
-/* UINTTOFP_I32_PPCF128 */ nullptr,
-/* UINTTOFP_I64_F32 */ "__floatundisf",
-/* UINTTOFP_I64_F64 */ "__floatundidf",
-/* UINTTOFP_I64_F80 */ nullptr,
-/* UINTTOFP_I64_F128 */ "__floatunditf",
-/* UINTTOFP_I64_PPCF128 */ nullptr,
-/* UINTTOFP_I128_F32 */ "__floatuntisf",
-/* UINTTOFP_I128_F64 */ "__floatuntidf",
-/* UINTTOFP_I128_F80 */ nullptr,
-/* UINTTOFP_I128_F128 */ "__floatuntitf",
-/* UINTTOFP_I128_PPCF128 */ nullptr,
-/* OEQ_F32 */ "__eqsf2",
-/* OEQ_F64 */ "__eqdf2",
-/* OEQ_F128 */ "__eqtf2",
-/* OEQ_PPCF128 */ nullptr,
-/* UNE_F32 */ "__nesf2",
-/* UNE_F64 */ "__nedf2",
-/* UNE_F128 */ "__netf2",
-/* UNE_PPCF128 */ nullptr,
-/* OGE_F32 */ "__gesf2",
-/* OGE_F64 */ "__gedf2",
-/* OGE_F128 */ "__getf2",
-/* OGE_PPCF128 */ nullptr,
-/* OLT_F32 */ "__ltsf2",
-/* OLT_F64 */ "__ltdf2",
-/* OLT_F128 */ "__lttf2",
-/* OLT_PPCF128 */ nullptr,
-/* OLE_F32 */ "__lesf2",
-/* OLE_F64 */ "__ledf2",
-/* OLE_F128 */ "__letf2",
-/* OLE_PPCF128 */ nullptr,
-/* OGT_F32 */ "__gtsf2",
-/* OGT_F64 */ "__gtdf2",
-/* OGT_F128 */ "__gttf2",
-/* OGT_PPCF128 */ nullptr,
-/* UO_F32 */ "__unordsf2",
-/* UO_F64 */ "__unorddf2",
-/* UO_F128 */ "__unordtf2",
-/* UO_PPCF128 */ nullptr,
-/* O_F32 */ "__unordsf2",
-/* O_F64 */ "__unorddf2",
-/* O_F128 */ "__unordtf2",
-/* O_PPCF128 */ nullptr,
-/* MEMCPY */ "memcpy",
-/* MEMMOVE */ "memset",
-/* MEMSET */ "memmove",
-/* BZERO */ nullptr,
-/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_1 */ nullptr,
-/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_2 */ nullptr,
-/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_4 */ nullptr,
-/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_8 */ nullptr,
-/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_16 */ nullptr,
-/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_1 */ nullptr,
-/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_2 */ nullptr,
-/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_4 */ nullptr,
-/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_8 */ nullptr,
-/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_16 */ nullptr,
-/* MEMSET_ELEMENT_UNORDERED_ATOMIC_1 */ nullptr,
-/* MEMSET_ELEMENT_UNORDERED_ATOMIC_2 */ nullptr,
-/* MEMSET_ELEMENT_UNORDERED_ATOMIC_4 */ nullptr,
-/* MEMSET_ELEMENT_UNORDERED_ATOMIC_8 */ nullptr,
-/* MEMSET_ELEMENT_UNORDERED_ATOMIC_16 */ nullptr,
-/* UNWIND_RESUME */ "_Unwind_Resume",
-/* SYNC_VAL_COMPARE_AND_SWAP_1 */ "__sync_val_compare_and_swap_1",
-/* SYNC_VAL_COMPARE_AND_SWAP_2 */ "__sync_val_compare_and_swap_2",
-/* SYNC_VAL_COMPARE_AND_SWAP_4 */ "__sync_val_compare_and_swap_4",
-/* SYNC_VAL_COMPARE_AND_SWAP_8 */ "__sync_val_compare_and_swap_8",
-/* SYNC_VAL_COMPARE_AND_SWAP_16 */ "__sync_val_compare_and_swap_16",
-/* SYNC_LOCK_TEST_AND_SET_1 */ "__sync_lock_test_and_set_1",
-/* SYNC_LOCK_TEST_AND_SET_2 */ "__sync_lock_test_and_set_2",
-/* SYNC_LOCK_TEST_AND_SET_4 */ "__sync_lock_test_and_set_4",
-/* SYNC_LOCK_TEST_AND_SET_8 */ "__sync_lock_test_and_set_8",
-/* SYNC_LOCK_TEST_AND_SET_16 */ "__sync_lock_test_and_set_16",
-/* SYNC_FETCH_AND_ADD_1 */ "__sync_fetch_and_add_1",
-/* SYNC_FETCH_AND_ADD_2 */ "__sync_fetch_and_add_2",
-/* SYNC_FETCH_AND_ADD_4 */ "__sync_fetch_and_add_4",
-/* SYNC_FETCH_AND_ADD_8 */ "__sync_fetch_and_add_8",
-/* SYNC_FETCH_AND_ADD_16 */ "__sync_fetch_and_add_16",
-/* SYNC_FETCH_AND_SUB_1 */ "__sync_fetch_and_sub_1",
-/* SYNC_FETCH_AND_SUB_2 */ "__sync_fetch_and_sub_2",
-/* SYNC_FETCH_AND_SUB_4 */ "__sync_fetch_and_sub_4",
-/* SYNC_FETCH_AND_SUB_8 */ "__sync_fetch_and_sub_8",
-/* SYNC_FETCH_AND_SUB_16 */ "__sync_fetch_and_sub_16",
-/* SYNC_FETCH_AND_AND_1 */ "__sync_fetch_and_and_1",
-/* SYNC_FETCH_AND_AND_2 */ "__sync_fetch_and_and_2",
-/* SYNC_FETCH_AND_AND_4 */ "__sync_fetch_and_and_4",
-/* SYNC_FETCH_AND_AND_8 */ "__sync_fetch_and_and_8",
-/* SYNC_FETCH_AND_AND_16 */ "__sync_fetch_and_and_16",
-/* SYNC_FETCH_AND_OR_1 */ "__sync_fetch_and_or_1",
-/* SYNC_FETCH_AND_OR_2 */ "__sync_fetch_and_or_2",
-/* SYNC_FETCH_AND_OR_4 */ "__sync_fetch_and_or_4",
-/* SYNC_FETCH_AND_OR_8 */ "__sync_fetch_and_or_8",
-/* SYNC_FETCH_AND_OR_16 */ "__sync_fetch_and_or_16",
-/* SYNC_FETCH_AND_XOR_1 */ "__sync_fetch_and_xor_1",
-/* SYNC_FETCH_AND_XOR_2 */ "__sync_fetch_and_xor_2",
-/* SYNC_FETCH_AND_XOR_4 */ "__sync_fetch_and_xor_4",
-/* SYNC_FETCH_AND_XOR_8 */ "__sync_fetch_and_xor_8",
-/* SYNC_FETCH_AND_XOR_16 */ "__sync_fetch_and_xor_16",
-/* SYNC_FETCH_AND_NAND_1 */ "__sync_fetch_and_nand_1",
-/* SYNC_FETCH_AND_NAND_2 */ "__sync_fetch_and_nand_2",
-/* SYNC_FETCH_AND_NAND_4 */ "__sync_fetch_and_nand_4",
-/* SYNC_FETCH_AND_NAND_8 */ "__sync_fetch_and_nand_8",
-/* SYNC_FETCH_AND_NAND_16 */ "__sync_fetch_and_nand_16",
-/* SYNC_FETCH_AND_MAX_1 */ "__sync_fetch_and_max_1",
-/* SYNC_FETCH_AND_MAX_2 */ "__sync_fetch_and_max_2",
-/* SYNC_FETCH_AND_MAX_4 */ "__sync_fetch_and_max_4",
-/* SYNC_FETCH_AND_MAX_8 */ "__sync_fetch_and_max_8",
-/* SYNC_FETCH_AND_MAX_16 */ "__sync_fetch_and_max_16",
-/* SYNC_FETCH_AND_UMAX_1 */ "__sync_fetch_and_umax_1",
-/* SYNC_FETCH_AND_UMAX_2 */ "__sync_fetch_and_umax_2",
-/* SYNC_FETCH_AND_UMAX_4 */ "__sync_fetch_and_umax_4",
-/* SYNC_FETCH_AND_UMAX_8 */ "__sync_fetch_and_umax_8",
-/* SYNC_FETCH_AND_UMAX_16 */ "__sync_fetch_and_umax_16",
-/* SYNC_FETCH_AND_MIN_1 */ "__sync_fetch_and_min_1",
-/* SYNC_FETCH_AND_MIN_2 */ "__sync_fetch_and_min_2",
-/* SYNC_FETCH_AND_MIN_4 */ "__sync_fetch_and_min_4",
-/* SYNC_FETCH_AND_MIN_8 */ "__sync_fetch_and_min_8",
-/* SYNC_FETCH_AND_MIN_16 */ "__sync_fetch_and_min_16",
-/* SYNC_FETCH_AND_UMIN_1 */ "__sync_fetch_and_umin_1",
-/* SYNC_FETCH_AND_UMIN_2 */ "__sync_fetch_and_umin_2",
-/* SYNC_FETCH_AND_UMIN_4 */ "__sync_fetch_and_umin_4",
-/* SYNC_FETCH_AND_UMIN_8 */ "__sync_fetch_and_umin_8",
-/* SYNC_FETCH_AND_UMIN_16 */ "__sync_fetch_and_umin_16",
-
-/* ATOMIC_LOAD */ "__atomic_load",
-/* ATOMIC_LOAD_1 */ "__atomic_load_1",
-/* ATOMIC_LOAD_2 */ "__atomic_load_2",
-/* ATOMIC_LOAD_4 */ "__atomic_load_4",
-/* ATOMIC_LOAD_8 */ "__atomic_load_8",
-/* ATOMIC_LOAD_16 */ "__atomic_load_16",
-
-/* ATOMIC_STORE */ "__atomic_store",
-/* ATOMIC_STORE_1 */ "__atomic_store_1",
-/* ATOMIC_STORE_2 */ "__atomic_store_2",
-/* ATOMIC_STORE_4 */ "__atomic_store_4",
-/* ATOMIC_STORE_8 */ "__atomic_store_8",
-/* ATOMIC_STORE_16 */ "__atomic_store_16",
-
-/* ATOMIC_EXCHANGE */ "__atomic_exchange",
-/* ATOMIC_EXCHANGE_1 */ "__atomic_exchange_1",
-/* ATOMIC_EXCHANGE_2 */ "__atomic_exchange_2",
-/* ATOMIC_EXCHANGE_4 */ "__atomic_exchange_4",
-/* ATOMIC_EXCHANGE_8 */ "__atomic_exchange_8",
-/* ATOMIC_EXCHANGE_16 */ "__atomic_exchange_16",
-
-/* ATOMIC_COMPARE_EXCHANGE */ "__atomic_compare_exchange",
-/* ATOMIC_COMPARE_EXCHANGE_1 */ "__atomic_compare_exchange_1",
-/* ATOMIC_COMPARE_EXCHANGE_2 */ "__atomic_compare_exchange_2",
-/* ATOMIC_COMPARE_EXCHANGE_4 */ "__atomic_compare_exchange_4",
-/* ATOMIC_COMPARE_EXCHANGE_8 */ "__atomic_compare_exchange_8",
-/* ATOMIC_COMPARE_EXCHANGE_16 */ "__atomic_compare_exchange_16",
+ManagedStatic<RuntimeLibcallSignatureTable> RuntimeLibcallSignatures;
+
+// Maps libcall names to their RTLIB::Libcall number. Builds the map in a
+// constructor for use with ManagedStatic
+struct StaticLibcallNameMap {
+ StringMap<RTLIB::Libcall> Map;
+ StaticLibcallNameMap() {
+ static const std::pair<const char *, RTLIB::Libcall> NameLibcalls[] = {
+#define HANDLE_LIBCALL(code, name) {(const char *)name, RTLIB::code},
+#include "llvm/IR/RuntimeLibcalls.def"
+#undef HANDLE_LIBCALL
+ };
+ for (const auto &NameLibcall : NameLibcalls) {
+ if (NameLibcall.first != nullptr &&
+ RuntimeLibcallSignatures->Table[NameLibcall.second] != unsupported) {
+ assert(Map.find(NameLibcall.first) == Map.end() &&
+ "duplicate libcall names in name map");
+ Map[NameLibcall.first] = NameLibcall.second;
+ }
+ }
+ }
+};
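As an illustrative sketch of the X-macro pattern used in the constructor above (the two entries below are hypothetical stand-ins for llvm/IR/RuntimeLibcalls.def, which supplies the real HANDLE_LIBCALL lines):

#include <utility>

namespace sketch {
enum Libcall { MEMCPY, SDIVREM_I32 };

// Each HANDLE_LIBCALL(code, name) expands to one {name, enum} pair. Entries
// whose name is nullptr are emitted as well; they are filtered out when the
// StringMap is populated.
static const std::pair<const char *, Libcall> Names[] = {
#define HANDLE_LIBCALL(code, name) {(const char *)name, code},
    HANDLE_LIBCALL(MEMCPY, "memcpy")
    HANDLE_LIBCALL(SDIVREM_I32, nullptr)
#undef HANDLE_LIBCALL
};
} // namespace sketch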
-/* ATOMIC_FETCH_ADD_1 */ "__atomic_fetch_add_1",
-/* ATOMIC_FETCH_ADD_2 */ "__atomic_fetch_add_2",
-/* ATOMIC_FETCH_ADD_4 */ "__atomic_fetch_add_4",
-/* ATOMIC_FETCH_ADD_8 */ "__atomic_fetch_add_8",
-/* ATOMIC_FETCH_ADD_16 */ "__atomic_fetch_add_16",
-/* ATOMIC_FETCH_SUB_1 */ "__atomic_fetch_sub_1",
-/* ATOMIC_FETCH_SUB_2 */ "__atomic_fetch_sub_2",
-/* ATOMIC_FETCH_SUB_4 */ "__atomic_fetch_sub_4",
-/* ATOMIC_FETCH_SUB_8 */ "__atomic_fetch_sub_8",
-/* ATOMIC_FETCH_SUB_16 */ "__atomic_fetch_sub_16",
-/* ATOMIC_FETCH_AND_1 */ "__atomic_fetch_and_1",
-/* ATOMIC_FETCH_AND_2 */ "__atomic_fetch_and_2",
-/* ATOMIC_FETCH_AND_4 */ "__atomic_fetch_and_4",
-/* ATOMIC_FETCH_AND_8 */ "__atomic_fetch_and_8",
-/* ATOMIC_FETCH_AND_16 */ "__atomic_fetch_and_16",
-/* ATOMIC_FETCH_OR_1 */ "__atomic_fetch_or_1",
-/* ATOMIC_FETCH_OR_2 */ "__atomic_fetch_or_2",
-/* ATOMIC_FETCH_OR_4 */ "__atomic_fetch_or_4",
-/* ATOMIC_FETCH_OR_8 */ "__atomic_fetch_or_8",
-/* ATOMIC_FETCH_OR_16 */ "__atomic_fetch_or_16",
-/* ATOMIC_FETCH_XOR_1 */ "__atomic_fetch_xor_1",
-/* ATOMIC_FETCH_XOR_2 */ "__atomic_fetch_xor_2",
-/* ATOMIC_FETCH_XOR_4 */ "__atomic_fetch_xor_4",
-/* ATOMIC_FETCH_XOR_8 */ "__atomic_fetch_xor_8",
-/* ATOMIC_FETCH_XOR_16 */ "__atomic_fetch_xor_16",
-/* ATOMIC_FETCH_NAND_1 */ "__atomic_fetch_nand_1",
-/* ATOMIC_FETCH_NAND_2 */ "__atomic_fetch_nand_2",
-/* ATOMIC_FETCH_NAND_4 */ "__atomic_fetch_nand_4",
-/* ATOMIC_FETCH_NAND_8 */ "__atomic_fetch_nand_8",
-/* ATOMIC_FETCH_NAND_16 */ "__atomic_fetch_nand_16",
+} // end anonymous namespace
-/* STACKPROTECTOR_CHECK_FAIL */ "__stack_chk_fail",
-/* DEOPTIMIZE */ "__llvm_deoptimize",
-};
void llvm::GetSignature(const WebAssemblySubtarget &Subtarget,
RTLIB::Libcall LC, SmallVectorImpl<wasm::ValType> &Rets,
@@ -1003,11 +494,11 @@ void llvm::GetSignature(const WebAssemblySubtarget &Subtarget,
assert(Rets.empty());
assert(Params.empty());
- WebAssembly::ExprType iPTR = Subtarget.hasAddr64() ?
- WebAssembly::ExprType::I64 :
- WebAssembly::ExprType::I32;
+ wasm::ValType iPTR =
+ Subtarget.hasAddr64() ? wasm::ValType::I64 : wasm::ValType::I32;
- switch (RuntimeLibcallSignatures[LC]) {
+ auto& Table = RuntimeLibcallSignatures->Table;
+ switch (Table[LC]) {
case func:
break;
case f32_func_f32:
@@ -1111,13 +602,13 @@ void llvm::GetSignature(const WebAssemblySubtarget &Subtarget,
break;
case func_f32_iPTR_iPTR:
Params.push_back(wasm::ValType::F32);
- Params.push_back(wasm::ValType(iPTR));
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
+ Params.push_back(iPTR);
break;
case func_f64_iPTR_iPTR:
Params.push_back(wasm::ValType::F64);
- Params.push_back(wasm::ValType(iPTR));
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
+ Params.push_back(iPTR);
break;
case i16_func_i16_i16:
Rets.push_back(wasm::ValType::I32);
@@ -1139,17 +630,29 @@ void llvm::GetSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::I32);
Params.push_back(wasm::ValType::I32);
break;
+ case i32_func_i32_i32_iPTR:
+ Rets.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::I32);
+ Params.push_back(iPTR);
+ break;
case i64_func_i64_i64:
Rets.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
break;
+ case i64_func_i64_i64_iPTR:
+ Rets.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(iPTR);
+ break;
case i64_i64_func_f32:
#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
#else
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
#endif
Params.push_back(wasm::ValType::F32);
break;
@@ -1158,7 +661,7 @@ void llvm::GetSignature(const WebAssemblySubtarget &Subtarget,
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
#else
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
#endif
Params.push_back(wasm::ValType::F64);
break;
@@ -1167,7 +670,7 @@ void llvm::GetSignature(const WebAssemblySubtarget &Subtarget,
Rets.push_back(wasm::ValType::I32);
Rets.push_back(wasm::ValType::I32);
#else
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
#endif
Params.push_back(wasm::ValType::I32);
Params.push_back(wasm::ValType::I32);
@@ -1177,7 +680,7 @@ void llvm::GetSignature(const WebAssemblySubtarget &Subtarget,
Rets.push_back(wasm::ValType::I32);
Rets.push_back(wasm::ValType::I32);
#else
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
#endif
Params.push_back(wasm::ValType::I32);
Params.push_back(wasm::ValType::I32);
@@ -1187,7 +690,7 @@ void llvm::GetSignature(const WebAssemblySubtarget &Subtarget,
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
#else
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
#endif
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
@@ -1197,13 +700,26 @@ void llvm::GetSignature(const WebAssemblySubtarget &Subtarget,
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
#else
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
#endif
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
break;
+ case i64_i64_func_i64_i64_i64_i64_iPTR:
+#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
+ Rets.push_back(wasm::ValType::I64);
+ Rets.push_back(wasm::ValType::I64);
+#else
+ Params.push_back(iPTR);
+#endif
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(iPTR);
+ break;
case i64_i64_i64_i64_func_i64_i64_i64_i64:
#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
Rets.push_back(wasm::ValType::I64);
@@ -1211,7 +727,7 @@ void llvm::GetSignature(const WebAssemblySubtarget &Subtarget,
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
#else
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
#endif
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
@@ -1225,23 +741,23 @@ void llvm::GetSignature(const WebAssemblySubtarget &Subtarget,
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
#else
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
#endif
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I32);
break;
case iPTR_func_iPTR_i32_iPTR:
- Rets.push_back(wasm::ValType(iPTR));
- Params.push_back(wasm::ValType(iPTR));
+ Rets.push_back(iPTR);
+ Params.push_back(iPTR);
Params.push_back(wasm::ValType::I32);
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
break;
case iPTR_func_iPTR_iPTR_iPTR:
- Rets.push_back(wasm::ValType(iPTR));
- Params.push_back(wasm::ValType(iPTR));
- Params.push_back(wasm::ValType(iPTR));
- Params.push_back(wasm::ValType(iPTR));
+ Rets.push_back(iPTR);
+ Params.push_back(iPTR);
+ Params.push_back(iPTR);
+ Params.push_back(iPTR);
break;
case f32_func_f32_f32_f32:
Rets.push_back(wasm::ValType::F32);
@@ -1258,39 +774,39 @@ void llvm::GetSignature(const WebAssemblySubtarget &Subtarget,
case func_i64_i64_iPTR_iPTR:
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
- Params.push_back(wasm::ValType(iPTR));
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
+ Params.push_back(iPTR);
break;
case func_iPTR_f32:
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
Params.push_back(wasm::ValType::F32);
break;
case func_iPTR_f64:
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
Params.push_back(wasm::ValType::F64);
break;
case func_iPTR_i32:
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
Params.push_back(wasm::ValType::I32);
break;
case func_iPTR_i64:
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
Params.push_back(wasm::ValType::I64);
break;
case func_iPTR_i64_i64:
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
break;
case func_iPTR_i64_i64_i64_i64:
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
break;
case func_iPTR_i64_i64_i64_i64_i64_i64:
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
@@ -1315,15 +831,14 @@ void llvm::GetSignature(const WebAssemblySubtarget &Subtarget,
}
}
+static ManagedStatic<StaticLibcallNameMap> LibcallNameMap;
+// TODO: If the RTLIB::Libcall-taking flavor of GetSignature remains unused
+// other than here, just roll its logic into this version.
void llvm::GetSignature(const WebAssemblySubtarget &Subtarget, const char *Name,
SmallVectorImpl<wasm::ValType> &Rets,
SmallVectorImpl<wasm::ValType> &Params) {
- assert(strcmp(RuntimeLibcallNames[RTLIB::DEOPTIMIZE], "__llvm_deoptimize") ==
- 0);
-
- for (size_t i = 0, e = RTLIB::UNKNOWN_LIBCALL; i < e; ++i)
- if (RuntimeLibcallNames[i] && strcmp(RuntimeLibcallNames[i], Name) == 0)
- return GetSignature(Subtarget, RTLIB::Libcall(i), Rets, Params);
-
- llvm_unreachable("unexpected runtime library name");
+ auto& Map = LibcallNameMap->Map;
+ auto val = Map.find(Name);
+ assert(val != Map.end() && "unexpected runtime library name");
+ return GetSignature(Subtarget, val->second, Rets, Params);
}
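A hedged usage sketch (the caller name is hypothetical, and the file's existing SmallVector and assert includes are assumed): querying the signature of a runtime call by name. MEMCPY resolves to iPTR_func_iPTR_iPTR_iPTR in the table above, so the lookup yields one pointer-sized result and three pointer-sized parameters.

static void querySignatureSketch(const WebAssemblySubtarget &Subtarget) {
  SmallVector<wasm::ValType, 1> Rets;
  SmallVector<wasm::ValType, 4> Params;
  llvm::GetSignature(Subtarget, "memcpy", Rets, Params);
  // memcpy: returns the destination pointer, takes (dest, src, size).
  assert(Rets.size() == 1 && Params.size() == 3);
  (void)Rets;
  (void)Params;
}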
diff --git a/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h b/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h
index 129067604784..2ba65ff5b716 100644
--- a/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h
+++ b/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file provides signature information for runtime libcalls.
+/// This file provides signature information for runtime libcalls.
///
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp b/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
index fae9c6100510..bec72049258a 100644
--- a/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements the WebAssemblySelectionDAGInfo class.
+/// This file implements the WebAssemblySelectionDAGInfo class.
///
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h b/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
index 533c66b7a22f..31d150eded67 100644
--- a/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file defines the WebAssembly subclass for
+/// This file defines the WebAssembly subclass for
/// SelectionDAGTargetInfo.
///
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp b/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp
index c4b9e915b41e..14221993603a 100644
--- a/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp
+++ b/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file sets the p2align operands on load and store instructions.
+/// This file sets the p2align operands on load and store instructions.
///
//===----------------------------------------------------------------------===//
@@ -46,6 +46,10 @@ public:
} // end anonymous namespace
char WebAssemblySetP2AlignOperands::ID = 0;
+INITIALIZE_PASS(WebAssemblySetP2AlignOperands, DEBUG_TYPE,
+ "Set the p2align operands for WebAssembly loads and stores",
+ false, false)
+
FunctionPass *llvm::createWebAssemblySetP2AlignOperands() {
return new WebAssemblySetP2AlignOperands();
}
@@ -72,7 +76,7 @@ static void RewriteP2Align(MachineInstr &MI, unsigned OperandNo) {
}
bool WebAssemblySetP2AlignOperands::runOnMachineFunction(MachineFunction &MF) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "********** Set p2align Operands **********\n"
<< "********** Function: " << MF.getName() << '\n';
});
@@ -103,6 +107,48 @@ bool WebAssemblySetP2AlignOperands::runOnMachineFunction(MachineFunction &MF) {
case WebAssembly::ATOMIC_LOAD8_U_I64:
case WebAssembly::ATOMIC_LOAD16_U_I64:
case WebAssembly::ATOMIC_LOAD32_U_I64:
+ case WebAssembly::ATOMIC_RMW8_U_ADD_I32:
+ case WebAssembly::ATOMIC_RMW8_U_ADD_I64:
+ case WebAssembly::ATOMIC_RMW8_U_SUB_I32:
+ case WebAssembly::ATOMIC_RMW8_U_SUB_I64:
+ case WebAssembly::ATOMIC_RMW8_U_AND_I32:
+ case WebAssembly::ATOMIC_RMW8_U_AND_I64:
+ case WebAssembly::ATOMIC_RMW8_U_OR_I32:
+ case WebAssembly::ATOMIC_RMW8_U_OR_I64:
+ case WebAssembly::ATOMIC_RMW8_U_XOR_I32:
+ case WebAssembly::ATOMIC_RMW8_U_XOR_I64:
+ case WebAssembly::ATOMIC_RMW8_U_XCHG_I32:
+ case WebAssembly::ATOMIC_RMW8_U_XCHG_I64:
+ case WebAssembly::ATOMIC_RMW16_U_ADD_I32:
+ case WebAssembly::ATOMIC_RMW16_U_ADD_I64:
+ case WebAssembly::ATOMIC_RMW16_U_SUB_I32:
+ case WebAssembly::ATOMIC_RMW16_U_SUB_I64:
+ case WebAssembly::ATOMIC_RMW16_U_AND_I32:
+ case WebAssembly::ATOMIC_RMW16_U_AND_I64:
+ case WebAssembly::ATOMIC_RMW16_U_OR_I32:
+ case WebAssembly::ATOMIC_RMW16_U_OR_I64:
+ case WebAssembly::ATOMIC_RMW16_U_XOR_I32:
+ case WebAssembly::ATOMIC_RMW16_U_XOR_I64:
+ case WebAssembly::ATOMIC_RMW16_U_XCHG_I32:
+ case WebAssembly::ATOMIC_RMW16_U_XCHG_I64:
+ case WebAssembly::ATOMIC_RMW_ADD_I32:
+ case WebAssembly::ATOMIC_RMW32_U_ADD_I64:
+ case WebAssembly::ATOMIC_RMW_SUB_I32:
+ case WebAssembly::ATOMIC_RMW32_U_SUB_I64:
+ case WebAssembly::ATOMIC_RMW_AND_I32:
+ case WebAssembly::ATOMIC_RMW32_U_AND_I64:
+ case WebAssembly::ATOMIC_RMW_OR_I32:
+ case WebAssembly::ATOMIC_RMW32_U_OR_I64:
+ case WebAssembly::ATOMIC_RMW_XOR_I32:
+ case WebAssembly::ATOMIC_RMW32_U_XOR_I64:
+ case WebAssembly::ATOMIC_RMW_XCHG_I32:
+ case WebAssembly::ATOMIC_RMW32_U_XCHG_I64:
+ case WebAssembly::ATOMIC_RMW_ADD_I64:
+ case WebAssembly::ATOMIC_RMW_SUB_I64:
+ case WebAssembly::ATOMIC_RMW_AND_I64:
+ case WebAssembly::ATOMIC_RMW_OR_I64:
+ case WebAssembly::ATOMIC_RMW_XOR_I64:
+ case WebAssembly::ATOMIC_RMW_XCHG_I64:
RewriteP2Align(MI, WebAssembly::LoadP2AlignOperandNo);
break;
case WebAssembly::STORE_I32:
@@ -114,6 +160,13 @@ bool WebAssemblySetP2AlignOperands::runOnMachineFunction(MachineFunction &MF) {
case WebAssembly::STORE8_I64:
case WebAssembly::STORE16_I64:
case WebAssembly::STORE32_I64:
+ case WebAssembly::ATOMIC_STORE_I32:
+ case WebAssembly::ATOMIC_STORE8_I32:
+ case WebAssembly::ATOMIC_STORE16_I32:
+ case WebAssembly::ATOMIC_STORE_I64:
+ case WebAssembly::ATOMIC_STORE8_I64:
+ case WebAssembly::ATOMIC_STORE16_I64:
+ case WebAssembly::ATOMIC_STORE32_I64:
RewriteP2Align(MI, WebAssembly::StoreP2AlignOperandNo);
break;
default:
diff --git a/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp b/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp
index 22a5a9099e72..893e8484c4c6 100644
--- a/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements an optimization pass using store result values.
+/// This file implements an optimization pass using store result values.
///
/// WebAssembly's store instructions return the stored value. This is to enable
/// an optimization wherein uses of the stored value can be replaced by uses of
@@ -68,6 +68,9 @@ private:
} // end anonymous namespace
char WebAssemblyStoreResults::ID = 0;
+INITIALIZE_PASS(WebAssemblyStoreResults, DEBUG_TYPE,
+ "Optimize store result values for WebAssembly", false, false)
+
FunctionPass *llvm::createWebAssemblyStoreResults() {
return new WebAssemblyStoreResults();
}
@@ -108,8 +111,8 @@ static bool ReplaceDominatedUses(MachineBasicBlock &MBB, MachineInstr &MI,
continue;
Changed = true;
- DEBUG(dbgs() << "Setting operand " << O << " in " << *Where << " from "
- << MI << "\n");
+ LLVM_DEBUG(dbgs() << "Setting operand " << O << " in " << *Where << " from "
+ << MI << "\n");
O.setReg(ToReg);
// If the store's def was previously dead, it is no longer.
@@ -167,7 +170,7 @@ static bool optimizeCall(MachineBasicBlock &MBB, MachineInstr &MI,
}
bool WebAssemblyStoreResults::runOnMachineFunction(MachineFunction &MF) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "********** Store Results **********\n"
<< "********** Function: " << MF.getName() << '\n';
});
@@ -186,7 +189,7 @@ bool WebAssemblyStoreResults::runOnMachineFunction(MachineFunction &MF) {
assert(MRI.tracksLiveness() && "StoreResults expects liveness tracking");
for (auto &MBB : MF) {
- DEBUG(dbgs() << "Basic Block: " << MBB.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Basic Block: " << MBB.getName() << '\n');
for (auto &MI : MBB)
switch (MI.getOpcode()) {
default:
diff --git a/lib/Target/WebAssembly/WebAssemblySubtarget.cpp b/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
index 9e122a5f1574..d6af0fb219d7 100644
--- a/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
+++ b/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements the WebAssembly-specific subclass of
+/// This file implements the WebAssembly-specific subclass of
/// TargetSubtarget.
///
//===----------------------------------------------------------------------===//
@@ -41,9 +41,9 @@ WebAssemblySubtarget::WebAssemblySubtarget(const Triple &TT,
const std::string &FS,
const TargetMachine &TM)
: WebAssemblyGenSubtargetInfo(TT, CPU, FS), HasSIMD128(false),
- HasAtomics(false), HasNontrappingFPToInt(false), CPUString(CPU),
- TargetTriple(TT), FrameLowering(),
- InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(),
+ HasAtomics(false), HasNontrappingFPToInt(false), HasSignExt(false),
+ HasExceptionHandling(false), CPUString(CPU), TargetTriple(TT),
+ FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(),
TLInfo(TM, *this) {}
bool WebAssemblySubtarget::enableMachineScheduler() const {
diff --git a/lib/Target/WebAssembly/WebAssemblySubtarget.h b/lib/Target/WebAssembly/WebAssemblySubtarget.h
index a6bf0b6d54f6..b170dbff3b32 100644
--- a/lib/Target/WebAssembly/WebAssemblySubtarget.h
+++ b/lib/Target/WebAssembly/WebAssemblySubtarget.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file declares the WebAssembly-specific subclass of
+/// This file declares the WebAssembly-specific subclass of
/// TargetSubtarget.
///
//===----------------------------------------------------------------------===//
@@ -32,6 +32,8 @@ class WebAssemblySubtarget final : public WebAssemblyGenSubtargetInfo {
bool HasSIMD128;
bool HasAtomics;
bool HasNontrappingFPToInt;
+ bool HasSignExt;
+ bool HasExceptionHandling;
/// String name of used CPU.
std::string CPUString;
@@ -78,6 +80,8 @@ public:
bool hasSIMD128() const { return HasSIMD128; }
bool hasAtomics() const { return HasAtomics; }
bool hasNontrappingFPToInt() const { return HasNontrappingFPToInt; }
+ bool hasSignExt() const { return HasSignExt; }
+ bool hasExceptionHandling() const { return HasExceptionHandling; }
/// Parses features string setting specified subtarget options. Definition of
/// function is auto generated by tblgen.
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index d38cde74d2ec..7c10f022cbbc 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file defines the WebAssembly-specific subclass of TargetMachine.
+/// This file defines the WebAssembly-specific subclass of TargetMachine.
///
//===----------------------------------------------------------------------===//
@@ -25,6 +25,7 @@
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
using namespace llvm;
#define DEBUG_TYPE "wasm"
@@ -48,9 +49,31 @@ extern "C" void LLVMInitializeWebAssemblyTarget() {
RegisterTargetMachine<WebAssemblyTargetMachine> Y(
getTheWebAssemblyTarget64());
- // Register exception handling pass to opt
- initializeWebAssemblyLowerEmscriptenEHSjLjPass(
- *PassRegistry::getPassRegistry());
+ // Register backend passes
+ auto &PR = *PassRegistry::getPassRegistry();
+ initializeWebAssemblyAddMissingPrototypesPass(PR);
+ initializeWebAssemblyLowerEmscriptenEHSjLjPass(PR);
+ initializeLowerGlobalDtorsPass(PR);
+ initializeFixFunctionBitcastsPass(PR);
+ initializeOptimizeReturnedPass(PR);
+ initializeWebAssemblyArgumentMovePass(PR);
+ initializeWebAssemblySetP2AlignOperandsPass(PR);
+ initializeWebAssemblyReplacePhysRegsPass(PR);
+ initializeWebAssemblyPrepareForLiveIntervalsPass(PR);
+ initializeWebAssemblyOptimizeLiveIntervalsPass(PR);
+ initializeWebAssemblyStoreResultsPass(PR);
+ initializeWebAssemblyRegStackifyPass(PR);
+ initializeWebAssemblyRegColoringPass(PR);
+ initializeWebAssemblyExplicitLocalsPass(PR);
+ initializeWebAssemblyFixIrreducibleControlFlowPass(PR);
+ initializeWebAssemblyLateEHPreparePass(PR);
+ initializeWebAssemblyExceptionInfoPass(PR);
+ initializeWebAssemblyCFGSortPass(PR);
+ initializeWebAssemblyCFGStackifyPass(PR);
+ initializeWebAssemblyLowerBrUnlessPass(PR);
+ initializeWebAssemblyRegNumberingPass(PR);
+ initializeWebAssemblyPeepholePass(PR);
+ initializeWebAssemblyCallIndirectFixupPass(PR);
}
//===----------------------------------------------------------------------===//
@@ -74,11 +97,7 @@ WebAssemblyTargetMachine::WebAssemblyTargetMachine(
: "e-m:e-p:32:32-i64:64-n32:64-S128",
TT, CPU, FS, Options, getEffectiveRelocModel(RM),
CM ? *CM : CodeModel::Large, OL),
- TLOF(TT.isOSBinFormatELF() ?
- static_cast<TargetLoweringObjectFile*>(
- new WebAssemblyTargetObjectFileELF()) :
- static_cast<TargetLoweringObjectFile*>(
- new WebAssemblyTargetObjectFile())) {
+ TLOF(new WebAssemblyTargetObjectFile()) {
// WebAssembly type-checks instructions, but a noreturn function with a return
// type that doesn't match the context will cause a check failure. So we lower
// LLVM 'unreachable' to ISD::TRAP and then lower that to WebAssembly's
@@ -87,11 +106,9 @@ WebAssemblyTargetMachine::WebAssemblyTargetMachine(
// WebAssembly treats each function as an independent unit. Force
// -ffunction-sections, effectively, so that we can emit them independently.
- if (!TT.isOSBinFormatELF()) {
- this->Options.FunctionSections = true;
- this->Options.DataSections = true;
- this->Options.UniqueSectionNames = true;
- }
+ this->Options.FunctionSections = true;
+ this->Options.DataSections = true;
+ this->Options.UniqueSectionNames = true;
initAsmInfo();
@@ -126,6 +143,22 @@ WebAssemblyTargetMachine::getSubtargetImpl(const Function &F) const {
}
namespace {
+class StripThreadLocal final : public ModulePass {
+ // The default thread model for wasm is single, where thread-local variables
+ // are identical to regular globals and should be treated the same. So this
+ // pass just converts all GlobalVariables to NotThreadLocal
+ static char ID;
+
+ public:
+ StripThreadLocal() : ModulePass(ID) {}
+ bool runOnModule(Module &M) override {
+ for (auto &GV : M.globals())
+ GV.setThreadLocalMode(GlobalValue::ThreadLocalMode::NotThreadLocal);
+ return true;
+ }
+};
+char StripThreadLocal::ID = 0;
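As an illustrative sketch of the effect of StripThreadLocal on a single global (not part of this patch; names are hypothetical and the usual IR headers are assumed):

static void stripTLSSketch(LLVMContext &Ctx) {
  Module M("sketch", Ctx);
  auto *GV = new GlobalVariable(M, Type::getInt32Ty(Ctx), /*isConstant=*/false,
                                GlobalValue::ExternalLinkage,
                                /*Initializer=*/nullptr, "tls_var",
                                /*InsertBefore=*/nullptr,
                                GlobalValue::GeneralDynamicTLSModel);
  // Under the single thread model each TLS variable has exactly one instance,
  // so clearing the mode is behavior-preserving: it becomes a plain global.
  GV->setThreadLocalMode(GlobalValue::NotThreadLocal);
  assert(!GV->isThreadLocal());
}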
+
/// WebAssembly Code Generator Pass Configuration Options.
class WebAssemblyPassConfig final : public TargetPassConfig {
public:
@@ -166,13 +199,18 @@ FunctionPass *WebAssemblyPassConfig::createTargetRegisterAllocator(bool) {
//===----------------------------------------------------------------------===//
void WebAssemblyPassConfig::addIRPasses() {
- if (TM->Options.ThreadModel == ThreadModel::Single)
+ if (TM->Options.ThreadModel == ThreadModel::Single) {
// In "single" mode, atomics get lowered to non-atomics.
addPass(createLowerAtomicPass());
- else
+ addPass(new StripThreadLocal());
+ } else {
// Expand some atomic operations. WebAssemblyTargetLowering has hooks which
// control specifically what gets lowered.
addPass(createAtomicExpandPass());
+ }
+
+ // Add signatures to prototype-less function declarations
+ addPass(createWebAssemblyAddMissingPrototypes());
// Lower .llvm.global_dtors into .llvm_global_ctors with __cxa_atexit calls.
addPass(createWebAssemblyLowerGlobalDtors());
@@ -190,7 +228,8 @@ void WebAssemblyPassConfig::addIRPasses() {
// blocks. Lowering invokes when there is no EH support is done in
// TargetPassConfig::addPassesToHandleExceptions, but this runs after this
// function and SjLj handling expects all invokes to be lowered before.
- if (!EnableEmException) {
+ if (!EnableEmException &&
+ TM->Options.ExceptionModel == ExceptionHandling::None) {
addPass(createLowerInvokePass());
// The lower invoke pass may create unreachable code. Remove it in order not
// to process dead blocks in setjmp/longjmp handling.
@@ -225,16 +264,15 @@ void WebAssemblyPassConfig::addPostRegAlloc() {
// virtual registers. Consider removing their restrictions and re-enabling
// them.
- // Has no asserts of its own, but was not written to handle virtual regs.
- disablePass(&ShrinkWrapID);
-
// These functions all require the NoVRegs property.
disablePass(&MachineCopyPropagationID);
+ disablePass(&PostRAMachineSinkingID);
disablePass(&PostRASchedulerID);
disablePass(&FuncletLayoutID);
disablePass(&StackMapLivenessID);
disablePass(&LiveDebugValuesID);
disablePass(&PatchableFunctionID);
+ disablePass(&ShrinkWrapID);
TargetPassConfig::addPostRegAlloc();
}
@@ -282,6 +320,9 @@ void WebAssemblyPassConfig::addPreEmitPass() {
// Insert explicit get_local and set_local operators.
addPass(createWebAssemblyExplicitLocals());
+ // Do various transformations for exception handling
+ addPass(createWebAssemblyLateEHPrepare());
+
// Sort the blocks of the CFG into topological order, a prerequisite for
// BLOCK and LOOP markers.
addPass(createWebAssemblyCFGSort());
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.h b/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
index dd826befd117..41001e7a0cc7 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
+++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file declares the WebAssembly-specific subclass of
+/// This file declares the WebAssembly-specific subclass of
/// TargetMachine.
///
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp b/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp
index b1fd108bc249..0459bfca418d 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp
@@ -8,20 +8,15 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file defines the functions of the WebAssembly-specific subclass
+/// This file defines the functions of the WebAssembly-specific subclass
/// of TargetLoweringObjectFile.
///
//===----------------------------------------------------------------------===//
#include "WebAssemblyTargetObjectFile.h"
#include "WebAssemblyTargetMachine.h"
-using namespace llvm;
-void WebAssemblyTargetObjectFileELF::Initialize(MCContext &Ctx,
- const TargetMachine &TM) {
- TargetLoweringObjectFileELF::Initialize(Ctx, TM);
- InitializeELF(TM.Options.UseInitArray);
-}
+using namespace llvm;
void WebAssemblyTargetObjectFile::Initialize(MCContext &Ctx,
const TargetMachine &TM) {
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h b/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h
index ace87c9e442f..ce744ba8b8e8 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h
+++ b/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file declares the WebAssembly-specific subclass of
+/// This file declares the WebAssembly-specific subclass of
/// TargetLoweringObjectFile.
///
//===----------------------------------------------------------------------===//
@@ -20,12 +20,6 @@
namespace llvm {
-class WebAssemblyTargetObjectFileELF final
- : public TargetLoweringObjectFileELF {
-public:
- void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
-};
-
class WebAssemblyTargetObjectFile final : public TargetLoweringObjectFileWasm {
public:
void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index 2e002781f43d..4a2777cc3a9f 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file defines the WebAssembly-specific TargetTransformInfo
+/// This file defines the WebAssembly-specific TargetTransformInfo
/// implementation.
///
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index 7b35fc916133..4300ca3defbf 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file a TargetTransformInfo::Concept conforming object specific
+/// This file defines a TargetTransformInfo::Concept conforming object specific
/// to the WebAssembly target machine.
///
/// It uses the target's detailed information to provide more precise answers to
diff --git a/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
index e32772d491cf..5944cea5abd1 100644
--- a/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements several utility functions for WebAssembly.
+/// This file implements several utility functions for WebAssembly.
///
//===----------------------------------------------------------------------===//
@@ -18,6 +18,13 @@
#include "llvm/CodeGen/MachineLoopInfo.h"
using namespace llvm;
+const char *const WebAssembly::ClangCallTerminateFn = "__clang_call_terminate";
+const char *const WebAssembly::CxaBeginCatchFn = "__cxa_begin_catch";
+const char *const WebAssembly::CxaRethrowFn = "__cxa_rethrow";
+const char *const WebAssembly::StdTerminateFn = "_ZSt9terminatev";
+const char *const WebAssembly::PersonalityWrapperFn =
+ "_Unwind_Wasm_CallPersonality";
+
bool WebAssembly::isArgument(const MachineInstr &MI) {
switch (MI.getOpcode()) {
case WebAssembly::ARGUMENT_I32:
@@ -71,6 +78,24 @@ bool WebAssembly::isChild(const MachineInstr &MI,
MFI.isVRegStackified(Reg);
}
+bool WebAssembly::isCallDirect(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case WebAssembly::CALL_VOID:
+ case WebAssembly::CALL_I32:
+ case WebAssembly::CALL_I64:
+ case WebAssembly::CALL_F32:
+ case WebAssembly::CALL_F64:
+ case WebAssembly::CALL_v16i8:
+ case WebAssembly::CALL_v8i16:
+ case WebAssembly::CALL_v4i32:
+ case WebAssembly::CALL_v4f32:
+ case WebAssembly::CALL_EXCEPT_REF:
+ return true;
+ default:
+ return false;
+ }
+}
+
bool WebAssembly::isCallIndirect(const MachineInstr &MI) {
switch (MI.getOpcode()) {
case WebAssembly::CALL_INDIRECT_VOID:
@@ -82,16 +107,136 @@ bool WebAssembly::isCallIndirect(const MachineInstr &MI) {
case WebAssembly::CALL_INDIRECT_v8i16:
case WebAssembly::CALL_INDIRECT_v4i32:
case WebAssembly::CALL_INDIRECT_v4f32:
+ case WebAssembly::CALL_INDIRECT_EXCEPT_REF:
+ return true;
+ default:
+ return false;
+ }
+}
+
+unsigned WebAssembly::getCalleeOpNo(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case WebAssembly::CALL_VOID:
+ case WebAssembly::CALL_INDIRECT_VOID:
+ return 0;
+ case WebAssembly::CALL_I32:
+ case WebAssembly::CALL_I64:
+ case WebAssembly::CALL_F32:
+ case WebAssembly::CALL_F64:
+ case WebAssembly::CALL_EXCEPT_REF:
+ case WebAssembly::CALL_INDIRECT_I32:
+ case WebAssembly::CALL_INDIRECT_I64:
+ case WebAssembly::CALL_INDIRECT_F32:
+ case WebAssembly::CALL_INDIRECT_F64:
+ case WebAssembly::CALL_INDIRECT_EXCEPT_REF:
+ return 1;
+ default:
+ llvm_unreachable("Not a call instruction");
+ }
+}
+
+bool WebAssembly::isMarker(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case WebAssembly::BLOCK:
+ case WebAssembly::END_BLOCK:
+ case WebAssembly::LOOP:
+ case WebAssembly::END_LOOP:
+ case WebAssembly::TRY:
+ case WebAssembly::END_TRY:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool WebAssembly::isThrow(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case WebAssembly::THROW_I32:
+ case WebAssembly::THROW_I64:
return true;
default:
return false;
}
}
-MachineBasicBlock *llvm::LoopBottom(const MachineLoop *Loop) {
- MachineBasicBlock *Bottom = Loop->getHeader();
- for (MachineBasicBlock *MBB : Loop->blocks())
- if (MBB->getNumber() > Bottom->getNumber())
- Bottom = MBB;
- return Bottom;
+bool WebAssembly::isRethrow(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case WebAssembly::RETHROW:
+ case WebAssembly::RETHROW_TO_CALLER:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool WebAssembly::isCatch(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case WebAssembly::CATCH_I32:
+ case WebAssembly::CATCH_I64:
+ case WebAssembly::CATCH_ALL:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool WebAssembly::mayThrow(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case WebAssembly::THROW_I32:
+ case WebAssembly::THROW_I64:
+ case WebAssembly::RETHROW:
+ return true;
+ }
+ if (isCallIndirect(MI))
+ return true;
+ if (!MI.isCall())
+ return false;
+
+ const MachineOperand &MO = MI.getOperand(getCalleeOpNo(MI));
+ assert(MO.isGlobal());
+ const auto *F = dyn_cast<Function>(MO.getGlobal());
+ if (!F)
+ return true;
+ if (F->doesNotThrow())
+ return false;
+ // These functions never throw
+ if (F->getName() == CxaBeginCatchFn || F->getName() == PersonalityWrapperFn ||
+ F->getName() == ClangCallTerminateFn || F->getName() == StdTerminateFn)
+ return false;
+ return true;
+}
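For orientation, a minimal sketch (editorial note, not part of the patch) of how a pass might use mayThrow; the helper name blockMayThrow and the include are hypothetical, only WebAssembly::mayThrow comes from the code above:

#include "WebAssemblyUtilities.h"

// Hypothetical helper: returns true if any instruction in MBB may throw,
// i.e. a throw/rethrow or a call that cannot be proven non-throwing.
static bool blockMayThrow(const llvm::MachineBasicBlock &MBB) {
  for (const llvm::MachineInstr &MI : MBB)
    if (llvm::WebAssembly::mayThrow(MI))
      return true;
  return false;
}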
+
+bool WebAssembly::isCatchTerminatePad(const MachineBasicBlock &MBB) {
+ if (!MBB.isEHPad())
+ return false;
+ bool SeenCatch = false;
+ for (auto &MI : MBB) {
+ if (MI.getOpcode() == WebAssembly::CATCH_I32 ||
+ MI.getOpcode() == WebAssembly::CATCH_I64)
+ SeenCatch = true;
+ if (SeenCatch && MI.isCall()) {
+ const MachineOperand &CalleeOp = MI.getOperand(getCalleeOpNo(MI));
+ if (CalleeOp.isGlobal() &&
+ CalleeOp.getGlobal()->getName() == ClangCallTerminateFn)
+ return true;
+ }
+ }
+ return false;
+}
+
+bool WebAssembly::isCatchAllTerminatePad(const MachineBasicBlock &MBB) {
+ if (!MBB.isEHPad())
+ return false;
+ bool SeenCatchAll = false;
+ for (auto &MI : MBB) {
+ if (MI.getOpcode() == WebAssembly::CATCH_ALL)
+ SeenCatchAll = true;
+ if (SeenCatchAll && MI.isCall()) {
+ const MachineOperand &CalleeOp = MI.getOperand(getCalleeOpNo(MI));
+ if (CalleeOp.isGlobal() &&
+ CalleeOp.getGlobal()->getName() == StdTerminateFn)
+ return true;
+ }
+ }
+ return false;
}
diff --git a/lib/Target/WebAssembly/WebAssemblyUtilities.h b/lib/Target/WebAssembly/WebAssemblyUtilities.h
index 595491f1bf5b..cdb7873e9013 100644
--- a/lib/Target/WebAssembly/WebAssemblyUtilities.h
+++ b/lib/Target/WebAssembly/WebAssemblyUtilities.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains the declaration of the WebAssembly-specific
+/// This file contains the declaration of the WebAssembly-specific
/// utility functions.
///
//===----------------------------------------------------------------------===//
@@ -16,11 +16,10 @@
#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYUTILITIES_H
#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYUTILITIES_H
+#include "llvm/CodeGen/MachineBasicBlock.h"
+
namespace llvm {
-class MachineBasicBlock;
-class MachineInstr;
-class MachineLoop;
class WebAssemblyFunctionInfo;
namespace WebAssembly {
@@ -29,14 +28,44 @@ bool isArgument(const MachineInstr &MI);
bool isCopy(const MachineInstr &MI);
bool isTee(const MachineInstr &MI);
bool isChild(const MachineInstr &MI, const WebAssemblyFunctionInfo &MFI);
+bool isCallDirect(const MachineInstr &MI);
bool isCallIndirect(const MachineInstr &MI);
+bool isMarker(const MachineInstr &MI);
+bool isThrow(const MachineInstr &MI);
+bool isRethrow(const MachineInstr &MI);
+bool isCatch(const MachineInstr &MI);
+bool mayThrow(const MachineInstr &MI);
-} // end namespace WebAssembly
+/// Returns the operand number of a callee, assuming the argument is a call
+/// instruction.
+unsigned getCalleeOpNo(const MachineInstr &MI);
+
+/// Returns true if the given BB is a single-BB terminate pad which starts
+/// with a 'catch' instruction.
+bool isCatchTerminatePad(const MachineBasicBlock &MBB);
+/// Returns true if the given BB is a single-BB terminate pad which starts
+/// with a 'catch_all' instruction.
+bool isCatchAllTerminatePad(const MachineBasicBlock &MBB);
-/// Return the "bottom" block of a loop. This differs from
-/// MachineLoop::getBottomBlock in that it works even if the loop is
-/// discontiguous.
-MachineBasicBlock *LoopBottom(const MachineLoop *Loop);
+// Exception-related function names
+extern const char *const ClangCallTerminateFn;
+extern const char *const CxaBeginCatchFn;
+extern const char *const CxaRethrowFn;
+extern const char *const StdTerminateFn;
+extern const char *const PersonalityWrapperFn;
+
+/// Return the "bottom" block of an entity, which can be either a MachineLoop or
+/// WebAssemblyException. This differs from MachineLoop::getBottomBlock in that
+/// it works even if the entity is discontiguous.
+template <typename T> MachineBasicBlock *getBottom(const T *Unit) {
+ MachineBasicBlock *Bottom = Unit->getHeader();
+ for (MachineBasicBlock *MBB : Unit->blocks())
+ if (MBB->getNumber() > Bottom->getNumber())
+ Bottom = MBB;
+ return Bottom;
+}
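A brief usage sketch (editorial note, not part of the patch): getBottom generalizes the old LoopBottom helper, so loop-based and exception-based callers can share it. The Loop and WE variables below are hypothetical placeholders:

// Previously: MachineBasicBlock *Bottom = LoopBottom(Loop);
MachineBasicBlock *LoopBottomBB = WebAssembly::getBottom(Loop);
// The same template instantiates for the new exception unit, as long as the
// type provides getHeader() and blocks():
MachineBasicBlock *ExnBottomBB = WebAssembly::getBottom(WE);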
+
+} // end namespace WebAssembly
} // end namespace llvm
diff --git a/lib/Target/WebAssembly/known_gcc_test_failures.txt b/lib/Target/WebAssembly/known_gcc_test_failures.txt
index 2eb73befc50b..364c871f61b0 100644
--- a/lib/Target/WebAssembly/known_gcc_test_failures.txt
+++ b/lib/Target/WebAssembly/known_gcc_test_failures.txt
@@ -5,22 +5,22 @@
# they pass. (Known failures that do not run at all will not cause an
# error). The format is
# <name> <attributes> # comment
-#
-# The attributes in this case represent the different arguments used to
-# compiler: 'wasm-s' is for compiling to .s files, and 'wasm-o' for compiling
-# to wasm object files (.o).
# Computed gotos are not supported (Cannot select BlockAddress/BRIND)
-20071220-1.c wasm-o,O0
+20071220-1.c
+20071220-2.c
20040302-1.c
20041214-1.c O0
20071210-1.c
-20071220-1.c wasm-s,O0
920501-4.c
920501-5.c
comp-goto-1.c
980526-1.c
990208-1.c
+label13.C O0
+label13a.C O0
+label3.C
+pr42462.C O0
# WebAssembly hasn't implemented (will never?) __builtin_return_address
20010122-1.c
@@ -76,6 +76,44 @@ pr41935.c
920728-1.c
pr28865.c
widechar-2.c
+attr-alias-1.C
+attr-alias-2.C
+attr-ifunc-1.C
+attr-ifunc-2.C
+attr-ifunc-3.C
+attr-ifunc-4.C
+complit12.C
+va-arg-pack-1.C
+va-arg-pack-len-1.C
+builtin-line1.C
+builtin-location.C
+devirt-6.C # bad main signature
+devirt-13.C # bad main signature
+devirt-14.C # bad main signature
+devirt-21.C # bad main signature
+devirt-23.C # bad main signature
+lifetime2.C # violates C++ DR1696
-# Untriaged: Assertion failure in WasmObjectWriter::applyRelocations
-20071220-2.c wasm-o,O0
+# Untriaged C++ failures
+spec5.C
+addr1.C
+ef_test.C
+friend18.C
+member2.C
+new39.C
+new40.C
+nrv8.C
+offsetof9.C
+opaque-1.C
+pr19650.C
+pr37146-1.C
+pr46149.C
+pr59470.C
+rtti2.C
+self1.C
+type-generic-1.C
+vbase8-10.C
+vbase8-21.C
+vbase8-22.C
+vbase8-4.C
+vector1.C
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index f1ce430f3323..b84c2d31a63e 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -9,6 +9,7 @@
#include "InstPrinter/X86IntelInstPrinter.h"
#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86MCExpr.h"
#include "MCTargetDesc/X86TargetStreamer.h"
#include "X86AsmInstrumentation.h"
#include "X86AsmParserCommon.h"
@@ -345,7 +346,7 @@ private:
public:
IntelExprStateMachine()
: State(IES_INIT), PrevState(IES_ERROR), BaseReg(0), IndexReg(0),
- TmpReg(0), Scale(1), Imm(0), Sym(nullptr), BracCount(0),
+ TmpReg(0), Scale(0), Imm(0), Sym(nullptr), BracCount(0),
MemExpr(false) {}
void addImm(int64_t imm) { Imm += imm; }
@@ -451,7 +452,7 @@ private:
IC.pushOperator(IC_PLUS);
if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
// If we already have a BaseReg, then assume this is the IndexReg with
- // a scale of 1.
+ // no explicit scale.
if (!BaseReg) {
BaseReg = TmpReg;
} else {
@@ -460,7 +461,7 @@ private:
return true;
}
IndexReg = TmpReg;
- Scale = 1;
+ Scale = 0;
}
}
break;
@@ -504,7 +505,7 @@ private:
IC.pushOperator(IC_NEG);
if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
// If we already have a BaseReg, then assume this is the IndexReg with
- // a scale of 1.
+ // no explicit scale.
if (!BaseReg) {
BaseReg = TmpReg;
} else {
@@ -513,7 +514,7 @@ private:
return true;
}
IndexReg = TmpReg;
- Scale = 1;
+ Scale = 0;
}
}
break;
@@ -736,13 +737,13 @@ private:
State = IES_RBRAC;
if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
// If we already have a BaseReg, then assume this is the IndexReg with
- // a scale of 1.
+ // no explicit scale.
if (!BaseReg) {
BaseReg = TmpReg;
} else {
assert (!IndexReg && "BaseReg/IndexReg already set!");
IndexReg = TmpReg;
- Scale = 1;
+ Scale = 0;
}
}
break;
@@ -825,7 +826,7 @@ private:
bool ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End);
unsigned IdentifyIntelInlineAsmOperator(StringRef Name);
unsigned ParseIntelInlineAsmOperator(unsigned OpKind);
- std::unique_ptr<X86Operand> ParseRoundingModeOp(SMLoc Start, SMLoc End);
+ std::unique_ptr<X86Operand> ParseRoundingModeOp(SMLoc Start);
bool ParseIntelNamedOperator(StringRef Name, IntelExprStateMachine &SM);
void RewriteIntelExpression(IntelExprStateMachine &SM, SMLoc Start,
SMLoc End);
@@ -834,7 +835,7 @@ private:
InlineAsmIdentifierInfo &Info,
bool IsUnevaluatedOperand, SMLoc &End);
- std::unique_ptr<X86Operand> ParseMemOperand(unsigned SegReg, SMLoc StartLoc);
+ std::unique_ptr<X86Operand> ParseMemOperand(unsigned SegReg, SMLoc MemStart);
bool ParseIntelMemoryOperandSize(unsigned &Size);
std::unique_ptr<X86Operand>
@@ -844,7 +845,6 @@ private:
const InlineAsmIdentifierInfo &Info);
bool parseDirectiveEven(SMLoc L);
- bool ParseDirectiveWord(unsigned Size, SMLoc L);
bool ParseDirectiveCode(StringRef IDVal, SMLoc L);
/// CodeView FPO data directives.
@@ -943,6 +943,8 @@ public:
: MCTargetAsmParser(Options, sti, mii), InstInfo(nullptr),
Code16GCC(false) {
+ Parser.addAliasForDirective(".word", ".2byte");
+
// Initialize the set of available features.
setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
Instrumentation.reset(
@@ -953,6 +955,8 @@ public:
void SetFrameRegister(unsigned RegNo) override;
+ bool parseAssignmentExpression(const MCExpr *&Res, SMLoc &EndLoc) override;
+
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) override;
@@ -968,27 +972,68 @@ static unsigned MatchRegisterName(StringRef Name);
/// }
static bool CheckBaseRegAndIndexRegAndScale(unsigned BaseReg, unsigned IndexReg,
- unsigned Scale, StringRef &ErrMsg) {
+ unsigned Scale, bool Is64BitMode,
+ StringRef &ErrMsg) {
// If we have both a base register and an index register make sure they are
// both 64-bit or 32-bit registers.
// To support VSIB, IndexReg can be 128-bit or 256-bit registers.
- if ((BaseReg == X86::RIP && IndexReg != 0) || (IndexReg == X86::RIP)) {
+ if (BaseReg != 0 &&
+ !(BaseReg == X86::RIP || BaseReg == X86::EIP ||
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) ||
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg) ||
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg))) {
+ ErrMsg = "invalid base+index expression";
+ return true;
+ }
+
+ if (IndexReg != 0 &&
+ !(IndexReg == X86::EIZ || IndexReg == X86::RIZ ||
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::VR128XRegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::VR256XRegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::VR512RegClassID].contains(IndexReg))) {
ErrMsg = "invalid base+index expression";
return true;
}
+
+ if (((BaseReg == X86::RIP || BaseReg == X86::EIP) && IndexReg != 0) ||
+ IndexReg == X86::EIP || IndexReg == X86::RIP ||
+ IndexReg == X86::ESP || IndexReg == X86::RSP) {
+ ErrMsg = "invalid base+index expression";
+ return true;
+ }
+
+ // Check for use of invalid 16-bit registers. Only BX/BP/SI/DI are allowed,
+ // and then only in non-64-bit modes.
+ if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) &&
+ (Is64BitMode || (BaseReg != X86::BX && BaseReg != X86::BP &&
+ BaseReg != X86::SI && BaseReg != X86::DI)) &&
+ BaseReg != X86::DX) {
+ ErrMsg = "invalid 16-bit base register";
+ return true;
+ }
+
+ if (BaseReg == 0 &&
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg)) {
+ ErrMsg = "16-bit memory operand may not include only index register";
+ return true;
+ }
+
if (BaseReg != 0 && IndexReg != 0) {
if (X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) &&
(X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
- X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg)) &&
- IndexReg != X86::RIZ) {
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg) ||
+ IndexReg == X86::EIZ)) {
ErrMsg = "base register is 64-bit, but index register is not";
return true;
}
if (X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg) &&
(X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
- X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg)) &&
- IndexReg != X86::EIZ){
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg) ||
+ IndexReg == X86::RIZ)) {
ErrMsg = "base register is 32-bit, but index register is not";
return true;
}
@@ -998,15 +1043,21 @@ static bool CheckBaseRegAndIndexRegAndScale(unsigned BaseReg, unsigned IndexReg,
ErrMsg = "base register is 16-bit, but index register is not";
return true;
}
- if (((BaseReg == X86::BX || BaseReg == X86::BP) &&
- IndexReg != X86::SI && IndexReg != X86::DI) ||
- ((BaseReg == X86::SI || BaseReg == X86::DI) &&
- IndexReg != X86::BX && IndexReg != X86::BP)) {
+ if ((BaseReg != X86::BX && BaseReg != X86::BP) ||
+ (IndexReg != X86::SI && IndexReg != X86::DI)) {
ErrMsg = "invalid 16-bit base/index register combination";
return true;
}
}
}
+
+ // RIP/EIP-relative addressing is only supported in 64-bit mode.
+ if (!Is64BitMode && BaseReg != 0 &&
+ (BaseReg == X86::RIP || BaseReg == X86::EIP)) {
+ ErrMsg = "RIP-relative addressing requires 64-bit mode";
+ return true;
+ }
+
return checkScale(Scale, ErrMsg);
}
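An illustrative sketch of how the tightened checks behave (not part of the patch; the function is file-local, so these calls are hypothetical and shown only to make the new rules concrete):

StringRef Err;
// The classic 16-bit pair BX+SI is still accepted outside 64-bit mode:
bool Bad16 = CheckBaseRegAndIndexRegAndScale(X86::BX, X86::SI, /*Scale=*/1,
                                             /*Is64BitMode=*/false, Err);
// -> Bad16 == false, Err untouched
// RIP can never be combined with an index register:
bool BadRIP = CheckBaseRegAndIndexRegAndScale(X86::RIP, X86::RAX, /*Scale=*/1,
                                              /*Is64BitMode=*/true, Err);
// -> BadRIP == true, Err == "invalid base+index expression"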
@@ -1048,18 +1099,13 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo,
// checked.
// FIXME: Check AH, CH, DH, BH cannot be used in an instruction requiring a
// REX prefix.
- if (RegNo == X86::RIZ ||
+ if (RegNo == X86::RIZ || RegNo == X86::RIP || RegNo == X86::EIP ||
X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo) ||
X86II::isX86_64NonExtLowByteReg(RegNo) ||
X86II::isX86_64ExtendedReg(RegNo))
return Error(StartLoc, "register %"
+ Tok.getString() + " is only available in 64-bit mode",
SMRange(StartLoc, EndLoc));
- } else if (!getSTI().getFeatureBits()[X86::FeatureAVX512]) {
- if (X86II::is32ExtendedReg(RegNo))
- return Error(StartLoc, "register %"
- + Tok.getString() + " is only available with AVX512",
- SMRange(StartLoc, EndLoc));
}
// Parse "%st" as "%st(0)" and "%st(1)", which is multiple tokens.
@@ -1388,6 +1434,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
if (ParseIntelDotOperator(SM, End))
return true;
break;
+ case AsmToken::At:
case AsmToken::String:
case AsmToken::Identifier: {
SMLoc IdentLoc = Tok.getLoc();
@@ -1395,7 +1442,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
UpdateLocLex = false;
// Register
unsigned Reg;
- if (Tok.isNot(AsmToken::String) && !ParseRegister(Reg, IdentLoc, End)) {
+ if (Tok.is(AsmToken::Identifier) && !ParseRegister(Reg, IdentLoc, End)) {
if (SM.onRegister(Reg, ErrMsg))
return Error(Tok.getLoc(), ErrMsg);
break;
@@ -1433,6 +1480,9 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
break;
}
// MS InlineAsm identifier
+ // Call parseIdentifier() to combine @ with the identifier behind it.
+ if (TK == AsmToken::At && Parser.parseIdentifier(Identifier))
+ return Error(IdentLoc, "expected identifier");
if (ParseIntelInlineAsmIdentifier(Val, Identifier, Info, false, End))
return true;
else if (SM.onIdentifierExpr(Val, Identifier, Info, true, ErrMsg))
@@ -1595,7 +1645,7 @@ bool X86AsmParser::ParseIntelInlineAsmIdentifier(const MCExpr *&Val,
//ParseRoundingModeOp - Parse AVX-512 rounding mode operand
std::unique_ptr<X86Operand>
-X86AsmParser::ParseRoundingModeOp(SMLoc Start, SMLoc End) {
+X86AsmParser::ParseRoundingModeOp(SMLoc Start) {
MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
// Eat "{" and mark the current place.
@@ -1616,6 +1666,7 @@ X86AsmParser::ParseRoundingModeOp(SMLoc Start, SMLoc End) {
Parser.Lex(); // Eat the sae
if (!getLexer().is(AsmToken::RCurly))
return ErrorOperand(Tok.getLoc(), "Expected } at this point");
+ SMLoc End = Tok.getEndLoc();
Parser.Lex(); // Eat "}"
const MCExpr *RndModeOp =
MCConstantExpr::create(rndMode, Parser.getContext());
@@ -1760,7 +1811,6 @@ bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) {
.Cases("XMMWORD", "xmmword", 128)
.Cases("YMMWORD", "ymmword", 256)
.Cases("ZMMWORD", "zmmword", 512)
- .Cases("OPAQUE", "opaque", -1U) // needs to be non-zero, but doesn't matter
.Default(0);
if (Size) {
const AsmToken &Tok = Lex(); // Eat operand size (e.g., byte, word).
@@ -1792,9 +1842,8 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
Start = Tok.getLoc();
// Rounding mode operand.
- if (getSTI().getFeatureBits()[X86::FeatureAVX512] &&
- getLexer().is(AsmToken::LCurly))
- return ParseRoundingModeOp(Start, End);
+ if (getLexer().is(AsmToken::LCurly))
+ return ParseRoundingModeOp(Start);
// Register operand.
unsigned RegNo = 0;
@@ -1839,8 +1888,39 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
unsigned IndexReg = SM.getIndexReg();
unsigned Scale = SM.getScale();
+ if (Scale == 0 && BaseReg != X86::ESP && BaseReg != X86::RSP &&
+ (IndexReg == X86::ESP || IndexReg == X86::RSP))
+ std::swap(BaseReg, IndexReg);
+
+ // If BaseReg is a vector register and IndexReg is not, swap them unless
+ // Scale was specified in which case it would be an error.
+ if (Scale == 0 &&
+ !(X86MCRegisterClasses[X86::VR128XRegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::VR256XRegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::VR512RegClassID].contains(IndexReg)) &&
+ (X86MCRegisterClasses[X86::VR128XRegClassID].contains(BaseReg) ||
+ X86MCRegisterClasses[X86::VR256XRegClassID].contains(BaseReg) ||
+ X86MCRegisterClasses[X86::VR512RegClassID].contains(BaseReg)))
+ std::swap(BaseReg, IndexReg);
+
+ if (Scale != 0 &&
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg))
+ return ErrorOperand(Start, "16-bit addresses cannot have a scale");
+
+ // If there was no explicit scale specified, change it to 1.
+ if (Scale == 0)
+ Scale = 1;
+
+  // If this is a 16-bit addressing mode with the base and index in the wrong
+  // order, swap them so CheckBaseRegAndIndexRegAndScale doesn't fail. The
+  // check is shared with AT&T syntax, where operand order matters.
+ if ((BaseReg == X86::SI || BaseReg == X86::DI) &&
+ (IndexReg == X86::BX || IndexReg == X86::BP))
+ std::swap(BaseReg, IndexReg);
+
if ((BaseReg || IndexReg) &&
- CheckBaseRegAndIndexRegAndScale(BaseReg, IndexReg, Scale, ErrMsg))
+ CheckBaseRegAndIndexRegAndScale(BaseReg, IndexReg, Scale, is64BitMode(),
+ ErrMsg))
return ErrorOperand(Start, ErrMsg);
if (isParsingInlineAsm())
return CreateMemForInlineAsm(RegNo, Disp, BaseReg, IndexReg,
@@ -1895,10 +1975,8 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() {
return X86Operand::CreateImm(Val, Start, End);
}
case AsmToken::LCurly:{
- SMLoc Start = Parser.getTok().getLoc(), End;
- if (getSTI().getFeatureBits()[X86::FeatureAVX512])
- return ParseRoundingModeOp(Start, End);
- return ErrorOperand(Start, "Unexpected '{' in expression");
+ SMLoc Start = Parser.getTok().getLoc();
+ return ParseRoundingModeOp(Start);
}
}
}
@@ -1928,82 +2006,80 @@ bool X86AsmParser::ParseZ(std::unique_ptr<X86Operand> &Z,
bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands,
const MCParsedAsmOperand &Op) {
MCAsmParser &Parser = getParser();
- if(getSTI().getFeatureBits()[X86::FeatureAVX512]) {
- if (getLexer().is(AsmToken::LCurly)) {
- // Eat "{" and mark the current place.
- const SMLoc consumedToken = consumeToken();
- // Distinguish {1to<NUM>} from {%k<NUM>}.
- if(getLexer().is(AsmToken::Integer)) {
- // Parse memory broadcasting ({1to<NUM>}).
- if (getLexer().getTok().getIntVal() != 1)
- return TokError("Expected 1to<NUM> at this point");
- Parser.Lex(); // Eat "1" of 1to8
- if (!getLexer().is(AsmToken::Identifier) ||
- !getLexer().getTok().getIdentifier().startswith("to"))
- return TokError("Expected 1to<NUM> at this point");
- // Recognize only reasonable suffixes.
- const char *BroadcastPrimitive =
- StringSwitch<const char*>(getLexer().getTok().getIdentifier())
- .Case("to2", "{1to2}")
- .Case("to4", "{1to4}")
- .Case("to8", "{1to8}")
- .Case("to16", "{1to16}")
- .Default(nullptr);
- if (!BroadcastPrimitive)
- return TokError("Invalid memory broadcast primitive.");
- Parser.Lex(); // Eat "toN" of 1toN
- if (!getLexer().is(AsmToken::RCurly))
- return TokError("Expected } at this point");
- Parser.Lex(); // Eat "}"
- Operands.push_back(X86Operand::CreateToken(BroadcastPrimitive,
- consumedToken));
- // No AVX512 specific primitives can pass
- // after memory broadcasting, so return.
- return false;
- } else {
- // Parse either {k}{z}, {z}{k}, {k} or {z}
- // last one have no meaning, but GCC accepts it
- // Currently, we're just pass a '{' mark
- std::unique_ptr<X86Operand> Z;
- if (ParseZ(Z, consumedToken))
- return true;
- // Reaching here means that parsing of the allegadly '{z}' mark yielded
- // no errors.
- // Query for the need of further parsing for a {%k<NUM>} mark
- if (!Z || getLexer().is(AsmToken::LCurly)) {
- SMLoc StartLoc = Z ? consumeToken() : consumedToken;
- // Parse an op-mask register mark ({%k<NUM>}), which is now to be
- // expected
- unsigned RegNo;
- SMLoc RegLoc;
- if (!ParseRegister(RegNo, RegLoc, StartLoc) &&
- X86MCRegisterClasses[X86::VK1RegClassID].contains(RegNo)) {
- if (RegNo == X86::K0)
- return Error(RegLoc, "Register k0 can't be used as write mask");
- if (!getLexer().is(AsmToken::RCurly))
- return Error(getLexer().getLoc(), "Expected } at this point");
- Operands.push_back(X86Operand::CreateToken("{", StartLoc));
- Operands.push_back(
- X86Operand::CreateReg(RegNo, StartLoc, StartLoc));
- Operands.push_back(X86Operand::CreateToken("}", consumeToken()));
- } else
+ if (getLexer().is(AsmToken::LCurly)) {
+ // Eat "{" and mark the current place.
+ const SMLoc consumedToken = consumeToken();
+ // Distinguish {1to<NUM>} from {%k<NUM>}.
+ if(getLexer().is(AsmToken::Integer)) {
+ // Parse memory broadcasting ({1to<NUM>}).
+ if (getLexer().getTok().getIntVal() != 1)
+ return TokError("Expected 1to<NUM> at this point");
+ Parser.Lex(); // Eat "1" of 1to8
+ if (!getLexer().is(AsmToken::Identifier) ||
+ !getLexer().getTok().getIdentifier().startswith("to"))
+ return TokError("Expected 1to<NUM> at this point");
+ // Recognize only reasonable suffixes.
+ const char *BroadcastPrimitive =
+ StringSwitch<const char*>(getLexer().getTok().getIdentifier())
+ .Case("to2", "{1to2}")
+ .Case("to4", "{1to4}")
+ .Case("to8", "{1to8}")
+ .Case("to16", "{1to16}")
+ .Default(nullptr);
+ if (!BroadcastPrimitive)
+ return TokError("Invalid memory broadcast primitive.");
+ Parser.Lex(); // Eat "toN" of 1toN
+ if (!getLexer().is(AsmToken::RCurly))
+ return TokError("Expected } at this point");
+ Parser.Lex(); // Eat "}"
+ Operands.push_back(X86Operand::CreateToken(BroadcastPrimitive,
+ consumedToken));
+ // No AVX512 specific primitives can pass
+ // after memory broadcasting, so return.
+ return false;
+ } else {
+      // Parse either {k}{z}, {z}{k}, {k} or {z}; the last one has no meaning,
+      // but GCC accepts it.
+      // At this point only the '{' mark has been consumed.
+ std::unique_ptr<X86Operand> Z;
+ if (ParseZ(Z, consumedToken))
+ return true;
+      // Reaching here means that parsing of the alleged '{z}' mark yielded
+      // no errors. Check whether a {%k<NUM>} mark still needs to be parsed.
+ if (!Z || getLexer().is(AsmToken::LCurly)) {
+ SMLoc StartLoc = Z ? consumeToken() : consumedToken;
+ // Parse an op-mask register mark ({%k<NUM>}), which is now to be
+ // expected
+ unsigned RegNo;
+ SMLoc RegLoc;
+ if (!ParseRegister(RegNo, RegLoc, StartLoc) &&
+ X86MCRegisterClasses[X86::VK1RegClassID].contains(RegNo)) {
+ if (RegNo == X86::K0)
+ return Error(RegLoc, "Register k0 can't be used as write mask");
+ if (!getLexer().is(AsmToken::RCurly))
+ return Error(getLexer().getLoc(), "Expected } at this point");
+ Operands.push_back(X86Operand::CreateToken("{", StartLoc));
+ Operands.push_back(
+ X86Operand::CreateReg(RegNo, StartLoc, StartLoc));
+ Operands.push_back(X86Operand::CreateToken("}", consumeToken()));
+ } else
+ return Error(getLexer().getLoc(),
+ "Expected an op-mask register at this point");
+      // A {%k<NUM>} mark was found; now check for a {z} mark.
+ if (getLexer().is(AsmToken::LCurly) && !Z) {
+        // If parsing failed, or the (expected) {z} mark was not found, report
+        // an error.
+ if (ParseZ(Z, consumeToken()) || !Z)
return Error(getLexer().getLoc(),
- "Expected an op-mask register at this point");
- // {%k<NUM>} mark is found, inquire for {z}
- if (getLexer().is(AsmToken::LCurly) && !Z) {
- // Have we've found a parsing error, or found no (expected) {z} mark
- // - report an error
- if (ParseZ(Z, consumeToken()) || !Z)
- return Error(getLexer().getLoc(),
- "Expected a {z} mark at this point");
+ "Expected a {z} mark at this point");
- }
- // '{z}' on its own is meaningless, hence should be ignored.
- // on the contrary - have it been accompanied by a K register,
- // allow it.
- if (Z)
- Operands.push_back(std::move(Z));
}
+      // '{z}' on its own is meaningless and should be ignored.
+      // However, if it is accompanied by a K register, allow it.
+ if (Z)
+ Operands.push_back(std::move(Z));
}
}
}
@@ -2024,6 +2100,9 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
if (getLexer().isNot(AsmToken::LParen)) {
SMLoc ExprEnd;
if (getParser().parseExpression(Disp, ExprEnd)) return nullptr;
+ // Disp may be a variable, handle register values.
+ if (auto *RE = dyn_cast<X86MCExpr>(Disp))
+ return X86Operand::CreateReg(RE->getRegNo(), MemStart, ExprEnd);
// After parsing the base expression we could either have a parenthesized
// memory address or not. If not, return now. If so, eat the (.
@@ -2114,12 +2193,8 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
if (getLexer().isNot(AsmToken::RParen)) {
// Parse the scale amount:
// ::= ',' [scale-expression]
- if (getLexer().isNot(AsmToken::Comma)) {
- Error(Parser.getTok().getLoc(),
- "expected comma in scale expression");
+ if (parseToken(AsmToken::Comma, "expected comma in scale expression"))
return nullptr;
- }
- Parser.Lex(); // Eat the comma.
if (getLexer().isNot(AsmToken::RParen)) {
SMLoc Loc = Parser.getTok().getLoc();
@@ -2160,31 +2235,21 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
}
// Ok, we've eaten the memory operand, verify we have a ')' and eat it too.
- if (getLexer().isNot(AsmToken::RParen)) {
- Error(Parser.getTok().getLoc(), "unexpected token in memory operand");
- return nullptr;
- }
SMLoc MemEnd = Parser.getTok().getEndLoc();
- Parser.Lex(); // Eat the ')'.
-
- // Check for use of invalid 16-bit registers. Only BX/BP/SI/DI are allowed,
- // and then only in non-64-bit modes. Except for DX, which is a special case
- // because an unofficial form of in/out instructions uses it.
- if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) &&
- (is64BitMode() || (BaseReg != X86::BX && BaseReg != X86::BP &&
- BaseReg != X86::SI && BaseReg != X86::DI)) &&
- BaseReg != X86::DX) {
- Error(BaseLoc, "invalid 16-bit base register");
- return nullptr;
- }
- if (BaseReg == 0 &&
- X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg)) {
- Error(IndexLoc, "16-bit memory operand may not include only index register");
+ if (parseToken(AsmToken::RParen, "unexpected token in memory operand"))
return nullptr;
- }
+
+ // This is a terrible hack to handle "out[s]?[bwl]? %al, (%dx)" ->
+ // "outb %al, %dx". Out doesn't take a memory form, but this is a widely
+ // documented form in various unofficial manuals, so a lot of code uses it.
+ if (BaseReg == X86::DX && IndexReg == 0 && Scale == 1 &&
+ SegReg == 0 && isa<MCConstantExpr>(Disp) &&
+ cast<MCConstantExpr>(Disp)->getValue() == 0)
+ return X86Operand::CreateDXReg(BaseLoc, BaseLoc);
StringRef ErrMsg;
- if (CheckBaseRegAndIndexRegAndScale(BaseReg, IndexReg, Scale, ErrMsg)) {
+ if (CheckBaseRegAndIndexRegAndScale(BaseReg, IndexReg, Scale, is64BitMode(),
+ ErrMsg)) {
Error(BaseLoc, ErrMsg);
return nullptr;
}
@@ -2195,6 +2260,25 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
return X86Operand::CreateMem(getPointerWidth(), Disp, MemStart, MemEnd);
}
+// Parse either a standard expression or a register.
+bool X86AsmParser::parseAssignmentExpression(const MCExpr *&Res,
+ SMLoc &EndLoc) {
+ MCAsmParser &Parser = getParser();
+ if (Parser.parseExpression(Res, EndLoc)) {
+ SMLoc StartLoc = Parser.getTok().getLoc();
+ // Normal Expression parse fails, check if it could be a register.
+ unsigned RegNo;
+ if (Parser.getTargetParser().ParseRegister(RegNo, StartLoc, EndLoc))
+ return true;
+ // Clear previous parse error and return correct expression.
+ Parser.clearPendingErrors();
+ Res = X86MCExpr::create(RegNo, Parser.getContext());
+ return false;
+ }
+
+ return false;
+}
+
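A hedged example of the kind of input this override appears to serve (my assumption; the diff only shows the fallback itself): an assembler assignment whose right-hand side is a bare register now yields an X86MCExpr instead of a parse error. The symbol name ptr_reg is made up:

// Hypothetical AT&T-syntax input:
//   ptr_reg = %rbx        # parseExpression() fails on '%rbx'; the override
//                         # retries ParseRegister() and returns
//                         # X86MCExpr::create(X86::RBX, Ctx)
//   mov ptr_reg, %rax     # when the symbol is later used as an operand,
//                         # ParseMemOperand() sees the register value via
//                         # dyn_cast<X86MCExpr>(Disp) and emits a plain
//                         # register operand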
bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) {
MCAsmParser &Parser = getParser();
@@ -2358,23 +2442,31 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
.Cases("acquire", "release", isParsingIntelSyntax())
.Default(false);
- auto isLockRepeatPrefix = [](StringRef N) {
+ auto isLockRepeatNtPrefix = [](StringRef N) {
return StringSwitch<bool>(N)
- .Cases("lock", "rep", "repe", "repz", "repne", "repnz", true)
+ .Cases("lock", "rep", "repe", "repz", "repne", "repnz", "notrack", true)
.Default(false);
};
bool CurlyAsEndOfStatement = false;
unsigned Flags = X86::IP_NO_PREFIX;
- while (isLockRepeatPrefix(Name.lower())) {
+ while (isLockRepeatNtPrefix(Name.lower())) {
unsigned Prefix =
StringSwitch<unsigned>(Name)
.Cases("lock", "lock", X86::IP_HAS_LOCK)
.Cases("rep", "repe", "repz", X86::IP_HAS_REPEAT)
.Cases("repne", "repnz", X86::IP_HAS_REPEAT_NE)
+ .Cases("notrack", "notrack", X86::IP_HAS_NOTRACK)
.Default(X86::IP_NO_PREFIX); // Invalid prefix (impossible)
Flags |= Prefix;
+ if (getLexer().is(AsmToken::EndOfStatement)) {
+      // There is no real instruction following the given prefix, so use the
+      // prefix itself as the instruction.
+      // TODO: there could be several prefixes one after another.
+ Flags = X86::IP_NO_PREFIX;
+ break;
+ }
Name = Parser.getTok().getString();
Parser.Lex(); // eat the prefix
// Hack: we could have something like "rep # some comment" or
@@ -2389,6 +2481,20 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
if (Flags)
PatchedName = Name;
+
+ // Hacks to handle 'data16' and 'data32'
+ if (PatchedName == "data16" && is16BitMode()) {
+ return Error(NameLoc, "redundant data16 prefix");
+ }
+ if (PatchedName == "data32") {
+ if (is32BitMode())
+ return Error(NameLoc, "redundant data32 prefix");
+ if (is64BitMode())
+ return Error(NameLoc, "'data32' is not supported in 64-bit mode");
+    // Hack: map it to 'data16' for the table lookup.
+ PatchedName = "data16";
+ }
+
Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc));
// This does the actual operand parsing. Don't parse any more if we have a
@@ -2423,7 +2529,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
(getLexer().is(AsmToken::LCurly) || getLexer().is(AsmToken::RCurly));
if (getLexer().isNot(AsmToken::EndOfStatement) && !CurlyAsEndOfStatement)
return TokError("unexpected token in argument list");
- }
+ }
// Consume the EndOfStatement or the prefix separator Slash
if (getLexer().is(AsmToken::EndOfStatement) ||
@@ -2479,26 +2585,18 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
Name == "outl" || Name == "outsl" || Name == "out" || Name == "outs") &&
Operands.size() == 3) {
X86Operand &Op = (X86Operand &)*Operands.back();
- if (Op.isMem() && Op.Mem.SegReg == 0 &&
- isa<MCConstantExpr>(Op.Mem.Disp) &&
- cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 &&
- Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) {
- SMLoc Loc = Op.getEndLoc();
- Operands.back() = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc);
- }
+ if (Op.isDXReg())
+ Operands.back() = X86Operand::CreateReg(X86::DX, Op.getStartLoc(),
+ Op.getEndLoc());
}
// Same hack for "in[s]?[bwl]? (%dx), %al" -> "inb %dx, %al".
if ((Name == "inb" || Name == "insb" || Name == "inw" || Name == "insw" ||
Name == "inl" || Name == "insl" || Name == "in" || Name == "ins") &&
Operands.size() == 3) {
X86Operand &Op = (X86Operand &)*Operands[1];
- if (Op.isMem() && Op.Mem.SegReg == 0 &&
- isa<MCConstantExpr>(Op.Mem.Disp) &&
- cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 &&
- Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) {
- SMLoc Loc = Op.getEndLoc();
- Operands[1] = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc);
- }
+ if (Op.isDXReg())
+ Operands[1] = X86Operand::CreateReg(X86::DX, Op.getStartLoc(),
+ Op.getEndLoc());
}
SmallVector<std::unique_ptr<MCParsedAsmOperand>, 2> TmpOperands;
@@ -2703,6 +2801,39 @@ bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) {
"should be distinct");
break;
}
+ case X86::V4FMADDPSrm:
+ case X86::V4FMADDPSrmk:
+ case X86::V4FMADDPSrmkz:
+ case X86::V4FMADDSSrm:
+ case X86::V4FMADDSSrmk:
+ case X86::V4FMADDSSrmkz:
+ case X86::V4FNMADDPSrm:
+ case X86::V4FNMADDPSrmk:
+ case X86::V4FNMADDPSrmkz:
+ case X86::V4FNMADDSSrm:
+ case X86::V4FNMADDSSrmk:
+ case X86::V4FNMADDSSrmkz:
+ case X86::VP4DPWSSDSrm:
+ case X86::VP4DPWSSDSrmk:
+ case X86::VP4DPWSSDSrmkz:
+ case X86::VP4DPWSSDrm:
+ case X86::VP4DPWSSDrmk:
+ case X86::VP4DPWSSDrmkz: {
+ unsigned Src2 = Inst.getOperand(Inst.getNumOperands() -
+ X86::AddrNumOperands - 1).getReg();
+ unsigned Src2Enc = MRI->getEncodingValue(Src2);
+ if (Src2Enc % 4 != 0) {
+ StringRef RegName = X86IntelInstPrinter::getRegisterName(Src2);
+ unsigned GroupStart = (Src2Enc / 4) * 4;
+ unsigned GroupEnd = GroupStart + 3;
+ return Warning(Ops[0]->getStartLoc(),
+ "source register '" + RegName + "' implicitly denotes '" +
+ RegName.take_front(3) + Twine(GroupStart) + "' to '" +
+ RegName.take_front(3) + Twine(GroupEnd) +
+ "' source group");
+ }
+ break;
+ }
}
return false;
@@ -3146,9 +3277,7 @@ bool X86AsmParser::OmitRegisterFromClobberLists(unsigned RegNo) {
bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
MCAsmParser &Parser = getParser();
StringRef IDVal = DirectiveID.getIdentifier();
- if (IDVal == ".word")
- return ParseDirectiveWord(2, DirectiveID.getLoc());
- else if (IDVal.startswith(".code"))
+ if (IDVal.startswith(".code"))
return ParseDirectiveCode(IDVal, DirectiveID.getLoc());
else if (IDVal.startswith(".att_syntax")) {
getParser().setParsingInlineAsm(false);
@@ -3195,10 +3324,9 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
/// parseDirectiveEven
/// ::= .even
bool X86AsmParser::parseDirectiveEven(SMLoc L) {
- if (getLexer().isNot(AsmToken::EndOfStatement)) {
- TokError("unexpected token in directive");
- return false;
- }
+ if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive"))
+ return false;
+
const MCSection *Section = getStreamer().getCurrentSectionOnly();
if (!Section) {
getStreamer().InitSections(false);
@@ -3210,42 +3338,6 @@ bool X86AsmParser::parseDirectiveEven(SMLoc L) {
getStreamer().EmitValueToAlignment(2, 0, 1, 0);
return false;
}
-/// ParseDirectiveWord
-/// ::= .word [ expression (, expression)* ]
-bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
- MCAsmParser &Parser = getParser();
- if (getLexer().isNot(AsmToken::EndOfStatement)) {
- for (;;) {
- const MCExpr *Value;
- SMLoc ExprLoc = getLexer().getLoc();
- if (getParser().parseExpression(Value))
- return false;
-
- if (const auto *MCE = dyn_cast<MCConstantExpr>(Value)) {
- assert(Size <= 8 && "Invalid size");
- uint64_t IntValue = MCE->getValue();
- if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue))
- return Error(ExprLoc, "literal value out of range for directive");
- getStreamer().EmitIntValue(IntValue, Size);
- } else {
- getStreamer().EmitValue(Value, Size, ExprLoc);
- }
-
- if (getLexer().is(AsmToken::EndOfStatement))
- break;
-
- // FIXME: Improve diagnostic.
- if (getLexer().isNot(AsmToken::Comma)) {
- Error(L, "unexpected token in directive");
- return false;
- }
- Parser.Lex();
- }
- }
-
- Parser.Lex();
- return false;
-}
/// ParseDirectiveCode
/// ::= .code16 | .code32 | .code64
diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h
index 43a0561e769b..4d4aae0a1c6a 100644
--- a/lib/Target/X86/AsmParser/X86Operand.h
+++ b/lib/Target/X86/AsmParser/X86Operand.h
@@ -10,6 +10,7 @@
#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H
#define LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H
+#include "InstPrinter/X86IntelInstPrinter.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86AsmParserCommon.h"
#include "llvm/ADT/STLExtras.h"
@@ -28,8 +29,8 @@ namespace llvm {
/// X86Operand - Instances of this class represent a parsed X86 machine
/// instruction.
-struct X86Operand : public MCParsedAsmOperand {
- enum KindTy { Token, Register, Immediate, Memory, Prefix } Kind;
+struct X86Operand final : public MCParsedAsmOperand {
+ enum KindTy { Token, Register, Immediate, Memory, Prefix, DXRegister } Kind;
SMLoc StartLoc, EndLoc;
SMLoc OffsetOfLoc;
@@ -77,7 +78,7 @@ struct X86Operand : public MCParsedAsmOperand {
};
X86Operand(KindTy K, SMLoc Start, SMLoc End)
- : Kind(K), StartLoc(Start), EndLoc(End) {}
+ : Kind(K), StartLoc(Start), EndLoc(End) {}
StringRef getSymName() override { return SymName; }
void *getOpDecl() override { return OpDecl; }
@@ -95,7 +96,55 @@ struct X86Operand : public MCParsedAsmOperand {
/// getOffsetOfLoc - Get the location of the offset operator.
SMLoc getOffsetOfLoc() const override { return OffsetOfLoc; }
- void print(raw_ostream &OS) const override {}
+ void print(raw_ostream &OS) const override {
+
+ auto PrintImmValue = [&](const MCExpr *Val, const char *VName) {
+ if (Val->getKind() == MCExpr::Constant) {
+ if (auto Imm = cast<MCConstantExpr>(Val)->getValue())
+ OS << VName << Imm;
+ } else if (Val->getKind() == MCExpr::SymbolRef) {
+ if (auto *SRE = dyn_cast<MCSymbolRefExpr>(Val)) {
+ const MCSymbol &Sym = SRE->getSymbol();
+ if (auto SymName = Sym.getName().data())
+ OS << VName << SymName;
+ }
+ }
+ };
+
+ switch (Kind) {
+ case Token:
+ OS << Tok.Data;
+ break;
+ case Register:
+ OS << "Reg:" << X86IntelInstPrinter::getRegisterName(Reg.RegNo);
+ break;
+ case DXRegister:
+ OS << "DXReg";
+ break;
+ case Immediate:
+ PrintImmValue(Imm.Val, "Imm:");
+ break;
+ case Prefix:
+ OS << "Prefix:" << Pref.Prefixes;
+ break;
+ case Memory:
+ OS << "Memory: ModeSize=" << Mem.ModeSize;
+ if (Mem.Size)
+ OS << ",Size=" << Mem.Size;
+ if (Mem.BaseReg)
+ OS << ",BaseReg=" << X86IntelInstPrinter::getRegisterName(Mem.BaseReg);
+ if (Mem.IndexReg)
+ OS << ",IndexReg="
+ << X86IntelInstPrinter::getRegisterName(Mem.IndexReg);
+ if (Mem.Scale)
+ OS << ",Scale=" << Mem.Scale;
+ if (Mem.Disp)
+ PrintImmValue(Mem.Disp, ",Disp=");
+ if (Mem.SegReg)
+ OS << ",SegReg=" << X86IntelInstPrinter::getRegisterName(Mem.SegReg);
+ break;
+ }
+ }
StringRef getToken() const {
assert(Kind == Token && "Invalid access!");
@@ -395,6 +444,7 @@ struct X86Operand : public MCParsedAsmOperand {
bool isPrefix() const { return Kind == Prefix; }
bool isReg() const override { return Kind == Register; }
+ bool isDXReg() const { return Kind == DXRegister; }
bool isGR32orGR64() const {
return Kind == Register &&
@@ -415,34 +465,11 @@ struct X86Operand : public MCParsedAsmOperand {
Inst.addOperand(MCOperand::createReg(getReg()));
}
- static unsigned getGR32FromGR64(unsigned RegNo) {
- switch (RegNo) {
- default: llvm_unreachable("Unexpected register");
- case X86::RAX: return X86::EAX;
- case X86::RCX: return X86::ECX;
- case X86::RDX: return X86::EDX;
- case X86::RBX: return X86::EBX;
- case X86::RBP: return X86::EBP;
- case X86::RSP: return X86::ESP;
- case X86::RSI: return X86::ESI;
- case X86::RDI: return X86::EDI;
- case X86::R8: return X86::R8D;
- case X86::R9: return X86::R9D;
- case X86::R10: return X86::R10D;
- case X86::R11: return X86::R11D;
- case X86::R12: return X86::R12D;
- case X86::R13: return X86::R13D;
- case X86::R14: return X86::R14D;
- case X86::R15: return X86::R15D;
- case X86::RIP: return X86::EIP;
- }
- }
-
void addGR32orGR64Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
unsigned RegNo = getReg();
if (X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo))
- RegNo = getGR32FromGR64(RegNo);
+ RegNo = getX86SubSuperRegister(RegNo, 32);
Inst.addOperand(MCOperand::createReg(RegNo));
}
@@ -517,6 +544,11 @@ struct X86Operand : public MCParsedAsmOperand {
}
static std::unique_ptr<X86Operand>
+ CreateDXReg(SMLoc StartLoc, SMLoc EndLoc) {
+ return llvm::make_unique<X86Operand>(DXRegister, StartLoc, EndLoc);
+ }
+
+ static std::unique_ptr<X86Operand>
CreatePrefix(unsigned Prefixes, SMLoc StartLoc, SMLoc EndLoc) {
auto Res = llvm::make_unique<X86Operand>(Prefix, StartLoc, EndLoc);
Res->Pref.Prefixes = Prefixes;
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index 7e0df2941467..0dbc82bc7666 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -1,18 +1,18 @@
set(LLVM_TARGET_DEFINITIONS X86.td)
-tablegen(LLVM X86GenRegisterInfo.inc -gen-register-info)
-tablegen(LLVM X86GenDisassemblerTables.inc -gen-disassembler)
-tablegen(LLVM X86GenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM X86GenAsmMatcher.inc -gen-asm-matcher)
tablegen(LLVM X86GenAsmWriter.inc -gen-asm-writer)
tablegen(LLVM X86GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1)
-tablegen(LLVM X86GenAsmMatcher.inc -gen-asm-matcher)
-tablegen(LLVM X86GenDAGISel.inc -gen-dag-isel)
-tablegen(LLVM X86GenFastISel.inc -gen-fast-isel)
tablegen(LLVM X86GenCallingConv.inc -gen-callingconv)
-tablegen(LLVM X86GenSubtargetInfo.inc -gen-subtarget)
+tablegen(LLVM X86GenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM X86GenDisassemblerTables.inc -gen-disassembler)
tablegen(LLVM X86GenEVEX2VEXTables.inc -gen-x86-EVEX2VEX-tables)
-tablegen(LLVM X86GenRegisterBank.inc -gen-register-bank)
+tablegen(LLVM X86GenFastISel.inc -gen-fast-isel)
tablegen(LLVM X86GenGlobalISel.inc -gen-global-isel)
+tablegen(LLVM X86GenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM X86GenRegisterBank.inc -gen-register-bank)
+tablegen(LLVM X86GenRegisterInfo.inc -gen-register-info)
+tablegen(LLVM X86GenSubtargetInfo.inc -gen-subtarget)
if (X86_GEN_FOLD_TABLES)
tablegen(LLVM X86GenFoldTables.inc -gen-x86-fold-tables)
@@ -21,8 +21,10 @@ endif()
add_public_tablegen_target(X86CommonTableGen)
set(sources
+ ShadowCallStack.cpp
X86AsmPrinter.cpp
X86CallFrameOptimization.cpp
+ X86CallingConv.cpp
X86CallLowering.cpp
X86CmovConversion.cpp
X86DomainReassignment.cpp
@@ -30,14 +32,18 @@ set(sources
X86FastISel.cpp
X86FixupBWInsts.cpp
X86FixupLEAs.cpp
+ X86AvoidStoreForwardingBlocks.cpp
X86FixupSetCC.cpp
+ X86FlagsCopyLowering.cpp
X86FloatingPoint.cpp
X86FrameLowering.cpp
X86InstructionSelector.cpp
X86ISelDAGToDAG.cpp
X86ISelLowering.cpp
+ X86IndirectBranchTracking.cpp
X86InterleavedAccess.cpp
X86InstrFMA3Info.cpp
+ X86InstrFoldTables.cpp
X86InstrInfo.cpp
X86EvexToVex.cpp
X86LegalizerInfo.cpp
@@ -48,8 +54,10 @@ set(sources
X86PadShortFunction.cpp
X86RegisterBankInfo.cpp
X86RegisterInfo.cpp
+ X86RetpolineThunks.cpp
X86SelectionDAGInfo.cpp
X86ShuffleDecodeConstantPool.cpp
+ X86SpeculativeLoadHardening.cpp
X86Subtarget.cpp
X86TargetMachine.cpp
X86TargetObjectFile.cpp
@@ -57,7 +65,6 @@ set(sources
X86VZeroUpper.cpp
X86WinAllocaExpander.cpp
X86WinEHState.cpp
- X86CallingConv.cpp
)
add_llvm_target(X86CodeGen ${sources})
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp
index c58254ae38c1..62312777318e 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -103,7 +103,7 @@ StringRef llvm::X86Disassembler::GetInstrName(unsigned Opcode,
return MII->getName(Opcode);
}
-#define debug(s) DEBUG(Debug(__FILE__, __LINE__, s));
+#define debug(s) LLVM_DEBUG(Debug(__FILE__, __LINE__, s));
namespace llvm {
@@ -247,6 +247,8 @@ MCDisassembler::DecodeStatus X86GenericDisassembler::getInstruction(
// It should not be 'pause' f3 90
InternalInstr.opcode != 0x90)
Flags |= X86::IP_HAS_REPEAT;
+ if (InternalInstr.hasLockPrefix)
+ Flags |= X86::IP_HAS_LOCK;
}
Instr.setFlags(Flags);
}
@@ -265,13 +267,10 @@ MCDisassembler::DecodeStatus X86GenericDisassembler::getInstruction(
/// @param reg - The Reg to append.
static void translateRegister(MCInst &mcInst, Reg reg) {
#define ENTRY(x) X86::x,
- uint8_t llvmRegnums[] = {
- ALL_REGS
- 0
- };
+ static constexpr MCPhysReg llvmRegnums[] = {ALL_REGS};
#undef ENTRY
- uint8_t llvmRegnum = llvmRegnums[reg];
+ MCPhysReg llvmRegnum = llvmRegnums[reg];
mcInst.addOperand(MCOperand::createReg(llvmRegnum));
}
@@ -664,8 +663,6 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
case TYPE_ZMM:
mcInst.addOperand(MCOperand::createReg(X86::ZMM0 + (immediate >> 4)));
return;
- case TYPE_BNDR:
- mcInst.addOperand(MCOperand::createReg(X86::BND0 + (immediate >> 4)));
default:
// operand is 64 bits wide. Do nothing.
break;
@@ -761,7 +758,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
#undef ENTRY
}
} else {
- baseReg = MCOperand::createReg(0);
+ baseReg = MCOperand::createReg(X86::NoRegister);
}
if (insn.sibIndex != SIB_INDEX_NONE) {
@@ -780,7 +777,22 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
#undef ENTRY
}
} else {
- indexReg = MCOperand::createReg(0);
+ // Use EIZ/RIZ for a few ambiguous cases where the SIB byte is present,
+      // but no index is used and modrm alone should have been enough:
+      //  - No base register in 32-bit mode. In 64-bit mode this is used to
+      //    avoid rip-relative addressing.
+      //  - Any base register used other than ESP/RSP/R12D/R12. Using these as
+      //    a base always requires a SIB byte.
+      //  - A scale other than 1 is used.
+ if (insn.sibScale != 1 ||
+ (insn.sibBase == SIB_BASE_NONE && insn.mode != MODE_64BIT) ||
+ (insn.sibBase != SIB_BASE_NONE &&
+ insn.sibBase != SIB_BASE_ESP && insn.sibBase != SIB_BASE_RSP &&
+ insn.sibBase != SIB_BASE_R12D && insn.sibBase != SIB_BASE_R12)) {
+ indexReg = MCOperand::createReg(insn.addressSize == 4 ? X86::EIZ :
+ X86::RIZ);
+ } else
+ indexReg = MCOperand::createReg(X86::NoRegister);
}
scaleAmount = MCOperand::createImm(insn.sibScale);
@@ -797,12 +809,14 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
tryAddingPcLoadReferenceComment(insn.startLocation +
insn.displacementOffset,
insn.displacement + pcrel, Dis);
- baseReg = MCOperand::createReg(X86::RIP); // Section 2.2.1.6
+ // Section 2.2.1.6
+ baseReg = MCOperand::createReg(insn.addressSize == 4 ? X86::EIP :
+ X86::RIP);
}
else
- baseReg = MCOperand::createReg(0);
+ baseReg = MCOperand::createReg(X86::NoRegister);
- indexReg = MCOperand::createReg(0);
+ indexReg = MCOperand::createReg(X86::NoRegister);
break;
case EA_BASE_BX_SI:
baseReg = MCOperand::createReg(X86::BX);
@@ -821,7 +835,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
indexReg = MCOperand::createReg(X86::DI);
break;
default:
- indexReg = MCOperand::createReg(0);
+ indexReg = MCOperand::createReg(X86::NoRegister);
switch (insn.eaBase) {
default:
debug("Unexpected eaBase");
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
index 843d037ad3cd..1ac304f3be03 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
@@ -103,6 +103,9 @@ static int modRMRequired(OpcodeType type,
case XOPA_MAP:
decision = &XOPA_MAP_SYM;
break;
+ case THREEDNOW_MAP:
+ decision = &THREEDNOW_MAP_SYM;
+ break;
}
return decision->opcodeDecisions[insnContext].modRMDecisions[opcode].
@@ -147,6 +150,9 @@ static InstrUID decode(OpcodeType type,
case XOPA_MAP:
dec = &XOPA_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
break;
+ case THREEDNOW_MAP:
+ dec = &THREEDNOW_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
}
switch (dec->modrm_type) {
@@ -292,6 +298,9 @@ static bool isREX(struct InternalInstruction *insn, uint8_t prefix) {
static void setPrefixPresent(struct InternalInstruction *insn, uint8_t prefix) {
uint8_t nextByte;
switch (prefix) {
+ case 0xf0:
+ insn->hasLockPrefix = true;
+ break;
case 0xf2:
case 0xf3:
if (lookAtByte(insn, &nextByte))
@@ -623,6 +632,8 @@ static int readPrefixes(struct InternalInstruction* insn) {
return 0;
}
+static int readModRM(struct InternalInstruction* insn);
+
/*
* readOpcode - Reads the opcode (excepting the ModR/M byte in the case of
* extended or escape opcodes).
@@ -715,6 +726,17 @@ static int readOpcode(struct InternalInstruction* insn) {
return -1;
insn->opcodeType = THREEBYTE_3A;
+ } else if (current == 0x0f) {
+ dbgprintf(insn, "Found a 3dnow escape prefix (0x%hhx)", current);
+
+ // Consume operands before the opcode to comply with the 3DNow encoding
+ if (readModRM(insn))
+ return -1;
+
+ if (consumeByte(insn, &current))
+ return -1;
+
+ insn->opcodeType = THREEDNOW_MAP;
} else {
dbgprintf(insn, "Didn't find a three-byte escape prefix");
@@ -735,8 +757,6 @@ static int readOpcode(struct InternalInstruction* insn) {
return 0;
}
-static int readModRM(struct InternalInstruction* insn);
-
/*
* getIDWithAttrMask - Determines the ID of an instruction, consuming
* the ModR/M byte as appropriate for extended and escape opcodes,
@@ -947,6 +967,7 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
attrMask |= ATTR_ADSIZE;
break;
}
+
}
if (insn->rexPrefix & 0x08) {
@@ -1039,13 +1060,15 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
}
/*
- * Absolute moves need special handling.
+ * Absolute moves, umonitor, and movdir64b need special handling.
 * -For 16-bit mode, because the meanings of the AdSize and OpSize prefixes
 *  are inverted w.r.t. 32-bit mode.
* -For 32-bit mode we need to ensure the ADSIZE prefix is observed in
* any position.
*/
- if (insn->opcodeType == ONEBYTE && ((insn->opcode & 0xFC) == 0xA0)) {
+ if ((insn->opcodeType == ONEBYTE && ((insn->opcode & 0xFC) == 0xA0)) ||
+ (insn->opcodeType == TWOBYTE && (insn->opcode == 0xAE)) ||
+ (insn->opcodeType == THREEBYTE_38 && insn->opcode == 0xF8)) {
/* Make sure we observed the prefixes in any position. */
if (insn->hasAdSize)
attrMask |= ATTR_ADSIZE;
@@ -1053,8 +1076,13 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
attrMask |= ATTR_OPSIZE;
/* In 16-bit, invert the attributes. */
- if (insn->mode == MODE_16BIT)
- attrMask ^= ATTR_ADSIZE | ATTR_OPSIZE;
+ if (insn->mode == MODE_16BIT) {
+ attrMask ^= ATTR_ADSIZE;
+
+ /* The OpSize attribute is only valid with the absolute moves. */
+ if (insn->opcodeType == ONEBYTE && ((insn->opcode & 0xFC) == 0xA0))
+ attrMask ^= ATTR_OPSIZE;
+ }
if (getIDWithAttrMask(&instructionID, insn, attrMask))
return -1;
@@ -1279,7 +1307,7 @@ static int readDisplacement(struct InternalInstruction* insn) {
* @return - 0 if the information was successfully read; nonzero otherwise.
*/
static int readModRM(struct InternalInstruction* insn) {
- uint8_t mod, rm, reg;
+ uint8_t mod, rm, reg, evexrm;
dbgprintf(insn, "readModRM()");
@@ -1316,16 +1344,18 @@ static int readModRM(struct InternalInstruction* insn) {
reg |= rFromREX(insn->rexPrefix) << 3;
rm |= bFromREX(insn->rexPrefix) << 3;
- if (insn->vectorExtensionType == TYPE_EVEX) {
+
+ evexrm = 0;
+ if (insn->vectorExtensionType == TYPE_EVEX && insn->mode == MODE_64BIT) {
reg |= r2FromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4;
- rm |= xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4;
+ evexrm = xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4;
}
insn->reg = (Reg)(insn->regBase + reg);
switch (insn->addressSize) {
- case 2:
- insn->eaBaseBase = EA_BASE_BX_SI;
+ case 2: {
+ EABase eaBaseBase = EA_BASE_BX_SI;
switch (mod) {
case 0x0:
@@ -1335,19 +1365,19 @@ static int readModRM(struct InternalInstruction* insn) {
if (readDisplacement(insn))
return -1;
} else {
- insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+ insn->eaBase = (EABase)(eaBaseBase + rm);
insn->eaDisplacement = EA_DISP_NONE;
}
break;
case 0x1:
- insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+ insn->eaBase = (EABase)(eaBaseBase + rm);
insn->eaDisplacement = EA_DISP_8;
insn->displacementSize = 1;
if (readDisplacement(insn))
return -1;
break;
case 0x2:
- insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+ insn->eaBase = (EABase)(eaBaseBase + rm);
insn->eaDisplacement = EA_DISP_16;
if (readDisplacement(insn))
return -1;
@@ -1359,9 +1389,10 @@ static int readModRM(struct InternalInstruction* insn) {
break;
}
break;
+ }
case 4:
- case 8:
- insn->eaBaseBase = (insn->addressSize == 4 ? EA_BASE_EAX : EA_BASE_RAX);
+ case 8: {
+ EABase eaBaseBase = (insn->addressSize == 4 ? EA_BASE_EAX : EA_BASE_RAX);
switch (mod) {
case 0x0:
@@ -1383,7 +1414,7 @@ static int readModRM(struct InternalInstruction* insn) {
return -1;
break;
default:
- insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+ insn->eaBase = (EABase)(eaBaseBase + rm);
break;
}
break;
@@ -1399,7 +1430,7 @@ static int readModRM(struct InternalInstruction* insn) {
return -1;
break;
default:
- insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+ insn->eaBase = (EABase)(eaBaseBase + rm);
if (readDisplacement(insn))
return -1;
break;
@@ -1407,16 +1438,17 @@ static int readModRM(struct InternalInstruction* insn) {
break;
case 0x3:
insn->eaDisplacement = EA_DISP_NONE;
- insn->eaBase = (EABase)(insn->eaRegBase + rm);
+ insn->eaBase = (EABase)(insn->eaRegBase + rm + evexrm);
break;
}
break;
+ }
} /* switch (insn->addressSize) */
return 0;
}
-#define GENERIC_FIXUP_FUNC(name, base, prefix) \
+#define GENERIC_FIXUP_FUNC(name, base, prefix, mask) \
static uint16_t name(struct InternalInstruction *insn, \
OperandType type, \
uint8_t index, \
@@ -1430,6 +1462,9 @@ static int readModRM(struct InternalInstruction* insn) {
case TYPE_Rv: \
return base + index; \
case TYPE_R8: \
+ index &= mask; \
+ if (index > 0xf) \
+ *valid = 0; \
if (insn->rexPrefix && \
index >= 4 && index <= 7) { \
return prefix##_SPL + (index - 4); \
@@ -1437,10 +1472,19 @@ static int readModRM(struct InternalInstruction* insn) {
return prefix##_AL + index; \
} \
case TYPE_R16: \
+ index &= mask; \
+ if (index > 0xf) \
+ *valid = 0; \
return prefix##_AX + index; \
case TYPE_R32: \
+ index &= mask; \
+ if (index > 0xf) \
+ *valid = 0; \
return prefix##_EAX + index; \
case TYPE_R64: \
+ index &= mask; \
+ if (index > 0xf) \
+ *valid = 0; \
return prefix##_RAX + index; \
case TYPE_ZMM: \
return prefix##_ZMM0 + index; \
@@ -1449,6 +1493,7 @@ static int readModRM(struct InternalInstruction* insn) {
case TYPE_XMM: \
return prefix##_XMM0 + index; \
case TYPE_VK: \
+ index &= 0xf; \
if (index > 7) \
*valid = 0; \
return prefix##_K0 + index; \
@@ -1488,8 +1533,8 @@ static int readModRM(struct InternalInstruction* insn) {
* field is valid for the register class; 0 if not.
* @return - The proper value.
*/
-GENERIC_FIXUP_FUNC(fixupRegValue, insn->regBase, MODRM_REG)
-GENERIC_FIXUP_FUNC(fixupRMValue, insn->eaRegBase, EA_REG)
+GENERIC_FIXUP_FUNC(fixupRegValue, insn->regBase, MODRM_REG, 0x1f)
+GENERIC_FIXUP_FUNC(fixupRMValue, insn->eaRegBase, EA_REG, 0xf)
/*
* fixupReg - Consults an operand specifier to determine which of the
@@ -1670,7 +1715,7 @@ static int readVVVV(struct InternalInstruction* insn) {
return -1;
if (insn->mode != MODE_64BIT)
- vvvv &= 0x7;
+ vvvv &= 0xf; // Can only clear bit 4. Bit 3 must be cleared later.
insn->vvvv = static_cast<Reg>(vvvv);
return 0;
@@ -1731,10 +1776,10 @@ static int readOperands(struct InternalInstruction* insn) {
// If sibIndex was set to SIB_INDEX_NONE, index offset is 4.
if (insn->sibIndex == SIB_INDEX_NONE)
- insn->sibIndex = (SIBIndex)4;
+ insn->sibIndex = (SIBIndex)(insn->sibIndexBase + 4);
// If EVEX.v2 is set this is one of the 16-31 registers.
- if (insn->vectorExtensionType == TYPE_EVEX &&
+ if (insn->vectorExtensionType == TYPE_EVEX && insn->mode == MODE_64BIT &&
v2FromEVEX4of4(insn->vectorExtensionPrefix[3]))
insn->sibIndex = (SIBIndex)(insn->sibIndex + 16);
@@ -1835,6 +1880,8 @@ static int readOperands(struct InternalInstruction* insn) {
needVVVV = 0; /* Mark that we have found a VVVV operand. */
if (!hasVVVV)
return -1;
+ if (insn->mode != MODE_64BIT)
+ insn->vvvv = static_cast<Reg>(insn->vvvv & 0x7);
if (fixupReg(insn, &Op))
return -1;
break;
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index ecd9d8dccafa..3b8a4f732eed 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -16,8 +16,8 @@
#ifndef LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODER_H
#define LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODER_H
-#include "X86DisassemblerDecoderCommon.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/X86DisassemblerDecoderCommon.h"
namespace llvm {
namespace X86Disassembler {
@@ -400,7 +400,7 @@ namespace X86Disassembler {
REGS_BOUND \
ENTRY(RIP)
-/// \brief All possible values of the base field for effective-address
+/// All possible values of the base field for effective-address
/// computations, a.k.a. the Mod and R/M fields of the ModR/M byte.
/// We distinguish between bases (EA_BASE_*) and registers that just happen
/// to be referred to when Mod == 0b11 (EA_REG_*).
@@ -415,7 +415,7 @@ enum EABase {
EA_max
};
-/// \brief All possible values of the SIB index field.
+/// All possible values of the SIB index field.
/// borrows entries from ALL_EA_BASES with the special case that
/// sib is synonymous with NONE.
/// Vector SIB: index can be XMM or YMM.
@@ -430,7 +430,7 @@ enum SIBIndex {
SIB_INDEX_max
};
-/// \brief All possible values of the SIB base field.
+/// All possible values of the SIB base field.
enum SIBBase {
SIB_BASE_NONE,
#define ENTRY(x) SIB_BASE_##x,
@@ -439,7 +439,7 @@ enum SIBBase {
SIB_BASE_max
};
-/// \brief Possible displacement types for effective-address computations.
+/// Possible displacement types for effective-address computations.
typedef enum {
EA_DISP_NONE,
EA_DISP_8,
@@ -447,7 +447,7 @@ typedef enum {
EA_DISP_32
} EADisplacement;
-/// \brief All possible values of the reg field in the ModR/M byte.
+/// All possible values of the reg field in the ModR/M byte.
enum Reg {
#define ENTRY(x) MODRM_REG_##x,
ALL_REGS
@@ -455,7 +455,7 @@ enum Reg {
MODRM_REG_max
};
-/// \brief All possible segment overrides.
+/// All possible segment overrides.
enum SegmentOverride {
SEG_OVERRIDE_NONE,
SEG_OVERRIDE_CS,
@@ -467,7 +467,7 @@ enum SegmentOverride {
SEG_OVERRIDE_max
};
-/// \brief Possible values for the VEX.m-mmmm field
+/// Possible values for the VEX.m-mmmm field
enum VEXLeadingOpcodeByte {
VEX_LOB_0F = 0x1,
VEX_LOB_0F38 = 0x2,
@@ -480,7 +480,7 @@ enum XOPMapSelect {
XOP_MAP_SELECT_A = 0xA
};
-/// \brief Possible values for the VEX.pp/EVEX.pp field
+/// Possible values for the VEX.pp/EVEX.pp field
enum VEXPrefixCode {
VEX_PREFIX_NONE = 0x0,
VEX_PREFIX_66 = 0x1,
@@ -496,7 +496,7 @@ enum VectorExtensionType {
TYPE_XOP = 0x4
};
-/// \brief Type for the byte reader that the consumer must provide to
+/// Type for the byte reader that the consumer must provide to
/// the decoder. Reads a single byte from the instruction's address space.
/// \param arg A baton that the consumer can associate with any internal
/// state that it needs.
@@ -507,7 +507,7 @@ enum VectorExtensionType {
/// \return -1 if the byte cannot be read for any reason; 0 otherwise.
typedef int (*byteReader_t)(const void *arg, uint8_t *byte, uint64_t address);
-/// \brief Type for the logging function that the consumer can provide to
+/// Type for the logging function that the consumer can provide to
/// get debugging output from the decoder.
/// \param arg A baton that the consumer can associate with any internal
/// state that it needs.
@@ -563,6 +563,8 @@ struct InternalInstruction {
bool hasAdSize;
// Operand-size override
bool hasOpSize;
+ // Lock prefix
+ bool hasLockPrefix;
// The repeat prefix if any
uint8_t repeatPrefix;
@@ -627,7 +629,6 @@ struct InternalInstruction {
// These fields determine the allowable values for the ModR/M fields, which
// depend on operand and address widths.
- EABase eaBaseBase;
EABase eaRegBase;
Reg regBase;
@@ -650,7 +651,7 @@ struct InternalInstruction {
ArrayRef<OperandSpecifier> operands;
};
-/// \brief Decode one instruction and store the decoding results in
+/// Decode one instruction and store the decoding results in
/// a buffer provided by the consumer.
/// \param insn The buffer to store the instruction in. Allocated by the
/// consumer.
@@ -674,7 +675,7 @@ int decodeInstruction(InternalInstruction *insn,
uint64_t startLoc,
DisassemblerMode mode);
-/// \brief Print a message to debugs()
+/// Print a message to debugs()
/// \param file The name of the file printing the debug message.
/// \param line The line number that printed the debug message.
/// \param s The message to print.
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
deleted file mode 100644
index ad1404860fb6..000000000000
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
+++ /dev/null
@@ -1,466 +0,0 @@
-//===-- X86DisassemblerDecoderCommon.h - Disassembler decoder ---*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is part of the X86 Disassembler.
-// It contains common definitions used by both the disassembler and the table
-// generator.
-// Documentation for the disassembler can be found in X86Disassembler.h.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODERCOMMON_H
-#define LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODERCOMMON_H
-
-#include "llvm/Support/DataTypes.h"
-
-namespace llvm {
-namespace X86Disassembler {
-
-#define INSTRUCTIONS_SYM x86DisassemblerInstrSpecifiers
-#define CONTEXTS_SYM x86DisassemblerContexts
-#define ONEBYTE_SYM x86DisassemblerOneByteOpcodes
-#define TWOBYTE_SYM x86DisassemblerTwoByteOpcodes
-#define THREEBYTE38_SYM x86DisassemblerThreeByte38Opcodes
-#define THREEBYTE3A_SYM x86DisassemblerThreeByte3AOpcodes
-#define XOP8_MAP_SYM x86DisassemblerXOP8Opcodes
-#define XOP9_MAP_SYM x86DisassemblerXOP9Opcodes
-#define XOPA_MAP_SYM x86DisassemblerXOPAOpcodes
-
-#define INSTRUCTIONS_STR "x86DisassemblerInstrSpecifiers"
-#define CONTEXTS_STR "x86DisassemblerContexts"
-#define ONEBYTE_STR "x86DisassemblerOneByteOpcodes"
-#define TWOBYTE_STR "x86DisassemblerTwoByteOpcodes"
-#define THREEBYTE38_STR "x86DisassemblerThreeByte38Opcodes"
-#define THREEBYTE3A_STR "x86DisassemblerThreeByte3AOpcodes"
-#define XOP8_MAP_STR "x86DisassemblerXOP8Opcodes"
-#define XOP9_MAP_STR "x86DisassemblerXOP9Opcodes"
-#define XOPA_MAP_STR "x86DisassemblerXOPAOpcodes"
-
-// Attributes of an instruction that must be known before the opcode can be
-// processed correctly. Most of these indicate the presence of particular
-// prefixes, but ATTR_64BIT is simply an attribute of the decoding context.
-#define ATTRIBUTE_BITS \
- ENUM_ENTRY(ATTR_NONE, 0x00) \
- ENUM_ENTRY(ATTR_64BIT, (0x1 << 0)) \
- ENUM_ENTRY(ATTR_XS, (0x1 << 1)) \
- ENUM_ENTRY(ATTR_XD, (0x1 << 2)) \
- ENUM_ENTRY(ATTR_REXW, (0x1 << 3)) \
- ENUM_ENTRY(ATTR_OPSIZE, (0x1 << 4)) \
- ENUM_ENTRY(ATTR_ADSIZE, (0x1 << 5)) \
- ENUM_ENTRY(ATTR_VEX, (0x1 << 6)) \
- ENUM_ENTRY(ATTR_VEXL, (0x1 << 7)) \
- ENUM_ENTRY(ATTR_EVEX, (0x1 << 8)) \
- ENUM_ENTRY(ATTR_EVEXL, (0x1 << 9)) \
- ENUM_ENTRY(ATTR_EVEXL2, (0x1 << 10)) \
- ENUM_ENTRY(ATTR_EVEXK, (0x1 << 11)) \
- ENUM_ENTRY(ATTR_EVEXKZ, (0x1 << 12)) \
- ENUM_ENTRY(ATTR_EVEXB, (0x1 << 13))
-
-#define ENUM_ENTRY(n, v) n = v,
-enum attributeBits {
- ATTRIBUTE_BITS
- ATTR_max
-};
-#undef ENUM_ENTRY
-
-// Combinations of the above attributes that are relevant to instruction
-// decode. Although other combinations are possible, they can be reduced to
-// these without affecting the ultimately decoded instruction.
-
-// Class name Rank Rationale for rank assignment
-#define INSTRUCTION_CONTEXTS \
- ENUM_ENTRY(IC, 0, "says nothing about the instruction") \
- ENUM_ENTRY(IC_64BIT, 1, "says the instruction applies in " \
- "64-bit mode but no more") \
- ENUM_ENTRY(IC_OPSIZE, 3, "requires an OPSIZE prefix, so " \
- "operands change width") \
- ENUM_ENTRY(IC_ADSIZE, 3, "requires an ADSIZE prefix, so " \
- "operands change width") \
- ENUM_ENTRY(IC_OPSIZE_ADSIZE, 4, "requires ADSIZE and OPSIZE prefixes") \
- ENUM_ENTRY(IC_XD, 2, "may say something about the opcode " \
- "but not the operands") \
- ENUM_ENTRY(IC_XS, 2, "may say something about the opcode " \
- "but not the operands") \
- ENUM_ENTRY(IC_XD_OPSIZE, 3, "requires an OPSIZE prefix, so " \
- "operands change width") \
- ENUM_ENTRY(IC_XS_OPSIZE, 3, "requires an OPSIZE prefix, so " \
- "operands change width") \
- ENUM_ENTRY(IC_64BIT_REXW, 5, "requires a REX.W prefix, so operands "\
- "change width; overrides IC_OPSIZE") \
- ENUM_ENTRY(IC_64BIT_REXW_ADSIZE, 6, "requires a REX.W prefix and 0x67 " \
- "prefix") \
- ENUM_ENTRY(IC_64BIT_OPSIZE, 3, "Just as meaningful as IC_OPSIZE") \
- ENUM_ENTRY(IC_64BIT_ADSIZE, 3, "Just as meaningful as IC_ADSIZE") \
- ENUM_ENTRY(IC_64BIT_OPSIZE_ADSIZE, 4, "Just as meaningful as IC_OPSIZE/" \
- "IC_ADSIZE") \
- ENUM_ENTRY(IC_64BIT_XD, 6, "XD instructions are SSE; REX.W is " \
- "secondary") \
- ENUM_ENTRY(IC_64BIT_XS, 6, "Just as meaningful as IC_64BIT_XD") \
- ENUM_ENTRY(IC_64BIT_XD_OPSIZE, 3, "Just as meaningful as IC_XD_OPSIZE") \
- ENUM_ENTRY(IC_64BIT_XS_OPSIZE, 3, "Just as meaningful as IC_XS_OPSIZE") \
- ENUM_ENTRY(IC_64BIT_REXW_XS, 7, "OPSIZE could mean a different " \
- "opcode") \
- ENUM_ENTRY(IC_64BIT_REXW_XD, 7, "Just as meaningful as " \
- "IC_64BIT_REXW_XS") \
- ENUM_ENTRY(IC_64BIT_REXW_OPSIZE, 8, "The Dynamic Duo! Prefer over all " \
- "else because this changes most " \
- "operands' meaning") \
- ENUM_ENTRY(IC_VEX, 1, "requires a VEX prefix") \
- ENUM_ENTRY(IC_VEX_XS, 2, "requires VEX and the XS prefix") \
- ENUM_ENTRY(IC_VEX_XD, 2, "requires VEX and the XD prefix") \
- ENUM_ENTRY(IC_VEX_OPSIZE, 2, "requires VEX and the OpSize prefix") \
- ENUM_ENTRY(IC_VEX_W, 3, "requires VEX and the W prefix") \
- ENUM_ENTRY(IC_VEX_W_XS, 4, "requires VEX, W, and XS prefix") \
- ENUM_ENTRY(IC_VEX_W_XD, 4, "requires VEX, W, and XD prefix") \
- ENUM_ENTRY(IC_VEX_W_OPSIZE, 4, "requires VEX, W, and OpSize") \
- ENUM_ENTRY(IC_VEX_L, 3, "requires VEX and the L prefix") \
- ENUM_ENTRY(IC_VEX_L_XS, 4, "requires VEX and the L and XS prefix")\
- ENUM_ENTRY(IC_VEX_L_XD, 4, "requires VEX and the L and XD prefix")\
- ENUM_ENTRY(IC_VEX_L_OPSIZE, 4, "requires VEX, L, and OpSize") \
- ENUM_ENTRY(IC_VEX_L_W, 4, "requires VEX, L and W") \
- ENUM_ENTRY(IC_VEX_L_W_XS, 5, "requires VEX, L, W and XS prefix") \
- ENUM_ENTRY(IC_VEX_L_W_XD, 5, "requires VEX, L, W and XD prefix") \
- ENUM_ENTRY(IC_VEX_L_W_OPSIZE, 5, "requires VEX, L, W and OpSize") \
- ENUM_ENTRY(IC_EVEX, 1, "requires an EVEX prefix") \
- ENUM_ENTRY(IC_EVEX_XS, 2, "requires EVEX and the XS prefix") \
- ENUM_ENTRY(IC_EVEX_XD, 2, "requires EVEX and the XD prefix") \
- ENUM_ENTRY(IC_EVEX_OPSIZE, 2, "requires EVEX and the OpSize prefix") \
- ENUM_ENTRY(IC_EVEX_W, 3, "requires EVEX and the W prefix") \
- ENUM_ENTRY(IC_EVEX_W_XS, 4, "requires EVEX, W, and XS prefix") \
- ENUM_ENTRY(IC_EVEX_W_XD, 4, "requires EVEX, W, and XD prefix") \
- ENUM_ENTRY(IC_EVEX_W_OPSIZE, 4, "requires EVEX, W, and OpSize") \
- ENUM_ENTRY(IC_EVEX_L, 3, "requires EVEX and the L prefix") \
- ENUM_ENTRY(IC_EVEX_L_XS, 4, "requires EVEX and the L and XS prefix")\
- ENUM_ENTRY(IC_EVEX_L_XD, 4, "requires EVEX and the L and XD prefix")\
- ENUM_ENTRY(IC_EVEX_L_OPSIZE, 4, "requires EVEX, L, and OpSize") \
- ENUM_ENTRY(IC_EVEX_L_W, 3, "requires EVEX, L and W") \
- ENUM_ENTRY(IC_EVEX_L_W_XS, 4, "requires EVEX, L, W and XS prefix") \
- ENUM_ENTRY(IC_EVEX_L_W_XD, 4, "requires EVEX, L, W and XD prefix") \
- ENUM_ENTRY(IC_EVEX_L_W_OPSIZE, 4, "requires EVEX, L, W and OpSize") \
- ENUM_ENTRY(IC_EVEX_L2, 3, "requires EVEX and the L2 prefix") \
- ENUM_ENTRY(IC_EVEX_L2_XS, 4, "requires EVEX and the L2 and XS prefix")\
- ENUM_ENTRY(IC_EVEX_L2_XD, 4, "requires EVEX and the L2 and XD prefix")\
- ENUM_ENTRY(IC_EVEX_L2_OPSIZE, 4, "requires EVEX, L2, and OpSize") \
- ENUM_ENTRY(IC_EVEX_L2_W, 3, "requires EVEX, L2 and W") \
- ENUM_ENTRY(IC_EVEX_L2_W_XS, 4, "requires EVEX, L2, W and XS prefix") \
- ENUM_ENTRY(IC_EVEX_L2_W_XD, 4, "requires EVEX, L2, W and XD prefix") \
- ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE, 4, "requires EVEX, L2, W and OpSize") \
- ENUM_ENTRY(IC_EVEX_K, 1, "requires an EVEX_K prefix") \
- ENUM_ENTRY(IC_EVEX_XS_K, 2, "requires EVEX_K and the XS prefix") \
- ENUM_ENTRY(IC_EVEX_XD_K, 2, "requires EVEX_K and the XD prefix") \
- ENUM_ENTRY(IC_EVEX_OPSIZE_K, 2, "requires EVEX_K and the OpSize prefix") \
- ENUM_ENTRY(IC_EVEX_W_K, 3, "requires EVEX_K and the W prefix") \
- ENUM_ENTRY(IC_EVEX_W_XS_K, 4, "requires EVEX_K, W, and XS prefix") \
- ENUM_ENTRY(IC_EVEX_W_XD_K, 4, "requires EVEX_K, W, and XD prefix") \
- ENUM_ENTRY(IC_EVEX_W_OPSIZE_K, 4, "requires EVEX_K, W, and OpSize") \
- ENUM_ENTRY(IC_EVEX_L_K, 3, "requires EVEX_K and the L prefix") \
- ENUM_ENTRY(IC_EVEX_L_XS_K, 4, "requires EVEX_K and the L and XS prefix")\
- ENUM_ENTRY(IC_EVEX_L_XD_K, 4, "requires EVEX_K and the L and XD prefix")\
- ENUM_ENTRY(IC_EVEX_L_OPSIZE_K, 4, "requires EVEX_K, L, and OpSize") \
- ENUM_ENTRY(IC_EVEX_L_W_K, 3, "requires EVEX_K, L and W") \
- ENUM_ENTRY(IC_EVEX_L_W_XS_K, 4, "requires EVEX_K, L, W and XS prefix") \
- ENUM_ENTRY(IC_EVEX_L_W_XD_K, 4, "requires EVEX_K, L, W and XD prefix") \
- ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_K, 4, "requires EVEX_K, L, W and OpSize") \
- ENUM_ENTRY(IC_EVEX_L2_K, 3, "requires EVEX_K and the L2 prefix") \
- ENUM_ENTRY(IC_EVEX_L2_XS_K, 4, "requires EVEX_K and the L2 and XS prefix")\
- ENUM_ENTRY(IC_EVEX_L2_XD_K, 4, "requires EVEX_K and the L2 and XD prefix")\
- ENUM_ENTRY(IC_EVEX_L2_OPSIZE_K, 4, "requires EVEX_K, L2, and OpSize") \
- ENUM_ENTRY(IC_EVEX_L2_W_K, 3, "requires EVEX_K, L2 and W") \
- ENUM_ENTRY(IC_EVEX_L2_W_XS_K, 4, "requires EVEX_K, L2, W and XS prefix") \
- ENUM_ENTRY(IC_EVEX_L2_W_XD_K, 4, "requires EVEX_K, L2, W and XD prefix") \
- ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K, 4, "requires EVEX_K, L2, W and OpSize") \
- ENUM_ENTRY(IC_EVEX_B, 1, "requires an EVEX_B prefix") \
- ENUM_ENTRY(IC_EVEX_XS_B, 2, "requires EVEX_B and the XS prefix") \
- ENUM_ENTRY(IC_EVEX_XD_B, 2, "requires EVEX_B and the XD prefix") \
- ENUM_ENTRY(IC_EVEX_OPSIZE_B, 2, "requires EVEX_B and the OpSize prefix") \
- ENUM_ENTRY(IC_EVEX_W_B, 3, "requires EVEX_B and the W prefix") \
- ENUM_ENTRY(IC_EVEX_W_XS_B, 4, "requires EVEX_B, W, and XS prefix") \
- ENUM_ENTRY(IC_EVEX_W_XD_B, 4, "requires EVEX_B, W, and XD prefix") \
- ENUM_ENTRY(IC_EVEX_W_OPSIZE_B, 4, "requires EVEX_B, W, and OpSize") \
- ENUM_ENTRY(IC_EVEX_L_B, 3, "requires EVEX_B and the L prefix") \
- ENUM_ENTRY(IC_EVEX_L_XS_B, 4, "requires EVEX_B and the L and XS prefix")\
- ENUM_ENTRY(IC_EVEX_L_XD_B, 4, "requires EVEX_B and the L and XD prefix")\
- ENUM_ENTRY(IC_EVEX_L_OPSIZE_B, 4, "requires EVEX_B, L, and OpSize") \
- ENUM_ENTRY(IC_EVEX_L_W_B, 3, "requires EVEX_B, L and W") \
- ENUM_ENTRY(IC_EVEX_L_W_XS_B, 4, "requires EVEX_B, L, W and XS prefix") \
- ENUM_ENTRY(IC_EVEX_L_W_XD_B, 4, "requires EVEX_B, L, W and XD prefix") \
- ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_B, 4, "requires EVEX_B, L, W and OpSize") \
- ENUM_ENTRY(IC_EVEX_L2_B, 3, "requires EVEX_B and the L2 prefix") \
- ENUM_ENTRY(IC_EVEX_L2_XS_B, 4, "requires EVEX_B and the L2 and XS prefix")\
- ENUM_ENTRY(IC_EVEX_L2_XD_B, 4, "requires EVEX_B and the L2 and XD prefix")\
- ENUM_ENTRY(IC_EVEX_L2_OPSIZE_B, 4, "requires EVEX_B, L2, and OpSize") \
- ENUM_ENTRY(IC_EVEX_L2_W_B, 3, "requires EVEX_B, L2 and W") \
- ENUM_ENTRY(IC_EVEX_L2_W_XS_B, 4, "requires EVEX_B, L2, W and XS prefix") \
- ENUM_ENTRY(IC_EVEX_L2_W_XD_B, 4, "requires EVEX_B, L2, W and XD prefix") \
- ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_B, 4, "requires EVEX_B, L2, W and OpSize") \
- ENUM_ENTRY(IC_EVEX_K_B, 1, "requires EVEX_B and EVEX_K prefix") \
- ENUM_ENTRY(IC_EVEX_XS_K_B, 2, "requires EVEX_B, EVEX_K and the XS prefix") \
- ENUM_ENTRY(IC_EVEX_XD_K_B, 2, "requires EVEX_B, EVEX_K and the XD prefix") \
- ENUM_ENTRY(IC_EVEX_OPSIZE_K_B, 2, "requires EVEX_B, EVEX_K and the OpSize prefix") \
- ENUM_ENTRY(IC_EVEX_W_K_B, 3, "requires EVEX_B, EVEX_K and the W prefix") \
- ENUM_ENTRY(IC_EVEX_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, W, and XS prefix") \
- ENUM_ENTRY(IC_EVEX_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, W, and XD prefix") \
- ENUM_ENTRY(IC_EVEX_W_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, W, and OpSize") \
- ENUM_ENTRY(IC_EVEX_L_K_B, 3, "requires EVEX_B, EVEX_K and the L prefix") \
- ENUM_ENTRY(IC_EVEX_L_XS_K_B, 4, "requires EVEX_B, EVEX_K and the L and XS prefix")\
- ENUM_ENTRY(IC_EVEX_L_XD_K_B, 4, "requires EVEX_B, EVEX_K and the L and XD prefix")\
- ENUM_ENTRY(IC_EVEX_L_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L, and OpSize") \
- ENUM_ENTRY(IC_EVEX_L_W_K_B, 3, "requires EVEX_B, EVEX_K, L and W") \
- ENUM_ENTRY(IC_EVEX_L_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, L, W and XS prefix") \
- ENUM_ENTRY(IC_EVEX_L_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, L, W and XD prefix") \
- ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_K_B,4, "requires EVEX_B, EVEX_K, L, W and OpSize") \
- ENUM_ENTRY(IC_EVEX_L2_K_B, 3, "requires EVEX_B, EVEX_K and the L2 prefix") \
- ENUM_ENTRY(IC_EVEX_L2_XS_K_B, 4, "requires EVEX_B, EVEX_K and the L2 and XS prefix")\
- ENUM_ENTRY(IC_EVEX_L2_XD_K_B, 4, "requires EVEX_B, EVEX_K and the L2 and XD prefix")\
- ENUM_ENTRY(IC_EVEX_L2_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L2, and OpSize") \
- ENUM_ENTRY(IC_EVEX_L2_W_K_B, 3, "requires EVEX_B, EVEX_K, L2 and W") \
- ENUM_ENTRY(IC_EVEX_L2_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and XS prefix") \
- ENUM_ENTRY(IC_EVEX_L2_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and XD prefix") \
- ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K_B,4, "requires EVEX_B, EVEX_K, L2, W and OpSize") \
- ENUM_ENTRY(IC_EVEX_KZ_B, 1, "requires EVEX_B and EVEX_KZ prefix") \
- ENUM_ENTRY(IC_EVEX_XS_KZ_B, 2, "requires EVEX_B, EVEX_KZ and the XS prefix") \
- ENUM_ENTRY(IC_EVEX_XD_KZ_B, 2, "requires EVEX_B, EVEX_KZ and the XD prefix") \
- ENUM_ENTRY(IC_EVEX_OPSIZE_KZ_B, 2, "requires EVEX_B, EVEX_KZ and the OpSize prefix") \
- ENUM_ENTRY(IC_EVEX_W_KZ_B, 3, "requires EVEX_B, EVEX_KZ and the W prefix") \
- ENUM_ENTRY(IC_EVEX_W_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ, W, and XS prefix") \
- ENUM_ENTRY(IC_EVEX_W_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ, W, and XD prefix") \
- ENUM_ENTRY(IC_EVEX_W_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, W, and OpSize") \
- ENUM_ENTRY(IC_EVEX_L_KZ_B, 3, "requires EVEX_B, EVEX_KZ and the L prefix") \
- ENUM_ENTRY(IC_EVEX_L_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ and the L and XS prefix")\
- ENUM_ENTRY(IC_EVEX_L_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ and the L and XD prefix")\
- ENUM_ENTRY(IC_EVEX_L_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L, and OpSize") \
- ENUM_ENTRY(IC_EVEX_L_W_KZ_B, 3, "requires EVEX_B, EVEX_KZ, L and W") \
- ENUM_ENTRY(IC_EVEX_L_W_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L, W and XS prefix") \
- ENUM_ENTRY(IC_EVEX_L_W_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L, W and XD prefix") \
- ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L, W and OpSize") \
- ENUM_ENTRY(IC_EVEX_L2_KZ_B, 3, "requires EVEX_B, EVEX_KZ and the L2 prefix") \
- ENUM_ENTRY(IC_EVEX_L2_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ and the L2 and XS prefix")\
- ENUM_ENTRY(IC_EVEX_L2_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ and the L2 and XD prefix")\
- ENUM_ENTRY(IC_EVEX_L2_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L2, and OpSize") \
- ENUM_ENTRY(IC_EVEX_L2_W_KZ_B, 3, "requires EVEX_B, EVEX_KZ, L2 and W") \
- ENUM_ENTRY(IC_EVEX_L2_W_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L2, W and XS prefix") \
- ENUM_ENTRY(IC_EVEX_L2_W_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L2, W and XD prefix") \
- ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L2, W and OpSize") \
- ENUM_ENTRY(IC_EVEX_KZ, 1, "requires an EVEX_KZ prefix") \
- ENUM_ENTRY(IC_EVEX_XS_KZ, 2, "requires EVEX_KZ and the XS prefix") \
- ENUM_ENTRY(IC_EVEX_XD_KZ, 2, "requires EVEX_KZ and the XD prefix") \
- ENUM_ENTRY(IC_EVEX_OPSIZE_KZ, 2, "requires EVEX_KZ and the OpSize prefix") \
- ENUM_ENTRY(IC_EVEX_W_KZ, 3, "requires EVEX_KZ and the W prefix") \
- ENUM_ENTRY(IC_EVEX_W_XS_KZ, 4, "requires EVEX_KZ, W, and XS prefix") \
- ENUM_ENTRY(IC_EVEX_W_XD_KZ, 4, "requires EVEX_KZ, W, and XD prefix") \
- ENUM_ENTRY(IC_EVEX_W_OPSIZE_KZ, 4, "requires EVEX_KZ, W, and OpSize") \
- ENUM_ENTRY(IC_EVEX_L_KZ, 3, "requires EVEX_KZ and the L prefix") \
- ENUM_ENTRY(IC_EVEX_L_XS_KZ, 4, "requires EVEX_KZ and the L and XS prefix")\
- ENUM_ENTRY(IC_EVEX_L_XD_KZ, 4, "requires EVEX_KZ and the L and XD prefix")\
- ENUM_ENTRY(IC_EVEX_L_OPSIZE_KZ, 4, "requires EVEX_KZ, L, and OpSize") \
- ENUM_ENTRY(IC_EVEX_L_W_KZ, 3, "requires EVEX_KZ, L and W") \
- ENUM_ENTRY(IC_EVEX_L_W_XS_KZ, 4, "requires EVEX_KZ, L, W and XS prefix") \
- ENUM_ENTRY(IC_EVEX_L_W_XD_KZ, 4, "requires EVEX_KZ, L, W and XD prefix") \
- ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_KZ, 4, "requires EVEX_KZ, L, W and OpSize") \
- ENUM_ENTRY(IC_EVEX_L2_KZ, 3, "requires EVEX_KZ and the L2 prefix") \
- ENUM_ENTRY(IC_EVEX_L2_XS_KZ, 4, "requires EVEX_KZ and the L2 and XS prefix")\
- ENUM_ENTRY(IC_EVEX_L2_XD_KZ, 4, "requires EVEX_KZ and the L2 and XD prefix")\
- ENUM_ENTRY(IC_EVEX_L2_OPSIZE_KZ, 4, "requires EVEX_KZ, L2, and OpSize") \
- ENUM_ENTRY(IC_EVEX_L2_W_KZ, 3, "requires EVEX_KZ, L2 and W") \
- ENUM_ENTRY(IC_EVEX_L2_W_XS_KZ, 4, "requires EVEX_KZ, L2, W and XS prefix") \
- ENUM_ENTRY(IC_EVEX_L2_W_XD_KZ, 4, "requires EVEX_KZ, L2, W and XD prefix") \
- ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ, 4, "requires EVEX_KZ, L2, W and OpSize")
-
-#define ENUM_ENTRY(n, r, d) n,
-enum InstructionContext {
- INSTRUCTION_CONTEXTS
- IC_max
-};
-#undef ENUM_ENTRY
-
-// Opcode types, which determine which decode table to use, both in the Intel
-// manual and also for the decoder.
-enum OpcodeType {
- ONEBYTE = 0,
- TWOBYTE = 1,
- THREEBYTE_38 = 2,
- THREEBYTE_3A = 3,
- XOP8_MAP = 4,
- XOP9_MAP = 5,
- XOPA_MAP = 6
-};
-
-// The following structs are used for the hierarchical decode table. After
-// determining the instruction's class (i.e., which IC_* constant applies to
-// it), the decoder reads the opcode. Some instructions require specific
-// values of the ModR/M byte, so the ModR/M byte indexes into the final table.
-//
-// If a ModR/M byte is not required, "required" is left unset, and the values
-// for each instructionID are identical.
-typedef uint16_t InstrUID;
-
-// ModRMDecisionType - describes the type of ModR/M decision, allowing the
-// consumer to determine the number of entries in it.
-//
-// MODRM_ONEENTRY - No matter what the value of the ModR/M byte is, the decoded
-// instruction is the same.
-// MODRM_SPLITRM - If the ModR/M byte is between 0x00 and 0xbf, the opcode
-// corresponds to one instruction; otherwise, it corresponds to
-// a different instruction.
-// MODRM_SPLITMISC- If the ModR/M byte is between 0x00 and 0xbf, ModR/M byte
-// divided by 8 is used to select instruction; otherwise, each
-// value of the ModR/M byte could correspond to a different
-// instruction.
-// MODRM_SPLITREG - ModR/M byte divided by 8 is used to select instruction. This
-// corresponds to instructions that use reg field as opcode
-// MODRM_FULL - Potentially, each value of the ModR/M byte could correspond
-// to a different instruction.
-#define MODRMTYPES \
- ENUM_ENTRY(MODRM_ONEENTRY) \
- ENUM_ENTRY(MODRM_SPLITRM) \
- ENUM_ENTRY(MODRM_SPLITMISC) \
- ENUM_ENTRY(MODRM_SPLITREG) \
- ENUM_ENTRY(MODRM_FULL)
-
-#define ENUM_ENTRY(n) n,
-enum ModRMDecisionType {
- MODRMTYPES
- MODRM_max
-};
-#undef ENUM_ENTRY
-
-#define CASE_ENCODING_RM \
- case ENCODING_RM: \
- case ENCODING_RM_CD2: \
- case ENCODING_RM_CD4: \
- case ENCODING_RM_CD8: \
- case ENCODING_RM_CD16: \
- case ENCODING_RM_CD32: \
- case ENCODING_RM_CD64
-
-#define CASE_ENCODING_VSIB \
- case ENCODING_VSIB: \
- case ENCODING_VSIB_CD2: \
- case ENCODING_VSIB_CD4: \
- case ENCODING_VSIB_CD8: \
- case ENCODING_VSIB_CD16: \
- case ENCODING_VSIB_CD32: \
- case ENCODING_VSIB_CD64
-
-// Physical encodings of instruction operands.
-#define ENCODINGS \
- ENUM_ENTRY(ENCODING_NONE, "") \
- ENUM_ENTRY(ENCODING_REG, "Register operand in ModR/M byte.") \
- ENUM_ENTRY(ENCODING_RM, "R/M operand in ModR/M byte.") \
- ENUM_ENTRY(ENCODING_RM_CD2, "R/M operand with CDisp scaling of 2") \
- ENUM_ENTRY(ENCODING_RM_CD4, "R/M operand with CDisp scaling of 4") \
- ENUM_ENTRY(ENCODING_RM_CD8, "R/M operand with CDisp scaling of 8") \
- ENUM_ENTRY(ENCODING_RM_CD16,"R/M operand with CDisp scaling of 16") \
- ENUM_ENTRY(ENCODING_RM_CD32,"R/M operand with CDisp scaling of 32") \
- ENUM_ENTRY(ENCODING_RM_CD64,"R/M operand with CDisp scaling of 64") \
- ENUM_ENTRY(ENCODING_VSIB, "VSIB operand in ModR/M byte.") \
- ENUM_ENTRY(ENCODING_VSIB_CD2, "VSIB operand with CDisp scaling of 2") \
- ENUM_ENTRY(ENCODING_VSIB_CD4, "VSIB operand with CDisp scaling of 4") \
- ENUM_ENTRY(ENCODING_VSIB_CD8, "VSIB operand with CDisp scaling of 8") \
- ENUM_ENTRY(ENCODING_VSIB_CD16,"VSIB operand with CDisp scaling of 16") \
- ENUM_ENTRY(ENCODING_VSIB_CD32,"VSIB operand with CDisp scaling of 32") \
- ENUM_ENTRY(ENCODING_VSIB_CD64,"VSIB operand with CDisp scaling of 64") \
- ENUM_ENTRY(ENCODING_VVVV, "Register operand in VEX.vvvv byte.") \
- ENUM_ENTRY(ENCODING_WRITEMASK, "Register operand in EVEX.aaa byte.") \
- ENUM_ENTRY(ENCODING_IB, "1-byte immediate") \
- ENUM_ENTRY(ENCODING_IW, "2-byte") \
- ENUM_ENTRY(ENCODING_ID, "4-byte") \
- ENUM_ENTRY(ENCODING_IO, "8-byte") \
- ENUM_ENTRY(ENCODING_RB, "(AL..DIL, R8L..R15L) Register code added to " \
- "the opcode byte") \
- ENUM_ENTRY(ENCODING_RW, "(AX..DI, R8W..R15W)") \
- ENUM_ENTRY(ENCODING_RD, "(EAX..EDI, R8D..R15D)") \
- ENUM_ENTRY(ENCODING_RO, "(RAX..RDI, R8..R15)") \
- ENUM_ENTRY(ENCODING_FP, "Position on floating-point stack in ModR/M " \
- "byte.") \
- \
- ENUM_ENTRY(ENCODING_Iv, "Immediate of operand size") \
- ENUM_ENTRY(ENCODING_Ia, "Immediate of address size") \
- ENUM_ENTRY(ENCODING_IRC, "Immediate for static rounding control") \
- ENUM_ENTRY(ENCODING_Rv, "Register code of operand size added to the " \
- "opcode byte") \
- ENUM_ENTRY(ENCODING_DUP, "Duplicate of another operand; ID is encoded " \
- "in type") \
- ENUM_ENTRY(ENCODING_SI, "Source index; encoded in OpSize/Adsize prefix") \
- ENUM_ENTRY(ENCODING_DI, "Destination index; encoded in prefixes")
-
-#define ENUM_ENTRY(n, d) n,
-enum OperandEncoding {
- ENCODINGS
- ENCODING_max
-};
-#undef ENUM_ENTRY
-
-// Semantic interpretations of instruction operands.
-#define TYPES \
- ENUM_ENTRY(TYPE_NONE, "") \
- ENUM_ENTRY(TYPE_REL, "immediate address") \
- ENUM_ENTRY(TYPE_R8, "1-byte register operand") \
- ENUM_ENTRY(TYPE_R16, "2-byte") \
- ENUM_ENTRY(TYPE_R32, "4-byte") \
- ENUM_ENTRY(TYPE_R64, "8-byte") \
- ENUM_ENTRY(TYPE_IMM, "immediate operand") \
- ENUM_ENTRY(TYPE_IMM3, "1-byte immediate operand between 0 and 7") \
- ENUM_ENTRY(TYPE_IMM5, "1-byte immediate operand between 0 and 31") \
- ENUM_ENTRY(TYPE_AVX512ICC, "1-byte immediate operand for AVX512 icmp") \
- ENUM_ENTRY(TYPE_UIMM8, "1-byte unsigned immediate operand") \
- ENUM_ENTRY(TYPE_M, "Memory operand") \
- ENUM_ENTRY(TYPE_MVSIBX, "Memory operand using XMM index") \
- ENUM_ENTRY(TYPE_MVSIBY, "Memory operand using YMM index") \
- ENUM_ENTRY(TYPE_MVSIBZ, "Memory operand using ZMM index") \
- ENUM_ENTRY(TYPE_SRCIDX, "memory at source index") \
- ENUM_ENTRY(TYPE_DSTIDX, "memory at destination index") \
- ENUM_ENTRY(TYPE_MOFFS, "memory offset (relative to segment base)") \
- ENUM_ENTRY(TYPE_ST, "Position on the floating-point stack") \
- ENUM_ENTRY(TYPE_MM64, "8-byte MMX register") \
- ENUM_ENTRY(TYPE_XMM, "16-byte") \
- ENUM_ENTRY(TYPE_YMM, "32-byte") \
- ENUM_ENTRY(TYPE_ZMM, "64-byte") \
- ENUM_ENTRY(TYPE_VK, "mask register") \
- ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand") \
- ENUM_ENTRY(TYPE_DEBUGREG, "Debug register operand") \
- ENUM_ENTRY(TYPE_CONTROLREG, "Control register operand") \
- ENUM_ENTRY(TYPE_BNDR, "MPX bounds register") \
- \
- ENUM_ENTRY(TYPE_Rv, "Register operand of operand size") \
- ENUM_ENTRY(TYPE_RELv, "Immediate address of operand size") \
- ENUM_ENTRY(TYPE_DUP0, "Duplicate of operand 0") \
- ENUM_ENTRY(TYPE_DUP1, "operand 1") \
- ENUM_ENTRY(TYPE_DUP2, "operand 2") \
- ENUM_ENTRY(TYPE_DUP3, "operand 3") \
- ENUM_ENTRY(TYPE_DUP4, "operand 4") \
-
-#define ENUM_ENTRY(n, d) n,
-enum OperandType {
- TYPES
- TYPE_max
-};
-#undef ENUM_ENTRY
-
-/// \brief The specification for how to extract and interpret one operand.
-struct OperandSpecifier {
- uint8_t encoding;
- uint8_t type;
-};
-
-static const unsigned X86_MAX_OPERANDS = 6;
-
-/// Decoding mode for the Intel disassembler. 16-bit, 32-bit, and 64-bit mode
-/// are supported, and represent real mode, IA-32e, and IA-32e in 64-bit mode,
-/// respectively.
-enum DisassemblerMode {
- MODE_16BIT,
- MODE_32BIT,
- MODE_64BIT
-};
-
-} // namespace X86Disassembler
-} // namespace llvm
-
-#endif
diff --git a/lib/Target/X86/InstPrinter/CMakeLists.txt b/lib/Target/X86/InstPrinter/CMakeLists.txt
index 686a37e61498..a61efaed33a5 100644
--- a/lib/Target/X86/InstPrinter/CMakeLists.txt
+++ b/lib/Target/X86/InstPrinter/CMakeLists.txt
@@ -2,4 +2,5 @@ add_llvm_library(LLVMX86AsmPrinter
X86ATTInstPrinter.cpp
X86IntelInstPrinter.cpp
X86InstComments.cpp
+ X86InstPrinterCommon.cpp
)
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
index 0c99dbbe328b..82e82fe1efd9 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -17,7 +17,6 @@
#include "X86InstComments.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/Casting.h"
@@ -42,24 +41,11 @@ void X86ATTInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
StringRef Annot, const MCSubtargetInfo &STI) {
- const MCInstrDesc &Desc = MII.get(MI->getOpcode());
- uint64_t TSFlags = Desc.TSFlags;
-
// If verbose assembly is enabled, we can print some informative comments.
if (CommentStream)
- HasCustomInstComment =
- EmitAnyX86InstComments(MI, *CommentStream, getRegisterName);
-
- unsigned Flags = MI->getFlags();
- if (TSFlags & X86II::LOCK)
- OS << "\tlock\t";
- if (!(TSFlags & X86II::LOCK) && Flags & X86::IP_HAS_LOCK)
- OS << "\tlock\t";
+ HasCustomInstComment = EmitAnyX86InstComments(MI, *CommentStream, MII);
- if (Flags & X86::IP_HAS_REPEAT_NE)
- OS << "\trepne\t";
- else if (Flags & X86::IP_HAS_REPEAT)
- OS << "\trep\t";
+ printInstFlags(MI, OS);
// Output CALLpcrel32 as "callq" in 64-bit mode.
// In Intel annotation it's always emitted as "call".
@@ -78,10 +64,8 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
// 0x66 to be interpreted as "data16" by the asm printer.
// Thus we add an adjustment here in order to print the "right" instruction.
else if (MI->getOpcode() == X86::DATA16_PREFIX &&
- (STI.getFeatureBits()[X86::Mode16Bit])) {
- MCInst Data32MI(*MI);
- Data32MI.setOpcode(X86::DATA32_PREFIX);
- printInstruction(&Data32MI, OS);
+ STI.getFeatureBits()[X86::Mode16Bit]) {
+ OS << "\tdata32";
}
// Try to print any aliases first.
else if (!printAliasInstr(MI, OS))
@@ -91,97 +75,6 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
printAnnotation(OS, Annot);
}
-void X86ATTInstPrinter::printSSEAVXCC(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- int64_t Imm = MI->getOperand(Op).getImm();
- switch (Imm) {
- default: llvm_unreachable("Invalid ssecc/avxcc argument!");
- case 0: O << "eq"; break;
- case 1: O << "lt"; break;
- case 2: O << "le"; break;
- case 3: O << "unord"; break;
- case 4: O << "neq"; break;
- case 5: O << "nlt"; break;
- case 6: O << "nle"; break;
- case 7: O << "ord"; break;
- case 8: O << "eq_uq"; break;
- case 9: O << "nge"; break;
- case 0xa: O << "ngt"; break;
- case 0xb: O << "false"; break;
- case 0xc: O << "neq_oq"; break;
- case 0xd: O << "ge"; break;
- case 0xe: O << "gt"; break;
- case 0xf: O << "true"; break;
- case 0x10: O << "eq_os"; break;
- case 0x11: O << "lt_oq"; break;
- case 0x12: O << "le_oq"; break;
- case 0x13: O << "unord_s"; break;
- case 0x14: O << "neq_us"; break;
- case 0x15: O << "nlt_uq"; break;
- case 0x16: O << "nle_uq"; break;
- case 0x17: O << "ord_s"; break;
- case 0x18: O << "eq_us"; break;
- case 0x19: O << "nge_uq"; break;
- case 0x1a: O << "ngt_uq"; break;
- case 0x1b: O << "false_os"; break;
- case 0x1c: O << "neq_os"; break;
- case 0x1d: O << "ge_oq"; break;
- case 0x1e: O << "gt_oq"; break;
- case 0x1f: O << "true_us"; break;
- }
-}
-
-void X86ATTInstPrinter::printXOPCC(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- int64_t Imm = MI->getOperand(Op).getImm();
- switch (Imm) {
- default: llvm_unreachable("Invalid xopcc argument!");
- case 0: O << "lt"; break;
- case 1: O << "le"; break;
- case 2: O << "gt"; break;
- case 3: O << "ge"; break;
- case 4: O << "eq"; break;
- case 5: O << "neq"; break;
- case 6: O << "false"; break;
- case 7: O << "true"; break;
- }
-}
-
-void X86ATTInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- int64_t Imm = MI->getOperand(Op).getImm() & 0x3;
- switch (Imm) {
- case 0: O << "{rn-sae}"; break;
- case 1: O << "{rd-sae}"; break;
- case 2: O << "{ru-sae}"; break;
- case 3: O << "{rz-sae}"; break;
- }
-}
-
-/// printPCRelImm - This is used to print an immediate value that ends up
-/// being encoded as a pc-relative value (e.g. for jumps and calls). These
-/// print slightly differently than normal immediates. For example, a $ is not
-/// emitted.
-void X86ATTInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- const MCOperand &Op = MI->getOperand(OpNo);
- if (Op.isImm())
- O << formatImm(Op.getImm());
- else {
- assert(Op.isExpr() && "unknown pcrel immediate operand");
- // If a symbolic branch target was added as a constant expression then print
- // that address in hex.
- const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
- int64_t Address;
- if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) {
- O << formatHex((uint64_t)Address);
- } else {
- // Otherwise, just print the expression.
- Op.getExpr()->print(O, &MAI);
- }
- }
-}
-
void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
const MCOperand &Op = MI->getOperand(OpNo);
@@ -220,15 +113,11 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
const MCOperand &BaseReg = MI->getOperand(Op + X86::AddrBaseReg);
const MCOperand &IndexReg = MI->getOperand(Op + X86::AddrIndexReg);
const MCOperand &DispSpec = MI->getOperand(Op + X86::AddrDisp);
- const MCOperand &SegReg = MI->getOperand(Op + X86::AddrSegmentReg);
O << markup("<mem:");
// If this has a segment register, print it.
- if (SegReg.getReg()) {
- printOperand(MI, Op + X86::AddrSegmentReg, O);
- O << ':';
- }
+ printOptionalSegReg(MI, Op + X86::AddrSegmentReg, O);
if (DispSpec.isImm()) {
int64_t DispVal = DispSpec.getImm();
@@ -261,15 +150,10 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
void X86ATTInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op,
raw_ostream &O) {
- const MCOperand &SegReg = MI->getOperand(Op + 1);
-
O << markup("<mem:");
// If this has a segment register, print it.
- if (SegReg.getReg()) {
- printOperand(MI, Op + 1, O);
- O << ':';
- }
+ printOptionalSegReg(MI, Op + 1, O);
O << "(";
printOperand(MI, Op, O);
@@ -292,15 +176,11 @@ void X86ATTInstPrinter::printDstIdx(const MCInst *MI, unsigned Op,
void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
raw_ostream &O) {
const MCOperand &DispSpec = MI->getOperand(Op);
- const MCOperand &SegReg = MI->getOperand(Op + 1);
O << markup("<mem:");
// If this has a segment register, print it.
- if (SegReg.getReg()) {
- printOperand(MI, Op + 1, O);
- O << ':';
- }
+ printOptionalSegReg(MI, Op + 1, O);
if (DispSpec.isImm()) {
O << formatImm(DispSpec.getImm());
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
index 946c1c73f088..57422bc9a0b2 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
@@ -14,15 +14,15 @@
#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H
#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H
-#include "llvm/MC/MCInstPrinter.h"
+#include "X86InstPrinterCommon.h"
namespace llvm {
-class X86ATTInstPrinter final : public MCInstPrinter {
+class X86ATTInstPrinter final : public X86InstPrinterCommon {
public:
X86ATTInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
const MCRegisterInfo &MRI)
- : MCInstPrinter(MAI, MII, MRI) {}
+ : X86InstPrinterCommon(MAI, MII, MRI) {}
void printRegName(raw_ostream &OS, unsigned RegNo) const override;
void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot,
@@ -38,21 +38,16 @@ public:
void printInstruction(const MCInst *MI, raw_ostream &OS);
static const char *getRegisterName(unsigned RegNo);
- void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+ void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS) override;
void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &OS);
- void printSSEAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
- void printXOPCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
- void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
- void printSrcIdx(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
- void printDstIdx(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
- void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &OS);
+ void printSrcIdx(const MCInst *MI, unsigned Op, raw_ostream &O);
+ void printDstIdx(const MCInst *MI, unsigned Op, raw_ostream &O);
void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &OS);
void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
}
-
void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
}
diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp
index a46f22ff40f5..37bed37b0994 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -13,10 +13,12 @@
//===----------------------------------------------------------------------===//
#include "X86InstComments.h"
+#include "X86ATTInstPrinter.h"
+#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "Utils/X86ShuffleDecode.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -158,6 +160,46 @@ using namespace llvm;
CASE_MASKZ_INS_COMMON(SHUFF##Inst, Z256, r##src##i) \
CASE_MASKZ_INS_COMMON(SHUFI##Inst, Z256, r##src##i)
+#define CASE_AVX512_FMA(Inst, suf) \
+ CASE_AVX512_INS_COMMON(Inst, Z, suf) \
+ CASE_AVX512_INS_COMMON(Inst, Z256, suf) \
+ CASE_AVX512_INS_COMMON(Inst, Z128, suf)
+
+#define CASE_FMA(Inst, suf) \
+ CASE_AVX512_FMA(Inst, suf) \
+ CASE_AVX_INS_COMMON(Inst, , suf) \
+ CASE_AVX_INS_COMMON(Inst, Y, suf)
+
+#define CASE_FMA_PACKED_REG(Inst) \
+ CASE_FMA(Inst##PD, r) \
+ CASE_FMA(Inst##PS, r)
+
+#define CASE_FMA_PACKED_MEM(Inst) \
+ CASE_FMA(Inst##PD, m) \
+ CASE_FMA(Inst##PS, m) \
+ CASE_AVX512_FMA(Inst##PD, mb) \
+ CASE_AVX512_FMA(Inst##PS, mb)
+
+#define CASE_FMA_SCALAR_REG(Inst) \
+ CASE_AVX_INS_COMMON(Inst##SD, , r) \
+ CASE_AVX_INS_COMMON(Inst##SS, , r) \
+ CASE_AVX_INS_COMMON(Inst##SD, , r_Int) \
+ CASE_AVX_INS_COMMON(Inst##SS, , r_Int) \
+ CASE_AVX_INS_COMMON(Inst##SD, Z, r) \
+ CASE_AVX_INS_COMMON(Inst##SS, Z, r) \
+ CASE_AVX512_INS_COMMON(Inst##SD, Z, r_Int) \
+ CASE_AVX512_INS_COMMON(Inst##SS, Z, r_Int)
+
+#define CASE_FMA_SCALAR_MEM(Inst) \
+ CASE_AVX_INS_COMMON(Inst##SD, , m) \
+ CASE_AVX_INS_COMMON(Inst##SS, , m) \
+ CASE_AVX_INS_COMMON(Inst##SD, , m_Int) \
+ CASE_AVX_INS_COMMON(Inst##SS, , m_Int) \
+ CASE_AVX_INS_COMMON(Inst##SD, Z, m) \
+ CASE_AVX_INS_COMMON(Inst##SS, Z, m) \
+ CASE_AVX512_INS_COMMON(Inst##SD, Z, m_Int) \
+ CASE_AVX512_INS_COMMON(Inst##SS, Z, m_Int)
+
static unsigned getVectorRegSize(unsigned RegNo) {
if (X86::ZMM0 <= RegNo && RegNo <= X86::ZMM31)
return 512;
@@ -171,230 +213,32 @@ static unsigned getVectorRegSize(unsigned RegNo) {
llvm_unreachable("Unknown vector reg!");
}
-static MVT getRegOperandVectorVT(const MCInst *MI, const MVT &ScalarVT,
- unsigned OperandIndex) {
+static unsigned getRegOperandNumElts(const MCInst *MI, unsigned ScalarSize,
+ unsigned OperandIndex) {
unsigned OpReg = MI->getOperand(OperandIndex).getReg();
- return MVT::getVectorVT(ScalarVT,
- getVectorRegSize(OpReg)/ScalarVT.getSizeInBits());
+ return getVectorRegSize(OpReg) / ScalarSize;
}
-/// \brief Extracts the dst type for a given zero extension instruction.
-static MVT getZeroExtensionResultType(const MCInst *MI) {
- switch (MI->getOpcode()) {
- default:
- llvm_unreachable("Unknown zero extension instruction");
- // zero extension to i16
- CASE_PMOVZX(PMOVZXBW, m)
- CASE_PMOVZX(PMOVZXBW, r)
- return getRegOperandVectorVT(MI, MVT::i16, 0);
- // zero extension to i32
- CASE_PMOVZX(PMOVZXBD, m)
- CASE_PMOVZX(PMOVZXBD, r)
- CASE_PMOVZX(PMOVZXWD, m)
- CASE_PMOVZX(PMOVZXWD, r)
- return getRegOperandVectorVT(MI, MVT::i32, 0);
- // zero extension to i64
- CASE_PMOVZX(PMOVZXBQ, m)
- CASE_PMOVZX(PMOVZXBQ, r)
- CASE_PMOVZX(PMOVZXWQ, m)
- CASE_PMOVZX(PMOVZXWQ, r)
- CASE_PMOVZX(PMOVZXDQ, m)
- CASE_PMOVZX(PMOVZXDQ, r)
- return getRegOperandVectorVT(MI, MVT::i64, 0);
- }
+static const char *getRegName(unsigned Reg) {
+ return X86ATTInstPrinter::getRegisterName(Reg);
}
/// Wraps the destination register name with AVX512 mask/maskz filtering.
static void printMasking(raw_ostream &OS, const MCInst *MI,
- const char *(*getRegName)(unsigned)) {
- bool MaskWithZero = false;
- const char *MaskRegName = nullptr;
+ const MCInstrInfo &MCII) {
+ const MCInstrDesc &Desc = MCII.get(MI->getOpcode());
+ uint64_t TSFlags = Desc.TSFlags;
- switch (MI->getOpcode()) {
- default:
+ if (!(TSFlags & X86II::EVEX_K))
return;
- CASE_MASKZ_MOVDUP(MOVDDUP, m)
- CASE_MASKZ_MOVDUP(MOVDDUP, r)
- CASE_MASKZ_MOVDUP(MOVSHDUP, m)
- CASE_MASKZ_MOVDUP(MOVSHDUP, r)
- CASE_MASKZ_MOVDUP(MOVSLDUP, m)
- CASE_MASKZ_MOVDUP(MOVSLDUP, r)
- CASE_MASKZ_PMOVZX(PMOVZXBD, m)
- CASE_MASKZ_PMOVZX(PMOVZXBD, r)
- CASE_MASKZ_PMOVZX(PMOVZXBQ, m)
- CASE_MASKZ_PMOVZX(PMOVZXBQ, r)
- CASE_MASKZ_PMOVZX(PMOVZXBW, m)
- CASE_MASKZ_PMOVZX(PMOVZXBW, r)
- CASE_MASKZ_PMOVZX(PMOVZXDQ, m)
- CASE_MASKZ_PMOVZX(PMOVZXDQ, r)
- CASE_MASKZ_PMOVZX(PMOVZXWD, m)
- CASE_MASKZ_PMOVZX(PMOVZXWD, r)
- CASE_MASKZ_PMOVZX(PMOVZXWQ, m)
- CASE_MASKZ_PMOVZX(PMOVZXWQ, r)
- CASE_MASKZ_UNPCK(PUNPCKHBW, m)
- CASE_MASKZ_UNPCK(PUNPCKHBW, r)
- CASE_MASKZ_UNPCK(PUNPCKHWD, m)
- CASE_MASKZ_UNPCK(PUNPCKHWD, r)
- CASE_MASKZ_UNPCK(PUNPCKHDQ, m)
- CASE_MASKZ_UNPCK(PUNPCKHDQ, r)
- CASE_MASKZ_UNPCK(PUNPCKLBW, m)
- CASE_MASKZ_UNPCK(PUNPCKLBW, r)
- CASE_MASKZ_UNPCK(PUNPCKLWD, m)
- CASE_MASKZ_UNPCK(PUNPCKLWD, r)
- CASE_MASKZ_UNPCK(PUNPCKLDQ, m)
- CASE_MASKZ_UNPCK(PUNPCKLDQ, r)
- CASE_MASKZ_UNPCK(UNPCKHPD, m)
- CASE_MASKZ_UNPCK(UNPCKHPD, r)
- CASE_MASKZ_UNPCK(UNPCKHPS, m)
- CASE_MASKZ_UNPCK(UNPCKHPS, r)
- CASE_MASKZ_UNPCK(UNPCKLPD, m)
- CASE_MASKZ_UNPCK(UNPCKLPD, r)
- CASE_MASKZ_UNPCK(UNPCKLPS, m)
- CASE_MASKZ_UNPCK(UNPCKLPS, r)
- CASE_MASKZ_SHUF(PALIGNR, r)
- CASE_MASKZ_SHUF(PALIGNR, m)
- CASE_MASKZ_SHUF(ALIGNQ, r)
- CASE_MASKZ_SHUF(ALIGNQ, m)
- CASE_MASKZ_SHUF(ALIGND, r)
- CASE_MASKZ_SHUF(ALIGND, m)
- CASE_MASKZ_SHUF(SHUFPD, m)
- CASE_MASKZ_SHUF(SHUFPD, r)
- CASE_MASKZ_SHUF(SHUFPS, m)
- CASE_MASKZ_SHUF(SHUFPS, r)
- CASE_MASKZ_VPERMILPI(PERMILPD, m)
- CASE_MASKZ_VPERMILPI(PERMILPD, r)
- CASE_MASKZ_VPERMILPI(PERMILPS, m)
- CASE_MASKZ_VPERMILPI(PERMILPS, r)
- CASE_MASKZ_VPERMILPI(PSHUFD, m)
- CASE_MASKZ_VPERMILPI(PSHUFD, r)
- CASE_MASKZ_VPERMILPI(PSHUFHW, m)
- CASE_MASKZ_VPERMILPI(PSHUFHW, r)
- CASE_MASKZ_VPERMILPI(PSHUFLW, m)
- CASE_MASKZ_VPERMILPI(PSHUFLW, r)
- CASE_MASKZ_VPERM(PERMPD, m)
- CASE_MASKZ_VPERM(PERMPD, r)
- CASE_MASKZ_VPERM(PERMQ, m)
- CASE_MASKZ_VPERM(PERMQ, r)
- CASE_MASKZ_VSHUF(64X2, m)
- CASE_MASKZ_VSHUF(64X2, r)
- CASE_MASKZ_VSHUF(32X4, m)
- CASE_MASKZ_VSHUF(32X4, r)
- CASE_MASKZ_INS_COMMON(BROADCASTF64X2, Z128, rm)
- CASE_MASKZ_INS_COMMON(BROADCASTI64X2, Z128, rm)
- CASE_MASKZ_INS_COMMON(BROADCASTF64X2, , rm)
- CASE_MASKZ_INS_COMMON(BROADCASTI64X2, , rm)
- CASE_MASKZ_INS_COMMON(BROADCASTF64X4, , rm)
- CASE_MASKZ_INS_COMMON(BROADCASTI64X4, , rm)
- CASE_MASKZ_INS_COMMON(BROADCASTF32X4, Z256, rm)
- CASE_MASKZ_INS_COMMON(BROADCASTI32X4, Z256, rm)
- CASE_MASKZ_INS_COMMON(BROADCASTF32X4, , rm)
- CASE_MASKZ_INS_COMMON(BROADCASTI32X4, , rm)
- CASE_MASKZ_INS_COMMON(BROADCASTF32X8, , rm)
- CASE_MASKZ_INS_COMMON(BROADCASTI32X8, , rm)
- CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z128, r)
- CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z128, m)
- CASE_MASKZ_INS_COMMON(BROADCASTF32X2, Z256, r)
- CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z256, r)
- CASE_MASKZ_INS_COMMON(BROADCASTF32X2, Z256, m)
- CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z256, m)
- CASE_MASKZ_INS_COMMON(BROADCASTF32X2, Z, r)
- CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z, r)
- CASE_MASKZ_INS_COMMON(BROADCASTF32X2, Z, m)
- CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z, m)
- MaskWithZero = true;
- MaskRegName = getRegName(MI->getOperand(1).getReg());
- break;
- CASE_MASK_MOVDUP(MOVDDUP, m)
- CASE_MASK_MOVDUP(MOVDDUP, r)
- CASE_MASK_MOVDUP(MOVSHDUP, m)
- CASE_MASK_MOVDUP(MOVSHDUP, r)
- CASE_MASK_MOVDUP(MOVSLDUP, m)
- CASE_MASK_MOVDUP(MOVSLDUP, r)
- CASE_MASK_PMOVZX(PMOVZXBD, m)
- CASE_MASK_PMOVZX(PMOVZXBD, r)
- CASE_MASK_PMOVZX(PMOVZXBQ, m)
- CASE_MASK_PMOVZX(PMOVZXBQ, r)
- CASE_MASK_PMOVZX(PMOVZXBW, m)
- CASE_MASK_PMOVZX(PMOVZXBW, r)
- CASE_MASK_PMOVZX(PMOVZXDQ, m)
- CASE_MASK_PMOVZX(PMOVZXDQ, r)
- CASE_MASK_PMOVZX(PMOVZXWD, m)
- CASE_MASK_PMOVZX(PMOVZXWD, r)
- CASE_MASK_PMOVZX(PMOVZXWQ, m)
- CASE_MASK_PMOVZX(PMOVZXWQ, r)
- CASE_MASK_UNPCK(PUNPCKHBW, m)
- CASE_MASK_UNPCK(PUNPCKHBW, r)
- CASE_MASK_UNPCK(PUNPCKHWD, m)
- CASE_MASK_UNPCK(PUNPCKHWD, r)
- CASE_MASK_UNPCK(PUNPCKHDQ, m)
- CASE_MASK_UNPCK(PUNPCKHDQ, r)
- CASE_MASK_UNPCK(PUNPCKLBW, m)
- CASE_MASK_UNPCK(PUNPCKLBW, r)
- CASE_MASK_UNPCK(PUNPCKLWD, m)
- CASE_MASK_UNPCK(PUNPCKLWD, r)
- CASE_MASK_UNPCK(PUNPCKLDQ, m)
- CASE_MASK_UNPCK(PUNPCKLDQ, r)
- CASE_MASK_UNPCK(UNPCKHPD, m)
- CASE_MASK_UNPCK(UNPCKHPD, r)
- CASE_MASK_UNPCK(UNPCKHPS, m)
- CASE_MASK_UNPCK(UNPCKHPS, r)
- CASE_MASK_UNPCK(UNPCKLPD, m)
- CASE_MASK_UNPCK(UNPCKLPD, r)
- CASE_MASK_UNPCK(UNPCKLPS, m)
- CASE_MASK_UNPCK(UNPCKLPS, r)
- CASE_MASK_SHUF(PALIGNR, r)
- CASE_MASK_SHUF(PALIGNR, m)
- CASE_MASK_SHUF(ALIGNQ, r)
- CASE_MASK_SHUF(ALIGNQ, m)
- CASE_MASK_SHUF(ALIGND, r)
- CASE_MASK_SHUF(ALIGND, m)
- CASE_MASK_SHUF(SHUFPD, m)
- CASE_MASK_SHUF(SHUFPD, r)
- CASE_MASK_SHUF(SHUFPS, m)
- CASE_MASK_SHUF(SHUFPS, r)
- CASE_MASK_VPERMILPI(PERMILPD, m)
- CASE_MASK_VPERMILPI(PERMILPD, r)
- CASE_MASK_VPERMILPI(PERMILPS, m)
- CASE_MASK_VPERMILPI(PERMILPS, r)
- CASE_MASK_VPERMILPI(PSHUFD, m)
- CASE_MASK_VPERMILPI(PSHUFD, r)
- CASE_MASK_VPERMILPI(PSHUFHW, m)
- CASE_MASK_VPERMILPI(PSHUFHW, r)
- CASE_MASK_VPERMILPI(PSHUFLW, m)
- CASE_MASK_VPERMILPI(PSHUFLW, r)
- CASE_MASK_VPERM(PERMPD, m)
- CASE_MASK_VPERM(PERMPD, r)
- CASE_MASK_VPERM(PERMQ, m)
- CASE_MASK_VPERM(PERMQ, r)
- CASE_MASK_VSHUF(64X2, m)
- CASE_MASK_VSHUF(64X2, r)
- CASE_MASK_VSHUF(32X4, m)
- CASE_MASK_VSHUF(32X4, r)
- CASE_MASK_INS_COMMON(BROADCASTF64X2, Z128, rm)
- CASE_MASK_INS_COMMON(BROADCASTI64X2, Z128, rm)
- CASE_MASK_INS_COMMON(BROADCASTF64X2, , rm)
- CASE_MASK_INS_COMMON(BROADCASTI64X2, , rm)
- CASE_MASK_INS_COMMON(BROADCASTF64X4, , rm)
- CASE_MASK_INS_COMMON(BROADCASTI64X4, , rm)
- CASE_MASK_INS_COMMON(BROADCASTF32X4, Z256, rm)
- CASE_MASK_INS_COMMON(BROADCASTI32X4, Z256, rm)
- CASE_MASK_INS_COMMON(BROADCASTF32X4, , rm)
- CASE_MASK_INS_COMMON(BROADCASTI32X4, , rm)
- CASE_MASK_INS_COMMON(BROADCASTF32X8, , rm)
- CASE_MASK_INS_COMMON(BROADCASTI32X8, , rm)
- CASE_MASK_INS_COMMON(BROADCASTI32X2, Z128, r)
- CASE_MASK_INS_COMMON(BROADCASTI32X2, Z128, m)
- CASE_MASK_INS_COMMON(BROADCASTF32X2, Z256, r)
- CASE_MASK_INS_COMMON(BROADCASTI32X2, Z256, r)
- CASE_MASK_INS_COMMON(BROADCASTF32X2, Z256, m)
- CASE_MASK_INS_COMMON(BROADCASTI32X2, Z256, m)
- CASE_MASK_INS_COMMON(BROADCASTF32X2, Z, r)
- CASE_MASK_INS_COMMON(BROADCASTI32X2, Z, r)
- CASE_MASK_INS_COMMON(BROADCASTF32X2, Z, m)
- CASE_MASK_INS_COMMON(BROADCASTI32X2, Z, m)
- MaskRegName = getRegName(MI->getOperand(2).getReg());
- break;
- }
+
+ bool MaskWithZero = (TSFlags & X86II::EVEX_Z);
+ unsigned MaskOp = Desc.getNumDefs();
+
+ if (Desc.getOperandConstraint(MaskOp, MCOI::TIED_TO) != -1)
+ ++MaskOp;
+
+ const char *MaskRegName = getRegName(MI->getOperand(MaskOp).getReg());
// MASK: zmmX {%kY}
OS << " {%" << MaskRegName << "}";
@@ -404,6 +248,248 @@ static void printMasking(raw_ostream &OS, const MCInst *MI,
OS << " {z}";
}
+static bool printFMA3Comments(const MCInst *MI, raw_ostream &OS) {
+ const char *Mul1Name = nullptr, *Mul2Name = nullptr, *AccName = nullptr;
+ unsigned NumOperands = MI->getNumOperands();
+ bool RegForm = false;
+ bool Negate = false;
+ StringRef AccStr = "+";
+
+ // The operands for FMA instructions without rounding fall into two forms.
+ // dest, src1, src2, src3
+ // dest, src1, mask, src2, src3
+ // Where src3 is either a register or 5 memory address operands. So to find
+ // dest and src1 we can index from the front. To find src2 and src3 we can
+ // index from the end by taking into account memory vs register form when
+ // finding src2.
+
+ switch (MI->getOpcode()) {
+ default:
+ return false;
+ CASE_FMA_PACKED_REG(FMADD132)
+ CASE_FMA_SCALAR_REG(FMADD132)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMADD132)
+ CASE_FMA_SCALAR_MEM(FMADD132)
+ AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ break;
+
+ CASE_FMA_PACKED_REG(FMADD213)
+ CASE_FMA_SCALAR_REG(FMADD213)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMADD213)
+ CASE_FMA_SCALAR_MEM(FMADD213)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul2Name = getRegName(MI->getOperand(1).getReg());
+ break;
+
+ CASE_FMA_PACKED_REG(FMADD231)
+ CASE_FMA_SCALAR_REG(FMADD231)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMADD231)
+ CASE_FMA_SCALAR_MEM(FMADD231)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ AccName = getRegName(MI->getOperand(1).getReg());
+ break;
+
+ CASE_FMA_PACKED_REG(FMSUB132)
+ CASE_FMA_SCALAR_REG(FMSUB132)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMSUB132)
+ CASE_FMA_SCALAR_MEM(FMSUB132)
+ AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-";
+ break;
+
+ CASE_FMA_PACKED_REG(FMSUB213)
+ CASE_FMA_SCALAR_REG(FMSUB213)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMSUB213)
+ CASE_FMA_SCALAR_MEM(FMSUB213)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul2Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-";
+ break;
+
+ CASE_FMA_PACKED_REG(FMSUB231)
+ CASE_FMA_SCALAR_REG(FMSUB231)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMSUB231)
+ CASE_FMA_SCALAR_MEM(FMSUB231)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ AccName = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-";
+ break;
+
+ CASE_FMA_PACKED_REG(FNMADD132)
+ CASE_FMA_SCALAR_REG(FNMADD132)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FNMADD132)
+ CASE_FMA_SCALAR_MEM(FNMADD132)
+ AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ Negate = true;
+ break;
+
+ CASE_FMA_PACKED_REG(FNMADD213)
+ CASE_FMA_SCALAR_REG(FNMADD213)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FNMADD213)
+ CASE_FMA_SCALAR_MEM(FNMADD213)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul2Name = getRegName(MI->getOperand(1).getReg());
+ Negate = true;
+ break;
+
+ CASE_FMA_PACKED_REG(FNMADD231)
+ CASE_FMA_SCALAR_REG(FNMADD231)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FNMADD231)
+ CASE_FMA_SCALAR_MEM(FNMADD231)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ AccName = getRegName(MI->getOperand(1).getReg());
+ Negate = true;
+ break;
+
+ CASE_FMA_PACKED_REG(FNMSUB132)
+ CASE_FMA_SCALAR_REG(FNMSUB132)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FNMSUB132)
+ CASE_FMA_SCALAR_MEM(FNMSUB132)
+ AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-";
+ Negate = true;
+ break;
+
+ CASE_FMA_PACKED_REG(FNMSUB213)
+ CASE_FMA_SCALAR_REG(FNMSUB213)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FNMSUB213)
+ CASE_FMA_SCALAR_MEM(FNMSUB213)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul2Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-";
+ Negate = true;
+ break;
+
+ CASE_FMA_PACKED_REG(FNMSUB231)
+ CASE_FMA_SCALAR_REG(FNMSUB231)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FNMSUB231)
+ CASE_FMA_SCALAR_MEM(FNMSUB231)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ AccName = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-";
+ Negate = true;
+ break;
+
+ CASE_FMA_PACKED_REG(FMADDSUB132)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMADDSUB132)
+ AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "+/-";
+ break;
+
+ CASE_FMA_PACKED_REG(FMADDSUB213)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMADDSUB213)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul2Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "+/-";
+ break;
+
+ CASE_FMA_PACKED_REG(FMADDSUB231)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMADDSUB231)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ AccName = getRegName(MI->getOperand(1).getReg());
+ AccStr = "+/-";
+ break;
+
+ CASE_FMA_PACKED_REG(FMSUBADD132)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMSUBADD132)
+ AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-/+";
+ break;
+
+ CASE_FMA_PACKED_REG(FMSUBADD213)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMSUBADD213)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul2Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-/+";
+ break;
+
+ CASE_FMA_PACKED_REG(FMSUBADD231)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMSUBADD231)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ AccName = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-/+";
+ break;
+ }
+
+ const char *DestName = getRegName(MI->getOperand(0).getReg());
+
+ if (!Mul1Name) Mul1Name = "mem";
+ if (!Mul2Name) Mul2Name = "mem";
+ if (!AccName) AccName = "mem";
+
+ OS << DestName << " = ";
+ // TODO: Print masking information?
+
+ if (Negate)
+ OS << '-';
+
+ OS << '(' << Mul1Name << " * " << Mul2Name << ") " << AccStr << ' '
+ << AccName;
+
+ return true;
+}
+
+
//===----------------------------------------------------------------------===//
// Top Level Entrypoint
//===----------------------------------------------------------------------===//
@@ -412,13 +498,16 @@ static void printMasking(raw_ostream &OS, const MCInst *MI,
/// newline terminated strings to the specified string if desired. This
/// information is shown in disassembly dumps when verbose assembly is enabled.
bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
- const char *(*getRegName)(unsigned)) {
+ const MCInstrInfo &MCII) {
// If this is a shuffle operation, the switch should fill in this state.
SmallVector<int, 8> ShuffleMask;
const char *DestName = nullptr, *Src1Name = nullptr, *Src2Name = nullptr;
unsigned NumOperands = MI->getNumOperands();
bool RegForm = false;
+ if (printFMA3Comments(MI, OS))
+ return true;
+
switch (MI->getOpcode()) {
default:
// Not an instruction for which we can decode comments.
@@ -433,7 +522,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VBLENDPDrmi:
case X86::VBLENDPDYrmi:
if (MI->getOperand(NumOperands - 1).isImm())
- DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::f64, 0),
+ DecodeBLENDMask(getRegOperandNumElts(MI, 64, 0),
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
@@ -449,7 +538,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VBLENDPSrmi:
case X86::VBLENDPSYrmi:
if (MI->getOperand(NumOperands - 1).isImm())
- DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::f32, 0),
+ DecodeBLENDMask(getRegOperandNumElts(MI, 32, 0),
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
@@ -465,7 +554,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VPBLENDWrmi:
case X86::VPBLENDWYrmi:
if (MI->getOperand(NumOperands - 1).isImm())
- DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::i16, 0),
+ DecodeBLENDMask(getRegOperandNumElts(MI, 16, 0),
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
@@ -479,7 +568,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VPBLENDDrmi:
case X86::VPBLENDDYrmi:
if (MI->getOperand(NumOperands - 1).isImm())
- DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::i32, 0),
+ DecodeBLENDMask(getRegOperandNumElts(MI, 32, 0),
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
@@ -524,7 +613,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VMOVHPDZ128rm:
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeInsertElementMask(MVT::v2f64, 1, 1, ShuffleMask);
+ DecodeInsertElementMask(2, 1, 1, ShuffleMask);
break;
case X86::MOVHPSrm:
@@ -532,7 +621,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VMOVHPSZ128rm:
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeInsertElementMask(MVT::v4f32, 2, 2, ShuffleMask);
+ DecodeInsertElementMask(4, 2, 2, ShuffleMask);
break;
case X86::MOVLPDrm:
@@ -540,7 +629,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VMOVLPDZ128rm:
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeInsertElementMask(MVT::v2f64, 0, 1, ShuffleMask);
+ DecodeInsertElementMask(2, 0, 1, ShuffleMask);
break;
case X86::MOVLPSrm:
@@ -548,7 +637,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VMOVLPSZ128rm:
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeInsertElementMask(MVT::v4f32, 0, 2, ShuffleMask);
+ DecodeInsertElementMask(4, 0, 2, ShuffleMask);
break;
CASE_MOVDUP(MOVSLDUP, r)
@@ -557,7 +646,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_MOVDUP(MOVSLDUP, m)
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeMOVSLDUPMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask);
+ DecodeMOVSLDUPMask(getRegOperandNumElts(MI, 32, 0), ShuffleMask);
break;
CASE_MOVDUP(MOVSHDUP, r)
@@ -566,7 +655,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_MOVDUP(MOVSHDUP, m)
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeMOVSHDUPMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask);
+ DecodeMOVSHDUPMask(getRegOperandNumElts(MI, 32, 0), ShuffleMask);
break;
CASE_MOVDUP(MOVDDUP, r)
@@ -575,7 +664,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_MOVDUP(MOVDDUP, m)
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeMOVDDUPMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask);
+ DecodeMOVDDUPMask(getRegOperandNumElts(MI, 64, 0), ShuffleMask);
break;
case X86::PSLLDQri:
@@ -591,7 +680,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VPSLLDQZrm:
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(NumOperands - 1).isImm())
- DecodePSLLDQMask(getRegOperandVectorVT(MI, MVT::i8, 0),
+ DecodePSLLDQMask(getRegOperandNumElts(MI, 8, 0),
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
break;
@@ -609,7 +698,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VPSRLDQZrm:
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(NumOperands - 1).isImm())
- DecodePSRLDQMask(getRegOperandVectorVT(MI, MVT::i8, 0),
+ DecodePSRLDQMask(getRegOperandNumElts(MI, 8, 0),
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
break;
@@ -623,7 +712,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
Src2Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(NumOperands - 1).isImm())
- DecodePALIGNRMask(getRegOperandVectorVT(MI, MVT::i8, 0),
+ DecodePALIGNRMask(getRegOperandNumElts(MI, 8, 0),
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
break;
@@ -641,7 +730,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
Src2Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(NumOperands - 1).isImm())
- DecodeVALIGNMask(getRegOperandVectorVT(MI, MVT::i64, 0),
+ DecodeVALIGNMask(getRegOperandNumElts(MI, 64, 0),
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
break;
@@ -659,7 +748,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
Src2Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(NumOperands - 1).isImm())
- DecodeVALIGNMask(getRegOperandVectorVT(MI, MVT::i32, 0),
+ DecodeVALIGNMask(getRegOperandNumElts(MI, 32, 0),
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
break;
@@ -671,7 +760,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_SHUF(PSHUFD, mi)
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(NumOperands - 1).isImm())
- DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::i32, 0),
+ DecodePSHUFMask(getRegOperandNumElts(MI, 32, 0), 32,
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
break;
@@ -683,7 +772,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_SHUF(PSHUFHW, mi)
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(NumOperands - 1).isImm())
- DecodePSHUFHWMask(getRegOperandVectorVT(MI, MVT::i16, 0),
+ DecodePSHUFHWMask(getRegOperandNumElts(MI, 16, 0),
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
break;
@@ -695,7 +784,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_SHUF(PSHUFLW, mi)
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(NumOperands - 1).isImm())
- DecodePSHUFLWMask(getRegOperandVectorVT(MI, MVT::i16, 0),
+ DecodePSHUFLWMask(getRegOperandNumElts(MI, 16, 0),
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
break;
@@ -707,8 +796,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MMX_PSHUFWmi:
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(NumOperands - 1).isImm())
- DecodePSHUFMask(MVT::v4i16,
- MI->getOperand(NumOperands - 1).getImm(),
+ DecodePSHUFMask(4, 16, MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
break;
@@ -718,7 +806,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::PSWAPDrm:
DestName = getRegName(MI->getOperand(0).getReg());
- DecodePSWAPMask(MVT::v2i32, ShuffleMask);
+ DecodePSWAPMask(2, ShuffleMask);
break;
CASE_UNPCK(PUNPCKHBW, r)
@@ -731,7 +819,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MMX_PUNPCKHBWirm:
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i8, 0), ShuffleMask);
+ DecodeUNPCKHMask(getRegOperandNumElts(MI, 8, 0), 8, ShuffleMask);
break;
CASE_UNPCK(PUNPCKHWD, r)
@@ -744,7 +832,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MMX_PUNPCKHWDirm:
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i16, 0), ShuffleMask);
+ DecodeUNPCKHMask(getRegOperandNumElts(MI, 16, 0), 16, ShuffleMask);
break;
CASE_UNPCK(PUNPCKHDQ, r)
@@ -757,7 +845,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MMX_PUNPCKHDQirm:
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i32, 0), ShuffleMask);
+ DecodeUNPCKHMask(getRegOperandNumElts(MI, 32, 0), 32, ShuffleMask);
break;
CASE_UNPCK(PUNPCKHQDQ, r)
@@ -768,7 +856,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_UNPCK(PUNPCKHQDQ, m)
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i64, 0), ShuffleMask);
+ DecodeUNPCKHMask(getRegOperandNumElts(MI, 64, 0), 64, ShuffleMask);
break;
CASE_UNPCK(PUNPCKLBW, r)
@@ -781,7 +869,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MMX_PUNPCKLBWirm:
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i8, 0), ShuffleMask);
+ DecodeUNPCKLMask(getRegOperandNumElts(MI, 8, 0), 8, ShuffleMask);
break;
CASE_UNPCK(PUNPCKLWD, r)
@@ -794,7 +882,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MMX_PUNPCKLWDirm:
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i16, 0), ShuffleMask);
+ DecodeUNPCKLMask(getRegOperandNumElts(MI, 16, 0), 16, ShuffleMask);
break;
CASE_UNPCK(PUNPCKLDQ, r)
@@ -807,7 +895,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MMX_PUNPCKLDQirm:
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i32, 0), ShuffleMask);
+ DecodeUNPCKLMask(getRegOperandNumElts(MI, 32, 0), 32, ShuffleMask);
break;
CASE_UNPCK(PUNPCKLQDQ, r)
@@ -818,7 +906,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_UNPCK(PUNPCKLQDQ, m)
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i64, 0), ShuffleMask);
+ DecodeUNPCKLMask(getRegOperandNumElts(MI, 64, 0), 64, ShuffleMask);
break;
CASE_SHUF(SHUFPD, rri)
@@ -828,9 +916,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_SHUF(SHUFPD, rmi)
if (MI->getOperand(NumOperands - 1).isImm())
- DecodeSHUFPMask(getRegOperandVectorVT(MI, MVT::f64, 0),
- MI->getOperand(NumOperands - 1).getImm(),
- ShuffleMask);
+ DecodeSHUFPMask(getRegOperandNumElts(MI, 64, 0), 64,
+ MI->getOperand(NumOperands - 1).getImm(), ShuffleMask);
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -842,7 +929,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_SHUF(SHUFPS, rmi)
if (MI->getOperand(NumOperands - 1).isImm())
- DecodeSHUFPMask(getRegOperandVectorVT(MI, MVT::f32, 0),
+ DecodeSHUFPMask(getRegOperandNumElts(MI, 32, 0), 32,
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
@@ -855,7 +942,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
LLVM_FALLTHROUGH;
CASE_VSHUF(64X2, m)
- decodeVSHUF64x2FamilyMask(getRegOperandVectorVT(MI, MVT::i64, 0),
+ decodeVSHUF64x2FamilyMask(getRegOperandNumElts(MI, 64, 0), 64,
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
@@ -868,7 +955,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
LLVM_FALLTHROUGH;
CASE_VSHUF(32X4, m)
- decodeVSHUF64x2FamilyMask(getRegOperandVectorVT(MI, MVT::i32, 0),
+ decodeVSHUF64x2FamilyMask(getRegOperandNumElts(MI, 32, 0), 32,
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
@@ -881,7 +968,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
LLVM_FALLTHROUGH;
CASE_UNPCK(UNPCKLPD, m)
- DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask);
+ DecodeUNPCKLMask(getRegOperandNumElts(MI, 64, 0), 64, ShuffleMask);
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -892,7 +979,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
LLVM_FALLTHROUGH;
CASE_UNPCK(UNPCKLPS, m)
- DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask);
+ DecodeUNPCKLMask(getRegOperandNumElts(MI, 32, 0), 32, ShuffleMask);
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -903,7 +990,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
LLVM_FALLTHROUGH;
CASE_UNPCK(UNPCKHPD, m)
- DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask);
+ DecodeUNPCKHMask(getRegOperandNumElts(MI, 64, 0), 64, ShuffleMask);
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -914,7 +1001,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
LLVM_FALLTHROUGH;
CASE_UNPCK(UNPCKHPS, m)
- DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask);
+ DecodeUNPCKHMask(getRegOperandNumElts(MI, 32, 0), 32, ShuffleMask);
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -925,7 +1012,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_VPERMILPI(PERMILPS, m)
if (MI->getOperand(NumOperands - 1).isImm())
- DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::f32, 0),
+ DecodePSHUFMask(getRegOperandNumElts(MI, 32, 0), 32,
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
@@ -937,7 +1024,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_VPERMILPI(PERMILPD, m)
if (MI->getOperand(NumOperands - 1).isImm())
- DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::f64, 0),
+ DecodePSHUFMask(getRegOperandNumElts(MI, 64, 0), 64,
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
@@ -952,8 +1039,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VPERM2I128rm:
    // For instruction comment purposes, assume the 256-bit vector is v4i64.
if (MI->getOperand(NumOperands - 1).isImm())
- DecodeVPERM2X128Mask(MVT::v4i64,
- MI->getOperand(NumOperands - 1).getImm(),
+ DecodeVPERM2X128Mask(4, MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
@@ -965,7 +1051,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_VPERM(PERMPD, m)
if (MI->getOperand(NumOperands - 1).isImm())
- DecodeVPERMMask(getRegOperandVectorVT(MI, MVT::f64, 0),
+ DecodeVPERMMask(getRegOperandNumElts(MI, 64, 0),
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
@@ -977,7 +1063,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_VPERM(PERMQ, m)
if (MI->getOperand(NumOperands - 1).isImm())
- DecodeVPERMMask(getRegOperandVectorVT(MI, MVT::i64, 0),
+ DecodeVPERMMask(getRegOperandNumElts(MI, 64, 0),
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
@@ -993,7 +1079,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MOVSDrm:
case X86::VMOVSDrm:
case X86::VMOVSDZrm:
- DecodeScalarMoveMask(MVT::v2f64, nullptr == Src2Name, ShuffleMask);
+ DecodeScalarMoveMask(2, nullptr == Src2Name, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -1007,13 +1093,14 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MOVSSrm:
case X86::VMOVSSrm:
case X86::VMOVSSZrm:
- DecodeScalarMoveMask(MVT::v4f32, nullptr == Src2Name, ShuffleMask);
+ DecodeScalarMoveMask(4, nullptr == Src2Name, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
case X86::MOVPQI2QIrr:
case X86::MOVZPQILo2PQIrr:
case X86::VMOVPQI2QIrr:
+ case X86::VMOVPQI2QIZrr:
case X86::VMOVZPQILo2PQIrr:
case X86::VMOVZPQILo2PQIZrr:
Src1Name = getRegName(MI->getOperand(1).getReg());
@@ -1022,23 +1109,22 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MOVQI2PQIrm:
case X86::VMOVQI2PQIrm:
case X86::VMOVQI2PQIZrm:
- DecodeZeroMoveLowMask(MVT::v2i64, ShuffleMask);
+ DecodeZeroMoveLowMask(2, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
case X86::MOVDI2PDIrm:
case X86::VMOVDI2PDIrm:
case X86::VMOVDI2PDIZrm:
- DecodeZeroMoveLowMask(MVT::v4i32, ShuffleMask);
+ DecodeZeroMoveLowMask(4, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
case X86::EXTRQI:
if (MI->getOperand(2).isImm() &&
MI->getOperand(3).isImm())
- DecodeEXTRQIMask(MVT::v16i8, MI->getOperand(2).getImm(),
- MI->getOperand(3).getImm(),
- ShuffleMask);
+ DecodeEXTRQIMask(16, 8, MI->getOperand(2).getImm(),
+ MI->getOperand(3).getImm(), ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
Src1Name = getRegName(MI->getOperand(1).getReg());
@@ -1047,9 +1133,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::INSERTQI:
if (MI->getOperand(3).isImm() &&
MI->getOperand(4).isImm())
- DecodeINSERTQIMask(MVT::v16i8, MI->getOperand(3).getImm(),
- MI->getOperand(4).getImm(),
- ShuffleMask);
+ DecodeINSERTQIMask(16, 8, MI->getOperand(3).getImm(),
+ MI->getOperand(4).getImm(), ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
Src1Name = getRegName(MI->getOperand(1).getReg());
@@ -1060,39 +1145,39 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VBROADCASTI128:
CASE_AVX512_INS_COMMON(BROADCASTF64X2, Z128, rm)
CASE_AVX512_INS_COMMON(BROADCASTI64X2, Z128, rm)
- DecodeSubVectorBroadcast(MVT::v4f64, MVT::v2f64, ShuffleMask);
+ DecodeSubVectorBroadcast(4, 2, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
CASE_AVX512_INS_COMMON(BROADCASTF64X2, , rm)
CASE_AVX512_INS_COMMON(BROADCASTI64X2, , rm)
- DecodeSubVectorBroadcast(MVT::v8f64, MVT::v2f64, ShuffleMask);
+ DecodeSubVectorBroadcast(8, 2, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
CASE_AVX512_INS_COMMON(BROADCASTF64X4, , rm)
CASE_AVX512_INS_COMMON(BROADCASTI64X4, , rm)
- DecodeSubVectorBroadcast(MVT::v8f64, MVT::v4f64, ShuffleMask);
+ DecodeSubVectorBroadcast(8, 4, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
CASE_AVX512_INS_COMMON(BROADCASTF32X4, Z256, rm)
CASE_AVX512_INS_COMMON(BROADCASTI32X4, Z256, rm)
- DecodeSubVectorBroadcast(MVT::v8f32, MVT::v4f32, ShuffleMask);
+ DecodeSubVectorBroadcast(8, 4, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
CASE_AVX512_INS_COMMON(BROADCASTF32X4, , rm)
CASE_AVX512_INS_COMMON(BROADCASTI32X4, , rm)
- DecodeSubVectorBroadcast(MVT::v16f32, MVT::v4f32, ShuffleMask);
+ DecodeSubVectorBroadcast(16, 4, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
CASE_AVX512_INS_COMMON(BROADCASTF32X8, , rm)
CASE_AVX512_INS_COMMON(BROADCASTI32X8, , rm)
- DecodeSubVectorBroadcast(MVT::v16f32, MVT::v8f32, ShuffleMask);
+ DecodeSubVectorBroadcast(16, 8, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z128, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
LLVM_FALLTHROUGH;
CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z128, m)
- DecodeSubVectorBroadcast(MVT::v4f32, MVT::v2f32, ShuffleMask);
+ DecodeSubVectorBroadcast(4, 2, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, r)
@@ -1101,7 +1186,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
LLVM_FALLTHROUGH;
CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, m)
CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, m)
- DecodeSubVectorBroadcast(MVT::v8f32, MVT::v2f32, ShuffleMask);
+ DecodeSubVectorBroadcast(8, 2, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, r)
@@ -1110,40 +1195,55 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
LLVM_FALLTHROUGH;
CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, m)
CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, m)
- DecodeSubVectorBroadcast(MVT::v16f32, MVT::v2f32, ShuffleMask);
+ DecodeSubVectorBroadcast(16, 2, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
CASE_PMOVZX(PMOVZXBW, r)
- CASE_PMOVZX(PMOVZXBD, r)
- CASE_PMOVZX(PMOVZXBQ, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
LLVM_FALLTHROUGH;
-
CASE_PMOVZX(PMOVZXBW, m)
+ DecodeZeroExtendMask(8, 16, getRegOperandNumElts(MI, 16, 0), ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_PMOVZX(PMOVZXBD, r)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
CASE_PMOVZX(PMOVZXBD, m)
+ DecodeZeroExtendMask(8, 32, getRegOperandNumElts(MI, 32, 0), ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_PMOVZX(PMOVZXBQ, r)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
CASE_PMOVZX(PMOVZXBQ, m)
- DecodeZeroExtendMask(MVT::i8, getZeroExtensionResultType(MI), ShuffleMask);
+ DecodeZeroExtendMask(8, 64, getRegOperandNumElts(MI, 64, 0), ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
CASE_PMOVZX(PMOVZXWD, r)
- CASE_PMOVZX(PMOVZXWQ, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
LLVM_FALLTHROUGH;
-
CASE_PMOVZX(PMOVZXWD, m)
+ DecodeZeroExtendMask(16, 32, getRegOperandNumElts(MI, 32, 0), ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_PMOVZX(PMOVZXWQ, r)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
CASE_PMOVZX(PMOVZXWQ, m)
- DecodeZeroExtendMask(MVT::i16, getZeroExtensionResultType(MI), ShuffleMask);
+ DecodeZeroExtendMask(16, 64, getRegOperandNumElts(MI, 64, 0), ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
CASE_PMOVZX(PMOVZXDQ, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
LLVM_FALLTHROUGH;
-
CASE_PMOVZX(PMOVZXDQ, m)
- DecodeZeroExtendMask(MVT::i32, getZeroExtensionResultType(MI), ShuffleMask);
+ DecodeZeroExtendMask(32, 64, getRegOperandNumElts(MI, 64, 0), ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
}
@@ -1156,7 +1256,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
if (!DestName) DestName = Src1Name;
if (DestName) {
OS << DestName;
- printMasking(OS, MI, getRegName);
+ printMasking(OS, MI, MCII);
} else
OS << "mem";
diff --git a/lib/Target/X86/InstPrinter/X86InstComments.h b/lib/Target/X86/InstPrinter/X86InstComments.h
index 629c02c95c7f..40dffa5fbb8a 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.h
+++ b/lib/Target/X86/InstPrinter/X86InstComments.h
@@ -15,19 +15,13 @@
#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H
#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H
-#include "llvm/CodeGen/MachineInstr.h"
-
namespace llvm {
- enum AsmComments {
- // For instr that was compressed from EVEX to VEX.
- AC_EVEX_2_VEX = MachineInstr::TAsmComments
- };
-
class MCInst;
+ class MCInstrInfo;
class raw_ostream;
bool EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
- const char *(*getRegName)(unsigned));
+ const MCInstrInfo &MCII);
}
#endif
diff --git a/lib/Target/X86/InstPrinter/X86InstPrinterCommon.cpp b/lib/Target/X86/InstPrinter/X86InstPrinterCommon.cpp
new file mode 100644
index 000000000000..432cd47ae499
--- /dev/null
+++ b/lib/Target/X86/InstPrinter/X86InstPrinterCommon.cpp
@@ -0,0 +1,142 @@
+//===--- X86InstPrinterCommon.cpp - X86 assembly instruction printing -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes common code for rendering MCInst instances as AT&T-style
+// and Intel-style assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstPrinterCommon.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Casting.h"
+#include <cstdint>
+#include <cassert>
+
+using namespace llvm;
+
+void X86InstPrinterCommon::printSSEAVXCC(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ int64_t Imm = MI->getOperand(Op).getImm();
+ switch (Imm) {
+ default: llvm_unreachable("Invalid ssecc/avxcc argument!");
+ case 0: O << "eq"; break;
+ case 1: O << "lt"; break;
+ case 2: O << "le"; break;
+ case 3: O << "unord"; break;
+ case 4: O << "neq"; break;
+ case 5: O << "nlt"; break;
+ case 6: O << "nle"; break;
+ case 7: O << "ord"; break;
+ case 8: O << "eq_uq"; break;
+ case 9: O << "nge"; break;
+ case 0xa: O << "ngt"; break;
+ case 0xb: O << "false"; break;
+ case 0xc: O << "neq_oq"; break;
+ case 0xd: O << "ge"; break;
+ case 0xe: O << "gt"; break;
+ case 0xf: O << "true"; break;
+ case 0x10: O << "eq_os"; break;
+ case 0x11: O << "lt_oq"; break;
+ case 0x12: O << "le_oq"; break;
+ case 0x13: O << "unord_s"; break;
+ case 0x14: O << "neq_us"; break;
+ case 0x15: O << "nlt_uq"; break;
+ case 0x16: O << "nle_uq"; break;
+ case 0x17: O << "ord_s"; break;
+ case 0x18: O << "eq_us"; break;
+ case 0x19: O << "nge_uq"; break;
+ case 0x1a: O << "ngt_uq"; break;
+ case 0x1b: O << "false_os"; break;
+ case 0x1c: O << "neq_os"; break;
+ case 0x1d: O << "ge_oq"; break;
+ case 0x1e: O << "gt_oq"; break;
+ case 0x1f: O << "true_us"; break;
+ }
+}
+
+void X86InstPrinterCommon::printXOPCC(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ int64_t Imm = MI->getOperand(Op).getImm();
+ switch (Imm) {
+ default: llvm_unreachable("Invalid xopcc argument!");
+ case 0: O << "lt"; break;
+ case 1: O << "le"; break;
+ case 2: O << "gt"; break;
+ case 3: O << "ge"; break;
+ case 4: O << "eq"; break;
+ case 5: O << "neq"; break;
+ case 6: O << "false"; break;
+ case 7: O << "true"; break;
+ }
+}
+
+void X86InstPrinterCommon::printRoundingControl(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ int64_t Imm = MI->getOperand(Op).getImm() & 0x3;
+ switch (Imm) {
+ case 0: O << "{rn-sae}"; break;
+ case 1: O << "{rd-sae}"; break;
+ case 2: O << "{ru-sae}"; break;
+ case 3: O << "{rz-sae}"; break;
+ }
+}
+
+/// printPCRelImm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value (e.g. for jumps and calls). In
+/// Intel-style these print slightly differently than normal immediates.
+/// For example, a $ is not emitted.
+void X86InstPrinterCommon::printPCRelImm(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isImm())
+ O << formatImm(Op.getImm());
+ else {
+ assert(Op.isExpr() && "unknown pcrel immediate operand");
+ // If a symbolic branch target was added as a constant expression then print
+ // that address in hex.
+ const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
+ int64_t Address;
+ if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) {
+ O << formatHex((uint64_t)Address);
+ } else {
+ // Otherwise, just print the expression.
+ Op.getExpr()->print(O, &MAI);
+ }
+ }
+}
+
+void X86InstPrinterCommon::printOptionalSegReg(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ if (MI->getOperand(OpNo).getReg()) {
+ printOperand(MI, OpNo, O);
+ O << ':';
+ }
+}
+
+void X86InstPrinterCommon::printInstFlags(const MCInst *MI, raw_ostream &O) {
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+ uint64_t TSFlags = Desc.TSFlags;
+ unsigned Flags = MI->getFlags();
+
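+  // A prefix is printed when it is requested either by the static instruction
+  // description (TSFlags) or by per-instruction flags attached to this MCInst.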
+ if ((TSFlags & X86II::LOCK) || (Flags & X86::IP_HAS_LOCK))
+ O << "\tlock\t";
+
+ if ((TSFlags & X86II::NOTRACK) || (Flags & X86::IP_HAS_NOTRACK))
+ O << "\tnotrack\t";
+
+ if (Flags & X86::IP_HAS_REPEAT_NE)
+ O << "\trepne\t";
+ else if (Flags & X86::IP_HAS_REPEAT)
+ O << "\trep\t";
+}
diff --git a/lib/Target/X86/InstPrinter/X86InstPrinterCommon.h b/lib/Target/X86/InstPrinter/X86InstPrinterCommon.h
new file mode 100644
index 000000000000..f2875e71f22c
--- /dev/null
+++ b/lib/Target/X86/InstPrinter/X86InstPrinterCommon.h
@@ -0,0 +1,38 @@
+//===-- X86InstPrinterCommon.h - X86 assembly instruction printing --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes common code for rendering MCInst instances as AT&T-style
+// and Intel-style assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTPRINTERCOMMON_H
+#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTPRINTERCOMMON_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+class X86InstPrinterCommon : public MCInstPrinter {
+public:
+ using MCInstPrinter::MCInstPrinter;
+
+ virtual void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) = 0;
+ void printSSEAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
+ void printXOPCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
+ void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &O);
+ void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+protected:
+ void printInstFlags(const MCInst *MI, raw_ostream &O);
+ void printOptionalSegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTPRINTERCOMMON_H
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
index 1f02600a7982..044b71564152 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -19,6 +19,7 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include <cassert>
@@ -37,116 +38,21 @@ void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
StringRef Annot,
const MCSubtargetInfo &STI) {
- const MCInstrDesc &Desc = MII.get(MI->getOpcode());
- uint64_t TSFlags = Desc.TSFlags;
+ printInstFlags(MI, OS);
- if (TSFlags & X86II::LOCK)
- OS << "\tlock\t";
-
- unsigned Flags = MI->getFlags();
- if (Flags & X86::IP_HAS_REPEAT_NE)
- OS << "\trepne\t";
- else if (Flags & X86::IP_HAS_REPEAT)
- OS << "\trep\t";
-
- printInstruction(MI, OS);
+ // In 16-bit mode, print data16 as data32.
+ if (MI->getOpcode() == X86::DATA16_PREFIX &&
+ STI.getFeatureBits()[X86::Mode16Bit]) {
+ OS << "\tdata32";
+ } else
+ printInstruction(MI, OS);
// Next always print the annotation.
printAnnotation(OS, Annot);
// If verbose assembly is enabled, we can print some informative comments.
if (CommentStream)
- EmitAnyX86InstComments(MI, *CommentStream, getRegisterName);
-}
-
-void X86IntelInstPrinter::printSSEAVXCC(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- int64_t Imm = MI->getOperand(Op).getImm();
- switch (Imm) {
- default: llvm_unreachable("Invalid avxcc argument!");
- case 0: O << "eq"; break;
- case 1: O << "lt"; break;
- case 2: O << "le"; break;
- case 3: O << "unord"; break;
- case 4: O << "neq"; break;
- case 5: O << "nlt"; break;
- case 6: O << "nle"; break;
- case 7: O << "ord"; break;
- case 8: O << "eq_uq"; break;
- case 9: O << "nge"; break;
- case 0xa: O << "ngt"; break;
- case 0xb: O << "false"; break;
- case 0xc: O << "neq_oq"; break;
- case 0xd: O << "ge"; break;
- case 0xe: O << "gt"; break;
- case 0xf: O << "true"; break;
- case 0x10: O << "eq_os"; break;
- case 0x11: O << "lt_oq"; break;
- case 0x12: O << "le_oq"; break;
- case 0x13: O << "unord_s"; break;
- case 0x14: O << "neq_us"; break;
- case 0x15: O << "nlt_uq"; break;
- case 0x16: O << "nle_uq"; break;
- case 0x17: O << "ord_s"; break;
- case 0x18: O << "eq_us"; break;
- case 0x19: O << "nge_uq"; break;
- case 0x1a: O << "ngt_uq"; break;
- case 0x1b: O << "false_os"; break;
- case 0x1c: O << "neq_os"; break;
- case 0x1d: O << "ge_oq"; break;
- case 0x1e: O << "gt_oq"; break;
- case 0x1f: O << "true_us"; break;
- }
-}
-
-void X86IntelInstPrinter::printXOPCC(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- int64_t Imm = MI->getOperand(Op).getImm();
- switch (Imm) {
- default: llvm_unreachable("Invalid xopcc argument!");
- case 0: O << "lt"; break;
- case 1: O << "le"; break;
- case 2: O << "gt"; break;
- case 3: O << "ge"; break;
- case 4: O << "eq"; break;
- case 5: O << "neq"; break;
- case 6: O << "false"; break;
- case 7: O << "true"; break;
- }
-}
-
-void X86IntelInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- int64_t Imm = MI->getOperand(Op).getImm() & 0x3;
- switch (Imm) {
- case 0: O << "{rn-sae}"; break;
- case 1: O << "{rd-sae}"; break;
- case 2: O << "{ru-sae}"; break;
- case 3: O << "{rz-sae}"; break;
- }
-}
-
-/// printPCRelImm - This is used to print an immediate value that ends up
-/// being encoded as a pc-relative value.
-void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- const MCOperand &Op = MI->getOperand(OpNo);
- if (Op.isImm())
- O << formatImm(Op.getImm());
- else {
- assert(Op.isExpr() && "unknown pcrel immediate operand");
- // If a symbolic branch target was added as a constant expression then print
- // that address in hex.
- const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
- int64_t Address;
- if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) {
- O << formatHex((uint64_t)Address);
- }
- else {
- // Otherwise, just print the expression.
- Op.getExpr()->print(O, &MAI);
- }
- }
+ EmitAnyX86InstComments(MI, *CommentStream, MII);
}
void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
@@ -169,13 +75,9 @@ void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
const MCOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
const MCOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp);
- const MCOperand &SegReg = MI->getOperand(Op+X86::AddrSegmentReg);
// If this has a segment register, print it.
- if (SegReg.getReg()) {
- printOperand(MI, Op+X86::AddrSegmentReg, O);
- O << ':';
- }
+ printOptionalSegReg(MI, Op + X86::AddrSegmentReg, O);
O << '[';
@@ -217,13 +119,8 @@ void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
void X86IntelInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op,
raw_ostream &O) {
- const MCOperand &SegReg = MI->getOperand(Op+1);
-
// If this has a segment register, print it.
- if (SegReg.getReg()) {
- printOperand(MI, Op+1, O);
- O << ':';
- }
+ printOptionalSegReg(MI, Op + 1, O);
O << '[';
printOperand(MI, Op, O);
O << ']';
@@ -240,13 +137,9 @@ void X86IntelInstPrinter::printDstIdx(const MCInst *MI, unsigned Op,
void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
raw_ostream &O) {
const MCOperand &DispSpec = MI->getOperand(Op);
- const MCOperand &SegReg = MI->getOperand(Op+1);
// If this has a segment register, print it.
- if (SegReg.getReg()) {
- printOperand(MI, Op+1, O);
- O << ':';
- }
+ printOptionalSegReg(MI, Op + 1, O);
O << '[';
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
index ace31186a054..3b34a8052bec 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
@@ -14,16 +14,16 @@
#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H
#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H
-#include "llvm/MC/MCInstPrinter.h"
+#include "X86InstPrinterCommon.h"
#include "llvm/Support/raw_ostream.h"
namespace llvm {
-class X86IntelInstPrinter final : public MCInstPrinter {
+class X86IntelInstPrinter final : public X86InstPrinterCommon {
public:
X86IntelInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
const MCRegisterInfo &MRI)
- : MCInstPrinter(MAI, MII, MRI) {}
+ : X86InstPrinterCommon(MAI, MII, MRI) {}
void printRegName(raw_ostream &OS, unsigned RegNo) const override;
void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot,
@@ -33,15 +33,11 @@ public:
void printInstruction(const MCInst *MI, raw_ostream &O);
static const char *getRegisterName(unsigned RegNo);
- void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) override;
void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &O);
- void printSSEAVXCC(const MCInst *MI, unsigned Op, raw_ostream &O);
- void printXOPCC(const MCInst *MI, unsigned Op, raw_ostream &O);
- void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printSrcIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printDstIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &OS);
void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &O);
void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
@@ -49,7 +45,6 @@ public:
}
void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "opaque ptr ";
printMemReference(MI, OpNo, O);
}
@@ -90,7 +85,7 @@ public:
printMemReference(MI, OpNo, O);
}
void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "xword ptr ";
+ O << "tbyte ptr ";
printMemReference(MI, OpNo, O);
}
void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 239db2a74b24..0e4c4398e49d 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -46,6 +46,7 @@ static unsigned getFixupKindLog2Size(unsigned Kind) {
case X86::reloc_signed_4byte:
case X86::reloc_signed_4byte_relax:
case X86::reloc_global_offset_table:
+ case X86::reloc_branch_4byte_pcrel:
case FK_SecRel_4:
case FK_Data_4:
return 2;
@@ -67,19 +68,10 @@ public:
};
class X86AsmBackend : public MCAsmBackend {
- const StringRef CPU;
- bool HasNopl;
- const uint64_t MaxNopLength;
+ const MCSubtargetInfo &STI;
public:
- X86AsmBackend(const Target &T, StringRef CPU)
- : MCAsmBackend(), CPU(CPU),
- MaxNopLength((CPU == "slm" || CPU == "silvermont") ? 7 : 15) {
- HasNopl = CPU != "generic" && CPU != "i386" && CPU != "i486" &&
- CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" &&
- CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" &&
- CPU != "geode" && CPU != "winchip-c6" && CPU != "winchip2" &&
- CPU != "c3" && CPU != "c3-2" && CPU != "lakemont" && CPU != "";
- }
+ X86AsmBackend(const Target &T, const MCSubtargetInfo &STI)
+ : MCAsmBackend(support::little), STI(STI) {}
unsigned getNumFixupKinds() const override {
return X86::NumTargetFixupKinds;
@@ -95,6 +87,7 @@ public:
{"reloc_signed_4byte_relax", 0, 32, 0},
{"reloc_global_offset_table", 0, 32, 0},
{"reloc_global_offset_table8", 0, 64, 0},
+ {"reloc_branch_4byte_pcrel", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
};
if (Kind < FirstTargetFixupKind)
@@ -102,12 +95,14 @@ public:
assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
"Invalid kind!");
+ assert(Infos[Kind - FirstTargetFixupKind].Name && "Empty fixup name!");
return Infos[Kind - FirstTargetFixupKind];
}
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsResolved) const override {
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const override {
unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind());
assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!");
@@ -123,7 +118,8 @@ public:
Data[Fixup.getOffset() + i] = uint8_t(Value >> (i * 8));
}
- bool mayNeedRelaxation(const MCInst &Inst) const override;
+ bool mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const override;
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
const MCRelaxableFragment *DF,
@@ -132,7 +128,7 @@ public:
void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
MCInst &Res) const override;
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
};
} // end anonymous namespace
@@ -270,7 +266,8 @@ static unsigned getRelaxedOpcode(const MCInst &Inst, bool is16BitMode) {
return getRelaxedOpcodeBranch(Inst, is16BitMode);
}
-bool X86AsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
+bool X86AsmBackend::mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const {
// Branches can always be relaxed in either mode.
if (getRelaxedOpcodeBranch(Inst, false) != Inst.getOpcode())
return true;
@@ -318,52 +315,61 @@ void X86AsmBackend::relaxInstruction(const MCInst &Inst,
Res.setOpcode(RelaxedOp);
}
-/// \brief Write a sequence of optimal nops to the output, covering \p Count
+/// Write a sequence of optimal nops to the output, covering \p Count
/// bytes.
/// \return - true on success, false on failure
-bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
- static const uint8_t Nops[10][10] = {
+bool X86AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
+ static const char Nops[10][11] = {
// nop
- {0x90},
+ "\x90",
// xchg %ax,%ax
- {0x66, 0x90},
+ "\x66\x90",
// nopl (%[re]ax)
- {0x0f, 0x1f, 0x00},
+ "\x0f\x1f\x00",
// nopl 0(%[re]ax)
- {0x0f, 0x1f, 0x40, 0x00},
+ "\x0f\x1f\x40\x00",
// nopl 0(%[re]ax,%[re]ax,1)
- {0x0f, 0x1f, 0x44, 0x00, 0x00},
+ "\x0f\x1f\x44\x00\x00",
// nopw 0(%[re]ax,%[re]ax,1)
- {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
+ "\x66\x0f\x1f\x44\x00\x00",
// nopl 0L(%[re]ax)
- {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00},
+ "\x0f\x1f\x80\x00\x00\x00\x00",
// nopl 0L(%[re]ax,%[re]ax,1)
- {0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+ "\x0f\x1f\x84\x00\x00\x00\x00\x00",
// nopw 0L(%[re]ax,%[re]ax,1)
- {0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+ "\x66\x0f\x1f\x84\x00\x00\x00\x00\x00",
// nopw %cs:0L(%[re]ax,%[re]ax,1)
- {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+ "\x66\x2e\x0f\x1f\x84\x00\x00\x00\x00\x00",
};
// This CPU doesn't support long nops. If needed add more.
- // FIXME: Can we get this from the subtarget somehow?
  // FIXME: We could generate something better than plain 0x90.
- if (!HasNopl) {
+ if (!STI.getFeatureBits()[X86::FeatureNOPL]) {
for (uint64_t i = 0; i < Count; ++i)
- OW->write8(0x90);
+ OS << '\x90';
return true;
}
- // 15 is the longest single nop instruction. Emit as many 15-byte nops as
- // needed, then emit a nop of the remaining length.
+  // 15 bytes is the longest single NOP instruction, but 10 bytes is
+  // commonly the longest that can be efficiently decoded.
+ uint64_t MaxNopLength = 10;
+ if (STI.getFeatureBits()[X86::ProcIntelSLM])
+ MaxNopLength = 7;
+ else if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP])
+ MaxNopLength = 15;
+ else if (STI.getFeatureBits()[X86::FeatureFast11ByteNOP])
+ MaxNopLength = 11;
+
+ // Emit as many MaxNopLength NOPs as needed, then emit a NOP of the remaining
+ // length.
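+  // For example, Count == 25 with MaxNopLength == 10 emits two 10-byte NOPs
+  // followed by a single 5-byte NOP.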
do {
const uint8_t ThisNopLength = (uint8_t) std::min(Count, MaxNopLength);
const uint8_t Prefixes = ThisNopLength <= 10 ? 0 : ThisNopLength - 10;
for (uint8_t i = 0; i < Prefixes; i++)
- OW->write8(0x66);
+ OS << '\x66';
const uint8_t Rest = ThisNopLength - Prefixes;
- for (uint8_t i = 0; i < Rest; i++)
- OW->write8(Nops[Rest - 1][i]);
+ if (Rest != 0)
+ OS.write(Nops[Rest - 1], Rest);
Count -= ThisNopLength;
} while (Count != 0);
@@ -377,53 +383,57 @@ namespace {
class ELFX86AsmBackend : public X86AsmBackend {
public:
uint8_t OSABI;
- ELFX86AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
- : X86AsmBackend(T, CPU), OSABI(OSABI) {}
+ ELFX86AsmBackend(const Target &T, uint8_t OSABI, const MCSubtargetInfo &STI)
+ : X86AsmBackend(T, STI), OSABI(OSABI) {}
};
class ELFX86_32AsmBackend : public ELFX86AsmBackend {
public:
- ELFX86_32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
- : ELFX86AsmBackend(T, OSABI, CPU) {}
+ ELFX86_32AsmBackend(const Target &T, uint8_t OSABI,
+ const MCSubtargetInfo &STI)
+ : ELFX86AsmBackend(T, OSABI, STI) {}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, ELF::EM_386);
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createX86ELFObjectWriter(/*IsELF64*/ false, OSABI, ELF::EM_386);
}
};
class ELFX86_X32AsmBackend : public ELFX86AsmBackend {
public:
- ELFX86_X32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
- : ELFX86AsmBackend(T, OSABI, CPU) {}
+ ELFX86_X32AsmBackend(const Target &T, uint8_t OSABI,
+ const MCSubtargetInfo &STI)
+ : ELFX86AsmBackend(T, OSABI, STI) {}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI,
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createX86ELFObjectWriter(/*IsELF64*/ false, OSABI,
ELF::EM_X86_64);
}
};
class ELFX86_IAMCUAsmBackend : public ELFX86AsmBackend {
public:
- ELFX86_IAMCUAsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
- : ELFX86AsmBackend(T, OSABI, CPU) {}
+ ELFX86_IAMCUAsmBackend(const Target &T, uint8_t OSABI,
+ const MCSubtargetInfo &STI)
+ : ELFX86AsmBackend(T, OSABI, STI) {}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI,
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createX86ELFObjectWriter(/*IsELF64*/ false, OSABI,
ELF::EM_IAMCU);
}
};
class ELFX86_64AsmBackend : public ELFX86AsmBackend {
public:
- ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
- : ELFX86AsmBackend(T, OSABI, CPU) {}
+ ELFX86_64AsmBackend(const Target &T, uint8_t OSABI,
+ const MCSubtargetInfo &STI)
+ : ELFX86AsmBackend(T, OSABI, STI) {}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createX86ELFObjectWriter(OS, /*IsELF64*/ true, OSABI, ELF::EM_X86_64);
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createX86ELFObjectWriter(/*IsELF64*/ true, OSABI, ELF::EM_X86_64);
}
};
@@ -431,8 +441,9 @@ class WindowsX86AsmBackend : public X86AsmBackend {
bool Is64Bit;
public:
- WindowsX86AsmBackend(const Target &T, bool is64Bit, StringRef CPU)
- : X86AsmBackend(T, CPU)
+ WindowsX86AsmBackend(const Target &T, bool is64Bit,
+ const MCSubtargetInfo &STI)
+ : X86AsmBackend(T, STI)
, Is64Bit(is64Bit) {
}
@@ -444,9 +455,9 @@ public:
.Default(MCAsmBackend::getFixupKind(Name));
}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createX86WinCOFFObjectWriter(OS, Is64Bit);
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createX86WinCOFFObjectWriter(Is64Bit);
}
};
@@ -479,7 +490,7 @@ namespace CU {
class DarwinX86AsmBackend : public X86AsmBackend {
const MCRegisterInfo &MRI;
- /// \brief Number of registers that can be saved in a compact unwind encoding.
+ /// Number of registers that can be saved in a compact unwind encoding.
enum { CU_NUM_SAVED_REGS = 6 };
mutable unsigned SavedRegs[CU_NUM_SAVED_REGS];
@@ -489,7 +500,7 @@ class DarwinX86AsmBackend : public X86AsmBackend {
unsigned MoveInstrSize; ///< Size of a "move" instruction.
unsigned StackDivide; ///< Amount to adjust stack size by.
protected:
- /// \brief Size of a "push" instruction for the given register.
+ /// Size of a "push" instruction for the given register.
unsigned PushInstrSize(unsigned Reg) const {
switch (Reg) {
case X86::EBX:
@@ -510,7 +521,7 @@ protected:
return 1;
}
- /// \brief Implementation of algorithm to generate the compact unwind encoding
+ /// Implementation of algorithm to generate the compact unwind encoding
/// for the CFI instructions.
uint32_t
generateCompactUnwindEncodingImpl(ArrayRef<MCCFIInstruction> Instrs) const {
@@ -655,8 +666,7 @@ protected:
// instruction.
CompactUnwindEncoding |= (SubtractInstrIdx & 0xFF) << 16;
- // Encode any extra stack stack adjustments (done via push
- // instructions).
+ // Encode any extra stack adjustments (done via push instructions).
CompactUnwindEncoding |= (StackAdjust & 0x7) << 13;
}
@@ -678,7 +688,7 @@ protected:
}
private:
- /// \brief Get the compact unwind number for a given register. The number
+ /// Get the compact unwind number for a given register. The number
/// corresponds to the enum lists in compact_unwind_encoding.h.
int getCompactUnwindRegNum(unsigned Reg) const {
static const MCPhysReg CU32BitRegs[7] = {
@@ -695,7 +705,7 @@ private:
return -1;
}
- /// \brief Return the registers encoded for a compact encoding with a frame
+ /// Return the registers encoded for a compact encoding with a frame
/// pointer.
uint32_t encodeCompactUnwindRegistersWithFrame() const {
// Encode the registers in the order they were saved --- 3-bits per
@@ -719,7 +729,7 @@ private:
return RegEnc;
}
- /// \brief Create the permutation encoding used with frameless stacks. It is
+ /// Create the permutation encoding used with frameless stacks. It is
/// passed the number of registers to be saved and an array of the registers
/// saved.
uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned RegCount) const {
@@ -790,9 +800,9 @@ private:
}
public:
- DarwinX86AsmBackend(const Target &T, const MCRegisterInfo &MRI, StringRef CPU,
- bool Is64Bit)
- : X86AsmBackend(T, CPU), MRI(MRI), Is64Bit(Is64Bit) {
+ DarwinX86AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &STI, bool Is64Bit)
+ : X86AsmBackend(T, STI), MRI(MRI), Is64Bit(Is64Bit) {
memset(SavedRegs, 0, sizeof(SavedRegs));
OffsetSize = Is64Bit ? 8 : 4;
MoveInstrSize = Is64Bit ? 3 : 2;
@@ -803,17 +813,17 @@ public:
class DarwinX86_32AsmBackend : public DarwinX86AsmBackend {
public:
DarwinX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI,
- StringRef CPU)
- : DarwinX86AsmBackend(T, MRI, CPU, false) {}
+ const MCSubtargetInfo &STI)
+ : DarwinX86AsmBackend(T, MRI, STI, false) {}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createX86MachObjectWriter(OS, /*Is64Bit=*/false,
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createX86MachObjectWriter(/*Is64Bit=*/false,
MachO::CPU_TYPE_I386,
MachO::CPU_SUBTYPE_I386_ALL);
}
- /// \brief Generate the compact unwind encoding for the CFI instructions.
+ /// Generate the compact unwind encoding for the CFI instructions.
uint32_t generateCompactUnwindEncoding(
ArrayRef<MCCFIInstruction> Instrs) const override {
return generateCompactUnwindEncodingImpl(Instrs);
@@ -824,16 +834,16 @@ class DarwinX86_64AsmBackend : public DarwinX86AsmBackend {
const MachO::CPUSubTypeX86 Subtype;
public:
DarwinX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
- StringRef CPU, MachO::CPUSubTypeX86 st)
- : DarwinX86AsmBackend(T, MRI, CPU, true), Subtype(st) {}
+ const MCSubtargetInfo &STI, MachO::CPUSubTypeX86 st)
+ : DarwinX86AsmBackend(T, MRI, STI, true), Subtype(st) {}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createX86MachObjectWriter(OS, /*Is64Bit=*/true,
- MachO::CPU_TYPE_X86_64, Subtype);
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createX86MachObjectWriter(/*Is64Bit=*/true, MachO::CPU_TYPE_X86_64,
+ Subtype);
}
- /// \brief Generate the compact unwind encoding for the CFI instructions.
+ /// Generate the compact unwind encoding for the CFI instructions.
uint32_t generateCompactUnwindEncoding(
ArrayRef<MCCFIInstruction> Instrs) const override {
return generateCompactUnwindEncodingImpl(Instrs);
@@ -843,43 +853,43 @@ public:
} // end anonymous namespace
MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
- const Triple &TheTriple,
- StringRef CPU,
const MCTargetOptions &Options) {
+ const Triple &TheTriple = STI.getTargetTriple();
if (TheTriple.isOSBinFormatMachO())
- return new DarwinX86_32AsmBackend(T, MRI, CPU);
+ return new DarwinX86_32AsmBackend(T, MRI, STI);
if (TheTriple.isOSWindows() && TheTriple.isOSBinFormatCOFF())
- return new WindowsX86AsmBackend(T, false, CPU);
+ return new WindowsX86AsmBackend(T, false, STI);
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
if (TheTriple.isOSIAMCU())
- return new ELFX86_IAMCUAsmBackend(T, OSABI, CPU);
+ return new ELFX86_IAMCUAsmBackend(T, OSABI, STI);
- return new ELFX86_32AsmBackend(T, OSABI, CPU);
+ return new ELFX86_32AsmBackend(T, OSABI, STI);
}
MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
- const Triple &TheTriple,
- StringRef CPU,
const MCTargetOptions &Options) {
+ const Triple &TheTriple = STI.getTargetTriple();
if (TheTriple.isOSBinFormatMachO()) {
MachO::CPUSubTypeX86 CS =
StringSwitch<MachO::CPUSubTypeX86>(TheTriple.getArchName())
.Case("x86_64h", MachO::CPU_SUBTYPE_X86_64_H)
.Default(MachO::CPU_SUBTYPE_X86_64_ALL);
- return new DarwinX86_64AsmBackend(T, MRI, CPU, CS);
+ return new DarwinX86_64AsmBackend(T, MRI, STI, CS);
}
if (TheTriple.isOSWindows() && TheTriple.isOSBinFormatCOFF())
- return new WindowsX86AsmBackend(T, true, CPU);
+ return new WindowsX86AsmBackend(T, true, STI);
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
if (TheTriple.getEnvironment() == Triple::GNUX32)
- return new ELFX86_X32AsmBackend(T, OSABI, CPU);
- return new ELFX86_64AsmBackend(T, OSABI, CPU);
+ return new ELFX86_X32AsmBackend(T, OSABI, STI);
+ return new ELFX86_64AsmBackend(T, OSABI, STI);
}
diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index 07cc488d047e..497e29fe628e 100644
--- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -60,8 +60,9 @@ namespace X86 {
IP_HAS_REPEAT_NE = 4,
IP_HAS_REPEAT = 8,
IP_HAS_LOCK = 16,
- NO_SCHED_INFO = 32 // Don't add sched comment to the current instr because
- // it was already added
+ NO_SCHED_INFO = 32, // Don't add sched comment to the current instr because
+ // it was already added
+ IP_HAS_NOTRACK = 64
};
} // end namespace X86;
@@ -368,15 +369,13 @@ namespace X86II {
// OpSize - OpSizeFixed implies instruction never needs a 0x66 prefix.
// OpSize16 means this is a 16-bit instruction and needs 0x66 prefix in
// 32-bit mode. OpSize32 means this is a 32-bit instruction needs a 0x66
- // prefix in 16-bit mode. OpSizeIgnore means that the instruction may
- // take a optional 0x66 byte but should not emit with one.
+ // prefix in 16-bit mode.
OpSizeShift = 7,
OpSizeMask = 0x3 << OpSizeShift,
OpSizeFixed = 0 << OpSizeShift,
OpSize16 = 1 << OpSizeShift,
OpSize32 = 2 << OpSizeShift,
- OpSizeIgnore = 3 << OpSizeShift,
// AsSize - AdSizeX implies this instruction determines its need of 0x67
// prefix from a normal ModRM memory operand. The other types indicate that
@@ -385,7 +384,7 @@ namespace X86II {
AdSizeShift = OpSizeShift + 2,
AdSizeMask = 0x3 << AdSizeShift,
- AdSizeX = 1 << AdSizeShift,
+ AdSizeX = 0 << AdSizeShift,
AdSize16 = 1 << AdSizeShift,
AdSize32 = 2 << AdSizeShift,
AdSize64 = 3 << AdSizeShift,
@@ -396,21 +395,21 @@ namespace X86II {
// no prefix.
//
OpPrefixShift = AdSizeShift + 2,
- OpPrefixMask = 0x7 << OpPrefixShift,
+ OpPrefixMask = 0x3 << OpPrefixShift,
- // PS, PD - Prefix code for packed single and double precision vector
- // floating point operations performed in the SSE registers.
- PS = 1 << OpPrefixShift, PD = 2 << OpPrefixShift,
+ // PD - Prefix code for packed double precision vector floating point
+ // operations performed in the SSE registers.
+ PD = 1 << OpPrefixShift,
// XS, XD - These prefix codes are for single and double precision scalar
// floating point operations performed in the SSE registers.
- XS = 3 << OpPrefixShift, XD = 4 << OpPrefixShift,
+ XS = 2 << OpPrefixShift, XD = 3 << OpPrefixShift,
//===------------------------------------------------------------------===//
// OpMap - This field determines which opcode map this instruction
// belongs to. i.e. one-byte, two-byte, 0x0f 0x38, 0x0f 0x3a, etc.
//
- OpMapShift = OpPrefixShift + 3,
+ OpMapShift = OpPrefixShift + 2,
OpMapMask = 0x7 << OpMapShift,
// OB - OneByte - Set if this instruction has a one byte opcode.
@@ -432,6 +431,14 @@ namespace X86II {
// XOPA - Prefix to encode 0xA in VEX.MMMM of XOP instructions.
XOPA = 6 << OpMapShift,
+ /// ThreeDNow - This indicates that the instruction uses the
+ /// wacky 0x0F 0x0F prefix for 3DNow! instructions. The manual documents
+ /// this as having a 0x0F prefix with a 0x0F opcode, and each instruction
+ /// storing a classifier in the imm8 field. To simplify our implementation,
+ /// we handle this by storeing the classifier in the opcode field and using
+  /// we handle this by storing the classifier in the opcode field and using
+ ThreeDNow = 7 << OpMapShift,
+
//===------------------------------------------------------------------===//
// REX_W - REX prefixes are instruction prefixes used in 64-bit mode.
// They are used to specify GPRs and SSE registers, 64-bit operand size,
@@ -561,24 +568,19 @@ namespace X86II {
CD8_Scale_Shift = EVEX_BShift + 1,
CD8_Scale_Mask = 127ULL << CD8_Scale_Shift,
- /// Has3DNow0F0FOpcode - This flag indicates that the instruction uses the
- /// wacky 0x0F 0x0F prefix for 3DNow! instructions. The manual documents
- /// this as having a 0x0F prefix with a 0x0F opcode, and each instruction
- /// storing a classifier in the imm8 field. To simplify our implementation,
- /// we handle this by storeing the classifier in the opcode field and using
- /// this flag to indicate that the encoder should do the wacky 3DNow! thing.
- Has3DNow0F0FOpcodeShift = CD8_Scale_Shift + 7,
- Has3DNow0F0FOpcode = 1ULL << Has3DNow0F0FOpcodeShift,
-
/// Explicitly specified rounding control
- EVEX_RCShift = Has3DNow0F0FOpcodeShift + 1,
- EVEX_RC = 1ULL << EVEX_RCShift
+ EVEX_RCShift = CD8_Scale_Shift + 7,
+ EVEX_RC = 1ULL << EVEX_RCShift,
+
+ // NOTRACK prefix
+ NoTrackShift = EVEX_RCShift + 1,
+ NOTRACK = 1ULL << NoTrackShift
};
// getBaseOpcodeFor - This function returns the "base" X86 opcode for the
// specified machine instruction.
//
- inline unsigned char getBaseOpcodeFor(uint64_t TSFlags) {
+ inline uint8_t getBaseOpcodeFor(uint64_t TSFlags) {
return TSFlags >> X86II::OpcodeShift;
}
@@ -641,30 +643,44 @@ namespace X86II {
}
}
- /// getOperandBias - compute any additional adjustment needed to
- /// the offset to the start of the memory operand
- /// in this instruction.
- /// If this is a two-address instruction,skip one of the register operands.
- /// FIXME: This should be handled during MCInst lowering.
- inline unsigned getOperandBias(const MCInstrDesc& Desc)
- {
+ /// getOperandBias - compute whether all of the def operands are repeated
+ /// in the uses and therefore should be skipped.
+ /// This determines the start of the unique operand list. We need to determine
+ /// if all of the defs have a corresponding tied operand in the uses.
+ /// Unfortunately, the tied operand information is encoded in the uses not
+ /// the defs so we have to use some heuristics to find which operands to
+ /// query.
+ inline unsigned getOperandBias(const MCInstrDesc& Desc) {
+ unsigned NumDefs = Desc.getNumDefs();
unsigned NumOps = Desc.getNumOperands();
- if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0)
- return 1;
- if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
- Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1)
- // Special case for AVX-512 GATHER with 2 TIED_TO operands
- // Skip the first 2 operands: dst, mask_wb
- return 2;
- if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
- Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1)
- // Special case for GATHER with 2 TIED_TO operands
- // Skip the first 2 operands: dst, mask_wb
- return 2;
- if (NumOps > 2 && Desc.getOperandConstraint(NumOps - 2, MCOI::TIED_TO) == 0)
- // SCATTER
- return 1;
- return 0;
+ switch (NumDefs) {
+ default: llvm_unreachable("Unexpected number of defs");
+ case 0:
+ return 0;
+ case 1:
+ // Common two addr case.
+ if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0)
+ return 1;
+ // Check for AVX-512 scatter which has a TIED_TO in the second to last
+ // operand.
+ if (NumOps == 8 &&
+ Desc.getOperandConstraint(6, MCOI::TIED_TO) == 0)
+ return 1;
+ return 0;
+ case 2:
+ // XCHG/XADD have two destinations and two sources.
+ if (NumOps >= 4 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
+ Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1)
+ return 2;
+ // Check for gather. AVX-512 has the second tied operand early. AVX2
+ // has it as the last op.
+ if (NumOps == 9 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
+ (Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1 ||
+ Desc.getOperandConstraint(8, MCOI::TIED_TO) == 1) &&
+ "Instruction with 2 defs isn't gather?")
+ return 2;
+ return 0;
+ }
}
/// getMemoryOperandNo - The function returns the MCInst operand # for the
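
The rewritten getOperandBias keys the operand-skip count off the number of defs and their tied-operand constraints rather than a list of pattern checks. A short sketch of how a client consumes it; the helper name and its MCInstrInfo/MCInst arguments are illustrative, though the same call is added to the code emitter later in this patch.

#include "MCTargetDesc/X86BaseInfo.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"

// Sketch: skip tied destination operands before walking the "real" operand
// list, as the emitter does for every instruction form.
unsigned firstUniqueOperand(const llvm::MCInstrInfo &MCII,
                            const llvm::MCInst &MI) {
  const llvm::MCInstrDesc &Desc = MCII.get(MI.getOpcode());
  // For a two-address ALU op (one def tied to operand 1) this returns 1;
  // for an AVX2/AVX-512 gather (two defs: dst and mask writeback) it
  // returns 2; for a plain store or branch it returns 0.
  return llvm::X86II::getOperandBias(Desc);
}
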
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index 4cdbae4d0d96..b724a89f81d2 100644
--- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -75,6 +75,9 @@ static X86_64RelType getType64(unsigned Kind,
case X86::reloc_riprel_4byte_relax_rex:
case X86::reloc_riprel_4byte_movq_load:
return RT64_32;
+ case X86::reloc_branch_4byte_pcrel:
+ Modifier = MCSymbolRefExpr::VK_PLT;
+ return RT64_32;
case FK_PCRel_2:
case FK_Data_2:
return RT64_16;
@@ -298,9 +301,7 @@ unsigned X86ELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
return getRelocType32(Ctx, Modifier, getType32(Type), IsPCRel, Kind);
}
-std::unique_ptr<MCObjectWriter>
-llvm::createX86ELFObjectWriter(raw_pwrite_stream &OS, bool IsELF64,
- uint8_t OSABI, uint16_t EMachine) {
- auto MOTW = llvm::make_unique<X86ELFObjectWriter>(IsELF64, OSABI, EMachine);
- return createELFObjectWriter(std::move(MOTW), OS, /*IsLittleEndian=*/true);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createX86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine) {
+ return llvm::make_unique<X86ELFObjectWriter>(IsELF64, OSABI, EMachine);
}
diff --git a/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
index dfdc9ec29aec..3c04b13e002e 100644
--- a/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
+++ b/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
@@ -30,6 +30,7 @@ enum Fixups {
// of the instruction. Used only
// for _GLOBAL_OFFSET_TABLE_.
reloc_global_offset_table8, // 64-bit variant.
+ reloc_branch_4byte_pcrel, // 32-bit PC relative branch.
// Marker
LastTargetFixupKind,
NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index a7059c6914df..f5371db9e77a 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -152,6 +152,8 @@ public:
uint8_t DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
int MemOperand, const MCInstrDesc &Desc) const;
+
+ bool isPCRel32Branch(const MCInst &MI) const;
};
} // end anonymous namespace
@@ -217,6 +219,8 @@ static bool Is32BitMemOperand(const MCInst &MI, unsigned Op) {
assert(IndexReg.getReg() == 0 && "Invalid eip-based address.");
return true;
}
+ if (IndexReg.getReg() == X86::EIZ)
+ return true;
return false;
}
@@ -276,6 +280,22 @@ static bool HasSecRelSymbolRef(const MCExpr *Expr) {
return false;
}
+bool X86MCCodeEmitter::isPCRel32Branch(const MCInst &MI) const {
+ unsigned Opcode = MI.getOpcode();
+ const MCInstrDesc &Desc = MCII.get(Opcode);
+ if ((Opcode != X86::CALL64pcrel32 && Opcode != X86::JMP_4) ||
+ getImmFixupKind(Desc.TSFlags) != FK_PCRel_4)
+ return false;
+
+ unsigned CurOp = X86II::getOperandBias(Desc);
+ const MCOperand &Op = MI.getOperand(CurOp);
+ if (!Op.isExpr())
+ return false;
+
+ const MCSymbolRefExpr *Ref = dyn_cast<MCSymbolRefExpr>(Op.getExpr());
+ return Ref && Ref->getKind() == MCSymbolRefExpr::VK_None;
+}
+
void X86MCCodeEmitter::
EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size,
MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS,
@@ -331,8 +351,15 @@ EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size,
FixupKind == MCFixupKind(X86::reloc_riprel_4byte) ||
FixupKind == MCFixupKind(X86::reloc_riprel_4byte_movq_load) ||
FixupKind == MCFixupKind(X86::reloc_riprel_4byte_relax) ||
- FixupKind == MCFixupKind(X86::reloc_riprel_4byte_relax_rex))
+ FixupKind == MCFixupKind(X86::reloc_riprel_4byte_relax_rex) ||
+ FixupKind == MCFixupKind(X86::reloc_branch_4byte_pcrel)) {
ImmOffset -= 4;
+ // If this is a pc-relative load off _GLOBAL_OFFSET_TABLE_:
+ // leaq _GLOBAL_OFFSET_TABLE_(%rip), %r15
+ // this needs to be a GOTPC32 relocation.
+ if (StartsWithGlobalOffsetTable(Expr) != GOT_None)
+ FixupKind = MCFixupKind(X86::reloc_global_offset_table);
+ }
if (FixupKind == FK_PCRel_2)
ImmOffset -= 2;
if (FixupKind == FK_PCRel_1)
@@ -380,6 +407,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
return X86::reloc_riprel_4byte_movq_load;
case X86::CALL64m:
case X86::JMP64m:
+ case X86::TAILJMPm64:
case X86::TEST64mr:
case X86::ADC64rm:
case X86::ADD64rm:
@@ -396,10 +424,14 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
// rip-relative addressing is actually relative to the *next* instruction.
// Since an immediate can follow the mod/rm byte for an instruction, this
- // means that we need to bias the immediate field of the instruction with
- // the size of the immediate field. If we have this case, add it into the
+ // means that we need to bias the displacement field of the instruction with
+ // the size of the immediate field. If we have this case, add it into the
// expression to emit.
- int ImmSize = X86II::hasImm(TSFlags) ? X86II::getSizeOfImm(TSFlags) : 0;
+ // Note: rip-relative addressing using immediate displacement values should
+ // not be adjusted, assuming it was the user's intent.
+ int ImmSize = !Disp.isImm() && X86II::hasImm(TSFlags)
+ ? X86II::getSizeOfImm(TSFlags)
+ : 0;
EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind),
CurByte, OS, Fixups, -ImmSize);
@@ -446,7 +478,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
}
if (Disp.isImm() && isDisp8(Disp.getImm())) {
- if (Disp.getImm() == 0 && BaseRegNo != N86::EBP) {
+ if (Disp.getImm() == 0 && RMfield != 6) {
// There is no displacement; just the register.
EmitByte(ModRMByte(0, RegOpcodeField, RMfield), CurByte, OS);
return;
@@ -677,10 +709,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// 0b10: F3
// 0b11: F2
//
- uint8_t VEX_PP;
+ uint8_t VEX_PP = 0;
switch (TSFlags & X86II::OpPrefixMask) {
- default: llvm_unreachable("Invalid op prefix!");
- case X86II::PS: VEX_PP = 0x0; break; // none
case X86II::PD: VEX_PP = 0x1; break; // 66
case X86II::XS: VEX_PP = 0x2; break; // F3
case X86II::XD: VEX_PP = 0x3; break; // F2
@@ -1111,6 +1141,10 @@ bool X86MCCodeEmitter::emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
if (TSFlags & X86II::LOCK || MI.getFlags() & X86::IP_HAS_LOCK)
EmitByte(0xF0, CurByte, OS);
+ // Emit the NOTRACK opcode prefix.
+ if (TSFlags & X86II::NOTRACK || MI.getFlags() & X86::IP_HAS_NOTRACK)
+ EmitByte(0x3E, CurByte, OS);
+
switch (TSFlags & X86II::OpPrefixMask) {
case X86II::PD: // 66
EmitByte(0x66, CurByte, OS);
@@ -1136,9 +1170,10 @@ bool X86MCCodeEmitter::emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// 0x0F escape code must be emitted just before the opcode.
switch (TSFlags & X86II::OpMapMask) {
- case X86II::TB: // Two-byte opcode map
- case X86II::T8: // 0F 38
- case X86II::TA: // 0F 3A
+ case X86II::TB: // Two-byte opcode map
+ case X86II::T8: // 0F 38
+ case X86II::TA: // 0F 3A
+ case X86II::ThreeDNow: // 0F 0F, second 0F emitted by caller.
EmitByte(0x0F, CurByte, OS);
break;
}
@@ -1234,7 +1269,7 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
uint8_t BaseOpcode = X86II::getBaseOpcodeFor(TSFlags);
- if (TSFlags & X86II::Has3DNow0F0FOpcode)
+ if ((TSFlags & X86II::OpMapMask) == X86II::ThreeDNow)
BaseOpcode = 0x0F; // Weird 3DNow! encoding.
uint64_t Form = TSFlags & X86II::FormMask;
@@ -1283,9 +1318,18 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
EmitByte(BaseOpcode, CurByte, OS);
break;
}
- case X86II::RawFrm:
+ case X86II::RawFrm: {
EmitByte(BaseOpcode, CurByte, OS);
+
+ if (!is64BitMode(STI) || !isPCRel32Branch(MI))
+ break;
+
+ const MCOperand &Op = MI.getOperand(CurOp++);
+ EmitImmediate(Op, MI.getLoc(), X86II::getSizeOfImm(TSFlags),
+ MCFixupKind(X86::reloc_branch_4byte_pcrel), CurByte, OS,
+ Fixups);
break;
+ }
case X86II::RawFrmMemOffs:
// Emit segment override opcode prefix as needed.
EmitSegmentOverridePrefix(CurByte, 1, MI, OS);
@@ -1519,7 +1563,7 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
}
}
- if (TSFlags & X86II::Has3DNow0F0FOpcode)
+ if ((TSFlags & X86II::OpMapMask) == X86II::ThreeDNow)
EmitByte(X86II::getBaseOpcodeFor(TSFlags), CurByte, OS);
#ifndef NDEBUG
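
With Has3DNow0F0FOpcode gone, the emitter identifies 3DNow! instructions solely from the opcode-map bits, as the two hunks above show. A compressed sketch of that check follows, with the resulting byte layout summarized in comments; the classifier value cited is an example, not taken from this diff.

#include "MCTargetDesc/X86BaseInfo.h"
#include <cstdint>

// Sketch: how the emitter now recognises a 3DNow! instruction after the
// Has3DNow0F0FOpcode bit was folded into the opcode-map field.
static bool isThreeDNowEncoding(uint64_t TSFlags) {
  return (TSFlags & llvm::X86II::OpMapMask) == llvm::X86II::ThreeDNow;
}
// For such instructions the opcode map contributes the leading 0x0F 0x0F
// pair, and the value returned by X86II::getBaseOpcodeFor(TSFlags) -- which
// now holds the 3DNow! classifier (e.g. 0x9E for PFADD) -- is emitted as the
// final byte, after the ModRM/SIB operand bytes.
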
diff --git a/lib/Target/X86/MCTargetDesc/X86MCExpr.h b/lib/Target/X86/MCTargetDesc/X86MCExpr.h
new file mode 100644
index 000000000000..f1438cd24960
--- /dev/null
+++ b/lib/Target/X86/MCTargetDesc/X86MCExpr.h
@@ -0,0 +1,75 @@
+//=--- X86MCExpr.h - X86 specific MC expression classes ---*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes X86-specific MCExprs, i.e., registers used for
+// extended variable assignments.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCEXPR_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCEXPR_H
+
+#include "InstPrinter/X86ATTInstPrinter.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+class X86MCExpr : public MCTargetExpr {
+
+private:
+ const int64_t RegNo; // All
+
+ explicit X86MCExpr(int64_t R) : RegNo(R) {}
+
+public:
+ /// @name Construction
+ /// @{
+
+ static const X86MCExpr *create(int64_t RegNo, MCContext &Ctx) {
+ return new (Ctx) X86MCExpr(RegNo);
+ }
+
+ /// @}
+ /// @name Accessors
+ /// @{
+
+ /// getSubExpr - Get the child of this expression.
+ int64_t getRegNo() const { return RegNo; }
+
+ /// @}
+
+ void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override {
+ if (MAI->getAssemblerDialect() == 0)
+ OS << '%';
+ OS << X86ATTInstPrinter::getRegisterName(RegNo);
+ }
+
+ bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const override {
+ return false;
+ }
+ // Register values should be inlined as they are not valid .set expressions.
+ bool inlineAssignedExpr() const override { return true; }
+ void visitUsedExpr(MCStreamer &Streamer) const override{};
+ MCFragment *findAssociatedFragment() const override { return nullptr; }
+
+ // There are no TLS X86MCExprs at the moment.
+ void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
+
+ static bool classof(const MCExpr *E) {
+ return E->getKind() == MCExpr::Target;
+ }
+};
+
+} // end namespace llvm
+
+#endif
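
The new X86MCExpr wraps a plain register number so that assembler variable assignments can name a register. A minimal usage sketch, assuming an existing MCContext; the helper shown is illustrative rather than part of the patch.

#include "MCTargetDesc/X86MCExpr.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "llvm/MC/MCContext.h"

// Sketch: modelling the assembler assignment `foo = %ebx`.  Because
// inlineAssignedExpr() returns true, later uses of `foo` re-expand to the
// register rather than to a relocatable value.
const llvm::MCExpr *makeRegisterAlias(llvm::MCContext &Ctx) {
  // Printing goes through X86ATTInstPrinter::getRegisterName(), so this
  // renders as "%ebx" in AT&T output (dialect 0) and "ebx" otherwise.
  return llvm::X86MCExpr::create(llvm::X86::EBX, Ctx);
}
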
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index cdd43478baed..d030f26d98de 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -14,7 +14,9 @@
#include "X86MCTargetDesc.h"
#include "InstPrinter/X86ATTInstPrinter.h"
#include "InstPrinter/X86IntelInstPrinter.h"
+#include "X86BaseInfo.h"
#include "X86MCAsmInfo.h"
+#include "llvm/ADT/APInt.h"
#include "llvm/ADT/Triple.h"
#include "llvm/DebugInfo/CodeView/CodeView.h"
#include "llvm/MC/MCInstrAnalysis.h"
@@ -37,6 +39,7 @@ using namespace llvm;
#include "X86GenRegisterInfo.inc"
#define GET_INSTRINFO_MC_DESC
+#define GET_GENINSTRINFO_MC_HELPERS
#include "X86GenInstrInfo.inc"
#define GET_SUBTARGETINFO_MC_DESC
@@ -78,120 +81,120 @@ void X86_MC::initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI) {
codeview::RegisterId CVReg;
MCPhysReg Reg;
} RegMap[] = {
- { codeview::RegisterId::AL, X86::AL},
- { codeview::RegisterId::CL, X86::CL},
- { codeview::RegisterId::DL, X86::DL},
- { codeview::RegisterId::BL, X86::BL},
- { codeview::RegisterId::AH, X86::AH},
- { codeview::RegisterId::CH, X86::CH},
- { codeview::RegisterId::DH, X86::DH},
- { codeview::RegisterId::BH, X86::BH},
- { codeview::RegisterId::AX, X86::AX},
- { codeview::RegisterId::CX, X86::CX},
- { codeview::RegisterId::DX, X86::DX},
- { codeview::RegisterId::BX, X86::BX},
- { codeview::RegisterId::SP, X86::SP},
- { codeview::RegisterId::BP, X86::BP},
- { codeview::RegisterId::SI, X86::SI},
- { codeview::RegisterId::DI, X86::DI},
- { codeview::RegisterId::EAX, X86::EAX},
- { codeview::RegisterId::ECX, X86::ECX},
- { codeview::RegisterId::EDX, X86::EDX},
- { codeview::RegisterId::EBX, X86::EBX},
- { codeview::RegisterId::ESP, X86::ESP},
- { codeview::RegisterId::EBP, X86::EBP},
- { codeview::RegisterId::ESI, X86::ESI},
- { codeview::RegisterId::EDI, X86::EDI},
-
- { codeview::RegisterId::EFLAGS, X86::EFLAGS},
-
- { codeview::RegisterId::ST0, X86::FP0},
- { codeview::RegisterId::ST1, X86::FP1},
- { codeview::RegisterId::ST2, X86::FP2},
- { codeview::RegisterId::ST3, X86::FP3},
- { codeview::RegisterId::ST4, X86::FP4},
- { codeview::RegisterId::ST5, X86::FP5},
- { codeview::RegisterId::ST6, X86::FP6},
- { codeview::RegisterId::ST7, X86::FP7},
-
- { codeview::RegisterId::XMM0, X86::XMM0},
- { codeview::RegisterId::XMM1, X86::XMM1},
- { codeview::RegisterId::XMM2, X86::XMM2},
- { codeview::RegisterId::XMM3, X86::XMM3},
- { codeview::RegisterId::XMM4, X86::XMM4},
- { codeview::RegisterId::XMM5, X86::XMM5},
- { codeview::RegisterId::XMM6, X86::XMM6},
- { codeview::RegisterId::XMM7, X86::XMM7},
-
- { codeview::RegisterId::XMM8, X86::XMM8},
- { codeview::RegisterId::XMM9, X86::XMM9},
- { codeview::RegisterId::XMM10, X86::XMM10},
- { codeview::RegisterId::XMM11, X86::XMM11},
- { codeview::RegisterId::XMM12, X86::XMM12},
- { codeview::RegisterId::XMM13, X86::XMM13},
- { codeview::RegisterId::XMM14, X86::XMM14},
- { codeview::RegisterId::XMM15, X86::XMM15},
-
- { codeview::RegisterId::SIL, X86::SIL},
- { codeview::RegisterId::DIL, X86::DIL},
- { codeview::RegisterId::BPL, X86::BPL},
- { codeview::RegisterId::SPL, X86::SPL},
- { codeview::RegisterId::RAX, X86::RAX},
- { codeview::RegisterId::RBX, X86::RBX},
- { codeview::RegisterId::RCX, X86::RCX},
- { codeview::RegisterId::RDX, X86::RDX},
- { codeview::RegisterId::RSI, X86::RSI},
- { codeview::RegisterId::RDI, X86::RDI},
- { codeview::RegisterId::RBP, X86::RBP},
- { codeview::RegisterId::RSP, X86::RSP},
- { codeview::RegisterId::R8, X86::R8},
- { codeview::RegisterId::R9, X86::R9},
- { codeview::RegisterId::R10, X86::R10},
- { codeview::RegisterId::R11, X86::R11},
- { codeview::RegisterId::R12, X86::R12},
- { codeview::RegisterId::R13, X86::R13},
- { codeview::RegisterId::R14, X86::R14},
- { codeview::RegisterId::R15, X86::R15},
- { codeview::RegisterId::R8B, X86::R8B},
- { codeview::RegisterId::R9B, X86::R9B},
- { codeview::RegisterId::R10B, X86::R10B},
- { codeview::RegisterId::R11B, X86::R11B},
- { codeview::RegisterId::R12B, X86::R12B},
- { codeview::RegisterId::R13B, X86::R13B},
- { codeview::RegisterId::R14B, X86::R14B},
- { codeview::RegisterId::R15B, X86::R15B},
- { codeview::RegisterId::R8W, X86::R8W},
- { codeview::RegisterId::R9W, X86::R9W},
- { codeview::RegisterId::R10W, X86::R10W},
- { codeview::RegisterId::R11W, X86::R11W},
- { codeview::RegisterId::R12W, X86::R12W},
- { codeview::RegisterId::R13W, X86::R13W},
- { codeview::RegisterId::R14W, X86::R14W},
- { codeview::RegisterId::R15W, X86::R15W},
- { codeview::RegisterId::R8D, X86::R8D},
- { codeview::RegisterId::R9D, X86::R9D},
- { codeview::RegisterId::R10D, X86::R10D},
- { codeview::RegisterId::R11D, X86::R11D},
- { codeview::RegisterId::R12D, X86::R12D},
- { codeview::RegisterId::R13D, X86::R13D},
- { codeview::RegisterId::R14D, X86::R14D},
- { codeview::RegisterId::R15D, X86::R15D},
- { codeview::RegisterId::AMD64_YMM0, X86::YMM0},
- { codeview::RegisterId::AMD64_YMM1, X86::YMM1},
- { codeview::RegisterId::AMD64_YMM2, X86::YMM2},
- { codeview::RegisterId::AMD64_YMM3, X86::YMM3},
- { codeview::RegisterId::AMD64_YMM4, X86::YMM4},
- { codeview::RegisterId::AMD64_YMM5, X86::YMM5},
- { codeview::RegisterId::AMD64_YMM6, X86::YMM6},
- { codeview::RegisterId::AMD64_YMM7, X86::YMM7},
- { codeview::RegisterId::AMD64_YMM8, X86::YMM8},
- { codeview::RegisterId::AMD64_YMM9, X86::YMM9},
- { codeview::RegisterId::AMD64_YMM10, X86::YMM10},
- { codeview::RegisterId::AMD64_YMM11, X86::YMM11},
- { codeview::RegisterId::AMD64_YMM12, X86::YMM12},
- { codeview::RegisterId::AMD64_YMM13, X86::YMM13},
- { codeview::RegisterId::AMD64_YMM14, X86::YMM14},
- { codeview::RegisterId::AMD64_YMM15, X86::YMM15},
+ { codeview::RegisterId::CVRegAL, X86::AL},
+ { codeview::RegisterId::CVRegCL, X86::CL},
+ { codeview::RegisterId::CVRegDL, X86::DL},
+ { codeview::RegisterId::CVRegBL, X86::BL},
+ { codeview::RegisterId::CVRegAH, X86::AH},
+ { codeview::RegisterId::CVRegCH, X86::CH},
+ { codeview::RegisterId::CVRegDH, X86::DH},
+ { codeview::RegisterId::CVRegBH, X86::BH},
+ { codeview::RegisterId::CVRegAX, X86::AX},
+ { codeview::RegisterId::CVRegCX, X86::CX},
+ { codeview::RegisterId::CVRegDX, X86::DX},
+ { codeview::RegisterId::CVRegBX, X86::BX},
+ { codeview::RegisterId::CVRegSP, X86::SP},
+ { codeview::RegisterId::CVRegBP, X86::BP},
+ { codeview::RegisterId::CVRegSI, X86::SI},
+ { codeview::RegisterId::CVRegDI, X86::DI},
+ { codeview::RegisterId::CVRegEAX, X86::EAX},
+ { codeview::RegisterId::CVRegECX, X86::ECX},
+ { codeview::RegisterId::CVRegEDX, X86::EDX},
+ { codeview::RegisterId::CVRegEBX, X86::EBX},
+ { codeview::RegisterId::CVRegESP, X86::ESP},
+ { codeview::RegisterId::CVRegEBP, X86::EBP},
+ { codeview::RegisterId::CVRegESI, X86::ESI},
+ { codeview::RegisterId::CVRegEDI, X86::EDI},
+
+ { codeview::RegisterId::CVRegEFLAGS, X86::EFLAGS},
+
+ { codeview::RegisterId::CVRegST0, X86::FP0},
+ { codeview::RegisterId::CVRegST1, X86::FP1},
+ { codeview::RegisterId::CVRegST2, X86::FP2},
+ { codeview::RegisterId::CVRegST3, X86::FP3},
+ { codeview::RegisterId::CVRegST4, X86::FP4},
+ { codeview::RegisterId::CVRegST5, X86::FP5},
+ { codeview::RegisterId::CVRegST6, X86::FP6},
+ { codeview::RegisterId::CVRegST7, X86::FP7},
+
+ { codeview::RegisterId::CVRegXMM0, X86::XMM0},
+ { codeview::RegisterId::CVRegXMM1, X86::XMM1},
+ { codeview::RegisterId::CVRegXMM2, X86::XMM2},
+ { codeview::RegisterId::CVRegXMM3, X86::XMM3},
+ { codeview::RegisterId::CVRegXMM4, X86::XMM4},
+ { codeview::RegisterId::CVRegXMM5, X86::XMM5},
+ { codeview::RegisterId::CVRegXMM6, X86::XMM6},
+ { codeview::RegisterId::CVRegXMM7, X86::XMM7},
+
+ { codeview::RegisterId::CVRegXMM8, X86::XMM8},
+ { codeview::RegisterId::CVRegXMM9, X86::XMM9},
+ { codeview::RegisterId::CVRegXMM10, X86::XMM10},
+ { codeview::RegisterId::CVRegXMM11, X86::XMM11},
+ { codeview::RegisterId::CVRegXMM12, X86::XMM12},
+ { codeview::RegisterId::CVRegXMM13, X86::XMM13},
+ { codeview::RegisterId::CVRegXMM14, X86::XMM14},
+ { codeview::RegisterId::CVRegXMM15, X86::XMM15},
+
+ { codeview::RegisterId::CVRegSIL, X86::SIL},
+ { codeview::RegisterId::CVRegDIL, X86::DIL},
+ { codeview::RegisterId::CVRegBPL, X86::BPL},
+ { codeview::RegisterId::CVRegSPL, X86::SPL},
+ { codeview::RegisterId::CVRegRAX, X86::RAX},
+ { codeview::RegisterId::CVRegRBX, X86::RBX},
+ { codeview::RegisterId::CVRegRCX, X86::RCX},
+ { codeview::RegisterId::CVRegRDX, X86::RDX},
+ { codeview::RegisterId::CVRegRSI, X86::RSI},
+ { codeview::RegisterId::CVRegRDI, X86::RDI},
+ { codeview::RegisterId::CVRegRBP, X86::RBP},
+ { codeview::RegisterId::CVRegRSP, X86::RSP},
+ { codeview::RegisterId::CVRegR8, X86::R8},
+ { codeview::RegisterId::CVRegR9, X86::R9},
+ { codeview::RegisterId::CVRegR10, X86::R10},
+ { codeview::RegisterId::CVRegR11, X86::R11},
+ { codeview::RegisterId::CVRegR12, X86::R12},
+ { codeview::RegisterId::CVRegR13, X86::R13},
+ { codeview::RegisterId::CVRegR14, X86::R14},
+ { codeview::RegisterId::CVRegR15, X86::R15},
+ { codeview::RegisterId::CVRegR8B, X86::R8B},
+ { codeview::RegisterId::CVRegR9B, X86::R9B},
+ { codeview::RegisterId::CVRegR10B, X86::R10B},
+ { codeview::RegisterId::CVRegR11B, X86::R11B},
+ { codeview::RegisterId::CVRegR12B, X86::R12B},
+ { codeview::RegisterId::CVRegR13B, X86::R13B},
+ { codeview::RegisterId::CVRegR14B, X86::R14B},
+ { codeview::RegisterId::CVRegR15B, X86::R15B},
+ { codeview::RegisterId::CVRegR8W, X86::R8W},
+ { codeview::RegisterId::CVRegR9W, X86::R9W},
+ { codeview::RegisterId::CVRegR10W, X86::R10W},
+ { codeview::RegisterId::CVRegR11W, X86::R11W},
+ { codeview::RegisterId::CVRegR12W, X86::R12W},
+ { codeview::RegisterId::CVRegR13W, X86::R13W},
+ { codeview::RegisterId::CVRegR14W, X86::R14W},
+ { codeview::RegisterId::CVRegR15W, X86::R15W},
+ { codeview::RegisterId::CVRegR8D, X86::R8D},
+ { codeview::RegisterId::CVRegR9D, X86::R9D},
+ { codeview::RegisterId::CVRegR10D, X86::R10D},
+ { codeview::RegisterId::CVRegR11D, X86::R11D},
+ { codeview::RegisterId::CVRegR12D, X86::R12D},
+ { codeview::RegisterId::CVRegR13D, X86::R13D},
+ { codeview::RegisterId::CVRegR14D, X86::R14D},
+ { codeview::RegisterId::CVRegR15D, X86::R15D},
+ { codeview::RegisterId::CVRegAMD64_YMM0, X86::YMM0},
+ { codeview::RegisterId::CVRegAMD64_YMM1, X86::YMM1},
+ { codeview::RegisterId::CVRegAMD64_YMM2, X86::YMM2},
+ { codeview::RegisterId::CVRegAMD64_YMM3, X86::YMM3},
+ { codeview::RegisterId::CVRegAMD64_YMM4, X86::YMM4},
+ { codeview::RegisterId::CVRegAMD64_YMM5, X86::YMM5},
+ { codeview::RegisterId::CVRegAMD64_YMM6, X86::YMM6},
+ { codeview::RegisterId::CVRegAMD64_YMM7, X86::YMM7},
+ { codeview::RegisterId::CVRegAMD64_YMM8, X86::YMM8},
+ { codeview::RegisterId::CVRegAMD64_YMM9, X86::YMM9},
+ { codeview::RegisterId::CVRegAMD64_YMM10, X86::YMM10},
+ { codeview::RegisterId::CVRegAMD64_YMM11, X86::YMM11},
+ { codeview::RegisterId::CVRegAMD64_YMM12, X86::YMM12},
+ { codeview::RegisterId::CVRegAMD64_YMM13, X86::YMM13},
+ { codeview::RegisterId::CVRegAMD64_YMM14, X86::YMM14},
+ { codeview::RegisterId::CVRegAMD64_YMM15, X86::YMM15},
};
for (unsigned I = 0; I < array_lengthof(RegMap); ++I)
MRI->mapLLVMRegToCVReg(RegMap[I].Reg, static_cast<int>(RegMap[I].CVReg));
@@ -293,8 +296,79 @@ static MCRelocationInfo *createX86MCRelocationInfo(const Triple &TheTriple,
return llvm::createMCRelocationInfo(TheTriple, Ctx);
}
+namespace llvm {
+namespace X86_MC {
+
+class X86MCInstrAnalysis : public MCInstrAnalysis {
+ X86MCInstrAnalysis(const X86MCInstrAnalysis &) = delete;
+ X86MCInstrAnalysis &operator=(const X86MCInstrAnalysis &) = delete;
+ virtual ~X86MCInstrAnalysis() = default;
+
+public:
+ X86MCInstrAnalysis(const MCInstrInfo *MCII) : MCInstrAnalysis(MCII) {}
+
+ bool clearsSuperRegisters(const MCRegisterInfo &MRI, const MCInst &Inst,
+ APInt &Mask) const override;
+};
+
+bool X86MCInstrAnalysis::clearsSuperRegisters(const MCRegisterInfo &MRI,
+ const MCInst &Inst,
+ APInt &Mask) const {
+ const MCInstrDesc &Desc = Info->get(Inst.getOpcode());
+ unsigned NumDefs = Desc.getNumDefs();
+ unsigned NumImplicitDefs = Desc.getNumImplicitDefs();
+ assert(Mask.getBitWidth() == NumDefs + NumImplicitDefs &&
+ "Unexpected number of bits in the mask!");
+
+ bool HasVEX = (Desc.TSFlags & X86II::EncodingMask) == X86II::VEX;
+ bool HasEVEX = (Desc.TSFlags & X86II::EncodingMask) == X86II::EVEX;
+ bool HasXOP = (Desc.TSFlags & X86II::EncodingMask) == X86II::XOP;
+
+ const MCRegisterClass &GR32RC = MRI.getRegClass(X86::GR32RegClassID);
+ const MCRegisterClass &VR128XRC = MRI.getRegClass(X86::VR128XRegClassID);
+ const MCRegisterClass &VR256XRC = MRI.getRegClass(X86::VR256XRegClassID);
+
+ auto ClearsSuperReg = [=](unsigned RegID) {
+ // On X86-64, a general purpose integer register is viewed as a 64-bit
+ // register internal to the processor.
+ // An update to the lower 32 bits of a 64 bit integer register is
+ // architecturally defined to zero extend the upper 32 bits.
+ if (GR32RC.contains(RegID))
+ return true;
+
+ // Early exit if this instruction has no vex/evex/xop prefix.
+ if (!HasEVEX && !HasVEX && !HasXOP)
+ return false;
+
+ // All VEX and EVEX encoded instructions are defined to zero the high bits
+ // of the destination register up to VLMAX (i.e. the maximum vector register
+ // width pertaining to the instruction).
+ // We assume the same behavior for XOP instructions too.
+ return VR128XRC.contains(RegID) || VR256XRC.contains(RegID);
+ };
+
+ Mask.clearAllBits();
+ for (unsigned I = 0, E = NumDefs; I < E; ++I) {
+ const MCOperand &Op = Inst.getOperand(I);
+ if (ClearsSuperReg(Op.getReg()))
+ Mask.setBit(I);
+ }
+
+ for (unsigned I = 0, E = NumImplicitDefs; I < E; ++I) {
+ const MCPhysReg Reg = Desc.getImplicitDefs()[I];
+ if (ClearsSuperReg(Reg))
+ Mask.setBit(NumDefs + I);
+ }
+
+ return Mask.getBoolValue();
+}
+
+} // end of namespace X86_MC
+
+} // end of namespace llvm
+
static MCInstrAnalysis *createX86MCInstrAnalysis(const MCInstrInfo *Info) {
- return new MCInstrAnalysis(Info);
+ return new X86_MC::X86MCInstrAnalysis(Info);
}
// Force static initialization.
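
The new X86MCInstrAnalysis lets MC-layer clients ask which of an instruction's definitions also clear their wider super-registers: 32-bit GPR writes zero the upper half of the 64-bit register, and VEX/EVEX/XOP destinations are zeroed up to VLMAX. A hedged caller sketch, with the analysis, register info and instruction info assumed to come from the usual TargetRegistry lookups.

#include "llvm/ADT/APInt.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"

// Sketch: querying the analysis for e.g. a `mov eax, 1` MCInst, whose single
// def (EAX) implicitly clears the upper 32 bits of RAX.
bool clearsWiderRegs(const llvm::MCInstrAnalysis &MCIA,
                     const llvm::MCRegisterInfo &MRI,
                     const llvm::MCInstrInfo &MCII, const llvm::MCInst &Inst) {
  const llvm::MCInstrDesc &Desc = MCII.get(Inst.getOpcode());
  // The mask must be sized to explicit defs + implicit defs, as asserted in
  // clearsSuperRegisters() above; set bits mark defs that clear their
  // super-registers.
  llvm::APInt Mask(Desc.getNumDefs() + Desc.getNumImplicitDefs(), 0);
  return MCIA.clearsSuperRegisters(MRI, Inst, Mask);
}
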
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index c5859b600ad2..595c26d31e3f 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -23,6 +23,7 @@ class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
+class MCObjectTargetWriter;
class MCObjectWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
@@ -70,11 +71,13 @@ MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
MCContext &Ctx);
-MCAsmBackend *createX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
+MCAsmBackend *createX86_32AsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
+ const MCRegisterInfo &MRI,
const MCTargetOptions &Options);
-MCAsmBackend *createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
+MCAsmBackend *createX86_64AsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
+ const MCRegisterInfo &MRI,
const MCTargetOptions &Options);
/// Implements X86-only directives for assembly emission.
@@ -93,25 +96,21 @@ MCTargetStreamer *createX86ObjectTargetStreamer(MCStreamer &OS,
/// Takes ownership of \p AB and \p CE.
MCStreamer *createX86WinCOFFStreamer(MCContext &C,
std::unique_ptr<MCAsmBackend> &&AB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&CE,
bool RelaxAll,
bool IncrementalLinkerCompatible);
/// Construct an X86 Mach-O object writer.
-std::unique_ptr<MCObjectWriter> createX86MachObjectWriter(raw_pwrite_stream &OS,
- bool Is64Bit,
- uint32_t CPUType,
- uint32_t CPUSubtype);
+std::unique_ptr<MCObjectTargetWriter>
+createX86MachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype);
/// Construct an X86 ELF object writer.
-std::unique_ptr<MCObjectWriter> createX86ELFObjectWriter(raw_pwrite_stream &OS,
- bool IsELF64,
- uint8_t OSABI,
- uint16_t EMachine);
+std::unique_ptr<MCObjectTargetWriter>
+createX86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine);
/// Construct an X86 Win COFF object writer.
-std::unique_ptr<MCObjectWriter>
-createX86WinCOFFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit);
+std::unique_ptr<MCObjectTargetWriter>
+createX86WinCOFFObjectWriter(bool Is64Bit);
/// Returns the sub or super register of a specific X86 register.
/// e.g. getX86SubSuperRegister(X86::EAX, 16) returns X86::AX.
@@ -135,6 +134,7 @@ unsigned getX86SubSuperRegisterOrZero(unsigned, unsigned,
// Defines symbolic names for the X86 instructions.
//
#define GET_INSTRINFO_ENUM
+#define GET_GENINSTRINFO_MC_DECL
#include "X86GenInstrInfo.inc"
#define GET_SUBTARGETINFO_ENUM
diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
index 965f7de809b3..883278b7bc1f 100644
--- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -94,6 +94,7 @@ static unsigned getFixupKindLog2Size(unsigned Kind) {
case X86::reloc_riprel_4byte_movq_load:
case X86::reloc_signed_4byte:
case X86::reloc_signed_4byte_relax:
+ case X86::reloc_branch_4byte_pcrel:
case FK_Data_4: return 2;
case FK_Data_8: return 3;
}
@@ -597,10 +598,8 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
}
-std::unique_ptr<MCObjectWriter>
-llvm::createX86MachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit,
- uint32_t CPUType, uint32_t CPUSubtype) {
- return createMachObjectWriter(
- llvm::make_unique<X86MachObjectWriter>(Is64Bit, CPUType, CPUSubtype), OS,
- /*IsLittleEndian=*/true);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createX86MachObjectWriter(bool Is64Bit, uint32_t CPUType,
+ uint32_t CPUSubtype) {
+ return llvm::make_unique<X86MachObjectWriter>(Is64Bit, CPUType, CPUSubtype);
}
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
index 5139bb46b561..a5e115e5ff4d 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -62,6 +62,7 @@ unsigned X86WinCOFFObjectWriter::getRelocType(MCContext &Ctx,
case X86::reloc_riprel_4byte_movq_load:
case X86::reloc_riprel_4byte_relax:
case X86::reloc_riprel_4byte_relax_rex:
+ case X86::reloc_branch_4byte_pcrel:
return COFF::IMAGE_REL_AMD64_REL32;
case FK_Data_4:
case X86::reloc_signed_4byte:
@@ -105,8 +106,7 @@ unsigned X86WinCOFFObjectWriter::getRelocType(MCContext &Ctx,
llvm_unreachable("Unsupported COFF machine type.");
}
-std::unique_ptr<MCObjectWriter>
-llvm::createX86WinCOFFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit) {
- auto MOTW = llvm::make_unique<X86WinCOFFObjectWriter>(Is64Bit);
- return createWinCOFFObjectWriter(std::move(MOTW), OS);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createX86WinCOFFObjectWriter(bool Is64Bit) {
+ return llvm::make_unique<X86WinCOFFObjectWriter>(Is64Bit);
}
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
index 5b1357ae4a7b..0085787e576a 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
@@ -11,6 +11,7 @@
#include "X86TargetStreamer.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCWin64EH.h"
#include "llvm/MC/MCWinCOFFStreamer.h"
@@ -21,8 +22,9 @@ class X86WinCOFFStreamer : public MCWinCOFFStreamer {
Win64EH::UnwindEmitter EHStreamer;
public:
X86WinCOFFStreamer(MCContext &C, std::unique_ptr<MCAsmBackend> AB,
- std::unique_ptr<MCCodeEmitter> CE, raw_pwrite_stream &OS)
- : MCWinCOFFStreamer(C, std::move(AB), std::move(CE), OS) {}
+ std::unique_ptr<MCCodeEmitter> CE,
+ std::unique_ptr<MCObjectWriter> OW)
+ : MCWinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW)) {}
void EmitWinEHHandlerData(SMLoc Loc) override;
void EmitWindowsUnwindTables() override;
@@ -60,12 +62,12 @@ void X86WinCOFFStreamer::FinishImpl() {
MCStreamer *llvm::createX86WinCOFFStreamer(MCContext &C,
std::unique_ptr<MCAsmBackend> &&AB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&CE,
bool RelaxAll,
bool IncrementalLinkerCompatible) {
X86WinCOFFStreamer *S =
- new X86WinCOFFStreamer(C, std::move(AB), std::move(CE), OS);
+ new X86WinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW));
S->getAssembler().setRelaxAll(RelaxAll);
S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible);
return S;
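
As elsewhere in this patch, the streamer now owns a std::unique_ptr<MCObjectWriter> instead of writing straight to a raw_pwrite_stream. A small sketch of calling the updated factory; the wrapper function and its arguments are illustrative.

#include "MCTargetDesc/X86MCTargetDesc.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCObjectWriter.h"
#include <memory>
#include <utility>

// Sketch: the streamer takes ownership of backend, object writer and code
// emitter.  AB, OW and CE are assumed to have been created by the usual MC
// factories for an x86 Windows target.
llvm::MCStreamer *makeCOFFStreamer(llvm::MCContext &Ctx,
                                   std::unique_ptr<llvm::MCAsmBackend> AB,
                                   std::unique_ptr<llvm::MCObjectWriter> OW,
                                   std::unique_ptr<llvm::MCCodeEmitter> CE) {
  return llvm::createX86WinCOFFStreamer(Ctx, std::move(AB), std::move(OW),
                                        std::move(CE), /*RelaxAll=*/false,
                                        /*IncrementalLinkerCompatible=*/false);
}
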
diff --git a/lib/Target/X86/README-MMX.txt b/lib/Target/X86/README-MMX.txt
deleted file mode 100644
index a6c8616b6d2c..000000000000
--- a/lib/Target/X86/README-MMX.txt
+++ /dev/null
@@ -1,71 +0,0 @@
-//===---------------------------------------------------------------------===//
-// Random ideas for the X86 backend: MMX-specific stuff.
-//===---------------------------------------------------------------------===//
-
-//===---------------------------------------------------------------------===//
-
-This:
-
-#include <mmintrin.h>
-
-__v2si qux(int A) {
- return (__v2si){ 0, A };
-}
-
-is compiled into:
-
-_qux:
- subl $28, %esp
- movl 32(%esp), %eax
- movd %eax, %mm0
- movq %mm0, (%esp)
- movl (%esp), %eax
- movl %eax, 20(%esp)
- movq %mm0, 8(%esp)
- movl 12(%esp), %eax
- movl %eax, 16(%esp)
- movq 16(%esp), %mm0
- addl $28, %esp
- ret
-
-Yuck!
-
-GCC gives us:
-
-_qux:
- subl $12, %esp
- movl 16(%esp), %eax
- movl 20(%esp), %edx
- movl $0, (%eax)
- movl %edx, 4(%eax)
- addl $12, %esp
- ret $4
-
-//===---------------------------------------------------------------------===//
-
-We generate crappy code for this:
-
-__m64 t() {
- return _mm_cvtsi32_si64(1);
-}
-
-_t:
- subl $12, %esp
- movl $1, %eax
- movd %eax, %mm0
- movq %mm0, (%esp)
- movl (%esp), %eax
- movl 4(%esp), %edx
- addl $12, %esp
- ret
-
-The extra stack traffic is covered in the previous entry. But the other reason
-is we are not smart about materializing constants in MMX registers. With -m64
-
- movl $1, %eax
- movd %eax, %mm0
- movd %mm0, %rax
- ret
-
-We should be using a constantpool load instead:
- movq LC0(%rip), %rax
diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt
index 73cf27692447..190ca861967a 100644
--- a/lib/Target/X86/README-SSE.txt
+++ b/lib/Target/X86/README-SSE.txt
@@ -839,12 +839,3 @@ similarly, v[0]-v[1] should match to hsubpd, and {v[0]-v[1], w[0]-w[1]} should
turn into hsubpd also.
//===---------------------------------------------------------------------===//
-
-define <2 x i32> @foo(<2 x double> %in) {
- %x = fptosi <2 x double> %in to <2 x i32>
- ret <2 x i32> %x
-}
-
-Should compile into cvttpd2dq instead of being scalarized into 2 cvttsd2si.
-
-//===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/README-UNIMPLEMENTED.txt b/lib/Target/X86/README-UNIMPLEMENTED.txt
deleted file mode 100644
index c26c75ab951c..000000000000
--- a/lib/Target/X86/README-UNIMPLEMENTED.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-//===---------------------------------------------------------------------===//
-// Testcases that crash the X86 backend because they aren't implemented
-//===---------------------------------------------------------------------===//
-
-These are cases we know the X86 backend doesn't handle. Patches are welcome
-and appreciated, because no one has signed up to implemented these yet.
-Implementing these would allow elimination of the corresponding intrinsics,
-which would be great.
-
-1) vector shifts
-2) vector comparisons
-3) vector fp<->int conversions: PR2683, PR2684, PR2685, PR2686, PR2688
-4) bitcasts from vectors to scalars: PR2804
-5) llvm.atomic.cmp.swap.i128.p0i128: PR3462
diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
index 11652af9f1fc..c06a7b1ade6d 100644
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -1436,30 +1436,6 @@ bar:
//===---------------------------------------------------------------------===//
-Consider the following two functions compiled with clang:
-_Bool foo(int *x) { return !(*x & 4); }
-unsigned bar(int *x) { return !(*x & 4); }
-
-foo:
- movl 4(%esp), %eax
- testb $4, (%eax)
- sete %al
- movzbl %al, %eax
- ret
-
-bar:
- movl 4(%esp), %eax
- movl (%eax), %eax
- shrl $2, %eax
- andl $1, %eax
- xorl $1, %eax
- ret
-
-The second function generates more code even though the two functions are
-are functionally identical.
-
-//===---------------------------------------------------------------------===//
-
Take the following C code:
int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
diff --git a/lib/Target/X86/ShadowCallStack.cpp b/lib/Target/X86/ShadowCallStack.cpp
new file mode 100644
index 000000000000..9a39455f9dd5
--- /dev/null
+++ b/lib/Target/X86/ShadowCallStack.cpp
@@ -0,0 +1,326 @@
+//===------- ShadowCallStack.cpp - Shadow Call Stack pass -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The ShadowCallStack pass instruments function prologs/epilogs to check that
+// the return address has not been corrupted during the execution of the
+// function. The return address is stored in a 'shadow call stack' addressed
+// using the %gs segment register.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace llvm {
+void initializeShadowCallStackPass(PassRegistry &);
+}
+
+namespace {
+
+class ShadowCallStack : public MachineFunctionPass {
+public:
+ static char ID;
+
+ ShadowCallStack() : MachineFunctionPass(ID) {
+ initializeShadowCallStackPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+private:
+ // Do not instrument leaf functions with this many or fewer instructions. The
+ // shadow call stack instrumented prolog/epilog are slightly race-y reading
+ // and checking the saved return address, so it is better to not instrument
+ // functions that have fewer instructions than the instrumented prolog/epilog
+ // race.
+ static const size_t SkipLeafInstructions = 3;
+};
+
+char ShadowCallStack::ID = 0;
+} // end anonymous namespace.
+
+static void addProlog(MachineFunction &Fn, const TargetInstrInfo *TII,
+ MachineBasicBlock &MBB, const DebugLoc &DL);
+static void addPrologLeaf(MachineFunction &Fn, const TargetInstrInfo *TII,
+ MachineBasicBlock &MBB, const DebugLoc &DL,
+ MCPhysReg FreeRegister);
+
+static void addEpilog(const TargetInstrInfo *TII, MachineBasicBlock &MBB,
+ MachineInstr &MI, MachineBasicBlock &TrapBB);
+static void addEpilogLeaf(const TargetInstrInfo *TII, MachineBasicBlock &MBB,
+ MachineInstr &MI, MachineBasicBlock &TrapBB,
+ MCPhysReg FreeRegister);
+// Generate a longer epilog that only uses r10 when a tailcall branches to r11.
+static void addEpilogOnlyR10(const TargetInstrInfo *TII, MachineBasicBlock &MBB,
+ MachineInstr &MI, MachineBasicBlock &TrapBB);
+
+// Helper function to add ModR/M references for [Seg: Reg + Offset] memory
+// accesses
+static inline const MachineInstrBuilder &
+addSegmentedMem(const MachineInstrBuilder &MIB, MCPhysReg Seg, MCPhysReg Reg,
+ int Offset = 0) {
+ return MIB.addReg(Reg).addImm(1).addReg(0).addImm(Offset).addReg(Seg);
+}
+
+static void addProlog(MachineFunction &Fn, const TargetInstrInfo *TII,
+ MachineBasicBlock &MBB, const DebugLoc &DL) {
+ const MCPhysReg ReturnReg = X86::R10;
+ const MCPhysReg OffsetReg = X86::R11;
+
+ auto MBBI = MBB.begin();
+ // mov r10, [rsp]
+ addDirectMem(BuildMI(MBB, MBBI, DL, TII->get(X86::MOV64rm)).addDef(ReturnReg),
+ X86::RSP);
+ // xor r11, r11
+ BuildMI(MBB, MBBI, DL, TII->get(X86::XOR64rr))
+ .addDef(OffsetReg)
+ .addReg(OffsetReg, RegState::Undef)
+ .addReg(OffsetReg, RegState::Undef);
+ // add QWORD [gs:r11], 8
+ addSegmentedMem(BuildMI(MBB, MBBI, DL, TII->get(X86::ADD64mi8)), X86::GS,
+ OffsetReg)
+ .addImm(8);
+ // mov r11, [gs:r11]
+ addSegmentedMem(
+ BuildMI(MBB, MBBI, DL, TII->get(X86::MOV64rm)).addDef(OffsetReg), X86::GS,
+ OffsetReg);
+ // mov [gs:r11], r10
+ addSegmentedMem(BuildMI(MBB, MBBI, DL, TII->get(X86::MOV64mr)), X86::GS,
+ OffsetReg)
+ .addReg(ReturnReg);
+}
+
+static void addPrologLeaf(MachineFunction &Fn, const TargetInstrInfo *TII,
+ MachineBasicBlock &MBB, const DebugLoc &DL,
+ MCPhysReg FreeRegister) {
+ // mov REG, [rsp]
+ addDirectMem(BuildMI(MBB, MBB.begin(), DL, TII->get(X86::MOV64rm))
+ .addDef(FreeRegister),
+ X86::RSP);
+}
+
+static void addEpilog(const TargetInstrInfo *TII, MachineBasicBlock &MBB,
+ MachineInstr &MI, MachineBasicBlock &TrapBB) {
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ // xor r11, r11
+ BuildMI(MBB, MI, DL, TII->get(X86::XOR64rr))
+ .addDef(X86::R11)
+ .addReg(X86::R11, RegState::Undef)
+ .addReg(X86::R11, RegState::Undef);
+ // mov r10, [gs:r11]
+ addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm)).addDef(X86::R10),
+ X86::GS, X86::R11);
+ // mov r10, [gs:r10]
+ addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm)).addDef(X86::R10),
+ X86::GS, X86::R10);
+ // sub QWORD [gs:r11], 8
+ // This instruction should not be moved up to avoid a signal race.
+ addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::SUB64mi8)),
+ X86::GS, X86::R11)
+ .addImm(8);
+ // cmp [rsp], r10
+ addDirectMem(BuildMI(MBB, MI, DL, TII->get(X86::CMP64mr)), X86::RSP)
+ .addReg(X86::R10);
+ // jne trap
+ BuildMI(MBB, MI, DL, TII->get(X86::JNE_1)).addMBB(&TrapBB);
+ MBB.addSuccessor(&TrapBB);
+}
+
+static void addEpilogLeaf(const TargetInstrInfo *TII, MachineBasicBlock &MBB,
+ MachineInstr &MI, MachineBasicBlock &TrapBB,
+ MCPhysReg FreeRegister) {
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ // cmp [rsp], REG
+ addDirectMem(BuildMI(MBB, MI, DL, TII->get(X86::CMP64mr)), X86::RSP)
+ .addReg(FreeRegister);
+ // jne trap
+ BuildMI(MBB, MI, DL, TII->get(X86::JNE_1)).addMBB(&TrapBB);
+ MBB.addSuccessor(&TrapBB);
+}
+
+static void addEpilogOnlyR10(const TargetInstrInfo *TII, MachineBasicBlock &MBB,
+ MachineInstr &MI, MachineBasicBlock &TrapBB) {
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ // xor r10, r10
+ BuildMI(MBB, MI, DL, TII->get(X86::XOR64rr))
+ .addDef(X86::R10)
+ .addReg(X86::R10, RegState::Undef)
+ .addReg(X86::R10, RegState::Undef);
+ // mov r10, [gs:r10]
+ addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm)).addDef(X86::R10),
+ X86::GS, X86::R10);
+ // mov r10, [gs:r10]
+ addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm)).addDef(X86::R10),
+ X86::GS, X86::R10);
+ // sub QWORD [gs:0], 8
+ // This instruction should not be moved up to avoid a signal race.
+ addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::SUB64mi8)), X86::GS, 0)
+ .addImm(8);
+ // cmp [rsp], r10
+ addDirectMem(BuildMI(MBB, MI, DL, TII->get(X86::CMP64mr)), X86::RSP)
+ .addReg(X86::R10);
+ // jne trap
+ BuildMI(MBB, MI, DL, TII->get(X86::JNE_1)).addMBB(&TrapBB);
+ MBB.addSuccessor(&TrapBB);
+}
+
+bool ShadowCallStack::runOnMachineFunction(MachineFunction &Fn) {
+ if (!Fn.getFunction().hasFnAttribute(Attribute::ShadowCallStack) ||
+ Fn.getFunction().hasFnAttribute(Attribute::Naked))
+ return false;
+
+ if (Fn.empty() || !Fn.getRegInfo().tracksLiveness())
+ return false;
+
+ // FIXME: Skip functions that have r10 or r11 live on entry (r10 can be live
+ // on entry for parameters with the nest attribute.)
+ if (Fn.front().isLiveIn(X86::R10) || Fn.front().isLiveIn(X86::R11))
+ return false;
+
+ // FIXME: Skip functions with conditional and r10 tail calls for now.
+ bool HasReturn = false;
+ for (auto &MBB : Fn) {
+ if (MBB.empty())
+ continue;
+
+ const MachineInstr &MI = MBB.instr_back();
+ if (MI.isReturn())
+ HasReturn = true;
+
+ if (MI.isReturn() && MI.isCall()) {
+ if (MI.findRegisterUseOperand(X86::EFLAGS))
+ return false;
+ // This should only be possible on Windows 64 (see GR64_TC versus
+ // GR64_TCW64.)
+ if (MI.findRegisterUseOperand(X86::R10) ||
+ MI.hasRegisterImplicitUseOperand(X86::R10))
+ return false;
+ }
+ }
+
+ if (!HasReturn)
+ return false;
+
+ // For leaf functions:
+ // 1. Do not instrument very short functions where it would not improve that
+ // function's security.
+ // 2. Detect if there is an unused caller-saved register we can reserve to
+ // hold the return address instead of writing/reading it from the shadow
+ // call stack.
+ MCPhysReg LeafFuncRegister = X86::NoRegister;
+ if (!Fn.getFrameInfo().adjustsStack()) {
+ size_t InstructionCount = 0;
+ std::bitset<X86::NUM_TARGET_REGS> UsedRegs;
+ for (auto &MBB : Fn) {
+ for (auto &LiveIn : MBB.liveins())
+ UsedRegs.set(LiveIn.PhysReg);
+ for (auto &MI : MBB) {
+ if (!MI.isDebugValue() && !MI.isCFIInstruction() && !MI.isLabel())
+ InstructionCount++;
+ for (auto &Op : MI.operands())
+ if (Op.isReg() && Op.isDef())
+ UsedRegs.set(Op.getReg());
+ }
+ }
+
+ if (InstructionCount <= SkipLeafInstructions)
+ return false;
+
+ std::bitset<X86::NUM_TARGET_REGS> CalleeSavedRegs;
+ const MCPhysReg *CSRegs = Fn.getRegInfo().getCalleeSavedRegs();
+ for (size_t i = 0; CSRegs[i]; i++)
+ CalleeSavedRegs.set(CSRegs[i]);
+
+ const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
+ for (auto &Reg : X86::GR64_NOSPRegClass.getRegisters()) {
+ // FIXME: Optimization opportunity: spill/restore a callee-saved register
+ // if a caller-saved register is unavailable.
+ if (CalleeSavedRegs.test(Reg))
+ continue;
+
+ bool Used = false;
+ for (MCSubRegIterator SR(Reg, TRI, true); SR.isValid(); ++SR)
+ if ((Used = UsedRegs.test(*SR)))
+ break;
+
+ if (!Used) {
+ LeafFuncRegister = Reg;
+ break;
+ }
+ }
+ }
+
+ const bool LeafFuncOptimization = LeafFuncRegister != X86::NoRegister;
+ if (LeafFuncOptimization)
+ // Mark the leaf function register live-in for all MBBs except the entry MBB
+ for (auto I = ++Fn.begin(), E = Fn.end(); I != E; ++I)
+ I->addLiveIn(LeafFuncRegister);
+
+ MachineBasicBlock &MBB = Fn.front();
+ const MachineBasicBlock *NonEmpty = MBB.empty() ? MBB.getFallThrough() : &MBB;
+ const DebugLoc &DL = NonEmpty->front().getDebugLoc();
+
+ const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo();
+ if (LeafFuncOptimization)
+ addPrologLeaf(Fn, TII, MBB, DL, LeafFuncRegister);
+ else
+ addProlog(Fn, TII, MBB, DL);
+
+ MachineBasicBlock *Trap = nullptr;
+ for (auto &MBB : Fn) {
+ if (MBB.empty())
+ continue;
+
+ MachineInstr &MI = MBB.instr_back();
+ if (MI.isReturn()) {
+ if (!Trap) {
+ Trap = Fn.CreateMachineBasicBlock();
+ BuildMI(Trap, MI.getDebugLoc(), TII->get(X86::TRAP));
+ Fn.push_back(Trap);
+ }
+
+ if (LeafFuncOptimization)
+ addEpilogLeaf(TII, MBB, MI, *Trap, LeafFuncRegister);
+ else if (MI.findRegisterUseOperand(X86::R11))
+ addEpilogOnlyR10(TII, MBB, MI, *Trap);
+ else
+ addEpilog(TII, MBB, MI, *Trap);
+ }
+ }
+
+ return true;
+}
+
+INITIALIZE_PASS(ShadowCallStack, "shadow-call-stack", "Shadow Call Stack",
+ false, false)
+
+FunctionPass *llvm::createShadowCallStackPass() {
+ return new ShadowCallStack();
+}
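
The pass only instruments machine functions whose IR function carries the shadowcallstack attribute and is not naked, and it is intended to run late in the X86 codegen pipeline. A rough sketch of how it would be wired in; the addPreEmitPass hook and X86PassConfig are assumptions here, since the pass-config changes are not part of this excerpt.

// Sketch only: registering the pass near the end of the X86 pipeline.
// X86PassConfig::addPreEmitPass() is assumed; createShadowCallStackPass()
// is the factory defined at the end of the file above.
void X86PassConfig::addPreEmitPass() {
  // ... existing pre-emit passes ...
  addPass(createShadowCallStackPass());
}
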
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index 8a0fbfb45b22..fe567f4cece8 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -14,7 +14,6 @@
#include "X86ShuffleDecode.h"
#include "llvm/ADT/ArrayRef.h"
-#include "llvm/CodeGen/MachineValueType.h"
//===----------------------------------------------------------------------===//
// Vector Mask Decoding
@@ -45,9 +44,8 @@ void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
if (ZMask & 8) ShuffleMask[3] = SM_SentinelZero;
}
-void DecodeInsertElementMask(MVT VT, unsigned Idx, unsigned Len,
+void DecodeInsertElementMask(unsigned NumElts, unsigned Idx, unsigned Len,
SmallVectorImpl<int> &ShuffleMask) {
- unsigned NumElts = VT.getVectorNumElements();
assert((Idx + Len) <= NumElts && "Insertion out of range");
for (unsigned i = 0; i != NumElts; ++i)
@@ -74,41 +72,31 @@ void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) {
ShuffleMask.push_back(NElts + i);
}
-void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
- unsigned NumElts = VT.getVectorNumElements();
+void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask) {
for (int i = 0, e = NumElts / 2; i < e; ++i) {
ShuffleMask.push_back(2 * i);
ShuffleMask.push_back(2 * i);
}
}
-void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
- unsigned NumElts = VT.getVectorNumElements();
+void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask) {
for (int i = 0, e = NumElts / 2; i < e; ++i) {
ShuffleMask.push_back(2 * i + 1);
ShuffleMask.push_back(2 * i + 1);
}
}
-void DecodeMOVDDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
- unsigned VectorSizeInBits = VT.getSizeInBits();
- unsigned ScalarSizeInBits = VT.getScalarSizeInBits();
- unsigned NumElts = VT.getVectorNumElements();
- unsigned NumLanes = VectorSizeInBits / 128;
- unsigned NumLaneElts = NumElts / NumLanes;
- unsigned NumLaneSubElts = 64 / ScalarSizeInBits;
+void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask) {
+ const unsigned NumLaneElts = 2;
for (unsigned l = 0; l < NumElts; l += NumLaneElts)
- for (unsigned i = 0; i < NumLaneElts; i += NumLaneSubElts)
- for (unsigned s = 0; s != NumLaneSubElts; s++)
- ShuffleMask.push_back(l + s);
+ for (unsigned i = 0; i < NumLaneElts; ++i)
+ ShuffleMask.push_back(l);
}
-void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
- unsigned VectorSizeInBits = VT.getSizeInBits();
- unsigned NumElts = VectorSizeInBits / 8;
- unsigned NumLanes = VectorSizeInBits / 128;
- unsigned NumLaneElts = NumElts / NumLanes;
+void DecodePSLLDQMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ const unsigned NumLaneElts = 16;
for (unsigned l = 0; l < NumElts; l += NumLaneElts)
for (unsigned i = 0; i < NumLaneElts; ++i) {
@@ -118,11 +106,9 @@ void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
}
}
-void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
- unsigned VectorSizeInBits = VT.getSizeInBits();
- unsigned NumElts = VectorSizeInBits / 8;
- unsigned NumLanes = VectorSizeInBits / 128;
- unsigned NumLaneElts = NumElts / NumLanes;
+void DecodePSRLDQMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ const unsigned NumLaneElts = 16;
for (unsigned l = 0; l < NumElts; l += NumLaneElts)
for (unsigned i = 0; i < NumLaneElts; ++i) {
@@ -133,58 +119,50 @@ void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
}
}
-void DecodePALIGNRMask(MVT VT, unsigned Imm,
+void DecodePALIGNRMask(unsigned NumElts, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask) {
- unsigned NumElts = VT.getVectorNumElements();
- unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);
-
- unsigned NumLanes = VT.getSizeInBits() / 128;
- unsigned NumLaneElts = NumElts / NumLanes;
+ const unsigned NumLaneElts = 16;
for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
for (unsigned i = 0; i != NumLaneElts; ++i) {
- unsigned Base = i + Offset;
- // if i+offset is out of this lane then we actually need the other source
+ unsigned Base = i + Imm;
+ // If i+imm is out of this lane then we actually need the other source.
if (Base >= NumLaneElts) Base += NumElts - NumLaneElts;
ShuffleMask.push_back(Base + l);
}
}
}
-void DecodeVALIGNMask(MVT VT, unsigned Imm,
+void DecodeVALIGNMask(unsigned NumElts, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask) {
- int NumElts = VT.getVectorNumElements();
// Not all bits of the immediate are used so mask it.
assert(isPowerOf2_32(NumElts) && "NumElts should be power of 2");
Imm = Imm & (NumElts - 1);
- for (int i = 0; i != NumElts; ++i)
+ for (unsigned i = 0; i != NumElts; ++i)
ShuffleMask.push_back(i + Imm);
}
/// DecodePSHUFMask - This decodes the shuffle masks for pshufw, pshufd, and vpermilp*.
/// VT indicates the type of the vector allowing it to handle different
/// datatypes and vector widths.
-void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
- unsigned NumElts = VT.getVectorNumElements();
-
- unsigned NumLanes = VT.getSizeInBits() / 128;
+void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned Size = NumElts * ScalarBits;
+ unsigned NumLanes = Size / 128;
if (NumLanes == 0) NumLanes = 1; // Handle MMX
unsigned NumLaneElts = NumElts / NumLanes;
- unsigned NewImm = Imm;
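+ // Splat the 8-bit immediate across the 32-bit value; lanes of four elements
+ // consume one byte of control bits each, so no per-lane reload is needed.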
+ uint32_t SplatImm = (Imm & 0xff) * 0x01010101;
for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
for (unsigned i = 0; i != NumLaneElts; ++i) {
- ShuffleMask.push_back(NewImm % NumLaneElts + l);
- NewImm /= NumLaneElts;
+ ShuffleMask.push_back(SplatImm % NumLaneElts + l);
+ SplatImm /= NumLaneElts;
}
- if (NumLaneElts == 4) NewImm = Imm; // reload imm
}
}
-void DecodePSHUFHWMask(MVT VT, unsigned Imm,
+void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask) {
- unsigned NumElts = VT.getVectorNumElements();
-
for (unsigned l = 0; l != NumElts; l += 8) {
unsigned NewImm = Imm;
for (unsigned i = 0, e = 4; i != e; ++i) {
@@ -197,10 +175,8 @@ void DecodePSHUFHWMask(MVT VT, unsigned Imm,
}
}
-void DecodePSHUFLWMask(MVT VT, unsigned Imm,
+void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask) {
- unsigned NumElts = VT.getVectorNumElements();
-
for (unsigned l = 0; l != NumElts; l += 8) {
unsigned NewImm = Imm;
for (unsigned i = 0, e = 4; i != e; ++i) {
@@ -213,8 +189,7 @@ void DecodePSHUFLWMask(MVT VT, unsigned Imm,
}
}
-void DecodePSWAPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
- unsigned NumElts = VT.getVectorNumElements();
+void DecodePSWAPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask) {
unsigned NumHalfElts = NumElts / 2;
for (unsigned l = 0; l != NumHalfElts; ++l)
@@ -226,11 +201,9 @@ void DecodePSWAPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates
/// the type of the vector allowing it to handle different datatypes and vector
/// widths.
-void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
- unsigned NumElts = VT.getVectorNumElements();
-
- unsigned NumLanes = VT.getSizeInBits() / 128;
- unsigned NumLaneElts = NumElts / NumLanes;
+void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits,
+ unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumLaneElts = 128 / ScalarBits;
unsigned NewImm = Imm;
for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
@@ -248,12 +221,11 @@ void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd
/// and punpckh*. VT indicates the type of the vector allowing it to handle
/// different datatypes and vector widths.
-void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
- unsigned NumElts = VT.getVectorNumElements();
-
+void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits,
+ SmallVectorImpl<int> &ShuffleMask) {
// Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
// independently on 128-bit lanes.
- unsigned NumLanes = VT.getSizeInBits() / 128;
+ unsigned NumLanes = (NumElts * ScalarBits) / 128;
if (NumLanes == 0) NumLanes = 1; // Handle MMX
unsigned NumLaneElts = NumElts / NumLanes;
@@ -268,12 +240,11 @@ void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd
/// and punpckl*. VT indicates the type of the vector allowing it to handle
/// different datatypes and vector widths.
-void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
- unsigned NumElts = VT.getVectorNumElements();
-
+void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits,
+ SmallVectorImpl<int> &ShuffleMask) {
// Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
// independently on 128-bit lanes.
- unsigned NumLanes = VT.getSizeInBits() / 128;
+ unsigned NumLanes = (NumElts * ScalarBits) / 128;
if (NumLanes == 0 ) NumLanes = 1; // Handle MMX
unsigned NumLaneElts = NumElts / NumLanes;
@@ -286,47 +257,44 @@ void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
}
/// Decodes a broadcast of the first element of a vector.
-void DecodeVectorBroadcast(MVT DstVT, SmallVectorImpl<int> &ShuffleMask) {
- unsigned NumElts = DstVT.getVectorNumElements();
+void DecodeVectorBroadcast(unsigned NumElts,
+ SmallVectorImpl<int> &ShuffleMask) {
ShuffleMask.append(NumElts, 0);
}
/// Decodes a broadcast of a subvector to a larger vector type.
-void DecodeSubVectorBroadcast(MVT DstVT, MVT SrcVT,
+void DecodeSubVectorBroadcast(unsigned DstNumElts, unsigned SrcNumElts,
SmallVectorImpl<int> &ShuffleMask) {
- assert(SrcVT.getScalarType() == DstVT.getScalarType() &&
- "Non matching vector element types");
- unsigned NumElts = SrcVT.getVectorNumElements();
- unsigned Scale = DstVT.getSizeInBits() / SrcVT.getSizeInBits();
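+ // Scale is the number of times the source subvector is repeated to fill
+ // the destination.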
+ unsigned Scale = DstNumElts / SrcNumElts;
for (unsigned i = 0; i != Scale; ++i)
- for (unsigned j = 0; j != NumElts; ++j)
+ for (unsigned j = 0; j != SrcNumElts; ++j)
ShuffleMask.push_back(j);
}
-/// \brief Decode a shuffle packed values at 128-bit granularity
+/// Decode a shuffle of packed values at 128-bit granularity
/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2)
/// immediate mask into a shuffle mask.
-void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm,
- SmallVectorImpl<int> &ShuffleMask) {
- unsigned NumLanes = VT.getSizeInBits() / 128;
- unsigned NumElementsInLane = 128 / VT.getScalarSizeInBits();
- unsigned ControlBitsMask = NumLanes - 1;
- unsigned NumControlBits = NumLanes / 2;
-
- for (unsigned l = 0; l != NumLanes; ++l) {
- unsigned LaneMask = (Imm >> (l * NumControlBits)) & ControlBitsMask;
+void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize,
+ unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElementsInLane = 128 / ScalarSize;
+ unsigned NumLanes = NumElts / NumElementsInLane;
+
+ for (unsigned l = 0; l != NumElts; l += NumElementsInLane) {
+ unsigned Index = (Imm % NumLanes) * NumElementsInLane;
+ Imm /= NumLanes; // Discard the bits we just used.
// We actually need the other source.
- if (l >= NumLanes / 2)
- LaneMask += NumLanes;
+ if (l >= (NumElts / 2))
+ Index += NumElts;
for (unsigned i = 0; i != NumElementsInLane; ++i)
- ShuffleMask.push_back(LaneMask * NumElementsInLane + i);
+ ShuffleMask.push_back(Index + i);
}
}
-void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
+void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask) {
- unsigned HalfSize = VT.getVectorNumElements() / 2;
+ unsigned HalfSize = NumElts / 2;
for (unsigned l = 0; l != 2; ++l) {
unsigned HalfMask = Imm >> (l * 4);
@@ -358,17 +326,13 @@ void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask,
}
}
-void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
- int ElementBits = VT.getScalarSizeInBits();
- int NumElements = VT.getVectorNumElements();
- for (int i = 0; i < NumElements; ++i) {
+void DecodeBLENDMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ for (unsigned i = 0; i < NumElts; ++i) {
// If there are more than 8 elements in the vector, then any immediate blend
- // mask applies to each 128-bit lane. There can never be more than
- // 8 elements in a 128-bit lane with an immediate blend.
- int Bit = NumElements > 8 ? i % (128 / ElementBits) : i;
- assert(Bit < 8 &&
- "Immediate blends only operate over 8 elements at a time!");
- ShuffleMask.push_back(((Imm >> Bit) & 1) ? NumElements + i : i);
+ // mask wraps around.
+ unsigned Bit = i % 8;
+ ShuffleMask.push_back(((Imm >> Bit) & 1) ? NumElts + i : i);
}
}
@@ -412,19 +376,15 @@ void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask,
}
/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
-void DecodeVPERMMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
- assert((VT.is256BitVector() || VT.is512BitVector()) &&
- (VT.getScalarSizeInBits() == 64) && "Unexpected vector value type");
- unsigned NumElts = VT.getVectorNumElements();
+void DecodeVPERMMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
for (unsigned l = 0; l != NumElts; l += 4)
for (unsigned i = 0; i != 4; ++i)
ShuffleMask.push_back(l + ((Imm >> (2 * i)) & 3));
}
-void DecodeZeroExtendMask(MVT SrcScalarVT, MVT DstVT, SmallVectorImpl<int> &Mask) {
- unsigned NumDstElts = DstVT.getVectorNumElements();
- unsigned SrcScalarBits = SrcScalarVT.getSizeInBits();
- unsigned DstScalarBits = DstVT.getScalarSizeInBits();
+void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits,
+ unsigned NumDstElts, SmallVectorImpl<int> &Mask) {
unsigned Scale = DstScalarBits / SrcScalarBits;
assert(SrcScalarBits < DstScalarBits &&
"Expected zero extension mask to increase scalar size");
@@ -436,27 +396,24 @@ void DecodeZeroExtendMask(MVT SrcScalarVT, MVT DstVT, SmallVectorImpl<int> &Mask
}
}
-void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
- unsigned NumElts = VT.getVectorNumElements();
+void DecodeZeroMoveLowMask(unsigned NumElts,
+ SmallVectorImpl<int> &ShuffleMask) {
ShuffleMask.push_back(0);
for (unsigned i = 1; i < NumElts; i++)
ShuffleMask.push_back(SM_SentinelZero);
}
-void DecodeScalarMoveMask(MVT VT, bool IsLoad, SmallVectorImpl<int> &Mask) {
+void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad,
+ SmallVectorImpl<int> &Mask) {
// First element comes from the first element of second source.
// Remaining elements: Load zero extends / Move copies from first source.
- unsigned NumElts = VT.getVectorNumElements();
Mask.push_back(NumElts);
for (unsigned i = 1; i < NumElts; i++)
Mask.push_back(IsLoad ? static_cast<int>(SM_SentinelZero) : i);
}
-void DecodeEXTRQIMask(MVT VT, int Len, int Idx,
+void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx,
SmallVectorImpl<int> &ShuffleMask) {
- assert(VT.is128BitVector() && "Expected 128-bit vector");
- unsigned NumElts = VT.getVectorNumElements();
- unsigned EltSize = VT.getScalarSizeInBits();
unsigned HalfElts = NumElts / 2;
// Only the bottom 6 bits are valid for each immediate.
@@ -492,11 +449,8 @@ void DecodeEXTRQIMask(MVT VT, int Len, int Idx,
ShuffleMask.push_back(SM_SentinelUndef);
}
-void DecodeINSERTQIMask(MVT VT, int Len, int Idx,
+void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx,
SmallVectorImpl<int> &ShuffleMask) {
- assert(VT.is128BitVector() && "Expected 128-bit vector");
- unsigned NumElts = VT.getVectorNumElements();
- unsigned EltSize = VT.getScalarSizeInBits();
unsigned HalfElts = NumElts / 2;
// Only the bottom 6 bits are valid for each immediate.
@@ -535,33 +489,32 @@ void DecodeINSERTQIMask(MVT VT, int Len, int Idx,
ShuffleMask.push_back(SM_SentinelUndef);
}
-void DecodeVPERMILPMask(MVT VT, ArrayRef<uint64_t> RawMask,
+void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits,
+ ArrayRef<uint64_t> RawMask,
SmallVectorImpl<int> &ShuffleMask) {
- unsigned VecSize = VT.getSizeInBits();
- unsigned EltSize = VT.getScalarSizeInBits();
+ unsigned VecSize = NumElts * ScalarBits;
unsigned NumLanes = VecSize / 128;
- unsigned NumEltsPerLane = VT.getVectorNumElements() / NumLanes;
+ unsigned NumEltsPerLane = NumElts / NumLanes;
assert((VecSize == 128 || VecSize == 256 || VecSize == 512) &&
"Unexpected vector size");
- assert((EltSize == 32 || EltSize == 64) && "Unexpected element size");
+ assert((ScalarBits == 32 || ScalarBits == 64) && "Unexpected element size");
for (unsigned i = 0, e = RawMask.size(); i < e; ++i) {
uint64_t M = RawMask[i];
- M = (EltSize == 64 ? ((M >> 1) & 0x1) : (M & 0x3));
+ M = (ScalarBits == 64 ? ((M >> 1) & 0x1) : (M & 0x3));
unsigned LaneOffset = i & ~(NumEltsPerLane - 1);
ShuffleMask.push_back((int)(LaneOffset + M));
}
}
-void DecodeVPERMIL2PMask(MVT VT, unsigned M2Z, ArrayRef<uint64_t> RawMask,
+void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z,
+ ArrayRef<uint64_t> RawMask,
SmallVectorImpl<int> &ShuffleMask) {
- unsigned VecSize = VT.getSizeInBits();
- unsigned EltSize = VT.getScalarSizeInBits();
+ unsigned VecSize = NumElts * ScalarBits;
unsigned NumLanes = VecSize / 128;
- unsigned NumElts = VT.getVectorNumElements();
unsigned NumEltsPerLane = NumElts / NumLanes;
assert((VecSize == 128 || VecSize == 256) && "Unexpected vector size");
- assert((EltSize == 32 || EltSize == 64) && "Unexpected element size");
+ assert((ScalarBits == 32 || ScalarBits == 64) && "Unexpected element size");
assert((NumElts == RawMask.size()) && "Unexpected mask size");
for (unsigned i = 0, e = RawMask.size(); i < e; ++i) {
@@ -584,7 +537,7 @@ void DecodeVPERMIL2PMask(MVT VT, unsigned M2Z, ArrayRef<uint64_t> RawMask,
}
int Index = i & ~(NumEltsPerLane - 1);
- if (EltSize == 64)
+ if (ScalarBits == 64)
Index += (Selector >> 1) & 0x1;
else
Index += Selector & 0x3;
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h
index 251c9f7558ec..6d13bd58a127 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -23,7 +23,6 @@
namespace llvm {
template <typename T> class ArrayRef;
-class MVT;
enum { SM_SentinelUndef = -1, SM_SentinelZero = -2 };
@@ -32,7 +31,7 @@ void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
// Insert the bottom Len elements from a second source into a vector starting at
// element Idx.
-void DecodeInsertElementMask(MVT VT, unsigned Idx, unsigned Len,
+void DecodeInsertElementMask(unsigned NumElts, unsigned Idx, unsigned Len,
SmallVectorImpl<int> &ShuffleMask);
/// Decode a MOVHLPS instruction as a v2f64/v4f32 shuffle mask.
@@ -43,58 +42,68 @@ void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
/// i.e. <0,2> or <0,1,4,5>
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
-void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask);
-void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask);
-void DecodeMOVDDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask);
-void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+void DecodePSLLDQMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
-void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+void DecodePSRLDQMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
-void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+void DecodePALIGNRMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
-void DecodeVALIGNMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+void DecodeVALIGNMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
/// Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
/// VT indicates the type of the vector allowing it to handle different
/// datatypes and vector widths.
-void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
/// Decodes the shuffle masks for pshufhw.
/// VT indicates the type of the vector allowing it to handle different
/// datatypes and vector widths.
-void DecodePSHUFHWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
/// Decodes the shuffle masks for pshuflw.
/// VT indicates the type of the vector allowing it to handle different
/// datatypes and vector widths.
-void DecodePSHUFLWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
/// Decodes a PSWAPD 3DNow! instruction.
-void DecodePSWAPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+void DecodePSWAPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask);
/// Decodes the shuffle masks for shufp*.
/// VT indicates the type of the vector allowing it to handle different
/// datatypes and vector widths.
-void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
/// Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
/// VT indicates the type of the vector allowing it to handle different
/// datatypes and vector widths.
-void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits,
+ SmallVectorImpl<int> &ShuffleMask);
/// Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
/// VT indicates the type of the vector allowing it to handle different
/// datatypes and vector widths.
-void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits,
+ SmallVectorImpl<int> &ShuffleMask);
/// Decodes a broadcast of the first element of a vector.
-void DecodeVectorBroadcast(MVT DstVT, SmallVectorImpl<int> &ShuffleMask);
+void DecodeVectorBroadcast(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask);
/// Decodes a broadcast of a subvector to a larger vector type.
-void DecodeSubVectorBroadcast(MVT DstVT, MVT SrcVT,
+void DecodeSubVectorBroadcast(unsigned DstNumElts, unsigned SrcNumElts,
SmallVectorImpl<int> &ShuffleMask);
/// Decode a PSHUFB mask from a raw array of constants such as from
@@ -103,18 +112,20 @@ void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask,
SmallVectorImpl<int> &ShuffleMask);
/// Decode a BLEND immediate mask into a shuffle mask.
-void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+void DecodeBLENDMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
-void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
+void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask);
/// Decode a shuffle of packed values at 128-bit granularity
/// immediate mask into a shuffle mask.
-void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm,
- SmallVectorImpl<int> &ShuffleMask);
+void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize,
+ unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
/// Decodes the shuffle masks for VPERMQ/VPERMPD.
-void DecodeVPERMMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+void DecodeVPERMMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
/// Decode a VPPERM mask from a raw array of constants such as from
/// BUILD_VECTOR.
@@ -124,30 +135,33 @@ void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask,
SmallVectorImpl<int> &ShuffleMask);
/// Decode a zero extension instruction as a shuffle mask.
-void DecodeZeroExtendMask(MVT SrcScalarVT, MVT DstVT,
+void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits,
+ unsigned NumDstElts,
SmallVectorImpl<int> &ShuffleMask);
/// Decode a move lower and zero upper instruction as a shuffle mask.
-void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+void DecodeZeroMoveLowMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask);
/// Decode a scalar float move instruction as a shuffle mask.
-void DecodeScalarMoveMask(MVT VT, bool IsLoad,
+void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad,
SmallVectorImpl<int> &ShuffleMask);
/// Decode a SSE4A EXTRQ instruction as a shuffle mask.
-void DecodeEXTRQIMask(MVT VT, int Len, int Idx,
+void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx,
SmallVectorImpl<int> &ShuffleMask);
/// Decode a SSE4A INSERTQ instruction as a shuffle mask.
-void DecodeINSERTQIMask(MVT VT, int Len, int Idx,
+void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx,
SmallVectorImpl<int> &ShuffleMask);
/// Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
-void DecodeVPERMILPMask(MVT VT, ArrayRef<uint64_t> RawMask,
+void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits,
+ ArrayRef<uint64_t> RawMask,
SmallVectorImpl<int> &ShuffleMask);
/// Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
-void DecodeVPERMIL2PMask(MVT VT, unsigned M2Z, ArrayRef<uint64_t> RawMask,
+void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z,
+ ArrayRef<uint64_t> RawMask,
SmallVectorImpl<int> &ShuffleMask);
/// Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index 5631648d2dc8..73bb0f2af285 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -22,6 +22,7 @@ namespace llvm {
class FunctionPass;
class ImmutablePass;
class InstructionSelector;
+class ModulePass;
class PassRegistry;
class X86RegisterBankInfo;
class X86Subtarget;
@@ -49,6 +50,15 @@ FunctionPass *createX86FloatingPointStackifierPass();
/// transition penalty between functions encoded with AVX and SSE.
FunctionPass *createX86IssueVZeroUpperPass();
+/// This pass instruments the function prolog to save the return address to a
+/// 'shadow call stack' and the function epilog to check that the return address
+/// did not change during function execution.
+FunctionPass *createShadowCallStackPass();
+
+/// This pass inserts ENDBR instructions before indirect jump/call
+/// destinations as part of the CET IBT mechanism.
+FunctionPass *createX86IndirectBranchTrackingPass();
+
/// Return a pass that pads short functions with NOOPs.
/// This will prevent a stall when returning on the Atom.
FunctionPass *createX86PadShortFunctions();
@@ -65,6 +75,12 @@ FunctionPass *createX86OptimizeLEAs();
/// Return a pass that transforms setcc + movzx pairs into xor + setcc.
FunctionPass *createX86FixupSetCC();
+/// Return a pass that avoids creating store forward block issues in the hardware.
+FunctionPass *createX86AvoidStoreForwardingBlocks();
+
+/// Return a pass that lowers EFLAGS copy pseudo instructions.
+FunctionPass *createX86FlagsCopyLoweringPass();
+
/// Return a pass that expands WinAlloca pseudo-instructions.
FunctionPass *createX86WinAllocaExpander();
@@ -102,12 +118,17 @@ void initializeFixupBWInstPassPass(PassRegistry &);
/// encoding when possible in order to reduce code size.
FunctionPass *createX86EvexToVexInsts();
+/// This pass creates the thunks for the retpoline feature.
+FunctionPass *createX86RetpolineThunksPass();
+
InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM,
X86Subtarget &,
X86RegisterBankInfo &);
void initializeEvexToVexInstPassPass(PassRegistry &);
+FunctionPass *createX86SpeculativeLoadHardeningPass();
+
} // End llvm namespace
#endif
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index ba998467b799..63c2dc4da6cc 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -34,6 +34,9 @@ def Mode16Bit : SubtargetFeature<"16bit-mode", "In16BitMode", "true",
def FeatureX87 : SubtargetFeature<"x87","HasX87", "true",
"Enable X87 float instructions">;
+def FeatureNOPL : SubtargetFeature<"nopl", "HasNOPL", "true",
+ "Enable NOPL instruction">;
+
def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true",
"Enable conditional move instructions">;
@@ -215,8 +218,6 @@ def FeatureSHA : SubtargetFeature<"sha", "HasSHA", "true",
[FeatureSSE2]>;
def FeatureSHSTK : SubtargetFeature<"shstk", "HasSHSTK", "true",
"Support CET Shadow-Stack instructions">;
-def FeatureIBT : SubtargetFeature<"ibt", "HasIBT", "true",
- "Support CET Indirect-Branch-Tracking instructions">;
def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true",
"Support PRFCHW instructions">;
def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true",
@@ -227,6 +228,10 @@ def FeatureMWAITX : SubtargetFeature<"mwaitx", "HasMWAITX", "true",
"Enable MONITORX/MWAITX timer functionality">;
def FeatureCLZERO : SubtargetFeature<"clzero", "HasCLZERO", "true",
"Enable Cache Line Zero">;
+def FeatureCLDEMOTE : SubtargetFeature<"cldemote", "HasCLDEMOTE", "true",
+ "Enable Cache Demote">;
+def FeaturePTWRITE : SubtargetFeature<"ptwrite", "HasPTWRITE", "true",
+ "Support ptwrite instruction">;
def FeatureMPX : SubtargetFeature<"mpx", "HasMPX", "true",
"Support MPX instructions">;
def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
@@ -240,12 +245,20 @@ def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divl",
def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
"PadShortFunctions", "true",
"Pad short functions">;
+def FeatureINVPCID : SubtargetFeature<"invpcid", "HasINVPCID", "true",
+ "Invalidate Process-Context Identifier">;
def FeatureSGX : SubtargetFeature<"sgx", "HasSGX", "true",
"Enable Software Guard Extensions">;
def FeatureCLFLUSHOPT : SubtargetFeature<"clflushopt", "HasCLFLUSHOPT", "true",
"Flush A Cache Line Optimized">;
def FeatureCLWB : SubtargetFeature<"clwb", "HasCLWB", "true",
"Cache Line Write Back">;
+def FeatureWBNOINVD : SubtargetFeature<"wbnoinvd", "HasWBNOINVD", "true",
+ "Write Back No Invalidate">;
+def FeatureRDPID : SubtargetFeature<"rdpid", "HasRDPID", "true",
+ "Support RDPID instructions">;
+def FeatureWAITPKG : SubtargetFeature<"waitpkg", "HasWAITPKG", "true",
+ "Wait and pause enhancements">;
// On some processors, instructions that implicitly take two memory operands are
// slow. In practice, this means that CALL, PUSH, and POP with memory operands
// should be avoided in favor of a MOV + register CALL/PUSH/POP.
@@ -263,6 +276,14 @@ def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
def FeatureSoftFloat
: SubtargetFeature<"soft-float", "UseSoftFloat", "true",
"Use software floating point features.">;
+def FeaturePOPCNTFalseDeps : SubtargetFeature<"false-deps-popcnt",
+ "HasPOPCNTFalseDeps", "true",
+ "POPCNT has a false dependency on dest register">;
+def FeatureLZCNTFalseDeps : SubtargetFeature<"false-deps-lzcnt-tzcnt",
+ "HasLZCNTFalseDeps", "true",
+ "LZCNT/TZCNT have a false dependency on dest register">;
+def FeaturePCONFIG : SubtargetFeature<"pconfig", "HasPCONFIG", "true",
+ "platform configuration instruction">;
// On recent X86 (port bound) processors, it's preferable to combine to a single shuffle
// using a variable mask over multiple fixed shuffles.
def FeatureFastVariableShuffle
@@ -294,8 +315,16 @@ def FeatureFastLZCNT
: SubtargetFeature<
"fast-lzcnt", "HasFastLZCNT", "true",
"LZCNT instructions are as fast as most simple integer ops">;
-
-
+// If the target can efficiently decode NOPs up to 11 bytes in length.
+def FeatureFast11ByteNOP
+ : SubtargetFeature<
+ "fast-11bytenop", "HasFast11ByteNOP", "true",
+ "Target can quickly decode up to 11 byte NOPs">;
+// If the target can efficiently decode NOPs up to 15 bytes in length.
+def FeatureFast15ByteNOP
+ : SubtargetFeature<
+ "fast-15bytenop", "HasFast15ByteNOP", "true",
+ "Target can quickly decode up to 15 byte NOPs">;
// Sandy Bridge and newer processors can use SHLD with the same source on both
// inputs to implement rotate to avoid the partial flag update of the normal
// rotate instructions.
@@ -329,6 +358,37 @@ def FeatureHasFastGather
: SubtargetFeature<"fast-gather", "HasFastGather", "true",
"Indicates if gather is reasonably fast.">;
+def FeaturePrefer256Bit
+ : SubtargetFeature<"prefer-256-bit", "Prefer256Bit", "true",
+ "Prefer 256-bit AVX instructions">;
+
+// Enable mitigation of some aspects of speculative execution related
+// vulnerabilities by removing speculatable indirect branches. This disables
+// jump-table formation, rewrites explicit `indirectbr` instructions into
+// `switch` instructions, and uses a special construct called a "retpoline" to
+// prevent speculation of the remaining indirect branches (indirect calls and
+// tail calls).
+def FeatureRetpoline
+ : SubtargetFeature<"retpoline", "UseRetpoline", "true",
+ "Remove speculation of indirect branches from the "
+ "generated code, either by avoiding them entirely or "
+ "lowering them with a speculation blocking construct.">;
+
+// Rely on external thunks for the emitted retpoline calls. This allows users
+// to provide their own custom thunk definitions in highly specialized
+// environments such as a kernel that does boot-time hot patching.
+def FeatureRetpolineExternalThunk
+ : SubtargetFeature<
+ "retpoline-external-thunk", "UseRetpolineExternalThunk", "true",
+ "Enable retpoline, but with an externally provided thunk.",
+ [FeatureRetpoline]>;
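+
+// Illustrative sketch (the thunk and label names here are only examples) of
+// the construct an indirect call through %r11 is redirected to when retpoline
+// lowering is enabled:
+//   __llvm_retpoline_r11:
+//     callq .Lsetup
+//   .Lspec_trap:
+//     pause
+//     lfence
+//     jmp .Lspec_trap
+//   .Lsetup:
+//     movq %r11, (%rsp)
+//     retq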
+
+// Direct Move instructions.
+def FeatureMOVDIRI : SubtargetFeature<"movdiri", "HasMOVDIRI", "true",
+ "Support movdiri instruction">;
+def FeatureMOVDIR64B : SubtargetFeature<"movdir64b", "HasMOVDIR64B", "true",
+ "Support movdir64b instruction">;
+
//===----------------------------------------------------------------------===//
// Register File Description
//===----------------------------------------------------------------------===//
@@ -342,6 +402,7 @@ include "X86RegisterBanks.td"
include "X86Schedule.td"
include "X86InstrInfo.td"
+include "X86SchedPredicates.td"
def X86InstrInfo : InstrInfo;
@@ -365,6 +426,10 @@ def ProcIntelSLM : SubtargetFeature<"slm", "X86ProcFamily", "IntelSLM",
"Intel Silvermont processors">;
def ProcIntelGLM : SubtargetFeature<"glm", "X86ProcFamily", "IntelGLM",
"Intel Goldmont processors">;
+def ProcIntelGLP : SubtargetFeature<"glp", "X86ProcFamily", "IntelGLP",
+ "Intel Goldmont Plus processors">;
+def ProcIntelTRM : SubtargetFeature<"tremont", "X86ProcFamily", "IntelTRM",
+ "Intel Tremont processors">;
def ProcIntelHSW : SubtargetFeature<"haswell", "X86ProcFamily",
"IntelHaswell", "Intel Haswell processors">;
def ProcIntelBDW : SubtargetFeature<"broadwell", "X86ProcFamily",
@@ -377,8 +442,10 @@ def ProcIntelSKX : SubtargetFeature<"skx", "X86ProcFamily",
"IntelSKX", "Intel Skylake Server processors">;
def ProcIntelCNL : SubtargetFeature<"cannonlake", "X86ProcFamily",
"IntelCannonlake", "Intel Cannonlake processors">;
-def ProcIntelICL : SubtargetFeature<"icelake", "X86ProcFamily",
- "IntelIcelake", "Intel Icelake processors">;
+def ProcIntelICL : SubtargetFeature<"icelake-client", "X86ProcFamily",
+ "IntelIcelakeClient", "Intel Icelake processors">;
+def ProcIntelICX : SubtargetFeature<"icelake-server", "X86ProcFamily",
+ "IntelIcelakeServer", "Intel Icelake Server processors">;
class Proc<string Name, list<SubtargetFeature> Features>
: ProcessorModel<Name, GenericModel, Features>;
@@ -390,16 +457,16 @@ def : Proc<"i586", [FeatureX87, FeatureSlowUAMem16]>;
def : Proc<"pentium", [FeatureX87, FeatureSlowUAMem16]>;
def : Proc<"pentium-mmx", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>;
-foreach P = ["i686", "pentiumpro"] in {
- def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMOV]>;
-}
+def : Proc<"i686", [FeatureX87, FeatureSlowUAMem16, FeatureCMOV]>;
+def : Proc<"pentiumpro", [FeatureX87, FeatureSlowUAMem16, FeatureCMOV,
+ FeatureNOPL]>;
def : Proc<"pentium2", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
- FeatureCMOV, FeatureFXSR]>;
+ FeatureCMOV, FeatureFXSR, FeatureNOPL]>;
foreach P = ["pentium3", "pentium3m"] in {
def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE1,
- FeatureFXSR]>;
+ FeatureFXSR, FeatureNOPL]>;
}
// Enable the PostRAScheduler for SSE2 and SSE3 class cpus.
@@ -414,12 +481,12 @@ foreach P = ["pentium3", "pentium3m"] in {
def : ProcessorModel<"pentium-m", GenericPostRAModel,
[FeatureX87, FeatureSlowUAMem16, FeatureMMX,
- FeatureSSE2, FeatureFXSR]>;
+ FeatureSSE2, FeatureFXSR, FeatureNOPL]>;
foreach P = ["pentium4", "pentium4m"] in {
def : ProcessorModel<P, GenericPostRAModel,
[FeatureX87, FeatureSlowUAMem16, FeatureMMX,
- FeatureSSE2, FeatureFXSR]>;
+ FeatureSSE2, FeatureFXSR, FeatureNOPL]>;
}
// Intel Quark.
@@ -428,18 +495,19 @@ def : Proc<"lakemont", []>;
// Intel Core Duo.
def : ProcessorModel<"yonah", SandyBridgeModel,
[FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3,
- FeatureFXSR]>;
+ FeatureFXSR, FeatureNOPL]>;
// NetBurst.
def : ProcessorModel<"prescott", GenericPostRAModel,
[FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3,
- FeatureFXSR]>;
+ FeatureFXSR, FeatureNOPL]>;
def : ProcessorModel<"nocona", GenericPostRAModel, [
FeatureX87,
FeatureSlowUAMem16,
FeatureMMX,
FeatureSSE3,
FeatureFXSR,
+ FeatureNOPL,
FeatureCMPXCHG16B
]>;
@@ -450,6 +518,7 @@ def : ProcessorModel<"core2", SandyBridgeModel, [
FeatureMMX,
FeatureSSSE3,
FeatureFXSR,
+ FeatureNOPL,
FeatureCMPXCHG16B,
FeatureLAHFSAHF,
FeatureMacroFusion
@@ -460,6 +529,7 @@ def : ProcessorModel<"penryn", SandyBridgeModel, [
FeatureMMX,
FeatureSSE41,
FeatureFXSR,
+ FeatureNOPL,
FeatureCMPXCHG16B,
FeatureLAHFSAHF,
FeatureMacroFusion
@@ -473,6 +543,7 @@ class BonnellProc<string Name> : ProcessorModel<Name, AtomModel, [
FeatureMMX,
FeatureSSSE3,
FeatureFXSR,
+ FeatureNOPL,
FeatureCMPXCHG16B,
FeatureMOVBE,
FeatureLEAForSP,
@@ -492,6 +563,7 @@ class SilvermontProc<string Name> : ProcessorModel<Name, SLMModel, [
FeatureMMX,
FeatureSSE42,
FeatureFXSR,
+ FeatureNOPL,
FeatureCMPXCHG16B,
FeatureMOVBE,
FeaturePOPCNT,
@@ -503,17 +575,29 @@ class SilvermontProc<string Name> : ProcessorModel<Name, SLMModel, [
FeatureSlowLEA,
FeatureSlowIncDec,
FeatureSlowPMULLD,
- FeatureLAHFSAHF
+ FeatureRDRAND,
+ FeatureLAHFSAHF,
+ FeaturePOPCNTFalseDeps
]>;
def : SilvermontProc<"silvermont">;
def : SilvermontProc<"slm">; // Legacy alias.
-class GoldmontProc<string Name> : ProcessorModel<Name, SLMModel, [
- ProcIntelGLM,
+class ProcessorFeatures<list<SubtargetFeature> Inherited,
+ list<SubtargetFeature> NewFeatures> {
+ list<SubtargetFeature> Value = !listconcat(Inherited, NewFeatures);
+}
+
+class ProcModel<string Name, SchedMachineModel Model,
+ list<SubtargetFeature> ProcFeatures,
+ list<SubtargetFeature> OtherFeatures> :
+ ProcessorModel<Name, Model, !listconcat(ProcFeatures, OtherFeatures)>;
+
+def GLMFeatures : ProcessorFeatures<[], [
FeatureX87,
FeatureMMX,
FeatureSSE42,
FeatureFXSR,
+ FeatureNOPL,
FeatureCMPXCHG16B,
FeatureMOVBE,
FeaturePOPCNT,
@@ -535,14 +619,44 @@ class GoldmontProc<string Name> : ProcessorModel<Name, SLMModel, [
FeatureCLFLUSHOPT,
FeatureFSGSBase
]>;
+
+class GoldmontProc<string Name> : ProcModel<Name, SLMModel,
+ GLMFeatures.Value, [
+ ProcIntelGLM,
+ FeaturePOPCNTFalseDeps
+]>;
def : GoldmontProc<"goldmont">;
+def GLPFeatures : ProcessorFeatures<GLMFeatures.Value, [
+ FeaturePTWRITE,
+ FeatureRDPID,
+ FeatureSGX
+]>;
+
+class GoldmontPlusProc<string Name> : ProcModel<Name, SLMModel,
+ GLPFeatures.Value, [
+ ProcIntelGLP
+]>;
+def : GoldmontPlusProc<"goldmont-plus">;
+
+class TremontProc<string Name> : ProcModel<Name, SLMModel,
+ GLPFeatures.Value, [
+ ProcIntelTRM,
+ FeatureCLDEMOTE,
+ FeatureGFNI,
+ FeatureMOVDIRI,
+ FeatureMOVDIR64B,
+ FeatureWAITPKG
+]>;
+def : TremontProc<"tremont">;
+
// "Arrandale" along with corei3 and corei5
class NehalemProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
FeatureX87,
FeatureMMX,
FeatureSSE42,
FeatureFXSR,
+ FeatureNOPL,
FeatureCMPXCHG16B,
FeaturePOPCNT,
FeatureLAHFSAHF,
@@ -558,6 +672,7 @@ class WestmereProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
FeatureMMX,
FeatureSSE42,
FeatureFXSR,
+ FeatureNOPL,
FeatureCMPXCHG16B,
FeaturePOPCNT,
FeatureAES,
@@ -567,16 +682,6 @@ class WestmereProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
]>;
def : WestmereProc<"westmere">;
-class ProcessorFeatures<list<SubtargetFeature> Inherited,
- list<SubtargetFeature> NewFeatures> {
- list<SubtargetFeature> Value = !listconcat(Inherited, NewFeatures);
-}
-
-class ProcModel<string Name, SchedMachineModel Model,
- list<SubtargetFeature> ProcFeatures,
- list<SubtargetFeature> OtherFeatures> :
- ProcessorModel<Name, Model, !listconcat(ProcFeatures, OtherFeatures)>;
-
// SSE is not listed here since llvm treats AVX as a reimplementation of SSE,
// rather than a superset.
def SNBFeatures : ProcessorFeatures<[], [
@@ -584,6 +689,7 @@ def SNBFeatures : ProcessorFeatures<[], [
FeatureMMX,
FeatureAVX,
FeatureFXSR,
+ FeatureNOPL,
FeatureCMPXCHG16B,
FeaturePOPCNT,
FeatureAES,
@@ -601,7 +707,8 @@ def SNBFeatures : ProcessorFeatures<[], [
class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
SNBFeatures.Value, [
- FeatureSlowUAMem32
+ FeatureSlowUAMem32,
+ FeaturePOPCNTFalseDeps
]>;
def : SandyBridgeProc<"sandybridge">;
def : SandyBridgeProc<"corei7-avx">; // Legacy alias.
@@ -614,7 +721,8 @@ def IVBFeatures : ProcessorFeatures<SNBFeatures.Value, [
class IvyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
IVBFeatures.Value, [
- FeatureSlowUAMem32
+ FeatureSlowUAMem32,
+ FeaturePOPCNTFalseDeps
]>;
def : IvyBridgeProc<"ivybridge">;
def : IvyBridgeProc<"core-avx-i">; // Legacy alias.
@@ -625,6 +733,7 @@ def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [
FeatureBMI2,
FeatureERMSB,
FeatureFMA,
+ FeatureINVPCID,
FeatureLZCNT,
FeatureMOVBE,
FeatureFastVariableShuffle
@@ -632,7 +741,9 @@ def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [
class HaswellProc<string Name> : ProcModel<Name, HaswellModel,
HSWFeatures.Value, [
- ProcIntelHSW
+ ProcIntelHSW,
+ FeaturePOPCNTFalseDeps,
+ FeatureLZCNTFalseDeps
]>;
def : HaswellProc<"haswell">;
def : HaswellProc<"core-avx2">; // Legacy alias.
@@ -644,7 +755,9 @@ def BDWFeatures : ProcessorFeatures<HSWFeatures.Value, [
]>;
class BroadwellProc<string Name> : ProcModel<Name, BroadwellModel,
BDWFeatures.Value, [
- ProcIntelBDW
+ ProcIntelBDW,
+ FeaturePOPCNTFalseDeps,
+ FeatureLZCNTFalseDeps
]>;
def : BroadwellProc<"broadwell">;
@@ -653,7 +766,6 @@ def SKLFeatures : ProcessorFeatures<BDWFeatures.Value, [
FeatureRTM,
FeatureXSAVEC,
FeatureXSAVES,
- FeatureSGX,
FeatureCLFLUSHOPT,
FeatureFastVectorFSQRT
]>;
@@ -661,7 +773,9 @@ def SKLFeatures : ProcessorFeatures<BDWFeatures.Value, [
class SkylakeClientProc<string Name> : ProcModel<Name, SkylakeClientModel,
SKLFeatures.Value, [
ProcIntelSKL,
- FeatureHasFastGather
+ FeatureHasFastGather,
+ FeaturePOPCNTFalseDeps,
+ FeatureSGX
]>;
def : SkylakeClientProc<"skylake">;
@@ -714,15 +828,23 @@ def SKXFeatures : ProcessorFeatures<SKLFeatures.Value, [
class SkylakeServerProc<string Name> : ProcModel<Name, SkylakeServerModel,
SKXFeatures.Value, [
ProcIntelSKX,
- FeatureHasFastGather
+ FeatureHasFastGather,
+ FeaturePOPCNTFalseDeps
]>;
def : SkylakeServerProc<"skylake-avx512">;
def : SkylakeServerProc<"skx">; // Legacy alias.
-def CNLFeatures : ProcessorFeatures<SKXFeatures.Value, [
+def CNLFeatures : ProcessorFeatures<SKLFeatures.Value, [
+ FeatureAVX512,
+ FeatureCDI,
+ FeatureDQI,
+ FeatureBWI,
+ FeatureVLX,
+ FeaturePKU,
FeatureVBMI,
FeatureIFMA,
- FeatureSHA
+ FeatureSHA,
+ FeatureSGX
]>;
class CannonlakeProc<string Name> : ProcModel<Name, SkylakeServerModel,
@@ -740,15 +862,25 @@ def ICLFeatures : ProcessorFeatures<CNLFeatures.Value, [
FeatureVPCLMULQDQ,
FeatureVPOPCNTDQ,
FeatureGFNI,
- FeatureCLWB
+ FeatureCLWB,
+ FeatureRDPID
]>;
-class IcelakeProc<string Name> : ProcModel<Name, SkylakeServerModel,
- ICLFeatures.Value, [
+class IcelakeClientProc<string Name> : ProcModel<Name, SkylakeServerModel,
+ ICLFeatures.Value, [
ProcIntelICL,
FeatureHasFastGather
]>;
-def : IcelakeProc<"icelake">;
+def : IcelakeClientProc<"icelake-client">;
+
+class IcelakeServerProc<string Name> : ProcModel<Name, SkylakeServerModel,
+ ICLFeatures.Value, [
+ ProcIntelICX,
+ FeaturePCONFIG,
+ FeatureWBNOINVD,
+ FeatureHasFastGather
+]>;
+def : IcelakeServerProc<"icelake-server">;
// AMD CPUs.
@@ -757,27 +889,28 @@ def : Proc<"k6-2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
def : Proc<"k6-3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
foreach P = ["athlon", "athlon-tbird"] in {
- def : Proc<P, [FeatureX87, FeatureSlowUAMem16, Feature3DNowA, FeatureSlowSHLD]>;
+ def : Proc<P, [FeatureX87, FeatureSlowUAMem16, Feature3DNowA,
+ FeatureNOPL, FeatureSlowSHLD]>;
}
foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in {
def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureSSE1,
- Feature3DNowA, FeatureFXSR, FeatureSlowSHLD]>;
+ Feature3DNowA, FeatureFXSR, FeatureNOPL, FeatureSlowSHLD]>;
}
foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in {
def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA,
- FeatureFXSR, Feature64Bit, FeatureSlowSHLD]>;
+ FeatureFXSR, FeatureNOPL, Feature64Bit, FeatureSlowSHLD]>;
}
foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in {
def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA,
- FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowSHLD]>;
+ FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B, FeatureSlowSHLD]>;
}
foreach P = ["amdfam10", "barcelona"] in {
def : Proc<P, [FeatureX87, FeatureSSE4A, Feature3DNowA, FeatureFXSR,
- FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT,
+ FeatureNOPL, FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT,
FeatureSlowSHLD, FeatureLAHFSAHF]>;
}
@@ -788,12 +921,14 @@ def : Proc<"btver1", [
FeatureSSSE3,
FeatureSSE4A,
FeatureFXSR,
+ FeatureNOPL,
FeatureCMPXCHG16B,
FeaturePRFCHW,
FeatureLZCNT,
FeaturePOPCNT,
FeatureSlowSHLD,
- FeatureLAHFSAHF
+ FeatureLAHFSAHF,
+ FeatureFast15ByteNOP
]>;
// Jaguar
@@ -802,6 +937,7 @@ def : ProcessorModel<"btver2", BtVer2Model, [
FeatureMMX,
FeatureAVX,
FeatureFXSR,
+ FeatureNOPL,
FeatureSSE4A,
FeatureCMPXCHG16B,
FeaturePRFCHW,
@@ -817,6 +953,7 @@ def : ProcessorModel<"btver2", BtVer2Model, [
FeatureXSAVEOPT,
FeatureSlowSHLD,
FeatureLAHFSAHF,
+ FeatureFast15ByteNOP,
FeatureFastPartialYMMorZMMWrite
]>;
@@ -832,6 +969,7 @@ def : Proc<"bdver1", [
FeatureMMX,
FeatureAVX,
FeatureFXSR,
+ FeatureNOPL,
FeatureSSE4A,
FeatureLZCNT,
FeaturePOPCNT,
@@ -839,6 +977,7 @@ def : Proc<"bdver1", [
FeatureLWP,
FeatureSlowSHLD,
FeatureLAHFSAHF,
+ FeatureFast11ByteNOP,
FeatureMacroFusion
]>;
// Piledriver
@@ -853,6 +992,7 @@ def : Proc<"bdver2", [
FeatureMMX,
FeatureAVX,
FeatureFXSR,
+ FeatureNOPL,
FeatureSSE4A,
FeatureF16C,
FeatureLZCNT,
@@ -864,6 +1004,7 @@ def : Proc<"bdver2", [
FeatureFMA,
FeatureSlowSHLD,
FeatureLAHFSAHF,
+ FeatureFast11ByteNOP,
FeatureMacroFusion
]>;
@@ -879,6 +1020,7 @@ def : Proc<"bdver3", [
FeatureMMX,
FeatureAVX,
FeatureFXSR,
+ FeatureNOPL,
FeatureSSE4A,
FeatureF16C,
FeatureLZCNT,
@@ -892,6 +1034,7 @@ def : Proc<"bdver3", [
FeatureSlowSHLD,
FeatureFSGSBase,
FeatureLAHFSAHF,
+ FeatureFast11ByteNOP,
FeatureMacroFusion
]>;
@@ -901,6 +1044,7 @@ def : Proc<"bdver4", [
FeatureMMX,
FeatureAVX2,
FeatureFXSR,
+ FeatureNOPL,
FeatureXOP,
FeatureFMA4,
FeatureCMPXCHG16B,
@@ -920,6 +1064,7 @@ def : Proc<"bdver4", [
FeatureSlowSHLD,
FeatureFSGSBase,
FeatureLAHFSAHF,
+ FeatureFast11ByteNOP,
FeatureMWAITX,
FeatureMacroFusion
]>;
@@ -938,9 +1083,11 @@ def: ProcessorModel<"znver1", Znver1Model, [
FeatureFMA,
FeatureFSGSBase,
FeatureFXSR,
+ FeatureNOPL,
FeatureFastLZCNT,
FeatureLAHFSAHF,
FeatureLZCNT,
+ FeatureFast15ByteNOP,
FeatureMacroFusion,
FeatureMMX,
FeatureMOVBE,
@@ -982,6 +1129,7 @@ def : ProcessorModel<"x86-64", SandyBridgeModel, [
FeatureMMX,
FeatureSSE2,
FeatureFXSR,
+ FeatureNOPL,
Feature64Bit,
FeatureSlow3OpsLEA,
FeatureSlowIncDec,
@@ -1045,4 +1193,11 @@ def X86 : Target {
let InstructionSet = X86InstrInfo;
let AssemblyParserVariants = [ATTAsmParserVariant, IntelAsmParserVariant];
let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter];
+ let AllowRegisterRenaming = 1;
}
+
+//===----------------------------------------------------------------------===//
+// Pfm Counters
+//===----------------------------------------------------------------------===//
+
+include "X86PfmCounters.td"
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index 71526dd77f11..7d8f7b9dfe46 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -19,9 +19,9 @@
#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Mangler.h"
@@ -31,11 +31,13 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -370,6 +372,14 @@ static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI,
static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
char Mode, raw_ostream &O) {
unsigned Reg = MO.getReg();
+ bool EmitPercent = true;
+
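+ // getX86SubSuperRegister below only handles general-purpose registers, so
+ // bail out for anything else.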
+ if (!X86::GR8RegClass.contains(Reg) &&
+ !X86::GR16RegClass.contains(Reg) &&
+ !X86::GR32RegClass.contains(Reg) &&
+ !X86::GR64RegClass.contains(Reg))
+ return true;
+
switch (Mode) {
default: return true; // Unknown mode.
case 'b': // Print QImode register
@@ -384,6 +394,9 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
case 'k': // Print SImode register
Reg = getX86SubSuperRegister(Reg, 32);
break;
+ case 'V':
+ EmitPercent = false;
+ LLVM_FALLTHROUGH;
case 'q':
// Print 64-bit register names if 64-bit integer registers are available.
// Otherwise, print 32-bit register names.
@@ -391,7 +404,10 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
break;
}
- O << '%' << X86ATTInstPrinter::getRegisterName(Reg);
+ if (EmitPercent)
+ O << '%';
+
+ O << X86ATTInstPrinter::getRegisterName(Reg);
return false;
}
@@ -464,6 +480,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
case 'w': // Print HImode register
case 'k': // Print SImode register
case 'q': // Print DImode register
+ case 'V': // Print native register without '%'
if (MO.isReg())
return printAsmMRegister(*this, MO, ExtraCode[0], O);
printOperand(*this, MI, OpNo, O);
@@ -473,7 +490,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
printPCRelImm(*this, MI, OpNo, O);
return false;
- case 'n': // Negate the immediate or print a '-' before the operand.
+ case 'n': // Negate the immediate or print a '-' before the operand.
// Note: this is a temporary solution. It should be handled target
// independently as part of the 'MC' work.
if (MO.isImm()) {
@@ -524,6 +541,42 @@ bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
const Triple &TT = TM.getTargetTriple();
+ if (TT.isOSBinFormatELF()) {
+ // Assemble feature flags that may require creation of a note section.
+ unsigned FeatureFlagsAnd = 0;
+ if (M.getModuleFlag("cf-protection-branch"))
+ FeatureFlagsAnd |= ELF::GNU_PROPERTY_X86_FEATURE_1_IBT;
+ if (M.getModuleFlag("cf-protection-return"))
+ FeatureFlagsAnd |= ELF::GNU_PROPERTY_X86_FEATURE_1_SHSTK;
+
+ if (FeatureFlagsAnd) {
+ // Emit a .note.gnu.property section with the flags.
+ if (!TT.isArch32Bit() && !TT.isArch64Bit())
+ llvm_unreachable("CFProtection used on invalid architecture!");
+ MCSection *Cur = OutStreamer->getCurrentSectionOnly();
+ MCSection *Nt = MMI->getContext().getELFSection(
+ ".note.gnu.property", ELF::SHT_NOTE, ELF::SHF_ALLOC);
+ OutStreamer->SwitchSection(Nt);
+
+ // Emitting note header.
+ int WordSize = TT.isArch64Bit() ? 8 : 4;
+ EmitAlignment(WordSize == 4 ? 2 : 3);
+ OutStreamer->EmitIntValue(4, 4 /*size*/); // data size for "GNU\0"
+ OutStreamer->EmitIntValue(8 + WordSize, 4 /*size*/); // Elf_Prop size
+ OutStreamer->EmitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4 /*size*/);
+ OutStreamer->EmitBytes(StringRef("GNU", 4)); // note name
+
+ // Emitting an Elf_Prop for the CET properties.
+ OutStreamer->EmitIntValue(ELF::GNU_PROPERTY_X86_FEATURE_1_AND, 4);
+ OutStreamer->EmitIntValue(WordSize, 4); // data size
+ OutStreamer->EmitIntValue(FeatureFlagsAnd, WordSize); // data
+ EmitAlignment(WordSize == 4 ? 2 : 3); // padding
+
+ OutStreamer->endSection(Nt);
+ OutStreamer->SwitchSection(Cur);
+ }
+ }
+
if (TT.isOSBinFormatMachO())
OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
@@ -578,64 +631,48 @@ emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel,
4 /*size*/);
}
-MCSymbol *X86AsmPrinter::GetCPISymbol(unsigned CPID) const {
- if (Subtarget->isTargetKnownWindowsMSVC()) {
- const MachineConstantPoolEntry &CPE =
- MF->getConstantPool()->getConstants()[CPID];
- if (!CPE.isMachineConstantPoolEntry()) {
- const DataLayout &DL = MF->getDataLayout();
- SectionKind Kind = CPE.getSectionKind(&DL);
- const Constant *C = CPE.Val.ConstVal;
- unsigned Align = CPE.Alignment;
- if (const MCSectionCOFF *S = dyn_cast<MCSectionCOFF>(
- getObjFileLowering().getSectionForConstant(DL, Kind, C, Align))) {
- if (MCSymbol *Sym = S->getCOMDATSymbol()) {
- if (Sym->isUndefined())
- OutStreamer->EmitSymbolAttribute(Sym, MCSA_Global);
- return Sym;
- }
- }
- }
- }
+static void emitNonLazyStubs(MachineModuleInfo *MMI, MCStreamer &OutStreamer) {
+
+ MachineModuleInfoMachO &MMIMacho =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>();
+
+ // Output stubs for dynamically-linked functions.
+ MachineModuleInfoMachO::SymbolListTy Stubs;
+
+ // Output stubs for external and common global variables.
+ Stubs = MMIMacho.GetGVStubList();
+ if (!Stubs.empty()) {
+ OutStreamer.SwitchSection(MMI->getContext().getMachOSection(
+ "__IMPORT", "__pointers", MachO::S_NON_LAZY_SYMBOL_POINTERS,
+ SectionKind::getMetadata()));
- return AsmPrinter::GetCPISymbol(CPID);
+ for (auto &Stub : Stubs)
+ emitNonLazySymbolPointer(OutStreamer, Stub.first, Stub.second);
+
+ Stubs.clear();
+ OutStreamer.AddBlankLine();
+ }
}
void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
const Triple &TT = TM.getTargetTriple();
if (TT.isOSBinFormatMachO()) {
- // All darwin targets use mach-o.
- MachineModuleInfoMachO &MMIMacho =
- MMI->getObjFileInfo<MachineModuleInfoMachO>();
-
- // Output stubs for dynamically-linked functions.
- MachineModuleInfoMachO::SymbolListTy Stubs;
-
- // Output stubs for external and common global variables.
- Stubs = MMIMacho.GetGVStubList();
- if (!Stubs.empty()) {
- MCSection *TheSection = OutContext.getMachOSection(
- "__IMPORT", "__pointers", MachO::S_NON_LAZY_SYMBOL_POINTERS,
- SectionKind::getMetadata());
- OutStreamer->SwitchSection(TheSection);
-
- for (auto &Stub : Stubs)
- emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second);
-
- Stubs.clear();
- OutStreamer->AddBlankLine();
- }
+ // Mach-O uses non-lazy symbol stubs to encode per-TU information into a
+ // global table for symbol lookup.
+ emitNonLazyStubs(MMI, *OutStreamer);
+ // Emit stack and fault map information.
SM.serializeToStackMapSection();
FM.serializeToFaultMapSection();
- // Funny Darwin hack: This flag tells the linker that no global symbols
- // contain code that falls through to other global symbols (e.g. the obvious
- // implementation of multiple entry points). If this doesn't occur, the
- // linker can safely perform dead code stripping. Since LLVM never
- // generates code that does this, it is always safe to set.
+ // This flag tells the linker that no global symbols contain code that falls
+ // through to other global symbols (e.g. an implementation of multiple entry
+ // points). If this doesn't occur, the linker can safely perform dead code
+ // stripping. Since LLVM never generates code that does this, it is always
+ // safe to set.
OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+ return;
}
if (TT.isKnownWindowsMSVCEnvironment() && MMI->usesVAFloatArgument()) {
@@ -643,36 +680,18 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
(TT.getArch() == Triple::x86_64) ? "_fltused" : "__fltused";
MCSymbol *S = MMI->getContext().getOrCreateSymbol(SymbolName);
OutStreamer->EmitSymbolAttribute(S, MCSA_Global);
+ return;
}
if (TT.isOSBinFormatCOFF()) {
- const TargetLoweringObjectFileCOFF &TLOFCOFF =
- static_cast<const TargetLoweringObjectFileCOFF&>(getObjFileLowering());
-
- std::string Flags;
- raw_string_ostream FlagsOS(Flags);
-
- for (const auto &Function : M)
- TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Function);
- for (const auto &Global : M.globals())
- TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Global);
- for (const auto &Alias : M.aliases())
- TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Alias);
-
- FlagsOS.flush();
-
- // Output collected flags.
- if (!Flags.empty()) {
- OutStreamer->SwitchSection(TLOFCOFF.getDrectveSection());
- OutStreamer->EmitBytes(Flags);
- }
-
SM.serializeToStackMapSection();
+ return;
}
if (TT.isOSBinFormatELF()) {
SM.serializeToStackMapSection();
FM.serializeToFaultMapSection();
+ return;
}
}
diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h
index 7e70789ac82c..55abdf2ba601 100644
--- a/lib/Target/X86/X86AsmPrinter.h
+++ b/lib/Target/X86/X86AsmPrinter.h
@@ -32,6 +32,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
FaultMaps FM;
std::unique_ptr<MCCodeEmitter> CodeEmitter;
bool EmitFPOData = false;
+ bool NeedsRetpoline = false;
// This utility class tracks the length of a stackmap instruction's 'shadow'.
// It is used by the X86AsmPrinter to ensure that the stackmap shadow
@@ -94,6 +95,8 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
void LowerPATCHABLE_RET(const MachineInstr &MI, X86MCInstLower &MCIL);
void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, X86MCInstLower &MCIL);
void LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI, X86MCInstLower &MCIL);
+ void LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI,
+ X86MCInstLower &MCIL);
void LowerFENTRY_CALL(const MachineInstr &MI, X86MCInstLower &MCIL);
@@ -127,9 +130,6 @@ public:
unsigned AsmVariant, const char *ExtraCode,
raw_ostream &OS) override;
- /// \brief Return the symbol for the specified constant pool entry.
- MCSymbol *GetCPISymbol(unsigned CPID) const override;
-
bool doInitialization(Module &M) override {
SMShadowTracker.reset(0);
SM.reset();
diff --git a/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
new file mode 100644
index 000000000000..ab2cbfc33e17
--- /dev/null
+++ b/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
@@ -0,0 +1,732 @@
+//===- X86AvoidStoreForwardingBlocks.cpp - Avoid HW Store Forward Block ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// If a load follows a store and reloads data that the store has written to
+// memory, Intel microarchitectures can in many cases forward the data directly
+// from the store to the load. This "store forwarding" saves cycles by enabling
+// the load to directly obtain the data instead of accessing the data from
+// cache or memory.
+// A "store forward block" occurs in cases where a store cannot be forwarded to
+// the load. The most typical case of a store forward block on Intel Core
+// microarchitectures is a small store that cannot be forwarded to a larger load.
+// The estimated penalty for a store forward block is ~13 cycles.
+//
+// This pass tries to recognize and handle cases where "store forward block"
+// is created by the compiler when lowering memcpy calls to a sequence
+// of a load and a store.
+//
+// The pass currently only handles cases where memcpy is lowered to
+// XMM/YMM registers, it tries to break the memcpy into smaller copies.
+// breaking the memcpy should be possible since there is no atomicity
+// guarantee for loads and stores to XMM/YMM.
+//
+// It could be better for performance to solve the problem by loading
+// to XMM/YMM then inserting the partial store before storing back from XMM/YMM
+// to memory, but this will result in a more conservative optimization since it
+// requires we prove that all memory accesses between the blocking store and the
+// load must alias/don't alias before we can move the store, whereas the
+// transformation done here is correct regardless to other memory accesses.
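+//
+// For example (illustrative only): given a small store into the middle of a
+// 16-byte region that is then copied through an XMM register,
+//
+//   movb   $0, 4(%rsp)        ; small blocking store
+//   movups (%rsp), %xmm0      ; 16-byte reload, forwarding is blocked
+//   movups %xmm0, 64(%rsp)
+//
+// the pass rewrites the copy as a sequence of smaller GPR-sized loads and
+// stores, so the bytes written by the blocking store are copied separately
+// and store forwarding can succeed for each piece.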
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCInstrDesc.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-avoid-SFB"
+
+namespace llvm {
+void initializeX86AvoidSFBPassPass(PassRegistry &);
+} // end namespace llvm
+
+static cl::opt<bool> DisableX86AvoidStoreForwardBlocks(
+ "x86-disable-avoid-SFB", cl::Hidden,
+ cl::desc("X86: Disable Store Forwarding Blocks fixup."), cl::init(false));
+
+static cl::opt<unsigned> X86AvoidSFBInspectionLimit(
+ "x86-sfb-inspection-limit",
+ cl::desc("X86: Number of instructions backward to "
+ "inspect for store forwarding blocks."),
+ cl::init(20), cl::Hidden);
+
+namespace {
+
+using DisplacementSizeMap = std::map<int64_t, unsigned>;
+
+class X86AvoidSFBPass : public MachineFunctionPass {
+public:
+ static char ID;
+ X86AvoidSFBPass() : MachineFunctionPass(ID) {
+ initializeX86AvoidSFBPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override {
+ return "X86 Avoid Store Forwarding Blocks";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<AAResultsWrapperPass>();
+ }
+
+private:
+ MachineRegisterInfo *MRI;
+ const X86InstrInfo *TII;
+ const X86RegisterInfo *TRI;
+ SmallVector<std::pair<MachineInstr *, MachineInstr *>, 2>
+ BlockedLoadsStoresPairs;
+ SmallVector<MachineInstr *, 2> ForRemoval;
+ AliasAnalysis *AA;
+
+  /// Collects pairs of a load followed by a store to memory which look
+  /// like a memcpy.
+ void findPotentiallylBlockedCopies(MachineFunction &MF);
+  /// Break the memcpy's load and store into smaller copies
+  /// such that each region whose reload was blocked by a smaller store
+  /// is now copied separately.
+ void breakBlockedCopies(MachineInstr *LoadInst, MachineInstr *StoreInst,
+ const DisplacementSizeMap &BlockingStoresDispSizeMap);
+ /// Break a copy of size Size to smaller copies.
+ void buildCopies(int Size, MachineInstr *LoadInst, int64_t LdDispImm,
+ MachineInstr *StoreInst, int64_t StDispImm,
+ int64_t LMMOffset, int64_t SMMOffset);
+
+ void buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode, int64_t LoadDisp,
+ MachineInstr *StoreInst, unsigned NStoreOpcode,
+ int64_t StoreDisp, unsigned Size, int64_t LMMOffset,
+ int64_t SMMOffset);
+
+ bool alias(const MachineMemOperand &Op1, const MachineMemOperand &Op2) const;
+
+ unsigned getRegSizeInBytes(MachineInstr *Inst);
+};
+
+} // end anonymous namespace
+
+char X86AvoidSFBPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(X86AvoidSFBPass, DEBUG_TYPE,
+                      "X86 Avoid Store Forwarding Blocks", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(X86AvoidSFBPass, DEBUG_TYPE,
+                    "X86 Avoid Store Forwarding Blocks", false, false)
+
+FunctionPass *llvm::createX86AvoidStoreForwardingBlocks() {
+ return new X86AvoidSFBPass();
+}
+
+static bool isXMMLoadOpcode(unsigned Opcode) {
+ return Opcode == X86::MOVUPSrm || Opcode == X86::MOVAPSrm ||
+ Opcode == X86::VMOVUPSrm || Opcode == X86::VMOVAPSrm ||
+ Opcode == X86::VMOVUPDrm || Opcode == X86::VMOVAPDrm ||
+ Opcode == X86::VMOVDQUrm || Opcode == X86::VMOVDQArm ||
+ Opcode == X86::VMOVUPSZ128rm || Opcode == X86::VMOVAPSZ128rm ||
+ Opcode == X86::VMOVUPDZ128rm || Opcode == X86::VMOVAPDZ128rm ||
+ Opcode == X86::VMOVDQU64Z128rm || Opcode == X86::VMOVDQA64Z128rm ||
+ Opcode == X86::VMOVDQU32Z128rm || Opcode == X86::VMOVDQA32Z128rm;
+}
+static bool isYMMLoadOpcode(unsigned Opcode) {
+ return Opcode == X86::VMOVUPSYrm || Opcode == X86::VMOVAPSYrm ||
+ Opcode == X86::VMOVUPDYrm || Opcode == X86::VMOVAPDYrm ||
+ Opcode == X86::VMOVDQUYrm || Opcode == X86::VMOVDQAYrm ||
+ Opcode == X86::VMOVUPSZ256rm || Opcode == X86::VMOVAPSZ256rm ||
+ Opcode == X86::VMOVUPDZ256rm || Opcode == X86::VMOVAPDZ256rm ||
+ Opcode == X86::VMOVDQU64Z256rm || Opcode == X86::VMOVDQA64Z256rm ||
+ Opcode == X86::VMOVDQU32Z256rm || Opcode == X86::VMOVDQA32Z256rm;
+}
+
+static bool isPotentialBlockedMemCpyLd(unsigned Opcode) {
+ return isXMMLoadOpcode(Opcode) || isYMMLoadOpcode(Opcode);
+}
+
+static bool isPotentialBlockedMemCpyPair(int LdOpcode, int StOpcode) {
+ switch (LdOpcode) {
+ case X86::MOVUPSrm:
+ case X86::MOVAPSrm:
+ return StOpcode == X86::MOVUPSmr || StOpcode == X86::MOVAPSmr;
+ case X86::VMOVUPSrm:
+ case X86::VMOVAPSrm:
+ return StOpcode == X86::VMOVUPSmr || StOpcode == X86::VMOVAPSmr;
+ case X86::VMOVUPDrm:
+ case X86::VMOVAPDrm:
+ return StOpcode == X86::VMOVUPDmr || StOpcode == X86::VMOVAPDmr;
+ case X86::VMOVDQUrm:
+ case X86::VMOVDQArm:
+ return StOpcode == X86::VMOVDQUmr || StOpcode == X86::VMOVDQAmr;
+ case X86::VMOVUPSZ128rm:
+ case X86::VMOVAPSZ128rm:
+ return StOpcode == X86::VMOVUPSZ128mr || StOpcode == X86::VMOVAPSZ128mr;
+ case X86::VMOVUPDZ128rm:
+ case X86::VMOVAPDZ128rm:
+ return StOpcode == X86::VMOVUPDZ128mr || StOpcode == X86::VMOVAPDZ128mr;
+ case X86::VMOVUPSYrm:
+ case X86::VMOVAPSYrm:
+ return StOpcode == X86::VMOVUPSYmr || StOpcode == X86::VMOVAPSYmr;
+ case X86::VMOVUPDYrm:
+ case X86::VMOVAPDYrm:
+ return StOpcode == X86::VMOVUPDYmr || StOpcode == X86::VMOVAPDYmr;
+ case X86::VMOVDQUYrm:
+ case X86::VMOVDQAYrm:
+ return StOpcode == X86::VMOVDQUYmr || StOpcode == X86::VMOVDQAYmr;
+ case X86::VMOVUPSZ256rm:
+ case X86::VMOVAPSZ256rm:
+ return StOpcode == X86::VMOVUPSZ256mr || StOpcode == X86::VMOVAPSZ256mr;
+ case X86::VMOVUPDZ256rm:
+ case X86::VMOVAPDZ256rm:
+ return StOpcode == X86::VMOVUPDZ256mr || StOpcode == X86::VMOVAPDZ256mr;
+ case X86::VMOVDQU64Z128rm:
+ case X86::VMOVDQA64Z128rm:
+ return StOpcode == X86::VMOVDQU64Z128mr || StOpcode == X86::VMOVDQA64Z128mr;
+ case X86::VMOVDQU32Z128rm:
+ case X86::VMOVDQA32Z128rm:
+ return StOpcode == X86::VMOVDQU32Z128mr || StOpcode == X86::VMOVDQA32Z128mr;
+ case X86::VMOVDQU64Z256rm:
+ case X86::VMOVDQA64Z256rm:
+ return StOpcode == X86::VMOVDQU64Z256mr || StOpcode == X86::VMOVDQA64Z256mr;
+ case X86::VMOVDQU32Z256rm:
+ case X86::VMOVDQA32Z256rm:
+ return StOpcode == X86::VMOVDQU32Z256mr || StOpcode == X86::VMOVDQA32Z256mr;
+ default:
+ return false;
+ }
+}
+
+static bool isPotentialBlockingStoreInst(int Opcode, int LoadOpcode) {
+ bool PBlock = false;
+ PBlock |= Opcode == X86::MOV64mr || Opcode == X86::MOV64mi32 ||
+ Opcode == X86::MOV32mr || Opcode == X86::MOV32mi ||
+ Opcode == X86::MOV16mr || Opcode == X86::MOV16mi ||
+ Opcode == X86::MOV8mr || Opcode == X86::MOV8mi;
+ if (isYMMLoadOpcode(LoadOpcode))
+ PBlock |= Opcode == X86::VMOVUPSmr || Opcode == X86::VMOVAPSmr ||
+ Opcode == X86::VMOVUPDmr || Opcode == X86::VMOVAPDmr ||
+ Opcode == X86::VMOVDQUmr || Opcode == X86::VMOVDQAmr ||
+ Opcode == X86::VMOVUPSZ128mr || Opcode == X86::VMOVAPSZ128mr ||
+ Opcode == X86::VMOVUPDZ128mr || Opcode == X86::VMOVAPDZ128mr ||
+ Opcode == X86::VMOVDQU64Z128mr ||
+ Opcode == X86::VMOVDQA64Z128mr ||
+ Opcode == X86::VMOVDQU32Z128mr || Opcode == X86::VMOVDQA32Z128mr;
+ return PBlock;
+}
+
+static const int MOV128SZ = 16;
+static const int MOV64SZ = 8;
+static const int MOV32SZ = 4;
+static const int MOV16SZ = 2;
+static const int MOV8SZ = 1;
+
+static unsigned getYMMtoXMMLoadOpcode(unsigned LoadOpcode) {
+ switch (LoadOpcode) {
+ case X86::VMOVUPSYrm:
+ case X86::VMOVAPSYrm:
+ return X86::VMOVUPSrm;
+ case X86::VMOVUPDYrm:
+ case X86::VMOVAPDYrm:
+ return X86::VMOVUPDrm;
+ case X86::VMOVDQUYrm:
+ case X86::VMOVDQAYrm:
+ return X86::VMOVDQUrm;
+ case X86::VMOVUPSZ256rm:
+ case X86::VMOVAPSZ256rm:
+ return X86::VMOVUPSZ128rm;
+ case X86::VMOVUPDZ256rm:
+ case X86::VMOVAPDZ256rm:
+ return X86::VMOVUPDZ128rm;
+ case X86::VMOVDQU64Z256rm:
+ case X86::VMOVDQA64Z256rm:
+ return X86::VMOVDQU64Z128rm;
+ case X86::VMOVDQU32Z256rm:
+ case X86::VMOVDQA32Z256rm:
+ return X86::VMOVDQU32Z128rm;
+ default:
+ llvm_unreachable("Unexpected Load Instruction Opcode");
+ }
+ return 0;
+}
+
+static unsigned getYMMtoXMMStoreOpcode(unsigned StoreOpcode) {
+ switch (StoreOpcode) {
+ case X86::VMOVUPSYmr:
+ case X86::VMOVAPSYmr:
+ return X86::VMOVUPSmr;
+ case X86::VMOVUPDYmr:
+ case X86::VMOVAPDYmr:
+ return X86::VMOVUPDmr;
+ case X86::VMOVDQUYmr:
+ case X86::VMOVDQAYmr:
+ return X86::VMOVDQUmr;
+ case X86::VMOVUPSZ256mr:
+ case X86::VMOVAPSZ256mr:
+ return X86::VMOVUPSZ128mr;
+ case X86::VMOVUPDZ256mr:
+ case X86::VMOVAPDZ256mr:
+ return X86::VMOVUPDZ128mr;
+ case X86::VMOVDQU64Z256mr:
+ case X86::VMOVDQA64Z256mr:
+ return X86::VMOVDQU64Z128mr;
+ case X86::VMOVDQU32Z256mr:
+ case X86::VMOVDQA32Z256mr:
+ return X86::VMOVDQU32Z128mr;
+ default:
+    llvm_unreachable("Unexpected Store Instruction Opcode");
+ }
+ return 0;
+}
+
+static int getAddrOffset(MachineInstr *MI) {
+ const MCInstrDesc &Descl = MI->getDesc();
+ int AddrOffset = X86II::getMemoryOperandNo(Descl.TSFlags);
+ assert(AddrOffset != -1 && "Expected Memory Operand");
+ AddrOffset += X86II::getOperandBias(Descl);
+ return AddrOffset;
+}
+
+static MachineOperand &getBaseOperand(MachineInstr *MI) {
+ int AddrOffset = getAddrOffset(MI);
+ return MI->getOperand(AddrOffset + X86::AddrBaseReg);
+}
+
+static MachineOperand &getDispOperand(MachineInstr *MI) {
+ int AddrOffset = getAddrOffset(MI);
+ return MI->getOperand(AddrOffset + X86::AddrDisp);
+}
+
+// Relevant addressing modes contain only base register and immediate
+// displacement or frameindex and immediate displacement.
+// TODO: Consider expanding to other addressing modes in the future
+static bool isRelevantAddressingMode(MachineInstr *MI) {
+ int AddrOffset = getAddrOffset(MI);
+ MachineOperand &Base = getBaseOperand(MI);
+ MachineOperand &Disp = getDispOperand(MI);
+ MachineOperand &Scale = MI->getOperand(AddrOffset + X86::AddrScaleAmt);
+ MachineOperand &Index = MI->getOperand(AddrOffset + X86::AddrIndexReg);
+ MachineOperand &Segment = MI->getOperand(AddrOffset + X86::AddrSegmentReg);
+
+ if (!((Base.isReg() && Base.getReg() != X86::NoRegister) || Base.isFI()))
+ return false;
+ if (!Disp.isImm())
+ return false;
+ if (Scale.getImm() != 1)
+ return false;
+ if (!(Index.isReg() && Index.getReg() == X86::NoRegister))
+ return false;
+ if (!(Segment.isReg() && Segment.getReg() == X86::NoRegister))
+ return false;
+ return true;
+}
+
+// Collect potentially blocking stores.
+// Limit the number of instructions we inspect backwards, since the effect
+// of a store forward block won't be visible if there are enough instructions
+// between the store and the load to keep the core busy.
+static SmallVector<MachineInstr *, 2>
+findPotentialBlockers(MachineInstr *LoadInst) {
+ SmallVector<MachineInstr *, 2> PotentialBlockers;
+ unsigned BlockCount = 0;
+ const unsigned InspectionLimit = X86AvoidSFBInspectionLimit;
+ for (auto PBInst = std::next(MachineBasicBlock::reverse_iterator(LoadInst)),
+ E = LoadInst->getParent()->rend();
+ PBInst != E; ++PBInst) {
+ BlockCount++;
+ if (BlockCount >= InspectionLimit)
+ break;
+ MachineInstr &MI = *PBInst;
+ if (MI.getDesc().isCall())
+ return PotentialBlockers;
+ PotentialBlockers.push_back(&MI);
+ }
+  // If we didn't reach the instruction limit, try the predecessor blocks.
+  // Ideally we would traverse the predecessor blocks in depth with some
+  // coloring algorithm, but for now let's just look at the first-order
+  // predecessors.
+ if (BlockCount < InspectionLimit) {
+ MachineBasicBlock *MBB = LoadInst->getParent();
+ int LimitLeft = InspectionLimit - BlockCount;
+ for (MachineBasicBlock::pred_iterator PB = MBB->pred_begin(),
+ PE = MBB->pred_end();
+ PB != PE; ++PB) {
+ MachineBasicBlock *PMBB = *PB;
+ int PredCount = 0;
+ for (MachineBasicBlock::reverse_iterator PBInst = PMBB->rbegin(),
+ PME = PMBB->rend();
+ PBInst != PME; ++PBInst) {
+ PredCount++;
+ if (PredCount >= LimitLeft)
+ break;
+ if (PBInst->getDesc().isCall())
+ break;
+ PotentialBlockers.push_back(&*PBInst);
+ }
+ }
+ }
+ return PotentialBlockers;
+}
+
+void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode,
+ int64_t LoadDisp, MachineInstr *StoreInst,
+ unsigned NStoreOpcode, int64_t StoreDisp,
+ unsigned Size, int64_t LMMOffset,
+ int64_t SMMOffset) {
+ MachineOperand &LoadBase = getBaseOperand(LoadInst);
+ MachineOperand &StoreBase = getBaseOperand(StoreInst);
+ MachineBasicBlock *MBB = LoadInst->getParent();
+ MachineMemOperand *LMMO = *LoadInst->memoperands_begin();
+ MachineMemOperand *SMMO = *StoreInst->memoperands_begin();
+
+ unsigned Reg1 = MRI->createVirtualRegister(
+ TII->getRegClass(TII->get(NLoadOpcode), 0, TRI, *(MBB->getParent())));
+ MachineInstr *NewLoad =
+ BuildMI(*MBB, LoadInst, LoadInst->getDebugLoc(), TII->get(NLoadOpcode),
+ Reg1)
+ .add(LoadBase)
+ .addImm(1)
+ .addReg(X86::NoRegister)
+ .addImm(LoadDisp)
+ .addReg(X86::NoRegister)
+ .addMemOperand(
+ MBB->getParent()->getMachineMemOperand(LMMO, LMMOffset, Size));
+ if (LoadBase.isReg())
+ getBaseOperand(NewLoad).setIsKill(false);
+ LLVM_DEBUG(NewLoad->dump());
+ // If the load and store are consecutive, use the loadInst location to
+ // reduce register pressure.
+ MachineInstr *StInst = StoreInst;
+ if (StoreInst->getPrevNode() == LoadInst)
+ StInst = LoadInst;
+ MachineInstr *NewStore =
+ BuildMI(*MBB, StInst, StInst->getDebugLoc(), TII->get(NStoreOpcode))
+ .add(StoreBase)
+ .addImm(1)
+ .addReg(X86::NoRegister)
+ .addImm(StoreDisp)
+ .addReg(X86::NoRegister)
+ .addReg(Reg1)
+ .addMemOperand(
+ MBB->getParent()->getMachineMemOperand(SMMO, SMMOffset, Size));
+ if (StoreBase.isReg())
+ getBaseOperand(NewStore).setIsKill(false);
+ MachineOperand &StoreSrcVReg = StoreInst->getOperand(X86::AddrNumOperands);
+ assert(StoreSrcVReg.isReg() && "Expected virtual register");
+ NewStore->getOperand(X86::AddrNumOperands).setIsKill(StoreSrcVReg.isKill());
+ LLVM_DEBUG(NewStore->dump());
+}
+
+void X86AvoidSFBPass::buildCopies(int Size, MachineInstr *LoadInst,
+ int64_t LdDispImm, MachineInstr *StoreInst,
+ int64_t StDispImm, int64_t LMMOffset,
+ int64_t SMMOffset) {
+ int LdDisp = LdDispImm;
+ int StDisp = StDispImm;
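+  // Greedily peel off the widest scalar copy that still fits: 16 bytes (only
+  // when splitting a YMM copy into XMM halves), then 8, 4, 2 and 1 bytes,
+  // advancing the displacements and memory-operand offsets until the whole
+  // region has been covered.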
+ while (Size > 0) {
+ if ((Size - MOV128SZ >= 0) && isYMMLoadOpcode(LoadInst->getOpcode())) {
+ Size = Size - MOV128SZ;
+ buildCopy(LoadInst, getYMMtoXMMLoadOpcode(LoadInst->getOpcode()), LdDisp,
+ StoreInst, getYMMtoXMMStoreOpcode(StoreInst->getOpcode()),
+ StDisp, MOV128SZ, LMMOffset, SMMOffset);
+ LdDisp += MOV128SZ;
+ StDisp += MOV128SZ;
+ LMMOffset += MOV128SZ;
+ SMMOffset += MOV128SZ;
+ continue;
+ }
+ if (Size - MOV64SZ >= 0) {
+ Size = Size - MOV64SZ;
+ buildCopy(LoadInst, X86::MOV64rm, LdDisp, StoreInst, X86::MOV64mr, StDisp,
+ MOV64SZ, LMMOffset, SMMOffset);
+ LdDisp += MOV64SZ;
+ StDisp += MOV64SZ;
+ LMMOffset += MOV64SZ;
+ SMMOffset += MOV64SZ;
+ continue;
+ }
+ if (Size - MOV32SZ >= 0) {
+ Size = Size - MOV32SZ;
+ buildCopy(LoadInst, X86::MOV32rm, LdDisp, StoreInst, X86::MOV32mr, StDisp,
+ MOV32SZ, LMMOffset, SMMOffset);
+ LdDisp += MOV32SZ;
+ StDisp += MOV32SZ;
+ LMMOffset += MOV32SZ;
+ SMMOffset += MOV32SZ;
+ continue;
+ }
+ if (Size - MOV16SZ >= 0) {
+ Size = Size - MOV16SZ;
+ buildCopy(LoadInst, X86::MOV16rm, LdDisp, StoreInst, X86::MOV16mr, StDisp,
+ MOV16SZ, LMMOffset, SMMOffset);
+ LdDisp += MOV16SZ;
+ StDisp += MOV16SZ;
+ LMMOffset += MOV16SZ;
+ SMMOffset += MOV16SZ;
+ continue;
+ }
+ if (Size - MOV8SZ >= 0) {
+ Size = Size - MOV8SZ;
+ buildCopy(LoadInst, X86::MOV8rm, LdDisp, StoreInst, X86::MOV8mr, StDisp,
+ MOV8SZ, LMMOffset, SMMOffset);
+ LdDisp += MOV8SZ;
+ StDisp += MOV8SZ;
+ LMMOffset += MOV8SZ;
+ SMMOffset += MOV8SZ;
+ continue;
+ }
+ }
+ assert(Size == 0 && "Wrong size division");
+}
+
+static void updateKillStatus(MachineInstr *LoadInst, MachineInstr *StoreInst) {
+ MachineOperand &LoadBase = getBaseOperand(LoadInst);
+ MachineOperand &StoreBase = getBaseOperand(StoreInst);
+ if (LoadBase.isReg()) {
+ MachineInstr *LastLoad = LoadInst->getPrevNode();
+ // If the original load and store to xmm/ymm were consecutive
+ // then the partial copies were also created in
+ // a consecutive order to reduce register pressure,
+ // and the location of the last load is before the last store.
+ if (StoreInst->getPrevNode() == LoadInst)
+ LastLoad = LoadInst->getPrevNode()->getPrevNode();
+ getBaseOperand(LastLoad).setIsKill(LoadBase.isKill());
+ }
+ if (StoreBase.isReg()) {
+ MachineInstr *StInst = StoreInst;
+ if (StoreInst->getPrevNode() == LoadInst)
+ StInst = LoadInst;
+ getBaseOperand(StInst->getPrevNode()).setIsKill(StoreBase.isKill());
+ }
+}
+
+bool X86AvoidSFBPass::alias(const MachineMemOperand &Op1,
+ const MachineMemOperand &Op2) const {
+ if (!Op1.getValue() || !Op2.getValue())
+ return true;
+
+ int64_t MinOffset = std::min(Op1.getOffset(), Op2.getOffset());
+ int64_t Overlapa = Op1.getSize() + Op1.getOffset() - MinOffset;
+ int64_t Overlapb = Op2.getSize() + Op2.getOffset() - MinOffset;
+
+ AliasResult AAResult =
+ AA->alias(MemoryLocation(Op1.getValue(), Overlapa, Op1.getAAInfo()),
+ MemoryLocation(Op2.getValue(), Overlapb, Op2.getAAInfo()));
+ return AAResult != NoAlias;
+}
+
+void X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) {
+ for (auto &MBB : MF)
+ for (auto &MI : MBB) {
+ if (!isPotentialBlockedMemCpyLd(MI.getOpcode()))
+ continue;
+ int DefVR = MI.getOperand(0).getReg();
+ if (!MRI->hasOneUse(DefVR))
+ continue;
+ for (auto UI = MRI->use_nodbg_begin(DefVR), UE = MRI->use_nodbg_end();
+ UI != UE;) {
+ MachineOperand &StoreMO = *UI++;
+ MachineInstr &StoreMI = *StoreMO.getParent();
+ // Skip cases where the memcpy may overlap.
+ if (StoreMI.getParent() == MI.getParent() &&
+ isPotentialBlockedMemCpyPair(MI.getOpcode(), StoreMI.getOpcode()) &&
+ isRelevantAddressingMode(&MI) &&
+ isRelevantAddressingMode(&StoreMI)) {
+ assert(MI.hasOneMemOperand() &&
+ "Expected one memory operand for load instruction");
+ assert(StoreMI.hasOneMemOperand() &&
+ "Expected one memory operand for store instruction");
+ if (!alias(**MI.memoperands_begin(), **StoreMI.memoperands_begin()))
+ BlockedLoadsStoresPairs.push_back(std::make_pair(&MI, &StoreMI));
+ }
+ }
+ }
+}
+
+unsigned X86AvoidSFBPass::getRegSizeInBytes(MachineInstr *LoadInst) {
+ auto TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI,
+ *LoadInst->getParent()->getParent());
+ return TRI->getRegSizeInBits(*TRC) / 8;
+}
+
+void X86AvoidSFBPass::breakBlockedCopies(
+ MachineInstr *LoadInst, MachineInstr *StoreInst,
+ const DisplacementSizeMap &BlockingStoresDispSizeMap) {
+ int64_t LdDispImm = getDispOperand(LoadInst).getImm();
+ int64_t StDispImm = getDispOperand(StoreInst).getImm();
+ int64_t LMMOffset = 0;
+ int64_t SMMOffset = 0;
+
+ int64_t LdDisp1 = LdDispImm;
+ int64_t LdDisp2 = 0;
+ int64_t StDisp1 = StDispImm;
+ int64_t StDisp2 = 0;
+ unsigned Size1 = 0;
+ unsigned Size2 = 0;
+ int64_t LdStDelta = StDispImm - LdDispImm;
+
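+  // Walk the blocking stores in increasing displacement order. For each one,
+  // first copy the unblocked region [LdDisp1, LdDisp2) that precedes it, then
+  // copy the blocked region of Size2 bytes on its own, and finally (after the
+  // loop) copy whatever remains past the last blocking store (Size3).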
+ for (auto DispSizePair : BlockingStoresDispSizeMap) {
+ LdDisp2 = DispSizePair.first;
+ StDisp2 = DispSizePair.first + LdStDelta;
+ Size2 = DispSizePair.second;
+ // Avoid copying overlapping areas.
+ if (LdDisp2 < LdDisp1) {
+ int OverlapDelta = LdDisp1 - LdDisp2;
+ LdDisp2 += OverlapDelta;
+ StDisp2 += OverlapDelta;
+ Size2 -= OverlapDelta;
+ }
+ Size1 = std::abs(std::abs(LdDisp2) - std::abs(LdDisp1));
+
+ // Build a copy for the point until the current blocking store's
+ // displacement.
+ buildCopies(Size1, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
+ SMMOffset);
+ // Build a copy for the current blocking store.
+ buildCopies(Size2, LoadInst, LdDisp2, StoreInst, StDisp2, LMMOffset + Size1,
+ SMMOffset + Size1);
+ LdDisp1 = LdDisp2 + Size2;
+ StDisp1 = StDisp2 + Size2;
+ LMMOffset += Size1 + Size2;
+ SMMOffset += Size1 + Size2;
+ }
+ unsigned Size3 = (LdDispImm + getRegSizeInBytes(LoadInst)) - LdDisp1;
+ buildCopies(Size3, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
+ LMMOffset);
+}
+
+static bool hasSameBaseOpValue(MachineInstr *LoadInst,
+ MachineInstr *StoreInst) {
+ MachineOperand &LoadBase = getBaseOperand(LoadInst);
+ MachineOperand &StoreBase = getBaseOperand(StoreInst);
+ if (LoadBase.isReg() != StoreBase.isReg())
+ return false;
+ if (LoadBase.isReg())
+ return LoadBase.getReg() == StoreBase.getReg();
+ return LoadBase.getIndex() == StoreBase.getIndex();
+}
+
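+// A store is "blocking" when it writes entirely inside the region read by the
+// wide load, i.e. its displacement lies in
+// [LoadDispImm, LoadDispImm + LoadSize - StoreSize]. For example, an 8-byte
+// store at displacement 40 blocks a 32-byte load at displacement 32, while
+// the same store at displacement 60 only partially overlaps the load and is
+// not considered blocking here.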
+static bool isBlockingStore(int64_t LoadDispImm, unsigned LoadSize,
+ int64_t StoreDispImm, unsigned StoreSize) {
+ return ((StoreDispImm >= LoadDispImm) &&
+ (StoreDispImm <= LoadDispImm + (LoadSize - StoreSize)));
+}
+
+// Keep track of all stores blocking a load
+static void
+updateBlockingStoresDispSizeMap(DisplacementSizeMap &BlockingStoresDispSizeMap,
+ int64_t DispImm, unsigned Size) {
+ if (BlockingStoresDispSizeMap.count(DispImm)) {
+ // Choose the smallest blocking store starting at this displacement.
+ if (BlockingStoresDispSizeMap[DispImm] > Size)
+ BlockingStoresDispSizeMap[DispImm] = Size;
+
+ } else
+ BlockingStoresDispSizeMap[DispImm] = Size;
+}
+
+// Remove blocking stores contained in each other.
+static void
+removeRedundantBlockingStores(DisplacementSizeMap &BlockingStoresDispSizeMap) {
+ if (BlockingStoresDispSizeMap.size() <= 1)
+ return;
+
+ int64_t PrevDisp = BlockingStoresDispSizeMap.begin()->first;
+ unsigned PrevSize = BlockingStoresDispSizeMap.begin()->second;
+ SmallVector<int64_t, 2> ForRemoval;
+ for (auto DispSizePair = std::next(BlockingStoresDispSizeMap.begin());
+ DispSizePair != BlockingStoresDispSizeMap.end(); ++DispSizePair) {
+ int64_t CurrDisp = DispSizePair->first;
+ unsigned CurrSize = DispSizePair->second;
+ if (CurrDisp + CurrSize <= PrevDisp + PrevSize) {
+ ForRemoval.push_back(PrevDisp);
+ }
+ PrevDisp = CurrDisp;
+ PrevSize = CurrSize;
+ }
+ for (auto Disp : ForRemoval)
+ BlockingStoresDispSizeMap.erase(Disp);
+}
+
+bool X86AvoidSFBPass::runOnMachineFunction(MachineFunction &MF) {
+ bool Changed = false;
+
+ if (DisableX86AvoidStoreForwardBlocks || skipFunction(MF.getFunction()) ||
+ !MF.getSubtarget<X86Subtarget>().is64Bit())
+ return false;
+
+ MRI = &MF.getRegInfo();
+ assert(MRI->isSSA() && "Expected MIR to be in SSA form");
+ TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+ TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ LLVM_DEBUG(dbgs() << "Start X86AvoidStoreForwardBlocks\n";);
+ // Look for a load then a store to XMM/YMM which look like a memcpy
+ findPotentiallylBlockedCopies(MF);
+
+ for (auto LoadStoreInstPair : BlockedLoadsStoresPairs) {
+ MachineInstr *LoadInst = LoadStoreInstPair.first;
+ int64_t LdDispImm = getDispOperand(LoadInst).getImm();
+ DisplacementSizeMap BlockingStoresDispSizeMap;
+
+ SmallVector<MachineInstr *, 2> PotentialBlockers =
+ findPotentialBlockers(LoadInst);
+ for (auto PBInst : PotentialBlockers) {
+ if (!isPotentialBlockingStoreInst(PBInst->getOpcode(),
+ LoadInst->getOpcode()) ||
+ !isRelevantAddressingMode(PBInst))
+ continue;
+ int64_t PBstDispImm = getDispOperand(PBInst).getImm();
+ assert(PBInst->hasOneMemOperand() && "Expected One Memory Operand");
+ unsigned PBstSize = (*PBInst->memoperands_begin())->getSize();
+      // This check doesn't cover all cases, but it will suffice for now.
+      // TODO: take branch probability into consideration; if the blocking
+      // store is in a rarely executed block, breaking the memcpy could lose
+      // performance.
+ if (hasSameBaseOpValue(LoadInst, PBInst) &&
+ isBlockingStore(LdDispImm, getRegSizeInBytes(LoadInst), PBstDispImm,
+ PBstSize))
+ updateBlockingStoresDispSizeMap(BlockingStoresDispSizeMap, PBstDispImm,
+ PBstSize);
+ }
+
+ if (BlockingStoresDispSizeMap.empty())
+ continue;
+
+ // We found a store forward block, break the memcpy's load and store
+ // into smaller copies such that each smaller store that was causing
+ // a store block would now be copied separately.
+ MachineInstr *StoreInst = LoadStoreInstPair.second;
+ LLVM_DEBUG(dbgs() << "Blocked load and store instructions: \n");
+ LLVM_DEBUG(LoadInst->dump());
+ LLVM_DEBUG(StoreInst->dump());
+ LLVM_DEBUG(dbgs() << "Replaced with:\n");
+ removeRedundantBlockingStores(BlockingStoresDispSizeMap);
+ breakBlockedCopies(LoadInst, StoreInst, BlockingStoresDispSizeMap);
+ updateKillStatus(LoadInst, StoreInst);
+ ForRemoval.push_back(LoadInst);
+ ForRemoval.push_back(StoreInst);
+ }
+ for (auto RemovedInst : ForRemoval) {
+ RemovedInst->eraseFromParent();
+ }
+ ForRemoval.clear();
+ BlockedLoadsStoresPairs.clear();
+ LLVM_DEBUG(dbgs() << "End X86AvoidStoreForwardBlocks\n";);
+
+ return Changed;
+}
diff --git a/lib/Target/X86/X86CallFrameOptimization.cpp b/lib/Target/X86/X86CallFrameOptimization.cpp
index 522dc7926b94..c73fd6eb144a 100644
--- a/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -375,7 +375,7 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
// Skip over DEBUG_VALUE.
// For globals in PIC mode, we can have some LEAs here. Skip them as well.
// TODO: Extend this to something that covers more cases.
- while (I->getOpcode() == X86::LEA32r || I->isDebugValue())
+ while (I->getOpcode() == X86::LEA32r || I->isDebugInstr())
++I;
unsigned StackPtr = RegInfo.getStackRegister();
diff --git a/lib/Target/X86/X86CallLowering.cpp b/lib/Target/X86/X86CallLowering.cpp
index ccb982f9ac16..96ea64dc8c48 100644
--- a/lib/Target/X86/X86CallLowering.cpp
+++ b/lib/Target/X86/X86CallLowering.cpp
@@ -33,7 +33,6 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
@@ -43,6 +42,7 @@
#include "llvm/IR/Value.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/LowLevelTypeImpl.h"
+#include "llvm/Support/MachineValueType.h"
#include <cassert>
#include <cstdint>
@@ -126,7 +126,25 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
CCValAssign &VA) override {
MIB.addUse(PhysReg, RegState::Implicit);
- unsigned ExtReg = extendRegister(ValVReg, VA);
+
+ unsigned ExtReg;
+    // If we are copying the value to a physical register that is larger than
+    // the value itself, build an anyext to the size of the register first and
+    // only then do the copy. An example would be copying from s32 to xmm0,
+    // in which case ValVT == LocVT == MVT::f32. If LocSize and ValSize are
+    // not equal, we expect the normal extendRegister mechanism to handle it.
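+    // Schematically, for the s32 -> xmm0 case this emits:
+    //   %ext:_(s128) = G_ANYEXT %val(s32)
+    //   $xmm0 = COPY %ext(s128)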
+ unsigned PhysRegSize =
+ MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI);
+ unsigned ValSize = VA.getValVT().getSizeInBits();
+ unsigned LocSize = VA.getLocVT().getSizeInBits();
+ if (PhysRegSize > ValSize && LocSize == ValSize) {
+      assert((PhysRegSize == 128 || PhysRegSize == 80) &&
+             "We expect the register to be 128 or 80 bits");
+ auto MIB = MIRBuilder.buildAnyExt(LLT::scalar(PhysRegSize), ValVReg);
+ ExtReg = MIB->getOperand(0).getReg();
+ } else
+ ExtReg = extendRegister(ValVReg, VA);
+
MIRBuilder.buildCopy(PhysReg, ExtReg);
}
@@ -229,10 +247,28 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
CCValAssign &VA) override {
markPhysRegUsed(PhysReg);
+
switch (VA.getLocInfo()) {
- default:
+ default: {
+      // If we are copying the value from a physical register that is larger
+      // than the value itself, build the copy of the phys reg first and then
+      // build the truncation of that copy. An example would be copying from
+      // xmm0 to s32, in which case ValVT == LocVT == MVT::f32. If LocSize and
+      // ValSize are not equal, we expect this to be handled by the
+      // SExt/ZExt/AExt cases.
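+      // Schematically, for the xmm0 -> s32 case this emits:
+      //   %copy:_(s128) = COPY $xmm0
+      //   %val:_(s32) = G_TRUNC %copy(s128)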
+ unsigned PhysRegSize =
+ MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI);
+ unsigned ValSize = VA.getValVT().getSizeInBits();
+ unsigned LocSize = VA.getLocVT().getSizeInBits();
+ if (PhysRegSize > ValSize && LocSize == ValSize) {
+ auto Copy = MIRBuilder.buildCopy(LLT::scalar(PhysRegSize), PhysReg);
+ MIRBuilder.buildTrunc(ValVReg, Copy);
+ return;
+ }
+
MIRBuilder.buildCopy(ValVReg, PhysReg);
break;
+ }
case CCValAssign::LocInfo::SExt:
case CCValAssign::LocInfo::ZExt:
case CCValAssign::LocInfo::AExt: {
@@ -402,8 +438,7 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
if (Callee.isReg())
MIB->getOperand(0).setReg(constrainOperandRegClass(
MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(),
- *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(),
- Callee.getReg(), 0));
+ *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Callee, 0));
// Finally we can copy the returned value back into its virtual-register. In
// symmetry with the arguments, the physical register must be an
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index 5d806fe60b86..fcc9a296de93 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -18,6 +18,12 @@ class CCIfSubtarget<string F, CCAction A>
"(State.getMachineFunction().getSubtarget()).", F),
A>;
+/// CCIfNotSubtarget - Match if the current subtarget doesn't have feature F.
+class CCIfNotSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("!static_cast<const X86Subtarget&>"
+ "(State.getMachineFunction().getSubtarget()).", F),
+ A>;
+
// Register classes for RegCall
class RC_X86_RegCall {
list<Register> GPR_8 = [];
@@ -246,8 +252,9 @@ def RetCC_X86Common : CallingConv<[
// MM0, it doesn't support these vector types.
CCIfType<[x86mmx], CCAssignToReg<[MM0]>>,
- // Long double types are always returned in FP0 (even with SSE).
- CCIfType<[f80], CCAssignToReg<[FP0, FP1]>>
+ // Long double types are always returned in FP0 (even with SSE),
+ // except on Win64.
+ CCIfNotSubtarget<"isTargetWin64()", CCIfType<[f80], CCAssignToReg<[FP0, FP1]>>>
]>;
// X86-32 C return-value convention.
@@ -535,7 +542,7 @@ def CC_X86_64_C : CallingConv<[
// fixed arguments to vararg functions are supposed to be passed in
// registers. Actually modeling that would be a lot of work, though.
CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
- CCIfSubtarget<"hasFp256()",
+ CCIfSubtarget<"hasAVX()",
CCAssignToReg<[YMM0, YMM1, YMM2, YMM3,
YMM4, YMM5, YMM6, YMM7]>>>>,
@@ -586,8 +593,8 @@ def CC_X86_Win64_C : CallingConv<[
// FIXME: Handle byval stuff.
// FIXME: Handle varargs.
- // Promote i1/i8/i16/v1i1 arguments to i32.
- CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
+ // Promote i1/v1i1 arguments to i8.
+ CCIfType<[i1, v1i1], CCPromoteToType<i8>>,
// The 'nest' parameter, if any, is passed in R10.
CCIfNest<CCAssignToReg<[R10]>>,
@@ -605,10 +612,17 @@ def CC_X86_Win64_C : CallingConv<[
// 512 bit vectors are passed by pointer
CCIfType<[v16i32, v16f32, v8f64, v8i64], CCPassIndirect<i64>>,
+ // Long doubles are passed by pointer
+ CCIfType<[f80], CCPassIndirect<i64>>,
+
// The first 4 MMX vector arguments are passed in GPRs.
CCIfType<[x86mmx], CCBitConvertToType<i64>>,
// The first 4 integer arguments are passed in integer registers.
+ CCIfType<[i8 ], CCAssignToRegWithShadow<[CL , DL , R8B , R9B ],
+ [XMM0, XMM1, XMM2, XMM3]>>,
+ CCIfType<[i16], CCAssignToRegWithShadow<[CX , DX , R8W , R9W ],
+ [XMM0, XMM1, XMM2, XMM3]>>,
CCIfType<[i32], CCAssignToRegWithShadow<[ECX , EDX , R8D , R9D ],
[XMM0, XMM1, XMM2, XMM3]>>,
@@ -628,11 +642,7 @@ def CC_X86_Win64_C : CallingConv<[
// Integer/FP values get stored in stack slots that are 8 bytes in size and
// 8-byte aligned if there are no more registers to hold them.
- CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
-
- // Long doubles get stack slots whose size and alignment depends on the
- // subtarget.
- CCIfType<[f80], CCAssignToStack<0, 0>>
+ CCIfType<[i8, i16, i32, i64, f32, f64], CCAssignToStack<8, 8>>
]>;
def CC_X86_Win64_VectorCall : CallingConv<[
@@ -731,7 +741,7 @@ def CC_X86_32_Vector_Standard : CallingConv<[
// AVX 256-bit vector arguments are passed in YMM registers.
CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
- CCIfSubtarget<"hasFp256()",
+ CCIfSubtarget<"hasAVX()",
CCAssignToReg<[YMM0, YMM1, YMM2]>>>>,
// AVX 512-bit vector arguments are passed in ZMM registers.
@@ -750,7 +760,7 @@ def CC_X86_32_Vector_Darwin : CallingConv<[
// AVX 256-bit vector arguments are passed in YMM registers.
CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
- CCIfSubtarget<"hasFp256()",
+ CCIfSubtarget<"hasAVX()",
CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>,
// AVX 512-bit vector arguments are passed in ZMM registers.
@@ -841,13 +851,15 @@ def CC_X86_32_MCU : CallingConv<[
]>;
def CC_X86_32_FastCall : CallingConv<[
- // Promote i1/i8/i16/v1i1 arguments to i32.
- CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
+ // Promote i1 to i8.
+ CCIfType<[i1], CCPromoteToType<i8>>,
// The 'nest' parameter, if any, is passed in EAX.
CCIfNest<CCAssignToReg<[EAX]>>,
// The first 2 integer arguments are passed in ECX/EDX
+ CCIfInReg<CCIfType<[ i8], CCAssignToReg<[ CL, DL]>>>,
+ CCIfInReg<CCIfType<[i16], CCAssignToReg<[ CX, DX]>>>,
CCIfInReg<CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>>,
// Otherwise, same as everything else.
diff --git a/lib/Target/X86/X86CmovConversion.cpp b/lib/Target/X86/X86CmovConversion.cpp
index 489d9d86e254..f73455cc31b8 100644
--- a/lib/Target/X86/X86CmovConversion.cpp
+++ b/lib/Target/X86/X86CmovConversion.cpp
@@ -169,8 +169,8 @@ bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) {
if (!EnableCmovConverter)
return false;
- DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
- << "**********\n");
+ LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
+ << "**********\n");
bool Changed = false;
MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
@@ -178,7 +178,7 @@ bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
TII = STI.getInstrInfo();
TRI = STI.getRegisterInfo();
- TSchedModel.init(STI.getSchedModel(), &STI, TII);
+ TSchedModel.init(&STI);
// Before we handle the more subtle cases of register-register CMOVs inside
// of potentially hot loops, we want to quickly remove all CMOVs with
@@ -295,7 +295,7 @@ bool X86CmovConverterPass::collectCmovCandidates(
for (auto &I : *MBB) {
// Skip debug instructions.
- if (I.isDebugValue())
+ if (I.isDebugInstr())
continue;
X86::CondCode CC = X86::getCondFromCMovOpc(I.getOpcode());
// Check if we found a X86::CMOVrr instruction.
@@ -435,7 +435,7 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates(
RegDefMaps[PhyRegType].clear();
for (MachineInstr &MI : *MBB) {
// Skip debug instructions.
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
unsigned MIDepth = 0;
unsigned MIDepthOpt = 0;
@@ -605,7 +605,7 @@ static void packCmovGroup(MachineInstr *First, MachineInstr *Last) {
SmallVector<MachineInstr *, 2> DBGInstructions;
for (auto I = First->getIterator(), E = Last->getIterator(); I != E; I++) {
- if (I->isDebugValue())
+ if (I->isDebugInstr())
DBGInstructions.push_back(&*I);
}
@@ -776,7 +776,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches(
auto *NewCMOV = NewMIs.pop_back_val();
assert(X86::getCondFromCMovOpc(NewCMOV->getOpcode()) == OppCC &&
"Last new instruction isn't the expected CMOV!");
- DEBUG(dbgs() << "\tRewritten cmov: "; NewCMOV->dump());
+ LLVM_DEBUG(dbgs() << "\tRewritten cmov: "; NewCMOV->dump());
MBB->insert(MachineBasicBlock::iterator(MI), NewCMOV);
if (&*MIItBegin == &MI)
MIItBegin = MachineBasicBlock::iterator(NewCMOV);
@@ -784,7 +784,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches(
// Sink whatever instructions were needed to produce the unfolded operand
// into the false block.
for (auto *NewMI : NewMIs) {
- DEBUG(dbgs() << "\tRewritten load instr: "; NewMI->dump());
+ LLVM_DEBUG(dbgs() << "\tRewritten load instr: "; NewMI->dump());
FalseMBB->insert(FalseInsertionPoint, NewMI);
// Re-map any operands that are from other cmovs to the inputs for this block.
for (auto &MOp : NewMI->uses()) {
@@ -846,8 +846,8 @@ void X86CmovConverterPass::convertCmovInstsToBranches(
.addReg(Op2Reg)
.addMBB(MBB);
(void)MIB;
- DEBUG(dbgs() << "\tFrom: "; MIIt->dump());
- DEBUG(dbgs() << "\tTo: "; MIB->dump());
+ LLVM_DEBUG(dbgs() << "\tFrom: "; MIIt->dump());
+ LLVM_DEBUG(dbgs() << "\tTo: "; MIB->dump());
// Add this PHI to the rewrite table.
RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
diff --git a/lib/Target/X86/X86DomainReassignment.cpp b/lib/Target/X86/X86DomainReassignment.cpp
index ba7280c29cc9..5196446b39e9 100644
--- a/lib/Target/X86/X86DomainReassignment.cpp
+++ b/lib/Target/X86/X86DomainReassignment.cpp
@@ -26,6 +26,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/Printable.h"
#include <bitset>
using namespace llvm;
@@ -262,25 +263,6 @@ public:
}
};
-/// An Instruction Converter which completely deletes an instruction.
-/// For example, IMPLICIT_DEF instructions can be deleted when converting from
-/// GPR to mask.
-class InstrDeleter : public InstrConverterBase {
-public:
- InstrDeleter(unsigned SrcOpcode) : InstrConverterBase(SrcOpcode) {}
-
- bool convertInstr(MachineInstr *MI, const TargetInstrInfo *TII,
- MachineRegisterInfo *MRI) const override {
- assert(isLegal(MI, TII) && "Cannot convert instruction");
- return true;
- }
-
- double getExtraCost(const MachineInstr *MI,
- MachineRegisterInfo *MRI) const override {
- return 0;
- }
-};
-
// Key type to be used by the Instruction Converters map.
// A converter is identified by <destination domain, source opcode>
typedef std::pair<int, unsigned> InstrConverterBaseKeyTy;
@@ -310,8 +292,12 @@ private:
/// Domains which this closure can legally be reassigned to.
std::bitset<NumDomains> LegalDstDomains;
+ /// An ID to uniquely identify this closure, even when it gets
+ /// moved around
+ unsigned ID;
+
public:
- Closure(std::initializer_list<RegDomain> LegalDstDomainList) {
+  Closure(unsigned ID, std::initializer_list<RegDomain> LegalDstDomainList)
+      : ID(ID) {
for (RegDomain D : LegalDstDomainList)
LegalDstDomains.set(D);
}
@@ -347,6 +333,27 @@ public:
return Instrs;
}
+ LLVM_DUMP_METHOD void dump(const MachineRegisterInfo *MRI) const {
+ dbgs() << "Registers: ";
+ bool First = true;
+ for (unsigned Reg : Edges) {
+ if (!First)
+ dbgs() << ", ";
+ First = false;
+ dbgs() << printReg(Reg, MRI->getTargetRegisterInfo(), 0, MRI);
+ }
+ dbgs() << "\n" << "Instructions:";
+ for (MachineInstr *MI : Instrs) {
+ dbgs() << "\n ";
+ MI->print(dbgs());
+ }
+ dbgs() << "\n";
+ }
+
+ unsigned getID() const {
+ return ID;
+ }
+
};
class X86DomainReassignment : public MachineFunctionPass {
@@ -358,7 +365,7 @@ class X86DomainReassignment : public MachineFunctionPass {
DenseSet<unsigned> EnclosedEdges;
/// All instructions that are included in some closure.
- DenseMap<MachineInstr *, Closure *> EnclosedInstrs;
+ DenseMap<MachineInstr *, unsigned> EnclosedInstrs;
public:
static char ID;
@@ -435,14 +442,14 @@ void X86DomainReassignment::visitRegister(Closure &C, unsigned Reg,
void X86DomainReassignment::encloseInstr(Closure &C, MachineInstr *MI) {
auto I = EnclosedInstrs.find(MI);
if (I != EnclosedInstrs.end()) {
- if (I->second != &C)
+ if (I->second != C.getID())
// Instruction already belongs to another closure, avoid conflicts between
// closure and mark this closure as illegal.
C.setAllIllegal();
return;
}
- EnclosedInstrs[MI] = &C;
+ EnclosedInstrs[MI] = C.getID();
C.addInstruction(MI);
// Mark closure as illegal for reassignment to domains, if there is no
@@ -587,7 +594,7 @@ void X86DomainReassignment::initConverters() {
new InstrIgnore(TargetOpcode::PHI);
Converters[{MaskDomain, TargetOpcode::IMPLICIT_DEF}] =
- new InstrDeleter(TargetOpcode::IMPLICIT_DEF);
+ new InstrIgnore(TargetOpcode::IMPLICIT_DEF);
Converters[{MaskDomain, TargetOpcode::INSERT_SUBREG}] =
new InstrReplaceWithCopy(TargetOpcode::INSERT_SUBREG, 2);
@@ -663,8 +670,10 @@ void X86DomainReassignment::initConverters() {
createReplacer(X86::XOR32rr, X86::KXORDrr);
createReplacer(X86::XOR64rr, X86::KXORQrr);
- createReplacer(X86::TEST32rr, X86::KTESTDrr);
- createReplacer(X86::TEST64rr, X86::KTESTQrr);
+ // TODO: KTEST is not a replacement for TEST due to flag differences. Need
+ // to prove only Z flag is used.
+ //createReplacer(X86::TEST32rr, X86::KTESTDrr);
+ //createReplacer(X86::TEST64rr, X86::KTESTQrr);
}
if (STI->hasDQI()) {
@@ -684,8 +693,10 @@ void X86DomainReassignment::initConverters() {
createReplacer(X86::SHR8ri, X86::KSHIFTRBri);
createReplacer(X86::SHL8ri, X86::KSHIFTLBri);
- createReplacer(X86::TEST8rr, X86::KTESTBrr);
- createReplacer(X86::TEST16rr, X86::KTESTWrr);
+ // TODO: KTEST is not a replacement for TEST due to flag differences. Need
+ // to prove only Z flag is used.
+ //createReplacer(X86::TEST8rr, X86::KTESTBrr);
+ //createReplacer(X86::TEST16rr, X86::KTESTWrr);
createReplacer(X86::XOR8rr, X86::KXORBrr);
}
@@ -697,8 +708,9 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) {
if (DisableX86DomainReassignment)
return false;
- DEBUG(dbgs() << "***** Machine Function before Domain Reassignment *****\n");
- DEBUG(MF.print(dbgs()));
+ LLVM_DEBUG(
+ dbgs() << "***** Machine Function before Domain Reassignment *****\n");
+ LLVM_DEBUG(MF.print(dbgs()));
STI = &MF.getSubtarget<X86Subtarget>();
// GPR->K is the only transformation currently supported, bail out early if no
@@ -719,6 +731,7 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) {
std::vector<Closure> Closures;
// Go over all virtual registers and calculate a closure.
+ unsigned ClosureID = 0;
for (unsigned Idx = 0; Idx < MRI->getNumVirtRegs(); ++Idx) {
unsigned Reg = TargetRegisterInfo::index2VirtReg(Idx);
@@ -731,7 +744,7 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) {
continue;
// Calculate closure starting with Reg.
- Closure C({MaskDomain});
+ Closure C(ClosureID++, {MaskDomain});
buildClosure(C, Reg);
// Collect all closures that can potentially be converted.
@@ -739,18 +752,20 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) {
Closures.push_back(std::move(C));
}
- for (Closure &C : Closures)
+ for (Closure &C : Closures) {
+ LLVM_DEBUG(C.dump(MRI));
if (isReassignmentProfitable(C, MaskDomain)) {
reassign(C, MaskDomain);
++NumClosuresConverted;
Changed = true;
}
+ }
- for (auto I : Converters)
- delete I.second;
+ DeleteContainerSeconds(Converters);
- DEBUG(dbgs() << "***** Machine Function after Domain Reassignment *****\n");
- DEBUG(MF.print(dbgs()));
+ LLVM_DEBUG(
+ dbgs() << "***** Machine Function after Domain Reassignment *****\n");
+ LLVM_DEBUG(MF.print(dbgs()));
return Changed;
}
diff --git a/lib/Target/X86/X86EvexToVex.cpp b/lib/Target/X86/X86EvexToVex.cpp
index 6dd4631a4844..80674c7251fe 100755
--- a/lib/Target/X86/X86EvexToVex.cpp
+++ b/lib/Target/X86/X86EvexToVex.cpp
@@ -25,7 +25,6 @@
#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
-#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -42,6 +41,15 @@ using namespace llvm;
struct X86EvexToVexCompressTableEntry {
uint16_t EvexOpcode;
uint16_t VexOpcode;
+
+ bool operator<(const X86EvexToVexCompressTableEntry &RHS) const {
+ return EvexOpcode < RHS.EvexOpcode;
+ }
+
+ friend bool operator<(const X86EvexToVexCompressTableEntry &TE,
+ unsigned Opc) {
+ return TE.EvexOpcode < Opc;
+ }
};
#include "X86GenEVEX2VEXTables.inc"
@@ -54,35 +62,15 @@ namespace {
class EvexToVexInstPass : public MachineFunctionPass {
- /// X86EvexToVexCompressTable - Evex to Vex encoding opcode map.
- using EvexToVexTableType = DenseMap<unsigned, uint16_t>;
- EvexToVexTableType EvexToVex128Table;
- EvexToVexTableType EvexToVex256Table;
-
/// For EVEX instructions that can be encoded using VEX encoding, replace
/// them by the VEX encoding in order to reduce size.
bool CompressEvexToVexImpl(MachineInstr &MI) const;
- /// For initializing the hash map tables of all AVX-512 EVEX
- /// corresponding to AVX/AVX2 opcodes.
- void AddTableEntry(EvexToVexTableType &EvexToVexTable, uint16_t EvexOp,
- uint16_t VexOp);
-
public:
static char ID;
EvexToVexInstPass() : MachineFunctionPass(ID) {
initializeEvexToVexInstPassPass(*PassRegistry::getPassRegistry());
-
- // Initialize the EVEX to VEX 128 table map.
- for (X86EvexToVexCompressTableEntry Entry : X86EvexToVex128CompressTable) {
- AddTableEntry(EvexToVex128Table, Entry.EvexOpcode, Entry.VexOpcode);
- }
-
- // Initialize the EVEX to VEX 256 table map.
- for (X86EvexToVexCompressTableEntry Entry : X86EvexToVex256CompressTable) {
- AddTableEntry(EvexToVex256Table, Entry.EvexOpcode, Entry.VexOpcode);
- }
}
StringRef getPassName() const override { return EVEX2VEX_DESC; }
@@ -127,11 +115,6 @@ bool EvexToVexInstPass::runOnMachineFunction(MachineFunction &MF) {
return Changed;
}
-void EvexToVexInstPass::AddTableEntry(EvexToVexTableType &EvexToVexTable,
- uint16_t EvexOp, uint16_t VexOp) {
- EvexToVexTable[EvexOp] = VexOp;
-}
-
static bool usesExtendedRegister(const MachineInstr &MI) {
auto isHiRegIdx = [](unsigned Reg) {
// Check for XMM register with indexes between 16 - 31.
@@ -164,7 +147,7 @@ static bool usesExtendedRegister(const MachineInstr &MI) {
}
// Do any custom cleanup needed to finalize the conversion.
-static void performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
+static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
(void)NewOpc;
unsigned Opc = MI.getOpcode();
switch (Opc) {
@@ -197,7 +180,31 @@ static void performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
Imm.setImm(0x20 | ((ImmVal & 2) << 3) | (ImmVal & 1));
break;
}
+ case X86::VRNDSCALEPDZ128rri:
+ case X86::VRNDSCALEPDZ128rmi:
+ case X86::VRNDSCALEPSZ128rri:
+ case X86::VRNDSCALEPSZ128rmi:
+ case X86::VRNDSCALEPDZ256rri:
+ case X86::VRNDSCALEPDZ256rmi:
+ case X86::VRNDSCALEPSZ256rri:
+ case X86::VRNDSCALEPSZ256rmi:
+ case X86::VRNDSCALESDZr:
+ case X86::VRNDSCALESDZm:
+ case X86::VRNDSCALESSZr:
+ case X86::VRNDSCALESSZm:
+ case X86::VRNDSCALESDZr_Int:
+ case X86::VRNDSCALESDZm_Int:
+ case X86::VRNDSCALESSZr_Int:
+ case X86::VRNDSCALESSZm_Int:
+ const MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands()-1);
+ int64_t ImmVal = Imm.getImm();
+ // Ensure that only bits 3:0 of the immediate are used.
+ if ((ImmVal & 0xf) != ImmVal)
+ return false;
+ break;
}
+
+ return true;
}
@@ -224,46 +231,44 @@ bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const {
if (Desc.TSFlags & (X86II::EVEX_K | X86II::EVEX_B))
return false;
- // Check for non EVEX_V512 instrs only.
- // EVEX_V512 instr: bit EVEX_L2 = 1; bit VEX_L = 0.
- if ((Desc.TSFlags & X86II::EVEX_L2) && !(Desc.TSFlags & X86II::VEX_L))
+  // Check for EVEX instructions with L2 set. These instructions are 512 bits
+  // wide and can't be converted to VEX.
+ if (Desc.TSFlags & X86II::EVEX_L2)
return false;
- // EVEX_V128 instr: bit EVEX_L2 = 0, bit VEX_L = 0.
- bool IsEVEX_V128 =
- (!(Desc.TSFlags & X86II::EVEX_L2) && !(Desc.TSFlags & X86II::VEX_L));
-
- // EVEX_V256 instr: bit EVEX_L2 = 0, bit VEX_L = 1.
- bool IsEVEX_V256 =
- (!(Desc.TSFlags & X86II::EVEX_L2) && (Desc.TSFlags & X86II::VEX_L));
-
- unsigned NewOpc = 0;
-
- // Check for EVEX_V256 instructions.
- if (IsEVEX_V256) {
- // Search for opcode in the EvexToVex256 table.
- auto It = EvexToVex256Table.find(MI.getOpcode());
- if (It != EvexToVex256Table.end())
- NewOpc = It->second;
- }
- // Check for EVEX_V128 or Scalar instructions.
- else if (IsEVEX_V128) {
- // Search for opcode in the EvexToVex128 table.
- auto It = EvexToVex128Table.find(MI.getOpcode());
- if (It != EvexToVex128Table.end())
- NewOpc = It->second;
+#ifndef NDEBUG
+ // Make sure the tables are sorted.
+ static std::atomic<bool> TableChecked(false);
+ if (!TableChecked.load(std::memory_order_relaxed)) {
+ assert(std::is_sorted(std::begin(X86EvexToVex128CompressTable),
+ std::end(X86EvexToVex128CompressTable)) &&
+ "X86EvexToVex128CompressTable is not sorted!");
+ assert(std::is_sorted(std::begin(X86EvexToVex256CompressTable),
+ std::end(X86EvexToVex256CompressTable)) &&
+ "X86EvexToVex256CompressTable is not sorted!");
+ TableChecked.store(true, std::memory_order_relaxed);
}
+#endif
+
+ // Use the VEX.L bit to select the 128 or 256-bit table.
+ ArrayRef<X86EvexToVexCompressTableEntry> Table =
+ (Desc.TSFlags & X86II::VEX_L) ? makeArrayRef(X86EvexToVex256CompressTable)
+ : makeArrayRef(X86EvexToVex128CompressTable);
- if (!NewOpc)
+ auto I = std::lower_bound(Table.begin(), Table.end(), MI.getOpcode());
+ if (I == Table.end() || I->EvexOpcode != MI.getOpcode())
return false;
+ unsigned NewOpc = I->VexOpcode;
+
if (usesExtendedRegister(MI))
return false;
- performCustomAdjustments(MI, NewOpc);
+ if (!performCustomAdjustments(MI, NewOpc))
+ return false;
MI.setDesc(TII->get(NewOpc));
- MI.setAsmPrinterFlag(AC_EVEX_2_VEX);
+ MI.setAsmPrinterFlag(X86::AC_EVEX_2_VEX);
return true;
}
diff --git a/lib/Target/X86/X86ExpandPseudo.cpp b/lib/Target/X86/X86ExpandPseudo.cpp
index ab2ef26d1cc9..1dd73163080b 100644
--- a/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/lib/Target/X86/X86ExpandPseudo.cpp
@@ -59,12 +59,112 @@ public:
}
private:
+ void ExpandICallBranchFunnel(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator MBBI);
+
bool ExpandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
bool ExpandMBB(MachineBasicBlock &MBB);
};
char X86ExpandPseudo::ID = 0;
} // End anonymous namespace.
+void X86ExpandPseudo::ExpandICallBranchFunnel(
+ MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBI) {
+ MachineBasicBlock *JTMBB = MBB;
+ MachineInstr *JTInst = &*MBBI;
+ MachineFunction *MF = MBB->getParent();
+ const BasicBlock *BB = MBB->getBasicBlock();
+ auto InsPt = MachineFunction::iterator(MBB);
+ ++InsPt;
+
+ std::vector<std::pair<MachineBasicBlock *, unsigned>> TargetMBBs;
+ DebugLoc DL = JTInst->getDebugLoc();
+ MachineOperand Selector = JTInst->getOperand(0);
+ const GlobalValue *CombinedGlobal = JTInst->getOperand(1).getGlobal();
+
+ auto CmpTarget = [&](unsigned Target) {
+ BuildMI(*MBB, MBBI, DL, TII->get(X86::LEA64r), X86::R11)
+ .addReg(X86::RIP)
+ .addImm(1)
+ .addReg(0)
+ .addGlobalAddress(CombinedGlobal,
+ JTInst->getOperand(2 + 2 * Target).getImm())
+ .addReg(0);
+ BuildMI(*MBB, MBBI, DL, TII->get(X86::CMP64rr))
+ .add(Selector)
+ .addReg(X86::R11);
+ };
+
+ auto CreateMBB = [&]() {
+ auto *NewMBB = MF->CreateMachineBasicBlock(BB);
+ MBB->addSuccessor(NewMBB);
+ return NewMBB;
+ };
+
+ auto EmitCondJump = [&](unsigned Opcode, MachineBasicBlock *ThenMBB) {
+ BuildMI(*MBB, MBBI, DL, TII->get(Opcode)).addMBB(ThenMBB);
+
+ auto *ElseMBB = CreateMBB();
+ MF->insert(InsPt, ElseMBB);
+ MBB = ElseMBB;
+ MBBI = MBB->end();
+ };
+
+ auto EmitCondJumpTarget = [&](unsigned Opcode, unsigned Target) {
+ auto *ThenMBB = CreateMBB();
+ TargetMBBs.push_back({ThenMBB, Target});
+ EmitCondJump(Opcode, ThenMBB);
+ };
+
+ auto EmitTailCall = [&](unsigned Target) {
+ BuildMI(*MBB, MBBI, DL, TII->get(X86::TAILJMPd64))
+ .add(JTInst->getOperand(3 + 2 * Target));
+ };
+
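+  // Recursively emit a branch funnel over the jump table targets: for six or
+  // more targets, compare the call target against the address of the middle
+  // entry and branch below / equal / above it (a binary search over the
+  // target addresses); for fewer targets, emit a short linear chain of
+  // compares, each ending in a conditional jump or a tail call.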
+ std::function<void(unsigned, unsigned)> EmitBranchFunnel =
+ [&](unsigned FirstTarget, unsigned NumTargets) {
+ if (NumTargets == 1) {
+ EmitTailCall(FirstTarget);
+ return;
+ }
+
+ if (NumTargets == 2) {
+ CmpTarget(FirstTarget + 1);
+ EmitCondJumpTarget(X86::JB_1, FirstTarget);
+ EmitTailCall(FirstTarget + 1);
+ return;
+ }
+
+ if (NumTargets < 6) {
+ CmpTarget(FirstTarget + 1);
+ EmitCondJumpTarget(X86::JB_1, FirstTarget);
+ EmitCondJumpTarget(X86::JE_1, FirstTarget + 1);
+ EmitBranchFunnel(FirstTarget + 2, NumTargets - 2);
+ return;
+ }
+
+ auto *ThenMBB = CreateMBB();
+ CmpTarget(FirstTarget + (NumTargets / 2));
+ EmitCondJump(X86::JB_1, ThenMBB);
+ EmitCondJumpTarget(X86::JE_1, FirstTarget + (NumTargets / 2));
+ EmitBranchFunnel(FirstTarget + (NumTargets / 2) + 1,
+ NumTargets - (NumTargets / 2) - 1);
+
+ MF->insert(InsPt, ThenMBB);
+ MBB = ThenMBB;
+ MBBI = MBB->end();
+ EmitBranchFunnel(FirstTarget, NumTargets / 2);
+ };
+
+ EmitBranchFunnel(0, (JTInst->getNumOperands() - 2) / 2);
+ for (auto P : TargetMBBs) {
+ MF->insert(InsPt, P.first);
+ BuildMI(P.first, DL, TII->get(X86::TAILJMPd64))
+ .add(JTInst->getOperand(3 + 2 * P.second));
+ }
+ JTMBB->erase(JTInst);
+}
+
/// If \p MBBI is a pseudo instruction, this method expands
/// it to the corresponding (sequence of) actual instruction(s).
/// \returns true if \p MBBI has been expanded.
@@ -106,7 +206,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
if (Offset) {
// Check for possible merge with preceding ADD instruction.
Offset += X86FL->mergeSPUpdates(MBB, MBBI, true);
- X86FL->emitSPUpdate(MBB, MBBI, Offset, /*InEpilogue=*/true);
+ X86FL->emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue=*/true);
}
// Jump to label or value in register.
@@ -186,7 +286,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case X86::IRET: {
// Adjust stack to erase error code
int64_t StackAdj = MBBI->getOperand(0).getImm();
- X86FL->emitSPUpdate(MBB, MBBI, StackAdj, true);
+ X86FL->emitSPUpdate(MBB, MBBI, DL, StackAdj, true);
// Replace pseudo with machine iret
BuildMI(MBB, MBBI, DL,
TII->get(STI->is64Bit() ? X86::IRET64 : X86::IRET32));
@@ -210,7 +310,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
// A ret can only handle immediates as big as 2**16-1. If we need to pop
// off bytes before the return address, we must do it manually.
BuildMI(MBB, MBBI, DL, TII->get(X86::POP32r)).addReg(X86::ECX, RegState::Define);
- X86FL->emitSPUpdate(MBB, MBBI, StackAdj, /*InEpilogue=*/true);
+ X86FL->emitSPUpdate(MBB, MBBI, DL, StackAdj, /*InEpilogue=*/true);
BuildMI(MBB, MBBI, DL, TII->get(X86::PUSH32r)).addReg(X86::ECX);
MIB = BuildMI(MBB, MBBI, DL, TII->get(X86::RETL));
}
@@ -259,6 +359,9 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MBBI->eraseFromParent();
return true;
}
+ case TargetOpcode::ICALL_BRANCH_FUNNEL:
+ ExpandICallBranchFunnel(&MBB, MBBI);
+ return true;
}
llvm_unreachable("Previous switch has a fallthrough?");
}
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 5dae485f4c9f..de8b40f28a86 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -68,7 +68,7 @@ public:
bool fastSelectInstruction(const Instruction *I) override;
- /// \brief The specified machine instr operand is a vreg, and that
+ /// The specified machine instr operand is a vreg, and that
/// vreg is being provided by the specified load instruction. If possible,
/// try to fold the load as an operand to the instruction, returning true if
/// possible.
@@ -134,6 +134,8 @@ private:
bool X86SelectFPExt(const Instruction *I);
bool X86SelectFPTrunc(const Instruction *I);
bool X86SelectSIToFP(const Instruction *I);
+ bool X86SelectUIToFP(const Instruction *I);
+ bool X86SelectIntToFP(const Instruction *I, bool IsSigned);
const X86InstrInfo *getInstrInfo() const {
return Subtarget->getInstrInfo();
@@ -217,7 +219,7 @@ getX86SSEConditionCode(CmpInst::Predicate Predicate) {
return std::make_pair(CC, NeedSwap);
}
-/// \brief Adds a complex addressing mode to the given machine instr builder.
+/// Adds a complex addressing mode to the given machine instr builder.
/// Note, this will constrain the index register. If its not possible to
/// constrain the given index register, then a new one will be created. The
/// IndexReg field of the addressing mode will be updated to match in this case.
@@ -231,7 +233,7 @@ X86FastISel::addFullAddress(const MachineInstrBuilder &MIB,
return ::addFullAddress(MIB, AM);
}
-/// \brief Check if it is possible to fold the condition from the XALU intrinsic
+/// Check if it is possible to fold the condition from the XALU intrinsic
/// into the user. The condition code will only be updated on success.
bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
const Value *Cond) {
@@ -1789,9 +1791,16 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
bool X86FastISel::X86SelectShift(const Instruction *I) {
unsigned CReg = 0, OpReg = 0;
const TargetRegisterClass *RC = nullptr;
- assert(!I->getType()->isIntegerTy(8) &&
- "i8 shifts should be handled by autogenerated table");
- if (I->getType()->isIntegerTy(16)) {
+ if (I->getType()->isIntegerTy(8)) {
+ CReg = X86::CL;
+ RC = &X86::GR8RegClass;
+ switch (I->getOpcode()) {
+ case Instruction::LShr: OpReg = X86::SHR8rCL; break;
+ case Instruction::AShr: OpReg = X86::SAR8rCL; break;
+ case Instruction::Shl: OpReg = X86::SHL8rCL; break;
+ default: return false;
+ }
+ } else if (I->getType()->isIntegerTy(16)) {
CReg = X86::CX;
RC = &X86::GR16RegClass;
switch (I->getOpcode()) {
@@ -1836,10 +1845,10 @@ bool X86FastISel::X86SelectShift(const Instruction *I) {
// The shift instruction uses X86::CL. If we defined a super-register
// of X86::CL, emit a subreg KILL to precisely describe what we're doing here.
- assert(CReg != X86::CL && "CReg should be a super register of CL");
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::KILL), X86::CL)
- .addReg(CReg, RegState::Kill);
+ if (CReg != X86::CL)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::KILL), X86::CL)
+ .addReg(CReg, RegState::Kill);
unsigned ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpReg), ResultReg)
@@ -2012,7 +2021,7 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
return true;
}
-/// \brief Emit a conditional move instruction (if the are supported) to lower
+/// Emit a conditional move instruction (if they are supported) to lower
/// the select.
bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
// Check if the subtarget supports these instructions.
@@ -2141,7 +2150,7 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
return true;
}
-/// \brief Emit SSE or AVX instructions to lower the select.
+/// Emit SSE or AVX instructions to lower the select.
///
/// Try to use SSE1/SSE2 instructions to simulate a select without branches.
/// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary
@@ -2403,15 +2412,19 @@ bool X86FastISel::X86SelectSelect(const Instruction *I) {
return false;
}
-bool X86FastISel::X86SelectSIToFP(const Instruction *I) {
+// Common code for X86SelectSIToFP and X86SelectUIToFP.
+bool X86FastISel::X86SelectIntToFP(const Instruction *I, bool IsSigned) {
// The target-independent selection algorithm in FastISel already knows how
// to select a SINT_TO_FP if the target is SSE but not AVX.
// Early exit if the subtarget doesn't have AVX.
- if (!Subtarget->hasAVX())
+ // Unsigned conversion requires avx512.
+ bool HasAVX512 = Subtarget->hasAVX512();
+ if (!Subtarget->hasAVX() || (!IsSigned && !HasAVX512))
return false;
- Type *InTy = I->getOperand(0)->getType();
- if (!InTy->isIntegerTy(32) && !InTy->isIntegerTy(64))
+ // TODO: We could sign extend narrower types.
+ MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
+ if (SrcVT != MVT::i32 && SrcVT != MVT::i64)
return false;
// Select integer to float/double conversion.
@@ -2419,20 +2432,31 @@ bool X86FastISel::X86SelectSIToFP(const Instruction *I) {
if (OpReg == 0)
return false;
- const TargetRegisterClass *RC = nullptr;
unsigned Opcode;
+ static const uint16_t SCvtOpc[2][2][2] = {
+ { { X86::VCVTSI2SSrr, X86::VCVTSI642SSrr },
+ { X86::VCVTSI2SDrr, X86::VCVTSI642SDrr } },
+ { { X86::VCVTSI2SSZrr, X86::VCVTSI642SSZrr },
+ { X86::VCVTSI2SDZrr, X86::VCVTSI642SDZrr } },
+ };
+ static const uint16_t UCvtOpc[2][2] = {
+ { X86::VCVTUSI2SSZrr, X86::VCVTUSI642SSZrr },
+ { X86::VCVTUSI2SDZrr, X86::VCVTUSI642SDZrr },
+ };
+ bool Is64Bit = SrcVT == MVT::i64;
+
if (I->getType()->isDoubleTy()) {
- // sitofp int -> double
- Opcode = InTy->isIntegerTy(64) ? X86::VCVTSI642SDrr : X86::VCVTSI2SDrr;
- RC = &X86::FR64RegClass;
+ // s/uitofp int -> double
+ Opcode = IsSigned ? SCvtOpc[HasAVX512][1][Is64Bit] : UCvtOpc[1][Is64Bit];
} else if (I->getType()->isFloatTy()) {
- // sitofp int -> float
- Opcode = InTy->isIntegerTy(64) ? X86::VCVTSI642SSrr : X86::VCVTSI2SSrr;
- RC = &X86::FR32RegClass;
+ // s/uitofp int -> float
+ Opcode = IsSigned ? SCvtOpc[HasAVX512][0][Is64Bit] : UCvtOpc[0][Is64Bit];
} else
return false;
+ MVT DstVT = TLI.getValueType(DL, I->getType()).getSimpleVT();
+ const TargetRegisterClass *RC = TLI.getRegClassFor(DstVT);
unsigned ImplicitDefReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
@@ -2442,6 +2466,14 @@ bool X86FastISel::X86SelectSIToFP(const Instruction *I) {
return true;
}
+bool X86FastISel::X86SelectSIToFP(const Instruction *I) {
+ return X86SelectIntToFP(I, /*IsSigned*/true);
+}
+
+bool X86FastISel::X86SelectUIToFP(const Instruction *I) {
+ return X86SelectIntToFP(I, /*IsSigned*/false);
+}
+
// Helper method used by X86SelectFPExt and X86SelectFPTrunc.
bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I,
unsigned TargetOpc,
@@ -2675,7 +2707,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
(FrameReg == X86::EBP && VT == MVT::i32)) &&
"Invalid Frame Register!");
- // Always make a copy of the frame register to to a vreg first, so that we
+ // Always make a copy of the frame register to a vreg first, so that we
// never directly reference the frame register (the TwoAddressInstruction-
// Pass doesn't like that).
unsigned SrcReg = createResultReg(RC);
@@ -2726,7 +2758,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
if (MCI->getSourceAddressSpace() > 255 || MCI->getDestAddressSpace() > 255)
return false;
- return lowerCallTo(II, "memcpy", II->getNumArgOperands() - 2);
+ return lowerCallTo(II, "memcpy", II->getNumArgOperands() - 1);
}
case Intrinsic::memset: {
const MemSetInst *MSI = cast<MemSetInst>(II);
@@ -2741,7 +2773,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
if (MSI->getDestAddressSpace() > 255)
return false;
- return lowerCallTo(II, "memset", II->getNumArgOperands() - 2);
+ return lowerCallTo(II, "memset", II->getNumArgOperands() - 1);
}
case Intrinsic::stackprotector: {
// Emit code to store the stack guard onto the stack.
@@ -2792,17 +2824,19 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
// Unfortunately we can't use fastEmit_r, because the AVX version of FSQRT
// is not generated by FastISel yet.
// FIXME: Update this code once tablegen can handle it.
- static const uint16_t SqrtOpc[2][2] = {
- {X86::SQRTSSr, X86::VSQRTSSr},
- {X86::SQRTSDr, X86::VSQRTSDr}
+ static const uint16_t SqrtOpc[3][2] = {
+ { X86::SQRTSSr, X86::SQRTSDr },
+ { X86::VSQRTSSr, X86::VSQRTSDr },
+ { X86::VSQRTSSZr, X86::VSQRTSDZr },
};
- bool HasAVX = Subtarget->hasAVX();
+ unsigned AVXLevel = Subtarget->hasAVX512() ? 2 :
+ Subtarget->hasAVX() ? 1 :
+ 0;
unsigned Opc;
- const TargetRegisterClass *RC;
switch (VT.SimpleTy) {
default: return false;
- case MVT::f32: Opc = SqrtOpc[0][HasAVX]; RC = &X86::FR32RegClass; break;
- case MVT::f64: Opc = SqrtOpc[1][HasAVX]; RC = &X86::FR64RegClass; break;
+ case MVT::f32: Opc = SqrtOpc[AVXLevel][0]; break;
+ case MVT::f64: Opc = SqrtOpc[AVXLevel][1]; break;
}
const Value *SrcVal = II->getArgOperand(0);
@@ -2811,8 +2845,9 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
if (SrcReg == 0)
return false;
+ const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
unsigned ImplicitDefReg = 0;
- if (HasAVX) {
+ if (AVXLevel > 0) {
ImplicitDefReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
@@ -2989,18 +3024,22 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
if (!isTypeLegal(RetTy, VT))
return false;
- static const uint16_t CvtOpc[2][2][2] = {
- { { X86::CVTTSS2SIrr, X86::VCVTTSS2SIrr },
- { X86::CVTTSS2SI64rr, X86::VCVTTSS2SI64rr } },
- { { X86::CVTTSD2SIrr, X86::VCVTTSD2SIrr },
- { X86::CVTTSD2SI64rr, X86::VCVTTSD2SI64rr } }
+ static const uint16_t CvtOpc[3][2][2] = {
+ { { X86::CVTTSS2SIrr, X86::CVTTSS2SI64rr },
+ { X86::CVTTSD2SIrr, X86::CVTTSD2SI64rr } },
+ { { X86::VCVTTSS2SIrr, X86::VCVTTSS2SI64rr },
+ { X86::VCVTTSD2SIrr, X86::VCVTTSD2SI64rr } },
+ { { X86::VCVTTSS2SIZrr, X86::VCVTTSS2SI64Zrr },
+ { X86::VCVTTSD2SIZrr, X86::VCVTTSD2SI64Zrr } },
};
- bool HasAVX = Subtarget->hasAVX();
+ unsigned AVXLevel = Subtarget->hasAVX512() ? 2 :
+ Subtarget->hasAVX() ? 1 :
+ 0;
unsigned Opc;
switch (VT.SimpleTy) {
default: llvm_unreachable("Unexpected result type.");
- case MVT::i32: Opc = CvtOpc[IsInputDouble][0][HasAVX]; break;
- case MVT::i64: Opc = CvtOpc[IsInputDouble][1][HasAVX]; break;
+ case MVT::i32: Opc = CvtOpc[AVXLevel][IsInputDouble][0]; break;
+ case MVT::i64: Opc = CvtOpc[AVXLevel][IsInputDouble][1]; break;
}
// Check if we can fold insertelement instructions into the convert.
@@ -3167,11 +3206,22 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
CLI.CS ? dyn_cast<CallInst>(CLI.CS->getInstruction()) : nullptr;
const Function *CalledFn = CI ? CI->getCalledFunction() : nullptr;
+ // Call / invoke instructions with NoCfCheck attribute require special
+ // handling.
+ const auto *II =
+ CLI.CS ? dyn_cast<InvokeInst>(CLI.CS->getInstruction()) : nullptr;
+ if ((CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck()))
+ return false;
+
// Functions with no_caller_saved_registers that need special handling.
if ((CI && CI->hasFnAttr("no_caller_saved_registers")) ||
(CalledFn && CalledFn->hasFnAttribute("no_caller_saved_registers")))
return false;
+ // Functions using retpoline should use SDISel for calls.
+ if (Subtarget->useRetpoline())
+ return false;
+
// Handle only C, fastcc, and webkit_js calling conventions for now.
switch (CC) {
default: return false;
@@ -3598,6 +3648,8 @@ X86FastISel::fastSelectInstruction(const Instruction *I) {
return X86SelectFPTrunc(I);
case Instruction::SIToFP:
return X86SelectSIToFP(I);
+ case Instruction::UIToFP:
+ return X86SelectUIToFP(I);
case Instruction::IntToPtr: // Deliberate fall-through.
case Instruction::PtrToInt: {
EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
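Several of the X86FastISel hunks above replace opcode tables indexed by a HasAVX boolean with three-row tables indexed by an SSE/AVX/AVX-512 tier. A minimal standalone sketch of that indexing convention, reusing the patch's sqrt opcode names as plain strings for illustration only:

#include <cstdio>

// Tier 0 = baseline SSE, 1 = AVX, 2 = AVX-512; columns are f32 vs f64.
static const char *SqrtName[3][2] = {
    {"SQRTSSr",   "SQRTSDr"},
    {"VSQRTSSr",  "VSQRTSDr"},
    {"VSQRTSSZr", "VSQRTSDZr"},
};

static const char *pickSqrt(bool HasAVX, bool HasAVX512, bool IsDouble) {
  unsigned AVXLevel = HasAVX512 ? 2 : HasAVX ? 1 : 0;
  return SqrtName[AVXLevel][IsDouble];
}

int main() {
  // An AVX-only subtarget selecting the f64 variant picks the VEX encoding.
  std::printf("%s\n", pickSqrt(/*HasAVX=*/true, /*HasAVX512=*/false,
                               /*IsDouble=*/true)); // VSQRTSDr
}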
diff --git a/lib/Target/X86/X86FixupBWInsts.cpp b/lib/Target/X86/X86FixupBWInsts.cpp
index 01d10fe4cae4..d9bf60c2c9fb 100644
--- a/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/lib/Target/X86/X86FixupBWInsts.cpp
@@ -155,93 +155,18 @@ bool FixupBWInstPass::runOnMachineFunction(MachineFunction &MF) {
MLI = &getAnalysis<MachineLoopInfo>();
LiveRegs.init(TII->getRegisterInfo());
- DEBUG(dbgs() << "Start X86FixupBWInsts\n";);
+ LLVM_DEBUG(dbgs() << "Start X86FixupBWInsts\n";);
// Process all basic blocks.
for (auto &MBB : MF)
processBasicBlock(MF, MBB);
- DEBUG(dbgs() << "End X86FixupBWInsts\n";);
+ LLVM_DEBUG(dbgs() << "End X86FixupBWInsts\n";);
return true;
}
-/// Check if register \p Reg is live after the \p MI.
-///
-/// \p LiveRegs should be in a state describing liveness information in
-/// that exact place as this function tries to precise analysis made
-/// by \p LiveRegs by exploiting the information about particular
-/// instruction \p MI. \p MI is expected to be one of the MOVs handled
-/// by the x86FixupBWInsts pass.
-/// Note: similar to LivePhysRegs::contains this would state that
-/// super-register is not used if only some part of it is used.
-///
-/// X86 backend does not have subregister liveness tracking enabled,
-/// so liveness information might be overly conservative. However, for
-/// some specific instructions (this pass only cares about MOVs) we can
-/// produce more precise results by analysing that MOV's operands.
-///
-/// Indeed, if super-register is not live before the mov it means that it
-/// was originally <read-undef> and so we are free to modify these
-/// undef upper bits. That may happen in case where the use is in another MBB
-/// and the vreg/physreg corresponding to the move has higher width than
-/// necessary (e.g. due to register coalescing with a "truncate" copy).
-/// So, it handles pattern like this:
-///
-/// %bb.2: derived from LLVM BB %if.then
-/// Live Ins: %rdi
-/// Predecessors according to CFG: %bb.0
-/// %ax = MOV16rm killed %rdi, 1, %noreg, 0, %noreg, implicit-def %eax;
-/// mem:LD2[%p]
-/// No implicit %eax
-/// Successors according to CFG: %bb.3(?%)
-///
-/// %bb.3: derived from LLVM BB %if.end
-/// Live Ins: %eax Only %ax is actually live
-/// Predecessors according to CFG: %bb.2 %bb.1
-/// %ax = KILL %ax, implicit killed %eax
-/// RET 0, %ax
-static bool isLive(const MachineInstr &MI,
- const LivePhysRegs &LiveRegs,
- const TargetRegisterInfo *TRI,
- unsigned Reg) {
- if (!LiveRegs.contains(Reg))
- return false;
-
- unsigned Opc = MI.getOpcode(); (void)Opc;
- // These are the opcodes currently handled by the pass, if something
- // else will be added we need to ensure that new opcode has the same
- // properties.
- assert((Opc == X86::MOV8rm || Opc == X86::MOV16rm || Opc == X86::MOV8rr ||
- Opc == X86::MOV16rr) &&
- "Unexpected opcode.");
-
- bool IsDefined = false;
- for (auto &MO: MI.implicit_operands()) {
- if (!MO.isReg())
- continue;
-
- assert((MO.isDef() || MO.isUse()) && "Expected Def or Use only!");
-
- for (MCSuperRegIterator Supers(Reg, TRI, true); Supers.isValid(); ++Supers) {
- if (*Supers == MO.getReg()) {
- if (MO.isDef())
- IsDefined = true;
- else
- return true; // SuperReg Imp-used' -> live before the MI
- }
- }
- }
- // Reg is not Imp-def'ed -> it's live both before/after the instruction.
- if (!IsDefined)
- return true;
-
- // Otherwise, the Reg is not live before the MI and the MOV can't
- // make it really live, so it's in fact dead even after the MI.
- return false;
-}
-
-/// \brief Check if after \p OrigMI the only portion of super register
+/// Check if after \p OrigMI the only portion of super register
/// of the destination register of \p OrigMI that is alive is that
/// destination register.
///
@@ -262,20 +187,85 @@ bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI,
if (SubRegIdx == X86::sub_8bit_hi)
return false;
- if (isLive(*OrigMI, LiveRegs, TRI, SuperDestReg))
- return false;
+ // If neither the destination-super register nor any applicable subregisters
+ // are live after this instruction, then the super register is safe to use.
+ if (!LiveRegs.contains(SuperDestReg)) {
+ // If the original destination register was not the low 8-bit subregister
+ // then the super register check is sufficient.
+ if (SubRegIdx != X86::sub_8bit)
+ return true;
+ // If the original destination register was the low 8-bit subregister, then
+ // we also need to check the 16-bit subregister and the high 8-bit
+ // subregister.
+ if (!LiveRegs.contains(getX86SubSuperRegister(OrigDestReg, 16)) &&
+ !LiveRegs.contains(getX86SubSuperRegister(SuperDestReg, 8,
+ /*High=*/true)))
+ return true;
+ // Otherwise, we have a little more checking to do.
+ }
+
+ // If we get here, the super-register destination (or some part of it) is
+ // marked as live after the original instruction.
+ //
+ // The X86 backend does not have subregister liveness tracking enabled,
+ // so liveness information might be overly conservative. Specifically, the
+ // super register might be marked as live because it is implicitly defined
+ // by the instruction we are examining.
+ //
+ // However, for some specific instructions (this pass only cares about MOVs)
+ // we can produce more precise results by analysing that MOV's operands.
+ //
+ // Indeed, if the super-register is not live before the mov, it means that
+ // it was originally <read-undef> and so we are free to modify these
+ // undef upper bits. That may happen in the case where the use is in another
+ // MBB and the vreg/physreg corresponding to the move is wider than
+ // necessary (e.g. due to register coalescing with a "truncate" copy).
+ // So, we would like to handle patterns like this:
+ //
+ // %bb.2: derived from LLVM BB %if.then
+ // Live Ins: %rdi
+ // Predecessors according to CFG: %bb.0
+ // %ax<def> = MOV16rm killed %rdi, 1, %noreg, 0, %noreg, implicit-def %eax
+ // ; No implicit %eax
+ // Successors according to CFG: %bb.3(?%)
+ //
+ // %bb.3: derived from LLVM BB %if.end
+ // Live Ins: %eax Only %ax is actually live
+ // Predecessors according to CFG: %bb.2 %bb.1
+ // %ax = KILL %ax, implicit killed %eax
+ // RET 0, %ax
+ unsigned Opc = OrigMI->getOpcode(); (void)Opc;
+ // These are the opcodes currently handled by the pass; if anything else is
+ // added, we need to ensure that the new opcode has the same properties.
+ assert((Opc == X86::MOV8rm || Opc == X86::MOV16rm || Opc == X86::MOV8rr ||
+ Opc == X86::MOV16rr) &&
+ "Unexpected opcode.");
- if (SubRegIdx == X86::sub_8bit) {
- // In the case of byte registers, we also have to check that the upper
- // byte register is also dead. That is considered to be independent of
- // whether the super-register is dead.
- unsigned UpperByteReg =
- getX86SubSuperRegister(SuperDestReg, 8, /*High=*/true);
+ bool IsDefined = false;
+ for (auto &MO: OrigMI->implicit_operands()) {
+ if (!MO.isReg())
+ continue;
+
+ assert((MO.isDef() || MO.isUse()) && "Expected Def or Use only!");
- if (isLive(*OrigMI, LiveRegs, TRI, UpperByteReg))
+ if (MO.isDef() && TRI->isSuperRegisterEq(OrigDestReg, MO.getReg()))
+ IsDefined = true;
+
+ // If MO is a use of any part of the destination register but is not equal
+ // to OrigDestReg or one of its subregisters, we cannot use SuperDestReg.
+ // For example, if OrigDestReg is %al then an implicit use of %ah, %ax,
+ // %eax, or %rax will prevent us from using the %eax register.
+ if (MO.isUse() && !TRI->isSubRegisterEq(OrigDestReg, MO.getReg()) &&
+ TRI->regsOverlap(SuperDestReg, MO.getReg()))
return false;
}
+ // Reg is not Imp-def'ed -> it's live both before/after the instruction.
+ if (!IsDefined)
+ return false;
+ // Otherwise, the Reg is not live before the MI and the MOV can't
+ // make it really live, so it's in fact dead even after the MI.
return true;
}
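The rewritten getSuperRegDestIfDead first checks whether the 32-bit super register is live after the MOV, and for byte definitions also checks the overlapping 16-bit and high 8-bit registers, before falling back to inspecting the MOV's implicit operands. A simplified standalone model of that first check, with a bitset standing in for LivePhysRegs and the implicit-operand fallback omitted:

#include <bitset>
#include <cstdio>

enum Reg { AL, AH, AX, EAX, NumRegs };

// OrigDest is AL or AX (MOV8*/MOV16* destinations; %ah destinations are
// rejected earlier, as in the pass).
static bool superRegDestLooksDead(const std::bitset<NumRegs> &LiveAfter,
                                  Reg OrigDest) {
  if (LiveAfter[EAX])
    return false;             // the pass then inspects implicit operands
  if (OrigDest == AL)         // widening a byte def also requires %ax and
    return !LiveAfter[AX] && !LiveAfter[AH]; // %ah to be dead afterwards
  return true;                // OrigDest == AX: the EAX check suffices
}

int main() {
  std::bitset<NumRegs> LiveAfter;
  LiveAfter[AH] = true;       // %ah is read later in the block
  // A MOV8rm defining %al cannot be widened to define %eax here.
  std::printf("%d\n", superRegDestLooksDead(LiveAfter, AL)); // 0
}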
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index b41bf99f19b2..d85389a0a7f1 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -20,7 +20,7 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -40,13 +40,13 @@ namespace {
class FixupLEAPass : public MachineFunctionPass {
enum RegUsageState { RU_NotUsed, RU_Write, RU_Read };
- /// \brief Loop over all of the instructions in the basic block
+ /// Loop over all of the instructions in the basic block
/// replacing applicable instructions with LEA instructions,
/// where appropriate.
bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI);
- /// \brief Given a machine register, look for the instruction
+ /// Given a machine register, look for the instruction
/// which writes it in the current basic block. If found,
/// try to replace it with an equivalent LEA instruction.
/// If replacement succeeds, then also process the newly created
@@ -54,20 +54,20 @@ class FixupLEAPass : public MachineFunctionPass {
void seekLEAFixup(MachineOperand &p, MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI);
- /// \brief Given a memory access or LEA instruction
+ /// Given a memory access or LEA instruction
/// whose address mode uses a base and/or index register, look for
/// an opportunity to replace the instruction which sets the base or index
/// register with an equivalent LEA instruction.
void processInstruction(MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI);
- /// \brief Given a LEA instruction which is unprofitable
+ /// Given a LEA instruction which is unprofitable
/// on Silvermont try to replace it with an equivalent ADD instruction
void processInstructionForSLM(MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI);
- /// \brief Given a LEA instruction which is unprofitable
+ /// Given a LEA instruction which is unprofitable
/// on SNB+ try to replace it with other instructions.
/// According to Intel's Optimization Reference Manual:
/// " For LEA instructions with three source operands and some specific
@@ -82,23 +82,23 @@ class FixupLEAPass : public MachineFunctionPass {
MachineInstr *processInstrForSlow3OpLEA(MachineInstr &MI,
MachineFunction::iterator MFI);
- /// \brief Look for LEAs that add 1 to reg or subtract 1 from reg
+ /// Look for LEAs that add 1 to reg or subtract 1 from reg
/// and convert them to INC or DEC respectively.
bool fixupIncDec(MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI) const;
- /// \brief Determine if an instruction references a machine register
+ /// Determine if an instruction references a machine register
/// and, if so, whether it reads or writes the register.
RegUsageState usesRegister(MachineOperand &p, MachineBasicBlock::iterator I);
- /// \brief Step backwards through a basic block, looking
+ /// Step backwards through a basic block, looking
/// for an instruction which writes a register within
/// a maximum of INSTR_DISTANCE_THRESHOLD instruction latency cycles.
MachineBasicBlock::iterator searchBackwards(MachineOperand &p,
MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI);
- /// \brief if an instruction can be converted to an
+ /// If an instruction can be converted to an
/// equivalent LEA, insert the new instruction into the basic block
/// and return a pointer to it. Otherwise, return zero.
MachineInstr *postRAConvertToLEA(MachineFunction::iterator &MFI,
@@ -113,7 +113,7 @@ public:
initializeFixupLEAPassPass(*PassRegistry::getPassRegistry());
}
- /// \brief Loop over all of the basic blocks,
+ /// Loop over all of the basic blocks,
/// replacing instructions by equivalent LEA instructions
/// if needed and when possible.
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -125,6 +125,7 @@ public:
}
private:
+ TargetSchedModel TSM;
MachineFunction *MF;
const X86InstrInfo *TII; // Machine instruction info.
bool OptIncDec;
@@ -202,13 +203,14 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
if (!OptLEA && !OptIncDec)
return false;
+ TSM.init(&Func.getSubtarget());
TII = ST.getInstrInfo();
- DEBUG(dbgs() << "Start X86FixupLEAs\n";);
+ LLVM_DEBUG(dbgs() << "Start X86FixupLEAs\n";);
// Process all basic blocks.
for (MachineFunction::iterator I = Func.begin(), E = Func.end(); I != E; ++I)
processBasicBlock(Func, I);
- DEBUG(dbgs() << "End X86FixupLEAs\n";);
+ LLVM_DEBUG(dbgs() << "End X86FixupLEAs\n";);
return true;
}
@@ -264,8 +266,7 @@ FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I,
if (usesRegister(p, CurInst) == RU_Write) {
return CurInst;
}
- InstrDistance += TII->getInstrLatency(
- MF->getSubtarget().getInstrItineraryData(), *CurInst);
+ InstrDistance += TSM.computeInstrLatency(&*CurInst);
Found = getPreviousInstr(CurInst, MFI);
}
return MachineBasicBlock::iterator();
@@ -285,6 +286,8 @@ static inline bool isRegOperand(const MachineOperand &Op) {
}
/// hasInefficientLEABaseReg - LEA that uses base and index registers
/// where the base is EBP, RBP, or R13
+// TODO: use a variant scheduling class to model the latency profile
+// of LEA instructions, and implement this logic as a scheduling predicate.
static inline bool hasInefficientLEABaseReg(const MachineOperand &Base,
const MachineOperand &Index) {
return Base.isReg() && isInefficientLEAReg(Base.getReg()) &&
@@ -295,13 +298,6 @@ static inline bool hasLEAOffset(const MachineOperand &Offset) {
return (Offset.isImm() && Offset.getImm() != 0) || Offset.isGlobal();
}
-// LEA instruction that has all three operands: offset, base and index
-static inline bool isThreeOperandsLEA(const MachineOperand &Base,
- const MachineOperand &Index,
- const MachineOperand &Offset) {
- return isRegOperand(Base) && isRegOperand(Index) && hasLEAOffset(Offset);
-}
-
static inline int getADDrrFromLEA(int LEAOpcode) {
switch (LEAOpcode) {
default:
@@ -407,9 +403,9 @@ void FixupLEAPass::seekLEAFixup(MachineOperand &p,
MachineInstr *NewMI = postRAConvertToLEA(MFI, MBI);
if (NewMI) {
++NumLEAs;
- DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MBI->dump(););
+ LLVM_DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MBI->dump(););
// now to replace with an equivalent LEA...
- DEBUG(dbgs() << "FixLEA: Replaced by: "; NewMI->dump(););
+ LLVM_DEBUG(dbgs() << "FixLEA: Replaced by: "; NewMI->dump(););
MFI->erase(MBI);
MachineBasicBlock::iterator J =
static_cast<MachineBasicBlock::iterator>(NewMI);
@@ -434,8 +430,8 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
return;
if (MI.getOperand(2).getImm() > 1)
return;
- DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump(););
- DEBUG(dbgs() << "FixLEA: Replaced by: ";);
+ LLVM_DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump(););
+ LLVM_DEBUG(dbgs() << "FixLEA: Replaced by: ";);
MachineInstr *NewMI = nullptr;
// Make ADD instruction for two registers writing to LEA's destination
if (SrcR1 != 0 && SrcR2 != 0) {
@@ -443,7 +439,7 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
const MachineOperand &Src = MI.getOperand(SrcR1 == DstR ? 3 : 1);
NewMI =
BuildMI(*MFI, I, MI.getDebugLoc(), ADDrr, DstR).addReg(DstR).add(Src);
- DEBUG(NewMI->dump(););
+ LLVM_DEBUG(NewMI->dump(););
}
// Make ADD instruction for immediate
if (MI.getOperand(4).getImm() != 0) {
@@ -453,7 +449,7 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
NewMI = BuildMI(*MFI, I, MI.getDebugLoc(), ADDri, DstR)
.add(SrcR)
.addImm(MI.getOperand(4).getImm());
- DEBUG(NewMI->dump(););
+ LLVM_DEBUG(NewMI->dump(););
}
if (NewMI) {
MFI->erase(I);
@@ -476,7 +472,7 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
const MachineOperand &Offset = MI.getOperand(4);
const MachineOperand &Segment = MI.getOperand(5);
- if (!(isThreeOperandsLEA(Base, Index, Offset) ||
+ if (!(TII->isThreeOperandsLEA(MI) ||
hasInefficientLEABaseReg(Base, Index)) ||
!TII->isSafeToClobberEFLAGS(*MFI, MI) ||
Segment.getReg() != X86::NoRegister)
@@ -503,8 +499,8 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(LEAOpcode));
const MCInstrDesc &ADDri = TII->get(getADDriFromLEA(LEAOpcode, Offset));
- DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MI.dump(););
- DEBUG(dbgs() << "FixLEA: Replaced by: ";);
+ LLVM_DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MI.dump(););
+ LLVM_DEBUG(dbgs() << "FixLEA: Replaced by: ";);
// First try to replace LEA with one or two (for the 3-op LEA case)
// add instructions:
@@ -514,11 +510,11 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
const MachineOperand &Src = DstR == BaseR ? Index : Base;
MachineInstr *NewMI =
BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Src);
- DEBUG(NewMI->dump(););
+ LLVM_DEBUG(NewMI->dump(););
// Create ADD instruction for the Offset in case of 3-Ops LEA.
if (hasLEAOffset(Offset)) {
NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
- DEBUG(NewMI->dump(););
+ LLVM_DEBUG(NewMI->dump(););
}
return NewMI;
}
@@ -534,11 +530,11 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
.add(IsInefficientBase ? Base : Index)
.addImm(0)
.add(Segment);
- DEBUG(NewMI->dump(););
+ LLVM_DEBUG(NewMI->dump(););
// Create ADD instruction for the Offset in case of 3-Ops LEA.
if (hasLEAOffset(Offset)) {
NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
- DEBUG(NewMI->dump(););
+ LLVM_DEBUG(NewMI->dump(););
}
return NewMI;
}
@@ -548,12 +544,13 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
// lea (%base,%index,1), %dst => mov %base,%dst; add %index,%dst
if (IsScale1 && !hasLEAOffset(Offset)) {
- TII->copyPhysReg(*MFI, MI, DL, DstR, BaseR, Base.isKill());
- DEBUG(MI.getPrevNode()->dump(););
+ bool BIK = Base.isKill() && BaseR != IndexR;
+ TII->copyPhysReg(*MFI, MI, DL, DstR, BaseR, BIK);
+ LLVM_DEBUG(MI.getPrevNode()->dump(););
MachineInstr *NewMI =
BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Index);
- DEBUG(NewMI->dump(););
+ LLVM_DEBUG(NewMI->dump(););
return NewMI;
}
// lea offset(%base,%index,scale), %dst =>
@@ -565,10 +562,10 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
.add(Index)
.add(Offset)
.add(Segment);
- DEBUG(NewMI->dump(););
+ LLVM_DEBUG(NewMI->dump(););
NewMI = BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Base);
- DEBUG(NewMI->dump(););
+ LLVM_DEBUG(NewMI->dump(););
return NewMI;
}
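One of the slow-LEA rewrites above turns lea (%base,%index,1), %dst into a register copy followed by an ADD, and the new BIK computation drops the kill flag on the base copy whenever base and index are the same register, since the ADD still reads it. A toy standalone rendering of that rewrite as text; the assembly strings are illustrative only, not emitted output:

#include <iostream>
#include <string>

// Mirror of the patch's kill-flag rule: the base may only be marked killed by
// the copy if the following ADD does not read the same register as the index.
static std::string rewriteScale1LEA(const std::string &Dst,
                                    const std::string &Base,
                                    const std::string &Index,
                                    bool BaseIsKill) {
  bool BIK = BaseIsKill && Base != Index;
  return "mov " + Base + (BIK ? "<kill>" : "") + ", " + Dst +
         " ; add " + Index + ", " + Dst;
}

int main() {
  // lea (%rax,%rax,1), %rdx with a killed base: the kill must be dropped.
  std::cout << rewriteScale1LEA("%rdx", "%rax", "%rax", /*BaseIsKill=*/true)
            << "\n"; // mov %rax, %rdx ; add %rax, %rdx
}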
diff --git a/lib/Target/X86/X86FlagsCopyLowering.cpp b/lib/Target/X86/X86FlagsCopyLowering.cpp
new file mode 100644
index 000000000000..1ba08d39c595
--- /dev/null
+++ b/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -0,0 +1,1052 @@
+//====- X86FlagsCopyLowering.cpp - Lowers COPY nodes of EFLAGS ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// Lowers COPY nodes of EFLAGS by directly extracting and preserving individual
+/// flag bits.
+///
+/// We have to do this by carefully analyzing and rewriting the usage of the
+/// copied EFLAGS register because there is no general way to rematerialize the
+/// entire EFLAGS register safely and efficiently. Using `popf` both forces
+/// dynamic stack adjustment and can create correctness issues due to IF, TF,
+/// and other non-status flags being overwritten. Using sequences involving
+/// SAHF don't work on all x86 processors and are often quite slow compared to
+/// directly testing a single status preserved in its own GPR.
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSSAUpdater.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+
+#define PASS_KEY "x86-flags-copy-lowering"
+#define DEBUG_TYPE PASS_KEY
+
+STATISTIC(NumCopiesEliminated, "Number of copies of EFLAGS eliminated");
+STATISTIC(NumSetCCsInserted, "Number of setCC instructions inserted");
+STATISTIC(NumTestsInserted, "Number of test instructions inserted");
+STATISTIC(NumAddsInserted, "Number of add instructions inserted");
+
+namespace llvm {
+
+void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);
+
+} // end namespace llvm
+
+namespace {
+
+// Convenient array type for storing registers associated with each condition.
+using CondRegArray = std::array<unsigned, X86::LAST_VALID_COND + 1>;
+
+class X86FlagsCopyLoweringPass : public MachineFunctionPass {
+public:
+ X86FlagsCopyLoweringPass() : MachineFunctionPass(ID) {
+ initializeX86FlagsCopyLoweringPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override { return "X86 EFLAGS copy lowering"; }
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ /// Pass identification, replacement for typeid.
+ static char ID;
+
+private:
+ MachineRegisterInfo *MRI;
+ const X86InstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ const TargetRegisterClass *PromoteRC;
+ MachineDominatorTree *MDT;
+
+ CondRegArray collectCondsInRegs(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator CopyDefI);
+
+ unsigned promoteCondToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator TestPos,
+ DebugLoc TestLoc, X86::CondCode Cond);
+ std::pair<unsigned, bool>
+ getCondOrInverseInReg(MachineBasicBlock &TestMBB,
+ MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
+ X86::CondCode Cond, CondRegArray &CondRegs);
+ void insertTest(MachineBasicBlock &MBB, MachineBasicBlock::iterator Pos,
+ DebugLoc Loc, unsigned Reg);
+
+ void rewriteArithmetic(MachineBasicBlock &TestMBB,
+ MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
+ MachineInstr &MI, MachineOperand &FlagUse,
+ CondRegArray &CondRegs);
+ void rewriteCMov(MachineBasicBlock &TestMBB,
+ MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
+ MachineInstr &CMovI, MachineOperand &FlagUse,
+ CondRegArray &CondRegs);
+ void rewriteCondJmp(MachineBasicBlock &TestMBB,
+ MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
+ MachineInstr &JmpI, CondRegArray &CondRegs);
+ void rewriteCopy(MachineInstr &MI, MachineOperand &FlagUse,
+ MachineInstr &CopyDefI);
+ void rewriteSetCarryExtended(MachineBasicBlock &TestMBB,
+ MachineBasicBlock::iterator TestPos,
+ DebugLoc TestLoc, MachineInstr &SetBI,
+ MachineOperand &FlagUse, CondRegArray &CondRegs);
+ void rewriteSetCC(MachineBasicBlock &TestMBB,
+ MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
+ MachineInstr &SetCCI, MachineOperand &FlagUse,
+ CondRegArray &CondRegs);
+};
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(X86FlagsCopyLoweringPass, DEBUG_TYPE,
+ "X86 EFLAGS copy lowering", false, false)
+INITIALIZE_PASS_END(X86FlagsCopyLoweringPass, DEBUG_TYPE,
+ "X86 EFLAGS copy lowering", false, false)
+
+FunctionPass *llvm::createX86FlagsCopyLoweringPass() {
+ return new X86FlagsCopyLoweringPass();
+}
+
+char X86FlagsCopyLoweringPass::ID = 0;
+
+void X86FlagsCopyLoweringPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+namespace {
+/// An enumeration of the arithmetic instruction mnemonics which have
+/// interesting flag semantics.
+///
+/// We can map instruction opcodes into these mnemonics to make it easy to
+/// dispatch with specific functionality.
+enum class FlagArithMnemonic {
+ ADC,
+ ADCX,
+ ADOX,
+ RCL,
+ RCR,
+ SBB,
+};
+} // namespace
+
+static FlagArithMnemonic getMnemonicFromOpcode(unsigned Opcode) {
+ switch (Opcode) {
+ default:
+ report_fatal_error("No support for lowering a copy into EFLAGS when used "
+ "by this instruction!");
+
+#define LLVM_EXPAND_INSTR_SIZES(MNEMONIC, SUFFIX) \
+ case X86::MNEMONIC##8##SUFFIX: \
+ case X86::MNEMONIC##16##SUFFIX: \
+ case X86::MNEMONIC##32##SUFFIX: \
+ case X86::MNEMONIC##64##SUFFIX:
+
+#define LLVM_EXPAND_ADC_SBB_INSTR(MNEMONIC) \
+ LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr) \
+ LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr_REV) \
+ LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rm) \
+ LLVM_EXPAND_INSTR_SIZES(MNEMONIC, mr) \
+ case X86::MNEMONIC##8ri: \
+ case X86::MNEMONIC##16ri8: \
+ case X86::MNEMONIC##32ri8: \
+ case X86::MNEMONIC##64ri8: \
+ case X86::MNEMONIC##16ri: \
+ case X86::MNEMONIC##32ri: \
+ case X86::MNEMONIC##64ri32: \
+ case X86::MNEMONIC##8mi: \
+ case X86::MNEMONIC##16mi8: \
+ case X86::MNEMONIC##32mi8: \
+ case X86::MNEMONIC##64mi8: \
+ case X86::MNEMONIC##16mi: \
+ case X86::MNEMONIC##32mi: \
+ case X86::MNEMONIC##64mi32: \
+ case X86::MNEMONIC##8i8: \
+ case X86::MNEMONIC##16i16: \
+ case X86::MNEMONIC##32i32: \
+ case X86::MNEMONIC##64i32:
+
+ LLVM_EXPAND_ADC_SBB_INSTR(ADC)
+ return FlagArithMnemonic::ADC;
+
+ LLVM_EXPAND_ADC_SBB_INSTR(SBB)
+ return FlagArithMnemonic::SBB;
+
+#undef LLVM_EXPAND_ADC_SBB_INSTR
+
+ LLVM_EXPAND_INSTR_SIZES(RCL, rCL)
+ LLVM_EXPAND_INSTR_SIZES(RCL, r1)
+ LLVM_EXPAND_INSTR_SIZES(RCL, ri)
+ return FlagArithMnemonic::RCL;
+
+ LLVM_EXPAND_INSTR_SIZES(RCR, rCL)
+ LLVM_EXPAND_INSTR_SIZES(RCR, r1)
+ LLVM_EXPAND_INSTR_SIZES(RCR, ri)
+ return FlagArithMnemonic::RCR;
+
+#undef LLVM_EXPAND_INSTR_SIZES
+
+ case X86::ADCX32rr:
+ case X86::ADCX64rr:
+ case X86::ADCX32rm:
+ case X86::ADCX64rm:
+ return FlagArithMnemonic::ADCX;
+
+ case X86::ADOX32rr:
+ case X86::ADOX64rr:
+ case X86::ADOX32rm:
+ case X86::ADOX64rm:
+ return FlagArithMnemonic::ADOX;
+ }
+}
+
+static MachineBasicBlock &splitBlock(MachineBasicBlock &MBB,
+ MachineInstr &SplitI,
+ const X86InstrInfo &TII) {
+ MachineFunction &MF = *MBB.getParent();
+
+ assert(SplitI.getParent() == &MBB &&
+ "Split instruction must be in the split block!");
+ assert(SplitI.isBranch() &&
+ "Only designed to split a tail of branch instructions!");
+ assert(X86::getCondFromBranchOpc(SplitI.getOpcode()) != X86::COND_INVALID &&
+ "Must split on an actual jCC instruction!");
+
+ // Dig out the previous instruction to the split point.
+ MachineInstr &PrevI = *std::prev(SplitI.getIterator());
+ assert(PrevI.isBranch() && "Must split after a branch!");
+ assert(X86::getCondFromBranchOpc(PrevI.getOpcode()) != X86::COND_INVALID &&
+ "Must split after an actual jCC instruction!");
+ assert(!std::prev(PrevI.getIterator())->isTerminator() &&
+ "Must only have this one terminator prior to the split!");
+
+ // Grab the one successor edge that will stay in `MBB`.
+ MachineBasicBlock &UnsplitSucc = *PrevI.getOperand(0).getMBB();
+
+ // Analyze the original block to see if we are actually splitting an edge
+ // into two edges. This can happen when we have multiple conditional jumps to
+ // the same successor.
+ bool IsEdgeSplit =
+ std::any_of(SplitI.getIterator(), MBB.instr_end(),
+ [&](MachineInstr &MI) {
+ assert(MI.isTerminator() &&
+ "Should only have spliced terminators!");
+ return llvm::any_of(
+ MI.operands(), [&](MachineOperand &MOp) {
+ return MOp.isMBB() && MOp.getMBB() == &UnsplitSucc;
+ });
+ }) ||
+ MBB.getFallThrough() == &UnsplitSucc;
+
+ MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock();
+
+ // Insert the new block immediately after the current one. Any existing
+ // fallthrough will be sunk into this new block anyways.
+ MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB);
+
+ // Splice the tail of instructions into the new block.
+ NewMBB.splice(NewMBB.end(), &MBB, SplitI.getIterator(), MBB.end());
+
+ // Copy the necessary successors (and their probability info) into the new
+ // block.
+ for (auto SI = MBB.succ_begin(), SE = MBB.succ_end(); SI != SE; ++SI)
+ if (IsEdgeSplit || *SI != &UnsplitSucc)
+ NewMBB.copySuccessor(&MBB, SI);
+ // Normalize the probabilities if we didn't end up splitting the edge.
+ if (!IsEdgeSplit)
+ NewMBB.normalizeSuccProbs();
+
+ // Now replace all of the moved successors in the original block with the new
+ // block. This will merge their probabilities.
+ for (MachineBasicBlock *Succ : NewMBB.successors())
+ if (Succ != &UnsplitSucc)
+ MBB.replaceSuccessor(Succ, &NewMBB);
+
+ // We should always end up replacing at least one successor.
+ assert(MBB.isSuccessor(&NewMBB) &&
+ "Failed to make the new block a successor!");
+
+ // Now update all the PHIs.
+ for (MachineBasicBlock *Succ : NewMBB.successors()) {
+ for (MachineInstr &MI : *Succ) {
+ if (!MI.isPHI())
+ break;
+
+ for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
+ OpIdx += 2) {
+ MachineOperand &OpV = MI.getOperand(OpIdx);
+ MachineOperand &OpMBB = MI.getOperand(OpIdx + 1);
+ assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!");
+ if (OpMBB.getMBB() != &MBB)
+ continue;
+
+ // Replace the operand for unsplit successors
+ if (!IsEdgeSplit || Succ != &UnsplitSucc) {
+ OpMBB.setMBB(&NewMBB);
+
+ // We have to continue scanning as there may be multiple entries in
+ // the PHI.
+ continue;
+ }
+
+ // When we have split the edge append a new successor.
+ MI.addOperand(MF, OpV);
+ MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB));
+ break;
+ }
+ }
+ }
+
+ return NewMBB;
+}
+
+bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
+ << " **********\n");
+
+ auto &Subtarget = MF.getSubtarget<X86Subtarget>();
+ MRI = &MF.getRegInfo();
+ TII = Subtarget.getInstrInfo();
+ TRI = Subtarget.getRegisterInfo();
+ MDT = &getAnalysis<MachineDominatorTree>();
+ PromoteRC = &X86::GR8RegClass;
+
+ if (MF.begin() == MF.end())
+ // Nothing to do for a degenerate empty function...
+ return false;
+
+ // Collect the copies in RPO so that when there are chains where a copy is in
+ // turn copied again we visit the first one first. This ensures we can find
+ // viable locations for testing the original EFLAGS that dominate all the
+ // uses across complex CFGs.
+ SmallVector<MachineInstr *, 4> Copies;
+ ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+ for (MachineBasicBlock *MBB : RPOT)
+ for (MachineInstr &MI : *MBB)
+ if (MI.getOpcode() == TargetOpcode::COPY &&
+ MI.getOperand(0).getReg() == X86::EFLAGS)
+ Copies.push_back(&MI);
+
+ for (MachineInstr *CopyI : Copies) {
+ MachineBasicBlock &MBB = *CopyI->getParent();
+
+ MachineOperand &VOp = CopyI->getOperand(1);
+ assert(VOp.isReg() &&
+ "The input to the copy for EFLAGS should always be a register!");
+ MachineInstr &CopyDefI = *MRI->getVRegDef(VOp.getReg());
+ if (CopyDefI.getOpcode() != TargetOpcode::COPY) {
+ // FIXME: The most likely candidates here are PHI nodes. We could in theory
+ // handle PHI nodes, but it gets really, really hard. Insanely hard. Hard
+ // enough that it is probably better to change every other part of LLVM
+ // to avoid creating them. The issue is that once we have PHIs we won't
+ // know which original EFLAGS value we need to capture with our setCCs
+ // below. The end result will be computing a complete set of setCCs that
+ // we *might* want, computing them in every place where we copy *out* of
+ // EFLAGS and then doing SSA formation on all of them to insert necessary
+ // PHI nodes and consume those here. Then hoping that somehow we DCE the
+ // unnecessary ones. This DCE seems very unlikely to be successful and so
+ // we will almost certainly end up with a glut of dead setCC
+ // instructions. Until we have a motivating test case and fail to avoid
+ // it by changing other parts of LLVM's lowering, we refuse to handle
+ // this complex case here.
+ LLVM_DEBUG(
+ dbgs() << "ERROR: Encountered unexpected def of an eflags copy: ";
+ CopyDefI.dump());
+ report_fatal_error(
+ "Cannot lower EFLAGS copy unless it is defined in turn by a copy!");
+ }
+
+ auto Cleanup = make_scope_exit([&] {
+ // All uses of the EFLAGS copy are now rewritten: erase the copy into
+ // EFLAGS and, if it is now dead, the copy it was defined from.
+ CopyI->eraseFromParent();
+ if (MRI->use_empty(CopyDefI.getOperand(0).getReg()))
+ CopyDefI.eraseFromParent();
+ ++NumCopiesEliminated;
+ });
+
+ MachineOperand &DOp = CopyI->getOperand(0);
+ assert(DOp.isDef() && "Expected register def!");
+ assert(DOp.getReg() == X86::EFLAGS && "Unexpected copy def register!");
+ if (DOp.isDead())
+ continue;
+
+ MachineBasicBlock *TestMBB = CopyDefI.getParent();
+ auto TestPos = CopyDefI.getIterator();
+ DebugLoc TestLoc = CopyDefI.getDebugLoc();
+
+ LLVM_DEBUG(dbgs() << "Rewriting copy: "; CopyI->dump());
+
+ // Walk up across live-in EFLAGS to find where they were actually def'ed.
+ //
+ // This copy's def may just be part of a region of blocks covered by
+ // a single def of EFLAGS and we want to find the top of that region where
+ // possible.
+ //
+ // This is essentially a search for a *candidate* reaching definition
+ // location. We don't need to ever find the actual reaching definition here,
+ // but we want to walk up the dominator tree to find the highest point which
+ // would be viable for such a definition.
+ auto HasEFLAGSClobber = [&](MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End) {
+ // Scan backwards as we expect these to be relatively short and often find
+ // a clobber near the end.
+ return llvm::any_of(
+ llvm::reverse(llvm::make_range(Begin, End)), [&](MachineInstr &MI) {
+ // Flag any instruction (other than the copy we are
+ // currently rewriting) that defs EFLAGS.
+ return &MI != CopyI && MI.findRegisterDefOperand(X86::EFLAGS);
+ });
+ };
+ auto HasEFLAGSClobberPath = [&](MachineBasicBlock *BeginMBB,
+ MachineBasicBlock *EndMBB) {
+ assert(MDT->dominates(BeginMBB, EndMBB) &&
+ "Only support paths down the dominator tree!");
+ SmallPtrSet<MachineBasicBlock *, 4> Visited;
+ SmallVector<MachineBasicBlock *, 4> Worklist;
+ // We terminate at the beginning. No need to scan it.
+ Visited.insert(BeginMBB);
+ Worklist.push_back(EndMBB);
+ do {
+ auto *MBB = Worklist.pop_back_val();
+ for (auto *PredMBB : MBB->predecessors()) {
+ if (!Visited.insert(PredMBB).second)
+ continue;
+ if (HasEFLAGSClobber(PredMBB->begin(), PredMBB->end()))
+ return true;
+ // Enqueue this block to walk its predecessors.
+ Worklist.push_back(PredMBB);
+ }
+ } while (!Worklist.empty());
+ // No clobber found along a path from the begin to end.
+ return false;
+ };
+ while (TestMBB->isLiveIn(X86::EFLAGS) && !TestMBB->pred_empty() &&
+ !HasEFLAGSClobber(TestMBB->begin(), TestPos)) {
+ // Find the nearest common dominator of the predecessors, as
+ // that will be the best candidate to hoist into.
+ MachineBasicBlock *HoistMBB =
+ std::accumulate(std::next(TestMBB->pred_begin()), TestMBB->pred_end(),
+ *TestMBB->pred_begin(),
+ [&](MachineBasicBlock *LHS, MachineBasicBlock *RHS) {
+ return MDT->findNearestCommonDominator(LHS, RHS);
+ });
+
+ // Now we need to scan all predecessors that may be reached along paths to
+ // the hoist block. A clobber anywhere in any of these blocks prevents the
+ // hoist. Note that this even handles loops because we require *no* clobbers.
+ if (HasEFLAGSClobberPath(HoistMBB, TestMBB))
+ break;
+
+ // We also need the terminators to not sneakily clobber flags.
+ if (HasEFLAGSClobber(HoistMBB->getFirstTerminator()->getIterator(),
+ HoistMBB->instr_end()))
+ break;
+
+ // We found a viable location, hoist our test position to it.
+ TestMBB = HoistMBB;
+ TestPos = TestMBB->getFirstTerminator()->getIterator();
+ // Clear the debug location as it would just be confusing after hoisting.
+ TestLoc = DebugLoc();
+ }
+ LLVM_DEBUG({
+ auto DefIt = llvm::find_if(
+ llvm::reverse(llvm::make_range(TestMBB->instr_begin(), TestPos)),
+ [&](MachineInstr &MI) {
+ return MI.findRegisterDefOperand(X86::EFLAGS);
+ });
+ if (DefIt.base() != TestMBB->instr_begin()) {
+ dbgs() << " Using EFLAGS defined by: ";
+ DefIt->dump();
+ } else {
+ dbgs() << " Using live-in flags for BB:\n";
+ TestMBB->dump();
+ }
+ });
+
+ // While rewriting uses, we buffer jumps and rewrite them in a second pass
+ // because doing so will perturb the CFG that we are walking to find the
+ // uses in the first place.
+ SmallVector<MachineInstr *, 4> JmpIs;
+
+ // Gather the condition flags that have already been preserved in
+ // registers. We do this from scratch each time as we expect there to be
+ // very few of them and we expect to not revisit the same copy definition
+ // many times. If either of those change sufficiently we could build a map
+ // of these up front instead.
+ CondRegArray CondRegs = collectCondsInRegs(*TestMBB, TestPos);
+
+ // Collect the basic blocks we need to scan. Typically this will just be
+ // a single basic block but we may have to scan multiple blocks if the
+ // EFLAGS copy lives into successors.
+ SmallVector<MachineBasicBlock *, 2> Blocks;
+ SmallPtrSet<MachineBasicBlock *, 2> VisitedBlocks;
+ Blocks.push_back(&MBB);
+
+ do {
+ MachineBasicBlock &UseMBB = *Blocks.pop_back_val();
+
+ // Track if/when we find a kill of the flags in this block.
+ bool FlagsKilled = false;
+
+ // In most cases, we walk from the beginning to the end of the block. But
+ // when the block is the same block as the copy is from, we will visit it
+ // twice. The first time we start from the copy and go to the end. The
+ // second time we start from the beginning and go to the copy. This lets
+ // us handle copies inside of cycles.
+ // FIXME: This loop is *super* confusing. This is at least in part
+ // a symptom of all of this routine needing to be refactored into
+ // documentable components. Once done, there may be a better way to write
+ // this loop.
+ for (auto MII = (&UseMBB == &MBB && !VisitedBlocks.count(&UseMBB))
+ ? std::next(CopyI->getIterator())
+ : UseMBB.instr_begin(),
+ MIE = UseMBB.instr_end();
+ MII != MIE;) {
+ MachineInstr &MI = *MII++;
+ // If we are in the original copy block and encounter either the copy
+ // def or the copy itself, break so that we don't re-process any part of
+ // the block or process the instructions in the range that was copied
+ // over.
+ if (&MI == CopyI || &MI == &CopyDefI) {
+ assert(&UseMBB == &MBB && VisitedBlocks.count(&MBB) &&
+ "Should only encounter these on the second pass over the "
+ "original block.");
+ break;
+ }
+
+ MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS);
+ if (!FlagUse) {
+ if (MI.findRegisterDefOperand(X86::EFLAGS)) {
+ // If EFLAGS are defined, it's as-if they were killed. We can stop
+ // scanning here.
+ //
+ // NB!!! Many instructions only modify some flags. LLVM currently
+ // models this as clobbering all flags, but if that ever changes
+ // this will need to be carefully updated to handle that more
+ // complex logic.
+ FlagsKilled = true;
+ break;
+ }
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << " Rewriting use: "; MI.dump());
+
+ // Check the kill flag before we rewrite as that may change it.
+ if (FlagUse->isKill())
+ FlagsKilled = true;
+
+ // Once we encounter a branch, the rest of the instructions must also be
+ // branches. We can't rewrite in place here, so we handle them below.
+ //
+ // Note that we don't have to handle tail calls here, even conditional
+ // tail calls, as those are not introduced into the X86 MI until post-RA
+ // branch folding or block placement. As a consequence, we get to deal
+ // with the simpler formulation of conditional branches followed by tail
+ // calls.
+ if (X86::getCondFromBranchOpc(MI.getOpcode()) != X86::COND_INVALID) {
+ auto JmpIt = MI.getIterator();
+ do {
+ JmpIs.push_back(&*JmpIt);
+ ++JmpIt;
+ } while (JmpIt != UseMBB.instr_end() &&
+ X86::getCondFromBranchOpc(JmpIt->getOpcode()) !=
+ X86::COND_INVALID);
+ break;
+ }
+
+ // Otherwise we can just rewrite in-place.
+ if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) {
+ rewriteCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
+ } else if (X86::getCondFromSETOpc(MI.getOpcode()) !=
+ X86::COND_INVALID) {
+ rewriteSetCC(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
+ } else if (MI.getOpcode() == TargetOpcode::COPY) {
+ rewriteCopy(MI, *FlagUse, CopyDefI);
+ } else {
+ // We assume all other instructions that use flags also def them.
+ assert(MI.findRegisterDefOperand(X86::EFLAGS) &&
+ "Expected a def of EFLAGS for this instruction!");
+
+ // NB!!! Several arithmetic instructions only *partially* update
+ // flags. Theoretically, we could generate MI code sequences that
+ // would rely on this fact and observe different flags independently.
+ // But currently LLVM models all of these instructions as clobbering
+ // all the flags in an undef way. We rely on that to simplify the
+ // logic.
+ FlagsKilled = true;
+
+ switch (MI.getOpcode()) {
+ case X86::SETB_C8r:
+ case X86::SETB_C16r:
+ case X86::SETB_C32r:
+ case X86::SETB_C64r:
+ // Use custom lowering for arithmetic that is merely extending the
+ // carry flag. We model this as the SETB_C* pseudo instructions.
+ rewriteSetCarryExtended(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
+ CondRegs);
+ break;
+
+ default:
+ // Generically handle remaining uses as arithmetic instructions.
+ rewriteArithmetic(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
+ CondRegs);
+ break;
+ }
+ break;
+ }
+
+ // If this was the last use of the flags, we're done.
+ if (FlagsKilled)
+ break;
+ }
+
+ // If the flags were killed, we're done with this block.
+ if (FlagsKilled)
+ continue;
+
+ // Otherwise we need to scan successors for ones where the flags live-in
+ // and queue those up for processing.
+ for (MachineBasicBlock *SuccMBB : UseMBB.successors())
+ if (SuccMBB->isLiveIn(X86::EFLAGS) &&
+ VisitedBlocks.insert(SuccMBB).second) {
+ // We currently don't do any PHI insertion and so we require that the
+ // test basic block dominates all of the use basic blocks. Further, we
+ // can't have a cycle from the test block back to itself as that would
+ // create a cycle requiring a PHI to break it.
+ //
+ // We could in theory do PHI insertion here if it becomes useful by
+ // just taking undef values in along every edge that we don't trace
+ // this EFLAGS copy along. This isn't as bad as fully general PHI
+ // insertion, but still seems like a great deal of complexity.
+ //
+ // Because it is theoretically possible that some earlier MI pass or
+ // other lowering transformation could induce this to happen, we do
+ // a hard check even in non-debug builds here.
+ if (SuccMBB == TestMBB || !MDT->dominates(TestMBB, SuccMBB)) {
+ LLVM_DEBUG({
+ dbgs()
+ << "ERROR: Encountered use that is not dominated by our test "
+ "basic block! Rewriting this would require inserting PHI "
+ "nodes to track the flag state across the CFG.\n\nTest "
+ "block:\n";
+ TestMBB->dump();
+ dbgs() << "Use block:\n";
+ SuccMBB->dump();
+ });
+ report_fatal_error(
+ "Cannot lower EFLAGS copy when original copy def "
+ "does not dominate all uses.");
+ }
+
+ Blocks.push_back(SuccMBB);
+ }
+ } while (!Blocks.empty());
+
+ // Now rewrite the jumps that use the flags. These we handle specially
+ // because if there are multiple jumps in a single basic block we'll have
+ // to do surgery on the CFG.
+ MachineBasicBlock *LastJmpMBB = nullptr;
+ for (MachineInstr *JmpI : JmpIs) {
+ // Past the first jump within a basic block we need to split the blocks
+ // apart.
+ if (JmpI->getParent() == LastJmpMBB)
+ splitBlock(*JmpI->getParent(), *JmpI, *TII);
+ else
+ LastJmpMBB = JmpI->getParent();
+
+ rewriteCondJmp(*TestMBB, TestPos, TestLoc, *JmpI, CondRegs);
+ }
+
+ // FIXME: Mark the last use of EFLAGS before the copy's def as a kill if
+ // the copy's def operand is itself a kill.
+ }
+
+#ifndef NDEBUG
+ for (MachineBasicBlock &MBB : MF)
+ for (MachineInstr &MI : MBB)
+ if (MI.getOpcode() == TargetOpcode::COPY &&
+ (MI.getOperand(0).getReg() == X86::EFLAGS ||
+ MI.getOperand(1).getReg() == X86::EFLAGS)) {
+ LLVM_DEBUG(dbgs() << "ERROR: Found a COPY involving EFLAGS: ";
+ MI.dump());
+ llvm_unreachable("Unlowered EFLAGS copy!");
+ }
+#endif
+
+ return true;
+}
+
+/// Collect any conditions that have already been set in registers so that we
+/// can re-use them rather than adding duplicates.
+CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator TestPos) {
+ CondRegArray CondRegs = {};
+
+ // Scan backwards across the range of instructions with live EFLAGS.
+ for (MachineInstr &MI :
+ llvm::reverse(llvm::make_range(MBB.begin(), TestPos))) {
+ X86::CondCode Cond = X86::getCondFromSETOpc(MI.getOpcode());
+ if (Cond != X86::COND_INVALID && MI.getOperand(0).isReg() &&
+ TRI->isVirtualRegister(MI.getOperand(0).getReg()))
+ CondRegs[Cond] = MI.getOperand(0).getReg();
+
+ // Stop scanning when we see the first definition of the EFLAGS as prior to
+ // this we would potentially capture the wrong flag state.
+ if (MI.findRegisterDefOperand(X86::EFLAGS))
+ break;
+ }
+ return CondRegs;
+}
+
+unsigned X86FlagsCopyLoweringPass::promoteCondToReg(
+ MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
+ DebugLoc TestLoc, X86::CondCode Cond) {
+ unsigned Reg = MRI->createVirtualRegister(PromoteRC);
+ auto SetI = BuildMI(TestMBB, TestPos, TestLoc,
+ TII->get(X86::getSETFromCond(Cond)), Reg);
+ (void)SetI;
+ LLVM_DEBUG(dbgs() << " save cond: "; SetI->dump());
+ ++NumSetCCsInserted;
+ return Reg;
+}
+
+std::pair<unsigned, bool> X86FlagsCopyLoweringPass::getCondOrInverseInReg(
+ MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
+ DebugLoc TestLoc, X86::CondCode Cond, CondRegArray &CondRegs) {
+ unsigned &CondReg = CondRegs[Cond];
+ unsigned &InvCondReg = CondRegs[X86::GetOppositeBranchCondition(Cond)];
+ if (!CondReg && !InvCondReg)
+ CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);
+
+ if (CondReg)
+ return {CondReg, false};
+ else
+ return {InvCondReg, true};
+}
+
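+/// Insert a TEST of the given 8-bit condition register against itself. This
+/// sets ZF exactly when the saved condition value is zero, so users rewritten
+/// to check E/NE recover the original condition.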
+void X86FlagsCopyLoweringPass::insertTest(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Pos,
+ DebugLoc Loc, unsigned Reg) {
+ auto TestI =
+ BuildMI(MBB, Pos, Loc, TII->get(X86::TEST8rr)).addReg(Reg).addReg(Reg);
+ (void)TestI;
+ LLVM_DEBUG(dbgs() << " test cond: "; TestI->dump());
+ ++NumTestsInserted;
+}
+
+void X86FlagsCopyLoweringPass::rewriteArithmetic(
+ MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
+ DebugLoc TestLoc, MachineInstr &MI, MachineOperand &FlagUse,
+ CondRegArray &CondRegs) {
+ // Arithmetic is either reading CF or OF. Figure out which condition we need
+ // to preserve in a register.
+ X86::CondCode Cond;
+
+ // The addend to use to reset CF or OF when added to the flag value.
+ int Addend;
+
+ switch (getMnemonicFromOpcode(MI.getOpcode())) {
+ case FlagArithMnemonic::ADC:
+ case FlagArithMnemonic::ADCX:
+ case FlagArithMnemonic::RCL:
+ case FlagArithMnemonic::RCR:
+ case FlagArithMnemonic::SBB:
+ Cond = X86::COND_B; // CF == 1
+ // Set up an addend such that adding one to it produces a carry, since no
+ // higher bit is available in the 8-bit value.
+ Addend = 255;
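+ // For example, with the condition register holding 0 or 1: 1 + 255 = 0x100
+ // carries out of the 8-bit add, while 0 + 255 = 0xFF does not.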
+ break;
+
+ case FlagArithMnemonic::ADOX:
+ Cond = X86::COND_O; // OF == 1
+ // Set up an addend such that adding one to it flips the value from positive
+ // to negative and thus overflows in the signed domain.
+ Addend = 127;
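+ // For example: 1 + 127 = 128, which wraps to -128 in signed 8 bits and sets
+ // OF, while 0 + 127 = 127 does not overflow.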
+ break;
+ }
+
+ // Now get a register that contains the value of the flag input to the
+ // arithmetic. We require exactly this flag to simplify the arithmetic
+ // required to materialize it back into the flag.
+ unsigned &CondReg = CondRegs[Cond];
+ if (!CondReg)
+ CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);
+
+ MachineBasicBlock &MBB = *MI.getParent();
+
+ // Insert an instruction that will set the flag back to the desired value.
+ unsigned TmpReg = MRI->createVirtualRegister(PromoteRC);
+ auto AddI =
+ BuildMI(MBB, MI.getIterator(), MI.getDebugLoc(), TII->get(X86::ADD8ri))
+ .addDef(TmpReg, RegState::Dead)
+ .addReg(CondReg)
+ .addImm(Addend);
+ (void)AddI;
+ LLVM_DEBUG(dbgs() << " add cond: "; AddI->dump());
+ ++NumAddsInserted;
+ FlagUse.setIsKill(true);
+}
+
+void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB,
+ MachineBasicBlock::iterator TestPos,
+ DebugLoc TestLoc,
+ MachineInstr &CMovI,
+ MachineOperand &FlagUse,
+ CondRegArray &CondRegs) {
+ // First get the register containing this specific condition.
+ X86::CondCode Cond = X86::getCondFromCMovOpc(CMovI.getOpcode());
+ unsigned CondReg;
+ bool Inverted;
+ std::tie(CondReg, Inverted) =
+ getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs);
+
+ MachineBasicBlock &MBB = *CMovI.getParent();
+
+ // Insert a direct test of the saved register.
+ insertTest(MBB, CMovI.getIterator(), CMovI.getDebugLoc(), CondReg);
+
+ // Rewrite the CMov to use the !ZF flag from the test (but match register
+ // size and memory operand), and then kill its use of the flags afterward.
+ auto &CMovRC = *MRI->getRegClass(CMovI.getOperand(0).getReg());
+ CMovI.setDesc(TII->get(X86::getCMovFromCond(
+ Inverted ? X86::COND_E : X86::COND_NE, TRI->getRegSizeInBits(CMovRC) / 8,
+ !CMovI.memoperands_empty())));
+ FlagUse.setIsKill(true);
+ LLVM_DEBUG(dbgs() << " fixed cmov: "; CMovI.dump());
+}
+
+void X86FlagsCopyLoweringPass::rewriteCondJmp(
+ MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
+ DebugLoc TestLoc, MachineInstr &JmpI, CondRegArray &CondRegs) {
+ // First get the register containing this specific condition.
+ X86::CondCode Cond = X86::getCondFromBranchOpc(JmpI.getOpcode());
+ unsigned CondReg;
+ bool Inverted;
+ std::tie(CondReg, Inverted) =
+ getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs);
+
+ MachineBasicBlock &JmpMBB = *JmpI.getParent();
+
+ // Insert a direct test of the saved register.
+ insertTest(JmpMBB, JmpI.getIterator(), JmpI.getDebugLoc(), CondReg);
+
+ // Rewrite the jump to use the !ZF flag from the test, and kill its use of
+ // flags afterward.
+ JmpI.setDesc(TII->get(
+ X86::GetCondBranchFromCond(Inverted ? X86::COND_E : X86::COND_NE)));
+ const int ImplicitEFLAGSOpIdx = 1;
+ JmpI.getOperand(ImplicitEFLAGSOpIdx).setIsKill(true);
+ LLVM_DEBUG(dbgs() << " fixed jCC: "; JmpI.dump());
+}
+
+void X86FlagsCopyLoweringPass::rewriteCopy(MachineInstr &MI,
+ MachineOperand &FlagUse,
+ MachineInstr &CopyDefI) {
+ // Just replace this copy with the original copy def.
+ MRI->replaceRegWith(MI.getOperand(0).getReg(),
+ CopyDefI.getOperand(0).getReg());
+ MI.eraseFromParent();
+}
+
+void X86FlagsCopyLoweringPass::rewriteSetCarryExtended(
+ MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
+ DebugLoc TestLoc, MachineInstr &SetBI, MachineOperand &FlagUse,
+ CondRegArray &CondRegs) {
+ // This routine is only used to handle pseudos for setting a register to zero
+ // or all ones based on CF. This is essentially SETB sign-extended from its
+ // 1-bit form and modeled with the SETB_C* pseudos. They require special
+ // handling as they aren't normal SETcc instructions and are lowered to an
+ // EFLAGS-clobbering operation (typically SBB). One simplifying aspect is that
+ // they are only provided in reg-defining forms. A complicating factor is that
+ // they can define many different register widths.
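+ // For example, SETB_C32r yields 0x00000000 when CF is clear and 0xFFFFFFFF
+ // when CF is set.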
+ assert(SetBI.getOperand(0).isReg() &&
+ "Cannot have a non-register defined operand to this variant of SETB!");
+
+ // Little helper to do the common final step of replacing the register def'ed
+ // by this SETB instruction with a new register and removing the SETB
+ // instruction.
+ auto RewriteToReg = [&](unsigned Reg) {
+ MRI->replaceRegWith(SetBI.getOperand(0).getReg(), Reg);
+ SetBI.eraseFromParent();
+ };
+
+ // Grab the register class used for this particular instruction.
+ auto &SetBRC = *MRI->getRegClass(SetBI.getOperand(0).getReg());
+
+ MachineBasicBlock &MBB = *SetBI.getParent();
+ auto SetPos = SetBI.getIterator();
+ auto SetLoc = SetBI.getDebugLoc();
+
+ auto AdjustReg = [&](unsigned Reg) {
+ auto &OrigRC = *MRI->getRegClass(Reg);
+ if (&OrigRC == &SetBRC)
+ return Reg;
+
+ unsigned NewReg;
+
+ int OrigRegSize = TRI->getRegSizeInBits(OrigRC) / 8;
+ int TargetRegSize = TRI->getRegSizeInBits(SetBRC) / 8;
+ assert(OrigRegSize <= 8 && "No GPRs larger than 64-bits!");
+ assert(TargetRegSize <= 8 && "No GPRs larger than 64-bits!");
+ int SubRegIdx[] = {X86::NoSubRegister, X86::sub_8bit, X86::sub_16bit,
+ X86::NoSubRegister, X86::sub_32bit};
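+ // SubRegIdx is indexed by register size in bytes: 1 -> sub_8bit,
+ // 2 -> sub_16bit, 4 -> sub_32bit.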
+
+ // If the original size is smaller than the target *and* is smaller than 4
+ // bytes, we need to explicitly zero-extend it. We always extend to 4 bytes
+ // to maximize the chance of being able to CSE that operation and to avoid
+ // the partial dependency stalls that extending only to 2 bytes would incur.
+ if (OrigRegSize < TargetRegSize && OrigRegSize < 4) {
+ NewReg = MRI->createVirtualRegister(&X86::GR32RegClass);
+ BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOVZX32rr8), NewReg)
+ .addReg(Reg);
+ if (&SetBRC == &X86::GR32RegClass)
+ return NewReg;
+ Reg = NewReg;
+ OrigRegSize = 4;
+ }
+
+ NewReg = MRI->createVirtualRegister(&SetBRC);
+ if (OrigRegSize < TargetRegSize) {
+ BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::SUBREG_TO_REG),
+ NewReg)
+ .addImm(0)
+ .addReg(Reg)
+ .addImm(SubRegIdx[OrigRegSize]);
+ } else if (OrigRegSize > TargetRegSize) {
+ BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::EXTRACT_SUBREG),
+ NewReg)
+ .addReg(Reg)
+ .addImm(SubRegIdx[TargetRegSize]);
+ } else {
+ BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::COPY), NewReg)
+ .addReg(Reg);
+ }
+ return NewReg;
+ };
+
+ unsigned &CondReg = CondRegs[X86::COND_B];
+ if (!CondReg)
+ CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, X86::COND_B);
+
+ // Adjust the condition to have the desired register width by zero-extending
+ // as needed.
+ // FIXME: We should use a better API to avoid the local reference and the use
+ // of a separate variable here.
+ unsigned ExtCondReg = AdjustReg(CondReg);
+
+ // Now we need to turn this into a bitmask. We do this by subtracting it from
+ // zero.
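+ // Subtracting 1 from zero yields all ones (and subtracting 0 leaves zero),
+ // which is exactly the zero/all-ones result the SETB_C* pseudos produce.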
+ unsigned ZeroReg = MRI->createVirtualRegister(&X86::GR32RegClass);
+ BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOV32r0), ZeroReg);
+ ZeroReg = AdjustReg(ZeroReg);
+
+ unsigned Sub;
+ switch (SetBI.getOpcode()) {
+ case X86::SETB_C8r:
+ Sub = X86::SUB8rr;
+ break;
+
+ case X86::SETB_C16r:
+ Sub = X86::SUB16rr;
+ break;
+
+ case X86::SETB_C32r:
+ Sub = X86::SUB32rr;
+ break;
+
+ case X86::SETB_C64r:
+ Sub = X86::SUB64rr;
+ break;
+
+ default:
+ llvm_unreachable("Invalid SETB_C* opcode!");
+ }
+ unsigned ResultReg = MRI->createVirtualRegister(&SetBRC);
+ BuildMI(MBB, SetPos, SetLoc, TII->get(Sub), ResultReg)
+ .addReg(ZeroReg)
+ .addReg(ExtCondReg);
+ return RewriteToReg(ResultReg);
+}
+
+void X86FlagsCopyLoweringPass::rewriteSetCC(MachineBasicBlock &TestMBB,
+ MachineBasicBlock::iterator TestPos,
+ DebugLoc TestLoc,
+ MachineInstr &SetCCI,
+ MachineOperand &FlagUse,
+ CondRegArray &CondRegs) {
+ X86::CondCode Cond = X86::getCondFromSETOpc(SetCCI.getOpcode());
+ // Note that we can't usefully rewrite this to the inverse without complex
+ // analysis of the users of the setCC. Largely we rely on any duplicates that
+ // could have been avoided having already been avoided by this point.
+ unsigned &CondReg = CondRegs[Cond];
+ if (!CondReg)
+ CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);
+
+ // Rewriting a register def is trivial: we just replace the register and
+ // remove the setcc.
+ if (!SetCCI.mayStore()) {
+ assert(SetCCI.getOperand(0).isReg() &&
+ "Cannot have a non-register defined operand to SETcc!");
+ MRI->replaceRegWith(SetCCI.getOperand(0).getReg(), CondReg);
+ SetCCI.eraseFromParent();
+ return;
+ }
+
+ // Otherwise, we need to emit a store.
+ auto MIB = BuildMI(*SetCCI.getParent(), SetCCI.getIterator(),
+ SetCCI.getDebugLoc(), TII->get(X86::MOV8mr));
+ // Copy the address operands.
+ for (int i = 0; i < X86::AddrNumOperands; ++i)
+ MIB.add(SetCCI.getOperand(i));
+
+ MIB.addReg(CondReg);
+
+ MIB->setMemRefs(SetCCI.memoperands_begin(), SetCCI.memoperands_end());
+
+ SetCCI.eraseFromParent();
+ return;
+}
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index 9a72e7114be0..ae748901164a 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -39,6 +39,7 @@
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -434,7 +435,7 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
PrevMI = &*std::prev(I);
++NumFP; // Keep track of # of pseudo instrs
- DEBUG(dbgs() << "\nFPInst:\t" << MI);
+ LLVM_DEBUG(dbgs() << "\nFPInst:\t" << MI);
// Get dead variables list now because the MI pointer may be deleted as part
// of processing!
@@ -464,13 +465,13 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
// is in the clobber list and marked dead might not be live on the stack.
static_assert(X86::FP7 - X86::FP0 == 7, "sequential FP regnumbers");
if (Reg >= X86::FP0 && Reg <= X86::FP6 && isLive(Reg-X86::FP0)) {
- DEBUG(dbgs() << "Register FP#" << Reg-X86::FP0 << " is dead!\n");
+ LLVM_DEBUG(dbgs() << "Register FP#" << Reg - X86::FP0 << " is dead!\n");
freeStackSlotAfter(I, Reg-X86::FP0);
}
}
// Print out all of the instructions expanded to if -debug
- DEBUG({
+ LLVM_DEBUG({
MachineBasicBlock::iterator PrevI = PrevMI;
if (I == PrevI) {
dbgs() << "Just deleted pseudo instruction\n";
@@ -499,15 +500,15 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
/// setupBlockStack - Use the live bundles to set up our model of the stack
/// to match predecessors' live out stack.
void FPS::setupBlockStack() {
- DEBUG(dbgs() << "\nSetting up live-ins for " << printMBBReference(*MBB)
- << " derived from " << MBB->getName() << ".\n");
+ LLVM_DEBUG(dbgs() << "\nSetting up live-ins for " << printMBBReference(*MBB)
+ << " derived from " << MBB->getName() << ".\n");
StackTop = 0;
// Get the live-in bundle for MBB.
const LiveBundle &Bundle =
LiveBundles[Bundles->getBundle(MBB->getNumber(), false)];
if (!Bundle.Mask) {
- DEBUG(dbgs() << "Block has no FP live-ins.\n");
+ LLVM_DEBUG(dbgs() << "Block has no FP live-ins.\n");
return;
}
@@ -516,8 +517,8 @@ void FPS::setupBlockStack() {
// Push the fixed live-in registers.
for (unsigned i = Bundle.FixCount; i > 0; --i) {
- DEBUG(dbgs() << "Live-in st(" << (i-1) << "): %fp"
- << unsigned(Bundle.FixStack[i-1]) << '\n');
+ LLVM_DEBUG(dbgs() << "Live-in st(" << (i - 1) << "): %fp"
+ << unsigned(Bundle.FixStack[i - 1]) << '\n');
pushReg(Bundle.FixStack[i-1]);
}
@@ -526,7 +527,7 @@ void FPS::setupBlockStack() {
// to be revived at the end of a short block. It might save a few instrs.
unsigned Mask = calcLiveInMask(MBB, /*RemoveFPs=*/true);
adjustLiveRegs(Mask, MBB->begin());
- DEBUG(MBB->dump());
+ LLVM_DEBUG(MBB->dump());
}
/// finishBlockStack - Revive live-outs that are implicitly defined out of
@@ -538,8 +539,8 @@ void FPS::finishBlockStack() {
if (MBB->succ_empty())
return;
- DEBUG(dbgs() << "Setting up live-outs for " << printMBBReference(*MBB)
- << " derived from " << MBB->getName() << ".\n");
+ LLVM_DEBUG(dbgs() << "Setting up live-outs for " << printMBBReference(*MBB)
+ << " derived from " << MBB->getName() << ".\n");
// Get MBB's live-out bundle.
unsigned BundleIdx = Bundles->getBundle(MBB->getNumber(), true);
@@ -551,18 +552,18 @@ void FPS::finishBlockStack() {
adjustLiveRegs(Bundle.Mask, Term);
if (!Bundle.Mask) {
- DEBUG(dbgs() << "No live-outs.\n");
+ LLVM_DEBUG(dbgs() << "No live-outs.\n");
return;
}
// Has the stack order been fixed yet?
- DEBUG(dbgs() << "LB#" << BundleIdx << ": ");
+ LLVM_DEBUG(dbgs() << "LB#" << BundleIdx << ": ");
if (Bundle.isFixed()) {
- DEBUG(dbgs() << "Shuffling stack to match.\n");
+ LLVM_DEBUG(dbgs() << "Shuffling stack to match.\n");
shuffleStackTop(Bundle.FixStack, Bundle.FixCount, Term);
} else {
// Not fixed yet, we get to choose.
- DEBUG(dbgs() << "Fixing stack order now.\n");
+ LLVM_DEBUG(dbgs() << "Fixing stack order now.\n");
Bundle.FixCount = StackTop;
for (unsigned i = 0; i < StackTop; ++i)
Bundle.FixStack[i] = getStackEntry(i);
@@ -599,13 +600,14 @@ static int Lookup(ArrayRef<TableEntry> Table, unsigned Opcode) {
#ifdef NDEBUG
#define ASSERT_SORTED(TABLE)
#else
-#define ASSERT_SORTED(TABLE) \
- { static bool TABLE##Checked = false; \
- if (!TABLE##Checked) { \
- assert(std::is_sorted(std::begin(TABLE), std::end(TABLE)) && \
- "All lookup tables must be sorted for efficient access!"); \
- TABLE##Checked = true; \
- } \
+#define ASSERT_SORTED(TABLE) \
+ { \
+ static std::atomic<bool> TABLE##Checked(false); \
+ if (!TABLE##Checked.load(std::memory_order_relaxed)) { \
+ assert(std::is_sorted(std::begin(TABLE), std::end(TABLE)) && \
+ "All lookup tables must be sorted for efficient access!"); \
+ TABLE##Checked.store(true, std::memory_order_relaxed); \
+ } \
}
#endif
@@ -893,7 +895,8 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
while (Kills && Defs) {
unsigned KReg = countTrailingZeros(Kills);
unsigned DReg = countTrailingZeros(Defs);
- DEBUG(dbgs() << "Renaming %fp" << KReg << " as imp %fp" << DReg << "\n");
+ LLVM_DEBUG(dbgs() << "Renaming %fp" << KReg << " as imp %fp" << DReg
+ << "\n");
std::swap(Stack[getSlot(KReg)], Stack[getSlot(DReg)]);
std::swap(RegMap[KReg], RegMap[DReg]);
Kills &= ~(1 << KReg);
@@ -907,7 +910,7 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
unsigned KReg = getStackEntry(0);
if (!(Kills & (1 << KReg)))
break;
- DEBUG(dbgs() << "Popping %fp" << KReg << "\n");
+ LLVM_DEBUG(dbgs() << "Popping %fp" << KReg << "\n");
popStackAfter(I2);
Kills &= ~(1 << KReg);
}
@@ -916,7 +919,7 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
// Manually kill the rest.
while (Kills) {
unsigned KReg = countTrailingZeros(Kills);
- DEBUG(dbgs() << "Killing %fp" << KReg << "\n");
+ LLVM_DEBUG(dbgs() << "Killing %fp" << KReg << "\n");
freeStackSlotBefore(I, KReg);
Kills &= ~(1 << KReg);
}
@@ -924,14 +927,14 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
// Load zeros for all the imp-defs.
while(Defs) {
unsigned DReg = countTrailingZeros(Defs);
- DEBUG(dbgs() << "Defining %fp" << DReg << " as 0\n");
+ LLVM_DEBUG(dbgs() << "Defining %fp" << DReg << " as 0\n");
BuildMI(*MBB, I, DebugLoc(), TII->get(X86::LD_F0));
pushReg(DReg);
Defs &= ~(1 << DReg);
}
// Now we should have the correct registers live.
- DEBUG(dumpStack());
+ LLVM_DEBUG(dumpStack());
assert(StackTop == countPopulation(Mask) && "Live count mismatch");
}
@@ -954,7 +957,7 @@ void FPS::shuffleStackTop(const unsigned char *FixStack,
if (FixCount > 0)
moveToTop(OldReg, I);
}
- DEBUG(dumpStack());
+ LLVM_DEBUG(dumpStack());
}
@@ -1466,7 +1469,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) {
case TargetOpcode::IMPLICIT_DEF: {
// All FP registers must be explicitly defined, so load a 0 instead.
unsigned Reg = MI.getOperand(0).getReg() - X86::FP0;
- DEBUG(dbgs() << "Emitting LD_F0 for implicit FP" << Reg << '\n');
+ LLVM_DEBUG(dbgs() << "Emitting LD_F0 for implicit FP" << Reg << '\n');
BuildMI(*MBB, Inst, MI.getDebugLoc(), TII->get(X86::LD_F0));
pushReg(Reg);
break;
@@ -1571,8 +1574,9 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) {
MI.emitError("implicitly popped regs must be last on the x87 stack");
unsigned NumSTPopped = countTrailingOnes(STPopped);
- DEBUG(dbgs() << "Asm uses " << NumSTUses << " fixed regs, pops "
- << NumSTPopped << ", and defines " << NumSTDefs << " regs.\n");
+ LLVM_DEBUG(dbgs() << "Asm uses " << NumSTUses << " fixed regs, pops "
+ << NumSTPopped << ", and defines " << NumSTDefs
+ << " regs.\n");
#ifndef NDEBUG
// If any input operand uses constraint "f", all output register
@@ -1610,7 +1614,10 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) {
STUsesArray[I] = I;
shuffleStackTop(STUsesArray, NumSTUses, Inst);
- DEBUG({dbgs() << "Before asm: "; dumpStack();});
+ LLVM_DEBUG({
+ dbgs() << "Before asm: ";
+ dumpStack();
+ });
// With the stack layout fixed, rewrite the FP registers.
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
@@ -1658,7 +1665,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) {
// We want to leave I pointing to the previous instruction, but what if we
// just erased the first instruction?
if (Inst == MBB->begin()) {
- DEBUG(dbgs() << "Inserting dummy KILL\n");
+ LLVM_DEBUG(dbgs() << "Inserting dummy KILL\n");
Inst = BuildMI(*MBB, Inst, DebugLoc(), TII->get(TargetOpcode::KILL));
} else
--Inst;
@@ -1673,7 +1680,7 @@ void FPS::setKillFlags(MachineBasicBlock &MBB) const {
for (MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();
I != E; ++I) {
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
std::bitset<8> Defs;
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 80b1cc192a88..a257ec41f75b 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -248,6 +248,7 @@ flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) {
/// stack pointer by a constant value.
void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &DL,
int64_t NumBytes, bool InEpilogue) const {
bool isSub = NumBytes < 0;
uint64_t Offset = isSub ? -NumBytes : NumBytes;
@@ -255,7 +256,6 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
isSub ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy;
uint64_t Chunk = (1LL << 31) - 1;
- DebugLoc DL = MBB.findDebugLoc(MBBI);
if (Offset > Chunk) {
// Rather than emit a long series of instructions for large offsets,
@@ -399,28 +399,30 @@ int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
return 0;
MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI;
- MachineBasicBlock::iterator NI = doMergeWithPrevious ? nullptr
- : std::next(MBBI);
+
PI = skipDebugInstructionsBackward(PI, MBB.begin());
- if (NI != nullptr)
- NI = skipDebugInstructionsForward(NI, MBB.end());
+ // It is assumed that the ADD/SUB/LEA instruction is succeeded by one CFI
+ // instruction, and that there are no DBG_VALUE or other instructions between
+ // the ADD/SUB/LEA and its corresponding CFI instruction.
+ /* TODO: Add support for the case where there are multiple CFI instructions
+ below the ADD/SUB/LEA, e.g.:
+ ...
+ add
+ cfi_def_cfa_offset
+ cfi_offset
+ ...
+ */
+ if (doMergeWithPrevious && PI != MBB.begin() && PI->isCFIInstruction())
+ PI = std::prev(PI);
unsigned Opc = PI->getOpcode();
int Offset = 0;
- if (!doMergeWithPrevious && NI != MBB.end() &&
- NI->getOpcode() == TargetOpcode::CFI_INSTRUCTION) {
- // Don't merge with the next instruction if it has CFI.
- return Offset;
- }
-
if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
PI->getOperand(0).getReg() == StackPtr){
assert(PI->getOperand(1).getReg() == StackPtr);
- Offset += PI->getOperand(2).getImm();
- MBB.erase(PI);
- if (!doMergeWithPrevious) MBBI = NI;
+ Offset = PI->getOperand(2).getImm();
} else if ((Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
PI->getOperand(0).getReg() == StackPtr &&
PI->getOperand(1).getReg() == StackPtr &&
@@ -428,17 +430,19 @@ int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
PI->getOperand(3).getReg() == X86::NoRegister &&
PI->getOperand(5).getReg() == X86::NoRegister) {
// For LEAs we have: def = lea SP, FI, noreg, Offset, noreg.
- Offset += PI->getOperand(4).getImm();
- MBB.erase(PI);
- if (!doMergeWithPrevious) MBBI = NI;
+ Offset = PI->getOperand(4).getImm();
} else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
PI->getOperand(0).getReg() == StackPtr) {
assert(PI->getOperand(1).getReg() == StackPtr);
- Offset -= PI->getOperand(2).getImm();
- MBB.erase(PI);
- if (!doMergeWithPrevious) MBBI = NI;
- }
+ Offset = -PI->getOperand(2).getImm();
+ } else
+ return 0;
+
+ PI = MBB.erase(PI);
+ if (PI != MBB.end() && PI->isCFIInstruction()) PI = MBB.erase(PI);
+ if (!doMergeWithPrevious)
+ MBBI = skipDebugInstructionsForward(PI, MBB.end());
return Offset;
}
@@ -741,6 +745,11 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
bool InProlog) const {
bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large;
+ // FIXME: Add retpoline support and remove this.
+ if (Is64Bit && IsLargeCodeModel && STI.useRetpoline())
+ report_fatal_error("Emitting stack probe calls on 64-bit with the large "
+ "code model and retpoline not yet implemented.");
+
unsigned CallOp;
if (Is64Bit)
CallOp = IsLargeCodeModel ? X86::CALL64r : X86::CALL64pcrel32;
@@ -993,7 +1002,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
Fn.arg_size() == 2) {
StackSize += 8;
MFI.setStackSize(StackSize);
- emitSPUpdate(MBB, MBBI, -8, /*InEpilogue=*/false);
+ emitSPUpdate(MBB, MBBI, DL, -8, /*InEpilogue=*/false);
}
// If this is x86-64 and the Red Zone is not disabled, if we are a leaf
@@ -1208,30 +1217,34 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
bool isEAXAlive = isEAXLiveIn(MBB);
if (isEAXAlive) {
- // Sanity check that EAX is not livein for this function.
- // It should not be, so throw an assert.
- assert(!Is64Bit && "EAX is livein in x64 case!");
-
- // Save EAX
- BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
- .addReg(X86::EAX, RegState::Kill)
- .setMIFlag(MachineInstr::FrameSetup);
+ if (Is64Bit) {
+ // Save RAX
+ BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))
+ .addReg(X86::RAX, RegState::Kill)
+ .setMIFlag(MachineInstr::FrameSetup);
+ } else {
+ // Save EAX
+ BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
+ .addReg(X86::EAX, RegState::Kill)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
}
if (Is64Bit) {
// Handle the 64-bit Windows ABI case where we need to call __chkstk.
// Function prologue is responsible for adjusting the stack pointer.
- if (isUInt<32>(NumBytes)) {
+ int Alloc = isEAXAlive ? NumBytes - 8 : NumBytes;
+ if (isUInt<32>(Alloc)) {
BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
- .addImm(NumBytes)
+ .addImm(Alloc)
.setMIFlag(MachineInstr::FrameSetup);
- } else if (isInt<32>(NumBytes)) {
+ } else if (isInt<32>(Alloc)) {
BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri32), X86::RAX)
- .addImm(NumBytes)
+ .addImm(Alloc)
.setMIFlag(MachineInstr::FrameSetup);
} else {
BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX)
- .addImm(NumBytes)
+ .addImm(Alloc)
.setMIFlag(MachineInstr::FrameSetup);
}
} else {
@@ -1246,15 +1259,19 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
emitStackProbe(MF, MBB, MBBI, DL, true);
if (isEAXAlive) {
- // Restore EAX
- MachineInstr *MI =
- addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX),
- StackPtr, false, NumBytes - 4);
+ // Restore RAX/EAX
+ MachineInstr *MI;
+ if (Is64Bit)
+ MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV64rm), X86::RAX),
+ StackPtr, false, NumBytes - 8);
+ else
+ MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX),
+ StackPtr, false, NumBytes - 4);
MI->setFlag(MachineInstr::FrameSetup);
MBB.insert(MBBI, MI);
}
} else if (NumBytes) {
- emitSPUpdate(MBB, MBBI, -(int64_t)NumBytes, /*InEpilogue=*/false);
+ emitSPUpdate(MBB, MBBI, DL, -(int64_t)NumBytes, /*InEpilogue=*/false);
}
if (NeedsWinCFI && NumBytes) {
@@ -1560,6 +1577,11 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
bool HasFP = hasFP(MF);
uint64_t NumBytes = 0;
+ bool NeedsDwarfCFI =
+ (!MF.getTarget().getTargetTriple().isOSDarwin() &&
+ !MF.getTarget().getTargetTriple().isOSWindows()) &&
+ (MF.getMMI().hasDebugInfo() || MF.getFunction().needsUnwindTableEntry());
+
if (IsFunclet) {
assert(HasFP && "EH funclets without FP not yet implemented");
NumBytes = getWinEHFuncletFrameSize(MF);
@@ -1582,6 +1604,13 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
MachineFramePtr)
.setMIFlag(MachineInstr::FrameDestroy);
+ if (NeedsDwarfCFI) {
+ unsigned DwarfStackPtr =
+ TRI->getDwarfRegNum(Is64Bit ? X86::RSP : X86::ESP, true);
+ BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfa(
+ nullptr, DwarfStackPtr, -SlotSize));
+ --MBBI;
+ }
}
MachineBasicBlock::iterator FirstCSPop = MBBI;
@@ -1644,7 +1673,12 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
}
} else if (NumBytes) {
// Adjust stack pointer back: ESP += numbytes.
- emitSPUpdate(MBB, MBBI, NumBytes, /*InEpilogue=*/true);
+ emitSPUpdate(MBB, MBBI, DL, NumBytes, /*InEpilogue=*/true);
+ if (!hasFP(MF) && NeedsDwarfCFI) {
+ // Define the current CFA rule to use the provided offset.
+ BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset(
+ nullptr, -CSSize - SlotSize));
+ }
--MBBI;
}
@@ -1657,6 +1691,23 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
if (NeedsWin64CFI && MF.hasWinCFI())
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue));
+ if (!hasFP(MF) && NeedsDwarfCFI) {
+ MBBI = FirstCSPop;
+ int64_t Offset = -CSSize - SlotSize;
+ // Mark callee-saved pop instruction.
+ // Define the current CFA rule to use the provided offset.
+ while (MBBI != MBB.end()) {
+ MachineBasicBlock::iterator PI = MBBI;
+ unsigned Opc = PI->getOpcode();
+ ++MBBI;
+ if (Opc == X86::POP32r || Opc == X86::POP64r) {
+ Offset += SlotSize;
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::createDefCfaOffset(nullptr, Offset));
+ }
+ }
+ }
+
if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) {
// Add the return addr area delta back since we are not tail calling.
int Offset = -1 * X86FI->getTCReturnAddrDelta();
@@ -1664,7 +1715,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
if (Offset) {
// Check for possible merge with preceding ADD instruction.
Offset += mergeSPUpdates(MBB, Terminator, true);
- emitSPUpdate(MBB, Terminator, Offset, /*InEpilogue=*/true);
+ emitSPUpdate(MBB, Terminator, DL, Offset, /*InEpilogue=*/true);
}
}
}
@@ -1855,6 +1906,32 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
unsigned CalleeSavedFrameSize = 0;
int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta();
+ int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+
+ if (TailCallReturnAddrDelta < 0) {
+ // create RETURNADDR area
+ // arg
+ // arg
+ // RETADDR
+ // { ...
+ // RETADDR area
+ // ...
+ // }
+ // [EBP]
+ MFI.CreateFixedObject(-TailCallReturnAddrDelta,
+ TailCallReturnAddrDelta - SlotSize, true);
+ }
+
+ // Spill the BasePtr if it's used.
+ if (this->TRI->hasBasePointer(MF)) {
+ // Allocate a spill slot for EBP if we have a base pointer and EH funclets.
+ if (MF.hasEHFunclets()) {
+ int FI = MFI.CreateSpillStackObject(SlotSize, SlotSize);
+ X86FI->setHasSEHFramePtrSave(true);
+ X86FI->setSEHFramePtrSaveIndex(FI);
+ }
+ }
+
if (hasFP(MF)) {
// emitPrologue always spills frame register the first thing.
SpillSlotOffset -= SlotSize;
@@ -1894,7 +1971,12 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
continue;
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ // If this is a k-register, make sure we look it up via the largest legal type.
+ MVT VT = MVT::Other;
+ if (X86::VK16RegClass.contains(Reg))
+ VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1;
+
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
unsigned Size = TRI->getSpillSize(*RC);
unsigned Align = TRI->getSpillAlignment(*RC);
// ensure alignment
@@ -1961,9 +2043,15 @@ bool X86FrameLowering::spillCalleeSavedRegisters(
unsigned Reg = CSI[i-1].getReg();
if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
continue;
+
+ // If this is a k-register, make sure we look it up via the largest legal type.
+ MVT VT = MVT::Other;
+ if (X86::VK16RegClass.contains(Reg))
+ VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1;
+
// Add the callee-saved register as live-in. It's killed at the spill.
MBB.addLiveIn(Reg);
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i - 1].getFrameIdx(), RC,
TRI);
@@ -2037,7 +2125,12 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
X86::GR32RegClass.contains(Reg))
continue;
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ // If this is a k-register, make sure we look it up via the largest legal type.
+ MVT VT = MVT::Other;
+ if (X86::VK16RegClass.contains(Reg))
+ VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1;
+
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI);
}
@@ -2060,35 +2153,12 @@ void X86FrameLowering::determineCalleeSaves(MachineFunction &MF,
RegScavenger *RS) const {
TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
- MachineFrameInfo &MFI = MF.getFrameInfo();
-
- X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
- int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
-
- if (TailCallReturnAddrDelta < 0) {
- // create RETURNADDR area
- // arg
- // arg
- // RETADDR
- // { ...
- // RETADDR area
- // ...
- // }
- // [EBP]
- MFI.CreateFixedObject(-TailCallReturnAddrDelta,
- TailCallReturnAddrDelta - SlotSize, true);
- }
-
// Spill the BasePtr if it's used.
- if (TRI->hasBasePointer(MF)) {
- SavedRegs.set(TRI->getBaseRegister());
-
- // Allocate a spill slot for EBP if we have a base pointer and EH funclets.
- if (MF.hasEHFunclets()) {
- int FI = MFI.CreateSpillStackObject(SlotSize, SlotSize);
- X86FI->setHasSEHFramePtrSave(true);
- X86FI->setSEHFramePtrSaveIndex(FI);
- }
+ if (TRI->hasBasePointer(MF)){
+ unsigned BasePtr = TRI->getBaseRegister();
+ if (STI.isTarget64BitILP32())
+ BasePtr = getX86SubSuperRegister(BasePtr, 64);
+ SavedRegs.set(BasePtr);
}
}
@@ -2171,8 +2241,10 @@ void X86FrameLowering::adjustForSegmentedStacks(
// prologue.
StackSize = MFI.getStackSize();
- // Do not generate a prologue for functions with a stack of size zero
- if (StackSize == 0)
+ // Do not generate a prologue for leaf functions with a stack of size zero.
+ // For non-leaf functions we have to allow for the possibility that the
+ // call is to a non-split function, as in PR37807.
+ if (StackSize == 0 && !MFI.hasTailCall())
return;
MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock();
@@ -2345,6 +2417,10 @@ void X86FrameLowering::adjustForSegmentedStacks(
// This solution is not perfect, as it assumes that the .rodata section
// is laid out within 2^31 bytes of each function body, but this seems
// to be sufficient for JIT.
+ // FIXME: Add retpoline support and remove the error here.
+ if (STI.useRetpoline())
+ report_fatal_error("Emitting morestack calls on 64-bit with the large "
+ "code model and retpoline not yet implemented.");
BuildMI(allocMBB, DL, TII.get(X86::CALL64m))
.addReg(X86::RIP)
.addImm(0)
@@ -2683,7 +2759,6 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// Add Amount to SP to destroy a frame, or subtract to setup.
int64_t StackAdjustment = isDestroy ? Amount : -Amount;
- int64_t CfaAdjustment = -StackAdjustment;
if (StackAdjustment) {
// Merge with any previous or following adjustment instruction. Note: the
@@ -2708,6 +2783,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// offset to be correct at each call site, while for debugging we want
// it to be more precise.
+ int64_t CfaAdjustment = -StackAdjustment;
// TODO: When not using precise CFA, we also need to adjust for the
// InternalAmt here.
if (CfaAdjustment) {
@@ -2838,6 +2914,15 @@ MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
return MBBI;
}
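+// At function entry the return address has been pushed, so the CFA sits one
+// slot above the stack pointer; hence the initial CFA offset is SlotSize.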
+int X86FrameLowering::getInitialCFAOffset(const MachineFunction &MF) const {
+ return TRI->getSlotSize();
+}
+
+unsigned X86FrameLowering::getInitialCFARegister(const MachineFunction &MF)
+ const {
+ return TRI->getDwarfRegNum(StackPtr, true);
+}
+
namespace {
// Struct used by orderFrameObjects to help sort the stack objects.
struct X86FrameSortingObject {
@@ -2942,7 +3027,7 @@ void X86FrameLowering::orderFrameObjects(
// Count the number of uses for each object.
for (auto &MBB : MF) {
for (auto &MI : MBB) {
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
for (const MachineOperand &MO : MI.operands()) {
// Check to see if it's a local stack symbol.
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index 909319fc18fc..3bd805aae123 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -125,7 +125,7 @@ public:
/// Emit a series of instructions to increment / decrement the stack
/// pointer by a constant value.
void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
- int64_t NumBytes, bool InEpilogue) const;
+ const DebugLoc &DL, int64_t NumBytes, bool InEpilogue) const;
/// Check that LEA can be used on SP in an epilogue sequence for \p MF.
bool canUseLEAForSPInEpilogue(const MachineFunction &MF) const;
@@ -168,6 +168,10 @@ public:
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, bool RestoreSP = false) const;
+ int getInitialCFAOffset(const MachineFunction &MF) const override;
+
+ unsigned getInitialCFARegister(const MachineFunction &MF) const override;
+
private:
uint64_t calculateMaxStackAlign(const MachineFunction &MF) const;
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 660c1eff3c4b..a28d4eac8393 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -21,6 +21,7 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
@@ -100,11 +101,11 @@ namespace {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- void dump() {
+ void dump(SelectionDAG *DAG = nullptr) {
dbgs() << "X86ISelAddressMode " << this << '\n';
dbgs() << "Base_Reg ";
if (Base_Reg.getNode())
- Base_Reg.getNode()->dump();
+ Base_Reg.getNode()->dump(DAG);
else
dbgs() << "nul\n";
if (BaseType == FrameIndexBase)
@@ -112,7 +113,7 @@ namespace {
dbgs() << " Scale " << Scale << '\n'
<< "IndexReg ";
if (IndexReg.getNode())
- IndexReg.getNode()->dump();
+ IndexReg.getNode()->dump(DAG);
else
dbgs() << "nul\n";
dbgs() << " Disp " << Disp << '\n'
@@ -181,6 +182,7 @@ namespace {
bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
void PreprocessISelDAG() override;
+ void PostprocessISelDAG() override;
// Include the pieces autogenerated from the target description.
#include "X86GenDAGISel.inc"
@@ -213,7 +215,7 @@ namespace {
bool selectTLSADDRAddr(SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
- bool selectScalarSSELoad(SDNode *Root, SDValue N,
+ bool selectScalarSSELoad(SDNode *Root, SDNode *Parent, SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment,
@@ -225,7 +227,7 @@ namespace {
SDValue &Index, SDValue &Disp,
SDValue &Segment);
- // Convience method where P is also root.
+ // Convenience method where P is also root.
bool tryFoldLoad(SDNode *P, SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
@@ -233,6 +235,12 @@ namespace {
return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
}
+ // Try to fold a vector load. This makes sure the load isn't non-temporal.
+ bool tryFoldVecLoad(SDNode *Root, SDNode *P, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+
/// Implement addressing mode selection for inline asm expressions.
bool SelectInlineAsmMemoryOperand(const SDValue &Op,
unsigned ConstraintID,
@@ -368,6 +376,11 @@ namespace {
return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
}
+ /// Return a target constant with the specified value, of type i64.
+ inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
+ return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
+ }
+
SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
const SDLoc &DL) {
assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
@@ -401,7 +414,7 @@ namespace {
return Subtarget->getInstrInfo();
}
- /// \brief Address-mode matching performs shift-of-and to and-of-shift
+ /// Address-mode matching performs shift-of-and to and-of-shift
/// reassociation in order to expose more scaled addressing
/// opportunities.
bool ComplexPatternFuncMutatesDAG() const override {
@@ -440,10 +453,15 @@ namespace {
}
bool foldLoadStoreIntoMemOperand(SDNode *Node);
-
bool matchBEXTRFromAnd(SDNode *Node);
-
+ bool shrinkAndImmediate(SDNode *N);
bool isMaskZeroExtended(SDNode *N) const;
+
+ MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
+ const SDLoc &dl, MVT VT, SDNode *Node);
+ MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
+ const SDLoc &dl, MVT VT, SDNode *Node,
+ SDValue &InFlag);
};
}
@@ -452,19 +470,21 @@ namespace {
// type.
static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
unsigned Opcode = N->getOpcode();
- if (Opcode == X86ISD::PCMPEQM || Opcode == X86ISD::PCMPGTM ||
- Opcode == X86ISD::CMPM || Opcode == X86ISD::TESTM ||
- Opcode == X86ISD::TESTNM || Opcode == X86ISD::CMPMU ||
- Opcode == X86ISD::CMPM_RND) {
+ if (Opcode == X86ISD::CMPM || Opcode == ISD::SETCC ||
+ Opcode == X86ISD::CMPM_RND || Opcode == X86ISD::VFPCLASS) {
// We can get 256-bit 8 element types here without VLX being enabled. When
// this happens we will use 512-bit operations and the mask will not be
// zero extended.
EVT OpVT = N->getOperand(0).getValueType();
- if (OpVT == MVT::v8i32 || OpVT == MVT::v8f32)
+ if (OpVT.is256BitVector() || OpVT.is128BitVector())
return Subtarget->hasVLX();
return true;
}
+ // Scalar opcodes use 128-bit registers but aren't subject to the VLX check.
+ if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
+ Opcode == X86ISD::FSETCCM_RND)
+ return true;
return false;
}
@@ -518,10 +538,21 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
// addl 4(%esp), %eax
// The former is 2 bytes shorter. In case where the increment is 1, then
// the saving can be 4 bytes (by using incl %eax).
- if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1))
+ if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1)) {
if (Imm->getAPIntValue().isSignedIntN(8))
return false;
+ // If this is a 64-bit AND with an immediate that fits in 32-bits,
+ // prefer using the smaller AND over folding the load. This is needed to
+ // make sure immediates created by shrinkAndImmediate are always folded.
+ // Ideally we would narrow the load during DAG combine and get the
+ // best of both worlds.
+ if (U->getOpcode() == ISD::AND &&
+ Imm->getAPIntValue().getBitWidth() == 64 &&
+ Imm->getAPIntValue().isIntN(32))
+ return false;
+ }
+
// If the other operand is a TLS address, we should fold it instead.
// This produces
// movl %gs:0, %eax
@@ -537,10 +568,60 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
return false;
}
+
+ // Don't fold load if this matches the BTS/BTR/BTC patterns.
+ // BTS: (or X, (shl 1, n))
+ // BTR: (and X, (rotl -2, n))
+ // BTC: (xor X, (shl 1, n))
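+ // Note that (rotl -2, n) is the all-ones value with only bit n clear, i.e.
+ // the mask BTR needs.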
+ if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
+ if (U->getOperand(0).getOpcode() == ISD::SHL &&
+ isOneConstant(U->getOperand(0).getOperand(0)))
+ return false;
+
+ if (U->getOperand(1).getOpcode() == ISD::SHL &&
+ isOneConstant(U->getOperand(1).getOperand(0)))
+ return false;
+ }
+ if (U->getOpcode() == ISD::AND) {
+ SDValue U0 = U->getOperand(0);
+ SDValue U1 = U->getOperand(1);
+ if (U0.getOpcode() == ISD::ROTL) {
+ auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
+ if (C && C->getSExtValue() == -2)
+ return false;
+ }
+
+ if (U1.getOpcode() == ISD::ROTL) {
+ auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
+ if (C && C->getSExtValue() == -2)
+ return false;
+ }
+ }
+
+ break;
}
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ // Don't fold a load into a shift by immediate. The BMI2 instructions
+ // support folding a load, but not an immediate. The legacy instructions
+ // support folding an immediate, but can't fold a load. Folding an
+ // immediate is preferable to folding a load.
+ if (isa<ConstantSDNode>(U->getOperand(1)))
+ return false;
+
+ break;
}
}
+ // Prevent folding a load if this can be implemented with an insert_subreg or
+ // a move that implicitly zeroes.
+ if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
+ isNullConstant(Root->getOperand(2)) &&
+ (Root->getOperand(0).isUndef() ||
+ ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
+ return false;
+
return true;
}
@@ -628,12 +709,24 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
E = CurDAG->allnodes_end(); I != E; ) {
SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
+ // If this is a target specific AND node with no flag usages, turn it back
+ // into ISD::AND to enable test instruction matching.
+ if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
+ SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
+ N->getOperand(0), N->getOperand(1));
+ --I;
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
+ ++I;
+ CurDAG->DeleteNode(N);
+ continue;
+ }
+
if (OptLevel != CodeGenOpt::None &&
- // Only does this when target favors doesn't favor register indirect
- // call.
+ // Only do this when the target can fold the load into the call or
+ // jmp.
+ !Subtarget->useRetpoline() &&
((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
(N->getOpcode() == X86ISD::TC_RETURN &&
- // Only does this if load can be folded into TC_RETURN.
(Subtarget->is64Bit() ||
!getTargetMachine().isPositionIndependent())))) {
/// Also try moving call address load from outside callseq_start to just
@@ -735,6 +828,70 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
}
+void X86DAGToDAGISel::PostprocessISelDAG() {
+ // Skip peepholes at -O0.
+ if (TM.getOptLevel() == CodeGenOpt::None)
+ return;
+
+ // Attempt to remove vector moves that were inserted to zero upper bits.
+
+ SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
+ ++Position;
+
+ while (Position != CurDAG->allnodes_begin()) {
+ SDNode *N = &*--Position;
+ // Skip dead nodes and any non-machine opcodes.
+ if (N->use_empty() || !N->isMachineOpcode())
+ continue;
+
+ if (N->getMachineOpcode() != TargetOpcode::SUBREG_TO_REG)
+ continue;
+
+ unsigned SubRegIdx = N->getConstantOperandVal(2);
+ if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
+ continue;
+
+ SDValue Move = N->getOperand(1);
+ if (!Move.isMachineOpcode())
+ continue;
+
+ // Make sure it's one of the move opcodes we recognize.
+ switch (Move.getMachineOpcode()) {
+ default:
+ continue;
+ case X86::VMOVAPDrr: case X86::VMOVUPDrr:
+ case X86::VMOVAPSrr: case X86::VMOVUPSrr:
+ case X86::VMOVDQArr: case X86::VMOVDQUrr:
+ case X86::VMOVAPDYrr: case X86::VMOVUPDYrr:
+ case X86::VMOVAPSYrr: case X86::VMOVUPSYrr:
+ case X86::VMOVDQAYrr: case X86::VMOVDQUYrr:
+ case X86::VMOVAPDZ128rr: case X86::VMOVUPDZ128rr:
+ case X86::VMOVAPSZ128rr: case X86::VMOVUPSZ128rr:
+ case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr:
+ case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr:
+ case X86::VMOVAPDZ256rr: case X86::VMOVUPDZ256rr:
+ case X86::VMOVAPSZ256rr: case X86::VMOVUPSZ256rr:
+ case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr:
+ case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr:
+ break;
+ }
+
+ SDValue In = Move.getOperand(0);
+ if (!In.isMachineOpcode() ||
+ In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
+ continue;
+
+ // The producing instruction is another vector instruction, so we can drop
+ // the move.
+ CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
+
+ // If the move is now dead, delete it.
+ if (Move.getNode()->use_empty())
+ CurDAG->RemoveDeadNode(Move.getNode());
+ }
+}
+
+
/// Emit any code that needs to be executed only in the main function.
void X86DAGToDAGISel::emitSpecialCodeForMain() {
if (Subtarget->isTargetCygMing()) {
@@ -771,9 +928,14 @@ static bool isDispSafeForFrameIndex(int64_t Val) {
bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
X86ISelAddressMode &AM) {
+ // If there's no offset to fold, we don't need to do any work.
+ if (Offset == 0)
+ return false;
+
// Cannot combine ExternalSymbol displacements with integer offsets.
- if (Offset != 0 && (AM.ES || AM.MCSym))
+ if (AM.ES || AM.MCSym)
return true;
+
int64_t Val = AM.Disp + Offset;
CodeModel::Model M = TM.getCodeModel();
if (Subtarget->is64Bit()) {
@@ -827,94 +989,60 @@ bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
if (AM.hasSymbolicDisplacement())
return true;
- SDValue N0 = N.getOperand(0);
+ bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
+
+ // We can't use an addressing mode in the 64-bit large code model. In the
+ // medium code model, we can use an addressing mode when RIP wrappers are present.
+ // That signifies access to globals that are known to be "near", such as the
+ // GOT itself.
CodeModel::Model M = TM.getCodeModel();
+ if (Subtarget->is64Bit() &&
+ (M == CodeModel::Large || (M == CodeModel::Medium && !IsRIPRel)))
+ return true;
- // Handle X86-64 rip-relative addresses. We check this before checking direct
- // folding because RIP is preferable to non-RIP accesses.
- if (Subtarget->is64Bit() && N.getOpcode() == X86ISD::WrapperRIP &&
- // Under X86-64 non-small code model, GV (and friends) are 64-bits, so
- // they cannot be folded into immediate fields.
- // FIXME: This can be improved for kernel and other models?
- (M == CodeModel::Small || M == CodeModel::Kernel)) {
- // Base and index reg must be 0 in order to use %rip as base.
- if (AM.hasBaseOrIndexReg())
- return true;
- if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
- X86ISelAddressMode Backup = AM;
- AM.GV = G->getGlobal();
- AM.SymbolFlags = G->getTargetFlags();
- if (foldOffsetIntoAddress(G->getOffset(), AM)) {
- AM = Backup;
- return true;
- }
- } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
- X86ISelAddressMode Backup = AM;
- AM.CP = CP->getConstVal();
- AM.Align = CP->getAlignment();
- AM.SymbolFlags = CP->getTargetFlags();
- if (foldOffsetIntoAddress(CP->getOffset(), AM)) {
- AM = Backup;
- return true;
- }
- } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
- AM.ES = S->getSymbol();
- AM.SymbolFlags = S->getTargetFlags();
- } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
- AM.MCSym = S->getMCSymbol();
- } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
- AM.JT = J->getIndex();
- AM.SymbolFlags = J->getTargetFlags();
- } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
- X86ISelAddressMode Backup = AM;
- AM.BlockAddr = BA->getBlockAddress();
- AM.SymbolFlags = BA->getTargetFlags();
- if (foldOffsetIntoAddress(BA->getOffset(), AM)) {
- AM = Backup;
- return true;
- }
- } else
- llvm_unreachable("Unhandled symbol reference node.");
+ // Base and index reg must be 0 in order to use %rip as base.
+ if (IsRIPRel && AM.hasBaseOrIndexReg())
+ return true;
- if (N.getOpcode() == X86ISD::WrapperRIP)
- AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
- return false;
- }
+ // Make a local copy in case we can't do this fold.
+ X86ISelAddressMode Backup = AM;
- // Handle the case when globals fit in our immediate field: This is true for
- // X86-32 always and X86-64 when in -mcmodel=small mode. In 64-bit
- // mode, this only applies to a non-RIP-relative computation.
- if (!Subtarget->is64Bit() ||
- M == CodeModel::Small || M == CodeModel::Kernel) {
- assert(N.getOpcode() != X86ISD::WrapperRIP &&
- "RIP-relative addressing already handled");
- if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
- AM.GV = G->getGlobal();
- AM.Disp += G->getOffset();
- AM.SymbolFlags = G->getTargetFlags();
- } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
- AM.CP = CP->getConstVal();
- AM.Align = CP->getAlignment();
- AM.Disp += CP->getOffset();
- AM.SymbolFlags = CP->getTargetFlags();
- } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
- AM.ES = S->getSymbol();
- AM.SymbolFlags = S->getTargetFlags();
- } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
- AM.MCSym = S->getMCSymbol();
- } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
- AM.JT = J->getIndex();
- AM.SymbolFlags = J->getTargetFlags();
- } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
- AM.BlockAddr = BA->getBlockAddress();
- AM.Disp += BA->getOffset();
- AM.SymbolFlags = BA->getTargetFlags();
- } else
- llvm_unreachable("Unhandled symbol reference node.");
- return false;
+ int64_t Offset = 0;
+ SDValue N0 = N.getOperand(0);
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+ AM.GV = G->getGlobal();
+ AM.SymbolFlags = G->getTargetFlags();
+ Offset = G->getOffset();
+ } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+ AM.CP = CP->getConstVal();
+ AM.Align = CP->getAlignment();
+ AM.SymbolFlags = CP->getTargetFlags();
+ Offset = CP->getOffset();
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
+ AM.ES = S->getSymbol();
+ AM.SymbolFlags = S->getTargetFlags();
+ } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
+ AM.MCSym = S->getMCSymbol();
+ } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
+ AM.JT = J->getIndex();
+ AM.SymbolFlags = J->getTargetFlags();
+ } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
+ AM.BlockAddr = BA->getBlockAddress();
+ AM.SymbolFlags = BA->getTargetFlags();
+ Offset = BA->getOffset();
+ } else
+ llvm_unreachable("Unhandled symbol reference node.");
+
+ if (foldOffsetIntoAddress(Offset, AM)) {
+ AM = Backup;
+ return true;
}
- return true;
+ if (IsRIPRel)
+ AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
+
+ // Commit the changes now that we know this fold is safe.
+ return false;
}
/// Add the specified node to the specified addressing mode, returning true if
@@ -988,10 +1116,16 @@ bool X86DAGToDAGISel::matchAdd(SDValue N, X86ISelAddressMode &AM,
// IDs! The selection DAG must no longer depend on their uniqueness when this
// is used.
static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
- if (N.getNode()->getNodeId() == -1 ||
- N.getNode()->getNodeId() > Pos.getNode()->getNodeId()) {
- DAG.RepositionNode(Pos.getNode()->getIterator(), N.getNode());
- N.getNode()->setNodeId(Pos.getNode()->getNodeId());
+ if (N->getNodeId() == -1 ||
+ (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
+ SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
+ DAG.RepositionNode(Pos->getIterator(), N.getNode());
+ // Mark Node as invalid for pruning, as after this it may be a successor to a
+ // selected node but otherwise be in the same position as Pos.
+ // Conservatively mark it with the same -abs(Id) to ensure the node id
+ // invariant is preserved.
+ N->setNodeId(Pos->getNodeId());
+ SelectionDAGISel::InvalidateNodeId(N.getNode());
}
}
@@ -1196,10 +1330,10 @@ static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
unsigned Depth) {
SDLoc dl(N);
- DEBUG({
- dbgs() << "MatchAddress: ";
- AM.dump();
- });
+ LLVM_DEBUG({
+ dbgs() << "MatchAddress: ";
+ AM.dump(CurDAG);
+ });
// Limit recursion.
if (Depth > 5)
return matchAddressBase(N, AM);
@@ -1508,6 +1642,12 @@ bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
// TODO: Support other operations.
switch (N.getOpcode()) {
+ case ISD::Constant: {
+ uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
+ if (!foldOffsetIntoAddress(Val, AM))
+ return false;
+ break;
+ }
case X86ISD::Wrapper:
if (!matchWrapper(N, AM))
return false;
@@ -1523,7 +1663,7 @@ bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
X86ISelAddressMode AM;
auto *Mgs = cast<X86MaskedGatherScatterSDNode>(Parent);
AM.IndexReg = Mgs->getIndex();
- AM.Scale = Mgs->getValue().getScalarValueSizeInBits() / 8;
+ AM.Scale = cast<ConstantSDNode>(Mgs->getScale())->getZExtValue();
unsigned AddrSpace = cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
// AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
@@ -1534,14 +1674,8 @@ bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
if (AddrSpace == 258)
AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
- // If Base is 0, the whole address is in index and the Scale is 1
- if (isa<ConstantSDNode>(N)) {
- assert(cast<ConstantSDNode>(N)->isNullValue() &&
- "Unexpected base in gather/scatter");
- AM.Scale = 1;
- }
- // Otherwise, try to match into the base and displacement fields.
- else if (matchVectorAddress(N, AM))
+ // Try to match into the base and displacement fields.
+ if (matchVectorAddress(N, AM))
return false;
MVT VT = N.getSimpleValueType();
@@ -1604,8 +1738,7 @@ bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
// We can only fold a load if all nodes between it and the root node have a
// single use. If there are additional uses, we could end up duplicating the
// load.
-static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *N) {
- SDNode *User = *N->use_begin();
+static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *User) {
while (User != Root) {
if (!User->hasOneUse())
return false;
@@ -1622,17 +1755,19 @@ static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *N) {
/// We also return:
/// PatternChainNode: this is the matched node that has a chain input and
/// output.
-bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root,
+bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent,
SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index,
SDValue &Disp, SDValue &Segment,
SDValue &PatternNodeWithChain) {
+ if (!hasSingleUsesFromRoot(Root, Parent))
+ return false;
+
// We can allow a full vector load here since narrowing a load is ok.
if (ISD::isNON_EXTLoad(N.getNode())) {
PatternNodeWithChain = N;
if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
- IsLegalToFold(PatternNodeWithChain, *N->use_begin(), Root, OptLevel) &&
- hasSingleUsesFromRoot(Root, N.getNode())) {
+ IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) {
LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
Segment);
@@ -1643,8 +1778,7 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root,
if (N.getOpcode() == X86ISD::VZEXT_LOAD) {
PatternNodeWithChain = N;
if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
- IsLegalToFold(PatternNodeWithChain, *N->use_begin(), Root, OptLevel) &&
- hasSingleUsesFromRoot(Root, N.getNode())) {
+ IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) {
auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain);
return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp,
Segment);
@@ -1658,8 +1792,7 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root,
PatternNodeWithChain = N.getOperand(0);
if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
- IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel) &&
- hasSingleUsesFromRoot(Root, N.getNode())) {
+ IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
Segment);
@@ -1675,8 +1808,7 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root,
PatternNodeWithChain = N.getOperand(0).getOperand(0);
if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
- IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel) &&
- hasSingleUsesFromRoot(Root, N.getNode())) {
+ IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
// Okay, this is a zero extending load. Fold it.
LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
@@ -1699,10 +1831,10 @@ bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
}
// In static codegen with small code model, we can get the address of a label
- // into a register with 'movl'. TableGen has already made sure we're looking
- // at a label of some kind.
- assert(N->getOpcode() == X86ISD::Wrapper &&
- "Unexpected node type for MOV32ri64");
+  // into a register with 'movl'.
+ if (N->getOpcode() != X86ISD::Wrapper)
+ return false;
+
N = N.getOperand(0);
// At least GNU as does not accept 'movl' for TPOFF relocations.
@@ -1907,6 +2039,20 @@ bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
N.getOperand(1), Base, Scale, Index, Disp, Segment);
}
+bool X86DAGToDAGISel::tryFoldVecLoad(SDNode *Root, SDNode *P, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment) {
+ if (!ISD::isNON_EXTLoad(N.getNode()) ||
+ useNonTemporalLoad(cast<LoadSDNode>(N)) ||
+ !IsProfitableToFold(N, P, Root) ||
+ !IsLegalToFold(N, P, Root, OptLevel))
+ return false;
+
+ return selectAddr(N.getNode(),
+ N.getOperand(1), Base, Scale, Index, Disp, Segment);
+}
+
/// Return an SDNode that returns the value of the global base register.
/// Output instructions required to initialize the global base register,
/// if necessary.
@@ -2092,50 +2238,84 @@ static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
LoadNode->getOffset() != StoreNode->getOffset())
return false;
- // Check if the chain is produced by the load or is a TokenFactor with
- // the load output chain as an operand. Return InputChain by reference.
+ bool FoundLoad = false;
+ SmallVector<SDValue, 4> ChainOps;
+ SmallVector<const SDNode *, 4> LoopWorklist;
+ SmallPtrSet<const SDNode *, 16> Visited;
+ const unsigned int Max = 1024;
+
+ // Visualization of Load-Op-Store fusion:
+ // -------------------------
+ // Legend:
+ // *-lines = Chain operand dependencies.
+ // |-lines = Normal operand dependencies.
+ // Dependencies flow down and right. n-suffix references multiple nodes.
+ //
+ // C Xn C
+ // * * *
+ // * * *
+ // Xn A-LD Yn TF Yn
+ // * * \ | * |
+ // * * \ | * |
+ // * * \ | => A--LD_OP_ST
+ // * * \| \
+ // TF OP \
+ // * | \ Zn
+ // * | \
+ // A-ST Zn
+ //
+
+  // This merge induces dependences from: #1: Xn -> LD, OP, Zn
+ // #2: Yn -> LD
+ // #3: ST -> Zn
+
+ // Ensure the transform is safe by checking for the dual
+ // dependencies to make sure we do not induce a loop.
+
+ // As LD is a predecessor to both OP and ST we can do this by checking:
+ // a). if LD is a predecessor to a member of Xn or Yn.
+ // b). if a Zn is a predecessor to ST.
+
+ // However, (b) can only occur through being a chain predecessor to
+ // ST, which is the same as Zn being a member or predecessor of Xn,
+ // which is a subset of LD being a predecessor of Xn. So it's
+ // subsumed by check (a).
+
SDValue Chain = StoreNode->getChain();
- bool ChainCheck = false;
+ // Gather X elements in ChainOps.
if (Chain == Load.getValue(1)) {
- ChainCheck = true;
- InputChain = LoadNode->getChain();
+ FoundLoad = true;
+ ChainOps.push_back(Load.getOperand(0));
} else if (Chain.getOpcode() == ISD::TokenFactor) {
- SmallVector<SDValue, 4> ChainOps;
for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
SDValue Op = Chain.getOperand(i);
if (Op == Load.getValue(1)) {
- ChainCheck = true;
+ FoundLoad = true;
// Drop Load, but keep its chain. No cycle check necessary.
ChainOps.push_back(Load.getOperand(0));
continue;
}
-
- // Make sure using Op as part of the chain would not cause a cycle here.
- // In theory, we could check whether the chain node is a predecessor of
- // the load. But that can be very expensive. Instead visit the uses and
- // make sure they all have smaller node id than the load.
- int LoadId = LoadNode->getNodeId();
- for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
- UE = UI->use_end(); UI != UE; ++UI) {
- if (UI.getUse().getResNo() != 0)
- continue;
- if (UI->getNodeId() > LoadId)
- return false;
- }
-
+ LoopWorklist.push_back(Op.getNode());
ChainOps.push_back(Op);
}
-
- if (ChainCheck)
- // Make a new TokenFactor with all the other input chains except
- // for the load.
- InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain),
- MVT::Other, ChainOps);
}
- if (!ChainCheck)
+
+ if (!FoundLoad)
return false;
+ // Worklist is currently Xn. Add Yn to worklist.
+ for (SDValue Op : StoredVal->ops())
+ if (Op.getNode() != LoadNode)
+ LoopWorklist.push_back(Op.getNode());
+
+ // Check (a) if Load is a predecessor to Xn + Yn
+ if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
+ true))
+ return false;
+
+ InputChain =
+ CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
return true;
}
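A minimal sketch of the source-level shape this fusion targets; the function
name is hypothetical and the real matching operates on SelectionDAG nodes
rather than C++ source:

    // A read-modify-write of one memory location. When the load, the
    // arithmetic, and the store line up as in the diagram above, and the
    // Xn/Yn/Zn cycle check succeeds, the three nodes can be selected as a
    // single memory-destination instruction (e.g. an ADD32mr-style RMW).
    void bump(int *Counter, int Delta) {
      *Counter += Delta;
    }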
@@ -2177,7 +2357,9 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
case X86ISD::INC:
case X86ISD::DEC:
case X86ISD::ADD:
+ case X86ISD::ADC:
case X86ISD::SUB:
+ case X86ISD::SBB:
case X86ISD::AND:
case X86ISD::OR:
case X86ISD::XOR:
@@ -2225,7 +2407,9 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
break;
}
case X86ISD::ADD:
+ case X86ISD::ADC:
case X86ISD::SUB:
+ case X86ISD::SBB:
case X86ISD::AND:
case X86ISD::OR:
case X86ISD::XOR: {
@@ -2234,9 +2418,15 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
case X86ISD::ADD:
return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
X86::ADD8mr);
+ case X86ISD::ADC:
+ return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
+ X86::ADC8mr);
case X86ISD::SUB:
return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
X86::SUB8mr);
+ case X86ISD::SBB:
+ return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
+ X86::SBB8mr);
case X86ISD::AND:
return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
X86::AND8mr);
@@ -2253,8 +2443,12 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
switch (Opc) {
case X86ISD::ADD:
return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0);
+ case X86ISD::ADC:
+ return SelectOpcode(X86::ADC64mi8, X86::ADC32mi8, X86::ADC16mi8, 0);
case X86ISD::SUB:
return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0);
+ case X86ISD::SBB:
+ return SelectOpcode(X86::SBB64mi8, X86::SBB32mi8, X86::SBB16mi8, 0);
case X86ISD::AND:
return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0);
case X86ISD::OR:
@@ -2270,9 +2464,15 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
case X86ISD::ADD:
return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
X86::ADD8mi);
+ case X86ISD::ADC:
+ return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
+ X86::ADC8mi);
case X86ISD::SUB:
return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
X86::SUB8mi);
+ case X86ISD::SBB:
+ return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
+ X86::SBB8mi);
case X86ISD::AND:
return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
X86::AND8mi);
@@ -2320,10 +2520,21 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
}
}
- const SDValue Ops[] = {Base, Scale, Index, Disp,
- Segment, Operand, InputChain};
- Result =
- CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, Ops);
+ if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
+ SDValue CopyTo =
+ CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
+ StoredVal.getOperand(2), SDValue());
+
+ const SDValue Ops[] = {Base, Scale, Index, Disp,
+ Segment, Operand, CopyTo, CopyTo.getValue(1)};
+ Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
+ Ops);
+ } else {
+ const SDValue Ops[] = {Base, Scale, Index, Disp,
+ Segment, Operand, InputChain};
+ Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
+ Ops);
+ }
break;
}
default:
@@ -2335,6 +2546,8 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
MemOp[1] = LoadNode->getMemOperand();
Result->setMemRefs(MemOp, MemOp + 2);
+ // Update Load Chain uses as well.
+ ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
CurDAG->RemoveDeadNode(Node);
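A rough, hypothetical illustration of where the newly handled memory-destination
ADC can come from: propagating a carry into the high word of a two-word value
kept in memory. Whether this exact C++ lowers to an ADC with the store folded
depends on earlier combines forming an X86ISD::ADC node, so treat it as a
sketch rather than a guaranteed lowering:

    // add128 and its layout are illustrative only.
    void add128(unsigned long long Word[2], unsigned long long Add) {
      unsigned long long OldLo = Word[0];
      Word[0] += Add;                 // add with a memory destination
      Word[1] += (Word[0] < OldLo);   // carry into the high word; once the
                                      // carry is recognized, this is the
                                      // ADC-to-memory case handled above
    }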
@@ -2388,57 +2601,169 @@ bool X86DAGToDAGISel::matchBEXTRFromAnd(SDNode *Node) {
if (Shift + MaskSize > NVT.getSizeInBits())
return false;
- SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
- unsigned ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
- unsigned MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
+ // Create a BEXTR node and run it through selection.
+ SDValue C = CurDAG->getConstant(Shift | (MaskSize << 8), dl, NVT);
+ SDValue New = CurDAG->getNode(X86ISD::BEXTR, dl, NVT,
+ N0->getOperand(0), C);
+ ReplaceNode(Node, New.getNode());
+ SelectCode(New.getNode());
+ return true;
+}
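A worked example of the BEXTR control value built above as Shift | (MaskSize << 8):
the low byte is the starting bit position and the next byte is the field length,
so extracting the 8-bit field that starts at bit 4 (i.e. computing (x >> 4) & 0xff)
encodes as:

    unsigned Shift = 4, MaskSize = 8;
    unsigned Control = Shift | (MaskSize << 8);   // 0x0804
    // BEXTR with this control value returns bits [11:4] of its source operand.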
- // BMI requires the immediate to placed in a register.
- if (!Subtarget->hasTBM()) {
- ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
- MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
- New = SDValue(CurDAG->getMachineNode(X86::MOV32ri, dl, NVT, New), 0);
- if (NVT == MVT::i64) {
- New =
- SDValue(CurDAG->getMachineNode(
- TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
- CurDAG->getTargetConstant(0, dl, MVT::i64), New,
- CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
- 0);
- }
+// Emit a PCMISTR(I/M) instruction.
+MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
+ bool MayFoldLoad, const SDLoc &dl,
+ MVT VT, SDNode *Node) {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+ SDValue Imm = Node->getOperand(2);
+ const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
+ Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
+
+ // If there is a load, it will be behind a bitcast. We don't need to check
+ // alignment on this load.
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ if (MayFoldLoad && N1->getOpcode() == ISD::BITCAST && N1->hasOneUse() &&
+ tryFoldVecLoad(Node, N1.getNode(), N1.getOperand(0), Tmp0, Tmp1, Tmp2,
+ Tmp3, Tmp4)) {
+ SDValue Load = N1.getOperand(0);
+ SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
+ Load.getOperand(0) };
+ SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
+ MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ // Update the chain.
+ ReplaceUses(Load.getValue(1), SDValue(CNode, 2));
+ // Record the mem-refs
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<LoadSDNode>(Load)->getMemOperand();
+ CNode->setMemRefs(MemOp, MemOp + 1);
+ return CNode;
}
- MachineSDNode *NewNode;
- SDValue Input = N0->getOperand(0);
+ SDValue Ops[] = { N0, N1, Imm };
+ SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
+ MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
+ return CNode;
+}
+
+// Emit a PCMESTR(I/M) instruction. Also return the Glue result in case we need
+// to emit a second instruction after this one. This is needed since we have two
+// copyToReg nodes glued before this and we need to continue that glue through.
+MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
+ bool MayFoldLoad, const SDLoc &dl,
+ MVT VT, SDNode *Node,
+ SDValue &InFlag) {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N2 = Node->getOperand(2);
+ SDValue Imm = Node->getOperand(4);
+ const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
+ Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
+
+ // If there is a load, it will be behind a bitcast. We don't need to check
+ // alignment on this load.
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
- if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
- SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) };
- SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
- NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ if (MayFoldLoad && N2->getOpcode() == ISD::BITCAST && N2->hasOneUse() &&
+ tryFoldVecLoad(Node, N2.getNode(), N2.getOperand(0), Tmp0, Tmp1, Tmp2,
+ Tmp3, Tmp4)) {
+ SDValue Load = N2.getOperand(0);
+ SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
+ Load.getOperand(0), InFlag };
+ SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
+ MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ InFlag = SDValue(CNode, 3);
// Update the chain.
- ReplaceUses(Input.getValue(1), SDValue(NewNode, 1));
+ ReplaceUses(Load.getValue(1), SDValue(CNode, 2));
// Record the mem-refs
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = cast<LoadSDNode>(Input)->getMemOperand();
- NewNode->setMemRefs(MemOp, MemOp + 1);
- } else {
- NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, Input, New);
+ MemOp[0] = cast<LoadSDNode>(Load)->getMemOperand();
+ CNode->setMemRefs(MemOp, MemOp + 1);
+ return CNode;
}
- ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
- CurDAG->RemoveDeadNode(Node);
+ SDValue Ops[] = { N0, N2, Imm, InFlag };
+ SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
+ MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
+ InFlag = SDValue(CNode, 2);
+ return CNode;
+}
+
+/// If the high bits of an 'and' operand are known zero, try setting the
+/// high bits of an 'and' constant operand to produce a smaller encoding by
+/// creating a small, sign-extended negative immediate rather than a large
+/// positive one. This reverses a transform in SimplifyDemandedBits that
+/// shrinks mask constants by clearing bits. There is also a possibility that
+/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
+/// case, just replace the 'and'. Return 'true' if the node is replaced.
+bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
+ // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
+ // have immediate operands.
+ MVT VT = And->getSimpleValueType(0);
+ if (VT != MVT::i32 && VT != MVT::i64)
+ return false;
+
+ auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
+ if (!And1C)
+ return false;
+
+  // Bail out if the mask constant is already negative. It can't shrink any further.
+ // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
+ // patterns to use a 32-bit and instead of a 64-bit and by relying on the
+ // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
+ // are negative too.
+ APInt MaskVal = And1C->getAPIntValue();
+ unsigned MaskLZ = MaskVal.countLeadingZeros();
+ if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
+ return false;
+
+ // Don't extend into the upper 32 bits of a 64 bit mask.
+ if (VT == MVT::i64 && MaskLZ >= 32) {
+ MaskLZ -= 32;
+ MaskVal = MaskVal.trunc(32);
+ }
+
+ SDValue And0 = And->getOperand(0);
+ APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
+ APInt NegMaskVal = MaskVal | HighZeros;
+
+ // If a negative constant would not allow a smaller encoding, there's no need
+ // to continue. Only change the constant when we know it's a win.
+ unsigned MinWidth = NegMaskVal.getMinSignedBits();
+ if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getMinSignedBits() <= 32))
+ return false;
+
+ // Extend masks if we truncated above.
+ if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
+ NegMaskVal = NegMaskVal.zext(64);
+ HighZeros = HighZeros.zext(64);
+ }
+
+ // The variable operand must be all zeros in the top bits to allow using the
+ // new, negative constant as the mask.
+ if (!CurDAG->MaskedValueIsZero(And0, HighZeros))
+ return false;
+
+ // Check if the mask is -1. In that case, this is an unnecessary instruction
+ // that escaped earlier analysis.
+ if (NegMaskVal.isAllOnesValue()) {
+ ReplaceNode(And, And0.getNode());
+ return true;
+ }
+
+ // A negative mask allows a smaller encoding. Create a new 'and' node.
+ SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
+ SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
+ ReplaceNode(And, NewAnd.getNode());
+ SelectCode(NewAnd.getNode());
return true;
}
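A worked example of the shrink performed above, assuming an i32 'and' whose
variable operand is known to have its top 20 bits clear:

    MaskVal    = 0x00000FF0   // needs the 4-byte imm32 form of AND
    MaskLZ     = 20
    HighZeros  = 0xFFFFF000   // the known-zero high bits
    NegMaskVal = 0xFFFFFFF0   // == -16, fits the sign-extended imm8 form

Setting the extra mask bits is safe only because MaskedValueIsZero proved the
variable operand is zero there, so the result of the 'and' is unchanged while
the immediate encoding shrinks from four bytes to one.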
void X86DAGToDAGISel::Select(SDNode *Node) {
MVT NVT = Node->getSimpleValueType(0);
- unsigned Opc, MOpc;
unsigned Opcode = Node->getOpcode();
SDLoc dl(Node);
- DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n');
-
if (Node->isMachineOpcode()) {
- DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
Node->setNodeId(-1);
return; // Already selected.
}
@@ -2483,9 +2808,10 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
}
case ISD::AND:
- // Try to match BEXTR/BEXTRI instruction.
if (matchBEXTRFromAnd(Node))
return;
+ if (shrinkAndImmediate(Node))
+ return;
LLVM_FALLTHROUGH;
case ISD::OR:
@@ -2577,7 +2903,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
- Opc = (Opcode == X86ISD::SMUL8 ? X86::IMUL8r : X86::MUL8r);
+ unsigned Opc = (Opcode == X86ISD::SMUL8 ? X86::IMUL8r : X86::MUL8r);
SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::AL,
N0, SDValue()).getValue(1);
@@ -2594,7 +2920,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
- unsigned LoReg;
+ unsigned LoReg, Opc;
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
// MVT::i8 is handled by X86ISD::UMUL8.
@@ -2619,13 +2945,12 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
+ unsigned Opc, MOpc;
bool isSigned = Opcode == ISD::SMUL_LOHI;
bool hasBMI2 = Subtarget->hasBMI2();
if (!isSigned) {
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
- case MVT::i8: Opc = X86::MUL8r; MOpc = X86::MUL8m; break;
- case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break;
case MVT::i32: Opc = hasBMI2 ? X86::MULX32rr : X86::MUL32r;
MOpc = hasBMI2 ? X86::MULX32rm : X86::MUL32m; break;
case MVT::i64: Opc = hasBMI2 ? X86::MULX64rr : X86::MUL64r;
@@ -2634,8 +2959,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
} else {
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
- case MVT::i8: Opc = X86::IMUL8r; MOpc = X86::IMUL8m; break;
- case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break;
case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
}
@@ -2644,14 +2967,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
unsigned SrcReg, LoReg, HiReg;
switch (Opc) {
default: llvm_unreachable("Unknown MUL opcode!");
- case X86::IMUL8r:
- case X86::MUL8r:
- SrcReg = LoReg = X86::AL; HiReg = X86::AH;
- break;
- case X86::IMUL16r:
- case X86::MUL16r:
- SrcReg = LoReg = X86::AX; HiReg = X86::DX;
- break;
case X86::IMUL32r:
case X86::MUL32r:
SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
@@ -2721,27 +3036,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
}
}
- // Prevent use of AH in a REX instruction by referencing AX instead.
- if (HiReg == X86::AH && Subtarget->is64Bit() &&
- !SDValue(Node, 1).use_empty()) {
- SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
- X86::AX, MVT::i16, InFlag);
- InFlag = Result.getValue(2);
- // Get the low part if needed. Don't use getCopyFromReg for aliasing
- // registers.
- if (!SDValue(Node, 0).use_empty())
- ReplaceUses(SDValue(Node, 0),
- CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
-
- // Shift AX down 8 bits.
- Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16,
- Result,
- CurDAG->getTargetConstant(8, dl, MVT::i8)),
- 0);
- // Then truncate it down to i8.
- ReplaceUses(SDValue(Node, 1),
- CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
- }
// Copy the low half of the result, if it is needed.
if (!SDValue(Node, 0).use_empty()) {
if (!ResLo.getNode()) {
@@ -2751,7 +3045,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
InFlag = ResLo.getValue(2);
}
ReplaceUses(SDValue(Node, 0), ResLo);
- DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
+ dbgs() << '\n');
}
// Copy the high half of the result, if it is needed.
if (!SDValue(Node, 1).use_empty()) {
@@ -2762,7 +3057,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
InFlag = ResHi.getValue(2);
}
ReplaceUses(SDValue(Node, 1), ResHi);
- DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
+ dbgs() << '\n');
}
CurDAG->RemoveDeadNode(Node);
@@ -2776,6 +3072,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
+ unsigned Opc, MOpc;
bool isSigned = (Opcode == ISD::SDIVREM ||
Opcode == X86ISD::SDIVREM8_SEXT_HREG);
if (!isSigned) {
@@ -2909,7 +3206,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
unsigned AHExtOpcode =
- isSigned ? X86::MOVSX32_NOREXrr8 : X86::MOVZX32_NOREXrr8;
+ isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
MVT::Glue, AHCopy, InFlag);
@@ -2924,7 +3221,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
}
ReplaceUses(SDValue(Node, 1), Result);
- DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
+ dbgs() << '\n');
}
// Copy the division (low) result, if it is needed.
if (!SDValue(Node, 0).use_empty()) {
@@ -2932,7 +3230,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
LoReg, NVT, InFlag);
InFlag = Result.getValue(2);
ReplaceUses(SDValue(Node, 0), Result);
- DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
+ dbgs() << '\n');
}
// Copy the remainder (high) result, if it is needed.
if (!SDValue(Node, 1).use_empty()) {
@@ -2940,18 +3239,14 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
HiReg, NVT, InFlag);
InFlag = Result.getValue(2);
ReplaceUses(SDValue(Node, 1), Result);
- DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
+ dbgs() << '\n');
}
CurDAG->RemoveDeadNode(Node);
return;
}
- case X86ISD::CMP:
- case X86ISD::SUB: {
- // Sometimes a SUB is used to perform comparison.
- if (Opcode == X86ISD::SUB && Node->hasAnyUseOfValue(0))
- // This node is not a CMP.
- break;
+ case X86ISD::CMP: {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
@@ -2962,8 +3257,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
// Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
// use a smaller encoding.
// Look past the truncate if CMP is the only use of it.
- if ((N0.getOpcode() == ISD::AND ||
- (N0.getResNo() == 0 && N0.getOpcode() == X86ISD::AND)) &&
+ if (N0.getOpcode() == ISD::AND &&
N0.getNode()->hasOneUse() &&
N0.getValueType() != MVT::i8 &&
X86::isZeroNode(N1)) {
@@ -2971,98 +3265,119 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
if (!C) break;
uint64_t Mask = C->getZExtValue();
- // For example, convert "testl %eax, $8" to "testb %al, $8"
+ MVT VT;
+ int SubRegOp;
+ unsigned Op;
+
if (isUInt<8>(Mask) &&
(!(Mask & 0x80) || hasNoSignedComparisonUses(Node))) {
- SDValue Imm = CurDAG->getTargetConstant(Mask, dl, MVT::i8);
- SDValue Reg = N0.getOperand(0);
-
- // Extract the l-register.
- SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl,
- MVT::i8, Reg);
-
- // Emit a testb.
- SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32,
- Subreg, Imm);
- // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
- // one, do not call ReplaceAllUsesWith.
- ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
- SDValue(NewNode, 0));
- CurDAG->RemoveDeadNode(Node);
- return;
+ // For example, convert "testl %eax, $8" to "testb %al, $8"
+ VT = MVT::i8;
+ SubRegOp = X86::sub_8bit;
+ Op = X86::TEST8ri;
+ } else if (OptForMinSize && isUInt<16>(Mask) &&
+ (!(Mask & 0x8000) || hasNoSignedComparisonUses(Node))) {
+ // For example, "testl %eax, $32776" to "testw %ax, $32776".
+ // NOTE: We only want to form TESTW instructions if optimizing for
+ // min size. Otherwise we only save one byte and possibly get a length
+ // changing prefix penalty in the decoders.
+ VT = MVT::i16;
+ SubRegOp = X86::sub_16bit;
+ Op = X86::TEST16ri;
+ } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
+ (!(Mask & 0x80000000) || hasNoSignedComparisonUses(Node))) {
+ // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
+ // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
+        // Otherwise, we find ourselves in a position where we have to do
+ // promotion. If previous passes did not promote the and, we assume
+ // they had a good reason not to and do not promote here.
+ VT = MVT::i32;
+ SubRegOp = X86::sub_32bit;
+ Op = X86::TEST32ri;
+ } else {
+ // No eligible transformation was found.
+ break;
}
- // For example, "testl %eax, $2048" to "testb %ah, $8".
- if (isShiftedUInt<8, 8>(Mask) &&
- (!(Mask & 0x8000) || hasNoSignedComparisonUses(Node))) {
- // Shift the immediate right by 8 bits.
- SDValue ShiftedImm = CurDAG->getTargetConstant(Mask >> 8, dl, MVT::i8);
- SDValue Reg = N0.getOperand(0);
-
- // Extract the h-register.
- SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl,
- MVT::i8, Reg);
-
- // Emit a testb. The EXTRACT_SUBREG becomes a COPY that can only
- // target GR8_NOREX registers, so make sure the register class is
- // forced.
- SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri_NOREX, dl,
- MVT::i32, Subreg, ShiftedImm);
- // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
- // one, do not call ReplaceAllUsesWith.
- ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
- SDValue(NewNode, 0));
- CurDAG->RemoveDeadNode(Node);
- return;
- }
+ SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
+ SDValue Reg = N0.getOperand(0);
- // For example, "testl %eax, $32776" to "testw %ax, $32776".
- // NOTE: We only want to form TESTW instructions if optimizing for
- // min size. Otherwise we only save one byte and possibly get a length
- // changing prefix penalty in the decoders.
- if (OptForMinSize && isUInt<16>(Mask) && N0.getValueType() != MVT::i16 &&
- (!(Mask & 0x8000) || hasNoSignedComparisonUses(Node))) {
- SDValue Imm = CurDAG->getTargetConstant(Mask, dl, MVT::i16);
- SDValue Reg = N0.getOperand(0);
-
- // Extract the 16-bit subregister.
- SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl,
- MVT::i16, Reg);
-
- // Emit a testw.
- SDNode *NewNode = CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32,
- Subreg, Imm);
- // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
- // one, do not call ReplaceAllUsesWith.
- ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
- SDValue(NewNode, 0));
- CurDAG->RemoveDeadNode(Node);
- return;
- }
+ // Extract the subregister if necessary.
+ if (N0.getValueType() != VT)
+ Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
- // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
- if (isUInt<32>(Mask) && N0.getValueType() == MVT::i64 &&
- (!(Mask & 0x80000000) || hasNoSignedComparisonUses(Node))) {
- SDValue Imm = CurDAG->getTargetConstant(Mask, dl, MVT::i32);
- SDValue Reg = N0.getOperand(0);
-
- // Extract the 32-bit subregister.
- SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_32bit, dl,
- MVT::i32, Reg);
-
- // Emit a testl.
- SDNode *NewNode = CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32,
- Subreg, Imm);
- // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
- // one, do not call ReplaceAllUsesWith.
- ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
- SDValue(NewNode, 0));
- CurDAG->RemoveDeadNode(Node);
- return;
- }
+ // Emit a testl or testw.
+ SDNode *NewNode = CurDAG->getMachineNode(Op, dl, MVT::i32, Reg, Imm);
+ // Replace CMP with TEST.
+ ReplaceNode(Node, NewNode);
+ return;
}
break;
}
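A worked example of the sign-bit guard applied in each branch above:

    testl %eax, $0x80      (SF is derived from bit 31 of the 32-bit result)
    testb %al, $0x80       (after narrowing, SF is derived from bit 7)

Because narrowing can change how signed condition codes behave, the transform
is only performed when the mask's top bit (0x80, 0x8000, or 0x80000000 for the
respective widths) is clear, or when hasNoSignedComparisonUses shows that no
flag consumer reads a signed condition.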
+ case X86ISD::PCMPISTR: {
+ if (!Subtarget->hasSSE42())
+ break;
+
+ bool NeedIndex = !SDValue(Node, 0).use_empty();
+ bool NeedMask = !SDValue(Node, 1).use_empty();
+ // We can't fold a load if we are going to make two instructions.
+ bool MayFoldLoad = !NeedIndex || !NeedMask;
+
+ MachineSDNode *CNode;
+ if (NeedMask) {
+ unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr;
+ unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm;
+ CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
+ ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
+ }
+ if (NeedIndex || !NeedMask) {
+ unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr;
+ unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm;
+ CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
+ ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
+ }
+
+ // Connect the flag usage to the last instruction created.
+ ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+ case X86ISD::PCMPESTR: {
+ if (!Subtarget->hasSSE42())
+ break;
+
+ // Copy the two implicit register inputs.
+ SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
+ Node->getOperand(1),
+ SDValue()).getValue(1);
+ InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
+ Node->getOperand(3), InFlag).getValue(1);
+
+ bool NeedIndex = !SDValue(Node, 0).use_empty();
+ bool NeedMask = !SDValue(Node, 1).use_empty();
+ // We can't fold a load if we are going to make two instructions.
+ bool MayFoldLoad = !NeedIndex || !NeedMask;
+
+ MachineSDNode *CNode;
+ if (NeedMask) {
+ unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr;
+ unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm;
+ CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node,
+ InFlag);
+ ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
+ }
+ if (NeedIndex || !NeedMask) {
+ unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr;
+ unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm;
+ CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag);
+ ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
+ }
+ // Connect the flag usage to the last instruction created.
+ ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+
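A hedged source-level illustration of the NeedIndex/NeedMask split handled
above, using the SSE4.2 string intrinsics: when both results of one comparison
are live, two instructions are emitted and the load operand is intentionally
not folded. Whether the two calls end up sharing one PCMPISTR node depends on
DAG CSE, so this is a sketch rather than a guaranteed lowering:

    #include <nmmintrin.h>

    // Illustrative only: both the index (PCMPISTRI) and the mask (PCMPISTRM)
    // results of the same comparison are used, so MayFoldLoad is false.
    int index_and_mask(__m128i a, __m128i b, __m128i *MaskOut) {
      *MaskOut = _mm_cmpistrm(a, b, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH);
      return _mm_cmpistri(a, b, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH);
    }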
case ISD::STORE:
if (foldLoadStoreIntoMemOperand(Node))
return;
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 9edd799779c7..7dcdb7967058 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -103,7 +103,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
X86ScalarSSEf64 = Subtarget.hasSSE2();
X86ScalarSSEf32 = Subtarget.hasSSE1();
- MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
+ MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
// Set up the TargetLowering object.
@@ -216,6 +216,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// We have an algorithm for SSE2, and we turn this into a 64-bit
// FILD or VCVTUSI2SS/SD for other targets.
setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
+ } else {
+ setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Expand);
}
// Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
@@ -235,7 +237,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
} else {
setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
- setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Expand);
}
// Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
@@ -611,7 +613,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// Long double always uses X87, except f128 in MMX.
if (UseX87) {
if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
- addRegisterClass(MVT::f128, &X86::FR128RegClass);
+ addRegisterClass(MVT::f128, &X86::VR128RegClass);
ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
setOperationAction(ISD::FABS , MVT::f128, Custom);
setOperationAction(ISD::FNEG , MVT::f128, Custom);
@@ -790,19 +792,33 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FABS, MVT::v2f64, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
- setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
- setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
- setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
- setOperationAction(ISD::UMIN, MVT::v16i8, Legal);
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+ setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
+ setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
+ setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
+ setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
+ }
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+    // Provide custom widening for v2f32 setcc. This is really for VLX, where
+    // the setcc result type is v2i1/v4i1 for v2f32/v4f32, which leads to type
+    // legalization changing the result type to v4i1 during widening. It also
+    // works fine for SSE2 and is probably faster, so there is no need to
+    // qualify it with VLX support.
+ setOperationAction(ISD::SETCC, MVT::v2i32, Custom);
+
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTTZ, VT, Custom);
+
+ // The condition codes aren't legal in SSE/AVX and under AVX512 we use
+ // setcc all the way to isel and prefer SETGT in some isel patterns.
+ setCondCodeAction(ISD::SETLT, VT, Custom);
+ setCondCodeAction(ISD::SETLE, VT, Custom);
}
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
@@ -874,6 +890,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
+ if (!Subtarget.hasAVX512())
+ setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
@@ -886,6 +904,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
}
+
+ setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
+ setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
+ setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
@@ -967,7 +989,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::BITREVERSE, VT, Custom);
}
- if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
+ if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
bool HasInt256 = Subtarget.hasInt256();
addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
@@ -996,13 +1018,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
// even though v8i16 is a legal type.
- setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
- setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
+ setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
+ setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
+ if (!Subtarget.hasAVX512())
+ setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
+
for (MVT VT : MVT::fp_vector_valuetypes())
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
@@ -1014,6 +1039,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SRA, VT, Custom);
}
+ setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
+ setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
+ setOperationAction(ISD::ROTL, MVT::v32i8, Custom);
+
setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
@@ -1034,6 +1063,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTTZ, VT, Custom);
setOperationAction(ISD::CTLZ, VT, Custom);
+
+ // The condition codes aren't legal in SSE/AVX and under AVX512 we use
+ // setcc all the way to isel and prefer SETGT in some isel patterns.
+ setCondCodeAction(ISD::SETLT, VT, Custom);
+ setCondCodeAction(ISD::SETLE, VT, Custom);
}
if (Subtarget.hasAnyFMA()) {
@@ -1060,6 +1094,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
+ setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
+ setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
+ setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
+ setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
+
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
@@ -1137,13 +1176,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
}
+ // This block controls legalization of the mask vector sizes that are
+ // available with AVX512. 512-bit vectors are in a separate block controlled
+ // by useAVX512Regs.
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
- addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
- addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
- addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
- addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
-
addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
+ addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
+ addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
@@ -1151,24 +1190,34 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
- setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom);
- setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom);
- setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
- setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);
-
- // Extends of v16i1/v8i1 to 128-bit vectors.
- setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v16i8, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v16i8, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v8i16, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v8i16, Custom);
-
- for (auto VT : { MVT::v8i1, MVT::v16i1 }) {
+ setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
+ setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
+ setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
+ setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
+
+ // There is no byte sized k-register load or store without AVX512DQ.
+ if (!Subtarget.hasDQI()) {
+ setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
+ setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
+ setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
+ setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
+
+ setOperationAction(ISD::STORE, MVT::v1i1, Custom);
+ setOperationAction(ISD::STORE, MVT::v2i1, Custom);
+ setOperationAction(ISD::STORE, MVT::v4i1, Custom);
+ setOperationAction(ISD::STORE, MVT::v8i1, Custom);
+ }
+
+ // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+ setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
+ setOperationAction(ISD::ANY_EXTEND, VT, Custom);
+ }
+
+ for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
setOperationAction(ISD::ADD, VT, Custom);
setOperationAction(ISD::SUB, VT, Custom);
setOperationAction(ISD::MUL, VT, Custom);
@@ -1184,11 +1233,24 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v2i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
- for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1,
- MVT::v16i1, MVT::v32i1, MVT::v64i1 })
- setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
+ for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+ }
+
+  // This block controls legalization for 512-bit operations with 32/64-bit
+  // elements. 512-bit vectors can be disabled based on the prefer-vector-width
+  // and required-vector-width function attributes.
+ if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
+ addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
+ addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
+ addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
+ addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
for (MVT VT : MVT::fp_vector_valuetypes())
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
@@ -1201,16 +1263,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
}
- for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
- MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
- MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
- MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
- setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
- setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
- setTruncStoreAction(VT, MaskVT, Custom);
- }
-
for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);
@@ -1219,11 +1271,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
- setOperationAction(ISD::FP_TO_SINT, MVT::v16i16, Promote);
- setOperationAction(ISD::FP_TO_SINT, MVT::v16i8, Promote);
+ setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i16, MVT::v16i32);
+ setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i8, MVT::v16i32);
+ setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32);
setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
- setOperationAction(ISD::FP_TO_UINT, MVT::v16i8, Promote);
- setOperationAction(ISD::FP_TO_UINT, MVT::v16i16, Promote);
+ setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32);
+ setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i8, MVT::v16i32);
+ setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i16, MVT::v16i32);
setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
@@ -1296,6 +1350,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CTTZ, VT, Custom);
setOperationAction(ISD::ROTL, VT, Custom);
setOperationAction(ISD::ROTR, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+
+ // The condition codes aren't legal in SSE/AVX and under AVX512 we use
+ // setcc all the way to isel and prefer SETGT in some isel patterns.
+ setCondCodeAction(ISD::SETLT, VT, Custom);
+ setCondCodeAction(ISD::SETLE, VT, Custom);
}
// Need to promote to 64-bit even though we have 32-bit masked instructions
@@ -1310,6 +1370,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
+
+ setOperationAction(ISD::MUL, MVT::v8i64, Legal);
}
if (Subtarget.hasCDI()) {
@@ -1349,10 +1411,18 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
}
+
+ // Need to custom split v32i16/v64i8 bitcasts.
+ if (!Subtarget.hasBWI()) {
+ setOperationAction(ISD::BITCAST, MVT::v32i16, Custom);
+ setOperationAction(ISD::BITCAST, MVT::v64i8, Custom);
+ }
}// has AVX-512
- if (!Subtarget.useSoftFloat() &&
- (Subtarget.hasAVX512() || Subtarget.hasVLX())) {
+ // This block controls legalization for operations that don't have
+ // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
+ // narrower widths.
+ if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
// These operations are handled on non-VLX by artificially widening in
// isel patterns.
// TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
@@ -1376,6 +1446,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ROTR, VT, Custom);
}
+ // Custom legalize 2x32 to get a little better code.
+ setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
+ setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
+
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
setOperationAction(ISD::MSCATTER, VT, Custom);
@@ -1386,6 +1460,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UINT_TO_FP, VT, Legal);
setOperationAction(ISD::FP_TO_SINT, VT, Legal);
setOperationAction(ISD::FP_TO_UINT, VT, Legal);
+
+ setOperationAction(ISD::MUL, VT, Legal);
}
}
@@ -1402,10 +1478,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
}
+  // This block controls legalization of v32i1/v64i1, which are available with
+ // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
+ // useBWIRegs.
if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
- addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
- addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
-
addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
@@ -1428,11 +1504,22 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
+ for (auto VT : { MVT::v16i1, MVT::v32i1 })
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
// Extends from v32i1 masks to 256-bit vectors.
setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
+ }
+
+  // This block controls legalization for v32i16 and v64i8. 512-bit vectors
+  // can be disabled based on the prefer-vector-width and required-vector-width
+  // function attributes.
+ if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) {
+ addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
+ addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
+
// Extends from v64i1 masks to 512-bit vectors.
setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
@@ -1482,6 +1569,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
+ setOperationAction(ISD::SETCC, VT, Custom);
setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
@@ -1498,8 +1586,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
}
- if (!Subtarget.useSoftFloat() && Subtarget.hasBWI() &&
- (Subtarget.hasAVX512() || Subtarget.hasVLX())) {
+ if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
@@ -1516,39 +1603,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
- addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
- addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
-
- for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
- setOperationAction(ISD::ADD, VT, Custom);
- setOperationAction(ISD::SUB, VT, Custom);
- setOperationAction(ISD::MUL, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Expand);
-
- setOperationAction(ISD::TRUNCATE, VT, Custom);
- setOperationAction(ISD::SETCC, VT, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
- setOperationAction(ISD::SELECT, VT, Custom);
- setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
- }
-
- // TODO: v8i1 concat should be legal without VLX to support concats of
- // v1i1, but we won't legalize it correctly currently without introducing
- // a v4i1 concat in the middle.
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
-
- // Extends from v2i1/v4i1 masks to 128-bit vectors.
- setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Custom);
-
setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
@@ -1648,6 +1702,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+ setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
@@ -1733,6 +1788,9 @@ SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
TargetLoweringBase::LegalizeTypeAction
X86TargetLowering::getPreferredVectorAction(EVT VT) const {
+ if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
+ return TypeSplitVector;
+
if (ExperimentalVectorWideningLegalization &&
VT.getVectorNumElements() != 1 &&
VT.getVectorElementType().getSimpleVT() != MVT::i1)
@@ -1741,6 +1799,20 @@ X86TargetLowering::getPreferredVectorAction(EVT VT) const {
return TargetLoweringBase::getPreferredVectorAction(VT);
}
+MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
+ EVT VT) const {
+ if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
+ return MVT::v32i8;
+ return TargetLowering::getRegisterTypeForCallingConv(Context, VT);
+}
+
+unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
+ EVT VT) const {
+ if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
+ return 1;
+ return TargetLowering::getNumRegistersForCallingConv(Context, VT);
+}
+
EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
LLVMContext& Context,
EVT VT) const {
@@ -1937,7 +2009,7 @@ void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
// Mark the first N int arguments as having reg
for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
Type *T = Args[Idx].Ty;
- if (T->isPointerTy() || T->isIntegerTy())
+ if (T->isIntOrPtrTy())
if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
unsigned numRegs = 1;
if (MF->getDataLayout().getTypeAllocSize(T) > 4)
@@ -2051,7 +2123,8 @@ Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
void X86TargetLowering::insertSSPDeclarations(Module &M) const {
// MSVC CRT provides functionalities for stack protection.
- if (Subtarget.getTargetTriple().isOSMSVCRT()) {
+ if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
+ Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
// MSVC CRT has a global variable holding security cookie.
M.getOrInsertGlobal("__security_cookie",
Type::getInt8PtrTy(M.getContext()));
@@ -2073,15 +2146,19 @@ void X86TargetLowering::insertSSPDeclarations(Module &M) const {
Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
// MSVC CRT has a global variable holding security cookie.
- if (Subtarget.getTargetTriple().isOSMSVCRT())
+ if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
+ Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
return M.getGlobalVariable("__security_cookie");
+ }
return TargetLowering::getSDagStackGuard(M);
}
Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
// MSVC CRT has a function to validate security cookie.
- if (Subtarget.getTargetTriple().isOSMSVCRT())
+ if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
+ Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
return M.getFunction("__security_check_cookie");
+ }
return TargetLowering::getSSPStackGuardCheck(M);
}
@@ -2140,6 +2217,10 @@ static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
const SDLoc &Dl, SelectionDAG &DAG) {
EVT ValVT = ValArg.getValueType();
+ if (ValVT == MVT::v1i1)
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
+ DAG.getIntPtrConstant(0, Dl));
+
if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
(ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
// Two stage lowering might be required
@@ -2150,13 +2231,16 @@ static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
if (ValLoc == MVT::i32)
ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
return ValToCopy;
- } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
- (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
+ }
+
+ if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
+ (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
// One stage lowering is required
// bitcast: v32i1 -> i32 / v64i1 -> i64
return DAG.getBitcast(ValLoc, ValArg);
- } else
- return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
+ }
+
+ return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
}
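A rough standalone sketch of the two-stage path above (v8i1/v16i1: bitcast to a small integer, then extend to the register width), using plain integers instead of DAG nodes; the packed layout assumes the usual element-i-to-bit-i mapping:

#include <cstdint>
#include <cstdio>

int main() {
  // Model a v8i1 mask value; element i ends up in bit i of the packed byte.
  bool Mask[8] = {1, 0, 1, 1, 0, 0, 1, 0};
  uint8_t Packed = 0;                        // stage 1: "bitcast" v8i1 -> i8
  for (int i = 0; i < 8; ++i)
    Packed |= uint8_t(Mask[i] ? 1 : 0) << i;
  uint32_t InReg = Packed;                   // stage 2: extend i8 -> i32
  std::printf("0x%02x 0x%08x\n", (unsigned)Packed, (unsigned)InReg); // 0x4d 0x0000004d
  return 0;
}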
/// Breaks v64i1 value into two registers and adds the new node to the DAG
@@ -2474,10 +2558,10 @@ static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
MachineFunction &MF = DAG.getMachineFunction();
const TargetRegisterClass *RC = &X86::GR32RegClass;
- // Read a 32 bit value from the registers
+ // Read a 32 bit value from the registers.
if (nullptr == InFlag) {
// When no physical register is present,
- // create an intermediate virtual register
+ // create an intermediate virtual register.
Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
@@ -2493,13 +2577,13 @@ static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
*InFlag = ArgValueHi.getValue(2);
}
- // Convert the i32 type into v32i1 type
+ // Convert the i32 type into v32i1 type.
Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
- // Convert the i32 type into v32i1 type
+ // Convert the i32 type into v32i1 type.
Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
- // Concatenate the two values together
+ // Concatenate the two values together.
return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
}
@@ -2640,7 +2724,7 @@ enum StructReturnType {
StackStructReturn
};
static StructReturnType
-callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
+callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
if (Outs.empty())
return NotStructReturn;
@@ -2654,7 +2738,7 @@ callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
/// Determines whether a function uses struct return semantics.
static StructReturnType
-argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
+argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
if (Ins.empty())
return NotStructReturn;
@@ -2774,7 +2858,11 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
if (Flags.isByVal()) {
unsigned Bytes = Flags.getByValSize();
if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
- int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
+
+ // FIXME: For now, all byval parameter objects are marked as aliasing. This
+ // can be improved with deeper analysis.
+ int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
+ /*isAliased=*/true);
// Adjust SP offset of interrupt parameter.
if (CallConv == CallingConv::X86_INTR) {
MFI.setObjectOffset(FI, Offset);
@@ -2898,7 +2986,7 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
}
#ifndef NDEBUG
-static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
+static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
[](const CCValAssign &A, const CCValAssign &B) -> bool {
return A.getValNo() < B.getValNo();
@@ -2975,7 +3063,11 @@ SDValue X86TargetLowering::LowerFormalArguments(
getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
} else {
const TargetRegisterClass *RC;
- if (RegVT == MVT::i32)
+ if (RegVT == MVT::i8)
+ RC = &X86::GR8RegClass;
+ else if (RegVT == MVT::i16)
+ RC = &X86::GR16RegClass;
+ else if (RegVT == MVT::i32)
RC = &X86::GR32RegClass;
else if (Is64Bit && RegVT == MVT::i64)
RC = &X86::GR64RegClass;
@@ -2986,7 +3078,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
else if (RegVT == MVT::f80)
RC = &X86::RFP80RegClass;
else if (RegVT == MVT::f128)
- RC = &X86::FR128RegClass;
+ RC = &X86::VR128RegClass;
else if (RegVT.is512BitVector())
RC = &X86::VR512RegClass;
else if (RegVT.is256BitVector())
@@ -3361,6 +3453,11 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
(Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
+ const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CS.getInstruction());
+ bool HasNoCfCheck =
+ (CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck());
+ const Module *M = MF.getMMI().getModule();
+ Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
if (CallConv == CallingConv::X86_INTR)
report_fatal_error("X86 interrupts may not be called directly");
@@ -3743,6 +3840,14 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Callee = DAG.getTargetExternalSymbol(
S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
+
+ if (OpFlags == X86II::MO_GOTPCREL) {
+ Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
+ getPointerTy(DAG.getDataLayout()), Callee);
+ Callee = DAG.getLoad(
+ getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+ }
} else if (Subtarget.isTarget64BitILP32() &&
Callee->getValueType(0) == MVT::i32) {
// Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
@@ -3804,9 +3909,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
// Allocate a new Reg Mask and copy Mask.
- RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
- unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
- memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);
+ RegMask = MF.allocateRegMask();
+ unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
+ memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
// Make sure all sub registers of the argument registers are reset
// in the RegMask.
@@ -3836,7 +3941,11 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
}
- Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
+ if (HasNoCfCheck && IsCFProtectionSupported) {
+ Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
+ } else {
+ Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
+ }
InFlag = Chain.getValue(1);
// Create the CALLSEQ_END node.
@@ -4260,8 +4369,6 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::VSRLDQ:
case X86ISD::MOVLHPS:
case X86ISD::MOVHLPS:
- case X86ISD::MOVLPS:
- case X86ISD::MOVLPD:
case X86ISD::MOVSHDUP:
case X86ISD::MOVSLDUP:
case X86ISD::MOVDDUP:
@@ -4273,12 +4380,12 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::VPERMILPI:
case X86ISD::VPERMILPV:
case X86ISD::VPERM2X128:
+ case X86ISD::SHUF128:
case X86ISD::VPERMIL2:
case X86ISD::VPERMI:
case X86ISD::VPPERM:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
- case X86ISD::VPERMIV3:
case X86ISD::VZEXT_MOVL:
return true;
}
@@ -4294,7 +4401,6 @@ static bool isTargetShuffleVariableMask(unsigned Opcode) {
case X86ISD::VPPERM:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
- case X86ISD::VPERMIV3:
return true;
// 'Faux' Target Shuffles.
case ISD::AND:
@@ -4371,7 +4477,7 @@ bool X86::isCalleePop(CallingConv::ID CallingConv,
}
}
-/// \brief Return true if the condition is an unsigned comparison operation.
+/// Return true if the condition is an unsigned comparison operation.
static bool isX86CCUnsigned(unsigned X86CC) {
switch (X86CC) {
default:
@@ -4518,20 +4624,6 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.offset = 0;
switch (IntrData->Type) {
- case EXPAND_FROM_MEM: {
- Info.ptrVal = I.getArgOperand(0);
- Info.memVT = MVT::getVT(I.getType());
- Info.align = 1;
- Info.flags |= MachineMemOperand::MOLoad;
- break;
- }
- case COMPRESS_TO_MEM: {
- Info.ptrVal = I.getArgOperand(0);
- Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
- Info.align = 1;
- Info.flags |= MachineMemOperand::MOStore;
- break;
- }
case TRUNCATE_TO_MEM_VI8:
case TRUNCATE_TO_MEM_VI16:
case TRUNCATE_TO_MEM_VI32: {
@@ -4580,7 +4672,7 @@ bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
return true;
}
-/// \brief Returns true if it is beneficial to convert a load of a constant
+/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
@@ -4625,6 +4717,14 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const {
return Subtarget.hasLZCNT();
}
+bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT,
+ EVT BitcastVT) const {
+ if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1)
+ return false;
+
+ return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT);
+}
+
bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
const SelectionDAG &DAG) const {
// Do not merge to float value size (128 bytes) if no implicit
@@ -4649,14 +4749,52 @@ bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
}
bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
+ EVT VT = Y.getValueType();
+
+ if (VT.isVector())
+ return false;
+
if (!Subtarget.hasBMI())
return false;
// There are only 32-bit and 64-bit forms for 'andn'.
- EVT VT = Y.getValueType();
if (VT != MVT::i32 && VT != MVT::i64)
return false;
+  // A mask and compare against a constant is OK for an 'andn' too,
+  // even though the BMI instruction doesn't have an immediate form.
+
+ return true;
+}
+
+bool X86TargetLowering::hasAndNot(SDValue Y) const {
+ EVT VT = Y.getValueType();
+
+ if (!VT.isVector()) // x86 can't form 'andn' with an immediate.
+ return !isa<ConstantSDNode>(Y) && hasAndNotCompare(Y);
+
+ // Vector.
+
+ if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
+ return false;
+
+ if (VT == MVT::v4i32)
+ return true;
+
+ return Subtarget.hasSSE2();
+}
+
+bool X86TargetLowering::preferShiftsToClearExtremeBits(SDValue Y) const {
+ EVT VT = Y.getValueType();
+
+ // For vectors, we don't have a preference, but we probably want a mask.
+ if (VT.isVector())
+ return false;
+
+ // 64-bit shifts on 32-bit targets produce really bad bloated code.
+ if (VT == MVT::i64 && !Subtarget.is64Bit())
+ return false;
+
return true;
}
@@ -4699,10 +4837,24 @@ static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
return true;
}
+/// Return true if Val falls within the specified range [Low, Hi).
+static bool isInRange(int Val, int Low, int Hi) {
+ return (Val >= Low && Val < Hi);
+}
+
+/// Return true if the value of any element in Mask falls within the specified
+/// range [Low, Hi).
+static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
+ for (int M : Mask)
+ if (isInRange(M, Low, Hi))
+ return true;
+ return false;
+}
+
/// Return true if Val is undef or if its value falls within the
/// specified range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
- return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
+ return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
}
/// Return true if every element in Mask is undef or if its value
@@ -4718,7 +4870,7 @@ static bool isUndefOrInRange(ArrayRef<int> Mask,
/// Return true if Val is undef, zero or if its value falls within the
/// specified range (L, H].
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
- return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
+ return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
}
/// Return true if every element in Mask is undef, zero or if its value
@@ -4731,11 +4883,11 @@ static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
}
/// Return true if every element in Mask, beginning
-/// from position Pos and ending in Pos+Size, falls within the specified
-/// sequential range (Low, Low+Size]. or is undef.
-static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
- unsigned Pos, unsigned Size, int Low) {
- for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
+/// from position Pos and ending in Pos + Size, falls within the specified
+/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
+static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
+ unsigned Size, int Low, int Step = 1) {
+ for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
if (!isUndefOrEqual(Mask[i], Low))
return false;
return true;
@@ -4762,7 +4914,7 @@ static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
return true;
}
-/// \brief Helper function to test whether a shuffle mask could be
+/// Helper function to test whether a shuffle mask could be
/// simplified by widening the elements being shuffled.
///
/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
@@ -4821,6 +4973,24 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
return true;
}
+static bool canWidenShuffleElements(ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ SmallVectorImpl<int> &WidenedMask) {
+ SmallVector<int, 32> TargetMask(Mask.begin(), Mask.end());
+ for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
+ if (TargetMask[i] == SM_SentinelUndef)
+ continue;
+ if (Zeroable[i])
+ TargetMask[i] = SM_SentinelZero;
+ }
+ return canWidenShuffleElements(TargetMask, WidenedMask);
+}
+
+static bool canWidenShuffleElements(ArrayRef<int> Mask) {
+ SmallVector<int, 32> WidenedMask;
+ return canWidenShuffleElements(Mask, WidenedMask);
+}
+
/// Returns true if Elt is a constant zero or a floating point constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
return isNullConstant(Elt) || isNullFPConstant(Elt);
@@ -4916,8 +5086,6 @@ static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
} else if (VT.getVectorElementType() == MVT::i1) {
assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
"Unexpected vector type");
- assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
- "Unexpected vector type");
Vec = DAG.getConstant(0, dl, VT);
} else {
unsigned Num32BitElts = VT.getSizeInBits() / 32;
@@ -5007,10 +5175,66 @@ static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}
-static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
- SelectionDAG &DAG, const SDLoc &dl) {
- assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
- return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
+/// Widen a vector to a larger size with the same scalar type, with the new
+/// elements either zero or undef.
+static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG,
+ const SDLoc &dl) {
+ assert(Vec.getValueSizeInBits() < VT.getSizeInBits() &&
+ Vec.getValueType().getScalarType() == VT.getScalarType() &&
+ "Unsupported vector widening type");
+ SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
+ : DAG.getUNDEF(VT);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
+ DAG.getIntPtrConstant(0, dl));
+}
+
+// Helper for splitting the operands of an operation into legal-sized parts
+// and applying a function to each part.
+// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
+// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
+// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
+// The argument Builder is a function that will be applied on each split part:
+// SDValue Builder(SelectionDAG &DAG, const SDLoc &DL, ArrayRef<SDValue> Ops)
+template <typename F>
+SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
+ const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
+ F Builder, bool CheckBWI = true) {
+ assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
+ unsigned NumSubs = 1;
+ if ((CheckBWI && Subtarget.useBWIRegs()) ||
+ (!CheckBWI && Subtarget.useAVX512Regs())) {
+ if (VT.getSizeInBits() > 512) {
+ NumSubs = VT.getSizeInBits() / 512;
+ assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
+ }
+ } else if (Subtarget.hasAVX2()) {
+ if (VT.getSizeInBits() > 256) {
+ NumSubs = VT.getSizeInBits() / 256;
+ assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
+ }
+ } else {
+ if (VT.getSizeInBits() > 128) {
+ NumSubs = VT.getSizeInBits() / 128;
+ assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
+ }
+ }
+
+ if (NumSubs == 1)
+ return Builder(DAG, DL, Ops);
+
+ SmallVector<SDValue, 4> Subs;
+ for (unsigned i = 0; i != NumSubs; ++i) {
+ SmallVector<SDValue, 2> SubOps;
+ for (SDValue Op : Ops) {
+ EVT OpVT = Op.getValueType();
+ unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
+ unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
+ SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
+ }
+ Subs.push_back(Builder(DAG, DL, SubOps));
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
}
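A minimal standalone sketch of the width-selection rule used by SplitOpsAndApply, with plain integers in place of SelectionDAG types; numSubOps and the Has512/Has256 flags are illustrative stand-ins for the subtarget checks:

#include <cassert>
#include <cstdio>

// Pick how many sub-operations a VTBits-wide op is split into, mirroring the
// 512/256/128-bit ladder above.
static unsigned numSubOps(unsigned VTBits, bool Has512, bool Has256) {
  unsigned LegalBits = Has512 ? 512u : Has256 ? 256u : 128u;
  if (VTBits <= LegalBits)
    return 1;                            // no splitting needed
  assert(VTBits % LegalBits == 0 && "Illegal vector size");
  return VTBits / LegalBits;
}

int main() {
  std::printf("%u\n", numSubOps(512, false, true));  // 2: 512-bit op as two 256-bit parts
  std::printf("%u\n", numSubOps(512, true, true));   // 1: native 512-bit op
  std::printf("%u\n", numSubOps(256, false, false)); // 2: 256-bit op as two 128-bit parts
  return 0;
}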
// Return true if the instruction zeroes the unused upper part of the
@@ -5019,13 +5243,9 @@ static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
switch (Opcode) {
default:
return false;
- case X86ISD::TESTM:
- case X86ISD::TESTNM:
- case X86ISD::PCMPEQM:
- case X86ISD::PCMPGTM:
case X86ISD::CMPM:
- case X86ISD::CMPMU:
case X86ISD::CMPM_RND:
+ case ISD::SETCC:
return true;
}
}
@@ -5166,22 +5386,11 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
-/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
-/// instructions. This is used because creating CONCAT_VECTOR nodes of
-/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
-/// large BUILD_VECTORS.
-static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
- unsigned NumElems, SelectionDAG &DAG,
- const SDLoc &dl) {
- SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
- return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
-}
-
-static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
- unsigned NumElems, SelectionDAG &DAG,
- const SDLoc &dl) {
- SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
- return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
+static SDValue concatSubVectors(SDValue V1, SDValue V2, EVT VT,
+ unsigned NumElems, SelectionDAG &DAG,
+ const SDLoc &dl, unsigned VectorWidth) {
+ SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, VectorWidth);
+ return insertSubVector(V, V2, NumElems / 2, DAG, dl, VectorWidth);
}
/// Returns a vector of specified type with all bits set.
@@ -5265,6 +5474,13 @@ static SDValue peekThroughOneUseBitcasts(SDValue V) {
return V;
}
+// Peek through EXTRACT_SUBVECTORs - typically used for AVX1 256-bit intops.
+static SDValue peekThroughEXTRACT_SUBVECTORs(SDValue V) {
+ while (V.getOpcode() == ISD::EXTRACT_SUBVECTOR)
+ V = V.getOperand(0);
+ return V;
+}
+
static const Constant *getTargetConstantFromNode(SDValue Op) {
Op = peekThroughBitcasts(Op);
@@ -5389,6 +5605,12 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
return CastBitData(UndefSrcElts, SrcEltBits);
}
+ if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
+ APInt UndefSrcElts = APInt::getNullValue(1);
+ APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
+ SmallVector<APInt, 64> SrcEltBits(1, RawBits);
+ return CastBitData(UndefSrcElts, SrcEltBits);
+ }
// Extract constant bits from build vector.
if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
@@ -5525,14 +5747,15 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands()-1);
- DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ DecodeBLENDMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::SHUFP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands()-1);
- DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ DecodeSHUFPMask(NumElems, VT.getScalarSizeInBits(),
+ cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::INSERTPS:
@@ -5548,7 +5771,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
isa<ConstantSDNode>(N->getOperand(2))) {
int BitLen = N->getConstantOperandVal(1);
int BitIdx = N->getConstantOperandVal(2);
- DecodeEXTRQIMask(VT, BitLen, BitIdx, Mask);
+ DecodeEXTRQIMask(NumElems, VT.getScalarSizeInBits(), BitLen, BitIdx,
+ Mask);
IsUnary = true;
}
break;
@@ -5559,20 +5783,21 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
isa<ConstantSDNode>(N->getOperand(3))) {
int BitLen = N->getConstantOperandVal(2);
int BitIdx = N->getConstantOperandVal(3);
- DecodeINSERTQIMask(VT, BitLen, BitIdx, Mask);
+ DecodeINSERTQIMask(NumElems, VT.getScalarSizeInBits(), BitLen, BitIdx,
+ Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
}
break;
case X86ISD::UNPCKH:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
- DecodeUNPCKHMask(VT, Mask);
+ DecodeUNPCKHMask(NumElems, VT.getScalarSizeInBits(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::UNPCKL:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
- DecodeUNPCKLMask(VT, Mask);
+ DecodeUNPCKLMask(NumElems, VT.getScalarSizeInBits(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::MOVHLPS:
@@ -5592,7 +5817,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands()-1);
- DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ DecodePALIGNRMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
Ops.push_back(N->getOperand(1));
Ops.push_back(N->getOperand(0));
@@ -5601,38 +5827,43 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ DecodePSLLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ Mask);
IsUnary = true;
break;
case X86ISD::VSRLDQ:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ DecodePSRLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ Mask);
IsUnary = true;
break;
case X86ISD::PSHUFD:
case X86ISD::VPERMILPI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands()-1);
- DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ DecodePSHUFMask(NumElems, VT.getScalarSizeInBits(),
+ cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::PSHUFHW:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands()-1);
- DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ DecodePSHUFHWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ Mask);
IsUnary = true;
break;
case X86ISD::PSHUFLW:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands()-1);
- DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ DecodePSHUFLWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ Mask);
IsUnary = true;
break;
case X86ISD::VZEXT_MOVL:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
- DecodeZeroMoveLowMask(VT, Mask);
+ DecodeZeroMoveLowMask(NumElems, Mask);
IsUnary = true;
break;
case X86ISD::VBROADCAST: {
@@ -5648,7 +5879,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
// came from an extract from the original width. If we found one, we
// pushed it the Ops vector above.
if (N0.getValueType() == VT || !Ops.empty()) {
- DecodeVectorBroadcast(VT, Mask);
+ DecodeVectorBroadcast(NumElems, Mask);
IsUnary = true;
break;
}
@@ -5661,7 +5892,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
unsigned MaskEltSize = VT.getScalarSizeInBits();
SmallVector<uint64_t, 32> RawMask;
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
- DecodeVPERMILPMask(VT, RawMask, Mask);
+ DecodeVPERMILPMask(NumElems, VT.getScalarSizeInBits(), RawMask, Mask);
break;
}
if (auto *C = getTargetConstantFromNode(MaskNode)) {
@@ -5690,41 +5921,47 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
case X86ISD::VPERMI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands()-1);
- DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ DecodeVPERMMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::MOVSS:
case X86ISD::MOVSD:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
- DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
+ DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
break;
case X86ISD::VPERM2X128:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands()-1);
- DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ DecodeVPERM2X128Mask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ break;
+ case X86ISD::SHUF128:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ decodeVSHUF64x2FamilyMask(NumElems, VT.getScalarSizeInBits(),
+ cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::MOVSLDUP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
- DecodeMOVSLDUPMask(VT, Mask);
+ DecodeMOVSLDUPMask(NumElems, Mask);
IsUnary = true;
break;
case X86ISD::MOVSHDUP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
- DecodeMOVSHDUPMask(VT, Mask);
+ DecodeMOVSHDUPMask(NumElems, Mask);
IsUnary = true;
break;
case X86ISD::MOVDDUP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
- DecodeMOVDDUPMask(VT, Mask);
+ DecodeMOVDDUPMask(NumElems, Mask);
IsUnary = true;
break;
- case X86ISD::MOVLPD:
- case X86ISD::MOVLPS:
- // Not yet implemented
- return false;
case X86ISD::VPERMIL2: {
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
@@ -5736,7 +5973,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
unsigned CtrlImm = CtrlOp->getZExtValue();
SmallVector<uint64_t, 32> RawMask;
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
- DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
+ DecodeVPERMIL2PMask(NumElems, VT.getScalarSizeInBits(), CtrlImm,
+ RawMask, Mask);
break;
}
if (auto *C = getTargetConstantFromNode(MaskNode)) {
@@ -5795,21 +6033,6 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
}
return false;
}
- case X86ISD::VPERMIV3: {
- assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
- assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
- IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
- // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
- Ops.push_back(N->getOperand(1));
- Ops.push_back(N->getOperand(2));
- SDValue MaskNode = N->getOperand(0);
- unsigned MaskEltSize = VT.getScalarSizeInBits();
- if (auto *C = getTargetConstantFromNode(MaskNode)) {
- DecodeVPERMV3Mask(C, MaskEltSize, Mask);
- break;
- }
- return false;
- }
default: llvm_unreachable("unknown target shuffle node");
}
@@ -5927,7 +6150,7 @@ static bool setTargetShuffleZeroElements(SDValue N,
// destination value type.
static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
SmallVectorImpl<SDValue> &Ops,
- SelectionDAG &DAG) {
+ const SelectionDAG &DAG) {
Mask.clear();
Ops.clear();
@@ -5940,6 +6163,17 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
unsigned Opcode = N.getOpcode();
switch (Opcode) {
+ case ISD::VECTOR_SHUFFLE: {
+ // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
+ ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
+ if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
+ Mask.append(ShuffleMask.begin(), ShuffleMask.end());
+ Ops.push_back(N.getOperand(0));
+ Ops.push_back(N.getOperand(1));
+ return true;
+ }
+ return false;
+ }
case ISD::AND:
case X86ISD::ANDNP: {
// Attempt to decode as a per-byte mask.
@@ -6001,8 +6235,11 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
case X86ISD::PINSRW: {
SDValue InVec = N.getOperand(0);
SDValue InScl = N.getOperand(1);
+ SDValue InIndex = N.getOperand(2);
+ if (!isa<ConstantSDNode>(InIndex) ||
+ cast<ConstantSDNode>(InIndex)->getAPIntValue().uge(NumElts))
+ return false;
uint64_t InIdx = N.getConstantOperandVal(2);
- assert(InIdx < NumElts && "Illegal insertion index");
// Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
if (X86::isZeroNode(InScl)) {
@@ -6020,8 +6257,12 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
return false;
SDValue ExVec = InScl.getOperand(0);
+ SDValue ExIndex = InScl.getOperand(1);
+ if (!isa<ConstantSDNode>(ExIndex) ||
+ cast<ConstantSDNode>(ExIndex)->getAPIntValue().uge(NumElts))
+ return false;
uint64_t ExIdx = InScl.getConstantOperandVal(1);
- assert(ExIdx < NumElts && "Illegal extraction index");
+
Ops.push_back(InVec);
Ops.push_back(ExVec);
for (unsigned i = 0; i != NumElts; ++i)
@@ -6097,7 +6338,8 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
MVT SrcVT = Src.getSimpleValueType();
if (NumSizeInBits != SrcVT.getSizeInBits())
break;
- DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
+ DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), VT.getScalarSizeInBits(),
+ VT.getVectorNumElements(), Mask);
Ops.push_back(Src);
return true;
}
@@ -6141,7 +6383,7 @@ static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
static bool resolveTargetShuffleInputs(SDValue Op,
SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask,
- SelectionDAG &DAG) {
+ const SelectionDAG &DAG) {
if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
return false;
@@ -6451,9 +6693,8 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
MVT ShVT = MVT::v16i8;
unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
SrcOp = DAG.getBitcast(ShVT, SrcOp);
- MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
assert(NumBits % 8 == 0 && "Only support byte sized shifts");
- SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
+ SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, MVT::i8);
return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
}
@@ -6805,17 +7046,13 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
BOperand = ZeroExtended.getOperand(0);
else
BOperand = Ld.getOperand(0).getOperand(0);
- if (BOperand.getValueType().isVector() &&
- BOperand.getSimpleValueType().getVectorElementType() == MVT::i1) {
- if ((EltType == MVT::i64 && (VT.getVectorElementType() == MVT::i8 ||
- NumElts == 8)) || // for broadcastmb2q
- (EltType == MVT::i32 && (VT.getVectorElementType() == MVT::i16 ||
- NumElts == 16))) { // for broadcastmw2d
- SDValue Brdcst =
- DAG.getNode(X86ISD::VBROADCASTM, dl,
- MVT::getVectorVT(EltType, NumElts), BOperand);
- return DAG.getBitcast(VT, Brdcst);
- }
+ MVT MaskVT = BOperand.getSimpleValueType();
+ if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
+ (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
+ SDValue Brdcst =
+ DAG.getNode(X86ISD::VBROADCASTM, dl,
+ MVT::getVectorVT(EltType, NumElts), BOperand);
+ return DAG.getBitcast(VT, Brdcst);
}
}
}
@@ -6982,7 +7219,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
return SDValue();
}
-/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
+/// For an EXTRACT_VECTOR_ELT with a constant index return the real
/// underlying vector and index.
///
/// Modifies \p ExtractedFromVec to the real vector and returns the real
@@ -7195,7 +7432,7 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
return DstVec;
}
-/// \brief Return true if \p N implements a horizontal binop and return the
+/// Return true if \p N implements a horizontal binop and return the
/// operands for the horizontal binop into V0 and V1.
///
/// This is a helper function of LowerToHorizontalOp().
@@ -7292,7 +7529,7 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
return CanFold;
}
-/// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
+/// Emit a sequence of two 128-bit horizontal add/sub followed by
/// a concat_vector.
///
/// This is a helper function of LowerToHorizontalOp().
@@ -7360,18 +7597,18 @@ static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
}
/// Returns true iff \p BV builds a vector with the result equivalent to
-/// the result of ADDSUB operation.
-/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation
-/// are written to the parameters \p Opnd0 and \p Opnd1.
-static bool isAddSub(const BuildVectorSDNode *BV,
- const X86Subtarget &Subtarget, SelectionDAG &DAG,
- SDValue &Opnd0, SDValue &Opnd1,
- unsigned &NumExtracts) {
+/// the result of an ADDSUB/SUBADD operation.
+/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
+/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
+/// \p Opnd0 and \p Opnd1.
+static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG,
+ SDValue &Opnd0, SDValue &Opnd1,
+ unsigned &NumExtracts,
+ bool &IsSubAdd) {
MVT VT = BV->getSimpleValueType(0);
- if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
- (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
- (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
+ if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
return false;
unsigned NumElts = VT.getVectorNumElements();
@@ -7381,26 +7618,20 @@ static bool isAddSub(const BuildVectorSDNode *BV,
NumExtracts = 0;
// Odd-numbered elements in the input build vector are obtained from
- // adding two integer/float elements.
+ // adding/subtracting two integer/float elements.
// Even-numbered elements in the input build vector are obtained from
- // subtracting two integer/float elements.
- unsigned ExpectedOpcode = ISD::FSUB;
- unsigned NextExpectedOpcode = ISD::FADD;
- bool AddFound = false;
- bool SubFound = false;
-
+ // subtracting/adding two integer/float elements.
+ unsigned Opc[2] {0, 0};
for (unsigned i = 0, e = NumElts; i != e; ++i) {
SDValue Op = BV->getOperand(i);
// Skip 'undef' values.
unsigned Opcode = Op.getOpcode();
- if (Opcode == ISD::UNDEF) {
- std::swap(ExpectedOpcode, NextExpectedOpcode);
+ if (Opcode == ISD::UNDEF)
continue;
- }
// Early exit if we found an unexpected opcode.
- if (Opcode != ExpectedOpcode)
+ if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
return false;
SDValue Op0 = Op.getOperand(0);
@@ -7420,11 +7651,11 @@ static bool isAddSub(const BuildVectorSDNode *BV,
if (I0 != i)
return false;
- // We found a valid add/sub node. Update the information accordingly.
- if (i & 1)
- AddFound = true;
- else
- SubFound = true;
+    // We found a valid add/sub node; make sure it's the same opcode as the
+    // previous elements for this parity.
+ if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
+ return false;
+ Opc[i % 2] = Opcode;
// Update InVec0 and InVec1.
if (InVec0.isUndef()) {
@@ -7441,7 +7672,7 @@ static bool isAddSub(const BuildVectorSDNode *BV,
// Make sure that operands in input to each add/sub node always
// come from a same pair of vectors.
if (InVec0 != Op0.getOperand(0)) {
- if (ExpectedOpcode == ISD::FSUB)
+ if (Opcode == ISD::FSUB)
return false;
// FADD is commutable. Try to commute the operands
@@ -7454,24 +7685,26 @@ static bool isAddSub(const BuildVectorSDNode *BV,
if (InVec1 != Op1.getOperand(0))
return false;
- // Update the pair of expected opcodes.
- std::swap(ExpectedOpcode, NextExpectedOpcode);
-
// Increment the number of extractions done.
++NumExtracts;
}
- // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
- if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())
+ // Ensure we have found an opcode for both parities and that they are
+ // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
+ // inputs are undef.
+ if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
+ InVec0.isUndef() || InVec1.isUndef())
return false;
+ IsSubAdd = Opc[0] == ISD::FADD;
+
Opnd0 = InVec0;
Opnd1 = InVec1;
return true;
}
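A minimal standalone sketch of the per-parity opcode check above, with a small enum standing in for SDNode opcodes and invented lane patterns:

#include <array>
#include <cstdio>

enum LaneOp { Undef, FAdd, FSub };

// Accept a lane pattern iff all defined even lanes share one opcode, all
// defined odd lanes share the other, and the two opcodes differ.
static bool isAddSubOrSubAddPattern(const std::array<LaneOp, 4> &Lanes,
                                    bool &IsSubAdd) {
  LaneOp Opc[2] = {Undef, Undef};     // [0] = even lanes, [1] = odd lanes
  for (unsigned i = 0; i != Lanes.size(); ++i) {
    if (Lanes[i] == Undef)
      continue;                       // undef lanes don't constrain a parity
    if (Opc[i % 2] != Undef && Opc[i % 2] != Lanes[i])
      return false;                   // mixed opcodes within one parity
    Opc[i % 2] = Lanes[i];
  }
  if (Opc[0] == Undef || Opc[1] == Undef || Opc[0] == Opc[1])
    return false;                     // need both parities, with different ops
  IsSubAdd = (Opc[0] == FAdd);        // even lanes adding is the swapped SUBADD form
  return true;
}

int main() {
  bool IsSubAdd = false;
  std::array<LaneOp, 4> AddSub = {FSub, FAdd, FSub, FAdd};
  std::array<LaneOp, 4> SubAdd = {FAdd, FSub, Undef, FSub};
  std::printf("%d %d\n", isAddSubOrSubAddPattern(AddSub, IsSubAdd), IsSubAdd); // 1 0 (ADDSUB)
  std::printf("%d %d\n", isAddSubOrSubAddPattern(SubAdd, IsSubAdd), IsSubAdd); // 1 1 (SUBADD)
  return 0;
}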
/// Returns true if it is possible to fold MUL and an idiom that has already been
-/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
+/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
///
@@ -7521,14 +7754,17 @@ static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
return true;
}
-/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' operation
-/// accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB node.
+/// Try to fold a build_vector that performs an 'addsub', 'fmaddsub' or
+/// 'fsubadd' operation into an X86ISD::ADDSUB, X86ISD::FMADDSUB or
+/// X86ISD::FMSUBADD node.
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Opnd0, Opnd1;
unsigned NumExtracts;
- if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts))
+ bool IsSubAdd;
+ if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
+ IsSubAdd))
return SDValue();
MVT VT = BV->getSimpleValueType(0);
@@ -7536,10 +7772,14 @@ static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
// Try to generate X86ISD::FMADDSUB node here.
SDValue Opnd2;
- // TODO: According to coverage reports, the FMADDSUB transform is not
- // triggered by any tests.
- if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts))
- return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
+ if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
+ unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
+ return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
+ }
+
+ // We only support ADDSUB.
+ if (IsSubAdd)
+ return SDValue();
// Do not generate X86ISD::ADDSUB node for 512-bit types even though
// the ADDSUB idiom has been successfully recognized. There are no known
@@ -7708,6 +7948,10 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
case ISD::AND:
case ISD::XOR:
case ISD::OR:
+ // Don't do this if the buildvector is a splat - we'd replace one
+ // constant with an entire vector.
+ if (Op->getSplatValue())
+ return SDValue();
if (!TLI.isOperationLegalOrPromote(Opcode, VT))
return SDValue();
break;
@@ -7762,66 +8006,268 @@ static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
return SDValue();
}
-// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
-// reasoned to be a permutation of a vector by indices in a non-constant vector.
-// (build_vector (extract_elt V, (extract_elt I, 0)),
-// (extract_elt V, (extract_elt I, 1)),
-// ...
-// ->
-// (vpermv I, V)
-//
-// TODO: Handle undefs
-// TODO: Utilize pshufb and zero mask blending to support more efficient
-// construction of vectors with constant-0 elements.
-// TODO: Use smaller-element vectors of same width, and "interpolate" the indices,
-// when no native operation available.
-static SDValue
-LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- // Look for VPERMV and PSHUFB opportunities.
- MVT VT = V.getSimpleValueType();
+/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
+/// from a vector of source values and a vector of extraction indices.
+/// The vectors might be manipulated to match the type of the permute op.
+static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
+ SDLoc &DL, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ MVT ShuffleVT = VT;
+ EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned SizeInBits = VT.getSizeInBits();
+
+ // Adjust IndicesVec to match VT size.
+ assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
+ "Illegal variable permute mask size");
+ if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
+ IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
+ NumElts * VT.getScalarSizeInBits());
+ IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
+
+  // Handle a SrcVec whose size doesn't match the VT size.
+ if (SrcVec.getValueSizeInBits() != SizeInBits) {
+ if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
+ // Handle larger SrcVec by treating it as a larger permute.
+ unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
+ VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
+ IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
+ IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
+ Subtarget, DAG, SDLoc(IndicesVec));
+ return extractSubVector(
+ createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget), 0,
+ DAG, DL, SizeInBits);
+ } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
+ // Widen smaller SrcVec to match VT.
+ SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
+ } else
+ return SDValue();
+ }
+
+ auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
+ assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
+ EVT SrcVT = Idx.getValueType();
+ unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
+ uint64_t IndexScale = 0;
+ uint64_t IndexOffset = 0;
+
+ // If we're scaling a smaller permute op, then we need to repeat the
+ // indices, scaling and offsetting them as well.
+ // e.g. v4i32 -> v16i8 (Scale = 4)
+ // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
+ // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
+ for (uint64_t i = 0; i != Scale; ++i) {
+ IndexScale |= Scale << (i * NumDstBits);
+ IndexOffset |= i << (i * NumDstBits);
+ }
+
+ Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
+ DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
+ Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
+ DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
+ return Idx;
+ };
+
+ unsigned Opcode = 0;
switch (VT.SimpleTy) {
default:
- return SDValue();
+ break;
case MVT::v16i8:
- if (!Subtarget.hasSSE3())
- return SDValue();
+ if (Subtarget.hasSSSE3())
+ Opcode = X86ISD::PSHUFB;
+ break;
+ case MVT::v8i16:
+ if (Subtarget.hasVLX() && Subtarget.hasBWI())
+ Opcode = X86ISD::VPERMV;
+ else if (Subtarget.hasSSSE3()) {
+ Opcode = X86ISD::PSHUFB;
+ ShuffleVT = MVT::v16i8;
+ }
+ break;
+ case MVT::v4f32:
+ case MVT::v4i32:
+ if (Subtarget.hasAVX()) {
+ Opcode = X86ISD::VPERMILPV;
+ ShuffleVT = MVT::v4f32;
+ } else if (Subtarget.hasSSSE3()) {
+ Opcode = X86ISD::PSHUFB;
+ ShuffleVT = MVT::v16i8;
+ }
+ break;
+ case MVT::v2f64:
+ case MVT::v2i64:
+ if (Subtarget.hasAVX()) {
+ // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
+ IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
+ Opcode = X86ISD::VPERMILPV;
+ ShuffleVT = MVT::v2f64;
+ } else if (Subtarget.hasSSE41()) {
+ // SSE41 can compare v2i64 - select between indices 0 and 1.
+ return DAG.getSelectCC(
+ DL, IndicesVec,
+ getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
+ DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
+ DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
+ ISD::CondCode::SETEQ);
+ }
+ break;
+ case MVT::v32i8:
+ if (Subtarget.hasVLX() && Subtarget.hasVBMI())
+ Opcode = X86ISD::VPERMV;
+ else if (Subtarget.hasXOP()) {
+ SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
+ SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
+ SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
+ SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
+ return DAG.getNode(
+ ISD::CONCAT_VECTORS, DL, VT,
+ DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
+ DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
+ } else if (Subtarget.hasAVX()) {
+ SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
+ SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
+ SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
+ SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
+ auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ // Permute Lo and Hi and then select based on index range.
+        // This works as PSHUFB uses bits[3:0] to permute elements and we don't
+        // care about bit[7] as it's just an index vector.
+ SDValue Idx = Ops[2];
+ EVT VT = Idx.getValueType();
+ return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
+ DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
+ DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
+ ISD::CondCode::SETGT);
+ };
+ SDValue Ops[] = {LoLo, HiHi, IndicesVec};
+ return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
+ PSHUFBBuilder);
+ }
+ break;
+ case MVT::v16i16:
+ if (Subtarget.hasVLX() && Subtarget.hasBWI())
+ Opcode = X86ISD::VPERMV;
+ else if (Subtarget.hasAVX()) {
+ // Scale to v32i8 and perform as v32i8.
+ IndicesVec = ScaleIndices(IndicesVec, 2);
+ return DAG.getBitcast(
+ VT, createVariablePermute(
+ MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
+ DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
+ }
break;
case MVT::v8f32:
case MVT::v8i32:
- if (!Subtarget.hasAVX2())
- return SDValue();
+ if (Subtarget.hasAVX2())
+ Opcode = X86ISD::VPERMV;
+ else if (Subtarget.hasAVX()) {
+ SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
+ SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
+ {0, 1, 2, 3, 0, 1, 2, 3});
+ SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
+ {4, 5, 6, 7, 4, 5, 6, 7});
+ if (Subtarget.hasXOP())
+ return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32,
+ LoLo, HiHi, IndicesVec,
+ DAG.getConstant(0, DL, MVT::i8)));
+ // Permute Lo and Hi and then select based on index range.
+ // This works as VPERMILPS only uses index bits[0:1] to permute elements.
+ SDValue Res = DAG.getSelectCC(
+ DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
+ DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
+ DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
+ ISD::CondCode::SETGT);
+ return DAG.getBitcast(VT, Res);
+ }
break;
case MVT::v4i64:
case MVT::v4f64:
- if (!Subtarget.hasVLX())
- return SDValue();
+ if (Subtarget.hasAVX512()) {
+ if (!Subtarget.hasVLX()) {
+ MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
+ SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
+ SDLoc(SrcVec));
+ IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
+ DAG, SDLoc(IndicesVec));
+ SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
+ DAG, Subtarget);
+ return extract256BitVector(Res, 0, DAG, DL);
+ }
+ Opcode = X86ISD::VPERMV;
+ } else if (Subtarget.hasAVX()) {
+ SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
+ SDValue LoLo =
+ DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
+ SDValue HiHi =
+ DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
+ // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
+ IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
+ if (Subtarget.hasXOP())
+ return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64,
+ LoLo, HiHi, IndicesVec,
+ DAG.getConstant(0, DL, MVT::i8)));
+ // Permute Lo and Hi and then select based on index range.
+ // This works as VPERMILPD only uses index bit[1] to permute elements.
+ SDValue Res = DAG.getSelectCC(
+ DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
+ DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
+ DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
+ ISD::CondCode::SETGT);
+ return DAG.getBitcast(VT, Res);
+ }
break;
- case MVT::v16f32:
- case MVT::v8f64:
- case MVT::v16i32:
- case MVT::v8i64:
- if (!Subtarget.hasAVX512())
- return SDValue();
+ case MVT::v64i8:
+ if (Subtarget.hasVBMI())
+ Opcode = X86ISD::VPERMV;
break;
case MVT::v32i16:
- if (!Subtarget.hasBWI())
- return SDValue();
- break;
- case MVT::v8i16:
- case MVT::v16i16:
- if (!Subtarget.hasVLX() || !Subtarget.hasBWI())
- return SDValue();
- break;
- case MVT::v64i8:
- if (!Subtarget.hasVBMI())
- return SDValue();
+ if (Subtarget.hasBWI())
+ Opcode = X86ISD::VPERMV;
break;
- case MVT::v32i8:
- if (!Subtarget.hasVLX() || !Subtarget.hasVBMI())
- return SDValue();
+ case MVT::v16f32:
+ case MVT::v16i32:
+ case MVT::v8f64:
+ case MVT::v8i64:
+ if (Subtarget.hasAVX512())
+ Opcode = X86ISD::VPERMV;
break;
}
+ if (!Opcode)
+ return SDValue();
+
+ assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
+ (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
+ "Illegal variable permute shuffle type");
+
+ uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
+ if (Scale > 1)
+ IndicesVec = ScaleIndices(IndicesVec, Scale);
+
+ EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
+ IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
+
+ SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
+ SDValue Res = Opcode == X86ISD::VPERMV
+ ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
+ : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
+ return DAG.getBitcast(VT, Res);
+}
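A standalone check of the IndexScale/IndexOffset constants built by the ScaleIndices lambda, for the v4i32 -> v16i8 case mentioned in its comment (Scale = 4, 8 destination bits per index); plain arithmetic only, no DAG types:

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t Scale = 4, NumDstBits = 32 / Scale;   // 4 byte indices per i32 index
  uint64_t IndexScale = 0, IndexOffset = 0;
  for (uint64_t i = 0; i != Scale; ++i) {
    IndexScale  |= Scale << (i * NumDstBits);          // -> 0x04040404
    IndexOffset |= i     << (i * NumDstBits);          // -> 0x03020100
  }
  std::printf("IndexScale  = 0x%08llx\n", (unsigned long long)IndexScale);
  std::printf("IndexOffset = 0x%08llx\n", (unsigned long long)IndexOffset);

  // One v4i32 index of 2 becomes the byte indices 8,9,10,11 after the 32-bit
  // element-wise Idx * IndexScale + IndexOffset (no carries cross byte
  // boundaries because every index is < 4).
  uint32_t Idx = 2;
  uint32_t Bytes = Idx * (uint32_t)IndexScale + (uint32_t)IndexOffset;
  for (int b = 0; b < 4; ++b)
    std::printf("byte %d -> %u\n", b, (Bytes >> (8 * b)) & 0xffu);
  return 0;
}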
+
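A second scalar sketch, this one of the "permute both halves, then select on index range" trick used by the AVX1-only v32i8/v8f32/v4f64 paths above; the source values and indices are invented:

#include <array>
#include <cstdio>

int main() {
  std::array<float, 8> Src = {10, 11, 12, 13, 14, 15, 16, 17};
  std::array<int, 8>   Idx = {7, 0, 3, 4, 1, 6, 2, 5};   // variable indices, 0..7
  std::array<float, 8> LoLo, HiHi, Res;
  for (int i = 0; i < 8; ++i) {          // duplicate each 128-bit half
    LoLo[i] = Src[i % 4];                // {s0..s3, s0..s3}
    HiHi[i] = Src[4 + i % 4];            // {s4..s7, s4..s7}
  }
  for (int i = 0; i < 8; ++i) {
    // VPERMILPS only honours bits[0:1] of each index (Idx & 3) and never
    // crosses a 128-bit lane, which the LoLo/HiHi duplication makes harmless.
    float FromLo = LoLo[(i / 4) * 4 + (Idx[i] & 3)];
    float FromHi = HiHi[(i / 4) * 4 + (Idx[i] & 3)];
    Res[i] = Idx[i] > 3 ? FromHi : FromLo;   // select on index range (SETGT 3)
  }
  for (float F : Res)
    std::printf("%g ", F);               // 17 10 13 14 11 16 12 15
  std::printf("\n");
  return 0;
}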
+// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
+// reasoned to be a permutation of a vector by indices in a non-constant vector.
+// (build_vector (extract_elt V, (extract_elt I, 0)),
+// (extract_elt V, (extract_elt I, 1)),
+// ...
+// ->
+// (vpermv I, V)
+//
+// TODO: Handle undefs
+// TODO: Utilize pshufb and zero mask blending to support more efficient
+// construction of vectors with constant-0 elements.
+static SDValue
+LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
SDValue SrcVec, IndicesVec;
// Check for a match of the permute source vector and permute index elements.
// This is done by checking that the i-th build_vector operand is of the form:
@@ -7858,13 +8304,10 @@ LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
if (!PermIdx || PermIdx->getZExtValue() != Idx)
return SDValue();
}
- MVT IndicesVT = VT;
- if (VT.isFloatingPoint())
- IndicesVT = MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits()),
- VT.getVectorNumElements());
- IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
- return DAG.getNode(VT == MVT::v16i8 ? X86ISD::PSHUFB : X86ISD::VPERMV,
- SDLoc(V), VT, IndicesVec, SrcVec);
+
+ SDLoc DL(V);
+ MVT VT = V.getSimpleValueType();
+ return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
}
SDValue
@@ -7872,7 +8315,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
- MVT ExtVT = VT.getVectorElementType();
+ MVT EltVT = VT.getVectorElementType();
unsigned NumElems = Op.getNumOperands();
// Generate vectors for predicate vectors.
@@ -7883,8 +8326,6 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return VectorConstant;
BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
- // TODO: Support FMSUBADD here if we ever get tests for the FMADDSUB
- // transform here.
if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
return AddSub;
if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
@@ -7894,7 +8335,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
return BitOp;
- unsigned EVTBits = ExtVT.getSizeInBits();
+ unsigned EVTBits = EltVT.getSizeInBits();
unsigned NumZero = 0;
unsigned NumNonZero = 0;
@@ -7930,13 +8371,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// supported, we assume that we will fall back to a shuffle to get the scalar
// blended with the constants. Insertion into a zero vector is handled as a
// special-case somewhere below here.
- LLVMContext &Context = *DAG.getContext();
if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
(isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
// Create an all-constant vector. The variable element in the old
// build vector is replaced by undef in the constant vector. Save the
// variable scalar element and its index for use in the insertelement.
+ LLVMContext &Context = *DAG.getContext();
Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
SDValue VarElt;
@@ -7975,27 +8416,6 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
unsigned Idx = countTrailingZeros(NonZeros);
SDValue Item = Op.getOperand(Idx);
- // If this is an insertion of an i64 value on x86-32, and if the top bits of
- // the value are obviously zero, truncate the value to i32 and do the
- // insertion that way. Only do this if the value is non-constant or if the
- // value is a constant being inserted into element 0. It is cheaper to do
- // a constant pool load than it is to do a movd + shuffle.
- if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
- (!IsAllConstants || Idx == 0)) {
- if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {
- // Handle SSE only.
- assert(VT == MVT::v2i64 && "Expected an SSE value type!");
- MVT VecVT = MVT::v4i32;
-
- // Truncate the value (which may itself be a constant) to i32, and
- // convert it to a vector with movd (S2V+shuffle to zero extend).
- Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
- Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
- return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
- Item, Idx * 2, true, Subtarget, DAG));
- }
- }
-
// If we have a constant or non-constant insertion into the low element of
// a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
// the rest of the elements. This will be matched as movd/movq/movss/movsd
@@ -8004,8 +8424,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (NumZero == 0)
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
- if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
- (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
+ if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
+ (EltVT == MVT::i64 && Subtarget.is64Bit())) {
assert((VT.is128BitVector() || VT.is256BitVector() ||
VT.is512BitVector()) &&
"Expected an SSE value type!");
@@ -8016,7 +8436,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// We can't directly insert an i8 or i16 into a vector, so zero extend
// it to i32 first.
- if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
+ if (EltVT == MVT::i16 || EltVT == MVT::i8) {
Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
if (VT.getSizeInBits() >= 256) {
MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
@@ -8088,17 +8508,43 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return V;
// See if we can use a vector load to get all of the elements.
- if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
+ {
SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
if (SDValue LD =
EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
return LD;
}
+ // If this is a splat of pairs of 32-bit elements, we can use a narrower
+ // build_vector and broadcast it.
+ // TODO: We could probably generalize this more.
+ if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
+ SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
+ DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
+ auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
+ // Make sure all the even/odd operands match.
+ for (unsigned i = 2; i != NumElems; ++i)
+ if (Ops[i % 2] != Op.getOperand(i))
+ return false;
+ return true;
+ };
+ if (CanSplat(Op, NumElems, Ops)) {
+ MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
+ MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
+ // Create a new build vector and cast to v2i64/v2f64.
+ SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
+ DAG.getBuildVector(NarrowVT, dl, Ops));
+ // Broadcast from v2i64/v2f64 and cast to final VT.
+ MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems/2);
+ return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
+ NewBV));
+ }
+ }
+
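As a rough standalone illustration of the even/odd check performed by the CanSplat lambda above (plain C++ over integer stand-ins rather than the SDValue operands in the patch), a build_vector qualifies when every operand repeats the leading pair; the lowering then builds the narrow vector, bitcasts it to a wider element type, and broadcasts the first wide element:

    #include <vector>
    // True when Ops looks like {a, b, a, b, ...}: every operand matches the
    // operand of the same parity in the leading pair.
    bool canSplatAsPairs(const std::vector<int> &Ops) {
      for (size_t i = 2; i < Ops.size(); ++i)
        if (Ops[i] != Ops[i % 2])
          return false;
      return true;
    }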
// For AVX-length vectors, build the individual 128-bit pieces and use
// shuffles to put them in place.
- if (VT.is256BitVector() || VT.is512BitVector()) {
- EVT HVT = EVT::getVectorVT(Context, ExtVT, NumElems/2);
+ if (VT.getSizeInBits() > 128) {
+ MVT HVT = MVT::getVectorVT(EltVT, NumElems/2);
// Build both the lower and upper subvector.
SDValue Lower =
@@ -8107,9 +8553,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
// Recreate the wider vector with the lower and upper part.
- if (VT.is256BitVector())
- return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
- return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
+ return concatSubVectors(Lower, Upper, VT, NumElems, DAG, dl,
+ VT.getSizeInBits() / 2);
}
// Let legalizer expand 2-wide build_vectors.
@@ -8234,30 +8679,60 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// 256-bit AVX can use the vinsertf128 instruction
// to create 256-bit vectors from two other 128-bit ones.
-static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
+// TODO: Detect subvector broadcast here instead of DAG combine?
+static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
SDLoc dl(Op);
MVT ResVT = Op.getSimpleValueType();
assert((ResVT.is256BitVector() ||
ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
- SDValue V1 = Op.getOperand(0);
- SDValue V2 = Op.getOperand(1);
- unsigned NumElems = ResVT.getVectorNumElements();
- if (ResVT.is256BitVector())
- return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
+ unsigned NumOperands = Op.getNumOperands();
+ unsigned NumZero = 0;
+ unsigned NumNonZero = 0;
+ unsigned NonZeros = 0;
+ for (unsigned i = 0; i != NumOperands; ++i) {
+ SDValue SubVec = Op.getOperand(i);
+ if (SubVec.isUndef())
+ continue;
+ if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
+ ++NumZero;
+ else {
+ assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
+ NonZeros |= 1 << i;
+ ++NumNonZero;
+ }
+ }
- if (Op.getNumOperands() == 4) {
+ // If we have more than 2 non-zeros, build each half separately.
+ if (NumNonZero > 2) {
MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
ResVT.getVectorNumElements()/2);
- SDValue V3 = Op.getOperand(2);
- SDValue V4 = Op.getOperand(3);
- return concat256BitVectors(
- concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
- concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
- NumElems, DAG, dl);
+ ArrayRef<SDUse> Ops = Op->ops();
+ SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
+ Ops.slice(0, NumOperands/2));
+ SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
+ Ops.slice(NumOperands/2));
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
+ }
+
+ // Otherwise, build it up through insert_subvectors.
+ SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
+ : DAG.getUNDEF(ResVT);
+
+ MVT SubVT = Op.getOperand(0).getSimpleValueType();
+ unsigned NumSubElems = SubVT.getVectorNumElements();
+ for (unsigned i = 0; i != NumOperands; ++i) {
+ if ((NonZeros & (1 << i)) == 0)
+ continue;
+
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
+ Op.getOperand(i),
+ DAG.getIntPtrConstant(i * NumSubElems, dl));
}
- return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
+
+ return Vec;
}
// Return true if all the operands of the given CONCAT_VECTORS node are zeros
@@ -8314,6 +8789,7 @@ static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
return SDValue();
}
+// TODO: Merge this with LowerAVXCONCAT_VECTORS?
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG & DAG) {
@@ -8328,12 +8804,8 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
// of a node with instruction that zeroes all upper (irrelevant) bits of the
// output register, mark it as legal and catch the pattern in instruction
// selection to avoid emitting extra instructions (for zeroing upper bits).
- if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) {
- SDValue ZeroC = DAG.getIntPtrConstant(0, dl);
- SDValue AllZeros = getZeroVector(ResVT, Subtarget, DAG, dl);
- return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted,
- ZeroC);
- }
+ if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op))
+ return widenSubVector(ResVT, Promoted, true, Subtarget, DAG, dl);
unsigned NumZero = 0;
unsigned NumNonZero = 0;
@@ -8404,7 +8876,7 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op,
// from two other 128-bit ones.
// A 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
- return LowerAVXCONCAT_VECTORS(Op, DAG);
+ return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
}
//===----------------------------------------------------------------------===//
@@ -8418,7 +8890,7 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op,
// patterns.
//===----------------------------------------------------------------------===//
-/// \brief Tiny helper function to identify a no-op mask.
+/// Tiny helper function to identify a no-op mask.
///
/// This is a somewhat boring predicate function. It checks whether the mask
/// array input, which is assumed to be a single-input shuffle mask of the kind
@@ -8434,7 +8906,7 @@ static bool isNoopShuffleMask(ArrayRef<int> Mask) {
return true;
}
-/// \brief Test whether there are elements crossing 128-bit lanes in this
+/// Test whether there are elements crossing 128-bit lanes in this
/// shuffle mask.
///
/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
@@ -8448,7 +8920,7 @@ static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
return false;
}
-/// \brief Test whether a shuffle mask is equivalent within each sub-lane.
+/// Test whether a shuffle mask is equivalent within each sub-lane.
///
/// This checks a shuffle mask to see if it is performing the same
/// lane-relative shuffle in each sub-lane. This trivially implies
@@ -8494,6 +8966,12 @@ is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
}
+static bool
+is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
+ SmallVector<int, 32> RepeatedMask;
+ return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
+}
+
/// Test whether a shuffle mask is equivalent within each 256-bit lane.
static bool
is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
@@ -8537,7 +9015,7 @@ static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
return true;
}
-/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
+/// Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
/// This is a fast way to test a shuffle mask against a fixed pattern:
@@ -8634,7 +9112,7 @@ static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
return IsUnpackwdMask;
}
-/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
+/// Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
/// the ubiquitous shuffle encoding scheme used in x86 instructions for
@@ -8662,7 +9140,7 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
}
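A minimal standalone sketch of that 2-bits-per-lane immediate (plain C++; it assumes all four mask entries are already in range, whereas the real helper also has to cope with undef entries):

    #include <cassert>
    unsigned encodeV4ShuffleImm(const int Mask[4]) {
      unsigned Imm = 0;
      for (int i = 0; i != 4; ++i) {
        assert(Mask[i] >= 0 && Mask[i] < 4 && "expected an in-range mask entry");
        Imm |= unsigned(Mask[i]) << (i * 2); // lane i's selector in bits [2*i+1:2*i]
      }
      return Imm; // e.g. {0, 2, 1, 3} encodes to 0xD8
    }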
-/// \brief Compute whether each element of a shuffle is zeroable.
+/// Compute whether each element of a shuffle is zeroable.
///
/// A "zeroable" vector shuffle element is one which can be lowered to zero.
/// Either it is an undef element in the shuffle mask, the element of the input
@@ -8859,8 +9337,8 @@ static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
unsigned &UnpackOpcode, bool IsUnary,
- ArrayRef<int> TargetMask, SDLoc &DL,
- SelectionDAG &DAG,
+ ArrayRef<int> TargetMask,
+ const SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
int NumElts = VT.getVectorNumElements();
@@ -8969,6 +9447,99 @@ static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
return SDValue();
}
+static bool matchVectorShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
+ int Delta) {
+ int Size = (int)Mask.size();
+ int Split = Size / Delta;
+ int TruncatedVectorStart = SwappedOps ? Size : 0;
+
+ // Match for mask starting with e.g.: <8, 10, 12, 14,... or <0, 2, 4, 6,...
+ if (!isSequentialOrUndefInRange(Mask, 0, Split, TruncatedVectorStart, Delta))
+ return false;
+
+ // The rest of the mask should not refer to the truncated vector's elements.
+ if (isAnyInRange(Mask.slice(Split, Size - Split), TruncatedVectorStart,
+ TruncatedVectorStart + Size))
+ return false;
+
+ return true;
+}
+
+// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
+//
+// An example is the following:
+//
+// t0: ch = EntryToken
+// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
+// t25: v4i32 = truncate t2
+// t41: v8i16 = bitcast t25
+// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
+// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
+// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
+// t18: v2i64 = bitcast t51
+//
+// Without avx512vl, this is lowered to:
+//
+// vpmovqd %zmm0, %ymm0
+// vpshufb {{.*#+}} xmm0 =
+// xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+//
+// But when avx512vl is available, one can just use a single vpmovdw
+// instruction.
+static SDValue lowerVectorShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (VT != MVT::v16i8 && VT != MVT::v8i16)
+ return SDValue();
+
+ if (Mask.size() != VT.getVectorNumElements())
+ return SDValue();
+
+ bool SwappedOps = false;
+
+ if (!ISD::isBuildVectorAllZeros(V2.getNode())) {
+ if (!ISD::isBuildVectorAllZeros(V1.getNode()))
+ return SDValue();
+
+ std::swap(V1, V2);
+ SwappedOps = true;
+ }
+
+ // Look for:
+ //
+ // bitcast (truncate <8 x i32> %vec to <8 x i16>) to <16 x i8>
+ // bitcast (truncate <4 x i64> %vec to <4 x i32>) to <8 x i16>
+ //
+ // and similar ones.
+ if (V1.getOpcode() != ISD::BITCAST)
+ return SDValue();
+ if (V1.getOperand(0).getOpcode() != ISD::TRUNCATE)
+ return SDValue();
+
+ SDValue Src = V1.getOperand(0).getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+
+ // The vptrunc** instructions truncating 128 bit and 256 bit vectors
+ // are only available with avx512vl.
+ if (!SrcVT.is512BitVector() && !Subtarget.hasVLX())
+ return SDValue();
+
+ // Down Convert Word to Byte is only available with avx512bw. The case with
+ // 256-bit output doesn't contain a shuffle and is therefore not handled here.
+ if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
+ !Subtarget.hasBWI())
+ return SDValue();
+
+ // The first half/quarter of the mask should refer to every second/fourth
+ // element of the truncated and bitcast vector.
+ if (!matchVectorShuffleAsVPMOV(Mask, SwappedOps, 2) &&
+ !matchVectorShuffleAsVPMOV(Mask, SwappedOps, 4))
+ return SDValue();
+
+ return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
+}
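A standalone sketch of the mask shape matchVectorShuffleAsVPMOV accepts (plain C++ over raw mask indices, with -1 standing for undef): the first Size/Delta entries walk the truncated vector with stride Delta and the remaining entries never reference it again:

    #include <vector>
    bool looksLikeTruncateMask(const std::vector<int> &Mask, int Start, int Delta) {
      int Size = (int)Mask.size();
      int Split = Size / Delta;
      // Prefix: Start, Start + Delta, Start + 2*Delta, ... (undef allowed).
      for (int i = 0; i != Split; ++i)
        if (Mask[i] >= 0 && Mask[i] != Start + i * Delta)
          return false;
      // Tail: must not read the truncated vector at all.
      for (int i = Split; i != Size; ++i)
        if (Mask[i] >= Start && Mask[i] < Start + Size)
          return false;
      return true;
    }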
+
// X86 has dedicated pack instructions that can handle specific truncation
// operations: PACKSS and PACKUS.
static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
@@ -8984,15 +9555,6 @@ static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
auto MatchPACK = [&](SDValue N1, SDValue N2) {
SDValue VV1 = DAG.getBitcast(PackVT, N1);
SDValue VV2 = DAG.getBitcast(PackVT, N2);
- if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
- (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
- V1 = VV1;
- V2 = VV2;
- SrcVT = PackVT;
- PackOpcode = X86ISD::PACKSS;
- return true;
- }
-
if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
@@ -9004,7 +9566,14 @@ static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
return true;
}
}
-
+ if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
+ (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
+ V1 = VV1;
+ V2 = VV2;
+ SrcVT = PackVT;
+ PackOpcode = X86ISD::PACKSS;
+ return true;
+ }
return false;
};
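A scalar sketch of the per-lane saturation the two branches above rely on, shown for the 32-bit to 16-bit case (assumed instruction semantics, not code from the patch): when the source has more than 16 sign bits PACKSS is an exact truncation, and when its upper 16 bits are known zero PACKUS is:

    #include <cstdint>
    int16_t packss16(int32_t V) {      // signed saturation, one PACKSSDW lane
      if (V > INT16_MAX) return INT16_MAX;
      if (V < INT16_MIN) return INT16_MIN;
      return (int16_t)V;
    }
    uint16_t packus16(int32_t V) {     // unsigned saturation, one PACKUSDW lane
      if (V > 0xFFFF) return 0xFFFF;
      if (V < 0) return 0;
      return (uint16_t)V;
    }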
@@ -9039,7 +9608,7 @@ static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT,
return SDValue();
}
-/// \brief Try to emit a bitmask instruction for a shuffle.
+/// Try to emit a bitmask instruction for a shuffle.
///
/// This handles cases where we can model a blend exactly as a bitmask due to
/// one of the inputs being zeroable.
@@ -9072,7 +9641,7 @@ static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
return DAG.getNode(ISD::AND, DL, VT, V, VMask);
}
-/// \brief Try to emit a blend instruction for a shuffle using bit math.
+/// Try to emit a blend instruction for a shuffle using bit math.
///
/// This is used as a fallback approach when first class blend instructions are
/// unavailable. Currently it is only suitable for integer vectors, but could
@@ -9159,7 +9728,7 @@ static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
return ScaledMask;
}
-/// \brief Try to emit a blend instruction for a shuffle.
+/// Try to emit a blend instruction for a shuffle.
///
/// This doesn't do any checks for the availability of instructions for blending
/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
@@ -9305,7 +9874,7 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
}
}
-/// \brief Try to lower as a blend of elements from two inputs followed by
+/// Try to lower as a blend of elements from two inputs followed by
/// a single-input permutation.
///
/// This matches the pattern where we can blend elements from two inputs and
@@ -9337,7 +9906,7 @@ static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
}
-/// \brief Generic routine to decompose a shuffle and blend into independent
+/// Generic routine to decompose a shuffle and blend into independent
/// blends and permutes.
///
/// This matches the extremely common pattern for handling combined
@@ -9378,7 +9947,7 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
}
-/// \brief Try to lower a vector shuffle as a rotation.
+/// Try to lower a vector shuffle as a rotation.
///
/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
@@ -9450,7 +10019,7 @@ static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
return Rotation;
}
-/// \brief Try to lower a vector shuffle as a byte rotation.
+/// Try to lower a vector shuffle as a byte rotation.
///
/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
@@ -9534,7 +10103,7 @@ static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
}
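A standalone sketch of the data movement this lowering produces, whether via PALIGNR or the shift-and-OR fallback shown just above (plain arrays; which input feeds the low bytes and the exact rotation amount are whatever the matcher decided):

    #include <array>
    #include <cstdint>
    std::array<uint8_t, 16> rotateConcatBytes(const std::array<uint8_t, 16> &Lo,
                                              const std::array<uint8_t, 16> &Hi,
                                              unsigned Rotation) { // 1..15
      std::array<uint8_t, 16> Out{};
      for (unsigned i = 0; i != 16; ++i) {
        unsigned Src = i + Rotation;   // index into the 32-byte concat [Lo|Hi]
        Out[i] = Src < 16 ? Lo[Src] : Hi[Src - 16];
      }
      return Out;
    }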
-/// \brief Try to lower a vector shuffle as a dword/qword rotation.
+/// Try to lower a vector shuffle as a dword/qword rotation.
///
/// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary
/// rotation of the concatenation of two vectors; This routine will
@@ -9565,7 +10134,7 @@ static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
DAG.getConstant(Rotation, DL, MVT::i8));
}
-/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
+/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
@@ -9809,7 +10378,7 @@ static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
return false;
}
-/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
+/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable,
@@ -9829,7 +10398,7 @@ static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
return SDValue();
}
-/// \brief Lower a vector shuffle as a zero or any extension.
+/// Lower a vector shuffle as a zero or any extension.
///
/// Given a specific number of elements, element bit width, and extension
/// stride, produce either a zero or any extension based on the available
@@ -9984,7 +10553,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
return DAG.getBitcast(VT, InputV);
}
-/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
+/// Try to lower a vector shuffle as a zero extension on any microarch.
///
/// This routine will try to do everything in its power to cleverly lower
/// a shuffle which happens to match the pattern of a zero extend. It doesn't
@@ -10112,7 +10681,7 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
return SDValue();
}
-/// \brief Try to get a scalar value for a specific element of a vector.
+/// Try to get a scalar value for a specific element of a vector.
///
/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
@@ -10139,7 +10708,7 @@ static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
return SDValue();
}
-/// \brief Helper to test for a load that can be folded with x86 shuffles.
+/// Helper to test for a load that can be folded with x86 shuffles.
///
/// This is particularly important because the set of instructions varies
/// significantly based on whether the operand is a load or not.
@@ -10148,7 +10717,7 @@ static bool isShuffleFoldableLoad(SDValue V) {
return ISD::isNON_EXTLoad(V.getNode());
}
-/// \brief Try to lower insertion of a single element into a zero vector.
+/// Try to lower insertion of a single element into a zero vector.
///
/// This is a common pattern that we have especially efficient patterns to lower
/// across all subtarget feature sets.
@@ -10239,9 +10808,7 @@ static SDValue lowerVectorShuffleAsElementInsertion(
V2 = DAG.getBitcast(MVT::v16i8, V2);
V2 = DAG.getNode(
X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
- DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
- DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
- DAG.getDataLayout(), VT)));
+ DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
V2 = DAG.getBitcast(VT, V2);
}
}
@@ -10295,13 +10862,13 @@ static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
// vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
if (const int OffsetIdx = BroadcastIdx % Scale)
Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
- DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));
+ DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
}
-/// \brief Try to lower broadcast of a single element.
+/// Try to lower broadcast of a single element.
///
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
@@ -10626,7 +11193,7 @@ static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
DAG.getConstant(InsertPSMask, DL, MVT::i8));
}
-/// \brief Try to lower a shuffle as a permute of the inputs followed by an
+/// Try to lower a shuffle as a permute of the inputs followed by an
/// UNPCK instruction.
///
/// This specifically targets cases where we end up with alternating between
@@ -10738,7 +11305,7 @@ static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
return SDValue();
}
-/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
+/// Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
/// support for floating point shuffles but not integer shuffles. These
@@ -10777,22 +11344,23 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
DAG.getConstant(SHUFPDMask, DL, MVT::i8));
}
- assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
- assert(Mask[1] >= 2 && "Non-canonicalized blend!");
+ assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
+ assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
+ assert(Mask[0] < 2 && "We sort V1 to be the first input.");
+ assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
- // If we have a single input, insert that into V1 if we can do so cheaply.
- if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
- return Insertion;
- // Try inverting the insertion since for v2 masks it is easy to do and we
- // can't reliably sort the mask one way or the other.
- int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
- Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
- return Insertion;
- }
+ // When loading a scalar and then shuffling it into a vector we can often do
+ // the insertion cheaply.
+ if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return Insertion;
+ // Try inverting the insertion since for v2 masks it is easy to do and we
+ // can't reliably sort the mask one way or the other.
+ int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
+ Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
+ if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
+ return Insertion;
// Try to use one of the special instruction patterns to handle two common
// blend patterns if a zero-blend above didn't work.
@@ -10802,8 +11370,7 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// We can either use a special instruction to load over the low double or
// to move just the low double.
return DAG.getNode(
- isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
- DL, MVT::v2f64, V2,
+ X86ISD::MOVSD, DL, MVT::v2f64, V2,
DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
if (Subtarget.hasSSE41())
@@ -10821,7 +11388,7 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
DAG.getConstant(SHUFPDMask, DL, MVT::i8));
}
-/// \brief Handle lowering of 2-lane 64-bit integer shuffles.
+/// Handle lowering of 2-lane 64-bit integer shuffles.
///
/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
/// the integer unit to minimize domain crossing penalties. However, for blends
@@ -10918,7 +11485,7 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
}
-/// \brief Test whether this can be lowered with a single SHUFPS instruction.
+/// Test whether this can be lowered with a single SHUFPS instruction.
///
/// This is used to disable more specialized lowerings when the shufps lowering
/// will happen to be efficient.
@@ -10940,7 +11507,7 @@ static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
return true;
}
-/// \brief Lower a vector shuffle using the SHUFPS instruction.
+/// Lower a vector shuffle using the SHUFPS instruction.
///
/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
/// It makes no assumptions about whether this is the *best* lowering, it simply
@@ -11027,7 +11594,7 @@ static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
}
-/// \brief Lower 4-lane 32-bit floating point shuffles.
+/// Lower 4-lane 32-bit floating point shuffles.
///
/// Uses instructions exclusively from the floating point unit to minimize
/// domain crossing penalties, as these are sufficient to implement all v4f32
@@ -11123,7 +11690,7 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
}
-/// \brief Lower 4-lane i32 vector shuffles.
+/// Lower 4-lane i32 vector shuffles.
///
/// We try to handle these with integer-domain shuffles where we can, but for
/// blends we use the floating point domain blend instructions.
@@ -11235,7 +11802,7 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return DAG.getBitcast(MVT::v4i32, ShufPS);
}
-/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
+/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
/// shuffle lowering, and the most complex part.
///
/// The lowering strategy is to try to form pairs of input lanes which are
@@ -11261,13 +11828,27 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
MutableArrayRef<int> LoMask = Mask.slice(0, 4);
MutableArrayRef<int> HiMask = Mask.slice(4, 4);
+ // Attempt to directly match PSHUFLW or PSHUFHW.
+ if (isUndefOrInRange(LoMask, 0, 4) &&
+ isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
+ return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
+ getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
+ }
+ if (isUndefOrInRange(HiMask, 4, 8) &&
+ isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
+ for (int i = 0; i != 4; ++i)
+ HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
+ return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
+ getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
+ }
+
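A standalone sketch of the first of the two direct matches added above, the PSHUFLW case (plain C++, -1 for undef): the low four lanes may be any in-range permutation while the high four lanes must already sit in place; the PSHUFHW case is the mirror image:

    #include <array>
    bool directPSHUFLWMatch(const std::array<int, 8> &Mask) {
      for (int i = 0; i != 4; ++i)
        if (Mask[i] >= 4)                 // low half may only read the low half
          return false;
      for (int i = 4; i != 8; ++i)
        if (Mask[i] >= 0 && Mask[i] != i) // high half must be the identity
          return false;
      return true;
    }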
SmallVector<int, 4> LoInputs;
copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
- std::sort(LoInputs.begin(), LoInputs.end());
+ array_pod_sort(LoInputs.begin(), LoInputs.end());
LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
SmallVector<int, 4> HiInputs;
copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
- std::sort(HiInputs.begin(), HiInputs.end());
+ array_pod_sort(HiInputs.begin(), HiInputs.end());
HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
int NumLToL =
std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
@@ -11280,13 +11861,11 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
- // If we are splatting two values from one half - one to each half, then
- // we can shuffle that half so each is splatted to a dword, then splat those
- // to their respective halves.
- auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
- int DOffset) {
- int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
- int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
+ // If we are shuffling values from one half, check how many different DWORD
+ // pairs we need to create. If only 1 or 2 then we can perform this as a
+ // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
+ auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
+ ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
V = DAG.getNode(ShufWOp, DL, VT, V,
getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
V = DAG.getBitcast(PSHUFDVT, V);
@@ -11295,10 +11874,48 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
return DAG.getBitcast(VT, V);
};
- if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
- return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
- if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
- return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
+ if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
+ int PSHUFDMask[4] = { -1, -1, -1, -1 };
+ SmallVector<std::pair<int, int>, 4> DWordPairs;
+ int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
+
+ // Collect the different DWORD pairs.
+ for (int DWord = 0; DWord != 4; ++DWord) {
+ int M0 = Mask[2 * DWord + 0];
+ int M1 = Mask[2 * DWord + 1];
+ M0 = (M0 >= 0 ? M0 % 4 : M0);
+ M1 = (M1 >= 0 ? M1 % 4 : M1);
+ if (M0 < 0 && M1 < 0)
+ continue;
+
+ bool Match = false;
+ for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
+ auto &DWordPair = DWordPairs[j];
+ if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
+ (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
+ DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
+ DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
+ PSHUFDMask[DWord] = DOffset + j;
+ Match = true;
+ break;
+ }
+ }
+ if (!Match) {
+ PSHUFDMask[DWord] = DOffset + DWordPairs.size();
+ DWordPairs.push_back(std::make_pair(M0, M1));
+ }
+ }
+
+ if (DWordPairs.size() <= 2) {
+ DWordPairs.resize(2, std::make_pair(-1, -1));
+ int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
+ DWordPairs[1].first, DWordPairs[1].second};
+ if ((NumHToL + NumHToH) == 0)
+ return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
+ if ((NumLToL + NumLToH) == 0)
+ return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
+ }
+ }
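A standalone sketch of the pair collection above (plain C++, -1 for undef): the distinct (even, odd) half-relative DWORD pairs are gathered, and the lowering only fires when at most two exist, because a single PSHUFLW or PSHUFHW can then materialize both pairs and a PSHUFD can place them:

    #include <utility>
    #include <vector>
    size_t countDistinctDWordPairs(const int Mask[8]) {
      std::vector<std::pair<int, int>> Pairs;
      for (int D = 0; D != 4; ++D) {
        int M0 = Mask[2 * D + 0], M1 = Mask[2 * D + 1];
        M0 = M0 >= 0 ? M0 % 4 : -1;      // reduce to half-relative indices
        M1 = M1 >= 0 ? M1 % 4 : -1;
        if (M0 < 0 && M1 < 0)
          continue;                       // fully undef dword adds nothing
        bool Merged = false;
        for (auto &P : Pairs) {
          if ((M0 < 0 || P.first < 0 || P.first == M0) &&
              (M1 < 0 || P.second < 0 || P.second == M1)) {
            if (M0 >= 0) P.first = M0;    // refine undef slots
            if (M1 >= 0) P.second = M1;
            Merged = true;
            break;
          }
        }
        if (!Merged)
          Pairs.push_back({M0, M1});
      }
      return Pairs.size();                // e.g. <0,1,0,1,2,3,2,3> yields 2
    }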
// Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
// such inputs we can swap two of the dwords across the half mark and end up
@@ -11750,7 +12367,7 @@ static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
return DAG.getBitcast(VT, V);
}
-/// \brief Generic lowering of 8-lane i16 shuffles.
+/// Generic lowering of 8-lane i16 shuffles.
///
/// This handles both single-input shuffles and combined shuffle/blends with
/// two inputs. The single input shuffles are immediately delegated to
@@ -11883,7 +12500,7 @@ static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
Mask, DAG);
}
-/// \brief Check whether a compaction lowering can be done by dropping even
+/// Check whether a compaction lowering can be done by dropping even
/// elements and compute how many times even elements must be dropped.
///
/// This handles shuffles which take every Nth element where N is a power of
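A minimal standalone sketch of the simplest mask shape that check accepts, reduced to the keep-every-2nd-element case (the real helper also checks larger power-of-two strides):

    #include <vector>
    // e.g. for v16i8: <0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30>
    bool keepsEveryOtherElement(const std::vector<int> &Mask) {
      for (int i = 0, e = (int)Mask.size(); i != e; ++i)
        if (Mask[i] >= 0 && Mask[i] != 2 * i)
          return false;
      return true;
    }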
@@ -11962,7 +12579,7 @@ static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
}
-/// \brief Generic lowering of v16i8 shuffles.
+/// Generic lowering of v16i8 shuffles.
///
/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
/// detect any complexity reducing interleaving. If that doesn't help, it uses
@@ -12034,12 +12651,12 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
SmallVector<int, 4> LoInputs;
copy_if(Mask, std::back_inserter(LoInputs),
[](int M) { return M >= 0 && M < 8; });
- std::sort(LoInputs.begin(), LoInputs.end());
+ array_pod_sort(LoInputs.begin(), LoInputs.end());
LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
LoInputs.end());
SmallVector<int, 4> HiInputs;
copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
- std::sort(HiInputs.begin(), HiInputs.end());
+ array_pod_sort(HiInputs.begin(), HiInputs.end());
HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
HiInputs.end());
@@ -12262,7 +12879,7 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
}
-/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
+/// Dispatching routine to lower various 128-bit x86 vector shuffles.
///
/// This routine breaks down the specific type of 128-bit shuffle and
/// dispatches to the lowering routines accordingly.
@@ -12290,7 +12907,7 @@ static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
}
-/// \brief Generic routine to split vector shuffle into half-sized shuffles.
+/// Generic routine to split vector shuffle into half-sized shuffles.
///
/// This routine just extracts two subvectors, shuffles them independently, and
/// then concatenates them back together. This should work effectively with all
@@ -12413,7 +13030,7 @@ static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}
-/// \brief Either split a vector in halves or decompose the shuffles and the
+/// Either split a vector in halves or decompose the shuffles and the
/// blend.
///
/// This is provided as a good fallback for many lowerings of non-single-input
@@ -12471,7 +13088,7 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
}
-/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
+/// Lower a vector shuffle crossing multiple 128-bit lanes as
/// a permutation and blend of those lanes.
///
/// This essentially blends the out-of-lane inputs to each lane into the lane
@@ -12529,7 +13146,7 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
}
-/// \brief Handle lowering 2-lane 128-bit shuffles.
+/// Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable,
@@ -12540,9 +13157,22 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
return SDValue();
SmallVector<int, 4> WidenedMask;
- if (!canWidenShuffleElements(Mask, WidenedMask))
+ if (!canWidenShuffleElements(Mask, Zeroable, WidenedMask))
return SDValue();
+ bool IsLowZero = (Zeroable & 0x3) == 0x3;
+ bool IsHighZero = (Zeroable & 0xc) == 0xc;
+
+ // Try to use an insert into a zero vector.
+ if (WidenedMask[0] == 0 && IsHighZero) {
+ MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
+ SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+ getZeroVector(VT, Subtarget, DAG, DL), LoV,
+ DAG.getIntPtrConstant(0, DL));
+ }
+
+ // TODO: If minimizing size and one of the inputs is a zero vector and
// the zero vector has only one use, we could use a VPERM2X128 to save the
// instruction bytes needed to explicitly generate the zero vector.
@@ -12552,9 +13182,6 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
Zeroable, Subtarget, DAG))
return Blend;
- bool IsLowZero = (Zeroable & 0x3) == 0x3;
- bool IsHighZero = (Zeroable & 0xc) == 0xc;
-
// If either input operand is a zero vector, use VPERM2X128 because its mask
// allows us to replace the zero input with an implicit zero.
if (!IsLowZero && !IsHighZero) {
@@ -12566,14 +13193,12 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
// With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
// this will likely become vinsertf128 which can't fold a 256-bit memop.
if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
- MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
- VT.getVectorNumElements() / 2);
- SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
- DAG.getIntPtrConstant(0, DL));
- SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
- OnlyUsesV1 ? V1 : V2,
- DAG.getIntPtrConstant(0, DL));
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+ MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
+ SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
+ OnlyUsesV1 ? V1 : V2,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
+ DAG.getIntPtrConstant(2, DL));
}
}
@@ -12601,7 +13226,8 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
// [6] - ignore
// [7] - zero high half of destination
- assert(WidenedMask[0] >= 0 && WidenedMask[1] >= 0 && "Undef half?");
+ assert((WidenedMask[0] >= 0 || IsLowZero) &&
+ (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
unsigned PermMask = 0;
PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
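Only the low half of the immediate is visible in this hunk; a rough standalone sketch of the full encoding, assuming the usual VPERM2X128 layout the bit list above describes (selectors in bits [1:0] and [5:4], zeroing in bits 3 and 7):

    // Selectors use the widened-mask numbering: 0/1 = halves of V1, 2/3 = halves of V2.
    unsigned buildVPerm2X128Imm(unsigned SelLo, unsigned SelHi,
                                bool ZeroLo, bool ZeroHi) {
      unsigned Imm = 0;
      Imm |= ZeroLo ? 0x08 : SelLo;        // low 128-bit half of the result
      Imm |= ZeroHi ? 0x80 : (SelHi << 4); // high 128-bit half of the result
      return Imm;                          // e.g. SelLo=0, SelHi=3 gives 0x30
    }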
@@ -12617,7 +13243,7 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
DAG.getConstant(PermMask, DL, MVT::i8));
}
-/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
+/// Lower a vector shuffle by first fixing the 128-bit lanes and then
/// shuffling each lane.
///
/// This will only succeed when the result of fixing the 128-bit lanes results
@@ -12820,7 +13446,7 @@ static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
DAG.getIntPtrConstant(Offset, DL));
}
-/// \brief Test whether the specified input (0 or 1) is in-place blended by the
+/// Test whether the specified input (0 or 1) is in-place blended by the
/// given mask.
///
/// This returns true if the elements from a particular input are already in the
@@ -13056,7 +13682,7 @@ static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
DAG.getConstant(Immediate, DL, MVT::i8));
}
-/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
+/// Handle lowering of 4-lane 64-bit floating point shuffles.
///
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
/// isn't available.
@@ -13098,7 +13724,7 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
// Try to create an in-lane repeating shuffle mask and then shuffle the
- // the results into the target lanes.
+ // results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return V;
@@ -13123,7 +13749,7 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Op;
// Try to create an in-lane repeating shuffle mask and then shuffle the
- // the results into the target lanes.
+ // results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return V;
@@ -13153,7 +13779,7 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
}
-/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
+/// Handle lowering of 4-lane 64-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.
@@ -13226,6 +13852,12 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
return V;
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+ // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle. However, if we have AVX2 and either inputs are already in place,
// we will be able to shuffle even across lanes the other input in a single
@@ -13241,7 +13873,7 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
Mask, DAG);
}
-/// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
+/// Handle lowering of 8-lane 32-bit floating point shuffles.
///
/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
/// isn't available.
@@ -13291,7 +13923,7 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
// Try to create an in-lane repeating shuffle mask and then shuffle the
- // the results into the target lanes.
+ // results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
return V;
@@ -13340,7 +13972,7 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
}
-/// \brief Handle lowering of 8-lane 32-bit integer shuffles.
+/// Handle lowering of 8-lane 32-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling.
@@ -13453,7 +14085,7 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
Mask, DAG);
}
-/// \brief Handle lowering of 16-lane 16-bit integer shuffles.
+/// Handle lowering of 16-lane 16-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v16i16 shuffling.
@@ -13504,7 +14136,7 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Rotate;
// Try to create an in-lane repeating shuffle mask and then shuffle the
- // the results into the target lanes.
+ // results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
return V;
@@ -13544,7 +14176,7 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
}
-/// \brief Handle lowering of 32-lane 8-bit integer shuffles.
+/// Handle lowering of 32-lane 8-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v32i8 shuffling.
@@ -13595,7 +14227,7 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Rotate;
// Try to create an in-lane repeating shuffle mask and then shuffle the
- // the results into the target lanes.
+ // results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
return V;
@@ -13624,7 +14256,7 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
}
-/// \brief High-level routine to lower various 256-bit x86 vector shuffles.
+/// High-level routine to lower various 256-bit x86 vector shuffles.
///
/// This routine either breaks down the specific type of a 256-bit x86 vector
/// shuffle or splits it into two 128-bit shuffles and fuses the results back
@@ -13694,10 +14326,13 @@ static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
}
-/// \brief Try to lower a vector shuffle as a 128-bit shuffles.
+/// Try to lower a vector shuffle as a 128-bit shuffles.
static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask, SDValue V1,
- SDValue V2, SelectionDAG &DAG) {
+ ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(VT.getScalarSizeInBits() == 64 &&
"Unexpected element type size for 128bit shuffle.");
@@ -13705,10 +14340,23 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
// function lowerV2X128VectorShuffle() is better solution.
assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
+ // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
SmallVector<int, 4> WidenedMask;
if (!canWidenShuffleElements(Mask, WidenedMask))
return SDValue();
+ // Try to use an insert into a zero vector.
+ if (WidenedMask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
+ (WidenedMask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
+ unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
+ MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
+ SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+ getZeroVector(VT, Subtarget, DAG, DL), LoV,
+ DAG.getIntPtrConstant(0, DL));
+ }
+
// Check for patterns which can be matched with a single insert of a 256-bit
// subvector.
bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
@@ -13716,12 +14364,11 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
{0, 1, 2, 3, 8, 9, 10, 11})) {
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
- SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
- DAG.getIntPtrConstant(0, DL));
- SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
- OnlyUsesV1 ? V1 : V2,
+ SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
+ OnlyUsesV1 ? V1 : V2,
DAG.getIntPtrConstant(0, DL));
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
+ DAG.getIntPtrConstant(4, DL));
}
assert(WidenedMask.size() == 4);
@@ -13756,7 +14403,7 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
}
- // Try to lower to to vshuf64x2/vshuf32x4.
+ // Try to lower to vshuf64x2/vshuf32x4.
SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
unsigned PermMask = 0;
// Ensure elements came from the same Op.
@@ -13781,7 +14428,7 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
DAG.getConstant(PermMask, DL, MVT::i8));
}
-/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
+/// Handle lowering of 8-lane 64-bit floating point shuffles.
static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable,
SDValue V1, SDValue V2,
@@ -13814,7 +14461,8 @@ static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
if (SDValue Shuf128 =
- lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
+ lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, Zeroable, V1, V2,
+ Subtarget, DAG))
return Shuf128;
if (SDValue Unpck =
@@ -13837,7 +14485,7 @@ static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
}
-/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
+/// Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable,
SDValue V1, SDValue V2,
@@ -13892,7 +14540,7 @@ static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
}
-/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
+/// Handle lowering of 8-lane 64-bit integer shuffles.
static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable,
SDValue V1, SDValue V2,
@@ -13924,7 +14572,8 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
if (SDValue Shuf128 =
- lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
+ lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, Zeroable,
+ V1, V2, Subtarget, DAG))
return Shuf128;
// Try to use shift instructions.
@@ -13957,7 +14606,7 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
}
-/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
+/// Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable,
SDValue V1, SDValue V2,
@@ -14028,7 +14677,7 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
}
-/// \brief Handle lowering of 32-lane 16-bit integer shuffles.
+/// Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable,
SDValue V1, SDValue V2,
@@ -14083,7 +14732,7 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
}
-/// \brief Handle lowering of 64-lane 8-bit integer shuffles.
+/// Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable,
SDValue V1, SDValue V2,
@@ -14125,7 +14774,7 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
// Try to create an in-lane repeating shuffle mask and then shuffle the
- // the results into the target lanes.
+ // results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
return V;
@@ -14138,7 +14787,7 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
}
-/// \brief High-level routine to lower various 512-bit x86 vector shuffles.
+/// High-level routine to lower various 512-bit x86 vector shuffles.
///
/// This routine either breaks down the specific type of a 512-bit x86 vector
/// shuffle or splits it into two 256-bit shuffles and fuses the results back
@@ -14200,8 +14849,36 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// vector, shuffle and then truncate it back.
static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
+ unsigned NumElts = Mask.size();
+
+ // Try to recognize shuffles that are just padding a subvector with zeros.
+ unsigned SubvecElts = 0;
+ for (int i = 0; i != (int)NumElts; ++i) {
+ if (Mask[i] >= 0 && Mask[i] != i)
+ break;
+
+ ++SubvecElts;
+ }
+ assert(SubvecElts != NumElts && "Identity shuffle?");
+
+ // Clip to a power of 2.
+ SubvecElts = PowerOf2Floor(SubvecElts);
+
+ // Make sure the number of zeroable bits in the top at least covers the bits
+ // not covered by the subvector.
+ if (Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
+ MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
+ SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
+ V1, DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+ getZeroVector(VT, Subtarget, DAG, DL),
+ Extract, DAG.getIntPtrConstant(0, DL));
+ }
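A standalone sketch of the shape being recognized (plain C++, -1 for undef): count how many leading lanes are an identity copy of V1; if everything past that prefix is zeroable, the shuffle is just zero-padding a subvector:

    #include <vector>
    int identityPrefixLength(const std::vector<int> &Mask) {
      int Prefix = 0;
      for (int i = 0, e = (int)Mask.size(); i != e; ++i) {
        if (Mask[i] >= 0 && Mask[i] != i)
          break;
        ++Prefix;
      }
      return Prefix; // the code above then clips this to a power of two
    }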
+
assert(Subtarget.hasAVX512() &&
"Cannot lower 512-bit vectors w/o basic ISA!");
MVT ExtVT;
@@ -14220,38 +14897,31 @@ static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
break;
case MVT::v16i1:
- ExtVT = MVT::v16i32;
+ // Take 512-bit type, unless we are avoiding 512-bit types and have the
+ // 256-bit operation available.
+ ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
break;
case MVT::v32i1:
- ExtVT = MVT::v32i16;
+ // Take 512-bit type, unless we are avoiding 512-bit types and have the
+ // 256-bit operation available.
+ assert(Subtarget.hasBWI() && "Expected AVX512BW support");
+ ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
break;
case MVT::v64i1:
ExtVT = MVT::v64i8;
break;
}
- if (ISD::isBuildVectorAllZeros(V1.getNode()))
- V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
- else if (ISD::isBuildVectorAllOnes(V1.getNode()))
- V1 = getOnesVector(ExtVT, DAG, DL);
- else
- V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
-
- if (V2.isUndef())
- V2 = DAG.getUNDEF(ExtVT);
- else if (ISD::isBuildVectorAllZeros(V2.getNode()))
- V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
- else if (ISD::isBuildVectorAllOnes(V2.getNode()))
- V2 = getOnesVector(ExtVT, DAG, DL);
- else
- V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
+ V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
+ V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
// i1 was sign extended, so we can compare against zero to rebuild the mask.
int NumElems = VT.getVectorNumElements();
if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
(Subtarget.hasDQI() && (NumElems < 32)))
- return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);
+ return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
+ Shuffle, ISD::SETGT);
return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
}
@@ -14320,7 +14990,7 @@ static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
return false;
}
-/// \brief Top-level lowering for x86 vector shuffles.
+/// Top-level lowering for x86 vector shuffles.
///
/// This handles decomposition, canonicalization, and lowering of all x86
/// vector shuffles. Most of the specific lowering strategies are encapsulated
@@ -14378,20 +15048,49 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
if (Zeroable.isAllOnesValue())
return getZeroVector(VT, Subtarget, DAG, DL);
+ bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
+
+ // Create an alternative mask with info about zeroable elements.
+ // Here we do not set undef elements as zeroable.
+ SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
+ if (V2IsZero) {
+ assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
+ for (int i = 0; i != NumElements; ++i)
+ if (Mask[i] != SM_SentinelUndef && Zeroable[i])
+ ZeroableMask[i] = SM_SentinelZero;
+ }
+
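A standalone sketch of the zero-aware mask built above (plain C++, with -1 and -2 standing in for SM_SentinelUndef and SM_SentinelZero): lanes whose result is known zero are tagged so later widening can fold an (element, zero) pair into a single wide lane:

    #include <vector>
    constexpr int SentinelUndef = -1, SentinelZero = -2; // illustrative stand-ins
    std::vector<int> makeZeroableMask(const std::vector<int> &Mask,
                                      const std::vector<bool> &Zeroable) {
      std::vector<int> Result(Mask);
      for (size_t i = 0; i != Mask.size(); ++i)
        if (Mask[i] != SentinelUndef && Zeroable[i])
          Result[i] = SentinelZero;
      return Result;
    }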
// Try to collapse shuffles into using a vector type with fewer elements but
// wider element types. We cap this to not form integers or floating point
// elements wider than 64 bits, but it might be interesting to form i128
// integers to handle flipping the low and high halves of AVX 256-bit vectors.
SmallVector<int, 16> WidenedMask;
if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
- canWidenShuffleElements(Mask, WidenedMask)) {
+ canWidenShuffleElements(ZeroableMask, WidenedMask)) {
MVT NewEltVT = VT.isFloatingPoint()
? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
: MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
- MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
+ int NewNumElts = NumElements / 2;
+ MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
// Make sure that the new vector type is legal. For example, v2f64 isn't
// legal on SSE1.
if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
+ if (V2IsZero) {
+ // Modify the new Mask to take all zeros from the all-zero vector.
+ // Choose indices that are blend-friendly.
+ bool UsedZeroVector = false;
+ assert(find(WidenedMask, SM_SentinelZero) != WidenedMask.end() &&
+ "V2's non-undef elements are used?!");
+ for (int i = 0; i != NewNumElts; ++i)
+ if (WidenedMask[i] == SM_SentinelZero) {
+ WidenedMask[i] = i + NewNumElts;
+ UsedZeroVector = true;
+ }
+ // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
+ // some elements to be undef.
+ if (UsedZeroVector)
+ V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
+ }
V1 = DAG.getBitcast(NewVT, V1);
V2 = DAG.getBitcast(NewVT, V2);
return DAG.getBitcast(
@@ -14403,6 +15102,10 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
if (canonicalizeShuffleMaskWithCommute(Mask))
return DAG.getCommutedVectorShuffle(*SVOp);
+ if (SDValue V =
+ lowerVectorShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
+ return V;
+
// For each vector width, delegate to a specialized lowering routine.
if (VT.is128BitVector())
return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
@@ -14417,12 +15120,13 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
DAG);
if (Is1BitVector)
- return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
+ return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
+ DAG);
llvm_unreachable("Unimplemented!");
}
-/// \brief Try to lower a VSELECT instruction to a vector shuffle.
+/// Try to lower a VSELECT instruction to a vector shuffle.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -14441,9 +15145,12 @@ static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
SmallVector<int, 32> Mask;
for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
SDValue CondElt = CondBV->getOperand(i);
- Mask.push_back(
- isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
- : -1);
+ int M = i;
+ // We can't map undef to undef here. They have different meanings. Treat
+ // it the same as zero.
+ if (CondElt.isUndef() || isNullConstant(CondElt))
+ M += Size;
+ Mask.push_back(M);
}
return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
}
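
The blend-mask construction above maps a true condition lane to index i (the LHS) and anything else, including undef, to index i + Size (the RHS). A small standalone model of that mapping; std::optional stands in for an undef condition element, and everything here is illustrative:

#include <array>
#include <cassert>
#include <optional>

int main() {
  constexpr int Size = 4;
  // nullopt models an undef condition element, which is treated like zero.
  std::array<std::optional<bool>, Size> Cond = {true, false, std::nullopt, true};

  std::array<int, Size> Mask;
  for (int i = 0; i != Size; ++i) {
    int M = i;                               // pick lane i of the LHS
    if (!Cond[i].has_value() || !*Cond[i])   // undef or zero -> pick the RHS
      M += Size;
    Mask[i] = M;
  }

  assert((Mask == std::array<int, Size>{0, 5, 6, 3}));
  return 0;
}
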
@@ -14483,9 +15190,11 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
assert(Cond.getValueType().getScalarSizeInBits() ==
VT.getScalarSizeInBits() &&
"Should have a size-matched integer condition!");
- // Build a mask by testing the condition against itself (tests for zero).
+ // Build a mask by testing the condition against zero.
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
- SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
+ SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
+ getZeroVector(VT, Subtarget, DAG, dl),
+ ISD::SETNE);
// Now return a new VSELECT using the mask.
return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
}
@@ -14506,10 +15215,15 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
case MVT::v8i16:
- case MVT::v16i16:
- // FIXME: We should custom lower this by fixing the condition and using i8
- // blends.
- return SDValue();
+ case MVT::v16i16: {
+ // Bitcast everything to the vXi8 type and use a vXi8 vselect.
+ MVT CastVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
+ SDValue Cond = DAG.getBitcast(CastVT, Op->getOperand(0));
+ SDValue LHS = DAG.getBitcast(CastVT, Op->getOperand(1));
+ SDValue RHS = DAG.getBitcast(CastVT, Op->getOperand(2));
+ SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
+ return DAG.getBitcast(VT, Select);
+ }
}
}
@@ -14581,36 +15295,35 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
}
- // Canonicalize result type to MVT::i32.
- if (EltVT != MVT::i32) {
- SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
- Vec, Idx);
- return DAG.getAnyExtOrTrunc(Extract, dl, EltVT);
- }
-
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- // Extracts from element 0 are always allowed.
- if (IdxVal == 0)
- return Op;
-
// If the kshift instructions of the correct width aren't natively supported
// then we need to promote the vector to the native size to get the correct
// zeroing behavior.
- if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
- (VecVT.getVectorNumElements() < 8)) {
+ if (VecVT.getVectorNumElements() < 16) {
VecVT = MVT::v16i1;
- Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
- DAG.getUNDEF(VecVT),
- Vec,
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
+ DAG.getUNDEF(VecVT), Vec,
DAG.getIntPtrConstant(0, dl));
}
- // Use kshiftr instruction to move to the lower element.
- Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
- DAG.getConstant(IdxVal, dl, MVT::i8));
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Vec,
- DAG.getIntPtrConstant(0, dl));
+ // Extracts from element 0 are always allowed.
+ if (IdxVal != 0) {
+ // Use kshiftr instruction to move to the lower element.
+ Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
+ DAG.getConstant(IdxVal, dl, MVT::i8));
+ }
+
+ // Shrink to v16i1 since that's always legal.
+ if (VecVT.getVectorNumElements() > 16) {
+ VecVT = MVT::v16i1;
+ Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VecVT, Vec,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ // Convert to a bitcast+aext/trunc.
+ MVT CastVT = MVT::getIntegerVT(VecVT.getVectorNumElements());
+ return DAG.getAnyExtOrTrunc(DAG.getBitcast(CastVT, Vec), dl, EltVT);
}
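
The rewritten extract above amounts to shifting the wanted mask bit down to position 0 and then reading the mask through an integer bitcast. A scalar sketch of the same idea for a v16i1-style mask (the helper name is made up for illustration):

#include <cassert>
#include <cstdint>

// Shift the wanted bit down to position 0 (KSHIFTR), reinterpret the whole
// mask as an integer (the bitcast), and truncate to a single bit.
static bool extractMaskBit(uint16_t Mask, unsigned Idx) {
  assert(Idx < 16 && "index out of range for a v16i1 mask");
  uint16_t Shifted = Mask >> Idx;        // KSHIFTR by Idx
  return static_cast<bool>(Shifted & 1); // bitcast + truncate to i1
}

int main() {
  uint16_t Mask = 0b0000'0000'1010'0010; // bits 1, 5 and 7 set
  assert(extractMaskBit(Mask, 1));
  assert(!extractMaskBit(Mask, 2));
  assert(extractMaskBit(Mask, 7));
  return 0;
}
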
SDValue
@@ -14713,7 +15426,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
int ShiftVal = (IdxVal % 4) * 8;
if (ShiftVal != 0)
Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
- DAG.getConstant(ShiftVal, dl, MVT::i32));
+ DAG.getConstant(ShiftVal, dl, MVT::i8));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
}
@@ -14724,7 +15437,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
int ShiftVal = (IdxVal % 2) * 8;
if (ShiftVal != 0)
Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
- DAG.getConstant(ShiftVal, dl, MVT::i16));
+ DAG.getConstant(ShiftVal, dl, MVT::i8));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
}
@@ -14780,74 +15493,11 @@ static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
}
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- unsigned NumElems = VecVT.getVectorNumElements();
-
- // If the kshift instructions of the correct width aren't natively supported
- // then we need to promote the vector to the native size to get the correct
- // zeroing behavior.
- if ((!Subtarget.hasDQI() && NumElems == 8) || (NumElems < 8)) {
- // Need to promote to v16i1, do the insert, then extract back.
- Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
- DAG.getUNDEF(MVT::v16i1), Vec,
- DAG.getIntPtrConstant(0, dl));
- Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i1, Vec, Elt, Idx);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VecVT, Op,
- DAG.getIntPtrConstant(0, dl));
- }
-
- SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
+ // Copy into a k-register, extract to v1i1 and insert_subvector.
+ SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
- if (Vec.isUndef()) {
- if (IdxVal)
- EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
- DAG.getConstant(IdxVal, dl, MVT::i8));
- return EltInVec;
- }
-
- // Insertion of one bit into first position
- if (IdxVal == 0 ) {
- // Clean top bits of vector.
- EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
- DAG.getConstant(NumElems - 1, dl, MVT::i8));
- EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec,
- DAG.getConstant(NumElems - 1, dl, MVT::i8));
- // Clean the first bit in source vector.
- Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
- DAG.getConstant(1 , dl, MVT::i8));
- Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
- DAG.getConstant(1, dl, MVT::i8));
-
- return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
- }
- // Insertion of one bit into last position
- if (IdxVal == NumElems - 1) {
- // Move the bit to the last position inside the vector.
- EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
- DAG.getConstant(IdxVal, dl, MVT::i8));
- // Clean the last bit in the source vector.
- Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
- DAG.getConstant(1, dl, MVT::i8));
- Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
- DAG.getConstant(1 , dl, MVT::i8));
-
- return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
- }
-
- // Move the current value of the bit to be replace to bit 0.
- SDValue Merged = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
- DAG.getConstant(IdxVal, dl, MVT::i8));
- // Xor with the new bit.
- Merged = DAG.getNode(ISD::XOR, dl, VecVT, Merged, EltInVec);
- // Shift to MSB, filling bottom bits with 0.
- Merged = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Merged,
- DAG.getConstant(NumElems - 1, dl, MVT::i8));
- // Shift to the final position, filling upper bits with 0.
- Merged = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Merged,
- DAG.getConstant(NumElems - 1 - IdxVal, dl, MVT::i8));
- // Xor with original vector to cancel out the original bit value that's still
- // present.
- return DAG.getNode(ISD::XOR, dl, VecVT, Merged, Vec);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec,
+ Op.getOperand(2));
}
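
The deleted general case above built the new mask with a kshift/xor sequence; a scalar model of that bit-twiddling may make it easier to see what the simpler insert_subvector form replaces (the helper below is illustrative only):

#include <cassert>
#include <cstdint>

// Place (OldBit ^ NewBit) at position Idx with zeros elsewhere and xor it
// back into the vector, so only bit Idx changes.
static uint16_t insertMaskBit(uint16_t Vec, unsigned Idx, bool Elt) {
  uint16_t EltInVec = Elt ? 1 : 0;                      // SCALAR_TO_VECTOR
  uint16_t Merged = Vec >> Idx;                         // old bit now at bit 0
  Merged ^= EltInVec;                                   // bit 0 = old ^ new
  Merged = static_cast<uint16_t>(Merged << 15);         // keep only bit 0, at the MSB
  Merged = static_cast<uint16_t>(Merged >> (15 - Idx)); // move it to bit Idx
  return Merged ^ Vec;                                  // flip bit Idx if needed
}

int main() {
  assert(insertMaskBit(0b0000u, 2, true) == 0b0100u);
  assert(insertMaskBit(0b1111u, 0, false) == 0b1110u);
  assert(insertMaskBit(0b1010u, 3, true) == 0b1010u); // bit already set
  return 0;
}
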
SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
@@ -15020,8 +15670,45 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
return insert1BitVector(Op, DAG, Subtarget);
}
+static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
+ "Only vXi1 extract_subvectors need custom lowering");
+
+ SDLoc dl(Op);
+ SDValue Vec = Op.getOperand(0);
+ SDValue Idx = Op.getOperand(1);
+
+ if (!isa<ConstantSDNode>(Idx))
+ return SDValue();
+
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ if (IdxVal == 0) // the operation is legal
+ return Op;
+
+ MVT VecVT = Vec.getSimpleValueType();
+ unsigned NumElems = VecVT.getVectorNumElements();
+
+ // Extend to natively supported kshift.
+ MVT WideVecVT = VecVT;
+ if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
+ WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
+ DAG.getUNDEF(WideVecVT), Vec,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ // Shift to the LSB.
+ Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
+ DAG.getConstant(IdxVal, dl, MVT::i8));
+
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
+ DAG.getIntPtrConstant(0, dl));
+}
+
// Returns the appropriate wrapper opcode for a global reference.
-unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
+unsigned X86TargetLowering::getGlobalWrapperKind(
+ const GlobalValue *GV, const unsigned char OpFlags) const {
// References to absolute symbols are never PC-relative.
if (GV && GV->isAbsoluteSymbolRef())
return X86ISD::Wrapper;
@@ -15031,6 +15718,10 @@ unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
(M == CodeModel::Small || M == CodeModel::Kernel))
return X86ISD::WrapperRIP;
+ // GOTPCREL references must always use RIP.
+ if (OpFlags == X86II::MO_GOTPCREL)
+ return X86ISD::WrapperRIP;
+
return X86ISD::Wrapper;
}
@@ -15154,7 +15845,7 @@ SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
}
- Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);
+ Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (isGlobalRelativeToPICBase(OpFlags)) {
@@ -15336,7 +16027,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
- if (DAG.getTarget().Options.EmulatedTLS)
+ if (DAG.getTarget().useEmulatedTLS())
return LowerToTLSEmulatedModel(GA, DAG);
const GlobalValue *GV = GA->getGlobal();
@@ -15456,7 +16147,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
auto &DL = DAG.getDataLayout();
SDValue Scale =
- DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
+ DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
@@ -15512,24 +16203,47 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
// values for large shift amounts.
SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
DAG.getConstant(VTBits, dl, MVT::i8));
- SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
- AndNode, DAG.getConstant(0, dl, MVT::i8));
+ SDValue Cond = DAG.getSetCC(dl, MVT::i8, AndNode,
+ DAG.getConstant(0, dl, MVT::i8), ISD::SETNE);
SDValue Hi, Lo;
- SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
- SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
- SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
-
if (Op.getOpcode() == ISD::SHL_PARTS) {
- Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
- Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
+ Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
+ Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
} else {
- Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
- Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
+ Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
+ Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
}
- SDValue Ops[2] = { Lo, Hi };
- return DAG.getMergeValues(Ops, dl);
+ return DAG.getMergeValues({ Lo, Hi }, dl);
+}
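
The SHL_PARTS/SRL_PARTS lowering above selects between a "small shift" and a "large shift" result based on whether the shift amount has the VTBits bit set (32 for i32 halves). A rough standalone sketch of that decomposition for a 64-bit left shift built from 32-bit halves; it simplifies the real x86 shift semantics, for example by guarding the shift-by-zero case explicitly:

#include <cassert>
#include <cstdint>

static void shl64Parts(uint32_t Lo, uint32_t Hi, unsigned Amt,
                       uint32_t &OutLo, uint32_t &OutHi) {
  unsigned Sh = Amt & 31;
  // Results assuming Amt < 32. (Sh == 0 needs care because x >> 32 is UB.)
  uint32_t SmallLo = Lo << Sh;
  uint32_t SmallHi = Sh == 0 ? Hi : (Hi << Sh) | (Lo >> (32 - Sh));
  // Results assuming 32 <= Amt < 64: the low half moves into the high half.
  uint32_t BigLo = 0;
  uint32_t BigHi = Lo << Sh;
  bool Big = (Amt & 32) != 0;      // the SETNE(Amt & VTBits, 0) test
  OutLo = Big ? BigLo : SmallLo;   // the two selects
  OutHi = Big ? BigHi : SmallHi;
}

int main() {
  uint32_t Lo, Hi;
  shl64Parts(0x89ABCDEFu, 0x01234567u, 8, Lo, Hi);
  assert(Lo == 0xABCDEF00u && Hi == 0x23456789u);
  shl64Parts(0x89ABCDEFu, 0x01234567u, 40, Lo, Hi);
  assert(Lo == 0 && Hi == 0xABCDEF00u);
  return 0;
}
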
+
+// Try to use a packed vector operation to handle i64 on 32-bit targets when
+// AVX512DQ is enabled.
+static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert((Op.getOpcode() == ISD::SINT_TO_FP ||
+ Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!");
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+ MVT VT = Op.getSimpleValueType();
+
+ if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
+ (VT != MVT::f32 && VT != MVT::f64))
+ return SDValue();
+
+ // Pack the i64 into a vector, do the operation and extract.
+
+ // Using 256-bit to ensure result is 128-bits for f32 case.
+ unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
+ MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
+ MVT VecVT = MVT::getVectorVT(VT, NumElts);
+
+ SDLoc dl(Op);
+ SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
+ SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
+ DAG.getIntPtrConstant(0, dl));
}
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
@@ -15545,20 +16259,6 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
DAG.getUNDEF(SrcVT)));
}
- if (SrcVT.getVectorElementType() == MVT::i1) {
- if (SrcVT == MVT::v2i1) {
- // For v2i1, we need to widen to v4i1 first.
- assert(VT == MVT::v2f64 && "Unexpected type");
- Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Src,
- DAG.getUNDEF(MVT::v2i1));
- return DAG.getNode(X86ISD::CVTSI2P, dl, Op.getValueType(),
- DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Src));
- }
-
- MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
- return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
- DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
- }
return SDValue();
}
@@ -15567,15 +16267,17 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
// These are really Legal; return the operand so the caller accepts it as
// Legal.
- if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
+ if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(VT))
return Op;
- if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
- Subtarget.is64Bit()) {
+ if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) && Subtarget.is64Bit()) {
return Op;
}
+ if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
+ return V;
+
SDValue ValueToStore = Op.getOperand(0);
- if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
+ if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) &&
!Subtarget.is64Bit())
// Bitcasting to f64 here allows us to do a single 64-bit store from
// an SSE register, avoiding the store forwarding penalty that would come
@@ -15760,7 +16462,8 @@ static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
}
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
- const X86Subtarget &Subtarget, SDLoc &DL) {
+ const X86Subtarget &Subtarget,
+ const SDLoc &DL) {
if (Op.getSimpleValueType() != MVT::v2f64)
return SDValue();
@@ -15894,21 +16597,6 @@ static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
MVT SrcVT = N0.getSimpleValueType();
SDLoc dl(Op);
- if (SrcVT.getVectorElementType() == MVT::i1) {
- if (SrcVT == MVT::v2i1) {
- // For v2i1, we need to widen to v4i1 first.
- assert(Op.getValueType() == MVT::v2f64 && "Unexpected type");
- N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, N0,
- DAG.getUNDEF(MVT::v2i1));
- return DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v2f64,
- DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0));
- }
-
- MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
- return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
- DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0));
- }
-
switch (SrcVT.SimpleTy) {
default:
llvm_unreachable("Custom UINT_TO_FP is not supported!");
@@ -15940,6 +16628,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
return Op;
}
+ if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
+ return V;
+
if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
if (SrcVT == MVT::i32 && X86ScalarSSEf64)
@@ -16205,15 +16896,17 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
MVT InVT = In.getSimpleValueType();
SDLoc dl(Op);
- if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
- (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
- (VT != MVT::v16i16 || InVT != MVT::v16i8) &&
- (VT != MVT::v8i64 || InVT != MVT::v8i32) &&
- (VT != MVT::v8i64 || InVT != MVT::v8i16) &&
- (VT != MVT::v16i32 || InVT != MVT::v16i16) &&
- (VT != MVT::v16i32 || InVT != MVT::v16i8) &&
- (VT != MVT::v32i16 || InVT != MVT::v32i8))
- return SDValue();
+ assert(VT.isVector() && InVT.isVector() && "Expected vector type");
+ assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
+ "Expected same number of elements");
+ assert((VT.getVectorElementType() == MVT::i16 ||
+ VT.getVectorElementType() == MVT::i32 ||
+ VT.getVectorElementType() == MVT::i64) &&
+ "Unexpected element type");
+ assert((InVT.getVectorElementType() == MVT::i8 ||
+ InVT.getVectorElementType() == MVT::i16 ||
+ InVT.getVectorElementType() == MVT::i32) &&
+ "Unexpected element type");
if (Subtarget.hasInt256())
return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
@@ -16246,6 +16939,20 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
+// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
+static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
+ const SDLoc &dl, SelectionDAG &DAG) {
+ assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
+ DAG.getIntPtrConstant(0, dl));
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
+ DAG.getIntPtrConstant(8, dl));
+ Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
+ Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+}
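
SplitAndExtendv16i1 above avoids a single 512-bit extend by extending each 8-bit half of the mask to 16-bit lanes, concatenating, and truncating to the final element width. A scalar model of the sign-extend flavour; widths and values are illustrative:

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  uint16_t Mask = 0b1010'0000'0000'0101; // bits 0, 2, 13, 15 set
  std::array<int8_t, 16> Result;
  for (int Half = 0; Half != 2; ++Half)
    for (int i = 0; i != 8; ++i) {
      bool Bit = (Mask >> (Half * 8 + i)) & 1;
      int16_t Lane = Bit ? -1 : 0;                       // SIGN_EXTEND to v8i16
      Result[Half * 8 + i] = static_cast<int8_t>(Lane);  // TRUNCATE to v16i8
    }
  assert(Result[0] == -1 && Result[1] == 0 && Result[2] == -1);
  assert(Result[13] == -1 && Result[15] == -1);
  return 0;
}
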
+
static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -16256,11 +16963,23 @@ static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
SDLoc DL(Op);
unsigned NumElts = VT.getVectorNumElements();
- // Extend VT if the scalar type is v8/v16 and BWI is not supported.
+ // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
+ // avoids a constant pool load.
+ if (VT.getVectorElementType() != MVT::i8) {
+ SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
+ return DAG.getNode(ISD::SRL, DL, VT, Extend,
+ DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
+ }
+
+ // Extend VT if BWI is not supported.
MVT ExtVT = VT;
- if (!Subtarget.hasBWI() &&
- (VT.getVectorElementType().getSizeInBits() <= 16))
+ if (!Subtarget.hasBWI()) {
+ // If v16i32 is to be avoided, we'll need to split and concatenate.
+ if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
+ return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
+
ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
+ }
// Widen to 512-bits if VLX is not supported.
MVT WideVT = ExtVT;
@@ -16278,9 +16997,9 @@ static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
- // Truncate if we had to extend i16/i8 above.
+ // Truncate if we had to extend above.
if (VT != ExtVT) {
- WideVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
+ WideVT = MVT::getVectorVT(MVT::i8, NumElts);
SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
}
@@ -16300,14 +17019,8 @@ static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
if (SVT.getVectorElementType() == MVT::i1)
return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
- if (Subtarget.hasFp256())
- if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
- return Res;
-
- assert(!Op.getSimpleValueType().is256BitVector() || !SVT.is128BitVector() ||
- Op.getSimpleValueType().getVectorNumElements() !=
- SVT.getVectorNumElements());
- return SDValue();
+ assert(Subtarget.hasAVX() && "Expected AVX support");
+ return LowerAVXExtend(Op, DAG, Subtarget);
}
/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
@@ -16321,8 +17034,8 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
"Unexpected PACK opcode");
- // Requires SSE2 but AVX512 has fast truncate.
- if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
+ // Requires SSE2 but AVX512 has fast vector truncate.
+ if (!Subtarget.hasSSE2() || Subtarget.hasAVX512() || !DstVT.isVector())
return SDValue();
EVT SrcVT = In.getValueType();
@@ -16331,40 +17044,53 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
if (SrcVT == DstVT)
return In;
- // We only support vector truncation to 128bits or greater from a
- // 256bits or greater source.
+ // We only support vector truncation to 64bits or greater from a
+ // 128bits or greater source.
unsigned DstSizeInBits = DstVT.getSizeInBits();
unsigned SrcSizeInBits = SrcVT.getSizeInBits();
- if ((DstSizeInBits % 128) != 0 || (SrcSizeInBits % 256) != 0)
+ if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
return SDValue();
- LLVMContext &Ctx = *DAG.getContext();
unsigned NumElems = SrcVT.getVectorNumElements();
+ if (!isPowerOf2_32(NumElems))
+ return SDValue();
+
+ LLVMContext &Ctx = *DAG.getContext();
assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
- // Extract lower/upper subvectors.
- unsigned NumSubElts = NumElems / 2;
- SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
- SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
-
// Pack to the largest type possible:
// vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
EVT InVT = MVT::i16, OutVT = MVT::i8;
- if (DstVT.getScalarSizeInBits() > 8 &&
+ if (SrcVT.getScalarSizeInBits() > 16 &&
(Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
InVT = MVT::i32;
OutVT = MVT::i16;
}
+ // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
+ if (SrcVT.is128BitVector()) {
+ InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
+ OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
+ In = DAG.getBitcast(InVT, In);
+ SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, In);
+ Res = extractSubVector(Res, 0, DAG, DL, 64);
+ return DAG.getBitcast(DstVT, Res);
+ }
+
+ // Extract lower/upper subvectors.
+ unsigned NumSubElts = NumElems / 2;
+ SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
+ SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
+
unsigned SubSizeInBits = SrcSizeInBits / 2;
InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
// 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
- if (SrcVT.is256BitVector()) {
+ if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
Lo = DAG.getBitcast(InVT, Lo);
Hi = DAG.getBitcast(InVT, Hi);
SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
@@ -16393,7 +17119,7 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
}
// Recursively pack lower/upper subvectors, concat result and pack again.
- assert(SrcSizeInBits >= 512 && "Expected 512-bit vector or greater");
+ assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
@@ -16418,18 +17144,49 @@ static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
if (InVT.getScalarSizeInBits() <= 16) {
if (Subtarget.hasBWI()) {
// legal, will go to VPMOVB2M, VPMOVW2M
- // Shift packed bytes not supported natively, bitcast to word
- MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
- SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
- DAG.getBitcast(ExtVT, In),
- DAG.getConstant(ShiftInx, DL, ExtVT));
- ShiftNode = DAG.getBitcast(InVT, ShiftNode);
- return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
+ if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
+ // We need to shift to get the lsb into sign position.
+ // Shift packed bytes not supported natively, bitcast to word
+ MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
+ In = DAG.getNode(ISD::SHL, DL, ExtVT,
+ DAG.getBitcast(ExtVT, In),
+ DAG.getConstant(ShiftInx, DL, ExtVT));
+ In = DAG.getBitcast(InVT, In);
+ }
+ return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
+ In, ISD::SETGT);
}
// Use TESTD/Q, extended vector to packed dword/qword.
assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
"Unexpected vector type.");
unsigned NumElts = InVT.getVectorNumElements();
+ assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
+ // We need to change to a wider element type that we have support for.
+ // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
+ // For 16 element vectors we extend to v16i32 unless we are explicitly
+ // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
+ // we need to split into two 8 element vectors which we can extend to v8i32,
+ // truncate and concat the results. There's an additional complication if
+ // the original type is v16i8. In that case we can't split the v16i8 so
+ // first we pre-extend it to v16i16 which we can split to v8i16, then extend
+ // to v8i32, truncate that to v8i1 and concat the two halves.
+ if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
+ if (InVT == MVT::v16i8) {
+ // First we need to sign extend up to 256-bits so we can split that.
+ InVT = MVT::v16i16;
+ In = DAG.getNode(ISD::SIGN_EXTEND, DL, InVT, In);
+ }
+ SDValue Lo = extract128BitVector(In, 0, DAG, DL);
+ SDValue Hi = extract128BitVector(In, 8, DAG, DL);
+ // We're split now, just emit two truncates and a concat. The two
+ // truncates will trigger legalization to come back to this function.
+ Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
+ Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+ }
+ // We either have 8 elements or we're allowed to use 512-bit vectors.
+ // If we have VLX, we want to use the narrowest vector that can get the
+ // job done so we use vXi32.
MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
@@ -16437,9 +17194,17 @@ static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
ShiftInx = InVT.getScalarSizeInBits() - 1;
}
- SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
- DAG.getConstant(ShiftInx, DL, InVT));
- return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
+ if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
+ // We need to shift to get the lsb into sign position.
+ In = DAG.getNode(ISD::SHL, DL, InVT, In,
+ DAG.getConstant(ShiftInx, DL, InVT));
+ }
+ // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
+ if (Subtarget.hasDQI())
+ return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
+ In, ISD::SETGT);
+ return DAG.getSetCC(DL, VT, In, getZeroVector(InVT, Subtarget, DAG, DL),
+ ISD::SETNE);
}
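
The vXi1 truncate above keeps only the low bit of each lane, either by shifting it into the sign position and testing 0 > x (the DQI path) or by comparing against zero. A scalar sketch showing the two forms select the same bit; it assumes two's-complement conversions, as on x86:

#include <cassert>
#include <cstdint>

static bool truncToI1(int32_t Lane) {
  // Shift the lsb into the sign position, then test the sign bit.
  int32_t Shifted = static_cast<int32_t>(static_cast<uint32_t>(Lane) << 31);
  bool ViaSignBit = 0 > Shifted;        // SETGT(0, x)
  bool ViaLowBit = (Lane & 1) != 0;     // direct lsb test
  assert(ViaSignBit == ViaLowBit && "both forms select the same bit");
  return ViaSignBit;
}

int main() {
  assert(truncToI1(1));
  assert(!truncToI1(2));
  assert(truncToI1(-3));   // lsb of -3 is 1
  assert(!truncToI1(0));
  return 0;
}
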
SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
@@ -16458,31 +17223,36 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
// vpmovqb/w/d, vpmovdb/w, vpmovwb
if (Subtarget.hasAVX512()) {
// word to byte only under BWI
- if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
- return DAG.getNode(X86ISD::VTRUNC, DL, VT,
- getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG));
- return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
+ if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) { // v16i16 -> v16i8
+ // Make sure we're allowed to promote 512-bits.
+ if (Subtarget.canExtendTo512DQ())
+ return DAG.getNode(ISD::TRUNCATE, DL, VT,
+ DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
+ } else {
+ return Op;
+ }
}
- // Truncate with PACKSS if we are truncating a vector with sign-bits that
- // extend all the way to the packed/truncated value.
- unsigned NumPackedBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
- if ((InNumEltBits - NumPackedBits) < DAG.ComputeNumSignBits(In))
- if (SDValue V =
- truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
- return V;
+ unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
+ unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
// Truncate with PACKUS if we are truncating a vector with leading zero bits
// that extend all the way to the packed/truncated value.
// Pre-SSE41 we can only use PACKUSWB.
KnownBits Known;
DAG.computeKnownBits(In, Known);
- NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
- if ((InNumEltBits - NumPackedBits) <= Known.countMinLeadingZeros())
+ if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
if (SDValue V =
truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
return V;
+ // Truncate with PACKSS if we are truncating a vector with sign-bits that
+ // extend all the way to the packed/truncated value.
+ if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
+ if (SDValue V =
+ truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
+ return V;
+
if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
// On AVX2, v4i64 -> v4i32 becomes VPERMD.
if (Subtarget.hasInt256()) {
@@ -16549,10 +17319,9 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
}
// Handle truncation of V256 to V128 using shuffles.
- if (!VT.is128BitVector() || !InVT.is256BitVector())
- return SDValue();
+ assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
- assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
+ assert(Subtarget.hasAVX() && "256-bit vector without AVX!");
unsigned NumElems = VT.getVectorNumElements();
MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
@@ -16572,9 +17341,29 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
MVT VT = Op.getSimpleValueType();
if (VT.isVector()) {
- assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
SDValue Src = Op.getOperand(0);
SDLoc dl(Op);
+
+ if (VT == MVT::v2i1 && Src.getSimpleValueType() == MVT::v2f64) {
+ MVT ResVT = MVT::v4i32;
+ MVT TruncVT = MVT::v4i1;
+ unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
+ if (!IsSigned && !Subtarget.hasVLX()) {
+ // Widen to 512-bits.
+ ResVT = MVT::v8i32;
+ TruncVT = MVT::v8i1;
+ Opc = ISD::FP_TO_UINT;
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
+ DAG.getUNDEF(MVT::v8f64),
+ Src, DAG.getIntPtrConstant(0, dl));
+ }
+ SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
+ Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
@@ -16771,8 +17560,16 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
return Res;
}
+/// Helper for creating a X86ISD::SETCC node.
+static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
+ SelectionDAG &DAG) {
+ return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
+}
+
// Check whether an OR'd tree is PTEST-able.
-static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
+static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
@@ -16859,10 +17656,12 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
}
- return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
+ SDValue Res = DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
+ VecIns.back(), VecIns.back());
+ return getSETCC(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, Res, DL, DAG);
}
-/// \brief return true if \c Op has a use that doesn't just read flags.
+/// return true if \c Op has a use that doesn't just read flags.
static bool hasNonFlagsUse(SDValue Op) {
for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
++UI) {
@@ -16881,33 +17680,10 @@ static bool hasNonFlagsUse(SDValue Op) {
return false;
}
-// Emit KTEST instruction for bit vectors on AVX-512
-static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- if (Op.getOpcode() == ISD::BITCAST) {
- auto hasKTEST = [&](MVT VT) {
- unsigned SizeInBits = VT.getSizeInBits();
- return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
- (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
- };
- SDValue Op0 = Op.getOperand(0);
- MVT Op0VT = Op0.getValueType().getSimpleVT();
- if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
- hasKTEST(Op0VT))
- return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
- }
- return SDValue();
-}
-
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
SelectionDAG &DAG) const {
- if (Op.getValueType() == MVT::i1) {
- SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
- return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
- DAG.getConstant(0, dl, MVT::i8));
- }
// CF and OF aren't always set the way we want. Determine which
// of these we need.
bool NeedCF = false;
@@ -16943,9 +17719,6 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
// doing a separate TEST. TEST always sets OF and CF to 0, so unless
// we prove that the arithmetic won't overflow, we can't use OF or CF.
if (Op.getResNo() != 0 || NeedOF || NeedCF) {
- // Emit KTEST for bit vectors
- if (auto Node = EmitKTEST(Op, DAG, Subtarget))
- return Node;
// Emit a CMP with 0, which is the TEST pattern.
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, Op.getValueType()));
@@ -17119,14 +17892,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
case ISD::SUB: Opcode = X86ISD::SUB; break;
case ISD::XOR: Opcode = X86ISD::XOR; break;
case ISD::AND: Opcode = X86ISD::AND; break;
- case ISD::OR: {
- if (!NeedTruncation && ZeroCheck) {
- if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
- return EFLAGS;
- }
- Opcode = X86ISD::OR;
- break;
- }
+ case ISD::OR: Opcode = X86ISD::OR; break;
}
NumOperands = 2;
@@ -17168,16 +17934,13 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
- Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ Op = DAG.getNode(ConvertedOp, dl, VTs, V0, V1);
}
}
}
if (Opcode == 0) {
- // Emit KTEST for bit vectors
- if (auto Node = EmitKTEST(Op, DAG, Subtarget))
- return Node;
-
// Emit a CMP with 0, which is the TEST pattern.
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, Op.getValueType()));
@@ -17186,7 +17949,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
- DAG.ReplaceAllUsesWith(Op, New);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
return SDValue(New.getNode(), 1);
}
@@ -17271,7 +18034,6 @@ SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
EVT VT = Op.getValueType();
// SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
- // TODO: Add support for AVX512 (v16f32).
// It is likely not profitable to do this for f64 because a double-precision
// rsqrt estimate with refinement on x86 prior to FMA requires at least 16
// instructions: convert to single, rsqrtss, convert back to double, refine
@@ -17282,12 +18044,15 @@ SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
(VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
(VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
- (VT == MVT::v8f32 && Subtarget.hasAVX())) {
+ (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
+ (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
if (RefinementSteps == ReciprocalEstimate::Unspecified)
RefinementSteps = 1;
UseOneConstNR = false;
- return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
+ // There is no FRSQRT for 512-bits, but there is RSQRT14.
+ unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
+ return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
}
return SDValue();
}
@@ -17300,7 +18065,6 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
EVT VT = Op.getValueType();
// SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
- // TODO: Add support for AVX512 (v16f32).
// It is likely not profitable to do this for f64 because a double-precision
// reciprocal estimate with refinement on x86 prior to FMA requires
// 15 instructions: convert to single, rcpss, convert back to double, refine
@@ -17309,7 +18073,8 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
(VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
- (VT == MVT::v8f32 && Subtarget.hasAVX())) {
+ (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
+ (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
// Enable estimate codegen with 1 refinement step for vector division.
// Scalar division estimates are disabled because they break too much
// real-world code. These defaults are intended to match GCC behavior.
@@ -17319,7 +18084,9 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
if (RefinementSteps == ReciprocalEstimate::Unspecified)
RefinementSteps = 1;
- return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
+ // There is no FRCP for 512-bits, but there is RCP14.
+ unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
+ return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
}
return SDValue();
}
@@ -17334,13 +18101,6 @@ unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
return 2;
}
-/// Helper for creating a X86ISD::SETCC node.
-static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
- SelectionDAG &DAG) {
- return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
-}
-
/// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
/// according to equal/not-equal condition code \p CC.
static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
@@ -17408,12 +18168,15 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
LHS = AndLHS.getOperand(0);
RHS = AndLHS.getOperand(1);
- }
-
- // Use BT if the immediate can't be encoded in a TEST instruction.
- if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
- LHS = AndLHS;
- RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
+ } else {
+ // Use BT if the immediate can't be encoded in a TEST instruction or we
+ // are optimizing for size and the immediate won't fit in a byte.
+ bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
+ if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
+ isPowerOf2_64(AndRHSVal)) {
+ LHS = AndLHS;
+ RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
+ }
}
}
@@ -17498,49 +18261,6 @@ static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
}
-static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
- SDValue Op0 = Op.getOperand(0);
- SDValue Op1 = Op.getOperand(1);
- SDValue CC = Op.getOperand(2);
- MVT VT = Op.getSimpleValueType();
- SDLoc dl(Op);
-
- assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
- "Unexpected type for boolean compare operation");
- ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
- SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
- DAG.getConstant(-1, dl, VT));
- SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
- DAG.getConstant(-1, dl, VT));
- switch (SetCCOpcode) {
- default: llvm_unreachable("Unexpected SETCC condition");
- case ISD::SETEQ:
- // (x == y) -> ~(x ^ y)
- return DAG.getNode(ISD::XOR, dl, VT,
- DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
- DAG.getConstant(-1, dl, VT));
- case ISD::SETNE:
- // (x != y) -> (x ^ y)
- return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
- case ISD::SETUGT:
- case ISD::SETGT:
- // (x > y) -> (x & ~y)
- return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
- case ISD::SETULT:
- case ISD::SETLT:
- // (x < y) -> (~x & y)
- return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
- case ISD::SETULE:
- case ISD::SETLE:
- // (x <= y) -> (~x | y)
- return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
- case ISD::SETUGE:
- case ISD::SETGE:
- // (x >=y) -> (x | ~y)
- return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
- }
-}
-
static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
SDValue Op0 = Op.getOperand(0);
@@ -17553,48 +18273,24 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
"Cannot set masked compare for this operation");
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
- unsigned Opc = 0;
- bool Unsigned = false;
- bool Swap = false;
- unsigned SSECC;
- switch (SetCCOpcode) {
- default: llvm_unreachable("Unexpected SETCC condition");
- case ISD::SETNE: SSECC = 4; break;
- case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break;
- case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
- case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
- case ISD::SETGT: Opc = X86ISD::PCMPGTM; break;
- case ISD::SETULT: SSECC = 1; Unsigned = true; break;
- case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
- case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap
- case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
- case ISD::SETLE: SSECC = 2; break;
- }
- if (Swap)
+ // If this is a seteq/setne, make sure any build vectors of all zeros are on the RHS.
+ // This helps with vptestm matching.
+ // TODO: Should we just canonicalize the setcc during DAG combine?
+ if ((SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE) &&
+ ISD::isBuildVectorAllZeros(Op0.getNode()))
std::swap(Op0, Op1);
- // See if it is the case of CMP(EQ|NEQ,AND(A,B),ZERO) and change it to TESTM|NM.
- if ((!Opc && SSECC == 4) || Opc == X86ISD::PCMPEQM) {
- SDValue A = peekThroughBitcasts(Op0);
- if ((A.getOpcode() == ISD::AND || A.getOpcode() == X86ISD::FAND) &&
- ISD::isBuildVectorAllZeros(Op1.getNode())) {
- MVT VT0 = Op0.getSimpleValueType();
- SDValue RHS = DAG.getBitcast(VT0, A.getOperand(0));
- SDValue LHS = DAG.getBitcast(VT0, A.getOperand(1));
- return DAG.getNode(Opc == X86ISD::PCMPEQM ? X86ISD::TESTNM : X86ISD::TESTM,
- dl, VT, RHS, LHS);
- }
+ // Prefer SETGT over SETLT.
+ if (SetCCOpcode == ISD::SETLT) {
+ SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
+ std::swap(Op0, Op1);
}
- if (Opc)
- return DAG.getNode(Opc, dl, VT, Op0, Op1);
- Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
- return DAG.getNode(Opc, dl, VT, Op0, Op1,
- DAG.getConstant(SSECC, dl, MVT::i8));
+ return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
}
-/// \brief Try to turn a VSETULT into a VSETULE by modifying its second
+/// Try to turn a VSETULT into a VSETULE by modifying its second
/// operand \p Op1. If non-trivial (for example because it's not constant)
/// return an empty value.
static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
@@ -17624,6 +18320,51 @@ static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
return DAG.getBuildVector(VT, dl, ULTOp1);
}
+/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
+/// Op0 u<= Op1:
+/// t = psubus Op0, Op1
+/// pcmpeq t, <0..0>
+static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
+ ISD::CondCode Cond, const SDLoc &dl,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
+ MVT VET = VT.getVectorElementType();
+ if (VET != MVT::i8 && VET != MVT::i16)
+ return SDValue();
+
+ switch (Cond) {
+ default:
+ return SDValue();
+ case ISD::SETULT: {
+ // If the comparison is against a constant we can turn this into a
+ // setule. With psubus, setule does not require a swap. This is
+ // beneficial because the constant in the register is no longer
+ // clobbered as the destination, so it can be hoisted out of a loop.
+ // Only do this pre-AVX since vpcmp* is no longer destructive.
+ if (Subtarget.hasAVX())
+ return SDValue();
+ SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
+ if (!ULEOp1)
+ return SDValue();
+ Op1 = ULEOp1;
+ break;
+ }
+ // Psubus is better than flip-sign because it requires no inversion.
+ case ISD::SETUGE:
+ std::swap(Op0, Op1);
+ break;
+ case ISD::SETULE:
+ break;
+ }
+
+ SDValue Result = DAG.getNode(X86ISD::SUBUS, dl, VT, Op0, Op1);
+ return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
+ getZeroVector(VT, Subtarget, DAG, dl));
+}
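
The PSUBUS trick documented above relies on a saturating unsigned subtract being zero exactly when the first operand is <= the second. A brute-force scalar check of that equivalence over all i8 lane values (names are illustrative):

#include <cassert>
#include <cstdint>

static uint8_t subus(uint8_t A, uint8_t B) {   // PSUBUSB on one lane
  return A > B ? static_cast<uint8_t>(A - B) : 0;
}

int main() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B) {
      bool ViaSubus = subus(static_cast<uint8_t>(A),
                            static_cast<uint8_t>(B)) == 0;  // PCMPEQ with 0
      assert(ViaSubus == (A <= B));                          // unsigned setule
    }
  return 0;
}
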
+
static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Op0 = Op.getOperand(0);
@@ -17697,23 +18438,10 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
"Invalid number of packed elements for source and destination!");
- if (VT.is128BitVector() && VTOp0.is256BitVector()) {
- // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
- // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
- // legalizer firstly checks if the first operand in input to the setcc has
- // a legal type. If so, then it promotes the return type to that same type.
- // Otherwise, the return type is promoted to the 'next legal type' which,
- // for a vector of MVT::i1 is always a 128-bit integer vector type.
- //
- // We reach this code only if the following two conditions are met:
- // 1. Both return type and operand type have been promoted to wider types
- // by the type legalizer.
- // 2. The original operand type has been promoted to a 256-bit vector.
- //
- // Note that condition 2. only applies for AVX targets.
- SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, Cond);
- return DAG.getZExtOrTrunc(NewOp, dl, VT);
- }
+ // This is being called by type legalization because v2i32 is marked custom
+ // for result type legalization for v2f32.
+ if (VTOp0 == MVT::v2i32)
+ return SDValue();
// The non-AVX512 code below works under the assumption that source and
// destination types are the same.
@@ -17724,31 +18452,17 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
if (VT.is256BitVector() && !Subtarget.hasInt256())
return Lower256IntVSETCC(Op, DAG);
- // Operands are boolean (vectors of i1)
- MVT OpVT = Op1.getSimpleValueType();
- if (OpVT.getVectorElementType() == MVT::i1)
- return LowerBoolVSETCC_AVX512(Op, DAG);
-
// The result is boolean, but operands are int/float
if (VT.getVectorElementType() == MVT::i1) {
// In AVX-512 architecture setcc returns mask with i1 elements,
// But there is no compare instruction for i8 and i16 elements in KNL.
- // In this case use SSE compare
- bool UseAVX512Inst =
- (OpVT.is512BitVector() ||
- OpVT.getScalarSizeInBits() >= 32 ||
- (Subtarget.hasBWI() && Subtarget.hasVLX()));
-
- if (UseAVX512Inst)
- return LowerIntVSETCC_AVX512(Op, DAG);
-
- return DAG.getNode(ISD::TRUNCATE, dl, VT,
- DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
+ assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
+ "Unexpected operand type");
+ return LowerIntVSETCC_AVX512(Op, DAG);
}
// Lower using XOP integer comparisons.
- if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
- VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
+ if (VT.is128BitVector() && Subtarget.hasXOP()) {
// Translate compare code to XOP PCOM compare mode.
unsigned CmpMode = 0;
switch (Cond) {
@@ -17791,15 +18505,18 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
}
}
- // We are handling one of the integer comparisons here. Since SSE only has
- // GT and EQ comparisons for integer, swapping operands and multiple
- // operations may be required for some comparisons.
- unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
- : X86ISD::PCMPGT;
- bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
- Cond == ISD::SETGE || Cond == ISD::SETUGE;
- bool Invert = Cond == ISD::SETNE ||
- (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
+ // If this is a SETNE against the signed minimum value, change it to SETGT.
+ // If this is a SETNE against the signed maximum value, change it to SETLT.
+ // which will be swapped to SETGT.
+ // Otherwise we use PCMPEQ+invert.
+ APInt ConstValue;
+ if (Cond == ISD::SETNE &&
+ ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
+ if (ConstValue.isMinSignedValue())
+ Cond = ISD::SETGT;
+ else if (ConstValue.isMaxSignedValue())
+ Cond = ISD::SETLT;
+ }
// If both operands are known non-negative, then an unsigned compare is the
// same as a signed compare and there's no need to flip signbits.
@@ -17808,58 +18525,47 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
!(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
- // Special case: Use min/max operations for SETULE/SETUGE
- MVT VET = VT.getVectorElementType();
- bool HasMinMax =
- (Subtarget.hasAVX512() && VET == MVT::i64) ||
- (Subtarget.hasSSE41() && (VET == MVT::i16 || VET == MVT::i32)) ||
- (Subtarget.hasSSE2() && (VET == MVT::i8));
- bool MinMax = false;
- if (HasMinMax) {
+ // Special case: Use min/max operations for unsigned compares. We only want
+ // to do this for unsigned compares if we need to flip signs or if it allows
+ // us to avoid an invert.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (ISD::isUnsignedIntSetCC(Cond) &&
+ (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
+ TLI.isOperationLegal(ISD::UMIN, VT)) {
+ bool Invert = false;
+ unsigned Opc;
switch (Cond) {
- default: break;
- case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
- case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
+ default: llvm_unreachable("Unexpected condition code");
+ case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
+ case ISD::SETULE: Opc = ISD::UMIN; break;
+ case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
+ case ISD::SETUGE: Opc = ISD::UMAX; break;
}
- if (MinMax)
- Swap = Invert = FlipSigns = false;
- }
+ SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+ Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
- bool HasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
- bool Subus = false;
- if (!MinMax && HasSubus) {
- // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
- // Op0 u<= Op1:
- // t = psubus Op0, Op1
- // pcmpeq t, <0..0>
- switch (Cond) {
- default: break;
- case ISD::SETULT: {
- // If the comparison is against a constant we can turn this into a
- // setule. With psubus, setule does not require a swap. This is
- // beneficial because the constant in the register is no longer
- // destructed as the destination so it can be hoisted out of a loop.
- // Only do this pre-AVX since vpcmp* is no longer destructive.
- if (Subtarget.hasAVX())
- break;
- if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
- Op1 = ULEOp1;
- Subus = true; Invert = false; Swap = false;
- }
- break;
- }
- // Psubus is better than flip-sign because it requires no inversion.
- case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
- case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
- }
+ // If the logical-not of the result is required, perform that now.
+ if (Invert)
+ Result = DAG.getNOT(dl, Result, VT);
- if (Subus) {
- Opc = X86ISD::SUBUS;
- FlipSigns = false;
- }
+ return Result;
}
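
The UMIN/UMAX special case above uses the identities a <= b ⇔ umin(a, b) == a and a >= b ⇔ umax(a, b) == a, inverting for the strict forms, which is why Invert is set for SETUGT/SETULT. A small exhaustive check over i8 values (purely illustrative):

#include <algorithm>
#include <cassert>

int main() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B) {
      bool ULE = std::min(A, B) == A;   // UMIN + PCMPEQ
      bool UGE = std::max(A, B) == A;   // UMAX + PCMPEQ
      assert(ULE == (A <= B));
      assert(UGE == (A >= B));
      assert(!ULE == (A > B));          // SETUGT via Invert
      assert(!UGE == (A < B));          // SETULT via Invert
    }
  return 0;
}
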
+ // Try to use SUBUS and PCMPEQ.
+ if (SDValue V = LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
+ return V;
+
+ // We are handling one of the integer comparisons here. Since SSE only has
+ // GT and EQ comparisons for integer, swapping operands and multiple
+ // operations may be required for some comparisons.
+ unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
+ : X86ISD::PCMPGT;
+ bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
+ Cond == ISD::SETGE || Cond == ISD::SETUGE;
+ bool Invert = Cond == ISD::SETNE ||
+ (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
+
if (Swap)
std::swap(Op0, Op1);
@@ -17947,14 +18653,47 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
if (Invert)
Result = DAG.getNOT(dl, Result, VT);
- if (MinMax)
- Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
+ return Result;
+}
- if (Subus)
- Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
- getZeroVector(VT, Subtarget, DAG, dl));
+// Try to select this as a KTEST+SETCC if possible.
+static SDValue EmitKTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC,
+ const SDLoc &dl, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // Only support equality comparisons.
+ if (CC != ISD::SETEQ && CC != ISD::SETNE)
+ return SDValue();
- return Result;
+ // Must be a bitcast from vXi1.
+ if (Op0.getOpcode() != ISD::BITCAST)
+ return SDValue();
+
+ Op0 = Op0.getOperand(0);
+ MVT VT = Op0.getSimpleValueType();
+ if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
+ !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
+ !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
+ return SDValue();
+
+ X86::CondCode X86CC;
+ if (isNullConstant(Op1)) {
+ X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
+ } else if (isAllOnesConstant(Op1)) {
+ // C flag is set for all ones.
+ X86CC = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
+ } else
+ return SDValue();
+
+ // If the input is an OR, we can combine its operands into the KORTEST.
+ SDValue LHS = Op0;
+ SDValue RHS = Op0;
+ if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
+ LHS = Op0.getOperand(0);
+ RHS = Op0.getOperand(1);
+ }
+
+ SDValue KORTEST = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
+ return getSETCC(X86CC, KORTEST, dl, DAG);
}
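// A short sketch of what EmitKTEST relies on (illustrative only, not part of
// the upstream patch): KORTEST ORs its two mask operands and sets ZF when the
// result is all zeros and CF when it is all ones, so:
//   (bitcast vXi1 to iN) == 0    ->  KORTEST k, k  +  SETcc on COND_E
//   (bitcast vXi1 to iN) == ~0   ->  KORTEST k, k  +  SETcc on COND_B
// and an OR feeding the bitcast can donate its operands directly to KORTEST.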
SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
@@ -17979,6 +18718,18 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
return NewSetCC;
}
+ // Try to use PTEST for a tree ORs equality compared with 0.
+ // TODO: We could do AND tree with all 1s as well by using the C flag.
+ if (Op0.getOpcode() == ISD::OR && isNullConstant(Op1) &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ if (SDValue NewSetCC = LowerVectorAllZeroTest(Op0, CC, Subtarget, DAG))
+ return NewSetCC;
+ }
+
+ // Try to lower using KTEST.
+ if (SDValue NewSetCC = EmitKTEST(Op0, Op1, CC, dl, DAG, Subtarget))
+ return NewSetCC;
+
// Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
// these.
if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
@@ -18070,7 +18821,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// are available or VBLENDV if AVX is available.
// Otherwise FP cmovs get lowered into a less efficient branch sequence later.
if (Cond.getOpcode() == ISD::SETCC &&
- ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
+ ((Subtarget.hasSSE2() && VT == MVT::f64) ||
(Subtarget.hasSSE1() && VT == MVT::f32)) &&
VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
@@ -18132,6 +18883,18 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
}
+ // For v64i1 without 64-bit support we need to split and rejoin.
+ if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
+ assert(Subtarget.hasBWI() && "Expected BWI to be legal");
+ SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32);
+ SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32);
+ SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32);
+ SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32);
+ SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo);
+ SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+ }
+
if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
SDValue Op1Scalar;
if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
@@ -18379,6 +19142,15 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
}
}
+ // Promote i16 cmovs if it won't prevent folding a load.
+ if (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) && !MayFoldLoad(Op2)) {
+ Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
+ Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
+ SDValue Ops[] = { Op2, Op1, CC, Cond };
+ SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
+ return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
+ }
+
// X86ISD::CMOV means set the result (which is operand 1) to the RHS if
// condition is true.
SDValue Ops[] = { Op2, Op1, CC, Cond };
@@ -18399,8 +19171,13 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
// Extend VT if the scalar type is i8/i16 and BWI is not supported.
MVT ExtVT = VT;
- if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)
+ if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
+ // If v16i32 is to be avoided, we'll need to split and concatenate.
+ if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
+ return SplitAndExtendv16i1(ISD::SIGN_EXTEND, VT, In, dl, DAG);
+
ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
+ }
// Widen to 512-bits if VLX is not supported.
MVT WideVT = ExtVT;
@@ -18416,7 +19193,7 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
MVT WideEltVT = WideVT.getVectorElementType();
if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
(Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
- V = getExtendInVec(X86ISD::VSEXT, dl, WideVT, In, DAG);
+ V = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, In);
} else {
SDValue NegOne = getOnesVector(WideVT, DAG, dl);
SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, dl);
@@ -18445,11 +19222,8 @@ static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
if (InVT.getVectorElementType() == MVT::i1)
return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
- if (Subtarget.hasFp256())
- if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
- return Res;
-
- return SDValue();
+ assert(Subtarget.hasAVX() && "Expected AVX support");
+ return LowerAVXExtend(Op, DAG, Subtarget);
}
// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
@@ -18549,15 +19323,17 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
if (InVT.getVectorElementType() == MVT::i1)
return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
- if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
- (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
- (VT != MVT::v16i16 || InVT != MVT::v16i8) &&
- (VT != MVT::v8i64 || InVT != MVT::v8i32) &&
- (VT != MVT::v8i64 || InVT != MVT::v8i16) &&
- (VT != MVT::v16i32 || InVT != MVT::v16i16) &&
- (VT != MVT::v16i32 || InVT != MVT::v16i8) &&
- (VT != MVT::v32i16 || InVT != MVT::v32i8))
- return SDValue();
+ assert(VT.isVector() && InVT.isVector() && "Expected vector type");
+  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
+ "Expected same number of elements");
+ assert((VT.getVectorElementType() == MVT::i16 ||
+ VT.getVectorElementType() == MVT::i32 ||
+ VT.getVectorElementType() == MVT::i64) &&
+ "Unexpected element type");
+ assert((InVT.getVectorElementType() == MVT::i8 ||
+ InVT.getVectorElementType() == MVT::i16 ||
+ InVT.getVectorElementType() == MVT::i32) &&
+ "Unexpected element type");
if (Subtarget.hasInt256())
return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
@@ -18595,164 +19371,29 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
-// Lower truncating store. We need a special lowering to vXi1 vectors
-static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
- StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
+static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
SDLoc dl(St);
- EVT MemVT = St->getMemoryVT();
- assert(St->isTruncatingStore() && "We only custom truncating store.");
- assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
- "Expected truncstore of i1 vector");
-
- SDValue Op = St->getValue();
- MVT OpVT = Op.getValueType().getSimpleVT();
- unsigned NumElts = OpVT.getVectorNumElements();
- if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
- NumElts == 16) {
- // Truncate and store - everything is legal
- Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
- if (MemVT.getSizeInBits() < 8)
- Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
- DAG.getUNDEF(MVT::v8i1), Op,
- DAG.getIntPtrConstant(0, dl));
- return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
- St->getMemOperand());
- }
-
- // A subset, assume that we have only AVX-512F
- if (NumElts <= 8) {
- if (NumElts < 8) {
- // Extend to 8-elts vector
- MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
- Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
- DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
- }
- Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
- return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
- St->getMemOperand());
- }
- // v32i8
- assert(OpVT == MVT::v32i8 && "Unexpected operand type");
- // Divide the vector into 2 parts and store each part separately
- SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
- DAG.getIntPtrConstant(0, dl));
- Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
- SDValue BasePtr = St->getBasePtr();
- SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
- St->getMemOperand());
- SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
- DAG.getIntPtrConstant(16, dl));
- Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
-
- SDValue BasePtrHi =
- DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
- DAG.getConstant(2, dl, BasePtr.getValueType()));
-
- SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
- BasePtrHi, St->getMemOperand());
- return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
-}
-
-static SDValue LowerExtended1BitVectorLoad(SDValue Op,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
-
- LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
- SDLoc dl(Ld);
- EVT MemVT = Ld->getMemoryVT();
- assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
- "Expected i1 vector load");
- unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
- ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
- MVT VT = Op.getValueType().getSimpleVT();
- unsigned NumElts = VT.getVectorNumElements();
-
- if ((Subtarget.hasBWI() && NumElts >= 32) ||
- (Subtarget.hasDQI() && NumElts < 16) ||
- NumElts == 16) {
- // Load and extend - everything is legal
- if (NumElts < 8) {
- SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
- Ld->getBasePtr(),
- Ld->getMemOperand());
- // Replace chain users with the new chain.
- assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
- DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
- if (Subtarget.hasVLX()) {
- // Extract to v4i1/v2i1.
- SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Load,
- DAG.getIntPtrConstant(0, dl));
- // Finally, do a normal sign-extend to the desired register.
- return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Extract);
- }
-
- MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
- SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
-
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
- DAG.getIntPtrConstant(0, dl));
- }
- SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
- Ld->getBasePtr(),
- Ld->getMemOperand());
- // Replace chain users with the new chain.
- assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
- DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
-
- // Finally, do a normal sign-extend to the desired register.
- return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
- }
-
- if (NumElts <= 8) {
- // A subset, assume that we have only AVX-512F
- SDValue Load = DAG.getLoad(MVT::i8, dl, Ld->getChain(),
- Ld->getBasePtr(),
- Ld->getMemOperand());
- // Replace chain users with the new chain.
- assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
- DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
-
- SDValue BitVec = DAG.getBitcast(MVT::v8i1, Load);
-
- if (NumElts == 8)
- return DAG.getNode(ExtOpcode, dl, VT, BitVec);
-
- if (Subtarget.hasVLX()) {
- // Extract to v4i1/v2i1.
- SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, BitVec,
- DAG.getIntPtrConstant(0, dl));
- // Finally, do a normal sign-extend to the desired register.
- return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Extract);
- }
-
- MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
- SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
- DAG.getIntPtrConstant(0, dl));
- }
-
- assert(VT == MVT::v32i8 && "Unexpected extload type");
-
- SDValue BasePtr = Ld->getBasePtr();
- SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
- Ld->getBasePtr(),
- Ld->getMemOperand());
-
- SDValue BasePtrHi = DAG.getMemBasePlusOffset(BasePtr, 2, dl);
-
- SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(), BasePtrHi,
- Ld->getPointerInfo().getWithOffset(2),
- MinAlign(Ld->getAlignment(), 2U),
- Ld->getMemOperand()->getFlags());
-
- SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
- LoadLo.getValue(1), LoadHi.getValue(1));
- DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
+ SDValue StoredVal = St->getValue();
+
+  // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
+ assert(StoredVal.getValueType().isVector() &&
+ StoredVal.getValueType().getVectorElementType() == MVT::i1 &&
+ StoredVal.getValueType().getVectorNumElements() <= 8 &&
+ "Unexpected VT");
+ assert(!St->isTruncatingStore() && "Expected non-truncating store");
+ assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
+ "Expected AVX512F without AVX512DQI");
+
+ StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
+ DAG.getUNDEF(MVT::v8i1), StoredVal,
+ DAG.getIntPtrConstant(0, dl));
+ StoredVal = DAG.getBitcast(MVT::i8, StoredVal);
- SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
- SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
+ return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
+ St->getPointerInfo(), St->getAlignment(),
+ St->getMemOperand()->getFlags());
}
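// A scalar model of the store lowering above (editor's sketch, illustrative
// only, not part of the upstream patch): widening to v8i1 and bitcasting to
// i8 amounts to packing the mask bits into the low bits of a byte, element 0
// in bit 0. The padding lanes are undef in the DAG; the sketch models them
// as zero.
#include <cstdint>

static uint8_t packMaskToByte(const bool Bits[], unsigned NumElts) {
  uint8_t Byte = 0;
  for (unsigned i = 0; i != NumElts; ++i) // NumElts <= 8, lane i -> bit i
    Byte |= static_cast<uint8_t>(Bits[i]) << i;
  return Byte; // stored with a single scalar i8 store
}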
// Lower vector extended loads using a shuffle. If SSSE3 is not available we
@@ -18762,21 +19403,40 @@ static SDValue LowerExtended1BitVectorLoad(SDValue Op,
// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
// TODO: It is possible to support ZExt by zeroing the undef values during
// the shuffle phase or after the shuffle.
-static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
+static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT RegVT = Op.getSimpleValueType();
assert(RegVT.isVector() && "We only custom lower vector sext loads.");
assert(RegVT.isInteger() &&
"We only custom lower integer vector sext loads.");
- // Nothing useful we can do without SSE2 shuffles.
- assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
-
LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
SDLoc dl(Ld);
EVT MemVT = Ld->getMemoryVT();
- if (MemVT.getScalarType() == MVT::i1)
- return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
+
+ // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
+ if (RegVT.isVector() && RegVT.getVectorElementType() == MVT::i1) {
+ assert(EVT(RegVT) == MemVT && "Expected non-extending load");
+ assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
+ assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
+ "Expected AVX512F without AVX512DQI");
+
+ SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
+ Ld->getPointerInfo(), Ld->getAlignment(),
+ Ld->getMemOperand()->getFlags());
+
+ // Replace chain users with the new chain.
+ assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewLd.getValue(1));
+
+ SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
+ DAG.getBitcast(MVT::v8i1, NewLd),
+ DAG.getIntPtrConstant(0, dl));
+ return DAG.getMergeValues({Extract, NewLd.getValue(1)}, dl);
+ }
+
+ // Nothing useful we can do without SSE2 shuffles.
+ assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned RegSz = RegVT.getSizeInBits();
@@ -19619,7 +20279,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
}
-/// \brief Return Mask with the necessary casting or extending
+/// Return Mask with the necessary casting or extending
/// for \p Mask according to \p MaskVT when lowering masking intrinsics
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
@@ -19637,27 +20297,19 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
}
if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
- if (MaskVT == MVT::v64i1) {
- assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
- // In case 32bit mode, bitcast i64 is illegal, extend/split it.
- SDValue Lo, Hi;
- Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
- DAG.getConstant(0, dl, MVT::i32));
- Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
- DAG.getConstant(1, dl, MVT::i32));
-
- Lo = DAG.getBitcast(MVT::v32i1, Lo);
- Hi = DAG.getBitcast(MVT::v32i1, Hi);
-
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
- } else {
- // MaskVT require < 64bit. Truncate mask (should succeed in any case),
- // and bitcast.
- MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
- return DAG.getBitcast(MaskVT,
- DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
- }
-
+ assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
+ assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
+ // In case 32bit mode, bitcast i64 is illegal, extend/split it.
+ SDValue Lo, Hi;
+ Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
+ DAG.getConstant(0, dl, MVT::i32));
+ Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
+ DAG.getConstant(1, dl, MVT::i32));
+
+ Lo = DAG.getBitcast(MVT::v32i1, Lo);
+ Hi = DAG.getBitcast(MVT::v32i1, Hi);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
} else {
MVT BitcastVT = MVT::getVectorVT(MVT::i1,
Mask.getSimpleValueType().getSizeInBits());
@@ -19669,7 +20321,7 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
}
}
-/// \brief Return (and \p Op, \p Mask) for compare instructions or
+/// Return (and \p Op, \p Mask) for compare instructions or
/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
/// necessary casting or extending for \p Mask when lowering masking intrinsics
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
@@ -19690,11 +20342,10 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
default: break;
case X86ISD::CMPM:
case X86ISD::CMPM_RND:
- case X86ISD::CMPMU:
case X86ISD::VPSHUFBITQMB:
- return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
case X86ISD::VFPCLASS:
- return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
+ return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
+ case ISD::TRUNCATE:
case X86ISD::VTRUNC:
case X86ISD::VTRUNCS:
case X86ISD::VTRUNCUS:
@@ -19710,7 +20361,7 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
}
-/// \brief Creates an SDNode for a predicated scalar operation.
+/// Creates an SDNode for a predicated scalar operation.
/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
/// The mask is coming as MVT::i8 and it should be transformed
/// to MVT::v1i1 while lowering masking intrinsics.
@@ -19729,12 +20380,12 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
+  assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
if (Op.getOpcode() == X86ISD::FSETCCM ||
- Op.getOpcode() == X86ISD::FSETCCM_RND)
+ Op.getOpcode() == X86ISD::FSETCCM_RND ||
+ Op.getOpcode() == X86ISD::VFPCLASSS)
return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
- if (Op.getOpcode() == X86ISD::VFPCLASSS)
- return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
if (PreservedSrc.isUndef())
PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
@@ -19819,14 +20470,67 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
if (IntrData) {
switch(IntrData->Type) {
- case INTR_TYPE_1OP:
+ case INTR_TYPE_1OP: {
+ // We specify 2 possible opcodes for intrinsics with rounding modes.
+ // First, we check if the intrinsic may have non-default rounding mode,
+ // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(2);
+ if (!isRoundModeCurDirection(Rnd)) {
+ return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
+ Op.getOperand(1), Rnd);
+ }
+ }
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
+ }
case INTR_TYPE_2OP:
- return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
- Op.getOperand(2));
+ case INTR_TYPE_2OP_IMM8: {
+ SDValue Src2 = Op.getOperand(2);
+
+ if (IntrData->Type == INTR_TYPE_2OP_IMM8)
+ Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
+
+ // We specify 2 possible opcodes for intrinsics with rounding modes.
+ // First, we check if the intrinsic may have non-default rounding mode,
+ // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(3);
+ if (!isRoundModeCurDirection(Rnd)) {
+ return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
+ Op.getOperand(1), Src2, Rnd);
+ }
+ }
+
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
+ Op.getOperand(1), Src2);
+ }
case INTR_TYPE_3OP:
- return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
- Op.getOperand(2), Op.getOperand(3));
+ case INTR_TYPE_3OP_IMM8: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+
+ if (IntrData->Type == INTR_TYPE_3OP_IMM8)
+ Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
+
+ // We specify 2 possible opcodes for intrinsics with rounding modes.
+ // First, we check if the intrinsic may have non-default rounding mode,
+ // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(4);
+ if (!isRoundModeCurDirection(Rnd)) {
+ return DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, Op.getValueType(),
+ Src1, Src2, Src3, Rnd);
+ }
+ }
+
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
+ Src1, Src2, Src3);
+ }
case INTR_TYPE_4OP:
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
@@ -19927,16 +20631,12 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
RoundingMode, Sae),
Mask, Src0, Subtarget, DAG);
}
- case INTR_TYPE_2OP_MASK:
- case INTR_TYPE_2OP_IMM8_MASK: {
+ case INTR_TYPE_2OP_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
- if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
- Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
-
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
@@ -19991,26 +20691,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
- case INTR_TYPE_3OP_MASK_RM: {
- SDValue Src1 = Op.getOperand(1);
- SDValue Src2 = Op.getOperand(2);
- SDValue Imm = Op.getOperand(3);
- SDValue PassThru = Op.getOperand(4);
- SDValue Mask = Op.getOperand(5);
- // We specify 2 possible modes for intrinsics, with/without rounding
- // modes.
- // First, we check if the intrinsic have rounding mode (7 operands),
- // if not, we set rounding mode to "current".
- SDValue Rnd;
- if (Op.getNumOperands() == 7)
- Rnd = Op.getOperand(6);
- else
- Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
- Src1, Src2, Imm, Rnd),
- Mask, PassThru, Subtarget, DAG);
- }
- case INTR_TYPE_3OP_IMM8_MASK:
case INTR_TYPE_3OP_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
@@ -20018,9 +20698,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDValue PassThru = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
- if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
- Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
-
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
@@ -20038,41 +20715,13 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
- case VPERM_2OP_MASK : {
+  case VPERM_2OP: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
- SDValue PassThru = Op.getOperand(3);
- SDValue Mask = Op.getOperand(4);
-
- // Swap Src1 and Src2 in the node creation
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1),
- Mask, PassThru, Subtarget, DAG);
- }
- case VPERM_3OP_MASKZ:
- case VPERM_3OP_MASK:{
- MVT VT = Op.getSimpleValueType();
- // Src2 is the PassThru
- SDValue Src1 = Op.getOperand(1);
- // PassThru needs to be the same type as the destination in order
- // to pattern match correctly.
- SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
- SDValue Src3 = Op.getOperand(3);
- SDValue Mask = Op.getOperand(4);
- SDValue PassThru = SDValue();
-
- // set PassThru element
- if (IntrData->Type == VPERM_3OP_MASKZ)
- PassThru = getZeroVector(VT, Subtarget, DAG, dl);
- else
- PassThru = Src2;
// Swap Src1 and Src2 in the node creation
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
- dl, Op.getValueType(),
- Src2, Src1, Src3),
- Mask, PassThru, Subtarget, DAG);
+    return DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1);
}
- case FMA_OP_MASK3:
case FMA_OP_MASKZ:
case FMA_OP_MASK: {
SDValue Src1 = Op.getOperand(1);
@@ -20085,8 +20734,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
// set PassThru element
if (IntrData->Type == FMA_OP_MASKZ)
PassThru = getZeroVector(VT, Subtarget, DAG, dl);
- else if (IntrData->Type == FMA_OP_MASK3)
- PassThru = Src3;
else
PassThru = Src1;
@@ -20107,76 +20754,11 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
- case FMA_OP_SCALAR_MASK:
- case FMA_OP_SCALAR_MASK3:
- case FMA_OP_SCALAR_MASKZ: {
- SDValue Src1 = Op.getOperand(1);
- SDValue Src2 = Op.getOperand(2);
- SDValue Src3 = Op.getOperand(3);
- SDValue Mask = Op.getOperand(4);
- MVT VT = Op.getSimpleValueType();
- SDValue PassThru = SDValue();
-
- // set PassThru element
- if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
- PassThru = getZeroVector(VT, Subtarget, DAG, dl);
- else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
- PassThru = Src3;
- else
- PassThru = Src1;
-
- unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
- if (IntrWithRoundingModeOpcode != 0) {
- SDValue Rnd = Op.getOperand(5);
- if (!isRoundModeCurDirection(Rnd))
- return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl,
- Op.getValueType(), Src1, Src2,
- Src3, Rnd),
- Mask, PassThru, Subtarget, DAG);
- }
-
- return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
- Op.getValueType(), Src1, Src2,
- Src3),
- Mask, PassThru, Subtarget, DAG);
- }
- case IFMA_OP_MASKZ:
- case IFMA_OP_MASK: {
- SDValue Src1 = Op.getOperand(1);
- SDValue Src2 = Op.getOperand(2);
- SDValue Src3 = Op.getOperand(3);
- SDValue Mask = Op.getOperand(4);
- MVT VT = Op.getSimpleValueType();
- SDValue PassThru = Src1;
-
- // set PassThru element
- if (IntrData->Type == IFMA_OP_MASKZ)
- PassThru = getZeroVector(VT, Subtarget, DAG, dl);
-
- // Node we need to swizzle the operands to pass the multiply operands
+ case IFMA_OP:
+ // NOTE: We need to swizzle the operands to pass the multiply operands
// first.
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
- dl, Op.getValueType(),
- Src2, Src3, Src1),
- Mask, PassThru, Subtarget, DAG);
- }
- case TERLOG_OP_MASK:
- case TERLOG_OP_MASKZ: {
- SDValue Src1 = Op.getOperand(1);
- SDValue Src2 = Op.getOperand(2);
- SDValue Src3 = Op.getOperand(3);
- SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
- SDValue Mask = Op.getOperand(5);
- MVT VT = Op.getSimpleValueType();
- SDValue PassThru = Src1;
- // Set PassThru element.
- if (IntrData->Type == TERLOG_OP_MASKZ)
- PassThru = getZeroVector(VT, Subtarget, DAG, dl);
-
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
- Src1, Src2, Src3, Src4),
- Mask, PassThru, Subtarget, DAG);
- }
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
case CVTPD2PS:
// ISD::FP_ROUND has a second argument that indicates if the truncation
// does not change the value. Set it to 0 since it can change.
@@ -20207,21 +20789,11 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Mask, PassThru, Subtarget, DAG);
}
case FPCLASS: {
- // FPclass intrinsics with mask
- SDValue Src1 = Op.getOperand(1);
- MVT VT = Src1.getSimpleValueType();
- MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
- SDValue Imm = Op.getOperand(2);
- SDValue Mask = Op.getOperand(3);
- MVT BitcastVT = MVT::getVectorVT(MVT::i1,
- Mask.getSimpleValueType().getSizeInBits());
- SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
- SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask, SDValue(),
- Subtarget, DAG);
- SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
- DAG.getUNDEF(BitcastVT), FPclassMask,
- DAG.getIntPtrConstant(0, dl));
- return DAG.getBitcast(Op.getValueType(), Res);
+ // FPclass intrinsics
+ SDValue Src1 = Op.getOperand(1);
+ MVT MaskVT = Op.getSimpleValueType();
+ SDValue Imm = Op.getOperand(2);
+ return DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
}
case FPCLASSS: {
SDValue Src1 = Op.getOperand(1);
@@ -20230,17 +20802,20 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
Subtarget, DAG);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, FPclassMask,
- DAG.getIntPtrConstant(0, dl));
- }
- case CMP_MASK:
- case CMP_MASK_CC: {
+ // Need to fill with zeros to ensure the bitcast will produce zeroes
+ // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
+ SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
+ DAG.getConstant(0, dl, MVT::v8i1),
+ FPclassMask, DAG.getIntPtrConstant(0, dl));
+ return DAG.getBitcast(MVT::i8, Ins);
+ }
+ case CMP_MASK: {
// Comparison intrinsics with masks.
// Example of transformation:
// (i8 (int_x86_avx512_mask_pcmpeq_q_128
// (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
// (i8 (bitcast
- // (v8i1 (insert_subvector undef,
+ // (v8i1 (insert_subvector zero,
// (v2i1 (and (PCMPEQM %a, %b),
// (extract_subvector
// (v8i1 (bitcast %mask)), 0))), 0))))
@@ -20249,36 +20824,39 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
MVT BitcastVT = MVT::getVectorVT(MVT::i1,
Mask.getSimpleValueType().getSizeInBits());
- SDValue Cmp;
- if (IntrData->Type == CMP_MASK_CC) {
- SDValue CC = Op.getOperand(3);
- CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
- // We specify 2 possible opcodes for intrinsics with rounding modes.
- // First, we check if the intrinsic may have non-default rounding mode,
- // (IntrData->Opc1 != 0), then we check the rounding mode operand.
- if (IntrData->Opc1 != 0) {
- SDValue Rnd = Op.getOperand(5);
- if (!isRoundModeCurDirection(Rnd))
- Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
- Op.getOperand(2), CC, Rnd);
- }
- //default rounding mode
- if(!Cmp.getNode())
- Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
- Op.getOperand(2), CC);
-
- } else {
- assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
- Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
- Op.getOperand(2));
- }
+ SDValue Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
+ Op.getOperand(2));
SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, SDValue(),
Subtarget, DAG);
+ // Need to fill with zeros to ensure the bitcast will produce zeroes
+ // for the upper bits in the v2i1/v4i1 case.
SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
- DAG.getUNDEF(BitcastVT), CmpMask,
- DAG.getIntPtrConstant(0, dl));
+ DAG.getConstant(0, dl, BitcastVT),
+ CmpMask, DAG.getIntPtrConstant(0, dl));
return DAG.getBitcast(Op.getValueType(), Res);
}
+
+ case CMP_MASK_CC: {
+ MVT MaskVT = Op.getSimpleValueType();
+ SDValue Cmp;
+ SDValue CC = Op.getOperand(3);
+ CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
+ // We specify 2 possible opcodes for intrinsics with rounding modes.
+ // First, we check if the intrinsic may have non-default rounding mode,
+ // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+ if (IntrData->Opc1 != 0) {
+ SDValue Rnd = Op.getOperand(4);
+ if (!isRoundModeCurDirection(Rnd))
+ Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
+ Op.getOperand(2), CC, Rnd);
+ }
+      // Default rounding mode.
+ if (!Cmp.getNode())
+ Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
+ Op.getOperand(2), CC);
+
+ return Cmp;
+ }
case CMP_MASK_SCALAR_CC: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
@@ -20297,8 +20875,12 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
Subtarget, DAG);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, CmpMask,
- DAG.getIntPtrConstant(0, dl));
+ // Need to fill with zeros to ensure the bitcast will produce zeroes
+ // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
+ SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
+ DAG.getConstant(0, dl, MVT::v8i1),
+ CmpMask, DAG.getIntPtrConstant(0, dl));
+ return DAG.getBitcast(MVT::i8, Ins);
}
case COMI: { // Comparison intrinsics
ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
@@ -20351,8 +20933,13 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
else
FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
DAG.getConstant(CondVal, dl, MVT::i8), Sae);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, FCmp,
- DAG.getIntPtrConstant(0, dl));
+ // Need to fill with zeros to ensure the bitcast will produce zeroes
+ // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
+ SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
+ DAG.getConstant(0, dl, MVT::v16i1),
+ FCmp, DAG.getIntPtrConstant(0, dl));
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
+ DAG.getBitcast(MVT::i16, Ins));
}
case VSHIFT:
return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
@@ -20369,22 +20956,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
DataToCompress),
Mask, PassThru, Subtarget, DAG);
}
- case BROADCASTM: {
- SDValue Mask = Op.getOperand(1);
- MVT MaskVT = MVT::getVectorVT(MVT::i1,
- Mask.getSimpleValueType().getSizeInBits());
- Mask = DAG.getBitcast(MaskVT, Mask);
- return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
- }
- case MASK_BINOP: {
- MVT VT = Op.getSimpleValueType();
- MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
-
- SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
- SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
- SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
- return DAG.getBitcast(VT, Res);
- }
case FIXUPIMMS:
case FIXUPIMMS_MASKZ:
case FIXUPIMM:
@@ -20414,18 +20985,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Src1, Src2, Src3, Imm, Rnd),
Mask, Passthru, Subtarget, DAG);
}
- case CONVERT_TO_MASK: {
- MVT SrcVT = Op.getOperand(1).getSimpleValueType();
- MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
- MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
-
- SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
- Op.getOperand(1));
- SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
- DAG.getUNDEF(BitcastVT), CvtMask,
- DAG.getIntPtrConstant(0, dl));
- return DAG.getBitcast(Op.getValueType(), Res);
- }
case ROUNDP: {
assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
// Clear the upper bits of the rounding immediate so that the legacy
@@ -20454,13 +21013,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
- case Intrinsic::x86_avx2_permd:
- case Intrinsic::x86_avx2_permps:
- // Operands intentionally swapped. Mask is last operand to intrinsic,
- // but second operand for node/instruction.
- return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
- Op.getOperand(2), Op.getOperand(1));
-
// ptest and testp intrinsics. The intrinsic these come from are designed to
// return an integer value, not just an instruction so lower it to the ptest
// or testp pattern and a setcc for the result.
@@ -20528,43 +21080,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
- case Intrinsic::x86_avx512_kortestz_w:
- case Intrinsic::x86_avx512_kortestc_w: {
- X86::CondCode X86CC =
- (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
- SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
- SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
- SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
- SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
- return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
- }
-
- case Intrinsic::x86_avx512_knot_w: {
- SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
- SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);
- SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
- return DAG.getBitcast(MVT::i16, Res);
- }
-
- case Intrinsic::x86_avx512_kandn_w: {
- SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
- // Invert LHS for the not.
- LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,
- DAG.getConstant(1, dl, MVT::v16i1));
- SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
- SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);
- return DAG.getBitcast(MVT::i16, Res);
- }
-
- case Intrinsic::x86_avx512_kxnor_w: {
- SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
- SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
- SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
- // Invert result for the not.
- Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,
- DAG.getConstant(1, dl, MVT::v16i1));
- return DAG.getBitcast(MVT::i16, Res);
- }
case Intrinsic::x86_sse42_pcmpistria128:
case Intrinsic::x86_sse42_pcmpestria128:
@@ -20581,50 +21096,50 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_sse42_pcmpistria128:
- Opcode = X86ISD::PCMPISTRI;
+ Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_A;
break;
case Intrinsic::x86_sse42_pcmpestria128:
- Opcode = X86ISD::PCMPESTRI;
+ Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_A;
break;
case Intrinsic::x86_sse42_pcmpistric128:
- Opcode = X86ISD::PCMPISTRI;
+ Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_B;
break;
case Intrinsic::x86_sse42_pcmpestric128:
- Opcode = X86ISD::PCMPESTRI;
+ Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_B;
break;
case Intrinsic::x86_sse42_pcmpistrio128:
- Opcode = X86ISD::PCMPISTRI;
+ Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_O;
break;
case Intrinsic::x86_sse42_pcmpestrio128:
- Opcode = X86ISD::PCMPESTRI;
+ Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_O;
break;
case Intrinsic::x86_sse42_pcmpistris128:
- Opcode = X86ISD::PCMPISTRI;
+ Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_S;
break;
case Intrinsic::x86_sse42_pcmpestris128:
- Opcode = X86ISD::PCMPESTRI;
+ Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_S;
break;
case Intrinsic::x86_sse42_pcmpistriz128:
- Opcode = X86ISD::PCMPISTRI;
+ Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_E;
break;
case Intrinsic::x86_sse42_pcmpestriz128:
- Opcode = X86ISD::PCMPESTRI;
+ Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_E;
break;
}
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
- SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
- SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
+ SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
+ SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
@@ -20632,15 +21147,28 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::x86_sse42_pcmpestri128: {
unsigned Opcode;
if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
- Opcode = X86ISD::PCMPISTRI;
+ Opcode = X86ISD::PCMPISTR;
else
- Opcode = X86ISD::PCMPESTRI;
+ Opcode = X86ISD::PCMPESTR;
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
return DAG.getNode(Opcode, dl, VTs, NewOps);
}
+ case Intrinsic::x86_sse42_pcmpistrm128:
+ case Intrinsic::x86_sse42_pcmpestrm128: {
+ unsigned Opcode;
+ if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
+ Opcode = X86ISD::PCMPISTR;
+ else
+ Opcode = X86ISD::PCMPESTR;
+
+ SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
+ return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
+ }
+
case Intrinsic::eh_sjlj_lsda: {
MachineFunction &MF = DAG.getMachineFunction();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -20708,7 +21236,7 @@ static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Segment = DAG.getRegister(0, MVT::i32);
// If source is undef or we know it won't be used, use a zero vector
// to break register dependency.
- // TODO: use undef instead and let ExecutionDepsFix deal with it?
+ // TODO: use undef instead and let BreakFalseDeps deal with it?
if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
@@ -20736,7 +21264,7 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Segment = DAG.getRegister(0, MVT::i32);
// If source is undef or we know it won't be used, use a zero vector
// to break register dependency.
- // TODO: use undef instead and let ExecutionDepsFix deal with it?
+ // TODO: use undef instead and let BreakFalseDeps deal with it?
if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
@@ -21029,17 +21557,35 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
return SDValue();
}
case Intrinsic::x86_lwpins32:
- case Intrinsic::x86_lwpins64: {
+ case Intrinsic::x86_lwpins64:
+ case Intrinsic::x86_umwait:
+ case Intrinsic::x86_tpause: {
SDLoc dl(Op);
SDValue Chain = Op->getOperand(0);
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
- SDValue LwpIns =
- DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2),
+ unsigned Opcode;
+
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic");
+ case Intrinsic::x86_umwait:
+ Opcode = X86ISD::UMWAIT;
+ break;
+ case Intrinsic::x86_tpause:
+ Opcode = X86ISD::TPAUSE;
+ break;
+ case Intrinsic::x86_lwpins32:
+ case Intrinsic::x86_lwpins64:
+ Opcode = X86ISD::LWPINS;
+ break;
+ }
+
+ SDValue Operation =
+ DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
Op->getOperand(3), Op->getOperand(4));
- SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG);
+ SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
- LwpIns.getValue(1));
+ Operation.getValue(1));
}
}
return SDValue();
@@ -21155,27 +21701,6 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
SDValue Results[] = { SetCC, Store };
return DAG.getMergeValues(Results, dl);
}
- case COMPRESS_TO_MEM: {
- SDValue Mask = Op.getOperand(4);
- SDValue DataToCompress = Op.getOperand(3);
- SDValue Addr = Op.getOperand(2);
- SDValue Chain = Op.getOperand(0);
- MVT VT = DataToCompress.getSimpleValueType();
-
- MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
- assert(MemIntr && "Expected MemIntrinsicSDNode!");
-
- if (isAllOnesConstant(Mask)) // return just a store
- return DAG.getStore(Chain, dl, DataToCompress, Addr,
- MemIntr->getMemOperand());
-
- MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
- SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
-
- return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
- MemIntr->getMemOperand(),
- false /* truncating */, true /* compressing */);
- }
case TRUNCATE_TO_MEM_VI8:
case TRUNCATE_TO_MEM_VI16:
case TRUNCATE_TO_MEM_VI32: {
@@ -21219,28 +21744,6 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
llvm_unreachable("Unsupported truncstore intrinsic");
}
}
-
- case EXPAND_FROM_MEM: {
- SDValue Mask = Op.getOperand(4);
- SDValue PassThru = Op.getOperand(3);
- SDValue Addr = Op.getOperand(2);
- SDValue Chain = Op.getOperand(0);
- MVT VT = Op.getSimpleValueType();
-
- MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
- assert(MemIntr && "Expected MemIntrinsicSDNode!");
-
- if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
- return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
- if (X86::isZeroNode(Mask))
- return DAG.getUNDEF(VT);
-
- MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
- SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
- return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
- MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
- true /* expanding */);
- }
}
}
@@ -21657,14 +22160,16 @@ static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
unsigned NumElems = VT.getVectorNumElements();
unsigned SizeInBits = VT.getSizeInBits();
+ MVT EltVT = VT.getVectorElementType();
+ SDValue Src = Op.getOperand(0);
+ assert(EltVT == Src.getSimpleValueType().getVectorElementType() &&
+ "Src and Op should have the same element type!");
// Extract the Lo/Hi vectors
SDLoc dl(Op);
- SDValue Src = Op.getOperand(0);
SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
- MVT EltVT = VT.getVectorElementType();
MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
@@ -21687,13 +22192,14 @@ static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
return LowerVectorIntUnary(Op, DAG);
}
-/// \brief Lower a vector CTLZ using native supported vector CTLZ instruction.
+/// Lower a vector CTLZ using native supported vector CTLZ instruction.
//
// i8/i16 vector implemented using dword LZCNT vector instruction
// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
// split the vector, perform the operation on its Lo and Hi parts and
// concatenate the results.
-static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
assert(Op.getOpcode() == ISD::CTLZ);
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
@@ -21704,7 +22210,8 @@ static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) {
"Unsupported element type");
// Split the vector; its Lo and Hi parts will be handled in the next iteration.
- if (16 < NumElems)
+ if (NumElems > 16 ||
+ (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
return LowerVectorIntUnary(Op, DAG);
MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
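// Editor's sketch of the per-element identity used here (illustrative only,
// not part of the upstream patch), assuming GCC/Clang builtins:
#include <cstdint>

static unsigned ctlz8ViaDword(uint8_t X) {
  if (X == 0)
    return 8; // lzcnt of 0 is the bit width
  // zext to 32 bits, count leading zeros there, subtract the 24 extra zeros.
  return __builtin_clz(static_cast<uint32_t>(X)) - 24;
}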
@@ -21809,8 +22316,10 @@ static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
- if (Subtarget.hasCDI())
- return LowerVectorCTLZ_AVX512CDI(Op, DAG);
+ if (Subtarget.hasCDI() &&
+ // vXi8 vectors need to be promoted to 512-bits for vXi32.
+ (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
+ return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
@@ -21999,10 +22508,42 @@ static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
}
static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
- assert(Op.getSimpleValueType().is256BitVector() &&
- Op.getSimpleValueType().isInteger() &&
- "Only handle AVX 256-bit vector integer operation");
- return Lower256IntArith(Op, DAG);
+ MVT VT = Op.getSimpleValueType();
+
+ // For AVX1 cases, split to use legal ops (everything but v4i64).
+ if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
+ return Lower256IntArith(Op, DAG);
+
+ SDLoc DL(Op);
+ unsigned Opcode = Op.getOpcode();
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+
+ // For pre-SSE41, we can perform UMIN/UMAX v8i16 by flipping the signbit,
+ // using the SMIN/SMAX instructions and flipping the signbit back.
+ if (VT == MVT::v8i16) {
+ assert((Opcode == ISD::UMIN || Opcode == ISD::UMAX) &&
+ "Unexpected MIN/MAX opcode");
+ SDValue Sign = DAG.getConstant(APInt::getSignedMinValue(16), DL, VT);
+ N0 = DAG.getNode(ISD::XOR, DL, VT, N0, Sign);
+ N1 = DAG.getNode(ISD::XOR, DL, VT, N1, Sign);
+ Opcode = (Opcode == ISD::UMIN ? ISD::SMIN : ISD::SMAX);
+ SDValue Result = DAG.getNode(Opcode, DL, VT, N0, N1);
+ return DAG.getNode(ISD::XOR, DL, VT, Result, Sign);
+ }
+
+ // Else, expand to a compare/select.
+ ISD::CondCode CC;
+ switch (Opcode) {
+ case ISD::SMIN: CC = ISD::CondCode::SETLT; break;
+ case ISD::SMAX: CC = ISD::CondCode::SETGT; break;
+ case ISD::UMIN: CC = ISD::CondCode::SETULT; break;
+ case ISD::UMAX: CC = ISD::CondCode::SETUGT; break;
+ default: llvm_unreachable("Unknown MINMAX opcode");
+ }
+
+ SDValue Cond = DAG.getSetCC(DL, VT, N0, N1, CC);
+ return DAG.getSelect(DL, VT, Cond, N0, N1);
}
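// A scalar model of the signbit-flip trick above (editor's sketch,
// illustrative only, not part of the upstream patch): XOR with 0x8000 maps
// unsigned 16-bit order onto signed order, so UMAX becomes SMAX on the
// flipped values, flipped back afterwards.
#include <algorithm>
#include <cstdint>

static uint16_t umaxViaSmax(uint16_t A, uint16_t B) {
  int16_t SA = static_cast<int16_t>(A ^ 0x8000u);
  int16_t SB = static_cast<int16_t>(B ^ 0x8000u);
  return static_cast<uint16_t>(static_cast<uint16_t>(std::max(SA, SB)) ^
                               0x8000u);
}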
static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
@@ -22048,40 +22589,26 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
MVT ExVT = MVT::v8i16;
// Extract the lo parts and sign extend to i16
- SDValue ALo, BLo;
- if (Subtarget.hasSSE41()) {
- ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
- BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
- } else {
- const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
- -1, 4, -1, 5, -1, 6, -1, 7};
- ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
- BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
- ALo = DAG.getBitcast(ExVT, ALo);
- BLo = DAG.getBitcast(ExVT, BLo);
- ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
- BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
- }
+  // We're only going to keep the low byte of each 16-bit result element of
+  // the pmullw, so it doesn't matter what's in the high byte of each 16-bit
+  // element.
+ const int LoShufMask[] = {0, -1, 1, -1, 2, -1, 3, -1,
+ 4, -1, 5, -1, 6, -1, 7, -1};
+ SDValue ALo = DAG.getVectorShuffle(VT, dl, A, A, LoShufMask);
+ SDValue BLo = DAG.getVectorShuffle(VT, dl, B, B, LoShufMask);
+ ALo = DAG.getBitcast(ExVT, ALo);
+ BLo = DAG.getBitcast(ExVT, BLo);
// Extract the hi parts and sign extend to i16
- SDValue AHi, BHi;
- if (Subtarget.hasSSE41()) {
- const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
- -1, -1, -1, -1, -1, -1, -1, -1};
- AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
- BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
- AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
- BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
- } else {
- const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
- -1, 12, -1, 13, -1, 14, -1, 15};
- AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
- BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
- AHi = DAG.getBitcast(ExVT, AHi);
- BHi = DAG.getBitcast(ExVT, BHi);
- AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
- BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
- }
+  // We're only going to keep the low byte of each 16-bit result element of
+  // the pmullw, so it doesn't matter what's in the high byte of each 16-bit
+  // element.
+ const int HiShufMask[] = {8, -1, 9, -1, 10, -1, 11, -1,
+ 12, -1, 13, -1, 14, -1, 15, -1};
+ SDValue AHi = DAG.getVectorShuffle(VT, dl, A, A, HiShufMask);
+ SDValue BHi = DAG.getVectorShuffle(VT, dl, B, B, HiShufMask);
+ AHi = DAG.getBitcast(ExVT, AHi);
+ BHi = DAG.getBitcast(ExVT, BHi);
// Multiply, mask the lower 8bits of the lo/hi results and pack
SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
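// Editor's note (illustrative only, not part of the upstream patch): keeping
// only the low byte is safe because the low byte of a product depends only on
// the low bytes of the operands, i.e. for any a, b:
//   (uint8_t)(a * b) == (uint8_t)((uint8_t)a * (uint8_t)b)
// so garbage in the high half of each 16-bit lane cannot leak into the result.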
@@ -22096,22 +22623,19 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
"Should not custom lower when pmulld is available!");
- // If the upper 17 bits of each element are zero then we can use PMADD.
- APInt Mask17 = APInt::getHighBitsSet(32, 17);
- if (DAG.MaskedValueIsZero(A, Mask17) && DAG.MaskedValueIsZero(B, Mask17))
- return DAG.getNode(X86ISD::VPMADDWD, dl, VT,
- DAG.getBitcast(MVT::v8i16, A),
- DAG.getBitcast(MVT::v8i16, B));
-
// Extract the odd parts.
static const int UnpackMask[] = { 1, -1, 3, -1 };
SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
// Multiply the even parts.
- SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
+ SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
+ DAG.getBitcast(MVT::v2i64, A),
+ DAG.getBitcast(MVT::v2i64, B));
// Now multiply odd parts.
- SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
+ SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
+ DAG.getBitcast(MVT::v2i64, Aodds),
+ DAG.getBitcast(MVT::v2i64, Bodds));
Evens = DAG.getBitcast(VT, Evens);
Odds = DAG.getBitcast(VT, Odds);
@@ -22124,17 +22648,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
"Only know how to lower V2I64/V4I64/V8I64 multiply");
-
- // 32-bit vector types used for MULDQ/MULUDQ.
- MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
-
- // MULDQ returns the 64-bit result of the signed multiplication of the lower
- // 32-bits. We can lower with this if the sign bits stretch that far.
- if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
- DAG.ComputeNumSignBits(B) > 32) {
- return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
- DAG.getBitcast(MulVT, B));
- }
+ assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
// Ahi = psrlqi(a, 32);
// Bhi = psrlqi(b, 32);
@@ -22145,42 +22659,35 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
//
// Hi = psllqi(AloBhi + AhiBlo, 32);
// return AloBlo + Hi;
+ KnownBits AKnown, BKnown;
+ DAG.computeKnownBits(A, AKnown);
+ DAG.computeKnownBits(B, BKnown);
+
APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
- bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
- bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);
+ bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
+ bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
- bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
- bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);
-
- // If DQI is supported we can use MULLQ, but MULUDQ is still better if the
- // the high bits are known to be zero.
- if (Subtarget.hasDQI() && (!AHiIsZero || !BHiIsZero))
- return Op;
-
- // Bit cast to 32-bit vectors for MULUDQ.
- SDValue Alo = DAG.getBitcast(MulVT, A);
- SDValue Blo = DAG.getBitcast(MulVT, B);
+ bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
+ bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
// Only multiply lo/hi halves that aren't known to be zero.
SDValue AloBlo = Zero;
if (!ALoIsZero && !BLoIsZero)
- AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);
+ AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
SDValue AloBhi = Zero;
if (!ALoIsZero && !BHiIsZero) {
SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
- Bhi = DAG.getBitcast(MulVT, Bhi);
- AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
+ AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
}
SDValue AhiBlo = Zero;
if (!AHiIsZero && !BLoIsZero) {
SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
- Ahi = DAG.getBitcast(MulVT, Ahi);
- AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
+ AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
}
SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
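The identity behind the comment above follows from writing each 64-bit operand as Hi*2^32 + Lo: the Hi*Hi term falls outside a 64-bit result, leaving Lo*Lo plus the two cross products shifted left by 32. A minimal SSE2 sketch of the same three-PMULUDQ expansion (illustrative only; the helper name is invented):

#include <emmintrin.h>

// Per-lane 64x64->64 multiply built from three 32x32->64 PMULUDQ multiplies.
static __m128i mul_epi64_sse2(__m128i a, __m128i b) {
  __m128i aHi   = _mm_srli_epi64(a, 32);
  __m128i bHi   = _mm_srli_epi64(b, 32);
  __m128i loLo  = _mm_mul_epu32(a, b);       // lo(a) * lo(b)
  __m128i loHi  = _mm_mul_epu32(a, bHi);     // lo(a) * hi(b)
  __m128i hiLo  = _mm_mul_epu32(aHi, b);     // hi(a) * lo(b)
  __m128i cross = _mm_add_epi64(loHi, hiLo);
  return _mm_add_epi64(loLo, _mm_slli_epi64(cross, 32));
}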
@@ -22226,7 +22733,7 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
SDValue Hi = DAG.getIntPtrConstant(NumElems / 2, dl);
if (VT == MVT::v32i8) {
- if (Subtarget.hasBWI()) {
+ if (Subtarget.canExtendTo512BW()) {
SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v32i16, A);
SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v32i16, B);
SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v32i16, ExA, ExB);
@@ -22277,13 +22784,14 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
assert(VT == MVT::v16i8 &&
"Pre-AVX2 support only supports v16i8 multiplication");
MVT ExVT = MVT::v8i16;
- unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
+ unsigned ExSSE41 = ISD::MULHU == Opcode ? ISD::ZERO_EXTEND_VECTOR_INREG
+ : ISD::SIGN_EXTEND_VECTOR_INREG;
// Extract the lo parts and zero/sign extend to i16.
SDValue ALo, BLo;
if (Subtarget.hasSSE41()) {
- ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
- BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
+ ALo = DAG.getNode(ExSSE41, dl, ExVT, A);
+ BLo = DAG.getNode(ExSSE41, dl, ExVT, B);
} else {
const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
-1, 4, -1, 5, -1, 6, -1, 7};
@@ -22302,8 +22810,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
-1, -1, -1, -1, -1, -1, -1, -1};
AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
- AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
- BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
+ AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi);
+ BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi);
} else {
const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
-1, 12, -1, 13, -1, 14, -1, 15};
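The MULHU/MULHS path here widens each byte lane to i16, multiplies, and keeps only the upper byte of each product. Per scalar lane that is simply the following (unsigned case; sketch only, function name invented):

#include <cstdint>

// High half of an 8x8 multiply: widen, multiply, take the top byte.
static uint8_t mulhi_u8(uint8_t a, uint8_t b) {
  return (uint8_t)(((uint16_t)a * (uint16_t)b) >> 8);
}

The signed case is identical except that the widening is a sign extension.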
@@ -22438,10 +22946,14 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
(!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
// PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
// => <2 x i64> <ae|cg>
- SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
+ SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
+ DAG.getBitcast(MulVT, Op0),
+ DAG.getBitcast(MulVT, Op1)));
// PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
// => <2 x i64> <bf|dh>
- SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
+ SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
+ DAG.getBitcast(MulVT, Odd0),
+ DAG.getBitcast(MulVT, Odd1)));
// Shuffle it back into the right order.
SmallVector<int, 16> HighMask(NumElts);
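The MUL_LOHI lowering above produces both halves of each product; the high-half part of that pattern looks like the following standalone SSE2 sketch (illustrative; assumes unsigned inputs and an invented helper name):

#include <emmintrin.h>

// Unsigned high-half multiply of four 32-bit lanes via two PMULUDQs,
// keeping the upper 32 bits of each 64-bit product and re-interleaving.
static __m128i mulhi_epu32_sse2(__m128i a, __m128i b) {
  __m128i evenHi = _mm_srli_epi64(_mm_mul_epu32(a, b), 32);
  __m128i oddHi  = _mm_srli_epi64(_mm_mul_epu32(_mm_srli_si128(a, 4),
                                                _mm_srli_si128(b, 4)), 32);
  __m128i evenLo = _mm_shuffle_epi32(evenHi, _MM_SHUFFLE(0, 0, 2, 0));
  __m128i oddLo  = _mm_shuffle_epi32(oddHi,  _MM_SHUFFLE(0, 0, 2, 0));
  return _mm_unpacklo_epi32(evenLo, oddLo);
}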
@@ -22601,7 +23113,8 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
if (VT.is512BitVector()) {
assert(VT == MVT::v64i8 && "Unexpected element type!");
- SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
+ SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R,
+ ISD::SETGT);
return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
}
return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
@@ -22711,57 +23224,81 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
return SDValue();
}
+// Determine if V is a splat value, and return the scalar.
+static SDValue IsSplatValue(MVT VT, SDValue V, const SDLoc &dl,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget,
+ unsigned Opcode) {
+ V = peekThroughEXTRACT_SUBVECTORs(V);
+
+ // Check if this is a splat build_vector node.
+ if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V)) {
+ SDValue SplatAmt = BV->getSplatValue();
+ if (SplatAmt && SplatAmt.isUndef())
+ return SDValue();
+ return SplatAmt;
+ }
+
+ // Check for SUB(SPLAT_BV, SPLAT) cases from rotate patterns.
+ if (V.getOpcode() == ISD::SUB &&
+ !SupportedVectorVarShift(VT, Subtarget, Opcode)) {
+ SDValue LHS = peekThroughEXTRACT_SUBVECTORs(V.getOperand(0));
+ SDValue RHS = peekThroughEXTRACT_SUBVECTORs(V.getOperand(1));
+
+ // Ensure that the corresponding splat BV element is not UNDEF.
+ BitVector UndefElts;
+ BuildVectorSDNode *BV0 = dyn_cast<BuildVectorSDNode>(LHS);
+ ShuffleVectorSDNode *SVN1 = dyn_cast<ShuffleVectorSDNode>(RHS);
+ if (BV0 && SVN1 && BV0->getSplatValue(&UndefElts) && SVN1->isSplat()) {
+ unsigned SplatIdx = (unsigned)SVN1->getSplatIndex();
+ if (!UndefElts[SplatIdx])
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ VT.getVectorElementType(), V,
+ DAG.getIntPtrConstant(SplatIdx, dl));
+ }
+ }
+
+ // Check if this is a shuffle node doing a splat.
+ ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(V);
+ if (!SVN || !SVN->isSplat())
+ return SDValue();
+
+ unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
+ SDValue InVec = V.getOperand(0);
+ if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
+ assert((SplatIdx < VT.getVectorNumElements()) &&
+ "Unexpected shuffle index found!");
+ return InVec.getOperand(SplatIdx);
+ } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2)))
+ if (C->getZExtValue() == SplatIdx)
+ return InVec.getOperand(1);
+ }
+
+ // Avoid introducing an extract element from a shuffle.
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ VT.getVectorElementType(), InVec,
+ DAG.getIntPtrConstant(SplatIdx, dl));
+}
+
static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
+ unsigned Opcode = Op.getOpcode();
- unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
- (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
-
- unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
- (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
-
- if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
- SDValue BaseShAmt;
- MVT EltVT = VT.getVectorElementType();
-
- if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
- // Check if this build_vector node is doing a splat.
- // If so, then set BaseShAmt equal to the splat value.
- BaseShAmt = BV->getSplatValue();
- if (BaseShAmt && BaseShAmt.isUndef())
- BaseShAmt = SDValue();
- } else {
- if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
- Amt = Amt.getOperand(0);
+ unsigned X86OpcI = (Opcode == ISD::SHL) ? X86ISD::VSHLI :
+ (Opcode == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
- ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
- if (SVN && SVN->isSplat()) {
- unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
- SDValue InVec = Amt.getOperand(0);
- if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
- assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
- "Unexpected shuffle index found!");
- BaseShAmt = InVec.getOperand(SplatIdx);
- } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
- if (ConstantSDNode *C =
- dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
- if (C->getZExtValue() == SplatIdx)
- BaseShAmt = InVec.getOperand(1);
- }
- }
+ unsigned X86OpcV = (Opcode == ISD::SHL) ? X86ISD::VSHL :
+ (Opcode == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
- if (!BaseShAmt)
- // Avoid introducing an extract element from a shuffle.
- BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
- DAG.getIntPtrConstant(SplatIdx, dl));
- }
- }
+ Amt = peekThroughEXTRACT_SUBVECTORs(Amt);
- if (BaseShAmt.getNode()) {
+ if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
+ if (SDValue BaseShAmt = IsSplatValue(VT, Amt, dl, DAG, Subtarget, Opcode)) {
+ MVT EltVT = VT.getVectorElementType();
assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
@@ -22793,6 +23330,70 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
return SDValue();
}
+// Convert a shift/rotate left amount to a multiplication scale factor.
+static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Amt.getSimpleValueType();
+ if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
+ (Subtarget.hasInt256() && VT == MVT::v16i16) ||
+ (!Subtarget.hasAVX512() && VT == MVT::v16i8)))
+ return SDValue();
+
+ if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
+ SmallVector<SDValue, 8> Elts;
+ MVT SVT = VT.getVectorElementType();
+ unsigned SVTBits = SVT.getSizeInBits();
+ APInt One(SVTBits, 1);
+ unsigned NumElems = VT.getVectorNumElements();
+
+ for (unsigned i = 0; i != NumElems; ++i) {
+ SDValue Op = Amt->getOperand(i);
+ if (Op->isUndef()) {
+ Elts.push_back(Op);
+ continue;
+ }
+
+ ConstantSDNode *ND = cast<ConstantSDNode>(Op);
+ APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
+ uint64_t ShAmt = C.getZExtValue();
+ if (ShAmt >= SVTBits) {
+ Elts.push_back(DAG.getUNDEF(SVT));
+ continue;
+ }
+ Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
+ }
+ return DAG.getBuildVector(VT, dl, Elts);
+ }
+
+ // If the target doesn't support variable shifts, use either FP conversion
+ // or integer multiplication to avoid shifting each element individually.
+ if (VT == MVT::v4i32) {
+ Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
+ Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
+ DAG.getConstant(0x3f800000U, dl, VT));
+ Amt = DAG.getBitcast(MVT::v4f32, Amt);
+ return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
+ }
+
+ // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
+ if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
+ SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
+ SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
+ SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
+ Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
+ Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
+ if (Subtarget.hasSSE41())
+ return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
+
+ return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),
+ DAG.getBitcast(VT, Hi),
+ {0, 2, 4, 6, 8, 10, 12, 14});
+ }
+
+ return SDValue();
+}
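The v4i32 branch above uses a classic trick: shifting the amount into the exponent field of an IEEE float and adding the bias of 1.0f yields exactly 2^amt, which FP_TO_SINT converts back to an integer power of two. A minimal intrinsics sketch (valid for amounts 0..30; helper name invented):

#include <emmintrin.h>

// Compute 1u << amt per lane without variable shifts by building the float
// 2^amt in the exponent field and truncating it back to integer.
static __m128i pow2_epi32_sse2(__m128i amt) {
  __m128i bits = _mm_add_epi32(_mm_slli_epi32(amt, 23),
                               _mm_set1_epi32(0x3f800000)); // bias for 1.0f
  return _mm_cvttps_epi32(_mm_castsi128_ps(bits));          // 2^amt
}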
+
static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
@@ -22815,11 +23416,10 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
// XOP has 128-bit variable logical/arithmetic shifts.
// +ve/-ve Amt = shift left/right.
- if (Subtarget.hasXOP() &&
- (VT == MVT::v2i64 || VT == MVT::v4i32 ||
- VT == MVT::v8i16 || VT == MVT::v16i8)) {
+ if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
+ VT == MVT::v8i16 || VT == MVT::v16i8)) {
if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
- SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
+ SDValue Zero = DAG.getConstant(0, dl, VT);
Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
}
if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
@@ -22852,51 +23452,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
return R;
}
- // If possible, lower this packed shift into a vector multiply instead of
- // expanding it into a sequence of scalar shifts.
- // Do this only if the vector shift count is a constant build_vector.
- if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
- (VT == MVT::v8i16 || VT == MVT::v4i32 ||
- (Subtarget.hasInt256() && VT == MVT::v16i16))) {
- SmallVector<SDValue, 8> Elts;
- MVT SVT = VT.getVectorElementType();
- unsigned SVTBits = SVT.getSizeInBits();
- APInt One(SVTBits, 1);
- unsigned NumElems = VT.getVectorNumElements();
-
- for (unsigned i=0; i !=NumElems; ++i) {
- SDValue Op = Amt->getOperand(i);
- if (Op->isUndef()) {
- Elts.push_back(Op);
- continue;
- }
-
- ConstantSDNode *ND = cast<ConstantSDNode>(Op);
- APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
- uint64_t ShAmt = C.getZExtValue();
- if (ShAmt >= SVTBits) {
- Elts.push_back(DAG.getUNDEF(SVT));
- continue;
- }
- Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
- }
- SDValue BV = DAG.getBuildVector(VT, dl, Elts);
- return DAG.getNode(ISD::MUL, dl, VT, R, BV);
- }
-
- // Lower SHL with variable shift amount.
- if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
- Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
-
- Op = DAG.getNode(ISD::ADD, dl, VT, Op,
- DAG.getConstant(0x3f800000U, dl, VT));
- Op = DAG.getBitcast(MVT::v4f32, Op);
- Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
- return DAG.getNode(ISD::MUL, dl, VT, Op, R);
- }
-
// If possible, lower this shift as a sequence of two shifts by
- // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
+ // constant plus a BLENDing shuffle instead of scalarizing it.
// Example:
// (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
//
@@ -22904,67 +23461,54 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
// (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
//
// The advantage is that the two shifts from the example would be
- // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
- // the vector shift into four scalar shifts plus four pairs of vector
- // insert/extract.
- if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
- bool UseMOVSD = false;
- bool CanBeSimplified;
- // The splat value for the first packed shift (the 'X' from the example).
- SDValue Amt1 = Amt->getOperand(0);
- // The splat value for the second packed shift (the 'Y' from the example).
- SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
-
- // See if it is possible to replace this node with a sequence of
- // two shifts followed by a MOVSS/MOVSD/PBLEND.
- if (VT == MVT::v4i32) {
- // Check if it is legal to use a MOVSS.
- CanBeSimplified = Amt2 == Amt->getOperand(2) &&
- Amt2 == Amt->getOperand(3);
- if (!CanBeSimplified) {
- // Otherwise, check if we can still simplify this node using a MOVSD.
- CanBeSimplified = Amt1 == Amt->getOperand(1) &&
- Amt->getOperand(2) == Amt->getOperand(3);
- UseMOVSD = true;
- Amt2 = Amt->getOperand(2);
+ // lowered as X86ISD::VSRLI nodes in parallel before blending.
+ if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
+ (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
+ SDValue Amt1, Amt2;
+ unsigned NumElts = VT.getVectorNumElements();
+ SmallVector<int, 8> ShuffleMask;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ SDValue A = Amt->getOperand(i);
+ if (A.isUndef()) {
+ ShuffleMask.push_back(SM_SentinelUndef);
+ continue;
}
- } else {
- // Do similar checks for the case where the machine value type
- // is MVT::v8i16.
- CanBeSimplified = Amt1 == Amt->getOperand(1);
- for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
- CanBeSimplified = Amt2 == Amt->getOperand(i);
-
- if (!CanBeSimplified) {
- UseMOVSD = true;
- CanBeSimplified = true;
- Amt2 = Amt->getOperand(4);
- for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
- CanBeSimplified = Amt1 == Amt->getOperand(i);
- for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
- CanBeSimplified = Amt2 == Amt->getOperand(j);
+ if (!Amt1 || Amt1 == A) {
+ ShuffleMask.push_back(i);
+ Amt1 = A;
+ continue;
}
+ if (!Amt2 || Amt2 == A) {
+ ShuffleMask.push_back(i + NumElts);
+ Amt2 = A;
+ continue;
+ }
+ break;
}
- if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
- isa<ConstantSDNode>(Amt2)) {
- // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
+ // Only perform this blend if we can perform it without loading a mask.
+ if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
+ isa<ConstantSDNode>(Amt1) && isa<ConstantSDNode>(Amt2) &&
+ (VT != MVT::v16i16 ||
+ is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
+ (VT == MVT::v4i32 || Subtarget.hasSSE41() ||
+ Op.getOpcode() != ISD::SHL || canWidenShuffleElements(ShuffleMask))) {
SDValue Splat1 =
DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
SDValue Splat2 =
DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
- SDValue BitCast1 = DAG.getBitcast(MVT::v4i32, Shift1);
- SDValue BitCast2 = DAG.getBitcast(MVT::v4i32, Shift2);
- if (UseMOVSD)
- return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
- BitCast2, {0, 1, 6, 7}));
- return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
- BitCast2, {0, 5, 6, 7}));
+ return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
}
}
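The two-shift-plus-blend pattern described in the comment above ((srl A, <X,Y,Y,Y>) becoming a MOVSS of two uniform shifts) can be written out directly with intrinsics. A rough sketch for the v4i32 case (illustrative only; helper name invented):

#include <emmintrin.h>

// srl <a0,a1,a2,a3> by <X,Y,Y,Y>: two uniform shifts, then a MOVSS-style
// blend that takes lane 0 from the X-shift and lanes 1..3 from the Y-shift.
static __m128i srl_v4i32_xyyy(__m128i a, int x, int y) {
  __m128i byX = _mm_srl_epi32(a, _mm_cvtsi32_si128(x));
  __m128i byY = _mm_srl_epi32(a, _mm_cvtsi32_si128(y));
  return _mm_castps_si128(
      _mm_move_ss(_mm_castsi128_ps(byY), _mm_castsi128_ps(byX)));
}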
+ // If possible, lower this packed shift into a vector multiply instead of
+ // expanding it into a sequence of scalar shifts.
+ if (Op.getOpcode() == ISD::SHL)
+ if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
+ return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
+
// v4i32 Non Uniform Shifts.
// If the shift amount is constant we can shift each lane using the SSE2
// immediate shifts, else we need to zero-extend each lane to the lower i64
@@ -22994,31 +23538,56 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
break;
}
// The SSE2 shifts use the lower i64 as the same shift amount for
- // all lanes and the upper i64 is ignored. These shuffle masks
- // optimally zero-extend each lanes on SSE2/SSE41/AVX targets.
- SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
- Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
- Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
- Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
- Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
+ // all lanes and the upper i64 is ignored. On AVX we're better off
+ // just zero-extending, but for SSE just duplicating the top 16-bits is
+ // cheaper and has the same effect for out of range values.
+ if (Subtarget.hasAVX()) {
+ SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
+ Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
+ Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
+ Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
+ Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
+ } else {
+ SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
+ SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
+ {4, 5, 6, 7, -1, -1, -1, -1});
+ Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
+ {0, 1, 1, 1, -1, -1, -1, -1});
+ Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
+ {2, 3, 3, 3, -1, -1, -1, -1});
+ Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
+ {0, 1, 1, 1, -1, -1, -1, -1});
+ Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
+ {2, 3, 3, 3, -1, -1, -1, -1});
+ }
}
- SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
- SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
- SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
- SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
- SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
- SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
- return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
+ SDValue R0 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt0));
+ SDValue R1 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt1));
+ SDValue R2 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt2));
+ SDValue R3 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt3));
+
+ // Merge the shifted lane results optimally with/without PBLENDW.
+ // TODO - ideally shuffle combining would handle this.
+ if (Subtarget.hasSSE41()) {
+ SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
+ SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
+ return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
+ }
+ SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
+ SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
+ return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
}
// It's worth extending once and using the vXi16/vXi32 shifts for smaller
// types, but without AVX512 the extra overheads to get from vXi8 to vXi32
// make the existing SSE solution better.
+ // NOTE: We honor preferred vector width before promoting to 512-bits.
if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
- (Subtarget.hasAVX512() && VT == MVT::v16i16) ||
- (Subtarget.hasAVX512() && VT == MVT::v16i8) ||
- (Subtarget.hasBWI() && VT == MVT::v32i8)) {
+ (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
+ (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
+ (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
+ (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
"Unexpected vector type");
MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
@@ -23046,7 +23615,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
- Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
+ Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
+ ISD::SETGT);
return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
} else if (Subtarget.hasSSE41()) {
// On SSE41 targets we make use of the fact that VSELECT lowers
@@ -23242,13 +23812,15 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
+ assert(VT.isVector() && "Custom lowering only for vector rotates!");
+
SDLoc DL(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
unsigned Opcode = Op.getOpcode();
unsigned EltSizeInBits = VT.getScalarSizeInBits();
- if (Subtarget.hasAVX512()) {
+ if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
// Attempt to rotate by immediate.
APInt UndefElts;
SmallVector<APInt, 16> EltBits;
@@ -23267,31 +23839,178 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
return Op;
}
- assert(VT.isVector() && "Custom lowering only for vector rotates!");
- assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
assert((Opcode == ISD::ROTL) && "Only ROTL supported");
// XOP has 128-bit vector variable + immediate rotates.
// +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
+ if (Subtarget.hasXOP()) {
+ // Split 256-bit integers.
+ if (VT.is256BitVector())
+ return Lower256IntArith(Op, DAG);
+ assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
- // Split 256-bit integers.
- if (VT.is256BitVector())
+ // Attempt to rotate by immediate.
+ if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
+ if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
+ uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
+ assert(RotateAmt < EltSizeInBits && "Rotation out of range");
+ return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
+ DAG.getConstant(RotateAmt, DL, MVT::i8));
+ }
+ }
+
+ // Use general rotate by variable (per-element).
+ return Op;
+ }
+
+ // Split 256-bit integers on pre-AVX2 targets.
+ if (VT.is256BitVector() && !Subtarget.hasAVX2())
return Lower256IntArith(Op, DAG);
- assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
+ assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
+ ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
+ Subtarget.hasAVX2())) &&
+ "Only vXi32/vXi16/vXi8 vector rotates supported");
- // Attempt to rotate by immediate.
+ // Rotate by a uniform constant - expand back to shifts.
+ // TODO - legalizers should be able to handle this.
if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
assert(RotateAmt < EltSizeInBits && "Rotation out of range");
- return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
- DAG.getConstant(RotateAmt, DL, MVT::i8));
+ if (RotateAmt == 0)
+ return R;
+
+ SDValue AmtR = DAG.getConstant(EltSizeInBits - RotateAmt, DL, VT);
+ SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
+ SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
+ return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
}
}
- // Use general rotate by variable (per-element).
- return Op;
+ // Rotate by splat - expand back to shifts.
+ // TODO - legalizers should be able to handle this.
+ if ((EltSizeInBits >= 16 || Subtarget.hasBWI()) &&
+ IsSplatValue(VT, Amt, DL, DAG, Subtarget, Opcode)) {
+ SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
+ AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
+ SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
+ SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
+ return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
+ }
+
+ // v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by
+ // the amount bit.
+ if (EltSizeInBits == 8) {
+ if (Subtarget.hasBWI()) {
+ SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
+ AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
+ SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
+ SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
+ return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
+ }
+
+ MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
+
+ auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
+ if (Subtarget.hasSSE41()) {
+ // On SSE41 targets we make use of the fact that VSELECT lowers
+ // to PBLENDVB which selects bytes based just on the sign bit.
+ V0 = DAG.getBitcast(VT, V0);
+ V1 = DAG.getBitcast(VT, V1);
+ Sel = DAG.getBitcast(VT, Sel);
+ return DAG.getBitcast(SelVT, DAG.getSelect(DL, VT, Sel, V0, V1));
+ }
+ // On pre-SSE41 targets we test for the sign bit by comparing to
+ // zero - a negative value will set all bits of the lanes to true
+ // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
+ SDValue Z = getZeroVector(SelVT, Subtarget, DAG, DL);
+ SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
+ return DAG.getSelect(DL, SelVT, C, V0, V1);
+ };
+
+ // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
+ // We can safely do this using i16 shifts as we're only interested in
+ // the 3 lower bits of each byte.
+ Amt = DAG.getBitcast(ExtVT, Amt);
+ Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
+ Amt = DAG.getBitcast(VT, Amt);
+
+ // r = VSELECT(r, rot(r, 4), a);
+ SDValue M;
+ M = DAG.getNode(
+ ISD::OR, DL, VT,
+ DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
+ DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
+ R = SignBitSelect(VT, Amt, M, R);
+
+ // a += a
+ Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
+
+ // r = VSELECT(r, rot(r, 2), a);
+ M = DAG.getNode(
+ ISD::OR, DL, VT,
+ DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
+ DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
+ R = SignBitSelect(VT, Amt, M, R);
+
+ // a += a
+ Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
+
+ // return VSELECT(r, rot(r, 1), a);
+ M = DAG.getNode(
+ ISD::OR, DL, VT,
+ DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
+ DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
+ return SignBitSelect(VT, Amt, M, R);
+ }
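The selection chain built above applies rotations by 4, 2 and 1, each gated by one bit of the amount (exposed as the sign bit after the initial shift-by-5 and the repeated doubling). The per-byte computation it implements is, in scalar form (sketch, function name invented):

#include <cstdint>

// Staged byte rotate: conditionally rotate by 4, 2 and 1 based on the
// corresponding bits of the rotate amount.
static uint8_t rotl8_staged(uint8_t r, uint8_t amt) {
  if (amt & 4) r = (uint8_t)((r << 4) | (r >> 4));
  if (amt & 2) r = (uint8_t)((r << 2) | (r >> 6));
  if (amt & 1) r = (uint8_t)((r << 1) | (r >> 7));
  return r;
}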
+
+ bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
+ bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
+ SupportedVectorVarShift(VT, Subtarget, ISD::SRL);
+
+ // Best to fallback for all supported variable shifts.
+ // AVX2 - best to fallback for non-constants as well.
+ // TODO - legalizers should be able to handle this.
+ if (LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
+ SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
+ AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
+ SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
+ SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
+ return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
+ }
+
+ // As with shifts, convert the rotation amount to a multiplication factor.
+ SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
+ assert(Scale && "Failed to convert ROTL amount to scale");
+
+ // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
+ if (EltSizeInBits == 16) {
+ SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
+ SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
+ return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
+ }
+
+ // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
+ // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
+ // that can then be OR'd with the lower 32-bits.
+ assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
+ static const int OddMask[] = {1, -1, 3, -1};
+ SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
+ SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
+
+ SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
+ DAG.getBitcast(MVT::v2i64, R),
+ DAG.getBitcast(MVT::v2i64, Scale));
+ SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
+ DAG.getBitcast(MVT::v2i64, R13),
+ DAG.getBitcast(MVT::v2i64, Scale13));
+ Res02 = DAG.getBitcast(VT, Res02);
+ Res13 = DAG.getBitcast(VT, Res13);
+
+ return DAG.getNode(ISD::OR, DL, VT,
+ DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
+ DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
}
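Both multiply-based rotate paths above (MUL/MULHU for v8i16 and PMULUDQ for v4i32) rely on the same observation: multiplying x by 2^c in double width puts x << c in the low half and x >> (bits - c) in the high half, so OR-ing the halves yields the rotate. A scalar sketch for 32-bit lanes (illustrative; name invented):

#include <cstdint>

// Rotate-left via one widening multiply: OR the low and high halves of x * 2^c.
static uint32_t rotl32_via_mul(uint32_t x, unsigned c) {
  uint64_t p = (uint64_t)x * ((uint64_t)1 << (c & 31));
  return (uint32_t)p | (uint32_t)(p >> 32);
}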
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
@@ -23353,9 +24072,6 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);
- if (N->getValueType(1) == MVT::i1)
- SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
-
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
}
@@ -23366,9 +24082,6 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);
- if (N->getValueType(1) == MVT::i1)
- SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
-
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
@@ -23572,11 +24285,68 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
return SDValue();
}
+// Create MOVMSKB, taking into account whether we need to split for AVX1.
+static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ MVT InVT = V.getSimpleValueType();
+
+ if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
+ Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
+ Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
+ Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
+ DAG.getConstant(16, DL, MVT::i8));
+ return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
+ }
+
+ return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
+}
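getPMOVMSKB packs the sign bit of every byte into a scalar mask, which is what makes the v16i1/v32i1 bitcasts below cheap without AVX512 k-registers. For the 128-bit case the intrinsics-level equivalent is simply (sketch; name invented):

#include <emmintrin.h>
#include <cstdint>

// Pack a v16i8 boolean vector (0 or -1 per byte) into a 16-bit scalar mask.
static uint16_t mask_from_bytes(__m128i boolBytes) {
  return (uint16_t)_mm_movemask_epi8(boolBytes);
}

The v32i8 split above does this twice and merges the two 16-bit halves with a shift and an OR.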
+
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- MVT SrcVT = Op.getOperand(0).getSimpleValueType();
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
MVT DstVT = Op.getSimpleValueType();
+ // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
+ // half to v32i1 and concatenating the result.
+ if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
+ assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
+ assert(Subtarget.hasBWI() && "Expected BWI target");
+ SDLoc dl(Op);
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
+ DAG.getIntPtrConstant(0, dl));
+ Lo = DAG.getBitcast(MVT::v32i1, Lo);
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
+ DAG.getIntPtrConstant(1, dl));
+ Hi = DAG.getBitcast(MVT::v32i1, Hi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
+ }
+
+ // Custom splitting for BWI types when AVX512F is available but BWI isn't.
+ if ((SrcVT == MVT::v32i16 || SrcVT == MVT::v64i8) && DstVT.isVector() &&
+ DAG.getTargetLoweringInfo().isTypeLegal(DstVT)) {
+ SDLoc dl(Op);
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
+ EVT CastVT = MVT::getVectorVT(DstVT.getVectorElementType(),
+ DstVT.getVectorNumElements() / 2);
+ Lo = DAG.getBitcast(CastVT, Lo);
+ Hi = DAG.getBitcast(CastVT, Hi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
+ }
+
+ // Use MOVMSK for vector to scalar conversion to prevent scalarization.
+ if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
+ assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
+ MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
+ SDLoc DL(Op);
+ SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
+ V = getPMOVMSKB(DL, V, DAG, Subtarget);
+ return DAG.getZExtOrTrunc(V, DL, DstVT);
+ }
+
if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
SrcVT == MVT::i64) {
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
@@ -23584,7 +24354,6 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
// This conversion needs to be expanded.
return SDValue();
- SDValue Op0 = Op->getOperand(0);
SmallVector<SDValue, 16> Elts;
SDLoc dl(Op);
unsigned NumElts;
@@ -23596,14 +24365,14 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
// Widen the vector in input in the case of MVT::v2i32.
// Example: from MVT::v2i32 to MVT::v4i32.
for (unsigned i = 0, e = NumElts; i != e; ++i)
- Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
+ Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Src,
DAG.getIntPtrConstant(i, dl)));
} else {
assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
"Unexpected source type in LowerBITCAST");
- Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
+ Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
DAG.getIntPtrConstant(0, dl)));
- Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
+ Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
DAG.getIntPtrConstant(1, dl)));
NumElts = 2;
SVT = MVT::i32;
@@ -23842,7 +24611,7 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
unsigned NumElems = VT.getVectorNumElements();
assert((VT.getVectorElementType() == MVT::i8 ||
VT.getVectorElementType() == MVT::i16) && "Unexpected type");
- if (NumElems <= 16) {
+ if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
@@ -24224,76 +24993,81 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
SDLoc dl(Op);
+ SDValue Scale = N->getScale();
SDValue Index = N->getIndex();
SDValue Mask = N->getMask();
SDValue Chain = N->getChain();
SDValue BasePtr = N->getBasePtr();
- MVT MemVT = N->getMemoryVT().getSimpleVT();
+
+ if (VT == MVT::v2f32) {
+ assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
+ // If the index is v2i64 and we have VLX we can use xmm for data and index.
+ if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
+ DAG.getUNDEF(MVT::v2f32));
+ SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
+ SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
+ SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
+ VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
+ DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
+ return SDValue(NewScatter.getNode(), 1);
+ }
+ return SDValue();
+ }
+
+ if (VT == MVT::v2i32) {
+ assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
+ DAG.getUNDEF(MVT::v2i32));
+ // If the index is v2i64 and we have VLX we can use xmm for data and index.
+ if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
+ SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
+ SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
+ SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
+ VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
+ DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
+ return SDValue(NewScatter.getNode(), 1);
+ }
+ // Custom widen all the operands to avoid promotion.
+ EVT NewIndexVT = EVT::getVectorVT(
+ *DAG.getContext(), Index.getValueType().getVectorElementType(), 4);
+ Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
+ DAG.getUNDEF(Index.getValueType()));
+ Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
+ DAG.getConstant(0, dl, MVT::v2i1));
+ SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
+ return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), N->getMemoryVT(), dl,
+ Ops, N->getMemOperand());
+ }
+
MVT IndexVT = Index.getSimpleValueType();
MVT MaskVT = Mask.getSimpleValueType();
- if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
- // The v2i32 value was promoted to v2i64.
- // Now we "redo" the type legalizer's work and widen the original
- // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
- // with a shuffle.
- assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
- "Unexpected memory type");
- int ShuffleMask[] = {0, 2, -1, -1};
- Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
- DAG.getUNDEF(MVT::v4i32), ShuffleMask);
- // Now we have 4 elements instead of 2.
- // Expand the index.
- MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
- Index = ExtendToType(Index, NewIndexVT, DAG);
-
- // Expand the mask with zeroes
- // Mask may be <2 x i64> or <2 x i1> at this moment
- assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
- "Unexpected mask type");
- MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
- Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
- VT = MVT::v4i32;
- }
+ // If the index is v2i32, we're being called by type legalization and we
+ // should just let the default handling take care of it.
+ if (IndexVT == MVT::v2i32)
+ return SDValue();
- unsigned NumElts = VT.getVectorNumElements();
+ // If we don't have VLX and neither the passthru or index is 512-bits, we
+ // need to widen until one is.
if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
!Index.getSimpleValueType().is512BitVector()) {
- // AVX512F supports only 512-bit vectors. Or data or index should
- // be 512 bit wide. If now the both index and data are 256-bit, but
- // the vector contains 8 elements, we just sign-extend the index
- if (IndexVT == MVT::v8i32)
- // Just extend index
- Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
- else {
- // The minimal number of elts in scatter is 8
- NumElts = 8;
- // Index
- MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
- // Use original index here, do not modify the index twice
- Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
- if (IndexVT.getScalarType() == MVT::i32)
- Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
-
- // Mask
- // At this point we have promoted mask operand
- assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
- MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
- // Use the original mask here, do not modify the mask twice
- Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
-
- // The value that should be stored
- MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
- Src = ExtendToType(Src, NewVT, DAG);
- }
- }
- // If the mask is "wide" at this point - truncate it to i1 vector
- MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
- Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
-
- // The mask is killed by scatter, add it to the values
- SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
- SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
+ // Determine how much we need to widen by to get a 512-bit type.
+ unsigned Factor = std::min(512/VT.getSizeInBits(),
+ 512/IndexVT.getSizeInBits());
+ unsigned NumElts = VT.getVectorNumElements() * Factor;
+
+ VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
+ IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
+ MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+
+ Src = ExtendToType(Src, VT, DAG);
+ Index = ExtendToType(Index, IndexVT, DAG);
+ Mask = ExtendToType(Mask, MaskVT, DAG, true);
+ }
+
+ SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
+ SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
@@ -24315,11 +25089,6 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
"Expanding masked load is supported for 32 and 64-bit types only!");
- // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
- // VLX. These types for exp-loads are handled here.
- if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
- return Op;
-
assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
"Cannot lower masked load op.");
@@ -24336,16 +25105,12 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
Src0 = ExtendToType(Src0, WideDataVT, DAG);
// Mask element has to be i1.
- MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
- assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
- "We handle 4x32, 4x64 and 2x64 vectors only in this case");
+ assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
+ "Unexpected mask type");
- MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
+ MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
- if (MaskEltTy != MVT::i1)
- Mask = DAG.getNode(ISD::TRUNCATE, dl,
- MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
N->getBasePtr(), Mask, Src0,
N->getMemoryVT(), N->getMemOperand(),
@@ -24374,10 +25139,6 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
"Expanding masked load is supported for 32 and 64-bit types only!");
- // 4x32 and 2x64 vectors of non-compressing stores are legal regardless to VLX.
- if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
- return Op;
-
assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
"Cannot lower masked store op.");
@@ -24392,17 +25153,13 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
// Mask element has to be i1.
- MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
- assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
- "We handle 4x32, 4x64 and 2x64 vectors only in this case");
+ assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
+ "Unexpected mask type");
- MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
+ MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
- if (MaskEltTy != MVT::i1)
- Mask = DAG.getNode(ISD::TRUNCATE, dl,
- MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
Mask, N->getMemoryVT(), N->getMemOperand(),
N->isTruncatingStore(), N->isCompressingStore());
@@ -24422,63 +25179,40 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
MVT IndexVT = Index.getSimpleValueType();
MVT MaskVT = Mask.getSimpleValueType();
- unsigned NumElts = VT.getVectorNumElements();
assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
// If the index is v2i32, we're being called by type legalization.
if (IndexVT == MVT::v2i32)
return SDValue();
+ // If we don't have VLX and neither the passthru or index is 512-bits, we
+ // need to widen until one is.
+ MVT OrigVT = VT;
if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
- !Index.getSimpleValueType().is512BitVector()) {
- // AVX512F supports only 512-bit vectors. Or data or index should
- // be 512 bit wide. If now the both index and data are 256-bit, but
- // the vector contains 8 elements, we just sign-extend the index
- if (NumElts == 8) {
- Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
- SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
- SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
- DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
- N->getMemOperand());
- return DAG.getMergeValues({NewGather, NewGather.getValue(2)}, dl);
- }
-
- // Minimal number of elements in Gather
- NumElts = 8;
- // Index
- MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
- Index = ExtendToType(Index, NewIndexVT, DAG);
- if (IndexVT.getScalarType() == MVT::i32)
- Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
-
- // Mask
- MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
- // At this point we have promoted mask operand
- assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
- MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
- Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
- Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
-
- // The pass-through value
- MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
- Src0 = ExtendToType(Src0, NewVT, DAG);
-
- SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
- SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
- DAG.getVTList(NewVT, MaskBitVT, MVT::Other), Ops, dl, N->getMemoryVT(),
- N->getMemOperand());
- SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
- NewGather.getValue(0),
- DAG.getIntPtrConstant(0, dl));
- SDValue RetOps[] = {Extract, NewGather.getValue(2)};
- return DAG.getMergeValues(RetOps, dl);
+ !IndexVT.is512BitVector()) {
+ // Determine how much we need to widen by to get a 512-bit type.
+ unsigned Factor = std::min(512/VT.getSizeInBits(),
+ 512/IndexVT.getSizeInBits());
+
+ unsigned NumElts = VT.getVectorNumElements() * Factor;
+
+ VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
+ IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
+ MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+
+ Src0 = ExtendToType(Src0, VT, DAG);
+ Index = ExtendToType(Index, IndexVT, DAG);
+ Mask = ExtendToType(Mask, MaskVT, DAG, true);
}
- SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
+ SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index,
+ N->getScale() };
SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
N->getMemOperand());
- return DAG.getMergeValues({NewGather, NewGather.getValue(2)}, dl);
+ SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
+ NewGather, DAG.getIntPtrConstant(0, dl));
+ return DAG.getMergeValues({Extract, NewGather.getValue(2)}, dl);
}
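The widening factor used by both the scatter and gather paths is min(512 / DataBits, 512 / IndexBits), so the data, index and mask all grow by the same factor until one of them reaches 512 bits. A small worked sketch of that computation (illustrative only):

#include <algorithm>

// e.g. v4i32 data (128 bits) with a v4i32 index (128 bits): factor 4,
// so the operation is widened to v16i32 data, v16i32 index and a v16i1 mask.
static unsigned widenFactor(unsigned DataBits, unsigned IndexBits) {
  return std::min(512u / DataBits, 512u / IndexBits);
}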
SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
@@ -24545,6 +25279,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
+ case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
@@ -24566,7 +25301,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
- case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
+ case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
+ case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
case ISD::FABS:
case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
@@ -24635,7 +25371,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::GC_TRANSITION_START:
return LowerGC_TRANSITION_START(Op, DAG);
case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
- case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG);
}
}
@@ -24676,19 +25411,13 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
auto InVT = N->getValueType(0);
- auto InVTSize = InVT.getSizeInBits();
- const unsigned RegSize =
- (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
- assert((Subtarget.hasBWI() || RegSize < 512) &&
- "512-bit vector requires AVX512BW");
- assert((Subtarget.hasAVX2() || RegSize < 256) &&
- "256-bit vector requires AVX2");
-
- auto ElemVT = InVT.getVectorElementType();
- auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
- RegSize / ElemVT.getSizeInBits());
- assert(RegSize % InVT.getSizeInBits() == 0);
- unsigned NumConcat = RegSize / InVT.getSizeInBits();
+ assert(InVT.getSizeInBits() < 128);
+ assert(128 % InVT.getSizeInBits() == 0);
+ unsigned NumConcat = 128 / InVT.getSizeInBits();
+
+ EVT RegVT = EVT::getVectorVT(*DAG.getContext(),
+ InVT.getVectorElementType(),
+ NumConcat * InVT.getVectorNumElements());
SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
Ops[0] = N->getOperand(0);
@@ -24697,12 +25426,32 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
- if (!ExperimentalVectorWideningLegalization)
+ if (getTypeAction(*DAG.getContext(), InVT) != TypeWidenVector)
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
DAG.getIntPtrConstant(0, dl));
Results.push_back(Res);
return;
}
+ case ISD::SETCC: {
+ // Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when
+ // setCC result type is v2i1 because type legalization will end up with
+ // a v4i1 setcc plus an extend.
+ assert(N->getValueType(0) == MVT::v2i32 && "Unexpected type");
+ if (N->getOperand(0).getValueType() != MVT::v2f32)
+ return;
+ SDValue UNDEF = DAG.getUNDEF(MVT::v2f32);
+ SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
+ N->getOperand(0), UNDEF);
+ SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
+ N->getOperand(1), UNDEF);
+ SDValue Res = DAG.getNode(ISD::SETCC, dl, MVT::v4i32, LHS, RHS,
+ N->getOperand(2));
+ if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
+ DAG.getIntPtrConstant(0, dl));
+ Results.push_back(Res);
+ return;
+ }
// We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
case X86ISD::FMINC:
case X86ISD::FMIN:
@@ -24731,12 +25480,14 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT: {
bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(0);
+ EVT SrcVT = Src.getValueType();
- if (N->getValueType(0) == MVT::v2i32) {
+ if (VT == MVT::v2i32) {
assert((IsSigned || Subtarget.hasAVX512()) &&
"Can only handle signed conversion without AVX512");
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
- SDValue Src = N->getOperand(0);
if (Src.getValueType() == MVT::v2f64) {
MVT ResVT = MVT::v4i32;
unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
@@ -24749,20 +25500,21 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Src, DAG.getIntPtrConstant(0, dl));
}
SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
- ResVT = ExperimentalVectorWideningLegalization ? MVT::v4i32
- : MVT::v2i32;
+ bool WidenType = getTypeAction(*DAG.getContext(),
+ MVT::v2i32) == TypeWidenVector;
+ ResVT = WidenType ? MVT::v4i32 : MVT::v2i32;
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Res,
DAG.getIntPtrConstant(0, dl));
Results.push_back(Res);
return;
}
- if (Src.getValueType() == MVT::v2f32) {
+ if (SrcVT == MVT::v2f32) {
SDValue Idx = DAG.getIntPtrConstant(0, dl);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
DAG.getUNDEF(MVT::v2f32));
Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
: ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
- if (!ExperimentalVectorWideningLegalization)
+ if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
Results.push_back(Res);
return;
@@ -24773,11 +25525,30 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
+ if (Subtarget.hasDQI() && VT == MVT::i64 &&
+ (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
+ assert(!Subtarget.is64Bit() && "i64 should be legal");
+ unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
+ // Using a 256-bit input here to guarantee 128-bit input for f32 case.
+ // TODO: Use 128-bit vectors for f64 case?
+ // TODO: Use 128-bit vectors for f32 by using CVTTP2SI/CVTTP2UI.
+ MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
+ MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), NumElts);
+
+ SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
+ SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
+ DAG.getConstantFP(0.0, dl, VecInVT), Src,
+ ZeroIdx);
+ Res = DAG.getNode(N->getOpcode(), SDLoc(N), VecVT, Res);
+ Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
+ Results.push_back(Res);
+ return;
+ }
+
std::pair<SDValue,SDValue> Vals =
FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
SDValue FIST = Vals.first, StackSlot = Vals.second;
if (FIST.getNode()) {
- EVT VT = N->getValueType(0);
// Return a load from the stack slot.
if (StackSlot.getNode())
Results.push_back(
@@ -24963,6 +25734,32 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
EVT DstVT = N->getValueType(0);
EVT SrcVT = N->getOperand(0).getValueType();
+ // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
+ // we can split using the k-register rather than memory.
+ if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
+ assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
+ Lo = DAG.getBitcast(MVT::i32, Lo);
+ Hi = DAG.getBitcast(MVT::i32, Hi);
+ SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+ Results.push_back(Res);
+ return;
+ }
+
+ // Custom splitting for BWI types when AVX512F is available but BWI isn't.
+ if ((DstVT == MVT::v32i16 || DstVT == MVT::v64i8) &&
+ SrcVT.isVector() && isTypeLegal(SrcVT)) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
+ MVT CastVT = (DstVT == MVT::v32i16) ? MVT::v16i16 : MVT::v32i8;
+ Lo = DAG.getBitcast(CastVT, Lo);
+ Hi = DAG.getBitcast(CastVT, Hi);
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
+ Results.push_back(Res);
+ return;
+ }
+
if (SrcVT != MVT::f64 ||
(DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
return;
@@ -24974,7 +25771,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
MVT::v2f64, N->getOperand(0));
SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
- if (ExperimentalVectorWideningLegalization) {
+ if (getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector) {
// If we are legalizing vectors by widening, we already have the desired
// legal vector type, just return it.
Results.push_back(ToVecInt);
@@ -25009,7 +25806,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
}
SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
- Index };
+ Index, Gather->getScale() };
SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl,
Gather->getMemoryVT(), Gather->getMemOperand());
@@ -25036,12 +25833,12 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
}
SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
- Index };
+ Index, Gather->getScale() };
SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl,
Gather->getMemoryVT(), Gather->getMemOperand());
SDValue Chain = Res.getValue(2);
- if (!ExperimentalVectorWideningLegalization)
+ if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
DAG.getIntPtrConstant(0, dl));
Results.push_back(Res);
@@ -25057,12 +25854,12 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
DAG.getConstant(0, dl, MVT::v2i1));
SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
- Index };
+ Index, Gather->getScale() };
SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other),
Gather->getMemoryVT(), dl, Ops,
Gather->getMemOperand());
SDValue Chain = Res.getValue(1);
- if (!ExperimentalVectorWideningLegalization)
+ if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
DAG.getIntPtrConstant(0, dl));
Results.push_back(Res);
@@ -25101,7 +25898,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::COMI: return "X86ISD::COMI";
case X86ISD::UCOMI: return "X86ISD::UCOMI";
case X86ISD::CMPM: return "X86ISD::CMPM";
- case X86ISD::CMPMU: return "X86ISD::CMPMU";
case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
case X86ISD::SETCC: return "X86ISD::SETCC";
case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
@@ -25192,7 +25988,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
- case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
case X86ISD::VSHL: return "X86ISD::VSHL";
@@ -25208,8 +26003,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::CMPP: return "X86ISD::CMPP";
case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
- case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
- case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";
case X86ISD::ADD: return "X86ISD::ADD";
case X86ISD::SUB: return "X86ISD::SUB";
@@ -25226,14 +26019,14 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::OR: return "X86ISD::OR";
case X86ISD::XOR: return "X86ISD::XOR";
case X86ISD::AND: return "X86ISD::AND";
+ case X86ISD::BEXTR: return "X86ISD::BEXTR";
case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
case X86ISD::PTEST: return "X86ISD::PTEST";
case X86ISD::TESTP: return "X86ISD::TESTP";
- case X86ISD::TESTM: return "X86ISD::TESTM";
- case X86ISD::TESTNM: return "X86ISD::TESTNM";
case X86ISD::KORTEST: return "X86ISD::KORTEST";
case X86ISD::KTEST: return "X86ISD::KTEST";
+ case X86ISD::KADD: return "X86ISD::KADD";
case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
case X86ISD::PACKSS: return "X86ISD::PACKSS";
@@ -25251,8 +26044,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::SHUF128: return "X86ISD::SHUF128";
case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
- case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
- case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
@@ -25268,7 +26059,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
case X86ISD::VPERMV: return "X86ISD::VPERMV";
case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
- case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
case X86ISD::VPERMI: return "X86ISD::VPERMI";
case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
@@ -25308,26 +26098,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
- case X86ISD::FMADDS1: return "X86ISD::FMADDS1";
- case X86ISD::FNMADDS1: return "X86ISD::FNMADDS1";
- case X86ISD::FMSUBS1: return "X86ISD::FMSUBS1";
- case X86ISD::FNMSUBS1: return "X86ISD::FNMSUBS1";
- case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
- case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
- case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
- case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
- case X86ISD::FMADDS3: return "X86ISD::FMADDS3";
- case X86ISD::FNMADDS3: return "X86ISD::FNMADDS3";
- case X86ISD::FMSUBS3: return "X86ISD::FMSUBS3";
- case X86ISD::FNMSUBS3: return "X86ISD::FNMSUBS3";
- case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
- case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
- case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
- case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
- case X86ISD::FMADD4S: return "X86ISD::FMADD4S";
- case X86ISD::FNMADD4S: return "X86ISD::FNMADD4S";
- case X86ISD::FMSUB4S: return "X86ISD::FMSUB4S";
- case X86ISD::FNMSUB4S: return "X86ISD::FNMSUB4S";
case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
@@ -25342,8 +26112,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND";
case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND";
- case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
- case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
+ case X86ISD::PCMPESTR: return "X86ISD::PCMPESTR";
+ case X86ISD::PCMPISTR: return "X86ISD::PCMPISTR";
case X86ISD::XTEST: return "X86ISD::XTEST";
case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
case X86ISD::EXPAND: return "X86ISD::EXPAND";
@@ -25412,6 +26182,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB";
case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB";
case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB";
+ case X86ISD::NT_CALL: return "X86ISD::NT_CALL";
+ case X86ISD::NT_BRIND: return "X86ISD::NT_BRIND";
+ case X86ISD::UMWAIT: return "X86ISD::UMWAIT";
+ case X86ISD::TPAUSE: return "X86ISD::TPAUSE";
}
return nullptr;
}
@@ -25478,11 +26252,20 @@ bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
if (Bits == 8)
return false;
+ // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
+ if (Subtarget.hasXOP() && Ty->getPrimitiveSizeInBits() == 128 &&
+ (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
+ return false;
+
// AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
// shifts just as cheap as scalar ones.
if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
return false;
+ // AVX512BW has shifts such as vpsllvw.
+ if (Subtarget.hasBWI() && Bits == 16)
+ return false;
+
// Otherwise, it's significantly cheaper to shift by a scalar amount than by a
// fully general vector.
return true;
@@ -25561,7 +26344,15 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
return false;
}
-bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
+bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
+ EVT SrcVT = ExtVal.getOperand(0).getValueType();
+
+ // There is no extending load for vXi1.
+ if (SrcVT.getScalarType() == MVT::i1)
+ return false;
+
+ return true;
+}
bool
X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
@@ -25610,13 +26401,27 @@ bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
return isTypeLegal(VT.getSimpleVT());
}
-bool
-X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
- EVT VT) const {
+bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
+ EVT VT) const {
+ // Don't convert an 'and' into a shuffle that we don't directly support.
+ // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
+ if (!Subtarget.hasAVX2())
+ if (VT == MVT::v32i8 || VT == MVT::v16i16)
+ return false;
+
// Just delegate to the generic legality, clear masks aren't special.
return isShuffleMaskLegal(Mask, VT);
}
+bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
+ // If the subtarget is using retpolines, we need to not generate jump tables.
+ if (Subtarget.useRetpoline())
+ return false;
+
+ // Otherwise, fallback on the generic logic.
+ return TargetLowering::areJTsAllowed(Fn);
+}
+
//===----------------------------------------------------------------------===//
// X86 Scheduler Hooks
//===----------------------------------------------------------------------===//
@@ -25697,79 +26502,6 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
return sinkMBB;
}
-// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
-// or XMM0_V32I8 in AVX all of this code can be replaced with that
-// in the .td file.
-static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
- const TargetInstrInfo *TII) {
- unsigned Opc;
- switch (MI.getOpcode()) {
- default: llvm_unreachable("illegal opcode!");
- case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
- case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
- case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
- case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
- case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
- case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
- case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
- case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
- }
-
- DebugLoc dl = MI.getDebugLoc();
- MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
-
- unsigned NumArgs = MI.getNumOperands();
- for (unsigned i = 1; i < NumArgs; ++i) {
- MachineOperand &Op = MI.getOperand(i);
- if (!(Op.isReg() && Op.isImplicit()))
- MIB.add(Op);
- }
- if (MI.hasOneMemOperand())
- MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
-
- BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
- .addReg(X86::XMM0);
-
- MI.eraseFromParent();
- return BB;
-}
-
-// FIXME: Custom handling because TableGen doesn't support multiple implicit
-// defs in an instruction pattern
-static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
- const TargetInstrInfo *TII) {
- unsigned Opc;
- switch (MI.getOpcode()) {
- default: llvm_unreachable("illegal opcode!");
- case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
- case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
- case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
- case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
- case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
- case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
- case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
- case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
- }
-
- DebugLoc dl = MI.getDebugLoc();
- MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
-
- unsigned NumArgs = MI.getNumOperands(); // remove the results
- for (unsigned i = 1; i < NumArgs; ++i) {
- MachineOperand &Op = MI.getOperand(i);
- if (!(Op.isReg() && Op.isImplicit()))
- MIB.add(Op);
- }
- if (MI.hasOneMemOperand())
- MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
-
- BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
- .addReg(X86::ECX);
-
- MI.eraseFromParent();
- return BB;
-}
-
static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
const X86Subtarget &Subtarget) {
DebugLoc dl = MI.getDebugLoc();
@@ -26158,7 +26890,7 @@ MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
!MI.getOperand(MI.getNumOperands() - 1).isReg() ||
MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
"Expected last argument to be EFLAGS");
- unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
+ unsigned MOVOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
// In the XMM save block, save all the XMM argument registers.
for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
@@ -26919,6 +27651,184 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
return BB;
}
+static unsigned getOpcodeForRetpoline(unsigned RPOpc) {
+ switch (RPOpc) {
+ case X86::RETPOLINE_CALL32:
+ return X86::CALLpcrel32;
+ case X86::RETPOLINE_CALL64:
+ return X86::CALL64pcrel32;
+ case X86::RETPOLINE_TCRETURN32:
+ return X86::TCRETURNdi;
+ case X86::RETPOLINE_TCRETURN64:
+ return X86::TCRETURNdi64;
+ }
+ llvm_unreachable("not retpoline opcode");
+}
+
+static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
+ unsigned Reg) {
+ if (Subtarget.useRetpolineExternalThunk()) {
+ // When using an external thunk for retpolines, we pick names that match the
+ // names GCC happens to use as well. This helps simplify the implementation
+ // of the thunks for kernels where they have no easy ability to create
+ // aliases and are doing non-trivial configuration of the thunk's body. For
+ // example, the Linux kernel will do boot-time hot patching of the thunk
+ // bodies and cannot easily export aliases of these to loaded modules.
+ //
+ // Note that at any point in the future, we may need to change the semantics
+ // of how we implement retpolines and at that time will likely change the
+ // name of the called thunk. Essentially, there is no hard guarantee that
+ // LLVM will generate calls to specific thunks; we merely make a best-effort
+ // attempt to help out kernels and other systems where duplicating the
+ // thunks is costly.
+ switch (Reg) {
+ case X86::EAX:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__x86_indirect_thunk_eax";
+ case X86::ECX:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__x86_indirect_thunk_ecx";
+ case X86::EDX:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__x86_indirect_thunk_edx";
+ case X86::EDI:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__x86_indirect_thunk_edi";
+ case X86::R11:
+ assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
+ return "__x86_indirect_thunk_r11";
+ }
+ llvm_unreachable("unexpected reg for retpoline");
+ }
+
+ // When targeting an internal COMDAT thunk, use an LLVM-specific name.
+ switch (Reg) {
+ case X86::EAX:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_eax";
+ case X86::ECX:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_ecx";
+ case X86::EDX:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_edx";
+ case X86::EDI:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_edi";
+ case X86::R11:
+ assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
+ return "__llvm_retpoline_r11";
+ }
+ llvm_unreachable("unexpected reg for retpoline");
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ // Copy the virtual register into the R11 physical register and
+ // call the retpoline thunk.
+ DebugLoc DL = MI.getDebugLoc();
+ const X86InstrInfo *TII = Subtarget.getInstrInfo();
+ unsigned CalleeVReg = MI.getOperand(0).getReg();
+ unsigned Opc = getOpcodeForRetpoline(MI.getOpcode());
+
+ // Find an available scratch register to hold the callee. On 64-bit, we can
+ // just use R11, but we scan for uses anyway to ensure we don't generate
+ // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
+ // already a register use operand to the call to hold the callee. If none
+ // are available, use EDI instead. EDI is chosen because EBX is the PIC base
+ // register and ESI is the base pointer to realigned stack frames with VLAs.
+ SmallVector<unsigned, 3> AvailableRegs;
+ if (Subtarget.is64Bit())
+ AvailableRegs.push_back(X86::R11);
+ else
+ AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
+
+ // Zero out any registers that are already used.
+ for (const auto &MO : MI.operands()) {
+ if (MO.isReg() && MO.isUse())
+ for (unsigned &Reg : AvailableRegs)
+ if (Reg == MO.getReg())
+ Reg = 0;
+ }
+
+ // Choose the first remaining non-zero available register.
+ unsigned AvailableReg = 0;
+ for (unsigned MaybeReg : AvailableRegs) {
+ if (MaybeReg) {
+ AvailableReg = MaybeReg;
+ break;
+ }
+ }
+ if (!AvailableReg)
+ report_fatal_error("calling convention incompatible with retpoline, no "
+ "available registers");
+
+ const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg);
+
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
+ .addReg(CalleeVReg);
+ MI.getOperand(0).ChangeToES(Symbol);
+ MI.setDesc(TII->get(Opc));
+ MachineInstrBuilder(*BB->getParent(), &MI)
+ .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
+ return BB;
+}
+
+/// SetJmp implies future control flow change upon calling the corresponding
+/// LongJmp.
+/// Instead of using the 'return' instruction, the long jump fixes the stack and
+/// performs an indirect branch. To do so it uses the registers that were stored
+/// in the jump buffer (when calling SetJmp).
+/// If the shadow stack is enabled, we need to fix it as well, because some
+/// return addresses will be skipped.
+/// The function will save the SSP for future fixing in the function
+/// emitLongJmpShadowStackFix.
+/// \sa emitLongJmpShadowStackFix
+/// \param [in] MI The temporary Machine Instruction for the builtin.
+/// \param [in] MBB The Machine Basic Block that will be modified.
+void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ DebugLoc DL = MI.getDebugLoc();
+ MachineFunction *MF = MBB->getParent();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineInstrBuilder MIB;
+
+ // Memory Reference.
+ MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
+
+ // Initialize a register with zero.
+ MVT PVT = getPointerTy(MF->getDataLayout());
+ const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
+ unsigned ZReg = MRI.createVirtualRegister(PtrRC);
+ unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
+ BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
+ .addDef(ZReg)
+ .addReg(ZReg, RegState::Undef)
+ .addReg(ZReg, RegState::Undef);
+
+ // Read the current SSP Register value to the zeroed register.
+ unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC);
+ unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
+ BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
+
+ // Write the SSP register value to offset 3 in input memory buffer.
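+ // (Assumption, for orientation only: slots 0-2 of the buffer are expected to
+ // hold the frame pointer, resume address and stack pointer stored by the
+ // setjmp lowering, so the saved SSP lands in the next free slot at
+ // 3 * pointer-size bytes.)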
+ unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
+ MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
+ const int64_t SSPOffset = 3 * PVT.getStoreSize();
+ const unsigned MemOpndSlot = 1;
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ if (i == X86::AddrDisp)
+ MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
+ else
+ MIB.add(MI.getOperand(MemOpndSlot + i));
+ }
+ MIB.addReg(SSPCopyReg);
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+}
+
MachineBasicBlock *
X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
@@ -27028,6 +27938,11 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
else
MIB.addMBB(restoreMBB);
MIB.setMemRefs(MMOBegin, MMOEnd);
+
+ if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
+ emitSetJmpShadowStackFix(MI, thisMBB);
+ }
+
// Setup
MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
.addMBB(restoreMBB);
@@ -27069,6 +27984,183 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
return sinkMBB;
}
+/// Fix the shadow stack using the previously saved SSP pointer.
+/// \sa emitSetJmpShadowStackFix
+/// \param [in] MI The temporary Machine Instruction for the builtin.
+/// \param [in] MBB The Machine Basic Block that will be modified.
+/// \return The sink MBB that will perform the future indirect branch.
+MachineBasicBlock *
+X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ DebugLoc DL = MI.getDebugLoc();
+ MachineFunction *MF = MBB->getParent();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ // Memory Reference
+ MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
+
+ MVT PVT = getPointerTy(MF->getDataLayout());
+ const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
+
+ // checkSspMBB:
+ // xor vreg1, vreg1
+ // rdssp vreg1
+ // test vreg1, vreg1
+ // je sinkMBB # Jump if Shadow Stack is not supported
+ // fallMBB:
+ // mov buf+24/12(%rip), vreg2
+ // sub vreg1, vreg2
+ // jbe sinkMBB # No need to fix the Shadow Stack
+ // fixShadowMBB:
+ // shr 3/2, vreg2
+ // incssp vreg2 # fix the SSP according to the lower 8 bits
+ // shr 8, vreg2
+ // je sinkMBB
+ // fixShadowLoopPrepareMBB:
+ // shl vreg2
+ // mov 128, vreg3
+ // fixShadowLoopMBB:
+ // incssp vreg3
+ // dec vreg2
+ // jne fixShadowLoopMBB # Iterate until you finish fixing
+ // # the Shadow Stack
+ // sinkMBB:
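+ //
+ // Rough arithmetic behind the block structure sketched above:
+ // (PrevSSP - SSP) / pointer-size is the number of shadow stack slots to
+ // retire. Because incssp consumes only the low 8 bits of its operand, the
+ // first incssp retires up to 255 slots; the remaining (slots >> 8) blocks of
+ // 256 slots are retired by the loop, which runs twice per block and advances
+ // the SSP by 128 slots each iteration.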
+
+ MachineFunction::iterator I = ++MBB->getIterator();
+ const BasicBlock *BB = MBB->getBasicBlock();
+
+ MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(I, checkSspMBB);
+ MF->insert(I, fallMBB);
+ MF->insert(I, fixShadowMBB);
+ MF->insert(I, fixShadowLoopPrepareMBB);
+ MF->insert(I, fixShadowLoopMBB);
+ MF->insert(I, sinkMBB);
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
+ MBB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ MBB->addSuccessor(checkSspMBB);
+
+ // Initialize a register with zero.
+ unsigned ZReg = MRI.createVirtualRegister(PtrRC);
+ unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
+ BuildMI(checkSspMBB, DL, TII->get(XorRROpc))
+ .addDef(ZReg)
+ .addReg(ZReg, RegState::Undef)
+ .addReg(ZReg, RegState::Undef);
+
+ // Read the current SSP Register value to the zeroed register.
+ unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC);
+ unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
+ BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
+
+ // Check whether the result of the SSP register is zero and jump directly
+ // to the sink.
+ unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
+ BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
+ .addReg(SSPCopyReg)
+ .addReg(SSPCopyReg);
+ BuildMI(checkSspMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB);
+ checkSspMBB->addSuccessor(sinkMBB);
+ checkSspMBB->addSuccessor(fallMBB);
+
+ // Reload the previously saved SSP register value.
+ unsigned PrevSSPReg = MRI.createVirtualRegister(PtrRC);
+ unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
+ const int64_t SSPOffset = 3 * PVT.getStoreSize();
+ MachineInstrBuilder MIB =
+ BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ if (i == X86::AddrDisp)
+ MIB.addDisp(MI.getOperand(i), SSPOffset);
+ else
+ MIB.add(MI.getOperand(i));
+ }
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+
+ // Subtract the current SSP from the previous SSP.
+ unsigned SspSubReg = MRI.createVirtualRegister(PtrRC);
+ unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
+ BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
+ .addReg(PrevSSPReg)
+ .addReg(SSPCopyReg);
+
+ // Jump to sink in case PrevSSPReg <= SSPCopyReg.
+ BuildMI(fallMBB, DL, TII->get(X86::JBE_1)).addMBB(sinkMBB);
+ fallMBB->addSuccessor(sinkMBB);
+ fallMBB->addSuccessor(fixShadowMBB);
+
+ // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
+ unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
+ unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
+ unsigned SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
+ BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
+ .addReg(SspSubReg)
+ .addImm(Offset);
+
+ // Increase SSP when looking only at the lower 8 bits of the delta.
+ unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
+ BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
+
+ // Reset the lower 8 bits.
+ unsigned SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
+ BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
+ .addReg(SspFirstShrReg)
+ .addImm(8);
+
+ // Jump if the result of the shift is zero.
+ BuildMI(fixShadowMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB);
+ fixShadowMBB->addSuccessor(sinkMBB);
+ fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
+
+ // Do a single shift left.
+ unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
+ unsigned SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
+ BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
+ .addReg(SspSecondShrReg);
+
+ // Save the value 128 to a register (will be used next with incssp).
+ unsigned Value128InReg = MRI.createVirtualRegister(PtrRC);
+ unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
+ BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
+ .addImm(128);
+ fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
+
+ // Since incssp only looks at the lower 8 bits, we might need to do several
+ // iterations of incssp until we finish fixing the shadow stack.
+ unsigned DecReg = MRI.createVirtualRegister(PtrRC);
+ unsigned CounterReg = MRI.createVirtualRegister(PtrRC);
+ BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
+ .addReg(SspAfterShlReg)
+ .addMBB(fixShadowLoopPrepareMBB)
+ .addReg(DecReg)
+ .addMBB(fixShadowLoopMBB);
+
+ // Every iteration we increase the SSP by 128.
+ BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
+
+ // Every iteration we decrement the counter by 1.
+ unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
+ BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
+
+ // Jump if the counter is not zero yet.
+ BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JNE_1)).addMBB(fixShadowLoopMBB);
+ fixShadowLoopMBB->addSuccessor(sinkMBB);
+ fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
+
+ return sinkMBB;
+}
+
MachineBasicBlock *
X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
@@ -27101,13 +28193,21 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
+ MachineBasicBlock *thisMBB = MBB;
+
+ // When CET and the shadow stack are enabled, we need to fix the Shadow Stack.
+ if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
+ thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
+ }
+
// Reload FP
- MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
MIB.add(MI.getOperand(i));
MIB.setMemRefs(MMOBegin, MMOEnd);
+
// Reload IP
- MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(i), LabelOffset);
@@ -27115,8 +28215,9 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
MIB.add(MI.getOperand(i));
}
MIB.setMemRefs(MMOBegin, MMOEnd);
+
// Reload SP
- MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(i), SPOffset);
@@ -27124,11 +28225,12 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
MIB.add(MI.getOperand(i));
}
MIB.setMemRefs(MMOBegin, MMOEnd);
+
// Jump
- BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
+ BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
MI.eraseFromParent();
- return MBB;
+ return thisMBB;
}
void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
@@ -27201,7 +28303,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
MCSymbol *Sym = nullptr;
for (const auto &MI : MBB) {
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
assert(MI.isEHLabel() && "expected EH_LABEL");
@@ -27419,21 +28521,16 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
switch (MI.getOpcode()) {
default: llvm_unreachable("Unexpected instr type to insert");
- case X86::TAILJMPd64:
- case X86::TAILJMPr64:
- case X86::TAILJMPm64:
- case X86::TAILJMPr64_REX:
- case X86::TAILJMPm64_REX:
- llvm_unreachable("TAILJMP64 would not be touched here.");
- case X86::TCRETURNdi64:
- case X86::TCRETURNri64:
- case X86::TCRETURNmi64:
- return BB;
case X86::TLS_addr32:
case X86::TLS_addr64:
case X86::TLS_base_addr32:
case X86::TLS_base_addr64:
return EmitLoweredTLSAddr(MI, BB);
+ case X86::RETPOLINE_CALL32:
+ case X86::RETPOLINE_CALL64:
+ case X86::RETPOLINE_TCRETURN32:
+ case X86::RETPOLINE_TCRETURN64:
+ return EmitLoweredRetpoline(MI, BB);
case X86::CATCHRET:
return EmitLoweredCatchRet(MI, BB);
case X86::CATCHPAD:
@@ -27446,7 +28543,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return EmitLoweredTLSCall(MI, BB);
case X86::CMOV_FR32:
case X86::CMOV_FR64:
- case X86::CMOV_FR128:
+ case X86::CMOV_F128:
case X86::CMOV_GR8:
case X86::CMOV_GR16:
case X86::CMOV_GR32:
@@ -27474,11 +28571,16 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
- // Permit reads of the FLAGS register without it being defined.
+ // Permit reads of the EFLAGS and DF registers without them being defined.
// This intrinsic exists to read external processor state in flags, such as
// the trap flag, interrupt flag, and direction flag, none of which are
// modeled by the backend.
+ assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
+ "Unexpected register in operand!");
Push->getOperand(2).setIsUndef();
+ assert(Push->getOperand(3).getReg() == X86::DF &&
+ "Unexpected register in operand!");
+ Push->getOperand(3).setIsUndef();
BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
MI.eraseFromParent(); // The pseudo is gone now.
@@ -27561,32 +28663,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
- // String/text processing lowering.
- case X86::PCMPISTRM128REG:
- case X86::VPCMPISTRM128REG:
- case X86::PCMPISTRM128MEM:
- case X86::VPCMPISTRM128MEM:
- case X86::PCMPESTRM128REG:
- case X86::VPCMPESTRM128REG:
- case X86::PCMPESTRM128MEM:
- case X86::VPCMPESTRM128MEM:
- assert(Subtarget.hasSSE42() &&
- "Target must have SSE4.2 or AVX features enabled");
- return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
-
- // String/text processing lowering.
- case X86::PCMPISTRIREG:
- case X86::VPCMPISTRIREG:
- case X86::PCMPISTRIMEM:
- case X86::VPCMPISTRIMEM:
- case X86::PCMPESTRIREG:
- case X86::VPCMPESTRIREG:
- case X86::PCMPESTRIMEM:
- case X86::VPCMPESTRIMEM:
- assert(Subtarget.hasSSE42() &&
- "Target must have SSE4.2 or AVX features enabled");
- return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
-
// Thread synchronization.
case X86::MONITOR:
return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
@@ -27633,8 +28709,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return emitPatchPoint(MI, BB);
case TargetOpcode::PATCHABLE_EVENT_CALL:
- // Do nothing here, handle in xray instrumentation pass.
- return BB;
+ return emitXRayCustomEvent(MI, BB);
+
+ case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
+ return emitXRayTypedEvent(MI, BB);
case X86::LCMPXCHG8B: {
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
@@ -27702,6 +28780,65 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
// X86 Optimization Hooks
//===----------------------------------------------------------------------===//
+bool
+X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
+ const APInt &Demanded,
+ TargetLoweringOpt &TLO) const {
+ // Only optimize Ands to prevent shrinking a constant that could be
+ // matched by movzx.
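+ // Worked example (constants chosen purely for illustration): for an i32
+ // 'and' with Mask = 0x1FFF and Demanded = 0x0FFF, ShrunkMask = 0x0FFF has 12
+ // active bits, which rounds up to a width of 16, so ZeroExtendMask becomes
+ // 0xFFFF. That mask is covered by Mask | ~Demanded, so the constant is
+ // replaced with 0xFFFF and the 'and' remains matchable as a zero extend
+ // (movzwl).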
+ if (Op.getOpcode() != ISD::AND)
+ return false;
+
+ EVT VT = Op.getValueType();
+
+ // Ignore vectors.
+ if (VT.isVector())
+ return false;
+
+ unsigned Size = VT.getSizeInBits();
+
+ // Make sure the RHS really is a constant.
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ if (!C)
+ return false;
+
+ const APInt &Mask = C->getAPIntValue();
+
+ // Clear all non-demanded bits initially.
+ APInt ShrunkMask = Mask & Demanded;
+
+ // Find the width of the shrunk mask.
+ unsigned Width = ShrunkMask.getActiveBits();
+
+ // If the mask is all 0s there's nothing to do here.
+ if (Width == 0)
+ return false;
+
+ // Find the next power of 2 width, rounding up to a byte.
+ Width = PowerOf2Ceil(std::max(Width, 8U));
+ // Truncate the width to size to handle illegal types.
+ Width = std::min(Width, Size);
+
+ // Calculate a possible zero extend mask for this constant.
+ APInt ZeroExtendMask = APInt::getLowBitsSet(Size, Width);
+
+ // If we aren't changing the mask, just return true to keep it and prevent
+ // the caller from optimizing.
+ if (ZeroExtendMask == Mask)
+ return true;
+
+ // Make sure the new mask can be represented by a combination of mask bits
+ // and non-demanded bits.
+ if (!ZeroExtendMask.isSubsetOf(Mask | ~Demanded))
+ return false;
+
+ // Replace the constant with the zero extend mask.
+ SDLoc DL(Op);
+ SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
+ SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
+ return TLO.CombineTo(Op, NewOp);
+}
+
void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
KnownBits &Known,
const APInt &DemandedElts,
@@ -27763,6 +28900,19 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
}
break;
}
+ case X86ISD::PACKUS: {
+ // PACKUS is just a truncation if the upper half is zero.
+ // TODO: Add DemandedElts support.
+ KnownBits Known2;
+ DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
+ DAG.computeKnownBits(Op.getOperand(1), Known2, Depth + 1);
+ Known.One &= Known2.One;
+ Known.Zero &= Known2.Zero;
+ if (Known.countMinLeadingZeros() < BitWidth)
+ Known.resetAll();
+ Known = Known.trunc(BitWidth);
+ break;
+ }
case X86ISD::VZEXT: {
// TODO: Add DemandedElts support.
SDValue N0 = Op.getOperand(0);
@@ -27801,6 +28951,57 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
Known.Zero.setBitsFrom(8);
break;
}
+
+ // Handle target shuffles.
+ // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
+ if (isTargetShuffle(Opc)) {
+ bool IsUnary;
+ SmallVector<int, 64> Mask;
+ SmallVector<SDValue, 2> Ops;
+ if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
+ IsUnary)) {
+ unsigned NumOps = Ops.size();
+ unsigned NumElts = VT.getVectorNumElements();
+ if (Mask.size() == NumElts) {
+ SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
+ Known.Zero.setAllBits(); Known.One.setAllBits();
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (!DemandedElts[i])
+ continue;
+ int M = Mask[i];
+ if (M == SM_SentinelUndef) {
+ // For UNDEF elements, we don't know anything about the common state
+ // of the shuffle result.
+ Known.resetAll();
+ break;
+ } else if (M == SM_SentinelZero) {
+ Known.One.clearAllBits();
+ continue;
+ }
+ assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
+ "Shuffle index out of range");
+
+ unsigned OpIdx = (unsigned)M / NumElts;
+ unsigned EltIdx = (unsigned)M % NumElts;
+ if (Ops[OpIdx].getValueType() != VT) {
+ // TODO - handle target shuffle ops with different value types.
+ Known.resetAll();
+ break;
+ }
+ DemandedOps[OpIdx].setBit(EltIdx);
+ }
+ // Known bits are the values that are shared by every demanded element.
+ for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
+ if (!DemandedOps[i])
+ continue;
+ KnownBits Known2;
+ DAG.computeKnownBits(Ops[i], Known2, DemandedOps[i], Depth + 1);
+ Known.One &= Known2.One;
+ Known.Zero &= Known2.Zero;
+ }
+ }
+ }
+ }
}
unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
@@ -27917,12 +29118,21 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
bool AllowFloatDomain, bool AllowIntDomain,
- SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
+ SDValue &V1, const SDLoc &DL,
+ SelectionDAG &DAG,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
unsigned NumMaskElts = Mask.size();
unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
+ // Match against a VZEXT_MOVL vXi32 zero-extending instruction.
+ if (MaskEltSize == 32 && isUndefOrEqual(Mask[0], 0) &&
+ isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
+ Shuffle = X86ISD::VZEXT_MOVL;
+ SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
+ return true;
+ }
+
// Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
// TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
@@ -28165,7 +29375,7 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
bool AllowFloatDomain, bool AllowIntDomain,
- SDValue &V1, SDValue &V2, SDLoc &DL,
+ SDValue &V1, SDValue &V2, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
@@ -28175,27 +29385,28 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
if (MaskVT.is128BitVector()) {
if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
V2 = V1;
- Shuffle = X86ISD::MOVLHPS;
- SrcVT = DstVT = MVT::v4f32;
+ V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
+ Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
+ SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
return true;
}
if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
V2 = V1;
- Shuffle = X86ISD::MOVHLPS;
- SrcVT = DstVT = MVT::v4f32;
+ Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
+ SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
return true;
}
if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
(AllowFloatDomain || !Subtarget.hasSSE41())) {
std::swap(V1, V2);
Shuffle = X86ISD::MOVSD;
- SrcVT = DstVT = MaskVT;
+ SrcVT = DstVT = MVT::v2f64;
return true;
}
if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
(AllowFloatDomain || !Subtarget.hasSSE41())) {
Shuffle = X86ISD::MOVSS;
- SrcVT = DstVT = MaskVT;
+ SrcVT = DstVT = MVT::v4f32;
return true;
}
}
@@ -28228,15 +29439,11 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
return false;
}
-static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
- const APInt &Zeroable,
- bool AllowFloatDomain,
- bool AllowIntDomain,
- SDValue &V1, SDValue &V2, SDLoc &DL,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- unsigned &Shuffle, MVT &ShuffleVT,
- unsigned &PermuteImm) {
+static bool matchBinaryPermuteVectorShuffle(
+ MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
+ bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
+ const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
+ unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
unsigned NumMaskElts = Mask.size();
unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
@@ -28385,7 +29592,7 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
return false;
}
-/// \brief Combine an arbitrary chain of shuffles into a single instruction if
+/// Combine an arbitrary chain of shuffles into a single instruction if
/// possible.
///
/// This is the leaf of the recursive combine below. When we have found some
@@ -28397,7 +29604,6 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
ArrayRef<int> BaseMask, int Depth,
bool HasVariableMask, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
assert((Inputs.size() == 1 || Inputs.size() == 2) &&
@@ -28430,6 +29636,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
unsigned NumRootElts = RootVT.getVectorNumElements();
unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
+ (RootVT.isFloatingPoint() && Depth >= 2) ||
(RootVT.is256BitVector() && !Subtarget.hasAVX2());
// Don't combine if we are a AVX512/EVEX target and the mask element size
@@ -28458,11 +29665,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
Res = DAG.getBitcast(ShuffleVT, V1);
- DCI.AddToWorklist(Res.getNode());
Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
DAG.getUNDEF(ShuffleVT),
DAG.getConstant(PermMask, DL, MVT::i8));
- DCI.AddToWorklist(Res.getNode());
return DAG.getBitcast(RootVT, Res);
}
@@ -28520,16 +29725,15 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
}
+ SDValue NewV1 = V1; // Save operand in case early exit happens.
if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
- V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
- ShuffleVT) &&
+ NewV1, DL, DAG, Subtarget, Shuffle,
+ ShuffleSrcVT, ShuffleVT) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
- Res = DAG.getBitcast(ShuffleSrcVT, V1);
- DCI.AddToWorklist(Res.getNode());
+ Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
- DCI.AddToWorklist(Res.getNode());
return DAG.getBitcast(RootVT, Res);
}
@@ -28540,43 +29744,38 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
Res = DAG.getBitcast(ShuffleVT, V1);
- DCI.AddToWorklist(Res.getNode());
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
DAG.getConstant(PermuteImm, DL, MVT::i8));
- DCI.AddToWorklist(Res.getNode());
return DAG.getBitcast(RootVT, Res);
}
}
+ SDValue NewV1 = V1; // Save operands in case early exit happens.
+ SDValue NewV2 = V2;
if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
- V1, V2, DL, DAG, Subtarget, Shuffle,
+ NewV1, NewV2, DL, DAG, Subtarget, Shuffle,
ShuffleSrcVT, ShuffleVT, UnaryShuffle) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
- V1 = DAG.getBitcast(ShuffleSrcVT, V1);
- DCI.AddToWorklist(V1.getNode());
- V2 = DAG.getBitcast(ShuffleSrcVT, V2);
- DCI.AddToWorklist(V2.getNode());
- Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
- DCI.AddToWorklist(Res.getNode());
+ NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
+ NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
+ Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
return DAG.getBitcast(RootVT, Res);
}
- if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
- AllowIntDomain, V1, V2, DL, DAG,
- Subtarget, Shuffle, ShuffleVT,
- PermuteImm) &&
+ NewV1 = V1; // Save operands in case early exit happens.
+ NewV2 = V2;
+ if (matchBinaryPermuteVectorShuffle(
+ MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
+ NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
- V1 = DAG.getBitcast(ShuffleVT, V1);
- DCI.AddToWorklist(V1.getNode());
- V2 = DAG.getBitcast(ShuffleVT, V2);
- DCI.AddToWorklist(V2.getNode());
- Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
+ NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
+ NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
+ Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
DAG.getConstant(PermuteImm, DL, MVT::i8));
- DCI.AddToWorklist(Res.getNode());
return DAG.getBitcast(RootVT, Res);
}
@@ -28592,11 +29791,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
return SDValue(); // Nothing to do!
V1 = DAG.getBitcast(IntMaskVT, V1);
- DCI.AddToWorklist(V1.getNode());
Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
DAG.getConstant(BitLen, DL, MVT::i8),
DAG.getConstant(BitIdx, DL, MVT::i8));
- DCI.AddToWorklist(Res.getNode());
return DAG.getBitcast(RootVT, Res);
}
@@ -28604,13 +29801,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
return SDValue(); // Nothing to do!
V1 = DAG.getBitcast(IntMaskVT, V1);
- DCI.AddToWorklist(V1.getNode());
V2 = DAG.getBitcast(IntMaskVT, V2);
- DCI.AddToWorklist(V2.getNode());
Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
DAG.getConstant(BitLen, DL, MVT::i8),
DAG.getConstant(BitIdx, DL, MVT::i8));
- DCI.AddToWorklist(Res.getNode());
return DAG.getBitcast(RootVT, Res);
}
}
@@ -28640,11 +29834,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
- DCI.AddToWorklist(VPermMask.getNode());
Res = DAG.getBitcast(MaskVT, V1);
- DCI.AddToWorklist(Res.getNode());
Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
- DCI.AddToWorklist(Res.getNode());
return DAG.getBitcast(RootVT, Res);
}
@@ -28667,13 +29858,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
Mask[i] = NumMaskElts + i;
SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
- DCI.AddToWorklist(VPermMask.getNode());
Res = DAG.getBitcast(MaskVT, V1);
- DCI.AddToWorklist(Res.getNode());
SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
- DCI.AddToWorklist(Zero.getNode());
Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
- DCI.AddToWorklist(Res.getNode());
return DAG.getBitcast(RootVT, Res);
}
@@ -28690,13 +29877,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
- DCI.AddToWorklist(VPermMask.getNode());
V1 = DAG.getBitcast(MaskVT, V1);
- DCI.AddToWorklist(V1.getNode());
V2 = DAG.getBitcast(MaskVT, V2);
- DCI.AddToWorklist(V2.getNode());
Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
- DCI.AddToWorklist(Res.getNode());
return DAG.getBitcast(RootVT, Res);
}
return SDValue();
@@ -28722,13 +29905,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
EltBits[i] = AllOnes;
}
SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
- DCI.AddToWorklist(BitMask.getNode());
Res = DAG.getBitcast(MaskVT, V1);
- DCI.AddToWorklist(Res.getNode());
unsigned AndOpcode =
FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
- DCI.AddToWorklist(Res.getNode());
return DAG.getBitcast(RootVT, Res);
}
@@ -28745,11 +29925,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
VPermIdx.push_back(Idx);
}
SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
- DCI.AddToWorklist(VPermMask.getNode());
Res = DAG.getBitcast(MaskVT, V1);
- DCI.AddToWorklist(Res.getNode());
Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
- DCI.AddToWorklist(Res.getNode());
return DAG.getBitcast(RootVT, Res);
}
@@ -28781,14 +29958,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
VPerm2Idx.push_back(Index);
}
V1 = DAG.getBitcast(MaskVT, V1);
- DCI.AddToWorklist(V1.getNode());
V2 = DAG.getBitcast(MaskVT, V2);
- DCI.AddToWorklist(V2.getNode());
SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
- DCI.AddToWorklist(VPerm2MaskOp.getNode());
Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
DAG.getConstant(M2ZImm, DL, MVT::i8));
- DCI.AddToWorklist(Res.getNode());
return DAG.getBitcast(RootVT, Res);
}
@@ -28820,11 +29993,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
Res = DAG.getBitcast(ByteVT, V1);
- DCI.AddToWorklist(Res.getNode());
SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
- DCI.AddToWorklist(PSHUFBMaskOp.getNode());
Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
- DCI.AddToWorklist(Res.getNode());
return DAG.getBitcast(RootVT, Res);
}
@@ -28853,13 +30023,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
MVT ByteVT = MVT::v16i8;
V1 = DAG.getBitcast(ByteVT, V1);
- DCI.AddToWorklist(V1.getNode());
V2 = DAG.getBitcast(ByteVT, V2);
- DCI.AddToWorklist(V2.getNode());
SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
- DCI.AddToWorklist(VPPERMMaskOp.getNode());
Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
- DCI.AddToWorklist(Res.getNode());
return DAG.getBitcast(RootVT, Res);
}
@@ -28870,11 +30036,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// Attempt to constant fold all of the constant source ops.
// Returns true if the entire shuffle is folded to a constant.
// TODO: Extend this to merge multiple constant Ops and update the mask.
-static SDValue combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
+static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
ArrayRef<int> Mask, SDValue Root,
bool HasVariableMask,
SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
MVT VT = Root.getSimpleValueType();
@@ -28950,11 +30115,10 @@ static SDValue combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
SDLoc DL(Root);
SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
- DCI.AddToWorklist(CstOp.getNode());
return DAG.getBitcast(VT, CstOp);
}
-/// \brief Fully generic combining of x86 shuffle instructions.
+/// Fully generic combining of x86 shuffle instructions.
///
/// This should be the last combine run over the x86 shuffle instructions. Once
/// they have been fully optimized, this will recursively consider all chains
@@ -28985,12 +30149,12 @@ static SDValue combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
/// combining in this recursive walk.
static SDValue combineX86ShufflesRecursively(
ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
- ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, int Depth,
- bool HasVariableMask, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
+ ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
+ bool HasVariableMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) {
// Bound the depth of our recursive combine because this is ultimately
// quadratic in nature.
- if (Depth > 8)
+ const unsigned MaxRecursionDepth = 8;
+ if (Depth > MaxRecursionDepth)
return SDValue();
// Directly rip through bitcasts to find the underlying operand.
@@ -29143,17 +30307,21 @@ static SDValue combineX86ShufflesRecursively(
// See if we can recurse into each shuffle source op (if it's a target
// shuffle). The source op should only be combined if it either has a
// single use (i.e. current Op) or all its users have already been combined.
- for (int i = 0, e = Ops.size(); i < e; ++i)
- if (Ops[i].getNode()->hasOneUse() ||
- SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
- if (SDValue Res = combineX86ShufflesRecursively(
- Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
- DAG, DCI, Subtarget))
- return Res;
+ // Don't recurse if we already have more source ops than we can combine in
+ // the remaining recursion depth.
+ if (Ops.size() < (MaxRecursionDepth - Depth)) {
+ for (int i = 0, e = Ops.size(); i < e; ++i)
+ if (Ops[i].getNode()->hasOneUse() ||
+ SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
+ if (SDValue Res = combineX86ShufflesRecursively(
+ Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
+ DAG, Subtarget))
+ return Res;
+ }
// Attempt to constant fold all of the constant source ops.
if (SDValue Cst = combineX86ShufflesConstants(
- Ops, Mask, Root, HasVariableMask, DAG, DCI, Subtarget))
+ Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
return Cst;
// We can only combine unary and binary shuffle mask cases.
@@ -29179,10 +30347,10 @@ static SDValue combineX86ShufflesRecursively(
// Finally, try to combine into a single shuffle instruction.
return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
- DCI, Subtarget);
+ Subtarget);
}
-/// \brief Get the PSHUF-style mask from PSHUF node.
+/// Get the PSHUF-style mask from PSHUF node.
///
/// This is a very minor wrapper around getTargetShuffleMask to easy forming v4
/// PSHUF-style masks that can be reused with such instructions.
@@ -29225,7 +30393,7 @@ static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
}
}
-/// \brief Search for a combinable shuffle across a chain ending in pshufd.
+/// Search for a combinable shuffle across a chain ending in pshufd.
///
/// We walk up the chain and look for a combinable shuffle, skipping over
/// shuffles that we could hoist this shuffle's transformation past without
@@ -29358,7 +30526,7 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
return V;
}
-/// \brief Search for a combinable shuffle across a chain ending in pshuflw or
+/// Search for a combinable shuffle across a chain ending in pshuflw or
/// pshufhw.
///
/// We walk up the chain, skipping shuffles of the other half and looking
@@ -29426,7 +30594,7 @@ static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
return true;
}
-/// \brief Try to combine x86 target specific shuffles.
+/// Try to combine x86 target specific shuffles.
static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -29459,12 +30627,33 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
}
SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
- DCI.AddToWorklist(Horiz.getNode());
return DAG.getBitcast(VT, Horiz);
}
}
switch (Opcode) {
+ case X86ISD::VBROADCAST: {
+ // If broadcasting from another shuffle, attempt to simplify it.
+ // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
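+ // Illustration: broadcasting a 64-bit lane out of a v4f32 shuffle gives
+ // Scale = 64 / 32 = 2, so only elements 0 and 1 of that shuffle are demanded
+ // and the rest are marked undef before re-running the shuffle combine on it.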
+ SDValue Src = N.getOperand(0);
+ SDValue BC = peekThroughBitcasts(Src);
+ EVT SrcVT = Src.getValueType();
+ EVT BCVT = BC.getValueType();
+ if (isTargetShuffle(BC.getOpcode()) &&
+ VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
+ unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
+ SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
+ SM_SentinelUndef);
+ for (unsigned i = 0; i != Scale; ++i)
+ DemandedMask[i] = i;
+ if (SDValue Res = combineX86ShufflesRecursively(
+ {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 1,
+ /*HasVarMask*/ false, DAG, Subtarget))
+ return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
+ DAG.getBitcast(SrcVT, Res));
+ }
+ return SDValue();
+ }
case X86ISD::PSHUFD:
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
@@ -29505,53 +30694,31 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
}
return SDValue();
}
- case X86ISD::BLENDI: {
- SDValue V0 = N->getOperand(0);
- SDValue V1 = N->getOperand(1);
- assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
- "Unexpected input vector types");
-
- // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
- // operands and changing the mask to 1. This saves us a bunch of
- // pattern-matching possibilities related to scalar math ops in SSE/AVX.
- // x86InstrInfo knows how to commute this back after instruction selection
- // if it would help register allocation.
-
- // TODO: If optimizing for size or a processor that doesn't suffer from
- // partial register update stalls, this should be transformed into a MOVSD
- // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
-
- if (VT == MVT::v2f64)
- if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
- if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
- SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
- return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
- }
-
- return SDValue();
- }
case X86ISD::MOVSD:
case X86ISD::MOVSS: {
- SDValue V0 = peekThroughBitcasts(N->getOperand(0));
- SDValue V1 = peekThroughBitcasts(N->getOperand(1));
- bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
- bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
- if (isZero0 && isZero1)
- return SDValue();
+ SDValue N0 = N.getOperand(0);
+ SDValue N1 = N.getOperand(1);
- // We often lower to MOVSD/MOVSS from integer as well as native float
- // types; remove unnecessary domain-crossing bitcasts if we can to make it
- // easier to combine shuffles later on. We've already accounted for the
- // domain switching cost when we decided to lower with it.
- bool isFloat = VT.isFloatingPoint();
- bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
- bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
- if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
- MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
- : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
- V0 = DAG.getBitcast(NewVT, V0);
- V1 = DAG.getBitcast(NewVT, V1);
- return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
+ // Canonicalize scalar FPOps:
+ // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
+ // If commutable, allow OP(N1[0], N0[0]).
+ unsigned Opcode1 = N1.getOpcode();
+ if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
+ Opcode1 == ISD::FDIV) {
+ SDValue N10 = N1.getOperand(0);
+ SDValue N11 = N1.getOperand(1);
+ if (N10 == N0 ||
+ (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
+ if (N10 != N0)
+ std::swap(N10, N11);
+ MVT SVT = VT.getVectorElementType();
+ SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
+ N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
+ N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
+ SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
+ SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
+ return DAG.getNode(Opcode, DL, VT, N0, SclVec);
+ }
}
return SDValue();
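As an illustration of the canonicalization in this hunk (not part of the patch): a minimal standalone C++ model of the MOVSS merge, assuming the usual semantics that the low lane comes from the second operand and the remaining lanes from the first. The helper name and the 4 x float width are mine.

// Standalone model of the MOVSS canonicalization above (illustrative only;
// a 4 x float vector is assumed, mirroring MVT::v4f32).
#include <array>
#include <cassert>
#include <functional>

using Vec4 = std::array<float, 4>;

// MOVSS semantics: lane 0 from the second operand, lanes 1..3 from the first.
static Vec4 movss(const Vec4 &A, const Vec4 &B) {
  return {B[0], A[1], A[2], A[3]};
}

int main() {
  Vec4 A{1.0f, 2.0f, 3.0f, 4.0f};
  Vec4 B{10.0f, 20.0f, 30.0f, 40.0f};
  auto Op = std::plus<float>(); // stands in for FADD/FSUB/FMUL/FDIV

  // Original form: the whole vector op feeds the merge.
  Vec4 FullOp{Op(A[0], B[0]), Op(A[1], B[1]), Op(A[2], B[2]), Op(A[3], B[3])};
  Vec4 Before = movss(A, FullOp);

  // Canonicalized form: only lane 0 is computed, then merged back.
  Vec4 SclVec{Op(A[0], B[0]), 0.0f, 0.0f, 0.0f}; // SCALAR_TO_VECTOR; upper lanes unused
  Vec4 After = movss(A, SclVec);

  assert(Before == After);
  return 0;
}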
@@ -29647,7 +30814,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
// Nuke no-op shuffles that show up after combining.
if (isNoopShuffleMask(Mask))
- return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
+ return N.getOperand(0);
// Look for simplifications involving one or two shuffle instructions.
SDValue V = N.getOperand(0);
@@ -29671,10 +30838,8 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
DMask[DOffset + 1] = DOffset + 0;
MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
V = DAG.getBitcast(DVT, V);
- DCI.AddToWorklist(V.getNode());
V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
- DCI.AddToWorklist(V.getNode());
return DAG.getBitcast(VT, V);
}
@@ -29705,7 +30870,6 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
// We can replace all three shuffles with an unpack.
V = DAG.getBitcast(VT, D.getOperand(0));
- DCI.AddToWorklist(V.getNode());
return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
: X86ISD::UNPCKH,
DL, VT, V, V);
@@ -29725,6 +30889,37 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
return SDValue();
}
+/// Checks whether the shuffle mask takes consecutive elements alternately
+/// from the two source vectors.
+/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
+static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
+
+ int ParitySrc[2] = {-1, -1};
+ unsigned Size = Mask.size();
+ for (unsigned i = 0; i != Size; ++i) {
+ int M = Mask[i];
+ if (M < 0)
+ continue;
+
+ // Make sure we are using the matching element from the input.
+ if ((M % Size) != i)
+ return false;
+
+ // Make sure we use the same input for all elements of the same parity.
+ int Src = M / Size;
+ if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
+ return false;
+ ParitySrc[i % 2] = Src;
+ }
+
+ // Make sure each input is used.
+ if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
+ return false;
+
+ Op0Even = ParitySrc[0] == 0;
+ return true;
+}
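For readers unfamiliar with the mask convention: the following standalone sketch (not part of the patch) restates the parity check with plain integers, using -1 for undef lanes; the two example masks are the ones quoted in the doc comment above.

// Hedged standalone sketch of the parity test performed by isAddSubOrSubAddMask.
#include <cassert>
#include <vector>

static bool isAlternatingMask(const std::vector<int> &Mask, bool &Op0Even) {
  int ParitySrc[2] = {-1, -1};
  int Size = (int)Mask.size();
  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                       // undef lane, no constraint
    if ((M % Size) != i)              // must read the matching element
      return false;
    int Src = M / Size;               // 0 = first input, 1 = second input
    if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
      return false;                   // same parity must use the same input
    ParitySrc[i % 2] = Src;
  }
  if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
    return false;                     // both inputs must be used
  Op0Even = ParitySrc[0] == 0;
  return true;
}

int main() {
  bool Op0Even;
  assert(isAlternatingMask({0, 5, 2, 7}, Op0Even) && Op0Even);
  assert(isAlternatingMask({8, 1, 10, 3, 12, 5, 14, 7}, Op0Even) && !Op0Even);
  assert(!isAlternatingMask({0, 1, 2, 3}, Op0Even)); // only one input used
  return 0;
}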
+
/// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD)
/// operation. If true is returned then the operands of ADDSUB(SUBADD) operation
/// are written to the parameters \p Opnd0 and \p Opnd1.
@@ -29735,13 +30930,13 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
/// by this operation to try to flow through the rest of the combiner
/// the fact that they're unused.
static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
- SDValue &Opnd0, SDValue &Opnd1,
- bool matchSubAdd = false) {
+ SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
+ bool &IsSubAdd) {
EVT VT = N->getValueType(0);
- if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
- (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
- (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
+ !VT.getSimpleVT().isFloatingPoint())
return false;
// We only handle target-independent shuffles.
@@ -29750,21 +30945,13 @@ static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
return false;
- ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
- SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());
-
SDValue V1 = N->getOperand(0);
SDValue V2 = N->getOperand(1);
- unsigned ExpectedOpcode = matchSubAdd ? ISD::FADD : ISD::FSUB;
- unsigned NextExpectedOpcode = matchSubAdd ? ISD::FSUB : ISD::FADD;
-
- // We require the first shuffle operand to be the ExpectedOpcode node,
- // and the second to be the NextExpectedOpcode node.
- if (V1.getOpcode() == NextExpectedOpcode && V2.getOpcode() == ExpectedOpcode) {
- ShuffleVectorSDNode::commuteMask(Mask);
- std::swap(V1, V2);
- } else if (V1.getOpcode() != ExpectedOpcode || V2.getOpcode() != NextExpectedOpcode)
+ // Make sure we have an FADD and an FSUB.
+ if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
+ (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
+ V1.getOpcode() == V2.getOpcode())
return false;
// If there are other uses of these operations we can't fold them.
@@ -29773,41 +30960,101 @@ static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
// Ensure that both operations have the same operands. Note that we can
// commute the FADD operands.
- SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
- if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
- (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
- return false;
+ SDValue LHS, RHS;
+ if (V1.getOpcode() == ISD::FSUB) {
+ LHS = V1->getOperand(0); RHS = V1->getOperand(1);
+ if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
+ (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
+ return false;
+ } else {
+ assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
+ LHS = V2->getOperand(0); RHS = V2->getOperand(1);
+ if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
+ (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
+ return false;
+ }
- // We're looking for blends between FADD and FSUB nodes. We insist on these
- // nodes being lined up in a specific expected pattern.
- if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
- isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
- isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
- isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
- 8, 25, 10, 27, 12, 29, 14, 31})))
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
+ bool Op0Even;
+ if (!isAddSubOrSubAddMask(Mask, Op0Even))
return false;
+ // It's a subadd if the vector in the even parity is an FADD.
+ IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
+ : V2->getOpcode() == ISD::FADD;
+
Opnd0 = LHS;
Opnd1 = RHS;
return true;
}
-/// \brief Try to combine a shuffle into a target-specific add-sub or
+/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
+static SDValue combineShuffleToFMAddSub(SDNode *N,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ // We only handle target-independent shuffles.
+ // FIXME: It would be easy and harmless to use the target shuffle mask
+ // extraction tool to support more.
+ if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
+ return SDValue();
+
+ MVT VT = N->getSimpleValueType(0);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
+ return SDValue();
+
+  // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ SDValue FMAdd = Op0, FMSub = Op1;
+ if (FMSub.getOpcode() != X86ISD::FMSUB)
+ std::swap(FMAdd, FMSub);
+
+ if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
+ FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
+ FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
+ FMAdd.getOperand(2) != FMSub.getOperand(2))
+ return SDValue();
+
+ // Check for correct shuffle mask.
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
+ bool Op0Even;
+ if (!isAddSubOrSubAddMask(Mask, Op0Even))
+ return SDValue();
+
+ // FMAddSub takes zeroth operand from FMSub node.
+ SDLoc DL(N);
+ bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
+ unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
+ return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
+ FMAdd.getOperand(2));
+}
+
+/// Try to combine a shuffle into a target-specific add-sub or
/// mul-add-sub node.
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
+ if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
+ return V;
+
SDValue Opnd0, Opnd1;
- if (!isAddSubOrSubAdd(N, Subtarget, Opnd0, Opnd1))
+ bool IsSubAdd;
+ if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
return SDValue();
- EVT VT = N->getValueType(0);
+ MVT VT = N->getSimpleValueType(0);
SDLoc DL(N);
// Try to generate X86ISD::FMADDSUB node here.
SDValue Opnd2;
- if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2))
- return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
+ if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
+ unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
+ return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
+ }
+
+ if (IsSubAdd)
+ return SDValue();
// Do not generate X86ISD::ADDSUB node for 512-bit types even though
// the ADDSUB idiom has been successfully recognized. There are no known
@@ -29818,26 +31065,6 @@ static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}
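To see why the matched shuffle is exactly an ADDSUB, here is a standalone numeric check (not part of the patch). It assumes ADDSUB subtracts in even lanes and adds in odd lanes, which is the convention the parity check above encodes.

// Standalone check that shuffling an FSUB and an FADD of the same operands with
// an alternating mask such as <0, 5, 2, 7> reproduces ADDSUB semantics
// (assumed here: even lanes = a - b, odd lanes = a + b).
#include <array>
#include <cassert>

using Vec4 = std::array<double, 4>;

int main() {
  Vec4 A{1, 2, 3, 4}, B{10, 20, 30, 40};
  Vec4 Sub, Add, AddSub, Shuffled;
  for (int i = 0; i < 4; ++i) {
    Sub[i] = A[i] - B[i];
    Add[i] = A[i] + B[i];
    AddSub[i] = (i % 2 == 0) ? A[i] - B[i] : A[i] + B[i];
  }
  // shufflevector(Sub, Add, <0, 5, 2, 7>): even lanes from Sub, odd from Add.
  int Mask[4] = {0, 5, 2, 7};
  for (int i = 0; i < 4; ++i)
    Shuffled[i] = Mask[i] < 4 ? Sub[Mask[i]] : Add[Mask[i] - 4];
  assert(Shuffled == AddSub);
  return 0;
}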
-/// \brief Try to combine a shuffle into a target-specific
-/// mul-sub-add node.
-static SDValue combineShuffleToFMSubAdd(SDNode *N,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
- SDValue Opnd0, Opnd1;
- if (!isAddSubOrSubAdd(N, Subtarget, Opnd0, Opnd1, true))
- return SDValue();
-
- EVT VT = N->getValueType(0);
- SDLoc DL(N);
-
- // Try to generate X86ISD::FMSUBADD node here.
- SDValue Opnd2;
- if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2))
- return DAG.getNode(X86ISD::FMSUBADD, DL, VT, Opnd0, Opnd1, Opnd2);
-
- return SDValue();
-}
-
// We are looking for a shuffle where both sources are concatenated with undef
// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
// if we can express this as a single-source shuffle, that's preferable.
@@ -29897,8 +31124,8 @@ static SDValue foldShuffleOfHorizOp(SDNode *N) {
// lanes of each operand as:
// v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
// ...similarly for v2f64 and v8i16.
- // TODO: 256-bit is not the same because...x86.
- if (HOp.getOperand(0) != HOp.getOperand(1) || HOp.getValueSizeInBits() != 128)
+ // TODO: Handle UNDEF operands.
+ if (HOp.getOperand(0) != HOp.getOperand(1))
return SDValue();
// When the operands of a horizontal math op are identical, the low half of
@@ -29909,9 +31136,17 @@ static SDValue foldShuffleOfHorizOp(SDNode *N) {
// TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
// but this should be tied to whatever horizontal op matching and shuffle
// canonicalization are producing.
- if (isTargetShuffleEquivalent(Mask, { 0, 0 }) ||
- isTargetShuffleEquivalent(Mask, { 0, 1, 0, 1 }) ||
- isTargetShuffleEquivalent(Mask, { 0, 1, 2, 3, 0, 1, 2, 3 }))
+ if (HOp.getValueSizeInBits() == 128 &&
+ (isTargetShuffleEquivalent(Mask, {0, 0}) ||
+ isTargetShuffleEquivalent(Mask, {0, 1, 0, 1}) ||
+ isTargetShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3})))
+ return HOp;
+
+ if (HOp.getValueSizeInBits() == 256 &&
+ (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}) ||
+ isTargetShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) ||
+ isTargetShuffleEquivalent(
+ Mask, {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11})))
return HOp;
return SDValue();
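A quick standalone model (not part of the patch) of why the fold is sound for the 128-bit case: a horizontal add whose operands are identical repeats its low half in its high half, so the listed masks only select values the node already produces.

// Standalone model of v4f32 HADD with identical operands (illustrative only).
#include <array>
#include <cassert>

using Vec4 = std::array<float, 4>;

// hadd(A, B) = { A0+A1, A2+A3, B0+B1, B2+B3 }
static Vec4 hadd(const Vec4 &A, const Vec4 &B) {
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
}

int main() {
  Vec4 A{1, 2, 3, 4};
  Vec4 H = hadd(A, A); // { 3, 7, 3, 7 }: low half == high half
  // So a shuffle of H with mask {0, 1, 0, 1} is just H again.
  Vec4 Shuffled{H[0], H[1], H[0], H[1]};
  assert(Shuffled == H);
  return 0;
}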
@@ -29929,9 +31164,6 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
return AddSub;
- if (SDValue FMSubAdd = combineShuffleToFMSubAdd(N, Subtarget, DAG))
- return FMSubAdd;
-
if (SDValue HAddSub = foldShuffleOfHorizOp(N))
return HAddSub;
}
@@ -30035,10 +31267,8 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
// a particular chain.
if (SDValue Res = combineX86ShufflesRecursively(
{Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
- DCI.CombineTo(N, Res);
- return SDValue();
- }
+ /*HasVarMask*/ false, DAG, Subtarget))
+ return Res;
}
return SDValue();
@@ -30155,53 +31385,6 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
SDValue N0 = BitCast.getOperand(0);
EVT VecVT = N0->getValueType(0);
- if (VT.isVector() && VecVT.isScalarInteger() && Subtarget.hasAVX512() &&
- N0->getOpcode() == ISD::OR) {
- SDValue Op0 = N0->getOperand(0);
- SDValue Op1 = N0->getOperand(1);
- MVT TrunckVT;
- MVT BitcastVT;
- switch (VT.getSimpleVT().SimpleTy) {
- default:
- return SDValue();
- case MVT::v16i1:
- TrunckVT = MVT::i8;
- BitcastVT = MVT::v8i1;
- break;
- case MVT::v32i1:
- TrunckVT = MVT::i16;
- BitcastVT = MVT::v16i1;
- break;
- case MVT::v64i1:
- TrunckVT = MVT::i32;
- BitcastVT = MVT::v32i1;
- break;
- }
- bool isArg0UndefRight = Op0->getOpcode() == ISD::SHL;
- bool isArg0UndefLeft =
- Op0->getOpcode() == ISD::ZERO_EXTEND || Op0->getOpcode() == ISD::AND;
- bool isArg1UndefRight = Op1->getOpcode() == ISD::SHL;
- bool isArg1UndefLeft =
- Op1->getOpcode() == ISD::ZERO_EXTEND || Op1->getOpcode() == ISD::AND;
- SDValue OpLeft;
- SDValue OpRight;
- if (isArg0UndefRight && isArg1UndefLeft) {
- OpLeft = Op0;
- OpRight = Op1;
- } else if (isArg1UndefRight && isArg0UndefLeft) {
- OpLeft = Op1;
- OpRight = Op0;
- } else
- return SDValue();
- SDLoc DL(BitCast);
- SDValue Shr = OpLeft->getOperand(0);
- SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, Shr);
- SDValue Bitcast1 = DAG.getBitcast(BitcastVT, Trunc1);
- SDValue Trunc2 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, OpRight);
- SDValue Bitcast2 = DAG.getBitcast(BitcastVT, Trunc2);
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Bitcast1, Bitcast2);
- }
-
if (!VT.isScalarInteger() || !VecVT.isSimple())
return SDValue();
@@ -30269,17 +31452,8 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
SDLoc DL(BitCast);
SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);
- if (SExtVT == MVT::v32i8 && !Subtarget.hasInt256()) {
- // Handle pre-AVX2 cases by splitting to two v16i1's.
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- MVT ShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), MVT::i32);
- SDValue Lo = extract128BitVector(V, 0, DAG, DL);
- SDValue Hi = extract128BitVector(V, 16, DAG, DL);
- Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
- Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
- Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
- DAG.getConstant(16, DL, ShiftTy));
- V = DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
+ if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8) {
+ V = getPMOVMSKB(DL, V, DAG, Subtarget);
return DAG.getZExtOrTrunc(V, DL, VT);
}
@@ -30296,6 +31470,153 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
return DAG.getZExtOrTrunc(V, DL, VT);
}
+// Convert a vXi1 constant build vector to the same width scalar integer.
+static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
+ EVT SrcVT = Op.getValueType();
+ assert(SrcVT.getVectorElementType() == MVT::i1 &&
+ "Expected a vXi1 vector");
+ assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
+ "Expected a constant build vector");
+
+ APInt Imm(SrcVT.getVectorNumElements(), 0);
+ for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
+ SDValue In = Op.getOperand(Idx);
+ if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
+ Imm.setBit(Idx);
+ }
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
+ return DAG.getConstant(Imm, SDLoc(Op), IntVT);
+}
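The helper above is just "one bit per element, undef counts as zero". A standalone equivalent with plain integers (names are mine, not the DAG's):

// Standalone equivalent of packing a constant vXi1 build vector into a scalar:
// bit Idx of the result is set iff element Idx of the vector is 1 (undef -> 0).
#include <cassert>
#include <cstdint>
#include <vector>

static uint64_t packMask(const std::vector<int> &Elts) { // -1 models undef
  uint64_t Imm = 0;
  for (size_t Idx = 0; Idx < Elts.size(); ++Idx)
    if (Elts[Idx] != -1 && (Elts[Idx] & 0x1))
      Imm |= uint64_t(1) << Idx;
  return Imm;
}

int main() {
  // v8i1 <1, 0, 1, 1, undef, 0, 0, 1> becomes the i8 value 0x8D.
  assert(packMask({1, 0, 1, 1, -1, 0, 0, 1}) == 0x8D);
  return 0;
}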
+
+static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
+
+ if (!DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ // Only do this if we have k-registers.
+ if (!Subtarget.hasAVX512())
+ return SDValue();
+
+ EVT DstVT = N->getValueType(0);
+ SDValue Op = N->getOperand(0);
+ EVT SrcVT = Op.getValueType();
+
+ if (!Op.hasOneUse())
+ return SDValue();
+
+ // Look for logic ops.
+ if (Op.getOpcode() != ISD::AND &&
+ Op.getOpcode() != ISD::OR &&
+ Op.getOpcode() != ISD::XOR)
+ return SDValue();
+
+ // Make sure we have a bitcast between mask registers and a scalar type.
+ if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
+ DstVT.isScalarInteger()) &&
+ !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
+ SrcVT.isScalarInteger()))
+ return SDValue();
+
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+
+ if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
+ LHS.getOperand(0).getValueType() == DstVT)
+ return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
+ DAG.getBitcast(DstVT, RHS));
+
+ if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
+ RHS.getOperand(0).getValueType() == DstVT)
+ return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
+ DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
+
+ // If the RHS is a vXi1 build vector, this is a good reason to flip too.
+ // Most of these have to move a constant from the scalar domain anyway.
+ if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
+ RHS = combinevXi1ConstantToInteger(RHS, DAG);
+ return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
+ DAG.getBitcast(DstVT, LHS), RHS);
+ }
+
+ return SDValue();
+}
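The combine is justified by the fact that bitwise logic commutes with reinterpreting the same bits in another type, so the casts can be pushed onto the operands. A standalone check of that identity on a 16-bit mask (illustrative only):

// Standalone check that a logic op commutes with the mask<->scalar bitcast,
// which is what lets the combine move the casts to the operands.
#include <bitset>
#include <cassert>
#include <cstdint>

int main() {
  // Two "v16i1" masks, viewed as their 16-bit scalar equivalents.
  std::bitset<16> A(0xA5F0), B(0x0FF3);
  uint16_t ScalarA = (uint16_t)A.to_ulong(), ScalarB = (uint16_t)B.to_ulong();

  // and(bitcast(A), bitcast(B)) == bitcast(and(A, B)) -- same for or/xor.
  assert((uint16_t)(ScalarA & ScalarB) == (uint16_t)(A & B).to_ulong());
  assert((uint16_t)(ScalarA | ScalarB) == (uint16_t)(A | B).to_ulong());
  assert((uint16_t)(ScalarA ^ ScalarB) == (uint16_t)(A ^ B).to_ulong());
  return 0;
}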
+
+static SDValue createMMXBuildVector(SDValue N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDLoc DL(N);
+ unsigned NumElts = N.getNumOperands();
+
+ auto *BV = cast<BuildVectorSDNode>(N);
+ SDValue Splat = BV->getSplatValue();
+
+ // Build MMX element from integer GPR or SSE float values.
+ auto CreateMMXElement = [&](SDValue V) {
+ if (V.isUndef())
+ return DAG.getUNDEF(MVT::x86mmx);
+ if (V.getValueType().isFloatingPoint()) {
+ if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
+ V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
+ V = DAG.getBitcast(MVT::v2i64, V);
+ return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
+ }
+ V = DAG.getBitcast(MVT::i32, V);
+ } else {
+ V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
+ }
+ return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
+ };
+
+ // Convert build vector ops to MMX data in the bottom elements.
+ SmallVector<SDValue, 8> Ops;
+
+ // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
+ if (Splat) {
+ if (Splat.isUndef())
+ return DAG.getUNDEF(MVT::x86mmx);
+
+ Splat = CreateMMXElement(Splat);
+
+ if (Subtarget.hasSSE1()) {
+ // Unpack v8i8 to splat i8 elements to lowest 16-bits.
+ if (NumElts == 8)
+ Splat = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
+ DAG.getConstant(Intrinsic::x86_mmx_punpcklbw, DL, MVT::i32), Splat,
+ Splat);
+
+ // Use PSHUFW to repeat 16-bit elements.
+ unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
+ return DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
+ DAG.getConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32), Splat,
+ DAG.getConstant(ShufMask, DL, MVT::i8));
+ }
+ Ops.append(NumElts, Splat);
+ } else {
+ for (unsigned i = 0; i != NumElts; ++i)
+ Ops.push_back(CreateMMXElement(N.getOperand(i)));
+ }
+
+ // Use tree of PUNPCKLs to build up general MMX vector.
+ while (Ops.size() > 1) {
+ unsigned NumOps = Ops.size();
+ unsigned IntrinOp =
+ (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
+ : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
+ : Intrinsic::x86_mmx_punpcklbw));
+ SDValue Intrin = DAG.getConstant(IntrinOp, DL, MVT::i32);
+ for (unsigned i = 0; i != NumOps; i += 2)
+ Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
+ Ops[i], Ops[i + 1]);
+ Ops.resize(NumOps / 2);
+ }
+
+ return Ops[0];
+}
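The closing loop builds the value with a log2(N) tree of unpack-low steps, halving the worklist each round and widening the element size as it goes. A standalone sketch of that reduction shape (strings stand in for MMX values; not part of the patch):

// Standalone sketch of the PUNPCKL reduction tree used above: elements are
// combined pairwise, halving the worklist until one value remains.
#include <cassert>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> Ops{"e0", "e1", "e2", "e3", "e4", "e5", "e6", "e7"};
  while (Ops.size() > 1) {
    size_t NumOps = Ops.size();
    // Element size grows as the tree is reduced: bytes, then words, then
    // dwords, mirroring the punpcklbw / punpcklwd / punpckldq selection above.
    const char *Op = NumOps == 2 ? "punpckldq"
                   : NumOps == 4 ? "punpcklwd" : "punpcklbw";
    for (size_t i = 0; i != NumOps; i += 2)
      Ops[i / 2] = std::string(Op) + "(" + Ops[i] + "," + Ops[i + 1] + ")";
    Ops.resize(NumOps / 2);
  }
  assert(Ops.size() == 1);
  assert(Ops[0] ==
         "punpckldq(punpcklwd(punpcklbw(e0,e1),punpcklbw(e2,e3)),"
         "punpcklwd(punpcklbw(e4,e5),punpcklbw(e6,e7)))");
  return 0;
}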
+
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -30309,42 +31630,124 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
// (i16 movmsk (16i8 sext (v16i1 x)))
// before the setcc result is scalarized on subtargets that don't have legal
// vxi1 types.
- if (DCI.isBeforeLegalize())
+ if (DCI.isBeforeLegalize()) {
if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
return V;
+
+ // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
+ // type, widen both sides to avoid a trip through memory.
+ if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
+ Subtarget.hasAVX512()) {
+ SDLoc dl(N);
+ N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
+ N0 = DAG.getBitcast(MVT::v8i1, N0);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
+ // type, widen both sides to avoid a trip through memory.
+ if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
+ Subtarget.hasAVX512()) {
+ SDLoc dl(N);
+ unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
+ SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
+ Ops[0] = N0;
+ N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
+ N0 = DAG.getBitcast(MVT::i8, N0);
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
+ }
+ }
+
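Both widening tricks are safe because only the low SrcVT/VT-many bits carry information; the detour through i8/v8i1 never disturbs them. A bit-level standalone illustration under that assumption (the i4 width is just an example of an illegal scalar type; not part of the patch):

// Bit-level illustration of the v4i1 <-> i4 widening trick: go through 8 bits,
// then keep only the low lanes/bits. Purely illustrative, not DAG code.
#include <cassert>
#include <cstdint>

int main() {
  // Scalar -> v4i1: any-extend the i4 value to i8, reinterpret as v8i1,
  // then extract the low 4 lanes. The low 4 bits are unchanged.
  uint8_t I4 = 0b1010;                 // an "illegal" i4 value
  uint8_t Widened = I4;                // ANY_EXTEND to i8 (upper bits arbitrary)
  uint8_t Low4 = Widened & 0xF;        // EXTRACT_SUBVECTOR of the low 4 lanes
  assert(Low4 == I4);

  // v4i1 -> scalar: concat with undef lanes up to v8i1, bitcast to i8, truncate.
  uint8_t Lanes = 0b0110;              // 4 defined lanes, upper 4 are undef
  uint8_t AsI8 = Lanes;                // CONCAT_VECTORS + bitcast to i8
  uint8_t Trunc = AsI8 & 0xF;          // TRUNCATE back to the narrow type
  assert(Trunc == Lanes);
  return 0;
}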
// Since MMX types are special and don't usually play with other vector types,
// it's better to handle them early to be sure we emit efficient code by
// avoiding store-load conversions.
+ if (VT == MVT::x86mmx) {
+ // Detect MMX constant vectors.
+ APInt UndefElts;
+ SmallVector<APInt, 1> EltBits;
+ if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
+ SDLoc DL(N0);
+ // Handle zero-extension of i32 with MOVD.
+ if (EltBits[0].countLeadingZeros() >= 32)
+ return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
+ DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
+ // Else, bitcast to a double.
+ // TODO - investigate supporting sext 32-bit immediates on x86_64.
+ APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
+ return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
+ }
+
+ // Detect bitcasts to x86mmx low word.
+ if (N0.getOpcode() == ISD::BUILD_VECTOR &&
+ (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
+ N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
+ bool LowUndef = true, AllUndefOrZero = true;
+ for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
+ SDValue Op = N0.getOperand(i);
+ LowUndef &= Op.isUndef() || (i >= e/2);
+ AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
+ }
+ if (AllUndefOrZero) {
+ SDValue N00 = N0.getOperand(0);
+ SDLoc dl(N00);
+ N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
+ : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
+ return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
+ }
+ }
- // Detect bitcasts between i32 to x86mmx low word.
- if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
- SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) {
- SDValue N00 = N0->getOperand(0);
- if (N00.getValueType() == MVT::i32)
- return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
+ // Detect bitcasts of 64-bit build vectors and convert to a
+ // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
+ // lowest element.
+ if (N0.getOpcode() == ISD::BUILD_VECTOR &&
+ (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
+ SrcVT == MVT::v8i8))
+ return createMMXBuildVector(N0, DAG, Subtarget);
+
+ // Detect bitcasts between element or subvector extraction to x86mmx.
+ if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+ N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
+ isNullConstant(N0.getOperand(1))) {
+ SDValue N00 = N0.getOperand(0);
+ if (N00.getValueType().is128BitVector())
+ return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
+ DAG.getBitcast(MVT::v2i64, N00));
+ }
+
+ // Detect bitcasts from FP_TO_SINT to x86mmx.
+ if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
+ SDLoc DL(N0);
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
+ DAG.getUNDEF(MVT::v2i32));
+ return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
+ DAG.getBitcast(MVT::v2i64, Res));
+ }
}
- // Detect bitcasts between element or subvector extraction to x86mmx.
- if (VT == MVT::x86mmx &&
- (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
- N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
- isNullConstant(N0.getOperand(1))) {
- SDValue N00 = N0->getOperand(0);
- if (N00.getValueType().is128BitVector())
- return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
- DAG.getBitcast(MVT::v2i64, N00));
+ // Try to remove a bitcast of constant vXi1 vector. We have to legalize
+ // most of these to scalar anyway.
+ if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
+ SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
+ ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
+ return combinevXi1ConstantToInteger(N0, DAG);
}
- // Detect bitcasts from FP_TO_SINT to x86mmx.
- if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 &&
- N0.getOpcode() == ISD::FP_TO_SINT) {
- SDLoc DL(N0);
- SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
- DAG.getUNDEF(MVT::v2i32));
- return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
- DAG.getBitcast(MVT::v2i64, Res));
+ if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
+ VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
+ isa<ConstantSDNode>(N0)) {
+ auto *C = cast<ConstantSDNode>(N0);
+ if (C->isAllOnesValue())
+ return DAG.getConstant(1, SDLoc(N0), VT);
+ if (C->isNullValue())
+ return DAG.getConstant(0, SDLoc(N0), VT);
}
+ // Try to remove bitcasts from input and output of mask arithmetic to
+ // remove GPR<->K-register crossings.
+ if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
+ return V;
+
// Convert a bitcasted integer logic operation that has one bitcasted
// floating-point operand into a floating-point logic operation. This may
// create a load of a constant, but that is cheaper than materializing the
@@ -30517,8 +31920,8 @@ static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
// to these zexts.
static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
- const SDValue &Zext1, const SDLoc &DL) {
-
+ const SDValue &Zext1, const SDLoc &DL,
+ const X86Subtarget &Subtarget) {
// Find the appropriate width for the PSADBW.
EVT InVT = Zext0.getOperand(0).getValueType();
unsigned RegSize = std::max(128u, InVT.getSizeInBits());
@@ -30533,9 +31936,15 @@ static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
Ops[0] = Zext1.getOperand(0);
SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
- // Actually build the SAD
+ // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
+ auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
+ return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
+ };
MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
- return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
+ return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
+ PSADBWBuilder);
}
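For context (not part of the patch): PSADBW reduces each 64-bit lane to the sum of absolute differences of its eight byte pairs, zero-extended to 64 bits, which is why the builder lambda can simply re-emit the node at whatever width each split piece has. A standalone scalar model of one lane:

// Standalone model of PSADBW for a single 64-bit lane: the result is the sum
// of absolute differences of the eight byte pairs (the upper bits are zero).
#include <cassert>
#include <cstdint>
#include <cstdlib>

static uint64_t psadbwLane(const uint8_t A[8], const uint8_t B[8]) {
  uint64_t Sum = 0;
  for (int i = 0; i < 8; ++i)
    Sum += (uint64_t)std::abs((int)A[i] - (int)B[i]);
  return Sum;
}

int main() {
  uint8_t A[8] = {10, 20, 30, 40, 50, 60, 70, 80};
  uint8_t B[8] = {15, 10, 30, 45, 40, 90, 0, 100};
  // 5 + 10 + 0 + 5 + 10 + 30 + 70 + 20 = 150
  assert(psadbwLane(A, B) == 150);
  return 0;
}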
// Attempt to replace an min/max v8i16/v16i8 horizontal reduction with
@@ -30702,12 +32111,12 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
return SDValue();
unsigned RegSize = 128;
- if (Subtarget.hasBWI())
+ if (Subtarget.useBWIRegs())
RegSize = 512;
- else if (Subtarget.hasAVX2())
+ else if (Subtarget.hasAVX())
RegSize = 256;
- // We handle upto v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
+  // We handle up to v16i* for SSE2 / v32i* for AVX / v64i* for AVX512.
// TODO: We should be able to handle larger vectors by splitting them before
// feeding them into several SADs, and then reducing over those.
if (RegSize / VT.getVectorNumElements() < 8)
@@ -30742,7 +32151,7 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
// Create the SAD instruction.
SDLoc DL(Extract);
- SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
+ SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
// If the original vector was wider than 8 elements, sum over the results
// in the SAD vector.
@@ -30791,6 +32200,11 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
return SDValue();
+  // Handle extract(broadcast(scalar_value)); the extract index doesn't matter.
+ if (X86ISD::VBROADCAST == Src.getOpcode() &&
+ Src.getOperand(0).getValueType() == VT)
+ return Src.getOperand(0);
+
// Resolve the target shuffle inputs and mask.
SmallVector<int, 16> Mask;
SmallVector<SDValue, 2> Ops;
@@ -30908,8 +32322,9 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
isa<ConstantSDNode>(EltIdx) &&
isa<ConstantSDNode>(InputVector.getOperand(0))) {
uint64_t ExtractedElt = N->getConstantOperandVal(1);
- uint64_t InputValue = InputVector.getConstantOperandVal(0);
- uint64_t Res = (InputValue >> ExtractedElt) & 1;
+ auto *InputC = cast<ConstantSDNode>(InputVector.getOperand(0));
+ const APInt &InputValue = InputC->getAPIntValue();
+ uint64_t Res = InputValue[ExtractedElt];
return DAG.getConstant(Res, dl, MVT::i1);
}
@@ -30927,102 +32342,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
return MinMax;
- // Only operate on vectors of 4 elements, where the alternative shuffling
- // gets to be more expensive.
- if (SrcVT != MVT::v4i32)
- return SDValue();
-
- // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
- // single use which is a sign-extend or zero-extend, and all elements are
- // used.
- SmallVector<SDNode *, 4> Uses;
- unsigned ExtractedElements = 0;
- for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
- UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
- if (UI.getUse().getResNo() != InputVector.getResNo())
- return SDValue();
-
- SDNode *Extract = *UI;
- if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
- return SDValue();
-
- if (Extract->getValueType(0) != MVT::i32)
- return SDValue();
- if (!Extract->hasOneUse())
- return SDValue();
- if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
- Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
- return SDValue();
- if (!isa<ConstantSDNode>(Extract->getOperand(1)))
- return SDValue();
-
- // Record which element was extracted.
- ExtractedElements |= 1 << Extract->getConstantOperandVal(1);
- Uses.push_back(Extract);
- }
-
- // If not all the elements were used, this may not be worthwhile.
- if (ExtractedElements != 15)
- return SDValue();
-
- // Ok, we've now decided to do the transformation.
- // If 64-bit shifts are legal, use the extract-shift sequence,
- // otherwise bounce the vector off the cache.
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- SDValue Vals[4];
-
- if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
- SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
- auto &DL = DAG.getDataLayout();
- EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
- SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
- DAG.getConstant(0, dl, VecIdxTy));
- SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
- DAG.getConstant(1, dl, VecIdxTy));
-
- SDValue ShAmt = DAG.getConstant(
- 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
- Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
- Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
- DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
- Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
- Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
- DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
- } else {
- // Store the value to a temporary stack slot.
- SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
- SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
- MachinePointerInfo());
-
- EVT ElementType = SrcVT.getVectorElementType();
- unsigned EltSize = ElementType.getSizeInBits() / 8;
-
- // Replace each use (extract) with a load of the appropriate element.
- for (unsigned i = 0; i < 4; ++i) {
- uint64_t Offset = EltSize * i;
- auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
- SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
-
- SDValue ScalarAddr =
- DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
-
- // Load the scalar.
- Vals[i] =
- DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
- }
- }
-
- // Replace the extracts
- for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
- UE = Uses.end(); UI != UE; ++UI) {
- SDNode *Extract = *UI;
-
- uint64_t IdxVal = Extract->getConstantOperandVal(1);
- DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
- }
-
- // The replacement was made in place; return N so it won't be revisited.
- return SDValue(N, 0);
+ return SDValue();
}
/// If a vector select has an operand that is -1 or 0, try to simplify the
@@ -31051,8 +32371,7 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
CondVT.getVectorElementType() == MVT::i1) {
// Invert the cond to not(cond) : xor(op,allones)=not(op)
- SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
- DAG.getAllOnesConstant(DL, CondVT));
+ SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
// Vselect cond, op1, op2 = Vselect not(cond), op2, op1
return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
}
@@ -31191,68 +32510,77 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
-// If this is a bitcasted op that can be represented as another type, push the
-// the bitcast to the inputs. This allows more opportunities for pattern
-// matching masked instructions. This is called when we know that the operation
-// is used as one of the inputs of a vselect.
-static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
- // Make sure we have a bitcast.
- if (OrigOp.getOpcode() != ISD::BITCAST)
- return false;
-
- SDValue Op = OrigOp.getOperand(0);
-
- // If the operation is used by anything other than the bitcast, we shouldn't
- // do this combine as that would replicate the operation.
- if (!Op.hasOneUse())
- return false;
+/// If this is a *dynamic* select (non-constant condition) and we can match
+/// this node with one of the variable blend instructions, restructure the
+/// condition so that blends can use the high (sign) bit of each element.
+static SDValue combineVSelectToShrunkBlend(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ SDValue Cond = N->getOperand(0);
+ if (N->getOpcode() != ISD::VSELECT ||
+ ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+ return SDValue();
+
+ // Don't optimize before the condition has been transformed to a legal type
+ // and don't ever optimize vector selects that map to AVX512 mask-registers.
+ unsigned BitWidth = Cond.getScalarValueSizeInBits();
+ if (BitWidth < 8 || BitWidth > 64)
+ return SDValue();
+
+ // We can only handle the cases where VSELECT is directly legal on the
+ // subtarget. We custom lower VSELECT nodes with constant conditions and
+ // this makes it hard to see whether a dynamic VSELECT will correctly
+ // lower, so we both check the operation's status and explicitly handle the
+ // cases where a *dynamic* blend will fail even though a constant-condition
+ // blend could be custom lowered.
+ // FIXME: We should find a better way to handle this class of problems.
+ // Potentially, we should combine constant-condition vselect nodes
+ // pre-legalization into shuffles and not mark as many types as custom
+ // lowered.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT VT = N->getValueType(0);
+ if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
+ return SDValue();
+ // FIXME: We don't support i16-element blends currently. We could and
+ // should support them by making *all* the bits in the condition be set
+ // rather than just the high bit and using an i8-element blend.
+ if (VT.getVectorElementType() == MVT::i16)
+ return SDValue();
+ // Dynamic blending was only available from SSE4.1 onward.
+ if (VT.is128BitVector() && !Subtarget.hasSSE41())
+ return SDValue();
+ // Byte blends are only available in AVX2
+ if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
+ return SDValue();
+ // There are no 512-bit blend instructions that use sign bits.
+ if (VT.is512BitVector())
+ return SDValue();
- MVT VT = OrigOp.getSimpleValueType();
- MVT EltVT = VT.getVectorElementType();
- SDLoc DL(Op.getNode());
+ // TODO: Add other opcodes eventually lowered into BLEND.
+ for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
+ UI != UE; ++UI)
+ if (UI->getOpcode() != ISD::VSELECT || UI.getOperandNo() != 0)
+ return SDValue();
- auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1,
- SDValue Op2) {
- Op0 = DAG.getBitcast(VT, Op0);
- DCI.AddToWorklist(Op0.getNode());
- Op1 = DAG.getBitcast(VT, Op1);
- DCI.AddToWorklist(Op1.getNode());
- DCI.CombineTo(OrigOp.getNode(),
- DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2));
- return true;
- };
+ APInt DemandedMask(APInt::getSignMask(BitWidth));
+ KnownBits Known;
+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+ if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true))
+ return SDValue();
- unsigned Opcode = Op.getOpcode();
- switch (Opcode) {
- case X86ISD::SHUF128: {
- if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
- return false;
- // Only change element size, not type.
- if (VT.isInteger() != Op.getSimpleValueType().isInteger())
- return false;
- return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
- Op.getOperand(2));
- }
- case X86ISD::SUBV_BROADCAST: {
- unsigned EltSize = EltVT.getSizeInBits();
- if (EltSize != 32 && EltSize != 64)
- return false;
- // Only change element size, not type.
- if (VT.isInteger() != Op.getSimpleValueType().isInteger())
- return false;
- SDValue Op0 = Op.getOperand(0);
- MVT Op0VT = MVT::getVectorVT(EltVT,
- Op0.getSimpleValueType().getSizeInBits() / EltSize);
- Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0));
- DCI.AddToWorklist(Op0.getNode());
- DCI.CombineTo(OrigOp.getNode(),
- DAG.getNode(Opcode, DL, VT, Op0));
- return true;
+ // If we changed the computation somewhere in the DAG, this change will
+ // affect all users of Cond. Update all the nodes so that we do not use
+ // the generic VSELECT anymore. Otherwise, we may perform wrong
+ // optimizations as we messed with the actual expectation for the vector
+ // boolean values.
+ for (SDNode *U : Cond->uses()) {
+ SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U), U->getValueType(0),
+ Cond, U->getOperand(1), U->getOperand(2));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
}
- }
-
- return false;
+ DCI.CommitTargetLoweringOpt(TLO);
+ return SDValue(N, 0);
}
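The reason only the sign mask is demanded above is that the variable blend instructions this lowers to are assumed to test just the most significant bit of each condition element. A standalone model of that selection rule (illustrative, not DAG code):

// Standalone model of a sign-bit-driven blend (what BLENDVPS/PBLENDVB do):
// lane i takes the second operand iff the condition lane's sign bit is set.
#include <array>
#include <cassert>
#include <cstdint>

int main() {
  std::array<int32_t, 4> Cond{-1, 0x7FFFFFFF, INT32_MIN, 1};
  std::array<int32_t, 4> A{10, 20, 30, 40}, B{100, 200, 300, 400}, R{};
  for (int i = 0; i < 4; ++i)
    R[i] = (Cond[i] < 0) ? B[i] : A[i];   // only the sign bit matters
  assert((R == std::array<int32_t, 4>{100, 20, 300, 40}));
  return 0;
}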
/// Do target-specific dag combines on SELECT and VSELECT nodes.
@@ -31268,6 +32596,23 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
EVT CondVT = Cond.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ // Convert vselects with constant condition into shuffles.
+ if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
+ DCI.isBeforeLegalizeOps()) {
+ SmallVector<int, 64> Mask(VT.getVectorNumElements(), -1);
+ for (int i = 0, Size = Mask.size(); i != Size; ++i) {
+ SDValue CondElt = Cond->getOperand(i);
+ Mask[i] = i;
+ // Arbitrarily choose from the 2nd operand if the select condition element
+ // is undef.
+ // TODO: Can we do better by matching patterns such as even/odd?
+ if (CondElt.isUndef() || isNullConstant(CondElt))
+ Mask[i] += Size;
+ }
+
+ return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
+ }
+
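The constant-condition conversion above is a direct re-encoding: lane i keeps index i when the condition element is true and becomes i + NumElts otherwise (undef arbitrarily picks the second operand). A standalone sketch (not part of the patch):

// Standalone sketch of turning a constant-condition vselect into a shuffle:
// mask[i] = i when Cond[i] is true, i + Size otherwise (undef -> second op).
#include <cassert>
#include <vector>

int main() {
  std::vector<int> Cond{1, 0, -1 /*undef*/, 1};        // constant v4i1 condition
  std::vector<int> LHS{10, 11, 12, 13}, RHS{20, 21, 22, 23};
  int Size = (int)Cond.size();

  std::vector<int> Mask(Size);
  for (int i = 0; i < Size; ++i)
    Mask[i] = (Cond[i] == 1) ? i : i + Size;

  std::vector<int> Shuffled(Size), Selected(Size);
  for (int i = 0; i < Size; ++i) {
    Shuffled[i] = Mask[i] < Size ? LHS[Mask[i]] : RHS[Mask[i] - Size];
    Selected[i] = (Cond[i] == 1) ? LHS[i] : RHS[i];    // undef may pick either
  }
  assert(Shuffled == Selected);
  return 0;
}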
// If we have SSE[12] support, try to form min/max nodes. SSE min/max
// instructions match the semantics of the common C idiom x<y?x:y but not
// x<=y?x:y, because of how they handle negative zero (which can be
@@ -31292,7 +32637,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// and negative zero incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
if (!DAG.getTarget().Options.UnsafeFPMath &&
- !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
+ !(DAG.isKnownNeverZeroFloat(LHS) ||
+ DAG.isKnownNeverZeroFloat(RHS)))
break;
std::swap(LHS, RHS);
}
@@ -31302,7 +32648,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// Converting this to a min would handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.getTarget().Options.UnsafeFPMath &&
- !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
+ !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
break;
Opcode = X86ISD::FMIN;
break;
@@ -31321,7 +32667,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// Converting this to a max would handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.getTarget().Options.UnsafeFPMath &&
- !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
+ !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
break;
Opcode = X86ISD::FMAX;
break;
@@ -31331,7 +32677,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// and negative zero incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
if (!DAG.getTarget().Options.UnsafeFPMath &&
- !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
+ !(DAG.isKnownNeverZeroFloat(LHS) ||
+ DAG.isKnownNeverZeroFloat(RHS)))
break;
std::swap(LHS, RHS);
}
@@ -31358,7 +32705,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// and negative zero incorrectly, and swapping the operands would
// cause it to handle NaNs incorrectly.
if (!DAG.getTarget().Options.UnsafeFPMath &&
- !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
+ !(DAG.isKnownNeverZeroFloat(LHS) ||
+ DAG.isKnownNeverZeroFloat(RHS))) {
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
break;
std::swap(LHS, RHS);
@@ -31394,7 +32742,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// and negative zero incorrectly, and swapping the operands would
// cause it to handle NaNs incorrectly.
if (!DAG.getTarget().Options.UnsafeFPMath &&
- !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
+ !DAG.isKnownNeverZeroFloat(LHS) &&
+ !DAG.isKnownNeverZeroFloat(RHS)) {
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
break;
std::swap(LHS, RHS);
@@ -31418,19 +32767,38 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
}
+ // Some mask scalar intrinsics rely on checking if only one bit is set
+ // and implement it in C code like this:
+ // A[0] = (U & 1) ? A[0] : W[0];
+ // This creates some redundant instructions that break pattern matching.
+  // fold (select (setcc (and X, 1), 0, seteq), Y, Z) -> (select (and X, 1), Z, Y)
+ if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
+ Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+ SDValue AndNode = Cond.getOperand(0);
+ if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
+ isNullConstant(Cond.getOperand(1)) &&
+ isOneConstant(AndNode.getOperand(1))) {
+      // LHS and RHS are swapped because the setcc outputs 1 when the AND
+      // result is 0, and vice versa.
+ AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
+ return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
+ }
+ }
+
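The fold relies on a small boolean identity: comparing the AND against zero and selecting is the same as selecting on the AND directly with the arms swapped. A standalone check (not part of the patch):

// Standalone check of the identity behind the fold above:
// ((X & 1) == 0 ? Y : Z) is the same as ((X & 1) ? Z : Y).
#include <cassert>
#include <cstdint>

static double before(uint8_t X, double Y, double Z) { return ((X & 1) == 0) ? Y : Z; }
static double after(uint8_t X, double Y, double Z)  { return (X & 1) ? Z : Y; }

int main() {
  for (uint8_t X = 0; X < 4; ++X)
    assert(before(X, 1.5, 2.5) == after(X, 1.5, 2.5));
  return 0;
}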
// v16i8 (select v16i1, v16i8, v16i8) does not have a proper
// lowering on KNL. In this case we convert it to
// v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
- // The same situation for all 128 and 256-bit vectors of i8 and i16.
+  // The same applies to all vectors of i8 and i16 without BWI.
+ // Make sure we extend these even before type legalization gets a chance to
+ // split wide vectors.
// Since SKX these selects have a proper lowering.
- if (Subtarget.hasAVX512() && CondVT.isVector() &&
+ if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
CondVT.getVectorElementType() == MVT::i1 &&
- (VT.is128BitVector() || VT.is256BitVector()) &&
+ VT.getVectorNumElements() > 4 &&
(VT.getVectorElementType() == MVT::i8 ||
- VT.getVectorElementType() == MVT::i16) &&
- !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
+ VT.getVectorElementType() == MVT::i16)) {
Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
- DCI.AddToWorklist(Cond.getNode());
return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
}
@@ -31476,7 +32844,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
// psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
- (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
+ (Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
// Check if one of the arms of the VSELECT is a zero vector. If it's on the
@@ -31494,40 +32862,50 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
SDValue CondRHS = Cond->getOperand(1);
+ auto SUBUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(X86ISD::SUBUS, DL, Ops[0].getValueType(), Ops);
+ };
+
// Look for a general sub with unsigned saturation first.
// x >= y ? x-y : 0 --> subus x, y
// x > y ? x-y : 0 --> subus x, y
if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
- return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
+ return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
+ SUBUSBuilder);
if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
- if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
- if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
- if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
- // If the RHS is a constant we have to reverse the const
- // canonicalization.
- // x > C-1 ? x+-C : 0 --> subus x, C
- if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
- CondRHSConst->getAPIntValue() ==
- (-OpRHSConst->getAPIntValue() - 1))
- return DAG.getNode(
- X86ISD::SUBUS, DL, VT, OpLHS,
- DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
+ if (isa<BuildVectorSDNode>(CondRHS)) {
+ // If the RHS is a constant we have to reverse the const
+ // canonicalization.
+ // x > C-1 ? x+-C : 0 --> subus x, C
+ auto MatchSUBUS = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
+ return Cond->getAPIntValue() == (-Op->getAPIntValue() - 1);
+ };
+ if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
+ ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchSUBUS)) {
+ OpRHS = DAG.getNode(ISD::SUB, DL, VT,
+ DAG.getConstant(0, DL, VT), OpRHS);
+ return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
+ SUBUSBuilder);
+ }
// Another special case: If C was a sign bit, the sub has been
// canonicalized into a xor.
// FIXME: Would it be better to use computeKnownBits to determine
// whether it's safe to decanonicalize the xor?
// x s< 0 ? x^C : 0 --> subus x, C
- if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
- ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
- OpRHSConst->getAPIntValue().isSignMask())
- // Note that we have to rebuild the RHS constant here to ensure we
- // don't rely on particular values of undef lanes.
- return DAG.getNode(
- X86ISD::SUBUS, DL, VT, OpLHS,
- DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
+ if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode())
+ if (CC == ISD::SETLT && Other.getOpcode() == ISD::XOR &&
+ ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
+ OpRHSConst->getAPIntValue().isSignMask()) {
+ OpRHS = DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT);
+ // Note that we have to rebuild the RHS constant here to ensure we
+ // don't rely on particular values of undef lanes.
+ return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
+ SUBUSBuilder);
+ }
}
}
}
@@ -31535,99 +32913,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
return V;
- // If this is a *dynamic* select (non-constant condition) and we can match
- // this node with one of the variable blend instructions, restructure the
- // condition so that blends can use the high (sign) bit of each element and
- // use SimplifyDemandedBits to simplify the condition operand.
- if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
- !DCI.isBeforeLegalize() &&
- !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
- unsigned BitWidth = Cond.getScalarValueSizeInBits();
-
- // Don't optimize vector selects that map to mask-registers.
- if (BitWidth == 1)
- return SDValue();
-
- // We can only handle the cases where VSELECT is directly legal on the
- // subtarget. We custom lower VSELECT nodes with constant conditions and
- // this makes it hard to see whether a dynamic VSELECT will correctly
- // lower, so we both check the operation's status and explicitly handle the
- // cases where a *dynamic* blend will fail even though a constant-condition
- // blend could be custom lowered.
- // FIXME: We should find a better way to handle this class of problems.
- // Potentially, we should combine constant-condition vselect nodes
- // pre-legalization into shuffles and not mark as many types as custom
- // lowered.
- if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
- return SDValue();
- // FIXME: We don't support i16-element blends currently. We could and
- // should support them by making *all* the bits in the condition be set
- // rather than just the high bit and using an i8-element blend.
- if (VT.getVectorElementType() == MVT::i16)
- return SDValue();
- // Dynamic blending was only available from SSE4.1 onward.
- if (VT.is128BitVector() && !Subtarget.hasSSE41())
- return SDValue();
- // Byte blends are only available in AVX2
- if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
- return SDValue();
- // There are no 512-bit blend instructions that use sign bits.
- if (VT.is512BitVector())
- return SDValue();
-
- assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
- APInt DemandedMask(APInt::getSignMask(BitWidth));
- KnownBits Known;
- TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
- !DCI.isBeforeLegalizeOps());
- if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) ||
- TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) {
- // If we changed the computation somewhere in the DAG, this change will
- // affect all users of Cond. Make sure it is fine and update all the nodes
- // so that we do not use the generic VSELECT anymore. Otherwise, we may
- // perform wrong optimizations as we messed with the actual expectation
- // for the vector boolean values.
- if (Cond != TLO.Old) {
- // Check all uses of the condition operand to check whether it will be
- // consumed by non-BLEND instructions. Those may require that all bits
- // are set properly.
- for (SDNode *U : Cond->uses()) {
- // TODO: Add other opcodes eventually lowered into BLEND.
- if (U->getOpcode() != ISD::VSELECT)
- return SDValue();
- }
-
- // Update all users of the condition before committing the change, so
- // that the VSELECT optimizations that expect the correct vector boolean
- // value will not be triggered.
- for (SDNode *U : Cond->uses()) {
- SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
- U->getValueType(0), Cond, U->getOperand(1),
- U->getOperand(2));
- DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
- }
- DCI.CommitTargetLoweringOpt(TLO);
- return SDValue();
- }
- // Only Cond (rather than other nodes in the computation chain) was
- // changed. Change the condition just for N to keep the opportunity to
- // optimize all other users their own way.
- SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
- return SDValue();
- }
- }
-
- // Look for vselects with LHS/RHS being bitcasted from an operation that
- // can be executed on another type. Push the bitcast to the inputs of
- // the operation. This exposes opportunities for using masking instructions.
- if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() &&
- CondVT.getVectorElementType() == MVT::i1) {
- if (combineBitcastForMaskedOp(LHS, DAG, DCI))
- return SDValue(N, 0);
- if (combineBitcastForMaskedOp(RHS, DAG, DCI))
- return SDValue(N, 0);
- }
+ if (SDValue V = combineVSelectToShrunkBlend(N, DAG, DCI, Subtarget))
+ return V;
// Custom action for SELECT MMX
if (VT == MVT::x86mmx) {
@@ -31969,17 +33256,6 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
SDValue Cond = N->getOperand(3);
- if (CC == X86::COND_E || CC == X86::COND_NE) {
- switch (Cond.getOpcode()) {
- default: break;
- case X86ISD::BSR:
- case X86ISD::BSF:
- // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
- if (DAG.isKnownNeverZero(Cond.getOperand(0)))
- return (CC == X86::COND_E) ? FalseOp : TrueOp;
- }
- }
-
// Try to simplify the EFLAGS and condition code operands.
// We can't always do this as FCMOV only supports a subset of X86 cond.
if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
@@ -32149,6 +33425,36 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
}
}
+ // Handle (CMOV C-1, (ADD (CTTZ X), C), (X != 0)) ->
+ // (ADD (CMOV (CTTZ X), -1, (X != 0)), C) or
+ // (CMOV (ADD (CTTZ X), C), C-1, (X == 0)) ->
+ // (ADD (CMOV C-1, (CTTZ X), (X == 0)), C)
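+  // i.e. select(X != 0, cttz(X) + C, C - 1) becomes select(X != 0, cttz(X), -1) + C.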
+ if (CC == X86::COND_NE || CC == X86::COND_E) {
+ auto *Cnst = CC == X86::COND_E ? dyn_cast<ConstantSDNode>(TrueOp)
+ : dyn_cast<ConstantSDNode>(FalseOp);
+ SDValue Add = CC == X86::COND_E ? FalseOp : TrueOp;
+
+ if (Cnst && Add.getOpcode() == ISD::ADD && Add.hasOneUse()) {
+ auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
+ SDValue AddOp2 = Add.getOperand(0);
+ if (AddOp1 && (AddOp2.getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
+ AddOp2.getOpcode() == ISD::CTTZ)) {
+ APInt Diff = Cnst->getAPIntValue() - AddOp1->getAPIntValue();
+ if (CC == X86::COND_E) {
+ Add = DAG.getNode(X86ISD::CMOV, DL, Add.getValueType(), AddOp2,
+ DAG.getConstant(Diff, DL, Add.getValueType()),
+ DAG.getConstant(CC, DL, MVT::i8), Cond);
+ } else {
+ Add = DAG.getNode(X86ISD::CMOV, DL, Add.getValueType(),
+ DAG.getConstant(Diff, DL, Add.getValueType()),
+ AddOp2, DAG.getConstant(CC, DL, MVT::i8), Cond);
+ }
+ return DAG.getNode(X86ISD::ADD, DL, Add.getValueType(), Add,
+ SDValue(AddOp1, 0));
+ }
+ }
+ }
+
return SDValue();
}
@@ -32276,13 +33582,6 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
if ((NumElts % 2) != 0)
return SDValue();
- // If the upper 17 bits of each element are zero then we can use PMADD.
- APInt Mask17 = APInt::getHighBitsSet(32, 17);
- if (VT == MVT::v4i32 && DAG.MaskedValueIsZero(N0, Mask17) &&
- DAG.MaskedValueIsZero(N1, Mask17))
- return DAG.getNode(X86ISD::VPMADDWD, DL, VT, DAG.getBitcast(MVT::v8i16, N0),
- DAG.getBitcast(MVT::v8i16, N1));
-
unsigned RegSize = 128;
MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
@@ -32378,7 +33677,7 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
}
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
- EVT VT, SDLoc DL) {
+ EVT VT, const SDLoc &DL) {
auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
@@ -32390,10 +33689,11 @@ static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
return Result;
};
- auto combineMulMulAddOrSub = [&](bool isAdd) {
+ auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
- DAG.getConstant(9, DL, VT));
- Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT));
+ DAG.getConstant(Mul1, DL, VT));
+ Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
+ DAG.getConstant(Mul2, DL, VT));
Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
N->getOperand(0));
return Result;
@@ -32408,43 +33708,137 @@ static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
case 21:
// mul x, 21 => add ((shl (mul x, 5), 2), x)
return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
+ case 41:
+ // mul x, 41 => add ((shl (mul x, 5), 3), x)
+ return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
case 22:
// mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
case 19:
- // mul x, 19 => sub ((shl (mul x, 5), 2), x)
- return combineMulShlAddOrSub(5, 2, /*isAdd*/ false);
+ // mul x, 19 => add ((shl (mul x, 9), 1), x)
+ return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
+ case 37:
+ // mul x, 37 => add ((shl (mul x, 9), 2), x)
+ return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
+ case 73:
+ // mul x, 73 => add ((shl (mul x, 9), 3), x)
+ return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
case 13:
// mul x, 13 => add ((shl (mul x, 3), 2), x)
return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
case 23:
- // mul x, 13 => sub ((shl (mul x, 3), 3), x)
+ // mul x, 23 => sub ((shl (mul x, 3), 3), x)
return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
- case 14:
- // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x)
- return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
- combineMulShlAddOrSub(3, 2, /*isAdd*/ true));
case 26:
- // mul x, 26 => sub ((mul (mul x, 9), 3), x)
- return combineMulMulAddOrSub(/*isAdd*/ false);
+ // mul x, 26 => add ((mul (mul x, 5), 5), x)
+ return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
case 28:
// mul x, 28 => add ((mul (mul x, 9), 3), x)
- return combineMulMulAddOrSub(/*isAdd*/ true);
+ return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
case 29:
// mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
- combineMulMulAddOrSub(/*isAdd*/ true));
- case 30:
- // mul x, 30 => sub (sub ((shl x, 5), x), x)
- return DAG.getNode(
- ISD::SUB, DL, VT,
- DAG.getNode(ISD::SUB, DL, VT,
- DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
- DAG.getConstant(5, DL, MVT::i8)),
- N->getOperand(0)),
- N->getOperand(0));
+ combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
+ }
+
+  // Another trick. If this is a power of 2 + 2/4/8, we can use a shift followed
+  // by a single LEA.
+  // First check if this is a sum of two powers of 2 because that's easy. Then
+  // count the number of trailing zeros up to the first set bit.
+ // TODO: We can do this even without LEA at a cost of two shifts and an add.
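+  // e.g. mul x, 10 (0b1010): ScaleShift = 1, ShiftAmt = 3, giving (shl x, 3) + (shl x, 1).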
+ if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
+ unsigned ScaleShift = countTrailingZeros(MulAmt);
+ if (ScaleShift >= 1 && ScaleShift < 4) {
+ unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
+ SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(ShiftAmt, DL, MVT::i8));
+ SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(ScaleShift, DL, MVT::i8));
+ return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
+ }
+ }
+
+ return SDValue();
+}
+
+// If the upper 17 bits of each element are zero then we can use PMADDWD,
+// which is always at least as quick as PMULLD, except on KNL.
+static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
+ if (Subtarget.getProcFamily() == X86Subtarget::IntelKNL)
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+
+ // Only support vXi32 vectors.
+ if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
+ return SDValue();
+
+ // Make sure the vXi16 type is legal. This covers the AVX512 without BWI case.
+ MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(WVT))
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
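+  // With the top 17 bits clear, each element is a non-negative value below 2^15,
+  // so the odd i16 lanes of the bitcast inputs are zero and PMADDWD's signed
+  // 16x16 multiply-adds reproduce the full 32-bit products.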
+ APInt Mask17 = APInt::getHighBitsSet(32, 17);
+ if (!DAG.MaskedValueIsZero(N1, Mask17) ||
+ !DAG.MaskedValueIsZero(N0, Mask17))
+ return SDValue();
+
+ // Use SplitOpsAndApply to handle AVX splitting.
+ auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
+ return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
+ };
+ return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
+ { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
+ PMADDWDBuilder);
+}
+
+static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+
+ // Only support vXi64 vectors.
+ if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
+ !DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+  // PMULDQ returns the 64-bit result of the signed multiplication of the lower
+  // 32 bits. We can lower with this if the sign bits stretch that far.
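+  // i.e. both operands are sign extensions of at most 32-bit values, so
+  // multiplying just the low 32 bits with PMULDQ yields the exact 64-bit product.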
+ if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
+ DAG.ComputeNumSignBits(N1) > 32) {
+ auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
+ };
+ return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
+ PMULDQBuilder, /*CheckBWI*/false);
}
+
+ // If the upper bits are zero we can use a single pmuludq.
+ APInt Mask = APInt::getHighBitsSet(64, 32);
+ if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
+ auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
+ };
+ return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
+ PMULUDQBuilder, /*CheckBWI*/false);
+ }
+
return SDValue();
}
@@ -32454,6 +33848,13 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
+
+ if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
+ return V;
+
+ if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
+ return V;
+
if (DCI.isBeforeLegalize() && VT.isVector())
return reduceVMULWidth(N, DAG, Subtarget);
@@ -32473,9 +33874,14 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
if (!C)
return SDValue();
uint64_t MulAmt = C->getZExtValue();
- if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
+ if (isPowerOf2_64(MulAmt))
return SDValue();
+ SDLoc DL(N);
+ if (MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
+ return DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
+ N->getOperand(1));
+
uint64_t MulAmt1 = 0;
uint64_t MulAmt2 = 0;
if ((MulAmt % 9) == 0) {
@@ -32489,7 +33895,6 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
MulAmt2 = MulAmt / 3;
}
- SDLoc DL(N);
SDValue NewMul;
if (MulAmt2 &&
(isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
@@ -32523,39 +33928,47 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
"Both cases that could cause potential overflows should have "
"already been handled.");
int64_t SignMulAmt = C->getSExtValue();
- if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) &&
- (SignMulAmt != -INT64_MAX)) {
- int NumSign = SignMulAmt > 0 ? 1 : -1;
- bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);
- bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);
- if (IsPowerOf2_64PlusOne) {
- // (mul x, 2^N + 1) => (add (shl x, N), x)
- NewMul = DAG.getNode(
- ISD::ADD, DL, VT, N->getOperand(0),
- DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
- DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,
- MVT::i8)));
- } else if (IsPowerOf2_64MinusOne) {
- // (mul x, 2^N - 1) => (sub (shl x, N), x)
- NewMul = DAG.getNode(
- ISD::SUB, DL, VT,
- DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
- DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,
- MVT::i8)),
- N->getOperand(0));
- }
+ assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
+ uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
+ if (isPowerOf2_64(AbsMulAmt - 1)) {
+ // (mul x, 2^N + 1) => (add (shl x, N), x)
+ NewMul = DAG.getNode(
+ ISD::ADD, DL, VT, N->getOperand(0),
+ DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
+ MVT::i8)));
// To negate, subtract the number from zero
- if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1)
- NewMul =
- DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
+ if (SignMulAmt < 0)
+ NewMul = DAG.getNode(ISD::SUB, DL, VT,
+ DAG.getConstant(0, DL, VT), NewMul);
+ } else if (isPowerOf2_64(AbsMulAmt + 1)) {
+ // (mul x, 2^N - 1) => (sub (shl x, N), x)
+ NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(AbsMulAmt + 1),
+ DL, MVT::i8));
+ // To negate, reverse the operands of the subtract.
+ if (SignMulAmt < 0)
+ NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
+ else
+ NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
+ } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
+ // (mul x, 2^N + 2) => (add (add (shl x, N), x), x)
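+      // e.g. mul x, 34 => add (add (shl x, 5), x), x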
+ NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(AbsMulAmt - 2),
+ DL, MVT::i8));
+ NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
+ NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
+ } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
+ // (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)
+ NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(AbsMulAmt + 2),
+ DL, MVT::i8));
+ NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
+ NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
}
}
- if (NewMul)
- // Do not add new nodes to DAG combiner worklist.
- DCI.CombineTo(N, NewMul, false);
-
- return SDValue();
+ return NewMul;
}
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
@@ -32670,11 +34083,17 @@ static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
-static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
+ // Only do this on the last DAG combine as it can interfere with other
+ // combines.
+ if (!DCI.isAfterLegalizeDAG())
+ return SDValue();
+
// Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
// TODO: This is a generic DAG combine that became an x86-only combine to
// avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
@@ -32691,6 +34110,14 @@ static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG) {
// transform should reduce code size. It may also enable secondary transforms
// from improved known-bits analysis or instruction selection.
APInt MaskVal = AndC->getAPIntValue();
+
+ // If this can be matched by a zero extend, don't optimize.
+ if (MaskVal.isMask()) {
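+    // e.g. masks like 0xFF, 0xFFFF or 0xFFFFFFFF are already matched by a
+    // movzx-style zero extend, so keep the original and+srl order.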
+ unsigned TO = MaskVal.countTrailingOnes();
+ if (TO >= 8 && isPowerOf2_32(TO))
+ return SDValue();
+ }
+
APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
unsigned OldMaskSize = MaskVal.getMinSignedBits();
unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
@@ -32717,7 +34144,7 @@ static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
return V;
if (N->getOpcode() == ISD::SRL)
- if (SDValue V = combineShiftRightLogical(N, DAG))
+ if (SDValue V = combineShiftRightLogical(N, DAG, DCI))
return V;
return SDValue();
@@ -32797,12 +34224,10 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
// Attempt to combine as shuffle.
SDValue Op(N, 0);
- if (SDValue Res = combineX86ShufflesRecursively(
- {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
- DCI.CombineTo(N, Res);
- return SDValue();
- }
+ if (SDValue Res =
+ combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
+ /*HasVarMask*/ false, DAG, Subtarget))
+ return Res;
return SDValue();
}
@@ -32861,10 +34286,8 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(
{Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
- DCI.CombineTo(N, Res);
- return SDValue();
- }
+ /*HasVarMask*/ false, DAG, Subtarget))
+ return Res;
}
// Constant Folding.
@@ -32900,12 +34323,10 @@ static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
// Attempt to combine PINSRB/PINSRW patterns to a shuffle.
SDValue Op(N, 0);
- if (SDValue Res = combineX86ShufflesRecursively(
- {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
- DCI.CombineTo(N, Res);
- return SDValue();
- }
+ if (SDValue Res =
+ combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
+ /*HasVarMask*/ false, DAG, Subtarget))
+ return Res;
return SDValue();
}
@@ -32973,9 +34394,13 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
SDValue FSetCC =
DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
DAG.getConstant(x86cc, DL, MVT::i8));
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
- N->getSimpleValueType(0), FSetCC,
- DAG.getIntPtrConstant(0, DL));
+ // Need to fill with zeros to ensure the bitcast will produce zeroes
+ // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
+ SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
+ DAG.getConstant(0, DL, MVT::v16i1),
+ FSetCC, DAG.getIntPtrConstant(0, DL));
+ return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
+ N->getSimpleValueType(0));
}
SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
CMP00.getValueType(), CMP00, CMP01,
@@ -33012,25 +34437,40 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Try to match (and (xor X, -1), Y) logic pattern for (andnp X, Y) combines.
+static bool matchANDXORWithAllOnesAsANDNP(SDNode *N, SDValue &X, SDValue &Y) {
+ if (N->getOpcode() != ISD::AND)
+ return false;
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ if (N0.getOpcode() == ISD::XOR &&
+ ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) {
+ X = N0.getOperand(0);
+ Y = N1;
+ return true;
+ }
+ if (N1.getOpcode() == ISD::XOR &&
+ ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) {
+ X = N1.getOperand(0);
+ Y = N0;
+ return true;
+ }
+
+ return false;
+}
+
/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::AND);
EVT VT = N->getValueType(0);
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
- SDLoc DL(N);
-
if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
return SDValue();
- if (N0.getOpcode() == ISD::XOR &&
- ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
- return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
-
- if (N1.getOpcode() == ISD::XOR &&
- ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
- return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
+ SDValue X, Y;
+ if (matchANDXORWithAllOnesAsANDNP(N, X, Y))
+ return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
return SDValue();
}
@@ -33042,8 +34482,7 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
// Even with AVX-512 this is still useful for removing casts around logical
// operations on vXi1 mask types.
static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
+ const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
assert(VT.isVector() && "Expected vector type");
@@ -33214,7 +34653,7 @@ static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
// It's equivalent to performing bzhi (zero high bits) on the input, with the
// same index of the load.
static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+ const X86Subtarget &Subtarget) {
MVT VT = Node->getSimpleValueType(0);
SDLoc dl(Node);
@@ -33269,15 +34708,16 @@ static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
// <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
// that will be replaced with one bzhi instruction.
SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
- SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, VT);
+ SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
// Get the Node which indexes into the array.
SDValue Index = getIndexFromUnindexedLoad(Ld);
if (!Index)
return SDValue();
- Index = DAG.getZExtOrTrunc(Index, dl, VT);
+ Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
- SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SizeC, Index);
+ SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
+ Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
@@ -33303,6 +34743,20 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
}
+ // Use a 32-bit and+zext if upper bits known zero.
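+  // (32-bit operations implicitly zero the upper 32 bits of the 64-bit register,
+  // so the zero_extend back to i64 is free.)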
+ if (VT == MVT::i64 && Subtarget.is64Bit() &&
+ !isa<ConstantSDNode>(N->getOperand(1))) {
+ APInt HiMask = APInt::getHighBitsSet(64, 32);
+ if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) ||
+ DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) {
+ SDLoc dl(N);
+ SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0));
+ SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1));
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
+ DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
+ }
+ }
+
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -33326,10 +34780,8 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(
{Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
- DCI.CombineTo(N, Res);
- return SDValue();
- }
+ /*HasVarMask*/ false, DAG, Subtarget))
+ return Res;
}
// Attempt to combine a scalar bitmask AND with an extracted shuffle.
@@ -33365,7 +34817,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (SDValue Shuffle = combineX86ShufflesRecursively(
{SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2,
- /*HasVarMask*/ false, DAG, DCI, Subtarget))
+ /*HasVarMask*/ false, DAG, Subtarget))
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
N->getOperand(0).getOperand(1));
}
@@ -33374,6 +34826,38 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
+static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
+ if (N->getOpcode() != ISD::OR)
+ return false;
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // Canonicalize AND to LHS.
+ if (N1.getOpcode() == ISD::AND)
+ std::swap(N0, N1);
+
+ // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
+ if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
+ return false;
+
+ Mask = N1.getOperand(0);
+ X = N1.getOperand(1);
+
+ // Check to see if the mask appeared in both the AND and ANDNP.
+ if (N0.getOperand(0) == Mask)
+ Y = N0.getOperand(1);
+ else if (N0.getOperand(1) == Mask)
+ Y = N0.getOperand(0);
+ else
+ return false;
+
+  // TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the
+  // ANDNP combine allows other combines to happen that prevent matching.
+ return true;
+}
+
// Try to fold:
// (or (and (m, y), (pandn m, x)))
// into:
@@ -33386,33 +34870,13 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
-
if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
(VT.is256BitVector() && Subtarget.hasInt256())))
return SDValue();
- // Canonicalize AND to LHS.
- if (N1.getOpcode() == ISD::AND)
- std::swap(N0, N1);
-
- // TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for
- // ANDNP combine allows other combines to happen that prevent matching.
- if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
- return SDValue();
-
- SDValue Mask = N1.getOperand(0);
- SDValue X = N1.getOperand(1);
- SDValue Y;
- if (N0.getOperand(0) == Mask)
- Y = N0.getOperand(1);
- if (N0.getOperand(1) == Mask)
- Y = N0.getOperand(0);
-
- // Check to see if the mask appeared in both the AND and ANDNP.
- if (!Y.getNode())
+ SDValue X, Y, Mask;
+ if (!matchLogicBlend(N, X, Y, Mask))
return SDValue();
// Validate that X, Y, and Mask are bitcasts, and see through them.
@@ -33509,7 +34973,7 @@ static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
// encoding of shr and lzcnt is more desirable.
SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
- DAG.getConstant(Log2b, dl, VT));
+ DAG.getConstant(Log2b, dl, MVT::i8));
return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
}
@@ -33829,63 +35293,180 @@ static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
return false;
// FIXME: Scalar type may be supported if we move it to vector register.
- if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)
+ if (!SrcVT.isVector())
return false;
EVT SrcElVT = SrcVT.getScalarType();
EVT DstElVT = DstVT.getScalarType();
- if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
- return false;
- if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)
+ if (DstElVT != MVT::i8 && DstElVT != MVT::i16 && DstElVT != MVT::i32)
return false;
if (SrcVT.is512BitVector() || Subtarget.hasVLX())
return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
return false;
}
-/// Detect a pattern of truncation with saturation:
-/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
+/// Detect patterns of truncation with unsigned saturation:
+///
+/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
+/// Return the source value x to be truncated or SDValue() if the pattern was
+/// not matched.
+///
+/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
+/// where C1 >= 0 and C2 is unsigned max of destination type.
+///
+/// (truncate (smax (smin (x, C2), C1)) to dest_type)
+/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
+///
+/// These two patterns are equivalent to:
+/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
+/// So return the smax(x, C1) value to be truncated or SDValue() if the
+/// pattern was not matched.
+static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
+ const SDLoc &DL) {
+ EVT InVT = In.getValueType();
+
+ // Saturation with truncation. We truncate from InVT to VT.
+ assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
+ "Unexpected types for truncate operation");
+
+ // Match min/max and return limit value as a parameter.
+ auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
+ if (V.getOpcode() == Opcode &&
+ ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
+ return V.getOperand(0);
+ return SDValue();
+ };
+
+ APInt C1, C2;
+ if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
+    // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
+    // the element size of the destination type.
+ if (C2.isMask(VT.getScalarSizeInBits()))
+ return UMin;
+
+ if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
+ if (MatchMinMax(SMin, ISD::SMAX, C1))
+ if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
+ return SMin;
+
+ if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
+ if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
+ if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
+ C2.uge(C1)) {
+ return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
+ }
+
+ return SDValue();
+}
+
+/// Detect patterns of truncation with signed saturation:
+/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
+/// signed_max_of_dest_type)) to dest_type)
+/// or:
+/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
+/// signed_min_of_dest_type)) to dest_type).
+/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
-static SDValue detectUSatPattern(SDValue In, EVT VT) {
- if (In.getOpcode() != ISD::UMIN)
+static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
+ unsigned NumDstBits = VT.getScalarSizeInBits();
+ unsigned NumSrcBits = In.getScalarValueSizeInBits();
+ assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
+
+ auto MatchMinMax = [](SDValue V, unsigned Opcode,
+ const APInt &Limit) -> SDValue {
+ APInt C;
+ if (V.getOpcode() == Opcode &&
+ ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
+ return V.getOperand(0);
return SDValue();
+ };
- //Saturation with truncation. We truncate from InVT to VT.
- assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
- "Unexpected types for truncate operation");
-
- APInt C;
- if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
- // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
- // the element size of the destination type.
- return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) :
- SDValue();
+ APInt SignedMax, SignedMin;
+ if (MatchPackUS) {
+ SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits);
+ SignedMin = APInt(NumSrcBits, 0);
+ } else {
+ SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
+ SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
}
+
+ if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
+ if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
+ return SMax;
+
+ if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
+ if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
+ return SMin;
+
return SDValue();
}
+/// Detect a pattern of truncation with signed saturation.
+/// The types should allow to use VPMOVSS* instruction on AVX512.
+/// Return the source value to be truncated or SDValue() if the pattern was not
+/// matched.
+static SDValue detectAVX512SSatPattern(SDValue In, EVT VT,
+ const X86Subtarget &Subtarget,
+ const TargetLowering &TLI) {
+ if (!TLI.isTypeLegal(In.getValueType()))
+ return SDValue();
+ if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
+ return SDValue();
+ return detectSSatPattern(In, VT);
+}
+
/// Detect a pattern of truncation with saturation:
/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
/// The types should allow to use VPMOVUS* instruction on AVX512.
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
-static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
- const X86Subtarget &Subtarget) {
+static SDValue detectAVX512USatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
+ const SDLoc &DL,
+ const X86Subtarget &Subtarget,
+ const TargetLowering &TLI) {
+ if (!TLI.isTypeLegal(In.getValueType()))
+ return SDValue();
if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
return SDValue();
- return detectUSatPattern(In, VT);
+ return detectUSatPattern(In, VT, DAG, DL);
}
-static SDValue
-combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ EVT SVT = VT.getScalarType();
+ EVT InVT = In.getValueType();
+ EVT InSVT = InVT.getScalarType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (!TLI.isTypeLegal(In.getValueType()) || !TLI.isTypeLegal(VT))
- return SDValue();
- if (auto USatVal = detectUSatPattern(In, VT))
- if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
+ if (TLI.isTypeLegal(InVT) && TLI.isTypeLegal(VT) &&
+ isSATValidOnAVX512Subtarget(InVT, VT, Subtarget)) {
+ if (auto SSatVal = detectSSatPattern(In, VT))
+ return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
+ if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
+ }
+ if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) &&
+ (SVT == MVT::i8 || SVT == MVT::i16) &&
+ (InSVT == MVT::i16 || InSVT == MVT::i32)) {
+ if (auto USatVal = detectSSatPattern(In, VT, true)) {
+ // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
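+      // (PACKUSDW needs SSE4.1; PACKSSDW and PACKUSWB are available from SSE2.)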
+ if (SVT == MVT::i8 && InSVT == MVT::i32) {
+ EVT MidVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
+ VT.getVectorNumElements());
+ SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
+ DAG, Subtarget);
+ if (Mid)
+ return truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
+ Subtarget);
+ } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
+ return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
+ Subtarget);
+ }
+ if (auto SSatVal = detectSSatPattern(In, VT))
+ return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
+ Subtarget);
+ }
return SDValue();
}
@@ -33895,7 +35476,7 @@ combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
const SDLoc &DL) {
- if (!VT.isVector() || !VT.isSimple())
+ if (!VT.isVector())
return SDValue();
EVT InVT = In.getValueType();
unsigned NumElems = VT.getVectorNumElements();
@@ -33937,42 +35518,13 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
if (!C)
return false;
- uint64_t Val = C->getZExtValue();
- if (Val < Min || Val > Max)
+ const APInt &Val = C->getAPIntValue();
+ if (Val.ult(Min) || Val.ugt(Max))
return false;
}
return true;
};
- // Split vectors to legal target size and apply AVG.
- auto LowerToAVG = [&](SDValue Op0, SDValue Op1) {
- unsigned NumSubs = 1;
- if (Subtarget.hasBWI()) {
- if (VT.getSizeInBits() > 512)
- NumSubs = VT.getSizeInBits() / 512;
- } else if (Subtarget.hasAVX2()) {
- if (VT.getSizeInBits() > 256)
- NumSubs = VT.getSizeInBits() / 256;
- } else {
- if (VT.getSizeInBits() > 128)
- NumSubs = VT.getSizeInBits() / 128;
- }
-
- if (NumSubs == 1)
- return DAG.getNode(X86ISD::AVG, DL, VT, Op0, Op1);
-
- SmallVector<SDValue, 4> Subs;
- EVT SubVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
- VT.getVectorNumElements() / NumSubs);
- for (unsigned i = 0; i != NumSubs; ++i) {
- unsigned Idx = i * SubVT.getVectorNumElements();
- SDValue LHS = extractSubVector(Op0, Idx, DAG, DL, SubVT.getSizeInBits());
- SDValue RHS = extractSubVector(Op1, Idx, DAG, DL, SubVT.getSizeInBits());
- Subs.push_back(DAG.getNode(X86ISD::AVG, DL, SubVT, LHS, RHS));
- }
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
- };
-
// Check if each element of the vector is left-shifted by one.
auto LHS = In.getOperand(0);
auto RHS = In.getOperand(1);
@@ -33986,6 +35538,11 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
Operands[0] = LHS.getOperand(0);
Operands[1] = LHS.getOperand(1);
+ auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
+ };
+
// Take care of the case when one of the operands is a constant vector whose
// element is in the range [1, 256].
if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
@@ -33996,7 +35553,9 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
SDValue VecOnes = DAG.getConstant(1, DL, InVT);
Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
- return LowerToAVG(Operands[0].getOperand(0), Operands[1]);
+ return SplitOpsAndApply(DAG, Subtarget, DL, VT,
+ { Operands[0].getOperand(0), Operands[1] },
+ AVGBuilder);
}
if (Operands[0].getOpcode() == ISD::ADD)
@@ -34019,8 +35578,10 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
Operands[j].getOperand(0).getValueType() != VT)
return SDValue();
- // The pattern is detected, emit X86ISD::AVG instruction.
- return LowerToAVG(Operands[0].getOperand(0), Operands[1].getOperand(0));
+ // The pattern is detected, emit X86ISD::AVG instruction(s).
+ return SplitOpsAndApply(DAG, Subtarget, DL, VT,
+ { Operands[0].getOperand(0),
+ Operands[1].getOperand(0) }, AVGBuilder);
}
return SDValue();
@@ -34451,6 +36012,63 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
SDValue StoredVal = St->getOperand(1);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
+ // This will avoid a copy to k-register.
+ if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
+ StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ StoredVal.getOperand(0).getValueType() == MVT::i8) {
+ return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0),
+ St->getBasePtr(), St->getPointerInfo(),
+ St->getAlignment(), St->getMemOperand()->getFlags());
+ }
+
+ // Widen v2i1/v4i1 stores to v8i1.
+ if ((VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
+ Subtarget.hasAVX512()) {
+ unsigned NumConcats = 8 / VT.getVectorNumElements();
+ SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(VT));
+ Ops[0] = StoredVal;
+ StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
+ return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
+ St->getPointerInfo(), St->getAlignment(),
+ St->getMemOperand()->getFlags());
+ }
+
+ // Turn vXi1 stores of constants into a scalar store.
+ if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
+ VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
+ ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
+ // If its a v64i1 store without 64-bit support, we need two stores.
+    // If it's a v64i1 store without 64-bit support, we need two stores.
+ SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
+ StoredVal->ops().slice(0, 32));
+ Lo = combinevXi1ConstantToInteger(Lo, DAG);
+ SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
+ StoredVal->ops().slice(32, 32));
+ Hi = combinevXi1ConstantToInteger(Hi, DAG);
+
+ unsigned Alignment = St->getAlignment();
+
+ SDValue Ptr0 = St->getBasePtr();
+ SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl);
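+      // Ptr1 addresses the upper 32 mask bits at byte offset 4.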
+
+ SDValue Ch0 =
+ DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
+ Alignment, St->getMemOperand()->getFlags());
+ SDValue Ch1 =
+ DAG.getStore(St->getChain(), dl, Hi, Ptr1,
+ St->getPointerInfo().getWithOffset(4),
+ MinAlign(Alignment, 4U),
+ St->getMemOperand()->getFlags());
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
+ }
+
+ StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
+ return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
+ St->getPointerInfo(), St->getAlignment(),
+ St->getMemOperand()->getFlags());
+ }
+
// If we are saving a concatenation of two XMM registers and 32-byte stores
// are slow, such as on Sandy Bridge, perform two 16-byte stores.
bool Fast;
@@ -34493,13 +36111,19 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
St->getPointerInfo(), St->getAlignment(),
St->getMemOperand()->getFlags());
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (SDValue Val =
- detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
+ detectAVX512SSatPattern(St->getValue(), St->getMemoryVT(), Subtarget,
+ TLI))
+ return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
+ dl, Val, St->getBasePtr(),
+ St->getMemoryVT(), St->getMemOperand(), DAG);
+ if (SDValue Val = detectAVX512USatPattern(St->getValue(), St->getMemoryVT(),
+ DAG, dl, Subtarget, TLI))
return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
dl, Val, St->getBasePtr(),
St->getMemoryVT(), St->getMemOperand(), DAG);
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned NumElems = VT.getVectorNumElements();
assert(StVT != VT && "Cannot truncate to the same type");
unsigned FromSz = VT.getScalarSizeInBits();
@@ -34812,7 +36436,7 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
// Try to synthesize horizontal add/sub from adds/subs of shuffles.
if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
- (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
+ (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
isHorizontalBinOp(LHS, RHS, IsFadd)) {
auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
@@ -34825,7 +36449,7 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
- SDLoc &DL) {
+ const SDLoc &DL) {
assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
SDValue Src = N->getOperand(0);
unsigned Opcode = Src.getOpcode();
@@ -34898,7 +36522,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
// X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its
// better to truncate if we have the chance.
if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
- !Subtarget.hasDQI())
+ !TLI.isOperationLegal(Opcode, SrcVT))
return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
LLVM_FALLTHROUGH;
case ISD::ADD: {
@@ -34915,88 +36539,50 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
-static SDValue
-combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
- SmallVector<SDValue, 8> &Regs) {
- assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
- Regs[0].getValueType() == MVT::v2i64));
+/// Truncate using ISD::AND mask and X86ISD::PACKUS.
+static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDValue In = N->getOperand(0);
+ EVT InVT = In.getValueType();
+ EVT InSVT = InVT.getVectorElementType();
EVT OutVT = N->getValueType(0);
EVT OutSVT = OutVT.getVectorElementType();
- EVT InVT = Regs[0].getValueType();
- EVT InSVT = InVT.getVectorElementType();
- SDLoc DL(N);
- // First, use mask to unset all bits that won't appear in the result.
- assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
- "OutSVT can only be either i8 or i16.");
+ // Split a long vector into vectors of legal type and mask to unset all bits
+ // that won't appear in the result to prevent saturation.
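+  // (PACKUS saturates, so any stray high bits would clamp the result rather than
+  // truncate; masking first makes the pack an exact truncation.)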
+ // TODO - we should be doing this at the maximum legal size but this is
+ // causing regressions where we're concatenating back to max width just to
+  // perform the AND and then extract back again.
+ unsigned NumSubRegs = InVT.getSizeInBits() / 128;
+ unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
+ EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
+ SmallVector<SDValue, 8> SubVecs(NumSubRegs);
+
APInt Mask =
APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
- SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
- for (auto &Reg : Regs)
- Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
-
- MVT UnpackedVT, PackedVT;
- if (OutSVT == MVT::i8) {
- UnpackedVT = MVT::v8i16;
- PackedVT = MVT::v16i8;
- } else {
- UnpackedVT = MVT::v4i32;
- PackedVT = MVT::v8i16;
- }
-
- // In each iteration, truncate the type by a half size.
- auto RegNum = Regs.size();
- for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
- j < e; j *= 2, RegNum /= 2) {
- for (unsigned i = 0; i < RegNum; i++)
- Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
- for (unsigned i = 0; i < RegNum / 2; i++)
- Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
- Regs[i * 2 + 1]);
- }
-
- // If the type of the result is v8i8, we need do one more X86ISD::PACKUS, and
- // then extract a subvector as the result since v8i8 is not a legal type.
- if (OutVT == MVT::v8i8) {
- Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
- Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
- DAG.getIntPtrConstant(0, DL));
- return Regs[0];
- } else if (RegNum > 1) {
- Regs.resize(RegNum);
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
- } else
- return Regs[0];
-}
-
-/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
-static SDValue
-combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
- SelectionDAG &DAG,
- SmallVector<SDValue, 8> &Regs) {
- assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
- EVT OutVT = N->getValueType(0);
- SDLoc DL(N);
+ SDValue MaskVal = DAG.getConstant(Mask, DL, SubRegVT);
- // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
- SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
- for (auto &Reg : Regs) {
- Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt,
- Subtarget, DAG);
- Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt,
- Subtarget, DAG);
+ for (unsigned i = 0; i < NumSubRegs; i++) {
+ SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
+ DAG.getIntPtrConstant(i * NumSubRegElts, DL));
+ SubVecs[i] = DAG.getNode(ISD::AND, DL, SubRegVT, Sub, MaskVal);
}
+ In = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, SubVecs);
- for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
- Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
- Regs[i * 2 + 1]);
+ return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
+}
- if (Regs.size() > 2) {
- Regs.resize(Regs.size() / 2);
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
- } else
- return Regs[0];
+/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
+static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDValue In = N->getOperand(0);
+ EVT InVT = In.getValueType();
+ EVT OutVT = N->getValueType(0);
+ In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
+ DAG.getValueType(OutVT));
+ return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
}
/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
@@ -35037,32 +36623,21 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
return SDValue();
SDLoc DL(N);
-
- // Split a long vector into vectors of legal type.
- unsigned RegNum = InVT.getSizeInBits() / 128;
- SmallVector<SDValue, 8> SubVec(RegNum);
- unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
- EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
-
- for (unsigned i = 0; i < RegNum; i++)
- SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
- DAG.getIntPtrConstant(i * NumSubRegElts, DL));
-
// SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
// for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
// truncate 2 x v4i32 to v8i16.
if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
- return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
- else if (InSVT == MVT::i32)
- return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
- else
- return SDValue();
+ return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
+ if (InSVT == MVT::i32)
+ return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
+
+ return SDValue();
}
/// This function transforms vector truncation of 'extended sign-bits' or
/// 'extended zero-bits' values.
/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
-static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
+static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Requires SSE2 but AVX512 has fast truncate.
@@ -35082,7 +36657,7 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
MVT InVT = In.getValueType().getSimpleVT();
MVT InSVT = InVT.getScalarType();
- // Check we have a truncation suited for PACKSS.
+ // Check we have a truncation suited for PACKSS/PACKUS.
if (!VT.is128BitVector() && !VT.is256BitVector())
return SDValue();
if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
@@ -35090,25 +36665,79 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
return SDValue();
- // Use PACKSS if the input has sign-bits that extend all the way to the
- // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
- unsigned NumSignBits = DAG.ComputeNumSignBits(In);
- unsigned NumPackedBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
- if (NumSignBits > (InSVT.getSizeInBits() - NumPackedBits))
- return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
+ unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
+ unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
// Use PACKUS if the input has zero-bits that extend all the way to the
// packed/truncated value. e.g. masks, zext_in_reg, etc.
KnownBits Known;
DAG.computeKnownBits(In, Known);
unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
- NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
- if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedBits))
+ if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
+ // Use PACKSS if the input has sign-bits that extend all the way to the
+ // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
+ unsigned NumSignBits = DAG.ComputeNumSignBits(In);
+ if (NumSignBits > (InSVT.getSizeInBits() - NumPackedSignBits))
+ return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
+
return SDValue();
}
+// Try to form a MULHU or MULHS node by looking for
+// (trunc (srl (mul ext, ext), 16))
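+// e.g. (trunc (srl (mul (sext x), (sext y)), 16)) --> (mulhs x, y) for vXi16 x and y.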
+// TODO: This is X86 specific because we want to be able to handle wide types
+// before type legalization. But we can only do it if the vector will be
+// legalized via widening/splitting. Type legalization can't handle promotion
+// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
+// combiner.
+static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget) {
+ // First instruction should be a right shift of a multiply.
+ if (Src.getOpcode() != ISD::SRL ||
+ Src.getOperand(0).getOpcode() != ISD::MUL)
+ return SDValue();
+
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
+ // Only handle vXi16 types that are at least 128-bits.
+ if (!VT.isVector() || VT.getVectorElementType() != MVT::i16 ||
+ VT.getVectorNumElements() < 8)
+ return SDValue();
+
+ // Input type should be vXi32.
+ EVT InVT = Src.getValueType();
+ if (InVT.getVectorElementType() != MVT::i32)
+ return SDValue();
+
+ // Need a shift by 16.
+ APInt ShiftAmt;
+ if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
+ ShiftAmt != 16)
+ return SDValue();
+
+ SDValue LHS = Src.getOperand(0).getOperand(0);
+ SDValue RHS = Src.getOperand(0).getOperand(1);
+
+ unsigned ExtOpc = LHS.getOpcode();
+ if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
+ RHS.getOpcode() != ExtOpc)
+ return SDValue();
+
+ // Peek through the extends.
+ LHS = LHS.getOperand(0);
+ RHS = RHS.getOperand(0);
+
+ // Ensure the input types match.
+ if (LHS.getValueType() != VT || RHS.getValueType() != VT)
+ return SDValue();
+
+ unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
+ return DAG.getNode(Opc, DL, VT, LHS, RHS);
+}
+
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
@@ -35123,10 +36752,14 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
return Avg;
- // Try to combine truncation with unsigned saturation.
- if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
+ // Try to combine truncation with signed/unsigned saturation.
+ if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
return Val;
+ // Try to combine PMULHUW/PMULHW for vXi16.
+ if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
+ return V;
+
// The bitcast source is a direct mmx result.
// Detect bitcasts between i32 to x86mmx
if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
@@ -35224,7 +36857,7 @@ static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
// If we're negating an FMA node, then we can adjust the
// instruction to include the extra negation.
unsigned NewOpcode = 0;
- if (Arg.hasOneUse()) {
+ if (Arg.hasOneUse() && Subtarget.hasAnyFMA()) {
switch (Arg.getOpcode()) {
case ISD::FMA: NewOpcode = X86ISD::FNMSUB; break;
case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
@@ -35320,6 +36953,39 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ unsigned NumBits = VT.getSizeInBits();
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+
+ // TODO - Constant Folding.
+ if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
+ // Reduce Cst1 to the bottom 16-bits.
+ // NOTE: SimplifyDemandedBits won't do this for constants.
+ const APInt &Val1 = Cst1->getAPIntValue();
+ APInt MaskedVal1 = Val1 & 0xFFFF;
+ if (MaskedVal1 != Val1)
+ return DAG.getNode(X86ISD::BEXTR, SDLoc(N), VT, Op0,
+ DAG.getConstant(MaskedVal1, SDLoc(N), VT));
+ }
+
+ // Only bottom 16-bits of the control bits are required.
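+  // (The BEXTR control encodes the start index in bits [7:0] and the length in
+  // bits [15:8].)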
+ KnownBits Known;
+ APInt DemandedMask(APInt::getLowBitsSet(NumBits, 16));
+ if (TLI.SimplifyDemandedBits(Op1, DemandedMask, Known, TLO)) {
+ DCI.CommitTargetLoweringOpt(TLO);
+ return SDValue(N, 0);
+ }
+
+ return SDValue();
+}
static bool isNullFPScalarOrVectorConst(SDValue V) {
return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
@@ -35450,8 +37116,6 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
if (Subtarget.useSoftFloat())
return SDValue();
- // TODO: Check for global or instruction-level "nnan". In that case, we
- // should be able to lower to FMAX/FMIN alone.
// TODO: If an operand is already known to be a NaN or not a NaN, this
// should be an optional swap and FMAX/FMIN.
@@ -35461,14 +37125,21 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
(Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
return SDValue();
- // This takes at least 3 instructions, so favor a library call when operating
- // on a scalar and minimizing code size.
- if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
- return SDValue();
-
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
SDLoc DL(N);
+ auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
+
+ // If we don't have to respect NaN inputs, this is a direct translation to x86
+ // min/max instructions.
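+  // (x86 min/max pass through the second source operand when either input is NaN,
+  // so they only implement fminnum/fmaxnum semantics once NaNs are excluded.)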
+ if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
+ return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
+
+ // If we have to respect NaN inputs, this takes at least 3 instructions.
+ // Favor a library call when operating on a scalar and minimizing code size.
+ if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
+ return SDValue();
+
EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
DAG.getDataLayout(), *DAG.getContext(), VT);
@@ -35491,9 +37162,8 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
// use those instructions for fmaxnum by selecting away a NaN input.
// If either operand is NaN, the 2nd source operand (Op0) is passed through.
- auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
- SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
+ SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
// If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
// are NaN, the NaN value of Op1 is the result.
@@ -35519,10 +37189,8 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(
{Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
- DCI.CombineTo(N, Res);
- return SDValue();
- }
+ /*HasVarMask*/ false, DAG, Subtarget))
+ return Res;
}
return SDValue();
@@ -35542,12 +37210,54 @@ static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+// Try to combine sext_in_reg of a cmov of constants by extending the constants.
+static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
- if (!VT.isVector())
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
+
+ if (ExtraVT != MVT::i16)
return SDValue();
+ // Look through single use any_extends.
+ if (N0.getOpcode() == ISD::ANY_EXTEND && N0.hasOneUse())
+ N0 = N0.getOperand(0);
+
+ // See if we have a single use cmov.
+ if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
+ return SDValue();
+
+ SDValue CMovOp0 = N0.getOperand(0);
+ SDValue CMovOp1 = N0.getOperand(1);
+
+ // Make sure both operands are constants.
+ if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
+ !isa<ConstantSDNode>(CMovOp1.getNode()))
+ return SDValue();
+
+ SDLoc DL(N);
+
+ // If we looked through an any_extend above, extend the constants to match.
+ if (N0.getValueType() != VT) {
+ CMovOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp0);
+ CMovOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp1);
+ }
+
+ CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp0, N1);
+ CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp1, N1);
+
+ return DAG.getNode(X86ISD::CMOV, DL, VT, CMovOp0, CMovOp1,
+ N0.getOperand(2), N0.getOperand(3));
+}
+
+static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (SDValue V = combineSextInRegCmov(N, DAG))
+ return V;
+
+ EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
@@ -35686,7 +37396,7 @@ static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
// promotion).
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
SDValue CMovN = Extend->getOperand(0);
- if (CMovN.getOpcode() != X86ISD::CMOV)
+ if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
return SDValue();
EVT TargetVT = Extend->getValueType(0);
@@ -35697,20 +37407,36 @@ static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
SDValue CMovOp0 = CMovN.getOperand(0);
SDValue CMovOp1 = CMovN.getOperand(1);
- bool DoPromoteCMOV =
- (VT == MVT::i16 && (TargetVT == MVT::i32 || TargetVT == MVT::i64)) &&
- CMovN.hasOneUse() &&
- (isa<ConstantSDNode>(CMovOp0.getNode()) &&
- isa<ConstantSDNode>(CMovOp1.getNode()));
+ if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
+ !isa<ConstantSDNode>(CMovOp1.getNode()))
+ return SDValue();
- if (!DoPromoteCMOV)
+ // Only extend to i32 or i64.
+ if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
return SDValue();
- CMovOp0 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp0);
- CMovOp1 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp1);
+ // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
+ // are free.
+ if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
+ return SDValue();
- return DAG.getNode(X86ISD::CMOV, DL, TargetVT, CMovOp0, CMovOp1,
- CMovN.getOperand(2), CMovN.getOperand(3));
+ // If this is a zero extend to i64, we should only extend to i32 and use a free
+ // zero extend to finish.
+ EVT ExtendVT = TargetVT;
+ if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
+ ExtendVT = MVT::i32;
+
+ CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
+ CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
+
+ SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
+ CMovN.getOperand(2), CMovN.getOperand(3));
+
+ // Finish extending if needed.
+ if (ExtendVT != TargetVT)
+ Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
+
+ return Res;
}
// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
@@ -35866,7 +37592,7 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
// Also use this if we don't have SSE41 to allow the legalizer do its job.
if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
(VT.is256BitVector() && Subtarget.hasInt256()) ||
- (VT.is512BitVector() && Subtarget.hasAVX512())) {
+ (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
return Opcode == ISD::SIGN_EXTEND
? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
@@ -35899,12 +37625,55 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
// On pre-AVX512 targets, split into 256-bit nodes of
// ISD::*_EXTEND_VECTOR_INREG.
- if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
+ if (!Subtarget.useAVX512Regs() && !(VT.getSizeInBits() % 256))
return SplitAndExtendInReg(256);
return SDValue();
}
+// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
+// result type.
+static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+
+ // Only do this combine with AVX512 for vector extends.
+ if (!Subtarget.hasAVX512() || !VT.isVector() || N0->getOpcode() != ISD::SETCC)
+ return SDValue();
+
+ // Only combine legal element types.
+ EVT SVT = VT.getVectorElementType();
+ if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
+ SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
+ return SDValue();
+
+ // We can only do this if the vector size is 256 bits or less.
+ unsigned Size = VT.getSizeInBits();
+ if (Size > 256)
+ return SDValue();
+
+ // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
+ // those are the only integer compares we have.
+ ISD::CondCode CC = cast<CondCodeSDNode>(N0->getOperand(2))->get();
+ if (ISD::isUnsignedIntSetCC(CC))
+ return SDValue();
+
+ // Only do this combine if the extension will be fully consumed by the setcc.
+ EVT N00VT = N0.getOperand(0).getValueType();
+ EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
+ if (Size != MatchingVecType.getSizeInBits())
+ return SDValue();
+
+ SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
+
+ if (N->getOpcode() == ISD::ZERO_EXTEND)
+ Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType().getScalarType());
+
+ return Res;
+}
+
static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -35922,6 +37691,9 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
if (!DCI.isBeforeLegalizeOps())
return SDValue();
+ if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
+ return V;
+
if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
// Invert and sign-extend a boolean is the same as zero-extend and subtract
@@ -35939,7 +37711,7 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
return V;
if (VT.isVector())
- if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
+ if (SDValue R = WidenMaskArithmetic(N, DAG, Subtarget))
return R;
if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
@@ -35948,9 +37720,40 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
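+// Given an FMA-family opcode (including the *_RND variants), return the
+// equivalent opcode with the multiply and/or the accumulator negated,
+// e.g. FMA with NegAcc becomes FMSUB and FMA with NegMul becomes FNMADD.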
+static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc) {
+ if (NegMul) {
+ switch (Opcode) {
+ default: llvm_unreachable("Unexpected opcode");
+ case ISD::FMA: Opcode = X86ISD::FNMADD; break;
+ case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
+ case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
+ case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
+ case X86ISD::FNMADD: Opcode = ISD::FMA; break;
+ case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
+ case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
+ case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
+ }
+ }
+
+ if (NegAcc) {
+ switch (Opcode) {
+ default: llvm_unreachable("Unexpected opcode");
+ case ISD::FMA: Opcode = X86ISD::FMSUB; break;
+ case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
+ case X86ISD::FMSUB: Opcode = ISD::FMA; break;
+ case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
+ case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
+ case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
+ case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
+ case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
+ }
+ }
+
+ return Opcode;
+}
+
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- // TODO: Handle FMSUB/FNMADD/FNMSUB as the starting opcode.
SDLoc dl(N);
EVT VT = N->getValueType(0);
@@ -35966,96 +37769,41 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
SDValue B = N->getOperand(1);
SDValue C = N->getOperand(2);
- auto invertIfNegative = [](SDValue &V) {
+ auto invertIfNegative = [&DAG](SDValue &V) {
if (SDValue NegVal = isFNEG(V.getNode())) {
- V = NegVal;
+ V = DAG.getBitcast(V.getValueType(), NegVal);
return true;
}
+ // Look through extract_vector_elts. If it comes from an FNEG, create a
+ // new extract from the FNEG input.
+ if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ isa<ConstantSDNode>(V.getOperand(1)) &&
+ cast<ConstantSDNode>(V.getOperand(1))->getZExtValue() == 0) {
+ if (SDValue NegVal = isFNEG(V.getOperand(0).getNode())) {
+ NegVal = DAG.getBitcast(V.getOperand(0).getValueType(), NegVal);
+ V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
+ NegVal, V.getOperand(1));
+ return true;
+ }
+ }
+
return false;
};
// Do not convert the passthru input of scalar intrinsics.
// FIXME: We could allow negations of the lower element only.
- bool NegA = N->getOpcode() != X86ISD::FMADDS1 &&
- N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
+ bool NegA = invertIfNegative(A);
bool NegB = invertIfNegative(B);
- bool NegC = N->getOpcode() != X86ISD::FMADDS3 &&
- N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
+ bool NegC = invertIfNegative(C);
- // Negative multiplication when NegA xor NegB
- bool NegMul = (NegA != NegB);
- bool HasNeg = NegA || NegB || NegC;
+ if (!NegA && !NegB && !NegC)
+ return SDValue();
- unsigned NewOpcode;
- if (!NegMul)
- NewOpcode = (!NegC) ? unsigned(ISD::FMA) : unsigned(X86ISD::FMSUB);
- else
- NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
-
- // For FMA, we risk reconstructing the node we started with.
- // In order to avoid this, we check for negation or opcode change. If
- // one of the two happened, then it is a new node and we return it.
- if (N->getOpcode() == ISD::FMA) {
- if (HasNeg || NewOpcode != N->getOpcode())
- return DAG.getNode(NewOpcode, dl, VT, A, B, C);
- return SDValue();
- }
-
- if (N->getOpcode() == X86ISD::FMADD_RND) {
- switch (NewOpcode) {
- case ISD::FMA: NewOpcode = X86ISD::FMADD_RND; break;
- case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break;
- case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
- case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
- }
- } else if (N->getOpcode() == X86ISD::FMADDS1) {
- switch (NewOpcode) {
- case ISD::FMA: NewOpcode = X86ISD::FMADDS1; break;
- case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1; break;
- case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1; break;
- case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1; break;
- }
- } else if (N->getOpcode() == X86ISD::FMADDS3) {
- switch (NewOpcode) {
- case ISD::FMA: NewOpcode = X86ISD::FMADDS3; break;
- case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3; break;
- case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3; break;
- case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3; break;
- }
- } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
- switch (NewOpcode) {
- case ISD::FMA: NewOpcode = X86ISD::FMADDS1_RND; break;
- case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1_RND; break;
- case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
- case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
- }
- } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
- switch (NewOpcode) {
- case ISD::FMA: NewOpcode = X86ISD::FMADDS3_RND; break;
- case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3_RND; break;
- case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
- case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
- }
- } else if (N->getOpcode() == X86ISD::FMADD4S) {
- switch (NewOpcode) {
- case ISD::FMA: NewOpcode = X86ISD::FMADD4S; break;
- case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB4S; break;
- case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD4S; break;
- case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB4S; break;
- }
- } else {
- llvm_unreachable("Unexpected opcode!");
- }
+ unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC);
- // Only return the node is the opcode was changed or one of the
- // operand was negated. If not, we'll just recreate the same node.
- if (HasNeg || NewOpcode != N->getOpcode()) {
- if (N->getNumOperands() == 4)
- return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
- return DAG.getNode(NewOpcode, dl, VT, A, B, C);
- }
-
- return SDValue();
+ if (N->getNumOperands() == 4)
+ return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
+ return DAG.getNode(NewOpcode, dl, VT, A, B, C);
}
// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
@@ -36124,6 +37872,10 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
return NewCMov;
+ if (DCI.isBeforeLegalizeOps())
+ if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
+ return V;
+
if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
return V;
@@ -36131,7 +37883,7 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
return V;
if (VT.isVector())
- if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
+ if (SDValue R = WidenMaskArithmetic(N, DAG, Subtarget))
return R;
if (SDValue DivRem8 = getDivRem8(N, DAG))
@@ -36153,13 +37905,23 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
- // We're looking for an oversized integer equality comparison, but ignore a
- // comparison with zero because that gets special treatment in EmitTest().
+ // We're looking for an oversized integer equality comparison.
SDValue X = SetCC->getOperand(0);
SDValue Y = SetCC->getOperand(1);
EVT OpVT = X.getValueType();
unsigned OpSize = OpVT.getSizeInBits();
- if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y))
+ if (!OpVT.isScalarInteger() || OpSize < 128)
+ return SDValue();
+
+ // Ignore a comparison with zero because that gets special treatment in
+ // EmitTest(). But make an exception for the special case of a pair of
+ // logically-combined vector-sized operands compared to zero. This pattern may
+ // be generated by the memcmp expansion pass with oversized integer compares
+ // (see PR33325).
+ bool IsOrXorXorCCZero = isNullConstant(Y) && X.getOpcode() == ISD::OR &&
+ X.getOperand(0).getOpcode() == ISD::XOR &&
+ X.getOperand(1).getOpcode() == ISD::XOR;
+ if (isNullConstant(Y) && !IsOrXorXorCCZero)
return SDValue();
// Bail out if we know that this is not really just an oversized integer.
@@ -36174,15 +37936,29 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
if ((OpSize == 128 && Subtarget.hasSSE2()) ||
(OpSize == 256 && Subtarget.hasAVX2())) {
EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
- SDValue VecX = DAG.getBitcast(VecVT, X);
- SDValue VecY = DAG.getBitcast(VecVT, Y);
-
+ SDValue Cmp;
+ if (IsOrXorXorCCZero) {
+ // This is a bitwise-combined equality comparison of 2 pairs of vectors:
+ // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
+ // Use 2 vector equality compares and 'and' the results before doing a
+ // MOVMSK.
+ SDValue A = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(0));
+ SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1));
+ SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0));
+ SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1));
+ SDValue Cmp1 = DAG.getSetCC(DL, VecVT, A, B, ISD::SETEQ);
+ SDValue Cmp2 = DAG.getSetCC(DL, VecVT, C, D, ISD::SETEQ);
+ Cmp = DAG.getNode(ISD::AND, DL, VecVT, Cmp1, Cmp2);
+ } else {
+ SDValue VecX = DAG.getBitcast(VecVT, X);
+ SDValue VecY = DAG.getBitcast(VecVT, Y);
+ Cmp = DAG.getSetCC(DL, VecVT, VecX, VecY, ISD::SETEQ);
+ }
// If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
// setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
// setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
// setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
// setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
- SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
MVT::i32);
@@ -36198,10 +37974,10 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
EVT VT = N->getValueType(0);
+ EVT OpVT = LHS.getValueType();
SDLoc DL(N);
if (CC == ISD::SETNE || CC == ISD::SETEQ) {
- EVT OpVT = LHS.getValueType();
// 0-x == y --> x+y == 0
// 0-x != y --> x+y != 0
if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
@@ -36250,6 +38026,20 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
}
}
+ // If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
+ // pre-promote its result type since vXi1 vectors don't get promoted
+ // during type legalization.
+ // NOTE: The element count check is to ignore operand types that need to
+ // go through type promotion to a 128-bit vector.
+ if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
+ VT.getVectorElementType() == MVT::i1 && VT.getVectorNumElements() > 4 &&
+ (OpVT.getVectorElementType() == MVT::i8 ||
+ OpVT.getVectorElementType() == MVT::i16)) {
+ SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS,
+ N->getOperand(2));
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
+ }
+
// For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
// to avoid scalarization via legalization because v4i32 is not a legal type.
if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
@@ -36264,6 +38054,19 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
SDValue Src = N->getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
+ // Perform constant folding.
+ if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
+ assert(N->getValueType(0) == MVT::i32 && "Unexpected result type");
+ APInt Imm(32, 0);
+ for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
+ SDValue In = Src.getOperand(Idx);
+ if (!In.isUndef() &&
+ cast<ConstantSDNode>(In)->getAPIntValue().isNegative())
+ Imm.setBit(Idx);
+ }
+ return DAG.getConstant(Imm, SDLoc(N), N->getValueType(0));
+ }
+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
!DCI.isBeforeLegalizeOps());
@@ -36295,12 +38098,14 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
NewOps[4] = Index.getOperand(0);
- DAG.UpdateNodeOperands(N, NewOps);
- // The original sign extend has less users, add back to worklist in case
- // it needs to be removed
- DCI.AddToWorklist(Index.getNode());
- DCI.AddToWorklist(N);
- return SDValue(N, 0);
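+ // UpdateNodeOperands may CSE to an already-existing node; only requeue N
+ // when it was actually updated in place.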
+ SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
+ if (Res == N) {
+ // The original sign extend has fewer users, so add it back to the
+ // worklist in case it needs to be removed.
+ DCI.AddToWorklist(Index.getNode());
+ DCI.AddToWorklist(N);
+ }
+ return SDValue(Res, 0);
}
}
@@ -36313,9 +38118,10 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
NewOps[4] = Index;
- DAG.UpdateNodeOperands(N, NewOps);
- DCI.AddToWorklist(N);
- return SDValue(N, 0);
+ SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
+ if (Res == N)
+ DCI.AddToWorklist(N);
+ return SDValue(Res, 0);
}
// Try to remove zero extends from 32->64 if we know the sign bit of
@@ -36326,32 +38132,24 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
if (DAG.SignBitIsZero(Index.getOperand(0))) {
SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
NewOps[4] = Index.getOperand(0);
- DAG.UpdateNodeOperands(N, NewOps);
- // The original zero extend has less users, add back to worklist in case
- // it needs to be removed
- DCI.AddToWorklist(Index.getNode());
- DCI.AddToWorklist(N);
- return SDValue(N, 0);
+ SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
+ if (Res == N) {
+ // The original zero extend has fewer users, so add it back to the
+ // worklist in case it needs to be removed.
+ DCI.AddToWorklist(Index.getNode());
+ DCI.AddToWorklist(N);
+ }
+ return SDValue(Res, 0);
}
}
}
- // Gather and Scatter instructions use k-registers for masks. The type of
- // the masks is v*i1. So the mask will be truncated anyway.
- // The SIGN_EXTEND_INREG my be dropped.
- SDValue Mask = N->getOperand(2);
- if (Subtarget.hasAVX512() && Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
- SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
- NewOps[2] = Mask.getOperand(0);
- DAG.UpdateNodeOperands(N, NewOps);
- return SDValue(N, 0);
- }
-
// With AVX2 we only demand the upper bit of the mask.
if (!Subtarget.hasAVX512()) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
!DCI.isBeforeLegalizeOps());
+ SDValue Mask = N->getOperand(2);
KnownBits Known;
APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
if (TLI.SimplifyDemandedBits(Mask, DemandedMask, Known, TLO)) {
@@ -36448,11 +38246,11 @@ static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
SDValue Op0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT InVT = Op0.getValueType();
- EVT InSVT = InVT.getScalarType();
+ // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
// UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
// UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
- if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
+ if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
SDLoc dl(N);
EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
InVT.getVectorNumElements());
@@ -36482,14 +38280,11 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
SDValue Op0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT InVT = Op0.getValueType();
- EVT InSVT = InVT.getScalarType();
// SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
// SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
// SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
- if (InVT.isVector() &&
- (InSVT == MVT::i8 || InSVT == MVT::i16 ||
- (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
+ if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
SDLoc dl(N);
EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
InVT.getVectorNumElements());
@@ -36524,6 +38319,11 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
if (VT == MVT::f16 || VT == MVT::f128)
return SDValue();
+ // If we have AVX512DQ we can use packed conversion instructions unless
+ // the VT is f80.
+ if (Subtarget.hasDQI() && VT != MVT::f80)
+ return SDValue();
+
if (!Ld->isVolatile() && !VT.isVector() &&
ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
!Subtarget.is64Bit() && LdVT == MVT::i64) {
@@ -36778,15 +38578,9 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
- unsigned RegSize = 128;
- if (Subtarget.hasBWI())
- RegSize = 512;
- else if (Subtarget.hasAVX2())
- RegSize = 256;
- unsigned VectorSize = VT.getVectorNumElements() * 16;
// If the vector size is less than 128, or greater than the supported RegSize,
// do not use PMADD.
- if (VectorSize < 128 || VectorSize > RegSize)
+ if (VT.getVectorNumElements() < 8)
return SDValue();
SDLoc DL(N);
@@ -36800,7 +38594,13 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
// Madd vector size is half of the original vector size
- SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);
+ auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
+ return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
+ };
+ SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 },
+ PMADDWDBuilder);
// Fill the rest of the output with 0
SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
@@ -36824,12 +38624,12 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
return SDValue();
unsigned RegSize = 128;
- if (Subtarget.hasBWI())
+ if (Subtarget.useBWIRegs())
RegSize = 512;
- else if (Subtarget.hasAVX2())
+ else if (Subtarget.hasAVX())
RegSize = 256;
- // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
+ // We only handle v16i32 for SSE2 / v32i32 for AVX / v64i32 for AVX512.
// TODO: We should be able to handle larger vectors by splitting them before
// feeding them into several SADs, and then reducing over those.
if (VT.getSizeInBits() / 4 > RegSize)
@@ -36855,7 +38655,7 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
// reduction. Note that the number of elements of the result of SAD is less
// than the number of elements of its input. Therefore, we could only update
// part of elements in the reduction vector.
- SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);
+ SDValue Sad = createPSADBW(DAG, Op0, Op1, DL, Subtarget);
// The output of PSADBW is a vector of i64.
// We need to turn the vector of i64 into a vector of i32.
@@ -36905,6 +38705,236 @@ static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
}
+static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
+ const SDLoc &DL, EVT VT,
+ const X86Subtarget &Subtarget) {
+ // Example of pattern we try to detect:
+ // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
+ //(add (build_vector (extract_elt t, 0),
+ // (extract_elt t, 2),
+ // (extract_elt t, 4),
+ // (extract_elt t, 6)),
+ // (build_vector (extract_elt t, 1),
+ // (extract_elt t, 3),
+ // (extract_elt t, 5),
+ // (extract_elt t, 7)))
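+ // On success the adds collapse into a single VPMADDWD of the (truncated)
+ // mul operands, producing a vector with half as many i32 elements.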
+
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
+ if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
+ Op1.getOpcode() != ISD::BUILD_VECTOR)
+ return SDValue();
+
+ if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
+ VT.getVectorNumElements() < 4 ||
+ !isPowerOf2_32(VT.getVectorNumElements()))
+ return SDValue();
+
+ // Check if one of Op0,Op1 is of the form:
+ // (build_vector (extract_elt Mul, 0),
+ // (extract_elt Mul, 2),
+ // (extract_elt Mul, 4),
+ // ...
+ // the other is of the form:
+ // (build_vector (extract_elt Mul, 1),
+ // (extract_elt Mul, 3),
+ // (extract_elt Mul, 5),
+ // ...
+ // and identify Mul.
+ SDValue Mul;
+ for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
+ SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
+ Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
+ // TODO: Be more tolerant to undefs.
+ if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+ auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
+ auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
+ auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
+ auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
+ if (!Const0L || !Const1L || !Const0H || !Const1H)
+ return SDValue();
+ unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
+ Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
+ // Commutativity of mul allows factors of a product to reorder.
+ if (Idx0L > Idx1L)
+ std::swap(Idx0L, Idx1L);
+ if (Idx0H > Idx1H)
+ std::swap(Idx0H, Idx1H);
+ // Commutativity of add allows pairs of factors to reorder.
+ if (Idx0L > Idx0H) {
+ std::swap(Idx0L, Idx0H);
+ std::swap(Idx1L, Idx1H);
+ }
+ if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
+ Idx1H != 2 * i + 3)
+ return SDValue();
+ if (!Mul) {
+ // First time an extract_elt's source vector is visited. It must be a MUL
+ // with twice as many vector elements as the BUILD_VECTOR.
+ // Both extracts must be from the same MUL.
+ Mul = Op0L->getOperand(0);
+ if (Mul->getOpcode() != ISD::MUL ||
+ Mul.getValueType().getVectorNumElements() != 2 * e)
+ return SDValue();
+ }
+ // Check that the extract is from the same MUL previously seen.
+ if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
+ Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
+ return SDValue();
+ }
+
+ // Check if the Mul source can be safely shrunk.
+ ShrinkMode Mode;
+ if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) || Mode == MULU16)
+ return SDValue();
+
+ auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ // Shrink by adding truncate nodes and let DAGCombine fold with the
+ // sources.
+ EVT InVT = Ops[0].getValueType();
+ assert(InVT.getScalarType() == MVT::i32 &&
+ "Unexpected scalar element type");
+ assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
+ EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ InVT.getVectorNumElements() / 2);
+ EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
+ InVT.getVectorNumElements());
+ return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
+ DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[0]),
+ DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[1]));
+ };
+ return SplitOpsAndApply(DAG, Subtarget, DL, VT,
+ { Mul.getOperand(0), Mul.getOperand(1) },
+ PMADDBuilder);
+}
+
+// Attempt to turn this pattern into PMADDWD.
+// (mul (add (zext (build_vector)), (zext (build_vector))),
+// (add (zext (build_vector)), (zext (build_vector)))
+static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
+ const SDLoc &DL, EVT VT,
+ const X86Subtarget &Subtarget) {
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
+ if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
+ return SDValue();
+
+ if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
+ VT.getVectorNumElements() < 4 ||
+ !isPowerOf2_32(VT.getVectorNumElements()))
+ return SDValue();
+
+ SDValue N00 = N0.getOperand(0);
+ SDValue N01 = N0.getOperand(1);
+ SDValue N10 = N1.getOperand(0);
+ SDValue N11 = N1.getOperand(1);
+
+ // All inputs need to be sign extends.
+ // TODO: Support ZERO_EXTEND from known positive?
+ if (N00.getOpcode() != ISD::SIGN_EXTEND ||
+ N01.getOpcode() != ISD::SIGN_EXTEND ||
+ N10.getOpcode() != ISD::SIGN_EXTEND ||
+ N11.getOpcode() != ISD::SIGN_EXTEND)
+ return SDValue();
+
+ // Peek through the extends.
+ N00 = N00.getOperand(0);
+ N01 = N01.getOperand(0);
+ N10 = N10.getOperand(0);
+ N11 = N11.getOperand(0);
+
+ // Must be extending from vXi16.
+ EVT InVT = N00.getValueType();
+ if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
+ N10.getValueType() != InVT || N11.getValueType() != InVT)
+ return SDValue();
+
+ // All inputs should be build_vectors.
+ if (N00.getOpcode() != ISD::BUILD_VECTOR ||
+ N01.getOpcode() != ISD::BUILD_VECTOR ||
+ N10.getOpcode() != ISD::BUILD_VECTOR ||
+ N11.getOpcode() != ISD::BUILD_VECTOR)
+ return SDValue();
+
+ // For each element, we need to ensure we have an odd element from one vector
+ // multiplied by the odd element of another vector and the even element from
+ // one of the same vectors being multiplied by the even element from the
+ // other vector. So we need to make sure for each element i, this operator
+ // is being performed:
+ // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
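+ // This is exactly the per-lane computation that VPMADDWD performs on its
+ // vXi16 inputs.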
+ SDValue In0, In1;
+ for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
+ SDValue N00Elt = N00.getOperand(i);
+ SDValue N01Elt = N01.getOperand(i);
+ SDValue N10Elt = N10.getOperand(i);
+ SDValue N11Elt = N11.getOperand(i);
+ // TODO: Be more tolerant to undefs.
+ if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+ auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
+ auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
+ auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
+ auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
+ if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
+ return SDValue();
+ unsigned IdxN00 = ConstN00Elt->getZExtValue();
+ unsigned IdxN01 = ConstN01Elt->getZExtValue();
+ unsigned IdxN10 = ConstN10Elt->getZExtValue();
+ unsigned IdxN11 = ConstN11Elt->getZExtValue();
+ // Add is commutative so indices can be reordered.
+ if (IdxN00 > IdxN10) {
+ std::swap(IdxN00, IdxN10);
+ std::swap(IdxN01, IdxN11);
+ }
+ // N0 indices must be the even element. N1 indices must be the next odd element.
+ if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
+ IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
+ return SDValue();
+ SDValue N00In = N00Elt.getOperand(0);
+ SDValue N01In = N01Elt.getOperand(0);
+ SDValue N10In = N10Elt.getOperand(0);
+ SDValue N11In = N11Elt.getOperand(0);
+ // The first time we find an input, capture it.
+ if (!In0) {
+ In0 = N00In;
+ In1 = N01In;
+ }
+ // Mul is commutative so the input vectors can be in any order.
+ // Canonicalize to make the compares easier.
+ if (In0 != N00In)
+ std::swap(N00In, N01In);
+ if (In0 != N10In)
+ std::swap(N10In, N11In);
+ if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
+ return SDValue();
+ }
+
+ auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ // Shrink by adding truncate nodes and let DAGCombine fold with the
+ // sources.
+ EVT InVT = Ops[0].getValueType();
+ assert(InVT.getScalarType() == MVT::i16 &&
+ "Unexpected scalar element type");
+ assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
+ EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ InVT.getVectorNumElements() / 2);
+ return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
+ };
+ return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
+ PMADDBuilder);
+}
+
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
const SDNodeFlags Flags = N->getFlags();
@@ -36918,11 +38948,22 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
+ if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
+ return MAdd;
+ if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
+ return MAdd;
+
// Try to synthesize horizontal adds from adds of shuffles.
- if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
- (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
- isHorizontalBinOp(Op0, Op1, true))
- return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
+ if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
+ VT == MVT::v8i32) &&
+ Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, true)) {
+ auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops);
+ };
+ return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
+ HADDBuilder);
+ }
if (SDValue V = combineIncDecVector(N, DAG))
return V;
@@ -36936,20 +38977,19 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
SDValue Op1 = N->getOperand(1);
EVT VT = N->getValueType(0);
- // PSUBUS is supported, starting from SSE2, but special preprocessing
- // for v8i32 requires umin, which appears in SSE41.
+ // PSUBUS is supported, starting from SSE2, but truncation for v8i32
+ // is only worth it with SSSE3 (PSHUFB).
if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) &&
- !(Subtarget.hasSSE41() && (VT == MVT::v8i32)) &&
- !(Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)) &&
- !(Subtarget.hasAVX512() && Subtarget.hasBWI() &&
- (VT == MVT::v64i8 || VT == MVT::v32i16 || VT == MVT::v16i32 ||
- VT == MVT::v8i64)))
+ !(Subtarget.hasSSSE3() && (VT == MVT::v8i32 || VT == MVT::v8i64)) &&
+ !(Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)) &&
+ !(Subtarget.useBWIRegs() && (VT == MVT::v64i8 || VT == MVT::v32i16 ||
+ VT == MVT::v16i32 || VT == MVT::v8i64)))
return SDValue();
SDValue SubusLHS, SubusRHS;
// Try to find umax(a,b) - b or a - umin(a,b) patterns
// they may be converted to subus(a,b).
- // TODO: Need to add IR cannonicialization for this code.
+ // TODO: Need to add IR canonicalization for this code.
if (Op0.getOpcode() == ISD::UMAX) {
SubusRHS = Op1;
SDValue MaxLHS = Op0.getOperand(0);
@@ -36973,10 +39013,16 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
} else
return SDValue();
+ auto SUBUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(X86ISD::SUBUS, DL, Ops[0].getValueType(), Ops);
+ };
+
// PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
// special preprocessing in some cases.
if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64)
- return DAG.getNode(X86ISD::SUBUS, SDLoc(N), VT, SubusLHS, SubusRHS);
+ return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
+ { SubusLHS, SubusRHS }, SUBUSBuilder);
// Special preprocessing case can be only applied
// if the value was zero extended from 16 bit,
@@ -37006,8 +39052,9 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
SDValue NewSubusLHS =
DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
- SDValue Psubus = DAG.getNode(X86ISD::SUBUS, SDLoc(N), ShrinkedType,
- NewSubusLHS, NewSubusRHS);
+ SDValue Psubus =
+ SplitOpsAndApply(DAG, Subtarget, SDLoc(N), ShrinkedType,
+ { NewSubusLHS, NewSubusRHS }, SUBUSBuilder);
// Zero extend the result, it may be used somewhere as 32 bit,
// if not zext and following trunc will shrink.
return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
@@ -37038,10 +39085,16 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
// Try to synthesize horizontal subs from subs of shuffles.
EVT VT = N->getValueType(0);
- if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
- (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
- isHorizontalBinOp(Op0, Op1, false))
- return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
+ if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
+ VT == MVT::v8i32) &&
+ Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, false)) {
+ auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops);
+ };
+ return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
+ HSUBBuilder);
+ }
if (SDValue V = combineIncDecVector(N, DAG))
return V;
@@ -37145,28 +39198,6 @@ static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue combineTestM(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- SDValue Op0 = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
-
- MVT VT = N->getSimpleValueType(0);
- SDLoc DL(N);
-
- // TEST (AND a, b) ,(AND a, b) -> TEST a, b
- if (Op0 == Op1 && Op1->getOpcode() == ISD::AND)
- return DAG.getNode(X86ISD::TESTM, DL, VT, Op0->getOperand(0),
- Op0->getOperand(1));
-
- // TEST op0, BUILD_VECTOR(all_zero) -> BUILD_VECTOR(all_zero)
- // TEST BUILD_VECTOR(all_zero), op1 -> BUILD_VECTOR(all_zero)
- if (ISD::isBuildVectorAllZeros(Op0.getNode()) ||
- ISD::isBuildVectorAllZeros(Op1.getNode()))
- return getZeroVector(VT, Subtarget, DAG, DL);
-
- return SDValue();
-}
-
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = N->getSimpleValueType(0);
@@ -37190,9 +39221,7 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
MVT OpVT = N->getSimpleValueType(0);
- // Early out for mask vectors.
- if (OpVT.getVectorElementType() == MVT::i1)
- return SDValue();
+ bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
SDLoc dl(N);
SDValue Vec = N->getOperand(0);
@@ -37204,23 +39233,40 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
// Inserting zeros into zeros is a nop.
if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
- return Vec;
+ return getZeroVector(OpVT, Subtarget, DAG, dl);
// If we're inserting into a zero vector and then into a larger zero vector,
// just insert into the larger zero vector directly.
if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
unsigned Idx2Val = SubVec.getConstantOperandVal(2);
- return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
+ getZeroVector(OpVT, Subtarget, DAG, dl),
SubVec.getOperand(1),
DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
}
+ // If we're inserting into a zero vector and our input was extracted from an
+ // insert into a zero vector of the same type, and the extraction was at
+ // least as large as the original insertion, just insert the original
+ // subvector into a zero vector.
+ if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
+ SubVec.getConstantOperandVal(1) == 0 &&
+ SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
+ SDValue Ins = SubVec.getOperand(0);
+ if (Ins.getConstantOperandVal(2) == 0 &&
+ ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
+ Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits())
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
+ getZeroVector(OpVT, Subtarget, DAG, dl),
+ Ins.getOperand(1), N->getOperand(2));
+ }
+
// If we're inserting a bitcast into zeros, rewrite the insert and move the
// bitcast to the other side. This helps with detecting zero extending
// during isel.
// TODO: Is this useful for other indices than 0?
- if (SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) {
+ if (!IsI1Vector && SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) {
MVT CastVT = SubVec.getOperand(0).getSimpleValueType();
unsigned NumElems = OpVT.getSizeInBits() / CastVT.getScalarSizeInBits();
MVT NewVT = MVT::getVectorVT(CastVT.getVectorElementType(), NumElems);
@@ -37231,6 +39277,10 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
}
}
+ // Stop here if this is an i1 vector.
+ if (IsI1Vector)
+ return SDValue();
+
// If this is an insert of an extract, combine to a shuffle. Don't do this
// if the insert or extract can be represented with a subregister operation.
if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
@@ -37317,7 +39367,6 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
if (!Vec.getOperand(0).isUndef() && Vec.hasOneUse()) {
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
SubVec2, Vec.getOperand(2));
- DCI.AddToWorklist(Vec.getNode());
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec,
N->getOperand(2));
@@ -37352,6 +39401,75 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
OpVT, SDLoc(N),
InVec.getNode()->ops().slice(IdxVal, OpVT.getVectorNumElements()));
+ // If we're extracting the lowest subvector and we're the only user,
+ // we may be able to perform this with a smaller vector width.
+ if (IdxVal == 0 && InVec.hasOneUse()) {
+ unsigned InOpcode = InVec.getOpcode();
+ if (OpVT == MVT::v2f64 && InVec.getValueType() == MVT::v4f64) {
+ // v2f64 CVTDQ2PD(v4i32).
+ if (InOpcode == ISD::SINT_TO_FP &&
+ InVec.getOperand(0).getValueType() == MVT::v4i32) {
+ return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), OpVT, InVec.getOperand(0));
+ }
+ // v2f64 CVTPS2PD(v4f32).
+ if (InOpcode == ISD::FP_EXTEND &&
+ InVec.getOperand(0).getValueType() == MVT::v4f32) {
+ return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), OpVT, InVec.getOperand(0));
+ }
+ }
+ if ((InOpcode == X86ISD::VZEXT || InOpcode == X86ISD::VSEXT) &&
+ OpVT.is128BitVector() &&
+ InVec.getOperand(0).getSimpleValueType().is128BitVector()) {
+ unsigned ExtOp = InOpcode == X86ISD::VZEXT ? ISD::ZERO_EXTEND_VECTOR_INREG
+ : ISD::SIGN_EXTEND_VECTOR_INREG;
+ return DAG.getNode(ExtOp, SDLoc(N), OpVT, InVec.getOperand(0));
+ }
+ }
+
+ return SDValue();
+}
+
+static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(0);
+
+ // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
+ // This occurs frequently in our masked scalar intrinsic code and our
+ // floating point select lowering with AVX512.
+ // TODO: SimplifyDemandedBits instead?
+ if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
+ if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
+ if (C->getAPIntValue().isOneValue())
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), MVT::v1i1,
+ Src.getOperand(0));
+
+ return SDValue();
+}
+
+// Simplify PMULDQ and PMULUDQ operations.
+static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+ APInt DemandedMask(APInt::getLowBitsSet(64, 32));
+
+ // PMULDQ/PMULUDQ only uses the lower 32 bits from each vector element.
+ KnownBits LHSKnown;
+ if (TLI.SimplifyDemandedBits(LHS, DemandedMask, LHSKnown, TLO)) {
+ DCI.CommitTargetLoweringOpt(TLO);
+ return SDValue(N, 0);
+ }
+
+ KnownBits RHSKnown;
+ if (TLI.SimplifyDemandedBits(RHS, DemandedMask, RHSKnown, TLO)) {
+ DCI.CommitTargetLoweringOpt(TLO);
+ return SDValue(N, 0);
+ }
+
return SDValue();
}
@@ -37360,6 +39478,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
SelectionDAG &DAG = DCI.DAG;
switch (N->getOpcode()) {
default: break;
+ case ISD::SCALAR_TO_VECTOR:
+ return combineScalarToVector(N, DAG);
case ISD::EXTRACT_VECTOR_ELT:
case X86ISD::PEXTRW:
case X86ISD::PEXTRB:
@@ -37384,6 +39504,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
+ case X86ISD::BEXTR: return combineBEXTR(N, DAG, DCI, Subtarget);
case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
case ISD::STORE: return combineStore(N, DAG, Subtarget);
@@ -37449,20 +39570,21 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::VPERMI:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
- case X86ISD::VPERMIV3:
case X86ISD::VPERMIL2:
case X86ISD::VPERMILPI:
case X86ISD::VPERMILPV:
case X86ISD::VPERM2X128:
+ case X86ISD::SHUF128:
case X86ISD::VZEXT_MOVL:
case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
case X86ISD::FMADD_RND:
- case X86ISD::FMADDS1_RND:
- case X86ISD::FMADDS3_RND:
- case X86ISD::FMADDS1:
- case X86ISD::FMADDS3:
- case X86ISD::FMADD4S:
- case ISD::FMA: return combineFMA(N, DAG, Subtarget);
+ case X86ISD::FMSUB:
+ case X86ISD::FMSUB_RND:
+ case X86ISD::FNMADD:
+ case X86ISD::FNMADD_RND:
+ case X86ISD::FNMSUB:
+ case X86ISD::FNMSUB_RND:
+ case ISD::FMA: return combineFMA(N, DAG, Subtarget);
case X86ISD::FMADDSUB_RND:
case X86ISD::FMSUBADD_RND:
case X86ISD::FMADDSUB:
@@ -37472,9 +39594,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::MSCATTER:
case ISD::MGATHER:
case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget);
- case X86ISD::TESTM: return combineTestM(N, DAG, Subtarget);
case X86ISD::PCMPEQ:
case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
+ case X86ISD::PMULDQ:
+ case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI);
}
return SDValue();
@@ -37487,6 +39610,11 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
if (!isTypeLegal(VT))
return false;
+
+ // There are no vXi8 shifts.
+ if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
+ return false;
+
if (VT != MVT::i16)
return true;
@@ -37509,23 +39637,20 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
}
}
-/// This function checks if any of the users of EFLAGS copies the EFLAGS. We
-/// know that the code that lowers COPY of EFLAGS has to use the stack, and if
-/// we don't adjust the stack we clobber the first frame index.
-/// See X86InstrInfo::copyPhysReg.
-static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) {
- const MachineRegisterInfo &MRI = MF.getRegInfo();
- return any_of(MRI.reg_instructions(X86::EFLAGS),
- [](const MachineInstr &RI) { return RI.isCopy(); });
-}
-
-void X86TargetLowering::finalizeLowering(MachineFunction &MF) const {
- if (hasCopyImplyingStackAdjustment(MF)) {
- MachineFrameInfo &MFI = MF.getFrameInfo();
- MFI.setHasCopyImplyingStackAdjustment(true);
+SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
+ SDValue Value, SDValue Addr,
+ SelectionDAG &DAG) const {
+ const Module *M = DAG.getMachineFunction().getMMI().getModule();
+ Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
+ if (IsCFProtectionSupported) {
+ // In case control-flow branch protection is enabled, we need to add a
+ // notrack prefix to the indirect branch.
+ // In order to do that we create an NT_BRIND SDNode.
+ // Upon ISEL, the pattern will convert it to a jmp with the NoTrack prefix.
+ return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
}
- TargetLoweringBase::finalizeLowering(MF);
+ return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
}
/// This method query the target whether it is beneficial for dag combiner to
@@ -37536,22 +39661,30 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
if (VT != MVT::i16)
return false;
- bool Promote = false;
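+ // Returns true if Op's single use is a plain store back to the same base
+ // address the load came from, i.e. a read-modify-write that isel can fold.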
+ auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
+ if (!Op.hasOneUse())
+ return false;
+ SDNode *User = *Op->use_begin();
+ if (!ISD::isNormalStore(User))
+ return false;
+ auto *Ld = cast<LoadSDNode>(Load);
+ auto *St = cast<StoreSDNode>(User);
+ return Ld->getBasePtr() == St->getBasePtr();
+ };
+
bool Commute = false;
switch (Op.getOpcode()) {
- default: break;
+ default: return false;
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
- Promote = true;
break;
case ISD::SHL:
case ISD::SRL: {
SDValue N0 = Op.getOperand(0);
// Look out for (store (shl (load), x)).
- if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
+ if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
return false;
- Promote = true;
break;
}
case ISD::ADD:
@@ -37564,19 +39697,20 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
case ISD::SUB: {
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
- if (!Commute && MayFoldLoad(N1))
- return false;
// Avoid disabling potential load folding opportunities.
- if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
+ if (MayFoldLoad(N1) &&
+ (!Commute || !isa<ConstantSDNode>(N0) ||
+ (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
return false;
- if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
+ if (MayFoldLoad(N0) &&
+ ((Commute && !isa<ConstantSDNode>(N1)) ||
+ (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
return false;
- Promote = true;
}
}
PVT = MVT::i32;
- return Promote;
+ return true;
}
bool X86TargetLowering::
@@ -37862,7 +39996,7 @@ TargetLowering::ConstraintWeight
LLVM_FALLTHROUGH;
case 'x':
if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
- ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
+ ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
weight = CW_Register;
break;
case 'k':
@@ -38353,6 +40487,25 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return Res;
}
+ // Make sure it isn't a register that requires 64-bit mode.
+ if (!Subtarget.is64Bit() &&
+ (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
+ TRI->getEncodingValue(Res.first) >= 8) {
+ // Register requires REX prefix, but we're in 32-bit mode.
+ Res.first = 0;
+ Res.second = nullptr;
+ return Res;
+ }
+
+ // Make sure it isn't a register that requires AVX512.
+ if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
+ TRI->getEncodingValue(Res.first) & 0x10) {
+ // Register requires EVEX prefix.
+ Res.first = 0;
+ Res.second = nullptr;
+ return Res;
+ }
+
// Otherwise, check to see if this is a register class of the wrong value
// type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
// turn into {ax},{dx}.
@@ -38421,7 +40574,7 @@ int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
// will take 2 allocations in the out of order engine instead of 1
// for plain addressing mode, i.e. inst (reg1).
// E.g.,
- // vaddps (%rsi,%drx), %ymm0, %ymm1
+ // vaddps (%rsi,%rdx), %ymm0, %ymm1
// Requires two allocations (one for the load, one for the computation)
// whereas:
// vaddps (%rsi), %ymm0, %ymm1
@@ -38516,7 +40669,8 @@ StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const
// Generally, if we aren't on Windows, the platform ABI does not include
// support for stack probes, so don't emit them.
- if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO())
+ if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
+ MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
return "";
// We need a stack probe to conform to the Windows ABI. Choose the right
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 7708f577ba70..32215b170a8c 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -75,6 +75,9 @@ namespace llvm {
///
CALL,
+ /// Same as call except it adds the NoTrack prefix.
+ NT_CALL,
+
/// This operation implements the lowering for readcyclecounter.
RDTSC_DAG,
@@ -122,6 +125,10 @@ namespace llvm {
/// or TEST instruction.
BRCOND,
+ /// BRIND node with NoTrack prefix. Operand 0 is the chain operand and
+ /// operand 1 is the target address.
+ NT_BRIND,
+
/// Return with a flag operand. Operand 0 is the chain operand, operand
/// 1 is the number of bytes of stack to pop.
RET_FLAG,
@@ -304,9 +311,6 @@ namespace llvm {
// Vector FP round.
VFPROUND, VFPROUND_RND, VFPROUNDS_RND,
- // Convert a vector to mask, set bits base on MSB.
- CVT2MASK,
-
// 128-bit vector logical left / right shift
VSHLDQ, VSRLDQ,
@@ -332,8 +336,6 @@ namespace llvm {
// Vector integer comparisons.
PCMPEQ, PCMPGT,
- // Vector integer comparisons, the result is in a mask vector.
- PCMPEQM, PCMPGTM,
// v8i16 Horizontal minimum and position.
PHMINPOS,
@@ -343,7 +345,6 @@ namespace llvm {
/// Vector comparison generating mask bits for fp and
/// integer signed and unsigned data types.
CMPM,
- CMPMU,
// Vector comparison with rounding mode for FP values
CMPM_RND,
@@ -351,6 +352,9 @@ namespace llvm {
ADD, SUB, ADC, SBB, SMUL,
INC, DEC, OR, XOR, AND,
+ // Bit field extract.
+ BEXTR,
+
// LOW, HI, FLAGS = umul LHS, RHS.
UMUL,
@@ -373,14 +377,13 @@ namespace llvm {
// Vector packed fp sign bitwise comparisons.
TESTP,
- // Vector "test" in AVX-512, the result is in a mask vector.
- TESTM,
- TESTNM,
-
// OR/AND test for masks.
KORTEST,
KTEST,
+ // ADD for masks.
+ KADD,
+
// Several flavors of instructions with vector shuffle behaviors.
// Saturated signed/unnsigned packing.
PACKSS,
@@ -405,8 +408,6 @@ namespace llvm {
MOVSLDUP,
MOVLHPS,
MOVHLPS,
- MOVLPS,
- MOVLPD,
MOVSD,
MOVSS,
UNPCKL,
@@ -424,10 +425,6 @@ namespace llvm {
// Res = VPERMV3 V0, MaskV, V1
VPERMV3,
- // 3-op Variable Permute overwriting the index (VPERMI2).
- // Res = VPERMIV3 V0, MaskV, V1
- VPERMIV3,
-
// Bitwise ternary logic.
VPTERNLOG,
// Fix Up Special Packed Float32/64 values.
@@ -502,22 +499,6 @@ namespace llvm {
FMADDSUB_RND,
FMSUBADD_RND,
- // FMA4 specific scalar intrinsics bits that zero the non-scalar bits.
- FMADD4S, FNMADD4S, FMSUB4S, FNMSUB4S,
-
- // Scalar intrinsic FMA.
- FMADDS1, FMADDS3,
- FNMADDS1, FNMADDS3,
- FMSUBS1, FMSUBS3,
- FNMSUBS1, FNMSUBS3,
-
- // Scalar intrinsic FMA with rounding mode.
- // Two versions, passthru bits on op1 or op3.
- FMADDS1_RND, FMADDS3_RND,
- FNMADDS1_RND, FNMADDS3_RND,
- FMSUBS1_RND, FMSUBS3_RND,
- FNMSUBS1_RND, FNMSUBS3_RND,
-
// Compress and expand.
COMPRESS,
EXPAND,
@@ -572,8 +553,13 @@ namespace llvm {
RDSEED,
// SSE42 string comparisons.
- PCMPISTRI,
- PCMPESTRI,
+ // These nodes produce 3 results: index, mask, and flags. X86ISelDAGToDAG
+ // will emit one or two instructions based on which results are used. If
+ // both the flags and the index/mask are needed, this allows us to use a
+ // single instruction since we won't have to pick an opcode for the flags.
+ // Instead we can rely on the DAG to CSE everything and decide at isel.
+ PCMPISTR,
+ PCMPESTR,
// Test if in transactional execution.
XTEST,
@@ -590,6 +576,9 @@ namespace llvm {
// LWP insert record.
LWPINS,
+ // User-level wait.
+ UMWAIT, TPAUSE,
+
// Compare and swap.
LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
LCMPXCHG8_DAG,
@@ -822,6 +811,28 @@ namespace llvm {
bool hasAndNotCompare(SDValue Y) const override;
+ bool hasAndNot(SDValue Y) const override;
+
+ bool preferShiftsToClearExtremeBits(SDValue Y) const override;
+
+ bool
+ shouldTransformSignedTruncationCheck(EVT XVT,
+ unsigned KeptBits) const override {
+ // For vectors, we don't have a preference.
+ if (XVT.isVector())
+ return false;
+
+ auto VTIsOk = [](EVT VT) -> bool {
+ return VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 ||
+ VT == MVT::i64;
+ };
+
+ // We are OK with KeptBitsVT being byte/word/dword, i.e. the widths MOVSX supports.
+ // XVT will be larger than KeptBitsVT.
+ MVT KeptBitsVT = MVT::getIntegerVT(KeptBits);
+ return VTIsOk(XVT) && VTIsOk(KeptBitsVT);
+ }
+
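The hook above gates a combine that rewrites a signed-truncation equality check into an add plus unsigned compare; below is a minimal standalone sketch of that equivalence for KeptBits == 8, with illustrative function names.

#include <cassert>
#include <cstdint>

// Form the combine starts from: x survives a trunc-then-sign-extend round trip.
static bool fitsInI8_sext(int32_t x) { return (int32_t)(int8_t)x == x; }

// Roughly the form it is rewritten to: (x + 2^(KeptBits-1)) u< 2^KeptBits.
static bool fitsInI8_addcmp(int32_t x) { return (uint32_t)x + 128u < 256u; }

int main() {
  for (int64_t x = INT32_MIN; x <= INT32_MAX; x += 65537)
    assert(fitsInI8_sext((int32_t)x) == fitsInI8_addcmp((int32_t)x));
  return 0;
}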
bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
return VT.isScalarInteger();
}
@@ -829,10 +840,18 @@ namespace llvm {
/// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
MVT hasFastEqualityCompare(unsigned NumBits) const override;
+ /// Allow multiple load pairs per block for smaller and faster code.
+ unsigned getMemcmpEqZeroLoadsPerBlock() const override {
+ return 2;
+ }
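A rough sketch of what allowing two load pairs per block means for a memcmp-equals-zero expansion; the function below hand-writes the shape such an expansion aims for when comparing 16 bytes (the name and fixed size are assumptions for the example, not generated code).

#include <cstdint>
#include <cstring>

// Both 8-byte pairs are loaded in one block, XORed, OR-combined and tested
// once, instead of branching after the first pair.
static bool bytes16Equal(const void *a, const void *b) {
  uint64_t a0, b0, a1, b1;
  std::memcpy(&a0, a, 8);
  std::memcpy(&b0, b, 8);
  std::memcpy(&a1, static_cast<const char *>(a) + 8, 8);
  std::memcpy(&b1, static_cast<const char *>(b) + 8, 8);
  return ((a0 ^ b0) | (a1 ^ b1)) == 0;
}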
+
/// Return the value type to use for ISD::SETCC.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
+ bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
+ TargetLoweringOpt &TLO) const override;
+
/// Determine which of the bits specified in Mask are known to be either
/// zero or one and return them in the KnownZero/KnownOne bitsets.
void computeKnownBitsForTargetNode(const SDValue Op,
@@ -913,7 +932,7 @@ namespace llvm {
/// the immediate into a register.
bool isLegalAddImmediate(int64_t Imm) const override;
- /// \brief Return the cost of the scaling factor used in the addressing
+ /// Return the cost of the scaling factor used in the addressing
/// mode represented by AM for this target, for a load/store
/// of the specified type.
/// If the AM is supported, the return value must be >= 0.
@@ -976,11 +995,13 @@ namespace llvm {
/// be legal.
bool isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
- /// Similar to isShuffleMaskLegal. This is used by Targets can use this to
- /// indicate if there is a suitable VECTOR_SHUFFLE that can be used to
- /// replace a VAND with a constant pool entry.
- bool isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
- EVT VT) const override;
+ /// Similar to isShuffleMaskLegal. Targets can use this to indicate if there
+ /// is a suitable VECTOR_SHUFFLE that can be used to replace a VAND with a
+ /// constant pool entry.
+ bool isVectorClearMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
+
+ /// Returns true if lowering to a jump table is allowed.
+ bool areJTsAllowed(const Function *Fn) const override;
/// If true, then instruction selection should
/// seek to shrink the FP constant of the specified type to a smaller type
@@ -1004,7 +1025,7 @@ namespace llvm {
(VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1
}
- /// \brief Returns true if it is beneficial to convert a load of a constant
+ /// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
@@ -1023,6 +1044,8 @@ namespace llvm {
return NumElem > 2;
}
+ bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT) const override;
+
/// Intel processors have a unified instruction and data cache
const char * getClearCacheBuiltinName() const override {
return nullptr; // nothing to do, move along.
@@ -1071,31 +1094,40 @@ namespace llvm {
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
- /// \brief Customize the preferred legalization strategy for certain types.
+ /// Customize the preferred legalization strategy for certain types.
LegalizeTypeAction getPreferredVectorAction(EVT VT) const override;
+ MVT getRegisterTypeForCallingConv(LLVMContext &Context,
+ EVT VT) const override;
+
+ unsigned getNumRegistersForCallingConv(LLVMContext &Context,
+ EVT VT) const override;
+
bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
bool supportSwiftError() const override;
StringRef getStackProbeSymbolName(MachineFunction &MF) const override;
+ bool hasVectorBlend() const override { return true; }
+
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
- /// \brief Lower interleaved load(s) into target specific
+ /// Lower interleaved load(s) into target specific
/// instructions/intrinsics.
bool lowerInterleavedLoad(LoadInst *LI,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
- /// \brief Lower interleaved store(s) into target specific
+ /// Lower interleaved store(s) into target specific
/// instructions/intrinsics.
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
-
- void finalizeLowering(MachineFunction &MF) const override;
+ SDValue expandIndirectJTBranch(const SDLoc& dl, SDValue Value,
+ SDValue Addr, SelectionDAG &DAG)
+ const override;
protected:
std::pair<const TargetRegisterClass *, uint8_t>
@@ -1171,7 +1203,8 @@ namespace llvm {
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
- unsigned getGlobalWrapperKind(const GlobalValue *GV = nullptr) const;
+ unsigned getGlobalWrapperKind(const GlobalValue *GV = nullptr,
+ const unsigned char OpFlags = 0) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(const GlobalValue *GV, const SDLoc &dl,
@@ -1292,12 +1325,21 @@ namespace llvm {
MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI,
MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitLoweredRetpoline(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+
MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const;
+ void emitSetJmpShadowStackFix(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+
MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const;
+ MachineBasicBlock *emitLongJmpShadowStackFix(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+
MachineBasicBlock *emitFMA3Instr(MachineInstr &MI,
MachineBasicBlock *MBB) const;
@@ -1438,6 +1480,7 @@ namespace llvm {
const SDValue &getIndex() const { return getOperand(4); }
const SDValue &getMask() const { return getOperand(2); }
const SDValue &getValue() const { return getOperand(1); }
+ const SDValue &getScale() const { return getOperand(5); }
static bool classof(const SDNode *N) {
return N->getOpcode() == X86ISD::MGATHER ||
diff --git a/lib/Target/X86/X86IndirectBranchTracking.cpp b/lib/Target/X86/X86IndirectBranchTracking.cpp
new file mode 100644
index 000000000000..7c00c9260d15
--- /dev/null
+++ b/lib/Target/X86/X86IndirectBranchTracking.cpp
@@ -0,0 +1,121 @@
+//===---- X86IndirectBranchTracking.cpp - Enables CET IBT mechanism -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pass that enables Indirect Branch Tracking (IBT) as part
+// of Control-Flow Enforcement Technology (CET).
+// The pass adds ENDBR (End Branch) machine instructions at the beginning of
+// each basic block or function that is referenced by an indirect jump/call
+// instruction.
+// The ENDBR instructions have a NOP encoding and as such are ignored on
+// targets that do not support the CET IBT mechanism.
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-indirect-branch-tracking"
+
+static cl::opt<bool> IndirectBranchTracking(
+ "x86-indirect-branch-tracking", cl::init(false), cl::Hidden,
+ cl::desc("Enable X86 indirect branch tracking pass."));
+
+STATISTIC(NumEndBranchAdded, "Number of ENDBR instructions added");
+
+namespace {
+class X86IndirectBranchTrackingPass : public MachineFunctionPass {
+public:
+ X86IndirectBranchTrackingPass() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override {
+ return "X86 Indirect Branch Tracking";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ static char ID;
+
+ /// Machine instruction info used throughout the class.
+ const X86InstrInfo *TII;
+
+ /// Endbr opcode for the current machine function.
+ unsigned int EndbrOpcode;
+
+ /// Adds a new ENDBR instruction to the beginning of the MBB.
+ /// The function will not add it if one already exists.
+ /// It will add the ENDBR32 or ENDBR64 opcode, depending on the target.
+ /// \returns true if the ENDBR was added and false otherwise.
+ bool addENDBR(MachineBasicBlock &MBB) const;
+};
+
+} // end anonymous namespace
+
+char X86IndirectBranchTrackingPass::ID = 0;
+
+FunctionPass *llvm::createX86IndirectBranchTrackingPass() {
+ return new X86IndirectBranchTrackingPass();
+}
+
+bool X86IndirectBranchTrackingPass::addENDBR(MachineBasicBlock &MBB) const {
+ assert(TII && "Target instruction info was not initialized");
+ assert((X86::ENDBR64 == EndbrOpcode || X86::ENDBR32 == EndbrOpcode) &&
+ "Unexpected Endbr opcode");
+
+ auto MI = MBB.begin();
+ // If the MBB is empty or the first instruction is not ENDBR,
+ // add the ENDBR instruction to the beginning of the MBB.
+ if (MI == MBB.end() || EndbrOpcode != MI->getOpcode()) {
+ BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(EndbrOpcode));
+ NumEndBranchAdded++;
+ return true;
+ }
+
+ return false;
+}
+
+bool X86IndirectBranchTrackingPass::runOnMachineFunction(MachineFunction &MF) {
+ const X86Subtarget &SubTarget = MF.getSubtarget<X86Subtarget>();
+
+ // Check whether the cf-protection-branch module flag is enabled.
+ Metadata *isCFProtectionSupported =
+ MF.getMMI().getModule()->getModuleFlag("cf-protection-branch");
+ if (!isCFProtectionSupported && !IndirectBranchTracking)
+ return false;
+
+ // True if the current MF was changed and false otherwise.
+ bool Changed = false;
+
+ TII = SubTarget.getInstrInfo();
+ EndbrOpcode = SubTarget.is64Bit() ? X86::ENDBR64 : X86::ENDBR32;
+
+ // A non-internal function, or a function whose address was taken, can be
+ // reached through indirect calls. Mark its first BB with an ENDBR instruction
+ // unless the nocf_check attribute is used.
+ if ((MF.getFunction().hasAddressTaken() ||
+ !MF.getFunction().hasLocalLinkage()) &&
+ !MF.getFunction().doesNoCfCheck()) {
+ auto MBB = MF.begin();
+ Changed |= addENDBR(*MBB);
+ }
+
+ for (auto &MBB : MF)
+ // Find all basic blocks whose address was taken (for example, targets of
+ // an indirect jump) and add an ENDBR instruction.
+ if (MBB.hasAddressTaken())
+ Changed |= addENDBR(MBB);
+
+ return Changed;
+}
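A small sketch of the two ways the pass above can be triggered: the hidden -x86-indirect-branch-tracking option it defines, or the "cf-protection-branch" module flag it queries. The helper below sets that flag from C++; the Override behavior and value 1 are assumptions typical of a CET-enabling frontend, not taken from this patch.

#include "llvm/IR/Module.h"

// Illustrative: make X86IndirectBranchTrackingPass insert ENDBR instructions
// for every machine function compiled from this module.
static void enableBranchProtection(llvm::Module &M) {
  if (!M.getModuleFlag("cf-protection-branch"))
    M.addModuleFlag(llvm::Module::Override, "cf-protection-branch", 1);
}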
diff --git a/lib/Target/X86/X86Instr3DNow.td b/lib/Target/X86/X86Instr3DNow.td
index 0d30b7d47f3e..46dc6bf7661a 100644
--- a/lib/Target/X86/X86Instr3DNow.td
+++ b/lib/Target/X86/X86Instr3DNow.td
@@ -12,109 +12,71 @@
//
//===----------------------------------------------------------------------===//
-let Sched = WriteFAdd in {
-def I3DNOW_FALU_ITINS : OpndItins<
- IIC_3DNOW_FALU_RR, IIC_3DNOW_FALU_RM
->;
+class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pat>
+ : I<o, F, outs, ins, asm, pat>, Requires<[Has3DNow]> {
}
-let Sched = WriteCvtF2I in {
-def I3DNOW_FCVT_F2I_ITINS : OpndItins<
- IIC_3DNOW_FCVT_F2I_RR, IIC_3DNOW_FCVT_F2I_RM
->;
-}
-
-let Sched = WriteCvtI2F in {
-def I3DNOW_FCVT_I2F_ITINS : OpndItins<
- IIC_3DNOW_FCVT_I2F_RR, IIC_3DNOW_FCVT_I2F_RM
->;
-}
-
-let Sched = WriteVecIMul in {
-def I3DNOW_MISC_FUNC_ITINS : OpndItins<
- IIC_3DNOW_MISC_FUNC_REG, IIC_3DNOW_MISC_FUNC_MEM
->;
-}
-
-let Sched = WriteShuffle in {
-def I3DNOW_PSHUF_ITINS : OpndItins<
- IIC_MMX_PSHUF, IIC_MMX_PSHUF
->;
-}
-
-class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pat,
- InstrItinClass itin>
- : I<o, F, outs, ins, asm, pat, itin>, TB, Requires<[Has3DNow]> {
-}
-
-class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat,
- InstrItinClass itin>
+class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat>
: I3DNow<o, F, (outs VR64:$dst), ins,
- !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), pat, itin>,
- Has3DNow0F0FOpcode {
- // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet.
- let isAsmParserOnly = 1;
+ !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), pat>, ThreeDNow {
let Constraints = "$src1 = $dst";
}
-class I3DNow_conv<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat,
- InstrItinClass itin>
+class I3DNow_conv<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat>
: I3DNow<o, F, (outs VR64:$dst), ins,
- !strconcat(Mnemonic, "\t{$src, $dst|$dst, $src}"), pat, itin>,
- Has3DNow0F0FOpcode {
- // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet.
- let isAsmParserOnly = 1;
-}
+ !strconcat(Mnemonic, "\t{$src, $dst|$dst, $src}"), pat>, ThreeDNow;
-multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn, OpndItins itins,
- bit Commutable = 0, string Ver = ""> {
+multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn,
+ X86FoldableSchedWrite sched, bit Commutable = 0,
+ string Ver = ""> {
let isCommutable = Commutable in
def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn,
[(set VR64:$dst, (!cast<Intrinsic>(
- !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))],
- itins.rr>, Sched<[itins.Sched]>;
+ !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))]>,
+ Sched<[sched]>;
def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn,
[(set VR64:$dst, (!cast<Intrinsic>(
!strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1,
- (bitconvert (load_mmx addr:$src2))))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (load_mmx addr:$src2))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
-multiclass I3DNow_conv_rm_int<bits<8> opc, string Mn, OpndItins itins,
- string Ver = ""> {
+multiclass I3DNow_conv_rm_int<bits<8> opc, string Mn,
+ X86FoldableSchedWrite sched, string Ver = ""> {
def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src), Mn,
[(set VR64:$dst, (!cast<Intrinsic>(
- !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src))], itins.rr>,
- Sched<[itins.Sched]>;
+ !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src))]>,
+ Sched<[sched]>;
def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src), Mn,
[(set VR64:$dst, (!cast<Intrinsic>(
!strconcat("int_x86_3dnow", Ver, "_", Mn))
- (bitconvert (load_mmx addr:$src))))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (load_mmx addr:$src))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
-defm PAVGUSB : I3DNow_binop_rm_int<0xBF, "pavgusb", I3DNOW_MISC_FUNC_ITINS, 1>;
-defm PF2ID : I3DNow_conv_rm_int<0x1D, "pf2id", I3DNOW_FCVT_F2I_ITINS>;
-defm PFACC : I3DNow_binop_rm_int<0xAE, "pfacc", I3DNOW_FALU_ITINS>;
-defm PFADD : I3DNow_binop_rm_int<0x9E, "pfadd", I3DNOW_FALU_ITINS, 1>;
-defm PFCMPEQ : I3DNow_binop_rm_int<0xB0, "pfcmpeq", I3DNOW_FALU_ITINS, 1>;
-defm PFCMPGE : I3DNow_binop_rm_int<0x90, "pfcmpge", I3DNOW_FALU_ITINS>;
-defm PFCMPGT : I3DNow_binop_rm_int<0xA0, "pfcmpgt", I3DNOW_FALU_ITINS>;
-defm PFMAX : I3DNow_binop_rm_int<0xA4, "pfmax", I3DNOW_FALU_ITINS>;
-defm PFMIN : I3DNow_binop_rm_int<0x94, "pfmin", I3DNOW_FALU_ITINS>;
-defm PFMUL : I3DNow_binop_rm_int<0xB4, "pfmul", I3DNOW_FALU_ITINS, 1>;
-defm PFRCP : I3DNow_conv_rm_int<0x96, "pfrcp", I3DNOW_FALU_ITINS>;
-defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1", I3DNOW_FALU_ITINS>;
-defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2", I3DNOW_FALU_ITINS>;
-defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1", I3DNOW_FALU_ITINS>;
-defm PFRSQRT : I3DNow_conv_rm_int<0x97, "pfrsqrt", I3DNOW_FALU_ITINS>;
-defm PFSUB : I3DNow_binop_rm_int<0x9A, "pfsub", I3DNOW_FALU_ITINS, 1>;
-defm PFSUBR : I3DNow_binop_rm_int<0xAA, "pfsubr", I3DNOW_FALU_ITINS, 1>;
-defm PI2FD : I3DNow_conv_rm_int<0x0D, "pi2fd", I3DNOW_FCVT_I2F_ITINS>;
-defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw", I3DNOW_MISC_FUNC_ITINS, 1>;
-
+defm PAVGUSB : I3DNow_binop_rm_int<0xBF, "pavgusb", SchedWriteVecALU.MMX, 1>;
+defm PF2ID : I3DNow_conv_rm_int<0x1D, "pf2id", WriteCvtPS2I>;
+defm PFACC : I3DNow_binop_rm_int<0xAE, "pfacc", WriteFAdd>;
+defm PFADD : I3DNow_binop_rm_int<0x9E, "pfadd", WriteFAdd, 1>;
+defm PFCMPEQ : I3DNow_binop_rm_int<0xB0, "pfcmpeq", WriteFAdd, 1>;
+defm PFCMPGE : I3DNow_binop_rm_int<0x90, "pfcmpge", WriteFAdd>;
+defm PFCMPGT : I3DNow_binop_rm_int<0xA0, "pfcmpgt", WriteFAdd>;
+defm PFMAX : I3DNow_binop_rm_int<0xA4, "pfmax", WriteFAdd>;
+defm PFMIN : I3DNow_binop_rm_int<0x94, "pfmin", WriteFAdd>;
+defm PFMUL : I3DNow_binop_rm_int<0xB4, "pfmul", WriteFAdd, 1>;
+defm PFRCP : I3DNow_conv_rm_int<0x96, "pfrcp", WriteFAdd>;
+defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1", WriteFAdd>;
+defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2", WriteFAdd>;
+defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1", WriteFAdd>;
+defm PFRSQRT : I3DNow_conv_rm_int<0x97, "pfrsqrt", WriteFAdd>;
+defm PFSUB : I3DNow_binop_rm_int<0x9A, "pfsub", WriteFAdd, 1>;
+defm PFSUBR : I3DNow_binop_rm_int<0xAA, "pfsubr", WriteFAdd, 1>;
+defm PI2FD : I3DNow_conv_rm_int<0x0D, "pi2fd", WriteCvtI2PS>;
+defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw", SchedWriteVecIMul.MMX, 1>;
+
+let SchedRW = [WriteEMMS] in
def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms",
- [(int_x86_mmx_femms)], IIC_MMX_EMMS>;
+ [(int_x86_mmx_femms)]>, TB;
// If PREFETCHWT1 is supported, we want to use it for everything but T0.
def PrefetchWLevel : PatFrag<(ops), (i32 imm), [{
@@ -130,21 +92,20 @@ let SchedRW = [WriteLoad] in {
let Predicates = [Has3DNow, NoSSEPrefetch] in
def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i8mem:$addr),
"prefetch\t$addr",
- [(prefetch addr:$addr, imm, imm, (i32 1))],
- IIC_SSE_PREFETCH>;
+ [(prefetch addr:$addr, imm, imm, (i32 1))]>, TB;
def PREFETCHW : I<0x0D, MRM1m, (outs), (ins i8mem:$addr), "prefetchw\t$addr",
- [(prefetch addr:$addr, (i32 1), (i32 PrefetchWLevel), (i32 1))],
- IIC_SSE_PREFETCH>, TB, Requires<[HasPrefetchW]>;
+ [(prefetch addr:$addr, (i32 1), (i32 PrefetchWLevel), (i32 1))]>,
+ TB, Requires<[HasPrefetchW]>;
def PREFETCHWT1 : I<0x0D, MRM2m, (outs), (ins i8mem:$addr), "prefetchwt1\t$addr",
- [(prefetch addr:$addr, (i32 1), (i32 PrefetchWT1Level), (i32 1))],
- IIC_SSE_PREFETCH>, TB, Requires<[HasPREFETCHWT1]>;
+ [(prefetch addr:$addr, (i32 1), (i32 PrefetchWT1Level), (i32 1))]>,
+ TB, Requires<[HasPREFETCHWT1]>;
}
// "3DNowA" instructions
-defm PF2IW : I3DNow_conv_rm_int<0x1C, "pf2iw", I3DNOW_FCVT_F2I_ITINS, "a">;
-defm PI2FW : I3DNow_conv_rm_int<0x0C, "pi2fw", I3DNOW_FCVT_I2F_ITINS, "a">;
-defm PFNACC : I3DNow_binop_rm_int<0x8A, "pfnacc", I3DNOW_FALU_ITINS, 0, "a">;
-defm PFPNACC : I3DNow_binop_rm_int<0x8E, "pfpnacc", I3DNOW_FALU_ITINS, 0, "a">;
-defm PSWAPD : I3DNow_conv_rm_int<0xBB, "pswapd", I3DNOW_PSHUF_ITINS, "a">;
+defm PF2IW : I3DNow_conv_rm_int<0x1C, "pf2iw", WriteCvtPS2I, "a">;
+defm PI2FW : I3DNow_conv_rm_int<0x0C, "pi2fw", WriteCvtI2PS, "a">;
+defm PFNACC : I3DNow_binop_rm_int<0x8A, "pfnacc", WriteFAdd, 0, "a">;
+defm PFPNACC : I3DNow_binop_rm_int<0x8E, "pfpnacc", WriteFAdd, 0, "a">;
+defm PSWAPD : I3DNow_conv_rm_int<0xBB, "pswapd", SchedWriteShuffle.MMX, "a">;
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index dcd84930741b..2d95061a8213 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -90,22 +90,6 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
!cast<ComplexPattern>("sse_load_f64"),
?));
- // The corresponding float type, e.g. v16f32 for v16i32
- // Note: For EltSize < 32, FloatVT is illegal and TableGen
- // fails to compile, so we choose FloatVT = VT
- ValueType FloatVT = !cast<ValueType>(
- !if (!eq (!srl(EltSize,5),0),
- VTName,
- !if (!eq(TypeVariantName, "i"),
- "v" # NumElts # "f" # EltSize,
- VTName)));
-
- ValueType IntVT = !cast<ValueType>(
- !if (!eq (!srl(EltSize,5),0),
- VTName,
- !if (!eq(TypeVariantName, "f"),
- "v" # NumElts # "i" # EltSize,
- VTName)));
// The string to specify embedded broadcast in assembly.
string BroadcastStr = "{1to" # NumElts # "}";
@@ -212,22 +196,22 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F,
list<dag> Pattern,
list<dag> MaskingPattern,
list<dag> ZeroMaskingPattern,
- InstrItinClass itin,
string MaskingConstraint = "",
bit IsCommutable = 0,
- bit IsKCommutable = 0> {
+ bit IsKCommutable = 0,
+ bit IsKZCommutable = IsCommutable> {
let isCommutable = IsCommutable in
def NAME: AVX512<O, F, Outs, Ins,
OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
"$dst, "#IntelSrcAsm#"}",
- Pattern, itin>;
+ Pattern>;
// Prefer over VMOV*rrk Pat<>
let isCommutable = IsKCommutable in
def NAME#k: AVX512<O, F, Outs, MaskingIns,
OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
"$dst {${mask}}, "#IntelSrcAsm#"}",
- MaskingPattern, itin>,
+ MaskingPattern>,
EVEX_K {
// In case of the 3src subclass this is overridden with a let.
string Constraints = MaskingConstraint;
@@ -235,12 +219,11 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F,
// Zero mask does not add any restrictions to commute operands transformation.
// So, it is Ok to use IsCommutable instead of IsKCommutable.
- let isCommutable = IsCommutable in // Prefer over VMOV*rrkz Pat<>
+ let isCommutable = IsKZCommutable in // Prefer over VMOV*rrkz Pat<>
def NAME#kz: AVX512<O, F, Outs, ZeroMaskingIns,
OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}|"#
"$dst {${mask}} {z}, "#IntelSrcAsm#"}",
- ZeroMaskingPattern,
- itin>,
+ ZeroMaskingPattern>,
EVEX_KZ;
}
@@ -252,19 +235,19 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS, dag MaskingRHS,
- InstrItinClass itin,
SDNode Select = vselect,
string MaskingConstraint = "",
bit IsCommutable = 0,
- bit IsKCommutable = 0> :
+ bit IsKCommutable = 0,
+ bit IsKZCommutable = IsCommutable> :
AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr,
AttSrcAsm, IntelSrcAsm,
[(set _.RC:$dst, RHS)],
[(set _.RC:$dst, MaskingRHS)],
[(set _.RC:$dst,
(Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))],
- itin, MaskingConstraint, IsCommutable,
- IsKCommutable>;
+ MaskingConstraint, IsCommutable,
+ IsKCommutable, IsKZCommutable>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the vector instruction. In the masking case, the
@@ -274,7 +257,6 @@ multiclass AVX512_maskable_split<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS, dag MaskRHS,
- InstrItinClass itin,
bit IsCommutable = 0, bit IsKCommutable = 0,
SDNode Select = vselect> :
AVX512_maskable_custom<O, F, Outs, Ins,
@@ -286,7 +268,7 @@ multiclass AVX512_maskable_split<bits<8> O, Format F, X86VectorVTInfo _,
(Select _.KRCWM:$mask, MaskRHS, _.RC:$src0))],
[(set _.RC:$dst,
(Select _.KRCWM:$mask, MaskRHS, _.ImmAllZerosV))],
- itin, "$src0 = $dst", IsCommutable, IsKCommutable>;
+ "$src0 = $dst", IsCommutable, IsKCommutable>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the vector instruction. In the masking case, the
@@ -295,15 +277,16 @@ multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS,
- InstrItinClass itin,
bit IsCommutable = 0, bit IsKCommutable = 0,
+ bit IsKZCommutable = IsCommutable,
SDNode Select = vselect> :
AVX512_maskable_common<O, F, _, Outs, Ins,
!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
!con((ins _.KRCWM:$mask), Ins),
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
- (Select _.KRCWM:$mask, RHS, _.RC:$src0), itin,
- Select, "$src0 = $dst", IsCommutable, IsKCommutable>;
+ (Select _.KRCWM:$mask, RHS, _.RC:$src0),
+ Select, "$src0 = $dst", IsCommutable, IsKCommutable,
+ IsKZCommutable>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the scalar instruction.
@@ -311,10 +294,9 @@ multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS,
- InstrItinClass itin,
bit IsCommutable = 0> :
AVX512_maskable<O, F, _, Outs, Ins, OpcodeStr, AttSrcAsm, IntelSrcAsm,
- RHS, itin, IsCommutable, 0, X86selects>;
+ RHS, IsCommutable, 0, IsCommutable, X86selects>;
// Similar to AVX512_maskable but in this case one of the source operands
// ($src1) is already tied to $dst so we just use that for the preserved
@@ -323,7 +305,7 @@ multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _,
multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag NonTiedIns, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
- dag RHS, InstrItinClass itin,
+ dag RHS,
bit IsCommutable = 0,
bit IsKCommutable = 0,
SDNode Select = vselect,
@@ -334,32 +316,60 @@ multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _,
!con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
OpcodeStr, AttSrcAsm, IntelSrcAsm,
!if(MaskOnly, (null_frag), RHS),
- (Select _.KRCWM:$mask, RHS, _.RC:$src1), itin,
+ (Select _.KRCWM:$mask, RHS, _.RC:$src1),
Select, "", IsCommutable, IsKCommutable>;
+// Similar to AVX512_maskable_3src but in this case the input VT for the tied
+// operand differs from the output VT. This requires a bitconvert on
+// the preserved vector going into the vselect.
+// NOTE: The unmasked pattern is disabled.
+multiclass AVX512_maskable_3src_cast<bits<8> O, Format F, X86VectorVTInfo OutVT,
+ X86VectorVTInfo InVT,
+ dag Outs, dag NonTiedIns, string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS, bit IsCommutable = 0> :
+ AVX512_maskable_common<O, F, OutVT, Outs,
+ !con((ins InVT.RC:$src1), NonTiedIns),
+ !con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns),
+ !con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm, (null_frag),
+ (vselect InVT.KRCWM:$mask, RHS,
+ (bitconvert InVT.RC:$src1)),
+ vselect, "", IsCommutable>;
+
multiclass AVX512_maskable_3src_scalar<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag NonTiedIns, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
- dag RHS, InstrItinClass itin,
+ dag RHS,
bit IsCommutable = 0,
bit IsKCommutable = 0,
bit MaskOnly = 0> :
AVX512_maskable_3src<O, F, _, Outs, NonTiedIns, OpcodeStr, AttSrcAsm,
- IntelSrcAsm, RHS, itin, IsCommutable, IsKCommutable,
+ IntelSrcAsm, RHS, IsCommutable, IsKCommutable,
X86selects, MaskOnly>;
multiclass AVX512_maskable_in_asm<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins,
string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
- list<dag> Pattern,
- InstrItinClass itin> :
+ list<dag> Pattern> :
AVX512_maskable_custom<O, F, Outs, Ins,
!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
!con((ins _.KRCWM:$mask), Ins),
OpcodeStr, AttSrcAsm, IntelSrcAsm, Pattern, [], [],
- itin, "$src0 = $dst">;
-
+ "$src0 = $dst">;
+
+multiclass AVX512_maskable_3src_in_asm<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs, dag NonTiedIns,
+ string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ list<dag> Pattern> :
+ AVX512_maskable_custom<O, F, Outs,
+ !con((ins _.RC:$src1), NonTiedIns),
+ !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
+ !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm, Pattern, [], [],
+ "">;
// Instruction with mask that puts result in mask register,
// like "compare" and "vptest"
@@ -370,18 +380,17 @@ multiclass AVX512_maskable_custom_cmp<bits<8> O, Format F,
string AttSrcAsm, string IntelSrcAsm,
list<dag> Pattern,
list<dag> MaskingPattern,
- InstrItinClass itin,
bit IsCommutable = 0> {
let isCommutable = IsCommutable in
def NAME: AVX512<O, F, Outs, Ins,
OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
"$dst, "#IntelSrcAsm#"}",
- Pattern, itin>;
+ Pattern>;
def NAME#k: AVX512<O, F, Outs, MaskingIns,
OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
"$dst {${mask}}, "#IntelSrcAsm#"}",
- MaskingPattern, itin>, EVEX_K;
+ MaskingPattern>, EVEX_K;
}
multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _,
@@ -390,30 +399,27 @@ multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _,
string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS, dag MaskingRHS,
- InstrItinClass itin,
bit IsCommutable = 0> :
AVX512_maskable_custom_cmp<O, F, Outs, Ins, MaskingIns, OpcodeStr,
AttSrcAsm, IntelSrcAsm,
[(set _.KRC:$dst, RHS)],
- [(set _.KRC:$dst, MaskingRHS)], itin, IsCommutable>;
+ [(set _.KRC:$dst, MaskingRHS)], IsCommutable>;
multiclass AVX512_maskable_cmp<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
- dag RHS, InstrItinClass itin,
- bit IsCommutable = 0> :
+ dag RHS, bit IsCommutable = 0> :
AVX512_maskable_common_cmp<O, F, _, Outs, Ins,
!con((ins _.KRCWM:$mask), Ins),
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
- (and _.KRCWM:$mask, RHS), itin, IsCommutable>;
+ (and _.KRCWM:$mask, RHS), IsCommutable>;
multiclass AVX512_maskable_cmp_alt<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
- string AttSrcAsm, string IntelSrcAsm,
- InstrItinClass itin> :
+ string AttSrcAsm, string IntelSrcAsm> :
AVX512_maskable_custom_cmp<O, F, Outs,
Ins, !con((ins _.KRCWM:$mask),Ins), OpcodeStr,
- AttSrcAsm, IntelSrcAsm, [],[], itin>;
+ AttSrcAsm, IntelSrcAsm, [], []>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the vector instruction. In the masking case, the
@@ -422,7 +428,6 @@ multiclass AVX512_maskable_logic<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS, dag MaskedRHS,
- InstrItinClass itin,
bit IsCommutable = 0, SDNode Select = vselect> :
AVX512_maskable_custom<O, F, Outs, Ins,
!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
@@ -434,12 +439,12 @@ multiclass AVX512_maskable_logic<bits<8> O, Format F, X86VectorVTInfo _,
[(set _.RC:$dst,
(Select _.KRCWM:$mask, MaskedRHS,
_.ImmAllZerosV))],
- itin, "$src0 = $dst", IsCommutable>;
+ "$src0 = $dst", IsCommutable>;
// Alias instruction that maps zero vector to pxor / xorp* for AVX-512.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
-// swizzled by ExecutionDepsFix to pxor.
+// swizzled by ExecutionDomainFix to pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
@@ -494,7 +499,7 @@ multiclass vinsert_for_size_split<int Opcode, X86VectorVTInfo From,
X86VectorVTInfo To,
SDPatternOperator vinsert_insert,
SDPatternOperator vinsert_for_mask,
- OpndItins itins> {
+ X86FoldableSchedWrite sched> {
let hasSideEffects = 0, ExeDomain = To.ExeDomain in {
defm rr : AVX512_maskable_split<Opcode, MRMSrcReg, To, (outs To.RC:$dst),
(ins To.RC:$src1, From.RC:$src2, u8imm:$src3),
@@ -505,8 +510,8 @@ multiclass vinsert_for_size_split<int Opcode, X86VectorVTInfo From,
(iPTR imm)),
(vinsert_for_mask:$src3 (To.VT To.RC:$src1),
(From.VT From.RC:$src2),
- (iPTR imm)), itins.rr>,
- AVX512AIi8Base, EVEX_4V, Sched<[itins.Sched]>;
+ (iPTR imm))>,
+ AVX512AIi8Base, EVEX_4V, Sched<[sched]>;
let mayLoad = 1 in
defm rm : AVX512_maskable_split<Opcode, MRMSrcMem, To, (outs To.RC:$dst),
(ins To.RC:$src1, From.MemOp:$src2, u8imm:$src3),
@@ -517,9 +522,9 @@ multiclass vinsert_for_size_split<int Opcode, X86VectorVTInfo From,
(iPTR imm)),
(vinsert_for_mask:$src3 (To.VT To.RC:$src1),
(From.VT (bitconvert (From.LdFrag addr:$src2))),
- (iPTR imm)), itins.rm>, AVX512AIi8Base, EVEX_4V,
+ (iPTR imm))>, AVX512AIi8Base, EVEX_4V,
EVEX_CD8<From.EltSize, From.CD8TupleForm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
@@ -527,8 +532,8 @@ multiclass vinsert_for_size_split<int Opcode, X86VectorVTInfo From,
multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From,
X86VectorVTInfo To,
SDPatternOperator vinsert_insert,
- OpndItins itins> :
- vinsert_for_size_split<Opcode, From, To, vinsert_insert, vinsert_insert, itins>;
+ X86FoldableSchedWrite sched> :
+ vinsert_for_size_split<Opcode, From, To, vinsert_insert, vinsert_insert, sched>;
multiclass vinsert_for_size_lowering<string InstrStr, X86VectorVTInfo From,
X86VectorVTInfo To, PatFrag vinsert_insert,
@@ -552,60 +557,51 @@ multiclass vinsert_for_size_lowering<string InstrStr, X86VectorVTInfo From,
multiclass vinsert_for_type<ValueType EltVT32, int Opcode128,
ValueType EltVT64, int Opcode256,
- OpndItins itins> {
+ X86FoldableSchedWrite sched> {
let Predicates = [HasVLX] in
defm NAME # "32x4Z256" : vinsert_for_size<Opcode128,
X86VectorVTInfo< 4, EltVT32, VR128X>,
X86VectorVTInfo< 8, EltVT32, VR256X>,
- vinsert128_insert, itins>, EVEX_V256;
+ vinsert128_insert, sched>, EVEX_V256;
defm NAME # "32x4Z" : vinsert_for_size<Opcode128,
X86VectorVTInfo< 4, EltVT32, VR128X>,
X86VectorVTInfo<16, EltVT32, VR512>,
- vinsert128_insert, itins>, EVEX_V512;
+ vinsert128_insert, sched>, EVEX_V512;
defm NAME # "64x4Z" : vinsert_for_size<Opcode256,
X86VectorVTInfo< 4, EltVT64, VR256X>,
X86VectorVTInfo< 8, EltVT64, VR512>,
- vinsert256_insert, itins>, VEX_W, EVEX_V512;
+ vinsert256_insert, sched>, VEX_W, EVEX_V512;
// Even with DQI we'd like to only use these instructions for masking.
let Predicates = [HasVLX, HasDQI] in
defm NAME # "64x2Z256" : vinsert_for_size_split<Opcode128,
X86VectorVTInfo< 2, EltVT64, VR128X>,
X86VectorVTInfo< 4, EltVT64, VR256X>,
- null_frag, vinsert128_insert, itins>,
- VEX_W, EVEX_V256;
+ null_frag, vinsert128_insert, sched>,
+ VEX_W1X, EVEX_V256;
// Even with DQI we'd like to only use these instructions for masking.
let Predicates = [HasDQI] in {
defm NAME # "64x2Z" : vinsert_for_size_split<Opcode128,
X86VectorVTInfo< 2, EltVT64, VR128X>,
X86VectorVTInfo< 8, EltVT64, VR512>,
- null_frag, vinsert128_insert, itins>,
+ null_frag, vinsert128_insert, sched>,
VEX_W, EVEX_V512;
defm NAME # "32x8Z" : vinsert_for_size_split<Opcode256,
X86VectorVTInfo< 8, EltVT32, VR256X>,
X86VectorVTInfo<16, EltVT32, VR512>,
- null_frag, vinsert256_insert, itins>,
+ null_frag, vinsert256_insert, sched>,
EVEX_V512;
}
}
-// FIXME: Is there a better scheduler itinerary for VINSERTF/VINSERTI?
-let Sched = WriteFShuffle256 in
-def AVX512_VINSERTF : OpndItins<
- IIC_SSE_SHUFP, IIC_SSE_SHUFP
->;
-let Sched = WriteShuffle256 in
-def AVX512_VINSERTI : OpndItins<
- IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI
->;
-
-defm VINSERTF : vinsert_for_type<f32, 0x18, f64, 0x1a, AVX512_VINSERTF>;
-defm VINSERTI : vinsert_for_type<i32, 0x38, i64, 0x3a, AVX512_VINSERTI>;
+// FIXME: Is there a better scheduler class for VINSERTF/VINSERTI?
+defm VINSERTF : vinsert_for_type<f32, 0x18, f64, 0x1a, WriteFShuffle256>;
+defm VINSERTI : vinsert_for_type<i32, 0x38, i64, 0x3a, WriteShuffle256>;
// Codegen pattern with the alternative types,
// Even with AVX512DQ we'll still use these for unmasked operations.
@@ -778,15 +774,16 @@ let ExeDomain = SSEPackedSingle in {
def VINSERTPSZrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2, u8imm:$src3),
"vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))],
- IIC_SSE_INSERTPS_RR>, EVEX_4V, Sched<[WriteFShuffle]>;
+ [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))]>,
+ EVEX_4V, Sched<[SchedWriteFShuffle.XMM]>;
def VINSERTPSZrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst),
(ins VR128X:$src1, f32mem:$src2, u8imm:$src3),
"vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR128X:$dst, (X86insertps VR128X:$src1,
(v4f32 (scalar_to_vector (loadf32 addr:$src2))),
- imm:$src3))], IIC_SSE_INSERTPS_RM>, EVEX_4V,
- EVEX_CD8<32, CD8VT1>, Sched<[WriteFShuffleLd, ReadAfterLd]>;
+ imm:$src3))]>,
+ EVEX_4V, EVEX_CD8<32, CD8VT1>,
+ Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>;
}
//===----------------------------------------------------------------------===//
@@ -799,7 +796,7 @@ multiclass vextract_for_size_split<int Opcode,
X86VectorVTInfo From, X86VectorVTInfo To,
SDPatternOperator vextract_extract,
SDPatternOperator vextract_for_mask,
- OpndItins itins> {
+ SchedWrite SchedRR, SchedWrite SchedMR> {
let hasSideEffects = 0, ExeDomain = To.ExeDomain in {
defm rr : AVX512_maskable_split<Opcode, MRMDestReg, To, (outs To.RC:$dst),
@@ -807,8 +804,8 @@ multiclass vextract_for_size_split<int Opcode,
"vextract" # To.EltTypeName # "x" # To.NumElts,
"$idx, $src1", "$src1, $idx",
(vextract_extract:$idx (From.VT From.RC:$src1), (iPTR imm)),
- (vextract_for_mask:$idx (From.VT From.RC:$src1), (iPTR imm)),
- itins.rr>, AVX512AIi8Base, EVEX, Sched<[itins.Sched]>;
+ (vextract_for_mask:$idx (From.VT From.RC:$src1), (iPTR imm))>,
+ AVX512AIi8Base, EVEX, Sched<[SchedRR]>;
def mr : AVX512AIi8<Opcode, MRMDestMem, (outs),
(ins To.MemOp:$dst, From.RC:$src1, u8imm:$idx),
@@ -816,8 +813,8 @@ multiclass vextract_for_size_split<int Opcode,
"\t{$idx, $src1, $dst|$dst, $src1, $idx}",
[(store (To.VT (vextract_extract:$idx
(From.VT From.RC:$src1), (iPTR imm))),
- addr:$dst)], itins.rm>, EVEX,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ addr:$dst)]>, EVEX,
+ Sched<[SchedMR]>;
let mayStore = 1, hasSideEffects = 0 in
def mrk : AVX512AIi8<Opcode, MRMDestMem, (outs),
@@ -825,9 +822,8 @@ multiclass vextract_for_size_split<int Opcode,
From.RC:$src1, u8imm:$idx),
"vextract" # To.EltTypeName # "x" # To.NumElts #
"\t{$idx, $src1, $dst {${mask}}|"
- "$dst {${mask}}, $src1, $idx}",
- [], itins.rm>, EVEX_K, EVEX,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ "$dst {${mask}}, $src1, $idx}", []>,
+ EVEX_K, EVEX, Sched<[SchedMR]>, NotMemoryFoldable;
}
}
@@ -835,8 +831,8 @@ multiclass vextract_for_size_split<int Opcode,
multiclass vextract_for_size<int Opcode, X86VectorVTInfo From,
X86VectorVTInfo To,
SDPatternOperator vextract_extract,
- OpndItins itins> :
- vextract_for_size_split<Opcode, From, To, vextract_extract, vextract_extract, itins>;
+ SchedWrite SchedRR, SchedWrite SchedMR> :
+ vextract_for_size_split<Opcode, From, To, vextract_extract, vextract_extract, SchedRR, SchedMR>;
// Codegen pattern for the alternative types
multiclass vextract_for_size_lowering<string InstrStr, X86VectorVTInfo From,
@@ -856,24 +852,24 @@ multiclass vextract_for_size_lowering<string InstrStr, X86VectorVTInfo From,
multiclass vextract_for_type<ValueType EltVT32, int Opcode128,
ValueType EltVT64, int Opcode256,
- OpndItins itins> {
+ SchedWrite SchedRR, SchedWrite SchedMR> {
let Predicates = [HasAVX512] in {
defm NAME # "32x4Z" : vextract_for_size<Opcode128,
X86VectorVTInfo<16, EltVT32, VR512>,
X86VectorVTInfo< 4, EltVT32, VR128X>,
- vextract128_extract, itins>,
+ vextract128_extract, SchedRR, SchedMR>,
EVEX_V512, EVEX_CD8<32, CD8VT4>;
defm NAME # "64x4Z" : vextract_for_size<Opcode256,
X86VectorVTInfo< 8, EltVT64, VR512>,
X86VectorVTInfo< 4, EltVT64, VR256X>,
- vextract256_extract, itins>,
+ vextract256_extract, SchedRR, SchedMR>,
VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT4>;
}
let Predicates = [HasVLX] in
defm NAME # "32x4Z256" : vextract_for_size<Opcode128,
X86VectorVTInfo< 8, EltVT32, VR256X>,
X86VectorVTInfo< 4, EltVT32, VR128X>,
- vextract128_extract, itins>,
+ vextract128_extract, SchedRR, SchedMR>,
EVEX_V256, EVEX_CD8<32, CD8VT4>;
// Even with DQI we'd like to only use these instructions for masking.
@@ -881,36 +877,27 @@ multiclass vextract_for_type<ValueType EltVT32, int Opcode128,
defm NAME # "64x2Z256" : vextract_for_size_split<Opcode128,
X86VectorVTInfo< 4, EltVT64, VR256X>,
X86VectorVTInfo< 2, EltVT64, VR128X>,
- null_frag, vextract128_extract, itins>,
- VEX_W, EVEX_V256, EVEX_CD8<64, CD8VT2>;
+ null_frag, vextract128_extract, SchedRR, SchedMR>,
+ VEX_W1X, EVEX_V256, EVEX_CD8<64, CD8VT2>;
// Even with DQI we'd like to only use these instructions for masking.
let Predicates = [HasDQI] in {
defm NAME # "64x2Z" : vextract_for_size_split<Opcode128,
X86VectorVTInfo< 8, EltVT64, VR512>,
X86VectorVTInfo< 2, EltVT64, VR128X>,
- null_frag, vextract128_extract, itins>,
+ null_frag, vextract128_extract, SchedRR, SchedMR>,
VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT2>;
defm NAME # "32x8Z" : vextract_for_size_split<Opcode256,
X86VectorVTInfo<16, EltVT32, VR512>,
X86VectorVTInfo< 8, EltVT32, VR256X>,
- null_frag, vextract256_extract, itins>,
+ null_frag, vextract256_extract, SchedRR, SchedMR>,
EVEX_V512, EVEX_CD8<32, CD8VT8>;
}
}
-// FIXME: Is there a better scheduler itinerary for VEXTRACTF/VEXTRACTI?
-let Sched = WriteFShuffle256 in
-def AVX512_VEXTRACTF : OpndItins<
- IIC_SSE_SHUFP, IIC_SSE_SHUFP
->;
-let Sched = WriteShuffle256 in
-def AVX512_VEXTRACTI : OpndItins<
- IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI
->;
-
-defm VEXTRACTF : vextract_for_type<f32, 0x19, f64, 0x1b, AVX512_VEXTRACTF>;
-defm VEXTRACTI : vextract_for_type<i32, 0x39, i64, 0x3b, AVX512_VEXTRACTI>;
+// TODO - replace WriteFStore/WriteVecStore with X86SchedWriteMoveLSWidths types.
+defm VEXTRACTF : vextract_for_type<f32, 0x19, f64, 0x1b, WriteFShuffle256, WriteFStore>;
+defm VEXTRACTI : vextract_for_type<i32, 0x39, i64, 0x3b, WriteShuffle256, WriteVecStore>;
// extract_subvector codegen patterns with the alternative types.
// Even with AVX512DQ we'll still use these for unmasked operations.
@@ -1116,41 +1103,43 @@ defm : vextract_for_mask_cast<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info,
def VEXTRACTPSZrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst),
(ins VR128X:$src1, u8imm:$src2),
"vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))],
- IIC_SSE_EXTRACTPS_RR>, EVEX, VEX_WIG, Sched<[WriteFShuffle]>;
+ [(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>,
+ EVEX, VEX_WIG, Sched<[WriteVecExtract]>;
def VEXTRACTPSZmr : AVX512AIi8<0x17, MRMDestMem, (outs),
(ins f32mem:$dst, VR128X:$src1, u8imm:$src2),
"vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2),
- addr:$dst)], IIC_SSE_EXTRACTPS_RM>,
- EVEX, VEX_WIG, EVEX_CD8<32, CD8VT1>, Sched<[WriteFShuffleLd]>;
+ addr:$dst)]>,
+ EVEX, VEX_WIG, EVEX_CD8<32, CD8VT1>, Sched<[WriteVecExtractSt]>;
//===---------------------------------------------------------------------===//
// AVX-512 BROADCAST
//---
// broadcast with a scalar argument.
multiclass avx512_broadcast_scalar<bits<8> opc, string OpcodeStr,
+ string Name,
X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> {
def : Pat<(DestInfo.VT (X86VBroadcast SrcInfo.FRC:$src)),
- (!cast<Instruction>(NAME#DestInfo.ZSuffix#r)
- (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC))>;
+ (!cast<Instruction>(Name#DestInfo.ZSuffix#r)
+ (SrcInfo.VT (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC)))>;
def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
(X86VBroadcast SrcInfo.FRC:$src),
DestInfo.RC:$src0)),
- (!cast<Instruction>(NAME#DestInfo.ZSuffix#rk)
+ (!cast<Instruction>(Name#DestInfo.ZSuffix#rk)
DestInfo.RC:$src0, DestInfo.KRCWM:$mask,
- (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC))>;
+ (SrcInfo.VT (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC)))>;
def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
(X86VBroadcast SrcInfo.FRC:$src),
DestInfo.ImmAllZerosV)),
- (!cast<Instruction>(NAME#DestInfo.ZSuffix#rkz)
- DestInfo.KRCWM:$mask, (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC))>;
+ (!cast<Instruction>(Name#DestInfo.ZSuffix#rkz)
+ DestInfo.KRCWM:$mask, (SrcInfo.VT (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC)))>;
}
// Split version to allow mask and broadcast node to be different types. This
// helps support the 32x2 broadcasts.
multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr,
+ string Name,
SchedWrite SchedRR, SchedWrite SchedRM,
X86VectorVTInfo MaskInfo,
X86VectorVTInfo DestInfo,
@@ -1167,8 +1156,8 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr,
(MaskInfo.VT
(bitconvert
(DestInfo.VT
- (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))),
- NoItinerary>, T8PD, EVEX, Sched<[SchedRR]>;
+ (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src)))))>,
+ T8PD, EVEX, Sched<[SchedRR]>;
let mayLoad = 1 in
defm m : AVX512_maskable_split<opc, MRMSrcMem, MaskInfo,
(outs MaskInfo.RC:$dst),
@@ -1180,8 +1169,8 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr,
(MaskInfo.VT
(bitconvert
(DestInfo.VT (X86VBroadcast
- (SrcInfo.ScalarLdFrag addr:$src))))),
- NoItinerary>, T8PD, EVEX, EVEX_CD8<SrcInfo.EltSize, CD8VT1>,
+ (SrcInfo.ScalarLdFrag addr:$src)))))>,
+ T8PD, EVEX, EVEX_CD8<SrcInfo.EltSize, CD8VT1>,
Sched<[SchedRM]>;
}
@@ -1190,7 +1179,7 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr,
(DestInfo.VT (UnmaskedOp
(SrcInfo.VT (scalar_to_vector
(SrcInfo.ScalarLdFrag addr:$src))))))),
- (!cast<Instruction>(NAME#MaskInfo.ZSuffix#m) addr:$src)>;
+ (!cast<Instruction>(Name#MaskInfo.ZSuffix#m) addr:$src)>;
def : Pat<(MaskInfo.VT (vselect MaskInfo.KRCWM:$mask,
(bitconvert
(DestInfo.VT
@@ -1198,7 +1187,7 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr,
(SrcInfo.VT (scalar_to_vector
(SrcInfo.ScalarLdFrag addr:$src)))))),
MaskInfo.RC:$src0)),
- (!cast<Instruction>(NAME#DestInfo.ZSuffix#mk)
+ (!cast<Instruction>(Name#DestInfo.ZSuffix#mk)
MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask, addr:$src)>;
def : Pat<(MaskInfo.VT (vselect MaskInfo.KRCWM:$mask,
(bitconvert
@@ -1207,62 +1196,64 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr,
(SrcInfo.VT (scalar_to_vector
(SrcInfo.ScalarLdFrag addr:$src)))))),
MaskInfo.ImmAllZerosV)),
- (!cast<Instruction>(NAME#MaskInfo.ZSuffix#mkz)
+ (!cast<Instruction>(Name#MaskInfo.ZSuffix#mkz)
MaskInfo.KRCWM:$mask, addr:$src)>;
}
// Helper class to force mask and broadcast result to same type.
-multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr,
+multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr, string Name,
SchedWrite SchedRR, SchedWrite SchedRM,
X86VectorVTInfo DestInfo,
X86VectorVTInfo SrcInfo> :
- avx512_broadcast_rm_split<opc, OpcodeStr, SchedRR, SchedRM,
+ avx512_broadcast_rm_split<opc, OpcodeStr, Name, SchedRR, SchedRM,
DestInfo, DestInfo, SrcInfo>;
multiclass avx512_fp_broadcast_sd<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _> {
- let Predicates = [HasAVX512] in
- defm Z : avx512_broadcast_rm<opc, OpcodeStr, WriteFShuffle256,
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
WriteFShuffle256Ld, _.info512, _.info128>,
- avx512_broadcast_scalar<opc, OpcodeStr, _.info512, _.info128>,
- EVEX_V512;
+ avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info512,
+ _.info128>,
+ EVEX_V512;
+ }
let Predicates = [HasVLX] in {
- defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, WriteFShuffle256,
+ defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
WriteFShuffle256Ld, _.info256, _.info128>,
- avx512_broadcast_scalar<opc, OpcodeStr, _.info256, _.info128>,
- EVEX_V256;
+ avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info256,
+ _.info128>,
+ EVEX_V256;
}
}
multiclass avx512_fp_broadcast_ss<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _> {
- let Predicates = [HasAVX512] in
- defm Z : avx512_broadcast_rm<opc, OpcodeStr, WriteFShuffle256,
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
WriteFShuffle256Ld, _.info512, _.info128>,
- avx512_broadcast_scalar<opc, OpcodeStr, _.info512, _.info128>,
- EVEX_V512;
+ avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info512,
+ _.info128>,
+ EVEX_V512;
+ }
let Predicates = [HasVLX] in {
- defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, WriteFShuffle256,
+ defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
WriteFShuffle256Ld, _.info256, _.info128>,
- avx512_broadcast_scalar<opc, OpcodeStr, _.info256, _.info128>,
- EVEX_V256;
- defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, WriteFShuffle256,
+ avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info256,
+ _.info128>,
+ EVEX_V256;
+ defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
WriteFShuffle256Ld, _.info128, _.info128>,
- avx512_broadcast_scalar<opc, OpcodeStr, _.info128, _.info128>,
- EVEX_V128;
+ avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info128,
+ _.info128>,
+ EVEX_V128;
}
}
defm VBROADCASTSS : avx512_fp_broadcast_ss<0x18, "vbroadcastss",
avx512vl_f32_info>;
defm VBROADCASTSD : avx512_fp_broadcast_sd<0x19, "vbroadcastsd",
- avx512vl_f64_info>, VEX_W;
-
-def : Pat<(int_x86_avx512_vbroadcast_ss_512 addr:$src),
- (VBROADCASTSSZm addr:$src)>;
-def : Pat<(int_x86_avx512_vbroadcast_sd_512 addr:$src),
- (VBROADCASTSDZm addr:$src)>;
+ avx512vl_f64_info>, VEX_W1X;
multiclass avx512_int_broadcast_reg<bits<8> opc, SchedWrite SchedRR,
X86VectorVTInfo _, SDPatternOperator OpNode,
@@ -1271,7 +1262,7 @@ multiclass avx512_int_broadcast_reg<bits<8> opc, SchedWrite SchedRR,
defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins SrcRC:$src),
"vpbroadcast"##_.Suffix, "$src", "$src",
- (_.VT (OpNode SrcRC:$src)), NoItinerary>, T8PD, EVEX,
+ (_.VT (OpNode SrcRC:$src))>, T8PD, EVEX,
Sched<[SchedRR]>;
}
@@ -1284,7 +1275,7 @@ multiclass avx512_int_broadcastbw_reg<bits<8> opc, string Name, SchedWrite Sched
!con((ins _.RC:$src0, _.KRCWM:$mask), (ins GR32:$src)),
!con((ins _.KRCWM:$mask), (ins GR32:$src)),
"vpbroadcast"##_.Suffix, "$src", "$src", [], [], [],
- NoItinerary, "$src0 = $dst">, T8PD, EVEX, Sched<[SchedRR]>;
+ "$src0 = $dst">, T8PD, EVEX, Sched<[SchedRR]>;
def : Pat <(_.VT (OpNode SrcRC:$src)),
(!cast<Instruction>(Name#r)
@@ -1337,37 +1328,34 @@ defm VPBROADCASTDr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i32_info,
defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info,
X86VBroadcast, GR64, HasAVX512>, VEX_W;
-def : Pat <(v16i32 (X86vzext VK16WM:$mask)),
- (VPBROADCASTDrZrkz VK16WM:$mask, (i32 (MOV32ri 0x1)))>;
-def : Pat <(v8i64 (X86vzext VK8WM:$mask)),
- (VPBROADCASTQrZrkz VK8WM:$mask, (i64 (MOV64ri 0x1)))>;
-
// Provide aliases for broadcast from the same register class that
// automatically does the extract.
-multiclass avx512_int_broadcast_rm_lowering<X86VectorVTInfo DestInfo,
- X86VectorVTInfo SrcInfo> {
+multiclass avx512_int_broadcast_rm_lowering<string Name,
+ X86VectorVTInfo DestInfo,
+ X86VectorVTInfo SrcInfo,
+ X86VectorVTInfo ExtInfo> {
def : Pat<(DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))),
- (!cast<Instruction>(NAME#DestInfo.ZSuffix#"r")
- (EXTRACT_SUBREG (SrcInfo.VT SrcInfo.RC:$src), sub_xmm))>;
+ (!cast<Instruction>(Name#DestInfo.ZSuffix#"r")
+ (ExtInfo.VT (EXTRACT_SUBREG (SrcInfo.VT SrcInfo.RC:$src), sub_xmm)))>;
}
multiclass avx512_int_broadcast_rm_vl<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _, Predicate prd> {
let Predicates = [prd] in {
- defm Z : avx512_broadcast_rm<opc, OpcodeStr, WriteShuffle256,
+ defm Z : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteShuffle256,
WriteShuffle256Ld, _.info512, _.info128>,
- avx512_int_broadcast_rm_lowering<_.info512, _.info256>,
+ avx512_int_broadcast_rm_lowering<NAME, _.info512, _.info256, _.info128>,
EVEX_V512;
// Defined separately to avoid redefinition.
- defm Z_Alt : avx512_int_broadcast_rm_lowering<_.info512, _.info512>;
+ defm Z_Alt : avx512_int_broadcast_rm_lowering<NAME, _.info512, _.info512, _.info128>;
}
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, WriteShuffle256,
+ defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteShuffle256,
WriteShuffle256Ld, _.info256, _.info128>,
- avx512_int_broadcast_rm_lowering<_.info256, _.info256>,
+ avx512_int_broadcast_rm_lowering<NAME, _.info256, _.info256, _.info128>,
EVEX_V256;
- defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, WriteShuffle,
- WriteShuffleLd, _.info128, _.info128>,
+ defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteShuffle,
+ WriteShuffleXLd, _.info128, _.info128>,
EVEX_V128;
}
}
@@ -1379,16 +1367,16 @@ defm VPBROADCASTW : avx512_int_broadcast_rm_vl<0x79, "vpbroadcastw",
defm VPBROADCASTD : avx512_int_broadcast_rm_vl<0x58, "vpbroadcastd",
avx512vl_i32_info, HasAVX512>;
defm VPBROADCASTQ : avx512_int_broadcast_rm_vl<0x59, "vpbroadcastq",
- avx512vl_i64_info, HasAVX512>, VEX_W;
+ avx512vl_i64_info, HasAVX512>, VEX_W1X;
multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
X86VectorVTInfo _Dst, X86VectorVTInfo _Src> {
defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.MemOp:$src), OpcodeStr, "$src", "$src",
(_Dst.VT (X86SubVBroadcast
- (_Src.VT (bitconvert (_Src.LdFrag addr:$src))))),
- NoItinerary>, AVX5128IBase, EVEX,
- Sched<[WriteShuffleLd]>;
+ (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>,
+ Sched<[SchedWriteShuffle.YMM.Folded]>,
+ AVX5128IBase, EVEX;
}
// This should be used for the AVX512DQ broadcast instructions. It disables
@@ -1401,9 +1389,9 @@ multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr,
(ins _Src.MemOp:$src), OpcodeStr, "$src", "$src",
(null_frag),
(_Dst.VT (X86SubVBroadcast
- (_Src.VT (bitconvert (_Src.LdFrag addr:$src))))),
- NoItinerary>, AVX5128IBase, EVEX,
- Sched<[WriteShuffleLd]>;
+ (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>,
+ Sched<[SchedWriteShuffle.YMM.Folded]>,
+ AVX5128IBase, EVEX;
}
let Predicates = [HasAVX512] in {
@@ -1490,6 +1478,41 @@ def : Pat<(v32i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
(VBROADCASTI32X4rm addr:$src)>;
def : Pat<(v64i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
(VBROADCASTI32X4rm addr:$src)>;
+
+// Patterns for selects of bitcasted operations.
+def : Pat<(vselect VK16WM:$mask,
+ (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
+ (bc_v16f32 (v16i32 immAllZerosV))),
+ (VBROADCASTF32X4rmkz VK16WM:$mask, addr:$src)>;
+def : Pat<(vselect VK16WM:$mask,
+ (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
+ VR512:$src0),
+ (VBROADCASTF32X4rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
+def : Pat<(vselect VK16WM:$mask,
+ (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
+ (v16i32 immAllZerosV)),
+ (VBROADCASTI32X4rmkz VK16WM:$mask, addr:$src)>;
+def : Pat<(vselect VK16WM:$mask,
+ (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
+ VR512:$src0),
+ (VBROADCASTI32X4rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
+
+def : Pat<(vselect VK8WM:$mask,
+ (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))),
+ (bc_v8f64 (v16i32 immAllZerosV))),
+ (VBROADCASTF64X4rmkz VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect VK8WM:$mask,
+ (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))),
+ VR512:$src0),
+ (VBROADCASTF64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect VK8WM:$mask,
+ (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src))))),
+ (bc_v8i64 (v16i32 immAllZerosV))),
+ (VBROADCASTI64X4rmkz VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect VK8WM:$mask,
+ (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src))))),
+ VR512:$src0),
+ (VBROADCASTI64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
}
let Predicates = [HasVLX] in {
@@ -1509,6 +1532,25 @@ def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
(VBROADCASTI32X4Z256rm addr:$src)>;
+// Patterns for selects of bitcasted operations.
+def : Pat<(vselect VK8WM:$mask,
+ (bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
+ (bc_v8f32 (v8i32 immAllZerosV))),
+ (VBROADCASTF32X4Z256rmkz VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect VK8WM:$mask,
+ (bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
+ VR256X:$src0),
+ (VBROADCASTF32X4Z256rmk VR256X:$src0, VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect VK8WM:$mask,
+ (bc_v8i32 (v4i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
+ (v8i32 immAllZerosV)),
+ (VBROADCASTI32X4Z256rmkz VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect VK8WM:$mask,
+ (bc_v8i32 (v4i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
+ VR256X:$src0),
+ (VBROADCASTI32X4Z256rmk VR256X:$src0, VK8WM:$mask, addr:$src)>;
+
+
// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
@@ -1533,11 +1575,29 @@ def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128X:$src))),
let Predicates = [HasVLX, HasDQI] in {
defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2",
- v4i64x_info, v2i64x_info>, VEX_W,
+ v4i64x_info, v2i64x_info>, VEX_W1X,
EVEX_V256, EVEX_CD8<64, CD8VT2>;
defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2",
- v4f64x_info, v2f64x_info>, VEX_W,
+ v4f64x_info, v2f64x_info>, VEX_W1X,
EVEX_V256, EVEX_CD8<64, CD8VT2>;
+
+// Patterns for selects of bitcasted operations.
+def : Pat<(vselect VK4WM:$mask,
+ (bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
+ (bc_v4f64 (v8i32 immAllZerosV))),
+ (VBROADCASTF64X2Z128rmkz VK4WM:$mask, addr:$src)>;
+def : Pat<(vselect VK4WM:$mask,
+ (bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
+ VR256X:$src0),
+ (VBROADCASTF64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>;
+def : Pat<(vselect VK4WM:$mask,
+ (bc_v4i64 (v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))),
+ (bc_v4i64 (v8i32 immAllZerosV))),
+ (VBROADCASTI64X2Z128rmkz VK4WM:$mask, addr:$src)>;
+def : Pat<(vselect VK4WM:$mask,
+ (bc_v4i64 (v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))),
+ VR256X:$src0),
+ (VBROADCASTI64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>;
}
let Predicates = [HasDQI] in {
@@ -1553,17 +1613,52 @@ defm VBROADCASTF64X2 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2",
defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm_dq<0x1b, "vbroadcastf32x8",
v16f32_info, v8f32x_info>,
EVEX_V512, EVEX_CD8<32, CD8VT8>;
+
+// Patterns for selects of bitcasted operations.
+def : Pat<(vselect VK16WM:$mask,
+ (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))),
+ (bc_v16f32 (v16i32 immAllZerosV))),
+ (VBROADCASTF32X8rmkz VK16WM:$mask, addr:$src)>;
+def : Pat<(vselect VK16WM:$mask,
+ (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))),
+ VR512:$src0),
+ (VBROADCASTF32X8rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
+def : Pat<(vselect VK16WM:$mask,
+ (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv4i64 addr:$src)))),
+ (v16i32 immAllZerosV)),
+ (VBROADCASTI32X8rmkz VK16WM:$mask, addr:$src)>;
+def : Pat<(vselect VK16WM:$mask,
+ (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv4i64 addr:$src)))),
+ VR512:$src0),
+ (VBROADCASTI32X8rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
+
+def : Pat<(vselect VK8WM:$mask,
+ (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
+ (bc_v8f64 (v16i32 immAllZerosV))),
+ (VBROADCASTF64X2rmkz VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect VK8WM:$mask,
+ (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
+ VR512:$src0),
+ (VBROADCASTF64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect VK8WM:$mask,
+ (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))),
+ (bc_v8i64 (v16i32 immAllZerosV))),
+ (VBROADCASTI64X2rmkz VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect VK8WM:$mask,
+ (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))),
+ VR512:$src0),
+ (VBROADCASTI64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
}
multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _Dst, AVX512VLVectorVTInfo _Src> {
let Predicates = [HasDQI] in
- defm Z : avx512_broadcast_rm_split<opc, OpcodeStr, WriteShuffle256,
+ defm Z : avx512_broadcast_rm_split<opc, OpcodeStr, NAME, WriteShuffle256,
WriteShuffle256Ld, _Dst.info512,
_Src.info512, _Src.info128, null_frag>,
EVEX_V512;
let Predicates = [HasDQI, HasVLX] in
- defm Z256 : avx512_broadcast_rm_split<opc, OpcodeStr, WriteShuffle256,
+ defm Z256 : avx512_broadcast_rm_split<opc, OpcodeStr, NAME, WriteShuffle256,
WriteShuffle256Ld, _Dst.info256,
_Src.info256, _Src.info128, null_frag>,
EVEX_V256;
@@ -1574,8 +1669,8 @@ multiclass avx512_common_broadcast_i32x2<bits<8> opc, string OpcodeStr,
avx512_common_broadcast_32x2<opc, OpcodeStr, _Dst, _Src> {
let Predicates = [HasDQI, HasVLX] in
- defm Z128 : avx512_broadcast_rm_split<opc, OpcodeStr, WriteShuffle,
- WriteShuffleLd, _Dst.info128,
+ defm Z128 : avx512_broadcast_rm_split<opc, OpcodeStr, NAME, WriteShuffle,
+ WriteShuffleXLd, _Dst.info128,
_Src.info128, _Src.info128, null_frag>,
EVEX_V128;
}
@@ -1587,20 +1682,20 @@ defm VBROADCASTF32X2 : avx512_common_broadcast_32x2<0x19, "vbroadcastf32x2",
let Predicates = [HasVLX] in {
def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256X:$src))),
- (VBROADCASTSSZ256r (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm))>;
+ (VBROADCASTSSZ256r (v4f32 (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)))>;
def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256X:$src))),
- (VBROADCASTSDZ256r (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm))>;
+ (VBROADCASTSDZ256r (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))>;
}
def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))),
- (VBROADCASTSSZr (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>;
+ (VBROADCASTSSZr (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)))>;
def : Pat<(v16f32 (X86VBroadcast (v8f32 VR256X:$src))),
- (VBROADCASTSSZr (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm))>;
+ (VBROADCASTSSZr (v4f32 (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)))>;
def : Pat<(v8f64 (X86VBroadcast (v8f64 VR512:$src))),
- (VBROADCASTSDZr (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>;
+ (VBROADCASTSDZr (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))>;
def : Pat<(v8f64 (X86VBroadcast (v4f64 VR256X:$src))),
- (VBROADCASTSDZr (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm))>;
+ (VBROADCASTSDZr (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))>;
//===----------------------------------------------------------------------===//
// AVX-512 BROADCAST MASK TO VECTOR REGISTER
@@ -1609,8 +1704,8 @@ multiclass avx512_mask_broadcastm<bits<8> opc, string OpcodeStr,
X86VectorVTInfo _, RegisterClass KRC> {
def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.RC:$dst), (ins KRC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set _.RC:$dst, (_.VT (X86VBroadcastm KRC:$src)))],
- IIC_SSE_PSHUF_RI>, EVEX, Sched<[WriteShuffle]>;
+ [(set _.RC:$dst, (_.VT (X86VBroadcastm KRC:$src)))]>,
+ EVEX, Sched<[WriteShuffle]>;
}
multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr,
@@ -1630,111 +1725,146 @@ defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q",
//===----------------------------------------------------------------------===//
// -- VPERMI2 - 3 source operands form --
-
-let Sched = WriteFShuffle256 in
-def AVX512_PERM2_F : OpndItins<
- IIC_SSE_SHUFP, IIC_SSE_SHUFP
->;
-
-let Sched = WriteShuffle256 in
-def AVX512_PERM2_I : OpndItins<
- IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI
->;
-
-multiclass avx512_perm_i<bits<8> opc, string OpcodeStr, OpndItins itins,
- X86VectorVTInfo _> {
-let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
- // The index operand in the pattern should really be an integer type. However,
- // if we do that and it happens to come from a bitcast, then it becomes
- // difficult to find the bitcast needed to convert the index to the
- // destination type for the passthru since it will be folded with the bitcast
- // of the index operand.
- defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+multiclass avx512_perm_i<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
+let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain,
+ hasSideEffects = 0 in {
+ defm rr: AVX512_maskable_3src_cast<opc, MRMSrcReg, _, IdxVT, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2, _.RC:$src3)),
- itins.rr, 1>, EVEX_4V, AVX5128IBase, Sched<[itins.Sched]>;
+ (_.VT (X86VPermt2 _.RC:$src2, IdxVT.RC:$src1, _.RC:$src3)), 1>,
+ EVEX_4V, AVX5128IBase, Sched<[sched]>;
- defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ let mayLoad = 1 in
+ defm rm: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2,
- (_.VT (bitconvert (_.LdFrag addr:$src3))))), itins.rm, 1>,
- EVEX_4V, AVX5128IBase, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.VT (X86VPermt2 _.RC:$src2, IdxVT.RC:$src1,
+ (_.VT (bitconvert (_.LdFrag addr:$src3))))), 1>,
+ EVEX_4V, AVX5128IBase, Sched<[sched.Folded, ReadAfterLd]>;
}
}
-multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr, OpndItins itins,
- X86VectorVTInfo _> {
- let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in
- defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain,
+ hasSideEffects = 0, mayLoad = 1 in
+ defm rmb: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
- (_.VT (X86VPermi2X _.RC:$src1,
- _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))),
- itins.rm, 1>, AVX5128IBase, EVEX_4V, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.VT (X86VPermt2 _.RC:$src2,
+ IdxVT.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))), 1>,
+ AVX5128IBase, EVEX_4V, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
-multiclass avx512_perm_i_sizes<bits<8> opc, string OpcodeStr, OpndItins itins,
- AVX512VLVectorVTInfo VTInfo> {
- defm NAME: avx512_perm_i<opc, OpcodeStr, itins, VTInfo.info512>,
- avx512_perm_i_mb<opc, OpcodeStr, itins, VTInfo.info512>, EVEX_V512;
+multiclass avx512_perm_i_sizes<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
+ AVX512VLVectorVTInfo VTInfo,
+ AVX512VLVectorVTInfo ShuffleMask> {
+ defm NAME: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info512,
+ ShuffleMask.info512>,
+ avx512_perm_i_mb<opc, OpcodeStr, sched, VTInfo.info512,
+ ShuffleMask.info512>, EVEX_V512;
let Predicates = [HasVLX] in {
- defm NAME#128: avx512_perm_i<opc, OpcodeStr, itins, VTInfo.info128>,
- avx512_perm_i_mb<opc, OpcodeStr, itins, VTInfo.info128>, EVEX_V128;
- defm NAME#256: avx512_perm_i<opc, OpcodeStr, itins, VTInfo.info256>,
- avx512_perm_i_mb<opc, OpcodeStr, itins, VTInfo.info256>, EVEX_V256;
+ defm NAME#128: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info128,
+ ShuffleMask.info128>,
+ avx512_perm_i_mb<opc, OpcodeStr, sched, VTInfo.info128,
+ ShuffleMask.info128>, EVEX_V128;
+ defm NAME#256: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info256,
+ ShuffleMask.info256>,
+ avx512_perm_i_mb<opc, OpcodeStr, sched, VTInfo.info256,
+ ShuffleMask.info256>, EVEX_V256;
}
}
multiclass avx512_perm_i_sizes_bw<bits<8> opc, string OpcodeStr,
- OpndItins itins,
+ X86FoldableSchedWrite sched,
AVX512VLVectorVTInfo VTInfo,
+ AVX512VLVectorVTInfo Idx,
Predicate Prd> {
let Predicates = [Prd] in
- defm NAME: avx512_perm_i<opc, OpcodeStr, itins, VTInfo.info512>, EVEX_V512;
+ defm NAME: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info512,
+ Idx.info512>, EVEX_V512;
let Predicates = [Prd, HasVLX] in {
- defm NAME#128: avx512_perm_i<opc, OpcodeStr, itins, VTInfo.info128>, EVEX_V128;
- defm NAME#256: avx512_perm_i<opc, OpcodeStr, itins, VTInfo.info256>, EVEX_V256;
+ defm NAME#128: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info128,
+ Idx.info128>, EVEX_V128;
+ defm NAME#256: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info256,
+ Idx.info256>, EVEX_V256;
}
}
-defm VPERMI2D : avx512_perm_i_sizes<0x76, "vpermi2d", AVX512_PERM2_I,
- avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
-defm VPERMI2Q : avx512_perm_i_sizes<0x76, "vpermi2q", AVX512_PERM2_I,
- avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPERMI2W : avx512_perm_i_sizes_bw<0x75, "vpermi2w", AVX512_PERM2_I,
- avx512vl_i16_info, HasBWI>,
+defm VPERMI2D : avx512_perm_i_sizes<0x76, "vpermi2d", WriteVarShuffle256,
+ avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+defm VPERMI2Q : avx512_perm_i_sizes<0x76, "vpermi2q", WriteVarShuffle256,
+ avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPERMI2W : avx512_perm_i_sizes_bw<0x75, "vpermi2w", WriteVarShuffle256,
+ avx512vl_i16_info, avx512vl_i16_info, HasBWI>,
VEX_W, EVEX_CD8<16, CD8VF>;
-defm VPERMI2B : avx512_perm_i_sizes_bw<0x75, "vpermi2b", AVX512_PERM2_I,
- avx512vl_i8_info, HasVBMI>,
+defm VPERMI2B : avx512_perm_i_sizes_bw<0x75, "vpermi2b", WriteVarShuffle256,
+ avx512vl_i8_info, avx512vl_i8_info, HasVBMI>,
EVEX_CD8<8, CD8VF>;
-defm VPERMI2PS : avx512_perm_i_sizes<0x77, "vpermi2ps", AVX512_PERM2_F,
- avx512vl_f32_info>, EVEX_CD8<32, CD8VF>;
-defm VPERMI2PD : avx512_perm_i_sizes<0x77, "vpermi2pd", AVX512_PERM2_F,
- avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPERMI2PS : avx512_perm_i_sizes<0x77, "vpermi2ps", WriteFVarShuffle256,
+ avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+defm VPERMI2PD : avx512_perm_i_sizes<0x77, "vpermi2pd", WriteFVarShuffle256,
+ avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
+
+// Extra patterns to deal with extra bitcasts due to passthru and index being
+// different types on the fp versions.
+multiclass avx512_perm_i_lowering<string InstrStr, X86VectorVTInfo _,
+ X86VectorVTInfo IdxVT,
+ X86VectorVTInfo CastVT> {
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (X86VPermt2 (_.VT _.RC:$src2),
+ (IdxVT.VT (bitconvert (CastVT.VT _.RC:$src1))), _.RC:$src3),
+ (_.VT (bitconvert (CastVT.VT _.RC:$src1))))),
+ (!cast<Instruction>(InstrStr#"rrk") _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, _.RC:$src3)>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (X86VPermt2 _.RC:$src2,
+ (IdxVT.VT (bitconvert (CastVT.VT _.RC:$src1))),
+ (_.LdFrag addr:$src3)),
+ (_.VT (bitconvert (CastVT.VT _.RC:$src1))))),
+ (!cast<Instruction>(InstrStr#"rmk") _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3)>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (X86VPermt2 _.RC:$src2,
+ (IdxVT.VT (bitconvert (CastVT.VT _.RC:$src1))),
+ (X86VBroadcast (_.ScalarLdFrag addr:$src3))),
+ (_.VT (bitconvert (CastVT.VT _.RC:$src1))))),
+ (!cast<Instruction>(InstrStr#"rmbk") _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3)>;
+}
+
+// TODO: Should we add more casts? The vXi64 case is common due to ABI.
+defm : avx512_perm_i_lowering<"VPERMI2PS", v16f32_info, v16i32_info, v8i64_info>;
+defm : avx512_perm_i_lowering<"VPERMI2PS256", v8f32x_info, v8i32x_info, v4i64x_info>;
+defm : avx512_perm_i_lowering<"VPERMI2PS128", v4f32x_info, v4i32x_info, v2i64x_info>;
// VPERMT2
-multiclass avx512_perm_t<bits<8> opc, string OpcodeStr, OpndItins itins,
+multiclass avx512_perm_t<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins IdxVT.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3)),
- itins.rr, 1>, EVEX_4V, AVX5128IBase, Sched<[itins.Sched]>;
+ (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3)), 1>,
+ EVEX_4V, AVX5128IBase, Sched<[sched]>;
defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins IdxVT.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2,
- (bitconvert (_.LdFrag addr:$src3)))), itins.rm, 1>,
- EVEX_4V, AVX5128IBase, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (_.LdFrag addr:$src3)))), 1>,
+ EVEX_4V, AVX5128IBase, Sched<[sched.Folded, ReadAfterLd]>;
}
}
-multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr, OpndItins itins,
+multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in
defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -1742,167 +1872,176 @@ multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr, OpndItins itins,
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(_.VT (X86VPermt2 _.RC:$src1,
- IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))),
- itins.rm, 1>, AVX5128IBase, EVEX_4V, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))), 1>,
+ AVX5128IBase, EVEX_4V, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
-multiclass avx512_perm_t_sizes<bits<8> opc, string OpcodeStr, OpndItins itins,
+multiclass avx512_perm_t_sizes<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
AVX512VLVectorVTInfo VTInfo,
AVX512VLVectorVTInfo ShuffleMask> {
- defm NAME: avx512_perm_t<opc, OpcodeStr, itins, VTInfo.info512,
+ defm NAME: avx512_perm_t<opc, OpcodeStr, sched, VTInfo.info512,
ShuffleMask.info512>,
- avx512_perm_t_mb<opc, OpcodeStr, itins, VTInfo.info512,
+ avx512_perm_t_mb<opc, OpcodeStr, sched, VTInfo.info512,
ShuffleMask.info512>, EVEX_V512;
let Predicates = [HasVLX] in {
- defm NAME#128: avx512_perm_t<opc, OpcodeStr, itins, VTInfo.info128,
+ defm NAME#128: avx512_perm_t<opc, OpcodeStr, sched, VTInfo.info128,
ShuffleMask.info128>,
- avx512_perm_t_mb<opc, OpcodeStr, itins, VTInfo.info128,
+ avx512_perm_t_mb<opc, OpcodeStr, sched, VTInfo.info128,
ShuffleMask.info128>, EVEX_V128;
- defm NAME#256: avx512_perm_t<opc, OpcodeStr, itins, VTInfo.info256,
+ defm NAME#256: avx512_perm_t<opc, OpcodeStr, sched, VTInfo.info256,
ShuffleMask.info256>,
- avx512_perm_t_mb<opc, OpcodeStr, itins, VTInfo.info256,
+ avx512_perm_t_mb<opc, OpcodeStr, sched, VTInfo.info256,
ShuffleMask.info256>, EVEX_V256;
}
}
-multiclass avx512_perm_t_sizes_bw<bits<8> opc, string OpcodeStr, OpndItins itins,
- AVX512VLVectorVTInfo VTInfo,
- AVX512VLVectorVTInfo Idx,
- Predicate Prd> {
+multiclass avx512_perm_t_sizes_bw<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
+ AVX512VLVectorVTInfo VTInfo,
+ AVX512VLVectorVTInfo Idx, Predicate Prd> {
let Predicates = [Prd] in
- defm NAME: avx512_perm_t<opc, OpcodeStr, itins, VTInfo.info512,
+ defm NAME: avx512_perm_t<opc, OpcodeStr, sched, VTInfo.info512,
Idx.info512>, EVEX_V512;
let Predicates = [Prd, HasVLX] in {
- defm NAME#128: avx512_perm_t<opc, OpcodeStr, itins, VTInfo.info128,
+ defm NAME#128: avx512_perm_t<opc, OpcodeStr, sched, VTInfo.info128,
Idx.info128>, EVEX_V128;
- defm NAME#256: avx512_perm_t<opc, OpcodeStr, itins, VTInfo.info256,
+ defm NAME#256: avx512_perm_t<opc, OpcodeStr, sched, VTInfo.info256,
Idx.info256>, EVEX_V256;
}
}
-defm VPERMT2D : avx512_perm_t_sizes<0x7E, "vpermt2d", AVX512_PERM2_I,
+defm VPERMT2D : avx512_perm_t_sizes<0x7E, "vpermt2d", WriteVarShuffle256,
avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
-defm VPERMT2Q : avx512_perm_t_sizes<0x7E, "vpermt2q", AVX512_PERM2_I,
+defm VPERMT2Q : avx512_perm_t_sizes<0x7E, "vpermt2q", WriteVarShuffle256,
avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPERMT2W : avx512_perm_t_sizes_bw<0x7D, "vpermt2w", AVX512_PERM2_I,
+defm VPERMT2W : avx512_perm_t_sizes_bw<0x7D, "vpermt2w", WriteVarShuffle256,
avx512vl_i16_info, avx512vl_i16_info, HasBWI>,
VEX_W, EVEX_CD8<16, CD8VF>;
-defm VPERMT2B : avx512_perm_t_sizes_bw<0x7D, "vpermt2b", AVX512_PERM2_I,
+defm VPERMT2B : avx512_perm_t_sizes_bw<0x7D, "vpermt2b", WriteVarShuffle256,
avx512vl_i8_info, avx512vl_i8_info, HasVBMI>,
EVEX_CD8<8, CD8VF>;
-defm VPERMT2PS : avx512_perm_t_sizes<0x7F, "vpermt2ps", AVX512_PERM2_F,
+defm VPERMT2PS : avx512_perm_t_sizes<0x7F, "vpermt2ps", WriteFVarShuffle256,
avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
-defm VPERMT2PD : avx512_perm_t_sizes<0x7F, "vpermt2pd", AVX512_PERM2_F,
+defm VPERMT2PD : avx512_perm_t_sizes<0x7F, "vpermt2pd", WriteFVarShuffle256,
avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
//===----------------------------------------------------------------------===//
// AVX-512 - BLEND using mask
//
-let Sched = WriteFVarBlend in
-def AVX512_BLENDM : OpndItins<
- IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM
->;
-
-let Sched = WriteVarBlend in
-def AVX512_PBLENDM : OpndItins<
- IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
->;
-
-multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, OpndItins itins,
- X86VectorVTInfo _> {
+multiclass WriteFVarBlendask<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
def rr : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr,
- "\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"),
- [], itins.rr>, EVEX_4V, Sched<[itins.Sched]>;
+ "\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"), []>,
+ EVEX_4V, Sched<[sched]>;
def rrk : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
- [], itins.rr>, EVEX_4V, EVEX_K, Sched<[itins.Sched]>;
+ []>, EVEX_4V, EVEX_K, Sched<[sched]>;
def rrkz : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"),
- [], itins.rr>, EVEX_4V, EVEX_KZ, Sched<[itins.Sched]>;
+ []>, EVEX_4V, EVEX_KZ, Sched<[sched]>, NotMemoryFoldable;
let mayLoad = 1 in {
def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"),
- [], itins.rm>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ []>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, ReadAfterLd]>;
def rmk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
- [], itins.rm>, EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ []>, EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, ReadAfterLd]>;
def rmkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"),
- [], itins.rm>, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ []>, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, ReadAfterLd]>, NotMemoryFoldable;
}
}
}
-multiclass avx512_blendmask_rmb<bits<8> opc, string OpcodeStr, OpndItins itins,
- X86VectorVTInfo _> {
+multiclass WriteFVarBlendask_rmb<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let mayLoad = 1, hasSideEffects = 0 in {
def rmbk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2),
!strconcat(OpcodeStr,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
- "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
- [], itins.rm>, EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), []>,
+ EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, ReadAfterLd]>;
+
+ def rmbkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2),
+ !strconcat(OpcodeStr,
+ "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}} {z}|",
+ "$dst {${mask}} {z}, $src1, ${src2}", _.BroadcastStr, "}"), []>,
+ EVEX_4V, EVEX_KZ, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, ReadAfterLd]>, NotMemoryFoldable;
def rmb : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2),
!strconcat(OpcodeStr,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst|",
- "$dst, $src1, ${src2}", _.BroadcastStr, "}"),
- [], itins.rm>, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ "$dst, $src1, ${src2}", _.BroadcastStr, "}"), []>,
+ EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
-multiclass blendmask_dq <bits<8> opc, string OpcodeStr, OpndItins itins,
- AVX512VLVectorVTInfo VTInfo> {
- defm Z : avx512_blendmask <opc, OpcodeStr, itins, VTInfo.info512>,
- avx512_blendmask_rmb <opc, OpcodeStr, itins, VTInfo.info512>, EVEX_V512;
+multiclass blendmask_dq<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo VTInfo> {
+ defm Z : WriteFVarBlendask<opc, OpcodeStr, sched.ZMM, VTInfo.info512>,
+ WriteFVarBlendask_rmb<opc, OpcodeStr, sched.ZMM, VTInfo.info512>,
+ EVEX_V512;
let Predicates = [HasVLX] in {
- defm Z256 : avx512_blendmask<opc, OpcodeStr, itins, VTInfo.info256>,
- avx512_blendmask_rmb<opc, OpcodeStr, itins, VTInfo.info256>, EVEX_V256;
- defm Z128 : avx512_blendmask<opc, OpcodeStr, itins, VTInfo.info128>,
- avx512_blendmask_rmb<opc, OpcodeStr, itins, VTInfo.info128>, EVEX_V128;
+ defm Z256 : WriteFVarBlendask<opc, OpcodeStr, sched.YMM, VTInfo.info256>,
+ WriteFVarBlendask_rmb<opc, OpcodeStr, sched.YMM, VTInfo.info256>,
+ EVEX_V256;
+ defm Z128 : WriteFVarBlendask<opc, OpcodeStr, sched.XMM, VTInfo.info128>,
+ WriteFVarBlendask_rmb<opc, OpcodeStr, sched.XMM, VTInfo.info128>,
+ EVEX_V128;
}
}
-multiclass blendmask_bw <bits<8> opc, string OpcodeStr, OpndItins itins,
- AVX512VLVectorVTInfo VTInfo> {
+multiclass blendmask_bw<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo VTInfo> {
let Predicates = [HasBWI] in
- defm Z : avx512_blendmask<opc, OpcodeStr, itins, VTInfo.info512>, EVEX_V512;
+ defm Z : WriteFVarBlendask<opc, OpcodeStr, sched.ZMM, VTInfo.info512>,
+ EVEX_V512;
let Predicates = [HasBWI, HasVLX] in {
- defm Z256 : avx512_blendmask<opc, OpcodeStr, itins, VTInfo.info256>, EVEX_V256;
- defm Z128 : avx512_blendmask<opc, OpcodeStr, itins, VTInfo.info128>, EVEX_V128;
+ defm Z256 : WriteFVarBlendask<opc, OpcodeStr, sched.YMM, VTInfo.info256>,
+ EVEX_V256;
+ defm Z128 : WriteFVarBlendask<opc, OpcodeStr, sched.XMM, VTInfo.info128>,
+ EVEX_V128;
}
}
-
-defm VBLENDMPS : blendmask_dq <0x65, "vblendmps", AVX512_BLENDM, avx512vl_f32_info>;
-defm VBLENDMPD : blendmask_dq <0x65, "vblendmpd", AVX512_BLENDM, avx512vl_f64_info>, VEX_W;
-defm VPBLENDMD : blendmask_dq <0x64, "vpblendmd", AVX512_PBLENDM, avx512vl_i32_info>;
-defm VPBLENDMQ : blendmask_dq <0x64, "vpblendmq", AVX512_PBLENDM, avx512vl_i64_info>, VEX_W;
-defm VPBLENDMB : blendmask_bw <0x66, "vpblendmb", AVX512_PBLENDM, avx512vl_i8_info>;
-defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", AVX512_PBLENDM, avx512vl_i16_info>, VEX_W;
-
+defm VBLENDMPS : blendmask_dq<0x65, "vblendmps", SchedWriteFVarBlend,
+ avx512vl_f32_info>;
+defm VBLENDMPD : blendmask_dq<0x65, "vblendmpd", SchedWriteFVarBlend,
+ avx512vl_f64_info>, VEX_W;
+defm VPBLENDMD : blendmask_dq<0x64, "vpblendmd", SchedWriteVarBlend,
+ avx512vl_i32_info>;
+defm VPBLENDMQ : blendmask_dq<0x64, "vpblendmq", SchedWriteVarBlend,
+ avx512vl_i64_info>, VEX_W;
+defm VPBLENDMB : blendmask_bw<0x66, "vpblendmb", SchedWriteVarBlend,
+ avx512vl_i8_info>;
+defm VPBLENDMW : blendmask_bw<0x66, "vpblendmw", SchedWriteVarBlend,
+ avx512vl_i16_info>, VEX_W;
//===----------------------------------------------------------------------===//
// Compare Instructions
@@ -1911,7 +2050,7 @@ defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", AVX512_PBLENDM, avx512vl_i16_i
// avx512_cmp_scalar - AVX512 CMPSS and CMPSD
multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd,
- OpndItins itins> {
+ X86FoldableSchedWrite sched> {
defm rr_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst),
(ins _.RC:$src1, _.RC:$src2, AVXCC:$cc),
@@ -1919,7 +2058,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- imm:$cc), itins.rr>, EVEX_4V, Sched<[itins.Sched]>;
+ imm:$cc)>, EVEX_4V, Sched<[sched]>;
let mayLoad = 1 in
defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
@@ -1927,8 +2066,8 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd,
"vcmp${cc}"#_.Suffix,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2,
- imm:$cc), itins.rm>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ imm:$cc)>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>,
+ Sched<[sched.Folded, ReadAfterLd]>;
defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst),
@@ -1938,31 +2077,31 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd,
(OpNodeRnd (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
imm:$cc,
- (i32 FROUND_NO_EXC)), itins.rr>,
- EVEX_4V, EVEX_B, Sched<[itins.Sched]>;
+ (i32 FROUND_NO_EXC))>,
+ EVEX_4V, EVEX_B, Sched<[sched]>;
// Accept explicit immediate argument form instead of comparison code.
let isAsmParserOnly = 1, hasSideEffects = 0 in {
defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
(outs VK1:$dst),
(ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
- "$cc, $src2, $src1", "$src1, $src2, $cc", itins.rr>, EVEX_4V,
- Sched<[itins.Sched]>;
+ "$cc, $src2, $src1", "$src1, $src2, $cc">, EVEX_4V,
+ Sched<[sched]>, NotMemoryFoldable;
let mayLoad = 1 in
defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
- "$cc, $src2, $src1", "$src1, $src2, $cc", itins.rm>,
+ "$cc, $src2, $src1", "$src1, $src2, $cc">,
EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>, NotMemoryFoldable;
defm rrb_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst),
(ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
- "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc", itins.rr>,
- EVEX_4V, EVEX_B, Sched<[itins.Sched]>;
+ "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc">,
+ EVEX_4V, EVEX_B, Sched<[sched]>, NotMemoryFoldable;
}// let isAsmParserOnly = 1, hasSideEffects = 0
let isCodeGenOnly = 1 in {
@@ -1973,8 +2112,8 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.KRC:$dst, (OpNode _.FRC:$src1,
_.FRC:$src2,
- imm:$cc))],
- itins.rr>, EVEX_4V, Sched<[itins.Sched]>;
+ imm:$cc))]>,
+ EVEX_4V, Sched<[sched]>;
def rm : AVX512Ii8<0xC2, MRMSrcMem,
(outs _.KRC:$dst),
(ins _.FRC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc),
@@ -1982,43 +2121,44 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.KRC:$dst, (OpNode _.FRC:$src1,
(_.ScalarLdFrag addr:$src2),
- imm:$cc))],
- itins.rm>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ imm:$cc))]>,
+ EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
let Predicates = [HasAVX512] in {
let ExeDomain = SSEPackedSingle in
defm VCMPSSZ : avx512_cmp_scalar<f32x_info, X86cmpms, X86cmpmsRnd,
- SSE_ALU_F32S>, AVX512XSIi8Base;
+ SchedWriteFCmp.Scl>, AVX512XSIi8Base;
let ExeDomain = SSEPackedDouble in
defm VCMPSDZ : avx512_cmp_scalar<f64x_info, X86cmpms, X86cmpmsRnd,
- SSE_ALU_F64S>, AVX512XDIi8Base, VEX_W;
+ SchedWriteFCmp.Scl>, AVX512XDIi8Base, VEX_W;
}
-multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _, bit IsCommutable> {
+multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, PatFrag OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _,
+ bit IsCommutable> {
let isCommutable = IsCommutable in
def rr : AVX512BI<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))],
- itins.rr>, EVEX_4V, Sched<[itins.Sched]>;
+ [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))]>,
+ EVEX_4V, Sched<[sched]>;
def rm : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
- (_.VT (bitconvert (_.LdFrag addr:$src2)))))],
- itins.rm>, EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.VT (bitconvert (_.LdFrag addr:$src2)))))]>,
+ EVEX_4V, Sched<[sched.Folded, ReadAfterLd]>;
let isCommutable = IsCommutable in
def rrk : AVX512BI<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2}"),
[(set _.KRC:$dst, (and _.KRCWM:$mask,
- (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))))],
- itins.rr>, EVEX_4V, EVEX_K, Sched<[itins.Sched]>;
+ (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))))]>,
+ EVEX_4V, EVEX_K, Sched<[sched]>;
def rmk : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
@@ -2026,20 +2166,21 @@ multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set _.KRC:$dst, (and _.KRCWM:$mask,
(OpNode (_.VT _.RC:$src1),
(_.VT (bitconvert
- (_.LdFrag addr:$src2))))))],
- itins.rm>, EVEX_4V, EVEX_K, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.LdFrag addr:$src2))))))]>,
+ EVEX_4V, EVEX_K, Sched<[sched.Folded, ReadAfterLd]>;
}
-multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _, bit IsCommutable> :
- avx512_icmp_packed<opc, OpcodeStr, OpNode, itins, _, IsCommutable> {
+multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, PatFrag OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _,
+ bit IsCommutable> :
+ avx512_icmp_packed<opc, OpcodeStr, OpNode, sched, _, IsCommutable> {
def rmb : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2),
!strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst",
"|$dst, $src1, ${src2}", _.BroadcastStr, "}"),
[(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
- (X86VBroadcast (_.ScalarLdFrag addr:$src2))))],
- itins.rm>, EVEX_4V, EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (X86VBroadcast (_.ScalarLdFrag addr:$src2))))]>,
+ EVEX_4V, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>;
def rmbk : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
_.ScalarMemOp:$src2),
@@ -2049,112 +2190,110 @@ multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set _.KRC:$dst, (and _.KRCWM:$mask,
(OpNode (_.VT _.RC:$src1),
(X86VBroadcast
- (_.ScalarLdFrag addr:$src2)))))],
- itins.rm>, EVEX_4V, EVEX_K, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.ScalarLdFrag addr:$src2)))))]>,
+ EVEX_4V, EVEX_K, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
-multiclass avx512_icmp_packed_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo VTInfo,
- Predicate prd, bit IsCommutable = 0> {
+multiclass avx512_icmp_packed_vl<bits<8> opc, string OpcodeStr, PatFrag OpNode,
+ X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo VTInfo, Predicate prd,
+ bit IsCommutable = 0> {
let Predicates = [prd] in
- defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, itins, VTInfo.info512,
- IsCommutable>, EVEX_V512;
+ defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, sched.ZMM,
+ VTInfo.info512, IsCommutable>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, itins, VTInfo.info256,
- IsCommutable>, EVEX_V256;
- defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, itins, VTInfo.info128,
- IsCommutable>, EVEX_V128;
+ defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, sched.YMM,
+ VTInfo.info256, IsCommutable>, EVEX_V256;
+ defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, sched.XMM,
+ VTInfo.info128, IsCommutable>, EVEX_V128;
}
}
multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr,
- SDNode OpNode, OpndItins itins,
+ PatFrag OpNode, X86SchedWriteWidths sched,
AVX512VLVectorVTInfo VTInfo,
Predicate prd, bit IsCommutable = 0> {
let Predicates = [prd] in
- defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, itins, VTInfo.info512,
- IsCommutable>, EVEX_V512;
+ defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, sched.ZMM,
+ VTInfo.info512, IsCommutable>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, itins, VTInfo.info256,
- IsCommutable>, EVEX_V256;
- defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, itins, VTInfo.info128,
- IsCommutable>, EVEX_V128;
- }
-}
-
-// FIXME: Is there a better scheduler itinerary for VPCMP?
-defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm,
- SSE_ALU_F32P, avx512vl_i8_info, HasBWI, 1>,
+ defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, sched.YMM,
+ VTInfo.info256, IsCommutable>, EVEX_V256;
+ defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, sched.XMM,
+ VTInfo.info128, IsCommutable>, EVEX_V128;
+ }
+}
+
+// This fragment treats X86cmpm as commutable to help match loads in both
+// operands for PCMPEQ.
+def X86setcc_commute : SDNode<"ISD::SETCC", SDTSetCC, [SDNPCommutative]>;
+def X86pcmpeqm_c : PatFrag<(ops node:$src1, node:$src2),
+ (X86setcc_commute node:$src1, node:$src2, SETEQ)>;
+def X86pcmpgtm : PatFrag<(ops node:$src1, node:$src2),
+ (setcc node:$src1, node:$src2, SETGT)>;
+
+// AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't
+// increase the pattern complexity the way an immediate would.
+let AddedComplexity = 2 in {
+// FIXME: Is there a better scheduler class for VPCMP?
+defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm_c,
+ SchedWriteVecALU, avx512vl_i8_info, HasBWI, 1>,
EVEX_CD8<8, CD8VF>, VEX_WIG;
-defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm,
- SSE_ALU_F32P, avx512vl_i16_info, HasBWI, 1>,
+defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm_c,
+ SchedWriteVecALU, avx512vl_i16_info, HasBWI, 1>,
EVEX_CD8<16, CD8VF>, VEX_WIG;
-defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm,
- SSE_ALU_F32P, avx512vl_i32_info, HasAVX512, 1>,
+defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm_c,
+ SchedWriteVecALU, avx512vl_i32_info, HasAVX512, 1>,
EVEX_CD8<32, CD8VF>;
-defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm,
- SSE_ALU_F32P, avx512vl_i64_info, HasAVX512, 1>,
+defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm_c,
+ SchedWriteVecALU, avx512vl_i64_info, HasAVX512, 1>,
T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm,
- SSE_ALU_F32P, avx512vl_i8_info, HasBWI>,
+ SchedWriteVecALU, avx512vl_i8_info, HasBWI>,
EVEX_CD8<8, CD8VF>, VEX_WIG;
defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm,
- SSE_ALU_F32P, avx512vl_i16_info, HasBWI>,
+ SchedWriteVecALU, avx512vl_i16_info, HasBWI>,
EVEX_CD8<16, CD8VF>, VEX_WIG;
defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm,
- SSE_ALU_F32P, avx512vl_i32_info, HasAVX512>,
+ SchedWriteVecALU, avx512vl_i32_info, HasAVX512>,
EVEX_CD8<32, CD8VF>;
defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm,
- SSE_ALU_F32P, avx512vl_i64_info, HasAVX512>,
+ SchedWriteVecALU, avx512vl_i64_info, HasAVX512>,
T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
+}
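X86pcmpeqm_c marks the equality setcc as commutable so that a load feeding either operand can be folded into the memory form of VPCMPEQ*, and the AddedComplexity bump keeps these patterns ahead of the generic setcc ones, since the explicit SETEQ/SETGT CondCode does not raise pattern complexity the way an immediate operand would. A short C++ illustration, assuming the standard AVX-512F intrinsics (the function names are invented): equality commutes, so both variants below can use the load-folding form of VPCMPEQD.

#include <immintrin.h>

// Load on the left-hand side of the compare...
__mmask16 eq_load_lhs(const void *p, __m512i x) {
  return _mm512_cmpeq_epi32_mask(_mm512_loadu_si512(p), x);
}

// ...and load on the right-hand side. The commutable fragment lets both
// select the rm form of VPCMPEQD with the load folded into the instruction.
__mmask16 eq_load_rhs(__m512i x, const void *p) {
  return _mm512_cmpeq_epi32_mask(x, _mm512_loadu_si512(p));
}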
-// Transforms to swizzle an immediate to help matching memory operand in first
-// operand.
-def CommutePCMPCC : SDNodeXForm<imm, [{
- uint8_t Imm = N->getZExtValue() & 0x7;
- switch (Imm) {
- default: llvm_unreachable("Unreachable!");
- case 0x01: Imm = 0x06; break; // LT -> NLE
- case 0x02: Imm = 0x05; break; // LE -> NLT
- case 0x05: Imm = 0x02; break; // NLT -> LE
- case 0x06: Imm = 0x01; break; // NLE -> LT
- case 0x00: // EQ
- case 0x03: // FALSE
- case 0x04: // NE
- case 0x07: // TRUE
- break;
- }
- return getI8Imm(Imm, SDLoc(N));
-}]>;
-
-multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> {
+multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
+ PatFrag CommFrag, X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, string Name> {
let isCommutable = 1 in
def rri : AVX512AIi8<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512ICC:$cc),
!strconcat("vpcmp${cc}", Suffix,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- imm:$cc))],
- itins.rr>, EVEX_4V, Sched<[itins.Sched]>;
+ [(set _.KRC:$dst, (_.KVT (Frag:$cc (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ cond)))]>,
+ EVEX_4V, Sched<[sched]>;
def rmi : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, AVX512ICC:$cc),
!strconcat("vpcmp${cc}", Suffix,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
- (_.VT (bitconvert (_.LdFrag addr:$src2))),
- imm:$cc))],
- itins.rm>, EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ [(set _.KRC:$dst, (_.KVT
+ (Frag:$cc
+ (_.VT _.RC:$src1),
+ (_.VT (bitconvert (_.LdFrag addr:$src2))),
+ cond)))]>,
+ EVEX_4V, Sched<[sched.Folded, ReadAfterLd]>;
let isCommutable = 1 in
def rrik : AVX512AIi8<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2,
@@ -2163,9 +2302,10 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
"\t{$src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2}"),
[(set _.KRC:$dst, (and _.KRCWM:$mask,
- (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- imm:$cc)))],
- itins.rr>, EVEX_4V, EVEX_K, Sched<[itins.Sched]>;
+ (_.KVT (Frag:$cc (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ cond))))]>,
+ EVEX_4V, EVEX_K, Sched<[sched]>;
def rmik : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2,
AVX512ICC:$cc),
@@ -2173,69 +2313,74 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
"\t{$src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2}"),
[(set _.KRC:$dst, (and _.KRCWM:$mask,
- (OpNode (_.VT _.RC:$src1),
- (_.VT (bitconvert (_.LdFrag addr:$src2))),
- imm:$cc)))],
- itins.rm>, EVEX_4V, EVEX_K,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.KVT
+ (Frag:$cc
+ (_.VT _.RC:$src1),
+ (_.VT (bitconvert
+ (_.LdFrag addr:$src2))),
+ cond))))]>,
+ EVEX_4V, EVEX_K, Sched<[sched.Folded, ReadAfterLd]>;
// Accept explicit immediate argument form instead of comparison code.
let isAsmParserOnly = 1, hasSideEffects = 0 in {
def rri_alt : AVX512AIi8<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
!strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|",
- "$dst, $src1, $src2, $cc}"),
- [], itins.rr>, EVEX_4V, Sched<[itins.Sched]>;
+ "$dst, $src1, $src2, $cc}"), []>,
+ EVEX_4V, Sched<[sched]>, NotMemoryFoldable;
let mayLoad = 1 in
def rmi_alt : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
!strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|",
- "$dst, $src1, $src2, $cc}"),
- [], itins.rm>, EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ "$dst, $src1, $src2, $cc}"), []>,
+ EVEX_4V, Sched<[sched.Folded, ReadAfterLd]>, NotMemoryFoldable;
def rrik_alt : AVX512AIi8<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2,
u8imm:$cc),
!strconcat("vpcmp", Suffix,
"\t{$cc, $src2, $src1, $dst {${mask}}|",
- "$dst {${mask}}, $src1, $src2, $cc}"),
- [], itins.rr>, EVEX_4V, EVEX_K, Sched<[itins.Sched]>;
+ "$dst {${mask}}, $src1, $src2, $cc}"), []>,
+ EVEX_4V, EVEX_K, Sched<[sched]>, NotMemoryFoldable;
let mayLoad = 1 in
def rmik_alt : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2,
u8imm:$cc),
!strconcat("vpcmp", Suffix,
"\t{$cc, $src2, $src1, $dst {${mask}}|",
- "$dst {${mask}}, $src1, $src2, $cc}"),
- [], itins.rm>, EVEX_4V, EVEX_K,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ "$dst {${mask}}, $src1, $src2, $cc}"), []>,
+ EVEX_4V, EVEX_K, Sched<[sched.Folded, ReadAfterLd]>,
+ NotMemoryFoldable;
}
- def : Pat<(OpNode (bitconvert (_.LdFrag addr:$src2)),
- (_.VT _.RC:$src1), imm:$cc),
- (!cast<Instruction>(NAME#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2,
- (CommutePCMPCC imm:$cc))>;
+ def : Pat<(_.KVT (CommFrag:$cc (bitconvert (_.LdFrag addr:$src2)),
+ (_.VT _.RC:$src1), cond)),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmi")
+ _.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>;
- def : Pat<(and _.KRCWM:$mask, (OpNode (bitconvert (_.LdFrag addr:$src2)),
- (_.VT _.RC:$src1), imm:$cc)),
- (!cast<Instruction>(NAME#_.ZSuffix#"rmik") _.KRCWM:$mask,
- _.RC:$src1, addr:$src2,
- (CommutePCMPCC imm:$cc))>;
+ def : Pat<(and _.KRCWM:$mask,
+ (_.KVT (CommFrag:$cc (bitconvert (_.LdFrag addr:$src2)),
+ (_.VT _.RC:$src1), cond))),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmik")
+ _.KRCWM:$mask, _.RC:$src1, addr:$src2,
+ (CommFrag.OperandTransform $cc))>;
}
-multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> :
- avx512_icmp_cc<opc, Suffix, OpNode, itins, _> {
+multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag,
+ PatFrag CommFrag, X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, string Name> :
+ avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched, _, Name> {
def rmib : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2,
AVX512ICC:$cc),
!strconcat("vpcmp${cc}", Suffix,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst|",
"$dst, $src1, ${src2}", _.BroadcastStr, "}"),
- [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
- (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
- imm:$cc))],
- itins.rm>, EVEX_4V, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ [(set _.KRC:$dst, (_.KVT (Frag:$cc
+ (_.VT _.RC:$src1),
+ (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2)),
+ cond)))]>,
+ EVEX_4V, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>;
def rmibk : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
_.ScalarMemOp:$src2, AVX512ICC:$cc),
@@ -2243,11 +2388,12 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
[(set _.KRC:$dst, (and _.KRCWM:$mask,
- (OpNode (_.VT _.RC:$src1),
- (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
- imm:$cc)))],
- itins.rm>, EVEX_4V, EVEX_K, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.KVT (Frag:$cc
+ (_.VT _.RC:$src1),
+ (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2)),
+ cond))))]>,
+ EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>;
// Accept explicit immediate argument form instead of comparison code.
let isAsmParserOnly = 1, hasSideEffects = 0, mayLoad = 1 in {
@@ -2256,99 +2402,142 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
u8imm:$cc),
!strconcat("vpcmp", Suffix,
"\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst|",
- "$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
- [], itins.rm>, EVEX_4V, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ "$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"), []>,
+ EVEX_4V, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>,
+ NotMemoryFoldable;
def rmibk_alt : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
_.ScalarMemOp:$src2, u8imm:$cc),
!strconcat("vpcmp", Suffix,
"\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
- "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
- [], itins.rm>, EVEX_4V, EVEX_K, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, ", $cc}"), []>,
+ EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>,
+ NotMemoryFoldable;
}
- def : Pat<(OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
- (_.VT _.RC:$src1), imm:$cc),
- (!cast<Instruction>(NAME#_.ZSuffix#"rmib") _.RC:$src1, addr:$src2,
- (CommutePCMPCC imm:$cc))>;
+ def : Pat<(_.KVT (CommFrag:$cc (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
+ (_.VT _.RC:$src1), cond)),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmib")
+ _.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>;
- def : Pat<(and _.KRCWM:$mask, (OpNode (X86VBroadcast
- (_.ScalarLdFrag addr:$src2)),
- (_.VT _.RC:$src1), imm:$cc)),
- (!cast<Instruction>(NAME#_.ZSuffix#"rmibk") _.KRCWM:$mask,
- _.RC:$src1, addr:$src2,
- (CommutePCMPCC imm:$cc))>;
+ def : Pat<(and _.KRCWM:$mask,
+ (_.KVT (CommFrag:$cc (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2)),
+ (_.VT _.RC:$src1), cond))),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmibk")
+ _.KRCWM:$mask, _.RC:$src1, addr:$src2,
+ (CommFrag.OperandTransform $cc))>;
}
-multiclass avx512_icmp_cc_vl<bits<8> opc, string Suffix, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo VTInfo,
- Predicate prd> {
+multiclass avx512_icmp_cc_vl<bits<8> opc, string Suffix, PatFrag Frag,
+ PatFrag CommFrag, X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo VTInfo, Predicate prd> {
let Predicates = [prd] in
- defm Z : avx512_icmp_cc<opc, Suffix, OpNode, itins, VTInfo.info512>,
- EVEX_V512;
+ defm Z : avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched.ZMM,
+ VTInfo.info512, NAME>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_icmp_cc<opc, Suffix, OpNode, itins, VTInfo.info256>,
- EVEX_V256;
- defm Z128 : avx512_icmp_cc<opc, Suffix, OpNode, itins, VTInfo.info128>,
- EVEX_V128;
+ defm Z256 : avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched.YMM,
+ VTInfo.info256, NAME>, EVEX_V256;
+ defm Z128 : avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched.XMM,
+ VTInfo.info128, NAME>, EVEX_V128;
}
}
-multiclass avx512_icmp_cc_rmb_vl<bits<8> opc, string Suffix, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo VTInfo,
- Predicate prd> {
+multiclass avx512_icmp_cc_rmb_vl<bits<8> opc, string Suffix, PatFrag Frag,
+ PatFrag CommFrag, X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo VTInfo, Predicate prd> {
let Predicates = [prd] in
- defm Z : avx512_icmp_cc_rmb<opc, Suffix, OpNode, itins, VTInfo.info512>,
- EVEX_V512;
+ defm Z : avx512_icmp_cc_rmb<opc, Suffix, Frag, CommFrag, sched.ZMM,
+ VTInfo.info512, NAME>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, itins, VTInfo.info256>,
- EVEX_V256;
- defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, itins, VTInfo.info128>,
- EVEX_V128;
+ defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, Frag, CommFrag, sched.YMM,
+ VTInfo.info256, NAME>, EVEX_V256;
+ defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, Frag, CommFrag, sched.XMM,
+ VTInfo.info128, NAME>, EVEX_V128;
}
}
-// FIXME: Is there a better scheduler itinerary for VPCMP/VPCMPU?
-defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86cmpm, SSE_ALU_F32P,
- avx512vl_i8_info, HasBWI>, EVEX_CD8<8, CD8VF>;
-defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86cmpmu, SSE_ALU_F32P,
- avx512vl_i8_info, HasBWI>, EVEX_CD8<8, CD8VF>;
+def X86pcmpm_imm : SDNodeXForm<setcc, [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ uint8_t SSECC = X86::getVPCMPImmForCond(CC);
+ return getI8Imm(SSECC, SDLoc(N));
+}]>;
+
+// Swapped operand version of the above.
+def X86pcmpm_imm_commute : SDNodeXForm<setcc, [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ uint8_t SSECC = X86::getVPCMPImmForCond(CC);
+ SSECC = X86::getSwappedVPCMPImm(SSECC);
+ return getI8Imm(SSECC, SDLoc(N));
+}]>;
-defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86cmpm, SSE_ALU_F32P,
- avx512vl_i16_info, HasBWI>,
+def X86pcmpm : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (setcc node:$src1, node:$src2, node:$cc), [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ return !ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm>;
+
+// Same as above, but commutes immediate. Use for load folding.
+def X86pcmpm_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (setcc node:$src1, node:$src2, node:$cc), [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ return !ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm_commute>;
+
+def X86pcmpum : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (setcc node:$src1, node:$src2, node:$cc), [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ return ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm>;
+
+// Same as above, but commutes immediate. Use for load folding.
+def X86pcmpum_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (setcc node:$src1, node:$src2, node:$cc), [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ return ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm_commute>;
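X86pcmpm_imm turns the setcc condition code into the VPCMP immediate via X86::getVPCMPImmForCond, and X86pcmpm_imm_commute additionally swaps it (X86::getSwappedVPCMPImm) for the load-folding patterns where the memory operand ends up on the left. The immediate values themselves are not spelled out in this hunk; the sketch below assumes they follow the table of the CommutePCMPCC transform this patch removes (EQ=0, LT=1, LE=2, FALSE=3, NE=4, NLT=5, NLE=6, TRUE=7), and the helper name is invented for the example.

// Sketch of the operand-swap mapping implied by the removed CommutePCMPCC
// transform; the in-tree helpers are X86::getVPCMPImmForCond and
// X86::getSwappedVPCMPImm, whose exact implementations are not shown here.
unsigned swapVPCMPImm(unsigned Imm) {
  switch (Imm & 0x7) {
  case 0x01: return 0x06; // LT  -> NLE:  a <  b  becomes  b > a
  case 0x02: return 0x05; // LE  -> NLT:  a <= b  becomes  b >= a
  case 0x05: return 0x02; // NLT -> LE
  case 0x06: return 0x01; // NLE -> LT
  default:   return Imm;  // EQ, NE, FALSE, TRUE are symmetric under a swap
  }
}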
+
+// FIXME: Is there a better scheduler class for VPCMP/VPCMPU?
+defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86pcmpm, X86pcmpm_commute,
+ SchedWriteVecALU, avx512vl_i8_info, HasBWI>,
+ EVEX_CD8<8, CD8VF>;
+defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86pcmpum, X86pcmpum_commute,
+ SchedWriteVecALU, avx512vl_i8_info, HasBWI>,
+ EVEX_CD8<8, CD8VF>;
+
+defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86pcmpm, X86pcmpm_commute,
+ SchedWriteVecALU, avx512vl_i16_info, HasBWI>,
VEX_W, EVEX_CD8<16, CD8VF>;
-defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86cmpmu, SSE_ALU_F32P,
- avx512vl_i16_info, HasBWI>,
+defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86pcmpum, X86pcmpum_commute,
+ SchedWriteVecALU, avx512vl_i16_info, HasBWI>,
VEX_W, EVEX_CD8<16, CD8VF>;
-defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86cmpm, SSE_ALU_F32P,
- avx512vl_i32_info, HasAVX512>,
- EVEX_CD8<32, CD8VF>;
-defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86cmpmu, SSE_ALU_F32P,
- avx512vl_i32_info, HasAVX512>,
- EVEX_CD8<32, CD8VF>;
-
-defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86cmpm, SSE_ALU_F32P,
- avx512vl_i64_info, HasAVX512>,
- VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, SSE_ALU_F32P,
- avx512vl_i64_info, HasAVX512>,
- VEX_W, EVEX_CD8<64, CD8VF>;
-
-
-multiclass avx512_vcmp_common<OpndItins itins, X86VectorVTInfo _> {
+defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86pcmpm, X86pcmpm_commute,
+ SchedWriteVecALU, avx512vl_i32_info,
+ HasAVX512>, EVEX_CD8<32, CD8VF>;
+defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86pcmpum, X86pcmpum_commute,
+ SchedWriteVecALU, avx512vl_i32_info,
+ HasAVX512>, EVEX_CD8<32, CD8VF>;
+
+defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86pcmpm, X86pcmpm_commute,
+ SchedWriteVecALU, avx512vl_i64_info,
+ HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86pcmpum, X86pcmpum_commute,
+ SchedWriteVecALU, avx512vl_i64_info,
+ HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
+
+multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _,
+ string Name> {
defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2,AVXCC:$cc),
"vcmp${cc}"#_.Suffix,
"$src2, $src1", "$src1, $src2",
(X86cmpm (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- imm:$cc), itins.rr, 1>,
- Sched<[itins.Sched]>;
+ imm:$cc), 1>,
+ Sched<[sched]>;
defm rmi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),(ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc),
@@ -2356,8 +2545,8 @@ multiclass avx512_vcmp_common<OpndItins itins, X86VectorVTInfo _> {
"$src2, $src1", "$src1, $src2",
(X86cmpm (_.VT _.RC:$src1),
(_.VT (bitconvert (_.LdFrag addr:$src2))),
- imm:$cc), itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ imm:$cc)>,
+ Sched<[sched.Folded, ReadAfterLd]>;
defm rmbi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
@@ -2367,63 +2556,65 @@ multiclass avx512_vcmp_common<OpndItins itins, X86VectorVTInfo _> {
"$src1, ${src2}"##_.BroadcastStr,
(X86cmpm (_.VT _.RC:$src1),
(_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
- imm:$cc), itins.rm>,
- EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ imm:$cc)>,
+ EVEX_B, Sched<[sched.Folded, ReadAfterLd]>;
// Accept explicit immediate argument form instead of comparison code.
let isAsmParserOnly = 1, hasSideEffects = 0 in {
defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst),
(ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
- "$cc, $src2, $src1", "$src1, $src2, $cc", itins.rr>,
- Sched<[itins.Sched]>;
+ "$cc, $src2, $src1", "$src1, $src2, $cc">,
+ Sched<[sched]>, NotMemoryFoldable;
let mayLoad = 1 in {
defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
(ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
- "$cc, $src2, $src1", "$src1, $src2, $cc", itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ "$cc, $src2, $src1", "$src1, $src2, $cc">,
+ Sched<[sched.Folded, ReadAfterLd]>,
+ NotMemoryFoldable;
defm rmbi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
"$cc, ${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr##", $cc", itins.rm>,
- EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ "$src1, ${src2}"##_.BroadcastStr##", $cc">,
+ EVEX_B, Sched<[sched.Folded, ReadAfterLd]>,
+ NotMemoryFoldable;
}
}
// Patterns for selecting with loads in other operand.
def : Pat<(X86cmpm (_.LdFrag addr:$src2), (_.VT _.RC:$src1),
CommutableCMPCC:$cc),
- (!cast<Instruction>(NAME#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2,
+ (!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2,
imm:$cc)>;
def : Pat<(and _.KRCWM:$mask, (X86cmpm (_.LdFrag addr:$src2),
(_.VT _.RC:$src1),
CommutableCMPCC:$cc)),
- (!cast<Instruction>(NAME#_.ZSuffix#"rmik") _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#"rmik") _.KRCWM:$mask,
_.RC:$src1, addr:$src2,
imm:$cc)>;
def : Pat<(X86cmpm (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
(_.VT _.RC:$src1), CommutableCMPCC:$cc),
- (!cast<Instruction>(NAME#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2,
+ (!cast<Instruction>(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2,
imm:$cc)>;
def : Pat<(and _.KRCWM:$mask, (X86cmpm (X86VBroadcast
(_.ScalarLdFrag addr:$src2)),
(_.VT _.RC:$src1),
CommutableCMPCC:$cc)),
- (!cast<Instruction>(NAME#_.ZSuffix#"rmbik") _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask,
_.RC:$src1, addr:$src2,
imm:$cc)>;
}
-multiclass avx512_vcmp_sae<OpndItins itins, X86VectorVTInfo _> {
+multiclass avx512_vcmp_sae<X86FoldableSchedWrite sched, X86VectorVTInfo _> {
// comparison code form (VCMP[EQ/LT/LE/...]
defm rrib : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst),(ins _.RC:$src1, _.RC:$src2, AVXCC:$cc),
@@ -2432,8 +2623,8 @@ multiclass avx512_vcmp_sae<OpndItins itins, X86VectorVTInfo _> {
(X86cmpmRnd (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
imm:$cc,
- (i32 FROUND_NO_EXC)), itins.rr>,
- EVEX_B, Sched<[itins.Sched]>;
+ (i32 FROUND_NO_EXC))>,
+ EVEX_B, Sched<[sched]>;
let isAsmParserOnly = 1, hasSideEffects = 0 in {
defm rrib_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
@@ -2441,29 +2632,28 @@ multiclass avx512_vcmp_sae<OpndItins itins, X86VectorVTInfo _> {
(ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
"$cc, {sae}, $src2, $src1",
- "$src1, $src2, {sae}, $cc", itins.rr>,
- EVEX_B, Sched<[itins.Sched]>;
+ "$src1, $src2, {sae}, $cc">,
+ EVEX_B, Sched<[sched]>, NotMemoryFoldable;
}
}
-multiclass avx512_vcmp<OpndItins itins, AVX512VLVectorVTInfo _> {
+multiclass avx512_vcmp<X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_vcmp_common<itins, _.info512>,
- avx512_vcmp_sae<itins, _.info512>, EVEX_V512;
+ defm Z : avx512_vcmp_common<sched.ZMM, _.info512, NAME>,
+ avx512_vcmp_sae<sched.ZMM, _.info512>, EVEX_V512;
}
let Predicates = [HasAVX512,HasVLX] in {
- defm Z128 : avx512_vcmp_common<itins, _.info128>, EVEX_V128;
- defm Z256 : avx512_vcmp_common<itins, _.info256>, EVEX_V256;
+ defm Z128 : avx512_vcmp_common<sched.XMM, _.info128, NAME>, EVEX_V128;
+ defm Z256 : avx512_vcmp_common<sched.YMM, _.info256, NAME>, EVEX_V256;
}
}
-defm VCMPPD : avx512_vcmp<SSE_ALU_F64P, avx512vl_f64_info>,
+defm VCMPPD : avx512_vcmp<SchedWriteFCmp, avx512vl_f64_info>,
AVX512PDIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
-defm VCMPPS : avx512_vcmp<SSE_ALU_F32P, avx512vl_f32_info>,
+defm VCMPPS : avx512_vcmp<SchedWriteFCmp, avx512vl_f32_info>,
AVX512PSIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
-
// Patterns to select fp compares with load as first operand.
let Predicates = [HasAVX512] in {
def : Pat<(v1i1 (X86cmpms (loadf64 addr:$src2), FR64X:$src1,
@@ -2480,39 +2670,39 @@ let Predicates = [HasAVX512] in {
//handle fpclass instruction mask = op(reg_scalar,imm)
// op(mem_scalar,imm)
multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _,
Predicate prd> {
let Predicates = [prd], ExeDomain = _.ExeDomain in {
def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1),
- (i32 imm:$src2)))], itins.rr>,
- Sched<[itins.Sched]>;
+ (i32 imm:$src2)))]>,
+ Sched<[sched]>;
def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix#
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
- [(set _.KRC:$dst,(or _.KRCWM:$mask,
+ [(set _.KRC:$dst,(and _.KRCWM:$mask,
(OpNode (_.VT _.RC:$src1),
- (i32 imm:$src2))))], itins.rr>,
- EVEX_K, Sched<[itins.Sched]>;
+ (i32 imm:$src2))))]>,
+ EVEX_K, Sched<[sched]>;
def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.IntScalarMemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix##
"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,
(OpNode _.ScalarIntMemCPat:$src1,
- (i32 imm:$src2)))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 imm:$src2)))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.IntScalarMemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix##
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
- [(set _.KRC:$dst,(or _.KRCWM:$mask,
+ [(set _.KRC:$dst,(and _.KRCWM:$mask,
(OpNode _.ScalarIntMemCPat:$src1,
- (i32 imm:$src2))))], itins.rm>,
- EVEX_K, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 imm:$src2))))]>,
+ EVEX_K, Sched<[sched.Folded, ReadAfterLd]>;
}
}
@@ -2520,39 +2710,39 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
// fpclass(reg_vec, mem_vec, imm)
// fpclass(reg_vec, broadcast(eltVt), imm)
multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _,
string mem, string broadcast>{
let ExeDomain = _.ExeDomain in {
def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1),
- (i32 imm:$src2)))], itins.rr>,
- Sched<[itins.Sched]>;
+ (i32 imm:$src2)))]>,
+ Sched<[sched]>;
def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix#
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
- [(set _.KRC:$dst,(or _.KRCWM:$mask,
+ [(set _.KRC:$dst,(and _.KRCWM:$mask,
(OpNode (_.VT _.RC:$src1),
- (i32 imm:$src2))))], itins.rr>,
- EVEX_K, Sched<[itins.Sched]>;
+ (i32 imm:$src2))))]>,
+ EVEX_K, Sched<[sched]>;
def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.MemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix##mem#
"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,(OpNode
(_.VT (bitconvert (_.LdFrag addr:$src1))),
- (i32 imm:$src2)))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 imm:$src2)))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix##mem#
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
- [(set _.KRC:$dst, (or _.KRCWM:$mask, (OpNode
+ [(set _.KRC:$dst, (and _.KRCWM:$mask, (OpNode
(_.VT (bitconvert (_.LdFrag addr:$src1))),
- (i32 imm:$src2))))], itins.rm>,
- EVEX_K, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 imm:$src2))))]>,
+ EVEX_K, Sched<[sched.Folded, ReadAfterLd]>;
def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.ScalarMemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"##
@@ -2561,56 +2751,58 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set _.KRC:$dst,(OpNode
(_.VT (X86VBroadcast
(_.ScalarLdFrag addr:$src1))),
- (i32 imm:$src2)))], itins.rm>,
- EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 imm:$src2)))]>,
+ EVEX_B, Sched<[sched.Folded, ReadAfterLd]>;
def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"##
_.BroadcastStr##", $dst {${mask}}|$dst {${mask}}, ${src1}"##
_.BroadcastStr##", $src2}",
- [(set _.KRC:$dst,(or _.KRCWM:$mask, (OpNode
+ [(set _.KRC:$dst,(and _.KRCWM:$mask, (OpNode
(_.VT (X86VBroadcast
(_.ScalarLdFrag addr:$src1))),
- (i32 imm:$src2))))], itins.rm>,
- EVEX_B, EVEX_K, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 imm:$src2))))]>,
+ EVEX_B, EVEX_K, Sched<[sched.Folded, ReadAfterLd]>;
}
}
multiclass avx512_vector_fpclass_all<string OpcodeStr, AVX512VLVectorVTInfo _,
bits<8> opc, SDNode OpNode,
- OpndItins itins, Predicate prd,
+ X86SchedWriteWidths sched, Predicate prd,
string broadcast>{
let Predicates = [prd] in {
- defm Z : avx512_vector_fpclass<opc, OpcodeStr, OpNode, itins,
+ defm Z : avx512_vector_fpclass<opc, OpcodeStr, OpNode, sched.ZMM,
_.info512, "{z}", broadcast>, EVEX_V512;
}
let Predicates = [prd, HasVLX] in {
- defm Z128 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, itins,
+ defm Z128 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, sched.XMM,
_.info128, "{x}", broadcast>, EVEX_V128;
- defm Z256 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, itins,
+ defm Z256 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, sched.YMM,
_.info256, "{y}", broadcast>, EVEX_V256;
}
}
-// FIXME: Is there a better scheduler itinerary for VFPCLASS?
multiclass avx512_fp_fpclass_all<string OpcodeStr, bits<8> opcVec,
- bits<8> opcScalar, SDNode VecOpNode, SDNode ScalarOpNode, Predicate prd>{
+ bits<8> opcScalar, SDNode VecOpNode,
+ SDNode ScalarOpNode, X86SchedWriteWidths sched,
+ Predicate prd> {
defm PS : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f32_info, opcVec,
- VecOpNode, SSE_ALU_F32P, prd, "{l}">,
+ VecOpNode, sched, prd, "{l}">,
EVEX_CD8<32, CD8VF>;
defm PD : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f64_info, opcVec,
- VecOpNode, SSE_ALU_F64P, prd, "{q}">,
+ VecOpNode, sched, prd, "{q}">,
EVEX_CD8<64, CD8VF> , VEX_W;
- defm SS : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode,
- SSE_ALU_F32S, f32x_info, prd>,
- EVEX_CD8<32, CD8VT1>;
- defm SD : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode,
- SSE_ALU_F64S, f64x_info, prd>,
- EVEX_CD8<64, CD8VT1>, VEX_W;
+ defm SSZ : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode,
+ sched.Scl, f32x_info, prd>,
+ EVEX_CD8<32, CD8VT1>;
+ defm SDZ : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode,
+ sched.Scl, f64x_info, prd>,
+ EVEX_CD8<64, CD8VT1>, VEX_W;
}
defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, X86Vfpclass,
- X86Vfpclasss, HasDQI>, AVX512AIi8Base,EVEX;
+ X86Vfpclasss, SchedWriteFCmp, HasDQI>,
+ AVX512AIi8Base, EVEX;
//-----------------------------------------------------------------
// Mask register copy, including
@@ -2621,16 +2813,18 @@ defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, X86Vfpclass,
multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk,
string OpcodeStr, RegisterClass KRC,
ValueType vvt, X86MemOperand x86memop> {
- let hasSideEffects = 0, SchedRW = [WriteMove] in
+ let isMoveReg = 1, hasSideEffects = 0, SchedRW = [WriteMove] in
def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [],
- IIC_SSE_MOVDQ>;
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
+ Sched<[WriteMove]>;
def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set KRC:$dst, (vvt (load addr:$src)))], IIC_SSE_MOVDQ>;
+ [(set KRC:$dst, (vvt (load addr:$src)))]>,
+ Sched<[WriteLoad]>;
def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(store KRC:$src, addr:$dst)], IIC_SSE_MOVDQ>;
+ [(store KRC:$src, addr:$dst)]>,
+ Sched<[WriteStore]>;
}
multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk,
@@ -2638,11 +2832,11 @@ multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk,
RegisterClass KRC, RegisterClass GRC> {
let hasSideEffects = 0 in {
def kr : I<opc_kr, MRMSrcReg, (outs KRC:$dst), (ins GRC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [],
- IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
+ Sched<[WriteMove]>;
def rk : I<opc_rk, MRMSrcReg, (outs GRC:$dst), (ins KRC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [],
- IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
+ Sched<[WriteMove]>;
}
}
@@ -2684,8 +2878,6 @@ def : Pat<(i32 (anyext (i16 (bitconvert (v16i1 VK16:$src))))),
(COPY_TO_REGCLASS VK16:$src, GR32)>;
def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))),
- (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK8:$src, GR32)), sub_8bit))>, Requires<[NoDQI]>;
-def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))),
(KMOVBrk VK8:$src)>, Requires<[HasDQI]>;
def : Pat<(i32 (anyext (i8 (bitconvert (v8i1 VK8:$src))))),
(COPY_TO_REGCLASS VK8:$src, GR32)>;
@@ -2701,66 +2893,20 @@ def : Pat<(i64 (bitconvert (v64i1 VK64:$src))),
// Load/store kreg
let Predicates = [HasDQI] in {
- def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
- (KMOVBmk addr:$dst, VK8:$src)>;
- def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
- (KMOVBkm addr:$src)>;
-
- def : Pat<(store VK4:$src, addr:$dst),
- (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK4:$src, VK8))>;
- def : Pat<(store VK2:$src, addr:$dst),
- (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK2:$src, VK8))>;
def : Pat<(store VK1:$src, addr:$dst),
(KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK1:$src, VK8))>;
+ def : Pat<(v1i1 (load addr:$src)),
+ (COPY_TO_REGCLASS (KMOVBkm addr:$src), VK1)>;
def : Pat<(v2i1 (load addr:$src)),
(COPY_TO_REGCLASS (KMOVBkm addr:$src), VK2)>;
def : Pat<(v4i1 (load addr:$src)),
(COPY_TO_REGCLASS (KMOVBkm addr:$src), VK4)>;
}
-let Predicates = [HasAVX512, NoDQI] in {
- def : Pat<(store VK1:$src, addr:$dst),
- (MOV8mr addr:$dst,
- (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)),
- sub_8bit)))>;
- def : Pat<(store VK2:$src, addr:$dst),
- (MOV8mr addr:$dst,
- (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK2:$src, GR32)),
- sub_8bit)))>;
- def : Pat<(store VK4:$src, addr:$dst),
- (MOV8mr addr:$dst,
- (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK4:$src, GR32)),
- sub_8bit)))>;
- def : Pat<(store VK8:$src, addr:$dst),
- (MOV8mr addr:$dst,
- (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK8:$src, GR32)),
- sub_8bit)))>;
-
- def : Pat<(v8i1 (load addr:$src)),
- (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK8)>;
- def : Pat<(v2i1 (load addr:$src)),
- (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK2)>;
- def : Pat<(v4i1 (load addr:$src)),
- (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK4)>;
-}
let Predicates = [HasAVX512] in {
- def : Pat<(store (i16 (bitconvert (v16i1 VK16:$src))), addr:$dst),
- (KMOVWmk addr:$dst, VK16:$src)>;
- def : Pat<(v1i1 (load addr:$src)),
- (COPY_TO_REGCLASS (AND32ri8 (MOVZX32rm8 addr:$src), (i32 1)), VK1)>;
- def : Pat<(v16i1 (bitconvert (i16 (load addr:$src)))),
- (KMOVWkm addr:$src)>;
-}
-let Predicates = [HasBWI] in {
- def : Pat<(store (i32 (bitconvert (v32i1 VK32:$src))), addr:$dst),
- (KMOVDmk addr:$dst, VK32:$src)>;
- def : Pat<(v32i1 (bitconvert (i32 (load addr:$src)))),
- (KMOVDkm addr:$src)>;
- def : Pat<(store (i64 (bitconvert (v64i1 VK64:$src))), addr:$dst),
- (KMOVQmk addr:$dst, VK64:$src)>;
- def : Pat<(v64i1 (bitconvert (i64 (load addr:$src)))),
- (KMOVQkm addr:$src)>;
+ def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
+ (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK8)>;
}
let Predicates = [HasAVX512] in {
@@ -2768,9 +2914,6 @@ let Predicates = [HasAVX512] in {
def : Pat<(maskVT (scalar_to_vector GR32:$src)),
(COPY_TO_REGCLASS GR32:$src, maskRC)>;
- def : Pat<(i32 (X86kextract maskRC:$src, (iPTR 0))),
- (COPY_TO_REGCLASS maskRC:$src, GR32)>;
-
def : Pat<(maskVT (scalar_to_vector GR8:$src)),
(COPY_TO_REGCLASS (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit), maskRC)>;
}
@@ -2783,46 +2926,41 @@ let Predicates = [HasAVX512] in {
defm : operation_gpr_mask_copy_lowering<VK32, v32i1>;
defm : operation_gpr_mask_copy_lowering<VK64, v64i1>;
- def : Pat<(X86kshiftr (X86kshiftl (v1i1 (scalar_to_vector GR8:$src)), (i8 15)), (i8 15)) ,
- (COPY_TO_REGCLASS
- (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
- GR8:$src, sub_8bit), (i32 1))), VK1)>;
- def : Pat<(X86kshiftr (X86kshiftl (v16i1 (scalar_to_vector GR8:$src)), (i8 15)), (i8 15)) ,
+ def : Pat<(insert_subvector (v16i1 immAllZerosV),
+ (v1i1 (scalar_to_vector GR8:$src)), (iPTR 0)),
(COPY_TO_REGCLASS
- (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
- GR8:$src, sub_8bit), (i32 1))), VK16)>;
- def : Pat<(X86kshiftr (X86kshiftl (v8i1 (scalar_to_vector GR8:$src)), (i8 15)), (i8 15)) ,
- (COPY_TO_REGCLASS
- (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
- GR8:$src, sub_8bit), (i32 1))), VK8)>;
-
+ (KMOVWkr (AND32ri8
+ (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit),
+ (i32 1))), VK16)>;
}
// Mask unary operation
// - KNOT
multiclass avx512_mask_unop<bits<8> opc, string OpcodeStr,
RegisterClass KRC, SDPatternOperator OpNode,
- OpndItins itins, Predicate prd> {
+ X86FoldableSchedWrite sched, Predicate prd> {
let Predicates = [prd] in
def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set KRC:$dst, (OpNode KRC:$src))], itins.rr>,
- Sched<[itins.Sched]>;
+ [(set KRC:$dst, (OpNode KRC:$src))]>,
+ Sched<[sched]>;
}
multiclass avx512_mask_unop_all<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode, OpndItins itins> {
+ SDPatternOperator OpNode,
+ X86FoldableSchedWrite sched> {
defm B : avx512_mask_unop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode,
- itins, HasDQI>, VEX, PD;
+ sched, HasDQI>, VEX, PD;
defm W : avx512_mask_unop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode,
- itins, HasAVX512>, VEX, PS;
+ sched, HasAVX512>, VEX, PS;
defm D : avx512_mask_unop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode,
- itins, HasBWI>, VEX, PD, VEX_W;
+ sched, HasBWI>, VEX, PD, VEX_W;
defm Q : avx512_mask_unop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode,
- itins, HasBWI>, VEX, PS, VEX_W;
+ sched, HasBWI>, VEX, PS, VEX_W;
}
-defm KNOT : avx512_mask_unop_all<0x44, "knot", vnot, SSE_BIT_ITINS_P>;
+// TODO - do we need a X86SchedWriteWidths::KMASK type?
+defm KNOT : avx512_mask_unop_all<0x44, "knot", vnot, SchedWriteVecLogic.XMM>;
// KNL does not support KMOVB, 8-bit mask is promoted to 16-bit
let Predicates = [HasAVX512, NoDQI] in
@@ -2838,26 +2976,28 @@ def : Pat<(vnot VK2:$src),
// - KAND, KANDN, KOR, KXNOR, KXOR
multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr,
RegisterClass KRC, SDPatternOperator OpNode,
- OpndItins itins, Predicate prd, bit IsCommutable> {
+ X86FoldableSchedWrite sched, Predicate prd,
+ bit IsCommutable> {
let Predicates = [prd], isCommutable = IsCommutable in
def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set KRC:$dst, (OpNode KRC:$src1, KRC:$src2))], itins.rr>,
- Sched<[itins.Sched]>;
+ [(set KRC:$dst, (OpNode KRC:$src1, KRC:$src2))]>,
+ Sched<[sched]>;
}
multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode, OpndItins itins,
- bit IsCommutable, Predicate prdW = HasAVX512> {
+ SDPatternOperator OpNode,
+ X86FoldableSchedWrite sched, bit IsCommutable,
+ Predicate prdW = HasAVX512> {
defm B : avx512_mask_binop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode,
- itins, HasDQI, IsCommutable>, VEX_4V, VEX_L, PD;
+ sched, HasDQI, IsCommutable>, VEX_4V, VEX_L, PD;
defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode,
- itins, prdW, IsCommutable>, VEX_4V, VEX_L, PS;
+ sched, prdW, IsCommutable>, VEX_4V, VEX_L, PS;
defm D : avx512_mask_binop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode,
- itins, HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PD;
+ sched, HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PD;
defm Q : avx512_mask_binop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode,
- itins, HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PS;
+ sched, HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PS;
}
def andn : PatFrag<(ops node:$i0, node:$i1), (and (not node:$i0), node:$i1)>;
@@ -2866,12 +3006,13 @@ def xnor : PatFrag<(ops node:$i0, node:$i1), (not (xor node:$i0, node:$i1))>;
def vandn : PatFrag<(ops node:$i0, node:$i1), (and (vnot node:$i0), node:$i1)>;
def vxnor : PatFrag<(ops node:$i0, node:$i1), (vnot (xor node:$i0, node:$i1))>;
-defm KAND : avx512_mask_binop_all<0x41, "kand", and, SSE_BIT_ITINS_P, 1>;
-defm KOR : avx512_mask_binop_all<0x45, "kor", or, SSE_BIT_ITINS_P, 1>;
-defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", vxnor, SSE_BIT_ITINS_P, 1>;
-defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, SSE_BIT_ITINS_P, 1>;
-defm KANDN : avx512_mask_binop_all<0x42, "kandn", vandn, SSE_BIT_ITINS_P, 0>;
-defm KADD : avx512_mask_binop_all<0x4A, "kadd", add, SSE_BIT_ITINS_P, 1, HasDQI>;
+// TODO - do we need a X86SchedWriteWidths::KMASK type?
+defm KAND : avx512_mask_binop_all<0x41, "kand", and, SchedWriteVecLogic.XMM, 1>;
+defm KOR : avx512_mask_binop_all<0x45, "kor", or, SchedWriteVecLogic.XMM, 1>;
+defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", vxnor, SchedWriteVecLogic.XMM, 1>;
+defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, SchedWriteVecLogic.XMM, 1>;
+defm KANDN : avx512_mask_binop_all<0x42, "kandn", vandn, SchedWriteVecLogic.XMM, 0>;
+defm KADD : avx512_mask_binop_all<0x4A, "kadd", X86kadd, SchedWriteVecLogic.XMM, 1, HasDQI>;
multiclass avx512_binop_pat<SDPatternOperator VOpNode, SDPatternOperator OpNode,
Instruction Inst> {
@@ -2906,13 +3047,14 @@ defm : avx512_binop_pat<xor, xor, KXORWrr>;
// Mask unpacking
multiclass avx512_mask_unpck<string Suffix,RegisterClass KRC, ValueType VT,
- RegisterClass KRCSrc, OpndItins itins, Predicate prd> {
+ RegisterClass KRCSrc, X86FoldableSchedWrite sched,
+ Predicate prd> {
let Predicates = [prd] in {
let hasSideEffects = 0 in
def rr : I<0x4b, MRMSrcReg, (outs KRC:$dst),
(ins KRC:$src1, KRC:$src2),
- "kunpck"#Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
- itins.rr>, VEX_4V, VEX_L, Sched<[itins.Sched]>;
+ "kunpck"#Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ VEX_4V, VEX_L, Sched<[sched]>;
def : Pat<(VT (concat_vectors KRCSrc:$src1, KRCSrc:$src2)),
(!cast<Instruction>(NAME##rr)
@@ -2921,104 +3063,199 @@ multiclass avx512_mask_unpck<string Suffix,RegisterClass KRC, ValueType VT,
}
}
-defm KUNPCKBW : avx512_mask_unpck<"bw", VK16, v16i1, VK8, SSE_UNPCK, HasAVX512>, PD;
-defm KUNPCKWD : avx512_mask_unpck<"wd", VK32, v32i1, VK16, SSE_UNPCK, HasBWI>, PS;
-defm KUNPCKDQ : avx512_mask_unpck<"dq", VK64, v64i1, VK32, SSE_UNPCK, HasBWI>, PS, VEX_W;
+defm KUNPCKBW : avx512_mask_unpck<"bw", VK16, v16i1, VK8, WriteShuffle, HasAVX512>, PD;
+defm KUNPCKWD : avx512_mask_unpck<"wd", VK32, v32i1, VK16, WriteShuffle, HasBWI>, PS;
+defm KUNPCKDQ : avx512_mask_unpck<"dq", VK64, v64i1, VK32, WriteShuffle, HasBWI>, PS, VEX_W;
// Mask bit testing
multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
- SDNode OpNode, OpndItins itins, Predicate prd> {
+ SDNode OpNode, X86FoldableSchedWrite sched,
+ Predicate prd> {
let Predicates = [prd], Defs = [EFLAGS] in
def rr : I<opc, MRMSrcReg, (outs), (ins KRC:$src1, KRC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
- [(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))], itins.rr>,
- Sched<[itins.Sched]>;
+ [(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))]>,
+ Sched<[sched]>;
}
multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, Predicate prdW = HasAVX512> {
- defm B : avx512_mask_testop<opc, OpcodeStr#"b", VK8, OpNode, itins, HasDQI>,
+ X86FoldableSchedWrite sched,
+ Predicate prdW = HasAVX512> {
+ defm B : avx512_mask_testop<opc, OpcodeStr#"b", VK8, OpNode, sched, HasDQI>,
VEX, PD;
- defm W : avx512_mask_testop<opc, OpcodeStr#"w", VK16, OpNode, itins, prdW>,
+ defm W : avx512_mask_testop<opc, OpcodeStr#"w", VK16, OpNode, sched, prdW>,
VEX, PS;
- defm Q : avx512_mask_testop<opc, OpcodeStr#"q", VK64, OpNode, itins, HasBWI>,
+ defm Q : avx512_mask_testop<opc, OpcodeStr#"q", VK64, OpNode, sched, HasBWI>,
VEX, PS, VEX_W;
- defm D : avx512_mask_testop<opc, OpcodeStr#"d", VK32, OpNode, itins, HasBWI>,
+ defm D : avx512_mask_testop<opc, OpcodeStr#"d", VK32, OpNode, sched, HasBWI>,
VEX, PD, VEX_W;
}
-defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest, SSE_PTEST>;
-defm KTEST : avx512_mask_testop_w<0x99, "ktest", X86ktest, SSE_PTEST, HasDQI>;
+// TODO - do we need a X86SchedWriteWidths::KMASK type?
+defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest, SchedWriteVecLogic.XMM>;
+defm KTEST : avx512_mask_testop_w<0x99, "ktest", X86ktest, SchedWriteVecLogic.XMM, HasDQI>;
// Mask shift
multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
- SDNode OpNode, OpndItins itins> {
+ SDNode OpNode, X86FoldableSchedWrite sched> {
let Predicates = [HasAVX512] in
def ri : Ii8<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src, u8imm:$imm),
!strconcat(OpcodeStr,
"\t{$imm, $src, $dst|$dst, $src, $imm}"),
- [(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))],
- itins.rr>, Sched<[itins.Sched]>;
+ [(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))]>,
+ Sched<[sched]>;
}
multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr,
- SDNode OpNode, OpndItins itins> {
+ SDNode OpNode, X86FoldableSchedWrite sched> {
defm W : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "w"), VK16, OpNode,
- itins>, VEX, TAPD, VEX_W;
+ sched>, VEX, TAPD, VEX_W;
let Predicates = [HasDQI] in
defm B : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "b"), VK8, OpNode,
- itins>, VEX, TAPD;
+ sched>, VEX, TAPD;
let Predicates = [HasBWI] in {
defm Q : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "q"), VK64, OpNode,
- itins>, VEX, TAPD, VEX_W;
+ sched>, VEX, TAPD, VEX_W;
defm D : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "d"), VK32, OpNode,
- itins>, VEX, TAPD;
+ sched>, VEX, TAPD;
}
}
-defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl, SSE_PSHUF>;
-defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr, SSE_PSHUF>;
-
-multiclass axv512_icmp_packed_no_vlx_lowering<SDNode OpNode, string InstStr> {
-def : Pat<(v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrr)
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>;
+defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl, WriteShuffle>;
+defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr, WriteShuffle>;
-def : Pat<(v8i1 (and VK8:$mask,
- (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2)))),
+// Patterns for comparing 128/256-bit integer vectors using a 512-bit instruction.
+multiclass axv512_icmp_packed_no_vlx_lowering<PatFrag Frag, string InstStr,
+ X86VectorVTInfo Narrow,
+ X86VectorVTInfo Wide> {
+ def : Pat<(Narrow.KVT (Frag (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT Narrow.RC:$src2))),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(InstStr#"Zrr")
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx))),
+ Narrow.KRC)>;
+
+ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
+ (Frag (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT Narrow.RC:$src2)))),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(InstStr#"Zrrk")
+ (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx))),
+ Narrow.KRC)>;
+}
+
+// Patterns for comparing 128/256-bit integer vectors using a 512-bit instruction.
+multiclass axv512_icmp_packed_cc_no_vlx_lowering<PatFrag Frag,
+ string InstStr,
+ X86VectorVTInfo Narrow,
+ X86VectorVTInfo Wide> {
+def : Pat<(Narrow.KVT (Frag:$cc (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT Narrow.RC:$src2), cond)),
(COPY_TO_REGCLASS
- (!cast<Instruction>(InstStr##Zrrk)
- (COPY_TO_REGCLASS VK8:$mask, VK16),
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))),
- VK8)>;
-}
-
-multiclass axv512_icmp_packed_cc_no_vlx_lowering<SDNode OpNode, string InstStr,
- AVX512VLVectorVTInfo _> {
-def : Pat<(v8i1 (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc)),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrri)
- (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
- imm:$cc), VK8)>;
-
-def : Pat<(v8i1 (and VK8:$mask, (OpNode (_.info256.VT VR256X:$src1),
- (_.info256.VT VR256X:$src2), imm:$cc))),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrrik)
- (COPY_TO_REGCLASS VK8:$mask, VK16),
- (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
- imm:$cc), VK8)>;
+ (!cast<Instruction>(InstStr##Zrri)
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)),
+ (Frag.OperandTransform $cc)), Narrow.KRC)>;
+
+def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
+ (Narrow.KVT (Frag:$cc (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT Narrow.RC:$src2),
+ cond)))),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrrik)
+ (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)),
+ (Frag.OperandTransform $cc)), Narrow.KRC)>;
+}
+
+// Same as above, but for fp types which don't use PatFrags.
+multiclass axv512_cmp_packed_cc_no_vlx_lowering<SDNode OpNode, string InstStr,
+ X86VectorVTInfo Narrow,
+ X86VectorVTInfo Wide> {
+def : Pat<(Narrow.KVT (OpNode (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT Narrow.RC:$src2), imm:$cc)),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(InstStr##Zrri)
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)),
+ imm:$cc), Narrow.KRC)>;
+
+def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
+ (OpNode (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT Narrow.RC:$src2), imm:$cc))),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrrik)
+ (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)),
+ imm:$cc), Narrow.KRC)>;
}
let Predicates = [HasAVX512, NoVLX] in {
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTD">;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm, "VPCMPEQD">;
+ // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't
+ // increase the pattern complexity the way an immediate would.
+ let AddedComplexity = 2 in {
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTD", v8i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQD", v8i32x_info, v16i32_info>;
+
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTD", v4i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQD", v4i32x_info, v16i32_info>;
+
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTQ", v4i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQQ", v4i64x_info, v8i64_info>;
+
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTQ", v2i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQQ", v2i64x_info, v8i64_info>;
+ }
+
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPD", v8i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUD", v8i32x_info, v16i32_info>;
+
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPD", v4i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUD", v4i32x_info, v16i32_info>;
+
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPQ", v4i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUQ", v4i64x_info, v8i64_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPS", avx512vl_f32_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VPCMPD", avx512vl_i32_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpmu, "VPCMPUD", avx512vl_i32_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPQ", v2i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUQ", v2i64x_info, v8i64_info>;
+
+ defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPS", v8f32x_info, v16f32_info>;
+ defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPS", v4f32x_info, v16f32_info>;
+ defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPD", v4f64x_info, v8f64_info>;
+ defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPD", v2f64x_info, v8f64_info>;
+}
+
+let Predicates = [HasBWI, NoVLX] in {
+ // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't
+ // increase the pattern complexity the way an immediate would.
+ let AddedComplexity = 2 in {
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTB", v32i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQB", v32i8x_info, v64i8_info>;
+
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTB", v16i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQB", v16i8x_info, v64i8_info>;
+
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTW", v16i16x_info, v32i16_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQW", v16i16x_info, v32i16_info>;
+
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTW", v8i16x_info, v32i16_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQW", v8i16x_info, v32i16_info>;
+ }
+
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPB", v32i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUB", v32i8x_info, v64i8_info>;
+
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPB", v16i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUB", v16i8x_info, v64i8_info>;
+
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPW", v16i16x_info, v32i16_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUW", v16i16x_info, v32i16_info>;
+
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPW", v8i16x_info, v32i16_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUW", v8i16x_info, v32i16_info>;
}
// Mask setting all 0s or 1s
@@ -3087,87 +3324,29 @@ defm : operation_subvector_mask_lowering<VK16, v16i1, VK64, v64i1>;
defm : operation_subvector_mask_lowering<VK32, v32i1, VK64, v64i1>;
-
-multiclass vextract_for_mask_to_mask<string InstrStr, X86KVectorVTInfo From,
- X86KVectorVTInfo To, Predicate prd> {
-let Predicates = [prd] in
- def :
- Pat<(To.KVT(extract_subvector(From.KVT From.KRC:$src), (iPTR imm:$imm8))),
- (To.KVT(COPY_TO_REGCLASS
- (!cast<Instruction>(InstrStr#"ri") From.KVT:$src,
- (i8 imm:$imm8)), To.KRC))>;
-}
-
-multiclass vextract_for_mask_to_mask_legal_w<X86KVectorVTInfo From,
- X86KVectorVTInfo To> {
-def :
- Pat<(To.KVT(extract_subvector(From.KVT From.KRC:$src), (iPTR imm:$imm8))),
- (To.KVT(COPY_TO_REGCLASS
- (KSHIFTRWri(COPY_TO_REGCLASS From.KRC:$src, VK16),
- (i8 imm:$imm8)), To.KRC))>;
-}
-
-defm : vextract_for_mask_to_mask_legal_w<v2i1_info, v1i1_info>;
-defm : vextract_for_mask_to_mask_legal_w<v4i1_info, v1i1_info>;
-defm : vextract_for_mask_to_mask_legal_w<v8i1_info, v1i1_info>;
-defm : vextract_for_mask_to_mask_legal_w<v4i1_info, v2i1_info>;
-defm : vextract_for_mask_to_mask_legal_w<v8i1_info, v2i1_info>;
-defm : vextract_for_mask_to_mask_legal_w<v8i1_info, v4i1_info>;
-
-defm : vextract_for_mask_to_mask<"KSHIFTRW", v16i1_info, v1i1_info, HasAVX512>;
-defm : vextract_for_mask_to_mask<"KSHIFTRD", v32i1_info, v1i1_info, HasBWI>;
-defm : vextract_for_mask_to_mask<"KSHIFTRQ", v64i1_info, v1i1_info, HasBWI>;
-defm : vextract_for_mask_to_mask<"KSHIFTRW", v16i1_info, v2i1_info, HasAVX512>;
-defm : vextract_for_mask_to_mask<"KSHIFTRD", v32i1_info, v2i1_info, HasBWI>;
-defm : vextract_for_mask_to_mask<"KSHIFTRQ", v64i1_info, v2i1_info, HasBWI>;
-defm : vextract_for_mask_to_mask<"KSHIFTRW", v16i1_info, v4i1_info, HasAVX512>;
-defm : vextract_for_mask_to_mask<"KSHIFTRD", v32i1_info, v4i1_info, HasBWI>;
-defm : vextract_for_mask_to_mask<"KSHIFTRQ", v64i1_info, v4i1_info, HasBWI>;
-defm : vextract_for_mask_to_mask<"KSHIFTRW", v16i1_info, v8i1_info, HasAVX512>;
-defm : vextract_for_mask_to_mask<"KSHIFTRD", v32i1_info, v8i1_info, HasBWI>;
-defm : vextract_for_mask_to_mask<"KSHIFTRQ", v64i1_info, v8i1_info, HasBWI>;
-defm : vextract_for_mask_to_mask<"KSHIFTRD", v32i1_info, v16i1_info, HasBWI>;
-defm : vextract_for_mask_to_mask<"KSHIFTRQ", v64i1_info, v16i1_info, HasBWI>;
-defm : vextract_for_mask_to_mask<"KSHIFTRQ", v64i1_info, v32i1_info, HasBWI>;
-
-// Patterns for kmask shift
-multiclass mask_shift_lowering<RegisterClass RC, ValueType VT> {
- def : Pat<(VT (X86kshiftl RC:$src, (i8 imm:$imm))),
- (VT (COPY_TO_REGCLASS
- (KSHIFTLWri (COPY_TO_REGCLASS RC:$src, VK16),
- (I8Imm $imm)),
- RC))>;
- def : Pat<(VT (X86kshiftr RC:$src, (i8 imm:$imm))),
- (VT (COPY_TO_REGCLASS
- (KSHIFTRWri (COPY_TO_REGCLASS RC:$src, VK16),
- (I8Imm $imm)),
- RC))>;
-}
-
-defm : mask_shift_lowering<VK8, v8i1>, Requires<[HasAVX512, NoDQI]>;
-defm : mask_shift_lowering<VK4, v4i1>, Requires<[HasAVX512]>;
-defm : mask_shift_lowering<VK2, v2i1>, Requires<[HasAVX512]>;
//===----------------------------------------------------------------------===//
// AVX-512 - Aligned and unaligned load and store
//
-
-multiclass avx512_load<bits<8> opc, string OpcodeStr, MoveLoadStoreItins itins,
+multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name,
X86VectorVTInfo _, PatFrag ld_frag, PatFrag mload,
+ X86SchedWriteMoveLS Sched, string EVEX2VEXOvrd,
bit NoRMPattern = 0,
SDPatternOperator SelectOprr = vselect> {
let hasSideEffects = 0 in {
+ let isMoveReg = 1 in
def rr : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [],
- _.ExeDomain, itins.rr>, EVEX, Sched<[WriteMove]>;
+ _.ExeDomain>, EVEX, Sched<[Sched.RR]>,
+ EVEX2VEXOverride<EVEX2VEXOvrd#"rr">;
def rrkz : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src),
!strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
"${dst} {${mask}} {z}, $src}"),
[(set _.RC:$dst, (_.VT (SelectOprr _.KRCWM:$mask,
(_.VT _.RC:$src),
- _.ImmAllZerosV)))], _.ExeDomain,
- itins.rr>, EVEX, EVEX_KZ, Sched<[WriteMove]>;
+ _.ImmAllZerosV)))], _.ExeDomain>,
+ EVEX, EVEX_KZ, Sched<[Sched.RR]>;
let mayLoad = 1, canFoldAsLoad = 1, isReMaterializable = 1 in
def rm : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.MemOp:$src),
@@ -3175,7 +3354,8 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, MoveLoadStoreItins itins,
!if(NoRMPattern, [],
[(set _.RC:$dst,
(_.VT (bitconvert (ld_frag addr:$src))))]),
- _.ExeDomain, itins.rm>, EVEX, Sched<[WriteLoad]>;
+ _.ExeDomain>, EVEX, Sched<[Sched.RM]>,
+ EVEX2VEXOverride<EVEX2VEXOvrd#"rm">;
let Constraints = "$src0 = $dst", isConvertibleToThreeAddress = 1 in {
def rrk : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),
@@ -3184,8 +3364,8 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, MoveLoadStoreItins itins,
"${dst} {${mask}}, $src1}"),
[(set _.RC:$dst, (_.VT (SelectOprr _.KRCWM:$mask,
(_.VT _.RC:$src1),
- (_.VT _.RC:$src0))))], _.ExeDomain,
- itins.rr>, EVEX, EVEX_K, Sched<[WriteMove]>;
+ (_.VT _.RC:$src0))))], _.ExeDomain>,
+ EVEX, EVEX_K, Sched<[Sched.RR]>;
def rmk : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src0, _.KRCWM:$mask, _.MemOp:$src1),
!strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|",
@@ -3193,8 +3373,8 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, MoveLoadStoreItins itins,
[(set _.RC:$dst, (_.VT
(vselect _.KRCWM:$mask,
(_.VT (bitconvert (ld_frag addr:$src1))),
- (_.VT _.RC:$src0))))], _.ExeDomain, itins.rm>,
- EVEX, EVEX_K, Sched<[WriteLoad]>;
+ (_.VT _.RC:$src0))))], _.ExeDomain>,
+ EVEX, EVEX_K, Sched<[Sched.RM]>;
}
def rmkz : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.MemOp:$src),
@@ -3202,77 +3382,83 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, MoveLoadStoreItins itins,
"${dst} {${mask}} {z}, $src}",
[(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask,
(_.VT (bitconvert (ld_frag addr:$src))), _.ImmAllZerosV)))],
- _.ExeDomain, itins.rm>, EVEX, EVEX_KZ, Sched<[WriteLoad]>;
+ _.ExeDomain>, EVEX, EVEX_KZ, Sched<[Sched.RM]>;
}
def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, undef)),
- (!cast<Instruction>(NAME#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>;
+ (!cast<Instruction>(Name#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>;
def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, _.ImmAllZerosV)),
- (!cast<Instruction>(NAME#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>;
+ (!cast<Instruction>(Name#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>;
def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src0))),
- (!cast<Instruction>(NAME#_.ZSuffix##rmk) _.RC:$src0,
+ (!cast<Instruction>(Name#_.ZSuffix##rmk) _.RC:$src0,
_.KRCWM:$mask, addr:$ptr)>;
}
multiclass avx512_alignedload_vl<bits<8> opc, string OpcodeStr,
- AVX512VLVectorVTInfo _,
- Predicate prd> {
+ AVX512VLVectorVTInfo _, Predicate prd,
+ X86SchedWriteMoveLSWidths Sched,
+ string EVEX2VEXOvrd, bit NoRMPattern = 0> {
let Predicates = [prd] in
- defm Z : avx512_load<opc, OpcodeStr, SSE_MOVA, _.info512,
- _.info512.AlignedLdFrag, masked_load_aligned512>,
- EVEX_V512;
+ defm Z : avx512_load<opc, OpcodeStr, NAME, _.info512,
+ _.info512.AlignedLdFrag, masked_load_aligned512,
+ Sched.ZMM, "", NoRMPattern>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_load<opc, OpcodeStr, SSE_MOVA, _.info256,
- _.info256.AlignedLdFrag, masked_load_aligned256>,
- EVEX_V256;
- defm Z128 : avx512_load<opc, OpcodeStr, SSE_MOVA, _.info128,
- _.info128.AlignedLdFrag, masked_load_aligned128>,
- EVEX_V128;
+ defm Z256 : avx512_load<opc, OpcodeStr, NAME, _.info256,
+ _.info256.AlignedLdFrag, masked_load_aligned256,
+ Sched.YMM, EVEX2VEXOvrd#"Y", NoRMPattern>, EVEX_V256;
+ defm Z128 : avx512_load<opc, OpcodeStr, NAME, _.info128,
+ _.info128.AlignedLdFrag, masked_load_aligned128,
+ Sched.XMM, EVEX2VEXOvrd, NoRMPattern>, EVEX_V128;
}
}
multiclass avx512_load_vl<bits<8> opc, string OpcodeStr,
- AVX512VLVectorVTInfo _,
- Predicate prd,
- bit NoRMPattern = 0,
- SDPatternOperator SelectOprr = vselect> {
+ AVX512VLVectorVTInfo _, Predicate prd,
+ X86SchedWriteMoveLSWidths Sched,
+ string EVEX2VEXOvrd, bit NoRMPattern = 0,
+ SDPatternOperator SelectOprr = vselect> {
let Predicates = [prd] in
- defm Z : avx512_load<opc, OpcodeStr, SSE_MOVU, _.info512, _.info512.LdFrag,
- masked_load_unaligned, NoRMPattern,
- SelectOprr>, EVEX_V512;
+ defm Z : avx512_load<opc, OpcodeStr, NAME, _.info512, _.info512.LdFrag,
+ masked_load_unaligned, Sched.ZMM, "",
+ NoRMPattern, SelectOprr>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_load<opc, OpcodeStr, SSE_MOVU, _.info256, _.info256.LdFrag,
- masked_load_unaligned, NoRMPattern,
- SelectOprr>, EVEX_V256;
- defm Z128 : avx512_load<opc, OpcodeStr, SSE_MOVU, _.info128, _.info128.LdFrag,
- masked_load_unaligned, NoRMPattern,
- SelectOprr>, EVEX_V128;
+ defm Z256 : avx512_load<opc, OpcodeStr, NAME, _.info256, _.info256.LdFrag,
+ masked_load_unaligned, Sched.YMM, EVEX2VEXOvrd#"Y",
+ NoRMPattern, SelectOprr>, EVEX_V256;
+ defm Z128 : avx512_load<opc, OpcodeStr, NAME, _.info128, _.info128.LdFrag,
+ masked_load_unaligned, Sched.XMM, EVEX2VEXOvrd,
+ NoRMPattern, SelectOprr>, EVEX_V128;
}
}
-multiclass avx512_store<bits<8> opc, string OpcodeStr, MoveLoadStoreItins itins,
+multiclass avx512_store<bits<8> opc, string OpcodeStr, string BaseName,
X86VectorVTInfo _, PatFrag st_frag, PatFrag mstore,
- string Name, bit NoMRPattern = 0> {
- let hasSideEffects = 0 in {
+ X86SchedWriteMoveLS Sched, string EVEX2VEXOvrd,
+ bit NoMRPattern = 0> {
+ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
+ let isMoveReg = 1 in
def rr_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src),
- OpcodeStr # ".s\t{$src, $dst|$dst, $src}",
- [], _.ExeDomain, itins.rr>, EVEX, FoldGenData<Name#rr>,
- Sched<[WriteMove]>;
+ OpcodeStr # "\t{$src, $dst|$dst, $src}",
+ [], _.ExeDomain>, EVEX,
+ FoldGenData<BaseName#_.ZSuffix#rr>, Sched<[Sched.RR]>,
+ EVEX2VEXOverride<EVEX2VEXOvrd#"rr_REV">;
def rrk_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src),
- OpcodeStr # ".s\t{$src, ${dst} {${mask}}|"#
+ OpcodeStr # "\t{$src, ${dst} {${mask}}|"#
"${dst} {${mask}}, $src}",
- [], _.ExeDomain, itins.rr>, EVEX, EVEX_K,
- FoldGenData<Name#rrk>, Sched<[WriteMove]>;
+ [], _.ExeDomain>, EVEX, EVEX_K,
+ FoldGenData<BaseName#_.ZSuffix#rrk>,
+ Sched<[Sched.RR]>;
def rrkz_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src),
- OpcodeStr # ".s\t{$src, ${dst} {${mask}} {z}|" #
+ OpcodeStr # "\t{$src, ${dst} {${mask}} {z}|" #
"${dst} {${mask}} {z}, $src}",
- [], _.ExeDomain, itins.rr>, EVEX, EVEX_KZ,
- FoldGenData<Name#rrkz>, Sched<[WriteMove]>;
+ [], _.ExeDomain>, EVEX, EVEX_KZ,
+ FoldGenData<BaseName#_.ZSuffix#rrkz>,
+ Sched<[Sched.RR]>;
}
let hasSideEffects = 0, mayStore = 1 in
@@ -3280,132 +3466,154 @@ multiclass avx512_store<bits<8> opc, string OpcodeStr, MoveLoadStoreItins itins,
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
!if(NoMRPattern, [],
[(st_frag (_.VT _.RC:$src), addr:$dst)]),
- _.ExeDomain, itins.mr>, EVEX, Sched<[WriteStore]>;
+ _.ExeDomain>, EVEX, Sched<[Sched.MR]>,
+ EVEX2VEXOverride<EVEX2VEXOvrd#"mr">;
def mrk : AVX512PI<opc, MRMDestMem, (outs),
(ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src),
OpcodeStr # "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}",
- [], _.ExeDomain, itins.mr>, EVEX, EVEX_K, Sched<[WriteStore]>;
+ [], _.ExeDomain>, EVEX, EVEX_K, Sched<[Sched.MR]>,
+ NotMemoryFoldable;
def: Pat<(mstore addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src)),
- (!cast<Instruction>(NAME#_.ZSuffix##mrk) addr:$ptr,
- _.KRCWM:$mask, _.RC:$src)>;
-}
+ (!cast<Instruction>(BaseName#_.ZSuffix#mrk) addr:$ptr,
+ _.KRCWM:$mask, _.RC:$src)>;
+ def : InstAlias<OpcodeStr#".s\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(BaseName#_.ZSuffix#"rr_REV")
+ _.RC:$dst, _.RC:$src), 0>;
+ def : InstAlias<OpcodeStr#".s\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}",
+ (!cast<Instruction>(BaseName#_.ZSuffix#"rrk_REV")
+ _.RC:$dst, _.KRCWM:$mask, _.RC:$src), 0>;
+ def : InstAlias<OpcodeStr#".s\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}",
+ (!cast<Instruction>(BaseName#_.ZSuffix#"rrkz_REV")
+ _.RC:$dst, _.KRCWM:$mask, _.RC:$src), 0>;
+}
multiclass avx512_store_vl< bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _, Predicate prd,
- string Name, bit NoMRPattern = 0> {
+ X86SchedWriteMoveLSWidths Sched,
+ string EVEX2VEXOvrd, bit NoMRPattern = 0> {
let Predicates = [prd] in
- defm Z : avx512_store<opc, OpcodeStr, SSE_MOVU, _.info512, store,
- masked_store_unaligned, Name#Z, NoMRPattern>, EVEX_V512;
-
+ defm Z : avx512_store<opc, OpcodeStr, NAME, _.info512, store,
+ masked_store_unaligned, Sched.ZMM, "",
+ NoMRPattern>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_store<opc, OpcodeStr, SSE_MOVU, _.info256, store,
- masked_store_unaligned, Name#Z256,
- NoMRPattern>, EVEX_V256;
- defm Z128 : avx512_store<opc, OpcodeStr, SSE_MOVU, _.info128, store,
- masked_store_unaligned, Name#Z128,
+ defm Z256 : avx512_store<opc, OpcodeStr, NAME, _.info256, store,
+ masked_store_unaligned, Sched.YMM,
+ EVEX2VEXOvrd#"Y", NoMRPattern>, EVEX_V256;
+ defm Z128 : avx512_store<opc, OpcodeStr, NAME, _.info128, store,
+ masked_store_unaligned, Sched.XMM, EVEX2VEXOvrd,
NoMRPattern>, EVEX_V128;
}
}
multiclass avx512_alignedstore_vl<bits<8> opc, string OpcodeStr,
- AVX512VLVectorVTInfo _, Predicate prd,
- string Name> {
+ AVX512VLVectorVTInfo _, Predicate prd,
+ X86SchedWriteMoveLSWidths Sched,
+ string EVEX2VEXOvrd, bit NoMRPattern = 0> {
let Predicates = [prd] in
- defm Z : avx512_store<opc, OpcodeStr, SSE_MOVA, _.info512, alignedstore,
- masked_store_aligned512, Name#Z>, EVEX_V512;
+ defm Z : avx512_store<opc, OpcodeStr, NAME, _.info512, alignedstore,
+ masked_store_aligned512, Sched.ZMM, "",
+ NoMRPattern>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_store<opc, OpcodeStr, SSE_MOVA, _.info256, alignedstore,
- masked_store_aligned256, Name#Z256>, EVEX_V256;
- defm Z128 : avx512_store<opc, OpcodeStr, SSE_MOVA, _.info128, alignedstore,
- masked_store_aligned128, Name#Z128>, EVEX_V128;
+ defm Z256 : avx512_store<opc, OpcodeStr, NAME, _.info256, alignedstore,
+ masked_store_aligned256, Sched.YMM,
+ EVEX2VEXOvrd#"Y", NoMRPattern>, EVEX_V256;
+ defm Z128 : avx512_store<opc, OpcodeStr, NAME, _.info128, alignedstore,
+ masked_store_aligned128, Sched.XMM, EVEX2VEXOvrd,
+ NoMRPattern>, EVEX_V128;
}
}
defm VMOVAPS : avx512_alignedload_vl<0x28, "vmovaps", avx512vl_f32_info,
- HasAVX512>,
+ HasAVX512, SchedWriteFMoveLS, "VMOVAPS">,
avx512_alignedstore_vl<0x29, "vmovaps", avx512vl_f32_info,
- HasAVX512, "VMOVAPS">,
+ HasAVX512, SchedWriteFMoveLS, "VMOVAPS">,
PS, EVEX_CD8<32, CD8VF>;
defm VMOVAPD : avx512_alignedload_vl<0x28, "vmovapd", avx512vl_f64_info,
- HasAVX512>,
+ HasAVX512, SchedWriteFMoveLS, "VMOVAPD">,
avx512_alignedstore_vl<0x29, "vmovapd", avx512vl_f64_info,
- HasAVX512, "VMOVAPD">,
+ HasAVX512, SchedWriteFMoveLS, "VMOVAPD">,
PD, VEX_W, EVEX_CD8<64, CD8VF>;
defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512,
- 0, null_frag>,
+ SchedWriteFMoveLS, "VMOVUPS", 0, null_frag>,
avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512,
- "VMOVUPS">,
- PS, EVEX_CD8<32, CD8VF>;
+ SchedWriteFMoveLS, "VMOVUPS">,
+ PS, EVEX_CD8<32, CD8VF>;
defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512,
- 0, null_frag>,
+ SchedWriteFMoveLS, "VMOVUPD", 0, null_frag>,
avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512,
- "VMOVUPD">,
+ SchedWriteFMoveLS, "VMOVUPD">,
PD, VEX_W, EVEX_CD8<64, CD8VF>;
defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info,
- HasAVX512>,
+ HasAVX512, SchedWriteVecMoveLS,
+ "VMOVDQA", 1>,
avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info,
- HasAVX512, "VMOVDQA32">,
+ HasAVX512, SchedWriteVecMoveLS,
+ "VMOVDQA", 1>,
PD, EVEX_CD8<32, CD8VF>;
defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info,
- HasAVX512>,
+ HasAVX512, SchedWriteVecMoveLS,
+ "VMOVDQA">,
avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info,
- HasAVX512, "VMOVDQA64">,
+ HasAVX512, SchedWriteVecMoveLS,
+ "VMOVDQA">,
PD, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI, 1>,
- avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info,
- HasBWI, "VMOVDQU8", 1>,
+defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI,
+ SchedWriteVecMoveLS, "VMOVDQU", 1>,
+ avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info, HasBWI,
+ SchedWriteVecMoveLS, "VMOVDQU", 1>,
XD, EVEX_CD8<8, CD8VF>;
-defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI, 1>,
- avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info,
- HasBWI, "VMOVDQU16", 1>,
+defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI,
+ SchedWriteVecMoveLS, "VMOVDQU", 1>,
+ avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info, HasBWI,
+ SchedWriteVecMoveLS, "VMOVDQU", 1>,
XD, VEX_W, EVEX_CD8<16, CD8VF>;
defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512,
- 0, null_frag>,
- avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info,
- HasAVX512, "VMOVDQU32">,
+ SchedWriteVecMoveLS, "VMOVDQU", 1, null_frag>,
+ avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info, HasAVX512,
+ SchedWriteVecMoveLS, "VMOVDQU", 1>,
XS, EVEX_CD8<32, CD8VF>;
defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512,
- 0, null_frag>,
- avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info,
- HasAVX512, "VMOVDQU64">,
+ SchedWriteVecMoveLS, "VMOVDQU", 0, null_frag>,
+ avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info, HasAVX512,
+ SchedWriteVecMoveLS, "VMOVDQU">,
XS, VEX_W, EVEX_CD8<64, CD8VF>;
// Special instructions to help with spilling when we don't have VLX. We need
// to load or store from a ZMM register instead. These are converted in
// expandPostRAPseudos.
let isReMaterializable = 1, canFoldAsLoad = 1,
- isPseudo = 1, SchedRW = [WriteLoad], mayLoad = 1, hasSideEffects = 0 in {
+ isPseudo = 1, mayLoad = 1, hasSideEffects = 0 in {
def VMOVAPSZ128rm_NOVLX : I<0, Pseudo, (outs VR128X:$dst), (ins f128mem:$src),
- "", [], IIC_SSE_MOVA_P_RM>;
+ "", []>, Sched<[WriteFLoadX]>;
def VMOVAPSZ256rm_NOVLX : I<0, Pseudo, (outs VR256X:$dst), (ins f256mem:$src),
- "", [], IIC_SSE_MOVA_P_RM>;
+ "", []>, Sched<[WriteFLoadY]>;
def VMOVUPSZ128rm_NOVLX : I<0, Pseudo, (outs VR128X:$dst), (ins f128mem:$src),
- "", [], IIC_SSE_MOVA_P_RM>;
+ "", []>, Sched<[WriteFLoadX]>;
def VMOVUPSZ256rm_NOVLX : I<0, Pseudo, (outs VR256X:$dst), (ins f256mem:$src),
- "", [], IIC_SSE_MOVA_P_RM>;
+ "", []>, Sched<[WriteFLoadY]>;
}
-let isPseudo = 1, SchedRW = [WriteStore], mayStore = 1, hasSideEffects = 0 in {
+let isPseudo = 1, mayStore = 1, hasSideEffects = 0 in {
def VMOVAPSZ128mr_NOVLX : I<0, Pseudo, (outs), (ins f128mem:$dst, VR128X:$src),
- "", [], IIC_SSE_MOVA_P_MR>;
+ "", []>, Sched<[WriteFStoreX]>;
def VMOVAPSZ256mr_NOVLX : I<0, Pseudo, (outs), (ins f256mem:$dst, VR256X:$src),
- "", [], IIC_SSE_MOVA_P_MR>;
+ "", []>, Sched<[WriteFStoreY]>;
def VMOVUPSZ128mr_NOVLX : I<0, Pseudo, (outs), (ins f128mem:$dst, VR128X:$src),
- "", [], IIC_SSE_MOVA_P_MR>;
+ "", []>, Sched<[WriteFStoreX]>;
def VMOVUPSZ256mr_NOVLX : I<0, Pseudo, (outs), (ins f256mem:$dst, VR256X:$src),
- "", [], IIC_SSE_MOVA_P_MR>;
+ "", []>, Sched<[WriteFStoreY]>;
}
def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)),
@@ -3428,62 +3636,94 @@ def : Pat<(v16i32 (vselect (xor VK16:$mask, (v16i1 immAllOnesV)),
(v16i32 VR512:$src))),
(VMOVDQA32Zrrkz VK16WM:$mask, VR512:$src)>;
+multiclass mask_move_lowering<string InstrStr, X86VectorVTInfo Narrow,
+ X86VectorVTInfo Wide> {
+ def : Pat<(Narrow.VT (vselect (Narrow.KVT Narrow.KRCWM:$mask),
+ Narrow.RC:$src1, Narrow.RC:$src0)),
+ (EXTRACT_SUBREG
+ (Wide.VT
+ (!cast<Instruction>(InstrStr#"rrk")
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src0, Narrow.SubRegIdx)),
+ (COPY_TO_REGCLASS Narrow.KRCWM:$mask, Wide.KRCWM),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)))),
+ Narrow.SubRegIdx)>;
+
+ def : Pat<(Narrow.VT (vselect (Narrow.KVT Narrow.KRCWM:$mask),
+ Narrow.RC:$src1, Narrow.ImmAllZerosV)),
+ (EXTRACT_SUBREG
+ (Wide.VT
+ (!cast<Instruction>(InstrStr#"rrkz")
+ (COPY_TO_REGCLASS Narrow.KRCWM:$mask, Wide.KRCWM),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)))),
+ Narrow.SubRegIdx)>;
+}
+
// Patterns for handling v8i1 selects of 256-bit vectors when VLX isn't
// available. Use a 512-bit operation and extract.
let Predicates = [HasAVX512, NoVLX] in {
-def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1),
- (v8f32 VR256X:$src0))),
- (EXTRACT_SUBREG
- (v16f32
- (VMOVAPSZrrk
- (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src0, sub_ymm)),
- (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
- (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))),
- sub_ymm)>;
-
-def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1),
- (v8i32 VR256X:$src0))),
- (EXTRACT_SUBREG
- (v16i32
- (VMOVDQA32Zrrk
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src0, sub_ymm)),
- (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))),
- sub_ymm)>;
+ defm : mask_move_lowering<"VMOVAPSZ", v4f32x_info, v16f32_info>;
+ defm : mask_move_lowering<"VMOVDQA32Z", v4i32x_info, v16i32_info>;
+ defm : mask_move_lowering<"VMOVAPSZ", v8f32x_info, v16f32_info>;
+ defm : mask_move_lowering<"VMOVDQA32Z", v8i32x_info, v16i32_info>;
+
+ defm : mask_move_lowering<"VMOVAPDZ", v2f64x_info, v8f64_info>;
+ defm : mask_move_lowering<"VMOVDQA64Z", v2i64x_info, v8i64_info>;
+ defm : mask_move_lowering<"VMOVAPDZ", v4f64x_info, v8f64_info>;
+ defm : mask_move_lowering<"VMOVDQA64Z", v4i64x_info, v8i64_info>;
+}
+
+let Predicates = [HasBWI, NoVLX] in {
+ defm : mask_move_lowering<"VMOVDQU8Z", v16i8x_info, v64i8_info>;
+ defm : mask_move_lowering<"VMOVDQU8Z", v32i8x_info, v64i8_info>;
+
+ defm : mask_move_lowering<"VMOVDQU16Z", v8i16x_info, v32i16_info>;
+ defm : mask_move_lowering<"VMOVDQU16Z", v16i16x_info, v32i16_info>;
}
let Predicates = [HasAVX512] in {
// 512-bit store.
+ def : Pat<(alignedstore (v16i32 VR512:$src), addr:$dst),
+ (VMOVDQA64Zmr addr:$dst, VR512:$src)>;
def : Pat<(alignedstore (v32i16 VR512:$src), addr:$dst),
- (VMOVDQA32Zmr addr:$dst, VR512:$src)>;
+ (VMOVDQA64Zmr addr:$dst, VR512:$src)>;
def : Pat<(alignedstore (v64i8 VR512:$src), addr:$dst),
- (VMOVDQA32Zmr addr:$dst, VR512:$src)>;
+ (VMOVDQA64Zmr addr:$dst, VR512:$src)>;
+ def : Pat<(store (v16i32 VR512:$src), addr:$dst),
+ (VMOVDQU64Zmr addr:$dst, VR512:$src)>;
def : Pat<(store (v32i16 VR512:$src), addr:$dst),
- (VMOVDQU32Zmr addr:$dst, VR512:$src)>;
+ (VMOVDQU64Zmr addr:$dst, VR512:$src)>;
def : Pat<(store (v64i8 VR512:$src), addr:$dst),
- (VMOVDQU32Zmr addr:$dst, VR512:$src)>;
+ (VMOVDQU64Zmr addr:$dst, VR512:$src)>;
}
let Predicates = [HasVLX] in {
// 128-bit store.
+ def : Pat<(alignedstore (v4i32 VR128X:$src), addr:$dst),
+ (VMOVDQA64Z128mr addr:$dst, VR128X:$src)>;
def : Pat<(alignedstore (v8i16 VR128X:$src), addr:$dst),
- (VMOVDQA32Z128mr addr:$dst, VR128X:$src)>;
+ (VMOVDQA64Z128mr addr:$dst, VR128X:$src)>;
def : Pat<(alignedstore (v16i8 VR128X:$src), addr:$dst),
- (VMOVDQA32Z128mr addr:$dst, VR128X:$src)>;
+ (VMOVDQA64Z128mr addr:$dst, VR128X:$src)>;
+ def : Pat<(store (v4i32 VR128X:$src), addr:$dst),
+ (VMOVDQU64Z128mr addr:$dst, VR128X:$src)>;
def : Pat<(store (v8i16 VR128X:$src), addr:$dst),
- (VMOVDQU32Z128mr addr:$dst, VR128X:$src)>;
+ (VMOVDQU64Z128mr addr:$dst, VR128X:$src)>;
def : Pat<(store (v16i8 VR128X:$src), addr:$dst),
- (VMOVDQU32Z128mr addr:$dst, VR128X:$src)>;
+ (VMOVDQU64Z128mr addr:$dst, VR128X:$src)>;
// 256-bit store.
+ def : Pat<(alignedstore (v8i32 VR256X:$src), addr:$dst),
+ (VMOVDQA64Z256mr addr:$dst, VR256X:$src)>;
def : Pat<(alignedstore (v16i16 VR256X:$src), addr:$dst),
- (VMOVDQA32Z256mr addr:$dst, VR256X:$src)>;
+ (VMOVDQA64Z256mr addr:$dst, VR256X:$src)>;
def : Pat<(alignedstore (v32i8 VR256X:$src), addr:$dst),
- (VMOVDQA32Z256mr addr:$dst, VR256X:$src)>;
+ (VMOVDQA64Z256mr addr:$dst, VR256X:$src)>;
+ def : Pat<(store (v8i32 VR256X:$src), addr:$dst),
+ (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>;
def : Pat<(store (v16i16 VR256X:$src), addr:$dst),
- (VMOVDQU32Z256mr addr:$dst, VR256X:$src)>;
+ (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>;
def : Pat<(store (v32i8 VR256X:$src), addr:$dst),
- (VMOVDQU32Z256mr addr:$dst, VR256X:$src)>;
+ (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>;
}
multiclass masked_move_for_extract<string InstrStr, X86VectorVTInfo From,
@@ -3495,7 +3735,7 @@ multiclass masked_move_for_extract<string InstrStr, X86VectorVTInfo From,
To.RC:$src0)),
(Cast.VT (!cast<Instruction>(InstrStr#"rrk")
Cast.RC:$src0, Cast.KRCWM:$mask,
- (EXTRACT_SUBREG From.RC:$src, To.SubRegIdx)))>;
+ (To.VT (EXTRACT_SUBREG From.RC:$src, To.SubRegIdx))))>;
def : Pat<(Cast.VT (vselect Cast.KRCWM:$mask,
(bitconvert
@@ -3504,7 +3744,7 @@ multiclass masked_move_for_extract<string InstrStr, X86VectorVTInfo From,
Cast.ImmAllZerosV)),
(Cast.VT (!cast<Instruction>(InstrStr#"rrkz")
Cast.KRCWM:$mask,
- (EXTRACT_SUBREG From.RC:$src, To.SubRegIdx)))>;
+ (To.VT (EXTRACT_SUBREG From.RC:$src, To.SubRegIdx))))>;
}
@@ -3561,40 +3801,40 @@ let ExeDomain = SSEPackedInt in {
def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src),
"vmovd\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst,
- (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
- EVEX, Sched<[WriteMove]>;
+ (v4i32 (scalar_to_vector GR32:$src)))]>,
+ EVEX, Sched<[WriteVecMoveFromGpr]>;
def VMOVDI2PDIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src),
"vmovd\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst,
- (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
- IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteLoad]>;
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
+ EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteVecLoad]>;
def VMOV64toPQIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR64:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst,
- (v2i64 (scalar_to_vector GR64:$src)))],
- IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
+ (v2i64 (scalar_to_vector GR64:$src)))]>,
+ EVEX, VEX_W, Sched<[WriteVecMoveFromGpr]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def VMOV64toPQIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst),
(ins i64mem:$src),
- "vmovq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVDQ>,
- EVEX, VEX_W, EVEX_CD8<64, CD8VT1>, Sched<[WriteLoad]>;
+ "vmovq\t{$src, $dst|$dst, $src}", []>,
+ EVEX, VEX_W, EVEX_CD8<64, CD8VT1>, Sched<[WriteVecLoad]>;
let isCodeGenOnly = 1 in {
def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64X:$dst), (ins GR64:$src),
"vmovq\t{$src, $dst|$dst, $src}",
- [(set FR64X:$dst, (bitconvert GR64:$src))],
- IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
+ [(set FR64X:$dst, (bitconvert GR64:$src))]>,
+ EVEX, VEX_W, Sched<[WriteVecMoveFromGpr]>;
def VMOV64toSDZrm : AVX512XSI<0x7E, MRMSrcMem, (outs FR64X:$dst), (ins i64mem:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set FR64X:$dst, (bitconvert (loadi64 addr:$src)))]>,
- EVEX, VEX_W, EVEX_CD8<8, CD8VT8>, Sched<[WriteLoad]>;
+ EVEX, VEX_W, EVEX_CD8<8, CD8VT8>, Sched<[WriteVecLoad]>;
def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64X:$src),
"vmovq\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (bitconvert FR64X:$src))],
- IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
+ [(set GR64:$dst, (bitconvert FR64X:$src))]>,
+ EVEX, VEX_W, Sched<[WriteVecMoveFromGpr]>;
def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64X:$src),
"vmovq\t{$src, $dst|$dst, $src}",
- [(store (i64 (bitconvert FR64X:$src)), addr:$dst)],
- IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>,
+ [(store (i64 (bitconvert FR64X:$src)), addr:$dst)]>,
+ EVEX, VEX_W, Sched<[WriteVecStore]>,
EVEX_CD8<64, CD8VT1>;
}
} // ExeDomain = SSEPackedInt
@@ -3604,13 +3844,13 @@ def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64X:$
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src),
"vmovd\t{$src, $dst|$dst, $src}",
- [(set FR32X:$dst, (bitconvert GR32:$src))],
- IIC_SSE_MOVDQ>, EVEX, Sched<[WriteMove]>;
+ [(set FR32X:$dst, (bitconvert GR32:$src))]>,
+ EVEX, Sched<[WriteVecMoveFromGpr]>;
def VMOVDI2SSZrm : AVX512BI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src),
"vmovd\t{$src, $dst|$dst, $src}",
- [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))],
- IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteLoad]>;
+ [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))]>,
+ EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteVecLoad]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
// Move doubleword from xmm register to r/m32
@@ -3619,14 +3859,14 @@ let ExeDomain = SSEPackedInt in {
def VMOVPDI2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src),
"vmovd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (extractelt (v4i32 VR128X:$src),
- (iPTR 0)))], IIC_SSE_MOVD_ToGP>,
- EVEX, Sched<[WriteMove]>;
+ (iPTR 0)))]>,
+ EVEX, Sched<[WriteVecMoveToGpr]>;
def VMOVPDI2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs),
(ins i32mem:$dst, VR128X:$src),
"vmovd\t{$src, $dst|$dst, $src}",
[(store (i32 (extractelt (v4i32 VR128X:$src),
- (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
- EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteStore]>;
+ (iPTR 0))), addr:$dst)]>,
+ EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt
// Move quadword from xmm1 register to r/m64
@@ -3635,44 +3875,47 @@ let ExeDomain = SSEPackedInt in {
def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (extractelt (v2i64 VR128X:$src),
- (iPTR 0)))],
- IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W, Sched<[WriteMove]>,
- Requires<[HasAVX512, In64BitMode]>;
+ (iPTR 0)))]>,
+ PD, EVEX, VEX_W, Sched<[WriteVecMoveToGpr]>,
+ Requires<[HasAVX512]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def VMOVPQIto64Zmr : I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128X:$src),
- "vmovq\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W, Sched<[WriteStore]>,
+ "vmovq\t{$src, $dst|$dst, $src}", []>, PD,
+ EVEX, VEX_W, Sched<[WriteVecStore]>,
Requires<[HasAVX512, In64BitMode]>;
def VMOVPQI2QIZmr : I<0xD6, MRMDestMem, (outs),
(ins i64mem:$dst, VR128X:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(store (extractelt (v2i64 VR128X:$src), (iPTR 0)),
- addr:$dst)], IIC_SSE_MOVDQ>,
+ addr:$dst)]>,
EVEX, PD, VEX_W, EVEX_CD8<64, CD8VT1>,
- Sched<[WriteStore]>, Requires<[HasAVX512, In64BitMode]>;
+ Sched<[WriteVecStore]>, Requires<[HasAVX512]>;
-let hasSideEffects = 0 in
+let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in
def VMOVPQI2QIZrr : AVX512BI<0xD6, MRMDestReg, (outs VR128X:$dst),
(ins VR128X:$src),
- "vmovq.s\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVDQ>,
- EVEX, VEX_W, Sched<[WriteMove]>;
+ "vmovq\t{$src, $dst|$dst, $src}", []>,
+ EVEX, VEX_W, Sched<[SchedWriteVecLogic.XMM]>;
} // ExeDomain = SSEPackedInt
+def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
+ (VMOVPQI2QIZrr VR128X:$dst, VR128X:$src), 0>;
+
// Move Scalar Single to Double Int
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst),
(ins FR32X:$src),
"vmovd\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (bitconvert FR32X:$src))],
- IIC_SSE_MOVD_ToGP>, EVEX, Sched<[WriteMove]>;
+ [(set GR32:$dst, (bitconvert FR32X:$src))]>,
+ EVEX, Sched<[WriteVecMoveToGpr]>;
def VMOVSS2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs),
(ins i32mem:$dst, FR32X:$src),
"vmovd\t{$src, $dst|$dst, $src}",
- [(store (i32 (bitconvert FR32X:$src)), addr:$dst)],
- IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteStore]>;
+ [(store (i32 (bitconvert FR32X:$src)), addr:$dst)]>,
+ EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
// Move Quadword Int to Packed Quadword Int
@@ -3683,20 +3926,27 @@ def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst,
(v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
- EVEX, VEX_W, EVEX_CD8<8, CD8VT8>, Sched<[WriteLoad]>;
+ EVEX, VEX_W, EVEX_CD8<8, CD8VT8>, Sched<[WriteVecLoad]>;
} // ExeDomain = SSEPackedInt
+// Allow "vmovd" but print "vmovq".
+def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
+ (VMOV64toPQIZrr VR128X:$dst, GR64:$src), 0>;
+def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
+ (VMOVPQIto64Zrr GR64:$dst, VR128X:$src), 0>;
+
//===----------------------------------------------------------------------===//
// AVX-512 MOVSS, MOVSD
//===----------------------------------------------------------------------===//
multiclass avx512_move_scalar<string asm, SDNode OpNode,
X86VectorVTInfo _> {
+ let Predicates = [HasAVX512, OptForSize] in
def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, _.RC:$src2)))],
- _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, Sched<[WriteMove]>;
+ _.ExeDomain>, EVEX_4V, Sched<[SchedWriteFShuffle.XMM]>;
def rrkz : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst {${mask}} {z}|",
@@ -3704,7 +3954,7 @@ multiclass avx512_move_scalar<string asm, SDNode OpNode,
[(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask,
(_.VT (OpNode _.RC:$src1, _.RC:$src2)),
_.ImmAllZerosV)))],
- _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_KZ, Sched<[WriteMove]>;
+ _.ExeDomain>, EVEX_4V, EVEX_KZ, Sched<[SchedWriteFShuffle.XMM]>;
let Constraints = "$src0 = $dst" in
def rrk : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
(ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
@@ -3713,34 +3963,35 @@ multiclass avx512_move_scalar<string asm, SDNode OpNode,
[(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask,
(_.VT (OpNode _.RC:$src1, _.RC:$src2)),
(_.VT _.RC:$src0))))],
- _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_K, Sched<[WriteMove]>;
+ _.ExeDomain>, EVEX_4V, EVEX_K, Sched<[SchedWriteFShuffle.XMM]>;
let canFoldAsLoad = 1, isReMaterializable = 1 in
def rm : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
[(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))],
- _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX, Sched<[WriteLoad]>;
+ _.ExeDomain>, EVEX, Sched<[WriteFLoad]>;
let mayLoad = 1, hasSideEffects = 0 in {
let Constraints = "$src0 = $dst" in
def rmk : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src0, _.KRCWM:$mask, _.ScalarMemOp:$src),
!strconcat(asm, "\t{$src, $dst {${mask}}|",
"$dst {${mask}}, $src}"),
- [], _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX, EVEX_K, Sched<[WriteLoad]>;
+ [], _.ExeDomain>, EVEX, EVEX_K, Sched<[WriteFLoad]>;
def rmkz : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.ScalarMemOp:$src),
!strconcat(asm, "\t{$src, $dst {${mask}} {z}|",
"$dst {${mask}} {z}, $src}"),
- [], _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX, EVEX_KZ, Sched<[WriteLoad]>;
+ [], _.ExeDomain>, EVEX, EVEX_KZ, Sched<[WriteFLoad]>;
}
def mr: AVX512PI<0x11, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, _.FRC:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(store _.FRC:$src, addr:$dst)], _.ExeDomain, IIC_SSE_MOV_S_MR>,
- EVEX, Sched<[WriteStore]>;
+ [(store _.FRC:$src, addr:$dst)], _.ExeDomain>,
+ EVEX, Sched<[WriteFStore]>;
let mayStore = 1, hasSideEffects = 0 in
def mrk: AVX512PI<0x11, MRMDestMem, (outs),
(ins _.ScalarMemOp:$dst, VK1WM:$mask, _.FRC:$src),
!strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
- [], _.ExeDomain, IIC_SSE_MOV_S_MR>, EVEX, EVEX_K, Sched<[WriteStore]>;
+ [], _.ExeDomain>, EVEX, EVEX_K, Sched<[WriteFStore]>,
+ NotMemoryFoldable;
}
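(Editorial aside, not part of the patch.) The masked register-to-register forms defined by this multiclass correspond to the scalar mask-move intrinsics; a minimal C sketch, assuming clang's <immintrin.h>, with an illustrative function name:

#include <immintrin.h>

// Result element 0 is b[0] if bit 0 of k is set, otherwise src[0];
// elements 1..3 come from a. Expected to select VMOVSSZrrk.
__m128 select_low_ss(__m128 src, __mmask8 k, __m128 a, __m128 b) {
  return _mm_mask_move_ss(src, k, a, b);
}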
defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, f32x_info>,
@@ -3755,24 +4006,24 @@ multiclass avx512_move_scalar_lowering<string InstrStr, SDNode OpNode,
def : Pat<(_.VT (OpNode _.RC:$src0,
(_.VT (scalar_to_vector
- (_.EltVT (X86selects (scalar_to_vector (and (i8 (trunc GR32:$mask)), (i8 1))),
+ (_.EltVT (X86selects VK1WM:$mask,
(_.EltVT _.FRC:$src1),
(_.EltVT _.FRC:$src2))))))),
(!cast<Instruction>(InstrStr#rrk)
- (COPY_TO_REGCLASS _.FRC:$src2, _.RC),
- (COPY_TO_REGCLASS GR32:$mask, VK1WM),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, _.RC)),
+ VK1WM:$mask,
(_.VT _.RC:$src0),
- (COPY_TO_REGCLASS _.FRC:$src1, _.RC))>;
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src1, _.RC)))>;
def : Pat<(_.VT (OpNode _.RC:$src0,
(_.VT (scalar_to_vector
- (_.EltVT (X86selects (scalar_to_vector (and (i8 (trunc GR32:$mask)), (i8 1))),
+ (_.EltVT (X86selects VK1WM:$mask,
(_.EltVT _.FRC:$src1),
(_.EltVT ZeroFP))))))),
(!cast<Instruction>(InstrStr#rrkz)
- (COPY_TO_REGCLASS GR32:$mask, VK1WM),
+ VK1WM:$mask,
(_.VT _.RC:$src0),
- (COPY_TO_REGCLASS _.FRC:$src1, _.RC))>;
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src1, _.RC)))>;
}
multiclass avx512_store_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _,
@@ -3780,9 +4031,7 @@ multiclass avx512_store_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _,
def : Pat<(masked_store addr:$dst, Mask,
(_.info512.VT (insert_subvector undef,
- (_.info256.VT (insert_subvector undef,
- (_.info128.VT _.info128.RC:$src),
- (iPTR 0))),
+ (_.info128.VT _.info128.RC:$src),
(iPTR 0)))),
(!cast<Instruction>(InstrStr#mrk) addr:$dst,
(COPY_TO_REGCLASS MaskRC:$mask, VK1WM),
@@ -3797,9 +4046,7 @@ multiclass avx512_store_scalar_lowering_subreg<string InstrStr,
def : Pat<(masked_store addr:$dst, Mask,
(_.info512.VT (insert_subvector undef,
- (_.info256.VT (insert_subvector undef,
- (_.info128.VT _.info128.RC:$src),
- (iPTR 0))),
+ (_.info128.VT _.info128.RC:$src),
(iPTR 0)))),
(!cast<Instruction>(InstrStr#mrk) addr:$dst,
(COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
@@ -3807,6 +4054,31 @@ def : Pat<(masked_store addr:$dst, Mask,
}
+// This matches the more recent codegen from clang that avoids emitting a
+// 512-bit masked store directly. Codegen will widen a 128-bit masked store to
+// 512 bits on AVX512F-only targets.
+multiclass avx512_store_scalar_lowering_subreg2<string InstrStr,
+ AVX512VLVectorVTInfo _,
+ dag Mask512, dag Mask128,
+ RegisterClass MaskRC,
+ SubRegIndex subreg> {
+
+// AVX512F pattern.
+def : Pat<(masked_store addr:$dst, Mask512,
+ (_.info512.VT (insert_subvector undef,
+ (_.info128.VT _.info128.RC:$src),
+ (iPTR 0)))),
+ (!cast<Instruction>(InstrStr#mrk) addr:$dst,
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
+ (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
+
+// AVX512VL pattern.
+def : Pat<(masked_store addr:$dst, Mask128, (_.info128.VT _.info128.RC:$src)),
+ (!cast<Instruction>(InstrStr#mrk) addr:$dst,
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
+ (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
+}
+
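(Editorial aside, not part of the patch.) A minimal C sketch of the source-level store these patterns are aimed at, assuming clang's <immintrin.h>; the function name is illustrative. On an AVX512F-only target the 128-bit masked store below is widened to 512 bits during legalization, and the AVX512F pattern above folds it back to a single masked VMOVSS store:

#include <immintrin.h>

// Store the low float of v to dst only if bit 0 of k is set.
void store_low_if(float *dst, __mmask8 k, __m128 v) {
  _mm_mask_store_ss(dst, k, v);
}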
multiclass avx512_load_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _,
dag Mask, RegisterClass MaskRC> {
@@ -3822,9 +4094,7 @@ def : Pat<(_.info128.VT (extract_subvector
def : Pat<(_.info128.VT (extract_subvector
(_.info512.VT (masked_load addr:$srcAddr, Mask,
(_.info512.VT (insert_subvector undef,
- (_.info256.VT (insert_subvector undef,
- (_.info128.VT (X86vzmovl _.info128.RC:$src)),
- (iPTR 0))),
+ (_.info128.VT (X86vzmovl _.info128.RC:$src)),
(iPTR 0))))),
(iPTR 0))),
(!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
@@ -3850,9 +4120,7 @@ def : Pat<(_.info128.VT (extract_subvector
def : Pat<(_.info128.VT (extract_subvector
(_.info512.VT (masked_load addr:$srcAddr, Mask,
(_.info512.VT (insert_subvector undef,
- (_.info256.VT (insert_subvector undef,
- (_.info128.VT (X86vzmovl _.info128.RC:$src)),
- (iPTR 0))),
+ (_.info128.VT (X86vzmovl _.info128.RC:$src)),
(iPTR 0))))),
(iPTR 0))),
(!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
@@ -3861,6 +4129,48 @@ def : Pat<(_.info128.VT (extract_subvector
}
+// This matches the more recent codegen from clang that avoids emitting a
+// 512-bit masked load directly. Codegen will widen a 128-bit masked load to
+// 512 bits on AVX512F-only targets.
+multiclass avx512_load_scalar_lowering_subreg2<string InstrStr,
+ AVX512VLVectorVTInfo _,
+ dag Mask512, dag Mask128,
+ RegisterClass MaskRC,
+ SubRegIndex subreg> {
+// AVX512F patterns.
+def : Pat<(_.info128.VT (extract_subvector
+ (_.info512.VT (masked_load addr:$srcAddr, Mask512,
+ (_.info512.VT (bitconvert
+ (v16i32 immAllZerosV))))),
+ (iPTR 0))),
+ (!cast<Instruction>(InstrStr#rmkz)
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
+ addr:$srcAddr)>;
+
+def : Pat<(_.info128.VT (extract_subvector
+ (_.info512.VT (masked_load addr:$srcAddr, Mask512,
+ (_.info512.VT (insert_subvector undef,
+ (_.info128.VT (X86vzmovl _.info128.RC:$src)),
+ (iPTR 0))))),
+ (iPTR 0))),
+ (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
+ addr:$srcAddr)>;
+
+// AVX512VL patterns.
+def : Pat<(_.info128.VT (masked_load addr:$srcAddr, Mask128,
+ (_.info128.VT (bitconvert (v4i32 immAllZerosV))))),
+ (!cast<Instruction>(InstrStr#rmkz)
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
+ addr:$srcAddr)>;
+
+def : Pat<(_.info128.VT (masked_load addr:$srcAddr, Mask128,
+ (_.info128.VT (X86vzmovl _.info128.RC:$src)))),
+ (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
+ addr:$srcAddr)>;
+}
+
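(Editorial aside, not part of the patch.) The corresponding loads, again assuming <immintrin.h> and with illustrative function names; the zero-masking and merge-masking variants map onto the rmkz/rmk patterns above:

#include <immintrin.h>

// Zero-masking: load a float if bit 0 of k is set, otherwise produce 0.0f.
__m128 load_or_zero(__mmask8 k, const float *src) {
  return _mm_maskz_load_ss(k, src);
}

// Merge-masking: load a float if bit 0 of k is set, otherwise keep src0's low element.
__m128 load_or_keep(__m128 src0, __mmask8 k, const float *src) {
  return _mm_mask_load_ss(src0, k, src);
}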
defm : avx512_move_scalar_lowering<"VMOVSSZ", X86Movss, fp32imm0, v4f32x_info>;
defm : avx512_move_scalar_lowering<"VMOVSDZ", X86Movsd, fp64imm0, v2f64x_info>;
@@ -3871,6 +4181,31 @@ defm : avx512_store_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
defm : avx512_store_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info,
(v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>;
+defm : avx512_store_scalar_lowering_subreg2<"VMOVSSZ", avx512vl_f32_info,
+ (v16i1 (insert_subvector
+ (v16i1 immAllZerosV),
+ (v4i1 (extract_subvector
+ (v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
+ (iPTR 0))),
+ (iPTR 0))),
+ (v4i1 (extract_subvector
+ (v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
+ (iPTR 0))), GR8, sub_8bit>;
+defm : avx512_store_scalar_lowering_subreg2<"VMOVSDZ", avx512vl_f64_info,
+ (v8i1
+ (extract_subvector
+ (v16i1
+ (insert_subvector
+ (v16i1 immAllZerosV),
+ (v2i1 (extract_subvector
+ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))),
+ (iPTR 0))),
+ (iPTR 0))),
+ (iPTR 0))),
+ (v2i1 (extract_subvector
+ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))),
+ (iPTR 0))), GR8, sub_8bit>;
+
defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
(v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>;
defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
@@ -3878,121 +4213,203 @@ defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
defm : avx512_load_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info,
(v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>;
-def : Pat<(f32 (X86selects (scalar_to_vector (and GR8:$mask, (i8 1))),
- (f32 FR32X:$src1), (f32 FR32X:$src2))),
- (COPY_TO_REGCLASS
- (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X),
- (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF),
- GR8:$mask, sub_8bit)), VK1WM),
- (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src1, VR128X)),
- FR32X)>;
+defm : avx512_load_scalar_lowering_subreg2<"VMOVSSZ", avx512vl_f32_info,
+ (v16i1 (insert_subvector
+ (v16i1 immAllZerosV),
+ (v4i1 (extract_subvector
+ (v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
+ (iPTR 0))),
+ (iPTR 0))),
+ (v4i1 (extract_subvector
+ (v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
+ (iPTR 0))), GR8, sub_8bit>;
+defm : avx512_load_scalar_lowering_subreg2<"VMOVSDZ", avx512vl_f64_info,
+ (v8i1
+ (extract_subvector
+ (v16i1
+ (insert_subvector
+ (v16i1 immAllZerosV),
+ (v2i1 (extract_subvector
+ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))),
+ (iPTR 0))),
+ (iPTR 0))),
+ (iPTR 0))),
+ (v2i1 (extract_subvector
+ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))),
+ (iPTR 0))), GR8, sub_8bit>;
def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))),
- (COPY_TO_REGCLASS (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X),
+ (COPY_TO_REGCLASS (v4f32 (VMOVSSZrrk
+ (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)),
VK1WM:$mask, (v4f32 (IMPLICIT_DEF)),
- (COPY_TO_REGCLASS FR32X:$src1, VR128X)), FR32X)>;
+ (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)))), FR32X)>;
-def : Pat<(f64 (X86selects (scalar_to_vector (and GR8:$mask, (i8 1))),
- (f64 FR64X:$src1), (f64 FR64X:$src2))),
- (COPY_TO_REGCLASS
- (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X),
- (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF),
- GR8:$mask, sub_8bit)), VK1WM),
- (v2f64 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR64X:$src1, VR128X)),
- FR64X)>;
+def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), fp32imm0)),
+ (COPY_TO_REGCLASS (v4f32 (VMOVSSZrrkz VK1WM:$mask, (v4f32 (IMPLICIT_DEF)),
+ (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)))), FR32X)>;
def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))),
- (COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X),
+ (COPY_TO_REGCLASS (v2f64 (VMOVSDZrrk
+ (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)),
VK1WM:$mask, (v2f64 (IMPLICIT_DEF)),
- (COPY_TO_REGCLASS FR64X:$src1, VR128X)), FR64X)>;
+ (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)))), FR64X)>;
-def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask),
- (VMOVSSZmrk addr:$dst, (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$mask, sub_8bit)), VK1WM),
- (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), fpimm0)),
+ (COPY_TO_REGCLASS (v2f64 (VMOVSDZrrkz VK1WM:$mask, (v2f64 (IMPLICIT_DEF)),
+ (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)))), FR64X)>;
-let hasSideEffects = 0 in {
+let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
- "vmovss.s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [], IIC_SSE_MOV_S_RR>, XS, EVEX_4V, VEX_LIG,
- FoldGenData<"VMOVSSZrr">, Sched<[WriteMove]>;
+ "vmovss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, XS, EVEX_4V, VEX_LIG,
+ FoldGenData<"VMOVSSZrr">,
+ Sched<[SchedWriteFShuffle.XMM]>;
-let Constraints = "$src0 = $dst" in
+ let Constraints = "$src0 = $dst" in
def VMOVSSZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins f32x_info.RC:$src0, f32x_info.KRCWM:$mask,
VR128X:$src1, VR128X:$src2),
- "vmovss.s\t{$src2, $src1, $dst {${mask}}|"#
+ "vmovss\t{$src2, $src1, $dst {${mask}}|"#
"$dst {${mask}}, $src1, $src2}",
- [], IIC_SSE_MOV_S_RR>, EVEX_K, XS, EVEX_4V, VEX_LIG,
- FoldGenData<"VMOVSSZrrk">, Sched<[WriteMove]>;
+ []>, EVEX_K, XS, EVEX_4V, VEX_LIG,
+ FoldGenData<"VMOVSSZrrk">,
+ Sched<[SchedWriteFShuffle.XMM]>;
def VMOVSSZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins f32x_info.KRCWM:$mask, VR128X:$src1, VR128X:$src2),
- "vmovss.s\t{$src2, $src1, $dst {${mask}} {z}|"#
+ "vmovss\t{$src2, $src1, $dst {${mask}} {z}|"#
"$dst {${mask}} {z}, $src1, $src2}",
- [], IIC_SSE_MOV_S_RR>, EVEX_KZ, XS, EVEX_4V, VEX_LIG,
- FoldGenData<"VMOVSSZrrkz">, Sched<[WriteMove]>;
+ []>, EVEX_KZ, XS, EVEX_4V, VEX_LIG,
+ FoldGenData<"VMOVSSZrrkz">,
+ Sched<[SchedWriteFShuffle.XMM]>;
def VMOVSDZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
- "vmovsd.s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [], IIC_SSE_MOV_S_RR>, XD, EVEX_4V, VEX_LIG, VEX_W,
- FoldGenData<"VMOVSDZrr">, Sched<[WriteMove]>;
+ "vmovsd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, XD, EVEX_4V, VEX_LIG, VEX_W,
+ FoldGenData<"VMOVSDZrr">,
+ Sched<[SchedWriteFShuffle.XMM]>;
-let Constraints = "$src0 = $dst" in
+ let Constraints = "$src0 = $dst" in
def VMOVSDZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins f64x_info.RC:$src0, f64x_info.KRCWM:$mask,
VR128X:$src1, VR128X:$src2),
- "vmovsd.s\t{$src2, $src1, $dst {${mask}}|"#
+ "vmovsd\t{$src2, $src1, $dst {${mask}}|"#
"$dst {${mask}}, $src1, $src2}",
- [], IIC_SSE_MOV_S_RR>, EVEX_K, XD, EVEX_4V, VEX_LIG,
- VEX_W, FoldGenData<"VMOVSDZrrk">, Sched<[WriteMove]>;
+ []>, EVEX_K, XD, EVEX_4V, VEX_LIG,
+ VEX_W, FoldGenData<"VMOVSDZrrk">,
+ Sched<[SchedWriteFShuffle.XMM]>;
def VMOVSDZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins f64x_info.KRCWM:$mask, VR128X:$src1,
VR128X:$src2),
- "vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"#
+ "vmovsd\t{$src2, $src1, $dst {${mask}} {z}|"#
"$dst {${mask}} {z}, $src1, $src2}",
- [], IIC_SSE_MOV_S_RR>, EVEX_KZ, XD, EVEX_4V, VEX_LIG,
- VEX_W, FoldGenData<"VMOVSDZrrkz">, Sched<[WriteMove]>;
-}
+ []>, EVEX_KZ, XD, EVEX_4V, VEX_LIG,
+ VEX_W, FoldGenData<"VMOVSDZrrkz">,
+ Sched<[SchedWriteFShuffle.XMM]>;
+}
+
+def : InstAlias<"vmovss.s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (VMOVSSZrr_REV VR128X:$dst, VR128X:$src1, VR128X:$src2), 0>;
+def : InstAlias<"vmovss.s\t{$src2, $src1, $dst {${mask}}|"#
+ "$dst {${mask}}, $src1, $src2}",
+ (VMOVSSZrrk_REV VR128X:$dst, VK1WM:$mask,
+ VR128X:$src1, VR128X:$src2), 0>;
+def : InstAlias<"vmovss.s\t{$src2, $src1, $dst {${mask}} {z}|"#
+ "$dst {${mask}} {z}, $src1, $src2}",
+ (VMOVSSZrrkz_REV VR128X:$dst, VK1WM:$mask,
+ VR128X:$src1, VR128X:$src2), 0>;
+def : InstAlias<"vmovsd.s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (VMOVSDZrr_REV VR128X:$dst, VR128X:$src1, VR128X:$src2), 0>;
+def : InstAlias<"vmovsd.s\t{$src2, $src1, $dst {${mask}}|"#
+ "$dst {${mask}}, $src1, $src2}",
+ (VMOVSDZrrk_REV VR128X:$dst, VK1WM:$mask,
+ VR128X:$src1, VR128X:$src2), 0>;
+def : InstAlias<"vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"#
+ "$dst {${mask}} {z}, $src1, $src2}",
+ (VMOVSDZrrkz_REV VR128X:$dst, VK1WM:$mask,
+ VR128X:$src1, VR128X:$src2), 0>;
-let Predicates = [HasAVX512] in {
- let AddedComplexity = 15 in {
+let Predicates = [HasAVX512, OptForSize] in {
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))),
(VMOVSSZrr (v4f32 (AVX512_128_SET0)), VR128X:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128X:$src))),
(VMOVSSZrr (v4i32 (AVX512_128_SET0)), VR128X:$src)>;
- def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64X:$src)))),
- (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
- (COPY_TO_REGCLASS FR64X:$src, VR128))>;
- }
// Move low f32 and clear high bits.
def : Pat<(v8f32 (X86vzmovl (v8f32 VR256X:$src))),
(SUBREG_TO_REG (i32 0),
- (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
- (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)), sub_xmm)>;
+ (v4f32 (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
+ (v4f32 (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)))), sub_xmm)>;
def : Pat<(v8i32 (X86vzmovl (v8i32 VR256X:$src))),
(SUBREG_TO_REG (i32 0),
- (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
- (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)), sub_xmm)>;
+ (v4i32 (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
+ (v4i32 (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)))), sub_xmm)>;
+
+ def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
+ (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))), sub_xmm)>;
+ def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2i64 (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
+ (v2i64 (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)))), sub_xmm)>;
+
+ def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v4f32 (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
+ (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)))), sub_xmm)>;
+ def : Pat<(v16i32 (X86vzmovl (v16i32 VR512:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v4i32 (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
+ (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)))), sub_xmm)>;
+
+ def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
+ (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))), sub_xmm)>;
+
+ def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2i64 (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
+ (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)))), sub_xmm)>;
+
+}
+
+// Use 128-bit blends for OptForSpeed since BLENDs have better throughput than
+// VMOVSS/SD. Unfortunately, this loses the ability to use XMM16-31.
+let Predicates = [HasAVX512, OptForSpeed] in {
def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
(SUBREG_TO_REG (i32 0),
- (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
- (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)), sub_xmm)>;
+ (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
+ (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)),
+ (i8 1))), sub_xmm)>;
def : Pat<(v16i32 (X86vzmovl (v16i32 VR512:$src))),
(SUBREG_TO_REG (i32 0),
- (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
- (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)), sub_xmm)>;
+ (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
+ (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)),
+ (i8 3))), sub_xmm)>;
+
+ def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2f64 (VBLENDPDrri (v2f64 (V_SET0)),
+ (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)),
+ (i8 1))), sub_xmm)>;
+ def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2i64 (VPBLENDWrri (v2i64 (V_SET0)),
+ (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)),
+ (i8 0xf))), sub_xmm)>;
+}
+
+let Predicates = [HasAVX512] in {
- let AddedComplexity = 20 in {
// MOVSSrm zeros the high parts of the register; represent this
// with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
(COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
- def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
- (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
(COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
def : Pat<(v4f32 (X86vzload addr:$src)),
@@ -4002,8 +4419,6 @@ let Predicates = [HasAVX512] in {
// with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
(COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
- def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
- (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
(COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
@@ -4015,7 +4430,7 @@ let Predicates = [HasAVX512] in {
// 256-bit types
def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
(v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
+ (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
(v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
@@ -4031,7 +4446,7 @@ let Predicates = [HasAVX512] in {
// 512-bit types
def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
(v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
+ (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
def : Pat<(v16f32 (X86vzmovl (insert_subvector undef,
(v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
@@ -4042,164 +4457,127 @@ let Predicates = [HasAVX512] in {
(SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
def : Pat<(v8f64 (X86vzload addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
- }
+
def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
(v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
-
- // Move low f64 and clear high bits.
- def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
- (SUBREG_TO_REG (i32 0),
- (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
- (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)), sub_xmm)>;
- def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
- (SUBREG_TO_REG (i32 0),
- (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
- (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)), sub_xmm)>;
-
- def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
- (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
- (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>;
- def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
- (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
- (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)), sub_xmm)>;
+ (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
// Extract and store.
def : Pat<(store (f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))),
addr:$dst),
(VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>;
-
- // Shuffle with VMOVSS
- def : Pat<(v4i32 (X86Movss VR128X:$src1, VR128X:$src2)),
- (VMOVSSZrr (v4i32 VR128X:$src1), VR128X:$src2)>;
-
- def : Pat<(v4f32 (X86Movss VR128X:$src1, (scalar_to_vector FR32X:$src2))),
- (VMOVSSZrr VR128X:$src1,
- (COPY_TO_REGCLASS FR32X:$src2, VR128X))>;
-
- // Shuffle with VMOVSD
- def : Pat<(v2i64 (X86Movsd VR128X:$src1, VR128X:$src2)),
- (VMOVSDZrr VR128X:$src1, VR128X:$src2)>;
-
- def : Pat<(v2f64 (X86Movsd VR128X:$src1, (scalar_to_vector FR64X:$src2))),
- (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS FR64X:$src2, VR128X))>;
-
- def : Pat<(v2f64 (X86Movlpd VR128X:$src1, VR128X:$src2)),
- (VMOVSDZrr VR128X:$src1, VR128X:$src2)>;
- def : Pat<(v4f32 (X86Movlps VR128X:$src1, VR128X:$src2)),
- (VMOVSDZrr VR128X:$src1, VR128X:$src2)>;
}
-let AddedComplexity = 15 in
+let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst, (v2i64 (X86vzmovl
- (v2i64 VR128X:$src))))],
- IIC_SSE_MOVQ_RR>, EVEX, VEX_W;
+ (v2i64 VR128X:$src))))]>,
+ EVEX, VEX_W;
+}
let Predicates = [HasAVX512] in {
- let AddedComplexity = 15 in {
- def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
- (VMOVDI2PDIZrr GR32:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+ (VMOVDI2PDIZrr GR32:$src)>;
- def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
- (VMOV64toPQIZrr GR64:$src)>;
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
+ (VMOV64toPQIZrr GR64:$src)>;
- def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
- (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>;
+ def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+ (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
+ (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIZrr GR64:$src)), sub_xmm)>;
+
+ def : Pat<(v8i64 (X86vzmovl (insert_subvector undef,
+ (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
+ (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIZrr GR64:$src)), sub_xmm)>;
- def : Pat<(v8i64 (X86vzmovl (insert_subvector undef,
- (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>;
- }
  // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
- let AddedComplexity = 20 in {
- def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
- (VMOVDI2PDIZrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
- (VMOVDI2PDIZrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
- (VMOVDI2PDIZrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
- (VMOVDI2PDIZrm addr:$src)>;
- def : Pat<(v4i32 (X86vzload addr:$src)),
- (VMOVDI2PDIZrm addr:$src)>;
- def : Pat<(v8i32 (X86vzload addr:$src)),
- (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
- def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
- (VMOVQI2PQIZrm addr:$src)>;
- def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))),
- (VMOVZPQILo2PQIZrr VR128X:$src)>;
- def : Pat<(v2i64 (X86vzload addr:$src)),
- (VMOVQI2PQIZrm addr:$src)>;
- def : Pat<(v4i64 (X86vzload addr:$src)),
- (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
- }
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
+ (VMOVDI2PDIZrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+ (VMOVDI2PDIZrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+ (VMOVDI2PDIZrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzload addr:$src)),
+ (VMOVDI2PDIZrm addr:$src)>;
+ def : Pat<(v8i32 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
+ def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+ (VMOVQI2PQIZrm addr:$src)>;
+ def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))),
+ (VMOVZPQILo2PQIZrr VR128X:$src)>;
+ def : Pat<(v2i64 (X86vzload addr:$src)),
+ (VMOVQI2PQIZrm addr:$src)>;
+ def : Pat<(v4i64 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
// Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
(v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>;
+ (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrr GR32:$src)), sub_xmm)>;
def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
(v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>;
+ (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrr GR32:$src)), sub_xmm)>;
// Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext.
def : Pat<(v16i32 (X86vzload addr:$src)),
- (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
+ (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
def : Pat<(v8i64 (X86vzload addr:$src)),
- (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
+ (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
}
+
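(Editorial aside, not part of the patch.) The zero-extending GPR-to-vector moves covered by the patterns above arise from code such as the following, assuming <immintrin.h>; function names are illustrative. The EVEX VMOVDI2PDIZrr/VMOV64toPQIZrr forms are chosen when the EVEX encoding is required, e.g. when XMM16-31 are involved:

#include <immintrin.h>

// vmovd: move a 32-bit GPR into element 0 and zero the upper elements.
__m128i from_u32(int x)       { return _mm_cvtsi32_si128(x); }

// vmovq: move a 64-bit GPR into element 0 and zero element 1 (x86-64 only).
__m128i from_u64(long long x) { return _mm_cvtsi64_si128(x); }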
//===----------------------------------------------------------------------===//
// AVX-512 - Non-temporals
//===----------------------------------------------------------------------===//
-let SchedRW = [WriteLoad] in {
- def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst),
- (ins i512mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}",
- [], SSEPackedInt>, EVEX, T8PD, EVEX_V512,
- EVEX_CD8<64, CD8VF>;
- let Predicates = [HasVLX] in {
- def VMOVNTDQAZ256rm : AVX512PI<0x2A, MRMSrcMem, (outs VR256X:$dst),
- (ins i256mem:$src),
- "vmovntdqa\t{$src, $dst|$dst, $src}",
- [], SSEPackedInt>, EVEX, T8PD, EVEX_V256,
- EVEX_CD8<64, CD8VF>;
+def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst),
+ (ins i512mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}",
+ [], SSEPackedInt>, Sched<[SchedWriteVecMoveLS.ZMM.RM]>,
+ EVEX, T8PD, EVEX_V512, EVEX_CD8<64, CD8VF>;
- def VMOVNTDQAZ128rm : AVX512PI<0x2A, MRMSrcMem, (outs VR128X:$dst),
- (ins i128mem:$src),
- "vmovntdqa\t{$src, $dst|$dst, $src}",
- [], SSEPackedInt>, EVEX, T8PD, EVEX_V128,
- EVEX_CD8<64, CD8VF>;
- }
+let Predicates = [HasVLX] in {
+ def VMOVNTDQAZ256rm : AVX512PI<0x2A, MRMSrcMem, (outs VR256X:$dst),
+ (ins i256mem:$src),
+ "vmovntdqa\t{$src, $dst|$dst, $src}",
+ [], SSEPackedInt>, Sched<[SchedWriteVecMoveLS.YMM.RM]>,
+ EVEX, T8PD, EVEX_V256, EVEX_CD8<64, CD8VF>;
+
+ def VMOVNTDQAZ128rm : AVX512PI<0x2A, MRMSrcMem, (outs VR128X:$dst),
+ (ins i128mem:$src),
+ "vmovntdqa\t{$src, $dst|$dst, $src}",
+ [], SSEPackedInt>, Sched<[SchedWriteVecMoveLS.XMM.RM]>,
+ EVEX, T8PD, EVEX_V128, EVEX_CD8<64, CD8VF>;
}
multiclass avx512_movnt<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- PatFrag st_frag = alignednontemporalstore,
- InstrItinClass itin = IIC_SSE_MOVNT> {
- let SchedRW = [WriteStore], AddedComplexity = 400 in
+ X86SchedWriteMoveLS Sched,
+ PatFrag st_frag = alignednontemporalstore> {
+ let SchedRW = [Sched.MR], AddedComplexity = 400 in
def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(st_frag (_.VT _.RC:$src), addr:$dst)],
- _.ExeDomain, itin>, EVEX, EVEX_CD8<_.EltSize, CD8VF>;
+ _.ExeDomain>, EVEX, EVEX_CD8<_.EltSize, CD8VF>;
}
multiclass avx512_movnt_vl<bits<8> opc, string OpcodeStr,
- AVX512VLVectorVTInfo VTInfo> {
+ AVX512VLVectorVTInfo VTInfo,
+ X86SchedWriteMoveLSWidths Sched> {
let Predicates = [HasAVX512] in
- defm Z : avx512_movnt<opc, OpcodeStr, VTInfo.info512>, EVEX_V512;
+ defm Z : avx512_movnt<opc, OpcodeStr, VTInfo.info512, Sched.ZMM>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
- defm Z256 : avx512_movnt<opc, OpcodeStr, VTInfo.info256>, EVEX_V256;
- defm Z128 : avx512_movnt<opc, OpcodeStr, VTInfo.info128>, EVEX_V128;
+ defm Z256 : avx512_movnt<opc, OpcodeStr, VTInfo.info256, Sched.YMM>, EVEX_V256;
+ defm Z128 : avx512_movnt<opc, OpcodeStr, VTInfo.info128, Sched.XMM>, EVEX_V128;
}
}
-defm VMOVNTDQ : avx512_movnt_vl<0xE7, "vmovntdq", avx512vl_i64_info>, PD;
-defm VMOVNTPD : avx512_movnt_vl<0x2B, "vmovntpd", avx512vl_f64_info>, PD, VEX_W;
-defm VMOVNTPS : avx512_movnt_vl<0x2B, "vmovntps", avx512vl_f32_info>, PS;
+defm VMOVNTDQ : avx512_movnt_vl<0xE7, "vmovntdq", avx512vl_i64_info,
+ SchedWriteVecMoveLSNT>, PD;
+defm VMOVNTPD : avx512_movnt_vl<0x2B, "vmovntpd", avx512vl_f64_info,
+ SchedWriteFMoveLSNT>, PD, VEX_W;
+defm VMOVNTPS : avx512_movnt_vl<0x2B, "vmovntps", avx512vl_f32_info,
+ SchedWriteFMoveLSNT>, PS;
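(Editorial aside, not part of the patch.) These definitions back the 512-bit streaming store/load intrinsics; a minimal sketch assuming <immintrin.h>, with illustrative function names:

#include <immintrin.h>

// vmovntps: non-temporal (write-combining) store of 16 floats.
void nt_store_f(float *dst, __m512 v)    { _mm512_stream_ps(dst, v); }

// vmovntdq: non-temporal store of a 512-bit integer vector.
void nt_store_i(__m512i *dst, __m512i v) { _mm512_stream_si512(dst, v); }

// vmovntdqa: non-temporal load from aligned memory.
__m512i nt_load(void *src)               { return _mm512_stream_load_si512(src); }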
let Predicates = [HasAVX512], AddedComplexity = 400 in {
def : Pat<(alignednontemporalstore (v16i32 VR512:$src), addr:$dst),
@@ -4251,131 +4629,135 @@ let Predicates = [HasVLX], AddedComplexity = 400 in {
// AVX-512 - Integer arithmetic
//
multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _, OpndItins itins,
+ X86VectorVTInfo _, X86FoldableSchedWrite sched,
bit IsCommutable = 0> {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, _.RC:$src2)),
- itins.rr, IsCommutable>, AVX512BIBase, EVEX_4V,
- Sched<[itins.Sched]>;
+ IsCommutable>, AVX512BIBase, EVEX_4V,
+ Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1,
- (bitconvert (_.LdFrag addr:$src2)))),
- itins.rm>, AVX512BIBase, EVEX_4V,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (_.LdFrag addr:$src2))))>,
+ AVX512BIBase, EVEX_4V,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _, OpndItins itins,
+ X86VectorVTInfo _, X86FoldableSchedWrite sched,
bit IsCommutable = 0> :
- avx512_binop_rm<opc, OpcodeStr, OpNode, _, itins, IsCommutable> {
+ avx512_binop_rm<opc, OpcodeStr, OpNode, _, sched, IsCommutable> {
defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
(_.VT (OpNode _.RC:$src1,
(X86VBroadcast
- (_.ScalarLdFrag addr:$src2)))),
- itins.rm>, AVX512BIBase, EVEX_4V, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.ScalarLdFrag addr:$src2))))>,
+ AVX512BIBase, EVEX_4V, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass avx512_binop_rm_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
- AVX512VLVectorVTInfo VTInfo, OpndItins itins,
- Predicate prd, bit IsCommutable = 0> {
+ AVX512VLVectorVTInfo VTInfo,
+ X86SchedWriteWidths sched, Predicate prd,
+ bit IsCommutable = 0> {
let Predicates = [prd] in
- defm Z : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info512, itins,
+ defm Z : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info512, sched.ZMM,
IsCommutable>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info256, itins,
- IsCommutable>, EVEX_V256;
- defm Z128 : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info128, itins,
- IsCommutable>, EVEX_V128;
+ defm Z256 : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info256,
+ sched.YMM, IsCommutable>, EVEX_V256;
+ defm Z128 : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info128,
+ sched.XMM, IsCommutable>, EVEX_V128;
}
}
multiclass avx512_binop_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
- AVX512VLVectorVTInfo VTInfo, OpndItins itins,
- Predicate prd, bit IsCommutable = 0> {
+ AVX512VLVectorVTInfo VTInfo,
+ X86SchedWriteWidths sched, Predicate prd,
+ bit IsCommutable = 0> {
let Predicates = [prd] in
- defm Z : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info512, itins,
+ defm Z : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info512, sched.ZMM,
IsCommutable>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info256, itins,
- IsCommutable>, EVEX_V256;
- defm Z128 : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info128, itins,
- IsCommutable>, EVEX_V128;
+ defm Z256 : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info256,
+ sched.YMM, IsCommutable>, EVEX_V256;
+ defm Z128 : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info128,
+ sched.XMM, IsCommutable>, EVEX_V128;
}
}
multiclass avx512_binop_rm_vl_q<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, Predicate prd,
+ X86SchedWriteWidths sched, Predicate prd,
bit IsCommutable = 0> {
defm NAME : avx512_binop_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i64_info,
- itins, prd, IsCommutable>,
- VEX_W, EVEX_CD8<64, CD8VF>;
+ sched, prd, IsCommutable>,
+ VEX_W, EVEX_CD8<64, CD8VF>;
}
multiclass avx512_binop_rm_vl_d<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, Predicate prd,
+ X86SchedWriteWidths sched, Predicate prd,
bit IsCommutable = 0> {
defm NAME : avx512_binop_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i32_info,
- itins, prd, IsCommutable>, EVEX_CD8<32, CD8VF>;
+ sched, prd, IsCommutable>, EVEX_CD8<32, CD8VF>;
}
multiclass avx512_binop_rm_vl_w<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, Predicate prd,
+ X86SchedWriteWidths sched, Predicate prd,
bit IsCommutable = 0> {
defm NAME : avx512_binop_rm_vl<opc, OpcodeStr, OpNode, avx512vl_i16_info,
- itins, prd, IsCommutable>, EVEX_CD8<16, CD8VF>,
- VEX_WIG;
+ sched, prd, IsCommutable>, EVEX_CD8<16, CD8VF>,
+ VEX_WIG;
}
multiclass avx512_binop_rm_vl_b<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, Predicate prd,
+ X86SchedWriteWidths sched, Predicate prd,
bit IsCommutable = 0> {
defm NAME : avx512_binop_rm_vl<opc, OpcodeStr, OpNode, avx512vl_i8_info,
- itins, prd, IsCommutable>, EVEX_CD8<8, CD8VF>,
- VEX_WIG;
+ sched, prd, IsCommutable>, EVEX_CD8<8, CD8VF>,
+ VEX_WIG;
}
multiclass avx512_binop_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
- SDNode OpNode, OpndItins itins, Predicate prd,
- bit IsCommutable = 0> {
- defm Q : avx512_binop_rm_vl_q<opc_q, OpcodeStr#"q", OpNode, itins, prd,
+ SDNode OpNode, X86SchedWriteWidths sched,
+ Predicate prd, bit IsCommutable = 0> {
+ defm Q : avx512_binop_rm_vl_q<opc_q, OpcodeStr#"q", OpNode, sched, prd,
IsCommutable>;
- defm D : avx512_binop_rm_vl_d<opc_d, OpcodeStr#"d", OpNode, itins, prd,
+ defm D : avx512_binop_rm_vl_d<opc_d, OpcodeStr#"d", OpNode, sched, prd,
IsCommutable>;
}
multiclass avx512_binop_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr,
- SDNode OpNode, OpndItins itins, Predicate prd,
- bit IsCommutable = 0> {
- defm W : avx512_binop_rm_vl_w<opc_w, OpcodeStr#"w", OpNode, itins, prd,
+ SDNode OpNode, X86SchedWriteWidths sched,
+ Predicate prd, bit IsCommutable = 0> {
+ defm W : avx512_binop_rm_vl_w<opc_w, OpcodeStr#"w", OpNode, sched, prd,
IsCommutable>;
- defm B : avx512_binop_rm_vl_b<opc_b, OpcodeStr#"b", OpNode, itins, prd,
+ defm B : avx512_binop_rm_vl_b<opc_b, OpcodeStr#"b", OpNode, sched, prd,
IsCommutable>;
}
multiclass avx512_binop_rm_vl_all<bits<8> opc_b, bits<8> opc_w,
bits<8> opc_d, bits<8> opc_q,
string OpcodeStr, SDNode OpNode,
- OpndItins itins, bit IsCommutable = 0> {
+ X86SchedWriteWidths sched,
+ bit IsCommutable = 0> {
defm NAME : avx512_binop_rm_vl_dq<opc_d, opc_q, OpcodeStr, OpNode,
- itins, HasAVX512, IsCommutable>,
+ sched, HasAVX512, IsCommutable>,
avx512_binop_rm_vl_bw<opc_b, opc_w, OpcodeStr, OpNode,
- itins, HasBWI, IsCommutable>;
+ sched, HasBWI, IsCommutable>;
}
-multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins,
+multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
SDNode OpNode,X86VectorVTInfo _Src,
X86VectorVTInfo _Dst, X86VectorVTInfo _Brdct,
bit IsCommutable = 0> {
@@ -4385,15 +4767,15 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins,
(_Dst.VT (OpNode
(_Src.VT _Src.RC:$src1),
(_Src.VT _Src.RC:$src2))),
- itins.rr, IsCommutable>,
- AVX512BIBase, EVEX_4V, Sched<[itins.Sched]>;
+ IsCommutable>,
+ AVX512BIBase, EVEX_4V, Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
- (bitconvert (_Src.LdFrag addr:$src2)))),
- itins.rm>, AVX512BIBase, EVEX_4V,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (_Src.LdFrag addr:$src2))))>,
+ AVX512BIBase, EVEX_4V,
+ Sched<[sched.Folded, ReadAfterLd]>;
defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Brdct.ScalarMemOp:$src2),
@@ -4402,71 +4784,72 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins,
"$src1, ${src2}"##_Brdct.BroadcastStr,
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
(_Brdct.VT (X86VBroadcast
- (_Brdct.ScalarLdFrag addr:$src2)))))),
- itins.rm>, AVX512BIBase, EVEX_4V, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_Brdct.ScalarLdFrag addr:$src2))))))>,
+ AVX512BIBase, EVEX_4V, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
defm VPADD : avx512_binop_rm_vl_all<0xFC, 0xFD, 0xFE, 0xD4, "vpadd", add,
- SSE_INTALU_ITINS_P, 1>;
+ SchedWriteVecALU, 1>;
defm VPSUB : avx512_binop_rm_vl_all<0xF8, 0xF9, 0xFA, 0xFB, "vpsub", sub,
- SSE_INTALU_ITINS_P, 0>;
+ SchedWriteVecALU, 0>;
defm VPADDS : avx512_binop_rm_vl_bw<0xEC, 0xED, "vpadds", X86adds,
- SSE_INTALU_ITINS_P, HasBWI, 1>;
+ SchedWriteVecALU, HasBWI, 1>;
defm VPSUBS : avx512_binop_rm_vl_bw<0xE8, 0xE9, "vpsubs", X86subs,
- SSE_INTALU_ITINS_P, HasBWI, 0>;
+ SchedWriteVecALU, HasBWI, 0>;
defm VPADDUS : avx512_binop_rm_vl_bw<0xDC, 0xDD, "vpaddus", X86addus,
- SSE_INTALU_ITINS_P, HasBWI, 1>;
+ SchedWriteVecALU, HasBWI, 1>;
defm VPSUBUS : avx512_binop_rm_vl_bw<0xD8, 0xD9, "vpsubus", X86subus,
- SSE_INTALU_ITINS_P, HasBWI, 0>;
+ SchedWriteVecALU, HasBWI, 0>;
defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmulld", mul,
- SSE_INTMUL_ITINS_P, HasAVX512, 1>, T8PD;
+ SchedWritePMULLD, HasAVX512, 1>, T8PD;
defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmullw", mul,
- SSE_INTMUL_ITINS_P, HasBWI, 1>;
+ SchedWriteVecIMul, HasBWI, 1>;
defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmullq", mul,
- SSE_INTMUL_ITINS_P, HasDQI, 1>, T8PD;
-defm VPMULHW : avx512_binop_rm_vl_w<0xE5, "vpmulhw", mulhs, SSE_INTMUL_ITINS_P,
+ SchedWriteVecIMul, HasDQI, 1>, T8PD,
+ NotEVEX2VEXConvertible;
+defm VPMULHW : avx512_binop_rm_vl_w<0xE5, "vpmulhw", mulhs, SchedWriteVecIMul,
HasBWI, 1>;
-defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhuw", mulhu, SSE_INTMUL_ITINS_P,
+defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhuw", mulhu, SchedWriteVecIMul,
HasBWI, 1>;
-defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrsw", X86mulhrs, SSE_INTMUL_ITINS_P,
- HasBWI, 1>, T8PD;
+defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrsw", X86mulhrs,
+ SchedWriteVecIMul, HasBWI, 1>, T8PD;
defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg,
- SSE_INTALU_ITINS_P, HasBWI, 1>;
-
-multiclass avx512_binop_all<bits<8> opc, string OpcodeStr, OpndItins itins,
- AVX512VLVectorVTInfo _SrcVTInfo, AVX512VLVectorVTInfo _DstVTInfo,
+ SchedWriteVecALU, HasBWI, 1>;
+defm VPMULDQ : avx512_binop_rm_vl_q<0x28, "vpmuldq", X86pmuldq,
+ SchedWriteVecIMul, HasAVX512, 1>, T8PD;
+defm VPMULUDQ : avx512_binop_rm_vl_q<0xF4, "vpmuludq", X86pmuludq,
+ SchedWriteVecIMul, HasAVX512, 1>;
+
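(Editorial aside, not part of the patch.) The defms above instantiate the plain integer arithmetic forms reachable from the usual intrinsics; a small usage sketch assuming <immintrin.h>, with illustrative function names:

#include <immintrin.h>

__m512i add32(__m512i a, __m512i b)     { return _mm512_add_epi32(a, b); }   // vpaddd
__m512i mul32(__m512i a, __m512i b)     { return _mm512_mullo_epi32(a, b); } // vpmulld
__m512i mul_u32x2(__m512i a, __m512i b) { return _mm512_mul_epu32(a, b); }   // vpmuludq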
+multiclass avx512_binop_all<bits<8> opc, string OpcodeStr,
+ X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo _SrcVTInfo,
+ AVX512VLVectorVTInfo _DstVTInfo,
SDNode OpNode, Predicate prd, bit IsCommutable = 0> {
let Predicates = [prd] in
- defm NAME#Z : avx512_binop_rm2<opc, OpcodeStr, itins, OpNode,
+ defm NAME#Z : avx512_binop_rm2<opc, OpcodeStr, sched.ZMM, OpNode,
_SrcVTInfo.info512, _DstVTInfo.info512,
v8i64_info, IsCommutable>,
EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W;
let Predicates = [HasVLX, prd] in {
- defm NAME#Z256 : avx512_binop_rm2<opc, OpcodeStr, itins, OpNode,
+ defm NAME#Z256 : avx512_binop_rm2<opc, OpcodeStr, sched.YMM, OpNode,
_SrcVTInfo.info256, _DstVTInfo.info256,
v4i64x_info, IsCommutable>,
EVEX_V256, EVEX_CD8<64, CD8VF>, VEX_W;
- defm NAME#Z128 : avx512_binop_rm2<opc, OpcodeStr, itins, OpNode,
+ defm NAME#Z128 : avx512_binop_rm2<opc, OpcodeStr, sched.XMM, OpNode,
_SrcVTInfo.info128, _DstVTInfo.info128,
v2i64x_info, IsCommutable>,
EVEX_V128, EVEX_CD8<64, CD8VF>, VEX_W;
}
}
-defm VPMULDQ : avx512_binop_all<0x28, "vpmuldq", SSE_INTMUL_ITINS_P,
- avx512vl_i32_info, avx512vl_i64_info,
- X86pmuldq, HasAVX512, 1>,T8PD;
-defm VPMULUDQ : avx512_binop_all<0xF4, "vpmuludq", SSE_INTMUL_ITINS_P,
- avx512vl_i32_info, avx512vl_i64_info,
- X86pmuludq, HasAVX512, 1>;
-defm VPMULTISHIFTQB : avx512_binop_all<0x83, "vpmultishiftqb", SSE_INTALU_ITINS_P,
+defm VPMULTISHIFTQB : avx512_binop_all<0x83, "vpmultishiftqb", SchedWriteVecALU,
avx512vl_i8_info, avx512vl_i8_info,
X86multishift, HasVBMI, 0>, T8PD;
multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _Src, X86VectorVTInfo _Dst,
- OpndItins itins> {
+ X86FoldableSchedWrite sched> {
defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.ScalarMemOp:$src2),
OpcodeStr,
@@ -4474,14 +4857,14 @@ multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
"$src1, ${src2}"##_Src.BroadcastStr,
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
(_Src.VT (X86VBroadcast
- (_Src.ScalarLdFrag addr:$src2)))))),
- itins.rm>, EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_Src.ScalarLdFrag addr:$src2))))))>,
+ EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr,
SDNode OpNode,X86VectorVTInfo _Src,
- X86VectorVTInfo _Dst, OpndItins itins,
+ X86VectorVTInfo _Dst, X86FoldableSchedWrite sched,
bit IsCommutable = 0> {
defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr,
@@ -4489,45 +4872,49 @@ multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr,
(_Dst.VT (OpNode
(_Src.VT _Src.RC:$src1),
(_Src.VT _Src.RC:$src2))),
- itins.rr, IsCommutable>,
- EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V, Sched<[itins.Sched]>;
+ IsCommutable>,
+ EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V, Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
- (bitconvert (_Src.LdFrag addr:$src2)))), itins.rm>,
+ (bitconvert (_Src.LdFrag addr:$src2))))>,
EVEX_4V, EVEX_CD8<_Src.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass avx512_packs_all_i32_i16<bits<8> opc, string OpcodeStr,
SDNode OpNode> {
let Predicates = [HasBWI] in
defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, v16i32_info,
- v32i16_info, SSE_PACK>,
+ v32i16_info, SchedWriteShuffle.ZMM>,
avx512_packs_rmb<opc, OpcodeStr, OpNode, v16i32_info,
- v32i16_info, SSE_PACK>, EVEX_V512;
+ v32i16_info, SchedWriteShuffle.ZMM>, EVEX_V512;
let Predicates = [HasBWI, HasVLX] in {
defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, v8i32x_info,
- v16i16x_info, SSE_PACK>,
+ v16i16x_info, SchedWriteShuffle.YMM>,
avx512_packs_rmb<opc, OpcodeStr, OpNode, v8i32x_info,
- v16i16x_info, SSE_PACK>, EVEX_V256;
+ v16i16x_info, SchedWriteShuffle.YMM>,
+ EVEX_V256;
defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, v4i32x_info,
- v8i16x_info, SSE_PACK>,
+ v8i16x_info, SchedWriteShuffle.XMM>,
avx512_packs_rmb<opc, OpcodeStr, OpNode, v4i32x_info,
- v8i16x_info, SSE_PACK>, EVEX_V128;
+ v8i16x_info, SchedWriteShuffle.XMM>,
+ EVEX_V128;
}
}
multiclass avx512_packs_all_i16_i8<bits<8> opc, string OpcodeStr,
SDNode OpNode> {
let Predicates = [HasBWI] in
- defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, v32i16_info,
- v64i8_info, SSE_PACK>, EVEX_V512, VEX_WIG;
+ defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, v32i16_info, v64i8_info,
+ SchedWriteShuffle.ZMM>, EVEX_V512, VEX_WIG;
let Predicates = [HasBWI, HasVLX] in {
defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, v16i16x_info,
- v32i8x_info, SSE_PACK>, EVEX_V256, VEX_WIG;
+ v32i8x_info, SchedWriteShuffle.YMM>,
+ EVEX_V256, VEX_WIG;
defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, v8i16x_info,
- v16i8x_info, SSE_PACK>, EVEX_V128, VEX_WIG;
+ v16i8x_info, SchedWriteShuffle.XMM>,
+ EVEX_V128, VEX_WIG;
}
}
@@ -4536,12 +4923,15 @@ multiclass avx512_vpmadd<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _Dst, bit IsCommutable = 0> {
let Predicates = [HasBWI] in
defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info512,
- _Dst.info512, SSE_PMADD, IsCommutable>, EVEX_V512;
+ _Dst.info512, SchedWriteVecIMul.ZMM,
+ IsCommutable>, EVEX_V512;
let Predicates = [HasBWI, HasVLX] in {
defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info256,
- _Dst.info256, SSE_PMADD, IsCommutable>, EVEX_V256;
+ _Dst.info256, SchedWriteVecIMul.YMM,
+ IsCommutable>, EVEX_V256;
defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info128,
- _Dst.info128, SSE_PMADD, IsCommutable>, EVEX_V128;
+ _Dst.info128, SchedWriteVecIMul.XMM,
+ IsCommutable>, EVEX_V128;
}
}
@@ -4556,32 +4946,44 @@ defm VPMADDWD : avx512_vpmadd<0xF5, "vpmaddwd", X86vpmaddwd,
avx512vl_i16_info, avx512vl_i32_info, 1>, AVX512BIBase, VEX_WIG;
defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxsb", smax,
- SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
+ SchedWriteVecALU, HasBWI, 1>, T8PD;
defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxsw", smax,
- SSE_INTALU_ITINS_P, HasBWI, 1>;
-defm VPMAXS : avx512_binop_rm_vl_dq<0x3D, 0x3D, "vpmaxs", smax,
- SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
+ SchedWriteVecALU, HasBWI, 1>;
+defm VPMAXSD : avx512_binop_rm_vl_d<0x3D, "vpmaxsd", smax,
+ SchedWriteVecALU, HasAVX512, 1>, T8PD;
+defm VPMAXSQ : avx512_binop_rm_vl_q<0x3D, "vpmaxsq", smax,
+ SchedWriteVecALU, HasAVX512, 1>, T8PD,
+ NotEVEX2VEXConvertible;
defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxub", umax,
- SSE_INTALU_ITINS_P, HasBWI, 1>;
+ SchedWriteVecALU, HasBWI, 1>;
defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxuw", umax,
- SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
-defm VPMAXU : avx512_binop_rm_vl_dq<0x3F, 0x3F, "vpmaxu", umax,
- SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
+ SchedWriteVecALU, HasBWI, 1>, T8PD;
+defm VPMAXUD : avx512_binop_rm_vl_d<0x3F, "vpmaxud", umax,
+ SchedWriteVecALU, HasAVX512, 1>, T8PD;
+defm VPMAXUQ : avx512_binop_rm_vl_q<0x3F, "vpmaxuq", umax,
+ SchedWriteVecALU, HasAVX512, 1>, T8PD,
+ NotEVEX2VEXConvertible;
defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpminsb", smin,
- SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
+ SchedWriteVecALU, HasBWI, 1>, T8PD;
defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpminsw", smin,
- SSE_INTALU_ITINS_P, HasBWI, 1>;
-defm VPMINS : avx512_binop_rm_vl_dq<0x39, 0x39, "vpmins", smin,
- SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
+ SchedWriteVecALU, HasBWI, 1>;
+defm VPMINSD : avx512_binop_rm_vl_d<0x39, "vpminsd", smin,
+ SchedWriteVecALU, HasAVX512, 1>, T8PD;
+defm VPMINSQ : avx512_binop_rm_vl_q<0x39, "vpminsq", smin,
+ SchedWriteVecALU, HasAVX512, 1>, T8PD,
+ NotEVEX2VEXConvertible;
defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminub", umin,
- SSE_INTALU_ITINS_P, HasBWI, 1>;
+ SchedWriteVecALU, HasBWI, 1>;
defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminuw", umin,
- SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
-defm VPMINU : avx512_binop_rm_vl_dq<0x3B, 0x3B, "vpminu", umin,
- SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
+ SchedWriteVecALU, HasBWI, 1>, T8PD;
+defm VPMINUD : avx512_binop_rm_vl_d<0x3B, "vpminud", umin,
+ SchedWriteVecALU, HasAVX512, 1>, T8PD;
+defm VPMINUQ : avx512_binop_rm_vl_q<0x3B, "vpminuq", umin,
+ SchedWriteVecALU, HasAVX512, 1>, T8PD,
+ NotEVEX2VEXConvertible;
// PMULLQ: Use the 512-bit version to implement the 128/256-bit forms in the NoVLX case.
let Predicates = [HasDQI, NoVLX] in {
@@ -4633,7 +5035,7 @@ multiclass avx512_min_max_lowering<Instruction Instr, SDNode OpNode> {
sub_xmm)>;
}
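// The avx512_min_max_lowering patterns above handle the 128-bit q-element
// min/max when only the 512-bit instruction is legal: the xmm operands are
// widened into a zmm register (the INSERT_SUBREG-of-IMPLICIT_DEF idiom also
// used by avx512_vptest_lowering further down), the 512-bit instruction runs,
// and the low 128 bits are extracted again via sub_xmm. Narrowing the
// predicate to [HasAVX512, NoVLX] below keeps these patterns from competing
// with the native 128/256-bit EVEX forms when VLX is available.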
-let Predicates = [HasAVX512] in {
+let Predicates = [HasAVX512, NoVLX] in {
defm : avx512_min_max_lowering<VPMAXUQZrr, umax>;
defm : avx512_min_max_lowering<VPMINUQZrr, umin>;
defm : avx512_min_max_lowering<VPMAXSQZrr, smax>;
@@ -4648,8 +5050,8 @@ let Predicates = [HasAVX512] in {
// be set to null_frag for 32-bit elements.
multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode,
- SDNode OpNodeMsk, OpndItins itins, X86VectorVTInfo _,
- bit IsCommutable = 0> {
+ SDNode OpNodeMsk, X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, bit IsCommutable = 0> {
let hasSideEffects = 0 in
defm rr : AVX512_maskable_logic<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
@@ -4658,8 +5060,8 @@ multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr,
(bitconvert (_.VT _.RC:$src2)))),
(_.VT (bitconvert (_.i64VT (OpNodeMsk _.RC:$src1,
_.RC:$src2)))),
- itins.rr, IsCommutable>, AVX512BIBase, EVEX_4V,
- Sched<[itins.Sched]>;
+ IsCommutable>, AVX512BIBase, EVEX_4V,
+ Sched<[sched]>;
let hasSideEffects = 0, mayLoad = 1 in
defm rm : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -4668,18 +5070,18 @@ multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr,
(_.i64VT (OpNode (bitconvert (_.VT _.RC:$src1)),
(bitconvert (_.LdFrag addr:$src2)))),
(_.VT (bitconvert (_.i64VT (OpNodeMsk _.RC:$src1,
- (bitconvert (_.LdFrag addr:$src2)))))),
- itins.rm>, AVX512BIBase, EVEX_4V,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (_.LdFrag addr:$src2))))))>,
+ AVX512BIBase, EVEX_4V,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
// OpNodeMsk is the OpNode to use where element size is important. So use it
// for all of the broadcast patterns.
multiclass avx512_logic_rmb<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode,
- SDNode OpNodeMsk, OpndItins itins, X86VectorVTInfo _,
+ SDNode OpNodeMsk, X86FoldableSchedWrite sched, X86VectorVTInfo _,
bit IsCommutable = 0> :
- avx512_logic_rm<opc, OpcodeStr, OpNode, OpNodeMsk, itins, _,
+ avx512_logic_rm<opc, OpcodeStr, OpNode, OpNodeMsk, sched, _,
IsCommutable> {
defm rmb : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
@@ -4692,327 +5094,350 @@ multiclass avx512_logic_rmb<bits<8> opc, string OpcodeStr,
(_.VT (bitconvert (_.i64VT (OpNodeMsk _.RC:$src1,
(bitconvert
(_.VT (X86VBroadcast
- (_.ScalarLdFrag addr:$src2)))))))),
- itins.rm>, AVX512BIBase, EVEX_4V, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.ScalarLdFrag addr:$src2))))))))>,
+ AVX512BIBase, EVEX_4V, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass avx512_logic_rmb_vl<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode,
- SDNode OpNodeMsk, OpndItins itins,
+ SDNode OpNodeMsk, X86SchedWriteWidths sched,
AVX512VLVectorVTInfo VTInfo,
bit IsCommutable = 0> {
let Predicates = [HasAVX512] in
- defm Z : avx512_logic_rmb<opc, OpcodeStr, OpNode, OpNodeMsk, itins,
+ defm Z : avx512_logic_rmb<opc, OpcodeStr, OpNode, OpNodeMsk, sched.ZMM,
VTInfo.info512, IsCommutable>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
- defm Z256 : avx512_logic_rmb<opc, OpcodeStr, OpNode, OpNodeMsk, itins,
+ defm Z256 : avx512_logic_rmb<opc, OpcodeStr, OpNode, OpNodeMsk, sched.YMM,
VTInfo.info256, IsCommutable>, EVEX_V256;
- defm Z128 : avx512_logic_rmb<opc, OpcodeStr, OpNode, OpNodeMsk, itins,
+ defm Z128 : avx512_logic_rmb<opc, OpcodeStr, OpNode, OpNodeMsk, sched.XMM,
VTInfo.info128, IsCommutable>, EVEX_V128;
}
}
multiclass avx512_logic_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
- SDNode OpNode, OpndItins itins,
+ SDNode OpNode, X86SchedWriteWidths sched,
bit IsCommutable = 0> {
- defm Q : avx512_logic_rmb_vl<opc_q, OpcodeStr#"q", OpNode, OpNode, itins,
+ defm Q : avx512_logic_rmb_vl<opc_q, OpcodeStr#"q", OpNode, OpNode, sched,
avx512vl_i64_info, IsCommutable>,
VEX_W, EVEX_CD8<64, CD8VF>;
- defm D : avx512_logic_rmb_vl<opc_d, OpcodeStr#"d", null_frag, OpNode, itins,
+ defm D : avx512_logic_rmb_vl<opc_d, OpcodeStr#"d", null_frag, OpNode, sched,
avx512vl_i32_info, IsCommutable>,
EVEX_CD8<32, CD8VF>;
}
-defm VPAND : avx512_logic_rm_vl_dq<0xDB, 0xDB, "vpand", and, SSE_BIT_ITINS_P, 1>;
-defm VPOR : avx512_logic_rm_vl_dq<0xEB, 0xEB, "vpor", or, SSE_BIT_ITINS_P, 1>;
-defm VPXOR : avx512_logic_rm_vl_dq<0xEF, 0xEF, "vpxor", xor, SSE_BIT_ITINS_P, 1>;
-defm VPANDN : avx512_logic_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp, SSE_BIT_ITINS_P>;
+defm VPAND : avx512_logic_rm_vl_dq<0xDB, 0xDB, "vpand", and,
+ SchedWriteVecLogic, 1>;
+defm VPOR : avx512_logic_rm_vl_dq<0xEB, 0xEB, "vpor", or,
+ SchedWriteVecLogic, 1>;
+defm VPXOR : avx512_logic_rm_vl_dq<0xEF, 0xEF, "vpxor", xor,
+ SchedWriteVecLogic, 1>;
+defm VPANDN : avx512_logic_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp,
+ SchedWriteVecLogic>;
//===----------------------------------------------------------------------===//
// AVX-512 FP arithmetic
//===----------------------------------------------------------------------===//
+
multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
- SDNode OpNode, SDNode VecNode, OpndItins itins,
- bit IsCommutable> {
+ SDNode OpNode, SDNode VecNode,
+ X86FoldableSchedWrite sched, bit IsCommutable> {
let ExeDomain = _.ExeDomain in {
defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (VecNode _.RC:$src1, _.RC:$src2,
- (i32 FROUND_CURRENT))),
- itins.rr>, Sched<[itins.Sched]>;
+ (i32 FROUND_CURRENT)))>,
+ Sched<[sched]>;
defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (VecNode _.RC:$src1,
_.ScalarIntMemCPat:$src2,
- (i32 FROUND_CURRENT))),
- itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 FROUND_CURRENT)))>,
+ Sched<[sched.Folded, ReadAfterLd]>;
let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))],
- itins.rr>, Sched<[itins.Sched]> {
+ [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))]>,
+ Sched<[sched]> {
let isCommutable = IsCommutable;
}
def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.ScalarMemOp:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1,
- (_.ScalarLdFrag addr:$src2)))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.ScalarLdFrag addr:$src2)))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
}
multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
- SDNode VecNode, OpndItins itins, bit IsCommutable = 0> {
+ SDNode VecNode, X86FoldableSchedWrite sched,
+ bit IsCommutable = 0> {
let ExeDomain = _.ExeDomain in
defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
"$rc, $src2, $src1", "$src1, $src2, $rc",
(VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 imm:$rc)), itins.rr, IsCommutable>,
- EVEX_B, EVEX_RC, Sched<[itins.Sched]>;
+ (i32 imm:$rc)), IsCommutable>,
+ EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
SDNode OpNode, SDNode VecNode, SDNode SaeNode,
- OpndItins itins, bit IsCommutable> {
+ X86FoldableSchedWrite sched, bit IsCommutable> {
let ExeDomain = _.ExeDomain in {
defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (_.VT (VecNode _.RC:$src1, _.RC:$src2)),
- itins.rr>, Sched<[itins.Sched]>;
+ (_.VT (VecNode _.RC:$src1, _.RC:$src2))>,
+ Sched<[sched]>;
defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (VecNode _.RC:$src1,
- _.ScalarIntMemCPat:$src2)),
- itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ _.ScalarIntMemCPat:$src2))>,
+ Sched<[sched.Folded, ReadAfterLd]>;
let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))],
- itins.rr>, Sched<[itins.Sched]> {
+ [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))]>,
+ Sched<[sched]> {
let isCommutable = IsCommutable;
}
def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.ScalarMemOp:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1,
- (_.ScalarLdFrag addr:$src2)))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.ScalarLdFrag addr:$src2)))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
(SaeNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 FROUND_NO_EXC)), itins.rr>, EVEX_B,
- Sched<[itins.Sched]>;
+ (i32 FROUND_NO_EXC))>, EVEX_B,
+ Sched<[sched]>;
}
}
multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode VecNode,
- SizeItins itins, bit IsCommutable> {
+ SDNode VecNode, X86SchedWriteSizes sched,
+ bit IsCommutable> {
defm SSZ : avx512_fp_scalar<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode,
- itins.s, IsCommutable>,
+ sched.PS.Scl, IsCommutable>,
avx512_fp_scalar_round<opc, OpcodeStr#"ss", f32x_info, VecNode,
- itins.s, IsCommutable>,
+ sched.PS.Scl, IsCommutable>,
XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode,
- itins.d, IsCommutable>,
+ sched.PD.Scl, IsCommutable>,
avx512_fp_scalar_round<opc, OpcodeStr#"sd", f64x_info, VecNode,
- itins.d, IsCommutable>,
+ sched.PD.Scl, IsCommutable>,
XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
}
multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode VecNode, SDNode SaeNode,
- SizeItins itins, bit IsCommutable> {
+ SDNode VecNode, SDNode SaeNode,
+ X86SchedWriteSizes sched, bit IsCommutable> {
defm SSZ : avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, OpNode,
- VecNode, SaeNode, itins.s, IsCommutable>,
+ VecNode, SaeNode, sched.PS.Scl, IsCommutable>,
XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm SDZ : avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, OpNode,
- VecNode, SaeNode, itins.d, IsCommutable>,
+ VecNode, SaeNode, sched.PD.Scl, IsCommutable>,
XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
}
-defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86faddRnds, SSE_ALU_ITINS_S, 1>;
-defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmulRnds, SSE_MUL_ITINS_S, 1>;
-defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubRnds, SSE_ALU_ITINS_S, 0>;
-defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivRnds, SSE_DIV_ITINS_S, 0>;
-defm VMIN : avx512_binop_s_sae <0x5D, "vmin", X86fmin, X86fmins, X86fminRnds,
- SSE_ALU_ITINS_S, 0>;
-defm VMAX : avx512_binop_s_sae <0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxRnds,
- SSE_ALU_ITINS_S, 0>;
+defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86faddRnds,
+ SchedWriteFAddSizes, 1>;
+defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmulRnds,
+ SchedWriteFMulSizes, 1>;
+defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubRnds,
+ SchedWriteFAddSizes, 0>;
+defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivRnds,
+ SchedWriteFDivSizes, 0>;
+defm VMIN : avx512_binop_s_sae<0x5D, "vmin", X86fmin, X86fmins, X86fminRnds,
+ SchedWriteFCmpSizes, 0>;
+defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxRnds,
+ SchedWriteFCmpSizes, 0>;
// MIN/MAX nodes are commutable under "unsafe-fp-math". In this case we use
// X86fminc and X86fmaxc instead of X86fmin and X86fmax
multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr,
- X86VectorVTInfo _, SDNode OpNode, OpndItins itins> {
+ X86VectorVTInfo _, SDNode OpNode,
+ X86FoldableSchedWrite sched> {
let isCodeGenOnly = 1, Predicates = [HasAVX512], ExeDomain = _.ExeDomain in {
def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))],
- itins.rr>, Sched<[itins.Sched]> {
+ [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))]>,
+ Sched<[sched]> {
let isCommutable = 1;
}
def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.ScalarMemOp:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1,
- (_.ScalarLdFrag addr:$src2)))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.ScalarLdFrag addr:$src2)))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
defm VMINCSSZ : avx512_comutable_binop_s<0x5D, "vminss", f32x_info, X86fminc,
- SSE_ALU_ITINS_S.s>, XS, EVEX_4V, VEX_LIG,
- EVEX_CD8<32, CD8VT1>;
+ SchedWriteFCmp.Scl>, XS, EVEX_4V,
+ VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm VMINCSDZ : avx512_comutable_binop_s<0x5D, "vminsd", f64x_info, X86fminc,
- SSE_ALU_ITINS_S.d>, XD, VEX_W, EVEX_4V, VEX_LIG,
- EVEX_CD8<64, CD8VT1>;
+ SchedWriteFCmp.Scl>, XD, VEX_W, EVEX_4V,
+ VEX_LIG, EVEX_CD8<64, CD8VT1>;
defm VMAXCSSZ : avx512_comutable_binop_s<0x5F, "vmaxss", f32x_info, X86fmaxc,
- SSE_ALU_ITINS_S.s>, XS, EVEX_4V, VEX_LIG,
- EVEX_CD8<32, CD8VT1>;
+ SchedWriteFCmp.Scl>, XS, EVEX_4V,
+ VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc,
- SSE_ALU_ITINS_S.d>, XD, VEX_W, EVEX_4V, VEX_LIG,
- EVEX_CD8<64, CD8VT1>;
+ SchedWriteFCmp.Scl>, XD, VEX_W, EVEX_4V,
+ VEX_LIG, EVEX_CD8<64, CD8VT1>;
multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
- X86VectorVTInfo _, OpndItins itins,
- bit IsCommutable> {
+ X86VectorVTInfo _, X86FoldableSchedWrite sched,
+ bit IsCommutable,
+ bit IsKZCommutable = IsCommutable> {
let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2)), itins.rr,
- IsCommutable>, EVEX_4V, Sched<[itins.Sched]>;
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2)), IsCommutable, 0,
+ IsKZCommutable>,
+ EVEX_4V, Sched<[sched]>;
let mayLoad = 1 in {
defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
- (OpNode _.RC:$src1, (_.LdFrag addr:$src2)), itins.rm>,
- EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (OpNode _.RC:$src1, (_.LdFrag addr:$src2))>,
+ EVEX_4V, Sched<[sched.Folded, ReadAfterLd]>;
defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
(OpNode _.RC:$src1, (_.VT (X86VBroadcast
- (_.ScalarLdFrag addr:$src2)))),
- itins.rm>, EVEX_4V, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.ScalarLdFrag addr:$src2))))>,
+ EVEX_4V, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
}
-multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNodeRnd,
- OpndItins itins, X86VectorVTInfo _> {
+multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNodeRnd,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in
defm rrb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr##_.Suffix,
"$rc, $src2, $src1", "$src1, $src2, $rc",
- (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 imm:$rc))), itins.rr>,
- EVEX_4V, EVEX_B, EVEX_RC, Sched<[itins.Sched]>;
+ (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 imm:$rc)))>,
+ EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>;
}
-multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNodeRnd,
- OpndItins itins, X86VectorVTInfo _> {
+multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNodeRnd,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in
defm rrb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
- (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 FROUND_NO_EXC))), itins.rr>,
- EVEX_4V, EVEX_B, Sched<[itins.Sched]>;
+ (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 FROUND_NO_EXC)))>,
+ EVEX_4V, EVEX_B, Sched<[sched]>;
}
multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
- Predicate prd, SizeItins itins,
- bit IsCommutable = 0> {
+ Predicate prd, X86SchedWriteSizes sched,
+ bit IsCommutable = 0,
+ bit IsPD128Commutable = IsCommutable> {
let Predicates = [prd] in {
defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v16f32_info,
- itins.s, IsCommutable>, EVEX_V512, PS,
+ sched.PS.ZMM, IsCommutable>, EVEX_V512, PS,
EVEX_CD8<32, CD8VF>;
defm PDZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f64_info,
- itins.d, IsCommutable>, EVEX_V512, PD, VEX_W,
+ sched.PD.ZMM, IsCommutable>, EVEX_V512, PD, VEX_W,
EVEX_CD8<64, CD8VF>;
}
// Define only if AVX512VL feature is present.
let Predicates = [prd, HasVLX] in {
defm PSZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f32x_info,
- itins.s, IsCommutable>, EVEX_V128, PS,
+ sched.PS.XMM, IsCommutable>, EVEX_V128, PS,
EVEX_CD8<32, CD8VF>;
defm PSZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f32x_info,
- itins.s, IsCommutable>, EVEX_V256, PS,
+ sched.PS.YMM, IsCommutable>, EVEX_V256, PS,
EVEX_CD8<32, CD8VF>;
defm PDZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v2f64x_info,
- itins.d, IsCommutable>, EVEX_V128, PD, VEX_W,
+ sched.PD.XMM, IsPD128Commutable,
+ IsCommutable>, EVEX_V128, PD, VEX_W,
EVEX_CD8<64, CD8VF>;
defm PDZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f64x_info,
- itins.d, IsCommutable>, EVEX_V256, PD, VEX_W,
+ sched.PD.YMM, IsCommutable>, EVEX_V256, PD, VEX_W,
EVEX_CD8<64, CD8VF>;
}
}
multiclass avx512_fp_binop_p_round<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd,
- SizeItins itins> {
- defm PSZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, itins.s, v16f32_info>,
- EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
- defm PDZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, itins.d, v8f64_info>,
- EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>;
+ X86SchedWriteSizes sched> {
+ defm PSZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, sched.PS.ZMM,
+ v16f32_info>,
+ EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, sched.PD.ZMM,
+ v8f64_info>,
+ EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>;
}
multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd,
- SizeItins itins> {
- defm PSZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, itins.s, v16f32_info>,
- EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
- defm PDZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, itins.d, v8f64_info>,
- EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>;
+ X86SchedWriteSizes sched> {
+ defm PSZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, sched.PS.ZMM,
+ v16f32_info>,
+ EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, sched.PD.ZMM,
+ v8f64_info>,
+ EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>;
}
defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, HasAVX512,
- SSE_ALU_ITINS_P, 1>,
- avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd, SSE_ALU_ITINS_P>;
+ SchedWriteFAddSizes, 1>,
+ avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd, SchedWriteFAddSizes>;
defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, HasAVX512,
- SSE_MUL_ITINS_P, 1>,
- avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd, SSE_MUL_ITINS_P>;
-defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub, HasAVX512, SSE_ALU_ITINS_P>,
- avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd, SSE_ALU_ITINS_P>;
-defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv, HasAVX512, SSE_DIV_ITINS_P>,
- avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd, SSE_DIV_ITINS_P>;
+ SchedWriteFMulSizes, 1>,
+ avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd, SchedWriteFMulSizes>;
+defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub, HasAVX512,
+ SchedWriteFAddSizes>,
+ avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd, SchedWriteFAddSizes>;
+defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv, HasAVX512,
+ SchedWriteFDivSizes>,
+ avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd, SchedWriteFDivSizes>;
defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, HasAVX512,
- SSE_ALU_ITINS_P, 0>,
- avx512_fp_binop_p_sae<0x5D, "vmin", X86fminRnd, SSE_ALU_ITINS_P>;
+ SchedWriteFCmpSizes, 0>,
+ avx512_fp_binop_p_sae<0x5D, "vmin", X86fminRnd, SchedWriteFCmpSizes>;
defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, HasAVX512,
- SSE_ALU_ITINS_P, 0>,
- avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxRnd, SSE_ALU_ITINS_P>;
+ SchedWriteFCmpSizes, 0>,
+ avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxRnd, SchedWriteFCmpSizes>;
let isCodeGenOnly = 1 in {
defm VMINC : avx512_fp_binop_p<0x5D, "vmin", X86fminc, HasAVX512,
- SSE_ALU_ITINS_P, 1>;
+ SchedWriteFCmpSizes, 1>;
defm VMAXC : avx512_fp_binop_p<0x5F, "vmax", X86fmaxc, HasAVX512,
- SSE_ALU_ITINS_P, 1>;
+ SchedWriteFCmpSizes, 1>;
}
defm VAND : avx512_fp_binop_p<0x54, "vand", null_frag, HasDQI,
- SSE_ALU_ITINS_P, 1>;
+ SchedWriteFLogicSizes, 1>;
defm VANDN : avx512_fp_binop_p<0x55, "vandn", null_frag, HasDQI,
- SSE_ALU_ITINS_P, 0>;
+ SchedWriteFLogicSizes, 0>;
defm VOR : avx512_fp_binop_p<0x56, "vor", null_frag, HasDQI,
- SSE_ALU_ITINS_P, 1>;
+ SchedWriteFLogicSizes, 1>;
defm VXOR : avx512_fp_binop_p<0x57, "vxor", null_frag, HasDQI,
- SSE_ALU_ITINS_P, 1>;
+ SchedWriteFLogicSizes, 1>;
// Patterns catch floating point selects with bitcasted integer logic ops.
multiclass avx512_fp_logical_lowering<string InstrStr, SDNode OpNode,
@@ -5084,370 +5509,444 @@ defm : avx512_fp_logical_lowering_sizes<"VPANDN", X86andnp>;
let Predicates = [HasVLX,HasDQI] in {
// Use packed logical operations for scalar ops.
def : Pat<(f64 (X86fand FR64X:$src1, FR64X:$src2)),
- (COPY_TO_REGCLASS (VANDPDZ128rr
- (COPY_TO_REGCLASS FR64X:$src1, VR128X),
- (COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>;
+ (COPY_TO_REGCLASS
+ (v2f64 (VANDPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)),
+ (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))),
+ FR64X)>;
def : Pat<(f64 (X86for FR64X:$src1, FR64X:$src2)),
- (COPY_TO_REGCLASS (VORPDZ128rr
- (COPY_TO_REGCLASS FR64X:$src1, VR128X),
- (COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>;
+ (COPY_TO_REGCLASS
+ (v2f64 (VORPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)),
+ (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))),
+ FR64X)>;
def : Pat<(f64 (X86fxor FR64X:$src1, FR64X:$src2)),
- (COPY_TO_REGCLASS (VXORPDZ128rr
- (COPY_TO_REGCLASS FR64X:$src1, VR128X),
- (COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>;
+ (COPY_TO_REGCLASS
+ (v2f64 (VXORPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)),
+ (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))),
+ FR64X)>;
def : Pat<(f64 (X86fandn FR64X:$src1, FR64X:$src2)),
- (COPY_TO_REGCLASS (VANDNPDZ128rr
- (COPY_TO_REGCLASS FR64X:$src1, VR128X),
- (COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>;
+ (COPY_TO_REGCLASS
+ (v2f64 (VANDNPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)),
+ (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))),
+ FR64X)>;
def : Pat<(f32 (X86fand FR32X:$src1, FR32X:$src2)),
- (COPY_TO_REGCLASS (VANDPSZ128rr
- (COPY_TO_REGCLASS FR32X:$src1, VR128X),
- (COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>;
+ (COPY_TO_REGCLASS
+ (v4f32 (VANDPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)),
+ (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))),
+ FR32X)>;
def : Pat<(f32 (X86for FR32X:$src1, FR32X:$src2)),
- (COPY_TO_REGCLASS (VORPSZ128rr
- (COPY_TO_REGCLASS FR32X:$src1, VR128X),
- (COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>;
+ (COPY_TO_REGCLASS
+ (v4f32 (VORPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)),
+ (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))),
+ FR32X)>;
def : Pat<(f32 (X86fxor FR32X:$src1, FR32X:$src2)),
- (COPY_TO_REGCLASS (VXORPSZ128rr
- (COPY_TO_REGCLASS FR32X:$src1, VR128X),
- (COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>;
+ (COPY_TO_REGCLASS
+ (v4f32 (VXORPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)),
+ (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))),
+ FR32X)>;
def : Pat<(f32 (X86fandn FR32X:$src1, FR32X:$src2)),
- (COPY_TO_REGCLASS (VANDNPSZ128rr
- (COPY_TO_REGCLASS FR32X:$src1, VR128X),
- (COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>;
+ (COPY_TO_REGCLASS
+ (v4f32 (VANDNPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)),
+ (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))),
+ FR32X)>;
}
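// The scalar patterns above lower f32/f64 logic nodes by copying the
// FR32X/FR64X operands into VR128X, running the packed 128-bit
// VANDP*/VORP*/VXORP*/VANDNP* instruction, and copying the result back to the
// scalar register class. The explicit v4f32/v2f64 casts added around
// COPY_TO_REGCLASS are presumably there so that every node in the output
// pattern carries an unambiguous value type.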
multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT))),
- itins.rr>, EVEX_4V, Sched<[itins.Sched]>;
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>,
+ EVEX_4V, Sched<[sched]>;
defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
- (OpNode _.RC:$src1, (_.LdFrag addr:$src2), (i32 FROUND_CURRENT)),
- itins.rm>, EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (OpNode _.RC:$src1, (_.LdFrag addr:$src2), (i32 FROUND_CURRENT))>,
+ EVEX_4V, Sched<[sched.Folded, ReadAfterLd]>;
defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
(OpNode _.RC:$src1, (_.VT (X86VBroadcast
(_.ScalarLdFrag addr:$src2))),
- (i32 FROUND_CURRENT)), itins.rm>,
- EVEX_4V, EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 FROUND_CURRENT))>,
+ EVEX_4V, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>;
}
}
multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rr: AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT))), itins.rr>,
- Sched<[itins.Sched]>;
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>,
+ Sched<[sched]>;
defm rm: AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
(OpNode _.RC:$src1, _.ScalarIntMemCPat:$src2,
- (i32 FROUND_CURRENT)), itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 FROUND_CURRENT))>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
-multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr, SDNode OpNode, SDNode OpNodeScal> {
- defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, SSE_ALU_F32P, v16f32_info>,
- avx512_fp_round_packed<opc, OpcodeStr, OpNode, SSE_ALU_F32P, v16f32_info>,
+multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeScal,
+ X86SchedWriteWidths sched> {
+ defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.ZMM, v16f32_info>,
+ avx512_fp_round_packed<opc, OpcodeStr, OpNode, sched.ZMM, v16f32_info>,
EVEX_V512, EVEX_CD8<32, CD8VF>;
- defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, SSE_ALU_F64P, v8f64_info>,
- avx512_fp_round_packed<opc, OpcodeStr, OpNode, SSE_ALU_F64P, v8f64_info>,
+ defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.ZMM, v8f64_info>,
+ avx512_fp_round_packed<opc, OpcodeStr, OpNode, sched.ZMM, v8f64_info>,
EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
- defm SSZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, SSE_ALU_F32S, f32x_info>,
- avx512_fp_scalar_round<opcScaler, OpcodeStr##"ss", f32x_info, OpNodeScal, SSE_ALU_ITINS_S.s>,
- EVEX_4V,EVEX_CD8<32, CD8VT1>;
- defm SDZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, SSE_ALU_F64S, f64x_info>,
- avx512_fp_scalar_round<opcScaler, OpcodeStr##"sd", f64x_info, OpNodeScal, SSE_ALU_ITINS_S.d>,
- EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+ defm SSZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, sched.Scl, f32x_info>,
+ avx512_fp_scalar_round<opcScaler, OpcodeStr##"ss", f32x_info, OpNodeScal, sched.Scl>,
+ EVEX_4V,EVEX_CD8<32, CD8VT1>;
+ defm SDZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, sched.Scl, f64x_info>,
+ avx512_fp_scalar_round<opcScaler, OpcodeStr##"sd", f64x_info, OpNodeScal, sched.Scl>,
+ EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
// Define only if AVX512VL feature is present.
let Predicates = [HasVLX] in {
- defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, SSE_ALU_F32P, v4f32x_info>,
+ defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.XMM, v4f32x_info>,
EVEX_V128, EVEX_CD8<32, CD8VF>;
- defm PSZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, SSE_ALU_F32P, v8f32x_info>,
+ defm PSZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.YMM, v8f32x_info>,
EVEX_V256, EVEX_CD8<32, CD8VF>;
- defm PDZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, SSE_ALU_F64P, v2f64x_info>,
+ defm PDZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.XMM, v2f64x_info>,
EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>;
- defm PDZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, SSE_ALU_F64P, v4f64x_info>,
+ defm PDZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.YMM, v4f64x_info>,
EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>;
}
}
-defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", X86scalef, X86scalefs>, T8PD;
+defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", X86scalef, X86scalefs,
+ SchedWriteFAdd>, T8PD, NotEVEX2VEXConvertible;
//===----------------------------------------------------------------------===//
// AVX-512 VPTESTM instructions
//===----------------------------------------------------------------------===//
-multiclass avx512_vptest<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> {
+multiclass avx512_vptest<bits<8> opc, string OpcodeStr, PatFrag OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _,
+ string Name> {
let ExeDomain = _.ExeDomain in {
let isCommutable = 1 in
defm rr : AVX512_maskable_cmp<opc, MRMSrcReg, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)), itins.rr>,
- EVEX_4V, Sched<[itins.Sched]>;
+ (OpNode (bitconvert (_.i64VT (and _.RC:$src1, _.RC:$src2))),
+ _.ImmAllZerosV)>,
+ EVEX_4V, Sched<[sched]>;
defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (OpNode (_.VT _.RC:$src1),
- (_.VT (bitconvert (_.LdFrag addr:$src2)))), itins.rm>,
+ (OpNode (bitconvert
+ (_.i64VT (and _.RC:$src1,
+ (bitconvert (_.LdFrag addr:$src2))))),
+ _.ImmAllZerosV)>,
EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>;
}
+
+ // Patterns for compare with 0 that just use the same source twice.
+ def : Pat<(_.KVT (OpNode _.RC:$src, _.ImmAllZerosV)),
+ (_.KVT (!cast<Instruction>(Name # _.ZSuffix # "rr")
+ _.RC:$src, _.RC:$src))>;
+
+ def : Pat<(_.KVT (and _.KRC:$mask, (OpNode _.RC:$src, _.ImmAllZerosV))),
+ (_.KVT (!cast<Instruction>(Name # _.ZSuffix # "rrk")
+ _.KRC:$mask, _.RC:$src, _.RC:$src))>;
}
-multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> {
+multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr, PatFrag OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in
defm rmb : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
- (OpNode (_.VT _.RC:$src1), (_.VT (X86VBroadcast
- (_.ScalarLdFrag addr:$src2)))),
- itins.rm>, EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (OpNode (and _.RC:$src1,
+ (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2))),
+ _.ImmAllZerosV)>,
+ EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
// Use the 512-bit version to implement the 128/256-bit forms in the NoVLX case.
-multiclass avx512_vptest_lowering<SDNode OpNode, X86VectorVTInfo ExtendInfo,
- X86VectorVTInfo _, string Suffix> {
- def : Pat<(_.KVT (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))),
- (_.KVT (COPY_TO_REGCLASS
- (!cast<Instruction>(NAME # Suffix # "Zrr")
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src1, _.SubRegIdx),
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src2, _.SubRegIdx)),
- _.KRC))>;
-}
-
-multiclass avx512_vptest_dq_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo _,
- string Suffix> {
+multiclass avx512_vptest_lowering<PatFrag OpNode, X86VectorVTInfo ExtendInfo,
+ X86VectorVTInfo _, string Name> {
+ def : Pat<(_.KVT (OpNode (bitconvert (_.i64VT (and _.RC:$src1, _.RC:$src2))),
+ _.ImmAllZerosV)),
+ (_.KVT (COPY_TO_REGCLASS
+ (!cast<Instruction>(Name # "Zrr")
+ (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+ _.RC:$src1, _.SubRegIdx),
+ (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+ _.RC:$src2, _.SubRegIdx)),
+ _.KRC))>;
+
+ def : Pat<(_.KVT (and _.KRC:$mask,
+ (OpNode (bitconvert (_.i64VT (and _.RC:$src1, _.RC:$src2))),
+ _.ImmAllZerosV))),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(Name # "Zrrk")
+ (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC),
+ (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+ _.RC:$src1, _.SubRegIdx),
+ (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+ _.RC:$src2, _.SubRegIdx)),
+ _.KRC)>;
+
+ def : Pat<(_.KVT (OpNode _.RC:$src, _.ImmAllZerosV)),
+ (_.KVT (COPY_TO_REGCLASS
+ (!cast<Instruction>(Name # "Zrr")
+ (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+ _.RC:$src, _.SubRegIdx),
+ (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+ _.RC:$src, _.SubRegIdx)),
+ _.KRC))>;
+
+ def : Pat<(_.KVT (and _.KRC:$mask, (OpNode _.RC:$src, _.ImmAllZerosV))),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(Name # "Zrrk")
+ (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC),
+ (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+ _.RC:$src, _.SubRegIdx),
+ (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+ _.RC:$src, _.SubRegIdx)),
+ _.KRC)>;
+}
+
+multiclass avx512_vptest_dq_sizes<bits<8> opc, string OpcodeStr, PatFrag OpNode,
+ X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in
- defm Z : avx512_vptest<opc, OpcodeStr, OpNode, itins, _.info512>,
- avx512_vptest_mb<opc, OpcodeStr, OpNode, itins, _.info512>, EVEX_V512;
+ defm Z : avx512_vptest<opc, OpcodeStr, OpNode, sched.ZMM, _.info512, NAME>,
+ avx512_vptest_mb<opc, OpcodeStr, OpNode, sched.ZMM, _.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
- defm Z256 : avx512_vptest<opc, OpcodeStr, OpNode, itins, _.info256>,
- avx512_vptest_mb<opc, OpcodeStr, OpNode,itins, _.info256>, EVEX_V256;
- defm Z128 : avx512_vptest<opc, OpcodeStr, OpNode, itins, _.info128>,
- avx512_vptest_mb<opc, OpcodeStr, OpNode, itins, _.info128>, EVEX_V128;
+ defm Z256 : avx512_vptest<opc, OpcodeStr, OpNode, sched.YMM, _.info256, NAME>,
+ avx512_vptest_mb<opc, OpcodeStr, OpNode, sched.YMM, _.info256>, EVEX_V256;
+ defm Z128 : avx512_vptest<opc, OpcodeStr, OpNode, sched.XMM, _.info128, NAME>,
+ avx512_vptest_mb<opc, OpcodeStr, OpNode, sched.XMM, _.info128>, EVEX_V128;
}
let Predicates = [HasAVX512, NoVLX] in {
- defm Z256_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info256, Suffix>;
- defm Z128_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info128, Suffix>;
+ defm Z256_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info256, NAME>;
+ defm Z128_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info128, NAME>;
}
}
-multiclass avx512_vptest_dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins> {
- defm D : avx512_vptest_dq_sizes<opc, OpcodeStr#"d", OpNode, itins,
- avx512vl_i32_info, "D">;
- defm Q : avx512_vptest_dq_sizes<opc, OpcodeStr#"q", OpNode, itins,
- avx512vl_i64_info, "Q">, VEX_W;
+multiclass avx512_vptest_dq<bits<8> opc, string OpcodeStr, PatFrag OpNode,
+ X86SchedWriteWidths sched> {
+ defm D : avx512_vptest_dq_sizes<opc, OpcodeStr#"d", OpNode, sched,
+ avx512vl_i32_info>;
+ defm Q : avx512_vptest_dq_sizes<opc, OpcodeStr#"q", OpNode, sched,
+ avx512vl_i64_info>, VEX_W;
}
multiclass avx512_vptest_wb<bits<8> opc, string OpcodeStr,
- SDNode OpNode, OpndItins itins> {
+ PatFrag OpNode, X86SchedWriteWidths sched> {
let Predicates = [HasBWI] in {
- defm WZ: avx512_vptest<opc, OpcodeStr#"w", OpNode, itins, v32i16_info>,
- EVEX_V512, VEX_W;
- defm BZ: avx512_vptest<opc, OpcodeStr#"b", OpNode, itins, v64i8_info>,
- EVEX_V512;
+ defm WZ: avx512_vptest<opc, OpcodeStr#"w", OpNode, sched.ZMM,
+ v32i16_info, NAME#"W">, EVEX_V512, VEX_W;
+ defm BZ: avx512_vptest<opc, OpcodeStr#"b", OpNode, sched.ZMM,
+ v64i8_info, NAME#"B">, EVEX_V512;
}
let Predicates = [HasVLX, HasBWI] in {
- defm WZ256: avx512_vptest<opc, OpcodeStr#"w", OpNode, itins, v16i16x_info>,
- EVEX_V256, VEX_W;
- defm WZ128: avx512_vptest<opc, OpcodeStr#"w", OpNode, itins, v8i16x_info>,
- EVEX_V128, VEX_W;
- defm BZ256: avx512_vptest<opc, OpcodeStr#"b", OpNode, itins, v32i8x_info>,
- EVEX_V256;
- defm BZ128: avx512_vptest<opc, OpcodeStr#"b", OpNode, itins, v16i8x_info>,
- EVEX_V128;
+ defm WZ256: avx512_vptest<opc, OpcodeStr#"w", OpNode, sched.YMM,
+ v16i16x_info, NAME#"W">, EVEX_V256, VEX_W;
+ defm WZ128: avx512_vptest<opc, OpcodeStr#"w", OpNode, sched.XMM,
+ v8i16x_info, NAME#"W">, EVEX_V128, VEX_W;
+ defm BZ256: avx512_vptest<opc, OpcodeStr#"b", OpNode, sched.YMM,
+ v32i8x_info, NAME#"B">, EVEX_V256;
+ defm BZ128: avx512_vptest<opc, OpcodeStr#"b", OpNode, sched.XMM,
+ v16i8x_info, NAME#"B">, EVEX_V128;
}
let Predicates = [HasAVX512, NoVLX] in {
- defm BZ256_Alt : avx512_vptest_lowering< OpNode, v64i8_info, v32i8x_info, "B">;
- defm BZ128_Alt : avx512_vptest_lowering< OpNode, v64i8_info, v16i8x_info, "B">;
- defm WZ256_Alt : avx512_vptest_lowering< OpNode, v32i16_info, v16i16x_info, "W">;
- defm WZ128_Alt : avx512_vptest_lowering< OpNode, v32i16_info, v8i16x_info, "W">;
+ defm BZ256_Alt : avx512_vptest_lowering<OpNode, v64i8_info, v32i8x_info, NAME#"B">;
+ defm BZ128_Alt : avx512_vptest_lowering<OpNode, v64i8_info, v16i8x_info, NAME#"B">;
+ defm WZ256_Alt : avx512_vptest_lowering<OpNode, v32i16_info, v16i16x_info, NAME#"W">;
+ defm WZ128_Alt : avx512_vptest_lowering<OpNode, v32i16_info, v8i16x_info, NAME#"W">;
}
}
-multiclass avx512_vptest_all_forms<bits<8> opc_wb, bits<8> opc_dq, string OpcodeStr,
- SDNode OpNode, OpndItins itins> :
- avx512_vptest_wb <opc_wb, OpcodeStr, OpNode, itins>,
- avx512_vptest_dq<opc_dq, OpcodeStr, OpNode, itins>;
+// These patterns are used to match vptestm/vptestnm. We don't treat pcmpeqm
+// as commutable here because we already canonicalized all zeros vectors to the
+// RHS during lowering.
+def X86pcmpeqm : PatFrag<(ops node:$src1, node:$src2),
+ (setcc node:$src1, node:$src2, SETEQ)>;
+def X86pcmpnem : PatFrag<(ops node:$src1, node:$src2),
+ (setcc node:$src1, node:$src2, SETNE)>;
-defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm", X86testm,
- SSE_BIT_ITINS_P>, T8PD;
-defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86testnm,
- SSE_BIT_ITINS_P>, T8XS;
+multiclass avx512_vptest_all_forms<bits<8> opc_wb, bits<8> opc_dq, string OpcodeStr,
+ PatFrag OpNode, X86SchedWriteWidths sched> :
+ avx512_vptest_wb<opc_wb, OpcodeStr, OpNode, sched>,
+ avx512_vptest_dq<opc_dq, OpcodeStr, OpNode, sched>;
+defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm", X86pcmpnem,
+ SchedWriteVecLogic>, T8PD;
+defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86pcmpeqm,
+ SchedWriteVecLogic>, T8XS;
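// With the dedicated X86testm/X86testnm nodes gone, vptestm is matched as a
// SETNE compare (X86pcmpnem) of (and $src1, $src2) against the all-zeros
// vector, and vptestnm as the corresponding SETEQ compare (X86pcmpeqm). The
// extra patterns inside avx512_vptest cover the common "test a value against
// itself" case, and avx512_vptest_lowering adds the masked variants plus the
// zmm-widening fallback for targets without VLX.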
//===----------------------------------------------------------------------===//
// AVX-512 Shift instructions
//===----------------------------------------------------------------------===//
+
multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
- string OpcodeStr, SDNode OpNode, OpndItins itins,
- X86VectorVTInfo _> {
+ string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm ri : AVX512_maskable<opc, ImmFormR, _, (outs _.RC:$dst),
(ins _.RC:$src1, u8imm:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))),
- itins.rr>, Sched<[itins.Sched]>;
+ (_.VT (OpNode _.RC:$src1, (i8 imm:$src2)))>,
+ Sched<[sched]>;
defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
(ins _.MemOp:$src1, u8imm:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
- (i8 imm:$src2))),
- itins.rm>, Sched<[itins.Sched.Folded]>;
+ (i8 imm:$src2)))>,
+ Sched<[sched.Folded]>;
}
}
multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM,
- string OpcodeStr, SDNode OpNode, OpndItins itins,
- X86VectorVTInfo _> {
+ string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in
defm mbi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src1, u8imm:$src2), OpcodeStr,
"$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2",
- (_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src1)), (i8 imm:$src2))),
- itins.rm>, EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src1)), (i8 imm:$src2)))>,
+ EVEX_B, Sched<[sched.Folded]>;
}
multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, ValueType SrcVT, PatFrag bc_frag,
- X86VectorVTInfo _> {
+ X86FoldableSchedWrite sched, ValueType SrcVT,
+ PatFrag bc_frag, X86VectorVTInfo _> {
// src2 is always 128-bit
let ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, VR128X:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (_.VT (OpNode _.RC:$src1, (SrcVT VR128X:$src2))),
- itins.rr>, AVX512BIBase, EVEX_4V, Sched<[itins.Sched]>;
+ (_.VT (OpNode _.RC:$src1, (SrcVT VR128X:$src2)))>,
+ AVX512BIBase, EVEX_4V, Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, i128mem:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (_.VT (OpNode _.RC:$src1, (bc_frag (loadv2i64 addr:$src2)))),
- itins.rm>, AVX512BIBase,
- EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.VT (OpNode _.RC:$src1, (bc_frag (loadv2i64 addr:$src2))))>,
+ AVX512BIBase,
+ EVEX_4V, Sched<[sched.Folded, ReadAfterLd]>;
}
}
multiclass avx512_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, ValueType SrcVT, PatFrag bc_frag,
- AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+ X86SchedWriteWidths sched, ValueType SrcVT,
+ PatFrag bc_frag, AVX512VLVectorVTInfo VTInfo,
+ Predicate prd> {
let Predicates = [prd] in
- defm Z : avx512_shift_rrm<opc, OpcodeStr, OpNode, itins, SrcVT, bc_frag,
- VTInfo.info512>, EVEX_V512,
- EVEX_CD8<VTInfo.info512.EltSize, CD8VQ> ;
+ defm Z : avx512_shift_rrm<opc, OpcodeStr, OpNode, sched.ZMM, SrcVT,
+ bc_frag, VTInfo.info512>, EVEX_V512,
+ EVEX_CD8<VTInfo.info512.EltSize, CD8VQ> ;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_shift_rrm<opc, OpcodeStr, OpNode, itins, SrcVT, bc_frag,
- VTInfo.info256>, EVEX_V256,
- EVEX_CD8<VTInfo.info256.EltSize, CD8VH>;
- defm Z128 : avx512_shift_rrm<opc, OpcodeStr, OpNode, itins, SrcVT, bc_frag,
- VTInfo.info128>, EVEX_V128,
- EVEX_CD8<VTInfo.info128.EltSize, CD8VF>;
+ defm Z256 : avx512_shift_rrm<opc, OpcodeStr, OpNode, sched.YMM, SrcVT,
+ bc_frag, VTInfo.info256>, EVEX_V256,
+ EVEX_CD8<VTInfo.info256.EltSize, CD8VH>;
+ defm Z128 : avx512_shift_rrm<opc, OpcodeStr, OpNode, sched.XMM, SrcVT,
+ bc_frag, VTInfo.info128>, EVEX_V128,
+ EVEX_CD8<VTInfo.info128.EltSize, CD8VF>;
}
}
multiclass avx512_shift_types<bits<8> opcd, bits<8> opcq, bits<8> opcw,
string OpcodeStr, SDNode OpNode,
- OpndItins itins> {
- defm D : avx512_shift_sizes<opcd, OpcodeStr#"d", OpNode, itins, v4i32,
+ X86SchedWriteWidths sched,
+ bit NotEVEX2VEXConvertibleQ = 0> {
+ defm D : avx512_shift_sizes<opcd, OpcodeStr#"d", OpNode, sched, v4i32,
bc_v4i32, avx512vl_i32_info, HasAVX512>;
- defm Q : avx512_shift_sizes<opcq, OpcodeStr#"q", OpNode, itins, v2i64,
+ let notEVEX2VEXConvertible = NotEVEX2VEXConvertibleQ in
+ defm Q : avx512_shift_sizes<opcq, OpcodeStr#"q", OpNode, sched, v2i64,
bc_v2i64, avx512vl_i64_info, HasAVX512>, VEX_W;
- defm W : avx512_shift_sizes<opcw, OpcodeStr#"w", OpNode, itins, v8i16,
+ defm W : avx512_shift_sizes<opcw, OpcodeStr#"w", OpNode, sched, v8i16,
bc_v2i64, avx512vl_i16_info, HasBWI>;
}
multiclass avx512_shift_rmi_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM,
string OpcodeStr, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo VTInfo> {
+ X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo VTInfo> {
let Predicates = [HasAVX512] in
- defm Z: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, itins,
- VTInfo.info512>,
- avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, itins,
- VTInfo.info512>, EVEX_V512;
+ defm Z: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+ sched.ZMM, VTInfo.info512>,
+ avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, sched.ZMM,
+ VTInfo.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
- defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, itins,
- VTInfo.info256>,
- avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, itins,
- VTInfo.info256>, EVEX_V256;
+ defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+ sched.YMM, VTInfo.info256>,
+ avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, sched.YMM,
+ VTInfo.info256>, EVEX_V256;
defm Z128: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
- itins, VTInfo.info128>,
- avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, itins,
- VTInfo.info128>, EVEX_V128;
+ sched.XMM, VTInfo.info128>,
+ avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, sched.XMM,
+ VTInfo.info128>, EVEX_V128;
}
}
-multiclass avx512_shift_rmi_w<bits<8> opcw,
- Format ImmFormR, Format ImmFormM,
- string OpcodeStr, SDNode OpNode,
- OpndItins itins> {
+multiclass avx512_shift_rmi_w<bits<8> opcw, Format ImmFormR, Format ImmFormM,
+ string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched> {
let Predicates = [HasBWI] in
defm WZ: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode,
- itins, v32i16_info>, EVEX_V512, VEX_WIG;
+ sched.ZMM, v32i16_info>, EVEX_V512, VEX_WIG;
let Predicates = [HasVLX, HasBWI] in {
defm WZ256: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode,
- itins, v16i16x_info>, EVEX_V256, VEX_WIG;
+ sched.YMM, v16i16x_info>, EVEX_V256, VEX_WIG;
defm WZ128: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode,
- itins, v8i16x_info>, EVEX_V128, VEX_WIG;
+ sched.XMM, v8i16x_info>, EVEX_V128, VEX_WIG;
}
}
multiclass avx512_shift_rmi_dq<bits<8> opcd, bits<8> opcq,
- Format ImmFormR, Format ImmFormM,
- string OpcodeStr, SDNode OpNode, OpndItins itins> {
+ Format ImmFormR, Format ImmFormM,
+ string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched,
+ bit NotEVEX2VEXConvertibleQ = 0> {
defm D: avx512_shift_rmi_sizes<opcd, ImmFormR, ImmFormM, OpcodeStr#"d", OpNode,
- itins, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+ sched, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+ let notEVEX2VEXConvertible = NotEVEX2VEXConvertibleQ in
defm Q: avx512_shift_rmi_sizes<opcq, ImmFormR, ImmFormM, OpcodeStr#"q", OpNode,
- itins, avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, VEX_W;
+ sched, avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, VEX_W;
}
defm VPSRL : avx512_shift_rmi_dq<0x72, 0x73, MRM2r, MRM2m, "vpsrl", X86vsrli,
- SSE_INTSHIFT_P>,
+ SchedWriteVecShiftImm>,
avx512_shift_rmi_w<0x71, MRM2r, MRM2m, "vpsrlw", X86vsrli,
- SSE_INTSHIFT_P>, AVX512BIi8Base, EVEX_4V;
+ SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V;
defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli,
- SSE_INTSHIFT_P>,
+ SchedWriteVecShiftImm>,
avx512_shift_rmi_w<0x71, MRM6r, MRM6m, "vpsllw", X86vshli,
- SSE_INTSHIFT_P>, AVX512BIi8Base, EVEX_4V;
+ SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V;
defm VPSRA : avx512_shift_rmi_dq<0x72, 0x72, MRM4r, MRM4m, "vpsra", X86vsrai,
- SSE_INTSHIFT_P>,
+ SchedWriteVecShiftImm, 1>,
avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai,
- SSE_INTSHIFT_P>, AVX512BIi8Base, EVEX_4V;
+ SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V;
defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", X86vrotri,
- SSE_INTSHIFT_P>, AVX512BIi8Base, EVEX_4V;
+ SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V;
defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", X86vrotli,
- SSE_INTSHIFT_P>, AVX512BIi8Base, EVEX_4V;
+ SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V;
-defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl, SSE_INTSHIFT_P>;
-defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra, SSE_INTSHIFT_P>;
-defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl, SSE_INTSHIFT_P>;
+defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl,
+ SchedWriteVecShift>;
+defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra,
+ SchedWriteVecShift, 1>;
+defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl,
+ SchedWriteVecShift>;
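// The NotEVEX2VEXConvertibleQ bit threaded through avx512_shift_rmi_dq and
// avx512_shift_types is applied (via `let notEVEX2VEXConvertible = ...`) only
// to the q-element instantiation. vpsra passes 1 because the 64-bit arithmetic
// shifts (VPSRAQ and its immediate form) are AVX-512 only, so their
// 128/256-bit EVEX forms must not be rewritten into the VEX-encoded
// doubleword forms by the compression pass.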
// Use the 512-bit VPSRA/VPSRAI version to implement v2i64/v4i64 in the NoVLX case.
let Predicates = [HasAVX512, NoVLX] in {
@@ -5479,59 +5978,57 @@ let Predicates = [HasAVX512, NoVLX] in {
//===-------------------------------------------------------------------===//
// Variable Bit Shifts
//===-------------------------------------------------------------------===//
+
multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (_.VT (OpNode _.RC:$src1, (_.VT _.RC:$src2))),
- itins.rr>, AVX5128IBase, EVEX_4V,
- Sched<[itins.Sched]>;
+ (_.VT (OpNode _.RC:$src1, (_.VT _.RC:$src2)))>,
+ AVX5128IBase, EVEX_4V, Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1,
- (_.VT (bitconvert (_.LdFrag addr:$src2))))),
- itins.rm>, AVX5128IBase, EVEX_4V,
- EVEX_CD8<_.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.VT (bitconvert (_.LdFrag addr:$src2)))))>,
+ AVX5128IBase, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in
defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
(_.VT (OpNode _.RC:$src1, (_.VT (X86VBroadcast
- (_.ScalarLdFrag addr:$src2))))),
- itins.rm>, AVX5128IBase, EVEX_B,
- EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.ScalarLdFrag addr:$src2)))))>,
+ AVX5128IBase, EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass avx512_var_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo _> {
+ X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in
- defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info512>,
- avx512_var_shift_mb<opc, OpcodeStr, OpNode, itins, _.info512>, EVEX_V512;
+ defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, sched.ZMM, _.info512>,
+ avx512_var_shift_mb<opc, OpcodeStr, OpNode, sched.ZMM, _.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
- defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info256>,
- avx512_var_shift_mb<opc, OpcodeStr, OpNode, itins, _.info256>, EVEX_V256;
- defm Z128 : avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info128>,
- avx512_var_shift_mb<opc, OpcodeStr, OpNode, itins, _.info128>, EVEX_V128;
+ defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, sched.YMM, _.info256>,
+ avx512_var_shift_mb<opc, OpcodeStr, OpNode, sched.YMM, _.info256>, EVEX_V256;
+ defm Z128 : avx512_var_shift<opc, OpcodeStr, OpNode, sched.XMM, _.info128>,
+ avx512_var_shift_mb<opc, OpcodeStr, OpNode, sched.XMM, _.info128>, EVEX_V128;
}
}
multiclass avx512_var_shift_types<bits<8> opc, string OpcodeStr,
- SDNode OpNode, OpndItins itins> {
- defm D : avx512_var_shift_sizes<opc, OpcodeStr#"d", OpNode, itins,
+ SDNode OpNode, X86SchedWriteWidths sched> {
+ defm D : avx512_var_shift_sizes<opc, OpcodeStr#"d", OpNode, sched,
avx512vl_i32_info>;
- defm Q : avx512_var_shift_sizes<opc, OpcodeStr#"q", OpNode, itins,
+ defm Q : avx512_var_shift_sizes<opc, OpcodeStr#"q", OpNode, sched,
avx512vl_i64_info>, VEX_W;
}
@@ -5557,30 +6054,30 @@ multiclass avx512_var_shift_lowering<AVX512VLVectorVTInfo _, string OpcodeStr,
}
}
multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr,
- SDNode OpNode, OpndItins itins> {
+ SDNode OpNode, X86SchedWriteWidths sched> {
let Predicates = [HasBWI] in
- defm WZ: avx512_var_shift<opc, OpcodeStr, OpNode, itins, v32i16_info>,
+ defm WZ: avx512_var_shift<opc, OpcodeStr, OpNode, sched.ZMM, v32i16_info>,
EVEX_V512, VEX_W;
let Predicates = [HasVLX, HasBWI] in {
- defm WZ256: avx512_var_shift<opc, OpcodeStr, OpNode, itins, v16i16x_info>,
+ defm WZ256: avx512_var_shift<opc, OpcodeStr, OpNode, sched.YMM, v16i16x_info>,
EVEX_V256, VEX_W;
- defm WZ128: avx512_var_shift<opc, OpcodeStr, OpNode, itins, v8i16x_info>,
+ defm WZ128: avx512_var_shift<opc, OpcodeStr, OpNode, sched.XMM, v8i16x_info>,
EVEX_V128, VEX_W;
}
}
-defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl, SSE_INTSHIFT_P>,
- avx512_var_shift_w<0x12, "vpsllvw", shl, SSE_INTSHIFT_P>;
+defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl, SchedWriteVarVecShift>,
+ avx512_var_shift_w<0x12, "vpsllvw", shl, SchedWriteVarVecShift>;
-defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra, SSE_INTSHIFT_P>,
- avx512_var_shift_w<0x11, "vpsravw", sra, SSE_INTSHIFT_P>;
+defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra, SchedWriteVarVecShift>,
+ avx512_var_shift_w<0x11, "vpsravw", sra, SchedWriteVarVecShift>;
-defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl, SSE_INTSHIFT_P>,
- avx512_var_shift_w<0x10, "vpsrlvw", srl, SSE_INTSHIFT_P>;
+defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl, SchedWriteVarVecShift>,
+ avx512_var_shift_w<0x10, "vpsrlvw", srl, SchedWriteVarVecShift>;
-defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr, SSE_INTSHIFT_P>;
-defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl, SSE_INTSHIFT_P>;
+defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr, SchedWriteVarVecShift>;
+defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl, SchedWriteVarVecShift>;
defm : avx512_var_shift_lowering<avx512vl_i64_info, "VPSRAVQ", sra, [HasAVX512, NoVLX]>;
defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", shl, [HasBWI, NoVLX]>;
@@ -5651,7 +6148,6 @@ defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v2i64x_info, [HasVLX]>;
defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v4i64x_info, [HasVLX]>;
defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v8i64_info, [HasAVX512]>;
-
// Use the 512-bit VPROL/VPROLI versions to implement v2i64/v4i64 + v4i32/v8i32 in the NoVLX case.
let Predicates = [HasAVX512, NoVLX] in {
def : Pat<(v2i64 (rotl (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
@@ -5757,87 +6253,89 @@ let Predicates = [HasAVX512, NoVLX] in {
//===-------------------------------------------------------------------===//
// 1-src variable permutation VPERMW/D/Q
//===-------------------------------------------------------------------===//
+
multiclass avx512_vperm_dq_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo _> {
+ X86FoldableSchedWrite sched, AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in
- defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info512>,
- avx512_var_shift_mb<opc, OpcodeStr, OpNode, itins, _.info512>, EVEX_V512;
+ defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, sched, _.info512>,
+ avx512_var_shift_mb<opc, OpcodeStr, OpNode, sched, _.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in
- defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info256>,
- avx512_var_shift_mb<opc, OpcodeStr, OpNode, itins, _.info256>, EVEX_V256;
+ defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, sched, _.info256>,
+ avx512_var_shift_mb<opc, OpcodeStr, OpNode, sched, _.info256>, EVEX_V256;
}
multiclass avx512_vpermi_dq_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM,
string OpcodeStr, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo VTInfo> {
+ X86FoldableSchedWrite sched, AVX512VLVectorVTInfo VTInfo> {
let Predicates = [HasAVX512] in
defm Z: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
- itins, VTInfo.info512>,
+ sched, VTInfo.info512>,
avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
- itins, VTInfo.info512>, EVEX_V512;
+ sched, VTInfo.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in
defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
- itins, VTInfo.info256>,
+ sched, VTInfo.info256>,
avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
- itins, VTInfo.info256>, EVEX_V256;
+ sched, VTInfo.info256>, EVEX_V256;
}
multiclass avx512_vperm_bw<bits<8> opc, string OpcodeStr,
Predicate prd, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo _> {
+ X86FoldableSchedWrite sched, AVX512VLVectorVTInfo _> {
let Predicates = [prd] in
- defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info512>,
+ defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, sched, _.info512>,
EVEX_V512 ;
let Predicates = [HasVLX, prd] in {
- defm Z256: avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info256>,
+ defm Z256: avx512_var_shift<opc, OpcodeStr, OpNode, sched, _.info256>,
EVEX_V256 ;
- defm Z128: avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info128>,
+ defm Z128: avx512_var_shift<opc, OpcodeStr, OpNode, sched, _.info128>,
EVEX_V128 ;
}
}
defm VPERMW : avx512_vperm_bw<0x8D, "vpermw", HasBWI, X86VPermv,
- AVX2_PERMV_I, avx512vl_i16_info>, VEX_W;
+ WriteVarShuffle256, avx512vl_i16_info>, VEX_W;
defm VPERMB : avx512_vperm_bw<0x8D, "vpermb", HasVBMI, X86VPermv,
- AVX2_PERMV_I, avx512vl_i8_info>;
+ WriteVarShuffle256, avx512vl_i8_info>;
defm VPERMD : avx512_vperm_dq_sizes<0x36, "vpermd", X86VPermv,
- AVX2_PERMV_I, avx512vl_i32_info>;
+ WriteVarShuffle256, avx512vl_i32_info>;
defm VPERMQ : avx512_vperm_dq_sizes<0x36, "vpermq", X86VPermv,
- AVX2_PERMV_I, avx512vl_i64_info>, VEX_W;
+ WriteVarShuffle256, avx512vl_i64_info>, VEX_W;
defm VPERMPS : avx512_vperm_dq_sizes<0x16, "vpermps", X86VPermv,
- AVX2_PERMV_F, avx512vl_f32_info>;
+ WriteFVarShuffle256, avx512vl_f32_info>;
defm VPERMPD : avx512_vperm_dq_sizes<0x16, "vpermpd", X86VPermv,
- AVX2_PERMV_F, avx512vl_f64_info>, VEX_W;
+ WriteFVarShuffle256, avx512vl_f64_info>, VEX_W;
defm VPERMQ : avx512_vpermi_dq_sizes<0x00, MRMSrcReg, MRMSrcMem, "vpermq",
- X86VPermi, AVX2_PERMV_I, avx512vl_i64_info>,
+ X86VPermi, WriteShuffle256, avx512vl_i64_info>,
EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W;
defm VPERMPD : avx512_vpermi_dq_sizes<0x01, MRMSrcReg, MRMSrcMem, "vpermpd",
- X86VPermi, AVX2_PERMV_F, avx512vl_f64_info>,
+ X86VPermi, WriteFShuffle256, avx512vl_f64_info>,
EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W;
+
//===----------------------------------------------------------------------===//
// AVX-512 - VPERMIL
//===----------------------------------------------------------------------===//
multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _,
X86VectorVTInfo Ctrl> {
defm rr: AVX512_maskable<OpcVar, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, Ctrl.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1,
- (Ctrl.VT Ctrl.RC:$src2))), itins.rr>,
- T8PD, EVEX_4V, Sched<[itins.Sched]>;
+ (Ctrl.VT Ctrl.RC:$src2)))>,
+ T8PD, EVEX_4V, Sched<[sched]>;
defm rm: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, Ctrl.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode
_.RC:$src1,
- (Ctrl.VT (bitconvert(Ctrl.LdFrag addr:$src2))))),
- itins.rm>, T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (Ctrl.VT (bitconvert(Ctrl.LdFrag addr:$src2)))))>,
+ T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, ReadAfterLd]>;
defm rmb: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"${src2}"##_.BroadcastStr##", $src1",
@@ -5845,31 +6343,33 @@ multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode,
(_.VT (OpNode
_.RC:$src1,
(Ctrl.VT (X86VBroadcast
- (Ctrl.ScalarLdFrag addr:$src2))))),
- itins.rm>, T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (Ctrl.ScalarLdFrag addr:$src2)))))>,
+ T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass avx512_permil_vec_common<string OpcodeStr, bits<8> OpcVar,
- OpndItins itins, AVX512VLVectorVTInfo _,
+ X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo _,
AVX512VLVectorVTInfo Ctrl> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, itins,
+ defm Z : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, sched.ZMM,
_.info512, Ctrl.info512>, EVEX_V512;
}
let Predicates = [HasAVX512, HasVLX] in {
- defm Z128 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, itins,
+ defm Z128 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, sched.XMM,
_.info128, Ctrl.info128>, EVEX_V128;
- defm Z256 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, itins,
+ defm Z256 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, sched.YMM,
_.info256, Ctrl.info256>, EVEX_V256;
}
}
multiclass avx512_permil<string OpcodeStr, bits<8> OpcImm, bits<8> OpcVar,
AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{
- defm NAME: avx512_permil_vec_common<OpcodeStr, OpcVar, AVX_VPERMILV, _, Ctrl>;
+ defm NAME: avx512_permil_vec_common<OpcodeStr, OpcVar, SchedWriteFVarShuffle,
+ _, Ctrl>;
defm NAME: avx512_shift_rmi_sizes<OpcImm, MRMSrcReg, MRMSrcMem, OpcodeStr,
- X86VPermilpi, AVX_VPERMILV, _>,
+ X86VPermilpi, SchedWriteFShuffle, _>,
EVEX, AVX512AIi8Base, EVEX_CD8<_.info128.EltSize, CD8VF>;
}
@@ -5878,54 +6378,68 @@ defm VPERMILPS : avx512_permil<"vpermilps", 0x04, 0x0C, avx512vl_f32_info,
avx512vl_i32_info>;
let ExeDomain = SSEPackedDouble in
defm VPERMILPD : avx512_permil<"vpermilpd", 0x05, 0x0D, avx512vl_f64_info,
- avx512vl_i64_info>, VEX_W;
+ avx512vl_i64_info>, VEX_W1X;
//===----------------------------------------------------------------------===//
// AVX-512 - VPSHUFD, VPSHUFLW, VPSHUFHW
//===----------------------------------------------------------------------===//
defm VPSHUFD : avx512_shift_rmi_sizes<0x70, MRMSrcReg, MRMSrcMem, "vpshufd",
- X86PShufd, SSE_PSHUF, avx512vl_i32_info>,
+ X86PShufd, SchedWriteShuffle, avx512vl_i32_info>,
EVEX, AVX512BIi8Base, EVEX_CD8<32, CD8VF>;
defm VPSHUFH : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshufhw",
- X86PShufhw, SSE_PSHUF>, EVEX, AVX512XSIi8Base;
+ X86PShufhw, SchedWriteShuffle>,
+ EVEX, AVX512XSIi8Base;
defm VPSHUFL : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshuflw",
- X86PShuflw, SSE_PSHUF>, EVEX, AVX512XDIi8Base;
+ X86PShuflw, SchedWriteShuffle>,
+ EVEX, AVX512XDIi8Base;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - VPSHUFB
+//===----------------------------------------------------------------------===//
multiclass avx512_pshufb_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins> {
+ X86SchedWriteWidths sched> {
let Predicates = [HasBWI] in
- defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, itins, v64i8_info>, EVEX_V512;
+ defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, sched.ZMM, v64i8_info>,
+ EVEX_V512;
let Predicates = [HasVLX, HasBWI] in {
- defm Z256: avx512_var_shift<opc, OpcodeStr, OpNode, itins, v32i8x_info>, EVEX_V256;
- defm Z128: avx512_var_shift<opc, OpcodeStr, OpNode, itins, v16i8x_info>, EVEX_V128;
+ defm Z256: avx512_var_shift<opc, OpcodeStr, OpNode, sched.YMM, v32i8x_info>,
+ EVEX_V256;
+ defm Z128: avx512_var_shift<opc, OpcodeStr, OpNode, sched.XMM, v16i8x_info>,
+ EVEX_V128;
}
}
-defm VPSHUFB: avx512_pshufb_sizes<0x00, "vpshufb", X86pshufb, SSE_PSHUFB>, VEX_WIG;
+defm VPSHUFB: avx512_pshufb_sizes<0x00, "vpshufb", X86pshufb,
+ SchedWriteVarShuffle>, VEX_WIG;
//===----------------------------------------------------------------------===//
// Move Low to High and High to Low packed FP Instructions
//===----------------------------------------------------------------------===//
+
def VMOVLHPSZrr : AVX512PSI<0x16, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
"vmovlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR128X:$dst, (v4f32 (X86Movlhps VR128X:$src1, VR128X:$src2)))],
- IIC_SSE_MOV_LH>, EVEX_4V;
+ [(set VR128X:$dst, (v4f32 (X86Movlhps VR128X:$src1, VR128X:$src2)))]>,
+ Sched<[SchedWriteFShuffle.XMM]>, EVEX_4V;
+let isCommutable = 1 in
def VMOVHLPSZrr : AVX512PSI<0x12, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
"vmovhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR128X:$dst, (v4f32 (X86Movhlps VR128X:$src1, VR128X:$src2)))],
- IIC_SSE_MOV_LH>, EVEX_4V;
+ [(set VR128X:$dst, (v4f32 (X86Movhlps VR128X:$src1, VR128X:$src2)))]>,
+ Sched<[SchedWriteFShuffle.XMM]>, EVEX_4V, NotMemoryFoldable;
//===----------------------------------------------------------------------===//
// VMOVHPS/PD VMOVLPS Instructions
// All patterns were taken from the SSE implementation.
//===----------------------------------------------------------------------===//
-multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+
+multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNode,
X86VectorVTInfo _> {
- let ExeDomain = _.ExeDomain in
+ let hasSideEffects = 0, mayLoad = 1, ExeDomain = _.ExeDomain in
def rm : AVX512<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src1, f64mem:$src2),
!strconcat(OpcodeStr,
@@ -5933,71 +6447,57 @@ multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set _.RC:$dst,
(OpNode _.RC:$src1,
(_.VT (bitconvert
- (v2f64 (scalar_to_vector (loadf64 addr:$src2)))))))],
- IIC_SSE_MOV_LH>, EVEX_4V;
+ (v2f64 (scalar_to_vector (loadf64 addr:$src2)))))))]>,
+ Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>, EVEX_4V;
}
-defm VMOVHPSZ128 : avx512_mov_hilo_packed<0x16, "vmovhps", X86Movlhps,
+// No patterns for MOVLPS/MOVHPS as the Movlhps node should only be created in
+// SSE1, and the MOVLPS pattern is even more complex.
+defm VMOVHPSZ128 : avx512_mov_hilo_packed<0x16, "vmovhps", null_frag,
v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS;
defm VMOVHPDZ128 : avx512_mov_hilo_packed<0x16, "vmovhpd", X86Unpckl,
v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W;
-defm VMOVLPSZ128 : avx512_mov_hilo_packed<0x12, "vmovlps", X86Movlps,
+defm VMOVLPSZ128 : avx512_mov_hilo_packed<0x12, "vmovlps", null_frag,
v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS;
-defm VMOVLPDZ128 : avx512_mov_hilo_packed<0x12, "vmovlpd", X86Movlpd,
+defm VMOVLPDZ128 : avx512_mov_hilo_packed<0x12, "vmovlpd", X86Movsd,
v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W;
let Predicates = [HasAVX512] in {
- // VMOVHPS patterns
- def : Pat<(X86Movlhps VR128X:$src1,
- (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
- (VMOVHPSZ128rm VR128X:$src1, addr:$src2)>;
- def : Pat<(X86Movlhps VR128X:$src1,
- (bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
- (VMOVHPSZ128rm VR128X:$src1, addr:$src2)>;
// VMOVHPD patterns
def : Pat<(v2f64 (X86Unpckl VR128X:$src1,
(bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
(VMOVHPDZ128rm VR128X:$src1, addr:$src2)>;
- // VMOVLPS patterns
- def : Pat<(v4f32 (X86Movlps VR128X:$src1, (load addr:$src2))),
- (VMOVLPSZ128rm VR128X:$src1, addr:$src2)>;
- // VMOVLPD patterns
- def : Pat<(v2f64 (X86Movlpd VR128X:$src1, (load addr:$src2))),
- (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>;
- def : Pat<(v2f64 (X86Movsd VR128X:$src1,
- (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
- (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>;
}
+let SchedRW = [WriteFStore] in {
def VMOVHPSZ128mr : AVX512PSI<0x17, MRMDestMem, (outs),
(ins f64mem:$dst, VR128X:$src),
"vmovhps\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt
(X86Unpckh (bc_v2f64 (v4f32 VR128X:$src)),
(bc_v2f64 (v4f32 VR128X:$src))),
- (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>,
+ (iPTR 0))), addr:$dst)]>,
EVEX, EVEX_CD8<32, CD8VT2>;
def VMOVHPDZ128mr : AVX512PDI<0x17, MRMDestMem, (outs),
(ins f64mem:$dst, VR128X:$src),
"vmovhpd\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt
(v2f64 (X86Unpckh VR128X:$src, VR128X:$src)),
- (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>,
+ (iPTR 0))), addr:$dst)]>,
EVEX, EVEX_CD8<64, CD8VT1>, VEX_W;
def VMOVLPSZ128mr : AVX512PSI<0x13, MRMDestMem, (outs),
(ins f64mem:$dst, VR128X:$src),
"vmovlps\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt (bc_v2f64 (v4f32 VR128X:$src)),
- (iPTR 0))), addr:$dst)],
- IIC_SSE_MOV_LH>,
+ (iPTR 0))), addr:$dst)]>,
EVEX, EVEX_CD8<32, CD8VT2>;
def VMOVLPDZ128mr : AVX512PDI<0x13, MRMDestMem, (outs),
(ins f64mem:$dst, VR128X:$src),
"vmovlpd\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt (v2f64 VR128X:$src),
- (iPTR 0))), addr:$dst)],
- IIC_SSE_MOV_LH>,
+ (iPTR 0))), addr:$dst)]>,
EVEX, EVEX_CD8<64, CD8VT1>, VEX_W;
+} // SchedRW
let Predicates = [HasAVX512] in {
// VMOVHPD patterns
@@ -6005,77 +6505,75 @@ let Predicates = [HasAVX512] in {
(v2f64 (X86VPermilpi VR128X:$src, (i8 1))),
(iPTR 0))), addr:$dst),
(VMOVHPDZ128mr addr:$dst, VR128X:$src)>;
- // VMOVLPS patterns
- def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128X:$src2)),
- addr:$src1),
- (VMOVLPSZ128mr addr:$src1, VR128X:$src2)>;
- // VMOVLPD patterns
- def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128X:$src2)),
- addr:$src1),
- (VMOVLPDZ128mr addr:$src1, VR128X:$src2)>;
}
//===----------------------------------------------------------------------===//
// FMA - Fused Multiply Operations
//
multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), NoItinerary, 1, 1>,
- AVX512FMA3Base, Sched<[WriteFMA]>;
+ (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), 1, 1>,
+ AVX512FMA3Base, Sched<[sched]>;
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))),
- NoItinerary, 1, 0>, AVX512FMA3Base, Sched<[WriteFMALd, ReadAfterLd]>;
+ (_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))), 1, 0>,
+ AVX512FMA3Base, Sched<[sched.Folded, ReadAfterLd]>;
defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(OpNode _.RC:$src2,
- _.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))),
- NoItinerary, 1, 0>, AVX512FMA3Base, EVEX_B,
- Sched<[WriteFMALd, ReadAfterLd]>;
+ _.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))), 1, 0>,
+ AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>;
}
}
multiclass avx512_fma3_213_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in
defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
- (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 imm:$rc))),
- NoItinerary, 1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[WriteFMA]>;
+ (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 imm:$rc))), 1, 1>,
+ AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fma3p_213_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, AVX512VLVectorVTInfo _,
- string Suff> {
+ SDNode OpNodeRnd, X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo _, string Suff> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info512, Suff>,
- avx512_fma3_213_round<opc, OpcodeStr, OpNodeRnd, _.info512,
- Suff>, EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
+ defm Z : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, sched.ZMM,
+ _.info512, Suff>,
+ avx512_fma3_213_round<opc, OpcodeStr, OpNodeRnd, sched.ZMM,
+ _.info512, Suff>,
+ EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
}
let Predicates = [HasVLX, HasAVX512] in {
- defm Z256 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info256, Suff>,
+ defm Z256 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, sched.YMM,
+ _.info256, Suff>,
EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
- defm Z128 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info128, Suff>,
+ defm Z128 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, sched.XMM,
+ _.info128, Suff>,
EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
}
}
multiclass avx512_fma3p_213_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd > {
+ SDNode OpNodeRnd> {
defm PS : avx512_fma3p_213_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd,
- avx512vl_f32_info, "PS">;
+ SchedWriteFMA, avx512vl_f32_info, "PS">;
defm PD : avx512_fma3p_213_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd,
- avx512vl_f64_info, "PD">, VEX_W;
+ SchedWriteFMA, avx512vl_f64_info, "PD">,
+ VEX_W;
}
defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", X86Fmadd, X86FmaddRnd>;
@@ -6087,19 +6585,20 @@ defm VFNMSUB213 : avx512_fma3p_213_f<0xAE, "vfnmsub213", X86Fnmsub, X86FnmsubR
multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), NoItinerary, 1, 1,
- vselect, 1>, AVX512FMA3Base, Sched<[WriteFMA]>;
+ (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1,
+ vselect, 1>, AVX512FMA3Base, Sched<[sched]>;
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)),
- NoItinerary, 1, 0>, AVX512FMA3Base, Sched<[WriteFMALd, ReadAfterLd]>;
+ (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)), 1, 0>,
+ AVX512FMA3Base, Sched<[sched.Folded, ReadAfterLd]>;
defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
@@ -6107,34 +6606,39 @@ multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
"$src2, ${src3}"##_.BroadcastStr,
(_.VT (OpNode _.RC:$src2,
(_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
- _.RC:$src1)), NoItinerary, 1, 0>, AVX512FMA3Base, EVEX_B,
- Sched<[WriteFMALd, ReadAfterLd]>;
+ _.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
multiclass avx512_fma3_231_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in
defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
(_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc))),
- NoItinerary, 1, 1, vselect, 1>,
- AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[WriteFMA]>;
+ 1, 1, vselect, 1>,
+ AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fma3p_231_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, AVX512VLVectorVTInfo _,
- string Suff> {
+ SDNode OpNodeRnd, X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo _, string Suff> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info512, Suff>,
- avx512_fma3_231_round<opc, OpcodeStr, OpNodeRnd, _.info512,
- Suff>, EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
+ defm Z : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, sched.ZMM,
+ _.info512, Suff>,
+ avx512_fma3_231_round<opc, OpcodeStr, OpNodeRnd, sched.ZMM,
+ _.info512, Suff>,
+ EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
}
let Predicates = [HasVLX, HasAVX512] in {
- defm Z256 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info256, Suff>,
+ defm Z256 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, sched.YMM,
+ _.info256, Suff>,
EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
- defm Z128 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info128, Suff>,
+ defm Z128 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, sched.XMM,
+ _.info128, Suff>,
EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
}
}
@@ -6142,9 +6646,10 @@ multiclass avx512_fma3p_231_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
multiclass avx512_fma3p_231_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode OpNodeRnd > {
defm PS : avx512_fma3p_231_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd,
- avx512vl_f32_info, "PS">;
+ SchedWriteFMA, avx512vl_f32_info, "PS">;
defm PD : avx512_fma3p_231_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd,
- avx512vl_f64_info, "PD">, VEX_W;
+ SchedWriteFMA, avx512vl_f64_info, "PD">,
+ VEX_W;
}
defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", X86Fmadd, X86FmaddRnd>;
@@ -6155,21 +6660,22 @@ defm VFNMADD231 : avx512_fma3p_231_f<0xBC, "vfnmadd231", X86Fnmadd, X86FnmaddR
defm VFNMSUB231 : avx512_fma3p_231_f<0xBE, "vfnmsub231", X86Fnmsub, X86FnmsubRnd>;
multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), NoItinerary,
- 1, 1, vselect, 1>, AVX512FMA3Base, Sched<[WriteFMA]>;
+ (_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), 1, 1, vselect, 1>,
+ AVX512FMA3Base, Sched<[sched]>;
  // Pattern is in 312 order so that the load is in a different place from the
  // 213 and 231 patterns; this helps tablegen's duplicate pattern detection.
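  // (Editorial example, not in the patch: for the 132 form dst = src1*src3+src2,
  // so with the load folded into $src3 the node is written as
  // (OpNode (load $src3), $src1, $src2), i.e. the "312" operand order.)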
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)),
- NoItinerary, 1, 0>, AVX512FMA3Base, Sched<[WriteFMALd, ReadAfterLd]>;
+ (_.VT (OpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)), 1, 0>,
+ AVX512FMA3Base, Sched<[sched.Folded, ReadAfterLd]>;
  // Pattern is in 312 order so that the load is in a different place from the
  // 213 and 231 patterns; this helps tablegen's duplicate pattern detection.
@@ -6178,34 +6684,39 @@ multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr, "${src3}"##_.BroadcastStr##", $src2",
"$src2, ${src3}"##_.BroadcastStr,
(_.VT (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
- _.RC:$src1, _.RC:$src2)), NoItinerary, 1, 0>,
- AVX512FMA3Base, EVEX_B, Sched<[WriteFMALd, ReadAfterLd]>;
+ _.RC:$src1, _.RC:$src2)), 1, 0>,
+ AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>;
}
}
multiclass avx512_fma3_132_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in
defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
(_.VT ( OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 imm:$rc))),
- NoItinerary, 1, 1, vselect, 1>,
- AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[WriteFMA]>;
+ 1, 1, vselect, 1>,
+ AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fma3p_132_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, AVX512VLVectorVTInfo _,
- string Suff> {
+ SDNode OpNodeRnd, X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo _, string Suff> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info512, Suff>,
- avx512_fma3_132_round<opc, OpcodeStr, OpNodeRnd, _.info512,
- Suff>, EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
+ defm Z : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, sched.ZMM,
+ _.info512, Suff>,
+ avx512_fma3_132_round<opc, OpcodeStr, OpNodeRnd, sched.ZMM,
+ _.info512, Suff>,
+ EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
}
let Predicates = [HasVLX, HasAVX512] in {
- defm Z256 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info256, Suff>,
+ defm Z256 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, sched.YMM,
+ _.info256, Suff>,
EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
- defm Z128 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info128, Suff>,
+ defm Z128 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, sched.XMM,
+ _.info128, Suff>,
EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
}
}
@@ -6213,9 +6724,10 @@ multiclass avx512_fma3p_132_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
multiclass avx512_fma3p_132_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode OpNodeRnd > {
defm PS : avx512_fma3p_132_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd,
- avx512vl_f32_info, "PS">;
+ SchedWriteFMA, avx512vl_f32_info, "PS">;
defm PD : avx512_fma3p_132_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd,
- avx512vl_f64_info, "PD">, VEX_W;
+ SchedWriteFMA, avx512vl_f64_info, "PD">,
+ VEX_W;
}
defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", X86Fmadd, X86FmaddRnd>;
@@ -6227,129 +6739,337 @@ defm VFNMSUB132 : avx512_fma3p_132_f<0x9E, "vfnmsub132", X86Fnmsub, X86FnmsubR
// Scalar FMA
multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- dag RHS_VEC_r, dag RHS_VEC_m, dag RHS_VEC_rb,
- dag RHS_r, dag RHS_m, bit MaskOnlyReg> {
+ dag RHS_r, dag RHS_m, dag RHS_b, bit MaskOnlyReg> {
let Constraints = "$src1 = $dst", hasSideEffects = 0 in {
defm r_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3), OpcodeStr,
- "$src3, $src2", "$src2, $src3", RHS_VEC_r, NoItinerary, 1, 1>,
- AVX512FMA3Base, Sched<[WriteFMA]>;
+ "$src3, $src2", "$src2, $src3", (null_frag), 1, 1>,
+ AVX512FMA3Base, Sched<[SchedWriteFMA.Scl]>;
+ let mayLoad = 1 in
defm m_Int: AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.IntScalarMemOp:$src3), OpcodeStr,
- "$src3, $src2", "$src2, $src3", RHS_VEC_m, NoItinerary, 1, 1>,
- AVX512FMA3Base, Sched<[WriteFMALd, ReadAfterLd]>;
+ "$src3, $src2", "$src2, $src3", (null_frag), 1, 1>,
+ AVX512FMA3Base, Sched<[SchedWriteFMA.Scl.Folded, ReadAfterLd]>;
defm rb_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
- OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", RHS_VEC_rb,
- NoItinerary, 1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC,
- Sched<[WriteFMA]>;
+ OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", (null_frag), 1, 1>,
+ AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[SchedWriteFMA.Scl]>;
let isCodeGenOnly = 1, isCommutable = 1 in {
def r : AVX512FMA3S<opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- !if(MaskOnlyReg, [], [RHS_r])>, Sched<[WriteFMA]>;
+ !if(MaskOnlyReg, [], [RHS_r])>, Sched<[SchedWriteFMA.Scl]>;
def m : AVX512FMA3S<opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2, _.ScalarMemOp:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [RHS_m]>, Sched<[WriteFMALd, ReadAfterLd]>;
+ [RHS_m]>, Sched<[SchedWriteFMA.Scl.Folded, ReadAfterLd]>;
+
+ def rb : AVX512FMA3S<opc, MRMSrcReg, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3, AVX512RC:$rc),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !if(MaskOnlyReg, [], [RHS_b])>, EVEX_B, EVEX_RC,
+ Sched<[SchedWriteFMA.Scl]>;
}// isCodeGenOnly = 1
}// Constraints = "$src1 = $dst"
}
multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
- string OpcodeStr, SDNode OpNode, SDNode OpNodes1,
- SDNode OpNodeRnds1, SDNode OpNodes3,
- SDNode OpNodeRnds3, X86VectorVTInfo _,
- string SUFF> {
+ string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd,
+ X86VectorVTInfo _, string SUFF> {
let ExeDomain = _.ExeDomain in {
defm NAME#213#SUFF#Z: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix, _,
  // Operands for the intrinsic are in 123 order to preserve passthru
  // semantics.
- (_.VT (OpNodes1 _.RC:$src1, _.RC:$src2, _.RC:$src3)),
- (_.VT (OpNodes1 _.RC:$src1, _.RC:$src2,
- _.ScalarIntMemCPat:$src3)),
- (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3,
- (i32 imm:$rc))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
_.FRC:$src3))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
- (_.ScalarLdFrag addr:$src3)))), 0>;
+ (_.ScalarLdFrag addr:$src3)))),
+ (set _.FRC:$dst, (_.EltVT (OpNodeRnd _.FRC:$src2, _.FRC:$src1,
+ _.FRC:$src3, (i32 imm:$rc)))), 0>;
defm NAME#231#SUFF#Z: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix, _,
- (_.VT (OpNodes3 _.RC:$src2, _.RC:$src3, _.RC:$src1)),
- (_.VT (OpNodes3 _.RC:$src2, _.ScalarIntMemCPat:$src3,
- _.RC:$src1)),
- (_.VT ( OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1,
- (i32 imm:$rc))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src3,
_.FRC:$src1))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2,
- (_.ScalarLdFrag addr:$src3), _.FRC:$src1))), 1>;
+ (_.ScalarLdFrag addr:$src3), _.FRC:$src1))),
+ (set _.FRC:$dst, (_.EltVT (OpNodeRnd _.FRC:$src2, _.FRC:$src3,
+ _.FRC:$src1, (i32 imm:$rc)))), 1>;
  // One pattern is in 312 order so that the load is in a different place from the
  // 213 and 231 patterns; this helps tablegen's duplicate pattern detection.
defm NAME#132#SUFF#Z: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix, _,
- (null_frag),
- (_.VT (OpNodes1 _.RC:$src1, _.ScalarIntMemCPat:$src3,
- _.RC:$src2)),
- (null_frag),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, _.FRC:$src3,
_.FRC:$src2))),
(set _.FRC:$dst, (_.EltVT (OpNode (_.ScalarLdFrag addr:$src3),
- _.FRC:$src1, _.FRC:$src2))), 1>;
+ _.FRC:$src1, _.FRC:$src2))),
+ (set _.FRC:$dst, (_.EltVT (OpNodeRnd _.FRC:$src1, _.FRC:$src3,
+ _.FRC:$src2, (i32 imm:$rc)))), 1>;
}
}
multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132,
- string OpcodeStr, SDNode OpNode, SDNode OpNodes1,
- SDNode OpNodeRnds1, SDNode OpNodes3,
- SDNode OpNodeRnds3> {
+ string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd> {
let Predicates = [HasAVX512] in {
defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
- OpNodes1, OpNodeRnds1, OpNodes3, OpNodeRnds3,
- f32x_info, "SS">,
+ OpNodeRnd, f32x_info, "SS">,
EVEX_CD8<32, CD8VT1>, VEX_LIG;
defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
- OpNodes1, OpNodeRnds1, OpNodes3, OpNodeRnds3,
- f64x_info, "SD">,
+ OpNodeRnd, f64x_info, "SD">,
EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W;
}
}
-defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86Fmadds1,
- X86FmaddRnds1, X86Fmadds3, X86FmaddRnds3>;
-defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86Fmsubs1,
- X86FmsubRnds1, X86Fmsubs3, X86FmsubRnds3>;
-defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86Fnmadds1,
- X86FnmaddRnds1, X86Fnmadds3, X86FnmaddRnds3>;
-defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86Fnmsubs1,
- X86FnmsubRnds1, X86Fnmsubs3, X86FnmsubRnds3>;
+defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnd>;
+defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnd>;
+defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86FnmaddRnd>;
+defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86FnmsubRnd>;
+
+multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
+ string Suffix, SDNode Move,
+ X86VectorVTInfo _, PatLeaf ZeroFP> {
+ let Predicates = [HasAVX512] in {
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (Op _.FRC:$src2,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src3))))),
+ (!cast<I>(Prefix#"213"#Suffix#"Zr_Int")
+ VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (Op _.FRC:$src2, _.FRC:$src3,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
+ (!cast<I>(Prefix#"231"#Suffix#"Zr_Int")
+ VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (Op _.FRC:$src2,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src3)))))),
+ (!cast<I>(Prefix#"213"#Suffix#"Zm_Int")
+ VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ addr:$src3)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (Op (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src3), _.FRC:$src2))))),
+ (!cast<I>(Prefix#"132"#Suffix#"Zm_Int")
+ VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ addr:$src3)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (Op _.FRC:$src2, (_.ScalarLdFrag addr:$src3),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
+ (!cast<I>(Prefix#"231"#Suffix#"Zm_Int")
+ VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ addr:$src3)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op _.FRC:$src2,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src3),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
+ (!cast<I>(Prefix#"213"#Suffix#"Zr_Intk")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op _.FRC:$src2,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src3)),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
+ (!cast<I>(Prefix#"213"#Suffix#"Zm_Intk")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src3), _.FRC:$src2),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
+ (!cast<I>(Prefix#"132"#Suffix#"Zm_Intk")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op _.FRC:$src2, _.FRC:$src3,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
+ (!cast<I>(Prefix#"231"#Suffix#"Zr_Intk")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op _.FRC:$src2, (_.ScalarLdFrag addr:$src3),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
+ (!cast<I>(Prefix#"231"#Suffix#"Zm_Intk")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op _.FRC:$src2,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src3),
+ (_.EltVT ZeroFP)))))),
+ (!cast<I>(Prefix#"213"#Suffix#"Zr_Intkz")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op _.FRC:$src2, _.FRC:$src3,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
+ (_.EltVT ZeroFP)))))),
+ (!cast<I>(Prefix#"231"#Suffix#"Zr_Intkz")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op _.FRC:$src2,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src3)),
+ (_.EltVT ZeroFP)))))),
+ (!cast<I>(Prefix#"213"#Suffix#"Zm_Intkz")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src2, (_.ScalarLdFrag addr:$src3)),
+ (_.EltVT ZeroFP)))))),
+ (!cast<I>(Prefix#"132"#Suffix#"Zm_Intkz")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op _.FRC:$src2, (_.ScalarLdFrag addr:$src3),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
+ (_.EltVT ZeroFP)))))),
+ (!cast<I>(Prefix#"231"#Suffix#"Zm_Intkz")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
+
+ // Patterns with rounding mode.
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (RndOp _.FRC:$src2,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src3, (i32 imm:$rc)))))),
+ (!cast<I>(Prefix#"213"#Suffix#"Zrb_Int")
+ VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (RndOp _.FRC:$src2, _.FRC:$src3,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (i32 imm:$rc)))))),
+ (!cast<I>(Prefix#"231"#Suffix#"Zrb_Int")
+ VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (RndOp _.FRC:$src2,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src3, (i32 imm:$rc)),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
+ (!cast<I>(Prefix#"213"#Suffix#"Zrb_Intk")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (RndOp _.FRC:$src2, _.FRC:$src3,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (i32 imm:$rc)),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
+ (!cast<I>(Prefix#"231"#Suffix#"Zrb_Intk")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (RndOp _.FRC:$src2,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src3, (i32 imm:$rc)),
+ (_.EltVT ZeroFP)))))),
+ (!cast<I>(Prefix#"213"#Suffix#"Zrb_Intkz")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (RndOp _.FRC:$src2, _.FRC:$src3,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (i32 imm:$rc)),
+ (_.EltVT ZeroFP)))))),
+ (!cast<I>(Prefix#"231"#Suffix#"Zrb_Intkz")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>;
+ }
+}
+
+defm : avx512_scalar_fma_patterns<X86Fmadd, X86FmaddRnd, "VFMADD", "SS",
+ X86Movss, v4f32x_info, fp32imm0>;
+defm : avx512_scalar_fma_patterns<X86Fmsub, X86FmsubRnd, "VFMSUB", "SS",
+ X86Movss, v4f32x_info, fp32imm0>;
+defm : avx512_scalar_fma_patterns<X86Fnmadd, X86FnmaddRnd, "VFNMADD", "SS",
+ X86Movss, v4f32x_info, fp32imm0>;
+defm : avx512_scalar_fma_patterns<X86Fnmsub, X86FnmsubRnd, "VFNMSUB", "SS",
+ X86Movss, v4f32x_info, fp32imm0>;
+
+defm : avx512_scalar_fma_patterns<X86Fmadd, X86FmaddRnd, "VFMADD", "SD",
+ X86Movsd, v2f64x_info, fp64imm0>;
+defm : avx512_scalar_fma_patterns<X86Fmsub, X86FmsubRnd, "VFMSUB", "SD",
+ X86Movsd, v2f64x_info, fp64imm0>;
+defm : avx512_scalar_fma_patterns<X86Fnmadd, X86FnmaddRnd, "VFNMADD", "SD",
+ X86Movsd, v2f64x_info, fp64imm0>;
+defm : avx512_scalar_fma_patterns<X86Fnmsub, X86FnmsubRnd, "VFNMSUB", "SD",
+ X86Movsd, v2f64x_info, fp64imm0>;
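// Editorial note (not part of this patch): the avx512_scalar_fma_patterns
// instantiations above are understood to match scalar FMA nodes wrapped in an
// X86Movss/X86Movsd blend and select the *_Int instruction forms, so only the
// low element is computed while the upper elements of $src1 pass through;
// masked, zero-masked and rounding-mode (Zrb_*) variants are covered as well.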
//===----------------------------------------------------------------------===//
// AVX-512 Packed Multiply of Unsigned 52-bit Integers and Add the Low 52-bit IFMA
//===----------------------------------------------------------------------===//
let Constraints = "$src1 = $dst" in {
multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
// NOTE: The SDNode has the multiply operands first, with the add last.
// This enables commuted load patterns to be autogenerated by tablegen.
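// (Editorial example, not in the patch: x86vpmadd52l $src2, $src3, $src1 is
// assumed to compute, per 64-bit lane, $src1 + lo52($src2[51:0] * $src3[51:0]),
// so the accumulator comes last, matching the patterns below.)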
let ExeDomain = _.ExeDomain in {
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), itins.rr, 1, 1>,
- AVX512FMA3Base, Sched<[itins.Sched]>;
+ (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>,
+ AVX512FMA3Base, Sched<[sched]>;
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)),
- itins.rm>, AVX512FMA3Base, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1))>,
+ AVX512FMA3Base, Sched<[sched.Folded, ReadAfterLd]>;
defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
@@ -6357,48 +7077,50 @@ multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(OpNode _.RC:$src2,
(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))),
- _.RC:$src1), itins.rm>,
- AVX512FMA3Base, EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ _.RC:$src1)>,
+ AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>;
}
}
} // Constraints = "$src1 = $dst"
multiclass avx512_pmadd52_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo _> {
+ X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> {
let Predicates = [HasIFMA] in {
- defm Z : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, itins, _.info512>,
+ defm Z : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, sched.ZMM, _.info512>,
EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
}
let Predicates = [HasVLX, HasIFMA] in {
- defm Z256 : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, itins, _.info256>,
+ defm Z256 : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, sched.YMM, _.info256>,
EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
- defm Z128 : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, itins, _.info128>,
+ defm Z128 : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, sched.XMM, _.info128>,
EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
}
}
defm VPMADD52LUQ : avx512_pmadd52_common<0xb4, "vpmadd52luq", x86vpmadd52l,
- SSE_PMADD, avx512vl_i64_info>, VEX_W;
+ SchedWriteVecIMul, avx512vl_i64_info>,
+ VEX_W;
defm VPMADD52HUQ : avx512_pmadd52_common<0xb5, "vpmadd52huq", x86vpmadd52h,
- SSE_PMADD, avx512vl_i64_info>, VEX_W;
+ SchedWriteVecIMul, avx512vl_i64_info>,
+ VEX_W;
//===----------------------------------------------------------------------===//
// AVX-512 Scalar convert from sign integer to float/double
//===----------------------------------------------------------------------===//
-multiclass avx512_vcvtsi<bits<8> opc, SDNode OpNode, OpndItins itins,
+multiclass avx512_vcvtsi<bits<8> opc, SDNode OpNode, X86FoldableSchedWrite sched,
RegisterClass SrcRC, X86VectorVTInfo DstVT,
X86MemOperand x86memop, PatFrag ld_frag, string asm> {
let hasSideEffects = 0 in {
def rr : SI<opc, MRMSrcReg, (outs DstVT.FRC:$dst),
(ins DstVT.FRC:$src1, SrcRC:$src),
- !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), [],
- itins.rr>, EVEX_4V, Sched<[itins.Sched]>;
+ !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+ EVEX_4V, Sched<[sched]>;
let mayLoad = 1 in
def rm : SI<opc, MRMSrcMem, (outs DstVT.FRC:$dst),
(ins DstVT.FRC:$src1, x86memop:$src),
- !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), [],
- itins.rm>, EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+ EVEX_4V, Sched<[sched.Folded, ReadAfterLd]>;
} // hasSideEffects = 0
let isCodeGenOnly = 1 in {
def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst),
@@ -6407,8 +7129,8 @@ multiclass avx512_vcvtsi<bits<8> opc, SDNode OpNode, OpndItins itins,
[(set DstVT.RC:$dst,
(OpNode (DstVT.VT DstVT.RC:$src1),
SrcRC:$src2,
- (i32 FROUND_CURRENT)))], itins.rr>,
- EVEX_4V, Sched<[itins.Sched]>;
+ (i32 FROUND_CURRENT)))]>,
+ EVEX_4V, Sched<[sched]>;
def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst),
(ins DstVT.RC:$src1, x86memop:$src2),
@@ -6416,13 +7138,14 @@ multiclass avx512_vcvtsi<bits<8> opc, SDNode OpNode, OpndItins itins,
[(set DstVT.RC:$dst,
(OpNode (DstVT.VT DstVT.RC:$src1),
(ld_frag addr:$src2),
- (i32 FROUND_CURRENT)))], itins.rm>,
- EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 FROUND_CURRENT)))]>,
+ EVEX_4V, Sched<[sched.Folded, ReadAfterLd]>;
}//isCodeGenOnly = 1
}
-multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode, OpndItins itins,
- RegisterClass SrcRC, X86VectorVTInfo DstVT, string asm> {
+multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode,
+ X86FoldableSchedWrite sched, RegisterClass SrcRC,
+ X86VectorVTInfo DstVT, string asm> {
def rrb_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst),
(ins DstVT.RC:$src1, SrcRC:$src2, AVX512RC:$rc),
!strconcat(asm,
@@ -6430,36 +7153,37 @@ multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode, OpndItins itins,
[(set DstVT.RC:$dst,
(OpNode (DstVT.VT DstVT.RC:$src1),
SrcRC:$src2,
- (i32 imm:$rc)))], itins.rr>,
- EVEX_4V, EVEX_B, EVEX_RC, Sched<[itins.Sched]>;
+ (i32 imm:$rc)))]>,
+ EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>;
}
-multiclass avx512_vcvtsi_common<bits<8> opc, SDNode OpNode, OpndItins itins,
- RegisterClass SrcRC, X86VectorVTInfo DstVT,
- X86MemOperand x86memop, PatFrag ld_frag, string asm> {
- defm NAME : avx512_vcvtsi_round<opc, OpNode, itins, SrcRC, DstVT, asm>,
- avx512_vcvtsi<opc, OpNode, itins, SrcRC, DstVT, x86memop,
+multiclass avx512_vcvtsi_common<bits<8> opc, SDNode OpNode,
+ X86FoldableSchedWrite sched,
+ RegisterClass SrcRC, X86VectorVTInfo DstVT,
+ X86MemOperand x86memop, PatFrag ld_frag, string asm> {
+ defm NAME : avx512_vcvtsi_round<opc, OpNode, sched, SrcRC, DstVT, asm>,
+ avx512_vcvtsi<opc, OpNode, sched, SrcRC, DstVT, x86memop,
ld_frag, asm>, VEX_LIG;
}
let Predicates = [HasAVX512] in {
-defm VCVTSI2SSZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, SSE_CVT_SI2SS, GR32,
+defm VCVTSI2SSZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, WriteCvtI2SS, GR32,
v4f32x_info, i32mem, loadi32, "cvtsi2ss{l}">,
XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTSI642SSZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, SSE_CVT_SI2SS, GR64,
+defm VCVTSI642SSZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, WriteCvtI2SS, GR64,
v4f32x_info, i64mem, loadi64, "cvtsi2ss{q}">,
XS, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VCVTSI2SDZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, SSE_CVT_SI2SD, GR32,
+defm VCVTSI2SDZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, WriteCvtI2SD, GR32,
v2f64x_info, i32mem, loadi32, "cvtsi2sd{l}">,
XD, EVEX_CD8<32, CD8VT1>;
-defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, SSE_CVT_SI2SD, GR64,
+defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, WriteCvtI2SD, GR64,
v2f64x_info, i64mem, loadi64, "cvtsi2sd{q}">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
- (VCVTSI2SSZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0>;
+ (VCVTSI2SSZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0, "att">;
def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
- (VCVTSI2SDZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0>;
+ (VCVTSI2SDZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0, "att">;
def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
(VCVTSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
@@ -6479,23 +7203,23 @@ def : Pat<(f64 (sint_to_fp GR32:$src)),
def : Pat<(f64 (sint_to_fp GR64:$src)),
(VCVTSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
-defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, SSE_CVT_SI2SS, GR32,
+defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, WriteCvtI2SS, GR32,
v4f32x_info, i32mem, loadi32,
"cvtusi2ss{l}">, XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, SSE_CVT_SI2SS, GR64,
+defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, WriteCvtI2SS, GR64,
v4f32x_info, i64mem, loadi64, "cvtusi2ss{q}">,
XS, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, X86UintToFpRnd, SSE_CVT_SI2SD, GR32, v2f64x_info,
+defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, X86UintToFpRnd, WriteCvtI2SD, GR32, v2f64x_info,
i32mem, loadi32, "cvtusi2sd{l}">,
XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
-defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, SSE_CVT_SI2SD, GR64,
+defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, WriteCvtI2SD, GR64,
v2f64x_info, i64mem, loadi64, "cvtusi2sd{q}">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
def : InstAlias<"vcvtusi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
- (VCVTUSI2SSZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0>;
+ (VCVTUSI2SSZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0, "att">;
def : InstAlias<"vcvtusi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
- (VCVTUSI2SDZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0>;
+ (VCVTUSI2SDZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0, "att">;
def : Pat<(f32 (uint_to_fp (loadi32 addr:$src))),
(VCVTUSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
@@ -6522,50 +7246,70 @@ def : Pat<(f64 (uint_to_fp GR64:$src)),
multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT,
X86VectorVTInfo DstVT, SDNode OpNode,
- OpndItins itins, string asm> {
+ X86FoldableSchedWrite sched, string asm,
+ string aliasStr,
+ bit CodeGenOnly = 1> {
let Predicates = [HasAVX512] in {
def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
- [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 FROUND_CURRENT)))],
- itins.rr>, EVEX, VEX_LIG, Sched<[itins.Sched]>;
+ [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 FROUND_CURRENT)))]>,
+ EVEX, VEX_LIG, Sched<[sched]>;
def rrb_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src, AVX512RC:$rc),
!strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"),
- [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 imm:$rc)))],
- itins.rr>, EVEX, VEX_LIG, EVEX_B, EVEX_RC,
- Sched<[itins.Sched]>;
+ [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 imm:$rc)))]>,
+ EVEX, VEX_LIG, EVEX_B, EVEX_RC,
+ Sched<[sched]>;
+ let isCodeGenOnly = CodeGenOnly, ForceDisassemble = CodeGenOnly in
def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.IntScalarMemOp:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set DstVT.RC:$dst, (OpNode
(SrcVT.VT SrcVT.ScalarIntMemCPat:$src),
- (i32 FROUND_CURRENT)))], itins.rm>,
- EVEX, VEX_LIG, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 FROUND_CURRENT)))]>,
+ EVEX, VEX_LIG, Sched<[sched.Folded, ReadAfterLd]>;
+
+ def : InstAlias<"v" # asm # aliasStr # "\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "rr_Int") DstVT.RC:$dst, SrcVT.RC:$src), 0, "att">;
+ def : InstAlias<"v" # asm # aliasStr # "\t{$rc, $src, $dst|$dst, $src, $rc}",
+ (!cast<Instruction>(NAME # "rrb_Int") DstVT.RC:$dst, SrcVT.RC:$src, AVX512RC:$rc), 0, "att">;
+ } // Predicates = [HasAVX512]
+}
+
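+// Variant of avx512_cvt_s_int_round that also makes the memory form visible to
+// the assembler (isCodeGenOnly = 0) and gives it an AT&T-syntax alias; the
+// unsigned conversions below are instantiated through it.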
+multiclass avx512_cvt_s_int_round_aliases<bits<8> opc, X86VectorVTInfo SrcVT,
+ X86VectorVTInfo DstVT, SDNode OpNode,
+ X86FoldableSchedWrite sched, string asm,
+ string aliasStr> :
+ avx512_cvt_s_int_round<opc, SrcVT, DstVT, OpNode, sched, asm, aliasStr, 0> {
+ let Predicates = [HasAVX512] in {
+ def : InstAlias<"v" # asm # aliasStr # "\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "rm_Int") DstVT.RC:$dst,
+ SrcVT.IntScalarMemOp:$src), 0, "att">;
} // Predicates = [HasAVX512]
}
// Convert float/double to signed/unsigned int 32/64
defm VCVTSS2SIZ: avx512_cvt_s_int_round<0x2D, f32x_info, i32x_info,
- X86cvts2si, SSE_CVT_SS2SI_32, "cvtss2si">,
+ X86cvts2si, WriteCvtSS2I, "cvtss2si", "{l}">,
XS, EVEX_CD8<32, CD8VT1>;
defm VCVTSS2SI64Z: avx512_cvt_s_int_round<0x2D, f32x_info, i64x_info,
- X86cvts2si, SSE_CVT_SS2SI_64, "cvtss2si">,
+ X86cvts2si, WriteCvtSS2I, "cvtss2si", "{q}">,
XS, VEX_W, EVEX_CD8<32, CD8VT1>;
-defm VCVTSS2USIZ: avx512_cvt_s_int_round<0x79, f32x_info, i32x_info,
- X86cvts2usi, SSE_CVT_SS2SI_32, "cvtss2usi">,
+defm VCVTSS2USIZ: avx512_cvt_s_int_round_aliases<0x79, f32x_info, i32x_info,
+ X86cvts2usi, WriteCvtSS2I, "cvtss2usi", "{l}">,
XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTSS2USI64Z: avx512_cvt_s_int_round<0x79, f32x_info, i64x_info,
- X86cvts2usi, SSE_CVT_SS2SI_64, "cvtss2usi">,
+defm VCVTSS2USI64Z: avx512_cvt_s_int_round_aliases<0x79, f32x_info, i64x_info,
+ X86cvts2usi, WriteCvtSS2I, "cvtss2usi", "{q}">,
XS, VEX_W, EVEX_CD8<32, CD8VT1>;
defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, f64x_info, i32x_info,
- X86cvts2si, SSE_CVT_SD2SI, "cvtsd2si">,
+ X86cvts2si, WriteCvtSD2I, "cvtsd2si", "{l}">,
XD, EVEX_CD8<64, CD8VT1>;
defm VCVTSD2SI64Z: avx512_cvt_s_int_round<0x2D, f64x_info, i64x_info,
- X86cvts2si, SSE_CVT_SD2SI, "cvtsd2si">,
+ X86cvts2si, WriteCvtSD2I, "cvtsd2si", "{q}">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VCVTSD2USIZ: avx512_cvt_s_int_round<0x79, f64x_info, i32x_info,
- X86cvts2usi, SSE_CVT_SD2SI, "cvtsd2usi">,
+defm VCVTSD2USIZ: avx512_cvt_s_int_round_aliases<0x79, f64x_info, i32x_info,
+ X86cvts2usi, WriteCvtSD2I, "cvtsd2usi", "{l}">,
XD, EVEX_CD8<64, CD8VT1>;
-defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, f64x_info, i64x_info,
- X86cvts2usi, SSE_CVT_SD2SI, "cvtsd2usi">,
+defm VCVTSD2USI64Z: avx512_cvt_s_int_round_aliases<0x79, f64x_info, i64x_info,
+ X86cvts2usi, WriteCvtSD2I, "cvtsd2usi", "{q}">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
// The SSE version of these instructions are disabled for AVX512.
@@ -6589,29 +7333,6 @@ let Predicates = [HasAVX512] in {
(VCVTSD2SI64Zrm_Int sse_load_f64:$src)>;
} // HasAVX512
-let Predicates = [HasAVX512] in {
- def : Pat<(int_x86_sse_cvtsi2ss VR128X:$src1, GR32:$src2),
- (VCVTSI2SSZrr_Int VR128X:$src1, GR32:$src2)>;
- def : Pat<(int_x86_sse_cvtsi2ss VR128X:$src1, (loadi32 addr:$src2)),
- (VCVTSI2SSZrm_Int VR128X:$src1, addr:$src2)>;
- def : Pat<(int_x86_sse_cvtsi642ss VR128X:$src1, GR64:$src2),
- (VCVTSI642SSZrr_Int VR128X:$src1, GR64:$src2)>;
- def : Pat<(int_x86_sse_cvtsi642ss VR128X:$src1, (loadi64 addr:$src2)),
- (VCVTSI642SSZrm_Int VR128X:$src1, addr:$src2)>;
- def : Pat<(int_x86_sse2_cvtsi2sd VR128X:$src1, GR32:$src2),
- (VCVTSI2SDZrr_Int VR128X:$src1, GR32:$src2)>;
- def : Pat<(int_x86_sse2_cvtsi2sd VR128X:$src1, (loadi32 addr:$src2)),
- (VCVTSI2SDZrm_Int VR128X:$src1, addr:$src2)>;
- def : Pat<(int_x86_sse2_cvtsi642sd VR128X:$src1, GR64:$src2),
- (VCVTSI642SDZrr_Int VR128X:$src1, GR64:$src2)>;
- def : Pat<(int_x86_sse2_cvtsi642sd VR128X:$src1, (loadi64 addr:$src2)),
- (VCVTSI642SDZrm_Int VR128X:$src1, addr:$src2)>;
- def : Pat<(int_x86_avx512_cvtusi2sd VR128X:$src1, GR32:$src2),
- (VCVTUSI2SDZrr_Int VR128X:$src1, GR32:$src2)>;
- def : Pat<(int_x86_avx512_cvtusi2sd VR128X:$src1, (loadi32 addr:$src2)),
- (VCVTUSI2SDZrm_Int VR128X:$src1, addr:$src2)>;
-} // Predicates = [HasAVX512]
-
// Patterns used for matching vcvtsi2s{s,d} intrinsic sequences from clang
// which produce unnecessary vmovs{s,d} instructions
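 // (e.g. _mm_cvtsi32_ss(a, b) is emitted by clang as an insert of (sitofp b)
 // into a; without these patterns that would select to a cvtsi2ss into a fresh
 // register followed by a movss to merge the result)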
let Predicates = [HasAVX512] in {
@@ -6622,9 +7343,19 @@ def : Pat<(v4f32 (X86Movss
def : Pat<(v4f32 (X86Movss
(v4f32 VR128X:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))),
+ (VCVTSI642SSZrm_Int VR128X:$dst, addr:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128X:$dst),
(v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
(VCVTSI2SSZrr_Int VR128X:$dst, GR32:$src)>;
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128X:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))),
+ (VCVTSI2SSZrm_Int VR128X:$dst, addr:$src)>;
+
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128X:$dst),
(v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
@@ -6632,83 +7363,143 @@ def : Pat<(v2f64 (X86Movsd
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128X:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))),
+ (VCVTSI642SDZrm_Int VR128X:$dst, addr:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128X:$dst),
(v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
(VCVTSI2SDZrr_Int VR128X:$dst, GR32:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128X:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))),
+ (VCVTSI2SDZrm_Int VR128X:$dst, addr:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128X:$dst),
+ (v4f32 (scalar_to_vector (f32 (uint_to_fp GR64:$src)))))),
+ (VCVTUSI642SSZrr_Int VR128X:$dst, GR64:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128X:$dst),
+ (v4f32 (scalar_to_vector (f32 (uint_to_fp (loadi64 addr:$src))))))),
+ (VCVTUSI642SSZrm_Int VR128X:$dst, addr:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128X:$dst),
+ (v4f32 (scalar_to_vector (f32 (uint_to_fp GR32:$src)))))),
+ (VCVTUSI2SSZrr_Int VR128X:$dst, GR32:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128X:$dst),
+ (v4f32 (scalar_to_vector (f32 (uint_to_fp (loadi32 addr:$src))))))),
+ (VCVTUSI2SSZrm_Int VR128X:$dst, addr:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128X:$dst),
+ (v2f64 (scalar_to_vector (f64 (uint_to_fp GR64:$src)))))),
+ (VCVTUSI642SDZrr_Int VR128X:$dst, GR64:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128X:$dst),
+ (v2f64 (scalar_to_vector (f64 (uint_to_fp (loadi64 addr:$src))))))),
+ (VCVTUSI642SDZrm_Int VR128X:$dst, addr:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128X:$dst),
+ (v2f64 (scalar_to_vector (f64 (uint_to_fp GR32:$src)))))),
+ (VCVTUSI2SDZrr_Int VR128X:$dst, GR32:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128X:$dst),
+ (v2f64 (scalar_to_vector (f64 (uint_to_fp (loadi32 addr:$src))))))),
+ (VCVTUSI2SDZrm_Int VR128X:$dst, addr:$src)>;
} // Predicates = [HasAVX512]
// Convert float/double to signed/unsigned int 32/64 with truncation
multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC,
X86VectorVTInfo _DstRC, SDNode OpNode,
- SDNode OpNodeRnd, OpndItins itins, string aliasStr>{
+ SDNode OpNodeRnd, X86FoldableSchedWrite sched,
+ string aliasStr, bit CodeGenOnly = 1>{
let Predicates = [HasAVX512] in {
+ let isCodeGenOnly = 1 in {
def rr : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
- [(set _DstRC.RC:$dst, (OpNode _SrcRC.FRC:$src))], itins.rr>,
- EVEX, Sched<[itins.Sched]>;
- let hasSideEffects = 0 in
- def rrb : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src),
- !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"),
- [], itins.rr>, EVEX, EVEX_B, Sched<[itins.Sched]>;
+ [(set _DstRC.RC:$dst, (OpNode _SrcRC.FRC:$src))]>,
+ EVEX, Sched<[sched]>;
def rm : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.ScalarMemOp:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
- [(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))],
- itins.rm>, EVEX, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ [(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))]>,
+ EVEX, Sched<[sched.Folded, ReadAfterLd]>;
+ }
+
+ def rr_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src),
+ (i32 FROUND_CURRENT)))]>,
+ EVEX, VEX_LIG, Sched<[sched]>;
+ def rrb_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
+ !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"),
+ [(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src),
+ (i32 FROUND_NO_EXC)))]>,
+ EVEX,VEX_LIG , EVEX_B, Sched<[sched]>;
+ let isCodeGenOnly = CodeGenOnly, ForceDisassemble = CodeGenOnly in
+ def rm_Int : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst),
+ (ins _SrcRC.IntScalarMemOp:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set _DstRC.RC:$dst, (OpNodeRnd
+ (_SrcRC.VT _SrcRC.ScalarIntMemCPat:$src),
+ (i32 FROUND_CURRENT)))]>,
+ EVEX, VEX_LIG, Sched<[sched.Folded, ReadAfterLd]>;
def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "rr") _DstRC.RC:$dst, _SrcRC.FRC:$src), 0>;
- def : InstAlias<asm # aliasStr # "\t\t{{sae}, $src, $dst|$dst, $src, {sae}}",
- (!cast<Instruction>(NAME # "rrb") _DstRC.RC:$dst, _SrcRC.FRC:$src), 0>;
- def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "rm") _DstRC.RC:$dst,
- _SrcRC.ScalarMemOp:$src), 0>;
-
- let isCodeGenOnly = 1 in {
- def rr_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
- !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
- [(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src),
- (i32 FROUND_CURRENT)))], itins.rr>,
- EVEX, VEX_LIG, Sched<[itins.Sched]>;
- def rrb_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
- !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"),
- [(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src),
- (i32 FROUND_NO_EXC)))], itins.rr>,
- EVEX,VEX_LIG , EVEX_B, Sched<[itins.Sched]>;
- let mayLoad = 1, hasSideEffects = 0 in
- def rm_Int : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst),
- (ins _SrcRC.IntScalarMemOp:$src),
- !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
- [], itins.rm>, EVEX, VEX_LIG,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
- } // isCodeGenOnly = 1
+ (!cast<Instruction>(NAME # "rr_Int") _DstRC.RC:$dst, _SrcRC.RC:$src), 0, "att">;
+ def : InstAlias<asm # aliasStr # "\t{{sae}, $src, $dst|$dst, $src, {sae}}",
+ (!cast<Instruction>(NAME # "rrb_Int") _DstRC.RC:$dst, _SrcRC.RC:$src), 0, "att">;
} //HasAVX512
}
+multiclass avx512_cvt_s_all_unsigned<bits<8> opc, string asm,
+ X86VectorVTInfo _SrcRC,
+ X86VectorVTInfo _DstRC, SDNode OpNode,
+ SDNode OpNodeRnd, X86FoldableSchedWrite sched,
+ string aliasStr> :
+ avx512_cvt_s_all<opc, asm, _SrcRC, _DstRC, OpNode, OpNodeRnd, sched,
+ aliasStr, 0> {
+let Predicates = [HasAVX512] in {
+ def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "rm_Int") _DstRC.RC:$dst,
+ _SrcRC.IntScalarMemOp:$src), 0, "att">;
+}
+}
defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i32x_info,
- fp_to_sint, X86cvtts2IntRnd, SSE_CVT_SS2SI_32, "{l}">,
+ fp_to_sint, X86cvtts2IntRnd, WriteCvtSS2I, "{l}">,
XS, EVEX_CD8<32, CD8VT1>;
defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i64x_info,
- fp_to_sint, X86cvtts2IntRnd, SSE_CVT_SS2SI_64, "{q}">,
+ fp_to_sint, X86cvtts2IntRnd, WriteCvtSS2I, "{q}">,
VEX_W, XS, EVEX_CD8<32, CD8VT1>;
defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i32x_info,
- fp_to_sint, X86cvtts2IntRnd, SSE_CVT_SD2SI, "{l}">,
+ fp_to_sint, X86cvtts2IntRnd, WriteCvtSD2I, "{l}">,
XD, EVEX_CD8<64, CD8VT1>;
defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i64x_info,
- fp_to_sint, X86cvtts2IntRnd, SSE_CVT_SD2SI, "{q}">,
+ fp_to_sint, X86cvtts2IntRnd, WriteCvtSD2I, "{q}">,
VEX_W, XD, EVEX_CD8<64, CD8VT1>;
-defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i32x_info,
- fp_to_uint, X86cvtts2UIntRnd, SSE_CVT_SS2SI_32, "{l}">,
+defm VCVTTSS2USIZ: avx512_cvt_s_all_unsigned<0x78, "vcvttss2usi", f32x_info, i32x_info,
+ fp_to_uint, X86cvtts2UIntRnd, WriteCvtSS2I, "{l}">,
XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i64x_info,
- fp_to_uint, X86cvtts2UIntRnd, SSE_CVT_SS2SI_64, "{q}">,
+defm VCVTTSS2USI64Z: avx512_cvt_s_all_unsigned<0x78, "vcvttss2usi", f32x_info, i64x_info,
+ fp_to_uint, X86cvtts2UIntRnd, WriteCvtSS2I, "{q}">,
XS,VEX_W, EVEX_CD8<32, CD8VT1>;
-defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i32x_info,
- fp_to_uint, X86cvtts2UIntRnd, SSE_CVT_SD2SI, "{l}">,
+defm VCVTTSD2USIZ: avx512_cvt_s_all_unsigned<0x78, "vcvttsd2usi", f64x_info, i32x_info,
+ fp_to_uint, X86cvtts2UIntRnd, WriteCvtSD2I, "{l}">,
XD, EVEX_CD8<64, CD8VT1>;
-defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i64x_info,
- fp_to_uint, X86cvtts2UIntRnd, SSE_CVT_SD2SI, "{q}">,
+defm VCVTTSD2USI64Z: avx512_cvt_s_all_unsigned<0x78, "vcvttsd2usi", f64x_info, i64x_info,
+ fp_to_uint, X86cvtts2UIntRnd, WriteCvtSD2I, "{q}">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+
let Predicates = [HasAVX512] in {
def : Pat<(i32 (int_x86_sse_cvttss2si (v4f32 VR128X:$src))),
(VCVTTSS2SIZrr_Int VR128X:$src)>;
@@ -6733,93 +7524,94 @@ let Predicates = [HasAVX512] in {
//===----------------------------------------------------------------------===//
multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- X86VectorVTInfo _Src, SDNode OpNode, OpndItins itins> {
+ X86VectorVTInfo _Src, SDNode OpNode,
+ X86FoldableSchedWrite sched> {
defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _Src.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT _.RC:$src1),
(_Src.VT _Src.RC:$src2),
- (i32 FROUND_CURRENT))), itins.rr>,
- EVEX_4V, VEX_LIG, Sched<[itins.Sched]>;
+ (i32 FROUND_CURRENT)))>,
+ EVEX_4V, VEX_LIG, Sched<[sched]>;
defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _Src.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT _.RC:$src1),
(_Src.VT _Src.ScalarIntMemCPat:$src2),
- (i32 FROUND_CURRENT))), itins.rm>,
+ (i32 FROUND_CURRENT)))>,
EVEX_4V, VEX_LIG,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>;
let isCodeGenOnly = 1, hasSideEffects = 0 in {
def rr : I<opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _Src.FRC:$src2),
- OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
- itins.rr>, EVEX_4V, VEX_LIG, Sched<[itins.Sched]>;
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ EVEX_4V, VEX_LIG, Sched<[sched]>;
let mayLoad = 1 in
def rm : I<opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _Src.ScalarMemOp:$src2),
- OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
- itins.rm>, EVEX_4V, VEX_LIG,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ EVEX_4V, VEX_LIG, Sched<[sched.Folded, ReadAfterLd]>;
}
}
// Scalar Conversion with SAE - suppress all exceptions
multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- X86VectorVTInfo _Src, SDNode OpNodeRnd, OpndItins itins> {
+ X86VectorVTInfo _Src, SDNode OpNodeRnd,
+ X86FoldableSchedWrite sched> {
defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _Src.RC:$src2), OpcodeStr,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
(_.VT (OpNodeRnd (_.VT _.RC:$src1),
(_Src.VT _Src.RC:$src2),
- (i32 FROUND_NO_EXC))), itins.rr>,
- EVEX_4V, VEX_LIG, EVEX_B, Sched<[itins.Sched]>;
+ (i32 FROUND_NO_EXC)))>,
+ EVEX_4V, VEX_LIG, EVEX_B, Sched<[sched]>;
}
// Scalar Conversion with rounding control (RC)
multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- X86VectorVTInfo _Src, SDNode OpNodeRnd, OpndItins itins> {
+ X86VectorVTInfo _Src, SDNode OpNodeRnd,
+ X86FoldableSchedWrite sched> {
defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _Src.RC:$src2, AVX512RC:$rc), OpcodeStr,
"$rc, $src2, $src1", "$src1, $src2, $rc",
(_.VT (OpNodeRnd (_.VT _.RC:$src1),
- (_Src.VT _Src.RC:$src2), (i32 imm:$rc))),
- itins.rr>,
- EVEX_4V, VEX_LIG, Sched<[itins.Sched]>,
+ (_Src.VT _Src.RC:$src2), (i32 imm:$rc)))>,
+ EVEX_4V, VEX_LIG, Sched<[sched]>,
EVEX_B, EVEX_RC;
}
multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr,
- SDNode OpNodeRnd, OpndItins itins,
+ SDNode OpNodeRnd, X86FoldableSchedWrite sched,
X86VectorVTInfo _src, X86VectorVTInfo _dst> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, itins>,
+ defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, sched>,
avx512_cvt_fp_rc_scalar<opc, OpcodeStr, _dst, _src,
- OpNodeRnd, itins>, VEX_W, EVEX_CD8<64, CD8VT1>, XD;
+ OpNodeRnd, sched>, VEX_W, EVEX_CD8<64, CD8VT1>, XD;
}
}
-multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr,
- SDNode OpNodeRnd, OpndItins itins,
- X86VectorVTInfo _src, X86VectorVTInfo _dst> {
+multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd,
+ X86FoldableSchedWrite sched,
+ X86VectorVTInfo _src, X86VectorVTInfo _dst> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, itins>,
- avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, itins>,
+ defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, sched>,
+ avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, sched>,
EVEX_CD8<32, CD8VT1>, XS;
}
}
defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss",
- X86froundRnd, SSE_CVT_SD2SS, f64x_info,
- f32x_info>, NotMemoryFoldable;
+ X86froundRnd, WriteCvtSD2SS, f64x_info,
+ f32x_info>;
defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd",
- X86fpextRnd, SSE_CVT_SS2SD, f32x_info,
- f64x_info>, NotMemoryFoldable;
+ X86fpextRnd, WriteCvtSS2SD, f32x_info,
+ f64x_info>;
def : Pat<(f64 (fpextend FR32X:$src)),
(VCVTSS2SDZrr (f64 (IMPLICIT_DEF)), FR32X:$src)>,
Requires<[HasAVX512]>;
def : Pat<(f64 (fpextend (loadf32 addr:$src))),
(VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>,
- Requires<[HasAVX512]>;
+ Requires<[HasAVX512, OptForSize]>;
def : Pat<(f64 (extloadf32 addr:$src)),
(VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>,
@@ -6853,110 +7645,109 @@ def : Pat<(v2f64 (X86Movsd
//===----------------------------------------------------------------------===//
multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- X86VectorVTInfo _Src, SDNode OpNode, OpndItins itins,
- string Broadcast = _.BroadcastStr,
- string Alias = "", X86MemOperand MemOp = _Src.MemOp> {
+ X86VectorVTInfo _Src, SDNode OpNode,
+ X86FoldableSchedWrite sched,
+ string Broadcast = _.BroadcastStr,
+ string Alias = "", X86MemOperand MemOp = _Src.MemOp> {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _Src.RC:$src), OpcodeStr, "$src", "$src",
- (_.VT (OpNode (_Src.VT _Src.RC:$src))), itins.rr>,
- EVEX, Sched<[itins.Sched]>;
+ (_.VT (OpNode (_Src.VT _Src.RC:$src)))>,
+ EVEX, Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins MemOp:$src), OpcodeStr#Alias, "$src", "$src",
(_.VT (OpNode (_Src.VT
- (bitconvert (_Src.LdFrag addr:$src))))), itins.rm>,
- EVEX, Sched<[itins.Sched.Folded]>;
+ (bitconvert (_Src.LdFrag addr:$src)))))>,
+ EVEX, Sched<[sched.Folded]>;
defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _Src.ScalarMemOp:$src), OpcodeStr,
"${src}"##Broadcast, "${src}"##Broadcast,
(_.VT (OpNode (_Src.VT
(X86VBroadcast (_Src.ScalarLdFrag addr:$src)))
- )), itins.rm>, EVEX, EVEX_B,
- Sched<[itins.Sched.Folded]>;
+ ))>, EVEX, EVEX_B,
+ Sched<[sched.Folded]>;
}
// Conversion with SAE - suppress all exceptions
multiclass avx512_vcvt_fp_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86VectorVTInfo _Src, SDNode OpNodeRnd,
- OpndItins itins> {
+ X86FoldableSchedWrite sched> {
defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _Src.RC:$src), OpcodeStr,
"{sae}, $src", "$src, {sae}",
(_.VT (OpNodeRnd (_Src.VT _Src.RC:$src),
- (i32 FROUND_NO_EXC))), itins.rr>,
- EVEX, EVEX_B, Sched<[itins.Sched]>;
+ (i32 FROUND_NO_EXC)))>,
+ EVEX, EVEX_B, Sched<[sched]>;
}
// Conversion with rounding control (RC)
multiclass avx512_vcvt_fp_rc<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86VectorVTInfo _Src, SDNode OpNodeRnd,
- OpndItins itins> {
+ X86FoldableSchedWrite sched> {
defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _Src.RC:$src, AVX512RC:$rc), OpcodeStr,
"$rc, $src", "$src, $rc",
- (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), (i32 imm:$rc))),
- itins.rr>, EVEX, EVEX_B, EVEX_RC, Sched<[itins.Sched]>;
+ (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), (i32 imm:$rc)))>,
+ EVEX, EVEX_B, EVEX_RC, Sched<[sched]>;
}
// Extend Float to Double
multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr,
- OpndItins itins> {
+ X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8f32x_info,
- fpextend, itins>,
+ fpextend, sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8f64_info, v8f32x_info,
- X86vfpextRnd, itins>, EVEX_V512;
+ X86vfpextRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4f32x_info,
- X86vfpext, itins, "{1to2}", "", f64mem>, EVEX_V128;
+ X86vfpext, sched.XMM, "{1to2}", "", f64mem>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4f32x_info, fpextend,
- itins>, EVEX_V256;
+ sched.YMM>, EVEX_V256;
}
}
// Truncate Double to Float
-multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr, OpndItins itins> {
+multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, fpround, itins>,
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, fpround, sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8f64_info,
- X86vfproundRnd, itins>, EVEX_V512;
+ X86vfproundRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2f64x_info,
- X86vfpround, itins, "{1to2}", "{x}">, EVEX_V128;
+ X86vfpround, sched.XMM, "{1to2}", "{x}">, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, fpround,
- itins, "{1to4}", "{y}">, EVEX_V256;
+ sched.YMM, "{1to4}", "{y}">, EVEX_V256;
def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, f128mem:$src), 0>;
+ (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, f128mem:$src), 0, "intel">;
def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, f256mem:$src), 0>;
+ (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, f256mem:$src), 0, "intel">;
}
}
-defm VCVTPD2PS : avx512_cvtpd2ps<0x5A, "vcvtpd2ps", SSE_CVT_PD2PS>,
+defm VCVTPD2PS : avx512_cvtpd2ps<0x5A, "vcvtpd2ps", SchedWriteCvtPD2PS>,
VEX_W, PD, EVEX_CD8<64, CD8VF>;
-defm VCVTPS2PD : avx512_cvtps2pd<0x5A, "vcvtps2pd", SSE_CVT_PS2PD>,
+defm VCVTPS2PD : avx512_cvtps2pd<0x5A, "vcvtps2pd", SchedWriteCvtPS2PD>,
PS, EVEX_CD8<32, CD8VH>;
def : Pat<(v8f64 (extloadv8f32 addr:$src)),
(VCVTPS2PDZrm addr:$src)>;
let Predicates = [HasVLX] in {
- let AddedComplexity = 15 in {
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86vfpround (v2f64 VR128X:$src)))))),
- (VCVTPD2PSZ128rr VR128X:$src)>;
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86vfpround (loadv2f64 addr:$src)))))),
- (VCVTPD2PSZ128rm addr:$src)>;
- }
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86vfpround (v2f64 VR128X:$src)))))),
+ (VCVTPD2PSZ128rr VR128X:$src)>;
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86vfpround (loadv2f64 addr:$src)))))),
+ (VCVTPD2PSZ128rm addr:$src)>;
def : Pat<(v2f64 (extloadv2f32 addr:$src)),
(VCVTPS2PDZ128rm addr:$src)>;
def : Pat<(v4f64 (extloadv4f32 addr:$src)),
@@ -6965,80 +7756,79 @@ let Predicates = [HasVLX] in {
// Convert Signed/Unsigned Doubleword to Double
multiclass avx512_cvtdq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNode128, OpndItins itins> {
+ SDNode OpNode128, X86SchedWriteWidths sched> {
// No rounding in this op
let Predicates = [HasAVX512] in
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i32x_info, OpNode,
- itins>, EVEX_V512;
+ sched.ZMM>, EVEX_V512;
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4i32x_info,
- OpNode128, itins, "{1to2}", "", i64mem>, EVEX_V128;
+ OpNode128, sched.XMM, "{1to2}", "", i64mem>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i32x_info, OpNode,
- itins>, EVEX_V256;
+ sched.YMM>, EVEX_V256;
}
}
// Convert Signed/Unsigned Doubleword to Float
multiclass avx512_cvtdq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, OpndItins itins> {
+ SDNode OpNodeRnd, X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16f32_info, v16i32_info, OpNode,
- itins>,
+ sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v16f32_info, v16i32_info,
- OpNodeRnd, itins>, EVEX_V512;
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i32x_info, OpNode,
- itins>, EVEX_V128;
+ sched.XMM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i32x_info, OpNode,
- itins>, EVEX_V256;
+ sched.YMM>, EVEX_V256;
}
}
// Convert Float to Signed/Unsigned Doubleword with truncation
multiclass avx512_cvttps2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, OpndItins itins> {
+ SDNode OpNodeRnd, X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode,
- itins>,
+ sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v16i32_info, v16f32_info,
- OpNodeRnd, itins>, EVEX_V512;
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode,
- itins>, EVEX_V128;
+ sched.XMM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode,
- itins>, EVEX_V256;
+ sched.YMM>, EVEX_V256;
}
}
// Convert Float to Signed/Unsigned Doubleword
multiclass avx512_cvtps2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, OpndItins itins> {
+ SDNode OpNodeRnd, X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode,
- itins>,
+ sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v16i32_info, v16f32_info,
- OpNodeRnd, itins>, EVEX_V512;
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode,
- itins>, EVEX_V128;
+ sched.XMM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode,
- itins>, EVEX_V256;
+ sched.YMM>, EVEX_V256;
}
}
// Convert Double to Signed/Unsigned Doubleword with truncation
multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNode128, SDNode OpNodeRnd,
- OpndItins itins> {
+ SDNode OpNodeRnd, X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode,
- itins>,
+ sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8i32x_info, v8f64_info,
- OpNodeRnd, itins>, EVEX_V512;
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasVLX] in {
// we need "x"/"y" suffixes in order to distinguish between 128 and 256
@@ -7046,29 +7836,29 @@ multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
// dest type - 'v4i32x_info'. We also specify the broadcast string explicitly
// due to the same reason.
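                           // (e.g. for vcvttpd2dq both the 128-bit and 256-bit forms write an xmm
                           // result, so a memory source needs the explicit x/y suffix to fix its width)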
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info,
- OpNode128, itins, "{1to2}", "{x}">, EVEX_V128;
+ OpNode, sched.XMM, "{1to2}", "{x}">, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
- itins, "{1to4}", "{y}">, EVEX_V256;
+ sched.YMM, "{1to4}", "{y}">, EVEX_V256;
def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, i128mem:$src), 0>;
+ (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, i128mem:$src), 0, "intel">;
def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, i256mem:$src), 0>;
+ (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, i256mem:$src), 0, "intel">;
}
}
// Convert Double to Signed/Unsigned Doubleword
multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, OpndItins itins> {
+ SDNode OpNodeRnd, X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode,
- itins>,
+ sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8i32x_info, v8f64_info,
- OpNodeRnd, itins>, EVEX_V512;
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasVLX] in {
// we need "x"/"y" suffixes in order to distinguish between 128 and 256
@@ -7076,118 +7866,118 @@ multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
// dest type - 'v4i32x_info'. We also specify the broadcast string explicitly
// due to the same reason.
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info, OpNode,
- itins, "{1to2}", "{x}">, EVEX_V128;
+ sched.XMM, "{1to2}", "{x}">, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
- itins, "{1to4}", "{y}">, EVEX_V256;
+ sched.YMM, "{1to4}", "{y}">, EVEX_V256;
def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, f128mem:$src), 0>;
+ (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, f128mem:$src), 0, "intel">;
def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, f256mem:$src), 0>;
+ (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, f256mem:$src), 0, "intel">;
}
}
// Convert Double to Signed/Unsigned Quadword
multiclass avx512_cvtpd2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, OpndItins itins> {
+ SDNode OpNodeRnd, X86SchedWriteWidths sched> {
let Predicates = [HasDQI] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode,
- itins>,
+ sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f64_info,
- OpNodeRnd,itins>, EVEX_V512;
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasDQI, HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode,
- itins>, EVEX_V128;
+ sched.XMM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode,
- itins>, EVEX_V256;
+ sched.YMM>, EVEX_V256;
}
}
// Convert Double to Signed/Unsigned Quadword with truncation
multiclass avx512_cvttpd2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, OpndItins itins> {
+ SDNode OpNodeRnd, X86SchedWriteWidths sched> {
let Predicates = [HasDQI] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode,
- itins>,
+ sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f64_info,
- OpNodeRnd, itins>, EVEX_V512;
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasDQI, HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode,
- itins>, EVEX_V128;
+ sched.XMM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode,
- itins>, EVEX_V256;
+ sched.YMM>, EVEX_V256;
}
}
// Convert Signed/Unsigned Quadword to Double
multiclass avx512_cvtqq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, OpndItins itins> {
+ SDNode OpNodeRnd, X86SchedWriteWidths sched> {
let Predicates = [HasDQI] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i64_info, OpNode,
- itins>,
+ sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8f64_info, v8i64_info,
- OpNodeRnd, itins>, EVEX_V512;
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasDQI, HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v2i64x_info, OpNode,
- itins>, EVEX_V128;
+ sched.XMM>, EVEX_V128, NotEVEX2VEXConvertible;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i64x_info, OpNode,
- itins>, EVEX_V256;
+ sched.YMM>, EVEX_V256, NotEVEX2VEXConvertible;
}
}
// Convert Float to Signed/Unsigned Quadword
multiclass avx512_cvtps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, OpndItins itins> {
+ SDNode OpNodeRnd, X86SchedWriteWidths sched> {
let Predicates = [HasDQI] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode,
- itins>,
+ sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f32x_info,
- OpNodeRnd, itins>, EVEX_V512;
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasDQI, HasVLX] in {
// Explicitly specified broadcast string, since we take only 2 elements
// from v4f32x_info source
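    // (the 128-bit form converts only the low two floats, so its memory operand
    // is 64 bits wide (f64mem) and the broadcast form is {1to2})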
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode,
- itins, "{1to2}", "", f64mem>, EVEX_V128;
+ sched.XMM, "{1to2}", "", f64mem>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode,
- itins>, EVEX_V256;
+ sched.YMM>, EVEX_V256;
}
}
// Convert Float to Signed/Unsigned Quadword with truncation
multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNode128, SDNode OpNodeRnd, OpndItins itins> {
+ SDNode OpNodeRnd, X86SchedWriteWidths sched> {
let Predicates = [HasDQI] in {
- defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode,
- itins>,
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode, sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f32x_info,
- OpNodeRnd, itins>, EVEX_V512;
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasDQI, HasVLX] in {
// Explicitly specified broadcast string, since we take only 2 elements
// from v4f32x_info source
- defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode128,
- itins, "{1to2}", "", f64mem>, EVEX_V128;
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode,
+ sched.XMM, "{1to2}", "", f64mem>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode,
- itins>, EVEX_V256;
+ sched.YMM>, EVEX_V256;
}
}
// Convert Signed/Unsigned Quadword to Float
multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNode128, SDNode OpNodeRnd, OpndItins itins> {
+ SDNode OpNode128, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
let Predicates = [HasDQI] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i64_info, OpNode,
- itins>,
+ sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8i64_info,
- OpNodeRnd, itins>, EVEX_V512;
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasDQI, HasVLX] in {
// we need "x"/"y" suffixes in order to distinguish between 128 and 256
@@ -7195,116 +7985,226 @@ multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
// dest type - 'v4i32x_info'. We also specify the broadcast string explicitly
// due to the same reason.
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2i64x_info, OpNode128,
- itins, "{1to2}", "{x}">, EVEX_V128;
+ sched.XMM, "{1to2}", "{x}">, EVEX_V128,
+ NotEVEX2VEXConvertible;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i64x_info, OpNode,
- itins, "{1to4}", "{y}">, EVEX_V256;
+ sched.YMM, "{1to4}", "{y}">, EVEX_V256,
+ NotEVEX2VEXConvertible;
def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, i128mem:$src), 0>;
+ (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, i128mem:$src), 0, "intel">;
def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, i256mem:$src), 0>;
+ (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, i256mem:$src), 0, "intel">;
}
}
defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", sint_to_fp, X86VSintToFP,
- SSE_CVT_I2PD>, XS, EVEX_CD8<32, CD8VH>;
+ SchedWriteCvtDQ2PD>, XS, EVEX_CD8<32, CD8VH>;
defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", sint_to_fp,
- X86VSintToFpRnd, SSE_CVT_I2PS>,
+ X86VSintToFpRnd, SchedWriteCvtDQ2PS>,
PS, EVEX_CD8<32, CD8VF>;
-defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", fp_to_sint,
- X86cvttp2siRnd, SSE_CVT_PS2I>,
+defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", X86cvttp2si,
+ X86cvttp2siRnd, SchedWriteCvtPS2DQ>,
XS, EVEX_CD8<32, CD8VF>;
-defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", fp_to_sint, X86cvttp2si,
- X86cvttp2siRnd, SSE_CVT_PD2I>,
+defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", X86cvttp2si,
+ X86cvttp2siRnd, SchedWriteCvtPD2DQ>,
PD, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", fp_to_uint,
- X86cvttp2uiRnd, SSE_CVT_PS2I>, PS,
+defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", X86cvttp2ui,
+ X86cvttp2uiRnd, SchedWriteCvtPS2DQ>, PS,
EVEX_CD8<32, CD8VF>;
-defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", fp_to_uint,
- X86cvttp2ui, X86cvttp2uiRnd, SSE_CVT_PD2I>,
+defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", X86cvttp2ui,
+ X86cvttp2uiRnd, SchedWriteCvtPD2DQ>,
PS, VEX_W, EVEX_CD8<64, CD8VF>;
defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", uint_to_fp,
- X86VUintToFP, SSE_CVT_I2PD>, XS,
+ X86VUintToFP, SchedWriteCvtDQ2PD>, XS,
EVEX_CD8<32, CD8VH>;
defm VCVTUDQ2PS : avx512_cvtdq2ps<0x7A, "vcvtudq2ps", uint_to_fp,
- X86VUintToFpRnd, SSE_CVT_I2PS>, XD,
+ X86VUintToFpRnd, SchedWriteCvtDQ2PS>, XD,
EVEX_CD8<32, CD8VF>;
defm VCVTPS2DQ : avx512_cvtps2dq<0x5B, "vcvtps2dq", X86cvtp2Int,
- X86cvtp2IntRnd, SSE_CVT_PS2I>, PD,
+ X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, PD,
EVEX_CD8<32, CD8VF>;
defm VCVTPD2DQ : avx512_cvtpd2dq<0xE6, "vcvtpd2dq", X86cvtp2Int,
- X86cvtp2IntRnd, SSE_CVT_PD2I>, XD,
+ X86cvtp2IntRnd, SchedWriteCvtPD2DQ>, XD,
VEX_W, EVEX_CD8<64, CD8VF>;
defm VCVTPS2UDQ : avx512_cvtps2dq<0x79, "vcvtps2udq", X86cvtp2UInt,
- X86cvtp2UIntRnd, SSE_CVT_PS2I>,
+ X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>,
PS, EVEX_CD8<32, CD8VF>;
defm VCVTPD2UDQ : avx512_cvtpd2dq<0x79, "vcvtpd2udq", X86cvtp2UInt,
- X86cvtp2UIntRnd, SSE_CVT_PD2I>, VEX_W,
+ X86cvtp2UIntRnd, SchedWriteCvtPD2DQ>, VEX_W,
PS, EVEX_CD8<64, CD8VF>;
defm VCVTPD2QQ : avx512_cvtpd2qq<0x7B, "vcvtpd2qq", X86cvtp2Int,
- X86cvtp2IntRnd, SSE_CVT_PD2I>, VEX_W,
+ X86cvtp2IntRnd, SchedWriteCvtPD2DQ>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
defm VCVTPS2QQ : avx512_cvtps2qq<0x7B, "vcvtps2qq", X86cvtp2Int,
- X86cvtp2IntRnd, SSE_CVT_PS2I>, PD,
+ X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, PD,
EVEX_CD8<32, CD8VH>;
defm VCVTPD2UQQ : avx512_cvtpd2qq<0x79, "vcvtpd2uqq", X86cvtp2UInt,
- X86cvtp2UIntRnd, SSE_CVT_PD2I>, VEX_W,
+ X86cvtp2UIntRnd, SchedWriteCvtPD2DQ>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtp2UInt,
- X86cvtp2UIntRnd, SSE_CVT_PS2I>, PD,
+ X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>, PD,
EVEX_CD8<32, CD8VH>;
-defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", fp_to_sint,
- X86cvttp2siRnd, SSE_CVT_PD2I>, VEX_W,
+defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", X86cvttp2si,
+ X86cvttp2siRnd, SchedWriteCvtPD2DQ>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
-defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", fp_to_sint, X86cvttp2si,
- X86cvttp2siRnd, SSE_CVT_PS2I>, PD,
+defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", X86cvttp2si,
+ X86cvttp2siRnd, SchedWriteCvtPS2DQ>, PD,
EVEX_CD8<32, CD8VH>;
-defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", fp_to_uint,
- X86cvttp2uiRnd, SSE_CVT_PD2I>, VEX_W,
+defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", X86cvttp2ui,
+ X86cvttp2uiRnd, SchedWriteCvtPD2DQ>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
-defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", fp_to_uint, X86cvttp2ui,
- X86cvttp2uiRnd, SSE_CVT_PS2I>, PD,
+defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", X86cvttp2ui,
+ X86cvttp2uiRnd, SchedWriteCvtPS2DQ>, PD,
EVEX_CD8<32, CD8VH>;
defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", sint_to_fp,
- X86VSintToFpRnd, SSE_CVT_I2PD>, VEX_W, XS,
+ X86VSintToFpRnd, SchedWriteCvtDQ2PD>, VEX_W, XS,
EVEX_CD8<64, CD8VF>;
defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", uint_to_fp,
- X86VUintToFpRnd, SSE_CVT_I2PD>, VEX_W, XS,
+ X86VUintToFpRnd, SchedWriteCvtDQ2PD>, VEX_W, XS,
EVEX_CD8<64, CD8VF>;
defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp, X86VSintToFP,
- X86VSintToFpRnd, SSE_CVT_I2PS>, VEX_W, PS,
+ X86VSintToFpRnd, SchedWriteCvtDQ2PS>, VEX_W, PS,
EVEX_CD8<64, CD8VF>;
defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp, X86VUintToFP,
- X86VUintToFpRnd, SSE_CVT_I2PS>, VEX_W, XD,
+ X86VUintToFpRnd, SchedWriteCvtDQ2PS>, VEX_W, XD,
EVEX_CD8<64, CD8VF>;
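+// Map the generic fp_to_sint/fp_to_uint nodes onto the truncating conversion
+// instructions above, which are instantiated with the target-specific
+// X86cvttp2si/X86cvttp2ui nodes.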
+let Predicates = [HasAVX512] in {
+ def : Pat<(v16i32 (fp_to_sint (v16f32 VR512:$src))),
+ (VCVTTPS2DQZrr VR512:$src)>;
+ def : Pat<(v16i32 (fp_to_sint (loadv16f32 addr:$src))),
+ (VCVTTPS2DQZrm addr:$src)>;
+
+ def : Pat<(v16i32 (fp_to_uint (v16f32 VR512:$src))),
+ (VCVTTPS2UDQZrr VR512:$src)>;
+ def : Pat<(v16i32 (fp_to_uint (loadv16f32 addr:$src))),
+ (VCVTTPS2UDQZrm addr:$src)>;
+
+ def : Pat<(v8i32 (fp_to_sint (v8f64 VR512:$src))),
+ (VCVTTPD2DQZrr VR512:$src)>;
+ def : Pat<(v8i32 (fp_to_sint (loadv8f64 addr:$src))),
+ (VCVTTPD2DQZrm addr:$src)>;
+
+ def : Pat<(v8i32 (fp_to_uint (v8f64 VR512:$src))),
+ (VCVTTPD2UDQZrr VR512:$src)>;
+ def : Pat<(v8i32 (fp_to_uint (loadv8f64 addr:$src))),
+ (VCVTTPD2UDQZrm addr:$src)>;
+}
+
+let Predicates = [HasVLX] in {
+ def : Pat<(v4i32 (fp_to_sint (v4f32 VR128X:$src))),
+ (VCVTTPS2DQZ128rr VR128X:$src)>;
+ def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))),
+ (VCVTTPS2DQZ128rm addr:$src)>;
+
+ def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src))),
+ (VCVTTPS2UDQZ128rr VR128X:$src)>;
+ def : Pat<(v4i32 (fp_to_uint (loadv4f32 addr:$src))),
+ (VCVTTPS2UDQZ128rm addr:$src)>;
+
+ def : Pat<(v8i32 (fp_to_sint (v8f32 VR256X:$src))),
+ (VCVTTPS2DQZ256rr VR256X:$src)>;
+ def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))),
+ (VCVTTPS2DQZ256rm addr:$src)>;
+
+ def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src))),
+ (VCVTTPS2UDQZ256rr VR256X:$src)>;
+ def : Pat<(v8i32 (fp_to_uint (loadv8f32 addr:$src))),
+ (VCVTTPS2UDQZ256rm addr:$src)>;
+
+ def : Pat<(v4i32 (fp_to_sint (v4f64 VR256X:$src))),
+ (VCVTTPD2DQZ256rr VR256X:$src)>;
+ def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))),
+ (VCVTTPD2DQZ256rm addr:$src)>;
+
+ def : Pat<(v4i32 (fp_to_uint (v4f64 VR256X:$src))),
+ (VCVTTPD2UDQZ256rr VR256X:$src)>;
+ def : Pat<(v4i32 (fp_to_uint (loadv4f64 addr:$src))),
+ (VCVTTPD2UDQZ256rm addr:$src)>;
+}
+
+let Predicates = [HasDQI] in {
+ def : Pat<(v8i64 (fp_to_sint (v8f32 VR256X:$src))),
+ (VCVTTPS2QQZrr VR256X:$src)>;
+ def : Pat<(v8i64 (fp_to_sint (loadv8f32 addr:$src))),
+ (VCVTTPS2QQZrm addr:$src)>;
+
+ def : Pat<(v8i64 (fp_to_uint (v8f32 VR256X:$src))),
+ (VCVTTPS2UQQZrr VR256X:$src)>;
+ def : Pat<(v8i64 (fp_to_uint (loadv8f32 addr:$src))),
+ (VCVTTPS2UQQZrm addr:$src)>;
+
+ def : Pat<(v8i64 (fp_to_sint (v8f64 VR512:$src))),
+ (VCVTTPD2QQZrr VR512:$src)>;
+ def : Pat<(v8i64 (fp_to_sint (loadv8f64 addr:$src))),
+ (VCVTTPD2QQZrm addr:$src)>;
+
+ def : Pat<(v8i64 (fp_to_uint (v8f64 VR512:$src))),
+ (VCVTTPD2UQQZrr VR512:$src)>;
+ def : Pat<(v8i64 (fp_to_uint (loadv8f64 addr:$src))),
+ (VCVTTPD2UQQZrm addr:$src)>;
+}
+
+let Predicates = [HasDQI, HasVLX] in {
+ def : Pat<(v4i64 (fp_to_sint (v4f32 VR128X:$src))),
+ (VCVTTPS2QQZ256rr VR128X:$src)>;
+ def : Pat<(v4i64 (fp_to_sint (loadv4f32 addr:$src))),
+ (VCVTTPS2QQZ256rm addr:$src)>;
+
+ def : Pat<(v4i64 (fp_to_uint (v4f32 VR128X:$src))),
+ (VCVTTPS2UQQZ256rr VR128X:$src)>;
+ def : Pat<(v4i64 (fp_to_uint (loadv4f32 addr:$src))),
+ (VCVTTPS2UQQZ256rm addr:$src)>;
+
+ def : Pat<(v2i64 (fp_to_sint (v2f64 VR128X:$src))),
+ (VCVTTPD2QQZ128rr VR128X:$src)>;
+ def : Pat<(v2i64 (fp_to_sint (loadv2f64 addr:$src))),
+ (VCVTTPD2QQZ128rm addr:$src)>;
+
+ def : Pat<(v2i64 (fp_to_uint (v2f64 VR128X:$src))),
+ (VCVTTPD2UQQZ128rr VR128X:$src)>;
+ def : Pat<(v2i64 (fp_to_uint (loadv2f64 addr:$src))),
+ (VCVTTPD2UQQZ128rm addr:$src)>;
+
+ def : Pat<(v4i64 (fp_to_sint (v4f64 VR256X:$src))),
+ (VCVTTPD2QQZ256rr VR256X:$src)>;
+ def : Pat<(v4i64 (fp_to_sint (loadv4f64 addr:$src))),
+ (VCVTTPD2QQZ256rm addr:$src)>;
+
+ def : Pat<(v4i64 (fp_to_uint (v4f64 VR256X:$src))),
+ (VCVTTPD2UQQZ256rr VR256X:$src)>;
+ def : Pat<(v4i64 (fp_to_uint (loadv4f64 addr:$src))),
+ (VCVTTPD2UQQZ256rm addr:$src)>;
+}
+
let Predicates = [HasAVX512, NoVLX] in {
def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))),
(EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr
@@ -7343,26 +8243,24 @@ def : Pat<(v2f64 (X86VUintToFP (v4i32 VR128X:$src1))),
}
let Predicates = [HasAVX512, HasVLX] in {
- let AddedComplexity = 15 in {
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvtp2Int (v2f64 VR128X:$src)))))),
- (VCVTPD2DQZ128rr VR128X:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvtp2Int (loadv2f64 addr:$src)))))),
- (VCVTPD2DQZ128rm addr:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvtp2UInt (v2f64 VR128X:$src)))))),
- (VCVTPD2UDQZ128rr VR128X:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvttp2si (v2f64 VR128X:$src)))))),
- (VCVTTPD2DQZ128rr VR128X:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))),
- (VCVTTPD2DQZ128rm addr:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvttp2ui (v2f64 VR128X:$src)))))),
- (VCVTTPD2UDQZ128rr VR128X:$src)>;
- }
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvtp2Int (v2f64 VR128X:$src)))))),
+ (VCVTPD2DQZ128rr VR128X:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvtp2Int (loadv2f64 addr:$src)))))),
+ (VCVTPD2DQZ128rm addr:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvtp2UInt (v2f64 VR128X:$src)))))),
+ (VCVTPD2UDQZ128rr VR128X:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvttp2si (v2f64 VR128X:$src)))))),
+ (VCVTTPD2DQZ128rr VR128X:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))),
+ (VCVTTPD2DQZ128rm addr:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvttp2ui (v2f64 VR128X:$src)))))),
+ (VCVTTPD2UDQZ128rr VR128X:$src)>;
def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(VCVTDQ2PDZ128rm addr:$src)>;
@@ -7383,14 +8281,12 @@ let Predicates = [HasAVX512] in {
}
let Predicates = [HasDQI, HasVLX] in {
- let AddedComplexity = 15 in {
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86VSintToFP (v2i64 VR128X:$src)))))),
- (VCVTQQ2PSZ128rr VR128X:$src)>;
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86VUintToFP (v2i64 VR128X:$src)))))),
- (VCVTUQQ2PSZ128rr VR128X:$src)>;
- }
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86VSintToFP (v2i64 VR128X:$src)))))),
+ (VCVTQQ2PSZ128rr VR128X:$src)>;
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86VUintToFP (v2i64 VR128X:$src)))))),
+ (VCVTUQQ2PSZ128rr VR128X:$src)>;
}
let Predicates = [HasDQI, NoVLX] in {
@@ -7461,41 +8357,41 @@ def : Pat<(v4f64 (uint_to_fp (v4i64 VR256X:$src1))),
multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src,
X86MemOperand x86memop, PatFrag ld_frag,
- OpndItins itins> {
+ X86FoldableSchedWrite sched> {
defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst),
(ins _src.RC:$src), "vcvtph2ps", "$src", "$src",
- (X86cvtph2ps (_src.VT _src.RC:$src)),itins.rr>,
- T8PD, Sched<[itins.Sched]>;
+ (X86cvtph2ps (_src.VT _src.RC:$src))>,
+ T8PD, Sched<[sched]>;
defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst),
(ins x86memop:$src), "vcvtph2ps", "$src", "$src",
(X86cvtph2ps (_src.VT
(bitconvert
- (ld_frag addr:$src)))), itins.rm>,
- T8PD, Sched<[itins.Sched.Folded]>;
+ (ld_frag addr:$src))))>,
+ T8PD, Sched<[sched.Folded]>;
}
multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src,
- OpndItins itins> {
+ X86FoldableSchedWrite sched> {
defm rrb : AVX512_maskable<0x13, MRMSrcReg, _dest, (outs _dest.RC:$dst),
(ins _src.RC:$src), "vcvtph2ps",
"{sae}, $src", "$src, {sae}",
(X86cvtph2psRnd (_src.VT _src.RC:$src),
- (i32 FROUND_NO_EXC)), itins.rr>,
- T8PD, EVEX_B, Sched<[itins.Sched]>;
+ (i32 FROUND_NO_EXC))>,
+ T8PD, EVEX_B, Sched<[sched]>;
}
let Predicates = [HasAVX512] in
defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, loadv4i64,
- SSE_CVT_PH2PS>,
- avx512_cvtph2ps_sae<v16f32_info, v16i16x_info, SSE_CVT_PH2PS>,
+ WriteCvtPH2PSZ>,
+ avx512_cvtph2ps_sae<v16f32_info, v16i16x_info, WriteCvtPH2PSZ>,
EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
let Predicates = [HasVLX] in {
defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem,
- loadv2i64, SSE_CVT_PH2PS>, EVEX, EVEX_V256,
+ loadv2i64, WriteCvtPH2PSY>, EVEX, EVEX_V256,
EVEX_CD8<32, CD8VH>;
defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem,
- loadv2i64, SSE_CVT_PH2PS>, EVEX, EVEX_V128,
+ loadv2i64, WriteCvtPH2PS>, EVEX, EVEX_V128,
EVEX_CD8<32, CD8VH>;
// Pattern match vcvtph2ps of a scalar i64 load.
@@ -7509,48 +8405,47 @@ let Predicates = [HasVLX] in {
}
multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src,
- X86MemOperand x86memop, OpndItins itins> {
+ X86MemOperand x86memop, SchedWrite RR, SchedWrite MR> {
defm rr : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst),
(ins _src.RC:$src1, i32u8imm:$src2),
"vcvtps2ph", "$src2, $src1", "$src1, $src2",
(X86cvtps2ph (_src.VT _src.RC:$src1),
- (i32 imm:$src2)),
- itins.rr, 0, 0>, AVX512AIi8Base, Sched<[itins.Sched]>;
+ (i32 imm:$src2)), 0, 0>,
+ AVX512AIi8Base, Sched<[RR]>;
let hasSideEffects = 0, mayStore = 1 in {
def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
(ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2),
- "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ Sched<[MR]>;
def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs),
(ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
- "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
- [], itins.rm>, EVEX_K, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", []>,
+ EVEX_K, Sched<[MR]>, NotMemoryFoldable;
}
}
multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src,
- OpndItins itins> {
+ SchedWrite Sched> {
let hasSideEffects = 0 in
defm rrb : AVX512_maskable_in_asm<0x1D, MRMDestReg, _dest,
(outs _dest.RC:$dst),
(ins _src.RC:$src1, i32u8imm:$src2),
- "vcvtps2ph", "$src2, {sae}, $src1", "$src1, {sae}, $src2",
- [], itins.rr>, EVEX_B, AVX512AIi8Base, Sched<[itins.Sched]>;
+ "vcvtps2ph", "$src2, {sae}, $src1", "$src1, {sae}, $src2", []>,
+ EVEX_B, AVX512AIi8Base, Sched<[Sched]>;
}
let Predicates = [HasAVX512] in {
defm VCVTPS2PHZ : avx512_cvtps2ph<v16i16x_info, v16f32_info, f256mem,
- SSE_CVT_PS2PH>,
- avx512_cvtps2ph_sae<v16i16x_info, v16f32_info,
- SSE_CVT_PS2PH>, EVEX, EVEX_V512,
- EVEX_CD8<32, CD8VH>;
+ WriteCvtPS2PHZ, WriteCvtPS2PHZSt>,
+ avx512_cvtps2ph_sae<v16i16x_info, v16f32_info, WriteCvtPS2PHZ>,
+ EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
let Predicates = [HasVLX] in {
defm VCVTPS2PHZ256 : avx512_cvtps2ph<v8i16x_info, v8f32x_info, f128mem,
- SSE_CVT_PS2PH>, EVEX, EVEX_V256,
- EVEX_CD8<32, CD8VH>;
+ WriteCvtPS2PHY, WriteCvtPS2PHYSt>,
+ EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f64mem,
- SSE_CVT_PS2PH>, EVEX, EVEX_V128,
- EVEX_CD8<32, CD8VH>;
+ WriteCvtPS2PH, WriteCvtPS2PHSt>,
+ EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
}
def : Pat<(store (f64 (extractelt
@@ -7575,431 +8470,430 @@ let Predicates = [HasVLX] in {
// more consistent with other instructions, which are always controlled by it.
// It's encoded as 0b100.
def : Pat<(fp_to_f16 FR32X:$src),
- (i16 (EXTRACT_SUBREG (VMOVPDI2DIZrr (VCVTPS2PHZ128rr
- (COPY_TO_REGCLASS FR32X:$src, VR128X), 4)), sub_16bit))>;
+ (i16 (EXTRACT_SUBREG (VMOVPDI2DIZrr (v8i16 (VCVTPS2PHZ128rr
+ (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)), 4))), sub_16bit))>;
def : Pat<(f16_to_fp GR16:$src),
- (f32 (COPY_TO_REGCLASS (VCVTPH2PSZ128rr
- (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128X)), FR32X)) >;
+ (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSZ128rr
+ (v8i16 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128X)))), FR32X)) >;
def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32X:$src))),
- (f32 (COPY_TO_REGCLASS (VCVTPH2PSZ128rr
- (VCVTPS2PHZ128rr (COPY_TO_REGCLASS FR32X:$src, VR128X), 4)), FR32X)) >;
+ (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSZ128rr
+ (v8i16 (VCVTPS2PHZ128rr
+ (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)), 4)))), FR32X)) >;
}
// Unordered/Ordered scalar fp compare with SAE and set EFLAGS
multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _,
- string OpcodeStr, OpndItins itins> {
+ string OpcodeStr, X86FoldableSchedWrite sched> {
let hasSideEffects = 0 in
def rrb: AVX512<opc, MRMSrcReg, (outs), (ins _.RC:$src1, _.RC:$src2),
- !strconcat(OpcodeStr, "\t{{sae}, $src2, $src1|$src1, $src2, {sae}}"),
- [], itins.rr>, EVEX, EVEX_B, VEX_LIG, EVEX_V128,
- Sched<[itins.Sched]>;
+ !strconcat(OpcodeStr, "\t{{sae}, $src2, $src1|$src1, $src2, {sae}}"), []>,
+ EVEX, EVEX_B, VEX_LIG, EVEX_V128, Sched<[sched]>;
}
let Defs = [EFLAGS], Predicates = [HasAVX512] in {
- defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, "vucomiss", SSE_COMIS>,
+ defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, "vucomiss", WriteFCom>,
AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>;
- defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, "vucomisd", SSE_COMIS>,
+ defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, "vucomisd", WriteFCom>,
AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>;
- defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, "vcomiss", SSE_COMIS>,
+ defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, "vcomiss", WriteFCom>,
AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>;
- defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, "vcomisd", SSE_COMIS>,
+ defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, "vcomisd", WriteFCom>,
AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>;
}
let Defs = [EFLAGS], Predicates = [HasAVX512] in {
defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86cmp, f32, f32mem, loadf32,
- "ucomiss", SSE_COMIS>, PS, EVEX, VEX_LIG,
+ "ucomiss", WriteFCom>, PS, EVEX, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
defm VUCOMISDZ : sse12_ord_cmp<0x2E, FR64X, X86cmp, f64, f64mem, loadf64,
- "ucomisd", SSE_COMIS>, PD, EVEX,
+ "ucomisd", WriteFCom>, PD, EVEX,
VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
let Pattern = []<dag> in {
defm VCOMISSZ : sse12_ord_cmp<0x2F, FR32X, undef, f32, f32mem, loadf32,
- "comiss", SSE_COMIS>, PS, EVEX, VEX_LIG,
+ "comiss", WriteFCom>, PS, EVEX, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
defm VCOMISDZ : sse12_ord_cmp<0x2F, FR64X, undef, f64, f64mem, loadf64,
- "comisd", SSE_COMIS>, PD, EVEX,
+ "comisd", WriteFCom>, PD, EVEX,
VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
}
let isCodeGenOnly = 1 in {
- defm Int_VUCOMISSZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v4f32, ssmem,
- sse_load_f32, "ucomiss", SSE_COMIS>, PS, EVEX, VEX_LIG,
- EVEX_CD8<32, CD8VT1>;
- defm Int_VUCOMISDZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v2f64, sdmem,
- sse_load_f64, "ucomisd", SSE_COMIS>, PD, EVEX,
- VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+ defm VUCOMISSZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v4f32, ssmem,
+ sse_load_f32, "ucomiss", WriteFCom>, PS, EVEX, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>;
+ defm VUCOMISDZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v2f64, sdmem,
+ sse_load_f64, "ucomisd", WriteFCom>, PD, EVEX,
+ VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
- defm Int_VCOMISSZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v4f32, ssmem,
- sse_load_f32, "comiss", SSE_COMIS>, PS, EVEX, VEX_LIG,
- EVEX_CD8<32, CD8VT1>;
- defm Int_VCOMISDZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v2f64, sdmem,
- sse_load_f64, "comisd", SSE_COMIS>, PD, EVEX,
- VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+ defm VCOMISSZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v4f32, ssmem,
+ sse_load_f32, "comiss", WriteFCom>, PS, EVEX, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>;
+ defm VCOMISDZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v2f64, sdmem,
+ sse_load_f64, "comisd", WriteFCom>, PD, EVEX,
+ VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
}
}
/// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd
multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let Predicates = [HasAVX512], ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)), itins.rr>,
- EVEX_4V, Sched<[itins.Sched]>;
+ (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>,
+ EVEX_4V, Sched<[sched]>;
defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1),
- _.ScalarIntMemCPat:$src2), itins.rm>, EVEX_4V,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ _.ScalarIntMemCPat:$src2)>, EVEX_4V,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
-defm VRCP14SS : avx512_fp14_s<0x4D, "vrcp14ss", X86rcp14s, SSE_RCPS, f32x_info>,
- EVEX_CD8<32, CD8VT1>, T8PD, NotMemoryFoldable;
-defm VRCP14SD : avx512_fp14_s<0x4D, "vrcp14sd", X86rcp14s, SSE_RCPS, f64x_info>,
- VEX_W, EVEX_CD8<64, CD8VT1>, T8PD, NotMemoryFoldable;
-defm VRSQRT14SS : avx512_fp14_s<0x4F, "vrsqrt14ss", X86rsqrt14s, SSE_RSQRTSS, f32x_info>,
- EVEX_CD8<32, CD8VT1>, T8PD, NotMemoryFoldable;
-defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", X86rsqrt14s, SSE_RSQRTSS, f64x_info>,
- VEX_W, EVEX_CD8<64, CD8VT1>, T8PD, NotMemoryFoldable;
+defm VRCP14SSZ : avx512_fp14_s<0x4D, "vrcp14ss", X86rcp14s, SchedWriteFRcp.Scl,
+ f32x_info>, EVEX_CD8<32, CD8VT1>,
+ T8PD;
+defm VRCP14SDZ : avx512_fp14_s<0x4D, "vrcp14sd", X86rcp14s, SchedWriteFRcp.Scl,
+ f64x_info>, VEX_W, EVEX_CD8<64, CD8VT1>,
+ T8PD;
+defm VRSQRT14SSZ : avx512_fp14_s<0x4F, "vrsqrt14ss", X86rsqrt14s,
+ SchedWriteFRsqrt.Scl, f32x_info>,
+ EVEX_CD8<32, CD8VT1>, T8PD;
+defm VRSQRT14SDZ : avx512_fp14_s<0x4F, "vrsqrt14sd", X86rsqrt14s,
+ SchedWriteFRsqrt.Scl, f64x_info>, VEX_W,
+ EVEX_CD8<64, CD8VT1>, T8PD;
/// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd
multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
- (_.FloatVT (OpNode _.RC:$src)), itins.rr>, EVEX, T8PD,
- Sched<[itins.Sched]>;
+ (_.VT (OpNode _.RC:$src))>, EVEX, T8PD,
+ Sched<[sched]>;
defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src), OpcodeStr, "$src", "$src",
- (OpNode (_.FloatVT
- (bitconvert (_.LdFrag addr:$src)))), itins.rm>, EVEX, T8PD,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (OpNode (_.VT
+ (bitconvert (_.LdFrag addr:$src))))>, EVEX, T8PD,
+ Sched<[sched.Folded, ReadAfterLd]>;
defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src), OpcodeStr,
"${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
- (OpNode (_.FloatVT
- (X86VBroadcast (_.ScalarLdFrag addr:$src)))), itins.rm>,
- EVEX, T8PD, EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (OpNode (_.VT
+ (X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
+ EVEX, T8PD, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>;
}
}
multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SizeItins itins> {
- defm PSZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), OpNode, itins.s,
+ X86SchedWriteWidths sched> {
+ defm PSZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), OpNode, sched.ZMM,
v16f32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>;
- defm PDZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), OpNode, itins.d,
+ defm PDZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), OpNode, sched.ZMM,
v8f64_info>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
// Define only if AVX512VL feature is present.
let Predicates = [HasVLX] in {
defm PSZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"),
- OpNode, itins.s, v4f32x_info>,
+ OpNode, sched.XMM, v4f32x_info>,
EVEX_V128, EVEX_CD8<32, CD8VF>;
defm PSZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"),
- OpNode, itins.s, v8f32x_info>,
+ OpNode, sched.YMM, v8f32x_info>,
EVEX_V256, EVEX_CD8<32, CD8VF>;
defm PDZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"),
- OpNode, itins.d, v2f64x_info>,
+ OpNode, sched.XMM, v2f64x_info>,
EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>;
defm PDZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"),
- OpNode, itins.d, v4f64x_info>,
+ OpNode, sched.YMM, v4f64x_info>,
EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>;
}
}
-defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86rsqrt14, SSE_RSQRT_P>;
-defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86rcp14, SSE_RCP_P>;
+defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86rsqrt14, SchedWriteFRsqrt>;
+defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86rcp14, SchedWriteFRcp>;
/// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd
multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
- SDNode OpNode, OpndItins itins> {
+ SDNode OpNode, X86FoldableSchedWrite sched> {
let ExeDomain = _.ExeDomain in {
defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 FROUND_CURRENT)), itins.rr>,
- Sched<[itins.Sched]>;
+ (i32 FROUND_CURRENT))>,
+ Sched<[sched]>;
defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 FROUND_NO_EXC)), itins.rm>, EVEX_B,
- Sched<[itins.Sched]>;
+ (i32 FROUND_NO_EXC))>, EVEX_B,
+ Sched<[sched]>;
defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2,
- (i32 FROUND_CURRENT)), itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 FROUND_CURRENT))>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SizeItins itins> {
- defm SS : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode, itins.s>,
- EVEX_CD8<32, CD8VT1>;
- defm SD : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode, itins.d>,
- EVEX_CD8<64, CD8VT1>, VEX_W;
+ X86FoldableSchedWrite sched> {
+ defm SSZ : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode, sched>,
+ EVEX_CD8<32, CD8VT1>;
+ defm SDZ : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode, sched>,
+ EVEX_CD8<64, CD8VT1>, VEX_W;
}
let Predicates = [HasERI] in {
- defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s, SSE_RCP_S>,
- T8PD, EVEX_4V;
- defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s, SSE_RSQRT_S>,
+ defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s, SchedWriteFRcp.Scl>,
T8PD, EVEX_4V;
+ defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s,
+ SchedWriteFRsqrt.Scl>, T8PD, EVEX_4V;
}
-defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexpRnds, SSE_ALU_ITINS_S>,
- T8PD, EVEX_4V;
+defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexpRnds,
+ SchedWriteFRnd.Scl>, T8PD, EVEX_4V;
/// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd
multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- SDNode OpNode, OpndItins itins> {
+ SDNode OpNode, X86FoldableSchedWrite sched> {
let ExeDomain = _.ExeDomain in {
defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
- (OpNode (_.VT _.RC:$src), (i32 FROUND_CURRENT)),
- itins.rr>, Sched<[itins.Sched]>;
+ (OpNode (_.VT _.RC:$src), (i32 FROUND_CURRENT))>,
+ Sched<[sched]>;
defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src), OpcodeStr, "$src", "$src",
- (OpNode (_.FloatVT
+ (OpNode (_.VT
(bitconvert (_.LdFrag addr:$src))),
- (i32 FROUND_CURRENT)), itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 FROUND_CURRENT))>,
+ Sched<[sched.Folded, ReadAfterLd]>;
defm mb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src), OpcodeStr,
"${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
- (OpNode (_.FloatVT
+ (OpNode (_.VT
(X86VBroadcast (_.ScalarLdFrag addr:$src))),
- (i32 FROUND_CURRENT)), itins.rm>, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 FROUND_CURRENT))>, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
multiclass avx512_fp28_p_round<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- SDNode OpNode, OpndItins itins> {
+ SDNode OpNode, X86FoldableSchedWrite sched> {
let ExeDomain = _.ExeDomain in
defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr,
"{sae}, $src", "$src, {sae}",
- (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC)),
- itins.rr>, EVEX_B, Sched<[itins.Sched]>;
+ (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC))>,
+ EVEX_B, Sched<[sched]>;
}
multiclass avx512_eri<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SizeItins itins> {
- defm PS : avx512_fp28_p<opc, OpcodeStr#"ps", v16f32_info, OpNode, itins.s>,
- avx512_fp28_p_round<opc, OpcodeStr#"ps", v16f32_info, OpNode, itins.s>,
- T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
- defm PD : avx512_fp28_p<opc, OpcodeStr#"pd", v8f64_info, OpNode, itins.d>,
- avx512_fp28_p_round<opc, OpcodeStr#"pd", v8f64_info, OpNode, itins.d>,
- T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+ X86SchedWriteWidths sched> {
+ defm PSZ : avx512_fp28_p<opc, OpcodeStr#"ps", v16f32_info, OpNode, sched.ZMM>,
+ avx512_fp28_p_round<opc, OpcodeStr#"ps", v16f32_info, OpNode, sched.ZMM>,
+ T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_fp28_p<opc, OpcodeStr#"pd", v8f64_info, OpNode, sched.ZMM>,
+ avx512_fp28_p_round<opc, OpcodeStr#"pd", v8f64_info, OpNode, sched.ZMM>,
+ T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
}
multiclass avx512_fp_unaryop_packed<bits<8> opc, string OpcodeStr,
- SDNode OpNode, SizeItins itins> {
+ SDNode OpNode, X86SchedWriteWidths sched> {
// Define only if AVX512VL feature is present.
let Predicates = [HasVLX] in {
- defm PSZ128 : avx512_fp28_p<opc, OpcodeStr#"ps", v4f32x_info, OpNode, itins.s>,
+ defm PSZ128 : avx512_fp28_p<opc, OpcodeStr#"ps", v4f32x_info, OpNode, sched.XMM>,
EVEX_V128, T8PD, EVEX_CD8<32, CD8VF>;
- defm PSZ256 : avx512_fp28_p<opc, OpcodeStr#"ps", v8f32x_info, OpNode, itins.s>,
+ defm PSZ256 : avx512_fp28_p<opc, OpcodeStr#"ps", v8f32x_info, OpNode, sched.YMM>,
EVEX_V256, T8PD, EVEX_CD8<32, CD8VF>;
- defm PDZ128 : avx512_fp28_p<opc, OpcodeStr#"pd", v2f64x_info, OpNode, itins.d>,
+ defm PDZ128 : avx512_fp28_p<opc, OpcodeStr#"pd", v2f64x_info, OpNode, sched.XMM>,
EVEX_V128, VEX_W, T8PD, EVEX_CD8<64, CD8VF>;
- defm PDZ256 : avx512_fp28_p<opc, OpcodeStr#"pd", v4f64x_info, OpNode, itins.d>,
+ defm PDZ256 : avx512_fp28_p<opc, OpcodeStr#"pd", v4f64x_info, OpNode, sched.YMM>,
EVEX_V256, VEX_W, T8PD, EVEX_CD8<64, CD8VF>;
}
}
-let Predicates = [HasERI] in {
- defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28, SSE_RSQRT_P>, EVEX;
- defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28, SSE_RCP_P>, EVEX;
- defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2, SSE_ALU_ITINS_P>, EVEX;
+let Predicates = [HasERI] in {
+ defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28, SchedWriteFRsqrt>, EVEX;
+ defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28, SchedWriteFRcp>, EVEX;
+ defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2, SchedWriteFAdd>, EVEX;
}
-defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexpRnd, SSE_ALU_ITINS_P>,
+defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexpRnd, SchedWriteFRnd>,
avx512_fp_unaryop_packed<0x42, "vgetexp", X86fgetexpRnd,
- SSE_ALU_ITINS_P>, EVEX;
+ SchedWriteFRnd>, EVEX;
-multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr, OpndItins itins,
- X86VectorVTInfo _>{
+multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _>{
let ExeDomain = _.ExeDomain in
defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src, AVX512RC:$rc), OpcodeStr, "$rc, $src", "$src, $rc",
- (_.VT (X86fsqrtRnd _.RC:$src, (i32 imm:$rc))), itins.rr>,
- EVEX, EVEX_B, EVEX_RC, Sched<[itins.Sched]>;
+ (_.VT (X86fsqrtRnd _.RC:$src, (i32 imm:$rc)))>,
+ EVEX, EVEX_B, EVEX_RC, Sched<[sched]>;
}
-multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr, OpndItins itins,
- X86VectorVTInfo _>{
+multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _>{
let ExeDomain = _.ExeDomain in {
defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
- (_.FloatVT (fsqrt _.RC:$src)), itins.rr>, EVEX,
- Sched<[itins.Sched]>;
+ (_.VT (fsqrt _.RC:$src))>, EVEX,
+ Sched<[sched]>;
defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src), OpcodeStr, "$src", "$src",
- (fsqrt (_.FloatVT
- (bitconvert (_.LdFrag addr:$src)))), itins.rm>, EVEX,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (fsqrt (_.VT
+ (bitconvert (_.LdFrag addr:$src))))>, EVEX,
+ Sched<[sched.Folded, ReadAfterLd]>;
defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src), OpcodeStr,
"${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
- (fsqrt (_.FloatVT
- (X86VBroadcast (_.ScalarLdFrag addr:$src)))), itins.rm>,
- EVEX, EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (fsqrt (_.VT
+ (X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
+ EVEX, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>;
}
}
-multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr> {
- defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), SSE_SQRTPS, v16f32_info>,
+multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr,
+ X86SchedWriteSizes sched> {
+ defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"),
+ sched.PS.ZMM, v16f32_info>,
EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
- defm PDZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), SSE_SQRTPD, v8f64_info>,
+ defm PDZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"),
+ sched.PD.ZMM, v8f64_info>,
EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>;
// Define only if AVX512VL feature is present.
let Predicates = [HasVLX] in {
defm PSZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"),
- SSE_SQRTPS, v4f32x_info>,
+ sched.PS.XMM, v4f32x_info>,
EVEX_V128, PS, EVEX_CD8<32, CD8VF>;
defm PSZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"),
- SSE_SQRTPS, v8f32x_info>,
+ sched.PS.YMM, v8f32x_info>,
EVEX_V256, PS, EVEX_CD8<32, CD8VF>;
defm PDZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"),
- SSE_SQRTPD, v2f64x_info>,
+ sched.PD.XMM, v2f64x_info>,
EVEX_V128, VEX_W, PD, EVEX_CD8<64, CD8VF>;
defm PDZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"),
- SSE_SQRTPD, v4f64x_info>,
+ sched.PD.YMM, v4f64x_info>,
EVEX_V256, VEX_W, PD, EVEX_CD8<64, CD8VF>;
}
}
-multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr> {
- defm PSZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "ps"), SSE_SQRTPS,
- v16f32_info>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
- defm PDZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "pd"), SSE_SQRTPD,
- v8f64_info>, EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>;
+multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr,
+ X86SchedWriteSizes sched> {
+ defm PSZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "ps"),
+ sched.PS.ZMM, v16f32_info>,
+ EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "pd"),
+ sched.PD.ZMM, v8f64_info>,
+ EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>;
}
-multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, OpndItins itins,
- X86VectorVTInfo _, string SUFF, Intrinsic Intr> {
+multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, string Name> {
let ExeDomain = _.ExeDomain in {
- defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(X86fsqrtRnds (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (i32 FROUND_CURRENT)), itins.rr>,
- Sched<[itins.Sched]>;
- defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
- "$src2, $src1", "$src1, $src2",
- (X86fsqrtRnds (_.VT _.RC:$src1),
- _.ScalarIntMemCPat:$src2,
- (i32 FROUND_CURRENT)), itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
- defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (i32 FROUND_CURRENT))>,
+ Sched<[sched]>;
+ defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (X86fsqrtRnds (_.VT _.RC:$src1),
+ _.ScalarIntMemCPat:$src2,
+ (i32 FROUND_CURRENT))>,
+ Sched<[sched.Folded, ReadAfterLd]>;
+ defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
"$rc, $src2, $src1", "$src1, $src2, $rc",
(X86fsqrtRnds (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (i32 imm:$rc)), itins.rr>,
- EVEX_B, EVEX_RC, Sched<[itins.Sched]>;
-
- let isCodeGenOnly = 1, hasSideEffects = 0 in {
- def r : I<opc, MRMSrcReg, (outs _.FRC:$dst),
- (ins _.FRC:$src1, _.FRC:$src2),
- OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], itins.rr>,
- Sched<[itins.Sched]>;
- let mayLoad = 1 in
- def m : I<opc, MRMSrcMem, (outs _.FRC:$dst),
- (ins _.FRC:$src1, _.ScalarMemOp:$src2),
- OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
- }
+ (i32 imm:$rc))>,
+ EVEX_B, EVEX_RC, Sched<[sched]>;
+
+ let isCodeGenOnly = 1, hasSideEffects = 0, Predicates=[HasAVX512] in {
+ def r : I<opc, MRMSrcReg, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.FRC:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ Sched<[sched]>;
+ let mayLoad = 1 in
+ def m : I<opc, MRMSrcMem, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.ScalarMemOp:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ Sched<[sched.Folded, ReadAfterLd]>;
+ }
}
-let Predicates = [HasAVX512] in {
- def : Pat<(_.EltVT (fsqrt _.FRC:$src)),
- (!cast<Instruction>(NAME#SUFF#Zr)
- (_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>;
-
- def : Pat<(Intr VR128X:$src),
- (!cast<Instruction>(NAME#SUFF#Zr_Int) VR128X:$src,
- VR128X:$src)>;
-}
-
-let Predicates = [HasAVX512, OptForSize] in {
- def : Pat<(_.EltVT (fsqrt (load addr:$src))),
- (!cast<Instruction>(NAME#SUFF#Zm)
- (_.EltVT (IMPLICIT_DEF)), addr:$src)>;
-
- def : Pat<(Intr _.ScalarIntMemCPat:$src2),
- (!cast<Instruction>(NAME#SUFF#Zm_Int)
- (_.VT (IMPLICIT_DEF)), addr:$src2)>;
-}
+ let Predicates = [HasAVX512] in {
+ def : Pat<(_.EltVT (fsqrt _.FRC:$src)),
+ (!cast<Instruction>(Name#Zr)
+ (_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>;
+ }
+ let Predicates = [HasAVX512, OptForSize] in {
+ def : Pat<(_.EltVT (fsqrt (load addr:$src))),
+ (!cast<Instruction>(Name#Zm)
+ (_.EltVT (IMPLICIT_DEF)), addr:$src)>;
+ }
}
-multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr> {
- defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", SSE_SQRTPS, f32x_info, "SS",
- int_x86_sse_sqrt_ss>,
- EVEX_CD8<32, CD8VT1>, EVEX_4V, XS, NotMemoryFoldable;
- defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", SSE_SQRTPD, f64x_info, "SD",
- int_x86_sse2_sqrt_sd>,
- EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W,
- NotMemoryFoldable;
+multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr,
+ X86SchedWriteSizes sched> {
+ defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", sched.PS.Scl, f32x_info, NAME#"SS">,
+ EVEX_CD8<32, CD8VT1>, EVEX_4V, XS;
+ defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", sched.PD.Scl, f64x_info, NAME#"SD">,
+ EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W;
}
-defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt">,
- avx512_sqrt_packed_all_round<0x51, "vsqrt">;
+defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", SchedWriteFSqrtSizes>,
+ avx512_sqrt_packed_all_round<0x51, "vsqrt", SchedWriteFSqrtSizes>;
-defm VSQRT : avx512_sqrt_scalar_all<0x51, "vsqrt">, VEX_LIG;
+defm VSQRT : avx512_sqrt_scalar_all<0x51, "vsqrt", SchedWriteFSqrtSizes>, VEX_LIG;
multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
- OpndItins itins, X86VectorVTInfo _> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr,
"$src3, $src2, $src1", "$src1, $src2, $src3",
(_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 imm:$src3))), itins.rr>,
- Sched<[itins.Sched]>;
+ (i32 imm:$src3)))>,
+ Sched<[sched]>;
defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr,
"$src3, {sae}, $src2, $src1", "$src1, $src2, {sae}, $src3",
(_.VT (X86RndScalesRnd (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 imm:$src3), (i32 FROUND_NO_EXC))), itins.rr>, EVEX_B,
- Sched<[itins.Sched]>;
+ (i32 imm:$src3), (i32 FROUND_NO_EXC)))>, EVEX_B,
+ Sched<[sched]>;
defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2, i32u8imm:$src3),
OpcodeStr,
"$src3, $src2, $src1", "$src1, $src2, $src3",
(_.VT (X86RndScales _.RC:$src1,
- _.ScalarIntMemCPat:$src2, (i32 imm:$src3))), itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ _.ScalarIntMemCPat:$src2, (i32 imm:$src3)))>,
+ Sched<[sched.Folded, ReadAfterLd]>;
- let isCodeGenOnly = 1, hasSideEffects = 0 in {
+ let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [HasAVX512] in {
def r : I<opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2, i32u8imm:$src3),
OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- [], itins.rr>, Sched<[itins.Sched]>;
+ []>, Sched<[sched]>;
let mayLoad = 1 in
def m : I<opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- [], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ []>, Sched<[sched.Folded, ReadAfterLd]>;
}
}
@@ -8040,344 +8934,397 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
}
}
-defm VRNDSCALESS : avx512_rndscale_scalar<0x0A, "vrndscaless", SSE_ALU_F32S,
- f32x_info>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+defm VRNDSCALESSZ : avx512_rndscale_scalar<0x0A, "vrndscaless",
+ SchedWriteFRnd.Scl, f32x_info>,
+ AVX512AIi8Base, EVEX_4V,
+ EVEX_CD8<32, CD8VT1>;
+
+defm VRNDSCALESDZ : avx512_rndscale_scalar<0x0B, "vrndscalesd",
+ SchedWriteFRnd.Scl, f64x_info>,
+ VEX_W, AVX512AIi8Base, EVEX_4V,
+ EVEX_CD8<64, CD8VT1>;
+
+multiclass avx512_masked_scalar<SDNode OpNode, string OpcPrefix, SDNode Move,
+ dag Mask, X86VectorVTInfo _, PatLeaf ZeroFP,
+ dag OutMask, Predicate BasePredicate> {
+ let Predicates = [BasePredicate] in {
+ def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects Mask,
+ (OpNode (extractelt _.VT:$src2, (iPTR 0))),
+ (extractelt _.VT:$dst, (iPTR 0))))),
+ (!cast<Instruction>("V"#OpcPrefix#r_Intk)
+ _.VT:$dst, OutMask, _.VT:$src2, _.VT:$src1)>;
+
+ def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects Mask,
+ (OpNode (extractelt _.VT:$src2, (iPTR 0))),
+ ZeroFP))),
+ (!cast<Instruction>("V"#OpcPrefix#r_Intkz)
+ OutMask, _.VT:$src2, _.VT:$src1)>;
+ }
+}
+
+defm : avx512_masked_scalar<fsqrt, "SQRTSSZ", X86Movss,
+ (v1i1 (scalar_to_vector (i8 (trunc (i32 GR32:$mask))))), v4f32x_info,
+ fp32imm0, (COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>;
+defm : avx512_masked_scalar<fsqrt, "SQRTSDZ", X86Movsd,
+ (v1i1 (scalar_to_vector (i8 (trunc (i32 GR32:$mask))))), v2f64x_info,
+ fp64imm0, (COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>;
+
+multiclass avx512_masked_scalar_imm<SDNode OpNode, string OpcPrefix, SDNode Move,
+ X86VectorVTInfo _, PatLeaf ZeroFP,
+ bits<8> ImmV, Predicate BasePredicate> {
+ let Predicates = [BasePredicate] in {
+ def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects VK1WM:$mask,
+ (OpNode (extractelt _.VT:$src2, (iPTR 0))),
+ (extractelt _.VT:$dst, (iPTR 0))))),
+ (!cast<Instruction>("V"#OpcPrefix#Zr_Intk)
+ _.VT:$dst, VK1WM:$mask, _.VT:$src1, _.VT:$src2, (i32 ImmV))>;
+
+ def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects VK1WM:$mask,
+ (OpNode (extractelt _.VT:$src2, (iPTR 0))), ZeroFP))),
+ (!cast<Instruction>("V"#OpcPrefix#Zr_Intkz)
+ VK1WM:$mask, _.VT:$src1, _.VT:$src2, (i32 ImmV))>;
+ }
+}
+
+defm : avx512_masked_scalar_imm<ffloor, "RNDSCALESS", X86Movss,
+ v4f32x_info, fp32imm0, 0x01, HasAVX512>;
+defm : avx512_masked_scalar_imm<fceil, "RNDSCALESS", X86Movss,
+ v4f32x_info, fp32imm0, 0x02, HasAVX512>;
+defm : avx512_masked_scalar_imm<ffloor, "RNDSCALESD", X86Movsd,
+ v2f64x_info, fp64imm0, 0x01, HasAVX512>;
+defm : avx512_masked_scalar_imm<fceil, "RNDSCALESD", X86Movsd,
+ v2f64x_info, fp64imm0, 0x02, HasAVX512>;
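// For reference (ISA encoding sketch, not part of this patch): RNDSCALE's imm8
// uses imm[1:0] as the rounding mode when imm[2] is clear, so 0x01 rounds
// toward -inf (ffloor) and 0x02 rounds toward +inf (fceil); imm[7:4] is the
// number of fraction bits to keep, zero here for rounding to an integer.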
-defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", SSE_ALU_F64S,
- f64x_info>, VEX_W, AVX512AIi8Base, EVEX_4V,
- EVEX_CD8<64, CD8VT1>;
//-------------------------------------------------
// Integer truncate and extend operations
//-------------------------------------------------
-let Sched = WriteShuffle256 in
-def AVX512_EXTEND : OpndItins<
- IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI
->;
-
-let Sched = WriteShuffle256 in
-def AVX512_TRUNCATE : OpndItins<
- IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI
->;
-
multiclass avx512_trunc_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo SrcInfo,
+ X86FoldableSchedWrite sched, X86VectorVTInfo SrcInfo,
X86VectorVTInfo DestInfo, X86MemOperand x86memop> {
let ExeDomain = DestInfo.ExeDomain in
defm rr : AVX512_maskable<opc, MRMDestReg, DestInfo, (outs DestInfo.RC:$dst),
(ins SrcInfo.RC:$src1), OpcodeStr ,"$src1", "$src1",
- (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))),
- itins.rr>, EVEX, T8XS, Sched<[itins.Sched]>;
+ (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1)))>,
+ EVEX, T8XS, Sched<[sched]>;
- let mayStore = 1, mayLoad = 1, hasSideEffects = 0,
- ExeDomain = DestInfo.ExeDomain in {
+ let mayStore = 1, hasSideEffects = 0, ExeDomain = DestInfo.ExeDomain in {
def mr : AVX512XS8I<opc, MRMDestMem, (outs),
(ins x86memop:$dst, SrcInfo.RC:$src),
- OpcodeStr # "\t{$src, $dst|$dst, $src}",
- [], itins.rm>, EVEX, Sched<[itins.Sched.Folded]>;
+ OpcodeStr # "\t{$src, $dst|$dst, $src}", []>,
+ EVEX, Sched<[sched.Folded]>;
def mrk : AVX512XS8I<opc, MRMDestMem, (outs),
(ins x86memop:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src),
- OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
- [], itins.rm>, EVEX, EVEX_K, Sched<[itins.Sched.Folded]>;
- }//mayStore = 1, mayLoad = 1, hasSideEffects = 0
+ OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", []>,
+ EVEX, EVEX_K, Sched<[sched.Folded]>, NotMemoryFoldable;
+ }//mayStore = 1, hasSideEffects = 0
}
multiclass avx512_trunc_mr_lowering<X86VectorVTInfo SrcInfo,
X86VectorVTInfo DestInfo,
- PatFrag truncFrag, PatFrag mtruncFrag > {
+ PatFrag truncFrag, PatFrag mtruncFrag,
+ string Name> {
def : Pat<(truncFrag (SrcInfo.VT SrcInfo.RC:$src), addr:$dst),
- (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mr)
+ (!cast<Instruction>(Name#SrcInfo.ZSuffix##mr)
addr:$dst, SrcInfo.RC:$src)>;
def : Pat<(mtruncFrag addr:$dst, SrcInfo.KRCWM:$mask,
(SrcInfo.VT SrcInfo.RC:$src)),
- (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mrk)
+ (!cast<Instruction>(Name#SrcInfo.ZSuffix##mrk)
addr:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src)>;
}
-multiclass avx512_trunc<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128,
- X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ,
- X86MemOperand x86memopZ128, X86MemOperand x86memopZ256,
- X86MemOperand x86memopZ, PatFrag truncFrag, PatFrag mtruncFrag,
- Predicate prd = HasAVX512>{
+multiclass avx512_trunc<bits<8> opc, string OpcodeStr, SDNode OpNode128,
+ SDNode OpNode256, SDNode OpNode512, X86FoldableSchedWrite sched,
+ AVX512VLVectorVTInfo VTSrcInfo,
+ X86VectorVTInfo DestInfoZ128,
+ X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ,
+ X86MemOperand x86memopZ128, X86MemOperand x86memopZ256,
+ X86MemOperand x86memopZ, PatFrag truncFrag,
+ PatFrag mtruncFrag, Predicate prd = HasAVX512>{
let Predicates = [HasVLX, prd] in {
- defm Z128: avx512_trunc_common<opc, OpcodeStr, OpNode, itins,
+ defm Z128: avx512_trunc_common<opc, OpcodeStr, OpNode128, sched,
VTSrcInfo.info128, DestInfoZ128, x86memopZ128>,
avx512_trunc_mr_lowering<VTSrcInfo.info128, DestInfoZ128,
- truncFrag, mtruncFrag>, EVEX_V128;
+ truncFrag, mtruncFrag, NAME>, EVEX_V128;
- defm Z256: avx512_trunc_common<opc, OpcodeStr, OpNode, itins,
+ defm Z256: avx512_trunc_common<opc, OpcodeStr, OpNode256, sched,
VTSrcInfo.info256, DestInfoZ256, x86memopZ256>,
avx512_trunc_mr_lowering<VTSrcInfo.info256, DestInfoZ256,
- truncFrag, mtruncFrag>, EVEX_V256;
+ truncFrag, mtruncFrag, NAME>, EVEX_V256;
}
let Predicates = [prd] in
- defm Z: avx512_trunc_common<opc, OpcodeStr, OpNode, itins,
+ defm Z: avx512_trunc_common<opc, OpcodeStr, OpNode512, sched,
VTSrcInfo.info512, DestInfoZ, x86memopZ>,
avx512_trunc_mr_lowering<VTSrcInfo.info512, DestInfoZ,
- truncFrag, mtruncFrag>, EVEX_V512;
+ truncFrag, mtruncFrag, NAME>, EVEX_V512;
}
multiclass avx512_trunc_qb<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, PatFrag StoreNode,
- PatFrag MaskedStoreNode> {
- defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, itins, avx512vl_i64_info,
- v16i8x_info, v16i8x_info, v16i8x_info, i16mem, i32mem, i64mem,
- StoreNode, MaskedStoreNode>, EVEX_CD8<8, CD8VO>;
+ X86FoldableSchedWrite sched, PatFrag StoreNode,
+ PatFrag MaskedStoreNode, SDNode InVecNode = OpNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, InVecNode, InVecNode, InVecNode, sched,
+ avx512vl_i64_info, v16i8x_info, v16i8x_info,
+ v16i8x_info, i16mem, i32mem, i64mem, StoreNode,
+ MaskedStoreNode>, EVEX_CD8<8, CD8VO>;
}
multiclass avx512_trunc_qw<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, PatFrag StoreNode,
- PatFrag MaskedStoreNode> {
- defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, itins, avx512vl_i64_info,
- v8i16x_info, v8i16x_info, v8i16x_info, i32mem, i64mem, i128mem,
- StoreNode, MaskedStoreNode>, EVEX_CD8<16, CD8VQ>;
+ X86FoldableSchedWrite sched, PatFrag StoreNode,
+ PatFrag MaskedStoreNode, SDNode InVecNode = OpNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, InVecNode, InVecNode, OpNode, sched,
+ avx512vl_i64_info, v8i16x_info, v8i16x_info,
+ v8i16x_info, i32mem, i64mem, i128mem, StoreNode,
+ MaskedStoreNode>, EVEX_CD8<16, CD8VQ>;
}
multiclass avx512_trunc_qd<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, PatFrag StoreNode,
- PatFrag MaskedStoreNode> {
- defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, itins, avx512vl_i64_info,
- v4i32x_info, v4i32x_info, v8i32x_info, i64mem, i128mem, i256mem,
- StoreNode, MaskedStoreNode>, EVEX_CD8<32, CD8VH>;
+ X86FoldableSchedWrite sched, PatFrag StoreNode,
+ PatFrag MaskedStoreNode, SDNode InVecNode = OpNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, InVecNode, OpNode, OpNode, sched,
+ avx512vl_i64_info, v4i32x_info, v4i32x_info,
+ v8i32x_info, i64mem, i128mem, i256mem, StoreNode,
+ MaskedStoreNode>, EVEX_CD8<32, CD8VH>;
}
multiclass avx512_trunc_db<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, PatFrag StoreNode,
- PatFrag MaskedStoreNode> {
- defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, itins, avx512vl_i32_info,
- v16i8x_info, v16i8x_info, v16i8x_info, i32mem, i64mem, i128mem,
- StoreNode, MaskedStoreNode>, EVEX_CD8<8, CD8VQ>;
+ X86FoldableSchedWrite sched, PatFrag StoreNode,
+ PatFrag MaskedStoreNode, SDNode InVecNode = OpNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, InVecNode, InVecNode, OpNode, sched,
+ avx512vl_i32_info, v16i8x_info, v16i8x_info,
+ v16i8x_info, i32mem, i64mem, i128mem, StoreNode,
+ MaskedStoreNode>, EVEX_CD8<8, CD8VQ>;
}
multiclass avx512_trunc_dw<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, PatFrag StoreNode,
- PatFrag MaskedStoreNode> {
- defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, itins, avx512vl_i32_info,
- v8i16x_info, v8i16x_info, v16i16x_info, i64mem, i128mem, i256mem,
- StoreNode, MaskedStoreNode>, EVEX_CD8<16, CD8VH>;
+ X86FoldableSchedWrite sched, PatFrag StoreNode,
+ PatFrag MaskedStoreNode, SDNode InVecNode = OpNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, InVecNode, OpNode, OpNode, sched,
+ avx512vl_i32_info, v8i16x_info, v8i16x_info,
+ v16i16x_info, i64mem, i128mem, i256mem, StoreNode,
+ MaskedStoreNode>, EVEX_CD8<16, CD8VH>;
}
multiclass avx512_trunc_wb<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, PatFrag StoreNode,
- PatFrag MaskedStoreNode> {
- defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, itins, avx512vl_i16_info,
- v16i8x_info, v16i8x_info, v32i8x_info, i64mem, i128mem, i256mem,
- StoreNode, MaskedStoreNode, HasBWI>, EVEX_CD8<16, CD8VH>;
+ X86FoldableSchedWrite sched, PatFrag StoreNode,
+ PatFrag MaskedStoreNode, SDNode InVecNode = OpNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, InVecNode, OpNode, OpNode,
+ sched, avx512vl_i16_info, v16i8x_info, v16i8x_info,
+ v32i8x_info, i64mem, i128mem, i256mem, StoreNode,
+ MaskedStoreNode, HasBWI>, EVEX_CD8<16, CD8VH>;
}
-defm VPMOVQB : avx512_trunc_qb<0x32, "vpmovqb", X86vtrunc, AVX512_TRUNCATE,
- truncstorevi8, masked_truncstorevi8>;
-defm VPMOVSQB : avx512_trunc_qb<0x22, "vpmovsqb", X86vtruncs, AVX512_TRUNCATE,
+defm VPMOVQB : avx512_trunc_qb<0x32, "vpmovqb", trunc, WriteShuffle256,
+ truncstorevi8, masked_truncstorevi8, X86vtrunc>;
+defm VPMOVSQB : avx512_trunc_qb<0x22, "vpmovsqb", X86vtruncs, WriteShuffle256,
truncstore_s_vi8, masked_truncstore_s_vi8>;
-defm VPMOVUSQB : avx512_trunc_qb<0x12, "vpmovusqb", X86vtruncus, AVX512_TRUNCATE,
+defm VPMOVUSQB : avx512_trunc_qb<0x12, "vpmovusqb", X86vtruncus, WriteShuffle256,
truncstore_us_vi8, masked_truncstore_us_vi8>;
-defm VPMOVQW : avx512_trunc_qw<0x34, "vpmovqw", X86vtrunc, AVX512_TRUNCATE,
- truncstorevi16, masked_truncstorevi16>;
-defm VPMOVSQW : avx512_trunc_qw<0x24, "vpmovsqw", X86vtruncs, AVX512_TRUNCATE,
+defm VPMOVQW : avx512_trunc_qw<0x34, "vpmovqw", trunc, WriteShuffle256,
+ truncstorevi16, masked_truncstorevi16, X86vtrunc>;
+defm VPMOVSQW : avx512_trunc_qw<0x24, "vpmovsqw", X86vtruncs, WriteShuffle256,
truncstore_s_vi16, masked_truncstore_s_vi16>;
-defm VPMOVUSQW : avx512_trunc_qw<0x14, "vpmovusqw", X86vtruncus, AVX512_TRUNCATE,
+defm VPMOVUSQW : avx512_trunc_qw<0x14, "vpmovusqw", X86vtruncus, WriteShuffle256,
truncstore_us_vi16, masked_truncstore_us_vi16>;
-defm VPMOVQD : avx512_trunc_qd<0x35, "vpmovqd", X86vtrunc, AVX512_TRUNCATE,
- truncstorevi32, masked_truncstorevi32>;
-defm VPMOVSQD : avx512_trunc_qd<0x25, "vpmovsqd", X86vtruncs, AVX512_TRUNCATE,
+defm VPMOVQD : avx512_trunc_qd<0x35, "vpmovqd", trunc, WriteShuffle256,
+ truncstorevi32, masked_truncstorevi32, X86vtrunc>;
+defm VPMOVSQD : avx512_trunc_qd<0x25, "vpmovsqd", X86vtruncs, WriteShuffle256,
truncstore_s_vi32, masked_truncstore_s_vi32>;
-defm VPMOVUSQD : avx512_trunc_qd<0x15, "vpmovusqd", X86vtruncus, AVX512_TRUNCATE,
+defm VPMOVUSQD : avx512_trunc_qd<0x15, "vpmovusqd", X86vtruncus, WriteShuffle256,
truncstore_us_vi32, masked_truncstore_us_vi32>;
-defm VPMOVDB : avx512_trunc_db<0x31, "vpmovdb", X86vtrunc, AVX512_TRUNCATE,
- truncstorevi8, masked_truncstorevi8>;
-defm VPMOVSDB : avx512_trunc_db<0x21, "vpmovsdb", X86vtruncs, AVX512_TRUNCATE,
+defm VPMOVDB : avx512_trunc_db<0x31, "vpmovdb", trunc, WriteShuffle256,
+ truncstorevi8, masked_truncstorevi8, X86vtrunc>;
+defm VPMOVSDB : avx512_trunc_db<0x21, "vpmovsdb", X86vtruncs, WriteShuffle256,
truncstore_s_vi8, masked_truncstore_s_vi8>;
-defm VPMOVUSDB : avx512_trunc_db<0x11, "vpmovusdb", X86vtruncus, AVX512_TRUNCATE,
+defm VPMOVUSDB : avx512_trunc_db<0x11, "vpmovusdb", X86vtruncus, WriteShuffle256,
truncstore_us_vi8, masked_truncstore_us_vi8>;
-defm VPMOVDW : avx512_trunc_dw<0x33, "vpmovdw", X86vtrunc, AVX512_TRUNCATE,
- truncstorevi16, masked_truncstorevi16>;
-defm VPMOVSDW : avx512_trunc_dw<0x23, "vpmovsdw", X86vtruncs, AVX512_TRUNCATE,
+defm VPMOVDW : avx512_trunc_dw<0x33, "vpmovdw", trunc, WriteShuffle256,
+ truncstorevi16, masked_truncstorevi16, X86vtrunc>;
+defm VPMOVSDW : avx512_trunc_dw<0x23, "vpmovsdw", X86vtruncs, WriteShuffle256,
truncstore_s_vi16, masked_truncstore_s_vi16>;
-defm VPMOVUSDW : avx512_trunc_dw<0x13, "vpmovusdw", X86vtruncus, AVX512_TRUNCATE,
+defm VPMOVUSDW : avx512_trunc_dw<0x13, "vpmovusdw", X86vtruncus, WriteShuffle256,
truncstore_us_vi16, masked_truncstore_us_vi16>;
-defm VPMOVWB : avx512_trunc_wb<0x30, "vpmovwb", X86vtrunc, AVX512_TRUNCATE,
- truncstorevi8, masked_truncstorevi8>;
-defm VPMOVSWB : avx512_trunc_wb<0x20, "vpmovswb", X86vtruncs, AVX512_TRUNCATE,
+defm VPMOVWB : avx512_trunc_wb<0x30, "vpmovwb", trunc, WriteShuffle256,
+ truncstorevi8, masked_truncstorevi8, X86vtrunc>;
+defm VPMOVSWB : avx512_trunc_wb<0x20, "vpmovswb", X86vtruncs, WriteShuffle256,
truncstore_s_vi8, masked_truncstore_s_vi8>;
-defm VPMOVUSWB : avx512_trunc_wb<0x10, "vpmovuswb", X86vtruncus, AVX512_TRUNCATE,
+defm VPMOVUSWB : avx512_trunc_wb<0x10, "vpmovuswb", X86vtruncus, WriteShuffle256,
truncstore_us_vi8, masked_truncstore_us_vi8>;
let Predicates = [HasAVX512, NoVLX] in {
-def: Pat<(v8i16 (X86vtrunc (v8i32 VR256X:$src))),
+def: Pat<(v8i16 (trunc (v8i32 VR256X:$src))),
(v8i16 (EXTRACT_SUBREG
(v16i16 (VPMOVDWZrr (v16i32 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src, sub_ymm)))), sub_xmm))>;
-def: Pat<(v4i32 (X86vtrunc (v4i64 VR256X:$src))),
+def: Pat<(v4i32 (trunc (v4i64 VR256X:$src))),
(v4i32 (EXTRACT_SUBREG
(v8i32 (VPMOVQDZrr (v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src, sub_ymm)))), sub_xmm))>;
}
let Predicates = [HasBWI, NoVLX] in {
-def: Pat<(v16i8 (X86vtrunc (v16i16 VR256X:$src))),
+def: Pat<(v16i8 (trunc (v16i16 VR256X:$src))),
(v16i8 (EXTRACT_SUBREG (VPMOVWBZrr (v32i16 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src, sub_ymm))), sub_xmm))>;
}
-multiclass avx512_extend_common<bits<8> opc, string OpcodeStr, OpndItins itins,
+multiclass WriteShuffle256_common<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched,
X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo,
- X86MemOperand x86memop, PatFrag LdFrag, SDPatternOperator OpNode>{
+ X86MemOperand x86memop, PatFrag LdFrag, SDNode OpNode>{
let ExeDomain = DestInfo.ExeDomain in {
defm rr : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst),
(ins SrcInfo.RC:$src), OpcodeStr ,"$src", "$src",
- (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src))), itins.rr>,
- EVEX, Sched<[itins.Sched]>;
+ (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src)))>,
+ EVEX, Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
(ins x86memop:$src), OpcodeStr ,"$src", "$src",
- (DestInfo.VT (LdFrag addr:$src)), itins.rm>,
- EVEX, Sched<[itins.Sched.Folded]>;
+ (DestInfo.VT (LdFrag addr:$src))>,
+ EVEX, Sched<[sched.Folded]>;
}
}
-multiclass avx512_extend_BW<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy,
- OpndItins itins, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
+multiclass WriteShuffle256_BW<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode InVecNode, string ExtTy,
+ X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
let Predicates = [HasVLX, HasBWI] in {
- defm Z128: avx512_extend_common<opc, OpcodeStr, itins, v8i16x_info,
+ defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v8i16x_info,
v16i8x_info, i64mem, LdFrag, InVecNode>,
EVEX_CD8<8, CD8VH>, T8PD, EVEX_V128, VEX_WIG;
- defm Z256: avx512_extend_common<opc, OpcodeStr, itins, v16i16x_info,
+ defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v16i16x_info,
v16i8x_info, i128mem, LdFrag, OpNode>,
EVEX_CD8<8, CD8VH>, T8PD, EVEX_V256, VEX_WIG;
}
let Predicates = [HasBWI] in {
- defm Z : avx512_extend_common<opc, OpcodeStr, itins, v32i16_info,
+ defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v32i16_info,
v32i8x_info, i256mem, LdFrag, OpNode>,
EVEX_CD8<8, CD8VH>, T8PD, EVEX_V512, VEX_WIG;
}
}
-multiclass avx512_extend_BD<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy,
- OpndItins itins, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
+multiclass WriteShuffle256_BD<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode InVecNode, string ExtTy,
+ X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
let Predicates = [HasVLX, HasAVX512] in {
- defm Z128: avx512_extend_common<opc, OpcodeStr, itins, v4i32x_info,
+ defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v4i32x_info,
v16i8x_info, i32mem, LdFrag, InVecNode>,
EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V128, VEX_WIG;
- defm Z256: avx512_extend_common<opc, OpcodeStr, itins, v8i32x_info,
+ defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v8i32x_info,
v16i8x_info, i64mem, LdFrag, OpNode>,
EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V256, VEX_WIG;
}
let Predicates = [HasAVX512] in {
- defm Z : avx512_extend_common<opc, OpcodeStr, itins, v16i32_info,
+ defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v16i32_info,
v16i8x_info, i128mem, LdFrag, OpNode>,
EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V512, VEX_WIG;
}
}
-multiclass avx512_extend_BQ<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy,
- OpndItins itins, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
+multiclass WriteShuffle256_BQ<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode InVecNode, string ExtTy,
+ X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
let Predicates = [HasVLX, HasAVX512] in {
- defm Z128: avx512_extend_common<opc, OpcodeStr, itins, v2i64x_info,
+ defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v2i64x_info,
v16i8x_info, i16mem, LdFrag, InVecNode>,
EVEX_CD8<8, CD8VO>, T8PD, EVEX_V128, VEX_WIG;
- defm Z256: avx512_extend_common<opc, OpcodeStr, itins, v4i64x_info,
+ defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v4i64x_info,
v16i8x_info, i32mem, LdFrag, OpNode>,
EVEX_CD8<8, CD8VO>, T8PD, EVEX_V256, VEX_WIG;
}
let Predicates = [HasAVX512] in {
- defm Z : avx512_extend_common<opc, OpcodeStr, itins, v8i64_info,
+ defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v8i64_info,
v16i8x_info, i64mem, LdFrag, OpNode>,
EVEX_CD8<8, CD8VO>, T8PD, EVEX_V512, VEX_WIG;
}
}
-multiclass avx512_extend_WD<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy,
- OpndItins itins, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> {
+multiclass WriteShuffle256_WD<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode InVecNode, string ExtTy,
+ X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> {
let Predicates = [HasVLX, HasAVX512] in {
- defm Z128: avx512_extend_common<opc, OpcodeStr, itins, v4i32x_info,
+ defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v4i32x_info,
v8i16x_info, i64mem, LdFrag, InVecNode>,
EVEX_CD8<16, CD8VH>, T8PD, EVEX_V128, VEX_WIG;
- defm Z256: avx512_extend_common<opc, OpcodeStr, itins, v8i32x_info,
+ defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v8i32x_info,
v8i16x_info, i128mem, LdFrag, OpNode>,
EVEX_CD8<16, CD8VH>, T8PD, EVEX_V256, VEX_WIG;
}
let Predicates = [HasAVX512] in {
- defm Z : avx512_extend_common<opc, OpcodeStr, itins, v16i32_info,
+ defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v16i32_info,
v16i16x_info, i256mem, LdFrag, OpNode>,
EVEX_CD8<16, CD8VH>, T8PD, EVEX_V512, VEX_WIG;
}
}
-multiclass avx512_extend_WQ<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy,
- OpndItins itins, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> {
+multiclass WriteShuffle256_WQ<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode InVecNode, string ExtTy,
+ X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> {
let Predicates = [HasVLX, HasAVX512] in {
- defm Z128: avx512_extend_common<opc, OpcodeStr, itins, v2i64x_info,
+ defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v2i64x_info,
v8i16x_info, i32mem, LdFrag, InVecNode>,
EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V128, VEX_WIG;
- defm Z256: avx512_extend_common<opc, OpcodeStr, itins, v4i64x_info,
+ defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v4i64x_info,
v8i16x_info, i64mem, LdFrag, OpNode>,
EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V256, VEX_WIG;
}
let Predicates = [HasAVX512] in {
- defm Z : avx512_extend_common<opc, OpcodeStr, itins, v8i64_info,
+ defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v8i64_info,
v8i16x_info, i128mem, LdFrag, OpNode>,
EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V512, VEX_WIG;
}
}
-multiclass avx512_extend_DQ<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy,
- OpndItins itins, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi32")> {
+multiclass WriteShuffle256_DQ<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode InVecNode, string ExtTy,
+ X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi32")> {
let Predicates = [HasVLX, HasAVX512] in {
- defm Z128: avx512_extend_common<opc, OpcodeStr, itins, v2i64x_info,
+ defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v2i64x_info,
v4i32x_info, i64mem, LdFrag, InVecNode>,
EVEX_CD8<32, CD8VH>, T8PD, EVEX_V128;
- defm Z256: avx512_extend_common<opc, OpcodeStr, itins, v4i64x_info,
+ defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v4i64x_info,
v4i32x_info, i128mem, LdFrag, OpNode>,
EVEX_CD8<32, CD8VH>, T8PD, EVEX_V256;
}
let Predicates = [HasAVX512] in {
- defm Z : avx512_extend_common<opc, OpcodeStr, itins, v8i64_info,
+ defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v8i64_info,
v8i32x_info, i256mem, LdFrag, OpNode>,
EVEX_CD8<32, CD8VH>, T8PD, EVEX_V512;
}
}
-defm VPMOVZXBW : avx512_extend_BW<0x30, "vpmovzxbw", X86vzext, zext_invec, "z", AVX512_EXTEND>;
-defm VPMOVZXBD : avx512_extend_BD<0x31, "vpmovzxbd", X86vzext, zext_invec, "z", AVX512_EXTEND>;
-defm VPMOVZXBQ : avx512_extend_BQ<0x32, "vpmovzxbq", X86vzext, zext_invec, "z", AVX512_EXTEND>;
-defm VPMOVZXWD : avx512_extend_WD<0x33, "vpmovzxwd", X86vzext, zext_invec, "z", AVX512_EXTEND>;
-defm VPMOVZXWQ : avx512_extend_WQ<0x34, "vpmovzxwq", X86vzext, zext_invec, "z", AVX512_EXTEND>;
-defm VPMOVZXDQ : avx512_extend_DQ<0x35, "vpmovzxdq", X86vzext, zext_invec, "z", AVX512_EXTEND>;
+defm VPMOVZXBW : WriteShuffle256_BW<0x30, "vpmovzxbw", X86vzext, zext_invec, "z", WriteShuffle256>;
+defm VPMOVZXBD : WriteShuffle256_BD<0x31, "vpmovzxbd", X86vzext, zext_invec, "z", WriteShuffle256>;
+defm VPMOVZXBQ : WriteShuffle256_BQ<0x32, "vpmovzxbq", X86vzext, zext_invec, "z", WriteShuffle256>;
+defm VPMOVZXWD : WriteShuffle256_WD<0x33, "vpmovzxwd", X86vzext, zext_invec, "z", WriteShuffle256>;
+defm VPMOVZXWQ : WriteShuffle256_WQ<0x34, "vpmovzxwq", X86vzext, zext_invec, "z", WriteShuffle256>;
+defm VPMOVZXDQ : WriteShuffle256_DQ<0x35, "vpmovzxdq", X86vzext, zext_invec, "z", WriteShuffle256>;
-defm VPMOVSXBW: avx512_extend_BW<0x20, "vpmovsxbw", X86vsext, sext_invec, "s", AVX512_EXTEND>;
-defm VPMOVSXBD: avx512_extend_BD<0x21, "vpmovsxbd", X86vsext, sext_invec, "s", AVX512_EXTEND>;
-defm VPMOVSXBQ: avx512_extend_BQ<0x22, "vpmovsxbq", X86vsext, sext_invec, "s", AVX512_EXTEND>;
-defm VPMOVSXWD: avx512_extend_WD<0x23, "vpmovsxwd", X86vsext, sext_invec, "s", AVX512_EXTEND>;
-defm VPMOVSXWQ: avx512_extend_WQ<0x24, "vpmovsxwq", X86vsext, sext_invec, "s", AVX512_EXTEND>;
-defm VPMOVSXDQ: avx512_extend_DQ<0x25, "vpmovsxdq", X86vsext, sext_invec, "s", AVX512_EXTEND>;
+defm VPMOVSXBW: WriteShuffle256_BW<0x20, "vpmovsxbw", X86vsext, sext_invec, "s", WriteShuffle256>;
+defm VPMOVSXBD: WriteShuffle256_BD<0x21, "vpmovsxbd", X86vsext, sext_invec, "s", WriteShuffle256>;
+defm VPMOVSXBQ: WriteShuffle256_BQ<0x22, "vpmovsxbq", X86vsext, sext_invec, "s", WriteShuffle256>;
+defm VPMOVSXWD: WriteShuffle256_WD<0x23, "vpmovsxwd", X86vsext, sext_invec, "s", WriteShuffle256>;
+defm VPMOVSXWQ: WriteShuffle256_WQ<0x24, "vpmovsxwq", X86vsext, sext_invec, "s", WriteShuffle256>;
+defm VPMOVSXDQ: WriteShuffle256_DQ<0x25, "vpmovsxdq", X86vsext, sext_invec, "s", WriteShuffle256>;
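// Note: the 128-bit forms above use the *_invec (vector-in-register extend)
// variants because only the low lanes of the byte/word/dword source are
// consumed, while the wider forms use the plain extend nodes once every source
// element yields a result element.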
multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
- SDNode InVecOp, PatFrag ExtLoad16> {
+ SDNode InVecOp> {
// 128-bit patterns
let Predicates = [HasVLX, HasBWI] in {
def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
@@ -8401,7 +9348,7 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
def : Pat<(v4i32 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
- def : Pat<(v2i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))),
+ def : Pat<(v2i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
(!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))),
(!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
@@ -8517,8 +9464,8 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
}
}
-defm : AVX512_pmovx_patterns<"VPMOVSX", X86vsext, sext_invec, extloadi32i16>;
-defm : AVX512_pmovx_patterns<"VPMOVZX", X86vzext, zext_invec, loadi16_anyext>;
+defm : AVX512_pmovx_patterns<"VPMOVSX", X86vsext, sext_invec>;
+defm : AVX512_pmovx_patterns<"VPMOVZX", X86vzext, zext_invec>;
//===----------------------------------------------------------------------===//
// GATHER - SCATTER Operations
@@ -8542,7 +9489,7 @@ multiclass avx512_gather<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
multiclass avx512_gather_q_pd<bits<8> dopc, bits<8> qopc,
AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512,
- vy512mem, mgatherv8i32>, EVEX_V512, VEX_W;
+ vy512xmem, mgatherv8i32>, EVEX_V512, VEX_W;
defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info512,
vz512mem, mgatherv8i64>, EVEX_V512, VEX_W;
let Predicates = [HasVLX] in {
@@ -8561,7 +9508,7 @@ multiclass avx512_gather_d_ps<bits<8> dopc, bits<8> qopc,
AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512, vz512mem,
mgatherv16i32>, EVEX_V512;
- defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info256, vz256xmem,
+ defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info256, vz256mem,
mgatherv8i64>, EVEX_V512;
let Predicates = [HasVLX] in {
defm NAME##D##SUFF##Z256: avx512_gather<dopc, OpcodeStr##"d", _.info256,
@@ -8584,16 +9531,17 @@ defm VPGATHER : avx512_gather_q_pd<0x90, 0x91, avx512vl_i64_info, "vpgather", "Q
avx512_gather_d_ps<0x90, 0x91, avx512vl_i32_info, "vpgather", "D">;
multiclass avx512_scatter<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- X86MemOperand memop, PatFrag ScatterNode> {
+ X86MemOperand memop, PatFrag ScatterNode,
+ RegisterClass MaskRC = _.KRCWM> {
let mayStore = 1, Constraints = "$mask = $mask_wb", ExeDomain = _.ExeDomain in
- def mr : AVX5128I<opc, MRMDestMem, (outs _.KRCWM:$mask_wb),
- (ins memop:$dst, _.KRCWM:$mask, _.RC:$src),
+ def mr : AVX5128I<opc, MRMDestMem, (outs MaskRC:$mask_wb),
+ (ins memop:$dst, MaskRC:$mask, _.RC:$src),
!strconcat(OpcodeStr#_.Suffix,
"\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"),
- [(set _.KRCWM:$mask_wb, (ScatterNode (_.VT _.RC:$src),
- _.KRCWM:$mask, vectoraddr:$dst))]>,
+ [(set MaskRC:$mask_wb, (ScatterNode (_.VT _.RC:$src),
+ MaskRC:$mask, vectoraddr:$dst))]>,
EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>,
Sched<[WriteStore]>;
}
@@ -8601,7 +9549,7 @@ let mayStore = 1, Constraints = "$mask = $mask_wb", ExeDomain = _.ExeDomain in
multiclass avx512_scatter_q_pd<bits<8> dopc, bits<8> qopc,
AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512,
- vy512mem, mscatterv8i32>, EVEX_V512, VEX_W;
+ vy512xmem, mscatterv8i32>, EVEX_V512, VEX_W;
defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info512,
vz512mem, mscatterv8i64>, EVEX_V512, VEX_W;
let Predicates = [HasVLX] in {
@@ -8620,7 +9568,7 @@ multiclass avx512_scatter_d_ps<bits<8> dopc, bits<8> qopc,
AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512, vz512mem,
mscatterv16i32>, EVEX_V512;
- defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info256, vz256xmem,
+ defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info256, vz256mem,
mscatterv8i64>, EVEX_V512;
let Predicates = [HasVLX] in {
defm NAME##D##SUFF##Z256: avx512_scatter<dopc, OpcodeStr##"d", _.info256,
@@ -8630,7 +9578,8 @@ let Predicates = [HasVLX] in {
defm NAME##D##SUFF##Z128: avx512_scatter<dopc, OpcodeStr##"d", _.info128,
vx128xmem, mscatterv4i32>, EVEX_V128;
defm NAME##Q##SUFF##Z128: avx512_scatter<qopc, OpcodeStr##"q", _.info128,
- vx64xmem, mscatterv2i64>, EVEX_V128;
+ vx64xmem, mscatterv2i64, VK2WM>,
+ EVEX_V128;
}
}
@@ -8643,20 +9592,20 @@ defm VPSCATTER : avx512_scatter_q_pd<0xA0, 0xA1, avx512vl_i64_info, "vpscatter",
// prefetch
multiclass avx512_gather_scatter_prefetch<bits<8> opc, Format F, string OpcodeStr,
RegisterClass KRC, X86MemOperand memop> {
- let Predicates = [HasPFI], hasSideEffects = 1 in
+ let Predicates = [HasPFI], mayLoad = 1, mayStore = 1 in
def m : AVX5128I<opc, F, (outs), (ins KRC:$mask, memop:$src),
- !strconcat(OpcodeStr, "\t{$src {${mask}}|{${mask}}, $src}"),
- [], IIC_SSE_PREFETCH>, EVEX, EVEX_K, Sched<[WriteLoad]>;
+ !strconcat(OpcodeStr, "\t{$src {${mask}}|{${mask}}, $src}"), []>,
+ EVEX, EVEX_K, Sched<[WriteLoad]>;
}
defm VGATHERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dps",
VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
defm VGATHERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qps",
- VK8WM, vz256xmem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+ VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
defm VGATHERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dpd",
- VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+ VK8WM, vy512xmem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
defm VGATHERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qpd",
VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
@@ -8665,10 +9614,10 @@ defm VGATHERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dps",
VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
defm VGATHERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qps",
- VK8WM, vz256xmem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+ VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
defm VGATHERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dpd",
- VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+ VK8WM, vy512xmem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
defm VGATHERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qpd",
VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
@@ -8677,10 +9626,10 @@ defm VSCATTERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dps
VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
defm VSCATTERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qps",
- VK8WM, vz256xmem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+ VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
defm VSCATTERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dpd",
- VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+ VK8WM, vy512xmem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
defm VSCATTERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qpd",
VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
@@ -8689,10 +9638,10 @@ defm VSCATTERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dps
VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
defm VSCATTERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qps",
- VK8WM, vz256xmem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+ VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd",
- VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+ VK8WM, vy512xmem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd",
VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
@@ -8700,8 +9649,8 @@ defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd
multiclass cvt_by_vec_width<bits<8> opc, X86VectorVTInfo Vec, string OpcodeStr > {
def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src),
!strconcat(OpcodeStr##Vec.Suffix, "\t{$src, $dst|$dst, $src}"),
- [(set Vec.RC:$dst, (Vec.VT (X86vsext Vec.KRC:$src)))],
- IIC_SSE_MOV_S_RR>, EVEX, Sched<[WriteMove]>;
+ [(set Vec.RC:$dst, (Vec.VT (sext Vec.KRC:$src)))]>,
+ EVEX, Sched<[WriteMove]>; // TODO - WriteVecTrunc?
}
multiclass cvt_mask_by_elt_width<bits<8> opc, AVX512VLVectorVTInfo VTInfo,
@@ -8723,17 +9672,18 @@ defm VPMOVM2Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, "vpmovm2", HasDQI
multiclass convert_vector_to_mask_common<bits<8> opc, X86VectorVTInfo _, string OpcodeStr > {
def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set _.KRC:$dst, (X86cvt2mask (_.VT _.RC:$src)))],
- IIC_SSE_MOV_S_RR>, EVEX, Sched<[WriteMove]>;
+ [(set _.KRC:$dst, (X86pcmpgtm _.ImmAllZerosV, (_.VT _.RC:$src)))]>,
+ EVEX, Sched<[WriteMove]>;
}
// Use 512bit version to implement 128/256 bit in case NoVLX.
multiclass convert_vector_to_mask_lowering<X86VectorVTInfo ExtendInfo,
- X86VectorVTInfo _> {
+ X86VectorVTInfo _,
+ string Name> {
- def : Pat<(_.KVT (X86cvt2mask (_.VT _.RC:$src))),
+ def : Pat<(_.KVT (X86pcmpgtm _.ImmAllZerosV, (_.VT _.RC:$src))),
(_.KVT (COPY_TO_REGCLASS
- (!cast<Instruction>(NAME#"Zrr")
+ (!cast<Instruction>(Name#"Zrr")
(INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
_.RC:$src, _.SubRegIdx)),
_.KRC))>;
@@ -8752,8 +9702,8 @@ multiclass avx512_convert_vector_to_mask<bits<8> opc, string OpcodeStr,
EVEX_V128;
}
let Predicates = [prd, NoVLX] in {
- defm Z256_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info256>;
- defm Z128_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info128>;
+ defm Z256_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info256, NAME>;
+ defm Z128_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info128, NAME>;
}
}
@@ -8766,125 +9716,131 @@ defm VPMOVD2M : avx512_convert_vector_to_mask<0x39, "vpmovd2m",
defm VPMOVQ2M : avx512_convert_vector_to_mask<0x39, "vpmovq2m",
avx512vl_i64_info, HasDQI>, VEX_W;
+// Patterns for handling sext from a mask register to v16i8/v16i16 when DQI
+// is available, but BWI is not. We can't handle this in lowering because
+// a target independent DAG combine likes to combine sext and trunc.
+let Predicates = [HasDQI, NoBWI] in {
+ def : Pat<(v16i8 (sext (v16i1 VK16:$src))),
+ (VPMOVDBZrr (v16i32 (VPMOVM2DZrr VK16:$src)))>;
+ def : Pat<(v16i16 (sext (v16i1 VK16:$src))),
+ (VPMOVDWZrr (v16i32 (VPMOVM2DZrr VK16:$src)))>;
+}
+
//===----------------------------------------------------------------------===//
// AVX-512 - COMPRESS and EXPAND
//
-// FIXME: Is there a better scheduler itinerary for VPCOMPRESS/VPEXPAND?
-let Sched = WriteShuffle256 in {
-def AVX512_COMPRESS : OpndItins<
- IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
->;
-def AVX512_EXPAND : OpndItins<
- IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
->;
-}
-
multiclass compress_by_vec_width_common<bits<8> opc, X86VectorVTInfo _,
- string OpcodeStr, OpndItins itins> {
+ string OpcodeStr, X86FoldableSchedWrite sched> {
defm rr : AVX512_maskable<opc, MRMDestReg, _, (outs _.RC:$dst),
(ins _.RC:$src1), OpcodeStr, "$src1", "$src1",
- (_.VT (X86compress _.RC:$src1)), itins.rr>, AVX5128IBase,
- Sched<[itins.Sched]>;
+ (_.VT (X86compress _.RC:$src1))>, AVX5128IBase,
+ Sched<[sched]>;
let mayStore = 1, hasSideEffects = 0 in
def mr : AVX5128I<opc, MRMDestMem, (outs),
(ins _.MemOp:$dst, _.RC:$src),
OpcodeStr # "\t{$src, $dst|$dst, $src}",
[]>, EVEX_CD8<_.EltSize, CD8VT1>,
- Sched<[itins.Sched.Folded]>;
+ Sched<[sched.Folded]>;
def mrk : AVX5128I<opc, MRMDestMem, (outs),
(ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src),
OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
[]>,
EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>,
- Sched<[itins.Sched.Folded]>;
+ Sched<[sched.Folded]>;
}
-multiclass compress_by_vec_width_lowering<X86VectorVTInfo _ > {
+multiclass compress_by_vec_width_lowering<X86VectorVTInfo _, string Name> {
def : Pat<(X86mCompressingStore addr:$dst, _.KRCWM:$mask,
(_.VT _.RC:$src)),
- (!cast<Instruction>(NAME#_.ZSuffix##mrk)
+ (!cast<Instruction>(Name#_.ZSuffix##mrk)
addr:$dst, _.KRCWM:$mask, _.RC:$src)>;
}
multiclass compress_by_elt_width<bits<8> opc, string OpcodeStr,
- OpndItins itins,
+ X86FoldableSchedWrite sched,
AVX512VLVectorVTInfo VTInfo,
Predicate Pred = HasAVX512> {
let Predicates = [Pred] in
- defm Z : compress_by_vec_width_common<opc, VTInfo.info512, OpcodeStr, itins>,
- compress_by_vec_width_lowering<VTInfo.info512>, EVEX_V512;
+ defm Z : compress_by_vec_width_common<opc, VTInfo.info512, OpcodeStr, sched>,
+ compress_by_vec_width_lowering<VTInfo.info512, NAME>, EVEX_V512;
let Predicates = [Pred, HasVLX] in {
- defm Z256 : compress_by_vec_width_common<opc, VTInfo.info256, OpcodeStr, itins>,
- compress_by_vec_width_lowering<VTInfo.info256>, EVEX_V256;
- defm Z128 : compress_by_vec_width_common<opc, VTInfo.info128, OpcodeStr, itins>,
- compress_by_vec_width_lowering<VTInfo.info128>, EVEX_V128;
+ defm Z256 : compress_by_vec_width_common<opc, VTInfo.info256, OpcodeStr, sched>,
+ compress_by_vec_width_lowering<VTInfo.info256, NAME>, EVEX_V256;
+ defm Z128 : compress_by_vec_width_common<opc, VTInfo.info128, OpcodeStr, sched>,
+ compress_by_vec_width_lowering<VTInfo.info128, NAME>, EVEX_V128;
}
}
-defm VPCOMPRESSD : compress_by_elt_width <0x8B, "vpcompressd", AVX512_COMPRESS,
- avx512vl_i32_info>, EVEX;
-defm VPCOMPRESSQ : compress_by_elt_width <0x8B, "vpcompressq", AVX512_COMPRESS,
- avx512vl_i64_info>, EVEX, VEX_W;
-defm VCOMPRESSPS : compress_by_elt_width <0x8A, "vcompressps", AVX512_COMPRESS,
- avx512vl_f32_info>, EVEX;
-defm VCOMPRESSPD : compress_by_elt_width <0x8A, "vcompresspd", AVX512_COMPRESS,
- avx512vl_f64_info>, EVEX, VEX_W;
+// FIXME: Is there a better scheduler class for VPCOMPRESS?
+defm VPCOMPRESSD : compress_by_elt_width <0x8B, "vpcompressd", WriteVarShuffle256,
+ avx512vl_i32_info>, EVEX, NotMemoryFoldable;
+defm VPCOMPRESSQ : compress_by_elt_width <0x8B, "vpcompressq", WriteVarShuffle256,
+ avx512vl_i64_info>, EVEX, VEX_W, NotMemoryFoldable;
+defm VCOMPRESSPS : compress_by_elt_width <0x8A, "vcompressps", WriteVarShuffle256,
+ avx512vl_f32_info>, EVEX, NotMemoryFoldable;
+defm VCOMPRESSPD : compress_by_elt_width <0x8A, "vcompresspd", WriteVarShuffle256,
+ avx512vl_f64_info>, EVEX, VEX_W, NotMemoryFoldable;
// expand
multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _,
- string OpcodeStr, OpndItins itins> {
+ string OpcodeStr, X86FoldableSchedWrite sched> {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1), OpcodeStr, "$src1", "$src1",
- (_.VT (X86expand _.RC:$src1)), itins.rr>, AVX5128IBase,
- Sched<[itins.Sched]>;
+ (_.VT (X86expand _.RC:$src1))>, AVX5128IBase,
+ Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src1), OpcodeStr, "$src1", "$src1",
(_.VT (X86expand (_.VT (bitconvert
- (_.LdFrag addr:$src1))))), itins.rm>,
+ (_.LdFrag addr:$src1)))))>,
AVX5128IBase, EVEX_CD8<_.EltSize, CD8VT1>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>;
}
-multiclass expand_by_vec_width_lowering<X86VectorVTInfo _ > {
+multiclass expand_by_vec_width_lowering<X86VectorVTInfo _, string Name> {
def : Pat<(_.VT (X86mExpandingLoad addr:$src, _.KRCWM:$mask, undef)),
- (!cast<Instruction>(NAME#_.ZSuffix##rmkz)
+ (!cast<Instruction>(Name#_.ZSuffix##rmkz)
+ _.KRCWM:$mask, addr:$src)>;
+
+ def : Pat<(_.VT (X86mExpandingLoad addr:$src, _.KRCWM:$mask, _.ImmAllZerosV)),
+ (!cast<Instruction>(Name#_.ZSuffix##rmkz)
_.KRCWM:$mask, addr:$src)>;
def : Pat<(_.VT (X86mExpandingLoad addr:$src, _.KRCWM:$mask,
(_.VT _.RC:$src0))),
- (!cast<Instruction>(NAME#_.ZSuffix##rmk)
+ (!cast<Instruction>(Name#_.ZSuffix##rmk)
_.RC:$src0, _.KRCWM:$mask, addr:$src)>;
}
multiclass expand_by_elt_width<bits<8> opc, string OpcodeStr,
- OpndItins itins,
+ X86FoldableSchedWrite sched,
AVX512VLVectorVTInfo VTInfo,
Predicate Pred = HasAVX512> {
let Predicates = [Pred] in
- defm Z : expand_by_vec_width<opc, VTInfo.info512, OpcodeStr, itins>,
- expand_by_vec_width_lowering<VTInfo.info512>, EVEX_V512;
+ defm Z : expand_by_vec_width<opc, VTInfo.info512, OpcodeStr, sched>,
+ expand_by_vec_width_lowering<VTInfo.info512, NAME>, EVEX_V512;
let Predicates = [Pred, HasVLX] in {
- defm Z256 : expand_by_vec_width<opc, VTInfo.info256, OpcodeStr, itins>,
- expand_by_vec_width_lowering<VTInfo.info256>, EVEX_V256;
- defm Z128 : expand_by_vec_width<opc, VTInfo.info128, OpcodeStr, itins>,
- expand_by_vec_width_lowering<VTInfo.info128>, EVEX_V128;
+ defm Z256 : expand_by_vec_width<opc, VTInfo.info256, OpcodeStr, sched>,
+ expand_by_vec_width_lowering<VTInfo.info256, NAME>, EVEX_V256;
+ defm Z128 : expand_by_vec_width<opc, VTInfo.info128, OpcodeStr, sched>,
+ expand_by_vec_width_lowering<VTInfo.info128, NAME>, EVEX_V128;
}
}
-defm VPEXPANDD : expand_by_elt_width <0x89, "vpexpandd", AVX512_EXPAND,
+// FIXME: Is there a better scheduler class for VPEXPAND?
+defm VPEXPANDD : expand_by_elt_width <0x89, "vpexpandd", WriteVarShuffle256,
avx512vl_i32_info>, EVEX;
-defm VPEXPANDQ : expand_by_elt_width <0x89, "vpexpandq", AVX512_EXPAND,
+defm VPEXPANDQ : expand_by_elt_width <0x89, "vpexpandq", WriteVarShuffle256,
avx512vl_i64_info>, EVEX, VEX_W;
-defm VEXPANDPS : expand_by_elt_width <0x88, "vexpandps", AVX512_EXPAND,
+defm VEXPANDPS : expand_by_elt_width <0x88, "vexpandps", WriteVarShuffle256,
avx512vl_f32_info>, EVEX;
-defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", AVX512_EXPAND,
+defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", WriteVarShuffle256,
avx512vl_f64_info>, EVEX, VEX_W;
//handle instruction reg_vec1 = op(reg_vec,imm)
@@ -8892,32 +9848,32 @@ defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", AVX512_EXPAND,
// op(broadcast(eltVt),imm)
//all instruction created with FROUND_CURRENT
multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1),
- (i32 imm:$src2)), itins.rr>, Sched<[itins.Sched]>;
+ (i32 imm:$src2))>, Sched<[sched]>;
defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2",
(OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
- (i32 imm:$src2)), itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 imm:$src2))>,
+ Sched<[sched.Folded, ReadAfterLd]>;
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix, "$src2, ${src1}"##_.BroadcastStr,
"${src1}"##_.BroadcastStr##", $src2",
(OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src1))),
- (i32 imm:$src2)), itins.rm>, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 imm:$src2))>, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae}
multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
- SDNode OpNode, OpndItins itins,
+ SDNode OpNode, X86FoldableSchedWrite sched,
X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in
defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
@@ -8926,23 +9882,23 @@ multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
"$src1, {sae}, $src2",
(OpNode (_.VT _.RC:$src1),
(i32 imm:$src2),
- (i32 FROUND_NO_EXC)), itins.rr>,
- EVEX_B, Sched<[itins.Sched]>;
+ (i32 FROUND_NO_EXC))>,
+ EVEX_B, Sched<[sched]>;
}
multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr,
AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode,
- SDNode OpNodeRnd, OpndItins itins, Predicate prd>{
+ SDNode OpNodeRnd, X86SchedWriteWidths sched, Predicate prd>{
let Predicates = [prd] in {
- defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, itins,
+ defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, sched.ZMM,
_.info512>,
avx512_unary_fp_sae_packed_imm<opc, OpcodeStr, OpNodeRnd,
- itins, _.info512>, EVEX_V512;
+ sched.ZMM, _.info512>, EVEX_V512;
}
let Predicates = [prd, HasVLX] in {
- defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, itins,
+ defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, sched.XMM,
_.info128>, EVEX_V128;
- defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, itins,
+ defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, sched.YMM,
_.info256>, EVEX_V256;
}
}
@@ -8952,37 +9908,37 @@ multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr,
// op(reg_vec2,broadcast(eltVt),imm)
//all instruction created with FROUND_CURRENT
multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _>{
+ X86FoldableSchedWrite sched, X86VectorVTInfo _>{
let ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (i32 imm:$src3)), itins.rr>,
- Sched<[itins.Sched]>;
+ (i32 imm:$src3))>,
+ Sched<[sched]>;
defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT (bitconvert (_.LdFrag addr:$src2))),
- (i32 imm:$src3)), itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 imm:$src3))>,
+ Sched<[sched.Folded, ReadAfterLd]>;
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr##", $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
- (i32 imm:$src3)), itins.rm>, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 imm:$src3))>, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm)
// op(reg_vec2,mem_vec,imm)
multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo DestInfo,
+ X86FoldableSchedWrite sched, X86VectorVTInfo DestInfo,
X86VectorVTInfo SrcInfo>{
let ExeDomain = DestInfo.ExeDomain in {
defm rri : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst),
@@ -8990,16 +9946,16 @@ multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1),
(SrcInfo.VT SrcInfo.RC:$src2),
- (i8 imm:$src3))), itins.rr>,
- Sched<[itins.Sched]>;
+ (i8 imm:$src3)))>,
+ Sched<[sched]>;
defm rmi : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
(ins SrcInfo.RC:$src1, SrcInfo.MemOp:$src2, u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1),
(SrcInfo.VT (bitconvert
(SrcInfo.LdFrag addr:$src2))),
- (i8 imm:$src3))), itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i8 imm:$src3)))>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
@@ -9007,8 +9963,8 @@ multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
// op(reg_vec2,mem_vec,imm)
// op(reg_vec2,broadcast(eltVt),imm)
multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _>:
- avx512_3Op_rm_imm8<opc, OpcodeStr, OpNode, itins, _, _>{
+ X86FoldableSchedWrite sched, X86VectorVTInfo _>:
+ avx512_3Op_rm_imm8<opc, OpcodeStr, OpNode, sched, _, _>{
let ExeDomain = _.ExeDomain in
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -9017,36 +9973,36 @@ multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
"$src1, ${src2}"##_.BroadcastStr##", $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
- (i8 imm:$src3)), itins.rm>, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i8 imm:$src3))>, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
//handle scalar instruction reg_vec1 = op(reg_vec2,reg_vec3,imm)
// op(reg_vec2,mem_scalar,imm)
multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (i32 imm:$src3)), itins.rr>,
- Sched<[itins.Sched]>;
+ (i32 imm:$src3))>,
+ Sched<[sched]>;
defm rmi : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT (scalar_to_vector
(_.ScalarLdFrag addr:$src2))),
- (i32 imm:$src3)), itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 imm:$src3))>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae}
multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
- SDNode OpNode, OpndItins itins,
+ SDNode OpNode, X86FoldableSchedWrite sched,
X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in
defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
@@ -9056,13 +10012,13 @@ multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(i32 imm:$src3),
- (i32 FROUND_NO_EXC)), itins.rr>,
- EVEX_B, Sched<[itins.Sched]>;
+ (i32 FROUND_NO_EXC))>,
+ EVEX_B, Sched<[sched]>;
}
//handle scalar instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae}
multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in
defm NAME#rrib : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
@@ -9071,203 +10027,379 @@ multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(i32 imm:$src3),
- (i32 FROUND_NO_EXC)), itins.rr>,
- EVEX_B, Sched<[itins.Sched]>;
+ (i32 FROUND_NO_EXC))>,
+ EVEX_B, Sched<[sched]>;
}
multiclass avx512_common_fp_sae_packed_imm<string OpcodeStr,
AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode,
- SDNode OpNodeRnd, OpndItins itins, Predicate prd>{
+ SDNode OpNodeRnd, X86SchedWriteWidths sched, Predicate prd>{
let Predicates = [prd] in {
- defm Z : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, itins, _.info512>,
- avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNodeRnd, itins, _.info512>,
+ defm Z : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, sched.ZMM, _.info512>,
+ avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNodeRnd, sched.ZMM, _.info512>,
EVEX_V512;
}
let Predicates = [prd, HasVLX] in {
- defm Z128 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, itins, _.info128>,
+ defm Z128 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, sched.XMM, _.info128>,
EVEX_V128;
- defm Z256 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, itins, _.info256>,
+ defm Z256 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, sched.YMM, _.info256>,
EVEX_V256;
}
}
multiclass avx512_common_3Op_rm_imm8<bits<8> opc, SDNode OpNode, string OpStr,
- OpndItins itins, AVX512VLVectorVTInfo DestInfo,
+ X86SchedWriteWidths sched, AVX512VLVectorVTInfo DestInfo,
AVX512VLVectorVTInfo SrcInfo, Predicate Pred = HasBWI> {
let Predicates = [Pred] in {
- defm Z : avx512_3Op_rm_imm8<opc, OpStr, OpNode, itins, DestInfo.info512,
+ defm Z : avx512_3Op_rm_imm8<opc, OpStr, OpNode, sched.ZMM, DestInfo.info512,
SrcInfo.info512>, EVEX_V512, AVX512AIi8Base, EVEX_4V;
}
let Predicates = [Pred, HasVLX] in {
- defm Z128 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, itins, DestInfo.info128,
+ defm Z128 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, sched.XMM, DestInfo.info128,
SrcInfo.info128>, EVEX_V128, AVX512AIi8Base, EVEX_4V;
- defm Z256 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, itins, DestInfo.info256,
+ defm Z256 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, sched.YMM, DestInfo.info256,
SrcInfo.info256>, EVEX_V256, AVX512AIi8Base, EVEX_4V;
}
}
multiclass avx512_common_3Op_imm8<string OpcodeStr, AVX512VLVectorVTInfo _,
- bits<8> opc, SDNode OpNode, OpndItins itins,
+ bits<8> opc, SDNode OpNode, X86SchedWriteWidths sched,
Predicate Pred = HasAVX512> {
let Predicates = [Pred] in {
- defm Z : avx512_3Op_imm8<opc, OpcodeStr, OpNode, itins, _.info512>, EVEX_V512;
+ defm Z : avx512_3Op_imm8<opc, OpcodeStr, OpNode, sched.ZMM, _.info512>,
+ EVEX_V512;
}
let Predicates = [Pred, HasVLX] in {
- defm Z128 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, itins, _.info128>, EVEX_V128;
- defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, itins, _.info256>, EVEX_V256;
+ defm Z128 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, sched.XMM, _.info128>,
+ EVEX_V128;
+ defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, sched.YMM, _.info256>,
+ EVEX_V256;
}
}
multiclass avx512_common_fp_sae_scalar_imm<string OpcodeStr,
X86VectorVTInfo _, bits<8> opc, SDNode OpNode,
- SDNode OpNodeRnd, OpndItins itins, Predicate prd>{
+ SDNode OpNodeRnd, X86SchedWriteWidths sched, Predicate prd> {
let Predicates = [prd] in {
- defm Z128 : avx512_fp_scalar_imm<opc, OpcodeStr, OpNode, itins, _>,
- avx512_fp_sae_scalar_imm<opc, OpcodeStr, OpNodeRnd, itins, _>;
+ defm Z : avx512_fp_scalar_imm<opc, OpcodeStr, OpNode, sched.XMM, _>,
+ avx512_fp_sae_scalar_imm<opc, OpcodeStr, OpNodeRnd, sched.XMM, _>;
}
}
multiclass avx512_common_unary_fp_sae_packed_imm_all<string OpcodeStr,
bits<8> opcPs, bits<8> opcPd, SDNode OpNode,
- SDNode OpNodeRnd, SizeItins itins, Predicate prd>{
+ SDNode OpNodeRnd, X86SchedWriteWidths sched, Predicate prd>{
defm PS : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f32_info,
- opcPs, OpNode, OpNodeRnd, itins.s, prd>,
+ opcPs, OpNode, OpNodeRnd, sched, prd>,
EVEX_CD8<32, CD8VF>;
defm PD : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f64_info,
- opcPd, OpNode, OpNodeRnd, itins.d, prd>,
+ opcPd, OpNode, OpNodeRnd, sched, prd>,
EVEX_CD8<64, CD8VF>, VEX_W;
}
defm VREDUCE : avx512_common_unary_fp_sae_packed_imm_all<"vreduce", 0x56, 0x56,
- X86VReduce, X86VReduceRnd, SSE_ALU_ITINS_P, HasDQI>,
+ X86VReduce, X86VReduceRnd, SchedWriteFRnd, HasDQI>,
AVX512AIi8Base, EVEX;
defm VRNDSCALE : avx512_common_unary_fp_sae_packed_imm_all<"vrndscale", 0x08, 0x09,
- X86VRndScale, X86VRndScaleRnd, SSE_ALU_ITINS_P, HasAVX512>,
+ X86VRndScale, X86VRndScaleRnd, SchedWriteFRnd, HasAVX512>,
AVX512AIi8Base, EVEX;
defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26,
- X86VGetMant, X86VGetMantRnd, SSE_ALU_ITINS_P, HasAVX512>,
+ X86VGetMant, X86VGetMantRnd, SchedWriteFRnd, HasAVX512>,
AVX512AIi8Base, EVEX;
defm VRANGEPD : avx512_common_fp_sae_packed_imm<"vrangepd", avx512vl_f64_info,
0x50, X86VRange, X86VRangeRnd,
- SSE_ALU_F64P, HasDQI>,
+ SchedWriteFAdd, HasDQI>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
defm VRANGEPS : avx512_common_fp_sae_packed_imm<"vrangeps", avx512vl_f32_info,
0x50, X86VRange, X86VRangeRnd,
- SSE_ALU_F32P, HasDQI>,
+ SchedWriteFAdd, HasDQI>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
defm VRANGESD: avx512_common_fp_sae_scalar_imm<"vrangesd",
- f64x_info, 0x51, X86Ranges, X86RangesRnd, SSE_ALU_F64S, HasDQI>,
+ f64x_info, 0x51, X86Ranges, X86RangesRnd, SchedWriteFAdd, HasDQI>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
defm VRANGESS: avx512_common_fp_sae_scalar_imm<"vrangess", f32x_info,
- 0x51, X86Ranges, X86RangesRnd, SSE_ALU_F32S, HasDQI>,
+ 0x51, X86Ranges, X86RangesRnd, SchedWriteFAdd, HasDQI>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
defm VREDUCESD: avx512_common_fp_sae_scalar_imm<"vreducesd", f64x_info,
- 0x57, X86Reduces, X86ReducesRnd, SSE_ALU_F64S, HasDQI>,
+ 0x57, X86Reduces, X86ReducesRnd, SchedWriteFRnd, HasDQI>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
defm VREDUCESS: avx512_common_fp_sae_scalar_imm<"vreducess", f32x_info,
- 0x57, X86Reduces, X86ReducesRnd, SSE_ALU_F32S, HasDQI>,
+ 0x57, X86Reduces, X86ReducesRnd, SchedWriteFRnd, HasDQI>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
defm VGETMANTSD: avx512_common_fp_sae_scalar_imm<"vgetmantsd", f64x_info,
- 0x27, X86GetMants, X86GetMantsRnd, SSE_ALU_F64S, HasAVX512>,
+ 0x27, X86GetMants, X86GetMantsRnd, SchedWriteFRnd, HasAVX512>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
defm VGETMANTSS: avx512_common_fp_sae_scalar_imm<"vgetmantss", f32x_info,
- 0x27, X86GetMants, X86GetMantsRnd, SSE_ALU_F32S, HasAVX512>,
+ 0x27, X86GetMants, X86GetMantsRnd, SchedWriteFRnd, HasAVX512>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+
+multiclass AVX512_rndscale_lowering<X86VectorVTInfo _, string Suffix> {
+ // Register
+ def : Pat<(_.VT (ffloor _.RC:$src)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri")
+ _.RC:$src, (i32 0x9))>;
+ def : Pat<(_.VT (fnearbyint _.RC:$src)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri")
+ _.RC:$src, (i32 0xC))>;
+ def : Pat<(_.VT (fceil _.RC:$src)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri")
+ _.RC:$src, (i32 0xA))>;
+ def : Pat<(_.VT (frint _.RC:$src)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri")
+ _.RC:$src, (i32 0x4))>;
+ def : Pat<(_.VT (ftrunc _.RC:$src)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri")
+ _.RC:$src, (i32 0xB))>;
+
+ // Merge-masking
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor _.RC:$src), _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik")
+ _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0x9))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint _.RC:$src), _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik")
+ _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0xC))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil _.RC:$src), _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik")
+ _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0xA))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint _.RC:$src), _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik")
+ _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0x4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc _.RC:$src), _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik")
+ _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0xB))>;
+
+ // Zero-masking
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor _.RC:$src),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz")
+ _.KRCWM:$mask, _.RC:$src, (i32 0x9))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint _.RC:$src),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz")
+ _.KRCWM:$mask, _.RC:$src, (i32 0xC))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil _.RC:$src),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz")
+ _.KRCWM:$mask, _.RC:$src, (i32 0xA))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint _.RC:$src),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz")
+ _.KRCWM:$mask, _.RC:$src, (i32 0x4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc _.RC:$src),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz")
+ _.KRCWM:$mask, _.RC:$src, (i32 0xB))>;
+
+ // Load
+ def : Pat<(_.VT (ffloor (_.LdFrag addr:$src))),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi")
+ addr:$src, (i32 0x9))>;
+ def : Pat<(_.VT (fnearbyint (_.LdFrag addr:$src))),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi")
+ addr:$src, (i32 0xC))>;
+ def : Pat<(_.VT (fceil (_.LdFrag addr:$src))),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi")
+ addr:$src, (i32 0xA))>;
+ def : Pat<(_.VT (frint (_.LdFrag addr:$src))),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi")
+ addr:$src, (i32 0x4))>;
+ def : Pat<(_.VT (ftrunc (_.LdFrag addr:$src))),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi")
+ addr:$src, (i32 0xB))>;
+
+ // Merge-masking + load
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor (_.LdFrag addr:$src)),
+ _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik")
+ _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x9))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint (_.LdFrag addr:$src)),
+ _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik")
+ _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xC))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil (_.LdFrag addr:$src)),
+ _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik")
+ _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xA))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint (_.LdFrag addr:$src)),
+ _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik")
+ _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc (_.LdFrag addr:$src)),
+ _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik")
+ _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xB))>;
+
+ // Zero-masking + load
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor (_.LdFrag addr:$src)),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz")
+ _.KRCWM:$mask, addr:$src, (i32 0x9))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint (_.LdFrag addr:$src)),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz")
+ _.KRCWM:$mask, addr:$src, (i32 0xC))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil (_.LdFrag addr:$src)),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz")
+ _.KRCWM:$mask, addr:$src, (i32 0xA))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint (_.LdFrag addr:$src)),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz")
+ _.KRCWM:$mask, addr:$src, (i32 0x4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc (_.LdFrag addr:$src)),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz")
+ _.KRCWM:$mask, addr:$src, (i32 0xB))>;
+
+ // Broadcast load
+ def : Pat<(_.VT (ffloor (X86VBroadcast (_.ScalarLdFrag addr:$src)))),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi")
+ addr:$src, (i32 0x9))>;
+ def : Pat<(_.VT (fnearbyint (X86VBroadcast (_.ScalarLdFrag addr:$src)))),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi")
+ addr:$src, (i32 0xC))>;
+ def : Pat<(_.VT (fceil (X86VBroadcast (_.ScalarLdFrag addr:$src)))),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi")
+ addr:$src, (i32 0xA))>;
+ def : Pat<(_.VT (frint (X86VBroadcast (_.ScalarLdFrag addr:$src)))),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi")
+ addr:$src, (i32 0x4))>;
+ def : Pat<(_.VT (ftrunc (X86VBroadcast (_.ScalarLdFrag addr:$src)))),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi")
+ addr:$src, (i32 0xB))>;
+
+ // Merge-masking + broadcast load
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (ffloor (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik")
+ _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x9))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (fnearbyint (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik")
+ _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xC))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (fceil (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik")
+ _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xA))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (frint (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik")
+ _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (ftrunc (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik")
+ _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xB))>;
+
+ // Zero-masking + broadcast load
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (ffloor (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz")
+ _.KRCWM:$mask, addr:$src, (i32 0x9))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (fnearbyint (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz")
+ _.KRCWM:$mask, addr:$src, (i32 0xC))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (fceil (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz")
+ _.KRCWM:$mask, addr:$src, (i32 0xA))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (frint (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz")
+ _.KRCWM:$mask, addr:$src, (i32 0x4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (ftrunc (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz")
+ _.KRCWM:$mask, addr:$src, (i32 0xB))>;
+}
+
let Predicates = [HasAVX512] in {
-def : Pat<(v16f32 (ffloor VR512:$src)),
- (VRNDSCALEPSZrri VR512:$src, (i32 0x9))>;
-def : Pat<(v16f32 (fnearbyint VR512:$src)),
- (VRNDSCALEPSZrri VR512:$src, (i32 0xC))>;
-def : Pat<(v16f32 (fceil VR512:$src)),
- (VRNDSCALEPSZrri VR512:$src, (i32 0xA))>;
-def : Pat<(v16f32 (frint VR512:$src)),
- (VRNDSCALEPSZrri VR512:$src, (i32 0x4))>;
-def : Pat<(v16f32 (ftrunc VR512:$src)),
- (VRNDSCALEPSZrri VR512:$src, (i32 0xB))>;
-
-def : Pat<(v8f64 (ffloor VR512:$src)),
- (VRNDSCALEPDZrri VR512:$src, (i32 0x9))>;
-def : Pat<(v8f64 (fnearbyint VR512:$src)),
- (VRNDSCALEPDZrri VR512:$src, (i32 0xC))>;
-def : Pat<(v8f64 (fceil VR512:$src)),
- (VRNDSCALEPDZrri VR512:$src, (i32 0xA))>;
-def : Pat<(v8f64 (frint VR512:$src)),
- (VRNDSCALEPDZrri VR512:$src, (i32 0x4))>;
-def : Pat<(v8f64 (ftrunc VR512:$src)),
- (VRNDSCALEPDZrri VR512:$src, (i32 0xB))>;
+ defm : AVX512_rndscale_lowering<v16f32_info, "PS">;
+ defm : AVX512_rndscale_lowering<v8f64_info, "PD">;
}
let Predicates = [HasVLX] in {
-def : Pat<(v4f32 (ffloor VR128X:$src)),
- (VRNDSCALEPSZ128rri VR128X:$src, (i32 0x9))>;
-def : Pat<(v4f32 (fnearbyint VR128X:$src)),
- (VRNDSCALEPSZ128rri VR128X:$src, (i32 0xC))>;
-def : Pat<(v4f32 (fceil VR128X:$src)),
- (VRNDSCALEPSZ128rri VR128X:$src, (i32 0xA))>;
-def : Pat<(v4f32 (frint VR128X:$src)),
- (VRNDSCALEPSZ128rri VR128X:$src, (i32 0x4))>;
-def : Pat<(v4f32 (ftrunc VR128X:$src)),
- (VRNDSCALEPSZ128rri VR128X:$src, (i32 0xB))>;
-
-def : Pat<(v2f64 (ffloor VR128X:$src)),
- (VRNDSCALEPDZ128rri VR128X:$src, (i32 0x9))>;
-def : Pat<(v2f64 (fnearbyint VR128X:$src)),
- (VRNDSCALEPDZ128rri VR128X:$src, (i32 0xC))>;
-def : Pat<(v2f64 (fceil VR128X:$src)),
- (VRNDSCALEPDZ128rri VR128X:$src, (i32 0xA))>;
-def : Pat<(v2f64 (frint VR128X:$src)),
- (VRNDSCALEPDZ128rri VR128X:$src, (i32 0x4))>;
-def : Pat<(v2f64 (ftrunc VR128X:$src)),
- (VRNDSCALEPDZ128rri VR128X:$src, (i32 0xB))>;
-
-def : Pat<(v8f32 (ffloor VR256X:$src)),
- (VRNDSCALEPSZ256rri VR256X:$src, (i32 0x9))>;
-def : Pat<(v8f32 (fnearbyint VR256X:$src)),
- (VRNDSCALEPSZ256rri VR256X:$src, (i32 0xC))>;
-def : Pat<(v8f32 (fceil VR256X:$src)),
- (VRNDSCALEPSZ256rri VR256X:$src, (i32 0xA))>;
-def : Pat<(v8f32 (frint VR256X:$src)),
- (VRNDSCALEPSZ256rri VR256X:$src, (i32 0x4))>;
-def : Pat<(v8f32 (ftrunc VR256X:$src)),
- (VRNDSCALEPSZ256rri VR256X:$src, (i32 0xB))>;
-
-def : Pat<(v4f64 (ffloor VR256X:$src)),
- (VRNDSCALEPDZ256rri VR256X:$src, (i32 0x9))>;
-def : Pat<(v4f64 (fnearbyint VR256X:$src)),
- (VRNDSCALEPDZ256rri VR256X:$src, (i32 0xC))>;
-def : Pat<(v4f64 (fceil VR256X:$src)),
- (VRNDSCALEPDZ256rri VR256X:$src, (i32 0xA))>;
-def : Pat<(v4f64 (frint VR256X:$src)),
- (VRNDSCALEPDZ256rri VR256X:$src, (i32 0x4))>;
-def : Pat<(v4f64 (ftrunc VR256X:$src)),
- (VRNDSCALEPDZ256rri VR256X:$src, (i32 0xB))>;
-}
-
-multiclass avx512_shuff_packed_128<string OpcodeStr, OpndItins itins,
- AVX512VLVectorVTInfo _, bits<8> opc>{
- let Predicates = [HasAVX512] in {
- defm Z : avx512_3Op_imm8<opc, OpcodeStr, X86Shuf128, itins, _.info512>, EVEX_V512;
+ defm : AVX512_rndscale_lowering<v8f32x_info, "PS">;
+ defm : AVX512_rndscale_lowering<v4f64x_info, "PD">;
+ defm : AVX512_rndscale_lowering<v4f32x_info, "PS">;
+ defm : AVX512_rndscale_lowering<v2f64x_info, "PD">;
+}
- }
- let Predicates = [HasAVX512, HasVLX] in {
- defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, X86Shuf128, itins, _.info256>, EVEX_V256;
+multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
+ X86VectorVTInfo _,
+ X86VectorVTInfo CastInfo,
+ string EVEX2VEXOvrd> {
+ let ExeDomain = _.ExeDomain in {
+ defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (_.VT (bitconvert
+ (CastInfo.VT (X86Shuf128 _.RC:$src1, _.RC:$src2,
+ (i8 imm:$src3)))))>,
+ Sched<[sched]>, EVEX2VEXOverride<EVEX2VEXOvrd#"rr">;
+ defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (_.VT
+ (bitconvert
+ (CastInfo.VT (X86Shuf128 _.RC:$src1,
+ (bitconvert (_.LdFrag addr:$src2)),
+ (i8 imm:$src3)))))>,
+ Sched<[sched.Folded, ReadAfterLd]>,
+ EVEX2VEXOverride<EVEX2VEXOvrd#"rm">;
+ defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr##", $src3",
+ (_.VT
+ (bitconvert
+ (CastInfo.VT
+ (X86Shuf128 _.RC:$src1,
+ (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
+ (i8 imm:$src3)))))>, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
-defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4", SSE_SHUFP,
- avx512vl_f32_info, 0x23>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
-defm VSHUFF64X2 : avx512_shuff_packed_128<"vshuff64x2", SSE_SHUFP,
- avx512vl_f64_info, 0x23>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
-defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4", SSE_SHUFP,
- avx512vl_i32_info, 0x43>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
-defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2", SSE_SHUFP,
- avx512vl_i64_info, 0x43>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+multiclass avx512_shuff_packed_128<string OpcodeStr, X86FoldableSchedWrite sched,
+ AVX512VLVectorVTInfo _,
+ AVX512VLVectorVTInfo CastInfo, bits<8> opc,
+ string EVEX2VEXOvrd>{
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_shuff_packed_128_common<opc, OpcodeStr, sched,
+ _.info512, CastInfo.info512, "">, EVEX_V512;
+
+ let Predicates = [HasAVX512, HasVLX] in
+ defm Z256 : avx512_shuff_packed_128_common<opc, OpcodeStr, sched,
+ _.info256, CastInfo.info256,
+ EVEX2VEXOvrd>, EVEX_V256;
+}
+
+defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4", WriteFShuffle256,
+ avx512vl_f32_info, avx512vl_f64_info, 0x23, "VPERM2F128">, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+defm VSHUFF64X2 : avx512_shuff_packed_128<"vshuff64x2", WriteFShuffle256,
+ avx512vl_f64_info, avx512vl_f64_info, 0x23, "VPERM2F128">, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4", WriteFShuffle256,
+ avx512vl_i32_info, avx512vl_i64_info, 0x43, "VPERM2I128">, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2", WriteFShuffle256,
+ avx512vl_i64_info, avx512vl_i64_info, 0x43, "VPERM2I128">, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
let Predicates = [HasAVX512] in {
// Provide fallback in case the load node that is used in the broadcast
@@ -9302,20 +10434,61 @@ def : Pat<(v64i8 (X86SubVBroadcast (v16i8 VR128X:$src))),
0)>;
}
-multiclass avx512_valign<string OpcodeStr, OpndItins itins,
- AVX512VLVectorVTInfo VTInfo_I> {
- defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_I, 0x03, X86VAlign, itins>,
- AVX512AIi8Base, EVEX_4V;
+multiclass avx512_valign<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _>{
+ // NOTE: EVEX2VEXOverride changed back to Unset for 256-bit at the
+ // instantiation of this class.
+ let ExeDomain = _.ExeDomain in {
+ defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (_.VT (X86VAlign _.RC:$src1, _.RC:$src2, (i8 imm:$src3)))>,
+ Sched<[sched]>, EVEX2VEXOverride<"VPALIGNRrri">;
+ defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (_.VT (X86VAlign _.RC:$src1,
+ (bitconvert (_.LdFrag addr:$src2)),
+ (i8 imm:$src3)))>,
+ Sched<[sched.Folded, ReadAfterLd]>,
+ EVEX2VEXOverride<"VPALIGNRrmi">;
+
+ defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr##", $src3",
+ (X86VAlign _.RC:$src1,
+ (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
+ (i8 imm:$src3))>, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
+ }
+}
+
+multiclass avx512_valign_common<string OpcodeStr, X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo _> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_valign<0x03, OpcodeStr, sched.ZMM, _.info512>,
+ AVX512AIi8Base, EVEX_4V, EVEX_V512;
+ }
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z128 : avx512_valign<0x03, OpcodeStr, sched.XMM, _.info128>,
+ AVX512AIi8Base, EVEX_4V, EVEX_V128;
+ // We can't really override the 256-bit version so change it back to unset.
+ let EVEX2VEXOverride = ? in
+ defm Z256 : avx512_valign<0x03, OpcodeStr, sched.YMM, _.info256>,
+ AVX512AIi8Base, EVEX_4V, EVEX_V256;
+ }
}
-defm VALIGND: avx512_valign<"valignd", SSE_PALIGN, avx512vl_i32_info>,
- EVEX_CD8<32, CD8VF>;
-defm VALIGNQ: avx512_valign<"valignq", SSE_PALIGN, avx512vl_i64_info>,
- EVEX_CD8<64, CD8VF>, VEX_W;
+defm VALIGND: avx512_valign_common<"valignd", SchedWriteShuffle,
+ avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+defm VALIGNQ: avx512_valign_common<"valignq", SchedWriteShuffle,
+ avx512vl_i64_info>, EVEX_CD8<64, CD8VF>,
+ VEX_W;
-defm VPALIGNR: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr", SSE_PALIGN,
- avx512vl_i8_info, avx512vl_i8_info>,
- EVEX_CD8<8, CD8VF>;
+defm VPALIGNR: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr",
+ SchedWriteShuffle, avx512vl_i8_info,
+ avx512vl_i8_info>, EVEX_CD8<8, CD8VF>;
// Fragments to help convert valignq into masked valignd. Or valignq/valignd
// into vpalignr.
@@ -9435,97 +10608,100 @@ let Predicates = [HasVLX, HasBWI] in {
}
defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw",
- SSE_INTMUL_ITINS_P, avx512vl_i16_info, avx512vl_i8_info>,
- EVEX_CD8<8, CD8VF>;
+ SchedWritePSADBW, avx512vl_i16_info, avx512vl_i8_info>,
+ EVEX_CD8<8, CD8VF>, NotEVEX2VEXConvertible;
multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1), OpcodeStr,
"$src1", "$src1",
- (_.VT (OpNode _.RC:$src1)), itins.rr>, EVEX, AVX5128IBase,
- Sched<[itins.Sched]>;
+ (_.VT (OpNode _.RC:$src1))>, EVEX, AVX5128IBase,
+ Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src1), OpcodeStr,
"$src1", "$src1",
- (_.VT (OpNode (bitconvert (_.LdFrag addr:$src1)))), itins.rm>,
+ (_.VT (OpNode (bitconvert (_.LdFrag addr:$src1))))>,
EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded]>;
+ Sched<[sched.Folded]>;
}
}
multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> :
- avx512_unary_rm<opc, OpcodeStr, OpNode, itins, _> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> :
+ avx512_unary_rm<opc, OpcodeStr, OpNode, sched, _> {
defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src1), OpcodeStr,
"${src1}"##_.BroadcastStr,
"${src1}"##_.BroadcastStr,
(_.VT (OpNode (X86VBroadcast
- (_.ScalarLdFrag addr:$src1)))), itins.rm>,
+ (_.ScalarLdFrag addr:$src1))))>,
EVEX, AVX5128IBase, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded]>;
+ Sched<[sched.Folded]>;
}
multiclass avx512_unary_rm_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo VTInfo,
- Predicate prd> {
+ X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo VTInfo, Predicate prd> {
let Predicates = [prd] in
- defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, itins, VTInfo.info512>,
+ defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, sched.ZMM, VTInfo.info512>,
EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_unary_rm<opc, OpcodeStr, OpNode, itins, VTInfo.info256>,
+ defm Z256 : avx512_unary_rm<opc, OpcodeStr, OpNode, sched.YMM, VTInfo.info256>,
EVEX_V256;
- defm Z128 : avx512_unary_rm<opc, OpcodeStr, OpNode, itins, VTInfo.info128>,
+ defm Z128 : avx512_unary_rm<opc, OpcodeStr, OpNode, sched.XMM, VTInfo.info128>,
EVEX_V128;
}
}
multiclass avx512_unary_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo VTInfo,
+ X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTInfo,
Predicate prd> {
let Predicates = [prd] in
- defm Z : avx512_unary_rmb<opc, OpcodeStr, OpNode, itins, VTInfo.info512>,
+ defm Z : avx512_unary_rmb<opc, OpcodeStr, OpNode, sched.ZMM, VTInfo.info512>,
EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_unary_rmb<opc, OpcodeStr, OpNode, itins, VTInfo.info256>,
+ defm Z256 : avx512_unary_rmb<opc, OpcodeStr, OpNode, sched.YMM, VTInfo.info256>,
EVEX_V256;
- defm Z128 : avx512_unary_rmb<opc, OpcodeStr, OpNode, itins, VTInfo.info128>,
+ defm Z128 : avx512_unary_rmb<opc, OpcodeStr, OpNode, sched.XMM, VTInfo.info128>,
EVEX_V128;
}
}
multiclass avx512_unary_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
- SDNode OpNode, OpndItins itins, Predicate prd> {
- defm Q : avx512_unary_rmb_vl<opc_q, OpcodeStr#"q", OpNode, itins,
+ SDNode OpNode, X86SchedWriteWidths sched,
+ Predicate prd> {
+ defm Q : avx512_unary_rmb_vl<opc_q, OpcodeStr#"q", OpNode, sched,
avx512vl_i64_info, prd>, VEX_W;
- defm D : avx512_unary_rmb_vl<opc_d, OpcodeStr#"d", OpNode, itins,
+ defm D : avx512_unary_rmb_vl<opc_d, OpcodeStr#"d", OpNode, sched,
avx512vl_i32_info, prd>;
}
multiclass avx512_unary_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr,
- SDNode OpNode, OpndItins itins, Predicate prd> {
- defm W : avx512_unary_rm_vl<opc_w, OpcodeStr#"w", OpNode, itins,
+ SDNode OpNode, X86SchedWriteWidths sched,
+ Predicate prd> {
+ defm W : avx512_unary_rm_vl<opc_w, OpcodeStr#"w", OpNode, sched,
avx512vl_i16_info, prd>, VEX_WIG;
- defm B : avx512_unary_rm_vl<opc_b, OpcodeStr#"b", OpNode, itins,
+ defm B : avx512_unary_rm_vl<opc_b, OpcodeStr#"b", OpNode, sched,
avx512vl_i8_info, prd>, VEX_WIG;
}
multiclass avx512_unary_rm_vl_all<bits<8> opc_b, bits<8> opc_w,
bits<8> opc_d, bits<8> opc_q,
string OpcodeStr, SDNode OpNode,
- OpndItins itins> {
- defm NAME : avx512_unary_rm_vl_dq<opc_d, opc_q, OpcodeStr, OpNode, itins,
+ X86SchedWriteWidths sched> {
+ defm NAME : avx512_unary_rm_vl_dq<opc_d, opc_q, OpcodeStr, OpNode, sched,
HasAVX512>,
- avx512_unary_rm_vl_bw<opc_b, opc_w, OpcodeStr, OpNode, itins,
+ avx512_unary_rm_vl_bw<opc_b, opc_w, OpcodeStr, OpNode, sched,
HasBWI>;
}
-defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", abs, SSE_PABS>;
+defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", abs,
+ SchedWriteVecALU>;
// VPABS: Use 512bit version to implement 128/256 bit in case NoVLX.
let Predicates = [HasAVX512, NoVLX] in {
@@ -9563,13 +10739,12 @@ multiclass avx512_unary_lowering<string InstrStr, SDNode OpNode,
}
}
-// FIXME: Is there a better scheduler itinerary for VPLZCNT?
defm VPLZCNT : avx512_unary_rm_vl_dq<0x44, 0x44, "vplzcnt", ctlz,
- SSE_INTALU_ITINS_P, HasCDI>;
+ SchedWriteVecIMul, HasCDI>;
-// FIXME: Is there a better scheduler itinerary for VPCONFLICT?
+// FIXME: Is there a better scheduler class for VPCONFLICT?
defm VPCONFLICT : avx512_unary_rm_vl_dq<0xC4, 0xC4, "vpconflict", X86Conflict,
- SSE_INTALU_ITINS_P, HasCDI>;
+ SchedWriteVecALU, HasCDI>;
// VPLZCNT: Use 512bit version to implement 128/256 bit in case NoVLX.
defm : avx512_unary_lowering<"VPLZCNTQ", ctlz, avx512vl_i64_info, HasCDI>;
@@ -9579,9 +10754,9 @@ defm : avx512_unary_lowering<"VPLZCNTD", ctlz, avx512vl_i32_info, HasCDI>;
// Counts number of ones - VPOPCNTD and VPOPCNTQ
//===---------------------------------------------------------------------===//
-// FIXME: Is there a better scheduler itinerary for VPOPCNTD/VPOPCNTQ?
+// FIXME: Is there a better scheduler class for VPOPCNTD/VPOPCNTQ?
defm VPOPCNT : avx512_unary_rm_vl_dq<0x55, 0x55, "vpopcnt", ctpop,
- SSE_INTALU_ITINS_P, HasVPOPCNTDQ>;
+ SchedWriteVecALU, HasVPOPCNTDQ>;
defm : avx512_unary_lowering<"VPOPCNTQ", ctpop, avx512vl_i64_info, HasVPOPCNTDQ>;
defm : avx512_unary_lowering<"VPOPCNTD", ctpop, avx512vl_i32_info, HasVPOPCNTDQ>;
@@ -9589,71 +10764,74 @@ defm : avx512_unary_lowering<"VPOPCNTD", ctpop, avx512vl_i32_info, HasVPOPCNTDQ>
//===---------------------------------------------------------------------===//
// Replicate Single FP - MOVSHDUP and MOVSLDUP
//===---------------------------------------------------------------------===//
+
multiclass avx512_replicate<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins> {
- defm NAME: avx512_unary_rm_vl<opc, OpcodeStr, OpNode, itins,
+ X86SchedWriteWidths sched> {
+ defm NAME: avx512_unary_rm_vl<opc, OpcodeStr, OpNode, sched,
avx512vl_f32_info, HasAVX512>, XS;
}
-defm VMOVSHDUP : avx512_replicate<0x16, "vmovshdup", X86Movshdup, SSE_MOVDDUP>;
-defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup, SSE_MOVDDUP>;
+defm VMOVSHDUP : avx512_replicate<0x16, "vmovshdup", X86Movshdup,
+ SchedWriteFShuffle>;
+defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup,
+ SchedWriteFShuffle>;
//===----------------------------------------------------------------------===//
// AVX-512 - MOVDDUP
//===----------------------------------------------------------------------===//
multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
- (_.VT (OpNode (_.VT _.RC:$src))), itins.rr>, EVEX,
- Sched<[itins.Sched]>;
+ (_.VT (OpNode (_.VT _.RC:$src)))>, EVEX,
+ Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src), OpcodeStr, "$src", "$src",
(_.VT (OpNode (_.VT (scalar_to_vector
- (_.ScalarLdFrag addr:$src))))),
- itins.rm>, EVEX, EVEX_CD8<_.EltSize, CD8VH>,
- Sched<[itins.Sched.Folded]>;
+ (_.ScalarLdFrag addr:$src)))))>,
+ EVEX, EVEX_CD8<_.EltSize, CD8VH>,
+ Sched<[sched.Folded]>;
}
}
multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo VTInfo> {
-
- defm Z : avx512_unary_rm<opc, OpcodeStr, X86Movddup, itins, VTInfo.info512>, EVEX_V512;
+ X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTInfo> {
+ defm Z : avx512_unary_rm<opc, OpcodeStr, X86Movddup, sched.ZMM,
+ VTInfo.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
- defm Z256 : avx512_unary_rm<opc, OpcodeStr, X86Movddup, itins, VTInfo.info256>,
- EVEX_V256;
- defm Z128 : avx512_movddup_128<opc, OpcodeStr, X86VBroadcast, itins, VTInfo.info128>,
- EVEX_V128;
+ defm Z256 : avx512_unary_rm<opc, OpcodeStr, X86Movddup, sched.YMM,
+ VTInfo.info256>, EVEX_V256;
+ defm Z128 : avx512_movddup_128<opc, OpcodeStr, X86VBroadcast, sched.XMM,
+ VTInfo.info128>, EVEX_V128;
}
}
multiclass avx512_movddup<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins> {
- defm NAME: avx512_movddup_common<opc, OpcodeStr, OpNode, itins,
+ X86SchedWriteWidths sched> {
+ defm NAME: avx512_movddup_common<opc, OpcodeStr, OpNode, sched,
avx512vl_f64_info>, XD, VEX_W;
}
-defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup, SSE_MOVDDUP>;
+defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup, SchedWriteFShuffle>;
let Predicates = [HasVLX] in {
def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
(VMOVDDUPZ128rm addr:$src)>;
def : Pat<(v2f64 (X86VBroadcast f64:$src)),
- (VMOVDDUPZ128rr (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+ (VMOVDDUPZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>;
def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
(VMOVDDUPZ128rm addr:$src)>;
def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
(v2f64 VR128X:$src0)),
(VMOVDDUPZ128rrk VR128X:$src0, VK2WM:$mask,
- (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+ (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>;
def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
(bitconvert (v4i32 immAllZerosV))),
- (VMOVDDUPZ128rrkz VK2WM:$mask, (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+ (VMOVDDUPZ128rrkz VK2WM:$mask, (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>;
def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))),
(v2f64 VR128X:$src0)),
@@ -9673,28 +10851,29 @@ def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadv2f64 addr:$sr
//===----------------------------------------------------------------------===//
// AVX-512 - Unpack Instructions
//===----------------------------------------------------------------------===//
+
defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh, HasAVX512,
- SSE_ALU_ITINS_S>;
+ SchedWriteFShuffleSizes, 0, 1>;
defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl, HasAVX512,
- SSE_ALU_ITINS_S>;
+ SchedWriteFShuffleSizes>;
defm VPUNPCKLBW : avx512_binop_rm_vl_b<0x60, "vpunpcklbw", X86Unpckl,
- SSE_INTALU_ITINS_P, HasBWI>;
+ SchedWriteShuffle, HasBWI>;
defm VPUNPCKHBW : avx512_binop_rm_vl_b<0x68, "vpunpckhbw", X86Unpckh,
- SSE_INTALU_ITINS_P, HasBWI>;
+ SchedWriteShuffle, HasBWI>;
defm VPUNPCKLWD : avx512_binop_rm_vl_w<0x61, "vpunpcklwd", X86Unpckl,
- SSE_INTALU_ITINS_P, HasBWI>;
+ SchedWriteShuffle, HasBWI>;
defm VPUNPCKHWD : avx512_binop_rm_vl_w<0x69, "vpunpckhwd", X86Unpckh,
- SSE_INTALU_ITINS_P, HasBWI>;
+ SchedWriteShuffle, HasBWI>;
defm VPUNPCKLDQ : avx512_binop_rm_vl_d<0x62, "vpunpckldq", X86Unpckl,
- SSE_INTALU_ITINS_P, HasAVX512>;
+ SchedWriteShuffle, HasAVX512>;
defm VPUNPCKHDQ : avx512_binop_rm_vl_d<0x6A, "vpunpckhdq", X86Unpckh,
- SSE_INTALU_ITINS_P, HasAVX512>;
+ SchedWriteShuffle, HasAVX512>;
defm VPUNPCKLQDQ : avx512_binop_rm_vl_q<0x6C, "vpunpcklqdq", X86Unpckl,
- SSE_INTALU_ITINS_P, HasAVX512>;
+ SchedWriteShuffle, HasAVX512>;
defm VPUNPCKHQDQ : avx512_binop_rm_vl_q<0x6D, "vpunpckhqdq", X86Unpckh,
- SSE_INTALU_ITINS_P, HasAVX512>;
+ SchedWriteShuffle, HasAVX512>;
//===----------------------------------------------------------------------===//
// AVX-512 - Extract & Insert Integer Instructions
@@ -9707,7 +10886,7 @@ multiclass avx512_extract_elt_bw_m<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(store (_.EltVT (trunc (OpNode (_.VT _.RC:$src1), imm:$src2))),
addr:$dst)]>,
- EVEX, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteShuffleLd]>;
+ EVEX, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteVecExtractSt]>;
}
multiclass avx512_extract_elt_b<string OpcodeStr, X86VectorVTInfo _> {
@@ -9717,7 +10896,7 @@ multiclass avx512_extract_elt_b<string OpcodeStr, X86VectorVTInfo _> {
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32orGR64:$dst,
(X86pextrb (_.VT _.RC:$src1), imm:$src2))]>,
- EVEX, TAPD, Sched<[WriteShuffle]>;
+ EVEX, TAPD, Sched<[WriteVecExtract]>;
defm NAME : avx512_extract_elt_bw_m<0x14, OpcodeStr, X86pextrb, _>, TAPD;
}
@@ -9729,15 +10908,15 @@ multiclass avx512_extract_elt_w<string OpcodeStr, X86VectorVTInfo _> {
(ins _.RC:$src1, u8imm:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32orGR64:$dst,
- (X86pextrw (_.VT _.RC:$src1), imm:$src2))],
- IIC_SSE_PEXTRW>, EVEX, PD, Sched<[WriteShuffle]>;
+ (X86pextrw (_.VT _.RC:$src1), imm:$src2))]>,
+ EVEX, PD, Sched<[WriteVecExtract]>;
- let hasSideEffects = 0 in
+ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in
def rr_REV : AVX512Ii8<0x15, MRMDestReg, (outs GR32orGR64:$dst),
(ins _.RC:$src1, u8imm:$src2),
- OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
- IIC_SSE_PEXTRW>, EVEX, TAPD, FoldGenData<NAME#rr>,
- Sched<[WriteShuffle]>;
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ EVEX, TAPD, FoldGenData<NAME#rr>,
+ Sched<[WriteVecExtract]>;
defm NAME : avx512_extract_elt_bw_m<0x15, OpcodeStr, X86pextrw, _>, TAPD;
}
@@ -9751,7 +10930,7 @@ multiclass avx512_extract_elt_dq<string OpcodeStr, X86VectorVTInfo _,
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GRC:$dst,
(extractelt (_.VT _.RC:$src1), imm:$src2))]>,
- EVEX, TAPD, Sched<[WriteShuffle]>;
+ EVEX, TAPD, Sched<[WriteVecExtract]>;
def mr : AVX512Ii8<0x16, MRMDestMem, (outs),
(ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2),
@@ -9759,7 +10938,7 @@ multiclass avx512_extract_elt_dq<string OpcodeStr, X86VectorVTInfo _,
[(store (extractelt (_.VT _.RC:$src1),
imm:$src2),addr:$dst)]>,
EVEX, EVEX_CD8<_.EltSize, CD8VT1>, TAPD,
- Sched<[WriteShuffleLd]>;
+ Sched<[WriteVecExtractSt]>;
}
}
@@ -9775,7 +10954,7 @@ multiclass avx512_insert_elt_m<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set _.RC:$dst,
(_.VT (OpNode _.RC:$src1, (LdFrag addr:$src2), imm:$src3)))]>,
- EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+ EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteVecInsertLd, ReadAfterLd]>;
}
multiclass avx512_insert_elt_bw<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -9786,7 +10965,7 @@ multiclass avx512_insert_elt_bw<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set _.RC:$dst,
(OpNode _.RC:$src1, GR32orGR64:$src2, imm:$src3))]>, EVEX_4V,
- Sched<[WriteShuffle]>;
+ Sched<[WriteVecInsert]>;
defm NAME : avx512_insert_elt_m<opc, OpcodeStr, OpNode, _, LdFrag>;
}
@@ -9800,7 +10979,7 @@ multiclass avx512_insert_elt_dq<bits<8> opc, string OpcodeStr,
OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set _.RC:$dst,
(_.VT (insertelt _.RC:$src1, GRC:$src2, imm:$src3)))]>,
- EVEX_4V, TAPD, Sched<[WriteShuffle]>;
+ EVEX_4V, TAPD, Sched<[WriteVecInsert]>;
defm NAME : avx512_insert_elt_m<opc, OpcodeStr, insertelt, _,
_.ScalarLdFrag>, TAPD;
@@ -9819,10 +10998,11 @@ defm VPINSRQZ : avx512_insert_elt_dq<0x22, "vpinsrq", v2i64x_info, GR64>, VEX_W;
//===----------------------------------------------------------------------===//
multiclass avx512_shufp<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I,
- AVX512VLVectorVTInfo VTInfo_FP>{
+ AVX512VLVectorVTInfo VTInfo_FP>{
defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0xC6, X86Shufp,
- SSE_SHUFP>, EVEX_CD8<VTInfo_FP.info512.EltSize, CD8VF>,
- AVX512AIi8Base, EVEX_4V;
+ SchedWriteFShuffle>,
+ EVEX_CD8<VTInfo_FP.info512.EltSize, CD8VF>,
+ AVX512AIi8Base, EVEX_4V;
}
defm VSHUFPS: avx512_shufp<"vshufps", avx512vl_i32_info, avx512vl_f32_info>, PS;
@@ -9832,85 +11012,80 @@ defm VSHUFPD: avx512_shufp<"vshufpd", avx512vl_i64_info, avx512vl_f64_info>, PD,
// AVX-512 - Byte shift Left/Right
//===----------------------------------------------------------------------===//
-let Sched = WriteVecShift in
-def AVX512_BYTESHIFT : OpndItins<
- IIC_SSE_INTSHDQ_P_RI, IIC_SSE_INTSHDQ_P_RI
->;
-
+// FIXME: The SSE/AVX names are PSLLDQri etc. - should we add the i here as well?
multiclass avx512_shift_packed<bits<8> opc, SDNode OpNode, Format MRMr,
Format MRMm, string OpcodeStr,
- OpndItins itins, X86VectorVTInfo _>{
+ X86FoldableSchedWrite sched, X86VectorVTInfo _>{
def rr : AVX512<opc, MRMr,
(outs _.RC:$dst), (ins _.RC:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))],
- itins.rr>, Sched<[itins.Sched]>;
+ [(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>,
+ Sched<[sched]>;
def rm : AVX512<opc, MRMm,
(outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.RC:$dst,(_.VT (OpNode
(_.VT (bitconvert (_.LdFrag addr:$src1))),
- (i8 imm:$src2))))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i8 imm:$src2))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass avx512_shift_packed_all<bits<8> opc, SDNode OpNode, Format MRMr,
Format MRMm, string OpcodeStr,
- OpndItins itins, Predicate prd>{
+ X86SchedWriteWidths sched, Predicate prd>{
let Predicates = [prd] in
- defm Z : avx512_shift_packed<opc, OpNode, MRMr, MRMm,
- OpcodeStr, itins, v64i8_info>, EVEX_V512;
+ defm Z : avx512_shift_packed<opc, OpNode, MRMr, MRMm, OpcodeStr,
+ sched.ZMM, v64i8_info>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_shift_packed<opc, OpNode, MRMr, MRMm,
- OpcodeStr, itins, v32i8x_info>, EVEX_V256;
- defm Z128 : avx512_shift_packed<opc, OpNode, MRMr, MRMm,
- OpcodeStr, itins, v16i8x_info>, EVEX_V128;
+ defm Z256 : avx512_shift_packed<opc, OpNode, MRMr, MRMm, OpcodeStr,
+ sched.YMM, v32i8x_info>, EVEX_V256;
+ defm Z128 : avx512_shift_packed<opc, OpNode, MRMr, MRMm, OpcodeStr,
+ sched.XMM, v16i8x_info>, EVEX_V128;
}
}
defm VPSLLDQ : avx512_shift_packed_all<0x73, X86vshldq, MRM7r, MRM7m, "vpslldq",
- AVX512_BYTESHIFT, HasBWI>, AVX512PDIi8Base,
- EVEX_4V, VEX_WIG;
+ SchedWriteShuffle, HasBWI>,
+ AVX512PDIi8Base, EVEX_4V, VEX_WIG;
defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq",
- AVX512_BYTESHIFT, HasBWI>, AVX512PDIi8Base,
- EVEX_4V, VEX_WIG;
-
+ SchedWriteShuffle, HasBWI>,
+ AVX512PDIi8Base, EVEX_4V, VEX_WIG;
multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode,
- string OpcodeStr, OpndItins itins,
+ string OpcodeStr, X86FoldableSchedWrite sched,
X86VectorVTInfo _dst, X86VectorVTInfo _src> {
def rr : AVX512BI<opc, MRMSrcReg,
(outs _dst.RC:$dst), (ins _src.RC:$src1, _src.RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _dst.RC:$dst,(_dst.VT
(OpNode (_src.VT _src.RC:$src1),
- (_src.VT _src.RC:$src2))))], itins.rr>,
- Sched<[itins.Sched]>;
+ (_src.VT _src.RC:$src2))))]>,
+ Sched<[sched]>;
def rm : AVX512BI<opc, MRMSrcMem,
(outs _dst.RC:$dst), (ins _src.RC:$src1, _src.MemOp:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _dst.RC:$dst,(_dst.VT
(OpNode (_src.VT _src.RC:$src1),
(_src.VT (bitconvert
- (_src.LdFrag addr:$src2))))))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_src.LdFrag addr:$src2))))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode,
- string OpcodeStr, OpndItins itins,
+ string OpcodeStr, X86SchedWriteWidths sched,
Predicate prd> {
let Predicates = [prd] in
- defm Z : avx512_psadbw_packed<opc, OpNode, OpcodeStr, itins, v8i64_info,
- v64i8_info>, EVEX_V512;
+ defm Z : avx512_psadbw_packed<opc, OpNode, OpcodeStr, sched.ZMM,
+ v8i64_info, v64i8_info>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, itins, v4i64x_info,
- v32i8x_info>, EVEX_V256;
- defm Z128 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, itins, v2i64x_info,
- v16i8x_info>, EVEX_V128;
+ defm Z256 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, sched.YMM,
+ v4i64x_info, v32i8x_info>, EVEX_V256;
+ defm Z128 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, sched.XMM,
+ v2i64x_info, v16i8x_info>, EVEX_V128;
}
}
defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw",
- SSE_MPSADBW_ITINS, HasBWI>, EVEX_4V, VEX_WIG;
+ SchedWritePSADBW, HasBWI>, EVEX_4V, VEX_WIG;
// Transforms to swizzle an immediate to enable better matching when
// memory operand isn't in the right place.
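The VPTERNLOG*_imm8 transforms referenced here remap the 8-bit truth-table immediate whenever the ternlog patterns below match the three sources in a different order than the instruction encodes, so a load or broadcast can still be folded into the third operand. The transform bodies are not part of this hunk; the C sketch below only illustrates the underlying bit permutation, assuming the usual encoding where bit i of the immediate is the result for inputs src1 = i[2], src2 = i[1], src3 = i[0], so reordering the sources permutes those index bits.

  #include <stdint.h>

  /* Illustrative sketch, not taken from the patch: remap a vpternlog
     truth-table immediate when the first and third sources trade places
     (the "321" ordering used by several patterns below). */
  static uint8_t ternlog_swap_src1_src3(uint8_t imm) {
    uint8_t out = 0;
    for (int i = 0; i < 8; ++i) {
      /* swap the src1 and src3 index bits of the table index */
      int j = ((i & 1) << 2) | (i & 2) | ((i >> 2) & 1);
      if (imm & (1u << i))
        out |= (uint8_t)(1u << j);
    }
    return out;
  }
  /* Example: 0x0F (~src1) becomes 0x55 (~src3) once the operands swap. */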
@@ -9975,7 +11150,8 @@ def VPTERNLOG312_imm8 : SDNodeXForm<imm, [{
}]>;
multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _>{
+ X86FoldableSchedWrite sched, X86VectorVTInfo _,
+ string Name>{
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, u8imm:$src4),
@@ -9983,17 +11159,17 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_.VT _.RC:$src3),
- (i8 imm:$src4)), itins.rr, 1, 1>,
- AVX512AIi8Base, EVEX_4V, Sched<[itins.Sched]>;
+ (i8 imm:$src4)), 1, 1>,
+ AVX512AIi8Base, EVEX_4V, Sched<[sched]>;
defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3, u8imm:$src4),
OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src4",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_.VT (bitconvert (_.LdFrag addr:$src3))),
- (i8 imm:$src4)), itins.rm, 1, 0>,
+ (i8 imm:$src4)), 1, 0>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>;
defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3, u8imm:$src4),
OpcodeStr, "$src4, ${src3}"##_.BroadcastStr##", $src2",
@@ -10001,32 +11177,32 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
- (i8 imm:$src4)), itins.rm, 1, 0>, EVEX_B,
+ (i8 imm:$src4)), 1, 0>, EVEX_B,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>;
}// Constraints = "$src1 = $dst"
// Additional patterns for matching passthru operand in other positions.
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src3, _.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
_.RC:$src1)),
- (!cast<Instruction>(NAME#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, _.RC:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i8 imm:$src4)),
_.RC:$src1)),
- (!cast<Instruction>(NAME#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, _.RC:$src3, (VPTERNLOG213_imm8 imm:$src4))>;
// Additional patterns for matching loads in other positions.
def : Pat<(_.VT (OpNode (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src2, _.RC:$src1, (i8 imm:$src4))),
- (!cast<Instruction>(NAME#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2,
+ (!cast<Instruction>(Name#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2,
addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
def : Pat<(_.VT (OpNode _.RC:$src1,
(bitconvert (_.LdFrag addr:$src3)),
_.RC:$src2, (i8 imm:$src4))),
- (!cast<Instruction>(NAME#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2,
+ (!cast<Instruction>(Name#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2,
addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
// Additional patterns for matching zero masking with loads in other
@@ -10035,13 +11211,13 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
_.ImmAllZerosV)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src2, (i8 imm:$src4)),
_.ImmAllZerosV)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
// Additional patterns for matching masked loads with different
@@ -10050,42 +11226,42 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src2, (i8 imm:$src4)),
_.RC:$src1)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
_.RC:$src1)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src2, _.RC:$src1,
(bitconvert (_.LdFrag addr:$src3)), (i8 imm:$src4)),
_.RC:$src1)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src2, (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src1, (i8 imm:$src4)),
_.RC:$src1)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src1, _.RC:$src2, (i8 imm:$src4)),
_.RC:$src1)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 imm:$src4))>;
// Additional patterns for matching broadcasts in other positions.
def : Pat<(_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
_.RC:$src2, _.RC:$src1, (i8 imm:$src4))),
- (!cast<Instruction>(NAME#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2,
+ (!cast<Instruction>(Name#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2,
addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
def : Pat<(_.VT (OpNode _.RC:$src1,
(X86VBroadcast (_.ScalarLdFrag addr:$src3)),
_.RC:$src2, (i8 imm:$src4))),
- (!cast<Instruction>(NAME#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2,
+ (!cast<Instruction>(Name#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2,
addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
// Additional patterns for matching zero masking with broadcasts in other
@@ -10094,7 +11270,7 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
_.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
_.ImmAllZerosV)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmbikz) _.RC:$src1,
+ (!cast<Instruction>(Name#_.ZSuffix#rmbikz) _.RC:$src1,
_.KRCWM:$mask, _.RC:$src2, addr:$src3,
(VPTERNLOG321_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
@@ -10102,7 +11278,7 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(X86VBroadcast (_.ScalarLdFrag addr:$src3)),
_.RC:$src2, (i8 imm:$src4)),
_.ImmAllZerosV)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmbikz) _.RC:$src1,
+ (!cast<Instruction>(Name#_.ZSuffix#rmbikz) _.RC:$src1,
_.KRCWM:$mask, _.RC:$src2, addr:$src3,
(VPTERNLOG132_imm8 imm:$src4))>;
@@ -10113,90 +11289,129 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(X86VBroadcast (_.ScalarLdFrag addr:$src3)),
_.RC:$src2, (i8 imm:$src4)),
_.RC:$src1)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
_.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
_.RC:$src1)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src2, _.RC:$src1,
(X86VBroadcast (_.ScalarLdFrag addr:$src3)),
(i8 imm:$src4)), _.RC:$src1)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src2,
(X86VBroadcast (_.ScalarLdFrag addr:$src3)),
_.RC:$src1, (i8 imm:$src4)),
_.RC:$src1)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
_.RC:$src1, _.RC:$src2, (i8 imm:$src4)),
_.RC:$src1)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 imm:$src4))>;
}
-multiclass avx512_common_ternlog<string OpcodeStr, OpndItins itins,
+multiclass avx512_common_ternlog<string OpcodeStr, X86SchedWriteWidths sched,
AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in
- defm Z : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, itins, _.info512>, EVEX_V512;
+ defm Z : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, sched.ZMM,
+ _.info512, NAME>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
- defm Z128 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, itins, _.info128>, EVEX_V128;
- defm Z256 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, itins, _.info256>, EVEX_V256;
+ defm Z128 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, sched.XMM,
+ _.info128, NAME>, EVEX_V128;
+ defm Z256 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, sched.YMM,
+ _.info256, NAME>, EVEX_V256;
}
}
-defm VPTERNLOGD : avx512_common_ternlog<"vpternlogd", SSE_INTALU_ITINS_P,
+defm VPTERNLOGD : avx512_common_ternlog<"vpternlogd", SchedWriteVecALU,
avx512vl_i32_info>;
-defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", SSE_INTALU_ITINS_P,
+defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", SchedWriteVecALU,
avx512vl_i64_info>, VEX_W;
+// Patterns to implement vnot using vpternlog instead of creating all ones
+// using pcmpeq or vpternlog and then xoring with that. The value 15 is chosen
+// so that the result is only dependent on src0. But we use the same source
+// for all operands to prevent a false dependency.
+// TODO: We should maybe have a more generalized algorithm for folding to
+// vpternlog.
+let Predicates = [HasAVX512] in {
+ def : Pat<(v8i64 (xor VR512:$src, (bc_v8i64 (v16i32 immAllOnesV)))),
+ (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
+}
+
+let Predicates = [HasAVX512, NoVLX] in {
+ def : Pat<(v2i64 (xor VR128X:$src, (bc_v2i64 (v4i32 immAllOnesV)))),
+ (EXTRACT_SUBREG
+ (VPTERNLOGQZrri
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (i8 15)), sub_xmm)>;
+ def : Pat<(v4i64 (xor VR256X:$src, (bc_v4i64 (v8i32 immAllOnesV)))),
+ (EXTRACT_SUBREG
+ (VPTERNLOGQZrri
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (i8 15)), sub_ymm)>;
+}
+
+let Predicates = [HasVLX] in {
+ def : Pat<(v2i64 (xor VR128X:$src, (bc_v2i64 (v4i32 immAllOnesV)))),
+ (VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>;
+ def : Pat<(v4i64 (xor VR256X:$src, (bc_v4i64 (v8i32 immAllOnesV)))),
+ (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>;
+}
+
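The patterns just added select a vector NOT (xor with all-ones) directly into VPTERNLOGQ: truth-table immediate 15 (0x0F) depends only on the first source, and feeding the same register to all three operands avoids materializing an all-ones constant and avoids a false dependency on an unrelated register. A minimal intrinsics sketch of the same trick, illustrative only and assuming AVX-512F with immintrin.h:

  #include <immintrin.h>

  /* not_v(x) == ~x: table 0x0F ignores the second and third sources, so
     passing x three times yields bitwise NOT in a single vpternlogq
     without first building an all-ones vector. */
  static inline __m512i not_v(__m512i x) {
    return _mm512_ternarylogic_epi64(x, x, x, 0x0F);
  }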
//===----------------------------------------------------------------------===//
// AVX-512 - FixupImm
//===----------------------------------------------------------------------===//
multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _>{
+ X86FoldableSchedWrite sched, X86VectorVTInfo _,
+ X86VectorVTInfo TblVT>{
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (_.IntVT _.RC:$src3),
+ (TblVT.VT _.RC:$src3),
(i32 imm:$src4),
- (i32 FROUND_CURRENT)), itins.rr>, Sched<[itins.Sched]>;
+ (i32 FROUND_CURRENT))>, Sched<[sched]>;
defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (_.IntVT (bitconvert (_.LdFrag addr:$src3))),
+ (TblVT.VT (bitconvert (TblVT.LdFrag addr:$src3))),
(i32 imm:$src4),
- (i32 FROUND_CURRENT)), itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 FROUND_CURRENT))>,
+ Sched<[sched.Folded, ReadAfterLd]>;
defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, ${src3}"##_.BroadcastStr##", $src2",
"$src2, ${src3}"##_.BroadcastStr##", $src4",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (_.IntVT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
+ (TblVT.VT (X86VBroadcast(TblVT.ScalarLdFrag addr:$src3))),
(i32 imm:$src4),
- (i32 FROUND_CURRENT)), itins.rm>,
- EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 FROUND_CURRENT))>,
+ EVEX_B, Sched<[sched.Folded, ReadAfterLd]>;
} // Constraints = "$src1 = $dst"
}
multiclass avx512_fixupimm_packed_sae<bits<8> opc, string OpcodeStr,
- SDNode OpNode, OpndItins itins,
- X86VectorVTInfo _>{
+ SDNode OpNode, X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, X86VectorVTInfo TblVT>{
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm rrib : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
@@ -10204,15 +11419,15 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
"$src2, $src3, {sae}, $src4",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (_.IntVT _.RC:$src3),
+ (TblVT.VT _.RC:$src3),
(i32 imm:$src4),
- (i32 FROUND_NO_EXC)), itins.rr>,
- EVEX_B, Sched<[itins.Sched]>;
+ (i32 FROUND_NO_EXC))>,
+ EVEX_B, Sched<[sched]>;
}
}
multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _,
X86VectorVTInfo _src3VT> {
let Constraints = "$src1 = $dst" , Predicates = [HasAVX512],
ExeDomain = _.ExeDomain in {
@@ -10223,7 +11438,7 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_.VT _.RC:$src2),
(_src3VT.VT _src3VT.RC:$src3),
(i32 imm:$src4),
- (i32 FROUND_CURRENT)), itins.rr>, Sched<[itins.Sched]>;
+ (i32 FROUND_CURRENT))>, Sched<[sched]>;
defm rrib : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2",
@@ -10232,8 +11447,8 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_.VT _.RC:$src2),
(_src3VT.VT _src3VT.RC:$src3),
(i32 imm:$src4),
- (i32 FROUND_NO_EXC)), itins.rm>,
- EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 FROUND_NO_EXC))>,
+ EVEX_B, Sched<[sched.Folded, ReadAfterLd]>;
defm rmi : AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
@@ -10242,37 +11457,40 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_src3VT.VT (scalar_to_vector
(_src3VT.ScalarLdFrag addr:$src3))),
(i32 imm:$src4),
- (i32 FROUND_CURRENT)), itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 FROUND_CURRENT))>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
-multiclass avx512_fixupimm_packed_all<OpndItins itins, AVX512VLVectorVTInfo _Vec> {
+multiclass avx512_fixupimm_packed_all<X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo _Vec,
+ AVX512VLVectorVTInfo _Tbl> {
let Predicates = [HasAVX512] in
- defm Z : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, itins,
- _Vec.info512>,
- avx512_fixupimm_packed_sae<0x54, "vfixupimm", X86VFixupimm, itins,
- _Vec.info512>, AVX512AIi8Base, EVEX_4V, EVEX_V512;
+ defm Z : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, sched.ZMM,
+ _Vec.info512, _Tbl.info512>,
+ avx512_fixupimm_packed_sae<0x54, "vfixupimm", X86VFixupimm, sched.ZMM,
+ _Vec.info512, _Tbl.info512>, AVX512AIi8Base,
+ EVEX_4V, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
- defm Z128 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, itins,
- _Vec.info128>, AVX512AIi8Base, EVEX_4V, EVEX_V128;
- defm Z256 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, itins,
- _Vec.info256>, AVX512AIi8Base, EVEX_4V, EVEX_V256;
- }
-}
-
-defm VFIXUPIMMSS : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar,
- SSE_ALU_F32S, f32x_info, v4i32x_info>,
- AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
-defm VFIXUPIMMSD : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar,
- SSE_ALU_F64S, f64x_info, v2i64x_info>,
- AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
-defm VFIXUPIMMPS : avx512_fixupimm_packed_all<SSE_ALU_F32P, avx512vl_f32_info>,
- EVEX_CD8<32, CD8VF>;
-defm VFIXUPIMMPD : avx512_fixupimm_packed_all<SSE_ALU_F64P, avx512vl_f64_info>,
- EVEX_CD8<64, CD8VF>, VEX_W;
-
-
+ defm Z128 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, sched.XMM,
+ _Vec.info128, _Tbl.info128>, AVX512AIi8Base,
+ EVEX_4V, EVEX_V128;
+ defm Z256 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, sched.YMM,
+ _Vec.info256, _Tbl.info256>, AVX512AIi8Base,
+ EVEX_4V, EVEX_V256;
+ }
+}
+
+defm VFIXUPIMMSSZ : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar,
+ SchedWriteFAdd.Scl, f32x_info, v4i32x_info>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+defm VFIXUPIMMSDZ : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar,
+ SchedWriteFAdd.Scl, f64x_info, v2i64x_info>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+defm VFIXUPIMMPS : avx512_fixupimm_packed_all<SchedWriteFAdd, avx512vl_f32_info,
+ avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+defm VFIXUPIMMPD : avx512_fixupimm_packed_all<SchedWriteFAdd, avx512vl_f64_info,
+ avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, VEX_W;
// Patterns used to select SSE scalar fp arithmetic instructions from
// either:
@@ -10316,69 +11534,85 @@ defm VFIXUPIMMPD : avx512_fixupimm_packed_all<SSE_ALU_F64P, avx512vl_f64_info>,
// TODO: Some canonicalization in lowering would simplify the number of
// patterns we have to try to match.
-multiclass AVX512_scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
+multiclass AVX512_scalar_math_fp_patterns<SDNode Op, string OpcPrefix, SDNode MoveNode,
+ X86VectorVTInfo _, PatLeaf ZeroFP> {
let Predicates = [HasAVX512] in {
// extracted scalar math op with insert via movss
- def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), (v4f32 (scalar_to_vector
- (Op (f32 (extractelt (v4f32 VR128X:$dst), (iPTR 0))),
- FR32X:$src))))),
- (!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst,
- (COPY_TO_REGCLASS FR32X:$src, VR128X))>;
+ def : Pat<(MoveNode
+ (_.VT VR128X:$dst),
+ (_.VT (scalar_to_vector
+ (Op (_.EltVT (extractelt (_.VT VR128X:$dst), (iPTR 0))),
+ _.FRC:$src)))),
+ (!cast<Instruction>("V"#OpcPrefix#Zrr_Int) _.VT:$dst,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src, VR128X)))>;
- // vector math op with insert via movss
- def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst),
- (Op (v4f32 VR128X:$dst), (v4f32 VR128X:$src)))),
- (!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst, v4f32:$src)>;
+ // extracted masked scalar math op with insert via movss
+ def : Pat<(MoveNode (_.VT VR128X:$src1),
+ (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op (_.EltVT
+ (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src2),
+ _.FRC:$src0))),
+ (!cast<Instruction>("V"#OpcPrefix#Zrr_Intk)
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src0, VR128X)),
+ VK1WM:$mask, _.VT:$src1,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>;
// extracted masked scalar math op with insert via movss
- def : Pat<(X86Movss (v4f32 VR128X:$src1),
+ def : Pat<(MoveNode (_.VT VR128X:$src1),
(scalar_to_vector
(X86selects VK1WM:$mask,
- (Op (f32 (extractelt (v4f32 VR128X:$src1), (iPTR 0))),
- FR32X:$src2),
- FR32X:$src0))),
- (!cast<I>("V"#OpcPrefix#SSZrr_Intk) (COPY_TO_REGCLASS FR32X:$src0, VR128X),
- VK1WM:$mask, v4f32:$src1,
- (COPY_TO_REGCLASS FR32X:$src2, VR128X))>;
+ (Op (_.EltVT
+ (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src2), (_.EltVT ZeroFP)))),
+ (!cast<I>("V"#OpcPrefix#Zrr_Intkz)
+ VK1WM:$mask, _.VT:$src1,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>;
}
}
-defm : AVX512_scalar_math_f32_patterns<fadd, "ADD">;
-defm : AVX512_scalar_math_f32_patterns<fsub, "SUB">;
-defm : AVX512_scalar_math_f32_patterns<fmul, "MUL">;
-defm : AVX512_scalar_math_f32_patterns<fdiv, "DIV">;
+defm : AVX512_scalar_math_fp_patterns<fadd, "ADDSS", X86Movss, v4f32x_info, fp32imm0>;
+defm : AVX512_scalar_math_fp_patterns<fsub, "SUBSS", X86Movss, v4f32x_info, fp32imm0>;
+defm : AVX512_scalar_math_fp_patterns<fmul, "MULSS", X86Movss, v4f32x_info, fp32imm0>;
+defm : AVX512_scalar_math_fp_patterns<fdiv, "DIVSS", X86Movss, v4f32x_info, fp32imm0>;
+
+defm : AVX512_scalar_math_fp_patterns<fadd, "ADDSD", X86Movsd, v2f64x_info, fp64imm0>;
+defm : AVX512_scalar_math_fp_patterns<fsub, "SUBSD", X86Movsd, v2f64x_info, fp64imm0>;
+defm : AVX512_scalar_math_fp_patterns<fmul, "MULSD", X86Movsd, v2f64x_info, fp64imm0>;
+defm : AVX512_scalar_math_fp_patterns<fdiv, "DIVSD", X86Movsd, v2f64x_info, fp64imm0>;
-multiclass AVX512_scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
+multiclass AVX512_scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix,
+ SDNode Move, X86VectorVTInfo _> {
let Predicates = [HasAVX512] in {
- // extracted scalar math op with insert via movsd
- def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), (v2f64 (scalar_to_vector
- (Op (f64 (extractelt (v2f64 VR128X:$dst), (iPTR 0))),
- FR64X:$src))))),
- (!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst,
- (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
-
- // vector math op with insert via movsd
- def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst),
- (Op (v2f64 VR128X:$dst), (v2f64 VR128X:$src)))),
- (!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst, v2f64:$src)>;
+ def : Pat<(_.VT (Move _.VT:$dst,
+ (scalar_to_vector (OpNode (extractelt _.VT:$src, 0))))),
+ (!cast<Instruction>("V"#OpcPrefix#Zr_Int) _.VT:$dst, _.VT:$src)>;
+ }
+}
- // extracted masked scalar math op with insert via movss
- def : Pat<(X86Movsd (v2f64 VR128X:$src1),
- (scalar_to_vector
- (X86selects VK1WM:$mask,
- (Op (f64 (extractelt (v2f64 VR128X:$src1), (iPTR 0))),
- FR64X:$src2),
- FR64X:$src0))),
- (!cast<I>("V"#OpcPrefix#SDZrr_Intk) (COPY_TO_REGCLASS FR64X:$src0, VR128X),
- VK1WM:$mask, v2f64:$src1,
- (COPY_TO_REGCLASS FR64X:$src2, VR128X))>;
+defm : AVX512_scalar_unary_math_patterns<fsqrt, "SQRTSS", X86Movss, v4f32x_info>;
+defm : AVX512_scalar_unary_math_patterns<fsqrt, "SQRTSD", X86Movsd, v2f64x_info>;
+
+multiclass AVX512_scalar_unary_math_imm_patterns<SDNode OpNode, string OpcPrefix,
+ SDNode Move, X86VectorVTInfo _,
+ bits<8> ImmV> {
+ let Predicates = [HasAVX512] in {
+ def : Pat<(_.VT (Move _.VT:$dst,
+ (scalar_to_vector (OpNode (extractelt _.VT:$src, 0))))),
+ (!cast<Instruction>("V"#OpcPrefix#Zr_Int) _.VT:$dst, _.VT:$src,
+ (i32 ImmV))>;
}
}
-defm : AVX512_scalar_math_f64_patterns<fadd, "ADD">;
-defm : AVX512_scalar_math_f64_patterns<fsub, "SUB">;
-defm : AVX512_scalar_math_f64_patterns<fmul, "MUL">;
-defm : AVX512_scalar_math_f64_patterns<fdiv, "DIV">;
+defm : AVX512_scalar_unary_math_imm_patterns<ffloor, "RNDSCALESS", X86Movss,
+ v4f32x_info, 0x01>;
+defm : AVX512_scalar_unary_math_imm_patterns<fceil, "RNDSCALESS", X86Movss,
+ v4f32x_info, 0x02>;
+defm : AVX512_scalar_unary_math_imm_patterns<ffloor, "RNDSCALESD", X86Movsd,
+ v2f64x_info, 0x01>;
+defm : AVX512_scalar_unary_math_imm_patterns<fceil, "RNDSCALESD", X86Movsd,
+ v2f64x_info, 0x02>;
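The scalar-math pattern multiclasses above match the DAG shape "extract lane 0, apply a scalar FP op, reinsert through MOVSS/MOVSD" and select the _Int instruction forms so the upper lanes of the destination are preserved by a single instruction; the unary-immediate variants map ffloor to RNDSCALE immediate 0x01 (round toward -infinity) and fceil to 0x02 (round toward +infinity). A rough C illustration of the source shape these patterns serve, using a hypothetical helper name and standard SSE intrinsics (not taken from this patch):

  #include <math.h>
  #include <xmmintrin.h>

  /* Lane 0 becomes floor(v[0]); lanes 1-3 pass through unchanged, mirroring
     the extract / scalar-op / movss-reinsert shape matched above.  Once the
     floorf call is recognized as llvm.floor, this is the form the patterns
     select into VRNDSCALESS with immediate 0x01. */
  static inline __m128 floor_lane0(__m128 v) {
    float lo = floorf(_mm_cvtss_f32(v));
    return _mm_move_ss(v, _mm_set_ss(lo));
  }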
//===----------------------------------------------------------------------===//
// AES instructions
@@ -10434,27 +11668,27 @@ defm : vpclmulqdq_aliases<"VPCLMULQDQZ256", VR256X, i256mem>;
//===----------------------------------------------------------------------===//
multiclass VBMI2_shift_var_rm<bits<8> Op, string OpStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo VTI> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo VTI> {
let Constraints = "$src1 = $dst",
ExeDomain = VTI.ExeDomain in {
defm r: AVX512_maskable_3src<Op, MRMSrcReg, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.RC:$src3), OpStr,
"$src3, $src2", "$src2, $src3",
- (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, VTI.RC:$src3)),
- itins.rr>, AVX512FMA3Base, Sched<[itins.Sched]>;
+ (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, VTI.RC:$src3))>,
+ AVX512FMA3Base, Sched<[sched]>;
defm m: AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr,
"$src3, $src2", "$src2, $src3",
(VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2,
- (VTI.VT (bitconvert (VTI.LdFrag addr:$src3))))),
- itins.rm>, AVX512FMA3Base,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (VTI.VT (bitconvert (VTI.LdFrag addr:$src3)))))>,
+ AVX512FMA3Base,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo VTI>
- : VBMI2_shift_var_rm<Op, OpStr, OpNode, itins, VTI> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo VTI>
+ : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched, VTI> {
let Constraints = "$src1 = $dst",
ExeDomain = VTI.ExeDomain in
defm mb: AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
@@ -10462,66 +11696,74 @@ multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode,
"${src3}"##VTI.BroadcastStr##", $src2",
"$src2, ${src3}"##VTI.BroadcastStr,
(OpNode VTI.RC:$src1, VTI.RC:$src2,
- (VTI.VT (X86VBroadcast (VTI.ScalarLdFrag addr:$src3)))),
- itins.rm>, AVX512FMA3Base, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (VTI.VT (X86VBroadcast (VTI.ScalarLdFrag addr:$src3))))>,
+ AVX512FMA3Base, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass VBMI2_shift_var_rm_common<bits<8> Op, string OpStr, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo VTI> {
+ X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTI> {
let Predicates = [HasVBMI2] in
- defm Z : VBMI2_shift_var_rm<Op, OpStr, OpNode, itins, VTI.info512>, EVEX_V512;
+ defm Z : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched.ZMM, VTI.info512>,
+ EVEX_V512;
let Predicates = [HasVBMI2, HasVLX] in {
- defm Z256 : VBMI2_shift_var_rm<Op, OpStr, OpNode, itins, VTI.info256>, EVEX_V256;
- defm Z128 : VBMI2_shift_var_rm<Op, OpStr, OpNode, itins, VTI.info128>, EVEX_V128;
+ defm Z256 : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched.YMM, VTI.info256>,
+ EVEX_V256;
+ defm Z128 : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched.XMM, VTI.info128>,
+ EVEX_V128;
}
}
multiclass VBMI2_shift_var_rmb_common<bits<8> Op, string OpStr, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo VTI> {
+ X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTI> {
let Predicates = [HasVBMI2] in
- defm Z : VBMI2_shift_var_rmb<Op, OpStr, OpNode, itins, VTI.info512>, EVEX_V512;
+ defm Z : VBMI2_shift_var_rmb<Op, OpStr, OpNode, sched.ZMM, VTI.info512>,
+ EVEX_V512;
let Predicates = [HasVBMI2, HasVLX] in {
- defm Z256 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, itins, VTI.info256>, EVEX_V256;
- defm Z128 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, itins, VTI.info128>, EVEX_V128;
+ defm Z256 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, sched.YMM, VTI.info256>,
+ EVEX_V256;
+ defm Z128 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, sched.XMM, VTI.info128>,
+ EVEX_V128;
}
}
multiclass VBMI2_shift_var<bits<8> wOp, bits<8> dqOp, string Prefix,
- SDNode OpNode, OpndItins itins> {
- defm W : VBMI2_shift_var_rm_common<wOp, Prefix##"w", OpNode, itins,
+ SDNode OpNode, X86SchedWriteWidths sched> {
+ defm W : VBMI2_shift_var_rm_common<wOp, Prefix##"w", OpNode, sched,
avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>;
- defm D : VBMI2_shift_var_rmb_common<dqOp, Prefix##"d", OpNode, itins,
+ defm D : VBMI2_shift_var_rmb_common<dqOp, Prefix##"d", OpNode, sched,
avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
- defm Q : VBMI2_shift_var_rmb_common<dqOp, Prefix##"q", OpNode, itins,
+ defm Q : VBMI2_shift_var_rmb_common<dqOp, Prefix##"q", OpNode, sched,
avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
}
multiclass VBMI2_shift_imm<bits<8> wOp, bits<8> dqOp, string Prefix,
- SDNode OpNode, OpndItins itins> {
- defm W : avx512_common_3Op_rm_imm8<wOp, OpNode, Prefix##"w", itins,
+ SDNode OpNode, X86SchedWriteWidths sched> {
+ defm W : avx512_common_3Op_rm_imm8<wOp, OpNode, Prefix##"w", sched,
avx512vl_i16_info, avx512vl_i16_info, HasVBMI2>,
VEX_W, EVEX_CD8<16, CD8VF>;
defm D : avx512_common_3Op_imm8<Prefix##"d", avx512vl_i32_info, dqOp,
- OpNode, itins, HasVBMI2>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+ OpNode, sched, HasVBMI2>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
defm Q : avx512_common_3Op_imm8<Prefix##"q", avx512vl_i64_info, dqOp, OpNode,
- itins, HasVBMI2>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+ sched, HasVBMI2>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
}
// Concat & Shift
-defm VPSHLDV : VBMI2_shift_var<0x70, 0x71, "vpshldv", X86VShldv, SSE_INTMUL_ITINS_P>;
-defm VPSHRDV : VBMI2_shift_var<0x72, 0x73, "vpshrdv", X86VShrdv, SSE_INTMUL_ITINS_P>;
-defm VPSHLD : VBMI2_shift_imm<0x70, 0x71, "vpshld", X86VShld, SSE_INTMUL_ITINS_P>;
-defm VPSHRD : VBMI2_shift_imm<0x72, 0x73, "vpshrd", X86VShrd, SSE_INTMUL_ITINS_P>;
+defm VPSHLDV : VBMI2_shift_var<0x70, 0x71, "vpshldv", X86VShldv, SchedWriteVecIMul>;
+defm VPSHRDV : VBMI2_shift_var<0x72, 0x73, "vpshrdv", X86VShrdv, SchedWriteVecIMul>;
+defm VPSHLD : VBMI2_shift_imm<0x70, 0x71, "vpshld", X86VShld, SchedWriteVecIMul>;
+defm VPSHRD : VBMI2_shift_imm<0x72, 0x73, "vpshrd", X86VShrd, SchedWriteVecIMul>;
// Compress
-defm VPCOMPRESSB : compress_by_elt_width<0x63, "vpcompressb", AVX512_COMPRESS,
- avx512vl_i8_info, HasVBMI2>, EVEX;
-defm VPCOMPRESSW : compress_by_elt_width <0x63, "vpcompressw", AVX512_COMPRESS,
- avx512vl_i16_info, HasVBMI2>, EVEX, VEX_W;
+defm VPCOMPRESSB : compress_by_elt_width<0x63, "vpcompressb", WriteVarShuffle256,
+ avx512vl_i8_info, HasVBMI2>, EVEX,
+ NotMemoryFoldable;
+defm VPCOMPRESSW : compress_by_elt_width <0x63, "vpcompressw", WriteVarShuffle256,
+ avx512vl_i16_info, HasVBMI2>, EVEX, VEX_W,
+ NotMemoryFoldable;
// Expand
-defm VPEXPANDB : expand_by_elt_width <0x62, "vpexpandb", AVX512_EXPAND,
+defm VPEXPANDB : expand_by_elt_width <0x62, "vpexpandb", WriteVarShuffle256,
avx512vl_i8_info, HasVBMI2>, EVEX;
-defm VPEXPANDW : expand_by_elt_width <0x62, "vpexpandw", AVX512_EXPAND,
+defm VPEXPANDW : expand_by_elt_width <0x62, "vpexpandw", WriteVarShuffle256,
avx512vl_i16_info, HasVBMI2>, EVEX, VEX_W;
//===----------------------------------------------------------------------===//
@@ -10530,113 +11772,116 @@ defm VPEXPANDW : expand_by_elt_width <0x62, "vpexpandw", AVX512_EXPAND,
let Constraints = "$src1 = $dst" in
multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo VTI> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo VTI> {
defm r : AVX512_maskable_3src<Op, MRMSrcReg, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.RC:$src3), OpStr,
"$src3, $src2", "$src2, $src3",
(VTI.VT (OpNode VTI.RC:$src1,
- VTI.RC:$src2, VTI.RC:$src3)),
- itins.rr>, EVEX_4V, T8PD, Sched<[itins.Sched]>;
+ VTI.RC:$src2, VTI.RC:$src3))>,
+ EVEX_4V, T8PD, Sched<[sched]>;
defm m : AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr,
"$src3, $src2", "$src2, $src3",
(VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2,
(VTI.VT (bitconvert
- (VTI.LdFrag addr:$src3))))),
- itins.rm>, EVEX_4V, EVEX_CD8<32, CD8VF>, T8PD,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (VTI.LdFrag addr:$src3)))))>,
+ EVEX_4V, EVEX_CD8<32, CD8VF>, T8PD,
+ Sched<[sched.Folded, ReadAfterLd]>;
defm mb : AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.ScalarMemOp:$src3),
OpStr, "${src3}"##VTI.BroadcastStr##", $src2",
"$src2, ${src3}"##VTI.BroadcastStr,
(OpNode VTI.RC:$src1, VTI.RC:$src2,
(VTI.VT (X86VBroadcast
- (VTI.ScalarLdFrag addr:$src3)))),
- itins.rm>, EVEX_4V, EVEX_CD8<32, CD8VF>, EVEX_B,
- T8PD, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (VTI.ScalarLdFrag addr:$src3))))>,
+ EVEX_4V, EVEX_CD8<32, CD8VF>, EVEX_B,
+ T8PD, Sched<[sched.Folded, ReadAfterLd]>;
}
-multiclass VNNI_common<bits<8> Op, string OpStr, SDNode OpNode, OpndItins itins> {
+multiclass VNNI_common<bits<8> Op, string OpStr, SDNode OpNode,
+ X86SchedWriteWidths sched> {
let Predicates = [HasVNNI] in
- defm Z : VNNI_rmb<Op, OpStr, OpNode, itins, v16i32_info>, EVEX_V512;
+ defm Z : VNNI_rmb<Op, OpStr, OpNode, sched.ZMM, v16i32_info>, EVEX_V512;
let Predicates = [HasVNNI, HasVLX] in {
- defm Z256 : VNNI_rmb<Op, OpStr, OpNode, itins, v8i32x_info>, EVEX_V256;
- defm Z128 : VNNI_rmb<Op, OpStr, OpNode, itins, v4i32x_info>, EVEX_V128;
+ defm Z256 : VNNI_rmb<Op, OpStr, OpNode, sched.YMM, v8i32x_info>, EVEX_V256;
+ defm Z128 : VNNI_rmb<Op, OpStr, OpNode, sched.XMM, v4i32x_info>, EVEX_V128;
}
}
-// FIXME: Is there a better scheduler itinerary for VPDP?
-defm VPDPBUSD : VNNI_common<0x50, "vpdpbusd", X86Vpdpbusd, SSE_PMADD>;
-defm VPDPBUSDS : VNNI_common<0x51, "vpdpbusds", X86Vpdpbusds, SSE_PMADD>;
-defm VPDPWSSD : VNNI_common<0x52, "vpdpwssd", X86Vpdpwssd, SSE_PMADD>;
-defm VPDPWSSDS : VNNI_common<0x53, "vpdpwssds", X86Vpdpwssds, SSE_PMADD>;
+// FIXME: Is there a better scheduler class for VPDP?
+defm VPDPBUSD : VNNI_common<0x50, "vpdpbusd", X86Vpdpbusd, SchedWriteVecIMul>;
+defm VPDPBUSDS : VNNI_common<0x51, "vpdpbusds", X86Vpdpbusds, SchedWriteVecIMul>;
+defm VPDPWSSD : VNNI_common<0x52, "vpdpwssd", X86Vpdpwssd, SchedWriteVecIMul>;
+defm VPDPWSSDS : VNNI_common<0x53, "vpdpwssds", X86Vpdpwssds, SchedWriteVecIMul>;
//===----------------------------------------------------------------------===//
// Bit Algorithms
//===----------------------------------------------------------------------===//
-// FIXME: Is there a better scheduler itinerary for VPOPCNTB/VPOPCNTW?
-defm VPOPCNTB : avx512_unary_rm_vl<0x54, "vpopcntb", ctpop, SSE_INTALU_ITINS_P,
+// FIXME: Is there a better scheduler class for VPOPCNTB/VPOPCNTW?
+defm VPOPCNTB : avx512_unary_rm_vl<0x54, "vpopcntb", ctpop, SchedWriteVecALU,
avx512vl_i8_info, HasBITALG>;
-defm VPOPCNTW : avx512_unary_rm_vl<0x54, "vpopcntw", ctpop, SSE_INTALU_ITINS_P,
+defm VPOPCNTW : avx512_unary_rm_vl<0x54, "vpopcntw", ctpop, SchedWriteVecALU,
avx512vl_i16_info, HasBITALG>, VEX_W;
defm : avx512_unary_lowering<"VPOPCNTB", ctpop, avx512vl_i8_info, HasBITALG>;
defm : avx512_unary_lowering<"VPOPCNTW", ctpop, avx512vl_i16_info, HasBITALG>;
-multiclass VPSHUFBITQMB_rm<OpndItins itins, X86VectorVTInfo VTI> {
+multiclass VPSHUFBITQMB_rm<X86FoldableSchedWrite sched, X86VectorVTInfo VTI> {
defm rr : AVX512_maskable_cmp<0x8F, MRMSrcReg, VTI, (outs VTI.KRC:$dst),
(ins VTI.RC:$src1, VTI.RC:$src2),
"vpshufbitqmb",
"$src2, $src1", "$src1, $src2",
(X86Vpshufbitqmb (VTI.VT VTI.RC:$src1),
- (VTI.VT VTI.RC:$src2)), itins.rr>, EVEX_4V, T8PD,
- Sched<[itins.Sched]>;
+ (VTI.VT VTI.RC:$src2))>, EVEX_4V, T8PD,
+ Sched<[sched]>;
defm rm : AVX512_maskable_cmp<0x8F, MRMSrcMem, VTI, (outs VTI.KRC:$dst),
(ins VTI.RC:$src1, VTI.MemOp:$src2),
"vpshufbitqmb",
"$src2, $src1", "$src1, $src2",
(X86Vpshufbitqmb (VTI.VT VTI.RC:$src1),
- (VTI.VT (bitconvert (VTI.LdFrag addr:$src2)))),
- itins.rm>, EVEX_4V, EVEX_CD8<8, CD8VF>, T8PD,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (VTI.VT (bitconvert (VTI.LdFrag addr:$src2))))>,
+ EVEX_4V, EVEX_CD8<8, CD8VF>, T8PD,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
-multiclass VPSHUFBITQMB_common<OpndItins itins, AVX512VLVectorVTInfo VTI> {
+multiclass VPSHUFBITQMB_common<X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTI> {
let Predicates = [HasBITALG] in
- defm Z : VPSHUFBITQMB_rm<itins, VTI.info512>, EVEX_V512;
+ defm Z : VPSHUFBITQMB_rm<sched.ZMM, VTI.info512>, EVEX_V512;
let Predicates = [HasBITALG, HasVLX] in {
- defm Z256 : VPSHUFBITQMB_rm<itins, VTI.info256>, EVEX_V256;
- defm Z128 : VPSHUFBITQMB_rm<itins, VTI.info128>, EVEX_V128;
+ defm Z256 : VPSHUFBITQMB_rm<sched.YMM, VTI.info256>, EVEX_V256;
+ defm Z128 : VPSHUFBITQMB_rm<sched.XMM, VTI.info128>, EVEX_V128;
}
}
-// FIXME: Is there a better scheduler itinerary for VPSHUFBITQMB?
-defm VPSHUFBITQMB : VPSHUFBITQMB_common<SSE_INTMUL_ITINS_P, avx512vl_i8_info>;
+// FIXME: Is there a better scheduler class for VPSHUFBITQMB?
+defm VPSHUFBITQMB : VPSHUFBITQMB_common<SchedWriteVecIMul, avx512vl_i8_info>;
//===----------------------------------------------------------------------===//
// GFNI
//===----------------------------------------------------------------------===//
-multiclass GF2P8MULB_avx512_common<bits<8> Op, string OpStr, SDNode OpNode> {
+multiclass GF2P8MULB_avx512_common<bits<8> Op, string OpStr, SDNode OpNode,
+ X86SchedWriteWidths sched> {
let Predicates = [HasGFNI, HasAVX512, HasBWI] in
- defm Z : avx512_binop_rm<Op, OpStr, OpNode, v64i8_info,
- SSE_INTALU_ITINS_P, 1>, EVEX_V512;
+ defm Z : avx512_binop_rm<Op, OpStr, OpNode, v64i8_info, sched.ZMM, 1>,
+ EVEX_V512;
let Predicates = [HasGFNI, HasVLX, HasBWI] in {
- defm Z256 : avx512_binop_rm<Op, OpStr, OpNode, v32i8x_info,
- SSE_INTALU_ITINS_P, 1>, EVEX_V256;
- defm Z128 : avx512_binop_rm<Op, OpStr, OpNode, v16i8x_info,
- SSE_INTALU_ITINS_P, 1>, EVEX_V128;
+ defm Z256 : avx512_binop_rm<Op, OpStr, OpNode, v32i8x_info, sched.YMM, 1>,
+ EVEX_V256;
+ defm Z128 : avx512_binop_rm<Op, OpStr, OpNode, v16i8x_info, sched.XMM, 1>,
+ EVEX_V128;
}
}
-defm GF2P8MULB : GF2P8MULB_avx512_common<0xCF, "vgf2p8mulb", X86GF2P8mulb>,
- EVEX_CD8<8, CD8VF>, T8PD;
+defm VGF2P8MULB : GF2P8MULB_avx512_common<0xCF, "vgf2p8mulb", X86GF2P8mulb,
+ SchedWriteVecALU>,
+ EVEX_CD8<8, CD8VF>, T8PD;
multiclass GF2P8AFFINE_avx512_rmb_imm<bits<8> Op, string OpStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo VTI,
+ X86FoldableSchedWrite sched, X86VectorVTInfo VTI,
X86VectorVTInfo BcstVTI>
- : avx512_3Op_rm_imm8<Op, OpStr, OpNode, itins, VTI, VTI> {
+ : avx512_3Op_rm_imm8<Op, OpStr, OpNode, sched, VTI, VTI> {
let ExeDomain = VTI.ExeDomain in
defm rmbi : AVX512_maskable<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src1, VTI.ScalarMemOp:$src2, u8imm:$src3),
@@ -10644,27 +11889,78 @@ multiclass GF2P8AFFINE_avx512_rmb_imm<bits<8> Op, string OpStr, SDNode OpNode,
"$src1, ${src2}"##BcstVTI.BroadcastStr##", $src3",
(OpNode (VTI.VT VTI.RC:$src1),
(bitconvert (BcstVTI.VT (X86VBroadcast (loadi64 addr:$src2)))),
- (i8 imm:$src3)), itins.rm>, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i8 imm:$src3))>, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass GF2P8AFFINE_avx512_common<bits<8> Op, string OpStr, SDNode OpNode,
- OpndItins itins> {
+ X86SchedWriteWidths sched> {
let Predicates = [HasGFNI, HasAVX512, HasBWI] in
- defm Z : GF2P8AFFINE_avx512_rmb_imm<Op, OpStr, OpNode, itins, v64i8_info,
- v8i64_info>, EVEX_V512;
+ defm Z : GF2P8AFFINE_avx512_rmb_imm<Op, OpStr, OpNode, sched.ZMM,
+ v64i8_info, v8i64_info>, EVEX_V512;
let Predicates = [HasGFNI, HasVLX, HasBWI] in {
- defm Z256 : GF2P8AFFINE_avx512_rmb_imm<Op, OpStr, OpNode, itins, v32i8x_info,
- v4i64x_info>, EVEX_V256;
- defm Z128 : GF2P8AFFINE_avx512_rmb_imm<Op, OpStr, OpNode, itins, v16i8x_info,
- v2i64x_info>, EVEX_V128;
+ defm Z256 : GF2P8AFFINE_avx512_rmb_imm<Op, OpStr, OpNode, sched.YMM,
+ v32i8x_info, v4i64x_info>, EVEX_V256;
+ defm Z128 : GF2P8AFFINE_avx512_rmb_imm<Op, OpStr, OpNode, sched.XMM,
+ v16i8x_info, v2i64x_info>, EVEX_V128;
}
}
-defm GF2P8AFFINEINVQB : GF2P8AFFINE_avx512_common<0xCF, "vgf2p8affineinvqb",
- X86GF2P8affineinvqb, SSE_INTMUL_ITINS_P>,
- EVEX_4V, EVEX_CD8<8, CD8VF>, VEX_W, AVX512AIi8Base;
-defm GF2P8AFFINEQB : GF2P8AFFINE_avx512_common<0xCE, "vgf2p8affineqb",
- X86GF2P8affineqb, SSE_INTMUL_ITINS_P>,
- EVEX_4V, EVEX_CD8<8, CD8VF>, VEX_W, AVX512AIi8Base;
+defm VGF2P8AFFINEINVQB : GF2P8AFFINE_avx512_common<0xCF, "vgf2p8affineinvqb",
+ X86GF2P8affineinvqb, SchedWriteVecIMul>,
+ EVEX_4V, EVEX_CD8<8, CD8VF>, VEX_W, AVX512AIi8Base;
+defm VGF2P8AFFINEQB : GF2P8AFFINE_avx512_common<0xCE, "vgf2p8affineqb",
+ X86GF2P8affineqb, SchedWriteVecIMul>,
+ EVEX_4V, EVEX_CD8<8, CD8VF>, VEX_W, AVX512AIi8Base;
+
+
+//===----------------------------------------------------------------------===//
+// AVX5124FMAPS
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0, mayLoad = 1, ExeDomain = SSEPackedSingle,
+ Constraints = "$src1 = $dst" in {
+defm V4FMADDPSrm : AVX512_maskable_3src_in_asm<0x9A, MRMSrcMem, v16f32_info,
+ (outs VR512:$dst), (ins VR512:$src2, f128mem:$src3),
+ "v4fmaddps", "$src3, $src2", "$src2, $src3",
+ []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>,
+ Sched<[SchedWriteFMA.ZMM.Folded]>;
+
+defm V4FNMADDPSrm : AVX512_maskable_3src_in_asm<0xAA, MRMSrcMem, v16f32_info,
+ (outs VR512:$dst), (ins VR512:$src2, f128mem:$src3),
+ "v4fnmaddps", "$src3, $src2", "$src2, $src3",
+ []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>,
+ Sched<[SchedWriteFMA.ZMM.Folded]>;
+
+defm V4FMADDSSrm : AVX512_maskable_3src_in_asm<0x9B, MRMSrcMem, f32x_info,
+ (outs VR128X:$dst), (ins VR128X:$src2, f128mem:$src3),
+ "v4fmaddss", "$src3, $src2", "$src2, $src3",
+ []>, EVEX_V128, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>,
+ Sched<[SchedWriteFMA.Scl.Folded]>;
+
+defm V4FNMADDSSrm : AVX512_maskable_3src_in_asm<0xAB, MRMSrcMem, f32x_info,
+ (outs VR128X:$dst), (ins VR128X:$src2, f128mem:$src3),
+ "v4fnmaddss", "$src3, $src2", "$src2, $src3",
+ []>, EVEX_V128, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>,
+ Sched<[SchedWriteFMA.Scl.Folded]>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX5124VNNIW
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0, mayLoad = 1, ExeDomain = SSEPackedInt,
+ Constraints = "$src1 = $dst" in {
+defm VP4DPWSSDrm : AVX512_maskable_3src_in_asm<0x52, MRMSrcMem, v16i32_info,
+ (outs VR512:$dst), (ins VR512:$src2, f128mem:$src3),
+ "vp4dpwssd", "$src3, $src2", "$src2, $src3",
+ []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>,
+ Sched<[SchedWriteFMA.ZMM.Folded]>;
+
+defm VP4DPWSSDSrm : AVX512_maskable_3src_in_asm<0x53, MRMSrcMem, v16i32_info,
+ (outs VR512:$dst), (ins VR512:$src2, f128mem:$src3),
+ "vp4dpwssds", "$src3, $src2", "$src2, $src3",
+ []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>,
+ Sched<[SchedWriteFMA.ZMM.Folded]>;
+}
diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td
index d09deb5b7584..c444fa761960 100644
--- a/lib/Target/X86/X86InstrArithmetic.td
+++ b/lib/Target/X86/X86InstrArithmetic.td
@@ -18,24 +18,24 @@ let SchedRW = [WriteLEA] in {
let hasSideEffects = 0 in
def LEA16r : I<0x8D, MRMSrcMem,
(outs GR16:$dst), (ins anymem:$src),
- "lea{w}\t{$src|$dst}, {$dst|$src}", [], IIC_LEA_16>, OpSize16;
+ "lea{w}\t{$src|$dst}, {$dst|$src}", []>, OpSize16;
let isReMaterializable = 1 in
def LEA32r : I<0x8D, MRMSrcMem,
(outs GR32:$dst), (ins anymem:$src),
"lea{l}\t{$src|$dst}, {$dst|$src}",
- [(set GR32:$dst, lea32addr:$src)], IIC_LEA>,
+ [(set GR32:$dst, lea32addr:$src)]>,
OpSize32, Requires<[Not64BitMode]>;
def LEA64_32r : I<0x8D, MRMSrcMem,
(outs GR32:$dst), (ins lea64_32mem:$src),
"lea{l}\t{$src|$dst}, {$dst|$src}",
- [(set GR32:$dst, lea64_32addr:$src)], IIC_LEA>,
+ [(set GR32:$dst, lea64_32addr:$src)]>,
OpSize32, Requires<[In64BitMode]>;
let isReMaterializable = 1 in
def LEA64r : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins lea64mem:$src),
"lea{q}\t{$src|$dst}, {$dst|$src}",
- [(set GR64:$dst, lea64addr:$src)], IIC_LEA>;
+ [(set GR64:$dst, lea64addr:$src)]>;
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -63,24 +63,24 @@ def MUL8r : I<0xF6, MRM4r, (outs), (ins GR8:$src), "mul{b}\t$src",
// This probably ought to be moved to a def : Pat<> if the
// syntax can be accepted.
[(set AL, (mul AL, GR8:$src)),
- (implicit EFLAGS)], IIC_MUL8>, Sched<[WriteIMul]>;
+ (implicit EFLAGS)]>, Sched<[WriteIMul]>;
// AX,DX = AX*GR16
let Defs = [AX,DX,EFLAGS], Uses = [AX], hasSideEffects = 0 in
def MUL16r : I<0xF7, MRM4r, (outs), (ins GR16:$src),
"mul{w}\t$src",
- [], IIC_MUL16_REG>, OpSize16, Sched<[WriteIMul]>;
+ []>, OpSize16, Sched<[WriteIMul]>;
// EAX,EDX = EAX*GR32
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], hasSideEffects = 0 in
def MUL32r : I<0xF7, MRM4r, (outs), (ins GR32:$src),
"mul{l}\t$src",
- [/*(set EAX, EDX, EFLAGS, (X86umul_flag EAX, GR32:$src))*/],
- IIC_MUL32_REG>, OpSize32, Sched<[WriteIMul]>;
+ [/*(set EAX, EDX, EFLAGS, (X86umul_flag EAX, GR32:$src))*/]>,
+ OpSize32, Sched<[WriteIMul]>;
// RAX,RDX = RAX*GR64
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], hasSideEffects = 0 in
def MUL64r : RI<0xF7, MRM4r, (outs), (ins GR64:$src),
"mul{q}\t$src",
- [/*(set RAX, RDX, EFLAGS, (X86umul_flag RAX, GR64:$src))*/],
- IIC_MUL64>, Sched<[WriteIMul]>;
+ [/*(set RAX, RDX, EFLAGS, (X86umul_flag RAX, GR64:$src))*/]>,
+ Sched<[WriteIMul64]>;
// AL,AH = AL*[mem8]
let Defs = [AL,EFLAGS,AX], Uses = [AL] in
def MUL8m : I<0xF6, MRM4m, (outs), (ins i8mem :$src),
@@ -89,62 +89,58 @@ def MUL8m : I<0xF6, MRM4m, (outs), (ins i8mem :$src),
// This probably ought to be moved to a def : Pat<> if the
// syntax can be accepted.
[(set AL, (mul AL, (loadi8 addr:$src))),
- (implicit EFLAGS)], IIC_MUL8>, SchedLoadReg<WriteIMulLd>;
+ (implicit EFLAGS)]>, SchedLoadReg<WriteIMul.Folded>;
// AX,DX = AX*[mem16]
let mayLoad = 1, hasSideEffects = 0 in {
let Defs = [AX,DX,EFLAGS], Uses = [AX] in
def MUL16m : I<0xF7, MRM4m, (outs), (ins i16mem:$src),
- "mul{w}\t$src",
- [], IIC_MUL16_MEM>, OpSize16, SchedLoadReg<WriteIMulLd>;
+ "mul{w}\t$src", []>, OpSize16, SchedLoadReg<WriteIMul.Folded>;
// EAX,EDX = EAX*[mem32]
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
def MUL32m : I<0xF7, MRM4m, (outs), (ins i32mem:$src),
- "mul{l}\t$src",
- [], IIC_MUL32_MEM>, OpSize32, SchedLoadReg<WriteIMulLd>;
+ "mul{l}\t$src", []>, OpSize32, SchedLoadReg<WriteIMul.Folded>;
// RAX,RDX = RAX*[mem64]
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
def MUL64m : RI<0xF7, MRM4m, (outs), (ins i64mem:$src),
- "mul{q}\t$src", [], IIC_MUL64>, SchedLoadReg<WriteIMulLd>,
+ "mul{q}\t$src", []>, SchedLoadReg<WriteIMul64.Folded>,
Requires<[In64BitMode]>;
}
let hasSideEffects = 0 in {
// AL,AH = AL*GR8
let Defs = [AL,EFLAGS,AX], Uses = [AL] in
-def IMUL8r : I<0xF6, MRM5r, (outs), (ins GR8:$src), "imul{b}\t$src", [],
- IIC_IMUL8>, Sched<[WriteIMul]>;
+def IMUL8r : I<0xF6, MRM5r, (outs), (ins GR8:$src), "imul{b}\t$src", []>,
+ Sched<[WriteIMul]>;
// AX,DX = AX*GR16
let Defs = [AX,DX,EFLAGS], Uses = [AX] in
-def IMUL16r : I<0xF7, MRM5r, (outs), (ins GR16:$src), "imul{w}\t$src", [],
- IIC_IMUL16_RR>, OpSize16, Sched<[WriteIMul]>;
+def IMUL16r : I<0xF7, MRM5r, (outs), (ins GR16:$src), "imul{w}\t$src", []>,
+ OpSize16, Sched<[WriteIMul]>;
// EAX,EDX = EAX*GR32
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
-def IMUL32r : I<0xF7, MRM5r, (outs), (ins GR32:$src), "imul{l}\t$src", [],
- IIC_IMUL32_RR>, OpSize32, Sched<[WriteIMul]>;
+def IMUL32r : I<0xF7, MRM5r, (outs), (ins GR32:$src), "imul{l}\t$src", []>,
+ OpSize32, Sched<[WriteIMul]>;
// RAX,RDX = RAX*GR64
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
-def IMUL64r : RI<0xF7, MRM5r, (outs), (ins GR64:$src), "imul{q}\t$src", [],
- IIC_IMUL64_RR>, Sched<[WriteIMul]>;
+def IMUL64r : RI<0xF7, MRM5r, (outs), (ins GR64:$src), "imul{q}\t$src", []>,
+ Sched<[WriteIMul64]>;
let mayLoad = 1 in {
// AL,AH = AL*[mem8]
let Defs = [AL,EFLAGS,AX], Uses = [AL] in
def IMUL8m : I<0xF6, MRM5m, (outs), (ins i8mem :$src),
- "imul{b}\t$src", [], IIC_IMUL8>, SchedLoadReg<WriteIMulLd>;
+ "imul{b}\t$src", []>, SchedLoadReg<WriteIMul.Folded>;
// AX,DX = AX*[mem16]
let Defs = [AX,DX,EFLAGS], Uses = [AX] in
def IMUL16m : I<0xF7, MRM5m, (outs), (ins i16mem:$src),
- "imul{w}\t$src", [], IIC_IMUL16_MEM>, OpSize16,
- SchedLoadReg<WriteIMulLd>;
+ "imul{w}\t$src", []>, OpSize16, SchedLoadReg<WriteIMul.Folded>;
// EAX,EDX = EAX*[mem32]
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
def IMUL32m : I<0xF7, MRM5m, (outs), (ins i32mem:$src),
- "imul{l}\t$src", [], IIC_IMUL32_MEM>, OpSize32,
- SchedLoadReg<WriteIMulLd>;
+ "imul{l}\t$src", []>, OpSize32, SchedLoadReg<WriteIMul.Folded>;
// RAX,RDX = RAX*[mem64]
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
def IMUL64m : RI<0xF7, MRM5m, (outs), (ins i64mem:$src),
- "imul{q}\t$src", [], IIC_IMUL64>, SchedLoadReg<WriteIMulLd>,
+ "imul{q}\t$src", []>, SchedLoadReg<WriteIMul64.Folded>,
Requires<[In64BitMode]>;
}
} // hasSideEffects
@@ -153,218 +149,195 @@ def IMUL64m : RI<0xF7, MRM5m, (outs), (ins i64mem:$src),
let Defs = [EFLAGS] in {
let Constraints = "$src1 = $dst" in {
-let isCommutable = 1, SchedRW = [WriteIMul] in {
+let isCommutable = 1 in {
// X = IMUL Y, Z --> X = IMUL Z, Y
// Register-Register Signed Integer Multiply
def IMUL16rr : I<0xAF, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2),
"imul{w}\t{$src2, $dst|$dst, $src2}",
[(set GR16:$dst, EFLAGS,
- (X86smul_flag GR16:$src1, GR16:$src2))], IIC_IMUL16_RR>,
- TB, OpSize16;
+ (X86smul_flag GR16:$src1, GR16:$src2))]>,
+ Sched<[WriteIMul]>, TB, OpSize16;
def IMUL32rr : I<0xAF, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1,GR32:$src2),
"imul{l}\t{$src2, $dst|$dst, $src2}",
[(set GR32:$dst, EFLAGS,
- (X86smul_flag GR32:$src1, GR32:$src2))], IIC_IMUL32_RR>,
- TB, OpSize32;
+ (X86smul_flag GR32:$src1, GR32:$src2))]>,
+ Sched<[WriteIMul]>, TB, OpSize32;
def IMUL64rr : RI<0xAF, MRMSrcReg, (outs GR64:$dst),
(ins GR64:$src1, GR64:$src2),
"imul{q}\t{$src2, $dst|$dst, $src2}",
[(set GR64:$dst, EFLAGS,
- (X86smul_flag GR64:$src1, GR64:$src2))], IIC_IMUL64_RR>,
- TB;
-} // isCommutable, SchedRW
+ (X86smul_flag GR64:$src1, GR64:$src2))]>,
+ Sched<[WriteIMul64]>, TB;
+} // isCommutable
// Register-Memory Signed Integer Multiply
-let SchedRW = [WriteIMulLd, ReadAfterLd] in {
def IMUL16rm : I<0xAF, MRMSrcMem, (outs GR16:$dst),
(ins GR16:$src1, i16mem:$src2),
"imul{w}\t{$src2, $dst|$dst, $src2}",
[(set GR16:$dst, EFLAGS,
- (X86smul_flag GR16:$src1, (load addr:$src2)))],
- IIC_IMUL16_RM>,
- TB, OpSize16;
+ (X86smul_flag GR16:$src1, (loadi16 addr:$src2)))]>,
+ Sched<[WriteIMul.Folded, ReadAfterLd]>, TB, OpSize16;
def IMUL32rm : I<0xAF, MRMSrcMem, (outs GR32:$dst),
(ins GR32:$src1, i32mem:$src2),
"imul{l}\t{$src2, $dst|$dst, $src2}",
[(set GR32:$dst, EFLAGS,
- (X86smul_flag GR32:$src1, (load addr:$src2)))],
- IIC_IMUL32_RM>,
- TB, OpSize32;
+ (X86smul_flag GR32:$src1, (loadi32 addr:$src2)))]>,
+ Sched<[WriteIMul.Folded, ReadAfterLd]>, TB, OpSize32;
def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst),
(ins GR64:$src1, i64mem:$src2),
"imul{q}\t{$src2, $dst|$dst, $src2}",
[(set GR64:$dst, EFLAGS,
- (X86smul_flag GR64:$src1, (load addr:$src2)))],
- IIC_IMUL64_RM>,
- TB;
-} // SchedRW
+ (X86smul_flag GR64:$src1, (loadi64 addr:$src2)))]>,
+ Sched<[WriteIMul64.Folded, ReadAfterLd]>, TB;
} // Constraints = "$src1 = $dst"
} // Defs = [EFLAGS]
// Surprisingly enough, these are not two address instructions!
let Defs = [EFLAGS] in {
-let SchedRW = [WriteIMul] in {
// Register-Integer Signed Integer Multiply
def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16
(outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
"imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR16:$dst, EFLAGS,
- (X86smul_flag GR16:$src1, imm:$src2))],
- IIC_IMUL16_RRI>, OpSize16;
+ (X86smul_flag GR16:$src1, imm:$src2))]>,
+ Sched<[WriteIMul]>, OpSize16;
def IMUL16rri8 : Ii8<0x6B, MRMSrcReg, // GR16 = GR16*I8
(outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
"imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR16:$dst, EFLAGS,
- (X86smul_flag GR16:$src1, i16immSExt8:$src2))],
- IIC_IMUL16_RRI>, OpSize16;
+ (X86smul_flag GR16:$src1, i16immSExt8:$src2))]>,
+ Sched<[WriteIMul]>, OpSize16;
def IMUL32rri : Ii32<0x69, MRMSrcReg, // GR32 = GR32*I32
(outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
"imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32:$dst, EFLAGS,
- (X86smul_flag GR32:$src1, imm:$src2))],
- IIC_IMUL32_RRI>, OpSize32;
+ (X86smul_flag GR32:$src1, imm:$src2))]>,
+ Sched<[WriteIMul]>, OpSize32;
def IMUL32rri8 : Ii8<0x6B, MRMSrcReg, // GR32 = GR32*I8
(outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
"imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32:$dst, EFLAGS,
- (X86smul_flag GR32:$src1, i32immSExt8:$src2))],
- IIC_IMUL32_RRI>, OpSize32;
+ (X86smul_flag GR32:$src1, i32immSExt8:$src2))]>,
+ Sched<[WriteIMul]>, OpSize32;
def IMUL64rri32 : RIi32S<0x69, MRMSrcReg, // GR64 = GR64*I32
(outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
"imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR64:$dst, EFLAGS,
- (X86smul_flag GR64:$src1, i64immSExt32:$src2))],
- IIC_IMUL64_RRI>;
+ (X86smul_flag GR64:$src1, i64immSExt32:$src2))]>,
+ Sched<[WriteIMul64]>;
def IMUL64rri8 : RIi8<0x6B, MRMSrcReg, // GR64 = GR64*I8
(outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
"imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR64:$dst, EFLAGS,
- (X86smul_flag GR64:$src1, i64immSExt8:$src2))],
- IIC_IMUL64_RRI>;
-} // SchedRW
+ (X86smul_flag GR64:$src1, i64immSExt8:$src2))]>,
+ Sched<[WriteIMul64]>;
// Memory-Integer Signed Integer Multiply
-let SchedRW = [WriteIMulLd] in {
def IMUL16rmi : Ii16<0x69, MRMSrcMem, // GR16 = [mem16]*I16
(outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2),
"imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR16:$dst, EFLAGS,
- (X86smul_flag (load addr:$src1), imm:$src2))],
- IIC_IMUL16_RMI>,
- OpSize16;
+ (X86smul_flag (loadi16 addr:$src1), imm:$src2))]>,
+ Sched<[WriteIMul.Folded]>, OpSize16;
def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem, // GR16 = [mem16]*I8
(outs GR16:$dst), (ins i16mem:$src1, i16i8imm :$src2),
"imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR16:$dst, EFLAGS,
- (X86smul_flag (load addr:$src1),
- i16immSExt8:$src2))], IIC_IMUL16_RMI>,
- OpSize16;
+ (X86smul_flag (loadi16 addr:$src1),
+ i16immSExt8:$src2))]>,
+ Sched<[WriteIMul.Folded]>, OpSize16;
def IMUL32rmi : Ii32<0x69, MRMSrcMem, // GR32 = [mem32]*I32
(outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2),
"imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32:$dst, EFLAGS,
- (X86smul_flag (load addr:$src1), imm:$src2))],
- IIC_IMUL32_RMI>, OpSize32;
+ (X86smul_flag (loadi32 addr:$src1), imm:$src2))]>,
+ Sched<[WriteIMul.Folded]>, OpSize32;
def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem, // GR32 = [mem32]*I8
(outs GR32:$dst), (ins i32mem:$src1, i32i8imm: $src2),
"imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32:$dst, EFLAGS,
- (X86smul_flag (load addr:$src1),
- i32immSExt8:$src2))],
- IIC_IMUL32_RMI>, OpSize32;
+ (X86smul_flag (loadi32 addr:$src1),
+ i32immSExt8:$src2))]>,
+ Sched<[WriteIMul.Folded]>, OpSize32;
def IMUL64rmi32 : RIi32S<0x69, MRMSrcMem, // GR64 = [mem64]*I32
(outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2),
"imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR64:$dst, EFLAGS,
- (X86smul_flag (load addr:$src1),
- i64immSExt32:$src2))],
- IIC_IMUL64_RMI>;
+ (X86smul_flag (loadi64 addr:$src1),
+ i64immSExt32:$src2))]>,
+ Sched<[WriteIMul64.Folded]>;
def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8
(outs GR64:$dst), (ins i64mem:$src1, i64i8imm: $src2),
"imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR64:$dst, EFLAGS,
- (X86smul_flag (load addr:$src1),
- i64immSExt8:$src2))],
- IIC_IMUL64_RMI>;
-} // SchedRW
+ (X86smul_flag (loadi64 addr:$src1),
+ i64immSExt8:$src2))]>,
+ Sched<[WriteIMul64.Folded]>;
} // Defs = [EFLAGS]
-
-
-
// unsigned division/remainder
let hasSideEffects = 1 in { // so that we don't speculatively execute
-let SchedRW = [WriteIDiv] in {
let Defs = [AL,AH,EFLAGS], Uses = [AX] in
def DIV8r : I<0xF6, MRM6r, (outs), (ins GR8:$src), // AX/r8 = AL,AH
- "div{b}\t$src", [], IIC_DIV8_REG>;
+ "div{b}\t$src", []>, Sched<[WriteDiv8]>;
let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
def DIV16r : I<0xF7, MRM6r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX
- "div{w}\t$src", [], IIC_DIV16>, OpSize16;
+ "div{w}\t$src", []>, Sched<[WriteDiv16]>, OpSize16;
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
def DIV32r : I<0xF7, MRM6r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX
- "div{l}\t$src", [], IIC_DIV32>, OpSize32;
+ "div{l}\t$src", []>, Sched<[WriteDiv32]>, OpSize32;
// RDX:RAX/r64 = RAX,RDX
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
def DIV64r : RI<0xF7, MRM6r, (outs), (ins GR64:$src),
- "div{q}\t$src", [], IIC_DIV64>;
-} // SchedRW
+ "div{q}\t$src", []>, Sched<[WriteDiv64]>;
let mayLoad = 1 in {
let Defs = [AL,AH,EFLAGS], Uses = [AX] in
def DIV8m : I<0xF6, MRM6m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH
- "div{b}\t$src", [], IIC_DIV8_MEM>,
- SchedLoadReg<WriteIDivLd>;
+ "div{b}\t$src", []>, SchedLoadReg<WriteDiv8.Folded>;
let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
def DIV16m : I<0xF7, MRM6m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX
- "div{w}\t$src", [], IIC_DIV16>, OpSize16,
- SchedLoadReg<WriteIDivLd>;
+ "div{w}\t$src", []>, OpSize16, SchedLoadReg<WriteDiv16.Folded>;
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX
def DIV32m : I<0xF7, MRM6m, (outs), (ins i32mem:$src),
- "div{l}\t$src", [], IIC_DIV32>,
- SchedLoadReg<WriteIDivLd>, OpSize32;
+ "div{l}\t$src", []>, SchedLoadReg<WriteDiv32.Folded>, OpSize32;
// RDX:RAX/[mem64] = RAX,RDX
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
def DIV64m : RI<0xF7, MRM6m, (outs), (ins i64mem:$src),
- "div{q}\t$src", [], IIC_DIV64>,
- SchedLoadReg<WriteIDivLd>, Requires<[In64BitMode]>;
+ "div{q}\t$src", []>, SchedLoadReg<WriteDiv64.Folded>,
+ Requires<[In64BitMode]>;
}
// Signed division/remainder.
-let SchedRW = [WriteIDiv] in {
let Defs = [AL,AH,EFLAGS], Uses = [AX] in
def IDIV8r : I<0xF6, MRM7r, (outs), (ins GR8:$src), // AX/r8 = AL,AH
- "idiv{b}\t$src", [], IIC_IDIV8>;
+ "idiv{b}\t$src", []>, Sched<[WriteIDiv8]>;
let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
def IDIV16r: I<0xF7, MRM7r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX
- "idiv{w}\t$src", [], IIC_IDIV16>, OpSize16;
+ "idiv{w}\t$src", []>, Sched<[WriteIDiv16]>, OpSize16;
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
def IDIV32r: I<0xF7, MRM7r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX
- "idiv{l}\t$src", [], IIC_IDIV32>, OpSize32;
+ "idiv{l}\t$src", []>, Sched<[WriteIDiv32]>, OpSize32;
// RDX:RAX/r64 = RAX,RDX
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
def IDIV64r: RI<0xF7, MRM7r, (outs), (ins GR64:$src),
- "idiv{q}\t$src", [], IIC_IDIV64>;
-} // SchedRW
+ "idiv{q}\t$src", []>, Sched<[WriteIDiv64]>;
let mayLoad = 1 in {
let Defs = [AL,AH,EFLAGS], Uses = [AX] in
def IDIV8m : I<0xF6, MRM7m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH
- "idiv{b}\t$src", [], IIC_IDIV8>,
- SchedLoadReg<WriteIDivLd>;
+ "idiv{b}\t$src", []>, SchedLoadReg<WriteIDiv8.Folded>;
let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
def IDIV16m: I<0xF7, MRM7m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX
- "idiv{w}\t$src", [], IIC_IDIV16>, OpSize16,
- SchedLoadReg<WriteIDivLd>;
+ "idiv{w}\t$src", []>, OpSize16, SchedLoadReg<WriteIDiv16.Folded>;
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX
def IDIV32m: I<0xF7, MRM7m, (outs), (ins i32mem:$src),
- "idiv{l}\t$src", [], IIC_IDIV32>, OpSize32,
- SchedLoadReg<WriteIDivLd>;
+ "idiv{l}\t$src", []>, OpSize32, SchedLoadReg<WriteIDiv32.Folded>;
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in // RDX:RAX/[mem64] = RAX,RDX
def IDIV64m: RI<0xF7, MRM7m, (outs), (ins i64mem:$src),
- "idiv{q}\t$src", [], IIC_IDIV64>,
- SchedLoadReg<WriteIDivLd>, Requires<[In64BitMode]>;
+ "idiv{q}\t$src", []>, SchedLoadReg<WriteIDiv64.Folded>,
+ Requires<[In64BitMode]>;
}
} // hasSideEffects = 0
@@ -379,37 +352,37 @@ let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
def NEG8r : I<0xF6, MRM3r, (outs GR8 :$dst), (ins GR8 :$src1),
"neg{b}\t$dst",
[(set GR8:$dst, (ineg GR8:$src1)),
- (implicit EFLAGS)], IIC_UNARY_REG>;
+ (implicit EFLAGS)]>;
def NEG16r : I<0xF7, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
"neg{w}\t$dst",
[(set GR16:$dst, (ineg GR16:$src1)),
- (implicit EFLAGS)], IIC_UNARY_REG>, OpSize16;
+ (implicit EFLAGS)]>, OpSize16;
def NEG32r : I<0xF7, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
"neg{l}\t$dst",
[(set GR32:$dst, (ineg GR32:$src1)),
- (implicit EFLAGS)], IIC_UNARY_REG>, OpSize32;
+ (implicit EFLAGS)]>, OpSize32;
def NEG64r : RI<0xF7, MRM3r, (outs GR64:$dst), (ins GR64:$src1), "neg{q}\t$dst",
[(set GR64:$dst, (ineg GR64:$src1)),
- (implicit EFLAGS)], IIC_UNARY_REG>;
+ (implicit EFLAGS)]>;
} // Constraints = "$src1 = $dst", SchedRW
// Read-modify-write negate.
-let SchedRW = [WriteALULd, WriteRMW] in {
+let SchedRW = [WriteALURMW] in {
def NEG8m : I<0xF6, MRM3m, (outs), (ins i8mem :$dst),
"neg{b}\t$dst",
[(store (ineg (loadi8 addr:$dst)), addr:$dst),
- (implicit EFLAGS)], IIC_UNARY_MEM>;
+ (implicit EFLAGS)]>;
def NEG16m : I<0xF7, MRM3m, (outs), (ins i16mem:$dst),
"neg{w}\t$dst",
[(store (ineg (loadi16 addr:$dst)), addr:$dst),
- (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize16;
+ (implicit EFLAGS)]>, OpSize16;
def NEG32m : I<0xF7, MRM3m, (outs), (ins i32mem:$dst),
"neg{l}\t$dst",
[(store (ineg (loadi32 addr:$dst)), addr:$dst),
- (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32;
+ (implicit EFLAGS)]>, OpSize32;
def NEG64m : RI<0xF7, MRM3m, (outs), (ins i64mem:$dst), "neg{q}\t$dst",
[(store (ineg (loadi64 addr:$dst)), addr:$dst),
- (implicit EFLAGS)], IIC_UNARY_MEM>,
+ (implicit EFLAGS)]>,
Requires<[In64BitMode]>;
} // SchedRW
} // Defs = [EFLAGS]
@@ -418,36 +391,33 @@ def NEG64m : RI<0xF7, MRM3m, (outs), (ins i64mem:$dst), "neg{q}\t$dst",
// Note: NOT does not set EFLAGS!
let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
-// Match xor -1 to not. Favors these over a move imm + xor to save code size.
-let AddedComplexity = 15 in {
def NOT8r : I<0xF6, MRM2r, (outs GR8 :$dst), (ins GR8 :$src1),
"not{b}\t$dst",
- [(set GR8:$dst, (not GR8:$src1))], IIC_UNARY_REG>;
+ [(set GR8:$dst, (not GR8:$src1))]>;
def NOT16r : I<0xF7, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
"not{w}\t$dst",
- [(set GR16:$dst, (not GR16:$src1))], IIC_UNARY_REG>, OpSize16;
+ [(set GR16:$dst, (not GR16:$src1))]>, OpSize16;
def NOT32r : I<0xF7, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
"not{l}\t$dst",
- [(set GR32:$dst, (not GR32:$src1))], IIC_UNARY_REG>, OpSize32;
+ [(set GR32:$dst, (not GR32:$src1))]>, OpSize32;
def NOT64r : RI<0xF7, MRM2r, (outs GR64:$dst), (ins GR64:$src1), "not{q}\t$dst",
- [(set GR64:$dst, (not GR64:$src1))], IIC_UNARY_REG>;
-}
+ [(set GR64:$dst, (not GR64:$src1))]>;
} // Constraints = "$src1 = $dst", SchedRW
-let SchedRW = [WriteALULd, WriteRMW] in {
+let SchedRW = [WriteALURMW] in {
def NOT8m : I<0xF6, MRM2m, (outs), (ins i8mem :$dst),
"not{b}\t$dst",
- [(store (not (loadi8 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>;
+ [(store (not (loadi8 addr:$dst)), addr:$dst)]>;
def NOT16m : I<0xF7, MRM2m, (outs), (ins i16mem:$dst),
"not{w}\t$dst",
- [(store (not (loadi16 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>,
+ [(store (not (loadi16 addr:$dst)), addr:$dst)]>,
OpSize16;
def NOT32m : I<0xF7, MRM2m, (outs), (ins i32mem:$dst),
"not{l}\t$dst",
- [(store (not (loadi32 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>,
+ [(store (not (loadi32 addr:$dst)), addr:$dst)]>,
OpSize32;
def NOT64m : RI<0xF7, MRM2m, (outs), (ins i64mem:$dst), "not{q}\t$dst",
- [(store (not (loadi64 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>,
+ [(store (not (loadi64 addr:$dst)), addr:$dst)]>,
Requires<[In64BitMode]>;
} // SchedRW
} // CodeSize
@@ -458,49 +428,45 @@ let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
let CodeSize = 2 in
def INC8r : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
"inc{b}\t$dst",
- [(set GR8:$dst, EFLAGS, (X86inc_flag GR8:$src1))],
- IIC_UNARY_REG>;
+ [(set GR8:$dst, EFLAGS, (X86inc_flag GR8:$src1))]>;
let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA.
def INC16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
"inc{w}\t$dst",
- [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))],
- IIC_UNARY_REG>, OpSize16;
+ [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))]>, OpSize16;
def INC32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
"inc{l}\t$dst",
- [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))],
- IIC_UNARY_REG>, OpSize32;
+ [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))]>, OpSize32;
def INC64r : RI<0xFF, MRM0r, (outs GR64:$dst), (ins GR64:$src1), "inc{q}\t$dst",
- [(set GR64:$dst, EFLAGS, (X86inc_flag GR64:$src1))],
- IIC_UNARY_REG>;
+ [(set GR64:$dst, EFLAGS, (X86inc_flag GR64:$src1))]>;
} // isConvertibleToThreeAddress = 1, CodeSize = 2
// Short forms only valid in 32-bit mode. Selected during MCInst lowering.
let CodeSize = 1, hasSideEffects = 0 in {
def INC16r_alt : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
- "inc{w}\t$dst", [], IIC_UNARY_REG>,
+ "inc{w}\t$dst", []>,
OpSize16, Requires<[Not64BitMode]>;
def INC32r_alt : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
- "inc{l}\t$dst", [], IIC_UNARY_REG>,
+ "inc{l}\t$dst", []>,
OpSize32, Requires<[Not64BitMode]>;
} // CodeSize = 1, hasSideEffects = 0
} // Constraints = "$src1 = $dst", SchedRW
-let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
+let CodeSize = 2, SchedRW = [WriteALURMW] in {
let Predicates = [UseIncDec] in {
def INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), "inc{b}\t$dst",
[(store (add (loadi8 addr:$dst), 1), addr:$dst),
- (implicit EFLAGS)], IIC_UNARY_MEM>;
+ (implicit EFLAGS)]>;
def INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst",
[(store (add (loadi16 addr:$dst), 1), addr:$dst),
- (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize16;
+ (implicit EFLAGS)]>, OpSize16;
def INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst",
[(store (add (loadi32 addr:$dst), 1), addr:$dst),
- (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32;
+ (implicit EFLAGS)]>, OpSize32;
} // Predicates
let Predicates = [UseIncDec, In64BitMode] in {
def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst",
[(store (add (loadi64 addr:$dst), 1), addr:$dst),
- (implicit EFLAGS)], IIC_UNARY_MEM>;
+ (implicit EFLAGS)]>;
} // Predicates
} // CodeSize = 2, SchedRW
@@ -508,50 +474,46 @@ let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
let CodeSize = 2 in
def DEC8r : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
"dec{b}\t$dst",
- [(set GR8:$dst, EFLAGS, (X86dec_flag GR8:$src1))],
- IIC_UNARY_REG>;
+ [(set GR8:$dst, EFLAGS, (X86dec_flag GR8:$src1))]>;
let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA.
def DEC16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
"dec{w}\t$dst",
- [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))],
- IIC_UNARY_REG>, OpSize16;
+ [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))]>, OpSize16;
def DEC32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
"dec{l}\t$dst",
- [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))],
- IIC_UNARY_REG>, OpSize32;
+ [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))]>, OpSize32;
def DEC64r : RI<0xFF, MRM1r, (outs GR64:$dst), (ins GR64:$src1), "dec{q}\t$dst",
- [(set GR64:$dst, EFLAGS, (X86dec_flag GR64:$src1))],
- IIC_UNARY_REG>;
+ [(set GR64:$dst, EFLAGS, (X86dec_flag GR64:$src1))]>;
} // isConvertibleToThreeAddress = 1, CodeSize = 2
// Short forms only valid in 32-bit mode. Selected during MCInst lowering.
let CodeSize = 1, hasSideEffects = 0 in {
def DEC16r_alt : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
- "dec{w}\t$dst", [], IIC_UNARY_REG>,
+ "dec{w}\t$dst", []>,
OpSize16, Requires<[Not64BitMode]>;
def DEC32r_alt : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
- "dec{l}\t$dst", [], IIC_UNARY_REG>,
+ "dec{l}\t$dst", []>,
OpSize32, Requires<[Not64BitMode]>;
} // CodeSize = 1, hasSideEffects = 0
} // Constraints = "$src1 = $dst", SchedRW
-let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
+let CodeSize = 2, SchedRW = [WriteALURMW] in {
let Predicates = [UseIncDec] in {
def DEC8m : I<0xFE, MRM1m, (outs), (ins i8mem :$dst), "dec{b}\t$dst",
[(store (add (loadi8 addr:$dst), -1), addr:$dst),
- (implicit EFLAGS)], IIC_UNARY_MEM>;
+ (implicit EFLAGS)]>;
def DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst",
[(store (add (loadi16 addr:$dst), -1), addr:$dst),
- (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize16;
+ (implicit EFLAGS)]>, OpSize16;
def DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst",
[(store (add (loadi32 addr:$dst), -1), addr:$dst),
- (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32;
+ (implicit EFLAGS)]>, OpSize32;
} // Predicates
let Predicates = [UseIncDec, In64BitMode] in {
def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
[(store (add (loadi64 addr:$dst), -1), addr:$dst),
- (implicit EFLAGS)], IIC_UNARY_MEM>;
+ (implicit EFLAGS)]>;
} // Predicates
} // CodeSize = 2, SchedRW
} // Defs = [EFLAGS]
@@ -649,13 +611,11 @@ def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem,
/// 4. Infers whether the low bit of the opcode should be 0 (for i8 operations)
/// or 1 (for i16,i32,i64 operations).
class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins,
- string mnemonic, string args, list<dag> pattern,
- InstrItinClass itin = IIC_BIN_NONMEM>
+ string mnemonic, string args, list<dag> pattern>
: I<{opcode{7}, opcode{6}, opcode{5}, opcode{4},
opcode{3}, opcode{2}, opcode{1}, typeinfo.HasOddOpcode },
f, outs, ins,
- !strconcat(mnemonic, "{", typeinfo.InstrSuffix, "}\t", args), pattern,
- itin> {
+ !strconcat(mnemonic, "{", typeinfo.InstrSuffix, "}\t", args), pattern> {
// Infer instruction prefixes from type info.
let OpSize = typeinfo.OpSize;
@@ -664,47 +624,45 @@ class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins,
// BinOpRR - Instructions like "add reg, reg, reg".
class BinOpRR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- dag outlist, list<dag> pattern, InstrItinClass itin>
+ dag outlist, X86FoldableSchedWrite sched, list<dag> pattern>
: ITy<opcode, MRMDestReg, typeinfo, outlist,
(ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
- mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
- Sched<[WriteALU]>;
+ mnemonic, "{$src2, $src1|$src1, $src2}", pattern>,
+ Sched<[sched]>;
// BinOpRR_F - Instructions like "cmp reg, reg", where the pattern has
// just EFLAGS as a result.
class BinOpRR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDPatternOperator opnode>
- : BinOpRR<opcode, mnemonic, typeinfo, (outs),
+ : BinOpRR<opcode, mnemonic, typeinfo, (outs), WriteALU,
[(set EFLAGS,
- (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))],
- IIC_BIN_NONMEM>;
+ (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))]>;
// BinOpRR_RF - Instructions like "add reg, reg, reg", where the pattern has
// both a regclass and EFLAGS as a result.
class BinOpRR_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDNode opnode>
- : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+ : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), WriteALU,
[(set typeinfo.RegClass:$dst, EFLAGS,
- (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))],
- IIC_BIN_NONMEM>;
+ (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))]>;
// BinOpRR_RFF - Instructions like "adc reg, reg, reg", where the pattern has
// both a regclass and EFLAGS as a result, and has EFLAGS as input.
class BinOpRR_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDNode opnode>
- : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+ : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), WriteADC,
[(set typeinfo.RegClass:$dst, EFLAGS,
(opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2,
- EFLAGS))], IIC_BIN_CARRY_NONMEM>;
+ EFLAGS))]>;
// BinOpRR_Rev - Instructions like "add reg, reg, reg" (reversed encoding).
class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- InstrItinClass itin = IIC_BIN_NONMEM>
+ X86FoldableSchedWrite sched = WriteALU>
: ITy<opcode, MRMSrcReg, typeinfo,
(outs typeinfo.RegClass:$dst),
(ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
- mnemonic, "{$src2, $dst|$dst, $src2}", [], itin>,
- Sched<[WriteALU]> {
+ mnemonic, "{$src2, $dst|$dst, $src2}", []>,
+ Sched<[sched]> {
// The disassembler should know about this, but not the asmparser.
let isCodeGenOnly = 1;
let ForceDisassemble = 1;
@@ -713,13 +671,13 @@ class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
// BinOpRR_RFF_Rev - Instructions like "adc reg, reg, reg" (reversed encoding).
class BinOpRR_RFF_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
- : BinOpRR_Rev<opcode, mnemonic, typeinfo, IIC_BIN_CARRY_NONMEM>;
+ : BinOpRR_Rev<opcode, mnemonic, typeinfo, WriteADC>;
// BinOpRR_F_Rev - Instructions like "cmp reg, reg" (reversed encoding).
class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
: ITy<opcode, MRMSrcReg, typeinfo, (outs),
(ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
- mnemonic, "{$src2, $src1|$src1, $src2}", [], IIC_BIN_NONMEM>,
+ mnemonic, "{$src2, $src1|$src1, $src2}", []>,
Sched<[WriteALU]> {
// The disassembler should know about this, but not the asmparser.
let isCodeGenOnly = 1;
@@ -729,137 +687,134 @@ class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
// BinOpRM - Instructions like "add reg, reg, [mem]".
class BinOpRM<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- dag outlist, list<dag> pattern,
- InstrItinClass itin = IIC_BIN_MEM>
+ dag outlist, X86FoldableSchedWrite sched, list<dag> pattern>
: ITy<opcode, MRMSrcMem, typeinfo, outlist,
(ins typeinfo.RegClass:$src1, typeinfo.MemOperand:$src2),
- mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
- Sched<[WriteALULd, ReadAfterLd]>;
+ mnemonic, "{$src2, $src1|$src1, $src2}", pattern>,
+ Sched<[sched.Folded, ReadAfterLd]>;
// BinOpRM_F - Instructions like "cmp reg, [mem]".
class BinOpRM_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDNode opnode>
- : BinOpRM<opcode, mnemonic, typeinfo, (outs),
+ : BinOpRM<opcode, mnemonic, typeinfo, (outs), WriteALU,
[(set EFLAGS,
(opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>;
// BinOpRM_RF - Instructions like "add reg, reg, [mem]".
class BinOpRM_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDNode opnode>
- : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+ : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), WriteALU,
[(set typeinfo.RegClass:$dst, EFLAGS,
(opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>;
// BinOpRM_RFF - Instructions like "adc reg, reg, [mem]".
class BinOpRM_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDNode opnode>
- : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+ : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), WriteADC,
[(set typeinfo.RegClass:$dst, EFLAGS,
(opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2),
- EFLAGS))], IIC_BIN_CARRY_MEM>;
+ EFLAGS))]>;
// BinOpRI - Instructions like "add reg, reg, imm".
class BinOpRI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- Format f, dag outlist, list<dag> pattern,
- InstrItinClass itin = IIC_BIN_NONMEM>
+ Format f, dag outlist, X86FoldableSchedWrite sched, list<dag> pattern>
: ITy<opcode, f, typeinfo, outlist,
(ins typeinfo.RegClass:$src1, typeinfo.ImmOperand:$src2),
- mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
- Sched<[WriteALU]> {
+ mnemonic, "{$src2, $src1|$src1, $src2}", pattern>,
+ Sched<[sched]> {
let ImmT = typeinfo.ImmEncoding;
}
// BinOpRI_F - Instructions like "cmp reg, imm".
class BinOpRI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDPatternOperator opnode, Format f>
- : BinOpRI<opcode, mnemonic, typeinfo, f, (outs),
+ : BinOpRI<opcode, mnemonic, typeinfo, f, (outs), WriteALU,
[(set EFLAGS,
(opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>;
// BinOpRI_RF - Instructions like "add reg, reg, imm".
class BinOpRI_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDNode opnode, Format f>
- : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+ : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), WriteALU,
[(set typeinfo.RegClass:$dst, EFLAGS,
(opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>;
// BinOpRI_RFF - Instructions like "adc reg, reg, imm".
class BinOpRI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDNode opnode, Format f>
- : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+ : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), WriteADC,
[(set typeinfo.RegClass:$dst, EFLAGS,
(opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2,
- EFLAGS))], IIC_BIN_CARRY_NONMEM>;
+ EFLAGS))]>;
// BinOpRI8 - Instructions like "add reg, reg, imm8".
class BinOpRI8<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- Format f, dag outlist, list<dag> pattern,
- InstrItinClass itin = IIC_BIN_NONMEM>
+ Format f, dag outlist, X86FoldableSchedWrite sched, list<dag> pattern>
: ITy<opcode, f, typeinfo, outlist,
(ins typeinfo.RegClass:$src1, typeinfo.Imm8Operand:$src2),
- mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
- Sched<[WriteALU]> {
+ mnemonic, "{$src2, $src1|$src1, $src2}", pattern>,
+ Sched<[sched]> {
let ImmT = Imm8; // Always 8-bit immediate.
}
// BinOpRI8_F - Instructions like "cmp reg, imm8".
class BinOpRI8_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDPatternOperator opnode, Format f>
- : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs),
+ : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs), WriteALU,
[(set EFLAGS,
(opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>;
// BinOpRI8_RF - Instructions like "add reg, reg, imm8".
class BinOpRI8_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDPatternOperator opnode, Format f>
- : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+ : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), WriteALU,
[(set typeinfo.RegClass:$dst, EFLAGS,
(opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>;
// BinOpRI8_RFF - Instructions like "adc reg, reg, imm8".
class BinOpRI8_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDPatternOperator opnode, Format f>
- : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+ : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), WriteADC,
[(set typeinfo.RegClass:$dst, EFLAGS,
(opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2,
- EFLAGS))], IIC_BIN_CARRY_NONMEM>;
+ EFLAGS))]>;
// BinOpMR - Instructions like "add [mem], reg".
class BinOpMR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- list<dag> pattern, InstrItinClass itin = IIC_BIN_MEM>
+ list<dag> pattern>
: ITy<opcode, MRMDestMem, typeinfo,
(outs), (ins typeinfo.MemOperand:$dst, typeinfo.RegClass:$src),
- mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>,
- Sched<[WriteALULd, WriteRMW]>;
+ mnemonic, "{$src, $dst|$dst, $src}", pattern>;
// BinOpMR_RMW - Instructions like "add [mem], reg".
class BinOpMR_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDNode opnode>
: BinOpMR<opcode, mnemonic, typeinfo,
[(store (opnode (load addr:$dst), typeinfo.RegClass:$src), addr:$dst),
- (implicit EFLAGS)]>;
+ (implicit EFLAGS)]>, Sched<[WriteALURMW]>;
// BinOpMR_RMW_FF - Instructions like "adc [mem], reg".
class BinOpMR_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDNode opnode>
: BinOpMR<opcode, mnemonic, typeinfo,
- [(store (opnode (load addr:$dst), typeinfo.RegClass:$src, EFLAGS),
- addr:$dst),
- (implicit EFLAGS)], IIC_BIN_CARRY_MEM>;
+ [(store (opnode (load addr:$dst), typeinfo.RegClass:$src, EFLAGS),
+ addr:$dst),
+ (implicit EFLAGS)]>, Sched<[WriteADCRMW]>;
// BinOpMR_F - Instructions like "cmp [mem], reg".
class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDPatternOperator opnode>
: BinOpMR<opcode, mnemonic, typeinfo,
- [(set EFLAGS, (opnode (load addr:$dst), typeinfo.RegClass:$src))]>;
+ [(set EFLAGS, (opnode (typeinfo.LoadNode addr:$dst),
+ typeinfo.RegClass:$src))]>,
+ Sched<[WriteALULd, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault, ReadDefault, ReadAfterLd]>;
// BinOpMI - Instructions like "add [mem], imm".
class BinOpMI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- Format f, list<dag> pattern,
- InstrItinClass itin = IIC_BIN_MEM>
+ Format f, list<dag> pattern>
: ITy<opcode, f, typeinfo,
(outs), (ins typeinfo.MemOperand:$dst, typeinfo.ImmOperand:$src),
- mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>,
- Sched<[WriteALULd, WriteRMW]> {
+ mnemonic, "{$src, $dst|$dst, $src}", pattern> {
let ImmT = typeinfo.ImmEncoding;
}
@@ -869,30 +824,29 @@ class BinOpMI_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
: BinOpMI<opcode, mnemonic, typeinfo, f,
[(store (opnode (typeinfo.VT (load addr:$dst)),
typeinfo.ImmOperator:$src), addr:$dst),
- (implicit EFLAGS)]>;
+ (implicit EFLAGS)]>, Sched<[WriteALURMW]>;
// BinOpMI_RMW_FF - Instructions like "adc [mem], imm".
class BinOpMI_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDNode opnode, Format f>
: BinOpMI<opcode, mnemonic, typeinfo, f,
[(store (opnode (typeinfo.VT (load addr:$dst)),
- typeinfo.ImmOperator:$src, EFLAGS), addr:$dst),
- (implicit EFLAGS)], IIC_BIN_CARRY_MEM>;
+ typeinfo.ImmOperator:$src, EFLAGS), addr:$dst),
+ (implicit EFLAGS)]>, Sched<[WriteADCRMW]>;
// BinOpMI_F - Instructions like "cmp [mem], imm".
class BinOpMI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDPatternOperator opnode, Format f>
: BinOpMI<opcode, mnemonic, typeinfo, f,
- [(set EFLAGS, (opnode (typeinfo.VT (load addr:$dst)),
- typeinfo.ImmOperator:$src))]>;
+ [(set EFLAGS, (opnode (typeinfo.LoadNode addr:$dst),
+ typeinfo.ImmOperator:$src))]>,
+ Sched<[WriteALULd]>;
// BinOpMI8 - Instructions like "add [mem], imm8".
class BinOpMI8<string mnemonic, X86TypeInfo typeinfo,
- Format f, list<dag> pattern,
- InstrItinClass itin = IIC_BIN_MEM>
+ Format f, list<dag> pattern>
: ITy<0x82, f, typeinfo,
(outs), (ins typeinfo.MemOperand:$dst, typeinfo.Imm8Operand:$src),
- mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>,
- Sched<[WriteALULd, WriteRMW]> {
+ mnemonic, "{$src, $dst|$dst, $src}", pattern> {
let ImmT = Imm8; // Always 8-bit immediate.
}
@@ -902,7 +856,7 @@ class BinOpMI8_RMW<string mnemonic, X86TypeInfo typeinfo,
: BinOpMI8<mnemonic, typeinfo, f,
[(store (opnode (load addr:$dst),
typeinfo.Imm8Operator:$src), addr:$dst),
- (implicit EFLAGS)]>;
+ (implicit EFLAGS)]>, Sched<[WriteALURMW]>;
// BinOpMI8_RMW_FF - Instructions like "adc [mem], imm8".
class BinOpMI8_RMW_FF<string mnemonic, X86TypeInfo typeinfo,
@@ -910,22 +864,22 @@ class BinOpMI8_RMW_FF<string mnemonic, X86TypeInfo typeinfo,
: BinOpMI8<mnemonic, typeinfo, f,
[(store (opnode (load addr:$dst),
typeinfo.Imm8Operator:$src, EFLAGS), addr:$dst),
- (implicit EFLAGS)], IIC_BIN_CARRY_MEM>;
+ (implicit EFLAGS)]>, Sched<[WriteADCRMW]>;
// BinOpMI8_F - Instructions like "cmp [mem], imm8".
class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo,
SDPatternOperator opnode, Format f>
: BinOpMI8<mnemonic, typeinfo, f,
- [(set EFLAGS, (opnode (load addr:$dst),
- typeinfo.Imm8Operator:$src))]>;
+ [(set EFLAGS, (opnode (typeinfo.LoadNode addr:$dst),
+ typeinfo.Imm8Operator:$src))]>,
+ Sched<[WriteALULd]>;
// BinOpAI - Instructions like "add %eax, %eax, imm", that imp-def EFLAGS.
class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- Register areg, string operands,
- InstrItinClass itin = IIC_BIN_NONMEM>
+ Register areg, string operands, X86FoldableSchedWrite sched = WriteALU>
: ITy<opcode, RawFrm, typeinfo,
(outs), (ins typeinfo.ImmOperand:$src),
- mnemonic, operands, [], itin>, Sched<[WriteALU]> {
+ mnemonic, operands, []>, Sched<[sched]> {
let ImmT = typeinfo.ImmEncoding;
let Uses = [areg];
let Defs = [areg, EFLAGS];
@@ -936,8 +890,7 @@ class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
// and use EFLAGS.
class BinOpAI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
Register areg, string operands>
- : BinOpAI<opcode, mnemonic, typeinfo, areg, operands,
- IIC_BIN_CARRY_NONMEM> {
+ : BinOpAI<opcode, mnemonic, typeinfo, areg, operands, WriteADC> {
let Uses = [areg, EFLAGS];
}
@@ -1257,14 +1210,6 @@ let isCompare = 1 in {
def TEST32mi : BinOpMI_F<0xF6, "test", Xi32, X86testpat, MRM0m>;
let Predicates = [In64BitMode] in
def TEST64mi32 : BinOpMI_F<0xF6, "test", Xi64, X86testpat, MRM0m>;
-
- // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the
- // register class is constrained to GR8_NOREX. This pseudo is explicitly
- // marked side-effect free, since it doesn't have an isel pattern like
- // other test instructions.
- let isPseudo = 1, hasSideEffects = 0 in
- def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask),
- "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>;
} // Defs = [EFLAGS]
def TEST8i8 : BinOpAI_F<0xA8, "test", Xi8 , AL,
@@ -1284,21 +1229,22 @@ multiclass bmi_andn<string mnemonic, RegisterClass RC, X86MemOperand x86memop,
PatFrag ld_frag> {
def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))],
- IIC_BIN_NONMEM>, Sched<[WriteALU]>;
+ [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))]>,
+ Sched<[WriteALU]>;
def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, EFLAGS,
- (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))], IIC_BIN_MEM>,
+ (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))]>,
Sched<[WriteALULd, ReadAfterLd]>;
}
-let Predicates = [HasBMI], Defs = [EFLAGS] in {
+// Complexity is reduced to give and with immediate a chance to match first.
+let Predicates = [HasBMI], Defs = [EFLAGS], AddedComplexity = -6 in {
defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32>, T8PS, VEX_4V;
defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64>, T8PS, VEX_4V, VEX_W;
}
-let Predicates = [HasBMI] in {
+let Predicates = [HasBMI], AddedComplexity = -6 in {
def : Pat<(and (not GR32:$src1), GR32:$src2),
(ANDN32rr GR32:$src1, GR32:$src2)>;
def : Pat<(and (not GR64:$src1), GR64:$src2),
@@ -1312,78 +1258,81 @@ let Predicates = [HasBMI] in {
//===----------------------------------------------------------------------===//
// MULX Instruction
//
-multiclass bmi_mulx<string mnemonic, RegisterClass RC, X86MemOperand x86memop> {
+multiclass bmi_mulx<string mnemonic, RegisterClass RC, X86MemOperand x86memop,
+ X86FoldableSchedWrite sched> {
let hasSideEffects = 0 in {
let isCommutable = 1 in
def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src),
!strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
- [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMul, WriteIMulH]>;
+ []>, T8XD, VEX_4V, Sched<[sched, WriteIMulH]>;
let mayLoad = 1 in
def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src),
!strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
- [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMulLd, WriteIMulH]>;
+ []>, T8XD, VEX_4V, Sched<[sched.Folded, WriteIMulH]>;
}
}
let Predicates = [HasBMI2] in {
let Uses = [EDX] in
- defm MULX32 : bmi_mulx<"mulx{l}", GR32, i32mem>;
+ defm MULX32 : bmi_mulx<"mulx{l}", GR32, i32mem, WriteIMul>;
let Uses = [RDX] in
- defm MULX64 : bmi_mulx<"mulx{q}", GR64, i64mem>, VEX_W;
+ defm MULX64 : bmi_mulx<"mulx{q}", GR64, i64mem, WriteIMul64>, VEX_W;
}
//===----------------------------------------------------------------------===//
-// ADCX Instruction
+// ADCX and ADOX Instructions
//
let Predicates = [HasADX], Defs = [EFLAGS], Uses = [EFLAGS],
- Constraints = "$src0 = $dst", AddedComplexity = 10 in {
- let SchedRW = [WriteALU] in {
+ Constraints = "$src1 = $dst", AddedComplexity = 10 in {
+ let SchedRW = [WriteADC] in {
def ADCX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst),
- (ins GR32:$src0, GR32:$src), "adcx{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, EFLAGS,
- (X86adc_flag GR32:$src0, GR32:$src, EFLAGS))],
- IIC_BIN_CARRY_NONMEM>, T8PD;
+ (ins GR32:$src1, GR32:$src2),
+ "adcx{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86adc_flag GR32:$src1, GR32:$src2, EFLAGS))]>, T8PD;
def ADCX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst),
- (ins GR64:$src0, GR64:$src), "adcx{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, EFLAGS,
- (X86adc_flag GR64:$src0, GR64:$src, EFLAGS))],
- IIC_BIN_CARRY_NONMEM>, T8PD;
+ (ins GR64:$src1, GR64:$src2),
+ "adcx{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86adc_flag GR64:$src1, GR64:$src2, EFLAGS))]>, T8PD;
+
+ // We don't have patterns for ADOX yet.
+ let hasSideEffects = 0 in {
+ def ADOX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "adox{l}\t{$src2, $dst|$dst, $src2}", []>, T8XS;
+
+ def ADOX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "adox{q}\t{$src2, $dst|$dst, $src2}", []>, T8XS;
+ } // hasSideEffects = 0
} // SchedRW
- let mayLoad = 1, SchedRW = [WriteALULd] in {
+ let mayLoad = 1, SchedRW = [WriteADCLd, ReadAfterLd] in {
def ADCX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst),
- (ins GR32:$src0, i32mem:$src), "adcx{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, EFLAGS,
- (X86adc_flag GR32:$src0, (loadi32 addr:$src), EFLAGS))],
- IIC_BIN_CARRY_MEM>, T8PD;
+ (ins GR32:$src1, i32mem:$src2),
+ "adcx{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86adc_flag GR32:$src1, (loadi32 addr:$src2), EFLAGS))]>,
+ T8PD;
def ADCX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst),
- (ins GR64:$src0, i64mem:$src), "adcx{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, EFLAGS,
- (X86adc_flag GR64:$src0, (loadi64 addr:$src), EFLAGS))],
- IIC_BIN_CARRY_MEM>, T8PD;
- }
-}
-
-//===----------------------------------------------------------------------===//
-// ADOX Instruction
-//
-let Predicates = [HasADX], hasSideEffects = 0, Defs = [EFLAGS],
- Uses = [EFLAGS] in {
- let SchedRW = [WriteALU] in {
- def ADOX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
- "adox{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8XS;
-
- def ADOX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
- "adox{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8XS;
- } // SchedRW
-
- let mayLoad = 1, SchedRW = [WriteALULd] in {
- def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
- "adox{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8XS;
-
- def ADOX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
- "adox{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8XS;
- }
+ (ins GR64:$src1, i64mem:$src2),
+ "adcx{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86adc_flag GR64:$src1, (loadi64 addr:$src2), EFLAGS))]>,
+ T8PD;
+
+ // We don't have patterns for ADOX yet.
+ let hasSideEffects = 0 in {
+ def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$src1, i32mem:$src2),
+ "adox{l}\t{$src2, $dst|$dst, $src2}", []>, T8XS;
+
+ def ADOX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$src1, i64mem:$src2),
+ "adox{q}\t{$src2, $dst|$dst, $src2}", []>, T8XS;
+ } // hasSideEffects = 0
+ } // mayLoad = 1, SchedRW = [WriteADCLd]
}
diff --git a/lib/Target/X86/X86InstrCMovSetCC.td b/lib/Target/X86/X86InstrCMovSetCC.td
index 8dd5e1c0626b..eda4ba5ae6f0 100644
--- a/lib/Target/X86/X86InstrCMovSetCC.td
+++ b/lib/Target/X86/X86InstrCMovSetCC.td
@@ -14,69 +14,67 @@
// CMOV instructions.
-multiclass CMOV<bits<8> opc, string Mnemonic, PatLeaf CondNode> {
+multiclass CMOV<bits<8> opc, string Mnemonic, X86FoldableSchedWrite Sched,
+ PatLeaf CondNode> {
let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
- isCommutable = 1, SchedRW = [WriteALU] in {
+ isCommutable = 1, SchedRW = [Sched] in {
def NAME#16rr
: I<opc, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
!strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"),
[(set GR16:$dst,
- (X86cmov GR16:$src1, GR16:$src2, CondNode, EFLAGS))],
- IIC_CMOV16_RR>, TB, OpSize16;
+ (X86cmov GR16:$src1, GR16:$src2, CondNode, EFLAGS))]>,
+ TB, OpSize16;
def NAME#32rr
: I<opc, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
!strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"),
[(set GR32:$dst,
- (X86cmov GR32:$src1, GR32:$src2, CondNode, EFLAGS))],
- IIC_CMOV32_RR>, TB, OpSize32;
+ (X86cmov GR32:$src1, GR32:$src2, CondNode, EFLAGS))]>,
+ TB, OpSize32;
def NAME#64rr
:RI<opc, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
!strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"),
[(set GR64:$dst,
- (X86cmov GR64:$src1, GR64:$src2, CondNode, EFLAGS))],
- IIC_CMOV32_RR>, TB;
+ (X86cmov GR64:$src1, GR64:$src2, CondNode, EFLAGS))]>, TB;
}
let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
- SchedRW = [WriteALULd, ReadAfterLd] in {
+ SchedRW = [Sched.Folded, ReadAfterLd] in {
def NAME#16rm
: I<opc, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
!strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"),
[(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
- CondNode, EFLAGS))], IIC_CMOV16_RM>,
- TB, OpSize16;
+ CondNode, EFLAGS))]>, TB, OpSize16;
def NAME#32rm
: I<opc, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
!strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"),
[(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
- CondNode, EFLAGS))], IIC_CMOV32_RM>,
- TB, OpSize32;
+ CondNode, EFLAGS))]>, TB, OpSize32;
def NAME#64rm
:RI<opc, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
!strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"),
[(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
- CondNode, EFLAGS))], IIC_CMOV32_RM>, TB;
+ CondNode, EFLAGS))]>, TB;
} // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst"
} // end multiclass
// Conditional Moves.
-defm CMOVO : CMOV<0x40, "cmovo" , X86_COND_O>;
-defm CMOVNO : CMOV<0x41, "cmovno", X86_COND_NO>;
-defm CMOVB : CMOV<0x42, "cmovb" , X86_COND_B>;
-defm CMOVAE : CMOV<0x43, "cmovae", X86_COND_AE>;
-defm CMOVE : CMOV<0x44, "cmove" , X86_COND_E>;
-defm CMOVNE : CMOV<0x45, "cmovne", X86_COND_NE>;
-defm CMOVBE : CMOV<0x46, "cmovbe", X86_COND_BE>;
-defm CMOVA : CMOV<0x47, "cmova" , X86_COND_A>;
-defm CMOVS : CMOV<0x48, "cmovs" , X86_COND_S>;
-defm CMOVNS : CMOV<0x49, "cmovns", X86_COND_NS>;
-defm CMOVP : CMOV<0x4A, "cmovp" , X86_COND_P>;
-defm CMOVNP : CMOV<0x4B, "cmovnp", X86_COND_NP>;
-defm CMOVL : CMOV<0x4C, "cmovl" , X86_COND_L>;
-defm CMOVGE : CMOV<0x4D, "cmovge", X86_COND_GE>;
-defm CMOVLE : CMOV<0x4E, "cmovle", X86_COND_LE>;
-defm CMOVG : CMOV<0x4F, "cmovg" , X86_COND_G>;
+defm CMOVO : CMOV<0x40, "cmovo" , WriteCMOV, X86_COND_O>;
+defm CMOVNO : CMOV<0x41, "cmovno", WriteCMOV, X86_COND_NO>;
+defm CMOVB : CMOV<0x42, "cmovb" , WriteCMOV, X86_COND_B>;
+defm CMOVAE : CMOV<0x43, "cmovae", WriteCMOV, X86_COND_AE>;
+defm CMOVE : CMOV<0x44, "cmove" , WriteCMOV, X86_COND_E>;
+defm CMOVNE : CMOV<0x45, "cmovne", WriteCMOV, X86_COND_NE>;
+defm CMOVBE : CMOV<0x46, "cmovbe", WriteCMOV2, X86_COND_BE>;
+defm CMOVA : CMOV<0x47, "cmova" , WriteCMOV2, X86_COND_A>;
+defm CMOVS : CMOV<0x48, "cmovs" , WriteCMOV, X86_COND_S>;
+defm CMOVNS : CMOV<0x49, "cmovns", WriteCMOV, X86_COND_NS>;
+defm CMOVP : CMOV<0x4A, "cmovp" , WriteCMOV, X86_COND_P>;
+defm CMOVNP : CMOV<0x4B, "cmovnp", WriteCMOV, X86_COND_NP>;
+defm CMOVL : CMOV<0x4C, "cmovl" , WriteCMOV, X86_COND_L>;
+defm CMOVGE : CMOV<0x4D, "cmovge", WriteCMOV, X86_COND_GE>;
+defm CMOVLE : CMOV<0x4E, "cmovle", WriteCMOV, X86_COND_LE>;
+defm CMOVG : CMOV<0x4F, "cmovg" , WriteCMOV, X86_COND_G>;
// SetCC instructions.
@@ -84,12 +82,12 @@ multiclass SETCC<bits<8> opc, string Mnemonic, PatLeaf OpNode> {
let Uses = [EFLAGS] in {
def r : I<opc, MRMXr, (outs GR8:$dst), (ins),
!strconcat(Mnemonic, "\t$dst"),
- [(set GR8:$dst, (X86setcc OpNode, EFLAGS))],
- IIC_SET_R>, TB, Sched<[WriteALU]>;
+ [(set GR8:$dst, (X86setcc OpNode, EFLAGS))]>,
+ TB, Sched<[WriteSETCC]>;
def m : I<opc, MRMXm, (outs), (ins i8mem:$dst),
!strconcat(Mnemonic, "\t$dst"),
- [(store (X86setcc OpNode, EFLAGS), addr:$dst)],
- IIC_SET_M>, TB, Sched<[WriteALU, WriteStore]>;
+ [(store (X86setcc OpNode, EFLAGS), addr:$dst)]>,
+ TB, Sched<[WriteSETCCStore]>;
} // Uses = [EFLAGS]
}
@@ -114,5 +112,5 @@ defm SETG : SETCC<0x9F, "setg", X86_COND_G>; // signed greater than
// here http://www.rcollins.org/secrets/opcodes/SALC.html
// Set AL if carry.
let Uses = [EFLAGS], Defs = [AL], SchedRW = [WriteALU] in {
- def SALC : I<0xD6, RawFrm, (outs), (ins), "salc", [], IIC_AHF>, Requires<[Not64BitMode]>;
+ def SALC : I<0xD6, RawFrm, (outs), (ins), "salc", []>, Requires<[Not64BitMode]>;
}
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 06600a4ef286..373f85020372 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -17,7 +17,7 @@
def GetLo32XForm : SDNodeXForm<imm, [{
// Transformation function: get the low 32 bits.
- return getI32Imm((unsigned)N->getZExtValue(), SDLoc(N));
+ return getI32Imm((uint32_t)N->getZExtValue(), SDLoc(N));
}]>;
def GetLo8XForm : SDNodeXForm<imm, [{
@@ -35,8 +35,12 @@ def GetLo8XForm : SDNodeXForm<imm, [{
let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP, SSP],
SchedRW = [WriteJump] in
def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label),
- "", [], IIC_CALL_RI>;
+ "", []>;
+// 64-bit large code model PIC base construction.
+let hasSideEffects = 0, mayLoad = 1, isNotDuplicable = 1, SchedRW = [WriteJump] in
+ def MOVGOT64r : PseudoI<(outs GR64:$reg),
+ (ins GR64:$scratch, i64i32imm_pcrel:$got), []>;
// ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into
// a stack adjustment and the codegen must know that they may modify the stack
@@ -46,12 +50,11 @@ let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP, SSP],
let Defs = [ESP, EFLAGS, SSP], Uses = [ESP, SSP], SchedRW = [WriteALU] in {
def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs),
(ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3),
- "#ADJCALLSTACKDOWN", [], IIC_ALU_NONMEM>,
- Requires<[NotLP64]>;
+ "#ADJCALLSTACKDOWN", []>, Requires<[NotLP64]>;
def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
"#ADJCALLSTACKUP",
- [(X86callseq_end timm:$amt1, timm:$amt2)],
- IIC_ALU_NONMEM>, Requires<[NotLP64]>;
+ [(X86callseq_end timm:$amt1, timm:$amt2)]>,
+ Requires<[NotLP64]>;
}
def : Pat<(X86callseq_start timm:$amt1, timm:$amt2),
(ADJCALLSTACKDOWN32 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[NotLP64]>;
@@ -65,12 +68,11 @@ def : Pat<(X86callseq_start timm:$amt1, timm:$amt2),
let Defs = [RSP, EFLAGS, SSP], Uses = [RSP, SSP], SchedRW = [WriteALU] in {
def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs),
(ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3),
- "#ADJCALLSTACKDOWN",
- [], IIC_ALU_NONMEM>, Requires<[IsLP64]>;
+ "#ADJCALLSTACKDOWN", []>, Requires<[IsLP64]>;
def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
"#ADJCALLSTACKUP",
- [(X86callseq_end timm:$amt1, timm:$amt2)],
- IIC_ALU_NONMEM>, Requires<[IsLP64]>;
+ [(X86callseq_end timm:$amt1, timm:$amt2)]>,
+ Requires<[IsLP64]>;
}
def : Pat<(X86callseq_start timm:$amt1, timm:$amt2),
(ADJCALLSTACKDOWN64 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[IsLP64]>;
@@ -148,10 +150,10 @@ def WIN_ALLOCA_64 : I<0, Pseudo, (outs), (ins GR64:$size),
// frame register after register allocation.
let Constraints = "$src = $dst", isPseudo = 1, Defs = [EFLAGS] in {
def XOR32_FP : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src),
- "xorl\t$$FP, $src", [], IIC_BIN_NONMEM>,
+ "xorl\t$$FP, $src", []>,
Requires<[NotLP64]>, Sched<[WriteALU]>;
def XOR64_FP : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src),
- "xorq\t$$FP $src", [], IIC_BIN_NONMEM>,
+ "xorq\t$$FP $src", []>,
Requires<[In64BitMode]>, Sched<[WriteALU]>;
}
@@ -163,7 +165,7 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1,
hasCtrlDep = 1, isCodeGenOnly = 1 in {
def EH_RETURN : I<0xC3, RawFrm, (outs), (ins GR32:$addr),
"ret\t#eh_return, addr: $addr",
- [(X86ehret GR32:$addr)], IIC_RET>, Sched<[WriteJumpLd]>;
+ [(X86ehret GR32:$addr)]>, Sched<[WriteJumpLd]>;
}
@@ -171,7 +173,7 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1,
hasCtrlDep = 1, isCodeGenOnly = 1 in {
def EH_RETURN64 : I<0xC3, RawFrm, (outs), (ins GR64:$addr),
"ret\t#eh_return, addr: $addr",
- [(X86ehret GR64:$addr)], IIC_RET>, Sched<[WriteJumpLd]>;
+ [(X86ehret GR64:$addr)]>, Sched<[WriteJumpLd]>;
}
@@ -256,14 +258,12 @@ let isPseudo = 1, SchedRW = [WriteSystem] in {
// this so that we don't have to have a MachineBasicBlock which ends
// with a RET and also has successors.
let isPseudo = 1, SchedRW = [WriteJumpLd] in {
-def MORESTACK_RET: I<0, Pseudo, (outs), (ins),
- "", [], IIC_RET>;
+def MORESTACK_RET: I<0, Pseudo, (outs), (ins), "", []>;
// This instruction is lowered to a RET followed by a MOV. The two
// instructions are not generated on a higher level since then the
// verifier sees a MachineBasicBlock ending with a non-terminator.
-def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins),
- "", [], IIC_RET>;
+def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins), "", []>;
}
//===----------------------------------------------------------------------===//
@@ -275,7 +275,7 @@ def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins),
let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
isPseudo = 1, AddedComplexity = 10 in
def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
- [(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
+ [(set GR32:$dst, 0)]>, Sched<[WriteZero]>;
// Other widths can also make use of the 32-bit xor, which may have a smaller
// encoding and avoid partial register updates.
@@ -292,9 +292,9 @@ let Predicates = [OptForSize, Not64BitMode],
// which only require 3 bytes compared to MOV32ri which requires 5.
let Defs = [EFLAGS], isReMaterializable = 1, isPseudo = 1 in {
def MOV32r1 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
- [(set GR32:$dst, 1)], IIC_ALU_NONMEM>;
+ [(set GR32:$dst, 1)]>;
def MOV32r_1 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
- [(set GR32:$dst, -1)], IIC_ALU_NONMEM>;
+ [(set GR32:$dst, -1)]>;
}
} // SchedRW
@@ -307,10 +307,10 @@ let isReMaterializable = 1, isPseudo = 1, AddedComplexity = 5,
SchedRW = [WriteALU] in {
// AddedComplexity higher than MOV64ri but lower than MOV32r0 and MOV32r1.
def MOV32ImmSExti8 : I<0, Pseudo, (outs GR32:$dst), (ins i32i8imm:$src), "",
- [(set GR32:$dst, i32immSExt8:$src)], IIC_ALU_NONMEM>,
+ [(set GR32:$dst, i32immSExt8:$src)]>,
Requires<[OptForMinSize, NotWin64WithoutFP]>;
def MOV64ImmSExti8 : I<0, Pseudo, (outs GR64:$dst), (ins i64i8imm:$src), "",
- [(set GR64:$dst, i64immSExt8:$src)], IIC_ALU_NONMEM>,
+ [(set GR64:$dst, i64immSExt8:$src)]>,
Requires<[OptForMinSize, NotWin64WithoutFP]>;
}
@@ -318,9 +318,8 @@ def MOV64ImmSExti8 : I<0, Pseudo, (outs GR64:$dst), (ins i64i8imm:$src), "",
// use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however
// that would make it more difficult to rematerialize.
let isReMaterializable = 1, isAsCheapAsAMove = 1,
- isPseudo = 1, hasSideEffects = 0, SchedRW = [WriteALU] in
-def MOV32ri64 : I<0, Pseudo, (outs GR32:$dst), (ins i64i32imm:$src), "", [],
- IIC_ALU_NONMEM>;
+ isPseudo = 1, hasSideEffects = 0, SchedRW = [WriteMove] in
+def MOV32ri64 : I<0, Pseudo, (outs GR32:$dst), (ins i64i32imm:$src), "", []>;
// This 64-bit pseudo-move can be used for both a 64-bit constant that is
// actually the zero-extension of a 32-bit constant and for labels in the
@@ -398,28 +397,28 @@ def : Pat<(sub GR64:$op, (i64 (X86setcc_c X86_COND_B, EFLAGS))),
let SchedRW = [WriteMicrocoded] in {
let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI], isCodeGenOnly = 1 in {
def REP_MOVSB_32 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
- [(X86rep_movs i8)], IIC_REP_MOVS>, REP,
+ [(X86rep_movs i8)]>, REP,
Requires<[Not64BitMode]>;
def REP_MOVSW_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
- [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize16,
+ [(X86rep_movs i16)]>, REP, OpSize16,
Requires<[Not64BitMode]>;
def REP_MOVSD_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
- [(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32,
+ [(X86rep_movs i32)]>, REP, OpSize32,
Requires<[Not64BitMode]>;
}
let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI], isCodeGenOnly = 1 in {
def REP_MOVSB_64 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
- [(X86rep_movs i8)], IIC_REP_MOVS>, REP,
+ [(X86rep_movs i8)]>, REP,
Requires<[In64BitMode]>;
def REP_MOVSW_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
- [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize16,
+ [(X86rep_movs i16)]>, REP, OpSize16,
Requires<[In64BitMode]>;
def REP_MOVSD_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
- [(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32,
+ [(X86rep_movs i32)]>, REP, OpSize32,
Requires<[In64BitMode]>;
def REP_MOVSQ_64 : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq|rep movsq}",
- [(X86rep_movs i64)], IIC_REP_MOVS>, REP,
+ [(X86rep_movs i64)]>, REP,
Requires<[In64BitMode]>;
}
@@ -427,36 +426,36 @@ def REP_MOVSQ_64 : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq|rep movsq}",
let Defs = [ECX,EDI], isCodeGenOnly = 1 in {
let Uses = [AL,ECX,EDI] in
def REP_STOSB_32 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}",
- [(X86rep_stos i8)], IIC_REP_STOS>, REP,
+ [(X86rep_stos i8)]>, REP,
Requires<[Not64BitMode]>;
let Uses = [AX,ECX,EDI] in
def REP_STOSW_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}",
- [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16,
+ [(X86rep_stos i16)]>, REP, OpSize16,
Requires<[Not64BitMode]>;
let Uses = [EAX,ECX,EDI] in
def REP_STOSD_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
- [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32,
+ [(X86rep_stos i32)]>, REP, OpSize32,
Requires<[Not64BitMode]>;
}
let Defs = [RCX,RDI], isCodeGenOnly = 1 in {
let Uses = [AL,RCX,RDI] in
def REP_STOSB_64 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}",
- [(X86rep_stos i8)], IIC_REP_STOS>, REP,
- Requires<[In64BitMode]>;
+ [(X86rep_stos i8)]>, REP,
+ Requires<[In64BitMode]>;
let Uses = [AX,RCX,RDI] in
def REP_STOSW_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}",
- [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16,
- Requires<[In64BitMode]>;
+ [(X86rep_stos i16)]>, REP, OpSize16,
+ Requires<[In64BitMode]>;
let Uses = [RAX,RCX,RDI] in
def REP_STOSD_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
- [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32,
- Requires<[In64BitMode]>;
+ [(X86rep_stos i32)]>, REP, OpSize32,
+ Requires<[In64BitMode]>;
let Uses = [RAX,RCX,RDI] in
def REP_STOSQ_64 : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}",
- [(X86rep_stos i64)], IIC_REP_STOS>, REP,
- Requires<[In64BitMode]>;
+ [(X86rep_stos i64)]>, REP,
+ Requires<[In64BitMode]>;
}
} // SchedRW
@@ -473,7 +472,7 @@ let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7,
ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7,
MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
- XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS, DF],
usesCustomInserter = 1, Uses = [ESP, SSP] in {
def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
"# TLS_addr32",
@@ -493,7 +492,7 @@ let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7,
MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
- XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS, DF],
usesCustomInserter = 1, Uses = [RSP, SSP] in {
def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
"# TLS_addr64",
@@ -509,7 +508,7 @@ def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
// For i386, the address of the thunk is passed on the stack, on return the
// address of the variable is in %eax. %ecx is trashed during the function
// call. All other registers are preserved.
-let Defs = [EAX, ECX, EFLAGS],
+let Defs = [EAX, ECX, EFLAGS, DF],
Uses = [ESP, SSP],
usesCustomInserter = 1 in
def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
@@ -522,7 +521,7 @@ def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
// %rdi. The lowering will do the right thing with RDI.
// On return the address of the variable is in %rax. All other
// registers are preserved.
-let Defs = [RAX, EFLAGS],
+let Defs = [RAX, EFLAGS, DF],
Uses = [RSP, SSP],
usesCustomInserter = 1 in
def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
@@ -568,7 +567,7 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in {
defm _FR32 : CMOVrr_PSEUDO<FR32, f32>;
defm _FR64 : CMOVrr_PSEUDO<FR64, f64>;
- defm _FR128 : CMOVrr_PSEUDO<FR128, f128>;
+ defm _F128 : CMOVrr_PSEUDO<VR128, f128>;
defm _V4F32 : CMOVrr_PSEUDO<VR128, v4f32>;
defm _V2F64 : CMOVrr_PSEUDO<VR128, v2f64>;
defm _V2I64 : CMOVrr_PSEUDO<VR128, v2i64>;
@@ -595,9 +594,9 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in {
// TODO: Get this to fold the constant into the instruction.
let isCodeGenOnly = 1, Defs = [EFLAGS] in
def OR32mrLocked : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero),
- "or{l}\t{$zero, $dst|$dst, $zero}", [],
- IIC_ALU_MEM>, Requires<[Not64BitMode]>, OpSize32, LOCK,
- Sched<[WriteALULd, WriteRMW]>;
+ "or{l}\t{$zero, $dst|$dst, $zero}", []>,
+ Requires<[Not64BitMode]>, OpSize32, LOCK,
+ Sched<[WriteALULd, WriteRMW]>;
let hasSideEffects = 1 in
def Int_MemBarrier : I<0, Pseudo, (outs), (ins),
@@ -618,89 +617,85 @@ def NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
!strconcat(mnemonic, "{b}\t",
"{$src2, $dst|$dst, $src2}"),
- [(set EFLAGS, (Op addr:$dst, GR8:$src2))],
- IIC_ALU_NONMEM>, LOCK;
+ [(set EFLAGS, (Op addr:$dst, GR8:$src2))]>, LOCK;
def NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
!strconcat(mnemonic, "{w}\t",
"{$src2, $dst|$dst, $src2}"),
- [(set EFLAGS, (Op addr:$dst, GR16:$src2))],
- IIC_ALU_NONMEM>, OpSize16, LOCK;
+ [(set EFLAGS, (Op addr:$dst, GR16:$src2))]>,
+ OpSize16, LOCK;
def NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
!strconcat(mnemonic, "{l}\t",
"{$src2, $dst|$dst, $src2}"),
- [(set EFLAGS, (Op addr:$dst, GR32:$src2))],
- IIC_ALU_NONMEM>, OpSize32, LOCK;
+ [(set EFLAGS, (Op addr:$dst, GR32:$src2))]>,
+ OpSize32, LOCK;
def NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
!strconcat(mnemonic, "{q}\t",
"{$src2, $dst|$dst, $src2}"),
- [(set EFLAGS, (Op addr:$dst, GR64:$src2))],
- IIC_ALU_NONMEM>, LOCK;
+ [(set EFLAGS, (Op addr:$dst, GR64:$src2))]>, LOCK;
def NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 },
ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2),
!strconcat(mnemonic, "{b}\t",
"{$src2, $dst|$dst, $src2}"),
- [(set EFLAGS, (Op addr:$dst, (i8 imm:$src2)))],
- IIC_ALU_MEM>, LOCK;
+ [(set EFLAGS, (Op addr:$dst, (i8 imm:$src2)))]>, LOCK;
def NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2),
!strconcat(mnemonic, "{w}\t",
"{$src2, $dst|$dst, $src2}"),
- [(set EFLAGS, (Op addr:$dst, (i16 imm:$src2)))],
- IIC_ALU_MEM>, OpSize16, LOCK;
+ [(set EFLAGS, (Op addr:$dst, (i16 imm:$src2)))]>,
+ OpSize16, LOCK;
def NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2),
!strconcat(mnemonic, "{l}\t",
"{$src2, $dst|$dst, $src2}"),
- [(set EFLAGS, (Op addr:$dst, (i32 imm:$src2)))],
- IIC_ALU_MEM>, OpSize32, LOCK;
+ [(set EFLAGS, (Op addr:$dst, (i32 imm:$src2)))]>,
+ OpSize32, LOCK;
def NAME#64mi32 : RIi32S<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2),
!strconcat(mnemonic, "{q}\t",
"{$src2, $dst|$dst, $src2}"),
- [(set EFLAGS, (Op addr:$dst, i64immSExt32:$src2))],
- IIC_ALU_MEM>, LOCK;
+ [(set EFLAGS, (Op addr:$dst, i64immSExt32:$src2))]>,
+ LOCK;
def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2),
!strconcat(mnemonic, "{w}\t",
"{$src2, $dst|$dst, $src2}"),
- [(set EFLAGS, (Op addr:$dst, i16immSExt8:$src2))],
- IIC_ALU_MEM>, OpSize16, LOCK;
+ [(set EFLAGS, (Op addr:$dst, i16immSExt8:$src2))]>,
+ OpSize16, LOCK;
def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2),
!strconcat(mnemonic, "{l}\t",
"{$src2, $dst|$dst, $src2}"),
- [(set EFLAGS, (Op addr:$dst, i32immSExt8:$src2))],
- IIC_ALU_MEM>, OpSize32, LOCK;
+ [(set EFLAGS, (Op addr:$dst, i32immSExt8:$src2))]>,
+ OpSize32, LOCK;
def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2),
!strconcat(mnemonic, "{q}\t",
"{$src2, $dst|$dst, $src2}"),
- [(set EFLAGS, (Op addr:$dst, i64immSExt8:$src2))],
- IIC_ALU_MEM>, LOCK;
-
+ [(set EFLAGS, (Op addr:$dst, i64immSExt8:$src2))]>,
+ LOCK;
}
}
@@ -717,20 +712,20 @@ let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
SchedRW = [WriteALULd, WriteRMW] in {
def NAME#8m : I<Opc8, Form, (outs), (ins i8mem :$dst),
!strconcat(mnemonic, "{b}\t$dst"),
- [(set EFLAGS, (!cast<PatFrag>(frag # "_8") addr:$dst))],
- IIC_UNARY_MEM>, LOCK;
+ [(set EFLAGS, (!cast<PatFrag>(frag # "_8") addr:$dst))]>,
+ LOCK;
def NAME#16m : I<Opc, Form, (outs), (ins i16mem:$dst),
!strconcat(mnemonic, "{w}\t$dst"),
- [(set EFLAGS, (!cast<PatFrag>(frag # "_16") addr:$dst))],
- IIC_UNARY_MEM>, OpSize16, LOCK;
+ [(set EFLAGS, (!cast<PatFrag>(frag # "_16") addr:$dst))]>,
+ OpSize16, LOCK;
def NAME#32m : I<Opc, Form, (outs), (ins i32mem:$dst),
!strconcat(mnemonic, "{l}\t$dst"),
- [(set EFLAGS, (!cast<PatFrag>(frag # "_32") addr:$dst))],
- IIC_UNARY_MEM>, OpSize32, LOCK;
+ [(set EFLAGS, (!cast<PatFrag>(frag # "_32") addr:$dst))]>,
+ OpSize32, LOCK;
def NAME#64m : RI<Opc, Form, (outs), (ins i64mem:$dst),
!strconcat(mnemonic, "{q}\t$dst"),
- [(set EFLAGS, (!cast<PatFrag>(frag # "_64") addr:$dst))],
- IIC_UNARY_MEM>, LOCK;
+ [(set EFLAGS, (!cast<PatFrag>(frag # "_64") addr:$dst))]>,
+ LOCK;
}
}
@@ -761,43 +756,39 @@ defm LOCK_DEC : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, "X86lock_dec", "dec">;
// Atomic compare and swap.
multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic,
- SDPatternOperator frag, X86MemOperand x86memop,
- InstrItinClass itin> {
+ SDPatternOperator frag, X86MemOperand x86memop> {
let isCodeGenOnly = 1, usesCustomInserter = 1 in {
def NAME : I<Opc, Form, (outs), (ins x86memop:$ptr),
!strconcat(mnemonic, "\t$ptr"),
- [(frag addr:$ptr)], itin>, TB, LOCK;
+ [(frag addr:$ptr)]>, TB, LOCK;
}
}
multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form,
- string mnemonic, SDPatternOperator frag,
- InstrItinClass itin8, InstrItinClass itin> {
+ string mnemonic, SDPatternOperator frag> {
let isCodeGenOnly = 1, SchedRW = [WriteALULd, WriteRMW] in {
let Defs = [AL, EFLAGS], Uses = [AL] in
def NAME#8 : I<Opc8, Form, (outs), (ins i8mem:$ptr, GR8:$swap),
!strconcat(mnemonic, "{b}\t{$swap, $ptr|$ptr, $swap}"),
- [(frag addr:$ptr, GR8:$swap, 1)], itin8>, TB, LOCK;
+ [(frag addr:$ptr, GR8:$swap, 1)]>, TB, LOCK;
let Defs = [AX, EFLAGS], Uses = [AX] in
def NAME#16 : I<Opc, Form, (outs), (ins i16mem:$ptr, GR16:$swap),
!strconcat(mnemonic, "{w}\t{$swap, $ptr|$ptr, $swap}"),
- [(frag addr:$ptr, GR16:$swap, 2)], itin>, TB, OpSize16, LOCK;
+ [(frag addr:$ptr, GR16:$swap, 2)]>, TB, OpSize16, LOCK;
let Defs = [EAX, EFLAGS], Uses = [EAX] in
def NAME#32 : I<Opc, Form, (outs), (ins i32mem:$ptr, GR32:$swap),
!strconcat(mnemonic, "{l}\t{$swap, $ptr|$ptr, $swap}"),
- [(frag addr:$ptr, GR32:$swap, 4)], itin>, TB, OpSize32, LOCK;
+ [(frag addr:$ptr, GR32:$swap, 4)]>, TB, OpSize32, LOCK;
let Defs = [RAX, EFLAGS], Uses = [RAX] in
def NAME#64 : RI<Opc, Form, (outs), (ins i64mem:$ptr, GR64:$swap),
!strconcat(mnemonic, "{q}\t{$swap, $ptr|$ptr, $swap}"),
- [(frag addr:$ptr, GR64:$swap, 8)], itin>, TB, LOCK;
+ [(frag addr:$ptr, GR64:$swap, 8)]>, TB, LOCK;
}
}
let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX],
SchedRW = [WriteALULd, WriteRMW] in {
-defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b",
- X86cas8, i64mem,
- IIC_CMPX_LOCK_8B>;
+defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b", X86cas8, i64mem>;
}
// This pseudo must be used when the frame uses RBX as
@@ -827,16 +818,14 @@ def LCMPXCHG8B_SAVE_EBX :
(ins i64mem:$ptr, GR32:$ebx_input, GR32:$ebx_save),
!strconcat("cmpxchg8b", "\t$ptr"),
[(set GR32:$dst, (X86cas8save_ebx addr:$ptr, GR32:$ebx_input,
- GR32:$ebx_save))],
- IIC_CMPX_LOCK_8B>;
+ GR32:$ebx_save))]>;
}
let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
Predicates = [HasCmpxchg16b], SchedRW = [WriteALULd, WriteRMW] in {
defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b",
- X86cas16, i128mem,
- IIC_CMPX_LOCK_16B>, REX_W;
+ X86cas16, i128mem>, REX_W;
}
// Same as LCMPXCHG8B_SAVE_RBX but for the 16 Bytes variant.
@@ -849,52 +838,45 @@ def LCMPXCHG16B_SAVE_RBX :
(ins i128mem:$ptr, GR64:$rbx_input, GR64:$rbx_save),
!strconcat("cmpxchg16b", "\t$ptr"),
[(set GR64:$dst, (X86cas16save_rbx addr:$ptr, GR64:$rbx_input,
- GR64:$rbx_save))],
- IIC_CMPX_LOCK_16B>;
+ GR64:$rbx_save))]>;
}
-defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg",
- X86cas, IIC_CMPX_LOCK_8, IIC_CMPX_LOCK>;
+defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg", X86cas>;
// Atomic exchange and add
multiclass ATOMIC_LOAD_BINOP<bits<8> opc8, bits<8> opc, string mnemonic,
- string frag,
- InstrItinClass itin8, InstrItinClass itin> {
+ string frag> {
let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1,
SchedRW = [WriteALULd, WriteRMW] in {
def NAME#8 : I<opc8, MRMSrcMem, (outs GR8:$dst),
(ins GR8:$val, i8mem:$ptr),
!strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
[(set GR8:$dst,
- (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))],
- itin8>;
+ (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))]>;
def NAME#16 : I<opc, MRMSrcMem, (outs GR16:$dst),
(ins GR16:$val, i16mem:$ptr),
!strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
[(set
GR16:$dst,
- (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
- itin>, OpSize16;
+ (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))]>,
+ OpSize16;
def NAME#32 : I<opc, MRMSrcMem, (outs GR32:$dst),
(ins GR32:$val, i32mem:$ptr),
!strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
[(set
GR32:$dst,
- (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
- itin>, OpSize32;
+ (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))]>,
+ OpSize32;
def NAME#64 : RI<opc, MRMSrcMem, (outs GR64:$dst),
(ins GR64:$val, i64mem:$ptr),
!strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
[(set
GR64:$dst,
- (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))],
- itin>;
+ (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))]>;
}
}
-defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add",
- IIC_XADD_LOCK_MEM8, IIC_XADD_LOCK_MEM>,
- TB, LOCK;
+defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add">, TB, LOCK;
/* The following multiclass tries to make sure that in code like
* x.store (immediate op x.load(acquire), release)
@@ -1146,14 +1128,14 @@ def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off),
def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
(TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>,
- Requires<[Not64BitMode]>;
+ Requires<[Not64BitMode, NotUseRetpoline]>;
// FIXME: This is disabled for 32-bit PIC mode because the global base
// register which is part of the address mode may be assigned a
// callee-saved register.
def : Pat<(X86tcret (load addr:$dst), imm:$off),
(TCRETURNmi addr:$dst, imm:$off)>,
- Requires<[Not64BitMode, IsNotPIC]>;
+ Requires<[Not64BitMode, IsNotPIC, NotUseRetpoline]>;
def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off),
(TCRETURNdi tglobaladdr:$dst, imm:$off)>,
@@ -1165,13 +1147,21 @@ def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off),
def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
(TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>,
- Requires<[In64BitMode]>;
+ Requires<[In64BitMode, NotUseRetpoline]>;
// Don't fold loads into X86tcret requiring more than 6 regs.
// There wouldn't be enough scratch registers for base+index.
def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off),
(TCRETURNmi64 addr:$dst, imm:$off)>,
- Requires<[In64BitMode]>;
+ Requires<[In64BitMode, NotUseRetpoline]>;
+
+def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
+ (RETPOLINE_TCRETURN64 ptr_rc_tailcall:$dst, imm:$off)>,
+ Requires<[In64BitMode, UseRetpoline]>;
+
+def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
+ (RETPOLINE_TCRETURN32 ptr_rc_tailcall:$dst, imm:$off)>,
+ Requires<[Not64BitMode, UseRetpoline]>;
def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off),
(TCRETURNdi64 tglobaladdr:$dst, imm:$off)>,
@@ -1368,12 +1358,50 @@ def ADD64ri8_DB : I<0, Pseudo,
i64immSExt8:$src2))]>;
def ADD64ri32_DB : I<0, Pseudo,
(outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
- "", // orq/addq REG, imm
- [(set GR64:$dst, (or_is_add GR64:$src1,
- i64immSExt32:$src2))]>;
+ "", // orq/addq REG, imm
+ [(set GR64:$dst, (or_is_add GR64:$src1,
+ i64immSExt32:$src2))]>;
}
} // AddedComplexity, SchedRW
+//===----------------------------------------------------------------------===//
+// Pattern match SUB as XOR
+//===----------------------------------------------------------------------===//
+
+// An immediate in the LHS of a subtract can't be encoded in the instruction.
+// If there is no possibility of a borrow we can use an XOR instead of a SUB
+// to enable the immediate to be folded.
+// TODO: Move this to a DAG combine?
+
+def sub_is_xor : PatFrag<(ops node:$lhs, node:$rhs), (sub node:$lhs, node:$rhs),[{
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+ KnownBits Known;
+ CurDAG->computeKnownBits(N->getOperand(1), Known);
+
+ // If all possible ones in the RHS are set in the LHS then there can't be
+ // a borrow and we can use xor.
+ return (~Known.Zero).isSubsetOf(CN->getAPIntValue());
+ }
+
+ return false;
+}]>;
+
+let AddedComplexity = 5 in {
+def : Pat<(sub_is_xor imm:$src2, GR8:$src1),
+ (XOR8ri GR8:$src1, imm:$src2)>;
+def : Pat<(sub_is_xor i16immSExt8:$src2, GR16:$src1),
+ (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(sub_is_xor imm:$src2, GR16:$src1),
+ (XOR16ri GR16:$src1, imm:$src2)>;
+def : Pat<(sub_is_xor i32immSExt8:$src2, GR32:$src1),
+ (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(sub_is_xor imm:$src2, GR32:$src1),
+ (XOR32ri GR32:$src1, imm:$src2)>;
+def : Pat<(sub_is_xor i64immSExt8:$src2, GR64:$src1),
+ (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(sub_is_xor i64immSExt32:$src2, GR64:$src1),
+ (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>;
+}
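
The sub_is_xor fragment relies on the fact that an immediate-minus-register subtraction can never borrow when every bit that might be set in the register is already set in the immediate, so C - x == C ^ x and the constant becomes encodable in the XOR form. A minimal standalone C++ sketch of that condition (illustrative only, not the LLVM KnownBits API):

#include <cassert>
#include <cstdint>

// C - X can be rewritten as C ^ X whenever every bit that may be 1 in X is
// already 1 in C: with no possible borrow, each result bit is just C_i XOR X_i.
static bool subIsXor(uint64_t C, uint64_t PossibleOnesOfX) {
  return (PossibleOnesOfX & ~C) == 0;
}

int main() {
  uint64_t C = 0xFF, X = 0x2A;   // X's one bits are a subset of C's
  assert(subIsXor(C, X));
  assert(C - X == (C ^ X));      // 0xFF - 0x2A == 0xFF ^ 0x2A == 0xD5
}
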
//===----------------------------------------------------------------------===//
// Some peepholes
@@ -1463,6 +1491,37 @@ def : Pat<(and GR64:$src, 0xff),
} // AddedComplexity = 1
+// Try to use BTS/BTR/BTC for single bit operations on the upper 32-bits.
+
+def BTRXForm : SDNodeXForm<imm, [{
+ // Transformation function: Find the lowest 0.
+ return getI64Imm((uint8_t)N->getAPIntValue().countTrailingOnes(), SDLoc(N));
+}]>;
+
+def BTCBTSXForm : SDNodeXForm<imm, [{
+ // Transformation function: Find the lowest 1.
+ return getI64Imm((uint8_t)N->getAPIntValue().countTrailingZeros(), SDLoc(N));
+}]>;
+
+def BTRMask64 : ImmLeaf<i64, [{
+ return !isUInt<32>(Imm) && !isInt<32>(Imm) && isPowerOf2_64(~Imm);
+}]>;
+
+def BTCBTSMask64 : ImmLeaf<i64, [{
+ return !isInt<32>(Imm) && isPowerOf2_64(Imm);
+}]>;
+
+// For now only do this for optsize.
+let AddedComplexity = 1, Predicates=[OptForSize] in {
+ def : Pat<(and GR64:$src1, BTRMask64:$mask),
+ (BTR64ri8 GR64:$src1, (BTRXForm imm:$mask))>;
+ def : Pat<(or GR64:$src1, BTCBTSMask64:$mask),
+ (BTS64ri8 GR64:$src1, (BTCBTSXForm imm:$mask))>;
+ def : Pat<(xor GR64:$src1, BTCBTSMask64:$mask),
+ (BTC64ri8 GR64:$src1, (BTCBTSXForm imm:$mask))>;
+}
+
+
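The two SDNodeXForms above recover a one-byte bit index from a 64-bit mask that is not encodable as a 32-bit immediate: for an AND mask with a single clear bit the index is the count of trailing ones, and for an OR/XOR mask with a single set bit it is the count of trailing zeros, so a movabs-sized constant can be replaced by a btr/bts/btc with an imm8 when optimizing for size. A small C++20 sketch of that index recovery (using <bit>, not the LLVM APInt helpers):

#include <bit>
#include <cassert>
#include <cstdint>

// Mirrors BTRXForm (count of trailing ones = position of the lone 0 bit) and
// BTCBTSXForm (count of trailing zeros = position of the lone 1 bit).
static unsigned btrIndex(uint64_t Mask) { return std::countr_one(Mask); }
static unsigned btsIndex(uint64_t Mask) { return std::countr_zero(Mask); }

int main() {
  uint64_t AndMask = ~(uint64_t(1) << 40);  // clears only bit 40; not an imm32
  uint64_t OrMask  =  (uint64_t(1) << 40);  // sets (or toggles) only bit 40
  assert(btrIndex(AndMask) == 40);          // and -> BTR64ri8 reg, 40
  assert(btsIndex(OrMask)  == 40);          // or -> BTS64ri8, xor -> BTC64ri8
}
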
// sext_inreg patterns
def : Pat<(sext_inreg GR32:$src, i16),
(MOVSX32rr16 (EXTRACT_SUBREG GR32:$src, sub_16bit))>;
@@ -1514,6 +1573,10 @@ def : Pat<(i8 (trunc GR16:$src)),
(EXTRACT_SUBREG GR16:$src, sub_8bit)>,
Requires<[In64BitMode]>;
+def immff00_ffff : ImmLeaf<i32, [{
+ return Imm >= 0xff00 && Imm <= 0xffff;
+}]>;
+
// h-register tricks
def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))),
(EXTRACT_SUBREG GR16:$src, sub_8bit_hi)>,
@@ -1526,16 +1589,16 @@ def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))),
Requires<[Not64BitMode]>;
def : Pat<(srl GR16:$src, (i8 8)),
(EXTRACT_SUBREG
- (MOVZX32_NOREXrr8 (EXTRACT_SUBREG GR16:$src, sub_8bit_hi)),
+ (MOVZX32rr8_NOREX (EXTRACT_SUBREG GR16:$src, sub_8bit_hi)),
sub_16bit)>;
def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
- (MOVZX32_NOREXrr8 (EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>;
+ (MOVZX32rr8_NOREX (EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>;
def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
- (MOVZX32_NOREXrr8 (EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>;
+ (MOVZX32rr8_NOREX (EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>;
def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
- (MOVZX32_NOREXrr8 (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>;
-def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
- (MOVZX32_NOREXrr8 (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>;
+ (MOVZX32rr8_NOREX (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>;
+def : Pat<(srl (and_su GR32:$src, immff00_ffff), (i8 8)),
+ (MOVZX32rr8_NOREX (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>;
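
Any mask accepted by immff00_ffff covers bits 15:8 completely and nothing above them, and whatever it keeps of bits 7:0 is discarded by the following shift by 8, so the whole and+srl expression reduces to a zero-extended high-byte extract (which is what the MOVZX32rr8_NOREX of sub_8bit_hi performs). A quick standalone C++ check of that equivalence:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t x = 0xCAFEBABEu;
  // For every mask in [0xff00, 0xffff], (x & m) >> 8 is just the second byte.
  for (uint32_t m = 0xff00; m <= 0xffff; ++m)
    assert(((x & m) >> 8) == ((x >> 8) & 0xffu));
}
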
// h-register tricks.
// For now, be conservative on x86-64 and use an h-register extract only if the
@@ -1548,19 +1611,19 @@ def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
def : Pat<(and (srl_su GR64:$src, (i8 8)), (i64 255)),
(SUBREG_TO_REG
(i64 0),
- (MOVZX32_NOREXrr8
+ (MOVZX32rr8_NOREX
(EXTRACT_SUBREG GR64:$src, sub_8bit_hi)),
sub_32bit)>;
def : Pat<(i64 (zext (srl_su GR16:$src, (i8 8)))),
(SUBREG_TO_REG
(i64 0),
- (MOVZX32_NOREXrr8
+ (MOVZX32rr8_NOREX
(EXTRACT_SUBREG GR16:$src, sub_8bit_hi)),
sub_32bit)>;
def : Pat<(i64 (anyext (srl_su GR16:$src, (i8 8)))),
(SUBREG_TO_REG
(i64 0),
- (MOVZX32_NOREXrr8
+ (MOVZX32rr8_NOREX
(EXTRACT_SUBREG GR16:$src, sub_8bit_hi)),
sub_32bit)>;
@@ -1711,36 +1774,65 @@ let Predicates = [HasBMI2] in {
(i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
}
- let AddedComplexity = -20 in {
- def : Pat<(sra (loadi32 addr:$src1), (and GR8:$src2, immShift32)),
- (SARX32rm addr:$src1,
- (INSERT_SUBREG
- (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(sra (loadi64 addr:$src1), (and GR8:$src2, immShift64)),
- (SARX64rm addr:$src1,
- (INSERT_SUBREG
- (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
-
- def : Pat<(srl (loadi32 addr:$src1), (and GR8:$src2, immShift32)),
- (SHRX32rm addr:$src1,
- (INSERT_SUBREG
- (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(srl (loadi64 addr:$src1), (and GR8:$src2, immShift64)),
- (SHRX64rm addr:$src1,
- (INSERT_SUBREG
- (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(sra (loadi32 addr:$src1), (and GR8:$src2, immShift32)),
+ (SARX32rm addr:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(sra (loadi64 addr:$src1), (and GR8:$src2, immShift64)),
+ (SARX64rm addr:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(srl (loadi32 addr:$src1), (and GR8:$src2, immShift32)),
+ (SHRX32rm addr:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(srl (loadi64 addr:$src1), (and GR8:$src2, immShift64)),
+ (SHRX64rm addr:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(shl (loadi32 addr:$src1), (and GR8:$src2, immShift32)),
+ (SHLX32rm addr:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(shl (loadi64 addr:$src1), (and GR8:$src2, immShift64)),
+ (SHLX64rm addr:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+}
- def : Pat<(shl (loadi32 addr:$src1), (and GR8:$src2, immShift32)),
- (SHLX32rm addr:$src1,
- (INSERT_SUBREG
- (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(shl (loadi64 addr:$src1), (and GR8:$src2, immShift64)),
- (SHLX64rm addr:$src1,
- (INSERT_SUBREG
- (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- }
+// Use BTR/BTS/BTC for clearing/setting/toggling a bit in a variable location.
+multiclass one_bit_patterns<RegisterClass RC, ValueType VT, Instruction BTR,
+ Instruction BTS, Instruction BTC,
+ ImmLeaf ImmShift> {
+ def : Pat<(and RC:$src1, (rotl -2, GR8:$src2)),
+ (BTR RC:$src1,
+ (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(or RC:$src1, (shl 1, GR8:$src2)),
+ (BTS RC:$src1,
+ (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(xor RC:$src1, (shl 1, GR8:$src2)),
+ (BTC RC:$src1,
+ (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ // Similar to above, but removing unneeded masking of the shift amount.
+ def : Pat<(and RC:$src1, (rotl -2, (and GR8:$src2, ImmShift))),
+ (BTR RC:$src1,
+ (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(or RC:$src1, (shl 1, (and GR8:$src2, ImmShift))),
+ (BTS RC:$src1,
+ (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(xor RC:$src1, (shl 1, (and GR8:$src2, ImmShift))),
+ (BTC RC:$src1,
+ (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
}
+defm : one_bit_patterns<GR16, i16, BTR16rr, BTS16rr, BTC16rr, immShift16>;
+defm : one_bit_patterns<GR32, i32, BTR32rr, BTS32rr, BTC32rr, immShift32>;
+defm : one_bit_patterns<GR64, i64, BTR64rr, BTS64rr, BTC64rr, immShift64>;
+
+
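The one_bit_patterns multiclass matches three standard idioms: x & rotl(-2, n) clears bit n (rotl(-2, n) is all-ones except bit n), x | (1 << n) sets it, and x ^ (1 << n) toggles it. The ImmShift-masked variants are equivalent because the bt-family register forms already take the bit position modulo the operand width. A standalone C++ sketch of the 32-bit case:

#include <cassert>
#include <cstdint>

static uint32_t rotl32(uint32_t v, unsigned n) {
  n &= 31;
  return n ? (v << n) | (v >> (32 - n)) : v;
}

int main() {
  uint32_t x = 0xDEADBEEFu;
  unsigned n = 13;
  uint32_t cleared = x & rotl32(0xFFFFFFFEu, n);  // shape matched to BTR32rr
  uint32_t set     = x | (1u << n);               // shape matched to BTS32rr
  uint32_t toggled = x ^ (1u << n);               // shape matched to BTC32rr
  assert(((cleared >> n) & 1) == 0);
  assert(((set     >> n) & 1) == 1);
  assert(((toggled >> n) & 1) == (((x >> n) & 1) ^ 1));
}
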
// (anyext (setcc_carry)) -> (setcc_carry)
def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
(SETB_C16r)>;
@@ -1757,6 +1849,7 @@ def : Pat<(i32 (anyext (i16 (X86setcc_c X86_COND_B, EFLAGS)))),
def : Pat<(add GR8 :$src1, GR8 :$src2), (ADD8rr GR8 :$src1, GR8 :$src2)>;
def : Pat<(add GR16:$src1, GR16:$src2), (ADD16rr GR16:$src1, GR16:$src2)>;
def : Pat<(add GR32:$src1, GR32:$src2), (ADD32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(add GR64:$src1, GR64:$src2), (ADD64rr GR64:$src1, GR64:$src2)>;
// add reg, mem
def : Pat<(add GR8:$src1, (loadi8 addr:$src2)),
@@ -1765,6 +1858,8 @@ def : Pat<(add GR16:$src1, (loadi16 addr:$src2)),
(ADD16rm GR16:$src1, addr:$src2)>;
def : Pat<(add GR32:$src1, (loadi32 addr:$src2)),
(ADD32rm GR32:$src1, addr:$src2)>;
+def : Pat<(add GR64:$src1, (loadi64 addr:$src2)),
+ (ADD64rm GR64:$src1, addr:$src2)>;
// add reg, imm
def : Pat<(add GR8 :$src1, imm:$src2), (ADD8ri GR8:$src1 , imm:$src2)>;
@@ -1774,11 +1869,16 @@ def : Pat<(add GR16:$src1, i16immSExt8:$src2),
(ADD16ri8 GR16:$src1, i16immSExt8:$src2)>;
def : Pat<(add GR32:$src1, i32immSExt8:$src2),
(ADD32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(add GR64:$src1, i64immSExt8:$src2),
+ (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(add GR64:$src1, i64immSExt32:$src2),
+ (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>;
// sub reg, reg
def : Pat<(sub GR8 :$src1, GR8 :$src2), (SUB8rr GR8 :$src1, GR8 :$src2)>;
def : Pat<(sub GR16:$src1, GR16:$src2), (SUB16rr GR16:$src1, GR16:$src2)>;
def : Pat<(sub GR32:$src1, GR32:$src2), (SUB32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(sub GR64:$src1, GR64:$src2), (SUB64rr GR64:$src1, GR64:$src2)>;
// sub reg, mem
def : Pat<(sub GR8:$src1, (loadi8 addr:$src2)),
@@ -1787,6 +1887,8 @@ def : Pat<(sub GR16:$src1, (loadi16 addr:$src2)),
(SUB16rm GR16:$src1, addr:$src2)>;
def : Pat<(sub GR32:$src1, (loadi32 addr:$src2)),
(SUB32rm GR32:$src1, addr:$src2)>;
+def : Pat<(sub GR64:$src1, (loadi64 addr:$src2)),
+ (SUB64rm GR64:$src1, addr:$src2)>;
// sub reg, imm
def : Pat<(sub GR8:$src1, imm:$src2),
@@ -1799,6 +1901,10 @@ def : Pat<(sub GR16:$src1, i16immSExt8:$src2),
(SUB16ri8 GR16:$src1, i16immSExt8:$src2)>;
def : Pat<(sub GR32:$src1, i32immSExt8:$src2),
(SUB32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(sub GR64:$src1, i64immSExt8:$src2),
+ (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(sub GR64:$src1, i64immSExt32:$src2),
+ (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>;
// sub 0, reg
def : Pat<(X86sub_flag 0, GR8 :$src), (NEG8r GR8 :$src)>;
@@ -1817,12 +1923,16 @@ def : Pat<(mul GR16:$src1, GR16:$src2),
(IMUL16rr GR16:$src1, GR16:$src2)>;
def : Pat<(mul GR32:$src1, GR32:$src2),
(IMUL32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(mul GR64:$src1, GR64:$src2),
+ (IMUL64rr GR64:$src1, GR64:$src2)>;
// mul reg, mem
def : Pat<(mul GR16:$src1, (loadi16 addr:$src2)),
(IMUL16rm GR16:$src1, addr:$src2)>;
def : Pat<(mul GR32:$src1, (loadi32 addr:$src2)),
(IMUL32rm GR32:$src1, addr:$src2)>;
+def : Pat<(mul GR64:$src1, (loadi64 addr:$src2)),
+ (IMUL64rm GR64:$src1, addr:$src2)>;
// mul reg, imm
def : Pat<(mul GR16:$src1, imm:$src2),
@@ -1833,6 +1943,10 @@ def : Pat<(mul GR16:$src1, i16immSExt8:$src2),
(IMUL16rri8 GR16:$src1, i16immSExt8:$src2)>;
def : Pat<(mul GR32:$src1, i32immSExt8:$src2),
(IMUL32rri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(mul GR64:$src1, i64immSExt8:$src2),
+ (IMUL64rri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(mul GR64:$src1, i64immSExt32:$src2),
+ (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>;
// reg = mul mem, imm
def : Pat<(mul (loadi16 addr:$src1), imm:$src2),
@@ -1843,38 +1957,6 @@ def : Pat<(mul (loadi16 addr:$src1), i16immSExt8:$src2),
(IMUL16rmi8 addr:$src1, i16immSExt8:$src2)>;
def : Pat<(mul (loadi32 addr:$src1), i32immSExt8:$src2),
(IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>;
-
-// Patterns for nodes that do not produce flags, for instructions that do.
-
-// addition
-def : Pat<(add GR64:$src1, GR64:$src2),
- (ADD64rr GR64:$src1, GR64:$src2)>;
-def : Pat<(add GR64:$src1, i64immSExt8:$src2),
- (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
-def : Pat<(add GR64:$src1, i64immSExt32:$src2),
- (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>;
-def : Pat<(add GR64:$src1, (loadi64 addr:$src2)),
- (ADD64rm GR64:$src1, addr:$src2)>;
-
-// subtraction
-def : Pat<(sub GR64:$src1, GR64:$src2),
- (SUB64rr GR64:$src1, GR64:$src2)>;
-def : Pat<(sub GR64:$src1, (loadi64 addr:$src2)),
- (SUB64rm GR64:$src1, addr:$src2)>;
-def : Pat<(sub GR64:$src1, i64immSExt8:$src2),
- (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>;
-def : Pat<(sub GR64:$src1, i64immSExt32:$src2),
- (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>;
-
-// Multiply
-def : Pat<(mul GR64:$src1, GR64:$src2),
- (IMUL64rr GR64:$src1, GR64:$src2)>;
-def : Pat<(mul GR64:$src1, (loadi64 addr:$src2)),
- (IMUL64rm GR64:$src1, addr:$src2)>;
-def : Pat<(mul GR64:$src1, i64immSExt8:$src2),
- (IMUL64rri8 GR64:$src1, i64immSExt8:$src2)>;
-def : Pat<(mul GR64:$src1, i64immSExt32:$src2),
- (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>;
def : Pat<(mul (loadi64 addr:$src1), i64immSExt8:$src2),
(IMUL64rmi8 addr:$src1, i64immSExt8:$src2)>;
def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2),
@@ -1999,3 +2081,23 @@ def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>;
let Predicates = [HasMOVBE] in {
def : Pat<(bswap GR16:$src), (ROL16ri GR16:$src, (i8 8))>;
}
+
+// These patterns are selected by some custom code in X86ISelDAGToDAG.cpp that
+// custom combines and+srl into BEXTR. We use these patterns to avoid a bunch
+// of manual code for folding loads.
+let Predicates = [HasBMI, NoTBM] in {
+ def : Pat<(X86bextr GR32:$src1, (i32 imm:$src2)),
+ (BEXTR32rr GR32:$src1, (MOV32ri imm:$src2))>;
+ def : Pat<(X86bextr (loadi32 addr:$src1), (i32 imm:$src2)),
+ (BEXTR32rm addr:$src1, (MOV32ri imm:$src2))>;
+ def : Pat<(X86bextr GR64:$src1, mov64imm32:$src2),
+ (BEXTR64rr GR64:$src1,
+ (SUBREG_TO_REG (i64 0),
+ (MOV32ri64 mov64imm32:$src2),
+ sub_32bit))>;
+ def : Pat<(X86bextr (loadi64 addr:$src1), mov64imm32:$src2),
+ (BEXTR64rm addr:$src1,
+ (SUBREG_TO_REG (i64 0),
+ (MOV32ri64 mov64imm32:$src2),
+ sub_32bit))>;
+} // HasBMI, NoTBM
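
For reference, the control operand that the MOV32ri/MOV32ri64 in these patterns materializes packs the extracted field's start bit in bits 7:0 and its length in bits 15:8. A small standalone C++ sketch of the and+srl shape the custom combine rewrites into BEXTR (illustrative only; it ignores the instruction's out-of-range edge cases):

#include <cassert>
#include <cstdint>

// bextr(src, start, len) == (src >> start) & ((1 << len) - 1),
// assuming start < 32 and 0 < len < 32 for this sketch.
static uint32_t bextr32(uint32_t src, unsigned start, unsigned len) {
  return (src >> start) & ((1u << len) - 1);
}

int main() {
  uint32_t x = 0xCAFEBABEu;
  // (x >> 12) & 0xff is the shape that becomes BEXTR32rr with control
  // (len << 8) | start == (8 << 8) | 12 == 0x080C.
  assert(bextr32(x, 12, 8) == ((x >> 12) & 0xffu));
}
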
diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td
index 5581fd462a1d..650bce74dcf2 100644
--- a/lib/Target/X86/X86InstrControl.td
+++ b/lib/Target/X86/X86InstrControl.td
@@ -22,47 +22,37 @@
let isTerminator = 1, isReturn = 1, isBarrier = 1,
hasCtrlDep = 1, FPForm = SpecialFP, SchedRW = [WriteJumpLd] in {
def RETL : I <0xC3, RawFrm, (outs), (ins variable_ops),
- "ret{l}", [], IIC_RET>, OpSize32,
- Requires<[Not64BitMode]>;
+ "ret{l}", []>, OpSize32, Requires<[Not64BitMode]>;
def RETQ : I <0xC3, RawFrm, (outs), (ins variable_ops),
- "ret{q}", [], IIC_RET>, OpSize32,
- Requires<[In64BitMode]>;
+ "ret{q}", []>, OpSize32, Requires<[In64BitMode]>;
def RETW : I <0xC3, RawFrm, (outs), (ins),
- "ret{w}",
- [], IIC_RET>, OpSize16;
+ "ret{w}", []>, OpSize16;
def RETIL : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
- "ret{l}\t$amt",
- [], IIC_RET_IMM>, OpSize32,
- Requires<[Not64BitMode]>;
+ "ret{l}\t$amt", []>, OpSize32, Requires<[Not64BitMode]>;
def RETIQ : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
- "ret{q}\t$amt",
- [], IIC_RET_IMM>, OpSize32,
- Requires<[In64BitMode]>;
+ "ret{q}\t$amt", []>, OpSize32, Requires<[In64BitMode]>;
def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt),
- "ret{w}\t$amt",
- [], IIC_RET_IMM>, OpSize16;
+ "ret{w}\t$amt", []>, OpSize16;
def LRETL : I <0xCB, RawFrm, (outs), (ins),
- "{l}ret{l|f}", [], IIC_RET>, OpSize32;
+ "{l}ret{l|f}", []>, OpSize32;
def LRETQ : RI <0xCB, RawFrm, (outs), (ins),
- "{l}ret{|f}q", [], IIC_RET>, Requires<[In64BitMode]>;
+ "{l}ret{|f}q", []>, Requires<[In64BitMode]>;
def LRETW : I <0xCB, RawFrm, (outs), (ins),
- "{l}ret{w|f}", [], IIC_RET>, OpSize16;
+ "{l}ret{w|f}", []>, OpSize16;
def LRETIL : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
- "{l}ret{l|f}\t$amt", [], IIC_RET>, OpSize32;
+ "{l}ret{l|f}\t$amt", []>, OpSize32;
def LRETIQ : RIi16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
- "{l}ret{|f}q\t$amt", [], IIC_RET>, Requires<[In64BitMode]>;
+ "{l}ret{|f}q\t$amt", []>, Requires<[In64BitMode]>;
def LRETIW : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
- "{l}ret{w|f}\t$amt", [], IIC_RET>, OpSize16;
+ "{l}ret{w|f}\t$amt", []>, OpSize16;
// The machine return from interrupt instruction, but sometimes we need to
// perform a post-epilogue stack adjustment. Codegen emits the pseudo form
// which expands to include an SP adjustment if necessary.
- def IRET16 : I <0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>,
+ def IRET16 : I <0xcf, RawFrm, (outs), (ins), "iret{w}", []>,
OpSize16;
- def IRET32 : I <0xcf, RawFrm, (outs), (ins), "iret{l|d}", [],
- IIC_IRET>, OpSize32;
- def IRET64 : RI <0xcf, RawFrm, (outs), (ins), "iretq", [],
- IIC_IRET>, Requires<[In64BitMode]>;
+ def IRET32 : I <0xcf, RawFrm, (outs), (ins), "iret{l|d}", []>, OpSize32;
+ def IRET64 : RI <0xcf, RawFrm, (outs), (ins), "iretq", []>, Requires<[In64BitMode]>;
let isCodeGenOnly = 1 in
def IRET : PseudoI<(outs), (ins i32imm:$adj), [(X86iret timm:$adj)]>;
def RET : PseudoI<(outs), (ins i32imm:$adj, variable_ops), [(X86retflag timm:$adj)]>;
@@ -71,12 +61,12 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1,
// Unconditional branches.
let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst),
- "jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>;
+ "jmp\t$dst", [(br bb:$dst)]>;
let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
def JMP_2 : Ii16PCRel<0xE9, RawFrm, (outs), (ins brtarget16:$dst),
- "jmp\t$dst", [], IIC_JMP_REL>, OpSize16;
+ "jmp\t$dst", []>, OpSize16;
def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget32:$dst),
- "jmp\t$dst", [], IIC_JMP_REL>, OpSize32;
+ "jmp\t$dst", []>, OpSize32;
}
}
@@ -84,12 +74,12 @@ let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump] in {
multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> {
def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm,
- [(X86brcond bb:$dst, Cond, EFLAGS)], IIC_Jcc>;
+ [(X86brcond bb:$dst, Cond, EFLAGS)]>;
let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
def _2 : Ii16PCRel<opc4, RawFrm, (outs), (ins brtarget16:$dst), asm,
- [], IIC_Jcc>, OpSize16, TB;
+ []>, OpSize16, TB;
def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget32:$dst), asm,
- [], IIC_Jcc>, TB, OpSize32;
+ []>, TB, OpSize32;
}
}
}
@@ -118,69 +108,91 @@ let isBranch = 1, isTerminator = 1, hasSideEffects = 0, SchedRW = [WriteJump] in
// jecxz.
let Uses = [CX] in
def JCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
- "jcxz\t$dst", [], IIC_JCXZ>, AdSize16,
- Requires<[Not64BitMode]>;
+ "jcxz\t$dst", []>, AdSize16, Requires<[Not64BitMode]>;
let Uses = [ECX] in
def JECXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
- "jecxz\t$dst", [], IIC_JCXZ>, AdSize32;
+ "jecxz\t$dst", []>, AdSize32;
let Uses = [RCX] in
def JRCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
- "jrcxz\t$dst", [], IIC_JCXZ>, AdSize64,
- Requires<[In64BitMode]>;
+ "jrcxz\t$dst", []>, AdSize64, Requires<[In64BitMode]>;
}
// Indirect branches
let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
def JMP16r : I<0xFF, MRM4r, (outs), (ins GR16:$dst), "jmp{w}\t{*}$dst",
- [(brind GR16:$dst)], IIC_JMP_REG>, Requires<[Not64BitMode]>,
- OpSize16, Sched<[WriteJump]>;
+ [(brind GR16:$dst)]>, Requires<[Not64BitMode]>,
+ OpSize16, Sched<[WriteJump]>;
def JMP16m : I<0xFF, MRM4m, (outs), (ins i16mem:$dst), "jmp{w}\t{*}$dst",
- [(brind (loadi16 addr:$dst))], IIC_JMP_MEM>,
- Requires<[Not64BitMode]>, OpSize16, Sched<[WriteJumpLd]>;
+ [(brind (loadi16 addr:$dst))]>, Requires<[Not64BitMode]>,
+ OpSize16, Sched<[WriteJumpLd]>;
def JMP32r : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst",
- [(brind GR32:$dst)], IIC_JMP_REG>, Requires<[Not64BitMode]>,
- OpSize32, Sched<[WriteJump]>;
+ [(brind GR32:$dst)]>, Requires<[Not64BitMode]>,
+ OpSize32, Sched<[WriteJump]>;
def JMP32m : I<0xFF, MRM4m, (outs), (ins i32mem:$dst), "jmp{l}\t{*}$dst",
- [(brind (loadi32 addr:$dst))], IIC_JMP_MEM>,
- Requires<[Not64BitMode]>, OpSize32, Sched<[WriteJumpLd]>;
+ [(brind (loadi32 addr:$dst))]>, Requires<[Not64BitMode]>,
+ OpSize32, Sched<[WriteJumpLd]>;
def JMP64r : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst",
- [(brind GR64:$dst)], IIC_JMP_REG>, Requires<[In64BitMode]>,
- Sched<[WriteJump]>;
+ [(brind GR64:$dst)]>, Requires<[In64BitMode]>,
+ Sched<[WriteJump]>;
def JMP64m : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), "jmp{q}\t{*}$dst",
- [(brind (loadi64 addr:$dst))], IIC_JMP_MEM>,
- Requires<[In64BitMode]>, Sched<[WriteJumpLd]>;
+ [(brind (loadi64 addr:$dst))]>, Requires<[In64BitMode]>,
+ Sched<[WriteJumpLd]>;
+
+ // Non-tracking jumps for IBT, use with caution.
+ let isCodeGenOnly = 1 in {
+ def JMP16r_NT : I<0xFF, MRM4r, (outs), (ins GR16 : $dst), "jmp{w}\t{*}$dst",
+ [(X86NoTrackBrind GR16 : $dst)]>, Requires<[Not64BitMode]>,
+ OpSize16, Sched<[WriteJump]>, NOTRACK;
+
+ def JMP16m_NT : I<0xFF, MRM4m, (outs), (ins i16mem : $dst), "jmp{w}\t{*}$dst",
+ [(X86NoTrackBrind (loadi16 addr : $dst))]>,
+ Requires<[Not64BitMode]>, OpSize16, Sched<[WriteJumpLd]>,
+ NOTRACK;
+
+ def JMP32r_NT : I<0xFF, MRM4r, (outs), (ins GR32 : $dst), "jmp{l}\t{*}$dst",
+ [(X86NoTrackBrind GR32 : $dst)]>, Requires<[Not64BitMode]>,
+ OpSize32, Sched<[WriteJump]>, NOTRACK;
+ def JMP32m_NT : I<0xFF, MRM4m, (outs), (ins i32mem : $dst), "jmp{l}\t{*}$dst",
+ [(X86NoTrackBrind (loadi32 addr : $dst))]>,
+ Requires<[Not64BitMode]>, OpSize32, Sched<[WriteJumpLd]>,
+ NOTRACK;
+
+ def JMP64r_NT : I<0xFF, MRM4r, (outs), (ins GR64 : $dst), "jmp{q}\t{*}$dst",
+ [(X86NoTrackBrind GR64 : $dst)]>, Requires<[In64BitMode]>,
+ Sched<[WriteJump]>, NOTRACK;
+ def JMP64m_NT : I<0xFF, MRM4m, (outs), (ins i64mem : $dst), "jmp{q}\t{*}$dst",
+ [(X86NoTrackBrind(loadi64 addr : $dst))]>,
+ Requires<[In64BitMode]>, Sched<[WriteJumpLd]>, NOTRACK;
+ }
- let Predicates = [Not64BitMode] in {
+ let Predicates = [Not64BitMode], AsmVariantName = "att" in {
def FARJMP16i : Iseg16<0xEA, RawFrmImm16, (outs),
(ins i16imm:$off, i16imm:$seg),
- "ljmp{w}\t$seg, $off", [],
- IIC_JMP_FAR_PTR>, OpSize16, Sched<[WriteJump]>;
+ "ljmp{w}\t$seg, $off", []>,
+ OpSize16, Sched<[WriteJump]>;
def FARJMP32i : Iseg32<0xEA, RawFrmImm16, (outs),
(ins i32imm:$off, i16imm:$seg),
- "ljmp{l}\t$seg, $off", [],
- IIC_JMP_FAR_PTR>, OpSize32, Sched<[WriteJump]>;
+ "ljmp{l}\t$seg, $off", []>,
+ OpSize32, Sched<[WriteJump]>;
}
- def FARJMP64 : RI<0xFF, MRM5m, (outs), (ins opaque80mem:$dst),
- "ljmp{q}\t{*}$dst", [], IIC_JMP_FAR_MEM>,
- Sched<[WriteJump]>;
-
- def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaque32mem:$dst),
- "ljmp{w}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize16,
- Sched<[WriteJumpLd]>;
- def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaque48mem:$dst),
- "{l}jmp{l}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize32,
- Sched<[WriteJumpLd]>;
+ def FARJMP64 : RI<0xFF, MRM5m, (outs), (ins opaquemem:$dst),
+ "ljmp{q}\t{*}$dst", []>, Sched<[WriteJump]>, Requires<[In64BitMode]>;
+
+ let AsmVariantName = "att" in
+ def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaquemem:$dst),
+ "ljmp{w}\t{*}$dst", []>, OpSize16, Sched<[WriteJumpLd]>;
+ def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaquemem:$dst),
+ "{l}jmp{l}\t{*}$dst", []>, OpSize32, Sched<[WriteJumpLd]>;
}
-
// Loop instructions
let SchedRW = [WriteJump] in {
-def LOOP : Ii8PCRel<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", [], IIC_LOOP>;
-def LOOPE : Ii8PCRel<0xE1, RawFrm, (outs), (ins brtarget8:$dst), "loope\t$dst", [], IIC_LOOPE>;
-def LOOPNE : Ii8PCRel<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", [], IIC_LOOPNE>;
+def LOOP : Ii8PCRel<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", []>;
+def LOOPE : Ii8PCRel<0xE1, RawFrm, (outs), (ins brtarget8:$dst), "loope\t$dst", []>;
+def LOOPNE : Ii8PCRel<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", []>;
}
//===----------------------------------------------------------------------===//
@@ -194,47 +206,62 @@ let isCall = 1 in
let Uses = [ESP, SSP] in {
def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm,
(outs), (ins i32imm_pcrel:$dst),
- "call{l}\t$dst", [], IIC_CALL_RI>, OpSize32,
+ "call{l}\t$dst", []>, OpSize32,
Requires<[Not64BitMode]>, Sched<[WriteJump]>;
let hasSideEffects = 0 in
def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm,
(outs), (ins i16imm_pcrel:$dst),
- "call{w}\t$dst", [], IIC_CALL_RI>, OpSize16,
+ "call{w}\t$dst", []>, OpSize16,
Sched<[WriteJump]>;
def CALL16r : I<0xFF, MRM2r, (outs), (ins GR16:$dst),
- "call{w}\t{*}$dst", [(X86call GR16:$dst)], IIC_CALL_RI>,
+ "call{w}\t{*}$dst", [(X86call GR16:$dst)]>,
OpSize16, Requires<[Not64BitMode]>, Sched<[WriteJump]>;
def CALL16m : I<0xFF, MRM2m, (outs), (ins i16mem:$dst),
- "call{w}\t{*}$dst", [(X86call (loadi16 addr:$dst))],
- IIC_CALL_MEM>, OpSize16,
- Requires<[Not64BitMode,FavorMemIndirectCall]>,
- Sched<[WriteJumpLd]>;
+ "call{w}\t{*}$dst", [(X86call (loadi16 addr:$dst))]>,
+ OpSize16, Requires<[Not64BitMode,FavorMemIndirectCall]>,
+ Sched<[WriteJumpLd]>;
def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst),
- "call{l}\t{*}$dst", [(X86call GR32:$dst)], IIC_CALL_RI>,
- OpSize32, Requires<[Not64BitMode]>, Sched<[WriteJump]>;
+ "call{l}\t{*}$dst", [(X86call GR32:$dst)]>, OpSize32,
+ Requires<[Not64BitMode,NotUseRetpoline]>, Sched<[WriteJump]>;
def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst),
- "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))],
- IIC_CALL_MEM>, OpSize32,
- Requires<[Not64BitMode,FavorMemIndirectCall]>,
- Sched<[WriteJumpLd]>;
+ "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))]>,
+ OpSize32,
+ Requires<[Not64BitMode,FavorMemIndirectCall,NotUseRetpoline]>,
+ Sched<[WriteJumpLd]>;
+
+ // Non-tracking calls for IBT, use with caution.
+ let isCodeGenOnly = 1 in {
+ def CALL16r_NT : I<0xFF, MRM2r, (outs), (ins GR16 : $dst),
+ "call{w}\t{*}$dst",[(X86NoTrackCall GR16 : $dst)]>,
+ OpSize16, Requires<[Not64BitMode]>, Sched<[WriteJump]>, NOTRACK;
+ def CALL16m_NT : I<0xFF, MRM2m, (outs), (ins i16mem : $dst),
+ "call{w}\t{*}$dst",[(X86NoTrackCall(loadi16 addr : $dst))]>,
+ OpSize16, Requires<[Not64BitMode,FavorMemIndirectCall]>,
+ Sched<[WriteJumpLd]>, NOTRACK;
+ def CALL32r_NT : I<0xFF, MRM2r, (outs), (ins GR32 : $dst),
+ "call{l}\t{*}$dst",[(X86NoTrackCall GR32 : $dst)]>,
+ OpSize32, Requires<[Not64BitMode]>, Sched<[WriteJump]>, NOTRACK;
+ def CALL32m_NT : I<0xFF, MRM2m, (outs), (ins i32mem : $dst),
+ "call{l}\t{*}$dst",[(X86NoTrackCall(loadi32 addr : $dst))]>,
+ OpSize32, Requires<[Not64BitMode,FavorMemIndirectCall]>,
+ Sched<[WriteJumpLd]>, NOTRACK;
+ }
- let Predicates = [Not64BitMode] in {
+ let Predicates = [Not64BitMode], AsmVariantName = "att" in {
def FARCALL16i : Iseg16<0x9A, RawFrmImm16, (outs),
(ins i16imm:$off, i16imm:$seg),
- "lcall{w}\t$seg, $off", [],
- IIC_CALL_FAR_PTR>, OpSize16, Sched<[WriteJump]>;
+ "lcall{w}\t$seg, $off", []>,
+ OpSize16, Sched<[WriteJump]>;
def FARCALL32i : Iseg32<0x9A, RawFrmImm16, (outs),
(ins i32imm:$off, i16imm:$seg),
- "lcall{l}\t$seg, $off", [],
- IIC_CALL_FAR_PTR>, OpSize32, Sched<[WriteJump]>;
+ "lcall{l}\t$seg, $off", []>,
+ OpSize32, Sched<[WriteJump]>;
}
- def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaque32mem:$dst),
- "lcall{w}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize16,
- Sched<[WriteJumpLd]>;
- def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaque48mem:$dst),
- "{l}call{l}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize32,
- Sched<[WriteJumpLd]>;
+ def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaquemem:$dst),
+ "lcall{w}\t{*}$dst", []>, OpSize16, Sched<[WriteJumpLd]>;
+ def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaquemem:$dst),
+ "{l}call{l}\t{*}$dst", []>, OpSize32, Sched<[WriteJumpLd]>;
}
@@ -253,15 +280,13 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
// FIXME: These should be pseudo instructions that are lowered when going to
// mcinst.
def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs),
- (ins i32imm_pcrel:$dst),
- "jmp\t$dst",
- [], IIC_JMP_REL>;
+ (ins i32imm_pcrel:$dst), "jmp\t$dst", []>;
def TAILJMPr : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
- "", [], IIC_JMP_REG>; // FIXME: Remove encoding when JIT is dead.
+ "", []>; // FIXME: Remove encoding when JIT is dead.
let mayLoad = 1 in
def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst),
- "jmp{l}\t{*}$dst", [], IIC_JMP_MEM>;
+ "jmp{l}\t{*}$dst", []>;
}
// Conditional tail calls are similar to the above, but they are branches
@@ -274,9 +299,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
// This gets substituted to a conditional jump instruction in MC lowering.
def TAILJMPd_CC : Ii32PCRel<0x80, RawFrm, (outs),
- (ins i32imm_pcrel:$dst, i32imm:$cond),
- "",
- [], IIC_JMP_REL>;
+ (ins i32imm_pcrel:$dst, i32imm:$cond), "", []>;
}
@@ -293,24 +316,33 @@ let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in {
// the 32-bit pcrel field that we have.
def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm,
(outs), (ins i64i32imm_pcrel:$dst),
- "call{q}\t$dst", [], IIC_CALL_RI>, OpSize32,
+ "call{q}\t$dst", []>, OpSize32,
Requires<[In64BitMode]>;
def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst),
- "call{q}\t{*}$dst", [(X86call GR64:$dst)],
- IIC_CALL_RI>,
- Requires<[In64BitMode]>;
+ "call{q}\t{*}$dst", [(X86call GR64:$dst)]>,
+ Requires<[In64BitMode,NotUseRetpoline]>;
def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst),
- "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))],
- IIC_CALL_MEM>,
- Requires<[In64BitMode,FavorMemIndirectCall]>;
+ "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))]>,
+ Requires<[In64BitMode,FavorMemIndirectCall,
+ NotUseRetpoline]>;
+
+ // Non-tracking calls for IBT, use with caution.
+ let isCodeGenOnly = 1 in {
+ def CALL64r_NT : I<0xFF, MRM2r, (outs), (ins GR64 : $dst),
+ "call{q}\t{*}$dst",[(X86NoTrackCall GR64 : $dst)]>,
+ Requires<[In64BitMode]>, NOTRACK;
+ def CALL64m_NT : I<0xFF, MRM2m, (outs), (ins i64mem : $dst),
+ "call{q}\t{*}$dst",
+ [(X86NoTrackCall(loadi64 addr : $dst))]>,
+ Requires<[In64BitMode,FavorMemIndirectCall]>, NOTRACK;
+ }
- def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst),
- "lcall{q}\t{*}$dst", [], IIC_CALL_FAR_MEM>;
+ def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaquemem:$dst),
+ "lcall{q}\t{*}$dst", []>;
}
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
- isCodeGenOnly = 1, Uses = [RSP, SSP], usesCustomInserter = 1,
- SchedRW = [WriteJump] in {
+ isCodeGenOnly = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in {
def TCRETURNdi64 : PseudoI<(outs),
(ins i64i32imm_pcrel:$dst, i32imm:$offset),
[]>;
@@ -321,23 +353,44 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
(ins i64mem_TC:$dst, i32imm:$offset), []>, NotMemoryFoldable;
def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs), (ins i64i32imm_pcrel:$dst),
- "jmp\t$dst", [], IIC_JMP_REL>;
+ "jmp\t$dst", []>;
def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
- "jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
+ "jmp{q}\t{*}$dst", []>;
let mayLoad = 1 in
def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst),
- "jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
+ "jmp{q}\t{*}$dst", []>;
// Win64 wants indirect jumps leaving the function to have a REX_W prefix.
let hasREX_WPrefix = 1 in {
def TAILJMPr64_REX : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
- "rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
+ "rex64 jmp{q}\t{*}$dst", []>;
let mayLoad = 1 in
def TAILJMPm64_REX : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst),
- "rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
+ "rex64 jmp{q}\t{*}$dst", []>;
+ }
+}
+
+let isPseudo = 1, isCall = 1, isCodeGenOnly = 1,
+ Uses = [RSP, SSP],
+ usesCustomInserter = 1,
+ SchedRW = [WriteJump] in {
+ def RETPOLINE_CALL32 :
+ PseudoI<(outs), (ins GR32:$dst), [(X86call GR32:$dst)]>,
+ Requires<[Not64BitMode,UseRetpoline]>;
+
+ def RETPOLINE_CALL64 :
+ PseudoI<(outs), (ins GR64:$dst), [(X86call GR64:$dst)]>,
+ Requires<[In64BitMode,UseRetpoline]>;
+
+ // Retpoline variant of indirect tail calls.
+ let isTerminator = 1, isReturn = 1, isBarrier = 1 in {
+ def RETPOLINE_TCRETURN64 :
+ PseudoI<(outs), (ins GR64:$dst, i32imm:$offset), []>;
+ def RETPOLINE_TCRETURN32 :
+ PseudoI<(outs), (ins GR32:$dst, i32imm:$offset), []>;
}
}
@@ -352,7 +405,5 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
// This gets substituted to a conditional jump instruction in MC lowering.
def TAILJMPd64_CC : Ii32PCRel<0x80, RawFrm, (outs),
- (ins i64i32imm_pcrel:$dst, i32imm:$cond),
- "",
- [], IIC_JMP_REL>;
+ (ins i64i32imm_pcrel:$dst, i32imm:$cond), "", []>;
}
diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td
index 2a8ab0069b1e..421792c5599f 100644
--- a/lib/Target/X86/X86InstrExtension.td
+++ b/lib/Target/X86/X86InstrExtension.td
@@ -14,104 +14,124 @@
let hasSideEffects = 0 in {
let Defs = [AX], Uses = [AL] in // AX = signext(AL)
def CBW : I<0x98, RawFrm, (outs), (ins),
- "{cbtw|cbw}", [], IIC_CBW>, OpSize16, Sched<[WriteALU]>;
+ "{cbtw|cbw}", []>, OpSize16, Sched<[WriteALU]>;
let Defs = [EAX], Uses = [AX] in // EAX = signext(AX)
def CWDE : I<0x98, RawFrm, (outs), (ins),
- "{cwtl|cwde}", [], IIC_CBW>, OpSize32, Sched<[WriteALU]>;
+ "{cwtl|cwde}", []>, OpSize32, Sched<[WriteALU]>;
let Defs = [AX,DX], Uses = [AX] in // DX:AX = signext(AX)
def CWD : I<0x99, RawFrm, (outs), (ins),
- "{cwtd|cwd}", [], IIC_CBW>, OpSize16, Sched<[WriteALU]>;
+ "{cwtd|cwd}", []>, OpSize16, Sched<[WriteALU]>;
let Defs = [EAX,EDX], Uses = [EAX] in // EDX:EAX = signext(EAX)
def CDQ : I<0x99, RawFrm, (outs), (ins),
- "{cltd|cdq}", [], IIC_CBW>, OpSize32, Sched<[WriteALU]>;
+ "{cltd|cdq}", []>, OpSize32, Sched<[WriteALU]>;
let Defs = [RAX], Uses = [EAX] in // RAX = signext(EAX)
def CDQE : RI<0x98, RawFrm, (outs), (ins),
- "{cltq|cdqe}", [], IIC_CBW>, Sched<[WriteALU]>;
+ "{cltq|cdqe}", []>, Sched<[WriteALU]>;
let Defs = [RAX,RDX], Uses = [RAX] in // RDX:RAX = signext(RAX)
def CQO : RI<0x99, RawFrm, (outs), (ins),
- "{cqto|cqo}", [], IIC_CBW>, Sched<[WriteALU]>;
+ "{cqto|cqo}", []>, Sched<[WriteALU]>;
}
// Sign/Zero extenders
let hasSideEffects = 0 in {
def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
- "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_R8>,
+ "movs{bw|x}\t{$src, $dst|$dst, $src}", []>,
TB, OpSize16, Sched<[WriteALU]>;
let mayLoad = 1 in
def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
- "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_M8>,
+ "movs{bw|x}\t{$src, $dst|$dst, $src}", []>,
TB, OpSize16, Sched<[WriteALULd]>;
} // hasSideEffects = 0
def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8:$src),
"movs{bl|x}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (sext GR8:$src))], IIC_MOVSX>, TB,
+ [(set GR32:$dst, (sext GR8:$src))]>, TB,
OpSize32, Sched<[WriteALU]>;
def MOVSX32rm8 : I<0xBE, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
"movs{bl|x}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (sextloadi32i8 addr:$src))], IIC_MOVSX>, TB,
+ [(set GR32:$dst, (sextloadi32i8 addr:$src))]>, TB,
OpSize32, Sched<[WriteALULd]>;
def MOVSX32rr16: I<0xBF, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
"movs{wl|x}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (sext GR16:$src))], IIC_MOVSX>, TB,
+ [(set GR32:$dst, (sext GR16:$src))]>, TB,
OpSize32, Sched<[WriteALU]>;
def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
"movs{wl|x}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (sextloadi32i16 addr:$src))], IIC_MOVSX>,
+ [(set GR32:$dst, (sextloadi32i16 addr:$src))]>,
OpSize32, TB, Sched<[WriteALULd]>;
let hasSideEffects = 0 in {
def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
- "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_R8>,
+ "movz{bw|x}\t{$src, $dst|$dst, $src}", []>,
TB, OpSize16, Sched<[WriteALU]>;
let mayLoad = 1 in
def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
- "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_M8>,
+ "movz{bw|x}\t{$src, $dst|$dst, $src}", []>,
TB, OpSize16, Sched<[WriteALULd]>;
} // hasSideEffects = 0
def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src),
"movz{bl|x}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (zext GR8:$src))], IIC_MOVZX>, TB,
+ [(set GR32:$dst, (zext GR8:$src))]>, TB,
OpSize32, Sched<[WriteALU]>;
def MOVZX32rm8 : I<0xB6, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
"movz{bl|x}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (zextloadi32i8 addr:$src))], IIC_MOVZX>, TB,
+ [(set GR32:$dst, (zextloadi32i8 addr:$src))]>, TB,
OpSize32, Sched<[WriteALULd]>;
def MOVZX32rr16: I<0xB7, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
"movz{wl|x}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (zext GR16:$src))], IIC_MOVZX>, TB,
+ [(set GR32:$dst, (zext GR16:$src))]>, TB,
OpSize32, Sched<[WriteALU]>;
def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
"movz{wl|x}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (zextloadi32i16 addr:$src))], IIC_MOVZX>,
+ [(set GR32:$dst, (zextloadi32i16 addr:$src))]>,
TB, OpSize32, Sched<[WriteALULd]>;
+// These instructions exist as a consequence of operand size prefix having
+// control of the destination size, but not the input size. Only support them
+// for the disassembler.
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
+def MOVSX16rr16: I<0xBF, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "movs{ww|x}\t{$src, $dst|$dst, $src}",
+ []>, TB, OpSize16, Sched<[WriteALU]>, NotMemoryFoldable;
+def MOVZX16rr16: I<0xB7, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "movz{ww|x}\t{$src, $dst|$dst, $src}",
+ []>, TB, OpSize16, Sched<[WriteALU]>, NotMemoryFoldable;
+let mayLoad = 1 in {
+def MOVSX16rm16: I<0xBF, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "movs{ww|x}\t{$src, $dst|$dst, $src}",
+ []>, OpSize16, TB, Sched<[WriteALULd]>, NotMemoryFoldable;
+def MOVZX16rm16: I<0xB7, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "movz{ww|x}\t{$src, $dst|$dst, $src}",
+ []>, TB, OpSize16, Sched<[WriteALULd]>, NotMemoryFoldable;
+} // mayLoad = 1
+} // isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0
+
// These are the same as the regular MOVZX32rr8 and MOVZX32rm8
// except that they use GR32_NOREX for the output operand register class
// instead of GR32. This allows them to operate on h registers on x86-64.
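// A brief sketch of why the restricted class matters: an access to %ah, %bh,
// %ch or %dh cannot be encoded together with a REX prefix (with REX those
// encodings select %spl, %bpl, %sil, %dil instead), so e.g. "movzbl %ah, %ecx"
// is only encodable if the destination stays within GR32_NOREX.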
let hasSideEffects = 0, isCodeGenOnly = 1 in {
-def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg,
+def MOVZX32rr8_NOREX : I<0xB6, MRMSrcReg,
(outs GR32_NOREX:$dst), (ins GR8_NOREX:$src),
- "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX",
- [], IIC_MOVZX>, TB, OpSize32, Sched<[WriteALU]>;
+ "movz{bl|x}\t{$src, $dst|$dst, $src}",
+ []>, TB, OpSize32, Sched<[WriteALU]>;
let mayLoad = 1 in
-def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem,
+def MOVZX32rm8_NOREX : I<0xB6, MRMSrcMem,
(outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src),
- "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX",
- [], IIC_MOVZX>, TB, OpSize32, Sched<[WriteALULd]>;
+ "movz{bl|x}\t{$src, $dst|$dst, $src}",
+ []>, TB, OpSize32, Sched<[WriteALULd]>;
-def MOVSX32_NOREXrr8 : I<0xBE, MRMSrcReg,
+def MOVSX32rr8_NOREX : I<0xBE, MRMSrcReg,
(outs GR32_NOREX:$dst), (ins GR8_NOREX:$src),
- "movs{bl|x}\t{$src, $dst|$dst, $src} # NOREX",
- [], IIC_MOVSX>, TB, OpSize32, Sched<[WriteALU]>;
+ "movs{bl|x}\t{$src, $dst|$dst, $src}",
+ []>, TB, OpSize32, Sched<[WriteALU]>;
let mayLoad = 1 in
-def MOVSX32_NOREXrm8 : I<0xBE, MRMSrcMem,
+def MOVSX32rm8_NOREX : I<0xBE, MRMSrcMem,
(outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src),
- "movs{bl|x}\t{$src, $dst|$dst, $src} # NOREX",
- [], IIC_MOVSX>, TB, OpSize32, Sched<[WriteALULd]>;
+ "movs{bl|x}\t{$src, $dst|$dst, $src}",
+ []>, TB, OpSize32, Sched<[WriteALULd]>;
}
// MOVSX64rr8 always has a REX prefix and it has an 8-bit register
@@ -120,44 +140,44 @@ def MOVSX32_NOREXrm8 : I<0xBE, MRMSrcMem,
// were generalized, this would require a special register class.
def MOVSX64rr8 : RI<0xBE, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src),
"movs{bq|x}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (sext GR8:$src))], IIC_MOVSX>, TB,
+ [(set GR64:$dst, (sext GR8:$src))]>, TB,
Sched<[WriteALU]>;
def MOVSX64rm8 : RI<0xBE, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src),
"movs{bq|x}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (sextloadi64i8 addr:$src))], IIC_MOVSX>,
+ [(set GR64:$dst, (sextloadi64i8 addr:$src))]>,
TB, Sched<[WriteALULd]>;
def MOVSX64rr16: RI<0xBF, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
"movs{wq|x}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (sext GR16:$src))], IIC_MOVSX>, TB,
+ [(set GR64:$dst, (sext GR16:$src))]>, TB,
Sched<[WriteALU]>;
def MOVSX64rm16: RI<0xBF, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
"movs{wq|x}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (sextloadi64i16 addr:$src))], IIC_MOVSX>,
+ [(set GR64:$dst, (sextloadi64i16 addr:$src))]>,
TB, Sched<[WriteALULd]>;
def MOVSX64rr32: RI<0x63, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
"movs{lq|xd}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (sext GR32:$src))], IIC_MOVSX>,
+ [(set GR64:$dst, (sext GR32:$src))]>,
Sched<[WriteALU]>, Requires<[In64BitMode]>;
def MOVSX64rm32: RI<0x63, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src),
"movs{lq|xd}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (sextloadi64i32 addr:$src))], IIC_MOVSX>,
+ [(set GR64:$dst, (sextloadi64i32 addr:$src))]>,
Sched<[WriteALULd]>, Requires<[In64BitMode]>;
// movzbq and movzwq encodings for the disassembler
let hasSideEffects = 0 in {
def MOVZX64rr8 : RI<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8:$src),
- "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+ "movz{bq|x}\t{$src, $dst|$dst, $src}", []>,
TB, Sched<[WriteALU]>;
let mayLoad = 1 in
def MOVZX64rm8 : RI<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem:$src),
- "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+ "movz{bq|x}\t{$src, $dst|$dst, $src}", []>,
TB, Sched<[WriteALULd]>;
def MOVZX64rr16 : RI<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
- "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+ "movz{wq|x}\t{$src, $dst|$dst, $src}", []>,
TB, Sched<[WriteALU]>;
let mayLoad = 1 in
def MOVZX64rm16 : RI<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
- "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+ "movz{wq|x}\t{$src, $dst|$dst, $src}", []>,
TB, Sched<[WriteALULd]>;
}
diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td
index 35fa45590fc6..a559f62c8f38 100644
--- a/lib/Target/X86/X86InstrFMA.td
+++ b/lib/Target/X86/X86InstrFMA.td
@@ -36,13 +36,13 @@
multiclass fma3p_rm_213<bits<8> opc, string OpcodeStr, RegisterClass RC,
ValueType VT, X86MemOperand x86memop, PatFrag MemFrag,
- SDNode Op> {
+ SDNode Op, X86FoldableSchedWrite sched> {
def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst, (VT (Op RC:$src2, RC:$src1, RC:$src3)))]>,
- Sched<[WriteFMA]>;
+ Sched<[sched]>;
let mayLoad = 1 in
def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
@@ -51,18 +51,18 @@ multiclass fma3p_rm_213<bits<8> opc, string OpcodeStr, RegisterClass RC,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst, (VT (Op RC:$src2, RC:$src1,
(MemFrag addr:$src3))))]>,
- Sched<[WriteFMALd, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>;
}
multiclass fma3p_rm_231<bits<8> opc, string OpcodeStr, RegisterClass RC,
ValueType VT, X86MemOperand x86memop, PatFrag MemFrag,
- SDNode Op> {
+ SDNode Op, X86FoldableSchedWrite sched> {
let hasSideEffects = 0 in
def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- []>, Sched<[WriteFMA]>;
+ []>, Sched<[sched]>;
let mayLoad = 1 in
def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
@@ -70,18 +70,19 @@ multiclass fma3p_rm_231<bits<8> opc, string OpcodeStr, RegisterClass RC,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst, (VT (Op RC:$src2, (MemFrag addr:$src3),
- RC:$src1)))]>, Sched<[WriteFMALd, ReadAfterLd]>;
+ RC:$src1)))]>,
+ Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>;
}
multiclass fma3p_rm_132<bits<8> opc, string OpcodeStr, RegisterClass RC,
ValueType VT, X86MemOperand x86memop, PatFrag MemFrag,
- SDNode Op> {
+ SDNode Op, X86FoldableSchedWrite sched> {
let hasSideEffects = 0 in
def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- []>, Sched<[WriteFMA]>;
+ []>, Sched<[sched]>;
// Pattern is 312 order so that the load is in a different place from the
// 213 and 231 patterns; this helps tablegen's duplicate pattern detection.
@@ -91,71 +92,77 @@ multiclass fma3p_rm_132<bits<8> opc, string OpcodeStr, RegisterClass RC,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst, (VT (Op (MemFrag addr:$src3), RC:$src1,
- RC:$src2)))]>, Sched<[WriteFMALd, ReadAfterLd]>;
+ RC:$src2)))]>,
+ Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>;
}
let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1 in
multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpcodeStr, string PackTy, string Suff,
PatFrag MemFrag128, PatFrag MemFrag256,
- SDNode Op, ValueType OpTy128, ValueType OpTy256> {
+ SDNode Op, ValueType OpTy128, ValueType OpTy256,
+ X86SchedWriteWidths sched> {
defm NAME#213#Suff : fma3p_rm_213<opc213, !strconcat(OpcodeStr, "213", PackTy),
- VR128, OpTy128, f128mem, MemFrag128, Op>;
+ VR128, OpTy128, f128mem, MemFrag128, Op, sched.XMM>;
defm NAME#231#Suff : fma3p_rm_231<opc231, !strconcat(OpcodeStr, "231", PackTy),
- VR128, OpTy128, f128mem, MemFrag128, Op>;
+ VR128, OpTy128, f128mem, MemFrag128, Op, sched.XMM>;
defm NAME#132#Suff : fma3p_rm_132<opc132, !strconcat(OpcodeStr, "132", PackTy),
- VR128, OpTy128, f128mem, MemFrag128, Op>;
+ VR128, OpTy128, f128mem, MemFrag128, Op, sched.XMM>;
defm NAME#213#Suff#Y : fma3p_rm_213<opc213, !strconcat(OpcodeStr, "213", PackTy),
- VR256, OpTy256, f256mem, MemFrag256, Op>,
+ VR256, OpTy256, f256mem, MemFrag256, Op, sched.YMM>,
VEX_L;
defm NAME#231#Suff#Y : fma3p_rm_231<opc231, !strconcat(OpcodeStr, "231", PackTy),
- VR256, OpTy256, f256mem, MemFrag256, Op>,
+ VR256, OpTy256, f256mem, MemFrag256, Op, sched.YMM>,
VEX_L;
defm NAME#132#Suff#Y : fma3p_rm_132<opc132, !strconcat(OpcodeStr, "132", PackTy),
- VR256, OpTy256, f256mem, MemFrag256, Op>,
+ VR256, OpTy256, f256mem, MemFrag256, Op, sched.YMM>,
VEX_L;
}
// Fused Multiply-Add
let ExeDomain = SSEPackedSingle in {
defm VFMADD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", "PS",
- loadv4f32, loadv8f32, X86Fmadd, v4f32, v8f32>;
+ loadv4f32, loadv8f32, X86Fmadd, v4f32, v8f32,
+ SchedWriteFMA>;
defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", "PS",
- loadv4f32, loadv8f32, X86Fmsub, v4f32, v8f32>;
+ loadv4f32, loadv8f32, X86Fmsub, v4f32, v8f32,
+ SchedWriteFMA>;
defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps", "PS",
- loadv4f32, loadv8f32, X86Fmaddsub, v4f32, v8f32>;
+ loadv4f32, loadv8f32, X86Fmaddsub, v4f32, v8f32,
+ SchedWriteFMA>;
defm VFMSUBADD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps", "PS",
- loadv4f32, loadv8f32, X86Fmsubadd, v4f32, v8f32>;
+ loadv4f32, loadv8f32, X86Fmsubadd, v4f32, v8f32,
+ SchedWriteFMA>;
}
let ExeDomain = SSEPackedDouble in {
defm VFMADD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", "PD",
loadv2f64, loadv4f64, X86Fmadd, v2f64,
- v4f64>, VEX_W;
+ v4f64, SchedWriteFMA>, VEX_W;
defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", "PD",
loadv2f64, loadv4f64, X86Fmsub, v2f64,
- v4f64>, VEX_W;
+ v4f64, SchedWriteFMA>, VEX_W;
defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd", "PD",
loadv2f64, loadv4f64, X86Fmaddsub,
- v2f64, v4f64>, VEX_W;
+ v2f64, v4f64, SchedWriteFMA>, VEX_W;
defm VFMSUBADD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd", "PD",
loadv2f64, loadv4f64, X86Fmsubadd,
- v2f64, v4f64>, VEX_W;
+ v2f64, v4f64, SchedWriteFMA>, VEX_W;
}
// Fused Negative Multiply-Add
let ExeDomain = SSEPackedSingle in {
defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", "PS", loadv4f32,
- loadv8f32, X86Fnmadd, v4f32, v8f32>;
+ loadv8f32, X86Fnmadd, v4f32, v8f32, SchedWriteFMA>;
defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", "PS", loadv4f32,
- loadv8f32, X86Fnmsub, v4f32, v8f32>;
+ loadv8f32, X86Fnmsub, v4f32, v8f32, SchedWriteFMA>;
}
let ExeDomain = SSEPackedDouble in {
defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", "PD", loadv2f64,
- loadv4f64, X86Fnmadd, v2f64, v4f64>, VEX_W;
+ loadv4f64, X86Fnmadd, v2f64, v4f64, SchedWriteFMA>, VEX_W;
defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd", "PD", loadv2f64,
- loadv4f64, X86Fnmsub, v2f64, v4f64>, VEX_W;
+ loadv4f64, X86Fnmsub, v2f64, v4f64, SchedWriteFMA>, VEX_W;
}
// All source register operands of FMA opcodes defined in fma3s_rm multiclass
@@ -169,13 +176,14 @@ let ExeDomain = SSEPackedDouble in {
// defining FMA3 opcodes above.
multiclass fma3s_rm_213<bits<8> opc, string OpcodeStr,
X86MemOperand x86memop, RegisterClass RC,
- SDPatternOperator OpNode> {
+ SDPatternOperator OpNode,
+ X86FoldableSchedWrite sched> {
def r : FMA3S<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst, (OpNode RC:$src2, RC:$src1, RC:$src3))]>,
- Sched<[WriteFMA]>;
+ Sched<[sched]>;
let mayLoad = 1 in
def m : FMA3S<opc, MRMSrcMem, (outs RC:$dst),
@@ -184,18 +192,18 @@ multiclass fma3s_rm_213<bits<8> opc, string OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst,
(OpNode RC:$src2, RC:$src1, (load addr:$src3)))]>,
- Sched<[WriteFMALd, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>;
}
multiclass fma3s_rm_231<bits<8> opc, string OpcodeStr,
X86MemOperand x86memop, RegisterClass RC,
- SDPatternOperator OpNode> {
+ SDPatternOperator OpNode, X86FoldableSchedWrite sched> {
let hasSideEffects = 0 in
def r : FMA3S<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- []>, Sched<[WriteFMA]>;
+ []>, Sched<[sched]>;
let mayLoad = 1 in
def m : FMA3S<opc, MRMSrcMem, (outs RC:$dst),
@@ -204,18 +212,18 @@ multiclass fma3s_rm_231<bits<8> opc, string OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst,
(OpNode RC:$src2, (load addr:$src3), RC:$src1))]>,
- Sched<[WriteFMALd, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>;
}
multiclass fma3s_rm_132<bits<8> opc, string OpcodeStr,
X86MemOperand x86memop, RegisterClass RC,
- SDPatternOperator OpNode> {
+ SDPatternOperator OpNode, X86FoldableSchedWrite sched> {
let hasSideEffects = 0 in
def r : FMA3S<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- []>, Sched<[WriteFMA]>;
+ []>, Sched<[sched]>;
// Pattern is 312 order so that the load is in a different place from the
// 213 and 231 patterns; this helps tablegen's duplicate pattern detection.
@@ -226,20 +234,20 @@ multiclass fma3s_rm_132<bits<8> opc, string OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst,
(OpNode (load addr:$src3), RC:$src1, RC:$src2))]>,
- Sched<[WriteFMALd, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>;
}
let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in
multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpStr, string PackTy, string Suff,
SDNode OpNode, RegisterClass RC,
- X86MemOperand x86memop> {
+ X86MemOperand x86memop, X86FoldableSchedWrite sched> {
defm NAME#213#Suff : fma3s_rm_213<opc213, !strconcat(OpStr, "213", PackTy),
- x86memop, RC, OpNode>;
+ x86memop, RC, OpNode, sched>;
defm NAME#231#Suff : fma3s_rm_231<opc231, !strconcat(OpStr, "231", PackTy),
- x86memop, RC, OpNode>;
+ x86memop, RC, OpNode, sched>;
defm NAME#132#Suff : fma3s_rm_132<opc132, !strconcat(OpStr, "132", PackTy),
- x86memop, RC, OpNode>;
+ x86memop, RC, OpNode, sched>;
}
// These FMA*_Int instructions are defined specially for being used when
@@ -258,19 +266,20 @@ multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1,
hasSideEffects = 0 in
multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
- Operand memopr, RegisterClass RC> {
+ Operand memopr, RegisterClass RC,
+ X86FoldableSchedWrite sched> {
def r_Int : FMA3S_Int<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- []>, Sched<[WriteFMA]>;
+ []>, Sched<[sched]>;
let mayLoad = 1 in
def m_Int : FMA3S_Int<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, RC:$src2, memopr:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- []>, Sched<[WriteFMALd, ReadAfterLd]>;
+ []>, Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>;
}
// The FMA 213 form is created for lowering of scalar FMA intrinsics
@@ -284,82 +293,101 @@ multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
// such analysis will be implemented eventually.
multiclass fma3s_int_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpStr, string PackTy, string Suff,
- RegisterClass RC, Operand memop> {
+ RegisterClass RC, Operand memop,
+ X86FoldableSchedWrite sched> {
defm NAME#132#Suff : fma3s_rm_int<opc132, !strconcat(OpStr, "132", PackTy),
- memop, RC>;
+ memop, RC, sched>;
defm NAME#213#Suff : fma3s_rm_int<opc213, !strconcat(OpStr, "213", PackTy),
- memop, RC>;
+ memop, RC, sched>;
defm NAME#231#Suff : fma3s_rm_int<opc231, !strconcat(OpStr, "231", PackTy),
- memop, RC>;
+ memop, RC, sched>;
}
multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
- string OpStr, SDNode OpNodeIntrin, SDNode OpNode> {
+ string OpStr, SDNode OpNode, X86FoldableSchedWrite sched> {
let ExeDomain = SSEPackedSingle in
defm NAME : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", OpNode,
- FR32, f32mem>,
+ FR32, f32mem, sched>,
fma3s_int_forms<opc132, opc213, opc231, OpStr, "ss", "SS",
- VR128, ssmem>;
+ VR128, ssmem, sched>;
let ExeDomain = SSEPackedDouble in
defm NAME : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", "SD", OpNode,
- FR64, f64mem>,
+ FR64, f64mem, sched>,
fma3s_int_forms<opc132, opc213, opc231, OpStr, "sd", "SD",
- VR128, sdmem>, VEX_W;
+ VR128, sdmem, sched>, VEX_W;
+}
+
+defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", X86Fmadd,
+ SchedWriteFMA.Scl>, VEX_LIG;
+defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86Fmsub,
+ SchedWriteFMA.Scl>, VEX_LIG;
+
+defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", X86Fnmadd,
+ SchedWriteFMA.Scl>, VEX_LIG;
+defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", X86Fnmsub,
+ SchedWriteFMA.Scl>, VEX_LIG;
- // These patterns use the 123 ordering, instead of 213, even though
- // they match the intrinsic to the 213 version of the instruction.
- // This is because src1 is tied to dest, and the scalar intrinsics
- // require the pass-through values to come from the first source
- // operand, not the second.
+multiclass scalar_fma_patterns<SDNode Op, string Prefix, string Suffix,
+ SDNode Move, ValueType VT, ValueType EltVT,
+ RegisterClass RC, PatFrag mem_frag> {
let Predicates = [HasFMA, NoAVX512] in {
- def : Pat<(v4f32 (OpNodeIntrin VR128:$src1, VR128:$src2, VR128:$src3)),
- (!cast<Instruction>(NAME#"213SSr_Int")
- VR128:$src1, VR128:$src2, VR128:$src3)>;
-
- def : Pat<(v2f64 (OpNodeIntrin VR128:$src1, VR128:$src2, VR128:$src3)),
- (!cast<Instruction>(NAME#"213SDr_Int")
- VR128:$src1, VR128:$src2, VR128:$src3)>;
-
- def : Pat<(v4f32 (OpNodeIntrin VR128:$src1, VR128:$src2,
- sse_load_f32:$src3)),
- (!cast<Instruction>(NAME#"213SSm_Int")
- VR128:$src1, VR128:$src2, sse_load_f32:$src3)>;
-
- def : Pat<(v2f64 (OpNodeIntrin VR128:$src1, VR128:$src2,
- sse_load_f64:$src3)),
- (!cast<Instruction>(NAME#"213SDm_Int")
- VR128:$src1, VR128:$src2, sse_load_f64:$src3)>;
-
- def : Pat<(v4f32 (OpNodeIntrin VR128:$src1, sse_load_f32:$src3,
- VR128:$src2)),
- (!cast<Instruction>(NAME#"132SSm_Int")
- VR128:$src1, VR128:$src2, sse_load_f32:$src3)>;
-
- def : Pat<(v2f64 (OpNodeIntrin VR128:$src1, sse_load_f64:$src3,
- VR128:$src2)),
- (!cast<Instruction>(NAME#"132SDm_Int")
- VR128:$src1, VR128:$src2, sse_load_f64:$src3)>;
+ def : Pat<(VT (Move (VT VR128:$src1), (VT (scalar_to_vector
+ (Op RC:$src2,
+ (EltVT (extractelt (VT VR128:$src1), (iPTR 0))),
+ RC:$src3))))),
+ (!cast<Instruction>(Prefix#"213"#Suffix#"r_Int")
+ VR128:$src1, (VT (COPY_TO_REGCLASS RC:$src2, VR128)),
+ (VT (COPY_TO_REGCLASS RC:$src3, VR128)))>;
+
+ def : Pat<(VT (Move (VT VR128:$src1), (VT (scalar_to_vector
+ (Op RC:$src2, RC:$src3,
+ (EltVT (extractelt (VT VR128:$src1), (iPTR 0)))))))),
+ (!cast<Instruction>(Prefix#"231"#Suffix#"r_Int")
+ VR128:$src1, (VT (COPY_TO_REGCLASS RC:$src2, VR128)),
+ (VT (COPY_TO_REGCLASS RC:$src3, VR128)))>;
+
+ def : Pat<(VT (Move (VT VR128:$src1), (VT (scalar_to_vector
+ (Op RC:$src2,
+ (EltVT (extractelt (VT VR128:$src1), (iPTR 0))),
+ (mem_frag addr:$src3)))))),
+ (!cast<Instruction>(Prefix#"213"#Suffix#"m_Int")
+ VR128:$src1, (VT (COPY_TO_REGCLASS RC:$src2, VR128)),
+ addr:$src3)>;
+
+ def : Pat<(VT (Move (VT VR128:$src1), (VT (scalar_to_vector
+ (Op (EltVT (extractelt (VT VR128:$src1), (iPTR 0))),
+ (mem_frag addr:$src3), RC:$src2))))),
+ (!cast<Instruction>(Prefix#"132"#Suffix#"m_Int")
+ VR128:$src1, (VT (COPY_TO_REGCLASS RC:$src2, VR128)),
+ addr:$src3)>;
+
+ def : Pat<(VT (Move (VT VR128:$src1), (VT (scalar_to_vector
+ (Op RC:$src2, (mem_frag addr:$src3),
+ (EltVT (extractelt (VT VR128:$src1), (iPTR 0)))))))),
+ (!cast<Instruction>(Prefix#"231"#Suffix#"m_Int")
+ VR128:$src1, (VT (COPY_TO_REGCLASS RC:$src2, VR128)),
+ addr:$src3)>;
}
}
-defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", X86Fmadds1, X86Fmadd>, VEX_LIG;
-defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86Fmsubs1, X86Fmsub>, VEX_LIG;
-
-defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", X86Fnmadds1, X86Fnmadd>,
- VEX_LIG;
-defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", X86Fnmsubs1, X86Fnmsub>,
- VEX_LIG;
+defm : scalar_fma_patterns<X86Fmadd, "VFMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
+defm : scalar_fma_patterns<X86Fmsub, "VFMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
+defm : scalar_fma_patterns<X86Fnmadd, "VFNMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
+defm : scalar_fma_patterns<X86Fnmsub, "VFNMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
+defm : scalar_fma_patterns<X86Fmadd, "VFMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
+defm : scalar_fma_patterns<X86Fmsub, "VFMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
+defm : scalar_fma_patterns<X86Fnmadd, "VFNMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
+defm : scalar_fma_patterns<X86Fnmsub, "VFNMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
//===----------------------------------------------------------------------===//
// FMA4 - AMD 4 operand Fused Multiply-Add instructions
//===----------------------------------------------------------------------===//
-
multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC,
X86MemOperand x86memop, ValueType OpVT, SDNode OpNode,
- PatFrag mem_frag> {
+ PatFrag mem_frag, X86FoldableSchedWrite sched> {
let isCommutable = 1 in
def rr : FMA4S<opc, MRMSrcRegOp4, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
@@ -367,66 +395,74 @@ multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst,
(OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, VEX_LIG,
- Sched<[WriteFMA]>;
+ Sched<[sched]>;
def rm : FMA4S<opc, MRMSrcMemOp4, (outs RC:$dst),
(ins RC:$src1, RC:$src2, x86memop:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (OpNode RC:$src1, RC:$src2,
(mem_frag addr:$src3)))]>, VEX_W, VEX_LIG,
- Sched<[WriteFMALd, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>;
def mr : FMA4S<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst,
(OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>, VEX_LIG,
- Sched<[WriteFMALd, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd,
+ // x86memop:$src2
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault,
+ // RC:$src3
+ ReadAfterLd]>;
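+ // A clarifying note (sketch): the five ReadDefault entries above correspond
+ // to the five MachineOperands an x86 memory operand expands into (base,
+ // scale, index, displacement, segment), so ReadAfterLd is only applied to
+ // the true register sources $src1 and $src3.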
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
def rr_REV : FMA4S<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
- VEX_LIG, FoldGenData<NAME#rr>, Sched<[WriteFMA]>;
+ VEX_LIG, FoldGenData<NAME#rr>, Sched<[sched]>;
}
multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
- ValueType VT, ComplexPattern mem_cpat, SDNode OpNode> {
-let isCodeGenOnly = 1 in {
+ ValueType VT, X86FoldableSchedWrite sched> {
+let isCodeGenOnly = 1, hasSideEffects = 0 in {
def rr_Int : FMA4S_Int<opc, MRMSrcRegOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR128:$dst,
- (VT (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>, VEX_W,
- VEX_LIG, Sched<[WriteFMA]>;
+ []>, VEX_W, VEX_LIG, Sched<[sched]>;
+ let mayLoad = 1 in
def rm_Int : FMA4S_Int<opc, MRMSrcMemOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, memop:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR128:$dst, (VT (OpNode VR128:$src1, VR128:$src2,
- mem_cpat:$src3)))]>, VEX_W, VEX_LIG,
- Sched<[WriteFMALd, ReadAfterLd]>;
+ []>, VEX_W, VEX_LIG,
+ Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>;
+ let mayLoad = 1 in
def mr_Int : FMA4S_Int<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, memop:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR128:$dst,
- (VT (OpNode VR128:$src1, mem_cpat:$src2, VR128:$src3)))]>,
- VEX_LIG, Sched<[WriteFMALd, ReadAfterLd]>;
-let hasSideEffects = 0 in
+ []>,
+ VEX_LIG, Sched<[sched.Folded, ReadAfterLd,
+ // memop:$src2
+ ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault, ReadDefault,
+ // VR128:$src3
+ ReadAfterLd]>;
def rr_Int_REV : FMA4S_Int<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, VEX_LIG, FoldGenData<NAME#rr_Int>, Sched<[WriteFMA]>;
+ []>, VEX_LIG, FoldGenData<NAME#rr_Int>, Sched<[sched]>;
} // isCodeGenOnly = 1
}
multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType OpVT128, ValueType OpVT256,
- PatFrag ld_frag128, PatFrag ld_frag256> {
+ PatFrag ld_frag128, PatFrag ld_frag256,
+ X86SchedWriteWidths sched> {
let isCommutable = 1 in
def rr : FMA4<opc, MRMSrcRegOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
@@ -434,21 +470,26 @@ multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(OpVT128 (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>,
- VEX_W, Sched<[WriteFMA]>;
+ VEX_W, Sched<[sched.XMM]>;
def rm : FMA4<opc, MRMSrcMemOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, f128mem:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst, (OpNode VR128:$src1, VR128:$src2,
(ld_frag128 addr:$src3)))]>, VEX_W,
- Sched<[WriteFMALd, ReadAfterLd]>;
+ Sched<[sched.XMM.Folded, ReadAfterLd, ReadAfterLd]>;
def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, f128mem:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(OpNode VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>,
- Sched<[WriteFMALd, ReadAfterLd]>;
+ Sched<[sched.XMM.Folded, ReadAfterLd,
+ // f128mem:$src2
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault,
+ // VR128:$src3
+ ReadAfterLd]>;
let isCommutable = 1 in
def Yrr : FMA4<opc, MRMSrcRegOp4, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3),
@@ -456,95 +497,140 @@ multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR256:$dst,
(OpVT256 (OpNode VR256:$src1, VR256:$src2, VR256:$src3)))]>,
- VEX_W, VEX_L, Sched<[WriteFMA]>;
+ VEX_W, VEX_L, Sched<[sched.YMM]>;
def Yrm : FMA4<opc, MRMSrcMemOp4, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, f256mem:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR256:$dst, (OpNode VR256:$src1, VR256:$src2,
(ld_frag256 addr:$src3)))]>, VEX_W, VEX_L,
- Sched<[WriteFMALd, ReadAfterLd]>;
+ Sched<[sched.YMM.Folded, ReadAfterLd, ReadAfterLd]>;
def Ymr : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, VR256:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR256:$dst, (OpNode VR256:$src1,
(ld_frag256 addr:$src2), VR256:$src3))]>, VEX_L,
- Sched<[WriteFMALd, ReadAfterLd]>;
+ Sched<[sched.YMM.Folded, ReadAfterLd,
+ // f256mem:$src2
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault,
+ // VR256:$src3
+ ReadAfterLd]>;
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
- Sched<[WriteFMA]>, FoldGenData<NAME#rr>;
+ Sched<[sched.XMM]>, FoldGenData<NAME#rr>;
def Yrr_REV : FMA4<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
- VEX_L, Sched<[WriteFMA]>, FoldGenData<NAME#Yrr>;
+ VEX_L, Sched<[sched.YMM]>, FoldGenData<NAME#Yrr>;
} // isCodeGenOnly = 1
}
let ExeDomain = SSEPackedSingle in {
// Scalar Instructions
- defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>,
- fma4s_int<0x6A, "vfmaddss", ssmem, v4f32, sse_load_f32,
- X86Fmadd4s>;
- defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>,
- fma4s_int<0x6E, "vfmsubss", ssmem, v4f32, sse_load_f32,
- X86Fmsub4s>;
+ defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32,
+ SchedWriteFMA.Scl>,
+ fma4s_int<0x6A, "vfmaddss", ssmem, v4f32,
+ SchedWriteFMA.Scl>;
+ defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32,
+ SchedWriteFMA.Scl>,
+ fma4s_int<0x6E, "vfmsubss", ssmem, v4f32,
+ SchedWriteFMA.Scl>;
defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32,
- X86Fnmadd, loadf32>,
- fma4s_int<0x7A, "vfnmaddss", ssmem, v4f32, sse_load_f32,
- X86Fnmadd4s>;
+ X86Fnmadd, loadf32, SchedWriteFMA.Scl>,
+ fma4s_int<0x7A, "vfnmaddss", ssmem, v4f32,
+ SchedWriteFMA.Scl>;
defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32,
- X86Fnmsub, loadf32>,
- fma4s_int<0x7E, "vfnmsubss", ssmem, v4f32, sse_load_f32,
- X86Fnmsub4s>;
+ X86Fnmsub, loadf32, SchedWriteFMA.Scl>,
+ fma4s_int<0x7E, "vfnmsubss", ssmem, v4f32,
+ SchedWriteFMA.Scl>;
// Packed Instructions
defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32,
- loadv4f32, loadv8f32>;
+ loadv4f32, loadv8f32, SchedWriteFMA>;
defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32,
- loadv4f32, loadv8f32>;
+ loadv4f32, loadv8f32, SchedWriteFMA>;
defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", X86Fnmadd, v4f32, v8f32,
- loadv4f32, loadv8f32>;
+ loadv4f32, loadv8f32, SchedWriteFMA>;
defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", X86Fnmsub, v4f32, v8f32,
- loadv4f32, loadv8f32>;
+ loadv4f32, loadv8f32, SchedWriteFMA>;
defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32,
- loadv4f32, loadv8f32>;
+ loadv4f32, loadv8f32, SchedWriteFMA>;
defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", X86Fmsubadd, v4f32, v8f32,
- loadv4f32, loadv8f32>;
+ loadv4f32, loadv8f32, SchedWriteFMA>;
}
let ExeDomain = SSEPackedDouble in {
// Scalar Instructions
- defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>,
- fma4s_int<0x6B, "vfmaddsd", sdmem, v2f64, sse_load_f64,
- X86Fmadd4s>;
- defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>,
- fma4s_int<0x6F, "vfmsubsd", sdmem, v2f64, sse_load_f64,
- X86Fmsub4s>;
+ defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64,
+ SchedWriteFMA.Scl>,
+ fma4s_int<0x6B, "vfmaddsd", sdmem, v2f64,
+ SchedWriteFMA.Scl>;
+ defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64,
+ SchedWriteFMA.Scl>,
+ fma4s_int<0x6F, "vfmsubsd", sdmem, v2f64,
+ SchedWriteFMA.Scl>;
defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64,
- X86Fnmadd, loadf64>,
- fma4s_int<0x7B, "vfnmaddsd", sdmem, v2f64, sse_load_f64,
- X86Fnmadd4s>;
+ X86Fnmadd, loadf64, SchedWriteFMA.Scl>,
+ fma4s_int<0x7B, "vfnmaddsd", sdmem, v2f64,
+ SchedWriteFMA.Scl>;
defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
- X86Fnmsub, loadf64>,
- fma4s_int<0x7F, "vfnmsubsd", sdmem, v2f64, sse_load_f64,
- X86Fnmsub4s>;
+ X86Fnmsub, loadf64, SchedWriteFMA.Scl>,
+ fma4s_int<0x7F, "vfnmsubsd", sdmem, v2f64,
+ SchedWriteFMA.Scl>;
// Packed Instructions
defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64,
- loadv2f64, loadv4f64>;
+ loadv2f64, loadv4f64, SchedWriteFMA>;
defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64,
- loadv2f64, loadv4f64>;
+ loadv2f64, loadv4f64, SchedWriteFMA>;
defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", X86Fnmadd, v2f64, v4f64,
- loadv2f64, loadv4f64>;
+ loadv2f64, loadv4f64, SchedWriteFMA>;
defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", X86Fnmsub, v2f64, v4f64,
- loadv2f64, loadv4f64>;
+ loadv2f64, loadv4f64, SchedWriteFMA>;
defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64,
- loadv2f64, loadv4f64>;
+ loadv2f64, loadv4f64, SchedWriteFMA>;
defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", X86Fmsubadd, v2f64, v4f64,
- loadv2f64, loadv4f64>;
+ loadv2f64, loadv4f64, SchedWriteFMA>;
}
+multiclass scalar_fma4_patterns<SDNode Op, string Name,
+ ValueType VT, ValueType EltVT,
+ RegisterClass RC, PatFrag mem_frag> {
+ let Predicates = [HasFMA4] in {
+ def : Pat<(VT (X86vzmovl (VT (scalar_to_vector
+ (Op RC:$src1, RC:$src2, RC:$src3))))),
+ (!cast<Instruction>(Name#"rr_Int")
+ (VT (COPY_TO_REGCLASS RC:$src1, VR128)),
+ (VT (COPY_TO_REGCLASS RC:$src2, VR128)),
+ (VT (COPY_TO_REGCLASS RC:$src3, VR128)))>;
+
+ def : Pat<(VT (X86vzmovl (VT (scalar_to_vector
+ (Op RC:$src1, RC:$src2,
+ (mem_frag addr:$src3)))))),
+ (!cast<Instruction>(Name#"rm_Int")
+ (VT (COPY_TO_REGCLASS RC:$src1, VR128)),
+ (VT (COPY_TO_REGCLASS RC:$src2, VR128)), addr:$src3)>;
+
+ def : Pat<(VT (X86vzmovl (VT (scalar_to_vector
+ (Op RC:$src1, (mem_frag addr:$src2),
+ RC:$src3))))),
+ (!cast<Instruction>(Name#"mr_Int")
+ (VT (COPY_TO_REGCLASS RC:$src1, VR128)), addr:$src2,
+ (VT (COPY_TO_REGCLASS RC:$src3, VR128)))>;
+ }
+}
+
+defm : scalar_fma4_patterns<X86Fmadd, "VFMADDSS4", v4f32, f32, FR32, loadf32>;
+defm : scalar_fma4_patterns<X86Fmsub, "VFMSUBSS4", v4f32, f32, FR32, loadf32>;
+defm : scalar_fma4_patterns<X86Fnmadd, "VFNMADDSS4", v4f32, f32, FR32, loadf32>;
+defm : scalar_fma4_patterns<X86Fnmsub, "VFNMSUBSS4", v4f32, f32, FR32, loadf32>;
+
+defm : scalar_fma4_patterns<X86Fmadd, "VFMADDSD4", v2f64, f64, FR64, loadf64>;
+defm : scalar_fma4_patterns<X86Fmsub, "VFMSUBSD4", v2f64, f64, FR64, loadf64>;
+defm : scalar_fma4_patterns<X86Fnmadd, "VFNMADDSD4", v2f64, f64, FR64, loadf64>;
+defm : scalar_fma4_patterns<X86Fnmsub, "VFNMSUBSD4", v2f64, f64, FR64, loadf64>;
diff --git a/lib/Target/X86/X86InstrFMA3Info.cpp b/lib/Target/X86/X86InstrFMA3Info.cpp
index 00ef65cdb6bd..def732a2dd00 100644
--- a/lib/Target/X86/X86InstrFMA3Info.cpp
+++ b/lib/Target/X86/X86InstrFMA3Info.cpp
@@ -21,268 +21,150 @@
using namespace llvm;
-/// This flag is used in the method llvm::call_once() used below to make the
-/// initialization of the map 'OpcodeToGroup' thread safe.
-static llvm::once_flag InitGroupsOnceFlag;
-
-static ManagedStatic<X86InstrFMA3Info> X86InstrFMA3InfoObj;
-X86InstrFMA3Info *X86InstrFMA3Info::getX86InstrFMA3Info() {
- return &*X86InstrFMA3InfoObj;
-}
-
-void X86InstrFMA3Info::initRMGroup(const uint16_t *RegOpcodes,
- const uint16_t *MemOpcodes, unsigned Attr) {
- // Create a new instance of this class that would hold a group of FMA opcodes.
- X86InstrFMA3Group *G = new X86InstrFMA3Group(RegOpcodes, MemOpcodes, Attr);
-
- // Add the references from indvidual opcodes to the group holding them.
- assert((!OpcodeToGroup[RegOpcodes[0]] && !OpcodeToGroup[RegOpcodes[1]] &&
- !OpcodeToGroup[RegOpcodes[2]] && !OpcodeToGroup[MemOpcodes[0]] &&
- !OpcodeToGroup[MemOpcodes[1]] && !OpcodeToGroup[MemOpcodes[2]]) &&
- "Duplication or rewrite of elements in OpcodeToGroup.");
- OpcodeToGroup[RegOpcodes[0]] = G;
- OpcodeToGroup[RegOpcodes[1]] = G;
- OpcodeToGroup[RegOpcodes[2]] = G;
- OpcodeToGroup[MemOpcodes[0]] = G;
- OpcodeToGroup[MemOpcodes[1]] = G;
- OpcodeToGroup[MemOpcodes[2]] = G;
-}
-
-void X86InstrFMA3Info::initRGroup(const uint16_t *RegOpcodes, unsigned Attr) {
- // Create a new instance of this class that would hold a group of FMA opcodes.
- X86InstrFMA3Group *G = new X86InstrFMA3Group(RegOpcodes, nullptr, Attr);
-
- // Add the references from indvidual opcodes to the group holding them.
- assert((!OpcodeToGroup[RegOpcodes[0]] && !OpcodeToGroup[RegOpcodes[1]] &&
- !OpcodeToGroup[RegOpcodes[2]]) &&
- "Duplication or rewrite of elements in OpcodeToGroup.");
- OpcodeToGroup[RegOpcodes[0]] = G;
- OpcodeToGroup[RegOpcodes[1]] = G;
- OpcodeToGroup[RegOpcodes[2]] = G;
-}
-
-void X86InstrFMA3Info::initMGroup(const uint16_t *MemOpcodes, unsigned Attr) {
- // Create a new instance of this class that would hold a group of FMA opcodes.
- X86InstrFMA3Group *G = new X86InstrFMA3Group(nullptr, MemOpcodes, Attr);
-
- // Add the references from indvidual opcodes to the group holding them.
- assert((!OpcodeToGroup[MemOpcodes[0]] && !OpcodeToGroup[MemOpcodes[1]] &&
- !OpcodeToGroup[MemOpcodes[2]]) &&
- "Duplication or rewrite of elements in OpcodeToGroup.");
- OpcodeToGroup[MemOpcodes[0]] = G;
- OpcodeToGroup[MemOpcodes[1]] = G;
- OpcodeToGroup[MemOpcodes[2]] = G;
-}
-
-#define FMA3RM(R132, R213, R231, M132, M213, M231) \
- static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231}; \
- static const uint16_t Mem##R132[3] = {X86::M132, X86::M213, X86::M231}; \
- initRMGroup(Reg##R132, Mem##R132);
-
-#define FMA3RMA(R132, R213, R231, M132, M213, M231, Attrs) \
- static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231}; \
- static const uint16_t Mem##R132[3] = {X86::M132, X86::M213, X86::M231}; \
- initRMGroup(Reg##R132, Mem##R132, (Attrs));
-
-#define FMA3R(R132, R213, R231) \
- static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231}; \
- initRGroup(Reg##R132);
-
-#define FMA3RA(R132, R213, R231, Attrs) \
- static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231}; \
- initRGroup(Reg##R132, (Attrs));
-
-#define FMA3M(M132, M213, M231) \
- static const uint16_t Mem##M132[3] = {X86::M132, X86::M213, X86::M231}; \
- initMGroup(Mem##M132);
-
-#define FMA3MA(M132, M213, M231, Attrs) \
- static const uint16_t Mem##M132[3] = {X86::M132, X86::M213, X86::M231}; \
- initMGroup(Mem##M132, (Attrs));
-
-#define FMA3_AVX2_VECTOR_GROUP(Name) \
- FMA3RM(Name##132PSr, Name##213PSr, Name##231PSr, \
- Name##132PSm, Name##213PSm, Name##231PSm); \
- FMA3RM(Name##132PDr, Name##213PDr, Name##231PDr, \
- Name##132PDm, Name##213PDm, Name##231PDm); \
- FMA3RM(Name##132PSYr, Name##213PSYr, Name##231PSYr, \
- Name##132PSYm, Name##213PSYm, Name##231PSYm); \
- FMA3RM(Name##132PDYr, Name##213PDYr, Name##231PDYr, \
- Name##132PDYm, Name##213PDYm, Name##231PDYm);
-
-#define FMA3_AVX2_SCALAR_GROUP(Name) \
- FMA3RM(Name##132SSr, Name##213SSr, Name##231SSr, \
- Name##132SSm, Name##213SSm, Name##231SSm); \
- FMA3RM(Name##132SDr, Name##213SDr, Name##231SDr, \
- Name##132SDm, Name##213SDm, Name##231SDm); \
- FMA3RMA(Name##132SSr_Int, Name##213SSr_Int, Name##231SSr_Int, \
- Name##132SSm_Int, Name##213SSm_Int, Name##231SSm_Int, \
- X86InstrFMA3Group::X86FMA3Intrinsic); \
- FMA3RMA(Name##132SDr_Int, Name##213SDr_Int, Name##231SDr_Int, \
- Name##132SDm_Int, Name##213SDm_Int, Name##231SDm_Int, \
- X86InstrFMA3Group::X86FMA3Intrinsic);
-
-#define FMA3_AVX2_FULL_GROUP(Name) \
- FMA3_AVX2_VECTOR_GROUP(Name); \
- FMA3_AVX2_SCALAR_GROUP(Name);
-
-#define FMA3_AVX512_VECTOR_GROUP(Name) \
- FMA3RM(Name##132PSZ128r, Name##213PSZ128r, Name##231PSZ128r, \
- Name##132PSZ128m, Name##213PSZ128m, Name##231PSZ128m); \
- FMA3RM(Name##132PDZ128r, Name##213PDZ128r, Name##231PDZ128r, \
- Name##132PDZ128m, Name##213PDZ128m, Name##231PDZ128m); \
- FMA3RM(Name##132PSZ256r, Name##213PSZ256r, Name##231PSZ256r, \
- Name##132PSZ256m, Name##213PSZ256m, Name##231PSZ256m); \
- FMA3RM(Name##132PDZ256r, Name##213PDZ256r, Name##231PDZ256r, \
- Name##132PDZ256m, Name##213PDZ256m, Name##231PDZ256m); \
- FMA3RM(Name##132PSZr, Name##213PSZr, Name##231PSZr, \
- Name##132PSZm, Name##213PSZm, Name##231PSZm); \
- FMA3RM(Name##132PDZr, Name##213PDZr, Name##231PDZr, \
- Name##132PDZm, Name##213PDZm, Name##231PDZm); \
- FMA3RMA(Name##132PSZ128rk, Name##213PSZ128rk, Name##231PSZ128rk, \
- Name##132PSZ128mk, Name##213PSZ128mk, Name##231PSZ128mk, \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3RMA(Name##132PDZ128rk, Name##213PDZ128rk, Name##231PDZ128rk, \
- Name##132PDZ128mk, Name##213PDZ128mk, Name##231PDZ128mk, \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3RMA(Name##132PSZ256rk, Name##213PSZ256rk, Name##231PSZ256rk, \
- Name##132PSZ256mk, Name##213PSZ256mk, Name##231PSZ256mk, \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3RMA(Name##132PDZ256rk, Name##213PDZ256rk, Name##231PDZ256rk, \
- Name##132PDZ256mk, Name##213PDZ256mk, Name##231PDZ256mk, \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3RMA(Name##132PSZrk, Name##213PSZrk, Name##231PSZrk, \
- Name##132PSZmk, Name##213PSZmk, Name##231PSZmk, \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3RMA(Name##132PDZrk, Name##213PDZrk, Name##231PDZrk, \
- Name##132PDZmk, Name##213PDZmk, Name##231PDZmk, \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3RMA(Name##132PSZ128rkz, Name##213PSZ128rkz, Name##231PSZ128rkz, \
- Name##132PSZ128mkz, Name##213PSZ128mkz, Name##231PSZ128mkz, \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3RMA(Name##132PDZ128rkz, Name##213PDZ128rkz, Name##231PDZ128rkz, \
- Name##132PDZ128mkz, Name##213PDZ128mkz, Name##231PDZ128mkz, \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3RMA(Name##132PSZ256rkz, Name##213PSZ256rkz, Name##231PSZ256rkz, \
- Name##132PSZ256mkz, Name##213PSZ256mkz, Name##231PSZ256mkz, \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3RMA(Name##132PDZ256rkz, Name##213PDZ256rkz, Name##231PDZ256rkz, \
- Name##132PDZ256mkz, Name##213PDZ256mkz, Name##231PDZ256mkz, \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3RMA(Name##132PSZrkz, Name##213PSZrkz, Name##231PSZrkz, \
- Name##132PSZmkz, Name##213PSZmkz, Name##231PSZmkz, \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3RMA(Name##132PDZrkz, Name##213PDZrkz, Name##231PDZrkz, \
- Name##132PDZmkz, Name##213PDZmkz, Name##231PDZmkz, \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3R(Name##132PSZrb, Name##213PSZrb, Name##231PSZrb); \
- FMA3R(Name##132PDZrb, Name##213PDZrb, Name##231PDZrb); \
- FMA3RA(Name##132PSZrbk, Name##213PSZrbk, Name##231PSZrbk, \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3RA(Name##132PDZrbk, Name##213PDZrbk, Name##231PDZrbk, \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3RA(Name##132PSZrbkz, Name##213PSZrbkz, Name##231PSZrbkz, \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3RA(Name##132PDZrbkz, Name##213PDZrbkz, Name##231PDZrbkz, \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3M(Name##132PSZ128mb, Name##213PSZ128mb, Name##231PSZ128mb); \
- FMA3M(Name##132PDZ128mb, Name##213PDZ128mb, Name##231PDZ128mb); \
- FMA3M(Name##132PSZ256mb, Name##213PSZ256mb, Name##231PSZ256mb); \
- FMA3M(Name##132PDZ256mb, Name##213PDZ256mb, Name##231PDZ256mb); \
- FMA3M(Name##132PSZmb, Name##213PSZmb, Name##231PSZmb); \
- FMA3M(Name##132PDZmb, Name##213PDZmb, Name##231PDZmb); \
- FMA3MA(Name##132PSZ128mbk, Name##213PSZ128mbk, Name##231PSZ128mbk, \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3MA(Name##132PDZ128mbk, Name##213PDZ128mbk, Name##231PDZ128mbk, \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3MA(Name##132PSZ256mbk, Name##213PSZ256mbk, Name##231PSZ256mbk, \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3MA(Name##132PDZ256mbk, Name##213PDZ256mbk, Name##231PDZ256mbk, \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3MA(Name##132PSZmbk, Name##213PSZmbk, Name##231PSZmbk, \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3MA(Name##132PDZmbk, Name##213PDZmbk, Name##231PDZmbk, \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3MA(Name##132PSZ128mbkz, Name##213PSZ128mbkz, Name##231PSZ128mbkz, \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3MA(Name##132PDZ128mbkz, Name##213PDZ128mbkz, Name##231PDZ128mbkz, \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3MA(Name##132PSZ256mbkz, Name##213PSZ256mbkz, Name##231PSZ256mbkz, \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3MA(Name##132PDZ256mbkz, Name##213PDZ256mbkz, Name##231PDZ256mbkz, \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3MA(Name##132PSZmbkz, Name##213PSZmbkz, Name##231PSZmbkz, \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3MA(Name##132PDZmbkz, Name##213PDZmbkz, Name##231PDZmbkz, \
- X86InstrFMA3Group::X86FMA3KZeroMasked);
-
-#define FMA3_AVX512_SCALAR_GROUP(Name) \
- FMA3RM(Name##132SSZr, Name##213SSZr, Name##231SSZr, \
- Name##132SSZm, Name##213SSZm, Name##231SSZm); \
- FMA3RM(Name##132SDZr, Name##213SDZr, Name##231SDZr, \
- Name##132SDZm, Name##213SDZm, Name##231SDZm); \
- FMA3RMA(Name##132SSZr_Int, Name##213SSZr_Int, Name##231SSZr_Int, \
- Name##132SSZm_Int, Name##213SSZm_Int, Name##231SSZm_Int, \
- X86InstrFMA3Group::X86FMA3Intrinsic); \
- FMA3RMA(Name##132SDZr_Int, Name##213SDZr_Int, Name##231SDZr_Int, \
- Name##132SDZm_Int, Name##213SDZm_Int, Name##231SDZm_Int, \
- X86InstrFMA3Group::X86FMA3Intrinsic); \
- FMA3RMA(Name##132SSZr_Intk, Name##213SSZr_Intk, Name##231SSZr_Intk, \
- Name##132SSZm_Intk, Name##213SSZm_Intk, Name##231SSZm_Intk, \
- X86InstrFMA3Group::X86FMA3Intrinsic | \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3RMA(Name##132SDZr_Intk, Name##213SDZr_Intk, Name##231SDZr_Intk, \
- Name##132SDZm_Intk, Name##213SDZm_Intk, Name##231SDZm_Intk, \
- X86InstrFMA3Group::X86FMA3Intrinsic | \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3RMA(Name##132SSZr_Intkz, Name##213SSZr_Intkz, Name##231SSZr_Intkz, \
- Name##132SSZm_Intkz, Name##213SSZm_Intkz, Name##231SSZm_Intkz, \
- X86InstrFMA3Group::X86FMA3Intrinsic | \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3RMA(Name##132SDZr_Intkz, Name##213SDZr_Intkz, Name##231SDZr_Intkz, \
- Name##132SDZm_Intkz, Name##213SDZm_Intkz, Name##231SDZm_Intkz, \
- X86InstrFMA3Group::X86FMA3Intrinsic | \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3RA(Name##132SSZrb_Int, Name##213SSZrb_Int, Name##231SSZrb_Int, \
- X86InstrFMA3Group::X86FMA3Intrinsic); \
- FMA3RA(Name##132SDZrb_Int, Name##213SDZrb_Int, Name##231SDZrb_Int, \
- X86InstrFMA3Group::X86FMA3Intrinsic); \
- FMA3RA(Name##132SSZrb_Intk, Name##213SSZrb_Intk, Name##231SSZrb_Intk, \
- X86InstrFMA3Group::X86FMA3Intrinsic | \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3RA(Name##132SDZrb_Intk, Name##213SDZrb_Intk, Name##231SDZrb_Intk, \
- X86InstrFMA3Group::X86FMA3Intrinsic | \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3RA(Name##132SSZrb_Intkz, Name##213SSZrb_Intkz, Name##231SSZrb_Intkz, \
- X86InstrFMA3Group::X86FMA3Intrinsic | \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3RA(Name##132SDZrb_Intkz, Name##213SDZrb_Intkz, Name##231SDZrb_Intkz, \
- X86InstrFMA3Group::X86FMA3Intrinsic | \
- X86InstrFMA3Group::X86FMA3KZeroMasked);
-
-#define FMA3_AVX512_FULL_GROUP(Name) \
- FMA3_AVX512_VECTOR_GROUP(Name); \
- FMA3_AVX512_SCALAR_GROUP(Name);
-
-void X86InstrFMA3Info::initGroupsOnceImpl() {
- FMA3_AVX2_FULL_GROUP(VFMADD);
- FMA3_AVX2_FULL_GROUP(VFMSUB);
- FMA3_AVX2_FULL_GROUP(VFNMADD);
- FMA3_AVX2_FULL_GROUP(VFNMSUB);
-
- FMA3_AVX2_VECTOR_GROUP(VFMADDSUB);
- FMA3_AVX2_VECTOR_GROUP(VFMSUBADD);
-
- FMA3_AVX512_FULL_GROUP(VFMADD);
- FMA3_AVX512_FULL_GROUP(VFMSUB);
- FMA3_AVX512_FULL_GROUP(VFNMADD);
- FMA3_AVX512_FULL_GROUP(VFNMSUB);
-
- FMA3_AVX512_VECTOR_GROUP(VFMADDSUB);
- FMA3_AVX512_VECTOR_GROUP(VFMSUBADD);
+#define FMA3GROUP(Name, Suf, Attrs) \
+ { { X86::Name##132##Suf, X86::Name##213##Suf, X86::Name##231##Suf }, Attrs },
+
+#define FMA3GROUP_MASKED(Name, Suf, Attrs) \
+ FMA3GROUP(Name, Suf, Attrs) \
+ FMA3GROUP(Name, Suf##k, Attrs | X86InstrFMA3Group::KMergeMasked) \
+ FMA3GROUP(Name, Suf##kz, Attrs | X86InstrFMA3Group::KZeroMasked)
+
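+// A sketch of the expansion, taking VFMADD with the PSZ128r suffix as an
+// example: FMA3GROUP_MASKED(VFMADD, PSZ128r, 0) yields three table entries,
+// one per masking variant:
+//   { { X86::VFMADD132PSZ128r,  X86::VFMADD213PSZ128r,  X86::VFMADD231PSZ128r },  0 },
+//   { { X86::VFMADD132PSZ128rk, X86::VFMADD213PSZ128rk, X86::VFMADD231PSZ128rk }, KMergeMasked },
+//   { { X86::VFMADD132PSZ128rkz, X86::VFMADD213PSZ128rkz, X86::VFMADD231PSZ128rkz }, KZeroMasked },
+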
+#define FMA3GROUP_PACKED_WIDTHS(Name, Suf, Attrs) \
+ FMA3GROUP(Name, Suf##Ym, Attrs) \
+ FMA3GROUP(Name, Suf##Yr, Attrs) \
+ FMA3GROUP_MASKED(Name, Suf##Z128m, Attrs) \
+ FMA3GROUP_MASKED(Name, Suf##Z128r, Attrs) \
+ FMA3GROUP_MASKED(Name, Suf##Z256m, Attrs) \
+ FMA3GROUP_MASKED(Name, Suf##Z256r, Attrs) \
+ FMA3GROUP_MASKED(Name, Suf##Zm, Attrs) \
+ FMA3GROUP_MASKED(Name, Suf##Zr, Attrs) \
+ FMA3GROUP(Name, Suf##m, Attrs) \
+ FMA3GROUP(Name, Suf##r, Attrs)
+
+#define FMA3GROUP_PACKED(Name, Attrs) \
+ FMA3GROUP_PACKED_WIDTHS(Name, PD, Attrs) \
+ FMA3GROUP_PACKED_WIDTHS(Name, PS, Attrs)
+
+#define FMA3GROUP_SCALAR_WIDTHS(Name, Suf, Attrs) \
+ FMA3GROUP(Name, Suf##Zm, Attrs) \
+ FMA3GROUP_MASKED(Name, Suf##Zm_Int, Attrs | X86InstrFMA3Group::Intrinsic) \
+ FMA3GROUP(Name, Suf##Zr, Attrs) \
+ FMA3GROUP_MASKED(Name, Suf##Zr_Int, Attrs | X86InstrFMA3Group::Intrinsic) \
+ FMA3GROUP(Name, Suf##m, Attrs) \
+ FMA3GROUP(Name, Suf##m_Int, Attrs | X86InstrFMA3Group::Intrinsic) \
+ FMA3GROUP(Name, Suf##r, Attrs) \
+ FMA3GROUP(Name, Suf##r_Int, Attrs | X86InstrFMA3Group::Intrinsic)
+
+#define FMA3GROUP_SCALAR(Name, Attrs) \
+ FMA3GROUP_SCALAR_WIDTHS(Name, SD, Attrs) \
+ FMA3GROUP_SCALAR_WIDTHS(Name, SS, Attrs) \
+
+#define FMA3GROUP_FULL(Name, Attrs) \
+ FMA3GROUP_PACKED(Name, Attrs) \
+ FMA3GROUP_SCALAR(Name, Attrs)
+
+static const X86InstrFMA3Group Groups[] = {
+ FMA3GROUP_FULL(VFMADD, 0)
+ FMA3GROUP_PACKED(VFMADDSUB, 0)
+ FMA3GROUP_FULL(VFMSUB, 0)
+ FMA3GROUP_PACKED(VFMSUBADD, 0)
+ FMA3GROUP_FULL(VFNMADD, 0)
+ FMA3GROUP_FULL(VFNMSUB, 0)
+};
+
+#define FMA3GROUP_PACKED_AVX512_WIDTHS(Name, Type, Suf, Attrs) \
+ FMA3GROUP_MASKED(Name, Type##Z128##Suf, Attrs) \
+ FMA3GROUP_MASKED(Name, Type##Z256##Suf, Attrs) \
+ FMA3GROUP_MASKED(Name, Type##Z##Suf, Attrs)
+
+#define FMA3GROUP_PACKED_AVX512(Name, Suf, Attrs) \
+ FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PD, Suf, Attrs) \
+ FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PS, Suf, Attrs)
+
+#define FMA3GROUP_PACKED_AVX512_ROUND(Name, Suf, Attrs) \
+ FMA3GROUP_MASKED(Name, PDZ##Suf, Attrs) \
+ FMA3GROUP_MASKED(Name, PSZ##Suf, Attrs)
+
+#define FMA3GROUP_SCALAR_AVX512_ROUND(Name, Suf, Attrs) \
+ FMA3GROUP(Name, SDZ##Suf, Attrs) \
+ FMA3GROUP_MASKED(Name, SDZ##Suf##_Int, Attrs) \
+ FMA3GROUP(Name, SSZ##Suf, Attrs) \
+ FMA3GROUP_MASKED(Name, SSZ##Suf##_Int, Attrs)
+
+static const X86InstrFMA3Group BroadcastGroups[] = {
+ FMA3GROUP_PACKED_AVX512(VFMADD, mb, 0)
+ FMA3GROUP_PACKED_AVX512(VFMADDSUB, mb, 0)
+ FMA3GROUP_PACKED_AVX512(VFMSUB, mb, 0)
+ FMA3GROUP_PACKED_AVX512(VFMSUBADD, mb, 0)
+ FMA3GROUP_PACKED_AVX512(VFNMADD, mb, 0)
+ FMA3GROUP_PACKED_AVX512(VFNMSUB, mb, 0)
+};
+
+static const X86InstrFMA3Group RoundGroups[] = {
+ FMA3GROUP_PACKED_AVX512_ROUND(VFMADD, rb, 0)
+ FMA3GROUP_SCALAR_AVX512_ROUND(VFMADD, rb, X86InstrFMA3Group::Intrinsic)
+ FMA3GROUP_PACKED_AVX512_ROUND(VFMADDSUB, rb, 0)
+ FMA3GROUP_PACKED_AVX512_ROUND(VFMSUB, rb, 0)
+ FMA3GROUP_SCALAR_AVX512_ROUND(VFMSUB, rb, X86InstrFMA3Group::Intrinsic)
+ FMA3GROUP_PACKED_AVX512_ROUND(VFMSUBADD, rb, 0)
+ FMA3GROUP_PACKED_AVX512_ROUND(VFNMADD, rb, 0)
+ FMA3GROUP_SCALAR_AVX512_ROUND(VFNMADD, rb, X86InstrFMA3Group::Intrinsic)
+ FMA3GROUP_PACKED_AVX512_ROUND(VFNMSUB, rb, 0)
+ FMA3GROUP_SCALAR_AVX512_ROUND(VFNMSUB, rb, X86InstrFMA3Group::Intrinsic)
+};
+
+static void verifyTables() {
+#ifndef NDEBUG
+ static std::atomic<bool> TableChecked(false);
+ if (!TableChecked.load(std::memory_order_relaxed)) {
+ assert(std::is_sorted(std::begin(Groups), std::end(Groups)) &&
+ std::is_sorted(std::begin(RoundGroups), std::end(RoundGroups)) &&
+ std::is_sorted(std::begin(BroadcastGroups),
+ std::end(BroadcastGroups)) &&
+ "FMA3 tables not sorted!");
+ TableChecked.store(true, std::memory_order_relaxed);
+ }
+#endif
}
-void X86InstrFMA3Info::initGroupsOnce() {
- llvm::call_once(InitGroupsOnceFlag,
- []() { getX86InstrFMA3Info()->initGroupsOnceImpl(); });
+/// Returns a pointer to the group of FMA3 opcodes that contains the given
+/// \p Opcode. If the given \p Opcode is not recognized as FMA3 and is not
+/// included in any FMA3 group, then nullptr is returned.
+const X86InstrFMA3Group *llvm::getFMA3Group(unsigned Opcode, uint64_t TSFlags) {
+
+ // FMA3 instructions have a well-defined encoding pattern we can exploit.
+ uint8_t BaseOpcode = X86II::getBaseOpcodeFor(TSFlags);
+ bool IsFMA3 = ((TSFlags & X86II::EncodingMask) == X86II::VEX ||
+ (TSFlags & X86II::EncodingMask) == X86II::EVEX) &&
+ (TSFlags & X86II::OpMapMask) == X86II::T8 &&
+ (TSFlags & X86II::OpPrefixMask) == X86II::PD &&
+ ((BaseOpcode >= 0x96 && BaseOpcode <= 0x9F) ||
+ (BaseOpcode >= 0xA6 && BaseOpcode <= 0xAF) ||
+ (BaseOpcode >= 0xB6 && BaseOpcode <= 0xBF));
+ if (!IsFMA3)
+ return nullptr;
+
+ verifyTables();
+
+ ArrayRef<X86InstrFMA3Group> Table;
+ if (TSFlags & X86II::EVEX_RC)
+ Table = makeArrayRef(RoundGroups);
+ else if (TSFlags & X86II::EVEX_B)
+ Table = makeArrayRef(BroadcastGroups);
+ else
+ Table = makeArrayRef(Groups);
+
+ // FMA 132 instructions have an opcode of 0x96-0x9F
+ // FMA 213 instructions have an opcode of 0xA6-0xAF
+ // FMA 231 instructions have an opcode of 0xB6-0xBF
+ unsigned FormIndex = ((BaseOpcode - 0x90) >> 4) & 0x3;
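+ // For example, VFMADD213PS has base opcode 0xA8, so
+ // ((0xA8 - 0x90) >> 4) & 0x3 == 1, which selects the Form213 slot of the
+ // group's Opcodes array.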
+
+ auto I = std::lower_bound(Table.begin(), Table.end(), Opcode,
+ [FormIndex](const X86InstrFMA3Group &Group,
+ unsigned Opcode) {
+ return Group.Opcodes[FormIndex] < Opcode;
+ });
+ assert(I != Table.end() && I->Opcodes[FormIndex] == Opcode &&
+ "Couldn't find FMA3 opcode!");
+ return I;
}
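// A hedged caller-side sketch (MI is assumed to be a MachineInstr being
// inspected; the other names come from this patch):
//   const X86InstrFMA3Group *Group =
//       getFMA3Group(MI.getOpcode(), MI.getDesc().TSFlags);
//   if (Group && Group->isKMasked())
//     ; // e.g. pick an alternative form via Group->get231Opcode()
// The sorted static tables plus std::lower_bound replace the old lazily
// initialized OpcodeToGroup DenseMap.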
diff --git a/lib/Target/X86/X86InstrFMA3Info.h b/lib/Target/X86/X86InstrFMA3Info.h
index e3568160da46..6eec1db98bf8 100644
--- a/lib/Target/X86/X86InstrFMA3Info.h
+++ b/lib/Target/X86/X86InstrFMA3Info.h
@@ -24,294 +24,78 @@
namespace llvm {
/// This class is used to group {132, 213, 231} forms of FMA opcodes together.
-/// Each of the groups has either 3 register opcodes, 3 memory opcodes,
-/// or 6 register and memory opcodes. Also, each group has an attrubutes field
-/// describing it.
-class X86InstrFMA3Group {
-private:
- /// Reference to an array holding 3 forms of register FMA opcodes.
- /// It may be set to nullptr if the group of FMA opcodes does not have
- /// any register form opcodes.
- const uint16_t *RegOpcodes;
-
- /// Reference to an array holding 3 forms of memory FMA opcodes.
- /// It may be set to nullptr if the group of FMA opcodes does not have
- /// any register form opcodes.
- const uint16_t *MemOpcodes;
+/// Each of the groups holds three opcodes. Also, each group has an attributes
+/// field describing it.
+struct X86InstrFMA3Group {
+ /// An array holding 3 forms of FMA opcodes.
+ uint16_t Opcodes[3];
/// This bitfield specifies the attributes associated with the created
/// FMA groups of opcodes.
- unsigned Attributes;
-
- static const unsigned Form132 = 0;
- static const unsigned Form213 = 1;
- static const unsigned Form231 = 2;
-
-public:
- /// This bit must be set in the 'Attributes' field of FMA group if such
- /// group of FMA opcodes consists of FMA intrinsic opcodes.
- static const unsigned X86FMA3Intrinsic = 0x1;
+ uint16_t Attributes;
- /// This bit must be set in the 'Attributes' field of FMA group if such
- /// group of FMA opcodes consists of AVX512 opcodes accepting a k-mask and
- /// passing the elements from the 1st operand to the result of the operation
- /// when the correpondings bits in the k-mask are unset.
- static const unsigned X86FMA3KMergeMasked = 0x2;
-
- /// This bit must be set in the 'Attributes' field of FMA group if such
- /// group of FMA opcodes consists of AVX512 opcodes accepting a k-zeromask.
- static const unsigned X86FMA3KZeroMasked = 0x4;
-
- /// Constructor. Creates a new group of FMA opcodes with three register form
- /// FMA opcodes \p RegOpcodes and three memory form FMA opcodes \p MemOpcodes.
- /// The parameters \p RegOpcodes and \p MemOpcodes may be set to nullptr,
- /// which means that the created group of FMA opcodes does not have the
- /// corresponding (register or memory) opcodes.
- /// The parameter \p Attr specifies the attributes describing the created
- /// group.
- X86InstrFMA3Group(const uint16_t *RegOpcodes, const uint16_t *MemOpcodes,
- unsigned Attr)
- : RegOpcodes(RegOpcodes), MemOpcodes(MemOpcodes), Attributes(Attr) {
- assert((RegOpcodes || MemOpcodes) &&
- "Cannot create a group not having any opcodes.");
- }
+ enum {
+ Form132,
+ Form213,
+ Form231,
+ };
- /// Returns a memory form opcode that is the equivalent of the given register
- /// form opcode \p RegOpcode. 0 is returned if the group does not have
- /// either register of memory opcodes.
- unsigned getMemOpcode(unsigned RegOpcode) const {
- if (!RegOpcodes || !MemOpcodes)
- return 0;
- for (unsigned Form = 0; Form < 3; Form++)
- if (RegOpcodes[Form] == RegOpcode)
- return MemOpcodes[Form];
- return 0;
- }
+ enum : uint16_t {
+ /// This bit must be set in the 'Attributes' field of FMA group if such
+ /// group of FMA opcodes consists of FMA intrinsic opcodes.
+ Intrinsic = 0x1,
- /// Returns the 132 form of FMA register opcode.
- unsigned getReg132Opcode() const {
- assert(RegOpcodes && "The group does not have register opcodes.");
- return RegOpcodes[Form132];
- }
-
- /// Returns the 213 form of FMA register opcode.
- unsigned getReg213Opcode() const {
- assert(RegOpcodes && "The group does not have register opcodes.");
- return RegOpcodes[Form213];
- }
+ /// This bit must be set in the 'Attributes' field of FMA group if such
+ /// group of FMA opcodes consists of AVX512 opcodes accepting a k-mask and
+ /// passing the elements from the 1st operand to the result of the operation
+ /// when the corresponding bits in the k-mask are unset.
+ KMergeMasked = 0x2,
- /// Returns the 231 form of FMA register opcode.
- unsigned getReg231Opcode() const {
- assert(RegOpcodes && "The group does not have register opcodes.");
- return RegOpcodes[Form231];
- }
+ /// This bit must be set in the 'Attributes' field of FMA group if such
+ /// group of FMA opcodes consists of AVX512 opcodes accepting a k-zeromask.
+ KZeroMasked = 0x4,
+ };
- /// Returns the 132 form of FMA memory opcode.
- unsigned getMem132Opcode() const {
- assert(MemOpcodes && "The group does not have memory opcodes.");
- return MemOpcodes[Form132];
+ /// Returns the 132 form of FMA opcode.
+ unsigned get132Opcode() const {
+ return Opcodes[Form132];
}
- /// Returns the 213 form of FMA memory opcode.
- unsigned getMem213Opcode() const {
- assert(MemOpcodes && "The group does not have memory opcodes.");
- return MemOpcodes[Form213];
+ /// Returns the 213 form of FMA opcode.
+ unsigned get213Opcode() const {
+ return Opcodes[Form213];
}
- /// Returns the 231 form of FMA memory opcode.
- unsigned getMem231Opcode() const {
- assert(MemOpcodes && "The group does not have memory opcodes.");
- return MemOpcodes[Form231];
+ /// Returns the 231 form of FMA opcode.
+ unsigned get231Opcode() const {
+ return Opcodes[Form231];
}
/// Returns true iff the group of FMA opcodes holds intrinsic opcodes.
- bool isIntrinsic() const { return (Attributes & X86FMA3Intrinsic) != 0; }
+ bool isIntrinsic() const { return (Attributes & Intrinsic) != 0; }
/// Returns true iff the group of FMA opcodes holds k-merge-masked opcodes.
bool isKMergeMasked() const {
- return (Attributes & X86FMA3KMergeMasked) != 0;
+ return (Attributes & KMergeMasked) != 0;
}
/// Returns true iff the group of FMA opcodes holds k-zero-masked opcodes.
- bool isKZeroMasked() const { return (Attributes & X86FMA3KZeroMasked) != 0; }
+ bool isKZeroMasked() const { return (Attributes & KZeroMasked) != 0; }
/// Returns true iff the group of FMA opcodes holds any of k-masked opcodes.
bool isKMasked() const {
- return (Attributes & (X86FMA3KMergeMasked | X86FMA3KZeroMasked)) != 0;
- }
-
- /// Returns true iff the given \p Opcode is a register opcode from the
- /// groups of FMA opcodes.
- bool isRegOpcodeFromGroup(unsigned Opcode) const {
- if (!RegOpcodes)
- return false;
- for (unsigned Form = 0; Form < 3; Form++)
- if (Opcode == RegOpcodes[Form])
- return true;
- return false;
+ return (Attributes & (KMergeMasked | KZeroMasked)) != 0;
}
- /// Returns true iff the given \p Opcode is a memory opcode from the
- /// groups of FMA opcodes.
- bool isMemOpcodeFromGroup(unsigned Opcode) const {
- if (!MemOpcodes)
- return false;
- for (unsigned Form = 0; Form < 3; Form++)
- if (Opcode == MemOpcodes[Form])
- return true;
- return false;
+ bool operator<(const X86InstrFMA3Group &RHS) const {
+ return Opcodes[0] < RHS.Opcodes[0];
}
};
-/// This class provides information about all existing FMA3 opcodes
-///
-class X86InstrFMA3Info {
-private:
- /// A map that is used to find the group of FMA opcodes using any FMA opcode
- /// from the group.
- DenseMap<unsigned, const X86InstrFMA3Group *> OpcodeToGroup;
-
- /// Creates groups of FMA opcodes and initializes Opcode-to-Group map.
- /// This method can be called many times, but the actual initialization is
- /// called only once.
- static void initGroupsOnce();
-
- /// Creates groups of FMA opcodes and initializes Opcode-to-Group map.
- /// This method must be called ONLY from initGroupsOnce(). Otherwise, such
- /// call is not thread safe.
- void initGroupsOnceImpl();
-
- /// Creates one group of FMA opcodes having the register opcodes
- /// \p RegOpcodes and memory opcodes \p MemOpcodes. The parameter \p Attr
- /// specifies the attributes describing the created group.
- void initRMGroup(const uint16_t *RegOpcodes,
- const uint16_t *MemOpcodes, unsigned Attr = 0);
-
- /// Creates one group of FMA opcodes having only the register opcodes
- /// \p RegOpcodes. The parameter \p Attr specifies the attributes describing
- /// the created group.
- void initRGroup(const uint16_t *RegOpcodes, unsigned Attr = 0);
-
- /// Creates one group of FMA opcodes having only the memory opcodes
- /// \p MemOpcodes. The parameter \p Attr specifies the attributes describing
- /// the created group.
- void initMGroup(const uint16_t *MemOpcodes, unsigned Attr = 0);
-
-public:
- /// Returns the reference to an object of this class. It is assumed that
- /// only one object may exist.
- static X86InstrFMA3Info *getX86InstrFMA3Info();
-
- /// Constructor. Just creates an object of the class.
- X86InstrFMA3Info() = default;
-
- /// Destructor. Deallocates the memory used for FMA3 Groups.
- ~X86InstrFMA3Info() {
- std::set<const X86InstrFMA3Group *> DeletedGroups;
- auto E = OpcodeToGroup.end();
- for (auto I = OpcodeToGroup.begin(); I != E; I++) {
- const X86InstrFMA3Group *G = I->second;
- if (DeletedGroups.find(G) == DeletedGroups.end()) {
- DeletedGroups.insert(G);
- delete G;
- }
- }
- }
-
- /// Returns a reference to a group of FMA3 opcodes to where the given
- /// \p Opcode is included. If the given \p Opcode is not recognized as FMA3
- /// and not included into any FMA3 group, then nullptr is returned.
- static const X86InstrFMA3Group *getFMA3Group(unsigned Opcode) {
- // Ensure that the groups of opcodes are initialized.
- initGroupsOnce();
-
- // Find the group including the given opcode.
- const X86InstrFMA3Info *FMA3Info = getX86InstrFMA3Info();
- auto I = FMA3Info->OpcodeToGroup.find(Opcode);
- if (I == FMA3Info->OpcodeToGroup.end())
- return nullptr;
-
- return I->second;
- }
-
- /// Returns true iff the given \p Opcode is recognized as FMA3 by this class.
- static bool isFMA3(unsigned Opcode) {
- return getFMA3Group(Opcode) != nullptr;
- }
-
- /// Iterator that is used to walk on FMA register opcodes having memory
- /// form equivalents.
- class rm_iterator {
- private:
- /// Iterator associated with the OpcodeToGroup map. It must always be
- /// initialized with an entry from OpcodeToGroup for which I->first
- /// points to a register FMA opcode and I->second points to a group of
- /// FMA opcodes having memory form equivalent of I->first.
- DenseMap<unsigned, const X86InstrFMA3Group *>::const_iterator I;
-
- public:
- /// Constructor. Creates rm_iterator. The parameter \p I must be an
- /// iterator to OpcodeToGroup map entry having I->first pointing to
- /// register form FMA opcode and I->second pointing to a group of FMA
- /// opcodes holding memory form equivalent for I->fist.
- rm_iterator(DenseMap<unsigned, const X86InstrFMA3Group *>::const_iterator I)
- : I(I) {}
-
- /// Returns the register form FMA opcode.
- unsigned getRegOpcode() const { return I->first; };
-
- /// Returns the memory form equivalent opcode for FMA register opcode
- /// referenced by I->first.
- unsigned getMemOpcode() const {
- unsigned Opcode = I->first;
- const X86InstrFMA3Group *Group = I->second;
- return Group->getMemOpcode(Opcode);
- }
-
- /// Returns a reference to a group of FMA opcodes.
- const X86InstrFMA3Group *getGroup() const { return I->second; }
-
- bool operator==(const rm_iterator &OtherIt) const { return I == OtherIt.I; }
- bool operator!=(const rm_iterator &OtherIt) const { return I != OtherIt.I; }
-
- /// Increment. Advances the 'I' iterator to the next OpcodeToGroup entry
- /// having I->first pointing to register form FMA and I->second pointing
- /// to a group of FMA opcodes holding memory form equivalen for I->first.
- rm_iterator &operator++() {
- auto E = getX86InstrFMA3Info()->OpcodeToGroup.end();
- for (++I; I != E; ++I) {
- unsigned RegOpcode = I->first;
- const X86InstrFMA3Group *Group = I->second;
- if (Group->getMemOpcode(RegOpcode) != 0)
- break;
- }
- return *this;
- }
- };
-
- /// Returns rm_iterator pointing to the first entry of OpcodeToGroup map
- /// with a register FMA opcode having memory form opcode equivalent.
- static rm_iterator rm_begin() {
- initGroupsOnce();
- const X86InstrFMA3Info *FMA3Info = getX86InstrFMA3Info();
- auto I = FMA3Info->OpcodeToGroup.begin();
- auto E = FMA3Info->OpcodeToGroup.end();
- while (I != E) {
- unsigned Opcode = I->first;
- const X86InstrFMA3Group *G = I->second;
- if (G->getMemOpcode(Opcode) != 0)
- break;
- I++;
- }
- return rm_iterator(I);
- }
-
- /// Returns the last rm_iterator.
- static rm_iterator rm_end() {
- initGroupsOnce();
- return rm_iterator(getX86InstrFMA3Info()->OpcodeToGroup.end());
- }
-};
+/// Returns a pointer to the group of FMA3 opcodes that contains the given
+/// \p Opcode. If the given \p Opcode is not recognized as FMA3, and thus is
+/// not included in any FMA3 group, nullptr is returned.
+const X86InstrFMA3Group *getFMA3Group(unsigned Opcode, uint64_t TSFlags);
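+
+// Illustrative sketch only (not part of the original header): one way a caller
+// could combine getFMA3Group() with the accessors above. The helper name and
+// the MachineInstr parameter are hypothetical and exist only for this example.
+//
+//   static unsigned select231Form(const MachineInstr &MI) {
+//     const X86InstrFMA3Group *Group =
+//         getFMA3Group(MI.getOpcode(), MI.getDesc().TSFlags);
+//     if (!Group)
+//       return MI.getOpcode();      // not an FMA3 opcode; leave it unchanged
+//     // All three forms differ only in operand order; pick the 231 form.
+//     return Group->get231Opcode();
+//   }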
} // end namespace llvm
diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td
index 619b399ef8d8..cc81a919ec99 100644
--- a/lib/Target/X86/X86InstrFPStack.td
+++ b/lib/Target/X86/X86InstrFPStack.td
@@ -118,12 +118,10 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
// f32 instructions can use SSE1 and are predicated on FPStackf32 == !SSE1.
// f64 instructions can use SSE2 and are predicated on FPStackf64 == !SSE2.
// f80 instructions cannot use SSE and use neither of these.
-class FpIf32<dag outs, dag ins, FPFormat fp, list<dag> pattern,
- InstrItinClass itin = NoItinerary> :
- FpI_<outs, ins, fp, pattern, itin>, Requires<[FPStackf32]>;
-class FpIf64<dag outs, dag ins, FPFormat fp, list<dag> pattern,
- InstrItinClass itin = NoItinerary> :
- FpI_<outs, ins, fp, pattern, itin>, Requires<[FPStackf64]>;
+class FpIf32<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32]>;
+class FpIf64<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64]>;
// Factoring for arithmetic.
multiclass FPBinary_rr<SDNode OpNode> {
@@ -279,6 +277,8 @@ def SUB_FPrST0 : FPrST0PInst<MRM5r, "fsub{r}p\t$op">;
def SUB_FST0r : FPST0rInst <MRM4r, "fsub\t$op">;
def SUBR_FrST0 : FPrST0Inst <MRM4r, "fsub{|r}\t{%st(0), $op|$op, st(0)}">;
def SUBR_FPrST0 : FPrST0PInst<MRM4r, "fsub{|r}p\t$op">;
+} // SchedRW
+let SchedRW = [WriteFCom] in {
def COM_FST0r : FPST0rInst <MRM2r, "fcom\t$op">;
def COMP_FST0r : FPST0rInst <MRM3r, "fcomp\t$op">;
} // SchedRW
@@ -297,46 +297,45 @@ def DIVR_FPrST0 : FPrST0PInst<MRM6r, "fdiv{|r}p\t$op">;
} // SchedRW
// Unary operations.
-multiclass FPUnary<SDNode OpNode, Format fp, string asmstring,
- InstrItinClass itin> {
+multiclass FPUnary<SDNode OpNode, Format fp, string asmstring> {
def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), OneArgFPRW,
- [(set RFP32:$dst, (OpNode RFP32:$src))], itin>;
+ [(set RFP32:$dst, (OpNode RFP32:$src))]>;
def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src), OneArgFPRW,
- [(set RFP64:$dst, (OpNode RFP64:$src))], itin>;
+ [(set RFP64:$dst, (OpNode RFP64:$src))]>;
def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW,
- [(set RFP80:$dst, (OpNode RFP80:$src))], itin>;
-def _F : FPI<0xD9, fp, (outs), (ins), asmstring, itin>;
+ [(set RFP80:$dst, (OpNode RFP80:$src))]>;
+def _F : FPI<0xD9, fp, (outs), (ins), asmstring>;
}
let Defs = [FPSW] in {
-let SchedRW = [WriteVecLogic] in {
-defm CHS : FPUnary<fneg, MRM_E0, "fchs", IIC_FSIGN>;
-defm ABS : FPUnary<fabs, MRM_E1, "fabs", IIC_FSIGN>;
+let SchedRW = [WriteFSign] in {
+defm CHS : FPUnary<fneg, MRM_E0, "fchs">;
+defm ABS : FPUnary<fabs, MRM_E1, "fabs">;
}
-let SchedRW = [WriteFSqrt] in
-defm SQRT: FPUnary<fsqrt,MRM_FA, "fsqrt", IIC_FSQRT>;
+let SchedRW = [WriteFSqrt80] in
+defm SQRT: FPUnary<fsqrt,MRM_FA, "fsqrt">;
let SchedRW = [WriteMicrocoded] in {
-defm SIN : FPUnary<fsin, MRM_FE, "fsin", IIC_FSINCOS>;
-defm COS : FPUnary<fcos, MRM_FF, "fcos", IIC_FSINCOS>;
+defm SIN : FPUnary<fsin, MRM_FE, "fsin">;
+defm COS : FPUnary<fcos, MRM_FF, "fcos">;
}
-let SchedRW = [WriteFAdd] in {
+let SchedRW = [WriteFCom] in {
let hasSideEffects = 0 in {
def TST_Fp32 : FpIf32<(outs), (ins RFP32:$src), OneArgFP, []>;
def TST_Fp64 : FpIf64<(outs), (ins RFP64:$src), OneArgFP, []>;
def TST_Fp80 : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>;
} // hasSideEffects
-def TST_F : FPI<0xD9, MRM_E4, (outs), (ins), "ftst", IIC_FCOMI>;
+def TST_F : FPI<0xD9, MRM_E4, (outs), (ins), "ftst">;
} // SchedRW
} // Defs = [FPSW]
// Versions of FP instructions that take a single memory operand. Added for the
// disassembler; remove as they are included with patterns elsewhere.
-let SchedRW = [WriteFAddLd] in {
+let SchedRW = [WriteFComLd] in {
def FCOM32m : FPI<0xD8, MRM2m, (outs), (ins f32mem:$src), "fcom{s}\t$src">;
def FCOMP32m : FPI<0xD8, MRM3m, (outs), (ins f32mem:$src), "fcomp{s}\t$src">;
@@ -363,31 +362,29 @@ def FBSTPm : FPI<0xDF, MRM6m, (outs), (ins f80mem:$dst), "fbstp\t$dst">;
} // SchedRW
// Floating point cmovs.
-class FpIf32CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern,
- InstrItinClass itin> :
- FpI_<outs, ins, fp, pattern, itin>, Requires<[FPStackf32, HasCMov]>;
-class FpIf64CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern,
- InstrItinClass itin> :
- FpI_<outs, ins, fp, pattern, itin>, Requires<[FPStackf64, HasCMov]>;
+class FpIf32CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32, HasCMov]>;
+class FpIf64CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64, HasCMov]>;
multiclass FPCMov<PatLeaf cc> {
def _Fp32 : FpIf32CMov<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2),
CondMovFP,
[(set RFP32:$dst, (X86cmov RFP32:$src1, RFP32:$src2,
- cc, EFLAGS))], IIC_FCMOV>;
+ cc, EFLAGS))]>;
def _Fp64 : FpIf64CMov<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2),
CondMovFP,
[(set RFP64:$dst, (X86cmov RFP64:$src1, RFP64:$src2,
- cc, EFLAGS))], IIC_FCMOV>;
+ cc, EFLAGS))]>;
def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2),
CondMovFP,
[(set RFP80:$dst, (X86cmov RFP80:$src1, RFP80:$src2,
- cc, EFLAGS))], IIC_FCMOV>,
+ cc, EFLAGS))]>,
Requires<[HasCMov]>;
}
let Defs = [FPSW] in {
-let SchedRW = [WriteFAdd] in {
+let SchedRW = [WriteFCMOV] in {
let Uses = [EFLAGS], Constraints = "$src1 = $dst" in {
defm CMOVB : FPCMov<X86_COND_B>;
defm CMOVBE : FPCMov<X86_COND_BE>;
@@ -402,21 +399,21 @@ defm CMOVNP : FPCMov<X86_COND_NP>;
let Predicates = [HasCMov] in {
// These are not factored because there's no clean way to pass DA/DB.
def CMOVB_F : FPI<0xDA, MRM0r, (outs), (ins RST:$op),
- "fcmovb\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
+ "fcmovb\t{$op, %st(0)|st(0), $op}">;
def CMOVBE_F : FPI<0xDA, MRM2r, (outs), (ins RST:$op),
- "fcmovbe\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
+ "fcmovbe\t{$op, %st(0)|st(0), $op}">;
def CMOVE_F : FPI<0xDA, MRM1r, (outs), (ins RST:$op),
- "fcmove\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
+ "fcmove\t{$op, %st(0)|st(0), $op}">;
def CMOVP_F : FPI<0xDA, MRM3r, (outs), (ins RST:$op),
- "fcmovu\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
+ "fcmovu\t{$op, %st(0)|st(0), $op}">;
def CMOVNB_F : FPI<0xDB, MRM0r, (outs), (ins RST:$op),
- "fcmovnb\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
+ "fcmovnb\t{$op, %st(0)|st(0), $op}">;
def CMOVNBE_F: FPI<0xDB, MRM2r, (outs), (ins RST:$op),
- "fcmovnbe\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
+ "fcmovnbe\t{$op, %st(0)|st(0), $op}">;
def CMOVNE_F : FPI<0xDB, MRM1r, (outs), (ins RST:$op),
- "fcmovne\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
+ "fcmovne\t{$op, %st(0)|st(0), $op}">;
def CMOVNP_F : FPI<0xDB, MRM3r, (outs), (ins RST:$op),
- "fcmovnu\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
+ "fcmovnu\t{$op, %st(0)|st(0), $op}">;
} // Predicates = [HasCMov]
} // SchedRW
@@ -495,40 +492,24 @@ def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>;
} // SchedRW
let mayLoad = 1, SchedRW = [WriteLoad] in {
-def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src",
- IIC_FLD>;
-def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src",
- IIC_FLD>;
-def LD_F80m : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src",
- IIC_FLD80>;
-def ILD_F16m : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src",
- IIC_FILD>;
-def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src",
- IIC_FILD>;
-def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src",
- IIC_FILD>;
+def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src">;
+def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src">;
+def LD_F80m : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src">;
+def ILD_F16m : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src">;
+def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src">;
+def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src">;
}
let mayStore = 1, SchedRW = [WriteStore] in {
-def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst",
- IIC_FST>;
-def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst",
- IIC_FST>;
-def ST_FP32m : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst",
- IIC_FST>;
-def ST_FP64m : FPI<0xDD, MRM3m, (outs), (ins f64mem:$dst), "fstp{l}\t$dst",
- IIC_FST>;
-def ST_FP80m : FPI<0xDB, MRM7m, (outs), (ins f80mem:$dst), "fstp{t}\t$dst",
- IIC_FST80>;
-def IST_F16m : FPI<0xDF, MRM2m, (outs), (ins i16mem:$dst), "fist{s}\t$dst",
- IIC_FIST>;
-def IST_F32m : FPI<0xDB, MRM2m, (outs), (ins i32mem:$dst), "fist{l}\t$dst",
- IIC_FIST>;
-def IST_FP16m : FPI<0xDF, MRM3m, (outs), (ins i16mem:$dst), "fistp{s}\t$dst",
- IIC_FIST>;
-def IST_FP32m : FPI<0xDB, MRM3m, (outs), (ins i32mem:$dst), "fistp{l}\t$dst",
- IIC_FIST>;
-def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst",
- IIC_FIST>;
+def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst">;
+def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst">;
+def ST_FP32m : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst">;
+def ST_FP64m : FPI<0xDD, MRM3m, (outs), (ins f64mem:$dst), "fstp{l}\t$dst">;
+def ST_FP80m : FPI<0xDB, MRM7m, (outs), (ins f80mem:$dst), "fstp{t}\t$dst">;
+def IST_F16m : FPI<0xDF, MRM2m, (outs), (ins i16mem:$dst), "fist{s}\t$dst">;
+def IST_F32m : FPI<0xDB, MRM2m, (outs), (ins i32mem:$dst), "fist{l}\t$dst">;
+def IST_FP16m : FPI<0xDF, MRM3m, (outs), (ins i16mem:$dst), "fistp{s}\t$dst">;
+def IST_FP32m : FPI<0xDB, MRM3m, (outs), (ins i32mem:$dst), "fistp{l}\t$dst">;
+def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst">;
}
// FISTTP requires SSE3 even though it's a FPStack op.
@@ -554,20 +535,17 @@ def ISTT_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP,
} // Predicates = [HasSSE3]
let mayStore = 1, SchedRW = [WriteStore] in {
-def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst",
- IIC_FST>;
-def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst",
- IIC_FST>;
-def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst),
- "fisttp{ll}\t$dst", IIC_FST>;
+def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst">;
+def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst">;
+def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst), "fisttp{ll}\t$dst">;
}
// FP Stack manipulation instructions.
let SchedRW = [WriteMove] in {
-def LD_Frr : FPI<0xD9, MRM0r, (outs), (ins RST:$op), "fld\t$op", IIC_FLD>;
-def ST_Frr : FPI<0xDD, MRM2r, (outs), (ins RST:$op), "fst\t$op", IIC_FST>;
-def ST_FPrr : FPI<0xDD, MRM3r, (outs), (ins RST:$op), "fstp\t$op", IIC_FST>;
-def XCH_F : FPI<0xD9, MRM1r, (outs), (ins RST:$op), "fxch\t$op", IIC_FXCH>;
+def LD_Frr : FPI<0xD9, MRM0r, (outs), (ins RST:$op), "fld\t$op">;
+def ST_Frr : FPI<0xDD, MRM2r, (outs), (ins RST:$op), "fst\t$op">;
+def ST_FPrr : FPI<0xDD, MRM3r, (outs), (ins RST:$op), "fstp\t$op">;
+def XCH_F : FPI<0xD9, MRM1r, (outs), (ins RST:$op), "fxch\t$op">;
}
// Floating point constant loads.
@@ -586,13 +564,22 @@ def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
[(set RFP80:$dst, fpimm1)]>;
}
-let SchedRW = [WriteZero] in {
-def LD_F0 : FPI<0xD9, MRM_EE, (outs), (ins), "fldz", IIC_FLDZ>;
-def LD_F1 : FPI<0xD9, MRM_E8, (outs), (ins), "fld1", IIC_FIST>;
-}
+let SchedRW = [WriteFLD0] in
+def LD_F0 : FPI<0xD9, MRM_EE, (outs), (ins), "fldz">;
+
+let SchedRW = [WriteFLD1] in
+def LD_F1 : FPI<0xD9, MRM_E8, (outs), (ins), "fld1">;
+
+let SchedRW = [WriteFLDC], Defs = [FPSW] in {
+def FLDL2T : I<0xD9, MRM_E9, (outs), (ins), "fldl2t", []>;
+def FLDL2E : I<0xD9, MRM_EA, (outs), (ins), "fldl2e", []>;
+def FLDPI : I<0xD9, MRM_EB, (outs), (ins), "fldpi", []>;
+def FLDLG2 : I<0xD9, MRM_EC, (outs), (ins), "fldlg2", []>;
+def FLDLN2 : I<0xD9, MRM_ED, (outs), (ins), "fldln2", []>;
+} // SchedRW
// Floating point compares.
-let SchedRW = [WriteFAdd] in {
+let SchedRW = [WriteFCom] in {
def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
[(set FPSW, (trunc (X86cmp RFP32:$lhs, RFP32:$rhs)))]>;
def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
@@ -602,7 +589,7 @@ def UCOM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
} // SchedRW
} // Defs = [FPSW]
-let SchedRW = [WriteFAdd] in {
+let SchedRW = [WriteFCom] in {
// CC = ST(0) cmp ST(i)
let Defs = [EFLAGS, FPSW] in {
def UCOM_FpIr32: FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
@@ -615,25 +602,23 @@ def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
let Defs = [FPSW], Uses = [ST0] in {
def UCOM_Fr : FPI<0xDD, MRM4r, // FPSW = cmp ST(0) with ST(i)
- (outs), (ins RST:$reg), "fucom\t$reg", IIC_FUCOM>;
+ (outs), (ins RST:$reg), "fucom\t$reg">;
def UCOM_FPr : FPI<0xDD, MRM5r, // FPSW = cmp ST(0) with ST(i), pop
- (outs), (ins RST:$reg), "fucomp\t$reg", IIC_FUCOM>;
+ (outs), (ins RST:$reg), "fucomp\t$reg">;
def UCOM_FPPr : FPI<0xDA, MRM_E9, // cmp ST(0) with ST(1), pop, pop
- (outs), (ins), "fucompp", IIC_FUCOM>;
+ (outs), (ins), "fucompp">;
}
let Defs = [EFLAGS, FPSW], Uses = [ST0] in {
def UCOM_FIr : FPI<0xDB, MRM5r, // CC = cmp ST(0) with ST(i)
- (outs), (ins RST:$reg), "fucomi\t$reg", IIC_FUCOMI>;
+ (outs), (ins RST:$reg), "fucomi\t$reg">;
def UCOM_FIPr : FPI<0xDF, MRM5r, // CC = cmp ST(0) with ST(i), pop
- (outs), (ins RST:$reg), "fucompi\t$reg", IIC_FUCOMI>;
+ (outs), (ins RST:$reg), "fucompi\t$reg">;
}
let Defs = [EFLAGS, FPSW] in {
-def COM_FIr : FPI<0xDB, MRM6r, (outs), (ins RST:$reg),
- "fcomi\t$reg", IIC_FCOMI>;
-def COM_FIPr : FPI<0xDF, MRM6r, (outs), (ins RST:$reg),
- "fcompi\t$reg", IIC_FCOMI>;
+def COM_FIr : FPI<0xDB, MRM6r, (outs), (ins RST:$reg), "fcomi\t$reg">;
+def COM_FIPr : FPI<0xDF, MRM6r, (outs), (ins RST:$reg), "fcompi\t$reg">;
}
} // SchedRW
@@ -642,71 +627,64 @@ let SchedRW = [WriteALU] in {
let Defs = [AX], Uses = [FPSW] in
def FNSTSW16r : I<0xDF, MRM_E0, // AX = fp flags
(outs), (ins), "fnstsw\t{%ax|ax}",
- [(set AX, (X86fp_stsw FPSW))], IIC_FNSTSW>;
+ [(set AX, (X86fp_stsw FPSW))]>;
let Defs = [FPSW] in
def FNSTCW16m : I<0xD9, MRM7m, // [mem16] = X87 control word
(outs), (ins i16mem:$dst), "fnstcw\t$dst",
- [(X86fp_cwd_get16 addr:$dst)], IIC_FNSTCW>;
+ [(X86fp_cwd_get16 addr:$dst)]>;
} // SchedRW
let Defs = [FPSW], mayLoad = 1 in
def FLDCW16m : I<0xD9, MRM5m, // X87 control word = [mem16]
- (outs), (ins i16mem:$dst), "fldcw\t$dst", [], IIC_FLDCW>,
+ (outs), (ins i16mem:$dst), "fldcw\t$dst", []>,
Sched<[WriteLoad]>;
// FPU control instructions
let SchedRW = [WriteMicrocoded] in {
let Defs = [FPSW] in {
-def FNINIT : I<0xDB, MRM_E3, (outs), (ins), "fninit", [], IIC_FNINIT>;
-def FFREE : FPI<0xDD, MRM0r, (outs), (ins RST:$reg),
- "ffree\t$reg", IIC_FFREE>;
-def FFREEP : FPI<0xDF, MRM0r, (outs), (ins RST:$reg),
- "ffreep\t$reg", IIC_FFREE>;
+def FNINIT : I<0xDB, MRM_E3, (outs), (ins), "fninit", []>;
+def FFREE : FPI<0xDD, MRM0r, (outs), (ins RST:$reg), "ffree\t$reg">;
+def FFREEP : FPI<0xDF, MRM0r, (outs), (ins RST:$reg), "ffreep\t$reg">;
// Clear exceptions
-def FNCLEX : I<0xDB, MRM_E2, (outs), (ins), "fnclex", [], IIC_FNCLEX>;
+def FNCLEX : I<0xDB, MRM_E2, (outs), (ins), "fnclex", []>;
} // Defs = [FPSW]
} // SchedRW
-// Operandless floating-point instructions for the disassembler.
-let SchedRW = [WriteMicrocoded] in {
-def FNOP : I<0xD9, MRM_D0, (outs), (ins), "fnop", [], IIC_FNOP>;
+// Operand-less floating-point instructions for the disassembler.
+def FNOP : I<0xD9, MRM_D0, (outs), (ins), "fnop", []>, Sched<[WriteNop]>;
+let SchedRW = [WriteMicrocoded] in {
let Defs = [FPSW] in {
-def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", [], IIC_WAIT>;
-def FXAM : I<0xD9, MRM_E5, (outs), (ins), "fxam", [], IIC_FXAM>;
-def FLDL2T : I<0xD9, MRM_E9, (outs), (ins), "fldl2t", [], IIC_FLDL>;
-def FLDL2E : I<0xD9, MRM_EA, (outs), (ins), "fldl2e", [], IIC_FLDL>;
-def FLDPI : I<0xD9, MRM_EB, (outs), (ins), "fldpi", [], IIC_FLDL>;
-def FLDLG2 : I<0xD9, MRM_EC, (outs), (ins), "fldlg2", [], IIC_FLDL>;
-def FLDLN2 : I<0xD9, MRM_ED, (outs), (ins), "fldln2", [], IIC_FLDL>;
-def F2XM1 : I<0xD9, MRM_F0, (outs), (ins), "f2xm1", [], IIC_F2XM1>;
-def FYL2X : I<0xD9, MRM_F1, (outs), (ins), "fyl2x", [], IIC_FYL2X>;
-def FPTAN : I<0xD9, MRM_F2, (outs), (ins), "fptan", [], IIC_FPTAN>;
-def FPATAN : I<0xD9, MRM_F3, (outs), (ins), "fpatan", [], IIC_FPATAN>;
-def FXTRACT : I<0xD9, MRM_F4, (outs), (ins), "fxtract", [], IIC_FXTRACT>;
-def FPREM1 : I<0xD9, MRM_F5, (outs), (ins), "fprem1", [], IIC_FPREM1>;
-def FDECSTP : I<0xD9, MRM_F6, (outs), (ins), "fdecstp", [], IIC_FPSTP>;
-def FINCSTP : I<0xD9, MRM_F7, (outs), (ins), "fincstp", [], IIC_FPSTP>;
-def FPREM : I<0xD9, MRM_F8, (outs), (ins), "fprem", [], IIC_FPREM>;
-def FYL2XP1 : I<0xD9, MRM_F9, (outs), (ins), "fyl2xp1", [], IIC_FYL2XP1>;
-def FSINCOS : I<0xD9, MRM_FB, (outs), (ins), "fsincos", [], IIC_FSINCOS>;
-def FRNDINT : I<0xD9, MRM_FC, (outs), (ins), "frndint", [], IIC_FRNDINT>;
-def FSCALE : I<0xD9, MRM_FD, (outs), (ins), "fscale", [], IIC_FSCALE>;
-def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", [], IIC_FCOMPP>;
+def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", []>;
+def FXAM : I<0xD9, MRM_E5, (outs), (ins), "fxam", []>;
+def F2XM1 : I<0xD9, MRM_F0, (outs), (ins), "f2xm1", []>;
+def FYL2X : I<0xD9, MRM_F1, (outs), (ins), "fyl2x", []>;
+def FPTAN : I<0xD9, MRM_F2, (outs), (ins), "fptan", []>;
+def FPATAN : I<0xD9, MRM_F3, (outs), (ins), "fpatan", []>;
+def FXTRACT : I<0xD9, MRM_F4, (outs), (ins), "fxtract", []>;
+def FPREM1 : I<0xD9, MRM_F5, (outs), (ins), "fprem1", []>;
+def FDECSTP : I<0xD9, MRM_F6, (outs), (ins), "fdecstp", []>;
+def FINCSTP : I<0xD9, MRM_F7, (outs), (ins), "fincstp", []>;
+def FPREM : I<0xD9, MRM_F8, (outs), (ins), "fprem", []>;
+def FYL2XP1 : I<0xD9, MRM_F9, (outs), (ins), "fyl2xp1", []>;
+def FSINCOS : I<0xD9, MRM_FB, (outs), (ins), "fsincos", []>;
+def FRNDINT : I<0xD9, MRM_FC, (outs), (ins), "frndint", []>;
+def FSCALE : I<0xD9, MRM_FD, (outs), (ins), "fscale", []>;
+def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", []>;
} // Defs = [FPSW]
-def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaque512mem:$dst),
- "fxsave\t$dst", [(int_x86_fxsave addr:$dst)], IIC_FXSAVE>, TB,
+def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaquemem:$dst),
+ "fxsave\t$dst", [(int_x86_fxsave addr:$dst)]>, TB,
Requires<[HasFXSR]>;
-def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaque512mem:$dst),
- "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)],
- IIC_FXSAVE>, TB, Requires<[HasFXSR, In64BitMode]>;
-def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
- "fxrstor\t$src", [(int_x86_fxrstor addr:$src)], IIC_FXRSTOR>,
+def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaquemem:$dst),
+ "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)]>,
+ TB, Requires<[HasFXSR, In64BitMode]>;
+def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaquemem:$src),
+ "fxrstor\t$src", [(int_x86_fxrstor addr:$src)]>,
TB, Requires<[HasFXSR]>;
-def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
- "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)],
- IIC_FXRSTOR>, TB, Requires<[HasFXSR, In64BitMode]>;
+def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaquemem:$src),
+ "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)]>,
+ TB, Requires<[HasFXSR, In64BitMode]>;
} // SchedRW
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86InstrFoldTables.cpp b/lib/Target/X86/X86InstrFoldTables.cpp
new file mode 100644
index 000000000000..5d8400595bfa
--- /dev/null
+++ b/lib/Target/X86/X86InstrFoldTables.cpp
@@ -0,0 +1,5412 @@
+//===-- X86InstrFoldTables.cpp - X86 Instruction Folding Tables -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 memory folding tables.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrFoldTables.h"
+#include "X86InstrInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include <vector>
+
+using namespace llvm;
+
+// These tables are sorted by their RegOp value, allowing them to be binary
+// searched at runtime without the need for additional storage. The enum values
+// are currently emitted in X86GenInstrInfo.inc in alphabetical order, which
+// makes keeping these tables sorted a simple matter of alphabetizing them.
+//
+// We also have a tablegen emitter that tries to autogenerate these tables
+// by comparing encoding information. This can be enabled by passing
+// X86_GEN_FOLD_TABLES=ON to cmake, which will produce X86GenFoldTables.inc
+// in the build area. There are currently some bugs in the autogenerated table
+// that require a manual review before entries are copied from it into this
+// table. It is unclear if we will ever be able to fully automate this, because
+// as new instructions are added into holes in the X86 opcode map they can
+// potentially pair up with old instructions and create new entries in the
+// tables that would be incorrect. The manual review process gives us a chance
+// to catch these before they become observable bugs.
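+//
+// Illustrative sketch only (not part of this file): since each table is sorted
+// by its register-opcode key, a lookup can be a plain binary search. The field
+// name "KeyOp", the helper name, and the use of llvm::ArrayRef plus
+// std::lower_bound (<algorithm>) are assumptions for this example; the real
+// lookup code may differ.
+//
+//   static const X86MemoryFoldTableEntry *
+//   lookupFoldTableEntry(ArrayRef<X86MemoryFoldTableEntry> Table,
+//                        unsigned RegOp) {
+//     const X86MemoryFoldTableEntry *I = std::lower_bound(
+//         Table.begin(), Table.end(), RegOp,
+//         [](const X86MemoryFoldTableEntry &E, unsigned Op) {
+//           return E.KeyOp < Op;      // compare on the sorted register opcode
+//         });
+//     if (I != Table.end() && I->KeyOp == RegOp)
+//       return I;                     // I->DstOp is the memory-form opcode
+//     return nullptr;                 // RegOp has no entry in this table
+//   }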
+static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = {
+ { X86::ADC16ri, X86::ADC16mi, 0 },
+ { X86::ADC16ri8, X86::ADC16mi8, 0 },
+ { X86::ADC16rr, X86::ADC16mr, 0 },
+ { X86::ADC32ri, X86::ADC32mi, 0 },
+ { X86::ADC32ri8, X86::ADC32mi8, 0 },
+ { X86::ADC32rr, X86::ADC32mr, 0 },
+ { X86::ADC64ri32, X86::ADC64mi32, 0 },
+ { X86::ADC64ri8, X86::ADC64mi8, 0 },
+ { X86::ADC64rr, X86::ADC64mr, 0 },
+ { X86::ADC8ri, X86::ADC8mi, 0 },
+ { X86::ADC8ri8, X86::ADC8mi8, 0 },
+ { X86::ADC8rr, X86::ADC8mr, 0 },
+ { X86::ADD16ri, X86::ADD16mi, 0 },
+ { X86::ADD16ri8, X86::ADD16mi8, 0 },
+ { X86::ADD16ri8_DB, X86::ADD16mi8, TB_NO_REVERSE },
+ { X86::ADD16ri_DB, X86::ADD16mi, TB_NO_REVERSE },
+ { X86::ADD16rr, X86::ADD16mr, 0 },
+ { X86::ADD16rr_DB, X86::ADD16mr, TB_NO_REVERSE },
+ { X86::ADD32ri, X86::ADD32mi, 0 },
+ { X86::ADD32ri8, X86::ADD32mi8, 0 },
+ { X86::ADD32ri8_DB, X86::ADD32mi8, TB_NO_REVERSE },
+ { X86::ADD32ri_DB, X86::ADD32mi, TB_NO_REVERSE },
+ { X86::ADD32rr, X86::ADD32mr, 0 },
+ { X86::ADD32rr_DB, X86::ADD32mr, TB_NO_REVERSE },
+ { X86::ADD64ri32, X86::ADD64mi32, 0 },
+ { X86::ADD64ri32_DB, X86::ADD64mi32, TB_NO_REVERSE },
+ { X86::ADD64ri8, X86::ADD64mi8, 0 },
+ { X86::ADD64ri8_DB, X86::ADD64mi8, TB_NO_REVERSE },
+ { X86::ADD64rr, X86::ADD64mr, 0 },
+ { X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE },
+ { X86::ADD8ri, X86::ADD8mi, 0 },
+ { X86::ADD8ri8, X86::ADD8mi8, 0 },
+ { X86::ADD8rr, X86::ADD8mr, 0 },
+ { X86::AND16ri, X86::AND16mi, 0 },
+ { X86::AND16ri8, X86::AND16mi8, 0 },
+ { X86::AND16rr, X86::AND16mr, 0 },
+ { X86::AND32ri, X86::AND32mi, 0 },
+ { X86::AND32ri8, X86::AND32mi8, 0 },
+ { X86::AND32rr, X86::AND32mr, 0 },
+ { X86::AND64ri32, X86::AND64mi32, 0 },
+ { X86::AND64ri8, X86::AND64mi8, 0 },
+ { X86::AND64rr, X86::AND64mr, 0 },
+ { X86::AND8ri, X86::AND8mi, 0 },
+ { X86::AND8ri8, X86::AND8mi8, 0 },
+ { X86::AND8rr, X86::AND8mr, 0 },
+ { X86::BTC16ri8, X86::BTC16mi8, 0 },
+ { X86::BTC32ri8, X86::BTC32mi8, 0 },
+ { X86::BTC64ri8, X86::BTC64mi8, 0 },
+ { X86::BTR16ri8, X86::BTR16mi8, 0 },
+ { X86::BTR32ri8, X86::BTR32mi8, 0 },
+ { X86::BTR64ri8, X86::BTR64mi8, 0 },
+ { X86::BTS16ri8, X86::BTS16mi8, 0 },
+ { X86::BTS32ri8, X86::BTS32mi8, 0 },
+ { X86::BTS64ri8, X86::BTS64mi8, 0 },
+ { X86::DEC16r, X86::DEC16m, 0 },
+ { X86::DEC32r, X86::DEC32m, 0 },
+ { X86::DEC64r, X86::DEC64m, 0 },
+ { X86::DEC8r, X86::DEC8m, 0 },
+ { X86::INC16r, X86::INC16m, 0 },
+ { X86::INC32r, X86::INC32m, 0 },
+ { X86::INC64r, X86::INC64m, 0 },
+ { X86::INC8r, X86::INC8m, 0 },
+ { X86::NEG16r, X86::NEG16m, 0 },
+ { X86::NEG32r, X86::NEG32m, 0 },
+ { X86::NEG64r, X86::NEG64m, 0 },
+ { X86::NEG8r, X86::NEG8m, 0 },
+ { X86::NOT16r, X86::NOT16m, 0 },
+ { X86::NOT32r, X86::NOT32m, 0 },
+ { X86::NOT64r, X86::NOT64m, 0 },
+ { X86::NOT8r, X86::NOT8m, 0 },
+ { X86::OR16ri, X86::OR16mi, 0 },
+ { X86::OR16ri8, X86::OR16mi8, 0 },
+ { X86::OR16rr, X86::OR16mr, 0 },
+ { X86::OR32ri, X86::OR32mi, 0 },
+ { X86::OR32ri8, X86::OR32mi8, 0 },
+ { X86::OR32rr, X86::OR32mr, 0 },
+ { X86::OR64ri32, X86::OR64mi32, 0 },
+ { X86::OR64ri8, X86::OR64mi8, 0 },
+ { X86::OR64rr, X86::OR64mr, 0 },
+ { X86::OR8ri, X86::OR8mi, 0 },
+ { X86::OR8ri8, X86::OR8mi8, 0 },
+ { X86::OR8rr, X86::OR8mr, 0 },
+ { X86::RCL16r1, X86::RCL16m1, 0 },
+ { X86::RCL16rCL, X86::RCL16mCL, 0 },
+ { X86::RCL16ri, X86::RCL16mi, 0 },
+ { X86::RCL32r1, X86::RCL32m1, 0 },
+ { X86::RCL32rCL, X86::RCL32mCL, 0 },
+ { X86::RCL32ri, X86::RCL32mi, 0 },
+ { X86::RCL64r1, X86::RCL64m1, 0 },
+ { X86::RCL64rCL, X86::RCL64mCL, 0 },
+ { X86::RCL64ri, X86::RCL64mi, 0 },
+ { X86::RCL8r1, X86::RCL8m1, 0 },
+ { X86::RCL8rCL, X86::RCL8mCL, 0 },
+ { X86::RCL8ri, X86::RCL8mi, 0 },
+ { X86::RCR16r1, X86::RCR16m1, 0 },
+ { X86::RCR16rCL, X86::RCR16mCL, 0 },
+ { X86::RCR16ri, X86::RCR16mi, 0 },
+ { X86::RCR32r1, X86::RCR32m1, 0 },
+ { X86::RCR32rCL, X86::RCR32mCL, 0 },
+ { X86::RCR32ri, X86::RCR32mi, 0 },
+ { X86::RCR64r1, X86::RCR64m1, 0 },
+ { X86::RCR64rCL, X86::RCR64mCL, 0 },
+ { X86::RCR64ri, X86::RCR64mi, 0 },
+ { X86::RCR8r1, X86::RCR8m1, 0 },
+ { X86::RCR8rCL, X86::RCR8mCL, 0 },
+ { X86::RCR8ri, X86::RCR8mi, 0 },
+ { X86::ROL16r1, X86::ROL16m1, 0 },
+ { X86::ROL16rCL, X86::ROL16mCL, 0 },
+ { X86::ROL16ri, X86::ROL16mi, 0 },
+ { X86::ROL32r1, X86::ROL32m1, 0 },
+ { X86::ROL32rCL, X86::ROL32mCL, 0 },
+ { X86::ROL32ri, X86::ROL32mi, 0 },
+ { X86::ROL64r1, X86::ROL64m1, 0 },
+ { X86::ROL64rCL, X86::ROL64mCL, 0 },
+ { X86::ROL64ri, X86::ROL64mi, 0 },
+ { X86::ROL8r1, X86::ROL8m1, 0 },
+ { X86::ROL8rCL, X86::ROL8mCL, 0 },
+ { X86::ROL8ri, X86::ROL8mi, 0 },
+ { X86::ROR16r1, X86::ROR16m1, 0 },
+ { X86::ROR16rCL, X86::ROR16mCL, 0 },
+ { X86::ROR16ri, X86::ROR16mi, 0 },
+ { X86::ROR32r1, X86::ROR32m1, 0 },
+ { X86::ROR32rCL, X86::ROR32mCL, 0 },
+ { X86::ROR32ri, X86::ROR32mi, 0 },
+ { X86::ROR64r1, X86::ROR64m1, 0 },
+ { X86::ROR64rCL, X86::ROR64mCL, 0 },
+ { X86::ROR64ri, X86::ROR64mi, 0 },
+ { X86::ROR8r1, X86::ROR8m1, 0 },
+ { X86::ROR8rCL, X86::ROR8mCL, 0 },
+ { X86::ROR8ri, X86::ROR8mi, 0 },
+ { X86::SAR16r1, X86::SAR16m1, 0 },
+ { X86::SAR16rCL, X86::SAR16mCL, 0 },
+ { X86::SAR16ri, X86::SAR16mi, 0 },
+ { X86::SAR32r1, X86::SAR32m1, 0 },
+ { X86::SAR32rCL, X86::SAR32mCL, 0 },
+ { X86::SAR32ri, X86::SAR32mi, 0 },
+ { X86::SAR64r1, X86::SAR64m1, 0 },
+ { X86::SAR64rCL, X86::SAR64mCL, 0 },
+ { X86::SAR64ri, X86::SAR64mi, 0 },
+ { X86::SAR8r1, X86::SAR8m1, 0 },
+ { X86::SAR8rCL, X86::SAR8mCL, 0 },
+ { X86::SAR8ri, X86::SAR8mi, 0 },
+ { X86::SBB16ri, X86::SBB16mi, 0 },
+ { X86::SBB16ri8, X86::SBB16mi8, 0 },
+ { X86::SBB16rr, X86::SBB16mr, 0 },
+ { X86::SBB32ri, X86::SBB32mi, 0 },
+ { X86::SBB32ri8, X86::SBB32mi8, 0 },
+ { X86::SBB32rr, X86::SBB32mr, 0 },
+ { X86::SBB64ri32, X86::SBB64mi32, 0 },
+ { X86::SBB64ri8, X86::SBB64mi8, 0 },
+ { X86::SBB64rr, X86::SBB64mr, 0 },
+ { X86::SBB8ri, X86::SBB8mi, 0 },
+ { X86::SBB8ri8, X86::SBB8mi8, 0 },
+ { X86::SBB8rr, X86::SBB8mr, 0 },
+ { X86::SHL16r1, X86::SHL16m1, 0 },
+ { X86::SHL16rCL, X86::SHL16mCL, 0 },
+ { X86::SHL16ri, X86::SHL16mi, 0 },
+ { X86::SHL32r1, X86::SHL32m1, 0 },
+ { X86::SHL32rCL, X86::SHL32mCL, 0 },
+ { X86::SHL32ri, X86::SHL32mi, 0 },
+ { X86::SHL64r1, X86::SHL64m1, 0 },
+ { X86::SHL64rCL, X86::SHL64mCL, 0 },
+ { X86::SHL64ri, X86::SHL64mi, 0 },
+ { X86::SHL8r1, X86::SHL8m1, 0 },
+ { X86::SHL8rCL, X86::SHL8mCL, 0 },
+ { X86::SHL8ri, X86::SHL8mi, 0 },
+ { X86::SHLD16rrCL, X86::SHLD16mrCL, 0 },
+ { X86::SHLD16rri8, X86::SHLD16mri8, 0 },
+ { X86::SHLD32rrCL, X86::SHLD32mrCL, 0 },
+ { X86::SHLD32rri8, X86::SHLD32mri8, 0 },
+ { X86::SHLD64rrCL, X86::SHLD64mrCL, 0 },
+ { X86::SHLD64rri8, X86::SHLD64mri8, 0 },
+ { X86::SHR16r1, X86::SHR16m1, 0 },
+ { X86::SHR16rCL, X86::SHR16mCL, 0 },
+ { X86::SHR16ri, X86::SHR16mi, 0 },
+ { X86::SHR32r1, X86::SHR32m1, 0 },
+ { X86::SHR32rCL, X86::SHR32mCL, 0 },
+ { X86::SHR32ri, X86::SHR32mi, 0 },
+ { X86::SHR64r1, X86::SHR64m1, 0 },
+ { X86::SHR64rCL, X86::SHR64mCL, 0 },
+ { X86::SHR64ri, X86::SHR64mi, 0 },
+ { X86::SHR8r1, X86::SHR8m1, 0 },
+ { X86::SHR8rCL, X86::SHR8mCL, 0 },
+ { X86::SHR8ri, X86::SHR8mi, 0 },
+ { X86::SHRD16rrCL, X86::SHRD16mrCL, 0 },
+ { X86::SHRD16rri8, X86::SHRD16mri8, 0 },
+ { X86::SHRD32rrCL, X86::SHRD32mrCL, 0 },
+ { X86::SHRD32rri8, X86::SHRD32mri8, 0 },
+ { X86::SHRD64rrCL, X86::SHRD64mrCL, 0 },
+ { X86::SHRD64rri8, X86::SHRD64mri8, 0 },
+ { X86::SUB16ri, X86::SUB16mi, 0 },
+ { X86::SUB16ri8, X86::SUB16mi8, 0 },
+ { X86::SUB16rr, X86::SUB16mr, 0 },
+ { X86::SUB32ri, X86::SUB32mi, 0 },
+ { X86::SUB32ri8, X86::SUB32mi8, 0 },
+ { X86::SUB32rr, X86::SUB32mr, 0 },
+ { X86::SUB64ri32, X86::SUB64mi32, 0 },
+ { X86::SUB64ri8, X86::SUB64mi8, 0 },
+ { X86::SUB64rr, X86::SUB64mr, 0 },
+ { X86::SUB8ri, X86::SUB8mi, 0 },
+ { X86::SUB8ri8, X86::SUB8mi8, 0 },
+ { X86::SUB8rr, X86::SUB8mr, 0 },
+ { X86::XOR16ri, X86::XOR16mi, 0 },
+ { X86::XOR16ri8, X86::XOR16mi8, 0 },
+ { X86::XOR16rr, X86::XOR16mr, 0 },
+ { X86::XOR32ri, X86::XOR32mi, 0 },
+ { X86::XOR32ri8, X86::XOR32mi8, 0 },
+ { X86::XOR32rr, X86::XOR32mr, 0 },
+ { X86::XOR64ri32, X86::XOR64mi32, 0 },
+ { X86::XOR64ri8, X86::XOR64mi8, 0 },
+ { X86::XOR64rr, X86::XOR64mr, 0 },
+ { X86::XOR8ri, X86::XOR8mi, 0 },
+ { X86::XOR8ri8, X86::XOR8mi8, 0 },
+ { X86::XOR8rr, X86::XOR8mr, 0 }
+};
+
+static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
+ { X86::BT16ri8, X86::BT16mi8, TB_FOLDED_LOAD },
+ { X86::BT32ri8, X86::BT32mi8, TB_FOLDED_LOAD },
+ { X86::BT64ri8, X86::BT64mi8, TB_FOLDED_LOAD },
+ { X86::CALL16r, X86::CALL16m, TB_FOLDED_LOAD },
+ { X86::CALL16r_NT, X86::CALL16m_NT, TB_FOLDED_LOAD },
+ { X86::CALL32r, X86::CALL32m, TB_FOLDED_LOAD },
+ { X86::CALL32r_NT, X86::CALL32m_NT, TB_FOLDED_LOAD },
+ { X86::CALL64r, X86::CALL64m, TB_FOLDED_LOAD },
+ { X86::CALL64r_NT, X86::CALL64m_NT, TB_FOLDED_LOAD },
+ { X86::CMP16ri, X86::CMP16mi, TB_FOLDED_LOAD },
+ { X86::CMP16ri8, X86::CMP16mi8, TB_FOLDED_LOAD },
+ { X86::CMP16rr, X86::CMP16mr, TB_FOLDED_LOAD },
+ { X86::CMP32ri, X86::CMP32mi, TB_FOLDED_LOAD },
+ { X86::CMP32ri8, X86::CMP32mi8, TB_FOLDED_LOAD },
+ { X86::CMP32rr, X86::CMP32mr, TB_FOLDED_LOAD },
+ { X86::CMP64ri32, X86::CMP64mi32, TB_FOLDED_LOAD },
+ { X86::CMP64ri8, X86::CMP64mi8, TB_FOLDED_LOAD },
+ { X86::CMP64rr, X86::CMP64mr, TB_FOLDED_LOAD },
+ { X86::CMP8ri, X86::CMP8mi, TB_FOLDED_LOAD },
+ { X86::CMP8ri8, X86::CMP8mi8, TB_FOLDED_LOAD },
+ { X86::CMP8rr, X86::CMP8mr, TB_FOLDED_LOAD },
+ { X86::DIV16r, X86::DIV16m, TB_FOLDED_LOAD },
+ { X86::DIV32r, X86::DIV32m, TB_FOLDED_LOAD },
+ { X86::DIV64r, X86::DIV64m, TB_FOLDED_LOAD },
+ { X86::DIV8r, X86::DIV8m, TB_FOLDED_LOAD },
+ { X86::EXTRACTPSrr, X86::EXTRACTPSmr, TB_FOLDED_STORE },
+ { X86::IDIV16r, X86::IDIV16m, TB_FOLDED_LOAD },
+ { X86::IDIV32r, X86::IDIV32m, TB_FOLDED_LOAD },
+ { X86::IDIV64r, X86::IDIV64m, TB_FOLDED_LOAD },
+ { X86::IDIV8r, X86::IDIV8m, TB_FOLDED_LOAD },
+ { X86::IMUL16r, X86::IMUL16m, TB_FOLDED_LOAD },
+ { X86::IMUL32r, X86::IMUL32m, TB_FOLDED_LOAD },
+ { X86::IMUL64r, X86::IMUL64m, TB_FOLDED_LOAD },
+ { X86::IMUL8r, X86::IMUL8m, TB_FOLDED_LOAD },
+ { X86::JMP16r, X86::JMP16m, TB_FOLDED_LOAD },
+ { X86::JMP16r_NT, X86::JMP16m_NT, TB_FOLDED_LOAD },
+ { X86::JMP32r, X86::JMP32m, TB_FOLDED_LOAD },
+ { X86::JMP32r_NT, X86::JMP32m_NT, TB_FOLDED_LOAD },
+ { X86::JMP64r, X86::JMP64m, TB_FOLDED_LOAD },
+ { X86::JMP64r_NT, X86::JMP64m_NT, TB_FOLDED_LOAD },
+ { X86::MOV16ri, X86::MOV16mi, TB_FOLDED_STORE },
+ { X86::MOV16rr, X86::MOV16mr, TB_FOLDED_STORE },
+ { X86::MOV32ri, X86::MOV32mi, TB_FOLDED_STORE },
+ { X86::MOV32rr, X86::MOV32mr, TB_FOLDED_STORE },
+ { X86::MOV64ri32, X86::MOV64mi32, TB_FOLDED_STORE },
+ { X86::MOV64rr, X86::MOV64mr, TB_FOLDED_STORE },
+ { X86::MOV8ri, X86::MOV8mi, TB_FOLDED_STORE },
+ { X86::MOV8rr, X86::MOV8mr, TB_FOLDED_STORE },
+ { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE },
+ { X86::MOVAPDrr, X86::MOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::MOVAPSrr, X86::MOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::MOVDQUrr, X86::MOVDQUmr, TB_FOLDED_STORE },
+ { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE },
+ { X86::MOVPQIto64rr, X86::MOVPQI2QImr, TB_FOLDED_STORE },
+ { X86::MOVSDto64rr, X86::MOVSDto64mr, TB_FOLDED_STORE },
+ { X86::MOVSS2DIrr, X86::MOVSS2DImr, TB_FOLDED_STORE },
+ { X86::MOVUPDrr, X86::MOVUPDmr, TB_FOLDED_STORE },
+ { X86::MOVUPSrr, X86::MOVUPSmr, TB_FOLDED_STORE },
+ { X86::MUL16r, X86::MUL16m, TB_FOLDED_LOAD },
+ { X86::MUL32r, X86::MUL32m, TB_FOLDED_LOAD },
+ { X86::MUL64r, X86::MUL64m, TB_FOLDED_LOAD },
+ { X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD },
+ { X86::PEXTRDrr, X86::PEXTRDmr, TB_FOLDED_STORE },
+ { X86::PEXTRQrr, X86::PEXTRQmr, TB_FOLDED_STORE },
+ { X86::PTWRITE64r, X86::PTWRITE64m, TB_FOLDED_LOAD },
+ { X86::PTWRITEr, X86::PTWRITEm, TB_FOLDED_LOAD },
+ { X86::PUSH16r, X86::PUSH16rmm, TB_FOLDED_LOAD },
+ { X86::PUSH32r, X86::PUSH32rmm, TB_FOLDED_LOAD },
+ { X86::PUSH64r, X86::PUSH64rmm, TB_FOLDED_LOAD },
+ { X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE },
+ { X86::SETAr, X86::SETAm, TB_FOLDED_STORE },
+ { X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE },
+ { X86::SETBr, X86::SETBm, TB_FOLDED_STORE },
+ { X86::SETEr, X86::SETEm, TB_FOLDED_STORE },
+ { X86::SETGEr, X86::SETGEm, TB_FOLDED_STORE },
+ { X86::SETGr, X86::SETGm, TB_FOLDED_STORE },
+ { X86::SETLEr, X86::SETLEm, TB_FOLDED_STORE },
+ { X86::SETLr, X86::SETLm, TB_FOLDED_STORE },
+ { X86::SETNEr, X86::SETNEm, TB_FOLDED_STORE },
+ { X86::SETNOr, X86::SETNOm, TB_FOLDED_STORE },
+ { X86::SETNPr, X86::SETNPm, TB_FOLDED_STORE },
+ { X86::SETNSr, X86::SETNSm, TB_FOLDED_STORE },
+ { X86::SETOr, X86::SETOm, TB_FOLDED_STORE },
+ { X86::SETPr, X86::SETPm, TB_FOLDED_STORE },
+ { X86::SETSr, X86::SETSm, TB_FOLDED_STORE },
+ { X86::TAILJMPr, X86::TAILJMPm, TB_FOLDED_LOAD },
+ { X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD },
+ { X86::TAILJMPr64_REX, X86::TAILJMPm64_REX, TB_FOLDED_LOAD },
+ { X86::TCRETURNri, X86::TCRETURNmi, TB_FOLDED_LOAD | TB_NO_FORWARD },
+ { X86::TCRETURNri64, X86::TCRETURNmi64, TB_FOLDED_LOAD | TB_NO_FORWARD },
+ { X86::TEST16ri, X86::TEST16mi, TB_FOLDED_LOAD },
+ { X86::TEST16rr, X86::TEST16mr, TB_FOLDED_LOAD },
+ { X86::TEST32ri, X86::TEST32mi, TB_FOLDED_LOAD },
+ { X86::TEST32rr, X86::TEST32mr, TB_FOLDED_LOAD },
+ { X86::TEST64ri32, X86::TEST64mi32, TB_FOLDED_LOAD },
+ { X86::TEST64rr, X86::TEST64mr, TB_FOLDED_LOAD },
+ { X86::TEST8ri, X86::TEST8mi, TB_FOLDED_LOAD },
+ { X86::TEST8rr, X86::TEST8mr, TB_FOLDED_LOAD },
+ { X86::VCVTPS2PHYrr, X86::VCVTPS2PHYmr, TB_FOLDED_STORE },
+ { X86::VCVTPS2PHZ256rr, X86::VCVTPS2PHZ256mr, TB_FOLDED_STORE },
+ { X86::VCVTPS2PHZrr, X86::VCVTPS2PHZmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF32x4Z256rr, X86::VEXTRACTF32x4Z256mr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF32x4Zrr, X86::VEXTRACTF32x4Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF32x8Zrr, X86::VEXTRACTF32x8Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF64x2Z256rr, X86::VEXTRACTF64x2Z256mr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF64x2Zrr, X86::VEXTRACTF64x2Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF64x4Zrr, X86::VEXTRACTF64x4Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI32x4Z256rr, X86::VEXTRACTI32x4Z256mr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI32x4Zrr, X86::VEXTRACTI32x4Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI32x8Zrr, X86::VEXTRACTI32x8Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI64x2Z256rr, X86::VEXTRACTI64x2Z256mr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI64x2Zrr, X86::VEXTRACTI64x2Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI64x4Zrr, X86::VEXTRACTI64x4Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTPSZrr, X86::VEXTRACTPSZmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTPSrr, X86::VEXTRACTPSmr, TB_FOLDED_STORE },
+ { X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVAPDZrr, X86::VMOVAPDZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
+ { X86::VMOVAPDrr, X86::VMOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVAPSYrr, X86::VMOVAPSYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
+ { X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zmr, TB_FOLDED_STORE | TB_ALIGN_64 },
+ { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zmr, TB_FOLDED_STORE | TB_ALIGN_64 },
+ { X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVDQArr, X86::VMOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zmr, TB_FOLDED_STORE },
+ { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zmr, TB_FOLDED_STORE },
+ { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zmr, TB_FOLDED_STORE },
+ { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zmr, TB_FOLDED_STORE },
+ { X86::VMOVDQUYrr, X86::VMOVDQUYmr, TB_FOLDED_STORE },
+ { X86::VMOVDQUrr, X86::VMOVDQUmr, TB_FOLDED_STORE },
+ { X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE },
+ { X86::VMOVPDI2DIrr, X86::VMOVPDI2DImr, TB_FOLDED_STORE },
+ { X86::VMOVPQIto64Zrr, X86::VMOVPQI2QIZmr, TB_FOLDED_STORE },
+ { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr, TB_FOLDED_STORE },
+ { X86::VMOVSDto64Zrr, X86::VMOVSDto64Zmr, TB_FOLDED_STORE },
+ { X86::VMOVSDto64rr, X86::VMOVSDto64mr, TB_FOLDED_STORE },
+ { X86::VMOVSS2DIZrr, X86::VMOVSS2DIZmr, TB_FOLDED_STORE },
+ { X86::VMOVSS2DIrr, X86::VMOVSS2DImr, TB_FOLDED_STORE },
+ { X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE },
+ { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128mr, TB_FOLDED_STORE },
+ { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256mr, TB_FOLDED_STORE },
+ { X86::VMOVUPDZrr, X86::VMOVUPDZmr, TB_FOLDED_STORE },
+ { X86::VMOVUPDrr, X86::VMOVUPDmr, TB_FOLDED_STORE },
+ { X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE },
+ { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128mr, TB_FOLDED_STORE },
+ { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256mr, TB_FOLDED_STORE },
+ { X86::VMOVUPSZrr, X86::VMOVUPSZmr, TB_FOLDED_STORE },
+ { X86::VMOVUPSrr, X86::VMOVUPSmr, TB_FOLDED_STORE },
+ { X86::VPEXTRDZrr, X86::VPEXTRDZmr, TB_FOLDED_STORE },
+ { X86::VPEXTRDrr, X86::VPEXTRDmr, TB_FOLDED_STORE },
+ { X86::VPEXTRQZrr, X86::VPEXTRQZmr, TB_FOLDED_STORE },
+ { X86::VPEXTRQrr, X86::VPEXTRQmr, TB_FOLDED_STORE },
+ { X86::VPMOVDBZrr, X86::VPMOVDBZmr, TB_FOLDED_STORE },
+ { X86::VPMOVDWZ256rr, X86::VPMOVDWZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVDWZrr, X86::VPMOVDWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVQDZ256rr, X86::VPMOVQDZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVQDZrr, X86::VPMOVQDZmr, TB_FOLDED_STORE },
+ { X86::VPMOVQWZrr, X86::VPMOVQWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVSDBZrr, X86::VPMOVSDBZmr, TB_FOLDED_STORE },
+ { X86::VPMOVSDWZ256rr, X86::VPMOVSDWZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVSDWZrr, X86::VPMOVSDWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVSQDZ256rr, X86::VPMOVSQDZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVSQDZrr, X86::VPMOVSQDZmr, TB_FOLDED_STORE },
+ { X86::VPMOVSQWZrr, X86::VPMOVSQWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVSWBZ256rr, X86::VPMOVSWBZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVSWBZrr, X86::VPMOVSWBZmr, TB_FOLDED_STORE },
+ { X86::VPMOVUSDBZrr, X86::VPMOVUSDBZmr, TB_FOLDED_STORE },
+ { X86::VPMOVUSDWZ256rr, X86::VPMOVUSDWZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVUSDWZrr, X86::VPMOVUSDWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVUSQDZ256rr, X86::VPMOVUSQDZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVUSQDZrr, X86::VPMOVUSQDZmr, TB_FOLDED_STORE },
+ { X86::VPMOVUSQWZrr, X86::VPMOVUSQWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVUSWBZ256rr, X86::VPMOVUSWBZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVUSWBZrr, X86::VPMOVUSWBZmr, TB_FOLDED_STORE },
+ { X86::VPMOVWBZ256rr, X86::VPMOVWBZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVWBZrr, X86::VPMOVWBZmr, TB_FOLDED_STORE },
+};
+
+static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
+ { X86::AESIMCrr, X86::AESIMCrm, TB_ALIGN_16 },
+ { X86::AESKEYGENASSIST128rr, X86::AESKEYGENASSIST128rm, TB_ALIGN_16 },
+ { X86::BEXTR32rr, X86::BEXTR32rm, 0 },
+ { X86::BEXTR64rr, X86::BEXTR64rm, 0 },
+ { X86::BEXTRI32ri, X86::BEXTRI32mi, 0 },
+ { X86::BEXTRI64ri, X86::BEXTRI64mi, 0 },
+ { X86::BLCFILL32rr, X86::BLCFILL32rm, 0 },
+ { X86::BLCFILL64rr, X86::BLCFILL64rm, 0 },
+ { X86::BLCI32rr, X86::BLCI32rm, 0 },
+ { X86::BLCI64rr, X86::BLCI64rm, 0 },
+ { X86::BLCIC32rr, X86::BLCIC32rm, 0 },
+ { X86::BLCIC64rr, X86::BLCIC64rm, 0 },
+ { X86::BLCMSK32rr, X86::BLCMSK32rm, 0 },
+ { X86::BLCMSK64rr, X86::BLCMSK64rm, 0 },
+ { X86::BLCS32rr, X86::BLCS32rm, 0 },
+ { X86::BLCS64rr, X86::BLCS64rm, 0 },
+ { X86::BLSFILL32rr, X86::BLSFILL32rm, 0 },
+ { X86::BLSFILL64rr, X86::BLSFILL64rm, 0 },
+ { X86::BLSI32rr, X86::BLSI32rm, 0 },
+ { X86::BLSI64rr, X86::BLSI64rm, 0 },
+ { X86::BLSIC32rr, X86::BLSIC32rm, 0 },
+ { X86::BLSIC64rr, X86::BLSIC64rm, 0 },
+ { X86::BLSMSK32rr, X86::BLSMSK32rm, 0 },
+ { X86::BLSMSK64rr, X86::BLSMSK64rm, 0 },
+ { X86::BLSR32rr, X86::BLSR32rm, 0 },
+ { X86::BLSR64rr, X86::BLSR64rm, 0 },
+ { X86::BSF16rr, X86::BSF16rm, 0 },
+ { X86::BSF32rr, X86::BSF32rm, 0 },
+ { X86::BSF64rr, X86::BSF64rm, 0 },
+ { X86::BSR16rr, X86::BSR16rm, 0 },
+ { X86::BSR32rr, X86::BSR32rm, 0 },
+ { X86::BSR64rr, X86::BSR64rm, 0 },
+ { X86::BZHI32rr, X86::BZHI32rm, 0 },
+ { X86::BZHI64rr, X86::BZHI64rm, 0 },
+ { X86::CMP16rr, X86::CMP16rm, 0 },
+ { X86::CMP32rr, X86::CMP32rm, 0 },
+ { X86::CMP64rr, X86::CMP64rm, 0 },
+ { X86::CMP8rr, X86::CMP8rm, 0 },
+ { X86::COMISDrr, X86::COMISDrm, 0 },
+ { X86::COMISDrr_Int, X86::COMISDrm_Int, TB_NO_REVERSE },
+ { X86::COMISSrr, X86::COMISSrm, 0 },
+ { X86::COMISSrr_Int, X86::COMISSrm_Int, TB_NO_REVERSE },
+ { X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_NO_REVERSE },
+ { X86::CVTDQ2PSrr, X86::CVTDQ2PSrm, TB_ALIGN_16 },
+ { X86::CVTPD2DQrr, X86::CVTPD2DQrm, TB_ALIGN_16 },
+ { X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 },
+ { X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 },
+ { X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_NO_REVERSE },
+ { X86::CVTSD2SI64rr_Int, X86::CVTSD2SI64rm_Int, TB_NO_REVERSE },
+ { X86::CVTSD2SIrr_Int, X86::CVTSD2SIrm_Int, TB_NO_REVERSE },
+ { X86::CVTSD2SSrr, X86::CVTSD2SSrm, 0 },
+ { X86::CVTSI2SDrr, X86::CVTSI2SDrm, 0 },
+ { X86::CVTSI2SSrr, X86::CVTSI2SSrm, 0 },
+ { X86::CVTSI642SDrr, X86::CVTSI642SDrm, 0 },
+ { X86::CVTSI642SSrr, X86::CVTSI642SSrm, 0 },
+ { X86::CVTSS2SDrr, X86::CVTSS2SDrm, 0 },
+ { X86::CVTSS2SI64rr_Int, X86::CVTSS2SI64rm_Int, TB_NO_REVERSE },
+ { X86::CVTSS2SIrr_Int, X86::CVTSS2SIrm_Int, TB_NO_REVERSE },
+ { X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 },
+ { X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 },
+ { X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm, 0 },
+ { X86::CVTTSD2SI64rr_Int, X86::CVTTSD2SI64rm_Int, TB_NO_REVERSE },
+ { X86::CVTTSD2SIrr, X86::CVTTSD2SIrm, 0 },
+ { X86::CVTTSD2SIrr_Int, X86::CVTTSD2SIrm_Int, TB_NO_REVERSE },
+ { X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm, 0 },
+ { X86::CVTTSS2SI64rr_Int, X86::CVTTSS2SI64rm_Int, TB_NO_REVERSE },
+ { X86::CVTTSS2SIrr, X86::CVTTSS2SIrm, 0 },
+ { X86::CVTTSS2SIrr_Int, X86::CVTTSS2SIrm_Int, TB_NO_REVERSE },
+ { X86::IMUL16rri, X86::IMUL16rmi, 0 },
+ { X86::IMUL16rri8, X86::IMUL16rmi8, 0 },
+ { X86::IMUL32rri, X86::IMUL32rmi, 0 },
+ { X86::IMUL32rri8, X86::IMUL32rmi8, 0 },
+ { X86::IMUL64rri32, X86::IMUL64rmi32, 0 },
+ { X86::IMUL64rri8, X86::IMUL64rmi8, 0 },
+ { X86::LWPINS32rri, X86::LWPINS32rmi, 0 },
+ { X86::LWPINS64rri, X86::LWPINS64rmi, 0 },
+ { X86::LWPVAL32rri, X86::LWPVAL32rmi, 0 },
+ { X86::LWPVAL64rri, X86::LWPVAL64rmi, 0 },
+ { X86::LZCNT16rr, X86::LZCNT16rm, 0 },
+ { X86::LZCNT32rr, X86::LZCNT32rm, 0 },
+ { X86::LZCNT64rr, X86::LZCNT64rm, 0 },
+ { X86::MMX_CVTPD2PIirr, X86::MMX_CVTPD2PIirm, TB_ALIGN_16 },
+ { X86::MMX_CVTPI2PDirr, X86::MMX_CVTPI2PDirm, 0 },
+ { X86::MMX_CVTPS2PIirr, X86::MMX_CVTPS2PIirm, TB_NO_REVERSE },
+ { X86::MMX_CVTTPD2PIirr, X86::MMX_CVTTPD2PIirm, TB_ALIGN_16 },
+ { X86::MMX_CVTTPS2PIirr, X86::MMX_CVTTPS2PIirm, TB_NO_REVERSE },
+ { X86::MMX_MOVD64to64rr, X86::MMX_MOVQ64rm, 0 },
+ { X86::MMX_PABSBrr, X86::MMX_PABSBrm, 0 },
+ { X86::MMX_PABSDrr, X86::MMX_PABSDrm, 0 },
+ { X86::MMX_PABSWrr, X86::MMX_PABSWrm, 0 },
+ { X86::MMX_PSHUFWri, X86::MMX_PSHUFWmi, 0 },
+ { X86::MOV16rr, X86::MOV16rm, 0 },
+ { X86::MOV32rr, X86::MOV32rm, 0 },
+ { X86::MOV64rr, X86::MOV64rm, 0 },
+ { X86::MOV64toPQIrr, X86::MOVQI2PQIrm, 0 },
+ { X86::MOV64toSDrr, X86::MOV64toSDrm, 0 },
+ { X86::MOV8rr, X86::MOV8rm, 0 },
+ { X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 },
+ { X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 },
+ { X86::MOVDDUPrr, X86::MOVDDUPrm, TB_NO_REVERSE },
+ { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 },
+ { X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 },
+ { X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 },
+ { X86::MOVDQUrr, X86::MOVDQUrm, 0 },
+ { X86::MOVSHDUPrr, X86::MOVSHDUPrm, TB_ALIGN_16 },
+ { X86::MOVSLDUPrr, X86::MOVSLDUPrm, TB_ALIGN_16 },
+ { X86::MOVSX16rr8, X86::MOVSX16rm8, 0 },
+ { X86::MOVSX32rr16, X86::MOVSX32rm16, 0 },
+ { X86::MOVSX32rr8, X86::MOVSX32rm8, 0 },
+ { X86::MOVSX32rr8_NOREX, X86::MOVSX32rm8_NOREX, 0 },
+ { X86::MOVSX64rr16, X86::MOVSX64rm16, 0 },
+ { X86::MOVSX64rr32, X86::MOVSX64rm32, 0 },
+ { X86::MOVSX64rr8, X86::MOVSX64rm8, 0 },
+ { X86::MOVUPDrr, X86::MOVUPDrm, 0 },
+ { X86::MOVUPSrr, X86::MOVUPSrm, 0 },
+ { X86::MOVZPQILo2PQIrr, X86::MOVQI2PQIrm, TB_NO_REVERSE },
+ { X86::MOVZX16rr8, X86::MOVZX16rm8, 0 },
+ { X86::MOVZX32rr16, X86::MOVZX32rm16, 0 },
+ { X86::MOVZX32rr8, X86::MOVZX32rm8, 0 },
+ { X86::MOVZX32rr8_NOREX, X86::MOVZX32rm8_NOREX, 0 },
+ { X86::MOVZX64rr16, X86::MOVZX64rm16, 0 },
+ { X86::MOVZX64rr8, X86::MOVZX64rm8, 0 },
+ { X86::PABSBrr, X86::PABSBrm, TB_ALIGN_16 },
+ { X86::PABSDrr, X86::PABSDrm, TB_ALIGN_16 },
+ { X86::PABSWrr, X86::PABSWrm, TB_ALIGN_16 },
+ { X86::PCMPESTRIrr, X86::PCMPESTRIrm, 0 },
+ { X86::PCMPESTRMrr, X86::PCMPESTRMrm, 0 },
+ { X86::PCMPISTRIrr, X86::PCMPISTRIrm, 0 },
+ { X86::PCMPISTRMrr, X86::PCMPISTRMrm, 0 },
+ { X86::PF2IDrr, X86::PF2IDrm, 0 },
+ { X86::PF2IWrr, X86::PF2IWrm, 0 },
+ { X86::PFRCPrr, X86::PFRCPrm, 0 },
+ { X86::PFRSQRTrr, X86::PFRSQRTrm, 0 },
+ { X86::PHMINPOSUWrr, X86::PHMINPOSUWrm, TB_ALIGN_16 },
+ { X86::PI2FDrr, X86::PI2FDrm, 0 },
+ { X86::PI2FWrr, X86::PI2FWrm, 0 },
+ { X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_NO_REVERSE },
+ { X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_NO_REVERSE },
+ { X86::PMOVSXBWrr, X86::PMOVSXBWrm, TB_NO_REVERSE },
+ { X86::PMOVSXDQrr, X86::PMOVSXDQrm, TB_NO_REVERSE },
+ { X86::PMOVSXWDrr, X86::PMOVSXWDrm, TB_NO_REVERSE },
+ { X86::PMOVSXWQrr, X86::PMOVSXWQrm, TB_NO_REVERSE },
+ { X86::PMOVZXBDrr, X86::PMOVZXBDrm, TB_NO_REVERSE },
+ { X86::PMOVZXBQrr, X86::PMOVZXBQrm, TB_NO_REVERSE },
+ { X86::PMOVZXBWrr, X86::PMOVZXBWrm, TB_NO_REVERSE },
+ { X86::PMOVZXDQrr, X86::PMOVZXDQrm, TB_NO_REVERSE },
+ { X86::PMOVZXWDrr, X86::PMOVZXWDrm, TB_NO_REVERSE },
+ { X86::PMOVZXWQrr, X86::PMOVZXWQrm, TB_NO_REVERSE },
+ { X86::POPCNT16rr, X86::POPCNT16rm, 0 },
+ { X86::POPCNT32rr, X86::POPCNT32rm, 0 },
+ { X86::POPCNT64rr, X86::POPCNT64rm, 0 },
+ { X86::PSHUFDri, X86::PSHUFDmi, TB_ALIGN_16 },
+ { X86::PSHUFHWri, X86::PSHUFHWmi, TB_ALIGN_16 },
+ { X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 },
+ { X86::PSWAPDrr, X86::PSWAPDrm, 0 },
+ { X86::PTESTrr, X86::PTESTrm, TB_ALIGN_16 },
+ { X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 },
+ { X86::RCPSSr, X86::RCPSSm, 0 },
+ { X86::RORX32ri, X86::RORX32mi, 0 },
+ { X86::RORX64ri, X86::RORX64mi, 0 },
+ { X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16 },
+ { X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16 },
+ { X86::ROUNDSDr, X86::ROUNDSDm, 0 },
+ { X86::ROUNDSSr, X86::ROUNDSSm, 0 },
+ { X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 },
+ { X86::RSQRTSSr, X86::RSQRTSSm, 0 },
+ { X86::SARX32rr, X86::SARX32rm, 0 },
+ { X86::SARX64rr, X86::SARX64rm, 0 },
+ { X86::SHLX32rr, X86::SHLX32rm, 0 },
+ { X86::SHLX64rr, X86::SHLX64rm, 0 },
+ { X86::SHRX32rr, X86::SHRX32rm, 0 },
+ { X86::SHRX64rr, X86::SHRX64rm, 0 },
+ { X86::SQRTPDr, X86::SQRTPDm, TB_ALIGN_16 },
+ { X86::SQRTPSr, X86::SQRTPSm, TB_ALIGN_16 },
+ { X86::SQRTSDr, X86::SQRTSDm, 0 },
+ { X86::SQRTSSr, X86::SQRTSSm, 0 },
+ { X86::T1MSKC32rr, X86::T1MSKC32rm, 0 },
+ { X86::T1MSKC64rr, X86::T1MSKC64rm, 0 },
+ // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
+ { X86::TZCNT16rr, X86::TZCNT16rm, 0 },
+ { X86::TZCNT32rr, X86::TZCNT32rm, 0 },
+ { X86::TZCNT64rr, X86::TZCNT64rm, 0 },
+ { X86::TZMSK32rr, X86::TZMSK32rm, 0 },
+ { X86::TZMSK64rr, X86::TZMSK64rm, 0 },
+ { X86::UCOMISDrr, X86::UCOMISDrm, 0 },
+ { X86::UCOMISDrr_Int, X86::UCOMISDrm_Int, TB_NO_REVERSE },
+ { X86::UCOMISSrr, X86::UCOMISSrm, 0 },
+ { X86::UCOMISSrr_Int, X86::UCOMISSrm_Int, TB_NO_REVERSE },
+ { X86::VAESIMCrr, X86::VAESIMCrm, 0 },
+ { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, 0 },
+ { X86::VBROADCASTF32X2Z256r, X86::VBROADCASTF32X2Z256m, TB_NO_REVERSE },
+ { X86::VBROADCASTF32X2Zr, X86::VBROADCASTF32X2Zm, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Z128r, X86::VBROADCASTI32X2Z128m, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Z256r, X86::VBROADCASTI32X2Z256m, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Zr, X86::VBROADCASTI32X2Zm, TB_NO_REVERSE },
+ { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE },
+ { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE },
+ { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE },
+ { X86::VCOMISDZrr, X86::VCOMISDZrm, 0 },
+ { X86::VCOMISDZrr_Int, X86::VCOMISDZrm_Int, TB_NO_REVERSE },
+ { X86::VCOMISDrr, X86::VCOMISDrm, 0 },
+ { X86::VCOMISDrr_Int, X86::VCOMISDrm_Int, TB_NO_REVERSE },
+ { X86::VCOMISSZrr, X86::VCOMISSZrm, 0 },
+ { X86::VCOMISSZrr_Int, X86::VCOMISSZrm_Int, TB_NO_REVERSE },
+ { X86::VCOMISSrr, X86::VCOMISSrm, 0 },
+ { X86::VCOMISSrr_Int, X86::VCOMISSrm_Int, TB_NO_REVERSE },
+ { X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, 0 },
+ { X86::VCVTDQ2PDZ128rr, X86::VCVTDQ2PDZ128rm, TB_NO_REVERSE },
+ { X86::VCVTDQ2PDZ256rr, X86::VCVTDQ2PDZ256rm, 0 },
+ { X86::VCVTDQ2PDZrr, X86::VCVTDQ2PDZrm, 0 },
+ { X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, TB_NO_REVERSE },
+ { X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 },
+ { X86::VCVTDQ2PSZ128rr, X86::VCVTDQ2PSZ128rm, 0 },
+ { X86::VCVTDQ2PSZ256rr, X86::VCVTDQ2PSZ256rm, 0 },
+ { X86::VCVTDQ2PSZrr, X86::VCVTDQ2PSZrm, 0 },
+ { X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 },
+ { X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 },
+ { X86::VCVTPD2DQZ128rr, X86::VCVTPD2DQZ128rm, 0 },
+ { X86::VCVTPD2DQZ256rr, X86::VCVTPD2DQZ256rm, 0 },
+ { X86::VCVTPD2DQZrr, X86::VCVTPD2DQZrm, 0 },
+ { X86::VCVTPD2DQrr, X86::VCVTPD2DQrm, 0 },
+ { X86::VCVTPD2PSYrr, X86::VCVTPD2PSYrm, 0 },
+ { X86::VCVTPD2PSZ128rr, X86::VCVTPD2PSZ128rm, 0 },
+ { X86::VCVTPD2PSZ256rr, X86::VCVTPD2PSZ256rm, 0 },
+ { X86::VCVTPD2PSZrr, X86::VCVTPD2PSZrm, 0 },
+ { X86::VCVTPD2PSrr, X86::VCVTPD2PSrm, 0 },
+ { X86::VCVTPD2QQZ128rr, X86::VCVTPD2QQZ128rm, 0 },
+ { X86::VCVTPD2QQZ256rr, X86::VCVTPD2QQZ256rm, 0 },
+ { X86::VCVTPD2QQZrr, X86::VCVTPD2QQZrm, 0 },
+ { X86::VCVTPD2UDQZ128rr, X86::VCVTPD2UDQZ128rm, 0 },
+ { X86::VCVTPD2UDQZ256rr, X86::VCVTPD2UDQZ256rm, 0 },
+ { X86::VCVTPD2UDQZrr, X86::VCVTPD2UDQZrm, 0 },
+ { X86::VCVTPD2UQQZ128rr, X86::VCVTPD2UQQZ128rm, 0 },
+ { X86::VCVTPD2UQQZ256rr, X86::VCVTPD2UQQZ256rm, 0 },
+ { X86::VCVTPD2UQQZrr, X86::VCVTPD2UQQZrm, 0 },
+ { X86::VCVTPH2PSYrr, X86::VCVTPH2PSYrm, 0 },
+ { X86::VCVTPH2PSZ128rr, X86::VCVTPH2PSZ128rm, TB_NO_REVERSE },
+ { X86::VCVTPH2PSZ256rr, X86::VCVTPH2PSZ256rm, 0 },
+ { X86::VCVTPH2PSZrr, X86::VCVTPH2PSZrm, 0 },
+ { X86::VCVTPH2PSrr, X86::VCVTPH2PSrm, TB_NO_REVERSE },
+ { X86::VCVTPS2DQYrr, X86::VCVTPS2DQYrm, 0 },
+ { X86::VCVTPS2DQZ128rr, X86::VCVTPS2DQZ128rm, 0 },
+ { X86::VCVTPS2DQZ256rr, X86::VCVTPS2DQZ256rm, 0 },
+ { X86::VCVTPS2DQZrr, X86::VCVTPS2DQZrm, 0 },
+ { X86::VCVTPS2DQrr, X86::VCVTPS2DQrm, 0 },
+ { X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, 0 },
+ { X86::VCVTPS2PDZ128rr, X86::VCVTPS2PDZ128rm, TB_NO_REVERSE },
+ { X86::VCVTPS2PDZ256rr, X86::VCVTPS2PDZ256rm, 0 },
+ { X86::VCVTPS2PDZrr, X86::VCVTPS2PDZrm, 0 },
+ { X86::VCVTPS2PDrr, X86::VCVTPS2PDrm, TB_NO_REVERSE },
+ { X86::VCVTPS2QQZ128rr, X86::VCVTPS2QQZ128rm, TB_NO_REVERSE },
+ { X86::VCVTPS2QQZ256rr, X86::VCVTPS2QQZ256rm, 0 },
+ { X86::VCVTPS2QQZrr, X86::VCVTPS2QQZrm, 0 },
+ { X86::VCVTPS2UDQZ128rr, X86::VCVTPS2UDQZ128rm, 0 },
+ { X86::VCVTPS2UDQZ256rr, X86::VCVTPS2UDQZ256rm, 0 },
+ { X86::VCVTPS2UDQZrr, X86::VCVTPS2UDQZrm, 0 },
+ { X86::VCVTPS2UQQZ128rr, X86::VCVTPS2UQQZ128rm, TB_NO_REVERSE },
+ { X86::VCVTPS2UQQZ256rr, X86::VCVTPS2UQQZ256rm, 0 },
+ { X86::VCVTPS2UQQZrr, X86::VCVTPS2UQQZrm, 0 },
+ { X86::VCVTQQ2PDZ128rr, X86::VCVTQQ2PDZ128rm, 0 },
+ { X86::VCVTQQ2PDZ256rr, X86::VCVTQQ2PDZ256rm, 0 },
+ { X86::VCVTQQ2PDZrr, X86::VCVTQQ2PDZrm, 0 },
+ { X86::VCVTQQ2PSZ128rr, X86::VCVTQQ2PSZ128rm, 0 },
+ { X86::VCVTQQ2PSZ256rr, X86::VCVTQQ2PSZ256rm, 0 },
+ { X86::VCVTQQ2PSZrr, X86::VCVTQQ2PSZrm, 0 },
+ { X86::VCVTSD2SI64Zrr_Int, X86::VCVTSD2SI64Zrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSD2SI64rr_Int, X86::VCVTSD2SI64rm_Int, TB_NO_REVERSE },
+ { X86::VCVTSD2SIZrr_Int, X86::VCVTSD2SIZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSD2SIrr_Int, X86::VCVTSD2SIrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSD2USI64Zrr_Int, X86::VCVTSD2USI64Zrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSD2USIZrr_Int, X86::VCVTSD2USIZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSS2SI64Zrr_Int, X86::VCVTSS2SI64Zrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSS2SI64rr_Int, X86::VCVTSS2SI64rm_Int, TB_NO_REVERSE },
+ { X86::VCVTSS2SIZrr_Int, X86::VCVTSS2SIZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSS2SIrr_Int, X86::VCVTSS2SIrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSS2USI64Zrr_Int, X86::VCVTSS2USI64Zrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSS2USIZrr_Int, X86::VCVTSS2USIZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTPD2DQYrr, X86::VCVTTPD2DQYrm, 0 },
+ { X86::VCVTTPD2DQZ128rr, X86::VCVTTPD2DQZ128rm, 0 },
+ { X86::VCVTTPD2DQZ256rr, X86::VCVTTPD2DQZ256rm, 0 },
+ { X86::VCVTTPD2DQZrr, X86::VCVTTPD2DQZrm, 0 },
+ { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQrm, 0 },
+ { X86::VCVTTPD2QQZ128rr, X86::VCVTTPD2QQZ128rm, 0 },
+ { X86::VCVTTPD2QQZ256rr, X86::VCVTTPD2QQZ256rm, 0 },
+ { X86::VCVTTPD2QQZrr, X86::VCVTTPD2QQZrm, 0 },
+ { X86::VCVTTPD2UDQZ128rr, X86::VCVTTPD2UDQZ128rm, 0 },
+ { X86::VCVTTPD2UDQZ256rr, X86::VCVTTPD2UDQZ256rm, 0 },
+ { X86::VCVTTPD2UDQZrr, X86::VCVTTPD2UDQZrm, 0 },
+ { X86::VCVTTPD2UQQZ128rr, X86::VCVTTPD2UQQZ128rm, 0 },
+ { X86::VCVTTPD2UQQZ256rr, X86::VCVTTPD2UQQZ256rm, 0 },
+ { X86::VCVTTPD2UQQZrr, X86::VCVTTPD2UQQZrm, 0 },
+ { X86::VCVTTPS2DQYrr, X86::VCVTTPS2DQYrm, 0 },
+ { X86::VCVTTPS2DQZ128rr, X86::VCVTTPS2DQZ128rm, 0 },
+ { X86::VCVTTPS2DQZ256rr, X86::VCVTTPS2DQZ256rm, 0 },
+ { X86::VCVTTPS2DQZrr, X86::VCVTTPS2DQZrm, 0 },
+ { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 },
+ { X86::VCVTTPS2QQZ128rr, X86::VCVTTPS2QQZ128rm, TB_NO_REVERSE },
+ { X86::VCVTTPS2QQZ256rr, X86::VCVTTPS2QQZ256rm, 0 },
+ { X86::VCVTTPS2QQZrr, X86::VCVTTPS2QQZrm, 0 },
+ { X86::VCVTTPS2UDQZ128rr, X86::VCVTTPS2UDQZ128rm, 0 },
+ { X86::VCVTTPS2UDQZ256rr, X86::VCVTTPS2UDQZ256rm, 0 },
+ { X86::VCVTTPS2UDQZrr, X86::VCVTTPS2UDQZrm, 0 },
+ { X86::VCVTTPS2UQQZ128rr, X86::VCVTTPS2UQQZ128rm, TB_NO_REVERSE },
+ { X86::VCVTTPS2UQQZ256rr, X86::VCVTTPS2UQQZ256rm, 0 },
+ { X86::VCVTTPS2UQQZrr, X86::VCVTTPS2UQQZrm, 0 },
+ { X86::VCVTTSD2SI64Zrr, X86::VCVTTSD2SI64Zrm, 0 },
+ { X86::VCVTTSD2SI64Zrr_Int, X86::VCVTTSD2SI64Zrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 },
+ { X86::VCVTTSD2SI64rr_Int, X86::VCVTTSD2SI64rm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSD2SIZrr, X86::VCVTTSD2SIZrm, 0 },
+ { X86::VCVTTSD2SIZrr_Int, X86::VCVTTSD2SIZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 },
+ { X86::VCVTTSD2SIrr_Int, X86::VCVTTSD2SIrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSD2USI64Zrr, X86::VCVTTSD2USI64Zrm, 0 },
+ { X86::VCVTTSD2USI64Zrr_Int, X86::VCVTTSD2USI64Zrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSD2USIZrr, X86::VCVTTSD2USIZrm, 0 },
+ { X86::VCVTTSD2USIZrr_Int, X86::VCVTTSD2USIZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSS2SI64Zrr, X86::VCVTTSS2SI64Zrm, 0 },
+ { X86::VCVTTSS2SI64Zrr_Int, X86::VCVTTSS2SI64Zrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 },
+ { X86::VCVTTSS2SI64rr_Int, X86::VCVTTSS2SI64rm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSS2SIZrr, X86::VCVTTSS2SIZrm, 0 },
+ { X86::VCVTTSS2SIZrr_Int, X86::VCVTTSS2SIZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 },
+ { X86::VCVTTSS2SIrr_Int, X86::VCVTTSS2SIrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSS2USI64Zrr, X86::VCVTTSS2USI64Zrm, 0 },
+ { X86::VCVTTSS2USI64Zrr_Int, X86::VCVTTSS2USI64Zrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSS2USIZrr, X86::VCVTTSS2USIZrm, 0 },
+ { X86::VCVTTSS2USIZrr_Int, X86::VCVTTSS2USIZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTUDQ2PDZ128rr, X86::VCVTUDQ2PDZ128rm, TB_NO_REVERSE },
+ { X86::VCVTUDQ2PDZ256rr, X86::VCVTUDQ2PDZ256rm, 0 },
+ { X86::VCVTUDQ2PDZrr, X86::VCVTUDQ2PDZrm, 0 },
+ { X86::VCVTUDQ2PSZ128rr, X86::VCVTUDQ2PSZ128rm, 0 },
+ { X86::VCVTUDQ2PSZ256rr, X86::VCVTUDQ2PSZ256rm, 0 },
+ { X86::VCVTUDQ2PSZrr, X86::VCVTUDQ2PSZrm, 0 },
+ { X86::VCVTUQQ2PDZ128rr, X86::VCVTUQQ2PDZ128rm, 0 },
+ { X86::VCVTUQQ2PDZ256rr, X86::VCVTUQQ2PDZ256rm, 0 },
+ { X86::VCVTUQQ2PDZrr, X86::VCVTUQQ2PDZrm, 0 },
+ { X86::VCVTUQQ2PSZ128rr, X86::VCVTUQQ2PSZ128rm, 0 },
+ { X86::VCVTUQQ2PSZ256rr, X86::VCVTUQQ2PSZ256rm, 0 },
+ { X86::VCVTUQQ2PSZrr, X86::VCVTUQQ2PSZrm, 0 },
+ { X86::VEXP2PDZr, X86::VEXP2PDZm, 0 },
+ { X86::VEXP2PSZr, X86::VEXP2PSZm, 0 },
+ { X86::VEXPANDPDZ128rr, X86::VEXPANDPDZ128rm, TB_NO_REVERSE },
+ { X86::VEXPANDPDZ256rr, X86::VEXPANDPDZ256rm, TB_NO_REVERSE },
+ { X86::VEXPANDPDZrr, X86::VEXPANDPDZrm, TB_NO_REVERSE },
+ { X86::VEXPANDPSZ128rr, X86::VEXPANDPSZ128rm, TB_NO_REVERSE },
+ { X86::VEXPANDPSZ256rr, X86::VEXPANDPSZ256rm, TB_NO_REVERSE },
+ { X86::VEXPANDPSZrr, X86::VEXPANDPSZrm, TB_NO_REVERSE },
+ { X86::VFPCLASSPDZ128rr, X86::VFPCLASSPDZ128rm, 0 },
+ { X86::VFPCLASSPDZ256rr, X86::VFPCLASSPDZ256rm, 0 },
+ { X86::VFPCLASSPDZrr, X86::VFPCLASSPDZrm, 0 },
+ { X86::VFPCLASSPSZ128rr, X86::VFPCLASSPSZ128rm, 0 },
+ { X86::VFPCLASSPSZ256rr, X86::VFPCLASSPSZ256rm, 0 },
+ { X86::VFPCLASSPSZrr, X86::VFPCLASSPSZrm, 0 },
+ { X86::VFPCLASSSDZrr, X86::VFPCLASSSDZrm, TB_NO_REVERSE },
+ { X86::VFPCLASSSSZrr, X86::VFPCLASSSSZrm, TB_NO_REVERSE },
+ { X86::VFRCZPDYrr, X86::VFRCZPDYrm, 0 },
+ { X86::VFRCZPDrr, X86::VFRCZPDrm, 0 },
+ { X86::VFRCZPSYrr, X86::VFRCZPSYrm, 0 },
+ { X86::VFRCZPSrr, X86::VFRCZPSrm, 0 },
+ { X86::VFRCZSDrr, X86::VFRCZSDrm, TB_NO_REVERSE },
+ { X86::VFRCZSSrr, X86::VFRCZSSrm, TB_NO_REVERSE },
+ { X86::VGETEXPPDZ128r, X86::VGETEXPPDZ128m, 0 },
+ { X86::VGETEXPPDZ256r, X86::VGETEXPPDZ256m, 0 },
+ { X86::VGETEXPPDZr, X86::VGETEXPPDZm, 0 },
+ { X86::VGETEXPPSZ128r, X86::VGETEXPPSZ128m, 0 },
+ { X86::VGETEXPPSZ256r, X86::VGETEXPPSZ256m, 0 },
+ { X86::VGETEXPPSZr, X86::VGETEXPPSZm, 0 },
+ { X86::VGETMANTPDZ128rri, X86::VGETMANTPDZ128rmi, 0 },
+ { X86::VGETMANTPDZ256rri, X86::VGETMANTPDZ256rmi, 0 },
+ { X86::VGETMANTPDZrri, X86::VGETMANTPDZrmi, 0 },
+ { X86::VGETMANTPSZ128rri, X86::VGETMANTPSZ128rmi, 0 },
+ { X86::VGETMANTPSZ256rri, X86::VGETMANTPSZ256rmi, 0 },
+ { X86::VGETMANTPSZrri, X86::VGETMANTPSZrmi, 0 },
+ { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 },
+ { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 },
+ { X86::VMOV64toSDZrr, X86::VMOV64toSDZrm, 0 },
+ { X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 },
+ { X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 },
+ { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 },
+ { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 },
+ { X86::VMOVAPDZrr, X86::VMOVAPDZrm, TB_ALIGN_64 },
+ { X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 },
+ { X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 },
+ { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 },
+ { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 },
+ { X86::VMOVAPSZrr, X86::VMOVAPSZrm, TB_ALIGN_64 },
+ { X86::VMOVAPSrr, X86::VMOVAPSrm, TB_ALIGN_16 },
+ { X86::VMOVDDUPYrr, X86::VMOVDDUPYrm, 0 },
+ { X86::VMOVDDUPZ128rr, X86::VMOVDDUPZ128rm, TB_NO_REVERSE },
+ { X86::VMOVDDUPZ256rr, X86::VMOVDDUPZ256rm, 0 },
+ { X86::VMOVDDUPZrr, X86::VMOVDDUPZrm, 0 },
+ { X86::VMOVDDUPrr, X86::VMOVDDUPrm, TB_NO_REVERSE },
+ { X86::VMOVDI2PDIZrr, X86::VMOVDI2PDIZrm, 0 },
+ { X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 },
+ { X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 },
+ { X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 },
+ { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 },
+ { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 },
+ { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zrm, TB_ALIGN_64 },
+ { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128rm, TB_ALIGN_16 },
+ { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256rm, TB_ALIGN_32 },
+ { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zrm, TB_ALIGN_64 },
+ { X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_32 },
+ { X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 },
+ { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128rm, 0 },
+ { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256rm, 0 },
+ { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zrm, 0 },
+ { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128rm, 0 },
+ { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256rm, 0 },
+ { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zrm, 0 },
+ { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 },
+ { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 },
+ { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zrm, 0 },
+ { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128rm, 0 },
+ { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256rm, 0 },
+ { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zrm, 0 },
+ { X86::VMOVDQUYrr, X86::VMOVDQUYrm, 0 },
+ { X86::VMOVDQUrr, X86::VMOVDQUrm, 0 },
+ { X86::VMOVSHDUPYrr, X86::VMOVSHDUPYrm, 0 },
+ { X86::VMOVSHDUPZ128rr, X86::VMOVSHDUPZ128rm, 0 },
+ { X86::VMOVSHDUPZ256rr, X86::VMOVSHDUPZ256rm, 0 },
+ { X86::VMOVSHDUPZrr, X86::VMOVSHDUPZrm, 0 },
+ { X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, 0 },
+ { X86::VMOVSLDUPYrr, X86::VMOVSLDUPYrm, 0 },
+ { X86::VMOVSLDUPZ128rr, X86::VMOVSLDUPZ128rm, 0 },
+ { X86::VMOVSLDUPZ256rr, X86::VMOVSLDUPZ256rm, 0 },
+ { X86::VMOVSLDUPZrr, X86::VMOVSLDUPZrm, 0 },
+ { X86::VMOVSLDUPrr, X86::VMOVSLDUPrm, 0 },
+ { X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 },
+ { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 },
+ { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 },
+ { X86::VMOVUPDZrr, X86::VMOVUPDZrm, 0 },
+ { X86::VMOVUPDrr, X86::VMOVUPDrm, 0 },
+ { X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 },
+ { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 },
+ { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 },
+ { X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 },
+ { X86::VMOVUPSrr, X86::VMOVUPSrm, 0 },
+ { X86::VMOVZPQILo2PQIZrr, X86::VMOVQI2PQIZrm, TB_NO_REVERSE },
+ { X86::VMOVZPQILo2PQIrr, X86::VMOVQI2PQIrm, TB_NO_REVERSE },
+ { X86::VPABSBYrr, X86::VPABSBYrm, 0 },
+ { X86::VPABSBZ128rr, X86::VPABSBZ128rm, 0 },
+ { X86::VPABSBZ256rr, X86::VPABSBZ256rm, 0 },
+ { X86::VPABSBZrr, X86::VPABSBZrm, 0 },
+ { X86::VPABSBrr, X86::VPABSBrm, 0 },
+ { X86::VPABSDYrr, X86::VPABSDYrm, 0 },
+ { X86::VPABSDZ128rr, X86::VPABSDZ128rm, 0 },
+ { X86::VPABSDZ256rr, X86::VPABSDZ256rm, 0 },
+ { X86::VPABSDZrr, X86::VPABSDZrm, 0 },
+ { X86::VPABSDrr, X86::VPABSDrm, 0 },
+ { X86::VPABSQZ128rr, X86::VPABSQZ128rm, 0 },
+ { X86::VPABSQZ256rr, X86::VPABSQZ256rm, 0 },
+ { X86::VPABSQZrr, X86::VPABSQZrm, 0 },
+ { X86::VPABSWYrr, X86::VPABSWYrm, 0 },
+ { X86::VPABSWZ128rr, X86::VPABSWZ128rm, 0 },
+ { X86::VPABSWZ256rr, X86::VPABSWZ256rm, 0 },
+ { X86::VPABSWZrr, X86::VPABSWZrm, 0 },
+ { X86::VPABSWrr, X86::VPABSWrm, 0 },
+ { X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZ128r, X86::VPBROADCASTBZ128m, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZ256r, X86::VPBROADCASTBZ256m, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZr, X86::VPBROADCASTBZm, TB_NO_REVERSE },
+ { X86::VPBROADCASTBrr, X86::VPBROADCASTBrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZ128r, X86::VPBROADCASTDZ128m, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZ256r, X86::VPBROADCASTDZ256m, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZr, X86::VPBROADCASTDZm, TB_NO_REVERSE },
+ { X86::VPBROADCASTDrr, X86::VPBROADCASTDrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZ128r, X86::VPBROADCASTQZ128m, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZ256r, X86::VPBROADCASTQZ256m, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZr, X86::VPBROADCASTQZm, TB_NO_REVERSE },
+ { X86::VPBROADCASTQrr, X86::VPBROADCASTQrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZ128r, X86::VPBROADCASTWZ128m, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZ256r, X86::VPBROADCASTWZ256m, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZr, X86::VPBROADCASTWZm, TB_NO_REVERSE },
+ { X86::VPBROADCASTWrr, X86::VPBROADCASTWrm, TB_NO_REVERSE },
+ { X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0 },
+ { X86::VPCMPESTRMrr, X86::VPCMPESTRMrm, 0 },
+ { X86::VPCMPISTRIrr, X86::VPCMPISTRIrm, 0 },
+ { X86::VPCMPISTRMrr, X86::VPCMPISTRMrm, 0 },
+ { X86::VPCONFLICTDZ128rr, X86::VPCONFLICTDZ128rm, 0 },
+ { X86::VPCONFLICTDZ256rr, X86::VPCONFLICTDZ256rm, 0 },
+ { X86::VPCONFLICTDZrr, X86::VPCONFLICTDZrm, 0 },
+ { X86::VPCONFLICTQZ128rr, X86::VPCONFLICTQZ128rm, 0 },
+ { X86::VPCONFLICTQZ256rr, X86::VPCONFLICTQZ256rm, 0 },
+ { X86::VPCONFLICTQZrr, X86::VPCONFLICTQZrm, 0 },
+ { X86::VPERMILPDYri, X86::VPERMILPDYmi, 0 },
+ { X86::VPERMILPDZ128ri, X86::VPERMILPDZ128mi, 0 },
+ { X86::VPERMILPDZ256ri, X86::VPERMILPDZ256mi, 0 },
+ { X86::VPERMILPDZri, X86::VPERMILPDZmi, 0 },
+ { X86::VPERMILPDri, X86::VPERMILPDmi, 0 },
+ { X86::VPERMILPSYri, X86::VPERMILPSYmi, 0 },
+ { X86::VPERMILPSZ128ri, X86::VPERMILPSZ128mi, 0 },
+ { X86::VPERMILPSZ256ri, X86::VPERMILPSZ256mi, 0 },
+ { X86::VPERMILPSZri, X86::VPERMILPSZmi, 0 },
+ { X86::VPERMILPSri, X86::VPERMILPSmi, 0 },
+ { X86::VPERMPDYri, X86::VPERMPDYmi, 0 },
+ { X86::VPERMPDZ256ri, X86::VPERMPDZ256mi, 0 },
+ { X86::VPERMPDZri, X86::VPERMPDZmi, 0 },
+ { X86::VPERMQYri, X86::VPERMQYmi, 0 },
+ { X86::VPERMQZ256ri, X86::VPERMQZ256mi, 0 },
+ { X86::VPERMQZri, X86::VPERMQZmi, 0 },
+ { X86::VPEXPANDBZ128rr, X86::VPEXPANDBZ128rm, TB_NO_REVERSE },
+ { X86::VPEXPANDBZ256rr, X86::VPEXPANDBZ256rm, TB_NO_REVERSE },
+ { X86::VPEXPANDBZrr, X86::VPEXPANDBZrm, TB_NO_REVERSE },
+ { X86::VPEXPANDDZ128rr, X86::VPEXPANDDZ128rm, TB_NO_REVERSE },
+ { X86::VPEXPANDDZ256rr, X86::VPEXPANDDZ256rm, TB_NO_REVERSE },
+ { X86::VPEXPANDDZrr, X86::VPEXPANDDZrm, TB_NO_REVERSE },
+ { X86::VPEXPANDQZ128rr, X86::VPEXPANDQZ128rm, TB_NO_REVERSE },
+ { X86::VPEXPANDQZ256rr, X86::VPEXPANDQZ256rm, TB_NO_REVERSE },
+ { X86::VPEXPANDQZrr, X86::VPEXPANDQZrm, TB_NO_REVERSE },
+ { X86::VPEXPANDWZ128rr, X86::VPEXPANDWZ128rm, TB_NO_REVERSE },
+ { X86::VPEXPANDWZ256rr, X86::VPEXPANDWZ256rm, TB_NO_REVERSE },
+ { X86::VPEXPANDWZrr, X86::VPEXPANDWZrm, TB_NO_REVERSE },
+ { X86::VPHADDBDrr, X86::VPHADDBDrm, 0 },
+ { X86::VPHADDBQrr, X86::VPHADDBQrm, 0 },
+ { X86::VPHADDBWrr, X86::VPHADDBWrm, 0 },
+ { X86::VPHADDDQrr, X86::VPHADDDQrm, 0 },
+ { X86::VPHADDUBDrr, X86::VPHADDUBDrm, 0 },
+ { X86::VPHADDUBQrr, X86::VPHADDUBQrm, 0 },
+ { X86::VPHADDUBWrr, X86::VPHADDUBWrm, 0 },
+ { X86::VPHADDUDQrr, X86::VPHADDUDQrm, 0 },
+ { X86::VPHADDUWDrr, X86::VPHADDUWDrm, 0 },
+ { X86::VPHADDUWQrr, X86::VPHADDUWQrm, 0 },
+ { X86::VPHADDWDrr, X86::VPHADDWDrm, 0 },
+ { X86::VPHADDWQrr, X86::VPHADDWQrm, 0 },
+ { X86::VPHMINPOSUWrr, X86::VPHMINPOSUWrm, 0 },
+ { X86::VPHSUBBWrr, X86::VPHSUBBWrm, 0 },
+ { X86::VPHSUBDQrr, X86::VPHSUBDQrm, 0 },
+ { X86::VPHSUBWDrr, X86::VPHSUBWDrm, 0 },
+ { X86::VPLZCNTDZ128rr, X86::VPLZCNTDZ128rm, 0 },
+ { X86::VPLZCNTDZ256rr, X86::VPLZCNTDZ256rm, 0 },
+ { X86::VPLZCNTDZrr, X86::VPLZCNTDZrm, 0 },
+ { X86::VPLZCNTQZ128rr, X86::VPLZCNTQZ128rm, 0 },
+ { X86::VPLZCNTQZ256rr, X86::VPLZCNTQZ256rm, 0 },
+ { X86::VPLZCNTQZrr, X86::VPLZCNTQZrm, 0 },
+ { X86::VPMOVSXBDYrr, X86::VPMOVSXBDYrm, TB_NO_REVERSE },
+ { X86::VPMOVSXBDZ128rr, X86::VPMOVSXBDZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXBDZ256rr, X86::VPMOVSXBDZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVSXBDZrr, X86::VPMOVSXBDZrm, 0 },
+ { X86::VPMOVSXBDrr, X86::VPMOVSXBDrm, TB_NO_REVERSE },
+ { X86::VPMOVSXBQYrr, X86::VPMOVSXBQYrm, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZ128rr, X86::VPMOVSXBQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZ256rr, X86::VPMOVSXBQZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZrr, X86::VPMOVSXBQZrm, TB_NO_REVERSE },
+ { X86::VPMOVSXBQrr, X86::VPMOVSXBQrm, TB_NO_REVERSE },
+ { X86::VPMOVSXBWYrr, X86::VPMOVSXBWYrm, 0 },
+ { X86::VPMOVSXBWZ128rr, X86::VPMOVSXBWZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ256rr, X86::VPMOVSXBWZ256rm, 0 },
+ { X86::VPMOVSXBWZrr, X86::VPMOVSXBWZrm, 0 },
+ { X86::VPMOVSXBWrr, X86::VPMOVSXBWrm, TB_NO_REVERSE },
+ { X86::VPMOVSXDQYrr, X86::VPMOVSXDQYrm, 0 },
+ { X86::VPMOVSXDQZ128rr, X86::VPMOVSXDQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXDQZ256rr, X86::VPMOVSXDQZ256rm, 0 },
+ { X86::VPMOVSXDQZrr, X86::VPMOVSXDQZrm, 0 },
+ { X86::VPMOVSXDQrr, X86::VPMOVSXDQrm, TB_NO_REVERSE },
+ { X86::VPMOVSXWDYrr, X86::VPMOVSXWDYrm, 0 },
+ { X86::VPMOVSXWDZ128rr, X86::VPMOVSXWDZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXWDZ256rr, X86::VPMOVSXWDZ256rm, 0 },
+ { X86::VPMOVSXWDZrr, X86::VPMOVSXWDZrm, 0 },
+ { X86::VPMOVSXWDrr, X86::VPMOVSXWDrm, TB_NO_REVERSE },
+ { X86::VPMOVSXWQYrr, X86::VPMOVSXWQYrm, TB_NO_REVERSE },
+ { X86::VPMOVSXWQZ128rr, X86::VPMOVSXWQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXWQZ256rr, X86::VPMOVSXWQZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVSXWQZrr, X86::VPMOVSXWQZrm, 0 },
+ { X86::VPMOVSXWQrr, X86::VPMOVSXWQrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBDYrr, X86::VPMOVZXBDYrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZ128rr, X86::VPMOVZXBDZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZ256rr, X86::VPMOVZXBDZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZrr, X86::VPMOVZXBDZrm, 0 },
+ { X86::VPMOVZXBDrr, X86::VPMOVZXBDrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBQYrr, X86::VPMOVZXBQYrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZ128rr, X86::VPMOVZXBQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZ256rr, X86::VPMOVZXBQZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZrr, X86::VPMOVZXBQZrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBQrr, X86::VPMOVZXBQrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBWYrr, X86::VPMOVZXBWYrm, 0 },
+ { X86::VPMOVZXBWZ128rr, X86::VPMOVZXBWZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ256rr, X86::VPMOVZXBWZ256rm, 0 },
+ { X86::VPMOVZXBWZrr, X86::VPMOVZXBWZrm, 0 },
+ { X86::VPMOVZXBWrr, X86::VPMOVZXBWrm, TB_NO_REVERSE },
+ { X86::VPMOVZXDQYrr, X86::VPMOVZXDQYrm, 0 },
+ { X86::VPMOVZXDQZ128rr, X86::VPMOVZXDQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXDQZ256rr, X86::VPMOVZXDQZ256rm, 0 },
+ { X86::VPMOVZXDQZrr, X86::VPMOVZXDQZrm, 0 },
+ { X86::VPMOVZXDQrr, X86::VPMOVZXDQrm, TB_NO_REVERSE },
+ { X86::VPMOVZXWDYrr, X86::VPMOVZXWDYrm, 0 },
+ { X86::VPMOVZXWDZ128rr, X86::VPMOVZXWDZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXWDZ256rr, X86::VPMOVZXWDZ256rm, 0 },
+ { X86::VPMOVZXWDZrr, X86::VPMOVZXWDZrm, 0 },
+ { X86::VPMOVZXWDrr, X86::VPMOVZXWDrm, TB_NO_REVERSE },
+ { X86::VPMOVZXWQYrr, X86::VPMOVZXWQYrm, TB_NO_REVERSE },
+ { X86::VPMOVZXWQZ128rr, X86::VPMOVZXWQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXWQZ256rr, X86::VPMOVZXWQZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVZXWQZrr, X86::VPMOVZXWQZrm, 0 },
+ { X86::VPMOVZXWQrr, X86::VPMOVZXWQrm, TB_NO_REVERSE },
+ { X86::VPOPCNTBZ128rr, X86::VPOPCNTBZ128rm, 0 },
+ { X86::VPOPCNTBZ256rr, X86::VPOPCNTBZ256rm, 0 },
+ { X86::VPOPCNTBZrr, X86::VPOPCNTBZrm, 0 },
+ { X86::VPOPCNTDZ128rr, X86::VPOPCNTDZ128rm, 0 },
+ { X86::VPOPCNTDZ256rr, X86::VPOPCNTDZ256rm, 0 },
+ { X86::VPOPCNTDZrr, X86::VPOPCNTDZrm, 0 },
+ { X86::VPOPCNTQZ128rr, X86::VPOPCNTQZ128rm, 0 },
+ { X86::VPOPCNTQZ256rr, X86::VPOPCNTQZ256rm, 0 },
+ { X86::VPOPCNTQZrr, X86::VPOPCNTQZrm, 0 },
+ { X86::VPOPCNTWZ128rr, X86::VPOPCNTWZ128rm, 0 },
+ { X86::VPOPCNTWZ256rr, X86::VPOPCNTWZ256rm, 0 },
+ { X86::VPOPCNTWZrr, X86::VPOPCNTWZrm, 0 },
+ { X86::VPROLDZ128ri, X86::VPROLDZ128mi, 0 },
+ { X86::VPROLDZ256ri, X86::VPROLDZ256mi, 0 },
+ { X86::VPROLDZri, X86::VPROLDZmi, 0 },
+ { X86::VPROLQZ128ri, X86::VPROLQZ128mi, 0 },
+ { X86::VPROLQZ256ri, X86::VPROLQZ256mi, 0 },
+ { X86::VPROLQZri, X86::VPROLQZmi, 0 },
+ { X86::VPRORDZ128ri, X86::VPRORDZ128mi, 0 },
+ { X86::VPRORDZ256ri, X86::VPRORDZ256mi, 0 },
+ { X86::VPRORDZri, X86::VPRORDZmi, 0 },
+ { X86::VPRORQZ128ri, X86::VPRORQZ128mi, 0 },
+ { X86::VPRORQZ256ri, X86::VPRORQZ256mi, 0 },
+ { X86::VPRORQZri, X86::VPRORQZmi, 0 },
+ { X86::VPROTBri, X86::VPROTBmi, 0 },
+ { X86::VPROTBrr, X86::VPROTBmr, 0 },
+ { X86::VPROTDri, X86::VPROTDmi, 0 },
+ { X86::VPROTDrr, X86::VPROTDmr, 0 },
+ { X86::VPROTQri, X86::VPROTQmi, 0 },
+ { X86::VPROTQrr, X86::VPROTQmr, 0 },
+ { X86::VPROTWri, X86::VPROTWmi, 0 },
+ { X86::VPROTWrr, X86::VPROTWmr, 0 },
+ { X86::VPSHABrr, X86::VPSHABmr, 0 },
+ { X86::VPSHADrr, X86::VPSHADmr, 0 },
+ { X86::VPSHAQrr, X86::VPSHAQmr, 0 },
+ { X86::VPSHAWrr, X86::VPSHAWmr, 0 },
+ { X86::VPSHLBrr, X86::VPSHLBmr, 0 },
+ { X86::VPSHLDrr, X86::VPSHLDmr, 0 },
+ { X86::VPSHLQrr, X86::VPSHLQmr, 0 },
+ { X86::VPSHLWrr, X86::VPSHLWmr, 0 },
+ { X86::VPSHUFDYri, X86::VPSHUFDYmi, 0 },
+ { X86::VPSHUFDZ128ri, X86::VPSHUFDZ128mi, 0 },
+ { X86::VPSHUFDZ256ri, X86::VPSHUFDZ256mi, 0 },
+ { X86::VPSHUFDZri, X86::VPSHUFDZmi, 0 },
+ { X86::VPSHUFDri, X86::VPSHUFDmi, 0 },
+ { X86::VPSHUFHWYri, X86::VPSHUFHWYmi, 0 },
+ { X86::VPSHUFHWZ128ri, X86::VPSHUFHWZ128mi, 0 },
+ { X86::VPSHUFHWZ256ri, X86::VPSHUFHWZ256mi, 0 },
+ { X86::VPSHUFHWZri, X86::VPSHUFHWZmi, 0 },
+ { X86::VPSHUFHWri, X86::VPSHUFHWmi, 0 },
+ { X86::VPSHUFLWYri, X86::VPSHUFLWYmi, 0 },
+ { X86::VPSHUFLWZ128ri, X86::VPSHUFLWZ128mi, 0 },
+ { X86::VPSHUFLWZ256ri, X86::VPSHUFLWZ256mi, 0 },
+ { X86::VPSHUFLWZri, X86::VPSHUFLWZmi, 0 },
+ { X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 },
+ { X86::VPSLLDQZ128rr, X86::VPSLLDQZ128rm, 0 },
+ { X86::VPSLLDQZ256rr, X86::VPSLLDQZ256rm, 0 },
+ { X86::VPSLLDQZrr, X86::VPSLLDQZrm, 0 },
+ { X86::VPSLLDZ128ri, X86::VPSLLDZ128mi, 0 },
+ { X86::VPSLLDZ256ri, X86::VPSLLDZ256mi, 0 },
+ { X86::VPSLLDZri, X86::VPSLLDZmi, 0 },
+ { X86::VPSLLQZ128ri, X86::VPSLLQZ128mi, 0 },
+ { X86::VPSLLQZ256ri, X86::VPSLLQZ256mi, 0 },
+ { X86::VPSLLQZri, X86::VPSLLQZmi, 0 },
+ { X86::VPSLLWZ128ri, X86::VPSLLWZ128mi, 0 },
+ { X86::VPSLLWZ256ri, X86::VPSLLWZ256mi, 0 },
+ { X86::VPSLLWZri, X86::VPSLLWZmi, 0 },
+ { X86::VPSRADZ128ri, X86::VPSRADZ128mi, 0 },
+ { X86::VPSRADZ256ri, X86::VPSRADZ256mi, 0 },
+ { X86::VPSRADZri, X86::VPSRADZmi, 0 },
+ { X86::VPSRAQZ128ri, X86::VPSRAQZ128mi, 0 },
+ { X86::VPSRAQZ256ri, X86::VPSRAQZ256mi, 0 },
+ { X86::VPSRAQZri, X86::VPSRAQZmi, 0 },
+ { X86::VPSRAWZ128ri, X86::VPSRAWZ128mi, 0 },
+ { X86::VPSRAWZ256ri, X86::VPSRAWZ256mi, 0 },
+ { X86::VPSRAWZri, X86::VPSRAWZmi, 0 },
+ { X86::VPSRLDQZ128rr, X86::VPSRLDQZ128rm, 0 },
+ { X86::VPSRLDQZ256rr, X86::VPSRLDQZ256rm, 0 },
+ { X86::VPSRLDQZrr, X86::VPSRLDQZrm, 0 },
+ { X86::VPSRLDZ128ri, X86::VPSRLDZ128mi, 0 },
+ { X86::VPSRLDZ256ri, X86::VPSRLDZ256mi, 0 },
+ { X86::VPSRLDZri, X86::VPSRLDZmi, 0 },
+ { X86::VPSRLQZ128ri, X86::VPSRLQZ128mi, 0 },
+ { X86::VPSRLQZ256ri, X86::VPSRLQZ256mi, 0 },
+ { X86::VPSRLQZri, X86::VPSRLQZmi, 0 },
+ { X86::VPSRLWZ128ri, X86::VPSRLWZ128mi, 0 },
+ { X86::VPSRLWZ256ri, X86::VPSRLWZ256mi, 0 },
+ { X86::VPSRLWZri, X86::VPSRLWZmi, 0 },
+ { X86::VPTESTYrr, X86::VPTESTYrm, 0 },
+ { X86::VPTESTrr, X86::VPTESTrm, 0 },
+ { X86::VRCP14PDZ128r, X86::VRCP14PDZ128m, 0 },
+ { X86::VRCP14PDZ256r, X86::VRCP14PDZ256m, 0 },
+ { X86::VRCP14PDZr, X86::VRCP14PDZm, 0 },
+ { X86::VRCP14PSZ128r, X86::VRCP14PSZ128m, 0 },
+ { X86::VRCP14PSZ256r, X86::VRCP14PSZ256m, 0 },
+ { X86::VRCP14PSZr, X86::VRCP14PSZm, 0 },
+ { X86::VRCP28PDZr, X86::VRCP28PDZm, 0 },
+ { X86::VRCP28PSZr, X86::VRCP28PSZm, 0 },
+ { X86::VRCPPSYr, X86::VRCPPSYm, 0 },
+ { X86::VRCPPSr, X86::VRCPPSm, 0 },
+ { X86::VREDUCEPDZ128rri, X86::VREDUCEPDZ128rmi, 0 },
+ { X86::VREDUCEPDZ256rri, X86::VREDUCEPDZ256rmi, 0 },
+ { X86::VREDUCEPDZrri, X86::VREDUCEPDZrmi, 0 },
+ { X86::VREDUCEPSZ128rri, X86::VREDUCEPSZ128rmi, 0 },
+ { X86::VREDUCEPSZ256rri, X86::VREDUCEPSZ256rmi, 0 },
+ { X86::VREDUCEPSZrri, X86::VREDUCEPSZrmi, 0 },
+ { X86::VRNDSCALEPDZ128rri, X86::VRNDSCALEPDZ128rmi, 0 },
+ { X86::VRNDSCALEPDZ256rri, X86::VRNDSCALEPDZ256rmi, 0 },
+ { X86::VRNDSCALEPDZrri, X86::VRNDSCALEPDZrmi, 0 },
+ { X86::VRNDSCALEPSZ128rri, X86::VRNDSCALEPSZ128rmi, 0 },
+ { X86::VRNDSCALEPSZ256rri, X86::VRNDSCALEPSZ256rmi, 0 },
+ { X86::VRNDSCALEPSZrri, X86::VRNDSCALEPSZrmi, 0 },
+ { X86::VROUNDPDYr, X86::VROUNDPDYm, 0 },
+ { X86::VROUNDPDr, X86::VROUNDPDm, 0 },
+ { X86::VROUNDPSYr, X86::VROUNDPSYm, 0 },
+ { X86::VROUNDPSr, X86::VROUNDPSm, 0 },
+ { X86::VRSQRT14PDZ128r, X86::VRSQRT14PDZ128m, 0 },
+ { X86::VRSQRT14PDZ256r, X86::VRSQRT14PDZ256m, 0 },
+ { X86::VRSQRT14PDZr, X86::VRSQRT14PDZm, 0 },
+ { X86::VRSQRT14PSZ128r, X86::VRSQRT14PSZ128m, 0 },
+ { X86::VRSQRT14PSZ256r, X86::VRSQRT14PSZ256m, 0 },
+ { X86::VRSQRT14PSZr, X86::VRSQRT14PSZm, 0 },
+ { X86::VRSQRT28PDZr, X86::VRSQRT28PDZm, 0 },
+ { X86::VRSQRT28PSZr, X86::VRSQRT28PSZm, 0 },
+ { X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 },
+ { X86::VRSQRTPSr, X86::VRSQRTPSm, 0 },
+ { X86::VSQRTPDYr, X86::VSQRTPDYm, 0 },
+ { X86::VSQRTPDZ128r, X86::VSQRTPDZ128m, 0 },
+ { X86::VSQRTPDZ256r, X86::VSQRTPDZ256m, 0 },
+ { X86::VSQRTPDZr, X86::VSQRTPDZm, 0 },
+ { X86::VSQRTPDr, X86::VSQRTPDm, 0 },
+ { X86::VSQRTPSYr, X86::VSQRTPSYm, 0 },
+ { X86::VSQRTPSZ128r, X86::VSQRTPSZ128m, 0 },
+ { X86::VSQRTPSZ256r, X86::VSQRTPSZ256m, 0 },
+ { X86::VSQRTPSZr, X86::VSQRTPSZm, 0 },
+ { X86::VSQRTPSr, X86::VSQRTPSm, 0 },
+ { X86::VTESTPDYrr, X86::VTESTPDYrm, 0 },
+ { X86::VTESTPDrr, X86::VTESTPDrm, 0 },
+ { X86::VTESTPSYrr, X86::VTESTPSYrm, 0 },
+ { X86::VTESTPSrr, X86::VTESTPSrm, 0 },
+ { X86::VUCOMISDZrr, X86::VUCOMISDZrm, 0 },
+ { X86::VUCOMISDZrr_Int, X86::VUCOMISDZrm_Int, TB_NO_REVERSE },
+ { X86::VUCOMISDrr, X86::VUCOMISDrm, 0 },
+ { X86::VUCOMISDrr_Int, X86::VUCOMISDrm_Int, TB_NO_REVERSE },
+ { X86::VUCOMISSZrr, X86::VUCOMISSZrm, 0 },
+ { X86::VUCOMISSZrr_Int, X86::VUCOMISSZrm_Int, TB_NO_REVERSE },
+ { X86::VUCOMISSrr, X86::VUCOMISSrm, 0 },
+ { X86::VUCOMISSrr_Int, X86::VUCOMISSrm_Int, TB_NO_REVERSE },
+};
+
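+ // How the entries below are read (assuming the usual LLVM fold-table
+ // conventions, not verbatim upstream commentary): each row pairs a
+ // register-register opcode with the variant that folds a load into its
+ // second source operand, and the third field carries fold constraints,
+ // such as TB_ALIGN_16 (the memory operand must be 16-byte aligned) and
+ // TB_NO_REVERSE (the folded form cannot be unfolded back to the
+ // register-register form).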
+static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
+ { X86::ADC16rr, X86::ADC16rm, 0 },
+ { X86::ADC32rr, X86::ADC32rm, 0 },
+ { X86::ADC64rr, X86::ADC64rm, 0 },
+ { X86::ADC8rr, X86::ADC8rm, 0 },
+ { X86::ADCX32rr, X86::ADCX32rm, 0 },
+ { X86::ADCX64rr, X86::ADCX64rm, 0 },
+ { X86::ADD16rr, X86::ADD16rm, 0 },
+ { X86::ADD16rr_DB, X86::ADD16rm, TB_NO_REVERSE },
+ { X86::ADD32rr, X86::ADD32rm, 0 },
+ { X86::ADD32rr_DB, X86::ADD32rm, TB_NO_REVERSE },
+ { X86::ADD64rr, X86::ADD64rm, 0 },
+ { X86::ADD64rr_DB, X86::ADD64rm, TB_NO_REVERSE },
+ { X86::ADD8rr, X86::ADD8rm, 0 },
+ { X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 },
+ { X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 },
+ { X86::ADDSDrr, X86::ADDSDrm, 0 },
+ { X86::ADDSDrr_Int, X86::ADDSDrm_Int, TB_NO_REVERSE },
+ { X86::ADDSSrr, X86::ADDSSrm, 0 },
+ { X86::ADDSSrr_Int, X86::ADDSSrm_Int, TB_NO_REVERSE },
+ { X86::ADDSUBPDrr, X86::ADDSUBPDrm, TB_ALIGN_16 },
+ { X86::ADDSUBPSrr, X86::ADDSUBPSrm, TB_ALIGN_16 },
+ { X86::ADOX32rr, X86::ADOX32rm, 0 },
+ { X86::ADOX64rr, X86::ADOX64rm, 0 },
+ { X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16 },
+ { X86::AESDECrr, X86::AESDECrm, TB_ALIGN_16 },
+ { X86::AESENCLASTrr, X86::AESENCLASTrm, TB_ALIGN_16 },
+ { X86::AESENCrr, X86::AESENCrm, TB_ALIGN_16 },
+ { X86::AND16rr, X86::AND16rm, 0 },
+ { X86::AND32rr, X86::AND32rm, 0 },
+ { X86::AND64rr, X86::AND64rm, 0 },
+ { X86::AND8rr, X86::AND8rm, 0 },
+ { X86::ANDN32rr, X86::ANDN32rm, 0 },
+ { X86::ANDN64rr, X86::ANDN64rm, 0 },
+ { X86::ANDNPDrr, X86::ANDNPDrm, TB_ALIGN_16 },
+ { X86::ANDNPSrr, X86::ANDNPSrm, TB_ALIGN_16 },
+ { X86::ANDPDrr, X86::ANDPDrm, TB_ALIGN_16 },
+ { X86::ANDPSrr, X86::ANDPSrm, TB_ALIGN_16 },
+ { X86::BLENDPDrri, X86::BLENDPDrmi, TB_ALIGN_16 },
+ { X86::BLENDPSrri, X86::BLENDPSrmi, TB_ALIGN_16 },
+ { X86::BLENDVPDrr0, X86::BLENDVPDrm0, TB_ALIGN_16 },
+ { X86::BLENDVPSrr0, X86::BLENDVPSrm0, TB_ALIGN_16 },
+ { X86::CMOVA16rr, X86::CMOVA16rm, 0 },
+ { X86::CMOVA32rr, X86::CMOVA32rm, 0 },
+ { X86::CMOVA64rr, X86::CMOVA64rm, 0 },
+ { X86::CMOVAE16rr, X86::CMOVAE16rm, 0 },
+ { X86::CMOVAE32rr, X86::CMOVAE32rm, 0 },
+ { X86::CMOVAE64rr, X86::CMOVAE64rm, 0 },
+ { X86::CMOVB16rr, X86::CMOVB16rm, 0 },
+ { X86::CMOVB32rr, X86::CMOVB32rm, 0 },
+ { X86::CMOVB64rr, X86::CMOVB64rm, 0 },
+ { X86::CMOVBE16rr, X86::CMOVBE16rm, 0 },
+ { X86::CMOVBE32rr, X86::CMOVBE32rm, 0 },
+ { X86::CMOVBE64rr, X86::CMOVBE64rm, 0 },
+ { X86::CMOVE16rr, X86::CMOVE16rm, 0 },
+ { X86::CMOVE32rr, X86::CMOVE32rm, 0 },
+ { X86::CMOVE64rr, X86::CMOVE64rm, 0 },
+ { X86::CMOVG16rr, X86::CMOVG16rm, 0 },
+ { X86::CMOVG32rr, X86::CMOVG32rm, 0 },
+ { X86::CMOVG64rr, X86::CMOVG64rm, 0 },
+ { X86::CMOVGE16rr, X86::CMOVGE16rm, 0 },
+ { X86::CMOVGE32rr, X86::CMOVGE32rm, 0 },
+ { X86::CMOVGE64rr, X86::CMOVGE64rm, 0 },
+ { X86::CMOVL16rr, X86::CMOVL16rm, 0 },
+ { X86::CMOVL32rr, X86::CMOVL32rm, 0 },
+ { X86::CMOVL64rr, X86::CMOVL64rm, 0 },
+ { X86::CMOVLE16rr, X86::CMOVLE16rm, 0 },
+ { X86::CMOVLE32rr, X86::CMOVLE32rm, 0 },
+ { X86::CMOVLE64rr, X86::CMOVLE64rm, 0 },
+ { X86::CMOVNE16rr, X86::CMOVNE16rm, 0 },
+ { X86::CMOVNE32rr, X86::CMOVNE32rm, 0 },
+ { X86::CMOVNE64rr, X86::CMOVNE64rm, 0 },
+ { X86::CMOVNO16rr, X86::CMOVNO16rm, 0 },
+ { X86::CMOVNO32rr, X86::CMOVNO32rm, 0 },
+ { X86::CMOVNO64rr, X86::CMOVNO64rm, 0 },
+ { X86::CMOVNP16rr, X86::CMOVNP16rm, 0 },
+ { X86::CMOVNP32rr, X86::CMOVNP32rm, 0 },
+ { X86::CMOVNP64rr, X86::CMOVNP64rm, 0 },
+ { X86::CMOVNS16rr, X86::CMOVNS16rm, 0 },
+ { X86::CMOVNS32rr, X86::CMOVNS32rm, 0 },
+ { X86::CMOVNS64rr, X86::CMOVNS64rm, 0 },
+ { X86::CMOVO16rr, X86::CMOVO16rm, 0 },
+ { X86::CMOVO32rr, X86::CMOVO32rm, 0 },
+ { X86::CMOVO64rr, X86::CMOVO64rm, 0 },
+ { X86::CMOVP16rr, X86::CMOVP16rm, 0 },
+ { X86::CMOVP32rr, X86::CMOVP32rm, 0 },
+ { X86::CMOVP64rr, X86::CMOVP64rm, 0 },
+ { X86::CMOVS16rr, X86::CMOVS16rm, 0 },
+ { X86::CMOVS32rr, X86::CMOVS32rm, 0 },
+ { X86::CMOVS64rr, X86::CMOVS64rm, 0 },
+ { X86::CMPPDrri, X86::CMPPDrmi, TB_ALIGN_16 },
+ { X86::CMPPSrri, X86::CMPPSrmi, TB_ALIGN_16 },
+ { X86::CMPSDrr, X86::CMPSDrm, 0 },
+ { X86::CMPSDrr_Int, X86::CMPSDrm_Int, TB_NO_REVERSE },
+ { X86::CMPSSrr, X86::CMPSSrm, 0 },
+ { X86::CMPSSrr_Int, X86::CMPSSrm_Int, TB_NO_REVERSE },
+ { X86::CRC32r32r16, X86::CRC32r32m16, 0 },
+ { X86::CRC32r32r32, X86::CRC32r32m32, 0 },
+ { X86::CRC32r32r8, X86::CRC32r32m8, 0 },
+ { X86::CRC32r64r64, X86::CRC32r64m64, 0 },
+ { X86::CRC32r64r8, X86::CRC32r64m8, 0 },
+ { X86::CVTSD2SSrr_Int, X86::CVTSD2SSrm_Int, TB_NO_REVERSE },
+ { X86::CVTSI2SDrr_Int, X86::CVTSI2SDrm_Int, 0 },
+ { X86::CVTSI2SSrr_Int, X86::CVTSI2SSrm_Int, 0 },
+ { X86::CVTSI642SDrr_Int, X86::CVTSI642SDrm_Int, 0 },
+ { X86::CVTSI642SSrr_Int, X86::CVTSI642SSrm_Int, 0 },
+ { X86::CVTSS2SDrr_Int, X86::CVTSS2SDrm_Int, TB_NO_REVERSE },
+ { X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 },
+ { X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 },
+ { X86::DIVSDrr, X86::DIVSDrm, 0 },
+ { X86::DIVSDrr_Int, X86::DIVSDrm_Int, TB_NO_REVERSE },
+ { X86::DIVSSrr, X86::DIVSSrm, 0 },
+ { X86::DIVSSrr_Int, X86::DIVSSrm_Int, TB_NO_REVERSE },
+ { X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 },
+ { X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 },
+ { X86::GF2P8AFFINEINVQBrri, X86::GF2P8AFFINEINVQBrmi, TB_ALIGN_16 },
+ { X86::GF2P8AFFINEQBrri, X86::GF2P8AFFINEQBrmi, TB_ALIGN_16 },
+ { X86::GF2P8MULBrr, X86::GF2P8MULBrm, TB_ALIGN_16 },
+ { X86::HADDPDrr, X86::HADDPDrm, TB_ALIGN_16 },
+ { X86::HADDPSrr, X86::HADDPSrm, TB_ALIGN_16 },
+ { X86::HSUBPDrr, X86::HSUBPDrm, TB_ALIGN_16 },
+ { X86::HSUBPSrr, X86::HSUBPSrm, TB_ALIGN_16 },
+ { X86::IMUL16rr, X86::IMUL16rm, 0 },
+ { X86::IMUL32rr, X86::IMUL32rm, 0 },
+ { X86::IMUL64rr, X86::IMUL64rm, 0 },
+ { X86::MAXCPDrr, X86::MAXCPDrm, TB_ALIGN_16 },
+ { X86::MAXCPSrr, X86::MAXCPSrm, TB_ALIGN_16 },
+ { X86::MAXCSDrr, X86::MAXCSDrm, 0 },
+ { X86::MAXCSSrr, X86::MAXCSSrm, 0 },
+ { X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 },
+ { X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 },
+ { X86::MAXSDrr, X86::MAXSDrm, 0 },
+ { X86::MAXSDrr_Int, X86::MAXSDrm_Int, TB_NO_REVERSE },
+ { X86::MAXSSrr, X86::MAXSSrm, 0 },
+ { X86::MAXSSrr_Int, X86::MAXSSrm_Int, TB_NO_REVERSE },
+ { X86::MINCPDrr, X86::MINCPDrm, TB_ALIGN_16 },
+ { X86::MINCPSrr, X86::MINCPSrm, TB_ALIGN_16 },
+ { X86::MINCSDrr, X86::MINCSDrm, 0 },
+ { X86::MINCSSrr, X86::MINCSSrm, 0 },
+ { X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 },
+ { X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 },
+ { X86::MINSDrr, X86::MINSDrm, 0 },
+ { X86::MINSDrr_Int, X86::MINSDrm_Int, TB_NO_REVERSE },
+ { X86::MINSSrr, X86::MINSSrm, 0 },
+ { X86::MINSSrr_Int, X86::MINSSrm_Int, TB_NO_REVERSE },
+ { X86::MMX_CVTPI2PSirr, X86::MMX_CVTPI2PSirm, 0 },
+ { X86::MMX_PACKSSDWirr, X86::MMX_PACKSSDWirm, 0 },
+ { X86::MMX_PACKSSWBirr, X86::MMX_PACKSSWBirm, 0 },
+ { X86::MMX_PACKUSWBirr, X86::MMX_PACKUSWBirm, 0 },
+ { X86::MMX_PADDBirr, X86::MMX_PADDBirm, 0 },
+ { X86::MMX_PADDDirr, X86::MMX_PADDDirm, 0 },
+ { X86::MMX_PADDQirr, X86::MMX_PADDQirm, 0 },
+ { X86::MMX_PADDSBirr, X86::MMX_PADDSBirm, 0 },
+ { X86::MMX_PADDSWirr, X86::MMX_PADDSWirm, 0 },
+ { X86::MMX_PADDUSBirr, X86::MMX_PADDUSBirm, 0 },
+ { X86::MMX_PADDUSWirr, X86::MMX_PADDUSWirm, 0 },
+ { X86::MMX_PADDWirr, X86::MMX_PADDWirm, 0 },
+ { X86::MMX_PALIGNRrri, X86::MMX_PALIGNRrmi, 0 },
+ { X86::MMX_PANDNirr, X86::MMX_PANDNirm, 0 },
+ { X86::MMX_PANDirr, X86::MMX_PANDirm, 0 },
+ { X86::MMX_PAVGBirr, X86::MMX_PAVGBirm, 0 },
+ { X86::MMX_PAVGWirr, X86::MMX_PAVGWirm, 0 },
+ { X86::MMX_PCMPEQBirr, X86::MMX_PCMPEQBirm, 0 },
+ { X86::MMX_PCMPEQDirr, X86::MMX_PCMPEQDirm, 0 },
+ { X86::MMX_PCMPEQWirr, X86::MMX_PCMPEQWirm, 0 },
+ { X86::MMX_PCMPGTBirr, X86::MMX_PCMPGTBirm, 0 },
+ { X86::MMX_PCMPGTDirr, X86::MMX_PCMPGTDirm, 0 },
+ { X86::MMX_PCMPGTWirr, X86::MMX_PCMPGTWirm, 0 },
+ { X86::MMX_PHADDDrr, X86::MMX_PHADDDrm, 0 },
+ { X86::MMX_PHADDSWrr, X86::MMX_PHADDSWrm, 0 },
+ { X86::MMX_PHADDWrr, X86::MMX_PHADDWrm, 0 },
+ { X86::MMX_PHSUBDrr, X86::MMX_PHSUBDrm, 0 },
+ { X86::MMX_PHSUBSWrr, X86::MMX_PHSUBSWrm, 0 },
+ { X86::MMX_PHSUBWrr, X86::MMX_PHSUBWrm, 0 },
+ { X86::MMX_PINSRWrr, X86::MMX_PINSRWrm, TB_NO_REVERSE },
+ { X86::MMX_PMADDUBSWrr, X86::MMX_PMADDUBSWrm, 0 },
+ { X86::MMX_PMADDWDirr, X86::MMX_PMADDWDirm, 0 },
+ { X86::MMX_PMAXSWirr, X86::MMX_PMAXSWirm, 0 },
+ { X86::MMX_PMAXUBirr, X86::MMX_PMAXUBirm, 0 },
+ { X86::MMX_PMINSWirr, X86::MMX_PMINSWirm, 0 },
+ { X86::MMX_PMINUBirr, X86::MMX_PMINUBirm, 0 },
+ { X86::MMX_PMULHRSWrr, X86::MMX_PMULHRSWrm, 0 },
+ { X86::MMX_PMULHUWirr, X86::MMX_PMULHUWirm, 0 },
+ { X86::MMX_PMULHWirr, X86::MMX_PMULHWirm, 0 },
+ { X86::MMX_PMULLWirr, X86::MMX_PMULLWirm, 0 },
+ { X86::MMX_PMULUDQirr, X86::MMX_PMULUDQirm, 0 },
+ { X86::MMX_PORirr, X86::MMX_PORirm, 0 },
+ { X86::MMX_PSADBWirr, X86::MMX_PSADBWirm, 0 },
+ { X86::MMX_PSHUFBrr, X86::MMX_PSHUFBrm, 0 },
+ { X86::MMX_PSIGNBrr, X86::MMX_PSIGNBrm, 0 },
+ { X86::MMX_PSIGNDrr, X86::MMX_PSIGNDrm, 0 },
+ { X86::MMX_PSIGNWrr, X86::MMX_PSIGNWrm, 0 },
+ { X86::MMX_PSLLDrr, X86::MMX_PSLLDrm, 0 },
+ { X86::MMX_PSLLQrr, X86::MMX_PSLLQrm, 0 },
+ { X86::MMX_PSLLWrr, X86::MMX_PSLLWrm, 0 },
+ { X86::MMX_PSRADrr, X86::MMX_PSRADrm, 0 },
+ { X86::MMX_PSRAWrr, X86::MMX_PSRAWrm, 0 },
+ { X86::MMX_PSRLDrr, X86::MMX_PSRLDrm, 0 },
+ { X86::MMX_PSRLQrr, X86::MMX_PSRLQrm, 0 },
+ { X86::MMX_PSRLWrr, X86::MMX_PSRLWrm, 0 },
+ { X86::MMX_PSUBBirr, X86::MMX_PSUBBirm, 0 },
+ { X86::MMX_PSUBDirr, X86::MMX_PSUBDirm, 0 },
+ { X86::MMX_PSUBQirr, X86::MMX_PSUBQirm, 0 },
+ { X86::MMX_PSUBSBirr, X86::MMX_PSUBSBirm, 0 },
+ { X86::MMX_PSUBSWirr, X86::MMX_PSUBSWirm, 0 },
+ { X86::MMX_PSUBUSBirr, X86::MMX_PSUBUSBirm, 0 },
+ { X86::MMX_PSUBUSWirr, X86::MMX_PSUBUSWirm, 0 },
+ { X86::MMX_PSUBWirr, X86::MMX_PSUBWirm, 0 },
+ { X86::MMX_PUNPCKHBWirr, X86::MMX_PUNPCKHBWirm, 0 },
+ { X86::MMX_PUNPCKHDQirr, X86::MMX_PUNPCKHDQirm, 0 },
+ { X86::MMX_PUNPCKHWDirr, X86::MMX_PUNPCKHWDirm, 0 },
+ { X86::MMX_PUNPCKLBWirr, X86::MMX_PUNPCKLBWirm, TB_NO_REVERSE },
+ { X86::MMX_PUNPCKLDQirr, X86::MMX_PUNPCKLDQirm, TB_NO_REVERSE },
+ { X86::MMX_PUNPCKLWDirr, X86::MMX_PUNPCKLWDirm, TB_NO_REVERSE },
+ { X86::MMX_PXORirr, X86::MMX_PXORirm, 0 },
+ { X86::MOVLHPSrr, X86::MOVHPSrm, TB_NO_REVERSE },
+ { X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 },
+ { X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 },
+ { X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 },
+ { X86::MULSDrr, X86::MULSDrm, 0 },
+ { X86::MULSDrr_Int, X86::MULSDrm_Int, TB_NO_REVERSE },
+ { X86::MULSSrr, X86::MULSSrm, 0 },
+ { X86::MULSSrr_Int, X86::MULSSrm_Int, TB_NO_REVERSE },
+ { X86::MULX32rr, X86::MULX32rm, 0 },
+ { X86::MULX64rr, X86::MULX64rm, 0 },
+ { X86::OR16rr, X86::OR16rm, 0 },
+ { X86::OR32rr, X86::OR32rm, 0 },
+ { X86::OR64rr, X86::OR64rm, 0 },
+ { X86::OR8rr, X86::OR8rm, 0 },
+ { X86::ORPDrr, X86::ORPDrm, TB_ALIGN_16 },
+ { X86::ORPSrr, X86::ORPSrm, TB_ALIGN_16 },
+ { X86::PACKSSDWrr, X86::PACKSSDWrm, TB_ALIGN_16 },
+ { X86::PACKSSWBrr, X86::PACKSSWBrm, TB_ALIGN_16 },
+ { X86::PACKUSDWrr, X86::PACKUSDWrm, TB_ALIGN_16 },
+ { X86::PACKUSWBrr, X86::PACKUSWBrm, TB_ALIGN_16 },
+ { X86::PADDBrr, X86::PADDBrm, TB_ALIGN_16 },
+ { X86::PADDDrr, X86::PADDDrm, TB_ALIGN_16 },
+ { X86::PADDQrr, X86::PADDQrm, TB_ALIGN_16 },
+ { X86::PADDSBrr, X86::PADDSBrm, TB_ALIGN_16 },
+ { X86::PADDSWrr, X86::PADDSWrm, TB_ALIGN_16 },
+ { X86::PADDUSBrr, X86::PADDUSBrm, TB_ALIGN_16 },
+ { X86::PADDUSWrr, X86::PADDUSWrm, TB_ALIGN_16 },
+ { X86::PADDWrr, X86::PADDWrm, TB_ALIGN_16 },
+ { X86::PALIGNRrri, X86::PALIGNRrmi, TB_ALIGN_16 },
+ { X86::PANDNrr, X86::PANDNrm, TB_ALIGN_16 },
+ { X86::PANDrr, X86::PANDrm, TB_ALIGN_16 },
+ { X86::PAVGBrr, X86::PAVGBrm, TB_ALIGN_16 },
+ { X86::PAVGUSBrr, X86::PAVGUSBrm, 0 },
+ { X86::PAVGWrr, X86::PAVGWrm, TB_ALIGN_16 },
+ { X86::PBLENDVBrr0, X86::PBLENDVBrm0, TB_ALIGN_16 },
+ { X86::PBLENDWrri, X86::PBLENDWrmi, TB_ALIGN_16 },
+ { X86::PCLMULQDQrr, X86::PCLMULQDQrm, TB_ALIGN_16 },
+ { X86::PCMPEQBrr, X86::PCMPEQBrm, TB_ALIGN_16 },
+ { X86::PCMPEQDrr, X86::PCMPEQDrm, TB_ALIGN_16 },
+ { X86::PCMPEQQrr, X86::PCMPEQQrm, TB_ALIGN_16 },
+ { X86::PCMPEQWrr, X86::PCMPEQWrm, TB_ALIGN_16 },
+ { X86::PCMPGTBrr, X86::PCMPGTBrm, TB_ALIGN_16 },
+ { X86::PCMPGTDrr, X86::PCMPGTDrm, TB_ALIGN_16 },
+ { X86::PCMPGTQrr, X86::PCMPGTQrm, TB_ALIGN_16 },
+ { X86::PCMPGTWrr, X86::PCMPGTWrm, TB_ALIGN_16 },
+ { X86::PDEP32rr, X86::PDEP32rm, 0 },
+ { X86::PDEP64rr, X86::PDEP64rm, 0 },
+ { X86::PEXT32rr, X86::PEXT32rm, 0 },
+ { X86::PEXT64rr, X86::PEXT64rm, 0 },
+ { X86::PFACCrr, X86::PFACCrm, 0 },
+ { X86::PFADDrr, X86::PFADDrm, 0 },
+ { X86::PFCMPEQrr, X86::PFCMPEQrm, 0 },
+ { X86::PFCMPGErr, X86::PFCMPGErm, 0 },
+ { X86::PFCMPGTrr, X86::PFCMPGTrm, 0 },
+ { X86::PFMAXrr, X86::PFMAXrm, 0 },
+ { X86::PFMINrr, X86::PFMINrm, 0 },
+ { X86::PFMULrr, X86::PFMULrm, 0 },
+ { X86::PFNACCrr, X86::PFNACCrm, 0 },
+ { X86::PFPNACCrr, X86::PFPNACCrm, 0 },
+ { X86::PFRCPIT1rr, X86::PFRCPIT1rm, 0 },
+ { X86::PFRCPIT2rr, X86::PFRCPIT2rm, 0 },
+ { X86::PFRSQIT1rr, X86::PFRSQIT1rm, 0 },
+ { X86::PFSUBRrr, X86::PFSUBRrm, 0 },
+ { X86::PFSUBrr, X86::PFSUBrm, 0 },
+ { X86::PHADDDrr, X86::PHADDDrm, TB_ALIGN_16 },
+ { X86::PHADDSWrr, X86::PHADDSWrm, TB_ALIGN_16 },
+ { X86::PHADDWrr, X86::PHADDWrm, TB_ALIGN_16 },
+ { X86::PHSUBDrr, X86::PHSUBDrm, TB_ALIGN_16 },
+ { X86::PHSUBSWrr, X86::PHSUBSWrm, TB_ALIGN_16 },
+ { X86::PHSUBWrr, X86::PHSUBWrm, TB_ALIGN_16 },
+ { X86::PINSRBrr, X86::PINSRBrm, TB_NO_REVERSE },
+ { X86::PINSRDrr, X86::PINSRDrm, 0 },
+ { X86::PINSRQrr, X86::PINSRQrm, 0 },
+ { X86::PINSRWrr, X86::PINSRWrm, TB_NO_REVERSE },
+ { X86::PMADDUBSWrr, X86::PMADDUBSWrm, TB_ALIGN_16 },
+ { X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 },
+ { X86::PMAXSBrr, X86::PMAXSBrm, TB_ALIGN_16 },
+ { X86::PMAXSDrr, X86::PMAXSDrm, TB_ALIGN_16 },
+ { X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 },
+ { X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 },
+ { X86::PMAXUDrr, X86::PMAXUDrm, TB_ALIGN_16 },
+ { X86::PMAXUWrr, X86::PMAXUWrm, TB_ALIGN_16 },
+ { X86::PMINSBrr, X86::PMINSBrm, TB_ALIGN_16 },
+ { X86::PMINSDrr, X86::PMINSDrm, TB_ALIGN_16 },
+ { X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 },
+ { X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 },
+ { X86::PMINUDrr, X86::PMINUDrm, TB_ALIGN_16 },
+ { X86::PMINUWrr, X86::PMINUWrm, TB_ALIGN_16 },
+ { X86::PMULDQrr, X86::PMULDQrm, TB_ALIGN_16 },
+ { X86::PMULHRSWrr, X86::PMULHRSWrm, TB_ALIGN_16 },
+ { X86::PMULHRWrr, X86::PMULHRWrm, 0 },
+ { X86::PMULHUWrr, X86::PMULHUWrm, TB_ALIGN_16 },
+ { X86::PMULHWrr, X86::PMULHWrm, TB_ALIGN_16 },
+ { X86::PMULLDrr, X86::PMULLDrm, TB_ALIGN_16 },
+ { X86::PMULLWrr, X86::PMULLWrm, TB_ALIGN_16 },
+ { X86::PMULUDQrr, X86::PMULUDQrm, TB_ALIGN_16 },
+ { X86::PORrr, X86::PORrm, TB_ALIGN_16 },
+ { X86::PSADBWrr, X86::PSADBWrm, TB_ALIGN_16 },
+ { X86::PSHUFBrr, X86::PSHUFBrm, TB_ALIGN_16 },
+ { X86::PSIGNBrr, X86::PSIGNBrm, TB_ALIGN_16 },
+ { X86::PSIGNDrr, X86::PSIGNDrm, TB_ALIGN_16 },
+ { X86::PSIGNWrr, X86::PSIGNWrm, TB_ALIGN_16 },
+ { X86::PSLLDrr, X86::PSLLDrm, TB_ALIGN_16 },
+ { X86::PSLLQrr, X86::PSLLQrm, TB_ALIGN_16 },
+ { X86::PSLLWrr, X86::PSLLWrm, TB_ALIGN_16 },
+ { X86::PSRADrr, X86::PSRADrm, TB_ALIGN_16 },
+ { X86::PSRAWrr, X86::PSRAWrm, TB_ALIGN_16 },
+ { X86::PSRLDrr, X86::PSRLDrm, TB_ALIGN_16 },
+ { X86::PSRLQrr, X86::PSRLQrm, TB_ALIGN_16 },
+ { X86::PSRLWrr, X86::PSRLWrm, TB_ALIGN_16 },
+ { X86::PSUBBrr, X86::PSUBBrm, TB_ALIGN_16 },
+ { X86::PSUBDrr, X86::PSUBDrm, TB_ALIGN_16 },
+ { X86::PSUBQrr, X86::PSUBQrm, TB_ALIGN_16 },
+ { X86::PSUBSBrr, X86::PSUBSBrm, TB_ALIGN_16 },
+ { X86::PSUBSWrr, X86::PSUBSWrm, TB_ALIGN_16 },
+ { X86::PSUBUSBrr, X86::PSUBUSBrm, TB_ALIGN_16 },
+ { X86::PSUBUSWrr, X86::PSUBUSWrm, TB_ALIGN_16 },
+ { X86::PSUBWrr, X86::PSUBWrm, TB_ALIGN_16 },
+ { X86::PUNPCKHBWrr, X86::PUNPCKHBWrm, TB_ALIGN_16 },
+ { X86::PUNPCKHDQrr, X86::PUNPCKHDQrm, TB_ALIGN_16 },
+ { X86::PUNPCKHQDQrr, X86::PUNPCKHQDQrm, TB_ALIGN_16 },
+ { X86::PUNPCKHWDrr, X86::PUNPCKHWDrm, TB_ALIGN_16 },
+ { X86::PUNPCKLBWrr, X86::PUNPCKLBWrm, TB_ALIGN_16 },
+ { X86::PUNPCKLDQrr, X86::PUNPCKLDQrm, TB_ALIGN_16 },
+ { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 },
+ { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 },
+ { X86::PXORrr, X86::PXORrm, TB_ALIGN_16 },
+ { X86::RCPSSr_Int, X86::RCPSSm_Int, TB_NO_REVERSE },
+ { X86::ROUNDSDr_Int, X86::ROUNDSDm_Int, TB_NO_REVERSE },
+ { X86::ROUNDSSr_Int, X86::ROUNDSSm_Int, TB_NO_REVERSE },
+ { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, TB_NO_REVERSE },
+ { X86::SBB16rr, X86::SBB16rm, 0 },
+ { X86::SBB32rr, X86::SBB32rm, 0 },
+ { X86::SBB64rr, X86::SBB64rm, 0 },
+ { X86::SBB8rr, X86::SBB8rm, 0 },
+ { X86::SHA1MSG1rr, X86::SHA1MSG1rm, TB_ALIGN_16 },
+ { X86::SHA1MSG2rr, X86::SHA1MSG2rm, TB_ALIGN_16 },
+ { X86::SHA1NEXTErr, X86::SHA1NEXTErm, TB_ALIGN_16 },
+ { X86::SHA1RNDS4rri, X86::SHA1RNDS4rmi, TB_ALIGN_16 },
+ { X86::SHA256MSG1rr, X86::SHA256MSG1rm, TB_ALIGN_16 },
+ { X86::SHA256MSG2rr, X86::SHA256MSG2rm, TB_ALIGN_16 },
+ { X86::SHA256RNDS2rr, X86::SHA256RNDS2rm, TB_ALIGN_16 },
+ { X86::SHUFPDrri, X86::SHUFPDrmi, TB_ALIGN_16 },
+ { X86::SHUFPSrri, X86::SHUFPSrmi, TB_ALIGN_16 },
+ { X86::SQRTSDr_Int, X86::SQRTSDm_Int, TB_NO_REVERSE },
+ { X86::SQRTSSr_Int, X86::SQRTSSm_Int, TB_NO_REVERSE },
+ { X86::SUB16rr, X86::SUB16rm, 0 },
+ { X86::SUB32rr, X86::SUB32rm, 0 },
+ { X86::SUB64rr, X86::SUB64rm, 0 },
+ { X86::SUB8rr, X86::SUB8rm, 0 },
+ { X86::SUBPDrr, X86::SUBPDrm, TB_ALIGN_16 },
+ { X86::SUBPSrr, X86::SUBPSrm, TB_ALIGN_16 },
+ { X86::SUBSDrr, X86::SUBSDrm, 0 },
+ { X86::SUBSDrr_Int, X86::SUBSDrm_Int, TB_NO_REVERSE },
+ { X86::SUBSSrr, X86::SUBSSrm, 0 },
+ { X86::SUBSSrr_Int, X86::SUBSSrm_Int, TB_NO_REVERSE },
+ // FIXME: TEST*rr -> swapped operand of TEST *mr.
+ { X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 },
+ { X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 },
+ { X86::UNPCKLPDrr, X86::UNPCKLPDrm, TB_ALIGN_16 },
+ { X86::UNPCKLPSrr, X86::UNPCKLPSrm, TB_ALIGN_16 },
+ { X86::VADDPDYrr, X86::VADDPDYrm, 0 },
+ { X86::VADDPDZ128rr, X86::VADDPDZ128rm, 0 },
+ { X86::VADDPDZ256rr, X86::VADDPDZ256rm, 0 },
+ { X86::VADDPDZrr, X86::VADDPDZrm, 0 },
+ { X86::VADDPDrr, X86::VADDPDrm, 0 },
+ { X86::VADDPSYrr, X86::VADDPSYrm, 0 },
+ { X86::VADDPSZ128rr, X86::VADDPSZ128rm, 0 },
+ { X86::VADDPSZ256rr, X86::VADDPSZ256rm, 0 },
+ { X86::VADDPSZrr, X86::VADDPSZrm, 0 },
+ { X86::VADDPSrr, X86::VADDPSrm, 0 },
+ { X86::VADDSDZrr, X86::VADDSDZrm, 0 },
+ { X86::VADDSDZrr_Int, X86::VADDSDZrm_Int, TB_NO_REVERSE },
+ { X86::VADDSDrr, X86::VADDSDrm, 0 },
+ { X86::VADDSDrr_Int, X86::VADDSDrm_Int, TB_NO_REVERSE },
+ { X86::VADDSSZrr, X86::VADDSSZrm, 0 },
+ { X86::VADDSSZrr_Int, X86::VADDSSZrm_Int, TB_NO_REVERSE },
+ { X86::VADDSSrr, X86::VADDSSrm, 0 },
+ { X86::VADDSSrr_Int, X86::VADDSSrm_Int, TB_NO_REVERSE },
+ { X86::VADDSUBPDYrr, X86::VADDSUBPDYrm, 0 },
+ { X86::VADDSUBPDrr, X86::VADDSUBPDrm, 0 },
+ { X86::VADDSUBPSYrr, X86::VADDSUBPSYrm, 0 },
+ { X86::VADDSUBPSrr, X86::VADDSUBPSrm, 0 },
+ { X86::VAESDECLASTYrr, X86::VAESDECLASTYrm, 0 },
+ { X86::VAESDECLASTZ128rr, X86::VAESDECLASTZ128rm, 0 },
+ { X86::VAESDECLASTZ256rr, X86::VAESDECLASTZ256rm, 0 },
+ { X86::VAESDECLASTZrr, X86::VAESDECLASTZrm, 0 },
+ { X86::VAESDECLASTrr, X86::VAESDECLASTrm, 0 },
+ { X86::VAESDECYrr, X86::VAESDECYrm, 0 },
+ { X86::VAESDECZ128rr, X86::VAESDECZ128rm, 0 },
+ { X86::VAESDECZ256rr, X86::VAESDECZ256rm, 0 },
+ { X86::VAESDECZrr, X86::VAESDECZrm, 0 },
+ { X86::VAESDECrr, X86::VAESDECrm, 0 },
+ { X86::VAESENCLASTYrr, X86::VAESENCLASTYrm, 0 },
+ { X86::VAESENCLASTZ128rr, X86::VAESENCLASTZ128rm, 0 },
+ { X86::VAESENCLASTZ256rr, X86::VAESENCLASTZ256rm, 0 },
+ { X86::VAESENCLASTZrr, X86::VAESENCLASTZrm, 0 },
+ { X86::VAESENCLASTrr, X86::VAESENCLASTrm, 0 },
+ { X86::VAESENCYrr, X86::VAESENCYrm, 0 },
+ { X86::VAESENCZ128rr, X86::VAESENCZ128rm, 0 },
+ { X86::VAESENCZ256rr, X86::VAESENCZ256rm, 0 },
+ { X86::VAESENCZrr, X86::VAESENCZrm, 0 },
+ { X86::VAESENCrr, X86::VAESENCrm, 0 },
+ { X86::VALIGNDZ128rri, X86::VALIGNDZ128rmi, 0 },
+ { X86::VALIGNDZ256rri, X86::VALIGNDZ256rmi, 0 },
+ { X86::VALIGNDZrri, X86::VALIGNDZrmi, 0 },
+ { X86::VALIGNQZ128rri, X86::VALIGNQZ128rmi, 0 },
+ { X86::VALIGNQZ256rri, X86::VALIGNQZ256rmi, 0 },
+ { X86::VALIGNQZrri, X86::VALIGNQZrmi, 0 },
+ { X86::VANDNPDYrr, X86::VANDNPDYrm, 0 },
+ { X86::VANDNPDZ128rr, X86::VANDNPDZ128rm, 0 },
+ { X86::VANDNPDZ256rr, X86::VANDNPDZ256rm, 0 },
+ { X86::VANDNPDZrr, X86::VANDNPDZrm, 0 },
+ { X86::VANDNPDrr, X86::VANDNPDrm, 0 },
+ { X86::VANDNPSYrr, X86::VANDNPSYrm, 0 },
+ { X86::VANDNPSZ128rr, X86::VANDNPSZ128rm, 0 },
+ { X86::VANDNPSZ256rr, X86::VANDNPSZ256rm, 0 },
+ { X86::VANDNPSZrr, X86::VANDNPSZrm, 0 },
+ { X86::VANDNPSrr, X86::VANDNPSrm, 0 },
+ { X86::VANDPDYrr, X86::VANDPDYrm, 0 },
+ { X86::VANDPDZ128rr, X86::VANDPDZ128rm, 0 },
+ { X86::VANDPDZ256rr, X86::VANDPDZ256rm, 0 },
+ { X86::VANDPDZrr, X86::VANDPDZrm, 0 },
+ { X86::VANDPDrr, X86::VANDPDrm, 0 },
+ { X86::VANDPSYrr, X86::VANDPSYrm, 0 },
+ { X86::VANDPSZ128rr, X86::VANDPSZ128rm, 0 },
+ { X86::VANDPSZ256rr, X86::VANDPSZ256rm, 0 },
+ { X86::VANDPSZrr, X86::VANDPSZrm, 0 },
+ { X86::VANDPSrr, X86::VANDPSrm, 0 },
+ { X86::VBLENDMPDZ128rr, X86::VBLENDMPDZ128rm, 0 },
+ { X86::VBLENDMPDZ256rr, X86::VBLENDMPDZ256rm, 0 },
+ { X86::VBLENDMPDZrr, X86::VBLENDMPDZrm, 0 },
+ { X86::VBLENDMPSZ128rr, X86::VBLENDMPSZ128rm, 0 },
+ { X86::VBLENDMPSZ256rr, X86::VBLENDMPSZ256rm, 0 },
+ { X86::VBLENDMPSZrr, X86::VBLENDMPSZrm, 0 },
+ { X86::VBLENDPDYrri, X86::VBLENDPDYrmi, 0 },
+ { X86::VBLENDPDrri, X86::VBLENDPDrmi, 0 },
+ { X86::VBLENDPSYrri, X86::VBLENDPSYrmi, 0 },
+ { X86::VBLENDPSrri, X86::VBLENDPSrmi, 0 },
+ { X86::VBLENDVPDYrr, X86::VBLENDVPDYrm, 0 },
+ { X86::VBLENDVPDrr, X86::VBLENDVPDrm, 0 },
+ { X86::VBLENDVPSYrr, X86::VBLENDVPSYrm, 0 },
+ { X86::VBLENDVPSrr, X86::VBLENDVPSrm, 0 },
+ { X86::VBROADCASTF32X2Z256rkz, X86::VBROADCASTF32X2Z256mkz, TB_NO_REVERSE },
+ { X86::VBROADCASTF32X2Zrkz, X86::VBROADCASTF32X2Zmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Z128rkz, X86::VBROADCASTI32X2Z128mkz, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Z256rkz, X86::VBROADCASTI32X2Z256mkz, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Zrkz, X86::VBROADCASTI32X2Zmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE },
+ { X86::VCMPPDYrri, X86::VCMPPDYrmi, 0 },
+ { X86::VCMPPDZ128rri, X86::VCMPPDZ128rmi, 0 },
+ { X86::VCMPPDZ256rri, X86::VCMPPDZ256rmi, 0 },
+ { X86::VCMPPDZrri, X86::VCMPPDZrmi, 0 },
+ { X86::VCMPPDrri, X86::VCMPPDrmi, 0 },
+ { X86::VCMPPSYrri, X86::VCMPPSYrmi, 0 },
+ { X86::VCMPPSZ128rri, X86::VCMPPSZ128rmi, 0 },
+ { X86::VCMPPSZ256rri, X86::VCMPPSZ256rmi, 0 },
+ { X86::VCMPPSZrri, X86::VCMPPSZrmi, 0 },
+ { X86::VCMPPSrri, X86::VCMPPSrmi, 0 },
+ { X86::VCMPSDZrr, X86::VCMPSDZrm, 0 },
+ { X86::VCMPSDZrr_Int, X86::VCMPSDZrm_Int, TB_NO_REVERSE },
+ { X86::VCMPSDrr, X86::VCMPSDrm, 0 },
+ { X86::VCMPSDrr_Int, X86::VCMPSDrm_Int, TB_NO_REVERSE },
+ { X86::VCMPSSZrr, X86::VCMPSSZrm, 0 },
+ { X86::VCMPSSZrr_Int, X86::VCMPSSZrm_Int, TB_NO_REVERSE },
+ { X86::VCMPSSrr, X86::VCMPSSrm, 0 },
+ { X86::VCMPSSrr_Int, X86::VCMPSSrm_Int, TB_NO_REVERSE },
+ { X86::VCVTDQ2PDZ128rrkz, X86::VCVTDQ2PDZ128rmkz, TB_NO_REVERSE },
+ { X86::VCVTDQ2PDZ256rrkz, X86::VCVTDQ2PDZ256rmkz, 0 },
+ { X86::VCVTDQ2PDZrrkz, X86::VCVTDQ2PDZrmkz, 0 },
+ { X86::VCVTDQ2PSZ128rrkz, X86::VCVTDQ2PSZ128rmkz, 0 },
+ { X86::VCVTDQ2PSZ256rrkz, X86::VCVTDQ2PSZ256rmkz, 0 },
+ { X86::VCVTDQ2PSZrrkz, X86::VCVTDQ2PSZrmkz, 0 },
+ { X86::VCVTPD2DQZ128rrkz, X86::VCVTPD2DQZ128rmkz, 0 },
+ { X86::VCVTPD2DQZ256rrkz, X86::VCVTPD2DQZ256rmkz, 0 },
+ { X86::VCVTPD2DQZrrkz, X86::VCVTPD2DQZrmkz, 0 },
+ { X86::VCVTPD2PSZ128rrkz, X86::VCVTPD2PSZ128rmkz, 0 },
+ { X86::VCVTPD2PSZ256rrkz, X86::VCVTPD2PSZ256rmkz, 0 },
+ { X86::VCVTPD2PSZrrkz, X86::VCVTPD2PSZrmkz, 0 },
+ { X86::VCVTPD2QQZ128rrkz, X86::VCVTPD2QQZ128rmkz, 0 },
+ { X86::VCVTPD2QQZ256rrkz, X86::VCVTPD2QQZ256rmkz, 0 },
+ { X86::VCVTPD2QQZrrkz, X86::VCVTPD2QQZrmkz, 0 },
+ { X86::VCVTPD2UDQZ128rrkz, X86::VCVTPD2UDQZ128rmkz, 0 },
+ { X86::VCVTPD2UDQZ256rrkz, X86::VCVTPD2UDQZ256rmkz, 0 },
+ { X86::VCVTPD2UDQZrrkz, X86::VCVTPD2UDQZrmkz, 0 },
+ { X86::VCVTPD2UQQZ128rrkz, X86::VCVTPD2UQQZ128rmkz, 0 },
+ { X86::VCVTPD2UQQZ256rrkz, X86::VCVTPD2UQQZ256rmkz, 0 },
+ { X86::VCVTPD2UQQZrrkz, X86::VCVTPD2UQQZrmkz, 0 },
+ { X86::VCVTPH2PSZ128rrkz, X86::VCVTPH2PSZ128rmkz, TB_NO_REVERSE },
+ { X86::VCVTPH2PSZ256rrkz, X86::VCVTPH2PSZ256rmkz, 0 },
+ { X86::VCVTPH2PSZrrkz, X86::VCVTPH2PSZrmkz, 0 },
+ { X86::VCVTPS2DQZ128rrkz, X86::VCVTPS2DQZ128rmkz, 0 },
+ { X86::VCVTPS2DQZ256rrkz, X86::VCVTPS2DQZ256rmkz, 0 },
+ { X86::VCVTPS2DQZrrkz, X86::VCVTPS2DQZrmkz, 0 },
+ { X86::VCVTPS2PDZ128rrkz, X86::VCVTPS2PDZ128rmkz, TB_NO_REVERSE },
+ { X86::VCVTPS2PDZ256rrkz, X86::VCVTPS2PDZ256rmkz, 0 },
+ { X86::VCVTPS2PDZrrkz, X86::VCVTPS2PDZrmkz, 0 },
+ { X86::VCVTPS2QQZ128rrkz, X86::VCVTPS2QQZ128rmkz, TB_NO_REVERSE },
+ { X86::VCVTPS2QQZ256rrkz, X86::VCVTPS2QQZ256rmkz, 0 },
+ { X86::VCVTPS2QQZrrkz, X86::VCVTPS2QQZrmkz, 0 },
+ { X86::VCVTPS2UDQZ128rrkz, X86::VCVTPS2UDQZ128rmkz, 0 },
+ { X86::VCVTPS2UDQZ256rrkz, X86::VCVTPS2UDQZ256rmkz, 0 },
+ { X86::VCVTPS2UDQZrrkz, X86::VCVTPS2UDQZrmkz, 0 },
+ { X86::VCVTPS2UQQZ128rrkz, X86::VCVTPS2UQQZ128rmkz, TB_NO_REVERSE },
+ { X86::VCVTPS2UQQZ256rrkz, X86::VCVTPS2UQQZ256rmkz, 0 },
+ { X86::VCVTPS2UQQZrrkz, X86::VCVTPS2UQQZrmkz, 0 },
+ { X86::VCVTQQ2PDZ128rrkz, X86::VCVTQQ2PDZ128rmkz, 0 },
+ { X86::VCVTQQ2PDZ256rrkz, X86::VCVTQQ2PDZ256rmkz, 0 },
+ { X86::VCVTQQ2PDZrrkz, X86::VCVTQQ2PDZrmkz, 0 },
+ { X86::VCVTQQ2PSZ128rrkz, X86::VCVTQQ2PSZ128rmkz, 0 },
+ { X86::VCVTQQ2PSZ256rrkz, X86::VCVTQQ2PSZ256rmkz, 0 },
+ { X86::VCVTQQ2PSZrrkz, X86::VCVTQQ2PSZrmkz, 0 },
+ { X86::VCVTSD2SSZrr, X86::VCVTSD2SSZrm, 0 },
+ { X86::VCVTSD2SSZrr_Int, X86::VCVTSD2SSZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSD2SSrr, X86::VCVTSD2SSrm, 0 },
+ { X86::VCVTSD2SSrr_Int, X86::VCVTSD2SSrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSI2SDZrr, X86::VCVTSI2SDZrm, 0 },
+ { X86::VCVTSI2SDZrr_Int, X86::VCVTSI2SDZrm_Int, 0 },
+ { X86::VCVTSI2SDrr, X86::VCVTSI2SDrm, 0 },
+ { X86::VCVTSI2SDrr_Int, X86::VCVTSI2SDrm_Int, 0 },
+ { X86::VCVTSI2SSZrr, X86::VCVTSI2SSZrm, 0 },
+ { X86::VCVTSI2SSZrr_Int, X86::VCVTSI2SSZrm_Int, 0 },
+ { X86::VCVTSI2SSrr, X86::VCVTSI2SSrm, 0 },
+ { X86::VCVTSI2SSrr_Int, X86::VCVTSI2SSrm_Int, 0 },
+ { X86::VCVTSI642SDZrr, X86::VCVTSI642SDZrm, 0 },
+ { X86::VCVTSI642SDZrr_Int, X86::VCVTSI642SDZrm_Int, 0 },
+ { X86::VCVTSI642SDrr, X86::VCVTSI642SDrm, 0 },
+ { X86::VCVTSI642SDrr_Int, X86::VCVTSI642SDrm_Int, 0 },
+ { X86::VCVTSI642SSZrr, X86::VCVTSI642SSZrm, 0 },
+ { X86::VCVTSI642SSZrr_Int, X86::VCVTSI642SSZrm_Int, 0 },
+ { X86::VCVTSI642SSrr, X86::VCVTSI642SSrm, 0 },
+ { X86::VCVTSI642SSrr_Int, X86::VCVTSI642SSrm_Int, 0 },
+ { X86::VCVTSS2SDZrr, X86::VCVTSS2SDZrm, 0 },
+ { X86::VCVTSS2SDZrr_Int, X86::VCVTSS2SDZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 },
+ { X86::VCVTSS2SDrr_Int, X86::VCVTSS2SDrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTPD2DQZ128rrkz, X86::VCVTTPD2DQZ128rmkz, 0 },
+ { X86::VCVTTPD2DQZ256rrkz, X86::VCVTTPD2DQZ256rmkz, 0 },
+ { X86::VCVTTPD2DQZrrkz, X86::VCVTTPD2DQZrmkz, 0 },
+ { X86::VCVTTPD2QQZ128rrkz, X86::VCVTTPD2QQZ128rmkz, 0 },
+ { X86::VCVTTPD2QQZ256rrkz, X86::VCVTTPD2QQZ256rmkz, 0 },
+ { X86::VCVTTPD2QQZrrkz, X86::VCVTTPD2QQZrmkz, 0 },
+ { X86::VCVTTPD2UDQZ128rrkz, X86::VCVTTPD2UDQZ128rmkz, 0 },
+ { X86::VCVTTPD2UDQZ256rrkz, X86::VCVTTPD2UDQZ256rmkz, 0 },
+ { X86::VCVTTPD2UDQZrrkz, X86::VCVTTPD2UDQZrmkz, 0 },
+ { X86::VCVTTPD2UQQZ128rrkz, X86::VCVTTPD2UQQZ128rmkz, 0 },
+ { X86::VCVTTPD2UQQZ256rrkz, X86::VCVTTPD2UQQZ256rmkz, 0 },
+ { X86::VCVTTPD2UQQZrrkz, X86::VCVTTPD2UQQZrmkz, 0 },
+ { X86::VCVTTPS2DQZ128rrkz, X86::VCVTTPS2DQZ128rmkz, 0 },
+ { X86::VCVTTPS2DQZ256rrkz, X86::VCVTTPS2DQZ256rmkz, 0 },
+ { X86::VCVTTPS2DQZrrkz, X86::VCVTTPS2DQZrmkz, 0 },
+ { X86::VCVTTPS2QQZ128rrkz, X86::VCVTTPS2QQZ128rmkz, TB_NO_REVERSE },
+ { X86::VCVTTPS2QQZ256rrkz, X86::VCVTTPS2QQZ256rmkz, 0 },
+ { X86::VCVTTPS2QQZrrkz, X86::VCVTTPS2QQZrmkz, 0 },
+ { X86::VCVTTPS2UDQZ128rrkz, X86::VCVTTPS2UDQZ128rmkz, 0 },
+ { X86::VCVTTPS2UDQZ256rrkz, X86::VCVTTPS2UDQZ256rmkz, 0 },
+ { X86::VCVTTPS2UDQZrrkz, X86::VCVTTPS2UDQZrmkz, 0 },
+ { X86::VCVTTPS2UQQZ128rrkz, X86::VCVTTPS2UQQZ128rmkz, TB_NO_REVERSE },
+ { X86::VCVTTPS2UQQZ256rrkz, X86::VCVTTPS2UQQZ256rmkz, 0 },
+ { X86::VCVTTPS2UQQZrrkz, X86::VCVTTPS2UQQZrmkz, 0 },
+ { X86::VCVTUDQ2PDZ128rrkz, X86::VCVTUDQ2PDZ128rmkz, TB_NO_REVERSE },
+ { X86::VCVTUDQ2PDZ256rrkz, X86::VCVTUDQ2PDZ256rmkz, 0 },
+ { X86::VCVTUDQ2PDZrrkz, X86::VCVTUDQ2PDZrmkz, 0 },
+ { X86::VCVTUDQ2PSZ128rrkz, X86::VCVTUDQ2PSZ128rmkz, 0 },
+ { X86::VCVTUDQ2PSZ256rrkz, X86::VCVTUDQ2PSZ256rmkz, 0 },
+ { X86::VCVTUDQ2PSZrrkz, X86::VCVTUDQ2PSZrmkz, 0 },
+ { X86::VCVTUQQ2PDZ128rrkz, X86::VCVTUQQ2PDZ128rmkz, 0 },
+ { X86::VCVTUQQ2PDZ256rrkz, X86::VCVTUQQ2PDZ256rmkz, 0 },
+ { X86::VCVTUQQ2PDZrrkz, X86::VCVTUQQ2PDZrmkz, 0 },
+ { X86::VCVTUQQ2PSZ128rrkz, X86::VCVTUQQ2PSZ128rmkz, 0 },
+ { X86::VCVTUQQ2PSZ256rrkz, X86::VCVTUQQ2PSZ256rmkz, 0 },
+ { X86::VCVTUQQ2PSZrrkz, X86::VCVTUQQ2PSZrmkz, 0 },
+ { X86::VCVTUSI2SDZrr, X86::VCVTUSI2SDZrm, 0 },
+ { X86::VCVTUSI2SDZrr_Int, X86::VCVTUSI2SDZrm_Int, 0 },
+ { X86::VCVTUSI2SSZrr, X86::VCVTUSI2SSZrm, 0 },
+ { X86::VCVTUSI2SSZrr_Int, X86::VCVTUSI2SSZrm_Int, 0 },
+ { X86::VCVTUSI642SDZrr, X86::VCVTUSI642SDZrm, 0 },
+ { X86::VCVTUSI642SDZrr_Int, X86::VCVTUSI642SDZrm_Int, 0 },
+ { X86::VCVTUSI642SSZrr, X86::VCVTUSI642SSZrm, 0 },
+ { X86::VCVTUSI642SSZrr_Int, X86::VCVTUSI642SSZrm_Int, 0 },
+ { X86::VDBPSADBWZ128rri, X86::VDBPSADBWZ128rmi, 0 },
+ { X86::VDBPSADBWZ256rri, X86::VDBPSADBWZ256rmi, 0 },
+ { X86::VDBPSADBWZrri, X86::VDBPSADBWZrmi, 0 },
+ { X86::VDIVPDYrr, X86::VDIVPDYrm, 0 },
+ { X86::VDIVPDZ128rr, X86::VDIVPDZ128rm, 0 },
+ { X86::VDIVPDZ256rr, X86::VDIVPDZ256rm, 0 },
+ { X86::VDIVPDZrr, X86::VDIVPDZrm, 0 },
+ { X86::VDIVPDrr, X86::VDIVPDrm, 0 },
+ { X86::VDIVPSYrr, X86::VDIVPSYrm, 0 },
+ { X86::VDIVPSZ128rr, X86::VDIVPSZ128rm, 0 },
+ { X86::VDIVPSZ256rr, X86::VDIVPSZ256rm, 0 },
+ { X86::VDIVPSZrr, X86::VDIVPSZrm, 0 },
+ { X86::VDIVPSrr, X86::VDIVPSrm, 0 },
+ { X86::VDIVSDZrr, X86::VDIVSDZrm, 0 },
+ { X86::VDIVSDZrr_Int, X86::VDIVSDZrm_Int, TB_NO_REVERSE },
+ { X86::VDIVSDrr, X86::VDIVSDrm, 0 },
+ { X86::VDIVSDrr_Int, X86::VDIVSDrm_Int, TB_NO_REVERSE },
+ { X86::VDIVSSZrr, X86::VDIVSSZrm, 0 },
+ { X86::VDIVSSZrr_Int, X86::VDIVSSZrm_Int, TB_NO_REVERSE },
+ { X86::VDIVSSrr, X86::VDIVSSrm, 0 },
+ { X86::VDIVSSrr_Int, X86::VDIVSSrm_Int, TB_NO_REVERSE },
+ { X86::VDPPDrri, X86::VDPPDrmi, 0 },
+ { X86::VDPPSYrri, X86::VDPPSYrmi, 0 },
+ { X86::VDPPSrri, X86::VDPPSrmi, 0 },
+ { X86::VEXP2PDZrkz, X86::VEXP2PDZmkz, 0 },
+ { X86::VEXP2PSZrkz, X86::VEXP2PSZmkz, 0 },
+ { X86::VEXPANDPDZ128rrkz, X86::VEXPANDPDZ128rmkz, TB_NO_REVERSE },
+ { X86::VEXPANDPDZ256rrkz, X86::VEXPANDPDZ256rmkz, TB_NO_REVERSE },
+ { X86::VEXPANDPDZrrkz, X86::VEXPANDPDZrmkz, TB_NO_REVERSE },
+ { X86::VEXPANDPSZ128rrkz, X86::VEXPANDPSZ128rmkz, TB_NO_REVERSE },
+ { X86::VEXPANDPSZ256rrkz, X86::VEXPANDPSZ256rmkz, TB_NO_REVERSE },
+ { X86::VEXPANDPSZrrkz, X86::VEXPANDPSZrmkz, TB_NO_REVERSE },
+ { X86::VFMADDPD4Yrr, X86::VFMADDPD4Ymr, 0 },
+ { X86::VFMADDPD4rr, X86::VFMADDPD4mr, 0 },
+ { X86::VFMADDPS4Yrr, X86::VFMADDPS4Ymr, 0 },
+ { X86::VFMADDPS4rr, X86::VFMADDPS4mr, 0 },
+ { X86::VFMADDSD4rr, X86::VFMADDSD4mr, 0 },
+ { X86::VFMADDSD4rr_Int, X86::VFMADDSD4mr_Int, TB_NO_REVERSE },
+ { X86::VFMADDSS4rr, X86::VFMADDSS4mr, 0 },
+ { X86::VFMADDSS4rr_Int, X86::VFMADDSS4mr_Int, TB_NO_REVERSE },
+ { X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Ymr, 0 },
+ { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, 0 },
+ { X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Ymr, 0 },
+ { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, 0 },
+ { X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Ymr, 0 },
+ { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, 0 },
+ { X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Ymr, 0 },
+ { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, 0 },
+ { X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Ymr, 0 },
+ { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, 0 },
+ { X86::VFMSUBPS4Yrr, X86::VFMSUBPS4Ymr, 0 },
+ { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, 0 },
+ { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, 0 },
+ { X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4mr_Int, TB_NO_REVERSE },
+ { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, 0 },
+ { X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4mr_Int, TB_NO_REVERSE },
+ { X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Ymr, 0 },
+ { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, 0 },
+ { X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Ymr, 0 },
+ { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, 0 },
+ { X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, 0 },
+ { X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4mr_Int, TB_NO_REVERSE },
+ { X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, 0 },
+ { X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4mr_Int, TB_NO_REVERSE },
+ { X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Ymr, 0 },
+ { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, 0 },
+ { X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Ymr, 0 },
+ { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, 0 },
+ { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, 0 },
+ { X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4mr_Int, TB_NO_REVERSE },
+ { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, 0 },
+ { X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4mr_Int, TB_NO_REVERSE },
+ { X86::VFPCLASSPDZ128rrk, X86::VFPCLASSPDZ128rmk, 0 },
+ { X86::VFPCLASSPDZ256rrk, X86::VFPCLASSPDZ256rmk, 0 },
+ { X86::VFPCLASSPDZrrk, X86::VFPCLASSPDZrmk, 0 },
+ { X86::VFPCLASSPSZ128rrk, X86::VFPCLASSPSZ128rmk, 0 },
+ { X86::VFPCLASSPSZ256rrk, X86::VFPCLASSPSZ256rmk, 0 },
+ { X86::VFPCLASSPSZrrk, X86::VFPCLASSPSZrmk, 0 },
+ { X86::VFPCLASSSDZrrk, X86::VFPCLASSSDZrmk, TB_NO_REVERSE },
+ { X86::VFPCLASSSSZrrk, X86::VFPCLASSSSZrmk, TB_NO_REVERSE },
+ { X86::VGETEXPPDZ128rkz, X86::VGETEXPPDZ128mkz, 0 },
+ { X86::VGETEXPPDZ256rkz, X86::VGETEXPPDZ256mkz, 0 },
+ { X86::VGETEXPPDZrkz, X86::VGETEXPPDZmkz, 0 },
+ { X86::VGETEXPPSZ128rkz, X86::VGETEXPPSZ128mkz, 0 },
+ { X86::VGETEXPPSZ256rkz, X86::VGETEXPPSZ256mkz, 0 },
+ { X86::VGETEXPPSZrkz, X86::VGETEXPPSZmkz, 0 },
+ { X86::VGETEXPSDZr, X86::VGETEXPSDZm, TB_NO_REVERSE },
+ { X86::VGETEXPSSZr, X86::VGETEXPSSZm, TB_NO_REVERSE },
+ { X86::VGETMANTPDZ128rrikz, X86::VGETMANTPDZ128rmikz, 0 },
+ { X86::VGETMANTPDZ256rrikz, X86::VGETMANTPDZ256rmikz, 0 },
+ { X86::VGETMANTPDZrrikz, X86::VGETMANTPDZrmikz, 0 },
+ { X86::VGETMANTPSZ128rrikz, X86::VGETMANTPSZ128rmikz, 0 },
+ { X86::VGETMANTPSZ256rrikz, X86::VGETMANTPSZ256rmikz, 0 },
+ { X86::VGETMANTPSZrrikz, X86::VGETMANTPSZrmikz, 0 },
+ { X86::VGETMANTSDZrri, X86::VGETMANTSDZrmi, TB_NO_REVERSE },
+ { X86::VGETMANTSSZrri, X86::VGETMANTSSZrmi, TB_NO_REVERSE },
+ { X86::VGF2P8AFFINEINVQBYrri, X86::VGF2P8AFFINEINVQBYrmi, 0 },
+ { X86::VGF2P8AFFINEINVQBZ128rri, X86::VGF2P8AFFINEINVQBZ128rmi, 0 },
+ { X86::VGF2P8AFFINEINVQBZ256rri, X86::VGF2P8AFFINEINVQBZ256rmi, 0 },
+ { X86::VGF2P8AFFINEINVQBZrri, X86::VGF2P8AFFINEINVQBZrmi, 0 },
+ { X86::VGF2P8AFFINEINVQBrri, X86::VGF2P8AFFINEINVQBrmi, 0 },
+ { X86::VGF2P8AFFINEQBYrri, X86::VGF2P8AFFINEQBYrmi, 0 },
+ { X86::VGF2P8AFFINEQBZ128rri, X86::VGF2P8AFFINEQBZ128rmi, 0 },
+ { X86::VGF2P8AFFINEQBZ256rri, X86::VGF2P8AFFINEQBZ256rmi, 0 },
+ { X86::VGF2P8AFFINEQBZrri, X86::VGF2P8AFFINEQBZrmi, 0 },
+ { X86::VGF2P8AFFINEQBrri, X86::VGF2P8AFFINEQBrmi, 0 },
+ { X86::VGF2P8MULBYrr, X86::VGF2P8MULBYrm, 0 },
+ { X86::VGF2P8MULBZ128rr, X86::VGF2P8MULBZ128rm, 0 },
+ { X86::VGF2P8MULBZ256rr, X86::VGF2P8MULBZ256rm, 0 },
+ { X86::VGF2P8MULBZrr, X86::VGF2P8MULBZrm, 0 },
+ { X86::VGF2P8MULBrr, X86::VGF2P8MULBrm, 0 },
+ { X86::VHADDPDYrr, X86::VHADDPDYrm, 0 },
+ { X86::VHADDPDrr, X86::VHADDPDrm, 0 },
+ { X86::VHADDPSYrr, X86::VHADDPSYrm, 0 },
+ { X86::VHADDPSrr, X86::VHADDPSrm, 0 },
+ { X86::VHSUBPDYrr, X86::VHSUBPDYrm, 0 },
+ { X86::VHSUBPDrr, X86::VHSUBPDrm, 0 },
+ { X86::VHSUBPSYrr, X86::VHSUBPSYrm, 0 },
+ { X86::VHSUBPSrr, X86::VHSUBPSrm, 0 },
+ { X86::VINSERTF128rr, X86::VINSERTF128rm, 0 },
+ { X86::VINSERTF32x4Z256rr, X86::VINSERTF32x4Z256rm, 0 },
+ { X86::VINSERTF32x4Zrr, X86::VINSERTF32x4Zrm, 0 },
+ { X86::VINSERTF32x8Zrr, X86::VINSERTF32x8Zrm, 0 },
+ { X86::VINSERTF64x2Z256rr, X86::VINSERTF64x2Z256rm, 0 },
+ { X86::VINSERTF64x2Zrr, X86::VINSERTF64x2Zrm, 0 },
+ { X86::VINSERTF64x4Zrr, X86::VINSERTF64x4Zrm, 0 },
+ { X86::VINSERTI128rr, X86::VINSERTI128rm, 0 },
+ { X86::VINSERTI32x4Z256rr, X86::VINSERTI32x4Z256rm, 0 },
+ { X86::VINSERTI32x4Zrr, X86::VINSERTI32x4Zrm, 0 },
+ { X86::VINSERTI32x8Zrr, X86::VINSERTI32x8Zrm, 0 },
+ { X86::VINSERTI64x2Z256rr, X86::VINSERTI64x2Z256rm, 0 },
+ { X86::VINSERTI64x2Zrr, X86::VINSERTI64x2Zrm, 0 },
+ { X86::VINSERTI64x4Zrr, X86::VINSERTI64x4Zrm, 0 },
+ { X86::VMAXCPDYrr, X86::VMAXCPDYrm, 0 },
+ { X86::VMAXCPDZ128rr, X86::VMAXCPDZ128rm, 0 },
+ { X86::VMAXCPDZ256rr, X86::VMAXCPDZ256rm, 0 },
+ { X86::VMAXCPDZrr, X86::VMAXCPDZrm, 0 },
+ { X86::VMAXCPDrr, X86::VMAXCPDrm, 0 },
+ { X86::VMAXCPSYrr, X86::VMAXCPSYrm, 0 },
+ { X86::VMAXCPSZ128rr, X86::VMAXCPSZ128rm, 0 },
+ { X86::VMAXCPSZ256rr, X86::VMAXCPSZ256rm, 0 },
+ { X86::VMAXCPSZrr, X86::VMAXCPSZrm, 0 },
+ { X86::VMAXCPSrr, X86::VMAXCPSrm, 0 },
+ { X86::VMAXCSDZrr, X86::VMAXCSDZrm, 0 },
+ { X86::VMAXCSDrr, X86::VMAXCSDrm, 0 },
+ { X86::VMAXCSSZrr, X86::VMAXCSSZrm, 0 },
+ { X86::VMAXCSSrr, X86::VMAXCSSrm, 0 },
+ { X86::VMAXPDYrr, X86::VMAXPDYrm, 0 },
+ { X86::VMAXPDZ128rr, X86::VMAXPDZ128rm, 0 },
+ { X86::VMAXPDZ256rr, X86::VMAXPDZ256rm, 0 },
+ { X86::VMAXPDZrr, X86::VMAXPDZrm, 0 },
+ { X86::VMAXPDrr, X86::VMAXPDrm, 0 },
+ { X86::VMAXPSYrr, X86::VMAXPSYrm, 0 },
+ { X86::VMAXPSZ128rr, X86::VMAXPSZ128rm, 0 },
+ { X86::VMAXPSZ256rr, X86::VMAXPSZ256rm, 0 },
+ { X86::VMAXPSZrr, X86::VMAXPSZrm, 0 },
+ { X86::VMAXPSrr, X86::VMAXPSrm, 0 },
+ { X86::VMAXSDZrr, X86::VMAXSDZrm, 0 },
+ { X86::VMAXSDZrr_Int, X86::VMAXSDZrm_Int, TB_NO_REVERSE },
+ { X86::VMAXSDrr, X86::VMAXSDrm, 0 },
+ { X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, TB_NO_REVERSE },
+ { X86::VMAXSSZrr, X86::VMAXSSZrm, 0 },
+ { X86::VMAXSSZrr_Int, X86::VMAXSSZrm_Int, TB_NO_REVERSE },
+ { X86::VMAXSSrr, X86::VMAXSSrm, 0 },
+ { X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, TB_NO_REVERSE },
+ { X86::VMINCPDYrr, X86::VMINCPDYrm, 0 },
+ { X86::VMINCPDZ128rr, X86::VMINCPDZ128rm, 0 },
+ { X86::VMINCPDZ256rr, X86::VMINCPDZ256rm, 0 },
+ { X86::VMINCPDZrr, X86::VMINCPDZrm, 0 },
+ { X86::VMINCPDrr, X86::VMINCPDrm, 0 },
+ { X86::VMINCPSYrr, X86::VMINCPSYrm, 0 },
+ { X86::VMINCPSZ128rr, X86::VMINCPSZ128rm, 0 },
+ { X86::VMINCPSZ256rr, X86::VMINCPSZ256rm, 0 },
+ { X86::VMINCPSZrr, X86::VMINCPSZrm, 0 },
+ { X86::VMINCPSrr, X86::VMINCPSrm, 0 },
+ { X86::VMINCSDZrr, X86::VMINCSDZrm, 0 },
+ { X86::VMINCSDrr, X86::VMINCSDrm, 0 },
+ { X86::VMINCSSZrr, X86::VMINCSSZrm, 0 },
+ { X86::VMINCSSrr, X86::VMINCSSrm, 0 },
+ { X86::VMINPDYrr, X86::VMINPDYrm, 0 },
+ { X86::VMINPDZ128rr, X86::VMINPDZ128rm, 0 },
+ { X86::VMINPDZ256rr, X86::VMINPDZ256rm, 0 },
+ { X86::VMINPDZrr, X86::VMINPDZrm, 0 },
+ { X86::VMINPDrr, X86::VMINPDrm, 0 },
+ { X86::VMINPSYrr, X86::VMINPSYrm, 0 },
+ { X86::VMINPSZ128rr, X86::VMINPSZ128rm, 0 },
+ { X86::VMINPSZ256rr, X86::VMINPSZ256rm, 0 },
+ { X86::VMINPSZrr, X86::VMINPSZrm, 0 },
+ { X86::VMINPSrr, X86::VMINPSrm, 0 },
+ { X86::VMINSDZrr, X86::VMINSDZrm, 0 },
+ { X86::VMINSDZrr_Int, X86::VMINSDZrm_Int, TB_NO_REVERSE },
+ { X86::VMINSDrr, X86::VMINSDrm, 0 },
+ { X86::VMINSDrr_Int, X86::VMINSDrm_Int, TB_NO_REVERSE },
+ { X86::VMINSSZrr, X86::VMINSSZrm, 0 },
+ { X86::VMINSSZrr_Int, X86::VMINSSZrm_Int, TB_NO_REVERSE },
+ { X86::VMINSSrr, X86::VMINSSrm, 0 },
+ { X86::VMINSSrr_Int, X86::VMINSSrm_Int, TB_NO_REVERSE },
+ { X86::VMOVAPDZ128rrkz, X86::VMOVAPDZ128rmkz, TB_NO_REVERSE | TB_ALIGN_16 },
+ { X86::VMOVAPDZ256rrkz, X86::VMOVAPDZ256rmkz, TB_NO_REVERSE | TB_ALIGN_32 },
+ { X86::VMOVAPDZrrkz, X86::VMOVAPDZrmkz, TB_NO_REVERSE | TB_ALIGN_64 },
+ { X86::VMOVAPSZ128rrkz, X86::VMOVAPSZ128rmkz, TB_NO_REVERSE | TB_ALIGN_16 },
+ { X86::VMOVAPSZ256rrkz, X86::VMOVAPSZ256rmkz, TB_NO_REVERSE | TB_ALIGN_32 },
+ { X86::VMOVAPSZrrkz, X86::VMOVAPSZrmkz, TB_NO_REVERSE | TB_ALIGN_64 },
+ { X86::VMOVDDUPZ128rrkz, X86::VMOVDDUPZ128rmkz, TB_NO_REVERSE },
+ { X86::VMOVDDUPZ256rrkz, X86::VMOVDDUPZ256rmkz, 0 },
+ { X86::VMOVDDUPZrrkz, X86::VMOVDDUPZrmkz, 0 },
+ { X86::VMOVDQA32Z128rrkz, X86::VMOVDQA32Z128rmkz, TB_NO_REVERSE | TB_ALIGN_16 },
+ { X86::VMOVDQA32Z256rrkz, X86::VMOVDQA32Z256rmkz, TB_NO_REVERSE | TB_ALIGN_32 },
+ { X86::VMOVDQA32Zrrkz, X86::VMOVDQA32Zrmkz, TB_NO_REVERSE | TB_ALIGN_64 },
+ { X86::VMOVDQA64Z128rrkz, X86::VMOVDQA64Z128rmkz, TB_NO_REVERSE | TB_ALIGN_16 },
+ { X86::VMOVDQA64Z256rrkz, X86::VMOVDQA64Z256rmkz, TB_NO_REVERSE | TB_ALIGN_32 },
+ { X86::VMOVDQA64Zrrkz, X86::VMOVDQA64Zrmkz, TB_NO_REVERSE | TB_ALIGN_64 },
+ { X86::VMOVDQU16Z128rrkz, X86::VMOVDQU16Z128rmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU16Z256rrkz, X86::VMOVDQU16Z256rmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU16Zrrkz, X86::VMOVDQU16Zrmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU32Z128rrkz, X86::VMOVDQU32Z128rmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU32Z256rrkz, X86::VMOVDQU32Z256rmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU32Zrrkz, X86::VMOVDQU32Zrmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU64Z128rrkz, X86::VMOVDQU64Z128rmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU64Z256rrkz, X86::VMOVDQU64Z256rmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU64Zrrkz, X86::VMOVDQU64Zrmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU8Z128rrkz, X86::VMOVDQU8Z128rmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU8Z256rrkz, X86::VMOVDQU8Z256rmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU8Zrrkz, X86::VMOVDQU8Zrmkz, TB_NO_REVERSE },
+ { X86::VMOVLHPSZrr, X86::VMOVHPSZ128rm, TB_NO_REVERSE },
+ { X86::VMOVLHPSrr, X86::VMOVHPSrm, TB_NO_REVERSE },
+ { X86::VMOVSHDUPZ128rrkz, X86::VMOVSHDUPZ128rmkz, 0 },
+ { X86::VMOVSHDUPZ256rrkz, X86::VMOVSHDUPZ256rmkz, 0 },
+ { X86::VMOVSHDUPZrrkz, X86::VMOVSHDUPZrmkz, 0 },
+ { X86::VMOVSLDUPZ128rrkz, X86::VMOVSLDUPZ128rmkz, 0 },
+ { X86::VMOVSLDUPZ256rrkz, X86::VMOVSLDUPZ256rmkz, 0 },
+ { X86::VMOVSLDUPZrrkz, X86::VMOVSLDUPZrmkz, 0 },
+ { X86::VMOVUPDZ128rrkz, X86::VMOVUPDZ128rmkz, TB_NO_REVERSE },
+ { X86::VMOVUPDZ256rrkz, X86::VMOVUPDZ256rmkz, TB_NO_REVERSE },
+ { X86::VMOVUPDZrrkz, X86::VMOVUPDZrmkz, TB_NO_REVERSE },
+ { X86::VMOVUPSZ128rrkz, X86::VMOVUPSZ128rmkz, TB_NO_REVERSE },
+ { X86::VMOVUPSZ256rrkz, X86::VMOVUPSZ256rmkz, TB_NO_REVERSE },
+ { X86::VMOVUPSZrrkz, X86::VMOVUPSZrmkz, TB_NO_REVERSE },
+ { X86::VMPSADBWYrri, X86::VMPSADBWYrmi, 0 },
+ { X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 },
+ { X86::VMULPDYrr, X86::VMULPDYrm, 0 },
+ { X86::VMULPDZ128rr, X86::VMULPDZ128rm, 0 },
+ { X86::VMULPDZ256rr, X86::VMULPDZ256rm, 0 },
+ { X86::VMULPDZrr, X86::VMULPDZrm, 0 },
+ { X86::VMULPDrr, X86::VMULPDrm, 0 },
+ { X86::VMULPSYrr, X86::VMULPSYrm, 0 },
+ { X86::VMULPSZ128rr, X86::VMULPSZ128rm, 0 },
+ { X86::VMULPSZ256rr, X86::VMULPSZ256rm, 0 },
+ { X86::VMULPSZrr, X86::VMULPSZrm, 0 },
+ { X86::VMULPSrr, X86::VMULPSrm, 0 },
+ { X86::VMULSDZrr, X86::VMULSDZrm, 0 },
+ { X86::VMULSDZrr_Int, X86::VMULSDZrm_Int, TB_NO_REVERSE },
+ { X86::VMULSDrr, X86::VMULSDrm, 0 },
+ { X86::VMULSDrr_Int, X86::VMULSDrm_Int, TB_NO_REVERSE },
+ { X86::VMULSSZrr, X86::VMULSSZrm, 0 },
+ { X86::VMULSSZrr_Int, X86::VMULSSZrm_Int, TB_NO_REVERSE },
+ { X86::VMULSSrr, X86::VMULSSrm, 0 },
+ { X86::VMULSSrr_Int, X86::VMULSSrm_Int, TB_NO_REVERSE },
+ { X86::VORPDYrr, X86::VORPDYrm, 0 },
+ { X86::VORPDZ128rr, X86::VORPDZ128rm, 0 },
+ { X86::VORPDZ256rr, X86::VORPDZ256rm, 0 },
+ { X86::VORPDZrr, X86::VORPDZrm, 0 },
+ { X86::VORPDrr, X86::VORPDrm, 0 },
+ { X86::VORPSYrr, X86::VORPSYrm, 0 },
+ { X86::VORPSZ128rr, X86::VORPSZ128rm, 0 },
+ { X86::VORPSZ256rr, X86::VORPSZ256rm, 0 },
+ { X86::VORPSZrr, X86::VORPSZrm, 0 },
+ { X86::VORPSrr, X86::VORPSrm, 0 },
+ { X86::VPABSBZ128rrkz, X86::VPABSBZ128rmkz, 0 },
+ { X86::VPABSBZ256rrkz, X86::VPABSBZ256rmkz, 0 },
+ { X86::VPABSBZrrkz, X86::VPABSBZrmkz, 0 },
+ { X86::VPABSDZ128rrkz, X86::VPABSDZ128rmkz, 0 },
+ { X86::VPABSDZ256rrkz, X86::VPABSDZ256rmkz, 0 },
+ { X86::VPABSDZrrkz, X86::VPABSDZrmkz, 0 },
+ { X86::VPABSQZ128rrkz, X86::VPABSQZ128rmkz, 0 },
+ { X86::VPABSQZ256rrkz, X86::VPABSQZ256rmkz, 0 },
+ { X86::VPABSQZrrkz, X86::VPABSQZrmkz, 0 },
+ { X86::VPABSWZ128rrkz, X86::VPABSWZ128rmkz, 0 },
+ { X86::VPABSWZ256rrkz, X86::VPABSWZ256rmkz, 0 },
+ { X86::VPABSWZrrkz, X86::VPABSWZrmkz, 0 },
+ { X86::VPACKSSDWYrr, X86::VPACKSSDWYrm, 0 },
+ { X86::VPACKSSDWZ128rr, X86::VPACKSSDWZ128rm, 0 },
+ { X86::VPACKSSDWZ256rr, X86::VPACKSSDWZ256rm, 0 },
+ { X86::VPACKSSDWZrr, X86::VPACKSSDWZrm, 0 },
+ { X86::VPACKSSDWrr, X86::VPACKSSDWrm, 0 },
+ { X86::VPACKSSWBYrr, X86::VPACKSSWBYrm, 0 },
+ { X86::VPACKSSWBZ128rr, X86::VPACKSSWBZ128rm, 0 },
+ { X86::VPACKSSWBZ256rr, X86::VPACKSSWBZ256rm, 0 },
+ { X86::VPACKSSWBZrr, X86::VPACKSSWBZrm, 0 },
+ { X86::VPACKSSWBrr, X86::VPACKSSWBrm, 0 },
+ { X86::VPACKUSDWYrr, X86::VPACKUSDWYrm, 0 },
+ { X86::VPACKUSDWZ128rr, X86::VPACKUSDWZ128rm, 0 },
+ { X86::VPACKUSDWZ256rr, X86::VPACKUSDWZ256rm, 0 },
+ { X86::VPACKUSDWZrr, X86::VPACKUSDWZrm, 0 },
+ { X86::VPACKUSDWrr, X86::VPACKUSDWrm, 0 },
+ { X86::VPACKUSWBYrr, X86::VPACKUSWBYrm, 0 },
+ { X86::VPACKUSWBZ128rr, X86::VPACKUSWBZ128rm, 0 },
+ { X86::VPACKUSWBZ256rr, X86::VPACKUSWBZ256rm, 0 },
+ { X86::VPACKUSWBZrr, X86::VPACKUSWBZrm, 0 },
+ { X86::VPACKUSWBrr, X86::VPACKUSWBrm, 0 },
+ { X86::VPADDBYrr, X86::VPADDBYrm, 0 },
+ { X86::VPADDBZ128rr, X86::VPADDBZ128rm, 0 },
+ { X86::VPADDBZ256rr, X86::VPADDBZ256rm, 0 },
+ { X86::VPADDBZrr, X86::VPADDBZrm, 0 },
+ { X86::VPADDBrr, X86::VPADDBrm, 0 },
+ { X86::VPADDDYrr, X86::VPADDDYrm, 0 },
+ { X86::VPADDDZ128rr, X86::VPADDDZ128rm, 0 },
+ { X86::VPADDDZ256rr, X86::VPADDDZ256rm, 0 },
+ { X86::VPADDDZrr, X86::VPADDDZrm, 0 },
+ { X86::VPADDDrr, X86::VPADDDrm, 0 },
+ { X86::VPADDQYrr, X86::VPADDQYrm, 0 },
+ { X86::VPADDQZ128rr, X86::VPADDQZ128rm, 0 },
+ { X86::VPADDQZ256rr, X86::VPADDQZ256rm, 0 },
+ { X86::VPADDQZrr, X86::VPADDQZrm, 0 },
+ { X86::VPADDQrr, X86::VPADDQrm, 0 },
+ { X86::VPADDSBYrr, X86::VPADDSBYrm, 0 },
+ { X86::VPADDSBZ128rr, X86::VPADDSBZ128rm, 0 },
+ { X86::VPADDSBZ256rr, X86::VPADDSBZ256rm, 0 },
+ { X86::VPADDSBZrr, X86::VPADDSBZrm, 0 },
+ { X86::VPADDSBrr, X86::VPADDSBrm, 0 },
+ { X86::VPADDSWYrr, X86::VPADDSWYrm, 0 },
+ { X86::VPADDSWZ128rr, X86::VPADDSWZ128rm, 0 },
+ { X86::VPADDSWZ256rr, X86::VPADDSWZ256rm, 0 },
+ { X86::VPADDSWZrr, X86::VPADDSWZrm, 0 },
+ { X86::VPADDSWrr, X86::VPADDSWrm, 0 },
+ { X86::VPADDUSBYrr, X86::VPADDUSBYrm, 0 },
+ { X86::VPADDUSBZ128rr, X86::VPADDUSBZ128rm, 0 },
+ { X86::VPADDUSBZ256rr, X86::VPADDUSBZ256rm, 0 },
+ { X86::VPADDUSBZrr, X86::VPADDUSBZrm, 0 },
+ { X86::VPADDUSBrr, X86::VPADDUSBrm, 0 },
+ { X86::VPADDUSWYrr, X86::VPADDUSWYrm, 0 },
+ { X86::VPADDUSWZ128rr, X86::VPADDUSWZ128rm, 0 },
+ { X86::VPADDUSWZ256rr, X86::VPADDUSWZ256rm, 0 },
+ { X86::VPADDUSWZrr, X86::VPADDUSWZrm, 0 },
+ { X86::VPADDUSWrr, X86::VPADDUSWrm, 0 },
+ { X86::VPADDWYrr, X86::VPADDWYrm, 0 },
+ { X86::VPADDWZ128rr, X86::VPADDWZ128rm, 0 },
+ { X86::VPADDWZ256rr, X86::VPADDWZ256rm, 0 },
+ { X86::VPADDWZrr, X86::VPADDWZrm, 0 },
+ { X86::VPADDWrr, X86::VPADDWrm, 0 },
+ { X86::VPALIGNRYrri, X86::VPALIGNRYrmi, 0 },
+ { X86::VPALIGNRZ128rri, X86::VPALIGNRZ128rmi, 0 },
+ { X86::VPALIGNRZ256rri, X86::VPALIGNRZ256rmi, 0 },
+ { X86::VPALIGNRZrri, X86::VPALIGNRZrmi, 0 },
+ { X86::VPALIGNRrri, X86::VPALIGNRrmi, 0 },
+ { X86::VPANDDZ128rr, X86::VPANDDZ128rm, 0 },
+ { X86::VPANDDZ256rr, X86::VPANDDZ256rm, 0 },
+ { X86::VPANDDZrr, X86::VPANDDZrm, 0 },
+ { X86::VPANDNDZ128rr, X86::VPANDNDZ128rm, 0 },
+ { X86::VPANDNDZ256rr, X86::VPANDNDZ256rm, 0 },
+ { X86::VPANDNDZrr, X86::VPANDNDZrm, 0 },
+ { X86::VPANDNQZ128rr, X86::VPANDNQZ128rm, 0 },
+ { X86::VPANDNQZ256rr, X86::VPANDNQZ256rm, 0 },
+ { X86::VPANDNQZrr, X86::VPANDNQZrm, 0 },
+ { X86::VPANDNYrr, X86::VPANDNYrm, 0 },
+ { X86::VPANDNrr, X86::VPANDNrm, 0 },
+ { X86::VPANDQZ128rr, X86::VPANDQZ128rm, 0 },
+ { X86::VPANDQZ256rr, X86::VPANDQZ256rm, 0 },
+ { X86::VPANDQZrr, X86::VPANDQZrm, 0 },
+ { X86::VPANDYrr, X86::VPANDYrm, 0 },
+ { X86::VPANDrr, X86::VPANDrm, 0 },
+ { X86::VPAVGBYrr, X86::VPAVGBYrm, 0 },
+ { X86::VPAVGBZ128rr, X86::VPAVGBZ128rm, 0 },
+ { X86::VPAVGBZ256rr, X86::VPAVGBZ256rm, 0 },
+ { X86::VPAVGBZrr, X86::VPAVGBZrm, 0 },
+ { X86::VPAVGBrr, X86::VPAVGBrm, 0 },
+ { X86::VPAVGWYrr, X86::VPAVGWYrm, 0 },
+ { X86::VPAVGWZ128rr, X86::VPAVGWZ128rm, 0 },
+ { X86::VPAVGWZ256rr, X86::VPAVGWZ256rm, 0 },
+ { X86::VPAVGWZrr, X86::VPAVGWZrm, 0 },
+ { X86::VPAVGWrr, X86::VPAVGWrm, 0 },
+ { X86::VPBLENDDYrri, X86::VPBLENDDYrmi, 0 },
+ { X86::VPBLENDDrri, X86::VPBLENDDrmi, 0 },
+ { X86::VPBLENDMBZ128rr, X86::VPBLENDMBZ128rm, 0 },
+ { X86::VPBLENDMBZ256rr, X86::VPBLENDMBZ256rm, 0 },
+ { X86::VPBLENDMBZrr, X86::VPBLENDMBZrm, 0 },
+ { X86::VPBLENDMDZ128rr, X86::VPBLENDMDZ128rm, 0 },
+ { X86::VPBLENDMDZ256rr, X86::VPBLENDMDZ256rm, 0 },
+ { X86::VPBLENDMDZrr, X86::VPBLENDMDZrm, 0 },
+ { X86::VPBLENDMQZ128rr, X86::VPBLENDMQZ128rm, 0 },
+ { X86::VPBLENDMQZ256rr, X86::VPBLENDMQZ256rm, 0 },
+ { X86::VPBLENDMQZrr, X86::VPBLENDMQZrm, 0 },
+ { X86::VPBLENDMWZ128rr, X86::VPBLENDMWZ128rm, 0 },
+ { X86::VPBLENDMWZ256rr, X86::VPBLENDMWZ256rm, 0 },
+ { X86::VPBLENDMWZrr, X86::VPBLENDMWZrm, 0 },
+ { X86::VPBLENDVBYrr, X86::VPBLENDVBYrm, 0 },
+ { X86::VPBLENDVBrr, X86::VPBLENDVBrm, 0 },
+ { X86::VPBLENDWYrri, X86::VPBLENDWYrmi, 0 },
+ { X86::VPBLENDWrri, X86::VPBLENDWrmi, 0 },
+ { X86::VPBROADCASTBZ128rkz, X86::VPBROADCASTBZ128mkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZ256rkz, X86::VPBROADCASTBZ256mkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZrkz, X86::VPBROADCASTBZmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZ128rkz, X86::VPBROADCASTDZ128mkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZ256rkz, X86::VPBROADCASTDZ256mkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZrkz, X86::VPBROADCASTDZmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZ128rkz, X86::VPBROADCASTQZ128mkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZ256rkz, X86::VPBROADCASTQZ256mkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZrkz, X86::VPBROADCASTQZmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZ128rkz, X86::VPBROADCASTWZ128mkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZ256rkz, X86::VPBROADCASTWZ256mkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZrkz, X86::VPBROADCASTWZmkz, TB_NO_REVERSE },
+ { X86::VPCLMULQDQYrr, X86::VPCLMULQDQYrm, 0 },
+ { X86::VPCLMULQDQZ128rr, X86::VPCLMULQDQZ128rm, 0 },
+ { X86::VPCLMULQDQZ256rr, X86::VPCLMULQDQZ256rm, 0 },
+ { X86::VPCLMULQDQZrr, X86::VPCLMULQDQZrm, 0 },
+ { X86::VPCLMULQDQrr, X86::VPCLMULQDQrm, 0 },
+ { X86::VPCMOVYrrr, X86::VPCMOVYrmr, 0 },
+ { X86::VPCMOVrrr, X86::VPCMOVrmr, 0 },
+ { X86::VPCMPBZ128rri, X86::VPCMPBZ128rmi, 0 },
+ { X86::VPCMPBZ256rri, X86::VPCMPBZ256rmi, 0 },
+ { X86::VPCMPBZrri, X86::VPCMPBZrmi, 0 },
+ { X86::VPCMPDZ128rri, X86::VPCMPDZ128rmi, 0 },
+ { X86::VPCMPDZ256rri, X86::VPCMPDZ256rmi, 0 },
+ { X86::VPCMPDZrri, X86::VPCMPDZrmi, 0 },
+ { X86::VPCMPEQBYrr, X86::VPCMPEQBYrm, 0 },
+ { X86::VPCMPEQBZ128rr, X86::VPCMPEQBZ128rm, 0 },
+ { X86::VPCMPEQBZ256rr, X86::VPCMPEQBZ256rm, 0 },
+ { X86::VPCMPEQBZrr, X86::VPCMPEQBZrm, 0 },
+ { X86::VPCMPEQBrr, X86::VPCMPEQBrm, 0 },
+ { X86::VPCMPEQDYrr, X86::VPCMPEQDYrm, 0 },
+ { X86::VPCMPEQDZ128rr, X86::VPCMPEQDZ128rm, 0 },
+ { X86::VPCMPEQDZ256rr, X86::VPCMPEQDZ256rm, 0 },
+ { X86::VPCMPEQDZrr, X86::VPCMPEQDZrm, 0 },
+ { X86::VPCMPEQDrr, X86::VPCMPEQDrm, 0 },
+ { X86::VPCMPEQQYrr, X86::VPCMPEQQYrm, 0 },
+ { X86::VPCMPEQQZ128rr, X86::VPCMPEQQZ128rm, 0 },
+ { X86::VPCMPEQQZ256rr, X86::VPCMPEQQZ256rm, 0 },
+ { X86::VPCMPEQQZrr, X86::VPCMPEQQZrm, 0 },
+ { X86::VPCMPEQQrr, X86::VPCMPEQQrm, 0 },
+ { X86::VPCMPEQWYrr, X86::VPCMPEQWYrm, 0 },
+ { X86::VPCMPEQWZ128rr, X86::VPCMPEQWZ128rm, 0 },
+ { X86::VPCMPEQWZ256rr, X86::VPCMPEQWZ256rm, 0 },
+ { X86::VPCMPEQWZrr, X86::VPCMPEQWZrm, 0 },
+ { X86::VPCMPEQWrr, X86::VPCMPEQWrm, 0 },
+ { X86::VPCMPGTBYrr, X86::VPCMPGTBYrm, 0 },
+ { X86::VPCMPGTBZ128rr, X86::VPCMPGTBZ128rm, 0 },
+ { X86::VPCMPGTBZ256rr, X86::VPCMPGTBZ256rm, 0 },
+ { X86::VPCMPGTBZrr, X86::VPCMPGTBZrm, 0 },
+ { X86::VPCMPGTBrr, X86::VPCMPGTBrm, 0 },
+ { X86::VPCMPGTDYrr, X86::VPCMPGTDYrm, 0 },
+ { X86::VPCMPGTDZ128rr, X86::VPCMPGTDZ128rm, 0 },
+ { X86::VPCMPGTDZ256rr, X86::VPCMPGTDZ256rm, 0 },
+ { X86::VPCMPGTDZrr, X86::VPCMPGTDZrm, 0 },
+ { X86::VPCMPGTDrr, X86::VPCMPGTDrm, 0 },
+ { X86::VPCMPGTQYrr, X86::VPCMPGTQYrm, 0 },
+ { X86::VPCMPGTQZ128rr, X86::VPCMPGTQZ128rm, 0 },
+ { X86::VPCMPGTQZ256rr, X86::VPCMPGTQZ256rm, 0 },
+ { X86::VPCMPGTQZrr, X86::VPCMPGTQZrm, 0 },
+ { X86::VPCMPGTQrr, X86::VPCMPGTQrm, 0 },
+ { X86::VPCMPGTWYrr, X86::VPCMPGTWYrm, 0 },
+ { X86::VPCMPGTWZ128rr, X86::VPCMPGTWZ128rm, 0 },
+ { X86::VPCMPGTWZ256rr, X86::VPCMPGTWZ256rm, 0 },
+ { X86::VPCMPGTWZrr, X86::VPCMPGTWZrm, 0 },
+ { X86::VPCMPGTWrr, X86::VPCMPGTWrm, 0 },
+ { X86::VPCMPQZ128rri, X86::VPCMPQZ128rmi, 0 },
+ { X86::VPCMPQZ256rri, X86::VPCMPQZ256rmi, 0 },
+ { X86::VPCMPQZrri, X86::VPCMPQZrmi, 0 },
+ { X86::VPCMPUBZ128rri, X86::VPCMPUBZ128rmi, 0 },
+ { X86::VPCMPUBZ256rri, X86::VPCMPUBZ256rmi, 0 },
+ { X86::VPCMPUBZrri, X86::VPCMPUBZrmi, 0 },
+ { X86::VPCMPUDZ128rri, X86::VPCMPUDZ128rmi, 0 },
+ { X86::VPCMPUDZ256rri, X86::VPCMPUDZ256rmi, 0 },
+ { X86::VPCMPUDZrri, X86::VPCMPUDZrmi, 0 },
+ { X86::VPCMPUQZ128rri, X86::VPCMPUQZ128rmi, 0 },
+ { X86::VPCMPUQZ256rri, X86::VPCMPUQZ256rmi, 0 },
+ { X86::VPCMPUQZrri, X86::VPCMPUQZrmi, 0 },
+ { X86::VPCMPUWZ128rri, X86::VPCMPUWZ128rmi, 0 },
+ { X86::VPCMPUWZ256rri, X86::VPCMPUWZ256rmi, 0 },
+ { X86::VPCMPUWZrri, X86::VPCMPUWZrmi, 0 },
+ { X86::VPCMPWZ128rri, X86::VPCMPWZ128rmi, 0 },
+ { X86::VPCMPWZ256rri, X86::VPCMPWZ256rmi, 0 },
+ { X86::VPCMPWZrri, X86::VPCMPWZrmi, 0 },
+ { X86::VPCOMBri, X86::VPCOMBmi, 0 },
+ { X86::VPCOMDri, X86::VPCOMDmi, 0 },
+ { X86::VPCOMQri, X86::VPCOMQmi, 0 },
+ { X86::VPCOMUBri, X86::VPCOMUBmi, 0 },
+ { X86::VPCOMUDri, X86::VPCOMUDmi, 0 },
+ { X86::VPCOMUQri, X86::VPCOMUQmi, 0 },
+ { X86::VPCOMUWri, X86::VPCOMUWmi, 0 },
+ { X86::VPCOMWri, X86::VPCOMWmi, 0 },
+ { X86::VPCONFLICTDZ128rrkz, X86::VPCONFLICTDZ128rmkz, 0 },
+ { X86::VPCONFLICTDZ256rrkz, X86::VPCONFLICTDZ256rmkz, 0 },
+ { X86::VPCONFLICTDZrrkz, X86::VPCONFLICTDZrmkz, 0 },
+ { X86::VPCONFLICTQZ128rrkz, X86::VPCONFLICTQZ128rmkz, 0 },
+ { X86::VPCONFLICTQZ256rrkz, X86::VPCONFLICTQZ256rmkz, 0 },
+ { X86::VPCONFLICTQZrrkz, X86::VPCONFLICTQZrmkz, 0 },
+ { X86::VPERM2F128rr, X86::VPERM2F128rm, 0 },
+ { X86::VPERM2I128rr, X86::VPERM2I128rm, 0 },
+ { X86::VPERMBZ128rr, X86::VPERMBZ128rm, 0 },
+ { X86::VPERMBZ256rr, X86::VPERMBZ256rm, 0 },
+ { X86::VPERMBZrr, X86::VPERMBZrm, 0 },
+ { X86::VPERMDYrr, X86::VPERMDYrm, 0 },
+ { X86::VPERMDZ256rr, X86::VPERMDZ256rm, 0 },
+ { X86::VPERMDZrr, X86::VPERMDZrm, 0 },
+ { X86::VPERMIL2PDYrr, X86::VPERMIL2PDYmr, 0 },
+ { X86::VPERMIL2PDrr, X86::VPERMIL2PDmr, 0 },
+ { X86::VPERMIL2PSYrr, X86::VPERMIL2PSYmr, 0 },
+ { X86::VPERMIL2PSrr, X86::VPERMIL2PSmr, 0 },
+ { X86::VPERMILPDYrr, X86::VPERMILPDYrm, 0 },
+ { X86::VPERMILPDZ128rikz, X86::VPERMILPDZ128mikz, 0 },
+ { X86::VPERMILPDZ128rr, X86::VPERMILPDZ128rm, 0 },
+ { X86::VPERMILPDZ256rikz, X86::VPERMILPDZ256mikz, 0 },
+ { X86::VPERMILPDZ256rr, X86::VPERMILPDZ256rm, 0 },
+ { X86::VPERMILPDZrikz, X86::VPERMILPDZmikz, 0 },
+ { X86::VPERMILPDZrr, X86::VPERMILPDZrm, 0 },
+ { X86::VPERMILPDrr, X86::VPERMILPDrm, 0 },
+ { X86::VPERMILPSYrr, X86::VPERMILPSYrm, 0 },
+ { X86::VPERMILPSZ128rikz, X86::VPERMILPSZ128mikz, 0 },
+ { X86::VPERMILPSZ128rr, X86::VPERMILPSZ128rm, 0 },
+ { X86::VPERMILPSZ256rikz, X86::VPERMILPSZ256mikz, 0 },
+ { X86::VPERMILPSZ256rr, X86::VPERMILPSZ256rm, 0 },
+ { X86::VPERMILPSZrikz, X86::VPERMILPSZmikz, 0 },
+ { X86::VPERMILPSZrr, X86::VPERMILPSZrm, 0 },
+ { X86::VPERMILPSrr, X86::VPERMILPSrm, 0 },
+ { X86::VPERMPDZ256rikz, X86::VPERMPDZ256mikz, 0 },
+ { X86::VPERMPDZ256rr, X86::VPERMPDZ256rm, 0 },
+ { X86::VPERMPDZrikz, X86::VPERMPDZmikz, 0 },
+ { X86::VPERMPDZrr, X86::VPERMPDZrm, 0 },
+ { X86::VPERMPSYrr, X86::VPERMPSYrm, 0 },
+ { X86::VPERMPSZ256rr, X86::VPERMPSZ256rm, 0 },
+ { X86::VPERMPSZrr, X86::VPERMPSZrm, 0 },
+ { X86::VPERMQZ256rikz, X86::VPERMQZ256mikz, 0 },
+ { X86::VPERMQZ256rr, X86::VPERMQZ256rm, 0 },
+ { X86::VPERMQZrikz, X86::VPERMQZmikz, 0 },
+ { X86::VPERMQZrr, X86::VPERMQZrm, 0 },
+ { X86::VPERMWZ128rr, X86::VPERMWZ128rm, 0 },
+ { X86::VPERMWZ256rr, X86::VPERMWZ256rm, 0 },
+ { X86::VPERMWZrr, X86::VPERMWZrm, 0 },
+ { X86::VPEXPANDBZ128rrkz, X86::VPEXPANDBZ128rmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDBZ256rrkz, X86::VPEXPANDBZ256rmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDBZrrkz, X86::VPEXPANDBZrmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDDZ128rrkz, X86::VPEXPANDDZ128rmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDDZ256rrkz, X86::VPEXPANDDZ256rmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDDZrrkz, X86::VPEXPANDDZrmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDQZ128rrkz, X86::VPEXPANDQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDQZ256rrkz, X86::VPEXPANDQZ256rmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDQZrrkz, X86::VPEXPANDQZrmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDWZ128rrkz, X86::VPEXPANDWZ128rmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDWZ256rrkz, X86::VPEXPANDWZ256rmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDWZrrkz, X86::VPEXPANDWZrmkz, TB_NO_REVERSE },
+ { X86::VPHADDDYrr, X86::VPHADDDYrm, 0 },
+ { X86::VPHADDDrr, X86::VPHADDDrm, 0 },
+ { X86::VPHADDSWYrr, X86::VPHADDSWYrm, 0 },
+ { X86::VPHADDSWrr, X86::VPHADDSWrm, 0 },
+ { X86::VPHADDWYrr, X86::VPHADDWYrm, 0 },
+ { X86::VPHADDWrr, X86::VPHADDWrm, 0 },
+ { X86::VPHSUBDYrr, X86::VPHSUBDYrm, 0 },
+ { X86::VPHSUBDrr, X86::VPHSUBDrm, 0 },
+ { X86::VPHSUBSWYrr, X86::VPHSUBSWYrm, 0 },
+ { X86::VPHSUBSWrr, X86::VPHSUBSWrm, 0 },
+ { X86::VPHSUBWYrr, X86::VPHSUBWYrm, 0 },
+ { X86::VPHSUBWrr, X86::VPHSUBWrm, 0 },
+ { X86::VPINSRBZrr, X86::VPINSRBZrm, TB_NO_REVERSE },
+ { X86::VPINSRBrr, X86::VPINSRBrm, TB_NO_REVERSE },
+ { X86::VPINSRDZrr, X86::VPINSRDZrm, 0 },
+ { X86::VPINSRDrr, X86::VPINSRDrm, 0 },
+ { X86::VPINSRQZrr, X86::VPINSRQZrm, 0 },
+ { X86::VPINSRQrr, X86::VPINSRQrm, 0 },
+ { X86::VPINSRWZrr, X86::VPINSRWZrm, TB_NO_REVERSE },
+ { X86::VPINSRWrr, X86::VPINSRWrm, TB_NO_REVERSE },
+ { X86::VPLZCNTDZ128rrkz, X86::VPLZCNTDZ128rmkz, 0 },
+ { X86::VPLZCNTDZ256rrkz, X86::VPLZCNTDZ256rmkz, 0 },
+ { X86::VPLZCNTDZrrkz, X86::VPLZCNTDZrmkz, 0 },
+ { X86::VPLZCNTQZ128rrkz, X86::VPLZCNTQZ128rmkz, 0 },
+ { X86::VPLZCNTQZ256rrkz, X86::VPLZCNTQZ256rmkz, 0 },
+ { X86::VPLZCNTQZrrkz, X86::VPLZCNTQZrmkz, 0 },
+ { X86::VPMACSDDrr, X86::VPMACSDDrm, 0 },
+ { X86::VPMACSDQHrr, X86::VPMACSDQHrm, 0 },
+ { X86::VPMACSDQLrr, X86::VPMACSDQLrm, 0 },
+ { X86::VPMACSSDDrr, X86::VPMACSSDDrm, 0 },
+ { X86::VPMACSSDQHrr, X86::VPMACSSDQHrm, 0 },
+ { X86::VPMACSSDQLrr, X86::VPMACSSDQLrm, 0 },
+ { X86::VPMACSSWDrr, X86::VPMACSSWDrm, 0 },
+ { X86::VPMACSSWWrr, X86::VPMACSSWWrm, 0 },
+ { X86::VPMACSWDrr, X86::VPMACSWDrm, 0 },
+ { X86::VPMACSWWrr, X86::VPMACSWWrm, 0 },
+ { X86::VPMADCSSWDrr, X86::VPMADCSSWDrm, 0 },
+ { X86::VPMADCSWDrr, X86::VPMADCSWDrm, 0 },
+ { X86::VPMADDUBSWYrr, X86::VPMADDUBSWYrm, 0 },
+ { X86::VPMADDUBSWZ128rr, X86::VPMADDUBSWZ128rm, 0 },
+ { X86::VPMADDUBSWZ256rr, X86::VPMADDUBSWZ256rm, 0 },
+ { X86::VPMADDUBSWZrr, X86::VPMADDUBSWZrm, 0 },
+ { X86::VPMADDUBSWrr, X86::VPMADDUBSWrm, 0 },
+ { X86::VPMADDWDYrr, X86::VPMADDWDYrm, 0 },
+ { X86::VPMADDWDZ128rr, X86::VPMADDWDZ128rm, 0 },
+ { X86::VPMADDWDZ256rr, X86::VPMADDWDZ256rm, 0 },
+ { X86::VPMADDWDZrr, X86::VPMADDWDZrm, 0 },
+ { X86::VPMADDWDrr, X86::VPMADDWDrm, 0 },
+ { X86::VPMAXSBYrr, X86::VPMAXSBYrm, 0 },
+ { X86::VPMAXSBZ128rr, X86::VPMAXSBZ128rm, 0 },
+ { X86::VPMAXSBZ256rr, X86::VPMAXSBZ256rm, 0 },
+ { X86::VPMAXSBZrr, X86::VPMAXSBZrm, 0 },
+ { X86::VPMAXSBrr, X86::VPMAXSBrm, 0 },
+ { X86::VPMAXSDYrr, X86::VPMAXSDYrm, 0 },
+ { X86::VPMAXSDZ128rr, X86::VPMAXSDZ128rm, 0 },
+ { X86::VPMAXSDZ256rr, X86::VPMAXSDZ256rm, 0 },
+ { X86::VPMAXSDZrr, X86::VPMAXSDZrm, 0 },
+ { X86::VPMAXSDrr, X86::VPMAXSDrm, 0 },
+ { X86::VPMAXSQZ128rr, X86::VPMAXSQZ128rm, 0 },
+ { X86::VPMAXSQZ256rr, X86::VPMAXSQZ256rm, 0 },
+ { X86::VPMAXSQZrr, X86::VPMAXSQZrm, 0 },
+ { X86::VPMAXSWYrr, X86::VPMAXSWYrm, 0 },
+ { X86::VPMAXSWZ128rr, X86::VPMAXSWZ128rm, 0 },
+ { X86::VPMAXSWZ256rr, X86::VPMAXSWZ256rm, 0 },
+ { X86::VPMAXSWZrr, X86::VPMAXSWZrm, 0 },
+ { X86::VPMAXSWrr, X86::VPMAXSWrm, 0 },
+ { X86::VPMAXUBYrr, X86::VPMAXUBYrm, 0 },
+ { X86::VPMAXUBZ128rr, X86::VPMAXUBZ128rm, 0 },
+ { X86::VPMAXUBZ256rr, X86::VPMAXUBZ256rm, 0 },
+ { X86::VPMAXUBZrr, X86::VPMAXUBZrm, 0 },
+ { X86::VPMAXUBrr, X86::VPMAXUBrm, 0 },
+ { X86::VPMAXUDYrr, X86::VPMAXUDYrm, 0 },
+ { X86::VPMAXUDZ128rr, X86::VPMAXUDZ128rm, 0 },
+ { X86::VPMAXUDZ256rr, X86::VPMAXUDZ256rm, 0 },
+ { X86::VPMAXUDZrr, X86::VPMAXUDZrm, 0 },
+ { X86::VPMAXUDrr, X86::VPMAXUDrm, 0 },
+ { X86::VPMAXUQZ128rr, X86::VPMAXUQZ128rm, 0 },
+ { X86::VPMAXUQZ256rr, X86::VPMAXUQZ256rm, 0 },
+ { X86::VPMAXUQZrr, X86::VPMAXUQZrm, 0 },
+ { X86::VPMAXUWYrr, X86::VPMAXUWYrm, 0 },
+ { X86::VPMAXUWZ128rr, X86::VPMAXUWZ128rm, 0 },
+ { X86::VPMAXUWZ256rr, X86::VPMAXUWZ256rm, 0 },
+ { X86::VPMAXUWZrr, X86::VPMAXUWZrm, 0 },
+ { X86::VPMAXUWrr, X86::VPMAXUWrm, 0 },
+ { X86::VPMINSBYrr, X86::VPMINSBYrm, 0 },
+ { X86::VPMINSBZ128rr, X86::VPMINSBZ128rm, 0 },
+ { X86::VPMINSBZ256rr, X86::VPMINSBZ256rm, 0 },
+ { X86::VPMINSBZrr, X86::VPMINSBZrm, 0 },
+ { X86::VPMINSBrr, X86::VPMINSBrm, 0 },
+ { X86::VPMINSDYrr, X86::VPMINSDYrm, 0 },
+ { X86::VPMINSDZ128rr, X86::VPMINSDZ128rm, 0 },
+ { X86::VPMINSDZ256rr, X86::VPMINSDZ256rm, 0 },
+ { X86::VPMINSDZrr, X86::VPMINSDZrm, 0 },
+ { X86::VPMINSDrr, X86::VPMINSDrm, 0 },
+ { X86::VPMINSQZ128rr, X86::VPMINSQZ128rm, 0 },
+ { X86::VPMINSQZ256rr, X86::VPMINSQZ256rm, 0 },
+ { X86::VPMINSQZrr, X86::VPMINSQZrm, 0 },
+ { X86::VPMINSWYrr, X86::VPMINSWYrm, 0 },
+ { X86::VPMINSWZ128rr, X86::VPMINSWZ128rm, 0 },
+ { X86::VPMINSWZ256rr, X86::VPMINSWZ256rm, 0 },
+ { X86::VPMINSWZrr, X86::VPMINSWZrm, 0 },
+ { X86::VPMINSWrr, X86::VPMINSWrm, 0 },
+ { X86::VPMINUBYrr, X86::VPMINUBYrm, 0 },
+ { X86::VPMINUBZ128rr, X86::VPMINUBZ128rm, 0 },
+ { X86::VPMINUBZ256rr, X86::VPMINUBZ256rm, 0 },
+ { X86::VPMINUBZrr, X86::VPMINUBZrm, 0 },
+ { X86::VPMINUBrr, X86::VPMINUBrm, 0 },
+ { X86::VPMINUDYrr, X86::VPMINUDYrm, 0 },
+ { X86::VPMINUDZ128rr, X86::VPMINUDZ128rm, 0 },
+ { X86::VPMINUDZ256rr, X86::VPMINUDZ256rm, 0 },
+ { X86::VPMINUDZrr, X86::VPMINUDZrm, 0 },
+ { X86::VPMINUDrr, X86::VPMINUDrm, 0 },
+ { X86::VPMINUQZ128rr, X86::VPMINUQZ128rm, 0 },
+ { X86::VPMINUQZ256rr, X86::VPMINUQZ256rm, 0 },
+ { X86::VPMINUQZrr, X86::VPMINUQZrm, 0 },
+ { X86::VPMINUWYrr, X86::VPMINUWYrm, 0 },
+ { X86::VPMINUWZ128rr, X86::VPMINUWZ128rm, 0 },
+ { X86::VPMINUWZ256rr, X86::VPMINUWZ256rm, 0 },
+ { X86::VPMINUWZrr, X86::VPMINUWZrm, 0 },
+ { X86::VPMINUWrr, X86::VPMINUWrm, 0 },
+ { X86::VPMOVSXBDZ128rrkz, X86::VPMOVSXBDZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBDZ256rrkz, X86::VPMOVSXBDZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBDZrrkz, X86::VPMOVSXBDZrmkz, 0 },
+ { X86::VPMOVSXBQZ128rrkz, X86::VPMOVSXBQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZ256rrkz, X86::VPMOVSXBQZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZrrkz, X86::VPMOVSXBQZrmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ128rrkz, X86::VPMOVSXBWZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ256rrkz, X86::VPMOVSXBWZ256rmkz, 0 },
+ { X86::VPMOVSXBWZrrkz, X86::VPMOVSXBWZrmkz, 0 },
+ { X86::VPMOVSXDQZ128rrkz, X86::VPMOVSXDQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXDQZ256rrkz, X86::VPMOVSXDQZ256rmkz, 0 },
+ { X86::VPMOVSXDQZrrkz, X86::VPMOVSXDQZrmkz, 0 },
+ { X86::VPMOVSXWDZ128rrkz, X86::VPMOVSXWDZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXWDZ256rrkz, X86::VPMOVSXWDZ256rmkz, 0 },
+ { X86::VPMOVSXWDZrrkz, X86::VPMOVSXWDZrmkz, 0 },
+ { X86::VPMOVSXWQZ128rrkz, X86::VPMOVSXWQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXWQZ256rrkz, X86::VPMOVSXWQZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXWQZrrkz, X86::VPMOVSXWQZrmkz, 0 },
+ { X86::VPMOVZXBDZ128rrkz, X86::VPMOVZXBDZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZ256rrkz, X86::VPMOVZXBDZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZrrkz, X86::VPMOVZXBDZrmkz, 0 },
+ { X86::VPMOVZXBQZ128rrkz, X86::VPMOVZXBQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZ256rrkz, X86::VPMOVZXBQZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZrrkz, X86::VPMOVZXBQZrmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ128rrkz, X86::VPMOVZXBWZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ256rrkz, X86::VPMOVZXBWZ256rmkz, 0 },
+ { X86::VPMOVZXBWZrrkz, X86::VPMOVZXBWZrmkz, 0 },
+ { X86::VPMOVZXDQZ128rrkz, X86::VPMOVZXDQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXDQZ256rrkz, X86::VPMOVZXDQZ256rmkz, 0 },
+ { X86::VPMOVZXDQZrrkz, X86::VPMOVZXDQZrmkz, 0 },
+ { X86::VPMOVZXWDZ128rrkz, X86::VPMOVZXWDZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXWDZ256rrkz, X86::VPMOVZXWDZ256rmkz, 0 },
+ { X86::VPMOVZXWDZrrkz, X86::VPMOVZXWDZrmkz, 0 },
+ { X86::VPMOVZXWQZ128rrkz, X86::VPMOVZXWQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXWQZ256rrkz, X86::VPMOVZXWQZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXWQZrrkz, X86::VPMOVZXWQZrmkz, 0 },
+ { X86::VPMULDQYrr, X86::VPMULDQYrm, 0 },
+ { X86::VPMULDQZ128rr, X86::VPMULDQZ128rm, 0 },
+ { X86::VPMULDQZ256rr, X86::VPMULDQZ256rm, 0 },
+ { X86::VPMULDQZrr, X86::VPMULDQZrm, 0 },
+ { X86::VPMULDQrr, X86::VPMULDQrm, 0 },
+ { X86::VPMULHRSWYrr, X86::VPMULHRSWYrm, 0 },
+ { X86::VPMULHRSWZ128rr, X86::VPMULHRSWZ128rm, 0 },
+ { X86::VPMULHRSWZ256rr, X86::VPMULHRSWZ256rm, 0 },
+ { X86::VPMULHRSWZrr, X86::VPMULHRSWZrm, 0 },
+ { X86::VPMULHRSWrr, X86::VPMULHRSWrm, 0 },
+ { X86::VPMULHUWYrr, X86::VPMULHUWYrm, 0 },
+ { X86::VPMULHUWZ128rr, X86::VPMULHUWZ128rm, 0 },
+ { X86::VPMULHUWZ256rr, X86::VPMULHUWZ256rm, 0 },
+ { X86::VPMULHUWZrr, X86::VPMULHUWZrm, 0 },
+ { X86::VPMULHUWrr, X86::VPMULHUWrm, 0 },
+ { X86::VPMULHWYrr, X86::VPMULHWYrm, 0 },
+ { X86::VPMULHWZ128rr, X86::VPMULHWZ128rm, 0 },
+ { X86::VPMULHWZ256rr, X86::VPMULHWZ256rm, 0 },
+ { X86::VPMULHWZrr, X86::VPMULHWZrm, 0 },
+ { X86::VPMULHWrr, X86::VPMULHWrm, 0 },
+ { X86::VPMULLDYrr, X86::VPMULLDYrm, 0 },
+ { X86::VPMULLDZ128rr, X86::VPMULLDZ128rm, 0 },
+ { X86::VPMULLDZ256rr, X86::VPMULLDZ256rm, 0 },
+ { X86::VPMULLDZrr, X86::VPMULLDZrm, 0 },
+ { X86::VPMULLDrr, X86::VPMULLDrm, 0 },
+ { X86::VPMULLQZ128rr, X86::VPMULLQZ128rm, 0 },
+ { X86::VPMULLQZ256rr, X86::VPMULLQZ256rm, 0 },
+ { X86::VPMULLQZrr, X86::VPMULLQZrm, 0 },
+ { X86::VPMULLWYrr, X86::VPMULLWYrm, 0 },
+ { X86::VPMULLWZ128rr, X86::VPMULLWZ128rm, 0 },
+ { X86::VPMULLWZ256rr, X86::VPMULLWZ256rm, 0 },
+ { X86::VPMULLWZrr, X86::VPMULLWZrm, 0 },
+ { X86::VPMULLWrr, X86::VPMULLWrm, 0 },
+ { X86::VPMULTISHIFTQBZ128rr, X86::VPMULTISHIFTQBZ128rm, 0 },
+ { X86::VPMULTISHIFTQBZ256rr, X86::VPMULTISHIFTQBZ256rm, 0 },
+ { X86::VPMULTISHIFTQBZrr, X86::VPMULTISHIFTQBZrm, 0 },
+ { X86::VPMULUDQYrr, X86::VPMULUDQYrm, 0 },
+ { X86::VPMULUDQZ128rr, X86::VPMULUDQZ128rm, 0 },
+ { X86::VPMULUDQZ256rr, X86::VPMULUDQZ256rm, 0 },
+ { X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 },
+ { X86::VPMULUDQrr, X86::VPMULUDQrm, 0 },
+ { X86::VPOPCNTBZ128rrkz, X86::VPOPCNTBZ128rmkz, 0 },
+ { X86::VPOPCNTBZ256rrkz, X86::VPOPCNTBZ256rmkz, 0 },
+ { X86::VPOPCNTBZrrkz, X86::VPOPCNTBZrmkz, 0 },
+ { X86::VPOPCNTDZ128rrkz, X86::VPOPCNTDZ128rmkz, 0 },
+ { X86::VPOPCNTDZ256rrkz, X86::VPOPCNTDZ256rmkz, 0 },
+ { X86::VPOPCNTDZrrkz, X86::VPOPCNTDZrmkz, 0 },
+ { X86::VPOPCNTQZ128rrkz, X86::VPOPCNTQZ128rmkz, 0 },
+ { X86::VPOPCNTQZ256rrkz, X86::VPOPCNTQZ256rmkz, 0 },
+ { X86::VPOPCNTQZrrkz, X86::VPOPCNTQZrmkz, 0 },
+ { X86::VPOPCNTWZ128rrkz, X86::VPOPCNTWZ128rmkz, 0 },
+ { X86::VPOPCNTWZ256rrkz, X86::VPOPCNTWZ256rmkz, 0 },
+ { X86::VPOPCNTWZrrkz, X86::VPOPCNTWZrmkz, 0 },
+ { X86::VPORDZ128rr, X86::VPORDZ128rm, 0 },
+ { X86::VPORDZ256rr, X86::VPORDZ256rm, 0 },
+ { X86::VPORDZrr, X86::VPORDZrm, 0 },
+ { X86::VPORQZ128rr, X86::VPORQZ128rm, 0 },
+ { X86::VPORQZ256rr, X86::VPORQZ256rm, 0 },
+ { X86::VPORQZrr, X86::VPORQZrm, 0 },
+ { X86::VPORYrr, X86::VPORYrm, 0 },
+ { X86::VPORrr, X86::VPORrm, 0 },
+ { X86::VPPERMrrr, X86::VPPERMrmr, 0 },
+ { X86::VPROLDZ128rikz, X86::VPROLDZ128mikz, 0 },
+ { X86::VPROLDZ256rikz, X86::VPROLDZ256mikz, 0 },
+ { X86::VPROLDZrikz, X86::VPROLDZmikz, 0 },
+ { X86::VPROLQZ128rikz, X86::VPROLQZ128mikz, 0 },
+ { X86::VPROLQZ256rikz, X86::VPROLQZ256mikz, 0 },
+ { X86::VPROLQZrikz, X86::VPROLQZmikz, 0 },
+ { X86::VPROLVDZ128rr, X86::VPROLVDZ128rm, 0 },
+ { X86::VPROLVDZ256rr, X86::VPROLVDZ256rm, 0 },
+ { X86::VPROLVDZrr, X86::VPROLVDZrm, 0 },
+ { X86::VPROLVQZ128rr, X86::VPROLVQZ128rm, 0 },
+ { X86::VPROLVQZ256rr, X86::VPROLVQZ256rm, 0 },
+ { X86::VPROLVQZrr, X86::VPROLVQZrm, 0 },
+ { X86::VPRORDZ128rikz, X86::VPRORDZ128mikz, 0 },
+ { X86::VPRORDZ256rikz, X86::VPRORDZ256mikz, 0 },
+ { X86::VPRORDZrikz, X86::VPRORDZmikz, 0 },
+ { X86::VPRORQZ128rikz, X86::VPRORQZ128mikz, 0 },
+ { X86::VPRORQZ256rikz, X86::VPRORQZ256mikz, 0 },
+ { X86::VPRORQZrikz, X86::VPRORQZmikz, 0 },
+ { X86::VPRORVDZ128rr, X86::VPRORVDZ128rm, 0 },
+ { X86::VPRORVDZ256rr, X86::VPRORVDZ256rm, 0 },
+ { X86::VPRORVDZrr, X86::VPRORVDZrm, 0 },
+ { X86::VPRORVQZ128rr, X86::VPRORVQZ128rm, 0 },
+ { X86::VPRORVQZ256rr, X86::VPRORVQZ256rm, 0 },
+ { X86::VPRORVQZrr, X86::VPRORVQZrm, 0 },
+ { X86::VPROTBrr, X86::VPROTBrm, 0 },
+ { X86::VPROTDrr, X86::VPROTDrm, 0 },
+ { X86::VPROTQrr, X86::VPROTQrm, 0 },
+ { X86::VPROTWrr, X86::VPROTWrm, 0 },
+ { X86::VPSADBWYrr, X86::VPSADBWYrm, 0 },
+ { X86::VPSADBWZ128rr, X86::VPSADBWZ128rm, 0 },
+ { X86::VPSADBWZ256rr, X86::VPSADBWZ256rm, 0 },
+ { X86::VPSADBWZrr, X86::VPSADBWZrm, 0 },
+ { X86::VPSADBWrr, X86::VPSADBWrm, 0 },
+ { X86::VPSHABrr, X86::VPSHABrm, 0 },
+ { X86::VPSHADrr, X86::VPSHADrm, 0 },
+ { X86::VPSHAQrr, X86::VPSHAQrm, 0 },
+ { X86::VPSHAWrr, X86::VPSHAWrm, 0 },
+ { X86::VPSHLBrr, X86::VPSHLBrm, 0 },
+ { X86::VPSHLDDZ128rri, X86::VPSHLDDZ128rmi, 0 },
+ { X86::VPSHLDDZ256rri, X86::VPSHLDDZ256rmi, 0 },
+ { X86::VPSHLDDZrri, X86::VPSHLDDZrmi, 0 },
+ { X86::VPSHLDQZ128rri, X86::VPSHLDQZ128rmi, 0 },
+ { X86::VPSHLDQZ256rri, X86::VPSHLDQZ256rmi, 0 },
+ { X86::VPSHLDQZrri, X86::VPSHLDQZrmi, 0 },
+ { X86::VPSHLDWZ128rri, X86::VPSHLDWZ128rmi, 0 },
+ { X86::VPSHLDWZ256rri, X86::VPSHLDWZ256rmi, 0 },
+ { X86::VPSHLDWZrri, X86::VPSHLDWZrmi, 0 },
+ { X86::VPSHLDrr, X86::VPSHLDrm, 0 },
+ { X86::VPSHLQrr, X86::VPSHLQrm, 0 },
+ { X86::VPSHLWrr, X86::VPSHLWrm, 0 },
+ { X86::VPSHRDDZ128rri, X86::VPSHRDDZ128rmi, 0 },
+ { X86::VPSHRDDZ256rri, X86::VPSHRDDZ256rmi, 0 },
+ { X86::VPSHRDDZrri, X86::VPSHRDDZrmi, 0 },
+ { X86::VPSHRDQZ128rri, X86::VPSHRDQZ128rmi, 0 },
+ { X86::VPSHRDQZ256rri, X86::VPSHRDQZ256rmi, 0 },
+ { X86::VPSHRDQZrri, X86::VPSHRDQZrmi, 0 },
+ { X86::VPSHRDWZ128rri, X86::VPSHRDWZ128rmi, 0 },
+ { X86::VPSHRDWZ256rri, X86::VPSHRDWZ256rmi, 0 },
+ { X86::VPSHRDWZrri, X86::VPSHRDWZrmi, 0 },
+ { X86::VPSHUFBITQMBZ128rr, X86::VPSHUFBITQMBZ128rm, 0 },
+ { X86::VPSHUFBITQMBZ256rr, X86::VPSHUFBITQMBZ256rm, 0 },
+ { X86::VPSHUFBITQMBZrr, X86::VPSHUFBITQMBZrm, 0 },
+ { X86::VPSHUFBYrr, X86::VPSHUFBYrm, 0 },
+ { X86::VPSHUFBZ128rr, X86::VPSHUFBZ128rm, 0 },
+ { X86::VPSHUFBZ256rr, X86::VPSHUFBZ256rm, 0 },
+ { X86::VPSHUFBZrr, X86::VPSHUFBZrm, 0 },
+ { X86::VPSHUFBrr, X86::VPSHUFBrm, 0 },
+ { X86::VPSHUFDZ128rikz, X86::VPSHUFDZ128mikz, 0 },
+ { X86::VPSHUFDZ256rikz, X86::VPSHUFDZ256mikz, 0 },
+ { X86::VPSHUFDZrikz, X86::VPSHUFDZmikz, 0 },
+ { X86::VPSHUFHWZ128rikz, X86::VPSHUFHWZ128mikz, 0 },
+ { X86::VPSHUFHWZ256rikz, X86::VPSHUFHWZ256mikz, 0 },
+ { X86::VPSHUFHWZrikz, X86::VPSHUFHWZmikz, 0 },
+ { X86::VPSHUFLWZ128rikz, X86::VPSHUFLWZ128mikz, 0 },
+ { X86::VPSHUFLWZ256rikz, X86::VPSHUFLWZ256mikz, 0 },
+ { X86::VPSHUFLWZrikz, X86::VPSHUFLWZmikz, 0 },
+ { X86::VPSIGNBYrr, X86::VPSIGNBYrm, 0 },
+ { X86::VPSIGNBrr, X86::VPSIGNBrm, 0 },
+ { X86::VPSIGNDYrr, X86::VPSIGNDYrm, 0 },
+ { X86::VPSIGNDrr, X86::VPSIGNDrm, 0 },
+ { X86::VPSIGNWYrr, X86::VPSIGNWYrm, 0 },
+ { X86::VPSIGNWrr, X86::VPSIGNWrm, 0 },
+ { X86::VPSLLDYrr, X86::VPSLLDYrm, 0 },
+ { X86::VPSLLDZ128rikz, X86::VPSLLDZ128mikz, 0 },
+ { X86::VPSLLDZ128rr, X86::VPSLLDZ128rm, 0 },
+ { X86::VPSLLDZ256rikz, X86::VPSLLDZ256mikz, 0 },
+ { X86::VPSLLDZ256rr, X86::VPSLLDZ256rm, 0 },
+ { X86::VPSLLDZrikz, X86::VPSLLDZmikz, 0 },
+ { X86::VPSLLDZrr, X86::VPSLLDZrm, 0 },
+ { X86::VPSLLDrr, X86::VPSLLDrm, 0 },
+ { X86::VPSLLQYrr, X86::VPSLLQYrm, 0 },
+ { X86::VPSLLQZ128rikz, X86::VPSLLQZ128mikz, 0 },
+ { X86::VPSLLQZ128rr, X86::VPSLLQZ128rm, 0 },
+ { X86::VPSLLQZ256rikz, X86::VPSLLQZ256mikz, 0 },
+ { X86::VPSLLQZ256rr, X86::VPSLLQZ256rm, 0 },
+ { X86::VPSLLQZrikz, X86::VPSLLQZmikz, 0 },
+ { X86::VPSLLQZrr, X86::VPSLLQZrm, 0 },
+ { X86::VPSLLQrr, X86::VPSLLQrm, 0 },
+ { X86::VPSLLVDYrr, X86::VPSLLVDYrm, 0 },
+ { X86::VPSLLVDZ128rr, X86::VPSLLVDZ128rm, 0 },
+ { X86::VPSLLVDZ256rr, X86::VPSLLVDZ256rm, 0 },
+ { X86::VPSLLVDZrr, X86::VPSLLVDZrm, 0 },
+ { X86::VPSLLVDrr, X86::VPSLLVDrm, 0 },
+ { X86::VPSLLVQYrr, X86::VPSLLVQYrm, 0 },
+ { X86::VPSLLVQZ128rr, X86::VPSLLVQZ128rm, 0 },
+ { X86::VPSLLVQZ256rr, X86::VPSLLVQZ256rm, 0 },
+ { X86::VPSLLVQZrr, X86::VPSLLVQZrm, 0 },
+ { X86::VPSLLVQrr, X86::VPSLLVQrm, 0 },
+ { X86::VPSLLVWZ128rr, X86::VPSLLVWZ128rm, 0 },
+ { X86::VPSLLVWZ256rr, X86::VPSLLVWZ256rm, 0 },
+ { X86::VPSLLVWZrr, X86::VPSLLVWZrm, 0 },
+ { X86::VPSLLWYrr, X86::VPSLLWYrm, 0 },
+ { X86::VPSLLWZ128rikz, X86::VPSLLWZ128mikz, 0 },
+ { X86::VPSLLWZ128rr, X86::VPSLLWZ128rm, 0 },
+ { X86::VPSLLWZ256rikz, X86::VPSLLWZ256mikz, 0 },
+ { X86::VPSLLWZ256rr, X86::VPSLLWZ256rm, 0 },
+ { X86::VPSLLWZrikz, X86::VPSLLWZmikz, 0 },
+ { X86::VPSLLWZrr, X86::VPSLLWZrm, 0 },
+ { X86::VPSLLWrr, X86::VPSLLWrm, 0 },
+ { X86::VPSRADYrr, X86::VPSRADYrm, 0 },
+ { X86::VPSRADZ128rikz, X86::VPSRADZ128mikz, 0 },
+ { X86::VPSRADZ128rr, X86::VPSRADZ128rm, 0 },
+ { X86::VPSRADZ256rikz, X86::VPSRADZ256mikz, 0 },
+ { X86::VPSRADZ256rr, X86::VPSRADZ256rm, 0 },
+ { X86::VPSRADZrikz, X86::VPSRADZmikz, 0 },
+ { X86::VPSRADZrr, X86::VPSRADZrm, 0 },
+ { X86::VPSRADrr, X86::VPSRADrm, 0 },
+ { X86::VPSRAQZ128rikz, X86::VPSRAQZ128mikz, 0 },
+ { X86::VPSRAQZ128rr, X86::VPSRAQZ128rm, 0 },
+ { X86::VPSRAQZ256rikz, X86::VPSRAQZ256mikz, 0 },
+ { X86::VPSRAQZ256rr, X86::VPSRAQZ256rm, 0 },
+ { X86::VPSRAQZrikz, X86::VPSRAQZmikz, 0 },
+ { X86::VPSRAQZrr, X86::VPSRAQZrm, 0 },
+ { X86::VPSRAVDYrr, X86::VPSRAVDYrm, 0 },
+ { X86::VPSRAVDZ128rr, X86::VPSRAVDZ128rm, 0 },
+ { X86::VPSRAVDZ256rr, X86::VPSRAVDZ256rm, 0 },
+ { X86::VPSRAVDZrr, X86::VPSRAVDZrm, 0 },
+ { X86::VPSRAVDrr, X86::VPSRAVDrm, 0 },
+ { X86::VPSRAVQZ128rr, X86::VPSRAVQZ128rm, 0 },
+ { X86::VPSRAVQZ256rr, X86::VPSRAVQZ256rm, 0 },
+ { X86::VPSRAVQZrr, X86::VPSRAVQZrm, 0 },
+ { X86::VPSRAVWZ128rr, X86::VPSRAVWZ128rm, 0 },
+ { X86::VPSRAVWZ256rr, X86::VPSRAVWZ256rm, 0 },
+ { X86::VPSRAVWZrr, X86::VPSRAVWZrm, 0 },
+ { X86::VPSRAWYrr, X86::VPSRAWYrm, 0 },
+ { X86::VPSRAWZ128rikz, X86::VPSRAWZ128mikz, 0 },
+ { X86::VPSRAWZ128rr, X86::VPSRAWZ128rm, 0 },
+ { X86::VPSRAWZ256rikz, X86::VPSRAWZ256mikz, 0 },
+ { X86::VPSRAWZ256rr, X86::VPSRAWZ256rm, 0 },
+ { X86::VPSRAWZrikz, X86::VPSRAWZmikz, 0 },
+ { X86::VPSRAWZrr, X86::VPSRAWZrm, 0 },
+ { X86::VPSRAWrr, X86::VPSRAWrm, 0 },
+ { X86::VPSRLDYrr, X86::VPSRLDYrm, 0 },
+ { X86::VPSRLDZ128rikz, X86::VPSRLDZ128mikz, 0 },
+ { X86::VPSRLDZ128rr, X86::VPSRLDZ128rm, 0 },
+ { X86::VPSRLDZ256rikz, X86::VPSRLDZ256mikz, 0 },
+ { X86::VPSRLDZ256rr, X86::VPSRLDZ256rm, 0 },
+ { X86::VPSRLDZrikz, X86::VPSRLDZmikz, 0 },
+ { X86::VPSRLDZrr, X86::VPSRLDZrm, 0 },
+ { X86::VPSRLDrr, X86::VPSRLDrm, 0 },
+ { X86::VPSRLQYrr, X86::VPSRLQYrm, 0 },
+ { X86::VPSRLQZ128rikz, X86::VPSRLQZ128mikz, 0 },
+ { X86::VPSRLQZ128rr, X86::VPSRLQZ128rm, 0 },
+ { X86::VPSRLQZ256rikz, X86::VPSRLQZ256mikz, 0 },
+ { X86::VPSRLQZ256rr, X86::VPSRLQZ256rm, 0 },
+ { X86::VPSRLQZrikz, X86::VPSRLQZmikz, 0 },
+ { X86::VPSRLQZrr, X86::VPSRLQZrm, 0 },
+ { X86::VPSRLQrr, X86::VPSRLQrm, 0 },
+ { X86::VPSRLVDYrr, X86::VPSRLVDYrm, 0 },
+ { X86::VPSRLVDZ128rr, X86::VPSRLVDZ128rm, 0 },
+ { X86::VPSRLVDZ256rr, X86::VPSRLVDZ256rm, 0 },
+ { X86::VPSRLVDZrr, X86::VPSRLVDZrm, 0 },
+ { X86::VPSRLVDrr, X86::VPSRLVDrm, 0 },
+ { X86::VPSRLVQYrr, X86::VPSRLVQYrm, 0 },
+ { X86::VPSRLVQZ128rr, X86::VPSRLVQZ128rm, 0 },
+ { X86::VPSRLVQZ256rr, X86::VPSRLVQZ256rm, 0 },
+ { X86::VPSRLVQZrr, X86::VPSRLVQZrm, 0 },
+ { X86::VPSRLVQrr, X86::VPSRLVQrm, 0 },
+ { X86::VPSRLVWZ128rr, X86::VPSRLVWZ128rm, 0 },
+ { X86::VPSRLVWZ256rr, X86::VPSRLVWZ256rm, 0 },
+ { X86::VPSRLVWZrr, X86::VPSRLVWZrm, 0 },
+ { X86::VPSRLWYrr, X86::VPSRLWYrm, 0 },
+ { X86::VPSRLWZ128rikz, X86::VPSRLWZ128mikz, 0 },
+ { X86::VPSRLWZ128rr, X86::VPSRLWZ128rm, 0 },
+ { X86::VPSRLWZ256rikz, X86::VPSRLWZ256mikz, 0 },
+ { X86::VPSRLWZ256rr, X86::VPSRLWZ256rm, 0 },
+ { X86::VPSRLWZrikz, X86::VPSRLWZmikz, 0 },
+ { X86::VPSRLWZrr, X86::VPSRLWZrm, 0 },
+ { X86::VPSRLWrr, X86::VPSRLWrm, 0 },
+ { X86::VPSUBBYrr, X86::VPSUBBYrm, 0 },
+ { X86::VPSUBBZ128rr, X86::VPSUBBZ128rm, 0 },
+ { X86::VPSUBBZ256rr, X86::VPSUBBZ256rm, 0 },
+ { X86::VPSUBBZrr, X86::VPSUBBZrm, 0 },
+ { X86::VPSUBBrr, X86::VPSUBBrm, 0 },
+ { X86::VPSUBDYrr, X86::VPSUBDYrm, 0 },
+ { X86::VPSUBDZ128rr, X86::VPSUBDZ128rm, 0 },
+ { X86::VPSUBDZ256rr, X86::VPSUBDZ256rm, 0 },
+ { X86::VPSUBDZrr, X86::VPSUBDZrm, 0 },
+ { X86::VPSUBDrr, X86::VPSUBDrm, 0 },
+ { X86::VPSUBQYrr, X86::VPSUBQYrm, 0 },
+ { X86::VPSUBQZ128rr, X86::VPSUBQZ128rm, 0 },
+ { X86::VPSUBQZ256rr, X86::VPSUBQZ256rm, 0 },
+ { X86::VPSUBQZrr, X86::VPSUBQZrm, 0 },
+ { X86::VPSUBQrr, X86::VPSUBQrm, 0 },
+ { X86::VPSUBSBYrr, X86::VPSUBSBYrm, 0 },
+ { X86::VPSUBSBZ128rr, X86::VPSUBSBZ128rm, 0 },
+ { X86::VPSUBSBZ256rr, X86::VPSUBSBZ256rm, 0 },
+ { X86::VPSUBSBZrr, X86::VPSUBSBZrm, 0 },
+ { X86::VPSUBSBrr, X86::VPSUBSBrm, 0 },
+ { X86::VPSUBSWYrr, X86::VPSUBSWYrm, 0 },
+ { X86::VPSUBSWZ128rr, X86::VPSUBSWZ128rm, 0 },
+ { X86::VPSUBSWZ256rr, X86::VPSUBSWZ256rm, 0 },
+ { X86::VPSUBSWZrr, X86::VPSUBSWZrm, 0 },
+ { X86::VPSUBSWrr, X86::VPSUBSWrm, 0 },
+ { X86::VPSUBUSBYrr, X86::VPSUBUSBYrm, 0 },
+ { X86::VPSUBUSBZ128rr, X86::VPSUBUSBZ128rm, 0 },
+ { X86::VPSUBUSBZ256rr, X86::VPSUBUSBZ256rm, 0 },
+ { X86::VPSUBUSBZrr, X86::VPSUBUSBZrm, 0 },
+ { X86::VPSUBUSBrr, X86::VPSUBUSBrm, 0 },
+ { X86::VPSUBUSWYrr, X86::VPSUBUSWYrm, 0 },
+ { X86::VPSUBUSWZ128rr, X86::VPSUBUSWZ128rm, 0 },
+ { X86::VPSUBUSWZ256rr, X86::VPSUBUSWZ256rm, 0 },
+ { X86::VPSUBUSWZrr, X86::VPSUBUSWZrm, 0 },
+ { X86::VPSUBUSWrr, X86::VPSUBUSWrm, 0 },
+ { X86::VPSUBWYrr, X86::VPSUBWYrm, 0 },
+ { X86::VPSUBWZ128rr, X86::VPSUBWZ128rm, 0 },
+ { X86::VPSUBWZ256rr, X86::VPSUBWZ256rm, 0 },
+ { X86::VPSUBWZrr, X86::VPSUBWZrm, 0 },
+ { X86::VPSUBWrr, X86::VPSUBWrm, 0 },
+ { X86::VPTESTMBZ128rr, X86::VPTESTMBZ128rm, 0 },
+ { X86::VPTESTMBZ256rr, X86::VPTESTMBZ256rm, 0 },
+ { X86::VPTESTMBZrr, X86::VPTESTMBZrm, 0 },
+ { X86::VPTESTMDZ128rr, X86::VPTESTMDZ128rm, 0 },
+ { X86::VPTESTMDZ256rr, X86::VPTESTMDZ256rm, 0 },
+ { X86::VPTESTMDZrr, X86::VPTESTMDZrm, 0 },
+ { X86::VPTESTMQZ128rr, X86::VPTESTMQZ128rm, 0 },
+ { X86::VPTESTMQZ256rr, X86::VPTESTMQZ256rm, 0 },
+ { X86::VPTESTMQZrr, X86::VPTESTMQZrm, 0 },
+ { X86::VPTESTMWZ128rr, X86::VPTESTMWZ128rm, 0 },
+ { X86::VPTESTMWZ256rr, X86::VPTESTMWZ256rm, 0 },
+ { X86::VPTESTMWZrr, X86::VPTESTMWZrm, 0 },
+ { X86::VPTESTNMBZ128rr, X86::VPTESTNMBZ128rm, 0 },
+ { X86::VPTESTNMBZ256rr, X86::VPTESTNMBZ256rm, 0 },
+ { X86::VPTESTNMBZrr, X86::VPTESTNMBZrm, 0 },
+ { X86::VPTESTNMDZ128rr, X86::VPTESTNMDZ128rm, 0 },
+ { X86::VPTESTNMDZ256rr, X86::VPTESTNMDZ256rm, 0 },
+ { X86::VPTESTNMDZrr, X86::VPTESTNMDZrm, 0 },
+ { X86::VPTESTNMQZ128rr, X86::VPTESTNMQZ128rm, 0 },
+ { X86::VPTESTNMQZ256rr, X86::VPTESTNMQZ256rm, 0 },
+ { X86::VPTESTNMQZrr, X86::VPTESTNMQZrm, 0 },
+ { X86::VPTESTNMWZ128rr, X86::VPTESTNMWZ128rm, 0 },
+ { X86::VPTESTNMWZ256rr, X86::VPTESTNMWZ256rm, 0 },
+ { X86::VPTESTNMWZrr, X86::VPTESTNMWZrm, 0 },
+ { X86::VPUNPCKHBWYrr, X86::VPUNPCKHBWYrm, 0 },
+ { X86::VPUNPCKHBWZ128rr, X86::VPUNPCKHBWZ128rm, 0 },
+ { X86::VPUNPCKHBWZ256rr, X86::VPUNPCKHBWZ256rm, 0 },
+ { X86::VPUNPCKHBWZrr, X86::VPUNPCKHBWZrm, 0 },
+ { X86::VPUNPCKHBWrr, X86::VPUNPCKHBWrm, 0 },
+ { X86::VPUNPCKHDQYrr, X86::VPUNPCKHDQYrm, 0 },
+ { X86::VPUNPCKHDQZ128rr, X86::VPUNPCKHDQZ128rm, 0 },
+ { X86::VPUNPCKHDQZ256rr, X86::VPUNPCKHDQZ256rm, 0 },
+ { X86::VPUNPCKHDQZrr, X86::VPUNPCKHDQZrm, 0 },
+ { X86::VPUNPCKHDQrr, X86::VPUNPCKHDQrm, 0 },
+ { X86::VPUNPCKHQDQYrr, X86::VPUNPCKHQDQYrm, 0 },
+ { X86::VPUNPCKHQDQZ128rr, X86::VPUNPCKHQDQZ128rm, 0 },
+ { X86::VPUNPCKHQDQZ256rr, X86::VPUNPCKHQDQZ256rm, 0 },
+ { X86::VPUNPCKHQDQZrr, X86::VPUNPCKHQDQZrm, 0 },
+ { X86::VPUNPCKHQDQrr, X86::VPUNPCKHQDQrm, 0 },
+ { X86::VPUNPCKHWDYrr, X86::VPUNPCKHWDYrm, 0 },
+ { X86::VPUNPCKHWDZ128rr, X86::VPUNPCKHWDZ128rm, 0 },
+ { X86::VPUNPCKHWDZ256rr, X86::VPUNPCKHWDZ256rm, 0 },
+ { X86::VPUNPCKHWDZrr, X86::VPUNPCKHWDZrm, 0 },
+ { X86::VPUNPCKHWDrr, X86::VPUNPCKHWDrm, 0 },
+ { X86::VPUNPCKLBWYrr, X86::VPUNPCKLBWYrm, 0 },
+ { X86::VPUNPCKLBWZ128rr, X86::VPUNPCKLBWZ128rm, 0 },
+ { X86::VPUNPCKLBWZ256rr, X86::VPUNPCKLBWZ256rm, 0 },
+ { X86::VPUNPCKLBWZrr, X86::VPUNPCKLBWZrm, 0 },
+ { X86::VPUNPCKLBWrr, X86::VPUNPCKLBWrm, 0 },
+ { X86::VPUNPCKLDQYrr, X86::VPUNPCKLDQYrm, 0 },
+ { X86::VPUNPCKLDQZ128rr, X86::VPUNPCKLDQZ128rm, 0 },
+ { X86::VPUNPCKLDQZ256rr, X86::VPUNPCKLDQZ256rm, 0 },
+ { X86::VPUNPCKLDQZrr, X86::VPUNPCKLDQZrm, 0 },
+ { X86::VPUNPCKLDQrr, X86::VPUNPCKLDQrm, 0 },
+ { X86::VPUNPCKLQDQYrr, X86::VPUNPCKLQDQYrm, 0 },
+ { X86::VPUNPCKLQDQZ128rr, X86::VPUNPCKLQDQZ128rm, 0 },
+ { X86::VPUNPCKLQDQZ256rr, X86::VPUNPCKLQDQZ256rm, 0 },
+ { X86::VPUNPCKLQDQZrr, X86::VPUNPCKLQDQZrm, 0 },
+ { X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 },
+ { X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, 0 },
+ { X86::VPUNPCKLWDZ128rr, X86::VPUNPCKLWDZ128rm, 0 },
+ { X86::VPUNPCKLWDZ256rr, X86::VPUNPCKLWDZ256rm, 0 },
+ { X86::VPUNPCKLWDZrr, X86::VPUNPCKLWDZrm, 0 },
+ { X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, 0 },
+ { X86::VPXORDZ128rr, X86::VPXORDZ128rm, 0 },
+ { X86::VPXORDZ256rr, X86::VPXORDZ256rm, 0 },
+ { X86::VPXORDZrr, X86::VPXORDZrm, 0 },
+ { X86::VPXORQZ128rr, X86::VPXORQZ128rm, 0 },
+ { X86::VPXORQZ256rr, X86::VPXORQZ256rm, 0 },
+ { X86::VPXORQZrr, X86::VPXORQZrm, 0 },
+ { X86::VPXORYrr, X86::VPXORYrm, 0 },
+ { X86::VPXORrr, X86::VPXORrm, 0 },
+ { X86::VRANGEPDZ128rri, X86::VRANGEPDZ128rmi, 0 },
+ { X86::VRANGEPDZ256rri, X86::VRANGEPDZ256rmi, 0 },
+ { X86::VRANGEPDZrri, X86::VRANGEPDZrmi, 0 },
+ { X86::VRANGEPSZ128rri, X86::VRANGEPSZ128rmi, 0 },
+ { X86::VRANGEPSZ256rri, X86::VRANGEPSZ256rmi, 0 },
+ { X86::VRANGEPSZrri, X86::VRANGEPSZrmi, 0 },
+ { X86::VRANGESDZrri, X86::VRANGESDZrmi, TB_NO_REVERSE },
+ { X86::VRANGESSZrri, X86::VRANGESSZrmi, TB_NO_REVERSE },
+ { X86::VRCP14PDZ128rkz, X86::VRCP14PDZ128mkz, 0 },
+ { X86::VRCP14PDZ256rkz, X86::VRCP14PDZ256mkz, 0 },
+ { X86::VRCP14PDZrkz, X86::VRCP14PDZmkz, 0 },
+ { X86::VRCP14PSZ128rkz, X86::VRCP14PSZ128mkz, 0 },
+ { X86::VRCP14PSZ256rkz, X86::VRCP14PSZ256mkz, 0 },
+ { X86::VRCP14PSZrkz, X86::VRCP14PSZmkz, 0 },
+ { X86::VRCP14SDZrr, X86::VRCP14SDZrm, TB_NO_REVERSE },
+ { X86::VRCP14SSZrr, X86::VRCP14SSZrm, TB_NO_REVERSE },
+ { X86::VRCP28PDZrkz, X86::VRCP28PDZmkz, 0 },
+ { X86::VRCP28PSZrkz, X86::VRCP28PSZmkz, 0 },
+ { X86::VRCP28SDZr, X86::VRCP28SDZm, TB_NO_REVERSE },
+ { X86::VRCP28SSZr, X86::VRCP28SSZm, TB_NO_REVERSE },
+ { X86::VRCPSSr, X86::VRCPSSm, 0 },
+ { X86::VRCPSSr_Int, X86::VRCPSSm_Int, TB_NO_REVERSE },
+ { X86::VREDUCEPDZ128rrikz, X86::VREDUCEPDZ128rmikz, 0 },
+ { X86::VREDUCEPDZ256rrikz, X86::VREDUCEPDZ256rmikz, 0 },
+ { X86::VREDUCEPDZrrikz, X86::VREDUCEPDZrmikz, 0 },
+ { X86::VREDUCEPSZ128rrikz, X86::VREDUCEPSZ128rmikz, 0 },
+ { X86::VREDUCEPSZ256rrikz, X86::VREDUCEPSZ256rmikz, 0 },
+ { X86::VREDUCEPSZrrikz, X86::VREDUCEPSZrmikz, 0 },
+ { X86::VREDUCESDZrri, X86::VREDUCESDZrmi, TB_NO_REVERSE },
+ { X86::VREDUCESSZrri, X86::VREDUCESSZrmi, TB_NO_REVERSE },
+ { X86::VRNDSCALEPDZ128rrikz, X86::VRNDSCALEPDZ128rmikz, 0 },
+ { X86::VRNDSCALEPDZ256rrikz, X86::VRNDSCALEPDZ256rmikz, 0 },
+ { X86::VRNDSCALEPDZrrikz, X86::VRNDSCALEPDZrmikz, 0 },
+ { X86::VRNDSCALEPSZ128rrikz, X86::VRNDSCALEPSZ128rmikz, 0 },
+ { X86::VRNDSCALEPSZ256rrikz, X86::VRNDSCALEPSZ256rmikz, 0 },
+ { X86::VRNDSCALEPSZrrikz, X86::VRNDSCALEPSZrmikz, 0 },
+ { X86::VRNDSCALESDZr, X86::VRNDSCALESDZm, 0 },
+ { X86::VRNDSCALESDZr_Int, X86::VRNDSCALESDZm_Int, TB_NO_REVERSE },
+ { X86::VRNDSCALESSZr, X86::VRNDSCALESSZm, 0 },
+ { X86::VRNDSCALESSZr_Int, X86::VRNDSCALESSZm_Int, TB_NO_REVERSE },
+ { X86::VROUNDSDr, X86::VROUNDSDm, 0 },
+ { X86::VROUNDSDr_Int, X86::VROUNDSDm_Int, TB_NO_REVERSE },
+ { X86::VROUNDSSr, X86::VROUNDSSm, 0 },
+ { X86::VROUNDSSr_Int, X86::VROUNDSSm_Int, TB_NO_REVERSE },
+ { X86::VRSQRT14PDZ128rkz, X86::VRSQRT14PDZ128mkz, 0 },
+ { X86::VRSQRT14PDZ256rkz, X86::VRSQRT14PDZ256mkz, 0 },
+ { X86::VRSQRT14PDZrkz, X86::VRSQRT14PDZmkz, 0 },
+ { X86::VRSQRT14PSZ128rkz, X86::VRSQRT14PSZ128mkz, 0 },
+ { X86::VRSQRT14PSZ256rkz, X86::VRSQRT14PSZ256mkz, 0 },
+ { X86::VRSQRT14PSZrkz, X86::VRSQRT14PSZmkz, 0 },
+ { X86::VRSQRT14SDZrr, X86::VRSQRT14SDZrm, TB_NO_REVERSE },
+ { X86::VRSQRT14SSZrr, X86::VRSQRT14SSZrm, TB_NO_REVERSE },
+ { X86::VRSQRT28PDZrkz, X86::VRSQRT28PDZmkz, 0 },
+ { X86::VRSQRT28PSZrkz, X86::VRSQRT28PSZmkz, 0 },
+ { X86::VRSQRT28SDZr, X86::VRSQRT28SDZm, TB_NO_REVERSE },
+ { X86::VRSQRT28SSZr, X86::VRSQRT28SSZm, TB_NO_REVERSE },
+ { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 },
+ { X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, TB_NO_REVERSE },
+ { X86::VSCALEFPDZ128rr, X86::VSCALEFPDZ128rm, 0 },
+ { X86::VSCALEFPDZ256rr, X86::VSCALEFPDZ256rm, 0 },
+ { X86::VSCALEFPDZrr, X86::VSCALEFPDZrm, 0 },
+ { X86::VSCALEFPSZ128rr, X86::VSCALEFPSZ128rm, 0 },
+ { X86::VSCALEFPSZ256rr, X86::VSCALEFPSZ256rm, 0 },
+ { X86::VSCALEFPSZrr, X86::VSCALEFPSZrm, 0 },
+ { X86::VSCALEFSDZrr, X86::VSCALEFSDZrm, TB_NO_REVERSE },
+ { X86::VSCALEFSSZrr, X86::VSCALEFSSZrm, TB_NO_REVERSE },
+ { X86::VSHUFF32X4Z256rri, X86::VSHUFF32X4Z256rmi, 0 },
+ { X86::VSHUFF32X4Zrri, X86::VSHUFF32X4Zrmi, 0 },
+ { X86::VSHUFF64X2Z256rri, X86::VSHUFF64X2Z256rmi, 0 },
+ { X86::VSHUFF64X2Zrri, X86::VSHUFF64X2Zrmi, 0 },
+ { X86::VSHUFI32X4Z256rri, X86::VSHUFI32X4Z256rmi, 0 },
+ { X86::VSHUFI32X4Zrri, X86::VSHUFI32X4Zrmi, 0 },
+ { X86::VSHUFI64X2Z256rri, X86::VSHUFI64X2Z256rmi, 0 },
+ { X86::VSHUFI64X2Zrri, X86::VSHUFI64X2Zrmi, 0 },
+ { X86::VSHUFPDYrri, X86::VSHUFPDYrmi, 0 },
+ { X86::VSHUFPDZ128rri, X86::VSHUFPDZ128rmi, 0 },
+ { X86::VSHUFPDZ256rri, X86::VSHUFPDZ256rmi, 0 },
+ { X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 },
+ { X86::VSHUFPDrri, X86::VSHUFPDrmi, 0 },
+ { X86::VSHUFPSYrri, X86::VSHUFPSYrmi, 0 },
+ { X86::VSHUFPSZ128rri, X86::VSHUFPSZ128rmi, 0 },
+ { X86::VSHUFPSZ256rri, X86::VSHUFPSZ256rmi, 0 },
+ { X86::VSHUFPSZrri, X86::VSHUFPSZrmi, 0 },
+ { X86::VSHUFPSrri, X86::VSHUFPSrmi, 0 },
+ { X86::VSQRTPDZ128rkz, X86::VSQRTPDZ128mkz, 0 },
+ { X86::VSQRTPDZ256rkz, X86::VSQRTPDZ256mkz, 0 },
+ { X86::VSQRTPDZrkz, X86::VSQRTPDZmkz, 0 },
+ { X86::VSQRTPSZ128rkz, X86::VSQRTPSZ128mkz, 0 },
+ { X86::VSQRTPSZ256rkz, X86::VSQRTPSZ256mkz, 0 },
+ { X86::VSQRTPSZrkz, X86::VSQRTPSZmkz, 0 },
+ { X86::VSQRTSDZr, X86::VSQRTSDZm, 0 },
+ { X86::VSQRTSDZr_Int, X86::VSQRTSDZm_Int, TB_NO_REVERSE },
+ { X86::VSQRTSDr, X86::VSQRTSDm, 0 },
+ { X86::VSQRTSDr_Int, X86::VSQRTSDm_Int, TB_NO_REVERSE },
+ { X86::VSQRTSSZr, X86::VSQRTSSZm, 0 },
+ { X86::VSQRTSSZr_Int, X86::VSQRTSSZm_Int, TB_NO_REVERSE },
+ { X86::VSQRTSSr, X86::VSQRTSSm, 0 },
+ { X86::VSQRTSSr_Int, X86::VSQRTSSm_Int, TB_NO_REVERSE },
+ { X86::VSUBPDYrr, X86::VSUBPDYrm, 0 },
+ { X86::VSUBPDZ128rr, X86::VSUBPDZ128rm, 0 },
+ { X86::VSUBPDZ256rr, X86::VSUBPDZ256rm, 0 },
+ { X86::VSUBPDZrr, X86::VSUBPDZrm, 0 },
+ { X86::VSUBPDrr, X86::VSUBPDrm, 0 },
+ { X86::VSUBPSYrr, X86::VSUBPSYrm, 0 },
+ { X86::VSUBPSZ128rr, X86::VSUBPSZ128rm, 0 },
+ { X86::VSUBPSZ256rr, X86::VSUBPSZ256rm, 0 },
+ { X86::VSUBPSZrr, X86::VSUBPSZrm, 0 },
+ { X86::VSUBPSrr, X86::VSUBPSrm, 0 },
+ { X86::VSUBSDZrr, X86::VSUBSDZrm, 0 },
+ { X86::VSUBSDZrr_Int, X86::VSUBSDZrm_Int, TB_NO_REVERSE },
+ { X86::VSUBSDrr, X86::VSUBSDrm, 0 },
+ { X86::VSUBSDrr_Int, X86::VSUBSDrm_Int, TB_NO_REVERSE },
+ { X86::VSUBSSZrr, X86::VSUBSSZrm, 0 },
+ { X86::VSUBSSZrr_Int, X86::VSUBSSZrm_Int, TB_NO_REVERSE },
+ { X86::VSUBSSrr, X86::VSUBSSrm, 0 },
+ { X86::VSUBSSrr_Int, X86::VSUBSSrm_Int, TB_NO_REVERSE },
+ { X86::VUNPCKHPDYrr, X86::VUNPCKHPDYrm, 0 },
+ { X86::VUNPCKHPDZ128rr, X86::VUNPCKHPDZ128rm, 0 },
+ { X86::VUNPCKHPDZ256rr, X86::VUNPCKHPDZ256rm, 0 },
+ { X86::VUNPCKHPDZrr, X86::VUNPCKHPDZrm, 0 },
+ { X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, 0 },
+ { X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrm, 0 },
+ { X86::VUNPCKHPSZ128rr, X86::VUNPCKHPSZ128rm, 0 },
+ { X86::VUNPCKHPSZ256rr, X86::VUNPCKHPSZ256rm, 0 },
+ { X86::VUNPCKHPSZrr, X86::VUNPCKHPSZrm, 0 },
+ { X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, 0 },
+ { X86::VUNPCKLPDYrr, X86::VUNPCKLPDYrm, 0 },
+ { X86::VUNPCKLPDZ128rr, X86::VUNPCKLPDZ128rm, 0 },
+ { X86::VUNPCKLPDZ256rr, X86::VUNPCKLPDZ256rm, 0 },
+ { X86::VUNPCKLPDZrr, X86::VUNPCKLPDZrm, 0 },
+ { X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, 0 },
+ { X86::VUNPCKLPSYrr, X86::VUNPCKLPSYrm, 0 },
+ { X86::VUNPCKLPSZ128rr, X86::VUNPCKLPSZ128rm, 0 },
+ { X86::VUNPCKLPSZ256rr, X86::VUNPCKLPSZ256rm, 0 },
+ { X86::VUNPCKLPSZrr, X86::VUNPCKLPSZrm, 0 },
+ { X86::VUNPCKLPSrr, X86::VUNPCKLPSrm, 0 },
+ { X86::VXORPDYrr, X86::VXORPDYrm, 0 },
+ { X86::VXORPDZ128rr, X86::VXORPDZ128rm, 0 },
+ { X86::VXORPDZ256rr, X86::VXORPDZ256rm, 0 },
+ { X86::VXORPDZrr, X86::VXORPDZrm, 0 },
+ { X86::VXORPDrr, X86::VXORPDrm, 0 },
+ { X86::VXORPSYrr, X86::VXORPSYrm, 0 },
+ { X86::VXORPSZ128rr, X86::VXORPSZ128rm, 0 },
+ { X86::VXORPSZ256rr, X86::VXORPSZ256rm, 0 },
+ { X86::VXORPSZrr, X86::VXORPSZrm, 0 },
+ { X86::VXORPSrr, X86::VXORPSrm, 0 },
+ { X86::XOR16rr, X86::XOR16rm, 0 },
+ { X86::XOR32rr, X86::XOR32rm, 0 },
+ { X86::XOR64rr, X86::XOR64rm, 0 },
+ { X86::XOR8rr, X86::XOR8rm, 0 },
+ { X86::XORPDrr, X86::XORPDrm, TB_ALIGN_16 },
+ { X86::XORPSrr, X86::XORPSrm, TB_ALIGN_16 },
+};
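+
+// A minimal sketch (not part of the table data in this patch) of how a fold
+// table like the one above can be queried, assuming the entries are kept
+// sorted by the opcode of the register form and that X86MemoryFoldTableEntry
+// exposes KeyOp/DstOp/Flags fields; the helper name and the availability of
+// <algorithm> and ArrayRef in this file are likewise assumptions.
+static const X86MemoryFoldTableEntry *
+lookupFoldTableSketch(ArrayRef<X86MemoryFoldTableEntry> Table, unsigned RegOp) {
+  // Binary-search the sorted table on the register-form opcode.
+  const X86MemoryFoldTableEntry *I = std::lower_bound(
+      Table.begin(), Table.end(), RegOp,
+      [](const X86MemoryFoldTableEntry &E, unsigned Op) { return E.KeyOp < Op; });
+  if (I != Table.end() && I->KeyOp == RegOp)
+    return I; // I->DstOp is the memory form; I->Flags carries the TB_* bits.
+  return nullptr;
+}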
+
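+// MemoryFoldTable3 pairs the register form of an instruction with the form in
+// which the operand-3 register is replaced by a memory operand (the third
+// source of an FMA or XOP instruction, or the second source once an AVX-512
+// mask/pass-through operand is counted). The third field holds TB_* flags:
+// TB_NO_REVERSE marks entries usable only for folding, not for unfolding back
+// to the register form, and TB_ALIGN_16/32/64 record the alignment required
+// of the memory operand.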
+static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
+ { X86::VADDPDZ128rrkz, X86::VADDPDZ128rmkz, 0 },
+ { X86::VADDPDZ256rrkz, X86::VADDPDZ256rmkz, 0 },
+ { X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0 },
+ { X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0 },
+ { X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0 },
+ { X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 },
+ { X86::VADDSDZrr_Intkz, X86::VADDSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VADDSSZrr_Intkz, X86::VADDSSZrm_Intkz, TB_NO_REVERSE },
+ { X86::VALIGNDZ128rrikz, X86::VALIGNDZ128rmikz, 0 },
+ { X86::VALIGNDZ256rrikz, X86::VALIGNDZ256rmikz, 0 },
+ { X86::VALIGNDZrrikz, X86::VALIGNDZrmikz, 0 },
+ { X86::VALIGNQZ128rrikz, X86::VALIGNQZ128rmikz, 0 },
+ { X86::VALIGNQZ256rrikz, X86::VALIGNQZ256rmikz, 0 },
+ { X86::VALIGNQZrrikz, X86::VALIGNQZrmikz, 0 },
+ { X86::VANDNPDZ128rrkz, X86::VANDNPDZ128rmkz, 0 },
+ { X86::VANDNPDZ256rrkz, X86::VANDNPDZ256rmkz, 0 },
+ { X86::VANDNPDZrrkz, X86::VANDNPDZrmkz, 0 },
+ { X86::VANDNPSZ128rrkz, X86::VANDNPSZ128rmkz, 0 },
+ { X86::VANDNPSZ256rrkz, X86::VANDNPSZ256rmkz, 0 },
+ { X86::VANDNPSZrrkz, X86::VANDNPSZrmkz, 0 },
+ { X86::VANDPDZ128rrkz, X86::VANDPDZ128rmkz, 0 },
+ { X86::VANDPDZ256rrkz, X86::VANDPDZ256rmkz, 0 },
+ { X86::VANDPDZrrkz, X86::VANDPDZrmkz, 0 },
+ { X86::VANDPSZ128rrkz, X86::VANDPSZ128rmkz, 0 },
+ { X86::VANDPSZ256rrkz, X86::VANDPSZ256rmkz, 0 },
+ { X86::VANDPSZrrkz, X86::VANDPSZrmkz, 0 },
+ { X86::VBLENDMPDZ128rrk, X86::VBLENDMPDZ128rmk, 0 },
+ { X86::VBLENDMPDZ256rrk, X86::VBLENDMPDZ256rmk, 0 },
+ { X86::VBLENDMPDZrrk, X86::VBLENDMPDZrmk, 0 },
+ { X86::VBLENDMPSZ128rrk, X86::VBLENDMPSZ128rmk, 0 },
+ { X86::VBLENDMPSZ256rrk, X86::VBLENDMPSZ256rmk, 0 },
+ { X86::VBLENDMPSZrrk, X86::VBLENDMPSZrmk, 0 },
+ { X86::VBROADCASTF32X2Z256rk, X86::VBROADCASTF32X2Z256mk, TB_NO_REVERSE },
+ { X86::VBROADCASTF32X2Zrk, X86::VBROADCASTF32X2Zmk, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Z128rk, X86::VBROADCASTI32X2Z128mk, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Z256rk, X86::VBROADCASTI32X2Z256mk, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Zrk, X86::VBROADCASTI32X2Zmk, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256rk, X86::VBROADCASTSDZ256mk, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZrk, X86::VBROADCASTSDZmk, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ128rk, X86::VBROADCASTSSZ128mk, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ256rk, X86::VBROADCASTSSZ256mk, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZrk, X86::VBROADCASTSSZmk, TB_NO_REVERSE },
+ { X86::VCMPPDZ128rrik, X86::VCMPPDZ128rmik, 0 },
+ { X86::VCMPPDZ256rrik, X86::VCMPPDZ256rmik, 0 },
+ { X86::VCMPPDZrrik, X86::VCMPPDZrmik, 0 },
+ { X86::VCMPPSZ128rrik, X86::VCMPPSZ128rmik, 0 },
+ { X86::VCMPPSZ256rrik, X86::VCMPPSZ256rmik, 0 },
+ { X86::VCMPPSZrrik, X86::VCMPPSZrmik, 0 },
+ { X86::VCMPSDZrr_Intk, X86::VCMPSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VCMPSSZrr_Intk, X86::VCMPSSZrm_Intk, TB_NO_REVERSE },
+ { X86::VCVTDQ2PDZ128rrk, X86::VCVTDQ2PDZ128rmk, TB_NO_REVERSE },
+ { X86::VCVTDQ2PDZ256rrk, X86::VCVTDQ2PDZ256rmk, 0 },
+ { X86::VCVTDQ2PDZrrk, X86::VCVTDQ2PDZrmk, 0 },
+ { X86::VCVTDQ2PSZ128rrk, X86::VCVTDQ2PSZ128rmk, 0 },
+ { X86::VCVTDQ2PSZ256rrk, X86::VCVTDQ2PSZ256rmk, 0 },
+ { X86::VCVTDQ2PSZrrk, X86::VCVTDQ2PSZrmk, 0 },
+ { X86::VCVTPD2DQZ128rrk, X86::VCVTPD2DQZ128rmk, 0 },
+ { X86::VCVTPD2DQZ256rrk, X86::VCVTPD2DQZ256rmk, 0 },
+ { X86::VCVTPD2DQZrrk, X86::VCVTPD2DQZrmk, 0 },
+ { X86::VCVTPD2PSZ128rrk, X86::VCVTPD2PSZ128rmk, 0 },
+ { X86::VCVTPD2PSZ256rrk, X86::VCVTPD2PSZ256rmk, 0 },
+ { X86::VCVTPD2PSZrrk, X86::VCVTPD2PSZrmk, 0 },
+ { X86::VCVTPD2QQZ128rrk, X86::VCVTPD2QQZ128rmk, 0 },
+ { X86::VCVTPD2QQZ256rrk, X86::VCVTPD2QQZ256rmk, 0 },
+ { X86::VCVTPD2QQZrrk, X86::VCVTPD2QQZrmk, 0 },
+ { X86::VCVTPD2UDQZ128rrk, X86::VCVTPD2UDQZ128rmk, 0 },
+ { X86::VCVTPD2UDQZ256rrk, X86::VCVTPD2UDQZ256rmk, 0 },
+ { X86::VCVTPD2UDQZrrk, X86::VCVTPD2UDQZrmk, 0 },
+ { X86::VCVTPD2UQQZ128rrk, X86::VCVTPD2UQQZ128rmk, 0 },
+ { X86::VCVTPD2UQQZ256rrk, X86::VCVTPD2UQQZ256rmk, 0 },
+ { X86::VCVTPD2UQQZrrk, X86::VCVTPD2UQQZrmk, 0 },
+ { X86::VCVTPH2PSZ128rrk, X86::VCVTPH2PSZ128rmk, TB_NO_REVERSE },
+ { X86::VCVTPH2PSZ256rrk, X86::VCVTPH2PSZ256rmk, 0 },
+ { X86::VCVTPH2PSZrrk, X86::VCVTPH2PSZrmk, 0 },
+ { X86::VCVTPS2DQZ128rrk, X86::VCVTPS2DQZ128rmk, 0 },
+ { X86::VCVTPS2DQZ256rrk, X86::VCVTPS2DQZ256rmk, 0 },
+ { X86::VCVTPS2DQZrrk, X86::VCVTPS2DQZrmk, 0 },
+ { X86::VCVTPS2PDZ128rrk, X86::VCVTPS2PDZ128rmk, TB_NO_REVERSE },
+ { X86::VCVTPS2PDZ256rrk, X86::VCVTPS2PDZ256rmk, 0 },
+ { X86::VCVTPS2PDZrrk, X86::VCVTPS2PDZrmk, 0 },
+ { X86::VCVTPS2QQZ128rrk, X86::VCVTPS2QQZ128rmk, TB_NO_REVERSE },
+ { X86::VCVTPS2QQZ256rrk, X86::VCVTPS2QQZ256rmk, 0 },
+ { X86::VCVTPS2QQZrrk, X86::VCVTPS2QQZrmk, 0 },
+ { X86::VCVTPS2UDQZ128rrk, X86::VCVTPS2UDQZ128rmk, 0 },
+ { X86::VCVTPS2UDQZ256rrk, X86::VCVTPS2UDQZ256rmk, 0 },
+ { X86::VCVTPS2UDQZrrk, X86::VCVTPS2UDQZrmk, 0 },
+ { X86::VCVTPS2UQQZ128rrk, X86::VCVTPS2UQQZ128rmk, TB_NO_REVERSE },
+ { X86::VCVTPS2UQQZ256rrk, X86::VCVTPS2UQQZ256rmk, 0 },
+ { X86::VCVTPS2UQQZrrk, X86::VCVTPS2UQQZrmk, 0 },
+ { X86::VCVTQQ2PDZ128rrk, X86::VCVTQQ2PDZ128rmk, 0 },
+ { X86::VCVTQQ2PDZ256rrk, X86::VCVTQQ2PDZ256rmk, 0 },
+ { X86::VCVTQQ2PDZrrk, X86::VCVTQQ2PDZrmk, 0 },
+ { X86::VCVTQQ2PSZ128rrk, X86::VCVTQQ2PSZ128rmk, 0 },
+ { X86::VCVTQQ2PSZ256rrk, X86::VCVTQQ2PSZ256rmk, 0 },
+ { X86::VCVTQQ2PSZrrk, X86::VCVTQQ2PSZrmk, 0 },
+ { X86::VCVTSD2SSZrr_Intkz, X86::VCVTSD2SSZrm_Intkz, TB_NO_REVERSE },
+ { X86::VCVTSS2SDZrr_Intkz, X86::VCVTSS2SDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VCVTTPD2DQZ128rrk, X86::VCVTTPD2DQZ128rmk, 0 },
+ { X86::VCVTTPD2DQZ256rrk, X86::VCVTTPD2DQZ256rmk, 0 },
+ { X86::VCVTTPD2DQZrrk, X86::VCVTTPD2DQZrmk, 0 },
+ { X86::VCVTTPD2QQZ128rrk, X86::VCVTTPD2QQZ128rmk, 0 },
+ { X86::VCVTTPD2QQZ256rrk, X86::VCVTTPD2QQZ256rmk, 0 },
+ { X86::VCVTTPD2QQZrrk, X86::VCVTTPD2QQZrmk, 0 },
+ { X86::VCVTTPD2UDQZ128rrk, X86::VCVTTPD2UDQZ128rmk, 0 },
+ { X86::VCVTTPD2UDQZ256rrk, X86::VCVTTPD2UDQZ256rmk, 0 },
+ { X86::VCVTTPD2UDQZrrk, X86::VCVTTPD2UDQZrmk, 0 },
+ { X86::VCVTTPD2UQQZ128rrk, X86::VCVTTPD2UQQZ128rmk, 0 },
+ { X86::VCVTTPD2UQQZ256rrk, X86::VCVTTPD2UQQZ256rmk, 0 },
+ { X86::VCVTTPD2UQQZrrk, X86::VCVTTPD2UQQZrmk, 0 },
+ { X86::VCVTTPS2DQZ128rrk, X86::VCVTTPS2DQZ128rmk, 0 },
+ { X86::VCVTTPS2DQZ256rrk, X86::VCVTTPS2DQZ256rmk, 0 },
+ { X86::VCVTTPS2DQZrrk, X86::VCVTTPS2DQZrmk, 0 },
+ { X86::VCVTTPS2QQZ128rrk, X86::VCVTTPS2QQZ128rmk, TB_NO_REVERSE },
+ { X86::VCVTTPS2QQZ256rrk, X86::VCVTTPS2QQZ256rmk, 0 },
+ { X86::VCVTTPS2QQZrrk, X86::VCVTTPS2QQZrmk, 0 },
+ { X86::VCVTTPS2UDQZ128rrk, X86::VCVTTPS2UDQZ128rmk, 0 },
+ { X86::VCVTTPS2UDQZ256rrk, X86::VCVTTPS2UDQZ256rmk, 0 },
+ { X86::VCVTTPS2UDQZrrk, X86::VCVTTPS2UDQZrmk, 0 },
+ { X86::VCVTTPS2UQQZ128rrk, X86::VCVTTPS2UQQZ128rmk, TB_NO_REVERSE },
+ { X86::VCVTTPS2UQQZ256rrk, X86::VCVTTPS2UQQZ256rmk, 0 },
+ { X86::VCVTTPS2UQQZrrk, X86::VCVTTPS2UQQZrmk, 0 },
+ { X86::VCVTUDQ2PDZ128rrk, X86::VCVTUDQ2PDZ128rmk, TB_NO_REVERSE },
+ { X86::VCVTUDQ2PDZ256rrk, X86::VCVTUDQ2PDZ256rmk, 0 },
+ { X86::VCVTUDQ2PDZrrk, X86::VCVTUDQ2PDZrmk, 0 },
+ { X86::VCVTUDQ2PSZ128rrk, X86::VCVTUDQ2PSZ128rmk, 0 },
+ { X86::VCVTUDQ2PSZ256rrk, X86::VCVTUDQ2PSZ256rmk, 0 },
+ { X86::VCVTUDQ2PSZrrk, X86::VCVTUDQ2PSZrmk, 0 },
+ { X86::VCVTUQQ2PDZ128rrk, X86::VCVTUQQ2PDZ128rmk, 0 },
+ { X86::VCVTUQQ2PDZ256rrk, X86::VCVTUQQ2PDZ256rmk, 0 },
+ { X86::VCVTUQQ2PDZrrk, X86::VCVTUQQ2PDZrmk, 0 },
+ { X86::VCVTUQQ2PSZ128rrk, X86::VCVTUQQ2PSZ128rmk, 0 },
+ { X86::VCVTUQQ2PSZ256rrk, X86::VCVTUQQ2PSZ256rmk, 0 },
+ { X86::VCVTUQQ2PSZrrk, X86::VCVTUQQ2PSZrmk, 0 },
+ { X86::VDBPSADBWZ128rrikz, X86::VDBPSADBWZ128rmikz, 0 },
+ { X86::VDBPSADBWZ256rrikz, X86::VDBPSADBWZ256rmikz, 0 },
+ { X86::VDBPSADBWZrrikz, X86::VDBPSADBWZrmikz, 0 },
+ { X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmkz, 0 },
+ { X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmkz, 0 },
+ { X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0 },
+ { X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0 },
+ { X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0 },
+ { X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 },
+ { X86::VDIVSDZrr_Intkz, X86::VDIVSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VDIVSSZrr_Intkz, X86::VDIVSSZrm_Intkz, TB_NO_REVERSE },
+ { X86::VEXP2PDZrk, X86::VEXP2PDZmk, 0 },
+ { X86::VEXP2PSZrk, X86::VEXP2PSZmk, 0 },
+ { X86::VEXPANDPDZ128rrk, X86::VEXPANDPDZ128rmk, TB_NO_REVERSE },
+ { X86::VEXPANDPDZ256rrk, X86::VEXPANDPDZ256rmk, TB_NO_REVERSE },
+ { X86::VEXPANDPDZrrk, X86::VEXPANDPDZrmk, TB_NO_REVERSE },
+ { X86::VEXPANDPSZ128rrk, X86::VEXPANDPSZ128rmk, TB_NO_REVERSE },
+ { X86::VEXPANDPSZ256rrk, X86::VEXPANDPSZ256rmk, TB_NO_REVERSE },
+ { X86::VEXPANDPSZrrk, X86::VEXPANDPSZrmk, TB_NO_REVERSE },
+ { X86::VFIXUPIMMPDZ128rri, X86::VFIXUPIMMPDZ128rmi, 0 },
+ { X86::VFIXUPIMMPDZ256rri, X86::VFIXUPIMMPDZ256rmi, 0 },
+ { X86::VFIXUPIMMPDZrri, X86::VFIXUPIMMPDZrmi, 0 },
+ { X86::VFIXUPIMMPSZ128rri, X86::VFIXUPIMMPSZ128rmi, 0 },
+ { X86::VFIXUPIMMPSZ256rri, X86::VFIXUPIMMPSZ256rmi, 0 },
+ { X86::VFIXUPIMMPSZrri, X86::VFIXUPIMMPSZrmi, 0 },
+ { X86::VFIXUPIMMSDZrri, X86::VFIXUPIMMSDZrmi, TB_NO_REVERSE },
+ { X86::VFIXUPIMMSSZrri, X86::VFIXUPIMMSSZrmi, TB_NO_REVERSE },
+ { X86::VFMADD132PDYr, X86::VFMADD132PDYm, 0 },
+ { X86::VFMADD132PDZ128r, X86::VFMADD132PDZ128m, 0 },
+ { X86::VFMADD132PDZ256r, X86::VFMADD132PDZ256m, 0 },
+ { X86::VFMADD132PDZr, X86::VFMADD132PDZm, 0 },
+ { X86::VFMADD132PDr, X86::VFMADD132PDm, 0 },
+ { X86::VFMADD132PSYr, X86::VFMADD132PSYm, 0 },
+ { X86::VFMADD132PSZ128r, X86::VFMADD132PSZ128m, 0 },
+ { X86::VFMADD132PSZ256r, X86::VFMADD132PSZ256m, 0 },
+ { X86::VFMADD132PSZr, X86::VFMADD132PSZm, 0 },
+ { X86::VFMADD132PSr, X86::VFMADD132PSm, 0 },
+ { X86::VFMADD132SDZr, X86::VFMADD132SDZm, 0 },
+ { X86::VFMADD132SDZr_Int, X86::VFMADD132SDZm_Int, TB_NO_REVERSE },
+ { X86::VFMADD132SDr, X86::VFMADD132SDm, 0 },
+ { X86::VFMADD132SDr_Int, X86::VFMADD132SDm_Int, TB_NO_REVERSE },
+ { X86::VFMADD132SSZr, X86::VFMADD132SSZm, 0 },
+ { X86::VFMADD132SSZr_Int, X86::VFMADD132SSZm_Int, TB_NO_REVERSE },
+ { X86::VFMADD132SSr, X86::VFMADD132SSm, 0 },
+ { X86::VFMADD132SSr_Int, X86::VFMADD132SSm_Int, TB_NO_REVERSE },
+ { X86::VFMADD213PDYr, X86::VFMADD213PDYm, 0 },
+ { X86::VFMADD213PDZ128r, X86::VFMADD213PDZ128m, 0 },
+ { X86::VFMADD213PDZ256r, X86::VFMADD213PDZ256m, 0 },
+ { X86::VFMADD213PDZr, X86::VFMADD213PDZm, 0 },
+ { X86::VFMADD213PDr, X86::VFMADD213PDm, 0 },
+ { X86::VFMADD213PSYr, X86::VFMADD213PSYm, 0 },
+ { X86::VFMADD213PSZ128r, X86::VFMADD213PSZ128m, 0 },
+ { X86::VFMADD213PSZ256r, X86::VFMADD213PSZ256m, 0 },
+ { X86::VFMADD213PSZr, X86::VFMADD213PSZm, 0 },
+ { X86::VFMADD213PSr, X86::VFMADD213PSm, 0 },
+ { X86::VFMADD213SDZr, X86::VFMADD213SDZm, 0 },
+ { X86::VFMADD213SDZr_Int, X86::VFMADD213SDZm_Int, TB_NO_REVERSE },
+ { X86::VFMADD213SDr, X86::VFMADD213SDm, 0 },
+ { X86::VFMADD213SDr_Int, X86::VFMADD213SDm_Int, TB_NO_REVERSE },
+ { X86::VFMADD213SSZr, X86::VFMADD213SSZm, 0 },
+ { X86::VFMADD213SSZr_Int, X86::VFMADD213SSZm_Int, TB_NO_REVERSE },
+ { X86::VFMADD213SSr, X86::VFMADD213SSm, 0 },
+ { X86::VFMADD213SSr_Int, X86::VFMADD213SSm_Int, TB_NO_REVERSE },
+ { X86::VFMADD231PDYr, X86::VFMADD231PDYm, 0 },
+ { X86::VFMADD231PDZ128r, X86::VFMADD231PDZ128m, 0 },
+ { X86::VFMADD231PDZ256r, X86::VFMADD231PDZ256m, 0 },
+ { X86::VFMADD231PDZr, X86::VFMADD231PDZm, 0 },
+ { X86::VFMADD231PDr, X86::VFMADD231PDm, 0 },
+ { X86::VFMADD231PSYr, X86::VFMADD231PSYm, 0 },
+ { X86::VFMADD231PSZ128r, X86::VFMADD231PSZ128m, 0 },
+ { X86::VFMADD231PSZ256r, X86::VFMADD231PSZ256m, 0 },
+ { X86::VFMADD231PSZr, X86::VFMADD231PSZm, 0 },
+ { X86::VFMADD231PSr, X86::VFMADD231PSm, 0 },
+ { X86::VFMADD231SDZr, X86::VFMADD231SDZm, 0 },
+ { X86::VFMADD231SDZr_Int, X86::VFMADD231SDZm_Int, TB_NO_REVERSE },
+ { X86::VFMADD231SDr, X86::VFMADD231SDm, 0 },
+ { X86::VFMADD231SDr_Int, X86::VFMADD231SDm_Int, TB_NO_REVERSE },
+ { X86::VFMADD231SSZr, X86::VFMADD231SSZm, 0 },
+ { X86::VFMADD231SSZr_Int, X86::VFMADD231SSZm_Int, TB_NO_REVERSE },
+ { X86::VFMADD231SSr, X86::VFMADD231SSm, 0 },
+ { X86::VFMADD231SSr_Int, X86::VFMADD231SSm_Int, TB_NO_REVERSE },
+ { X86::VFMADDPD4Yrr, X86::VFMADDPD4Yrm, 0 },
+ { X86::VFMADDPD4rr, X86::VFMADDPD4rm, 0 },
+ { X86::VFMADDPS4Yrr, X86::VFMADDPS4Yrm, 0 },
+ { X86::VFMADDPS4rr, X86::VFMADDPS4rm, 0 },
+ { X86::VFMADDSD4rr, X86::VFMADDSD4rm, 0 },
+ { X86::VFMADDSD4rr_Int, X86::VFMADDSD4rm_Int, TB_NO_REVERSE },
+ { X86::VFMADDSS4rr, X86::VFMADDSS4rm, 0 },
+ { X86::VFMADDSS4rr_Int, X86::VFMADDSS4rm_Int, TB_NO_REVERSE },
+ { X86::VFMADDSUB132PDYr, X86::VFMADDSUB132PDYm, 0 },
+ { X86::VFMADDSUB132PDZ128r, X86::VFMADDSUB132PDZ128m, 0 },
+ { X86::VFMADDSUB132PDZ256r, X86::VFMADDSUB132PDZ256m, 0 },
+ { X86::VFMADDSUB132PDZr, X86::VFMADDSUB132PDZm, 0 },
+ { X86::VFMADDSUB132PDr, X86::VFMADDSUB132PDm, 0 },
+ { X86::VFMADDSUB132PSYr, X86::VFMADDSUB132PSYm, 0 },
+ { X86::VFMADDSUB132PSZ128r, X86::VFMADDSUB132PSZ128m, 0 },
+ { X86::VFMADDSUB132PSZ256r, X86::VFMADDSUB132PSZ256m, 0 },
+ { X86::VFMADDSUB132PSZr, X86::VFMADDSUB132PSZm, 0 },
+ { X86::VFMADDSUB132PSr, X86::VFMADDSUB132PSm, 0 },
+ { X86::VFMADDSUB213PDYr, X86::VFMADDSUB213PDYm, 0 },
+ { X86::VFMADDSUB213PDZ128r, X86::VFMADDSUB213PDZ128m, 0 },
+ { X86::VFMADDSUB213PDZ256r, X86::VFMADDSUB213PDZ256m, 0 },
+ { X86::VFMADDSUB213PDZr, X86::VFMADDSUB213PDZm, 0 },
+ { X86::VFMADDSUB213PDr, X86::VFMADDSUB213PDm, 0 },
+ { X86::VFMADDSUB213PSYr, X86::VFMADDSUB213PSYm, 0 },
+ { X86::VFMADDSUB213PSZ128r, X86::VFMADDSUB213PSZ128m, 0 },
+ { X86::VFMADDSUB213PSZ256r, X86::VFMADDSUB213PSZ256m, 0 },
+ { X86::VFMADDSUB213PSZr, X86::VFMADDSUB213PSZm, 0 },
+ { X86::VFMADDSUB213PSr, X86::VFMADDSUB213PSm, 0 },
+ { X86::VFMADDSUB231PDYr, X86::VFMADDSUB231PDYm, 0 },
+ { X86::VFMADDSUB231PDZ128r, X86::VFMADDSUB231PDZ128m, 0 },
+ { X86::VFMADDSUB231PDZ256r, X86::VFMADDSUB231PDZ256m, 0 },
+ { X86::VFMADDSUB231PDZr, X86::VFMADDSUB231PDZm, 0 },
+ { X86::VFMADDSUB231PDr, X86::VFMADDSUB231PDm, 0 },
+ { X86::VFMADDSUB231PSYr, X86::VFMADDSUB231PSYm, 0 },
+ { X86::VFMADDSUB231PSZ128r, X86::VFMADDSUB231PSZ128m, 0 },
+ { X86::VFMADDSUB231PSZ256r, X86::VFMADDSUB231PSZ256m, 0 },
+ { X86::VFMADDSUB231PSZr, X86::VFMADDSUB231PSZm, 0 },
+ { X86::VFMADDSUB231PSr, X86::VFMADDSUB231PSm, 0 },
+ { X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Yrm, 0 },
+ { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, 0 },
+ { X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Yrm, 0 },
+ { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, 0 },
+ { X86::VFMSUB132PDYr, X86::VFMSUB132PDYm, 0 },
+ { X86::VFMSUB132PDZ128r, X86::VFMSUB132PDZ128m, 0 },
+ { X86::VFMSUB132PDZ256r, X86::VFMSUB132PDZ256m, 0 },
+ { X86::VFMSUB132PDZr, X86::VFMSUB132PDZm, 0 },
+ { X86::VFMSUB132PDr, X86::VFMSUB132PDm, 0 },
+ { X86::VFMSUB132PSYr, X86::VFMSUB132PSYm, 0 },
+ { X86::VFMSUB132PSZ128r, X86::VFMSUB132PSZ128m, 0 },
+ { X86::VFMSUB132PSZ256r, X86::VFMSUB132PSZ256m, 0 },
+ { X86::VFMSUB132PSZr, X86::VFMSUB132PSZm, 0 },
+ { X86::VFMSUB132PSr, X86::VFMSUB132PSm, 0 },
+ { X86::VFMSUB132SDZr, X86::VFMSUB132SDZm, 0 },
+ { X86::VFMSUB132SDZr_Int, X86::VFMSUB132SDZm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB132SDr, X86::VFMSUB132SDm, 0 },
+ { X86::VFMSUB132SDr_Int, X86::VFMSUB132SDm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB132SSZr, X86::VFMSUB132SSZm, 0 },
+ { X86::VFMSUB132SSZr_Int, X86::VFMSUB132SSZm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB132SSr, X86::VFMSUB132SSm, 0 },
+ { X86::VFMSUB132SSr_Int, X86::VFMSUB132SSm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB213PDYr, X86::VFMSUB213PDYm, 0 },
+ { X86::VFMSUB213PDZ128r, X86::VFMSUB213PDZ128m, 0 },
+ { X86::VFMSUB213PDZ256r, X86::VFMSUB213PDZ256m, 0 },
+ { X86::VFMSUB213PDZr, X86::VFMSUB213PDZm, 0 },
+ { X86::VFMSUB213PDr, X86::VFMSUB213PDm, 0 },
+ { X86::VFMSUB213PSYr, X86::VFMSUB213PSYm, 0 },
+ { X86::VFMSUB213PSZ128r, X86::VFMSUB213PSZ128m, 0 },
+ { X86::VFMSUB213PSZ256r, X86::VFMSUB213PSZ256m, 0 },
+ { X86::VFMSUB213PSZr, X86::VFMSUB213PSZm, 0 },
+ { X86::VFMSUB213PSr, X86::VFMSUB213PSm, 0 },
+ { X86::VFMSUB213SDZr, X86::VFMSUB213SDZm, 0 },
+ { X86::VFMSUB213SDZr_Int, X86::VFMSUB213SDZm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB213SDr, X86::VFMSUB213SDm, 0 },
+ { X86::VFMSUB213SDr_Int, X86::VFMSUB213SDm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB213SSZr, X86::VFMSUB213SSZm, 0 },
+ { X86::VFMSUB213SSZr_Int, X86::VFMSUB213SSZm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB213SSr, X86::VFMSUB213SSm, 0 },
+ { X86::VFMSUB213SSr_Int, X86::VFMSUB213SSm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB231PDYr, X86::VFMSUB231PDYm, 0 },
+ { X86::VFMSUB231PDZ128r, X86::VFMSUB231PDZ128m, 0 },
+ { X86::VFMSUB231PDZ256r, X86::VFMSUB231PDZ256m, 0 },
+ { X86::VFMSUB231PDZr, X86::VFMSUB231PDZm, 0 },
+ { X86::VFMSUB231PDr, X86::VFMSUB231PDm, 0 },
+ { X86::VFMSUB231PSYr, X86::VFMSUB231PSYm, 0 },
+ { X86::VFMSUB231PSZ128r, X86::VFMSUB231PSZ128m, 0 },
+ { X86::VFMSUB231PSZ256r, X86::VFMSUB231PSZ256m, 0 },
+ { X86::VFMSUB231PSZr, X86::VFMSUB231PSZm, 0 },
+ { X86::VFMSUB231PSr, X86::VFMSUB231PSm, 0 },
+ { X86::VFMSUB231SDZr, X86::VFMSUB231SDZm, 0 },
+ { X86::VFMSUB231SDZr_Int, X86::VFMSUB231SDZm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB231SDr, X86::VFMSUB231SDm, 0 },
+ { X86::VFMSUB231SDr_Int, X86::VFMSUB231SDm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB231SSZr, X86::VFMSUB231SSZm, 0 },
+ { X86::VFMSUB231SSZr_Int, X86::VFMSUB231SSZm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB231SSr, X86::VFMSUB231SSm, 0 },
+ { X86::VFMSUB231SSr_Int, X86::VFMSUB231SSm_Int, TB_NO_REVERSE },
+ { X86::VFMSUBADD132PDYr, X86::VFMSUBADD132PDYm, 0 },
+ { X86::VFMSUBADD132PDZ128r, X86::VFMSUBADD132PDZ128m, 0 },
+ { X86::VFMSUBADD132PDZ256r, X86::VFMSUBADD132PDZ256m, 0 },
+ { X86::VFMSUBADD132PDZr, X86::VFMSUBADD132PDZm, 0 },
+ { X86::VFMSUBADD132PDr, X86::VFMSUBADD132PDm, 0 },
+ { X86::VFMSUBADD132PSYr, X86::VFMSUBADD132PSYm, 0 },
+ { X86::VFMSUBADD132PSZ128r, X86::VFMSUBADD132PSZ128m, 0 },
+ { X86::VFMSUBADD132PSZ256r, X86::VFMSUBADD132PSZ256m, 0 },
+ { X86::VFMSUBADD132PSZr, X86::VFMSUBADD132PSZm, 0 },
+ { X86::VFMSUBADD132PSr, X86::VFMSUBADD132PSm, 0 },
+ { X86::VFMSUBADD213PDYr, X86::VFMSUBADD213PDYm, 0 },
+ { X86::VFMSUBADD213PDZ128r, X86::VFMSUBADD213PDZ128m, 0 },
+ { X86::VFMSUBADD213PDZ256r, X86::VFMSUBADD213PDZ256m, 0 },
+ { X86::VFMSUBADD213PDZr, X86::VFMSUBADD213PDZm, 0 },
+ { X86::VFMSUBADD213PDr, X86::VFMSUBADD213PDm, 0 },
+ { X86::VFMSUBADD213PSYr, X86::VFMSUBADD213PSYm, 0 },
+ { X86::VFMSUBADD213PSZ128r, X86::VFMSUBADD213PSZ128m, 0 },
+ { X86::VFMSUBADD213PSZ256r, X86::VFMSUBADD213PSZ256m, 0 },
+ { X86::VFMSUBADD213PSZr, X86::VFMSUBADD213PSZm, 0 },
+ { X86::VFMSUBADD213PSr, X86::VFMSUBADD213PSm, 0 },
+ { X86::VFMSUBADD231PDYr, X86::VFMSUBADD231PDYm, 0 },
+ { X86::VFMSUBADD231PDZ128r, X86::VFMSUBADD231PDZ128m, 0 },
+ { X86::VFMSUBADD231PDZ256r, X86::VFMSUBADD231PDZ256m, 0 },
+ { X86::VFMSUBADD231PDZr, X86::VFMSUBADD231PDZm, 0 },
+ { X86::VFMSUBADD231PDr, X86::VFMSUBADD231PDm, 0 },
+ { X86::VFMSUBADD231PSYr, X86::VFMSUBADD231PSYm, 0 },
+ { X86::VFMSUBADD231PSZ128r, X86::VFMSUBADD231PSZ128m, 0 },
+ { X86::VFMSUBADD231PSZ256r, X86::VFMSUBADD231PSZ256m, 0 },
+ { X86::VFMSUBADD231PSZr, X86::VFMSUBADD231PSZm, 0 },
+ { X86::VFMSUBADD231PSr, X86::VFMSUBADD231PSm, 0 },
+ { X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Yrm, 0 },
+ { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, 0 },
+ { X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Yrm, 0 },
+ { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, 0 },
+ { X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Yrm, 0 },
+ { X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, 0 },
+ { X86::VFMSUBPS4Yrr, X86::VFMSUBPS4Yrm, 0 },
+ { X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, 0 },
+ { X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, 0 },
+ { X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4rm_Int, TB_NO_REVERSE },
+ { X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, 0 },
+ { X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4rm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD132PDYr, X86::VFNMADD132PDYm, 0 },
+ { X86::VFNMADD132PDZ128r, X86::VFNMADD132PDZ128m, 0 },
+ { X86::VFNMADD132PDZ256r, X86::VFNMADD132PDZ256m, 0 },
+ { X86::VFNMADD132PDZr, X86::VFNMADD132PDZm, 0 },
+ { X86::VFNMADD132PDr, X86::VFNMADD132PDm, 0 },
+ { X86::VFNMADD132PSYr, X86::VFNMADD132PSYm, 0 },
+ { X86::VFNMADD132PSZ128r, X86::VFNMADD132PSZ128m, 0 },
+ { X86::VFNMADD132PSZ256r, X86::VFNMADD132PSZ256m, 0 },
+ { X86::VFNMADD132PSZr, X86::VFNMADD132PSZm, 0 },
+ { X86::VFNMADD132PSr, X86::VFNMADD132PSm, 0 },
+ { X86::VFNMADD132SDZr, X86::VFNMADD132SDZm, 0 },
+ { X86::VFNMADD132SDZr_Int, X86::VFNMADD132SDZm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD132SDr, X86::VFNMADD132SDm, 0 },
+ { X86::VFNMADD132SDr_Int, X86::VFNMADD132SDm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD132SSZr, X86::VFNMADD132SSZm, 0 },
+ { X86::VFNMADD132SSZr_Int, X86::VFNMADD132SSZm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD132SSr, X86::VFNMADD132SSm, 0 },
+ { X86::VFNMADD132SSr_Int, X86::VFNMADD132SSm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD213PDYr, X86::VFNMADD213PDYm, 0 },
+ { X86::VFNMADD213PDZ128r, X86::VFNMADD213PDZ128m, 0 },
+ { X86::VFNMADD213PDZ256r, X86::VFNMADD213PDZ256m, 0 },
+ { X86::VFNMADD213PDZr, X86::VFNMADD213PDZm, 0 },
+ { X86::VFNMADD213PDr, X86::VFNMADD213PDm, 0 },
+ { X86::VFNMADD213PSYr, X86::VFNMADD213PSYm, 0 },
+ { X86::VFNMADD213PSZ128r, X86::VFNMADD213PSZ128m, 0 },
+ { X86::VFNMADD213PSZ256r, X86::VFNMADD213PSZ256m, 0 },
+ { X86::VFNMADD213PSZr, X86::VFNMADD213PSZm, 0 },
+ { X86::VFNMADD213PSr, X86::VFNMADD213PSm, 0 },
+ { X86::VFNMADD213SDZr, X86::VFNMADD213SDZm, 0 },
+ { X86::VFNMADD213SDZr_Int, X86::VFNMADD213SDZm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD213SDr, X86::VFNMADD213SDm, 0 },
+ { X86::VFNMADD213SDr_Int, X86::VFNMADD213SDm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD213SSZr, X86::VFNMADD213SSZm, 0 },
+ { X86::VFNMADD213SSZr_Int, X86::VFNMADD213SSZm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD213SSr, X86::VFNMADD213SSm, 0 },
+ { X86::VFNMADD213SSr_Int, X86::VFNMADD213SSm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD231PDYr, X86::VFNMADD231PDYm, 0 },
+ { X86::VFNMADD231PDZ128r, X86::VFNMADD231PDZ128m, 0 },
+ { X86::VFNMADD231PDZ256r, X86::VFNMADD231PDZ256m, 0 },
+ { X86::VFNMADD231PDZr, X86::VFNMADD231PDZm, 0 },
+ { X86::VFNMADD231PDr, X86::VFNMADD231PDm, 0 },
+ { X86::VFNMADD231PSYr, X86::VFNMADD231PSYm, 0 },
+ { X86::VFNMADD231PSZ128r, X86::VFNMADD231PSZ128m, 0 },
+ { X86::VFNMADD231PSZ256r, X86::VFNMADD231PSZ256m, 0 },
+ { X86::VFNMADD231PSZr, X86::VFNMADD231PSZm, 0 },
+ { X86::VFNMADD231PSr, X86::VFNMADD231PSm, 0 },
+ { X86::VFNMADD231SDZr, X86::VFNMADD231SDZm, 0 },
+ { X86::VFNMADD231SDZr_Int, X86::VFNMADD231SDZm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD231SDr, X86::VFNMADD231SDm, 0 },
+ { X86::VFNMADD231SDr_Int, X86::VFNMADD231SDm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD231SSZr, X86::VFNMADD231SSZm, 0 },
+ { X86::VFNMADD231SSZr_Int, X86::VFNMADD231SSZm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD231SSr, X86::VFNMADD231SSm, 0 },
+ { X86::VFNMADD231SSr_Int, X86::VFNMADD231SSm_Int, TB_NO_REVERSE },
+ { X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Yrm, 0 },
+ { X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, 0 },
+ { X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Yrm, 0 },
+ { X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, 0 },
+ { X86::VFNMADDSD4rr, X86::VFNMADDSD4rm, 0 },
+ { X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4rm_Int, TB_NO_REVERSE },
+ { X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, 0 },
+ { X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4rm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB132PDYr, X86::VFNMSUB132PDYm, 0 },
+ { X86::VFNMSUB132PDZ128r, X86::VFNMSUB132PDZ128m, 0 },
+ { X86::VFNMSUB132PDZ256r, X86::VFNMSUB132PDZ256m, 0 },
+ { X86::VFNMSUB132PDZr, X86::VFNMSUB132PDZm, 0 },
+ { X86::VFNMSUB132PDr, X86::VFNMSUB132PDm, 0 },
+ { X86::VFNMSUB132PSYr, X86::VFNMSUB132PSYm, 0 },
+ { X86::VFNMSUB132PSZ128r, X86::VFNMSUB132PSZ128m, 0 },
+ { X86::VFNMSUB132PSZ256r, X86::VFNMSUB132PSZ256m, 0 },
+ { X86::VFNMSUB132PSZr, X86::VFNMSUB132PSZm, 0 },
+ { X86::VFNMSUB132PSr, X86::VFNMSUB132PSm, 0 },
+ { X86::VFNMSUB132SDZr, X86::VFNMSUB132SDZm, 0 },
+ { X86::VFNMSUB132SDZr_Int, X86::VFNMSUB132SDZm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB132SDr, X86::VFNMSUB132SDm, 0 },
+ { X86::VFNMSUB132SDr_Int, X86::VFNMSUB132SDm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB132SSZr, X86::VFNMSUB132SSZm, 0 },
+ { X86::VFNMSUB132SSZr_Int, X86::VFNMSUB132SSZm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB132SSr, X86::VFNMSUB132SSm, 0 },
+ { X86::VFNMSUB132SSr_Int, X86::VFNMSUB132SSm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB213PDYr, X86::VFNMSUB213PDYm, 0 },
+ { X86::VFNMSUB213PDZ128r, X86::VFNMSUB213PDZ128m, 0 },
+ { X86::VFNMSUB213PDZ256r, X86::VFNMSUB213PDZ256m, 0 },
+ { X86::VFNMSUB213PDZr, X86::VFNMSUB213PDZm, 0 },
+ { X86::VFNMSUB213PDr, X86::VFNMSUB213PDm, 0 },
+ { X86::VFNMSUB213PSYr, X86::VFNMSUB213PSYm, 0 },
+ { X86::VFNMSUB213PSZ128r, X86::VFNMSUB213PSZ128m, 0 },
+ { X86::VFNMSUB213PSZ256r, X86::VFNMSUB213PSZ256m, 0 },
+ { X86::VFNMSUB213PSZr, X86::VFNMSUB213PSZm, 0 },
+ { X86::VFNMSUB213PSr, X86::VFNMSUB213PSm, 0 },
+ { X86::VFNMSUB213SDZr, X86::VFNMSUB213SDZm, 0 },
+ { X86::VFNMSUB213SDZr_Int, X86::VFNMSUB213SDZm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB213SDr, X86::VFNMSUB213SDm, 0 },
+ { X86::VFNMSUB213SDr_Int, X86::VFNMSUB213SDm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB213SSZr, X86::VFNMSUB213SSZm, 0 },
+ { X86::VFNMSUB213SSZr_Int, X86::VFNMSUB213SSZm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB213SSr, X86::VFNMSUB213SSm, 0 },
+ { X86::VFNMSUB213SSr_Int, X86::VFNMSUB213SSm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB231PDYr, X86::VFNMSUB231PDYm, 0 },
+ { X86::VFNMSUB231PDZ128r, X86::VFNMSUB231PDZ128m, 0 },
+ { X86::VFNMSUB231PDZ256r, X86::VFNMSUB231PDZ256m, 0 },
+ { X86::VFNMSUB231PDZr, X86::VFNMSUB231PDZm, 0 },
+ { X86::VFNMSUB231PDr, X86::VFNMSUB231PDm, 0 },
+ { X86::VFNMSUB231PSYr, X86::VFNMSUB231PSYm, 0 },
+ { X86::VFNMSUB231PSZ128r, X86::VFNMSUB231PSZ128m, 0 },
+ { X86::VFNMSUB231PSZ256r, X86::VFNMSUB231PSZ256m, 0 },
+ { X86::VFNMSUB231PSZr, X86::VFNMSUB231PSZm, 0 },
+ { X86::VFNMSUB231PSr, X86::VFNMSUB231PSm, 0 },
+ { X86::VFNMSUB231SDZr, X86::VFNMSUB231SDZm, 0 },
+ { X86::VFNMSUB231SDZr_Int, X86::VFNMSUB231SDZm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB231SDr, X86::VFNMSUB231SDm, 0 },
+ { X86::VFNMSUB231SDr_Int, X86::VFNMSUB231SDm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB231SSZr, X86::VFNMSUB231SSZm, 0 },
+ { X86::VFNMSUB231SSZr_Int, X86::VFNMSUB231SSZm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB231SSr, X86::VFNMSUB231SSm, 0 },
+ { X86::VFNMSUB231SSr_Int, X86::VFNMSUB231SSm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Yrm, 0 },
+ { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, 0 },
+ { X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Yrm, 0 },
+ { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, 0 },
+ { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4rm, 0 },
+ { X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4rm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, 0 },
+ { X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4rm_Int, TB_NO_REVERSE },
+ { X86::VGETEXPPDZ128rk, X86::VGETEXPPDZ128mk, 0 },
+ { X86::VGETEXPPDZ256rk, X86::VGETEXPPDZ256mk, 0 },
+ { X86::VGETEXPPDZrk, X86::VGETEXPPDZmk, 0 },
+ { X86::VGETEXPPSZ128rk, X86::VGETEXPPSZ128mk, 0 },
+ { X86::VGETEXPPSZ256rk, X86::VGETEXPPSZ256mk, 0 },
+ { X86::VGETEXPPSZrk, X86::VGETEXPPSZmk, 0 },
+ { X86::VGETEXPSDZrkz, X86::VGETEXPSDZmkz, TB_NO_REVERSE },
+ { X86::VGETEXPSSZrkz, X86::VGETEXPSSZmkz, TB_NO_REVERSE },
+ { X86::VGETMANTPDZ128rrik, X86::VGETMANTPDZ128rmik, 0 },
+ { X86::VGETMANTPDZ256rrik, X86::VGETMANTPDZ256rmik, 0 },
+ { X86::VGETMANTPDZrrik, X86::VGETMANTPDZrmik, 0 },
+ { X86::VGETMANTPSZ128rrik, X86::VGETMANTPSZ128rmik, 0 },
+ { X86::VGETMANTPSZ256rrik, X86::VGETMANTPSZ256rmik, 0 },
+ { X86::VGETMANTPSZrrik, X86::VGETMANTPSZrmik, 0 },
+ { X86::VGETMANTSDZrrikz, X86::VGETMANTSDZrmikz, TB_NO_REVERSE },
+ { X86::VGETMANTSSZrrikz, X86::VGETMANTSSZrmikz, TB_NO_REVERSE },
+ { X86::VGF2P8AFFINEINVQBZ128rrikz, X86::VGF2P8AFFINEINVQBZ128rmikz, 0 },
+ { X86::VGF2P8AFFINEINVQBZ256rrikz, X86::VGF2P8AFFINEINVQBZ256rmikz, 0 },
+ { X86::VGF2P8AFFINEINVQBZrrikz, X86::VGF2P8AFFINEINVQBZrmikz, 0 },
+ { X86::VGF2P8AFFINEQBZ128rrikz, X86::VGF2P8AFFINEQBZ128rmikz, 0 },
+ { X86::VGF2P8AFFINEQBZ256rrikz, X86::VGF2P8AFFINEQBZ256rmikz, 0 },
+ { X86::VGF2P8AFFINEQBZrrikz, X86::VGF2P8AFFINEQBZrmikz, 0 },
+ { X86::VGF2P8MULBZ128rrkz, X86::VGF2P8MULBZ128rmkz, 0 },
+ { X86::VGF2P8MULBZ256rrkz, X86::VGF2P8MULBZ256rmkz, 0 },
+ { X86::VGF2P8MULBZrrkz, X86::VGF2P8MULBZrmkz, 0 },
+ { X86::VINSERTF32x4Z256rrkz, X86::VINSERTF32x4Z256rmkz, 0 },
+ { X86::VINSERTF32x4Zrrkz, X86::VINSERTF32x4Zrmkz, 0 },
+ { X86::VINSERTF32x8Zrrkz, X86::VINSERTF32x8Zrmkz, 0 },
+ { X86::VINSERTF64x2Z256rrkz, X86::VINSERTF64x2Z256rmkz, 0 },
+ { X86::VINSERTF64x2Zrrkz, X86::VINSERTF64x2Zrmkz, 0 },
+ { X86::VINSERTF64x4Zrrkz, X86::VINSERTF64x4Zrmkz, 0 },
+ { X86::VINSERTI32x4Z256rrkz, X86::VINSERTI32x4Z256rmkz, 0 },
+ { X86::VINSERTI32x4Zrrkz, X86::VINSERTI32x4Zrmkz, 0 },
+ { X86::VINSERTI32x8Zrrkz, X86::VINSERTI32x8Zrmkz, 0 },
+ { X86::VINSERTI64x2Z256rrkz, X86::VINSERTI64x2Z256rmkz, 0 },
+ { X86::VINSERTI64x2Zrrkz, X86::VINSERTI64x2Zrmkz, 0 },
+ { X86::VINSERTI64x4Zrrkz, X86::VINSERTI64x4Zrmkz, 0 },
+ { X86::VMAXCPDZ128rrkz, X86::VMAXCPDZ128rmkz, 0 },
+ { X86::VMAXCPDZ256rrkz, X86::VMAXCPDZ256rmkz, 0 },
+ { X86::VMAXCPDZrrkz, X86::VMAXCPDZrmkz, 0 },
+ { X86::VMAXCPSZ128rrkz, X86::VMAXCPSZ128rmkz, 0 },
+ { X86::VMAXCPSZ256rrkz, X86::VMAXCPSZ256rmkz, 0 },
+ { X86::VMAXCPSZrrkz, X86::VMAXCPSZrmkz, 0 },
+ { X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 },
+ { X86::VMAXPDZ256rrkz, X86::VMAXPDZ256rmkz, 0 },
+ { X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0 },
+ { X86::VMAXPSZ128rrkz, X86::VMAXPSZ128rmkz, 0 },
+ { X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0 },
+ { X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0 },
+ { X86::VMAXSDZrr_Intkz, X86::VMAXSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VMAXSSZrr_Intkz, X86::VMAXSSZrm_Intkz, TB_NO_REVERSE },
+ { X86::VMINCPDZ128rrkz, X86::VMINCPDZ128rmkz, 0 },
+ { X86::VMINCPDZ256rrkz, X86::VMINCPDZ256rmkz, 0 },
+ { X86::VMINCPDZrrkz, X86::VMINCPDZrmkz, 0 },
+ { X86::VMINCPSZ128rrkz, X86::VMINCPSZ128rmkz, 0 },
+ { X86::VMINCPSZ256rrkz, X86::VMINCPSZ256rmkz, 0 },
+ { X86::VMINCPSZrrkz, X86::VMINCPSZrmkz, 0 },
+ { X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0 },
+ { X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0 },
+ { X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 },
+ { X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0 },
+ { X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0 },
+ { X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 },
+ { X86::VMINSDZrr_Intkz, X86::VMINSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VMINSSZrr_Intkz, X86::VMINSSZrm_Intkz, TB_NO_REVERSE },
+ { X86::VMOVAPDZ128rrk, X86::VMOVAPDZ128rmk, TB_NO_REVERSE | TB_ALIGN_16 },
+ { X86::VMOVAPDZ256rrk, X86::VMOVAPDZ256rmk, TB_NO_REVERSE | TB_ALIGN_32 },
+ { X86::VMOVAPDZrrk, X86::VMOVAPDZrmk, TB_NO_REVERSE | TB_ALIGN_64 },
+ { X86::VMOVAPSZ128rrk, X86::VMOVAPSZ128rmk, TB_NO_REVERSE | TB_ALIGN_16 },
+ { X86::VMOVAPSZ256rrk, X86::VMOVAPSZ256rmk, TB_NO_REVERSE | TB_ALIGN_32 },
+ { X86::VMOVAPSZrrk, X86::VMOVAPSZrmk, TB_NO_REVERSE | TB_ALIGN_64 },
+ { X86::VMOVDDUPZ128rrk, X86::VMOVDDUPZ128rmk, TB_NO_REVERSE },
+ { X86::VMOVDDUPZ256rrk, X86::VMOVDDUPZ256rmk, 0 },
+ { X86::VMOVDDUPZrrk, X86::VMOVDDUPZrmk, 0 },
+ { X86::VMOVDQA32Z128rrk, X86::VMOVDQA32Z128rmk, TB_NO_REVERSE | TB_ALIGN_16 },
+ { X86::VMOVDQA32Z256rrk, X86::VMOVDQA32Z256rmk, TB_NO_REVERSE | TB_ALIGN_32 },
+ { X86::VMOVDQA32Zrrk, X86::VMOVDQA32Zrmk, TB_NO_REVERSE | TB_ALIGN_64 },
+ { X86::VMOVDQA64Z128rrk, X86::VMOVDQA64Z128rmk, TB_NO_REVERSE | TB_ALIGN_16 },
+ { X86::VMOVDQA64Z256rrk, X86::VMOVDQA64Z256rmk, TB_NO_REVERSE | TB_ALIGN_32 },
+ { X86::VMOVDQA64Zrrk, X86::VMOVDQA64Zrmk, TB_NO_REVERSE | TB_ALIGN_64 },
+ { X86::VMOVDQU16Z128rrk, X86::VMOVDQU16Z128rmk, TB_NO_REVERSE },
+ { X86::VMOVDQU16Z256rrk, X86::VMOVDQU16Z256rmk, TB_NO_REVERSE },
+ { X86::VMOVDQU16Zrrk, X86::VMOVDQU16Zrmk, TB_NO_REVERSE },
+ { X86::VMOVDQU32Z128rrk, X86::VMOVDQU32Z128rmk, TB_NO_REVERSE },
+ { X86::VMOVDQU32Z256rrk, X86::VMOVDQU32Z256rmk, TB_NO_REVERSE },
+ { X86::VMOVDQU32Zrrk, X86::VMOVDQU32Zrmk, TB_NO_REVERSE },
+ { X86::VMOVDQU64Z128rrk, X86::VMOVDQU64Z128rmk, TB_NO_REVERSE },
+ { X86::VMOVDQU64Z256rrk, X86::VMOVDQU64Z256rmk, TB_NO_REVERSE },
+ { X86::VMOVDQU64Zrrk, X86::VMOVDQU64Zrmk, TB_NO_REVERSE },
+ { X86::VMOVDQU8Z128rrk, X86::VMOVDQU8Z128rmk, TB_NO_REVERSE },
+ { X86::VMOVDQU8Z256rrk, X86::VMOVDQU8Z256rmk, TB_NO_REVERSE },
+ { X86::VMOVDQU8Zrrk, X86::VMOVDQU8Zrmk, TB_NO_REVERSE },
+ { X86::VMOVSHDUPZ128rrk, X86::VMOVSHDUPZ128rmk, 0 },
+ { X86::VMOVSHDUPZ256rrk, X86::VMOVSHDUPZ256rmk, 0 },
+ { X86::VMOVSHDUPZrrk, X86::VMOVSHDUPZrmk, 0 },
+ { X86::VMOVSLDUPZ128rrk, X86::VMOVSLDUPZ128rmk, 0 },
+ { X86::VMOVSLDUPZ256rrk, X86::VMOVSLDUPZ256rmk, 0 },
+ { X86::VMOVSLDUPZrrk, X86::VMOVSLDUPZrmk, 0 },
+ { X86::VMOVUPDZ128rrk, X86::VMOVUPDZ128rmk, TB_NO_REVERSE },
+ { X86::VMOVUPDZ256rrk, X86::VMOVUPDZ256rmk, TB_NO_REVERSE },
+ { X86::VMOVUPDZrrk, X86::VMOVUPDZrmk, TB_NO_REVERSE },
+ { X86::VMOVUPSZ128rrk, X86::VMOVUPSZ128rmk, TB_NO_REVERSE },
+ { X86::VMOVUPSZ256rrk, X86::VMOVUPSZ256rmk, TB_NO_REVERSE },
+ { X86::VMOVUPSZrrk, X86::VMOVUPSZrmk, TB_NO_REVERSE },
+ { X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0 },
+ { X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0 },
+ { X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 },
+ { X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 },
+ { X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 },
+ { X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 },
+ { X86::VMULSDZrr_Intkz, X86::VMULSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VMULSSZrr_Intkz, X86::VMULSSZrm_Intkz, TB_NO_REVERSE },
+ { X86::VORPDZ128rrkz, X86::VORPDZ128rmkz, 0 },
+ { X86::VORPDZ256rrkz, X86::VORPDZ256rmkz, 0 },
+ { X86::VORPDZrrkz, X86::VORPDZrmkz, 0 },
+ { X86::VORPSZ128rrkz, X86::VORPSZ128rmkz, 0 },
+ { X86::VORPSZ256rrkz, X86::VORPSZ256rmkz, 0 },
+ { X86::VORPSZrrkz, X86::VORPSZrmkz, 0 },
+ { X86::VPABSBZ128rrk, X86::VPABSBZ128rmk, 0 },
+ { X86::VPABSBZ256rrk, X86::VPABSBZ256rmk, 0 },
+ { X86::VPABSBZrrk, X86::VPABSBZrmk, 0 },
+ { X86::VPABSDZ128rrk, X86::VPABSDZ128rmk, 0 },
+ { X86::VPABSDZ256rrk, X86::VPABSDZ256rmk, 0 },
+ { X86::VPABSDZrrk, X86::VPABSDZrmk, 0 },
+ { X86::VPABSQZ128rrk, X86::VPABSQZ128rmk, 0 },
+ { X86::VPABSQZ256rrk, X86::VPABSQZ256rmk, 0 },
+ { X86::VPABSQZrrk, X86::VPABSQZrmk, 0 },
+ { X86::VPABSWZ128rrk, X86::VPABSWZ128rmk, 0 },
+ { X86::VPABSWZ256rrk, X86::VPABSWZ256rmk, 0 },
+ { X86::VPABSWZrrk, X86::VPABSWZrmk, 0 },
+ { X86::VPACKSSDWZ128rrkz, X86::VPACKSSDWZ128rmkz, 0 },
+ { X86::VPACKSSDWZ256rrkz, X86::VPACKSSDWZ256rmkz, 0 },
+ { X86::VPACKSSDWZrrkz, X86::VPACKSSDWZrmkz, 0 },
+ { X86::VPACKSSWBZ128rrkz, X86::VPACKSSWBZ128rmkz, 0 },
+ { X86::VPACKSSWBZ256rrkz, X86::VPACKSSWBZ256rmkz, 0 },
+ { X86::VPACKSSWBZrrkz, X86::VPACKSSWBZrmkz, 0 },
+ { X86::VPACKUSDWZ128rrkz, X86::VPACKUSDWZ128rmkz, 0 },
+ { X86::VPACKUSDWZ256rrkz, X86::VPACKUSDWZ256rmkz, 0 },
+ { X86::VPACKUSDWZrrkz, X86::VPACKUSDWZrmkz, 0 },
+ { X86::VPACKUSWBZ128rrkz, X86::VPACKUSWBZ128rmkz, 0 },
+ { X86::VPACKUSWBZ256rrkz, X86::VPACKUSWBZ256rmkz, 0 },
+ { X86::VPACKUSWBZrrkz, X86::VPACKUSWBZrmkz, 0 },
+ { X86::VPADDBZ128rrkz, X86::VPADDBZ128rmkz, 0 },
+ { X86::VPADDBZ256rrkz, X86::VPADDBZ256rmkz, 0 },
+ { X86::VPADDBZrrkz, X86::VPADDBZrmkz, 0 },
+ { X86::VPADDDZ128rrkz, X86::VPADDDZ128rmkz, 0 },
+ { X86::VPADDDZ256rrkz, X86::VPADDDZ256rmkz, 0 },
+ { X86::VPADDDZrrkz, X86::VPADDDZrmkz, 0 },
+ { X86::VPADDQZ128rrkz, X86::VPADDQZ128rmkz, 0 },
+ { X86::VPADDQZ256rrkz, X86::VPADDQZ256rmkz, 0 },
+ { X86::VPADDQZrrkz, X86::VPADDQZrmkz, 0 },
+ { X86::VPADDSBZ128rrkz, X86::VPADDSBZ128rmkz, 0 },
+ { X86::VPADDSBZ256rrkz, X86::VPADDSBZ256rmkz, 0 },
+ { X86::VPADDSBZrrkz, X86::VPADDSBZrmkz, 0 },
+ { X86::VPADDSWZ128rrkz, X86::VPADDSWZ128rmkz, 0 },
+ { X86::VPADDSWZ256rrkz, X86::VPADDSWZ256rmkz, 0 },
+ { X86::VPADDSWZrrkz, X86::VPADDSWZrmkz, 0 },
+ { X86::VPADDUSBZ128rrkz, X86::VPADDUSBZ128rmkz, 0 },
+ { X86::VPADDUSBZ256rrkz, X86::VPADDUSBZ256rmkz, 0 },
+ { X86::VPADDUSBZrrkz, X86::VPADDUSBZrmkz, 0 },
+ { X86::VPADDUSWZ128rrkz, X86::VPADDUSWZ128rmkz, 0 },
+ { X86::VPADDUSWZ256rrkz, X86::VPADDUSWZ256rmkz, 0 },
+ { X86::VPADDUSWZrrkz, X86::VPADDUSWZrmkz, 0 },
+ { X86::VPADDWZ128rrkz, X86::VPADDWZ128rmkz, 0 },
+ { X86::VPADDWZ256rrkz, X86::VPADDWZ256rmkz, 0 },
+ { X86::VPADDWZrrkz, X86::VPADDWZrmkz, 0 },
+ { X86::VPALIGNRZ128rrikz, X86::VPALIGNRZ128rmikz, 0 },
+ { X86::VPALIGNRZ256rrikz, X86::VPALIGNRZ256rmikz, 0 },
+ { X86::VPALIGNRZrrikz, X86::VPALIGNRZrmikz, 0 },
+ { X86::VPANDDZ128rrkz, X86::VPANDDZ128rmkz, 0 },
+ { X86::VPANDDZ256rrkz, X86::VPANDDZ256rmkz, 0 },
+ { X86::VPANDDZrrkz, X86::VPANDDZrmkz, 0 },
+ { X86::VPANDNDZ128rrkz, X86::VPANDNDZ128rmkz, 0 },
+ { X86::VPANDNDZ256rrkz, X86::VPANDNDZ256rmkz, 0 },
+ { X86::VPANDNDZrrkz, X86::VPANDNDZrmkz, 0 },
+ { X86::VPANDNQZ128rrkz, X86::VPANDNQZ128rmkz, 0 },
+ { X86::VPANDNQZ256rrkz, X86::VPANDNQZ256rmkz, 0 },
+ { X86::VPANDNQZrrkz, X86::VPANDNQZrmkz, 0 },
+ { X86::VPANDQZ128rrkz, X86::VPANDQZ128rmkz, 0 },
+ { X86::VPANDQZ256rrkz, X86::VPANDQZ256rmkz, 0 },
+ { X86::VPANDQZrrkz, X86::VPANDQZrmkz, 0 },
+ { X86::VPAVGBZ128rrkz, X86::VPAVGBZ128rmkz, 0 },
+ { X86::VPAVGBZ256rrkz, X86::VPAVGBZ256rmkz, 0 },
+ { X86::VPAVGBZrrkz, X86::VPAVGBZrmkz, 0 },
+ { X86::VPAVGWZ128rrkz, X86::VPAVGWZ128rmkz, 0 },
+ { X86::VPAVGWZ256rrkz, X86::VPAVGWZ256rmkz, 0 },
+ { X86::VPAVGWZrrkz, X86::VPAVGWZrmkz, 0 },
+ { X86::VPBLENDMBZ128rrk, X86::VPBLENDMBZ128rmk, 0 },
+ { X86::VPBLENDMBZ256rrk, X86::VPBLENDMBZ256rmk, 0 },
+ { X86::VPBLENDMBZrrk, X86::VPBLENDMBZrmk, 0 },
+ { X86::VPBLENDMDZ128rrk, X86::VPBLENDMDZ128rmk, 0 },
+ { X86::VPBLENDMDZ256rrk, X86::VPBLENDMDZ256rmk, 0 },
+ { X86::VPBLENDMDZrrk, X86::VPBLENDMDZrmk, 0 },
+ { X86::VPBLENDMQZ128rrk, X86::VPBLENDMQZ128rmk, 0 },
+ { X86::VPBLENDMQZ256rrk, X86::VPBLENDMQZ256rmk, 0 },
+ { X86::VPBLENDMQZrrk, X86::VPBLENDMQZrmk, 0 },
+ { X86::VPBLENDMWZ128rrk, X86::VPBLENDMWZ128rmk, 0 },
+ { X86::VPBLENDMWZ256rrk, X86::VPBLENDMWZ256rmk, 0 },
+ { X86::VPBLENDMWZrrk, X86::VPBLENDMWZrmk, 0 },
+ { X86::VPBROADCASTBZ128rk, X86::VPBROADCASTBZ128mk, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZ256rk, X86::VPBROADCASTBZ256mk, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZrk, X86::VPBROADCASTBZmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZ128rk, X86::VPBROADCASTDZ128mk, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZ256rk, X86::VPBROADCASTDZ256mk, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZrk, X86::VPBROADCASTDZmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZ128rk, X86::VPBROADCASTQZ128mk, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZ256rk, X86::VPBROADCASTQZ256mk, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZrk, X86::VPBROADCASTQZmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZ128rk, X86::VPBROADCASTWZ128mk, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZ256rk, X86::VPBROADCASTWZ256mk, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZrk, X86::VPBROADCASTWZmk, TB_NO_REVERSE },
+ { X86::VPCMOVYrrr, X86::VPCMOVYrrm, 0 },
+ { X86::VPCMOVrrr, X86::VPCMOVrrm, 0 },
+ { X86::VPCMPBZ128rrik, X86::VPCMPBZ128rmik, 0 },
+ { X86::VPCMPBZ256rrik, X86::VPCMPBZ256rmik, 0 },
+ { X86::VPCMPBZrrik, X86::VPCMPBZrmik, 0 },
+ { X86::VPCMPDZ128rrik, X86::VPCMPDZ128rmik, 0 },
+ { X86::VPCMPDZ256rrik, X86::VPCMPDZ256rmik, 0 },
+ { X86::VPCMPDZrrik, X86::VPCMPDZrmik, 0 },
+ { X86::VPCMPEQBZ128rrk, X86::VPCMPEQBZ128rmk, 0 },
+ { X86::VPCMPEQBZ256rrk, X86::VPCMPEQBZ256rmk, 0 },
+ { X86::VPCMPEQBZrrk, X86::VPCMPEQBZrmk, 0 },
+ { X86::VPCMPEQDZ128rrk, X86::VPCMPEQDZ128rmk, 0 },
+ { X86::VPCMPEQDZ256rrk, X86::VPCMPEQDZ256rmk, 0 },
+ { X86::VPCMPEQDZrrk, X86::VPCMPEQDZrmk, 0 },
+ { X86::VPCMPEQQZ128rrk, X86::VPCMPEQQZ128rmk, 0 },
+ { X86::VPCMPEQQZ256rrk, X86::VPCMPEQQZ256rmk, 0 },
+ { X86::VPCMPEQQZrrk, X86::VPCMPEQQZrmk, 0 },
+ { X86::VPCMPEQWZ128rrk, X86::VPCMPEQWZ128rmk, 0 },
+ { X86::VPCMPEQWZ256rrk, X86::VPCMPEQWZ256rmk, 0 },
+ { X86::VPCMPEQWZrrk, X86::VPCMPEQWZrmk, 0 },
+ { X86::VPCMPGTBZ128rrk, X86::VPCMPGTBZ128rmk, 0 },
+ { X86::VPCMPGTBZ256rrk, X86::VPCMPGTBZ256rmk, 0 },
+ { X86::VPCMPGTBZrrk, X86::VPCMPGTBZrmk, 0 },
+ { X86::VPCMPGTDZ128rrk, X86::VPCMPGTDZ128rmk, 0 },
+ { X86::VPCMPGTDZ256rrk, X86::VPCMPGTDZ256rmk, 0 },
+ { X86::VPCMPGTDZrrk, X86::VPCMPGTDZrmk, 0 },
+ { X86::VPCMPGTQZ128rrk, X86::VPCMPGTQZ128rmk, 0 },
+ { X86::VPCMPGTQZ256rrk, X86::VPCMPGTQZ256rmk, 0 },
+ { X86::VPCMPGTQZrrk, X86::VPCMPGTQZrmk, 0 },
+ { X86::VPCMPGTWZ128rrk, X86::VPCMPGTWZ128rmk, 0 },
+ { X86::VPCMPGTWZ256rrk, X86::VPCMPGTWZ256rmk, 0 },
+ { X86::VPCMPGTWZrrk, X86::VPCMPGTWZrmk, 0 },
+ { X86::VPCMPQZ128rrik, X86::VPCMPQZ128rmik, 0 },
+ { X86::VPCMPQZ256rrik, X86::VPCMPQZ256rmik, 0 },
+ { X86::VPCMPQZrrik, X86::VPCMPQZrmik, 0 },
+ { X86::VPCMPUBZ128rrik, X86::VPCMPUBZ128rmik, 0 },
+ { X86::VPCMPUBZ256rrik, X86::VPCMPUBZ256rmik, 0 },
+ { X86::VPCMPUBZrrik, X86::VPCMPUBZrmik, 0 },
+ { X86::VPCMPUDZ128rrik, X86::VPCMPUDZ128rmik, 0 },
+ { X86::VPCMPUDZ256rrik, X86::VPCMPUDZ256rmik, 0 },
+ { X86::VPCMPUDZrrik, X86::VPCMPUDZrmik, 0 },
+ { X86::VPCMPUQZ128rrik, X86::VPCMPUQZ128rmik, 0 },
+ { X86::VPCMPUQZ256rrik, X86::VPCMPUQZ256rmik, 0 },
+ { X86::VPCMPUQZrrik, X86::VPCMPUQZrmik, 0 },
+ { X86::VPCMPUWZ128rrik, X86::VPCMPUWZ128rmik, 0 },
+ { X86::VPCMPUWZ256rrik, X86::VPCMPUWZ256rmik, 0 },
+ { X86::VPCMPUWZrrik, X86::VPCMPUWZrmik, 0 },
+ { X86::VPCMPWZ128rrik, X86::VPCMPWZ128rmik, 0 },
+ { X86::VPCMPWZ256rrik, X86::VPCMPWZ256rmik, 0 },
+ { X86::VPCMPWZrrik, X86::VPCMPWZrmik, 0 },
+ { X86::VPCONFLICTDZ128rrk, X86::VPCONFLICTDZ128rmk, 0 },
+ { X86::VPCONFLICTDZ256rrk, X86::VPCONFLICTDZ256rmk, 0 },
+ { X86::VPCONFLICTDZrrk, X86::VPCONFLICTDZrmk, 0 },
+ { X86::VPCONFLICTQZ128rrk, X86::VPCONFLICTQZ128rmk, 0 },
+ { X86::VPCONFLICTQZ256rrk, X86::VPCONFLICTQZ256rmk, 0 },
+ { X86::VPCONFLICTQZrrk, X86::VPCONFLICTQZrmk, 0 },
+ { X86::VPDPBUSDSZ128r, X86::VPDPBUSDSZ128m, 0 },
+ { X86::VPDPBUSDSZ256r, X86::VPDPBUSDSZ256m, 0 },
+ { X86::VPDPBUSDSZr, X86::VPDPBUSDSZm, 0 },
+ { X86::VPDPBUSDZ128r, X86::VPDPBUSDZ128m, 0 },
+ { X86::VPDPBUSDZ256r, X86::VPDPBUSDZ256m, 0 },
+ { X86::VPDPBUSDZr, X86::VPDPBUSDZm, 0 },
+ { X86::VPDPWSSDSZ128r, X86::VPDPWSSDSZ128m, 0 },
+ { X86::VPDPWSSDSZ256r, X86::VPDPWSSDSZ256m, 0 },
+ { X86::VPDPWSSDSZr, X86::VPDPWSSDSZm, 0 },
+ { X86::VPDPWSSDZ128r, X86::VPDPWSSDZ128m, 0 },
+ { X86::VPDPWSSDZ256r, X86::VPDPWSSDZ256m, 0 },
+ { X86::VPDPWSSDZr, X86::VPDPWSSDZm, 0 },
+ { X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0 },
+ { X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0 },
+ { X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0 },
+ { X86::VPERMDZ256rrkz, X86::VPERMDZ256rmkz, 0 },
+ { X86::VPERMDZrrkz, X86::VPERMDZrmkz, 0 },
+ { X86::VPERMI2B128rr, X86::VPERMI2B128rm, 0 },
+ { X86::VPERMI2B256rr, X86::VPERMI2B256rm, 0 },
+ { X86::VPERMI2Brr, X86::VPERMI2Brm, 0 },
+ { X86::VPERMI2D128rr, X86::VPERMI2D128rm, 0 },
+ { X86::VPERMI2D256rr, X86::VPERMI2D256rm, 0 },
+ { X86::VPERMI2Drr, X86::VPERMI2Drm, 0 },
+ { X86::VPERMI2PD128rr, X86::VPERMI2PD128rm, 0 },
+ { X86::VPERMI2PD256rr, X86::VPERMI2PD256rm, 0 },
+ { X86::VPERMI2PDrr, X86::VPERMI2PDrm, 0 },
+ { X86::VPERMI2PS128rr, X86::VPERMI2PS128rm, 0 },
+ { X86::VPERMI2PS256rr, X86::VPERMI2PS256rm, 0 },
+ { X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 },
+ { X86::VPERMI2Q128rr, X86::VPERMI2Q128rm, 0 },
+ { X86::VPERMI2Q256rr, X86::VPERMI2Q256rm, 0 },
+ { X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 },
+ { X86::VPERMI2W128rr, X86::VPERMI2W128rm, 0 },
+ { X86::VPERMI2W256rr, X86::VPERMI2W256rm, 0 },
+ { X86::VPERMI2Wrr, X86::VPERMI2Wrm, 0 },
+ { X86::VPERMIL2PDYrr, X86::VPERMIL2PDYrm, 0 },
+ { X86::VPERMIL2PDrr, X86::VPERMIL2PDrm, 0 },
+ { X86::VPERMIL2PSYrr, X86::VPERMIL2PSYrm, 0 },
+ { X86::VPERMIL2PSrr, X86::VPERMIL2PSrm, 0 },
+ { X86::VPERMILPDZ128rik, X86::VPERMILPDZ128mik, 0 },
+ { X86::VPERMILPDZ128rrkz, X86::VPERMILPDZ128rmkz, 0 },
+ { X86::VPERMILPDZ256rik, X86::VPERMILPDZ256mik, 0 },
+ { X86::VPERMILPDZ256rrkz, X86::VPERMILPDZ256rmkz, 0 },
+ { X86::VPERMILPDZrik, X86::VPERMILPDZmik, 0 },
+ { X86::VPERMILPDZrrkz, X86::VPERMILPDZrmkz, 0 },
+ { X86::VPERMILPSZ128rik, X86::VPERMILPSZ128mik, 0 },
+ { X86::VPERMILPSZ128rrkz, X86::VPERMILPSZ128rmkz, 0 },
+ { X86::VPERMILPSZ256rik, X86::VPERMILPSZ256mik, 0 },
+ { X86::VPERMILPSZ256rrkz, X86::VPERMILPSZ256rmkz, 0 },
+ { X86::VPERMILPSZrik, X86::VPERMILPSZmik, 0 },
+ { X86::VPERMILPSZrrkz, X86::VPERMILPSZrmkz, 0 },
+ { X86::VPERMPDZ256rik, X86::VPERMPDZ256mik, 0 },
+ { X86::VPERMPDZ256rrkz, X86::VPERMPDZ256rmkz, 0 },
+ { X86::VPERMPDZrik, X86::VPERMPDZmik, 0 },
+ { X86::VPERMPDZrrkz, X86::VPERMPDZrmkz, 0 },
+ { X86::VPERMPSZ256rrkz, X86::VPERMPSZ256rmkz, 0 },
+ { X86::VPERMPSZrrkz, X86::VPERMPSZrmkz, 0 },
+ { X86::VPERMQZ256rik, X86::VPERMQZ256mik, 0 },
+ { X86::VPERMQZ256rrkz, X86::VPERMQZ256rmkz, 0 },
+ { X86::VPERMQZrik, X86::VPERMQZmik, 0 },
+ { X86::VPERMQZrrkz, X86::VPERMQZrmkz, 0 },
+ { X86::VPERMT2B128rr, X86::VPERMT2B128rm, 0 },
+ { X86::VPERMT2B256rr, X86::VPERMT2B256rm, 0 },
+ { X86::VPERMT2Brr, X86::VPERMT2Brm, 0 },
+ { X86::VPERMT2D128rr, X86::VPERMT2D128rm, 0 },
+ { X86::VPERMT2D256rr, X86::VPERMT2D256rm, 0 },
+ { X86::VPERMT2Drr, X86::VPERMT2Drm, 0 },
+ { X86::VPERMT2PD128rr, X86::VPERMT2PD128rm, 0 },
+ { X86::VPERMT2PD256rr, X86::VPERMT2PD256rm, 0 },
+ { X86::VPERMT2PDrr, X86::VPERMT2PDrm, 0 },
+ { X86::VPERMT2PS128rr, X86::VPERMT2PS128rm, 0 },
+ { X86::VPERMT2PS256rr, X86::VPERMT2PS256rm, 0 },
+ { X86::VPERMT2PSrr, X86::VPERMT2PSrm, 0 },
+ { X86::VPERMT2Q128rr, X86::VPERMT2Q128rm, 0 },
+ { X86::VPERMT2Q256rr, X86::VPERMT2Q256rm, 0 },
+ { X86::VPERMT2Qrr, X86::VPERMT2Qrm, 0 },
+ { X86::VPERMT2W128rr, X86::VPERMT2W128rm, 0 },
+ { X86::VPERMT2W256rr, X86::VPERMT2W256rm, 0 },
+ { X86::VPERMT2Wrr, X86::VPERMT2Wrm, 0 },
+ { X86::VPERMWZ128rrkz, X86::VPERMWZ128rmkz, 0 },
+ { X86::VPERMWZ256rrkz, X86::VPERMWZ256rmkz, 0 },
+ { X86::VPERMWZrrkz, X86::VPERMWZrmkz, 0 },
+ { X86::VPEXPANDBZ128rrk, X86::VPEXPANDBZ128rmk, TB_NO_REVERSE },
+ { X86::VPEXPANDBZ256rrk, X86::VPEXPANDBZ256rmk, TB_NO_REVERSE },
+ { X86::VPEXPANDBZrrk, X86::VPEXPANDBZrmk, TB_NO_REVERSE },
+ { X86::VPEXPANDDZ128rrk, X86::VPEXPANDDZ128rmk, TB_NO_REVERSE },
+ { X86::VPEXPANDDZ256rrk, X86::VPEXPANDDZ256rmk, TB_NO_REVERSE },
+ { X86::VPEXPANDDZrrk, X86::VPEXPANDDZrmk, TB_NO_REVERSE },
+ { X86::VPEXPANDQZ128rrk, X86::VPEXPANDQZ128rmk, TB_NO_REVERSE },
+ { X86::VPEXPANDQZ256rrk, X86::VPEXPANDQZ256rmk, TB_NO_REVERSE },
+ { X86::VPEXPANDQZrrk, X86::VPEXPANDQZrmk, TB_NO_REVERSE },
+ { X86::VPEXPANDWZ128rrk, X86::VPEXPANDWZ128rmk, TB_NO_REVERSE },
+ { X86::VPEXPANDWZ256rrk, X86::VPEXPANDWZ256rmk, TB_NO_REVERSE },
+ { X86::VPEXPANDWZrrk, X86::VPEXPANDWZrmk, TB_NO_REVERSE },
+ { X86::VPLZCNTDZ128rrk, X86::VPLZCNTDZ128rmk, 0 },
+ { X86::VPLZCNTDZ256rrk, X86::VPLZCNTDZ256rmk, 0 },
+ { X86::VPLZCNTDZrrk, X86::VPLZCNTDZrmk, 0 },
+ { X86::VPLZCNTQZ128rrk, X86::VPLZCNTQZ128rmk, 0 },
+ { X86::VPLZCNTQZ256rrk, X86::VPLZCNTQZ256rmk, 0 },
+ { X86::VPLZCNTQZrrk, X86::VPLZCNTQZrmk, 0 },
+ { X86::VPMADD52HUQZ128r, X86::VPMADD52HUQZ128m, 0 },
+ { X86::VPMADD52HUQZ256r, X86::VPMADD52HUQZ256m, 0 },
+ { X86::VPMADD52HUQZr, X86::VPMADD52HUQZm, 0 },
+ { X86::VPMADD52LUQZ128r, X86::VPMADD52LUQZ128m, 0 },
+ { X86::VPMADD52LUQZ256r, X86::VPMADD52LUQZ256m, 0 },
+ { X86::VPMADD52LUQZr, X86::VPMADD52LUQZm, 0 },
+ { X86::VPMADDUBSWZ128rrkz, X86::VPMADDUBSWZ128rmkz, 0 },
+ { X86::VPMADDUBSWZ256rrkz, X86::VPMADDUBSWZ256rmkz, 0 },
+ { X86::VPMADDUBSWZrrkz, X86::VPMADDUBSWZrmkz, 0 },
+ { X86::VPMADDWDZ128rrkz, X86::VPMADDWDZ128rmkz, 0 },
+ { X86::VPMADDWDZ256rrkz, X86::VPMADDWDZ256rmkz, 0 },
+ { X86::VPMADDWDZrrkz, X86::VPMADDWDZrmkz, 0 },
+ { X86::VPMAXSBZ128rrkz, X86::VPMAXSBZ128rmkz, 0 },
+ { X86::VPMAXSBZ256rrkz, X86::VPMAXSBZ256rmkz, 0 },
+ { X86::VPMAXSBZrrkz, X86::VPMAXSBZrmkz, 0 },
+ { X86::VPMAXSDZ128rrkz, X86::VPMAXSDZ128rmkz, 0 },
+ { X86::VPMAXSDZ256rrkz, X86::VPMAXSDZ256rmkz, 0 },
+ { X86::VPMAXSDZrrkz, X86::VPMAXSDZrmkz, 0 },
+ { X86::VPMAXSQZ128rrkz, X86::VPMAXSQZ128rmkz, 0 },
+ { X86::VPMAXSQZ256rrkz, X86::VPMAXSQZ256rmkz, 0 },
+ { X86::VPMAXSQZrrkz, X86::VPMAXSQZrmkz, 0 },
+ { X86::VPMAXSWZ128rrkz, X86::VPMAXSWZ128rmkz, 0 },
+ { X86::VPMAXSWZ256rrkz, X86::VPMAXSWZ256rmkz, 0 },
+ { X86::VPMAXSWZrrkz, X86::VPMAXSWZrmkz, 0 },
+ { X86::VPMAXUBZ128rrkz, X86::VPMAXUBZ128rmkz, 0 },
+ { X86::VPMAXUBZ256rrkz, X86::VPMAXUBZ256rmkz, 0 },
+ { X86::VPMAXUBZrrkz, X86::VPMAXUBZrmkz, 0 },
+ { X86::VPMAXUDZ128rrkz, X86::VPMAXUDZ128rmkz, 0 },
+ { X86::VPMAXUDZ256rrkz, X86::VPMAXUDZ256rmkz, 0 },
+ { X86::VPMAXUDZrrkz, X86::VPMAXUDZrmkz, 0 },
+ { X86::VPMAXUQZ128rrkz, X86::VPMAXUQZ128rmkz, 0 },
+ { X86::VPMAXUQZ256rrkz, X86::VPMAXUQZ256rmkz, 0 },
+ { X86::VPMAXUQZrrkz, X86::VPMAXUQZrmkz, 0 },
+ { X86::VPMAXUWZ128rrkz, X86::VPMAXUWZ128rmkz, 0 },
+ { X86::VPMAXUWZ256rrkz, X86::VPMAXUWZ256rmkz, 0 },
+ { X86::VPMAXUWZrrkz, X86::VPMAXUWZrmkz, 0 },
+ { X86::VPMINSBZ128rrkz, X86::VPMINSBZ128rmkz, 0 },
+ { X86::VPMINSBZ256rrkz, X86::VPMINSBZ256rmkz, 0 },
+ { X86::VPMINSBZrrkz, X86::VPMINSBZrmkz, 0 },
+ { X86::VPMINSDZ128rrkz, X86::VPMINSDZ128rmkz, 0 },
+ { X86::VPMINSDZ256rrkz, X86::VPMINSDZ256rmkz, 0 },
+ { X86::VPMINSDZrrkz, X86::VPMINSDZrmkz, 0 },
+ { X86::VPMINSQZ128rrkz, X86::VPMINSQZ128rmkz, 0 },
+ { X86::VPMINSQZ256rrkz, X86::VPMINSQZ256rmkz, 0 },
+ { X86::VPMINSQZrrkz, X86::VPMINSQZrmkz, 0 },
+ { X86::VPMINSWZ128rrkz, X86::VPMINSWZ128rmkz, 0 },
+ { X86::VPMINSWZ256rrkz, X86::VPMINSWZ256rmkz, 0 },
+ { X86::VPMINSWZrrkz, X86::VPMINSWZrmkz, 0 },
+ { X86::VPMINUBZ128rrkz, X86::VPMINUBZ128rmkz, 0 },
+ { X86::VPMINUBZ256rrkz, X86::VPMINUBZ256rmkz, 0 },
+ { X86::VPMINUBZrrkz, X86::VPMINUBZrmkz, 0 },
+ { X86::VPMINUDZ128rrkz, X86::VPMINUDZ128rmkz, 0 },
+ { X86::VPMINUDZ256rrkz, X86::VPMINUDZ256rmkz, 0 },
+ { X86::VPMINUDZrrkz, X86::VPMINUDZrmkz, 0 },
+ { X86::VPMINUQZ128rrkz, X86::VPMINUQZ128rmkz, 0 },
+ { X86::VPMINUQZ256rrkz, X86::VPMINUQZ256rmkz, 0 },
+ { X86::VPMINUQZrrkz, X86::VPMINUQZrmkz, 0 },
+ { X86::VPMINUWZ128rrkz, X86::VPMINUWZ128rmkz, 0 },
+ { X86::VPMINUWZ256rrkz, X86::VPMINUWZ256rmkz, 0 },
+ { X86::VPMINUWZrrkz, X86::VPMINUWZrmkz, 0 },
+ { X86::VPMOVSXBDZ128rrk, X86::VPMOVSXBDZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBDZ256rrk, X86::VPMOVSXBDZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBDZrrk, X86::VPMOVSXBDZrmk, 0 },
+ { X86::VPMOVSXBQZ128rrk, X86::VPMOVSXBQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZ256rrk, X86::VPMOVSXBQZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZrrk, X86::VPMOVSXBQZrmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ128rrk, X86::VPMOVSXBWZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ256rrk, X86::VPMOVSXBWZ256rmk, 0 },
+ { X86::VPMOVSXBWZrrk, X86::VPMOVSXBWZrmk, 0 },
+ { X86::VPMOVSXDQZ128rrk, X86::VPMOVSXDQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXDQZ256rrk, X86::VPMOVSXDQZ256rmk, 0 },
+ { X86::VPMOVSXDQZrrk, X86::VPMOVSXDQZrmk, 0 },
+ { X86::VPMOVSXWDZ128rrk, X86::VPMOVSXWDZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXWDZ256rrk, X86::VPMOVSXWDZ256rmk, 0 },
+ { X86::VPMOVSXWDZrrk, X86::VPMOVSXWDZrmk, 0 },
+ { X86::VPMOVSXWQZ128rrk, X86::VPMOVSXWQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXWQZ256rrk, X86::VPMOVSXWQZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXWQZrrk, X86::VPMOVSXWQZrmk, 0 },
+ { X86::VPMOVZXBDZ128rrk, X86::VPMOVZXBDZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZ256rrk, X86::VPMOVZXBDZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZrrk, X86::VPMOVZXBDZrmk, 0 },
+ { X86::VPMOVZXBQZ128rrk, X86::VPMOVZXBQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZ256rrk, X86::VPMOVZXBQZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZrrk, X86::VPMOVZXBQZrmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ128rrk, X86::VPMOVZXBWZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ256rrk, X86::VPMOVZXBWZ256rmk, 0 },
+ { X86::VPMOVZXBWZrrk, X86::VPMOVZXBWZrmk, 0 },
+ { X86::VPMOVZXDQZ128rrk, X86::VPMOVZXDQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXDQZ256rrk, X86::VPMOVZXDQZ256rmk, 0 },
+ { X86::VPMOVZXDQZrrk, X86::VPMOVZXDQZrmk, 0 },
+ { X86::VPMOVZXWDZ128rrk, X86::VPMOVZXWDZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXWDZ256rrk, X86::VPMOVZXWDZ256rmk, 0 },
+ { X86::VPMOVZXWDZrrk, X86::VPMOVZXWDZrmk, 0 },
+ { X86::VPMOVZXWQZ128rrk, X86::VPMOVZXWQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXWQZ256rrk, X86::VPMOVZXWQZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXWQZrrk, X86::VPMOVZXWQZrmk, 0 },
+ { X86::VPMULDQZ128rrkz, X86::VPMULDQZ128rmkz, 0 },
+ { X86::VPMULDQZ256rrkz, X86::VPMULDQZ256rmkz, 0 },
+ { X86::VPMULDQZrrkz, X86::VPMULDQZrmkz, 0 },
+ { X86::VPMULHRSWZ128rrkz, X86::VPMULHRSWZ128rmkz, 0 },
+ { X86::VPMULHRSWZ256rrkz, X86::VPMULHRSWZ256rmkz, 0 },
+ { X86::VPMULHRSWZrrkz, X86::VPMULHRSWZrmkz, 0 },
+ { X86::VPMULHUWZ128rrkz, X86::VPMULHUWZ128rmkz, 0 },
+ { X86::VPMULHUWZ256rrkz, X86::VPMULHUWZ256rmkz, 0 },
+ { X86::VPMULHUWZrrkz, X86::VPMULHUWZrmkz, 0 },
+ { X86::VPMULHWZ128rrkz, X86::VPMULHWZ128rmkz, 0 },
+ { X86::VPMULHWZ256rrkz, X86::VPMULHWZ256rmkz, 0 },
+ { X86::VPMULHWZrrkz, X86::VPMULHWZrmkz, 0 },
+ { X86::VPMULLDZ128rrkz, X86::VPMULLDZ128rmkz, 0 },
+ { X86::VPMULLDZ256rrkz, X86::VPMULLDZ256rmkz, 0 },
+ { X86::VPMULLDZrrkz, X86::VPMULLDZrmkz, 0 },
+ { X86::VPMULLQZ128rrkz, X86::VPMULLQZ128rmkz, 0 },
+ { X86::VPMULLQZ256rrkz, X86::VPMULLQZ256rmkz, 0 },
+ { X86::VPMULLQZrrkz, X86::VPMULLQZrmkz, 0 },
+ { X86::VPMULLWZ128rrkz, X86::VPMULLWZ128rmkz, 0 },
+ { X86::VPMULLWZ256rrkz, X86::VPMULLWZ256rmkz, 0 },
+ { X86::VPMULLWZrrkz, X86::VPMULLWZrmkz, 0 },
+ { X86::VPMULTISHIFTQBZ128rrkz, X86::VPMULTISHIFTQBZ128rmkz, 0 },
+ { X86::VPMULTISHIFTQBZ256rrkz, X86::VPMULTISHIFTQBZ256rmkz, 0 },
+ { X86::VPMULTISHIFTQBZrrkz, X86::VPMULTISHIFTQBZrmkz, 0 },
+ { X86::VPMULUDQZ128rrkz, X86::VPMULUDQZ128rmkz, 0 },
+ { X86::VPMULUDQZ256rrkz, X86::VPMULUDQZ256rmkz, 0 },
+ { X86::VPMULUDQZrrkz, X86::VPMULUDQZrmkz, 0 },
+ { X86::VPOPCNTBZ128rrk, X86::VPOPCNTBZ128rmk, 0 },
+ { X86::VPOPCNTBZ256rrk, X86::VPOPCNTBZ256rmk, 0 },
+ { X86::VPOPCNTBZrrk, X86::VPOPCNTBZrmk, 0 },
+ { X86::VPOPCNTDZ128rrk, X86::VPOPCNTDZ128rmk, 0 },
+ { X86::VPOPCNTDZ256rrk, X86::VPOPCNTDZ256rmk, 0 },
+ { X86::VPOPCNTDZrrk, X86::VPOPCNTDZrmk, 0 },
+ { X86::VPOPCNTQZ128rrk, X86::VPOPCNTQZ128rmk, 0 },
+ { X86::VPOPCNTQZ256rrk, X86::VPOPCNTQZ256rmk, 0 },
+ { X86::VPOPCNTQZrrk, X86::VPOPCNTQZrmk, 0 },
+ { X86::VPOPCNTWZ128rrk, X86::VPOPCNTWZ128rmk, 0 },
+ { X86::VPOPCNTWZ256rrk, X86::VPOPCNTWZ256rmk, 0 },
+ { X86::VPOPCNTWZrrk, X86::VPOPCNTWZrmk, 0 },
+ { X86::VPORDZ128rrkz, X86::VPORDZ128rmkz, 0 },
+ { X86::VPORDZ256rrkz, X86::VPORDZ256rmkz, 0 },
+ { X86::VPORDZrrkz, X86::VPORDZrmkz, 0 },
+ { X86::VPORQZ128rrkz, X86::VPORQZ128rmkz, 0 },
+ { X86::VPORQZ256rrkz, X86::VPORQZ256rmkz, 0 },
+ { X86::VPORQZrrkz, X86::VPORQZrmkz, 0 },
+ { X86::VPPERMrrr, X86::VPPERMrrm, 0 },
+ { X86::VPROLDZ128rik, X86::VPROLDZ128mik, 0 },
+ { X86::VPROLDZ256rik, X86::VPROLDZ256mik, 0 },
+ { X86::VPROLDZrik, X86::VPROLDZmik, 0 },
+ { X86::VPROLQZ128rik, X86::VPROLQZ128mik, 0 },
+ { X86::VPROLQZ256rik, X86::VPROLQZ256mik, 0 },
+ { X86::VPROLQZrik, X86::VPROLQZmik, 0 },
+ { X86::VPROLVDZ128rrkz, X86::VPROLVDZ128rmkz, 0 },
+ { X86::VPROLVDZ256rrkz, X86::VPROLVDZ256rmkz, 0 },
+ { X86::VPROLVDZrrkz, X86::VPROLVDZrmkz, 0 },
+ { X86::VPROLVQZ128rrkz, X86::VPROLVQZ128rmkz, 0 },
+ { X86::VPROLVQZ256rrkz, X86::VPROLVQZ256rmkz, 0 },
+ { X86::VPROLVQZrrkz, X86::VPROLVQZrmkz, 0 },
+ { X86::VPRORDZ128rik, X86::VPRORDZ128mik, 0 },
+ { X86::VPRORDZ256rik, X86::VPRORDZ256mik, 0 },
+ { X86::VPRORDZrik, X86::VPRORDZmik, 0 },
+ { X86::VPRORQZ128rik, X86::VPRORQZ128mik, 0 },
+ { X86::VPRORQZ256rik, X86::VPRORQZ256mik, 0 },
+ { X86::VPRORQZrik, X86::VPRORQZmik, 0 },
+ { X86::VPRORVDZ128rrkz, X86::VPRORVDZ128rmkz, 0 },
+ { X86::VPRORVDZ256rrkz, X86::VPRORVDZ256rmkz, 0 },
+ { X86::VPRORVDZrrkz, X86::VPRORVDZrmkz, 0 },
+ { X86::VPRORVQZ128rrkz, X86::VPRORVQZ128rmkz, 0 },
+ { X86::VPRORVQZ256rrkz, X86::VPRORVQZ256rmkz, 0 },
+ { X86::VPRORVQZrrkz, X86::VPRORVQZrmkz, 0 },
+ { X86::VPSHLDDZ128rrikz, X86::VPSHLDDZ128rmikz, 0 },
+ { X86::VPSHLDDZ256rrikz, X86::VPSHLDDZ256rmikz, 0 },
+ { X86::VPSHLDDZrrikz, X86::VPSHLDDZrmikz, 0 },
+ { X86::VPSHLDQZ128rrikz, X86::VPSHLDQZ128rmikz, 0 },
+ { X86::VPSHLDQZ256rrikz, X86::VPSHLDQZ256rmikz, 0 },
+ { X86::VPSHLDQZrrikz, X86::VPSHLDQZrmikz, 0 },
+ { X86::VPSHLDVDZ128r, X86::VPSHLDVDZ128m, 0 },
+ { X86::VPSHLDVDZ256r, X86::VPSHLDVDZ256m, 0 },
+ { X86::VPSHLDVDZr, X86::VPSHLDVDZm, 0 },
+ { X86::VPSHLDVQZ128r, X86::VPSHLDVQZ128m, 0 },
+ { X86::VPSHLDVQZ256r, X86::VPSHLDVQZ256m, 0 },
+ { X86::VPSHLDVQZr, X86::VPSHLDVQZm, 0 },
+ { X86::VPSHLDVWZ128r, X86::VPSHLDVWZ128m, 0 },
+ { X86::VPSHLDVWZ256r, X86::VPSHLDVWZ256m, 0 },
+ { X86::VPSHLDVWZr, X86::VPSHLDVWZm, 0 },
+ { X86::VPSHLDWZ128rrikz, X86::VPSHLDWZ128rmikz, 0 },
+ { X86::VPSHLDWZ256rrikz, X86::VPSHLDWZ256rmikz, 0 },
+ { X86::VPSHLDWZrrikz, X86::VPSHLDWZrmikz, 0 },
+ { X86::VPSHRDDZ128rrikz, X86::VPSHRDDZ128rmikz, 0 },
+ { X86::VPSHRDDZ256rrikz, X86::VPSHRDDZ256rmikz, 0 },
+ { X86::VPSHRDDZrrikz, X86::VPSHRDDZrmikz, 0 },
+ { X86::VPSHRDQZ128rrikz, X86::VPSHRDQZ128rmikz, 0 },
+ { X86::VPSHRDQZ256rrikz, X86::VPSHRDQZ256rmikz, 0 },
+ { X86::VPSHRDQZrrikz, X86::VPSHRDQZrmikz, 0 },
+ { X86::VPSHRDVDZ128r, X86::VPSHRDVDZ128m, 0 },
+ { X86::VPSHRDVDZ256r, X86::VPSHRDVDZ256m, 0 },
+ { X86::VPSHRDVDZr, X86::VPSHRDVDZm, 0 },
+ { X86::VPSHRDVQZ128r, X86::VPSHRDVQZ128m, 0 },
+ { X86::VPSHRDVQZ256r, X86::VPSHRDVQZ256m, 0 },
+ { X86::VPSHRDVQZr, X86::VPSHRDVQZm, 0 },
+ { X86::VPSHRDVWZ128r, X86::VPSHRDVWZ128m, 0 },
+ { X86::VPSHRDVWZ256r, X86::VPSHRDVWZ256m, 0 },
+ { X86::VPSHRDVWZr, X86::VPSHRDVWZm, 0 },
+ { X86::VPSHRDWZ128rrikz, X86::VPSHRDWZ128rmikz, 0 },
+ { X86::VPSHRDWZ256rrikz, X86::VPSHRDWZ256rmikz, 0 },
+ { X86::VPSHRDWZrrikz, X86::VPSHRDWZrmikz, 0 },
+ { X86::VPSHUFBITQMBZ128rrk, X86::VPSHUFBITQMBZ128rmk, 0 },
+ { X86::VPSHUFBITQMBZ256rrk, X86::VPSHUFBITQMBZ256rmk, 0 },
+ { X86::VPSHUFBITQMBZrrk, X86::VPSHUFBITQMBZrmk, 0 },
+ { X86::VPSHUFBZ128rrkz, X86::VPSHUFBZ128rmkz, 0 },
+ { X86::VPSHUFBZ256rrkz, X86::VPSHUFBZ256rmkz, 0 },
+ { X86::VPSHUFBZrrkz, X86::VPSHUFBZrmkz, 0 },
+ { X86::VPSHUFDZ128rik, X86::VPSHUFDZ128mik, 0 },
+ { X86::VPSHUFDZ256rik, X86::VPSHUFDZ256mik, 0 },
+ { X86::VPSHUFDZrik, X86::VPSHUFDZmik, 0 },
+ { X86::VPSHUFHWZ128rik, X86::VPSHUFHWZ128mik, 0 },
+ { X86::VPSHUFHWZ256rik, X86::VPSHUFHWZ256mik, 0 },
+ { X86::VPSHUFHWZrik, X86::VPSHUFHWZmik, 0 },
+ { X86::VPSHUFLWZ128rik, X86::VPSHUFLWZ128mik, 0 },
+ { X86::VPSHUFLWZ256rik, X86::VPSHUFLWZ256mik, 0 },
+ { X86::VPSHUFLWZrik, X86::VPSHUFLWZmik, 0 },
+ { X86::VPSLLDZ128rik, X86::VPSLLDZ128mik, 0 },
+ { X86::VPSLLDZ128rrkz, X86::VPSLLDZ128rmkz, 0 },
+ { X86::VPSLLDZ256rik, X86::VPSLLDZ256mik, 0 },
+ { X86::VPSLLDZ256rrkz, X86::VPSLLDZ256rmkz, 0 },
+ { X86::VPSLLDZrik, X86::VPSLLDZmik, 0 },
+ { X86::VPSLLDZrrkz, X86::VPSLLDZrmkz, 0 },
+ { X86::VPSLLQZ128rik, X86::VPSLLQZ128mik, 0 },
+ { X86::VPSLLQZ128rrkz, X86::VPSLLQZ128rmkz, 0 },
+ { X86::VPSLLQZ256rik, X86::VPSLLQZ256mik, 0 },
+ { X86::VPSLLQZ256rrkz, X86::VPSLLQZ256rmkz, 0 },
+ { X86::VPSLLQZrik, X86::VPSLLQZmik, 0 },
+ { X86::VPSLLQZrrkz, X86::VPSLLQZrmkz, 0 },
+ { X86::VPSLLVDZ128rrkz, X86::VPSLLVDZ128rmkz, 0 },
+ { X86::VPSLLVDZ256rrkz, X86::VPSLLVDZ256rmkz, 0 },
+ { X86::VPSLLVDZrrkz, X86::VPSLLVDZrmkz, 0 },
+ { X86::VPSLLVQZ128rrkz, X86::VPSLLVQZ128rmkz, 0 },
+ { X86::VPSLLVQZ256rrkz, X86::VPSLLVQZ256rmkz, 0 },
+ { X86::VPSLLVQZrrkz, X86::VPSLLVQZrmkz, 0 },
+ { X86::VPSLLVWZ128rrkz, X86::VPSLLVWZ128rmkz, 0 },
+ { X86::VPSLLVWZ256rrkz, X86::VPSLLVWZ256rmkz, 0 },
+ { X86::VPSLLVWZrrkz, X86::VPSLLVWZrmkz, 0 },
+ { X86::VPSLLWZ128rik, X86::VPSLLWZ128mik, 0 },
+ { X86::VPSLLWZ128rrkz, X86::VPSLLWZ128rmkz, 0 },
+ { X86::VPSLLWZ256rik, X86::VPSLLWZ256mik, 0 },
+ { X86::VPSLLWZ256rrkz, X86::VPSLLWZ256rmkz, 0 },
+ { X86::VPSLLWZrik, X86::VPSLLWZmik, 0 },
+ { X86::VPSLLWZrrkz, X86::VPSLLWZrmkz, 0 },
+ { X86::VPSRADZ128rik, X86::VPSRADZ128mik, 0 },
+ { X86::VPSRADZ128rrkz, X86::VPSRADZ128rmkz, 0 },
+ { X86::VPSRADZ256rik, X86::VPSRADZ256mik, 0 },
+ { X86::VPSRADZ256rrkz, X86::VPSRADZ256rmkz, 0 },
+ { X86::VPSRADZrik, X86::VPSRADZmik, 0 },
+ { X86::VPSRADZrrkz, X86::VPSRADZrmkz, 0 },
+ { X86::VPSRAQZ128rik, X86::VPSRAQZ128mik, 0 },
+ { X86::VPSRAQZ128rrkz, X86::VPSRAQZ128rmkz, 0 },
+ { X86::VPSRAQZ256rik, X86::VPSRAQZ256mik, 0 },
+ { X86::VPSRAQZ256rrkz, X86::VPSRAQZ256rmkz, 0 },
+ { X86::VPSRAQZrik, X86::VPSRAQZmik, 0 },
+ { X86::VPSRAQZrrkz, X86::VPSRAQZrmkz, 0 },
+ { X86::VPSRAVDZ128rrkz, X86::VPSRAVDZ128rmkz, 0 },
+ { X86::VPSRAVDZ256rrkz, X86::VPSRAVDZ256rmkz, 0 },
+ { X86::VPSRAVDZrrkz, X86::VPSRAVDZrmkz, 0 },
+ { X86::VPSRAVQZ128rrkz, X86::VPSRAVQZ128rmkz, 0 },
+ { X86::VPSRAVQZ256rrkz, X86::VPSRAVQZ256rmkz, 0 },
+ { X86::VPSRAVQZrrkz, X86::VPSRAVQZrmkz, 0 },
+ { X86::VPSRAVWZ128rrkz, X86::VPSRAVWZ128rmkz, 0 },
+ { X86::VPSRAVWZ256rrkz, X86::VPSRAVWZ256rmkz, 0 },
+ { X86::VPSRAVWZrrkz, X86::VPSRAVWZrmkz, 0 },
+ { X86::VPSRAWZ128rik, X86::VPSRAWZ128mik, 0 },
+ { X86::VPSRAWZ128rrkz, X86::VPSRAWZ128rmkz, 0 },
+ { X86::VPSRAWZ256rik, X86::VPSRAWZ256mik, 0 },
+ { X86::VPSRAWZ256rrkz, X86::VPSRAWZ256rmkz, 0 },
+ { X86::VPSRAWZrik, X86::VPSRAWZmik, 0 },
+ { X86::VPSRAWZrrkz, X86::VPSRAWZrmkz, 0 },
+ { X86::VPSRLDZ128rik, X86::VPSRLDZ128mik, 0 },
+ { X86::VPSRLDZ128rrkz, X86::VPSRLDZ128rmkz, 0 },
+ { X86::VPSRLDZ256rik, X86::VPSRLDZ256mik, 0 },
+ { X86::VPSRLDZ256rrkz, X86::VPSRLDZ256rmkz, 0 },
+ { X86::VPSRLDZrik, X86::VPSRLDZmik, 0 },
+ { X86::VPSRLDZrrkz, X86::VPSRLDZrmkz, 0 },
+ { X86::VPSRLQZ128rik, X86::VPSRLQZ128mik, 0 },
+ { X86::VPSRLQZ128rrkz, X86::VPSRLQZ128rmkz, 0 },
+ { X86::VPSRLQZ256rik, X86::VPSRLQZ256mik, 0 },
+ { X86::VPSRLQZ256rrkz, X86::VPSRLQZ256rmkz, 0 },
+ { X86::VPSRLQZrik, X86::VPSRLQZmik, 0 },
+ { X86::VPSRLQZrrkz, X86::VPSRLQZrmkz, 0 },
+ { X86::VPSRLVDZ128rrkz, X86::VPSRLVDZ128rmkz, 0 },
+ { X86::VPSRLVDZ256rrkz, X86::VPSRLVDZ256rmkz, 0 },
+ { X86::VPSRLVDZrrkz, X86::VPSRLVDZrmkz, 0 },
+ { X86::VPSRLVQZ128rrkz, X86::VPSRLVQZ128rmkz, 0 },
+ { X86::VPSRLVQZ256rrkz, X86::VPSRLVQZ256rmkz, 0 },
+ { X86::VPSRLVQZrrkz, X86::VPSRLVQZrmkz, 0 },
+ { X86::VPSRLVWZ128rrkz, X86::VPSRLVWZ128rmkz, 0 },
+ { X86::VPSRLVWZ256rrkz, X86::VPSRLVWZ256rmkz, 0 },
+ { X86::VPSRLVWZrrkz, X86::VPSRLVWZrmkz, 0 },
+ { X86::VPSRLWZ128rik, X86::VPSRLWZ128mik, 0 },
+ { X86::VPSRLWZ128rrkz, X86::VPSRLWZ128rmkz, 0 },
+ { X86::VPSRLWZ256rik, X86::VPSRLWZ256mik, 0 },
+ { X86::VPSRLWZ256rrkz, X86::VPSRLWZ256rmkz, 0 },
+ { X86::VPSRLWZrik, X86::VPSRLWZmik, 0 },
+ { X86::VPSRLWZrrkz, X86::VPSRLWZrmkz, 0 },
+ { X86::VPSUBBZ128rrkz, X86::VPSUBBZ128rmkz, 0 },
+ { X86::VPSUBBZ256rrkz, X86::VPSUBBZ256rmkz, 0 },
+ { X86::VPSUBBZrrkz, X86::VPSUBBZrmkz, 0 },
+ { X86::VPSUBDZ128rrkz, X86::VPSUBDZ128rmkz, 0 },
+ { X86::VPSUBDZ256rrkz, X86::VPSUBDZ256rmkz, 0 },
+ { X86::VPSUBDZrrkz, X86::VPSUBDZrmkz, 0 },
+ { X86::VPSUBQZ128rrkz, X86::VPSUBQZ128rmkz, 0 },
+ { X86::VPSUBQZ256rrkz, X86::VPSUBQZ256rmkz, 0 },
+ { X86::VPSUBQZrrkz, X86::VPSUBQZrmkz, 0 },
+ { X86::VPSUBSBZ128rrkz, X86::VPSUBSBZ128rmkz, 0 },
+ { X86::VPSUBSBZ256rrkz, X86::VPSUBSBZ256rmkz, 0 },
+ { X86::VPSUBSBZrrkz, X86::VPSUBSBZrmkz, 0 },
+ { X86::VPSUBSWZ128rrkz, X86::VPSUBSWZ128rmkz, 0 },
+ { X86::VPSUBSWZ256rrkz, X86::VPSUBSWZ256rmkz, 0 },
+ { X86::VPSUBSWZrrkz, X86::VPSUBSWZrmkz, 0 },
+ { X86::VPSUBUSBZ128rrkz, X86::VPSUBUSBZ128rmkz, 0 },
+ { X86::VPSUBUSBZ256rrkz, X86::VPSUBUSBZ256rmkz, 0 },
+ { X86::VPSUBUSBZrrkz, X86::VPSUBUSBZrmkz, 0 },
+ { X86::VPSUBUSWZ128rrkz, X86::VPSUBUSWZ128rmkz, 0 },
+ { X86::VPSUBUSWZ256rrkz, X86::VPSUBUSWZ256rmkz, 0 },
+ { X86::VPSUBUSWZrrkz, X86::VPSUBUSWZrmkz, 0 },
+ { X86::VPSUBWZ128rrkz, X86::VPSUBWZ128rmkz, 0 },
+ { X86::VPSUBWZ256rrkz, X86::VPSUBWZ256rmkz, 0 },
+ { X86::VPSUBWZrrkz, X86::VPSUBWZrmkz, 0 },
+ { X86::VPTERNLOGDZ128rri, X86::VPTERNLOGDZ128rmi, 0 },
+ { X86::VPTERNLOGDZ256rri, X86::VPTERNLOGDZ256rmi, 0 },
+ { X86::VPTERNLOGDZrri, X86::VPTERNLOGDZrmi, 0 },
+ { X86::VPTERNLOGQZ128rri, X86::VPTERNLOGQZ128rmi, 0 },
+ { X86::VPTERNLOGQZ256rri, X86::VPTERNLOGQZ256rmi, 0 },
+ { X86::VPTERNLOGQZrri, X86::VPTERNLOGQZrmi, 0 },
+ { X86::VPTESTMBZ128rrk, X86::VPTESTMBZ128rmk, 0 },
+ { X86::VPTESTMBZ256rrk, X86::VPTESTMBZ256rmk, 0 },
+ { X86::VPTESTMBZrrk, X86::VPTESTMBZrmk, 0 },
+ { X86::VPTESTMDZ128rrk, X86::VPTESTMDZ128rmk, 0 },
+ { X86::VPTESTMDZ256rrk, X86::VPTESTMDZ256rmk, 0 },
+ { X86::VPTESTMDZrrk, X86::VPTESTMDZrmk, 0 },
+ { X86::VPTESTMQZ128rrk, X86::VPTESTMQZ128rmk, 0 },
+ { X86::VPTESTMQZ256rrk, X86::VPTESTMQZ256rmk, 0 },
+ { X86::VPTESTMQZrrk, X86::VPTESTMQZrmk, 0 },
+ { X86::VPTESTMWZ128rrk, X86::VPTESTMWZ128rmk, 0 },
+ { X86::VPTESTMWZ256rrk, X86::VPTESTMWZ256rmk, 0 },
+ { X86::VPTESTMWZrrk, X86::VPTESTMWZrmk, 0 },
+ { X86::VPTESTNMBZ128rrk, X86::VPTESTNMBZ128rmk, 0 },
+ { X86::VPTESTNMBZ256rrk, X86::VPTESTNMBZ256rmk, 0 },
+ { X86::VPTESTNMBZrrk, X86::VPTESTNMBZrmk, 0 },
+ { X86::VPTESTNMDZ128rrk, X86::VPTESTNMDZ128rmk, 0 },
+ { X86::VPTESTNMDZ256rrk, X86::VPTESTNMDZ256rmk, 0 },
+ { X86::VPTESTNMDZrrk, X86::VPTESTNMDZrmk, 0 },
+ { X86::VPTESTNMQZ128rrk, X86::VPTESTNMQZ128rmk, 0 },
+ { X86::VPTESTNMQZ256rrk, X86::VPTESTNMQZ256rmk, 0 },
+ { X86::VPTESTNMQZrrk, X86::VPTESTNMQZrmk, 0 },
+ { X86::VPTESTNMWZ128rrk, X86::VPTESTNMWZ128rmk, 0 },
+ { X86::VPTESTNMWZ256rrk, X86::VPTESTNMWZ256rmk, 0 },
+ { X86::VPTESTNMWZrrk, X86::VPTESTNMWZrmk, 0 },
+ { X86::VPUNPCKHBWZ128rrkz, X86::VPUNPCKHBWZ128rmkz, 0 },
+ { X86::VPUNPCKHBWZ256rrkz, X86::VPUNPCKHBWZ256rmkz, 0 },
+ { X86::VPUNPCKHBWZrrkz, X86::VPUNPCKHBWZrmkz, 0 },
+ { X86::VPUNPCKHDQZ128rrkz, X86::VPUNPCKHDQZ128rmkz, 0 },
+ { X86::VPUNPCKHDQZ256rrkz, X86::VPUNPCKHDQZ256rmkz, 0 },
+ { X86::VPUNPCKHDQZrrkz, X86::VPUNPCKHDQZrmkz, 0 },
+ { X86::VPUNPCKHQDQZ128rrkz, X86::VPUNPCKHQDQZ128rmkz, 0 },
+ { X86::VPUNPCKHQDQZ256rrkz, X86::VPUNPCKHQDQZ256rmkz, 0 },
+ { X86::VPUNPCKHQDQZrrkz, X86::VPUNPCKHQDQZrmkz, 0 },
+ { X86::VPUNPCKHWDZ128rrkz, X86::VPUNPCKHWDZ128rmkz, 0 },
+ { X86::VPUNPCKHWDZ256rrkz, X86::VPUNPCKHWDZ256rmkz, 0 },
+ { X86::VPUNPCKHWDZrrkz, X86::VPUNPCKHWDZrmkz, 0 },
+ { X86::VPUNPCKLBWZ128rrkz, X86::VPUNPCKLBWZ128rmkz, 0 },
+ { X86::VPUNPCKLBWZ256rrkz, X86::VPUNPCKLBWZ256rmkz, 0 },
+ { X86::VPUNPCKLBWZrrkz, X86::VPUNPCKLBWZrmkz, 0 },
+ { X86::VPUNPCKLDQZ128rrkz, X86::VPUNPCKLDQZ128rmkz, 0 },
+ { X86::VPUNPCKLDQZ256rrkz, X86::VPUNPCKLDQZ256rmkz, 0 },
+ { X86::VPUNPCKLDQZrrkz, X86::VPUNPCKLDQZrmkz, 0 },
+ { X86::VPUNPCKLQDQZ128rrkz, X86::VPUNPCKLQDQZ128rmkz, 0 },
+ { X86::VPUNPCKLQDQZ256rrkz, X86::VPUNPCKLQDQZ256rmkz, 0 },
+ { X86::VPUNPCKLQDQZrrkz, X86::VPUNPCKLQDQZrmkz, 0 },
+ { X86::VPUNPCKLWDZ128rrkz, X86::VPUNPCKLWDZ128rmkz, 0 },
+ { X86::VPUNPCKLWDZ256rrkz, X86::VPUNPCKLWDZ256rmkz, 0 },
+ { X86::VPUNPCKLWDZrrkz, X86::VPUNPCKLWDZrmkz, 0 },
+ { X86::VPXORDZ128rrkz, X86::VPXORDZ128rmkz, 0 },
+ { X86::VPXORDZ256rrkz, X86::VPXORDZ256rmkz, 0 },
+ { X86::VPXORDZrrkz, X86::VPXORDZrmkz, 0 },
+ { X86::VPXORQZ128rrkz, X86::VPXORQZ128rmkz, 0 },
+ { X86::VPXORQZ256rrkz, X86::VPXORQZ256rmkz, 0 },
+ { X86::VPXORQZrrkz, X86::VPXORQZrmkz, 0 },
+ { X86::VRANGEPDZ128rrikz, X86::VRANGEPDZ128rmikz, 0 },
+ { X86::VRANGEPDZ256rrikz, X86::VRANGEPDZ256rmikz, 0 },
+ { X86::VRANGEPDZrrikz, X86::VRANGEPDZrmikz, 0 },
+ { X86::VRANGEPSZ128rrikz, X86::VRANGEPSZ128rmikz, 0 },
+ { X86::VRANGEPSZ256rrikz, X86::VRANGEPSZ256rmikz, 0 },
+ { X86::VRANGEPSZrrikz, X86::VRANGEPSZrmikz, 0 },
+ { X86::VRANGESDZrrikz, X86::VRANGESDZrmikz, TB_NO_REVERSE },
+ { X86::VRANGESSZrrikz, X86::VRANGESSZrmikz, TB_NO_REVERSE },
+ { X86::VRCP14PDZ128rk, X86::VRCP14PDZ128mk, 0 },
+ { X86::VRCP14PDZ256rk, X86::VRCP14PDZ256mk, 0 },
+ { X86::VRCP14PDZrk, X86::VRCP14PDZmk, 0 },
+ { X86::VRCP14PSZ128rk, X86::VRCP14PSZ128mk, 0 },
+ { X86::VRCP14PSZ256rk, X86::VRCP14PSZ256mk, 0 },
+ { X86::VRCP14PSZrk, X86::VRCP14PSZmk, 0 },
+ { X86::VRCP14SDZrrkz, X86::VRCP14SDZrmkz, TB_NO_REVERSE },
+ { X86::VRCP14SSZrrkz, X86::VRCP14SSZrmkz, TB_NO_REVERSE },
+ { X86::VRCP28PDZrk, X86::VRCP28PDZmk, 0 },
+ { X86::VRCP28PSZrk, X86::VRCP28PSZmk, 0 },
+ { X86::VRCP28SDZrkz, X86::VRCP28SDZmkz, TB_NO_REVERSE },
+ { X86::VRCP28SSZrkz, X86::VRCP28SSZmkz, TB_NO_REVERSE },
+ { X86::VREDUCEPDZ128rrik, X86::VREDUCEPDZ128rmik, 0 },
+ { X86::VREDUCEPDZ256rrik, X86::VREDUCEPDZ256rmik, 0 },
+ { X86::VREDUCEPDZrrik, X86::VREDUCEPDZrmik, 0 },
+ { X86::VREDUCEPSZ128rrik, X86::VREDUCEPSZ128rmik, 0 },
+ { X86::VREDUCEPSZ256rrik, X86::VREDUCEPSZ256rmik, 0 },
+ { X86::VREDUCEPSZrrik, X86::VREDUCEPSZrmik, 0 },
+ { X86::VREDUCESDZrrikz, X86::VREDUCESDZrmikz, TB_NO_REVERSE },
+ { X86::VREDUCESSZrrikz, X86::VREDUCESSZrmikz, TB_NO_REVERSE },
+ { X86::VRNDSCALEPDZ128rrik, X86::VRNDSCALEPDZ128rmik, 0 },
+ { X86::VRNDSCALEPDZ256rrik, X86::VRNDSCALEPDZ256rmik, 0 },
+ { X86::VRNDSCALEPDZrrik, X86::VRNDSCALEPDZrmik, 0 },
+ { X86::VRNDSCALEPSZ128rrik, X86::VRNDSCALEPSZ128rmik, 0 },
+ { X86::VRNDSCALEPSZ256rrik, X86::VRNDSCALEPSZ256rmik, 0 },
+ { X86::VRNDSCALEPSZrrik, X86::VRNDSCALEPSZrmik, 0 },
+ { X86::VRNDSCALESDZr_Intkz, X86::VRNDSCALESDZm_Intkz, TB_NO_REVERSE },
+ { X86::VRNDSCALESSZr_Intkz, X86::VRNDSCALESSZm_Intkz, TB_NO_REVERSE },
+ { X86::VRSQRT14PDZ128rk, X86::VRSQRT14PDZ128mk, 0 },
+ { X86::VRSQRT14PDZ256rk, X86::VRSQRT14PDZ256mk, 0 },
+ { X86::VRSQRT14PDZrk, X86::VRSQRT14PDZmk, 0 },
+ { X86::VRSQRT14PSZ128rk, X86::VRSQRT14PSZ128mk, 0 },
+ { X86::VRSQRT14PSZ256rk, X86::VRSQRT14PSZ256mk, 0 },
+ { X86::VRSQRT14PSZrk, X86::VRSQRT14PSZmk, 0 },
+ { X86::VRSQRT14SDZrrkz, X86::VRSQRT14SDZrmkz, TB_NO_REVERSE },
+ { X86::VRSQRT14SSZrrkz, X86::VRSQRT14SSZrmkz, TB_NO_REVERSE },
+ { X86::VRSQRT28PDZrk, X86::VRSQRT28PDZmk, 0 },
+ { X86::VRSQRT28PSZrk, X86::VRSQRT28PSZmk, 0 },
+ { X86::VRSQRT28SDZrkz, X86::VRSQRT28SDZmkz, TB_NO_REVERSE },
+ { X86::VRSQRT28SSZrkz, X86::VRSQRT28SSZmkz, TB_NO_REVERSE },
+ { X86::VSCALEFPDZ128rrkz, X86::VSCALEFPDZ128rmkz, 0 },
+ { X86::VSCALEFPDZ256rrkz, X86::VSCALEFPDZ256rmkz, 0 },
+ { X86::VSCALEFPDZrrkz, X86::VSCALEFPDZrmkz, 0 },
+ { X86::VSCALEFPSZ128rrkz, X86::VSCALEFPSZ128rmkz, 0 },
+ { X86::VSCALEFPSZ256rrkz, X86::VSCALEFPSZ256rmkz, 0 },
+ { X86::VSCALEFPSZrrkz, X86::VSCALEFPSZrmkz, 0 },
+ { X86::VSCALEFSDZrrkz, X86::VSCALEFSDZrmkz, TB_NO_REVERSE },
+ { X86::VSCALEFSSZrrkz, X86::VSCALEFSSZrmkz, TB_NO_REVERSE },
+ { X86::VSHUFF32X4Z256rrikz, X86::VSHUFF32X4Z256rmikz, 0 },
+ { X86::VSHUFF32X4Zrrikz, X86::VSHUFF32X4Zrmikz, 0 },
+ { X86::VSHUFF64X2Z256rrikz, X86::VSHUFF64X2Z256rmikz, 0 },
+ { X86::VSHUFF64X2Zrrikz, X86::VSHUFF64X2Zrmikz, 0 },
+ { X86::VSHUFI32X4Z256rrikz, X86::VSHUFI32X4Z256rmikz, 0 },
+ { X86::VSHUFI32X4Zrrikz, X86::VSHUFI32X4Zrmikz, 0 },
+ { X86::VSHUFI64X2Z256rrikz, X86::VSHUFI64X2Z256rmikz, 0 },
+ { X86::VSHUFI64X2Zrrikz, X86::VSHUFI64X2Zrmikz, 0 },
+ { X86::VSHUFPDZ128rrikz, X86::VSHUFPDZ128rmikz, 0 },
+ { X86::VSHUFPDZ256rrikz, X86::VSHUFPDZ256rmikz, 0 },
+ { X86::VSHUFPDZrrikz, X86::VSHUFPDZrmikz, 0 },
+ { X86::VSHUFPSZ128rrikz, X86::VSHUFPSZ128rmikz, 0 },
+ { X86::VSHUFPSZ256rrikz, X86::VSHUFPSZ256rmikz, 0 },
+ { X86::VSHUFPSZrrikz, X86::VSHUFPSZrmikz, 0 },
+ { X86::VSQRTPDZ128rk, X86::VSQRTPDZ128mk, 0 },
+ { X86::VSQRTPDZ256rk, X86::VSQRTPDZ256mk, 0 },
+ { X86::VSQRTPDZrk, X86::VSQRTPDZmk, 0 },
+ { X86::VSQRTPSZ128rk, X86::VSQRTPSZ128mk, 0 },
+ { X86::VSQRTPSZ256rk, X86::VSQRTPSZ256mk, 0 },
+ { X86::VSQRTPSZrk, X86::VSQRTPSZmk, 0 },
+ { X86::VSQRTSDZr_Intkz, X86::VSQRTSDZm_Intkz, TB_NO_REVERSE },
+ { X86::VSQRTSSZr_Intkz, X86::VSQRTSSZm_Intkz, TB_NO_REVERSE },
+ { X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 },
+ { X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 },
+ { X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 },
+ { X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 },
+ { X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 },
+ { X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 },
+ { X86::VSUBSDZrr_Intkz, X86::VSUBSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VSUBSSZrr_Intkz, X86::VSUBSSZrm_Intkz, TB_NO_REVERSE },
+ { X86::VUNPCKHPDZ128rrkz, X86::VUNPCKHPDZ128rmkz, 0 },
+ { X86::VUNPCKHPDZ256rrkz, X86::VUNPCKHPDZ256rmkz, 0 },
+ { X86::VUNPCKHPDZrrkz, X86::VUNPCKHPDZrmkz, 0 },
+ { X86::VUNPCKHPSZ128rrkz, X86::VUNPCKHPSZ128rmkz, 0 },
+ { X86::VUNPCKHPSZ256rrkz, X86::VUNPCKHPSZ256rmkz, 0 },
+ { X86::VUNPCKHPSZrrkz, X86::VUNPCKHPSZrmkz, 0 },
+ { X86::VUNPCKLPDZ128rrkz, X86::VUNPCKLPDZ128rmkz, 0 },
+ { X86::VUNPCKLPDZ256rrkz, X86::VUNPCKLPDZ256rmkz, 0 },
+ { X86::VUNPCKLPDZrrkz, X86::VUNPCKLPDZrmkz, 0 },
+ { X86::VUNPCKLPSZ128rrkz, X86::VUNPCKLPSZ128rmkz, 0 },
+ { X86::VUNPCKLPSZ256rrkz, X86::VUNPCKLPSZ256rmkz, 0 },
+ { X86::VUNPCKLPSZrrkz, X86::VUNPCKLPSZrmkz, 0 },
+ { X86::VXORPDZ128rrkz, X86::VXORPDZ128rmkz, 0 },
+ { X86::VXORPDZ256rrkz, X86::VXORPDZ256rmkz, 0 },
+ { X86::VXORPDZrrkz, X86::VXORPDZrmkz, 0 },
+ { X86::VXORPSZ128rrkz, X86::VXORPSZ128rmkz, 0 },
+ { X86::VXORPSZ256rrkz, X86::VXORPSZ256rmkz, 0 },
+ { X86::VXORPSZrrkz, X86::VXORPSZrmkz, 0 },
+};
+
+static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
+ { X86::VADDPDZ128rrk, X86::VADDPDZ128rmk, 0 },
+ { X86::VADDPDZ256rrk, X86::VADDPDZ256rmk, 0 },
+ { X86::VADDPDZrrk, X86::VADDPDZrmk, 0 },
+ { X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0 },
+ { X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0 },
+ { X86::VADDPSZrrk, X86::VADDPSZrmk, 0 },
+ { X86::VADDSDZrr_Intk, X86::VADDSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VADDSSZrr_Intk, X86::VADDSSZrm_Intk, TB_NO_REVERSE },
+ { X86::VALIGNDZ128rrik, X86::VALIGNDZ128rmik, 0 },
+ { X86::VALIGNDZ256rrik, X86::VALIGNDZ256rmik, 0 },
+ { X86::VALIGNDZrrik, X86::VALIGNDZrmik, 0 },
+ { X86::VALIGNQZ128rrik, X86::VALIGNQZ128rmik, 0 },
+ { X86::VALIGNQZ256rrik, X86::VALIGNQZ256rmik, 0 },
+ { X86::VALIGNQZrrik, X86::VALIGNQZrmik, 0 },
+ { X86::VANDNPDZ128rrk, X86::VANDNPDZ128rmk, 0 },
+ { X86::VANDNPDZ256rrk, X86::VANDNPDZ256rmk, 0 },
+ { X86::VANDNPDZrrk, X86::VANDNPDZrmk, 0 },
+ { X86::VANDNPSZ128rrk, X86::VANDNPSZ128rmk, 0 },
+ { X86::VANDNPSZ256rrk, X86::VANDNPSZ256rmk, 0 },
+ { X86::VANDNPSZrrk, X86::VANDNPSZrmk, 0 },
+ { X86::VANDPDZ128rrk, X86::VANDPDZ128rmk, 0 },
+ { X86::VANDPDZ256rrk, X86::VANDPDZ256rmk, 0 },
+ { X86::VANDPDZrrk, X86::VANDPDZrmk, 0 },
+ { X86::VANDPSZ128rrk, X86::VANDPSZ128rmk, 0 },
+ { X86::VANDPSZ256rrk, X86::VANDPSZ256rmk, 0 },
+ { X86::VANDPSZrrk, X86::VANDPSZrmk, 0 },
+ { X86::VCVTSD2SSZrr_Intk, X86::VCVTSD2SSZrm_Intk, TB_NO_REVERSE },
+ { X86::VCVTSS2SDZrr_Intk, X86::VCVTSS2SDZrm_Intk, TB_NO_REVERSE },
+ { X86::VDBPSADBWZ128rrik, X86::VDBPSADBWZ128rmik, 0 },
+ { X86::VDBPSADBWZ256rrik, X86::VDBPSADBWZ256rmik, 0 },
+ { X86::VDBPSADBWZrrik, X86::VDBPSADBWZrmik, 0 },
+ { X86::VDIVPDZ128rrk, X86::VDIVPDZ128rmk, 0 },
+ { X86::VDIVPDZ256rrk, X86::VDIVPDZ256rmk, 0 },
+ { X86::VDIVPDZrrk, X86::VDIVPDZrmk, 0 },
+ { X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0 },
+ { X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0 },
+ { X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 },
+ { X86::VDIVSDZrr_Intk, X86::VDIVSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VDIVSSZrr_Intk, X86::VDIVSSZrm_Intk, TB_NO_REVERSE },
+ { X86::VFIXUPIMMPDZ128rrik, X86::VFIXUPIMMPDZ128rmik, 0 },
+ { X86::VFIXUPIMMPDZ128rrikz, X86::VFIXUPIMMPDZ128rmikz, 0 },
+ { X86::VFIXUPIMMPDZ256rrik, X86::VFIXUPIMMPDZ256rmik, 0 },
+ { X86::VFIXUPIMMPDZ256rrikz, X86::VFIXUPIMMPDZ256rmikz, 0 },
+ { X86::VFIXUPIMMPDZrrik, X86::VFIXUPIMMPDZrmik, 0 },
+ { X86::VFIXUPIMMPDZrrikz, X86::VFIXUPIMMPDZrmikz, 0 },
+ { X86::VFIXUPIMMPSZ128rrik, X86::VFIXUPIMMPSZ128rmik, 0 },
+ { X86::VFIXUPIMMPSZ128rrikz, X86::VFIXUPIMMPSZ128rmikz, 0 },
+ { X86::VFIXUPIMMPSZ256rrik, X86::VFIXUPIMMPSZ256rmik, 0 },
+ { X86::VFIXUPIMMPSZ256rrikz, X86::VFIXUPIMMPSZ256rmikz, 0 },
+ { X86::VFIXUPIMMPSZrrik, X86::VFIXUPIMMPSZrmik, 0 },
+ { X86::VFIXUPIMMPSZrrikz, X86::VFIXUPIMMPSZrmikz, 0 },
+ { X86::VFIXUPIMMSDZrrik, X86::VFIXUPIMMSDZrmik, TB_NO_REVERSE },
+ { X86::VFIXUPIMMSDZrrikz, X86::VFIXUPIMMSDZrmikz, TB_NO_REVERSE },
+ { X86::VFIXUPIMMSSZrrik, X86::VFIXUPIMMSSZrmik, TB_NO_REVERSE },
+ { X86::VFIXUPIMMSSZrrikz, X86::VFIXUPIMMSSZrmikz, TB_NO_REVERSE },
+ { X86::VFMADD132PDZ128rk, X86::VFMADD132PDZ128mk, 0 },
+ { X86::VFMADD132PDZ128rkz, X86::VFMADD132PDZ128mkz, 0 },
+ { X86::VFMADD132PDZ256rk, X86::VFMADD132PDZ256mk, 0 },
+ { X86::VFMADD132PDZ256rkz, X86::VFMADD132PDZ256mkz, 0 },
+ { X86::VFMADD132PDZrk, X86::VFMADD132PDZmk, 0 },
+ { X86::VFMADD132PDZrkz, X86::VFMADD132PDZmkz, 0 },
+ { X86::VFMADD132PSZ128rk, X86::VFMADD132PSZ128mk, 0 },
+ { X86::VFMADD132PSZ128rkz, X86::VFMADD132PSZ128mkz, 0 },
+ { X86::VFMADD132PSZ256rk, X86::VFMADD132PSZ256mk, 0 },
+ { X86::VFMADD132PSZ256rkz, X86::VFMADD132PSZ256mkz, 0 },
+ { X86::VFMADD132PSZrk, X86::VFMADD132PSZmk, 0 },
+ { X86::VFMADD132PSZrkz, X86::VFMADD132PSZmkz, 0 },
+ { X86::VFMADD132SDZr_Intk, X86::VFMADD132SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFMADD132SDZr_Intkz, X86::VFMADD132SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMADD132SSZr_Intk, X86::VFMADD132SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFMADD132SSZr_Intkz, X86::VFMADD132SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMADD213PDZ128rk, X86::VFMADD213PDZ128mk, 0 },
+ { X86::VFMADD213PDZ128rkz, X86::VFMADD213PDZ128mkz, 0 },
+ { X86::VFMADD213PDZ256rk, X86::VFMADD213PDZ256mk, 0 },
+ { X86::VFMADD213PDZ256rkz, X86::VFMADD213PDZ256mkz, 0 },
+ { X86::VFMADD213PDZrk, X86::VFMADD213PDZmk, 0 },
+ { X86::VFMADD213PDZrkz, X86::VFMADD213PDZmkz, 0 },
+ { X86::VFMADD213PSZ128rk, X86::VFMADD213PSZ128mk, 0 },
+ { X86::VFMADD213PSZ128rkz, X86::VFMADD213PSZ128mkz, 0 },
+ { X86::VFMADD213PSZ256rk, X86::VFMADD213PSZ256mk, 0 },
+ { X86::VFMADD213PSZ256rkz, X86::VFMADD213PSZ256mkz, 0 },
+ { X86::VFMADD213PSZrk, X86::VFMADD213PSZmk, 0 },
+ { X86::VFMADD213PSZrkz, X86::VFMADD213PSZmkz, 0 },
+ { X86::VFMADD213SDZr_Intk, X86::VFMADD213SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFMADD213SDZr_Intkz, X86::VFMADD213SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMADD213SSZr_Intk, X86::VFMADD213SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFMADD213SSZr_Intkz, X86::VFMADD213SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMADD231PDZ128rk, X86::VFMADD231PDZ128mk, 0 },
+ { X86::VFMADD231PDZ128rkz, X86::VFMADD231PDZ128mkz, 0 },
+ { X86::VFMADD231PDZ256rk, X86::VFMADD231PDZ256mk, 0 },
+ { X86::VFMADD231PDZ256rkz, X86::VFMADD231PDZ256mkz, 0 },
+ { X86::VFMADD231PDZrk, X86::VFMADD231PDZmk, 0 },
+ { X86::VFMADD231PDZrkz, X86::VFMADD231PDZmkz, 0 },
+ { X86::VFMADD231PSZ128rk, X86::VFMADD231PSZ128mk, 0 },
+ { X86::VFMADD231PSZ128rkz, X86::VFMADD231PSZ128mkz, 0 },
+ { X86::VFMADD231PSZ256rk, X86::VFMADD231PSZ256mk, 0 },
+ { X86::VFMADD231PSZ256rkz, X86::VFMADD231PSZ256mkz, 0 },
+ { X86::VFMADD231PSZrk, X86::VFMADD231PSZmk, 0 },
+ { X86::VFMADD231PSZrkz, X86::VFMADD231PSZmkz, 0 },
+ { X86::VFMADD231SDZr_Intk, X86::VFMADD231SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFMADD231SDZr_Intkz, X86::VFMADD231SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMADD231SSZr_Intk, X86::VFMADD231SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFMADD231SSZr_Intkz, X86::VFMADD231SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMADDSUB132PDZ128rk, X86::VFMADDSUB132PDZ128mk, 0 },
+ { X86::VFMADDSUB132PDZ128rkz, X86::VFMADDSUB132PDZ128mkz, 0 },
+ { X86::VFMADDSUB132PDZ256rk, X86::VFMADDSUB132PDZ256mk, 0 },
+ { X86::VFMADDSUB132PDZ256rkz, X86::VFMADDSUB132PDZ256mkz, 0 },
+ { X86::VFMADDSUB132PDZrk, X86::VFMADDSUB132PDZmk, 0 },
+ { X86::VFMADDSUB132PDZrkz, X86::VFMADDSUB132PDZmkz, 0 },
+ { X86::VFMADDSUB132PSZ128rk, X86::VFMADDSUB132PSZ128mk, 0 },
+ { X86::VFMADDSUB132PSZ128rkz, X86::VFMADDSUB132PSZ128mkz, 0 },
+ { X86::VFMADDSUB132PSZ256rk, X86::VFMADDSUB132PSZ256mk, 0 },
+ { X86::VFMADDSUB132PSZ256rkz, X86::VFMADDSUB132PSZ256mkz, 0 },
+ { X86::VFMADDSUB132PSZrk, X86::VFMADDSUB132PSZmk, 0 },
+ { X86::VFMADDSUB132PSZrkz, X86::VFMADDSUB132PSZmkz, 0 },
+ { X86::VFMADDSUB213PDZ128rk, X86::VFMADDSUB213PDZ128mk, 0 },
+ { X86::VFMADDSUB213PDZ128rkz, X86::VFMADDSUB213PDZ128mkz, 0 },
+ { X86::VFMADDSUB213PDZ256rk, X86::VFMADDSUB213PDZ256mk, 0 },
+ { X86::VFMADDSUB213PDZ256rkz, X86::VFMADDSUB213PDZ256mkz, 0 },
+ { X86::VFMADDSUB213PDZrk, X86::VFMADDSUB213PDZmk, 0 },
+ { X86::VFMADDSUB213PDZrkz, X86::VFMADDSUB213PDZmkz, 0 },
+ { X86::VFMADDSUB213PSZ128rk, X86::VFMADDSUB213PSZ128mk, 0 },
+ { X86::VFMADDSUB213PSZ128rkz, X86::VFMADDSUB213PSZ128mkz, 0 },
+ { X86::VFMADDSUB213PSZ256rk, X86::VFMADDSUB213PSZ256mk, 0 },
+ { X86::VFMADDSUB213PSZ256rkz, X86::VFMADDSUB213PSZ256mkz, 0 },
+ { X86::VFMADDSUB213PSZrk, X86::VFMADDSUB213PSZmk, 0 },
+ { X86::VFMADDSUB213PSZrkz, X86::VFMADDSUB213PSZmkz, 0 },
+ { X86::VFMADDSUB231PDZ128rk, X86::VFMADDSUB231PDZ128mk, 0 },
+ { X86::VFMADDSUB231PDZ128rkz, X86::VFMADDSUB231PDZ128mkz, 0 },
+ { X86::VFMADDSUB231PDZ256rk, X86::VFMADDSUB231PDZ256mk, 0 },
+ { X86::VFMADDSUB231PDZ256rkz, X86::VFMADDSUB231PDZ256mkz, 0 },
+ { X86::VFMADDSUB231PDZrk, X86::VFMADDSUB231PDZmk, 0 },
+ { X86::VFMADDSUB231PDZrkz, X86::VFMADDSUB231PDZmkz, 0 },
+ { X86::VFMADDSUB231PSZ128rk, X86::VFMADDSUB231PSZ128mk, 0 },
+ { X86::VFMADDSUB231PSZ128rkz, X86::VFMADDSUB231PSZ128mkz, 0 },
+ { X86::VFMADDSUB231PSZ256rk, X86::VFMADDSUB231PSZ256mk, 0 },
+ { X86::VFMADDSUB231PSZ256rkz, X86::VFMADDSUB231PSZ256mkz, 0 },
+ { X86::VFMADDSUB231PSZrk, X86::VFMADDSUB231PSZmk, 0 },
+ { X86::VFMADDSUB231PSZrkz, X86::VFMADDSUB231PSZmkz, 0 },
+ { X86::VFMSUB132PDZ128rk, X86::VFMSUB132PDZ128mk, 0 },
+ { X86::VFMSUB132PDZ128rkz, X86::VFMSUB132PDZ128mkz, 0 },
+ { X86::VFMSUB132PDZ256rk, X86::VFMSUB132PDZ256mk, 0 },
+ { X86::VFMSUB132PDZ256rkz, X86::VFMSUB132PDZ256mkz, 0 },
+ { X86::VFMSUB132PDZrk, X86::VFMSUB132PDZmk, 0 },
+ { X86::VFMSUB132PDZrkz, X86::VFMSUB132PDZmkz, 0 },
+ { X86::VFMSUB132PSZ128rk, X86::VFMSUB132PSZ128mk, 0 },
+ { X86::VFMSUB132PSZ128rkz, X86::VFMSUB132PSZ128mkz, 0 },
+ { X86::VFMSUB132PSZ256rk, X86::VFMSUB132PSZ256mk, 0 },
+ { X86::VFMSUB132PSZ256rkz, X86::VFMSUB132PSZ256mkz, 0 },
+ { X86::VFMSUB132PSZrk, X86::VFMSUB132PSZmk, 0 },
+ { X86::VFMSUB132PSZrkz, X86::VFMSUB132PSZmkz, 0 },
+ { X86::VFMSUB132SDZr_Intk, X86::VFMSUB132SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFMSUB132SDZr_Intkz, X86::VFMSUB132SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMSUB132SSZr_Intk, X86::VFMSUB132SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFMSUB132SSZr_Intkz, X86::VFMSUB132SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMSUB213PDZ128rk, X86::VFMSUB213PDZ128mk, 0 },
+ { X86::VFMSUB213PDZ128rkz, X86::VFMSUB213PDZ128mkz, 0 },
+ { X86::VFMSUB213PDZ256rk, X86::VFMSUB213PDZ256mk, 0 },
+ { X86::VFMSUB213PDZ256rkz, X86::VFMSUB213PDZ256mkz, 0 },
+ { X86::VFMSUB213PDZrk, X86::VFMSUB213PDZmk, 0 },
+ { X86::VFMSUB213PDZrkz, X86::VFMSUB213PDZmkz, 0 },
+ { X86::VFMSUB213PSZ128rk, X86::VFMSUB213PSZ128mk, 0 },
+ { X86::VFMSUB213PSZ128rkz, X86::VFMSUB213PSZ128mkz, 0 },
+ { X86::VFMSUB213PSZ256rk, X86::VFMSUB213PSZ256mk, 0 },
+ { X86::VFMSUB213PSZ256rkz, X86::VFMSUB213PSZ256mkz, 0 },
+ { X86::VFMSUB213PSZrk, X86::VFMSUB213PSZmk, 0 },
+ { X86::VFMSUB213PSZrkz, X86::VFMSUB213PSZmkz, 0 },
+ { X86::VFMSUB213SDZr_Intk, X86::VFMSUB213SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFMSUB213SDZr_Intkz, X86::VFMSUB213SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMSUB213SSZr_Intk, X86::VFMSUB213SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFMSUB213SSZr_Intkz, X86::VFMSUB213SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMSUB231PDZ128rk, X86::VFMSUB231PDZ128mk, 0 },
+ { X86::VFMSUB231PDZ128rkz, X86::VFMSUB231PDZ128mkz, 0 },
+ { X86::VFMSUB231PDZ256rk, X86::VFMSUB231PDZ256mk, 0 },
+ { X86::VFMSUB231PDZ256rkz, X86::VFMSUB231PDZ256mkz, 0 },
+ { X86::VFMSUB231PDZrk, X86::VFMSUB231PDZmk, 0 },
+ { X86::VFMSUB231PDZrkz, X86::VFMSUB231PDZmkz, 0 },
+ { X86::VFMSUB231PSZ128rk, X86::VFMSUB231PSZ128mk, 0 },
+ { X86::VFMSUB231PSZ128rkz, X86::VFMSUB231PSZ128mkz, 0 },
+ { X86::VFMSUB231PSZ256rk, X86::VFMSUB231PSZ256mk, 0 },
+ { X86::VFMSUB231PSZ256rkz, X86::VFMSUB231PSZ256mkz, 0 },
+ { X86::VFMSUB231PSZrk, X86::VFMSUB231PSZmk, 0 },
+ { X86::VFMSUB231PSZrkz, X86::VFMSUB231PSZmkz, 0 },
+ { X86::VFMSUB231SDZr_Intk, X86::VFMSUB231SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFMSUB231SDZr_Intkz, X86::VFMSUB231SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMSUB231SSZr_Intk, X86::VFMSUB231SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFMSUB231SSZr_Intkz, X86::VFMSUB231SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMSUBADD132PDZ128rk, X86::VFMSUBADD132PDZ128mk, 0 },
+ { X86::VFMSUBADD132PDZ128rkz, X86::VFMSUBADD132PDZ128mkz, 0 },
+ { X86::VFMSUBADD132PDZ256rk, X86::VFMSUBADD132PDZ256mk, 0 },
+ { X86::VFMSUBADD132PDZ256rkz, X86::VFMSUBADD132PDZ256mkz, 0 },
+ { X86::VFMSUBADD132PDZrk, X86::VFMSUBADD132PDZmk, 0 },
+ { X86::VFMSUBADD132PDZrkz, X86::VFMSUBADD132PDZmkz, 0 },
+ { X86::VFMSUBADD132PSZ128rk, X86::VFMSUBADD132PSZ128mk, 0 },
+ { X86::VFMSUBADD132PSZ128rkz, X86::VFMSUBADD132PSZ128mkz, 0 },
+ { X86::VFMSUBADD132PSZ256rk, X86::VFMSUBADD132PSZ256mk, 0 },
+ { X86::VFMSUBADD132PSZ256rkz, X86::VFMSUBADD132PSZ256mkz, 0 },
+ { X86::VFMSUBADD132PSZrk, X86::VFMSUBADD132PSZmk, 0 },
+ { X86::VFMSUBADD132PSZrkz, X86::VFMSUBADD132PSZmkz, 0 },
+ { X86::VFMSUBADD213PDZ128rk, X86::VFMSUBADD213PDZ128mk, 0 },
+ { X86::VFMSUBADD213PDZ128rkz, X86::VFMSUBADD213PDZ128mkz, 0 },
+ { X86::VFMSUBADD213PDZ256rk, X86::VFMSUBADD213PDZ256mk, 0 },
+ { X86::VFMSUBADD213PDZ256rkz, X86::VFMSUBADD213PDZ256mkz, 0 },
+ { X86::VFMSUBADD213PDZrk, X86::VFMSUBADD213PDZmk, 0 },
+ { X86::VFMSUBADD213PDZrkz, X86::VFMSUBADD213PDZmkz, 0 },
+ { X86::VFMSUBADD213PSZ128rk, X86::VFMSUBADD213PSZ128mk, 0 },
+ { X86::VFMSUBADD213PSZ128rkz, X86::VFMSUBADD213PSZ128mkz, 0 },
+ { X86::VFMSUBADD213PSZ256rk, X86::VFMSUBADD213PSZ256mk, 0 },
+ { X86::VFMSUBADD213PSZ256rkz, X86::VFMSUBADD213PSZ256mkz, 0 },
+ { X86::VFMSUBADD213PSZrk, X86::VFMSUBADD213PSZmk, 0 },
+ { X86::VFMSUBADD213PSZrkz, X86::VFMSUBADD213PSZmkz, 0 },
+ { X86::VFMSUBADD231PDZ128rk, X86::VFMSUBADD231PDZ128mk, 0 },
+ { X86::VFMSUBADD231PDZ128rkz, X86::VFMSUBADD231PDZ128mkz, 0 },
+ { X86::VFMSUBADD231PDZ256rk, X86::VFMSUBADD231PDZ256mk, 0 },
+ { X86::VFMSUBADD231PDZ256rkz, X86::VFMSUBADD231PDZ256mkz, 0 },
+ { X86::VFMSUBADD231PDZrk, X86::VFMSUBADD231PDZmk, 0 },
+ { X86::VFMSUBADD231PDZrkz, X86::VFMSUBADD231PDZmkz, 0 },
+ { X86::VFMSUBADD231PSZ128rk, X86::VFMSUBADD231PSZ128mk, 0 },
+ { X86::VFMSUBADD231PSZ128rkz, X86::VFMSUBADD231PSZ128mkz, 0 },
+ { X86::VFMSUBADD231PSZ256rk, X86::VFMSUBADD231PSZ256mk, 0 },
+ { X86::VFMSUBADD231PSZ256rkz, X86::VFMSUBADD231PSZ256mkz, 0 },
+ { X86::VFMSUBADD231PSZrk, X86::VFMSUBADD231PSZmk, 0 },
+ { X86::VFMSUBADD231PSZrkz, X86::VFMSUBADD231PSZmkz, 0 },
+ { X86::VFNMADD132PDZ128rk, X86::VFNMADD132PDZ128mk, 0 },
+ { X86::VFNMADD132PDZ128rkz, X86::VFNMADD132PDZ128mkz, 0 },
+ { X86::VFNMADD132PDZ256rk, X86::VFNMADD132PDZ256mk, 0 },
+ { X86::VFNMADD132PDZ256rkz, X86::VFNMADD132PDZ256mkz, 0 },
+ { X86::VFNMADD132PDZrk, X86::VFNMADD132PDZmk, 0 },
+ { X86::VFNMADD132PDZrkz, X86::VFNMADD132PDZmkz, 0 },
+ { X86::VFNMADD132PSZ128rk, X86::VFNMADD132PSZ128mk, 0 },
+ { X86::VFNMADD132PSZ128rkz, X86::VFNMADD132PSZ128mkz, 0 },
+ { X86::VFNMADD132PSZ256rk, X86::VFNMADD132PSZ256mk, 0 },
+ { X86::VFNMADD132PSZ256rkz, X86::VFNMADD132PSZ256mkz, 0 },
+ { X86::VFNMADD132PSZrk, X86::VFNMADD132PSZmk, 0 },
+ { X86::VFNMADD132PSZrkz, X86::VFNMADD132PSZmkz, 0 },
+ { X86::VFNMADD132SDZr_Intk, X86::VFNMADD132SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMADD132SDZr_Intkz, X86::VFNMADD132SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMADD132SSZr_Intk, X86::VFNMADD132SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMADD132SSZr_Intkz, X86::VFNMADD132SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMADD213PDZ128rk, X86::VFNMADD213PDZ128mk, 0 },
+ { X86::VFNMADD213PDZ128rkz, X86::VFNMADD213PDZ128mkz, 0 },
+ { X86::VFNMADD213PDZ256rk, X86::VFNMADD213PDZ256mk, 0 },
+ { X86::VFNMADD213PDZ256rkz, X86::VFNMADD213PDZ256mkz, 0 },
+ { X86::VFNMADD213PDZrk, X86::VFNMADD213PDZmk, 0 },
+ { X86::VFNMADD213PDZrkz, X86::VFNMADD213PDZmkz, 0 },
+ { X86::VFNMADD213PSZ128rk, X86::VFNMADD213PSZ128mk, 0 },
+ { X86::VFNMADD213PSZ128rkz, X86::VFNMADD213PSZ128mkz, 0 },
+ { X86::VFNMADD213PSZ256rk, X86::VFNMADD213PSZ256mk, 0 },
+ { X86::VFNMADD213PSZ256rkz, X86::VFNMADD213PSZ256mkz, 0 },
+ { X86::VFNMADD213PSZrk, X86::VFNMADD213PSZmk, 0 },
+ { X86::VFNMADD213PSZrkz, X86::VFNMADD213PSZmkz, 0 },
+ { X86::VFNMADD213SDZr_Intk, X86::VFNMADD213SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMADD213SDZr_Intkz, X86::VFNMADD213SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMADD213SSZr_Intk, X86::VFNMADD213SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMADD213SSZr_Intkz, X86::VFNMADD213SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMADD231PDZ128rk, X86::VFNMADD231PDZ128mk, 0 },
+ { X86::VFNMADD231PDZ128rkz, X86::VFNMADD231PDZ128mkz, 0 },
+ { X86::VFNMADD231PDZ256rk, X86::VFNMADD231PDZ256mk, 0 },
+ { X86::VFNMADD231PDZ256rkz, X86::VFNMADD231PDZ256mkz, 0 },
+ { X86::VFNMADD231PDZrk, X86::VFNMADD231PDZmk, 0 },
+ { X86::VFNMADD231PDZrkz, X86::VFNMADD231PDZmkz, 0 },
+ { X86::VFNMADD231PSZ128rk, X86::VFNMADD231PSZ128mk, 0 },
+ { X86::VFNMADD231PSZ128rkz, X86::VFNMADD231PSZ128mkz, 0 },
+ { X86::VFNMADD231PSZ256rk, X86::VFNMADD231PSZ256mk, 0 },
+ { X86::VFNMADD231PSZ256rkz, X86::VFNMADD231PSZ256mkz, 0 },
+ { X86::VFNMADD231PSZrk, X86::VFNMADD231PSZmk, 0 },
+ { X86::VFNMADD231PSZrkz, X86::VFNMADD231PSZmkz, 0 },
+ { X86::VFNMADD231SDZr_Intk, X86::VFNMADD231SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMADD231SDZr_Intkz, X86::VFNMADD231SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMADD231SSZr_Intk, X86::VFNMADD231SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMADD231SSZr_Intkz, X86::VFNMADD231SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMSUB132PDZ128rk, X86::VFNMSUB132PDZ128mk, 0 },
+ { X86::VFNMSUB132PDZ128rkz, X86::VFNMSUB132PDZ128mkz, 0 },
+ { X86::VFNMSUB132PDZ256rk, X86::VFNMSUB132PDZ256mk, 0 },
+ { X86::VFNMSUB132PDZ256rkz, X86::VFNMSUB132PDZ256mkz, 0 },
+ { X86::VFNMSUB132PDZrk, X86::VFNMSUB132PDZmk, 0 },
+ { X86::VFNMSUB132PDZrkz, X86::VFNMSUB132PDZmkz, 0 },
+ { X86::VFNMSUB132PSZ128rk, X86::VFNMSUB132PSZ128mk, 0 },
+ { X86::VFNMSUB132PSZ128rkz, X86::VFNMSUB132PSZ128mkz, 0 },
+ { X86::VFNMSUB132PSZ256rk, X86::VFNMSUB132PSZ256mk, 0 },
+ { X86::VFNMSUB132PSZ256rkz, X86::VFNMSUB132PSZ256mkz, 0 },
+ { X86::VFNMSUB132PSZrk, X86::VFNMSUB132PSZmk, 0 },
+ { X86::VFNMSUB132PSZrkz, X86::VFNMSUB132PSZmkz, 0 },
+ { X86::VFNMSUB132SDZr_Intk, X86::VFNMSUB132SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMSUB132SDZr_Intkz, X86::VFNMSUB132SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMSUB132SSZr_Intk, X86::VFNMSUB132SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMSUB132SSZr_Intkz, X86::VFNMSUB132SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMSUB213PDZ128rk, X86::VFNMSUB213PDZ128mk, 0 },
+ { X86::VFNMSUB213PDZ128rkz, X86::VFNMSUB213PDZ128mkz, 0 },
+ { X86::VFNMSUB213PDZ256rk, X86::VFNMSUB213PDZ256mk, 0 },
+ { X86::VFNMSUB213PDZ256rkz, X86::VFNMSUB213PDZ256mkz, 0 },
+ { X86::VFNMSUB213PDZrk, X86::VFNMSUB213PDZmk, 0 },
+ { X86::VFNMSUB213PDZrkz, X86::VFNMSUB213PDZmkz, 0 },
+ { X86::VFNMSUB213PSZ128rk, X86::VFNMSUB213PSZ128mk, 0 },
+ { X86::VFNMSUB213PSZ128rkz, X86::VFNMSUB213PSZ128mkz, 0 },
+ { X86::VFNMSUB213PSZ256rk, X86::VFNMSUB213PSZ256mk, 0 },
+ { X86::VFNMSUB213PSZ256rkz, X86::VFNMSUB213PSZ256mkz, 0 },
+ { X86::VFNMSUB213PSZrk, X86::VFNMSUB213PSZmk, 0 },
+ { X86::VFNMSUB213PSZrkz, X86::VFNMSUB213PSZmkz, 0 },
+ { X86::VFNMSUB213SDZr_Intk, X86::VFNMSUB213SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMSUB213SDZr_Intkz, X86::VFNMSUB213SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMSUB213SSZr_Intk, X86::VFNMSUB213SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMSUB213SSZr_Intkz, X86::VFNMSUB213SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMSUB231PDZ128rk, X86::VFNMSUB231PDZ128mk, 0 },
+ { X86::VFNMSUB231PDZ128rkz, X86::VFNMSUB231PDZ128mkz, 0 },
+ { X86::VFNMSUB231PDZ256rk, X86::VFNMSUB231PDZ256mk, 0 },
+ { X86::VFNMSUB231PDZ256rkz, X86::VFNMSUB231PDZ256mkz, 0 },
+ { X86::VFNMSUB231PDZrk, X86::VFNMSUB231PDZmk, 0 },
+ { X86::VFNMSUB231PDZrkz, X86::VFNMSUB231PDZmkz, 0 },
+ { X86::VFNMSUB231PSZ128rk, X86::VFNMSUB231PSZ128mk, 0 },
+ { X86::VFNMSUB231PSZ128rkz, X86::VFNMSUB231PSZ128mkz, 0 },
+ { X86::VFNMSUB231PSZ256rk, X86::VFNMSUB231PSZ256mk, 0 },
+ { X86::VFNMSUB231PSZ256rkz, X86::VFNMSUB231PSZ256mkz, 0 },
+ { X86::VFNMSUB231PSZrk, X86::VFNMSUB231PSZmk, 0 },
+ { X86::VFNMSUB231PSZrkz, X86::VFNMSUB231PSZmkz, 0 },
+ { X86::VFNMSUB231SDZr_Intk, X86::VFNMSUB231SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMSUB231SDZr_Intkz, X86::VFNMSUB231SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMSUB231SSZr_Intk, X86::VFNMSUB231SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMSUB231SSZr_Intkz, X86::VFNMSUB231SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VGETEXPSDZrk, X86::VGETEXPSDZmk, TB_NO_REVERSE },
+ { X86::VGETEXPSSZrk, X86::VGETEXPSSZmk, TB_NO_REVERSE },
+ { X86::VGETMANTSDZrrik, X86::VGETMANTSDZrmik, TB_NO_REVERSE },
+ { X86::VGETMANTSSZrrik, X86::VGETMANTSSZrmik, TB_NO_REVERSE },
+ { X86::VGF2P8AFFINEINVQBZ128rrik, X86::VGF2P8AFFINEINVQBZ128rmik, 0 },
+ { X86::VGF2P8AFFINEINVQBZ256rrik, X86::VGF2P8AFFINEINVQBZ256rmik, 0 },
+ { X86::VGF2P8AFFINEINVQBZrrik, X86::VGF2P8AFFINEINVQBZrmik, 0 },
+ { X86::VGF2P8AFFINEQBZ128rrik, X86::VGF2P8AFFINEQBZ128rmik, 0 },
+ { X86::VGF2P8AFFINEQBZ256rrik, X86::VGF2P8AFFINEQBZ256rmik, 0 },
+ { X86::VGF2P8AFFINEQBZrrik, X86::VGF2P8AFFINEQBZrmik, 0 },
+ { X86::VGF2P8MULBZ128rrk, X86::VGF2P8MULBZ128rmk, 0 },
+ { X86::VGF2P8MULBZ256rrk, X86::VGF2P8MULBZ256rmk, 0 },
+ { X86::VGF2P8MULBZrrk, X86::VGF2P8MULBZrmk, 0 },
+ { X86::VINSERTF32x4Z256rrk, X86::VINSERTF32x4Z256rmk, 0 },
+ { X86::VINSERTF32x4Zrrk, X86::VINSERTF32x4Zrmk, 0 },
+ { X86::VINSERTF32x8Zrrk, X86::VINSERTF32x8Zrmk, 0 },
+ { X86::VINSERTF64x2Z256rrk, X86::VINSERTF64x2Z256rmk, 0 },
+ { X86::VINSERTF64x2Zrrk, X86::VINSERTF64x2Zrmk, 0 },
+ { X86::VINSERTF64x4Zrrk, X86::VINSERTF64x4Zrmk, 0 },
+ { X86::VINSERTI32x4Z256rrk, X86::VINSERTI32x4Z256rmk, 0 },
+ { X86::VINSERTI32x4Zrrk, X86::VINSERTI32x4Zrmk, 0 },
+ { X86::VINSERTI32x8Zrrk, X86::VINSERTI32x8Zrmk, 0 },
+ { X86::VINSERTI64x2Z256rrk, X86::VINSERTI64x2Z256rmk, 0 },
+ { X86::VINSERTI64x2Zrrk, X86::VINSERTI64x2Zrmk, 0 },
+ { X86::VINSERTI64x4Zrrk, X86::VINSERTI64x4Zrmk, 0 },
+ { X86::VMAXCPDZ128rrk, X86::VMAXCPDZ128rmk, 0 },
+ { X86::VMAXCPDZ256rrk, X86::VMAXCPDZ256rmk, 0 },
+ { X86::VMAXCPDZrrk, X86::VMAXCPDZrmk, 0 },
+ { X86::VMAXCPSZ128rrk, X86::VMAXCPSZ128rmk, 0 },
+ { X86::VMAXCPSZ256rrk, X86::VMAXCPSZ256rmk, 0 },
+ { X86::VMAXCPSZrrk, X86::VMAXCPSZrmk, 0 },
+ { X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 },
+ { X86::VMAXPDZ256rrk, X86::VMAXPDZ256rmk, 0 },
+ { X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0 },
+ { X86::VMAXPSZ128rrk, X86::VMAXPSZ128rmk, 0 },
+ { X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0 },
+ { X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 },
+ { X86::VMAXSDZrr_Intk, X86::VMAXSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VMAXSSZrr_Intk, X86::VMAXSSZrm_Intk, TB_NO_REVERSE },
+ { X86::VMINCPDZ128rrk, X86::VMINCPDZ128rmk, 0 },
+ { X86::VMINCPDZ256rrk, X86::VMINCPDZ256rmk, 0 },
+ { X86::VMINCPDZrrk, X86::VMINCPDZrmk, 0 },
+ { X86::VMINCPSZ128rrk, X86::VMINCPSZ128rmk, 0 },
+ { X86::VMINCPSZ256rrk, X86::VMINCPSZ256rmk, 0 },
+ { X86::VMINCPSZrrk, X86::VMINCPSZrmk, 0 },
+ { X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0 },
+ { X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0 },
+ { X86::VMINPDZrrk, X86::VMINPDZrmk, 0 },
+ { X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0 },
+ { X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0 },
+ { X86::VMINPSZrrk, X86::VMINPSZrmk, 0 },
+ { X86::VMINSDZrr_Intk, X86::VMINSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VMINSSZrr_Intk, X86::VMINSSZrm_Intk, TB_NO_REVERSE },
+ { X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0 },
+ { X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0 },
+ { X86::VMULPDZrrk, X86::VMULPDZrmk, 0 },
+ { X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 },
+ { X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 },
+ { X86::VMULPSZrrk, X86::VMULPSZrmk, 0 },
+ { X86::VMULSDZrr_Intk, X86::VMULSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VMULSSZrr_Intk, X86::VMULSSZrm_Intk, TB_NO_REVERSE },
+ { X86::VORPDZ128rrk, X86::VORPDZ128rmk, 0 },
+ { X86::VORPDZ256rrk, X86::VORPDZ256rmk, 0 },
+ { X86::VORPDZrrk, X86::VORPDZrmk, 0 },
+ { X86::VORPSZ128rrk, X86::VORPSZ128rmk, 0 },
+ { X86::VORPSZ256rrk, X86::VORPSZ256rmk, 0 },
+ { X86::VORPSZrrk, X86::VORPSZrmk, 0 },
+ { X86::VPACKSSDWZ128rrk, X86::VPACKSSDWZ128rmk, 0 },
+ { X86::VPACKSSDWZ256rrk, X86::VPACKSSDWZ256rmk, 0 },
+ { X86::VPACKSSDWZrrk, X86::VPACKSSDWZrmk, 0 },
+ { X86::VPACKSSWBZ128rrk, X86::VPACKSSWBZ128rmk, 0 },
+ { X86::VPACKSSWBZ256rrk, X86::VPACKSSWBZ256rmk, 0 },
+ { X86::VPACKSSWBZrrk, X86::VPACKSSWBZrmk, 0 },
+ { X86::VPACKUSDWZ128rrk, X86::VPACKUSDWZ128rmk, 0 },
+ { X86::VPACKUSDWZ256rrk, X86::VPACKUSDWZ256rmk, 0 },
+ { X86::VPACKUSDWZrrk, X86::VPACKUSDWZrmk, 0 },
+ { X86::VPACKUSWBZ128rrk, X86::VPACKUSWBZ128rmk, 0 },
+ { X86::VPACKUSWBZ256rrk, X86::VPACKUSWBZ256rmk, 0 },
+ { X86::VPACKUSWBZrrk, X86::VPACKUSWBZrmk, 0 },
+ { X86::VPADDBZ128rrk, X86::VPADDBZ128rmk, 0 },
+ { X86::VPADDBZ256rrk, X86::VPADDBZ256rmk, 0 },
+ { X86::VPADDBZrrk, X86::VPADDBZrmk, 0 },
+ { X86::VPADDDZ128rrk, X86::VPADDDZ128rmk, 0 },
+ { X86::VPADDDZ256rrk, X86::VPADDDZ256rmk, 0 },
+ { X86::VPADDDZrrk, X86::VPADDDZrmk, 0 },
+ { X86::VPADDQZ128rrk, X86::VPADDQZ128rmk, 0 },
+ { X86::VPADDQZ256rrk, X86::VPADDQZ256rmk, 0 },
+ { X86::VPADDQZrrk, X86::VPADDQZrmk, 0 },
+ { X86::VPADDSBZ128rrk, X86::VPADDSBZ128rmk, 0 },
+ { X86::VPADDSBZ256rrk, X86::VPADDSBZ256rmk, 0 },
+ { X86::VPADDSBZrrk, X86::VPADDSBZrmk, 0 },
+ { X86::VPADDSWZ128rrk, X86::VPADDSWZ128rmk, 0 },
+ { X86::VPADDSWZ256rrk, X86::VPADDSWZ256rmk, 0 },
+ { X86::VPADDSWZrrk, X86::VPADDSWZrmk, 0 },
+ { X86::VPADDUSBZ128rrk, X86::VPADDUSBZ128rmk, 0 },
+ { X86::VPADDUSBZ256rrk, X86::VPADDUSBZ256rmk, 0 },
+ { X86::VPADDUSBZrrk, X86::VPADDUSBZrmk, 0 },
+ { X86::VPADDUSWZ128rrk, X86::VPADDUSWZ128rmk, 0 },
+ { X86::VPADDUSWZ256rrk, X86::VPADDUSWZ256rmk, 0 },
+ { X86::VPADDUSWZrrk, X86::VPADDUSWZrmk, 0 },
+ { X86::VPADDWZ128rrk, X86::VPADDWZ128rmk, 0 },
+ { X86::VPADDWZ256rrk, X86::VPADDWZ256rmk, 0 },
+ { X86::VPADDWZrrk, X86::VPADDWZrmk, 0 },
+ { X86::VPALIGNRZ128rrik, X86::VPALIGNRZ128rmik, 0 },
+ { X86::VPALIGNRZ256rrik, X86::VPALIGNRZ256rmik, 0 },
+ { X86::VPALIGNRZrrik, X86::VPALIGNRZrmik, 0 },
+ { X86::VPANDDZ128rrk, X86::VPANDDZ128rmk, 0 },
+ { X86::VPANDDZ256rrk, X86::VPANDDZ256rmk, 0 },
+ { X86::VPANDDZrrk, X86::VPANDDZrmk, 0 },
+ { X86::VPANDNDZ128rrk, X86::VPANDNDZ128rmk, 0 },
+ { X86::VPANDNDZ256rrk, X86::VPANDNDZ256rmk, 0 },
+ { X86::VPANDNDZrrk, X86::VPANDNDZrmk, 0 },
+ { X86::VPANDNQZ128rrk, X86::VPANDNQZ128rmk, 0 },
+ { X86::VPANDNQZ256rrk, X86::VPANDNQZ256rmk, 0 },
+ { X86::VPANDNQZrrk, X86::VPANDNQZrmk, 0 },
+ { X86::VPANDQZ128rrk, X86::VPANDQZ128rmk, 0 },
+ { X86::VPANDQZ256rrk, X86::VPANDQZ256rmk, 0 },
+ { X86::VPANDQZrrk, X86::VPANDQZrmk, 0 },
+ { X86::VPAVGBZ128rrk, X86::VPAVGBZ128rmk, 0 },
+ { X86::VPAVGBZ256rrk, X86::VPAVGBZ256rmk, 0 },
+ { X86::VPAVGBZrrk, X86::VPAVGBZrmk, 0 },
+ { X86::VPAVGWZ128rrk, X86::VPAVGWZ128rmk, 0 },
+ { X86::VPAVGWZ256rrk, X86::VPAVGWZ256rmk, 0 },
+ { X86::VPAVGWZrrk, X86::VPAVGWZrmk, 0 },
+ { X86::VPDPBUSDSZ128rk, X86::VPDPBUSDSZ128mk, 0 },
+ { X86::VPDPBUSDSZ128rkz, X86::VPDPBUSDSZ128mkz, 0 },
+ { X86::VPDPBUSDSZ256rk, X86::VPDPBUSDSZ256mk, 0 },
+ { X86::VPDPBUSDSZ256rkz, X86::VPDPBUSDSZ256mkz, 0 },
+ { X86::VPDPBUSDSZrk, X86::VPDPBUSDSZmk, 0 },
+ { X86::VPDPBUSDSZrkz, X86::VPDPBUSDSZmkz, 0 },
+ { X86::VPDPBUSDZ128rk, X86::VPDPBUSDZ128mk, 0 },
+ { X86::VPDPBUSDZ128rkz, X86::VPDPBUSDZ128mkz, 0 },
+ { X86::VPDPBUSDZ256rk, X86::VPDPBUSDZ256mk, 0 },
+ { X86::VPDPBUSDZ256rkz, X86::VPDPBUSDZ256mkz, 0 },
+ { X86::VPDPBUSDZrk, X86::VPDPBUSDZmk, 0 },
+ { X86::VPDPBUSDZrkz, X86::VPDPBUSDZmkz, 0 },
+ { X86::VPDPWSSDSZ128rk, X86::VPDPWSSDSZ128mk, 0 },
+ { X86::VPDPWSSDSZ128rkz, X86::VPDPWSSDSZ128mkz, 0 },
+ { X86::VPDPWSSDSZ256rk, X86::VPDPWSSDSZ256mk, 0 },
+ { X86::VPDPWSSDSZ256rkz, X86::VPDPWSSDSZ256mkz, 0 },
+ { X86::VPDPWSSDSZrk, X86::VPDPWSSDSZmk, 0 },
+ { X86::VPDPWSSDSZrkz, X86::VPDPWSSDSZmkz, 0 },
+ { X86::VPDPWSSDZ128rk, X86::VPDPWSSDZ128mk, 0 },
+ { X86::VPDPWSSDZ128rkz, X86::VPDPWSSDZ128mkz, 0 },
+ { X86::VPDPWSSDZ256rk, X86::VPDPWSSDZ256mk, 0 },
+ { X86::VPDPWSSDZ256rkz, X86::VPDPWSSDZ256mkz, 0 },
+ { X86::VPDPWSSDZrk, X86::VPDPWSSDZmk, 0 },
+ { X86::VPDPWSSDZrkz, X86::VPDPWSSDZmkz, 0 },
+ { X86::VPERMBZ128rrk, X86::VPERMBZ128rmk, 0 },
+ { X86::VPERMBZ256rrk, X86::VPERMBZ256rmk, 0 },
+ { X86::VPERMBZrrk, X86::VPERMBZrmk, 0 },
+ { X86::VPERMDZ256rrk, X86::VPERMDZ256rmk, 0 },
+ { X86::VPERMDZrrk, X86::VPERMDZrmk, 0 },
+ { X86::VPERMI2B128rrk, X86::VPERMI2B128rmk, 0 },
+ { X86::VPERMI2B128rrkz, X86::VPERMI2B128rmkz, 0 },
+ { X86::VPERMI2B256rrk, X86::VPERMI2B256rmk, 0 },
+ { X86::VPERMI2B256rrkz, X86::VPERMI2B256rmkz, 0 },
+ { X86::VPERMI2Brrk, X86::VPERMI2Brmk, 0 },
+ { X86::VPERMI2Brrkz, X86::VPERMI2Brmkz, 0 },
+ { X86::VPERMI2D128rrk, X86::VPERMI2D128rmk, 0 },
+ { X86::VPERMI2D128rrkz, X86::VPERMI2D128rmkz, 0 },
+ { X86::VPERMI2D256rrk, X86::VPERMI2D256rmk, 0 },
+ { X86::VPERMI2D256rrkz, X86::VPERMI2D256rmkz, 0 },
+ { X86::VPERMI2Drrk, X86::VPERMI2Drmk, 0 },
+ { X86::VPERMI2Drrkz, X86::VPERMI2Drmkz, 0 },
+ { X86::VPERMI2PD128rrk, X86::VPERMI2PD128rmk, 0 },
+ { X86::VPERMI2PD128rrkz, X86::VPERMI2PD128rmkz, 0 },
+ { X86::VPERMI2PD256rrk, X86::VPERMI2PD256rmk, 0 },
+ { X86::VPERMI2PD256rrkz, X86::VPERMI2PD256rmkz, 0 },
+ { X86::VPERMI2PDrrk, X86::VPERMI2PDrmk, 0 },
+ { X86::VPERMI2PDrrkz, X86::VPERMI2PDrmkz, 0 },
+ { X86::VPERMI2PS128rrk, X86::VPERMI2PS128rmk, 0 },
+ { X86::VPERMI2PS128rrkz, X86::VPERMI2PS128rmkz, 0 },
+ { X86::VPERMI2PS256rrk, X86::VPERMI2PS256rmk, 0 },
+ { X86::VPERMI2PS256rrkz, X86::VPERMI2PS256rmkz, 0 },
+ { X86::VPERMI2PSrrk, X86::VPERMI2PSrmk, 0 },
+ { X86::VPERMI2PSrrkz, X86::VPERMI2PSrmkz, 0 },
+ { X86::VPERMI2Q128rrk, X86::VPERMI2Q128rmk, 0 },
+ { X86::VPERMI2Q128rrkz, X86::VPERMI2Q128rmkz, 0 },
+ { X86::VPERMI2Q256rrk, X86::VPERMI2Q256rmk, 0 },
+ { X86::VPERMI2Q256rrkz, X86::VPERMI2Q256rmkz, 0 },
+ { X86::VPERMI2Qrrk, X86::VPERMI2Qrmk, 0 },
+ { X86::VPERMI2Qrrkz, X86::VPERMI2Qrmkz, 0 },
+ { X86::VPERMI2W128rrk, X86::VPERMI2W128rmk, 0 },
+ { X86::VPERMI2W128rrkz, X86::VPERMI2W128rmkz, 0 },
+ { X86::VPERMI2W256rrk, X86::VPERMI2W256rmk, 0 },
+ { X86::VPERMI2W256rrkz, X86::VPERMI2W256rmkz, 0 },
+ { X86::VPERMI2Wrrk, X86::VPERMI2Wrmk, 0 },
+ { X86::VPERMI2Wrrkz, X86::VPERMI2Wrmkz, 0 },
+ { X86::VPERMILPDZ128rrk, X86::VPERMILPDZ128rmk, 0 },
+ { X86::VPERMILPDZ256rrk, X86::VPERMILPDZ256rmk, 0 },
+ { X86::VPERMILPDZrrk, X86::VPERMILPDZrmk, 0 },
+ { X86::VPERMILPSZ128rrk, X86::VPERMILPSZ128rmk, 0 },
+ { X86::VPERMILPSZ256rrk, X86::VPERMILPSZ256rmk, 0 },
+ { X86::VPERMILPSZrrk, X86::VPERMILPSZrmk, 0 },
+ { X86::VPERMPDZ256rrk, X86::VPERMPDZ256rmk, 0 },
+ { X86::VPERMPDZrrk, X86::VPERMPDZrmk, 0 },
+ { X86::VPERMPSZ256rrk, X86::VPERMPSZ256rmk, 0 },
+ { X86::VPERMPSZrrk, X86::VPERMPSZrmk, 0 },
+ { X86::VPERMQZ256rrk, X86::VPERMQZ256rmk, 0 },
+ { X86::VPERMQZrrk, X86::VPERMQZrmk, 0 },
+ { X86::VPERMT2B128rrk, X86::VPERMT2B128rmk, 0 },
+ { X86::VPERMT2B128rrkz, X86::VPERMT2B128rmkz, 0 },
+ { X86::VPERMT2B256rrk, X86::VPERMT2B256rmk, 0 },
+ { X86::VPERMT2B256rrkz, X86::VPERMT2B256rmkz, 0 },
+ { X86::VPERMT2Brrk, X86::VPERMT2Brmk, 0 },
+ { X86::VPERMT2Brrkz, X86::VPERMT2Brmkz, 0 },
+ { X86::VPERMT2D128rrk, X86::VPERMT2D128rmk, 0 },
+ { X86::VPERMT2D128rrkz, X86::VPERMT2D128rmkz, 0 },
+ { X86::VPERMT2D256rrk, X86::VPERMT2D256rmk, 0 },
+ { X86::VPERMT2D256rrkz, X86::VPERMT2D256rmkz, 0 },
+ { X86::VPERMT2Drrk, X86::VPERMT2Drmk, 0 },
+ { X86::VPERMT2Drrkz, X86::VPERMT2Drmkz, 0 },
+ { X86::VPERMT2PD128rrk, X86::VPERMT2PD128rmk, 0 },
+ { X86::VPERMT2PD128rrkz, X86::VPERMT2PD128rmkz, 0 },
+ { X86::VPERMT2PD256rrk, X86::VPERMT2PD256rmk, 0 },
+ { X86::VPERMT2PD256rrkz, X86::VPERMT2PD256rmkz, 0 },
+ { X86::VPERMT2PDrrk, X86::VPERMT2PDrmk, 0 },
+ { X86::VPERMT2PDrrkz, X86::VPERMT2PDrmkz, 0 },
+ { X86::VPERMT2PS128rrk, X86::VPERMT2PS128rmk, 0 },
+ { X86::VPERMT2PS128rrkz, X86::VPERMT2PS128rmkz, 0 },
+ { X86::VPERMT2PS256rrk, X86::VPERMT2PS256rmk, 0 },
+ { X86::VPERMT2PS256rrkz, X86::VPERMT2PS256rmkz, 0 },
+ { X86::VPERMT2PSrrk, X86::VPERMT2PSrmk, 0 },
+ { X86::VPERMT2PSrrkz, X86::VPERMT2PSrmkz, 0 },
+ { X86::VPERMT2Q128rrk, X86::VPERMT2Q128rmk, 0 },
+ { X86::VPERMT2Q128rrkz, X86::VPERMT2Q128rmkz, 0 },
+ { X86::VPERMT2Q256rrk, X86::VPERMT2Q256rmk, 0 },
+ { X86::VPERMT2Q256rrkz, X86::VPERMT2Q256rmkz, 0 },
+ { X86::VPERMT2Qrrk, X86::VPERMT2Qrmk, 0 },
+ { X86::VPERMT2Qrrkz, X86::VPERMT2Qrmkz, 0 },
+ { X86::VPERMT2W128rrk, X86::VPERMT2W128rmk, 0 },
+ { X86::VPERMT2W128rrkz, X86::VPERMT2W128rmkz, 0 },
+ { X86::VPERMT2W256rrk, X86::VPERMT2W256rmk, 0 },
+ { X86::VPERMT2W256rrkz, X86::VPERMT2W256rmkz, 0 },
+ { X86::VPERMT2Wrrk, X86::VPERMT2Wrmk, 0 },
+ { X86::VPERMT2Wrrkz, X86::VPERMT2Wrmkz, 0 },
+ { X86::VPERMWZ128rrk, X86::VPERMWZ128rmk, 0 },
+ { X86::VPERMWZ256rrk, X86::VPERMWZ256rmk, 0 },
+ { X86::VPERMWZrrk, X86::VPERMWZrmk, 0 },
+ { X86::VPMADD52HUQZ128rk, X86::VPMADD52HUQZ128mk, 0 },
+ { X86::VPMADD52HUQZ128rkz, X86::VPMADD52HUQZ128mkz, 0 },
+ { X86::VPMADD52HUQZ256rk, X86::VPMADD52HUQZ256mk, 0 },
+ { X86::VPMADD52HUQZ256rkz, X86::VPMADD52HUQZ256mkz, 0 },
+ { X86::VPMADD52HUQZrk, X86::VPMADD52HUQZmk, 0 },
+ { X86::VPMADD52HUQZrkz, X86::VPMADD52HUQZmkz, 0 },
+ { X86::VPMADD52LUQZ128rk, X86::VPMADD52LUQZ128mk, 0 },
+ { X86::VPMADD52LUQZ128rkz, X86::VPMADD52LUQZ128mkz, 0 },
+ { X86::VPMADD52LUQZ256rk, X86::VPMADD52LUQZ256mk, 0 },
+ { X86::VPMADD52LUQZ256rkz, X86::VPMADD52LUQZ256mkz, 0 },
+ { X86::VPMADD52LUQZrk, X86::VPMADD52LUQZmk, 0 },
+ { X86::VPMADD52LUQZrkz, X86::VPMADD52LUQZmkz, 0 },
+ { X86::VPMADDUBSWZ128rrk, X86::VPMADDUBSWZ128rmk, 0 },
+ { X86::VPMADDUBSWZ256rrk, X86::VPMADDUBSWZ256rmk, 0 },
+ { X86::VPMADDUBSWZrrk, X86::VPMADDUBSWZrmk, 0 },
+ { X86::VPMADDWDZ128rrk, X86::VPMADDWDZ128rmk, 0 },
+ { X86::VPMADDWDZ256rrk, X86::VPMADDWDZ256rmk, 0 },
+ { X86::VPMADDWDZrrk, X86::VPMADDWDZrmk, 0 },
+ { X86::VPMAXSBZ128rrk, X86::VPMAXSBZ128rmk, 0 },
+ { X86::VPMAXSBZ256rrk, X86::VPMAXSBZ256rmk, 0 },
+ { X86::VPMAXSBZrrk, X86::VPMAXSBZrmk, 0 },
+ { X86::VPMAXSDZ128rrk, X86::VPMAXSDZ128rmk, 0 },
+ { X86::VPMAXSDZ256rrk, X86::VPMAXSDZ256rmk, 0 },
+ { X86::VPMAXSDZrrk, X86::VPMAXSDZrmk, 0 },
+ { X86::VPMAXSQZ128rrk, X86::VPMAXSQZ128rmk, 0 },
+ { X86::VPMAXSQZ256rrk, X86::VPMAXSQZ256rmk, 0 },
+ { X86::VPMAXSQZrrk, X86::VPMAXSQZrmk, 0 },
+ { X86::VPMAXSWZ128rrk, X86::VPMAXSWZ128rmk, 0 },
+ { X86::VPMAXSWZ256rrk, X86::VPMAXSWZ256rmk, 0 },
+ { X86::VPMAXSWZrrk, X86::VPMAXSWZrmk, 0 },
+ { X86::VPMAXUBZ128rrk, X86::VPMAXUBZ128rmk, 0 },
+ { X86::VPMAXUBZ256rrk, X86::VPMAXUBZ256rmk, 0 },
+ { X86::VPMAXUBZrrk, X86::VPMAXUBZrmk, 0 },
+ { X86::VPMAXUDZ128rrk, X86::VPMAXUDZ128rmk, 0 },
+ { X86::VPMAXUDZ256rrk, X86::VPMAXUDZ256rmk, 0 },
+ { X86::VPMAXUDZrrk, X86::VPMAXUDZrmk, 0 },
+ { X86::VPMAXUQZ128rrk, X86::VPMAXUQZ128rmk, 0 },
+ { X86::VPMAXUQZ256rrk, X86::VPMAXUQZ256rmk, 0 },
+ { X86::VPMAXUQZrrk, X86::VPMAXUQZrmk, 0 },
+ { X86::VPMAXUWZ128rrk, X86::VPMAXUWZ128rmk, 0 },
+ { X86::VPMAXUWZ256rrk, X86::VPMAXUWZ256rmk, 0 },
+ { X86::VPMAXUWZrrk, X86::VPMAXUWZrmk, 0 },
+ { X86::VPMINSBZ128rrk, X86::VPMINSBZ128rmk, 0 },
+ { X86::VPMINSBZ256rrk, X86::VPMINSBZ256rmk, 0 },
+ { X86::VPMINSBZrrk, X86::VPMINSBZrmk, 0 },
+ { X86::VPMINSDZ128rrk, X86::VPMINSDZ128rmk, 0 },
+ { X86::VPMINSDZ256rrk, X86::VPMINSDZ256rmk, 0 },
+ { X86::VPMINSDZrrk, X86::VPMINSDZrmk, 0 },
+ { X86::VPMINSQZ128rrk, X86::VPMINSQZ128rmk, 0 },
+ { X86::VPMINSQZ256rrk, X86::VPMINSQZ256rmk, 0 },
+ { X86::VPMINSQZrrk, X86::VPMINSQZrmk, 0 },
+ { X86::VPMINSWZ128rrk, X86::VPMINSWZ128rmk, 0 },
+ { X86::VPMINSWZ256rrk, X86::VPMINSWZ256rmk, 0 },
+ { X86::VPMINSWZrrk, X86::VPMINSWZrmk, 0 },
+ { X86::VPMINUBZ128rrk, X86::VPMINUBZ128rmk, 0 },
+ { X86::VPMINUBZ256rrk, X86::VPMINUBZ256rmk, 0 },
+ { X86::VPMINUBZrrk, X86::VPMINUBZrmk, 0 },
+ { X86::VPMINUDZ128rrk, X86::VPMINUDZ128rmk, 0 },
+ { X86::VPMINUDZ256rrk, X86::VPMINUDZ256rmk, 0 },
+ { X86::VPMINUDZrrk, X86::VPMINUDZrmk, 0 },
+ { X86::VPMINUQZ128rrk, X86::VPMINUQZ128rmk, 0 },
+ { X86::VPMINUQZ256rrk, X86::VPMINUQZ256rmk, 0 },
+ { X86::VPMINUQZrrk, X86::VPMINUQZrmk, 0 },
+ { X86::VPMINUWZ128rrk, X86::VPMINUWZ128rmk, 0 },
+ { X86::VPMINUWZ256rrk, X86::VPMINUWZ256rmk, 0 },
+ { X86::VPMINUWZrrk, X86::VPMINUWZrmk, 0 },
+ { X86::VPMULDQZ128rrk, X86::VPMULDQZ128rmk, 0 },
+ { X86::VPMULDQZ256rrk, X86::VPMULDQZ256rmk, 0 },
+ { X86::VPMULDQZrrk, X86::VPMULDQZrmk, 0 },
+ { X86::VPMULHRSWZ128rrk, X86::VPMULHRSWZ128rmk, 0 },
+ { X86::VPMULHRSWZ256rrk, X86::VPMULHRSWZ256rmk, 0 },
+ { X86::VPMULHRSWZrrk, X86::VPMULHRSWZrmk, 0 },
+ { X86::VPMULHUWZ128rrk, X86::VPMULHUWZ128rmk, 0 },
+ { X86::VPMULHUWZ256rrk, X86::VPMULHUWZ256rmk, 0 },
+ { X86::VPMULHUWZrrk, X86::VPMULHUWZrmk, 0 },
+ { X86::VPMULHWZ128rrk, X86::VPMULHWZ128rmk, 0 },
+ { X86::VPMULHWZ256rrk, X86::VPMULHWZ256rmk, 0 },
+ { X86::VPMULHWZrrk, X86::VPMULHWZrmk, 0 },
+ { X86::VPMULLDZ128rrk, X86::VPMULLDZ128rmk, 0 },
+ { X86::VPMULLDZ256rrk, X86::VPMULLDZ256rmk, 0 },
+ { X86::VPMULLDZrrk, X86::VPMULLDZrmk, 0 },
+ { X86::VPMULLQZ128rrk, X86::VPMULLQZ128rmk, 0 },
+ { X86::VPMULLQZ256rrk, X86::VPMULLQZ256rmk, 0 },
+ { X86::VPMULLQZrrk, X86::VPMULLQZrmk, 0 },
+ { X86::VPMULLWZ128rrk, X86::VPMULLWZ128rmk, 0 },
+ { X86::VPMULLWZ256rrk, X86::VPMULLWZ256rmk, 0 },
+ { X86::VPMULLWZrrk, X86::VPMULLWZrmk, 0 },
+ { X86::VPMULTISHIFTQBZ128rrk, X86::VPMULTISHIFTQBZ128rmk, 0 },
+ { X86::VPMULTISHIFTQBZ256rrk, X86::VPMULTISHIFTQBZ256rmk, 0 },
+ { X86::VPMULTISHIFTQBZrrk, X86::VPMULTISHIFTQBZrmk, 0 },
+ { X86::VPMULUDQZ128rrk, X86::VPMULUDQZ128rmk, 0 },
+ { X86::VPMULUDQZ256rrk, X86::VPMULUDQZ256rmk, 0 },
+ { X86::VPMULUDQZrrk, X86::VPMULUDQZrmk, 0 },
+ { X86::VPORDZ128rrk, X86::VPORDZ128rmk, 0 },
+ { X86::VPORDZ256rrk, X86::VPORDZ256rmk, 0 },
+ { X86::VPORDZrrk, X86::VPORDZrmk, 0 },
+ { X86::VPORQZ128rrk, X86::VPORQZ128rmk, 0 },
+ { X86::VPORQZ256rrk, X86::VPORQZ256rmk, 0 },
+ { X86::VPORQZrrk, X86::VPORQZrmk, 0 },
+ { X86::VPROLVDZ128rrk, X86::VPROLVDZ128rmk, 0 },
+ { X86::VPROLVDZ256rrk, X86::VPROLVDZ256rmk, 0 },
+ { X86::VPROLVDZrrk, X86::VPROLVDZrmk, 0 },
+ { X86::VPROLVQZ128rrk, X86::VPROLVQZ128rmk, 0 },
+ { X86::VPROLVQZ256rrk, X86::VPROLVQZ256rmk, 0 },
+ { X86::VPROLVQZrrk, X86::VPROLVQZrmk, 0 },
+ { X86::VPRORVDZ128rrk, X86::VPRORVDZ128rmk, 0 },
+ { X86::VPRORVDZ256rrk, X86::VPRORVDZ256rmk, 0 },
+ { X86::VPRORVDZrrk, X86::VPRORVDZrmk, 0 },
+ { X86::VPRORVQZ128rrk, X86::VPRORVQZ128rmk, 0 },
+ { X86::VPRORVQZ256rrk, X86::VPRORVQZ256rmk, 0 },
+ { X86::VPRORVQZrrk, X86::VPRORVQZrmk, 0 },
+ { X86::VPSHLDDZ128rrik, X86::VPSHLDDZ128rmik, 0 },
+ { X86::VPSHLDDZ256rrik, X86::VPSHLDDZ256rmik, 0 },
+ { X86::VPSHLDDZrrik, X86::VPSHLDDZrmik, 0 },
+ { X86::VPSHLDQZ128rrik, X86::VPSHLDQZ128rmik, 0 },
+ { X86::VPSHLDQZ256rrik, X86::VPSHLDQZ256rmik, 0 },
+ { X86::VPSHLDQZrrik, X86::VPSHLDQZrmik, 0 },
+ { X86::VPSHLDVDZ128rk, X86::VPSHLDVDZ128mk, 0 },
+ { X86::VPSHLDVDZ128rkz, X86::VPSHLDVDZ128mkz, 0 },
+ { X86::VPSHLDVDZ256rk, X86::VPSHLDVDZ256mk, 0 },
+ { X86::VPSHLDVDZ256rkz, X86::VPSHLDVDZ256mkz, 0 },
+ { X86::VPSHLDVDZrk, X86::VPSHLDVDZmk, 0 },
+ { X86::VPSHLDVDZrkz, X86::VPSHLDVDZmkz, 0 },
+ { X86::VPSHLDVQZ128rk, X86::VPSHLDVQZ128mk, 0 },
+ { X86::VPSHLDVQZ128rkz, X86::VPSHLDVQZ128mkz, 0 },
+ { X86::VPSHLDVQZ256rk, X86::VPSHLDVQZ256mk, 0 },
+ { X86::VPSHLDVQZ256rkz, X86::VPSHLDVQZ256mkz, 0 },
+ { X86::VPSHLDVQZrk, X86::VPSHLDVQZmk, 0 },
+ { X86::VPSHLDVQZrkz, X86::VPSHLDVQZmkz, 0 },
+ { X86::VPSHLDVWZ128rk, X86::VPSHLDVWZ128mk, 0 },
+ { X86::VPSHLDVWZ128rkz, X86::VPSHLDVWZ128mkz, 0 },
+ { X86::VPSHLDVWZ256rk, X86::VPSHLDVWZ256mk, 0 },
+ { X86::VPSHLDVWZ256rkz, X86::VPSHLDVWZ256mkz, 0 },
+ { X86::VPSHLDVWZrk, X86::VPSHLDVWZmk, 0 },
+ { X86::VPSHLDVWZrkz, X86::VPSHLDVWZmkz, 0 },
+ { X86::VPSHLDWZ128rrik, X86::VPSHLDWZ128rmik, 0 },
+ { X86::VPSHLDWZ256rrik, X86::VPSHLDWZ256rmik, 0 },
+ { X86::VPSHLDWZrrik, X86::VPSHLDWZrmik, 0 },
+ { X86::VPSHRDDZ128rrik, X86::VPSHRDDZ128rmik, 0 },
+ { X86::VPSHRDDZ256rrik, X86::VPSHRDDZ256rmik, 0 },
+ { X86::VPSHRDDZrrik, X86::VPSHRDDZrmik, 0 },
+ { X86::VPSHRDQZ128rrik, X86::VPSHRDQZ128rmik, 0 },
+ { X86::VPSHRDQZ256rrik, X86::VPSHRDQZ256rmik, 0 },
+ { X86::VPSHRDQZrrik, X86::VPSHRDQZrmik, 0 },
+ { X86::VPSHRDVDZ128rk, X86::VPSHRDVDZ128mk, 0 },
+ { X86::VPSHRDVDZ128rkz, X86::VPSHRDVDZ128mkz, 0 },
+ { X86::VPSHRDVDZ256rk, X86::VPSHRDVDZ256mk, 0 },
+ { X86::VPSHRDVDZ256rkz, X86::VPSHRDVDZ256mkz, 0 },
+ { X86::VPSHRDVDZrk, X86::VPSHRDVDZmk, 0 },
+ { X86::VPSHRDVDZrkz, X86::VPSHRDVDZmkz, 0 },
+ { X86::VPSHRDVQZ128rk, X86::VPSHRDVQZ128mk, 0 },
+ { X86::VPSHRDVQZ128rkz, X86::VPSHRDVQZ128mkz, 0 },
+ { X86::VPSHRDVQZ256rk, X86::VPSHRDVQZ256mk, 0 },
+ { X86::VPSHRDVQZ256rkz, X86::VPSHRDVQZ256mkz, 0 },
+ { X86::VPSHRDVQZrk, X86::VPSHRDVQZmk, 0 },
+ { X86::VPSHRDVQZrkz, X86::VPSHRDVQZmkz, 0 },
+ { X86::VPSHRDVWZ128rk, X86::VPSHRDVWZ128mk, 0 },
+ { X86::VPSHRDVWZ128rkz, X86::VPSHRDVWZ128mkz, 0 },
+ { X86::VPSHRDVWZ256rk, X86::VPSHRDVWZ256mk, 0 },
+ { X86::VPSHRDVWZ256rkz, X86::VPSHRDVWZ256mkz, 0 },
+ { X86::VPSHRDVWZrk, X86::VPSHRDVWZmk, 0 },
+ { X86::VPSHRDVWZrkz, X86::VPSHRDVWZmkz, 0 },
+ { X86::VPSHRDWZ128rrik, X86::VPSHRDWZ128rmik, 0 },
+ { X86::VPSHRDWZ256rrik, X86::VPSHRDWZ256rmik, 0 },
+ { X86::VPSHRDWZrrik, X86::VPSHRDWZrmik, 0 },
+ { X86::VPSHUFBZ128rrk, X86::VPSHUFBZ128rmk, 0 },
+ { X86::VPSHUFBZ256rrk, X86::VPSHUFBZ256rmk, 0 },
+ { X86::VPSHUFBZrrk, X86::VPSHUFBZrmk, 0 },
+ { X86::VPSLLDZ128rrk, X86::VPSLLDZ128rmk, 0 },
+ { X86::VPSLLDZ256rrk, X86::VPSLLDZ256rmk, 0 },
+ { X86::VPSLLDZrrk, X86::VPSLLDZrmk, 0 },
+ { X86::VPSLLQZ128rrk, X86::VPSLLQZ128rmk, 0 },
+ { X86::VPSLLQZ256rrk, X86::VPSLLQZ256rmk, 0 },
+ { X86::VPSLLQZrrk, X86::VPSLLQZrmk, 0 },
+ { X86::VPSLLVDZ128rrk, X86::VPSLLVDZ128rmk, 0 },
+ { X86::VPSLLVDZ256rrk, X86::VPSLLVDZ256rmk, 0 },
+ { X86::VPSLLVDZrrk, X86::VPSLLVDZrmk, 0 },
+ { X86::VPSLLVQZ128rrk, X86::VPSLLVQZ128rmk, 0 },
+ { X86::VPSLLVQZ256rrk, X86::VPSLLVQZ256rmk, 0 },
+ { X86::VPSLLVQZrrk, X86::VPSLLVQZrmk, 0 },
+ { X86::VPSLLVWZ128rrk, X86::VPSLLVWZ128rmk, 0 },
+ { X86::VPSLLVWZ256rrk, X86::VPSLLVWZ256rmk, 0 },
+ { X86::VPSLLVWZrrk, X86::VPSLLVWZrmk, 0 },
+ { X86::VPSLLWZ128rrk, X86::VPSLLWZ128rmk, 0 },
+ { X86::VPSLLWZ256rrk, X86::VPSLLWZ256rmk, 0 },
+ { X86::VPSLLWZrrk, X86::VPSLLWZrmk, 0 },
+ { X86::VPSRADZ128rrk, X86::VPSRADZ128rmk, 0 },
+ { X86::VPSRADZ256rrk, X86::VPSRADZ256rmk, 0 },
+ { X86::VPSRADZrrk, X86::VPSRADZrmk, 0 },
+ { X86::VPSRAQZ128rrk, X86::VPSRAQZ128rmk, 0 },
+ { X86::VPSRAQZ256rrk, X86::VPSRAQZ256rmk, 0 },
+ { X86::VPSRAQZrrk, X86::VPSRAQZrmk, 0 },
+ { X86::VPSRAVDZ128rrk, X86::VPSRAVDZ128rmk, 0 },
+ { X86::VPSRAVDZ256rrk, X86::VPSRAVDZ256rmk, 0 },
+ { X86::VPSRAVDZrrk, X86::VPSRAVDZrmk, 0 },
+ { X86::VPSRAVQZ128rrk, X86::VPSRAVQZ128rmk, 0 },
+ { X86::VPSRAVQZ256rrk, X86::VPSRAVQZ256rmk, 0 },
+ { X86::VPSRAVQZrrk, X86::VPSRAVQZrmk, 0 },
+ { X86::VPSRAVWZ128rrk, X86::VPSRAVWZ128rmk, 0 },
+ { X86::VPSRAVWZ256rrk, X86::VPSRAVWZ256rmk, 0 },
+ { X86::VPSRAVWZrrk, X86::VPSRAVWZrmk, 0 },
+ { X86::VPSRAWZ128rrk, X86::VPSRAWZ128rmk, 0 },
+ { X86::VPSRAWZ256rrk, X86::VPSRAWZ256rmk, 0 },
+ { X86::VPSRAWZrrk, X86::VPSRAWZrmk, 0 },
+ { X86::VPSRLDZ128rrk, X86::VPSRLDZ128rmk, 0 },
+ { X86::VPSRLDZ256rrk, X86::VPSRLDZ256rmk, 0 },
+ { X86::VPSRLDZrrk, X86::VPSRLDZrmk, 0 },
+ { X86::VPSRLQZ128rrk, X86::VPSRLQZ128rmk, 0 },
+ { X86::VPSRLQZ256rrk, X86::VPSRLQZ256rmk, 0 },
+ { X86::VPSRLQZrrk, X86::VPSRLQZrmk, 0 },
+ { X86::VPSRLVDZ128rrk, X86::VPSRLVDZ128rmk, 0 },
+ { X86::VPSRLVDZ256rrk, X86::VPSRLVDZ256rmk, 0 },
+ { X86::VPSRLVDZrrk, X86::VPSRLVDZrmk, 0 },
+ { X86::VPSRLVQZ128rrk, X86::VPSRLVQZ128rmk, 0 },
+ { X86::VPSRLVQZ256rrk, X86::VPSRLVQZ256rmk, 0 },
+ { X86::VPSRLVQZrrk, X86::VPSRLVQZrmk, 0 },
+ { X86::VPSRLVWZ128rrk, X86::VPSRLVWZ128rmk, 0 },
+ { X86::VPSRLVWZ256rrk, X86::VPSRLVWZ256rmk, 0 },
+ { X86::VPSRLVWZrrk, X86::VPSRLVWZrmk, 0 },
+ { X86::VPSRLWZ128rrk, X86::VPSRLWZ128rmk, 0 },
+ { X86::VPSRLWZ256rrk, X86::VPSRLWZ256rmk, 0 },
+ { X86::VPSRLWZrrk, X86::VPSRLWZrmk, 0 },
+ { X86::VPSUBBZ128rrk, X86::VPSUBBZ128rmk, 0 },
+ { X86::VPSUBBZ256rrk, X86::VPSUBBZ256rmk, 0 },
+ { X86::VPSUBBZrrk, X86::VPSUBBZrmk, 0 },
+ { X86::VPSUBDZ128rrk, X86::VPSUBDZ128rmk, 0 },
+ { X86::VPSUBDZ256rrk, X86::VPSUBDZ256rmk, 0 },
+ { X86::VPSUBDZrrk, X86::VPSUBDZrmk, 0 },
+ { X86::VPSUBQZ128rrk, X86::VPSUBQZ128rmk, 0 },
+ { X86::VPSUBQZ256rrk, X86::VPSUBQZ256rmk, 0 },
+ { X86::VPSUBQZrrk, X86::VPSUBQZrmk, 0 },
+ { X86::VPSUBSBZ128rrk, X86::VPSUBSBZ128rmk, 0 },
+ { X86::VPSUBSBZ256rrk, X86::VPSUBSBZ256rmk, 0 },
+ { X86::VPSUBSBZrrk, X86::VPSUBSBZrmk, 0 },
+ { X86::VPSUBSWZ128rrk, X86::VPSUBSWZ128rmk, 0 },
+ { X86::VPSUBSWZ256rrk, X86::VPSUBSWZ256rmk, 0 },
+ { X86::VPSUBSWZrrk, X86::VPSUBSWZrmk, 0 },
+ { X86::VPSUBUSBZ128rrk, X86::VPSUBUSBZ128rmk, 0 },
+ { X86::VPSUBUSBZ256rrk, X86::VPSUBUSBZ256rmk, 0 },
+ { X86::VPSUBUSBZrrk, X86::VPSUBUSBZrmk, 0 },
+ { X86::VPSUBUSWZ128rrk, X86::VPSUBUSWZ128rmk, 0 },
+ { X86::VPSUBUSWZ256rrk, X86::VPSUBUSWZ256rmk, 0 },
+ { X86::VPSUBUSWZrrk, X86::VPSUBUSWZrmk, 0 },
+ { X86::VPSUBWZ128rrk, X86::VPSUBWZ128rmk, 0 },
+ { X86::VPSUBWZ256rrk, X86::VPSUBWZ256rmk, 0 },
+ { X86::VPSUBWZrrk, X86::VPSUBWZrmk, 0 },
+ { X86::VPTERNLOGDZ128rrik, X86::VPTERNLOGDZ128rmik, 0 },
+ { X86::VPTERNLOGDZ128rrikz, X86::VPTERNLOGDZ128rmikz, 0 },
+ { X86::VPTERNLOGDZ256rrik, X86::VPTERNLOGDZ256rmik, 0 },
+ { X86::VPTERNLOGDZ256rrikz, X86::VPTERNLOGDZ256rmikz, 0 },
+ { X86::VPTERNLOGDZrrik, X86::VPTERNLOGDZrmik, 0 },
+ { X86::VPTERNLOGDZrrikz, X86::VPTERNLOGDZrmikz, 0 },
+ { X86::VPTERNLOGQZ128rrik, X86::VPTERNLOGQZ128rmik, 0 },
+ { X86::VPTERNLOGQZ128rrikz, X86::VPTERNLOGQZ128rmikz, 0 },
+ { X86::VPTERNLOGQZ256rrik, X86::VPTERNLOGQZ256rmik, 0 },
+ { X86::VPTERNLOGQZ256rrikz, X86::VPTERNLOGQZ256rmikz, 0 },
+ { X86::VPTERNLOGQZrrik, X86::VPTERNLOGQZrmik, 0 },
+ { X86::VPTERNLOGQZrrikz, X86::VPTERNLOGQZrmikz, 0 },
+ { X86::VPUNPCKHBWZ128rrk, X86::VPUNPCKHBWZ128rmk, 0 },
+ { X86::VPUNPCKHBWZ256rrk, X86::VPUNPCKHBWZ256rmk, 0 },
+ { X86::VPUNPCKHBWZrrk, X86::VPUNPCKHBWZrmk, 0 },
+ { X86::VPUNPCKHDQZ128rrk, X86::VPUNPCKHDQZ128rmk, 0 },
+ { X86::VPUNPCKHDQZ256rrk, X86::VPUNPCKHDQZ256rmk, 0 },
+ { X86::VPUNPCKHDQZrrk, X86::VPUNPCKHDQZrmk, 0 },
+ { X86::VPUNPCKHQDQZ128rrk, X86::VPUNPCKHQDQZ128rmk, 0 },
+ { X86::VPUNPCKHQDQZ256rrk, X86::VPUNPCKHQDQZ256rmk, 0 },
+ { X86::VPUNPCKHQDQZrrk, X86::VPUNPCKHQDQZrmk, 0 },
+ { X86::VPUNPCKHWDZ128rrk, X86::VPUNPCKHWDZ128rmk, 0 },
+ { X86::VPUNPCKHWDZ256rrk, X86::VPUNPCKHWDZ256rmk, 0 },
+ { X86::VPUNPCKHWDZrrk, X86::VPUNPCKHWDZrmk, 0 },
+ { X86::VPUNPCKLBWZ128rrk, X86::VPUNPCKLBWZ128rmk, 0 },
+ { X86::VPUNPCKLBWZ256rrk, X86::VPUNPCKLBWZ256rmk, 0 },
+ { X86::VPUNPCKLBWZrrk, X86::VPUNPCKLBWZrmk, 0 },
+ { X86::VPUNPCKLDQZ128rrk, X86::VPUNPCKLDQZ128rmk, 0 },
+ { X86::VPUNPCKLDQZ256rrk, X86::VPUNPCKLDQZ256rmk, 0 },
+ { X86::VPUNPCKLDQZrrk, X86::VPUNPCKLDQZrmk, 0 },
+ { X86::VPUNPCKLQDQZ128rrk, X86::VPUNPCKLQDQZ128rmk, 0 },
+ { X86::VPUNPCKLQDQZ256rrk, X86::VPUNPCKLQDQZ256rmk, 0 },
+ { X86::VPUNPCKLQDQZrrk, X86::VPUNPCKLQDQZrmk, 0 },
+ { X86::VPUNPCKLWDZ128rrk, X86::VPUNPCKLWDZ128rmk, 0 },
+ { X86::VPUNPCKLWDZ256rrk, X86::VPUNPCKLWDZ256rmk, 0 },
+ { X86::VPUNPCKLWDZrrk, X86::VPUNPCKLWDZrmk, 0 },
+ { X86::VPXORDZ128rrk, X86::VPXORDZ128rmk, 0 },
+ { X86::VPXORDZ256rrk, X86::VPXORDZ256rmk, 0 },
+ { X86::VPXORDZrrk, X86::VPXORDZrmk, 0 },
+ { X86::VPXORQZ128rrk, X86::VPXORQZ128rmk, 0 },
+ { X86::VPXORQZ256rrk, X86::VPXORQZ256rmk, 0 },
+ { X86::VPXORQZrrk, X86::VPXORQZrmk, 0 },
+ { X86::VRANGEPDZ128rrik, X86::VRANGEPDZ128rmik, 0 },
+ { X86::VRANGEPDZ256rrik, X86::VRANGEPDZ256rmik, 0 },
+ { X86::VRANGEPDZrrik, X86::VRANGEPDZrmik, 0 },
+ { X86::VRANGEPSZ128rrik, X86::VRANGEPSZ128rmik, 0 },
+ { X86::VRANGEPSZ256rrik, X86::VRANGEPSZ256rmik, 0 },
+ { X86::VRANGEPSZrrik, X86::VRANGEPSZrmik, 0 },
+ { X86::VRANGESDZrrik, X86::VRANGESDZrmik, TB_NO_REVERSE },
+ { X86::VRANGESSZrrik, X86::VRANGESSZrmik, TB_NO_REVERSE },
+ { X86::VRCP14SDZrrk, X86::VRCP14SDZrmk, TB_NO_REVERSE },
+ { X86::VRCP14SSZrrk, X86::VRCP14SSZrmk, TB_NO_REVERSE },
+ { X86::VRCP28SDZrk, X86::VRCP28SDZmk, TB_NO_REVERSE },
+ { X86::VRCP28SSZrk, X86::VRCP28SSZmk, TB_NO_REVERSE },
+ { X86::VREDUCESDZrrik, X86::VREDUCESDZrmik, TB_NO_REVERSE },
+ { X86::VREDUCESSZrrik, X86::VREDUCESSZrmik, TB_NO_REVERSE },
+ { X86::VRNDSCALESDZr_Intk, X86::VRNDSCALESDZm_Intk, TB_NO_REVERSE },
+ { X86::VRNDSCALESSZr_Intk, X86::VRNDSCALESSZm_Intk, TB_NO_REVERSE },
+ { X86::VRSQRT14SDZrrk, X86::VRSQRT14SDZrmk, TB_NO_REVERSE },
+ { X86::VRSQRT14SSZrrk, X86::VRSQRT14SSZrmk, TB_NO_REVERSE },
+ { X86::VRSQRT28SDZrk, X86::VRSQRT28SDZmk, TB_NO_REVERSE },
+ { X86::VRSQRT28SSZrk, X86::VRSQRT28SSZmk, TB_NO_REVERSE },
+ { X86::VSCALEFPDZ128rrk, X86::VSCALEFPDZ128rmk, 0 },
+ { X86::VSCALEFPDZ256rrk, X86::VSCALEFPDZ256rmk, 0 },
+ { X86::VSCALEFPDZrrk, X86::VSCALEFPDZrmk, 0 },
+ { X86::VSCALEFPSZ128rrk, X86::VSCALEFPSZ128rmk, 0 },
+ { X86::VSCALEFPSZ256rrk, X86::VSCALEFPSZ256rmk, 0 },
+ { X86::VSCALEFPSZrrk, X86::VSCALEFPSZrmk, 0 },
+ { X86::VSCALEFSDZrrk, X86::VSCALEFSDZrmk, TB_NO_REVERSE },
+ { X86::VSCALEFSSZrrk, X86::VSCALEFSSZrmk, TB_NO_REVERSE },
+ { X86::VSHUFF32X4Z256rrik, X86::VSHUFF32X4Z256rmik, 0 },
+ { X86::VSHUFF32X4Zrrik, X86::VSHUFF32X4Zrmik, 0 },
+ { X86::VSHUFF64X2Z256rrik, X86::VSHUFF64X2Z256rmik, 0 },
+ { X86::VSHUFF64X2Zrrik, X86::VSHUFF64X2Zrmik, 0 },
+ { X86::VSHUFI32X4Z256rrik, X86::VSHUFI32X4Z256rmik, 0 },
+ { X86::VSHUFI32X4Zrrik, X86::VSHUFI32X4Zrmik, 0 },
+ { X86::VSHUFI64X2Z256rrik, X86::VSHUFI64X2Z256rmik, 0 },
+ { X86::VSHUFI64X2Zrrik, X86::VSHUFI64X2Zrmik, 0 },
+ { X86::VSHUFPDZ128rrik, X86::VSHUFPDZ128rmik, 0 },
+ { X86::VSHUFPDZ256rrik, X86::VSHUFPDZ256rmik, 0 },
+ { X86::VSHUFPDZrrik, X86::VSHUFPDZrmik, 0 },
+ { X86::VSHUFPSZ128rrik, X86::VSHUFPSZ128rmik, 0 },
+ { X86::VSHUFPSZ256rrik, X86::VSHUFPSZ256rmik, 0 },
+ { X86::VSHUFPSZrrik, X86::VSHUFPSZrmik, 0 },
+ { X86::VSQRTSDZr_Intk, X86::VSQRTSDZm_Intk, TB_NO_REVERSE },
+ { X86::VSQRTSSZr_Intk, X86::VSQRTSSZm_Intk, TB_NO_REVERSE },
+ { X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 },
+ { X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 },
+ { X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 },
+ { X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 },
+ { X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 },
+ { X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 },
+ { X86::VSUBSDZrr_Intk, X86::VSUBSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VSUBSSZrr_Intk, X86::VSUBSSZrm_Intk, TB_NO_REVERSE },
+ { X86::VUNPCKHPDZ128rrk, X86::VUNPCKHPDZ128rmk, 0 },
+ { X86::VUNPCKHPDZ256rrk, X86::VUNPCKHPDZ256rmk, 0 },
+ { X86::VUNPCKHPDZrrk, X86::VUNPCKHPDZrmk, 0 },
+ { X86::VUNPCKHPSZ128rrk, X86::VUNPCKHPSZ128rmk, 0 },
+ { X86::VUNPCKHPSZ256rrk, X86::VUNPCKHPSZ256rmk, 0 },
+ { X86::VUNPCKHPSZrrk, X86::VUNPCKHPSZrmk, 0 },
+ { X86::VUNPCKLPDZ128rrk, X86::VUNPCKLPDZ128rmk, 0 },
+ { X86::VUNPCKLPDZ256rrk, X86::VUNPCKLPDZ256rmk, 0 },
+ { X86::VUNPCKLPDZrrk, X86::VUNPCKLPDZrmk, 0 },
+ { X86::VUNPCKLPSZ128rrk, X86::VUNPCKLPSZ128rmk, 0 },
+ { X86::VUNPCKLPSZ256rrk, X86::VUNPCKLPSZ256rmk, 0 },
+ { X86::VUNPCKLPSZrrk, X86::VUNPCKLPSZrmk, 0 },
+ { X86::VXORPDZ128rrk, X86::VXORPDZ128rmk, 0 },
+ { X86::VXORPDZ256rrk, X86::VXORPDZ256rmk, 0 },
+ { X86::VXORPDZrrk, X86::VXORPDZrmk, 0 },
+ { X86::VXORPSZ128rrk, X86::VXORPSZ128rmk, 0 },
+ { X86::VXORPSZ256rrk, X86::VXORPSZ256rmk, 0 },
+ { X86::VXORPSZrrk, X86::VXORPSZrmk, 0 },
+};
+
+static const X86MemoryFoldTableEntry *
+lookupFoldTableImpl(ArrayRef<X86MemoryFoldTableEntry> Table, unsigned RegOp) {
+#ifndef NDEBUG
+ // Make sure the tables are sorted.
+ static std::atomic<bool> FoldTablesChecked(false);
+ if (!FoldTablesChecked.load(std::memory_order_relaxed)) {
+ assert(std::is_sorted(std::begin(MemoryFoldTable2Addr),
+ std::end(MemoryFoldTable2Addr)) &&
+ std::adjacent_find(std::begin(MemoryFoldTable2Addr),
+ std::end(MemoryFoldTable2Addr)) ==
+ std::end(MemoryFoldTable2Addr) &&
+ "MemoryFoldTable2Addr is not sorted and unique!");
+ assert(std::is_sorted(std::begin(MemoryFoldTable0),
+ std::end(MemoryFoldTable0)) &&
+ std::adjacent_find(std::begin(MemoryFoldTable0),
+ std::end(MemoryFoldTable0)) ==
+ std::end(MemoryFoldTable0) &&
+ "MemoryFoldTable0 is not sorted and unique!");
+ assert(std::is_sorted(std::begin(MemoryFoldTable1),
+ std::end(MemoryFoldTable1)) &&
+ std::adjacent_find(std::begin(MemoryFoldTable1),
+ std::end(MemoryFoldTable1)) ==
+ std::end(MemoryFoldTable1) &&
+ "MemoryFoldTable1 is not sorted and unique!");
+ assert(std::is_sorted(std::begin(MemoryFoldTable2),
+ std::end(MemoryFoldTable2)) &&
+ std::adjacent_find(std::begin(MemoryFoldTable2),
+ std::end(MemoryFoldTable2)) ==
+ std::end(MemoryFoldTable2) &&
+ "MemoryFoldTable2 is not sorted and unique!");
+ assert(std::is_sorted(std::begin(MemoryFoldTable3),
+ std::end(MemoryFoldTable3)) &&
+ std::adjacent_find(std::begin(MemoryFoldTable3),
+ std::end(MemoryFoldTable3)) ==
+ std::end(MemoryFoldTable3) &&
+ "MemoryFoldTable3 is not sorted and unique!");
+ assert(std::is_sorted(std::begin(MemoryFoldTable4),
+ std::end(MemoryFoldTable4)) &&
+ std::adjacent_find(std::begin(MemoryFoldTable4),
+ std::end(MemoryFoldTable4)) ==
+ std::end(MemoryFoldTable4) &&
+ "MemoryFoldTable4 is not sorted and unique!");
+ FoldTablesChecked.store(true, std::memory_order_relaxed);
+ }
+#endif
+
+ const X86MemoryFoldTableEntry *Data = std::lower_bound(Table.begin(),
+ Table.end(),
+ RegOp);
+ if (Data != Table.end() && Data->KeyOp == RegOp &&
+ !(Data->Flags & TB_NO_FORWARD))
+ return Data;
+ return nullptr;
+}
+
+const X86MemoryFoldTableEntry *
+llvm::lookupTwoAddrFoldTable(unsigned RegOp) {
+ return lookupFoldTableImpl(MemoryFoldTable2Addr, RegOp);
+}
+
+const X86MemoryFoldTableEntry *
+llvm::lookupFoldTable(unsigned RegOp, unsigned OpNum) {
+ ArrayRef<X86MemoryFoldTableEntry> FoldTable;
+ if (OpNum == 0)
+ FoldTable = makeArrayRef(MemoryFoldTable0);
+ else if (OpNum == 1)
+ FoldTable = makeArrayRef(MemoryFoldTable1);
+ else if (OpNum == 2)
+ FoldTable = makeArrayRef(MemoryFoldTable2);
+ else if (OpNum == 3)
+ FoldTable = makeArrayRef(MemoryFoldTable3);
+ else if (OpNum == 4)
+ FoldTable = makeArrayRef(MemoryFoldTable4);
+ else
+ return nullptr;
+
+ return lookupFoldTableImpl(FoldTable, RegOp);
+}
+
+namespace {
+
+// This class stores the memory unfolding table. It is instantiated as a
+// ManagedStatic so that the unfolding table is initialized lazily.
+struct X86MemUnfoldTable {
+  // Stores the memory unfolding table entries sorted by opcode.
+ std::vector<X86MemoryFoldTableEntry> Table;
+
+ X86MemUnfoldTable() {
+ for (const X86MemoryFoldTableEntry &Entry : MemoryFoldTable2Addr)
+ // Index 0, folded load and store, no alignment requirement.
+ addTableEntry(Entry, TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE);
+
+ for (const X86MemoryFoldTableEntry &Entry : MemoryFoldTable0)
+ // Index 0, mix of loads and stores.
+ addTableEntry(Entry, TB_INDEX_0);
+
+ for (const X86MemoryFoldTableEntry &Entry : MemoryFoldTable1)
+ // Index 1, folded load
+ addTableEntry(Entry, TB_INDEX_1 | TB_FOLDED_LOAD);
+
+ for (const X86MemoryFoldTableEntry &Entry : MemoryFoldTable2)
+ // Index 2, folded load
+ addTableEntry(Entry, TB_INDEX_2 | TB_FOLDED_LOAD);
+
+ for (const X86MemoryFoldTableEntry &Entry : MemoryFoldTable3)
+ // Index 3, folded load
+ addTableEntry(Entry, TB_INDEX_3 | TB_FOLDED_LOAD);
+
+ for (const X86MemoryFoldTableEntry &Entry : MemoryFoldTable4)
+ // Index 4, folded load
+ addTableEntry(Entry, TB_INDEX_4 | TB_FOLDED_LOAD);
+
+ // Sort the memory->reg unfold table.
+ array_pod_sort(Table.begin(), Table.end());
+
+    // Now that it's sorted, ensure it's unique.
+ assert(std::adjacent_find(Table.begin(), Table.end()) == Table.end() &&
+ "Memory unfolding table is not unique!");
+ }
+
+ void addTableEntry(const X86MemoryFoldTableEntry &Entry,
+ uint16_t ExtraFlags) {
+ // NOTE: This swaps the KeyOp and DstOp in the table so we can sort it.
+ if ((Entry.Flags & TB_NO_REVERSE) == 0)
+ Table.push_back({Entry.DstOp, Entry.KeyOp,
+ static_cast<uint16_t>(Entry.Flags | ExtraFlags) });
+ }
+};
+}
+
+static ManagedStatic<X86MemUnfoldTable> MemUnfoldTable;
+
+const X86MemoryFoldTableEntry *
+llvm::lookupUnfoldTable(unsigned MemOp) {
+ auto &Table = MemUnfoldTable->Table;
+ auto I = std::lower_bound(Table.begin(), Table.end(), MemOp);
+ if (I != Table.end() && I->KeyOp == MemOp)
+ return &*I;
+ return nullptr;
+}
+
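
A minimal usage sketch of the lookup API introduced above (not part of the patch itself): it assumes hypothetical opcodes RegOpc/MemOpc and the helper names foldOrKeep/unfoldOrKeep are made up for illustration; only lookupFoldTable and lookupUnfoldTable, as declared in X86InstrFoldTables.h, come from the patch.

// Sketch only, assuming an LLVM tree that contains the new files above.
#include "X86InstrFoldTables.h"
using namespace llvm;

// Try to fold memory operand number OpNum into the register-form opcode
// RegOpc; returns the memory-form opcode on success, or RegOpc unchanged if
// there is no table entry (or the entry is marked TB_NO_FORWARD).
static unsigned foldOrKeep(unsigned RegOpc, unsigned OpNum) {
  if (const X86MemoryFoldTableEntry *E = lookupFoldTable(RegOpc, OpNum))
    return E->DstOp;
  return RegOpc;
}

// Reverse direction: map a memory-form opcode back to its register form.
// In the unfold table KeyOp holds the memory opcode and DstOp the register
// opcode, because addTableEntry() swaps the two fields before sorting.
static unsigned unfoldOrKeep(unsigned MemOpc) {
  if (const X86MemoryFoldTableEntry *E = lookupUnfoldTable(MemOpc))
    return E->DstOp;
  return MemOpc;
}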
diff --git a/lib/Target/X86/X86InstrFoldTables.h b/lib/Target/X86/X86InstrFoldTables.h
new file mode 100644
index 000000000000..90016baead96
--- /dev/null
+++ b/lib/Target/X86/X86InstrFoldTables.h
@@ -0,0 +1,85 @@
+//===-- X86InstrFoldTables.h - X86 Instruction Folding Tables ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the interface to query the X86 memory folding tables.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86INSTRFOLDTABLES_H
+#define LLVM_LIB_TARGET_X86_X86INSTRFOLDTABLES_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+enum {
+ // Select which memory operand is being unfolded.
+ // (stored in bits 0 - 3)
+ TB_INDEX_0 = 0,
+ TB_INDEX_1 = 1,
+ TB_INDEX_2 = 2,
+ TB_INDEX_3 = 3,
+ TB_INDEX_4 = 4,
+ TB_INDEX_MASK = 0xf,
+
+ // Do not insert the reverse map (MemOp -> RegOp) into the table.
+ // This may be needed because there is a many -> one mapping.
+ TB_NO_REVERSE = 1 << 4,
+
+ // Do not insert the forward map (RegOp -> MemOp) into the table.
+ // This is needed for Native Client, which prohibits branch
+ // instructions from using a memory operand.
+ TB_NO_FORWARD = 1 << 5,
+
+ TB_FOLDED_LOAD = 1 << 6,
+ TB_FOLDED_STORE = 1 << 7,
+
+ // Minimum alignment required for load/store.
+ // Used for RegOp->MemOp conversion.
+ // (stored in bits 8 - 15)
+ TB_ALIGN_SHIFT = 8,
+ TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT,
+ TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT,
+ TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT,
+ TB_ALIGN_64 = 64 << TB_ALIGN_SHIFT,
+ TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT
+};
+
+// This struct is used for both the folding and unfolding tables. The KeyOp
+// is used to determine the sorting order.
+struct X86MemoryFoldTableEntry {
+ uint16_t KeyOp;
+ uint16_t DstOp;
+ uint16_t Flags;
+
+ bool operator<(const X86MemoryFoldTableEntry &RHS) const {
+ return KeyOp < RHS.KeyOp;
+ }
+ bool operator==(const X86MemoryFoldTableEntry &RHS) const {
+ return KeyOp == RHS.KeyOp;
+ }
+ friend bool operator<(const X86MemoryFoldTableEntry &TE, unsigned Opcode) {
+ return TE.KeyOp < Opcode;
+ }
+};
+
+// Look up the memory folding table entry for folding a load and a store into
+// operand 0.
+const X86MemoryFoldTableEntry *lookupTwoAddrFoldTable(unsigned RegOp);
+
+// Look up the memory folding table entry for folding a load or store with
+// operand OpNum.
+const X86MemoryFoldTableEntry *lookupFoldTable(unsigned RegOp, unsigned OpNum);
+
+// Look up the memory unfolding table entry for this instruction.
+const X86MemoryFoldTableEntry *lookupUnfoldTable(unsigned MemOp);
+
+} // namespace llvm
+
+#endif
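
A small standalone illustration (not part of the patch) of how the TB_* constants above pack the folded-operand index, the load/store kind, and the minimum alignment into the 16-bit Flags field. The Flags value is made up for the example, and the constants are re-declared locally so the snippet compiles on its own.

// Sketch: decode an X86MemoryFoldTableEntry::Flags value. The constants mirror
// the enum in X86InstrFoldTables.h; the Flags value below is illustrative only.
#include <cstdint>
#include <cstdio>

enum {
  TB_INDEX_MASK   = 0xf,
  TB_FOLDED_LOAD  = 1 << 6,
  TB_FOLDED_STORE = 1 << 7,
  TB_ALIGN_SHIFT  = 8,
  TB_ALIGN_MASK   = 0xff << TB_ALIGN_SHIFT,
};

int main() {
  // Example: operand index 2, folded load, 16-byte minimum alignment.
  uint16_t Flags = 2 | TB_FOLDED_LOAD | (16 << TB_ALIGN_SHIFT);
  unsigned OpIdx = Flags & TB_INDEX_MASK;                     // which operand is folded
  unsigned Align = (Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT; // min alignment in bytes
  std::printf("index=%u align=%u load=%d store=%d\n", OpIdx, Align,
              (Flags & TB_FOLDED_LOAD) != 0, (Flags & TB_FOLDED_STORE) != 0);
  return 0;
}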
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index 0b266e5591b4..47d4719d3060 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -127,22 +127,28 @@ class Prefix<bits<3> val> {
bits<3> Value = val;
}
def NoPrfx : Prefix<0>;
-def PS : Prefix<1>;
-def PD : Prefix<2>;
-def XS : Prefix<3>;
-def XD : Prefix<4>;
+def PD : Prefix<1>;
+def XS : Prefix<2>;
+def XD : Prefix<3>;
+def PS : Prefix<4>; // Similar to NoPrfx, but disassembler uses this to know
+ // that other instructions with this opcode use PD/XS/XD
+ // and if any of those is not supported they shouldn't
+ // decode to this instruction. e.g. ANDSS/ANDSD don't
+ // exist, but the 0xf2/0xf3 encoding shouldn't
+                      // decode to ANDPS.
// Class specifying the opcode map.
class Map<bits<3> val> {
bits<3> Value = val;
}
-def OB : Map<0>;
-def TB : Map<1>;
-def T8 : Map<2>;
-def TA : Map<3>;
-def XOP8 : Map<4>;
-def XOP9 : Map<5>;
-def XOPA : Map<6>;
+def OB : Map<0>;
+def TB : Map<1>;
+def T8 : Map<2>;
+def TA : Map<3>;
+def XOP8 : Map<4>;
+def XOP9 : Map<5>;
+def XOPA : Map<6>;
+def ThreeDNow : Map<7>;
// Class specifying the encoding
class Encoding<bits<2> val> {
@@ -160,7 +166,6 @@ class OperandSize<bits<2> val> {
def OpSizeFixed : OperandSize<0>; // Never needs a 0x66 prefix.
def OpSize16 : OperandSize<1>; // Needs 0x66 prefix in 32-bit mode.
def OpSize32 : OperandSize<2>; // Needs 0x66 prefix in 16-bit mode.
-def OpSizeIgnore : OperandSize<3>; // Takes 0x66 prefix, never emits.
// Address size for encodings that change based on mode.
class AddressSize<bits<2> val> {
@@ -175,7 +180,6 @@ def AdSize64 : AddressSize<3>; // Encodes a 64-bit address.
// emitter that various prefix bytes are required.
class OpSize16 { OperandSize OpSize = OpSize16; }
class OpSize32 { OperandSize OpSize = OpSize32; }
-class OpSizeIgnore { OperandSize OpSize = OpSizeIgnore; }
class AdSize16 { AddressSize AdSize = AdSize16; }
class AdSize32 { AddressSize AdSize = AdSize32; }
class AdSize64 { AddressSize AdSize = AdSize64; }
@@ -188,6 +192,7 @@ class TA { Map OpMap = TA; }
class XOP8 { Map OpMap = XOP8; Prefix OpPrefix = PS; }
class XOP9 { Map OpMap = XOP9; Prefix OpPrefix = PS; }
class XOPA { Map OpMap = XOPA; Prefix OpPrefix = PS; }
+class ThreeDNow { Map OpMap = ThreeDNow; }
class OBXS { Prefix OpPrefix = XS; }
class PS : TB { Prefix OpPrefix = PS; }
class PD : TB { Prefix OpPrefix = PD; }
@@ -203,11 +208,16 @@ class TAXD : TA { Prefix OpPrefix = XD; }
class VEX { Encoding OpEnc = EncVEX; }
class VEX_W { bits<2> VEX_WPrefix = 1; }
class VEX_WIG { bits<2> VEX_WPrefix = 2; }
+// Special version of VEX_W that can be changed to VEX.W==0 for EVEX2VEX.
+// FIXME: We should consider adding separate bits for VEX_WIG and the extra
+// part of W1X. This would probably simplify the tablegen emitters and
+// the TSFlags creation below.
+class VEX_W1X { bits<2> VEX_WPrefix = 3; }
class VEX_4V : VEX { bit hasVEX_4V = 1; }
class VEX_L { bit hasVEX_L = 1; }
class VEX_LIG { bit ignoresVEX_L = 1; }
-class EVEX : VEX { Encoding OpEnc = EncEVEX; }
-class EVEX_4V : VEX_4V { Encoding OpEnc = EncEVEX; }
+class EVEX { Encoding OpEnc = EncEVEX; }
+class EVEX_4V : EVEX { bit hasVEX_4V = 1; }
class EVEX_K { bit hasEVEX_K = 1; }
class EVEX_KZ : EVEX_K { bit hasEVEX_Z = 1; }
class EVEX_B { bit hasEVEX_B = 1; }
@@ -215,6 +225,7 @@ class EVEX_RC { bit hasEVEX_RC = 1; }
class EVEX_V512 { bit hasEVEX_L2 = 1; bit hasVEX_L = 0; }
class EVEX_V256 { bit hasEVEX_L2 = 0; bit hasVEX_L = 1; }
class EVEX_V128 { bit hasEVEX_L2 = 0; bit hasVEX_L = 0; }
+class NOTRACK { bit hasNoTrackPrefix = 1; }
// Specify AVX512 8-bit compressed displacement encoding based on the vector
// element size in bits (8, 16, 32, 64) and the CDisp8 form.
@@ -223,23 +234,28 @@ class EVEX_CD8<int esize, CD8VForm form> {
bits<3> CD8_Form = form.Value;
}
-class Has3DNow0F0FOpcode { bit has3DNow0F0FOpcode = 1; }
class XOP { Encoding OpEnc = EncXOP; }
class XOP_4V : XOP { bit hasVEX_4V = 1; }
// Specify the alternative register form instruction to replace the current
// instruction in case it was picked during generation of memory folding tables
class FoldGenData<string _RegisterForm> {
- string FoldGenRegForm = _RegisterForm;
+ string FoldGenRegForm = _RegisterForm;
+}
+
+// Provide a specific instruction to be used by the EVEX2VEX conversion.
+class EVEX2VEXOverride<string VEXInstrName> {
+ string EVEX2VEXOverride = VEXInstrName;
}
// Mark the instruction as "illegal to memory fold/unfold"
class NotMemoryFoldable { bit isMemoryFoldable = 0; }
+// Prevent EVEX->VEX conversion from considering this instruction.
+class NotEVEX2VEXConvertible { bit notEVEX2VEXConvertible = 1; }
+
class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
- string AsmStr,
- InstrItinClass itin,
- Domain d = GenericDomain>
+ string AsmStr, Domain d = GenericDomain>
: Instruction {
let Namespace = "X86";
@@ -255,8 +271,6 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
// If this is a pseudo instruction, mark it isCodeGenOnly.
let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo");
- let Itinerary = itin;
-
//
// Attributes specific to X86 instructions...
//
@@ -294,8 +308,8 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
// Declare it int rather than bits<4> so that all bits are defined when
// assigning to bits<7>.
int CD8_EltSize = 0; // Compressed disp8 form - element-size in bytes.
- bit has3DNow0F0FOpcode =0;// Wacky 3dNow! encoding?
bit hasEVEX_RC = 0; // Explicitly specified rounding control in FP instruction.
+  bit hasNoTrackPrefix = 0; // Does this inst have a 0x3E (NoTrack) prefix?
bits<2> EVEX_LL;
let EVEX_LL{0} = hasVEX_L;
@@ -319,112 +333,118 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
// instruction to replace the current one in case it got picked during generation.
string FoldGenRegForm = ?;
+  // Used to provide an explicit EVEX2VEX override for this instruction.
+ string EVEX2VEXOverride = ?;
+
bit isMemoryFoldable = 1; // Is it allowed to memory fold/unfold this instruction?
+ bit notEVEX2VEXConvertible = 0; // Prevent EVEX->VEX conversion.
// TSFlags layout should be kept in sync with X86BaseInfo.h.
let TSFlags{6-0} = FormBits;
let TSFlags{8-7} = OpSizeBits;
let TSFlags{10-9} = AdSizeBits;
- let TSFlags{13-11} = OpPrefixBits;
- let TSFlags{16-14} = OpMapBits;
- let TSFlags{17} = hasREX_WPrefix;
- let TSFlags{21-18} = ImmT.Value;
- let TSFlags{24-22} = FPForm.Value;
- let TSFlags{25} = hasLockPrefix;
- let TSFlags{26} = hasREPPrefix;
- let TSFlags{28-27} = ExeDomain.Value;
- let TSFlags{30-29} = OpEncBits;
- let TSFlags{38-31} = Opcode;
+ // No need for 3rd bit, we don't need to distinguish NoPrfx from PS.
+ let TSFlags{12-11} = OpPrefixBits{1-0};
+ let TSFlags{15-13} = OpMapBits;
+ let TSFlags{16} = hasREX_WPrefix;
+ let TSFlags{20-17} = ImmT.Value;
+ let TSFlags{23-21} = FPForm.Value;
+ let TSFlags{24} = hasLockPrefix;
+ let TSFlags{25} = hasREPPrefix;
+ let TSFlags{27-26} = ExeDomain.Value;
+ let TSFlags{29-28} = OpEncBits;
+ let TSFlags{37-30} = Opcode;
// Currently no need for second bit in TSFlags - W Ignore is equivalent to 0.
- let TSFlags{39} = VEX_WPrefix{0};
- let TSFlags{40} = hasVEX_4V;
- let TSFlags{41} = hasVEX_L;
- let TSFlags{42} = hasEVEX_K;
- let TSFlags{43} = hasEVEX_Z;
- let TSFlags{44} = hasEVEX_L2;
- let TSFlags{45} = hasEVEX_B;
+ let TSFlags{38} = VEX_WPrefix{0};
+ let TSFlags{39} = hasVEX_4V;
+ let TSFlags{40} = hasVEX_L;
+ let TSFlags{41} = hasEVEX_K;
+ let TSFlags{42} = hasEVEX_Z;
+ let TSFlags{43} = hasEVEX_L2;
+ let TSFlags{44} = hasEVEX_B;
// If we run out of TSFlags bits, it's possible to encode this in 3 bits.
- let TSFlags{52-46} = CD8_Scale;
- let TSFlags{53} = has3DNow0F0FOpcode;
- let TSFlags{54} = hasEVEX_RC;
+ let TSFlags{51-45} = CD8_Scale;
+ let TSFlags{52} = hasEVEX_RC;
+ let TSFlags{53} = hasNoTrackPrefix;
}
-class PseudoI<dag oops, dag iops, list<dag> pattern,
- InstrItinClass itin = NoItinerary>
- : X86Inst<0, Pseudo, NoImm, oops, iops, "", itin> {
+class PseudoI<dag oops, dag iops, list<dag> pattern>
+ : X86Inst<0, Pseudo, NoImm, oops, iops, ""> {
let Pattern = pattern;
}
class I<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary,
- Domain d = GenericDomain>
- : X86Inst<o, f, NoImm, outs, ins, asm, itin, d> {
+ list<dag> pattern, Domain d = GenericDomain>
+ : X86Inst<o, f, NoImm, outs, ins, asm, d> {
let Pattern = pattern;
let CodeSize = 3;
}
-class Ii8 <bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary,
- Domain d = GenericDomain>
- : X86Inst<o, f, Imm8, outs, ins, asm, itin, d> {
+class Ii8<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, Domain d = GenericDomain>
+ : X86Inst<o, f, Imm8, outs, ins, asm, d> {
let Pattern = pattern;
let CodeSize = 3;
}
class Ii8Reg<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary,
- Domain d = GenericDomain>
- : X86Inst<o, f, Imm8Reg, outs, ins, asm, itin, d> {
+ list<dag> pattern, Domain d = GenericDomain>
+ : X86Inst<o, f, Imm8Reg, outs, ins, asm, d> {
let Pattern = pattern;
let CodeSize = 3;
}
class Ii8PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : X86Inst<o, f, Imm8PCRel, outs, ins, asm, itin> {
+ list<dag> pattern>
+ : X86Inst<o, f, Imm8PCRel, outs, ins, asm> {
let Pattern = pattern;
let CodeSize = 3;
}
class Ii16<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : X86Inst<o, f, Imm16, outs, ins, asm, itin> {
+ list<dag> pattern>
+ : X86Inst<o, f, Imm16, outs, ins, asm> {
let Pattern = pattern;
let CodeSize = 3;
}
class Ii32<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : X86Inst<o, f, Imm32, outs, ins, asm, itin> {
+ list<dag> pattern>
+ : X86Inst<o, f, Imm32, outs, ins, asm> {
let Pattern = pattern;
let CodeSize = 3;
}
class Ii32S<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : X86Inst<o, f, Imm32S, outs, ins, asm, itin> {
+ list<dag> pattern>
+ : X86Inst<o, f, Imm32S, outs, ins, asm> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+class Ii64<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : X86Inst<o, f, Imm64, outs, ins, asm> {
let Pattern = pattern;
let CodeSize = 3;
}
class Ii16PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : X86Inst<o, f, Imm16PCRel, outs, ins, asm, itin> {
+ list<dag> pattern>
+ : X86Inst<o, f, Imm16PCRel, outs, ins, asm> {
let Pattern = pattern;
let CodeSize = 3;
}
class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : X86Inst<o, f, Imm32PCRel, outs, ins, asm, itin> {
+ list<dag> pattern>
+ : X86Inst<o, f, Imm32PCRel, outs, ins, asm> {
let Pattern = pattern;
let CodeSize = 3;
}
// FPStack Instruction Templates:
// FPI - Floating Point Instruction template.
-class FPI<bits<8> o, Format F, dag outs, dag ins, string asm,
- InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, [], itin> {}
+class FPI<bits<8> o, Format F, dag outs, dag ins, string asm>
+ : I<o, F, outs, ins, asm, []> {}
// FpI_ - Floating Point Pseudo Instruction template. Not Predicated.
-class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern,
- InstrItinClass itin = NoItinerary>
- : PseudoI<outs, ins, pattern, itin> {
+class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern>
+ : PseudoI<outs, ins, pattern> {
let FPForm = fp;
}
@@ -435,24 +455,23 @@ class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern,
// Iseg32 - 16-bit segment selector, 32-bit offset
class Iseg16 <bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : X86Inst<o, f, Imm16, outs, ins, asm, itin> {
+ list<dag> pattern>
+ : X86Inst<o, f, Imm16, outs, ins, asm> {
let Pattern = pattern;
let CodeSize = 3;
}
class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : X86Inst<o, f, Imm32, outs, ins, asm, itin> {
+ list<dag> pattern>
+ : X86Inst<o, f, Imm32, outs, ins, asm> {
let Pattern = pattern;
let CodeSize = 3;
}
// SI - SSE 1 & 2 scalar instructions
class SI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary,
- Domain d = GenericDomain>
- : I<o, F, outs, ins, asm, pattern, itin, d> {
+ list<dag> pattern, Domain d = GenericDomain>
+ : I<o, F, outs, ins, asm, pattern, d> {
let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
!if(!eq(OpEnc.Value, EncVEX.Value), [UseAVX],
!if(!eq(OpPrefix.Value, XS.Value), [UseSSE1],
@@ -468,9 +487,8 @@ class SI<bits<8> o, Format F, dag outs, dag ins, string asm,
// SI - SSE 1 & 2 scalar intrinsics - vex form available on AVX512
class SI_Int<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary,
- Domain d = GenericDomain>
- : I<o, F, outs, ins, asm, pattern, itin, d> {
+ list<dag> pattern, Domain d = GenericDomain>
+ : I<o, F, outs, ins, asm, pattern, d> {
let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
!if(!eq(OpEnc.Value, EncVEX.Value), [UseAVX],
!if(!eq(OpPrefix.Value, XS.Value), [UseSSE1],
@@ -485,8 +503,8 @@ class SI_Int<bits<8> o, Format F, dag outs, dag ins, string asm,
}
// SIi8 - SSE 1 & 2 scalar instructions - vex form available on AVX512
class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin> {
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern> {
let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
!if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX],
!if(!eq(OpPrefix.Value, XS.Value), [UseSSE1],
@@ -500,8 +518,8 @@ class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
// PI - SSE 1 & 2 packed instructions
class PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
- InstrItinClass itin, Domain d>
- : I<o, F, outs, ins, asm, pattern, itin, d> {
+ Domain d>
+ : I<o, F, outs, ins, asm, pattern, d> {
let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
!if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX],
!if(!eq(OpPrefix.Value, PD.Value), [UseSSE2],
@@ -515,16 +533,16 @@ class PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
// MMXPI - SSE 1 & 2 packed instructions with MMX operands
class MMXPI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
- InstrItinClass itin, Domain d>
- : I<o, F, outs, ins, asm, pattern, itin, d> {
- let Predicates = !if(!eq(OpPrefix.Value, PD.Value), [HasSSE2],
- [HasSSE1]);
+ Domain d>
+ : I<o, F, outs, ins, asm, pattern, d> {
+ let Predicates = !if(!eq(OpPrefix.Value, PD.Value), [HasMMX, HasSSE2],
+ [HasMMX, HasSSE1]);
}
// PIi8 - SSE 1 & 2 packed instructions with immediate
class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin, Domain d>
- : Ii8<o, F, outs, ins, asm, pattern, itin, d> {
+ list<dag> pattern, Domain d>
+ : Ii8<o, F, outs, ins, asm, pattern, d> {
let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
!if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX],
!if(!eq(OpPrefix.Value, PD.Value), [UseSSE2],
@@ -545,26 +563,26 @@ class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
// VPSI - SSE1 instructions with PS prefix in AVX form, packed single.
class SSI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>;
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE1]>;
class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>;
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE1]>;
class PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, PS,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, PS,
Requires<[UseSSE1]>;
class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, PS,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedSingle>, PS,
Requires<[UseSSE1]>;
class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS,
+ list<dag> pattern>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern>, XS,
Requires<[HasAVX]>;
class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedSingle>, PS,
+ list<dag> pattern>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedSingle>, PS,
Requires<[HasAVX]>;
// SSE2 Instruction Templates:
@@ -586,50 +604,50 @@ class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm,
// MMX operands.
class SDI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>;
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, XD, Requires<[UseSSE2]>;
class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>;
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XD, Requires<[UseSSE2]>;
class S2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE2]>;
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE2]>;
class S2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
+ list<dag> pattern>
: Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE2]>;
class PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, PD,
Requires<[UseSSE2]>;
class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedDouble>, PD,
Requires<[UseSSE2]>;
class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XD,
+ list<dag> pattern>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern>, XD,
Requires<[UseAVX]>;
class VS2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS,
+ list<dag> pattern>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern>, XS,
Requires<[HasAVX]>;
class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedDouble>,
+ list<dag> pattern>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedDouble>,
PD, Requires<[HasAVX]>;
class VS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, PD,
+ list<dag> pattern>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern>, PD,
Requires<[UseAVX]>;
class S2I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, PD, Requires<[UseSSE2]>;
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, PD, Requires<[UseSSE2]>;
class MMXSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>;
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XD, Requires<[HasMMX, HasSSE2]>;
class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>;
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasMMX, HasSSE2]>;
// SSE3 Instruction Templates:
//
@@ -638,16 +656,16 @@ class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
// S3DI - SSE3 instructions with XD prefix.
class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, XS,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, XS,
Requires<[UseSSE3]>;
class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, XD,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, XD,
Requires<[UseSSE3]>;
class S3I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, PD,
Requires<[UseSSE3]>;
@@ -663,21 +681,21 @@ class S3I<bits<8> o, Format F, dag outs, dag ins, string asm,
// classes. They need to be enabled even if AVX is enabled.
class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD,
Requires<[UseSSSE3]>;
class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
Requires<[UseSSSE3]>;
class MMXSS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PS,
- Requires<[HasSSSE3]>;
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PS,
+ Requires<[HasMMX, HasSSSE3]>;
class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPS,
- Requires<[HasSSSE3]>;
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPS,
+ Requires<[HasMMX, HasSSSE3]>;
// SSE4.1 Instruction Templates:
//
@@ -685,32 +703,32 @@ class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
// SS41AIi8 - SSE 4.1 instructions with TA prefix and ImmT == Imm8.
//
class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD,
Requires<[UseSSE41]>;
class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
Requires<[UseSSE41]>;
// SSE4.2 Instruction Templates:
//
// SS428I - SSE 4.2 instructions with T8 prefix.
class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD,
Requires<[UseSSE42]>;
// SS42FI - SSE 4.2 instructions with T8XD prefix.
// NOTE: 'HasSSE42' is used as SS42FI is only used for CRC32 insns.
class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, T8XD, Requires<[HasSSE42]>;
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, T8XD, Requires<[HasSSE42]>;
// SS42AI = SSE 4.2 instructions with TA prefix
class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
Requires<[UseSSE42]>;
// AVX Instruction Templates:
@@ -719,12 +737,12 @@ class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm,
// AVX8I - AVX instructions with T8PD prefix.
// AVXAIi8 - AVX instructions with TAPD prefix and ImmT = Imm8.
class AVX8I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD,
Requires<[HasAVX]>;
class AVXAIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
Requires<[HasAVX]>;
// AVX2 Instruction Templates:
@@ -733,12 +751,12 @@ class AVXAIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
// AVX28I - AVX2 instructions with T8PD prefix.
// AVX2AIi8 - AVX2 instructions with TAPD prefix and ImmT = Imm8.
class AVX28I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD,
Requires<[HasAVX2]>;
class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
Requires<[HasAVX2]>;
@@ -755,34 +773,34 @@ class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
// AVX512SI - AVX-512 scalar instructions with PD prefix.
class AVX5128I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD,
Requires<[HasAVX512]>;
class AVX5128IBase : T8PD {
Domain ExeDomain = SSEPackedInt;
}
class AVX512XS8I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8XS,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8XS,
Requires<[HasAVX512]>;
class AVX512XSI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, XS,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, XS,
Requires<[HasAVX512]>;
class AVX512XDI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, XD,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, XD,
Requires<[HasAVX512]>;
class AVX512BI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, PD,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, PD,
Requires<[HasAVX512]>;
class AVX512BIBase : PD {
Domain ExeDomain = SSEPackedInt;
}
class AVX512BIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, PD,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, PD,
Requires<[HasAVX512]>;
class AVX512BIi8Base : PD {
Domain ExeDomain = SSEPackedInt;
@@ -805,149 +823,138 @@ class AVX512PDIi8Base : PD {
ImmType ImmT = Imm8;
}
class AVX512AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
Requires<[HasAVX512]>;
class AVX512AIi8Base : TAPD {
ImmType ImmT = Imm8;
}
class AVX512Ii8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>,
Requires<[HasAVX512]>;
class AVX512PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, PD,
Requires<[HasAVX512]>;
class AVX512PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, PS,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, PS,
Requires<[HasAVX512]>;
class AVX512PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>;
+ list<dag> pattern, Domain d>
+ : Ii8<o, F, outs, ins, asm, pattern, d>, Requires<[HasAVX512]>;
class AVX512PI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>;
+ list<dag> pattern, Domain d>
+ : I<o, F, outs, ins, asm, pattern, d>, Requires<[HasAVX512]>;
class AVX512FMA3S<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, T8PD,
+ list<dag>pattern>
+ : I<o, F, outs, ins, asm, pattern>, T8PD,
EVEX_4V, Requires<[HasAVX512]>;
class AVX512FMA3Base : T8PD, EVEX_4V;
class AVX512<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, Requires<[HasAVX512]>;
+ list<dag>pattern>
+ : I<o, F, outs, ins, asm, pattern>, Requires<[HasAVX512]>;
// AES Instruction Templates:
//
// AES8I
// These use the same encoding as the SSE4.2 T8 and TA encodings.
class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern, InstrItinClass itin = IIC_AES>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+ list<dag>pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD,
Requires<[NoAVX, HasAES]>;
class AESAI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
Requires<[NoAVX, HasAES]>;
// PCLMUL Instruction Templates
class PCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD;
+ list<dag>pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD;
// FMA3 Instruction Templates
class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, T8PD,
+ list<dag>pattern>
+ : I<o, F, outs, ins, asm, pattern>, T8PD,
VEX_4V, FMASC, Requires<[HasFMA, NoFMA4, NoVLX]>;
class FMA3S<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, T8PD,
+ list<dag>pattern>
+ : I<o, F, outs, ins, asm, pattern>, T8PD,
VEX_4V, FMASC, Requires<[HasFMA, NoFMA4, NoAVX512]>;
class FMA3S_Int<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, T8PD,
+ list<dag>pattern>
+ : I<o, F, outs, ins, asm, pattern>, T8PD,
VEX_4V, FMASC, Requires<[HasFMA, NoAVX512]>;
// FMA4 Instruction Templates
class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern, InstrItinClass itin = NoItinerary>
- : Ii8Reg<o, F, outs, ins, asm, pattern, itin>, TAPD,
+ list<dag>pattern>
+ : Ii8Reg<o, F, outs, ins, asm, pattern>, TAPD,
VEX_4V, FMASC, Requires<[HasFMA4, NoVLX]>;
class FMA4S<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern, InstrItinClass itin = NoItinerary>
- : Ii8Reg<o, F, outs, ins, asm, pattern, itin>, TAPD,
+ list<dag>pattern>
+ : Ii8Reg<o, F, outs, ins, asm, pattern>, TAPD,
VEX_4V, FMASC, Requires<[HasFMA4, NoAVX512]>;
class FMA4S_Int<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern, InstrItinClass itin = NoItinerary>
- : Ii8Reg<o, F, outs, ins, asm, pattern, itin>, TAPD,
+ list<dag>pattern>
+ : Ii8Reg<o, F, outs, ins, asm, pattern>, TAPD,
VEX_4V, FMASC, Requires<[HasFMA4]>;
// XOP 2, 3 and 4 Operand Instruction Template
class IXOP<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>,
XOP9, Requires<[HasXOP]>;
// XOP 2 and 3 Operand Instruction Templates with imm byte
class IXOPi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedDouble>,
XOP8, Requires<[HasXOP]>;
// XOP 4 Operand Instruction Templates with imm byte
class IXOPi8Reg<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8Reg<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>,
+ list<dag> pattern>
+ : Ii8Reg<o, F, outs, ins, asm, pattern, SSEPackedDouble>,
XOP8, Requires<[HasXOP]>;
// XOP 5 operand instruction (VEX encoding!)
class IXOP5<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern, InstrItinClass itin = NoItinerary>
- : Ii8Reg<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ list<dag>pattern>
+ : Ii8Reg<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
VEX_4V, Requires<[HasXOP]>;
// X86-64 Instruction templates...
//
class RI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, REX_W;
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, REX_W;
class RIi8 <bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin>, REX_W;
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, REX_W;
class RIi16 <bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii16<o, F, outs, ins, asm, pattern, itin>, REX_W;
+ list<dag> pattern>
+ : Ii16<o, F, outs, ins, asm, pattern>, REX_W;
class RIi32 <bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii32<o, F, outs, ins, asm, pattern, itin>, REX_W;
+ list<dag> pattern>
+ : Ii32<o, F, outs, ins, asm, pattern>, REX_W;
class RIi32S <bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii32S<o, F, outs, ins, asm, pattern, itin>, REX_W;
-
-class RIi64<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : X86Inst<o, f, Imm64, outs, ins, asm, itin>, REX_W {
- let Pattern = pattern;
- let CodeSize = 3;
-}
-
-class RIi64_NOREX<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : X86Inst<o, f, Imm64, outs, ins, asm, itin> {
- let Pattern = pattern;
- let CodeSize = 3;
-}
+ list<dag> pattern>
+ : Ii32S<o, F, outs, ins, asm, pattern>, REX_W;
+class RIi64<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii64<o, F, outs, ins, asm, pattern>, REX_W;
class RS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : S2I<o, F, outs, ins, asm, pattern, itin>, REX_W;
+ list<dag> pattern>
+ : S2I<o, F, outs, ins, asm, pattern>, REX_W;
class VRS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : VS2I<o, F, outs, ins, asm, pattern, itin>, VEX_W;
+ list<dag> pattern>
+ : VS2I<o, F, outs, ins, asm, pattern>, VEX_W;
// MMX Instruction templates
//
@@ -961,26 +968,26 @@ class VRS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
// MMXID - MMX instructions with XD prefix.
// MMXIS - MMX instructions with XS prefix.
class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX]>;
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, PS, Requires<[HasMMX]>;
class MMXI32<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX,Not64BitMode]>;
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, PS, Requires<[HasMMX,Not64BitMode]>;
class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX,In64BitMode]>;
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, PS, Requires<[HasMMX,In64BitMode]>;
class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, PS, REX_W, Requires<[HasMMX]>;
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, PS, REX_W, Requires<[HasMMX]>;
class MMX2I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, PD, Requires<[HasMMX]>;
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, PD, Requires<[HasMMX]>;
class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX]>;
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, PS, Requires<[HasMMX]>;
class MMXID<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasMMX]>;
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XD, Requires<[HasMMX]>;
class MMXIS<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasMMX]>;
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasMMX]>;
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index ebbef00c01d9..739275907978 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -153,12 +153,6 @@ def X86cmpp : SDNode<"X86ISD::CMPP", SDTX86VFCMP>;
def X86pcmpeq : SDNode<"X86ISD::PCMPEQ", SDTIntBinOp, [SDNPCommutative]>;
def X86pcmpgt : SDNode<"X86ISD::PCMPGT", SDTIntBinOp>;
-def X86IntCmpMask : SDTypeProfile<1, 2,
- [SDTCisVec<0>, SDTCVecEltisVT<0, i1>, SDTCisSameAs<1, 2>, SDTCisInt<1>,
- SDTCisSameNumEltsAs<0, 1>]>;
-def X86pcmpeqm : SDNode<"X86ISD::PCMPEQM", X86IntCmpMask, [SDNPCommutative]>;
-def X86pcmpgtm : SDNode<"X86ISD::PCMPGTM", X86IntCmpMask>;
-
def X86CmpMaskCC :
SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>,
SDTCisVec<1>, SDTCisSameAs<2, 1>,
@@ -177,8 +171,9 @@ def X86CmpMaskCCScalarRound :
SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>;
+// Hack to make CMPM commutable in tablegen patterns for load folding.
+def X86cmpm_c : SDNode<"X86ISD::CMPM", X86CmpMaskCC, [SDNPCommutative]>;
def X86cmpmRnd : SDNode<"X86ISD::CMPM_RND", X86CmpMaskCCRound>;
-def X86cmpmu : SDNode<"X86ISD::CMPMU", X86CmpMaskCC>;
def X86cmpms : SDNode<"X86ISD::FSETCCM", X86CmpMaskCCScalar>;
def X86cmpmsRnd : SDNode<"X86ISD::FSETCCM_RND", X86CmpMaskCCScalarRound>;
@@ -211,6 +206,8 @@ def X86kshiftr : SDNode<"X86ISD::KSHIFTR",
SDTCisSameAs<0, 1>,
SDTCisVT<2, i8>]>>;
+def X86kadd : SDNode<"X86ISD::KADD", SDTIntBinOp, [SDNPCommutative]>;
+
def X86vrotli : SDNode<"X86ISD::VROTLI", X86vshiftimm>;
def X86vrotri : SDNode<"X86ISD::VROTRI", X86vshiftimm>;
@@ -228,9 +225,9 @@ def X86vpcomu : SDNode<"X86ISD::VPCOMU",
def X86vpermil2 : SDNode<"X86ISD::VPERMIL2",
SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>,
- SDTCisSameSizeAs<0,3>,
- SDTCisSameNumEltsAs<0, 3>,
SDTCisFP<0>, SDTCisInt<3>,
+ SDTCisSameNumEltsAs<0, 3>,
+ SDTCisSameSizeAs<0,3>,
SDTCisVT<4, i8>]>>;
def X86vpperm : SDNode<"X86ISD::VPPERM",
SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
@@ -240,10 +237,6 @@ def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
SDTCisVec<1>,
SDTCisSameAs<2, 1>]>;
-def SDTX86Testm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
- SDTCisSameAs<2, 1>, SDTCVecEltisVT<0, i1>,
- SDTCisSameNumEltsAs<0, 1>]>;
-
def X86addus : SDNode<"X86ISD::ADDUS", SDTIntBinOp, [SDNPCommutative]>;
def X86subus : SDNode<"X86ISD::SUBUS", SDTIntBinOp>;
def X86adds : SDNode<"X86ISD::ADDS", SDTIntBinOp, [SDNPCommutative]>;
@@ -254,8 +247,6 @@ def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>;
def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>;
def X86ktest : SDNode<"X86ISD::KTEST", SDTX86CmpPTest>;
-def X86testm : SDNode<"X86ISD::TESTM", SDTX86Testm, [SDNPCommutative]>;
-def X86testnm : SDNode<"X86ISD::TESTNM", SDTX86Testm, [SDNPCommutative]>;
def X86movmsk : SDNode<"X86ISD::MOVMSK",
SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVec<1>]>>;
@@ -267,14 +258,12 @@ def X86selects : SDNode<"X86ISD::SELECTS",
def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>,
- SDTCVecEltisVT<1, i32>,
- SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<0,1>,
SDTCisSameAs<1,2>]>,
[SDNPCommutative]>;
def X86pmuldq : SDNode<"X86ISD::PMULDQ",
SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>,
- SDTCVecEltisVT<1, i32>,
- SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<0,1>,
SDTCisSameAs<1,2>]>,
[SDNPCommutative]>;
@@ -292,11 +281,13 @@ def X86insertqi : SDNode<"X86ISD::INSERTQI",
def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>]>;
+def SDTShuff2OpFP : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisFP<0>,
+ SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>;
def SDTShuff2OpM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
- SDTCisSameSizeAs<0,2>,
+ SDTCisFP<0>, SDTCisInt<2>,
SDTCisSameNumEltsAs<0,2>,
- SDTCisFP<0>, SDTCisInt<2>]>;
+ SDTCisSameSizeAs<0,2>]>;
def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>,
SDTCisSameAs<0,1>, SDTCisVT<2, i8>]>;
def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
@@ -379,15 +370,11 @@ def X86Movddup : SDNode<"X86ISD::MOVDDUP", SDTShuff1Op>;
def X86Movshdup : SDNode<"X86ISD::MOVSHDUP", SDTShuff1Op>;
def X86Movsldup : SDNode<"X86ISD::MOVSLDUP", SDTShuff1Op>;
-def X86Movsd : SDNode<"X86ISD::MOVSD", SDTShuff2Op>;
-def X86Movss : SDNode<"X86ISD::MOVSS", SDTShuff2Op>;
+def X86Movsd : SDNode<"X86ISD::MOVSD", SDTShuff2OpFP>;
+def X86Movss : SDNode<"X86ISD::MOVSS", SDTShuff2OpFP>;
-def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTShuff2Op>;
-def X86Movlhpd : SDNode<"X86ISD::MOVLHPD", SDTShuff2Op>;
-def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>;
-
-def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>;
-def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>;
+def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTShuff2OpFP>;
+def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2OpFP>;
def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<0>,
SDTCisVec<1>, SDTCisInt<1>,
@@ -427,15 +414,6 @@ def X86VPermt2 : SDNode<"X86ISD::VPERMV3",
SDTCisSameSizeAs<0,2>,
SDTCisSameAs<0,3>]>, []>;
-// Even though the index operand should be integer, we need to make it match the
-// destination type so that we can pattern match the masked version where the
-// index is also the passthru operand.
-def X86VPermi2X : SDNode<"X86ISD::VPERMIV3",
- SDTypeProfile<1, 3, [SDTCisVec<0>,
- SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>,
- SDTCisSameAs<0,3>]>, []>;
-
def X86vpternlog : SDNode<"X86ISD::VPTERNLOG", SDTTernlog>;
def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
@@ -465,10 +443,6 @@ def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST",
def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
-def X86kextract : SDNode<"ISD::EXTRACT_VECTOR_ELT",
- SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
- SDTCVecEltisVT<1, i1>,
- SDTCisPtrTy<2>]>>;
def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>;
@@ -507,35 +481,6 @@ def X86FnmsubRnd : SDNode<"X86ISD::FNMSUB_RND", SDTFmaRound, [SDNPCommutat
def X86FmaddsubRnd : SDNode<"X86ISD::FMADDSUB_RND", SDTFmaRound, [SDNPCommutative]>;
def X86FmsubaddRnd : SDNode<"X86ISD::FMSUBADD_RND", SDTFmaRound, [SDNPCommutative]>;
-// Scalar FMA4 intrinsics which zero the non-scalar bits.
-def X86Fmadd4s : SDNode<"X86ISD::FMADD4S", SDTFPTernaryOp, [SDNPCommutative]>;
-def X86Fnmadd4s : SDNode<"X86ISD::FNMADD4S", SDTFPTernaryOp, [SDNPCommutative]>;
-def X86Fmsub4s : SDNode<"X86ISD::FMSUB4S", SDTFPTernaryOp, [SDNPCommutative]>;
-def X86Fnmsub4s : SDNode<"X86ISD::FNMSUB4S", SDTFPTernaryOp, [SDNPCommutative]>;
-
-// Scalar FMA intrinsics with passthru bits in operand 1.
-def X86Fmadds1 : SDNode<"X86ISD::FMADDS1", SDTFPTernaryOp>;
-def X86Fnmadds1 : SDNode<"X86ISD::FNMADDS1", SDTFPTernaryOp>;
-def X86Fmsubs1 : SDNode<"X86ISD::FMSUBS1", SDTFPTernaryOp>;
-def X86Fnmsubs1 : SDNode<"X86ISD::FNMSUBS1", SDTFPTernaryOp>;
-
-// Scalar FMA intrinsics with passthru bits in operand 1.
-def X86FmaddRnds1 : SDNode<"X86ISD::FMADDS1_RND", SDTFmaRound>;
-def X86FnmaddRnds1 : SDNode<"X86ISD::FNMADDS1_RND", SDTFmaRound>;
-def X86FmsubRnds1 : SDNode<"X86ISD::FMSUBS1_RND", SDTFmaRound>;
-def X86FnmsubRnds1 : SDNode<"X86ISD::FNMSUBS1_RND", SDTFmaRound>;
-
-def X86Fmadds3 : SDNode<"X86ISD::FMADDS3", SDTFPTernaryOp, [SDNPCommutative]>;
-def X86Fnmadds3 : SDNode<"X86ISD::FNMADDS3", SDTFPTernaryOp, [SDNPCommutative]>;
-def X86Fmsubs3 : SDNode<"X86ISD::FMSUBS3", SDTFPTernaryOp, [SDNPCommutative]>;
-def X86Fnmsubs3 : SDNode<"X86ISD::FNMSUBS3", SDTFPTernaryOp, [SDNPCommutative]>;
-
-// Scalar FMA intrinsics with passthru bits in operand 3.
-def X86FmaddRnds3 : SDNode<"X86ISD::FMADDS3_RND", SDTFmaRound, [SDNPCommutative]>;
-def X86FnmaddRnds3 : SDNode<"X86ISD::FNMADDS3_RND", SDTFmaRound, [SDNPCommutative]>;
-def X86FmsubRnds3 : SDNode<"X86ISD::FMSUBS3_RND", SDTFmaRound, [SDNPCommutative]>;
-def X86FnmsubRnds3 : SDNode<"X86ISD::FNMSUBS3_RND", SDTFmaRound, [SDNPCommutative]>;
-
def SDTIFma : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTIFma, [SDNPCommutative]>;
@@ -569,17 +514,6 @@ def X86RndScalesRnd : SDNode<"X86ISD::VRNDSCALES_RND", SDTFPBinOpImmRound>;
def X86ReducesRnd : SDNode<"X86ISD::VREDUCES_RND", SDTFPBinOpImmRound>;
def X86GetMantsRnd : SDNode<"X86ISD::VGETMANTS_RND", SDTFPBinOpImmRound>;
-def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
- SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>,
- SDTCisVT<4, i8>]>;
-def SDT_PCMPESTRI : SDTypeProfile<2, 5, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
- SDTCisVT<2, v16i8>, SDTCisVT<3, i32>,
- SDTCisVT<4, v16i8>, SDTCisVT<5, i32>,
- SDTCisVT<6, i8>]>;
-
-def X86pcmpistri : SDNode<"X86ISD::PCMPISTRI", SDT_PCMPISTRI>;
-def X86pcmpestri : SDNode<"X86ISD::PCMPESTRI", SDT_PCMPESTRI>;
-
def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 1,
[SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>;
def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 1,
@@ -671,8 +605,6 @@ def X86vfproundRnd: SDNode<"X86ISD::VFPROUND_RND",
SDTCisOpSmallerThanOp<0, 1>,
SDTCisVT<2, i32>]>>;
-def X86cvt2mask : SDNode<"X86ISD::CVT2MASK", SDTIntTruncOp>;
-
// galois field arithmetic
def X86GF2P8affineinvqb : SDNode<"X86ISD::GF2P8AFFINEINVQB", SDTBlend>;
def X86GF2P8affineqb : SDNode<"X86ISD::GF2P8AFFINEQB", SDTBlend>;
@@ -687,10 +619,10 @@ def X86GF2P8mulb : SDNode<"X86ISD::GF2P8MULB", SDTIntBinOp>;
// forms.
def sse_load_f32 : ComplexPattern<v4f32, 5, "selectScalarSSELoad", [],
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand,
- SDNPWantRoot]>;
+ SDNPWantRoot, SDNPWantParent]>;
def sse_load_f64 : ComplexPattern<v2f64, 5, "selectScalarSSELoad", [],
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand,
- SDNPWantRoot]>;
+ SDNPWantRoot, SDNPWantParent]>;
def ssmem : Operand<v4f32> {
let PrintMethod = "printf32mem";
@@ -892,6 +824,7 @@ def bc_v16i16 : PatFrag<(ops node:$in), (v16i16 (bitconvert node:$in))>;
def bc_v8i32 : PatFrag<(ops node:$in), (v8i32 (bitconvert node:$in))>;
def bc_v4i64 : PatFrag<(ops node:$in), (v4i64 (bitconvert node:$in))>;
def bc_v8f32 : PatFrag<(ops node:$in), (v8f32 (bitconvert node:$in))>;
+def bc_v4f64 : PatFrag<(ops node:$in), (v4f64 (bitconvert node:$in))>;
// 512-bit bitconvert pattern fragments
def bc_v64i8 : PatFrag<(ops node:$in), (v64i8 (bitconvert node:$in))>;
@@ -924,10 +857,8 @@ def I8Imm : SDNodeXForm<imm, [{
return getI8Imm((uint8_t)N->getZExtValue(), SDLoc(N));
}]>;
-def FROUND_NO_EXC : ImmLeaf<i32, [{ return Imm == 8; }]>;
-def FROUND_CURRENT : ImmLeaf<i32, [{
- return Imm == X86::STATIC_ROUNDING::CUR_DIRECTION;
-}]>;
+def FROUND_NO_EXC : PatLeaf<(i32 8)>;
+def FROUND_CURRENT : PatLeaf<(i32 4)>;
// BYTE_imm - Transform bit immediates into byte immediates.
def BYTE_imm : SDNodeXForm<imm, [{
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 7ca1c58184f6..1b61accfb42b 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -14,10 +14,12 @@
#include "X86InstrInfo.h"
#include "X86.h"
#include "X86InstrBuilder.h"
+#include "X86InstrFoldTables.h"
#include "X86MachineFunctionInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Sequence.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineConstantPool.h"
@@ -71,44 +73,6 @@ UndefRegClearance("undef-reg-clearance",
"certain undef register reads"),
cl::init(128), cl::Hidden);
-enum {
- // Select which memory operand is being unfolded.
- // (stored in bits 0 - 3)
- TB_INDEX_0 = 0,
- TB_INDEX_1 = 1,
- TB_INDEX_2 = 2,
- TB_INDEX_3 = 3,
- TB_INDEX_4 = 4,
- TB_INDEX_MASK = 0xf,
-
- // Do not insert the reverse map (MemOp -> RegOp) into the table.
- // This may be needed because there is a many -> one mapping.
- TB_NO_REVERSE = 1 << 4,
-
- // Do not insert the forward map (RegOp -> MemOp) into the table.
- // This is needed for Native Client, which prohibits branch
- // instructions from using a memory operand.
- TB_NO_FORWARD = 1 << 5,
-
- TB_FOLDED_LOAD = 1 << 6,
- TB_FOLDED_STORE = 1 << 7,
-
- // Minimum alignment required for load/store.
- // Used for RegOp->MemOp conversion.
- // (stored in bits 8 - 15)
- TB_ALIGN_SHIFT = 8,
- TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT,
- TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT,
- TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT,
- TB_ALIGN_64 = 64 << TB_ALIGN_SHIFT,
- TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT
-};
-
-struct X86MemoryFoldTableEntry {
- uint16_t RegOp;
- uint16_t MemOp;
- uint16_t Flags;
-};
// Pin the vtable to this file.
void X86InstrInfo::anchor() {}
@@ -121,3631 +85,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
X86::CATCHRET,
(STI.is64Bit() ? X86::RETQ : X86::RETL)),
Subtarget(STI), RI(STI.getTargetTriple()) {
-
- static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = {
- { X86::ADC16ri, X86::ADC16mi, 0 },
- { X86::ADC16ri8, X86::ADC16mi8, 0 },
- { X86::ADC16rr, X86::ADC16mr, 0 },
- { X86::ADC32ri, X86::ADC32mi, 0 },
- { X86::ADC32ri8, X86::ADC32mi8, 0 },
- { X86::ADC32rr, X86::ADC32mr, 0 },
- { X86::ADC64ri32, X86::ADC64mi32, 0 },
- { X86::ADC64ri8, X86::ADC64mi8, 0 },
- { X86::ADC64rr, X86::ADC64mr, 0 },
- { X86::ADC8ri, X86::ADC8mi, 0 },
- { X86::ADC8ri8, X86::ADC8mi8, 0 },
- { X86::ADC8rr, X86::ADC8mr, 0 },
- { X86::ADD16ri, X86::ADD16mi, 0 },
- { X86::ADD16ri8, X86::ADD16mi8, 0 },
- { X86::ADD16ri_DB, X86::ADD16mi, TB_NO_REVERSE },
- { X86::ADD16ri8_DB, X86::ADD16mi8, TB_NO_REVERSE },
- { X86::ADD16rr, X86::ADD16mr, 0 },
- { X86::ADD16rr_DB, X86::ADD16mr, TB_NO_REVERSE },
- { X86::ADD32ri, X86::ADD32mi, 0 },
- { X86::ADD32ri8, X86::ADD32mi8, 0 },
- { X86::ADD32ri_DB, X86::ADD32mi, TB_NO_REVERSE },
- { X86::ADD32ri8_DB, X86::ADD32mi8, TB_NO_REVERSE },
- { X86::ADD32rr, X86::ADD32mr, 0 },
- { X86::ADD32rr_DB, X86::ADD32mr, TB_NO_REVERSE },
- { X86::ADD64ri32, X86::ADD64mi32, 0 },
- { X86::ADD64ri8, X86::ADD64mi8, 0 },
- { X86::ADD64ri32_DB,X86::ADD64mi32, TB_NO_REVERSE },
- { X86::ADD64ri8_DB, X86::ADD64mi8, TB_NO_REVERSE },
- { X86::ADD64rr, X86::ADD64mr, 0 },
- { X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE },
- { X86::ADD8ri, X86::ADD8mi, 0 },
- { X86::ADD8ri8, X86::ADD8mi8, 0 },
- { X86::ADD8rr, X86::ADD8mr, 0 },
- { X86::AND16ri, X86::AND16mi, 0 },
- { X86::AND16ri8, X86::AND16mi8, 0 },
- { X86::AND16rr, X86::AND16mr, 0 },
- { X86::AND32ri, X86::AND32mi, 0 },
- { X86::AND32ri8, X86::AND32mi8, 0 },
- { X86::AND32rr, X86::AND32mr, 0 },
- { X86::AND64ri32, X86::AND64mi32, 0 },
- { X86::AND64ri8, X86::AND64mi8, 0 },
- { X86::AND64rr, X86::AND64mr, 0 },
- { X86::AND8ri, X86::AND8mi, 0 },
- { X86::AND8ri8, X86::AND8mi8, 0 },
- { X86::AND8rr, X86::AND8mr, 0 },
- { X86::BTC16ri8, X86::BTC16mi8, 0 },
- { X86::BTC32ri8, X86::BTC32mi8, 0 },
- { X86::BTC64ri8, X86::BTC64mi8, 0 },
- { X86::BTR16ri8, X86::BTR16mi8, 0 },
- { X86::BTR32ri8, X86::BTR32mi8, 0 },
- { X86::BTR64ri8, X86::BTR64mi8, 0 },
- { X86::BTS16ri8, X86::BTS16mi8, 0 },
- { X86::BTS32ri8, X86::BTS32mi8, 0 },
- { X86::BTS64ri8, X86::BTS64mi8, 0 },
- { X86::DEC16r, X86::DEC16m, 0 },
- { X86::DEC32r, X86::DEC32m, 0 },
- { X86::DEC64r, X86::DEC64m, 0 },
- { X86::DEC8r, X86::DEC8m, 0 },
- { X86::INC16r, X86::INC16m, 0 },
- { X86::INC32r, X86::INC32m, 0 },
- { X86::INC64r, X86::INC64m, 0 },
- { X86::INC8r, X86::INC8m, 0 },
- { X86::NEG16r, X86::NEG16m, 0 },
- { X86::NEG32r, X86::NEG32m, 0 },
- { X86::NEG64r, X86::NEG64m, 0 },
- { X86::NEG8r, X86::NEG8m, 0 },
- { X86::NOT16r, X86::NOT16m, 0 },
- { X86::NOT32r, X86::NOT32m, 0 },
- { X86::NOT64r, X86::NOT64m, 0 },
- { X86::NOT8r, X86::NOT8m, 0 },
- { X86::OR16ri, X86::OR16mi, 0 },
- { X86::OR16ri8, X86::OR16mi8, 0 },
- { X86::OR16rr, X86::OR16mr, 0 },
- { X86::OR32ri, X86::OR32mi, 0 },
- { X86::OR32ri8, X86::OR32mi8, 0 },
- { X86::OR32rr, X86::OR32mr, 0 },
- { X86::OR64ri32, X86::OR64mi32, 0 },
- { X86::OR64ri8, X86::OR64mi8, 0 },
- { X86::OR64rr, X86::OR64mr, 0 },
- { X86::OR8ri, X86::OR8mi, 0 },
- { X86::OR8ri8, X86::OR8mi8, 0 },
- { X86::OR8rr, X86::OR8mr, 0 },
- { X86::RCL16r1, X86::RCL16m1, 0 },
- { X86::RCL16rCL, X86::RCL16mCL, 0 },
- { X86::RCL16ri, X86::RCL16mi, 0 },
- { X86::RCL32r1, X86::RCL32m1, 0 },
- { X86::RCL32rCL, X86::RCL32mCL, 0 },
- { X86::RCL32ri, X86::RCL32mi, 0 },
- { X86::RCL64r1, X86::RCL64m1, 0 },
- { X86::RCL64rCL, X86::RCL64mCL, 0 },
- { X86::RCL64ri, X86::RCL64mi, 0 },
- { X86::RCL8r1, X86::RCL8m1, 0 },
- { X86::RCL8rCL, X86::RCL8mCL, 0 },
- { X86::RCL8ri, X86::RCL8mi, 0 },
- { X86::RCR16r1, X86::RCR16m1, 0 },
- { X86::RCR16rCL, X86::RCR16mCL, 0 },
- { X86::RCR16ri, X86::RCR16mi, 0 },
- { X86::RCR32r1, X86::RCR32m1, 0 },
- { X86::RCR32rCL, X86::RCR32mCL, 0 },
- { X86::RCR32ri, X86::RCR32mi, 0 },
- { X86::RCR64r1, X86::RCR64m1, 0 },
- { X86::RCR64rCL, X86::RCR64mCL, 0 },
- { X86::RCR64ri, X86::RCR64mi, 0 },
- { X86::RCR8r1, X86::RCR8m1, 0 },
- { X86::RCR8rCL, X86::RCR8mCL, 0 },
- { X86::RCR8ri, X86::RCR8mi, 0 },
- { X86::ROL16r1, X86::ROL16m1, 0 },
- { X86::ROL16rCL, X86::ROL16mCL, 0 },
- { X86::ROL16ri, X86::ROL16mi, 0 },
- { X86::ROL32r1, X86::ROL32m1, 0 },
- { X86::ROL32rCL, X86::ROL32mCL, 0 },
- { X86::ROL32ri, X86::ROL32mi, 0 },
- { X86::ROL64r1, X86::ROL64m1, 0 },
- { X86::ROL64rCL, X86::ROL64mCL, 0 },
- { X86::ROL64ri, X86::ROL64mi, 0 },
- { X86::ROL8r1, X86::ROL8m1, 0 },
- { X86::ROL8rCL, X86::ROL8mCL, 0 },
- { X86::ROL8ri, X86::ROL8mi, 0 },
- { X86::ROR16r1, X86::ROR16m1, 0 },
- { X86::ROR16rCL, X86::ROR16mCL, 0 },
- { X86::ROR16ri, X86::ROR16mi, 0 },
- { X86::ROR32r1, X86::ROR32m1, 0 },
- { X86::ROR32rCL, X86::ROR32mCL, 0 },
- { X86::ROR32ri, X86::ROR32mi, 0 },
- { X86::ROR64r1, X86::ROR64m1, 0 },
- { X86::ROR64rCL, X86::ROR64mCL, 0 },
- { X86::ROR64ri, X86::ROR64mi, 0 },
- { X86::ROR8r1, X86::ROR8m1, 0 },
- { X86::ROR8rCL, X86::ROR8mCL, 0 },
- { X86::ROR8ri, X86::ROR8mi, 0 },
- { X86::SAR16r1, X86::SAR16m1, 0 },
- { X86::SAR16rCL, X86::SAR16mCL, 0 },
- { X86::SAR16ri, X86::SAR16mi, 0 },
- { X86::SAR32r1, X86::SAR32m1, 0 },
- { X86::SAR32rCL, X86::SAR32mCL, 0 },
- { X86::SAR32ri, X86::SAR32mi, 0 },
- { X86::SAR64r1, X86::SAR64m1, 0 },
- { X86::SAR64rCL, X86::SAR64mCL, 0 },
- { X86::SAR64ri, X86::SAR64mi, 0 },
- { X86::SAR8r1, X86::SAR8m1, 0 },
- { X86::SAR8rCL, X86::SAR8mCL, 0 },
- { X86::SAR8ri, X86::SAR8mi, 0 },
- { X86::SBB16ri, X86::SBB16mi, 0 },
- { X86::SBB16ri8, X86::SBB16mi8, 0 },
- { X86::SBB16rr, X86::SBB16mr, 0 },
- { X86::SBB32ri, X86::SBB32mi, 0 },
- { X86::SBB32ri8, X86::SBB32mi8, 0 },
- { X86::SBB32rr, X86::SBB32mr, 0 },
- { X86::SBB64ri32, X86::SBB64mi32, 0 },
- { X86::SBB64ri8, X86::SBB64mi8, 0 },
- { X86::SBB64rr, X86::SBB64mr, 0 },
- { X86::SBB8ri, X86::SBB8mi, 0 },
- { X86::SBB8ri8, X86::SBB8mi8, 0 },
- { X86::SBB8rr, X86::SBB8mr, 0 },
- { X86::SHL16r1, X86::SHL16m1, 0 },
- { X86::SHL16rCL, X86::SHL16mCL, 0 },
- { X86::SHL16ri, X86::SHL16mi, 0 },
- { X86::SHL32r1, X86::SHL32m1, 0 },
- { X86::SHL32rCL, X86::SHL32mCL, 0 },
- { X86::SHL32ri, X86::SHL32mi, 0 },
- { X86::SHL64r1, X86::SHL64m1, 0 },
- { X86::SHL64rCL, X86::SHL64mCL, 0 },
- { X86::SHL64ri, X86::SHL64mi, 0 },
- { X86::SHL8r1, X86::SHL8m1, 0 },
- { X86::SHL8rCL, X86::SHL8mCL, 0 },
- { X86::SHL8ri, X86::SHL8mi, 0 },
- { X86::SHLD16rrCL, X86::SHLD16mrCL, 0 },
- { X86::SHLD16rri8, X86::SHLD16mri8, 0 },
- { X86::SHLD32rrCL, X86::SHLD32mrCL, 0 },
- { X86::SHLD32rri8, X86::SHLD32mri8, 0 },
- { X86::SHLD64rrCL, X86::SHLD64mrCL, 0 },
- { X86::SHLD64rri8, X86::SHLD64mri8, 0 },
- { X86::SHR16r1, X86::SHR16m1, 0 },
- { X86::SHR16rCL, X86::SHR16mCL, 0 },
- { X86::SHR16ri, X86::SHR16mi, 0 },
- { X86::SHR32r1, X86::SHR32m1, 0 },
- { X86::SHR32rCL, X86::SHR32mCL, 0 },
- { X86::SHR32ri, X86::SHR32mi, 0 },
- { X86::SHR64r1, X86::SHR64m1, 0 },
- { X86::SHR64rCL, X86::SHR64mCL, 0 },
- { X86::SHR64ri, X86::SHR64mi, 0 },
- { X86::SHR8r1, X86::SHR8m1, 0 },
- { X86::SHR8rCL, X86::SHR8mCL, 0 },
- { X86::SHR8ri, X86::SHR8mi, 0 },
- { X86::SHRD16rrCL, X86::SHRD16mrCL, 0 },
- { X86::SHRD16rri8, X86::SHRD16mri8, 0 },
- { X86::SHRD32rrCL, X86::SHRD32mrCL, 0 },
- { X86::SHRD32rri8, X86::SHRD32mri8, 0 },
- { X86::SHRD64rrCL, X86::SHRD64mrCL, 0 },
- { X86::SHRD64rri8, X86::SHRD64mri8, 0 },
- { X86::SUB16ri, X86::SUB16mi, 0 },
- { X86::SUB16ri8, X86::SUB16mi8, 0 },
- { X86::SUB16rr, X86::SUB16mr, 0 },
- { X86::SUB32ri, X86::SUB32mi, 0 },
- { X86::SUB32ri8, X86::SUB32mi8, 0 },
- { X86::SUB32rr, X86::SUB32mr, 0 },
- { X86::SUB64ri32, X86::SUB64mi32, 0 },
- { X86::SUB64ri8, X86::SUB64mi8, 0 },
- { X86::SUB64rr, X86::SUB64mr, 0 },
- { X86::SUB8ri, X86::SUB8mi, 0 },
- { X86::SUB8ri8, X86::SUB8mi8, 0 },
- { X86::SUB8rr, X86::SUB8mr, 0 },
- { X86::XOR16ri, X86::XOR16mi, 0 },
- { X86::XOR16ri8, X86::XOR16mi8, 0 },
- { X86::XOR16rr, X86::XOR16mr, 0 },
- { X86::XOR32ri, X86::XOR32mi, 0 },
- { X86::XOR32ri8, X86::XOR32mi8, 0 },
- { X86::XOR32rr, X86::XOR32mr, 0 },
- { X86::XOR64ri32, X86::XOR64mi32, 0 },
- { X86::XOR64ri8, X86::XOR64mi8, 0 },
- { X86::XOR64rr, X86::XOR64mr, 0 },
- { X86::XOR8ri, X86::XOR8mi, 0 },
- { X86::XOR8ri8, X86::XOR8mi8, 0 },
- { X86::XOR8rr, X86::XOR8mr, 0 }
- };
-
- for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2Addr) {
- AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable,
- Entry.RegOp, Entry.MemOp,
- // Index 0, folded load and store, no alignment requirement.
- Entry.Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE);
- }
-
- static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
- { X86::BT16ri8, X86::BT16mi8, TB_FOLDED_LOAD },
- { X86::BT32ri8, X86::BT32mi8, TB_FOLDED_LOAD },
- { X86::BT64ri8, X86::BT64mi8, TB_FOLDED_LOAD },
- { X86::CALL32r, X86::CALL32m, TB_FOLDED_LOAD },
- { X86::CALL64r, X86::CALL64m, TB_FOLDED_LOAD },
- { X86::CMP16ri, X86::CMP16mi, TB_FOLDED_LOAD },
- { X86::CMP16ri8, X86::CMP16mi8, TB_FOLDED_LOAD },
- { X86::CMP16rr, X86::CMP16mr, TB_FOLDED_LOAD },
- { X86::CMP32ri, X86::CMP32mi, TB_FOLDED_LOAD },
- { X86::CMP32ri8, X86::CMP32mi8, TB_FOLDED_LOAD },
- { X86::CMP32rr, X86::CMP32mr, TB_FOLDED_LOAD },
- { X86::CMP64ri32, X86::CMP64mi32, TB_FOLDED_LOAD },
- { X86::CMP64ri8, X86::CMP64mi8, TB_FOLDED_LOAD },
- { X86::CMP64rr, X86::CMP64mr, TB_FOLDED_LOAD },
- { X86::CMP8ri, X86::CMP8mi, TB_FOLDED_LOAD },
- { X86::CMP8rr, X86::CMP8mr, TB_FOLDED_LOAD },
- { X86::DIV16r, X86::DIV16m, TB_FOLDED_LOAD },
- { X86::DIV32r, X86::DIV32m, TB_FOLDED_LOAD },
- { X86::DIV64r, X86::DIV64m, TB_FOLDED_LOAD },
- { X86::DIV8r, X86::DIV8m, TB_FOLDED_LOAD },
- { X86::EXTRACTPSrr, X86::EXTRACTPSmr, TB_FOLDED_STORE },
- { X86::IDIV16r, X86::IDIV16m, TB_FOLDED_LOAD },
- { X86::IDIV32r, X86::IDIV32m, TB_FOLDED_LOAD },
- { X86::IDIV64r, X86::IDIV64m, TB_FOLDED_LOAD },
- { X86::IDIV8r, X86::IDIV8m, TB_FOLDED_LOAD },
- { X86::IMUL16r, X86::IMUL16m, TB_FOLDED_LOAD },
- { X86::IMUL32r, X86::IMUL32m, TB_FOLDED_LOAD },
- { X86::IMUL64r, X86::IMUL64m, TB_FOLDED_LOAD },
- { X86::IMUL8r, X86::IMUL8m, TB_FOLDED_LOAD },
- { X86::JMP32r, X86::JMP32m, TB_FOLDED_LOAD },
- { X86::JMP64r, X86::JMP64m, TB_FOLDED_LOAD },
- { X86::MOV16ri, X86::MOV16mi, TB_FOLDED_STORE },
- { X86::MOV16rr, X86::MOV16mr, TB_FOLDED_STORE },
- { X86::MOV32ri, X86::MOV32mi, TB_FOLDED_STORE },
- { X86::MOV32rr, X86::MOV32mr, TB_FOLDED_STORE },
- { X86::MOV64ri32, X86::MOV64mi32, TB_FOLDED_STORE },
- { X86::MOV64rr, X86::MOV64mr, TB_FOLDED_STORE },
- { X86::MOV8ri, X86::MOV8mi, TB_FOLDED_STORE },
- { X86::MOV8rr, X86::MOV8mr, TB_FOLDED_STORE },
- { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE },
- { X86::MOVAPDrr, X86::MOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::MOVAPSrr, X86::MOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::MOVDQUrr, X86::MOVDQUmr, TB_FOLDED_STORE },
- { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE },
- { X86::MOVPQIto64rr,X86::MOVPQI2QImr, TB_FOLDED_STORE },
- { X86::MOVSDto64rr, X86::MOVSDto64mr, TB_FOLDED_STORE },
- { X86::MOVSS2DIrr, X86::MOVSS2DImr, TB_FOLDED_STORE },
- { X86::MOVUPDrr, X86::MOVUPDmr, TB_FOLDED_STORE },
- { X86::MOVUPSrr, X86::MOVUPSmr, TB_FOLDED_STORE },
- { X86::MUL16r, X86::MUL16m, TB_FOLDED_LOAD },
- { X86::MUL32r, X86::MUL32m, TB_FOLDED_LOAD },
- { X86::MUL64r, X86::MUL64m, TB_FOLDED_LOAD },
- { X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD },
- { X86::PEXTRDrr, X86::PEXTRDmr, TB_FOLDED_STORE },
- { X86::PEXTRQrr, X86::PEXTRQmr, TB_FOLDED_STORE },
- { X86::PUSH16r, X86::PUSH16rmm, TB_FOLDED_LOAD },
- { X86::PUSH32r, X86::PUSH32rmm, TB_FOLDED_LOAD },
- { X86::PUSH64r, X86::PUSH64rmm, TB_FOLDED_LOAD },
- { X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE },
- { X86::SETAr, X86::SETAm, TB_FOLDED_STORE },
- { X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE },
- { X86::SETBr, X86::SETBm, TB_FOLDED_STORE },
- { X86::SETEr, X86::SETEm, TB_FOLDED_STORE },
- { X86::SETGEr, X86::SETGEm, TB_FOLDED_STORE },
- { X86::SETGr, X86::SETGm, TB_FOLDED_STORE },
- { X86::SETLEr, X86::SETLEm, TB_FOLDED_STORE },
- { X86::SETLr, X86::SETLm, TB_FOLDED_STORE },
- { X86::SETNEr, X86::SETNEm, TB_FOLDED_STORE },
- { X86::SETNOr, X86::SETNOm, TB_FOLDED_STORE },
- { X86::SETNPr, X86::SETNPm, TB_FOLDED_STORE },
- { X86::SETNSr, X86::SETNSm, TB_FOLDED_STORE },
- { X86::SETOr, X86::SETOm, TB_FOLDED_STORE },
- { X86::SETPr, X86::SETPm, TB_FOLDED_STORE },
- { X86::SETSr, X86::SETSm, TB_FOLDED_STORE },
- { X86::TAILJMPr, X86::TAILJMPm, TB_FOLDED_LOAD },
- { X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD },
- { X86::TAILJMPr64_REX, X86::TAILJMPm64_REX, TB_FOLDED_LOAD },
- { X86::TEST16ri, X86::TEST16mi, TB_FOLDED_LOAD },
- { X86::TEST16rr, X86::TEST16mr, TB_FOLDED_LOAD },
- { X86::TEST32ri, X86::TEST32mi, TB_FOLDED_LOAD },
- { X86::TEST32rr, X86::TEST32mr, TB_FOLDED_LOAD },
- { X86::TEST64ri32, X86::TEST64mi32, TB_FOLDED_LOAD },
- { X86::TEST64rr, X86::TEST64mr, TB_FOLDED_LOAD },
- { X86::TEST8ri, X86::TEST8mi, TB_FOLDED_LOAD },
- { X86::TEST8rr, X86::TEST8mr, TB_FOLDED_LOAD },
-
- // AVX 128-bit versions of foldable instructions
- { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr, TB_FOLDED_STORE },
- { X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::VMOVAPDrr, X86::VMOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::VMOVDQArr, X86::VMOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::VMOVDQUrr, X86::VMOVDQUmr, TB_FOLDED_STORE },
- { X86::VMOVPDI2DIrr,X86::VMOVPDI2DImr, TB_FOLDED_STORE },
- { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr,TB_FOLDED_STORE },
- { X86::VMOVSDto64rr,X86::VMOVSDto64mr, TB_FOLDED_STORE },
- { X86::VMOVSS2DIrr, X86::VMOVSS2DImr, TB_FOLDED_STORE },
- { X86::VMOVUPDrr, X86::VMOVUPDmr, TB_FOLDED_STORE },
- { X86::VMOVUPSrr, X86::VMOVUPSmr, TB_FOLDED_STORE },
- { X86::VPEXTRDrr, X86::VPEXTRDmr, TB_FOLDED_STORE },
- { X86::VPEXTRQrr, X86::VPEXTRQmr, TB_FOLDED_STORE },
-
- // AVX 256-bit foldable instructions
- { X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
- { X86::VMOVAPSYrr, X86::VMOVAPSYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
- { X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
- { X86::VMOVDQUYrr, X86::VMOVDQUYmr, TB_FOLDED_STORE },
- { X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE },
- { X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE },
-
- // AVX-512 foldable instructions
- { X86::VEXTRACTF32x4Zrr,X86::VEXTRACTF32x4Zmr, TB_FOLDED_STORE },
- { X86::VEXTRACTF32x8Zrr,X86::VEXTRACTF32x8Zmr, TB_FOLDED_STORE },
- { X86::VEXTRACTF64x2Zrr,X86::VEXTRACTF64x2Zmr, TB_FOLDED_STORE },
- { X86::VEXTRACTF64x4Zrr,X86::VEXTRACTF64x4Zmr, TB_FOLDED_STORE },
- { X86::VEXTRACTI32x4Zrr,X86::VEXTRACTI32x4Zmr, TB_FOLDED_STORE },
- { X86::VEXTRACTI32x8Zrr,X86::VEXTRACTI32x8Zmr, TB_FOLDED_STORE },
- { X86::VEXTRACTI64x2Zrr,X86::VEXTRACTI64x2Zmr, TB_FOLDED_STORE },
- { X86::VEXTRACTI64x4Zrr,X86::VEXTRACTI64x4Zmr, TB_FOLDED_STORE },
- { X86::VEXTRACTPSZrr, X86::VEXTRACTPSZmr, TB_FOLDED_STORE },
- { X86::VMOVAPDZrr, X86::VMOVAPDZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
- { X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
- { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zmr, TB_FOLDED_STORE | TB_ALIGN_64 },
- { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zmr, TB_FOLDED_STORE | TB_ALIGN_64 },
- { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zmr, TB_FOLDED_STORE },
- { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zmr, TB_FOLDED_STORE },
- { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zmr, TB_FOLDED_STORE },
- { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zmr, TB_FOLDED_STORE },
- { X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE },
- { X86::VMOVPQIto64Zrr, X86::VMOVPQI2QIZmr, TB_FOLDED_STORE },
- { X86::VMOVSDto64Zrr, X86::VMOVSDto64Zmr, TB_FOLDED_STORE },
- { X86::VMOVSS2DIZrr, X86::VMOVSS2DIZmr, TB_FOLDED_STORE },
- { X86::VMOVUPDZrr, X86::VMOVUPDZmr, TB_FOLDED_STORE },
- { X86::VMOVUPSZrr, X86::VMOVUPSZmr, TB_FOLDED_STORE },
- { X86::VPEXTRDZrr, X86::VPEXTRDZmr, TB_FOLDED_STORE },
- { X86::VPEXTRQZrr, X86::VPEXTRQZmr, TB_FOLDED_STORE },
- { X86::VPMOVDBZrr, X86::VPMOVDBZmr, TB_FOLDED_STORE },
- { X86::VPMOVDWZrr, X86::VPMOVDWZmr, TB_FOLDED_STORE },
- { X86::VPMOVQDZrr, X86::VPMOVQDZmr, TB_FOLDED_STORE },
- { X86::VPMOVQWZrr, X86::VPMOVQWZmr, TB_FOLDED_STORE },
- { X86::VPMOVWBZrr, X86::VPMOVWBZmr, TB_FOLDED_STORE },
- { X86::VPMOVSDBZrr, X86::VPMOVSDBZmr, TB_FOLDED_STORE },
- { X86::VPMOVSDWZrr, X86::VPMOVSDWZmr, TB_FOLDED_STORE },
- { X86::VPMOVSQDZrr, X86::VPMOVSQDZmr, TB_FOLDED_STORE },
- { X86::VPMOVSQWZrr, X86::VPMOVSQWZmr, TB_FOLDED_STORE },
- { X86::VPMOVSWBZrr, X86::VPMOVSWBZmr, TB_FOLDED_STORE },
- { X86::VPMOVUSDBZrr, X86::VPMOVUSDBZmr, TB_FOLDED_STORE },
- { X86::VPMOVUSDWZrr, X86::VPMOVUSDWZmr, TB_FOLDED_STORE },
- { X86::VPMOVUSQDZrr, X86::VPMOVUSQDZmr, TB_FOLDED_STORE },
- { X86::VPMOVUSQWZrr, X86::VPMOVUSQWZmr, TB_FOLDED_STORE },
- { X86::VPMOVUSWBZrr, X86::VPMOVUSWBZmr, TB_FOLDED_STORE },
-
- // AVX-512 foldable instructions (256-bit versions)
- { X86::VEXTRACTF32x4Z256rr,X86::VEXTRACTF32x4Z256mr, TB_FOLDED_STORE },
- { X86::VEXTRACTF64x2Z256rr,X86::VEXTRACTF64x2Z256mr, TB_FOLDED_STORE },
- { X86::VEXTRACTI32x4Z256rr,X86::VEXTRACTI32x4Z256mr, TB_FOLDED_STORE },
- { X86::VEXTRACTI64x2Z256rr,X86::VEXTRACTI64x2Z256mr, TB_FOLDED_STORE },
- { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
- { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
- { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
- { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
- { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256mr, TB_FOLDED_STORE },
- { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256mr, TB_FOLDED_STORE },
- { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256mr, TB_FOLDED_STORE },
- { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256mr, TB_FOLDED_STORE },
- { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256mr, TB_FOLDED_STORE },
- { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256mr, TB_FOLDED_STORE },
- { X86::VPMOVDWZ256rr, X86::VPMOVDWZ256mr, TB_FOLDED_STORE },
- { X86::VPMOVQDZ256rr, X86::VPMOVQDZ256mr, TB_FOLDED_STORE },
- { X86::VPMOVWBZ256rr, X86::VPMOVWBZ256mr, TB_FOLDED_STORE },
- { X86::VPMOVSDWZ256rr, X86::VPMOVSDWZ256mr, TB_FOLDED_STORE },
- { X86::VPMOVSQDZ256rr, X86::VPMOVSQDZ256mr, TB_FOLDED_STORE },
- { X86::VPMOVSWBZ256rr, X86::VPMOVSWBZ256mr, TB_FOLDED_STORE },
- { X86::VPMOVUSDWZ256rr, X86::VPMOVUSDWZ256mr, TB_FOLDED_STORE },
- { X86::VPMOVUSQDZ256rr, X86::VPMOVUSQDZ256mr, TB_FOLDED_STORE },
- { X86::VPMOVUSWBZ256rr, X86::VPMOVUSWBZ256mr, TB_FOLDED_STORE },
-
- // AVX-512 foldable instructions (128-bit versions)
- { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128mr, TB_FOLDED_STORE },
- { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128mr, TB_FOLDED_STORE },
- { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128mr, TB_FOLDED_STORE },
- { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128mr, TB_FOLDED_STORE },
- { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128mr, TB_FOLDED_STORE },
- { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128mr, TB_FOLDED_STORE },
-
- // F16C foldable instructions
- { X86::VCVTPS2PHrr, X86::VCVTPS2PHmr, TB_FOLDED_STORE },
- { X86::VCVTPS2PHYrr, X86::VCVTPS2PHYmr, TB_FOLDED_STORE }
- };
-
- for (X86MemoryFoldTableEntry Entry : MemoryFoldTable0) {
- AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable,
- Entry.RegOp, Entry.MemOp, TB_INDEX_0 | Entry.Flags);
- }
-
- static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
- { X86::BSF16rr, X86::BSF16rm, 0 },
- { X86::BSF32rr, X86::BSF32rm, 0 },
- { X86::BSF64rr, X86::BSF64rm, 0 },
- { X86::BSR16rr, X86::BSR16rm, 0 },
- { X86::BSR32rr, X86::BSR32rm, 0 },
- { X86::BSR64rr, X86::BSR64rm, 0 },
- { X86::CMP16rr, X86::CMP16rm, 0 },
- { X86::CMP32rr, X86::CMP32rm, 0 },
- { X86::CMP64rr, X86::CMP64rm, 0 },
- { X86::CMP8rr, X86::CMP8rm, 0 },
- { X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_NO_REVERSE },
- { X86::CVTDQ2PSrr, X86::CVTDQ2PSrm, TB_ALIGN_16 },
- { X86::CVTPD2DQrr, X86::CVTPD2DQrm, TB_ALIGN_16 },
- { X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 },
- { X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 },
- { X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_NO_REVERSE },
- { X86::CVTSD2SI64rr_Int, X86::CVTSD2SI64rm_Int, TB_NO_REVERSE },
- { X86::CVTSD2SIrr_Int, X86::CVTSD2SIrm_Int, TB_NO_REVERSE },
- { X86::CVTSD2SSrr, X86::CVTSD2SSrm, 0 },
- { X86::CVTSI642SDrr, X86::CVTSI642SDrm, 0 },
- { X86::CVTSI2SDrr, X86::CVTSI2SDrm, 0 },
- { X86::CVTSI642SSrr, X86::CVTSI642SSrm, 0 },
- { X86::CVTSI2SSrr, X86::CVTSI2SSrm, 0 },
- { X86::CVTSS2SDrr, X86::CVTSS2SDrm, 0 },
- { X86::CVTSS2SI64rr_Int, X86::CVTSS2SI64rm_Int, TB_NO_REVERSE },
- { X86::CVTSS2SIrr_Int, X86::CVTSS2SIrm_Int, TB_NO_REVERSE },
- { X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 },
- { X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 },
- { X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm, 0 },
- { X86::CVTTSD2SI64rr_Int,X86::CVTTSD2SI64rm_Int, TB_NO_REVERSE },
- { X86::CVTTSD2SIrr, X86::CVTTSD2SIrm, 0 },
- { X86::CVTTSD2SIrr_Int, X86::CVTTSD2SIrm_Int, TB_NO_REVERSE },
- { X86::CVTTSS2SI64rr_Int,X86::CVTTSS2SI64rm_Int, TB_NO_REVERSE },
- { X86::CVTTSS2SIrr_Int, X86::CVTTSS2SIrm_Int, TB_NO_REVERSE },
- { X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm, 0 },
- { X86::CVTTSS2SIrr, X86::CVTTSS2SIrm, 0 },
- { X86::IMUL16rri, X86::IMUL16rmi, 0 },
- { X86::IMUL16rri8, X86::IMUL16rmi8, 0 },
- { X86::IMUL32rri, X86::IMUL32rmi, 0 },
- { X86::IMUL32rri8, X86::IMUL32rmi8, 0 },
- { X86::IMUL64rri32, X86::IMUL64rmi32, 0 },
- { X86::IMUL64rri8, X86::IMUL64rmi8, 0 },
- { X86::Int_COMISDrr, X86::Int_COMISDrm, TB_NO_REVERSE },
- { X86::Int_COMISSrr, X86::Int_COMISSrm, TB_NO_REVERSE },
- { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm, TB_NO_REVERSE },
- { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm, TB_NO_REVERSE },
- { X86::MOV16rr, X86::MOV16rm, 0 },
- { X86::MOV32rr, X86::MOV32rm, 0 },
- { X86::MOV64rr, X86::MOV64rm, 0 },
- { X86::MOV64toPQIrr, X86::MOVQI2PQIrm, 0 },
- { X86::MOV64toSDrr, X86::MOV64toSDrm, 0 },
- { X86::MOV8rr, X86::MOV8rm, 0 },
- { X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 },
- { X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 },
- { X86::MOVDDUPrr, X86::MOVDDUPrm, TB_NO_REVERSE },
- { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 },
- { X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 },
- { X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 },
- { X86::MOVDQUrr, X86::MOVDQUrm, 0 },
- { X86::MOVSHDUPrr, X86::MOVSHDUPrm, TB_ALIGN_16 },
- { X86::MOVSLDUPrr, X86::MOVSLDUPrm, TB_ALIGN_16 },
- { X86::MOVSX16rr8, X86::MOVSX16rm8, 0 },
- { X86::MOVSX32rr16, X86::MOVSX32rm16, 0 },
- { X86::MOVSX32rr8, X86::MOVSX32rm8, 0 },
- { X86::MOVSX64rr16, X86::MOVSX64rm16, 0 },
- { X86::MOVSX64rr32, X86::MOVSX64rm32, 0 },
- { X86::MOVSX64rr8, X86::MOVSX64rm8, 0 },
- { X86::MOVUPDrr, X86::MOVUPDrm, 0 },
- { X86::MOVUPSrr, X86::MOVUPSrm, 0 },
- { X86::MOVZPQILo2PQIrr, X86::MOVQI2PQIrm, TB_NO_REVERSE },
- { X86::MOVZX16rr8, X86::MOVZX16rm8, 0 },
- { X86::MOVZX32rr16, X86::MOVZX32rm16, 0 },
- { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 },
- { X86::MOVZX32rr8, X86::MOVZX32rm8, 0 },
- { X86::PABSBrr, X86::PABSBrm, TB_ALIGN_16 },
- { X86::PABSDrr, X86::PABSDrm, TB_ALIGN_16 },
- { X86::PABSWrr, X86::PABSWrm, TB_ALIGN_16 },
- { X86::PCMPESTRIrr, X86::PCMPESTRIrm, TB_ALIGN_16 },
- { X86::PCMPESTRM128rr, X86::PCMPESTRM128rm, TB_ALIGN_16 },
- { X86::PCMPISTRIrr, X86::PCMPISTRIrm, TB_ALIGN_16 },
- { X86::PCMPISTRM128rr, X86::PCMPISTRM128rm, TB_ALIGN_16 },
- { X86::PHMINPOSUWrr128, X86::PHMINPOSUWrm128, TB_ALIGN_16 },
- { X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_NO_REVERSE },
- { X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_NO_REVERSE },
- { X86::PMOVSXBWrr, X86::PMOVSXBWrm, TB_NO_REVERSE },
- { X86::PMOVSXDQrr, X86::PMOVSXDQrm, TB_NO_REVERSE },
- { X86::PMOVSXWDrr, X86::PMOVSXWDrm, TB_NO_REVERSE },
- { X86::PMOVSXWQrr, X86::PMOVSXWQrm, TB_NO_REVERSE },
- { X86::PMOVZXBDrr, X86::PMOVZXBDrm, TB_NO_REVERSE },
- { X86::PMOVZXBQrr, X86::PMOVZXBQrm, TB_NO_REVERSE },
- { X86::PMOVZXBWrr, X86::PMOVZXBWrm, TB_NO_REVERSE },
- { X86::PMOVZXDQrr, X86::PMOVZXDQrm, TB_NO_REVERSE },
- { X86::PMOVZXWDrr, X86::PMOVZXWDrm, TB_NO_REVERSE },
- { X86::PMOVZXWQrr, X86::PMOVZXWQrm, TB_NO_REVERSE },
- { X86::PSHUFDri, X86::PSHUFDmi, TB_ALIGN_16 },
- { X86::PSHUFHWri, X86::PSHUFHWmi, TB_ALIGN_16 },
- { X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 },
- { X86::PTESTrr, X86::PTESTrm, TB_ALIGN_16 },
- { X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 },
- { X86::RCPSSr, X86::RCPSSm, 0 },
- { X86::RCPSSr_Int, X86::RCPSSm_Int, TB_NO_REVERSE },
- { X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16 },
- { X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16 },
- { X86::ROUNDSDr, X86::ROUNDSDm, 0 },
- { X86::ROUNDSSr, X86::ROUNDSSm, 0 },
- { X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 },
- { X86::RSQRTSSr, X86::RSQRTSSm, 0 },
- { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, TB_NO_REVERSE },
- { X86::SQRTPDr, X86::SQRTPDm, TB_ALIGN_16 },
- { X86::SQRTPSr, X86::SQRTPSm, TB_ALIGN_16 },
- { X86::SQRTSDr, X86::SQRTSDm, 0 },
- { X86::SQRTSDr_Int, X86::SQRTSDm_Int, TB_NO_REVERSE },
- { X86::SQRTSSr, X86::SQRTSSm, 0 },
- { X86::SQRTSSr_Int, X86::SQRTSSm_Int, TB_NO_REVERSE },
- // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
- { X86::UCOMISDrr, X86::UCOMISDrm, 0 },
- { X86::UCOMISSrr, X86::UCOMISSrm, 0 },
-
- // MMX version of foldable instructions
- { X86::MMX_CVTPD2PIirr, X86::MMX_CVTPD2PIirm, 0 },
- { X86::MMX_CVTPI2PDirr, X86::MMX_CVTPI2PDirm, 0 },
- { X86::MMX_CVTPS2PIirr, X86::MMX_CVTPS2PIirm, 0 },
- { X86::MMX_CVTTPD2PIirr, X86::MMX_CVTTPD2PIirm, 0 },
- { X86::MMX_CVTTPS2PIirr, X86::MMX_CVTTPS2PIirm, 0 },
- { X86::MMX_MOVD64to64rr, X86::MMX_MOVQ64rm, 0 },
- { X86::MMX_PABSBrr64, X86::MMX_PABSBrm64, 0 },
- { X86::MMX_PABSDrr64, X86::MMX_PABSDrm64, 0 },
- { X86::MMX_PABSWrr64, X86::MMX_PABSWrm64, 0 },
- { X86::MMX_PSHUFWri, X86::MMX_PSHUFWmi, 0 },
-
- // 3DNow! version of foldable instructions
- { X86::PF2IDrr, X86::PF2IDrm, 0 },
- { X86::PF2IWrr, X86::PF2IWrm, 0 },
- { X86::PFRCPrr, X86::PFRCPrm, 0 },
- { X86::PFRSQRTrr, X86::PFRSQRTrm, 0 },
- { X86::PI2FDrr, X86::PI2FDrm, 0 },
- { X86::PI2FWrr, X86::PI2FWrm, 0 },
- { X86::PSWAPDrr, X86::PSWAPDrm, 0 },
-
- // AVX 128-bit versions of foldable instructions
- { X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, TB_NO_REVERSE },
- { X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, TB_NO_REVERSE },
- { X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, TB_NO_REVERSE },
- { X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, TB_NO_REVERSE },
- { X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 },
- { X86::VCVTTSD2SI64rr_Int,X86::VCVTTSD2SI64rm_Int,TB_NO_REVERSE },
- { X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 },
- { X86::VCVTTSD2SIrr_Int,X86::VCVTTSD2SIrm_Int, TB_NO_REVERSE },
- { X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 },
- { X86::VCVTTSS2SI64rr_Int,X86::VCVTTSS2SI64rm_Int,TB_NO_REVERSE },
- { X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 },
- { X86::VCVTTSS2SIrr_Int,X86::VCVTTSS2SIrm_Int, TB_NO_REVERSE },
- { X86::VCVTSD2SI64rr_Int, X86::VCVTSD2SI64rm_Int, TB_NO_REVERSE },
- { X86::VCVTSD2SIrr_Int, X86::VCVTSD2SIrm_Int, TB_NO_REVERSE },
- { X86::VCVTSS2SI64rr_Int, X86::VCVTSS2SI64rm_Int, TB_NO_REVERSE },
- { X86::VCVTSS2SIrr_Int, X86::VCVTSS2SIrm_Int, TB_NO_REVERSE },
- { X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, TB_NO_REVERSE },
- { X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 },
- { X86::VCVTPD2DQrr, X86::VCVTPD2DQrm, 0 },
- { X86::VCVTPD2PSrr, X86::VCVTPD2PSrm, 0 },
- { X86::VCVTPS2DQrr, X86::VCVTPS2DQrm, 0 },
- { X86::VCVTPS2PDrr, X86::VCVTPS2PDrm, TB_NO_REVERSE },
- { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQrm, 0 },
- { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 },
- { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 },
- { X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 },
- { X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 },
- { X86::VMOVAPSrr, X86::VMOVAPSrm, TB_ALIGN_16 },
- { X86::VMOVDDUPrr, X86::VMOVDDUPrm, TB_NO_REVERSE },
- { X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 },
- { X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 },
- { X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 },
- { X86::VMOVDQUrr, X86::VMOVDQUrm, 0 },
- { X86::VMOVSLDUPrr, X86::VMOVSLDUPrm, 0 },
- { X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, 0 },
- { X86::VMOVUPDrr, X86::VMOVUPDrm, 0 },
- { X86::VMOVUPSrr, X86::VMOVUPSrm, 0 },
- { X86::VMOVZPQILo2PQIrr,X86::VMOVQI2PQIrm, TB_NO_REVERSE },
- { X86::VPABSBrr, X86::VPABSBrm, 0 },
- { X86::VPABSDrr, X86::VPABSDrm, 0 },
- { X86::VPABSWrr, X86::VPABSWrm, 0 },
- { X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0 },
- { X86::VPCMPESTRM128rr, X86::VPCMPESTRM128rm, 0 },
- { X86::VPCMPISTRIrr, X86::VPCMPISTRIrm, 0 },
- { X86::VPCMPISTRM128rr, X86::VPCMPISTRM128rm, 0 },
- { X86::VPHMINPOSUWrr128, X86::VPHMINPOSUWrm128, 0 },
- { X86::VPERMILPDri, X86::VPERMILPDmi, 0 },
- { X86::VPERMILPSri, X86::VPERMILPSmi, 0 },
- { X86::VPMOVSXBDrr, X86::VPMOVSXBDrm, TB_NO_REVERSE },
- { X86::VPMOVSXBQrr, X86::VPMOVSXBQrm, TB_NO_REVERSE },
- { X86::VPMOVSXBWrr, X86::VPMOVSXBWrm, TB_NO_REVERSE },
- { X86::VPMOVSXDQrr, X86::VPMOVSXDQrm, TB_NO_REVERSE },
- { X86::VPMOVSXWDrr, X86::VPMOVSXWDrm, TB_NO_REVERSE },
- { X86::VPMOVSXWQrr, X86::VPMOVSXWQrm, TB_NO_REVERSE },
- { X86::VPMOVZXBDrr, X86::VPMOVZXBDrm, TB_NO_REVERSE },
- { X86::VPMOVZXBQrr, X86::VPMOVZXBQrm, TB_NO_REVERSE },
- { X86::VPMOVZXBWrr, X86::VPMOVZXBWrm, TB_NO_REVERSE },
- { X86::VPMOVZXDQrr, X86::VPMOVZXDQrm, TB_NO_REVERSE },
- { X86::VPMOVZXWDrr, X86::VPMOVZXWDrm, TB_NO_REVERSE },
- { X86::VPMOVZXWQrr, X86::VPMOVZXWQrm, TB_NO_REVERSE },
- { X86::VPSHUFDri, X86::VPSHUFDmi, 0 },
- { X86::VPSHUFHWri, X86::VPSHUFHWmi, 0 },
- { X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 },
- { X86::VPTESTrr, X86::VPTESTrm, 0 },
- { X86::VRCPPSr, X86::VRCPPSm, 0 },
- { X86::VROUNDPDr, X86::VROUNDPDm, 0 },
- { X86::VROUNDPSr, X86::VROUNDPSm, 0 },
- { X86::VRSQRTPSr, X86::VRSQRTPSm, 0 },
- { X86::VSQRTPDr, X86::VSQRTPDm, 0 },
- { X86::VSQRTPSr, X86::VSQRTPSm, 0 },
- { X86::VTESTPDrr, X86::VTESTPDrm, 0 },
- { X86::VTESTPSrr, X86::VTESTPSrm, 0 },
- { X86::VUCOMISDrr, X86::VUCOMISDrm, 0 },
- { X86::VUCOMISSrr, X86::VUCOMISSrm, 0 },
-
- // AVX 256-bit foldable instructions
- { X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, 0 },
- { X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 },
- { X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 },
- { X86::VCVTPD2PSYrr, X86::VCVTPD2PSYrm, 0 },
- { X86::VCVTPS2DQYrr, X86::VCVTPS2DQYrm, 0 },
- { X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, 0 },
- { X86::VCVTTPD2DQYrr, X86::VCVTTPD2DQYrm, 0 },
- { X86::VCVTTPS2DQYrr, X86::VCVTTPS2DQYrm, 0 },
- { X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 },
- { X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 },
- { X86::VMOVDDUPYrr, X86::VMOVDDUPYrm, 0 },
- { X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_32 },
- { X86::VMOVDQUYrr, X86::VMOVDQUYrm, 0 },
- { X86::VMOVSLDUPYrr, X86::VMOVSLDUPYrm, 0 },
- { X86::VMOVSHDUPYrr, X86::VMOVSHDUPYrm, 0 },
- { X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 },
- { X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 },
- { X86::VPERMILPDYri, X86::VPERMILPDYmi, 0 },
- { X86::VPERMILPSYri, X86::VPERMILPSYmi, 0 },
- { X86::VPTESTYrr, X86::VPTESTYrm, 0 },
- { X86::VRCPPSYr, X86::VRCPPSYm, 0 },
- { X86::VROUNDYPDr, X86::VROUNDYPDm, 0 },
- { X86::VROUNDYPSr, X86::VROUNDYPSm, 0 },
- { X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 },
- { X86::VSQRTPDYr, X86::VSQRTPDYm, 0 },
- { X86::VSQRTPSYr, X86::VSQRTPSYm, 0 },
- { X86::VTESTPDYrr, X86::VTESTPDYrm, 0 },
- { X86::VTESTPSYrr, X86::VTESTPSYrm, 0 },
-
- // AVX2 foldable instructions
-
- // VBROADCASTS{SD}rr register instructions were an AVX2 addition while the
- // VBROADCASTS{SD}rm memory instructions were available from AVX1.
- // TB_NO_REVERSE prevents unfolding from introducing an illegal instruction
- // on AVX1 targets. The VPBROADCAST instructions are all AVX2 instructions
- // so they don't need an equivalent limitation.
- { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE },
- { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE },
- { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE },
- { X86::VPABSBYrr, X86::VPABSBYrm, 0 },
- { X86::VPABSDYrr, X86::VPABSDYrm, 0 },
- { X86::VPABSWYrr, X86::VPABSWYrm, 0 },
- { X86::VPBROADCASTBrr, X86::VPBROADCASTBrm, TB_NO_REVERSE },
- { X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, TB_NO_REVERSE },
- { X86::VPBROADCASTDrr, X86::VPBROADCASTDrm, TB_NO_REVERSE },
- { X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm, TB_NO_REVERSE },
- { X86::VPBROADCASTQrr, X86::VPBROADCASTQrm, TB_NO_REVERSE },
- { X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm, TB_NO_REVERSE },
- { X86::VPBROADCASTWrr, X86::VPBROADCASTWrm, TB_NO_REVERSE },
- { X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm, TB_NO_REVERSE },
- { X86::VPERMPDYri, X86::VPERMPDYmi, 0 },
- { X86::VPERMQYri, X86::VPERMQYmi, 0 },
- { X86::VPMOVSXBDYrr, X86::VPMOVSXBDYrm, TB_NO_REVERSE },
- { X86::VPMOVSXBQYrr, X86::VPMOVSXBQYrm, TB_NO_REVERSE },
- { X86::VPMOVSXBWYrr, X86::VPMOVSXBWYrm, 0 },
- { X86::VPMOVSXDQYrr, X86::VPMOVSXDQYrm, 0 },
- { X86::VPMOVSXWDYrr, X86::VPMOVSXWDYrm, 0 },
- { X86::VPMOVSXWQYrr, X86::VPMOVSXWQYrm, TB_NO_REVERSE },
- { X86::VPMOVZXBDYrr, X86::VPMOVZXBDYrm, TB_NO_REVERSE },
- { X86::VPMOVZXBQYrr, X86::VPMOVZXBQYrm, TB_NO_REVERSE },
- { X86::VPMOVZXBWYrr, X86::VPMOVZXBWYrm, 0 },
- { X86::VPMOVZXDQYrr, X86::VPMOVZXDQYrm, 0 },
- { X86::VPMOVZXWDYrr, X86::VPMOVZXWDYrm, 0 },
- { X86::VPMOVZXWQYrr, X86::VPMOVZXWQYrm, TB_NO_REVERSE },
- { X86::VPSHUFDYri, X86::VPSHUFDYmi, 0 },
- { X86::VPSHUFHWYri, X86::VPSHUFHWYmi, 0 },
- { X86::VPSHUFLWYri, X86::VPSHUFLWYmi, 0 },
-
- // XOP foldable instructions
- { X86::VFRCZPDrr, X86::VFRCZPDrm, 0 },
- { X86::VFRCZPDrrY, X86::VFRCZPDrmY, 0 },
- { X86::VFRCZPSrr, X86::VFRCZPSrm, 0 },
- { X86::VFRCZPSrrY, X86::VFRCZPSrmY, 0 },
- { X86::VFRCZSDrr, X86::VFRCZSDrm, 0 },
- { X86::VFRCZSSrr, X86::VFRCZSSrm, 0 },
- { X86::VPHADDBDrr, X86::VPHADDBDrm, 0 },
- { X86::VPHADDBQrr, X86::VPHADDBQrm, 0 },
- { X86::VPHADDBWrr, X86::VPHADDBWrm, 0 },
- { X86::VPHADDDQrr, X86::VPHADDDQrm, 0 },
- { X86::VPHADDWDrr, X86::VPHADDWDrm, 0 },
- { X86::VPHADDWQrr, X86::VPHADDWQrm, 0 },
- { X86::VPHADDUBDrr, X86::VPHADDUBDrm, 0 },
- { X86::VPHADDUBQrr, X86::VPHADDUBQrm, 0 },
- { X86::VPHADDUBWrr, X86::VPHADDUBWrm, 0 },
- { X86::VPHADDUDQrr, X86::VPHADDUDQrm, 0 },
- { X86::VPHADDUWDrr, X86::VPHADDUWDrm, 0 },
- { X86::VPHADDUWQrr, X86::VPHADDUWQrm, 0 },
- { X86::VPHSUBBWrr, X86::VPHSUBBWrm, 0 },
- { X86::VPHSUBDQrr, X86::VPHSUBDQrm, 0 },
- { X86::VPHSUBWDrr, X86::VPHSUBWDrm, 0 },
- { X86::VPROTBri, X86::VPROTBmi, 0 },
- { X86::VPROTBrr, X86::VPROTBmr, 0 },
- { X86::VPROTDri, X86::VPROTDmi, 0 },
- { X86::VPROTDrr, X86::VPROTDmr, 0 },
- { X86::VPROTQri, X86::VPROTQmi, 0 },
- { X86::VPROTQrr, X86::VPROTQmr, 0 },
- { X86::VPROTWri, X86::VPROTWmi, 0 },
- { X86::VPROTWrr, X86::VPROTWmr, 0 },
- { X86::VPSHABrr, X86::VPSHABmr, 0 },
- { X86::VPSHADrr, X86::VPSHADmr, 0 },
- { X86::VPSHAQrr, X86::VPSHAQmr, 0 },
- { X86::VPSHAWrr, X86::VPSHAWmr, 0 },
- { X86::VPSHLBrr, X86::VPSHLBmr, 0 },
- { X86::VPSHLDrr, X86::VPSHLDmr, 0 },
- { X86::VPSHLQrr, X86::VPSHLQmr, 0 },
- { X86::VPSHLWrr, X86::VPSHLWmr, 0 },
-
- // LWP foldable instructions
- { X86::LWPINS32rri, X86::LWPINS32rmi, 0 },
- { X86::LWPINS64rri, X86::LWPINS64rmi, 0 },
- { X86::LWPVAL32rri, X86::LWPVAL32rmi, 0 },
- { X86::LWPVAL64rri, X86::LWPVAL64rmi, 0 },
-
- // BMI/BMI2/LZCNT/POPCNT/TBM foldable instructions
- { X86::BEXTR32rr, X86::BEXTR32rm, 0 },
- { X86::BEXTR64rr, X86::BEXTR64rm, 0 },
- { X86::BEXTRI32ri, X86::BEXTRI32mi, 0 },
- { X86::BEXTRI64ri, X86::BEXTRI64mi, 0 },
- { X86::BLCFILL32rr, X86::BLCFILL32rm, 0 },
- { X86::BLCFILL64rr, X86::BLCFILL64rm, 0 },
- { X86::BLCI32rr, X86::BLCI32rm, 0 },
- { X86::BLCI64rr, X86::BLCI64rm, 0 },
- { X86::BLCIC32rr, X86::BLCIC32rm, 0 },
- { X86::BLCIC64rr, X86::BLCIC64rm, 0 },
- { X86::BLCMSK32rr, X86::BLCMSK32rm, 0 },
- { X86::BLCMSK64rr, X86::BLCMSK64rm, 0 },
- { X86::BLCS32rr, X86::BLCS32rm, 0 },
- { X86::BLCS64rr, X86::BLCS64rm, 0 },
- { X86::BLSFILL32rr, X86::BLSFILL32rm, 0 },
- { X86::BLSFILL64rr, X86::BLSFILL64rm, 0 },
- { X86::BLSI32rr, X86::BLSI32rm, 0 },
- { X86::BLSI64rr, X86::BLSI64rm, 0 },
- { X86::BLSIC32rr, X86::BLSIC32rm, 0 },
- { X86::BLSIC64rr, X86::BLSIC64rm, 0 },
- { X86::BLSMSK32rr, X86::BLSMSK32rm, 0 },
- { X86::BLSMSK64rr, X86::BLSMSK64rm, 0 },
- { X86::BLSR32rr, X86::BLSR32rm, 0 },
- { X86::BLSR64rr, X86::BLSR64rm, 0 },
- { X86::BZHI32rr, X86::BZHI32rm, 0 },
- { X86::BZHI64rr, X86::BZHI64rm, 0 },
- { X86::LZCNT16rr, X86::LZCNT16rm, 0 },
- { X86::LZCNT32rr, X86::LZCNT32rm, 0 },
- { X86::LZCNT64rr, X86::LZCNT64rm, 0 },
- { X86::POPCNT16rr, X86::POPCNT16rm, 0 },
- { X86::POPCNT32rr, X86::POPCNT32rm, 0 },
- { X86::POPCNT64rr, X86::POPCNT64rm, 0 },
- { X86::RORX32ri, X86::RORX32mi, 0 },
- { X86::RORX64ri, X86::RORX64mi, 0 },
- { X86::SARX32rr, X86::SARX32rm, 0 },
- { X86::SARX64rr, X86::SARX64rm, 0 },
- { X86::SHRX32rr, X86::SHRX32rm, 0 },
- { X86::SHRX64rr, X86::SHRX64rm, 0 },
- { X86::SHLX32rr, X86::SHLX32rm, 0 },
- { X86::SHLX64rr, X86::SHLX64rm, 0 },
- { X86::T1MSKC32rr, X86::T1MSKC32rm, 0 },
- { X86::T1MSKC64rr, X86::T1MSKC64rm, 0 },
- { X86::TZCNT16rr, X86::TZCNT16rm, 0 },
- { X86::TZCNT32rr, X86::TZCNT32rm, 0 },
- { X86::TZCNT64rr, X86::TZCNT64rm, 0 },
- { X86::TZMSK32rr, X86::TZMSK32rm, 0 },
- { X86::TZMSK64rr, X86::TZMSK64rm, 0 },
-
- // AVX-512 foldable instructions
- { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE },
- { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE },
- { X86::VCVTDQ2PDZrr, X86::VCVTDQ2PDZrm, 0 },
- { X86::VCVTPD2PSZrr, X86::VCVTPD2PSZrm, 0 },
- { X86::VCVTUDQ2PDZrr, X86::VCVTUDQ2PDZrm, 0 },
- { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 },
- { X86::VMOV64toSDZrr, X86::VMOV64toSDZrm, 0 },
- { X86::VMOVDI2PDIZrr, X86::VMOVDI2PDIZrm, 0 },
- { X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 },
- { X86::VMOVAPDZrr, X86::VMOVAPDZrm, TB_ALIGN_64 },
- { X86::VMOVAPSZrr, X86::VMOVAPSZrm, TB_ALIGN_64 },
- { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zrm, TB_ALIGN_64 },
- { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zrm, TB_ALIGN_64 },
- { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zrm, 0 },
- { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zrm, 0 },
- { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zrm, 0 },
- { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zrm, 0 },
- { X86::VMOVUPDZrr, X86::VMOVUPDZrm, 0 },
- { X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 },
- { X86::VMOVZPQILo2PQIZrr,X86::VMOVQI2PQIZrm, TB_NO_REVERSE },
- { X86::VPABSBZrr, X86::VPABSBZrm, 0 },
- { X86::VPABSDZrr, X86::VPABSDZrm, 0 },
- { X86::VPABSQZrr, X86::VPABSQZrm, 0 },
- { X86::VPABSWZrr, X86::VPABSWZrm, 0 },
- { X86::VPCONFLICTDZrr, X86::VPCONFLICTDZrm, 0 },
- { X86::VPCONFLICTQZrr, X86::VPCONFLICTQZrm, 0 },
- { X86::VPERMILPDZri, X86::VPERMILPDZmi, 0 },
- { X86::VPERMILPSZri, X86::VPERMILPSZmi, 0 },
- { X86::VPERMPDZri, X86::VPERMPDZmi, 0 },
- { X86::VPERMQZri, X86::VPERMQZmi, 0 },
- { X86::VPLZCNTDZrr, X86::VPLZCNTDZrm, 0 },
- { X86::VPLZCNTQZrr, X86::VPLZCNTQZrm, 0 },
- { X86::VPMOVSXBDZrr, X86::VPMOVSXBDZrm, 0 },
- { X86::VPMOVSXBQZrr, X86::VPMOVSXBQZrm, TB_NO_REVERSE },
- { X86::VPMOVSXBWZrr, X86::VPMOVSXBWZrm, 0 },
- { X86::VPMOVSXDQZrr, X86::VPMOVSXDQZrm, 0 },
- { X86::VPMOVSXWDZrr, X86::VPMOVSXWDZrm, 0 },
- { X86::VPMOVSXWQZrr, X86::VPMOVSXWQZrm, 0 },
- { X86::VPMOVZXBDZrr, X86::VPMOVZXBDZrm, 0 },
- { X86::VPMOVZXBQZrr, X86::VPMOVZXBQZrm, TB_NO_REVERSE },
- { X86::VPMOVZXBWZrr, X86::VPMOVZXBWZrm, 0 },
- { X86::VPMOVZXDQZrr, X86::VPMOVZXDQZrm, 0 },
- { X86::VPMOVZXWDZrr, X86::VPMOVZXWDZrm, 0 },
- { X86::VPMOVZXWQZrr, X86::VPMOVZXWQZrm, 0 },
- { X86::VPOPCNTDZrr, X86::VPOPCNTDZrm, 0 },
- { X86::VPOPCNTQZrr, X86::VPOPCNTQZrm, 0 },
- { X86::VPSHUFDZri, X86::VPSHUFDZmi, 0 },
- { X86::VPSHUFHWZri, X86::VPSHUFHWZmi, 0 },
- { X86::VPSHUFLWZri, X86::VPSHUFLWZmi, 0 },
- { X86::VPSLLDQZrr, X86::VPSLLDQZrm, 0 },
- { X86::VPSLLDZri, X86::VPSLLDZmi, 0 },
- { X86::VPSLLQZri, X86::VPSLLQZmi, 0 },
- { X86::VPSLLWZri, X86::VPSLLWZmi, 0 },
- { X86::VPSRADZri, X86::VPSRADZmi, 0 },
- { X86::VPSRAQZri, X86::VPSRAQZmi, 0 },
- { X86::VPSRAWZri, X86::VPSRAWZmi, 0 },
- { X86::VPSRLDQZrr, X86::VPSRLDQZrm, 0 },
- { X86::VPSRLDZri, X86::VPSRLDZmi, 0 },
- { X86::VPSRLQZri, X86::VPSRLQZmi, 0 },
- { X86::VPSRLWZri, X86::VPSRLWZmi, 0 },
-
- // AVX-512 foldable instructions (256-bit versions)
- { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
- { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },
- { X86::VCVTDQ2PDZ256rr, X86::VCVTDQ2PDZ256rm, 0 },
- { X86::VCVTPD2PSZ256rr, X86::VCVTPD2PSZ256rm, 0 },
- { X86::VCVTUDQ2PDZ256rr, X86::VCVTUDQ2PDZ256rm, 0 },
- { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 },
- { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 },
- { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 },
- { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256rm, TB_ALIGN_32 },
- { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256rm, 0 },
- { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256rm, 0 },
- { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256rm, 0 },
- { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 },
- { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 },
- { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 },
- { X86::VPABSBZ256rr, X86::VPABSBZ256rm, 0 },
- { X86::VPABSDZ256rr, X86::VPABSDZ256rm, 0 },
- { X86::VPABSQZ256rr, X86::VPABSQZ256rm, 0 },
- { X86::VPABSWZ256rr, X86::VPABSWZ256rm, 0 },
- { X86::VPCONFLICTDZ256rr, X86::VPCONFLICTDZ256rm, 0 },
- { X86::VPCONFLICTQZ256rr, X86::VPCONFLICTQZ256rm, 0 },
- { X86::VPERMILPDZ256ri, X86::VPERMILPDZ256mi, 0 },
- { X86::VPERMILPSZ256ri, X86::VPERMILPSZ256mi, 0 },
- { X86::VPERMPDZ256ri, X86::VPERMPDZ256mi, 0 },
- { X86::VPERMQZ256ri, X86::VPERMQZ256mi, 0 },
- { X86::VPLZCNTDZ256rr, X86::VPLZCNTDZ256rm, 0 },
- { X86::VPLZCNTQZ256rr, X86::VPLZCNTQZ256rm, 0 },
- { X86::VPMOVSXBDZ256rr, X86::VPMOVSXBDZ256rm, TB_NO_REVERSE },
- { X86::VPMOVSXBQZ256rr, X86::VPMOVSXBQZ256rm, TB_NO_REVERSE },
- { X86::VPMOVSXBWZ256rr, X86::VPMOVSXBWZ256rm, 0 },
- { X86::VPMOVSXDQZ256rr, X86::VPMOVSXDQZ256rm, 0 },
- { X86::VPMOVSXWDZ256rr, X86::VPMOVSXWDZ256rm, 0 },
- { X86::VPMOVSXWQZ256rr, X86::VPMOVSXWQZ256rm, TB_NO_REVERSE },
- { X86::VPMOVZXBDZ256rr, X86::VPMOVZXBDZ256rm, TB_NO_REVERSE },
- { X86::VPMOVZXBQZ256rr, X86::VPMOVZXBQZ256rm, TB_NO_REVERSE },
- { X86::VPMOVZXBWZ256rr, X86::VPMOVZXBWZ256rm, 0 },
- { X86::VPMOVZXDQZ256rr, X86::VPMOVZXDQZ256rm, 0 },
- { X86::VPMOVZXWDZ256rr, X86::VPMOVZXWDZ256rm, 0 },
- { X86::VPMOVZXWQZ256rr, X86::VPMOVZXWQZ256rm, TB_NO_REVERSE },
- { X86::VPSHUFDZ256ri, X86::VPSHUFDZ256mi, 0 },
- { X86::VPSHUFHWZ256ri, X86::VPSHUFHWZ256mi, 0 },
- { X86::VPSHUFLWZ256ri, X86::VPSHUFLWZ256mi, 0 },
- { X86::VPSLLDQZ256rr, X86::VPSLLDQZ256rm, 0 },
- { X86::VPSLLDZ256ri, X86::VPSLLDZ256mi, 0 },
- { X86::VPSLLQZ256ri, X86::VPSLLQZ256mi, 0 },
- { X86::VPSLLWZ256ri, X86::VPSLLWZ256mi, 0 },
- { X86::VPSRADZ256ri, X86::VPSRADZ256mi, 0 },
- { X86::VPSRAQZ256ri, X86::VPSRAQZ256mi, 0 },
- { X86::VPSRAWZ256ri, X86::VPSRAWZ256mi, 0 },
- { X86::VPSRLDQZ256rr, X86::VPSRLDQZ256rm, 0 },
- { X86::VPSRLDZ256ri, X86::VPSRLDZ256mi, 0 },
- { X86::VPSRLQZ256ri, X86::VPSRLQZ256mi, 0 },
- { X86::VPSRLWZ256ri, X86::VPSRLWZ256mi, 0 },
-
- // AVX-512 foldable instructions (128-bit versions)
- { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
- { X86::VCVTDQ2PDZ128rr, X86::VCVTDQ2PDZ128rm, TB_NO_REVERSE },
- { X86::VCVTPD2PSZ128rr, X86::VCVTPD2PSZ128rm, 0 },
- { X86::VCVTUDQ2PDZ128rr, X86::VCVTUDQ2PDZ128rm, TB_NO_REVERSE },
- { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 },
- { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 },
- { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 },
- { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128rm, TB_ALIGN_16 },
- { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128rm, 0 },
- { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128rm, 0 },
- { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128rm, 0 },
- { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 },
- { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 },
- { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 },
- { X86::VPABSBZ128rr, X86::VPABSBZ128rm, 0 },
- { X86::VPABSDZ128rr, X86::VPABSDZ128rm, 0 },
- { X86::VPABSQZ128rr, X86::VPABSQZ128rm, 0 },
- { X86::VPABSWZ128rr, X86::VPABSWZ128rm, 0 },
- { X86::VPCONFLICTDZ128rr, X86::VPCONFLICTDZ128rm, 0 },
- { X86::VPCONFLICTQZ128rr, X86::VPCONFLICTQZ128rm, 0 },
- { X86::VPERMILPDZ128ri, X86::VPERMILPDZ128mi, 0 },
- { X86::VPERMILPSZ128ri, X86::VPERMILPSZ128mi, 0 },
- { X86::VPLZCNTDZ128rr, X86::VPLZCNTDZ128rm, 0 },
- { X86::VPLZCNTQZ128rr, X86::VPLZCNTQZ128rm, 0 },
- { X86::VPMOVSXBDZ128rr, X86::VPMOVSXBDZ128rm, TB_NO_REVERSE },
- { X86::VPMOVSXBQZ128rr, X86::VPMOVSXBQZ128rm, TB_NO_REVERSE },
- { X86::VPMOVSXBWZ128rr, X86::VPMOVSXBWZ128rm, TB_NO_REVERSE },
- { X86::VPMOVSXDQZ128rr, X86::VPMOVSXDQZ128rm, TB_NO_REVERSE },
- { X86::VPMOVSXWDZ128rr, X86::VPMOVSXWDZ128rm, TB_NO_REVERSE },
- { X86::VPMOVSXWQZ128rr, X86::VPMOVSXWQZ128rm, TB_NO_REVERSE },
- { X86::VPMOVZXBDZ128rr, X86::VPMOVZXBDZ128rm, TB_NO_REVERSE },
- { X86::VPMOVZXBQZ128rr, X86::VPMOVZXBQZ128rm, TB_NO_REVERSE },
- { X86::VPMOVZXBWZ128rr, X86::VPMOVZXBWZ128rm, TB_NO_REVERSE },
- { X86::VPMOVZXDQZ128rr, X86::VPMOVZXDQZ128rm, TB_NO_REVERSE },
- { X86::VPMOVZXWDZ128rr, X86::VPMOVZXWDZ128rm, TB_NO_REVERSE },
- { X86::VPMOVZXWQZ128rr, X86::VPMOVZXWQZ128rm, TB_NO_REVERSE },
- { X86::VPSHUFDZ128ri, X86::VPSHUFDZ128mi, 0 },
- { X86::VPSHUFHWZ128ri, X86::VPSHUFHWZ128mi, 0 },
- { X86::VPSHUFLWZ128ri, X86::VPSHUFLWZ128mi, 0 },
- { X86::VPSLLDQZ128rr, X86::VPSLLDQZ128rm, 0 },
- { X86::VPSLLDZ128ri, X86::VPSLLDZ128mi, 0 },
- { X86::VPSLLQZ128ri, X86::VPSLLQZ128mi, 0 },
- { X86::VPSLLWZ128ri, X86::VPSLLWZ128mi, 0 },
- { X86::VPSRADZ128ri, X86::VPSRADZ128mi, 0 },
- { X86::VPSRAQZ128ri, X86::VPSRAQZ128mi, 0 },
- { X86::VPSRAWZ128ri, X86::VPSRAWZ128mi, 0 },
- { X86::VPSRLDQZ128rr, X86::VPSRLDQZ128rm, 0 },
- { X86::VPSRLDZ128ri, X86::VPSRLDZ128mi, 0 },
- { X86::VPSRLQZ128ri, X86::VPSRLQZ128mi, 0 },
- { X86::VPSRLWZ128ri, X86::VPSRLWZ128mi, 0 },
-
- // F16C foldable instructions
- { X86::VCVTPH2PSrr, X86::VCVTPH2PSrm, 0 },
- { X86::VCVTPH2PSYrr, X86::VCVTPH2PSYrm, 0 },
-
- // AES foldable instructions
- { X86::AESIMCrr, X86::AESIMCrm, TB_ALIGN_16 },
- { X86::AESKEYGENASSIST128rr, X86::AESKEYGENASSIST128rm, TB_ALIGN_16 },
- { X86::VAESIMCrr, X86::VAESIMCrm, 0 },
- { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, 0 }
- };
-
- for (X86MemoryFoldTableEntry Entry : MemoryFoldTable1) {
- AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable,
- Entry.RegOp, Entry.MemOp,
- // Index 1, folded load
- Entry.Flags | TB_INDEX_1 | TB_FOLDED_LOAD);
- }
-
- static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
- { X86::ADC32rr, X86::ADC32rm, 0 },
- { X86::ADC64rr, X86::ADC64rm, 0 },
- { X86::ADD16rr, X86::ADD16rm, 0 },
- { X86::ADD16rr_DB, X86::ADD16rm, TB_NO_REVERSE },
- { X86::ADD32rr, X86::ADD32rm, 0 },
- { X86::ADD32rr_DB, X86::ADD32rm, TB_NO_REVERSE },
- { X86::ADD64rr, X86::ADD64rm, 0 },
- { X86::ADD64rr_DB, X86::ADD64rm, TB_NO_REVERSE },
- { X86::ADD8rr, X86::ADD8rm, 0 },
- { X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 },
- { X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 },
- { X86::ADDSDrr, X86::ADDSDrm, 0 },
- { X86::ADDSDrr_Int, X86::ADDSDrm_Int, TB_NO_REVERSE },
- { X86::ADDSSrr, X86::ADDSSrm, 0 },
- { X86::ADDSSrr_Int, X86::ADDSSrm_Int, TB_NO_REVERSE },
- { X86::ADDSUBPDrr, X86::ADDSUBPDrm, TB_ALIGN_16 },
- { X86::ADDSUBPSrr, X86::ADDSUBPSrm, TB_ALIGN_16 },
- { X86::AND16rr, X86::AND16rm, 0 },
- { X86::AND32rr, X86::AND32rm, 0 },
- { X86::AND64rr, X86::AND64rm, 0 },
- { X86::AND8rr, X86::AND8rm, 0 },
- { X86::ANDNPDrr, X86::ANDNPDrm, TB_ALIGN_16 },
- { X86::ANDNPSrr, X86::ANDNPSrm, TB_ALIGN_16 },
- { X86::ANDPDrr, X86::ANDPDrm, TB_ALIGN_16 },
- { X86::ANDPSrr, X86::ANDPSrm, TB_ALIGN_16 },
- { X86::BLENDPDrri, X86::BLENDPDrmi, TB_ALIGN_16 },
- { X86::BLENDPSrri, X86::BLENDPSrmi, TB_ALIGN_16 },
- { X86::BLENDVPDrr0, X86::BLENDVPDrm0, TB_ALIGN_16 },
- { X86::BLENDVPSrr0, X86::BLENDVPSrm0, TB_ALIGN_16 },
- { X86::CMOVA16rr, X86::CMOVA16rm, 0 },
- { X86::CMOVA32rr, X86::CMOVA32rm, 0 },
- { X86::CMOVA64rr, X86::CMOVA64rm, 0 },
- { X86::CMOVAE16rr, X86::CMOVAE16rm, 0 },
- { X86::CMOVAE32rr, X86::CMOVAE32rm, 0 },
- { X86::CMOVAE64rr, X86::CMOVAE64rm, 0 },
- { X86::CMOVB16rr, X86::CMOVB16rm, 0 },
- { X86::CMOVB32rr, X86::CMOVB32rm, 0 },
- { X86::CMOVB64rr, X86::CMOVB64rm, 0 },
- { X86::CMOVBE16rr, X86::CMOVBE16rm, 0 },
- { X86::CMOVBE32rr, X86::CMOVBE32rm, 0 },
- { X86::CMOVBE64rr, X86::CMOVBE64rm, 0 },
- { X86::CMOVE16rr, X86::CMOVE16rm, 0 },
- { X86::CMOVE32rr, X86::CMOVE32rm, 0 },
- { X86::CMOVE64rr, X86::CMOVE64rm, 0 },
- { X86::CMOVG16rr, X86::CMOVG16rm, 0 },
- { X86::CMOVG32rr, X86::CMOVG32rm, 0 },
- { X86::CMOVG64rr, X86::CMOVG64rm, 0 },
- { X86::CMOVGE16rr, X86::CMOVGE16rm, 0 },
- { X86::CMOVGE32rr, X86::CMOVGE32rm, 0 },
- { X86::CMOVGE64rr, X86::CMOVGE64rm, 0 },
- { X86::CMOVL16rr, X86::CMOVL16rm, 0 },
- { X86::CMOVL32rr, X86::CMOVL32rm, 0 },
- { X86::CMOVL64rr, X86::CMOVL64rm, 0 },
- { X86::CMOVLE16rr, X86::CMOVLE16rm, 0 },
- { X86::CMOVLE32rr, X86::CMOVLE32rm, 0 },
- { X86::CMOVLE64rr, X86::CMOVLE64rm, 0 },
- { X86::CMOVNE16rr, X86::CMOVNE16rm, 0 },
- { X86::CMOVNE32rr, X86::CMOVNE32rm, 0 },
- { X86::CMOVNE64rr, X86::CMOVNE64rm, 0 },
- { X86::CMOVNO16rr, X86::CMOVNO16rm, 0 },
- { X86::CMOVNO32rr, X86::CMOVNO32rm, 0 },
- { X86::CMOVNO64rr, X86::CMOVNO64rm, 0 },
- { X86::CMOVNP16rr, X86::CMOVNP16rm, 0 },
- { X86::CMOVNP32rr, X86::CMOVNP32rm, 0 },
- { X86::CMOVNP64rr, X86::CMOVNP64rm, 0 },
- { X86::CMOVNS16rr, X86::CMOVNS16rm, 0 },
- { X86::CMOVNS32rr, X86::CMOVNS32rm, 0 },
- { X86::CMOVNS64rr, X86::CMOVNS64rm, 0 },
- { X86::CMOVO16rr, X86::CMOVO16rm, 0 },
- { X86::CMOVO32rr, X86::CMOVO32rm, 0 },
- { X86::CMOVO64rr, X86::CMOVO64rm, 0 },
- { X86::CMOVP16rr, X86::CMOVP16rm, 0 },
- { X86::CMOVP32rr, X86::CMOVP32rm, 0 },
- { X86::CMOVP64rr, X86::CMOVP64rm, 0 },
- { X86::CMOVS16rr, X86::CMOVS16rm, 0 },
- { X86::CMOVS32rr, X86::CMOVS32rm, 0 },
- { X86::CMOVS64rr, X86::CMOVS64rm, 0 },
- { X86::CMPPDrri, X86::CMPPDrmi, TB_ALIGN_16 },
- { X86::CMPPSrri, X86::CMPPSrmi, TB_ALIGN_16 },
- { X86::CMPSDrr, X86::CMPSDrm, 0 },
- { X86::CMPSDrr_Int, X86::CMPSDrm_Int, TB_NO_REVERSE },
- { X86::CMPSSrr, X86::CMPSSrm, 0 },
- { X86::CMPSSrr_Int, X86::CMPSSrm_Int, TB_NO_REVERSE },
- { X86::CRC32r32r32, X86::CRC32r32m32, 0 },
- { X86::CRC32r64r64, X86::CRC32r64m64, 0 },
- { X86::CVTSD2SSrr_Int, X86::CVTSD2SSrm_Int, TB_NO_REVERSE },
- { X86::CVTSS2SDrr_Int, X86::CVTSS2SDrm_Int, TB_NO_REVERSE },
- { X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 },
- { X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 },
- { X86::DIVSDrr, X86::DIVSDrm, 0 },
- { X86::DIVSDrr_Int, X86::DIVSDrm_Int, TB_NO_REVERSE },
- { X86::DIVSSrr, X86::DIVSSrm, 0 },
- { X86::DIVSSrr_Int, X86::DIVSSrm_Int, TB_NO_REVERSE },
- { X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 },
- { X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 },
- { X86::HADDPDrr, X86::HADDPDrm, TB_ALIGN_16 },
- { X86::HADDPSrr, X86::HADDPSrm, TB_ALIGN_16 },
- { X86::HSUBPDrr, X86::HSUBPDrm, TB_ALIGN_16 },
- { X86::HSUBPSrr, X86::HSUBPSrm, TB_ALIGN_16 },
- { X86::IMUL16rr, X86::IMUL16rm, 0 },
- { X86::IMUL32rr, X86::IMUL32rm, 0 },
- { X86::IMUL64rr, X86::IMUL64rm, 0 },
- { X86::CVTSI642SDrr_Int,X86::CVTSI642SDrm_Int, 0 },
- { X86::CVTSI2SDrr_Int, X86::CVTSI2SDrm_Int, 0 },
- { X86::CVTSI642SSrr_Int,X86::CVTSI642SSrm_Int, 0 },
- { X86::CVTSI2SSrr_Int, X86::CVTSI2SSrm_Int, 0 },
- { X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 },
- { X86::MAXCPDrr, X86::MAXCPDrm, TB_ALIGN_16 },
- { X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 },
- { X86::MAXCPSrr, X86::MAXCPSrm, TB_ALIGN_16 },
- { X86::MAXSDrr, X86::MAXSDrm, 0 },
- { X86::MAXCSDrr, X86::MAXCSDrm, 0 },
- { X86::MAXSDrr_Int, X86::MAXSDrm_Int, TB_NO_REVERSE },
- { X86::MAXSSrr, X86::MAXSSrm, 0 },
- { X86::MAXCSSrr, X86::MAXCSSrm, 0 },
- { X86::MAXSSrr_Int, X86::MAXSSrm_Int, TB_NO_REVERSE },
- { X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 },
- { X86::MINCPDrr, X86::MINCPDrm, TB_ALIGN_16 },
- { X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 },
- { X86::MINCPSrr, X86::MINCPSrm, TB_ALIGN_16 },
- { X86::MINSDrr, X86::MINSDrm, 0 },
- { X86::MINCSDrr, X86::MINCSDrm, 0 },
- { X86::MINSDrr_Int, X86::MINSDrm_Int, TB_NO_REVERSE },
- { X86::MINSSrr, X86::MINSSrm, 0 },
- { X86::MINCSSrr, X86::MINCSSrm, 0 },
- { X86::MINSSrr_Int, X86::MINSSrm_Int, TB_NO_REVERSE },
- { X86::MOVLHPSrr, X86::MOVHPSrm, TB_NO_REVERSE },
- { X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 },
- { X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 },
- { X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 },
- { X86::MULSDrr, X86::MULSDrm, 0 },
- { X86::MULSDrr_Int, X86::MULSDrm_Int, TB_NO_REVERSE },
- { X86::MULSSrr, X86::MULSSrm, 0 },
- { X86::MULSSrr_Int, X86::MULSSrm_Int, TB_NO_REVERSE },
- { X86::OR16rr, X86::OR16rm, 0 },
- { X86::OR32rr, X86::OR32rm, 0 },
- { X86::OR64rr, X86::OR64rm, 0 },
- { X86::OR8rr, X86::OR8rm, 0 },
- { X86::ORPDrr, X86::ORPDrm, TB_ALIGN_16 },
- { X86::ORPSrr, X86::ORPSrm, TB_ALIGN_16 },
- { X86::PACKSSDWrr, X86::PACKSSDWrm, TB_ALIGN_16 },
- { X86::PACKSSWBrr, X86::PACKSSWBrm, TB_ALIGN_16 },
- { X86::PACKUSDWrr, X86::PACKUSDWrm, TB_ALIGN_16 },
- { X86::PACKUSWBrr, X86::PACKUSWBrm, TB_ALIGN_16 },
- { X86::PADDBrr, X86::PADDBrm, TB_ALIGN_16 },
- { X86::PADDDrr, X86::PADDDrm, TB_ALIGN_16 },
- { X86::PADDQrr, X86::PADDQrm, TB_ALIGN_16 },
- { X86::PADDSBrr, X86::PADDSBrm, TB_ALIGN_16 },
- { X86::PADDSWrr, X86::PADDSWrm, TB_ALIGN_16 },
- { X86::PADDUSBrr, X86::PADDUSBrm, TB_ALIGN_16 },
- { X86::PADDUSWrr, X86::PADDUSWrm, TB_ALIGN_16 },
- { X86::PADDWrr, X86::PADDWrm, TB_ALIGN_16 },
- { X86::PALIGNRrri, X86::PALIGNRrmi, TB_ALIGN_16 },
- { X86::PANDNrr, X86::PANDNrm, TB_ALIGN_16 },
- { X86::PANDrr, X86::PANDrm, TB_ALIGN_16 },
- { X86::PAVGBrr, X86::PAVGBrm, TB_ALIGN_16 },
- { X86::PAVGWrr, X86::PAVGWrm, TB_ALIGN_16 },
- { X86::PBLENDVBrr0, X86::PBLENDVBrm0, TB_ALIGN_16 },
- { X86::PBLENDWrri, X86::PBLENDWrmi, TB_ALIGN_16 },
- { X86::PCLMULQDQrr, X86::PCLMULQDQrm, TB_ALIGN_16 },
- { X86::PCMPEQBrr, X86::PCMPEQBrm, TB_ALIGN_16 },
- { X86::PCMPEQDrr, X86::PCMPEQDrm, TB_ALIGN_16 },
- { X86::PCMPEQQrr, X86::PCMPEQQrm, TB_ALIGN_16 },
- { X86::PCMPEQWrr, X86::PCMPEQWrm, TB_ALIGN_16 },
- { X86::PCMPGTBrr, X86::PCMPGTBrm, TB_ALIGN_16 },
- { X86::PCMPGTDrr, X86::PCMPGTDrm, TB_ALIGN_16 },
- { X86::PCMPGTQrr, X86::PCMPGTQrm, TB_ALIGN_16 },
- { X86::PCMPGTWrr, X86::PCMPGTWrm, TB_ALIGN_16 },
- { X86::PHADDDrr, X86::PHADDDrm, TB_ALIGN_16 },
- { X86::PHADDWrr, X86::PHADDWrm, TB_ALIGN_16 },
- { X86::PHADDSWrr128, X86::PHADDSWrm128, TB_ALIGN_16 },
- { X86::PHSUBDrr, X86::PHSUBDrm, TB_ALIGN_16 },
- { X86::PHSUBSWrr128, X86::PHSUBSWrm128, TB_ALIGN_16 },
- { X86::PHSUBWrr, X86::PHSUBWrm, TB_ALIGN_16 },
- { X86::PINSRBrr, X86::PINSRBrm, 0 },
- { X86::PINSRDrr, X86::PINSRDrm, 0 },
- { X86::PINSRQrr, X86::PINSRQrm, 0 },
- { X86::PINSRWrri, X86::PINSRWrmi, 0 },
- { X86::PMADDUBSWrr, X86::PMADDUBSWrm, TB_ALIGN_16 },
- { X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 },
- { X86::PMAXSBrr, X86::PMAXSBrm, TB_ALIGN_16 },
- { X86::PMAXSDrr, X86::PMAXSDrm, TB_ALIGN_16 },
- { X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 },
- { X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 },
- { X86::PMAXUDrr, X86::PMAXUDrm, TB_ALIGN_16 },
- { X86::PMAXUWrr, X86::PMAXUWrm, TB_ALIGN_16 },
- { X86::PMINSBrr, X86::PMINSBrm, TB_ALIGN_16 },
- { X86::PMINSDrr, X86::PMINSDrm, TB_ALIGN_16 },
- { X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 },
- { X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 },
- { X86::PMINUDrr, X86::PMINUDrm, TB_ALIGN_16 },
- { X86::PMINUWrr, X86::PMINUWrm, TB_ALIGN_16 },
- { X86::PMULDQrr, X86::PMULDQrm, TB_ALIGN_16 },
- { X86::PMULHRSWrr, X86::PMULHRSWrm, TB_ALIGN_16 },
- { X86::PMULHUWrr, X86::PMULHUWrm, TB_ALIGN_16 },
- { X86::PMULHWrr, X86::PMULHWrm, TB_ALIGN_16 },
- { X86::PMULLDrr, X86::PMULLDrm, TB_ALIGN_16 },
- { X86::PMULLWrr, X86::PMULLWrm, TB_ALIGN_16 },
- { X86::PMULUDQrr, X86::PMULUDQrm, TB_ALIGN_16 },
- { X86::PORrr, X86::PORrm, TB_ALIGN_16 },
- { X86::PSADBWrr, X86::PSADBWrm, TB_ALIGN_16 },
- { X86::PSHUFBrr, X86::PSHUFBrm, TB_ALIGN_16 },
- { X86::PSIGNBrr128, X86::PSIGNBrm128, TB_ALIGN_16 },
- { X86::PSIGNWrr128, X86::PSIGNWrm128, TB_ALIGN_16 },
- { X86::PSIGNDrr128, X86::PSIGNDrm128, TB_ALIGN_16 },
- { X86::PSLLDrr, X86::PSLLDrm, TB_ALIGN_16 },
- { X86::PSLLQrr, X86::PSLLQrm, TB_ALIGN_16 },
- { X86::PSLLWrr, X86::PSLLWrm, TB_ALIGN_16 },
- { X86::PSRADrr, X86::PSRADrm, TB_ALIGN_16 },
- { X86::PSRAWrr, X86::PSRAWrm, TB_ALIGN_16 },
- { X86::PSRLDrr, X86::PSRLDrm, TB_ALIGN_16 },
- { X86::PSRLQrr, X86::PSRLQrm, TB_ALIGN_16 },
- { X86::PSRLWrr, X86::PSRLWrm, TB_ALIGN_16 },
- { X86::PSUBBrr, X86::PSUBBrm, TB_ALIGN_16 },
- { X86::PSUBDrr, X86::PSUBDrm, TB_ALIGN_16 },
- { X86::PSUBQrr, X86::PSUBQrm, TB_ALIGN_16 },
- { X86::PSUBSBrr, X86::PSUBSBrm, TB_ALIGN_16 },
- { X86::PSUBSWrr, X86::PSUBSWrm, TB_ALIGN_16 },
- { X86::PSUBUSBrr, X86::PSUBUSBrm, TB_ALIGN_16 },
- { X86::PSUBUSWrr, X86::PSUBUSWrm, TB_ALIGN_16 },
- { X86::PSUBWrr, X86::PSUBWrm, TB_ALIGN_16 },
- { X86::PUNPCKHBWrr, X86::PUNPCKHBWrm, TB_ALIGN_16 },
- { X86::PUNPCKHDQrr, X86::PUNPCKHDQrm, TB_ALIGN_16 },
- { X86::PUNPCKHQDQrr, X86::PUNPCKHQDQrm, TB_ALIGN_16 },
- { X86::PUNPCKHWDrr, X86::PUNPCKHWDrm, TB_ALIGN_16 },
- { X86::PUNPCKLBWrr, X86::PUNPCKLBWrm, TB_ALIGN_16 },
- { X86::PUNPCKLDQrr, X86::PUNPCKLDQrm, TB_ALIGN_16 },
- { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 },
- { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 },
- { X86::PXORrr, X86::PXORrm, TB_ALIGN_16 },
- { X86::ROUNDSDr_Int, X86::ROUNDSDm_Int, TB_NO_REVERSE },
- { X86::ROUNDSSr_Int, X86::ROUNDSSm_Int, TB_NO_REVERSE },
- { X86::SBB32rr, X86::SBB32rm, 0 },
- { X86::SBB64rr, X86::SBB64rm, 0 },
- { X86::SHUFPDrri, X86::SHUFPDrmi, TB_ALIGN_16 },
- { X86::SHUFPSrri, X86::SHUFPSrmi, TB_ALIGN_16 },
- { X86::SUB16rr, X86::SUB16rm, 0 },
- { X86::SUB32rr, X86::SUB32rm, 0 },
- { X86::SUB64rr, X86::SUB64rm, 0 },
- { X86::SUB8rr, X86::SUB8rm, 0 },
- { X86::SUBPDrr, X86::SUBPDrm, TB_ALIGN_16 },
- { X86::SUBPSrr, X86::SUBPSrm, TB_ALIGN_16 },
- { X86::SUBSDrr, X86::SUBSDrm, 0 },
- { X86::SUBSDrr_Int, X86::SUBSDrm_Int, TB_NO_REVERSE },
- { X86::SUBSSrr, X86::SUBSSrm, 0 },
- { X86::SUBSSrr_Int, X86::SUBSSrm_Int, TB_NO_REVERSE },
- // FIXME: TEST*rr -> swapped operand of TEST*mr.
- { X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 },
- { X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 },
- { X86::UNPCKLPDrr, X86::UNPCKLPDrm, TB_ALIGN_16 },
- { X86::UNPCKLPSrr, X86::UNPCKLPSrm, TB_ALIGN_16 },
- { X86::XOR16rr, X86::XOR16rm, 0 },
- { X86::XOR32rr, X86::XOR32rm, 0 },
- { X86::XOR64rr, X86::XOR64rm, 0 },
- { X86::XOR8rr, X86::XOR8rm, 0 },
- { X86::XORPDrr, X86::XORPDrm, TB_ALIGN_16 },
- { X86::XORPSrr, X86::XORPSrm, TB_ALIGN_16 },
-
- // MMX version of foldable instructions
- { X86::MMX_CVTPI2PSirr, X86::MMX_CVTPI2PSirm, 0 },
- { X86::MMX_PACKSSDWirr, X86::MMX_PACKSSDWirm, 0 },
- { X86::MMX_PACKSSWBirr, X86::MMX_PACKSSWBirm, 0 },
- { X86::MMX_PACKUSWBirr, X86::MMX_PACKUSWBirm, 0 },
- { X86::MMX_PADDBirr, X86::MMX_PADDBirm, 0 },
- { X86::MMX_PADDDirr, X86::MMX_PADDDirm, 0 },
- { X86::MMX_PADDQirr, X86::MMX_PADDQirm, 0 },
- { X86::MMX_PADDSBirr, X86::MMX_PADDSBirm, 0 },
- { X86::MMX_PADDSWirr, X86::MMX_PADDSWirm, 0 },
- { X86::MMX_PADDUSBirr, X86::MMX_PADDUSBirm, 0 },
- { X86::MMX_PADDUSWirr, X86::MMX_PADDUSWirm, 0 },
- { X86::MMX_PADDWirr, X86::MMX_PADDWirm, 0 },
- { X86::MMX_PALIGNR64irr, X86::MMX_PALIGNR64irm, 0 },
- { X86::MMX_PANDNirr, X86::MMX_PANDNirm, 0 },
- { X86::MMX_PANDirr, X86::MMX_PANDirm, 0 },
- { X86::MMX_PAVGBirr, X86::MMX_PAVGBirm, 0 },
- { X86::MMX_PAVGWirr, X86::MMX_PAVGWirm, 0 },
- { X86::MMX_PCMPEQBirr, X86::MMX_PCMPEQBirm, 0 },
- { X86::MMX_PCMPEQDirr, X86::MMX_PCMPEQDirm, 0 },
- { X86::MMX_PCMPEQWirr, X86::MMX_PCMPEQWirm, 0 },
- { X86::MMX_PCMPGTBirr, X86::MMX_PCMPGTBirm, 0 },
- { X86::MMX_PCMPGTDirr, X86::MMX_PCMPGTDirm, 0 },
- { X86::MMX_PCMPGTWirr, X86::MMX_PCMPGTWirm, 0 },
- { X86::MMX_PHADDSWrr64, X86::MMX_PHADDSWrm64, 0 },
- { X86::MMX_PHADDWrr64, X86::MMX_PHADDWrm64, 0 },
- { X86::MMX_PHADDrr64, X86::MMX_PHADDrm64, 0 },
- { X86::MMX_PHSUBDrr64, X86::MMX_PHSUBDrm64, 0 },
- { X86::MMX_PHSUBSWrr64, X86::MMX_PHSUBSWrm64, 0 },
- { X86::MMX_PHSUBWrr64, X86::MMX_PHSUBWrm64, 0 },
- { X86::MMX_PINSRWirri, X86::MMX_PINSRWirmi, 0 },
- { X86::MMX_PMADDUBSWrr64, X86::MMX_PMADDUBSWrm64, 0 },
- { X86::MMX_PMADDWDirr, X86::MMX_PMADDWDirm, 0 },
- { X86::MMX_PMAXSWirr, X86::MMX_PMAXSWirm, 0 },
- { X86::MMX_PMAXUBirr, X86::MMX_PMAXUBirm, 0 },
- { X86::MMX_PMINSWirr, X86::MMX_PMINSWirm, 0 },
- { X86::MMX_PMINUBirr, X86::MMX_PMINUBirm, 0 },
- { X86::MMX_PMULHRSWrr64, X86::MMX_PMULHRSWrm64, 0 },
- { X86::MMX_PMULHUWirr, X86::MMX_PMULHUWirm, 0 },
- { X86::MMX_PMULHWirr, X86::MMX_PMULHWirm, 0 },
- { X86::MMX_PMULLWirr, X86::MMX_PMULLWirm, 0 },
- { X86::MMX_PMULUDQirr, X86::MMX_PMULUDQirm, 0 },
- { X86::MMX_PORirr, X86::MMX_PORirm, 0 },
- { X86::MMX_PSADBWirr, X86::MMX_PSADBWirm, 0 },
- { X86::MMX_PSHUFBrr64, X86::MMX_PSHUFBrm64, 0 },
- { X86::MMX_PSIGNBrr64, X86::MMX_PSIGNBrm64, 0 },
- { X86::MMX_PSIGNDrr64, X86::MMX_PSIGNDrm64, 0 },
- { X86::MMX_PSIGNWrr64, X86::MMX_PSIGNWrm64, 0 },
- { X86::MMX_PSLLDrr, X86::MMX_PSLLDrm, 0 },
- { X86::MMX_PSLLQrr, X86::MMX_PSLLQrm, 0 },
- { X86::MMX_PSLLWrr, X86::MMX_PSLLWrm, 0 },
- { X86::MMX_PSRADrr, X86::MMX_PSRADrm, 0 },
- { X86::MMX_PSRAWrr, X86::MMX_PSRAWrm, 0 },
- { X86::MMX_PSRLDrr, X86::MMX_PSRLDrm, 0 },
- { X86::MMX_PSRLQrr, X86::MMX_PSRLQrm, 0 },
- { X86::MMX_PSRLWrr, X86::MMX_PSRLWrm, 0 },
- { X86::MMX_PSUBBirr, X86::MMX_PSUBBirm, 0 },
- { X86::MMX_PSUBDirr, X86::MMX_PSUBDirm, 0 },
- { X86::MMX_PSUBQirr, X86::MMX_PSUBQirm, 0 },
- { X86::MMX_PSUBSBirr, X86::MMX_PSUBSBirm, 0 },
- { X86::MMX_PSUBSWirr, X86::MMX_PSUBSWirm, 0 },
- { X86::MMX_PSUBUSBirr, X86::MMX_PSUBUSBirm, 0 },
- { X86::MMX_PSUBUSWirr, X86::MMX_PSUBUSWirm, 0 },
- { X86::MMX_PSUBWirr, X86::MMX_PSUBWirm, 0 },
- { X86::MMX_PUNPCKHBWirr, X86::MMX_PUNPCKHBWirm, 0 },
- { X86::MMX_PUNPCKHDQirr, X86::MMX_PUNPCKHDQirm, 0 },
- { X86::MMX_PUNPCKHWDirr, X86::MMX_PUNPCKHWDirm, 0 },
- { X86::MMX_PUNPCKLBWirr, X86::MMX_PUNPCKLBWirm, 0 },
- { X86::MMX_PUNPCKLDQirr, X86::MMX_PUNPCKLDQirm, 0 },
- { X86::MMX_PUNPCKLWDirr, X86::MMX_PUNPCKLWDirm, 0 },
- { X86::MMX_PXORirr, X86::MMX_PXORirm, 0 },
-
- // 3DNow! version of foldable instructions
- { X86::PAVGUSBrr, X86::PAVGUSBrm, 0 },
- { X86::PFACCrr, X86::PFACCrm, 0 },
- { X86::PFADDrr, X86::PFADDrm, 0 },
- { X86::PFCMPEQrr, X86::PFCMPEQrm, 0 },
- { X86::PFCMPGErr, X86::PFCMPGErm, 0 },
- { X86::PFCMPGTrr, X86::PFCMPGTrm, 0 },
- { X86::PFMAXrr, X86::PFMAXrm, 0 },
- { X86::PFMINrr, X86::PFMINrm, 0 },
- { X86::PFMULrr, X86::PFMULrm, 0 },
- { X86::PFNACCrr, X86::PFNACCrm, 0 },
- { X86::PFPNACCrr, X86::PFPNACCrm, 0 },
- { X86::PFRCPIT1rr, X86::PFRCPIT1rm, 0 },
- { X86::PFRCPIT2rr, X86::PFRCPIT2rm, 0 },
- { X86::PFRSQIT1rr, X86::PFRSQIT1rm, 0 },
- { X86::PFSUBrr, X86::PFSUBrm, 0 },
- { X86::PFSUBRrr, X86::PFSUBRrm, 0 },
- { X86::PMULHRWrr, X86::PMULHRWrm, 0 },
-
- // AVX 128-bit versions of foldable instructions
- { X86::VCVTSI642SDrr, X86::VCVTSI642SDrm, 0 },
- { X86::VCVTSI642SDrr_Int, X86::VCVTSI642SDrm_Int, 0 },
- { X86::VCVTSI2SDrr, X86::VCVTSI2SDrm, 0 },
- { X86::VCVTSI2SDrr_Int, X86::VCVTSI2SDrm_Int, 0 },
- { X86::VCVTSI642SSrr, X86::VCVTSI642SSrm, 0 },
- { X86::VCVTSI642SSrr_Int, X86::VCVTSI642SSrm_Int, 0 },
- { X86::VCVTSI2SSrr, X86::VCVTSI2SSrm, 0 },
- { X86::VCVTSI2SSrr_Int, X86::VCVTSI2SSrm_Int, 0 },
- { X86::VADDPDrr, X86::VADDPDrm, 0 },
- { X86::VADDPSrr, X86::VADDPSrm, 0 },
- { X86::VADDSDrr, X86::VADDSDrm, 0 },
- { X86::VADDSDrr_Int, X86::VADDSDrm_Int, TB_NO_REVERSE },
- { X86::VADDSSrr, X86::VADDSSrm, 0 },
- { X86::VADDSSrr_Int, X86::VADDSSrm_Int, TB_NO_REVERSE },
- { X86::VADDSUBPDrr, X86::VADDSUBPDrm, 0 },
- { X86::VADDSUBPSrr, X86::VADDSUBPSrm, 0 },
- { X86::VANDNPDrr, X86::VANDNPDrm, 0 },
- { X86::VANDNPSrr, X86::VANDNPSrm, 0 },
- { X86::VANDPDrr, X86::VANDPDrm, 0 },
- { X86::VANDPSrr, X86::VANDPSrm, 0 },
- { X86::VBLENDPDrri, X86::VBLENDPDrmi, 0 },
- { X86::VBLENDPSrri, X86::VBLENDPSrmi, 0 },
- { X86::VBLENDVPDrr, X86::VBLENDVPDrm, 0 },
- { X86::VBLENDVPSrr, X86::VBLENDVPSrm, 0 },
- { X86::VCMPPDrri, X86::VCMPPDrmi, 0 },
- { X86::VCMPPSrri, X86::VCMPPSrmi, 0 },
- { X86::VCMPSDrr, X86::VCMPSDrm, 0 },
- { X86::VCMPSDrr_Int, X86::VCMPSDrm_Int, TB_NO_REVERSE },
- { X86::VCMPSSrr, X86::VCMPSSrm, 0 },
- { X86::VCMPSSrr_Int, X86::VCMPSSrm_Int, TB_NO_REVERSE },
- { X86::VDIVPDrr, X86::VDIVPDrm, 0 },
- { X86::VDIVPSrr, X86::VDIVPSrm, 0 },
- { X86::VDIVSDrr, X86::VDIVSDrm, 0 },
- { X86::VDIVSDrr_Int, X86::VDIVSDrm_Int, TB_NO_REVERSE },
- { X86::VDIVSSrr, X86::VDIVSSrm, 0 },
- { X86::VDIVSSrr_Int, X86::VDIVSSrm_Int, TB_NO_REVERSE },
- { X86::VDPPDrri, X86::VDPPDrmi, 0 },
- { X86::VDPPSrri, X86::VDPPSrmi, 0 },
- { X86::VHADDPDrr, X86::VHADDPDrm, 0 },
- { X86::VHADDPSrr, X86::VHADDPSrm, 0 },
- { X86::VHSUBPDrr, X86::VHSUBPDrm, 0 },
- { X86::VHSUBPSrr, X86::VHSUBPSrm, 0 },
- { X86::VMAXCPDrr, X86::VMAXCPDrm, 0 },
- { X86::VMAXCPSrr, X86::VMAXCPSrm, 0 },
- { X86::VMAXCSDrr, X86::VMAXCSDrm, 0 },
- { X86::VMAXCSSrr, X86::VMAXCSSrm, 0 },
- { X86::VMAXPDrr, X86::VMAXPDrm, 0 },
- { X86::VMAXPSrr, X86::VMAXPSrm, 0 },
- { X86::VMAXSDrr, X86::VMAXSDrm, 0 },
- { X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, TB_NO_REVERSE },
- { X86::VMAXSSrr, X86::VMAXSSrm, 0 },
- { X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, TB_NO_REVERSE },
- { X86::VMINCPDrr, X86::VMINCPDrm, 0 },
- { X86::VMINCPSrr, X86::VMINCPSrm, 0 },
- { X86::VMINCSDrr, X86::VMINCSDrm, 0 },
- { X86::VMINCSSrr, X86::VMINCSSrm, 0 },
- { X86::VMINPDrr, X86::VMINPDrm, 0 },
- { X86::VMINPSrr, X86::VMINPSrm, 0 },
- { X86::VMINSDrr, X86::VMINSDrm, 0 },
- { X86::VMINSDrr_Int, X86::VMINSDrm_Int, TB_NO_REVERSE },
- { X86::VMINSSrr, X86::VMINSSrm, 0 },
- { X86::VMINSSrr_Int, X86::VMINSSrm_Int, TB_NO_REVERSE },
- { X86::VMOVLHPSrr, X86::VMOVHPSrm, TB_NO_REVERSE },
- { X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 },
- { X86::VMULPDrr, X86::VMULPDrm, 0 },
- { X86::VMULPSrr, X86::VMULPSrm, 0 },
- { X86::VMULSDrr, X86::VMULSDrm, 0 },
- { X86::VMULSDrr_Int, X86::VMULSDrm_Int, TB_NO_REVERSE },
- { X86::VMULSSrr, X86::VMULSSrm, 0 },
- { X86::VMULSSrr_Int, X86::VMULSSrm_Int, TB_NO_REVERSE },
- { X86::VORPDrr, X86::VORPDrm, 0 },
- { X86::VORPSrr, X86::VORPSrm, 0 },
- { X86::VPACKSSDWrr, X86::VPACKSSDWrm, 0 },
- { X86::VPACKSSWBrr, X86::VPACKSSWBrm, 0 },
- { X86::VPACKUSDWrr, X86::VPACKUSDWrm, 0 },
- { X86::VPACKUSWBrr, X86::VPACKUSWBrm, 0 },
- { X86::VPADDBrr, X86::VPADDBrm, 0 },
- { X86::VPADDDrr, X86::VPADDDrm, 0 },
- { X86::VPADDQrr, X86::VPADDQrm, 0 },
- { X86::VPADDSBrr, X86::VPADDSBrm, 0 },
- { X86::VPADDSWrr, X86::VPADDSWrm, 0 },
- { X86::VPADDUSBrr, X86::VPADDUSBrm, 0 },
- { X86::VPADDUSWrr, X86::VPADDUSWrm, 0 },
- { X86::VPADDWrr, X86::VPADDWrm, 0 },
- { X86::VPALIGNRrri, X86::VPALIGNRrmi, 0 },
- { X86::VPANDNrr, X86::VPANDNrm, 0 },
- { X86::VPANDrr, X86::VPANDrm, 0 },
- { X86::VPAVGBrr, X86::VPAVGBrm, 0 },
- { X86::VPAVGWrr, X86::VPAVGWrm, 0 },
- { X86::VPBLENDVBrr, X86::VPBLENDVBrm, 0 },
- { X86::VPBLENDWrri, X86::VPBLENDWrmi, 0 },
- { X86::VPCLMULQDQrr, X86::VPCLMULQDQrm, 0 },
- { X86::VPCMPEQBrr, X86::VPCMPEQBrm, 0 },
- { X86::VPCMPEQDrr, X86::VPCMPEQDrm, 0 },
- { X86::VPCMPEQQrr, X86::VPCMPEQQrm, 0 },
- { X86::VPCMPEQWrr, X86::VPCMPEQWrm, 0 },
- { X86::VPCMPGTBrr, X86::VPCMPGTBrm, 0 },
- { X86::VPCMPGTDrr, X86::VPCMPGTDrm, 0 },
- { X86::VPCMPGTQrr, X86::VPCMPGTQrm, 0 },
- { X86::VPCMPGTWrr, X86::VPCMPGTWrm, 0 },
- { X86::VPHADDDrr, X86::VPHADDDrm, 0 },
- { X86::VPHADDSWrr128, X86::VPHADDSWrm128, 0 },
- { X86::VPHADDWrr, X86::VPHADDWrm, 0 },
- { X86::VPHSUBDrr, X86::VPHSUBDrm, 0 },
- { X86::VPHSUBSWrr128, X86::VPHSUBSWrm128, 0 },
- { X86::VPHSUBWrr, X86::VPHSUBWrm, 0 },
- { X86::VPERMILPDrr, X86::VPERMILPDrm, 0 },
- { X86::VPERMILPSrr, X86::VPERMILPSrm, 0 },
- { X86::VPINSRBrr, X86::VPINSRBrm, 0 },
- { X86::VPINSRDrr, X86::VPINSRDrm, 0 },
- { X86::VPINSRQrr, X86::VPINSRQrm, 0 },
- { X86::VPINSRWrri, X86::VPINSRWrmi, 0 },
- { X86::VPMADDUBSWrr, X86::VPMADDUBSWrm, 0 },
- { X86::VPMADDWDrr, X86::VPMADDWDrm, 0 },
- { X86::VPMAXSBrr, X86::VPMAXSBrm, 0 },
- { X86::VPMAXSDrr, X86::VPMAXSDrm, 0 },
- { X86::VPMAXSWrr, X86::VPMAXSWrm, 0 },
- { X86::VPMAXUBrr, X86::VPMAXUBrm, 0 },
- { X86::VPMAXUDrr, X86::VPMAXUDrm, 0 },
- { X86::VPMAXUWrr, X86::VPMAXUWrm, 0 },
- { X86::VPMINSBrr, X86::VPMINSBrm, 0 },
- { X86::VPMINSDrr, X86::VPMINSDrm, 0 },
- { X86::VPMINSWrr, X86::VPMINSWrm, 0 },
- { X86::VPMINUBrr, X86::VPMINUBrm, 0 },
- { X86::VPMINUDrr, X86::VPMINUDrm, 0 },
- { X86::VPMINUWrr, X86::VPMINUWrm, 0 },
- { X86::VPMULDQrr, X86::VPMULDQrm, 0 },
- { X86::VPMULHRSWrr, X86::VPMULHRSWrm, 0 },
- { X86::VPMULHUWrr, X86::VPMULHUWrm, 0 },
- { X86::VPMULHWrr, X86::VPMULHWrm, 0 },
- { X86::VPMULLDrr, X86::VPMULLDrm, 0 },
- { X86::VPMULLWrr, X86::VPMULLWrm, 0 },
- { X86::VPMULUDQrr, X86::VPMULUDQrm, 0 },
- { X86::VPORrr, X86::VPORrm, 0 },
- { X86::VPSADBWrr, X86::VPSADBWrm, 0 },
- { X86::VPSHUFBrr, X86::VPSHUFBrm, 0 },
- { X86::VPSIGNBrr128, X86::VPSIGNBrm128, 0 },
- { X86::VPSIGNWrr128, X86::VPSIGNWrm128, 0 },
- { X86::VPSIGNDrr128, X86::VPSIGNDrm128, 0 },
- { X86::VPSLLDrr, X86::VPSLLDrm, 0 },
- { X86::VPSLLQrr, X86::VPSLLQrm, 0 },
- { X86::VPSLLWrr, X86::VPSLLWrm, 0 },
- { X86::VPSRADrr, X86::VPSRADrm, 0 },
- { X86::VPSRAWrr, X86::VPSRAWrm, 0 },
- { X86::VPSRLDrr, X86::VPSRLDrm, 0 },
- { X86::VPSRLQrr, X86::VPSRLQrm, 0 },
- { X86::VPSRLWrr, X86::VPSRLWrm, 0 },
- { X86::VPSUBBrr, X86::VPSUBBrm, 0 },
- { X86::VPSUBDrr, X86::VPSUBDrm, 0 },
- { X86::VPSUBQrr, X86::VPSUBQrm, 0 },
- { X86::VPSUBSBrr, X86::VPSUBSBrm, 0 },
- { X86::VPSUBSWrr, X86::VPSUBSWrm, 0 },
- { X86::VPSUBUSBrr, X86::VPSUBUSBrm, 0 },
- { X86::VPSUBUSWrr, X86::VPSUBUSWrm, 0 },
- { X86::VPSUBWrr, X86::VPSUBWrm, 0 },
- { X86::VPUNPCKHBWrr, X86::VPUNPCKHBWrm, 0 },
- { X86::VPUNPCKHDQrr, X86::VPUNPCKHDQrm, 0 },
- { X86::VPUNPCKHQDQrr, X86::VPUNPCKHQDQrm, 0 },
- { X86::VPUNPCKHWDrr, X86::VPUNPCKHWDrm, 0 },
- { X86::VPUNPCKLBWrr, X86::VPUNPCKLBWrm, 0 },
- { X86::VPUNPCKLDQrr, X86::VPUNPCKLDQrm, 0 },
- { X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 },
- { X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, 0 },
- { X86::VPXORrr, X86::VPXORrm, 0 },
- { X86::VRCPSSr, X86::VRCPSSm, 0 },
- { X86::VRCPSSr_Int, X86::VRCPSSm_Int, TB_NO_REVERSE },
- { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 },
- { X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, TB_NO_REVERSE },
- { X86::VROUNDSDr, X86::VROUNDSDm, 0 },
- { X86::VROUNDSDr_Int, X86::VROUNDSDm_Int, TB_NO_REVERSE },
- { X86::VROUNDSSr, X86::VROUNDSSm, 0 },
- { X86::VROUNDSSr_Int, X86::VROUNDSSm_Int, TB_NO_REVERSE },
- { X86::VSHUFPDrri, X86::VSHUFPDrmi, 0 },
- { X86::VSHUFPSrri, X86::VSHUFPSrmi, 0 },
- { X86::VSQRTSDr, X86::VSQRTSDm, 0 },
- { X86::VSQRTSDr_Int, X86::VSQRTSDm_Int, TB_NO_REVERSE },
- { X86::VSQRTSSr, X86::VSQRTSSm, 0 },
- { X86::VSQRTSSr_Int, X86::VSQRTSSm_Int, TB_NO_REVERSE },
- { X86::VSUBPDrr, X86::VSUBPDrm, 0 },
- { X86::VSUBPSrr, X86::VSUBPSrm, 0 },
- { X86::VSUBSDrr, X86::VSUBSDrm, 0 },
- { X86::VSUBSDrr_Int, X86::VSUBSDrm_Int, TB_NO_REVERSE },
- { X86::VSUBSSrr, X86::VSUBSSrm, 0 },
- { X86::VSUBSSrr_Int, X86::VSUBSSrm_Int, TB_NO_REVERSE },
- { X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, 0 },
- { X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, 0 },
- { X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, 0 },
- { X86::VUNPCKLPSrr, X86::VUNPCKLPSrm, 0 },
- { X86::VXORPDrr, X86::VXORPDrm, 0 },
- { X86::VXORPSrr, X86::VXORPSrm, 0 },
-
- // AVX 256-bit foldable instructions
- { X86::VADDPDYrr, X86::VADDPDYrm, 0 },
- { X86::VADDPSYrr, X86::VADDPSYrm, 0 },
- { X86::VADDSUBPDYrr, X86::VADDSUBPDYrm, 0 },
- { X86::VADDSUBPSYrr, X86::VADDSUBPSYrm, 0 },
- { X86::VANDNPDYrr, X86::VANDNPDYrm, 0 },
- { X86::VANDNPSYrr, X86::VANDNPSYrm, 0 },
- { X86::VANDPDYrr, X86::VANDPDYrm, 0 },
- { X86::VANDPSYrr, X86::VANDPSYrm, 0 },
- { X86::VBLENDPDYrri, X86::VBLENDPDYrmi, 0 },
- { X86::VBLENDPSYrri, X86::VBLENDPSYrmi, 0 },
- { X86::VBLENDVPDYrr, X86::VBLENDVPDYrm, 0 },
- { X86::VBLENDVPSYrr, X86::VBLENDVPSYrm, 0 },
- { X86::VCMPPDYrri, X86::VCMPPDYrmi, 0 },
- { X86::VCMPPSYrri, X86::VCMPPSYrmi, 0 },
- { X86::VDIVPDYrr, X86::VDIVPDYrm, 0 },
- { X86::VDIVPSYrr, X86::VDIVPSYrm, 0 },
- { X86::VDPPSYrri, X86::VDPPSYrmi, 0 },
- { X86::VHADDPDYrr, X86::VHADDPDYrm, 0 },
- { X86::VHADDPSYrr, X86::VHADDPSYrm, 0 },
- { X86::VHSUBPDYrr, X86::VHSUBPDYrm, 0 },
- { X86::VHSUBPSYrr, X86::VHSUBPSYrm, 0 },
- { X86::VINSERTF128rr, X86::VINSERTF128rm, 0 },
- { X86::VMAXCPDYrr, X86::VMAXCPDYrm, 0 },
- { X86::VMAXCPSYrr, X86::VMAXCPSYrm, 0 },
- { X86::VMAXPDYrr, X86::VMAXPDYrm, 0 },
- { X86::VMAXPSYrr, X86::VMAXPSYrm, 0 },
- { X86::VMINCPDYrr, X86::VMINCPDYrm, 0 },
- { X86::VMINCPSYrr, X86::VMINCPSYrm, 0 },
- { X86::VMINPDYrr, X86::VMINPDYrm, 0 },
- { X86::VMINPSYrr, X86::VMINPSYrm, 0 },
- { X86::VMULPDYrr, X86::VMULPDYrm, 0 },
- { X86::VMULPSYrr, X86::VMULPSYrm, 0 },
- { X86::VORPDYrr, X86::VORPDYrm, 0 },
- { X86::VORPSYrr, X86::VORPSYrm, 0 },
- { X86::VPERM2F128rr, X86::VPERM2F128rm, 0 },
- { X86::VPERMILPDYrr, X86::VPERMILPDYrm, 0 },
- { X86::VPERMILPSYrr, X86::VPERMILPSYrm, 0 },
- { X86::VSHUFPDYrri, X86::VSHUFPDYrmi, 0 },
- { X86::VSHUFPSYrri, X86::VSHUFPSYrmi, 0 },
- { X86::VSUBPDYrr, X86::VSUBPDYrm, 0 },
- { X86::VSUBPSYrr, X86::VSUBPSYrm, 0 },
- { X86::VUNPCKHPDYrr, X86::VUNPCKHPDYrm, 0 },
- { X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrm, 0 },
- { X86::VUNPCKLPDYrr, X86::VUNPCKLPDYrm, 0 },
- { X86::VUNPCKLPSYrr, X86::VUNPCKLPSYrm, 0 },
- { X86::VXORPDYrr, X86::VXORPDYrm, 0 },
- { X86::VXORPSYrr, X86::VXORPSYrm, 0 },
-
- // AVX2 foldable instructions
- { X86::VINSERTI128rr, X86::VINSERTI128rm, 0 },
- { X86::VPACKSSDWYrr, X86::VPACKSSDWYrm, 0 },
- { X86::VPACKSSWBYrr, X86::VPACKSSWBYrm, 0 },
- { X86::VPACKUSDWYrr, X86::VPACKUSDWYrm, 0 },
- { X86::VPACKUSWBYrr, X86::VPACKUSWBYrm, 0 },
- { X86::VPADDBYrr, X86::VPADDBYrm, 0 },
- { X86::VPADDDYrr, X86::VPADDDYrm, 0 },
- { X86::VPADDQYrr, X86::VPADDQYrm, 0 },
- { X86::VPADDSBYrr, X86::VPADDSBYrm, 0 },
- { X86::VPADDSWYrr, X86::VPADDSWYrm, 0 },
- { X86::VPADDUSBYrr, X86::VPADDUSBYrm, 0 },
- { X86::VPADDUSWYrr, X86::VPADDUSWYrm, 0 },
- { X86::VPADDWYrr, X86::VPADDWYrm, 0 },
- { X86::VPALIGNRYrri, X86::VPALIGNRYrmi, 0 },
- { X86::VPANDNYrr, X86::VPANDNYrm, 0 },
- { X86::VPANDYrr, X86::VPANDYrm, 0 },
- { X86::VPAVGBYrr, X86::VPAVGBYrm, 0 },
- { X86::VPAVGWYrr, X86::VPAVGWYrm, 0 },
- { X86::VPBLENDDrri, X86::VPBLENDDrmi, 0 },
- { X86::VPBLENDDYrri, X86::VPBLENDDYrmi, 0 },
- { X86::VPBLENDVBYrr, X86::VPBLENDVBYrm, 0 },
- { X86::VPBLENDWYrri, X86::VPBLENDWYrmi, 0 },
- { X86::VPCMPEQBYrr, X86::VPCMPEQBYrm, 0 },
- { X86::VPCMPEQDYrr, X86::VPCMPEQDYrm, 0 },
- { X86::VPCMPEQQYrr, X86::VPCMPEQQYrm, 0 },
- { X86::VPCMPEQWYrr, X86::VPCMPEQWYrm, 0 },
- { X86::VPCMPGTBYrr, X86::VPCMPGTBYrm, 0 },
- { X86::VPCMPGTDYrr, X86::VPCMPGTDYrm, 0 },
- { X86::VPCMPGTQYrr, X86::VPCMPGTQYrm, 0 },
- { X86::VPCMPGTWYrr, X86::VPCMPGTWYrm, 0 },
- { X86::VPERM2I128rr, X86::VPERM2I128rm, 0 },
- { X86::VPERMDYrr, X86::VPERMDYrm, 0 },
- { X86::VPERMPSYrr, X86::VPERMPSYrm, 0 },
- { X86::VPHADDDYrr, X86::VPHADDDYrm, 0 },
- { X86::VPHADDSWrr256, X86::VPHADDSWrm256, 0 },
- { X86::VPHADDWYrr, X86::VPHADDWYrm, 0 },
- { X86::VPHSUBDYrr, X86::VPHSUBDYrm, 0 },
- { X86::VPHSUBSWrr256, X86::VPHSUBSWrm256, 0 },
- { X86::VPHSUBWYrr, X86::VPHSUBWYrm, 0 },
- { X86::VPMADDUBSWYrr, X86::VPMADDUBSWYrm, 0 },
- { X86::VPMADDWDYrr, X86::VPMADDWDYrm, 0 },
- { X86::VPMAXSBYrr, X86::VPMAXSBYrm, 0 },
- { X86::VPMAXSDYrr, X86::VPMAXSDYrm, 0 },
- { X86::VPMAXSWYrr, X86::VPMAXSWYrm, 0 },
- { X86::VPMAXUBYrr, X86::VPMAXUBYrm, 0 },
- { X86::VPMAXUDYrr, X86::VPMAXUDYrm, 0 },
- { X86::VPMAXUWYrr, X86::VPMAXUWYrm, 0 },
- { X86::VPMINSBYrr, X86::VPMINSBYrm, 0 },
- { X86::VPMINSDYrr, X86::VPMINSDYrm, 0 },
- { X86::VPMINSWYrr, X86::VPMINSWYrm, 0 },
- { X86::VPMINUBYrr, X86::VPMINUBYrm, 0 },
- { X86::VPMINUDYrr, X86::VPMINUDYrm, 0 },
- { X86::VPMINUWYrr, X86::VPMINUWYrm, 0 },
- { X86::VMPSADBWYrri, X86::VMPSADBWYrmi, 0 },
- { X86::VPMULDQYrr, X86::VPMULDQYrm, 0 },
- { X86::VPMULHRSWYrr, X86::VPMULHRSWYrm, 0 },
- { X86::VPMULHUWYrr, X86::VPMULHUWYrm, 0 },
- { X86::VPMULHWYrr, X86::VPMULHWYrm, 0 },
- { X86::VPMULLDYrr, X86::VPMULLDYrm, 0 },
- { X86::VPMULLWYrr, X86::VPMULLWYrm, 0 },
- { X86::VPMULUDQYrr, X86::VPMULUDQYrm, 0 },
- { X86::VPORYrr, X86::VPORYrm, 0 },
- { X86::VPSADBWYrr, X86::VPSADBWYrm, 0 },
- { X86::VPSHUFBYrr, X86::VPSHUFBYrm, 0 },
- { X86::VPSIGNBYrr256, X86::VPSIGNBYrm256, 0 },
- { X86::VPSIGNWYrr256, X86::VPSIGNWYrm256, 0 },
- { X86::VPSIGNDYrr256, X86::VPSIGNDYrm256, 0 },
- { X86::VPSLLDYrr, X86::VPSLLDYrm, 0 },
- { X86::VPSLLQYrr, X86::VPSLLQYrm, 0 },
- { X86::VPSLLWYrr, X86::VPSLLWYrm, 0 },
- { X86::VPSLLVDrr, X86::VPSLLVDrm, 0 },
- { X86::VPSLLVDYrr, X86::VPSLLVDYrm, 0 },
- { X86::VPSLLVQrr, X86::VPSLLVQrm, 0 },
- { X86::VPSLLVQYrr, X86::VPSLLVQYrm, 0 },
- { X86::VPSRADYrr, X86::VPSRADYrm, 0 },
- { X86::VPSRAWYrr, X86::VPSRAWYrm, 0 },
- { X86::VPSRAVDrr, X86::VPSRAVDrm, 0 },
- { X86::VPSRAVDYrr, X86::VPSRAVDYrm, 0 },
- { X86::VPSRLDYrr, X86::VPSRLDYrm, 0 },
- { X86::VPSRLQYrr, X86::VPSRLQYrm, 0 },
- { X86::VPSRLWYrr, X86::VPSRLWYrm, 0 },
- { X86::VPSRLVDrr, X86::VPSRLVDrm, 0 },
- { X86::VPSRLVDYrr, X86::VPSRLVDYrm, 0 },
- { X86::VPSRLVQrr, X86::VPSRLVQrm, 0 },
- { X86::VPSRLVQYrr, X86::VPSRLVQYrm, 0 },
- { X86::VPSUBBYrr, X86::VPSUBBYrm, 0 },
- { X86::VPSUBDYrr, X86::VPSUBDYrm, 0 },
- { X86::VPSUBQYrr, X86::VPSUBQYrm, 0 },
- { X86::VPSUBSBYrr, X86::VPSUBSBYrm, 0 },
- { X86::VPSUBSWYrr, X86::VPSUBSWYrm, 0 },
- { X86::VPSUBUSBYrr, X86::VPSUBUSBYrm, 0 },
- { X86::VPSUBUSWYrr, X86::VPSUBUSWYrm, 0 },
- { X86::VPSUBWYrr, X86::VPSUBWYrm, 0 },
- { X86::VPUNPCKHBWYrr, X86::VPUNPCKHBWYrm, 0 },
- { X86::VPUNPCKHDQYrr, X86::VPUNPCKHDQYrm, 0 },
- { X86::VPUNPCKHQDQYrr, X86::VPUNPCKHQDQYrm, 0 },
- { X86::VPUNPCKHWDYrr, X86::VPUNPCKHWDYrm, 0 },
- { X86::VPUNPCKLBWYrr, X86::VPUNPCKLBWYrm, 0 },
- { X86::VPUNPCKLDQYrr, X86::VPUNPCKLDQYrm, 0 },
- { X86::VPUNPCKLQDQYrr, X86::VPUNPCKLQDQYrm, 0 },
- { X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, 0 },
- { X86::VPXORYrr, X86::VPXORYrm, 0 },
-
- // FMA4 foldable patterns
- { X86::VFMADDSS4rr, X86::VFMADDSS4mr, TB_ALIGN_NONE },
- { X86::VFMADDSS4rr_Int, X86::VFMADDSS4mr_Int, TB_NO_REVERSE },
- { X86::VFMADDSD4rr, X86::VFMADDSD4mr, TB_ALIGN_NONE },
- { X86::VFMADDSD4rr_Int, X86::VFMADDSD4mr_Int, TB_NO_REVERSE },
- { X86::VFMADDPS4rr, X86::VFMADDPS4mr, TB_ALIGN_NONE },
- { X86::VFMADDPD4rr, X86::VFMADDPD4mr, TB_ALIGN_NONE },
- { X86::VFMADDPS4Yrr, X86::VFMADDPS4Ymr, TB_ALIGN_NONE },
- { X86::VFMADDPD4Yrr, X86::VFMADDPD4Ymr, TB_ALIGN_NONE },
- { X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, TB_ALIGN_NONE },
- { X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4mr_Int, TB_NO_REVERSE },
- { X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, TB_ALIGN_NONE },
- { X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4mr_Int, TB_NO_REVERSE },
- { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, TB_ALIGN_NONE },
- { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, TB_ALIGN_NONE },
- { X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Ymr, TB_ALIGN_NONE },
- { X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Ymr, TB_ALIGN_NONE },
- { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, TB_ALIGN_NONE },
- { X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4mr_Int, TB_NO_REVERSE },
- { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, TB_ALIGN_NONE },
- { X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4mr_Int, TB_NO_REVERSE },
- { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, TB_ALIGN_NONE },
- { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, TB_ALIGN_NONE },
- { X86::VFMSUBPS4Yrr, X86::VFMSUBPS4Ymr, TB_ALIGN_NONE },
- { X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Ymr, TB_ALIGN_NONE },
- { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, TB_ALIGN_NONE },
- { X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4mr_Int, TB_NO_REVERSE },
- { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, TB_ALIGN_NONE },
- { X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4mr_Int, TB_NO_REVERSE },
- { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, TB_ALIGN_NONE },
- { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, TB_ALIGN_NONE },
- { X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Ymr, TB_ALIGN_NONE },
- { X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Ymr, TB_ALIGN_NONE },
- { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, TB_ALIGN_NONE },
- { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, TB_ALIGN_NONE },
- { X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Ymr, TB_ALIGN_NONE },
- { X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Ymr, TB_ALIGN_NONE },
- { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, TB_ALIGN_NONE },
- { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, TB_ALIGN_NONE },
- { X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Ymr, TB_ALIGN_NONE },
- { X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Ymr, TB_ALIGN_NONE },
-
- // XOP foldable instructions
- { X86::VPCMOVrrr, X86::VPCMOVrmr, 0 },
- { X86::VPCMOVYrrr, X86::VPCMOVYrmr, 0 },
- { X86::VPCOMBri, X86::VPCOMBmi, 0 },
- { X86::VPCOMDri, X86::VPCOMDmi, 0 },
- { X86::VPCOMQri, X86::VPCOMQmi, 0 },
- { X86::VPCOMWri, X86::VPCOMWmi, 0 },
- { X86::VPCOMUBri, X86::VPCOMUBmi, 0 },
- { X86::VPCOMUDri, X86::VPCOMUDmi, 0 },
- { X86::VPCOMUQri, X86::VPCOMUQmi, 0 },
- { X86::VPCOMUWri, X86::VPCOMUWmi, 0 },
- { X86::VPERMIL2PDrr, X86::VPERMIL2PDmr, 0 },
- { X86::VPERMIL2PDYrr, X86::VPERMIL2PDYmr, 0 },
- { X86::VPERMIL2PSrr, X86::VPERMIL2PSmr, 0 },
- { X86::VPERMIL2PSYrr, X86::VPERMIL2PSYmr, 0 },
- { X86::VPMACSDDrr, X86::VPMACSDDrm, 0 },
- { X86::VPMACSDQHrr, X86::VPMACSDQHrm, 0 },
- { X86::VPMACSDQLrr, X86::VPMACSDQLrm, 0 },
- { X86::VPMACSSDDrr, X86::VPMACSSDDrm, 0 },
- { X86::VPMACSSDQHrr, X86::VPMACSSDQHrm, 0 },
- { X86::VPMACSSDQLrr, X86::VPMACSSDQLrm, 0 },
- { X86::VPMACSSWDrr, X86::VPMACSSWDrm, 0 },
- { X86::VPMACSSWWrr, X86::VPMACSSWWrm, 0 },
- { X86::VPMACSWDrr, X86::VPMACSWDrm, 0 },
- { X86::VPMACSWWrr, X86::VPMACSWWrm, 0 },
- { X86::VPMADCSSWDrr, X86::VPMADCSSWDrm, 0 },
- { X86::VPMADCSWDrr, X86::VPMADCSWDrm, 0 },
- { X86::VPPERMrrr, X86::VPPERMrmr, 0 },
- { X86::VPROTBrr, X86::VPROTBrm, 0 },
- { X86::VPROTDrr, X86::VPROTDrm, 0 },
- { X86::VPROTQrr, X86::VPROTQrm, 0 },
- { X86::VPROTWrr, X86::VPROTWrm, 0 },
- { X86::VPSHABrr, X86::VPSHABrm, 0 },
- { X86::VPSHADrr, X86::VPSHADrm, 0 },
- { X86::VPSHAQrr, X86::VPSHAQrm, 0 },
- { X86::VPSHAWrr, X86::VPSHAWrm, 0 },
- { X86::VPSHLBrr, X86::VPSHLBrm, 0 },
- { X86::VPSHLDrr, X86::VPSHLDrm, 0 },
- { X86::VPSHLQrr, X86::VPSHLQrm, 0 },
- { X86::VPSHLWrr, X86::VPSHLWrm, 0 },
-
- // BMI/BMI2 foldable instructions
- { X86::ANDN32rr, X86::ANDN32rm, 0 },
- { X86::ANDN64rr, X86::ANDN64rm, 0 },
- { X86::MULX32rr, X86::MULX32rm, 0 },
- { X86::MULX64rr, X86::MULX64rm, 0 },
- { X86::PDEP32rr, X86::PDEP32rm, 0 },
- { X86::PDEP64rr, X86::PDEP64rm, 0 },
- { X86::PEXT32rr, X86::PEXT32rm, 0 },
- { X86::PEXT64rr, X86::PEXT64rm, 0 },
-
- // ADX foldable instructions
- { X86::ADCX32rr, X86::ADCX32rm, 0 },
- { X86::ADCX64rr, X86::ADCX64rm, 0 },
- { X86::ADOX32rr, X86::ADOX32rm, 0 },
- { X86::ADOX64rr, X86::ADOX64rm, 0 },
-
- // AVX-512 foldable instructions
- { X86::VADDPDZrr, X86::VADDPDZrm, 0 },
- { X86::VADDPSZrr, X86::VADDPSZrm, 0 },
- { X86::VADDSDZrr, X86::VADDSDZrm, 0 },
- { X86::VADDSDZrr_Int, X86::VADDSDZrm_Int, TB_NO_REVERSE },
- { X86::VADDSSZrr, X86::VADDSSZrm, 0 },
- { X86::VADDSSZrr_Int, X86::VADDSSZrm_Int, TB_NO_REVERSE },
- { X86::VALIGNDZrri, X86::VALIGNDZrmi, 0 },
- { X86::VALIGNQZrri, X86::VALIGNQZrmi, 0 },
- { X86::VANDNPDZrr, X86::VANDNPDZrm, 0 },
- { X86::VANDNPSZrr, X86::VANDNPSZrm, 0 },
- { X86::VANDPDZrr, X86::VANDPDZrm, 0 },
- { X86::VANDPSZrr, X86::VANDPSZrm, 0 },
- { X86::VCMPPDZrri, X86::VCMPPDZrmi, 0 },
- { X86::VCMPPSZrri, X86::VCMPPSZrmi, 0 },
- { X86::VCMPSDZrr, X86::VCMPSDZrm, 0 },
- { X86::VCMPSDZrr_Int, X86::VCMPSDZrm_Int, TB_NO_REVERSE },
- { X86::VCMPSSZrr, X86::VCMPSSZrm, 0 },
- { X86::VCMPSSZrr_Int, X86::VCMPSSZrm_Int, TB_NO_REVERSE },
- { X86::VDIVPDZrr, X86::VDIVPDZrm, 0 },
- { X86::VDIVPSZrr, X86::VDIVPSZrm, 0 },
- { X86::VDIVSDZrr, X86::VDIVSDZrm, 0 },
- { X86::VDIVSDZrr_Int, X86::VDIVSDZrm_Int, TB_NO_REVERSE },
- { X86::VDIVSSZrr, X86::VDIVSSZrm, 0 },
- { X86::VDIVSSZrr_Int, X86::VDIVSSZrm_Int, TB_NO_REVERSE },
- { X86::VINSERTF32x4Zrr, X86::VINSERTF32x4Zrm, 0 },
- { X86::VINSERTF32x8Zrr, X86::VINSERTF32x8Zrm, 0 },
- { X86::VINSERTF64x2Zrr, X86::VINSERTF64x2Zrm, 0 },
- { X86::VINSERTF64x4Zrr, X86::VINSERTF64x4Zrm, 0 },
- { X86::VINSERTI32x4Zrr, X86::VINSERTI32x4Zrm, 0 },
- { X86::VINSERTI32x8Zrr, X86::VINSERTI32x8Zrm, 0 },
- { X86::VINSERTI64x2Zrr, X86::VINSERTI64x2Zrm, 0 },
- { X86::VINSERTI64x4Zrr, X86::VINSERTI64x4Zrm, 0 },
- { X86::VMAXCPDZrr, X86::VMAXCPDZrm, 0 },
- { X86::VMAXCPSZrr, X86::VMAXCPSZrm, 0 },
- { X86::VMAXCSDZrr, X86::VMAXCSDZrm, 0 },
- { X86::VMAXCSSZrr, X86::VMAXCSSZrm, 0 },
- { X86::VMAXPDZrr, X86::VMAXPDZrm, 0 },
- { X86::VMAXPSZrr, X86::VMAXPSZrm, 0 },
- { X86::VMAXSDZrr, X86::VMAXSDZrm, 0 },
- { X86::VMAXSDZrr_Int, X86::VMAXSDZrm_Int, TB_NO_REVERSE },
- { X86::VMAXSSZrr, X86::VMAXSSZrm, 0 },
- { X86::VMAXSSZrr_Int, X86::VMAXSSZrm_Int, TB_NO_REVERSE },
- { X86::VMINCPDZrr, X86::VMINCPDZrm, 0 },
- { X86::VMINCPSZrr, X86::VMINCPSZrm, 0 },
- { X86::VMINCSDZrr, X86::VMINCSDZrm, 0 },
- { X86::VMINCSSZrr, X86::VMINCSSZrm, 0 },
- { X86::VMINPDZrr, X86::VMINPDZrm, 0 },
- { X86::VMINPSZrr, X86::VMINPSZrm, 0 },
- { X86::VMINSDZrr, X86::VMINSDZrm, 0 },
- { X86::VMINSDZrr_Int, X86::VMINSDZrm_Int, TB_NO_REVERSE },
- { X86::VMINSSZrr, X86::VMINSSZrm, 0 },
- { X86::VMINSSZrr_Int, X86::VMINSSZrm_Int, TB_NO_REVERSE },
- { X86::VMOVLHPSZrr, X86::VMOVHPSZ128rm, TB_NO_REVERSE },
- { X86::VMULPDZrr, X86::VMULPDZrm, 0 },
- { X86::VMULPSZrr, X86::VMULPSZrm, 0 },
- { X86::VMULSDZrr, X86::VMULSDZrm, 0 },
- { X86::VMULSDZrr_Int, X86::VMULSDZrm_Int, TB_NO_REVERSE },
- { X86::VMULSSZrr, X86::VMULSSZrm, 0 },
- { X86::VMULSSZrr_Int, X86::VMULSSZrm_Int, TB_NO_REVERSE },
- { X86::VORPDZrr, X86::VORPDZrm, 0 },
- { X86::VORPSZrr, X86::VORPSZrm, 0 },
- { X86::VPACKSSDWZrr, X86::VPACKSSDWZrm, 0 },
- { X86::VPACKSSWBZrr, X86::VPACKSSWBZrm, 0 },
- { X86::VPACKUSDWZrr, X86::VPACKUSDWZrm, 0 },
- { X86::VPACKUSWBZrr, X86::VPACKUSWBZrm, 0 },
- { X86::VPADDBZrr, X86::VPADDBZrm, 0 },
- { X86::VPADDDZrr, X86::VPADDDZrm, 0 },
- { X86::VPADDQZrr, X86::VPADDQZrm, 0 },
- { X86::VPADDSBZrr, X86::VPADDSBZrm, 0 },
- { X86::VPADDSWZrr, X86::VPADDSWZrm, 0 },
- { X86::VPADDUSBZrr, X86::VPADDUSBZrm, 0 },
- { X86::VPADDUSWZrr, X86::VPADDUSWZrm, 0 },
- { X86::VPADDWZrr, X86::VPADDWZrm, 0 },
- { X86::VPALIGNRZrri, X86::VPALIGNRZrmi, 0 },
- { X86::VPANDDZrr, X86::VPANDDZrm, 0 },
- { X86::VPANDNDZrr, X86::VPANDNDZrm, 0 },
- { X86::VPANDNQZrr, X86::VPANDNQZrm, 0 },
- { X86::VPANDQZrr, X86::VPANDQZrm, 0 },
- { X86::VPAVGBZrr, X86::VPAVGBZrm, 0 },
- { X86::VPAVGWZrr, X86::VPAVGWZrm, 0 },
- { X86::VPCMPBZrri, X86::VPCMPBZrmi, 0 },
- { X86::VPCMPDZrri, X86::VPCMPDZrmi, 0 },
- { X86::VPCMPEQBZrr, X86::VPCMPEQBZrm, 0 },
- { X86::VPCMPEQDZrr, X86::VPCMPEQDZrm, 0 },
- { X86::VPCMPEQQZrr, X86::VPCMPEQQZrm, 0 },
- { X86::VPCMPEQWZrr, X86::VPCMPEQWZrm, 0 },
- { X86::VPCMPGTBZrr, X86::VPCMPGTBZrm, 0 },
- { X86::VPCMPGTDZrr, X86::VPCMPGTDZrm, 0 },
- { X86::VPCMPGTQZrr, X86::VPCMPGTQZrm, 0 },
- { X86::VPCMPGTWZrr, X86::VPCMPGTWZrm, 0 },
- { X86::VPCMPQZrri, X86::VPCMPQZrmi, 0 },
- { X86::VPCMPUBZrri, X86::VPCMPUBZrmi, 0 },
- { X86::VPCMPUDZrri, X86::VPCMPUDZrmi, 0 },
- { X86::VPCMPUQZrri, X86::VPCMPUQZrmi, 0 },
- { X86::VPCMPUWZrri, X86::VPCMPUWZrmi, 0 },
- { X86::VPCMPWZrri, X86::VPCMPWZrmi, 0 },
- { X86::VPERMBZrr, X86::VPERMBZrm, 0 },
- { X86::VPERMDZrr, X86::VPERMDZrm, 0 },
- { X86::VPERMILPDZrr, X86::VPERMILPDZrm, 0 },
- { X86::VPERMILPSZrr, X86::VPERMILPSZrm, 0 },
- { X86::VPERMPDZrr, X86::VPERMPDZrm, 0 },
- { X86::VPERMPSZrr, X86::VPERMPSZrm, 0 },
- { X86::VPERMQZrr, X86::VPERMQZrm, 0 },
- { X86::VPERMWZrr, X86::VPERMWZrm, 0 },
- { X86::VPINSRBZrr, X86::VPINSRBZrm, 0 },
- { X86::VPINSRDZrr, X86::VPINSRDZrm, 0 },
- { X86::VPINSRQZrr, X86::VPINSRQZrm, 0 },
- { X86::VPINSRWZrr, X86::VPINSRWZrm, 0 },
- { X86::VPMADDUBSWZrr, X86::VPMADDUBSWZrm, 0 },
- { X86::VPMADDWDZrr, X86::VPMADDWDZrm, 0 },
- { X86::VPMAXSBZrr, X86::VPMAXSBZrm, 0 },
- { X86::VPMAXSDZrr, X86::VPMAXSDZrm, 0 },
- { X86::VPMAXSQZrr, X86::VPMAXSQZrm, 0 },
- { X86::VPMAXSWZrr, X86::VPMAXSWZrm, 0 },
- { X86::VPMAXUBZrr, X86::VPMAXUBZrm, 0 },
- { X86::VPMAXUDZrr, X86::VPMAXUDZrm, 0 },
- { X86::VPMAXUQZrr, X86::VPMAXUQZrm, 0 },
- { X86::VPMAXUWZrr, X86::VPMAXUWZrm, 0 },
- { X86::VPMINSBZrr, X86::VPMINSBZrm, 0 },
- { X86::VPMINSDZrr, X86::VPMINSDZrm, 0 },
- { X86::VPMINSQZrr, X86::VPMINSQZrm, 0 },
- { X86::VPMINSWZrr, X86::VPMINSWZrm, 0 },
- { X86::VPMINUBZrr, X86::VPMINUBZrm, 0 },
- { X86::VPMINUDZrr, X86::VPMINUDZrm, 0 },
- { X86::VPMINUQZrr, X86::VPMINUQZrm, 0 },
- { X86::VPMINUWZrr, X86::VPMINUWZrm, 0 },
- { X86::VPMULDQZrr, X86::VPMULDQZrm, 0 },
- { X86::VPMULLDZrr, X86::VPMULLDZrm, 0 },
- { X86::VPMULLQZrr, X86::VPMULLQZrm, 0 },
- { X86::VPMULLWZrr, X86::VPMULLWZrm, 0 },
- { X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 },
- { X86::VPORDZrr, X86::VPORDZrm, 0 },
- { X86::VPORQZrr, X86::VPORQZrm, 0 },
- { X86::VPSADBWZrr, X86::VPSADBWZrm, 0 },
- { X86::VPSHUFBZrr, X86::VPSHUFBZrm, 0 },
- { X86::VPSLLDZrr, X86::VPSLLDZrm, 0 },
- { X86::VPSLLQZrr, X86::VPSLLQZrm, 0 },
- { X86::VPSLLVDZrr, X86::VPSLLVDZrm, 0 },
- { X86::VPSLLVQZrr, X86::VPSLLVQZrm, 0 },
- { X86::VPSLLVWZrr, X86::VPSLLVWZrm, 0 },
- { X86::VPSLLWZrr, X86::VPSLLWZrm, 0 },
- { X86::VPSRADZrr, X86::VPSRADZrm, 0 },
- { X86::VPSRAQZrr, X86::VPSRAQZrm, 0 },
- { X86::VPSRAVDZrr, X86::VPSRAVDZrm, 0 },
- { X86::VPSRAVQZrr, X86::VPSRAVQZrm, 0 },
- { X86::VPSRAVWZrr, X86::VPSRAVWZrm, 0 },
- { X86::VPSRAWZrr, X86::VPSRAWZrm, 0 },
- { X86::VPSRLDZrr, X86::VPSRLDZrm, 0 },
- { X86::VPSRLQZrr, X86::VPSRLQZrm, 0 },
- { X86::VPSRLVDZrr, X86::VPSRLVDZrm, 0 },
- { X86::VPSRLVQZrr, X86::VPSRLVQZrm, 0 },
- { X86::VPSRLVWZrr, X86::VPSRLVWZrm, 0 },
- { X86::VPSRLWZrr, X86::VPSRLWZrm, 0 },
- { X86::VPSUBBZrr, X86::VPSUBBZrm, 0 },
- { X86::VPSUBDZrr, X86::VPSUBDZrm, 0 },
- { X86::VPSUBQZrr, X86::VPSUBQZrm, 0 },
- { X86::VPSUBSBZrr, X86::VPSUBSBZrm, 0 },
- { X86::VPSUBSWZrr, X86::VPSUBSWZrm, 0 },
- { X86::VPSUBUSBZrr, X86::VPSUBUSBZrm, 0 },
- { X86::VPSUBUSWZrr, X86::VPSUBUSWZrm, 0 },
- { X86::VPSUBWZrr, X86::VPSUBWZrm, 0 },
- { X86::VPUNPCKHBWZrr, X86::VPUNPCKHBWZrm, 0 },
- { X86::VPUNPCKHDQZrr, X86::VPUNPCKHDQZrm, 0 },
- { X86::VPUNPCKHQDQZrr, X86::VPUNPCKHQDQZrm, 0 },
- { X86::VPUNPCKHWDZrr, X86::VPUNPCKHWDZrm, 0 },
- { X86::VPUNPCKLBWZrr, X86::VPUNPCKLBWZrm, 0 },
- { X86::VPUNPCKLDQZrr, X86::VPUNPCKLDQZrm, 0 },
- { X86::VPUNPCKLQDQZrr, X86::VPUNPCKLQDQZrm, 0 },
- { X86::VPUNPCKLWDZrr, X86::VPUNPCKLWDZrm, 0 },
- { X86::VPXORDZrr, X86::VPXORDZrm, 0 },
- { X86::VPXORQZrr, X86::VPXORQZrm, 0 },
- { X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 },
- { X86::VSHUFPSZrri, X86::VSHUFPSZrmi, 0 },
- { X86::VSUBPDZrr, X86::VSUBPDZrm, 0 },
- { X86::VSUBPSZrr, X86::VSUBPSZrm, 0 },
- { X86::VSUBSDZrr, X86::VSUBSDZrm, 0 },
- { X86::VSUBSDZrr_Int, X86::VSUBSDZrm_Int, TB_NO_REVERSE },
- { X86::VSUBSSZrr, X86::VSUBSSZrm, 0 },
- { X86::VSUBSSZrr_Int, X86::VSUBSSZrm_Int, TB_NO_REVERSE },
- { X86::VUNPCKHPDZrr, X86::VUNPCKHPDZrm, 0 },
- { X86::VUNPCKHPSZrr, X86::VUNPCKHPSZrm, 0 },
- { X86::VUNPCKLPDZrr, X86::VUNPCKLPDZrm, 0 },
- { X86::VUNPCKLPSZrr, X86::VUNPCKLPSZrm, 0 },
- { X86::VXORPDZrr, X86::VXORPDZrm, 0 },
- { X86::VXORPSZrr, X86::VXORPSZrm, 0 },
-
- // AVX-512{F,VL} foldable instructions
- { X86::VADDPDZ128rr, X86::VADDPDZ128rm, 0 },
- { X86::VADDPDZ256rr, X86::VADDPDZ256rm, 0 },
- { X86::VADDPSZ128rr, X86::VADDPSZ128rm, 0 },
- { X86::VADDPSZ256rr, X86::VADDPSZ256rm, 0 },
- { X86::VALIGNDZ128rri, X86::VALIGNDZ128rmi, 0 },
- { X86::VALIGNDZ256rri, X86::VALIGNDZ256rmi, 0 },
- { X86::VALIGNQZ128rri, X86::VALIGNQZ128rmi, 0 },
- { X86::VALIGNQZ256rri, X86::VALIGNQZ256rmi, 0 },
- { X86::VANDNPDZ128rr, X86::VANDNPDZ128rm, 0 },
- { X86::VANDNPDZ256rr, X86::VANDNPDZ256rm, 0 },
- { X86::VANDNPSZ128rr, X86::VANDNPSZ128rm, 0 },
- { X86::VANDNPSZ256rr, X86::VANDNPSZ256rm, 0 },
- { X86::VANDPDZ128rr, X86::VANDPDZ128rm, 0 },
- { X86::VANDPDZ256rr, X86::VANDPDZ256rm, 0 },
- { X86::VANDPSZ128rr, X86::VANDPSZ128rm, 0 },
- { X86::VANDPSZ256rr, X86::VANDPSZ256rm, 0 },
- { X86::VCMPPDZ128rri, X86::VCMPPDZ128rmi, 0 },
- { X86::VCMPPDZ256rri, X86::VCMPPDZ256rmi, 0 },
- { X86::VCMPPSZ128rri, X86::VCMPPSZ128rmi, 0 },
- { X86::VCMPPSZ256rri, X86::VCMPPSZ256rmi, 0 },
- { X86::VDIVPDZ128rr, X86::VDIVPDZ128rm, 0 },
- { X86::VDIVPDZ256rr, X86::VDIVPDZ256rm, 0 },
- { X86::VDIVPSZ128rr, X86::VDIVPSZ128rm, 0 },
- { X86::VDIVPSZ256rr, X86::VDIVPSZ256rm, 0 },
- { X86::VINSERTF32x4Z256rr,X86::VINSERTF32x4Z256rm, 0 },
- { X86::VINSERTF64x2Z256rr,X86::VINSERTF64x2Z256rm, 0 },
- { X86::VINSERTI32x4Z256rr,X86::VINSERTI32x4Z256rm, 0 },
- { X86::VINSERTI64x2Z256rr,X86::VINSERTI64x2Z256rm, 0 },
- { X86::VMAXCPDZ128rr, X86::VMAXCPDZ128rm, 0 },
- { X86::VMAXCPDZ256rr, X86::VMAXCPDZ256rm, 0 },
- { X86::VMAXCPSZ128rr, X86::VMAXCPSZ128rm, 0 },
- { X86::VMAXCPSZ256rr, X86::VMAXCPSZ256rm, 0 },
- { X86::VMAXPDZ128rr, X86::VMAXPDZ128rm, 0 },
- { X86::VMAXPDZ256rr, X86::VMAXPDZ256rm, 0 },
- { X86::VMAXPSZ128rr, X86::VMAXPSZ128rm, 0 },
- { X86::VMAXPSZ256rr, X86::VMAXPSZ256rm, 0 },
- { X86::VMINCPDZ128rr, X86::VMINCPDZ128rm, 0 },
- { X86::VMINCPDZ256rr, X86::VMINCPDZ256rm, 0 },
- { X86::VMINCPSZ128rr, X86::VMINCPSZ128rm, 0 },
- { X86::VMINCPSZ256rr, X86::VMINCPSZ256rm, 0 },
- { X86::VMINPDZ128rr, X86::VMINPDZ128rm, 0 },
- { X86::VMINPDZ256rr, X86::VMINPDZ256rm, 0 },
- { X86::VMINPSZ128rr, X86::VMINPSZ128rm, 0 },
- { X86::VMINPSZ256rr, X86::VMINPSZ256rm, 0 },
- { X86::VMULPDZ128rr, X86::VMULPDZ128rm, 0 },
- { X86::VMULPDZ256rr, X86::VMULPDZ256rm, 0 },
- { X86::VMULPSZ128rr, X86::VMULPSZ128rm, 0 },
- { X86::VMULPSZ256rr, X86::VMULPSZ256rm, 0 },
- { X86::VORPDZ128rr, X86::VORPDZ128rm, 0 },
- { X86::VORPDZ256rr, X86::VORPDZ256rm, 0 },
- { X86::VORPSZ128rr, X86::VORPSZ128rm, 0 },
- { X86::VORPSZ256rr, X86::VORPSZ256rm, 0 },
- { X86::VPACKSSDWZ256rr, X86::VPACKSSDWZ256rm, 0 },
- { X86::VPACKSSDWZ128rr, X86::VPACKSSDWZ128rm, 0 },
- { X86::VPACKSSWBZ256rr, X86::VPACKSSWBZ256rm, 0 },
- { X86::VPACKSSWBZ128rr, X86::VPACKSSWBZ128rm, 0 },
- { X86::VPACKUSDWZ256rr, X86::VPACKUSDWZ256rm, 0 },
- { X86::VPACKUSDWZ128rr, X86::VPACKUSDWZ128rm, 0 },
- { X86::VPACKUSWBZ256rr, X86::VPACKUSWBZ256rm, 0 },
- { X86::VPACKUSWBZ128rr, X86::VPACKUSWBZ128rm, 0 },
- { X86::VPADDBZ128rr, X86::VPADDBZ128rm, 0 },
- { X86::VPADDBZ256rr, X86::VPADDBZ256rm, 0 },
- { X86::VPADDDZ128rr, X86::VPADDDZ128rm, 0 },
- { X86::VPADDDZ256rr, X86::VPADDDZ256rm, 0 },
- { X86::VPADDQZ128rr, X86::VPADDQZ128rm, 0 },
- { X86::VPADDQZ256rr, X86::VPADDQZ256rm, 0 },
- { X86::VPADDSBZ128rr, X86::VPADDSBZ128rm, 0 },
- { X86::VPADDSBZ256rr, X86::VPADDSBZ256rm, 0 },
- { X86::VPADDSWZ128rr, X86::VPADDSWZ128rm, 0 },
- { X86::VPADDSWZ256rr, X86::VPADDSWZ256rm, 0 },
- { X86::VPADDUSBZ128rr, X86::VPADDUSBZ128rm, 0 },
- { X86::VPADDUSBZ256rr, X86::VPADDUSBZ256rm, 0 },
- { X86::VPADDUSWZ128rr, X86::VPADDUSWZ128rm, 0 },
- { X86::VPADDUSWZ256rr, X86::VPADDUSWZ256rm, 0 },
- { X86::VPADDWZ128rr, X86::VPADDWZ128rm, 0 },
- { X86::VPADDWZ256rr, X86::VPADDWZ256rm, 0 },
- { X86::VPALIGNRZ128rri, X86::VPALIGNRZ128rmi, 0 },
- { X86::VPALIGNRZ256rri, X86::VPALIGNRZ256rmi, 0 },
- { X86::VPANDDZ128rr, X86::VPANDDZ128rm, 0 },
- { X86::VPANDDZ256rr, X86::VPANDDZ256rm, 0 },
- { X86::VPANDNDZ128rr, X86::VPANDNDZ128rm, 0 },
- { X86::VPANDNDZ256rr, X86::VPANDNDZ256rm, 0 },
- { X86::VPANDNQZ128rr, X86::VPANDNQZ128rm, 0 },
- { X86::VPANDNQZ256rr, X86::VPANDNQZ256rm, 0 },
- { X86::VPANDQZ128rr, X86::VPANDQZ128rm, 0 },
- { X86::VPANDQZ256rr, X86::VPANDQZ256rm, 0 },
- { X86::VPAVGBZ128rr, X86::VPAVGBZ128rm, 0 },
- { X86::VPAVGBZ256rr, X86::VPAVGBZ256rm, 0 },
- { X86::VPAVGWZ128rr, X86::VPAVGWZ128rm, 0 },
- { X86::VPAVGWZ256rr, X86::VPAVGWZ256rm, 0 },
- { X86::VPCMPBZ128rri, X86::VPCMPBZ128rmi, 0 },
- { X86::VPCMPBZ256rri, X86::VPCMPBZ256rmi, 0 },
- { X86::VPCMPDZ128rri, X86::VPCMPDZ128rmi, 0 },
- { X86::VPCMPDZ256rri, X86::VPCMPDZ256rmi, 0 },
- { X86::VPCMPEQBZ128rr, X86::VPCMPEQBZ128rm, 0 },
- { X86::VPCMPEQBZ256rr, X86::VPCMPEQBZ256rm, 0 },
- { X86::VPCMPEQDZ128rr, X86::VPCMPEQDZ128rm, 0 },
- { X86::VPCMPEQDZ256rr, X86::VPCMPEQDZ256rm, 0 },
- { X86::VPCMPEQQZ128rr, X86::VPCMPEQQZ128rm, 0 },
- { X86::VPCMPEQQZ256rr, X86::VPCMPEQQZ256rm, 0 },
- { X86::VPCMPEQWZ128rr, X86::VPCMPEQWZ128rm, 0 },
- { X86::VPCMPEQWZ256rr, X86::VPCMPEQWZ256rm, 0 },
- { X86::VPCMPGTBZ128rr, X86::VPCMPGTBZ128rm, 0 },
- { X86::VPCMPGTBZ256rr, X86::VPCMPGTBZ256rm, 0 },
- { X86::VPCMPGTDZ128rr, X86::VPCMPGTDZ128rm, 0 },
- { X86::VPCMPGTDZ256rr, X86::VPCMPGTDZ256rm, 0 },
- { X86::VPCMPGTQZ128rr, X86::VPCMPGTQZ128rm, 0 },
- { X86::VPCMPGTQZ256rr, X86::VPCMPGTQZ256rm, 0 },
- { X86::VPCMPGTWZ128rr, X86::VPCMPGTWZ128rm, 0 },
- { X86::VPCMPGTWZ256rr, X86::VPCMPGTWZ256rm, 0 },
- { X86::VPCMPQZ128rri, X86::VPCMPQZ128rmi, 0 },
- { X86::VPCMPQZ256rri, X86::VPCMPQZ256rmi, 0 },
- { X86::VPCMPUBZ128rri, X86::VPCMPUBZ128rmi, 0 },
- { X86::VPCMPUBZ256rri, X86::VPCMPUBZ256rmi, 0 },
- { X86::VPCMPUDZ128rri, X86::VPCMPUDZ128rmi, 0 },
- { X86::VPCMPUDZ256rri, X86::VPCMPUDZ256rmi, 0 },
- { X86::VPCMPUQZ128rri, X86::VPCMPUQZ128rmi, 0 },
- { X86::VPCMPUQZ256rri, X86::VPCMPUQZ256rmi, 0 },
- { X86::VPCMPUWZ128rri, X86::VPCMPUWZ128rmi, 0 },
- { X86::VPCMPUWZ256rri, X86::VPCMPUWZ256rmi, 0 },
- { X86::VPCMPWZ128rri, X86::VPCMPWZ128rmi, 0 },
- { X86::VPCMPWZ256rri, X86::VPCMPWZ256rmi, 0 },
- { X86::VPERMBZ128rr, X86::VPERMBZ128rm, 0 },
- { X86::VPERMBZ256rr, X86::VPERMBZ256rm, 0 },
- { X86::VPERMDZ256rr, X86::VPERMDZ256rm, 0 },
- { X86::VPERMILPDZ128rr, X86::VPERMILPDZ128rm, 0 },
- { X86::VPERMILPDZ256rr, X86::VPERMILPDZ256rm, 0 },
- { X86::VPERMILPSZ128rr, X86::VPERMILPSZ128rm, 0 },
- { X86::VPERMILPSZ256rr, X86::VPERMILPSZ256rm, 0 },
- { X86::VPERMPDZ256rr, X86::VPERMPDZ256rm, 0 },
- { X86::VPERMPSZ256rr, X86::VPERMPSZ256rm, 0 },
- { X86::VPERMQZ256rr, X86::VPERMQZ256rm, 0 },
- { X86::VPERMWZ128rr, X86::VPERMWZ128rm, 0 },
- { X86::VPERMWZ256rr, X86::VPERMWZ256rm, 0 },
- { X86::VPMADDUBSWZ128rr, X86::VPMADDUBSWZ128rm, 0 },
- { X86::VPMADDUBSWZ256rr, X86::VPMADDUBSWZ256rm, 0 },
- { X86::VPMADDWDZ128rr, X86::VPMADDWDZ128rm, 0 },
- { X86::VPMADDWDZ256rr, X86::VPMADDWDZ256rm, 0 },
- { X86::VPMAXSBZ128rr, X86::VPMAXSBZ128rm, 0 },
- { X86::VPMAXSBZ256rr, X86::VPMAXSBZ256rm, 0 },
- { X86::VPMAXSDZ128rr, X86::VPMAXSDZ128rm, 0 },
- { X86::VPMAXSDZ256rr, X86::VPMAXSDZ256rm, 0 },
- { X86::VPMAXSQZ128rr, X86::VPMAXSQZ128rm, 0 },
- { X86::VPMAXSQZ256rr, X86::VPMAXSQZ256rm, 0 },
- { X86::VPMAXSWZ128rr, X86::VPMAXSWZ128rm, 0 },
- { X86::VPMAXSWZ256rr, X86::VPMAXSWZ256rm, 0 },
- { X86::VPMAXUBZ128rr, X86::VPMAXUBZ128rm, 0 },
- { X86::VPMAXUBZ256rr, X86::VPMAXUBZ256rm, 0 },
- { X86::VPMAXUDZ128rr, X86::VPMAXUDZ128rm, 0 },
- { X86::VPMAXUDZ256rr, X86::VPMAXUDZ256rm, 0 },
- { X86::VPMAXUQZ128rr, X86::VPMAXUQZ128rm, 0 },
- { X86::VPMAXUQZ256rr, X86::VPMAXUQZ256rm, 0 },
- { X86::VPMAXUWZ128rr, X86::VPMAXUWZ128rm, 0 },
- { X86::VPMAXUWZ256rr, X86::VPMAXUWZ256rm, 0 },
- { X86::VPMINSBZ128rr, X86::VPMINSBZ128rm, 0 },
- { X86::VPMINSBZ256rr, X86::VPMINSBZ256rm, 0 },
- { X86::VPMINSDZ128rr, X86::VPMINSDZ128rm, 0 },
- { X86::VPMINSDZ256rr, X86::VPMINSDZ256rm, 0 },
- { X86::VPMINSQZ128rr, X86::VPMINSQZ128rm, 0 },
- { X86::VPMINSQZ256rr, X86::VPMINSQZ256rm, 0 },
- { X86::VPMINSWZ128rr, X86::VPMINSWZ128rm, 0 },
- { X86::VPMINSWZ256rr, X86::VPMINSWZ256rm, 0 },
- { X86::VPMINUBZ128rr, X86::VPMINUBZ128rm, 0 },
- { X86::VPMINUBZ256rr, X86::VPMINUBZ256rm, 0 },
- { X86::VPMINUDZ128rr, X86::VPMINUDZ128rm, 0 },
- { X86::VPMINUDZ256rr, X86::VPMINUDZ256rm, 0 },
- { X86::VPMINUQZ128rr, X86::VPMINUQZ128rm, 0 },
- { X86::VPMINUQZ256rr, X86::VPMINUQZ256rm, 0 },
- { X86::VPMINUWZ128rr, X86::VPMINUWZ128rm, 0 },
- { X86::VPMINUWZ256rr, X86::VPMINUWZ256rm, 0 },
- { X86::VPMULDQZ128rr, X86::VPMULDQZ128rm, 0 },
- { X86::VPMULDQZ256rr, X86::VPMULDQZ256rm, 0 },
- { X86::VPMULLDZ128rr, X86::VPMULLDZ128rm, 0 },
- { X86::VPMULLDZ256rr, X86::VPMULLDZ256rm, 0 },
- { X86::VPMULLQZ128rr, X86::VPMULLQZ128rm, 0 },
- { X86::VPMULLQZ256rr, X86::VPMULLQZ256rm, 0 },
- { X86::VPMULLWZ128rr, X86::VPMULLWZ128rm, 0 },
- { X86::VPMULLWZ256rr, X86::VPMULLWZ256rm, 0 },
- { X86::VPMULUDQZ128rr, X86::VPMULUDQZ128rm, 0 },
- { X86::VPMULUDQZ256rr, X86::VPMULUDQZ256rm, 0 },
- { X86::VPORDZ128rr, X86::VPORDZ128rm, 0 },
- { X86::VPORDZ256rr, X86::VPORDZ256rm, 0 },
- { X86::VPORQZ128rr, X86::VPORQZ128rm, 0 },
- { X86::VPORQZ256rr, X86::VPORQZ256rm, 0 },
- { X86::VPSADBWZ128rr, X86::VPSADBWZ128rm, 0 },
- { X86::VPSADBWZ256rr, X86::VPSADBWZ256rm, 0 },
- { X86::VPSHUFBZ128rr, X86::VPSHUFBZ128rm, 0 },
- { X86::VPSHUFBZ256rr, X86::VPSHUFBZ256rm, 0 },
- { X86::VPSLLDZ128rr, X86::VPSLLDZ128rm, 0 },
- { X86::VPSLLDZ256rr, X86::VPSLLDZ256rm, 0 },
- { X86::VPSLLQZ128rr, X86::VPSLLQZ128rm, 0 },
- { X86::VPSLLQZ256rr, X86::VPSLLQZ256rm, 0 },
- { X86::VPSLLVDZ128rr, X86::VPSLLVDZ128rm, 0 },
- { X86::VPSLLVDZ256rr, X86::VPSLLVDZ256rm, 0 },
- { X86::VPSLLVQZ128rr, X86::VPSLLVQZ128rm, 0 },
- { X86::VPSLLVQZ256rr, X86::VPSLLVQZ256rm, 0 },
- { X86::VPSLLVWZ128rr, X86::VPSLLVWZ128rm, 0 },
- { X86::VPSLLVWZ256rr, X86::VPSLLVWZ256rm, 0 },
- { X86::VPSLLWZ128rr, X86::VPSLLWZ128rm, 0 },
- { X86::VPSLLWZ256rr, X86::VPSLLWZ256rm, 0 },
- { X86::VPSRADZ128rr, X86::VPSRADZ128rm, 0 },
- { X86::VPSRADZ256rr, X86::VPSRADZ256rm, 0 },
- { X86::VPSRAQZ128rr, X86::VPSRAQZ128rm, 0 },
- { X86::VPSRAQZ256rr, X86::VPSRAQZ256rm, 0 },
- { X86::VPSRAVDZ128rr, X86::VPSRAVDZ128rm, 0 },
- { X86::VPSRAVDZ256rr, X86::VPSRAVDZ256rm, 0 },
- { X86::VPSRAVQZ128rr, X86::VPSRAVQZ128rm, 0 },
- { X86::VPSRAVQZ256rr, X86::VPSRAVQZ256rm, 0 },
- { X86::VPSRAVWZ128rr, X86::VPSRAVWZ128rm, 0 },
- { X86::VPSRAVWZ256rr, X86::VPSRAVWZ256rm, 0 },
- { X86::VPSRAWZ128rr, X86::VPSRAWZ128rm, 0 },
- { X86::VPSRAWZ256rr, X86::VPSRAWZ256rm, 0 },
- { X86::VPSRLDZ128rr, X86::VPSRLDZ128rm, 0 },
- { X86::VPSRLDZ256rr, X86::VPSRLDZ256rm, 0 },
- { X86::VPSRLQZ128rr, X86::VPSRLQZ128rm, 0 },
- { X86::VPSRLQZ256rr, X86::VPSRLQZ256rm, 0 },
- { X86::VPSRLVDZ128rr, X86::VPSRLVDZ128rm, 0 },
- { X86::VPSRLVDZ256rr, X86::VPSRLVDZ256rm, 0 },
- { X86::VPSRLVQZ128rr, X86::VPSRLVQZ128rm, 0 },
- { X86::VPSRLVQZ256rr, X86::VPSRLVQZ256rm, 0 },
- { X86::VPSRLVWZ128rr, X86::VPSRLVWZ128rm, 0 },
- { X86::VPSRLVWZ256rr, X86::VPSRLVWZ256rm, 0 },
- { X86::VPSRLWZ128rr, X86::VPSRLWZ128rm, 0 },
- { X86::VPSRLWZ256rr, X86::VPSRLWZ256rm, 0 },
- { X86::VPSUBBZ128rr, X86::VPSUBBZ128rm, 0 },
- { X86::VPSUBBZ256rr, X86::VPSUBBZ256rm, 0 },
- { X86::VPSUBDZ128rr, X86::VPSUBDZ128rm, 0 },
- { X86::VPSUBDZ256rr, X86::VPSUBDZ256rm, 0 },
- { X86::VPSUBQZ128rr, X86::VPSUBQZ128rm, 0 },
- { X86::VPSUBQZ256rr, X86::VPSUBQZ256rm, 0 },
- { X86::VPSUBSBZ128rr, X86::VPSUBSBZ128rm, 0 },
- { X86::VPSUBSBZ256rr, X86::VPSUBSBZ256rm, 0 },
- { X86::VPSUBSWZ128rr, X86::VPSUBSWZ128rm, 0 },
- { X86::VPSUBSWZ256rr, X86::VPSUBSWZ256rm, 0 },
- { X86::VPSUBUSBZ128rr, X86::VPSUBUSBZ128rm, 0 },
- { X86::VPSUBUSBZ256rr, X86::VPSUBUSBZ256rm, 0 },
- { X86::VPSUBUSWZ128rr, X86::VPSUBUSWZ128rm, 0 },
- { X86::VPSUBUSWZ256rr, X86::VPSUBUSWZ256rm, 0 },
- { X86::VPSUBWZ128rr, X86::VPSUBWZ128rm, 0 },
- { X86::VPSUBWZ256rr, X86::VPSUBWZ256rm, 0 },
- { X86::VPUNPCKHBWZ128rr, X86::VPUNPCKHBWZ128rm, 0 },
- { X86::VPUNPCKHBWZ256rr, X86::VPUNPCKHBWZ256rm, 0 },
- { X86::VPUNPCKHDQZ128rr, X86::VPUNPCKHDQZ128rm, 0 },
- { X86::VPUNPCKHDQZ256rr, X86::VPUNPCKHDQZ256rm, 0 },
- { X86::VPUNPCKHQDQZ128rr, X86::VPUNPCKHQDQZ128rm, 0 },
- { X86::VPUNPCKHQDQZ256rr, X86::VPUNPCKHQDQZ256rm, 0 },
- { X86::VPUNPCKHWDZ128rr, X86::VPUNPCKHWDZ128rm, 0 },
- { X86::VPUNPCKHWDZ256rr, X86::VPUNPCKHWDZ256rm, 0 },
- { X86::VPUNPCKLBWZ128rr, X86::VPUNPCKLBWZ128rm, 0 },
- { X86::VPUNPCKLBWZ256rr, X86::VPUNPCKLBWZ256rm, 0 },
- { X86::VPUNPCKLDQZ128rr, X86::VPUNPCKLDQZ128rm, 0 },
- { X86::VPUNPCKLDQZ256rr, X86::VPUNPCKLDQZ256rm, 0 },
- { X86::VPUNPCKLQDQZ128rr, X86::VPUNPCKLQDQZ128rm, 0 },
- { X86::VPUNPCKLQDQZ256rr, X86::VPUNPCKLQDQZ256rm, 0 },
- { X86::VPUNPCKLWDZ128rr, X86::VPUNPCKLWDZ128rm, 0 },
- { X86::VPUNPCKLWDZ256rr, X86::VPUNPCKLWDZ256rm, 0 },
- { X86::VPXORDZ128rr, X86::VPXORDZ128rm, 0 },
- { X86::VPXORDZ256rr, X86::VPXORDZ256rm, 0 },
- { X86::VPXORQZ128rr, X86::VPXORQZ128rm, 0 },
- { X86::VPXORQZ256rr, X86::VPXORQZ256rm, 0 },
- { X86::VSHUFPDZ128rri, X86::VSHUFPDZ128rmi, 0 },
- { X86::VSHUFPDZ256rri, X86::VSHUFPDZ256rmi, 0 },
- { X86::VSHUFPSZ128rri, X86::VSHUFPSZ128rmi, 0 },
- { X86::VSHUFPSZ256rri, X86::VSHUFPSZ256rmi, 0 },
- { X86::VSUBPDZ128rr, X86::VSUBPDZ128rm, 0 },
- { X86::VSUBPDZ256rr, X86::VSUBPDZ256rm, 0 },
- { X86::VSUBPSZ128rr, X86::VSUBPSZ128rm, 0 },
- { X86::VSUBPSZ256rr, X86::VSUBPSZ256rm, 0 },
- { X86::VUNPCKHPDZ128rr, X86::VUNPCKHPDZ128rm, 0 },
- { X86::VUNPCKHPDZ256rr, X86::VUNPCKHPDZ256rm, 0 },
- { X86::VUNPCKHPSZ128rr, X86::VUNPCKHPSZ128rm, 0 },
- { X86::VUNPCKHPSZ256rr, X86::VUNPCKHPSZ256rm, 0 },
- { X86::VUNPCKLPDZ128rr, X86::VUNPCKLPDZ128rm, 0 },
- { X86::VUNPCKLPDZ256rr, X86::VUNPCKLPDZ256rm, 0 },
- { X86::VUNPCKLPSZ128rr, X86::VUNPCKLPSZ128rm, 0 },
- { X86::VUNPCKLPSZ256rr, X86::VUNPCKLPSZ256rm, 0 },
- { X86::VXORPDZ128rr, X86::VXORPDZ128rm, 0 },
- { X86::VXORPDZ256rr, X86::VXORPDZ256rm, 0 },
- { X86::VXORPSZ128rr, X86::VXORPSZ128rm, 0 },
- { X86::VXORPSZ256rr, X86::VXORPSZ256rm, 0 },
-
- // AVX-512 masked foldable instructions
- { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE },
- { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE },
- { X86::VPABSBZrrkz, X86::VPABSBZrmkz, 0 },
- { X86::VPABSDZrrkz, X86::VPABSDZrmkz, 0 },
- { X86::VPABSQZrrkz, X86::VPABSQZrmkz, 0 },
- { X86::VPABSWZrrkz, X86::VPABSWZrmkz, 0 },
- { X86::VPCONFLICTDZrrkz, X86::VPCONFLICTDZrmkz, 0 },
- { X86::VPCONFLICTQZrrkz, X86::VPCONFLICTQZrmkz, 0 },
- { X86::VPERMILPDZrikz, X86::VPERMILPDZmikz, 0 },
- { X86::VPERMILPSZrikz, X86::VPERMILPSZmikz, 0 },
- { X86::VPERMPDZrikz, X86::VPERMPDZmikz, 0 },
- { X86::VPERMQZrikz, X86::VPERMQZmikz, 0 },
- { X86::VPLZCNTDZrrkz, X86::VPLZCNTDZrmkz, 0 },
- { X86::VPLZCNTQZrrkz, X86::VPLZCNTQZrmkz, 0 },
- { X86::VPMOVSXBDZrrkz, X86::VPMOVSXBDZrmkz, 0 },
- { X86::VPMOVSXBQZrrkz, X86::VPMOVSXBQZrmkz, TB_NO_REVERSE },
- { X86::VPMOVSXBWZrrkz, X86::VPMOVSXBWZrmkz, 0 },
- { X86::VPMOVSXDQZrrkz, X86::VPMOVSXDQZrmkz, 0 },
- { X86::VPMOVSXWDZrrkz, X86::VPMOVSXWDZrmkz, 0 },
- { X86::VPMOVSXWQZrrkz, X86::VPMOVSXWQZrmkz, 0 },
- { X86::VPMOVZXBDZrrkz, X86::VPMOVZXBDZrmkz, 0 },
- { X86::VPMOVZXBQZrrkz, X86::VPMOVZXBQZrmkz, TB_NO_REVERSE },
- { X86::VPMOVZXBWZrrkz, X86::VPMOVZXBWZrmkz, 0 },
- { X86::VPMOVZXDQZrrkz, X86::VPMOVZXDQZrmkz, 0 },
- { X86::VPMOVZXWDZrrkz, X86::VPMOVZXWDZrmkz, 0 },
- { X86::VPMOVZXWQZrrkz, X86::VPMOVZXWQZrmkz, 0 },
- { X86::VPOPCNTDZrrkz, X86::VPOPCNTDZrmkz, 0 },
- { X86::VPOPCNTQZrrkz, X86::VPOPCNTQZrmkz, 0 },
- { X86::VPSHUFDZrikz, X86::VPSHUFDZmikz, 0 },
- { X86::VPSHUFHWZrikz, X86::VPSHUFHWZmikz, 0 },
- { X86::VPSHUFLWZrikz, X86::VPSHUFLWZmikz, 0 },
- { X86::VPSLLDZrikz, X86::VPSLLDZmikz, 0 },
- { X86::VPSLLQZrikz, X86::VPSLLQZmikz, 0 },
- { X86::VPSLLWZrikz, X86::VPSLLWZmikz, 0 },
- { X86::VPSRADZrikz, X86::VPSRADZmikz, 0 },
- { X86::VPSRAQZrikz, X86::VPSRAQZmikz, 0 },
- { X86::VPSRAWZrikz, X86::VPSRAWZmikz, 0 },
- { X86::VPSRLDZrikz, X86::VPSRLDZmikz, 0 },
- { X86::VPSRLQZrikz, X86::VPSRLQZmikz, 0 },
- { X86::VPSRLWZrikz, X86::VPSRLWZmikz, 0 },
-
- // AVX-512VL 256-bit masked foldable instructions
- { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE },
- { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE },
- { X86::VPABSBZ256rrkz, X86::VPABSBZ256rmkz, 0 },
- { X86::VPABSDZ256rrkz, X86::VPABSDZ256rmkz, 0 },
- { X86::VPABSQZ256rrkz, X86::VPABSQZ256rmkz, 0 },
- { X86::VPABSWZ256rrkz, X86::VPABSWZ256rmkz, 0 },
- { X86::VPCONFLICTDZ256rrkz, X86::VPCONFLICTDZ256rmkz, 0 },
- { X86::VPCONFLICTQZ256rrkz, X86::VPCONFLICTQZ256rmkz, 0 },
- { X86::VPERMILPDZ256rikz, X86::VPERMILPDZ256mikz, 0 },
- { X86::VPERMILPSZ256rikz, X86::VPERMILPSZ256mikz, 0 },
- { X86::VPERMPDZ256rikz, X86::VPERMPDZ256mikz, 0 },
- { X86::VPERMQZ256rikz, X86::VPERMQZ256mikz, 0 },
- { X86::VPLZCNTDZ256rrkz, X86::VPLZCNTDZ256rmkz, 0 },
- { X86::VPLZCNTQZ256rrkz, X86::VPLZCNTQZ256rmkz, 0 },
- { X86::VPMOVSXBDZ256rrkz, X86::VPMOVSXBDZ256rmkz, TB_NO_REVERSE },
- { X86::VPMOVSXBQZ256rrkz, X86::VPMOVSXBQZ256rmkz, TB_NO_REVERSE },
- { X86::VPMOVSXBWZ256rrkz, X86::VPMOVSXBWZ256rmkz, 0 },
- { X86::VPMOVSXDQZ256rrkz, X86::VPMOVSXDQZ256rmkz, 0 },
- { X86::VPMOVSXWDZ256rrkz, X86::VPMOVSXWDZ256rmkz, 0 },
- { X86::VPMOVSXWQZ256rrkz, X86::VPMOVSXWQZ256rmkz, TB_NO_REVERSE },
- { X86::VPMOVZXBDZ256rrkz, X86::VPMOVZXBDZ256rmkz, TB_NO_REVERSE },
- { X86::VPMOVZXBQZ256rrkz, X86::VPMOVZXBQZ256rmkz, TB_NO_REVERSE },
- { X86::VPMOVZXBWZ256rrkz, X86::VPMOVZXBWZ256rmkz, 0 },
- { X86::VPMOVZXDQZ256rrkz, X86::VPMOVZXDQZ256rmkz, 0 },
- { X86::VPMOVZXWDZ256rrkz, X86::VPMOVZXWDZ256rmkz, 0 },
- { X86::VPMOVZXWQZ256rrkz, X86::VPMOVZXWQZ256rmkz, TB_NO_REVERSE },
- { X86::VPSHUFDZ256rikz, X86::VPSHUFDZ256mikz, 0 },
- { X86::VPSHUFHWZ256rikz, X86::VPSHUFHWZ256mikz, 0 },
- { X86::VPSHUFLWZ256rikz, X86::VPSHUFLWZ256mikz, 0 },
- { X86::VPSLLDZ256rikz, X86::VPSLLDZ256mikz, 0 },
- { X86::VPSLLQZ256rikz, X86::VPSLLQZ256mikz, 0 },
- { X86::VPSLLWZ256rikz, X86::VPSLLWZ256mikz, 0 },
- { X86::VPSRADZ256rikz, X86::VPSRADZ256mikz, 0 },
- { X86::VPSRAQZ256rikz, X86::VPSRAQZ256mikz, 0 },
- { X86::VPSRAWZ256rikz, X86::VPSRAWZ256mikz, 0 },
- { X86::VPSRLDZ256rikz, X86::VPSRLDZ256mikz, 0 },
- { X86::VPSRLQZ256rikz, X86::VPSRLQZ256mikz, 0 },
- { X86::VPSRLWZ256rikz, X86::VPSRLWZ256mikz, 0 },
-
- // AVX-512VL 128-bit masked foldable instructions
- { X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE },
- { X86::VPABSBZ128rrkz, X86::VPABSBZ128rmkz, 0 },
- { X86::VPABSDZ128rrkz, X86::VPABSDZ128rmkz, 0 },
- { X86::VPABSQZ128rrkz, X86::VPABSQZ128rmkz, 0 },
- { X86::VPABSWZ128rrkz, X86::VPABSWZ128rmkz, 0 },
- { X86::VPCONFLICTDZ128rrkz, X86::VPCONFLICTDZ128rmkz, 0 },
- { X86::VPCONFLICTQZ128rrkz, X86::VPCONFLICTQZ128rmkz, 0 },
- { X86::VPERMILPDZ128rikz, X86::VPERMILPDZ128mikz, 0 },
- { X86::VPERMILPSZ128rikz, X86::VPERMILPSZ128mikz, 0 },
- { X86::VPLZCNTDZ128rrkz, X86::VPLZCNTDZ128rmkz, 0 },
- { X86::VPLZCNTQZ128rrkz, X86::VPLZCNTQZ128rmkz, 0 },
- { X86::VPMOVSXBDZ128rrkz, X86::VPMOVSXBDZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVSXBQZ128rrkz, X86::VPMOVSXBQZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVSXBWZ128rrkz, X86::VPMOVSXBWZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVSXDQZ128rrkz, X86::VPMOVSXDQZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVSXWDZ128rrkz, X86::VPMOVSXWDZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVSXWQZ128rrkz, X86::VPMOVSXWQZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVZXBDZ128rrkz, X86::VPMOVZXBDZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVZXBQZ128rrkz, X86::VPMOVZXBQZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVZXBWZ128rrkz, X86::VPMOVZXBWZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVZXDQZ128rrkz, X86::VPMOVZXDQZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVZXWDZ128rrkz, X86::VPMOVZXWDZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVZXWQZ128rrkz, X86::VPMOVZXWQZ128rmkz, TB_NO_REVERSE },
- { X86::VPSHUFDZ128rikz, X86::VPSHUFDZ128mikz, 0 },
- { X86::VPSHUFHWZ128rikz, X86::VPSHUFHWZ128mikz, 0 },
- { X86::VPSHUFLWZ128rikz, X86::VPSHUFLWZ128mikz, 0 },
- { X86::VPSLLDZ128rikz, X86::VPSLLDZ128mikz, 0 },
- { X86::VPSLLQZ128rikz, X86::VPSLLQZ128mikz, 0 },
- { X86::VPSLLWZ128rikz, X86::VPSLLWZ128mikz, 0 },
- { X86::VPSRADZ128rikz, X86::VPSRADZ128mikz, 0 },
- { X86::VPSRAQZ128rikz, X86::VPSRAQZ128mikz, 0 },
- { X86::VPSRAWZ128rikz, X86::VPSRAWZ128mikz, 0 },
- { X86::VPSRLDZ128rikz, X86::VPSRLDZ128mikz, 0 },
- { X86::VPSRLQZ128rikz, X86::VPSRLQZ128mikz, 0 },
- { X86::VPSRLWZ128rikz, X86::VPSRLWZ128mikz, 0 },
-
- // AES foldable instructions
- { X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16 },
- { X86::AESDECrr, X86::AESDECrm, TB_ALIGN_16 },
- { X86::AESENCLASTrr, X86::AESENCLASTrm, TB_ALIGN_16 },
- { X86::AESENCrr, X86::AESENCrm, TB_ALIGN_16 },
- { X86::VAESDECLASTrr, X86::VAESDECLASTrm, 0 },
- { X86::VAESDECrr, X86::VAESDECrm, 0 },
- { X86::VAESENCLASTrr, X86::VAESENCLASTrm, 0 },
- { X86::VAESENCrr, X86::VAESENCrm, 0 },
-
- // SHA foldable instructions
- { X86::SHA1MSG1rr, X86::SHA1MSG1rm, TB_ALIGN_16 },
- { X86::SHA1MSG2rr, X86::SHA1MSG2rm, TB_ALIGN_16 },
- { X86::SHA1NEXTErr, X86::SHA1NEXTErm, TB_ALIGN_16 },
- { X86::SHA1RNDS4rri, X86::SHA1RNDS4rmi, TB_ALIGN_16 },
- { X86::SHA256MSG1rr, X86::SHA256MSG1rm, TB_ALIGN_16 },
- { X86::SHA256MSG2rr, X86::SHA256MSG2rm, TB_ALIGN_16 },
- { X86::SHA256RNDS2rr, X86::SHA256RNDS2rm, TB_ALIGN_16 }
- };
-
- for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2) {
- AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable,
- Entry.RegOp, Entry.MemOp,
- // Index 2, folded load
- Entry.Flags | TB_INDEX_2 | TB_FOLDED_LOAD);
- }
-
- static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
- // FMA4 foldable patterns
- { X86::VFMADDSS4rr, X86::VFMADDSS4rm, TB_ALIGN_NONE },
- { X86::VFMADDSS4rr_Int, X86::VFMADDSS4rm_Int, TB_NO_REVERSE },
- { X86::VFMADDSD4rr, X86::VFMADDSD4rm, TB_ALIGN_NONE },
- { X86::VFMADDSD4rr_Int, X86::VFMADDSD4rm_Int, TB_NO_REVERSE },
- { X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_NONE },
- { X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_NONE },
- { X86::VFMADDPS4Yrr, X86::VFMADDPS4Yrm, TB_ALIGN_NONE },
- { X86::VFMADDPD4Yrr, X86::VFMADDPD4Yrm, TB_ALIGN_NONE },
- { X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, TB_ALIGN_NONE },
- { X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4rm_Int, TB_NO_REVERSE },
- { X86::VFNMADDSD4rr, X86::VFNMADDSD4rm, TB_ALIGN_NONE },
- { X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4rm_Int, TB_NO_REVERSE },
- { X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, TB_ALIGN_NONE },
- { X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_NONE },
- { X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Yrm, TB_ALIGN_NONE },
- { X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Yrm, TB_ALIGN_NONE },
- { X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, TB_ALIGN_NONE },
- { X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4rm_Int, TB_NO_REVERSE },
- { X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, TB_ALIGN_NONE },
- { X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4rm_Int, TB_NO_REVERSE },
- { X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_NONE },
- { X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_NONE },
- { X86::VFMSUBPS4Yrr, X86::VFMSUBPS4Yrm, TB_ALIGN_NONE },
- { X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Yrm, TB_ALIGN_NONE },
- { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, TB_ALIGN_NONE },
- { X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4rm_Int, TB_NO_REVERSE },
- { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4rm, TB_ALIGN_NONE },
- { X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4rm_Int, TB_NO_REVERSE },
- { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_NONE },
- { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_NONE },
- { X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Yrm, TB_ALIGN_NONE },
- { X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Yrm, TB_ALIGN_NONE },
- { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, TB_ALIGN_NONE },
- { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, TB_ALIGN_NONE },
- { X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Yrm, TB_ALIGN_NONE },
- { X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Yrm, TB_ALIGN_NONE },
- { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, TB_ALIGN_NONE },
- { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_NONE },
- { X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Yrm, TB_ALIGN_NONE },
- { X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Yrm, TB_ALIGN_NONE },
-
- // XOP foldable instructions
- { X86::VPCMOVrrr, X86::VPCMOVrrm, 0 },
- { X86::VPCMOVYrrr, X86::VPCMOVYrrm, 0 },
- { X86::VPERMIL2PDrr, X86::VPERMIL2PDrm, 0 },
- { X86::VPERMIL2PDYrr, X86::VPERMIL2PDYrm, 0 },
- { X86::VPERMIL2PSrr, X86::VPERMIL2PSrm, 0 },
- { X86::VPERMIL2PSYrr, X86::VPERMIL2PSYrm, 0 },
- { X86::VPPERMrrr, X86::VPPERMrrm, 0 },
-
- // AVX-512 instructions with 3 source operands.
- { X86::VPERMI2Brr, X86::VPERMI2Brm, 0 },
- { X86::VPERMI2Drr, X86::VPERMI2Drm, 0 },
- { X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 },
- { X86::VPERMI2PDrr, X86::VPERMI2PDrm, 0 },
- { X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 },
- { X86::VPERMI2Wrr, X86::VPERMI2Wrm, 0 },
- { X86::VPERMT2Brr, X86::VPERMT2Brm, 0 },
- { X86::VPERMT2Drr, X86::VPERMT2Drm, 0 },
- { X86::VPERMT2PSrr, X86::VPERMT2PSrm, 0 },
- { X86::VPERMT2PDrr, X86::VPERMT2PDrm, 0 },
- { X86::VPERMT2Qrr, X86::VPERMT2Qrm, 0 },
- { X86::VPERMT2Wrr, X86::VPERMT2Wrm, 0 },
- { X86::VPMADD52HUQZr, X86::VPMADD52HUQZm, 0 },
- { X86::VPMADD52LUQZr, X86::VPMADD52LUQZm, 0 },
- { X86::VPTERNLOGDZrri, X86::VPTERNLOGDZrmi, 0 },
- { X86::VPTERNLOGQZrri, X86::VPTERNLOGQZrmi, 0 },
-
- // AVX-512VL 256-bit instructions with 3 source operands.
- { X86::VPERMI2B256rr, X86::VPERMI2B256rm, 0 },
- { X86::VPERMI2D256rr, X86::VPERMI2D256rm, 0 },
- { X86::VPERMI2PD256rr, X86::VPERMI2PD256rm, 0 },
- { X86::VPERMI2PS256rr, X86::VPERMI2PS256rm, 0 },
- { X86::VPERMI2Q256rr, X86::VPERMI2Q256rm, 0 },
- { X86::VPERMI2W256rr, X86::VPERMI2W256rm, 0 },
- { X86::VPERMT2B256rr, X86::VPERMT2B256rm, 0 },
- { X86::VPERMT2D256rr, X86::VPERMT2D256rm, 0 },
- { X86::VPERMT2PD256rr, X86::VPERMT2PD256rm, 0 },
- { X86::VPERMT2PS256rr, X86::VPERMT2PS256rm, 0 },
- { X86::VPERMT2Q256rr, X86::VPERMT2Q256rm, 0 },
- { X86::VPERMT2W256rr, X86::VPERMT2W256rm, 0 },
- { X86::VPMADD52HUQZ256r, X86::VPMADD52HUQZ256m, 0 },
- { X86::VPMADD52LUQZ256r, X86::VPMADD52LUQZ256m, 0 },
- { X86::VPTERNLOGDZ256rri, X86::VPTERNLOGDZ256rmi, 0 },
- { X86::VPTERNLOGQZ256rri, X86::VPTERNLOGQZ256rmi, 0 },
-
- // AVX-512VL 128-bit instructions with 3 source operands.
- { X86::VPERMI2B128rr, X86::VPERMI2B128rm, 0 },
- { X86::VPERMI2D128rr, X86::VPERMI2D128rm, 0 },
- { X86::VPERMI2PD128rr, X86::VPERMI2PD128rm, 0 },
- { X86::VPERMI2PS128rr, X86::VPERMI2PS128rm, 0 },
- { X86::VPERMI2Q128rr, X86::VPERMI2Q128rm, 0 },
- { X86::VPERMI2W128rr, X86::VPERMI2W128rm, 0 },
- { X86::VPERMT2B128rr, X86::VPERMT2B128rm, 0 },
- { X86::VPERMT2D128rr, X86::VPERMT2D128rm, 0 },
- { X86::VPERMT2PD128rr, X86::VPERMT2PD128rm, 0 },
- { X86::VPERMT2PS128rr, X86::VPERMT2PS128rm, 0 },
- { X86::VPERMT2Q128rr, X86::VPERMT2Q128rm, 0 },
- { X86::VPERMT2W128rr, X86::VPERMT2W128rm, 0 },
- { X86::VPMADD52HUQZ128r, X86::VPMADD52HUQZ128m, 0 },
- { X86::VPMADD52LUQZ128r, X86::VPMADD52LUQZ128m, 0 },
- { X86::VPTERNLOGDZ128rri, X86::VPTERNLOGDZ128rmi, 0 },
- { X86::VPTERNLOGQZ128rri, X86::VPTERNLOGQZ128rmi, 0 },
-
- // AVX-512 masked instructions
- { X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0 },
- { X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 },
- { X86::VADDSDZrr_Intkz, X86::VADDSDZrm_Intkz, TB_NO_REVERSE },
- { X86::VADDSSZrr_Intkz, X86::VADDSSZrm_Intkz, TB_NO_REVERSE },
- { X86::VALIGNDZrrikz, X86::VALIGNDZrmikz, 0 },
- { X86::VALIGNQZrrikz, X86::VALIGNQZrmikz, 0 },
- { X86::VANDNPDZrrkz, X86::VANDNPDZrmkz, 0 },
- { X86::VANDNPSZrrkz, X86::VANDNPSZrmkz, 0 },
- { X86::VANDPDZrrkz, X86::VANDPDZrmkz, 0 },
- { X86::VANDPSZrrkz, X86::VANDPSZrmkz, 0 },
- { X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0 },
- { X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 },
- { X86::VDIVSDZrr_Intkz, X86::VDIVSDZrm_Intkz, TB_NO_REVERSE },
- { X86::VDIVSSZrr_Intkz, X86::VDIVSSZrm_Intkz, TB_NO_REVERSE },
- { X86::VINSERTF32x4Zrrkz, X86::VINSERTF32x4Zrmkz, 0 },
- { X86::VINSERTF32x8Zrrkz, X86::VINSERTF32x8Zrmkz, 0 },
- { X86::VINSERTF64x2Zrrkz, X86::VINSERTF64x2Zrmkz, 0 },
- { X86::VINSERTF64x4Zrrkz, X86::VINSERTF64x4Zrmkz, 0 },
- { X86::VINSERTI32x4Zrrkz, X86::VINSERTI32x4Zrmkz, 0 },
- { X86::VINSERTI32x8Zrrkz, X86::VINSERTI32x8Zrmkz, 0 },
- { X86::VINSERTI64x2Zrrkz, X86::VINSERTI64x2Zrmkz, 0 },
- { X86::VINSERTI64x4Zrrkz, X86::VINSERTI64x4Zrmkz, 0 },
- { X86::VMAXCPDZrrkz, X86::VMAXCPDZrmkz, 0 },
- { X86::VMAXCPSZrrkz, X86::VMAXCPSZrmkz, 0 },
- { X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0 },
- { X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0 },
- { X86::VMAXSDZrr_Intkz, X86::VMAXSDZrm_Intkz, 0 },
- { X86::VMAXSSZrr_Intkz, X86::VMAXSSZrm_Intkz, 0 },
- { X86::VMINCPDZrrkz, X86::VMINCPDZrmkz, 0 },
- { X86::VMINCPSZrrkz, X86::VMINCPSZrmkz, 0 },
- { X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 },
- { X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 },
- { X86::VMINSDZrr_Intkz, X86::VMINSDZrm_Intkz, 0 },
- { X86::VMINSSZrr_Intkz, X86::VMINSSZrm_Intkz, 0 },
- { X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 },
- { X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 },
- { X86::VMULSDZrr_Intkz, X86::VMULSDZrm_Intkz, TB_NO_REVERSE },
- { X86::VMULSSZrr_Intkz, X86::VMULSSZrm_Intkz, TB_NO_REVERSE },
- { X86::VORPDZrrkz, X86::VORPDZrmkz, 0 },
- { X86::VORPSZrrkz, X86::VORPSZrmkz, 0 },
- { X86::VPACKSSDWZrrkz, X86::VPACKSSDWZrmkz, 0 },
- { X86::VPACKSSWBZrrkz, X86::VPACKSSWBZrmkz, 0 },
- { X86::VPACKUSDWZrrkz, X86::VPACKUSDWZrmkz, 0 },
- { X86::VPACKUSWBZrrkz, X86::VPACKUSWBZrmkz, 0 },
- { X86::VPADDBZrrkz, X86::VPADDBZrmkz, 0 },
- { X86::VPADDDZrrkz, X86::VPADDDZrmkz, 0 },
- { X86::VPADDQZrrkz, X86::VPADDQZrmkz, 0 },
- { X86::VPADDSBZrrkz, X86::VPADDSBZrmkz, 0 },
- { X86::VPADDSWZrrkz, X86::VPADDSWZrmkz, 0 },
- { X86::VPADDUSBZrrkz, X86::VPADDUSBZrmkz, 0 },
- { X86::VPADDUSWZrrkz, X86::VPADDUSWZrmkz, 0 },
- { X86::VPADDWZrrkz, X86::VPADDWZrmkz, 0 },
- { X86::VPALIGNRZrrikz, X86::VPALIGNRZrmikz, 0 },
- { X86::VPANDDZrrkz, X86::VPANDDZrmkz, 0 },
- { X86::VPANDNDZrrkz, X86::VPANDNDZrmkz, 0 },
- { X86::VPANDNQZrrkz, X86::VPANDNQZrmkz, 0 },
- { X86::VPANDQZrrkz, X86::VPANDQZrmkz, 0 },
- { X86::VPAVGBZrrkz, X86::VPAVGBZrmkz, 0 },
- { X86::VPAVGWZrrkz, X86::VPAVGWZrmkz, 0 },
- { X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0 },
- { X86::VPERMDZrrkz, X86::VPERMDZrmkz, 0 },
- { X86::VPERMILPDZrrkz, X86::VPERMILPDZrmkz, 0 },
- { X86::VPERMILPSZrrkz, X86::VPERMILPSZrmkz, 0 },
- { X86::VPERMPDZrrkz, X86::VPERMPDZrmkz, 0 },
- { X86::VPERMPSZrrkz, X86::VPERMPSZrmkz, 0 },
- { X86::VPERMQZrrkz, X86::VPERMQZrmkz, 0 },
- { X86::VPERMWZrrkz, X86::VPERMWZrmkz, 0 },
- { X86::VPMADDUBSWZrrkz, X86::VPMADDUBSWZrmkz, 0 },
- { X86::VPMADDWDZrrkz, X86::VPMADDWDZrmkz, 0 },
- { X86::VPMAXSBZrrkz, X86::VPMAXSBZrmkz, 0 },
- { X86::VPMAXSDZrrkz, X86::VPMAXSDZrmkz, 0 },
- { X86::VPMAXSQZrrkz, X86::VPMAXSQZrmkz, 0 },
- { X86::VPMAXSWZrrkz, X86::VPMAXSWZrmkz, 0 },
- { X86::VPMAXUBZrrkz, X86::VPMAXUBZrmkz, 0 },
- { X86::VPMAXUDZrrkz, X86::VPMAXUDZrmkz, 0 },
- { X86::VPMAXUQZrrkz, X86::VPMAXUQZrmkz, 0 },
- { X86::VPMAXUWZrrkz, X86::VPMAXUWZrmkz, 0 },
- { X86::VPMINSBZrrkz, X86::VPMINSBZrmkz, 0 },
- { X86::VPMINSDZrrkz, X86::VPMINSDZrmkz, 0 },
- { X86::VPMINSQZrrkz, X86::VPMINSQZrmkz, 0 },
- { X86::VPMINSWZrrkz, X86::VPMINSWZrmkz, 0 },
- { X86::VPMINUBZrrkz, X86::VPMINUBZrmkz, 0 },
- { X86::VPMINUDZrrkz, X86::VPMINUDZrmkz, 0 },
- { X86::VPMINUQZrrkz, X86::VPMINUQZrmkz, 0 },
- { X86::VPMINUWZrrkz, X86::VPMINUWZrmkz, 0 },
- { X86::VPMULLDZrrkz, X86::VPMULLDZrmkz, 0 },
- { X86::VPMULLQZrrkz, X86::VPMULLQZrmkz, 0 },
- { X86::VPMULLWZrrkz, X86::VPMULLWZrmkz, 0 },
- { X86::VPMULDQZrrkz, X86::VPMULDQZrmkz, 0 },
- { X86::VPMULUDQZrrkz, X86::VPMULUDQZrmkz, 0 },
- { X86::VPORDZrrkz, X86::VPORDZrmkz, 0 },
- { X86::VPORQZrrkz, X86::VPORQZrmkz, 0 },
- { X86::VPSHUFBZrrkz, X86::VPSHUFBZrmkz, 0 },
- { X86::VPSLLDZrrkz, X86::VPSLLDZrmkz, 0 },
- { X86::VPSLLQZrrkz, X86::VPSLLQZrmkz, 0 },
- { X86::VPSLLVDZrrkz, X86::VPSLLVDZrmkz, 0 },
- { X86::VPSLLVQZrrkz, X86::VPSLLVQZrmkz, 0 },
- { X86::VPSLLVWZrrkz, X86::VPSLLVWZrmkz, 0 },
- { X86::VPSLLWZrrkz, X86::VPSLLWZrmkz, 0 },
- { X86::VPSRADZrrkz, X86::VPSRADZrmkz, 0 },
- { X86::VPSRAQZrrkz, X86::VPSRAQZrmkz, 0 },
- { X86::VPSRAVDZrrkz, X86::VPSRAVDZrmkz, 0 },
- { X86::VPSRAVQZrrkz, X86::VPSRAVQZrmkz, 0 },
- { X86::VPSRAVWZrrkz, X86::VPSRAVWZrmkz, 0 },
- { X86::VPSRAWZrrkz, X86::VPSRAWZrmkz, 0 },
- { X86::VPSRLDZrrkz, X86::VPSRLDZrmkz, 0 },
- { X86::VPSRLQZrrkz, X86::VPSRLQZrmkz, 0 },
- { X86::VPSRLVDZrrkz, X86::VPSRLVDZrmkz, 0 },
- { X86::VPSRLVQZrrkz, X86::VPSRLVQZrmkz, 0 },
- { X86::VPSRLVWZrrkz, X86::VPSRLVWZrmkz, 0 },
- { X86::VPSRLWZrrkz, X86::VPSRLWZrmkz, 0 },
- { X86::VPSUBBZrrkz, X86::VPSUBBZrmkz, 0 },
- { X86::VPSUBDZrrkz, X86::VPSUBDZrmkz, 0 },
- { X86::VPSUBQZrrkz, X86::VPSUBQZrmkz, 0 },
- { X86::VPSUBSBZrrkz, X86::VPSUBSBZrmkz, 0 },
- { X86::VPSUBSWZrrkz, X86::VPSUBSWZrmkz, 0 },
- { X86::VPSUBUSBZrrkz, X86::VPSUBUSBZrmkz, 0 },
- { X86::VPSUBUSWZrrkz, X86::VPSUBUSWZrmkz, 0 },
- { X86::VPSUBWZrrkz, X86::VPSUBWZrmkz, 0 },
- { X86::VPUNPCKHBWZrrkz, X86::VPUNPCKHBWZrmkz, 0 },
- { X86::VPUNPCKHDQZrrkz, X86::VPUNPCKHDQZrmkz, 0 },
- { X86::VPUNPCKHQDQZrrkz, X86::VPUNPCKHQDQZrmkz, 0 },
- { X86::VPUNPCKHWDZrrkz, X86::VPUNPCKHWDZrmkz, 0 },
- { X86::VPUNPCKLBWZrrkz, X86::VPUNPCKLBWZrmkz, 0 },
- { X86::VPUNPCKLDQZrrkz, X86::VPUNPCKLDQZrmkz, 0 },
- { X86::VPUNPCKLQDQZrrkz, X86::VPUNPCKLQDQZrmkz, 0 },
- { X86::VPUNPCKLWDZrrkz, X86::VPUNPCKLWDZrmkz, 0 },
- { X86::VPXORDZrrkz, X86::VPXORDZrmkz, 0 },
- { X86::VPXORQZrrkz, X86::VPXORQZrmkz, 0 },
- { X86::VSHUFPDZrrikz, X86::VSHUFPDZrmikz, 0 },
- { X86::VSHUFPSZrrikz, X86::VSHUFPSZrmikz, 0 },
- { X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 },
- { X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 },
- { X86::VSUBSDZrr_Intkz, X86::VSUBSDZrm_Intkz, TB_NO_REVERSE },
- { X86::VSUBSSZrr_Intkz, X86::VSUBSSZrm_Intkz, TB_NO_REVERSE },
- { X86::VUNPCKHPDZrrkz, X86::VUNPCKHPDZrmkz, 0 },
- { X86::VUNPCKHPSZrrkz, X86::VUNPCKHPSZrmkz, 0 },
- { X86::VUNPCKLPDZrrkz, X86::VUNPCKLPDZrmkz, 0 },
- { X86::VUNPCKLPSZrrkz, X86::VUNPCKLPSZrmkz, 0 },
- { X86::VXORPDZrrkz, X86::VXORPDZrmkz, 0 },
- { X86::VXORPSZrrkz, X86::VXORPSZrmkz, 0 },
-
- // AVX-512{F,VL} masked arithmetic instructions 256-bit
- { X86::VADDPDZ256rrkz, X86::VADDPDZ256rmkz, 0 },
- { X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0 },
- { X86::VALIGNDZ256rrikz, X86::VALIGNDZ256rmikz, 0 },
- { X86::VALIGNQZ256rrikz, X86::VALIGNQZ256rmikz, 0 },
- { X86::VANDNPDZ256rrkz, X86::VANDNPDZ256rmkz, 0 },
- { X86::VANDNPSZ256rrkz, X86::VANDNPSZ256rmkz, 0 },
- { X86::VANDPDZ256rrkz, X86::VANDPDZ256rmkz, 0 },
- { X86::VANDPSZ256rrkz, X86::VANDPSZ256rmkz, 0 },
- { X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmkz, 0 },
- { X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0 },
- { X86::VINSERTF32x4Z256rrkz, X86::VINSERTF32x4Z256rmkz, 0 },
- { X86::VINSERTF64x2Z256rrkz, X86::VINSERTF64x2Z256rmkz, 0 },
- { X86::VINSERTI32x4Z256rrkz, X86::VINSERTI32x4Z256rmkz, 0 },
- { X86::VINSERTI64x2Z256rrkz, X86::VINSERTI64x2Z256rmkz, 0 },
- { X86::VMAXCPDZ256rrkz, X86::VMAXCPDZ256rmkz, 0 },
- { X86::VMAXCPSZ256rrkz, X86::VMAXCPSZ256rmkz, 0 },
- { X86::VMAXPDZ256rrkz, X86::VMAXPDZ256rmkz, 0 },
- { X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0 },
- { X86::VMINCPDZ256rrkz, X86::VMINCPDZ256rmkz, 0 },
- { X86::VMINCPSZ256rrkz, X86::VMINCPSZ256rmkz, 0 },
- { X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0 },
- { X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0 },
- { X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0 },
- { X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 },
- { X86::VORPDZ256rrkz, X86::VORPDZ256rmkz, 0 },
- { X86::VORPSZ256rrkz, X86::VORPSZ256rmkz, 0 },
- { X86::VPACKSSDWZ256rrkz, X86::VPACKSSDWZ256rmkz, 0 },
- { X86::VPACKSSWBZ256rrkz, X86::VPACKSSWBZ256rmkz, 0 },
- { X86::VPACKUSDWZ256rrkz, X86::VPACKUSDWZ256rmkz, 0 },
- { X86::VPACKUSWBZ256rrkz, X86::VPACKUSWBZ256rmkz, 0 },
- { X86::VPADDBZ256rrkz, X86::VPADDBZ256rmkz, 0 },
- { X86::VPADDDZ256rrkz, X86::VPADDDZ256rmkz, 0 },
- { X86::VPADDQZ256rrkz, X86::VPADDQZ256rmkz, 0 },
- { X86::VPADDSBZ256rrkz, X86::VPADDSBZ256rmkz, 0 },
- { X86::VPADDSWZ256rrkz, X86::VPADDSWZ256rmkz, 0 },
- { X86::VPADDUSBZ256rrkz, X86::VPADDUSBZ256rmkz, 0 },
- { X86::VPADDUSWZ256rrkz, X86::VPADDUSWZ256rmkz, 0 },
- { X86::VPADDWZ256rrkz, X86::VPADDWZ256rmkz, 0 },
- { X86::VPALIGNRZ256rrikz, X86::VPALIGNRZ256rmikz, 0 },
- { X86::VPANDDZ256rrkz, X86::VPANDDZ256rmkz, 0 },
- { X86::VPANDNDZ256rrkz, X86::VPANDNDZ256rmkz, 0 },
- { X86::VPANDNQZ256rrkz, X86::VPANDNQZ256rmkz, 0 },
- { X86::VPANDQZ256rrkz, X86::VPANDQZ256rmkz, 0 },
- { X86::VPAVGBZ256rrkz, X86::VPAVGBZ256rmkz, 0 },
- { X86::VPAVGWZ256rrkz, X86::VPAVGWZ256rmkz, 0 },
- { X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0 },
- { X86::VPERMDZ256rrkz, X86::VPERMDZ256rmkz, 0 },
- { X86::VPERMILPDZ256rrkz, X86::VPERMILPDZ256rmkz, 0 },
- { X86::VPERMILPSZ256rrkz, X86::VPERMILPSZ256rmkz, 0 },
- { X86::VPERMPDZ256rrkz, X86::VPERMPDZ256rmkz, 0 },
- { X86::VPERMPSZ256rrkz, X86::VPERMPSZ256rmkz, 0 },
- { X86::VPERMQZ256rrkz, X86::VPERMQZ256rmkz, 0 },
- { X86::VPERMWZ256rrkz, X86::VPERMWZ256rmkz, 0 },
- { X86::VPMADDUBSWZ256rrkz, X86::VPMADDUBSWZ256rmkz, 0 },
- { X86::VPMADDWDZ256rrkz, X86::VPMADDWDZ256rmkz, 0 },
- { X86::VPMAXSBZ256rrkz, X86::VPMAXSBZ256rmkz, 0 },
- { X86::VPMAXSDZ256rrkz, X86::VPMAXSDZ256rmkz, 0 },
- { X86::VPMAXSQZ256rrkz, X86::VPMAXSQZ256rmkz, 0 },
- { X86::VPMAXSWZ256rrkz, X86::VPMAXSWZ256rmkz, 0 },
- { X86::VPMAXUBZ256rrkz, X86::VPMAXUBZ256rmkz, 0 },
- { X86::VPMAXUDZ256rrkz, X86::VPMAXUDZ256rmkz, 0 },
- { X86::VPMAXUQZ256rrkz, X86::VPMAXUQZ256rmkz, 0 },
- { X86::VPMAXUWZ256rrkz, X86::VPMAXUWZ256rmkz, 0 },
- { X86::VPMINSBZ256rrkz, X86::VPMINSBZ256rmkz, 0 },
- { X86::VPMINSDZ256rrkz, X86::VPMINSDZ256rmkz, 0 },
- { X86::VPMINSQZ256rrkz, X86::VPMINSQZ256rmkz, 0 },
- { X86::VPMINSWZ256rrkz, X86::VPMINSWZ256rmkz, 0 },
- { X86::VPMINUBZ256rrkz, X86::VPMINUBZ256rmkz, 0 },
- { X86::VPMINUDZ256rrkz, X86::VPMINUDZ256rmkz, 0 },
- { X86::VPMINUQZ256rrkz, X86::VPMINUQZ256rmkz, 0 },
- { X86::VPMINUWZ256rrkz, X86::VPMINUWZ256rmkz, 0 },
- { X86::VPMULDQZ256rrkz, X86::VPMULDQZ256rmkz, 0 },
- { X86::VPMULLDZ256rrkz, X86::VPMULLDZ256rmkz, 0 },
- { X86::VPMULLQZ256rrkz, X86::VPMULLQZ256rmkz, 0 },
- { X86::VPMULLWZ256rrkz, X86::VPMULLWZ256rmkz, 0 },
- { X86::VPMULUDQZ256rrkz, X86::VPMULUDQZ256rmkz, 0 },
- { X86::VPORDZ256rrkz, X86::VPORDZ256rmkz, 0 },
- { X86::VPORQZ256rrkz, X86::VPORQZ256rmkz, 0 },
- { X86::VPSHUFBZ256rrkz, X86::VPSHUFBZ256rmkz, 0 },
- { X86::VPSLLDZ256rrkz, X86::VPSLLDZ256rmkz, 0 },
- { X86::VPSLLQZ256rrkz, X86::VPSLLQZ256rmkz, 0 },
- { X86::VPSLLVDZ256rrkz, X86::VPSLLVDZ256rmkz, 0 },
- { X86::VPSLLVQZ256rrkz, X86::VPSLLVQZ256rmkz, 0 },
- { X86::VPSLLVWZ256rrkz, X86::VPSLLVWZ256rmkz, 0 },
- { X86::VPSLLWZ256rrkz, X86::VPSLLWZ256rmkz, 0 },
- { X86::VPSRADZ256rrkz, X86::VPSRADZ256rmkz, 0 },
- { X86::VPSRAQZ256rrkz, X86::VPSRAQZ256rmkz, 0 },
- { X86::VPSRAVDZ256rrkz, X86::VPSRAVDZ256rmkz, 0 },
- { X86::VPSRAVQZ256rrkz, X86::VPSRAVQZ256rmkz, 0 },
- { X86::VPSRAVWZ256rrkz, X86::VPSRAVWZ256rmkz, 0 },
- { X86::VPSRAWZ256rrkz, X86::VPSRAWZ256rmkz, 0 },
- { X86::VPSRLDZ256rrkz, X86::VPSRLDZ256rmkz, 0 },
- { X86::VPSRLQZ256rrkz, X86::VPSRLQZ256rmkz, 0 },
- { X86::VPSRLVDZ256rrkz, X86::VPSRLVDZ256rmkz, 0 },
- { X86::VPSRLVQZ256rrkz, X86::VPSRLVQZ256rmkz, 0 },
- { X86::VPSRLVWZ256rrkz, X86::VPSRLVWZ256rmkz, 0 },
- { X86::VPSRLWZ256rrkz, X86::VPSRLWZ256rmkz, 0 },
- { X86::VPSUBBZ256rrkz, X86::VPSUBBZ256rmkz, 0 },
- { X86::VPSUBDZ256rrkz, X86::VPSUBDZ256rmkz, 0 },
- { X86::VPSUBQZ256rrkz, X86::VPSUBQZ256rmkz, 0 },
- { X86::VPSUBSBZ256rrkz, X86::VPSUBSBZ256rmkz, 0 },
- { X86::VPSUBSWZ256rrkz, X86::VPSUBSWZ256rmkz, 0 },
- { X86::VPSUBUSBZ256rrkz, X86::VPSUBUSBZ256rmkz, 0 },
- { X86::VPSUBUSWZ256rrkz, X86::VPSUBUSWZ256rmkz, 0 },
- { X86::VPSUBWZ256rrkz, X86::VPSUBWZ256rmkz, 0 },
- { X86::VPUNPCKHBWZ256rrkz, X86::VPUNPCKHBWZ256rmkz, 0 },
- { X86::VPUNPCKHDQZ256rrkz, X86::VPUNPCKHDQZ256rmkz, 0 },
- { X86::VPUNPCKHQDQZ256rrkz, X86::VPUNPCKHQDQZ256rmkz, 0 },
- { X86::VPUNPCKHWDZ256rrkz, X86::VPUNPCKHWDZ256rmkz, 0 },
- { X86::VPUNPCKLBWZ256rrkz, X86::VPUNPCKLBWZ256rmkz, 0 },
- { X86::VPUNPCKLDQZ256rrkz, X86::VPUNPCKLDQZ256rmkz, 0 },
- { X86::VPUNPCKLQDQZ256rrkz, X86::VPUNPCKLQDQZ256rmkz, 0 },
- { X86::VPUNPCKLWDZ256rrkz, X86::VPUNPCKLWDZ256rmkz, 0 },
- { X86::VPXORDZ256rrkz, X86::VPXORDZ256rmkz, 0 },
- { X86::VPXORQZ256rrkz, X86::VPXORQZ256rmkz, 0 },
- { X86::VSHUFPDZ256rrikz, X86::VSHUFPDZ256rmikz, 0 },
- { X86::VSHUFPSZ256rrikz, X86::VSHUFPSZ256rmikz, 0 },
- { X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 },
- { X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 },
- { X86::VUNPCKHPDZ256rrkz, X86::VUNPCKHPDZ256rmkz, 0 },
- { X86::VUNPCKHPSZ256rrkz, X86::VUNPCKHPSZ256rmkz, 0 },
- { X86::VUNPCKLPDZ256rrkz, X86::VUNPCKLPDZ256rmkz, 0 },
- { X86::VUNPCKLPSZ256rrkz, X86::VUNPCKLPSZ256rmkz, 0 },
- { X86::VXORPDZ256rrkz, X86::VXORPDZ256rmkz, 0 },
- { X86::VXORPSZ256rrkz, X86::VXORPSZ256rmkz, 0 },
-
- // AVX-512{F,VL} masked arithmetic instructions 128-bit
- { X86::VADDPDZ128rrkz, X86::VADDPDZ128rmkz, 0 },
- { X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0 },
- { X86::VALIGNDZ128rrikz, X86::VALIGNDZ128rmikz, 0 },
- { X86::VALIGNQZ128rrikz, X86::VALIGNQZ128rmikz, 0 },
- { X86::VANDNPDZ128rrkz, X86::VANDNPDZ128rmkz, 0 },
- { X86::VANDNPSZ128rrkz, X86::VANDNPSZ128rmkz, 0 },
- { X86::VANDPDZ128rrkz, X86::VANDPDZ128rmkz, 0 },
- { X86::VANDPSZ128rrkz, X86::VANDPSZ128rmkz, 0 },
- { X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmkz, 0 },
- { X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0 },
- { X86::VMAXCPDZ128rrkz, X86::VMAXCPDZ128rmkz, 0 },
- { X86::VMAXCPSZ128rrkz, X86::VMAXCPSZ128rmkz, 0 },
- { X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 },
- { X86::VMAXPSZ128rrkz, X86::VMAXPSZ128rmkz, 0 },
- { X86::VMINCPDZ128rrkz, X86::VMINCPDZ128rmkz, 0 },
- { X86::VMINCPSZ128rrkz, X86::VMINCPSZ128rmkz, 0 },
- { X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0 },
- { X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0 },
- { X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0 },
- { X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 },
- { X86::VORPDZ128rrkz, X86::VORPDZ128rmkz, 0 },
- { X86::VORPSZ128rrkz, X86::VORPSZ128rmkz, 0 },
- { X86::VPACKSSDWZ128rrkz, X86::VPACKSSDWZ128rmkz, 0 },
- { X86::VPACKSSWBZ128rrkz, X86::VPACKSSWBZ128rmkz, 0 },
- { X86::VPACKUSDWZ128rrkz, X86::VPACKUSDWZ128rmkz, 0 },
- { X86::VPACKUSWBZ128rrkz, X86::VPACKUSWBZ128rmkz, 0 },
- { X86::VPADDBZ128rrkz, X86::VPADDBZ128rmkz, 0 },
- { X86::VPADDDZ128rrkz, X86::VPADDDZ128rmkz, 0 },
- { X86::VPADDQZ128rrkz, X86::VPADDQZ128rmkz, 0 },
- { X86::VPADDSBZ128rrkz, X86::VPADDSBZ128rmkz, 0 },
- { X86::VPADDSWZ128rrkz, X86::VPADDSWZ128rmkz, 0 },
- { X86::VPADDUSBZ128rrkz, X86::VPADDUSBZ128rmkz, 0 },
- { X86::VPADDUSWZ128rrkz, X86::VPADDUSWZ128rmkz, 0 },
- { X86::VPADDWZ128rrkz, X86::VPADDWZ128rmkz, 0 },
- { X86::VPALIGNRZ128rrikz, X86::VPALIGNRZ128rmikz, 0 },
- { X86::VPANDDZ128rrkz, X86::VPANDDZ128rmkz, 0 },
- { X86::VPANDNDZ128rrkz, X86::VPANDNDZ128rmkz, 0 },
- { X86::VPANDNQZ128rrkz, X86::VPANDNQZ128rmkz, 0 },
- { X86::VPANDQZ128rrkz, X86::VPANDQZ128rmkz, 0 },
- { X86::VPAVGBZ128rrkz, X86::VPAVGBZ128rmkz, 0 },
- { X86::VPAVGWZ128rrkz, X86::VPAVGWZ128rmkz, 0 },
- { X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0 },
- { X86::VPERMILPDZ128rrkz, X86::VPERMILPDZ128rmkz, 0 },
- { X86::VPERMILPSZ128rrkz, X86::VPERMILPSZ128rmkz, 0 },
- { X86::VPERMWZ128rrkz, X86::VPERMWZ128rmkz, 0 },
- { X86::VPMADDUBSWZ128rrkz, X86::VPMADDUBSWZ128rmkz, 0 },
- { X86::VPMADDWDZ128rrkz, X86::VPMADDWDZ128rmkz, 0 },
- { X86::VPMAXSBZ128rrkz, X86::VPMAXSBZ128rmkz, 0 },
- { X86::VPMAXSDZ128rrkz, X86::VPMAXSDZ128rmkz, 0 },
- { X86::VPMAXSQZ128rrkz, X86::VPMAXSQZ128rmkz, 0 },
- { X86::VPMAXSWZ128rrkz, X86::VPMAXSWZ128rmkz, 0 },
- { X86::VPMAXUBZ128rrkz, X86::VPMAXUBZ128rmkz, 0 },
- { X86::VPMAXUDZ128rrkz, X86::VPMAXUDZ128rmkz, 0 },
- { X86::VPMAXUQZ128rrkz, X86::VPMAXUQZ128rmkz, 0 },
- { X86::VPMAXUWZ128rrkz, X86::VPMAXUWZ128rmkz, 0 },
- { X86::VPMINSBZ128rrkz, X86::VPMINSBZ128rmkz, 0 },
- { X86::VPMINSDZ128rrkz, X86::VPMINSDZ128rmkz, 0 },
- { X86::VPMINSQZ128rrkz, X86::VPMINSQZ128rmkz, 0 },
- { X86::VPMINSWZ128rrkz, X86::VPMINSWZ128rmkz, 0 },
- { X86::VPMINUBZ128rrkz, X86::VPMINUBZ128rmkz, 0 },
- { X86::VPMINUDZ128rrkz, X86::VPMINUDZ128rmkz, 0 },
- { X86::VPMINUQZ128rrkz, X86::VPMINUQZ128rmkz, 0 },
- { X86::VPMINUWZ128rrkz, X86::VPMINUWZ128rmkz, 0 },
- { X86::VPMULDQZ128rrkz, X86::VPMULDQZ128rmkz, 0 },
- { X86::VPMULLDZ128rrkz, X86::VPMULLDZ128rmkz, 0 },
- { X86::VPMULLQZ128rrkz, X86::VPMULLQZ128rmkz, 0 },
- { X86::VPMULLWZ128rrkz, X86::VPMULLWZ128rmkz, 0 },
- { X86::VPMULUDQZ128rrkz, X86::VPMULUDQZ128rmkz, 0 },
- { X86::VPORDZ128rrkz, X86::VPORDZ128rmkz, 0 },
- { X86::VPORQZ128rrkz, X86::VPORQZ128rmkz, 0 },
- { X86::VPSHUFBZ128rrkz, X86::VPSHUFBZ128rmkz, 0 },
- { X86::VPSLLDZ128rrkz, X86::VPSLLDZ128rmkz, 0 },
- { X86::VPSLLQZ128rrkz, X86::VPSLLQZ128rmkz, 0 },
- { X86::VPSLLVDZ128rrkz, X86::VPSLLVDZ128rmkz, 0 },
- { X86::VPSLLVQZ128rrkz, X86::VPSLLVQZ128rmkz, 0 },
- { X86::VPSLLVWZ128rrkz, X86::VPSLLVWZ128rmkz, 0 },
- { X86::VPSLLWZ128rrkz, X86::VPSLLWZ128rmkz, 0 },
- { X86::VPSRADZ128rrkz, X86::VPSRADZ128rmkz, 0 },
- { X86::VPSRAQZ128rrkz, X86::VPSRAQZ128rmkz, 0 },
- { X86::VPSRAVDZ128rrkz, X86::VPSRAVDZ128rmkz, 0 },
- { X86::VPSRAVQZ128rrkz, X86::VPSRAVQZ128rmkz, 0 },
- { X86::VPSRAVWZ128rrkz, X86::VPSRAVWZ128rmkz, 0 },
- { X86::VPSRAWZ128rrkz, X86::VPSRAWZ128rmkz, 0 },
- { X86::VPSRLDZ128rrkz, X86::VPSRLDZ128rmkz, 0 },
- { X86::VPSRLQZ128rrkz, X86::VPSRLQZ128rmkz, 0 },
- { X86::VPSRLVDZ128rrkz, X86::VPSRLVDZ128rmkz, 0 },
- { X86::VPSRLVQZ128rrkz, X86::VPSRLVQZ128rmkz, 0 },
- { X86::VPSRLVWZ128rrkz, X86::VPSRLVWZ128rmkz, 0 },
- { X86::VPSRLWZ128rrkz, X86::VPSRLWZ128rmkz, 0 },
- { X86::VPSUBBZ128rrkz, X86::VPSUBBZ128rmkz, 0 },
- { X86::VPSUBDZ128rrkz, X86::VPSUBDZ128rmkz, 0 },
- { X86::VPSUBQZ128rrkz, X86::VPSUBQZ128rmkz, 0 },
- { X86::VPSUBSBZ128rrkz, X86::VPSUBSBZ128rmkz, 0 },
- { X86::VPSUBSWZ128rrkz, X86::VPSUBSWZ128rmkz, 0 },
- { X86::VPSUBUSBZ128rrkz, X86::VPSUBUSBZ128rmkz, 0 },
- { X86::VPSUBUSWZ128rrkz, X86::VPSUBUSWZ128rmkz, 0 },
- { X86::VPSUBWZ128rrkz, X86::VPSUBWZ128rmkz, 0 },
- { X86::VPUNPCKHBWZ128rrkz, X86::VPUNPCKHBWZ128rmkz, 0 },
- { X86::VPUNPCKHDQZ128rrkz, X86::VPUNPCKHDQZ128rmkz, 0 },
- { X86::VPUNPCKHQDQZ128rrkz, X86::VPUNPCKHQDQZ128rmkz, 0 },
- { X86::VPUNPCKHWDZ128rrkz, X86::VPUNPCKHWDZ128rmkz, 0 },
- { X86::VPUNPCKLBWZ128rrkz, X86::VPUNPCKLBWZ128rmkz, 0 },
- { X86::VPUNPCKLDQZ128rrkz, X86::VPUNPCKLDQZ128rmkz, 0 },
- { X86::VPUNPCKLQDQZ128rrkz, X86::VPUNPCKLQDQZ128rmkz, 0 },
- { X86::VPUNPCKLWDZ128rrkz, X86::VPUNPCKLWDZ128rmkz, 0 },
- { X86::VPXORDZ128rrkz, X86::VPXORDZ128rmkz, 0 },
- { X86::VPXORQZ128rrkz, X86::VPXORQZ128rmkz, 0 },
- { X86::VSHUFPDZ128rrikz, X86::VSHUFPDZ128rmikz, 0 },
- { X86::VSHUFPSZ128rrikz, X86::VSHUFPSZ128rmikz, 0 },
- { X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 },
- { X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 },
- { X86::VUNPCKHPDZ128rrkz, X86::VUNPCKHPDZ128rmkz, 0 },
- { X86::VUNPCKHPSZ128rrkz, X86::VUNPCKHPSZ128rmkz, 0 },
- { X86::VUNPCKLPDZ128rrkz, X86::VUNPCKLPDZ128rmkz, 0 },
- { X86::VUNPCKLPSZ128rrkz, X86::VUNPCKLPSZ128rmkz, 0 },
- { X86::VXORPDZ128rrkz, X86::VXORPDZ128rmkz, 0 },
- { X86::VXORPSZ128rrkz, X86::VXORPSZ128rmkz, 0 },
-
- // AVX-512 masked foldable instructions
- { X86::VBROADCASTSSZrk, X86::VBROADCASTSSZmk, TB_NO_REVERSE },
- { X86::VBROADCASTSDZrk, X86::VBROADCASTSDZmk, TB_NO_REVERSE },
- { X86::VPABSBZrrk, X86::VPABSBZrmk, 0 },
- { X86::VPABSDZrrk, X86::VPABSDZrmk, 0 },
- { X86::VPABSQZrrk, X86::VPABSQZrmk, 0 },
- { X86::VPABSWZrrk, X86::VPABSWZrmk, 0 },
- { X86::VPCONFLICTDZrrk, X86::VPCONFLICTDZrmk, 0 },
- { X86::VPCONFLICTQZrrk, X86::VPCONFLICTQZrmk, 0 },
- { X86::VPERMILPDZrik, X86::VPERMILPDZmik, 0 },
- { X86::VPERMILPSZrik, X86::VPERMILPSZmik, 0 },
- { X86::VPERMPDZrik, X86::VPERMPDZmik, 0 },
- { X86::VPERMQZrik, X86::VPERMQZmik, 0 },
- { X86::VPLZCNTDZrrk, X86::VPLZCNTDZrmk, 0 },
- { X86::VPLZCNTQZrrk, X86::VPLZCNTQZrmk, 0 },
- { X86::VPMOVSXBDZrrk, X86::VPMOVSXBDZrmk, 0 },
- { X86::VPMOVSXBQZrrk, X86::VPMOVSXBQZrmk, TB_NO_REVERSE },
- { X86::VPMOVSXBWZrrk, X86::VPMOVSXBWZrmk, 0 },
- { X86::VPMOVSXDQZrrk, X86::VPMOVSXDQZrmk, 0 },
- { X86::VPMOVSXWDZrrk, X86::VPMOVSXWDZrmk, 0 },
- { X86::VPMOVSXWQZrrk, X86::VPMOVSXWQZrmk, 0 },
- { X86::VPMOVZXBDZrrk, X86::VPMOVZXBDZrmk, 0 },
- { X86::VPMOVZXBQZrrk, X86::VPMOVZXBQZrmk, TB_NO_REVERSE },
- { X86::VPMOVZXBWZrrk, X86::VPMOVZXBWZrmk, 0 },
- { X86::VPMOVZXDQZrrk, X86::VPMOVZXDQZrmk, 0 },
- { X86::VPMOVZXWDZrrk, X86::VPMOVZXWDZrmk, 0 },
- { X86::VPMOVZXWQZrrk, X86::VPMOVZXWQZrmk, 0 },
- { X86::VPOPCNTDZrrk, X86::VPOPCNTDZrmk, 0 },
- { X86::VPOPCNTQZrrk, X86::VPOPCNTQZrmk, 0 },
- { X86::VPSHUFDZrik, X86::VPSHUFDZmik, 0 },
- { X86::VPSHUFHWZrik, X86::VPSHUFHWZmik, 0 },
- { X86::VPSHUFLWZrik, X86::VPSHUFLWZmik, 0 },
- { X86::VPSLLDZrik, X86::VPSLLDZmik, 0 },
- { X86::VPSLLQZrik, X86::VPSLLQZmik, 0 },
- { X86::VPSLLWZrik, X86::VPSLLWZmik, 0 },
- { X86::VPSRADZrik, X86::VPSRADZmik, 0 },
- { X86::VPSRAQZrik, X86::VPSRAQZmik, 0 },
- { X86::VPSRAWZrik, X86::VPSRAWZmik, 0 },
- { X86::VPSRLDZrik, X86::VPSRLDZmik, 0 },
- { X86::VPSRLQZrik, X86::VPSRLQZmik, 0 },
- { X86::VPSRLWZrik, X86::VPSRLWZmik, 0 },
-
- // AVX-512VL 256-bit masked foldable instructions
- { X86::VBROADCASTSSZ256rk, X86::VBROADCASTSSZ256mk, TB_NO_REVERSE },
- { X86::VBROADCASTSDZ256rk, X86::VBROADCASTSDZ256mk, TB_NO_REVERSE },
- { X86::VPABSBZ256rrk, X86::VPABSBZ256rmk, 0 },
- { X86::VPABSDZ256rrk, X86::VPABSDZ256rmk, 0 },
- { X86::VPABSQZ256rrk, X86::VPABSQZ256rmk, 0 },
- { X86::VPABSWZ256rrk, X86::VPABSWZ256rmk, 0 },
- { X86::VPCONFLICTDZ256rrk, X86::VPCONFLICTDZ256rmk, 0 },
- { X86::VPCONFLICTQZ256rrk, X86::VPCONFLICTQZ256rmk, 0 },
- { X86::VPERMILPDZ256rik, X86::VPERMILPDZ256mik, 0 },
- { X86::VPERMILPSZ256rik, X86::VPERMILPSZ256mik, 0 },
- { X86::VPERMPDZ256rik, X86::VPERMPDZ256mik, 0 },
- { X86::VPERMQZ256rik, X86::VPERMQZ256mik, 0 },
- { X86::VPLZCNTDZ256rrk, X86::VPLZCNTDZ256rmk, 0 },
- { X86::VPLZCNTQZ256rrk, X86::VPLZCNTQZ256rmk, 0 },
- { X86::VPMOVSXBDZ256rrk, X86::VPMOVSXBDZ256rmk, TB_NO_REVERSE },
- { X86::VPMOVSXBQZ256rrk, X86::VPMOVSXBQZ256rmk, TB_NO_REVERSE },
- { X86::VPMOVSXBWZ256rrk, X86::VPMOVSXBWZ256rmk, 0 },
- { X86::VPMOVSXDQZ256rrk, X86::VPMOVSXDQZ256rmk, 0 },
- { X86::VPMOVSXWDZ256rrk, X86::VPMOVSXWDZ256rmk, 0 },
- { X86::VPMOVSXWQZ256rrk, X86::VPMOVSXWQZ256rmk, TB_NO_REVERSE },
- { X86::VPMOVZXBDZ256rrk, X86::VPMOVZXBDZ256rmk, TB_NO_REVERSE },
- { X86::VPMOVZXBQZ256rrk, X86::VPMOVZXBQZ256rmk, TB_NO_REVERSE },
- { X86::VPMOVZXBWZ256rrk, X86::VPMOVZXBWZ256rmk, 0 },
- { X86::VPMOVZXDQZ256rrk, X86::VPMOVZXDQZ256rmk, 0 },
- { X86::VPMOVZXWDZ256rrk, X86::VPMOVZXWDZ256rmk, 0 },
- { X86::VPMOVZXWQZ256rrk, X86::VPMOVZXWQZ256rmk, TB_NO_REVERSE },
- { X86::VPSHUFDZ256rik, X86::VPSHUFDZ256mik, 0 },
- { X86::VPSHUFHWZ256rik, X86::VPSHUFHWZ256mik, 0 },
- { X86::VPSHUFLWZ256rik, X86::VPSHUFLWZ256mik, 0 },
- { X86::VPSLLDZ256rik, X86::VPSLLDZ256mik, 0 },
- { X86::VPSLLQZ256rik, X86::VPSLLQZ256mik, 0 },
- { X86::VPSLLWZ256rik, X86::VPSLLWZ256mik, 0 },
- { X86::VPSRADZ256rik, X86::VPSRADZ256mik, 0 },
- { X86::VPSRAQZ256rik, X86::VPSRAQZ256mik, 0 },
- { X86::VPSRAWZ256rik, X86::VPSRAWZ256mik, 0 },
- { X86::VPSRLDZ256rik, X86::VPSRLDZ256mik, 0 },
- { X86::VPSRLQZ256rik, X86::VPSRLQZ256mik, 0 },
- { X86::VPSRLWZ256rik, X86::VPSRLWZ256mik, 0 },
-
- // AVX-512VL 128-bit masked foldable instructions
- { X86::VBROADCASTSSZ128rk, X86::VBROADCASTSSZ128mk, TB_NO_REVERSE },
- { X86::VPABSBZ128rrk, X86::VPABSBZ128rmk, 0 },
- { X86::VPABSDZ128rrk, X86::VPABSDZ128rmk, 0 },
- { X86::VPABSQZ128rrk, X86::VPABSQZ128rmk, 0 },
- { X86::VPABSWZ128rrk, X86::VPABSWZ128rmk, 0 },
- { X86::VPCONFLICTDZ128rrk, X86::VPCONFLICTDZ128rmk, 0 },
- { X86::VPCONFLICTQZ128rrk, X86::VPCONFLICTQZ128rmk, 0 },
- { X86::VPERMILPDZ128rik, X86::VPERMILPDZ128mik, 0 },
- { X86::VPERMILPSZ128rik, X86::VPERMILPSZ128mik, 0 },
- { X86::VPLZCNTDZ128rrk, X86::VPLZCNTDZ128rmk, 0 },
- { X86::VPLZCNTQZ128rrk, X86::VPLZCNTQZ128rmk, 0 },
- { X86::VPMOVSXBDZ128rrk, X86::VPMOVSXBDZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVSXBQZ128rrk, X86::VPMOVSXBQZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVSXBWZ128rrk, X86::VPMOVSXBWZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVSXDQZ128rrk, X86::VPMOVSXDQZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVSXWDZ128rrk, X86::VPMOVSXWDZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVSXWQZ128rrk, X86::VPMOVSXWQZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVZXBDZ128rrk, X86::VPMOVZXBDZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVZXBQZ128rrk, X86::VPMOVZXBQZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVZXBWZ128rrk, X86::VPMOVZXBWZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVZXDQZ128rrk, X86::VPMOVZXDQZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVZXWDZ128rrk, X86::VPMOVZXWDZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVZXWQZ128rrk, X86::VPMOVZXWQZ128rmk, TB_NO_REVERSE },
- { X86::VPSHUFDZ128rik, X86::VPSHUFDZ128mik, 0 },
- { X86::VPSHUFHWZ128rik, X86::VPSHUFHWZ128mik, 0 },
- { X86::VPSHUFLWZ128rik, X86::VPSHUFLWZ128mik, 0 },
- { X86::VPSLLDZ128rik, X86::VPSLLDZ128mik, 0 },
- { X86::VPSLLQZ128rik, X86::VPSLLQZ128mik, 0 },
- { X86::VPSLLWZ128rik, X86::VPSLLWZ128mik, 0 },
- { X86::VPSRADZ128rik, X86::VPSRADZ128mik, 0 },
- { X86::VPSRAQZ128rik, X86::VPSRAQZ128mik, 0 },
- { X86::VPSRAWZ128rik, X86::VPSRAWZ128mik, 0 },
- { X86::VPSRLDZ128rik, X86::VPSRLDZ128mik, 0 },
- { X86::VPSRLQZ128rik, X86::VPSRLQZ128mik, 0 },
- { X86::VPSRLWZ128rik, X86::VPSRLWZ128mik, 0 },
-
- // AVX-512 masked compare instructions
- { X86::VCMPPDZ128rrik, X86::VCMPPDZ128rmik, 0 },
- { X86::VCMPPSZ128rrik, X86::VCMPPSZ128rmik, 0 },
- { X86::VCMPPDZ256rrik, X86::VCMPPDZ256rmik, 0 },
- { X86::VCMPPSZ256rrik, X86::VCMPPSZ256rmik, 0 },
- { X86::VCMPPDZrrik, X86::VCMPPDZrmik, 0 },
- { X86::VCMPPSZrrik, X86::VCMPPSZrmik, 0 },
- { X86::VCMPSDZrr_Intk, X86::VCMPSDZrm_Intk, TB_NO_REVERSE },
- { X86::VCMPSSZrr_Intk, X86::VCMPSSZrm_Intk, TB_NO_REVERSE },
- { X86::VPCMPBZ128rrik, X86::VPCMPBZ128rmik, 0 },
- { X86::VPCMPBZ256rrik, X86::VPCMPBZ256rmik, 0 },
- { X86::VPCMPBZrrik, X86::VPCMPBZrmik, 0 },
- { X86::VPCMPDZ128rrik, X86::VPCMPDZ128rmik, 0 },
- { X86::VPCMPDZ256rrik, X86::VPCMPDZ256rmik, 0 },
- { X86::VPCMPDZrrik, X86::VPCMPDZrmik, 0 },
- { X86::VPCMPEQBZ128rrk, X86::VPCMPEQBZ128rmk, 0 },
- { X86::VPCMPEQBZ256rrk, X86::VPCMPEQBZ256rmk, 0 },
- { X86::VPCMPEQBZrrk, X86::VPCMPEQBZrmk, 0 },
- { X86::VPCMPEQDZ128rrk, X86::VPCMPEQDZ128rmk, 0 },
- { X86::VPCMPEQDZ256rrk, X86::VPCMPEQDZ256rmk, 0 },
- { X86::VPCMPEQDZrrk, X86::VPCMPEQDZrmk, 0 },
- { X86::VPCMPEQQZ128rrk, X86::VPCMPEQQZ128rmk, 0 },
- { X86::VPCMPEQQZ256rrk, X86::VPCMPEQQZ256rmk, 0 },
- { X86::VPCMPEQQZrrk, X86::VPCMPEQQZrmk, 0 },
- { X86::VPCMPEQWZ128rrk, X86::VPCMPEQWZ128rmk, 0 },
- { X86::VPCMPEQWZ256rrk, X86::VPCMPEQWZ256rmk, 0 },
- { X86::VPCMPEQWZrrk, X86::VPCMPEQWZrmk, 0 },
- { X86::VPCMPGTBZ128rrk, X86::VPCMPGTBZ128rmk, 0 },
- { X86::VPCMPGTBZ256rrk, X86::VPCMPGTBZ256rmk, 0 },
- { X86::VPCMPGTBZrrk, X86::VPCMPGTBZrmk, 0 },
- { X86::VPCMPGTDZ128rrk, X86::VPCMPGTDZ128rmk, 0 },
- { X86::VPCMPGTDZ256rrk, X86::VPCMPGTDZ256rmk, 0 },
- { X86::VPCMPGTDZrrk, X86::VPCMPGTDZrmk, 0 },
- { X86::VPCMPGTQZ128rrk, X86::VPCMPGTQZ128rmk, 0 },
- { X86::VPCMPGTQZ256rrk, X86::VPCMPGTQZ256rmk, 0 },
- { X86::VPCMPGTQZrrk, X86::VPCMPGTQZrmk, 0 },
- { X86::VPCMPGTWZ128rrk, X86::VPCMPGTWZ128rmk, 0 },
- { X86::VPCMPGTWZ256rrk, X86::VPCMPGTWZ256rmk, 0 },
- { X86::VPCMPGTWZrrk, X86::VPCMPGTWZrmk, 0 },
- { X86::VPCMPQZ128rrik, X86::VPCMPQZ128rmik, 0 },
- { X86::VPCMPQZ256rrik, X86::VPCMPQZ256rmik, 0 },
- { X86::VPCMPQZrrik, X86::VPCMPQZrmik, 0 },
- { X86::VPCMPUBZ128rrik, X86::VPCMPUBZ128rmik, 0 },
- { X86::VPCMPUBZ256rrik, X86::VPCMPUBZ256rmik, 0 },
- { X86::VPCMPUBZrrik, X86::VPCMPUBZrmik, 0 },
- { X86::VPCMPUDZ128rrik, X86::VPCMPUDZ128rmik, 0 },
- { X86::VPCMPUDZ256rrik, X86::VPCMPUDZ256rmik, 0 },
- { X86::VPCMPUDZrrik, X86::VPCMPUDZrmik, 0 },
- { X86::VPCMPUQZ128rrik, X86::VPCMPUQZ128rmik, 0 },
- { X86::VPCMPUQZ256rrik, X86::VPCMPUQZ256rmik, 0 },
- { X86::VPCMPUQZrrik, X86::VPCMPUQZrmik, 0 },
- { X86::VPCMPUWZ128rrik, X86::VPCMPUWZ128rmik, 0 },
- { X86::VPCMPUWZ256rrik, X86::VPCMPUWZ256rmik, 0 },
- { X86::VPCMPUWZrrik, X86::VPCMPUWZrmik, 0 },
- { X86::VPCMPWZ128rrik, X86::VPCMPWZ128rmik, 0 },
- { X86::VPCMPWZ256rrik, X86::VPCMPWZ256rmik, 0 },
- { X86::VPCMPWZrrik, X86::VPCMPWZrmik, 0 },
- };
-
- for (X86MemoryFoldTableEntry Entry : MemoryFoldTable3) {
- AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
- Entry.RegOp, Entry.MemOp,
- // Index 3, folded load
- Entry.Flags | TB_INDEX_3 | TB_FOLDED_LOAD);
- }
- auto I = X86InstrFMA3Info::rm_begin();
- auto E = X86InstrFMA3Info::rm_end();
- for (; I != E; ++I) {
- if (!I.getGroup()->isKMasked()) {
- // Intrinsic forms need to pass TB_NO_REVERSE.
- if (I.getGroup()->isIntrinsic()) {
- AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
- I.getRegOpcode(), I.getMemOpcode(),
- TB_ALIGN_NONE | TB_INDEX_3 | TB_FOLDED_LOAD | TB_NO_REVERSE);
- } else {
- AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
- I.getRegOpcode(), I.getMemOpcode(),
- TB_ALIGN_NONE | TB_INDEX_3 | TB_FOLDED_LOAD);
- }
- }
- }
-
- static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
- // AVX-512 foldable masked instructions
- { X86::VADDPDZrrk, X86::VADDPDZrmk, 0 },
- { X86::VADDPSZrrk, X86::VADDPSZrmk, 0 },
- { X86::VADDSDZrr_Intk, X86::VADDSDZrm_Intk, TB_NO_REVERSE },
- { X86::VADDSSZrr_Intk, X86::VADDSSZrm_Intk, TB_NO_REVERSE },
- { X86::VALIGNDZrrik, X86::VALIGNDZrmik, 0 },
- { X86::VALIGNQZrrik, X86::VALIGNQZrmik, 0 },
- { X86::VANDNPDZrrk, X86::VANDNPDZrmk, 0 },
- { X86::VANDNPSZrrk, X86::VANDNPSZrmk, 0 },
- { X86::VANDPDZrrk, X86::VANDPDZrmk, 0 },
- { X86::VANDPSZrrk, X86::VANDPSZrmk, 0 },
- { X86::VDIVPDZrrk, X86::VDIVPDZrmk, 0 },
- { X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 },
- { X86::VDIVSDZrr_Intk, X86::VDIVSDZrm_Intk, TB_NO_REVERSE },
- { X86::VDIVSSZrr_Intk, X86::VDIVSSZrm_Intk, TB_NO_REVERSE },
- { X86::VINSERTF32x4Zrrk, X86::VINSERTF32x4Zrmk, 0 },
- { X86::VINSERTF32x8Zrrk, X86::VINSERTF32x8Zrmk, 0 },
- { X86::VINSERTF64x2Zrrk, X86::VINSERTF64x2Zrmk, 0 },
- { X86::VINSERTF64x4Zrrk, X86::VINSERTF64x4Zrmk, 0 },
- { X86::VINSERTI32x4Zrrk, X86::VINSERTI32x4Zrmk, 0 },
- { X86::VINSERTI32x8Zrrk, X86::VINSERTI32x8Zrmk, 0 },
- { X86::VINSERTI64x2Zrrk, X86::VINSERTI64x2Zrmk, 0 },
- { X86::VINSERTI64x4Zrrk, X86::VINSERTI64x4Zrmk, 0 },
- { X86::VMAXCPDZrrk, X86::VMAXCPDZrmk, 0 },
- { X86::VMAXCPSZrrk, X86::VMAXCPSZrmk, 0 },
- { X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0 },
- { X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 },
- { X86::VMAXSDZrr_Intk, X86::VMAXSDZrm_Intk, 0 },
- { X86::VMAXSSZrr_Intk, X86::VMAXSSZrm_Intk, 0 },
- { X86::VMINCPDZrrk, X86::VMINCPDZrmk, 0 },
- { X86::VMINCPSZrrk, X86::VMINCPSZrmk, 0 },
- { X86::VMINPDZrrk, X86::VMINPDZrmk, 0 },
- { X86::VMINPSZrrk, X86::VMINPSZrmk, 0 },
- { X86::VMINSDZrr_Intk, X86::VMINSDZrm_Intk, 0 },
- { X86::VMINSSZrr_Intk, X86::VMINSSZrm_Intk, 0 },
- { X86::VMULPDZrrk, X86::VMULPDZrmk, 0 },
- { X86::VMULPSZrrk, X86::VMULPSZrmk, 0 },
- { X86::VMULSDZrr_Intk, X86::VMULSDZrm_Intk, TB_NO_REVERSE },
- { X86::VMULSSZrr_Intk, X86::VMULSSZrm_Intk, TB_NO_REVERSE },
- { X86::VORPDZrrk, X86::VORPDZrmk, 0 },
- { X86::VORPSZrrk, X86::VORPSZrmk, 0 },
- { X86::VPACKSSDWZrrk, X86::VPACKSSDWZrmk, 0 },
- { X86::VPACKSSWBZrrk, X86::VPACKSSWBZrmk, 0 },
- { X86::VPACKUSDWZrrk, X86::VPACKUSDWZrmk, 0 },
- { X86::VPACKUSWBZrrk, X86::VPACKUSWBZrmk, 0 },
- { X86::VPADDBZrrk, X86::VPADDBZrmk, 0 },
- { X86::VPADDDZrrk, X86::VPADDDZrmk, 0 },
- { X86::VPADDQZrrk, X86::VPADDQZrmk, 0 },
- { X86::VPADDSBZrrk, X86::VPADDSBZrmk, 0 },
- { X86::VPADDSWZrrk, X86::VPADDSWZrmk, 0 },
- { X86::VPADDUSBZrrk, X86::VPADDUSBZrmk, 0 },
- { X86::VPADDUSWZrrk, X86::VPADDUSWZrmk, 0 },
- { X86::VPADDWZrrk, X86::VPADDWZrmk, 0 },
- { X86::VPALIGNRZrrik, X86::VPALIGNRZrmik, 0 },
- { X86::VPANDDZrrk, X86::VPANDDZrmk, 0 },
- { X86::VPANDNDZrrk, X86::VPANDNDZrmk, 0 },
- { X86::VPANDNQZrrk, X86::VPANDNQZrmk, 0 },
- { X86::VPANDQZrrk, X86::VPANDQZrmk, 0 },
- { X86::VPAVGBZrrk, X86::VPAVGBZrmk, 0 },
- { X86::VPAVGWZrrk, X86::VPAVGWZrmk, 0 },
- { X86::VPERMBZrrk, X86::VPERMBZrmk, 0 },
- { X86::VPERMDZrrk, X86::VPERMDZrmk, 0 },
- { X86::VPERMI2Brrk, X86::VPERMI2Brmk, 0 },
- { X86::VPERMI2Drrk, X86::VPERMI2Drmk, 0 },
- { X86::VPERMI2PSrrk, X86::VPERMI2PSrmk, 0 },
- { X86::VPERMI2PDrrk, X86::VPERMI2PDrmk, 0 },
- { X86::VPERMI2Qrrk, X86::VPERMI2Qrmk, 0 },
- { X86::VPERMI2Wrrk, X86::VPERMI2Wrmk, 0 },
- { X86::VPERMILPDZrrk, X86::VPERMILPDZrmk, 0 },
- { X86::VPERMILPSZrrk, X86::VPERMILPSZrmk, 0 },
- { X86::VPERMPDZrrk, X86::VPERMPDZrmk, 0 },
- { X86::VPERMPSZrrk, X86::VPERMPSZrmk, 0 },
- { X86::VPERMQZrrk, X86::VPERMQZrmk, 0 },
- { X86::VPERMT2Brrk, X86::VPERMT2Brmk, 0 },
- { X86::VPERMT2Drrk, X86::VPERMT2Drmk, 0 },
- { X86::VPERMT2PSrrk, X86::VPERMT2PSrmk, 0 },
- { X86::VPERMT2PDrrk, X86::VPERMT2PDrmk, 0 },
- { X86::VPERMT2Qrrk, X86::VPERMT2Qrmk, 0 },
- { X86::VPERMT2Wrrk, X86::VPERMT2Wrmk, 0 },
- { X86::VPERMWZrrk, X86::VPERMWZrmk, 0 },
- { X86::VPMADD52HUQZrk, X86::VPMADD52HUQZmk, 0 },
- { X86::VPMADD52LUQZrk, X86::VPMADD52LUQZmk, 0 },
- { X86::VPMADDUBSWZrrk, X86::VPMADDUBSWZrmk, 0 },
- { X86::VPMADDWDZrrk, X86::VPMADDWDZrmk, 0 },
- { X86::VPMAXSBZrrk, X86::VPMAXSBZrmk, 0 },
- { X86::VPMAXSDZrrk, X86::VPMAXSDZrmk, 0 },
- { X86::VPMAXSQZrrk, X86::VPMAXSQZrmk, 0 },
- { X86::VPMAXSWZrrk, X86::VPMAXSWZrmk, 0 },
- { X86::VPMAXUBZrrk, X86::VPMAXUBZrmk, 0 },
- { X86::VPMAXUDZrrk, X86::VPMAXUDZrmk, 0 },
- { X86::VPMAXUQZrrk, X86::VPMAXUQZrmk, 0 },
- { X86::VPMAXUWZrrk, X86::VPMAXUWZrmk, 0 },
- { X86::VPMINSBZrrk, X86::VPMINSBZrmk, 0 },
- { X86::VPMINSDZrrk, X86::VPMINSDZrmk, 0 },
- { X86::VPMINSQZrrk, X86::VPMINSQZrmk, 0 },
- { X86::VPMINSWZrrk, X86::VPMINSWZrmk, 0 },
- { X86::VPMINUBZrrk, X86::VPMINUBZrmk, 0 },
- { X86::VPMINUDZrrk, X86::VPMINUDZrmk, 0 },
- { X86::VPMINUQZrrk, X86::VPMINUQZrmk, 0 },
- { X86::VPMINUWZrrk, X86::VPMINUWZrmk, 0 },
- { X86::VPMULDQZrrk, X86::VPMULDQZrmk, 0 },
- { X86::VPMULLDZrrk, X86::VPMULLDZrmk, 0 },
- { X86::VPMULLQZrrk, X86::VPMULLQZrmk, 0 },
- { X86::VPMULLWZrrk, X86::VPMULLWZrmk, 0 },
- { X86::VPMULUDQZrrk, X86::VPMULUDQZrmk, 0 },
- { X86::VPORDZrrk, X86::VPORDZrmk, 0 },
- { X86::VPORQZrrk, X86::VPORQZrmk, 0 },
- { X86::VPSHUFBZrrk, X86::VPSHUFBZrmk, 0 },
- { X86::VPSLLDZrrk, X86::VPSLLDZrmk, 0 },
- { X86::VPSLLQZrrk, X86::VPSLLQZrmk, 0 },
- { X86::VPSLLVDZrrk, X86::VPSLLVDZrmk, 0 },
- { X86::VPSLLVQZrrk, X86::VPSLLVQZrmk, 0 },
- { X86::VPSLLVWZrrk, X86::VPSLLVWZrmk, 0 },
- { X86::VPSLLWZrrk, X86::VPSLLWZrmk, 0 },
- { X86::VPSRADZrrk, X86::VPSRADZrmk, 0 },
- { X86::VPSRAQZrrk, X86::VPSRAQZrmk, 0 },
- { X86::VPSRAVDZrrk, X86::VPSRAVDZrmk, 0 },
- { X86::VPSRAVQZrrk, X86::VPSRAVQZrmk, 0 },
- { X86::VPSRAVWZrrk, X86::VPSRAVWZrmk, 0 },
- { X86::VPSRAWZrrk, X86::VPSRAWZrmk, 0 },
- { X86::VPSRLDZrrk, X86::VPSRLDZrmk, 0 },
- { X86::VPSRLQZrrk, X86::VPSRLQZrmk, 0 },
- { X86::VPSRLVDZrrk, X86::VPSRLVDZrmk, 0 },
- { X86::VPSRLVQZrrk, X86::VPSRLVQZrmk, 0 },
- { X86::VPSRLVWZrrk, X86::VPSRLVWZrmk, 0 },
- { X86::VPSRLWZrrk, X86::VPSRLWZrmk, 0 },
- { X86::VPSUBBZrrk, X86::VPSUBBZrmk, 0 },
- { X86::VPSUBDZrrk, X86::VPSUBDZrmk, 0 },
- { X86::VPSUBQZrrk, X86::VPSUBQZrmk, 0 },
- { X86::VPSUBSBZrrk, X86::VPSUBSBZrmk, 0 },
- { X86::VPSUBSWZrrk, X86::VPSUBSWZrmk, 0 },
- { X86::VPSUBUSBZrrk, X86::VPSUBUSBZrmk, 0 },
- { X86::VPSUBUSWZrrk, X86::VPSUBUSWZrmk, 0 },
- { X86::VPTERNLOGDZrrik, X86::VPTERNLOGDZrmik, 0 },
- { X86::VPTERNLOGQZrrik, X86::VPTERNLOGQZrmik, 0 },
- { X86::VPUNPCKHBWZrrk, X86::VPUNPCKHBWZrmk, 0 },
- { X86::VPUNPCKHDQZrrk, X86::VPUNPCKHDQZrmk, 0 },
- { X86::VPUNPCKHQDQZrrk, X86::VPUNPCKHQDQZrmk, 0 },
- { X86::VPUNPCKHWDZrrk, X86::VPUNPCKHWDZrmk, 0 },
- { X86::VPUNPCKLBWZrrk, X86::VPUNPCKLBWZrmk, 0 },
- { X86::VPUNPCKLDQZrrk, X86::VPUNPCKLDQZrmk, 0 },
- { X86::VPUNPCKLQDQZrrk, X86::VPUNPCKLQDQZrmk, 0 },
- { X86::VPUNPCKLWDZrrk, X86::VPUNPCKLWDZrmk, 0 },
- { X86::VPXORDZrrk, X86::VPXORDZrmk, 0 },
- { X86::VPXORQZrrk, X86::VPXORQZrmk, 0 },
- { X86::VSHUFPDZrrik, X86::VSHUFPDZrmik, 0 },
- { X86::VSHUFPSZrrik, X86::VSHUFPSZrmik, 0 },
- { X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 },
- { X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 },
- { X86::VSUBSDZrr_Intk, X86::VSUBSDZrm_Intk, TB_NO_REVERSE },
- { X86::VSUBSSZrr_Intk, X86::VSUBSSZrm_Intk, TB_NO_REVERSE },
- { X86::VUNPCKHPDZrrk, X86::VUNPCKHPDZrmk, 0 },
- { X86::VUNPCKHPSZrrk, X86::VUNPCKHPSZrmk, 0 },
- { X86::VUNPCKLPDZrrk, X86::VUNPCKLPDZrmk, 0 },
- { X86::VUNPCKLPSZrrk, X86::VUNPCKLPSZrmk, 0 },
- { X86::VXORPDZrrk, X86::VXORPDZrmk, 0 },
- { X86::VXORPSZrrk, X86::VXORPSZrmk, 0 },
-
- // AVX-512{F,VL} foldable masked instructions 256-bit
- { X86::VADDPDZ256rrk, X86::VADDPDZ256rmk, 0 },
- { X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0 },
- { X86::VALIGNDZ256rrik, X86::VALIGNDZ256rmik, 0 },
- { X86::VALIGNQZ256rrik, X86::VALIGNQZ256rmik, 0 },
- { X86::VANDNPDZ256rrk, X86::VANDNPDZ256rmk, 0 },
- { X86::VANDNPSZ256rrk, X86::VANDNPSZ256rmk, 0 },
- { X86::VANDPDZ256rrk, X86::VANDPDZ256rmk, 0 },
- { X86::VANDPSZ256rrk, X86::VANDPSZ256rmk, 0 },
- { X86::VDIVPDZ256rrk, X86::VDIVPDZ256rmk, 0 },
- { X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0 },
- { X86::VINSERTF32x4Z256rrk,X86::VINSERTF32x4Z256rmk, 0 },
- { X86::VINSERTF64x2Z256rrk,X86::VINSERTF64x2Z256rmk, 0 },
- { X86::VINSERTI32x4Z256rrk,X86::VINSERTI32x4Z256rmk, 0 },
- { X86::VINSERTI64x2Z256rrk,X86::VINSERTI64x2Z256rmk, 0 },
- { X86::VMAXCPDZ256rrk, X86::VMAXCPDZ256rmk, 0 },
- { X86::VMAXCPSZ256rrk, X86::VMAXCPSZ256rmk, 0 },
- { X86::VMAXPDZ256rrk, X86::VMAXPDZ256rmk, 0 },
- { X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0 },
- { X86::VMINCPDZ256rrk, X86::VMINCPDZ256rmk, 0 },
- { X86::VMINCPSZ256rrk, X86::VMINCPSZ256rmk, 0 },
- { X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0 },
- { X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0 },
- { X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0 },
- { X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 },
- { X86::VORPDZ256rrk, X86::VORPDZ256rmk, 0 },
- { X86::VORPSZ256rrk, X86::VORPSZ256rmk, 0 },
- { X86::VPACKSSDWZ256rrk, X86::VPACKSSDWZ256rmk, 0 },
- { X86::VPACKSSWBZ256rrk, X86::VPACKSSWBZ256rmk, 0 },
- { X86::VPACKUSDWZ256rrk, X86::VPACKUSDWZ256rmk, 0 },
- { X86::VPACKUSWBZ256rrk, X86::VPACKUSWBZ256rmk, 0 },
- { X86::VPADDBZ256rrk, X86::VPADDBZ256rmk, 0 },
- { X86::VPADDDZ256rrk, X86::VPADDDZ256rmk, 0 },
- { X86::VPADDQZ256rrk, X86::VPADDQZ256rmk, 0 },
- { X86::VPADDSBZ256rrk, X86::VPADDSBZ256rmk, 0 },
- { X86::VPADDSWZ256rrk, X86::VPADDSWZ256rmk, 0 },
- { X86::VPADDUSBZ256rrk, X86::VPADDUSBZ256rmk, 0 },
- { X86::VPADDUSWZ256rrk, X86::VPADDUSWZ256rmk, 0 },
- { X86::VPADDWZ256rrk, X86::VPADDWZ256rmk, 0 },
- { X86::VPALIGNRZ256rrik, X86::VPALIGNRZ256rmik, 0 },
- { X86::VPANDDZ256rrk, X86::VPANDDZ256rmk, 0 },
- { X86::VPANDNDZ256rrk, X86::VPANDNDZ256rmk, 0 },
- { X86::VPANDNQZ256rrk, X86::VPANDNQZ256rmk, 0 },
- { X86::VPANDQZ256rrk, X86::VPANDQZ256rmk, 0 },
- { X86::VPAVGBZ256rrk, X86::VPAVGBZ256rmk, 0 },
- { X86::VPAVGWZ256rrk, X86::VPAVGWZ256rmk, 0 },
- { X86::VPERMBZ256rrk, X86::VPERMBZ256rmk, 0 },
- { X86::VPERMDZ256rrk, X86::VPERMDZ256rmk, 0 },
- { X86::VPERMI2B256rrk, X86::VPERMI2B256rmk, 0 },
- { X86::VPERMI2D256rrk, X86::VPERMI2D256rmk, 0 },
- { X86::VPERMI2PD256rrk, X86::VPERMI2PD256rmk, 0 },
- { X86::VPERMI2PS256rrk, X86::VPERMI2PS256rmk, 0 },
- { X86::VPERMI2Q256rrk, X86::VPERMI2Q256rmk, 0 },
- { X86::VPERMI2W256rrk, X86::VPERMI2W256rmk, 0 },
- { X86::VPERMILPDZ256rrk, X86::VPERMILPDZ256rmk, 0 },
- { X86::VPERMILPSZ256rrk, X86::VPERMILPSZ256rmk, 0 },
- { X86::VPERMPDZ256rrk, X86::VPERMPDZ256rmk, 0 },
- { X86::VPERMPSZ256rrk, X86::VPERMPSZ256rmk, 0 },
- { X86::VPERMQZ256rrk, X86::VPERMQZ256rmk, 0 },
- { X86::VPERMT2B256rrk, X86::VPERMT2B256rmk, 0 },
- { X86::VPERMT2D256rrk, X86::VPERMT2D256rmk, 0 },
- { X86::VPERMT2PD256rrk, X86::VPERMT2PD256rmk, 0 },
- { X86::VPERMT2PS256rrk, X86::VPERMT2PS256rmk, 0 },
- { X86::VPERMT2Q256rrk, X86::VPERMT2Q256rmk, 0 },
- { X86::VPERMT2W256rrk, X86::VPERMT2W256rmk, 0 },
- { X86::VPERMWZ256rrk, X86::VPERMWZ256rmk, 0 },
- { X86::VPMADD52HUQZ256rk, X86::VPMADD52HUQZ256mk, 0 },
- { X86::VPMADD52LUQZ256rk, X86::VPMADD52LUQZ256mk, 0 },
- { X86::VPMADDUBSWZ256rrk, X86::VPMADDUBSWZ256rmk, 0 },
- { X86::VPMADDWDZ256rrk, X86::VPMADDWDZ256rmk, 0 },
- { X86::VPMAXSBZ256rrk, X86::VPMAXSBZ256rmk, 0 },
- { X86::VPMAXSDZ256rrk, X86::VPMAXSDZ256rmk, 0 },
- { X86::VPMAXSQZ256rrk, X86::VPMAXSQZ256rmk, 0 },
- { X86::VPMAXSWZ256rrk, X86::VPMAXSWZ256rmk, 0 },
- { X86::VPMAXUBZ256rrk, X86::VPMAXUBZ256rmk, 0 },
- { X86::VPMAXUDZ256rrk, X86::VPMAXUDZ256rmk, 0 },
- { X86::VPMAXUQZ256rrk, X86::VPMAXUQZ256rmk, 0 },
- { X86::VPMAXUWZ256rrk, X86::VPMAXUWZ256rmk, 0 },
- { X86::VPMINSBZ256rrk, X86::VPMINSBZ256rmk, 0 },
- { X86::VPMINSDZ256rrk, X86::VPMINSDZ256rmk, 0 },
- { X86::VPMINSQZ256rrk, X86::VPMINSQZ256rmk, 0 },
- { X86::VPMINSWZ256rrk, X86::VPMINSWZ256rmk, 0 },
- { X86::VPMINUBZ256rrk, X86::VPMINUBZ256rmk, 0 },
- { X86::VPMINUDZ256rrk, X86::VPMINUDZ256rmk, 0 },
- { X86::VPMINUQZ256rrk, X86::VPMINUQZ256rmk, 0 },
- { X86::VPMINUWZ256rrk, X86::VPMINUWZ256rmk, 0 },
- { X86::VPMULDQZ256rrk, X86::VPMULDQZ256rmk, 0 },
- { X86::VPMULLDZ256rrk, X86::VPMULLDZ256rmk, 0 },
- { X86::VPMULLQZ256rrk, X86::VPMULLQZ256rmk, 0 },
- { X86::VPMULLWZ256rrk, X86::VPMULLWZ256rmk, 0 },
- { X86::VPMULUDQZ256rrk, X86::VPMULUDQZ256rmk, 0 },
- { X86::VPORDZ256rrk, X86::VPORDZ256rmk, 0 },
- { X86::VPORQZ256rrk, X86::VPORQZ256rmk, 0 },
- { X86::VPSHUFBZ256rrk, X86::VPSHUFBZ256rmk, 0 },
- { X86::VPSLLDZ256rrk, X86::VPSLLDZ256rmk, 0 },
- { X86::VPSLLQZ256rrk, X86::VPSLLQZ256rmk, 0 },
- { X86::VPSLLVDZ256rrk, X86::VPSLLVDZ256rmk, 0 },
- { X86::VPSLLVQZ256rrk, X86::VPSLLVQZ256rmk, 0 },
- { X86::VPSLLVWZ256rrk, X86::VPSLLVWZ256rmk, 0 },
- { X86::VPSLLWZ256rrk, X86::VPSLLWZ256rmk, 0 },
- { X86::VPSRADZ256rrk, X86::VPSRADZ256rmk, 0 },
- { X86::VPSRAQZ256rrk, X86::VPSRAQZ256rmk, 0 },
- { X86::VPSRAVDZ256rrk, X86::VPSRAVDZ256rmk, 0 },
- { X86::VPSRAVQZ256rrk, X86::VPSRAVQZ256rmk, 0 },
- { X86::VPSRAVWZ256rrk, X86::VPSRAVWZ256rmk, 0 },
- { X86::VPSRAWZ256rrk, X86::VPSRAWZ256rmk, 0 },
- { X86::VPSRLDZ256rrk, X86::VPSRLDZ256rmk, 0 },
- { X86::VPSRLQZ256rrk, X86::VPSRLQZ256rmk, 0 },
- { X86::VPSRLVDZ256rrk, X86::VPSRLVDZ256rmk, 0 },
- { X86::VPSRLVQZ256rrk, X86::VPSRLVQZ256rmk, 0 },
- { X86::VPSRLVWZ256rrk, X86::VPSRLVWZ256rmk, 0 },
- { X86::VPSRLWZ256rrk, X86::VPSRLWZ256rmk, 0 },
- { X86::VPSUBBZ256rrk, X86::VPSUBBZ256rmk, 0 },
- { X86::VPSUBDZ256rrk, X86::VPSUBDZ256rmk, 0 },
- { X86::VPSUBQZ256rrk, X86::VPSUBQZ256rmk, 0 },
- { X86::VPSUBSBZ256rrk, X86::VPSUBSBZ256rmk, 0 },
- { X86::VPSUBSWZ256rrk, X86::VPSUBSWZ256rmk, 0 },
- { X86::VPSUBUSBZ256rrk, X86::VPSUBUSBZ256rmk, 0 },
- { X86::VPSUBUSWZ256rrk, X86::VPSUBUSWZ256rmk, 0 },
- { X86::VPSUBWZ256rrk, X86::VPSUBWZ256rmk, 0 },
- { X86::VPTERNLOGDZ256rrik, X86::VPTERNLOGDZ256rmik, 0 },
- { X86::VPTERNLOGQZ256rrik, X86::VPTERNLOGQZ256rmik, 0 },
- { X86::VPUNPCKHBWZ256rrk, X86::VPUNPCKHBWZ256rmk, 0 },
- { X86::VPUNPCKHDQZ256rrk, X86::VPUNPCKHDQZ256rmk, 0 },
- { X86::VPUNPCKHQDQZ256rrk, X86::VPUNPCKHQDQZ256rmk, 0 },
- { X86::VPUNPCKHWDZ256rrk, X86::VPUNPCKHWDZ256rmk, 0 },
- { X86::VPUNPCKLBWZ256rrk, X86::VPUNPCKLBWZ256rmk, 0 },
- { X86::VPUNPCKLDQZ256rrk, X86::VPUNPCKLDQZ256rmk, 0 },
- { X86::VPUNPCKLQDQZ256rrk, X86::VPUNPCKLQDQZ256rmk, 0 },
- { X86::VPUNPCKLWDZ256rrk, X86::VPUNPCKLWDZ256rmk, 0 },
- { X86::VPXORDZ256rrk, X86::VPXORDZ256rmk, 0 },
- { X86::VPXORQZ256rrk, X86::VPXORQZ256rmk, 0 },
- { X86::VSHUFPDZ256rrik, X86::VSHUFPDZ256rmik, 0 },
- { X86::VSHUFPSZ256rrik, X86::VSHUFPSZ256rmik, 0 },
- { X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 },
- { X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 },
- { X86::VUNPCKHPDZ256rrk, X86::VUNPCKHPDZ256rmk, 0 },
- { X86::VUNPCKHPSZ256rrk, X86::VUNPCKHPSZ256rmk, 0 },
- { X86::VUNPCKLPDZ256rrk, X86::VUNPCKLPDZ256rmk, 0 },
- { X86::VUNPCKLPSZ256rrk, X86::VUNPCKLPSZ256rmk, 0 },
- { X86::VXORPDZ256rrk, X86::VXORPDZ256rmk, 0 },
- { X86::VXORPSZ256rrk, X86::VXORPSZ256rmk, 0 },
-
- // AVX-512{F,VL} foldable instructions 128-bit
- { X86::VADDPDZ128rrk, X86::VADDPDZ128rmk, 0 },
- { X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0 },
- { X86::VALIGNDZ128rrik, X86::VALIGNDZ128rmik, 0 },
- { X86::VALIGNQZ128rrik, X86::VALIGNQZ128rmik, 0 },
- { X86::VANDNPDZ128rrk, X86::VANDNPDZ128rmk, 0 },
- { X86::VANDNPSZ128rrk, X86::VANDNPSZ128rmk, 0 },
- { X86::VANDPDZ128rrk, X86::VANDPDZ128rmk, 0 },
- { X86::VANDPSZ128rrk, X86::VANDPSZ128rmk, 0 },
- { X86::VDIVPDZ128rrk, X86::VDIVPDZ128rmk, 0 },
- { X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0 },
- { X86::VMAXCPDZ128rrk, X86::VMAXCPDZ128rmk, 0 },
- { X86::VMAXCPSZ128rrk, X86::VMAXCPSZ128rmk, 0 },
- { X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 },
- { X86::VMAXPSZ128rrk, X86::VMAXPSZ128rmk, 0 },
- { X86::VMINCPDZ128rrk, X86::VMINCPDZ128rmk, 0 },
- { X86::VMINCPSZ128rrk, X86::VMINCPSZ128rmk, 0 },
- { X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0 },
- { X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0 },
- { X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0 },
- { X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 },
- { X86::VORPDZ128rrk, X86::VORPDZ128rmk, 0 },
- { X86::VORPSZ128rrk, X86::VORPSZ128rmk, 0 },
- { X86::VPACKSSDWZ128rrk, X86::VPACKSSDWZ128rmk, 0 },
- { X86::VPACKSSWBZ128rrk, X86::VPACKSSWBZ128rmk, 0 },
- { X86::VPACKUSDWZ128rrk, X86::VPACKUSDWZ128rmk, 0 },
- { X86::VPACKUSWBZ128rrk, X86::VPACKUSWBZ128rmk, 0 },
- { X86::VPADDBZ128rrk, X86::VPADDBZ128rmk, 0 },
- { X86::VPADDDZ128rrk, X86::VPADDDZ128rmk, 0 },
- { X86::VPADDQZ128rrk, X86::VPADDQZ128rmk, 0 },
- { X86::VPADDSBZ128rrk, X86::VPADDSBZ128rmk, 0 },
- { X86::VPADDSWZ128rrk, X86::VPADDSWZ128rmk, 0 },
- { X86::VPADDUSBZ128rrk, X86::VPADDUSBZ128rmk, 0 },
- { X86::VPADDUSWZ128rrk, X86::VPADDUSWZ128rmk, 0 },
- { X86::VPADDWZ128rrk, X86::VPADDWZ128rmk, 0 },
- { X86::VPALIGNRZ128rrik, X86::VPALIGNRZ128rmik, 0 },
- { X86::VPANDDZ128rrk, X86::VPANDDZ128rmk, 0 },
- { X86::VPANDNDZ128rrk, X86::VPANDNDZ128rmk, 0 },
- { X86::VPANDNQZ128rrk, X86::VPANDNQZ128rmk, 0 },
- { X86::VPANDQZ128rrk, X86::VPANDQZ128rmk, 0 },
- { X86::VPAVGBZ128rrk, X86::VPAVGBZ128rmk, 0 },
- { X86::VPAVGWZ128rrk, X86::VPAVGWZ128rmk, 0 },
- { X86::VPERMBZ128rrk, X86::VPERMBZ128rmk, 0 },
- { X86::VPERMI2B128rrk, X86::VPERMI2B128rmk, 0 },
- { X86::VPERMI2D128rrk, X86::VPERMI2D128rmk, 0 },
- { X86::VPERMI2PD128rrk, X86::VPERMI2PD128rmk, 0 },
- { X86::VPERMI2PS128rrk, X86::VPERMI2PS128rmk, 0 },
- { X86::VPERMI2Q128rrk, X86::VPERMI2Q128rmk, 0 },
- { X86::VPERMI2W128rrk, X86::VPERMI2W128rmk, 0 },
- { X86::VPERMILPDZ128rrk, X86::VPERMILPDZ128rmk, 0 },
- { X86::VPERMILPSZ128rrk, X86::VPERMILPSZ128rmk, 0 },
- { X86::VPERMT2B128rrk, X86::VPERMT2B128rmk, 0 },
- { X86::VPERMT2D128rrk, X86::VPERMT2D128rmk, 0 },
- { X86::VPERMT2PD128rrk, X86::VPERMT2PD128rmk, 0 },
- { X86::VPERMT2PS128rrk, X86::VPERMT2PS128rmk, 0 },
- { X86::VPERMT2Q128rrk, X86::VPERMT2Q128rmk, 0 },
- { X86::VPERMT2W128rrk, X86::VPERMT2W128rmk, 0 },
- { X86::VPERMWZ128rrk, X86::VPERMWZ128rmk, 0 },
- { X86::VPMADD52HUQZ128rk, X86::VPMADD52HUQZ128mk, 0 },
- { X86::VPMADD52LUQZ128rk, X86::VPMADD52LUQZ128mk, 0 },
- { X86::VPMADDUBSWZ128rrk, X86::VPMADDUBSWZ128rmk, 0 },
- { X86::VPMADDWDZ128rrk, X86::VPMADDWDZ128rmk, 0 },
- { X86::VPMAXSBZ128rrk, X86::VPMAXSBZ128rmk, 0 },
- { X86::VPMAXSDZ128rrk, X86::VPMAXSDZ128rmk, 0 },
- { X86::VPMAXSQZ128rrk, X86::VPMAXSQZ128rmk, 0 },
- { X86::VPMAXSWZ128rrk, X86::VPMAXSWZ128rmk, 0 },
- { X86::VPMAXUBZ128rrk, X86::VPMAXUBZ128rmk, 0 },
- { X86::VPMAXUDZ128rrk, X86::VPMAXUDZ128rmk, 0 },
- { X86::VPMAXUQZ128rrk, X86::VPMAXUQZ128rmk, 0 },
- { X86::VPMAXUWZ128rrk, X86::VPMAXUWZ128rmk, 0 },
- { X86::VPMINSBZ128rrk, X86::VPMINSBZ128rmk, 0 },
- { X86::VPMINSDZ128rrk, X86::VPMINSDZ128rmk, 0 },
- { X86::VPMINSQZ128rrk, X86::VPMINSQZ128rmk, 0 },
- { X86::VPMINSWZ128rrk, X86::VPMINSWZ128rmk, 0 },
- { X86::VPMINUBZ128rrk, X86::VPMINUBZ128rmk, 0 },
- { X86::VPMINUDZ128rrk, X86::VPMINUDZ128rmk, 0 },
- { X86::VPMINUQZ128rrk, X86::VPMINUQZ128rmk, 0 },
- { X86::VPMINUWZ128rrk, X86::VPMINUWZ128rmk, 0 },
- { X86::VPMULDQZ128rrk, X86::VPMULDQZ128rmk, 0 },
- { X86::VPMULLDZ128rrk, X86::VPMULLDZ128rmk, 0 },
- { X86::VPMULLQZ128rrk, X86::VPMULLQZ128rmk, 0 },
- { X86::VPMULLWZ128rrk, X86::VPMULLWZ128rmk, 0 },
- { X86::VPMULUDQZ128rrk, X86::VPMULUDQZ128rmk, 0 },
- { X86::VPORDZ128rrk, X86::VPORDZ128rmk, 0 },
- { X86::VPORQZ128rrk, X86::VPORQZ128rmk, 0 },
- { X86::VPSHUFBZ128rrk, X86::VPSHUFBZ128rmk, 0 },
- { X86::VPSLLDZ128rrk, X86::VPSLLDZ128rmk, 0 },
- { X86::VPSLLQZ128rrk, X86::VPSLLQZ128rmk, 0 },
- { X86::VPSLLVDZ128rrk, X86::VPSLLVDZ128rmk, 0 },
- { X86::VPSLLVQZ128rrk, X86::VPSLLVQZ128rmk, 0 },
- { X86::VPSLLVWZ128rrk, X86::VPSLLVWZ128rmk, 0 },
- { X86::VPSLLWZ128rrk, X86::VPSLLWZ128rmk, 0 },
- { X86::VPSRADZ128rrk, X86::VPSRADZ128rmk, 0 },
- { X86::VPSRAQZ128rrk, X86::VPSRAQZ128rmk, 0 },
- { X86::VPSRAVDZ128rrk, X86::VPSRAVDZ128rmk, 0 },
- { X86::VPSRAVQZ128rrk, X86::VPSRAVQZ128rmk, 0 },
- { X86::VPSRAVWZ128rrk, X86::VPSRAVWZ128rmk, 0 },
- { X86::VPSRAWZ128rrk, X86::VPSRAWZ128rmk, 0 },
- { X86::VPSRLDZ128rrk, X86::VPSRLDZ128rmk, 0 },
- { X86::VPSRLQZ128rrk, X86::VPSRLQZ128rmk, 0 },
- { X86::VPSRLVDZ128rrk, X86::VPSRLVDZ128rmk, 0 },
- { X86::VPSRLVQZ128rrk, X86::VPSRLVQZ128rmk, 0 },
- { X86::VPSRLVWZ128rrk, X86::VPSRLVWZ128rmk, 0 },
- { X86::VPSRLWZ128rrk, X86::VPSRLWZ128rmk, 0 },
- { X86::VPSUBBZ128rrk, X86::VPSUBBZ128rmk, 0 },
- { X86::VPSUBDZ128rrk, X86::VPSUBDZ128rmk, 0 },
- { X86::VPSUBQZ128rrk, X86::VPSUBQZ128rmk, 0 },
- { X86::VPSUBSBZ128rrk, X86::VPSUBSBZ128rmk, 0 },
- { X86::VPSUBSWZ128rrk, X86::VPSUBSWZ128rmk, 0 },
- { X86::VPSUBUSBZ128rrk, X86::VPSUBUSBZ128rmk, 0 },
- { X86::VPSUBUSWZ128rrk, X86::VPSUBUSWZ128rmk, 0 },
- { X86::VPSUBWZ128rrk, X86::VPSUBWZ128rmk, 0 },
- { X86::VPTERNLOGDZ128rrik, X86::VPTERNLOGDZ128rmik, 0 },
- { X86::VPTERNLOGQZ128rrik, X86::VPTERNLOGQZ128rmik, 0 },
- { X86::VPUNPCKHBWZ128rrk, X86::VPUNPCKHBWZ128rmk, 0 },
- { X86::VPUNPCKHDQZ128rrk, X86::VPUNPCKHDQZ128rmk, 0 },
- { X86::VPUNPCKHQDQZ128rrk, X86::VPUNPCKHQDQZ128rmk, 0 },
- { X86::VPUNPCKHWDZ128rrk, X86::VPUNPCKHWDZ128rmk, 0 },
- { X86::VPUNPCKLBWZ128rrk, X86::VPUNPCKLBWZ128rmk, 0 },
- { X86::VPUNPCKLDQZ128rrk, X86::VPUNPCKLDQZ128rmk, 0 },
- { X86::VPUNPCKLQDQZ128rrk, X86::VPUNPCKLQDQZ128rmk, 0 },
- { X86::VPUNPCKLWDZ128rrk, X86::VPUNPCKLWDZ128rmk, 0 },
- { X86::VPXORDZ128rrk, X86::VPXORDZ128rmk, 0 },
- { X86::VPXORQZ128rrk, X86::VPXORQZ128rmk, 0 },
- { X86::VSHUFPDZ128rrik, X86::VSHUFPDZ128rmik, 0 },
- { X86::VSHUFPSZ128rrik, X86::VSHUFPSZ128rmik, 0 },
- { X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 },
- { X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 },
- { X86::VUNPCKHPDZ128rrk, X86::VUNPCKHPDZ128rmk, 0 },
- { X86::VUNPCKHPSZ128rrk, X86::VUNPCKHPSZ128rmk, 0 },
- { X86::VUNPCKLPDZ128rrk, X86::VUNPCKLPDZ128rmk, 0 },
- { X86::VUNPCKLPSZ128rrk, X86::VUNPCKLPSZ128rmk, 0 },
- { X86::VXORPDZ128rrk, X86::VXORPDZ128rmk, 0 },
- { X86::VXORPSZ128rrk, X86::VXORPSZ128rmk, 0 },
-
- // 512-bit three source instructions with zero masking.
- { X86::VPERMI2Brrkz, X86::VPERMI2Brmkz, 0 },
- { X86::VPERMI2Drrkz, X86::VPERMI2Drmkz, 0 },
- { X86::VPERMI2PSrrkz, X86::VPERMI2PSrmkz, 0 },
- { X86::VPERMI2PDrrkz, X86::VPERMI2PDrmkz, 0 },
- { X86::VPERMI2Qrrkz, X86::VPERMI2Qrmkz, 0 },
- { X86::VPERMI2Wrrkz, X86::VPERMI2Wrmkz, 0 },
- { X86::VPERMT2Brrkz, X86::VPERMT2Brmkz, 0 },
- { X86::VPERMT2Drrkz, X86::VPERMT2Drmkz, 0 },
- { X86::VPERMT2PSrrkz, X86::VPERMT2PSrmkz, 0 },
- { X86::VPERMT2PDrrkz, X86::VPERMT2PDrmkz, 0 },
- { X86::VPERMT2Qrrkz, X86::VPERMT2Qrmkz, 0 },
- { X86::VPERMT2Wrrkz, X86::VPERMT2Wrmkz, 0 },
- { X86::VPMADD52HUQZrkz, X86::VPMADD52HUQZmkz, 0 },
- { X86::VPMADD52LUQZrkz, X86::VPMADD52LUQZmkz, 0 },
- { X86::VPTERNLOGDZrrikz, X86::VPTERNLOGDZrmikz, 0 },
- { X86::VPTERNLOGQZrrikz, X86::VPTERNLOGQZrmikz, 0 },
-
- // 256-bit three source instructions with zero masking.
- { X86::VPERMI2B256rrkz, X86::VPERMI2B256rmkz, 0 },
- { X86::VPERMI2D256rrkz, X86::VPERMI2D256rmkz, 0 },
- { X86::VPERMI2PD256rrkz, X86::VPERMI2PD256rmkz, 0 },
- { X86::VPERMI2PS256rrkz, X86::VPERMI2PS256rmkz, 0 },
- { X86::VPERMI2Q256rrkz, X86::VPERMI2Q256rmkz, 0 },
- { X86::VPERMI2W256rrkz, X86::VPERMI2W256rmkz, 0 },
- { X86::VPERMT2B256rrkz, X86::VPERMT2B256rmkz, 0 },
- { X86::VPERMT2D256rrkz, X86::VPERMT2D256rmkz, 0 },
- { X86::VPERMT2PD256rrkz, X86::VPERMT2PD256rmkz, 0 },
- { X86::VPERMT2PS256rrkz, X86::VPERMT2PS256rmkz, 0 },
- { X86::VPERMT2Q256rrkz, X86::VPERMT2Q256rmkz, 0 },
- { X86::VPERMT2W256rrkz, X86::VPERMT2W256rmkz, 0 },
- { X86::VPMADD52HUQZ256rkz, X86::VPMADD52HUQZ256mkz, 0 },
- { X86::VPMADD52LUQZ256rkz, X86::VPMADD52LUQZ256mkz, 0 },
- { X86::VPTERNLOGDZ256rrikz,X86::VPTERNLOGDZ256rmikz, 0 },
- { X86::VPTERNLOGQZ256rrikz,X86::VPTERNLOGQZ256rmikz, 0 },
-
- // 128-bit three source instructions with zero masking.
- { X86::VPERMI2B128rrkz, X86::VPERMI2B128rmkz, 0 },
- { X86::VPERMI2D128rrkz, X86::VPERMI2D128rmkz, 0 },
- { X86::VPERMI2PD128rrkz, X86::VPERMI2PD128rmkz, 0 },
- { X86::VPERMI2PS128rrkz, X86::VPERMI2PS128rmkz, 0 },
- { X86::VPERMI2Q128rrkz, X86::VPERMI2Q128rmkz, 0 },
- { X86::VPERMI2W128rrkz, X86::VPERMI2W128rmkz, 0 },
- { X86::VPERMT2B128rrkz, X86::VPERMT2B128rmkz, 0 },
- { X86::VPERMT2D128rrkz, X86::VPERMT2D128rmkz, 0 },
- { X86::VPERMT2PD128rrkz, X86::VPERMT2PD128rmkz, 0 },
- { X86::VPERMT2PS128rrkz, X86::VPERMT2PS128rmkz, 0 },
- { X86::VPERMT2Q128rrkz, X86::VPERMT2Q128rmkz, 0 },
- { X86::VPERMT2W128rrkz, X86::VPERMT2W128rmkz, 0 },
- { X86::VPMADD52HUQZ128rkz, X86::VPMADD52HUQZ128mkz, 0 },
- { X86::VPMADD52LUQZ128rkz, X86::VPMADD52LUQZ128mkz, 0 },
- { X86::VPTERNLOGDZ128rrikz,X86::VPTERNLOGDZ128rmikz, 0 },
- { X86::VPTERNLOGQZ128rrikz,X86::VPTERNLOGQZ128rmikz, 0 },
- };
-
- for (X86MemoryFoldTableEntry Entry : MemoryFoldTable4) {
- AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
- Entry.RegOp, Entry.MemOp,
- // Index 4, folded load
- Entry.Flags | TB_INDEX_4 | TB_FOLDED_LOAD);
- }
- for (I = X86InstrFMA3Info::rm_begin(); I != E; ++I) {
- if (I.getGroup()->isKMasked()) {
- // Intrinsics need to pass TB_NO_REVERSE.
- if (I.getGroup()->isIntrinsic()) {
- AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
- I.getRegOpcode(), I.getMemOpcode(),
- TB_ALIGN_NONE | TB_INDEX_4 | TB_FOLDED_LOAD | TB_NO_REVERSE);
- } else {
- AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
- I.getRegOpcode(), I.getMemOpcode(),
- TB_ALIGN_NONE | TB_INDEX_4 | TB_FOLDED_LOAD);
- }
- }
- }
-}
-
-void
-X86InstrInfo::AddTableEntry(RegOp2MemOpTableType &R2MTable,
- MemOp2RegOpTableType &M2RTable,
- uint16_t RegOp, uint16_t MemOp, uint16_t Flags) {
- if ((Flags & TB_NO_FORWARD) == 0) {
- assert(!R2MTable.count(RegOp) && "Duplicate entry!");
- R2MTable[RegOp] = std::make_pair(MemOp, Flags);
- }
- if ((Flags & TB_NO_REVERSE) == 0) {
- assert(!M2RTable.count(MemOp) &&
- "Duplicated entries in unfolding maps?");
- M2RTable[MemOp] = std::make_pair(RegOp, Flags);
- }
}
bool
@@ -3867,156 +206,183 @@ bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
return false;
}
-static bool isFrameLoadOpcode(int Opcode) {
+static bool isFrameLoadOpcode(int Opcode, unsigned &MemBytes) {
switch (Opcode) {
default:
return false;
case X86::MOV8rm:
+ case X86::KMOVBkm:
+ MemBytes = 1;
+ return true;
case X86::MOV16rm:
+ case X86::KMOVWkm:
+ MemBytes = 2;
+ return true;
case X86::MOV32rm:
+ case X86::MOVSSrm:
+ case X86::VMOVSSZrm:
+ case X86::VMOVSSrm:
+ case X86::KMOVDkm:
+ MemBytes = 4;
+ return true;
case X86::MOV64rm:
case X86::LD_Fp64m:
- case X86::MOVSSrm:
case X86::MOVSDrm:
+ case X86::VMOVSDrm:
+ case X86::VMOVSDZrm:
+ case X86::MMX_MOVD64rm:
+ case X86::MMX_MOVQ64rm:
+ case X86::KMOVQkm:
+ MemBytes = 8;
+ return true;
case X86::MOVAPSrm:
case X86::MOVUPSrm:
case X86::MOVAPDrm:
case X86::MOVUPDrm:
case X86::MOVDQArm:
case X86::MOVDQUrm:
- case X86::VMOVSSrm:
- case X86::VMOVSDrm:
case X86::VMOVAPSrm:
case X86::VMOVUPSrm:
case X86::VMOVAPDrm:
case X86::VMOVUPDrm:
case X86::VMOVDQArm:
case X86::VMOVDQUrm:
- case X86::VMOVUPSYrm:
+ case X86::VMOVAPSZ128rm:
+ case X86::VMOVUPSZ128rm:
+ case X86::VMOVAPSZ128rm_NOVLX:
+ case X86::VMOVUPSZ128rm_NOVLX:
+ case X86::VMOVAPDZ128rm:
+ case X86::VMOVUPDZ128rm:
+ case X86::VMOVDQU8Z128rm:
+ case X86::VMOVDQU16Z128rm:
+ case X86::VMOVDQA32Z128rm:
+ case X86::VMOVDQU32Z128rm:
+ case X86::VMOVDQA64Z128rm:
+ case X86::VMOVDQU64Z128rm:
+ MemBytes = 16;
+ return true;
case X86::VMOVAPSYrm:
- case X86::VMOVUPDYrm:
+ case X86::VMOVUPSYrm:
case X86::VMOVAPDYrm:
- case X86::VMOVDQUYrm:
+ case X86::VMOVUPDYrm:
case X86::VMOVDQAYrm:
- case X86::MMX_MOVD64rm:
- case X86::MMX_MOVQ64rm:
- case X86::VMOVSSZrm:
- case X86::VMOVSDZrm:
- case X86::VMOVAPSZrm:
- case X86::VMOVAPSZ128rm:
+ case X86::VMOVDQUYrm:
case X86::VMOVAPSZ256rm:
- case X86::VMOVAPSZ128rm_NOVLX:
- case X86::VMOVAPSZ256rm_NOVLX:
- case X86::VMOVUPSZrm:
- case X86::VMOVUPSZ128rm:
case X86::VMOVUPSZ256rm:
- case X86::VMOVUPSZ128rm_NOVLX:
+ case X86::VMOVAPSZ256rm_NOVLX:
case X86::VMOVUPSZ256rm_NOVLX:
- case X86::VMOVAPDZrm:
- case X86::VMOVAPDZ128rm:
case X86::VMOVAPDZ256rm:
- case X86::VMOVUPDZrm:
- case X86::VMOVUPDZ128rm:
case X86::VMOVUPDZ256rm:
- case X86::VMOVDQA32Zrm:
- case X86::VMOVDQA32Z128rm:
+ case X86::VMOVDQU8Z256rm:
+ case X86::VMOVDQU16Z256rm:
case X86::VMOVDQA32Z256rm:
- case X86::VMOVDQU32Zrm:
- case X86::VMOVDQU32Z128rm:
case X86::VMOVDQU32Z256rm:
- case X86::VMOVDQA64Zrm:
- case X86::VMOVDQA64Z128rm:
case X86::VMOVDQA64Z256rm:
- case X86::VMOVDQU64Zrm:
- case X86::VMOVDQU64Z128rm:
case X86::VMOVDQU64Z256rm:
+ MemBytes = 32;
+ return true;
+ case X86::VMOVAPSZrm:
+ case X86::VMOVUPSZrm:
+ case X86::VMOVAPDZrm:
+ case X86::VMOVUPDZrm:
case X86::VMOVDQU8Zrm:
- case X86::VMOVDQU8Z128rm:
- case X86::VMOVDQU8Z256rm:
case X86::VMOVDQU16Zrm:
- case X86::VMOVDQU16Z128rm:
- case X86::VMOVDQU16Z256rm:
- case X86::KMOVBkm:
- case X86::KMOVWkm:
- case X86::KMOVDkm:
- case X86::KMOVQkm:
+ case X86::VMOVDQA32Zrm:
+ case X86::VMOVDQU32Zrm:
+ case X86::VMOVDQA64Zrm:
+ case X86::VMOVDQU64Zrm:
+ MemBytes = 64;
return true;
}
}
-static bool isFrameStoreOpcode(int Opcode) {
+static bool isFrameStoreOpcode(int Opcode, unsigned &MemBytes) {
switch (Opcode) {
- default: break;
+ default:
+ return false;
case X86::MOV8mr:
+ case X86::KMOVBmk:
+ MemBytes = 1;
+ return true;
case X86::MOV16mr:
+ case X86::KMOVWmk:
+ MemBytes = 2;
+ return true;
case X86::MOV32mr:
+ case X86::MOVSSmr:
+ case X86::VMOVSSmr:
+ case X86::VMOVSSZmr:
+ case X86::KMOVDmk:
+ MemBytes = 4;
+ return true;
case X86::MOV64mr:
case X86::ST_FpP64m:
- case X86::MOVSSmr:
case X86::MOVSDmr:
+ case X86::VMOVSDmr:
+ case X86::VMOVSDZmr:
+ case X86::MMX_MOVD64mr:
+ case X86::MMX_MOVQ64mr:
+ case X86::MMX_MOVNTQmr:
+ case X86::KMOVQmk:
+ MemBytes = 8;
+ return true;
case X86::MOVAPSmr:
case X86::MOVUPSmr:
case X86::MOVAPDmr:
case X86::MOVUPDmr:
case X86::MOVDQAmr:
case X86::MOVDQUmr:
- case X86::VMOVSSmr:
- case X86::VMOVSDmr:
case X86::VMOVAPSmr:
case X86::VMOVUPSmr:
case X86::VMOVAPDmr:
case X86::VMOVUPDmr:
case X86::VMOVDQAmr:
case X86::VMOVDQUmr:
+ case X86::VMOVUPSZ128mr:
+ case X86::VMOVAPSZ128mr:
+ case X86::VMOVUPSZ128mr_NOVLX:
+ case X86::VMOVAPSZ128mr_NOVLX:
+ case X86::VMOVUPDZ128mr:
+ case X86::VMOVAPDZ128mr:
+ case X86::VMOVDQA32Z128mr:
+ case X86::VMOVDQU32Z128mr:
+ case X86::VMOVDQA64Z128mr:
+ case X86::VMOVDQU64Z128mr:
+ case X86::VMOVDQU8Z128mr:
+ case X86::VMOVDQU16Z128mr:
+ MemBytes = 16;
+ return true;
case X86::VMOVUPSYmr:
case X86::VMOVAPSYmr:
case X86::VMOVUPDYmr:
case X86::VMOVAPDYmr:
case X86::VMOVDQUYmr:
case X86::VMOVDQAYmr:
- case X86::VMOVSSZmr:
- case X86::VMOVSDZmr:
- case X86::VMOVUPSZmr:
- case X86::VMOVUPSZ128mr:
case X86::VMOVUPSZ256mr:
- case X86::VMOVUPSZ128mr_NOVLX:
- case X86::VMOVUPSZ256mr_NOVLX:
- case X86::VMOVAPSZmr:
- case X86::VMOVAPSZ128mr:
case X86::VMOVAPSZ256mr:
- case X86::VMOVAPSZ128mr_NOVLX:
+ case X86::VMOVUPSZ256mr_NOVLX:
case X86::VMOVAPSZ256mr_NOVLX:
- case X86::VMOVUPDZmr:
- case X86::VMOVUPDZ128mr:
case X86::VMOVUPDZ256mr:
- case X86::VMOVAPDZmr:
- case X86::VMOVAPDZ128mr:
case X86::VMOVAPDZ256mr:
- case X86::VMOVDQA32Zmr:
- case X86::VMOVDQA32Z128mr:
+ case X86::VMOVDQU8Z256mr:
+ case X86::VMOVDQU16Z256mr:
case X86::VMOVDQA32Z256mr:
- case X86::VMOVDQU32Zmr:
- case X86::VMOVDQU32Z128mr:
case X86::VMOVDQU32Z256mr:
- case X86::VMOVDQA64Zmr:
- case X86::VMOVDQA64Z128mr:
case X86::VMOVDQA64Z256mr:
- case X86::VMOVDQU64Zmr:
- case X86::VMOVDQU64Z128mr:
case X86::VMOVDQU64Z256mr:
+ MemBytes = 32;
+ return true;
+ case X86::VMOVUPSZmr:
+ case X86::VMOVAPSZmr:
+ case X86::VMOVUPDZmr:
+ case X86::VMOVAPDZmr:
case X86::VMOVDQU8Zmr:
- case X86::VMOVDQU8Z128mr:
- case X86::VMOVDQU8Z256mr:
case X86::VMOVDQU16Zmr:
- case X86::VMOVDQU16Z128mr:
- case X86::VMOVDQU16Z256mr:
- case X86::MMX_MOVD64mr:
- case X86::MMX_MOVQ64mr:
- case X86::MMX_MOVNTQmr:
- case X86::KMOVBmk:
- case X86::KMOVWmk:
- case X86::KMOVDmk:
- case X86::KMOVQmk:
+ case X86::VMOVDQA32Zmr:
+ case X86::VMOVDQU32Zmr:
+ case X86::VMOVDQA64Zmr:
+ case X86::VMOVDQU64Zmr:
+ MemBytes = 64;
return true;
}
return false;
@@ -4024,7 +390,14 @@ static bool isFrameStoreOpcode(int Opcode) {
unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
- if (isFrameLoadOpcode(MI.getOpcode()))
+ unsigned Dummy;
+ return X86InstrInfo::isLoadFromStackSlot(MI, FrameIndex, Dummy);
+}
+
+unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex,
+ unsigned &MemBytes) const {
+ if (isFrameLoadOpcode(MI.getOpcode(), MemBytes))
if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
return MI.getOperand(0).getReg();
return 0;
@@ -4032,7 +405,8 @@ unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
int &FrameIndex) const {
- if (isFrameLoadOpcode(MI.getOpcode())) {
+ unsigned Dummy;
+ if (isFrameLoadOpcode(MI.getOpcode(), Dummy)) {
unsigned Reg;
if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
return Reg;
@@ -4045,7 +419,14 @@ unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
- if (isFrameStoreOpcode(MI.getOpcode()))
+ unsigned Dummy;
+ return X86InstrInfo::isStoreToStackSlot(MI, FrameIndex, Dummy);
+}
+
+unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex,
+ unsigned &MemBytes) const {
+ if (isFrameStoreOpcode(MI.getOpcode(), MemBytes))
if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
isFrameOperand(MI, 0, FrameIndex))
return MI.getOperand(X86::AddrNumOperands).getReg();
@@ -4054,7 +435,8 @@ unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
int &FrameIndex) const {
- if (isFrameStoreOpcode(MI.getOpcode())) {
+ unsigned Dummy;
+ if (isFrameStoreOpcode(MI.getOpcode(), Dummy)) {
unsigned Reg;
if ((Reg = isStoreToStackSlot(MI, FrameIndex)))
return Reg;
@@ -4225,8 +607,8 @@ bool X86InstrInfo::isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
// This instruction defines EFLAGS, no need to look any further.
return true;
++Iter;
- // Skip over DBG_VALUE.
- while (Iter != E && Iter->isDebugValue())
+ // Skip over debug instructions.
+ while (Iter != E && Iter->isDebugInstr())
++Iter;
}
@@ -4248,8 +630,8 @@ bool X86InstrInfo::isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
return !MBB.isLiveIn(X86::EFLAGS);
--Iter;
- // Skip over DBG_VALUE.
- while (Iter != B && Iter->isDebugValue())
+ // Skip over debug instructions.
+ while (Iter != B && Iter->isDebugInstr())
--Iter;
bool SawKill = false;
@@ -4928,34 +1310,14 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
/// Case 0 - Possible to commute the first and second operands.
/// Case 1 - Possible to commute the first and third operands.
/// Case 2 - Possible to commute the second and third operands.
-static int getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
- unsigned SrcOpIdx2) {
+static unsigned getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
+ unsigned SrcOpIdx2) {
// Put the lowest index to SrcOpIdx1 to simplify the checks below.
if (SrcOpIdx1 > SrcOpIdx2)
std::swap(SrcOpIdx1, SrcOpIdx2);
unsigned Op1 = 1, Op2 = 2, Op3 = 3;
if (X86II::isKMasked(TSFlags)) {
- // The k-mask operand cannot be commuted.
- if (SrcOpIdx1 == 2)
- return -1;
-
- // For k-zero-masked operations it is Ok to commute the first vector
- // operand.
- // For regular k-masked operations a conservative choice is done as the
- // elements of the first vector operand, for which the corresponding bit
- // in the k-mask operand is set to 0, are copied to the result of the
- // instruction.
- // TODO/FIXME: The commute still may be legal if it is known that the
- // k-mask operand is set to either all ones or all zeroes.
- // It is also Ok to commute the 1st operand if all users of MI use only
- // the elements enabled by the k-mask operand. For example,
- // v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i]
- // : v1[i];
- // VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 ->
- // // Ok, to commute v1 in FMADD213PSZrk.
- if (X86II::isKMergeMasked(TSFlags) && SrcOpIdx1 == Op1)
- return -1;
Op2++;
Op3++;
}
@@ -4966,7 +1328,7 @@ static int getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
return 1;
if (SrcOpIdx1 == Op2 && SrcOpIdx2 == Op3)
return 2;
- return -1;
+ llvm_unreachable("Unknown three src commute case.");
}
unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
@@ -4975,23 +1337,19 @@ unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
unsigned Opc = MI.getOpcode();
- // Put the lowest index to SrcOpIdx1 to simplify the checks below.
- if (SrcOpIdx1 > SrcOpIdx2)
- std::swap(SrcOpIdx1, SrcOpIdx2);
-
// TODO: Commuting the 1st operand of FMA*_Int requires some additional
// analysis. The commute optimization is legal only if all users of FMA*_Int
// use only the lowest element of the FMA*_Int instruction. Such analysis are
// not implemented yet. So, just return 0 in that case.
// When such analysis are available this place will be the right place for
// calling it.
- if (FMA3Group.isIntrinsic() && SrcOpIdx1 == 1)
- return 0;
+ assert(!(FMA3Group.isIntrinsic() && (SrcOpIdx1 == 1 || SrcOpIdx2 == 1)) &&
+ "Intrinsic instructions can't commute operand 1");
// Determine which case this commute is or if it can't be done.
- int Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2);
- if (Case < 0)
- return 0;
+ unsigned Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1,
+ SrcOpIdx2);
+ assert(Case < 3 && "Unexpected case number!");
// Define the FMA forms mapping array that helps to map input FMA form
// to output FMA form to preserve the operation semantics after
@@ -5018,15 +1376,9 @@ unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
};
unsigned FMAForms[3];
- if (FMA3Group.isRegOpcodeFromGroup(Opc)) {
- FMAForms[0] = FMA3Group.getReg132Opcode();
- FMAForms[1] = FMA3Group.getReg213Opcode();
- FMAForms[2] = FMA3Group.getReg231Opcode();
- } else {
- FMAForms[0] = FMA3Group.getMem132Opcode();
- FMAForms[1] = FMA3Group.getMem213Opcode();
- FMAForms[2] = FMA3Group.getMem231Opcode();
- }
+ FMAForms[0] = FMA3Group.get132Opcode();
+ FMAForms[1] = FMA3Group.get213Opcode();
+ FMAForms[2] = FMA3Group.get231Opcode();
unsigned FormIndex;
for (FormIndex = 0; FormIndex < 3; FormIndex++)
if (Opc == FMAForms[FormIndex])
@@ -5037,14 +1389,12 @@ unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
return FMAForms[FormIndex];
}
-static bool commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
+static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
unsigned SrcOpIdx2) {
- uint64_t TSFlags = MI.getDesc().TSFlags;
-
// Determine which case this commute is or if it can't be done.
- int Case = getThreeSrcCommuteCase(TSFlags, SrcOpIdx1, SrcOpIdx2);
- if (Case < 0)
- return false;
+ unsigned Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1,
+ SrcOpIdx2);
+ assert(Case < 3 && "Unexpected case value!");
// For each case we need to swap two pairs of bits in the final immediate.
static const uint8_t SwapMasks[3][4] = {
@@ -5063,11 +1413,9 @@ static bool commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
if (Imm & SwapMasks[Case][2]) NewImm |= SwapMasks[Case][3];
if (Imm & SwapMasks[Case][3]) NewImm |= SwapMasks[Case][2];
MI.getOperand(MI.getNumOperands()-1).setImm(NewImm);
-
- return true;
}
-// Returns true if this is a VPERMI2 or VPERMT2 instrution that can be
+// Returns true if this is a VPERMI2 or VPERMT2 instruction that can be
// commuted.
static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
#define VPERM_CASES(Suffix) \
@@ -5108,7 +1456,7 @@ static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
}
// Returns commuted opcode for VPERMI2 and VPERMT2 instructions by switching
-// from the I opcod to the T opcode and vice versa.
+// from the I opcode to the T opcode and vice versa.
static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) {
#define VPERM_CASES(Orig, New) \
case X86::Orig##128rr: return X86::New##128rr; \
@@ -5200,9 +1548,29 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
}
case X86::BLENDPDrri:
case X86::BLENDPSrri:
- case X86::PBLENDWrri:
case X86::VBLENDPDrri:
case X86::VBLENDPSrri:
+ // If we're optimizing for size, try to use MOVSD/MOVSS.
+ if (MI.getParent()->getParent()->getFunction().optForSize()) {
+ unsigned Mask, Opc;
+ switch (MI.getOpcode()) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::BLENDPDrri: Opc = X86::MOVSDrr; Mask = 0x03; break;
+ case X86::BLENDPSrri: Opc = X86::MOVSSrr; Mask = 0x0F; break;
+ case X86::VBLENDPDrri: Opc = X86::VMOVSDrr; Mask = 0x03; break;
+ case X86::VBLENDPSrri: Opc = X86::VMOVSSrr; Mask = 0x0F; break;
+ }
+ if ((MI.getOperand(3).getImm() ^ Mask) == 1) {
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(Opc));
+ WorkingMI.RemoveOperand(3);
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI,
+ /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ }
+ LLVM_FALLTHROUGH;
+ case X86::PBLENDWrri:
case X86::VBLENDPDYrri:
case X86::VBLENDPSYrri:
case X86::VPBLENDDrri:
@@ -5236,8 +1604,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
case X86::VMOVSDrr:
case X86::VMOVSSrr:{
// On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD.
- if (!Subtarget.hasSSE41())
- return nullptr;
+ assert(Subtarget.hasSSE41() && "Commuting MOVSD/MOVSS requires SSE41!");
unsigned Mask, Opc;
switch (MI.getOpcode()) {
@@ -5270,37 +1637,6 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
- case X86::CMPSDrr:
- case X86::CMPSSrr:
- case X86::CMPPDrri:
- case X86::CMPPSrri:
- case X86::VCMPSDrr:
- case X86::VCMPSSrr:
- case X86::VCMPPDrri:
- case X86::VCMPPSrri:
- case X86::VCMPPDYrri:
- case X86::VCMPPSYrri:
- case X86::VCMPSDZrr:
- case X86::VCMPSSZrr:
- case X86::VCMPPDZrri:
- case X86::VCMPPSZrri:
- case X86::VCMPPDZ128rri:
- case X86::VCMPPSZ128rri:
- case X86::VCMPPDZ256rri:
- case X86::VCMPPSZ256rri: {
- // Float comparison can be safely commuted for
- // Ordered/Unordered/Equal/NotEqual tests
- unsigned Imm = MI.getOperand(3).getImm() & 0x7;
- switch (Imm) {
- case 0x00: // EQUAL
- case 0x03: // UNORDERED
- case 0x04: // NOT EQUAL
- case 0x07: // ORDERED
- return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
- default:
- return nullptr;
- }
- }
case X86::VPCMPBZ128rri: case X86::VPCMPUBZ128rri:
case X86::VPCMPBZ256rri: case X86::VPCMPUBZ256rri:
case X86::VPCMPBZrri: case X86::VPCMPUBZrri:
@@ -5327,18 +1663,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
case X86::VPCMPWZrrik: case X86::VPCMPUWZrrik: {
// Flip comparison mode immediate (if necessary).
unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x7;
- switch (Imm) {
- default: llvm_unreachable("Unreachable!");
- case 0x01: Imm = 0x06; break; // LT -> NLE
- case 0x02: Imm = 0x05; break; // LE -> NLT
- case 0x05: Imm = 0x02; break; // NLT -> LE
- case 0x06: Imm = 0x01; break; // NLE -> LT
- case 0x00: // EQ
- case 0x03: // FALSE
- case 0x04: // NE
- case 0x07: // TRUE
- break;
- }
+ Imm = X86::getSwappedVPCMPImm(Imm);
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(Imm);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
@@ -5350,18 +1675,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
case X86::VPCOMWri: case X86::VPCOMUWri: {
// Flip comparison mode immediate (if necessary).
unsigned Imm = MI.getOperand(3).getImm() & 0x7;
- switch (Imm) {
- default: llvm_unreachable("Unreachable!");
- case 0x00: Imm = 0x02; break; // LT -> GT
- case 0x01: Imm = 0x03; break; // LE -> GE
- case 0x02: Imm = 0x00; break; // GT -> LT
- case 0x03: Imm = 0x01; break; // GE -> LE
- case 0x04: // EQ
- case 0x05: // NE
- case 0x06: // FALSE
- case 0x07: // TRUE
- break;
- }
+ Imm = X86::getSwappedVPCOMImm(Imm);
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.getOperand(3).setImm(Imm);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
@@ -5379,15 +1693,22 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
OpIdx1, OpIdx2);
}
case X86::MOVHLPSrr:
- case X86::UNPCKHPDrr: {
- if (!Subtarget.hasSSE2())
- return nullptr;
+ case X86::UNPCKHPDrr:
+ case X86::VMOVHLPSrr:
+ case X86::VUNPCKHPDrr:
+ case X86::VMOVHLPSZrr:
+ case X86::VUNPCKHPDZ128rr: {
+ assert(Subtarget.hasSSE2() && "Commuting MOVHLP/UNPCKHPD requires SSE2!");
unsigned Opc = MI.getOpcode();
switch (Opc) {
- default: llvm_unreachable("Unreachable!");
- case X86::MOVHLPSrr: Opc = X86::UNPCKHPDrr; break;
- case X86::UNPCKHPDrr: Opc = X86::MOVHLPSrr; break;
+ default: llvm_unreachable("Unreachable!");
+ case X86::MOVHLPSrr: Opc = X86::UNPCKHPDrr; break;
+ case X86::UNPCKHPDrr: Opc = X86::MOVHLPSrr; break;
+ case X86::VMOVHLPSrr: Opc = X86::VUNPCKHPDrr; break;
+ case X86::VUNPCKHPDrr: Opc = X86::VMOVHLPSrr; break;
+ case X86::VMOVHLPSZrr: Opc = X86::VUNPCKHPDZ128rr; break;
+ case X86::VUNPCKHPDZ128rr: Opc = X86::VMOVHLPSZrr; break;
}
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.setDesc(get(Opc));
@@ -5498,8 +1819,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
case X86::VPTERNLOGQZ256rmbikz:
case X86::VPTERNLOGQZrmbikz: {
auto &WorkingMI = cloneIfNew(MI);
- if (!commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2))
- return nullptr;
+ commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
@@ -5512,13 +1832,11 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
OpIdx1, OpIdx2);
}
- const X86InstrFMA3Group *FMA3Group =
- X86InstrFMA3Info::getFMA3Group(MI.getOpcode());
+ const X86InstrFMA3Group *FMA3Group = getFMA3Group(MI.getOpcode(),
+ MI.getDesc().TSFlags);
if (FMA3Group) {
unsigned Opc =
getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group);
- if (Opc == 0)
- return nullptr;
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.setDesc(get(Opc));
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
@@ -5530,27 +1848,32 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
}
}
-bool X86InstrInfo::findFMA3CommutedOpIndices(
- const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2,
- const X86InstrFMA3Group &FMA3Group) const {
-
- if (!findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2))
- return false;
-
- // Check if we can adjust the opcode to preserve the semantics when
- // commute the register operands.
- return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2, FMA3Group) != 0;
-}
-
-bool X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
- unsigned &SrcOpIdx1,
- unsigned &SrcOpIdx2) const {
+bool
+X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
+ unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2,
+ bool IsIntrinsic) const {
uint64_t TSFlags = MI.getDesc().TSFlags;
unsigned FirstCommutableVecOp = 1;
unsigned LastCommutableVecOp = 3;
- unsigned KMaskOp = 0;
+ unsigned KMaskOp = -1U;
if (X86II::isKMasked(TSFlags)) {
+ // For k-zero-masked operations it is Ok to commute the first vector
+ // operand.
+ // For regular k-masked operations a conservative choice is done as the
+ // elements of the first vector operand, for which the corresponding bit
+ // in the k-mask operand is set to 0, are copied to the result of the
+ // instruction.
+ // TODO/FIXME: The commute still may be legal if it is known that the
+ // k-mask operand is set to either all ones or all zeroes.
+ // It is also Ok to commute the 1st operand if all users of MI use only
+ // the elements enabled by the k-mask operand. For example,
+ // v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i]
+ // : v1[i];
+ // VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 ->
+ // // Ok, to commute v1 in FMADD213PSZrk.
+
// The k-mask operand has index = 2 for masked and zero-masked operations.
KMaskOp = 2;
@@ -5560,6 +1883,10 @@ bool X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
FirstCommutableVecOp = 3;
LastCommutableVecOp++;
+ } else if (IsIntrinsic) {
+ // Commuting the first operand of an intrinsic instruction isn't possible
+ // unless we can prove that only the lowest element of the result is used.
+ FirstCommutableVecOp = 2;
}
if (isMem(MI, LastCommutableVecOp))
@@ -5666,11 +1993,19 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
case X86::MOVSDrr:
case X86::MOVSSrr:
case X86::VMOVSDrr:
- case X86::VMOVSSrr: {
+ case X86::VMOVSSrr:
if (Subtarget.hasSSE41())
return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
return false;
- }
+ case X86::MOVHLPSrr:
+ case X86::UNPCKHPDrr:
+ case X86::VMOVHLPSrr:
+ case X86::VUNPCKHPDrr:
+ case X86::VMOVHLPSZrr:
+ case X86::VUNPCKHPDZ128rr:
+ if (Subtarget.hasSSE2())
+ return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+ return false;
case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi:
case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi:
case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi:
@@ -5722,7 +2057,7 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
case X86::VPMADD52LUQZrkz: {
unsigned CommutableOpIdx1 = 2;
unsigned CommutableOpIdx2 = 3;
- if (Desc.TSFlags & X86II::EVEX_K) {
+ if (X86II::isKMasked(Desc.TSFlags)) {
// Skip the mask register.
++CommutableOpIdx1;
++CommutableOpIdx2;
@@ -5738,14 +2073,15 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
}
default:
- const X86InstrFMA3Group *FMA3Group =
- X86InstrFMA3Info::getFMA3Group(MI.getOpcode());
+ const X86InstrFMA3Group *FMA3Group = getFMA3Group(MI.getOpcode(),
+ MI.getDesc().TSFlags);
if (FMA3Group)
- return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2, *FMA3Group);
+ return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2,
+ FMA3Group->isIntrinsic());
// Handled masked instructions since we need to skip over the mask input
// and the preserved input.
- if (Desc.TSFlags & X86II::EVEX_K) {
+ if (X86II::isKMasked(Desc.TSFlags)) {
// First assume that the first input is the mask operand and skip past it.
unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1;
unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2;
@@ -5758,11 +2094,11 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
// be a 3 input instruction and we want the first two non-mask inputs.
// Otherwise this is a 2 input instruction with a preserved input and
// mask, so we need to move the indices to skip one more input.
- if (Desc.TSFlags & X86II::EVEX_Z)
- --CommutableOpIdx1;
- else {
+ if (X86II::isKMergeMasked(Desc.TSFlags)) {
++CommutableOpIdx1;
++CommutableOpIdx2;
+ } else {
+ --CommutableOpIdx1;
}
}
@@ -5782,7 +2118,7 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
return false;
}
-static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) {
+X86::CondCode X86::getCondFromBranchOpc(unsigned BrOpc) {
switch (BrOpc) {
default: return X86::COND_INVALID;
case X86::JE_1: return X86::COND_E;
@@ -5805,7 +2141,7 @@ static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) {
}
/// Return condition code of a SET opcode.
-static X86::CondCode getCondFromSETOpc(unsigned Opc) {
+X86::CondCode X86::getCondFromSETOpc(unsigned Opc) {
switch (Opc) {
default: return X86::COND_INVALID;
case X86::SETAr: case X86::SETAm: return X86::COND_A;
@@ -6061,6 +2397,59 @@ unsigned X86::getCMovFromCond(CondCode CC, unsigned RegBytes,
}
}
+/// Get the VPCMP immediate for the given condition.
+unsigned X86::getVPCMPImmForCond(ISD::CondCode CC) {
+ switch (CC) {
+ default: llvm_unreachable("Unexpected SETCC condition");
+ case ISD::SETNE: return 4;
+ case ISD::SETEQ: return 0;
+ case ISD::SETULT:
+ case ISD::SETLT: return 1;
+ case ISD::SETUGT:
+ case ISD::SETGT: return 6;
+ case ISD::SETUGE:
+ case ISD::SETGE: return 5;
+ case ISD::SETULE:
+ case ISD::SETLE: return 2;
+ }
+}
+
+/// Get the VPCMP immediate if the opcodes are swapped.
+unsigned X86::getSwappedVPCMPImm(unsigned Imm) {
+ switch (Imm) {
+ default: llvm_unreachable("Unreachable!");
+ case 0x01: Imm = 0x06; break; // LT -> NLE
+ case 0x02: Imm = 0x05; break; // LE -> NLT
+ case 0x05: Imm = 0x02; break; // NLT -> LE
+ case 0x06: Imm = 0x01; break; // NLE -> LT
+ case 0x00: // EQ
+ case 0x03: // FALSE
+ case 0x04: // NE
+ case 0x07: // TRUE
+ break;
+ }
+
+ return Imm;
+}
+
+/// Get the VPCOM immediate if the opcodes are swapped.
+unsigned X86::getSwappedVPCOMImm(unsigned Imm) {
+ switch (Imm) {
+ default: llvm_unreachable("Unreachable!");
+ case 0x00: Imm = 0x02; break; // LT -> GT
+ case 0x01: Imm = 0x03; break; // LE -> GE
+ case 0x02: Imm = 0x00; break; // GT -> LT
+ case 0x03: Imm = 0x01; break; // GE -> LE
+ case 0x04: // EQ
+ case 0x05: // NE
+ case 0x06: // FALSE
+ case 0x07: // TRUE
+ break;
+ }
+
+ return Imm;
+}
+
bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const {
if (!MI.isTerminator()) return false;
@@ -6125,12 +2514,12 @@ void X86InstrInfo::replaceBranchWithTailCall(
MachineBasicBlock::iterator I = MBB.end();
while (I != MBB.begin()) {
--I;
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
if (!I->isBranch())
assert(0 && "Can't find the branch to replace!");
- X86::CondCode CC = getCondFromBranchOpc(I->getOpcode());
+ X86::CondCode CC = X86::getCondFromBranchOpc(I->getOpcode());
assert(BranchCond.size() == 1);
if (CC != BranchCond[0].getImm())
continue;
@@ -6193,7 +2582,7 @@ bool X86InstrInfo::AnalyzeBranchImpl(
MachineBasicBlock::iterator UnCondBrIter = MBB.end();
while (I != MBB.begin()) {
--I;
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
// Working from the bottom, when we see a non-terminator instruction, we're
@@ -6237,7 +2626,7 @@ bool X86InstrInfo::AnalyzeBranchImpl(
}
// Handle conditional branches.
- X86::CondCode BranchCode = getCondFromBranchOpc(I->getOpcode());
+ X86::CondCode BranchCode = X86::getCondFromBranchOpc(I->getOpcode());
if (BranchCode == X86::COND_INVALID)
return true; // Can't handle indirect branch.
@@ -6430,10 +2819,10 @@ unsigned X86InstrInfo::removeBranch(MachineBasicBlock &MBB,
while (I != MBB.begin()) {
--I;
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
if (I->getOpcode() != X86::JMP_1 &&
- getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
+ X86::getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
break;
// Remove the branch.
I->eraseFromParent();
@@ -6562,7 +2951,7 @@ static bool isHReg(unsigned Reg) {
}
// Try and copy between VR128/VR64 and GR64 registers.
-static unsigned CopyToFromAsymmetricReg(unsigned &DestReg, unsigned &SrcReg,
+static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
const X86Subtarget &Subtarget) {
bool HasAVX = Subtarget.hasAVX();
bool HasAVX512 = Subtarget.hasAVX512();
@@ -6710,109 +3099,30 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
- bool FromEFLAGS = SrcReg == X86::EFLAGS;
- bool ToEFLAGS = DestReg == X86::EFLAGS;
- int Reg = FromEFLAGS ? DestReg : SrcReg;
- bool is32 = X86::GR32RegClass.contains(Reg);
- bool is64 = X86::GR64RegClass.contains(Reg);
-
- if ((FromEFLAGS || ToEFLAGS) && (is32 || is64)) {
- int Mov = is64 ? X86::MOV64rr : X86::MOV32rr;
- int Push = is64 ? X86::PUSH64r : X86::PUSH32r;
- int PushF = is64 ? X86::PUSHF64 : X86::PUSHF32;
- int Pop = is64 ? X86::POP64r : X86::POP32r;
- int PopF = is64 ? X86::POPF64 : X86::POPF32;
- int AX = is64 ? X86::RAX : X86::EAX;
-
- if (!Subtarget.hasLAHFSAHF()) {
- assert(Subtarget.is64Bit() &&
- "Not having LAHF/SAHF only happens on 64-bit.");
- // Moving EFLAGS to / from another register requires a push and a pop.
- // Notice that we have to adjust the stack if we don't want to clobber the
- // first frame index. See X86FrameLowering.cpp - usesTheStack.
- if (FromEFLAGS) {
- BuildMI(MBB, MI, DL, get(PushF));
- BuildMI(MBB, MI, DL, get(Pop), DestReg);
- }
- if (ToEFLAGS) {
- BuildMI(MBB, MI, DL, get(Push))
- .addReg(SrcReg, getKillRegState(KillSrc));
- BuildMI(MBB, MI, DL, get(PopF));
- }
- return;
- }
-
- // The flags need to be saved, but saving EFLAGS with PUSHF/POPF is
- // inefficient. Instead:
- // - Save the overflow flag OF into AL using SETO, and restore it using a
- // signed 8-bit addition of AL and INT8_MAX.
- // - Save/restore the bottom 8 EFLAGS bits (CF, PF, AF, ZF, SF) to/from AH
- // using LAHF/SAHF.
- // - When RAX/EAX is live and isn't the destination register, make sure it
- // isn't clobbered by PUSH/POP'ing it before and after saving/restoring
- // the flags.
- // This approach is ~2.25x faster than using PUSHF/POPF.
- //
- // This is still somewhat inefficient because we don't know which flags are
- // actually live inside EFLAGS. Were we able to do a single SETcc instead of
- // SETO+LAHF / ADDB+SAHF the code could be 1.02x faster.
- //
- // PUSHF/POPF is also potentially incorrect because it affects other flags
- // such as TF/IF/DF, which LLVM doesn't model.
- //
- // Notice that we have to adjust the stack if we don't want to clobber the
- // first frame index.
- // See X86ISelLowering.cpp - X86::hasCopyImplyingStackAdjustment.
-
- const TargetRegisterInfo &TRI = getRegisterInfo();
- MachineBasicBlock::LivenessQueryResult LQR =
- MBB.computeRegisterLiveness(&TRI, AX, MI);
- // We do not want to save and restore AX if we do not have to.
- // Moreover, if we do so whereas AX is dead, we would need to set
- // an undef flag on the use of AX, otherwise the verifier will
- // complain that we read an undef value.
- // We do not want to change the behavior of the machine verifier
- // as this is usually wrong to read an undef value.
- if (MachineBasicBlock::LQR_Unknown == LQR) {
- LivePhysRegs LPR(TRI);
- LPR.addLiveOuts(MBB);
- MachineBasicBlock::iterator I = MBB.end();
- while (I != MI) {
- --I;
- LPR.stepBackward(*I);
- }
- // AX contains the top most register in the aliasing hierarchy.
- // It may not be live, but one of its aliases may be.
- for (MCRegAliasIterator AI(AX, &TRI, true);
- AI.isValid() && LQR != MachineBasicBlock::LQR_Live; ++AI)
- LQR = LPR.contains(*AI) ? MachineBasicBlock::LQR_Live
- : MachineBasicBlock::LQR_Dead;
- }
- bool AXDead = (Reg == AX) || (MachineBasicBlock::LQR_Dead == LQR);
- if (!AXDead)
- BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true));
- if (FromEFLAGS) {
- BuildMI(MBB, MI, DL, get(X86::SETOr), X86::AL);
- BuildMI(MBB, MI, DL, get(X86::LAHF));
- BuildMI(MBB, MI, DL, get(Mov), Reg).addReg(AX);
- }
- if (ToEFLAGS) {
- BuildMI(MBB, MI, DL, get(Mov), AX).addReg(Reg, getKillRegState(KillSrc));
- BuildMI(MBB, MI, DL, get(X86::ADD8ri), X86::AL)
- .addReg(X86::AL)
- .addImm(INT8_MAX);
- BuildMI(MBB, MI, DL, get(X86::SAHF));
- }
- if (!AXDead)
- BuildMI(MBB, MI, DL, get(Pop), AX);
- return;
+ if (SrcReg == X86::EFLAGS || DestReg == X86::EFLAGS) {
+ // FIXME: We use a fatal error here because historically LLVM has tried
+ // lower some of these physreg copies and we want to ensure we get
+ // reasonable bug reports if someone encounters a case no other testing
+ // found. This path should be removed after the LLVM 7 release.
+ report_fatal_error("Unable to copy EFLAGS physical register!");
}
- DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg)
- << " to " << RI.getName(DestReg) << '\n');
+ LLVM_DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) << " to "
+ << RI.getName(DestReg) << '\n');
llvm_unreachable("Cannot emit physreg copy instruction");
}
+bool X86InstrInfo::isCopyInstr(const MachineInstr &MI,
+ const MachineOperand *&Src,
+ const MachineOperand *&Dest) const {
+ if (MI.isMoveReg()) {
+ Dest = &MI.getOperand(0);
+ Src = &MI.getOperand(1);
+ return true;
+ }
+ return false;
+}
+
static unsigned getLoadStoreRegOpcode(unsigned Reg,
const TargetRegisterClass *RC,
bool isStackAligned,
@@ -6847,8 +3157,10 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
(HasAVX512 ? X86::VMOVSSZmr : HasAVX ? X86::VMOVSSmr : X86::MOVSSmr);
if (X86::RFP32RegClass.hasSubClassEq(RC))
return load ? X86::LD_Fp32m : X86::ST_Fp32m;
- if (X86::VK32RegClass.hasSubClassEq(RC))
+ if (X86::VK32RegClass.hasSubClassEq(RC)) {
+ assert(STI.hasBWI() && "KMOVD requires BWI");
return load ? X86::KMOVDkm : X86::KMOVDmk;
+ }
llvm_unreachable("Unknown 4-byte regclass");
case 8:
if (X86::GR64RegClass.hasSubClassEq(RC))
@@ -6861,8 +3173,10 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
if (X86::RFP64RegClass.hasSubClassEq(RC))
return load ? X86::LD_Fp64m : X86::ST_Fp64m;
- if (X86::VK64RegClass.hasSubClassEq(RC))
+ if (X86::VK64RegClass.hasSubClassEq(RC)) {
+ assert(STI.hasBWI() && "KMOVQ requires BWI");
return load ? X86::KMOVQkm : X86::KMOVQmk;
+ }
llvm_unreachable("Unknown 8-byte regclass");
case 10:
assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
@@ -6893,9 +3207,9 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
}
if (X86::BNDRRegClass.hasSubClassEq(RC)) {
if (STI.is64Bit())
- return load ? X86::BNDMOVRM64rm : X86::BNDMOVMR64mr;
+ return load ? X86::BNDMOV64rm : X86::BNDMOV64mr;
else
- return load ? X86::BNDMOVRM32rm : X86::BNDMOVMR32mr;
+ return load ? X86::BNDMOV32rm : X86::BNDMOV32mr;
}
llvm_unreachable("Unknown 16-byte regclass");
}
@@ -7290,6 +3604,13 @@ static X86::CondCode isUseDefConvertible(MachineInstr &MI) {
case X86::TZCNT32rr: case X86::TZCNT32rm:
case X86::TZCNT64rr: case X86::TZCNT64rm:
return X86::COND_B;
+ case X86::BSF16rr:
+ case X86::BSF16rm:
+ case X86::BSF32rr:
+ case X86::BSF32rm:
+ case X86::BSF64rr:
+ case X86::BSF64rm:
+ return X86::COND_E;
}
}
@@ -7465,9 +3786,9 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
if (IsCmpZero || IsSwapped) {
// We decode the condition code from opcode.
if (Instr.isBranch())
- OldCC = getCondFromBranchOpc(Instr.getOpcode());
+ OldCC = X86::getCondFromBranchOpc(Instr.getOpcode());
else {
- OldCC = getCondFromSETOpc(Instr.getOpcode());
+ OldCC = X86::getCondFromSETOpc(Instr.getOpcode());
if (OldCC != X86::COND_INVALID)
OpcIsSET = true;
else
@@ -7841,6 +4162,8 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return Expand2AddrUndef(MIB, get(X86::SBB32rr));
case X86::SETB_C64r:
return Expand2AddrUndef(MIB, get(X86::SBB64rr));
+ case X86::MMX_SET0:
+ return Expand2AddrUndef(MIB, get(X86::MMX_PXORirr));
case X86::V_SET0:
case X86::FsFLD0SS:
case X86::FsFLD0SD:
@@ -7944,9 +4267,6 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case X86::VMOVUPSZ256mr_NOVLX:
return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
- case X86::TEST8ri_NOREX:
- MI.setDesc(get(X86::TEST8ri));
- return true;
case X86::MOV32ri64:
MI.setDesc(get(X86::MOV32ri));
return true;
@@ -7990,7 +4310,8 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
///
/// FIXME: This should be turned into a TSFlags.
///
-static bool hasPartialRegUpdate(unsigned Opcode) {
+static bool hasPartialRegUpdate(unsigned Opcode,
+ const X86Subtarget &Subtarget) {
switch (Opcode) {
case X86::CVTSI2SSrr:
case X86::CVTSI2SSrm:
@@ -8029,17 +4350,32 @@ static bool hasPartialRegUpdate(unsigned Opcode) {
case X86::SQRTSDr_Int:
case X86::SQRTSDm_Int:
return true;
+ // GPR
+ case X86::POPCNT32rm:
+ case X86::POPCNT32rr:
+ case X86::POPCNT64rm:
+ case X86::POPCNT64rr:
+ return Subtarget.hasPOPCNTFalseDeps();
+ case X86::LZCNT32rm:
+ case X86::LZCNT32rr:
+ case X86::LZCNT64rm:
+ case X86::LZCNT64rr:
+ case X86::TZCNT32rm:
+ case X86::TZCNT32rr:
+ case X86::TZCNT64rm:
+ case X86::TZCNT64rr:
+ return Subtarget.hasLZCNTFalseDeps();
}
return false;
}
-/// Inform the ExecutionDepsFix pass how many idle
+/// Inform the BreakFalseDeps pass how many idle
/// instructions we would like before a partial register update.
unsigned X86InstrInfo::getPartialRegUpdateClearance(
const MachineInstr &MI, unsigned OpNum,
const TargetRegisterInfo *TRI) const {
- if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode()))
+ if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode(), Subtarget))
return 0;
// If MI is marked as reading Reg, the partial register update is wanted.
@@ -8161,20 +4497,51 @@ static bool hasUndefRegUpdate(unsigned Opcode) {
case X86::VCVTSS2SDZrrb_Int:
case X86::VCVTSS2SDZrm:
case X86::VCVTSS2SDZrm_Int:
- case X86::VRNDSCALESDr:
- case X86::VRNDSCALESDr_Int:
- case X86::VRNDSCALESDrb_Int:
- case X86::VRNDSCALESDm:
- case X86::VRNDSCALESDm_Int:
- case X86::VRNDSCALESSr:
- case X86::VRNDSCALESSr_Int:
- case X86::VRNDSCALESSrb_Int:
- case X86::VRNDSCALESSm:
- case X86::VRNDSCALESSm_Int:
- case X86::VRCP14SSrr:
- case X86::VRCP14SSrm:
- case X86::VRSQRT14SSrr:
- case X86::VRSQRT14SSrm:
+ case X86::VGETEXPSDZr:
+ case X86::VGETEXPSDZrb:
+ case X86::VGETEXPSDZm:
+ case X86::VGETEXPSSZr:
+ case X86::VGETEXPSSZrb:
+ case X86::VGETEXPSSZm:
+ case X86::VGETMANTSDZrri:
+ case X86::VGETMANTSDZrrib:
+ case X86::VGETMANTSDZrmi:
+ case X86::VGETMANTSSZrri:
+ case X86::VGETMANTSSZrrib:
+ case X86::VGETMANTSSZrmi:
+ case X86::VRNDSCALESDZr:
+ case X86::VRNDSCALESDZr_Int:
+ case X86::VRNDSCALESDZrb_Int:
+ case X86::VRNDSCALESDZm:
+ case X86::VRNDSCALESDZm_Int:
+ case X86::VRNDSCALESSZr:
+ case X86::VRNDSCALESSZr_Int:
+ case X86::VRNDSCALESSZrb_Int:
+ case X86::VRNDSCALESSZm:
+ case X86::VRNDSCALESSZm_Int:
+ case X86::VRCP14SDZrr:
+ case X86::VRCP14SDZrm:
+ case X86::VRCP14SSZrr:
+ case X86::VRCP14SSZrm:
+ case X86::VRCP28SDZr:
+ case X86::VRCP28SDZrb:
+ case X86::VRCP28SDZm:
+ case X86::VRCP28SSZr:
+ case X86::VRCP28SSZrb:
+ case X86::VRCP28SSZm:
+ case X86::VREDUCESSZrmi:
+ case X86::VREDUCESSZrri:
+ case X86::VREDUCESSZrrib:
+ case X86::VRSQRT14SDZrr:
+ case X86::VRSQRT14SDZrm:
+ case X86::VRSQRT14SSZrr:
+ case X86::VRSQRT14SSZrm:
+ case X86::VRSQRT28SDZr:
+ case X86::VRSQRT28SDZrb:
+ case X86::VRSQRT28SDZm:
+ case X86::VRSQRT28SSZr:
+ case X86::VRSQRT28SSZrb:
+ case X86::VRSQRT28SSZm:
case X86::VSQRTSSZr:
case X86::VSQRTSSZr_Int:
case X86::VSQRTSSZrb_Int:
@@ -8191,7 +4558,7 @@ static bool hasUndefRegUpdate(unsigned Opcode) {
return false;
}
-/// Inform the ExecutionDepsFix pass how many idle instructions we would like
+/// Inform the BreakFalseDeps pass how many idle instructions we would like
/// before certain undef register reads.
///
/// This catches the VCVTSI2SD family of instructions:
@@ -8245,6 +4612,20 @@ void X86InstrInfo::breakPartialRegDependency(
.addReg(XReg, RegState::Undef)
.addReg(Reg, RegState::ImplicitDefine);
MI.addRegisterKilled(Reg, TRI, true);
+ } else if (X86::GR64RegClass.contains(Reg)) {
+ // Using XOR32rr because it has a shorter encoding and zeros the upper bits
+ // as well.
+ unsigned XReg = TRI->getSubReg(Reg, X86::sub_32bit);
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), XReg)
+ .addReg(XReg, RegState::Undef)
+ .addReg(XReg, RegState::Undef)
+ .addReg(Reg, RegState::ImplicitDefine);
+ MI.addRegisterKilled(Reg, TRI, true);
+ } else if (X86::GR32RegClass.contains(Reg)) {
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), Reg)
+ .addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef);
+ MI.addRegisterKilled(Reg, TRI, true);
}
}
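As an aside for readers of this hunk (an illustration, not part of the patch): on subtargets that report POPCNT/LZCNT/TZCNT false dependencies, the GR32/GR64 cases added above let the BreakFalseDeps pass plant a recognized zeroing idiom in front of the offending instruction, so the destination no longer depends on its previous writer. Roughly:

// Before (the write to %eax carries a false output dependence):
//   popcntl %ecx, %eax
// After BreakFalseDeps uses the hook above:
//   xorl    %eax, %eax        // dependency-breaking zero idiom
//   popcntl %ecx, %eax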
@@ -8272,6 +4653,32 @@ static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs,
}
}
+static void updateOperandRegConstraints(MachineFunction &MF,
+ MachineInstr &NewMI,
+ const TargetInstrInfo &TII) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+
+ for (int Idx : llvm::seq<int>(0, NewMI.getNumOperands())) {
+ MachineOperand &MO = NewMI.getOperand(Idx);
+ // We only need to update constraints on virtual register operands.
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!TRI.isVirtualRegister(Reg))
+ continue;
+
+ auto *NewRC = MRI.constrainRegClass(
+ Reg, TII.getRegClass(NewMI.getDesc(), Idx, &TRI, MF));
+ if (!NewRC) {
+ LLVM_DEBUG(
+ dbgs() << "WARNING: Unable to update register constraint for operand "
+ << Idx << " of instruction:\n";
+ NewMI.dump(); dbgs() << "\n");
+ }
+ }
+}
+
static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
ArrayRef<MachineOperand> MOs,
MachineBasicBlock::iterator InsertPt,
@@ -8295,6 +4702,8 @@ static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
MIB.add(MO);
}
+ updateOperandRegConstraints(MF, *NewMI, TII);
+
MachineBasicBlock *MBB = InsertPt->getParent();
MBB->insert(InsertPt, NewMI);
@@ -8321,6 +4730,8 @@ static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode,
}
}
+ updateOperandRegConstraints(MF, *NewMI, TII);
+
MachineBasicBlock *MBB = InsertPt->getParent();
MBB->insert(InsertPt, NewMI);
@@ -8396,12 +4807,29 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
return nullptr;
}
+static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF, MachineInstr &MI) {
+ if (MF.getFunction().optForSize() || !hasUndefRegUpdate(MI.getOpcode()) ||
+ !MI.getOperand(1).isReg())
+ return false;
+
+ // There are two cases we need to handle depending on where in the pipeline
+ // the folding attempt is being made.
+ // -Register has the undef flag set.
+ // -Register is produced by the IMPLICIT_DEF instruction.
+
+ if (MI.getOperand(1).isUndef())
+ return true;
+
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ MachineInstr *VRegDef = RegInfo.getUniqueVRegDef(MI.getOperand(1).getReg());
+ return VRegDef && VRegDef->isImplicitDef();
+}
+
+
MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
unsigned Size, unsigned Align, bool AllowCommute) const {
- const DenseMap<unsigned,
- std::pair<uint16_t, uint16_t> > *OpcodeTablePtr = nullptr;
bool isSlowTwoMemOps = Subtarget.slowTwoMemOps();
bool isTwoAddrFold = false;
@@ -8414,9 +4842,10 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
MI.getOpcode() == X86::PUSH64r))
return nullptr;
- // Avoid partial register update stalls unless optimizing for size.
- // TODO: we should block undef reg update as well.
- if (!MF.getFunction().optForSize() && hasPartialRegUpdate(MI.getOpcode()))
+ // Avoid partial and undef register update stalls unless optimizing for size.
+ if (!MF.getFunction().optForSize() &&
+ (hasPartialRegUpdate(MI.getOpcode(), Subtarget) ||
+ shouldPreventUndefRegUpdateMemFold(MF, MI)))
return nullptr;
unsigned NumOps = MI.getDesc().getNumOperands();
@@ -8429,6 +4858,14 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
MI.getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
return nullptr;
+ // GOTTPOFF relocation loads can only be folded into add instructions.
+ // FIXME: Need to exclude other relocations that only support specific
+ // instructions.
+ if (MOs.size() == X86::AddrNumOperands &&
+ MOs[X86::AddrDisp].getTargetFlags() == X86II::MO_GOTTPOFF &&
+ MI.getOpcode() != X86::ADD64rr)
+ return nullptr;
+
MachineInstr *NewMI = nullptr;
// Attempt to fold any custom cases we have.
@@ -8436,79 +4873,70 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt, Size, Align))
return CustomMI;
+ const X86MemoryFoldTableEntry *I = nullptr;
+
// Folding a memory location into the two-address part of a two-address
// instruction is different than folding it other places. It requires
// replacing the *two* registers with the memory location.
if (isTwoAddr && NumOps >= 2 && OpNum < 2 && MI.getOperand(0).isReg() &&
MI.getOperand(1).isReg() &&
MI.getOperand(0).getReg() == MI.getOperand(1).getReg()) {
- OpcodeTablePtr = &RegOp2MemOpTable2Addr;
+ I = lookupTwoAddrFoldTable(MI.getOpcode());
isTwoAddrFold = true;
- } else if (OpNum == 0) {
- if (MI.getOpcode() == X86::MOV32r0) {
- NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, InsertPt, MI);
- if (NewMI)
- return NewMI;
+ } else {
+ if (OpNum == 0) {
+ if (MI.getOpcode() == X86::MOV32r0) {
+ NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, InsertPt, MI);
+ if (NewMI)
+ return NewMI;
+ }
}
- OpcodeTablePtr = &RegOp2MemOpTable0;
- } else if (OpNum == 1) {
- OpcodeTablePtr = &RegOp2MemOpTable1;
- } else if (OpNum == 2) {
- OpcodeTablePtr = &RegOp2MemOpTable2;
- } else if (OpNum == 3) {
- OpcodeTablePtr = &RegOp2MemOpTable3;
- } else if (OpNum == 4) {
- OpcodeTablePtr = &RegOp2MemOpTable4;
- }
-
- // If table selected...
- if (OpcodeTablePtr) {
- // Find the Opcode to fuse
- auto I = OpcodeTablePtr->find(MI.getOpcode());
- if (I != OpcodeTablePtr->end()) {
- unsigned Opcode = I->second.first;
- unsigned MinAlign = (I->second.second & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT;
- if (Align < MinAlign)
- return nullptr;
- bool NarrowToMOV32rm = false;
- if (Size) {
- const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
- const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum,
- &RI, MF);
- unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
- if (Size < RCSize) {
- // Check if it's safe to fold the load. If the size of the object is
- // narrower than the load width, then it's not.
- if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
- return nullptr;
- // If this is a 64-bit load, but the spill slot is 32, then we can do
- // a 32-bit load which is implicitly zero-extended. This likely is
- // due to live interval analysis remat'ing a load from stack slot.
- if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
- return nullptr;
- Opcode = X86::MOV32rm;
- NarrowToMOV32rm = true;
- }
+ I = lookupFoldTable(MI.getOpcode(), OpNum);
+ }
+
+ if (I != nullptr) {
+ unsigned Opcode = I->DstOp;
+ unsigned MinAlign = (I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT;
+ if (Align < MinAlign)
+ return nullptr;
+ bool NarrowToMOV32rm = false;
+ if (Size) {
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum,
+ &RI, MF);
+ unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
+ if (Size < RCSize) {
+ // Check if it's safe to fold the load. If the size of the object is
+ // narrower than the load width, then it's not.
+ if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
+ return nullptr;
+ // If this is a 64-bit load, but the spill slot is 32, then we can do
+ // a 32-bit load which is implicitly zero-extended. This likely is
+ // due to live interval analysis remat'ing a load from stack slot.
+ if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
+ return nullptr;
+ Opcode = X86::MOV32rm;
+ NarrowToMOV32rm = true;
}
+ }
- if (isTwoAddrFold)
- NewMI = FuseTwoAddrInst(MF, Opcode, MOs, InsertPt, MI, *this);
+ if (isTwoAddrFold)
+ NewMI = FuseTwoAddrInst(MF, Opcode, MOs, InsertPt, MI, *this);
+ else
+ NewMI = FuseInst(MF, Opcode, OpNum, MOs, InsertPt, MI, *this);
+
+ if (NarrowToMOV32rm) {
+ // This is the special case where we use a MOV32rm to load a 32-bit
+ // value and zero-extend the top bits; change the destination register
+ // to a 32-bit one.
+ unsigned DstReg = NewMI->getOperand(0).getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+ NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
else
- NewMI = FuseInst(MF, Opcode, OpNum, MOs, InsertPt, MI, *this);
-
- if (NarrowToMOV32rm) {
- // If this is the special case where we use a MOV32rm to load a 32-bit
- // value and zero-extend the top bits. Change the destination register
- // to a 32-bit one.
- unsigned DstReg = NewMI->getOperand(0).getReg();
- if (TargetRegisterInfo::isPhysicalRegister(DstReg))
- NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
- else
- NewMI->getOperand(0).setSubReg(X86::sub_32bit);
- }
- return NewMI;
+ NewMI->getOperand(0).setSubReg(X86::sub_32bit);
}
+ return NewMI;
}
// If the instruction and target operand are commutable, commute the
@@ -8582,10 +5010,10 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
if (NoFusing)
return nullptr;
- // Unless optimizing for size, don't fold to avoid partial
- // register update stalls
- // TODO: we should block undef reg update as well.
- if (!MF.getFunction().optForSize() && hasPartialRegUpdate(MI.getOpcode()))
+ // Avoid partial and undef register update stalls unless optimizing for size.
+ if (!MF.getFunction().optForSize() &&
+ (hasPartialRegUpdate(MI.getOpcode(), Subtarget) ||
+ shouldPreventUndefRegUpdateMemFold(MF, MI)))
return nullptr;
// Don't fold subreg spills, or reloads that use a high subreg.
@@ -8782,9 +5210,10 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
// Check switch flag
if (NoFusing) return nullptr;
- // Avoid partial register update stalls unless optimizing for size.
- // TODO: we should block undef reg update as well.
- if (!MF.getFunction().optForSize() && hasPartialRegUpdate(MI.getOpcode()))
+ // Avoid partial and undef register update stalls unless optimizing for size.
+ if (!MF.getFunction().optForSize() &&
+ (hasPartialRegUpdate(MI.getOpcode(), Subtarget) ||
+ shouldPreventUndefRegUpdateMemFold(MF, MI)))
return nullptr;
// Determine the alignment of the load.
@@ -8808,6 +5237,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
case X86::AVX512_128_SET0:
Alignment = 16;
break;
+ case X86::MMX_SET0:
case X86::FsFLD0SD:
case X86::AVX512_FsFLD0SD:
Alignment = 8;
@@ -8841,6 +5271,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
SmallVector<MachineOperand,X86::AddrNumOperands> MOs;
switch (LoadMI.getOpcode()) {
+ case X86::MMX_SET0:
case X86::V_SET0:
case X86::V_SETALLONES:
case X86::AVX2_SETALLONES:
@@ -8888,6 +5319,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 ||
Opc == X86::AVX512_256_SET0 || Opc == X86::AVX1_SETALLONES)
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 8);
+ else if (Opc == X86::MMX_SET0)
+ Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 2);
else
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 4);
@@ -8923,13 +5356,13 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
bool X86InstrInfo::unfoldMemoryOperand(
MachineFunction &MF, MachineInstr &MI, unsigned Reg, bool UnfoldLoad,
bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const {
- auto I = MemOp2RegOpTable.find(MI.getOpcode());
- if (I == MemOp2RegOpTable.end())
+ const X86MemoryFoldTableEntry *I = lookupUnfoldTable(MI.getOpcode());
+ if (I == nullptr)
return false;
- unsigned Opc = I->second.first;
- unsigned Index = I->second.second & TB_INDEX_MASK;
- bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
- bool FoldedStore = I->second.second & TB_FOLDED_STORE;
+ unsigned Opc = I->DstOp;
+ unsigned Index = I->Flags & TB_INDEX_MASK;
+ bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
+ bool FoldedStore = I->Flags & TB_FOLDED_STORE;
if (UnfoldLoad && !FoldedLoad)
return false;
UnfoldLoad &= FoldedLoad;
@@ -9045,13 +5478,13 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
if (!N->isMachineOpcode())
return false;
- auto I = MemOp2RegOpTable.find(N->getMachineOpcode());
- if (I == MemOp2RegOpTable.end())
+ const X86MemoryFoldTableEntry *I = lookupUnfoldTable(N->getMachineOpcode());
+ if (I == nullptr)
return false;
- unsigned Opc = I->second.first;
- unsigned Index = I->second.second & TB_INDEX_MASK;
- bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
- bool FoldedStore = I->second.second & TB_FOLDED_STORE;
+ unsigned Opc = I->DstOp;
+ unsigned Index = I->Flags & TB_INDEX_MASK;
+ bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
+ bool FoldedStore = I->Flags & TB_FOLDED_STORE;
const MCInstrDesc &MCID = get(Opc);
MachineFunction &MF = DAG.getMachineFunction();
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
@@ -9115,6 +5548,30 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
if (Load)
BeforeOps.push_back(SDValue(Load, 0));
BeforeOps.insert(BeforeOps.end(), AfterOps.begin(), AfterOps.end());
+ // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
+ switch (Opc) {
+ default: break;
+ case X86::CMP64ri32:
+ case X86::CMP64ri8:
+ case X86::CMP32ri:
+ case X86::CMP32ri8:
+ case X86::CMP16ri:
+ case X86::CMP16ri8:
+ case X86::CMP8ri:
+ if (isNullConstant(BeforeOps[1])) {
+ switch (Opc) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::CMP64ri8:
+ case X86::CMP64ri32: Opc = X86::TEST64rr; break;
+ case X86::CMP32ri8:
+ case X86::CMP32ri: Opc = X86::TEST32rr; break;
+ case X86::CMP16ri8:
+ case X86::CMP16ri: Opc = X86::TEST16rr; break;
+ case X86::CMP8ri: Opc = X86::TEST8rr; break;
+ }
+ BeforeOps[1] = BeforeOps[0];
+ }
+ }
SDNode *NewNode= DAG.getMachineNode(Opc, dl, VTs, BeforeOps);
NewNodes.push_back(NewNode);
@@ -9152,18 +5609,18 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
unsigned X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
bool UnfoldLoad, bool UnfoldStore,
unsigned *LoadRegIndex) const {
- auto I = MemOp2RegOpTable.find(Opc);
- if (I == MemOp2RegOpTable.end())
+ const X86MemoryFoldTableEntry *I = lookupUnfoldTable(Opc);
+ if (I == nullptr)
return 0;
- bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
- bool FoldedStore = I->second.second & TB_FOLDED_STORE;
+ bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
+ bool FoldedStore = I->Flags & TB_FOLDED_STORE;
if (UnfoldLoad && !FoldedLoad)
return 0;
if (UnfoldStore && !FoldedStore)
return 0;
if (LoadRegIndex)
- *LoadRegIndex = I->second.second & TB_INDEX_MASK;
- return I->second.first;
+ *LoadRegIndex = I->Flags & TB_INDEX_MASK;
+ return I->DstOp;
}
bool
@@ -9413,8 +5870,9 @@ bool X86InstrInfo::
isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
// FIXME: Return false for x87 stack register classes for now. We can't
// allow any loads of these registers before FpGet_ST0_80.
- return !(RC == &X86::CCRRegClass || RC == &X86::RFP32RegClass ||
- RC == &X86::RFP64RegClass || RC == &X86::RFP80RegClass);
+ return !(RC == &X86::CCRRegClass || RC == &X86::DFCCRRegClass ||
+ RC == &X86::RFP32RegClass || RC == &X86::RFP64RegClass ||
+ RC == &X86::RFP80RegClass);
}
/// Return a virtual register initialized with the
@@ -9424,7 +5882,9 @@ isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
/// TODO: Eliminate this and move the code to X86MachineFunctionInfo.
///
unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
- assert(!Subtarget.is64Bit() &&
+ assert((!Subtarget.is64Bit() ||
+ MF->getTarget().getCodeModel() == CodeModel::Medium ||
+ MF->getTarget().getCodeModel() == CodeModel::Large) &&
"X86-64 PIC uses RIP relative addressing");
X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
@@ -9435,7 +5895,8 @@ unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
// Create the register. The code to initialize it is inserted
// later, by the CGBR pass (below).
MachineRegisterInfo &RegInfo = MF->getRegInfo();
- GlobalBaseReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+ GlobalBaseReg = RegInfo.createVirtualRegister(
+ Subtarget.is64Bit() ? &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass);
X86FI->setGlobalBaseReg(GlobalBaseReg);
return GlobalBaseReg;
}
@@ -9625,8 +6086,6 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = {
{ X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrr, X86::VPBROADCASTQYrr},
{ X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm},
{ X86::VBROADCASTF128, X86::VBROADCASTF128, X86::VBROADCASTI128 },
- { X86::VBLENDPSrri, X86::VBLENDPSrri, X86::VPBLENDDrri },
- { X86::VBLENDPSrmi, X86::VBLENDPSrmi, X86::VPBLENDDrmi },
{ X86::VBLENDPSYrri, X86::VBLENDPSYrri, X86::VPBLENDDYrri },
{ X86::VBLENDPSYrmi, X86::VBLENDPSYrmi, X86::VPBLENDDYrmi },
{ X86::VPERMILPSYmi, X86::VPERMILPSYmi, X86::VPSHUFDYmi },
@@ -9880,6 +6339,47 @@ static const uint16_t ReplaceableInstrsAVX512DQMasked[][4] = {
X86::VPXORQZrmbkz, X86::VPXORDZrmbkz },
};
+// NOTE: These should only be used by the custom domain methods.
+static const uint16_t ReplaceableCustomInstrs[][3] = {
+ //PackedSingle PackedDouble PackedInt
+ { X86::BLENDPSrmi, X86::BLENDPDrmi, X86::PBLENDWrmi },
+ { X86::BLENDPSrri, X86::BLENDPDrri, X86::PBLENDWrri },
+ { X86::VBLENDPSrmi, X86::VBLENDPDrmi, X86::VPBLENDWrmi },
+ { X86::VBLENDPSrri, X86::VBLENDPDrri, X86::VPBLENDWrri },
+ { X86::VBLENDPSYrmi, X86::VBLENDPDYrmi, X86::VPBLENDWYrmi },
+ { X86::VBLENDPSYrri, X86::VBLENDPDYrri, X86::VPBLENDWYrri },
+};
+static const uint16_t ReplaceableCustomAVX2Instrs[][3] = {
+ //PackedSingle PackedDouble PackedInt
+ { X86::VBLENDPSrmi, X86::VBLENDPDrmi, X86::VPBLENDDrmi },
+ { X86::VBLENDPSrri, X86::VBLENDPDrri, X86::VPBLENDDrri },
+ { X86::VBLENDPSYrmi, X86::VBLENDPDYrmi, X86::VPBLENDDYrmi },
+ { X86::VBLENDPSYrri, X86::VBLENDPDYrri, X86::VPBLENDDYrri },
+};
+
+// Special table for changing EVEX logic instructions to VEX.
+// TODO: Should we run EVEX->VEX earlier?
+static const uint16_t ReplaceableCustomAVX512LogicInstrs[][4] = {
+ // Two integer columns for 64-bit and 32-bit elements.
+ //PackedSingle PackedDouble PackedInt PackedInt
+ { X86::VANDNPSrm, X86::VANDNPDrm, X86::VPANDNQZ128rm, X86::VPANDNDZ128rm },
+ { X86::VANDNPSrr, X86::VANDNPDrr, X86::VPANDNQZ128rr, X86::VPANDNDZ128rr },
+ { X86::VANDPSrm, X86::VANDPDrm, X86::VPANDQZ128rm, X86::VPANDDZ128rm },
+ { X86::VANDPSrr, X86::VANDPDrr, X86::VPANDQZ128rr, X86::VPANDDZ128rr },
+ { X86::VORPSrm, X86::VORPDrm, X86::VPORQZ128rm, X86::VPORDZ128rm },
+ { X86::VORPSrr, X86::VORPDrr, X86::VPORQZ128rr, X86::VPORDZ128rr },
+ { X86::VXORPSrm, X86::VXORPDrm, X86::VPXORQZ128rm, X86::VPXORDZ128rm },
+ { X86::VXORPSrr, X86::VXORPDrr, X86::VPXORQZ128rr, X86::VPXORDZ128rr },
+ { X86::VANDNPSYrm, X86::VANDNPDYrm, X86::VPANDNQZ256rm, X86::VPANDNDZ256rm },
+ { X86::VANDNPSYrr, X86::VANDNPDYrr, X86::VPANDNQZ256rr, X86::VPANDNDZ256rr },
+ { X86::VANDPSYrm, X86::VANDPDYrm, X86::VPANDQZ256rm, X86::VPANDDZ256rm },
+ { X86::VANDPSYrr, X86::VANDPDYrr, X86::VPANDQZ256rr, X86::VPANDDZ256rr },
+ { X86::VORPSYrm, X86::VORPDYrm, X86::VPORQZ256rm, X86::VPORDZ256rm },
+ { X86::VORPSYrr, X86::VORPDYrr, X86::VPORQZ256rr, X86::VPORDZ256rr },
+ { X86::VXORPSYrm, X86::VXORPDYrm, X86::VPXORQZ256rm, X86::VPXORDZ256rm },
+ { X86::VXORPSYrr, X86::VXORPDYrr, X86::VPXORQZ256rr, X86::VPXORDZ256rr },
+};
+
// FIXME: Some shuffle and unpack instructions have equivalents in different
// domains, but they require a bit more work than just switching opcodes.
@@ -9900,13 +6400,239 @@ static const uint16_t *lookupAVX512(unsigned opcode, unsigned domain,
return nullptr;
}
+// Helper to attempt to widen/narrow blend masks.
+static bool AdjustBlendMask(unsigned OldMask, unsigned OldWidth,
+ unsigned NewWidth, unsigned *pNewMask = nullptr) {
+ assert(((OldWidth % NewWidth) == 0 || (NewWidth % OldWidth) == 0) &&
+ "Illegal blend mask scale");
+ unsigned NewMask = 0;
+
+ if ((OldWidth % NewWidth) == 0) {
+ unsigned Scale = OldWidth / NewWidth;
+ unsigned SubMask = (1u << Scale) - 1;
+ for (unsigned i = 0; i != NewWidth; ++i) {
+ unsigned Sub = (OldMask >> (i * Scale)) & SubMask;
+ if (Sub == SubMask)
+ NewMask |= (1u << i);
+ else if (Sub != 0x0)
+ return false;
+ }
+ } else {
+ unsigned Scale = NewWidth / OldWidth;
+ unsigned SubMask = (1u << Scale) - 1;
+ for (unsigned i = 0; i != OldWidth; ++i) {
+ if (OldMask & (1 << i)) {
+ NewMask |= (SubMask << (i * Scale));
+ }
+ }
+ }
+
+ if (pNewMask)
+ *pNewMask = NewMask;
+ return true;
+}
+
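For reference, a minimal standalone sketch (not part of the patch) of the mask-scaling rule that AdjustBlendMask implements, with a worked example; the helper name and the test values below are illustrative only:

#include <cassert>

// Narrow or widen an immediate blend mask between lane counts. Narrowing
// succeeds only when every group of source lanes is all-ones or all-zeros.
static bool scaleBlendMask(unsigned OldMask, unsigned OldWidth,
                           unsigned NewWidth, unsigned &NewMask) {
  NewMask = 0;
  if (OldWidth % NewWidth == 0) {
    unsigned Scale = OldWidth / NewWidth, Sub = (1u << Scale) - 1;
    for (unsigned i = 0; i != NewWidth; ++i) {
      unsigned Bits = (OldMask >> (i * Scale)) & Sub;
      if (Bits == Sub)
        NewMask |= 1u << i;
      else if (Bits != 0)
        return false;           // lanes within one group disagree
    }
  } else {
    unsigned Scale = NewWidth / OldWidth, Sub = (1u << Scale) - 1;
    for (unsigned i = 0; i != OldWidth; ++i)
      if (OldMask & (1u << i))
        NewMask |= Sub << (i * Scale);
  }
  return true;
}

int main() {
  unsigned M;
  // An 8-lane PBLENDW-style mask 0b11001100 narrows to the 4-lane mask 0b1010.
  assert(scaleBlendMask(0xCC, 8, 4, M) && M == 0xA);
  // A 2-lane BLENDPD-style mask 0b01 widens to the 4-lane mask 0b0011.
  assert(scaleBlendMask(0x1, 2, 4, M) && M == 0x3);
  // 0b01 cannot be narrowed 2 -> 1: the two lanes disagree.
  assert(!scaleBlendMask(0x1, 2, 1, M));
  return 0;
}

Narrowing fails exactly when a group of lanes is only partially selected, which is why GetBlendDomains below leaves that domain out of the valid set in such cases.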
+uint16_t X86InstrInfo::getExecutionDomainCustom(const MachineInstr &MI) const {
+ unsigned Opcode = MI.getOpcode();
+ unsigned NumOperands = MI.getDesc().getNumOperands();
+
+ auto GetBlendDomains = [&](unsigned ImmWidth, bool Is256) {
+ uint16_t validDomains = 0;
+ if (MI.getOperand(NumOperands - 1).isImm()) {
+ unsigned Imm = MI.getOperand(NumOperands - 1).getImm();
+ if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4))
+ validDomains |= 0x2; // PackedSingle
+ if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 4 : 2))
+ validDomains |= 0x4; // PackedDouble
+ if (!Is256 || Subtarget.hasAVX2())
+ validDomains |= 0x8; // PackedInt
+ }
+ return validDomains;
+ };
+
+ switch (Opcode) {
+ case X86::BLENDPDrmi:
+ case X86::BLENDPDrri:
+ case X86::VBLENDPDrmi:
+ case X86::VBLENDPDrri:
+ return GetBlendDomains(2, false);
+ case X86::VBLENDPDYrmi:
+ case X86::VBLENDPDYrri:
+ return GetBlendDomains(4, true);
+ case X86::BLENDPSrmi:
+ case X86::BLENDPSrri:
+ case X86::VBLENDPSrmi:
+ case X86::VBLENDPSrri:
+ case X86::VPBLENDDrmi:
+ case X86::VPBLENDDrri:
+ return GetBlendDomains(4, false);
+ case X86::VBLENDPSYrmi:
+ case X86::VBLENDPSYrri:
+ case X86::VPBLENDDYrmi:
+ case X86::VPBLENDDYrri:
+ return GetBlendDomains(8, true);
+ case X86::PBLENDWrmi:
+ case X86::PBLENDWrri:
+ case X86::VPBLENDWrmi:
+ case X86::VPBLENDWrri:
+ // Treat VPBLENDWY as a 128-bit vector as it repeats the lo/hi masks.
+ case X86::VPBLENDWYrmi:
+ case X86::VPBLENDWYrri:
+ return GetBlendDomains(8, false);
+ case X86::VPANDDZ128rr: case X86::VPANDDZ128rm:
+ case X86::VPANDDZ256rr: case X86::VPANDDZ256rm:
+ case X86::VPANDQZ128rr: case X86::VPANDQZ128rm:
+ case X86::VPANDQZ256rr: case X86::VPANDQZ256rm:
+ case X86::VPANDNDZ128rr: case X86::VPANDNDZ128rm:
+ case X86::VPANDNDZ256rr: case X86::VPANDNDZ256rm:
+ case X86::VPANDNQZ128rr: case X86::VPANDNQZ128rm:
+ case X86::VPANDNQZ256rr: case X86::VPANDNQZ256rm:
+ case X86::VPORDZ128rr: case X86::VPORDZ128rm:
+ case X86::VPORDZ256rr: case X86::VPORDZ256rm:
+ case X86::VPORQZ128rr: case X86::VPORQZ128rm:
+ case X86::VPORQZ256rr: case X86::VPORQZ256rm:
+ case X86::VPXORDZ128rr: case X86::VPXORDZ128rm:
+ case X86::VPXORDZ256rr: case X86::VPXORDZ256rm:
+ case X86::VPXORQZ128rr: case X86::VPXORQZ128rm:
+ case X86::VPXORQZ256rr: case X86::VPXORQZ256rm:
+ // If we don't have DQI, see if we can still switch from an EVEX integer
+ // instruction to a VEX floating point instruction.
+ if (Subtarget.hasDQI())
+ return 0;
+
+ if (RI.getEncodingValue(MI.getOperand(0).getReg()) >= 16)
+ return 0;
+ if (RI.getEncodingValue(MI.getOperand(1).getReg()) >= 16)
+ return 0;
+ // Register forms will have 3 operands. Memory form will have more.
+ if (NumOperands == 3 &&
+ RI.getEncodingValue(MI.getOperand(2).getReg()) >= 16)
+ return 0;
+
+ // All domains are valid.
+ return 0xe;
+ }
+ return 0;
+}
+
+bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI,
+ unsigned Domain) const {
+ assert(Domain > 0 && Domain < 4 && "Invalid execution domain");
+ uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
+ assert(dom && "Not an SSE instruction");
+
+ unsigned Opcode = MI.getOpcode();
+ unsigned NumOperands = MI.getDesc().getNumOperands();
+
+ auto SetBlendDomain = [&](unsigned ImmWidth, bool Is256) {
+ if (MI.getOperand(NumOperands - 1).isImm()) {
+ unsigned Imm = MI.getOperand(NumOperands - 1).getImm() & 255;
+ Imm = (ImmWidth == 16 ? ((Imm << 8) | Imm) : Imm);
+ unsigned NewImm = Imm;
+
+ const uint16_t *table = lookup(Opcode, dom, ReplaceableCustomInstrs);
+ if (!table)
+ table = lookup(Opcode, dom, ReplaceableCustomAVX2Instrs);
+
+ if (Domain == 1) { // PackedSingle
+ AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
+ } else if (Domain == 2) { // PackedDouble
+ AdjustBlendMask(Imm, ImmWidth, Is256 ? 4 : 2, &NewImm);
+ } else if (Domain == 3) { // PackedInt
+ if (Subtarget.hasAVX2()) {
+ // If we are already VPBLENDW, use that, else use VPBLENDD.
+ if ((ImmWidth / (Is256 ? 2 : 1)) != 8) {
+ table = lookup(Opcode, dom, ReplaceableCustomAVX2Instrs);
+ AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
+ }
+ } else {
+ assert(!Is256 && "128-bit vector expected");
+ AdjustBlendMask(Imm, ImmWidth, 8, &NewImm);
+ }
+ }
+
+ assert(table && table[Domain - 1] && "Unknown domain op");
+ MI.setDesc(get(table[Domain - 1]));
+ MI.getOperand(NumOperands - 1).setImm(NewImm & 255);
+ }
+ return true;
+ };
+
+ switch (Opcode) {
+ case X86::BLENDPDrmi:
+ case X86::BLENDPDrri:
+ case X86::VBLENDPDrmi:
+ case X86::VBLENDPDrri:
+ return SetBlendDomain(2, false);
+ case X86::VBLENDPDYrmi:
+ case X86::VBLENDPDYrri:
+ return SetBlendDomain(4, true);
+ case X86::BLENDPSrmi:
+ case X86::BLENDPSrri:
+ case X86::VBLENDPSrmi:
+ case X86::VBLENDPSrri:
+ case X86::VPBLENDDrmi:
+ case X86::VPBLENDDrri:
+ return SetBlendDomain(4, false);
+ case X86::VBLENDPSYrmi:
+ case X86::VBLENDPSYrri:
+ case X86::VPBLENDDYrmi:
+ case X86::VPBLENDDYrri:
+ return SetBlendDomain(8, true);
+ case X86::PBLENDWrmi:
+ case X86::PBLENDWrri:
+ case X86::VPBLENDWrmi:
+ case X86::VPBLENDWrri:
+ return SetBlendDomain(8, false);
+ case X86::VPBLENDWYrmi:
+ case X86::VPBLENDWYrri:
+ return SetBlendDomain(16, true);
+ case X86::VPANDDZ128rr: case X86::VPANDDZ128rm:
+ case X86::VPANDDZ256rr: case X86::VPANDDZ256rm:
+ case X86::VPANDQZ128rr: case X86::VPANDQZ128rm:
+ case X86::VPANDQZ256rr: case X86::VPANDQZ256rm:
+ case X86::VPANDNDZ128rr: case X86::VPANDNDZ128rm:
+ case X86::VPANDNDZ256rr: case X86::VPANDNDZ256rm:
+ case X86::VPANDNQZ128rr: case X86::VPANDNQZ128rm:
+ case X86::VPANDNQZ256rr: case X86::VPANDNQZ256rm:
+ case X86::VPORDZ128rr: case X86::VPORDZ128rm:
+ case X86::VPORDZ256rr: case X86::VPORDZ256rm:
+ case X86::VPORQZ128rr: case X86::VPORQZ128rm:
+ case X86::VPORQZ256rr: case X86::VPORQZ256rm:
+ case X86::VPXORDZ128rr: case X86::VPXORDZ128rm:
+ case X86::VPXORDZ256rr: case X86::VPXORDZ256rm:
+ case X86::VPXORQZ128rr: case X86::VPXORQZ128rm:
+ case X86::VPXORQZ256rr: case X86::VPXORQZ256rm: {
+ // Without DQI, convert EVEX instructions to VEX instructions.
+ if (Subtarget.hasDQI())
+ return false;
+
+ const uint16_t *table = lookupAVX512(MI.getOpcode(), dom,
+ ReplaceableCustomAVX512LogicInstrs);
+ assert(table && "Instruction not found in table?");
+ // Don't change integer Q instructions to D instructions and
+ // use D instructions if we started with a PS instruction.
+ if (Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
+ Domain = 4;
+ MI.setDesc(get(table[Domain - 1]));
+ return true;
+ }
+ }
+ return false;
+}
+
std::pair<uint16_t, uint16_t>
X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
uint16_t domain = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
unsigned opcode = MI.getOpcode();
uint16_t validDomains = 0;
if (domain) {
- if (lookup(MI.getOpcode(), domain, ReplaceableInstrs)) {
+ // Attempt to match for custom instructions.
+ validDomains = getExecutionDomainCustom(MI);
+ if (validDomains)
+ return std::make_pair(domain, validDomains);
+
+ if (lookup(opcode, domain, ReplaceableInstrs)) {
validDomains = 0xe;
} else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) {
validDomains = Subtarget.hasAVX2() ? 0xe : 0x6;
@@ -9938,6 +6664,11 @@ void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
assert(Domain>0 && Domain<4 && "Invalid execution domain");
uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
assert(dom && "Not an SSE instruction");
+
+ // Attempt to match for custom instructions.
+ if (setExecutionDomainCustom(MI, Domain))
+ return;
+
const uint16_t *table = lookup(MI.getOpcode(), dom, ReplaceableInstrs);
if (!table) { // try the other table
assert((Subtarget.hasAVX2() || Domain < 3) &&
@@ -10624,9 +7355,10 @@ namespace {
static_cast<const X86TargetMachine *>(&MF.getTarget());
const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
- // Don't do anything if this is 64-bit as 64-bit PIC
- // uses RIP relative addressing.
- if (STI.is64Bit())
+ // Don't do anything in the 64-bit small and kernel code models. They use
+ // RIP-relative addressing for everything.
+ if (STI.is64Bit() && (TM->getCodeModel() == CodeModel::Small ||
+ TM->getCodeModel() == CodeModel::Kernel))
return false;
// Only emit a global base reg in PIC mode.
@@ -10653,17 +7385,41 @@ namespace {
else
PC = GlobalBaseReg;
- // Operand of MovePCtoStack is completely ignored by asm printer. It's
- // only used in JIT code emission as displacement to pc.
- BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0);
-
- // If we're using vanilla 'GOT' PIC style, we should use relative addressing
- // not to pc, but to _GLOBAL_OFFSET_TABLE_ external.
- if (STI.isPICStyleGOT()) {
- // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel], %some_register
- BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
- .addReg(PC).addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
- X86II::MO_GOT_ABSOLUTE_ADDRESS);
+ if (STI.is64Bit()) {
+ if (TM->getCodeModel() == CodeModel::Medium) {
+ // In the medium code model, use a RIP-relative LEA to materialize the
+ // GOT.
+ BuildMI(FirstMBB, MBBI, DL, TII->get(X86::LEA64r), PC)
+ .addReg(X86::RIP)
+ .addImm(0)
+ .addReg(0)
+ .addExternalSymbol("_GLOBAL_OFFSET_TABLE_")
+ .addReg(0);
+ } else if (TM->getCodeModel() == CodeModel::Large) {
+ // Loading the GOT in the large code model requires math with labels,
+ // so we use a pseudo instruction and expand it during MC emission.
+ unsigned Scratch = RegInfo.createVirtualRegister(&X86::GR64RegClass);
+ BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVGOT64r), PC)
+ .addReg(Scratch, RegState::Undef | RegState::Define)
+ .addExternalSymbol("_GLOBAL_OFFSET_TABLE_");
+ } else {
+ llvm_unreachable("unexpected code model");
+ }
+ } else {
+ // Operand of MovePCtoStack is completely ignored by asm printer. It's
+ // only used in JIT code emission as displacement to pc.
+ BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0);
+
+ // If we're using vanilla 'GOT' PIC style, we should use relative
+ // addressing not to pc, but to _GLOBAL_OFFSET_TABLE_ external.
+ if (STI.isPICStyleGOT()) {
+ // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel],
+ // %some_register
+ BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
+ .addReg(PC)
+ .addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
+ X86II::MO_GOT_ABSOLUTE_ADDRESS);
+ }
}
return true;
@@ -10832,21 +7588,36 @@ enum MachineOutlinerClass {
MachineOutlinerTailCall
};
-X86GenInstrInfo::MachineOutlinerInfo
-X86InstrInfo::getOutlininingCandidateInfo(
- std::vector<
- std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
- &RepeatedSequenceLocs) const {
-
- if (RepeatedSequenceLocs[0].second->isTerminator())
- return MachineOutlinerInfo(1, // Number of instructions to emit call.
- 0, // Number of instructions to emit frame.
- MachineOutlinerTailCall, // Type of call.
- MachineOutlinerTailCall // Type of frame.
- );
-
- return MachineOutlinerInfo(1, 1, MachineOutlinerDefault,
- MachineOutlinerDefault);
+outliner::OutlinedFunction X86InstrInfo::getOutliningCandidateInfo(
+ std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
+ unsigned SequenceSize =
+ std::accumulate(RepeatedSequenceLocs[0].front(),
+ std::next(RepeatedSequenceLocs[0].back()), 0,
+ [](unsigned Sum, const MachineInstr &MI) {
+ // FIXME: x86 doesn't implement getInstSizeInBytes, so
+ // we can't tell the cost. Just assume each instruction
+ // is one byte.
+ if (MI.isDebugInstr() || MI.isKill())
+ return Sum;
+ return Sum + 1;
+ });
+
+ // FIXME: Use real size in bytes for call and ret instructions.
+ if (RepeatedSequenceLocs[0].back()->isTerminator()) {
+ for (outliner::Candidate &C : RepeatedSequenceLocs)
+ C.setCallInfo(MachineOutlinerTailCall, 1);
+
+ return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
+ 0, // Number of bytes to emit frame.
+ MachineOutlinerTailCall // Type of frame.
+ );
+ }
+
+ for (outliner::Candidate &C : RepeatedSequenceLocs)
+ C.setCallInfo(MachineOutlinerDefault, 1);
+
+ return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize, 1,
+ MachineOutlinerDefault);
}
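A small self-contained sketch (assumed names, not from the patch) of the accumulate-with-lambda idiom used above to estimate the sequence size, where debug-style entries contribute nothing and everything else is assumed to cost one byte:

#include <cassert>
#include <numeric>
#include <vector>

struct ToyInstr {
  bool IsDebugOrKill; // stand-in for MI.isDebugInstr() || MI.isKill()
};

static unsigned sequenceSize(const std::vector<ToyInstr> &Seq) {
  return std::accumulate(Seq.begin(), Seq.end(), 0u,
                         [](unsigned Sum, const ToyInstr &I) {
                           // Debug/kill markers are free; assume one byte each
                           // for the rest, mirroring the FIXME above.
                           return I.IsDebugOrKill ? Sum : Sum + 1;
                         });
}

int main() {
  std::vector<ToyInstr> Seq = {{false}, {true}, {false}, {false}};
  assert(sequenceSize(Seq) == 3);
  return 0;
}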
bool X86InstrInfo::isFunctionSafeToOutlineFrom(MachineFunction &MF,
@@ -10855,8 +7626,12 @@ bool X86InstrInfo::isFunctionSafeToOutlineFrom(MachineFunction &MF,
// Does the function use a red zone? If it does, then we can't risk messing
// with the stack.
- if (!F.hasFnAttribute(Attribute::NoRedZone))
+ if (!F.hasFnAttribute(Attribute::NoRedZone)) {
+ // It could have a red zone. If it does, then we don't want to touch it.
+ const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ if (!X86FI || X86FI->getUsesRedZone())
return false;
+ }
// If we *don't* want to outline from things that could potentially be deduped
// then return false.
@@ -10867,26 +7642,31 @@ bool X86InstrInfo::isFunctionSafeToOutlineFrom(MachineFunction &MF,
return true;
}
-X86GenInstrInfo::MachineOutlinerInstrType
-X86InstrInfo::getOutliningType(MachineInstr &MI) const {
-
+outliner::InstrType
+X86InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT, unsigned Flags) const {
+ MachineInstr &MI = *MIT;
// Don't allow debug values to impact outlining type.
- if (MI.isDebugValue() || MI.isIndirectDebugValue())
- return MachineOutlinerInstrType::Invisible;
+ if (MI.isDebugInstr() || MI.isIndirectDebugValue())
+ return outliner::InstrType::Invisible;
+
+ // At this point, KILL instructions don't really tell us much so we can go
+ // ahead and skip over them.
+ if (MI.isKill())
+ return outliner::InstrType::Invisible;
// Is this a tail call? If yes, we can outline as a tail call.
if (isTailCall(MI))
- return MachineOutlinerInstrType::Legal;
+ return outliner::InstrType::Legal;
// Is this the terminator of a basic block?
if (MI.isTerminator() || MI.isReturn()) {
// Does its parent have any successors in its MachineFunction?
if (MI.getParent()->succ_empty())
- return MachineOutlinerInstrType::Legal;
+ return outliner::InstrType::Legal;
// It does, so we can't tail call it.
- return MachineOutlinerInstrType::Illegal;
+ return outliner::InstrType::Illegal;
}
// Don't outline anything that modifies or reads from the stack pointer.
@@ -10901,33 +7681,33 @@ X86InstrInfo::getOutliningType(MachineInstr &MI) const {
if (MI.modifiesRegister(X86::RSP, &RI) || MI.readsRegister(X86::RSP, &RI) ||
MI.getDesc().hasImplicitUseOfPhysReg(X86::RSP) ||
MI.getDesc().hasImplicitDefOfPhysReg(X86::RSP))
- return MachineOutlinerInstrType::Illegal;
+ return outliner::InstrType::Illegal;
// Outlined calls change the instruction pointer, so don't read from it.
if (MI.readsRegister(X86::RIP, &RI) ||
MI.getDesc().hasImplicitUseOfPhysReg(X86::RIP) ||
MI.getDesc().hasImplicitDefOfPhysReg(X86::RIP))
- return MachineOutlinerInstrType::Illegal;
+ return outliner::InstrType::Illegal;
// Positions can't safely be outlined.
if (MI.isPosition())
- return MachineOutlinerInstrType::Illegal;
+ return outliner::InstrType::Illegal;
// Make sure none of the operands of this instruction do anything tricky.
for (const MachineOperand &MOP : MI.operands())
if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
MOP.isTargetIndex())
- return MachineOutlinerInstrType::Illegal;
+ return outliner::InstrType::Illegal;
- return MachineOutlinerInstrType::Legal;
+ return outliner::InstrType::Legal;
}
-void X86InstrInfo::insertOutlinerEpilogue(MachineBasicBlock &MBB,
+void X86InstrInfo::buildOutlinedFrame(MachineBasicBlock &MBB,
MachineFunction &MF,
- const MachineOutlinerInfo &MInfo)
+ const outliner::OutlinedFunction &OF)
const {
// If we're a tail call, we already have a return, so don't do anything.
- if (MInfo.FrameConstructionID == MachineOutlinerTailCall)
+ if (OF.FrameConstructionID == MachineOutlinerTailCall)
return;
// We're a normal call, so our sequence doesn't have a return instruction.
@@ -10936,18 +7716,13 @@ void X86InstrInfo::insertOutlinerEpilogue(MachineBasicBlock &MBB,
MBB.insert(MBB.end(), retq);
}
-void X86InstrInfo::insertOutlinerPrologue(MachineBasicBlock &MBB,
- MachineFunction &MF,
- const MachineOutlinerInfo &MInfo)
- const {}
-
MachineBasicBlock::iterator
X86InstrInfo::insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
MachineBasicBlock::iterator &It,
MachineFunction &MF,
- const MachineOutlinerInfo &MInfo) const {
+ const outliner::Candidate &C) const {
// Is it a tail call?
- if (MInfo.CallConstructionID == MachineOutlinerTailCall) {
+ if (C.CallConstructionID == MachineOutlinerTailCall) {
// Yes, just insert a JMP.
It = MBB.insert(It,
BuildMI(MF, DebugLoc(), get(X86::JMP_1))
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 02a09c340cef..b1ceb767cce4 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -17,8 +17,9 @@
#include "MCTargetDesc/X86BaseInfo.h"
#include "X86InstrFMA3Info.h"
#include "X86RegisterInfo.h"
-#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
+#include <vector>
#define GET_INSTRINFO_HEADER
#include "X86GenInstrInfo.inc"
@@ -29,6 +30,12 @@ class X86RegisterInfo;
class X86Subtarget;
namespace X86 {
+
+enum AsmComments {
+ // For instr that was compressed from EVEX to VEX.
+ AC_EVEX_2_VEX = MachineInstr::TAsmComments
+};
+
// X86 specific condition code. These correspond to X86_*_COND in
// X86InstrInfo.td. They must be kept in synch.
enum CondCode {
@@ -64,25 +71,41 @@ enum CondCode {
// Turn condition code into conditional branch opcode.
unsigned GetCondBranchFromCond(CondCode CC);
-/// \brief Return a pair of condition code for the given predicate and whether
+/// Return a pair of condition code for the given predicate and whether
/// the instruction operands should be swapped to match the condition code.
std::pair<CondCode, bool> getX86ConditionCode(CmpInst::Predicate Predicate);
-/// \brief Return a set opcode for the given condition and whether it has
+/// Return a set opcode for the given condition and whether it has
/// a memory operand.
unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false);
-/// \brief Return a cmov opcode for the given condition, register size in
+/// Return a cmov opcode for the given condition, register size in
/// bytes, and operand type.
unsigned getCMovFromCond(CondCode CC, unsigned RegBytes,
bool HasMemoryOperand = false);
+// Turn jCC opcode into condition code.
+CondCode getCondFromBranchOpc(unsigned Opc);
+
+// Turn setCC opcode into condition code.
+CondCode getCondFromSETOpc(unsigned Opc);
+
// Turn CMov opcode into condition code.
CondCode getCondFromCMovOpc(unsigned Opc);
/// GetOppositeBranchCondition - Return the inverse of the specified cond,
/// e.g. turning COND_E to COND_NE.
CondCode GetOppositeBranchCondition(CondCode CC);
+
+/// Get the VPCMP immediate for the given condition.
+unsigned getVPCMPImmForCond(ISD::CondCode CC);
+
+/// Get the VPCMP immediate if the opcodes are swapped.
+unsigned getSwappedVPCMPImm(unsigned Imm);
+
+/// Get the VPCOM immediate if the opcodes are swapped.
+unsigned getSwappedVPCOMImm(unsigned Imm);
+
} // namespace X86
/// isGlobalStubReference - Return true if the specified TargetFlag operand is
@@ -145,28 +168,6 @@ class X86InstrInfo final : public X86GenInstrInfo {
X86Subtarget &Subtarget;
const X86RegisterInfo RI;
- /// RegOp2MemOpTable3Addr, RegOp2MemOpTable0, RegOp2MemOpTable1,
- /// RegOp2MemOpTable2, RegOp2MemOpTable3 - Load / store folding opcode maps.
- ///
- typedef DenseMap<unsigned, std::pair<uint16_t, uint16_t>>
- RegOp2MemOpTableType;
- RegOp2MemOpTableType RegOp2MemOpTable2Addr;
- RegOp2MemOpTableType RegOp2MemOpTable0;
- RegOp2MemOpTableType RegOp2MemOpTable1;
- RegOp2MemOpTableType RegOp2MemOpTable2;
- RegOp2MemOpTableType RegOp2MemOpTable3;
- RegOp2MemOpTableType RegOp2MemOpTable4;
-
- /// MemOp2RegOpTable - Load / store unfolding opcode map.
- ///
- typedef DenseMap<unsigned, std::pair<uint16_t, uint16_t>>
- MemOp2RegOpTableType;
- MemOp2RegOpTableType MemOp2RegOpTable;
-
- static void AddTableEntry(RegOp2MemOpTableType &R2MTable,
- MemOp2RegOpTableType &M2RTable, uint16_t RegOp,
- uint16_t MemOp, uint16_t Flags);
-
virtual void anchor();
bool AnalyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
@@ -219,6 +220,9 @@ public:
unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex,
+ unsigned &MemBytes) const override;
/// isLoadFromStackSlotPostFE - Check for post-frame ptr elimination
/// stack locations as well. This uses a heuristic so it isn't
/// reliable for correctness.
@@ -227,6 +231,9 @@ public:
unsigned isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex,
+ unsigned &MemBytes) const override;
/// isStoreToStackSlotPostFE - Check for post-frame ptr elimination
/// stack locations as well. This uses a heuristic so it isn't
/// reliable for correctness.
@@ -285,34 +292,6 @@ public:
bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const override;
- /// Returns true if the routine could find two commutable operands
- /// in the given FMA instruction \p MI. Otherwise, returns false.
- ///
- /// \p SrcOpIdx1 and \p SrcOpIdx2 are INPUT and OUTPUT arguments.
- /// The output indices of the commuted operands are returned in these
- /// arguments. Also, the input values of these arguments may be preset either
- /// to indices of operands that must be commuted or be equal to a special
- /// value 'CommuteAnyOperandIndex' which means that the corresponding
- /// operand index is not set and this method is free to pick any of
- /// available commutable operands.
- /// The parameter \p FMA3Group keeps the reference to the group of relative
- /// FMA3 opcodes including register/memory forms of 132/213/231 opcodes.
- ///
- /// For example, calling this method this way:
- /// unsigned Idx1 = 1, Idx2 = CommuteAnyOperandIndex;
- /// findFMA3CommutedOpIndices(MI, Idx1, Idx2, FMA3Group);
- /// can be interpreted as a query asking if the operand #1 can be swapped
- /// with any other available operand (e.g. operand #2, operand #3, etc.).
- ///
- /// The returned FMA opcode may differ from the opcode in the given MI.
- /// For example, commuting the operands #1 and #3 in the following FMA
- /// FMA213 #1, #2, #3
- /// results into instruction with adjusted opcode:
- /// FMA231 #3, #2, #1
- bool findFMA3CommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1,
- unsigned &SrcOpIdx2,
- const X86InstrFMA3Group &FMA3Group) const;
-
/// Returns an adjusted FMA opcode that must be used in FMA instruction that
/// performs the same computations as the given \p MI but which has the
/// operands \p SrcOpIdx1 and \p SrcOpIdx2 commuted.
@@ -369,6 +348,8 @@ public:
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
+ bool isCopyInstr(const MachineInstr &MI, const MachineOperand *&Src,
+ const MachineOperand *&Dest) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI, unsigned SrcReg,
bool isKill, int FrameIndex,
@@ -490,8 +471,12 @@ public:
std::pair<uint16_t, uint16_t>
getExecutionDomain(const MachineInstr &MI) const override;
+ uint16_t getExecutionDomainCustom(const MachineInstr &MI) const;
+
void setExecutionDomain(MachineInstr &MI, unsigned Domain) const override;
+ bool setExecutionDomainCustom(MachineInstr &MI, unsigned Domain) const;
+
unsigned
getPartialRegUpdateClearance(const MachineInstr &MI, unsigned OpNum,
const TargetRegisterInfo *TRI) const override;
@@ -559,27 +544,22 @@ public:
ArrayRef<std::pair<unsigned, const char *>>
getSerializableDirectMachineOperandTargetFlags() const override;
- virtual MachineOutlinerInfo getOutlininingCandidateInfo(
- std::vector<
- std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
- &RepeatedSequenceLocs) const override;
+ virtual outliner::OutlinedFunction getOutliningCandidateInfo(
+ std::vector<outliner::Candidate> &RepeatedSequenceLocs) const override;
bool isFunctionSafeToOutlineFrom(MachineFunction &MF,
bool OutlineFromLinkOnceODRs) const override;
- llvm::X86GenInstrInfo::MachineOutlinerInstrType
- getOutliningType(MachineInstr &MI) const override;
-
- void insertOutlinerEpilogue(MachineBasicBlock &MBB, MachineFunction &MF,
- const MachineOutlinerInfo &MInfo) const override;
+ outliner::InstrType
+ getOutliningType(MachineBasicBlock::iterator &MIT, unsigned Flags) const override;
- void insertOutlinerPrologue(MachineBasicBlock &MBB, MachineFunction &MF,
- const MachineOutlinerInfo &MInfo) const override;
+ void buildOutlinedFrame(MachineBasicBlock &MBB, MachineFunction &MF,
+ const outliner::OutlinedFunction &OF) const override;
MachineBasicBlock::iterator
insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
MachineBasicBlock::iterator &It, MachineFunction &MF,
- const MachineOutlinerInfo &MInfo) const override;
+ const outliner::Candidate &C) const override;
protected:
/// Commutes the operands in the given instruction by changing the operands
@@ -631,9 +611,12 @@ private:
/// findThreeSrcCommutedOpIndices(MI, Op1, Op2);
/// can be interpreted as a query asking to find an operand that would be
/// commutable with the operand#1.
+ ///
+ /// If IsIntrinsic is set, operand 1 will be ignored for commuting.
bool findThreeSrcCommutedOpIndices(const MachineInstr &MI,
unsigned &SrcOpIdx1,
- unsigned &SrcOpIdx2) const;
+ unsigned &SrcOpIdx2,
+ bool IsIntrinsic = false) const;
};
} // namespace llvm
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 27c67500b26f..7509b312c100 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -94,6 +94,8 @@ def SDT_X86CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>,
def SDT_X86Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
+def SDT_X86NtBrind : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
+
def SDT_X86VASTART_SAVE_XMM_REGS : SDTypeProfile<0, -1, [SDTCisVT<0, i8>,
SDTCisVT<1, iPTR>,
SDTCisVT<2, iPTR>]>;
@@ -196,6 +198,12 @@ def X86call : SDNode<"X86ISD::CALL", SDT_X86Call,
[SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
SDNPVariadic]>;
+def X86NoTrackCall : SDNode<"X86ISD::NT_CALL", SDT_X86Call,
+ [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
+ SDNPVariadic]>;
+def X86NoTrackBrind : SDNode<"X86ISD::NT_BRIND", SDT_X86NtBrind,
+ [SDNPHasChain]>;
+
def X86rep_stos: SDNode<"X86ISD::REP_STOS", SDTX86RepStr,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore]>;
def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr,
@@ -281,6 +289,8 @@ def X86lock_dec : SDNode<"X86ISD::LDEC", SDTLockUnaryArithWithFlags,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad,
SDNPMemOperand]>;
+def X86bextr : SDNode<"X86ISD::BEXTR", SDTIntBinOp>;
+
def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDT_X86WIN_ALLOCA,
@@ -297,6 +307,16 @@ def X86lwpins : SDNode<"X86ISD::LWPINS",
SDTCisVT<2, i32>, SDTCisVT<3, i32>]>,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPSideEffect]>;
+def X86umwait : SDNode<"X86ISD::UMWAIT",
+ SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisInt<1>,
+ SDTCisVT<2, i32>, SDTCisVT<3, i32>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def X86tpause : SDNode<"X86ISD::TPAUSE",
+ SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisInt<1>,
+ SDTCisVT<2, i32>, SDTCisVT<3, i32>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+
//===----------------------------------------------------------------------===//
// X86 Operand Definitions.
//
@@ -358,10 +378,9 @@ class X86VMemOperand<RegisterClass RC, string printMethod,
def anymem : X86MemOperand<"printanymem">;
-def opaque32mem : X86MemOperand<"printopaquemem">;
-def opaque48mem : X86MemOperand<"printopaquemem">;
-def opaque80mem : X86MemOperand<"printopaquemem">;
-def opaque512mem : X86MemOperand<"printopaquemem">;
+// FIXME: Right now we allow any size during parsing, but we might want to
+// restrict to only unsized memory.
+def opaquemem : X86MemOperand<"printopaquemem">;
def i8mem : X86MemOperand<"printi8mem", X86Mem8AsmOperand>;
def i16mem : X86MemOperand<"printi16mem", X86Mem16AsmOperand>;
@@ -391,8 +410,8 @@ def vx128xmem : X86VMemOperand<VR128X, "printi128mem", X86Mem128_RC128XOperand>;
def vx256xmem : X86VMemOperand<VR128X, "printi256mem", X86Mem256_RC128XOperand>;
def vy128xmem : X86VMemOperand<VR256X, "printi128mem", X86Mem128_RC256XOperand>;
def vy256xmem : X86VMemOperand<VR256X, "printi256mem", X86Mem256_RC256XOperand>;
-def vy512mem : X86VMemOperand<VR256X, "printi512mem", X86Mem512_RC256XOperand>;
-def vz256xmem : X86VMemOperand<VR512, "printi256mem", X86Mem256_RC512Operand>;
+def vy512xmem : X86VMemOperand<VR256X, "printi512mem", X86Mem512_RC256XOperand>;
+def vz256mem : X86VMemOperand<VR512, "printi256mem", X86Mem256_RC512Operand>;
def vz512mem : X86VMemOperand<VR512, "printi512mem", X86Mem512_RC512Operand>;
// A version of i8mem for use on x86-64 and x32 that uses a NOREX GPR instead
@@ -807,36 +826,26 @@ def NoAVX : Predicate<"!Subtarget->hasAVX()">;
def HasAVX : Predicate<"Subtarget->hasAVX()">;
def HasAVX2 : Predicate<"Subtarget->hasAVX2()">;
def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">;
-def HasAVX512 : Predicate<"Subtarget->hasAVX512()">,
- AssemblerPredicate<"FeatureAVX512", "AVX-512 ISA">;
+def HasAVX512 : Predicate<"Subtarget->hasAVX512()">;
def UseAVX : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">;
def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">;
def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">;
-def HasCDI : Predicate<"Subtarget->hasCDI()">,
- AssemblerPredicate<"FeatureCDI", "AVX-512 CD ISA">;
-def HasVPOPCNTDQ : Predicate<"Subtarget->hasVPOPCNTDQ()">,
- AssemblerPredicate<"FeatureVPOPCNTDQ", "AVX-512 VPOPCNTDQ ISA">;
-def HasPFI : Predicate<"Subtarget->hasPFI()">,
- AssemblerPredicate<"FeaturePFI", "AVX-512 PF ISA">;
-def HasERI : Predicate<"Subtarget->hasERI()">,
- AssemblerPredicate<"FeatureERI", "AVX-512 ER ISA">;
-def HasDQI : Predicate<"Subtarget->hasDQI()">,
- AssemblerPredicate<"FeatureDQI", "AVX-512 DQ ISA">;
+def HasCDI : Predicate<"Subtarget->hasCDI()">;
+def HasVPOPCNTDQ : Predicate<"Subtarget->hasVPOPCNTDQ()">;
+def HasPFI : Predicate<"Subtarget->hasPFI()">;
+def HasERI : Predicate<"Subtarget->hasERI()">;
+def HasDQI : Predicate<"Subtarget->hasDQI()">;
def NoDQI : Predicate<"!Subtarget->hasDQI()">;
-def HasBWI : Predicate<"Subtarget->hasBWI()">,
- AssemblerPredicate<"FeatureBWI", "AVX-512 BW ISA">;
+def HasBWI : Predicate<"Subtarget->hasBWI()">;
def NoBWI : Predicate<"!Subtarget->hasBWI()">;
-def HasVLX : Predicate<"Subtarget->hasVLX()">,
- AssemblerPredicate<"FeatureVLX", "AVX-512 VL ISA">;
+def HasVLX : Predicate<"Subtarget->hasVLX()">;
def NoVLX : Predicate<"!Subtarget->hasVLX()">;
def NoVLX_Or_NoBWI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasBWI()">;
def NoVLX_Or_NoDQI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasDQI()">;
def PKU : Predicate<"Subtarget->hasPKU()">;
-def HasVNNI : Predicate<"Subtarget->hasVNNI()">,
- AssemblerPredicate<"FeatureVNNI", "AVX-512 VNNI ISA">;
+def HasVNNI : Predicate<"Subtarget->hasVNNI()">;
-def HasBITALG : Predicate<"Subtarget->hasBITALG()">,
- AssemblerPredicate<"FeatureBITALG", "AVX-512 BITALG ISA">;
+def HasBITALG : Predicate<"Subtarget->hasBITALG()">;
def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">;
def HasAES : Predicate<"Subtarget->hasAES()">;
def HasVAES : Predicate<"Subtarget->hasVAES()">;
@@ -866,15 +875,13 @@ def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">;
def HasBMI : Predicate<"Subtarget->hasBMI()">;
def HasBMI2 : Predicate<"Subtarget->hasBMI2()">;
def NoBMI2 : Predicate<"!Subtarget->hasBMI2()">;
-def HasVBMI : Predicate<"Subtarget->hasVBMI()">,
- AssemblerPredicate<"FeatureVBMI", "AVX-512 VBMI ISA">;
-def HasVBMI2 : Predicate<"Subtarget->hasVBMI2()">,
- AssemblerPredicate<"FeatureVBMI2", "AVX-512 VBMI2 ISA">;
-def HasIFMA : Predicate<"Subtarget->hasIFMA()">,
- AssemblerPredicate<"FeatureIFMA", "AVX-512 IFMA ISA">;
+def HasVBMI : Predicate<"Subtarget->hasVBMI()">;
+def HasVBMI2 : Predicate<"Subtarget->hasVBMI2()">;
+def HasIFMA : Predicate<"Subtarget->hasIFMA()">;
def HasRTM : Predicate<"Subtarget->hasRTM()">;
def HasADX : Predicate<"Subtarget->hasADX()">;
def HasSHA : Predicate<"Subtarget->hasSHA()">;
+def HasSGX : Predicate<"Subtarget->hasSGX()">;
def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">;
def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">;
def HasSSEPrefetch : Predicate<"Subtarget->hasSSEPrefetch()">;
@@ -884,14 +891,22 @@ def HasPREFETCHWT1 : Predicate<"Subtarget->hasPREFETCHWT1()">;
def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">;
def HasMWAITX : Predicate<"Subtarget->hasMWAITX()">;
def HasCLZERO : Predicate<"Subtarget->hasCLZERO()">;
+def HasCLDEMOTE : Predicate<"Subtarget->hasCLDEMOTE()">;
+def HasMOVDIRI : Predicate<"Subtarget->hasMOVDIRI()">;
+def HasMOVDIR64B : Predicate<"Subtarget->hasMOVDIR64B()">;
+def HasPTWRITE : Predicate<"Subtarget->hasPTWRITE()">;
def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">;
def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">;
def HasMPX : Predicate<"Subtarget->hasMPX()">;
def HasSHSTK : Predicate<"Subtarget->hasSHSTK()">;
-def HasIBT : Predicate<"Subtarget->hasIBT()">;
def HasCLFLUSHOPT : Predicate<"Subtarget->hasCLFLUSHOPT()">;
def HasCLWB : Predicate<"Subtarget->hasCLWB()">;
+def HasWBNOINVD : Predicate<"Subtarget->hasWBNOINVD()">;
+def HasRDPID : Predicate<"Subtarget->hasRDPID()">;
+def HasWAITPKG : Predicate<"Subtarget->hasWAITPKG()">;
+def HasINVPCID : Predicate<"Subtarget->hasINVPCID()">;
def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
+def HasPCONFIG : Predicate<"Subtarget->hasPCONFIG()">;
def Not64BitMode : Predicate<"!Subtarget->is64Bit()">,
AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">;
def In64BitMode : Predicate<"Subtarget->is64Bit()">,
@@ -929,6 +944,8 @@ let RecomputePerFunction = 1 in {
def OptForSpeed : Predicate<"!MF->getFunction().optForSize()">;
def UseIncDec : Predicate<"!Subtarget->slowIncDec() || "
"MF->getFunction().optForSize()">;
+ def NoSSE41_Or_OptForSize : Predicate<"MF->getFunction().optForSize() || "
+ "!Subtarget->hasSSE41()">;
}
def CallImmAddr : Predicate<"Subtarget->isLegalToCallImmediateAddr()">;
@@ -938,6 +955,8 @@ def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">;
def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;
def HasERMSB : Predicate<"Subtarget->hasERMSB()">;
def HasMFence : Predicate<"Subtarget->hasMFence()">;
+def UseRetpoline : Predicate<"Subtarget->useRetpoline()">;
+def NotUseRetpoline : Predicate<"!Subtarget->useRetpoline()">;
//===----------------------------------------------------------------------===//
// X86 Instruction Format Definitions.
@@ -1038,6 +1057,17 @@ def i64immZExt32SExt8 : ImmLeaf<i64, [{
}]>;
// Helper fragments for loads.
+
+// It's safe to fold a zextload/extload from i1 as a regular i8 load: the
+// upper bits are guaranteed to be zero, and we would have emitted a MOV8rm
+// anyway, which might get folded during the peephole pass.
+def loadi8 : PatFrag<(ops node:$ptr), (i8 (unindexedload node:$ptr)), [{
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ return ExtType == ISD::NON_EXTLOAD || ExtType == ISD::EXTLOAD ||
+ ExtType == ISD::ZEXTLOAD;
+}]>;
+
// It's always safe to treat an anyext i16 load as an i32 load if the i16 is
// known to be 32-bit aligned or better. Ditto for i8 to i16.
def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{
@@ -1050,14 +1080,6 @@ def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{
return false;
}]>;
-def loadi16_anyext : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)),[{
- LoadSDNode *LD = cast<LoadSDNode>(N);
- ISD::LoadExtType ExtType = LD->getExtensionType();
- if (ExtType == ISD::EXTLOAD)
- return LD->getAlignment() >= 2 && !LD->isVolatile();
- return false;
-}]>;
-
def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{
LoadSDNode *LD = cast<LoadSDNode>(N);
ISD::LoadExtType ExtType = LD->getExtensionType();
@@ -1068,12 +1090,20 @@ def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{
return false;
}]>;
-def loadi8 : PatFrag<(ops node:$ptr), (i8 (load node:$ptr))>;
def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>;
def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>;
def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>;
def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>;
def loadf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr))>;
+def alignedloadf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr)), [{
+ LoadSDNode *Ld = cast<LoadSDNode>(N);
+ return Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize();
+}]>;
+def memopf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr)), [{
+ LoadSDNode *Ld = cast<LoadSDNode>(N);
+ return Subtarget->hasSSEUnalignedMem() ||
+ Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize();
+}]>;
def sextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>;
def sextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>;
@@ -1123,39 +1153,37 @@ def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{
//
// Nop
-let hasSideEffects = 0, SchedRW = [WriteZero] in {
- def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", [], IIC_NOP>;
+let hasSideEffects = 0, SchedRW = [WriteNop] in {
+ def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", []>;
def NOOPW : I<0x1f, MRMXm, (outs), (ins i16mem:$zero),
- "nop{w}\t$zero", [], IIC_NOP>, TB, OpSize16;
+ "nop{w}\t$zero", []>, TB, OpSize16, NotMemoryFoldable;
def NOOPL : I<0x1f, MRMXm, (outs), (ins i32mem:$zero),
- "nop{l}\t$zero", [], IIC_NOP>, TB, OpSize32;
+ "nop{l}\t$zero", []>, TB, OpSize32, NotMemoryFoldable;
def NOOPQ : RI<0x1f, MRMXm, (outs), (ins i64mem:$zero),
- "nop{q}\t$zero", [], IIC_NOP>, TB,
+ "nop{q}\t$zero", []>, TB, NotMemoryFoldable,
Requires<[In64BitMode]>;
// Also allow register so we can assemble/disassemble
def NOOPWr : I<0x1f, MRMXr, (outs), (ins GR16:$zero),
- "nop{w}\t$zero", [], IIC_NOP>, TB, OpSize16;
+ "nop{w}\t$zero", []>, TB, OpSize16, NotMemoryFoldable;
def NOOPLr : I<0x1f, MRMXr, (outs), (ins GR32:$zero),
- "nop{l}\t$zero", [], IIC_NOP>, TB, OpSize32;
+ "nop{l}\t$zero", []>, TB, OpSize32, NotMemoryFoldable;
def NOOPQr : RI<0x1f, MRMXr, (outs), (ins GR64:$zero),
- "nop{q}\t$zero", [], IIC_NOP>, TB,
+ "nop{q}\t$zero", []>, TB, NotMemoryFoldable,
Requires<[In64BitMode]>;
}
// Constructing a stack frame.
def ENTER : Ii16<0xC8, RawFrmImm8, (outs), (ins i16imm:$len, i8imm:$lvl),
- "enter\t$len, $lvl", [], IIC_ENTER>, Sched<[WriteMicrocoded]>;
+ "enter\t$len, $lvl", []>, Sched<[WriteMicrocoded]>;
let SchedRW = [WriteALU] in {
let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, hasSideEffects=0 in
-def LEAVE : I<0xC9, RawFrm,
- (outs), (ins), "leave", [], IIC_LEAVE>,
+def LEAVE : I<0xC9, RawFrm, (outs), (ins), "leave", []>,
Requires<[Not64BitMode]>;
let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, hasSideEffects = 0 in
-def LEAVE64 : I<0xC9, RawFrm,
- (outs), (ins), "leave", [], IIC_LEAVE>,
+def LEAVE64 : I<0xC9, RawFrm, (outs), (ins), "leave", []>,
Requires<[In64BitMode]>;
} // SchedRW
@@ -1170,50 +1198,56 @@ let isBarrier = 1, hasSideEffects = 1, usesCustomInserter = 1,
let Defs = [ESP], Uses = [ESP], hasSideEffects=0 in {
let mayLoad = 1, SchedRW = [WriteLoad] in {
-def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
- IIC_POP_REG16>, OpSize16;
-def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
- IIC_POP_REG>, OpSize32, Requires<[Not64BitMode]>;
-def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
- IIC_POP_REG>, OpSize16;
-def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
- IIC_POP_REG>, OpSize32, Requires<[Not64BitMode]>;
+def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", []>,
+ OpSize16;
+def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", []>,
+ OpSize32, Requires<[Not64BitMode]>;
+// Long form for the disassembler.
+let isCodeGenOnly = 1, ForceDisassemble = 1 in {
+def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", []>,
+ OpSize16, NotMemoryFoldable;
+def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", []>,
+ OpSize32, Requires<[Not64BitMode]>, NotMemoryFoldable;
+} // isCodeGenOnly = 1, ForceDisassemble = 1
} // mayLoad, SchedRW
let mayStore = 1, mayLoad = 1, SchedRW = [WriteRMW] in {
-def POP16rmm: I<0x8F, MRM0m, (outs), (ins i16mem:$dst), "pop{w}\t$dst", [],
- IIC_POP_MEM>, OpSize16;
-def POP32rmm: I<0x8F, MRM0m, (outs), (ins i32mem:$dst), "pop{l}\t$dst", [],
- IIC_POP_MEM>, OpSize32, Requires<[Not64BitMode]>;
+def POP16rmm: I<0x8F, MRM0m, (outs), (ins i16mem:$dst), "pop{w}\t$dst", []>,
+ OpSize16;
+def POP32rmm: I<0x8F, MRM0m, (outs), (ins i32mem:$dst), "pop{l}\t$dst", []>,
+ OpSize32, Requires<[Not64BitMode]>;
} // mayStore, mayLoad, WriteRMW
let mayStore = 1, SchedRW = [WriteStore] in {
-def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[],
- IIC_PUSH_REG>, OpSize16;
-def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[],
- IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>;
-def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[],
- IIC_PUSH_REG>, OpSize16;
-def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[],
- IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>;
+def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[]>,
+ OpSize16;
+def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[]>,
+ OpSize32, Requires<[Not64BitMode]>;
+// Long form for the disassembler.
+let isCodeGenOnly = 1, ForceDisassemble = 1 in {
+def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[]>,
+ OpSize16, NotMemoryFoldable;
+def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[]>,
+ OpSize32, Requires<[Not64BitMode]>, NotMemoryFoldable;
+} // isCodeGenOnly = 1, ForceDisassemble = 1
def PUSH16i8 : Ii8<0x6a, RawFrm, (outs), (ins i16i8imm:$imm),
- "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16;
+ "push{w}\t$imm", []>, OpSize16;
def PUSHi16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm),
- "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16;
+ "push{w}\t$imm", []>, OpSize16;
def PUSH32i8 : Ii8<0x6a, RawFrm, (outs), (ins i32i8imm:$imm),
- "push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
+ "push{l}\t$imm", []>, OpSize32,
Requires<[Not64BitMode]>;
def PUSHi32 : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm),
- "push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
+ "push{l}\t$imm", []>, OpSize32,
Requires<[Not64BitMode]>;
} // mayStore, SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in {
-def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[],
- IIC_PUSH_MEM>, OpSize16;
-def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[],
- IIC_PUSH_MEM>, OpSize32, Requires<[Not64BitMode]>;
+def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src", []>,
+ OpSize16;
+def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src", []>,
+ OpSize32, Requires<[Not64BitMode]>;
} // mayLoad, mayStore, SchedRW
}
@@ -1233,203 +1267,212 @@ let mayLoad = 1, mayStore = 1, usesCustomInserter = 1,
let mayLoad = 1, mayStore = 1, usesCustomInserter = 1,
SchedRW = [WriteRMW] in {
- let Defs = [ESP, EFLAGS], Uses = [ESP] in
+ let Defs = [ESP, EFLAGS, DF], Uses = [ESP] in
def WRFLAGS32 : PseudoI<(outs), (ins GR32:$src),
[(int_x86_flags_write_u32 GR32:$src)]>,
Requires<[Not64BitMode]>;
- let Defs = [RSP, EFLAGS], Uses = [RSP] in
+ let Defs = [RSP, EFLAGS, DF], Uses = [RSP] in
def WRFLAGS64 : PseudoI<(outs), (ins GR64:$src),
[(int_x86_flags_write_u64 GR64:$src)]>,
Requires<[In64BitMode]>;
}
-let Defs = [ESP, EFLAGS], Uses = [ESP], mayLoad = 1, hasSideEffects=0,
+let Defs = [ESP, EFLAGS, DF], Uses = [ESP], mayLoad = 1, hasSideEffects=0,
SchedRW = [WriteLoad] in {
-def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>,
- OpSize16;
-def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", [], IIC_POP_FD>,
- OpSize32, Requires<[Not64BitMode]>;
+def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", []>, OpSize16;
+def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", []>, OpSize32,
+ Requires<[Not64BitMode]>;
}
-let Defs = [ESP], Uses = [ESP, EFLAGS], mayStore = 1, hasSideEffects=0,
+let Defs = [ESP], Uses = [ESP, EFLAGS, DF], mayStore = 1, hasSideEffects=0,
SchedRW = [WriteStore] in {
-def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", [], IIC_PUSH_F>,
- OpSize16;
-def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", [], IIC_PUSH_F>,
- OpSize32, Requires<[Not64BitMode]>;
+def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", []>, OpSize16;
+def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", []>, OpSize32,
+ Requires<[Not64BitMode]>;
}
let Defs = [RSP], Uses = [RSP], hasSideEffects=0 in {
let mayLoad = 1, SchedRW = [WriteLoad] in {
-def POP64r : I<0x58, AddRegFrm, (outs GR64:$reg), (ins), "pop{q}\t$reg", [],
- IIC_POP_REG>, OpSize32, Requires<[In64BitMode]>;
-def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", [],
- IIC_POP_REG>, OpSize32, Requires<[In64BitMode]>;
+def POP64r : I<0x58, AddRegFrm, (outs GR64:$reg), (ins), "pop{q}\t$reg", []>,
+ OpSize32, Requires<[In64BitMode]>;
+// Long form for the disassembler.
+let isCodeGenOnly = 1, ForceDisassemble = 1 in {
+def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", []>,
+ OpSize32, Requires<[In64BitMode]>, NotMemoryFoldable;
+} // isCodeGenOnly = 1, ForceDisassemble = 1
} // mayLoad, SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in
-def POP64rmm: I<0x8F, MRM0m, (outs), (ins i64mem:$dst), "pop{q}\t$dst", [],
- IIC_POP_MEM>, OpSize32, Requires<[In64BitMode]>;
+def POP64rmm: I<0x8F, MRM0m, (outs), (ins i64mem:$dst), "pop{q}\t$dst", []>,
+ OpSize32, Requires<[In64BitMode]>;
let mayStore = 1, SchedRW = [WriteStore] in {
-def PUSH64r : I<0x50, AddRegFrm, (outs), (ins GR64:$reg), "push{q}\t$reg", [],
- IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>;
-def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", [],
- IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>;
+def PUSH64r : I<0x50, AddRegFrm, (outs), (ins GR64:$reg), "push{q}\t$reg", []>,
+ OpSize32, Requires<[In64BitMode]>;
+// Long form for the disassembler.
+let isCodeGenOnly = 1, ForceDisassemble = 1 in {
+def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", []>,
+ OpSize32, Requires<[In64BitMode]>, NotMemoryFoldable;
+} // isCodeGenOnly = 1, ForceDisassemble = 1
} // mayStore, SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in {
-def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", [],
- IIC_PUSH_MEM>, OpSize32, Requires<[In64BitMode]>;
+def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", []>,
+ OpSize32, Requires<[In64BitMode]>;
} // mayLoad, mayStore, SchedRW
}
let Defs = [RSP], Uses = [RSP], hasSideEffects = 0, mayStore = 1,
SchedRW = [WriteStore] in {
def PUSH64i8 : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm),
- "push{q}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
+ "push{q}\t$imm", []>, OpSize32,
Requires<[In64BitMode]>;
def PUSH64i32 : Ii32S<0x68, RawFrm, (outs), (ins i64i32imm:$imm),
- "push{q}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
+ "push{q}\t$imm", []>, OpSize32,
Requires<[In64BitMode]>;
}
-let Defs = [RSP, EFLAGS], Uses = [RSP], mayLoad = 1, hasSideEffects=0 in
-def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", [], IIC_POP_FD>,
+let Defs = [RSP, EFLAGS, DF], Uses = [RSP], mayLoad = 1, hasSideEffects=0 in
+def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", []>,
OpSize32, Requires<[In64BitMode]>, Sched<[WriteLoad]>;
-let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1, hasSideEffects=0 in
-def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", [], IIC_PUSH_F>,
+let Defs = [RSP], Uses = [RSP, EFLAGS, DF], mayStore = 1, hasSideEffects=0 in
+def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", []>,
OpSize32, Requires<[In64BitMode]>, Sched<[WriteStore]>;
let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], Uses = [ESP],
mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteLoad] in {
-def POPA32 : I<0x61, RawFrm, (outs), (ins), "popal", [], IIC_POP_A>,
+def POPA32 : I<0x61, RawFrm, (outs), (ins), "popal", []>,
OpSize32, Requires<[Not64BitMode]>;
-def POPA16 : I<0x61, RawFrm, (outs), (ins), "popaw", [], IIC_POP_A>,
+def POPA16 : I<0x61, RawFrm, (outs), (ins), "popaw", []>,
OpSize16, Requires<[Not64BitMode]>;
}
let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP],
mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
-def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pushal", [], IIC_PUSH_A>,
+def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pushal", []>,
OpSize32, Requires<[Not64BitMode]>;
-def PUSHA16 : I<0x60, RawFrm, (outs), (ins), "pushaw", [], IIC_PUSH_A>,
+def PUSHA16 : I<0x60, RawFrm, (outs), (ins), "pushaw", []>,
OpSize16, Requires<[Not64BitMode]>;
}
-let Constraints = "$src = $dst", SchedRW = [WriteALU] in {
+let Constraints = "$src = $dst", SchedRW = [WriteBSWAP32] in {
+// This instruction is a consequence of BSWAP32r observing operand size. The
+// encoding is valid, but the behavior is undefined.
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+def BSWAP16r_BAD : I<0xC8, AddRegFrm, (outs GR16:$dst), (ins GR16:$src),
+ "bswap{w}\t$dst", []>, OpSize16, TB;
// GR32 = bswap GR32
-def BSWAP32r : I<0xC8, AddRegFrm,
- (outs GR32:$dst), (ins GR32:$src),
+def BSWAP32r : I<0xC8, AddRegFrm, (outs GR32:$dst), (ins GR32:$src),
"bswap{l}\t$dst",
- [(set GR32:$dst, (bswap GR32:$src))], IIC_BSWAP>, OpSize32, TB;
+ [(set GR32:$dst, (bswap GR32:$src))]>, OpSize32, TB;
+let SchedRW = [WriteBSWAP64] in
def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src),
"bswap{q}\t$dst",
- [(set GR64:$dst, (bswap GR64:$src))], IIC_BSWAP>, TB;
+ [(set GR64:$dst, (bswap GR64:$src))]>, TB;
} // Constraints = "$src = $dst", SchedRW
// Bit scan instructions.
let Defs = [EFLAGS] in {
def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"bsf{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))],
- IIC_BIT_SCAN_REG>, PS, OpSize16, Sched<[WriteShift]>;
+ [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))]>,
+ PS, OpSize16, Sched<[WriteBSF]>;
def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"bsf{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))],
- IIC_BIT_SCAN_MEM>, PS, OpSize16, Sched<[WriteShiftLd]>;
+ [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))]>,
+ PS, OpSize16, Sched<[WriteBSFLd]>;
def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"bsf{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))],
- IIC_BIT_SCAN_REG>, PS, OpSize32, Sched<[WriteShift]>;
+ [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))]>,
+ PS, OpSize32, Sched<[WriteBSF]>;
def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"bsf{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))],
- IIC_BIT_SCAN_MEM>, PS, OpSize32, Sched<[WriteShiftLd]>;
+ [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))]>,
+ PS, OpSize32, Sched<[WriteBSFLd]>;
def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"bsf{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))],
- IIC_BIT_SCAN_REG>, PS, Sched<[WriteShift]>;
+ [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))]>,
+ PS, Sched<[WriteBSF]>;
def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"bsf{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))],
- IIC_BIT_SCAN_MEM>, PS, Sched<[WriteShiftLd]>;
+ [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))]>,
+ PS, Sched<[WriteBSFLd]>;
def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"bsr{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))],
- IIC_BIT_SCAN_REG>, PS, OpSize16, Sched<[WriteShift]>;
+ [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))]>,
+ PS, OpSize16, Sched<[WriteBSR]>;
def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"bsr{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))],
- IIC_BIT_SCAN_MEM>, PS, OpSize16, Sched<[WriteShiftLd]>;
+ [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))]>,
+ PS, OpSize16, Sched<[WriteBSRLd]>;
def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"bsr{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))],
- IIC_BIT_SCAN_REG>, PS, OpSize32, Sched<[WriteShift]>;
+ [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))]>,
+ PS, OpSize32, Sched<[WriteBSR]>;
def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"bsr{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))],
- IIC_BIT_SCAN_MEM>, PS, OpSize32, Sched<[WriteShiftLd]>;
+ [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))]>,
+ PS, OpSize32, Sched<[WriteBSRLd]>;
def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"bsr{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))],
- IIC_BIT_SCAN_REG>, PS, Sched<[WriteShift]>;
+ [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))]>,
+ PS, Sched<[WriteBSR]>;
def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"bsr{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))],
- IIC_BIT_SCAN_MEM>, PS, Sched<[WriteShiftLd]>;
+ [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))]>,
+ PS, Sched<[WriteBSRLd]>;
} // Defs = [EFLAGS]
let SchedRW = [WriteMicrocoded] in {
-// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
-let Defs = [EDI,ESI], Uses = [EDI,ESI,EFLAGS] in {
+let Defs = [EDI,ESI], Uses = [EDI,ESI,DF] in {
def MOVSB : I<0xA4, RawFrmDstSrc, (outs), (ins dstidx8:$dst, srcidx8:$src),
- "movsb\t{$src, $dst|$dst, $src}", [], IIC_MOVS>;
+ "movsb\t{$src, $dst|$dst, $src}", []>;
def MOVSW : I<0xA5, RawFrmDstSrc, (outs), (ins dstidx16:$dst, srcidx16:$src),
- "movsw\t{$src, $dst|$dst, $src}", [], IIC_MOVS>, OpSize16;
+ "movsw\t{$src, $dst|$dst, $src}", []>, OpSize16;
def MOVSL : I<0xA5, RawFrmDstSrc, (outs), (ins dstidx32:$dst, srcidx32:$src),
- "movs{l|d}\t{$src, $dst|$dst, $src}", [], IIC_MOVS>, OpSize32;
+ "movs{l|d}\t{$src, $dst|$dst, $src}", []>, OpSize32;
def MOVSQ : RI<0xA5, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src),
- "movsq\t{$src, $dst|$dst, $src}", [], IIC_MOVS>;
+ "movsq\t{$src, $dst|$dst, $src}", []>,
+ Requires<[In64BitMode]>;
}
-// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
-let Defs = [EDI], Uses = [AL,EDI,EFLAGS] in
+let Defs = [EDI], Uses = [AL,EDI,DF] in
def STOSB : I<0xAA, RawFrmDst, (outs), (ins dstidx8:$dst),
- "stosb\t{%al, $dst|$dst, al}", [], IIC_STOS>;
-let Defs = [EDI], Uses = [AX,EDI,EFLAGS] in
+ "stosb\t{%al, $dst|$dst, al}", []>;
+let Defs = [EDI], Uses = [AX,EDI,DF] in
def STOSW : I<0xAB, RawFrmDst, (outs), (ins dstidx16:$dst),
- "stosw\t{%ax, $dst|$dst, ax}", [], IIC_STOS>, OpSize16;
-let Defs = [EDI], Uses = [EAX,EDI,EFLAGS] in
+ "stosw\t{%ax, $dst|$dst, ax}", []>, OpSize16;
+let Defs = [EDI], Uses = [EAX,EDI,DF] in
def STOSL : I<0xAB, RawFrmDst, (outs), (ins dstidx32:$dst),
- "stos{l|d}\t{%eax, $dst|$dst, eax}", [], IIC_STOS>, OpSize32;
-let Defs = [RDI], Uses = [RAX,RDI,EFLAGS] in
+ "stos{l|d}\t{%eax, $dst|$dst, eax}", []>, OpSize32;
+let Defs = [RDI], Uses = [RAX,RDI,DF] in
def STOSQ : RI<0xAB, RawFrmDst, (outs), (ins dstidx64:$dst),
- "stosq\t{%rax, $dst|$dst, rax}", [], IIC_STOS>;
+ "stosq\t{%rax, $dst|$dst, rax}", []>,
+ Requires<[In64BitMode]>;
-// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
-let Defs = [EDI,EFLAGS], Uses = [AL,EDI,EFLAGS] in
+let Defs = [EDI,EFLAGS], Uses = [AL,EDI,DF] in
def SCASB : I<0xAE, RawFrmDst, (outs), (ins dstidx8:$dst),
- "scasb\t{$dst, %al|al, $dst}", [], IIC_SCAS>;
-let Defs = [EDI,EFLAGS], Uses = [AX,EDI,EFLAGS] in
+ "scasb\t{$dst, %al|al, $dst}", []>;
+let Defs = [EDI,EFLAGS], Uses = [AX,EDI,DF] in
def SCASW : I<0xAF, RawFrmDst, (outs), (ins dstidx16:$dst),
- "scasw\t{$dst, %ax|ax, $dst}", [], IIC_SCAS>, OpSize16;
-let Defs = [EDI,EFLAGS], Uses = [EAX,EDI,EFLAGS] in
+ "scasw\t{$dst, %ax|ax, $dst}", []>, OpSize16;
+let Defs = [EDI,EFLAGS], Uses = [EAX,EDI,DF] in
def SCASL : I<0xAF, RawFrmDst, (outs), (ins dstidx32:$dst),
- "scas{l|d}\t{$dst, %eax|eax, $dst}", [], IIC_SCAS>, OpSize32;
-let Defs = [EDI,EFLAGS], Uses = [RAX,EDI,EFLAGS] in
+ "scas{l|d}\t{$dst, %eax|eax, $dst}", []>, OpSize32;
+let Defs = [EDI,EFLAGS], Uses = [RAX,EDI,DF] in
def SCASQ : RI<0xAF, RawFrmDst, (outs), (ins dstidx64:$dst),
- "scasq\t{$dst, %rax|rax, $dst}", [], IIC_SCAS>;
+ "scasq\t{$dst, %rax|rax, $dst}", []>,
+ Requires<[In64BitMode]>;
-// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
-let Defs = [EDI,ESI,EFLAGS], Uses = [EDI,ESI,EFLAGS] in {
+let Defs = [EDI,ESI,EFLAGS], Uses = [EDI,ESI,DF] in {
def CMPSB : I<0xA6, RawFrmDstSrc, (outs), (ins dstidx8:$dst, srcidx8:$src),
- "cmpsb\t{$dst, $src|$src, $dst}", [], IIC_CMPS>;
+ "cmpsb\t{$dst, $src|$src, $dst}", []>;
def CMPSW : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx16:$dst, srcidx16:$src),
- "cmpsw\t{$dst, $src|$src, $dst}", [], IIC_CMPS>, OpSize16;
+ "cmpsw\t{$dst, $src|$src, $dst}", []>, OpSize16;
def CMPSL : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx32:$dst, srcidx32:$src),
- "cmps{l|d}\t{$dst, $src|$src, $dst}", [], IIC_CMPS>, OpSize32;
+ "cmps{l|d}\t{$dst, $src|$src, $dst}", []>, OpSize32;
def CMPSQ : RI<0xA7, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src),
- "cmpsq\t{$dst, $src|$src, $dst}", [], IIC_CMPS>;
+ "cmpsq\t{$dst, $src|$src, $dst}", []>,
+ Requires<[In64BitMode]>;
}
} // SchedRW
@@ -1437,47 +1480,47 @@ def CMPSQ : RI<0xA7, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src),
// Move Instructions.
//
let SchedRW = [WriteMove] in {
-let hasSideEffects = 0 in {
+let hasSideEffects = 0, isMoveReg = 1 in {
def MOV8rr : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src),
- "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+ "mov{b}\t{$src, $dst|$dst, $src}", []>;
def MOV16rr : I<0x89, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16;
+ "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize16;
def MOV32rr : I<0x89, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32;
+ "mov{l}\t{$src, $dst|$dst, $src}", []>, OpSize32;
def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+ "mov{q}\t{$src, $dst|$dst, $src}", []>;
}
let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
def MOV8ri : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
- [(set GR8:$dst, imm:$src)], IIC_MOV>;
+ [(set GR8:$dst, imm:$src)]>;
def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src),
"mov{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, imm:$src)], IIC_MOV>, OpSize16;
+ [(set GR16:$dst, imm:$src)]>, OpSize16;
def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, relocImm:$src)], IIC_MOV>, OpSize32;
+ [(set GR32:$dst, relocImm:$src)]>, OpSize32;
def MOV64ri32 : RIi32S<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, i64immSExt32:$src)], IIC_MOV>;
+ [(set GR64:$dst, i64immSExt32:$src)]>;
}
let isReMaterializable = 1 in {
def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src),
"movabs{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, relocImm:$src)], IIC_MOV>;
+ [(set GR64:$dst, relocImm:$src)]>;
}
// Longer forms that use a ModR/M byte. Needed for disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def MOV8ri_alt : Ii8 <0xC6, MRM0r, (outs GR8 :$dst), (ins i8imm :$src),
- "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>,
+ "mov{b}\t{$src, $dst|$dst, $src}", []>,
FoldGenData<"MOV8ri">;
def MOV16ri_alt : Ii16<0xC7, MRM0r, (outs GR16:$dst), (ins i16imm:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16,
+ "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize16,
FoldGenData<"MOV16ri">;
def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32,
+ "mov{l}\t{$src, $dst|$dst, $src}", []>, OpSize32,
FoldGenData<"MOV32ri">;
}
} // SchedRW
@@ -1485,16 +1528,16 @@ def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src),
let SchedRW = [WriteStore] in {
def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
- [(store (i8 imm8_su:$src), addr:$dst)], IIC_MOV_MEM>;
+ [(store (i8 imm8_su:$src), addr:$dst)]>;
def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src),
"mov{w}\t{$src, $dst|$dst, $src}",
- [(store (i16 imm16_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize16;
+ [(store (i16 imm16_su:$src), addr:$dst)]>, OpSize16;
def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
- [(store (i32 imm32_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize32;
+ [(store (i32 imm32_su:$src), addr:$dst)]>, OpSize32;
def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
- [(store i64immSExt32_su:$src, addr:$dst)], IIC_MOV_MEM>,
+ [(store i64immSExt32_su:$src, addr:$dst)]>,
Requires<[In64BitMode]>;
} // SchedRW
@@ -1506,183 +1549,200 @@ let SchedRW = [WriteALU] in {
let mayLoad = 1 in {
let Defs = [AL] in
def MOV8ao32 : Ii32<0xA0, RawFrmMemOffs, (outs), (ins offset32_8:$src),
- "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>,
+ "mov{b}\t{$src, %al|al, $src}", []>,
AdSize32;
let Defs = [AX] in
def MOV16ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_16:$src),
- "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>,
+ "mov{w}\t{$src, %ax|ax, $src}", []>,
OpSize16, AdSize32;
let Defs = [EAX] in
def MOV32ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_32:$src),
- "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>,
+ "mov{l}\t{$src, %eax|eax, $src}", []>,
OpSize32, AdSize32;
let Defs = [RAX] in
def MOV64ao32 : RIi32<0xA1, RawFrmMemOffs, (outs), (ins offset32_64:$src),
- "mov{q}\t{$src, %rax|rax, $src}", [], IIC_MOV_MEM>,
+ "mov{q}\t{$src, %rax|rax, $src}", []>,
AdSize32;
let Defs = [AL] in
def MOV8ao16 : Ii16<0xA0, RawFrmMemOffs, (outs), (ins offset16_8:$src),
- "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>, AdSize16;
+ "mov{b}\t{$src, %al|al, $src}", []>, AdSize16;
let Defs = [AX] in
def MOV16ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_16:$src),
- "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>,
+ "mov{w}\t{$src, %ax|ax, $src}", []>,
OpSize16, AdSize16;
let Defs = [EAX] in
def MOV32ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_32:$src),
- "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>,
+ "mov{l}\t{$src, %eax|eax, $src}", []>,
AdSize16, OpSize32;
-}
+} // mayLoad
let mayStore = 1 in {
let Uses = [AL] in
def MOV8o32a : Ii32<0xA2, RawFrmMemOffs, (outs), (ins offset32_8:$dst),
- "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, AdSize32;
+ "mov{b}\t{%al, $dst|$dst, al}", []>, AdSize32;
let Uses = [AX] in
def MOV16o32a : Ii32<0xA3, RawFrmMemOffs, (outs), (ins offset32_16:$dst),
- "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>,
+ "mov{w}\t{%ax, $dst|$dst, ax}", []>,
OpSize16, AdSize32;
let Uses = [EAX] in
def MOV32o32a : Ii32<0xA3, RawFrmMemOffs, (outs), (ins offset32_32:$dst),
- "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>,
+ "mov{l}\t{%eax, $dst|$dst, eax}", []>,
OpSize32, AdSize32;
let Uses = [RAX] in
def MOV64o32a : RIi32<0xA3, RawFrmMemOffs, (outs), (ins offset32_64:$dst),
- "mov{q}\t{%rax, $dst|$dst, rax}", [], IIC_MOV_MEM>,
+ "mov{q}\t{%rax, $dst|$dst, rax}", []>,
AdSize32;
let Uses = [AL] in
def MOV8o16a : Ii16<0xA2, RawFrmMemOffs, (outs), (ins offset16_8:$dst),
- "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, AdSize16;
+ "mov{b}\t{%al, $dst|$dst, al}", []>, AdSize16;
let Uses = [AX] in
def MOV16o16a : Ii16<0xA3, RawFrmMemOffs, (outs), (ins offset16_16:$dst),
- "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>,
+ "mov{w}\t{%ax, $dst|$dst, ax}", []>,
OpSize16, AdSize16;
let Uses = [EAX] in
def MOV32o16a : Ii16<0xA3, RawFrmMemOffs, (outs), (ins offset16_32:$dst),
- "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>,
+ "mov{l}\t{%eax, $dst|$dst, eax}", []>,
OpSize32, AdSize16;
-}
-}
+} // mayStore
// These forms all have full 64-bit absolute addresses in their instructions
// and use the movabs mnemonic to indicate this specific form.
let mayLoad = 1 in {
let Defs = [AL] in
-def MOV8ao64 : RIi64_NOREX<0xA0, RawFrmMemOffs, (outs), (ins offset64_8:$src),
- "movabs{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>,
- AdSize64;
+def MOV8ao64 : Ii64<0xA0, RawFrmMemOffs, (outs), (ins offset64_8:$src),
+ "movabs{b}\t{$src, %al|al, $src}", []>,
+ AdSize64;
let Defs = [AX] in
-def MOV16ao64 : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset64_16:$src),
- "movabs{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>,
+def MOV16ao64 : Ii64<0xA1, RawFrmMemOffs, (outs), (ins offset64_16:$src),
+ "movabs{w}\t{$src, %ax|ax, $src}", []>,
OpSize16, AdSize64;
let Defs = [EAX] in
-def MOV32ao64 : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset64_32:$src),
- "movabs{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>,
+def MOV32ao64 : Ii64<0xA1, RawFrmMemOffs, (outs), (ins offset64_32:$src),
+ "movabs{l}\t{$src, %eax|eax, $src}", []>,
OpSize32, AdSize64;
let Defs = [RAX] in
def MOV64ao64 : RIi64<0xA1, RawFrmMemOffs, (outs), (ins offset64_64:$src),
- "movabs{q}\t{$src, %rax|rax, $src}", [], IIC_MOV_MEM>,
+ "movabs{q}\t{$src, %rax|rax, $src}", []>,
AdSize64;
-}
+} // mayLoad
let mayStore = 1 in {
let Uses = [AL] in
-def MOV8o64a : RIi64_NOREX<0xA2, RawFrmMemOffs, (outs), (ins offset64_8:$dst),
- "movabs{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>,
- AdSize64;
+def MOV8o64a : Ii64<0xA2, RawFrmMemOffs, (outs), (ins offset64_8:$dst),
+ "movabs{b}\t{%al, $dst|$dst, al}", []>,
+ AdSize64;
let Uses = [AX] in
-def MOV16o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs), (ins offset64_16:$dst),
- "movabs{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>,
+def MOV16o64a : Ii64<0xA3, RawFrmMemOffs, (outs), (ins offset64_16:$dst),
+ "movabs{w}\t{%ax, $dst|$dst, ax}", []>,
OpSize16, AdSize64;
let Uses = [EAX] in
-def MOV32o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs), (ins offset64_32:$dst),
- "movabs{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>,
+def MOV32o64a : Ii64<0xA3, RawFrmMemOffs, (outs), (ins offset64_32:$dst),
+ "movabs{l}\t{%eax, $dst|$dst, eax}", []>,
OpSize32, AdSize64;
let Uses = [RAX] in
def MOV64o64a : RIi64<0xA3, RawFrmMemOffs, (outs), (ins offset64_64:$dst),
- "movabs{q}\t{%rax, $dst|$dst, rax}", [], IIC_MOV_MEM>,
+ "movabs{q}\t{%rax, $dst|$dst, rax}", []>,
AdSize64;
-}
+} // mayStore
+} // SchedRW
} // hasSideEffects = 0
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
- SchedRW = [WriteMove] in {
+ SchedRW = [WriteMove], isMoveReg = 1 in {
def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src),
- "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>,
+ "mov{b}\t{$src, $dst|$dst, $src}", []>,
FoldGenData<"MOV8rr">;
def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16,
+ "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize16,
FoldGenData<"MOV16rr">;
def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32,
+ "mov{l}\t{$src, $dst|$dst, $src}", []>, OpSize32,
FoldGenData<"MOV32rr">;
def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>,
+ "mov{q}\t{$src, $dst|$dst, $src}", []>,
FoldGenData<"MOV64rr">;
}
+// Reversed version with ".s" suffix for GAS compatibility.
+def : InstAlias<"mov{b}.s\t{$src, $dst|$dst, $src}",
+ (MOV8rr_REV GR8:$dst, GR8:$src), 0>;
+def : InstAlias<"mov{w}.s\t{$src, $dst|$dst, $src}",
+ (MOV16rr_REV GR16:$dst, GR16:$src), 0>;
+def : InstAlias<"mov{l}.s\t{$src, $dst|$dst, $src}",
+ (MOV32rr_REV GR32:$dst, GR32:$src), 0>;
+def : InstAlias<"mov{q}.s\t{$src, $dst|$dst, $src}",
+ (MOV64rr_REV GR64:$dst, GR64:$src), 0>;
+def : InstAlias<"mov.s\t{$src, $dst|$dst, $src}",
+ (MOV8rr_REV GR8:$dst, GR8:$src), 0, "att">;
+def : InstAlias<"mov.s\t{$src, $dst|$dst, $src}",
+ (MOV16rr_REV GR16:$dst, GR16:$src), 0, "att">;
+def : InstAlias<"mov.s\t{$src, $dst|$dst, $src}",
+ (MOV32rr_REV GR32:$dst, GR32:$src), 0, "att">;
+def : InstAlias<"mov.s\t{$src, $dst|$dst, $src}",
+ (MOV64rr_REV GR64:$dst, GR64:$src), 0, "att">;
+
let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
def MOV8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
- [(set GR8:$dst, (loadi8 addr:$src))], IIC_MOV_MEM>;
+ [(set GR8:$dst, (loadi8 addr:$src))]>;
def MOV16rm : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"mov{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, (loadi16 addr:$src))], IIC_MOV_MEM>, OpSize16;
+ [(set GR16:$dst, (loadi16 addr:$src))]>, OpSize16;
def MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (loadi32 addr:$src))], IIC_MOV_MEM>, OpSize32;
+ [(set GR32:$dst, (loadi32 addr:$src))]>, OpSize32;
def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (load addr:$src))], IIC_MOV_MEM>;
+ [(set GR64:$dst, (load addr:$src))]>;
}
let SchedRW = [WriteStore] in {
def MOV8mr : I<0x88, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
- [(store GR8:$src, addr:$dst)], IIC_MOV_MEM>;
+ [(store GR8:$src, addr:$dst)]>;
def MOV16mr : I<0x89, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
"mov{w}\t{$src, $dst|$dst, $src}",
- [(store GR16:$src, addr:$dst)], IIC_MOV_MEM>, OpSize16;
+ [(store GR16:$src, addr:$dst)]>, OpSize16;
def MOV32mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
- [(store GR32:$src, addr:$dst)], IIC_MOV_MEM>, OpSize32;
+ [(store GR32:$src, addr:$dst)]>, OpSize32;
def MOV64mr : RI<0x89, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
- [(store GR64:$src, addr:$dst)], IIC_MOV_MEM>;
+ [(store GR64:$src, addr:$dst)]>;
} // SchedRW
// Versions of MOV8rr, MOV8mr, and MOV8rm that use i8mem_NOREX and GR8_NOREX so
// that they can be used for copying and storing h registers, which can't be
// encoded when a REX prefix is present.
let isCodeGenOnly = 1 in {
-let hasSideEffects = 0 in
+let hasSideEffects = 0, isMoveReg = 1 in
def MOV8rr_NOREX : I<0x88, MRMDestReg,
(outs GR8_NOREX:$dst), (ins GR8_NOREX:$src),
- "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], IIC_MOV>,
+ "mov{b}\t{$src, $dst|$dst, $src}", []>,
Sched<[WriteMove]>;
let mayStore = 1, hasSideEffects = 0 in
def MOV8mr_NOREX : I<0x88, MRMDestMem,
(outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src),
- "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [],
- IIC_MOV_MEM>, Sched<[WriteStore]>;
+ "mov{b}\t{$src, $dst|$dst, $src}", []>,
+ Sched<[WriteStore]>;
let mayLoad = 1, hasSideEffects = 0,
canFoldAsLoad = 1, isReMaterializable = 1 in
def MOV8rm_NOREX : I<0x8A, MRMSrcMem,
(outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src),
- "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [],
- IIC_MOV_MEM>, Sched<[WriteLoad]>;
+ "mov{b}\t{$src, $dst|$dst, $src}", []>,
+ Sched<[WriteLoad]>;
}
// Condition code ops, incl. set if equal/not equal/...
-let SchedRW = [WriteALU] in {
+let SchedRW = [WriteLAHFSAHF] in {
let Defs = [EFLAGS], Uses = [AH] in
def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf",
- [(set EFLAGS, (X86sahf AH))], IIC_AHF>,
- Requires<[HasLAHFSAHF]>;
+ [(set EFLAGS, (X86sahf AH))]>,
+ Requires<[HasLAHFSAHF]>;
let Defs = [AH], Uses = [EFLAGS], hasSideEffects = 0 in
-def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", [],
- IIC_AHF>, // AH = flags
+def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", []>, // AH = flags
Requires<[HasLAHFSAHF]>;
} // SchedRW
@@ -1693,15 +1753,15 @@ let Defs = [EFLAGS] in {
let SchedRW = [WriteALU] in {
def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))], IIC_BT_RR>,
+ [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))]>,
OpSize16, TB, NotMemoryFoldable;
def BT32rr : I<0xA3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
"bt{l}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))], IIC_BT_RR>,
+ [(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))]>,
OpSize32, TB, NotMemoryFoldable;
def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))], IIC_BT_RR>, TB,
+ [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))]>, TB,
NotMemoryFoldable;
} // SchedRW
@@ -1714,189 +1774,180 @@ def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
let mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteALULd] in {
def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
- [], IIC_BT_MR
- >, OpSize16, TB, NotMemoryFoldable;
+ []>, OpSize16, TB, NotMemoryFoldable;
def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
"bt{l}\t{$src2, $src1|$src1, $src2}",
- [], IIC_BT_MR
- >, OpSize32, TB, NotMemoryFoldable;
+ []>, OpSize32, TB, NotMemoryFoldable;
def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
- [], IIC_BT_MR
- >, TB, NotMemoryFoldable;
+ []>, TB, NotMemoryFoldable;
}
let SchedRW = [WriteALU] in {
def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))],
- IIC_BT_RI>, OpSize16, TB;
+ [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))]>,
+ OpSize16, TB;
def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32i8imm:$src2),
"bt{l}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))],
- IIC_BT_RI>, OpSize32, TB;
+ [(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))]>,
+ OpSize32, TB;
def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))],
- IIC_BT_RI>, TB;
+ [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))]>, TB;
} // SchedRW
// Note that these instructions aren't slow because that only applies when the
// other operand is in a register. When it's an immediate, bt is still fast.
let SchedRW = [WriteALU] in {
def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
- "bt{w}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt (loadi16 addr:$src1), i16immSExt8:$src2))
- ], IIC_BT_MI>, OpSize16, TB;
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt (loadi16 addr:$src1),
+ i16immSExt8:$src2))]>,
+ OpSize16, TB;
def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
- "bt{l}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt (loadi32 addr:$src1), i32immSExt8:$src2))
- ], IIC_BT_MI>, OpSize32, TB;
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt (loadi32 addr:$src1),
+ i32immSExt8:$src2))]>,
+ OpSize32, TB;
def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt (loadi64 addr:$src1),
- i64immSExt8:$src2))], IIC_BT_MI>, TB,
+ i64immSExt8:$src2))]>, TB,
Requires<[In64BitMode]>;
} // SchedRW
let hasSideEffects = 0 in {
let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in {
def BTC16rr : I<0xBB, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
- "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ "btc{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB, NotMemoryFoldable;
def BTC32rr : I<0xBB, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
- "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ "btc{l}\t{$src2, $src1|$src1, $src2}", []>,
OpSize32, TB, NotMemoryFoldable;
def BTC64rr : RI<0xBB, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
- "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB,
+ "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
NotMemoryFoldable;
} // SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
def BTC16mr : I<0xBB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
- "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ "btc{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB, NotMemoryFoldable;
def BTC32mr : I<0xBB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
- "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ "btc{l}\t{$src2, $src1|$src1, $src2}", []>,
OpSize32, TB, NotMemoryFoldable;
def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
- "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB,
+ "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
NotMemoryFoldable;
}
let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in {
def BTC16ri8 : Ii8<0xBA, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
- "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
- OpSize16, TB;
+ "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB;
def BTC32ri8 : Ii8<0xBA, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
- "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
- OpSize32, TB;
+ "btc{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB;
def BTC64ri8 : RIi8<0xBA, MRM7r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
- "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+ "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
} // SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
- "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
- OpSize16, TB;
+ "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB;
def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
- "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
- OpSize32, TB;
+ "btc{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB;
def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
- "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB,
+ "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
Requires<[In64BitMode]>;
}
let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in {
def BTR16rr : I<0xB3, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
- "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ "btr{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB, NotMemoryFoldable;
def BTR32rr : I<0xB3, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
- "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ "btr{l}\t{$src2, $src1|$src1, $src2}", []>,
OpSize32, TB, NotMemoryFoldable;
def BTR64rr : RI<0xB3, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
- "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB, NotMemoryFoldable;
+ "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
+ NotMemoryFoldable;
} // SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
def BTR16mr : I<0xB3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
- "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ "btr{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB, NotMemoryFoldable;
def BTR32mr : I<0xB3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
- "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ "btr{l}\t{$src2, $src1|$src1, $src2}", []>,
OpSize32, TB, NotMemoryFoldable;
def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
- "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB,
+ "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
NotMemoryFoldable;
}
let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in {
def BTR16ri8 : Ii8<0xBA, MRM6r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
- "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+ "btr{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB;
def BTR32ri8 : Ii8<0xBA, MRM6r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
- "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+ "btr{l}\t{$src2, $src1|$src1, $src2}", []>,
OpSize32, TB;
def BTR64ri8 : RIi8<0xBA, MRM6r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
- "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+ "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
} // SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
- "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+ "btr{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB;
def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
- "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+ "btr{l}\t{$src2, $src1|$src1, $src2}", []>,
OpSize32, TB;
def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
- "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB,
+ "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
Requires<[In64BitMode]>;
}
let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in {
def BTS16rr : I<0xAB, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
- "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ "bts{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB, NotMemoryFoldable;
def BTS32rr : I<0xAB, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
- "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ "bts{l}\t{$src2, $src1|$src1, $src2}", []>,
OpSize32, TB, NotMemoryFoldable;
def BTS64rr : RI<0xAB, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
- "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB,
+ "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
NotMemoryFoldable;
} // SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
def BTS16mr : I<0xAB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
- "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ "bts{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB, NotMemoryFoldable;
def BTS32mr : I<0xAB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
- "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ "bts{l}\t{$src2, $src1|$src1, $src2}", []>,
OpSize32, TB, NotMemoryFoldable;
def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
- "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB,
+ "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
NotMemoryFoldable;
}
let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in {
def BTS16ri8 : Ii8<0xBA, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
- "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
- OpSize16, TB;
+ "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB;
def BTS32ri8 : Ii8<0xBA, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
- "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
- OpSize32, TB;
+ "bts{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB;
def BTS64ri8 : RIi8<0xBA, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
- "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+ "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
} // SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
- "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
- OpSize16, TB;
+ "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB;
def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
- "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
- OpSize32, TB;
+ "bts{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB;
def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
- "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB,
+ "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
Requires<[In64BitMode]>;
}
} // hasSideEffects = 0
@@ -1909,143 +1960,154 @@ def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
// Atomic swap. These are just normal xchg instructions. But since a memory
// operand is referenced, the atomicity is ensured.
-multiclass ATOMIC_SWAP<bits<8> opc8, bits<8> opc, string mnemonic, string frag,
- InstrItinClass itin> {
+multiclass ATOMIC_SWAP<bits<8> opc8, bits<8> opc, string mnemonic, string frag> {
let Constraints = "$val = $dst", SchedRW = [WriteALULd, WriteRMW] in {
def NAME#8rm : I<opc8, MRMSrcMem, (outs GR8:$dst),
(ins GR8:$val, i8mem:$ptr),
!strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
[(set
GR8:$dst,
- (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))],
- itin>;
+ (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))]>;
def NAME#16rm : I<opc, MRMSrcMem, (outs GR16:$dst),
(ins GR16:$val, i16mem:$ptr),
!strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
[(set
GR16:$dst,
- (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
- itin>, OpSize16;
+ (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))]>,
+ OpSize16;
def NAME#32rm : I<opc, MRMSrcMem, (outs GR32:$dst),
(ins GR32:$val, i32mem:$ptr),
!strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
[(set
GR32:$dst,
- (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
- itin>, OpSize32;
+ (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))]>,
+ OpSize32;
def NAME#64rm : RI<opc, MRMSrcMem, (outs GR64:$dst),
(ins GR64:$val, i64mem:$ptr),
!strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
[(set
GR64:$dst,
- (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))],
- itin>;
+ (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))]>;
}
}
-defm XCHG : ATOMIC_SWAP<0x86, 0x87, "xchg", "atomic_swap", IIC_XCHG_MEM>;
+defm XCHG : ATOMIC_SWAP<0x86, 0x87, "xchg", "atomic_swap">, NotMemoryFoldable;
// Swap between registers.
let SchedRW = [WriteALU] in {
-let Constraints = "$val = $dst" in {
-def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst), (ins GR8:$val, GR8:$src),
- "xchg{b}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
-def XCHG16rr : I<0x87, MRMSrcReg, (outs GR16:$dst), (ins GR16:$val, GR16:$src),
- "xchg{w}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>,
- OpSize16;
-def XCHG32rr : I<0x87, MRMSrcReg, (outs GR32:$dst), (ins GR32:$val, GR32:$src),
- "xchg{l}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>,
- OpSize32;
-def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst), (ins GR64:$val,GR64:$src),
- "xchg{q}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
+let Constraints = "$src1 = $dst1, $src2 = $dst2", hasSideEffects = 0 in {
+def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst1, GR8:$dst2),
+ (ins GR8:$src1, GR8:$src2),
+ "xchg{b}\t{$src2, $src1|$src1, $src2}", []>, NotMemoryFoldable;
+def XCHG16rr : I<0x87, MRMSrcReg, (outs GR16:$dst1, GR16:$dst2),
+ (ins GR16:$src1, GR16:$src2),
+ "xchg{w}\t{$src2, $src1|$src1, $src2}", []>,
+ OpSize16, NotMemoryFoldable;
+def XCHG32rr : I<0x87, MRMSrcReg, (outs GR32:$dst1, GR32:$dst2),
+ (ins GR32:$src1, GR32:$src2),
+ "xchg{l}\t{$src2, $src1|$src1, $src2}", []>,
+ OpSize32, NotMemoryFoldable;
+def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst1, GR64:$dst2),
+                  (ins GR64:$src1, GR64:$src2),
+ "xchg{q}\t{$src2, $src1|$src1, $src2}", []>, NotMemoryFoldable;
}
// Swap between EAX and other registers.
+let Constraints = "$src = $dst", hasSideEffects = 0 in {
let Uses = [AX], Defs = [AX] in
-def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src),
- "xchg{w}\t{$src, %ax|ax, $src}", [], IIC_XCHG_REG>, OpSize16;
+def XCHG16ar : I<0x90, AddRegFrm, (outs GR16:$dst), (ins GR16:$src),
+ "xchg{w}\t{$src, %ax|ax, $src}", []>, OpSize16;
let Uses = [EAX], Defs = [EAX] in
-def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src),
- "xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>,
- OpSize32, Requires<[Not64BitMode]>;
-let Uses = [EAX], Defs = [EAX] in
-// Uses GR32_NOAX in 64-bit mode to prevent encoding using the 0x90 NOP encoding.
-// xchg %eax, %eax needs to clear upper 32-bits of RAX so is not a NOP.
-def XCHG32ar64 : I<0x90, AddRegFrm, (outs), (ins GR32_NOAX:$src),
- "xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>,
- OpSize32, Requires<[In64BitMode]>;
+def XCHG32ar : I<0x90, AddRegFrm, (outs GR32:$dst), (ins GR32:$src),
+ "xchg{l}\t{$src, %eax|eax, $src}", []>, OpSize32;
let Uses = [RAX], Defs = [RAX] in
-def XCHG64ar : RI<0x90, AddRegFrm, (outs), (ins GR64:$src),
- "xchg{q}\t{$src, %rax|rax, $src}", [], IIC_XCHG_REG>;
+def XCHG64ar : RI<0x90, AddRegFrm, (outs GR64:$dst), (ins GR64:$src),
+ "xchg{q}\t{$src, %rax|rax, $src}", []>;
+}
} // SchedRW
-let SchedRW = [WriteALU] in {
-def XADD8rr : I<0xC0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src),
- "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB;
-def XADD16rr : I<0xC1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
- "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB,
- OpSize16;
-def XADD32rr : I<0xC1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
- "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB,
- OpSize32;
-def XADD64rr : RI<0xC1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
- "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB;
+let hasSideEffects = 0, Constraints = "$src1 = $dst1, $src2 = $dst2",
+ Defs = [EFLAGS], SchedRW = [WriteALU] in {
+def XADD8rr : I<0xC0, MRMDestReg, (outs GR8:$dst1, GR8:$dst2),
+ (ins GR8:$src1, GR8:$src2),
+ "xadd{b}\t{$src2, $src1|$src1, $src2}", []>, TB;
+def XADD16rr : I<0xC1, MRMDestReg, (outs GR16:$dst1, GR16:$dst2),
+ (ins GR16:$src1, GR16:$src2),
+ "xadd{w}\t{$src2, $src1|$src1, $src2}", []>, TB, OpSize16;
+def XADD32rr : I<0xC1, MRMDestReg, (outs GR32:$dst1, GR32:$dst2),
+ (ins GR32:$src1, GR32:$src2),
+ "xadd{l}\t{$src2, $src1|$src1, $src2}", []>, TB, OpSize32;
+def XADD64rr : RI<0xC1, MRMDestReg, (outs GR64:$dst1, GR64:$dst2),
+ (ins GR64:$src1, GR64:$src2),
+ "xadd{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
} // SchedRW
-let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
-def XADD8rm : I<0xC0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
- "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB;
-def XADD16rm : I<0xC1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
- "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB,
+let mayLoad = 1, mayStore = 1, hasSideEffects = 0, Constraints = "$val = $dst",
+ Defs = [EFLAGS], SchedRW = [WriteALULd, WriteRMW] in {
+def XADD8rm : I<0xC0, MRMSrcMem, (outs GR8:$dst),
+ (ins GR8:$val, i8mem:$ptr),
+ "xadd{b}\t{$val, $ptr|$ptr, $val}", []>, TB;
+def XADD16rm : I<0xC1, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$val, i16mem:$ptr),
+ "xadd{w}\t{$val, $ptr|$ptr, $val}", []>, TB,
OpSize16;
-def XADD32rm : I<0xC1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
- "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB,
+def XADD32rm : I<0xC1, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$val, i32mem:$ptr),
+ "xadd{l}\t{$val, $ptr|$ptr, $val}", []>, TB,
OpSize32;
-def XADD64rm : RI<0xC1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
- "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB;
+def XADD64rm : RI<0xC1, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$val, i64mem:$ptr),
+ "xadd{q}\t{$val, $ptr|$ptr, $val}", []>, TB;
}
-let SchedRW = [WriteALU] in {
+let SchedRW = [WriteALU], hasSideEffects = 0 in {
+let Defs = [AL, EFLAGS], Uses = [AL] in
def CMPXCHG8rr : I<0xB0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src),
- "cmpxchg{b}\t{$src, $dst|$dst, $src}", [],
- IIC_CMPXCHG_REG8>, TB;
+ "cmpxchg{b}\t{$src, $dst|$dst, $src}", []>, TB,
+ NotMemoryFoldable;
+let Defs = [AX, EFLAGS], Uses = [AX] in
def CMPXCHG16rr : I<0xB1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
- "cmpxchg{w}\t{$src, $dst|$dst, $src}", [],
- IIC_CMPXCHG_REG>, TB, OpSize16;
+ "cmpxchg{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize16,
+ NotMemoryFoldable;
+let Defs = [EAX, EFLAGS], Uses = [EAX] in
def CMPXCHG32rr : I<0xB1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
- "cmpxchg{l}\t{$src, $dst|$dst, $src}", [],
- IIC_CMPXCHG_REG>, TB, OpSize32;
+ "cmpxchg{l}\t{$src, $dst|$dst, $src}", []>, TB, OpSize32,
+ NotMemoryFoldable;
+let Defs = [RAX, EFLAGS], Uses = [RAX] in
def CMPXCHG64rr : RI<0xB1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
- "cmpxchg{q}\t{$src, $dst|$dst, $src}", [],
- IIC_CMPXCHG_REG>, TB;
-} // SchedRW
+ "cmpxchg{q}\t{$src, $dst|$dst, $src}", []>, TB,
+ NotMemoryFoldable;
+} // SchedRW, hasSideEffects
-let SchedRW = [WriteALULd, WriteRMW] in {
-let mayLoad = 1, mayStore = 1 in {
+let SchedRW = [WriteALULd, WriteRMW], mayLoad = 1, mayStore = 1,
+ hasSideEffects = 0 in {
+let Defs = [AL, EFLAGS], Uses = [AL] in
def CMPXCHG8rm : I<0xB0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
- "cmpxchg{b}\t{$src, $dst|$dst, $src}", [],
- IIC_CMPXCHG_MEM8>, TB;
+ "cmpxchg{b}\t{$src, $dst|$dst, $src}", []>, TB,
+ NotMemoryFoldable;
+let Defs = [AX, EFLAGS], Uses = [AX] in
def CMPXCHG16rm : I<0xB1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
- "cmpxchg{w}\t{$src, $dst|$dst, $src}", [],
- IIC_CMPXCHG_MEM>, TB, OpSize16;
+ "cmpxchg{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize16,
+ NotMemoryFoldable;
+let Defs = [EAX, EFLAGS], Uses = [EAX] in
def CMPXCHG32rm : I<0xB1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
- "cmpxchg{l}\t{$src, $dst|$dst, $src}", [],
- IIC_CMPXCHG_MEM>, TB, OpSize32;
+ "cmpxchg{l}\t{$src, $dst|$dst, $src}", []>, TB, OpSize32,
+ NotMemoryFoldable;
+let Defs = [RAX, EFLAGS], Uses = [RAX] in
def CMPXCHG64rm : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
- "cmpxchg{q}\t{$src, $dst|$dst, $src}", [],
- IIC_CMPXCHG_MEM>, TB;
-}
+ "cmpxchg{q}\t{$src, $dst|$dst, $src}", []>, TB,
+ NotMemoryFoldable;
let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in
def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst),
- "cmpxchg8b\t$dst", [], IIC_CMPXCHG_8B>, TB;
+ "cmpxchg8b\t$dst", []>, TB;
let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in
def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst),
- "cmpxchg16b\t$dst", [], IIC_CMPXCHG_16B>,
+ "cmpxchg16b\t$dst", []>,
TB, Requires<[HasCmpxchg16b, In64BitMode]>;
-} // SchedRW
+} // SchedRW, mayLoad, mayStore, hasSideEffects
// Lock instruction prefix
@@ -2055,21 +2117,15 @@ def LOCK_PREFIX : I<0xF0, RawFrm, (outs), (ins), "lock", []>;
let SchedRW = [WriteNop] in {
// Rex64 instruction prefix
-def REX64_PREFIX : I<0x48, RawFrm, (outs), (ins), "rex64", [], IIC_NOP>,
+def REX64_PREFIX : I<0x48, RawFrm, (outs), (ins), "rex64", []>,
Requires<[In64BitMode]>;
// Data16 instruction prefix
-def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", [], IIC_NOP>,
- Requires<[Not16BitMode]>;
-
-// Data instruction prefix
-def DATA32_PREFIX : I<0x66, RawFrm, (outs), (ins), "data32", [], IIC_NOP>,
- Requires<[In16BitMode]>;
+def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", []>;
} // SchedRW
// Repeat string operation instruction prefixes
-// These use the DF flag in the EFLAGS register to inc or dec ECX
-let Defs = [ECX], Uses = [ECX,EFLAGS], SchedRW = [WriteMicrocoded] in {
+let Defs = [ECX], Uses = [ECX,DF], SchedRW = [WriteMicrocoded] in {
// Repeat (used with INS, OUTS, MOVS, LODS and STOS)
def REP_PREFIX : I<0xF3, RawFrm, (outs), (ins), "rep", []>;
// Repeat while not equal (used with CMPS and SCAS)
@@ -2078,110 +2134,108 @@ def REPNE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "repne", []>;
// String manipulation instructions
let SchedRW = [WriteMicrocoded] in {
-// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
-let Defs = [AL,ESI], Uses = [ESI,EFLAGS] in
+let Defs = [AL,ESI], Uses = [ESI,DF] in
def LODSB : I<0xAC, RawFrmSrc, (outs), (ins srcidx8:$src),
- "lodsb\t{$src, %al|al, $src}", [], IIC_LODS>;
-let Defs = [AX,ESI], Uses = [ESI,EFLAGS] in
+ "lodsb\t{$src, %al|al, $src}", []>;
+let Defs = [AX,ESI], Uses = [ESI,DF] in
def LODSW : I<0xAD, RawFrmSrc, (outs), (ins srcidx16:$src),
- "lodsw\t{$src, %ax|ax, $src}", [], IIC_LODS>, OpSize16;
-let Defs = [EAX,ESI], Uses = [ESI,EFLAGS] in
+ "lodsw\t{$src, %ax|ax, $src}", []>, OpSize16;
+let Defs = [EAX,ESI], Uses = [ESI,DF] in
def LODSL : I<0xAD, RawFrmSrc, (outs), (ins srcidx32:$src),
- "lods{l|d}\t{$src, %eax|eax, $src}", [], IIC_LODS>, OpSize32;
-let Defs = [RAX,ESI], Uses = [ESI,EFLAGS] in
+ "lods{l|d}\t{$src, %eax|eax, $src}", []>, OpSize32;
+let Defs = [RAX,ESI], Uses = [ESI,DF] in
def LODSQ : RI<0xAD, RawFrmSrc, (outs), (ins srcidx64:$src),
- "lodsq\t{$src, %rax|rax, $src}", [], IIC_LODS>;
+ "lodsq\t{$src, %rax|rax, $src}", []>,
+ Requires<[In64BitMode]>;
}
let SchedRW = [WriteSystem] in {
-// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
-let Defs = [ESI], Uses = [DX,ESI,EFLAGS] in {
+let Defs = [ESI], Uses = [DX,ESI,DF] in {
def OUTSB : I<0x6E, RawFrmSrc, (outs), (ins srcidx8:$src),
- "outsb\t{$src, %dx|dx, $src}", [], IIC_OUTS>;
+ "outsb\t{$src, %dx|dx, $src}", []>;
def OUTSW : I<0x6F, RawFrmSrc, (outs), (ins srcidx16:$src),
- "outsw\t{$src, %dx|dx, $src}", [], IIC_OUTS>, OpSize16;
+ "outsw\t{$src, %dx|dx, $src}", []>, OpSize16;
def OUTSL : I<0x6F, RawFrmSrc, (outs), (ins srcidx32:$src),
- "outs{l|d}\t{$src, %dx|dx, $src}", [], IIC_OUTS>, OpSize32;
+ "outs{l|d}\t{$src, %dx|dx, $src}", []>, OpSize32;
}
-// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
-let Defs = [EDI], Uses = [DX,EDI,EFLAGS] in {
+let Defs = [EDI], Uses = [DX,EDI,DF] in {
def INSB : I<0x6C, RawFrmDst, (outs), (ins dstidx8:$dst),
- "insb\t{%dx, $dst|$dst, dx}", [], IIC_INS>;
+ "insb\t{%dx, $dst|$dst, dx}", []>;
def INSW : I<0x6D, RawFrmDst, (outs), (ins dstidx16:$dst),
- "insw\t{%dx, $dst|$dst, dx}", [], IIC_INS>, OpSize16;
+ "insw\t{%dx, $dst|$dst, dx}", []>, OpSize16;
def INSL : I<0x6D, RawFrmDst, (outs), (ins dstidx32:$dst),
- "ins{l|d}\t{%dx, $dst|$dst, dx}", [], IIC_INS>, OpSize32;
+ "ins{l|d}\t{%dx, $dst|$dst, dx}", []>, OpSize32;
}
}
-// Flag instructions
-let SchedRW = [WriteALU] in {
-def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", [], IIC_CLC>;
-def STC : I<0xF9, RawFrm, (outs), (ins), "stc", [], IIC_STC>;
-def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", [], IIC_CLI>;
-def STI : I<0xFB, RawFrm, (outs), (ins), "sti", [], IIC_STI>;
-def CLD : I<0xFC, RawFrm, (outs), (ins), "cld", [], IIC_CLD>;
-def STD : I<0xFD, RawFrm, (outs), (ins), "std", [], IIC_STD>;
-def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", [], IIC_CMC>;
+// EFLAGS management instructions.
+let SchedRW = [WriteALU], Defs = [EFLAGS], Uses = [EFLAGS] in {
+def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", []>;
+def STC : I<0xF9, RawFrm, (outs), (ins), "stc", []>;
+def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", []>;
+}
-def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", [], IIC_CLTS>, TB;
+// DF management instructions.
+let SchedRW = [WriteALU], Defs = [DF] in {
+def CLD : I<0xFC, RawFrm, (outs), (ins), "cld", []>;
+def STD : I<0xFD, RawFrm, (outs), (ins), "std", []>;
}
// Table lookup instructions
let Uses = [AL,EBX], Defs = [AL], hasSideEffects = 0, mayLoad = 1 in
-def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", [], IIC_XLAT>,
- Sched<[WriteLoad]>;
+def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", []>, Sched<[WriteLoad]>;
let SchedRW = [WriteMicrocoded] in {
// ASCII Adjust After Addition
let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in
-def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", [], IIC_AAA>,
+def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", []>,
Requires<[Not64BitMode]>;
// ASCII Adjust AX Before Division
let Uses = [AX], Defs = [AX,EFLAGS], hasSideEffects = 0 in
def AAD8i8 : Ii8<0xD5, RawFrm, (outs), (ins i8imm:$src),
- "aad\t$src", [], IIC_AAD>, Requires<[Not64BitMode]>;
+ "aad\t$src", []>, Requires<[Not64BitMode]>;
// ASCII Adjust AX After Multiply
let Uses = [AL], Defs = [AX,EFLAGS], hasSideEffects = 0 in
def AAM8i8 : Ii8<0xD4, RawFrm, (outs), (ins i8imm:$src),
- "aam\t$src", [], IIC_AAM>, Requires<[Not64BitMode]>;
+ "aam\t$src", []>, Requires<[Not64BitMode]>;
// ASCII Adjust AL After Subtraction - sets
let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in
-def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", [], IIC_AAS>,
+def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", []>,
Requires<[Not64BitMode]>;
// Decimal Adjust AL after Addition
let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in
-def DAA : I<0x27, RawFrm, (outs), (ins), "daa", [], IIC_DAA>,
+def DAA : I<0x27, RawFrm, (outs), (ins), "daa", []>,
Requires<[Not64BitMode]>;
// Decimal Adjust AL after Subtraction
let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in
-def DAS : I<0x2F, RawFrm, (outs), (ins), "das", [], IIC_DAS>,
+def DAS : I<0x2F, RawFrm, (outs), (ins), "das", []>,
Requires<[Not64BitMode]>;
} // SchedRW
let SchedRW = [WriteSystem] in {
// Check Array Index Against Bounds
+// Note: "bound" does not have reversed operands in at&t syntax.
def BOUNDS16rm : I<0x62, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
- "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, OpSize16,
+ "bound\t$dst, $src", []>, OpSize16,
Requires<[Not64BitMode]>;
def BOUNDS32rm : I<0x62, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
- "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, OpSize32,
+ "bound\t$dst, $src", []>, OpSize32,
Requires<[Not64BitMode]>;
// Adjust RPL Field of Segment Selector
def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
- "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_REG>,
- Requires<[Not64BitMode]>;
+ "arpl\t{$src, $dst|$dst, $src}", []>,
+ Requires<[Not64BitMode]>, NotMemoryFoldable;
let mayStore = 1 in
def ARPL16mr : I<0x63, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
- "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_MEM>,
- Requires<[Not64BitMode]>;
+ "arpl\t{$src, $dst|$dst, $src}", []>,
+ Requires<[Not64BitMode]>, NotMemoryFoldable;
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -2191,29 +2245,29 @@ let Predicates = [HasMOVBE] in {
let SchedRW = [WriteALULd] in {
def MOVBE16rm : I<0xF0, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"movbe{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, (bswap (loadi16 addr:$src)))], IIC_MOVBE>,
+ [(set GR16:$dst, (bswap (loadi16 addr:$src)))]>,
OpSize16, T8PS;
def MOVBE32rm : I<0xF0, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"movbe{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (bswap (loadi32 addr:$src)))], IIC_MOVBE>,
+ [(set GR32:$dst, (bswap (loadi32 addr:$src)))]>,
OpSize32, T8PS;
def MOVBE64rm : RI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"movbe{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (bswap (loadi64 addr:$src)))], IIC_MOVBE>,
+ [(set GR64:$dst, (bswap (loadi64 addr:$src)))]>,
T8PS;
}
let SchedRW = [WriteStore] in {
def MOVBE16mr : I<0xF1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
"movbe{w}\t{$src, $dst|$dst, $src}",
- [(store (bswap GR16:$src), addr:$dst)], IIC_MOVBE>,
+ [(store (bswap GR16:$src), addr:$dst)]>,
OpSize16, T8PS;
def MOVBE32mr : I<0xF1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"movbe{l}\t{$src, $dst|$dst, $src}",
- [(store (bswap GR32:$src), addr:$dst)], IIC_MOVBE>,
+ [(store (bswap GR32:$src), addr:$dst)]>,
OpSize32, T8PS;
def MOVBE64mr : RI<0xF1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"movbe{q}\t{$src, $dst|$dst, $src}",
- [(store (bswap GR64:$src), addr:$dst)], IIC_MOVBE>,
+ [(store (bswap GR64:$src), addr:$dst)]>,
T8PS;
}
}
@@ -2223,33 +2277,26 @@ let Predicates = [HasMOVBE] in {
//
let Predicates = [HasRDRAND], Defs = [EFLAGS], SchedRW = [WriteSystem] in {
def RDRAND16r : I<0xC7, MRM6r, (outs GR16:$dst), (ins),
- "rdrand{w}\t$dst",
- [(set GR16:$dst, EFLAGS, (X86rdrand))], IIC_RDRAND>,
+ "rdrand{w}\t$dst", [(set GR16:$dst, EFLAGS, (X86rdrand))]>,
OpSize16, PS;
def RDRAND32r : I<0xC7, MRM6r, (outs GR32:$dst), (ins),
- "rdrand{l}\t$dst",
- [(set GR32:$dst, EFLAGS, (X86rdrand))], IIC_RDRAND>,
+ "rdrand{l}\t$dst", [(set GR32:$dst, EFLAGS, (X86rdrand))]>,
OpSize32, PS;
def RDRAND64r : RI<0xC7, MRM6r, (outs GR64:$dst), (ins),
- "rdrand{q}\t$dst",
- [(set GR64:$dst, EFLAGS, (X86rdrand))], IIC_RDRAND>, PS;
+ "rdrand{q}\t$dst", [(set GR64:$dst, EFLAGS, (X86rdrand))]>,
+ PS;
}
//===----------------------------------------------------------------------===//
// RDSEED Instruction
//
let Predicates = [HasRDSEED], Defs = [EFLAGS], SchedRW = [WriteSystem] in {
- def RDSEED16r : I<0xC7, MRM7r, (outs GR16:$dst), (ins),
- "rdseed{w}\t$dst",
- [(set GR16:$dst, EFLAGS, (X86rdseed))], IIC_RDSEED>,
- OpSize16, PS;
- def RDSEED32r : I<0xC7, MRM7r, (outs GR32:$dst), (ins),
- "rdseed{l}\t$dst",
- [(set GR32:$dst, EFLAGS, (X86rdseed))], IIC_RDSEED>,
- OpSize32, PS;
- def RDSEED64r : RI<0xC7, MRM7r, (outs GR64:$dst), (ins),
- "rdseed{q}\t$dst",
- [(set GR64:$dst, EFLAGS, (X86rdseed))], IIC_RDSEED>, PS;
+ def RDSEED16r : I<0xC7, MRM7r, (outs GR16:$dst), (ins), "rdseed{w}\t$dst",
+ [(set GR16:$dst, EFLAGS, (X86rdseed))]>, OpSize16, PS;
+ def RDSEED32r : I<0xC7, MRM7r, (outs GR32:$dst), (ins), "rdseed{l}\t$dst",
+ [(set GR32:$dst, EFLAGS, (X86rdseed))]>, OpSize32, PS;
+ def RDSEED64r : RI<0xC7, MRM7r, (outs GR64:$dst), (ins), "rdseed{q}\t$dst",
+ [(set GR64:$dst, EFLAGS, (X86rdseed))]>, PS;
}
//===----------------------------------------------------------------------===//
@@ -2258,33 +2305,30 @@ let Predicates = [HasRDSEED], Defs = [EFLAGS], SchedRW = [WriteSystem] in {
let Predicates = [HasLZCNT], Defs = [EFLAGS] in {
def LZCNT16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"lzcnt{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, (ctlz GR16:$src)), (implicit EFLAGS)],
- IIC_LZCNT_RR>, XS, OpSize16, Sched<[WriteIMul]>;
+ [(set GR16:$dst, (ctlz GR16:$src)), (implicit EFLAGS)]>,
+ XS, OpSize16, Sched<[WriteLZCNT]>;
def LZCNT16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"lzcnt{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, (ctlz (loadi16 addr:$src))),
- (implicit EFLAGS)], IIC_LZCNT_RM>, XS, OpSize16,
- Sched<[WriteIMulLd]>;
+ (implicit EFLAGS)]>, XS, OpSize16, Sched<[WriteLZCNTLd]>;
def LZCNT32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"lzcnt{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (ctlz GR32:$src)), (implicit EFLAGS)],
- IIC_LZCNT_RR>, XS, OpSize32, Sched<[WriteIMul]>;
+ [(set GR32:$dst, (ctlz GR32:$src)), (implicit EFLAGS)]>,
+ XS, OpSize32, Sched<[WriteLZCNT]>;
def LZCNT32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"lzcnt{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (ctlz (loadi32 addr:$src))),
- (implicit EFLAGS)], IIC_LZCNT_RM>, XS, OpSize32,
- Sched<[WriteIMulLd]>;
+ (implicit EFLAGS)]>, XS, OpSize32, Sched<[WriteLZCNTLd]>;
def LZCNT64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"lzcnt{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (ctlz GR64:$src)), (implicit EFLAGS)],
- IIC_LZCNT_RR>, XS, Sched<[WriteIMul]>;
+ [(set GR64:$dst, (ctlz GR64:$src)), (implicit EFLAGS)]>,
+ XS, Sched<[WriteLZCNT]>;
def LZCNT64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"lzcnt{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (ctlz (loadi64 addr:$src))),
- (implicit EFLAGS)], IIC_LZCNT_RM>, XS,
- Sched<[WriteIMulLd]>;
+ (implicit EFLAGS)]>, XS, Sched<[WriteLZCNTLd]>;
}
//===----------------------------------------------------------------------===//
@@ -2293,45 +2337,42 @@ let Predicates = [HasLZCNT], Defs = [EFLAGS] in {
let Predicates = [HasBMI], Defs = [EFLAGS] in {
def TZCNT16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"tzcnt{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, (cttz GR16:$src)), (implicit EFLAGS)],
- IIC_TZCNT_RR>, XS, OpSize16, Sched<[WriteIMul]>;
+ [(set GR16:$dst, (cttz GR16:$src)), (implicit EFLAGS)]>,
+ XS, OpSize16, Sched<[WriteTZCNT]>;
def TZCNT16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"tzcnt{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, (cttz (loadi16 addr:$src))),
- (implicit EFLAGS)], IIC_TZCNT_RM>, XS, OpSize16,
- Sched<[WriteIMulLd]>;
+ (implicit EFLAGS)]>, XS, OpSize16, Sched<[WriteTZCNTLd]>;
def TZCNT32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"tzcnt{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (cttz GR32:$src)), (implicit EFLAGS)],
- IIC_TZCNT_RR>, XS, OpSize32, Sched<[WriteIMul]>;
+ [(set GR32:$dst, (cttz GR32:$src)), (implicit EFLAGS)]>,
+ XS, OpSize32, Sched<[WriteTZCNT]>;
def TZCNT32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"tzcnt{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (cttz (loadi32 addr:$src))),
- (implicit EFLAGS)], IIC_TZCNT_RM>, XS, OpSize32,
- Sched<[WriteIMulLd]>;
+ (implicit EFLAGS)]>, XS, OpSize32, Sched<[WriteTZCNTLd]>;
def TZCNT64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"tzcnt{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (cttz GR64:$src)), (implicit EFLAGS)],
- IIC_TZCNT_RR>, XS, Sched<[WriteIMul]>;
+ [(set GR64:$dst, (cttz GR64:$src)), (implicit EFLAGS)]>,
+ XS, Sched<[WriteTZCNT]>;
def TZCNT64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"tzcnt{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (cttz (loadi64 addr:$src))),
- (implicit EFLAGS)], IIC_TZCNT_RM>, XS,
- Sched<[WriteIMulLd]>;
+ (implicit EFLAGS)]>, XS, Sched<[WriteTZCNTLd]>;
}
multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM,
RegisterClass RC, X86MemOperand x86memop> {
let hasSideEffects = 0 in {
def rr : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src),
- !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"),
- [], IIC_UNARY_REG>, T8PS, VEX_4V, Sched<[WriteALU]>;
+ !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>,
+ T8PS, VEX_4V, Sched<[WriteALU]>;
let mayLoad = 1 in
def rm : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src),
- !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"),
- [], IIC_UNARY_MEM>, T8PS, VEX_4V, Sched<[WriteALULd, ReadAfterLd]>;
+ !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>,
+ T8PS, VEX_4V, Sched<[WriteALULd]>;
}
}
@@ -2366,32 +2407,56 @@ let Predicates = [HasBMI] in {
(BLSI64rr GR64:$src)>;
}
-multiclass bmi_bextr_bzhi<bits<8> opc, string mnemonic, RegisterClass RC,
- X86MemOperand x86memop, Intrinsic Int,
- PatFrag ld_frag> {
+multiclass bmi_bextr<bits<8> opc, string mnemonic, RegisterClass RC,
+ X86MemOperand x86memop, SDNode OpNode,
+ PatFrag ld_frag, X86FoldableSchedWrite Sched> {
def rr : I<opc, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (Int RC:$src1, RC:$src2)), (implicit EFLAGS)], IIC_BIN_NONMEM>,
- T8PS, VEX, Sched<[WriteALU]>;
+ [(set RC:$dst, (OpNode RC:$src1, RC:$src2)), (implicit EFLAGS)]>,
+ T8PS, VEX, Sched<[Sched]>;
def rm : I<opc, MRMSrcMem4VOp3, (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (Int (ld_frag addr:$src1), RC:$src2)),
- (implicit EFLAGS)], IIC_BIN_MEM>, T8PS, VEX,
- Sched<[WriteALULd, ReadAfterLd]>;
+ [(set RC:$dst, (OpNode (ld_frag addr:$src1), RC:$src2)),
+ (implicit EFLAGS)]>, T8PS, VEX,
+ Sched<[Sched.Folded,
+ // x86memop:$src1
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault,
+ // RC:$src2
+ ReadAfterLd]>;
}
let Predicates = [HasBMI], Defs = [EFLAGS] in {
- defm BEXTR32 : bmi_bextr_bzhi<0xF7, "bextr{l}", GR32, i32mem,
- int_x86_bmi_bextr_32, loadi32>;
- defm BEXTR64 : bmi_bextr_bzhi<0xF7, "bextr{q}", GR64, i64mem,
- int_x86_bmi_bextr_64, loadi64>, VEX_W;
+ defm BEXTR32 : bmi_bextr<0xF7, "bextr{l}", GR32, i32mem,
+ X86bextr, loadi32, WriteBEXTR>;
+ defm BEXTR64 : bmi_bextr<0xF7, "bextr{q}", GR64, i64mem,
+ X86bextr, loadi64, WriteBEXTR>, VEX_W;
+}
+
+multiclass bmi_bzhi<bits<8> opc, string mnemonic, RegisterClass RC,
+ X86MemOperand x86memop, Intrinsic Int,
+ PatFrag ld_frag, X86FoldableSchedWrite Sched> {
+ def rr : I<opc, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (Int RC:$src1, RC:$src2)), (implicit EFLAGS)]>,
+ T8PS, VEX, Sched<[Sched]>;
+ def rm : I<opc, MRMSrcMem4VOp3, (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (Int (ld_frag addr:$src1), RC:$src2)),
+ (implicit EFLAGS)]>, T8PS, VEX,
+ Sched<[Sched.Folded,
+ // x86memop:$src1
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault,
+ // RC:$src2
+ ReadAfterLd]>;
}
let Predicates = [HasBMI2], Defs = [EFLAGS] in {
- defm BZHI32 : bmi_bextr_bzhi<0xF5, "bzhi{l}", GR32, i32mem,
- int_x86_bmi_bzhi_32, loadi32>;
- defm BZHI64 : bmi_bextr_bzhi<0xF5, "bzhi{q}", GR64, i64mem,
- int_x86_bmi_bzhi_64, loadi64>, VEX_W;
+ defm BZHI32 : bmi_bzhi<0xF5, "bzhi{l}", GR32, i32mem,
+ int_x86_bmi_bzhi_32, loadi32, WriteBZHI>;
+ defm BZHI64 : bmi_bzhi<0xF5, "bzhi{q}", GR64, i64mem,
+ int_x86_bmi_bzhi_64, loadi64, WriteBZHI>, VEX_W;
}
def CountTrailingOnes : SDNodeXForm<imm, [{
@@ -2405,7 +2470,7 @@ def BEXTRMaskXForm : SDNodeXForm<imm, [{
}]>;
def AndMask64 : ImmLeaf<i64, [{
- return isMask_64(Imm) && Imm > UINT32_MAX;
+ return isMask_64(Imm) && !isUInt<32>(Imm);
}]>;
// Use BEXTR for 64-bit 'and' with large immediate 'mask'.
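For reference, a minimal C sketch of what the AndMask64 predicate accepts and why a single BEXTR can replace such an 'and' (helper names are illustrative and not part of the patch; BEXTR takes the start bit in the low control byte and the length in the next byte, so extracting popcount(mask) bits from bit 0 reproduces the masked value):

    #include <stdbool.h>
    #include <stdint.h>

    /* AndMask64: a contiguous low-bit mask that does not fit in 32 bits,
       e.g. 0x3FFFFFFFFFFF; such an immediate would otherwise need a
       movabsq before the andq. */
    static bool is_and_mask64(uint64_t imm) {
      bool is_mask  = imm != 0 && ((imm + 1) & imm) == 0; /* isMask_64(Imm)  */
      bool fits_u32 = imm <= UINT32_MAX;                  /* isUInt<32>(Imm) */
      return is_mask && !fits_u32;
    }

    /* x & mask == "extract popcount(mask) low bits of x", i.e. one BEXTR. */
    static uint64_t and_with_mask(uint64_t x, uint64_t mask) {
      unsigned len = (unsigned)__builtin_popcountll(mask);
      return len >= 64 ? x : (x & ((1ULL << len) - 1));
    }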
@@ -2433,21 +2498,49 @@ let Predicates = [HasBMI2, NoTBM] in {
}
let Predicates = [HasBMI2] in {
- def : Pat<(and GR32:$src, (add (shl 1, GR8:$lz), -1)),
- (BZHI32rr GR32:$src,
- (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
-
- def : Pat<(and (loadi32 addr:$src), (add (shl 1, GR8:$lz), -1)),
- (BZHI32rm addr:$src,
- (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+ multiclass _bmi_bzhi_pattern<dag regpattern, dag mempattern, RegisterClass RC,
+ ValueType VT, Instruction DstInst,
+ Instruction DstMemInst> {
+ def : Pat<regpattern,
+ (DstInst RC:$src,
+ (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+ def : Pat<mempattern,
+ (DstMemInst addr:$src,
+ (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+ }
- def : Pat<(and GR64:$src, (add (shl 1, GR8:$lz), -1)),
- (BZHI64rr GR64:$src,
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+ multiclass bmi_bzhi_patterns<RegisterClass RC, int bitwidth, ValueType VT,
+ Instruction DstInst, X86MemOperand x86memop,
+ Instruction DstMemInst> {
+ // x & ((1 << y) - 1)
+ defm : _bmi_bzhi_pattern<(and RC:$src, (add (shl 1, GR8:$lz), -1)),
+ (and (x86memop addr:$src),
+ (add (shl 1, GR8:$lz), -1)),
+ RC, VT, DstInst, DstMemInst>;
+
+ // x & ~(-1 << y)
+ defm : _bmi_bzhi_pattern<(and RC:$src, (xor (shl -1, GR8:$lz), -1)),
+ (and (x86memop addr:$src),
+ (xor (shl -1, GR8:$lz), -1)),
+ RC, VT, DstInst, DstMemInst>;
+
+ // x & (-1 >> (bitwidth - y))
+ defm : _bmi_bzhi_pattern<(and RC:$src, (srl -1, (sub bitwidth, GR8:$lz))),
+ (and (x86memop addr:$src),
+ (srl -1, (sub bitwidth, GR8:$lz))),
+ RC, VT, DstInst, DstMemInst>;
+
+ // x << (bitwidth - y) >> (bitwidth - y)
+ defm : _bmi_bzhi_pattern<(srl (shl RC:$src, (sub bitwidth, GR8:$lz)),
+ (sub bitwidth, GR8:$lz)),
+ (srl (shl (x86memop addr:$src),
+ (sub bitwidth, GR8:$lz)),
+ (sub bitwidth, GR8:$lz)),
+ RC, VT, DstInst, DstMemInst>;
+ }
- def : Pat<(and (loadi64 addr:$src), (add (shl 1, GR8:$lz), -1)),
- (BZHI64rm addr:$src,
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+ defm : bmi_bzhi_patterns<GR32, 32, i32, BZHI32rr, loadi32, BZHI32rm>;
+ defm : bmi_bzhi_patterns<GR64, 64, i64, BZHI64rr, loadi64, BZHI64rm>;
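As a cross-check, the four DAG shapes matched above all compute the same "clear the bits from position y upward" value that one BZHI produces; a minimal C sketch (the helper name and the 0 < y < 32 restriction, which sidesteps C shift undefined behaviour, are illustrative only):

    #include <assert.h>
    #include <stdint.h>

    static uint32_t zero_high_from(uint32_t x, unsigned y) { /* 0 < y < 32 */
      uint32_t a = x & ((1u << y) - 1);         /* x & ((1 << y) - 1)       */
      uint32_t b = x & ~(~0u << y);             /* x & ~(-1 << y)           */
      uint32_t c = x & (~0u >> (32 - y));       /* x & (-1 >> (32 - y))     */
      uint32_t d = (x << (32 - y)) >> (32 - y); /* shift up, then back down */
      assert(a == b && b == c && c == d);
      return a;                                 /* what BZHI32rr computes   */
    }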
// x & (-1 >> (32 - y))
def : Pat<(and GR32:$src, (srl -1, (i8 (trunc (sub 32, GR32:$lz))))),
@@ -2487,12 +2580,12 @@ multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
PatFrag ld_frag> {
def rr : I<0xF5, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (Int RC:$src1, RC:$src2))], IIC_BIN_NONMEM>,
+ [(set RC:$dst, (Int RC:$src1, RC:$src2))]>,
VEX_4V, Sched<[WriteALU]>;
def rm : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))],
- IIC_BIN_MEM>, VEX_4V, Sched<[WriteALULd, ReadAfterLd]>;
+ [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))]>,
+ VEX_4V, Sched<[WriteALULd, ReadAfterLd]>;
}
let Predicates = [HasBMI2] in {
@@ -2511,61 +2604,63 @@ let Predicates = [HasBMI2] in {
//
let Predicates = [HasTBM], Defs = [EFLAGS] in {
-multiclass tbm_ternary_imm_intr<bits<8> opc, RegisterClass RC, string OpcodeStr,
- X86MemOperand x86memop, PatFrag ld_frag,
- Intrinsic Int, Operand immtype,
- SDPatternOperator immoperator> {
+multiclass tbm_ternary_imm<bits<8> opc, RegisterClass RC, string OpcodeStr,
+ X86MemOperand x86memop, PatFrag ld_frag,
+ SDNode OpNode, Operand immtype,
+ SDPatternOperator immoperator,
+ X86FoldableSchedWrite Sched> {
def ri : Ii32<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, immtype:$cntl),
!strconcat(OpcodeStr,
"\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"),
- [(set RC:$dst, (Int RC:$src1, immoperator:$cntl))],
- IIC_BIN_NONMEM>, XOP, XOPA, Sched<[WriteALU]>;
+ [(set RC:$dst, (OpNode RC:$src1, immoperator:$cntl))]>,
+ XOP, XOPA, Sched<[Sched]>;
def mi : Ii32<opc, MRMSrcMem, (outs RC:$dst),
(ins x86memop:$src1, immtype:$cntl),
!strconcat(OpcodeStr,
"\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"),
- [(set RC:$dst, (Int (ld_frag addr:$src1), immoperator:$cntl))],
- IIC_BIN_MEM>, XOP, XOPA, Sched<[WriteALULd, ReadAfterLd]>;
+ [(set RC:$dst, (OpNode (ld_frag addr:$src1), immoperator:$cntl))]>,
+ XOP, XOPA, Sched<[Sched.Folded]>;
}
-defm BEXTRI32 : tbm_ternary_imm_intr<0x10, GR32, "bextr", i32mem, loadi32,
- int_x86_tbm_bextri_u32, i32imm, imm>;
+defm BEXTRI32 : tbm_ternary_imm<0x10, GR32, "bextr{l}", i32mem, loadi32,
+ X86bextr, i32imm, imm, WriteBEXTR>;
let ImmT = Imm32S in
-defm BEXTRI64 : tbm_ternary_imm_intr<0x10, GR64, "bextr", i64mem, loadi64,
- int_x86_tbm_bextri_u64, i64i32imm,
- i64immSExt32>, VEX_W;
+defm BEXTRI64 : tbm_ternary_imm<0x10, GR64, "bextr{q}", i64mem, loadi64,
+ X86bextr, i64i32imm,
+ i64immSExt32, WriteBEXTR>, VEX_W;
multiclass tbm_binary_rm<bits<8> opc, Format FormReg, Format FormMem,
RegisterClass RC, string OpcodeStr,
- X86MemOperand x86memop, PatFrag ld_frag> {
+ X86MemOperand x86memop, X86FoldableSchedWrite Sched> {
let hasSideEffects = 0 in {
def rr : I<opc, FormReg, (outs RC:$dst), (ins RC:$src),
- !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
- [], IIC_BIN_NONMEM>, XOP_4V, XOP9, Sched<[WriteALU]>;
+ !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), []>,
+ XOP_4V, XOP9, Sched<[Sched]>;
let mayLoad = 1 in
def rm : I<opc, FormMem, (outs RC:$dst), (ins x86memop:$src),
- !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
- [], IIC_BIN_MEM>, XOP_4V, XOP9, Sched<[WriteALULd, ReadAfterLd]>;
+ !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), []>,
+ XOP_4V, XOP9, Sched<[Sched.Folded]>;
}
}
multiclass tbm_binary_intr<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite Sched,
Format FormReg, Format FormMem> {
- defm NAME#32 : tbm_binary_rm<opc, FormReg, FormMem, GR32, OpcodeStr, i32mem,
- loadi32>;
- defm NAME#64 : tbm_binary_rm<opc, FormReg, FormMem, GR64, OpcodeStr, i64mem,
- loadi64>, VEX_W;
-}
-
-defm BLCFILL : tbm_binary_intr<0x01, "blcfill", MRM1r, MRM1m>;
-defm BLCI : tbm_binary_intr<0x02, "blci", MRM6r, MRM6m>;
-defm BLCIC : tbm_binary_intr<0x01, "blcic", MRM5r, MRM5m>;
-defm BLCMSK : tbm_binary_intr<0x02, "blcmsk", MRM1r, MRM1m>;
-defm BLCS : tbm_binary_intr<0x01, "blcs", MRM3r, MRM3m>;
-defm BLSFILL : tbm_binary_intr<0x01, "blsfill", MRM2r, MRM2m>;
-defm BLSIC : tbm_binary_intr<0x01, "blsic", MRM6r, MRM6m>;
-defm T1MSKC : tbm_binary_intr<0x01, "t1mskc", MRM7r, MRM7m>;
-defm TZMSK : tbm_binary_intr<0x01, "tzmsk", MRM4r, MRM4m>;
+ defm NAME#32 : tbm_binary_rm<opc, FormReg, FormMem, GR32, OpcodeStr#"{l}",
+ i32mem, Sched>;
+ defm NAME#64 : tbm_binary_rm<opc, FormReg, FormMem, GR64, OpcodeStr#"{q}",
+ i64mem, Sched>, VEX_W;
+}
+
+defm BLCFILL : tbm_binary_intr<0x01, "blcfill", WriteALU, MRM1r, MRM1m>;
+defm BLCI : tbm_binary_intr<0x02, "blci", WriteALU, MRM6r, MRM6m>;
+defm BLCIC : tbm_binary_intr<0x01, "blcic", WriteALU, MRM5r, MRM5m>;
+defm BLCMSK : tbm_binary_intr<0x02, "blcmsk", WriteALU, MRM1r, MRM1m>;
+defm BLCS : tbm_binary_intr<0x01, "blcs", WriteALU, MRM3r, MRM3m>;
+defm BLSFILL : tbm_binary_intr<0x01, "blsfill", WriteALU, MRM2r, MRM2m>;
+defm BLSIC : tbm_binary_intr<0x01, "blsic", WriteALU, MRM6r, MRM6m>;
+defm T1MSKC : tbm_binary_intr<0x01, "t1mskc", WriteALU, MRM7r, MRM7m>;
+defm TZMSK : tbm_binary_intr<0x01, "tzmsk", WriteALU, MRM4r, MRM4m>;
} // HasTBM, EFLAGS
// Use BEXTRI for 64-bit 'and' with large immediate 'mask'.
@@ -2583,28 +2678,24 @@ let Predicates = [HasTBM] in {
let Predicates = [HasLWP], SchedRW = [WriteSystem] in {
def LLWPCB : I<0x12, MRM0r, (outs), (ins GR32:$src), "llwpcb\t$src",
- [(int_x86_llwpcb GR32:$src)], IIC_LWP>,
- XOP, XOP9;
+ [(int_x86_llwpcb GR32:$src)]>, XOP, XOP9;
def SLWPCB : I<0x12, MRM1r, (outs GR32:$dst), (ins), "slwpcb\t$dst",
- [(set GR32:$dst, (int_x86_slwpcb))], IIC_LWP>,
- XOP, XOP9;
+ [(set GR32:$dst, (int_x86_slwpcb))]>, XOP, XOP9;
def LLWPCB64 : I<0x12, MRM0r, (outs), (ins GR64:$src), "llwpcb\t$src",
- [(int_x86_llwpcb GR64:$src)], IIC_LWP>,
- XOP, XOP9, VEX_W;
+ [(int_x86_llwpcb GR64:$src)]>, XOP, XOP9, VEX_W;
def SLWPCB64 : I<0x12, MRM1r, (outs GR64:$dst), (ins), "slwpcb\t$dst",
- [(set GR64:$dst, (int_x86_slwpcb))], IIC_LWP>,
- XOP, XOP9, VEX_W;
+ [(set GR64:$dst, (int_x86_slwpcb))]>, XOP, XOP9, VEX_W;
multiclass lwpins_intr<RegisterClass RC> {
def rri : Ii32<0x12, MRM0r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl),
"lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
- [(set EFLAGS, (X86lwpins RC:$src0, GR32:$src1, imm:$cntl))], IIC_LWP>,
+ [(set EFLAGS, (X86lwpins RC:$src0, GR32:$src1, imm:$cntl))]>,
XOP_4V, XOPA;
let mayLoad = 1 in
def rmi : Ii32<0x12, MRM0m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl),
"lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
- [(set EFLAGS, (X86lwpins RC:$src0, (loadi32 addr:$src1), imm:$cntl))], IIC_LWP>,
+ [(set EFLAGS, (X86lwpins RC:$src0, (loadi32 addr:$src1), imm:$cntl))]>,
XOP_4V, XOPA;
}
@@ -2616,12 +2707,11 @@ let Defs = [EFLAGS] in {
multiclass lwpval_intr<RegisterClass RC, Intrinsic Int> {
def rri : Ii32<0x12, MRM1r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl),
"lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
- [(Int RC:$src0, GR32:$src1, imm:$cntl)], IIC_LWP>,
- XOP_4V, XOPA;
+ [(Int RC:$src0, GR32:$src1, imm:$cntl)]>, XOP_4V, XOPA;
let mayLoad = 1 in
def rmi : Ii32<0x12, MRM1m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl),
"lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
- [(Int RC:$src0, (loadi32 addr:$src1), imm:$cntl)], IIC_LWP>,
+ [(Int RC:$src0, (loadi32 addr:$src1), imm:$cntl)]>,
XOP_4V, XOPA;
}
@@ -2641,13 +2731,13 @@ let SchedRW = [ WriteSystem ] in {
}
let Uses = [ EAX, ECX, EDX ] in {
- def MONITORXrrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", [], IIC_SSE_MONITORX>,
+ def MONITORXrrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", []>,
TB, Requires<[ HasMWAITX ]>;
}
let Uses = [ ECX, EAX, EBX ] in {
def MWAITXrrr : I<0x01, MRM_FB, (outs), (ins), "mwaitx",
- [(int_x86_mwaitx ECX, EAX, EBX)], IIC_SSE_MWAITX>,
+ [(int_x86_mwaitx ECX, EAX, EBX)]>,
TB, Requires<[ HasMWAITX ]>;
}
} // SchedRW
@@ -2663,11 +2753,67 @@ def : InstAlias<"monitorx\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORXrrr)>,
Requires<[ In64BitMode ]>;
//===----------------------------------------------------------------------===//
+// WAITPKG Instructions
+//
+let SchedRW = [WriteSystem] in {
+ def UMONITOR16 : I<0xAE, MRM6r, (outs), (ins GR16:$src),
+ "umonitor\t$src", [(int_x86_umonitor GR16:$src)]>,
+ XS, AdSize16, Requires<[HasWAITPKG, Not64BitMode]>;
+ def UMONITOR32 : I<0xAE, MRM6r, (outs), (ins GR32:$src),
+ "umonitor\t$src", [(int_x86_umonitor GR32:$src)]>,
+ XS, AdSize32, Requires<[HasWAITPKG]>;
+ def UMONITOR64 : I<0xAE, MRM6r, (outs), (ins GR64:$src),
+ "umonitor\t$src", [(int_x86_umonitor GR64:$src)]>,
+ XS, AdSize64, Requires<[HasWAITPKG, In64BitMode]>;
+ let Uses = [EAX, EDX], Defs = [EFLAGS] in {
+ def UMWAIT : I<0xAE, MRM6r,
+ (outs), (ins GR32orGR64:$src), "umwait\t$src",
+ [(set EFLAGS, (X86umwait GR32orGR64:$src, EDX, EAX))]>,
+ XD, Requires<[HasWAITPKG]>;
+ def TPAUSE : I<0xAE, MRM6r,
+ (outs), (ins GR32orGR64:$src), "tpause\t$src",
+ [(set EFLAGS, (X86tpause GR32orGR64:$src, EDX, EAX))]>,
+ PD, Requires<[HasWAITPKG]>, NotMemoryFoldable;
+ }
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// MOVDIRI - Move doubleword/quadword as direct store
+//
+let SchedRW = [WriteStore] in {
+def MOVDIRI32 : I<0xF9, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "movdiri\t{$src, $dst|$dst, $src}",
+ [(int_x86_directstore32 addr:$dst, GR32:$src)]>,
+ T8, Requires<[HasMOVDIRI]>;
+def MOVDIRI64 : RI<0xF9, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "movdiri\t{$src, $dst|$dst, $src}",
+ [(int_x86_directstore64 addr:$dst, GR64:$src)]>,
+ T8, Requires<[In64BitMode, HasMOVDIRI]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// MOVDIR64B - Move 64 bytes as direct store
+//
+let SchedRW = [WriteStore] in {
+def MOVDIR64B16 : I<0xF8, MRMSrcMem, (outs), (ins GR16:$dst, i512mem:$src),
+ "movdir64b\t{$src, $dst|$dst, $src}", []>,
+ T8PD, AdSize16, Requires<[HasMOVDIR64B, Not64BitMode]>;
+def MOVDIR64B32 : I<0xF8, MRMSrcMem, (outs), (ins GR32:$dst, i512mem:$src),
+ "movdir64b\t{$src, $dst|$dst, $src}",
+ [(int_x86_movdir64b GR32:$dst, addr:$src)]>,
+ T8PD, AdSize32, Requires<[HasMOVDIR64B]>;
+def MOVDIR64B64 : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem:$src),
+ "movdir64b\t{$src, $dst|$dst, $src}",
+ [(int_x86_movdir64b GR64:$dst, addr:$src)]>,
+ T8PD, AdSize64, Requires<[HasMOVDIR64B, In64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
// CLZERO Instruction
//
let SchedRW = [WriteSystem] in {
let Uses = [EAX] in
- def CLZEROr : I<0x01, MRM_FC, (outs), (ins), "clzero", [], IIC_SSE_CLZERO>,
+ def CLZEROr : I<0x01, MRM_FC, (outs), (ins), "clzero", []>,
TB, Requires<[HasCLZERO]>;
let usesCustomInserter = 1 in {
@@ -2743,12 +2889,15 @@ let Predicates = [HasTBM] in {
let Predicates = [HasCLFLUSHOPT], SchedRW = [WriteLoad] in
def CLFLUSHOPT : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
- "clflushopt\t$src", [(int_x86_clflushopt addr:$src)],
- IIC_SSE_PREFETCH>, PD;
+ "clflushopt\t$src", [(int_x86_clflushopt addr:$src)]>, PD;
let Predicates = [HasCLWB], SchedRW = [WriteLoad] in
def CLWB : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src",
- [(int_x86_clwb addr:$src)], IIC_SSE_PREFETCH>, PD;
+ [(int_x86_clwb addr:$src)]>, PD, NotMemoryFoldable;
+
+let Predicates = [HasCLDEMOTE], SchedRW = [WriteLoad] in
+def CLDEMOTE : I<0x1C, MRM0m, (outs), (ins i8mem:$src), "cldemote\t$src",
+ [(int_x86_cldemote addr:$src)]>, TB;
//===----------------------------------------------------------------------===//
// Subsystems.
@@ -2902,6 +3051,14 @@ def : MnemonicAlias<"sgdt", "sgdtq", "att">, Requires<[In64BitMode]>;
def : MnemonicAlias<"sidt", "sidtw", "att">, Requires<[In16BitMode]>;
def : MnemonicAlias<"sidt", "sidtl", "att">, Requires<[In32BitMode]>;
def : MnemonicAlias<"sidt", "sidtq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"lgdt", "lgdtw", "intel">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"lgdt", "lgdtd", "intel">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"lidt", "lidtw", "intel">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"lidt", "lidtd", "intel">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"sgdt", "sgdtw", "intel">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"sgdt", "sgdtd", "intel">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"sidt", "sidtw", "intel">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"sidt", "sidtd", "intel">, Requires<[In32BitMode]>;
// Floating point stack aliases.
@@ -2979,19 +3136,19 @@ def : InstAlias<"aam", (AAM8i8 10)>, Requires<[Not64BitMode]>;
// Disambiguate the mem/imm form of bt-without-a-suffix as btl.
// Likewise for btc/btr/bts.
def : InstAlias<"bt\t{$imm, $mem|$mem, $imm}",
- (BT32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+ (BT32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">;
def : InstAlias<"btc\t{$imm, $mem|$mem, $imm}",
- (BTC32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+ (BTC32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">;
def : InstAlias<"btr\t{$imm, $mem|$mem, $imm}",
- (BTR32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+ (BTR32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">;
def : InstAlias<"bts\t{$imm, $mem|$mem, $imm}",
- (BTS32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+ (BTS32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">;
// clr aliases.
-def : InstAlias<"clrb\t$reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>;
-def : InstAlias<"clrw\t$reg", (XOR16rr GR16:$reg, GR16:$reg), 0>;
-def : InstAlias<"clrl\t$reg", (XOR32rr GR32:$reg, GR32:$reg), 0>;
-def : InstAlias<"clrq\t$reg", (XOR64rr GR64:$reg, GR64:$reg), 0>;
+def : InstAlias<"clr{b}\t$reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>;
+def : InstAlias<"clr{w}\t$reg", (XOR16rr GR16:$reg, GR16:$reg), 0>;
+def : InstAlias<"clr{l}\t$reg", (XOR32rr GR32:$reg, GR32:$reg), 0>;
+def : InstAlias<"clr{q}\t$reg", (XOR64rr GR64:$reg, GR64:$reg), 0>;
// lods aliases. Accept the destination being omitted because it's implicit
// in the mnemonic, or the mnemonic suffix being omitted because it's implicit
@@ -3004,10 +3161,10 @@ def : InstAlias<"lods\t{$src, %al|al, $src}", (LODSB srcidx8:$src), 0>;
def : InstAlias<"lods\t{$src, %ax|ax, $src}", (LODSW srcidx16:$src), 0>;
def : InstAlias<"lods\t{$src, %eax|eax, $src}", (LODSL srcidx32:$src), 0>;
def : InstAlias<"lods\t{$src, %rax|rax, $src}", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>;
-def : InstAlias<"lods\t$src", (LODSB srcidx8:$src), 0>;
-def : InstAlias<"lods\t$src", (LODSW srcidx16:$src), 0>;
-def : InstAlias<"lods\t$src", (LODSL srcidx32:$src), 0>;
-def : InstAlias<"lods\t$src", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"lods\t$src", (LODSB srcidx8:$src), 0, "intel">;
+def : InstAlias<"lods\t$src", (LODSW srcidx16:$src), 0, "intel">;
+def : InstAlias<"lods\t$src", (LODSL srcidx32:$src), 0, "intel">;
+def : InstAlias<"lods\t$src", (LODSQ srcidx64:$src), 0, "intel">, Requires<[In64BitMode]>;
// stos aliases. Accept the source being omitted because it's implicit in
@@ -3021,10 +3178,10 @@ def : InstAlias<"stos\t{%al, $dst|$dst, al}", (STOSB dstidx8:$dst), 0>;
def : InstAlias<"stos\t{%ax, $dst|$dst, ax}", (STOSW dstidx16:$dst), 0>;
def : InstAlias<"stos\t{%eax, $dst|$dst, eax}", (STOSL dstidx32:$dst), 0>;
def : InstAlias<"stos\t{%rax, $dst|$dst, rax}", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
-def : InstAlias<"stos\t$dst", (STOSB dstidx8:$dst), 0>;
-def : InstAlias<"stos\t$dst", (STOSW dstidx16:$dst), 0>;
-def : InstAlias<"stos\t$dst", (STOSL dstidx32:$dst), 0>;
-def : InstAlias<"stos\t$dst", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"stos\t$dst", (STOSB dstidx8:$dst), 0, "intel">;
+def : InstAlias<"stos\t$dst", (STOSW dstidx16:$dst), 0, "intel">;
+def : InstAlias<"stos\t$dst", (STOSL dstidx32:$dst), 0, "intel">;
+def : InstAlias<"stos\t$dst", (STOSQ dstidx64:$dst), 0, "intel">, Requires<[In64BitMode]>;
// scas aliases. Accept the destination being omitted because it's implicit
@@ -3038,24 +3195,24 @@ def : InstAlias<"scas\t{$dst, %al|al, $dst}", (SCASB dstidx8:$dst), 0>;
def : InstAlias<"scas\t{$dst, %ax|ax, $dst}", (SCASW dstidx16:$dst), 0>;
def : InstAlias<"scas\t{$dst, %eax|eax, $dst}", (SCASL dstidx32:$dst), 0>;
def : InstAlias<"scas\t{$dst, %rax|rax, $dst}", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
-def : InstAlias<"scas\t$dst", (SCASB dstidx8:$dst), 0>;
-def : InstAlias<"scas\t$dst", (SCASW dstidx16:$dst), 0>;
-def : InstAlias<"scas\t$dst", (SCASL dstidx32:$dst), 0>;
-def : InstAlias<"scas\t$dst", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"scas\t$dst", (SCASB dstidx8:$dst), 0, "intel">;
+def : InstAlias<"scas\t$dst", (SCASW dstidx16:$dst), 0, "intel">;
+def : InstAlias<"scas\t$dst", (SCASL dstidx32:$dst), 0, "intel">;
+def : InstAlias<"scas\t$dst", (SCASQ dstidx64:$dst), 0, "intel">, Requires<[In64BitMode]>;
// cmps aliases. Mnemonic suffix being omitted because it's implicit
// in the destination.
-def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSB dstidx8:$dst, srcidx8:$src), 0>;
-def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSW dstidx16:$dst, srcidx16:$src), 0>;
-def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSL dstidx32:$dst, srcidx32:$src), 0>;
-def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSQ dstidx64:$dst, srcidx64:$src), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSB dstidx8:$dst, srcidx8:$src), 0, "intel">;
+def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSW dstidx16:$dst, srcidx16:$src), 0, "intel">;
+def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSL dstidx32:$dst, srcidx32:$src), 0, "intel">;
+def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSQ dstidx64:$dst, srcidx64:$src), 0, "intel">, Requires<[In64BitMode]>;
// movs aliases. Mnemonic suffix being omitted because it's implicit
// in the destination.
-def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSB dstidx8:$dst, srcidx8:$src), 0>;
-def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSW dstidx16:$dst, srcidx16:$src), 0>;
-def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSL dstidx32:$dst, srcidx32:$src), 0>;
-def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSQ dstidx64:$dst, srcidx64:$src), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSB dstidx8:$dst, srcidx8:$src), 0, "intel">;
+def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSW dstidx16:$dst, srcidx16:$src), 0, "intel">;
+def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSL dstidx32:$dst, srcidx32:$src), 0, "intel">;
+def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSQ dstidx64:$dst, srcidx64:$src), 0, "intel">, Requires<[In64BitMode]>;
// div and idiv aliases for explicit A register.
def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8r GR8 :$src)>;
@@ -3077,7 +3234,7 @@ def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64m i64mem:$src)>;
-// Various unary fpstack operations default to operating on on ST1.
+// Various unary fpstack operations default to operating on ST1.
// For example, "fxch" -> "fxch %st(1)"
def : InstAlias<"faddp", (ADD_FPrST0 ST1), 0>;
def: InstAlias<"fadd", (ADD_FPrST0 ST1), 0>;
@@ -3136,28 +3293,22 @@ def : InstAlias<"fsub{r|}p\t{%st(0), $op|$op, st(0)}", (SUB_FPrST0 RST:$op)>;
def : InstAlias<"fdiv{|r}p\t{%st(0), $op|$op, st(0)}", (DIVR_FPrST0 RST:$op)>;
def : InstAlias<"fdiv{r|}p\t{%st(0), $op|$op, st(0)}", (DIV_FPrST0 RST:$op)>;
-// We accept "fnstsw %eax" even though it only writes %ax.
-def : InstAlias<"fnstsw\t{%eax|eax}", (FNSTSW16r)>;
-def : InstAlias<"fnstsw\t{%al|al}" , (FNSTSW16r)>;
-def : InstAlias<"fnstsw" , (FNSTSW16r)>;
+def : InstAlias<"fnstsw" , (FNSTSW16r), 0>;
// lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but
// this is compatible with what GAS does.
def : InstAlias<"lcall\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg), 0>, Requires<[In32BitMode]>;
def : InstAlias<"ljmp\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg), 0>, Requires<[In32BitMode]>;
-def : InstAlias<"lcall\t{*}$dst", (FARCALL32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>;
-def : InstAlias<"ljmp\t{*}$dst", (FARJMP32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>;
+def : InstAlias<"lcall\t{*}$dst", (FARCALL32m opaquemem:$dst), 0>, Requires<[Not16BitMode]>;
+def : InstAlias<"ljmp\t{*}$dst", (FARJMP32m opaquemem:$dst), 0>, Requires<[Not16BitMode]>;
def : InstAlias<"lcall\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>;
def : InstAlias<"ljmp\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>;
-def : InstAlias<"lcall\t{*}$dst", (FARCALL16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>;
-def : InstAlias<"ljmp\t{*}$dst", (FARJMP16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"lcall\t{*}$dst", (FARCALL16m opaquemem:$dst), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"ljmp\t{*}$dst", (FARJMP16m opaquemem:$dst), 0>, Requires<[In16BitMode]>;
-def : InstAlias<"call\t{*}$dst", (CALL64m i64mem:$dst), 0>, Requires<[In64BitMode]>;
-def : InstAlias<"jmp\t{*}$dst", (JMP64m i64mem:$dst), 0>, Requires<[In64BitMode]>;
-def : InstAlias<"call\t{*}$dst", (CALL32m i32mem:$dst), 0>, Requires<[In32BitMode]>;
-def : InstAlias<"jmp\t{*}$dst", (JMP32m i32mem:$dst), 0>, Requires<[In32BitMode]>;
-def : InstAlias<"call\t{*}$dst", (CALL16m i16mem:$dst), 0>, Requires<[In16BitMode]>;
-def : InstAlias<"jmp\t{*}$dst", (JMP16m i16mem:$dst), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"jmp\t{*}$dst", (JMP64m i64mem:$dst), 0, "att">, Requires<[In64BitMode]>;
+def : InstAlias<"jmp\t{*}$dst", (JMP32m i32mem:$dst), 0, "att">, Requires<[In32BitMode]>;
+def : InstAlias<"jmp\t{*}$dst", (JMP16m i16mem:$dst), 0, "att">, Requires<[In16BitMode]>;
// "imul <imm>, B" is an alias for "imul <imm>, B, B".
@@ -3170,15 +3321,15 @@ def : InstAlias<"imul{q}\t{$imm, $r|$r, $imm}", (IMUL64rri8 GR64:$r, GR64:$r, i6
// ins aliases. Accept the mnemonic suffix being omitted because it's implicit
// in the destination.
-def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSB dstidx8:$dst), 0>;
-def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSW dstidx16:$dst), 0>;
-def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSL dstidx32:$dst), 0>;
+def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSB dstidx8:$dst), 0, "intel">;
+def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSW dstidx16:$dst), 0, "intel">;
+def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSL dstidx32:$dst), 0, "intel">;
// outs aliases. Accept the mnemonic suffix being omitted because it's implicit
// in the source.
-def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSB srcidx8:$src), 0>;
-def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSW srcidx16:$src), 0>;
-def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSL srcidx32:$src), 0>;
+def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSB srcidx8:$src), 0, "intel">;
+def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSW srcidx16:$src), 0, "intel">;
+def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSL srcidx32:$src), 0, "intel">;
// inb %dx -> inb %al, %dx
def : InstAlias<"inb\t{%dx|dx}", (IN8rr), 0>;
@@ -3199,37 +3350,33 @@ def : InstAlias<"jmpw\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Req
def : InstAlias<"calll\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>;
def : InstAlias<"jmpl\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>;
-// Force mov without a suffix with a segment and mem to prefer the 'l' form of
-// the move. All segment/mem forms are equivalent, this has the shortest
-// encoding.
-def : InstAlias<"mov\t{$mem, $seg|$seg, $mem}", (MOV16sm SEGMENT_REG:$seg, i16mem:$mem), 0>;
-def : InstAlias<"mov\t{$seg, $mem|$mem, $seg}", (MOV16ms i16mem:$mem, SEGMENT_REG:$seg), 0>;
-
// Match 'movq <largeimm>, <reg>' as an alias for movabsq.
def : InstAlias<"mov{q}\t{$imm, $reg|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>;
-// Match 'movq GR64, MMX' as an alias for movd.
-def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
+// Match 'movd GR64, MMX' as an alias for movq to be compatible with gas,
+// which supports this due to an old AMD documentation bug when 64-bit mode was
+// created.
+def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
(MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>;
-def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
+def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
(MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>;
// movsx aliases
-def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>;
-def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>;
-def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>;
-def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>;
-def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>;
-def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>;
-def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX16rr8 GR16:$dst, GR8:$src), 0, "att">;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0, "att">;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX32rr8 GR32:$dst, GR8:$src), 0, "att">;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX32rr16 GR32:$dst, GR16:$src), 0, "att">;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr8 GR64:$dst, GR8:$src), 0, "att">;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr16 GR64:$dst, GR16:$src), 0, "att">;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr32 GR64:$dst, GR32:$src), 0, "att">;
// movzx aliases
-def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>;
-def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>;
-def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>;
-def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>;
-def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX64rr8 GR64:$dst, GR8:$src), 0>;
-def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX64rr16 GR64:$dst, GR16:$src), 0>;
+def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX16rr8 GR16:$dst, GR8:$src), 0, "att">;
+def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0, "att">;
+def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX32rr8 GR32:$dst, GR8:$src), 0, "att">;
+def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX32rr16 GR32:$dst, GR16:$src), 0, "att">;
+def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX64rr8 GR64:$dst, GR8:$src), 0, "att">;
+def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX64rr16 GR64:$dst, GR16:$src), 0, "att">;
// Note: No GR32->GR64 movzx form.
// outb %dx -> outb %al, %dx
@@ -3310,12 +3457,19 @@ def : InstAlias<"xchg{q}\t{$mem, $val|$val, $mem}",
// xchg: We accept "xchgX <reg>, %eax" and "xchgX %eax, <reg>" as synonyms.
def : InstAlias<"xchg{w}\t{%ax, $src|$src, ax}", (XCHG16ar GR16:$src), 0>;
-def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}",
- (XCHG32ar GR32:$src), 0>, Requires<[Not64BitMode]>;
-def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}",
- (XCHG32ar64 GR32_NOAX:$src), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar GR32:$src), 0>;
def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src), 0>;
+// In 64-bit mode, xchg %eax, %eax can't be encoded with the 0x90 opcode we
+// would get by default because it's defined as NOP. But xchg %eax, %eax implies
+// implicit zeroing of the upper 32 bits. So alias to the longer encoding.
+def : InstAlias<"xchg{l}\t{%eax, %eax|eax, eax}",
+ (XCHG32rr EAX, EAX), 0>, Requires<[In64BitMode]>;
+
+// xchg %rax, %rax is a nop in x86-64 and can be encoded as such. Without this
+// we emit an unneeded REX.w prefix.
+def : InstAlias<"xchg{q}\t{%rax, %rax|rax, rax}", (NOOP), 0>;
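A small C sketch of the register-write rule behind these two aliases (illustrative only, not from the patch): any write to a 32-bit GPR in 64-bit mode zero-extends into the full register, so xchg %eax, %eax must clear RAX[63:32] and cannot be the 0x90 NOP, while xchg %rax, %rax really is a no-op.

    #include <stdint.h>

    /* Effect of "xchg %eax, %eax" on RAX in 64-bit mode. */
    static uint64_t xchg_eax_eax(uint64_t rax) {
      uint32_t eax = (uint32_t)rax; /* read the low 32 bits          */
      return (uint64_t)eax;         /* writing EAX zeroes bits 63:32 */
    }
    /* xchg_eax_eax(0xDEADBEEF00000001ULL) == 0x1ULL */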
+
// These aliases exist to get the parser to prioritize matching 8-bit
// immediate encodings over matching the implicit ax/eax/rax encodings. By
// explicitly mentioning the A register here, these entries will be ordered
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index 039b4a248544..aefeffedfc1a 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -20,178 +20,120 @@
// MMX Multiclasses
//===----------------------------------------------------------------------===//
-let Sched = WriteVecALU in {
-def MMX_INTALU_ITINS : OpndItins<
- IIC_MMX_ALU_RR, IIC_MMX_ALU_RM
->;
-
-def MMX_INTALUQ_ITINS : OpndItins<
- IIC_MMX_ALUQ_RR, IIC_MMX_ALUQ_RM
->;
-
-def MMX_PHADDSUBW : OpndItins<
- IIC_MMX_PHADDSUBW_RR, IIC_MMX_PHADDSUBW_RM
->;
-
-def MMX_PHADDSUBD : OpndItins<
- IIC_MMX_PHADDSUBD_RR, IIC_MMX_PHADDSUBD_RM
->;
-}
-
-let Sched = WriteVecLogic in
-def MMX_INTALU_ITINS_VECLOGICSCHED : OpndItins<
- IIC_MMX_ALU_RR, IIC_MMX_ALU_RM
->;
-
-let Sched = WriteVecIMul in
-def MMX_PMUL_ITINS : OpndItins<
- IIC_MMX_PMUL, IIC_MMX_PMUL
->;
-
-let Sched = WriteVecIMul in {
-def MMX_PSADBW_ITINS : OpndItins<
- IIC_MMX_PSADBW, IIC_MMX_PSADBW
->;
-
-def MMX_MISC_FUNC_ITINS : OpndItins<
- IIC_MMX_MISC_FUNC_MEM, IIC_MMX_MISC_FUNC_REG
->;
-}
-
-def MMX_SHIFT_ITINS : ShiftOpndItins<
- IIC_MMX_SHIFT_RR, IIC_MMX_SHIFT_RM, IIC_MMX_SHIFT_RI
->;
-
-let Sched = WriteShuffle in {
-def MMX_UNPCK_H_ITINS : OpndItins<
- IIC_MMX_UNPCK_H_RR, IIC_MMX_UNPCK_H_RM
->;
-
-def MMX_UNPCK_L_ITINS : OpndItins<
- IIC_MMX_UNPCK_L, IIC_MMX_UNPCK_L
->;
-
-def MMX_PCK_ITINS : OpndItins<
- IIC_MMX_PCK_RR, IIC_MMX_PCK_RM
->;
-
-def MMX_PSHUF_ITINS : OpndItins<
- IIC_MMX_PSHUF, IIC_MMX_PSHUF
->;
-} // Sched
-
-let Sched = WriteCvtF2I in {
-def MMX_CVT_PD_ITINS : OpndItins<
- IIC_MMX_CVT_PD_RR, IIC_MMX_CVT_PD_RM
->;
-
-def MMX_CVT_PS_ITINS : OpndItins<
- IIC_MMX_CVT_PS_RR, IIC_MMX_CVT_PS_RM
->;
+// Alias instruction that maps zero vector to pxor mmx.
+// This is expanded by ExpandPostRAPseudos to a pxor.
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-zeros value if folding it would be beneficial.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, SchedRW = [WriteZero] in {
+def MMX_SET0 : I<0, Pseudo, (outs VR64:$dst), (ins), "", []>;
}
let Constraints = "$src1 = $dst" in {
// MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic.
// When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp.
multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
- OpndItins itins, bit Commutable = 0> {
+ X86FoldableSchedWrite sched, bit Commutable = 0,
+ X86MemOperand OType = i64mem> {
def irr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
(ins VR64:$src1, VR64:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>,
- Sched<[itins.Sched]> {
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]>,
+ Sched<[sched]> {
let isCommutable = Commutable;
}
def irm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
- (ins VR64:$src1, i64mem:$src2),
+ (ins VR64:$src1, OType:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR64:$dst, (IntId VR64:$src1,
- (bitconvert (load_mmx addr:$src2))))],
- itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (load_mmx addr:$src2))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass MMXI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
string OpcodeStr, Intrinsic IntId,
- Intrinsic IntId2, ShiftOpndItins itins> {
+ Intrinsic IntId2, X86FoldableSchedWrite sched,
+ X86FoldableSchedWrite schedImm> {
def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
(ins VR64:$src1, VR64:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>,
- Sched<[WriteVecShift]>;
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]>,
+ Sched<[sched]>;
def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
(ins VR64:$src1, i64mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR64:$dst, (IntId VR64:$src1,
- (bitconvert (load_mmx addr:$src2))))],
- itins.rm>, Sched<[WriteVecShiftLd, ReadAfterLd]>;
+ (bitconvert (load_mmx addr:$src2))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst),
(ins VR64:$src1, i32u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- [(set VR64:$dst, (IntId2 VR64:$src1, imm:$src2))], itins.ri>,
- Sched<[WriteVecShift]>;
+ [(set VR64:$dst, (IntId2 VR64:$src1, imm:$src2))]>,
+ Sched<[schedImm]>;
}
}
/// Unary MMX instructions requiring SSSE3.
multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr,
- Intrinsic IntId64, OpndItins itins> {
- def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR64:$dst, (IntId64 VR64:$src))], itins.rr>,
- Sched<[itins.Sched]>;
-
- def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR64:$dst,
- (IntId64 (bitconvert (load_mmx addr:$src))))],
- itins.rm>, Sched<[itins.Sched.Folded]>;
+ Intrinsic IntId64, X86FoldableSchedWrite sched> {
+ def rr : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR64:$dst, (IntId64 VR64:$src))]>,
+ Sched<[sched]>;
+
+ def rm : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR64:$dst,
+ (IntId64 (bitconvert (load_mmx addr:$src))))]>,
+ Sched<[sched.Folded]>;
}
/// Binary MMX instructions requiring SSSE3.
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr,
- Intrinsic IntId64, OpndItins itins,
+ Intrinsic IntId64, X86FoldableSchedWrite sched,
bit Commutable = 0> {
let isCommutable = Commutable in
- def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst),
+ def rr : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst),
(ins VR64:$src1, VR64:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))], itins.rr>,
- Sched<[itins.Sched]>;
- def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst),
+ [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]>,
+ Sched<[sched]>;
+ def rm : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst),
(ins VR64:$src1, i64mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR64:$dst,
(IntId64 VR64:$src1,
- (bitconvert (load_mmx addr:$src2))))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (load_mmx addr:$src2))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
/// PALIGN MMX instructions (require SSSE3).
-multiclass ssse3_palign_mm<string asm, Intrinsic IntId> {
- def R64irr : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
+multiclass ssse3_palign_mm<string asm, Intrinsic IntId,
+ X86FoldableSchedWrite sched> {
+ def rri : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
(ins VR64:$src1, VR64:$src2, u8imm:$src3),
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>,
- Sched<[WriteShuffle]>;
- def R64irm : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
+ Sched<[sched]>;
+ def rmi : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
(ins VR64:$src1, i64mem:$src2, u8imm:$src3),
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR64:$dst, (IntId VR64:$src1,
(bitconvert (load_mmx addr:$src2)), (i8 imm:$src3)))]>,
- Sched<[WriteShuffleLd, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag,
- string asm, OpndItins itins, Domain d> {
+ string asm, X86FoldableSchedWrite sched, Domain d> {
def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
- [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr, d>,
- Sched<[itins.Sched]>;
+ [(set DstRC:$dst, (Int SrcRC:$src))], d>,
+ Sched<[sched]>;
def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
- [(set DstRC:$dst, (Int (ld_frag addr:$src)))], itins.rm, d>,
- Sched<[itins.Sched.Folded]>;
+ [(set DstRC:$dst, (Int (ld_frag addr:$src)))], d>,
+ Sched<[sched.Folded]>;
}
multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC,
@@ -199,20 +141,20 @@ multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC,
PatFrag ld_frag, string asm, Domain d> {
def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst),
(ins DstRC:$src1, SrcRC:$src2), asm,
- [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
- NoItinerary, d>, Sched<[WriteCvtI2F]>;
+ [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], d>,
+ Sched<[WriteCvtI2PS]>;
def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst),
(ins DstRC:$src1, x86memop:$src2), asm,
- [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
- NoItinerary, d>, Sched<[WriteCvtI2FLd]>;
+ [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], d>,
+ Sched<[WriteCvtI2PS.Folded]>;
}
//===----------------------------------------------------------------------===//
// MMX EMMS Instruction
//===----------------------------------------------------------------------===//
-def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms",
- [(int_x86_mmx_emms)], IIC_MMX_EMMS>;
+let SchedRW = [WriteEMMS] in
+def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms", [(int_x86_mmx_emms)]>;
//===----------------------------------------------------------------------===//
// MMX Scalar Instructions
@@ -222,402 +164,407 @@ def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms",
def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
- (x86mmx (scalar_to_vector GR32:$src)))],
- IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
+ (x86mmx (scalar_to_vector GR32:$src)))]>,
+ Sched<[WriteVecMoveFromGpr]>;
def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
- (x86mmx (scalar_to_vector (loadi32 addr:$src))))],
- IIC_MMX_MOV_MM_RM>, Sched<[WriteLoad]>;
+ (x86mmx (scalar_to_vector (loadi32 addr:$src))))]>,
+ Sched<[WriteVecLoad]>;
let Predicates = [HasMMX] in {
- let AddedComplexity = 15 in
- def : Pat<(x86mmx (MMX_X86movw2d GR32:$src)),
- (MMX_MOVD64rr GR32:$src)>;
- let AddedComplexity = 20 in
- def : Pat<(x86mmx (MMX_X86movw2d (loadi32 addr:$src))),
- (MMX_MOVD64rm addr:$src)>;
+ def : Pat<(x86mmx (MMX_X86movw2d GR32:$src)),
+ (MMX_MOVD64rr GR32:$src)>;
+ def : Pat<(x86mmx (MMX_X86movw2d (i32 0))),
+ (MMX_SET0)>;
+ def : Pat<(x86mmx (MMX_X86movw2d (loadi32 addr:$src))),
+ (MMX_MOVD64rm addr:$src)>;
}
let mayStore = 1 in
def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src),
- "movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_MM_RM>,
- Sched<[WriteStore]>;
+ "movd\t{$src, $dst|$dst, $src}", []>,
+ Sched<[WriteVecStore]>;
def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst,
- (MMX_X86movd2w (x86mmx VR64:$src)))],
- IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>,
- FoldGenData<"MMX_MOVD64rr">;
+ (MMX_X86movd2w (x86mmx VR64:$src)))]>,
+ Sched<[WriteVecMoveToGpr]>, FoldGenData<"MMX_MOVD64rr">;
let isBitcast = 1 in
def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(set VR64:$dst, (bitconvert GR64:$src))],
- IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (bitconvert GR64:$src))]>,
+ Sched<[WriteVecMoveFromGpr]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def MMX_MOVD64to64rm : MMXRI<0x6E, MRMSrcMem, (outs VR64:$dst),
- (ins i64mem:$src), "movd\t{$src, $dst|$dst, $src}",
- [], IIC_MMX_MOVQ_RM>, Sched<[WriteLoad]>;
+ (ins i64mem:$src), "movq\t{$src, $dst|$dst, $src}",
+ []>, Sched<[SchedWriteVecMoveLS.MMX.RM]>;
-// These are 64 bit moves, but since the OS X assembler doesn't
-// recognize a register-register movq, we write them as
-// movd.
-let SchedRW = [WriteMove], isBitcast = 1 in {
+let isBitcast = 1 in {
def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg,
(outs GR64:$dst), (ins VR64:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst,
- (bitconvert VR64:$src))], IIC_MMX_MOV_REG_MM>;
-let hasSideEffects = 0 in
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (bitconvert VR64:$src))]>,
+ Sched<[WriteVecMoveToGpr]>;
+let SchedRW = [WriteVecMove], hasSideEffects = 0, isMoveReg = 1 in {
def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
- "movq\t{$src, $dst|$dst, $src}", [],
- IIC_MMX_MOVQ_RR>;
-let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
+ "movq\t{$src, $dst|$dst, $src}", []>;
+let isCodeGenOnly = 1, ForceDisassemble = 1 in
def MMX_MOVQ64rr_REV : MMXI<0x7F, MRMDestReg, (outs VR64:$dst), (ins VR64:$src),
- "movq\t{$src, $dst|$dst, $src}", [],
- IIC_MMX_MOVQ_RR>, FoldGenData<"MMX_MOVQ64rr">;
-}
-} // SchedRW
+ "movq\t{$src, $dst|$dst, $src}", []>,
+ FoldGenData<"MMX_MOVQ64rr">;
+} // SchedRW, hasSideEffects, isMoveReg
+} // isBitcast
+
+def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
+ (MMX_MOVQ64rr_REV VR64:$dst, VR64:$src), 0>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def MMX_MOVD64from64rm : MMXRI<0x7E, MRMDestMem,
(outs), (ins i64mem:$dst, VR64:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [], IIC_MMX_MOV_REG_MM>, Sched<[WriteStore]>;
+ "movq\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.MMX.MR]>;
-let SchedRW = [WriteLoad] in {
+let SchedRW = [SchedWriteVecMoveLS.MMX.RM] in {
let canFoldAsLoad = 1 in
def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(set VR64:$dst, (load_mmx addr:$src))],
- IIC_MMX_MOVQ_RM>;
+ [(set VR64:$dst, (load_mmx addr:$src))]>;
} // SchedRW
-let SchedRW = [WriteStore] in
+let SchedRW = [SchedWriteVecMoveLS.MMX.MR] in
def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(store (x86mmx VR64:$src), addr:$dst)],
- IIC_MMX_MOVQ_RM>;
+ [(store (x86mmx VR64:$src), addr:$dst)]>;
-let SchedRW = [WriteMove] in {
+let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in {
def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
(ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
(x86mmx (bitconvert
(i64 (extractelt (v2i64 VR128:$src),
- (iPTR 0))))))],
- IIC_MMX_MOVQ_RR>;
+ (iPTR 0))))))]>;
def MMX_MOVQ2DQrr : MMXS2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst),
(ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v2i64
(scalar_to_vector
- (i64 (bitconvert (x86mmx VR64:$src))))))],
- IIC_MMX_MOVQ_RR>;
+ (i64 (bitconvert (x86mmx VR64:$src))))))]>;
let isCodeGenOnly = 1, hasSideEffects = 1 in {
def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst),
(ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
- [], IIC_MMX_MOVQ_RR>;
+ []>;
def MMX_MOVFR642Qrr: MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
(ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}",
- [], IIC_MMX_MOVQ_RR>;
+ []>;
}
} // SchedRW
-let Predicates = [HasSSE1] in
+let Predicates = [HasMMX, HasSSE1] in
def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
"movntq\t{$src, $dst|$dst, $src}",
- [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)],
- IIC_MMX_MOVQ_RM>, Sched<[WriteStore]>;
+ [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)]>,
+ Sched<[SchedWriteVecMoveLSNT.MMX.MR]>;
let Predicates = [HasMMX] in {
- let AddedComplexity = 15 in
// movd to MMX register zero-extends
def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))),
(MMX_MOVD64rr GR32:$src)>;
- let AddedComplexity = 20 in
def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector (loadi32 addr:$src))))),
(MMX_MOVD64rm addr:$src)>;
}
// Arithmetic Instructions
defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b,
- MMX_INTALU_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PABSW : SS3I_unop_rm_int_mm<0x1D, "pabsw", int_x86_ssse3_pabs_w,
- MMX_INTALU_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PABSD : SS3I_unop_rm_int_mm<0x1E, "pabsd", int_x86_ssse3_pabs_d,
- MMX_INTALU_ITINS>;
+ SchedWriteVecALU.MMX>;
// -- Addition
defm MMX_PADDB : MMXI_binop_rm_int<0xFC, "paddb", int_x86_mmx_padd_b,
- MMX_INTALU_ITINS, 1>;
+ SchedWriteVecALU.MMX, 1>;
defm MMX_PADDW : MMXI_binop_rm_int<0xFD, "paddw", int_x86_mmx_padd_w,
- MMX_INTALU_ITINS, 1>;
+ SchedWriteVecALU.MMX, 1>;
defm MMX_PADDD : MMXI_binop_rm_int<0xFE, "paddd", int_x86_mmx_padd_d,
- MMX_INTALU_ITINS, 1>;
-let Predicates = [HasSSE2] in
+ SchedWriteVecALU.MMX, 1>;
+let Predicates = [HasMMX, HasSSE2] in
defm MMX_PADDQ : MMXI_binop_rm_int<0xD4, "paddq", int_x86_mmx_padd_q,
- MMX_INTALUQ_ITINS, 1>;
+ SchedWriteVecALU.MMX, 1>;
defm MMX_PADDSB : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b,
- MMX_INTALU_ITINS, 1>;
+ SchedWriteVecALU.MMX, 1>;
defm MMX_PADDSW : MMXI_binop_rm_int<0xED, "paddsw" , int_x86_mmx_padds_w,
- MMX_INTALU_ITINS, 1>;
+ SchedWriteVecALU.MMX, 1>;
defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b,
- MMX_INTALU_ITINS, 1>;
+ SchedWriteVecALU.MMX, 1>;
defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w,
- MMX_INTALU_ITINS, 1>;
+ SchedWriteVecALU.MMX, 1>;
defm MMX_PHADDW : SS3I_binop_rm_int_mm<0x01, "phaddw", int_x86_ssse3_phadd_w,
- MMX_PHADDSUBW>;
-defm MMX_PHADD : SS3I_binop_rm_int_mm<0x02, "phaddd", int_x86_ssse3_phadd_d,
- MMX_PHADDSUBD>;
+ SchedWritePHAdd.MMX>;
+defm MMX_PHADDD : SS3I_binop_rm_int_mm<0x02, "phaddd", int_x86_ssse3_phadd_d,
+ SchedWritePHAdd.MMX>;
defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw,
- MMX_PHADDSUBW>;
+ SchedWritePHAdd.MMX>;
// -- Subtraction
defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b,
- MMX_INTALU_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w,
- MMX_INTALU_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d,
- MMX_INTALU_ITINS>;
-let Predicates = [HasSSE2] in
+ SchedWriteVecALU.MMX>;
+let Predicates = [HasMMX, HasSSE2] in
defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q,
- MMX_INTALUQ_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PSUBSB : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b,
- MMX_INTALU_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PSUBSW : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w,
- MMX_INTALU_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b,
- MMX_INTALU_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w,
- MMX_INTALU_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PHSUBW : SS3I_binop_rm_int_mm<0x05, "phsubw", int_x86_ssse3_phsub_w,
- MMX_PHADDSUBW>;
+ SchedWritePHAdd.MMX>;
defm MMX_PHSUBD : SS3I_binop_rm_int_mm<0x06, "phsubd", int_x86_ssse3_phsub_d,
- MMX_PHADDSUBD>;
+ SchedWritePHAdd.MMX>;
defm MMX_PHSUBSW : SS3I_binop_rm_int_mm<0x07, "phsubsw",int_x86_ssse3_phsub_sw,
- MMX_PHADDSUBW>;
+ SchedWritePHAdd.MMX>;
// -- Multiplication
defm MMX_PMULLW : MMXI_binop_rm_int<0xD5, "pmullw", int_x86_mmx_pmull_w,
- MMX_PMUL_ITINS, 1>;
+ SchedWriteVecIMul.MMX, 1>;
defm MMX_PMULHW : MMXI_binop_rm_int<0xE5, "pmulhw", int_x86_mmx_pmulh_w,
- MMX_PMUL_ITINS, 1>;
-let Predicates = [HasSSE1] in
+ SchedWriteVecIMul.MMX, 1>;
+let Predicates = [HasMMX, HasSSE1] in
defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w,
- MMX_PMUL_ITINS, 1>;
-let Predicates = [HasSSE2] in
+ SchedWriteVecIMul.MMX, 1>;
+let Predicates = [HasMMX, HasSSE2] in
defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq,
- MMX_PMUL_ITINS, 1>;
+ SchedWriteVecIMul.MMX, 1>;
defm MMX_PMULHRSW : SS3I_binop_rm_int_mm<0x0B, "pmulhrsw",
int_x86_ssse3_pmul_hr_sw,
- MMX_PMUL_ITINS, 1>;
+ SchedWriteVecIMul.MMX, 1>;
// -- Miscellanea
defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd,
- MMX_PMUL_ITINS, 1>;
+ SchedWriteVecIMul.MMX, 1>;
defm MMX_PMADDUBSW : SS3I_binop_rm_int_mm<0x04, "pmaddubsw",
- int_x86_ssse3_pmadd_ub_sw, MMX_PMUL_ITINS>;
-let Predicates = [HasSSE1] in {
+ int_x86_ssse3_pmadd_ub_sw,
+ SchedWriteVecIMul.MMX>;
+let Predicates = [HasMMX, HasSSE1] in {
defm MMX_PAVGB : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b,
- MMX_MISC_FUNC_ITINS, 1>;
+ SchedWriteVecALU.MMX, 1>;
defm MMX_PAVGW : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w,
- MMX_MISC_FUNC_ITINS, 1>;
+ SchedWriteVecALU.MMX, 1>;
defm MMX_PMINUB : MMXI_binop_rm_int<0xDA, "pminub", int_x86_mmx_pminu_b,
- MMX_MISC_FUNC_ITINS, 1>;
+ SchedWriteVecALU.MMX, 1>;
defm MMX_PMINSW : MMXI_binop_rm_int<0xEA, "pminsw", int_x86_mmx_pmins_w,
- MMX_MISC_FUNC_ITINS, 1>;
+ SchedWriteVecALU.MMX, 1>;
defm MMX_PMAXUB : MMXI_binop_rm_int<0xDE, "pmaxub", int_x86_mmx_pmaxu_b,
- MMX_MISC_FUNC_ITINS, 1>;
+ SchedWriteVecALU.MMX, 1>;
defm MMX_PMAXSW : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w,
- MMX_MISC_FUNC_ITINS, 1>;
+ SchedWriteVecALU.MMX, 1>;
defm MMX_PSADBW : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw,
- MMX_PSADBW_ITINS, 1>;
+ SchedWritePSADBW.MMX, 1>;
}
defm MMX_PSIGNB : SS3I_binop_rm_int_mm<0x08, "psignb", int_x86_ssse3_psign_b,
- MMX_MISC_FUNC_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PSIGNW : SS3I_binop_rm_int_mm<0x09, "psignw", int_x86_ssse3_psign_w,
- MMX_MISC_FUNC_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PSIGND : SS3I_binop_rm_int_mm<0x0A, "psignd", int_x86_ssse3_psign_d,
- MMX_MISC_FUNC_ITINS>;
+ SchedWriteVecALU.MMX>;
let Constraints = "$src1 = $dst" in
- defm MMX_PALIGN : ssse3_palign_mm<"palignr", int_x86_mmx_palignr_b>;
+ defm MMX_PALIGNR : ssse3_palign_mm<"palignr", int_x86_mmx_palignr_b,
+ SchedWriteShuffle.MMX>;
// Logical Instructions
defm MMX_PAND : MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand,
- MMX_INTALU_ITINS_VECLOGICSCHED, 1>;
+ SchedWriteVecLogic.MMX, 1>;
defm MMX_POR : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por,
- MMX_INTALU_ITINS_VECLOGICSCHED, 1>;
+ SchedWriteVecLogic.MMX, 1>;
defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor,
- MMX_INTALU_ITINS_VECLOGICSCHED, 1>;
+ SchedWriteVecLogic.MMX, 1>;
defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn,
- MMX_INTALU_ITINS_VECLOGICSCHED>;
+ SchedWriteVecLogic.MMX>;
// Shift Instructions
defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
int_x86_mmx_psrl_w, int_x86_mmx_psrli_w,
- MMX_SHIFT_ITINS>;
+ SchedWriteVecShift.MMX,
+ SchedWriteVecShiftImm.MMX>;
defm MMX_PSRLD : MMXI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld",
int_x86_mmx_psrl_d, int_x86_mmx_psrli_d,
- MMX_SHIFT_ITINS>;
+ SchedWriteVecShift.MMX,
+ SchedWriteVecShiftImm.MMX>;
defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
int_x86_mmx_psrl_q, int_x86_mmx_psrli_q,
- MMX_SHIFT_ITINS>;
+ SchedWriteVecShift.MMX,
+ SchedWriteVecShiftImm.MMX>;
defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
int_x86_mmx_psll_w, int_x86_mmx_pslli_w,
- MMX_SHIFT_ITINS>;
+ SchedWriteVecShift.MMX,
+ SchedWriteVecShiftImm.MMX>;
defm MMX_PSLLD : MMXI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld",
int_x86_mmx_psll_d, int_x86_mmx_pslli_d,
- MMX_SHIFT_ITINS>;
+ SchedWriteVecShift.MMX,
+ SchedWriteVecShiftImm.MMX>;
defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
int_x86_mmx_psll_q, int_x86_mmx_pslli_q,
- MMX_SHIFT_ITINS>;
+ SchedWriteVecShift.MMX,
+ SchedWriteVecShiftImm.MMX>;
defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
int_x86_mmx_psra_w, int_x86_mmx_psrai_w,
- MMX_SHIFT_ITINS>;
+ SchedWriteVecShift.MMX,
+ SchedWriteVecShiftImm.MMX>;
defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
int_x86_mmx_psra_d, int_x86_mmx_psrai_d,
- MMX_SHIFT_ITINS>;
+ SchedWriteVecShift.MMX,
+ SchedWriteVecShiftImm.MMX>;
// Comparison Instructions
defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b,
- MMX_INTALU_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w,
- MMX_INTALU_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PCMPEQD : MMXI_binop_rm_int<0x76, "pcmpeqd", int_x86_mmx_pcmpeq_d,
- MMX_INTALU_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PCMPGTB : MMXI_binop_rm_int<0x64, "pcmpgtb", int_x86_mmx_pcmpgt_b,
- MMX_INTALU_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w,
- MMX_INTALU_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d,
- MMX_INTALU_ITINS>;
+ SchedWriteVecALU.MMX>;
// -- Unpack Instructions
defm MMX_PUNPCKHBW : MMXI_binop_rm_int<0x68, "punpckhbw",
int_x86_mmx_punpckhbw,
- MMX_UNPCK_H_ITINS>;
+ SchedWriteShuffle.MMX>;
defm MMX_PUNPCKHWD : MMXI_binop_rm_int<0x69, "punpckhwd",
int_x86_mmx_punpckhwd,
- MMX_UNPCK_H_ITINS>;
+ SchedWriteShuffle.MMX>;
defm MMX_PUNPCKHDQ : MMXI_binop_rm_int<0x6A, "punpckhdq",
int_x86_mmx_punpckhdq,
- MMX_UNPCK_H_ITINS>;
+ SchedWriteShuffle.MMX>;
defm MMX_PUNPCKLBW : MMXI_binop_rm_int<0x60, "punpcklbw",
int_x86_mmx_punpcklbw,
- MMX_UNPCK_L_ITINS>;
+ SchedWriteShuffle.MMX,
+ 0, i32mem>;
defm MMX_PUNPCKLWD : MMXI_binop_rm_int<0x61, "punpcklwd",
int_x86_mmx_punpcklwd,
- MMX_UNPCK_L_ITINS>;
+ SchedWriteShuffle.MMX,
+ 0, i32mem>;
defm MMX_PUNPCKLDQ : MMXI_binop_rm_int<0x62, "punpckldq",
int_x86_mmx_punpckldq,
- MMX_UNPCK_L_ITINS>;
+ SchedWriteShuffle.MMX,
+ 0, i32mem>;
// -- Pack Instructions
defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", int_x86_mmx_packsswb,
- MMX_PCK_ITINS>;
+ SchedWriteShuffle.MMX>;
defm MMX_PACKSSDW : MMXI_binop_rm_int<0x6B, "packssdw", int_x86_mmx_packssdw,
- MMX_PCK_ITINS>;
+ SchedWriteShuffle.MMX>;
defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb,
- MMX_PCK_ITINS>;
+ SchedWriteShuffle.MMX>;
// -- Shuffle Instructions
defm MMX_PSHUFB : SS3I_binop_rm_int_mm<0x00, "pshufb", int_x86_ssse3_pshuf_b,
- MMX_PSHUF_ITINS>;
+ SchedWriteVarShuffle.MMX>;
def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg,
(outs VR64:$dst), (ins VR64:$src1, u8imm:$src2),
"pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR64:$dst,
- (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))],
- IIC_MMX_PSHUF>, Sched<[WriteShuffle]>;
+ (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))]>,
+ Sched<[SchedWriteShuffle.MMX]>;
def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
(outs VR64:$dst), (ins i64mem:$src1, u8imm:$src2),
"pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR64:$dst,
(int_x86_sse_pshuf_w (load_mmx addr:$src1),
- imm:$src2))],
- IIC_MMX_PSHUF>, Sched<[WriteShuffleLd]>;
+ imm:$src2))]>,
+ Sched<[SchedWriteShuffle.MMX.Folded]>;
// -- Conversion Instructions
defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi,
f64mem, load, "cvtps2pi\t{$src, $dst|$dst, $src}",
- MMX_CVT_PS_ITINS, SSEPackedSingle>, PS;
+ WriteCvtPS2I, SSEPackedSingle>, PS;
defm MMX_CVTPD2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtpd2pi,
f128mem, memop, "cvtpd2pi\t{$src, $dst|$dst, $src}",
- MMX_CVT_PD_ITINS, SSEPackedDouble>, PD;
+ WriteCvtPD2I, SSEPackedDouble>, PD;
defm MMX_CVTTPS2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttps2pi,
f64mem, load, "cvttps2pi\t{$src, $dst|$dst, $src}",
- MMX_CVT_PS_ITINS, SSEPackedSingle>, PS;
+ WriteCvtPS2I, SSEPackedSingle>, PS;
defm MMX_CVTTPD2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttpd2pi,
f128mem, memop, "cvttpd2pi\t{$src, $dst|$dst, $src}",
- MMX_CVT_PD_ITINS, SSEPackedDouble>, PD;
+ WriteCvtPD2I, SSEPackedDouble>, PD;
defm MMX_CVTPI2PD : sse12_cvt_pint<0x2A, VR64, VR128, int_x86_sse_cvtpi2pd,
i64mem, load, "cvtpi2pd\t{$src, $dst|$dst, $src}",
- MMX_CVT_PD_ITINS, SSEPackedDouble>, PD;
+ WriteCvtI2PD, SSEPackedDouble>, PD;
let Constraints = "$src1 = $dst" in {
defm MMX_CVTPI2PS : sse12_cvt_pint_3addr<0x2A, VR64, VR128,
int_x86_sse_cvtpi2ps,
i64mem, load, "cvtpi2ps\t{$src2, $dst|$dst, $src2}",
- SSEPackedSingle>, PS;
+ SSEPackedSingle>, PS;
}
// Extract / Insert
-let Predicates = [HasSSE1] in
-def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg,
- (outs GR32orGR64:$dst), (ins VR64:$src1, i32u8imm:$src2),
- "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set GR32orGR64:$dst, (int_x86_mmx_pextr_w VR64:$src1,
- imm:$src2))],
- IIC_MMX_PEXTR>, Sched<[WriteShuffle]>;
+let Predicates = [HasMMX, HasSSE1] in
+def MMX_PEXTRWrr: MMXIi8<0xC5, MRMSrcReg,
+ (outs GR32orGR64:$dst), (ins VR64:$src1, i32u8imm:$src2),
+ "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32orGR64:$dst, (int_x86_mmx_pextr_w VR64:$src1,
+ imm:$src2))]>,
+ Sched<[WriteVecExtract]>;
let Constraints = "$src1 = $dst" in {
-let Predicates = [HasSSE1] in {
- def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg,
- (outs VR64:$dst),
- (ins VR64:$src1, GR32orGR64:$src2, i32u8imm:$src3),
- "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
- GR32orGR64:$src2, imm:$src3))],
- IIC_MMX_PINSRW>, Sched<[WriteShuffle]>;
-
- def MMX_PINSRWirmi : MMXIi8<0xC4, MRMSrcMem,
- (outs VR64:$dst),
- (ins VR64:$src1, i16mem:$src2, i32u8imm:$src3),
- "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
- (i32 (anyext (loadi16 addr:$src2))),
- imm:$src3))],
- IIC_MMX_PINSRW>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+let Predicates = [HasMMX, HasSSE1] in {
+ def MMX_PINSRWrr : MMXIi8<0xC4, MRMSrcReg,
+ (outs VR64:$dst),
+ (ins VR64:$src1, GR32orGR64:$src2, i32u8imm:$src3),
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
+ GR32orGR64:$src2, imm:$src3))]>,
+ Sched<[WriteVecInsert]>;
+
+ def MMX_PINSRWrm : MMXIi8<0xC4, MRMSrcMem,
+ (outs VR64:$dst),
+ (ins VR64:$src1, i16mem:$src2, i32u8imm:$src3),
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
+ (i32 (anyext (loadi16 addr:$src2))),
+ imm:$src3))]>,
+ Sched<[WriteVecInsertLd, ReadAfterLd]>;
}
}
// Mask creation
-let Predicates = [HasSSE1] in
+let Predicates = [HasMMX, HasSSE1] in
def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
(ins VR64:$src),
"pmovmskb\t{$src, $dst|$dst, $src}",
[(set GR32orGR64:$dst,
- (int_x86_mmx_pmovmskb VR64:$src))],
- IIC_MMX_MOVMSK>, Sched<[WriteVecLogic]>;
+ (int_x86_mmx_pmovmskb VR64:$src))]>,
+ Sched<[WriteMMXMOVMSK]>;
// Low word of XMM to MMX.
def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1,
@@ -630,29 +577,30 @@ def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))),
(x86mmx (MMX_MOVQ64rm addr:$src))>;
// Misc.
-let SchedRW = [WriteShuffle] in {
-let Uses = [EDI], Predicates = [HasSSE1,Not64BitMode] in
+let SchedRW = [SchedWriteShuffle.MMX] in {
+let Uses = [EDI], Predicates = [HasMMX, HasSSE1,Not64BitMode] in
def MMX_MASKMOVQ : MMXI32<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
"maskmovq\t{$mask, $src|$src, $mask}",
- [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)],
- IIC_MMX_MASKMOV>;
-let Uses = [RDI], Predicates = [HasSSE1,In64BitMode] in
+ [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)]>;
+let Uses = [RDI], Predicates = [HasMMX, HasSSE1,In64BitMode] in
def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
"maskmovq\t{$mask, $src|$src, $mask}",
- [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)],
- IIC_MMX_MASKMOV>;
+ [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)]>;
}
// 64-bit bit convert.
-let Predicates = [HasSSE2] in {
+let Predicates = [HasMMX, HasSSE2] in {
def : Pat<(f64 (bitconvert (x86mmx VR64:$src))),
(MMX_MOVQ2FR64rr VR64:$src)>;
def : Pat<(x86mmx (bitconvert (f64 FR64:$src))),
(MMX_MOVFR642Qrr FR64:$src)>;
def : Pat<(x86mmx (MMX_X86movdq2q
- (bc_v2i64 (v4i32 (int_x86_sse2_cvtps2dq VR128:$src))))),
+ (bc_v2i64 (v4i32 (X86cvtp2Int (v4f32 VR128:$src)))))),
(MMX_CVTPS2PIirr VR128:$src)>;
def : Pat<(x86mmx (MMX_X86movdq2q
+ (bc_v2i64 (v4i32 (X86cvttp2si (v4f32 VR128:$src)))))),
+ (MMX_CVTTPS2PIirr VR128:$src)>;
+def : Pat<(x86mmx (MMX_X86movdq2q
(bc_v2i64 (v4i32 (fp_to_sint (v4f32 VR128:$src)))))),
(MMX_CVTTPS2PIirr VR128:$src)>;
def : Pat<(x86mmx (MMX_X86movdq2q
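(Illustration only, not part of the patch.) The MMX_SET0 pseudo introduced in the hunk above models the usual MMX zero-vector idiom; per its comment, ExpandPostRAPseudos expands it to a pxor, which is also how compilers of this era typically materialize an all-zero __m64. A small C sketch, assuming an x86 target with MMX support and GCC/Clang's <mmintrin.h>:

#include <mmintrin.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    /* _mm_setzero_si64 is the zero-vector idiom that MMX_SET0 models;
       at the time of this patch it is lowered to "pxor %mm0, %mm0". */
    __m64 zero = _mm_setzero_si64();
    __m64 v    = _mm_set_pi16(1, 2, 3, 4);
    __m64 sum  = _mm_add_pi16(v, zero);   /* paddw */

    long long bits;
    memcpy(&bits, &sum, sizeof(bits));    /* copy out before leaving MMX state */

    _mm_empty();                          /* emms: clear MMX state before FP/printf */
    printf("%#llx\n", (unsigned long long)bits);
    return 0;
}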
diff --git a/lib/Target/X86/X86InstrMPX.td b/lib/Target/X86/X86InstrMPX.td
index cb2b47b4f0c9..c1a8cc7c5fbf 100644
--- a/lib/Target/X86/X86InstrMPX.td
+++ b/lib/Target/X86/X86InstrMPX.td
@@ -13,70 +13,68 @@
//
//===----------------------------------------------------------------------===//
-// FIXME: Investigate a better scheduler itinerary once MPX is used inside LLVM.
+// FIXME: Investigate a better scheduler class once MPX is used inside LLVM.
let SchedRW = [WriteSystem] in {
multiclass mpx_bound_make<bits<8> opc, string OpcodeStr> {
-let mayLoad = 1 in {
- def 32rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins i32mem:$src),
- OpcodeStr#"\t{$src, $dst|$dst, $src}", [], IIC_MPX>,
+ def 32rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins anymem:$src),
+ OpcodeStr#"\t{$src, $dst|$dst, $src}", []>,
Requires<[HasMPX, Not64BitMode]>;
- def 64rm: RI<opc, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
- OpcodeStr#"\t{$src, $dst|$dst, $src}", [], IIC_MPX>,
+ def 64rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins anymem:$src),
+ OpcodeStr#"\t{$src, $dst|$dst, $src}", []>,
Requires<[HasMPX, In64BitMode]>;
}
-}
defm BNDMK : mpx_bound_make<0x1B, "bndmk">, XS;
multiclass mpx_bound_check<bits<8> opc, string OpcodeStr> {
-let mayLoad = 1 in {
- def 32rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, i32mem:$src2),
- OpcodeStr#"\t{$src2, $src1|$src1, $src2}", [], IIC_MPX>,
+ def 32rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, anymem:$src2),
+ OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
Requires<[HasMPX, Not64BitMode]>;
- def 64rm: RI<opc, MRMSrcMem, (outs), (ins BNDR:$src1, i64mem:$src2),
- OpcodeStr#"\t{$src2, $src1|$src1, $src2}", [], IIC_MPX>,
+ def 64rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, anymem:$src2),
+ OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
Requires<[HasMPX, In64BitMode]>;
-}
+
def 32rr: I<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR32:$src2),
- OpcodeStr#"\t{$src2, $src1|$src1, $src2}", [], IIC_MPX>,
+ OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
Requires<[HasMPX, Not64BitMode]>;
- def 64rr: RI<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR64:$src2),
- OpcodeStr#"\t{$src2, $src1|$src1, $src2}", [], IIC_MPX>,
+ def 64rr: I<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR64:$src2),
+ OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
Requires<[HasMPX, In64BitMode]>;
}
-defm BNDCL : mpx_bound_check<0x1A, "bndcl">, XS;
-defm BNDCU : mpx_bound_check<0x1A, "bndcu">, XD;
-defm BNDCN : mpx_bound_check<0x1B, "bndcn">, XD;
+defm BNDCL : mpx_bound_check<0x1A, "bndcl">, XS, NotMemoryFoldable;
+defm BNDCU : mpx_bound_check<0x1A, "bndcu">, XD, NotMemoryFoldable;
+defm BNDCN : mpx_bound_check<0x1B, "bndcn">, XD, NotMemoryFoldable;
-def BNDMOVRMrr : I<0x1A, MRMSrcReg, (outs BNDR:$dst), (ins BNDR:$src),
- "bndmov\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PD,
- Requires<[HasMPX]>;
+def BNDMOVrr : I<0x1A, MRMSrcReg, (outs BNDR:$dst), (ins BNDR:$src),
+ "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+ Requires<[HasMPX]>, NotMemoryFoldable;
let mayLoad = 1 in {
-def BNDMOVRM32rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
- "bndmov\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PD,
- Requires<[HasMPX, Not64BitMode]>;
-def BNDMOVRM64rm : RI<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i128mem:$src),
- "bndmov\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PD,
- Requires<[HasMPX, In64BitMode]>;
+def BNDMOV32rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
+ "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+ Requires<[HasMPX, Not64BitMode]>, NotMemoryFoldable;
+def BNDMOV64rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i128mem:$src),
+ "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+ Requires<[HasMPX, In64BitMode]>, NotMemoryFoldable;
}
-def BNDMOVMRrr : I<0x1B, MRMDestReg, (outs BNDR:$dst), (ins BNDR:$src),
- "bndmov\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PD,
- Requires<[HasMPX]>;
+let isCodeGenOnly = 1, ForceDisassemble = 1 in
+def BNDMOVrr_REV : I<0x1B, MRMDestReg, (outs BNDR:$dst), (ins BNDR:$src),
+ "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+ Requires<[HasMPX]>, NotMemoryFoldable;
let mayStore = 1 in {
-def BNDMOVMR32mr : I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src),
- "bndmov\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PD,
- Requires<[HasMPX, Not64BitMode]>;
-def BNDMOVMR64mr : RI<0x1B, MRMDestMem, (outs), (ins i128mem:$dst, BNDR:$src),
- "bndmov\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PD,
- Requires<[HasMPX, In64BitMode]>;
+def BNDMOV32mr : I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src),
+ "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+ Requires<[HasMPX, Not64BitMode]>, NotMemoryFoldable;
+def BNDMOV64mr : I<0x1B, MRMDestMem, (outs), (ins i128mem:$dst, BNDR:$src),
+ "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+ Requires<[HasMPX, In64BitMode]>, NotMemoryFoldable;
-def BNDSTXmr: I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src),
- "bndstx\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PS,
+def BNDSTXmr: I<0x1B, MRMDestMem, (outs), (ins anymem:$dst, BNDR:$src),
+ "bndstx\t{$src, $dst|$dst, $src}", []>, PS,
Requires<[HasMPX]>;
}
let mayLoad = 1 in
def BNDLDXrm: I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins anymem:$src),
- "bndldx\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PS,
+ "bndldx\t{$src, $dst|$dst, $src}", []>, PS,
Requires<[HasMPX]>;
} // SchedRW
diff --git a/lib/Target/X86/X86InstrSGX.td b/lib/Target/X86/X86InstrSGX.td
index f4331c5e2d93..488cc4438076 100644
--- a/lib/Target/X86/X86InstrSGX.td
+++ b/lib/Target/X86/X86InstrSGX.td
@@ -15,7 +15,7 @@
//===----------------------------------------------------------------------===//
// SGX instructions
-let SchedRW = [WriteSystem] in {
+let SchedRW = [WriteSystem], Predicates = [HasSGX] in {
// ENCLS - Execute an Enclave System Function of Specified Leaf Number
def ENCLS : I<0x01, MRM_CF, (outs), (ins),
"encls", []>, TB;
@@ -23,4 +23,8 @@ def ENCLS : I<0x01, MRM_CF, (outs), (ins),
// ENCLU - Execute an Enclave User Function of Specified Leaf Number
def ENCLU : I<0x01, MRM_D7, (outs), (ins),
"enclu", []>, TB;
+
+// ENCLV - Execute an Enclave VMM Function of Specified Leaf Number
+def ENCLV : I<0x01, MRM_C0, (outs), (ins),
+ "enclv", []>, TB;
} // SchedRW
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index cb84f9aecf79..6a9b20998210 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -13,246 +13,6 @@
//
//===----------------------------------------------------------------------===//
-class OpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm> {
- InstrItinClass rr = arg_rr;
- InstrItinClass rm = arg_rm;
- // InstrSchedModel info.
- X86FoldableSchedWrite Sched = WriteFAdd;
-}
-
-class SizeItins<OpndItins arg_s, OpndItins arg_d> {
- OpndItins s = arg_s;
- OpndItins d = arg_d;
-}
-
-class MoveLoadStoreItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
- InstrItinClass arg_mr> {
- InstrItinClass rr = arg_rr;
- InstrItinClass rm = arg_rm;
- InstrItinClass mr = arg_mr;
-}
-
-class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
- InstrItinClass arg_ri> {
- InstrItinClass rr = arg_rr;
- InstrItinClass rm = arg_rm;
- InstrItinClass ri = arg_ri;
-}
-
-// scalar
-let Sched = WriteFAdd in {
-def SSE_ALU_F32S : OpndItins<
- IIC_SSE_ALU_F32S_RR, IIC_SSE_ALU_F32S_RM
->;
-
-def SSE_ALU_F64S : OpndItins<
- IIC_SSE_ALU_F64S_RR, IIC_SSE_ALU_F64S_RM
->;
-}
-
-def SSE_ALU_ITINS_S : SizeItins<
- SSE_ALU_F32S, SSE_ALU_F64S
->;
-
-let Sched = WriteFMul in {
-def SSE_MUL_F32S : OpndItins<
- IIC_SSE_MUL_F32S_RR, IIC_SSE_MUL_F64S_RM
->;
-
-def SSE_MUL_F64S : OpndItins<
- IIC_SSE_MUL_F64S_RR, IIC_SSE_MUL_F64S_RM
->;
-}
-
-def SSE_MUL_ITINS_S : SizeItins<
- SSE_MUL_F32S, SSE_MUL_F64S
->;
-
-let Sched = WriteFDiv in {
-def SSE_DIV_F32S : OpndItins<
- IIC_SSE_DIV_F32S_RR, IIC_SSE_DIV_F64S_RM
->;
-
-def SSE_DIV_F64S : OpndItins<
- IIC_SSE_DIV_F64S_RR, IIC_SSE_DIV_F64S_RM
->;
-}
-
-def SSE_DIV_ITINS_S : SizeItins<
- SSE_DIV_F32S, SSE_DIV_F64S
->;
-
-// parallel
-let Sched = WriteFAdd in {
-def SSE_ALU_F32P : OpndItins<
- IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM
->;
-
-def SSE_ALU_F64P : OpndItins<
- IIC_SSE_ALU_F64P_RR, IIC_SSE_ALU_F64P_RM
->;
-}
-
-def SSE_ALU_ITINS_P : SizeItins<
- SSE_ALU_F32P, SSE_ALU_F64P
->;
-
-let Sched = WriteFMul in {
-def SSE_MUL_F32P : OpndItins<
- IIC_SSE_MUL_F32P_RR, IIC_SSE_MUL_F64P_RM
->;
-
-def SSE_MUL_F64P : OpndItins<
- IIC_SSE_MUL_F64P_RR, IIC_SSE_MUL_F64P_RM
->;
-}
-
-def SSE_MUL_ITINS_P : SizeItins<
- SSE_MUL_F32P, SSE_MUL_F64P
->;
-
-let Sched = WriteFDiv in {
-def SSE_DIV_F32P : OpndItins<
- IIC_SSE_DIV_F32P_RR, IIC_SSE_DIV_F64P_RM
->;
-
-def SSE_DIV_F64P : OpndItins<
- IIC_SSE_DIV_F64P_RR, IIC_SSE_DIV_F64P_RM
->;
-}
-
-def SSE_DIV_ITINS_P : SizeItins<
- SSE_DIV_F32P, SSE_DIV_F64P
->;
-
-let Sched = WriteVecLogic in
-def SSE_BIT_ITINS_P : OpndItins<
- IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
->;
-
-let Sched = WriteVecALU in {
-def SSE_INTALU_ITINS_P : OpndItins<
- IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
->;
-
-def SSE_INTALUQ_ITINS_P : OpndItins<
- IIC_SSE_INTALUQ_P_RR, IIC_SSE_INTALUQ_P_RM
->;
-}
-
-let Sched = WriteVecIMul in
-def SSE_INTMUL_ITINS_P : OpndItins<
- IIC_SSE_INTMUL_P_RR, IIC_SSE_INTMUL_P_RM
->;
-
-// FIXME: Merge SSE_INTSHIFT_P + SSE_INTSHIFT_ITINS_P.
-def SSE_INTSHIFT_P : OpndItins<
- IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM
->;
-
-def SSE_INTSHIFT_ITINS_P : ShiftOpndItins<
- IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM, IIC_SSE_INTSH_P_RI
->;
-
-def SSE_MOVA_ITINS : OpndItins<
- IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM
->;
-
-def SSE_MOVA : MoveLoadStoreItins<
- IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM, IIC_SSE_MOVA_P_MR
->;
-
-def SSE_MOVU_ITINS : OpndItins<
- IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM
->;
-
-def SSE_MOVU : MoveLoadStoreItins<
- IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM, IIC_SSE_MOVU_P_MR
->;
-
-def SSE_DPPD_ITINS : OpndItins<
- IIC_SSE_DPPD_RR, IIC_SSE_DPPD_RM
->;
-
-def SSE_DPPS_ITINS : OpndItins<
- IIC_SSE_DPPS_RR, IIC_SSE_DPPD_RM
->;
-
-def DEFAULT_ITINS : OpndItins<
- IIC_ALU_NONMEM, IIC_ALU_MEM
->;
-
-def SSE_EXTRACT_ITINS : OpndItins<
- IIC_SSE_EXTRACTPS_RR, IIC_SSE_EXTRACTPS_RM
->;
-
-def SSE_INSERT_ITINS : OpndItins<
- IIC_SSE_INSERTPS_RR, IIC_SSE_INSERTPS_RM
->;
-
-let Sched = WriteMPSAD in
-def SSE_MPSADBW_ITINS : OpndItins<
- IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM
->;
-
-let Sched = WriteVecIMul in
-def SSE_PMULLD_ITINS : OpndItins<
- IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM
->;
-
-// Definitions for backward compatibility.
-// The instructions mapped on these definitions uses a different itinerary
-// than the actual scheduling model.
-let Sched = WriteShuffle in
-def DEFAULT_ITINS_SHUFFLESCHED : OpndItins<
- IIC_ALU_NONMEM, IIC_ALU_MEM
->;
-
-let Sched = WriteVecIMul in
-def DEFAULT_ITINS_VECIMULSCHED : OpndItins<
- IIC_ALU_NONMEM, IIC_ALU_MEM
->;
-
-let Sched = WriteShuffle in
-def SSE_INTALU_ITINS_SHUFF_P : OpndItins<
- IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
->;
-
-let Sched = WriteShuffle in
-def SSE_PACK : OpndItins<
- IIC_SSE_PACK, IIC_SSE_PACK
->;
-
-let Sched = WriteMPSAD in
-def DEFAULT_ITINS_MPSADSCHED : OpndItins<
- IIC_ALU_NONMEM, IIC_ALU_MEM
->;
-
-let Sched = WriteFBlend in
-def DEFAULT_ITINS_FBLENDSCHED : OpndItins<
- IIC_ALU_NONMEM, IIC_ALU_MEM
->;
-
-let Sched = WriteBlend in
-def DEFAULT_ITINS_BLENDSCHED : OpndItins<
- IIC_ALU_NONMEM, IIC_ALU_MEM
->;
-
-let Sched = WriteVarBlend in
-def DEFAULT_ITINS_VARBLENDSCHED : OpndItins<
- IIC_ALU_NONMEM, IIC_ALU_MEM
->;
-
-let Sched = WriteFBlend in
-def SSE_INTALU_ITINS_FBLEND_P : OpndItins<
- IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
->;
-
-let Sched = WriteBlend in
-def SSE_INTALU_ITINS_BLEND_P : OpndItins<
- IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
->;
-
//===----------------------------------------------------------------------===//
// SSE 1 & 2 Instructions Classes
//===----------------------------------------------------------------------===//
@@ -260,21 +20,22 @@ def SSE_INTALU_ITINS_BLEND_P : OpndItins<
/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
RegisterClass RC, X86MemOperand x86memop,
- Domain d, OpndItins itins, bit Is2Addr = 1> {
+ Domain d, X86FoldableSchedWrite sched,
+ bit Is2Addr = 1> {
let isCommutable = 1 in {
def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], itins.rr, d>,
- Sched<[itins.Sched]>;
+ [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
+ Sched<[sched]>;
}
def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], itins.rm, d>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
@@ -282,21 +43,21 @@ multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode, RegisterClass RC,
ValueType VT, string asm, Operand memopr,
ComplexPattern mem_cpat, Domain d,
- OpndItins itins, bit Is2Addr = 1> {
+ X86FoldableSchedWrite sched, bit Is2Addr = 1> {
let isCodeGenOnly = 1, hasSideEffects = 0 in {
def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
- Sched<[itins.Sched]>;
+ [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>,
+ Sched<[sched]>;
let mayLoad = 1 in
def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (VT (OpNode RC:$src1, mem_cpat:$src2)))], itins.rm, d>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ [(set RC:$dst, (VT (OpNode RC:$src1, mem_cpat:$src2)))], d>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
@@ -304,27 +65,29 @@ let isCodeGenOnly = 1, hasSideEffects = 0 in {
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
RegisterClass RC, ValueType vt,
X86MemOperand x86memop, PatFrag mem_frag,
- Domain d, OpndItins itins, bit Is2Addr = 1> {
+ Domain d, X86FoldableSchedWrite sched,
+ bit Is2Addr = 1> {
let isCommutable = 1 in
def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
- Sched<[itins.Sched]>;
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>,
+ Sched<[sched]>;
let mayLoad = 1 in
def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
- itins.rm, d>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ d>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
string OpcodeStr, X86MemOperand x86memop,
+ X86FoldableSchedWrite sched,
list<dag> pat_rr, list<dag> pat_rm,
bit Is2Addr = 1> {
let isCommutable = 1, hasSideEffects = 0 in
@@ -332,15 +95,15 @@ multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- pat_rr, IIC_SSE_BIT_P_RR, d>,
- Sched<[WriteVecLogic]>;
+ pat_rr, d>,
+ Sched<[sched]>;
let hasSideEffects = 0, mayLoad = 1 in
def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- pat_rm, IIC_SSE_BIT_P_RM, d>,
- Sched<[WriteVecLogicLd, ReadAfterLd]>;
+ pat_rm, d>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
@@ -360,7 +123,7 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
// Alias instruction that maps zero vector to pxor / xorp* for sse.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
-// swizzled by ExecutionDepsFix to pxor.
+// swizzled by ExecutionDomainFix to pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
@@ -415,22 +178,22 @@ multiclass sse12_move_rr<SDNode OpNode, ValueType vt,
def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!strconcat(base_opc, asm_opr),
- [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))],
- IIC_SSE_MOV_S_RR, d>, Sched<[WriteFShuffle]>;
+ [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>,
+ Sched<[SchedWriteFShuffle.XMM]>;
// For the disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
- !strconcat(base_opc, asm_opr),
- [], IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>,
- FoldGenData<Name#rr>;
+ !strconcat(base_opc, asm_opr), []>,
+ Sched<[SchedWriteFShuffle.XMM]>, FoldGenData<Name#rr>;
}
multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
X86MemOperand x86memop, string OpcodeStr,
- Domain d, string Name> {
+ Domain d, string Name, Predicate pred> {
// AVX
+ let Predicates = [UseAVX, OptForSize] in
defm V#NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}", d,
"V"#Name>,
@@ -438,18 +201,26 @@ multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>,
- VEX, VEX_LIG, Sched<[WriteStore]>, VEX_WIG;
+ [(store RC:$src, addr:$dst)], d>,
+ VEX, VEX_LIG, Sched<[WriteFStore]>, VEX_WIG;
// SSE1 & 2
let Constraints = "$src1 = $dst" in {
+ let Predicates = [pred, NoSSE41_Or_OptForSize] in
defm NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
"\t{$src2, $dst|$dst, $src2}", d, Name>;
}
def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>,
- Sched<[WriteStore]>;
+ [(store RC:$src, addr:$dst)], d>,
+ Sched<[WriteFStore]>;
+
+ def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (!cast<Instruction>("V"#NAME#"rr_REV")
+ VR128:$dst, VR128:$src1, VR128:$src2), 0>;
+ def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}",
+ (!cast<Instruction>(NAME#"rr_REV")
+ VR128:$dst, VR128:$src2), 0>;
}
// Loading from memory automatically zeroing upper bits.
@@ -457,37 +228,32 @@ multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
PatFrag mem_pat, string OpcodeStr, Domain d> {
def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (mem_pat addr:$src))],
- IIC_SSE_MOV_S_RM, d>, VEX, VEX_LIG, Sched<[WriteLoad]>, VEX_WIG;
+ [(set RC:$dst, (mem_pat addr:$src))], d>,
+ VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
def NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (mem_pat addr:$src))],
- IIC_SSE_MOV_S_RM, d>, Sched<[WriteLoad]>;
+ [(set RC:$dst, (mem_pat addr:$src))], d>,
+ Sched<[WriteFLoad]>;
}
defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
- SSEPackedSingle, "MOVSS">, XS;
+ SSEPackedSingle, "MOVSS", UseSSE1>, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
- SSEPackedDouble, "MOVSD">, XD;
+ SSEPackedDouble, "MOVSD", UseSSE2>, XD;
let canFoldAsLoad = 1, isReMaterializable = 1 in {
defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss",
SSEPackedSingle>, XS;
-
- let AddedComplexity = 20 in
- defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd",
- SSEPackedDouble>, XD;
+ defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd",
+ SSEPackedDouble>, XD;
}
// Patterns
let Predicates = [UseAVX] in {
- let AddedComplexity = 20 in {
// MOVSSrm zeros the high parts of the register; represent this
// with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
(COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
- def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
- (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
(COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
def : Pat<(v4f32 (X86vzload addr:$src)),
@@ -497,8 +263,6 @@ let Predicates = [UseAVX] in {
// with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
(COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
- def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
- (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
(COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
@@ -518,43 +282,45 @@ let Predicates = [UseAVX] in {
(SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
def : Pat<(v4f64 (X86vzload addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
- }
// Extract and store.
def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
addr:$dst),
(VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;
+}
- // Shuffle with VMOVSS
- def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
- (VMOVSSrr VR128:$src1, VR128:$src2)>;
-
- def : Pat<(v4f32 (X86Movss VR128:$src1, (scalar_to_vector FR32:$src2))),
- (VMOVSSrr VR128:$src1, (COPY_TO_REGCLASS FR32:$src2, VR128))>;
-
- // Shuffle with VMOVSD
- def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
- (VMOVSDrr VR128:$src1, VR128:$src2)>;
+let Predicates = [UseAVX, OptForSize] in {
+ // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+ // MOVSS to the lower bits.
+ def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+ (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+ (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
- def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))),
- (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS FR64:$src2, VR128))>;
+ // Move low f32 and clear high bits.
+ def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v4f32 (VMOVSSrr (v4f32 (V_SET0)),
+ (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>;
+ def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
+ (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
- // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
- // is during lowering, where it's not possible to recognize the fold cause
- // it has two uses through a bitcast. One use disappears at isel time and the
- // fold opportunity reappears.
- def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
- (VMOVSDrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
- (VMOVSDrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
- (VMOVSDrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
- (VMOVSDrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2f64 (VMOVSDrr (v2f64 (V_SET0)),
+ (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
+ sub_xmm)>;
+ def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2i64 (VMOVSDrr (v2i64 (V_SET0)),
+ (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
+ sub_xmm)>;
}
let Predicates = [UseSSE1] in {
- let Predicates = [NoSSE41], AddedComplexity = 15 in {
+ let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
// Move scalar to XMM zero-extended, zeroing a VR128 then do a
// MOVSS to the lower bits.
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
@@ -563,72 +329,30 @@ let Predicates = [UseSSE1] in {
(MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
}
- let AddedComplexity = 20 in {
// MOVSSrm already zeros the high parts of the register.
def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
(COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
- def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
- (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
(COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
def : Pat<(v4f32 (X86vzload addr:$src)),
(COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
- }
// Extract and store.
def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
addr:$dst),
(MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>;
-
- // Shuffle with MOVSS
- def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
- (MOVSSrr VR128:$src1, VR128:$src2)>;
-
- def : Pat<(v4f32 (X86Movss VR128:$src1, (scalar_to_vector FR32:$src2))),
- (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS FR32:$src2, VR128))>;
}
let Predicates = [UseSSE2] in {
- let Predicates = [NoSSE41], AddedComplexity = 15 in {
- // Move scalar to XMM zero-extended, zeroing a VR128 then do a
- // MOVSD to the lower bits.
- def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
- (MOVSDrr (v2f64 (V_SET0)), (COPY_TO_REGCLASS FR64:$src, VR128))>;
- }
-
- let AddedComplexity = 20 in {
// MOVSDrm already zeros the high parts of the register.
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
(COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
- def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
- (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
(COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
(COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
def : Pat<(v2f64 (X86vzload addr:$src)),
(COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
- }
-
- // Shuffle with MOVSD
- def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, VR128:$src2)>;
-
- def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))),
- (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS FR64:$src2, VR128))>;
-
- // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
- // is during lowering, where it's not possible to recognize the fold because
- // it has two uses through a bitcast. One use disappears at isel time and the
- // fold opportunity reappears.
- def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, VR128:$src2)>;
}
// Aliases to help the assembler pick two byte VEX encodings by swapping the
@@ -645,142 +369,144 @@ def : InstAlias<"vmovsd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
X86MemOperand x86memop, PatFrag ld_frag,
string asm, Domain d,
- OpndItins itins> {
-let hasSideEffects = 0 in
+ X86SchedWriteMoveLS sched> {
+let hasSideEffects = 0, isMoveReg = 1 in
def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
- !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>,
- Sched<[WriteFShuffle]>;
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
+ Sched<[sched.RR]>;
let canFoldAsLoad = 1, isReMaterializable = 1 in
def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>,
- Sched<[WriteLoad]>;
+ [(set RC:$dst, (ld_frag addr:$src))], d>,
+ Sched<[sched.RM]>;
}
let Predicates = [HasAVX, NoVLX] in {
-defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
- "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
- PS, VEX, VEX_WIG;
-defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
- "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
- PD, VEX, VEX_WIG;
-defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
- "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
- PS, VEX, VEX_WIG;
-defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
- "movupd", SSEPackedDouble, SSE_MOVU_ITINS>,
- PD, VEX, VEX_WIG;
-
-defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
- "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
- PS, VEX, VEX_L, VEX_WIG;
-defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
- "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
- PD, VEX, VEX_L, VEX_WIG;
-defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
- "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
- PS, VEX, VEX_L, VEX_WIG;
-defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
- "movupd", SSEPackedDouble, SSE_MOVU_ITINS>,
- PD, VEX, VEX_L, VEX_WIG;
+defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
+ SSEPackedSingle, SchedWriteFMoveLS.XMM>,
+ PS, VEX, VEX_WIG;
+defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
+ SSEPackedDouble, SchedWriteFMoveLS.XMM>,
+ PD, VEX, VEX_WIG;
+defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
+ SSEPackedSingle, SchedWriteFMoveLS.XMM>,
+ PS, VEX, VEX_WIG;
+defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
+ SSEPackedDouble, SchedWriteFMoveLS.XMM>,
+ PD, VEX, VEX_WIG;
+
+defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps",
+ SSEPackedSingle, SchedWriteFMoveLS.YMM>,
+ PS, VEX, VEX_L, VEX_WIG;
+defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd",
+ SSEPackedDouble, SchedWriteFMoveLS.YMM>,
+ PD, VEX, VEX_L, VEX_WIG;
+defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
+ SSEPackedSingle, SchedWriteFMoveLS.YMM>,
+ PS, VEX, VEX_L, VEX_WIG;
+defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
+ SSEPackedDouble, SchedWriteFMoveLS.YMM>,
+ PD, VEX, VEX_L, VEX_WIG;
}
let Predicates = [UseSSE1] in {
-defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
- "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
- PS;
-defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
- "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
- PS;
+defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
+ SSEPackedSingle, SchedWriteFMoveLS.XMM>,
+ PS;
+defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
+ SSEPackedSingle, SchedWriteFMoveLS.XMM>,
+ PS;
}
let Predicates = [UseSSE2] in {
-defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
- "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
- PD;
-defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
- "movupd", SSEPackedDouble, SSE_MOVU_ITINS>,
- PD;
+defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
+ SSEPackedDouble, SchedWriteFMoveLS.XMM>,
+ PD;
+defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
+ SSEPackedDouble, SchedWriteFMoveLS.XMM>,
+ PD;
}
-let SchedRW = [WriteStore], Predicates = [HasAVX, NoVLX] in {
+let Predicates = [HasAVX, NoVLX] in {
+let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movaps\t{$src, $dst|$dst, $src}",
- [(alignedstore (v4f32 VR128:$src), addr:$dst)],
- IIC_SSE_MOVA_P_MR>, VEX, VEX_WIG;
+ [(alignedstore (v4f32 VR128:$src), addr:$dst)]>,
+ VEX, VEX_WIG;
def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movapd\t{$src, $dst|$dst, $src}",
- [(alignedstore (v2f64 VR128:$src), addr:$dst)],
- IIC_SSE_MOVA_P_MR>, VEX, VEX_WIG;
+ [(alignedstore (v2f64 VR128:$src), addr:$dst)]>,
+ VEX, VEX_WIG;
def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movups\t{$src, $dst|$dst, $src}",
- [(store (v4f32 VR128:$src), addr:$dst)],
- IIC_SSE_MOVU_P_MR>, VEX, VEX_WIG;
+ [(store (v4f32 VR128:$src), addr:$dst)]>,
+ VEX, VEX_WIG;
def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movupd\t{$src, $dst|$dst, $src}",
- [(store (v2f64 VR128:$src), addr:$dst)],
- IIC_SSE_MOVU_P_MR>, VEX, VEX_WIG;
+ [(store (v2f64 VR128:$src), addr:$dst)]>,
+ VEX, VEX_WIG;
+} // SchedRW
+
+let SchedRW = [SchedWriteFMoveLS.YMM.MR] in {
def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movaps\t{$src, $dst|$dst, $src}",
- [(alignedstore (v8f32 VR256:$src), addr:$dst)],
- IIC_SSE_MOVA_P_MR>, VEX, VEX_L, VEX_WIG;
+ [(alignedstore (v8f32 VR256:$src), addr:$dst)]>,
+ VEX, VEX_L, VEX_WIG;
def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movapd\t{$src, $dst|$dst, $src}",
- [(alignedstore (v4f64 VR256:$src), addr:$dst)],
- IIC_SSE_MOVA_P_MR>, VEX, VEX_L, VEX_WIG;
+ [(alignedstore (v4f64 VR256:$src), addr:$dst)]>,
+ VEX, VEX_L, VEX_WIG;
def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movups\t{$src, $dst|$dst, $src}",
- [(store (v8f32 VR256:$src), addr:$dst)],
- IIC_SSE_MOVU_P_MR>, VEX, VEX_L, VEX_WIG;
+ [(store (v8f32 VR256:$src), addr:$dst)]>,
+ VEX, VEX_L, VEX_WIG;
def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movupd\t{$src, $dst|$dst, $src}",
- [(store (v4f64 VR256:$src), addr:$dst)],
- IIC_SSE_MOVU_P_MR>, VEX, VEX_L, VEX_WIG;
+ [(store (v4f64 VR256:$src), addr:$dst)]>,
+ VEX, VEX_L, VEX_WIG;
} // SchedRW
+} // Predicate
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
- SchedRW = [WriteFShuffle] in {
+ isMoveReg = 1 in {
+let SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src),
- "movaps\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, VEX, VEX_WIG,
- FoldGenData<"VMOVAPSrr">;
+ "movaps\t{$src, $dst|$dst, $src}", []>,
+ VEX, VEX_WIG, FoldGenData<"VMOVAPSrr">;
def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src),
- "movapd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, VEX, VEX_WIG,
- FoldGenData<"VMOVAPDrr">;
+ "movapd\t{$src, $dst|$dst, $src}", []>,
+ VEX, VEX_WIG, FoldGenData<"VMOVAPDrr">;
def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src),
- "movups\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, VEX, VEX_WIG,
- FoldGenData<"VMOVUPSrr">;
+ "movups\t{$src, $dst|$dst, $src}", []>,
+ VEX, VEX_WIG, FoldGenData<"VMOVUPSrr">;
def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src),
- "movupd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, VEX, VEX_WIG,
- FoldGenData<"VMOVUPDrr">;
+ "movupd\t{$src, $dst|$dst, $src}", []>,
+ VEX, VEX_WIG, FoldGenData<"VMOVUPDrr">;
+} // SchedRW
+
+let SchedRW = [SchedWriteFMoveLS.YMM.RR] in {
def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
- "movaps\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG,
- FoldGenData<"VMOVAPSYrr">;
+ "movaps\t{$src, $dst|$dst, $src}", []>,
+ VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPSYrr">;
def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
- "movapd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG,
- FoldGenData<"VMOVAPDYrr">;
+ "movapd\t{$src, $dst|$dst, $src}", []>,
+ VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPDYrr">;
def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
- "movups\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG,
- FoldGenData<"VMOVUPSYrr">;
+ "movups\t{$src, $dst|$dst, $src}", []>,
+ VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPSYrr">;
def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
- "movupd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG,
- FoldGenData<"VMOVUPDYrr">;
-}
+ "movupd\t{$src, $dst|$dst, $src}", []>,
+ VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPDYrr">;
+} // SchedRW
+} // Predicate
// Aliases to help the assembler pick two byte VEX encodings by swapping the
// operands relative to the normal instructions to use VEX.R instead of VEX.B.
@@ -801,42 +527,66 @@ def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}",
def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}",
(VMOVUPDYrr_REV VR256L:$dst, VR256H:$src), 0>;
-let SchedRW = [WriteStore] in {
+// Reversed version with ".s" suffix for GAS compatibility.
+def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
+ (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
+ (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
+ (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
+ (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
+ (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>;
+def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
+ (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>;
+def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
+ (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>;
+def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
+ (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>;
+
+let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movaps\t{$src, $dst|$dst, $src}",
- [(alignedstore (v4f32 VR128:$src), addr:$dst)],
- IIC_SSE_MOVA_P_MR>;
+ [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movapd\t{$src, $dst|$dst, $src}",
- [(alignedstore (v2f64 VR128:$src), addr:$dst)],
- IIC_SSE_MOVA_P_MR>;
+ [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movups\t{$src, $dst|$dst, $src}",
- [(store (v4f32 VR128:$src), addr:$dst)],
- IIC_SSE_MOVU_P_MR>;
+ [(store (v4f32 VR128:$src), addr:$dst)]>;
def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movupd\t{$src, $dst|$dst, $src}",
- [(store (v2f64 VR128:$src), addr:$dst)],
- IIC_SSE_MOVU_P_MR>;
+ [(store (v2f64 VR128:$src), addr:$dst)]>;
} // SchedRW
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
- SchedRW = [WriteFShuffle] in {
+ isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
- "movaps\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, FoldGenData<"MOVAPSrr">;
+ "movaps\t{$src, $dst|$dst, $src}", []>,
+ FoldGenData<"MOVAPSrr">;
def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
- "movapd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, FoldGenData<"MOVAPDrr">;
+ "movapd\t{$src, $dst|$dst, $src}", []>,
+ FoldGenData<"MOVAPDrr">;
def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
- "movups\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, FoldGenData<"MOVUPSrr">;
+ "movups\t{$src, $dst|$dst, $src}", []>,
+ FoldGenData<"MOVUPSrr">;
def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
- "movupd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, FoldGenData<"MOVUPDrr">;
+ "movupd\t{$src, $dst|$dst, $src}", []>,
+ FoldGenData<"MOVUPDrr">;
}
+// Reversed version with ".s" suffix for GAS compatibility.
+def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
+ (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
+ (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
+ (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
+ (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
+
let Predicates = [HasAVX, NoVLX] in {
// 256-bit load/store need to use floating point load/store in case we don't
// have AVX2. Execution domain fixing will convert to integer if AVX2 is
@@ -894,135 +644,82 @@ let Predicates = [UseSSE1] in {
// SSE 1 & 2 - Move Low packed FP Instructions
//===----------------------------------------------------------------------===//
-multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode,
- string base_opc, string asm_opr,
- InstrItinClass itin> {
+multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode pdnode,
+ string base_opc, string asm_opr> {
+ // No pattern, as they need to be special-cased between high and low.
+ let hasSideEffects = 0, mayLoad = 1 in
def PSrm : PI<opc, MRMSrcMem,
- (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
- !strconcat(base_opc, "s", asm_opr),
- [(set VR128:$dst,
- (psnode VR128:$src1,
- (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))],
- itin, SSEPackedSingle>, PS,
- Sched<[WriteFShuffleLd, ReadAfterLd]>;
+ (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+ !strconcat(base_opc, "s", asm_opr),
+ [], SSEPackedSingle>, PS,
+ Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>;
def PDrm : PI<opc, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
!strconcat(base_opc, "d", asm_opr),
[(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
(scalar_to_vector (loadf64 addr:$src2)))))],
- itin, SSEPackedDouble>, PD,
- Sched<[WriteFShuffleLd, ReadAfterLd]>;
-
+ SSEPackedDouble>, PD,
+ Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>;
}
-multiclass sse12_mov_hilo_packed<bits<8>opc, SDNode psnode, SDNode pdnode,
- string base_opc, InstrItinClass itin> {
+multiclass sse12_mov_hilo_packed<bits<8>opc, SDPatternOperator pdnode,
+ string base_opc> {
let Predicates = [UseAVX] in
- defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- itin>, VEX_4V, VEX_WIG;
+ defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
+ VEX_4V, VEX_WIG;
let Constraints = "$src1 = $dst" in
- defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
- "\t{$src2, $dst|$dst, $src2}",
- itin>;
+ defm NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
+ "\t{$src2, $dst|$dst, $src2}">;
}
-let AddedComplexity = 20 in {
- defm MOVL : sse12_mov_hilo_packed<0x12, X86Movlps, X86Movlpd, "movlp",
- IIC_SSE_MOV_LH>;
-}
+defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;
-let SchedRW = [WriteStore] in {
+let SchedRW = [WriteFStore] in {
let Predicates = [UseAVX] in {
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
- "movlps\t{$src, $dst|$dst, $src}",
- [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
- (iPTR 0))), addr:$dst)],
- IIC_SSE_MOV_LH>, VEX, VEX_WIG;
+ "movlps\t{$src, $dst|$dst, $src}",
+ [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
+ (iPTR 0))), addr:$dst)]>,
+ VEX, VEX_WIG;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
- "movlpd\t{$src, $dst|$dst, $src}",
- [(store (f64 (extractelt (v2f64 VR128:$src),
- (iPTR 0))), addr:$dst)],
- IIC_SSE_MOV_LH>, VEX, VEX_WIG;
+ "movlpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (extractelt (v2f64 VR128:$src),
+ (iPTR 0))), addr:$dst)]>,
+ VEX, VEX_WIG;
}// UseAVX
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlps\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
- (iPTR 0))), addr:$dst)],
- IIC_SSE_MOV_LH>;
+ (iPTR 0))), addr:$dst)]>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlpd\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt (v2f64 VR128:$src),
- (iPTR 0))), addr:$dst)],
- IIC_SSE_MOV_LH>;
+ (iPTR 0))), addr:$dst)]>;
} // SchedRW
-let Predicates = [UseAVX] in {
- // Shuffle with VMOVLPS
- def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
- (VMOVLPSrm VR128:$src1, addr:$src2)>;
-
- // Shuffle with VMOVLPD
- def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
- (VMOVLPDrm VR128:$src1, addr:$src2)>;
- def : Pat<(v2f64 (X86Movsd VR128:$src1,
- (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
- (VMOVLPDrm VR128:$src1, addr:$src2)>;
-
- // Store patterns
- def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
- addr:$src1),
- (VMOVLPSmr addr:$src1, VR128:$src2)>;
- def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
- addr:$src1),
- (VMOVLPDmr addr:$src1, VR128:$src2)>;
-}
-
let Predicates = [UseSSE1] in {
// (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
def : Pat<(store (i64 (extractelt (bc_v2i64 (v4f32 VR128:$src2)),
(iPTR 0))), addr:$src1),
(MOVLPSmr addr:$src1, VR128:$src2)>;
- // Shuffle with MOVLPS
- def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
- (MOVLPSrm VR128:$src1, addr:$src2)>;
- def : Pat<(X86Movlps VR128:$src1,
- (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
+ // This pattern helps select MOVLPS on SSE1-only targets. With SSE2 we'll
+ // end up with a movsd or blend instead of shufp.
+ // No need for an aligned load; we're only loading 64 bits.
+ def : Pat<(X86Shufp (loadv4f32 addr:$src2), VR128:$src1, (i8 -28)),
(MOVLPSrm VR128:$src1, addr:$src2)>;
-
- // Store patterns
- def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
- addr:$src1),
- (MOVLPSmr addr:$src1, VR128:$src2)>;
-}
-
-let Predicates = [UseSSE2] in {
- // Shuffle with MOVLPD
- def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
- (MOVLPDrm VR128:$src1, addr:$src2)>;
- def : Pat<(v2f64 (X86Movsd VR128:$src1,
- (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
- (MOVLPDrm VR128:$src1, addr:$src2)>;
-
- // Store patterns
- def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
- addr:$src1),
- (MOVLPDmr addr:$src1, VR128:$src2)>;
}
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Hi packed FP Instructions
//===----------------------------------------------------------------------===//
-let AddedComplexity = 20 in {
- defm MOVH : sse12_mov_hilo_packed<0x16, X86Movlhps, X86Unpckl, "movhp",
- IIC_SSE_MOV_LH>;
-}
+defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;
-let SchedRW = [WriteStore] in {
+let SchedRW = [WriteFStore] in {
// v2f64 extract element 1 is always custom lowered to unpack high to low
// and extract element 0 so the non-store version isn't too horrible.
let Predicates = [UseAVX] in {
@@ -1031,35 +728,27 @@ def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
[(store (f64 (extractelt
(X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
(bc_v2f64 (v4f32 VR128:$src))),
- (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX, VEX_WIG;
+ (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhpd\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt
(v2f64 (X86Unpckh VR128:$src, VR128:$src)),
- (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX, VEX_WIG;
+ (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
} // UseAVX
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhps\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt
(X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
(bc_v2f64 (v4f32 VR128:$src))),
- (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
+ (iPTR 0))), addr:$dst)]>;
def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhpd\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt
(v2f64 (X86Unpckh VR128:$src, VR128:$src)),
- (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
+ (iPTR 0))), addr:$dst)]>;
} // SchedRW
let Predicates = [UseAVX] in {
- // VMOVHPS patterns
- def : Pat<(X86Movlhps VR128:$src1,
- (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
- (VMOVHPSrm VR128:$src1, addr:$src2)>;
- def : Pat<(X86Movlhps VR128:$src1,
- (bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
- (VMOVHPSrm VR128:$src1, addr:$src2)>;
-
// Also handle an i64 load because that may get selected as a faster way to
// load the data.
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
@@ -1067,23 +756,16 @@ let Predicates = [UseAVX] in {
(VMOVHPDrm VR128:$src1, addr:$src2)>;
def : Pat<(store (f64 (extractelt
- (bc_v2f64 (v4f32 (X86Movhlps VR128:$src, VR128:$src))),
- (iPTR 0))), addr:$dst),
- (VMOVHPDmr addr:$dst, VR128:$src)>;
-
- def : Pat<(store (f64 (extractelt
(v2f64 (X86VPermilpi VR128:$src, (i8 1))),
(iPTR 0))), addr:$dst),
(VMOVHPDmr addr:$dst, VR128:$src)>;
}
let Predicates = [UseSSE1] in {
- // MOVHPS patterns
- def : Pat<(X86Movlhps VR128:$src1,
- (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
- (MOVHPSrm VR128:$src1, addr:$src2)>;
- def : Pat<(X86Movlhps VR128:$src1,
- (bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
+ // This pattern helps select MOVHPS on SSE1-only targets. With SSE2 we'll
+ // end up with a movsd or blend instead of shufp.
+ // No need for an aligned load; we're only loading 64 bits.
+ def : Pat<(X86Movlhps VR128:$src1, (loadv4f32 addr:$src2)),
(MOVHPSrm VR128:$src1, addr:$src2)>;
}
@@ -1097,11 +779,6 @@ let Predicates = [UseSSE2] in {
(MOVHPDrm VR128:$src1, addr:$src2)>;
def : Pat<(store (f64 (extractelt
- (bc_v2f64 (v4f32 (X86Movhlps VR128:$src, VR128:$src))),
- (iPTR 0))), addr:$dst),
- (MOVHPDmr addr:$dst, VR128:$src)>;
-
- def : Pat<(store (f64 (extractelt
(v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
(iPTR 0))), addr:$dst),
(MOVHPDmr addr:$dst, VR128:$src)>;
@@ -1111,206 +788,149 @@ let Predicates = [UseSSE2] in {
// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
//===----------------------------------------------------------------------===//
-let AddedComplexity = 20, Predicates = [UseAVX] in {
+let Predicates = [UseAVX] in {
def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
"movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
- (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
- IIC_SSE_MOV_LH>,
- VEX_4V, Sched<[WriteFShuffle]>, VEX_WIG;
+ (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
+ VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG;
+ let isCommutable = 1 in
def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
"movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
- (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
- IIC_SSE_MOV_LH>,
- VEX_4V, Sched<[WriteFShuffle]>, VEX_WIG;
+ (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
+ VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG,
+ NotMemoryFoldable;
}
-let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
+let Constraints = "$src1 = $dst" in {
def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
"movlhps\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
- (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
- IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
+ (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
+ Sched<[SchedWriteFShuffle.XMM]>;
let isCommutable = 1 in
def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
"movhlps\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
- (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
- IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
-}
-
-//===----------------------------------------------------------------------===//
-// SSE 1 & 2 - Conversion Instructions
-//===----------------------------------------------------------------------===//
-
-let Sched = WriteCvtF2I in {
-def SSE_CVT_SS2SI_32 : OpndItins<
- IIC_SSE_CVT_SS2SI32_RR, IIC_SSE_CVT_SS2SI32_RM
->;
-
-let Sched = WriteCvtF2I in
-def SSE_CVT_SS2SI_64 : OpndItins<
- IIC_SSE_CVT_SS2SI64_RR, IIC_SSE_CVT_SS2SI64_RM
->;
-
-def SSE_CVT_SD2SI : OpndItins<
- IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM
->;
-
-def SSE_CVT_PS2I : OpndItins<
- IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
->;
-
-def SSE_CVT_PD2I : OpndItins<
- IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
->;
+ (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
+ Sched<[SchedWriteFShuffle.XMM]>, NotMemoryFoldable;
}
-let Sched = WriteCvtI2F in {
-def SSE_CVT_SI2SS : OpndItins<
- IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
->;
-
-def SSE_CVT_SI2SD : OpndItins<
- IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
->;
-
-def SSE_CVT_I2PS : OpndItins<
- IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
->;
+// TODO: This is largely to trick fastisel into ignoring the pattern.
+def UnpckhUnary : PatFrag<(ops node:$src1, node:$src2),
+ (X86Unpckh node:$src1, node:$src2), [{
+ return N->getOperand(0) == N->getOperand(1);
+}]>;
-def SSE_CVT_I2PD : OpndItins<
- IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
->;
+let Predicates = [UseSSE2] in {
+ // TODO: This is a hack pattern to allow lowering to emit unpckh instead of
+ // movhlps for sse2 without changing a bunch of tests.
+ def : Pat<(v2f64 (UnpckhUnary VR128:$src, VR128:$src)),
+ (MOVHLPSrr VR128:$src, VR128:$src)>;
}
-let Sched = WriteCvtF2F in {
-def SSE_CVT_SD2SS : OpndItins<
- IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
->;
-
-def SSE_CVT_SS2SD : OpndItins<
- IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
->;
-
-def SSE_CVT_PD2PS : OpndItins<
- IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
->;
-
-def SSE_CVT_PS2PD : OpndItins<
- IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
->;
-
-def SSE_CVT_PH2PS : OpndItins<
- IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
->;
-
-def SSE_CVT_PS2PH : OpndItins<
- IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
->;
-}
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Conversion Instructions
+//===----------------------------------------------------------------------===//
-// FIXME: We probably want to match the rm form only when optimizing for
-// size, to avoid false depenendecies (see sse_fp_unop_s for details)
multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
- string asm, OpndItins itins> {
+ string asm, X86FoldableSchedWrite sched> {
def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
- [(set DstRC:$dst, (OpNode SrcRC:$src))],
- itins.rr>, Sched<[itins.Sched]>;
+ [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
+ Sched<[sched]>;
def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
- [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))],
- itins.rm>, Sched<[itins.Sched.Folded]>;
+ [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
+ Sched<[sched.Folded]>;
}
multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
- string asm, Domain d, OpndItins itins> {
+ string asm, Domain d, X86FoldableSchedWrite sched> {
let hasSideEffects = 0 in {
def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
- [(set RC:$dst, (DstTy (sint_to_fp (SrcTy RC:$src))))],
- itins.rr, d>, Sched<[itins.Sched]>;
+ [(set RC:$dst, (DstTy (sint_to_fp (SrcTy RC:$src))))], d>,
+ Sched<[sched]>;
let mayLoad = 1 in
def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
[(set RC:$dst, (DstTy (sint_to_fp
- (SrcTy (bitconvert (ld_frag addr:$src))))))],
- itins.rm, d>, Sched<[itins.Sched.Folded]>;
+ (SrcTy (bitconvert (ld_frag addr:$src))))))], d>,
+ Sched<[sched.Folded]>;
}
}
-// FIXME: We probably want to match the rm form only when optimizing for
-// size, to avoid false depenendecies (see sse_fp_unop_s for details)
multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
- X86MemOperand x86memop, string asm, OpndItins itins> {
+ X86MemOperand x86memop, string asm,
+ X86FoldableSchedWrite sched> {
let hasSideEffects = 0, Predicates = [UseAVX] in {
def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
- !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), [],
- itins.rr>, Sched<[itins.Sched]>;
+ !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+ Sched<[sched]>;
let mayLoad = 1 in
def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
(ins DstRC:$src1, x86memop:$src),
!strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>;
} // hasSideEffects = 0
}
let Predicates = [UseAVX] in {
defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
"cvttss2si\t{$src, $dst|$dst, $src}",
- SSE_CVT_SS2SI_32>,
+ WriteCvtSS2I>,
XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
"cvttss2si\t{$src, $dst|$dst, $src}",
- SSE_CVT_SS2SI_64>,
+ WriteCvtSS2I>,
XS, VEX, VEX_W, VEX_LIG;
defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
"cvttsd2si\t{$src, $dst|$dst, $src}",
- SSE_CVT_SD2SI>,
+ WriteCvtSD2I>,
XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
"cvttsd2si\t{$src, $dst|$dst, $src}",
- SSE_CVT_SD2SI>,
+ WriteCvtSD2I>,
XD, VEX, VEX_W, VEX_LIG;
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
- (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
+ (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0, "att">;
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
- (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
+ (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
- (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
+ (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
- (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
+ (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
- (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
+ (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
- (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
+ (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
- (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
+ (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
- (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
+ (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0, "att">;
}
// The assembler can recognize rr 64-bit instructions by seeing an rxx
// register, but the same isn't true when only using memory operands, so
// provide other assembly "l" and "q" forms to address this explicitly
// where appropriate to do so.
defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}",
- SSE_CVT_SI2SS>, XS, VEX_4V, VEX_LIG;
+ WriteCvtI2SS>, XS, VEX_4V, VEX_LIG;
defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}",
- SSE_CVT_SI2SS>, XS, VEX_4V, VEX_W, VEX_LIG;
+ WriteCvtI2SS>, XS, VEX_4V, VEX_W, VEX_LIG;
defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}",
- SSE_CVT_SI2SD>, XD, VEX_4V, VEX_LIG;
+ WriteCvtI2SD>, XD, VEX_4V, VEX_LIG;
defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}",
- SSE_CVT_SI2SD>, XD, VEX_4V, VEX_W, VEX_LIG;
+ WriteCvtI2SD>, XD, VEX_4V, VEX_W, VEX_LIG;
let Predicates = [UseAVX] in {
def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
- (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0>;
+ (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0, "att">;
def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
- (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0>;
+ (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0, "att">;
def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
(VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
@@ -1333,50 +953,50 @@ let Predicates = [UseAVX] in {
defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
"cvttss2si\t{$src, $dst|$dst, $src}",
- SSE_CVT_SS2SI_32>, XS;
+ WriteCvtSS2I>, XS;
defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
"cvttss2si\t{$src, $dst|$dst, $src}",
- SSE_CVT_SS2SI_64>, XS, REX_W;
+ WriteCvtSS2I>, XS, REX_W;
defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
"cvttsd2si\t{$src, $dst|$dst, $src}",
- SSE_CVT_SD2SI>, XD;
+ WriteCvtSD2I>, XD;
defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
"cvttsd2si\t{$src, $dst|$dst, $src}",
- SSE_CVT_SD2SI>, XD, REX_W;
+ WriteCvtSD2I>, XD, REX_W;
defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
"cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
- SSE_CVT_SI2SS>, XS;
+ WriteCvtI2SS>, XS;
defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
"cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
- SSE_CVT_SI2SS>, XS, REX_W;
+ WriteCvtI2SS>, XS, REX_W;
defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
"cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
- SSE_CVT_SI2SD>, XD;
+ WriteCvtI2SD>, XD;
defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
"cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
- SSE_CVT_SI2SD>, XD, REX_W;
+ WriteCvtI2SD>, XD, REX_W;
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
- (CVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
+ (CVTTSS2SIrr GR32:$dst, FR32:$src), 0, "att">;
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
- (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
+ (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
- (CVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
+ (CVTTSD2SIrr GR32:$dst, FR64:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
- (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
+ (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
- (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
+ (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
- (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
+ (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
- (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
+ (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
- (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
+ (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
- (CVTSI2SSrm FR64:$dst, i32mem:$src), 0>;
+ (CVTSI2SSrm FR64:$dst, i32mem:$src), 0, "att">;
def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
- (CVTSI2SDrm FR64:$dst, i32mem:$src), 0>;
+ (CVTSI2SDrm FR64:$dst, i32mem:$src), 0, "att">;
// Conversion Instructions Intrinsics - Match intrinsics which expect MM
// and/or XMM operand(s).
@@ -1384,81 +1004,72 @@ def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
// FIXME: We probably want to match the rm form only when optimizing for
// size, to avoid false dependencies (see sse_fp_unop_s for details)
multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
- Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
- string asm, OpndItins itins> {
+ Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
+ string asm, X86FoldableSchedWrite sched> {
def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>,
- Sched<[itins.Sched]>;
+ [(set DstRC:$dst, (Int SrcRC:$src))]>,
+ Sched<[sched]>;
def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(set DstRC:$dst, (Int mem_cpat:$src))], itins.rm>,
- Sched<[itins.Sched.Folded]>;
+ [(set DstRC:$dst, (Int mem_cpat:$src))]>,
+ Sched<[sched.Folded]>;
}
multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
- RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
- PatFrag ld_frag, string asm, OpndItins itins,
+ RegisterClass DstRC, X86MemOperand x86memop,
+ string asm, X86FoldableSchedWrite sched,
bit Is2Addr = 1> {
+let hasSideEffects = 0 in {
def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
- itins.rr>, Sched<[itins.Sched]>;
+ []>, Sched<[sched]>;
+ let mayLoad = 1 in
def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst),
(ins DstRC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
- itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ []>, Sched<[sched.Folded, ReadAfterLd]>;
+}
}
let Predicates = [UseAVX] in {
defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32,
int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si",
- SSE_CVT_SD2SI>, XD, VEX, VEX_LIG;
+ WriteCvtSD2I>, XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si",
- SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG;
+ WriteCvtSD2I>, XD, VEX, VEX_W, VEX_LIG;
}
defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
- sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD;
+ sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD;
defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
- sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, REX_W;
+ sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD, REX_W;
let isCodeGenOnly = 1 in {
let Predicates = [UseAVX] in {
defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
- int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}",
- SSE_CVT_SI2SS, 0>, XS, VEX_4V;
+ i32mem, "cvtsi2ss{l}", WriteCvtI2SS, 0>, XS, VEX_4V;
defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
- int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}",
- SSE_CVT_SI2SS, 0>, XS, VEX_4V,
- VEX_W;
+ i64mem, "cvtsi2ss{q}", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_W;
defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
- int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}",
- SSE_CVT_SI2SD, 0>, XD, VEX_4V;
+ i32mem, "cvtsi2sd{l}", WriteCvtI2SD, 0>, XD, VEX_4V;
defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
- int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}",
- SSE_CVT_SI2SD, 0>, XD,
- VEX_4V, VEX_W;
+ i64mem, "cvtsi2sd{q}", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_W;
}
let Constraints = "$src1 = $dst" in {
defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
- int_x86_sse_cvtsi2ss, i32mem, loadi32,
- "cvtsi2ss{l}", SSE_CVT_SI2SS>, XS;
+ i32mem, "cvtsi2ss{l}", WriteCvtI2SS>, XS;
defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
- int_x86_sse_cvtsi642ss, i64mem, loadi64,
- "cvtsi2ss{q}", SSE_CVT_SI2SS>, XS, REX_W;
+ i64mem, "cvtsi2ss{q}", WriteCvtI2SS>, XS, REX_W;
defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
- int_x86_sse2_cvtsi2sd, i32mem, loadi32,
- "cvtsi2sd{l}", SSE_CVT_SI2SD>, XD;
+ i32mem, "cvtsi2sd{l}", WriteCvtI2SD>, XD;
defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
- int_x86_sse2_cvtsi642sd, i64mem, loadi64,
- "cvtsi2sd{q}", SSE_CVT_SI2SD>, XD, REX_W;
+ i64mem, "cvtsi2sd{q}", WriteCvtI2SD>, XD, REX_W;
}
} // isCodeGenOnly = 1
@@ -1469,113 +1080,113 @@ let isCodeGenOnly = 1 in {
let Predicates = [UseAVX] in {
defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
ssmem, sse_load_f32, "cvttss2si",
- SSE_CVT_SS2SI_32>, XS, VEX;
+ WriteCvtSS2I>, XS, VEX;
defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
- "cvttss2si", SSE_CVT_SS2SI_64>,
+ "cvttss2si", WriteCvtSS2I>,
XS, VEX, VEX_W;
defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
sdmem, sse_load_f64, "cvttsd2si",
- SSE_CVT_SD2SI>, XD, VEX;
+ WriteCvtSS2I>, XD, VEX;
defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
- "cvttsd2si", SSE_CVT_SD2SI>,
+ "cvttsd2si", WriteCvtSS2I>,
XD, VEX, VEX_W;
}
defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
ssmem, sse_load_f32, "cvttss2si",
- SSE_CVT_SS2SI_32>, XS;
+ WriteCvtSS2I>, XS;
defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
- "cvttss2si", SSE_CVT_SS2SI_64>, XS, REX_W;
+ "cvttss2si", WriteCvtSS2I>, XS, REX_W;
defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
sdmem, sse_load_f64, "cvttsd2si",
- SSE_CVT_SD2SI>, XD;
+ WriteCvtSD2I>, XD;
defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
- "cvttsd2si", SSE_CVT_SD2SI>, XD, REX_W;
+ "cvttsd2si", WriteCvtSD2I>, XD, REX_W;
} // isCodeGenOnly = 1
let Predicates = [UseAVX] in {
defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
ssmem, sse_load_f32, "cvtss2si",
- SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG;
+ WriteCvtSS2I>, XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
ssmem, sse_load_f32, "cvtss2si",
- SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG;
+ WriteCvtSS2I>, XS, VEX, VEX_W, VEX_LIG;
}
defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
ssmem, sse_load_f32, "cvtss2si",
- SSE_CVT_SS2SI_32>, XS;
+ WriteCvtSS2I>, XS;
defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
ssmem, sse_load_f32, "cvtss2si",
- SSE_CVT_SS2SI_64>, XS, REX_W;
+ WriteCvtSS2I>, XS, REX_W;
defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, loadv2i64,
"vcvtdq2ps\t{$src, $dst|$dst, $src}",
- SSEPackedSingle, SSE_CVT_I2PS>,
+ SSEPackedSingle, WriteCvtI2PS>,
PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, loadv4i64,
"vcvtdq2ps\t{$src, $dst|$dst, $src}",
- SSEPackedSingle, SSE_CVT_I2PS>,
+ SSEPackedSingle, WriteCvtI2PSY>,
PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;
defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memopv2i64,
"cvtdq2ps\t{$src, $dst|$dst, $src}",
- SSEPackedSingle, SSE_CVT_I2PS>,
+ SSEPackedSingle, WriteCvtI2PS>,
PS, Requires<[UseSSE2]>;
let Predicates = [UseAVX] in {
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
- (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0>;
+ (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
- (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0>;
+ (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
- (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0>;
+ (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
- (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0>;
+ (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
- (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0>;
+ (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
- (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0>;
+ (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
- (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0>;
+ (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
- (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0>;
+ (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
}
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
- (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0>;
+ (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
- (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0>;
+ (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
- (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0>;
+ (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
- (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0>;
+ (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
- (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0>;
+ (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
- (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0>;
+ (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
- (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0>;
+ (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
- (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0>;
+ (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
/// SSE 2 Only
// Convert scalar double to scalar single
let hasSideEffects = 0, Predicates = [UseAVX] in {
def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
- (ins FR32:$src1, FR64:$src2),
- "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
- IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG,
- Sched<[WriteCvtF2F]>, VEX_WIG, NotMemoryFoldable;
+ (ins FR32:$src1, FR64:$src2),
+ "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ VEX_4V, VEX_LIG, VEX_WIG,
+ Sched<[WriteCvtSD2SS]>;
let mayLoad = 1 in
def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
- (ins FR32:$src1, f64mem:$src2),
- "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [], IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, VEX_LIG,
- Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG, NotMemoryFoldable;
+ (ins FR32:$src1, f64mem:$src2),
+ "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ XD, VEX_4V, VEX_LIG, VEX_WIG,
+ Sched<[WriteCvtSD2SS.Folded, ReadAfterLd]>;
}
def : Pat<(f32 (fpround FR64:$src)),
@@ -1584,69 +1195,67 @@ def : Pat<(f32 (fpround FR64:$src)),
def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
"cvtsd2ss\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (fpround FR64:$src))],
- IIC_SSE_CVT_Scalar_RR>, Sched<[WriteCvtF2F]>;
+ [(set FR32:$dst, (fpround FR64:$src))]>,
+ Sched<[WriteCvtSD2SS]>;
def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
- "cvtsd2ss\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (fpround (loadf64 addr:$src)))],
- IIC_SSE_CVT_Scalar_RM>,
- XD,
- Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
+ "cvtsd2ss\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (fpround (loadf64 addr:$src)))]>,
+ XD, Requires<[UseSSE2, OptForSize]>,
+ Sched<[WriteCvtSD2SS.Folded]>;
let isCodeGenOnly = 1 in {
def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
- (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
- IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, VEX_WIG,
- Requires<[HasAVX]>, Sched<[WriteCvtF2F]>;
+ (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))]>,
+ XD, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
+ Sched<[WriteCvtSD2SS]>;
def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst, (int_x86_sse2_cvtsd2ss
- VR128:$src1, sse_load_f64:$src2))],
- IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, VEX_WIG,
- Requires<[HasAVX]>, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
-
+ VR128:$src1, sse_load_f64:$src2))]>,
+ XD, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
+ Sched<[WriteCvtSD2SS.Folded, ReadAfterLd]>;
let Constraints = "$src1 = $dst" in {
def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"cvtsd2ss\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
- (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
- IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>,
- Sched<[WriteCvtF2F]>;
+ (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))]>,
+ XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"cvtsd2ss\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst, (int_x86_sse2_cvtsd2ss
- VR128:$src1, sse_load_f64:$src2))],
- IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>,
- Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+ VR128:$src1, sse_load_f64:$src2))]>,
+ XD, Requires<[UseSSE2]>,
+ Sched<[WriteCvtSD2SS.Folded, ReadAfterLd]>;
}
} // isCodeGenOnly = 1
// Convert scalar single to scalar double
// SSE2 instructions with XS prefix
-let hasSideEffects = 0, Predicates = [UseAVX] in {
+let hasSideEffects = 0 in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
(ins FR64:$src1, FR32:$src2),
- "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [], IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, VEX_LIG,
- Sched<[WriteCvtF2F]>, VEX_WIG, NotMemoryFoldable;
+ "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ XS, VEX_4V, VEX_LIG, VEX_WIG,
+ Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
(ins FR64:$src1, f32mem:$src2),
- "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [], IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, VEX_LIG,
- Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG, NotMemoryFoldable;
+ "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ XS, VEX_4V, VEX_LIG, VEX_WIG,
+ Sched<[WriteCvtSS2SD.Folded, ReadAfterLd]>,
+ Requires<[UseAVX, OptForSize]>;
}
def : Pat<(f64 (fpextend FR32:$src)),
(VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
def : Pat<(fpextend (loadf32 addr:$src)),
- (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;
+ (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;
def : Pat<(extloadf32 addr:$src),
(VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>,
@@ -1657,14 +1266,13 @@ def : Pat<(extloadf32 addr:$src),
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (fpextend FR32:$src))],
- IIC_SSE_CVT_Scalar_RR>, XS,
- Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>;
+ [(set FR64:$dst, (fpextend FR32:$src))]>,
+ XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (extloadf32 addr:$src))],
- IIC_SSE_CVT_Scalar_RM>, XS,
- Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
+ [(set FR64:$dst, (extloadf32 addr:$src))]>,
+ XS, Requires<[UseSSE2, OptForSize]>,
+ Sched<[WriteCvtSS2SD.Folded]>;
// extload f32 -> f64. This matches load+fpextend because we have a hack in
// the isel (PreprocessForFPConvert) that can introduce loads after dag
@@ -1672,40 +1280,34 @@ def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
// Since these loads aren't folded into the fpextend, we have to match it
// explicitly here.
def : Pat<(fpextend (loadf32 addr:$src)),
- (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>;
+ (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2, OptForSize]>;
def : Pat<(extloadf32 addr:$src),
(CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
-let isCodeGenOnly = 1 in {
+let isCodeGenOnly = 1, hasSideEffects = 0 in {
def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR128:$dst,
- (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
- IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, VEX_WIG,
- Requires<[HasAVX]>, Sched<[WriteCvtF2F]>;
+ []>, XS, VEX_4V, VEX_WIG,
+ Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
+let mayLoad = 1 in
def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR128:$dst,
- (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
- IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, VEX_WIG,
- Requires<[HasAVX]>, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+ []>, XS, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
+ Sched<[WriteCvtSS2SD.Folded, ReadAfterLd]>;
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"cvtss2sd\t{$src2, $dst|$dst, $src2}",
- [(set VR128:$dst,
- (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
- IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>,
- Sched<[WriteCvtF2F]>;
+ []>, XS, Requires<[UseSSE2]>,
+ Sched<[WriteCvtSS2SD]>;
+let mayLoad = 1 in
def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
"cvtss2sd\t{$src2, $dst|$dst, $src2}",
- [(set VR128:$dst,
- (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
- IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>,
- Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+ []>, XS, Requires<[UseSSE2]>,
+ Sched<[WriteCvtSS2SD.Folded, ReadAfterLd]>;
}
} // isCodeGenOnly = 1
@@ -1732,9 +1334,19 @@ def : Pat<(v4f32 (X86Movss
def : Pat<(v4f32 (X86Movss
(v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))),
+ (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
(v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
(VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))),
+ (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>;
+
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
(v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
@@ -1742,8 +1354,18 @@ def : Pat<(v2f64 (X86Movsd
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))),
+ (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
(v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
(VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))),
+ (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseAVX]
let Predicates = [UseSSE2] in {
@@ -1766,8 +1388,18 @@ def : Pat<(v2f64 (X86Movsd
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))),
+ (CVTSI642SDrm_Int VR128:$dst, addr:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
(v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
(CVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))),
+ (CVTSI2SDrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseSSE2]
let Predicates = [UseSSE1] in {
@@ -1778,39 +1410,51 @@ def : Pat<(v4f32 (X86Movss
def : Pat<(v4f32 (X86Movss
(v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))),
+ (CVTSI642SSrm_Int VR128:$dst, addr:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
(v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
(CVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))),
+ (CVTSI2SSrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseSSE1]
+let Predicates = [HasAVX, NoVLX] in {
// Convert packed single/double fp to doubleword
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
- IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>, VEX_WIG;
+ [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
+ VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_sse2_cvtps2dq (loadv4f32 addr:$src)))],
- IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>, VEX_WIG;
+ (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>,
+ VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
- (int_x86_avx_cvt_ps2dq_256 VR256:$src))],
- IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>, VEX_WIG;
+ (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>,
+ VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
- (int_x86_avx_cvt_ps2dq_256 (loadv8f32 addr:$src)))],
- IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>, VEX_WIG;
+ (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>,
+ VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
+}
def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
- IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
+ [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
+ Sched<[WriteCvtPS2I]>;
def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))],
- IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
+ (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
+ Sched<[WriteCvtPS2ILd]>;
// Convert Packed Double FP to Packed DW Integers
@@ -1822,7 +1466,7 @@ def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
- VEX, Sched<[WriteCvtF2I]>, VEX_WIG;
+ VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
// XMM only
def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
@@ -1831,37 +1475,37 @@ def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
- Sched<[WriteCvtF2ILd]>, VEX_WIG;
+ Sched<[WriteCvtPD2ILd]>, VEX_WIG;
def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
- (VCVTPD2DQrm VR128:$dst, f128mem:$src), 0>;
+ (VCVTPD2DQrm VR128:$dst, f128mem:$src), 0, "intel">;
// YMM only
def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"vcvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
- VEX, VEX_L, Sched<[WriteCvtF2I]>, VEX_WIG;
+ VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
- VEX, VEX_L, Sched<[WriteCvtF2ILd]>, VEX_WIG;
+ VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
(VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>;
def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
- (VCVTPD2DQYrm VR128:$dst, f256mem:$src), 0>;
+ (VCVTPD2DQYrm VR128:$dst, f256mem:$src), 0, "intel">;
}
def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))],
- IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>;
+ (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>,
+ Sched<[WriteCvtPD2ILd]>;
def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))],
- IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
+ (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
+ Sched<[WriteCvtPD2I]>;
// Convert with truncation packed single/double fp to doubleword
// SSE2 packed instructions with XS prefix
@@ -1869,43 +1513,61 @@ let Predicates = [HasAVX, NoVLX] in {
def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (fp_to_sint (v4f32 VR128:$src))))],
- IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>, VEX_WIG;
+ (v4i32 (X86cvttp2si (v4f32 VR128:$src))))]>,
+ VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (fp_to_sint (loadv4f32 addr:$src))))],
- IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>, VEX_WIG;
+ (v4i32 (X86cvttp2si (loadv4f32 addr:$src))))]>,
+ VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
- (v8i32 (fp_to_sint (v8f32 VR256:$src))))],
- IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>, VEX_WIG;
+ (v8i32 (X86cvttp2si (v8f32 VR256:$src))))]>,
+ VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
- (v8i32 (fp_to_sint (loadv8f32 addr:$src))))],
- IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
- Sched<[WriteCvtF2ILd]>, VEX_WIG;
+ (v8i32 (X86cvttp2si (loadv8f32 addr:$src))))]>,
+ VEX, VEX_L,
+ Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
+ (VCVTTPS2DQrr VR128:$src)>;
+ def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))),
+ (VCVTTPS2DQrm addr:$src)>;
+ def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))),
+ (VCVTTPS2DQYrr VR256:$src)>;
+ def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))),
+ (VCVTTPS2DQYrm addr:$src)>;
}
def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (fp_to_sint (v4f32 VR128:$src))))],
- IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
+ (v4i32 (X86cvttp2si (v4f32 VR128:$src))))]>,
+ Sched<[WriteCvtPS2I]>;
def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (fp_to_sint (memopv4f32 addr:$src))))],
- IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
+ (v4i32 (X86cvttp2si (memopv4f32 addr:$src))))]>,
+ Sched<[WriteCvtPS2ILd]>;
+
+let Predicates = [UseSSE2] in {
+ def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
+ (CVTTPS2DQrr VR128:$src)>;
+ def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
+ (CVTTPS2DQrm addr:$src)>;
+}
let Predicates = [HasAVX, NoVLX] in
def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (X86cvttp2si (v2f64 VR128:$src))))],
- IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>, VEX_WIG;
+ (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>,
+ VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
@@ -1914,76 +1576,80 @@ def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
// XMM only
def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
(VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>;
+
let Predicates = [HasAVX, NoVLX] in
def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (X86cvttp2si (loadv2f64 addr:$src))))],
- IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>, VEX_WIG;
+ (v4i32 (X86cvttp2si (loadv2f64 addr:$src))))]>,
+ VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG;
def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
- (VCVTTPD2DQrm VR128:$dst, f128mem:$src), 0>;
+ (VCVTTPD2DQrm VR128:$dst, f128mem:$src), 0, "intel">;
// YMM only
let Predicates = [HasAVX, NoVLX] in {
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (fp_to_sint (v4f64 VR256:$src))))],
- IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>, VEX_WIG;
+ (v4i32 (X86cvttp2si (v4f64 VR256:$src))))]>,
+ VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (fp_to_sint (loadv4f64 addr:$src))))],
- IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>, VEX_WIG;
+ (v4i32 (X86cvttp2si (loadv4f64 addr:$src))))]>,
+ VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
}
def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
(VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
- (VCVTTPD2DQYrm VR128:$dst, f256mem:$src), 0>;
+ (VCVTTPD2DQYrm VR128:$dst, f256mem:$src), 0, "intel">;
let Predicates = [HasAVX, NoVLX] in {
- let AddedComplexity = 15 in {
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
- (VCVTPD2DQrr VR128:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvtp2Int (loadv2f64 addr:$src)))))),
- (VCVTPD2DQrm addr:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
- (VCVTTPD2DQrr VR128:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))),
- (VCVTTPD2DQrm addr:$src)>;
- }
+ def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
+ (VCVTTPD2DQYrr VR256:$src)>;
+ def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))),
+ (VCVTTPD2DQYrm addr:$src)>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
+ (VCVTPD2DQrr VR128:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvtp2Int (loadv2f64 addr:$src)))))),
+ (VCVTPD2DQrm addr:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
+ (VCVTTPD2DQrr VR128:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))),
+ (VCVTTPD2DQrm addr:$src)>;
} // Predicates = [HasAVX, NoVLX]
def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (X86cvttp2si (v2f64 VR128:$src))))],
- IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
+ (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>,
+ Sched<[WriteCvtPD2I]>;
def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (X86cvttp2si (memopv2f64 addr:$src))))],
- IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>;
+ (v4i32 (X86cvttp2si (memopv2f64 addr:$src))))]>,
+ Sched<[WriteCvtPD2ILd]>;
let Predicates = [UseSSE2] in {
- let AddedComplexity = 15 in {
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
- (CVTPD2DQrr VR128:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvtp2Int (memopv2f64 addr:$src)))))),
- (CVTPD2DQrm addr:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
- (CVTTPD2DQrr VR128:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvttp2si (memopv2f64 addr:$src)))))),
- (CVTTPD2DQrm addr:$src)>;
- }
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
+ (CVTPD2DQrr VR128:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvtp2Int (memopv2f64 addr:$src)))))),
+ (CVTPD2DQrm addr:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
+ (CVTTPD2DQrr VR128:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvttp2si (memopv2f64 addr:$src)))))),
+ (CVTTPD2DQrm addr:$src)>;
} // Predicates = [UseSSE2]
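The X86vzmovl patterns in the two blocks above exist because the 128-bit cvtpd2dq and cvttpd2dq forms produce only two doubleword results: the instruction writes them to the low qword of the destination and zeroes the upper qword, so an explicit zeroing of the upper half in the DAG adds nothing and can be folded into the conversion. Schematically (an editorial sketch of the equivalence those patterns encode, not new patterns):

  // (X86vzmovl (v2i64 (bitconvert (v4i32 (X86cvtp2Int (v2f64 VR128:$x))))))
  //     ==>  (VCVTPD2DQrr VR128:$x)    ; upper 64 bits of $dst are already zero
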
// Convert packed single to packed double
@@ -1991,31 +1657,31 @@ let Predicates = [HasAVX, NoVLX] in {
// SSE2 instructions without OpSize prefix
def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))],
- IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>, VEX_WIG;
+ [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))]>,
+ PS, VEX, Sched<[WriteCvtPS2PD]>, VEX_WIG;
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, PS, VEX, Sched<[WriteCvtF2FLd]>, VEX_WIG;
+ [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
+ PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, VEX_WIG;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
- [(set VR256:$dst, (v4f64 (fpextend (v4f32 VR128:$src))))],
- IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>, VEX_WIG;
+ [(set VR256:$dst, (v4f64 (fpextend (v4f32 VR128:$src))))]>,
+ PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, VEX_WIG;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
- [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>, VEX_WIG;
+ [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>,
+ PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, VEX_WIG;
}
let Predicates = [UseSSE2] in {
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))],
- IIC_SSE_CVT_PD_RR>, PS, Sched<[WriteCvtF2F]>;
+ [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))]>,
+ PS, Sched<[WriteCvtPS2PD]>;
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"cvtps2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, PS, Sched<[WriteCvtF2FLd]>;
+ [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
+ PS, Sched<[WriteCvtPS2PD.Folded]>;
}
// Convert Packed DW Integers to Packed Double FP
@@ -2025,35 +1691,36 @@ def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>,
- VEX, Sched<[WriteCvtI2FLd]>, VEX_WIG;
+ VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG;
def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>,
- VEX, Sched<[WriteCvtI2F]>, VEX_WIG;
+ VEX, Sched<[WriteCvtI2PD]>, VEX_WIG;
def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))))]>,
- VEX, VEX_L, Sched<[WriteCvtI2FLd]>, VEX_WIG;
+ VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>,
+ VEX_WIG;
def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(v4f64 (sint_to_fp (v4i32 VR128:$src))))]>,
- VEX, VEX_L, Sched<[WriteCvtI2F]>, VEX_WIG;
+ VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG;
}
let hasSideEffects = 0, mayLoad = 1 in
def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"cvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))],
- IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2FLd]>;
+ (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>,
+ Sched<[WriteCvtI2PDLd]>;
def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2f64 (X86VSintToFP (v4i32 VR128:$src))))],
- IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtI2F]>;
+ (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>,
+ Sched<[WriteCvtI2PD]>;
// AVX register conversion intrinsics
let Predicates = [HasAVX, NoVLX] in {
@@ -2078,8 +1745,8 @@ let Predicates = [UseSSE2] in {
let Predicates = [HasAVX, NoVLX] in
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))],
- IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>, VEX_WIG;
+ [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>,
+ VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG;
// XMM only
def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
@@ -2087,35 +1754,35 @@ def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
let Predicates = [HasAVX, NoVLX] in
def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>, VEX_WIG;
+ [(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))]>,
+ VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG;
def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
- (VCVTPD2PSrm VR128:$dst, f128mem:$src), 0>;
+ (VCVTPD2PSrm VR128:$dst, f128mem:$src), 0, "intel">;
// YMM only
let Predicates = [HasAVX, NoVLX] in {
def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (fpround VR256:$src))],
- IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>, VEX_WIG;
+ [(set VR128:$dst, (fpround VR256:$src))]>,
+ VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (fpround (loadv4f64 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>, VEX_WIG;
+ [(set VR128:$dst, (fpround (loadv4f64 addr:$src)))]>,
+ VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG;
}
def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
(VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>;
def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
- (VCVTPD2PSYrm VR128:$dst, f256mem:$src), 0>;
+ (VCVTPD2PSYrm VR128:$dst, f256mem:$src), 0, "intel">;
def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))],
- IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2F]>;
+ [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>,
+ Sched<[WriteCvtPD2PS]>;
def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (X86vfpround (memopv2f64 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2FLd]>;
+ [(set VR128:$dst, (X86vfpround (memopv2f64 addr:$src)))]>,
+ Sched<[WriteCvtPD2PS.Folded]>;
// AVX 256-bit register conversion intrinsics
// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below
@@ -2123,64 +1790,53 @@ def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
let Predicates = [HasAVX, NoVLX] in {
// Match fpround and fpextend for 128/256-bit conversions
- let AddedComplexity = 15 in {
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
- (VCVTPD2PSrr VR128:$src)>;
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86vfpround (loadv2f64 addr:$src)))))),
- (VCVTPD2PSrm addr:$src)>;
- }
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
+ (VCVTPD2PSrr VR128:$src)>;
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86vfpround (loadv2f64 addr:$src)))))),
+ (VCVTPD2PSrm addr:$src)>;
}
let Predicates = [UseSSE2] in {
// Match fpround and fpextend for 128 conversions
- let AddedComplexity = 15 in {
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
- (CVTPD2PSrr VR128:$src)>;
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86vfpround (memopv2f64 addr:$src)))))),
- (CVTPD2PSrm addr:$src)>;
- }
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
+ (CVTPD2PSrr VR128:$src)>;
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86vfpround (memopv2f64 addr:$src)))))),
+ (CVTPD2PSrm addr:$src)>;
}
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Compare Instructions
//===----------------------------------------------------------------------===//
-let Sched = WriteFAdd in
-def SSE_COMIS : OpndItins<
- IIC_SSE_COMIS_RR, IIC_SSE_COMIS_RM
->;
-
// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
Operand CC, SDNode OpNode, ValueType VT,
PatFrag ld_frag, string asm, string asm_alt,
- OpndItins itins> {
+ X86FoldableSchedWrite sched> {
let isCommutable = 1 in
def rr : SIi8<0xC2, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
- [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))],
- itins.rr>, Sched<[itins.Sched]>;
+ [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))]>,
+ Sched<[sched]>;
def rm : SIi8<0xC2, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
[(set RC:$dst, (OpNode (VT RC:$src1),
- (ld_frag addr:$src2), imm:$cc))],
- itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (ld_frag addr:$src2), imm:$cc))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
// Accept explicit immediate argument form instead of comparison code.
let isAsmParserOnly = 1, hasSideEffects = 0 in {
def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
- (ins RC:$src1, RC:$src2, u8imm:$cc), asm_alt, [],
- IIC_SSE_ALU_F32S_RR>, Sched<[itins.Sched]>;
+ (ins RC:$src1, RC:$src2, u8imm:$cc), asm_alt, []>,
+ Sched<[sched]>, NotMemoryFoldable;
let mayLoad = 1 in
def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, [],
- IIC_SSE_ALU_F32S_RM>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, []>,
+ Sched<[sched.Folded, ReadAfterLd]>, NotMemoryFoldable;
}
}
@@ -2188,43 +1844,41 @@ let ExeDomain = SSEPackedSingle in
defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32,
"cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSE_ALU_F32S>, XS, VEX_4V, VEX_LIG, VEX_WIG;
+ SchedWriteFCmpSizes.PS.Scl>, XS, VEX_4V, VEX_LIG, VEX_WIG;
let ExeDomain = SSEPackedDouble in
defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64,
"cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSE_ALU_F32S>, // same latency as 32 bit compare
+ SchedWriteFCmpSizes.PD.Scl>,
XD, VEX_4V, VEX_LIG, VEX_WIG;
let Constraints = "$src1 = $dst" in {
let ExeDomain = SSEPackedSingle in
defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32,
"cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
- "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S>,
- XS;
+ "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+ SchedWriteFCmpSizes.PS.Scl>, XS;
let ExeDomain = SSEPackedDouble in
defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64,
"cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
"cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
- SSE_ALU_F64S>, XD;
+ SchedWriteFCmpSizes.PD.Scl>, XD;
}
multiclass sse12_cmp_scalar_int<Operand memop, Operand CC,
- Intrinsic Int, string asm, OpndItins itins,
+ Intrinsic Int, string asm, X86FoldableSchedWrite sched,
ComplexPattern mem_cpat> {
def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src, CC:$cc), asm,
[(set VR128:$dst, (Int VR128:$src1,
- VR128:$src, imm:$cc))],
- itins.rr>,
- Sched<[itins.Sched]>;
+ VR128:$src, imm:$cc))]>,
+ Sched<[sched]>;
let mayLoad = 1 in
def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, memop:$src, CC:$cc), asm,
[(set VR128:$dst, (Int VR128:$src1,
- mem_cpat:$src, imm:$cc))],
- itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ mem_cpat:$src, imm:$cc))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
let isCodeGenOnly = 1 in {
@@ -2232,174 +1886,168 @@ let isCodeGenOnly = 1 in {
let ExeDomain = SSEPackedSingle in
defm VCMPSS : sse12_cmp_scalar_int<ssmem, AVXCC, int_x86_sse_cmp_ss,
"cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
- SSE_ALU_F32S, sse_load_f32>, XS, VEX_4V;
+ SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS, VEX_4V;
let ExeDomain = SSEPackedDouble in
defm VCMPSD : sse12_cmp_scalar_int<sdmem, AVXCC, int_x86_sse2_cmp_sd,
"cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
- SSE_ALU_F32S, sse_load_f64>, // same latency as f32
+ SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
XD, VEX_4V;
let Constraints = "$src1 = $dst" in {
let ExeDomain = SSEPackedSingle in
defm CMPSS : sse12_cmp_scalar_int<ssmem, SSECC, int_x86_sse_cmp_ss,
"cmp${cc}ss\t{$src, $dst|$dst, $src}",
- SSE_ALU_F32S, sse_load_f32>, XS;
+ SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
let ExeDomain = SSEPackedDouble in
defm CMPSD : sse12_cmp_scalar_int<sdmem, SSECC, int_x86_sse2_cmp_sd,
"cmp${cc}sd\t{$src, $dst|$dst, $src}",
- SSE_ALU_F64S, sse_load_f64>, XD;
+ SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
}
}
// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
- ValueType vt, X86MemOperand x86memop,
- PatFrag ld_frag, string OpcodeStr,
- OpndItins itins> {
+ ValueType vt, X86MemOperand x86memop,
+ PatFrag ld_frag, string OpcodeStr,
+ X86FoldableSchedWrite sched> {
let hasSideEffects = 0 in {
def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
- [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
- itins.rr>,
- Sched<[itins.Sched]>;
+ [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
+ Sched<[sched]>;
let mayLoad = 1 in
def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1),
- (ld_frag addr:$src2)))],
- itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (ld_frag addr:$src2)))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
- ValueType vt, Operand memop,
- ComplexPattern mem_cpat, string OpcodeStr,
- OpndItins itins> {
- def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
+ ValueType vt, Operand memop,
+ ComplexPattern mem_cpat, string OpcodeStr,
+ X86FoldableSchedWrite sched> {
+ def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
- [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
- itins.rr>,
- Sched<[itins.Sched]>;
+ [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
+ Sched<[sched]>;
let mayLoad = 1 in
- def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
+ def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1),
- mem_cpat:$src2))],
- itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ mem_cpat:$src2))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
let Defs = [EFLAGS] in {
defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
- "ucomiss", SSE_COMIS>, PS, VEX, VEX_LIG, VEX_WIG;
+ "ucomiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
- "ucomisd", SSE_COMIS>, PD, VEX, VEX_LIG, VEX_WIG;
+ "ucomisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
let Pattern = []<dag> in {
defm VCOMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
- "comiss", SSE_COMIS>, PS, VEX, VEX_LIG, VEX_WIG;
+ "comiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
defm VCOMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
- "comisd", SSE_COMIS>, PD, VEX, VEX_LIG, VEX_WIG;
+ "comisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
}
let isCodeGenOnly = 1 in {
- defm Int_VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
- sse_load_f32, "ucomiss", SSE_COMIS>, PS, VEX, VEX_WIG;
- defm Int_VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
- sse_load_f64, "ucomisd", SSE_COMIS>, PD, VEX, VEX_WIG;
-
- defm Int_VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
- sse_load_f32, "comiss", SSE_COMIS>, PS, VEX, VEX_WIG;
- defm Int_VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
- sse_load_f64, "comisd", SSE_COMIS>, PD, VEX, VEX_WIG;
+ defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
+ sse_load_f32, "ucomiss", WriteFCom>, PS, VEX, VEX_WIG;
+ defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
+ sse_load_f64, "ucomisd", WriteFCom>, PD, VEX, VEX_WIG;
+
+ defm VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
+ sse_load_f32, "comiss", WriteFCom>, PS, VEX, VEX_WIG;
+ defm VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
+ sse_load_f64, "comisd", WriteFCom>, PD, VEX, VEX_WIG;
}
defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
- "ucomiss", SSE_COMIS>, PS;
+ "ucomiss", WriteFCom>, PS;
defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
- "ucomisd", SSE_COMIS>, PD;
+ "ucomisd", WriteFCom>, PD;
let Pattern = []<dag> in {
defm COMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
- "comiss", SSE_COMIS>, PS;
+ "comiss", WriteFCom>, PS;
defm COMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
- "comisd", SSE_COMIS>, PD;
+ "comisd", WriteFCom>, PD;
}
let isCodeGenOnly = 1 in {
- defm Int_UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
- sse_load_f32, "ucomiss", SSE_COMIS>, PS;
- defm Int_UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
- sse_load_f64, "ucomisd", SSE_COMIS>, PD;
-
- defm Int_COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
- sse_load_f32, "comiss", SSE_COMIS>, PS;
- defm Int_COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
- sse_load_f64, "comisd", SSE_COMIS>, PD;
+ defm UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
+ sse_load_f32, "ucomiss", WriteFCom>, PS;
+ defm UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
+ sse_load_f64, "ucomisd", WriteFCom>, PD;
+
+ defm COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
+ sse_load_f32, "comiss", WriteFCom>, PS;
+ defm COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
+ sse_load_f64, "comisd", WriteFCom>, PD;
}
} // Defs = [EFLAGS]
// sse12_cmp_packed - sse 1 & 2 compare packed instructions
multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
Operand CC, ValueType VT, string asm,
- string asm_alt, Domain d,
- PatFrag ld_frag, OpndItins itins = SSE_ALU_F32P> {
+ string asm_alt, X86FoldableSchedWrite sched,
+ Domain d, PatFrag ld_frag> {
let isCommutable = 1 in
def rri : PIi8<0xC2, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
- [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, imm:$cc)))],
- itins.rr, d>,
- Sched<[WriteFAdd]>;
+ [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, imm:$cc)))], d>,
+ Sched<[sched]>;
def rmi : PIi8<0xC2, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
[(set RC:$dst,
- (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), imm:$cc)))],
- itins.rm, d>,
- Sched<[WriteFAddLd, ReadAfterLd]>;
+ (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), imm:$cc)))], d>,
+ Sched<[sched.Folded, ReadAfterLd]>;
// Accept explicit immediate argument form instead of comparison code.
let isAsmParserOnly = 1, hasSideEffects = 0 in {
def rri_alt : PIi8<0xC2, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc),
- asm_alt, [], itins.rr, d>, Sched<[WriteFAdd]>;
+ asm_alt, [], d>, Sched<[sched]>, NotMemoryFoldable;
let mayLoad = 1 in
def rmi_alt : PIi8<0xC2, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc),
- asm_alt, [], itins.rm, d>,
- Sched<[WriteFAddLd, ReadAfterLd]>;
+ asm_alt, [], d>, Sched<[sched.Folded, ReadAfterLd]>,
+ NotMemoryFoldable;
}
}
defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, v4f32,
"cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG;
+ SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG;
defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, v2f64,
"cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG;
+ SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG;
defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, v8f32,
"cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L;
+ SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, v4f64,
"cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L;
+ SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in {
defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, v4f32,
"cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
"cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
- SSEPackedSingle, memopv4f32, SSE_ALU_F32P>, PS;
+ SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS;
defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, v2f64,
"cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
"cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
- SSEPackedDouble, memopv2f64, SSE_ALU_F64P>, PD;
+ SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD;
}
def CommutableCMPCC : PatLeaf<(imm), [{
- return (N->getZExtValue() == 0x00 || N->getZExtValue() == 0x03 ||
- N->getZExtValue() == 0x04 || N->getZExtValue() == 0x07);
+ uint64_t Imm = N->getZExtValue() & 0x7;
+ return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07);
}]>;
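CommutableCMPCC now masks the immediate to its low three bits before testing it. Those bits select the base compare predicate, and only the four symmetric ones are accepted; masking presumably also admits the AVX predicates above 7, whose low three bits repeat the same cycle. For reference, the base predicate encoding (sketched here, not part of the patch):

  // imm[2:0]  predicate   commutable?
  //   0b000   EQ          yes
  //   0b001   LT          no
  //   0b010   LE          no
  //   0b011   UNORD       yes
  //   0b100   NEQ         yes
  //   0b101   NLT         no
  //   0b110   NLE         no
  //   0b111   ORD         yes
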
// Patterns to select compares with loads in first operand.
@@ -2453,120 +2101,114 @@ let Predicates = [UseSSE1] in {
// SSE 1 & 2 - Shuffle Instructions
//===----------------------------------------------------------------------===//
-let Sched = WriteFShuffle in
-def SSE_SHUFP : OpndItins<
- IIC_SSE_SHUFP, IIC_SSE_SHUFP
->;
-
/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
ValueType vt, string asm, PatFrag mem_frag,
- OpndItins itins, Domain d> {
+ X86FoldableSchedWrite sched, Domain d> {
def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
[(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
- (i8 imm:$src3))))], itins.rm, d>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i8 imm:$src3))))], d>,
+ Sched<[sched.Folded, ReadAfterLd]>;
def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, u8imm:$src3), asm,
[(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
- (i8 imm:$src3))))], itins.rr, d>,
- Sched<[itins.Sched]>;
+ (i8 imm:$src3))))], d>,
+ Sched<[sched]>;
}
let Predicates = [HasAVX, NoVLX] in {
defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
"shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- loadv4f32, SSE_SHUFP, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
+ loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>,
+ PS, VEX_4V, VEX_WIG;
defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
"shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- loadv8f32, SSE_SHUFP, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
+ loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>,
+ PS, VEX_4V, VEX_L, VEX_WIG;
defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
"shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- loadv2f64, SSE_SHUFP, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
+ loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>,
+ PD, VEX_4V, VEX_WIG;
defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
"shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- loadv4f64, SSE_SHUFP, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
+ loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>,
+ PD, VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
"shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- memopv4f32, SSE_SHUFP, SSEPackedSingle>, PS;
+ memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
"shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- memopv2f64, SSE_SHUFP, SSEPackedDouble>, PD;
+ memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
}
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Unpack FP Instructions
//===----------------------------------------------------------------------===//
-let Sched = WriteFShuffle in
-def SSE_UNPCK : OpndItins<
- IIC_SSE_UNPCK, IIC_SSE_UNPCK
->;
-
/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
PatFrag mem_frag, RegisterClass RC,
X86MemOperand x86memop, string asm,
- OpndItins itins, Domain d, bit IsCommutable = 0> {
+ X86FoldableSchedWrite sched, Domain d,
+ bit IsCommutable = 0> {
let isCommutable = IsCommutable in
def rr : PI<opc, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2),
asm, [(set RC:$dst,
- (vt (OpNode RC:$src1, RC:$src2)))],
- itins.rr, d>, Sched<[itins.Sched]>;
+ (vt (OpNode RC:$src1, RC:$src2)))], d>,
+ Sched<[sched]>;
def rm : PI<opc, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2),
asm, [(set RC:$dst,
(vt (OpNode RC:$src1,
- (mem_frag addr:$src2))))],
- itins.rm, d>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (mem_frag addr:$src2))))], d>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
let Predicates = [HasAVX, NoVLX] in {
defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32,
VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSE_UNPCK, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
+ SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64,
VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSE_UNPCK, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
+ SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG;
defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32,
VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSE_UNPCK, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
+ SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64,
VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSE_UNPCK, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
+ SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32,
VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSE_UNPCK, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
+ SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64,
VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSE_UNPCK, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
+ SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32,
VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSE_UNPCK, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
+ SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64,
VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSE_UNPCK, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
+ SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
}// Predicates = [HasAVX, NoVLX]
let Constraints = "$src1 = $dst" in {
defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
- SSE_UNPCK, SSEPackedSingle>, PS;
+ SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
- SSE_UNPCK, SSEPackedDouble, 1>, PD;
+ SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
- SSE_UNPCK, SSEPackedSingle>, PS;
+ SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
- SSE_UNPCK, SSEPackedDouble>, PD;
+ SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
} // Constraints = "$src1 = $dst"
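The shuffle and unpack multiclasses above now take a single X86FoldableSchedWrite; the vector-width dimension is chosen at instantiation time by picking a member of an X86SchedWriteWidths group (SchedWriteFShuffle here), so the 128-bit defms pass SchedWriteFShuffle.XMM and the 256-bit ones SchedWriteFShuffle.YMM. A minimal sketch of that idiom, using a hypothetical foo_width multiclass and VFOO opcodes (SchedWriteFShuffle is the real group used above):

  multiclass foo_width<RegisterClass RC, X86FoldableSchedWrite sched> {
    def rr : I<0x00, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
               "foo\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
             Sched<[sched]>;
  }
  defm VFOO  : foo_width<VR128, SchedWriteFShuffle.XMM>;   // 128-bit form
  defm VFOOY : foo_width<VR256, SchedWriteFShuffle.YMM>;   // 256-bit form
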
let Predicates = [HasAVX1Only] in {
@@ -2598,8 +2240,8 @@ multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
string asm, Domain d> {
def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], IIC_SSE_MOVMSK, d>,
- Sched<[WriteVecLogic]>;
+ [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>,
+ Sched<[WriteFMOVMSK]>;
}
let Predicates = [HasAVX] in {
@@ -2627,7 +2269,7 @@ let ExeDomain = SSEPackedInt in { // SSE integer instructions
/// PDI_binop_rm - Simple SSE2 binary operator.
multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
- X86MemOperand x86memop, OpndItins itins,
+ X86MemOperand x86memop, X86FoldableSchedWrite sched,
bit IsCommutable, bit Is2Addr> {
let isCommutable = IsCommutable in
def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
@@ -2635,47 +2277,48 @@ multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
- Sched<[itins.Sched]>;
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
+ Sched<[sched]>;
def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (OpVT (OpNode RC:$src1,
- (bitconvert (memop_frag addr:$src2)))))],
- itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (memop_frag addr:$src2)))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
} // ExeDomain = SSEPackedInt
multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
ValueType OpVT128, ValueType OpVT256,
- OpndItins itins, bit IsCommutable = 0, Predicate prd> {
+ X86SchedWriteWidths sched, bit IsCommutable,
+ Predicate prd> {
let Predicates = [HasAVX, prd] in
defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
- VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V, VEX_WIG;
+ VR128, loadv2i64, i128mem, sched.XMM,
+ IsCommutable, 0>, VEX_4V, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
- memopv2i64, i128mem, itins, IsCommutable, 1>;
+ memopv2i64, i128mem, sched.XMM, IsCommutable, 1>;
let Predicates = [HasAVX2, prd] in
defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
- OpVT256, VR256, loadv4i64, i256mem, itins,
+ OpVT256, VR256, loadv4i64, i256mem, sched.YMM,
IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
}
// These are ordered here for pattern ordering requirements with the fp versions
defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
- SSE_BIT_ITINS_P, 1, NoVLX>;
+ SchedWriteVecLogic, 1, NoVLX>;
defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
- SSE_BIT_ITINS_P, 1, NoVLX>;
+ SchedWriteVecLogic, 1, NoVLX>;
defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
- SSE_BIT_ITINS_P, 1, NoVLX>;
+ SchedWriteVecLogic, 1, NoVLX>;
defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
- SSE_BIT_ITINS_P, 0, NoVLX>;
+ SchedWriteVecLogic, 0, NoVLX>;
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Logical Instructions
@@ -2686,41 +2329,41 @@ defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
/// There are no patterns here because isel prefers integer versions for SSE2
/// and later. There are SSE1 v4f32 patterns later.
multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
- SDNode OpNode> {
+ SDNode OpNode, X86SchedWriteWidths sched> {
let Predicates = [HasAVX, NoVLX] in {
defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
- !strconcat(OpcodeStr, "ps"), f256mem,
+ !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM,
[], [], 0>, PS, VEX_4V, VEX_L, VEX_WIG;
defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
- !strconcat(OpcodeStr, "pd"), f256mem,
+ !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM,
[], [], 0>, PD, VEX_4V, VEX_L, VEX_WIG;
defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
- !strconcat(OpcodeStr, "ps"), f128mem,
+ !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
[], [], 0>, PS, VEX_4V, VEX_WIG;
defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
- !strconcat(OpcodeStr, "pd"), f128mem,
+ !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
[], [], 0>, PD, VEX_4V, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
- !strconcat(OpcodeStr, "ps"), f128mem,
+ !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
[], []>, PS;
defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
- !strconcat(OpcodeStr, "pd"), f128mem,
+ !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
[], []>, PD;
}
}
-defm AND : sse12_fp_packed_logical<0x54, "and", and>;
-defm OR : sse12_fp_packed_logical<0x56, "or", or>;
-defm XOR : sse12_fp_packed_logical<0x57, "xor", xor>;
+defm AND : sse12_fp_packed_logical<0x54, "and", and, SchedWriteFLogic>;
+defm OR : sse12_fp_packed_logical<0x56, "or", or, SchedWriteFLogic>;
+defm XOR : sse12_fp_packed_logical<0x57, "xor", xor, SchedWriteFLogic>;
let isCommutable = 0 in
- defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>;
+ defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp, SchedWriteFLogic>;
// If only AVX1 is supported, we need to handle integer operations with
// floating point instructions since the integer versions aren't available.
@@ -2747,78 +2390,94 @@ let Predicates = [HasAVX1Only] in {
let Predicates = [HasAVX, NoVLX_Or_NoDQI] in {
// Use packed logical operations for scalar ops.
def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS (VANDPDrr
- (COPY_TO_REGCLASS FR64:$src1, VR128),
- (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ (COPY_TO_REGCLASS
+ (v2f64 (VANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
+ (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
+ FR64)>;
def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS (VORPDrr
- (COPY_TO_REGCLASS FR64:$src1, VR128),
- (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ (COPY_TO_REGCLASS
+ (v2f64 (VORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
+ (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
+ FR64)>;
def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS (VXORPDrr
- (COPY_TO_REGCLASS FR64:$src1, VR128),
- (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ (COPY_TO_REGCLASS
+ (v2f64 (VXORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
+ (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
+ FR64)>;
def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS (VANDNPDrr
- (COPY_TO_REGCLASS FR64:$src1, VR128),
- (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ (COPY_TO_REGCLASS
+ (v2f64 (VANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
+ (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
+ FR64)>;
def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS (VANDPSrr
- (COPY_TO_REGCLASS FR32:$src1, VR128),
- (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ (COPY_TO_REGCLASS
+ (v4f32 (VANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
+ (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
+ FR32)>;
def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS (VORPSrr
- (COPY_TO_REGCLASS FR32:$src1, VR128),
- (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ (COPY_TO_REGCLASS
+ (v4f32 (VORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
+ (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
+ FR32)>;
def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS (VXORPSrr
- (COPY_TO_REGCLASS FR32:$src1, VR128),
- (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ (COPY_TO_REGCLASS
+ (v4f32 (VXORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
+ (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
+ FR32)>;
def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS (VANDNPSrr
- (COPY_TO_REGCLASS FR32:$src1, VR128),
- (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ (COPY_TO_REGCLASS
+ (v4f32 (VANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
+ (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
+ FR32)>;
}
let Predicates = [UseSSE1] in {
// Use packed logical operations for scalar ops.
def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS (ANDPSrr
- (COPY_TO_REGCLASS FR32:$src1, VR128),
- (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ (COPY_TO_REGCLASS
+ (v4f32 (ANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
+ (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
+ FR32)>;
def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS (ORPSrr
- (COPY_TO_REGCLASS FR32:$src1, VR128),
- (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ (COPY_TO_REGCLASS
+ (v4f32 (ORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
+ (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
+ FR32)>;
def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS (XORPSrr
- (COPY_TO_REGCLASS FR32:$src1, VR128),
- (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ (COPY_TO_REGCLASS
+ (v4f32 (XORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
+ (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
+ FR32)>;
def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS (ANDNPSrr
- (COPY_TO_REGCLASS FR32:$src1, VR128),
- (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ (COPY_TO_REGCLASS
+ (v4f32 (ANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
+ (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
+ FR32)>;
}
let Predicates = [UseSSE2] in {
// Use packed logical operations for scalar ops.
def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS (ANDPDrr
- (COPY_TO_REGCLASS FR64:$src1, VR128),
- (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ (COPY_TO_REGCLASS
+ (v2f64 (ANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
+ (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
+ FR64)>;
def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS (ORPDrr
- (COPY_TO_REGCLASS FR64:$src1, VR128),
- (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ (COPY_TO_REGCLASS
+ (v2f64 (ORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
+ (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
+ FR64)>;
def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS (XORPDrr
- (COPY_TO_REGCLASS FR64:$src1, VR128),
- (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ (COPY_TO_REGCLASS
+ (v2f64 (XORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
+ (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
+ FR64)>;
def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS (ANDNPDrr
- (COPY_TO_REGCLASS FR64:$src1, VR128),
- (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ (COPY_TO_REGCLASS
+ (v2f64 (ANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
+ (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
+ FR64)>;
}
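The scalar-logic patterns in the three predicate blocks above are unchanged in substance; the rewrite only adds explicit v4f32/v2f64 result types around COPY_TO_REGCLASS and the packed logic instruction it feeds, presumably so that the intermediate types of the output pattern are spelled out rather than left to inference. The shape of the change, schematically:

  // before: (COPY_TO_REGCLASS (ANDPSrr (COPY_TO_REGCLASS FR32:$a, VR128), ...), FR32)
  // after:  (COPY_TO_REGCLASS
  //             (v4f32 (ANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$a, VR128)), ...)),
  //             FR32)
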
// Patterns for packed operations when we don't have integer type available.
@@ -2858,99 +2517,99 @@ def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
/// classes below
multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
- SDNode OpNode, SizeItins itins> {
+ SDNode OpNode, X86SchedWriteSizes sched> {
let Predicates = [HasAVX, NoVLX] in {
defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
VR128, v4f32, f128mem, loadv4f32,
- SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_WIG;
+ SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, VEX_WIG;
defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
VR128, v2f64, f128mem, loadv2f64,
- SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_WIG;
+ SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, VEX_WIG;
defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
OpNode, VR256, v8f32, f256mem, loadv8f32,
- SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_L, VEX_WIG;
+ SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, VEX_WIG;
defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
OpNode, VR256, v4f64, f256mem, loadv4f64,
- SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
+ SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
v4f32, f128mem, memopv4f32, SSEPackedSingle,
- itins.s>, PS;
+ sched.PS.XMM>, PS;
defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
v2f64, f128mem, memopv2f64, SSEPackedDouble,
- itins.d>, PD;
+ sched.PD.XMM>, PD;
}
}
multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SizeItins itins> {
+ X86SchedWriteSizes sched> {
defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
- OpNode, FR32, f32mem, SSEPackedSingle, itins.s, 0>,
+ OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>,
XS, VEX_4V, VEX_LIG, VEX_WIG;
defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
- OpNode, FR64, f64mem, SSEPackedDouble, itins.d, 0>,
+ OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>,
XD, VEX_4V, VEX_LIG, VEX_WIG;
let Constraints = "$src1 = $dst" in {
defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
OpNode, FR32, f32mem, SSEPackedSingle,
- itins.s>, XS;
+ sched.PS.Scl>, XS;
defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
OpNode, FR64, f64mem, SSEPackedDouble,
- itins.d>, XD;
+ sched.PD.Scl>, XD;
}
}
multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode,
- SizeItins itins> {
+ X86SchedWriteSizes sched> {
defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
!strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
- SSEPackedSingle, itins.s, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG;
+ SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG;
defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
!strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
- SSEPackedDouble, itins.d, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG;
+ SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG;
let Constraints = "$src1 = $dst" in {
defm SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
!strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
- SSEPackedSingle, itins.s>, XS;
+ SSEPackedSingle, sched.PS.Scl>, XS;
defm SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
!strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
- SSEPackedDouble, itins.d>, XD;
+ SSEPackedDouble, sched.PD.Scl>, XD;
}
}
// Binary Arithmetic instructions
-defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>,
- basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SSE_ALU_ITINS_S>;
-defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>,
- basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SSE_MUL_ITINS_S>;
+defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SchedWriteFAddSizes>,
+ basic_sse12_fp_binop_s<0x58, "add", fadd, SchedWriteFAddSizes>,
+ basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SchedWriteFAddSizes>;
+defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SchedWriteFMulSizes>,
+ basic_sse12_fp_binop_s<0x59, "mul", fmul, SchedWriteFMulSizes>,
+ basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>;
let isCommutable = 0 in {
- defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
- basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag,SSE_ALU_ITINS_S>;
- defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>,
- basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x5E, "div", null_frag,SSE_DIV_ITINS_S>;
- defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>,
- basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SSE_ALU_ITINS_S>;
- defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>,
- basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SSE_ALU_ITINS_S>;
+ defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SchedWriteFAddSizes>,
+ basic_sse12_fp_binop_s<0x5C, "sub", fsub, SchedWriteFAddSizes>,
+ basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>;
+ defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SchedWriteFDivSizes>,
+ basic_sse12_fp_binop_s<0x5E, "div", fdiv, SchedWriteFDivSizes>,
+ basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>;
+ defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
+ basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
+ basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>;
+ defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
+ basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
+ basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>;
}
let isCodeGenOnly = 1 in {
- defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>,
- basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>;
- defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>,
- basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>;
+ defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>,
+ basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>;
+ defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>,
+ basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>;
}
// Patterns used to select SSE scalar fp arithmetic instructions from
@@ -2995,79 +2654,41 @@ let isCodeGenOnly = 1 in {
// TODO: Some canonicalization in lowering would simplify the number of
// patterns we have to try to match.
-multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
- let Predicates = [UseSSE1] in {
- // extracted scalar math op with insert via movss
- def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
- (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
- FR32:$src))))),
- (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst,
- (COPY_TO_REGCLASS FR32:$src, VR128))>;
-
- // vector math op with insert via movss
- def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
- (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
- (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
- }
-
- // Repeat everything for AVX.
- let Predicates = [UseAVX] in {
- // extracted scalar math op with insert via movss
- def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
- (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
- FR32:$src))))),
- (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
- (COPY_TO_REGCLASS FR32:$src, VR128))>;
-
- // vector math op with insert via movss
- def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
- (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
- (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
- }
-}
-
-defm : scalar_math_f32_patterns<fadd, "ADD">;
-defm : scalar_math_f32_patterns<fsub, "SUB">;
-defm : scalar_math_f32_patterns<fmul, "MUL">;
-defm : scalar_math_f32_patterns<fdiv, "DIV">;
-
-multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
- let Predicates = [UseSSE2] in {
- // extracted scalar math op with insert via movsd
- def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
- (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
- FR64:$src))))),
- (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst,
- (COPY_TO_REGCLASS FR64:$src, VR128))>;
-
- // vector math op with insert via movsd
- def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
- (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
- (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
+ ValueType VT, ValueType EltTy,
+ RegisterClass RC, Predicate BasePredicate> {
+ let Predicates = [BasePredicate] in {
+ // extracted scalar math op with insert via movss/movsd
+ def : Pat<(VT (Move (VT VR128:$dst),
+ (VT (scalar_to_vector
+ (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
+ RC:$src))))),
+ (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst,
+ (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
}
- // Repeat everything for AVX.
+ // Repeat for AVX versions of the instructions.
let Predicates = [UseAVX] in {
- // extracted scalar math op with insert via movsd
- def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
- (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
- FR64:$src))))),
- (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
- (COPY_TO_REGCLASS FR64:$src, VR128))>;
-
- // vector math op with insert via movsd
- def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
- (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
- (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+ // extracted scalar math op with insert via movss/movsd
+ def : Pat<(VT (Move (VT VR128:$dst),
+ (VT (scalar_to_vector
+ (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
+ RC:$src))))),
+ (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst,
+ (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
}
}
-defm : scalar_math_f64_patterns<fadd, "ADD">;
-defm : scalar_math_f64_patterns<fsub, "SUB">;
-defm : scalar_math_f64_patterns<fmul, "MUL">;
-defm : scalar_math_f64_patterns<fdiv, "DIV">;
-
+defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
+defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
+defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
+defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
+defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
+defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
+defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
+defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
+
/// Unop Arithmetic
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation. This form is unlike the
@@ -3076,98 +2697,46 @@ defm : scalar_math_f64_patterns<fdiv, "DIV">;
///
/// And, we have a special variant form for a full-vector intrinsic form.
-let Sched = WriteFSqrt in {
-def SSE_SQRTPS : OpndItins<
- IIC_SSE_SQRTPS_RR, IIC_SSE_SQRTPS_RM
->;
-
-def SSE_SQRTSS : OpndItins<
- IIC_SSE_SQRTSS_RR, IIC_SSE_SQRTSS_RM
->;
-
-def SSE_SQRTPD : OpndItins<
- IIC_SSE_SQRTPD_RR, IIC_SSE_SQRTPD_RM
->;
-
-def SSE_SQRTSD : OpndItins<
- IIC_SSE_SQRTSD_RR, IIC_SSE_SQRTSD_RM
->;
-}
-
-let Sched = WriteFRsqrt in {
-def SSE_RSQRTPS : OpndItins<
- IIC_SSE_RSQRTPS_RR, IIC_SSE_RSQRTPS_RM
->;
-
-def SSE_RSQRTSS : OpndItins<
- IIC_SSE_RSQRTSS_RR, IIC_SSE_RSQRTSS_RM
->;
-}
-
-def SSE_RSQRT_P : SizeItins<
- SSE_RSQRTPS, SSE_RSQRTPS
->;
-
-def SSE_RSQRT_S : SizeItins<
- SSE_RSQRTSS, SSE_RSQRTSS
->;
-
-let Sched = WriteFRcp in {
-def SSE_RCPP : OpndItins<
- IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM
->;
-
-def SSE_RCPS : OpndItins<
- IIC_SSE_RCPS_RR, IIC_SSE_RCPS_RM
->;
-}
-
-def SSE_RCP_P : SizeItins<
- SSE_RCPP, SSE_RCPP
->;
-
-def SSE_RCP_S : SizeItins<
- SSE_RCPS, SSE_RCPS
->;
-
/// sse_fp_unop_s - SSE1 unops in scalar form
/// For the non-AVX defs, we need $src1 to be tied to $dst because
/// the HW instructions are 2 operand / destructive.
multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
- ValueType vt, ValueType ScalarVT,
- X86MemOperand x86memop,
- Operand intmemop, ComplexPattern int_cpat,
- Intrinsic Intr,
- SDNode OpNode, Domain d, OpndItins itins,
- Predicate target, string Suffix> {
+ ValueType ScalarVT, X86MemOperand x86memop,
+ Operand intmemop, SDNode OpNode, Domain d,
+ X86FoldableSchedWrite sched, Predicate target> {
let hasSideEffects = 0 in {
def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
!strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
- [(set RC:$dst, (OpNode RC:$src1))], itins.rr, d>, Sched<[itins.Sched]>,
+ [(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>,
Requires<[target]>;
let mayLoad = 1 in
def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
!strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
- [(set RC:$dst, (OpNode (load addr:$src1)))], itins.rm, d>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>,
+ [(set RC:$dst, (OpNode (load addr:$src1)))], d>,
+ Sched<[sched.Folded, ReadAfterLd]>,
Requires<[target, OptForSize]>;
let isCodeGenOnly = 1, Constraints = "$src1 = $dst", ExeDomain = d in {
def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
+ Sched<[sched]>;
let mayLoad = 1 in
def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
+}
+
+multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt,
+ ComplexPattern int_cpat, Intrinsic Intr,
+ Predicate target, string Suffix> {
let Predicates = [target] in {
// These are unary operations, but they are modeled as having 2 source operands
// because the high elements of the destination are unchanged in SSE.
def : Pat<(Intr VR128:$src),
- (!cast<Instruction>(NAME#Suffix##r_Int) VR128:$src, VR128:$src)>;
+ (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>;
}
// We don't want to fold scalar loads into these instructions unless
// optimizing for size. This is because the folded instruction will have a
@@ -3178,35 +2747,47 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
// rcpss mem, %xmm0
let Predicates = [target, OptForSize] in {
def : Pat<(Intr int_cpat:$src2),
- (!cast<Instruction>(NAME#Suffix##m_Int)
+ (!cast<Instruction>(NAME#m_Int)
(vt (IMPLICIT_DEF)), addr:$src2)>;
}
}
+multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, ComplexPattern int_cpat,
+ Intrinsic Intr, Predicate target> {
+ let Predicates = [target] in {
+ def : Pat<(Intr VR128:$src),
+ (!cast<Instruction>(NAME#r_Int) VR128:$src,
+ VR128:$src)>;
+ }
+ let Predicates = [target, OptForSize] in {
+ def : Pat<(Intr int_cpat:$src2),
+ (!cast<Instruction>(NAME#m_Int)
+ (vt (IMPLICIT_DEF)), addr:$src2)>;
+ }
+}
+
multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
- ValueType vt, ValueType ScalarVT,
- X86MemOperand x86memop,
- Operand intmemop, ComplexPattern int_cpat,
- Intrinsic Intr, SDNode OpNode, Domain d,
- OpndItins itins, Predicate target, string Suffix> {
+ ValueType ScalarVT, X86MemOperand x86memop,
+ Operand intmemop, SDNode OpNode, Domain d,
+ X86FoldableSchedWrite sched, Predicate target> {
let hasSideEffects = 0 in {
def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [], itins.rr, d>, Sched<[itins.Sched]>;
+ [], d>, Sched<[sched]>;
let mayLoad = 1 in
def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [], itins.rm, d>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ [], d>, Sched<[sched.Folded, ReadAfterLd]>;
let isCodeGenOnly = 1, ExeDomain = d in {
def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, Sched<[itins.Sched.Folded]>;
+ []>, Sched<[sched]>;
let mayLoad = 1 in
def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, intmemop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ []>, Sched<[sched.Folded, ReadAfterLd]>;
}
}
@@ -3218,164 +2799,191 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
// which has a clobber before the rcp, vs.
// vrcpss mem, %xmm0, %xmm0
// TODO: In theory, we could fold the load, and avoid the stall caused by
- // the partial register store, either in ExecutionDepsFix or with smarter RA.
+ // the partial register store, either in BreakFalseDeps or with smarter RA.
let Predicates = [target] in {
- def : Pat<(OpNode RC:$src), (!cast<Instruction>("V"#NAME#Suffix##r)
+ def : Pat<(OpNode RC:$src), (!cast<Instruction>(NAME#r)
(ScalarVT (IMPLICIT_DEF)), RC:$src)>;
- def : Pat<(Intr VR128:$src),
- (!cast<Instruction>("V"#NAME#Suffix##r_Int) VR128:$src,
- VR128:$src)>;
}
let Predicates = [target, OptForSize] in {
- def : Pat<(Intr int_cpat:$src2),
- (!cast<Instruction>("V"#NAME#Suffix##m_Int)
- (vt (IMPLICIT_DEF)), addr:$src2)>;
def : Pat<(ScalarVT (OpNode (load addr:$src))),
- (!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)),
+ (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)),
addr:$src)>;
}
}
/// sse1_fp_unop_p - SSE1 unops in packed form.
multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, list<Predicate> prds> {
+ X86SchedWriteWidths sched, list<Predicate> prds> {
let Predicates = prds in {
def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat("v", OpcodeStr,
"ps\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))],
- itins.rr>, VEX, Sched<[itins.Sched]>, VEX_WIG;
+ [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
+ VEX, Sched<[sched.XMM]>, VEX_WIG;
def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
!strconcat("v", OpcodeStr,
"ps\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))],
- itins.rm>, VEX, Sched<[itins.Sched.Folded]>, VEX_WIG;
+ [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>,
+ VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat("v", OpcodeStr,
"ps\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))],
- itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>, VEX_WIG;
+ [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>,
+ VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat("v", OpcodeStr,
"ps\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))],
- itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>, VEX_WIG;
+ [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>,
+ VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
}
def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], itins.rr>,
- Sched<[itins.Sched]>;
+ [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
+ Sched<[sched.XMM]>;
def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
!strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], itins.rm>,
- Sched<[itins.Sched.Folded]>;
+ [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>,
+ Sched<[sched.XMM.Folded]>;
}
/// sse2_fp_unop_p - SSE2 unops in vector forms.
multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
- SDNode OpNode, OpndItins itins> {
+ SDNode OpNode, X86SchedWriteWidths sched> {
let Predicates = [HasAVX, NoVLX] in {
def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat("v", OpcodeStr,
"pd\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))],
- itins.rr>, VEX, Sched<[itins.Sched]>, VEX_WIG;
+ [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
+ VEX, Sched<[sched.XMM]>, VEX_WIG;
def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
!strconcat("v", OpcodeStr,
"pd\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))],
- itins.rm>, VEX, Sched<[itins.Sched.Folded]>, VEX_WIG;
+ [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>,
+ VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat("v", OpcodeStr,
"pd\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))],
- itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>, VEX_WIG;
+ [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>,
+ VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat("v", OpcodeStr,
"pd\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))],
- itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>, VEX_WIG;
+ [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>,
+ VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
}
def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], itins.rr>,
- Sched<[itins.Sched]>;
+ !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
+ Sched<[sched.XMM]>;
def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
!strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], itins.rm>,
- Sched<[itins.Sched.Folded]>;
+ [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>,
+ Sched<[sched.XMM.Folded]>;
+}
+
+multiclass sse1_fp_unop_s_intr<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched, Predicate AVXTarget> {
+ defm SS : sse_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
+ !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss),
+ UseSSE1, "SS">, XS;
+ defm V#NAME#SS : avx_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
+ !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss),
+ AVXTarget>,
+ XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
}
multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, Predicate AVXTarget> {
- defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem,
- ssmem, sse_load_f32,
- !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
- SSEPackedSingle, itins, UseSSE1, "SS">, XS;
- defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32,
- f32mem, ssmem, sse_load_f32,
- !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
- SSEPackedSingle, itins, AVXTarget, "SS">, XS, VEX_4V,
- VEX_LIG, VEX_WIG, NotMemoryFoldable;
+ X86SchedWriteWidths sched, Predicate AVXTarget> {
+ defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, f32, f32mem,
+ ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS;
+ defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, f32,
+ f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>,
+ XS, VEX_4V, VEX_LIG, VEX_WIG;
}
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, Predicate AVXTarget> {
- defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem,
- sdmem, sse_load_f64,
- !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
- OpNode, SSEPackedDouble, itins, UseSSE2, "SD">, XD;
- defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64,
- f64mem, sdmem, sse_load_f64,
- !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
- OpNode, SSEPackedDouble, itins, AVXTarget, "SD">,
- XD, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
+ X86SchedWriteWidths sched, Predicate AVXTarget> {
+ defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, f64, f64mem,
+ sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD;
+ defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, f64,
+ f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>,
+ XD, VEX_4V, VEX_LIG, VEX_WIG;
}
// Square root.
-defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS, UseAVX>,
- sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX, NoVLX]>,
- sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD, UseAVX>,
- sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>;
+defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt, UseAVX>,
+ sse1_fp_unop_p<0x51, "sqrt", fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>,
+ sse2_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt64, UseAVX>,
+ sse2_fp_unop_p<0x51, "sqrt", fsqrt, SchedWriteFSqrt64>;
// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
-defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS, HasAVX>,
- sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX]>;
-defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS, HasAVX>,
- sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX]>;
+defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
+ sse1_fp_unop_s_intr<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
+ sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>;
+defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
+ sse1_fp_unop_s_intr<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
+ sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>;
// There is no f64 version of the reciprocal approximation instructions.
-// TODO: We should add *scalar* op patterns for these just like we have for
-// the binops above. If the binop and unop patterns could all be unified
-// that would be even better.
+multiclass scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, SDNode Move,
+ ValueType VT, Predicate BasePredicate> {
+ let Predicates = [BasePredicate] in {
+ def : Pat<(VT (Move VT:$dst, (scalar_to_vector
+ (OpNode (extractelt VT:$src, 0))))),
+ (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
+ }
+
+ // Repeat for AVX versions of the instructions.
+ let Predicates = [UseAVX] in {
+ def : Pat<(VT (Move VT:$dst, (scalar_to_vector
+ (OpNode (extractelt VT:$src, 0))))),
+ (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
+ }
+}
+
+multiclass scalar_unary_math_imm_patterns<SDNode OpNode, string OpcPrefix, SDNode Move,
+ ValueType VT, bits<8> ImmV,
+ Predicate BasePredicate> {
+ let Predicates = [BasePredicate] in {
+ def : Pat<(VT (Move VT:$dst, (scalar_to_vector
+ (OpNode (extractelt VT:$src, 0))))),
+ (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>;
+ }
+
+ // Repeat for AVX versions of the instructions.
+ let Predicates = [UseAVX] in {
+ def : Pat<(VT (Move VT:$dst, (scalar_to_vector
+ (OpNode (extractelt VT:$src, 0))))),
+ (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>;
+ }
+}
+
+defm : scalar_unary_math_patterns<fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>;
+defm : scalar_unary_math_patterns<fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>;
-multiclass scalar_unary_math_patterns<Intrinsic Intr, string OpcPrefix,
- SDNode Move, ValueType VT,
- Predicate BasePredicate> {
+multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix,
+ SDNode Move, ValueType VT,
+ Predicate BasePredicate> {
let Predicates = [BasePredicate] in {
def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
- (!cast<I>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
+ (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
}
// Repeat for AVX versions of the instructions.
let Predicates = [HasAVX] in {
def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
- (!cast<I>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
+ (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
}
}
-defm : scalar_unary_math_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
- v4f32, UseSSE1>;
-defm : scalar_unary_math_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
- v4f32, UseSSE1>;
-defm : scalar_unary_math_patterns<int_x86_sse_sqrt_ss, "SQRTSS", X86Movss,
- v4f32, UseSSE1>;
-defm : scalar_unary_math_patterns<int_x86_sse2_sqrt_sd, "SQRTSD", X86Movsd,
- v2f64, UseSSE2>;
+defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
+ v4f32, UseSSE1>;
+defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
+ v4f32, UseSSE1>;
//===----------------------------------------------------------------------===//
@@ -3383,77 +2991,74 @@ defm : scalar_unary_math_patterns<int_x86_sse2_sqrt_sd, "SQRTSD", X86Movsd,
//===----------------------------------------------------------------------===//
let AddedComplexity = 400 in { // Prefer non-temporal versions
-let SchedRW = [WriteStore] in {
let Predicates = [HasAVX, NoVLX] in {
+let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
(ins f128mem:$dst, VR128:$src),
"movntps\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v4f32 VR128:$src),
- addr:$dst)],
- IIC_SSE_MOVNT>, VEX, VEX_WIG;
+ addr:$dst)]>, VEX, VEX_WIG;
def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
(ins f128mem:$dst, VR128:$src),
"movntpd\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v2f64 VR128:$src),
- addr:$dst)],
- IIC_SSE_MOVNT>, VEX, VEX_WIG;
-
-let ExeDomain = SSEPackedInt in
-def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
- (ins i128mem:$dst, VR128:$src),
- "movntdq\t{$src, $dst|$dst, $src}",
- [(alignednontemporalstore (v2i64 VR128:$src),
- addr:$dst)],
- IIC_SSE_MOVNT>, VEX, VEX_WIG;
+ addr:$dst)]>, VEX, VEX_WIG;
+} // SchedRW
+let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in {
def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
(ins f256mem:$dst, VR256:$src),
"movntps\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v8f32 VR256:$src),
- addr:$dst)],
- IIC_SSE_MOVNT>, VEX, VEX_L, VEX_WIG;
+ addr:$dst)]>, VEX, VEX_L, VEX_WIG;
def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
(ins f256mem:$dst, VR256:$src),
"movntpd\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v4f64 VR256:$src),
- addr:$dst)],
- IIC_SSE_MOVNT>, VEX, VEX_L, VEX_WIG;
-let ExeDomain = SSEPackedInt in
+ addr:$dst)]>, VEX, VEX_L, VEX_WIG;
+} // SchedRW
+
+let ExeDomain = SSEPackedInt in {
+def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
+ (ins i128mem:$dst, VR128:$src),
+ "movntdq\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v2i64 VR128:$src),
+ addr:$dst)]>, VEX, VEX_WIG,
+ Sched<[SchedWriteVecMoveLSNT.XMM.MR]>;
def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
(ins i256mem:$dst, VR256:$src),
"movntdq\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v4i64 VR256:$src),
- addr:$dst)],
- IIC_SSE_MOVNT>, VEX, VEX_L, VEX_WIG;
-}
+ addr:$dst)]>, VEX, VEX_L, VEX_WIG,
+ Sched<[SchedWriteVecMoveLSNT.YMM.MR]>;
+} // ExeDomain
+} // Predicates
+let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movntps\t{$src, $dst|$dst, $src}",
- [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)],
- IIC_SSE_MOVNT>;
+ [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movntpd\t{$src, $dst|$dst, $src}",
- [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)],
- IIC_SSE_MOVNT>;
+ [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>;
+} // SchedRW
-let ExeDomain = SSEPackedInt in
+let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in
def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movntdq\t{$src, $dst|$dst, $src}",
- [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)],
- IIC_SSE_MOVNT>;
+ [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>;
+let SchedRW = [WriteStoreNT] in {
// There is no AVX form for instructions below this point
def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"movnti{l}\t{$src, $dst|$dst, $src}",
- [(nontemporalstore (i32 GR32:$src), addr:$dst)],
- IIC_SSE_MOVNT>,
+ [(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
PS, Requires<[HasSSE2]>;
def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"movnti{q}\t{$src, $dst|$dst, $src}",
- [(nontemporalstore (i64 GR64:$src), addr:$dst)],
- IIC_SSE_MOVNT>,
+ [(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
PS, Requires<[HasSSE2]>;
-} // SchedRW = [WriteStore]
+} // SchedRW = [WriteStoreNT]
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
@@ -3489,47 +3094,40 @@ let Predicates = [UseSSE2] in {
// Prefetch intrinsic.
let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in {
def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src),
- "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))],
- IIC_SSE_PREFETCH>, TB;
+ "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>, TB;
def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src),
- "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))],
- IIC_SSE_PREFETCH>, TB;
+ "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>, TB;
def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src),
- "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))],
- IIC_SSE_PREFETCH>, TB;
+ "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>, TB;
def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src),
- "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))],
- IIC_SSE_PREFETCH>, TB;
+ "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>, TB;
}
// FIXME: How should flush instruction be modeled?
let SchedRW = [WriteLoad] in {
// Flush cache
def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
- "clflush\t$src", [(int_x86_sse2_clflush addr:$src)],
- IIC_SSE_PREFETCH>, PS, Requires<[HasSSE2]>;
+ "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
+ PS, Requires<[HasSSE2]>;
}
let SchedRW = [WriteNop] in {
// Pause. This "instruction" is encoded as "rep; nop", so even though it
// was introduced with SSE2, it's backward compatible.
def PAUSE : I<0x90, RawFrm, (outs), (ins),
- "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>, OBXS;
+ "pause", [(int_x86_sse2_pause)]>, OBXS;
}
let SchedRW = [WriteFence] in {
// Load, store, and memory fence
// TODO: As with mfence, we may want to ease the availablity of sfence/lfence
// to include any 64-bit target.
-def SFENCE : I<0xAE, MRM_F8, (outs), (ins),
- "sfence", [(int_x86_sse_sfence)], IIC_SSE_SFENCE>,
+def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
PS, Requires<[HasSSE1]>;
-def LFENCE : I<0xAE, MRM_E8, (outs), (ins),
- "lfence", [(int_x86_sse2_lfence)], IIC_SSE_LFENCE>,
- TB, Requires<[HasSSE2]>;
-def MFENCE : I<0xAE, MRM_F0, (outs), (ins),
- "mfence", [(int_x86_sse2_mfence)], IIC_SSE_MFENCE>,
- TB, Requires<[HasMFence]>;
+def LFENCE : I<0xAE, MRM_E8, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>,
+ PS, Requires<[HasSSE2]>;
+def MFENCE : I<0xAE, MRM_F0, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>,
+ PS, Requires<[HasMFence]>;
} // SchedRW
def : Pat<(X86MFence), (MFENCE)>;
@@ -3539,18 +3137,18 @@ def : Pat<(X86MFence), (MFENCE)>;
//===----------------------------------------------------------------------===//
def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
- "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
- IIC_SSE_LDMXCSR>, VEX, Sched<[WriteLoad]>, VEX_WIG;
+ "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
+ VEX, Sched<[WriteLDMXCSR]>, VEX_WIG;
def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
- "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
- IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>, VEX_WIG;
+ "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
+ VEX, Sched<[WriteSTMXCSR]>, VEX_WIG;
def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
- "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
- IIC_SSE_LDMXCSR>, TB, Sched<[WriteLoad]>;
+ "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
+ TB, Sched<[WriteLDMXCSR]>;
def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
- "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
- IIC_SSE_STMXCSR>, TB, Sched<[WriteStore]>;
+ "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
+ TB, Sched<[WriteSTMXCSR]>;
//===---------------------------------------------------------------------===//
// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
@@ -3558,128 +3156,122 @@ def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
let ExeDomain = SSEPackedInt in { // SSE integer instructions
-let hasSideEffects = 0, SchedRW = [WriteMove] in {
+let hasSideEffects = 0 in {
def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
- VEX, VEX_WIG;
-def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
- "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
- VEX, VEX_L, VEX_WIG;
+ "movdqa\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
- VEX, VEX_WIG;
+ "movdqu\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
+def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
- "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
- VEX, VEX_L, VEX_WIG;
+ "movdqu\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
}
// For Disassembler
-let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
- SchedRW = [WriteMove] in {
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
- "movdqa\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>,
- VEX, VEX_WIG, FoldGenData<"VMOVDQArr">;
+ "movdqa\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.XMM.RR]>,
+ VEX, VEX_WIG, FoldGenData<"VMOVDQArr">;
def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
- "movdqa\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG,
- FoldGenData<"VMOVDQAYrr">;
+ "movdqa\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.YMM.RR]>,
+ VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQAYrr">;
def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
- "movdqu\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>,
- VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">;
+ "movdqu\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.XMM.RR]>,
+ VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">;
def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
- "movdqu\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG,
- FoldGenData<"VMOVDQUYrr">;
+ "movdqu\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.YMM.RR]>,
+ VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQUYrr">;
}
let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
- hasSideEffects = 0, SchedRW = [WriteLoad] in {
-let Predicates = [HasAVX,NoVLX] in
+ hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
- "movdqa\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (alignedloadv2i64 addr:$src))],
- IIC_SSE_MOVA_P_RM>, VEX, VEX_WIG;
+ "movdqa\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>,
+ Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
- "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
- VEX, VEX_L, VEX_WIG;
-let Predicates = [HasAVX,NoVLX] in
+ "movdqa\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.YMM.RM]>,
+ VEX, VEX_L, VEX_WIG;
def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
- "vmovdqu\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (loadv2i64 addr:$src))],
- IIC_SSE_MOVU_P_RM>, XS, VEX, VEX_WIG;
+ "vmovdqu\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (loadv2i64 addr:$src))]>,
+ Sched<[SchedWriteVecMoveLS.XMM.RM]>,
+ XS, VEX, VEX_WIG;
def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
- "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
- XS, VEX, VEX_L, VEX_WIG;
+ "vmovdqu\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.YMM.RM]>,
+ XS, VEX, VEX_L, VEX_WIG;
}
-let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
-let Predicates = [HasAVX,NoVLX] in
+let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs),
- (ins i128mem:$dst, VR128:$src),
- "movdqa\t{$src, $dst|$dst, $src}",
- [(alignedstore (v2i64 VR128:$src), addr:$dst)],
- IIC_SSE_MOVA_P_MR>, VEX, VEX_WIG;
+ (ins i128mem:$dst, VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}",
+ [(alignedstore (v2i64 VR128:$src), addr:$dst)]>,
+ Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_WIG;
def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
- (ins i256mem:$dst, VR256:$src),
- "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
- VEX, VEX_L, VEX_WIG;
-let Predicates = [HasAVX,NoVLX] in
+ (ins i256mem:$dst, VR256:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, VEX_WIG;
def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
- "vmovdqu\t{$src, $dst|$dst, $src}",
- [(store (v2i64 VR128:$src), addr:$dst)], IIC_SSE_MOVU_P_MR>,
- XS, VEX, VEX_WIG;
+ "vmovdqu\t{$src, $dst|$dst, $src}",
+ [(store (v2i64 VR128:$src), addr:$dst)]>,
+ Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, VEX_WIG;
def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
- "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
- XS, VEX, VEX_L, VEX_WIG;
+ "vmovdqu\t{$src, $dst|$dst, $src}",[]>,
+ Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, VEX_WIG;
}
-let SchedRW = [WriteMove] in {
+let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in {
let hasSideEffects = 0 in {
def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>;
+ "movdqa\t{$src, $dst|$dst, $src}", []>;
def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "movdqu\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
+ "movdqu\t{$src, $dst|$dst, $src}", []>,
+ XS, Requires<[UseSSE2]>;
}
// For Disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
- "movdqa\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, FoldGenData<"MOVDQArr">;
+ "movdqa\t{$src, $dst|$dst, $src}", []>,
+ FoldGenData<"MOVDQArr">;
def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
- "movdqu\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>,
- FoldGenData<"MOVDQUrr">;
+ "movdqu\t{$src, $dst|$dst, $src}", []>,
+ XS, Requires<[UseSSE2]>, FoldGenData<"MOVDQUrr">;
}
} // SchedRW
let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
- hasSideEffects = 0, SchedRW = [WriteLoad] in {
+ hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in {
def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"movdqa\t{$src, $dst|$dst, $src}",
- [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/],
- IIC_SSE_MOVA_P_RM>;
+ [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>;
def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"movdqu\t{$src, $dst|$dst, $src}",
- [/*(set VR128:$dst, (loadv2i64 addr:$src))*/],
- IIC_SSE_MOVU_P_RM>,
+ [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
XS, Requires<[UseSSE2]>;
}
-let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
+let mayStore = 1, hasSideEffects = 0,
+ SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
"movdqa\t{$src, $dst|$dst, $src}",
- [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/],
- IIC_SSE_MOVA_P_MR>;
+ [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>;
def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
"movdqu\t{$src, $dst|$dst, $src}",
- [/*(store (v2i64 VR128:$src), addr:$dst)*/],
- IIC_SSE_MOVU_P_MR>,
+ [/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
XS, Requires<[UseSSE2]>;
}
@@ -3696,6 +3288,22 @@ def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
(VMOVDQUYrr_REV VR256L:$dst, VR256H:$src), 0>;
+// Reversed version with ".s" suffix for GAS compatibility.
+def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
+ (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
+ (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>;
+def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
+ (VMOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
+ (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>;
+
+// Reversed version with ".s" suffix for GAS compatibility.
+def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}",
+ (MOVDQArr_REV VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}",
+ (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
+
let Predicates = [HasAVX, NoVLX] in {
// Additional patterns for other integer sizes.
def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
@@ -3716,123 +3324,109 @@ let Predicates = [HasAVX, NoVLX] in {
// SSE2 - Packed Integer Arithmetic Instructions
//===---------------------------------------------------------------------===//
-let Sched = WriteVecIMul in
-def SSE_PMADD : OpndItins<
- IIC_SSE_PMADD, IIC_SSE_PMADD
->;
-
let ExeDomain = SSEPackedInt in { // SSE integer instructions
/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType DstVT, ValueType SrcVT, RegisterClass RC,
PatFrag memop_frag, X86MemOperand x86memop,
- OpndItins itins, bit Is2Addr = 1> {
+ X86FoldableSchedWrite sched, bit Is2Addr = 1> {
let isCommutable = 1 in
def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))], itins.rr>,
- Sched<[itins.Sched]>;
+ [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
+ Sched<[sched]>;
def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
- (bitconvert (memop_frag addr:$src2)))))],
- itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (memop_frag addr:$src2)))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
} // ExeDomain = SSEPackedInt
defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
- SSE_INTALU_ITINS_P, 1, NoVLX>;
+ SchedWriteVecALU, 1, NoVLX>;
defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
- SSE_INTALUQ_ITINS_P, 1, NoVLX>;
+ SchedWriteVecALU, 1, NoVLX>;
defm PADDSB : PDI_binop_all<0xEC, "paddsb", X86adds, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDSW : PDI_binop_all<0xED, "paddsw", X86adds, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDUSB : PDI_binop_all<0xDC, "paddusb", X86addus, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDUSW : PDI_binop_all<0xDD, "paddusw", X86addus, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
- SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
- SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
- SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
- SSE_INTALU_ITINS_P, 0, NoVLX>;
+ SchedWriteVecALU, 0, NoVLX>;
defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
- SSE_INTALUQ_ITINS_P, 0, NoVLX>;
+ SchedWriteVecALU, 0, NoVLX>;
defm PSUBSB : PDI_binop_all<0xE8, "psubsb", X86subs, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBSW : PDI_binop_all<0xE9, "psubsw", X86subs, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
+defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64,
+ SchedWriteVecIMul, 1, NoVLX>;
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
- loadv2i64, i128mem, SSE_PMADD, 0>, VEX_4V, VEX_WIG;
+ loadv2i64, i128mem, SchedWriteVecIMul.XMM, 0>,
+ VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
- VR256, loadv4i64, i256mem, SSE_PMADD,
+ VR256, loadv4i64, i256mem, SchedWriteVecIMul.YMM,
0>, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
- memopv2i64, i128mem, SSE_PMADD>;
+ memopv2i64, i128mem, SchedWriteVecIMul.XMM>;
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
- loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 0>,
+ loadv2i64, i128mem, SchedWritePSADBW.XMM, 0>,
VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
- loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 0>,
+ loadv4i64, i256mem, SchedWritePSADBW.YMM, 0>,
VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
- memopv2i64, i128mem, SSE_INTALU_ITINS_P>;
-
-let Predicates = [HasAVX, NoVLX] in
-defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
- loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 0>,
- VEX_4V, VEX_WIG;
-let Predicates = [HasAVX2, NoVLX] in
-defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
- VR256, loadv4i64, i256mem,
- SSE_INTMUL_ITINS_P, 0>, VEX_4V, VEX_L, VEX_WIG;
-let Constraints = "$src1 = $dst" in
-defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
- memopv2i64, i128mem, SSE_INTMUL_ITINS_P>;
+ memopv2i64, i128mem, SchedWritePSADBW.XMM>;
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Logical Instructions
@@ -3841,6 +3435,8 @@ defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
string OpcodeStr, SDNode OpNode,
SDNode OpNode2, RegisterClass RC,
+ X86FoldableSchedWrite sched,
+ X86FoldableSchedWrite schedImm,
ValueType DstVT, ValueType SrcVT,
PatFrag ld_frag, bit Is2Addr = 1> {
// src2 is always 128-bit
@@ -3849,89 +3445,103 @@ multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))],
- SSE_INTSHIFT_ITINS_P.rr>, Sched<[WriteVecShift]>;
+ [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>,
+ Sched<[sched]>;
def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, i128mem:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (DstVT (OpNode RC:$src1,
- (SrcVT (bitconvert (ld_frag addr:$src2))))))],
- SSE_INTSHIFT_ITINS_P.rm>, Sched<[WriteVecShiftLd, ReadAfterLd]>;
+ (SrcVT (bitconvert (ld_frag addr:$src2))))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
(ins RC:$src1, u8imm:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))],
- SSE_INTSHIFT_ITINS_P.ri>, Sched<[WriteVecShift]>;
+ [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))]>,
+ Sched<[schedImm]>;
}
multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
string OpcodeStr, SDNode OpNode,
SDNode OpNode2, ValueType DstVT128,
ValueType DstVT256, ValueType SrcVT,
- Predicate prd> {
+ X86SchedWriteWidths sched,
+ X86SchedWriteWidths schedImm, Predicate prd> {
let Predicates = [HasAVX, prd] in
defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
- OpNode, OpNode2, VR128, DstVT128, SrcVT,
- loadv2i64, 0>, VEX_4V, VEX_WIG;
+ OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM,
+ DstVT128, SrcVT, loadv2i64, 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, prd] in
defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
- OpNode, OpNode2, VR256, DstVT256, SrcVT,
- loadv2i64, 0>, VEX_4V, VEX_L, VEX_WIG;
+ OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM,
+ DstVT256, SrcVT, loadv2i64, 0>, VEX_4V, VEX_L,
+ VEX_WIG;
let Constraints = "$src1 = $dst" in
defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
- VR128, DstVT128, SrcVT, memopv2i64>;
+ VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT,
+ memopv2i64>;
}
multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
SDNode OpNode, RegisterClass RC, ValueType VT,
- bit Is2Addr = 1> {
+ X86FoldableSchedWrite sched, bit Is2Addr = 1> {
def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (VT (OpNode RC:$src1, (i8 imm:$src2))))],
- IIC_SSE_INTSHDQ_P_RI>, Sched<[WriteVecShift]>;
+ [(set RC:$dst, (VT (OpNode RC:$src1, (i8 imm:$src2))))]>,
+ Sched<[sched]>;
}
multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
- SDNode OpNode> {
+ SDNode OpNode, X86SchedWriteWidths sched> {
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
- VR128, v16i8, 0>, VEX_4V, VEX_WIG;
+ VR128, v16i8, sched.XMM, 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
- VR256, v32i8, 0>, VEX_4V, VEX_L, VEX_WIG;
+ VR256, v32i8, sched.YMM, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
- defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8>;
+ defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8,
+ sched.XMM>;
}
let ExeDomain = SSEPackedInt in {
defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
- v8i16, v16i16, v8i16, NoVLX_Or_NoBWI>;
+ v8i16, v16i16, v8i16, SchedWriteVecShift,
+ SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
- v4i32, v8i32, v4i32, NoVLX>;
+ v4i32, v8i32, v4i32, SchedWriteVecShift,
+ SchedWriteVecShiftImm, NoVLX>;
defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
- v2i64, v4i64, v2i64, NoVLX>;
+ v2i64, v4i64, v2i64, SchedWriteVecShift,
+ SchedWriteVecShiftImm, NoVLX>;
defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
- v8i16, v16i16, v8i16, NoVLX_Or_NoBWI>;
+ v8i16, v16i16, v8i16, SchedWriteVecShift,
+ SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
- v4i32, v8i32, v4i32, NoVLX>;
+ v4i32, v8i32, v4i32, SchedWriteVecShift,
+ SchedWriteVecShiftImm, NoVLX>;
defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
- v2i64, v4i64, v2i64, NoVLX>;
+ v2i64, v4i64, v2i64, SchedWriteVecShift,
+ SchedWriteVecShiftImm, NoVLX>;
defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
- v8i16, v16i16, v8i16, NoVLX_Or_NoBWI>;
+ v8i16, v16i16, v8i16, SchedWriteVecShift,
+ SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
- v4i32, v8i32, v4i32, NoVLX>;
+ v4i32, v8i32, v4i32, SchedWriteVecShift,
+ SchedWriteVecShiftImm, NoVLX>;
- defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq>;
- defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq>;
- // PSRADQri doesn't exist in SSE[1-3].
+ defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq,
+ SchedWriteShuffle>;
+ defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq,
+ SchedWriteShuffle>;
} // ExeDomain = SSEPackedInt
//===---------------------------------------------------------------------===//
@@ -3939,46 +3549,42 @@ let ExeDomain = SSEPackedInt in {
//===---------------------------------------------------------------------===//
defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 1, TruePredicate>;
+ SchedWriteVecALU, 1, TruePredicate>;
defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 1, TruePredicate>;
+ SchedWriteVecALU, 1, TruePredicate>;
defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
- SSE_INTALU_ITINS_P, 1, TruePredicate>;
+ SchedWriteVecALU, 1, TruePredicate>;
defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 0, TruePredicate>;
+ SchedWriteVecALU, 0, TruePredicate>;
defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 0, TruePredicate>;
+ SchedWriteVecALU, 0, TruePredicate>;
defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
- SSE_INTALU_ITINS_P, 0, TruePredicate>;
+ SchedWriteVecALU, 0, TruePredicate>;
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Shuffle Instructions
//===---------------------------------------------------------------------===//
-let Sched = WriteShuffle in
-def SSE_PSHUF : OpndItins<
- IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI
->;
-
let ExeDomain = SSEPackedInt in {
multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
- SDNode OpNode, OpndItins itins, Predicate prd> {
+ SDNode OpNode, X86SchedWriteWidths sched,
+ Predicate prd> {
let Predicates = [HasAVX, prd] in {
def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, u8imm:$src2),
!strconcat("v", OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
- (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
- itins.rr>, VEX, Sched<[itins.Sched]>, VEX_WIG;
+ (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>,
+ VEX, Sched<[sched.XMM]>, VEX_WIG;
def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1, u8imm:$src2),
!strconcat("v", OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)),
- (i8 imm:$src2))))], itins.rm>, VEX,
- Sched<[itins.Sched.Folded]>, VEX_WIG;
+ (i8 imm:$src2))))]>, VEX,
+ Sched<[sched.XMM.Folded]>, VEX_WIG;
}
let Predicates = [HasAVX2, prd] in {
@@ -3987,16 +3593,16 @@ let Predicates = [HasAVX2, prd] in {
!strconcat("v", OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
- (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))],
- itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>, VEX_WIG;
+ (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))]>,
+ VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
(ins i256mem:$src1, u8imm:$src2),
!strconcat("v", OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)),
- (i8 imm:$src2))))], itins.rm>, VEX, VEX_L,
- Sched<[itins.Sched.Folded]>, VEX_WIG;
+ (i8 imm:$src2))))]>, VEX, VEX_L,
+ Sched<[sched.YMM.Folded]>, VEX_WIG;
}
let Predicates = [UseSSE2] in {
@@ -4004,27 +3610,27 @@ let Predicates = [UseSSE2] in {
(outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128:$dst,
- (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
- itins.rr>, Sched<[itins.Sched]>;
+ [(set VR128:$dst,
+ (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>,
+ Sched<[sched.XMM]>;
def mi : Ii8<0x70, MRMSrcMem,
(outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128:$dst,
- (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
- (i8 imm:$src2))))], itins.rm>,
- Sched<[itins.Sched.Folded]>;
+ [(set VR128:$dst,
+ (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
+ (i8 imm:$src2))))]>,
+ Sched<[sched.XMM.Folded]>;
}
}
} // ExeDomain = SSEPackedInt
-defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd, SSE_PSHUF,
- NoVLX>, PD;
-defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw, SSE_PSHUF,
- NoVLX_Or_NoBWI>, XS;
-defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw, SSE_PSHUF,
- NoVLX_Or_NoBWI>, XD;
+defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd,
+ SchedWriteShuffle, NoVLX>, PD;
+defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
+ SchedWriteShuffle, NoVLX_Or_NoBWI>, XS;
+defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
+ SchedWriteShuffle, NoVLX_Or_NoBWI>, XD;
//===---------------------------------------------------------------------===//
// Packed Integer Pack Instructions (SSE & AVX)
@@ -4033,8 +3639,8 @@ defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw, SSE_PSHUF,
let ExeDomain = SSEPackedInt in {
multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
ValueType ArgVT, SDNode OpNode, RegisterClass RC,
- X86MemOperand x86memop, OpndItins itins, PatFrag ld_frag,
- bit Is2Addr = 1> {
+ X86MemOperand x86memop, X86FoldableSchedWrite sched,
+ PatFrag ld_frag, bit Is2Addr = 1> {
def rr : PDI<opc, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
@@ -4042,8 +3648,8 @@ multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst,
- (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))],
- itins.rr>, Sched<[itins.Sched]>;
+ (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
+ Sched<[sched]>;
def rm : PDI<opc, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
@@ -4052,14 +3658,14 @@ multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst,
(OutVT (OpNode (ArgVT RC:$src1),
- (bitconvert (ld_frag addr:$src2)))))],
- itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (ld_frag addr:$src2)))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
ValueType ArgVT, SDNode OpNode, RegisterClass RC,
- X86MemOperand x86memop, OpndItins itins, PatFrag ld_frag,
- bit Is2Addr = 1> {
+ X86MemOperand x86memop, X86FoldableSchedWrite sched,
+ PatFrag ld_frag, bit Is2Addr = 1> {
def rr : SS48I<opc, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
@@ -4067,8 +3673,8 @@ multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst,
- (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))],
- itins.rr>, Sched<[itins.Sched]>;
+ (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
+ Sched<[sched]>;
def rm : SS48I<opc, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
@@ -4077,49 +3683,53 @@ multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst,
(OutVT (OpNode (ArgVT RC:$src1),
- (bitconvert (ld_frag addr:$src2)))))],
- itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (ld_frag addr:$src2)))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128,
- i128mem, SSE_PACK, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+ VEX_4V, VEX_WIG;
defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128,
- i128mem, SSE_PACK, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+ VEX_4V, VEX_WIG;
defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128,
- i128mem, SSE_PACK, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+ VEX_4V, VEX_WIG;
defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128,
- i128mem, SSE_PACK, loadv2i64, 0>, VEX_4V;
+ i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+ VEX_4V;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
- defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss,
- VR256, i256mem, SSE_PACK, loadv4i64, 0>,
+ defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256,
+ i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
VEX_4V, VEX_L, VEX_WIG;
- defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss,
- VR256, i256mem, SSE_PACK, loadv4i64, 0>,
+ defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256,
+ i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
VEX_4V, VEX_L, VEX_WIG;
- defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus,
- VR256,i256mem, SSE_PACK, loadv4i64, 0>,
+ defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256,
+ i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
VEX_4V, VEX_L, VEX_WIG;
- defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus,
- VR256, i256mem, SSE_PACK, loadv4i64, 0>,
+ defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256,
+ i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
VEX_4V, VEX_L;
}
let Constraints = "$src1 = $dst" in {
defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128,
- i128mem, SSE_PACK, memopv2i64>;
+ i128mem, SchedWriteShuffle.XMM, memopv2i64>;
defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128,
- i128mem, SSE_PACK, memopv2i64>;
+ i128mem, SchedWriteShuffle.XMM, memopv2i64>;
defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128,
- i128mem, SSE_PACK, memopv2i64>;
+ i128mem, SchedWriteShuffle.XMM, memopv2i64>;
defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128,
- i128mem, SSE_PACK, memopv2i64>;
+ i128mem, SchedWriteShuffle.XMM, memopv2i64>;
}
} // ExeDomain = SSEPackedInt
@@ -4127,107 +3737,106 @@ let Constraints = "$src1 = $dst" in {
// SSE2 - Packed Integer Unpack Instructions
//===---------------------------------------------------------------------===//
-let Sched = WriteShuffle in
-def SSE_PUNPCK : OpndItins<
- IIC_SSE_UNPCK, IIC_SSE_UNPCK
->;
-
let ExeDomain = SSEPackedInt in {
multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
SDNode OpNode, RegisterClass RC, X86MemOperand x86memop,
- OpndItins itins, PatFrag ld_frag, bit Is2Addr = 1> {
+ X86FoldableSchedWrite sched, PatFrag ld_frag,
+ bit Is2Addr = 1> {
def rr : PDI<opc, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))],
- itins.rr>, Sched<[itins.Sched]>;
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
+ Sched<[sched]>;
def rm : PDI<opc, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (vt (OpNode RC:$src1,
- (bitconvert (ld_frag addr:$src2)))))],
- itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (ld_frag addr:$src2)))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128,
- i128mem, SSE_PUNPCK, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+ VEX_4V, VEX_WIG;
defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128,
- i128mem, SSE_PUNPCK, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+ VEX_4V, VEX_WIG;
defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128,
- i128mem, SSE_PUNPCK, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+ VEX_4V, VEX_WIG;
defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128,
- i128mem, SSE_PUNPCK, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+ VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX, NoVLX] in {
defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128,
- i128mem, SSE_PUNPCK, loadv2i64, 0>,
+ i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
VEX_4V, VEX_WIG;
defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128,
- i128mem, SSE_PUNPCK, loadv2i64, 0>,
+ i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
VEX_4V, VEX_WIG;
defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128,
- i128mem, SSE_PUNPCK, loadv2i64, 0>,
+ i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
VEX_4V, VEX_WIG;
defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128,
- i128mem, SSE_PUNPCK, loadv2i64, 0>,
+ i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
defm VPUNPCKLBWY : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256,
- i256mem, SSE_PUNPCK, loadv4i64, 0>,
+ i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
VEX_4V, VEX_L, VEX_WIG;
defm VPUNPCKLWDY : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256,
- i256mem, SSE_PUNPCK, loadv4i64, 0>,
+ i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
VEX_4V, VEX_L, VEX_WIG;
defm VPUNPCKHBWY : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256,
- i256mem, SSE_PUNPCK, loadv4i64, 0>,
+ i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
VEX_4V, VEX_L, VEX_WIG;
defm VPUNPCKHWDY : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256,
- i256mem, SSE_PUNPCK, loadv4i64, 0>,
+ i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
VEX_4V, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX] in {
defm VPUNPCKLDQY : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256,
- i256mem, SSE_PUNPCK, loadv4i64, 0>,
+ i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
VEX_4V, VEX_L, VEX_WIG;
defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256,
- i256mem, SSE_PUNPCK, loadv4i64, 0>,
+ i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
VEX_4V, VEX_L, VEX_WIG;
defm VPUNPCKHDQY : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256,
- i256mem, SSE_PUNPCK, loadv4i64, 0>,
+ i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
VEX_4V, VEX_L, VEX_WIG;
defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256,
- i256mem, SSE_PUNPCK, loadv4i64, 0>,
+ i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128,
- i128mem, SSE_PUNPCK, memopv2i64>;
+ i128mem, SchedWriteShuffle.XMM, memopv2i64>;
defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128,
- i128mem, SSE_PUNPCK, memopv2i64>;
+ i128mem, SchedWriteShuffle.XMM, memopv2i64>;
defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128,
- i128mem, SSE_PUNPCK, memopv2i64>;
+ i128mem, SchedWriteShuffle.XMM, memopv2i64>;
defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128,
- i128mem, SSE_PUNPCK, memopv2i64>;
+ i128mem, SchedWriteShuffle.XMM, memopv2i64>;
defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128,
- i128mem, SSE_PUNPCK, memopv2i64>;
+ i128mem, SchedWriteShuffle.XMM, memopv2i64>;
defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128,
- i128mem, SSE_PUNPCK, memopv2i64>;
+ i128mem, SchedWriteShuffle.XMM, memopv2i64>;
defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128,
- i128mem, SSE_PUNPCK, memopv2i64>;
+ i128mem, SchedWriteShuffle.XMM, memopv2i64>;
defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128,
- i128mem, SSE_PUNPCK, memopv2i64>;
+ i128mem, SchedWriteShuffle.XMM, memopv2i64>;
}
} // ExeDomain = SSEPackedInt
@@ -4237,41 +3846,41 @@ let Constraints = "$src1 = $dst" in {
let ExeDomain = SSEPackedInt in {
multiclass sse2_pinsrw<bit Is2Addr = 1> {
- def rri : Ii8<0xC4, MRMSrcReg,
+ def rr : Ii8<0xC4, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1,
GR32orGR64:$src2, u8imm:$src3),
!if(Is2Addr,
"pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
"vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
- (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))],
- IIC_SSE_PINSRW>, Sched<[WriteShuffle]>;
- def rmi : Ii8<0xC4, MRMSrcMem,
- (outs VR128:$dst), (ins VR128:$src1,
- i16mem:$src2, u8imm:$src3),
+ (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
+ Sched<[WriteVecInsert]>;
+ def rm : Ii8<0xC4, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1,
+ i16mem:$src2, u8imm:$src3),
!if(Is2Addr,
"pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
"vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
- imm:$src3))], IIC_SSE_PINSRW>,
- Sched<[WriteShuffleLd, ReadAfterLd]>;
+ imm:$src3))]>,
+ Sched<[WriteVecInsertLd, ReadAfterLd]>;
}
// Extract
let Predicates = [HasAVX, NoBWI] in
-def VPEXTRWri : Ii8<0xC5, MRMSrcReg,
+def VPEXTRWrr : Ii8<0xC5, MRMSrcReg,
(outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
"vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
- imm:$src2))]>, PD, VEX,
- Sched<[WriteShuffle]>;
-def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
+ imm:$src2))]>,
+ PD, VEX, Sched<[WriteVecExtract]>;
+def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
(outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
"pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
- imm:$src2))], IIC_SSE_PEXTRW>,
- Sched<[WriteShuffleLd, ReadAfterLd]>;
+ imm:$src2))]>,
+ Sched<[WriteVecExtract]>;
// Insert
let Predicates = [HasAVX, NoBWI] in
@@ -4286,26 +3895,26 @@ defm PINSRW : sse2_pinsrw, PD;
// SSE2 - Packed Mask Creation
//===---------------------------------------------------------------------===//
-let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {
+let ExeDomain = SSEPackedInt in {
def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
(ins VR128:$src),
"pmovmskb\t{$src, $dst|$dst, $src}",
- [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))],
- IIC_SSE_MOVMSK>, VEX, VEX_WIG;
+ [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
+ Sched<[WriteVecMOVMSK]>, VEX, VEX_WIG;
let Predicates = [HasAVX2] in {
def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
(ins VR256:$src),
"pmovmskb\t{$src, $dst|$dst, $src}",
[(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
- VEX, VEX_L, VEX_WIG;
+ Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, VEX_WIG;
}
def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
"pmovmskb\t{$src, $dst|$dst, $src}",
- [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))],
- IIC_SSE_MOVMSK>;
+ [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
+ Sched<[WriteVecMOVMSK]>;
} // ExeDomain = SSEPackedInt
@@ -4313,31 +3922,28 @@ def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
// SSE2 - Conditional Store
//===---------------------------------------------------------------------===//
-let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {
-
+let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in
def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
(ins VR128:$src, VR128:$mask),
"maskmovdqu\t{$mask, $src|$src, $mask}",
- [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
- IIC_SSE_MASKMOV>, VEX, VEX_WIG;
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
+ VEX, VEX_WIG;
let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
(ins VR128:$src, VR128:$mask),
"maskmovdqu\t{$mask, $src|$src, $mask}",
- [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
- IIC_SSE_MASKMOV>, VEX, VEX_WIG;
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
+ VEX, VEX_WIG;
let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
"maskmovdqu\t{$mask, $src|$src, $mask}",
- [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
- IIC_SSE_MASKMOV>;
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
"maskmovdqu\t{$mask, $src|$src, $mask}",
- [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
- IIC_SSE_MASKMOV>;
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;
} // ExeDomain = SSEPackedInt
@@ -4350,55 +3956,54 @@ def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
//
let ExeDomain = SSEPackedInt in {
def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
- VEX, Sched<[WriteMove]>;
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (scalar_to_vector GR32:$src)))]>,
+ VEX, Sched<[WriteVecMoveFromGpr]>;
def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
- IIC_SSE_MOVDQ>,
- VEX, Sched<[WriteLoad]>;
-def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
- "movq\t{$src, $dst|$dst, $src}",
+ "movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2i64 (scalar_to_vector GR64:$src)))],
- IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
+ VEX, Sched<[WriteVecLoad]>;
+def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (scalar_to_vector GR64:$src)))]>,
+ VEX, Sched<[WriteVecMoveFromGpr]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
- "movq\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteLoad]>;
+ "movq\t{$src, $dst|$dst, $src}", []>,
+ VEX, Sched<[WriteVecLoad]>;
let isCodeGenOnly = 1 in
def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
- "movq\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (bitconvert GR64:$src))],
- IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert GR64:$src))]>,
+ VEX, Sched<[WriteVecMoveFromGpr]>;
def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
- Sched<[WriteMove]>;
+ (v4i32 (scalar_to_vector GR32:$src)))]>,
+ Sched<[WriteVecMoveFromGpr]>;
def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
- IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
+ Sched<[WriteVecLoad]>;
def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2i64 (scalar_to_vector GR64:$src)))],
- IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+ (v2i64 (scalar_to_vector GR64:$src)))]>,
+ Sched<[WriteVecMoveFromGpr]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
- "movq\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+ "movq\t{$src, $dst|$dst, $src}", []>,
+ Sched<[WriteVecLoad]>;
let isCodeGenOnly = 1 in
def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (bitconvert GR64:$src))],
- IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+ [(set FR64:$dst, (bitconvert GR64:$src))]>,
+ Sched<[WriteVecMoveFromGpr]>;
} // ExeDomain = SSEPackedInt
//===---------------------------------------------------------------------===//
@@ -4407,23 +4012,22 @@ def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (bitconvert GR32:$src))],
- IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+ [(set FR32:$dst, (bitconvert GR32:$src))]>,
+ VEX, Sched<[WriteVecMoveFromGpr]>;
def VMOVDI2SSrm : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
- IIC_SSE_MOVDQ>,
- VEX, Sched<[WriteLoad]>;
+ [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>,
+ VEX, Sched<[WriteVecLoad]>;
def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (bitconvert GR32:$src))],
- IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+ [(set FR32:$dst, (bitconvert GR32:$src))]>,
+ Sched<[WriteVecMoveFromGpr]>;
def MOVDI2SSrm : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
- IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+ [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>,
+ Sched<[WriteVecLoad]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
//===---------------------------------------------------------------------===//
@@ -4431,55 +4035,54 @@ let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
//
let ExeDomain = SSEPackedInt in {
def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
- (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX,
- Sched<[WriteMove]>;
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
+ (iPTR 0)))]>, VEX,
+ Sched<[WriteVecMoveToGpr]>;
def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs),
- (ins i32mem:$dst, VR128:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(store (i32 (extractelt (v4i32 VR128:$src),
- (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
- VEX, Sched<[WriteStore]>;
+ (ins i32mem:$dst, VR128:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(store (i32 (extractelt (v4i32 VR128:$src),
+ (iPTR 0))), addr:$dst)]>,
+ VEX, Sched<[WriteVecStore]>;
def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (extractelt (v4i32 VR128:$src),
- (iPTR 0)))], IIC_SSE_MOVD_ToGP>,
- Sched<[WriteMove]>;
+ (iPTR 0)))]>,
+ Sched<[WriteVecMoveToGpr]>;
def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
"movd\t{$src, $dst|$dst, $src}",
[(store (i32 (extractelt (v4i32 VR128:$src),
- (iPTR 0))), addr:$dst)],
- IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+ (iPTR 0))), addr:$dst)]>,
+ Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt
+
//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int first element to Doubleword Int
//
let ExeDomain = SSEPackedInt in {
-let SchedRW = [WriteMove] in {
+let SchedRW = [WriteVecMoveToGpr] in {
def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (extractelt (v2i64 VR128:$src),
- (iPTR 0)))],
- IIC_SSE_MOVD_ToGP>,
+ (iPTR 0)))]>,
VEX;
def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (extractelt (v2i64 VR128:$src),
- (iPTR 0)))],
- IIC_SSE_MOVD_ToGP>;
+ (iPTR 0)))]>;
} //SchedRW
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs),
(ins i64mem:$dst, VR128:$src),
- "movq\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
+ "movq\t{$src, $dst|$dst, $src}", []>,
+ VEX, Sched<[WriteVecStore]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
- "movq\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+ "movq\t{$src, $dst|$dst, $src}", []>,
+ Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt
//===---------------------------------------------------------------------===//
@@ -4490,28 +4093,28 @@ let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
- VEX, Sched<[WriteLoad]>;
+ VEX, Sched<[WriteVecLoad]>;
def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (bitconvert FR64:$src))],
- IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+ [(set GR64:$dst, (bitconvert FR64:$src))]>,
+ VEX, Sched<[WriteVecMoveToGpr]>;
def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
- IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
+ [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>,
+ VEX, Sched<[WriteVecStore]>;
def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))],
- IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+ [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
+ Sched<[WriteVecLoad]>;
def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (bitconvert FR64:$src))],
- IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
+ [(set GR64:$dst, (bitconvert FR64:$src))]>,
+ Sched<[WriteVecMoveToGpr]>;
def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
- IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+ [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>,
+ Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
//===---------------------------------------------------------------------===//
@@ -4520,79 +4123,67 @@ let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (bitconvert FR32:$src))],
- IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>;
+ [(set GR32:$dst, (bitconvert FR32:$src))]>,
+ VEX, Sched<[WriteVecMoveToGpr]>;
def VMOVSS2DImr : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
- IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
+ [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>,
+ VEX, Sched<[WriteVecStore]>;
def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (bitconvert FR32:$src))],
- IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
+ [(set GR32:$dst, (bitconvert FR32:$src))]>,
+ Sched<[WriteVecMoveToGpr]>;
def MOVSS2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
- IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+ [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>,
+ Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
let Predicates = [UseAVX] in {
- let AddedComplexity = 15 in {
- def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
- (VMOVDI2PDIrr GR32:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+ (VMOVDI2PDIrr GR32:$src)>;
- def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
- (VMOV64toPQIrr GR64:$src)>;
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
+ (VMOV64toPQIrr GR64:$src)>;
- def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
- (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (VMOV64toPQIrr GR64:$src), sub_xmm)>;
- }
+ def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+ (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
+ (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIrr GR64:$src)), sub_xmm)>;
// AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
// These instructions also write zeros in the high part of a 256-bit register.
- let AddedComplexity = 20 in {
- def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
- (VMOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
- (VMOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
- (VMOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
- (VMOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzload addr:$src)),
- (VMOVDI2PDIrm addr:$src)>;
- def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
- (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>;
- def : Pat<(v8i32 (X86vzload addr:$src)),
- (SUBREG_TO_REG (i64 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>;
- }
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
+ (VMOVDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+ (VMOVDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+ (VMOVDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzload addr:$src)),
+ (VMOVDI2PDIrm addr:$src)>;
+ def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
+ def : Pat<(v8i32 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
// Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
(v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src), sub_xmm)>;
+ (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrr GR32:$src)), sub_xmm)>;
}
let Predicates = [UseSSE2] in {
- let AddedComplexity = 15 in {
- def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
- (MOVDI2PDIrr GR32:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+ (MOVDI2PDIrr GR32:$src)>;
- def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
- (MOV64toPQIrr GR64:$src)>;
- }
- let AddedComplexity = 20 in {
- def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
- (MOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
- (MOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
- (MOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
- (MOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzload addr:$src)),
- (MOVDI2PDIrm addr:$src)>;
- }
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
+ (MOV64toPQIrr GR64:$src)>;
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
+ (MOVDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+ (MOVDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+ (MOVDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzload addr:$src)),
+ (MOVDI2PDIrm addr:$src)>;
}
// Before the MC layer of LLVM existed, clang emitted "movd" assembly instead of
@@ -4616,7 +4207,7 @@ def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
// Move Quadword Int to Packed Quadword Int
//
-let ExeDomain = SSEPackedInt, SchedRW = [WriteLoad] in {
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in {
def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -4625,34 +4216,32 @@ def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2i64 (scalar_to_vector (loadi64 addr:$src))))],
- IIC_SSE_MOVDQ>, XS,
- Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
+ XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
} // ExeDomain, SchedRW
//===---------------------------------------------------------------------===//
// Move Packed Quadword Int to Quadword Int
//
-let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {
def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
- "movq\t{$src, $dst|$dst, $src}",
- [(store (i64 (extractelt (v2i64 VR128:$src),
- (iPTR 0))), addr:$dst)],
- IIC_SSE_MOVDQ>, VEX, VEX_WIG;
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (i64 (extractelt (v2i64 VR128:$src),
+ (iPTR 0))), addr:$dst)]>,
+ VEX, VEX_WIG;
def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
[(store (i64 (extractelt (v2i64 VR128:$src),
- (iPTR 0))), addr:$dst)],
- IIC_SSE_MOVDQ>;
+ (iPTR 0))), addr:$dst)]>;
} // ExeDomain, SchedRW
// For disassembler only
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
- SchedRW = [WriteVecLogic] in {
+ SchedRW = [SchedWriteVecLogic.XMM] in {
def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
- "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, VEX, VEX_WIG;
+ "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_WIG;
def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
- "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>;
+ "movq\t{$src, $dst|$dst, $src}", []>;
}
// Aliases to help the assembler pick two byte VEX encodings by swapping the
@@ -4660,29 +4249,26 @@ def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
def : InstAlias<"vmovq\t{$src, $dst|$dst, $src}",
(VMOVPQI2QIrr VR128L:$dst, VR128H:$src), 0>;
-let Predicates = [UseAVX], AddedComplexity = 20 in {
- def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
- (VMOVQI2PQIrm addr:$src)>;
+def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
+ (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
+ (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
+
+let Predicates = [UseAVX] in {
def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
(VMOVQI2PQIrm addr:$src)>;
- def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
- (VMOVQI2PQIrm addr:$src)>;
def : Pat<(v2i64 (X86vzload addr:$src)),
(VMOVQI2PQIrm addr:$src)>;
def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
(v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (VMOVQI2PQIrm addr:$src), sub_xmm)>;
+ (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
def : Pat<(v4i64 (X86vzload addr:$src)),
- (SUBREG_TO_REG (i64 0), (VMOVQI2PQIrm addr:$src), sub_xmm)>;
+ (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
}
-let Predicates = [UseSSE2], AddedComplexity = 20 in {
- def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
- (MOVQI2PQIrm addr:$src)>;
+let Predicates = [UseSSE2] in {
def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
(MOVQI2PQIrm addr:$src)>;
- def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
- (MOVQI2PQIrm addr:$src)>;
def : Pat<(v2i64 (X86vzload addr:$src)), (MOVQI2PQIrm addr:$src)>;
}
@@ -4690,62 +4276,61 @@ let Predicates = [UseSSE2], AddedComplexity = 20 in {
// Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
// IA32 document. movq xmm1, xmm2 does clear the high bits.
//
-let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {
-let AddedComplexity = 15 in
+let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vmovq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
- IIC_SSE_MOVQ_RR>,
- XS, VEX, Requires<[UseAVX]>, VEX_WIG;
-let AddedComplexity = 15 in
+ [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
+ XS, VEX, Requires<[UseAVX]>, VEX_WIG;
def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
- IIC_SSE_MOVQ_RR>,
- XS, Requires<[UseSSE2]>;
+ [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
+ XS, Requires<[UseSSE2]>;
} // ExeDomain, SchedRW
-let AddedComplexity = 20 in {
- let Predicates = [UseAVX] in {
- def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
- (VMOVZPQILo2PQIrr VR128:$src)>;
- }
- let Predicates = [UseSSE2] in {
- def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
- (MOVZPQILo2PQIrr VR128:$src)>;
- }
+let Predicates = [UseAVX] in {
+ def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+ (VMOVZPQILo2PQIrr VR128:$src)>;
+}
+let Predicates = [UseSSE2] in {
+ def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+ (MOVZPQILo2PQIrr VR128:$src)>;
}
//===---------------------------------------------------------------------===//
// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
//===---------------------------------------------------------------------===//
+
multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
ValueType vt, RegisterClass RC, PatFrag mem_frag,
- X86MemOperand x86memop> {
+ X86MemOperand x86memop, X86FoldableSchedWrite sched> {
def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (vt (OpNode RC:$src)))],
- IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
+ [(set RC:$dst, (vt (OpNode RC:$src)))]>,
+ Sched<[sched]>;
def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (OpNode (mem_frag addr:$src)))],
- IIC_SSE_MOV_LH>, Sched<[WriteLoad]>;
+ [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>,
+ Sched<[sched.Folded]>;
}
let Predicates = [HasAVX, NoVLX] in {
defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
- v4f32, VR128, loadv4f32, f128mem>, VEX, VEX_WIG;
+ v4f32, VR128, loadv4f32, f128mem,
+ SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
- v4f32, VR128, loadv4f32, f128mem>, VEX, VEX_WIG;
+ v4f32, VR128, loadv4f32, f128mem,
+ SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
- v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L, VEX_WIG;
+ v8f32, VR256, loadv8f32, f256mem,
+ SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
- v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L, VEX_WIG;
+ v8f32, VR256, loadv8f32, f256mem,
+ SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
}
defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
- memopv4f32, f128mem>;
+ memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
- memopv4f32, f128mem>;
+ memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4i32 (X86Movshdup VR128:$src)),
@@ -4781,44 +4366,40 @@ let Predicates = [UseSSE3] in {
// SSE3 - Replicate Double FP - MOVDDUP
//===---------------------------------------------------------------------===//
-// FIXME: Improve MOVDDUP/BROADCAST reg/mem scheduling itineraries.
-let Sched = WriteFShuffle in
-def SSE_MOVDDUP : OpndItins<
- IIC_SSE_MOV_LH, IIC_SSE_MOV_LH
->;
-
-multiclass sse3_replicate_dfp<string OpcodeStr> {
+multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> {
def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))],
- IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
+ [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>,
+ Sched<[sched.XMM]>;
def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst,
(v2f64 (X86Movddup
- (scalar_to_vector (loadf64 addr:$src)))))],
- IIC_SSE_MOV_LH>, Sched<[WriteLoad]>;
+ (scalar_to_vector (loadf64 addr:$src)))))]>,
+ Sched<[sched.XMM.Folded]>;
}
-// FIXME: Merge with above classe when there're patterns for the ymm version
-multiclass sse3_replicate_dfp_y<string OpcodeStr> {
+// FIXME: Merge with above classes when there are patterns for the ymm version
+multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> {
def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
- Sched<[WriteFShuffle]>;
+ Sched<[sched.YMM]>;
def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst,
(v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
- Sched<[WriteLoad]>;
+ Sched<[sched.YMM.Folded]>;
}
let Predicates = [HasAVX, NoVLX] in {
- defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX, VEX_WIG;
- defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L, VEX_WIG;
+ defm VMOVDDUP : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>,
+ VEX, VEX_WIG;
+ defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>,
+ VEX, VEX_L, VEX_WIG;
}
-defm MOVDDUP : sse3_replicate_dfp<"movddup">;
+defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;
let Predicates = [HasAVX, NoVLX] in {
@@ -4836,152 +4417,149 @@ let Predicates = [UseSSE3] in {
// SSE3 - Move Unaligned Integer
//===---------------------------------------------------------------------===//
-let SchedRW = [WriteLoad] in {
let Predicates = [HasAVX] in {
def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
- "vlddqu\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX, VEX_WIG;
+ "vlddqu\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
+ Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
- "vlddqu\t{$src, $dst|$dst, $src}",
- [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
- VEX, VEX_L, VEX_WIG;
-}
+ "vlddqu\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
+ Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, VEX_WIG;
+} // Predicates
+
def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"lddqu\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))],
- IIC_SSE_LDDQU>;
-}
+ [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
+ Sched<[SchedWriteVecMoveLS.XMM.RM]>;
//===---------------------------------------------------------------------===//
// SSE3 - Arithmetic
//===---------------------------------------------------------------------===//
multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC,
- X86MemOperand x86memop, OpndItins itins,
+ X86MemOperand x86memop, X86FoldableSchedWrite sched,
PatFrag ld_frag, bit Is2Addr = 1> {
def rr : I<0xD0, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))], itins.rr>,
- Sched<[itins.Sched]>;
+ [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>,
+ Sched<[sched]>;
def rm : I<0xD0, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))],
- itins.rr>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem,
- SSE_ALU_F32P, loadv4f32, 0>, XD, VEX_4V,
- VEX_WIG;
+ SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>,
+ XD, VEX_4V, VEX_WIG;
defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem,
- SSE_ALU_F32P, loadv8f32, 0>, XD, VEX_4V,
- VEX_L, VEX_WIG;
+ SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>,
+ XD, VEX_4V, VEX_L, VEX_WIG;
}
let ExeDomain = SSEPackedDouble in {
defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem,
- SSE_ALU_F64P, loadv2f64, 0>, PD, VEX_4V,
- VEX_WIG;
+ SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>,
+ PD, VEX_4V, VEX_WIG;
defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem,
- SSE_ALU_F64P, loadv4f64, 0>, PD, VEX_4V,
- VEX_L, VEX_WIG;
+ SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>,
+ PD, VEX_4V, VEX_L, VEX_WIG;
}
}
let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
let ExeDomain = SSEPackedSingle in
- defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem, SSE_ALU_F32P,
- memopv4f32>, XD;
+ defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem,
+ SchedWriteFAddSizes.PS.XMM, memopv4f32>, XD;
let ExeDomain = SSEPackedDouble in
- defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem, SSE_ALU_F64P,
- memopv2f64>, PD;
+ defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem,
+ SchedWriteFAddSizes.PD.XMM, memopv2f64>, PD;
}
//===---------------------------------------------------------------------===//
// SSE3 Instructions
//===---------------------------------------------------------------------===//
-let Sched = WriteFHAdd in
-def SSE_HADDSUB : OpndItins<
- IIC_SSE_HADDSUB_RR, IIC_SSE_HADDSUB_RM
->;
-
// Horizontal ops
multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
- X86MemOperand x86memop, SDNode OpNode, OpndItins itins,
- PatFrag ld_frag, bit Is2Addr = 1> {
+ X86MemOperand x86memop, SDNode OpNode,
+ X86FoldableSchedWrite sched, PatFrag ld_frag,
+ bit Is2Addr = 1> {
def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr>,
- Sched<[itins.Sched]>;
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
+ Sched<[sched]>;
def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))],
- itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
- X86MemOperand x86memop, SDNode OpNode, OpndItins itins,
- PatFrag ld_frag, bit Is2Addr = 1> {
+ X86MemOperand x86memop, SDNode OpNode,
+ X86FoldableSchedWrite sched, PatFrag ld_frag,
+ bit Is2Addr = 1> {
def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr>,
- Sched<[itins.Sched]>;
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
+ Sched<[sched]>;
def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))],
- itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
- X86fhadd, SSE_HADDSUB, loadv4f32, 0>, VEX_4V, VEX_WIG;
+ X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
- X86fhsub, SSE_HADDSUB, loadv4f32, 0>, VEX_4V, VEX_WIG;
+ X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
- X86fhadd, SSE_HADDSUB, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
+ X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
- X86fhsub, SSE_HADDSUB, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
+ X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
}
let ExeDomain = SSEPackedDouble in {
- defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
- X86fhadd, SSE_HADDSUB, loadv2f64, 0>, VEX_4V, VEX_WIG;
- defm VHSUBPD : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
- X86fhsub, SSE_HADDSUB, loadv2f64, 0>, VEX_4V, VEX_WIG;
- defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
- X86fhadd, SSE_HADDSUB, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
- defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
- X86fhsub, SSE_HADDSUB, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
+ defm VHADDPD : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem,
+ X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
+ defm VHSUBPD : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem,
+ X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
+ defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem,
+ X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
+ defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem,
+ X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
}
}
let Constraints = "$src1 = $dst" in {
let ExeDomain = SSEPackedSingle in {
defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
- SSE_HADDSUB, memopv4f32>;
+ WriteFHAdd, memopv4f32>;
defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
- SSE_HADDSUB, memopv4f32>;
+ WriteFHAdd, memopv4f32>;
}
let ExeDomain = SSEPackedDouble in {
defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
- SSE_HADDSUB, memopv2f64>;
+ WriteFHAdd, memopv2f64>;
defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
- SSE_HADDSUB, memopv2f64>;
+ WriteFHAdd, memopv2f64>;
}
}
@@ -4989,105 +4567,85 @@ let Constraints = "$src1 = $dst" in {
// SSSE3 - Packed Absolute Instructions
//===---------------------------------------------------------------------===//
-let Sched = WriteVecALU in
-def SSE_PABS : OpndItins<
- IIC_SSE_PABS_RR, IIC_SSE_PABS_RM
->;
-
/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
- SDNode OpNode, OpndItins itins, PatFrag ld_frag> {
+ SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> {
def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (vt (OpNode VR128:$src)))],
- itins.rr>, Sched<[itins.Sched]>;
+ [(set VR128:$dst, (vt (OpNode VR128:$src)))]>,
+ Sched<[sched.XMM]>;
def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst,
- (vt (OpNode (bitconvert (ld_frag addr:$src)))))],
- itins.rm>, Sched<[itins.Sched.Folded]>;
+ (vt (OpNode (bitconvert (ld_frag addr:$src)))))]>,
+ Sched<[sched.XMM.Folded]>;
}
/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
- SDNode OpNode, OpndItins itins> {
+ SDNode OpNode, X86SchedWriteWidths sched> {
def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (vt (OpNode VR256:$src)))], itins.rr>,
- Sched<[itins.Sched]>;
+ [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
+ Sched<[sched.YMM]>;
def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
(ins i256mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst,
- (vt (OpNode (bitconvert (loadv4i64 addr:$src)))))], itins.rm>,
- Sched<[itins.Sched.Folded]>;
+ (vt (OpNode (bitconvert (loadv4i64 addr:$src)))))]>,
+ Sched<[sched.YMM.Folded]>;
}
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
- defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SSE_PABS, loadv2i64>, VEX, VEX_WIG;
- defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SSE_PABS, loadv2i64>, VEX, VEX_WIG;
+ defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU,
+ loadv2i64>, VEX, VEX_WIG;
+ defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU,
+ loadv2i64>, VEX, VEX_WIG;
}
let Predicates = [HasAVX, NoVLX] in {
- defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SSE_PABS, loadv2i64>, VEX, VEX_WIG;
+ defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU,
+ loadv2i64>, VEX, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
- defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SSE_PABS>, VEX, VEX_L, VEX_WIG;
- defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SSE_PABS>, VEX, VEX_L, VEX_WIG;
+ defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
+ VEX, VEX_L, VEX_WIG;
+ defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>,
+ VEX, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX] in {
- defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SSE_PABS>, VEX, VEX_L, VEX_WIG;
+ defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>,
+ VEX, VEX_L, VEX_WIG;
}
-defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SSE_PABS, memopv2i64>;
-defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SSE_PABS, memopv2i64>;
-defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SSE_PABS, memopv2i64>;
+defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU,
+ memopv2i64>;
+defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU,
+ memopv2i64>;
+defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU,
+ memopv2i64>;
//===---------------------------------------------------------------------===//
// SSSE3 - Packed Binary Operator Instructions
//===---------------------------------------------------------------------===//
-let Sched = WritePHAdd in {
-def SSE_PHADDSUBD : OpndItins<
- IIC_SSE_PHADDSUBD_RR, IIC_SSE_PHADDSUBD_RM
->;
-def SSE_PHADDSUBSW : OpndItins<
- IIC_SSE_PHADDSUBSW_RR, IIC_SSE_PHADDSUBSW_RM
->;
-def SSE_PHADDSUBW : OpndItins<
- IIC_SSE_PHADDSUBW_RR, IIC_SSE_PHADDSUBW_RM
->;
-}
-let Sched = WriteShuffle in
-def SSE_PSHUFB : OpndItins<
- IIC_SSE_PSHUFB_RR, IIC_SSE_PSHUFB_RM
->;
-let Sched = WriteVecALU in
-def SSE_PSIGN : OpndItins<
- IIC_SSE_PSIGN_RR, IIC_SSE_PSIGN_RM
->;
-let Sched = WriteVecIMul in
-def SSE_PMULHRSW : OpndItins<
- IIC_SSE_PMULHRSW, IIC_SSE_PMULHRSW
->;
-
/// SS3I_binop_rm - Simple SSSE3 bin op
multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType DstVT, ValueType OpVT, RegisterClass RC,
PatFrag memop_frag, X86MemOperand x86memop,
- OpndItins itins, bit Is2Addr = 1> {
+ X86FoldableSchedWrite sched, bit Is2Addr = 1> {
let isCommutable = 1 in
def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))], itins.rr>,
- Sched<[itins.Sched]>;
+ [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>,
+ Sched<[sched]>;
def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
@@ -5095,93 +4653,93 @@ multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst,
(DstVT (OpNode (OpVT RC:$src1),
- (bitconvert (memop_frag addr:$src2)))))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (memop_frag addr:$src2)))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
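// Unlike the unary multiclasses above, the binary multiclasses below take a
// single X86FoldableSchedWrite, so the instantiation picks the width
// explicitly. A minimal sketch, reusing only names visible in this patch:
//
//   defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
//                               memopv2i64, i128mem, SchedWritePHAdd.XMM>;
//
// The register form is scheduled on the write class itself; the memory form
// uses its Folded variant plus ReadAfterLd for the register operand.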
/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
- Intrinsic IntId128, OpndItins itins,
+ Intrinsic IntId128, X86FoldableSchedWrite sched,
PatFrag ld_frag, bit Is2Addr = 1> {
let isCommutable = 1 in
- def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+ def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))], itins.rr>,
- Sched<[itins.Sched]>;
- def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+ [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+ Sched<[sched]>;
+ def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set VR128:$dst,
(IntId128 VR128:$src1,
- (bitconvert (ld_frag addr:$src2))))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (ld_frag addr:$src2))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
Intrinsic IntId256,
- X86FoldableSchedWrite Sched> {
+ X86FoldableSchedWrite sched> {
let isCommutable = 1 in
- def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
+ def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
- Sched<[Sched]>;
- def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
+ Sched<[sched]>;
+ def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>,
- Sched<[Sched.Folded, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>;
}
let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
let isCommutable = 0 in {
defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
VR128, loadv2i64, i128mem,
- SSE_PSHUFB, 0>, VEX_4V, VEX_WIG;
+ SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG;
defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
v16i8, VR128, loadv2i64, i128mem,
- SSE_PMADD, 0>, VEX_4V, VEX_WIG;
+ SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
}
defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
VR128, loadv2i64, i128mem,
- SSE_PMULHRSW, 0>, VEX_4V, VEX_WIG;
+ SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
}
let ImmT = NoImm, Predicates = [HasAVX] in {
let isCommutable = 0 in {
defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
loadv2i64, i128mem,
- SSE_PHADDSUBW, 0>, VEX_4V, VEX_WIG;
+ SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
loadv2i64, i128mem,
- SSE_PHADDSUBD, 0>, VEX_4V, VEX_WIG;
+ SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
loadv2i64, i128mem,
- SSE_PHADDSUBW, 0>, VEX_4V, VEX_WIG;
+ SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
loadv2i64, i128mem,
- SSE_PHADDSUBD, 0>, VEX_4V;
+ SchedWritePHAdd.XMM, 0>, VEX_4V;
defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb",
int_x86_ssse3_psign_b_128,
- SSE_PSIGN, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw",
int_x86_ssse3_psign_w_128,
- SSE_PSIGN, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd",
int_x86_ssse3_psign_d_128,
- SSE_PSIGN, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
int_x86_ssse3_phadd_sw_128,
- SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ SchedWritePHAdd.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
int_x86_ssse3_phsub_sw_128,
- SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ SchedWritePHAdd.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
}
}
@@ -5189,42 +4747,42 @@ let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
let isCommutable = 0 in {
defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
VR256, loadv4i64, i256mem,
- SSE_PSHUFB, 0>, VEX_4V, VEX_L, VEX_WIG;
+ SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
v32i8, VR256, loadv4i64, i256mem,
- SSE_PMADD, 0>, VEX_4V, VEX_L, VEX_WIG;
+ SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
}
defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
VR256, loadv4i64, i256mem,
- SSE_PMULHRSW, 0>, VEX_4V, VEX_L, VEX_WIG;
+ SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
}
let ImmT = NoImm, Predicates = [HasAVX2] in {
let isCommutable = 0 in {
defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
VR256, loadv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V, VEX_L, VEX_WIG;
+ SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
loadv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V, VEX_L, VEX_WIG;
+ SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
VR256, loadv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V, VEX_L, VEX_WIG;
+ SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
loadv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
- defm VPSIGNBY : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
- WriteVecALU>, VEX_4V, VEX_L, VEX_WIG;
- defm VPSIGNWY : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
- WriteVecALU>, VEX_4V, VEX_L, VEX_WIG;
- defm VPSIGNDY : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
- WriteVecALU>, VEX_4V, VEX_L, VEX_WIG;
- defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
- int_x86_avx2_phadd_sw,
- WriteVecALU>, VEX_4V, VEX_L, VEX_WIG;
- defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
- int_x86_avx2_phsub_sw,
- WriteVecALU>, VEX_4V, VEX_L, VEX_WIG;
+ SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L;
+ defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
+ SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
+ defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
+ SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
+ defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
+ SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
+ defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
+ int_x86_avx2_phadd_sw,
+ SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
+ defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
+ int_x86_avx2_phsub_sw,
+ SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
}
}
@@ -5232,47 +4790,42 @@ let isCommutable = 0 in {
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
- memopv2i64, i128mem, SSE_PHADDSUBW>;
+ memopv2i64, i128mem, SchedWritePHAdd.XMM>;
defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
- memopv2i64, i128mem, SSE_PHADDSUBD>;
+ memopv2i64, i128mem, SchedWritePHAdd.XMM>;
defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
- memopv2i64, i128mem, SSE_PHADDSUBW>;
+ memopv2i64, i128mem, SchedWritePHAdd.XMM>;
defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
- memopv2i64, i128mem, SSE_PHADDSUBD>;
+ memopv2i64, i128mem, SchedWritePHAdd.XMM>;
defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
- SSE_PSIGN, memopv2i64>;
+ SchedWriteVecALU.XMM, memopv2i64>;
defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
- SSE_PSIGN, memopv2i64>;
+ SchedWriteVecALU.XMM, memopv2i64>;
defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
- SSE_PSIGN, memopv2i64>;
+ SchedWriteVecALU.XMM, memopv2i64>;
defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
- memopv2i64, i128mem, SSE_PSHUFB>;
+ memopv2i64, i128mem, SchedWriteVarShuffle.XMM>;
defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw",
int_x86_ssse3_phadd_sw_128,
- SSE_PHADDSUBSW, memopv2i64>;
+ SchedWritePHAdd.XMM, memopv2i64>;
defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw",
int_x86_ssse3_phsub_sw_128,
- SSE_PHADDSUBSW, memopv2i64>;
+ SchedWritePHAdd.XMM, memopv2i64>;
defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
v16i8, VR128, memopv2i64, i128mem,
- SSE_PMADD>;
+ SchedWriteVecIMul.XMM>;
}
defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
- VR128, memopv2i64, i128mem, SSE_PMULHRSW>;
+ VR128, memopv2i64, i128mem, SchedWriteVecIMul.XMM>;
}
//===---------------------------------------------------------------------===//
// SSSE3 - Packed Align Instruction Patterns
//===---------------------------------------------------------------------===//
-let Sched = WriteShuffle in
-def SSE_PALIGN : OpndItins<
- IIC_SSE_PALIGNRR, IIC_SSE_PALIGNRM
->;
-
multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
PatFrag memop_frag, X86MemOperand x86memop,
- OpndItins itins, bit Is2Addr = 1> {
+ X86FoldableSchedWrite sched, bit Is2Addr = 1> {
let hasSideEffects = 0 in {
def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, u8imm:$src3),
@@ -5280,8 +4833,8 @@ multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 imm:$src3))))],
- itins.rr>, Sched<[itins.Sched]>;
+ [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 imm:$src3))))]>,
+ Sched<[sched]>;
let mayLoad = 1 in
def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u8imm:$src3),
@@ -5291,20 +4844,20 @@ multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set RC:$dst, (VT (X86PAlignr RC:$src1,
(bitconvert (memop_frag addr:$src2)),
- (i8 imm:$src3))))],
- itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i8 imm:$src3))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
- defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, loadv2i64,
- i128mem, SSE_PALIGN, 0>, VEX_4V, VEX_WIG;
+ defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, loadv2i64, i128mem,
+ SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
- defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, loadv4i64,
- i256mem, SSE_PALIGN, 0>, VEX_4V, VEX_L, VEX_WIG;
+ defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, loadv4i64, i256mem,
+ SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
- defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memopv2i64,
- i128mem, SSE_PALIGN>;
+ defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memopv2i64, i128mem,
+ SchedWriteShuffle.XMM>;
//===---------------------------------------------------------------------===//
// SSSE3 - Thread synchronization
@@ -5318,13 +4871,12 @@ def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
}
let Uses = [EAX, ECX, EDX] in
-def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", [], IIC_SSE_MONITOR>,
- TB, Requires<[HasSSE3]>;
+def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
+ TB, Requires<[HasSSE3]>;
let Uses = [ECX, EAX] in
def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
- [(int_x86_sse3_mwait ECX, EAX)], IIC_SSE_MWAIT>,
- TB, Requires<[HasSSE3]>;
+ [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
} // SchedRW
def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
@@ -5340,45 +4892,39 @@ def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>,
//===----------------------------------------------------------------------===//
multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
- RegisterClass OutRC, RegisterClass InRC,
- OpndItins itins> {
+ RegisterClass OutRC, RegisterClass InRC,
+ X86FoldableSchedWrite sched> {
def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [], itins.rr>,
- Sched<[itins.Sched]>;
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
+ Sched<[sched]>;
def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [],
- itins.rm>, Sched<[itins.Sched.Folded]>;
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
+ Sched<[sched.Folded]>;
}
multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
- X86MemOperand MemOp, X86MemOperand MemYOp,
- OpndItins SSEItins, OpndItins AVXItins,
- OpndItins AVX2Itins, Predicate prd> {
- defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128, SSEItins>;
+ X86MemOperand MemOp, X86MemOperand MemYOp,
+ Predicate prd> {
+ defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128,
+ SchedWriteShuffle.XMM>;
let Predicates = [HasAVX, prd] in
defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
- VR128, VR128, AVXItins>, VEX, VEX_WIG;
+ VR128, VR128, SchedWriteShuffle.XMM>,
+ VEX, VEX_WIG;
let Predicates = [HasAVX2, prd] in
defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
- VR256, VR128, AVX2Itins>, VEX, VEX_L, VEX_WIG;
+ VR256, VR128, WriteShuffle256>,
+ VEX, VEX_L, VEX_WIG;
}
multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
X86MemOperand MemYOp, Predicate prd> {
defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
- MemOp, MemYOp,
- SSE_INTALU_ITINS_SHUFF_P,
- DEFAULT_ITINS_SHUFFLESCHED,
- DEFAULT_ITINS_SHUFFLESCHED, prd>;
+ MemOp, MemYOp, prd>;
defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
!strconcat("pmovzx", OpcodeStr),
- MemOp, MemYOp,
- SSE_INTALU_ITINS_SHUFF_P,
- DEFAULT_ITINS_SHUFFLESCHED,
- DEFAULT_ITINS_SHUFFLESCHED, prd>;
+ MemOp, MemYOp, prd>;
}
defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
@@ -5490,7 +5036,7 @@ defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", X86vzext>;
// SSE4.1/AVX patterns.
multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
- SDNode ExtOp, PatFrag ExtLoad16> {
+ SDNode ExtOp> {
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
(!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
@@ -5549,7 +5095,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BDrm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))),
+ def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
(!cast<I>(OpcPrefix#BQrm) addr:$src)>;
def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
(!cast<I>(OpcPrefix#BQrm) addr:$src)>;
@@ -5591,12 +5137,12 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
}
}
-defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec, extloadi32i16>;
-defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec, loadi16_anyext>;
+defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>;
+defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>;
let Predicates = [UseSSE41] in {
- defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec, extloadi32i16>;
- defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec, loadi16_anyext>;
+ defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>;
+ defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>;
}
//===----------------------------------------------------------------------===//
@@ -5611,15 +5157,14 @@ multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
imm:$src2))]>,
- Sched<[WriteShuffle]>;
- let hasSideEffects = 0, mayStore = 1,
- SchedRW = [WriteShuffleLd, WriteRMW] in
+ Sched<[WriteVecExtract]>;
+ let hasSideEffects = 0, mayStore = 1 in
def mr : SS4AIi8<opc, MRMDestMem, (outs),
(ins i8mem:$dst, VR128:$src1, u8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))),
- addr:$dst)]>;
+ addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}
let Predicates = [HasAVX, NoBWI] in
@@ -5634,17 +5179,16 @@ multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
(ins VR128:$src1, u8imm:$src2),
!strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, Sched<[WriteShuffle]>, FoldGenData<NAME#ri>;
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
+ Sched<[WriteVecExtract]>, FoldGenData<NAME#rr>;
- let hasSideEffects = 0, mayStore = 1,
- SchedRW = [WriteShuffleLd, WriteRMW] in
+ let hasSideEffects = 0, mayStore = 1 in
def mr : SS4AIi8<opc, MRMDestMem, (outs),
(ins i16mem:$dst, VR128:$src1, u8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), imm:$src2))),
- addr:$dst)]>;
+ addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}
let Predicates = [HasAVX, NoBWI] in
@@ -5661,14 +5205,13 @@ multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set GR32:$dst,
(extractelt (v4i32 VR128:$src1), imm:$src2))]>,
- Sched<[WriteShuffle]>;
- let SchedRW = [WriteShuffleLd, WriteRMW] in
+ Sched<[WriteVecExtract]>;
def mr : SS4AIi8<opc, MRMDestMem, (outs),
(ins i32mem:$dst, VR128:$src1, u8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(store (extractelt (v4i32 VR128:$src1), imm:$src2),
- addr:$dst)]>;
+ addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}
let Predicates = [HasAVX, NoDQI] in
@@ -5684,14 +5227,13 @@ multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set GR64:$dst,
(extractelt (v2i64 VR128:$src1), imm:$src2))]>,
- Sched<[WriteShuffle]>;
- let SchedRW = [WriteShuffleLd, WriteRMW] in
+ Sched<[WriteVecExtract]>;
def mr : SS4AIi8<opc, MRMDestMem, (outs),
(ins i64mem:$dst, VR128:$src1, u8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(store (extractelt (v2i64 VR128:$src1), imm:$src2),
- addr:$dst)]>;
+ addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}
let Predicates = [HasAVX, NoDQI] in
@@ -5701,28 +5243,26 @@ defm PEXTRQ : SS41I_extract64<0x16, "pextrq">, REX_W;
/// SS41I_extractf32 - SSE 4.1 extract of a 32-bit fp value to an integer
/// register or memory destination
-multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr,
- OpndItins itins = DEFAULT_ITINS> {
+multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
- (ins VR128:$src1, u8imm:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set GR32orGR64:$dst,
- (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))],
- itins.rr>, Sched<[WriteFBlend]>;
- let SchedRW = [WriteFBlendLd, WriteRMW] in
+ (ins VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set GR32orGR64:$dst,
+ (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
+ Sched<[WriteVecExtract]>;
def mr : SS4AIi8<opc, MRMDestMem, (outs),
- (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
- addr:$dst)], itins.rm>;
+ (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
+ addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}
let ExeDomain = SSEPackedSingle in {
let Predicates = [UseAVX] in
defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG;
- defm EXTRACTPS : SS41I_extractf32<0x17, "extractps", SSE_EXTRACT_ITINS>;
+ defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
}
// Also match an EXTRACTPS store when the store is done as f32 instead of i32.
@@ -5750,7 +5290,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
- Sched<[WriteShuffle]>;
+ Sched<[WriteVecInsert]>;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i8mem:$src2, u8imm:$src3),
!if(Is2Addr,
@@ -5759,7 +5299,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(X86pinsrb VR128:$src1, (extloadi8 addr:$src2),
- imm:$src3))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+ imm:$src3))]>, Sched<[WriteVecInsertLd, ReadAfterLd]>;
}
let Predicates = [HasAVX, NoBWI] in
@@ -5776,7 +5316,7 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
- Sched<[WriteShuffle]>;
+ Sched<[WriteVecInsert]>;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i32mem:$src2, u8imm:$src3),
!if(Is2Addr,
@@ -5785,7 +5325,7 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2),
- imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+ imm:$src3)))]>, Sched<[WriteVecInsertLd, ReadAfterLd]>;
}
let Predicates = [HasAVX, NoDQI] in
@@ -5802,7 +5342,7 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
- Sched<[WriteShuffle]>;
+ Sched<[WriteVecInsert]>;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i64mem:$src2, u8imm:$src3),
!if(Is2Addr,
@@ -5811,7 +5351,7 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2),
- imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+ imm:$src3)))]>, Sched<[WriteVecInsertLd, ReadAfterLd]>;
}
let Predicates = [HasAVX, NoDQI] in
@@ -5823,8 +5363,7 @@ let Constraints = "$src1 = $dst" in
// are optimized inserts that won't zero arbitrary elements in the destination
// vector. The next one matches the intrinsic and could zero arbitrary elements
// in the target vector.
-multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
- OpndItins itins = DEFAULT_ITINS> {
+multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, u8imm:$src3),
!if(Is2Addr,
@@ -5832,8 +5371,8 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
- (X86insertps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>,
- Sched<[WriteFShuffle]>;
+ (X86insertps VR128:$src1, VR128:$src2, imm:$src3))]>,
+ Sched<[SchedWriteFShuffle.XMM]>;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, f32mem:$src2, u8imm:$src3),
!if(Is2Addr,
@@ -5843,15 +5382,16 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
[(set VR128:$dst,
(X86insertps VR128:$src1,
(v4f32 (scalar_to_vector (loadf32 addr:$src2))),
- imm:$src3))], itins.rm>,
- Sched<[WriteFShuffleLd, ReadAfterLd]>;
+ imm:$src3))]>,
+ Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>;
}
let ExeDomain = SSEPackedSingle in {
let Predicates = [UseAVX] in
- defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V, VEX_WIG;
+ defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>,
+ VEX_4V, VEX_WIG;
let Constraints = "$src1 = $dst" in
- defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
+ defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
}
let Predicates = [UseAVX] in {
@@ -5869,66 +5409,44 @@ let Predicates = [UseAVX] in {
// SSE4.1 - Round Instructions
//===----------------------------------------------------------------------===//
-multiclass sse41_fp_unop_p<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
+multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
X86MemOperand x86memop, RegisterClass RC,
- ValueType VT32, ValueType VT64,
- PatFrag mem_frag32, PatFrag mem_frag64,
- SDNode OpNode> {
-let ExeDomain = SSEPackedSingle in {
+ ValueType VT, PatFrag mem_frag, SDNode OpNode,
+ X86FoldableSchedWrite sched> {
// Intrinsic operation, reg.
// Vector intrinsic operation, reg
- def PSr : SS4AIi8<opcps, MRMSrcReg,
- (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
- !strconcat(OpcodeStr,
- "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (VT32 (OpNode RC:$src1, imm:$src2)))],
- IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;
-
- // Vector intrinsic operation, mem
- def PSm : SS4AIi8<opcps, MRMSrcMem,
- (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
- !strconcat(OpcodeStr,
- "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst,
- (VT32 (OpNode (mem_frag32 addr:$src1),imm:$src2)))],
- IIC_SSE_ROUNDPS_MEM>, Sched<[WriteFAddLd]>;
-} // ExeDomain = SSEPackedSingle
-
-let ExeDomain = SSEPackedDouble in {
- // Vector intrinsic operation, reg
- def PDr : SS4AIi8<opcpd, MRMSrcReg,
- (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
- !strconcat(OpcodeStr,
- "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (VT64 (OpNode RC:$src1, imm:$src2)))],
- IIC_SSE_ROUNDPD_REG>, Sched<[WriteFAdd]>;
+ def r : SS4AIi8<opc, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (VT (OpNode RC:$src1, imm:$src2)))]>,
+ Sched<[sched]>;
// Vector intrinsic operation, mem
- def PDm : SS4AIi8<opcpd, MRMSrcMem,
- (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
- !strconcat(OpcodeStr,
- "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst,
- (VT64 (OpNode (mem_frag64 addr:$src1),imm:$src2)))],
- IIC_SSE_ROUNDPD_REG>, Sched<[WriteFAddLd]>;
-} // ExeDomain = SSEPackedDouble
+ def m : SS4AIi8<opc, MRMSrcMem,
+ (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (VT (OpNode (mem_frag addr:$src1),imm:$src2)))]>,
+ Sched<[sched.Folded]>;
}
multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
- string OpcodeStr> {
+ string OpcodeStr, X86FoldableSchedWrite sched> {
let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
def SSr : SS4AIi8<opcss, MRMSrcReg,
(outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, Sched<[WriteFAdd]>;
+ []>, Sched<[sched]>;
let mayLoad = 1 in
def SSm : SS4AIi8<opcss, MRMSrcMem,
(outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, Sched<[WriteFAddLd, ReadAfterLd]>;
+ []>, Sched<[sched.Folded, ReadAfterLd]>;
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0
let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
@@ -5936,32 +5454,32 @@ let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
(outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, Sched<[WriteFAdd]>;
+ []>, Sched<[sched]>;
let mayLoad = 1 in
def SDm : SS4AIi8<opcsd, MRMSrcMem,
(outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, Sched<[WriteFAddLd, ReadAfterLd]>;
+ []>, Sched<[sched.Folded, ReadAfterLd]>;
} // ExeDomain = SSEPackedDouble, hasSideEffects = 0
}
multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
- string OpcodeStr> {
+ string OpcodeStr, X86FoldableSchedWrite sched> {
let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
def SSr : SS4AIi8<opcss, MRMSrcReg,
(outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
!strconcat(OpcodeStr,
"ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, Sched<[WriteFAdd]>;
+ []>, Sched<[sched]>;
let mayLoad = 1 in
def SSm : SS4AIi8<opcss, MRMSrcMem,
(outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
!strconcat(OpcodeStr,
"ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, Sched<[WriteFAddLd, ReadAfterLd]>;
+ []>, Sched<[sched.Folded, ReadAfterLd]>;
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0
let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
@@ -5969,19 +5487,20 @@ let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
(outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
!strconcat(OpcodeStr,
"sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, Sched<[WriteFAdd]>;
+ []>, Sched<[sched]>;
let mayLoad = 1 in
def SDm : SS4AIi8<opcsd, MRMSrcMem,
(outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
!strconcat(OpcodeStr,
"sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, Sched<[WriteFAddLd, ReadAfterLd]>;
+ []>, Sched<[sched.Folded, ReadAfterLd]>;
} // ExeDomain = SSEPackedDouble, hasSideEffects = 0
}
multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
- string OpcodeStr, ValueType VT32, ValueType VT64,
+ string OpcodeStr, X86FoldableSchedWrite sched,
+ ValueType VT32, ValueType VT64,
SDNode OpNode, bit Is2Addr = 1> {
let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in {
def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
@@ -5992,7 +5511,7 @@ let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in {
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>,
- Sched<[WriteFAdd]>;
+ Sched<[sched]>;
def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
@@ -6003,7 +5522,7 @@ let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in {
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(OpNode VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
- Sched<[WriteFAddLd, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>;
} // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1
let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in {
@@ -6015,7 +5534,7 @@ let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in {
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>,
- Sched<[WriteFAdd]>;
+ Sched<[sched]>;
def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
@@ -6026,49 +5545,87 @@ let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in {
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(OpNode VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
- Sched<[WriteFAddLd, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>;
} // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1
}
// FP round - roundss, roundps, roundsd, roundpd
let Predicates = [HasAVX, NoVLX] in {
- // Intrinsic form
- defm VROUND : sse41_fp_unop_p<0x08, 0x09, "vround", f128mem, VR128, v4f32,
- v2f64, loadv4f32, loadv2f64, X86VRndScale>,
- VEX, VEX_WIG;
- defm VROUNDY : sse41_fp_unop_p<0x08, 0x09, "vround", f256mem, VR256, v8f32,
- v4f64, loadv8f32, loadv4f64, X86VRndScale>,
- VEX, VEX_L, VEX_WIG;
+ let ExeDomain = SSEPackedSingle in {
+ // Intrinsic form
+ defm VROUNDPS : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32,
+ loadv4f32, X86VRndScale, SchedWriteFRnd.XMM>,
+ VEX, VEX_WIG;
+ defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32,
+ loadv8f32, X86VRndScale, SchedWriteFRnd.YMM>,
+ VEX, VEX_L, VEX_WIG;
+ }
+
+ let ExeDomain = SSEPackedDouble in {
+ defm VROUNDPD : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64,
+ loadv2f64, X86VRndScale, SchedWriteFRnd.XMM>,
+ VEX, VEX_WIG;
+ defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64,
+ loadv4f64, X86VRndScale, SchedWriteFRnd.YMM>,
+ VEX, VEX_L, VEX_WIG;
+ }
}
let Predicates = [HasAVX, NoAVX512] in {
- defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", v4f32, v2f64,
- X86RndScales, 0>, VEX_4V, VEX_LIG, VEX_WIG;
- defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG, VEX_WIG;
+ defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
+ v4f32, v2f64, X86RndScales, 0>,
+ VEX_4V, VEX_LIG, VEX_WIG;
+ defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
+ VEX_4V, VEX_LIG, VEX_WIG;
}
let Predicates = [UseAVX] in {
def : Pat<(ffloor FR32:$src),
(VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>;
- def : Pat<(f64 (ffloor FR64:$src)),
- (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>;
def : Pat<(f32 (fnearbyint FR32:$src)),
(VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
- def : Pat<(f64 (fnearbyint FR64:$src)),
- (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
def : Pat<(f32 (fceil FR32:$src)),
(VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>;
- def : Pat<(f64 (fceil FR64:$src)),
- (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>;
def : Pat<(f32 (frint FR32:$src)),
(VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
- def : Pat<(f64 (frint FR64:$src)),
- (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
def : Pat<(f32 (ftrunc FR32:$src)),
(VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>;
+
+ def : Pat<(f64 (ffloor FR64:$src)),
+ (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>;
+ def : Pat<(f64 (fnearbyint FR64:$src)),
+ (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
+ def : Pat<(f64 (fceil FR64:$src)),
+ (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>;
+ def : Pat<(f64 (frint FR64:$src)),
+ (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
def : Pat<(f64 (ftrunc FR64:$src)),
(VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>;
}
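// For reference, the rounding-control immediates used in these patterns follow
// the SSE4.1 ROUND* encoding: bits 1:0 select the rounding mode (00 = nearest,
// 01 = toward -inf, 10 = toward +inf, 11 = toward zero), bit 2 selects
// MXCSR.RC instead of the immediate mode, and bit 3 suppresses the precision
// exception. Hence 0x9 -> ffloor, 0xA -> fceil, 0xB -> ftrunc, 0xC ->
// fnearbyint (current mode, no exception), and 0x4 -> frint (current mode,
// exception allowed).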
+let Predicates = [UseAVX, OptForSize] in {
+ def : Pat<(ffloor (loadf32 addr:$src)),
+ (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0x9))>;
+ def : Pat<(f32 (fnearbyint (loadf32 addr:$src))),
+ (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xC))>;
+ def : Pat<(f32 (fceil (loadf32 addr:$src))),
+ (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xA))>;
+ def : Pat<(f32 (frint (loadf32 addr:$src))),
+ (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0x4))>;
+ def : Pat<(f32 (ftrunc (loadf32 addr:$src))),
+ (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xB))>;
+
+ def : Pat<(f64 (ffloor (loadf64 addr:$src))),
+ (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0x9))>;
+ def : Pat<(f64 (fnearbyint (loadf64 addr:$src))),
+ (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xC))>;
+ def : Pat<(f64 (fceil (loadf64 addr:$src))),
+ (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xA))>;
+ def : Pat<(f64 (frint (loadf64 addr:$src))),
+ (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0x4))>;
+ def : Pat<(f64 (ftrunc (loadf64 addr:$src))),
+ (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xB))>;
+}
+
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4f32 (ffloor VR128:$src)),
(VROUNDPSr VR128:$src, (i32 0x9))>;
@@ -6081,6 +5638,17 @@ let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4f32 (ftrunc VR128:$src)),
(VROUNDPSr VR128:$src, (i32 0xB))>;
+ def : Pat<(v4f32 (ffloor (loadv4f32 addr:$src))),
+ (VROUNDPSm addr:$src, (i32 0x9))>;
+ def : Pat<(v4f32 (fnearbyint (loadv4f32 addr:$src))),
+ (VROUNDPSm addr:$src, (i32 0xC))>;
+ def : Pat<(v4f32 (fceil (loadv4f32 addr:$src))),
+ (VROUNDPSm addr:$src, (i32 0xA))>;
+ def : Pat<(v4f32 (frint (loadv4f32 addr:$src))),
+ (VROUNDPSm addr:$src, (i32 0x4))>;
+ def : Pat<(v4f32 (ftrunc (loadv4f32 addr:$src))),
+ (VROUNDPSm addr:$src, (i32 0xB))>;
+
def : Pat<(v2f64 (ffloor VR128:$src)),
(VROUNDPDr VR128:$src, (i32 0x9))>;
def : Pat<(v2f64 (fnearbyint VR128:$src)),
@@ -6092,59 +5660,124 @@ let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v2f64 (ftrunc VR128:$src)),
(VROUNDPDr VR128:$src, (i32 0xB))>;
+ def : Pat<(v2f64 (ffloor (loadv2f64 addr:$src))),
+ (VROUNDPDm addr:$src, (i32 0x9))>;
+ def : Pat<(v2f64 (fnearbyint (loadv2f64 addr:$src))),
+ (VROUNDPDm addr:$src, (i32 0xC))>;
+ def : Pat<(v2f64 (fceil (loadv2f64 addr:$src))),
+ (VROUNDPDm addr:$src, (i32 0xA))>;
+ def : Pat<(v2f64 (frint (loadv2f64 addr:$src))),
+ (VROUNDPDm addr:$src, (i32 0x4))>;
+ def : Pat<(v2f64 (ftrunc (loadv2f64 addr:$src))),
+ (VROUNDPDm addr:$src, (i32 0xB))>;
+
def : Pat<(v8f32 (ffloor VR256:$src)),
- (VROUNDYPSr VR256:$src, (i32 0x9))>;
+ (VROUNDPSYr VR256:$src, (i32 0x9))>;
def : Pat<(v8f32 (fnearbyint VR256:$src)),
- (VROUNDYPSr VR256:$src, (i32 0xC))>;
+ (VROUNDPSYr VR256:$src, (i32 0xC))>;
def : Pat<(v8f32 (fceil VR256:$src)),
- (VROUNDYPSr VR256:$src, (i32 0xA))>;
+ (VROUNDPSYr VR256:$src, (i32 0xA))>;
def : Pat<(v8f32 (frint VR256:$src)),
- (VROUNDYPSr VR256:$src, (i32 0x4))>;
+ (VROUNDPSYr VR256:$src, (i32 0x4))>;
def : Pat<(v8f32 (ftrunc VR256:$src)),
- (VROUNDYPSr VR256:$src, (i32 0xB))>;
+ (VROUNDPSYr VR256:$src, (i32 0xB))>;
+
+ def : Pat<(v8f32 (ffloor (loadv8f32 addr:$src))),
+ (VROUNDPSYm addr:$src, (i32 0x9))>;
+ def : Pat<(v8f32 (fnearbyint (loadv8f32 addr:$src))),
+ (VROUNDPSYm addr:$src, (i32 0xC))>;
+ def : Pat<(v8f32 (fceil (loadv8f32 addr:$src))),
+ (VROUNDPSYm addr:$src, (i32 0xA))>;
+ def : Pat<(v8f32 (frint (loadv8f32 addr:$src))),
+ (VROUNDPSYm addr:$src, (i32 0x4))>;
+ def : Pat<(v8f32 (ftrunc (loadv8f32 addr:$src))),
+ (VROUNDPSYm addr:$src, (i32 0xB))>;
def : Pat<(v4f64 (ffloor VR256:$src)),
- (VROUNDYPDr VR256:$src, (i32 0x9))>;
+ (VROUNDPDYr VR256:$src, (i32 0x9))>;
def : Pat<(v4f64 (fnearbyint VR256:$src)),
- (VROUNDYPDr VR256:$src, (i32 0xC))>;
+ (VROUNDPDYr VR256:$src, (i32 0xC))>;
def : Pat<(v4f64 (fceil VR256:$src)),
- (VROUNDYPDr VR256:$src, (i32 0xA))>;
+ (VROUNDPDYr VR256:$src, (i32 0xA))>;
def : Pat<(v4f64 (frint VR256:$src)),
- (VROUNDYPDr VR256:$src, (i32 0x4))>;
+ (VROUNDPDYr VR256:$src, (i32 0x4))>;
def : Pat<(v4f64 (ftrunc VR256:$src)),
- (VROUNDYPDr VR256:$src, (i32 0xB))>;
+ (VROUNDPDYr VR256:$src, (i32 0xB))>;
+
+ def : Pat<(v4f64 (ffloor (loadv4f64 addr:$src))),
+ (VROUNDPDYm addr:$src, (i32 0x9))>;
+ def : Pat<(v4f64 (fnearbyint (loadv4f64 addr:$src))),
+ (VROUNDPDYm addr:$src, (i32 0xC))>;
+ def : Pat<(v4f64 (fceil (loadv4f64 addr:$src))),
+ (VROUNDPDYm addr:$src, (i32 0xA))>;
+ def : Pat<(v4f64 (frint (loadv4f64 addr:$src))),
+ (VROUNDPDYm addr:$src, (i32 0x4))>;
+ def : Pat<(v4f64 (ftrunc (loadv4f64 addr:$src))),
+ (VROUNDPDYm addr:$src, (i32 0xB))>;
}
-defm ROUND : sse41_fp_unop_p<0x08, 0x09, "round", f128mem, VR128, v4f32, v2f64,
- memopv4f32, memopv2f64, X86VRndScale>;
+let ExeDomain = SSEPackedSingle in
+defm ROUNDPS : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32,
+ memopv4f32, X86VRndScale, SchedWriteFRnd.XMM>;
+let ExeDomain = SSEPackedDouble in
+defm ROUNDPD : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64,
+ memopv2f64, X86VRndScale, SchedWriteFRnd.XMM>;
-defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round">;
+defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>;
let Constraints = "$src1 = $dst" in
-defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", v4f32, v2f64, X86RndScales>;
+defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
+ v4f32, v2f64, X86RndScales>;
let Predicates = [UseSSE41] in {
def : Pat<(ffloor FR32:$src),
(ROUNDSSr FR32:$src, (i32 0x9))>;
- def : Pat<(f64 (ffloor FR64:$src)),
- (ROUNDSDr FR64:$src, (i32 0x9))>;
def : Pat<(f32 (fnearbyint FR32:$src)),
(ROUNDSSr FR32:$src, (i32 0xC))>;
- def : Pat<(f64 (fnearbyint FR64:$src)),
- (ROUNDSDr FR64:$src, (i32 0xC))>;
def : Pat<(f32 (fceil FR32:$src)),
(ROUNDSSr FR32:$src, (i32 0xA))>;
- def : Pat<(f64 (fceil FR64:$src)),
- (ROUNDSDr FR64:$src, (i32 0xA))>;
def : Pat<(f32 (frint FR32:$src)),
(ROUNDSSr FR32:$src, (i32 0x4))>;
- def : Pat<(f64 (frint FR64:$src)),
- (ROUNDSDr FR64:$src, (i32 0x4))>;
def : Pat<(f32 (ftrunc FR32:$src)),
(ROUNDSSr FR32:$src, (i32 0xB))>;
+
+ def : Pat<(f64 (ffloor FR64:$src)),
+ (ROUNDSDr FR64:$src, (i32 0x9))>;
+ def : Pat<(f64 (fnearbyint FR64:$src)),
+ (ROUNDSDr FR64:$src, (i32 0xC))>;
+ def : Pat<(f64 (fceil FR64:$src)),
+ (ROUNDSDr FR64:$src, (i32 0xA))>;
+ def : Pat<(f64 (frint FR64:$src)),
+ (ROUNDSDr FR64:$src, (i32 0x4))>;
def : Pat<(f64 (ftrunc FR64:$src)),
(ROUNDSDr FR64:$src, (i32 0xB))>;
+}
+
+let Predicates = [UseSSE41, OptForSize] in {
+ def : Pat<(ffloor (loadf32 addr:$src)),
+ (ROUNDSSm addr:$src, (i32 0x9))>;
+ def : Pat<(f32 (fnearbyint (loadf32 addr:$src))),
+ (ROUNDSSm addr:$src, (i32 0xC))>;
+ def : Pat<(f32 (fceil (loadf32 addr:$src))),
+ (ROUNDSSm addr:$src, (i32 0xA))>;
+ def : Pat<(f32 (frint (loadf32 addr:$src))),
+ (ROUNDSSm addr:$src, (i32 0x4))>;
+ def : Pat<(f32 (ftrunc (loadf32 addr:$src))),
+ (ROUNDSSm addr:$src, (i32 0xB))>;
+
+ def : Pat<(f64 (ffloor (loadf64 addr:$src))),
+ (ROUNDSDm addr:$src, (i32 0x9))>;
+ def : Pat<(f64 (fnearbyint (loadf64 addr:$src))),
+ (ROUNDSDm addr:$src, (i32 0xC))>;
+ def : Pat<(f64 (fceil (loadf64 addr:$src))),
+ (ROUNDSDm addr:$src, (i32 0xA))>;
+ def : Pat<(f64 (frint (loadf64 addr:$src))),
+ (ROUNDSDm addr:$src, (i32 0x4))>;
+ def : Pat<(f64 (ftrunc (loadf64 addr:$src))),
+ (ROUNDSDm addr:$src, (i32 0xB))>;
+}
+let Predicates = [UseSSE41] in {
def : Pat<(v4f32 (ffloor VR128:$src)),
(ROUNDPSr VR128:$src, (i32 0x9))>;
def : Pat<(v4f32 (fnearbyint VR128:$src)),
@@ -6156,6 +5789,17 @@ let Predicates = [UseSSE41] in {
def : Pat<(v4f32 (ftrunc VR128:$src)),
(ROUNDPSr VR128:$src, (i32 0xB))>;
+ def : Pat<(v4f32 (ffloor (memopv4f32 addr:$src))),
+ (ROUNDPSm addr:$src, (i32 0x9))>;
+ def : Pat<(v4f32 (fnearbyint (memopv4f32 addr:$src))),
+ (ROUNDPSm addr:$src, (i32 0xC))>;
+ def : Pat<(v4f32 (fceil (memopv4f32 addr:$src))),
+ (ROUNDPSm addr:$src, (i32 0xA))>;
+ def : Pat<(v4f32 (frint (memopv4f32 addr:$src))),
+ (ROUNDPSm addr:$src, (i32 0x4))>;
+ def : Pat<(v4f32 (ftrunc (memopv4f32 addr:$src))),
+ (ROUNDPSm addr:$src, (i32 0xB))>;
+
def : Pat<(v2f64 (ffloor VR128:$src)),
(ROUNDPDr VR128:$src, (i32 0x9))>;
def : Pat<(v2f64 (fnearbyint VR128:$src)),
@@ -6166,73 +5810,93 @@ let Predicates = [UseSSE41] in {
(ROUNDPDr VR128:$src, (i32 0x4))>;
def : Pat<(v2f64 (ftrunc VR128:$src)),
(ROUNDPDr VR128:$src, (i32 0xB))>;
-}
+
+ def : Pat<(v2f64 (ffloor (memopv2f64 addr:$src))),
+ (ROUNDPDm addr:$src, (i32 0x9))>;
+ def : Pat<(v2f64 (fnearbyint (memopv2f64 addr:$src))),
+ (ROUNDPDm addr:$src, (i32 0xC))>;
+ def : Pat<(v2f64 (fceil (memopv2f64 addr:$src))),
+ (ROUNDPDm addr:$src, (i32 0xA))>;
+ def : Pat<(v2f64 (frint (memopv2f64 addr:$src))),
+ (ROUNDPDm addr:$src, (i32 0x4))>;
+ def : Pat<(v2f64 (ftrunc (memopv2f64 addr:$src))),
+ (ROUNDPDm addr:$src, (i32 0xB))>;
+}
+
+defm : scalar_unary_math_imm_patterns<ffloor, "ROUNDSS", X86Movss,
+ v4f32, 0x01, UseSSE41>;
+defm : scalar_unary_math_imm_patterns<fceil, "ROUNDSS", X86Movss,
+ v4f32, 0x02, UseSSE41>;
+defm : scalar_unary_math_imm_patterns<ffloor, "ROUNDSD", X86Movsd,
+ v2f64, 0x01, UseSSE41>;
+defm : scalar_unary_math_imm_patterns<fceil, "ROUNDSD", X86Movsd,
+ v2f64, 0x02, UseSSE41>;
//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Bit Test
//===----------------------------------------------------------------------===//
-let Sched = WriteVecLogic in
-def SSE_PTEST : OpndItins<
- IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
->;
-
// We lower to this ptest instruction in X86ISelLowering, primarily from the
// Intel intrinsic that corresponds to it.
let Defs = [EFLAGS], Predicates = [HasAVX] in {
def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
"vptest\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
- Sched<[WriteVecLogic]>, VEX, VEX_WIG;
+ Sched<[SchedWriteVecTest.XMM]>, VEX, VEX_WIG;
def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
"vptest\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
- Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_WIG;
+ Sched<[SchedWriteVecTest.XMM.Folded, ReadAfterLd]>,
+ VEX, VEX_WIG;
def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
"vptest\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
- Sched<[WriteVecLogic]>, VEX, VEX_L, VEX_WIG;
+ Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, VEX_WIG;
def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
"vptest\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
- Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_L, VEX_WIG;
+ Sched<[SchedWriteVecTest.YMM.Folded, ReadAfterLd]>,
+ VEX, VEX_L, VEX_WIG;
}
let Defs = [EFLAGS] in {
def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
"ptest\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
- Sched<[WriteVecLogic]>;
+ Sched<[SchedWriteVecTest.XMM]>;
def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
"ptest\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
- Sched<[WriteVecLogicLd, ReadAfterLd]>;
+ Sched<[SchedWriteVecTest.XMM.Folded, ReadAfterLd]>;
}
// The bit test instructions below are AVX only
multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
- X86MemOperand x86memop, PatFrag mem_frag, ValueType vt> {
+ X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
+ X86FoldableSchedWrite sched> {
def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
- Sched<[WriteVecLogic]>, VEX;
+ Sched<[sched]>, VEX;
def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
- Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX;
+ Sched<[sched.Folded, ReadAfterLd]>, VEX;
}
let Defs = [EFLAGS], Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
-defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32>;
-defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32>,
- VEX_L;
+defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32,
+ SchedWriteFTest.XMM>;
+defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32,
+ SchedWriteFTest.YMM>, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
-defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64>;
-defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64>,
- VEX_L;
+defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64,
+ SchedWriteFTest.XMM>;
+defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64,
+ SchedWriteFTest.YMM>, VEX_L;
}
}
@@ -6243,229 +5907,201 @@ defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64>,
let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"popcnt{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)],
- IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>,
- OpSize16, XS;
+ [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
+ Sched<[WritePOPCNT]>, OpSize16, XS;
def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"popcnt{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, (ctpop (loadi16 addr:$src))),
- (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
- Sched<[WriteFAddLd]>, OpSize16, XS;
+ (implicit EFLAGS)]>,
+ Sched<[WritePOPCNT.Folded]>, OpSize16, XS;
def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"popcnt{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)],
- IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>,
- OpSize32, XS;
+ [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
+ Sched<[WritePOPCNT]>, OpSize32, XS;
def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"popcnt{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (ctpop (loadi32 addr:$src))),
- (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
- Sched<[WriteFAddLd]>, OpSize32, XS;
+ (implicit EFLAGS)]>,
+ Sched<[WritePOPCNT.Folded]>, OpSize32, XS;
def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"popcnt{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)],
- IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>, XS;
+ [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
+ Sched<[WritePOPCNT]>, XS;
def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"popcnt{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (ctpop (loadi64 addr:$src))),
- (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
- Sched<[WriteFAddLd]>, XS;
+ (implicit EFLAGS)]>,
+ Sched<[WritePOPCNT.Folded]>, XS;
}
// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
SDNode OpNode, PatFrag ld_frag,
X86FoldableSchedWrite Sched> {
- def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
- Sched<[Sched]>;
- def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
- (ins i128mem:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst,
- (v8i16 (OpNode (v8i16 (bitconvert (ld_frag addr:$src))))))]>,
- Sched<[Sched.Folded]>;
+ def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
+ Sched<[Sched]>;
+ def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (v8i16 (OpNode (v8i16 (bitconvert (ld_frag addr:$src))))))]>,
+ Sched<[Sched.Folded]>;
}
// PHMIN has the same profile as PSAD, thus we use the same scheduling
// model, although the naming is misleading.
let Predicates = [HasAVX] in
-defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw",
+defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
X86phminpos, loadv2i64,
- WriteVecIMul>, VEX, VEX_WIG;
-defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
+ WritePHMINPOS>, VEX, VEX_WIG;
+defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
X86phminpos, memopv2i64,
- WriteVecIMul>;
+ WritePHMINPOS>;
/// SS48I_binop_rm - Simple SSE41 binary operator.
multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
- X86MemOperand x86memop, bit Is2Addr = 1,
- OpndItins itins = SSE_INTALU_ITINS_P> {
+ X86MemOperand x86memop, X86FoldableSchedWrite sched,
+ bit Is2Addr = 1> {
let isCommutable = 1 in
def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
- Sched<[itins.Sched]>;
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
+ Sched<[sched]>;
def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst,
- (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))],
- itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
-}
-
-/// SS48I_binop_rm2 - Simple SSE41 binary operator with different src and dst
-/// types.
-multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
- ValueType DstVT, ValueType SrcVT, RegisterClass RC,
- PatFrag memop_frag, X86MemOperand x86memop,
- OpndItins itins,
- bit IsCommutable = 0, bit Is2Addr = 1> {
- let isCommutable = IsCommutable in
- def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
- (ins RC:$src1, RC:$src2),
- !if(Is2Addr,
- !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))], itins.rr>,
- Sched<[itins.Sched]>;
- def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, x86memop:$src2),
- !if(Is2Addr,
- !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
- (bitconvert (memop_frag addr:$src2)))))],
- itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
let Predicates = [HasAVX, NoVLX] in {
defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
- loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
VEX_4V, VEX_WIG;
defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
- loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
VEX_4V, VEX_WIG;
defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
- loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
VEX_4V, VEX_WIG;
defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
- loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+ VEX_4V, VEX_WIG;
+ defm VPMULDQ : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
+ loadv2i64, i128mem, SchedWriteVecIMul.XMM, 0>,
VEX_4V, VEX_WIG;
- defm VPMULDQ : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32,
- VR128, loadv2i64, i128mem,
- SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
- loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
VEX_4V, VEX_WIG;
defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
- loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
VEX_4V, VEX_WIG;
defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
- loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
VEX_4V, VEX_WIG;
defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
- loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX] in {
defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
- loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
VEX_4V, VEX_L, VEX_WIG;
defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
- loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
VEX_4V, VEX_L, VEX_WIG;
defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
- loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
VEX_4V, VEX_L, VEX_WIG;
defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
- loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+ defm VPMULDQY : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
+ loadv4i64, i256mem, SchedWriteVecIMul.YMM, 0>,
VEX_4V, VEX_L, VEX_WIG;
- defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32,
- VR256, loadv4i64, i256mem,
- SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
- loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
VEX_4V, VEX_L, VEX_WIG;
defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
- loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
VEX_4V, VEX_L, VEX_WIG;
defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
- loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
VEX_4V, VEX_L, VEX_WIG;
defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
- loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
- memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+ memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
- memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+ memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
- memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+ memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
- memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+ memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
- memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+ memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
- memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+ memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
- memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+ memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
- memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
- defm PMULDQ : SS48I_binop_rm2<0x28, "pmuldq", X86pmuldq, v2i64, v4i32,
- VR128, memopv2i64, i128mem,
- SSE_INTMUL_ITINS_P, 1>;
+ memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
+ defm PMULDQ : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128,
+ memopv2i64, i128mem, SchedWriteVecIMul.XMM, 1>;
}
let Predicates = [HasAVX, NoVLX] in
defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
- loadv2i64, i128mem, 0, SSE_PMULLD_ITINS>,
+ loadv2i64, i128mem, SchedWritePMULLD.XMM, 0>,
VEX_4V, VEX_WIG;
let Predicates = [HasAVX] in
defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
- loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX] in
defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
- loadv4i64, i256mem, 0, SSE_PMULLD_ITINS>,
+ loadv4i64, i256mem, SchedWritePMULLD.YMM, 0>,
VEX_4V, VEX_L, VEX_WIG;
let Predicates = [HasAVX2] in
defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
- loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in {
defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
- memopv2i64, i128mem, 1, SSE_PMULLD_ITINS>;
+ memopv2i64, i128mem, SchedWritePMULLD.XMM, 1>;
defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
- memopv2i64, i128mem, 1, SSE_INTALUQ_ITINS_P>;
+ memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
}
/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
X86MemOperand x86memop, bit Is2Addr,
- OpndItins itins> {
+ X86FoldableSchedWrite sched> {
let isCommutable = 1 in
def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, u8imm:$src3),
@@ -6474,8 +6110,8 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>,
- Sched<[itins.Sched]>;
+ [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>,
+ Sched<[sched]>;
def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u8imm:$src3),
!if(Is2Addr,
@@ -6485,15 +6121,15 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set RC:$dst,
(IntId RC:$src1,
- (bitconvert (memop_frag addr:$src2)), imm:$src3))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (memop_frag addr:$src2)), imm:$src3))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
X86MemOperand x86memop, bit Is2Addr,
- OpndItins itins> {
+ X86FoldableSchedWrite sched> {
let isCommutable = 1 in
def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, u8imm:$src3),
@@ -6502,8 +6138,8 @@ multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))],
- itins.rr>, Sched<[itins.Sched]>;
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
+ Sched<[sched]>;
def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u8imm:$src3),
!if(Is2Addr,
@@ -6513,8 +6149,8 @@ multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set RC:$dst,
(OpVT (OpNode RC:$src1,
- (bitconvert (memop_frag addr:$src2)), imm:$src3)))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
def BlendCommuteImm2 : SDNodeXForm<imm, [{
@@ -6536,53 +6172,53 @@ let Predicates = [HasAVX] in {
let isCommutable = 0 in {
defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
VR128, loadv2i64, i128mem, 0,
- DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_WIG;
+ SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG;
}
let ExeDomain = SSEPackedSingle in
defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
VR128, loadv4f32, f128mem, 0,
- SSE_DPPS_ITINS>, VEX_4V, VEX_WIG;
+ SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG;
let ExeDomain = SSEPackedDouble in
defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
VR128, loadv2f64, f128mem, 0,
- SSE_DPPS_ITINS>, VEX_4V, VEX_WIG;
+ SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG;
let ExeDomain = SSEPackedSingle in
defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
VR256, loadv8f32, i256mem, 0,
- SSE_DPPS_ITINS>, VEX_4V, VEX_L, VEX_WIG;
+ SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2] in {
let isCommutable = 0 in {
defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
VR256, loadv4i64, i256mem, 0,
- DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L, VEX_WIG;
+ SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG;
}
}
let Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
- VR128, memopv2i64, i128mem,
- 1, SSE_MPSADBW_ITINS>;
+ VR128, memopv2i64, i128mem, 1,
+ SchedWriteMPSAD.XMM>;
}
let ExeDomain = SSEPackedSingle in
defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
VR128, memopv4f32, f128mem, 1,
- SSE_DPPS_ITINS>;
+ SchedWriteDPPS.XMM>;
let ExeDomain = SSEPackedDouble in
defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
VR128, memopv2f64, f128mem, 1,
- SSE_DPPD_ITINS>;
+ SchedWriteDPPD.XMM>;
}
/// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate
multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
X86MemOperand x86memop, bit Is2Addr, Domain d,
- OpndItins itins, SDNodeXForm commuteXForm> {
+ X86FoldableSchedWrite sched, SDNodeXForm commuteXForm> {
let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
let isCommutable = 1 in
def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
@@ -6592,8 +6228,8 @@ let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))],
- itins.rr>, Sched<[itins.Sched]>;
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
+ Sched<[sched]>;
def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u8imm:$src3),
!if(Is2Addr,
@@ -6603,8 +6239,8 @@ let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set RC:$dst,
(OpVT (OpNode RC:$src1,
- (bitconvert (memop_frag addr:$src2)), imm:$src3)))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
// Pattern to commute if load is in first source.
@@ -6617,42 +6253,42 @@ let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
let Predicates = [HasAVX] in {
defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
VR128, loadv4f32, f128mem, 0, SSEPackedSingle,
- DEFAULT_ITINS_FBLENDSCHED, BlendCommuteImm4>,
+ SchedWriteFBlend.XMM, BlendCommuteImm4>,
VEX_4V, VEX_WIG;
defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
VR256, loadv8f32, f256mem, 0, SSEPackedSingle,
- DEFAULT_ITINS_FBLENDSCHED, BlendCommuteImm8>,
+ SchedWriteFBlend.YMM, BlendCommuteImm8>,
VEX_4V, VEX_L, VEX_WIG;
defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
VR128, loadv2f64, f128mem, 0, SSEPackedDouble,
- DEFAULT_ITINS_FBLENDSCHED, BlendCommuteImm2>,
+ SchedWriteFBlend.XMM, BlendCommuteImm2>,
VEX_4V, VEX_WIG;
defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
VR256, loadv4f64, f256mem, 0, SSEPackedDouble,
- DEFAULT_ITINS_FBLENDSCHED, BlendCommuteImm4>,
+ SchedWriteFBlend.YMM, BlendCommuteImm4>,
VEX_4V, VEX_L, VEX_WIG;
defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
VR128, loadv2i64, i128mem, 0, SSEPackedInt,
- DEFAULT_ITINS_BLENDSCHED, BlendCommuteImm8>,
+ SchedWriteBlend.XMM, BlendCommuteImm8>,
VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX2] in {
defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
VR256, loadv4i64, i256mem, 0, SSEPackedInt,
- DEFAULT_ITINS_BLENDSCHED, BlendCommuteImm8>,
+ SchedWriteBlend.YMM, BlendCommuteImm8>,
VEX_4V, VEX_L, VEX_WIG;
}
defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
VR128, memopv4f32, f128mem, 1, SSEPackedSingle,
- SSE_INTALU_ITINS_FBLEND_P, BlendCommuteImm4>;
+ SchedWriteFBlend.XMM, BlendCommuteImm4>;
defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
VR128, memopv2f64, f128mem, 1, SSEPackedDouble,
- SSE_INTALU_ITINS_FBLEND_P, BlendCommuteImm2>;
+ SchedWriteFBlend.XMM, BlendCommuteImm2>;
defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
VR128, memopv2i64, i128mem, 1, SSEPackedInt,
- SSE_INTALU_ITINS_BLEND_P, BlendCommuteImm8>;
+ SchedWriteBlend.XMM, BlendCommuteImm8>;
// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
@@ -6671,14 +6307,14 @@ def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
RegisterClass RC, X86MemOperand x86memop,
PatFrag mem_frag, Intrinsic IntId,
- X86FoldableSchedWrite Sched> {
+ X86FoldableSchedWrite sched> {
def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
- NoItinerary, SSEPackedInt>, TAPD, VEX_4V,
- Sched<[Sched]>;
+ SSEPackedInt>, TAPD, VEX_4V,
+ Sched<[sched]>;
def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, RC:$src3),
@@ -6686,37 +6322,41 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst,
(IntId RC:$src1, (bitconvert (mem_frag addr:$src2)),
- RC:$src3))],
- NoItinerary, SSEPackedInt>, TAPD, VEX_4V,
- Sched<[Sched.Folded, ReadAfterLd]>;
+ RC:$src3))], SSEPackedInt>, TAPD, VEX_4V,
+ Sched<[sched.Folded, ReadAfterLd,
+ // x86memop:$src2
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault,
+ // RC::$src3
+ ReadAfterLd]>;
}
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedDouble in {
defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
loadv2f64, int_x86_sse41_blendvpd,
- WriteFVarBlend>;
+ SchedWriteFVarBlend.XMM>;
defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
loadv4f64, int_x86_avx_blendv_pd_256,
- WriteFVarBlend>, VEX_L;
+ SchedWriteFVarBlend.YMM>, VEX_L;
} // ExeDomain = SSEPackedDouble
let ExeDomain = SSEPackedSingle in {
defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
loadv4f32, int_x86_sse41_blendvps,
- WriteFVarBlend>;
+ SchedWriteFVarBlend.XMM>;
defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
loadv8f32, int_x86_avx_blendv_ps_256,
- WriteFVarBlend>, VEX_L;
+ SchedWriteFVarBlend.YMM>, VEX_L;
} // ExeDomain = SSEPackedSingle
defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
loadv2i64, int_x86_sse41_pblendvb,
- WriteVarBlend>;
+ SchedWriteVarBlend.XMM>;
}
let Predicates = [HasAVX2] in {
defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
loadv4i64, int_x86_avx2_pblendvb,
- WriteVarBlend>, VEX_L;
+ SchedWriteVarBlend.YMM>, VEX_L;
}
let Predicates = [HasAVX] in {
@@ -6755,48 +6395,76 @@ let Predicates = [HasAVX2] in {
(VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
}
-// Patterns
-// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or
-// on targets where they have equal performance. These were changed to use
-// blends because blends have better throughput on SandyBridge and Haswell, but
-// movs[s/d] are 1-2 byte shorter instructions.
-let Predicates = [UseAVX] in {
- let AddedComplexity = 15 in {
+// Prefer a movss or movsd over a blendps when optimizing for size. These were
+// changed to use blends because blends have better throughput on SandyBridge
+// and Haswell, but movs[s/d] are 1-2 byte shorter instructions.
+let Predicates = [HasAVX, OptForSpeed] in {
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
(VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
(VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
- def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
- (VMOVSDrr (v2f64 (V_SET0)), (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+ def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
+ (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
+ def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
+ (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
+ def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
+ (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
+
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
+ (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
+ (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
+ def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
+ (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
// Move low f32 and clear high bits.
def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
- (VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>;
+ (SUBREG_TO_REG (i32 0),
+ (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
+ (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
+ (i8 1))), sub_xmm)>;
+ def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
+ (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
+ (i8 3))), sub_xmm)>;
- // Move low f64 and clear high bits.
def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
- (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>;
- }
-
- // These will incur an FP/int domain crossing penalty, but it may be the only
- // way without AVX2. Do not add any complexity because we may be able to match
- // more optimal patterns defined earlier in this file.
- def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
- (VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>;
+ (SUBREG_TO_REG (i32 0),
+ (v2f64 (VBLENDPDrri (v2f64 (V_SET0)),
+ (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)),
+ (i8 1))), sub_xmm)>;
def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
- (VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>;
+ (SUBREG_TO_REG (i32 0),
+ (v2i64 (VPBLENDWrri (v2i64 (V_SET0)),
+ (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)),
+ (i8 0xf))), sub_xmm)>;
}
-// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or
-// on targets where they have equal performance. These were changed to use
-// blends because blends have better throughput on SandyBridge and Haswell, but
-// movs[s/d] are 1-2 byte shorter instructions.
-let Predicates = [UseSSE41], AddedComplexity = 15 in {
+// Prefer a movss or movsd over a blendps when optimizing for size. These were
+// changed to use blends because blends have better throughput on SandyBridge
+// and Haswell, but movs[s/d] are 1-2 byte shorter instructions.
+let Predicates = [UseSSE41, OptForSpeed] in {
// With SSE41 we can use blends for these patterns.
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
(BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
(PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
+
+ def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
+ (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
+ def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
+ (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
+ def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
+ (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
+
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
+ (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
+ (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
+ def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
+ (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
}
@@ -6804,13 +6472,13 @@ let Predicates = [UseSSE41], AddedComplexity = 15 in {
let Uses = [XMM0], Constraints = "$src1 = $dst" in {
multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
X86MemOperand x86memop, Intrinsic IntId,
- OpndItins itins> {
+ X86FoldableSchedWrite sched> {
def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr,
"\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
- [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))],
- itins.rr>, Sched<[itins.Sched]>;
+ [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>,
+ Sched<[sched]>;
def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, x86memop:$src2),
@@ -6818,22 +6486,19 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in {
"\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
[(set VR128:$dst,
(IntId VR128:$src1,
- (bitconvert (mem_frag addr:$src2)), XMM0))],
- itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (mem_frag addr:$src2)), XMM0))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
let ExeDomain = SSEPackedDouble in
defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem,
- int_x86_sse41_blendvpd,
- DEFAULT_ITINS_FBLENDSCHED>;
+ int_x86_sse41_blendvpd, SchedWriteFVarBlend.XMM>;
let ExeDomain = SSEPackedSingle in
defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem,
- int_x86_sse41_blendvps,
- DEFAULT_ITINS_FBLENDSCHED>;
+ int_x86_sse41_blendvps, SchedWriteFVarBlend.XMM>;
defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
- int_x86_sse41_pblendvb,
- DEFAULT_ITINS_VARBLENDSCHED>;
+ int_x86_sse41_pblendvb, SchedWriteVarBlend.XMM>;
// Aliases with the implicit xmm0 argument
def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
@@ -6868,18 +6533,18 @@ let Predicates = [UseSSE41] in {
}
let AddedComplexity = 400 in { // Prefer non-temporal versions
-let SchedRW = [WriteLoad] in {
+
let Predicates = [HasAVX, NoVLX] in
def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
- "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
- VEX, VEX_WIG;
+ "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, VEX_WIG;
let Predicates = [HasAVX2, NoVLX] in
def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
"vmovntdqa\t{$src, $dst|$dst, $src}", []>,
- VEX, VEX_L, VEX_WIG;
+ Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, VEX_WIG;
def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
- "movntdqa\t{$src, $dst|$dst, $src}", []>;
-} // SchedRW
+ "movntdqa\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLSNT.XMM.RM]>;
let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(v8f32 (alignednontemporalload addr:$src)),
@@ -6917,62 +6582,43 @@ let Predicates = [UseSSE41] in {
/// SS42I_binop_rm - Simple SSE 4.2 binary operator
multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
- X86MemOperand x86memop, OpndItins itins,
+ X86MemOperand x86memop, X86FoldableSchedWrite sched,
bit Is2Addr = 1> {
def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
- Sched<[itins.Sched]>;
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
+ Sched<[sched]>;
def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst,
- (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
let Predicates = [HasAVX] in
defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
- loadv2i64, i128mem, SSE_INTALU_ITINS_P, 0>,
+ loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
VEX_4V, VEX_WIG;
let Predicates = [HasAVX2] in
defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
- loadv4i64, i256mem, SSE_INTALU_ITINS_P, 0>,
+ loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
- memopv2i64, i128mem, SSE_INTALU_ITINS_P>;
+ memopv2i64, i128mem, SchedWriteVecALU.XMM>;
//===----------------------------------------------------------------------===//
// SSE4.2 - String/text Processing Instructions
//===----------------------------------------------------------------------===//
-// Packed Compare Implicit Length Strings, Return Mask
-multiclass pseudo_pcmpistrm<string asm, PatFrag ld_frag> {
- def REG : PseudoI<(outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, u8imm:$src3),
- [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2,
- imm:$src3))]>;
- def MEM : PseudoI<(outs VR128:$dst),
- (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
- [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1,
- (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
-}
-
-let Defs = [EFLAGS], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
- defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128", loadv2i64>,
- Requires<[HasAVX]>, VEX_WIG;
- defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128", memopv2i64>,
- Requires<[UseSSE42]>;
-}
-
multiclass pcmpistrm_SS42AI<string asm> {
def rr : SS42AI<0x62, MRMSrcReg, (outs),
(ins VR128:$src1, VR128:$src2, u8imm:$src3),
@@ -6982,32 +6628,13 @@ multiclass pcmpistrm_SS42AI<string asm> {
def rm :SS42AI<0x62, MRMSrcMem, (outs),
(ins VR128:$src1, i128mem:$src2, u8imm:$src3),
!strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
- []>, Sched<[WritePCmpIStrMLd, ReadAfterLd]>;
+ []>, Sched<[WritePCmpIStrM.Folded, ReadAfterLd]>;
}
let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
let Predicates = [HasAVX] in
- defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
- defm PCMPISTRM128 : pcmpistrm_SS42AI<"pcmpistrm"> ;
-}
-
-// Packed Compare Explicit Length Strings, Return Mask
-multiclass pseudo_pcmpestrm<string asm, PatFrag ld_frag> {
- def REG : PseudoI<(outs VR128:$dst),
- (ins VR128:$src1, VR128:$src3, u8imm:$src5),
- [(set VR128:$dst, (int_x86_sse42_pcmpestrm128
- VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
- def MEM : PseudoI<(outs VR128:$dst),
- (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
- [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX,
- (bc_v16i8 (ld_frag addr:$src3)), EDX, imm:$src5))]>;
-}
-
-let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
- defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128", loadv2i64>,
- Requires<[HasAVX]>;
- defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128", memopv2i64>,
- Requires<[UseSSE42]>;
+ defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
+ defm PCMPISTRM : pcmpistrm_SS42AI<"pcmpistrm"> ;
}
multiclass SS42AI_pcmpestrm<string asm> {
@@ -7019,32 +6646,13 @@ multiclass SS42AI_pcmpestrm<string asm> {
def rm : SS42AI<0x60, MRMSrcMem, (outs),
(ins VR128:$src1, i128mem:$src3, u8imm:$src5),
!strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
- []>, Sched<[WritePCmpEStrMLd, ReadAfterLd]>;
+ []>, Sched<[WritePCmpEStrM.Folded, ReadAfterLd]>;
}
let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
let Predicates = [HasAVX] in
- defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
- defm PCMPESTRM128 : SS42AI_pcmpestrm<"pcmpestrm">;
-}
-
-// Packed Compare Implicit Length Strings, Return Index
-multiclass pseudo_pcmpistri<string asm, PatFrag ld_frag> {
- def REG : PseudoI<(outs GR32:$dst),
- (ins VR128:$src1, VR128:$src2, u8imm:$src3),
- [(set GR32:$dst, EFLAGS,
- (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>;
- def MEM : PseudoI<(outs GR32:$dst),
- (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
- [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1,
- (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
-}
-
-let Defs = [EFLAGS], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
- defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI", loadv2i64>,
- Requires<[HasAVX]>, VEX_WIG;
- defm PCMPISTRI : pseudo_pcmpistri<"#PCMPISTRI", memopv2i64>,
- Requires<[UseSSE42]>;
+ defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
+ defm PCMPESTRM : SS42AI_pcmpestrm<"pcmpestrm">;
}
multiclass SS42AI_pcmpistri<string asm> {
@@ -7056,7 +6664,7 @@ multiclass SS42AI_pcmpistri<string asm> {
def rm : SS42AI<0x63, MRMSrcMem, (outs),
(ins VR128:$src1, i128mem:$src2, u8imm:$src3),
!strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
- []>, Sched<[WritePCmpIStrILd, ReadAfterLd]>;
+ []>, Sched<[WritePCmpIStrI.Folded, ReadAfterLd]>;
}
let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
@@ -7065,26 +6673,6 @@ let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
}
-// Packed Compare Explicit Length Strings, Return Index
-multiclass pseudo_pcmpestri<string asm, PatFrag ld_frag> {
- def REG : PseudoI<(outs GR32:$dst),
- (ins VR128:$src1, VR128:$src3, u8imm:$src5),
- [(set GR32:$dst, EFLAGS,
- (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
- def MEM : PseudoI<(outs GR32:$dst),
- (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
- [(set GR32:$dst, EFLAGS,
- (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (ld_frag addr:$src3)), EDX,
- imm:$src5))]>;
-}
-
-let Defs = [EFLAGS], Uses = [EAX, EDX], hasNoSchedulingInfo = 1, usesCustomInserter = 1 in {
- defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI", loadv2i64>,
- Requires<[HasAVX]>;
- defm PCMPESTRI : pseudo_pcmpestri<"#PCMPESTRI", memopv2i64>,
- Requires<[UseSSE42]>;
-}
-
multiclass SS42AI_pcmpestri<string asm> {
def rr : SS42AI<0x61, MRMSrcReg, (outs),
(ins VR128:$src1, VR128:$src3, u8imm:$src5),
@@ -7094,7 +6682,7 @@ multiclass SS42AI_pcmpestri<string asm> {
def rm : SS42AI<0x61, MRMSrcMem, (outs),
(ins VR128:$src1, i128mem:$src3, u8imm:$src5),
!strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
- []>, Sched<[WritePCmpEStrILd, ReadAfterLd]>;
+ []>, Sched<[WritePCmpEStrI.Folded, ReadAfterLd]>;
}
let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
@@ -7116,15 +6704,15 @@ class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
RegisterClass RCIn, SDPatternOperator Int> :
SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
!strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
- [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))], IIC_CRC32_REG>,
- Sched<[WriteFAdd]>;
+ [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>,
+ Sched<[WriteCRC32]>;
class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
X86MemOperand x86memop, SDPatternOperator Int> :
SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
!strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
- [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))],
- IIC_CRC32_MEM>, Sched<[WriteFAddLd, ReadAfterLd]>;
+ [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>,
+ Sched<[WriteCRC32.Folded, ReadAfterLd]>;
let Constraints = "$src1 = $dst" in {
def CRC32r32m8 : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
@@ -7156,9 +6744,9 @@ let Constraints = "$src1 = $dst" in {
// SHA-NI Instructions
//===----------------------------------------------------------------------===//
-// FIXME: Is there a better scheduler itinerary for SHA than WriteVecIMul?
+// FIXME: Is there a better scheduler class for SHA than WriteVecIMul?
multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
- OpndItins itins, bit UsesXMM0 = 0> {
+ X86FoldableSchedWrite sched, bit UsesXMM0 = 0> {
def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!if(UsesXMM0,
@@ -7166,8 +6754,8 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
[!if(UsesXMM0,
(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
- (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))], itins.rr>,
- T8, Sched<[itins.Sched]>;
+ (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>,
+ T8, Sched<[sched]>;
def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
@@ -7178,8 +6766,8 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
(set VR128:$dst, (IntId VR128:$src1,
(bc_v4i32 (memopv2i64 addr:$src2)), XMM0)),
(set VR128:$dst, (IntId VR128:$src1,
- (bc_v4i32 (memopv2i64 addr:$src2)))))], itins.rm>, T8,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
@@ -7188,32 +6776,32 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
"sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR128:$dst,
(int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
- (i8 imm:$src3)))], IIC_SSE_INTMUL_P_RR>, TA,
- Sched<[WriteVecIMul]>;
+ (i8 imm:$src3)))]>, TA,
+ Sched<[SchedWriteVecIMul.XMM]>;
def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, u8imm:$src3),
"sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR128:$dst,
(int_x86_sha1rnds4 VR128:$src1,
(bc_v4i32 (memopv2i64 addr:$src2)),
- (i8 imm:$src3)))], IIC_SSE_INTMUL_P_RM>, TA,
- Sched<[WriteVecIMulLd, ReadAfterLd]>;
+ (i8 imm:$src3)))]>, TA,
+ Sched<[SchedWriteVecIMul.XMM.Folded, ReadAfterLd]>;
defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte,
- SSE_INTMUL_ITINS_P>;
+ SchedWriteVecIMul.XMM>;
defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1,
- SSE_INTMUL_ITINS_P>;
+ SchedWriteVecIMul.XMM>;
defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2,
- SSE_INTMUL_ITINS_P>;
+ SchedWriteVecIMul.XMM>;
let Uses=[XMM0] in
defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2,
- SSE_INTMUL_ITINS_P, 1>;
+ SchedWriteVecIMul.XMM, 1>;
defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1,
- SSE_INTMUL_ITINS_P>;
+ SchedWriteVecIMul.XMM>;
defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2,
- SSE_INTMUL_ITINS_P>;
+ SchedWriteVecIMul.XMM>;
}
// Aliases with explicit %xmm0
@@ -7240,7 +6828,7 @@ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, MemOp:$src2), "",
[(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>,
- Sched<[WriteAESDecEncLd, ReadAfterLd]>;
+ Sched<[WriteAESDecEnc.Folded, ReadAfterLd]>;
}
}
@@ -7294,7 +6882,7 @@ let Predicates = [HasAVX, HasAES] in {
(ins i128mem:$src1),
"vaesimc\t{$src1, $dst|$dst, $src1}",
[(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>,
- Sched<[WriteAESIMCLd]>, VEX, VEX_WIG;
+ Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG;
}
def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1),
@@ -7305,7 +6893,7 @@ def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1),
"aesimc\t{$src1, $dst|$dst, $src1}",
[(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
- Sched<[WriteAESIMCLd]>;
+ Sched<[WriteAESIMC.Folded]>;
// AES Round Key Generation Assist
let Predicates = [HasAVX, HasAES] in {
@@ -7320,7 +6908,7 @@ let Predicates = [HasAVX, HasAES] in {
"vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>,
- Sched<[WriteAESKeyGenLd]>, VEX, VEX_WIG;
+ Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG;
}
def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, u8imm:$src2),
@@ -7333,7 +6921,7 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
"aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
- Sched<[WriteAESKeyGenLd]>;
+ Sched<[WriteAESKeyGen.Folded]>;
//===----------------------------------------------------------------------===//
// PCLMUL Instructions
@@ -7353,16 +6941,16 @@ let Predicates = [NoAVX, HasPCLMUL] in {
(ins VR128:$src1, VR128:$src2, u8imm:$src3),
"pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR128:$dst,
- (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))],
- IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMul]>;
+ (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>,
+ Sched<[WriteCLMul]>;
def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, u8imm:$src3),
"pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR128:$dst,
(int_x86_pclmulqdq VR128:$src1, (memopv2i64 addr:$src2),
- imm:$src3))],
- IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMulLd, ReadAfterLd]>;
+ imm:$src3))]>,
+ Sched<[WriteCLMul.Folded, ReadAfterLd]>;
} // Constraints = "$src1 = $dst"
def : Pat<(int_x86_pclmulqdq (memopv2i64 addr:$src2), VR128:$src1,
@@ -7398,7 +6986,7 @@ multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
"vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set RC:$dst,
(IntId RC:$src1, (LdFrag addr:$src2), imm:$src3))]>,
- Sched<[WriteCLMulLd, ReadAfterLd]>;
+ Sched<[WriteCLMul.Folded, ReadAfterLd]>;
// We can commute a load in the first operand by swapping the sources and
// rotating the immediate.
@@ -7449,45 +7037,45 @@ def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
(ins VR128:$src, u8imm:$len, u8imm:$idx),
"extrq\t{$idx, $len, $src|$src, $len, $idx}",
[(set VR128:$dst, (X86extrqi VR128:$src, imm:$len,
- imm:$idx))], IIC_SSE_INTALU_P_RR>,
- PD, Sched<[WriteVecALU]>;
+ imm:$idx))]>,
+ PD, Sched<[SchedWriteVecALU.XMM]>;
def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src, VR128:$mask),
"extrq\t{$mask, $src|$src, $mask}",
[(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
- VR128:$mask))], IIC_SSE_INTALU_P_RR>,
- PD, Sched<[WriteVecALU]>;
+ VR128:$mask))]>,
+ PD, Sched<[SchedWriteVecALU.XMM]>;
def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
"insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
[(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
- imm:$len, imm:$idx))], IIC_SSE_INTALU_P_RR>,
- XD, Sched<[WriteVecALU]>;
+ imm:$len, imm:$idx))]>,
+ XD, Sched<[SchedWriteVecALU.XMM]>;
def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src, VR128:$mask),
"insertq\t{$mask, $src|$src, $mask}",
[(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
- VR128:$mask))], IIC_SSE_INTALU_P_RR>,
- XD, Sched<[WriteVecALU]>;
+ VR128:$mask))]>,
+ XD, Sched<[SchedWriteVecALU.XMM]>;
}
} // ExeDomain = SSEPackedInt
// Non-temporal (unaligned) scalar stores.
let AddedComplexity = 400 in { // Prefer non-temporal versions
-let hasSideEffects = 0, mayStore = 1, SchedRW = [WriteStore] in {
+let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in {
def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
- "movntss\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVNT>, XS;
+ "movntss\t{$src, $dst|$dst, $src}", []>, XS;
def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
- "movntsd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVNT>, XD;
+ "movntsd\t{$src, $dst|$dst, $src}", []>, XD;
} // SchedRW
def : Pat<(nontemporalstore FR32:$src, addr:$dst),
- (MOVNTSS addr:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ (MOVNTSS addr:$dst, (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
def : Pat<(nontemporalstore FR64:$src, addr:$dst),
- (MOVNTSD addr:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ (MOVNTSD addr:$dst, (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
} // AddedComplexity
} // HasSSE4A
@@ -7518,18 +7106,20 @@ class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
- f32mem, v4f32, loadf32, WriteLoad>;
+ f32mem, v4f32, loadf32,
+ SchedWriteFShuffle.XMM.Folded>;
def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
- f32mem, v8f32, loadf32,
- WriteFShuffleLd>, VEX_L;
+ f32mem, v8f32, loadf32,
+ SchedWriteFShuffle.XMM.Folded>, VEX_L;
}
let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in
def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
- v4f64, loadf64, WriteFShuffleLd>, VEX_L;
+ v4f64, loadf64,
+ SchedWriteFShuffle.XMM.Folded>, VEX_L;
let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
- v4f32, v4f32, WriteFShuffle>;
+ v4f32, v4f32, SchedWriteFShuffle.XMM>;
def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
v8f32, v4f32, WriteFShuffle256>, VEX_L;
}
@@ -7554,13 +7144,14 @@ let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
(ins i128mem:$src),
"vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
- Sched<[WriteLoad]>, VEX, VEX_L;
+ Sched<[WriteShuffleLd]>, VEX, VEX_L;
-let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX] in
+let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX],
+ ExeDomain = SSEPackedSingle in
def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
(ins f128mem:$src),
"vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
- Sched<[WriteFShuffleLd]>, VEX, VEX_L;
+ Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;
let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
@@ -7598,12 +7189,12 @@ let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR128:$src2, u8imm:$src3),
"vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>, Sched<[WriteFShuffle]>, VEX_4V, VEX_L;
+ []>, Sched<[WriteFShuffle256]>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f128mem:$src2, u8imm:$src3),
"vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L;
+ []>, Sched<[WriteFShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
}
// To create a 256-bit all ones value, we should produce VCMPTRUEPS
@@ -7645,12 +7236,12 @@ let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
(ins VR256:$src1, u8imm:$src2),
"vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, Sched<[WriteFShuffle]>, VEX, VEX_L;
+ []>, Sched<[WriteFShuffle256]>, VEX, VEX_L;
let mayStore = 1 in
def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
(ins f128mem:$dst, VR256:$src1, u8imm:$src2),
"vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, Sched<[WriteStore]>, VEX, VEX_L;
+ []>, Sched<[WriteFStoreX]>, VEX, VEX_L;
}
multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
@@ -7686,23 +7277,23 @@ multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, f128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))],
- IIC_SSE_MASKMOV>, VEX_4V, Sched<[WriteLoad]>;
+ [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
+ VEX_4V, Sched<[WriteFMaskedLoad]>;
def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))],
- IIC_SSE_MASKMOV>, VEX_4V, VEX_L, Sched<[WriteLoad]>;
+ [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
+ VEX_4V, VEX_L, Sched<[WriteFMaskedLoadY]>;
def mr : AVX8I<opc_mr, MRMDestMem, (outs),
(ins f128mem:$dst, VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(IntSt addr:$dst, VR128:$src1, VR128:$src2)], IIC_SSE_MASKMOV>,
- VEX_4V, Sched<[WriteStore]>;
+ [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
+ VEX_4V, Sched<[WriteFMaskedStore]>;
def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
(ins f256mem:$dst, VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)], IIC_SSE_MASKMOV>,
- VEX_4V, VEX_L, Sched<[WriteStore]>;
+ [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
+ VEX_4V, VEX_L, Sched<[WriteFMaskedStoreY]>;
}
let ExeDomain = SSEPackedSingle in
@@ -7722,63 +7313,60 @@ defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
// VPERMIL - Permute Single and Double Floating-Point Values
//
-let Sched = WriteFShuffle in
-def AVX_VPERMILV : OpndItins<
- IIC_SSE_SHUFP, IIC_SSE_SHUFP
->;
-
-let Sched = WriteFShuffle in
-def AVX_VPERMIL : OpndItins<
- IIC_SSE_SHUFP, IIC_SSE_SHUFP
->;
-
multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
RegisterClass RC, X86MemOperand x86memop_f,
X86MemOperand x86memop_i, PatFrag i_frag,
- ValueType f_vt, ValueType i_vt> {
+ ValueType f_vt, ValueType i_vt,
+ X86FoldableSchedWrite sched,
+ X86FoldableSchedWrite varsched> {
let Predicates = [HasAVX, NoVLX] in {
def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
- Sched<[WriteFShuffle]>;
+ Sched<[varsched]>;
def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop_i:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
(i_vt (bitconvert (i_frag addr:$src2))))))]>, VEX_4V,
- Sched<[WriteFShuffleLd, ReadAfterLd]>;
+ Sched<[varsched.Folded, ReadAfterLd]>;
def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX,
- Sched<[WriteFShuffle]>;
+ Sched<[sched]>;
def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
(ins x86memop_f:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
(f_vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX,
- Sched<[WriteFShuffleLd]>;
+ Sched<[sched.Folded]>;
}// Predicates = [HasAVX, NoVLX]
}
let ExeDomain = SSEPackedSingle in {
defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
- loadv2i64, v4f32, v4i32>;
+ loadv2i64, v4f32, v4i32, SchedWriteFShuffle.XMM,
+ SchedWriteFVarShuffle.XMM>;
defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
- loadv4i64, v8f32, v8i32>, VEX_L;
+ loadv4i64, v8f32, v8i32, SchedWriteFShuffle.YMM,
+ SchedWriteFVarShuffle.YMM>, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
- loadv2i64, v2f64, v2i64>;
+ loadv2i64, v2f64, v2i64, SchedWriteFShuffle.XMM,
+ SchedWriteFVarShuffle.XMM>;
defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
- loadv4i64, v4f64, v4i64>, VEX_L;
+ loadv4i64, v4f64, v4i64, SchedWriteFShuffle.YMM,
+ SchedWriteFVarShuffle.YMM>, VEX_L;
}
//===----------------------------------------------------------------------===//
// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
//
+
let ExeDomain = SSEPackedSingle in {
let isCommutable = 1 in
def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
@@ -7786,13 +7374,13 @@ def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
"vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR256:$dst, (v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
(i8 imm:$src3))))]>, VEX_4V, VEX_L,
- Sched<[WriteFShuffle]>;
+ Sched<[WriteFShuffle256]>;
def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, u8imm:$src3),
"vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4f64 addr:$src2),
(i8 imm:$src3)))]>, VEX_4V, VEX_L,
- Sched<[WriteFShuffleLd, ReadAfterLd]>;
+ Sched<[WriteFShuffle256Ld, ReadAfterLd]>;
}
// Immediate transform to help with commuting.
@@ -7821,58 +7409,63 @@ def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
//===----------------------------------------------------------------------===//
// VZERO - Zero YMM registers
+// Note: These instructions do not affect YMM16-YMM31.
//
-// Note, these instruction do not affect the YMM16-YMM31.
+
let SchedRW = [WriteSystem] in {
let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
// Zero All YMM registers
def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
- [(int_x86_avx_vzeroall)], IIC_AVX_ZERO>, PS, VEX, VEX_L,
+ [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L,
Requires<[HasAVX]>, VEX_WIG;
// Zero Upper bits of YMM registers
def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
- [(int_x86_avx_vzeroupper)], IIC_AVX_ZERO>, PS, VEX,
+ [(int_x86_avx_vzeroupper)]>, PS, VEX,
Requires<[HasAVX]>, VEX_WIG;
} // Defs
} // SchedRW
//===----------------------------------------------------------------------===//
// Half precision conversion instructions
-//===----------------------------------------------------------------------===//
-multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop> {
+//
+
+multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
+ X86FoldableSchedWrite sched> {
def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
"vcvtph2ps\t{$src, $dst|$dst, $src}",
[(set RC:$dst, (X86cvtph2ps VR128:$src))]>,
- T8PD, VEX, Sched<[WriteCvtF2F]>;
+ T8PD, VEX, Sched<[sched]>;
let hasSideEffects = 0, mayLoad = 1 in
def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
"vcvtph2ps\t{$src, $dst|$dst, $src}",
[(set RC:$dst, (X86cvtph2ps (bc_v8i16
(loadv2i64 addr:$src))))]>,
- T8PD, VEX, Sched<[WriteCvtF2FLd]>;
+ T8PD, VEX, Sched<[sched.Folded]>;
}
-multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop> {
+multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
+ SchedWrite RR, SchedWrite MR> {
def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
(ins RC:$src1, i32u8imm:$src2),
"vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst, (X86cvtps2ph RC:$src1, imm:$src2))]>,
- TAPD, VEX, Sched<[WriteCvtF2F]>;
- let hasSideEffects = 0, mayStore = 1,
- SchedRW = [WriteCvtF2FLd, WriteRMW] in
+ TAPD, VEX, Sched<[RR]>;
+ let hasSideEffects = 0, mayStore = 1 in
def mr : Ii8<0x1D, MRMDestMem, (outs),
(ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
"vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- TAPD, VEX;
+ TAPD, VEX, Sched<[MR]>;
}
let Predicates = [HasF16C, NoVLX] in {
- defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem>;
- defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem>, VEX_L;
- defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem>;
- defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem>, VEX_L;
+ defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>;
+ defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L;
+ defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH,
+ WriteCvtPS2PHSt>;
+ defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY,
+ WriteCvtPS2PHYSt>, VEX_L;
// Pattern match vcvtph2ps of a scalar i64 load.
def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))),
@@ -7903,16 +7496,16 @@ let Predicates = [HasF16C, NoVLX] in {
// more consistent with other instructions, which are always controlled by it.
// It's encoded as 0b100.
def : Pat<(fp_to_f16 FR32:$src),
- (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (VCVTPS2PHrr
- (COPY_TO_REGCLASS FR32:$src, VR128), 4)), sub_16bit))>;
+ (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (v8i16 (VCVTPS2PHrr
+ (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4))), sub_16bit))>;
def : Pat<(f16_to_fp GR16:$src),
- (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
- (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)), FR32)) >;
+ (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr
+ (v4i32 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)))), FR32)) >;
def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))),
- (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
- (VCVTPS2PHrr (COPY_TO_REGCLASS FR32:$src, VR128), 4)), FR32)) >;
+ (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr
+ (v8i16 (VCVTPS2PHrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4)))), FR32)) >;
}
//===----------------------------------------------------------------------===//
@@ -7921,7 +7514,8 @@ let Predicates = [HasF16C, NoVLX] in {
/// AVX2_blend_rmi - AVX2 blend with 8-bit immediate
multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
- ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ ValueType OpVT, X86FoldableSchedWrite sched,
+ RegisterClass RC, PatFrag memop_frag,
X86MemOperand x86memop, SDNodeXForm commuteXForm> {
let isCommutable = 1 in
def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
@@ -7929,7 +7523,7 @@ multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
- Sched<[WriteBlend]>, VEX_4V;
+ Sched<[sched]>, VEX_4V;
def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u8imm:$src3),
!strconcat(OpcodeStr,
@@ -7937,7 +7531,7 @@ multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set RC:$dst,
(OpVT (OpNode RC:$src1,
(bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
- Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V;
+ Sched<[sched.Folded, ReadAfterLd]>, VEX_4V;
// Pattern to commute if load is in first source.
def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)),
@@ -7947,10 +7541,11 @@ multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
- VR128, loadv2i64, i128mem, BlendCommuteImm4>;
+ SchedWriteBlend.XMM, VR128, loadv2i64, i128mem,
+ BlendCommuteImm4>;
defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
- VR256, loadv4i64, i256mem, BlendCommuteImm8>,
- VEX_L;
+ SchedWriteBlend.YMM, VR256, loadv4i64, i256mem,
+ BlendCommuteImm8>, VEX_L;
// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
@@ -8004,12 +7599,12 @@ multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst,
(OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
- Sched<[WriteShuffle]>, VEX;
+ Sched<[SchedWriteShuffle.XMM]>, VEX;
def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst,
(OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>,
- Sched<[WriteLoad]>, VEX;
+ Sched<[SchedWriteShuffle.XMM.Folded]>, VEX;
def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst,
@@ -8019,7 +7614,7 @@ multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst,
(OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>,
- Sched<[WriteLoad]>, VEX, VEX_L;
+ Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L;
// Provide aliases for broadcast from the same register class that
// automatically does the extract.
@@ -8084,45 +7679,45 @@ let Predicates = [HasAVX2, NoVLX] in {
// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
- (VBROADCASTSSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
- (VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
- (VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
- (VPBROADCASTBrr (COPY_TO_REGCLASS
+ (VPBROADCASTBrr (v16i8 (COPY_TO_REGCLASS
(i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
GR8:$src, sub_8bit)),
- VR128))>;
+ VR128)))>;
def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
- (VPBROADCASTBYrr (COPY_TO_REGCLASS
+ (VPBROADCASTBYrr (v16i8 (COPY_TO_REGCLASS
(i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
GR8:$src, sub_8bit)),
- VR128))>;
+ VR128)))>;
def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
- (VPBROADCASTWrr (COPY_TO_REGCLASS
+ (VPBROADCASTWrr (v8i16 (COPY_TO_REGCLASS
(i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
GR16:$src, sub_16bit)),
- VR128))>;
+ VR128)))>;
def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
- (VPBROADCASTWYrr (COPY_TO_REGCLASS
+ (VPBROADCASTWYrr (v8i16 (COPY_TO_REGCLASS
(i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
GR16:$src, sub_16bit)),
- VR128))>;
+ VR128)))>;
}
let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
- (VPBROADCASTDrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
+ (VPBROADCASTDrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>;
def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
- (VPBROADCASTDYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
+ (VPBROADCASTDYrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>;
def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
- (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+ (VPBROADCASTQrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>;
def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
- (VPBROADCASTQYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+ (VPBROADCASTQYrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>;
}
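// The explicit value-type wrappers added above (e.g. (v4i32 (COPY_TO_REGCLASS
// GR32:$src, VR128))) pin down the result type of each nested node in the
// output pattern. VR128 covers several value types, so without the cast
// TableGen's type inference has to guess which one the inner instruction
// produces; a hedged, schematic example of the typed form (hypothetical
// EXAMPLE_INST, not a real X86 instruction):
//   def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
//             (EXAMPLE_INST (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>;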
// AVX1 broadcast patterns
@@ -8140,7 +7735,7 @@ def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
let Predicates = [HasAVX, NoVLX] in {
// 128bit broadcasts:
def : Pat<(v2f64 (X86VBroadcast f64:$src)),
- (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
(VMOVDDUPrm addr:$src)>;
@@ -8152,29 +7747,29 @@ let Predicates = [HasAVX, NoVLX] in {
let Predicates = [HasAVX1Only] in {
def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
- (VPERMILPSri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>;
+ (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>;
def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
(VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
- (VPERMILPSri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm),
- (VPERMILPSri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>;
+ (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm),
+ (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>;
def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
(VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
- (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_xmm),
- (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128)), 1)>;
+ (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm),
+ (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>;
def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
- (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0)>;
+ (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)>;
def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
(VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
- (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), sub_xmm),
- (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), 1)>;
+ (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), sub_xmm),
+ (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), 1)>;
def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
(VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
- (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm),
- (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>;
+ (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), sub_xmm),
+ (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), 1)>;
def : Pat<(v2i64 (X86VBroadcast i64:$src)),
- (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44)>;
+ (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)>;
def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
(VMOVDDUPrm addr:$src)>;
}
@@ -8183,16 +7778,6 @@ let Predicates = [HasAVX1Only] in {
// VPERM - Permute instructions
//
-let Sched = WriteFShuffle256 in
-def AVX2_PERMV_F : OpndItins<
- IIC_SSE_SHUFP, IIC_SSE_SHUFP
->;
-
-let Sched = WriteShuffle256 in
-def AVX2_PERMV_I : OpndItins<
- IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI
->;
-
multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
ValueType OpVT, X86FoldableSchedWrite Sched,
X86MemOperand memOp> {
@@ -8215,10 +7800,10 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
}
}
-defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteShuffle256,
+defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteVarShuffle256,
i256mem>;
let ExeDomain = SSEPackedSingle in
-defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFShuffle256,
+defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFVarShuffle256,
f256mem>;
multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
@@ -8305,7 +7890,7 @@ let hasSideEffects = 0, mayStore = 1 in
def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
(ins i128mem:$dst, VR256:$src1, u8imm:$src2),
"vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- Sched<[WriteStore]>, VEX, VEX_L;
+ Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L;
let Predicates = [HasAVX2, NoVLX] in {
defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>;
@@ -8323,23 +7908,23 @@ multiclass avx2_pmovmask<string OpcodeStr,
def rm : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))],
- IIC_SSE_MASKMOV>, VEX_4V, Sched<[WriteLoad]>;
+ [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
+ VEX_4V, Sched<[WriteVecMaskedLoad]>;
def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))],
- IIC_SSE_MASKMOV>, VEX_4V, VEX_L, Sched<[WriteLoad]>;
+ [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
+ VEX_4V, VEX_L, Sched<[WriteVecMaskedLoadY]>;
def mr : AVX28I<0x8e, MRMDestMem, (outs),
(ins i128mem:$dst, VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)], IIC_SSE_MASKMOV>,
- VEX_4V, Sched<[WriteStore]>;
+ [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
+ VEX_4V, Sched<[WriteVecMaskedStore]>;
def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
(ins i256mem:$dst, VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)], IIC_SSE_MASKMOV>,
- VEX_4V, VEX_L, Sched<[WriteStore]>;
+ [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
+ VEX_4V, VEX_L, Sched<[WriteVecMaskedStoreY]>;
}
defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
@@ -8367,7 +7952,7 @@ multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), (VT RC:$src0))),
(!cast<Instruction>(BlendStr#"rr")
RC:$src0,
- (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr),
+ (VT (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)),
RC:$mask)>;
}
let Predicates = [HasAVX] in {
@@ -8444,27 +8029,27 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
- VEX_4V, Sched<[WriteVarVecShift]>;
+ VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>;
def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode VR128:$src1,
(vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
- VEX_4V, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
+ VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded, ReadAfterLd]>;
def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
- VEX_4V, VEX_L, Sched<[WriteVarVecShift]>;
+ VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;
def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(vt256 (OpNode VR256:$src1,
(vt256 (bitconvert (loadv4i64 addr:$src2))))))]>,
- VEX_4V, VEX_L, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
+ VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded, ReadAfterLd]>;
}
let Predicates = [HasAVX2, NoVLX] in {
@@ -8547,60 +8132,49 @@ let Predicates = [UseAVX2] in {
}
//===----------------------------------------------------------------------===//
-// Extra selection patterns for FR128, f128, f128mem
+// Extra selection patterns for f128, f128mem
// movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2.
-def : Pat<(store (f128 FR128:$src), addr:$dst),
- (MOVAPSmr addr:$dst, (COPY_TO_REGCLASS (f128 FR128:$src), VR128))>;
+def : Pat<(alignedstore (f128 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, (COPY_TO_REGCLASS (f128 VR128:$src), VR128))>;
+def : Pat<(store (f128 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, (COPY_TO_REGCLASS (f128 VR128:$src), VR128))>;
+def : Pat<(alignedloadf128 addr:$src),
+ (COPY_TO_REGCLASS (MOVAPSrm addr:$src), VR128)>;
def : Pat<(loadf128 addr:$src),
- (COPY_TO_REGCLASS (MOVAPSrm addr:$src), FR128)>;
+ (COPY_TO_REGCLASS (MOVUPSrm addr:$src), VR128)>;
// andps is shorter than andpd or pand. andps is SSE and andpd/pand are in SSE2
-def : Pat<(X86fand FR128:$src1, (loadf128 addr:$src2)),
- (COPY_TO_REGCLASS
- (ANDPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
- FR128)>;
-
-def : Pat<(X86fand FR128:$src1, FR128:$src2),
- (COPY_TO_REGCLASS
- (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
- (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
-
-def : Pat<(and FR128:$src1, FR128:$src2),
- (COPY_TO_REGCLASS
- (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
- (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
-
-def : Pat<(X86for FR128:$src1, (loadf128 addr:$src2)),
+def : Pat<(f128 (X86fand VR128:$src1, (memopf128 addr:$src2))),
(COPY_TO_REGCLASS
- (ORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
- FR128)>;
+ (ANDPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2),
+ VR128)>;
-def : Pat<(X86for FR128:$src1, FR128:$src2),
+def : Pat<(f128 (X86fand VR128:$src1, VR128:$src2)),
(COPY_TO_REGCLASS
- (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
- (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+ (ANDPSrr (COPY_TO_REGCLASS VR128:$src1, VR128),
+ (COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>;
-def : Pat<(or FR128:$src1, FR128:$src2),
+def : Pat<(f128 (X86for VR128:$src1, (memopf128 addr:$src2))),
(COPY_TO_REGCLASS
- (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
- (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+ (ORPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2),
+ VR128)>;
-def : Pat<(X86fxor FR128:$src1, (loadf128 addr:$src2)),
+def : Pat<(f128 (X86for VR128:$src1, VR128:$src2)),
(COPY_TO_REGCLASS
- (XORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
- FR128)>;
+ (ORPSrr (COPY_TO_REGCLASS VR128:$src1, VR128),
+ (COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>;
-def : Pat<(X86fxor FR128:$src1, FR128:$src2),
+def : Pat<(f128 (X86fxor VR128:$src1, (memopf128 addr:$src2))),
(COPY_TO_REGCLASS
- (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
- (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+ (XORPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2),
+ VR128)>;
-def : Pat<(xor FR128:$src1, FR128:$src2),
+def : Pat<(f128 (X86fxor VR128:$src1, VR128:$src2)),
(COPY_TO_REGCLASS
- (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
- (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+ (XORPSrr (COPY_TO_REGCLASS VR128:$src1, VR128),
+ (COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>;
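// The f128 patterns above select straight into VR128 and now distinguish
// aligned accesses (MOVAPS) from unaligned ones (MOVUPS). A rough sketch of
// how an alignment-guarded load fragment of this kind can be written -- the
// name is illustrative, not the fragment actually used above:
def example_alignedloadf128 : PatFrag<(ops node:$ptr), (loadf128 node:$ptr), [{
  // Only match f128 loads that are at least 16-byte aligned.
  return cast<LoadSDNode>(N)->getAlignment() >= 16;
}]>;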
//===----------------------------------------------------------------------===//
// GFNI instructions
@@ -8615,15 +8189,13 @@ multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
OpcodeStr##"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
let isCommutable = 1 in
def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
- [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))],
- SSE_INTALU_ITINS_P.rr>,
- Sched<[SSE_INTALU_ITINS_P.Sched]>, T8PD;
+ [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
+ Sched<[SchedWriteVecALU.XMM]>, T8PD;
def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
[(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
- (bitconvert (MemOpFrag addr:$src2)))))],
- SSE_INTALU_ITINS_P.rm>,
- Sched<[SSE_INTALU_ITINS_P.Sched.Folded, ReadAfterLd]>, T8PD;
+ (bitconvert (MemOpFrag addr:$src2)))))]>,
+ Sched<[SchedWriteVecALU.XMM.Folded, ReadAfterLd]>, T8PD;
}
}
@@ -8636,15 +8208,13 @@ multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, u8imm:$src3), "",
[(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))],
- SSE_INTALU_ITINS_P.rr, SSEPackedInt>,
- Sched<[WriteVecALU]>;
+ SSEPackedInt>, Sched<[SchedWriteVecALU.XMM]>;
def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
[(set RC:$dst, (OpVT (OpNode RC:$src1,
(bitconvert (MemOpFrag addr:$src2)),
- imm:$src3)))],
- SSE_INTALU_ITINS_P.rm, SSEPackedInt>,
- Sched<[WriteVecALU.Folded, ReadAfterLd]>;
+ imm:$src3)))], SSEPackedInt>,
+ Sched<[SchedWriteVecALU.XMM.Folded, ReadAfterLd]>;
}
}
diff --git a/lib/Target/X86/X86InstrSVM.td b/lib/Target/X86/X86InstrSVM.td
index bdf478600279..2dc6e8b43667 100644
--- a/lib/Target/X86/X86InstrSVM.td
+++ b/lib/Target/X86/X86InstrSVM.td
@@ -17,47 +17,47 @@
let SchedRW = [WriteSystem] in {
// 0F 01 D9
-def VMMCALL : I<0x01, MRM_D9, (outs), (ins), "vmmcall", [], IIC_SVM>, TB;
+def VMMCALL : I<0x01, MRM_D9, (outs), (ins), "vmmcall", []>, TB;
// 0F 01 DC
-def STGI : I<0x01, MRM_DC, (outs), (ins), "stgi", [], IIC_STGI>, TB;
+def STGI : I<0x01, MRM_DC, (outs), (ins), "stgi", []>, TB;
// 0F 01 DD
-def CLGI : I<0x01, MRM_DD, (outs), (ins), "clgi", [], IIC_CLGI>, TB;
+def CLGI : I<0x01, MRM_DD, (outs), (ins), "clgi", []>, TB;
// 0F 01 DE
let Uses = [EAX] in
-def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|eax}", [], IIC_SKINIT>, TB;
+def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|eax}", []>, TB;
// 0F 01 D8
let Uses = [EAX] in
-def VMRUN32 : I<0x01, MRM_D8, (outs), (ins),
- "vmrun\t{%eax|eax}", [], IIC_SVM>, TB, Requires<[Not64BitMode]>;
+def VMRUN32 : I<0x01, MRM_D8, (outs), (ins), "vmrun\t{%eax|eax}", []>, TB,
+ Requires<[Not64BitMode]>;
let Uses = [RAX] in
-def VMRUN64 : I<0x01, MRM_D8, (outs), (ins),
- "vmrun\t{%rax|rax}", [], IIC_SVM>, TB, Requires<[In64BitMode]>;
+def VMRUN64 : I<0x01, MRM_D8, (outs), (ins), "vmrun\t{%rax|rax}", []>, TB,
+ Requires<[In64BitMode]>;
// 0F 01 DA
let Uses = [EAX] in
-def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins),
- "vmload\t{%eax|eax}", [], IIC_SVM>, TB, Requires<[Not64BitMode]>;
+def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins), "vmload\t{%eax|eax}", []>, TB,
+ Requires<[Not64BitMode]>;
let Uses = [RAX] in
-def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins),
- "vmload\t{%rax|rax}", [], IIC_SVM>, TB, Requires<[In64BitMode]>;
+def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins), "vmload\t{%rax|rax}", []>, TB,
+ Requires<[In64BitMode]>;
// 0F 01 DB
let Uses = [EAX] in
-def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins),
- "vmsave\t{%eax|eax}", [], IIC_SVM>, TB, Requires<[Not64BitMode]>;
+def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins), "vmsave\t{%eax|eax}", []>, TB,
+ Requires<[Not64BitMode]>;
let Uses = [RAX] in
-def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins),
- "vmsave\t{%rax|rax}", [], IIC_SVM>, TB, Requires<[In64BitMode]>;
+def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins), "vmsave\t{%rax|rax}", []>, TB,
+ Requires<[In64BitMode]>;
// 0F 01 DF
let Uses = [EAX, ECX] in
def INVLPGA32 : I<0x01, MRM_DF, (outs), (ins),
- "invlpga\t{%ecx, %eax|eax, ecx}", [], IIC_INVLPG>, TB, Requires<[Not64BitMode]>;
+ "invlpga\t{%eax, %ecx|eax, ecx}", []>, TB, Requires<[Not64BitMode]>;
let Uses = [RAX, ECX] in
def INVLPGA64 : I<0x01, MRM_DF, (outs), (ins),
- "invlpga\t{%ecx, %rax|rax, ecx}", [], IIC_INVLPG>, TB, Requires<[In64BitMode]>;
+ "invlpga\t{%rax, %ecx|rax, ecx}", []>, TB, Requires<[In64BitMode]>;
} // SchedRW
diff --git a/lib/Target/X86/X86InstrShiftRotate.td b/lib/Target/X86/X86InstrShiftRotate.td
index 43e1752f2df2..ee3b01159174 100644
--- a/lib/Target/X86/X86InstrShiftRotate.td
+++ b/lib/Target/X86/X86InstrShiftRotate.td
@@ -19,49 +19,48 @@ let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
let Uses = [CL] in {
def SHL8rCL : I<0xD2, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1),
"shl{b}\t{%cl, $dst|$dst, cl}",
- [(set GR8:$dst, (shl GR8:$src1, CL))], IIC_SR>;
+ [(set GR8:$dst, (shl GR8:$src1, CL))]>;
def SHL16rCL : I<0xD3, MRM4r, (outs GR16:$dst), (ins GR16:$src1),
"shl{w}\t{%cl, $dst|$dst, cl}",
- [(set GR16:$dst, (shl GR16:$src1, CL))], IIC_SR>, OpSize16;
+ [(set GR16:$dst, (shl GR16:$src1, CL))]>, OpSize16;
def SHL32rCL : I<0xD3, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
"shl{l}\t{%cl, $dst|$dst, cl}",
- [(set GR32:$dst, (shl GR32:$src1, CL))], IIC_SR>, OpSize32;
+ [(set GR32:$dst, (shl GR32:$src1, CL))]>, OpSize32;
def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
"shl{q}\t{%cl, $dst|$dst, cl}",
- [(set GR64:$dst, (shl GR64:$src1, CL))], IIC_SR>;
+ [(set GR64:$dst, (shl GR64:$src1, CL))]>;
} // Uses = [CL]
def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
"shl{b}\t{$src2, $dst|$dst, $src2}",
- [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
+ [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))]>;
let isConvertibleToThreeAddress = 1 in { // Can transform into LEA.
def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
"shl{w}\t{$src2, $dst|$dst, $src2}",
- [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))], IIC_SR>,
+ [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))]>,
OpSize16;
def SHL32ri : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
"shl{l}\t{$src2, $dst|$dst, $src2}",
- [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))], IIC_SR>,
+ [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))]>,
OpSize32;
def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst),
(ins GR64:$src1, u8imm:$src2),
"shl{q}\t{$src2, $dst|$dst, $src2}",
- [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))],
- IIC_SR>;
+ [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))]>;
} // isConvertibleToThreeAddress = 1
// NOTE: We don't include patterns for shifts of a register by one, because
// 'add reg,reg' is cheaper (and we have a Pat pattern for shift-by-one).
let hasSideEffects = 0 in {
def SHL8r1 : I<0xD0, MRM4r, (outs GR8:$dst), (ins GR8:$src1),
- "shl{b}\t$dst", [], IIC_SR>;
+ "shl{b}\t$dst", []>;
def SHL16r1 : I<0xD1, MRM4r, (outs GR16:$dst), (ins GR16:$src1),
- "shl{w}\t$dst", [], IIC_SR>, OpSize16;
+ "shl{w}\t$dst", []>, OpSize16;
def SHL32r1 : I<0xD1, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
- "shl{l}\t$dst", [], IIC_SR>, OpSize32;
+ "shl{l}\t$dst", []>, OpSize32;
def SHL64r1 : RI<0xD1, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
- "shl{q}\t$dst", [], IIC_SR>;
+ "shl{q}\t$dst", []>;
} // hasSideEffects = 0
} // Constraints = "$src = $dst", SchedRW
@@ -72,100 +71,98 @@ let SchedRW = [WriteShiftLd, WriteRMW] in {
let Uses = [CL] in {
def SHL8mCL : I<0xD2, MRM4m, (outs), (ins i8mem :$dst),
"shl{b}\t{%cl, $dst|$dst, cl}",
- [(store (shl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>;
+ [(store (shl (loadi8 addr:$dst), CL), addr:$dst)]>;
def SHL16mCL : I<0xD3, MRM4m, (outs), (ins i16mem:$dst),
"shl{w}\t{%cl, $dst|$dst, cl}",
- [(store (shl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>,
+ [(store (shl (loadi16 addr:$dst), CL), addr:$dst)]>,
OpSize16;
def SHL32mCL : I<0xD3, MRM4m, (outs), (ins i32mem:$dst),
"shl{l}\t{%cl, $dst|$dst, cl}",
- [(store (shl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>,
+ [(store (shl (loadi32 addr:$dst), CL), addr:$dst)]>,
OpSize32;
def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst),
"shl{q}\t{%cl, $dst|$dst, cl}",
- [(store (shl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>,
+ [(store (shl (loadi64 addr:$dst), CL), addr:$dst)]>,
Requires<[In64BitMode]>;
}
def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, u8imm:$src),
"shl{b}\t{$src, $dst|$dst, $src}",
- [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>;
+ [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
def SHL16mi : Ii8<0xC1, MRM4m, (outs), (ins i16mem:$dst, u8imm:$src),
"shl{w}\t{$src, $dst|$dst, $src}",
- [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>, OpSize16;
+ [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize16;
def SHL32mi : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, u8imm:$src),
"shl{l}\t{$src, $dst|$dst, $src}",
- [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>, OpSize32;
+ [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize32;
def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, u8imm:$src),
"shl{q}\t{$src, $dst|$dst, $src}",
- [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>, Requires<[In64BitMode]>;
+ [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ Requires<[In64BitMode]>;
// Shift by 1
def SHL8m1 : I<0xD0, MRM4m, (outs), (ins i8mem :$dst),
"shl{b}\t$dst",
- [(store (shl (loadi8 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>;
+ [(store (shl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
def SHL16m1 : I<0xD1, MRM4m, (outs), (ins i16mem:$dst),
"shl{w}\t$dst",
- [(store (shl (loadi16 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>, OpSize16;
+ [(store (shl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize16;
def SHL32m1 : I<0xD1, MRM4m, (outs), (ins i32mem:$dst),
"shl{l}\t$dst",
- [(store (shl (loadi32 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>, OpSize32;
+ [(store (shl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize32;
def SHL64m1 : RI<0xD1, MRM4m, (outs), (ins i64mem:$dst),
"shl{q}\t$dst",
- [(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>, Requires<[In64BitMode]>;
+ [(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>,
+ Requires<[In64BitMode]>;
} // SchedRW
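// The memory forms above keep their read-modify-write shape in the scheduler:
// [WriteShiftLd, WriteRMW] covers the shift of the loaded value plus the store
// back to the same address. A subtarget model may refine these defaults; an
// illustrative override (write name and regex are assumptions, not taken from
// an existing model):
//   def : InstRW<[ExampleShiftLdWrite, WriteRMW], (instregex "SHL(8|16|32|64)m")>;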
let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
let Uses = [CL] in {
def SHR8rCL : I<0xD2, MRM5r, (outs GR8 :$dst), (ins GR8 :$src1),
"shr{b}\t{%cl, $dst|$dst, cl}",
- [(set GR8:$dst, (srl GR8:$src1, CL))], IIC_SR>;
+ [(set GR8:$dst, (srl GR8:$src1, CL))]>;
def SHR16rCL : I<0xD3, MRM5r, (outs GR16:$dst), (ins GR16:$src1),
"shr{w}\t{%cl, $dst|$dst, cl}",
- [(set GR16:$dst, (srl GR16:$src1, CL))], IIC_SR>, OpSize16;
+ [(set GR16:$dst, (srl GR16:$src1, CL))]>, OpSize16;
def SHR32rCL : I<0xD3, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
"shr{l}\t{%cl, $dst|$dst, cl}",
- [(set GR32:$dst, (srl GR32:$src1, CL))], IIC_SR>, OpSize32;
+ [(set GR32:$dst, (srl GR32:$src1, CL))]>, OpSize32;
def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
"shr{q}\t{%cl, $dst|$dst, cl}",
- [(set GR64:$dst, (srl GR64:$src1, CL))], IIC_SR>;
+ [(set GR64:$dst, (srl GR64:$src1, CL))]>;
}
def SHR8ri : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$src2),
"shr{b}\t{$src2, $dst|$dst, $src2}",
- [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
+ [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))]>;
def SHR16ri : Ii8<0xC1, MRM5r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
"shr{w}\t{$src2, $dst|$dst, $src2}",
- [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))],
- IIC_SR>, OpSize16;
+ [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))]>,
+ OpSize16;
def SHR32ri : Ii8<0xC1, MRM5r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
"shr{l}\t{$src2, $dst|$dst, $src2}",
- [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))],
- IIC_SR>, OpSize32;
+ [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))]>,
+ OpSize32;
def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$src2),
"shr{q}\t{$src2, $dst|$dst, $src2}",
- [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))], IIC_SR>;
+ [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))]>;
// Shift right by 1
def SHR8r1 : I<0xD0, MRM5r, (outs GR8:$dst), (ins GR8:$src1),
"shr{b}\t$dst",
- [(set GR8:$dst, (srl GR8:$src1, (i8 1)))], IIC_SR>;
+ [(set GR8:$dst, (srl GR8:$src1, (i8 1)))]>;
def SHR16r1 : I<0xD1, MRM5r, (outs GR16:$dst), (ins GR16:$src1),
"shr{w}\t$dst",
- [(set GR16:$dst, (srl GR16:$src1, (i8 1)))], IIC_SR>, OpSize16;
+ [(set GR16:$dst, (srl GR16:$src1, (i8 1)))]>, OpSize16;
def SHR32r1 : I<0xD1, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
"shr{l}\t$dst",
- [(set GR32:$dst, (srl GR32:$src1, (i8 1)))], IIC_SR>, OpSize32;
+ [(set GR32:$dst, (srl GR32:$src1, (i8 1)))]>, OpSize32;
def SHR64r1 : RI<0xD1, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
"shr{q}\t$dst",
- [(set GR64:$dst, (srl GR64:$src1, (i8 1)))], IIC_SR>;
+ [(set GR64:$dst, (srl GR64:$src1, (i8 1)))]>;
} // Constraints = "$src = $dst", SchedRW
@@ -173,111 +170,101 @@ let SchedRW = [WriteShiftLd, WriteRMW] in {
let Uses = [CL] in {
def SHR8mCL : I<0xD2, MRM5m, (outs), (ins i8mem :$dst),
"shr{b}\t{%cl, $dst|$dst, cl}",
- [(store (srl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>;
+ [(store (srl (loadi8 addr:$dst), CL), addr:$dst)]>;
def SHR16mCL : I<0xD3, MRM5m, (outs), (ins i16mem:$dst),
"shr{w}\t{%cl, $dst|$dst, cl}",
- [(store (srl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>,
+ [(store (srl (loadi16 addr:$dst), CL), addr:$dst)]>,
OpSize16;
def SHR32mCL : I<0xD3, MRM5m, (outs), (ins i32mem:$dst),
"shr{l}\t{%cl, $dst|$dst, cl}",
- [(store (srl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>,
+ [(store (srl (loadi32 addr:$dst), CL), addr:$dst)]>,
OpSize32;
def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst),
"shr{q}\t{%cl, $dst|$dst, cl}",
- [(store (srl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>,
+ [(store (srl (loadi64 addr:$dst), CL), addr:$dst)]>,
Requires<[In64BitMode]>;
}
def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, u8imm:$src),
"shr{b}\t{$src, $dst|$dst, $src}",
- [(store (srl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>;
+ [(store (srl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
def SHR16mi : Ii8<0xC1, MRM5m, (outs), (ins i16mem:$dst, u8imm:$src),
"shr{w}\t{$src, $dst|$dst, $src}",
- [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>, OpSize16;
+ [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize16;
def SHR32mi : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, u8imm:$src),
"shr{l}\t{$src, $dst|$dst, $src}",
- [(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>, OpSize32;
+ [(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize32;
def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, u8imm:$src),
"shr{q}\t{$src, $dst|$dst, $src}",
- [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>, Requires<[In64BitMode]>;
+ [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ Requires<[In64BitMode]>;
// Shift by 1
def SHR8m1 : I<0xD0, MRM5m, (outs), (ins i8mem :$dst),
"shr{b}\t$dst",
- [(store (srl (loadi8 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>;
+ [(store (srl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
def SHR16m1 : I<0xD1, MRM5m, (outs), (ins i16mem:$dst),
"shr{w}\t$dst",
- [(store (srl (loadi16 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>, OpSize16;
+ [(store (srl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize16;
def SHR32m1 : I<0xD1, MRM5m, (outs), (ins i32mem:$dst),
"shr{l}\t$dst",
- [(store (srl (loadi32 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>, OpSize32;
+ [(store (srl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize32;
def SHR64m1 : RI<0xD1, MRM5m, (outs), (ins i64mem:$dst),
"shr{q}\t$dst",
- [(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>, Requires<[In64BitMode]>;
+ [(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>,
+ Requires<[In64BitMode]>;
} // SchedRW
let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
let Uses = [CL] in {
def SAR8rCL : I<0xD2, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1),
"sar{b}\t{%cl, $dst|$dst, cl}",
- [(set GR8:$dst, (sra GR8:$src1, CL))],
- IIC_SR>;
+ [(set GR8:$dst, (sra GR8:$src1, CL))]>;
def SAR16rCL : I<0xD3, MRM7r, (outs GR16:$dst), (ins GR16:$src1),
"sar{w}\t{%cl, $dst|$dst, cl}",
- [(set GR16:$dst, (sra GR16:$src1, CL))],
- IIC_SR>, OpSize16;
+ [(set GR16:$dst, (sra GR16:$src1, CL))]>,
+ OpSize16;
def SAR32rCL : I<0xD3, MRM7r, (outs GR32:$dst), (ins GR32:$src1),
"sar{l}\t{%cl, $dst|$dst, cl}",
- [(set GR32:$dst, (sra GR32:$src1, CL))],
- IIC_SR>, OpSize32;
+ [(set GR32:$dst, (sra GR32:$src1, CL))]>,
+ OpSize32;
def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
"sar{q}\t{%cl, $dst|$dst, cl}",
- [(set GR64:$dst, (sra GR64:$src1, CL))],
- IIC_SR>;
+ [(set GR64:$dst, (sra GR64:$src1, CL))]>;
}
def SAR8ri : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
"sar{b}\t{$src2, $dst|$dst, $src2}",
- [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))],
- IIC_SR>;
+ [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))]>;
def SAR16ri : Ii8<0xC1, MRM7r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
"sar{w}\t{$src2, $dst|$dst, $src2}",
- [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))],
- IIC_SR>, OpSize16;
+ [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))]>,
+ OpSize16;
def SAR32ri : Ii8<0xC1, MRM7r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
"sar{l}\t{$src2, $dst|$dst, $src2}",
- [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))],
- IIC_SR>, OpSize32;
+ [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))]>,
+ OpSize32;
def SAR64ri : RIi8<0xC1, MRM7r, (outs GR64:$dst),
(ins GR64:$src1, u8imm:$src2),
"sar{q}\t{$src2, $dst|$dst, $src2}",
- [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))],
- IIC_SR>;
+ [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))]>;
// Shift by 1
def SAR8r1 : I<0xD0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1),
"sar{b}\t$dst",
- [(set GR8:$dst, (sra GR8:$src1, (i8 1)))],
- IIC_SR>;
+ [(set GR8:$dst, (sra GR8:$src1, (i8 1)))]>;
def SAR16r1 : I<0xD1, MRM7r, (outs GR16:$dst), (ins GR16:$src1),
"sar{w}\t$dst",
- [(set GR16:$dst, (sra GR16:$src1, (i8 1)))],
- IIC_SR>, OpSize16;
+ [(set GR16:$dst, (sra GR16:$src1, (i8 1)))]>, OpSize16;
def SAR32r1 : I<0xD1, MRM7r, (outs GR32:$dst), (ins GR32:$src1),
"sar{l}\t$dst",
- [(set GR32:$dst, (sra GR32:$src1, (i8 1)))],
- IIC_SR>, OpSize32;
+ [(set GR32:$dst, (sra GR32:$src1, (i8 1)))]>, OpSize32;
def SAR64r1 : RI<0xD1, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
- "sar{q}\t$dst",
- [(set GR64:$dst, (sra GR64:$src1, (i8 1)))],
- IIC_SR>;
+ "sar{q}\t$dst",
+ [(set GR64:$dst, (sra GR64:$src1, (i8 1)))]>;
} // Constraints = "$src = $dst", SchedRW
@@ -285,55 +272,52 @@ let SchedRW = [WriteShiftLd, WriteRMW] in {
let Uses = [CL] in {
def SAR8mCL : I<0xD2, MRM7m, (outs), (ins i8mem :$dst),
"sar{b}\t{%cl, $dst|$dst, cl}",
- [(store (sra (loadi8 addr:$dst), CL), addr:$dst)],
- IIC_SR>;
+ [(store (sra (loadi8 addr:$dst), CL), addr:$dst)]>;
def SAR16mCL : I<0xD3, MRM7m, (outs), (ins i16mem:$dst),
"sar{w}\t{%cl, $dst|$dst, cl}",
- [(store (sra (loadi16 addr:$dst), CL), addr:$dst)],
- IIC_SR>, OpSize16;
+ [(store (sra (loadi16 addr:$dst), CL), addr:$dst)]>,
+ OpSize16;
def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst),
"sar{l}\t{%cl, $dst|$dst, cl}",
- [(store (sra (loadi32 addr:$dst), CL), addr:$dst)],
- IIC_SR>, OpSize32;
+ [(store (sra (loadi32 addr:$dst), CL), addr:$dst)]>,
+ OpSize32;
def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst),
"sar{q}\t{%cl, $dst|$dst, cl}",
- [(store (sra (loadi64 addr:$dst), CL), addr:$dst)],
- IIC_SR>, Requires<[In64BitMode]>;
+ [(store (sra (loadi64 addr:$dst), CL), addr:$dst)]>,
+ Requires<[In64BitMode]>;
}
def SAR8mi : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, u8imm:$src),
"sar{b}\t{$src, $dst|$dst, $src}",
- [(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>;
+ [(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
def SAR16mi : Ii8<0xC1, MRM7m, (outs), (ins i16mem:$dst, u8imm:$src),
"sar{w}\t{$src, $dst|$dst, $src}",
- [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>, OpSize16;
+ [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize16;
def SAR32mi : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, u8imm:$src),
"sar{l}\t{$src, $dst|$dst, $src}",
- [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>, OpSize32;
+ [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize32;
def SAR64mi : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, u8imm:$src),
"sar{q}\t{$src, $dst|$dst, $src}",
- [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>, Requires<[In64BitMode]>;
+ [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ Requires<[In64BitMode]>;
// Shift by 1
def SAR8m1 : I<0xD0, MRM7m, (outs), (ins i8mem :$dst),
"sar{b}\t$dst",
- [(store (sra (loadi8 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>;
+ [(store (sra (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
def SAR16m1 : I<0xD1, MRM7m, (outs), (ins i16mem:$dst),
"sar{w}\t$dst",
- [(store (sra (loadi16 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>, OpSize16;
+ [(store (sra (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize16;
def SAR32m1 : I<0xD1, MRM7m, (outs), (ins i32mem:$dst),
"sar{l}\t$dst",
- [(store (sra (loadi32 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>, OpSize32;
+ [(store (sra (loadi32 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize32;
def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst),
"sar{q}\t$dst",
- [(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>, Requires<[In64BitMode]>;
+ [(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)]>,
+ Requires<[In64BitMode]>;
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -345,62 +329,62 @@ let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
let Uses = [CL, EFLAGS] in {
def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
- "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+ "rcl{b}\t{%cl, $dst|$dst, cl}", []>;
def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
- "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
+ "rcl{w}\t{%cl, $dst|$dst, cl}", []>, OpSize16;
def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
- "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
+ "rcl{l}\t{%cl, $dst|$dst, cl}", []>, OpSize32;
def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
- "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+ "rcl{q}\t{%cl, $dst|$dst, cl}", []>;
} // Uses = [CL, EFLAGS]
let Uses = [EFLAGS] in {
def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
- "rcl{b}\t$dst", [], IIC_SR>;
+ "rcl{b}\t$dst", []>;
def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt),
- "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+ "rcl{b}\t{$cnt, $dst|$dst, $cnt}", []>;
def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
- "rcl{w}\t$dst", [], IIC_SR>, OpSize16;
+ "rcl{w}\t$dst", []>, OpSize16;
def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt),
- "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
+ "rcl{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize16;
def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
- "rcl{l}\t$dst", [], IIC_SR>, OpSize32;
+ "rcl{l}\t$dst", []>, OpSize32;
def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt),
- "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
+ "rcl{l}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize32;
def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
- "rcl{q}\t$dst", [], IIC_SR>;
+ "rcl{q}\t$dst", []>;
def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt),
- "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+ "rcl{q}\t{$cnt, $dst|$dst, $cnt}", []>;
} // Uses = [EFLAGS]
let Uses = [CL, EFLAGS] in {
def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
- "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+ "rcr{b}\t{%cl, $dst|$dst, cl}", []>;
def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
- "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
+ "rcr{w}\t{%cl, $dst|$dst, cl}", []>, OpSize16;
def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
- "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
+ "rcr{l}\t{%cl, $dst|$dst, cl}", []>, OpSize32;
def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
- "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+ "rcr{q}\t{%cl, $dst|$dst, cl}", []>;
} // Uses = [CL, EFLAGS]
let Uses = [EFLAGS] in {
def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
- "rcr{b}\t$dst", [], IIC_SR>;
+ "rcr{b}\t$dst", []>;
def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt),
- "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+ "rcr{b}\t{$cnt, $dst|$dst, $cnt}", []>;
def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
- "rcr{w}\t$dst", [], IIC_SR>, OpSize16;
+ "rcr{w}\t$dst", []>, OpSize16;
def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt),
- "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
+ "rcr{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize16;
def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
- "rcr{l}\t$dst", [], IIC_SR>, OpSize32;
+ "rcr{l}\t$dst", []>, OpSize32;
def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt),
- "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
+ "rcr{l}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize32;
def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
- "rcr{q}\t$dst", [], IIC_SR>;
+ "rcr{q}\t$dst", []>;
def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt),
- "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+ "rcr{q}\t{$cnt, $dst|$dst, $cnt}", []>;
} // Uses = [EFLAGS]
} // Constraints = "$src = $dst"
@@ -408,61 +392,61 @@ def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt),
let SchedRW = [WriteShiftLd, WriteRMW], mayStore = 1 in {
let Uses = [EFLAGS] in {
def RCL8m1 : I<0xD0, MRM2m, (outs), (ins i8mem:$dst),
- "rcl{b}\t$dst", [], IIC_SR>;
+ "rcl{b}\t$dst", []>;
def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, u8imm:$cnt),
- "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+ "rcl{b}\t{$cnt, $dst|$dst, $cnt}", []>;
def RCL16m1 : I<0xD1, MRM2m, (outs), (ins i16mem:$dst),
- "rcl{w}\t$dst", [], IIC_SR>, OpSize16;
+ "rcl{w}\t$dst", []>, OpSize16;
def RCL16mi : Ii8<0xC1, MRM2m, (outs), (ins i16mem:$dst, u8imm:$cnt),
- "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
+ "rcl{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize16;
def RCL32m1 : I<0xD1, MRM2m, (outs), (ins i32mem:$dst),
- "rcl{l}\t$dst", [], IIC_SR>, OpSize32;
+ "rcl{l}\t$dst", []>, OpSize32;
def RCL32mi : Ii8<0xC1, MRM2m, (outs), (ins i32mem:$dst, u8imm:$cnt),
- "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
+ "rcl{l}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize32;
def RCL64m1 : RI<0xD1, MRM2m, (outs), (ins i64mem:$dst),
- "rcl{q}\t$dst", [], IIC_SR>, Requires<[In64BitMode]>;
+ "rcl{q}\t$dst", []>, Requires<[In64BitMode]>;
def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, u8imm:$cnt),
- "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>,
+ "rcl{q}\t{$cnt, $dst|$dst, $cnt}", []>,
Requires<[In64BitMode]>;
def RCR8m1 : I<0xD0, MRM3m, (outs), (ins i8mem:$dst),
- "rcr{b}\t$dst", [], IIC_SR>;
+ "rcr{b}\t$dst", []>;
def RCR8mi : Ii8<0xC0, MRM3m, (outs), (ins i8mem:$dst, u8imm:$cnt),
- "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+ "rcr{b}\t{$cnt, $dst|$dst, $cnt}", []>;
def RCR16m1 : I<0xD1, MRM3m, (outs), (ins i16mem:$dst),
- "rcr{w}\t$dst", [], IIC_SR>, OpSize16;
+ "rcr{w}\t$dst", []>, OpSize16;
def RCR16mi : Ii8<0xC1, MRM3m, (outs), (ins i16mem:$dst, u8imm:$cnt),
- "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
+ "rcr{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize16;
def RCR32m1 : I<0xD1, MRM3m, (outs), (ins i32mem:$dst),
- "rcr{l}\t$dst", [], IIC_SR>, OpSize32;
+ "rcr{l}\t$dst", []>, OpSize32;
def RCR32mi : Ii8<0xC1, MRM3m, (outs), (ins i32mem:$dst, u8imm:$cnt),
- "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
+ "rcr{l}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize32;
def RCR64m1 : RI<0xD1, MRM3m, (outs), (ins i64mem:$dst),
- "rcr{q}\t$dst", [], IIC_SR>, Requires<[In64BitMode]>;
+ "rcr{q}\t$dst", []>, Requires<[In64BitMode]>;
def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, u8imm:$cnt),
- "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>,
+ "rcr{q}\t{$cnt, $dst|$dst, $cnt}", []>,
Requires<[In64BitMode]>;
} // Uses = [EFLAGS]
let Uses = [CL, EFLAGS] in {
def RCL8mCL : I<0xD2, MRM2m, (outs), (ins i8mem:$dst),
- "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+ "rcl{b}\t{%cl, $dst|$dst, cl}", []>;
def RCL16mCL : I<0xD3, MRM2m, (outs), (ins i16mem:$dst),
- "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
+ "rcl{w}\t{%cl, $dst|$dst, cl}", []>, OpSize16;
def RCL32mCL : I<0xD3, MRM2m, (outs), (ins i32mem:$dst),
- "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
+ "rcl{l}\t{%cl, $dst|$dst, cl}", []>, OpSize32;
def RCL64mCL : RI<0xD3, MRM2m, (outs), (ins i64mem:$dst),
- "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>,
+ "rcl{q}\t{%cl, $dst|$dst, cl}", []>,
Requires<[In64BitMode]>;
def RCR8mCL : I<0xD2, MRM3m, (outs), (ins i8mem:$dst),
- "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+ "rcr{b}\t{%cl, $dst|$dst, cl}", []>;
def RCR16mCL : I<0xD3, MRM3m, (outs), (ins i16mem:$dst),
- "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
+ "rcr{w}\t{%cl, $dst|$dst, cl}", []>, OpSize16;
def RCR32mCL : I<0xD3, MRM3m, (outs), (ins i32mem:$dst),
- "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
+ "rcr{l}\t{%cl, $dst|$dst, cl}", []>, OpSize32;
def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst),
- "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>,
+ "rcr{q}\t{%cl, $dst|$dst, cl}", []>,
Requires<[In64BitMode]>;
} // Uses = [CL, EFLAGS]
} // SchedRW
@@ -473,215 +457,192 @@ let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
let Uses = [CL] in {
def ROL8rCL : I<0xD2, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
"rol{b}\t{%cl, $dst|$dst, cl}",
- [(set GR8:$dst, (rotl GR8:$src1, CL))], IIC_SR>;
+ [(set GR8:$dst, (rotl GR8:$src1, CL))]>;
def ROL16rCL : I<0xD3, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
"rol{w}\t{%cl, $dst|$dst, cl}",
- [(set GR16:$dst, (rotl GR16:$src1, CL))], IIC_SR>, OpSize16;
+ [(set GR16:$dst, (rotl GR16:$src1, CL))]>, OpSize16;
def ROL32rCL : I<0xD3, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
"rol{l}\t{%cl, $dst|$dst, cl}",
- [(set GR32:$dst, (rotl GR32:$src1, CL))], IIC_SR>, OpSize32;
+ [(set GR32:$dst, (rotl GR32:$src1, CL))]>, OpSize32;
def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
"rol{q}\t{%cl, $dst|$dst, cl}",
- [(set GR64:$dst, (rotl GR64:$src1, CL))], IIC_SR>;
+ [(set GR64:$dst, (rotl GR64:$src1, CL))]>;
}
def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
"rol{b}\t{$src2, $dst|$dst, $src2}",
- [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
+ [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))]>;
def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
"rol{w}\t{$src2, $dst|$dst, $src2}",
- [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))],
- IIC_SR>, OpSize16;
+ [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))]>, OpSize16;
def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
"rol{l}\t{$src2, $dst|$dst, $src2}",
- [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))],
- IIC_SR>, OpSize32;
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))]>, OpSize32;
def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst),
(ins GR64:$src1, u8imm:$src2),
"rol{q}\t{$src2, $dst|$dst, $src2}",
- [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))],
- IIC_SR>;
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))]>;
// Rotate by 1
def ROL8r1 : I<0xD0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
"rol{b}\t$dst",
- [(set GR8:$dst, (rotl GR8:$src1, (i8 1)))],
- IIC_SR>;
+ [(set GR8:$dst, (rotl GR8:$src1, (i8 1)))]>;
def ROL16r1 : I<0xD1, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
"rol{w}\t$dst",
- [(set GR16:$dst, (rotl GR16:$src1, (i8 1)))],
- IIC_SR>, OpSize16;
+ [(set GR16:$dst, (rotl GR16:$src1, (i8 1)))]>, OpSize16;
def ROL32r1 : I<0xD1, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
"rol{l}\t$dst",
- [(set GR32:$dst, (rotl GR32:$src1, (i8 1)))],
- IIC_SR>, OpSize32;
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 1)))]>, OpSize32;
def ROL64r1 : RI<0xD1, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
"rol{q}\t$dst",
- [(set GR64:$dst, (rotl GR64:$src1, (i8 1)))],
- IIC_SR>;
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 1)))]>;
} // Constraints = "$src = $dst", SchedRW
let SchedRW = [WriteShiftLd, WriteRMW] in {
let Uses = [CL] in {
def ROL8mCL : I<0xD2, MRM0m, (outs), (ins i8mem :$dst),
"rol{b}\t{%cl, $dst|$dst, cl}",
- [(store (rotl (loadi8 addr:$dst), CL), addr:$dst)],
- IIC_SR>;
+ [(store (rotl (loadi8 addr:$dst), CL), addr:$dst)]>;
def ROL16mCL : I<0xD3, MRM0m, (outs), (ins i16mem:$dst),
"rol{w}\t{%cl, $dst|$dst, cl}",
- [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)],
- IIC_SR>, OpSize16;
+ [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize16;
def ROL32mCL : I<0xD3, MRM0m, (outs), (ins i32mem:$dst),
"rol{l}\t{%cl, $dst|$dst, cl}",
- [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)],
- IIC_SR>, OpSize32;
+ [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)]>, OpSize32;
def ROL64mCL : RI<0xD3, MRM0m, (outs), (ins i64mem:$dst),
"rol{q}\t{%cl, $dst|$dst, cl}",
- [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)],
- IIC_SR>, Requires<[In64BitMode]>;
+ [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)]>,
+ Requires<[In64BitMode]>;
}
def ROL8mi : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, u8imm:$src1),
"rol{b}\t{$src1, $dst|$dst, $src1}",
- [(store (rotl (loadi8 addr:$dst), (i8 imm:$src1)), addr:$dst)],
- IIC_SR>;
+ [(store (rotl (loadi8 addr:$dst), (i8 imm:$src1)), addr:$dst)]>;
def ROL16mi : Ii8<0xC1, MRM0m, (outs), (ins i16mem:$dst, u8imm:$src1),
"rol{w}\t{$src1, $dst|$dst, $src1}",
- [(store (rotl (loadi16 addr:$dst), (i8 imm:$src1)), addr:$dst)],
- IIC_SR>, OpSize16;
+ [(store (rotl (loadi16 addr:$dst), (i8 imm:$src1)), addr:$dst)]>,
+ OpSize16;
def ROL32mi : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, u8imm:$src1),
"rol{l}\t{$src1, $dst|$dst, $src1}",
- [(store (rotl (loadi32 addr:$dst), (i8 imm:$src1)), addr:$dst)],
- IIC_SR>, OpSize32;
+ [(store (rotl (loadi32 addr:$dst), (i8 imm:$src1)), addr:$dst)]>,
+ OpSize32;
def ROL64mi : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, u8imm:$src1),
"rol{q}\t{$src1, $dst|$dst, $src1}",
- [(store (rotl (loadi64 addr:$dst), (i8 imm:$src1)), addr:$dst)],
- IIC_SR>, Requires<[In64BitMode]>;
+ [(store (rotl (loadi64 addr:$dst), (i8 imm:$src1)), addr:$dst)]>,
+ Requires<[In64BitMode]>;
// Rotate by 1
def ROL8m1 : I<0xD0, MRM0m, (outs), (ins i8mem :$dst),
"rol{b}\t$dst",
- [(store (rotl (loadi8 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>;
+ [(store (rotl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
def ROL16m1 : I<0xD1, MRM0m, (outs), (ins i16mem:$dst),
"rol{w}\t$dst",
- [(store (rotl (loadi16 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>, OpSize16;
+ [(store (rotl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize16;
def ROL32m1 : I<0xD1, MRM0m, (outs), (ins i32mem:$dst),
"rol{l}\t$dst",
- [(store (rotl (loadi32 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>, OpSize32;
+ [(store (rotl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize32;
def ROL64m1 : RI<0xD1, MRM0m, (outs), (ins i64mem:$dst),
"rol{q}\t$dst",
- [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>, Requires<[In64BitMode]>;
+ [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>,
+ Requires<[In64BitMode]>;
} // SchedRW
let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
let Uses = [CL] in {
def ROR8rCL : I<0xD2, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
"ror{b}\t{%cl, $dst|$dst, cl}",
- [(set GR8:$dst, (rotr GR8:$src1, CL))], IIC_SR>;
+ [(set GR8:$dst, (rotr GR8:$src1, CL))]>;
def ROR16rCL : I<0xD3, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
"ror{w}\t{%cl, $dst|$dst, cl}",
- [(set GR16:$dst, (rotr GR16:$src1, CL))], IIC_SR>, OpSize16;
+ [(set GR16:$dst, (rotr GR16:$src1, CL))]>, OpSize16;
def ROR32rCL : I<0xD3, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
"ror{l}\t{%cl, $dst|$dst, cl}",
- [(set GR32:$dst, (rotr GR32:$src1, CL))], IIC_SR>, OpSize32;
+ [(set GR32:$dst, (rotr GR32:$src1, CL))]>, OpSize32;
def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
"ror{q}\t{%cl, $dst|$dst, cl}",
- [(set GR64:$dst, (rotr GR64:$src1, CL))], IIC_SR>;
+ [(set GR64:$dst, (rotr GR64:$src1, CL))]>;
}
def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
"ror{b}\t{$src2, $dst|$dst, $src2}",
- [(set GR8:$dst, (rotr GR8:$src1, (i8 relocImm:$src2)))],
- IIC_SR>;
+ [(set GR8:$dst, (rotr GR8:$src1, (i8 relocImm:$src2)))]>;
def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
"ror{w}\t{$src2, $dst|$dst, $src2}",
- [(set GR16:$dst, (rotr GR16:$src1, (i8 relocImm:$src2)))],
- IIC_SR>, OpSize16;
+ [(set GR16:$dst, (rotr GR16:$src1, (i8 relocImm:$src2)))]>,
+ OpSize16;
def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
"ror{l}\t{$src2, $dst|$dst, $src2}",
- [(set GR32:$dst, (rotr GR32:$src1, (i8 relocImm:$src2)))],
- IIC_SR>, OpSize32;
+ [(set GR32:$dst, (rotr GR32:$src1, (i8 relocImm:$src2)))]>,
+ OpSize32;
def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst),
(ins GR64:$src1, u8imm:$src2),
"ror{q}\t{$src2, $dst|$dst, $src2}",
- [(set GR64:$dst, (rotr GR64:$src1, (i8 relocImm:$src2)))],
- IIC_SR>;
+ [(set GR64:$dst, (rotr GR64:$src1, (i8 relocImm:$src2)))]>;
// Rotate by 1
def ROR8r1 : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
"ror{b}\t$dst",
- [(set GR8:$dst, (rotl GR8:$src1, (i8 7)))],
- IIC_SR>;
+ [(set GR8:$dst, (rotl GR8:$src1, (i8 7)))]>;
def ROR16r1 : I<0xD1, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
"ror{w}\t$dst",
- [(set GR16:$dst, (rotl GR16:$src1, (i8 15)))],
- IIC_SR>, OpSize16;
+ [(set GR16:$dst, (rotl GR16:$src1, (i8 15)))]>, OpSize16;
def ROR32r1 : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
"ror{l}\t$dst",
- [(set GR32:$dst, (rotl GR32:$src1, (i8 31)))],
- IIC_SR>, OpSize32;
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 31)))]>, OpSize32;
def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
"ror{q}\t$dst",
- [(set GR64:$dst, (rotl GR64:$src1, (i8 63)))],
- IIC_SR>;
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 63)))]>;
} // Constraints = "$src = $dst", SchedRW
let SchedRW = [WriteShiftLd, WriteRMW] in {
let Uses = [CL] in {
def ROR8mCL : I<0xD2, MRM1m, (outs), (ins i8mem :$dst),
"ror{b}\t{%cl, $dst|$dst, cl}",
- [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)],
- IIC_SR>;
+ [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)]>;
def ROR16mCL : I<0xD3, MRM1m, (outs), (ins i16mem:$dst),
"ror{w}\t{%cl, $dst|$dst, cl}",
- [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)],
- IIC_SR>, OpSize16;
+ [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize16;
def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst),
"ror{l}\t{%cl, $dst|$dst, cl}",
- [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)],
- IIC_SR>, OpSize32;
+ [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)]>, OpSize32;
def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst),
"ror{q}\t{%cl, $dst|$dst, cl}",
- [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)],
- IIC_SR>, Requires<[In64BitMode]>;
+ [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)]>,
+ Requires<[In64BitMode]>;
}
def ROR8mi : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, u8imm:$src),
"ror{b}\t{$src, $dst|$dst, $src}",
- [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>;
+ [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
def ROR16mi : Ii8<0xC1, MRM1m, (outs), (ins i16mem:$dst, u8imm:$src),
"ror{w}\t{$src, $dst|$dst, $src}",
- [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>, OpSize16;
+ [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize16;
def ROR32mi : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, u8imm:$src),
"ror{l}\t{$src, $dst|$dst, $src}",
- [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>, OpSize32;
+ [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize32;
def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, u8imm:$src),
"ror{q}\t{$src, $dst|$dst, $src}",
- [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>, Requires<[In64BitMode]>;
+ [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ Requires<[In64BitMode]>;
// Rotate by 1
def ROR8m1 : I<0xD0, MRM1m, (outs), (ins i8mem :$dst),
"ror{b}\t$dst",
- [(store (rotl (loadi8 addr:$dst), (i8 7)), addr:$dst)],
- IIC_SR>;
+ [(store (rotl (loadi8 addr:$dst), (i8 7)), addr:$dst)]>;
def ROR16m1 : I<0xD1, MRM1m, (outs), (ins i16mem:$dst),
"ror{w}\t$dst",
- [(store (rotl (loadi16 addr:$dst), (i8 15)), addr:$dst)],
- IIC_SR>, OpSize16;
+ [(store (rotl (loadi16 addr:$dst), (i8 15)), addr:$dst)]>,
+ OpSize16;
def ROR32m1 : I<0xD1, MRM1m, (outs), (ins i32mem:$dst),
"ror{l}\t$dst",
- [(store (rotl (loadi32 addr:$dst), (i8 31)), addr:$dst)],
- IIC_SR>, OpSize32;
+ [(store (rotl (loadi32 addr:$dst), (i8 31)), addr:$dst)]>,
+ OpSize32;
def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst),
"ror{q}\t$dst",
- [(store (rotl (loadi64 addr:$dst), (i8 63)), addr:$dst)],
- IIC_SR>, Requires<[In64BitMode]>;
+ [(store (rotl (loadi64 addr:$dst), (i8 63)), addr:$dst)]>,
+ Requires<[In64BitMode]>;
} // SchedRW
@@ -689,42 +650,38 @@ def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst),
// Double shift instructions (generalizations of rotate)
//===----------------------------------------------------------------------===//
-let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteShiftDouble] in {
let Uses = [CL] in {
def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst),
(ins GR16:$src1, GR16:$src2),
"shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))],
- IIC_SHD16_REG_CL>,
+ [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))]>,
TB, OpSize16;
def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst),
(ins GR16:$src1, GR16:$src2),
"shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))],
- IIC_SHD16_REG_CL>,
+ [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))]>,
TB, OpSize16;
def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst),
(ins GR32:$src1, GR32:$src2),
"shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))],
- IIC_SHD32_REG_CL>, TB, OpSize32;
+ [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))]>,
+ TB, OpSize32;
def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst),
(ins GR32:$src1, GR32:$src2),
"shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))],
- IIC_SHD32_REG_CL>, TB, OpSize32;
+ [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))]>,
+ TB, OpSize32;
def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst),
(ins GR64:$src1, GR64:$src2),
"shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))],
- IIC_SHD64_REG_CL>,
+ [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))]>,
TB;
def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst),
(ins GR64:$src1, GR64:$src2),
"shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))],
- IIC_SHD64_REG_CL>,
+ [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))]>,
TB;
}
@@ -734,119 +691,113 @@ def SHLD16rri8 : Ii8<0xA4, MRMDestReg,
(ins GR16:$src1, GR16:$src2, u8imm:$src3),
"shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2,
- (i8 imm:$src3)))], IIC_SHD16_REG_IM>,
+ (i8 imm:$src3)))]>,
TB, OpSize16;
def SHRD16rri8 : Ii8<0xAC, MRMDestReg,
(outs GR16:$dst),
(ins GR16:$src1, GR16:$src2, u8imm:$src3),
"shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2,
- (i8 imm:$src3)))], IIC_SHD16_REG_IM>,
+ (i8 imm:$src3)))]>,
TB, OpSize16;
def SHLD32rri8 : Ii8<0xA4, MRMDestReg,
(outs GR32:$dst),
(ins GR32:$src1, GR32:$src2, u8imm:$src3),
"shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2,
- (i8 imm:$src3)))], IIC_SHD32_REG_IM>,
+ (i8 imm:$src3)))]>,
TB, OpSize32;
def SHRD32rri8 : Ii8<0xAC, MRMDestReg,
(outs GR32:$dst),
(ins GR32:$src1, GR32:$src2, u8imm:$src3),
"shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2,
- (i8 imm:$src3)))], IIC_SHD32_REG_IM>,
+ (i8 imm:$src3)))]>,
TB, OpSize32;
def SHLD64rri8 : RIi8<0xA4, MRMDestReg,
(outs GR64:$dst),
(ins GR64:$src1, GR64:$src2, u8imm:$src3),
"shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2,
- (i8 imm:$src3)))], IIC_SHD64_REG_IM>,
+ (i8 imm:$src3)))]>,
TB;
def SHRD64rri8 : RIi8<0xAC, MRMDestReg,
(outs GR64:$dst),
(ins GR64:$src1, GR64:$src2, u8imm:$src3),
"shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2,
- (i8 imm:$src3)))], IIC_SHD64_REG_IM>,
+ (i8 imm:$src3)))]>,
TB;
}
} // Constraints = "$src1 = $dst", SchedRW
-let SchedRW = [WriteShiftLd, WriteRMW] in {
+let SchedRW = [WriteShiftDoubleLd, WriteRMW] in {
let Uses = [CL] in {
def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
"shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL),
- addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize16;
+ addr:$dst)]>, TB, OpSize16;
def SHRD16mrCL : I<0xAD, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
"shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL),
- addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize16;
+ addr:$dst)]>, TB, OpSize16;
def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
"shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL),
- addr:$dst)], IIC_SHD32_MEM_CL>, TB, OpSize32;
+ addr:$dst)]>, TB, OpSize32;
def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
"shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL),
- addr:$dst)], IIC_SHD32_MEM_CL>, TB, OpSize32;
+ addr:$dst)]>, TB, OpSize32;
def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
"shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL),
- addr:$dst)], IIC_SHD64_MEM_CL>, TB;
+ addr:$dst)]>, TB;
def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
"shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shrd (loadi64 addr:$dst), GR64:$src2, CL),
- addr:$dst)], IIC_SHD64_MEM_CL>, TB;
+ addr:$dst)]>, TB;
}
def SHLD16mri8 : Ii8<0xA4, MRMDestMem,
(outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3),
"shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shld (loadi16 addr:$dst), GR16:$src2,
- (i8 imm:$src3)), addr:$dst)],
- IIC_SHD16_MEM_IM>,
+ (i8 imm:$src3)), addr:$dst)]>,
TB, OpSize16;
def SHRD16mri8 : Ii8<0xAC, MRMDestMem,
(outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3),
"shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shrd (loadi16 addr:$dst), GR16:$src2,
- (i8 imm:$src3)), addr:$dst)],
- IIC_SHD16_MEM_IM>,
+ (i8 imm:$src3)), addr:$dst)]>,
TB, OpSize16;
def SHLD32mri8 : Ii8<0xA4, MRMDestMem,
(outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3),
"shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shld (loadi32 addr:$dst), GR32:$src2,
- (i8 imm:$src3)), addr:$dst)],
- IIC_SHD32_MEM_IM>,
+ (i8 imm:$src3)), addr:$dst)]>,
TB, OpSize32;
def SHRD32mri8 : Ii8<0xAC, MRMDestMem,
(outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3),
"shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shrd (loadi32 addr:$dst), GR32:$src2,
- (i8 imm:$src3)), addr:$dst)],
- IIC_SHD32_MEM_IM>,
+ (i8 imm:$src3)), addr:$dst)]>,
TB, OpSize32;
def SHLD64mri8 : RIi8<0xA4, MRMDestMem,
(outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3),
"shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shld (loadi64 addr:$dst), GR64:$src2,
- (i8 imm:$src3)), addr:$dst)],
- IIC_SHD64_MEM_IM>,
+ (i8 imm:$src3)), addr:$dst)]>,
TB;
def SHRD64mri8 : RIi8<0xAC, MRMDestMem,
(outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3),
"shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shrd (loadi64 addr:$dst), GR64:$src2,
- (i8 imm:$src3)), addr:$dst)],
- IIC_SHD64_MEM_IM>,
+ (i8 imm:$src3)), addr:$dst)]>,
TB;
} // SchedRW
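For context, the X86shld/X86shrd nodes matched above model double shifts that merge bits from two operands. A minimal C sketch of the kind of source pattern involved (illustrative only; names are invented and the compiler is free to select other code):

    /* Double shift as SHLD computes it: shift hi left by n and fill the
       vacated low bits from the top of lo. Count is taken modulo 32. */
    unsigned shld32(unsigned hi, unsigned lo, unsigned n)
    {
        n &= 31;
        if (n == 0)
            return hi;                        /* avoid the undefined lo >> 32 */
        return (hi << n) | (lo >> (32 - n));
    }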
@@ -897,7 +848,7 @@ let hasSideEffects = 0 in {
// x86memop:$src1
ReadDefault, ReadDefault, ReadDefault, ReadDefault,
ReadDefault,
- // RC:$src1
+ // RC:$src2
ReadAfterLd]>;
}
}
@@ -967,7 +918,7 @@ let Predicates = [HasBMI2] in {
(i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
}
- // Artificially lower the complexity so that we'll favor
+ // We prefer to use
// mov (%ecx), %esi
// shl $imm, $esi
//
@@ -975,32 +926,32 @@ let Predicates = [HasBMI2] in {
//
// movb $imm, %al
// shlx %al, (%ecx), %esi
- let AddedComplexity = -20 in {
- def : Pat<(sra (loadi32 addr:$src1), GR8:$src2),
- (SARX32rm addr:$src1,
- (INSERT_SUBREG
- (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(sra (loadi64 addr:$src1), GR8:$src2),
- (SARX64rm addr:$src1,
- (INSERT_SUBREG
- (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
-
- def : Pat<(srl (loadi32 addr:$src1), GR8:$src2),
- (SHRX32rm addr:$src1,
- (INSERT_SUBREG
- (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(srl (loadi64 addr:$src1), GR8:$src2),
- (SHRX64rm addr:$src1,
- (INSERT_SUBREG
- (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
-
- def : Pat<(shl (loadi32 addr:$src1), GR8:$src2),
- (SHLX32rm addr:$src1,
- (INSERT_SUBREG
- (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(shl (loadi64 addr:$src1), GR8:$src2),
- (SHLX64rm addr:$src1,
- (INSERT_SUBREG
- (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- }
+ //
+ // This priority is enforced by IsProfitableToFoldLoad.
+ def : Pat<(sra (loadi32 addr:$src1), GR8:$src2),
+ (SARX32rm addr:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(sra (loadi64 addr:$src1), GR8:$src2),
+ (SARX64rm addr:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(srl (loadi32 addr:$src1), GR8:$src2),
+ (SHRX32rm addr:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(srl (loadi64 addr:$src1), GR8:$src2),
+ (SHRX64rm addr:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(shl (loadi32 addr:$src1), GR8:$src2),
+ (SHLX32rm addr:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(shl (loadi64 addr:$src1), GR8:$src2),
+ (SHLX64rm addr:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
}
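To illustrate the load-folding preference described in the comment above, a hedged C sketch (names are invented; whether the SHLX memory form is actually selected depends on -mbmi2 and the surrounding DAG):

    #include <stdint.h>

    /* Variable count: folding the load into SHLX (count in any GPR) is
       profitable, so the patterns above match (shl (load p), GR8). */
    uint64_t shift_var(const uint64_t *p, uint8_t n)
    {
        return *p << (n & 63);
    }

    /* Immediate count: a plain mov (%rdi),%rax ; shlq $7,%rax is preferred,
       which is what IsProfitableToFoldLoad enforces. */
    uint64_t shift_imm(const uint64_t *p)
    {
        return *p << 7;
    }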
diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td
index 40d2dca4f9ec..35ee00b9e016 100644
--- a/lib/Target/X86/X86InstrSystem.td
+++ b/lib/Target/X86/X86InstrSystem.td
@@ -15,28 +15,26 @@
let SchedRW = [WriteSystem] in {
let Defs = [RAX, RDX] in
- def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)], IIC_RDTSC>,
- TB;
+ def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)]>, TB;
let Defs = [RAX, RCX, RDX] in
- def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", [(X86rdtscp)],
- IIC_RDTSCP>, TB;
+ def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", [(X86rdtscp)]>, TB;
// CPU flow control instructions
-let mayLoad = 1, mayStore = 0, hasSideEffects = 1 in {
+let mayLoad = 1, mayStore = 0, hasSideEffects = 1, isTrap = 1 in {
def TRAP : I<0x0B, RawFrm, (outs), (ins), "ud2", [(trap)]>, TB;
def UD2B : I<0xB9, RawFrm, (outs), (ins), "ud2b", []>, TB;
}
-def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", [], IIC_HLT>;
-def RSM : I<0xAA, RawFrm, (outs), (ins), "rsm", [], IIC_RSM>, TB;
+def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", []>;
+def RSM : I<0xAA, RawFrm, (outs), (ins), "rsm", []>, TB;
// Interrupt and SysCall Instructions.
let Uses = [EFLAGS] in
def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>, Requires<[Not64BitMode]>;
-def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3",
- [(int_x86_int (i8 3))], IIC_INT3>;
+
+def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", [(int_x86_int (i8 3))]>;
} // SchedRW
// The long form of "int $3" turns into int3 as a size optimization.
@@ -46,21 +44,19 @@ def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3",
let SchedRW = [WriteSystem] in {
def INT : Ii8<0xcd, RawFrm, (outs), (ins u8imm:$trap), "int\t$trap",
- [(int_x86_int imm:$trap)], IIC_INT>;
+ [(int_x86_int imm:$trap)]>;
-def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", [], IIC_SYSCALL>, TB;
-def SYSRET : I<0x07, RawFrm, (outs), (ins), "sysret{l}", [], IIC_SYSCALL>, TB;
-def SYSRET64 :RI<0x07, RawFrm, (outs), (ins), "sysret{q}", [], IIC_SYSCALL>, TB,
+def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", []>, TB;
+def SYSRET : I<0x07, RawFrm, (outs), (ins), "sysret{l}", []>, TB;
+def SYSRET64 :RI<0x07, RawFrm, (outs), (ins), "sysretq", []>, TB,
Requires<[In64BitMode]>;
-def SYSENTER : I<0x34, RawFrm, (outs), (ins), "sysenter", [],
- IIC_SYS_ENTER_EXIT>, TB;
+def SYSENTER : I<0x34, RawFrm, (outs), (ins), "sysenter", []>, TB;
-def SYSEXIT : I<0x35, RawFrm, (outs), (ins), "sysexit{l}", [],
- IIC_SYS_ENTER_EXIT>, TB;
-def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexit{q}", [],
- IIC_SYS_ENTER_EXIT>, TB, Requires<[In64BitMode]>;
+def SYSEXIT : I<0x35, RawFrm, (outs), (ins), "sysexit{l}", []>, TB;
+def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexitq", []>, TB,
+ Requires<[In64BitMode]>;
} // SchedRW
def : Pat<(debugtrap),
@@ -73,44 +69,42 @@ def : Pat<(debugtrap),
//
let SchedRW = [WriteSystem] in {
let Defs = [AL], Uses = [DX] in
-def IN8rr : I<0xEC, RawFrm, (outs), (ins),
- "in{b}\t{%dx, %al|al, dx}", [], IIC_IN_RR>;
+def IN8rr : I<0xEC, RawFrm, (outs), (ins), "in{b}\t{%dx, %al|al, dx}", []>;
let Defs = [AX], Uses = [DX] in
-def IN16rr : I<0xED, RawFrm, (outs), (ins),
- "in{w}\t{%dx, %ax|ax, dx}", [], IIC_IN_RR>, OpSize16;
+def IN16rr : I<0xED, RawFrm, (outs), (ins), "in{w}\t{%dx, %ax|ax, dx}", []>,
+ OpSize16;
let Defs = [EAX], Uses = [DX] in
-def IN32rr : I<0xED, RawFrm, (outs), (ins),
- "in{l}\t{%dx, %eax|eax, dx}", [], IIC_IN_RR>, OpSize32;
+def IN32rr : I<0xED, RawFrm, (outs), (ins), "in{l}\t{%dx, %eax|eax, dx}", []>,
+ OpSize32;
let Defs = [AL] in
def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins u8imm:$port),
- "in{b}\t{$port, %al|al, $port}", [], IIC_IN_RI>;
+ "in{b}\t{$port, %al|al, $port}", []>;
let Defs = [AX] in
def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins u8imm:$port),
- "in{w}\t{$port, %ax|ax, $port}", [], IIC_IN_RI>, OpSize16;
+ "in{w}\t{$port, %ax|ax, $port}", []>, OpSize16;
let Defs = [EAX] in
def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins u8imm:$port),
- "in{l}\t{$port, %eax|eax, $port}", [], IIC_IN_RI>, OpSize32;
+ "in{l}\t{$port, %eax|eax, $port}", []>, OpSize32;
let Uses = [DX, AL] in
-def OUT8rr : I<0xEE, RawFrm, (outs), (ins),
- "out{b}\t{%al, %dx|dx, al}", [], IIC_OUT_RR>;
+def OUT8rr : I<0xEE, RawFrm, (outs), (ins), "out{b}\t{%al, %dx|dx, al}", []>;
let Uses = [DX, AX] in
-def OUT16rr : I<0xEF, RawFrm, (outs), (ins),
- "out{w}\t{%ax, %dx|dx, ax}", [], IIC_OUT_RR>, OpSize16;
+def OUT16rr : I<0xEF, RawFrm, (outs), (ins), "out{w}\t{%ax, %dx|dx, ax}", []>,
+ OpSize16;
let Uses = [DX, EAX] in
-def OUT32rr : I<0xEF, RawFrm, (outs), (ins),
- "out{l}\t{%eax, %dx|dx, eax}", [], IIC_OUT_RR>, OpSize32;
+def OUT32rr : I<0xEF, RawFrm, (outs), (ins), "out{l}\t{%eax, %dx|dx, eax}", []>,
+ OpSize32;
let Uses = [AL] in
def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins u8imm:$port),
- "out{b}\t{%al, $port|$port, al}", [], IIC_OUT_IR>;
+ "out{b}\t{%al, $port|$port, al}", []>;
let Uses = [AX] in
def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins u8imm:$port),
- "out{w}\t{%ax, $port|$port, ax}", [], IIC_OUT_IR>, OpSize16;
+ "out{w}\t{%ax, $port|$port, ax}", []>, OpSize16;
let Uses = [EAX] in
def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins u8imm:$port),
- "out{l}\t{%eax, $port|$port, eax}", [], IIC_OUT_IR>, OpSize32;
+ "out{l}\t{%eax, $port|$port, eax}", []>, OpSize32;
} // SchedRW
@@ -119,17 +113,17 @@ def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins u8imm:$port),
let SchedRW = [WriteSystem] in {
def MOV32rd : I<0x21, MRMDestReg, (outs GR32:$dst), (ins DEBUG_REG:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB,
+ "mov{l}\t{$src, $dst|$dst, $src}", []>, TB,
Requires<[Not64BitMode]>;
def MOV64rd : I<0x21, MRMDestReg, (outs GR64:$dst), (ins DEBUG_REG:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB,
+ "mov{q}\t{$src, $dst|$dst, $src}", []>, TB,
Requires<[In64BitMode]>;
def MOV32dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR32:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB,
+ "mov{l}\t{$src, $dst|$dst, $src}", []>, TB,
Requires<[Not64BitMode]>;
def MOV64dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR64:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB,
+ "mov{q}\t{$src, $dst|$dst, $src}", []>, TB,
Requires<[In64BitMode]>;
} // SchedRW
@@ -138,17 +132,17 @@ def MOV64dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR64:$src),
let SchedRW = [WriteSystem] in {
def MOV32rc : I<0x20, MRMDestReg, (outs GR32:$dst), (ins CONTROL_REG:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB,
+ "mov{l}\t{$src, $dst|$dst, $src}", []>, TB,
Requires<[Not64BitMode]>;
def MOV64rc : I<0x20, MRMDestReg, (outs GR64:$dst), (ins CONTROL_REG:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB,
+ "mov{q}\t{$src, $dst|$dst, $src}", []>, TB,
Requires<[In64BitMode]>;
def MOV32cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR32:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB,
+ "mov{l}\t{$src, $dst|$dst, $src}", []>, TB,
Requires<[Not64BitMode]>;
def MOV64cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR64:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB,
+ "mov{q}\t{$src, $dst|$dst, $src}", []>, TB,
Requires<[In64BitMode]>;
} // SchedRW
@@ -156,12 +150,12 @@ def MOV64cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR64:$src),
// Segment override instruction prefixes
let SchedRW = [WriteNop] in {
-def CS_PREFIX : I<0x2E, RawFrm, (outs), (ins), "cs", [], IIC_NOP>;
-def SS_PREFIX : I<0x36, RawFrm, (outs), (ins), "ss", [], IIC_NOP>;
-def DS_PREFIX : I<0x3E, RawFrm, (outs), (ins), "ds", [], IIC_NOP>;
-def ES_PREFIX : I<0x26, RawFrm, (outs), (ins), "es", [], IIC_NOP>;
-def FS_PREFIX : I<0x64, RawFrm, (outs), (ins), "fs", [], IIC_NOP>;
-def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", [], IIC_NOP>;
+def CS_PREFIX : I<0x2E, RawFrm, (outs), (ins), "cs", []>;
+def SS_PREFIX : I<0x36, RawFrm, (outs), (ins), "ss", []>;
+def DS_PREFIX : I<0x3E, RawFrm, (outs), (ins), "ds", []>;
+def ES_PREFIX : I<0x26, RawFrm, (outs), (ins), "es", []>;
+def FS_PREFIX : I<0x64, RawFrm, (outs), (ins), "fs", []>;
+def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", []>;
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -170,24 +164,24 @@ def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", [], IIC_NOP>;
let SchedRW = [WriteMove] in {
def MOV16rs : I<0x8C, MRMDestReg, (outs GR16:$dst), (ins SEGMENT_REG:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>, OpSize16;
+ "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize16;
def MOV32rs : I<0x8C, MRMDestReg, (outs GR32:$dst), (ins SEGMENT_REG:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>, OpSize32;
+ "mov{l}\t{$src, $dst|$dst, $src}", []>, OpSize32;
def MOV64rs : RI<0x8C, MRMDestReg, (outs GR64:$dst), (ins SEGMENT_REG:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>;
+ "mov{q}\t{$src, $dst|$dst, $src}", []>;
let mayStore = 1 in {
def MOV16ms : I<0x8C, MRMDestMem, (outs), (ins i16mem:$dst, SEGMENT_REG:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSizeIgnore;
+ "mov{w}\t{$src, $dst|$dst, $src}", []>;
}
def MOV16sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR16:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize16;
+ "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize16;
def MOV32sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR32:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize32;
+ "mov{l}\t{$src, $dst|$dst, $src}", []>, OpSize32;
def MOV64sr : RI<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR64:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>;
+ "mov{q}\t{$src, $dst|$dst, $src}", []>;
let mayLoad = 1 in {
def MOV16sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i16mem:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSizeIgnore;
+ "mov{w}\t{$src, $dst|$dst, $src}", []>;
}
} // SchedRW
@@ -195,198 +189,168 @@ def MOV16sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i16mem:$src),
// Segmentation support instructions.
let SchedRW = [WriteSystem] in {
-def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", [], IIC_SWAPGS>, TB;
+def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", []>, TB;
let mayLoad = 1 in
def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
- "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB,
- OpSize16;
+ "lar{w}\t{$src, $dst|$dst, $src}", []>, TB,
+ OpSize16, NotMemoryFoldable;
def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
- "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB,
- OpSize16;
+ "lar{w}\t{$src, $dst|$dst, $src}", []>, TB,
+ OpSize16, NotMemoryFoldable;
// i16mem operand in LAR32rm and GR32 operand in LAR32rr is not a typo.
let mayLoad = 1 in
def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
- "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB,
- OpSize32;
+ "lar{l}\t{$src, $dst|$dst, $src}", []>, TB,
+ OpSize32, NotMemoryFoldable;
def LAR32rr : I<0x02, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
- "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB,
- OpSize32;
-// i16mem operand in LAR64rm and GR32 operand in LAR32rr is not a typo.
+ "lar{l}\t{$src, $dst|$dst, $src}", []>, TB,
+ OpSize32, NotMemoryFoldable;
+// i16mem operand in LAR64rm and GR32 operand in LAR64rr is not a typo.
let mayLoad = 1 in
def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
- "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB;
+ "lar{q}\t{$src, $dst|$dst, $src}", []>, TB, NotMemoryFoldable;
def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
- "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB;
+ "lar{q}\t{$src, $dst|$dst, $src}", []>, TB, NotMemoryFoldable;
+// i16mem operand in LSL32rm and GR32 operand in LSL32rr is not a typo.
let mayLoad = 1 in
def LSL16rm : I<0x03, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
- "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB,
- OpSize16;
+ "lsl{w}\t{$src, $dst|$dst, $src}", []>, TB,
+ OpSize16, NotMemoryFoldable;
def LSL16rr : I<0x03, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
- "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB,
- OpSize16;
+ "lsl{w}\t{$src, $dst|$dst, $src}", []>, TB,
+ OpSize16, NotMemoryFoldable;
+// i16mem operand in LSL64rm and GR32 operand in LSL64rr is not a typo.
let mayLoad = 1 in
-def LSL32rm : I<0x03, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
- "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB,
- OpSize32;
+def LSL32rm : I<0x03, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+ "lsl{l}\t{$src, $dst|$dst, $src}", []>, TB,
+ OpSize32, NotMemoryFoldable;
def LSL32rr : I<0x03, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
- "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB,
- OpSize32;
+ "lsl{l}\t{$src, $dst|$dst, $src}", []>, TB,
+ OpSize32, NotMemoryFoldable;
let mayLoad = 1 in
-def LSL64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
- "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB;
-def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
- "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB;
+def LSL64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+ "lsl{q}\t{$src, $dst|$dst, $src}", []>, TB, NotMemoryFoldable;
+def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
+ "lsl{q}\t{$src, $dst|$dst, $src}", []>, TB, NotMemoryFoldable;
-def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr",
- [], IIC_INVLPG>, TB;
+def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr", []>, TB;
def STR16r : I<0x00, MRM1r, (outs GR16:$dst), (ins),
- "str{w}\t$dst", [], IIC_STR>, TB, OpSize16;
+ "str{w}\t$dst", []>, TB, OpSize16;
def STR32r : I<0x00, MRM1r, (outs GR32:$dst), (ins),
- "str{l}\t$dst", [], IIC_STR>, TB, OpSize32;
+ "str{l}\t$dst", []>, TB, OpSize32;
def STR64r : RI<0x00, MRM1r, (outs GR64:$dst), (ins),
- "str{q}\t$dst", [], IIC_STR>, TB;
+ "str{q}\t$dst", []>, TB;
let mayStore = 1 in
-def STRm : I<0x00, MRM1m, (outs), (ins i16mem:$dst),
- "str{w}\t$dst", [], IIC_STR>, TB;
+def STRm : I<0x00, MRM1m, (outs), (ins i16mem:$dst), "str{w}\t$dst", []>, TB;
-def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src),
- "ltr{w}\t$src", [], IIC_LTR>, TB;
+def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src), "ltr{w}\t$src", []>, TB, NotMemoryFoldable;
let mayLoad = 1 in
-def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src),
- "ltr{w}\t$src", [], IIC_LTR>, TB;
+def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src), "ltr{w}\t$src", []>, TB, NotMemoryFoldable;
-def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins),
- "push{w}\t{%cs|cs}", [], IIC_PUSH_SR>,
+def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins), "push{w}\t{%cs|cs}", []>,
OpSize16, Requires<[Not64BitMode]>;
-def PUSHCS32 : I<0x0E, RawFrm, (outs), (ins),
- "push{l}\t{%cs|cs}", [], IIC_PUSH_CS>,
+def PUSHCS32 : I<0x0E, RawFrm, (outs), (ins), "push{l}\t{%cs|cs}", []>,
OpSize32, Requires<[Not64BitMode]>;
-def PUSHSS16 : I<0x16, RawFrm, (outs), (ins),
- "push{w}\t{%ss|ss}", [], IIC_PUSH_SR>,
+def PUSHSS16 : I<0x16, RawFrm, (outs), (ins), "push{w}\t{%ss|ss}", []>,
OpSize16, Requires<[Not64BitMode]>;
-def PUSHSS32 : I<0x16, RawFrm, (outs), (ins),
- "push{l}\t{%ss|ss}", [], IIC_PUSH_SR>,
+def PUSHSS32 : I<0x16, RawFrm, (outs), (ins), "push{l}\t{%ss|ss}", []>,
OpSize32, Requires<[Not64BitMode]>;
-def PUSHDS16 : I<0x1E, RawFrm, (outs), (ins),
- "push{w}\t{%ds|ds}", [], IIC_PUSH_SR>,
+def PUSHDS16 : I<0x1E, RawFrm, (outs), (ins), "push{w}\t{%ds|ds}", []>,
OpSize16, Requires<[Not64BitMode]>;
-def PUSHDS32 : I<0x1E, RawFrm, (outs), (ins),
- "push{l}\t{%ds|ds}", [], IIC_PUSH_SR>,
+def PUSHDS32 : I<0x1E, RawFrm, (outs), (ins), "push{l}\t{%ds|ds}", []>,
OpSize32, Requires<[Not64BitMode]>;
-def PUSHES16 : I<0x06, RawFrm, (outs), (ins),
- "push{w}\t{%es|es}", [], IIC_PUSH_SR>,
+def PUSHES16 : I<0x06, RawFrm, (outs), (ins), "push{w}\t{%es|es}", []>,
OpSize16, Requires<[Not64BitMode]>;
-def PUSHES32 : I<0x06, RawFrm, (outs), (ins),
- "push{l}\t{%es|es}", [], IIC_PUSH_SR>,
+def PUSHES32 : I<0x06, RawFrm, (outs), (ins), "push{l}\t{%es|es}", []>,
+ OpSize32, Requires<[Not64BitMode]>;
+def PUSHFS16 : I<0xa0, RawFrm, (outs), (ins), "push{w}\t{%fs|fs}", []>,
+ OpSize16, TB;
+def PUSHFS32 : I<0xa0, RawFrm, (outs), (ins), "push{l}\t{%fs|fs}", []>, TB,
OpSize32, Requires<[Not64BitMode]>;
-def PUSHFS16 : I<0xa0, RawFrm, (outs), (ins),
- "push{w}\t{%fs|fs}", [], IIC_PUSH_SR>, OpSize16, TB;
-def PUSHFS32 : I<0xa0, RawFrm, (outs), (ins),
- "push{l}\t{%fs|fs}", [], IIC_PUSH_SR>, TB,
- OpSize32, Requires<[Not64BitMode]>;
-def PUSHGS16 : I<0xa8, RawFrm, (outs), (ins),
- "push{w}\t{%gs|gs}", [], IIC_PUSH_SR>, OpSize16, TB;
-def PUSHGS32 : I<0xa8, RawFrm, (outs), (ins),
- "push{l}\t{%gs|gs}", [], IIC_PUSH_SR>, TB,
- OpSize32, Requires<[Not64BitMode]>;
-def PUSHFS64 : I<0xa0, RawFrm, (outs), (ins),
- "push{q}\t{%fs|fs}", [], IIC_PUSH_SR>, TB,
- OpSize32, Requires<[In64BitMode]>;
-def PUSHGS64 : I<0xa8, RawFrm, (outs), (ins),
- "push{q}\t{%gs|gs}", [], IIC_PUSH_SR>, TB,
- OpSize32, Requires<[In64BitMode]>;
+def PUSHGS16 : I<0xa8, RawFrm, (outs), (ins), "push{w}\t{%gs|gs}", []>,
+ OpSize16, TB;
+def PUSHGS32 : I<0xa8, RawFrm, (outs), (ins), "push{l}\t{%gs|gs}", []>, TB,
+ OpSize32, Requires<[Not64BitMode]>;
+def PUSHFS64 : I<0xa0, RawFrm, (outs), (ins), "push{q}\t{%fs|fs}", []>, TB,
+ OpSize32, Requires<[In64BitMode]>;
+def PUSHGS64 : I<0xa8, RawFrm, (outs), (ins), "push{q}\t{%gs|gs}", []>, TB,
+ OpSize32, Requires<[In64BitMode]>;
// No "pop cs" instruction.
-def POPSS16 : I<0x17, RawFrm, (outs), (ins),
- "pop{w}\t{%ss|ss}", [], IIC_POP_SR_SS>,
+def POPSS16 : I<0x17, RawFrm, (outs), (ins), "pop{w}\t{%ss|ss}", []>,
OpSize16, Requires<[Not64BitMode]>;
-def POPSS32 : I<0x17, RawFrm, (outs), (ins),
- "pop{l}\t{%ss|ss}", [], IIC_POP_SR_SS>,
+def POPSS32 : I<0x17, RawFrm, (outs), (ins), "pop{l}\t{%ss|ss}", []>,
OpSize32, Requires<[Not64BitMode]>;
-def POPDS16 : I<0x1F, RawFrm, (outs), (ins),
- "pop{w}\t{%ds|ds}", [], IIC_POP_SR>,
+def POPDS16 : I<0x1F, RawFrm, (outs), (ins), "pop{w}\t{%ds|ds}", []>,
OpSize16, Requires<[Not64BitMode]>;
-def POPDS32 : I<0x1F, RawFrm, (outs), (ins),
- "pop{l}\t{%ds|ds}", [], IIC_POP_SR>,
+def POPDS32 : I<0x1F, RawFrm, (outs), (ins), "pop{l}\t{%ds|ds}", []>,
OpSize32, Requires<[Not64BitMode]>;
-def POPES16 : I<0x07, RawFrm, (outs), (ins),
- "pop{w}\t{%es|es}", [], IIC_POP_SR>,
+def POPES16 : I<0x07, RawFrm, (outs), (ins), "pop{w}\t{%es|es}", []>,
OpSize16, Requires<[Not64BitMode]>;
-def POPES32 : I<0x07, RawFrm, (outs), (ins),
- "pop{l}\t{%es|es}", [], IIC_POP_SR>,
- OpSize32, Requires<[Not64BitMode]>;
-
-def POPFS16 : I<0xa1, RawFrm, (outs), (ins),
- "pop{w}\t{%fs|fs}", [], IIC_POP_SR>, OpSize16, TB;
-def POPFS32 : I<0xa1, RawFrm, (outs), (ins),
- "pop{l}\t{%fs|fs}", [], IIC_POP_SR>, TB,
+def POPES32 : I<0x07, RawFrm, (outs), (ins), "pop{l}\t{%es|es}", []>,
OpSize32, Requires<[Not64BitMode]>;
-def POPFS64 : I<0xa1, RawFrm, (outs), (ins),
- "pop{q}\t{%fs|fs}", [], IIC_POP_SR>, TB,
- OpSize32, Requires<[In64BitMode]>;
-
-def POPGS16 : I<0xa9, RawFrm, (outs), (ins),
- "pop{w}\t{%gs|gs}", [], IIC_POP_SR>, OpSize16, TB;
-def POPGS32 : I<0xa9, RawFrm, (outs), (ins),
- "pop{l}\t{%gs|gs}", [], IIC_POP_SR>, TB,
- OpSize32, Requires<[Not64BitMode]>;
-def POPGS64 : I<0xa9, RawFrm, (outs), (ins),
- "pop{q}\t{%gs|gs}", [], IIC_POP_SR>, TB,
- OpSize32, Requires<[In64BitMode]>;
-
-def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
- "lds{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize16,
+def POPFS16 : I<0xa1, RawFrm, (outs), (ins), "pop{w}\t{%fs|fs}", []>,
+ OpSize16, TB;
+def POPFS32 : I<0xa1, RawFrm, (outs), (ins), "pop{l}\t{%fs|fs}", []>, TB,
+ OpSize32, Requires<[Not64BitMode]>;
+def POPFS64 : I<0xa1, RawFrm, (outs), (ins), "pop{q}\t{%fs|fs}", []>, TB,
+ OpSize32, Requires<[In64BitMode]>;
+
+def POPGS16 : I<0xa9, RawFrm, (outs), (ins), "pop{w}\t{%gs|gs}", []>,
+ OpSize16, TB;
+def POPGS32 : I<0xa9, RawFrm, (outs), (ins), "pop{l}\t{%gs|gs}", []>, TB,
+ OpSize32, Requires<[Not64BitMode]>;
+def POPGS64 : I<0xa9, RawFrm, (outs), (ins), "pop{q}\t{%gs|gs}", []>, TB,
+ OpSize32, Requires<[In64BitMode]>;
+
+def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaquemem:$src),
+ "lds{w}\t{$src, $dst|$dst, $src}", []>, OpSize16,
Requires<[Not64BitMode]>;
-def LDS32rm : I<0xc5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
- "lds{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize32,
+def LDS32rm : I<0xc5, MRMSrcMem, (outs GR32:$dst), (ins opaquemem:$src),
+ "lds{l}\t{$src, $dst|$dst, $src}", []>, OpSize32,
Requires<[Not64BitMode]>;
-def LSS16rm : I<0xb2, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
- "lss{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16;
-def LSS32rm : I<0xb2, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
- "lss{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32;
-def LSS64rm : RI<0xb2, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
- "lss{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+def LSS16rm : I<0xb2, MRMSrcMem, (outs GR16:$dst), (ins opaquemem:$src),
+ "lss{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize16;
+def LSS32rm : I<0xb2, MRMSrcMem, (outs GR32:$dst), (ins opaquemem:$src),
+ "lss{l}\t{$src, $dst|$dst, $src}", []>, TB, OpSize32;
+def LSS64rm : RI<0xb2, MRMSrcMem, (outs GR64:$dst), (ins opaquemem:$src),
+ "lss{q}\t{$src, $dst|$dst, $src}", []>, TB;
-def LES16rm : I<0xc4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
- "les{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize16,
+def LES16rm : I<0xc4, MRMSrcMem, (outs GR16:$dst), (ins opaquemem:$src),
+ "les{w}\t{$src, $dst|$dst, $src}", []>, OpSize16,
Requires<[Not64BitMode]>;
-def LES32rm : I<0xc4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
- "les{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize32,
+def LES32rm : I<0xc4, MRMSrcMem, (outs GR32:$dst), (ins opaquemem:$src),
+ "les{l}\t{$src, $dst|$dst, $src}", []>, OpSize32,
Requires<[Not64BitMode]>;
-def LFS16rm : I<0xb4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
- "lfs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16;
-def LFS32rm : I<0xb4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
- "lfs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32;
-def LFS64rm : RI<0xb4, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
- "lfs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
-
-def LGS16rm : I<0xb5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
- "lgs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16;
-def LGS32rm : I<0xb5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
- "lgs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32;
+def LFS16rm : I<0xb4, MRMSrcMem, (outs GR16:$dst), (ins opaquemem:$src),
+ "lfs{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize16;
+def LFS32rm : I<0xb4, MRMSrcMem, (outs GR32:$dst), (ins opaquemem:$src),
+ "lfs{l}\t{$src, $dst|$dst, $src}", []>, TB, OpSize32;
+def LFS64rm : RI<0xb4, MRMSrcMem, (outs GR64:$dst), (ins opaquemem:$src),
+ "lfs{q}\t{$src, $dst|$dst, $src}", []>, TB;
-def LGS64rm : RI<0xb5, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
- "lgs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+def LGS16rm : I<0xb5, MRMSrcMem, (outs GR16:$dst), (ins opaquemem:$src),
+ "lgs{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize16;
+def LGS32rm : I<0xb5, MRMSrcMem, (outs GR32:$dst), (ins opaquemem:$src),
+ "lgs{l}\t{$src, $dst|$dst, $src}", []>, TB, OpSize32;
+def LGS64rm : RI<0xb5, MRMSrcMem, (outs GR64:$dst), (ins opaquemem:$src),
+ "lgs{q}\t{$src, $dst|$dst, $src}", []>, TB;
-def VERRr : I<0x00, MRM4r, (outs), (ins GR16:$seg),
- "verr\t$seg", [], IIC_VERR>, TB;
-def VERWr : I<0x00, MRM5r, (outs), (ins GR16:$seg),
- "verw\t$seg", [], IIC_VERW_MEM>, TB;
+def VERRr : I<0x00, MRM4r, (outs), (ins GR16:$seg), "verr\t$seg", []>, TB, NotMemoryFoldable;
+def VERWr : I<0x00, MRM5r, (outs), (ins GR16:$seg), "verw\t$seg", []>, TB, NotMemoryFoldable;
let mayLoad = 1 in {
-def VERRm : I<0x00, MRM4m, (outs), (ins i16mem:$seg),
- "verr\t$seg", [], IIC_VERR>, TB;
-def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg),
- "verw\t$seg", [], IIC_VERW_REG>, TB;
+def VERRm : I<0x00, MRM4m, (outs), (ins i16mem:$seg), "verr\t$seg", []>, TB, NotMemoryFoldable;
+def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg), "verw\t$seg", []>, TB, NotMemoryFoldable;
}
} // SchedRW
@@ -394,97 +358,100 @@ def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg),
// Descriptor-table support instructions
let SchedRW = [WriteSystem] in {
-def SGDT16m : I<0x01, MRM0m, (outs), (ins opaque48mem:$dst),
- "sgdt{w}\t$dst", [], IIC_SGDT>, TB, OpSize16, Requires<[Not64BitMode]>;
-def SGDT32m : I<0x01, MRM0m, (outs), (ins opaque48mem:$dst),
- "sgdt{l}\t$dst", [], IIC_SGDT>, OpSize32, TB, Requires <[Not64BitMode]>;
-def SGDT64m : I<0x01, MRM0m, (outs), (ins opaque80mem:$dst),
- "sgdt{q}\t$dst", [], IIC_SGDT>, TB, Requires <[In64BitMode]>;
-def SIDT16m : I<0x01, MRM1m, (outs), (ins opaque48mem:$dst),
- "sidt{w}\t$dst", [], IIC_SIDT>, TB, OpSize16, Requires<[Not64BitMode]>;
-def SIDT32m : I<0x01, MRM1m, (outs), (ins opaque48mem:$dst),
- "sidt{l}\t$dst", []>, OpSize32, TB, Requires <[Not64BitMode]>;
-def SIDT64m : I<0x01, MRM1m, (outs), (ins opaque80mem:$dst),
- "sidt{q}\t$dst", []>, TB, Requires <[In64BitMode]>;
+def SGDT16m : I<0x01, MRM0m, (outs), (ins opaquemem:$dst),
+ "sgdtw\t$dst", []>, TB, OpSize16, Requires<[Not64BitMode]>;
+def SGDT32m : I<0x01, MRM0m, (outs), (ins opaquemem:$dst),
+ "sgdt{l|d}\t$dst", []>, OpSize32, TB, Requires <[Not64BitMode]>;
+def SGDT64m : I<0x01, MRM0m, (outs), (ins opaquemem:$dst),
+ "sgdt{q}\t$dst", []>, TB, Requires <[In64BitMode]>;
+def SIDT16m : I<0x01, MRM1m, (outs), (ins opaquemem:$dst),
+ "sidtw\t$dst", []>, TB, OpSize16, Requires<[Not64BitMode]>;
+def SIDT32m : I<0x01, MRM1m, (outs), (ins opaquemem:$dst),
+ "sidt{l|d}\t$dst", []>, OpSize32, TB, Requires <[Not64BitMode]>;
+def SIDT64m : I<0x01, MRM1m, (outs), (ins opaquemem:$dst),
+ "sidt{q}\t$dst", []>, TB, Requires <[In64BitMode]>;
def SLDT16r : I<0x00, MRM0r, (outs GR16:$dst), (ins),
- "sldt{w}\t$dst", [], IIC_SLDT>, TB, OpSize16;
+ "sldt{w}\t$dst", []>, TB, OpSize16;
let mayStore = 1 in
def SLDT16m : I<0x00, MRM0m, (outs), (ins i16mem:$dst),
- "sldt{w}\t$dst", [], IIC_SLDT>, TB;
+ "sldt{w}\t$dst", []>, TB;
def SLDT32r : I<0x00, MRM0r, (outs GR32:$dst), (ins),
- "sldt{l}\t$dst", [], IIC_SLDT>, OpSize32, TB;
+ "sldt{l}\t$dst", []>, OpSize32, TB;
// LLDT is not interpreted specially in 64-bit mode because there is no sign
// extension.
def SLDT64r : RI<0x00, MRM0r, (outs GR64:$dst), (ins),
- "sldt{q}\t$dst", [], IIC_SLDT>, TB;
-let mayStore = 1 in
-def SLDT64m : RI<0x00, MRM0m, (outs), (ins i16mem:$dst),
- "sldt{q}\t$dst", [], IIC_SLDT>, TB;
-
-def LGDT16m : I<0x01, MRM2m, (outs), (ins opaque48mem:$src),
- "lgdt{w}\t$src", [], IIC_LGDT>, TB, OpSize16, Requires<[Not64BitMode]>;
-def LGDT32m : I<0x01, MRM2m, (outs), (ins opaque48mem:$src),
- "lgdt{l}\t$src", [], IIC_LGDT>, OpSize32, TB, Requires<[Not64BitMode]>;
-def LGDT64m : I<0x01, MRM2m, (outs), (ins opaque80mem:$src),
- "lgdt{q}\t$src", [], IIC_LGDT>, TB, Requires<[In64BitMode]>;
-def LIDT16m : I<0x01, MRM3m, (outs), (ins opaque48mem:$src),
- "lidt{w}\t$src", [], IIC_LIDT>, TB, OpSize16, Requires<[Not64BitMode]>;
-def LIDT32m : I<0x01, MRM3m, (outs), (ins opaque48mem:$src),
- "lidt{l}\t$src", [], IIC_LIDT>, OpSize32, TB, Requires<[Not64BitMode]>;
-def LIDT64m : I<0x01, MRM3m, (outs), (ins opaque80mem:$src),
- "lidt{q}\t$src", [], IIC_LIDT>, TB, Requires<[In64BitMode]>;
+ "sldt{q}\t$dst", []>, TB, Requires<[In64BitMode]>;
+
+def LGDT16m : I<0x01, MRM2m, (outs), (ins opaquemem:$src),
+ "lgdtw\t$src", []>, TB, OpSize16, Requires<[Not64BitMode]>;
+def LGDT32m : I<0x01, MRM2m, (outs), (ins opaquemem:$src),
+ "lgdt{l|d}\t$src", []>, OpSize32, TB, Requires<[Not64BitMode]>;
+def LGDT64m : I<0x01, MRM2m, (outs), (ins opaquemem:$src),
+ "lgdt{q}\t$src", []>, TB, Requires<[In64BitMode]>;
+def LIDT16m : I<0x01, MRM3m, (outs), (ins opaquemem:$src),
+ "lidtw\t$src", []>, TB, OpSize16, Requires<[Not64BitMode]>;
+def LIDT32m : I<0x01, MRM3m, (outs), (ins opaquemem:$src),
+ "lidt{l|d}\t$src", []>, OpSize32, TB, Requires<[Not64BitMode]>;
+def LIDT64m : I<0x01, MRM3m, (outs), (ins opaquemem:$src),
+ "lidt{q}\t$src", []>, TB, Requires<[In64BitMode]>;
def LLDT16r : I<0x00, MRM2r, (outs), (ins GR16:$src),
- "lldt{w}\t$src", [], IIC_LLDT_REG>, TB;
+ "lldt{w}\t$src", []>, TB, NotMemoryFoldable;
let mayLoad = 1 in
def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src),
- "lldt{w}\t$src", [], IIC_LLDT_MEM>, TB;
+ "lldt{w}\t$src", []>, TB, NotMemoryFoldable;
} // SchedRW
//===----------------------------------------------------------------------===//
// Specialized register support
let SchedRW = [WriteSystem] in {
let Uses = [EAX, ECX, EDX] in
-def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", [], IIC_WRMSR>, TB;
+def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", []>, TB;
let Defs = [EAX, EDX], Uses = [ECX] in
-def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", [], IIC_RDMSR>, TB;
+def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", []>, TB;
let Defs = [RAX, RDX], Uses = [ECX] in
- def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [(X86rdpmc)], IIC_RDPMC>,
- TB;
+ def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [(X86rdpmc)]>, TB;
def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins),
- "smsw{w}\t$dst", [], IIC_SMSW>, OpSize16, TB;
+ "smsw{w}\t$dst", []>, OpSize16, TB;
def SMSW32r : I<0x01, MRM4r, (outs GR32:$dst), (ins),
- "smsw{l}\t$dst", [], IIC_SMSW>, OpSize32, TB;
+ "smsw{l}\t$dst", []>, OpSize32, TB;
// no m form encodable; use SMSW16m
def SMSW64r : RI<0x01, MRM4r, (outs GR64:$dst), (ins),
- "smsw{q}\t$dst", [], IIC_SMSW>, TB;
+ "smsw{q}\t$dst", []>, TB;
// For memory operands, there is only a 16-bit form
def SMSW16m : I<0x01, MRM4m, (outs), (ins i16mem:$dst),
- "smsw{w}\t$dst", [], IIC_SMSW>, TB;
+ "smsw{w}\t$dst", []>, TB;
def LMSW16r : I<0x01, MRM6r, (outs), (ins GR16:$src),
- "lmsw{w}\t$src", [], IIC_LMSW_MEM>, TB;
+ "lmsw{w}\t$src", []>, TB, NotMemoryFoldable;
let mayLoad = 1 in
def LMSW16m : I<0x01, MRM6m, (outs), (ins i16mem:$src),
- "lmsw{w}\t$src", [], IIC_LMSW_REG>, TB;
+ "lmsw{w}\t$src", []>, TB, NotMemoryFoldable;
let Defs = [EAX, EBX, ECX, EDX], Uses = [EAX, ECX] in
- def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", [], IIC_CPUID>, TB;
+ def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", []>, TB;
} // SchedRW
//===----------------------------------------------------------------------===//
// Cache instructions
let SchedRW = [WriteSystem] in {
-def INVD : I<0x08, RawFrm, (outs), (ins), "invd", [], IIC_INVD>, TB;
-def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [], IIC_INVD>, TB;
+def INVD : I<0x08, RawFrm, (outs), (ins), "invd", []>, TB;
+def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [(int_x86_wbinvd)]>, TB;
+
+// wbnoinvd is like wbinvd, except without invalidation
+// encoding: like wbinvd + a 0xF3 prefix
+def WBNOINVD : I<0x09, RawFrm, (outs), (ins), "wbnoinvd",
+ [(int_x86_wbnoinvd)]>, XS,
+ Requires<[HasWBNOINVD]>;
} // SchedRW
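A hedged usage sketch for the WBNOINVD definition above (assumes the compiler ships a _wbnoinvd intrinsic in <immintrin.h> and is invoked with -mwbnoinvd; the instruction itself is privileged, so this would normally live in kernel code):

    #include <immintrin.h>

    /* Write back all dirty cache lines without invalidating them. */
    static inline void writeback_caches(void)
    {
        _wbnoinvd();
    }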
//===----------------------------------------------------------------------===//
// CET instructions
-let SchedRW = [WriteSystem], Predicates = [HasSHSTK] in{
+// Use with caution; availability is not predicated on features.
+let SchedRW = [WriteSystem] in {
let Uses = [SSP] in {
let Defs = [SSP] in {
def INCSSPD : I<0xAE, MRM5r, (outs), (ins GR32:$src), "incsspd\t$src",
@@ -534,7 +501,12 @@ let SchedRW = [WriteSystem], Predicates = [HasSHSTK] in{
"clrssbsy\t$src",
[(int_x86_clrssbsy addr:$src)]>, XS;
} // Defs SSP
-} // SchedRW && HasSHSTK
+} // SchedRW
+
+let SchedRW = [WriteSystem] in {
+ def ENDBR64 : I<0x1E, MRM_FA, (outs), (ins), "endbr64", []>, XS;
+ def ENDBR32 : I<0x1E, MRM_FB, (outs), (ins), "endbr32", []>, XS;
+} // SchedRW
//===----------------------------------------------------------------------===//
// XSAVE instructions
@@ -551,40 +523,40 @@ let Uses = [EDX, EAX, ECX] in
} // HasXSAVE
let Uses = [EDX, EAX] in {
-def XSAVE : I<0xAE, MRM4m, (outs), (ins opaque512mem:$dst),
+def XSAVE : I<0xAE, MRM4m, (outs), (ins opaquemem:$dst),
"xsave\t$dst",
[(int_x86_xsave addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE]>;
-def XSAVE64 : RI<0xAE, MRM4m, (outs), (ins opaque512mem:$dst),
+def XSAVE64 : RI<0xAE, MRM4m, (outs), (ins opaquemem:$dst),
"xsave64\t$dst",
[(int_x86_xsave64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE, In64BitMode]>;
-def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
+def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaquemem:$dst),
"xrstor\t$dst",
[(int_x86_xrstor addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE]>;
-def XRSTOR64 : RI<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
+def XRSTOR64 : RI<0xAE, MRM5m, (outs), (ins opaquemem:$dst),
"xrstor64\t$dst",
[(int_x86_xrstor64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE, In64BitMode]>;
-def XSAVEOPT : I<0xAE, MRM6m, (outs), (ins opaque512mem:$dst),
+def XSAVEOPT : I<0xAE, MRM6m, (outs), (ins opaquemem:$dst),
"xsaveopt\t$dst",
[(int_x86_xsaveopt addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVEOPT]>;
-def XSAVEOPT64 : RI<0xAE, MRM6m, (outs), (ins opaque512mem:$dst),
+def XSAVEOPT64 : RI<0xAE, MRM6m, (outs), (ins opaquemem:$dst),
"xsaveopt64\t$dst",
[(int_x86_xsaveopt64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVEOPT, In64BitMode]>;
-def XSAVEC : I<0xC7, MRM4m, (outs), (ins opaque512mem:$dst),
+def XSAVEC : I<0xC7, MRM4m, (outs), (ins opaquemem:$dst),
"xsavec\t$dst",
[(int_x86_xsavec addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVEC]>;
-def XSAVEC64 : RI<0xC7, MRM4m, (outs), (ins opaque512mem:$dst),
+def XSAVEC64 : RI<0xC7, MRM4m, (outs), (ins opaquemem:$dst),
"xsavec64\t$dst",
[(int_x86_xsavec64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVEC, In64BitMode]>;
-def XSAVES : I<0xC7, MRM5m, (outs), (ins opaque512mem:$dst),
+def XSAVES : I<0xC7, MRM5m, (outs), (ins opaquemem:$dst),
"xsaves\t$dst",
[(int_x86_xsaves addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVES]>;
-def XSAVES64 : RI<0xC7, MRM5m, (outs), (ins opaque512mem:$dst),
+def XSAVES64 : RI<0xC7, MRM5m, (outs), (ins opaquemem:$dst),
"xsaves64\t$dst",
[(int_x86_xsaves64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVE, In64BitMode]>;
-def XRSTORS : I<0xC7, MRM3m, (outs), (ins opaque512mem:$dst),
+def XRSTORS : I<0xC7, MRM3m, (outs), (ins opaquemem:$dst),
"xrstors\t$dst",
[(int_x86_xrstors addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVES]>;
-def XRSTORS64 : RI<0xC7, MRM3m, (outs), (ins opaque512mem:$dst),
+def XRSTORS64 : RI<0xC7, MRM3m, (outs), (ins opaquemem:$dst),
"xrstors64\t$dst",
[(int_x86_xrstors64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVES, In64BitMode]>;
} // Uses
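The Uses = [EDX, EAX] above models the component bitmap that XSAVE takes in EDX:EAX. A hedged C sketch (assumes <immintrin.h> provides _xsave and the target enables -mxsave; buffer handling is simplified):

    #include <immintrin.h>
    #include <stdint.h>

    /* The 64-bit mask is split across EDX:EAX by the intrinsic/instruction. */
    void save_x87_sse(void *area)           /* must be 64-byte aligned */
    {
        uint64_t mask = 0x3;                 /* bit 0 = x87, bit 1 = SSE */
        _xsave(area, mask);
    }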
@@ -625,9 +597,9 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
let SchedRW = [WriteSystem] in {
let Defs = [EAX, EDX], Uses = [ECX] in
- def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru", [], IIC_PKU>, TB;
+ def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru", []>, TB;
let Uses = [EAX, ECX, EDX] in
- def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru", [], IIC_PKU>, TB;
+ def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru", []>, TB;
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -635,87 +607,134 @@ let Uses = [EAX, ECX, EDX] in
let Predicates = [HasFSGSBase, In64BitMode], SchedRW = [WriteSystem] in {
def RDFSBASE : I<0xAE, MRM0r, (outs GR32:$dst), (ins),
"rdfsbase{l}\t$dst",
- [(set GR32:$dst, (int_x86_rdfsbase_32))],
- IIC_SEGMENT_BASE_R>, XS;
+ [(set GR32:$dst, (int_x86_rdfsbase_32))]>, XS;
def RDFSBASE64 : RI<0xAE, MRM0r, (outs GR64:$dst), (ins),
"rdfsbase{q}\t$dst",
- [(set GR64:$dst, (int_x86_rdfsbase_64))],
- IIC_SEGMENT_BASE_R>, XS;
+ [(set GR64:$dst, (int_x86_rdfsbase_64))]>, XS;
def RDGSBASE : I<0xAE, MRM1r, (outs GR32:$dst), (ins),
"rdgsbase{l}\t$dst",
- [(set GR32:$dst, (int_x86_rdgsbase_32))],
- IIC_SEGMENT_BASE_R>, XS;
+ [(set GR32:$dst, (int_x86_rdgsbase_32))]>, XS;
def RDGSBASE64 : RI<0xAE, MRM1r, (outs GR64:$dst), (ins),
"rdgsbase{q}\t$dst",
- [(set GR64:$dst, (int_x86_rdgsbase_64))],
- IIC_SEGMENT_BASE_R>, XS;
+ [(set GR64:$dst, (int_x86_rdgsbase_64))]>, XS;
def WRFSBASE : I<0xAE, MRM2r, (outs), (ins GR32:$src),
"wrfsbase{l}\t$src",
- [(int_x86_wrfsbase_32 GR32:$src)],
- IIC_SEGMENT_BASE_W>, XS;
+ [(int_x86_wrfsbase_32 GR32:$src)]>, XS;
def WRFSBASE64 : RI<0xAE, MRM2r, (outs), (ins GR64:$src),
"wrfsbase{q}\t$src",
- [(int_x86_wrfsbase_64 GR64:$src)],
- IIC_SEGMENT_BASE_W>, XS;
+ [(int_x86_wrfsbase_64 GR64:$src)]>, XS;
def WRGSBASE : I<0xAE, MRM3r, (outs), (ins GR32:$src),
"wrgsbase{l}\t$src",
- [(int_x86_wrgsbase_32 GR32:$src)], IIC_SEGMENT_BASE_W>, XS;
+ [(int_x86_wrgsbase_32 GR32:$src)]>, XS;
def WRGSBASE64 : RI<0xAE, MRM3r, (outs), (ins GR64:$src),
"wrgsbase{q}\t$src",
- [(int_x86_wrgsbase_64 GR64:$src)],
- IIC_SEGMENT_BASE_W>, XS;
+ [(int_x86_wrgsbase_64 GR64:$src)]>, XS;
}
//===----------------------------------------------------------------------===//
// INVPCID Instruction
let SchedRW = [WriteSystem] in {
def INVPCID32 : I<0x82, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
- "invpcid\t{$src2, $src1|$src1, $src2}", [], IIC_INVPCID>, T8PD,
- Requires<[Not64BitMode]>;
+ "invpcid\t{$src2, $src1|$src1, $src2}",
+ [(int_x86_invpcid GR32:$src1, addr:$src2)]>, T8PD,
+ Requires<[Not64BitMode, HasINVPCID]>;
def INVPCID64 : I<0x82, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
- "invpcid\t{$src2, $src1|$src1, $src2}", [], IIC_INVPCID>, T8PD,
- Requires<[In64BitMode]>;
+ "invpcid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ Requires<[In64BitMode, HasINVPCID]>;
} // SchedRW
+let Predicates = [In64BitMode, HasINVPCID] in {
+ // The instruction can only use a 64 bit register as the register argument
+ // in 64 bit mode, while the intrinsic only accepts a 32 bit argument
+ // corresponding to it.
+ // The accepted values for now are 0,1,2,3 anyway (see Intel SDM -- INVPCID
+ // type), so it doesn't hurt us that one can't supply a 64 bit value here.
+ def : Pat<(int_x86_invpcid GR32:$src1, addr:$src2),
+ (INVPCID64
+ (SUBREG_TO_REG (i64 0), (MOV32rr GR32:$src1), sub_32bit),
+ addr:$src2)>;
+}
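A hedged sketch of the intrinsic call this pattern serves (assumes <immintrin.h> exposes _invpcid and -minvpcid; the descriptor layout follows the Intel SDM, not this commit):

    #include <immintrin.h>
    #include <stdint.h>

    struct invpcid_desc { uint64_t pcid; uint64_t linear_address; };

    void invalidate_one_translation(uint32_t pcid, void *va)
    {
        struct invpcid_desc d = { pcid, (uint64_t)va };
        _invpcid(0, &d);   /* type 0: individual-address invalidation */
    }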
+
+
//===----------------------------------------------------------------------===//
// SMAP Instruction
let Defs = [EFLAGS], SchedRW = [WriteSystem] in {
- def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", [], IIC_SMAP>, TB;
- def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", [], IIC_SMAP>, TB;
+ def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, TB;
+ def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, TB;
}
//===----------------------------------------------------------------------===//
// SMX Instruction
let SchedRW = [WriteSystem] in {
let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX] in {
- def GETSEC : I<0x37, RawFrm, (outs), (ins), "getsec", [], IIC_SMX>, TB;
+ def GETSEC : I<0x37, RawFrm, (outs), (ins), "getsec", []>, TB;
} // Uses, Defs
} // SchedRW
//===----------------------------------------------------------------------===//
+// TS flag control instruction.
+let SchedRW = [WriteSystem] in {
+def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", []>, TB;
+}
+
+//===----------------------------------------------------------------------===//
+// IF (inside EFLAGS) management instructions.
+let SchedRW = [WriteSystem], Uses = [EFLAGS], Defs = [EFLAGS] in {
+def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", []>;
+def STI : I<0xFB, RawFrm, (outs), (ins), "sti", []>;
+}
+
+//===----------------------------------------------------------------------===//
// RDPID Instruction
let SchedRW = [WriteSystem] in {
-def RDPID32 : I<0xC7, MRM7r, (outs GR32:$src), (ins),
- "rdpid\t$src", [], IIC_RDPID>, XS,
- Requires<[Not64BitMode]>;
-def RDPID64 : I<0xC7, MRM7r, (outs GR64:$src), (ins),
- "rdpid\t$src", [], IIC_RDPID>, XS,
- Requires<[In64BitMode]>;
+def RDPID32 : I<0xC7, MRM7r, (outs GR32:$dst), (ins),
+ "rdpid\t$dst", [(set GR32:$dst, (int_x86_rdpid))]>, XS,
+ Requires<[Not64BitMode, HasRDPID]>;
+def RDPID64 : I<0xC7, MRM7r, (outs GR64:$dst), (ins), "rdpid\t$dst", []>, XS,
+ Requires<[In64BitMode, HasRDPID]>;
} // SchedRW
+let Predicates = [In64BitMode, HasRDPID] in {
+ // Due to silly instruction definition, we have to compensate for the
+ // instruction outputting a 64-bit register.
+ def : Pat<(int_x86_rdpid),
+ (EXTRACT_SUBREG (RDPID64), sub_32bit)>;
+}
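A hedged usage sketch for the RDPID lowering above (assumes <immintrin.h> exposes _rdpid_u32 and -mrdpid; in 64-bit mode the EXTRACT_SUBREG in the pattern is what trims RDPID64's result):

    #include <immintrin.h>
    #include <stdint.h>

    /* Reads IA32_TSC_AUX (typically the logical processor id) without the
       serializing cost of RDTSCP. */
    uint32_t current_processor_id(void)
    {
        return _rdpid_u32();
    }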
+
+
//===----------------------------------------------------------------------===//
-// PTWRITE Instruction
+// PTWRITE Instruction - Write Data to a Processor Trace Packet
let SchedRW = [WriteSystem] in {
-
def PTWRITEm: I<0xAE, MRM4m, (outs), (ins i32mem:$dst),
- "ptwrite{l}\t$dst", [], IIC_PTWRITE>, XS;
+ "ptwrite{l}\t$dst", [(int_x86_ptwrite32 (loadi32 addr:$dst))]>, XS,
+ Requires<[HasPTWRITE]>;
def PTWRITE64m : RI<0xAE, MRM4m, (outs), (ins i64mem:$dst),
- "ptwrite{q}\t$dst", [], IIC_PTWRITE>, XS,
- Requires<[In64BitMode]>;
+ "ptwrite{q}\t$dst", [(int_x86_ptwrite64 (loadi64 addr:$dst))]>, XS,
+ Requires<[In64BitMode, HasPTWRITE]>;
def PTWRITEr : I<0xAE, MRM4r, (outs), (ins GR32:$dst),
- "ptwrite{l}\t$dst", [], IIC_PTWRITE>, XS;
+ "ptwrite{l}\t$dst", [(int_x86_ptwrite32 GR32:$dst)]>, XS,
+ Requires<[HasPTWRITE]>;
def PTWRITE64r : RI<0xAE, MRM4r, (outs), (ins GR64:$dst),
- "ptwrite{q}\t$dst", [], IIC_PTWRITE>, XS,
- Requires<[In64BitMode]>;
+ "ptwrite{q}\t$dst", [(int_x86_ptwrite64 GR64:$dst)]>, XS,
+ Requires<[In64BitMode, HasPTWRITE]>;
+} // SchedRW
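A hedged sketch of how the PTWRITE definitions above are reached from C (assumes <immintrin.h> provides _ptwrite64 and the target enables -mptwrite):

    #include <immintrin.h>
    #include <stdint.h>

    /* Emits a PTW packet carrying 'value' into the Processor Trace stream;
       it is only recorded when PTWRITE is enabled in IA32_RTIT_CTL. */
    void trace_marker(uint64_t value)
    {
        _ptwrite64(value);
    }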
+
+//===----------------------------------------------------------------------===//
+// Platform Configuration instruction
+
+// From ISA docs:
+// "This instruction is used to execute functions for configuring platform
+// features.
+// EAX: Leaf function to be invoked.
+// RBX/RCX/RDX: Leaf-specific purpose."
+// "Successful execution of the leaf clears RAX (set to zero) and ZF, CF, PF,
+// AF, OF, and SF are cleared. In case of failure, the failure reason is
+// indicated in RAX with ZF set to 1 and CF, PF, AF, OF, and SF are cleared."
+// Thus all these mentioned registers are considered clobbered.
+
+let SchedRW = [WriteSystem] in {
+let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX, RDX, EFLAGS] in
+ def PCONFIG : I<0x01, MRM_C5, (outs), (ins), "pconfig", []>, TB,
+ Requires<[HasPCONFIG]>;
} // SchedRW
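A hedged inline-asm sketch of the register contract described in the PCONFIG comment above (the mnemonic needs a PCONFIG-aware assembler; leaf numbers and argument meanings come from the ISA docs, not this commit):

    #include <stdint.h>

    static inline uint64_t pconfig(uint64_t leaf, uint64_t rbx,
                                   uint64_t rcx, uint64_t rdx)
    {
        uint64_t rax = leaf;                  /* EAX selects the leaf */
        asm volatile("pconfig"
                     : "+a"(rax), "+b"(rbx), "+c"(rcx), "+d"(rdx)
                     :
                     : "cc", "memory");
        return rax;                           /* 0 on success */
    }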
diff --git a/lib/Target/X86/X86InstrVMX.td b/lib/Target/X86/X86InstrVMX.td
index 4bb2c204b368..06a438ebfcad 100644
--- a/lib/Target/X86/X86InstrVMX.td
+++ b/lib/Target/X86/X86InstrVMX.td
@@ -18,59 +18,67 @@
let SchedRW = [WriteSystem] in {
// 66 0F 38 80
def INVEPT32 : I<0x80, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
- "invept\t{$src2, $src1|$src1, $src2}", [], IIC_VMX>, T8PD,
+ "invept\t{$src2, $src1|$src1, $src2}", []>, T8PD,
Requires<[Not64BitMode]>;
def INVEPT64 : I<0x80, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
- "invept\t{$src2, $src1|$src1, $src2}", [], IIC_VMX>, T8PD,
+ "invept\t{$src2, $src1|$src1, $src2}", []>, T8PD,
Requires<[In64BitMode]>;
// 66 0F 38 81
def INVVPID32 : I<0x81, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
- "invvpid\t{$src2, $src1|$src1, $src2}", [], IIC_VMX>, T8PD,
+ "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
Requires<[Not64BitMode]>;
def INVVPID64 : I<0x81, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
- "invvpid\t{$src2, $src1|$src1, $src2}", [], IIC_VMX>, T8PD,
+ "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
Requires<[In64BitMode]>;
// 0F 01 C1
-def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", [], IIC_VMX>, TB;
+def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", []>, TB;
def VMCLEARm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
"vmclear\t$vmcs", []>, PD;
// OF 01 D4
-def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", [], IIC_VMX>, TB;
+def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", []>, TB;
// 0F 01 C2
-def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", [], IIC_VMX>, TB;
+def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", []>, TB;
// 0F 01 C3
-def VMRESUME : I<0x01, MRM_C3, (outs), (ins), "vmresume", [], IIC_VMX>, TB;
+def VMRESUME : I<0x01, MRM_C3, (outs), (ins), "vmresume", []>, TB;
def VMPTRLDm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
- "vmptrld\t$vmcs", [], IIC_VMX>, PS;
+ "vmptrld\t$vmcs", []>, PS;
def VMPTRSTm : I<0xC7, MRM7m, (outs), (ins i64mem:$vmcs),
- "vmptrst\t$vmcs", [], IIC_VMX>, PS;
+ "vmptrst\t$vmcs", []>, PS;
def VMREAD64rr : I<0x78, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
- "vmread{q}\t{$src, $dst|$dst, $src}", [], IIC_VMX>, PS, Requires<[In64BitMode]>;
+ "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>,
+ NotMemoryFoldable;
def VMREAD32rr : I<0x78, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
- "vmread{l}\t{$src, $dst|$dst, $src}", [], IIC_VMX>, PS, Requires<[Not64BitMode]>;
+ "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>,
+ NotMemoryFoldable;
let mayStore = 1 in {
def VMREAD64mr : I<0x78, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
- "vmread{q}\t{$src, $dst|$dst, $src}", [], IIC_VMX>, PS, Requires<[In64BitMode]>;
+ "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>,
+ NotMemoryFoldable;
def VMREAD32mr : I<0x78, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
- "vmread{l}\t{$src, $dst|$dst, $src}", [], IIC_VMX>, PS, Requires<[Not64BitMode]>;
+ "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>,
+ NotMemoryFoldable;
} // mayStore
def VMWRITE64rr : I<0x79, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
- "vmwrite{q}\t{$src, $dst|$dst, $src}", [], IIC_VMX>, PS, Requires<[In64BitMode]>;
+ "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>,
+ NotMemoryFoldable;
def VMWRITE32rr : I<0x79, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
- "vmwrite{l}\t{$src, $dst|$dst, $src}", [], IIC_VMX>, PS, Requires<[Not64BitMode]>;
+ "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>,
+ NotMemoryFoldable;
let mayLoad = 1 in {
def VMWRITE64rm : I<0x79, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
- "vmwrite{q}\t{$src, $dst|$dst, $src}", [], IIC_VMX>, PS, Requires<[In64BitMode]>;
+ "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>,
+ NotMemoryFoldable;
def VMWRITE32rm : I<0x79, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
- "vmwrite{l}\t{$src, $dst|$dst, $src}", [], IIC_VMX>, PS, Requires<[Not64BitMode]>;
+ "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>,
+ NotMemoryFoldable;
} // mayLoad
// 0F 01 C4
diff --git a/lib/Target/X86/X86InstrVecCompiler.td b/lib/Target/X86/X86InstrVecCompiler.td
index c1cb4dcb16be..322bdb74e2de 100644
--- a/lib/Target/X86/X86InstrVecCompiler.td
+++ b/lib/Target/X86/X86InstrVecCompiler.td
@@ -48,8 +48,6 @@ def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
-def : Pat<(f128 (bitconvert (i128 FR128:$src))), (f128 FR128:$src)>;
-def : Pat<(i128 (bitconvert (f128 FR128:$src))), (i128 FR128:$src)>;
// Bitcasts between 256-bit vector types. Return the original type since
// no instruction is needed for the conversion
@@ -111,7 +109,6 @@ def : Pat<(v32i16 (bitconvert (v16i32 VR512:$src))), (v32i16 VR512:$src)>;
def : Pat<(v32i16 (bitconvert (v64i8 VR512:$src))), (v32i16 VR512:$src)>;
def : Pat<(v32i16 (bitconvert (v8f64 VR512:$src))), (v32i16 VR512:$src)>;
def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>;
-def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>;
def : Pat<(v64i8 (bitconvert (v8i64 VR512:$src))), (v64i8 VR512:$src)>;
def : Pat<(v64i8 (bitconvert (v16i32 VR512:$src))), (v64i8 VR512:$src)>;
def : Pat<(v64i8 (bitconvert (v32i16 VR512:$src))), (v64i8 VR512:$src)>;
@@ -148,7 +145,6 @@ multiclass subvector_subreg_lowering<RegisterClass subRC, ValueType subVT,
def : Pat<(subVT (extract_subvector (VT RC:$src), (iPTR 0))),
(subVT (EXTRACT_SUBREG RC:$src, subIdx))>;
- let AddedComplexity = 25 in // to give priority over vinsertf128rm
def : Pat<(VT (insert_subvector undef, subRC:$src, (iPTR 0))),
(VT (INSERT_SUBREG (IMPLICIT_DEF), subRC:$src, subIdx))>;
}
@@ -217,13 +213,13 @@ let Predicates = [HasVLX] in {
sub_xmm>;
defm : subvector_store_lowering<"APSZ128", "UPSZ128", VR256X, v4f32, v8f32,
sub_xmm>;
- defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR256X, v2i64,
+ defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v2i64,
v4i64, sub_xmm>;
- defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR256X, v4i32,
+ defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v4i32,
v8i32, sub_xmm>;
- defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR256X, v8i16,
+ defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v8i16,
v16i16, sub_xmm>;
- defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR256X, v16i8,
+ defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v16i8,
v32i8, sub_xmm>;
// Special patterns for storing subvector extracts of lower 128-bits of 512.
@@ -232,13 +228,13 @@ let Predicates = [HasVLX] in {
sub_xmm>;
defm : subvector_store_lowering<"APSZ128", "UPSZ128", VR512, v4f32, v16f32,
sub_xmm>;
- defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR512, v2i64,
+ defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v2i64,
v8i64, sub_xmm>;
- defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR512, v4i32,
+ defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v4i32,
v16i32, sub_xmm>;
- defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR512, v8i16,
+ defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v8i16,
v32i16, sub_xmm>;
- defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR512, v16i8,
+ defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v16i8,
v64i8, sub_xmm>;
// Special patterns for storing subvector extracts of lower 256-bits of 512.
@@ -247,186 +243,83 @@ let Predicates = [HasVLX] in {
sub_ymm>;
defm : subvector_store_lowering<"APSZ256", "UPSZ256", VR512, v8f32, v16f32,
sub_ymm>;
- defm : subvector_store_lowering<"DQA32Z256", "DQU32Z256", VR512, v4i64,
+ defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v4i64,
v8i64, sub_ymm>;
- defm : subvector_store_lowering<"DQA32Z256", "DQU32Z256", VR512, v8i32,
+ defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v8i32,
v16i32, sub_ymm>;
- defm : subvector_store_lowering<"DQA32Z256", "DQU32Z256", VR512, v16i16,
+ defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v16i16,
v32i16, sub_ymm>;
- defm : subvector_store_lowering<"DQA32Z256", "DQU32Z256", VR512, v32i8,
+ defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v32i8,
v64i8, sub_ymm>;
}
// If we're inserting into an all zeros vector, just use a plain move which
-// will zero the upper bits.
-// TODO: Is there a safe way to detect whether the producing instruction
-// already zeroed the upper bits?
-multiclass subvector_zero_lowering<string MoveStr, RegisterClass RC,
- ValueType DstTy, ValueType SrcTy,
- ValueType ZeroTy, PatFrag memop,
- SubRegIndex SubIdx> {
+// will zero the upper bits. A post-isel hook will take care of removing
+// any moves that we can prove are unnecessary.
+multiclass subvec_zero_lowering<string MoveStr,
+ RegisterClass RC, ValueType DstTy,
+ ValueType SrcTy, ValueType ZeroTy,
+ SubRegIndex SubIdx> {
def : Pat<(DstTy (insert_subvector (bitconvert (ZeroTy immAllZerosV)),
(SrcTy RC:$src), (iPTR 0))),
(SUBREG_TO_REG (i64 0),
- (!cast<Instruction>("VMOV"#MoveStr#"rr") RC:$src), SubIdx)>;
-
- def : Pat<(DstTy (insert_subvector (bitconvert (ZeroTy immAllZerosV)),
- (SrcTy (bitconvert (memop addr:$src))),
- (iPTR 0))),
- (SUBREG_TO_REG (i64 0),
- (!cast<Instruction>("VMOV"#MoveStr#"rm") addr:$src), SubIdx)>;
+ (SrcTy (!cast<Instruction>("VMOV"#MoveStr#"rr") RC:$src)), SubIdx)>;
}
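As a rough scalar illustration of why a plain 128-bit register move is enough here (array sizes stand in for v4f32/v8f32, and the helper name is invented for this sketch): inserting a 128-bit value at lane 0 of an all-zeros wider vector is just that value with its upper lanes zero, which the VEX-encoded VMOV*rr already produces for the full register.

#include <array>
#include <cassert>

// insert_subvector(zeroinitializer, Src, 0), spelled out element by element.
static std::array<float, 8> insertIntoZeros(const std::array<float, 4> &Src) {
  std::array<float, 8> Dst{};                  // the all-zeros v8f32
  for (int I = 0; I < 4; ++I)
    Dst[I] = Src[I];                           // the lanes the 128-bit move writes
  return Dst;                                  // upper lanes stay zero, as SUBREG_TO_REG asserts
}

int main() {
  std::array<float, 4> Src{1, 2, 3, 4};
  assert(insertIntoZeros(Src)[0] == 1.0f && insertIntoZeros(Src)[7] == 0.0f);
}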
let Predicates = [HasAVX, NoVLX] in {
- defm : subvector_zero_lowering<"APD", VR128, v4f64, v2f64, v8i32, loadv2f64,
- sub_xmm>;
- defm : subvector_zero_lowering<"APS", VR128, v8f32, v4f32, v8i32, loadv4f32,
- sub_xmm>;
- defm : subvector_zero_lowering<"DQA", VR128, v4i64, v2i64, v8i32, loadv2i64,
- sub_xmm>;
- defm : subvector_zero_lowering<"DQA", VR128, v8i32, v4i32, v8i32, loadv2i64,
- sub_xmm>;
- defm : subvector_zero_lowering<"DQA", VR128, v16i16, v8i16, v8i32, loadv2i64,
- sub_xmm>;
- defm : subvector_zero_lowering<"DQA", VR128, v32i8, v16i8, v8i32, loadv2i64,
- sub_xmm>;
+ defm : subvec_zero_lowering<"APD", VR128, v4f64, v2f64, v8i32, sub_xmm>;
+ defm : subvec_zero_lowering<"APS", VR128, v8f32, v4f32, v8i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v4i64, v2i64, v8i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v8i32, v4i32, v8i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v16i16, v8i16, v8i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v32i8, v16i8, v8i32, sub_xmm>;
}
let Predicates = [HasVLX] in {
- defm : subvector_zero_lowering<"APDZ128", VR128X, v4f64, v2f64, v8i32,
- loadv2f64, sub_xmm>;
- defm : subvector_zero_lowering<"APSZ128", VR128X, v8f32, v4f32, v8i32,
- loadv4f32, sub_xmm>;
- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v4i64, v2i64, v8i32,
- loadv2i64, sub_xmm>;
- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v8i32, v4i32, v8i32,
- loadv2i64, sub_xmm>;
- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v16i16, v8i16, v8i32,
- loadv2i64, sub_xmm>;
- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v32i8, v16i8, v8i32,
- loadv2i64, sub_xmm>;
-
- defm : subvector_zero_lowering<"APDZ128", VR128X, v8f64, v2f64, v16i32,
- loadv2f64, sub_xmm>;
- defm : subvector_zero_lowering<"APSZ128", VR128X, v16f32, v4f32, v16i32,
- loadv4f32, sub_xmm>;
- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v8i64, v2i64, v16i32,
- loadv2i64, sub_xmm>;
- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v16i32, v4i32, v16i32,
- loadv2i64, sub_xmm>;
- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v32i16, v8i16, v16i32,
- loadv2i64, sub_xmm>;
- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v64i8, v16i8, v16i32,
- loadv2i64, sub_xmm>;
-
- defm : subvector_zero_lowering<"APDZ256", VR256X, v8f64, v4f64, v16i32,
- loadv4f64, sub_ymm>;
- defm : subvector_zero_lowering<"APSZ256", VR256X, v16f32, v8f32, v16i32,
- loadv8f32, sub_ymm>;
- defm : subvector_zero_lowering<"DQA64Z256", VR256X, v8i64, v4i64, v16i32,
- loadv4i64, sub_ymm>;
- defm : subvector_zero_lowering<"DQA64Z256", VR256X, v16i32, v8i32, v16i32,
- loadv4i64, sub_ymm>;
- defm : subvector_zero_lowering<"DQA64Z256", VR256X, v32i16, v16i16, v16i32,
- loadv4i64, sub_ymm>;
- defm : subvector_zero_lowering<"DQA64Z256", VR256X, v64i8, v32i8, v16i32,
- loadv4i64, sub_ymm>;
+ defm : subvec_zero_lowering<"APDZ128", VR128X, v4f64, v2f64, v8i32, sub_xmm>;
+ defm : subvec_zero_lowering<"APSZ128", VR128X, v8f32, v4f32, v8i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v4i64, v2i64, v8i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v8i32, v4i32, v8i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v16i16, v8i16, v8i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v32i8, v16i8, v8i32, sub_xmm>;
+
+ defm : subvec_zero_lowering<"APDZ128", VR128X, v8f64, v2f64, v16i32, sub_xmm>;
+ defm : subvec_zero_lowering<"APSZ128", VR128X, v16f32, v4f32, v16i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v8i64, v2i64, v16i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v16i32, v4i32, v16i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v32i16, v8i16, v16i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v64i8, v16i8, v16i32, sub_xmm>;
+
+ defm : subvec_zero_lowering<"APDZ256", VR256X, v8f64, v4f64, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"APSZ256", VR256X, v16f32, v8f32, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"DQA64Z256", VR256X, v8i64, v4i64, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"DQA64Z256", VR256X, v16i32, v8i32, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"DQA64Z256", VR256X, v32i16, v16i16, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"DQA64Z256", VR256X, v64i8, v32i8, v16i32, sub_ymm>;
}
let Predicates = [HasAVX512, NoVLX] in {
- defm : subvector_zero_lowering<"APD", VR128, v8f64, v2f64, v16i32, loadv2f64,
- sub_xmm>;
- defm : subvector_zero_lowering<"APS", VR128, v16f32, v4f32, v16i32, loadv4f32,
- sub_xmm>;
- defm : subvector_zero_lowering<"DQA", VR128, v8i64, v2i64, v16i32, loadv2i64,
- sub_xmm>;
- defm : subvector_zero_lowering<"DQA", VR128, v16i32, v4i32, v16i32, loadv2i64,
- sub_xmm>;
- defm : subvector_zero_lowering<"DQA", VR128, v32i16, v8i16, v16i32, loadv2i64,
- sub_xmm>;
- defm : subvector_zero_lowering<"DQA", VR128, v64i8, v16i8, v16i32, loadv2i64,
- sub_xmm>;
-
- defm : subvector_zero_lowering<"APDY", VR256, v8f64, v4f64, v16i32,
- loadv4f64, sub_ymm>;
- defm : subvector_zero_lowering<"APSY", VR256, v16f32, v8f32, v16i32,
- loadv8f32, sub_ymm>;
- defm : subvector_zero_lowering<"DQAY", VR256, v8i64, v4i64, v16i32,
- loadv4i64, sub_ymm>;
- defm : subvector_zero_lowering<"DQAY", VR256, v16i32, v8i32, v16i32,
- loadv4i64, sub_ymm>;
- defm : subvector_zero_lowering<"DQAY", VR256, v32i16, v16i16, v16i32,
- loadv4i64, sub_ymm>;
- defm : subvector_zero_lowering<"DQAY", VR256, v64i8, v32i8, v16i32,
- loadv4i64, sub_ymm>;
+ defm : subvec_zero_lowering<"APD", VR128, v8f64, v2f64, v16i32, sub_xmm>;
+ defm : subvec_zero_lowering<"APS", VR128, v16f32, v4f32, v16i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v8i64, v2i64, v16i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v16i32, v4i32, v16i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v32i16, v8i16, v16i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v64i8, v16i8, v16i32, sub_xmm>;
+
+ defm : subvec_zero_lowering<"APDY", VR256, v8f64, v4f64, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"APSY", VR256, v16f32, v8f32, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"DQAY", VR256, v8i64, v4i64, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"DQAY", VR256, v16i32, v8i32, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"DQAY", VR256, v32i16, v16i16, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"DQAY", VR256, v64i8, v32i8, v16i32, sub_ymm>;
}
-// List of opcodes that guaranteed to zero the upper elements of vector regs.
-// TODO: Ideally this would be a blacklist instead of a whitelist. But SHA
-// intrinsics and some MMX->XMM move instructions that aren't VEX encoded make
-// this difficult. So starting with a couple opcodes used by reduction loops
-// where we explicitly insert zeros.
-class veczeroupper<ValueType vt, RegisterClass RC> :
- PatLeaf<(vt RC:$src), [{
- return N->getOpcode() == X86ISD::VPMADDWD ||
- N->getOpcode() == X86ISD::PSADBW;
- }]>;
-
-def zeroupperv2f64 : veczeroupper<v2f64, VR128>;
-def zeroupperv4f32 : veczeroupper<v4f32, VR128>;
-def zeroupperv2i64 : veczeroupper<v2i64, VR128>;
-def zeroupperv4i32 : veczeroupper<v4i32, VR128>;
-def zeroupperv8i16 : veczeroupper<v8i16, VR128>;
-def zeroupperv16i8 : veczeroupper<v16i8, VR128>;
-
-def zeroupperv4f64 : veczeroupper<v4f64, VR256>;
-def zeroupperv8f32 : veczeroupper<v8f32, VR256>;
-def zeroupperv4i64 : veczeroupper<v4i64, VR256>;
-def zeroupperv8i32 : veczeroupper<v8i32, VR256>;
-def zeroupperv16i16 : veczeroupper<v16i16, VR256>;
-def zeroupperv32i8 : veczeroupper<v32i8, VR256>;
-
-
-// If we can guarantee the upper elements have already been zeroed we can elide
-// an explicit zeroing.
-multiclass subvector_zero_ellision<RegisterClass RC, ValueType DstTy,
- ValueType SrcTy, ValueType ZeroTy,
- SubRegIndex SubIdx, PatLeaf Zeroupper> {
- def : Pat<(DstTy (insert_subvector (bitconvert (ZeroTy immAllZerosV)),
- Zeroupper:$src, (iPTR 0))),
- (SUBREG_TO_REG (i64 0), RC:$src, SubIdx)>;
-}
-
-// 128->256
-defm: subvector_zero_ellision<VR128, v4f64, v2f64, v8i32, sub_xmm, zeroupperv2f64>;
-defm: subvector_zero_ellision<VR128, v8f32, v4f32, v8i32, sub_xmm, zeroupperv4f32>;
-defm: subvector_zero_ellision<VR128, v4i64, v2i64, v8i32, sub_xmm, zeroupperv2i64>;
-defm: subvector_zero_ellision<VR128, v8i32, v4i32, v8i32, sub_xmm, zeroupperv4i32>;
-defm: subvector_zero_ellision<VR128, v16i16, v8i16, v8i32, sub_xmm, zeroupperv8i16>;
-defm: subvector_zero_ellision<VR128, v32i8, v16i8, v8i32, sub_xmm, zeroupperv16i8>;
-
-// 128->512
-defm: subvector_zero_ellision<VR128, v8f64, v2f64, v16i32, sub_xmm, zeroupperv2f64>;
-defm: subvector_zero_ellision<VR128, v16f32, v4f32, v16i32, sub_xmm, zeroupperv4f32>;
-defm: subvector_zero_ellision<VR128, v8i64, v2i64, v16i32, sub_xmm, zeroupperv2i64>;
-defm: subvector_zero_ellision<VR128, v16i32, v4i32, v16i32, sub_xmm, zeroupperv4i32>;
-defm: subvector_zero_ellision<VR128, v32i16, v8i16, v16i32, sub_xmm, zeroupperv8i16>;
-defm: subvector_zero_ellision<VR128, v64i8, v16i8, v16i32, sub_xmm, zeroupperv16i8>;
-
-// 256->512
-defm: subvector_zero_ellision<VR256, v8f64, v4f64, v16i32, sub_ymm, zeroupperv4f64>;
-defm: subvector_zero_ellision<VR256, v16f32, v8f32, v16i32, sub_ymm, zeroupperv8f32>;
-defm: subvector_zero_ellision<VR256, v8i64, v4i64, v16i32, sub_ymm, zeroupperv4i64>;
-defm: subvector_zero_ellision<VR256, v16i32, v8i32, v16i32, sub_ymm, zeroupperv8i32>;
-defm: subvector_zero_ellision<VR256, v32i16, v16i16, v16i32, sub_ymm, zeroupperv16i16>;
-defm: subvector_zero_ellision<VR256, v64i8, v32i8, v16i32, sub_ymm, zeroupperv32i8>;
-
-
class maskzeroupper<ValueType vt, RegisterClass RC> :
PatLeaf<(vt RC:$src), [{
return isMaskZeroExtended(N);
}]>;
+def maskzeroupperv1i1 : maskzeroupper<v1i1, VK1>;
def maskzeroupperv2i1 : maskzeroupper<v2i1, VK2>;
def maskzeroupperv4i1 : maskzeroupper<v4i1, VK4>;
def maskzeroupperv8i1 : maskzeroupper<v8i1, VK8>;
@@ -438,11 +331,18 @@ def maskzeroupperv32i1 : maskzeroupper<v32i1, VK32>;
// zeroing.
let Predicates = [HasBWI] in {
def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
+ maskzeroupperv1i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK1:$src, VK32)>;
+ def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
maskzeroupperv8i1:$src, (iPTR 0))),
(COPY_TO_REGCLASS VK8:$src, VK32)>;
def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
maskzeroupperv16i1:$src, (iPTR 0))),
(COPY_TO_REGCLASS VK16:$src, VK32)>;
+
+ def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
+ maskzeroupperv1i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK1:$src, VK64)>;
def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
maskzeroupperv8i1:$src, (iPTR 0))),
(COPY_TO_REGCLASS VK8:$src, VK64)>;
@@ -456,10 +356,19 @@ let Predicates = [HasBWI] in {
let Predicates = [HasAVX512] in {
def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
+ maskzeroupperv1i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK1:$src, VK16)>;
+ def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
maskzeroupperv8i1:$src, (iPTR 0))),
(COPY_TO_REGCLASS VK8:$src, VK16)>;
}
+let Predicates = [HasDQI] in {
+ def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
+ maskzeroupperv1i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK1:$src, VK8)>;
+}
+
let Predicates = [HasVLX, HasDQI] in {
def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
maskzeroupperv2i1:$src, (iPTR 0))),
@@ -495,6 +404,23 @@ let Predicates = [HasBWI, HasVLX] in {
// If the bits are not zero we have to fall back to explicitly zeroing by
// using shifts.
+let Predicates = [HasAVX512] in {
+ def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
+ (v1i1 VK1:$mask), (iPTR 0))),
+ (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK1:$mask, VK16),
+ (i8 15)), (i8 15))>;
+
+ def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
+ (v2i1 VK2:$mask), (iPTR 0))),
+ (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK2:$mask, VK16),
+ (i8 14)), (i8 14))>;
+
+ def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
+ (v4i1 VK4:$mask), (iPTR 0))),
+ (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK4:$mask, VK16),
+ (i8 12)), (i8 12))>;
+}
+
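A small scalar model of the shift idiom these patterns emit, with a plain C++ integer standing in for the k-register (the helper name is invented for this sketch). For the v1i1-into-v16i1 case, shifting the mask up by 15 and logically back down by 15 keeps only the inserted bit and clears everything above it, which is exactly what the KSHIFTLWri/KSHIFTRWri pair does.

#include <cassert>
#include <cstdint>

// Model of KSHIFTLWri 15 followed by KSHIFTRWri 15 on a 16-bit mask value.
static uint16_t zextMask1To16(uint16_t Mask) {
  uint16_t Shifted = static_cast<uint16_t>(Mask << 15); // only bit 0 survives, now at bit 15
  return static_cast<uint16_t>(Shifted >> 15);          // back in place, upper bits zeroed
}

int main() {
  assert(zextMask1To16(0xFFFF) == 1); // undefined upper bits are cleared
  assert(zextMask1To16(0xFFFE) == 0);
}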
let Predicates = [HasAVX512, NoDQI] in {
def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
(v8i1 VK8:$mask), (iPTR 0))),
@@ -506,9 +432,11 @@ let Predicates = [HasDQI] in {
def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
(v8i1 VK8:$mask), (iPTR 0))),
(COPY_TO_REGCLASS (KMOVBkk VK8:$mask), VK16)>;
-}
-let Predicates = [HasVLX, HasDQI] in {
+ def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
+ (v1i1 VK1:$mask), (iPTR 0))),
+ (KSHIFTRBri (KSHIFTLBri (COPY_TO_REGCLASS VK1:$mask, VK8),
+ (i8 7)), (i8 7))>;
def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
(v2i1 VK2:$mask), (iPTR 0))),
(KSHIFTRBri (KSHIFTLBri (COPY_TO_REGCLASS VK2:$mask, VK8),
@@ -519,17 +447,6 @@ let Predicates = [HasVLX, HasDQI] in {
(i8 4)), (i8 4))>;
}
-let Predicates = [HasVLX] in {
- def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
- (v2i1 VK2:$mask), (iPTR 0))),
- (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK2:$mask, VK16),
- (i8 14)), (i8 14))>;
- def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
- (v4i1 VK4:$mask), (iPTR 0))),
- (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK4:$mask, VK16),
- (i8 12)), (i8 12))>;
-}
-
let Predicates = [HasBWI] in {
def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
(v16i1 VK16:$mask), (iPTR 0))),
@@ -567,6 +484,10 @@ let Predicates = [HasBWI, HasDQI] in {
let Predicates = [HasBWI, HasVLX] in {
def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
+ (v1i1 VK1:$mask), (iPTR 0))),
+ (KSHIFTRDri (KSHIFTLDri (COPY_TO_REGCLASS VK1:$mask, VK32),
+ (i8 31)), (i8 31))>;
+ def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
(v2i1 VK2:$mask), (iPTR 0))),
(KSHIFTRDri (KSHIFTLDri (COPY_TO_REGCLASS VK2:$mask, VK32),
(i8 30)), (i8 30))>;
@@ -576,6 +497,10 @@ let Predicates = [HasBWI, HasVLX] in {
(i8 28)), (i8 28))>;
def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
+ (v1i1 VK1:$mask), (iPTR 0))),
+ (KSHIFTRQri (KSHIFTLQri (COPY_TO_REGCLASS VK1:$mask, VK64),
+ (i8 63)), (i8 63))>;
+ def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
(v2i1 VK2:$mask), (iPTR 0))),
(KSHIFTRQri (KSHIFTLQri (COPY_TO_REGCLASS VK2:$mask, VK64),
(i8 62)), (i8 62))>;
diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td
index c4b8e3e90d29..ff3e3be48a24 100644
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@@ -14,11 +14,11 @@
multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> {
def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[WritePHAdd]>;
+ [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[SchedWritePHAdd.XMM]>;
def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP,
- Sched<[WritePHAddLd, ReadAfterLd]>;
+ Sched<[SchedWritePHAdd.XMM.Folded, ReadAfterLd]>;
}
let ExeDomain = SSEPackedInt in {
@@ -41,123 +41,133 @@ let ExeDomain = SSEPackedInt in {
// Scalar load 2 addr operand instructions
multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
- Operand memop, ComplexPattern mem_cpat> {
+ Operand memop, ComplexPattern mem_cpat,
+ X86FoldableSchedWrite sched> {
def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[WriteFAdd]>;
+ [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[sched]>;
def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, XOP,
- Sched<[WriteFAddLd, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int,
- PatFrag memop> {
+ PatFrag memop, X86FoldableSchedWrite sched> {
def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[WriteFAdd]>;
+ [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[sched]>;
def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP,
- Sched<[WriteFAddLd, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
- PatFrag memop> {
- def rrY : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ PatFrag memop, X86FoldableSchedWrite sched> {
+ def Yrr : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (Int VR256:$src))]>, XOP, VEX_L, Sched<[WriteFAdd]>;
- def rmY : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ [(set VR256:$dst, (Int VR256:$src))]>, XOP, VEX_L, Sched<[sched]>;
+ def Yrm : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP, VEX_L,
- Sched<[WriteFAddLd, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>;
}
let ExeDomain = SSEPackedSingle in {
defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss,
- ssmem, sse_load_f32>;
- defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, loadv4f32>;
- defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, loadv8f32>;
+ ssmem, sse_load_f32, SchedWriteFRnd.Scl>;
+ defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, loadv4f32,
+ SchedWriteFRnd.XMM>;
+ defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, loadv8f32,
+ SchedWriteFRnd.YMM>;
}
let ExeDomain = SSEPackedDouble in {
defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd,
- sdmem, sse_load_f64>;
- defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, loadv2f64>;
- defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64>;
+ sdmem, sse_load_f64, SchedWriteFRnd.Scl>;
+ defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, loadv2f64,
+ SchedWriteFRnd.XMM>;
+ defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64,
+ SchedWriteFRnd.YMM>;
}
multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode,
- ValueType vt128> {
+ ValueType vt128, X86FoldableSchedWrite sched> {
def rr : IXOP<opc, MRMSrcReg4VOp3, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2))))]>,
- XOP, Sched<[WriteVarVecShift]>;
+ XOP, Sched<[sched]>;
def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1),
(vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
- XOP_4V, VEX_W, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
+ XOP_4V, VEX_W, Sched<[sched.Folded, ReadAfterLd]>;
def mr : IXOP<opc, MRMSrcMem4VOp3, (outs VR128:$dst),
(ins i128mem:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))),
(vt128 VR128:$src2))))]>,
- XOP, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
+ XOP, Sched<[sched.Folded, ReadAfterLd]>;
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
def rr_REV : IXOP<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[]>,
- XOP_4V, VEX_W, Sched<[WriteVarVecShift]>, FoldGenData<NAME#rr>;
+ XOP_4V, VEX_W, Sched<[sched]>, FoldGenData<NAME#rr>;
}
let ExeDomain = SSEPackedInt in {
- defm VPROTB : xop3op<0x90, "vprotb", rotl, v16i8>;
- defm VPROTD : xop3op<0x92, "vprotd", rotl, v4i32>;
- defm VPROTQ : xop3op<0x93, "vprotq", rotl, v2i64>;
- defm VPROTW : xop3op<0x91, "vprotw", rotl, v8i16>;
- defm VPSHAB : xop3op<0x98, "vpshab", X86vpsha, v16i8>;
- defm VPSHAD : xop3op<0x9A, "vpshad", X86vpsha, v4i32>;
- defm VPSHAQ : xop3op<0x9B, "vpshaq", X86vpsha, v2i64>;
- defm VPSHAW : xop3op<0x99, "vpshaw", X86vpsha, v8i16>;
- defm VPSHLB : xop3op<0x94, "vpshlb", X86vpshl, v16i8>;
- defm VPSHLD : xop3op<0x96, "vpshld", X86vpshl, v4i32>;
- defm VPSHLQ : xop3op<0x97, "vpshlq", X86vpshl, v2i64>;
- defm VPSHLW : xop3op<0x95, "vpshlw", X86vpshl, v8i16>;
+ defm VPROTB : xop3op<0x90, "vprotb", rotl, v16i8, SchedWriteVarVecShift.XMM>;
+ defm VPROTD : xop3op<0x92, "vprotd", rotl, v4i32, SchedWriteVarVecShift.XMM>;
+ defm VPROTQ : xop3op<0x93, "vprotq", rotl, v2i64, SchedWriteVarVecShift.XMM>;
+ defm VPROTW : xop3op<0x91, "vprotw", rotl, v8i16, SchedWriteVarVecShift.XMM>;
+ defm VPSHAB : xop3op<0x98, "vpshab", X86vpsha, v16i8, SchedWriteVarVecShift.XMM>;
+ defm VPSHAD : xop3op<0x9A, "vpshad", X86vpsha, v4i32, SchedWriteVarVecShift.XMM>;
+ defm VPSHAQ : xop3op<0x9B, "vpshaq", X86vpsha, v2i64, SchedWriteVarVecShift.XMM>;
+ defm VPSHAW : xop3op<0x99, "vpshaw", X86vpsha, v8i16, SchedWriteVarVecShift.XMM>;
+ defm VPSHLB : xop3op<0x94, "vpshlb", X86vpshl, v16i8, SchedWriteVarVecShift.XMM>;
+ defm VPSHLD : xop3op<0x96, "vpshld", X86vpshl, v4i32, SchedWriteVarVecShift.XMM>;
+ defm VPSHLQ : xop3op<0x97, "vpshlq", X86vpshl, v2i64, SchedWriteVarVecShift.XMM>;
+ defm VPSHLW : xop3op<0x95, "vpshlw", X86vpshl, v8i16, SchedWriteVarVecShift.XMM>;
}
multiclass xop3opimm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- ValueType vt128> {
+ ValueType vt128, X86FoldableSchedWrite sched> {
def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1), imm:$src2)))]>,
- XOP, Sched<[WriteVecShift]>;
+ XOP, Sched<[sched]>;
def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))), imm:$src2)))]>,
- XOP, Sched<[WriteVecShiftLd, ReadAfterLd]>;
+ XOP, Sched<[sched.Folded, ReadAfterLd]>;
}
let ExeDomain = SSEPackedInt in {
- defm VPROTB : xop3opimm<0xC0, "vprotb", X86vrotli, v16i8>;
- defm VPROTD : xop3opimm<0xC2, "vprotd", X86vrotli, v4i32>;
- defm VPROTQ : xop3opimm<0xC3, "vprotq", X86vrotli, v2i64>;
- defm VPROTW : xop3opimm<0xC1, "vprotw", X86vrotli, v8i16>;
+ defm VPROTB : xop3opimm<0xC0, "vprotb", X86vrotli, v16i8,
+ SchedWriteVecShiftImm.XMM>;
+ defm VPROTD : xop3opimm<0xC2, "vprotd", X86vrotli, v4i32,
+ SchedWriteVecShiftImm.XMM>;
+ defm VPROTQ : xop3opimm<0xC3, "vprotq", X86vrotli, v2i64,
+ SchedWriteVecShiftImm.XMM>;
+ defm VPROTW : xop3opimm<0xC1, "vprotw", X86vrotli, v8i16,
+ SchedWriteVecShiftImm.XMM>;
}
// Instruction where second source can be memory, but third must be register
-multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int,
+ X86FoldableSchedWrite sched> {
let isCommutable = 1 in
def rr : IXOPi8Reg<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
@@ -165,29 +175,41 @@ multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(Int VR128:$src1, VR128:$src2, VR128:$src3))]>, XOP_4V,
- Sched<[WriteVecIMul]>;
+ Sched<[sched]>;
def rm : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)),
- VR128:$src3))]>, XOP_4V, Sched<[WriteVecIMulLd, ReadAfterLd]>;
+ VR128:$src3))]>, XOP_4V, Sched<[sched.Folded, ReadAfterLd]>;
}
let ExeDomain = SSEPackedInt in {
- defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>;
- defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>;
- defm VPMACSWW : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>;
- defm VPMACSWD : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>;
- defm VPMACSSWW : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>;
- defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>;
- defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>;
- defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>;
- defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>;
- defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>;
- defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>;
- defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>;
+ defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd",
+ int_x86_xop_vpmadcswd, SchedWriteVecIMul.XMM>;
+ defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd",
+ int_x86_xop_vpmadcsswd, SchedWriteVecIMul.XMM>;
+ defm VPMACSWW : xop4opm2<0x95, "vpmacsww",
+ int_x86_xop_vpmacsww, SchedWriteVecIMul.XMM>;
+ defm VPMACSWD : xop4opm2<0x96, "vpmacswd",
+ int_x86_xop_vpmacswd, SchedWriteVecIMul.XMM>;
+ defm VPMACSSWW : xop4opm2<0x85, "vpmacssww",
+ int_x86_xop_vpmacssww, SchedWriteVecIMul.XMM>;
+ defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd",
+ int_x86_xop_vpmacsswd, SchedWriteVecIMul.XMM>;
+ defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql",
+ int_x86_xop_vpmacssdql, SchedWritePMULLD.XMM>;
+ defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh",
+ int_x86_xop_vpmacssdqh, SchedWritePMULLD.XMM>;
+ defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd",
+ int_x86_xop_vpmacssdd, SchedWritePMULLD.XMM>;
+ defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql",
+ int_x86_xop_vpmacsdql, SchedWritePMULLD.XMM>;
+ defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh",
+ int_x86_xop_vpmacsdqh, SchedWritePMULLD.XMM>;
+ defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd",
+ int_x86_xop_vpmacsdd, SchedWritePMULLD.XMM>;
}
// IFMA patterns - for cases where we can safely ignore the overflow bits from
@@ -199,11 +221,11 @@ let Predicates = [HasXOP] in {
def : Pat<(v4i32 (add (mul (v4i32 VR128:$src1), (v4i32 VR128:$src2)),
(v4i32 VR128:$src3))),
(VPMACSDDrr VR128:$src1, VR128:$src2, VR128:$src3)>;
- def : Pat<(v2i64 (add (X86pmuldq (X86PShufd (v4i32 VR128:$src1), (i8 -11)),
- (X86PShufd (v4i32 VR128:$src2), (i8 -11))),
+ def : Pat<(v2i64 (add (X86pmuldq (bc_v2i64 (X86PShufd (v4i32 VR128:$src1), (i8 -11))),
+ (bc_v2i64 (X86PShufd (v4i32 VR128:$src2), (i8 -11)))),
(v2i64 VR128:$src3))),
(VPMACSDQHrr VR128:$src1, VR128:$src2, VR128:$src3)>;
- def : Pat<(v2i64 (add (X86pmuldq (v4i32 VR128:$src1), (v4i32 VR128:$src2)),
+ def : Pat<(v2i64 (add (X86pmuldq (v2i64 VR128:$src1), (v2i64 VR128:$src2)),
(v2i64 VR128:$src3))),
(VPMACSDQLrr VR128:$src1, VR128:$src2, VR128:$src3)>;
def : Pat<(v4i32 (add (X86vpmaddwd (v8i16 VR128:$src1), (v8i16 VR128:$src2)),
@@ -211,54 +233,69 @@ let Predicates = [HasXOP] in {
(VPMADCSWDrr VR128:$src1, VR128:$src2, VR128:$src3)>;
}
+// Transform that swizzles an immediate so a memory operand can be matched in
+// the first source operand.
+def CommuteVPCOMCC : SDNodeXForm<imm, [{
+ uint8_t Imm = N->getZExtValue() & 0x7;
+ Imm = X86::getSwappedVPCOMImm(Imm);
+ return getI8Imm(Imm, SDLoc(N));
+}]>;
+
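A small C++ model of what this transform computes, assuming the usual VPCOM condition-code encoding (0=lt, 1=le, 2=gt, 3=ge, 4=eq, 5=neq, 6=false, 7=true); the function below is a stand-in for X86::getSwappedVPCOMImm, not its actual implementation. Swapping the two sources turns lt/le into gt/ge and leaves the symmetric codes untouched.

#include <cassert>
#include <cstdint>

static uint8_t swappedVPCOMImm(uint8_t Imm) {
  Imm &= 0x7;
  return Imm < 4 ? (Imm ^ 0x2) : Imm; // lt<->gt, le<->ge; eq/neq/false/true are symmetric
}

int main() {
  assert(swappedVPCOMImm(0) == 2); // lt becomes gt once the operands trade places
  assert(swappedVPCOMImm(5) == 5); // neq does not care about operand order
}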
// Instruction where second source can be memory, third must be imm8
-multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128> {
- let isCommutable = 1 in
- def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, XOPCC:$cc),
- !strconcat("vpcom${cc}", Suffix,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128:$dst,
- (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
- imm:$cc)))]>,
- XOP_4V, Sched<[WriteVecALULd, ReadAfterLd]>;
- def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, i128mem:$src2, XOPCC:$cc),
- !strconcat("vpcom${cc}", Suffix,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128:$dst,
- (vt128 (OpNode (vt128 VR128:$src1),
- (vt128 (bitconvert (loadv2i64 addr:$src2))),
- imm:$cc)))]>,
- XOP_4V, Sched<[WriteVecALULd, ReadAfterLd]>;
- let isAsmParserOnly = 1, hasSideEffects = 0 in {
- def ri_alt : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, u8imm:$src3),
- !strconcat("vpcom", Suffix,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_4V, Sched<[WriteVecALULd, ReadAfterLd]>;
- let mayLoad = 1 in
- def mi_alt : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
- !strconcat("vpcom", Suffix,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_4V, Sched<[WriteVecALULd, ReadAfterLd]>;
+multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128,
+ X86FoldableSchedWrite sched> {
+ let ExeDomain = SSEPackedInt in { // SSE integer instructions
+ let isCommutable = 1 in
+ def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, XOPCC:$cc),
+ !strconcat("vpcom${cc}", Suffix,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
+ imm:$cc)))]>,
+ XOP_4V, Sched<[sched]>;
+ def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, XOPCC:$cc),
+ !strconcat("vpcom${cc}", Suffix,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1),
+ (vt128 (bitconvert (loadv2i64 addr:$src2))),
+ imm:$cc)))]>,
+ XOP_4V, Sched<[sched.Folded, ReadAfterLd]>;
+ let isAsmParserOnly = 1, hasSideEffects = 0 in {
+ def ri_alt : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+ !strconcat("vpcom", Suffix,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, XOP_4V, Sched<[sched]>, NotMemoryFoldable;
+ let mayLoad = 1 in
+ def mi_alt : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+ !strconcat("vpcom", Suffix,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, XOP_4V, Sched<[sched.Folded, ReadAfterLd]>,
+ NotMemoryFoldable;
+ }
}
-}
-let ExeDomain = SSEPackedInt in { // SSE integer instructions
- defm VPCOMB : xopvpcom<0xCC, "b", X86vpcom, v16i8>;
- defm VPCOMW : xopvpcom<0xCD, "w", X86vpcom, v8i16>;
- defm VPCOMD : xopvpcom<0xCE, "d", X86vpcom, v4i32>;
- defm VPCOMQ : xopvpcom<0xCF, "q", X86vpcom, v2i64>;
- defm VPCOMUB : xopvpcom<0xEC, "ub", X86vpcomu, v16i8>;
- defm VPCOMUW : xopvpcom<0xED, "uw", X86vpcomu, v8i16>;
- defm VPCOMUD : xopvpcom<0xEE, "ud", X86vpcomu, v4i32>;
- defm VPCOMUQ : xopvpcom<0xEF, "uq", X86vpcomu, v2i64>;
+ def : Pat<(OpNode (bitconvert (loadv2i64 addr:$src2)),
+ (vt128 VR128:$src1), imm:$cc),
+ (!cast<Instruction>(NAME#"mi") VR128:$src1, addr:$src2,
+ (CommuteVPCOMCC imm:$cc))>;
}
+defm VPCOMB : xopvpcom<0xCC, "b", X86vpcom, v16i8, SchedWriteVecALU.XMM>;
+defm VPCOMW : xopvpcom<0xCD, "w", X86vpcom, v8i16, SchedWriteVecALU.XMM>;
+defm VPCOMD : xopvpcom<0xCE, "d", X86vpcom, v4i32, SchedWriteVecALU.XMM>;
+defm VPCOMQ : xopvpcom<0xCF, "q", X86vpcom, v2i64, SchedWriteVecALU.XMM>;
+defm VPCOMUB : xopvpcom<0xEC, "ub", X86vpcomu, v16i8, SchedWriteVecALU.XMM>;
+defm VPCOMUW : xopvpcom<0xED, "uw", X86vpcomu, v8i16, SchedWriteVecALU.XMM>;
+defm VPCOMUD : xopvpcom<0xEE, "ud", X86vpcomu, v4i32, SchedWriteVecALU.XMM>;
+defm VPCOMUQ : xopvpcom<0xEF, "uq", X86vpcomu, v2i64, SchedWriteVecALU.XMM>;
+
multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode,
- ValueType vt128> {
+ ValueType vt128, X86FoldableSchedWrite sched> {
def rrr : IXOPi8Reg<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
@@ -266,7 +303,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
(vt128 VR128:$src3))))]>,
- XOP_4V, Sched<[WriteShuffle]>;
+ XOP_4V, Sched<[sched]>;
def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i128mem:$src3),
!strconcat(OpcodeStr,
@@ -274,7 +311,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
(vt128 (bitconvert (loadv2i64 addr:$src3))))))]>,
- XOP_4V, VEX_W, Sched<[WriteShuffleLd, ReadAfterLd]>;
+ XOP_4V, VEX_W, Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>;
def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, VR128:$src3),
!strconcat(OpcodeStr,
@@ -282,69 +319,83 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set VR128:$dst,
(v16i8 (OpNode (vt128 VR128:$src1), (vt128 (bitconvert (loadv2i64 addr:$src2))),
(vt128 VR128:$src3))))]>,
- XOP_4V, Sched<[WriteShuffleLd, ReadAfterLd]>;
+ XOP_4V, Sched<[sched.Folded, ReadAfterLd,
+ // 128mem:$src2
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault,
+ // VR128:$src3
+ ReadAfterLd]>;
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
def rrr_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_4V, VEX_W, Sched<[WriteShuffle]>, FoldGenData<NAME#rrr>;
+ []>, XOP_4V, VEX_W, Sched<[sched]>, FoldGenData<NAME#rrr>;
}
let ExeDomain = SSEPackedInt in {
- defm VPPERM : xop4op<0xA3, "vpperm", X86vpperm, v16i8>;
+ defm VPPERM : xop4op<0xA3, "vpperm", X86vpperm, v16i8,
+ SchedWriteVarShuffle.XMM>;
}
// Instruction where either second or third source can be memory
multiclass xop4op_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
- X86MemOperand x86memop, ValueType VT> {
+ X86MemOperand x86memop, ValueType VT,
+ X86FoldableSchedWrite sched> {
def rrr : IXOPi8Reg<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (VT (or (and RC:$src3, RC:$src1),
(X86andnp RC:$src3, RC:$src2))))]>, XOP_4V,
- Sched<[WriteShuffle]>;
+ Sched<[sched]>;
def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs RC:$dst),
(ins RC:$src1, RC:$src2, x86memop:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (VT (or (and (load addr:$src3), RC:$src1),
(X86andnp (load addr:$src3), RC:$src2))))]>,
- XOP_4V, VEX_W, Sched<[WriteShuffleLd, ReadAfterLd]>;
+ XOP_4V, VEX_W, Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>;
def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (VT (or (and RC:$src3, RC:$src1),
(X86andnp RC:$src3, (load addr:$src2)))))]>,
- XOP_4V, Sched<[WriteShuffleLd, ReadAfterLd]>;
+ XOP_4V, Sched<[sched.Folded, ReadAfterLd,
+ // x86memop:$src2
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault,
+ // RC::$src3
+ ReadAfterLd]>;
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
def rrr_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_4V, VEX_W, Sched<[WriteShuffle]>, FoldGenData<NAME#rrr>;
+ []>, XOP_4V, VEX_W, Sched<[sched]>, FoldGenData<NAME#rrr>;
}
let ExeDomain = SSEPackedInt in {
- defm VPCMOV : xop4op_int<0xA2, "vpcmov", VR128, i128mem, v2i64>;
- defm VPCMOVY : xop4op_int<0xA2, "vpcmov", VR256, i256mem, v4i64>, VEX_L;
+ defm VPCMOV : xop4op_int<0xA2, "vpcmov", VR128, i128mem, v2i64,
+ SchedWriteShuffle.XMM>;
+ defm VPCMOVY : xop4op_int<0xA2, "vpcmov", VR256, i256mem, v4i64,
+ SchedWriteShuffle.YMM>, VEX_L;
}
multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC,
X86MemOperand intmemop, X86MemOperand fpmemop,
- ValueType VT, PatFrag FPLdFrag,
- PatFrag IntLdFrag> {
+ ValueType VT, PatFrag FPLdFrag, PatFrag IntLdFrag,
+ X86FoldableSchedWrite sched> {
def rr : IXOP5<Opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[(set RC:$dst,
(VT (X86vpermil2 RC:$src1, RC:$src2, RC:$src3, (i8 imm:$src4))))]>,
- Sched<[WriteFShuffle]>;
+ Sched<[sched]>;
def rm : IXOP5<Opc, MRMSrcMemOp4, (outs RC:$dst),
(ins RC:$src1, RC:$src2, intmemop:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
@@ -353,7 +404,7 @@ multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC,
(VT (X86vpermil2 RC:$src1, RC:$src2,
(bitconvert (IntLdFrag addr:$src3)),
(i8 imm:$src4))))]>, VEX_W,
- Sched<[WriteFShuffleLd, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>;
def mr : IXOP5<Opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, fpmemop:$src2, RC:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
@@ -361,27 +412,35 @@ multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC,
[(set RC:$dst,
(VT (X86vpermil2 RC:$src1, (FPLdFrag addr:$src2),
RC:$src3, (i8 imm:$src4))))]>,
- Sched<[WriteFShuffleLd, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd,
+ // fpmemop:$src2
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ // RC:$src3
+ ReadAfterLd]>;
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
def rr_REV : IXOP5<Opc, MRMSrcRegOp4, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
- []>, VEX_W, Sched<[WriteFShuffle]>, FoldGenData<NAME#rr>;
+ []>, VEX_W, Sched<[sched]>, FoldGenData<NAME#rr>;
}
let ExeDomain = SSEPackedDouble in {
defm VPERMIL2PD : xop_vpermil2<0x49, "vpermil2pd", VR128, i128mem, f128mem,
- v2f64, loadv2f64, loadv2i64>;
+ v2f64, loadv2f64, loadv2i64,
+ SchedWriteFVarShuffle.XMM>;
defm VPERMIL2PDY : xop_vpermil2<0x49, "vpermil2pd", VR256, i256mem, f256mem,
- v4f64, loadv4f64, loadv4i64>, VEX_L;
+ v4f64, loadv4f64, loadv4i64,
+ SchedWriteFVarShuffle.YMM>, VEX_L;
}
let ExeDomain = SSEPackedSingle in {
defm VPERMIL2PS : xop_vpermil2<0x48, "vpermil2ps", VR128, i128mem, f128mem,
- v4f32, loadv4f32, loadv2i64>;
+ v4f32, loadv4f32, loadv2i64,
+ SchedWriteFVarShuffle.XMM>;
defm VPERMIL2PSY : xop_vpermil2<0x48, "vpermil2ps", VR256, i256mem, f256mem,
- v8f32, loadv8f32, loadv4i64>, VEX_L;
+ v8f32, loadv8f32, loadv4i64,
+ SchedWriteFVarShuffle.YMM>, VEX_L;
}
diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp
index 44bbc3f1b3fa..36d36cb11d72 100644
--- a/lib/Target/X86/X86InstructionSelector.cpp
+++ b/lib/Target/X86/X86InstructionSelector.cpp
@@ -81,8 +81,8 @@ private:
MachineFunction &MF) const;
bool selectConstant(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
- bool selectTrunc(MachineInstr &I, MachineRegisterInfo &MRI,
- MachineFunction &MF) const;
+ bool selectTruncOrPtrToInt(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
bool selectZext(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
bool selectAnyext(MachineInstr &I, MachineRegisterInfo &MRI,
@@ -104,9 +104,18 @@ private:
MachineFunction &MF) const;
bool selectCondBranch(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
+ bool selectTurnIntoCOPY(MachineInstr &I, MachineRegisterInfo &MRI,
+ const unsigned DstReg,
+ const TargetRegisterClass *DstRC,
+ const unsigned SrcReg,
+ const TargetRegisterClass *SrcRC) const;
bool materializeFP(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
bool selectImplicitDefOrPHI(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ bool selectShift(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectSDiv(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
// emit insert subreg instruction and insert it before MachineInstr &I
bool emitInsertSubreg(unsigned DstReg, unsigned SrcReg, MachineInstr &I,
@@ -287,8 +296,8 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I,
const TargetRegisterClass *OldRC = MRI.getRegClassOrNull(DstReg);
if (!OldRC || !DstRC->hasSubClassEq(OldRC)) {
if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
- DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
- << " operand\n");
+ LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+ << " operand\n");
return false;
}
}
@@ -324,7 +333,7 @@ bool X86InstructionSelector::select(MachineInstr &I,
if (selectImpl(I, CoverageInfo))
return true;
- DEBUG(dbgs() << " C++ instruction selection: "; I.print(dbgs()));
+ LLVM_DEBUG(dbgs() << " C++ instruction selection: "; I.print(dbgs()));
// TODO: This should be implemented by tblgen.
switch (I.getOpcode()) {
@@ -342,8 +351,11 @@ bool X86InstructionSelector::select(MachineInstr &I,
return selectConstant(I, MRI, MF);
case TargetOpcode::G_FCONSTANT:
return materializeFP(I, MRI, MF);
+ case TargetOpcode::G_PTRTOINT:
case TargetOpcode::G_TRUNC:
- return selectTrunc(I, MRI, MF);
+ return selectTruncOrPtrToInt(I, MRI, MF);
+ case TargetOpcode::G_INTTOPTR:
+ return selectCopy(I, MRI);
case TargetOpcode::G_ZEXT:
return selectZext(I, MRI, MF);
case TargetOpcode::G_ANYEXT:
@@ -365,6 +377,12 @@ bool X86InstructionSelector::select(MachineInstr &I,
case TargetOpcode::G_IMPLICIT_DEF:
case TargetOpcode::G_PHI:
return selectImplicitDefOrPHI(I, MRI);
+ case TargetOpcode::G_SHL:
+ case TargetOpcode::G_ASHR:
+ case TargetOpcode::G_LSHR:
+ return selectShift(I, MRI, MF);
+ case TargetOpcode::G_SDIV:
+ return selectSDiv(I, MRI, MF);
}
return false;
@@ -485,7 +503,7 @@ bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I,
auto &MemOp = **I.memoperands_begin();
if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) {
- DEBUG(dbgs() << "Atomic load/store not supported yet\n");
+ LLVM_DEBUG(dbgs() << "Atomic load/store not supported yet\n");
return false;
}
@@ -640,10 +658,37 @@ bool X86InstructionSelector::selectConstant(MachineInstr &I,
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
-bool X86InstructionSelector::selectTrunc(MachineInstr &I,
- MachineRegisterInfo &MRI,
- MachineFunction &MF) const {
- assert((I.getOpcode() == TargetOpcode::G_TRUNC) && "unexpected instruction");
+// Helper function for selectTruncOrPtrToInt and selectAnyext.
+// Returns true if DstRC is a floating-point register class and SrcRC is a
+// 128-bit vector register class.
+static bool canTurnIntoCOPY(const TargetRegisterClass *DstRC,
+ const TargetRegisterClass *SrcRC) {
+ return (DstRC == &X86::FR32RegClass || DstRC == &X86::FR32XRegClass ||
+ DstRC == &X86::FR64RegClass || DstRC == &X86::FR64XRegClass) &&
+ (SrcRC == &X86::VR128RegClass || SrcRC == &X86::VR128XRegClass);
+}
+
+bool X86InstructionSelector::selectTurnIntoCOPY(
+ MachineInstr &I, MachineRegisterInfo &MRI, const unsigned DstReg,
+ const TargetRegisterClass *DstRC, const unsigned SrcReg,
+ const TargetRegisterClass *SrcRC) const {
+
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
+ LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+ << " operand\n");
+ return false;
+ }
+ I.setDesc(TII.get(X86::COPY));
+ return true;
+}
+
+bool X86InstructionSelector::selectTruncOrPtrToInt(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ assert((I.getOpcode() == TargetOpcode::G_TRUNC ||
+ I.getOpcode() == TargetOpcode::G_PTRTOINT) &&
+ "unexpected instruction");
const unsigned DstReg = I.getOperand(0).getReg();
const unsigned SrcReg = I.getOperand(1).getReg();
@@ -655,19 +700,24 @@ bool X86InstructionSelector::selectTrunc(MachineInstr &I,
const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
if (DstRB.getID() != SrcRB.getID()) {
- DEBUG(dbgs() << "G_TRUNC input/output on different banks\n");
+ LLVM_DEBUG(dbgs() << TII.getName(I.getOpcode())
+ << " input/output on different banks\n");
return false;
}
- if (DstRB.getID() != X86::GPRRegBankID)
- return false;
-
const TargetRegisterClass *DstRC = getRegClass(DstTy, DstRB);
- if (!DstRC)
+ const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcRB);
+
+ if (!DstRC || !SrcRC)
return false;
- const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcRB);
- if (!SrcRC)
+  // If this is a truncation of a value that lives in a vector class and goes
+  // into a floating-point class, just replace it with a copy, as we are able
+  // to select it as a regular move.
+ if (canTurnIntoCOPY(DstRC, SrcRC))
+ return selectTurnIntoCOPY(I, MRI, DstReg, DstRC, SrcReg, SrcRC);
+
+ if (DstRB.getID() != X86::GPRRegBankID)
return false;
unsigned SubIdx;
@@ -688,7 +738,8 @@ bool X86InstructionSelector::selectTrunc(MachineInstr &I,
if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
- DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
+ LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+ << "\n");
return false;
}
@@ -709,6 +760,70 @@ bool X86InstructionSelector::selectZext(MachineInstr &I,
const LLT DstTy = MRI.getType(DstReg);
const LLT SrcTy = MRI.getType(SrcReg);
+ assert(!(SrcTy == LLT::scalar(8) && DstTy == LLT::scalar(32)) &&
+ "8=>32 Zext is handled by tablegen");
+ assert(!(SrcTy == LLT::scalar(16) && DstTy == LLT::scalar(32)) &&
+ "16=>32 Zext is handled by tablegen");
+
+ const static struct ZextEntry {
+ LLT SrcTy;
+ LLT DstTy;
+ unsigned MovOp;
+ bool NeedSubregToReg;
+ } OpTable[] = {
+ {LLT::scalar(8), LLT::scalar(16), X86::MOVZX16rr8, false}, // i8 => i16
+ {LLT::scalar(8), LLT::scalar(64), X86::MOVZX32rr8, true}, // i8 => i64
+ {LLT::scalar(16), LLT::scalar(64), X86::MOVZX32rr16, true}, // i16 => i64
+ {LLT::scalar(32), LLT::scalar(64), 0, true} // i32 => i64
+ };
+
+ auto ZextEntryIt =
+ std::find_if(std::begin(OpTable), std::end(OpTable),
+ [SrcTy, DstTy](const ZextEntry &El) {
+ return El.DstTy == DstTy && El.SrcTy == SrcTy;
+ });
+
+ // Here we try to select Zext into a MOVZ and/or SUBREG_TO_REG instruction.
+ if (ZextEntryIt != std::end(OpTable)) {
+ const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
+ const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
+ const TargetRegisterClass *DstRC = getRegClass(DstTy, DstRB);
+ const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcRB);
+
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
+ LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+ << " operand\n");
+ return false;
+ }
+
+ unsigned TransitRegTo = DstReg;
+ unsigned TransitRegFrom = SrcReg;
+ if (ZextEntryIt->MovOp) {
+ // If we select Zext into MOVZ + SUBREG_TO_REG, we need to have
+ // a transit register in between: create it here.
+ if (ZextEntryIt->NeedSubregToReg) {
+ TransitRegFrom = MRI.createVirtualRegister(
+ getRegClass(LLT::scalar(32), DstReg, MRI));
+ TransitRegTo = TransitRegFrom;
+ }
+
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(ZextEntryIt->MovOp))
+ .addDef(TransitRegTo)
+ .addReg(SrcReg);
+ }
+ if (ZextEntryIt->NeedSubregToReg) {
+ BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(TargetOpcode::SUBREG_TO_REG))
+ .addDef(DstReg)
+ .addImm(0)
+ .addReg(TransitRegFrom)
+ .addImm(X86::sub_32bit);
+ }
+ I.eraseFromParent();
+ return true;
+ }
+
if (SrcTy != LLT::scalar(1))
return false;
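The i8-to-i64 row of the table above relies on an x86-64 property worth spelling out: a 32-bit MOVZX already clears the upper half of the 64-bit register, so SUBREG_TO_REG merely asserts that the high bits are zero. A minimal scalar sketch of that row (plain C++, names invented for the illustration):

#include <cassert>
#include <cstdint>

static uint64_t zext8To64(uint8_t Src) {
  uint32_t Transit = Src;                // MOVZX32rr8 into the 32-bit transit register
  return static_cast<uint64_t>(Transit); // SUBREG_TO_REG: upper 32 bits already known zero
}

int main() { assert(zext8To64(0xFF) == 0xFF); }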
@@ -765,16 +880,22 @@ bool X86InstructionSelector::selectAnyext(MachineInstr &I,
assert(DstTy.getSizeInBits() > SrcTy.getSizeInBits() &&
"G_ANYEXT incorrect operand size");
- if (DstRB.getID() != X86::GPRRegBankID)
- return false;
-
const TargetRegisterClass *DstRC = getRegClass(DstTy, DstRB);
const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcRB);
+  // If this is an ANY_EXT of a value that lives in a floating-point class and
+  // goes into a vector class, just replace it with a copy, as we are able to
+  // select it as a regular move.
+ if (canTurnIntoCOPY(SrcRC, DstRC))
+ return selectTurnIntoCOPY(I, MRI, SrcReg, SrcRC, DstReg, DstRC);
+
+ if (DstRB.getID() != X86::GPRRegBankID)
+ return false;
+
if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
- DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
- << " operand\n");
+ LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+ << " operand\n");
return false;
}
@@ -990,7 +1111,7 @@ bool X86InstructionSelector::emitExtractSubreg(unsigned DstReg, unsigned SrcReg,
if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
- DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
+ LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
return false;
}
@@ -1027,7 +1148,7 @@ bool X86InstructionSelector::emitInsertSubreg(unsigned DstReg, unsigned SrcReg,
if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
- DEBUG(dbgs() << "Failed to constrain INSERT_SUBREG\n");
+ LLVM_DEBUG(dbgs() << "Failed to constrain INSERT_SUBREG\n");
return false;
}
@@ -1271,8 +1392,8 @@ bool X86InstructionSelector::selectImplicitDefOrPHI(
const TargetRegisterClass *RC = getRegClass(DstTy, DstReg, MRI);
if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) {
- DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
- << " operand\n");
+ LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+ << " operand\n");
return false;
}
}
@@ -1285,6 +1406,165 @@ bool X86InstructionSelector::selectImplicitDefOrPHI(
return true;
}
+// Currently GlobalISel TableGen generates patterns for shift-by-immediate and
+// shift-by-1, but only with an i8 shift count. In G_LSHR/G_ASHR/G_SHL, as in
+// LLVM IR, both operands have the same type, so for now only i8 shifts can use
+// the auto-generated TableGen patterns.
+bool X86InstructionSelector::selectShift(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+
+ assert((I.getOpcode() == TargetOpcode::G_SHL ||
+ I.getOpcode() == TargetOpcode::G_ASHR ||
+ I.getOpcode() == TargetOpcode::G_LSHR) &&
+ "unexpected instruction");
+
+ unsigned DstReg = I.getOperand(0).getReg();
+ const LLT DstTy = MRI.getType(DstReg);
+ const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
+
+ const static struct ShiftEntry {
+ unsigned SizeInBits;
+ unsigned CReg;
+ unsigned OpLSHR;
+ unsigned OpASHR;
+ unsigned OpSHL;
+ } OpTable[] = {
+ {8, X86::CL, X86::SHR8rCL, X86::SAR8rCL, X86::SHL8rCL}, // i8
+ {16, X86::CX, X86::SHR16rCL, X86::SAR16rCL, X86::SHL16rCL}, // i16
+ {32, X86::ECX, X86::SHR32rCL, X86::SAR32rCL, X86::SHL32rCL}, // i32
+ {64, X86::RCX, X86::SHR64rCL, X86::SAR64rCL, X86::SHL64rCL} // i64
+ };
+
+ if (DstRB.getID() != X86::GPRRegBankID)
+ return false;
+
+ auto ShiftEntryIt = std::find_if(
+ std::begin(OpTable), std::end(OpTable), [DstTy](const ShiftEntry &El) {
+ return El.SizeInBits == DstTy.getSizeInBits();
+ });
+ if (ShiftEntryIt == std::end(OpTable))
+ return false;
+
+ unsigned CReg = ShiftEntryIt->CReg;
+ unsigned Opcode = 0;
+ switch (I.getOpcode()) {
+ case TargetOpcode::G_SHL:
+ Opcode = ShiftEntryIt->OpSHL;
+ break;
+ case TargetOpcode::G_ASHR:
+ Opcode = ShiftEntryIt->OpASHR;
+ break;
+ case TargetOpcode::G_LSHR:
+ Opcode = ShiftEntryIt->OpLSHR;
+ break;
+ default:
+ return false;
+ }
+
+ unsigned Op0Reg = I.getOperand(1).getReg();
+ unsigned Op1Reg = I.getOperand(2).getReg();
+
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::COPY),
+ ShiftEntryIt->CReg)
+ .addReg(Op1Reg);
+
+ // The shift instruction uses X86::CL. If we defined a super-register
+ // of X86::CL, emit a subreg KILL to precisely describe what we're doing here.
+ if (CReg != X86::CL)
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::KILL),
+ X86::CL)
+ .addReg(CReg, RegState::Kill);
+
+ MachineInstr &ShiftInst =
+ *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opcode), DstReg)
+ .addReg(Op0Reg);
+
+ constrainSelectedInstRegOperands(ShiftInst, TII, TRI, RBI);
+ I.eraseFromParent();
+ return true;
+}
+
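The lookup shape used by selectShift (and by selectSDiv below) is a plain linear scan of a static table keyed by bit width. A compact, self-contained sketch with strings standing in for the real opcode enums (everything below is placeholder data, not LLVM API):

#include <algorithm>
#include <cassert>
#include <cstring>

struct ShiftRow { unsigned SizeInBits; const char *LShr, *AShr, *Shl; };
static const ShiftRow Rows[] = {
    {8, "SHR8rCL", "SAR8rCL", "SHL8rCL"},
    {16, "SHR16rCL", "SAR16rCL", "SHL16rCL"},
    {32, "SHR32rCL", "SAR32rCL", "SHL32rCL"},
    {64, "SHR64rCL", "SAR64rCL", "SHL64rCL"},
};

static const char *pickShift(unsigned Bits, char Kind) { // 'l' = lshr, 'a' = ashr, 's' = shl
  auto It = std::find_if(std::begin(Rows), std::end(Rows),
                         [Bits](const ShiftRow &R) { return R.SizeInBits == Bits; });
  if (It == std::end(Rows))
    return nullptr; // no row: bail out, as selectShift returns false
  return Kind == 'l' ? It->LShr : Kind == 'a' ? It->AShr : It->Shl;
}

int main() { assert(std::strcmp(pickShift(32, 'a'), "SAR32rCL") == 0); }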
+bool X86InstructionSelector::selectSDiv(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+
+ assert(I.getOpcode() == TargetOpcode::G_SDIV && "unexpected instruction");
+
+ const unsigned DstReg = I.getOperand(0).getReg();
+ const unsigned DividentReg = I.getOperand(1).getReg();
+ const unsigned DiviserReg = I.getOperand(2).getReg();
+
+ const LLT RegTy = MRI.getType(DstReg);
+ assert(RegTy == MRI.getType(DividentReg) &&
+ RegTy == MRI.getType(DiviserReg) &&
+ "Arguments and return value types must match");
+
+ const RegisterBank &RegRB = *RBI.getRegBank(DstReg, MRI, TRI);
+
+ // For the X86 IDIV instruction, in most cases the dividend
+ // (numerator) must be in a specific register pair highreg:lowreg,
+ // producing the quotient in lowreg and the remainder in highreg.
+ // For most data types, to set up the instruction, the dividend is
+ // copied into lowreg, and lowreg is sign-extended into highreg. The
+ // exception is i8, where the dividend is defined as a single register rather
+ // than a register pair, and we therefore directly sign-extend the dividend
+ // into lowreg, instead of copying, and ignore the highreg.
+ const static struct SDivEntry {
+ unsigned SizeInBits;
+ unsigned QuotientReg;
+ unsigned DividentRegUpper;
+ unsigned DividentRegLower;
+ unsigned OpSignExtend;
+ unsigned OpCopy;
+ unsigned OpDiv;
+ } OpTable[] = {
+ {8, X86::AL, X86::NoRegister, X86::AX, 0, X86::MOVSX16rr8,
+ X86::IDIV8r}, // i8
+ {16, X86::AX, X86::DX, X86::AX, X86::CWD, TargetOpcode::COPY,
+ X86::IDIV16r}, // i16
+ {32, X86::EAX, X86::EDX, X86::EAX, X86::CDQ, TargetOpcode::COPY,
+ X86::IDIV32r}, // i32
+ {64, X86::RAX, X86::RDX, X86::RAX, X86::CQO, TargetOpcode::COPY,
+ X86::IDIV64r} // i64
+ };
+
+ if (RegRB.getID() != X86::GPRRegBankID)
+ return false;
+
+ auto SDivEntryIt = std::find_if(
+ std::begin(OpTable), std::end(OpTable), [RegTy](const SDivEntry &El) {
+ return El.SizeInBits == RegTy.getSizeInBits();
+ });
+
+ if (SDivEntryIt == std::end(OpTable))
+ return false;
+
+ const TargetRegisterClass *RegRC = getRegClass(RegTy, RegRB);
+ if (!RBI.constrainGenericRegister(DividentReg, *RegRC, MRI) ||
+ !RBI.constrainGenericRegister(DiviserReg, *RegRC, MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *RegRC, MRI)) {
+ LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+ << " operand\n");
+ return false;
+ }
+
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SDivEntryIt->OpCopy),
+ SDivEntryIt->DividentRegLower)
+ .addReg(DividentReg);
+ if (SDivEntryIt->DividentRegUpper != X86::NoRegister)
+ BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(SDivEntryIt->OpSignExtend));
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SDivEntryIt->OpDiv))
+ .addReg(DiviserReg);
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::COPY),
+ DstReg)
+ .addReg(SDivEntryIt->QuotientReg);
+
+ I.eraseFromParent();
+ return true;
+}
+
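A scalar C++ model of the 32-bit row of the table (illustration only, helper name invented): CDQ sign-extends EAX into EDX, IDIV r/m32 divides EDX:EAX by the divisor, and the quotient that selectSDiv copies out is left in EAX. Like IDIV, C++ integer division truncates toward zero, so the model and the instruction agree on negative operands.

#include <cassert>
#include <cstdint>

static int32_t sdivLikeIdiv32(int32_t Dividend, int32_t Divisor) {
  int64_t Wide = static_cast<int64_t>(Dividend); // EDX:EAX after the CDQ sign-extension
  return static_cast<int32_t>(Wide / Divisor);   // IDIV32r leaves this quotient in EAX
}

int main() {
  assert(sdivLikeIdiv32(-7, 2) == -3); // truncation toward zero, matching IDIV
  assert(sdivLikeIdiv32(7, -2) == -3);
}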
InstructionSelector *
llvm::createX86InstructionSelector(const X86TargetMachine &TM,
X86Subtarget &Subtarget,
diff --git a/lib/Target/X86/X86InterleavedAccess.cpp b/lib/Target/X86/X86InterleavedAccess.cpp
index cdb24b9d40a6..6c7fb9c339ac 100644
--- a/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/lib/Target/X86/X86InterleavedAccess.cpp
@@ -19,7 +19,6 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
@@ -30,6 +29,7 @@
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/MachineValueType.h"
#include <algorithm>
#include <cassert>
#include <cmath>
@@ -39,7 +39,7 @@ using namespace llvm;
namespace {
-/// \brief This class holds necessary information to represent an interleaved
+/// This class holds necessary information to represent an interleaved
/// access group and supports utilities to lower the group into
/// X86-specific instructions/intrinsics.
/// E.g. A group of interleaving access loads (Factor = 2; accessing every
@@ -48,32 +48,32 @@ namespace {
/// %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
/// %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
class X86InterleavedAccessGroup {
- /// \brief Reference to the wide-load instruction of an interleaved access
+ /// Reference to the wide-load instruction of an interleaved access
/// group.
Instruction *const Inst;
- /// \brief Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
+ /// Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
ArrayRef<ShuffleVectorInst *> Shuffles;
- /// \brief Reference to the starting index of each user-shuffle.
+ /// Reference to the starting index of each user-shuffle.
ArrayRef<unsigned> Indices;
- /// \brief Reference to the interleaving stride in terms of elements.
+ /// Reference to the interleaving stride in terms of elements.
const unsigned Factor;
- /// \brief Reference to the underlying target.
+ /// Reference to the underlying target.
const X86Subtarget &Subtarget;
const DataLayout &DL;
IRBuilder<> &Builder;
- /// \brief Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
+ /// Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
/// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
void decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
SmallVectorImpl<Instruction *> &DecomposedVectors);
- /// \brief Performs matrix transposition on a 4x4 matrix \p InputVectors and
+ /// Performs matrix transposition on a 4x4 matrix \p InputVectors and
/// returns the transposed-vectors in \p TransposedVectors.
/// E.g.
/// InputVectors:
@@ -115,11 +115,11 @@ public:
: Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
DL(Inst->getModule()->getDataLayout()), Builder(B) {}
- /// \brief Returns true if this interleaved access group can be lowered into
+ /// Returns true if this interleaved access group can be lowered into
/// x86-specific instructions/intrinsics, false otherwise.
bool isSupported() const;
- /// \brief Lowers this interleaved access group into X86-specific
+ /// Lowers this interleaved access group into X86-specific
/// instructions/intrinsics.
bool lowerIntoOptimizedSequence();
};
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 0782d5598746..2dd60a1b8b5a 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -20,24 +20,21 @@
namespace llvm {
enum IntrinsicType : uint16_t {
- INTR_NO_TYPE,
GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, XGETBV, ADX, FPCLASS, FPCLASSS,
INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP,
+ INTR_TYPE_2OP_IMM8, INTR_TYPE_3OP_IMM8,
CMP_MASK, CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM,
CVTPD2PS, CVTPD2PS_MASK,
INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM,
- INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, INTR_TYPE_2OP_IMM8_MASK,
- INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_MASK_RM, INTR_TYPE_3OP_IMM8_MASK,
- FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3,
- FMA_OP_SCALAR_MASK, FMA_OP_SCALAR_MASKZ, FMA_OP_SCALAR_MASK3,
- IFMA_OP_MASK, IFMA_OP_MASKZ,
- VPERM_2OP_MASK, VPERM_3OP_MASK, VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK,
+ INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM,
+ INTR_TYPE_3OP_MASK,
+ FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_SCALAR,
+ IFMA_OP, VPERM_2OP, INTR_TYPE_SCALAR_MASK,
INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK,
- COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM,
+ COMPRESS_EXPAND_IN_REG,
TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
- EXPAND_FROM_MEM,
- TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
- FIXUPIMMS_MASKZ, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP,
+ FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
+ FIXUPIMMS_MASKZ, GATHER_AVX2,
ROUNDP, ROUNDS
};
@@ -54,6 +51,9 @@ struct IntrinsicData {
bool operator==(const IntrinsicData &RHS) const {
return RHS.Id == Id;
}
+ friend bool operator<(const IntrinsicData &LHS, unsigned Id) {
+ return LHS.Id < Id;
+ }
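+  // This heterogeneous comparator lets std::lower_bound search the Id-sorted
+  // intrinsic tables directly by intrinsic number, without constructing a
+  // temporary IntrinsicData key (see getIntrinsicWithChain below).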
};
#define X86_INTRINSIC_DATA(id, type, op0, op1) \
@@ -120,78 +120,6 @@ static const IntrinsicData IntrinsicsWithChain[] = {
X86_INTRINSIC_DATA(avx512_gatherpf_qps_512, PREFETCH,
X86::VGATHERPF0QPSm, X86::VGATHERPF1QPSm),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_b_128,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_b_256,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_b_512,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_d_128,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_d_256,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_d_512,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_128,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_256,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_512,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_128,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_256,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_512,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_q_128,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_q_256,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_q_512,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_w_128,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_w_256,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_w_512,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_b_128,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_b_256,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_b_512,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_d_128,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_d_256,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_d_512,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_128,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_256,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_512,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_128,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_256,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_512,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_q_128,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_q_256,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_q_512,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_w_128,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_w_256,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_w_512,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8,
X86ISD::VTRUNC, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_256, TRUNCATE_TO_MEM_VI8,
@@ -352,13 +280,11 @@ static const IntrinsicData IntrinsicsWithChain[] = {
/*
* Find Intrinsic data by intrinsic ID
*/
-static const IntrinsicData* getIntrinsicWithChain(uint16_t IntNo) {
-
- IntrinsicData IntrinsicToFind = {IntNo, INTR_NO_TYPE, 0, 0 };
+static const IntrinsicData* getIntrinsicWithChain(unsigned IntNo) {
const IntrinsicData *Data = std::lower_bound(std::begin(IntrinsicsWithChain),
std::end(IntrinsicsWithChain),
- IntrinsicToFind);
- if (Data != std::end(IntrinsicsWithChain) && *Data == IntrinsicToFind)
+ IntNo);
+ if (Data != std::end(IntrinsicsWithChain) && Data->Id == IntNo)
return Data;
return nullptr;
}
@@ -374,9 +300,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx_cmp_ps_256, INTR_TYPE_3OP, X86ISD::CMPP, 0),
X86_INTRINSIC_DATA(avx_cvt_pd2_ps_256,CVTPD2PS, ISD::FP_ROUND, 0),
X86_INTRINSIC_DATA(avx_cvt_pd2dq_256, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
- X86_INTRINSIC_DATA(avx_cvtdq2_ps_256, INTR_TYPE_1OP, ISD::SINT_TO_FP, 0),
- X86_INTRINSIC_DATA(avx_cvtt_pd2dq_256,INTR_TYPE_1OP, ISD::FP_TO_SINT, 0),
- X86_INTRINSIC_DATA(avx_cvtt_ps2dq_256,INTR_TYPE_1OP, ISD::FP_TO_SINT, 0),
+ X86_INTRINSIC_DATA(avx_cvt_ps2dq_256, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
+ X86_INTRINSIC_DATA(avx_cvtt_pd2dq_256,INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0),
+ X86_INTRINSIC_DATA(avx_cvtt_ps2dq_256,INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0),
X86_INTRINSIC_DATA(avx_hadd_pd_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),
X86_INTRINSIC_DATA(avx_hadd_ps_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),
X86_INTRINSIC_DATA(avx_hsub_pd_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
@@ -391,8 +317,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx_round_pd_256, ROUNDP, X86ISD::VRNDSCALE, 0),
X86_INTRINSIC_DATA(avx_round_ps_256, ROUNDP, X86ISD::VRNDSCALE, 0),
X86_INTRINSIC_DATA(avx_rsqrt_ps_256, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
- X86_INTRINSIC_DATA(avx_sqrt_pd_256, INTR_TYPE_1OP, ISD::FSQRT, 0),
- X86_INTRINSIC_DATA(avx_sqrt_ps_256, INTR_TYPE_1OP, ISD::FSQRT, 0),
X86_INTRINSIC_DATA(avx_vpermilvar_pd, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
X86_INTRINSIC_DATA(avx_vpermilvar_pd_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
X86_INTRINSIC_DATA(avx_vpermilvar_ps, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
@@ -405,6 +329,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_padds_w, INTR_TYPE_2OP, X86ISD::ADDS, 0),
X86_INTRINSIC_DATA(avx2_paddus_b, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
X86_INTRINSIC_DATA(avx2_paddus_w, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
+ X86_INTRINSIC_DATA(avx2_permd, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx2_permps, VPERM_2OP, X86ISD::VPERMV, 0),
X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0),
@@ -412,11 +338,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_pmadd_ub_sw, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0),
X86_INTRINSIC_DATA(avx2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
X86_INTRINSIC_DATA(avx2_pmovmskb, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
- X86_INTRINSIC_DATA(avx2_pmul_dq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
X86_INTRINSIC_DATA(avx2_pmul_hr_sw, INTR_TYPE_2OP, X86ISD::MULHRS, 0),
X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
- X86_INTRINSIC_DATA(avx2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
X86_INTRINSIC_DATA(avx2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
X86_INTRINSIC_DATA(avx2_pshuf_b, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
X86_INTRINSIC_DATA(avx2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0),
@@ -449,15 +373,14 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0),
X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
- X86_INTRINSIC_DATA(avx512_cvtb2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
- X86_INTRINSIC_DATA(avx512_cvtb2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
- X86_INTRINSIC_DATA(avx512_cvtb2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
- X86_INTRINSIC_DATA(avx512_cvtd2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
- X86_INTRINSIC_DATA(avx512_cvtd2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
- X86_INTRINSIC_DATA(avx512_cvtd2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
- X86_INTRINSIC_DATA(avx512_cvtq2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
- X86_INTRINSIC_DATA(avx512_cvtq2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
- X86_INTRINSIC_DATA(avx512_cvtq2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_add_pd_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND),
+ X86_INTRINSIC_DATA(avx512_add_ps_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND),
+ X86_INTRINSIC_DATA(avx512_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM, X86ISD::CMPM_RND),
+ X86_INTRINSIC_DATA(avx512_cmp_ps_128, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPM, X86ISD::CMPM_RND),
X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
X86_INTRINSIC_DATA(avx512_cvtsi2ss32, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
X86_INTRINSIC_DATA(avx512_cvtsi2ss64, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
@@ -472,30 +395,23 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_cvtusi2ss, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0),
X86_INTRINSIC_DATA(avx512_cvtusi642sd, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0),
X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0),
- X86_INTRINSIC_DATA(avx512_cvtw2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
- X86_INTRINSIC_DATA(avx512_cvtw2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
- X86_INTRINSIC_DATA(avx512_cvtw2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_dbpsadbw_128, INTR_TYPE_3OP_IMM8, X86ISD::DBPSADBW, 0),
+ X86_INTRINSIC_DATA(avx512_dbpsadbw_256, INTR_TYPE_3OP_IMM8, X86ISD::DBPSADBW, 0),
+ X86_INTRINSIC_DATA(avx512_dbpsadbw_512, INTR_TYPE_3OP_IMM8, X86ISD::DBPSADBW, 0),
+ X86_INTRINSIC_DATA(avx512_div_pd_512, INTR_TYPE_2OP, ISD::FDIV, X86ISD::FDIV_RND),
+ X86_INTRINSIC_DATA(avx512_div_ps_512, INTR_TYPE_2OP, ISD::FDIV, X86ISD::FDIV_RND),
X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
- X86_INTRINSIC_DATA(avx512_kand_w, MASK_BINOP, ISD::AND, 0),
- X86_INTRINSIC_DATA(avx512_kor_w, MASK_BINOP, ISD::OR, 0),
- X86_INTRINSIC_DATA(avx512_kxor_w, MASK_BINOP, ISD::XOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD,
- X86ISD::FADD_RND),
- X86_INTRINSIC_DATA(avx512_mask_add_ps_512, INTR_TYPE_2OP_MASK, ISD::FADD,
- X86ISD::FADD_RND),
+ X86_INTRINSIC_DATA(avx512_fpclass_pd_128, FPCLASS, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_fpclass_pd_256, FPCLASS, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_fpclass_pd_512, FPCLASS, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_fpclass_ps_128, FPCLASS, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_fpclass_ps_256, FPCLASS, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_fpclass_ps_512, FPCLASS, X86ISD::VFPCLASS, 0),
X86_INTRINSIC_DATA(avx512_mask_add_sd_round, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::FADDS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::FADDS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM,
- X86ISD::CMPM_RND),
- X86_INTRINSIC_DATA(avx512_mask_cmp_ps_128, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPM,
- X86ISD::CMPM_RND),
X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC,
X86ISD::FSETCCM, X86ISD::FSETCCM_RND),
X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC,
@@ -549,22 +465,14 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::CONFLICT, 0),
X86_INTRINSIC_DATA(avx512_mask_conflict_q_512, INTR_TYPE_1OP_MASK,
X86ISD::CONFLICT, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_128, INTR_TYPE_1OP_MASK,
- ISD::SINT_TO_FP, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_256, INTR_TYPE_1OP_MASK,
- ISD::SINT_TO_FP, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_512, INTR_TYPE_1OP_MASK,
ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND), //er
X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_128, INTR_TYPE_1OP_MASK,
X86ISD::CVTP2SI, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_256, INTR_TYPE_1OP_MASK,
- X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_512, INTR_TYPE_1OP_MASK,
X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps, INTR_TYPE_1OP_MASK,
X86ISD::VFPROUND, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_256, CVTPD2PS_MASK,
- ISD::FP_ROUND, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_512, CVTPD2PS_MASK,
ISD::FP_ROUND, X86ISD::VFPROUND_RND),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_128, INTR_TYPE_1OP_MASK,
@@ -591,10 +499,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_512, INTR_TYPE_1OP_MASK,
X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
- X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_128, INTR_TYPE_1OP_MASK,
- X86ISD::VFPEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_256, INTR_TYPE_1OP_MASK,
- ISD::FP_EXTEND, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_512, INTR_TYPE_1OP_MASK,
ISD::FP_EXTEND, X86ISD::VFPEXT_RND),
X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_128, INTR_TYPE_1OP_MASK,
@@ -615,10 +519,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::CVTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_512, INTR_TYPE_1OP_MASK,
X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND),
- X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_128, INTR_TYPE_1OP_MASK,
- ISD::SINT_TO_FP, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_256, INTR_TYPE_1OP_MASK,
- ISD::SINT_TO_FP, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_512, INTR_TYPE_1OP_MASK,
ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND),
X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_128, INTR_TYPE_1OP_MASK,
@@ -633,62 +533,48 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VFPEXTS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_128, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2SI, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_256, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_SINT, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_512, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_SINT, X86ISD::CVTTP2SI_RND),
+ X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_128, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_SINT, 0),
+ X86ISD::CVTTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_256, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_SINT, 0),
+ X86ISD::CVTTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_512, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_SINT, X86ISD::CVTTP2SI_RND),
+ X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_128, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_256, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_UINT, 0),
+ X86ISD::CVTTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_512, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_UINT, X86ISD::CVTTP2UI_RND),
+ X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_128, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_UINT, 0),
+ X86ISD::CVTTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_256, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_UINT, 0),
+ X86ISD::CVTTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_512, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_UINT, X86ISD::CVTTP2UI_RND),
- X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_128, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_SINT, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_256, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_SINT, 0),
+ X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_512, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_SINT, X86ISD::CVTTP2SI_RND),
+ X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_128, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_256, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_SINT, 0),
+ X86ISD::CVTTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_512, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_SINT, X86ISD::CVTTP2SI_RND),
+ X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_128, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_UINT, 0),
+ X86ISD::CVTTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_256, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_UINT, 0),
+ X86ISD::CVTTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_512, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_UINT, X86ISD::CVTTP2UI_RND),
+ X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_128, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_256, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_UINT, 0),
+ X86ISD::CVTTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_512, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_UINT, X86ISD::CVTTP2UI_RND),
- X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_128, INTR_TYPE_1OP_MASK,
- ISD::UINT_TO_FP, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_256, INTR_TYPE_1OP_MASK,
- ISD::UINT_TO_FP, 0),
+ X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_512, INTR_TYPE_1OP_MASK,
ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND),
- X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_128, INTR_TYPE_1OP_MASK,
- ISD::UINT_TO_FP, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_256, INTR_TYPE_1OP_MASK,
- ISD::UINT_TO_FP, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_512, INTR_TYPE_1OP_MASK,
ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND),
X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_128, INTR_TYPE_1OP_MASK,
@@ -697,16 +583,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
ISD::UINT_TO_FP, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_512, INTR_TYPE_1OP_MASK,
ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND),
- X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_128, INTR_TYPE_3OP_IMM8_MASK,
- X86ISD::DBPSADBW, 0),
- X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_256, INTR_TYPE_3OP_IMM8_MASK,
- X86ISD::DBPSADBW, 0),
- X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_512, INTR_TYPE_3OP_IMM8_MASK,
- X86ISD::DBPSADBW, 0),
- X86_INTRINSIC_DATA(avx512_mask_div_pd_512, INTR_TYPE_2OP_MASK, ISD::FDIV,
- X86ISD::FDIV_RND),
- X86_INTRINSIC_DATA(avx512_mask_div_ps_512, INTR_TYPE_2OP_MASK, ISD::FDIV,
- X86ISD::FDIV_RND),
X86_INTRINSIC_DATA(avx512_mask_div_sd_round, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::FDIVS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_div_ss_round, INTR_TYPE_SCALAR_MASK_RM,
@@ -755,12 +631,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_512, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
X86_INTRINSIC_DATA(avx512_mask_fixupimm_sd, FIXUPIMMS, X86ISD::VFIXUPIMMS, 0),
X86_INTRINSIC_DATA(avx512_mask_fixupimm_ss, FIXUPIMMS, X86ISD::VFIXUPIMMS, 0),
- X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_128, FPCLASS, X86ISD::VFPCLASS, 0),
- X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_256, FPCLASS, X86ISD::VFPCLASS, 0),
- X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_512, FPCLASS, X86ISD::VFPCLASS, 0),
- X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_128, FPCLASS, X86ISD::VFPCLASS, 0),
- X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_256, FPCLASS, X86ISD::VFPCLASS, 0),
- X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_512, FPCLASS, X86ISD::VFPCLASS, 0),
X86_INTRINSIC_DATA(avx512_mask_fpclass_sd, FPCLASSS, X86ISD::VFPCLASSS, 0),
X86_INTRINSIC_DATA(avx512_mask_fpclass_ss, FPCLASSS, X86ISD::VFPCLASSS, 0),
X86_INTRINSIC_DATA(avx512_mask_getexp_pd_128, INTR_TYPE_1OP_MASK_RM,
@@ -795,26 +665,14 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VGETMANTS, X86ISD::VGETMANTS_RND),
X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK,
X86ISD::VGETMANTS, X86ISD::VGETMANTS_RND),
- X86_INTRINSIC_DATA(avx512_mask_max_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX,
- X86ISD::FMAX_RND),
- X86_INTRINSIC_DATA(avx512_mask_max_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX,
- X86ISD::FMAX_RND),
X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK,
X86ISD::FMAXS, X86ISD::FMAXS_RND),
X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK,
X86ISD::FMAXS, X86ISD::FMAXS_RND),
- X86_INTRINSIC_DATA(avx512_mask_min_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN,
- X86ISD::FMIN_RND),
- X86_INTRINSIC_DATA(avx512_mask_min_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN,
- X86ISD::FMIN_RND),
X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK,
X86ISD::FMINS, X86ISD::FMINS_RND),
X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK,
X86ISD::FMINS, X86ISD::FMINS_RND),
- X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
- X86ISD::FMUL_RND),
- X86_INTRINSIC_DATA(avx512_mask_mul_ps_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
- X86ISD::FMUL_RND),
X86_INTRINSIC_DATA(avx512_mask_mul_sd_round, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::FMULS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK_RM,
@@ -831,58 +689,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_paddus_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
X86_INTRINSIC_DATA(avx512_mask_paddus_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
X86_INTRINSIC_DATA(avx512_mask_paddus_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
- X86_INTRINSIC_DATA(avx512_mask_permvar_df_256, VPERM_2OP_MASK,
- X86ISD::VPERMV, 0),
- X86_INTRINSIC_DATA(avx512_mask_permvar_df_512, VPERM_2OP_MASK,
- X86ISD::VPERMV, 0),
- X86_INTRINSIC_DATA(avx512_mask_permvar_di_256, VPERM_2OP_MASK,
- X86ISD::VPERMV, 0),
- X86_INTRINSIC_DATA(avx512_mask_permvar_di_512, VPERM_2OP_MASK,
- X86ISD::VPERMV, 0),
- X86_INTRINSIC_DATA(avx512_mask_permvar_hi_128, VPERM_2OP_MASK,
- X86ISD::VPERMV, 0),
- X86_INTRINSIC_DATA(avx512_mask_permvar_hi_256, VPERM_2OP_MASK,
- X86ISD::VPERMV, 0),
- X86_INTRINSIC_DATA(avx512_mask_permvar_hi_512, VPERM_2OP_MASK,
- X86ISD::VPERMV, 0),
- X86_INTRINSIC_DATA(avx512_mask_permvar_qi_128, VPERM_2OP_MASK,
- X86ISD::VPERMV, 0),
- X86_INTRINSIC_DATA(avx512_mask_permvar_qi_256, VPERM_2OP_MASK,
- X86ISD::VPERMV, 0),
- X86_INTRINSIC_DATA(avx512_mask_permvar_qi_512, VPERM_2OP_MASK,
- X86ISD::VPERMV, 0),
- X86_INTRINSIC_DATA(avx512_mask_permvar_sf_256, VPERM_2OP_MASK,
- X86ISD::VPERMV, 0),
- X86_INTRINSIC_DATA(avx512_mask_permvar_sf_512, VPERM_2OP_MASK,
- X86ISD::VPERMV, 0),
- X86_INTRINSIC_DATA(avx512_mask_permvar_si_256, VPERM_2OP_MASK,
- X86ISD::VPERMV, 0),
- X86_INTRINSIC_DATA(avx512_mask_permvar_si_512, VPERM_2OP_MASK,
- X86ISD::VPERMV, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_128, INTR_TYPE_2OP_MASK,
- X86ISD::VPMADDUBSW, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_256, INTR_TYPE_2OP_MASK,
- X86ISD::VPMADDUBSW, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_512, INTR_TYPE_2OP_MASK,
- X86ISD::VPMADDUBSW, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_128, INTR_TYPE_2OP_MASK,
- X86ISD::VPMADDWD, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_256, INTR_TYPE_2OP_MASK,
- X86ISD::VPMADDWD, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_512, INTR_TYPE_2OP_MASK,
- X86ISD::VPMADDWD, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_db_128, INTR_TYPE_1OP_MASK,
X86ISD::VTRUNC, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_db_256, INTR_TYPE_1OP_MASK,
X86ISD::VTRUNC, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_db_512, INTR_TYPE_1OP_MASK,
- X86ISD::VTRUNC, 0),
+ ISD::TRUNCATE, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_dw_128, INTR_TYPE_1OP_MASK,
X86ISD::VTRUNC, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_dw_256, INTR_TYPE_1OP_MASK,
- X86ISD::VTRUNC, 0),
+ ISD::TRUNCATE, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_dw_512, INTR_TYPE_1OP_MASK,
- X86ISD::VTRUNC, 0),
+ ISD::TRUNCATE, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_qb_128, INTR_TYPE_1OP_MASK,
X86ISD::VTRUNC, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_qb_256, INTR_TYPE_1OP_MASK,
@@ -892,21 +710,21 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_pmov_qd_128, INTR_TYPE_1OP_MASK,
X86ISD::VTRUNC, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_qd_256, INTR_TYPE_1OP_MASK,
- X86ISD::VTRUNC, 0),
+ ISD::TRUNCATE, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_qd_512, INTR_TYPE_1OP_MASK,
- X86ISD::VTRUNC, 0),
+ ISD::TRUNCATE, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_qw_128, INTR_TYPE_1OP_MASK,
X86ISD::VTRUNC, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_qw_256, INTR_TYPE_1OP_MASK,
X86ISD::VTRUNC, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_qw_512, INTR_TYPE_1OP_MASK,
- X86ISD::VTRUNC, 0),
+ ISD::TRUNCATE, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_wb_128, INTR_TYPE_1OP_MASK,
X86ISD::VTRUNC, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_wb_256, INTR_TYPE_1OP_MASK,
- X86ISD::VTRUNC, 0),
+ ISD::TRUNCATE, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_wb_512, INTR_TYPE_1OP_MASK,
- X86ISD::VTRUNC, 0),
+ ISD::TRUNCATE, 0),
X86_INTRINSIC_DATA(avx512_mask_pmovs_db_128, INTR_TYPE_1OP_MASK,
X86ISD::VTRUNCS, 0),
X86_INTRINSIC_DATA(avx512_mask_pmovs_db_256, INTR_TYPE_1OP_MASK,
@@ -979,45 +797,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VTRUNCUS, 0),
X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_512, INTR_TYPE_1OP_MASK,
X86ISD::VTRUNCUS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_128, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_256, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_512, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmulh_w_128, INTR_TYPE_2OP_MASK, ISD::MULHS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmulh_w_256, INTR_TYPE_2OP_MASK, ISD::MULHS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmulh_w_512, INTR_TYPE_2OP_MASK, ISD::MULHS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_128, INTR_TYPE_2OP_MASK, ISD::MULHU, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_256, INTR_TYPE_2OP_MASK, ISD::MULHU, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_512, INTR_TYPE_2OP_MASK, ISD::MULHU, 0),
X86_INTRINSIC_DATA(avx512_mask_pmultishift_qb_128, INTR_TYPE_2OP_MASK,
X86ISD::MULTISHIFT, 0),
X86_INTRINSIC_DATA(avx512_mask_pmultishift_qb_256, INTR_TYPE_2OP_MASK,
X86ISD::MULTISHIFT, 0),
X86_INTRINSIC_DATA(avx512_mask_pmultishift_qb_512, INTR_TYPE_2OP_MASK,
X86ISD::MULTISHIFT, 0),
- X86_INTRINSIC_DATA(avx512_mask_prol_d_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_prol_d_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_prol_d_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_prol_q_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_prol_q_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_prol_q_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_prolv_d_128, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
- X86_INTRINSIC_DATA(avx512_mask_prolv_d_256, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
- X86_INTRINSIC_DATA(avx512_mask_prolv_d_512, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
- X86_INTRINSIC_DATA(avx512_mask_prolv_q_128, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
- X86_INTRINSIC_DATA(avx512_mask_prolv_q_256, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
- X86_INTRINSIC_DATA(avx512_mask_prolv_q_512, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
- X86_INTRINSIC_DATA(avx512_mask_pror_d_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
- X86_INTRINSIC_DATA(avx512_mask_pror_d_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
- X86_INTRINSIC_DATA(avx512_mask_pror_d_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
- X86_INTRINSIC_DATA(avx512_mask_pror_q_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
- X86_INTRINSIC_DATA(avx512_mask_pror_q_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
- X86_INTRINSIC_DATA(avx512_mask_pror_q_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
- X86_INTRINSIC_DATA(avx512_mask_prorv_d_128, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
- X86_INTRINSIC_DATA(avx512_mask_prorv_d_256, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
- X86_INTRINSIC_DATA(avx512_mask_prorv_d_512, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
- X86_INTRINSIC_DATA(avx512_mask_prorv_q_128, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
- X86_INTRINSIC_DATA(avx512_mask_prorv_q_256, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
- X86_INTRINSIC_DATA(avx512_mask_prorv_q_512, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
X86_INTRINSIC_DATA(avx512_mask_psubs_b_128, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
X86_INTRINSIC_DATA(avx512_mask_psubs_b_256, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
X86_INTRINSIC_DATA(avx512_mask_psubs_b_512, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
@@ -1030,18 +815,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_psubus_w_128, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
X86_INTRINSIC_DATA(avx512_mask_psubus_w_256, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
X86_INTRINSIC_DATA(avx512_mask_psubus_w_512, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pternlog_d_128, TERLOG_OP_MASK,
- X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_mask_pternlog_d_256, TERLOG_OP_MASK,
- X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_mask_pternlog_d_512, TERLOG_OP_MASK,
- X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_mask_pternlog_q_128, TERLOG_OP_MASK,
- X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_mask_pternlog_q_256, TERLOG_OP_MASK,
- X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_mask_pternlog_q_512, TERLOG_OP_MASK,
- X86ISD::VPTERNLOG, 0),
X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0),
X86_INTRINSIC_DATA(avx512_mask_range_pd_256, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0),
X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, X86ISD::VRANGE_RND),
@@ -1084,22 +857,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::SCALEFS, 0),
X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::SCALEFS, 0),
- X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
- X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
- X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_512, INTR_TYPE_1OP_MASK, ISD::FSQRT,
- X86ISD::FSQRT_RND),
- X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
- X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
- X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_512, INTR_TYPE_1OP_MASK, ISD::FSQRT,
- X86ISD::FSQRT_RND),
X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::FSQRTS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_sqrt_ss, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::FSQRTS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_sub_pd_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
- X86ISD::FSUB_RND),
- X86_INTRINSIC_DATA(avx512_mask_sub_ps_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
- X86ISD::FSUB_RND),
X86_INTRINSIC_DATA(avx512_mask_sub_sd_round, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::FSUBS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM,
@@ -1116,151 +877,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::CVTPS2PH, 0),
X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_512, INTR_TYPE_2OP_MASK,
X86ISD::CVTPS2PH, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_128, FMA_OP_MASK, ISD::FMA, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_256, FMA_OP_MASK, ISD::FMA, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_512, FMA_OP_MASK, ISD::FMA,
- X86ISD::FMADD_RND),
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_128, FMA_OP_MASK, ISD::FMA, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_256, FMA_OP_MASK, ISD::FMA, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_512, FMA_OP_MASK, ISD::FMA,
- X86ISD::FMADD_RND),
-
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
- X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_512, FMA_OP_MASK, X86ISD::FMADDSUB,
- X86ISD::FMADDSUB_RND),
- X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_ps_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_ps_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_ps_512, FMA_OP_MASK, X86ISD::FMADDSUB,
- X86ISD::FMADDSUB_RND),
-
- X86_INTRINSIC_DATA(avx512_mask_vfnmadd_pd_128, FMA_OP_MASK, X86ISD::FNMADD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfnmadd_pd_256, FMA_OP_MASK, X86ISD::FNMADD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfnmadd_pd_512, FMA_OP_MASK, X86ISD::FNMADD,
- X86ISD::FNMADD_RND),
- X86_INTRINSIC_DATA(avx512_mask_vfnmadd_ps_128, FMA_OP_MASK, X86ISD::FNMADD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfnmadd_ps_256, FMA_OP_MASK, X86ISD::FNMADD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfnmadd_ps_512, FMA_OP_MASK, X86ISD::FNMADD,
- X86ISD::FNMADD_RND),
- X86_INTRINSIC_DATA(avx512_mask_vfnmsub_pd_128, FMA_OP_MASK, X86ISD::FNMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfnmsub_pd_256, FMA_OP_MASK, X86ISD::FNMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfnmsub_pd_512, FMA_OP_MASK, X86ISD::FNMSUB,
- X86ISD::FNMSUB_RND),
- X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_128, FMA_OP_MASK, X86ISD::FNMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_256, FMA_OP_MASK, X86ISD::FNMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_512, FMA_OP_MASK, X86ISD::FNMSUB,
- X86ISD::FNMSUB_RND),
-
- X86_INTRINSIC_DATA(avx512_mask_vpdpbusd_128, FMA_OP_MASK, X86ISD::VPDPBUSD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpdpbusd_256, FMA_OP_MASK, X86ISD::VPDPBUSD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpdpbusd_512, FMA_OP_MASK, X86ISD::VPDPBUSD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpdpbusds_128, FMA_OP_MASK, X86ISD::VPDPBUSDS, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpdpbusds_256, FMA_OP_MASK, X86ISD::VPDPBUSDS, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpdpbusds_512, FMA_OP_MASK, X86ISD::VPDPBUSDS, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpdpwssd_128, FMA_OP_MASK, X86ISD::VPDPWSSD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpdpwssd_256, FMA_OP_MASK, X86ISD::VPDPWSSD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpdpwssd_512, FMA_OP_MASK, X86ISD::VPDPWSSD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpdpwssds_128, FMA_OP_MASK, X86ISD::VPDPWSSDS, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpdpwssds_256, FMA_OP_MASK, X86ISD::VPDPWSSDS, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpdpwssds_512, FMA_OP_MASK, X86ISD::VPDPWSSDS, 0),
-
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_128, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_256, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_512, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_hi_128, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_hi_256, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_hi_512, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_pd_128, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_pd_256, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_pd_512, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_ps_128, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_ps_256, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_ps_512, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_128, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_256, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_512, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_qi_128, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_qi_256, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_qi_512, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_128, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_256, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_512, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_128, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_256, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_512, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_128, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_256, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_512, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_128, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_256, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_512, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_128, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_256, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_512, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_qi_128, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_qi_256, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_qi_512, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_128 , IFMA_OP_MASK,
- X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_256 , IFMA_OP_MASK,
- X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_512 , IFMA_OP_MASK,
- X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_128 , IFMA_OP_MASK,
- X86ISD::VPMADD52L, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_256 , IFMA_OP_MASK,
- X86ISD::VPMADD52L, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_512 , IFMA_OP_MASK,
- X86ISD::VPMADD52L, 0),
-
- X86_INTRINSIC_DATA(avx512_mask_vpshld_d_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshld_d_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshld_d_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshld_q_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshld_q_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshld_q_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshld_w_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshld_w_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshld_w_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
X86_INTRINSIC_DATA(avx512_mask_vpshldv_d_128, FMA_OP_MASK, X86ISD::VSHLDV, 0),
X86_INTRINSIC_DATA(avx512_mask_vpshldv_d_256, FMA_OP_MASK, X86ISD::VSHLDV, 0),
X86_INTRINSIC_DATA(avx512_mask_vpshldv_d_512, FMA_OP_MASK, X86ISD::VSHLDV, 0),
@@ -1270,15 +887,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_vpshldv_w_128, FMA_OP_MASK, X86ISD::VSHLDV, 0),
X86_INTRINSIC_DATA(avx512_mask_vpshldv_w_256, FMA_OP_MASK, X86ISD::VSHLDV, 0),
X86_INTRINSIC_DATA(avx512_mask_vpshldv_w_512, FMA_OP_MASK, X86ISD::VSHLDV, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshrd_d_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshrd_d_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshrd_d_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshrd_q_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshrd_q_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshrd_q_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshrd_w_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshrd_w_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshrd_w_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
X86_INTRINSIC_DATA(avx512_mask_vpshrdv_d_128, FMA_OP_MASK, X86ISD::VSHRDV, 0),
X86_INTRINSIC_DATA(avx512_mask_vpshrdv_d_256, FMA_OP_MASK, X86ISD::VSHRDV, 0),
X86_INTRINSIC_DATA(avx512_mask_vpshrdv_d_512, FMA_OP_MASK, X86ISD::VSHRDV, 0),
@@ -1296,56 +904,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_vpshufbitqmb_512, CMP_MASK,
X86ISD::VPSHUFBITQMB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_128, FMA_OP_MASK3, ISD::FMA, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_256, FMA_OP_MASK3, ISD::FMA, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_512, FMA_OP_MASK3, ISD::FMA,
- X86ISD::FMADD_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_128, FMA_OP_MASK3, ISD::FMA, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_256, FMA_OP_MASK3, ISD::FMA, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_512, FMA_OP_MASK3, ISD::FMA,
- X86ISD::FMADD_RND),
-
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3, X86ISD::FMADDS3_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3, X86ISD::FMADDS3_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_512, FMA_OP_MASK3, X86ISD::FMADDSUB,
- X86ISD::FMADDSUB_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_512, FMA_OP_MASK3, X86ISD::FMADDSUB,
- X86ISD::FMADDSUB_RND),
-
- X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_128, FMA_OP_MASK3, X86ISD::FMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_256, FMA_OP_MASK3, X86ISD::FMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_512, FMA_OP_MASK3, X86ISD::FMSUB,
- X86ISD::FMSUB_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_128, FMA_OP_MASK3, X86ISD::FMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_256, FMA_OP_MASK3, X86ISD::FMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_512, FMA_OP_MASK3, X86ISD::FMSUB,
- X86ISD::FMSUB_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3, X86ISD::FMSUBS3_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3, X86ISD::FMSUBS3_RND),
-
- X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_512, FMA_OP_MASK3, X86ISD::FMSUBADD,
- X86ISD::FMSUBADD_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_512, FMA_OP_MASK3, X86ISD::FMSUBADD,
- X86ISD::FMSUBADD_RND),
-
- X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_128, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_512, FMA_OP_MASK3, X86ISD::FNMSUB,
- X86ISD::FNMSUB_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_128, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_512, FMA_OP_MASK3, X86ISD::FNMSUB,
- X86ISD::FNMSUB_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3, X86ISD::FNMSUBS3_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3, X86ISD::FNMSUBS3_RND),
X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_128, FIXUPIMM_MASKZ,
X86ISD::VFIXUPIMM, 0),
X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_256, FIXUPIMM_MASKZ,
@@ -1362,99 +920,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VFIXUPIMMS, 0),
X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ss, FIXUPIMMS_MASKZ,
X86ISD::VFIXUPIMMS, 0),
- X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_128, TERLOG_OP_MASKZ,
- X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_256, TERLOG_OP_MASKZ,
- X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_512, TERLOG_OP_MASKZ,
- X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_128, TERLOG_OP_MASKZ,
- X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_256, TERLOG_OP_MASKZ,
- X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_512, TERLOG_OP_MASKZ,
- X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_128, FMA_OP_MASKZ, ISD::FMA, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_256, FMA_OP_MASKZ, ISD::FMA, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_512, FMA_OP_MASKZ, ISD::FMA,
- X86ISD::FMADD_RND),
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_128, FMA_OP_MASKZ, ISD::FMA, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_256, FMA_OP_MASKZ, ISD::FMA, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_512, FMA_OP_MASKZ, ISD::FMA,
- X86ISD::FMADD_RND),
-
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
- X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_512, FMA_OP_MASKZ, X86ISD::FMADDSUB,
- X86ISD::FMADDSUB_RND),
- X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_512, FMA_OP_MASKZ, X86ISD::FMADDSUB,
- X86ISD::FMADDSUB_RND),
-
- X86_INTRINSIC_DATA(avx512_maskz_vpdpbusd_128, FMA_OP_MASKZ, X86ISD::VPDPBUSD, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpdpbusd_256, FMA_OP_MASKZ, X86ISD::VPDPBUSD, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpdpbusd_512, FMA_OP_MASKZ, X86ISD::VPDPBUSD, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpdpbusds_128, FMA_OP_MASKZ, X86ISD::VPDPBUSDS, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpdpbusds_256, FMA_OP_MASKZ, X86ISD::VPDPBUSDS, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpdpbusds_512, FMA_OP_MASKZ, X86ISD::VPDPBUSDS, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpdpwssd_128, FMA_OP_MASKZ, X86ISD::VPDPWSSD, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpdpwssd_256, FMA_OP_MASKZ, X86ISD::VPDPWSSD, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpdpwssd_512, FMA_OP_MASKZ, X86ISD::VPDPWSSD, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpdpwssds_128, FMA_OP_MASKZ, X86ISD::VPDPWSSDS, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpdpwssds_256, FMA_OP_MASKZ, X86ISD::VPDPWSSDS, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpdpwssds_512, FMA_OP_MASKZ, X86ISD::VPDPWSSDS, 0),
-
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_128, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_256, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_512, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_hi_128, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_hi_256, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_hi_512, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_pd_128, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_pd_256, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_pd_512, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_ps_128, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_ps_256, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_ps_512, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_128, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_256, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_512, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_qi_128, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_qi_256, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_qi_512, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_128, IFMA_OP_MASKZ,
- X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_256, IFMA_OP_MASKZ,
- X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_512, IFMA_OP_MASKZ,
- X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_128, IFMA_OP_MASKZ,
- X86ISD::VPMADD52L, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_256, IFMA_OP_MASKZ,
- X86ISD::VPMADD52L, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_512, IFMA_OP_MASKZ,
- X86ISD::VPMADD52L, 0),
X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_128, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_256, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
@@ -1475,12 +940,57 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_w_256, FMA_OP_MASKZ, X86ISD::VSHRDV, 0),
X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_w_512, FMA_OP_MASKZ, X86ISD::VSHRDV, 0),
+ X86_INTRINSIC_DATA(avx512_max_pd_512, INTR_TYPE_2OP, X86ISD::FMAX, X86ISD::FMAX_RND),
+ X86_INTRINSIC_DATA(avx512_max_ps_512, INTR_TYPE_2OP, X86ISD::FMAX, X86ISD::FMAX_RND),
+ X86_INTRINSIC_DATA(avx512_min_pd_512, INTR_TYPE_2OP, X86ISD::FMIN, X86ISD::FMIN_RND),
+ X86_INTRINSIC_DATA(avx512_min_ps_512, INTR_TYPE_2OP, X86ISD::FMIN, X86ISD::FMIN_RND),
+ X86_INTRINSIC_DATA(avx512_mul_pd_512, INTR_TYPE_2OP, ISD::FMUL, X86ISD::FMUL_RND),
+ X86_INTRINSIC_DATA(avx512_mul_ps_512, INTR_TYPE_2OP, ISD::FMUL, X86ISD::FMUL_RND),
X86_INTRINSIC_DATA(avx512_packssdw_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
X86_INTRINSIC_DATA(avx512_packuswb_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
- X86_INTRINSIC_DATA(avx512_pmul_dq_512, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
- X86_INTRINSIC_DATA(avx512_pmulu_dq_512, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_df_256, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_df_512, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_di_256, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_di_512, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_hi_128, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_hi_256, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_hi_512, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_qi_128, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_qi_256, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_qi_512, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_sf_512, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_si_512, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_pmaddubs_w_512, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0),
+ X86_INTRINSIC_DATA(avx512_pmaddw_d_512, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
+ X86_INTRINSIC_DATA(avx512_pmul_hr_sw_512, INTR_TYPE_2OP, X86ISD::MULHRS, 0),
+ X86_INTRINSIC_DATA(avx512_pmulh_w_512, INTR_TYPE_2OP, ISD::MULHS, 0),
+ X86_INTRINSIC_DATA(avx512_pmulhu_w_512, INTR_TYPE_2OP, ISD::MULHU, 0),
+ X86_INTRINSIC_DATA(avx512_prol_d_128, INTR_TYPE_2OP_IMM8, X86ISD::VROTLI, 0),
+ X86_INTRINSIC_DATA(avx512_prol_d_256, INTR_TYPE_2OP_IMM8, X86ISD::VROTLI, 0),
+ X86_INTRINSIC_DATA(avx512_prol_d_512, INTR_TYPE_2OP_IMM8, X86ISD::VROTLI, 0),
+ X86_INTRINSIC_DATA(avx512_prol_q_128, INTR_TYPE_2OP_IMM8, X86ISD::VROTLI, 0),
+ X86_INTRINSIC_DATA(avx512_prol_q_256, INTR_TYPE_2OP_IMM8, X86ISD::VROTLI, 0),
+ X86_INTRINSIC_DATA(avx512_prol_q_512, INTR_TYPE_2OP_IMM8, X86ISD::VROTLI, 0),
+ X86_INTRINSIC_DATA(avx512_prolv_d_128, INTR_TYPE_2OP, ISD::ROTL, 0),
+ X86_INTRINSIC_DATA(avx512_prolv_d_256, INTR_TYPE_2OP, ISD::ROTL, 0),
+ X86_INTRINSIC_DATA(avx512_prolv_d_512, INTR_TYPE_2OP, ISD::ROTL, 0),
+ X86_INTRINSIC_DATA(avx512_prolv_q_128, INTR_TYPE_2OP, ISD::ROTL, 0),
+ X86_INTRINSIC_DATA(avx512_prolv_q_256, INTR_TYPE_2OP, ISD::ROTL, 0),
+ X86_INTRINSIC_DATA(avx512_prolv_q_512, INTR_TYPE_2OP, ISD::ROTL, 0),
+ X86_INTRINSIC_DATA(avx512_pror_d_128, INTR_TYPE_2OP_IMM8, X86ISD::VROTRI, 0),
+ X86_INTRINSIC_DATA(avx512_pror_d_256, INTR_TYPE_2OP_IMM8, X86ISD::VROTRI, 0),
+ X86_INTRINSIC_DATA(avx512_pror_d_512, INTR_TYPE_2OP_IMM8, X86ISD::VROTRI, 0),
+ X86_INTRINSIC_DATA(avx512_pror_q_128, INTR_TYPE_2OP_IMM8, X86ISD::VROTRI, 0),
+ X86_INTRINSIC_DATA(avx512_pror_q_256, INTR_TYPE_2OP_IMM8, X86ISD::VROTRI, 0),
+ X86_INTRINSIC_DATA(avx512_pror_q_512, INTR_TYPE_2OP_IMM8, X86ISD::VROTRI, 0),
+ X86_INTRINSIC_DATA(avx512_prorv_d_128, INTR_TYPE_2OP, ISD::ROTR, 0),
+ X86_INTRINSIC_DATA(avx512_prorv_d_256, INTR_TYPE_2OP, ISD::ROTR, 0),
+ X86_INTRINSIC_DATA(avx512_prorv_d_512, INTR_TYPE_2OP, ISD::ROTR, 0),
+ X86_INTRINSIC_DATA(avx512_prorv_q_128, INTR_TYPE_2OP, ISD::ROTR, 0),
+ X86_INTRINSIC_DATA(avx512_prorv_q_256, INTR_TYPE_2OP, ISD::ROTR, 0),
+ X86_INTRINSIC_DATA(avx512_prorv_q_512, INTR_TYPE_2OP, ISD::ROTR, 0),
X86_INTRINSIC_DATA(avx512_psad_bw_512, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
X86_INTRINSIC_DATA(avx512_pshuf_b_512, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
X86_INTRINSIC_DATA(avx512_psll_d_512, INTR_TYPE_2OP, X86ISD::VSHL, 0),
@@ -1522,6 +1032,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_psrlv_w_128, INTR_TYPE_2OP, ISD::SRL, 0),
X86_INTRINSIC_DATA(avx512_psrlv_w_256, INTR_TYPE_2OP, ISD::SRL, 0),
X86_INTRINSIC_DATA(avx512_psrlv_w_512, INTR_TYPE_2OP, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx512_pternlog_d_128, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_pternlog_d_256, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_pternlog_d_512, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_pternlog_q_128, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_pternlog_q_256, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_pternlog_q_512, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
@@ -1546,6 +1062,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0),
X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0),
+ X86_INTRINSIC_DATA(avx512_sqrt_pd_512, INTR_TYPE_1OP, ISD::FSQRT, X86ISD::FSQRT_RND),
+ X86_INTRINSIC_DATA(avx512_sqrt_ps_512, INTR_TYPE_1OP, ISD::FSQRT, X86ISD::FSQRT_RND),
+ X86_INTRINSIC_DATA(avx512_sub_pd_512, INTR_TYPE_2OP, ISD::FSUB, X86ISD::FSUB_RND),
+ X86_INTRINSIC_DATA(avx512_sub_ps_512, INTR_TYPE_2OP, ISD::FSUB, X86ISD::FSUB_RND),
X86_INTRINSIC_DATA(avx512_vcomi_sd, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
X86_INTRINSIC_DATA(avx512_vcomi_ss, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
X86_INTRINSIC_DATA(avx512_vcvtsd2si32, INTR_TYPE_2OP, X86ISD::CVTS2SI_RND, 0),
@@ -1556,42 +1076,74 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_vcvtss2si64, INTR_TYPE_2OP, X86ISD::CVTS2SI_RND, 0),
X86_INTRINSIC_DATA(avx512_vcvtss2usi32, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0),
X86_INTRINSIC_DATA(avx512_vcvtss2usi64, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_vfmadd_f32, INTR_TYPE_3OP, ISD::FMA, X86ISD::FMADD_RND),
+ X86_INTRINSIC_DATA(avx512_vfmadd_f64, INTR_TYPE_3OP, ISD::FMA, X86ISD::FMADD_RND),
+ X86_INTRINSIC_DATA(avx512_vfmadd_pd_512, INTR_TYPE_3OP, ISD::FMA, X86ISD::FMADD_RND),
+ X86_INTRINSIC_DATA(avx512_vfmadd_ps_512, INTR_TYPE_3OP, ISD::FMA, X86ISD::FMADD_RND),
+ X86_INTRINSIC_DATA(avx512_vfmaddsub_pd_512, INTR_TYPE_3OP, X86ISD::FMADDSUB,
+ X86ISD::FMADDSUB_RND),
+ X86_INTRINSIC_DATA(avx512_vfmaddsub_ps_512, INTR_TYPE_3OP, X86ISD::FMADDSUB,
+ X86ISD::FMADDSUB_RND),
+
+ X86_INTRINSIC_DATA(avx512_vpdpbusd_128, INTR_TYPE_3OP, X86ISD::VPDPBUSD, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpbusd_256, INTR_TYPE_3OP, X86ISD::VPDPBUSD, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpbusd_512, INTR_TYPE_3OP, X86ISD::VPDPBUSD, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpbusds_128, INTR_TYPE_3OP, X86ISD::VPDPBUSDS, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpbusds_256, INTR_TYPE_3OP, X86ISD::VPDPBUSDS, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpbusds_512, INTR_TYPE_3OP, X86ISD::VPDPBUSDS, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpwssd_128, INTR_TYPE_3OP, X86ISD::VPDPWSSD, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpwssd_256, INTR_TYPE_3OP, X86ISD::VPDPWSSD, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpwssd_512, INTR_TYPE_3OP, X86ISD::VPDPWSSD, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpwssds_128, INTR_TYPE_3OP, X86ISD::VPDPWSSDS, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpwssds_256, INTR_TYPE_3OP, X86ISD::VPDPWSSDS, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpwssds_512, INTR_TYPE_3OP, X86ISD::VPDPWSSDS, 0),
+
+ X86_INTRINSIC_DATA(avx512_vpermi2var_d_128, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_d_256, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_d_512, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_hi_128, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_hi_256, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_hi_512, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_pd_128, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_pd_256, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_pd_512, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_ps_128, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_ps_256, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_ps_512, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_q_128, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_q_256, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_q_512, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_qi_128, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_qi_256, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_qi_512, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
X86_INTRINSIC_DATA(avx512_vpermilvar_pd_512, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
X86_INTRINSIC_DATA(avx512_vpermilvar_ps_512, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
- X86_INTRINSIC_DATA(fma_vfmadd_pd, INTR_TYPE_3OP, ISD::FMA, 0),
- X86_INTRINSIC_DATA(fma_vfmadd_pd_256, INTR_TYPE_3OP, ISD::FMA, 0),
- X86_INTRINSIC_DATA(fma_vfmadd_ps, INTR_TYPE_3OP, ISD::FMA, 0),
- X86_INTRINSIC_DATA(fma_vfmadd_ps_256, INTR_TYPE_3OP, ISD::FMA, 0),
- X86_INTRINSIC_DATA(fma_vfmadd_sd, INTR_TYPE_3OP, X86ISD::FMADDS1, 0),
- X86_INTRINSIC_DATA(fma_vfmadd_ss, INTR_TYPE_3OP, X86ISD::FMADDS1, 0),
- X86_INTRINSIC_DATA(fma_vfmaddsub_pd, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(fma_vfmaddsub_pd_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(fma_vfmaddsub_ps, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(fma_vfmaddsub_ps_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(fma_vfmsub_pd, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
- X86_INTRINSIC_DATA(fma_vfmsub_pd_256, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
- X86_INTRINSIC_DATA(fma_vfmsub_ps, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
- X86_INTRINSIC_DATA(fma_vfmsub_ps_256, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
- X86_INTRINSIC_DATA(fma_vfmsub_sd, INTR_TYPE_3OP, X86ISD::FMSUBS1, 0),
- X86_INTRINSIC_DATA(fma_vfmsub_ss, INTR_TYPE_3OP, X86ISD::FMSUBS1, 0),
- X86_INTRINSIC_DATA(fma_vfmsubadd_pd, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
- X86_INTRINSIC_DATA(fma_vfmsubadd_pd_256, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
- X86_INTRINSIC_DATA(fma_vfmsubadd_ps, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
- X86_INTRINSIC_DATA(fma_vfmsubadd_ps_256, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
- X86_INTRINSIC_DATA(fma_vfnmadd_pd, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
- X86_INTRINSIC_DATA(fma_vfnmadd_pd_256, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
- X86_INTRINSIC_DATA(fma_vfnmadd_ps, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
- X86_INTRINSIC_DATA(fma_vfnmadd_ps_256, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
- X86_INTRINSIC_DATA(fma_vfnmadd_sd, INTR_TYPE_3OP, X86ISD::FNMADDS1, 0),
- X86_INTRINSIC_DATA(fma_vfnmadd_ss, INTR_TYPE_3OP, X86ISD::FNMADDS1, 0),
- X86_INTRINSIC_DATA(fma_vfnmsub_pd, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
- X86_INTRINSIC_DATA(fma_vfnmsub_pd_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
- X86_INTRINSIC_DATA(fma_vfnmsub_ps, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
- X86_INTRINSIC_DATA(fma_vfnmsub_ps_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
- X86_INTRINSIC_DATA(fma_vfnmsub_sd, INTR_TYPE_3OP, X86ISD::FNMSUBS1, 0),
- X86_INTRINSIC_DATA(fma_vfnmsub_ss, INTR_TYPE_3OP, X86ISD::FNMSUBS1, 0),
- X86_INTRINSIC_DATA(fma4_vfmadd_sd, INTR_TYPE_3OP, X86ISD::FMADD4S, 0),
- X86_INTRINSIC_DATA(fma4_vfmadd_ss, INTR_TYPE_3OP, X86ISD::FMADD4S, 0),
+ X86_INTRINSIC_DATA(avx512_vpmadd52h_uq_128 , IFMA_OP, X86ISD::VPMADD52H, 0),
+ X86_INTRINSIC_DATA(avx512_vpmadd52h_uq_256 , IFMA_OP, X86ISD::VPMADD52H, 0),
+ X86_INTRINSIC_DATA(avx512_vpmadd52h_uq_512 , IFMA_OP, X86ISD::VPMADD52H, 0),
+ X86_INTRINSIC_DATA(avx512_vpmadd52l_uq_128 , IFMA_OP, X86ISD::VPMADD52L, 0),
+ X86_INTRINSIC_DATA(avx512_vpmadd52l_uq_256 , IFMA_OP, X86ISD::VPMADD52L, 0),
+ X86_INTRINSIC_DATA(avx512_vpmadd52l_uq_512 , IFMA_OP, X86ISD::VPMADD52L, 0),
+ X86_INTRINSIC_DATA(avx512_vpshld_d_128, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshld_d_256, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshld_d_512, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshld_q_128, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshld_q_256, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshld_q_512, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshld_w_128, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshld_w_256, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshld_w_512, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshrd_d_128, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshrd_d_256, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshrd_d_512, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshrd_q_128, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshrd_q_256, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshrd_q_512, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshrd_w_128, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshrd_w_256, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshrd_w_512, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+ X86_INTRINSIC_DATA(bmi_bextr_32, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
+ X86_INTRINSIC_DATA(bmi_bextr_64, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
X86_INTRINSIC_DATA(sse_cmp_ps, INTR_TYPE_3OP, X86ISD::CMPP, 0),
X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ),
X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE),
@@ -1606,7 +1158,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse_movmsk_ps, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
X86_INTRINSIC_DATA(sse_rcp_ps, INTR_TYPE_1OP, X86ISD::FRCP, 0),
X86_INTRINSIC_DATA(sse_rsqrt_ps, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
- X86_INTRINSIC_DATA(sse_sqrt_ps, INTR_TYPE_1OP, ISD::FSQRT, 0),
X86_INTRINSIC_DATA(sse_ucomieq_ss, COMI, X86ISD::UCOMI, ISD::SETEQ),
X86_INTRINSIC_DATA(sse_ucomige_ss, COMI, X86ISD::UCOMI, ISD::SETGE),
X86_INTRINSIC_DATA(sse_ucomigt_ss, COMI, X86ISD::UCOMI, ISD::SETGT),
@@ -1620,11 +1171,11 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse2_comile_sd, COMI, X86ISD::COMI, ISD::SETLE),
X86_INTRINSIC_DATA(sse2_comilt_sd, COMI, X86ISD::COMI, ISD::SETLT),
X86_INTRINSIC_DATA(sse2_comineq_sd, COMI, X86ISD::COMI, ISD::SETNE),
- X86_INTRINSIC_DATA(sse2_cvtdq2ps, INTR_TYPE_1OP, ISD::SINT_TO_FP, 0),
X86_INTRINSIC_DATA(sse2_cvtpd2dq, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(sse2_cvtpd2ps, INTR_TYPE_1OP, X86ISD::VFPROUND, 0),
+ X86_INTRINSIC_DATA(sse2_cvtps2dq, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(sse2_cvttpd2dq, INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0),
- X86_INTRINSIC_DATA(sse2_cvttps2dq, INTR_TYPE_1OP, ISD::FP_TO_SINT, 0),
+ X86_INTRINSIC_DATA(sse2_cvttps2dq, INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0),
X86_INTRINSIC_DATA(sse2_max_pd, INTR_TYPE_2OP, X86ISD::FMAX, 0),
X86_INTRINSIC_DATA(sse2_max_sd, INTR_TYPE_2OP, X86ISD::FMAXS, 0),
X86_INTRINSIC_DATA(sse2_min_pd, INTR_TYPE_2OP, X86ISD::FMIN, 0),
@@ -1641,7 +1192,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
X86_INTRINSIC_DATA(sse2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
- X86_INTRINSIC_DATA(sse2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
X86_INTRINSIC_DATA(sse2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
X86_INTRINSIC_DATA(sse2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0),
X86_INTRINSIC_DATA(sse2_psll_q, INTR_TYPE_2OP, X86ISD::VSHL, 0),
@@ -1663,7 +1213,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0),
X86_INTRINSIC_DATA(sse2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
X86_INTRINSIC_DATA(sse2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
- X86_INTRINSIC_DATA(sse2_sqrt_pd, INTR_TYPE_1OP, ISD::FSQRT, 0),
X86_INTRINSIC_DATA(sse2_ucomieq_sd, COMI, X86ISD::UCOMI, ISD::SETEQ),
X86_INTRINSIC_DATA(sse2_ucomige_sd, COMI, X86ISD::UCOMI, ISD::SETGE),
X86_INTRINSIC_DATA(sse2_ucomigt_sd, COMI, X86ISD::UCOMI, ISD::SETGT),
@@ -1679,7 +1228,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0),
X86_INTRINSIC_DATA(sse41_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
X86_INTRINSIC_DATA(sse41_phminposuw, INTR_TYPE_1OP, X86ISD::PHMINPOS, 0),
- X86_INTRINSIC_DATA(sse41_pmuldq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
X86_INTRINSIC_DATA(sse41_round_pd, ROUNDP, X86ISD::VRNDSCALE, 0),
X86_INTRINSIC_DATA(sse41_round_ps, ROUNDP, X86ISD::VRNDSCALE, 0),
X86_INTRINSIC_DATA(sse41_round_sd, ROUNDS, X86ISD::VRNDSCALES, 0),
@@ -1693,6 +1241,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw_128, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0),
X86_INTRINSIC_DATA(ssse3_pmul_hr_sw_128, INTR_TYPE_2OP, X86ISD::MULHRS, 0),
X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
+ X86_INTRINSIC_DATA(tbm_bextri_u32, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
+ X86_INTRINSIC_DATA(tbm_bextri_u64, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
X86_INTRINSIC_DATA(vcvtph2ps_128, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0),
X86_INTRINSIC_DATA(vcvtph2ps_256, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0),
X86_INTRINSIC_DATA(vcvtps2ph_128, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0),
@@ -1752,12 +1302,11 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
* Retrieve data for Intrinsic without chain.
* Return nullptr if intrinsic is not defined in the table.
*/
-static const IntrinsicData* getIntrinsicWithoutChain(uint16_t IntNo) {
- IntrinsicData IntrinsicToFind = { IntNo, INTR_NO_TYPE, 0, 0 };
+static const IntrinsicData* getIntrinsicWithoutChain(unsigned IntNo) {
const IntrinsicData *Data = std::lower_bound(std::begin(IntrinsicsWithoutChain),
std::end(IntrinsicsWithoutChain),
- IntrinsicToFind);
- if (Data != std::end(IntrinsicsWithoutChain) && *Data == IntrinsicToFind)
+ IntNo);
+ if (Data != std::end(IntrinsicsWithoutChain) && Data->Id == IntNo)
return Data;
return nullptr;
}
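
As an aside, the getIntrinsicWithoutChain() hunk above drops the sentinel IntrinsicData object and searches the sorted table directly against the raw intrinsic number. A minimal standalone sketch of that lookup pattern follows; it is not part of the imported patch, and RecordData, Table, and lookup() are placeholder names.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <iterator>

struct RecordData {
  uint16_t Id;   // search key; the table is sorted by this field
  unsigned Kind; // payload
};

// Must stay sorted by Id, or std::lower_bound gives meaningless results.
static const RecordData Table[] = {{10, 1}, {42, 2}, {99, 3}};

// Heterogeneous comparison lets lower_bound take the raw key, so no
// temporary "record to find" has to be constructed for the search.
static bool operator<(const RecordData &LHS, unsigned Id) { return LHS.Id < Id; }

static const RecordData *lookup(unsigned Id) {
  const RecordData *Data =
      std::lower_bound(std::begin(Table), std::end(Table), Id);
  if (Data != std::end(Table) && Data->Id == Id)
    return Data;
  return nullptr; // not present
}

int main() {
  if (const RecordData *R = lookup(42))
    std::printf("found kind %u\n", R->Kind);
}
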
diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp
index 4108a58fa7a5..d372cada8de8 100644
--- a/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/lib/Target/X86/X86LegalizerInfo.cpp
@@ -21,6 +21,7 @@
using namespace llvm;
using namespace TargetOpcode;
+using namespace LegalizeActions;
/// FIXME: The following static functions are SizeChangeStrategy functions
/// that are meant to temporarily mimic the behaviour of the old legalization
@@ -38,7 +39,7 @@ addAndInterleaveWithUnsupported(LegalizerInfo::SizeAndActionsVec &result,
result.push_back(v[i]);
if (i + 1 < v[i].first && i + 1 < v.size() &&
v[i + 1].first != v[i].first + 1)
- result.push_back({v[i].first + 1, LegalizerInfo::Unsupported});
+ result.push_back({v[i].first + 1, Unsupported});
}
}
@@ -46,11 +47,11 @@ static LegalizerInfo::SizeAndActionsVec
widen_1(const LegalizerInfo::SizeAndActionsVec &v) {
assert(v.size() >= 1);
assert(v[0].first > 1);
- LegalizerInfo::SizeAndActionsVec result = {{1, LegalizerInfo::WidenScalar},
- {2, LegalizerInfo::Unsupported}};
+ LegalizerInfo::SizeAndActionsVec result = {{1, WidenScalar},
+ {2, Unsupported}};
addAndInterleaveWithUnsupported(result, v);
auto Largest = result.back().first;
- result.push_back({Largest + 1, LegalizerInfo::Unsupported});
+ result.push_back({Largest + 1, Unsupported});
return result;
}
@@ -81,16 +82,18 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
G_CONSTANT, 0, widenToLargerTypesAndNarrowToLargest);
computeTables();
+ verify(*STI.getInstrInfo());
}
void X86LegalizerInfo::setLegalizerInfo32bit() {
- const LLT p0 = LLT::pointer(0, TM.getPointerSize() * 8);
+ const LLT p0 = LLT::pointer(0, TM.getPointerSizeInBits(0));
const LLT s1 = LLT::scalar(1);
const LLT s8 = LLT::scalar(8);
const LLT s16 = LLT::scalar(16);
const LLT s32 = LLT::scalar(32);
const LLT s64 = LLT::scalar(64);
+ const LLT s128 = LLT::scalar(128);
for (auto Ty : {p0, s1, s8, s16, s32})
setAction({G_IMPLICIT_DEF, Ty}, Legal);
@@ -122,6 +125,19 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
setAction({G_GEP, p0}, Legal);
setAction({G_GEP, 1, s32}, Legal);
+ if (!Subtarget.is64Bit()) {
+ getActionDefinitionsBuilder(G_PTRTOINT)
+ .legalForCartesianProduct({s1, s8, s16, s32}, {p0})
+ .maxScalar(0, s32)
+ .widenScalarToNextPow2(0, /*Min*/ 8);
+ getActionDefinitionsBuilder(G_INTTOPTR).legalFor({{p0, s32}});
+
+ // Shifts and SDIV
+ getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR, G_SDIV})
+ .legalFor({s8, s16, s32})
+ .clampScalar(0, s8, s32);
+ }
+
// Control-flow
setAction({G_BRCOND, s1}, Legal);
@@ -135,6 +151,7 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
setAction({G_SEXT, Ty}, Legal);
setAction({G_ANYEXT, Ty}, Legal);
}
+ setAction({G_ANYEXT, s128}, Legal);
// Comparison
setAction({G_ICMP, s1}, Legal);
@@ -158,10 +175,18 @@ void X86LegalizerInfo::setLegalizerInfo64bit() {
if (!Subtarget.is64Bit())
return;
+ const LLT p0 = LLT::pointer(0, TM.getPointerSizeInBits(0));
+ const LLT s1 = LLT::scalar(1);
+ const LLT s8 = LLT::scalar(8);
+ const LLT s16 = LLT::scalar(16);
+ const LLT s32 = LLT::scalar(32);
const LLT s64 = LLT::scalar(64);
const LLT s128 = LLT::scalar(128);
setAction({G_IMPLICIT_DEF, s64}, Legal);
+  // This is needed because tryFoldImplicitDef will create this pattern:
+ // s128 = EXTEND (G_IMPLICIT_DEF s32/s64) -> s128 = G_IMPLICIT_DEF
+ setAction({G_IMPLICIT_DEF, s128}, Legal);
setAction({G_PHI, s64}, Legal);
@@ -173,6 +198,11 @@ void X86LegalizerInfo::setLegalizerInfo64bit() {
// Pointer-handling
setAction({G_GEP, 1, s64}, Legal);
+ getActionDefinitionsBuilder(G_PTRTOINT)
+ .legalForCartesianProduct({s1, s8, s16, s32, s64}, {p0})
+ .maxScalar(0, s64)
+ .widenScalarToNextPow2(0, /*Min*/ 8);
+ getActionDefinitionsBuilder(G_INTTOPTR).legalFor({{p0, s64}});
// Constants
setAction({TargetOpcode::G_CONSTANT, s64}, Legal);
@@ -182,9 +212,21 @@ void X86LegalizerInfo::setLegalizerInfo64bit() {
setAction({extOp, s64}, Legal);
}
+ getActionDefinitionsBuilder(G_SITOFP)
+ .legalForCartesianProduct({s32, s64})
+ .clampScalar(1, s32, s64)
+ .widenScalarToNextPow2(1)
+ .clampScalar(0, s32, s64)
+ .widenScalarToNextPow2(0);
+
// Comparison
setAction({G_ICMP, 1, s64}, Legal);
+ // Shifts and SDIV
+ getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR, G_SDIV})
+ .legalFor({s8, s16, s32, s64})
+ .clampScalar(0, s8, s64);
+
// Merge/Unmerge
setAction({G_MERGE_VALUES, s128}, Legal);
setAction({G_UNMERGE_VALUES, 1, s128}, Legal);
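
For reference, the clampScalar/widenScalarToNextPow2 rules added above describe a simple size normalisation: clamp the scalar width into a legal range, then round it up to a power of two (optionally with a floor). The standalone sketch below is not part of the imported patch and uses made-up function names; it only illustrates that arithmetic.

#include <algorithm>
#include <cassert>
#include <cstdio>

static unsigned nextPow2(unsigned Bits) {
  unsigned P = 1;
  while (P < Bits)
    P <<= 1;
  return P;
}

static unsigned legalizeScalarSize(unsigned Bits, unsigned Min, unsigned Max,
                                   unsigned Pow2Floor) {
  assert(Min <= Max && "inverted clamp range");
  unsigned Clamped = std::min(std::max(Bits, Min), Max);
  return std::max(nextPow2(Clamped), Pow2Floor);
}

int main() {
  // An s1 operand with a [s8, s32] clamp and a pow2 floor of 8 becomes s8.
  std::printf("%u\n", legalizeScalarSize(1, 8, 32, 8));
  // An awkward s24 in the same range widens to the next power of two, s32.
  std::printf("%u\n", legalizeScalarSize(24, 8, 32, 8));
}
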
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 8a7179e48a0b..d38c7b497965 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -28,7 +28,6 @@
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/StackMaps.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Mangler.h"
@@ -44,6 +43,7 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
using namespace llvm;
@@ -56,6 +56,7 @@ class X86MCInstLower {
const TargetMachine &TM;
const MCAsmInfo &MAI;
X86AsmPrinter &AsmPrinter;
+
public:
X86MCInstLower(const MachineFunction &MF, X86AsmPrinter &asmprinter);
@@ -115,13 +116,12 @@ MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const {
return MF.getMMI().getObjFileInfo<MachineModuleInfoMachO>();
}
-
/// GetSymbolFromOperand - Lower an MO_GlobalAddress or MO_ExternalSymbol
/// operand to an MCSymbol.
-MCSymbol *X86MCInstLower::
-GetSymbolFromOperand(const MachineOperand &MO) const {
+MCSymbol *X86MCInstLower::GetSymbolFromOperand(const MachineOperand &MO) const {
const DataLayout &DL = MF.getDataLayout();
- assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) && "Isn't a symbol reference");
+ assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) &&
+ "Isn't a symbol reference");
MCSymbol *Sym = nullptr;
SmallString<128> Name;
@@ -158,17 +158,17 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
// If the target flags on the operand changes the name of the symbol, do that
// before we return the symbol.
switch (MO.getTargetFlags()) {
- default: break;
+ default:
+ break;
case X86II::MO_DARWIN_NONLAZY:
case X86II::MO_DARWIN_NONLAZY_PIC_BASE: {
MachineModuleInfoImpl::StubValueTy &StubSym =
- getMachOMMI().getGVStubEntry(Sym);
+ getMachOMMI().getGVStubEntry(Sym);
if (!StubSym.getPointer()) {
assert(MO.isGlobal() && "Extern symbol not handled yet");
- StubSym =
- MachineModuleInfoImpl::
- StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()),
- !MO.getGlobal()->hasInternalLinkage());
+ StubSym = MachineModuleInfoImpl::StubValueTy(
+ AsmPrinter.getSymbol(MO.getGlobal()),
+ !MO.getGlobal()->hasInternalLinkage());
}
break;
}
@@ -185,44 +185,74 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
switch (MO.getTargetFlags()) {
- default: llvm_unreachable("Unknown target flag on GV operand");
- case X86II::MO_NO_FLAG: // No flag.
+ default:
+ llvm_unreachable("Unknown target flag on GV operand");
+ case X86II::MO_NO_FLAG: // No flag.
// These affect the name of the symbol, not any suffix.
case X86II::MO_DARWIN_NONLAZY:
case X86II::MO_DLLIMPORT:
break;
- case X86II::MO_TLVP: RefKind = MCSymbolRefExpr::VK_TLVP; break;
+ case X86II::MO_TLVP:
+ RefKind = MCSymbolRefExpr::VK_TLVP;
+ break;
case X86II::MO_TLVP_PIC_BASE:
Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx);
// Subtract the pic base.
- Expr = MCBinaryExpr::createSub(Expr,
- MCSymbolRefExpr::create(MF.getPICBaseSymbol(),
- Ctx),
- Ctx);
- break;
- case X86II::MO_SECREL: RefKind = MCSymbolRefExpr::VK_SECREL; break;
- case X86II::MO_TLSGD: RefKind = MCSymbolRefExpr::VK_TLSGD; break;
- case X86II::MO_TLSLD: RefKind = MCSymbolRefExpr::VK_TLSLD; break;
- case X86II::MO_TLSLDM: RefKind = MCSymbolRefExpr::VK_TLSLDM; break;
- case X86II::MO_GOTTPOFF: RefKind = MCSymbolRefExpr::VK_GOTTPOFF; break;
- case X86II::MO_INDNTPOFF: RefKind = MCSymbolRefExpr::VK_INDNTPOFF; break;
- case X86II::MO_TPOFF: RefKind = MCSymbolRefExpr::VK_TPOFF; break;
- case X86II::MO_DTPOFF: RefKind = MCSymbolRefExpr::VK_DTPOFF; break;
- case X86II::MO_NTPOFF: RefKind = MCSymbolRefExpr::VK_NTPOFF; break;
- case X86II::MO_GOTNTPOFF: RefKind = MCSymbolRefExpr::VK_GOTNTPOFF; break;
- case X86II::MO_GOTPCREL: RefKind = MCSymbolRefExpr::VK_GOTPCREL; break;
- case X86II::MO_GOT: RefKind = MCSymbolRefExpr::VK_GOT; break;
- case X86II::MO_GOTOFF: RefKind = MCSymbolRefExpr::VK_GOTOFF; break;
- case X86II::MO_PLT: RefKind = MCSymbolRefExpr::VK_PLT; break;
- case X86II::MO_ABS8: RefKind = MCSymbolRefExpr::VK_X86_ABS8; break;
+ Expr = MCBinaryExpr::createSub(
+ Expr, MCSymbolRefExpr::create(MF.getPICBaseSymbol(), Ctx), Ctx);
+ break;
+ case X86II::MO_SECREL:
+ RefKind = MCSymbolRefExpr::VK_SECREL;
+ break;
+ case X86II::MO_TLSGD:
+ RefKind = MCSymbolRefExpr::VK_TLSGD;
+ break;
+ case X86II::MO_TLSLD:
+ RefKind = MCSymbolRefExpr::VK_TLSLD;
+ break;
+ case X86II::MO_TLSLDM:
+ RefKind = MCSymbolRefExpr::VK_TLSLDM;
+ break;
+ case X86II::MO_GOTTPOFF:
+ RefKind = MCSymbolRefExpr::VK_GOTTPOFF;
+ break;
+ case X86II::MO_INDNTPOFF:
+ RefKind = MCSymbolRefExpr::VK_INDNTPOFF;
+ break;
+ case X86II::MO_TPOFF:
+ RefKind = MCSymbolRefExpr::VK_TPOFF;
+ break;
+ case X86II::MO_DTPOFF:
+ RefKind = MCSymbolRefExpr::VK_DTPOFF;
+ break;
+ case X86II::MO_NTPOFF:
+ RefKind = MCSymbolRefExpr::VK_NTPOFF;
+ break;
+ case X86II::MO_GOTNTPOFF:
+ RefKind = MCSymbolRefExpr::VK_GOTNTPOFF;
+ break;
+ case X86II::MO_GOTPCREL:
+ RefKind = MCSymbolRefExpr::VK_GOTPCREL;
+ break;
+ case X86II::MO_GOT:
+ RefKind = MCSymbolRefExpr::VK_GOT;
+ break;
+ case X86II::MO_GOTOFF:
+ RefKind = MCSymbolRefExpr::VK_GOTOFF;
+ break;
+ case X86II::MO_PLT:
+ RefKind = MCSymbolRefExpr::VK_PLT;
+ break;
+ case X86II::MO_ABS8:
+ RefKind = MCSymbolRefExpr::VK_X86_ABS8;
+ break;
case X86II::MO_PIC_BASE_OFFSET:
case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
Expr = MCSymbolRefExpr::create(Sym, Ctx);
// Subtract the pic base.
- Expr = MCBinaryExpr::createSub(Expr,
- MCSymbolRefExpr::create(MF.getPICBaseSymbol(), Ctx),
- Ctx);
+ Expr = MCBinaryExpr::createSub(
+ Expr, MCSymbolRefExpr::create(MF.getPICBaseSymbol(), Ctx), Ctx);
if (MO.isJTI()) {
assert(MAI.doesSetDirectiveSuppressReloc());
// If .set directive is supported, use it to reduce the number of
@@ -240,14 +270,12 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
Expr = MCSymbolRefExpr::create(Sym, RefKind, Ctx);
if (!MO.isJTI() && !MO.isMBB() && MO.getOffset())
- Expr = MCBinaryExpr::createAdd(Expr,
- MCConstantExpr::create(MO.getOffset(), Ctx),
- Ctx);
+ Expr = MCBinaryExpr::createAdd(
+ Expr, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
return MCOperand::createExpr(Expr);
}
-
-/// \brief Simplify FOO $imm, %{al,ax,eax,rax} to FOO $imm, for instruction with
+/// Simplify FOO $imm, %{al,ax,eax,rax} to FOO $imm, for instruction with
/// a short fixed-register form.
static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) {
unsigned ImmOp = Inst.getNumOperands() - 1;
@@ -255,7 +283,8 @@ static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) {
(Inst.getOperand(ImmOp).isImm() || Inst.getOperand(ImmOp).isExpr()) &&
((Inst.getNumOperands() == 3 && Inst.getOperand(1).isReg() &&
Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) ||
- Inst.getNumOperands() == 2) && "Unexpected instruction!");
+ Inst.getNumOperands() == 2) &&
+ "Unexpected instruction!");
// Check whether the destination register can be fixed.
unsigned Reg = Inst.getOperand(0).getReg();
@@ -269,7 +298,7 @@ static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) {
Inst.addOperand(Saved);
}
-/// \brief If a movsx instruction has a shorter encoding for the used register
+/// If a movsx instruction has a shorter encoding for the used register
/// simplify the instruction to use it instead.
static void SimplifyMOVSX(MCInst &Inst) {
unsigned NewOpcode = 0;
@@ -277,7 +306,7 @@ static void SimplifyMOVSX(MCInst &Inst) {
switch (Inst.getOpcode()) {
default:
llvm_unreachable("Unexpected instruction!");
- case X86::MOVSX16rr8: // movsbw %al, %ax --> cbtw
+ case X86::MOVSX16rr8: // movsbw %al, %ax --> cbtw
if (Op0 == X86::AX && Op1 == X86::AL)
NewOpcode = X86::CBW;
break;
@@ -297,7 +326,7 @@ static void SimplifyMOVSX(MCInst &Inst) {
}
}
-/// \brief Simplify things like MOV32rm to MOV32o32a.
+/// Simplify things like MOV32rm to MOV32o32a.
static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst,
unsigned Opcode) {
// Don't make these simplifications in 64-bit mode; other assemblers don't
@@ -309,14 +338,14 @@ static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst,
unsigned AddrBase = IsStore;
unsigned RegOp = IsStore ? 0 : 5;
unsigned AddrOp = AddrBase + 3;
- assert(Inst.getNumOperands() == 6 && Inst.getOperand(RegOp).isReg() &&
- Inst.getOperand(AddrBase + X86::AddrBaseReg).isReg() &&
- Inst.getOperand(AddrBase + X86::AddrScaleAmt).isImm() &&
- Inst.getOperand(AddrBase + X86::AddrIndexReg).isReg() &&
- Inst.getOperand(AddrBase + X86::AddrSegmentReg).isReg() &&
- (Inst.getOperand(AddrOp).isExpr() ||
- Inst.getOperand(AddrOp).isImm()) &&
- "Unexpected instruction!");
+ assert(
+ Inst.getNumOperands() == 6 && Inst.getOperand(RegOp).isReg() &&
+ Inst.getOperand(AddrBase + X86::AddrBaseReg).isReg() &&
+ Inst.getOperand(AddrBase + X86::AddrScaleAmt).isImm() &&
+ Inst.getOperand(AddrBase + X86::AddrIndexReg).isReg() &&
+ Inst.getOperand(AddrBase + X86::AddrSegmentReg).isReg() &&
+ (Inst.getOperand(AddrOp).isExpr() || Inst.getOperand(AddrOp).isImm()) &&
+ "Unexpected instruction!");
// Check whether the destination register can be fixed.
unsigned Reg = Inst.getOperand(RegOp).getReg();
@@ -401,9 +430,9 @@ ReSimplify:
case X86::LEA16r:
case X86::LEA32r:
// LEA should have a segment register, but it must be empty.
- assert(OutMI.getNumOperands() == 1+X86::AddrNumOperands &&
+ assert(OutMI.getNumOperands() == 1 + X86::AddrNumOperands &&
"Unexpected # of LEA operands");
- assert(OutMI.getOperand(1+X86::AddrSegmentReg).getReg() == 0 &&
+ assert(OutMI.getOperand(1 + X86::AddrSegmentReg).getReg() == 0 &&
"LEA has segment specified!");
break;
@@ -452,8 +481,8 @@ ReSimplify:
unsigned NewOpc;
switch (OutMI.getOpcode()) {
default: llvm_unreachable("Invalid opcode");
- case X86::VMOVSDrr: NewOpc = X86::VMOVSDrr_REV; break;
- case X86::VMOVSSrr: NewOpc = X86::VMOVSSrr_REV; break;
+ case X86::VMOVSDrr: NewOpc = X86::VMOVSDrr_REV; break;
+ case X86::VMOVSSrr: NewOpc = X86::VMOVSSrr_REV; break;
}
OutMI.setOpcode(NewOpc);
}
@@ -499,24 +528,30 @@ ReSimplify:
break;
}
- // TAILJMPd, TAILJMPd64, TailJMPd_cc - Lower to the correct jump instruction.
- { unsigned Opcode;
- case X86::TAILJMPr: Opcode = X86::JMP32r; goto SetTailJmpOpcode;
- case X86::TAILJMPd:
- case X86::TAILJMPd64: Opcode = X86::JMP_1; goto SetTailJmpOpcode;
- case X86::TAILJMPd_CC:
- case X86::TAILJMPd64_CC:
- Opcode = X86::GetCondBranchFromCond(
- static_cast<X86::CondCode>(MI->getOperand(1).getImm()));
- goto SetTailJmpOpcode;
-
- SetTailJmpOpcode:
- MCOperand Saved = OutMI.getOperand(0);
- OutMI = MCInst();
- OutMI.setOpcode(Opcode);
- OutMI.addOperand(Saved);
- break;
- }
+ // TAILJMPd, TAILJMPd64, TailJMPd_cc - Lower to the correct jump
+ // instruction.
+ {
+ unsigned Opcode;
+ case X86::TAILJMPr:
+ Opcode = X86::JMP32r;
+ goto SetTailJmpOpcode;
+ case X86::TAILJMPd:
+ case X86::TAILJMPd64:
+ Opcode = X86::JMP_1;
+ goto SetTailJmpOpcode;
+ case X86::TAILJMPd_CC:
+ case X86::TAILJMPd64_CC:
+ Opcode = X86::GetCondBranchFromCond(
+ static_cast<X86::CondCode>(MI->getOperand(1).getImm()));
+ goto SetTailJmpOpcode;
+
+ SetTailJmpOpcode:
+ MCOperand Saved = OutMI.getOperand(0);
+ OutMI = MCInst();
+ OutMI.setOpcode(Opcode);
+ OutMI.addOperand(Saved);
+ break;
+ }
case X86::DEC16r:
case X86::DEC32r:
@@ -539,63 +574,63 @@ ReSimplify:
// These are pseudo-ops for OR to help with the OR->ADD transformation. We do
// this with an ugly goto in case the resultant OR uses EAX and needs the
// short form.
- case X86::ADD16rr_DB: OutMI.setOpcode(X86::OR16rr); goto ReSimplify;
- case X86::ADD32rr_DB: OutMI.setOpcode(X86::OR32rr); goto ReSimplify;
- case X86::ADD64rr_DB: OutMI.setOpcode(X86::OR64rr); goto ReSimplify;
- case X86::ADD16ri_DB: OutMI.setOpcode(X86::OR16ri); goto ReSimplify;
- case X86::ADD32ri_DB: OutMI.setOpcode(X86::OR32ri); goto ReSimplify;
+ case X86::ADD16rr_DB: OutMI.setOpcode(X86::OR16rr); goto ReSimplify;
+ case X86::ADD32rr_DB: OutMI.setOpcode(X86::OR32rr); goto ReSimplify;
+ case X86::ADD64rr_DB: OutMI.setOpcode(X86::OR64rr); goto ReSimplify;
+ case X86::ADD16ri_DB: OutMI.setOpcode(X86::OR16ri); goto ReSimplify;
+ case X86::ADD32ri_DB: OutMI.setOpcode(X86::OR32ri); goto ReSimplify;
case X86::ADD64ri32_DB: OutMI.setOpcode(X86::OR64ri32); goto ReSimplify;
- case X86::ADD16ri8_DB: OutMI.setOpcode(X86::OR16ri8); goto ReSimplify;
- case X86::ADD32ri8_DB: OutMI.setOpcode(X86::OR32ri8); goto ReSimplify;
- case X86::ADD64ri8_DB: OutMI.setOpcode(X86::OR64ri8); goto ReSimplify;
+ case X86::ADD16ri8_DB: OutMI.setOpcode(X86::OR16ri8); goto ReSimplify;
+ case X86::ADD32ri8_DB: OutMI.setOpcode(X86::OR32ri8); goto ReSimplify;
+ case X86::ADD64ri8_DB: OutMI.setOpcode(X86::OR64ri8); goto ReSimplify;
// Atomic load and store require a separate pseudo-inst because Acquire
// implies mayStore and Release implies mayLoad; fix these to regular MOV
// instructions here
- case X86::ACQUIRE_MOV8rm: OutMI.setOpcode(X86::MOV8rm); goto ReSimplify;
- case X86::ACQUIRE_MOV16rm: OutMI.setOpcode(X86::MOV16rm); goto ReSimplify;
- case X86::ACQUIRE_MOV32rm: OutMI.setOpcode(X86::MOV32rm); goto ReSimplify;
- case X86::ACQUIRE_MOV64rm: OutMI.setOpcode(X86::MOV64rm); goto ReSimplify;
- case X86::RELEASE_MOV8mr: OutMI.setOpcode(X86::MOV8mr); goto ReSimplify;
- case X86::RELEASE_MOV16mr: OutMI.setOpcode(X86::MOV16mr); goto ReSimplify;
- case X86::RELEASE_MOV32mr: OutMI.setOpcode(X86::MOV32mr); goto ReSimplify;
- case X86::RELEASE_MOV64mr: OutMI.setOpcode(X86::MOV64mr); goto ReSimplify;
- case X86::RELEASE_MOV8mi: OutMI.setOpcode(X86::MOV8mi); goto ReSimplify;
- case X86::RELEASE_MOV16mi: OutMI.setOpcode(X86::MOV16mi); goto ReSimplify;
- case X86::RELEASE_MOV32mi: OutMI.setOpcode(X86::MOV32mi); goto ReSimplify;
+ case X86::ACQUIRE_MOV8rm: OutMI.setOpcode(X86::MOV8rm); goto ReSimplify;
+ case X86::ACQUIRE_MOV16rm: OutMI.setOpcode(X86::MOV16rm); goto ReSimplify;
+ case X86::ACQUIRE_MOV32rm: OutMI.setOpcode(X86::MOV32rm); goto ReSimplify;
+ case X86::ACQUIRE_MOV64rm: OutMI.setOpcode(X86::MOV64rm); goto ReSimplify;
+ case X86::RELEASE_MOV8mr: OutMI.setOpcode(X86::MOV8mr); goto ReSimplify;
+ case X86::RELEASE_MOV16mr: OutMI.setOpcode(X86::MOV16mr); goto ReSimplify;
+ case X86::RELEASE_MOV32mr: OutMI.setOpcode(X86::MOV32mr); goto ReSimplify;
+ case X86::RELEASE_MOV64mr: OutMI.setOpcode(X86::MOV64mr); goto ReSimplify;
+ case X86::RELEASE_MOV8mi: OutMI.setOpcode(X86::MOV8mi); goto ReSimplify;
+ case X86::RELEASE_MOV16mi: OutMI.setOpcode(X86::MOV16mi); goto ReSimplify;
+ case X86::RELEASE_MOV32mi: OutMI.setOpcode(X86::MOV32mi); goto ReSimplify;
case X86::RELEASE_MOV64mi32: OutMI.setOpcode(X86::MOV64mi32); goto ReSimplify;
- case X86::RELEASE_ADD8mi: OutMI.setOpcode(X86::ADD8mi); goto ReSimplify;
- case X86::RELEASE_ADD8mr: OutMI.setOpcode(X86::ADD8mr); goto ReSimplify;
- case X86::RELEASE_ADD32mi: OutMI.setOpcode(X86::ADD32mi); goto ReSimplify;
- case X86::RELEASE_ADD32mr: OutMI.setOpcode(X86::ADD32mr); goto ReSimplify;
+ case X86::RELEASE_ADD8mi: OutMI.setOpcode(X86::ADD8mi); goto ReSimplify;
+ case X86::RELEASE_ADD8mr: OutMI.setOpcode(X86::ADD8mr); goto ReSimplify;
+ case X86::RELEASE_ADD32mi: OutMI.setOpcode(X86::ADD32mi); goto ReSimplify;
+ case X86::RELEASE_ADD32mr: OutMI.setOpcode(X86::ADD32mr); goto ReSimplify;
case X86::RELEASE_ADD64mi32: OutMI.setOpcode(X86::ADD64mi32); goto ReSimplify;
- case X86::RELEASE_ADD64mr: OutMI.setOpcode(X86::ADD64mr); goto ReSimplify;
- case X86::RELEASE_AND8mi: OutMI.setOpcode(X86::AND8mi); goto ReSimplify;
- case X86::RELEASE_AND8mr: OutMI.setOpcode(X86::AND8mr); goto ReSimplify;
- case X86::RELEASE_AND32mi: OutMI.setOpcode(X86::AND32mi); goto ReSimplify;
- case X86::RELEASE_AND32mr: OutMI.setOpcode(X86::AND32mr); goto ReSimplify;
+ case X86::RELEASE_ADD64mr: OutMI.setOpcode(X86::ADD64mr); goto ReSimplify;
+ case X86::RELEASE_AND8mi: OutMI.setOpcode(X86::AND8mi); goto ReSimplify;
+ case X86::RELEASE_AND8mr: OutMI.setOpcode(X86::AND8mr); goto ReSimplify;
+ case X86::RELEASE_AND32mi: OutMI.setOpcode(X86::AND32mi); goto ReSimplify;
+ case X86::RELEASE_AND32mr: OutMI.setOpcode(X86::AND32mr); goto ReSimplify;
case X86::RELEASE_AND64mi32: OutMI.setOpcode(X86::AND64mi32); goto ReSimplify;
- case X86::RELEASE_AND64mr: OutMI.setOpcode(X86::AND64mr); goto ReSimplify;
- case X86::RELEASE_OR8mi: OutMI.setOpcode(X86::OR8mi); goto ReSimplify;
- case X86::RELEASE_OR8mr: OutMI.setOpcode(X86::OR8mr); goto ReSimplify;
- case X86::RELEASE_OR32mi: OutMI.setOpcode(X86::OR32mi); goto ReSimplify;
- case X86::RELEASE_OR32mr: OutMI.setOpcode(X86::OR32mr); goto ReSimplify;
- case X86::RELEASE_OR64mi32: OutMI.setOpcode(X86::OR64mi32); goto ReSimplify;
- case X86::RELEASE_OR64mr: OutMI.setOpcode(X86::OR64mr); goto ReSimplify;
- case X86::RELEASE_XOR8mi: OutMI.setOpcode(X86::XOR8mi); goto ReSimplify;
- case X86::RELEASE_XOR8mr: OutMI.setOpcode(X86::XOR8mr); goto ReSimplify;
- case X86::RELEASE_XOR32mi: OutMI.setOpcode(X86::XOR32mi); goto ReSimplify;
- case X86::RELEASE_XOR32mr: OutMI.setOpcode(X86::XOR32mr); goto ReSimplify;
+ case X86::RELEASE_AND64mr: OutMI.setOpcode(X86::AND64mr); goto ReSimplify;
+ case X86::RELEASE_OR8mi: OutMI.setOpcode(X86::OR8mi); goto ReSimplify;
+ case X86::RELEASE_OR8mr: OutMI.setOpcode(X86::OR8mr); goto ReSimplify;
+ case X86::RELEASE_OR32mi: OutMI.setOpcode(X86::OR32mi); goto ReSimplify;
+ case X86::RELEASE_OR32mr: OutMI.setOpcode(X86::OR32mr); goto ReSimplify;
+ case X86::RELEASE_OR64mi32: OutMI.setOpcode(X86::OR64mi32); goto ReSimplify;
+ case X86::RELEASE_OR64mr: OutMI.setOpcode(X86::OR64mr); goto ReSimplify;
+ case X86::RELEASE_XOR8mi: OutMI.setOpcode(X86::XOR8mi); goto ReSimplify;
+ case X86::RELEASE_XOR8mr: OutMI.setOpcode(X86::XOR8mr); goto ReSimplify;
+ case X86::RELEASE_XOR32mi: OutMI.setOpcode(X86::XOR32mi); goto ReSimplify;
+ case X86::RELEASE_XOR32mr: OutMI.setOpcode(X86::XOR32mr); goto ReSimplify;
case X86::RELEASE_XOR64mi32: OutMI.setOpcode(X86::XOR64mi32); goto ReSimplify;
- case X86::RELEASE_XOR64mr: OutMI.setOpcode(X86::XOR64mr); goto ReSimplify;
- case X86::RELEASE_INC8m: OutMI.setOpcode(X86::INC8m); goto ReSimplify;
- case X86::RELEASE_INC16m: OutMI.setOpcode(X86::INC16m); goto ReSimplify;
- case X86::RELEASE_INC32m: OutMI.setOpcode(X86::INC32m); goto ReSimplify;
- case X86::RELEASE_INC64m: OutMI.setOpcode(X86::INC64m); goto ReSimplify;
- case X86::RELEASE_DEC8m: OutMI.setOpcode(X86::DEC8m); goto ReSimplify;
- case X86::RELEASE_DEC16m: OutMI.setOpcode(X86::DEC16m); goto ReSimplify;
- case X86::RELEASE_DEC32m: OutMI.setOpcode(X86::DEC32m); goto ReSimplify;
- case X86::RELEASE_DEC64m: OutMI.setOpcode(X86::DEC64m); goto ReSimplify;
+ case X86::RELEASE_XOR64mr: OutMI.setOpcode(X86::XOR64mr); goto ReSimplify;
+ case X86::RELEASE_INC8m: OutMI.setOpcode(X86::INC8m); goto ReSimplify;
+ case X86::RELEASE_INC16m: OutMI.setOpcode(X86::INC16m); goto ReSimplify;
+ case X86::RELEASE_INC32m: OutMI.setOpcode(X86::INC32m); goto ReSimplify;
+ case X86::RELEASE_INC64m: OutMI.setOpcode(X86::INC64m); goto ReSimplify;
+ case X86::RELEASE_DEC8m: OutMI.setOpcode(X86::DEC8m); goto ReSimplify;
+ case X86::RELEASE_DEC16m: OutMI.setOpcode(X86::DEC16m); goto ReSimplify;
+ case X86::RELEASE_DEC32m: OutMI.setOpcode(X86::DEC32m); goto ReSimplify;
+ case X86::RELEASE_DEC64m: OutMI.setOpcode(X86::DEC64m); goto ReSimplify;
// We don't currently select the correct instruction form for instructions
// which have a short %eax, etc. form. Handle this by custom lowering, for
@@ -616,13 +651,13 @@ ReSimplify:
switch (OutMI.getOpcode()) {
default: llvm_unreachable("Invalid opcode");
case X86::MOV8mr_NOREX:
- case X86::MOV8mr: NewOpc = X86::MOV8o32a; break;
+ case X86::MOV8mr: NewOpc = X86::MOV8o32a; break;
case X86::MOV8rm_NOREX:
- case X86::MOV8rm: NewOpc = X86::MOV8ao32; break;
- case X86::MOV16mr: NewOpc = X86::MOV16o32a; break;
- case X86::MOV16rm: NewOpc = X86::MOV16ao32; break;
- case X86::MOV32mr: NewOpc = X86::MOV32o32a; break;
- case X86::MOV32rm: NewOpc = X86::MOV32ao32; break;
+ case X86::MOV8rm: NewOpc = X86::MOV8ao32; break;
+ case X86::MOV16mr: NewOpc = X86::MOV16o32a; break;
+ case X86::MOV16rm: NewOpc = X86::MOV16ao32; break;
+ case X86::MOV32mr: NewOpc = X86::MOV32o32a; break;
+ case X86::MOV32rm: NewOpc = X86::MOV32ao32; break;
}
SimplifyShortMoveForm(AsmPrinter, OutMI, NewOpc);
break;
@@ -705,18 +740,18 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
MCSymbolRefExpr::VariantKind SRVK;
switch (MI.getOpcode()) {
- case X86::TLS_addr32:
- case X86::TLS_addr64:
- SRVK = MCSymbolRefExpr::VK_TLSGD;
- break;
- case X86::TLS_base_addr32:
- SRVK = MCSymbolRefExpr::VK_TLSLDM;
- break;
- case X86::TLS_base_addr64:
- SRVK = MCSymbolRefExpr::VK_TLSLD;
- break;
- default:
- llvm_unreachable("unexpected opcode");
+ case X86::TLS_addr32:
+ case X86::TLS_addr64:
+ SRVK = MCSymbolRefExpr::VK_TLSGD;
+ break;
+ case X86::TLS_base_addr32:
+ SRVK = MCSymbolRefExpr::VK_TLSLDM;
+ break;
+ case X86::TLS_base_addr64:
+ SRVK = MCSymbolRefExpr::VK_TLSLD;
+ break;
+ default:
+ llvm_unreachable("unexpected opcode");
}
MCSymbol *sym = MCInstLowering.GetSymbolFromOperand(MI.getOperand(3));
@@ -759,16 +794,14 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
StringRef name = is64Bits ? "__tls_get_addr" : "___tls_get_addr";
MCSymbol *tlsGetAddr = context.getOrCreateSymbol(name);
const MCSymbolRefExpr *tlsRef =
- MCSymbolRefExpr::create(tlsGetAddr,
- MCSymbolRefExpr::VK_PLT,
- context);
+ MCSymbolRefExpr::create(tlsGetAddr, MCSymbolRefExpr::VK_PLT, context);
- EmitAndCountInstruction(MCInstBuilder(is64Bits ? X86::CALL64pcrel32
- : X86::CALLpcrel32)
- .addExpr(tlsRef));
+ EmitAndCountInstruction(
+ MCInstBuilder(is64Bits ? X86::CALL64pcrel32 : X86::CALLpcrel32)
+ .addExpr(tlsRef));
}
-/// \brief Emit the largest nop instruction smaller than or equal to \p NumBytes
+/// Emit the largest nop instruction smaller than or equal to \p NumBytes
/// bytes. Return the size of nop emitted.
static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
const MCSubtargetInfo &STI) {
@@ -782,22 +815,62 @@ static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
BaseReg = X86::RAX;
ScaleVal = 1;
switch (NumBytes) {
- case 0: llvm_unreachable("Zero nops?"); break;
- case 1: NopSize = 1; Opc = X86::NOOP; break;
- case 2: NopSize = 2; Opc = X86::XCHG16ar; break;
- case 3: NopSize = 3; Opc = X86::NOOPL; break;
- case 4: NopSize = 4; Opc = X86::NOOPL; Displacement = 8; break;
- case 5: NopSize = 5; Opc = X86::NOOPL; Displacement = 8;
- IndexReg = X86::RAX; break;
- case 6: NopSize = 6; Opc = X86::NOOPW; Displacement = 8;
- IndexReg = X86::RAX; break;
- case 7: NopSize = 7; Opc = X86::NOOPL; Displacement = 512; break;
- case 8: NopSize = 8; Opc = X86::NOOPL; Displacement = 512;
- IndexReg = X86::RAX; break;
- case 9: NopSize = 9; Opc = X86::NOOPW; Displacement = 512;
- IndexReg = X86::RAX; break;
- default: NopSize = 10; Opc = X86::NOOPW; Displacement = 512;
- IndexReg = X86::RAX; SegmentReg = X86::CS; break;
+ case 0:
+ llvm_unreachable("Zero nops?");
+ break;
+ case 1:
+ NopSize = 1;
+ Opc = X86::NOOP;
+ break;
+ case 2:
+ NopSize = 2;
+ Opc = X86::XCHG16ar;
+ break;
+ case 3:
+ NopSize = 3;
+ Opc = X86::NOOPL;
+ break;
+ case 4:
+ NopSize = 4;
+ Opc = X86::NOOPL;
+ Displacement = 8;
+ break;
+ case 5:
+ NopSize = 5;
+ Opc = X86::NOOPL;
+ Displacement = 8;
+ IndexReg = X86::RAX;
+ break;
+ case 6:
+ NopSize = 6;
+ Opc = X86::NOOPW;
+ Displacement = 8;
+ IndexReg = X86::RAX;
+ break;
+ case 7:
+ NopSize = 7;
+ Opc = X86::NOOPL;
+ Displacement = 512;
+ break;
+ case 8:
+ NopSize = 8;
+ Opc = X86::NOOPL;
+ Displacement = 512;
+ IndexReg = X86::RAX;
+ break;
+ case 9:
+ NopSize = 9;
+ Opc = X86::NOOPW;
+ Displacement = 512;
+ IndexReg = X86::RAX;
+ break;
+ default:
+ NopSize = 10;
+ Opc = X86::NOOPW;
+ Displacement = 512;
+ IndexReg = X86::RAX;
+ SegmentReg = X86::CS;
+ break;
}
unsigned NumPrefixes = std::min(NumBytes - NopSize, 5U);
@@ -806,14 +879,12 @@ static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
OS.EmitBytes("\x66");
switch (Opc) {
- default:
- llvm_unreachable("Unexpected opcode");
- break;
+ default: llvm_unreachable("Unexpected opcode");
case X86::NOOP:
OS.EmitInstruction(MCInstBuilder(Opc), STI);
break;
case X86::XCHG16ar:
- OS.EmitInstruction(MCInstBuilder(Opc).addReg(X86::AX), STI);
+ OS.EmitInstruction(MCInstBuilder(Opc).addReg(X86::AX).addReg(X86::AX), STI);
break;
case X86::NOOPL:
case X86::NOOPW:
@@ -830,7 +901,7 @@ static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
return NopSize;
}
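
The EmitNops() helper that follows simply loops on EmitNop(), greedily emitting the largest NOP that still fits. A standalone sketch of that loop, not part of the imported patch and with a made-up emitOneNop(), is:

#include <algorithm>
#include <cstdio>

// Pretend to emit a single NOP of at most 15 bytes (a 10-byte NOPW plus up
// to five 0x66 prefixes, as in the switch above) and report its size.
static unsigned emitOneNop(unsigned BytesNeeded) {
  const unsigned MaxSingleNop = 15;
  unsigned Size = std::min(BytesNeeded, MaxSingleNop);
  std::printf("emit %u-byte nop\n", Size);
  return Size;
}

// Greedy loop: keep emitting the largest NOP that still fits.
static void emitNops(unsigned NumBytes) {
  while (NumBytes > 0)
    NumBytes -= emitOneNop(NumBytes);
}

int main() { emitNops(37); } // 15 + 15 + 7 bytes
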
-/// \brief Emit the optimal amount of multi-byte nops on X86.
+/// Emit the optimal amount of multi-byte nops on X86.
static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
const MCSubtargetInfo &STI) {
unsigned NopsToEmit = NumBytes;
@@ -874,6 +945,10 @@ void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
// address is to far away. (TODO: support non-relative addressing)
break;
case MachineOperand::MO_Register:
+ // FIXME: Add retpoline support and remove this.
+ if (Subtarget->useRetpoline())
+ report_fatal_error("Lowering register statepoints with retpoline not "
+ "yet implemented.");
CallTargetMCOp = MCOperand::createReg(CallTarget.getReg());
CallOpcode = X86::CALL64r;
break;
@@ -967,7 +1042,7 @@ void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI,
unsigned NopSize = EmitNop(*OutStreamer, MinSize, Subtarget->is64Bit(),
getSubtargetInfo());
assert(NopSize == MinSize && "Could not implement MinSize!");
- (void) NopSize;
+ (void)NopSize;
}
}
@@ -1012,9 +1087,8 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
break;
case MachineOperand::MO_ExternalSymbol:
case MachineOperand::MO_GlobalAddress:
- CalleeMCOp =
- MCIL.LowerSymbolOperand(CalleeMO,
- MCIL.GetSymbolFromOperand(CalleeMO));
+ CalleeMCOp = MCIL.LowerSymbolOperand(CalleeMO,
+ MCIL.GetSymbolFromOperand(CalleeMO));
break;
}
@@ -1028,6 +1102,10 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
EmitAndCountInstruction(
MCInstBuilder(X86::MOV64ri).addReg(ScratchReg).addOperand(CalleeMCOp));
+ // FIXME: Add retpoline support and remove this.
+ if (Subtarget->useRetpoline())
+ report_fatal_error(
+ "Lowering patchpoint with retpoline not yet implemented.");
EmitAndCountInstruction(MCInstBuilder(X86::CALL64r).addReg(ScratchReg));
}
@@ -1076,8 +1154,10 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
// The default C calling convention will place two arguments into %rcx and
// %rdx -- so we only work with those.
- unsigned UsedRegs[] = {X86::RDI, X86::RSI};
+ unsigned DestRegs[] = {X86::RDI, X86::RSI};
bool UsedMask[] = {false, false};
+ // Filled out in loop.
+ unsigned SrcRegs[] = {0, 0};
// Then we put the operands in the %rdi and %rsi registers. We spill the
// values in the register before we clobber them, and mark them as used in
@@ -1087,18 +1167,22 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
for (unsigned I = 0; I < MI.getNumOperands(); ++I)
if (auto Op = MCIL.LowerMachineOperand(&MI, MI.getOperand(I))) {
assert(Op->isReg() && "Only support arguments in registers");
- if (Op->getReg() != UsedRegs[I]) {
+ SrcRegs[I] = Op->getReg();
+ if (SrcRegs[I] != DestRegs[I]) {
UsedMask[I] = true;
EmitAndCountInstruction(
- MCInstBuilder(X86::PUSH64r).addReg(UsedRegs[I]));
- EmitAndCountInstruction(MCInstBuilder(X86::MOV64rr)
- .addReg(UsedRegs[I])
- .addReg(Op->getReg()));
+ MCInstBuilder(X86::PUSH64r).addReg(DestRegs[I]));
} else {
EmitNops(*OutStreamer, 4, Subtarget->is64Bit(), getSubtargetInfo());
}
}
+ // Now that the register values are stashed, mov arguments into place.
+ for (unsigned I = 0; I < MI.getNumOperands(); ++I)
+ if (SrcRegs[I] != DestRegs[I])
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::MOV64rr).addReg(DestRegs[I]).addReg(SrcRegs[I]));
+
// We emit a hard dependency on the __xray_CustomEvent symbol, which is the
// name of the trampoline to be implemented by the XRay runtime.
auto TSym = OutContext.getOrCreateSymbol("__xray_CustomEvent");
@@ -1113,7 +1197,7 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
// Restore caller-saved and used registers.
for (unsigned I = sizeof UsedMask; I-- > 0;)
if (UsedMask[I])
- EmitAndCountInstruction(MCInstBuilder(X86::POP64r).addReg(UsedRegs[I]));
+ EmitAndCountInstruction(MCInstBuilder(X86::POP64r).addReg(DestRegs[I]));
else
EmitNops(*OutStreamer, 1, Subtarget->is64Bit(), getSubtargetInfo());
@@ -1125,6 +1209,102 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
recordSled(CurSled, MI, SledKind::CUSTOM_EVENT, 1);
}
+void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI,
+ X86MCInstLower &MCIL) {
+ assert(Subtarget->is64Bit() && "XRay typed events only supports X86-64");
+
+ // We want to emit the following pattern, which follows the x86 calling
+ // convention to prepare for the trampoline call to be patched in.
+ //
+ // .p2align 1, ...
+ // .Lxray_event_sled_N:
+ // jmp +N // jump across the instrumentation sled
+ // ... // set up arguments in register
+ // callq __xray_TypedEvent@plt // force dependency to symbol
+ // ...
+ // <jump here>
+ //
+ // After patching, it would look something like:
+ //
+ // nopw (2-byte nop)
+ // ...
+ // callq __xrayTypedEvent // already lowered
+ // ...
+ //
+ // ---
+ // First we emit the label and the jump.
+ auto CurSled = OutContext.createTempSymbol("xray_typed_event_sled_", true);
+ OutStreamer->AddComment("# XRay Typed Event Log");
+ OutStreamer->EmitCodeAlignment(2);
+ OutStreamer->EmitLabel(CurSled);
+
+ // Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
+ // an operand (computed as an offset from the jmp instruction).
+  // FIXME: Find another less hacky way to force the relative jump.
+ OutStreamer->EmitBinaryData("\xeb\x14");
+
+ // An x86-64 convention may place three arguments into %rcx, %rdx, and R8,
+ // so we'll work with those. Or we may be called via SystemV, in which case
+ // we don't have to do any translation.
+ unsigned DestRegs[] = {X86::RDI, X86::RSI, X86::RDX};
+ bool UsedMask[] = {false, false, false};
+
+ // Will fill out src regs in the loop.
+ unsigned SrcRegs[] = {0, 0, 0};
+
+ // Then we put the operands in the SystemV registers. We spill the values in
+ // the registers before we clobber them, and mark them as used in UsedMask.
+ // In case the arguments are already in the correct register, we emit nops
+ // appropriately sized to keep the sled the same size in every situation.
+ for (unsigned I = 0; I < MI.getNumOperands(); ++I)
+ if (auto Op = MCIL.LowerMachineOperand(&MI, MI.getOperand(I))) {
+      // TODO: Is register-only support adequate?
+ assert(Op->isReg() && "Only supports arguments in registers");
+ SrcRegs[I] = Op->getReg();
+ if (SrcRegs[I] != DestRegs[I]) {
+ UsedMask[I] = true;
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::PUSH64r).addReg(DestRegs[I]));
+ } else {
+ EmitNops(*OutStreamer, 4, Subtarget->is64Bit(), getSubtargetInfo());
+ }
+ }
+
+ // In the above loop we only stash all of the destination registers or emit
+  // nops if the arguments are already in the right place. Doing the actual
+  // moving is postponed until after all the registers are stashed so nothing
+  // is clobbered. We've already added nops to account for the size of mov and
+ // push if the register is in the right place, so we only have to worry about
+ // emitting movs.
+ for (unsigned I = 0; I < MI.getNumOperands(); ++I)
+ if (UsedMask[I])
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::MOV64rr).addReg(DestRegs[I]).addReg(SrcRegs[I]));
+
+ // We emit a hard dependency on the __xray_TypedEvent symbol, which is the
+ // name of the trampoline to be implemented by the XRay runtime.
+ auto TSym = OutContext.getOrCreateSymbol("__xray_TypedEvent");
+ MachineOperand TOp = MachineOperand::CreateMCSymbol(TSym);
+ if (isPositionIndependent())
+ TOp.setTargetFlags(X86II::MO_PLT);
+
+ // Emit the call instruction.
+ EmitAndCountInstruction(MCInstBuilder(X86::CALL64pcrel32)
+ .addOperand(MCIL.LowerSymbolOperand(TOp, TSym)));
+
+ // Restore caller-saved and used registers.
+ for (unsigned I = sizeof UsedMask; I-- > 0;)
+ if (UsedMask[I])
+ EmitAndCountInstruction(MCInstBuilder(X86::POP64r).addReg(DestRegs[I]));
+ else
+ EmitNops(*OutStreamer, 1, Subtarget->is64Bit(), getSubtargetInfo());
+
+ OutStreamer->AddComment("xray typed event end.");
+
+ // Record the sled version.
+ recordSled(CurSled, MI, SledKind::TYPED_EVENT, 0);
+}
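+
+// Both XRay sleds above use the same stash/move/restore discipline for the
+// argument registers: push every destination that will be clobbered, move
+// the sources into place only once everything is stashed, and pop in reverse
+// order on the way out. A standalone sketch of that shape (not part of the
+// imported patch; emit() and the register strings are placeholders):
+//
+//   #include <cstdio>
+//   #include <string>
+//   #include <vector>
+//
+//   static void emit(const std::string &Asm) { std::printf("  %s\n", Asm.c_str()); }
+//
+//   static void lowerEventCall(const std::vector<std::string> &Src,
+//                              const std::vector<std::string> &Dest) {
+//     std::vector<bool> Used(Dest.size(), false);
+//     // 1) Stash every destination register that is about to be clobbered.
+//     for (size_t I = 0; I < Src.size(); ++I)
+//       if (Src[I] != Dest[I]) {
+//         Used[I] = true;
+//         emit("push " + Dest[I]);
+//       }
+//     // 2) Only after everything is stashed, move the arguments into place.
+//     for (size_t I = 0; I < Src.size(); ++I)
+//       if (Used[I])
+//         emit("mov " + Dest[I] + ", " + Src[I]);
+//     emit("call __xray_TypedEvent");
+//     // 3) Restore in reverse order of the pushes.
+//     for (size_t I = Dest.size(); I-- > 0;)
+//       if (Used[I])
+//         emit("pop " + Dest[I]);
+//   }
+//
+//   int main() {
+//     // Third argument already in place, so only rdi and rsi are stashed.
+//     lowerEventCall({"rax", "rcx", "rdx"}, {"rdi", "rsi", "rdx"});
+//   }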
+
void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
X86MCInstLower &MCIL) {
// We want to emit the following pattern:
@@ -1182,7 +1362,8 @@ void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI,
recordSled(CurSled, MI, SledKind::FUNCTION_EXIT);
}
-void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, X86MCInstLower &MCIL) {
+void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI,
+ X86MCInstLower &MCIL) {
// Like PATCHABLE_RET, we have the actual instruction in the operands to this
// instruction so we lower that particular instruction and its operands.
// Unlike PATCHABLE_RET though, we put the sled before the JMP, much like how
@@ -1236,8 +1417,7 @@ static const Constant *getConstantFromPool(const MachineInstr &MI,
ArrayRef<MachineConstantPoolEntry> Constants =
MI.getParent()->getParent()->getConstantPool()->getConstants();
- const MachineConstantPoolEntry &ConstantEntry =
- Constants[Op.getIndex()];
+ const MachineConstantPoolEntry &ConstantEntry = Constants[Op.getIndex()];
// Bail if this is a machine constant pool entry, we won't be able to dig out
// anything useful.
@@ -1250,10 +1430,8 @@ static const Constant *getConstantFromPool(const MachineInstr &MI,
return C;
}
-static std::string getShuffleComment(const MachineInstr *MI,
- unsigned SrcOp1Idx,
- unsigned SrcOp2Idx,
- ArrayRef<int> Mask) {
+static std::string getShuffleComment(const MachineInstr *MI, unsigned SrcOp1Idx,
+ unsigned SrcOp2Idx, ArrayRef<int> Mask) {
std::string Comment;
// Compute the name for a register. This is really goofy because we have
@@ -1441,12 +1619,13 @@ void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) {
void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
X86MCInstLower MCInstLowering(*MF, *this);
- const X86RegisterInfo *RI = MF->getSubtarget<X86Subtarget>().getRegisterInfo();
+ const X86RegisterInfo *RI =
+ MF->getSubtarget<X86Subtarget>().getRegisterInfo();
// Add a comment about EVEX-2-VEX compression for AVX-512 instrs that
// are compressed from EVEX encoding to VEX encoding.
if (TM.Options.MCOptions.ShowMCEncoding) {
- if (MI->getAsmPrinterFlags() & AC_EVEX_2_VEX)
+ if (MI->getAsmPrinterFlags() & X86::AC_EVEX_2_VEX)
OutStreamer->AddComment("EVEX TO VEX Compression ", false);
}
@@ -1459,7 +1638,6 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
OutStreamer->emitRawComment("MEMBARRIER");
return;
-
case X86::EH_RETURN:
case X86::EH_RETURN64: {
// Lower these as normal, but add some comments.
@@ -1511,13 +1689,14 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCSymbol *PICBase = MF->getPICBaseSymbol();
// FIXME: We would like an efficient form for this, so we don't have to do a
// lot of extra uniquing.
- EmitAndCountInstruction(MCInstBuilder(X86::CALLpcrel32)
- .addExpr(MCSymbolRefExpr::create(PICBase, OutContext)));
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::CALLpcrel32)
+ .addExpr(MCSymbolRefExpr::create(PICBase, OutContext)));
- const X86FrameLowering* FrameLowering =
+ const X86FrameLowering *FrameLowering =
MF->getSubtarget<X86Subtarget>().getFrameLowering();
bool hasFP = FrameLowering->hasFP(*MF);
-
+
// TODO: This is needed only if we require precise CFA.
bool HasActiveDwarfFrame = OutStreamer->getNumFrameInfos() &&
!OutStreamer->getDwarfFrameInfos().back().End;
@@ -1532,8 +1711,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
OutStreamer->EmitLabel(PICBase);
// popl $reg
- EmitAndCountInstruction(MCInstBuilder(X86::POP32r)
- .addReg(MI->getOperand(0).getReg()));
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::POP32r).addReg(MI->getOperand(0).getReg()));
if (HasActiveDwarfFrame && !hasFP) {
OutStreamer->EmitCFIAdjustCfaOffset(stackGrowth);
@@ -1541,6 +1720,41 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
+ case X86::MOVGOT64r: {
+ // Materializes the GOT for the 64-bit large code model.
+ MCSymbol *DotSym = OutContext.createTempSymbol();
+ OutStreamer->EmitLabel(DotSym);
+
+ unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned ScratchReg = MI->getOperand(1).getReg();
+ MCSymbol *GOTSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2));
+
+ // .LtmpN: leaq .LtmpN(%rip), %dst
+ const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext);
+ EmitAndCountInstruction(MCInstBuilder(X86::LEA64r)
+ .addReg(DstReg) // dest
+ .addReg(X86::RIP) // base
+ .addImm(1) // scale
+ .addReg(0) // index
+ .addExpr(DotExpr) // disp
+ .addReg(0)); // seg
+
+ // movq $_GLOBAL_OFFSET_TABLE_ - .LtmpN, %scratch
+ const MCExpr *GOTSymExpr = MCSymbolRefExpr::create(GOTSym, OutContext);
+ const MCExpr *GOTDiffExpr =
+ MCBinaryExpr::createSub(GOTSymExpr, DotExpr, OutContext);
+ EmitAndCountInstruction(MCInstBuilder(X86::MOV64ri)
+ .addReg(ScratchReg) // dest
+ .addExpr(GOTDiffExpr)); // disp
+
+ // addq %scratch, %dst
+ EmitAndCountInstruction(MCInstBuilder(X86::ADD64rr)
+ .addReg(DstReg) // dest
+ .addReg(DstReg) // dest
+ .addReg(ScratchReg)); // src
+ return;
+ }
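// Putting the pieces together, this case emits a sequence along these lines
// (a sketch; %dst and %scratch stand for the pseudo's register operands):
//   .LtmpN:
//     leaq    .LtmpN(%rip), %dst
//     movabsq $_GLOBAL_OFFSET_TABLE_-.LtmpN, %scratch
//     addq    %scratch, %dst    # %dst = runtime address of the GOT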
+
case X86::ADD32ri: {
// Lower the MO_GOT_ABSOLUTE_ADDRESS form of ADD32ri.
if (MI->getOperand(2).getTargetFlags() != X86II::MO_GOT_ABSOLUTE_ADDRESS)
@@ -1561,16 +1775,16 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext);
const MCExpr *PICBase =
- MCSymbolRefExpr::create(MF->getPICBaseSymbol(), OutContext);
+ MCSymbolRefExpr::create(MF->getPICBaseSymbol(), OutContext);
DotExpr = MCBinaryExpr::createSub(DotExpr, PICBase, OutContext);
- DotExpr = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(OpSym,OutContext),
- DotExpr, OutContext);
+ DotExpr = MCBinaryExpr::createAdd(
+ MCSymbolRefExpr::create(OpSym, OutContext), DotExpr, OutContext);
EmitAndCountInstruction(MCInstBuilder(X86::ADD32ri)
- .addReg(MI->getOperand(0).getReg())
- .addReg(MI->getOperand(1).getReg())
- .addExpr(DotExpr));
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addExpr(DotExpr));
return;
}
case TargetOpcode::STATEPOINT:
@@ -1599,10 +1813,13 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case TargetOpcode::PATCHABLE_TAIL_CALL:
return LowerPATCHABLE_TAIL_CALL(*MI, MCInstLowering);
-
+
case TargetOpcode::PATCHABLE_EVENT_CALL:
return LowerPATCHABLE_EVENT_CALL(*MI, MCInstLowering);
+ case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
+ return LowerPATCHABLE_TYPED_EVENT_CALL(*MI, MCInstLowering);
+
case X86::MORESTACK_RET:
EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
return;
@@ -1610,9 +1827,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::MORESTACK_RET_RESTORE_R10:
// Return, then restore R10.
EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
- EmitAndCountInstruction(MCInstBuilder(X86::MOV64rr)
- .addReg(X86::R10)
- .addReg(X86::RAX));
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::MOV64rr).addReg(X86::R10).addReg(X86::RAX));
return;
case X86::SEH_PushReg:
@@ -1814,37 +2030,55 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
break;
}
-#define MOV_CASE(Prefix, Suffix) \
- case X86::Prefix##MOVAPD##Suffix##rm: \
- case X86::Prefix##MOVAPS##Suffix##rm: \
- case X86::Prefix##MOVUPD##Suffix##rm: \
- case X86::Prefix##MOVUPS##Suffix##rm: \
- case X86::Prefix##MOVDQA##Suffix##rm: \
+ case X86::MMX_MOVQ64rm: {
+ if (!OutStreamer->isVerboseAsm())
+ break;
+ if (MI->getNumOperands() <= 4)
+ break;
+ if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) {
+ std::string Comment;
+ raw_string_ostream CS(Comment);
+ const MachineOperand &DstOp = MI->getOperand(0);
+ CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = ";
+ if (auto *CF = dyn_cast<ConstantFP>(C)) {
+ CS << "0x" << CF->getValueAPF().bitcastToAPInt().toString(16, false);
+ OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo);
+ }
+ }
+ break;
+ }
+
+#define MOV_CASE(Prefix, Suffix) \
+ case X86::Prefix##MOVAPD##Suffix##rm: \
+ case X86::Prefix##MOVAPS##Suffix##rm: \
+ case X86::Prefix##MOVUPD##Suffix##rm: \
+ case X86::Prefix##MOVUPS##Suffix##rm: \
+ case X86::Prefix##MOVDQA##Suffix##rm: \
case X86::Prefix##MOVDQU##Suffix##rm:
-#define MOV_AVX512_CASE(Suffix) \
- case X86::VMOVDQA64##Suffix##rm: \
- case X86::VMOVDQA32##Suffix##rm: \
- case X86::VMOVDQU64##Suffix##rm: \
- case X86::VMOVDQU32##Suffix##rm: \
- case X86::VMOVDQU16##Suffix##rm: \
- case X86::VMOVDQU8##Suffix##rm: \
- case X86::VMOVAPS##Suffix##rm: \
- case X86::VMOVAPD##Suffix##rm: \
- case X86::VMOVUPS##Suffix##rm: \
+#define MOV_AVX512_CASE(Suffix) \
+ case X86::VMOVDQA64##Suffix##rm: \
+ case X86::VMOVDQA32##Suffix##rm: \
+ case X86::VMOVDQU64##Suffix##rm: \
+ case X86::VMOVDQU32##Suffix##rm: \
+ case X86::VMOVDQU16##Suffix##rm: \
+ case X86::VMOVDQU8##Suffix##rm: \
+ case X86::VMOVAPS##Suffix##rm: \
+ case X86::VMOVAPD##Suffix##rm: \
+ case X86::VMOVUPS##Suffix##rm: \
case X86::VMOVUPD##Suffix##rm:
-#define CASE_ALL_MOV_RM() \
- MOV_CASE(, ) /* SSE */ \
- MOV_CASE(V, ) /* AVX-128 */ \
- MOV_CASE(V, Y) /* AVX-256 */ \
- MOV_AVX512_CASE(Z) \
- MOV_AVX512_CASE(Z256) \
+#define CASE_ALL_MOV_RM() \
+ MOV_CASE(, ) /* SSE */ \
+ MOV_CASE(V, ) /* AVX-128 */ \
+ MOV_CASE(V, Y) /* AVX-256 */ \
+ MOV_AVX512_CASE(Z) \
+ MOV_AVX512_CASE(Z256) \
MOV_AVX512_CASE(Z128)
- // For loads from a constant pool to a vector register, print the constant
- // loaded.
- CASE_ALL_MOV_RM()
+ // For loads from a constant pool to a vector register, print the constant
+ // loaded.
+ CASE_ALL_MOV_RM()
case X86::VBROADCASTF128:
case X86::VBROADCASTI128:
case X86::VBROADCASTF32X4Z256rm:
@@ -1867,20 +2101,20 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
int NumLanes = 1;
// Override NumLanes for the broadcast instructions.
switch (MI->getOpcode()) {
- case X86::VBROADCASTF128: NumLanes = 2; break;
- case X86::VBROADCASTI128: NumLanes = 2; break;
- case X86::VBROADCASTF32X4Z256rm: NumLanes = 2; break;
- case X86::VBROADCASTF32X4rm: NumLanes = 4; break;
- case X86::VBROADCASTF32X8rm: NumLanes = 2; break;
- case X86::VBROADCASTF64X2Z128rm: NumLanes = 2; break;
- case X86::VBROADCASTF64X2rm: NumLanes = 4; break;
- case X86::VBROADCASTF64X4rm: NumLanes = 2; break;
- case X86::VBROADCASTI32X4Z256rm: NumLanes = 2; break;
- case X86::VBROADCASTI32X4rm: NumLanes = 4; break;
- case X86::VBROADCASTI32X8rm: NumLanes = 2; break;
- case X86::VBROADCASTI64X2Z128rm: NumLanes = 2; break;
- case X86::VBROADCASTI64X2rm: NumLanes = 4; break;
- case X86::VBROADCASTI64X4rm: NumLanes = 2; break;
+ case X86::VBROADCASTF128: NumLanes = 2; break;
+ case X86::VBROADCASTI128: NumLanes = 2; break;
+ case X86::VBROADCASTF32X4Z256rm: NumLanes = 2; break;
+ case X86::VBROADCASTF32X4rm: NumLanes = 4; break;
+ case X86::VBROADCASTF32X8rm: NumLanes = 2; break;
+ case X86::VBROADCASTF64X2Z128rm: NumLanes = 2; break;
+ case X86::VBROADCASTF64X2rm: NumLanes = 4; break;
+ case X86::VBROADCASTF64X4rm: NumLanes = 2; break;
+ case X86::VBROADCASTI32X4Z256rm: NumLanes = 2; break;
+ case X86::VBROADCASTI32X4rm: NumLanes = 4; break;
+ case X86::VBROADCASTI32X8rm: NumLanes = 2; break;
+ case X86::VBROADCASTI64X2Z128rm: NumLanes = 2; break;
+ case X86::VBROADCASTI64X2rm: NumLanes = 4; break;
+ case X86::VBROADCASTI64X4rm: NumLanes = 2; break;
}
std::string Comment;
@@ -1890,7 +2124,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) {
CS << "[";
for (int l = 0; l != NumLanes; ++l) {
- for (int i = 0, NumElements = CDS->getNumElements(); i < NumElements; ++i) {
+ for (int i = 0, NumElements = CDS->getNumElements(); i < NumElements;
+ ++i) {
if (i != 0 || l != 0)
CS << ",";
if (CDS->getElementType()->isIntegerTy())
@@ -1908,7 +2143,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
} else if (auto *CV = dyn_cast<ConstantVector>(C)) {
CS << "<";
for (int l = 0; l != NumLanes; ++l) {
- for (int i = 0, NumOperands = CV->getNumOperands(); i < NumOperands; ++i) {
+ for (int i = 0, NumOperands = CV->getNumOperands(); i < NumOperands;
+ ++i) {
if (i != 0 || l != 0)
CS << ",";
printConstant(CV->getOperand(i), CS);
diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h
index d517d82537a7..e1183bd14796 100644
--- a/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/lib/Target/X86/X86MachineFunctionInfo.h
@@ -16,7 +16,7 @@
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/Support/MachineValueType.h"
namespace llvm {
@@ -49,7 +49,7 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
/// ReturnAddrIndex - FrameIndex for return slot.
int ReturnAddrIndex = 0;
- /// \brief FrameIndex for return slot.
+ /// FrameIndex for the frame address slot.
int FrameAddrIndex = 0;
/// TailCallReturnAddrDelta - The number of bytes by which return address
diff --git a/lib/Target/X86/X86MacroFusion.cpp b/lib/Target/X86/X86MacroFusion.cpp
index 67d95c2233de..df3abb17014d 100644
--- a/lib/Target/X86/X86MacroFusion.cpp
+++ b/lib/Target/X86/X86MacroFusion.cpp
@@ -19,7 +19,7 @@
using namespace llvm;
-/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused
+/// Check if the instr pair, FirstMI and SecondMI, should be fused
/// together. Given SecondMI, when FirstMI is unspecified, then check if
/// SecondMI may be part of a fused pair at all.
static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
@@ -86,7 +86,6 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
case X86::TEST16mr:
case X86::TEST32mr:
case X86::TEST64mr:
- case X86::TEST8ri_NOREX:
case X86::AND16i16:
case X86::AND16ri:
case X86::AND16ri8:
diff --git a/lib/Target/X86/X86OptimizeLEAs.cpp b/lib/Target/X86/X86OptimizeLEAs.cpp
index 1fc6f07b79fa..42db51b3cf01 100644
--- a/lib/Target/X86/X86OptimizeLEAs.cpp
+++ b/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -60,17 +60,17 @@ static cl::opt<bool>
STATISTIC(NumSubstLEAs, "Number of LEA instruction substitutions");
STATISTIC(NumRedundantLEAs, "Number of redundant LEA instructions removed");
-/// \brief Returns true if two machine operands are identical and they are not
+/// Returns true if two machine operands are identical and they are not
/// physical registers.
static inline bool isIdenticalOp(const MachineOperand &MO1,
const MachineOperand &MO2);
-/// \brief Returns true if two address displacement operands are of the same
+/// Returns true if two address displacement operands are of the same
/// type and use the same symbol/index/address regardless of the offset.
static bool isSimilarDispOp(const MachineOperand &MO1,
const MachineOperand &MO2);
-/// \brief Returns true if the instruction is LEA.
+/// Returns true if the instruction is LEA.
static inline bool isLEA(const MachineInstr &MI);
namespace {
@@ -184,7 +184,7 @@ template <> struct DenseMapInfo<MemOpKey> {
} // end namespace llvm
-/// \brief Returns a hash table key based on memory operands of \p MI. The
+/// Returns a hash table key based on memory operands of \p MI. The
/// number of the first memory operand of \p MI is specified through \p N.
static inline MemOpKey getMemOpKey(const MachineInstr &MI, unsigned N) {
assert((isLEA(MI) || MI.mayLoadOrStore()) &&
@@ -242,7 +242,7 @@ public:
StringRef getPassName() const override { return "X86 LEA Optimize"; }
- /// \brief Loop over all of the basic blocks, replacing address
+ /// Loop over all of the basic blocks, replacing address
/// calculations in load and store instructions, if it's already
/// been calculated by LEA. Also, remove redundant LEAs.
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -250,11 +250,11 @@ public:
private:
using MemOpMap = DenseMap<MemOpKey, SmallVector<MachineInstr *, 16>>;
- /// \brief Returns a distance between two instructions inside one basic block.
+ /// Returns a distance between two instructions inside one basic block.
/// Negative result means, that instructions occur in reverse order.
int calcInstrDist(const MachineInstr &First, const MachineInstr &Last);
- /// \brief Choose the best \p LEA instruction from the \p List to replace
+ /// Choose the best \p LEA instruction from the \p List to replace
/// address calculation in \p MI instruction. Return the address displacement
/// and the distance between \p MI and the chosen \p BestLEA in
/// \p AddrDispShift and \p Dist.
@@ -262,25 +262,25 @@ private:
const MachineInstr &MI, MachineInstr *&BestLEA,
int64_t &AddrDispShift, int &Dist);
- /// \brief Returns the difference between addresses' displacements of \p MI1
+ /// Returns the difference between addresses' displacements of \p MI1
/// and \p MI2. The numbers of the first memory operands for the instructions
/// are specified through \p N1 and \p N2.
int64_t getAddrDispShift(const MachineInstr &MI1, unsigned N1,
const MachineInstr &MI2, unsigned N2) const;
- /// \brief Returns true if the \p Last LEA instruction can be replaced by the
+ /// Returns true if the \p Last LEA instruction can be replaced by the
/// \p First. The difference between displacements of the addresses calculated
/// by these LEAs is returned in \p AddrDispShift. It'll be used for proper
/// replacement of the \p Last LEA's uses with the \p First's def register.
bool isReplaceable(const MachineInstr &First, const MachineInstr &Last,
int64_t &AddrDispShift) const;
- /// \brief Find all LEA instructions in the basic block. Also, assign position
+ /// Find all LEA instructions in the basic block. Also, assign position
/// numbers to all instructions in the basic block to speed up calculation of
/// distance between them.
void findLEAs(const MachineBasicBlock &MBB, MemOpMap &LEAs);
- /// \brief Removes redundant address calculations.
+ /// Removes redundant address calculations.
bool removeRedundantAddrCalc(MemOpMap &LEAs);
/// Replace debug value MI with a new debug value instruction using register
@@ -289,7 +289,7 @@ private:
MachineInstr *replaceDebugValue(MachineInstr &MI, unsigned VReg,
int64_t AddrDispShift);
- /// \brief Removes LEAs which calculate similar addresses.
+ /// Removes LEAs which calculate similar addresses.
bool removeRedundantLEAs(MemOpMap &LEAs);
DenseMap<const MachineInstr *, unsigned> InstrPos;
@@ -541,7 +541,7 @@ bool OptimizeLEAPass::removeRedundantAddrCalc(MemOpMap &LEAs) {
MRI->clearKillFlags(DefMI->getOperand(0).getReg());
++NumSubstLEAs;
- DEBUG(dbgs() << "OptimizeLEAs: Candidate to replace: "; MI.dump(););
+ LLVM_DEBUG(dbgs() << "OptimizeLEAs: Candidate to replace: "; MI.dump(););
// Change instruction operands.
MI.getOperand(MemOpNo + X86::AddrBaseReg)
@@ -553,7 +553,7 @@ bool OptimizeLEAPass::removeRedundantAddrCalc(MemOpMap &LEAs) {
MI.getOperand(MemOpNo + X86::AddrSegmentReg)
.ChangeToRegister(X86::NoRegister, false);
- DEBUG(dbgs() << "OptimizeLEAs: Replaced by: "; MI.dump(););
+ LLVM_DEBUG(dbgs() << "OptimizeLEAs: Replaced by: "; MI.dump(););
Changed = true;
}
@@ -649,7 +649,8 @@ bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) {
MRI->clearKillFlags(FirstVReg);
++NumRedundantLEAs;
- DEBUG(dbgs() << "OptimizeLEAs: Remove redundant LEA: "; Last.dump(););
+ LLVM_DEBUG(dbgs() << "OptimizeLEAs: Remove redundant LEA: ";
+ Last.dump(););
// By this moment, all of the Last LEA's uses must be replaced. So we
// can freely remove it.
diff --git a/lib/Target/X86/X86PadShortFunction.cpp b/lib/Target/X86/X86PadShortFunction.cpp
index 1da0fad8b6cf..85b9aecc2106 100644
--- a/lib/Target/X86/X86PadShortFunction.cpp
+++ b/lib/Target/X86/X86PadShortFunction.cpp
@@ -21,7 +21,7 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -49,7 +49,7 @@ namespace {
struct PadShortFunc : public MachineFunctionPass {
static char ID;
PadShortFunc() : MachineFunctionPass(ID)
- , Threshold(4), STI(nullptr), TII(nullptr) {}
+ , Threshold(4) {}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -82,8 +82,7 @@ namespace {
// VisitedBBs - Cache of previously visited BBs.
DenseMap<MachineBasicBlock*, VisitedBBInfo> VisitedBBs;
- const X86Subtarget *STI;
- const TargetInstrInfo *TII;
+ TargetSchedModel TSM;
};
char PadShortFunc::ID = 0;
@@ -99,15 +98,13 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
- if (MF.getFunction().optForSize()) {
+ if (MF.getFunction().optForSize())
return false;
- }
- STI = &MF.getSubtarget<X86Subtarget>();
- if (!STI->padShortFunctions())
+ if (!MF.getSubtarget<X86Subtarget>().padShortFunctions())
return false;
- TII = STI->getInstrInfo();
+ TSM.init(&MF.getSubtarget());
// Search through basic blocks and mark the ones that have early returns
ReturnBBs.clear();
@@ -132,7 +129,7 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
"Basic block should contain at least a RET but is empty");
MachineBasicBlock::iterator ReturnLoc = --MBB->end();
- while (ReturnLoc->isDebugValue())
+ while (ReturnLoc->isDebugInstr())
--ReturnLoc;
assert(ReturnLoc->isReturn() && !ReturnLoc->isCall() &&
"Basic block does not end with RET");
@@ -195,7 +192,7 @@ bool PadShortFunc::cyclesUntilReturn(MachineBasicBlock *MBB,
return true;
}
- CyclesToEnd += TII->getInstrLatency(STI->getInstrItineraryData(), MI);
+ CyclesToEnd += TSM.computeInstrLatency(&MI);
}
VisitedBBs[MBB] = VisitedBBInfo(false, CyclesToEnd);
@@ -209,9 +206,8 @@ void PadShortFunc::addPadding(MachineBasicBlock *MBB,
MachineBasicBlock::iterator &MBBI,
unsigned int NOOPsToAdd) {
DebugLoc DL = MBBI->getDebugLoc();
+ unsigned IssueWidth = TSM.getIssueWidth();
- while (NOOPsToAdd-- > 0) {
- BuildMI(*MBB, MBBI, DL, TII->get(X86::NOOP));
- BuildMI(*MBB, MBBI, DL, TII->get(X86::NOOP));
- }
+ for (unsigned i = 0, e = IssueWidth * NOOPsToAdd; i != e; ++i)
+ BuildMI(*MBB, MBBI, DL, TSM.getInstrInfo()->get(X86::NOOP));
}
diff --git a/lib/Target/X86/X86PfmCounters.td b/lib/Target/X86/X86PfmCounters.td
new file mode 100644
index 000000000000..093fbafa3fba
--- /dev/null
+++ b/lib/Target/X86/X86PfmCounters.td
@@ -0,0 +1,77 @@
+//===-- X86PfmCounters.td - X86 Hardware Counters ----------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the available hardware counters for various subtargets.
+//
+//===----------------------------------------------------------------------===//
+
+let SchedModel = SandyBridgeModel in {
+def SBCycleCounter : PfmCycleCounter<"unhalted_core_cycles">;
+def SBPort0Counter : PfmIssueCounter<SBPort0, ["uops_dispatched_port:port_0"]>;
+def SBPort1Counter : PfmIssueCounter<SBPort1, ["uops_dispatched_port:port_1"]>;
+def SBPort23Counter : PfmIssueCounter<SBPort23,
+ ["uops_dispatched_port:port_2",
+ "uops_dispatched_port:port_3"]>;
+def SBPort4Counter : PfmIssueCounter<SBPort4, ["uops_dispatched_port:port_4"]>;
+def SBPort5Counter : PfmIssueCounter<SBPort5, ["uops_dispatched_port:port_5"]>;
+}
+
+let SchedModel = HaswellModel in {
+def HWCycleCounter : PfmCycleCounter<"unhalted_core_cycles">;
+def HWPort0Counter : PfmIssueCounter<HWPort0, ["uops_dispatched_port:port_0"]>;
+def HWPort1Counter : PfmIssueCounter<HWPort1, ["uops_dispatched_port:port_1"]>;
+def HWPort2Counter : PfmIssueCounter<HWPort2, ["uops_dispatched_port:port_2"]>;
+def HWPort3Counter : PfmIssueCounter<HWPort3, ["uops_dispatched_port:port_3"]>;
+def HWPort4Counter : PfmIssueCounter<HWPort4, ["uops_dispatched_port:port_4"]>;
+def HWPort5Counter : PfmIssueCounter<HWPort5, ["uops_dispatched_port:port_5"]>;
+def HWPort6Counter : PfmIssueCounter<HWPort6, ["uops_dispatched_port:port_6"]>;
+def HWPort7Counter : PfmIssueCounter<HWPort7, ["uops_dispatched_port:port_7"]>;
+}
+
+let SchedModel = BroadwellModel in {
+def BWCycleCounter : PfmCycleCounter<"unhalted_core_cycles">;
+def BWPort0Counter : PfmIssueCounter<BWPort0, ["uops_executed_port:port_0"]>;
+def BWPort1Counter : PfmIssueCounter<BWPort1, ["uops_executed_port:port_1"]>;
+def BWPort2Counter : PfmIssueCounter<BWPort2, ["uops_executed_port:port_2"]>;
+def BWPort3Counter : PfmIssueCounter<BWPort3, ["uops_executed_port:port_3"]>;
+def BWPort4Counter : PfmIssueCounter<BWPort4, ["uops_executed_port:port_4"]>;
+def BWPort5Counter : PfmIssueCounter<BWPort5, ["uops_executed_port:port_5"]>;
+def BWPort6Counter : PfmIssueCounter<BWPort6, ["uops_executed_port:port_6"]>;
+def BWPort7Counter : PfmIssueCounter<BWPort7, ["uops_executed_port:port_7"]>;
+}
+
+let SchedModel = SkylakeClientModel in {
+def SKLCycleCounter : PfmCycleCounter<"unhalted_core_cycles">;
+def SKLPort0Counter : PfmIssueCounter<SKLPort0, ["uops_dispatched_port:port_0"]>;
+def SKLPort1Counter : PfmIssueCounter<SKLPort1, ["uops_dispatched_port:port_1"]>;
+def SKLPort2Counter : PfmIssueCounter<SKLPort2, ["uops_dispatched_port:port_2"]>;
+def SKLPort3Counter : PfmIssueCounter<SKLPort3, ["uops_dispatched_port:port_3"]>;
+def SKLPort4Counter : PfmIssueCounter<SKLPort4, ["uops_dispatched_port:port_4"]>;
+def SKLPort5Counter : PfmIssueCounter<SKLPort5, ["uops_dispatched_port:port_5"]>;
+def SKLPort6Counter : PfmIssueCounter<SKLPort6, ["uops_dispatched_port:port_6"]>;
+def SKLPort7Counter : PfmIssueCounter<SKLPort7, ["uops_dispatched_port:port_7"]>;
+}
+
+let SchedModel = SkylakeServerModel in {
+def SKXCycleCounter : PfmCycleCounter<"unhalted_core_cycles">;
+def SKXPort0Counter : PfmIssueCounter<SKXPort0, ["uops_dispatched_port:port_0"]>;
+def SKXPort1Counter : PfmIssueCounter<SKXPort1, ["uops_dispatched_port:port_1"]>;
+def SKXPort2Counter : PfmIssueCounter<SKXPort2, ["uops_dispatched_port:port_2"]>;
+def SKXPort3Counter : PfmIssueCounter<SKXPort3, ["uops_dispatched_port:port_3"]>;
+def SKXPort4Counter : PfmIssueCounter<SKXPort4, ["uops_dispatched_port:port_4"]>;
+def SKXPort5Counter : PfmIssueCounter<SKXPort5, ["uops_dispatched_port:port_5"]>;
+def SKXPort6Counter : PfmIssueCounter<SKXPort6, ["uops_dispatched_port:port_6"]>;
+def SKXPort7Counter : PfmIssueCounter<SKXPort7, ["uops_dispatched_port:port_7"]>;
+}
+
+let SchedModel = BtVer2Model in {
+def JCycleCounter : PfmCycleCounter<"cpu_clk_unhalted">;
+def JFPU0Counter : PfmIssueCounter<JFPU0, ["dispatched_fpu:pipe0"]>;
+def JFPU1Counter : PfmIssueCounter<JFPU1, ["dispatched_fpu:pipe1"]>;
+}
diff --git a/lib/Target/X86/X86RegisterBankInfo.cpp b/lib/Target/X86/X86RegisterBankInfo.cpp
index aa0e3743c948..246d6d5a58d0 100644
--- a/lib/Target/X86/X86RegisterBankInfo.cpp
+++ b/lib/Target/X86/X86RegisterBankInfo.cpp
@@ -73,6 +73,8 @@ X86GenRegisterBankInfo::getPartialMappingIdx(const LLT &Ty, bool isFP) {
return PMI_GPR32;
case 64:
return PMI_GPR64;
+ case 128:
+ return PMI_VEC128;
break;
default:
llvm_unreachable("Unsupported register size.");
@@ -83,6 +85,8 @@ X86GenRegisterBankInfo::getPartialMappingIdx(const LLT &Ty, bool isFP) {
return PMI_FP32;
case 64:
return PMI_FP64;
+ case 128:
+ return PMI_VEC128;
default:
llvm_unreachable("Unsupported register size.");
}
@@ -169,6 +173,10 @@ X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
switch (Opc) {
case TargetOpcode::G_ADD:
case TargetOpcode::G_SUB:
+ case TargetOpcode::G_MUL:
+ case TargetOpcode::G_SHL:
+ case TargetOpcode::G_LSHR:
+ case TargetOpcode::G_ASHR:
return getSameOperandsMapping(MI, false);
break;
case TargetOpcode::G_FADD:
@@ -190,6 +198,34 @@ X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// Instruction having only floating-point operands (all scalars in VECRReg)
getInstrPartialMappingIdxs(MI, MRI, /* isFP */ true, OpRegBankIdx);
break;
+ case TargetOpcode::G_SITOFP: {
+ // Some of the floating-point instructions have mixed GPR and FP operands:
+ // fine-tune the computed mapping.
+ auto &Op0 = MI.getOperand(0);
+ auto &Op1 = MI.getOperand(1);
+ const LLT Ty0 = MRI.getType(Op0.getReg());
+ const LLT Ty1 = MRI.getType(Op1.getReg());
+ OpRegBankIdx[0] = getPartialMappingIdx(Ty0, /* isFP */ true);
+ OpRegBankIdx[1] = getPartialMappingIdx(Ty1, /* isFP */ false);
+ break;
+ }
+ case TargetOpcode::G_TRUNC:
+ case TargetOpcode::G_ANYEXT: {
+ auto &Op0 = MI.getOperand(0);
+ auto &Op1 = MI.getOperand(1);
+ const LLT Ty0 = MRI.getType(Op0.getReg());
+ const LLT Ty1 = MRI.getType(Op1.getReg());
+
+ bool isFPTrunc = (Ty0.getSizeInBits() == 32 || Ty0.getSizeInBits() == 64) &&
+ Ty1.getSizeInBits() == 128 && Opc == TargetOpcode::G_TRUNC;
+ bool isFPAnyExt =
+ Ty0.getSizeInBits() == 128 &&
+ (Ty1.getSizeInBits() == 32 || Ty1.getSizeInBits() == 64) &&
+ Opc == TargetOpcode::G_ANYEXT;
+
+ getInstrPartialMappingIdxs(MI, MRI, /* isFP */ isFPTrunc || isFPAnyExt,
+ OpRegBankIdx);
+ } break;
default:
// Track the bank of each register, use NotFP mapping (all scalars in GPRs)
getInstrPartialMappingIdxs(MI, MRI, /* isFP */ false, OpRegBankIdx);
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index bc31e95aa6b5..55842a4a2091 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -75,7 +75,7 @@ X86RegisterInfo::X86RegisterInfo(const Triple &TT)
bool
X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
- // ExecutionDepsFixer and PostRAScheduler require liveness.
+ // ExecutionDomainFix, BreakFalseDeps and PostRAScheduler require liveness.
return true;
}
@@ -552,6 +552,10 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(X86::DIL);
Reserved.set(X86::BPL);
Reserved.set(X86::SPL);
+ Reserved.set(X86::SIH);
+ Reserved.set(X86::DIH);
+ Reserved.set(X86::BPH);
+ Reserved.set(X86::SPH);
for (unsigned n = 0; n != 8; ++n) {
// R8, R9, ...
@@ -571,7 +575,8 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
}
assert(checkAllSuperRegsMarked(Reserved,
- {X86::SIL, X86::DIL, X86::BPL, X86::SPL}));
+ {X86::SIL, X86::DIL, X86::BPL, X86::SPL,
+ X86::SIH, X86::DIH, X86::BPH, X86::SPH}));
return Reserved;
}
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index 2341e1fb0fac..ee9e7891f9f6 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -21,12 +21,14 @@ class X86Reg<string n, bits<16> Enc, list<Register> subregs = []> : Register<n>
// Subregister indices.
let Namespace = "X86" in {
- def sub_8bit : SubRegIndex<8>;
- def sub_8bit_hi : SubRegIndex<8, 8>;
- def sub_16bit : SubRegIndex<16>;
- def sub_32bit : SubRegIndex<32>;
- def sub_xmm : SubRegIndex<128>;
- def sub_ymm : SubRegIndex<256>;
+ def sub_8bit : SubRegIndex<8>;
+ def sub_8bit_hi : SubRegIndex<8, 8>;
+ def sub_8bit_hi_phony : SubRegIndex<8, 8>;
+ def sub_16bit : SubRegIndex<16>;
+ def sub_16bit_hi : SubRegIndex<16, 16>;
+ def sub_32bit : SubRegIndex<32>;
+ def sub_xmm : SubRegIndex<128>;
+ def sub_ymm : SubRegIndex<256>;
}
//===----------------------------------------------------------------------===//
@@ -73,6 +75,40 @@ def R14B : X86Reg<"r14b", 14>;
def R15B : X86Reg<"r15b", 15>;
}
+let isArtificial = 1 in {
+// High byte of the low 16 bits of the super-register:
+def SIH : X86Reg<"", -1>;
+def DIH : X86Reg<"", -1>;
+def BPH : X86Reg<"", -1>;
+def SPH : X86Reg<"", -1>;
+def R8BH : X86Reg<"", -1>;
+def R9BH : X86Reg<"", -1>;
+def R10BH : X86Reg<"", -1>;
+def R11BH : X86Reg<"", -1>;
+def R12BH : X86Reg<"", -1>;
+def R13BH : X86Reg<"", -1>;
+def R14BH : X86Reg<"", -1>;
+def R15BH : X86Reg<"", -1>;
+// High word of the low 32 bits of the super-register:
+def HAX : X86Reg<"", -1>;
+def HDX : X86Reg<"", -1>;
+def HCX : X86Reg<"", -1>;
+def HBX : X86Reg<"", -1>;
+def HSI : X86Reg<"", -1>;
+def HDI : X86Reg<"", -1>;
+def HBP : X86Reg<"", -1>;
+def HSP : X86Reg<"", -1>;
+def HIP : X86Reg<"", -1>;
+def R8WH : X86Reg<"", -1>;
+def R9WH : X86Reg<"", -1>;
+def R10WH : X86Reg<"", -1>;
+def R11WH : X86Reg<"", -1>;
+def R12WH : X86Reg<"", -1>;
+def R13WH : X86Reg<"", -1>;
+def R14WH : X86Reg<"", -1>;
+def R15WH : X86Reg<"", -1>;
+}
+
// 16-bit registers
let SubRegIndices = [sub_8bit, sub_8bit_hi], CoveredBySubRegs = 1 in {
def AX : X86Reg<"ax", 0, [AL,AH]>;
@@ -80,49 +116,52 @@ def DX : X86Reg<"dx", 2, [DL,DH]>;
def CX : X86Reg<"cx", 1, [CL,CH]>;
def BX : X86Reg<"bx", 3, [BL,BH]>;
}
-let SubRegIndices = [sub_8bit] in {
-def SI : X86Reg<"si", 6, [SIL]>;
-def DI : X86Reg<"di", 7, [DIL]>;
-def BP : X86Reg<"bp", 5, [BPL]>;
-def SP : X86Reg<"sp", 4, [SPL]>;
+let SubRegIndices = [sub_8bit, sub_8bit_hi_phony], CoveredBySubRegs = 1 in {
+def SI : X86Reg<"si", 6, [SIL,SIH]>;
+def DI : X86Reg<"di", 7, [DIL,DIH]>;
+def BP : X86Reg<"bp", 5, [BPL,BPH]>;
+def SP : X86Reg<"sp", 4, [SPL,SPH]>;
}
def IP : X86Reg<"ip", 0>;
// X86-64 only, requires REX.
-let SubRegIndices = [sub_8bit], CostPerUse = 1 in {
-def R8W : X86Reg<"r8w", 8, [R8B]>;
-def R9W : X86Reg<"r9w", 9, [R9B]>;
-def R10W : X86Reg<"r10w", 10, [R10B]>;
-def R11W : X86Reg<"r11w", 11, [R11B]>;
-def R12W : X86Reg<"r12w", 12, [R12B]>;
-def R13W : X86Reg<"r13w", 13, [R13B]>;
-def R14W : X86Reg<"r14w", 14, [R14B]>;
-def R15W : X86Reg<"r15w", 15, [R15B]>;
+let SubRegIndices = [sub_8bit, sub_8bit_hi_phony], CostPerUse = 1,
+ CoveredBySubRegs = 1 in {
+def R8W : X86Reg<"r8w", 8, [R8B,R8BH]>;
+def R9W : X86Reg<"r9w", 9, [R9B,R9BH]>;
+def R10W : X86Reg<"r10w", 10, [R10B,R10BH]>;
+def R11W : X86Reg<"r11w", 11, [R11B,R11BH]>;
+def R12W : X86Reg<"r12w", 12, [R12B,R12BH]>;
+def R13W : X86Reg<"r13w", 13, [R13B,R13BH]>;
+def R14W : X86Reg<"r14w", 14, [R14B,R14BH]>;
+def R15W : X86Reg<"r15w", 15, [R15B,R15BH]>;
}
// 32-bit registers
-let SubRegIndices = [sub_16bit] in {
-def EAX : X86Reg<"eax", 0, [AX]>, DwarfRegNum<[-2, 0, 0]>;
-def EDX : X86Reg<"edx", 2, [DX]>, DwarfRegNum<[-2, 2, 2]>;
-def ECX : X86Reg<"ecx", 1, [CX]>, DwarfRegNum<[-2, 1, 1]>;
-def EBX : X86Reg<"ebx", 3, [BX]>, DwarfRegNum<[-2, 3, 3]>;
-def ESI : X86Reg<"esi", 6, [SI]>, DwarfRegNum<[-2, 6, 6]>;
-def EDI : X86Reg<"edi", 7, [DI]>, DwarfRegNum<[-2, 7, 7]>;
-def EBP : X86Reg<"ebp", 5, [BP]>, DwarfRegNum<[-2, 4, 5]>;
-def ESP : X86Reg<"esp", 4, [SP]>, DwarfRegNum<[-2, 5, 4]>;
-def EIP : X86Reg<"eip", 0, [IP]>, DwarfRegNum<[-2, 8, 8]>;
+let SubRegIndices = [sub_16bit, sub_16bit_hi], CoveredBySubRegs = 1 in {
+def EAX : X86Reg<"eax", 0, [AX, HAX]>, DwarfRegNum<[-2, 0, 0]>;
+def EDX : X86Reg<"edx", 2, [DX, HDX]>, DwarfRegNum<[-2, 2, 2]>;
+def ECX : X86Reg<"ecx", 1, [CX, HCX]>, DwarfRegNum<[-2, 1, 1]>;
+def EBX : X86Reg<"ebx", 3, [BX, HBX]>, DwarfRegNum<[-2, 3, 3]>;
+def ESI : X86Reg<"esi", 6, [SI, HSI]>, DwarfRegNum<[-2, 6, 6]>;
+def EDI : X86Reg<"edi", 7, [DI, HDI]>, DwarfRegNum<[-2, 7, 7]>;
+def EBP : X86Reg<"ebp", 5, [BP, HBP]>, DwarfRegNum<[-2, 4, 5]>;
+def ESP : X86Reg<"esp", 4, [SP, HSP]>, DwarfRegNum<[-2, 5, 4]>;
+def EIP : X86Reg<"eip", 0, [IP, HIP]>, DwarfRegNum<[-2, 8, 8]>;
+}
// X86-64 only, requires REX
-let CostPerUse = 1 in {
-def R8D : X86Reg<"r8d", 8, [R8W]>;
-def R9D : X86Reg<"r9d", 9, [R9W]>;
-def R10D : X86Reg<"r10d", 10, [R10W]>;
-def R11D : X86Reg<"r11d", 11, [R11W]>;
-def R12D : X86Reg<"r12d", 12, [R12W]>;
-def R13D : X86Reg<"r13d", 13, [R13W]>;
-def R14D : X86Reg<"r14d", 14, [R14W]>;
-def R15D : X86Reg<"r15d", 15, [R15W]>;
-}}
+let SubRegIndices = [sub_16bit, sub_16bit_hi], CostPerUse = 1,
+ CoveredBySubRegs = 1 in {
+def R8D : X86Reg<"r8d", 8, [R8W,R8WH]>;
+def R9D : X86Reg<"r9d", 9, [R9W,R9WH]>;
+def R10D : X86Reg<"r10d", 10, [R10W,R10WH]>;
+def R11D : X86Reg<"r11d", 11, [R11W,R11WH]>;
+def R12D : X86Reg<"r12d", 12, [R12W,R12WH]>;
+def R13D : X86Reg<"r13d", 13, [R13W,R13WH]>;
+def R14D : X86Reg<"r14d", 14, [R14W,R14WH]>;
+def R15D : X86Reg<"r15d", 15, [R15W,R15WH]>;
+}
// 64-bit registers, X86-64 only
let SubRegIndices = [sub_32bit] in {
@@ -251,9 +290,19 @@ def ST7 : X86Reg<"st(7)", 7>, DwarfRegNum<[40, 19, 18]>;
// Floating-point status word
def FPSW : X86Reg<"fpsw", 0>;
-// Status flags register
+// Status flags register.
+//
+// Note that some flags that are commonly thought of as part of the status
+// flags register are modeled separately. Typically this is due to instructions
+// reading and updating those flags independently of all the others. We don't
+// want to create false dependencies between these instructions and so we use
+// a separate register to model them.
def EFLAGS : X86Reg<"flags", 0>;
+// The direction flag.
+def DF : X86Reg<"dirflag", 0>;
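// For example, CLD, STD and the REP string operations read or write only the
// direction flag; modeling DF inside EFLAGS would spuriously serialize them
// against ordinary arithmetic that defines the other status flags.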
+
+
// Segment registers
def CS : X86Reg<"cs", 1>;
def DS : X86Reg<"ds", 3>;
@@ -337,10 +386,21 @@ def GR8 : RegisterClass<"X86", [i8], 8,
}];
}
+let isAllocatable = 0 in
+def GRH8 : RegisterClass<"X86", [i8], 8,
+ (add SIH, DIH, BPH, SPH, R8BH, R9BH, R10BH, R11BH,
+ R12BH, R13BH, R14BH, R15BH)>;
+
def GR16 : RegisterClass<"X86", [i16], 16,
(add AX, CX, DX, SI, DI, BX, BP, SP,
R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W)>;
+let isAllocatable = 0 in
+def GRH16 : RegisterClass<"X86", [i16], 16,
+ (add HAX, HCX, HDX, HSI, HDI, HBX, HBP, HSP, HIP,
+ R8WH, R9WH, R10WH, R11WH, R12WH, R13WH, R14WH,
+ R15WH)>;
+
def GR32 : RegisterClass<"X86", [i32], 32,
(add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP,
R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D)>;
@@ -400,11 +460,6 @@ def GR32_NOREX : RegisterClass<"X86", [i32], 32,
def GR64_NOREX : RegisterClass<"X86", [i64], 64,
(add RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP)>;
-// GR32_NOAX - GR32 registers except EAX. Used by AddRegFrm of XCHG32 in 64-bit
-// mode to prevent encoding using the 0x90 NOP encoding. xchg %eax, %eax needs
-// to clear upper 32-bits of RAX so is not a NOP.
-def GR32_NOAX : RegisterClass<"X86", [i32], 32, (sub GR32, EAX)>;
-
// GR32_NOSP - GR32 registers except ESP.
def GR32_NOSP : RegisterClass<"X86", [i32], 32, (sub GR32, ESP)>;
@@ -449,8 +504,6 @@ def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>;
def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>;
-def FR128 : RegisterClass<"X86", [i128, f128], 128, (add FR32)>;
-
// FIXME: This sets up the floating point register files as though they are f64
// values, though they really are f80 values. This will cause us to spill
@@ -472,16 +525,16 @@ def RST : RegisterClass<"X86", [f80, f64, f32], 32, (sequence "ST%u", 0, 7)> {
// Generic vector registers: VR64 and VR128.
// Ensure that float types are declared first - only float is legal on SSE1.
def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>;
-def VR128 : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64],
+def VR128 : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128],
128, (add FR32)>;
def VR256 : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
256, (sequence "YMM%u", 0, 15)>;
// Special classes that help the assembly parser choose some alternate
// instructions to favor 2-byte VEX encodings.
-def VR128L : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64],
+def VR128L : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128],
128, (sequence "XMM%u", 0, 7)>;
-def VR128H : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64],
+def VR128H : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128],
128, (sequence "XMM%u", 8, 15)>;
def VR256L : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
256, (sequence "YMM%u", 0, 7)>;
@@ -497,6 +550,10 @@ def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> {
let CopyCost = -1; // Don't allow copying of status registers.
let isAllocatable = 0;
}
+def DFCCR : RegisterClass<"X86", [i32], 32, (add DF)> {
+ let CopyCost = -1; // Don't allow copying of status registers.
+ let isAllocatable = 0;
+}
// AVX-512 vector/mask registers.
def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64],
@@ -508,7 +565,7 @@ def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>;
def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>;
// Extended VR128 and VR256 for AVX-512 instructions
-def VR128X : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64],
+def VR128X : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128],
128, (add FR32X)>;
def VR256X : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
256, (sequence "YMM%u", 0, 31)>;
diff --git a/lib/Target/X86/X86RetpolineThunks.cpp b/lib/Target/X86/X86RetpolineThunks.cpp
new file mode 100644
index 000000000000..250deb3523b4
--- /dev/null
+++ b/lib/Target/X86/X86RetpolineThunks.cpp
@@ -0,0 +1,274 @@
+//======- X86RetpolineThunks.cpp - Construct retpoline thunks for x86 --=====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// Pass that injects an MI thunk implementing a "retpoline". This is
+/// a RET-implemented trampoline that is used to lower indirect calls in a way
+/// that prevents speculation on some x86 processors and can be used to mitigate
+/// security vulnerabilities due to targeted speculative execution and side
+/// channels such as CVE-2017-5715.
+///
+/// TODO(chandlerc): All of this code could use better comments and
+/// documentation.
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-retpoline-thunks"
+
+static const char ThunkNamePrefix[] = "__llvm_retpoline_";
+static const char R11ThunkName[] = "__llvm_retpoline_r11";
+static const char EAXThunkName[] = "__llvm_retpoline_eax";
+static const char ECXThunkName[] = "__llvm_retpoline_ecx";
+static const char EDXThunkName[] = "__llvm_retpoline_edx";
+static const char EDIThunkName[] = "__llvm_retpoline_edi";
+
+namespace {
+class X86RetpolineThunks : public MachineFunctionPass {
+public:
+ static char ID;
+
+ X86RetpolineThunks() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override { return "X86 Retpoline Thunks"; }
+
+ bool doInitialization(Module &M) override;
+ bool runOnMachineFunction(MachineFunction &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<MachineModuleInfo>();
+ AU.addPreserved<MachineModuleInfo>();
+ }
+
+private:
+ MachineModuleInfo *MMI;
+ const TargetMachine *TM;
+ bool Is64Bit;
+ const X86Subtarget *STI;
+ const X86InstrInfo *TII;
+
+ bool InsertedThunks;
+
+ void createThunkFunction(Module &M, StringRef Name);
+ void insertRegReturnAddrClobber(MachineBasicBlock &MBB, unsigned Reg);
+ void populateThunk(MachineFunction &MF, Optional<unsigned> Reg = None);
+};
+
+} // end anonymous namespace
+
+FunctionPass *llvm::createX86RetpolineThunksPass() {
+ return new X86RetpolineThunks();
+}
+
+char X86RetpolineThunks::ID = 0;
+
+bool X86RetpolineThunks::doInitialization(Module &M) {
+ InsertedThunks = false;
+ return false;
+}
+
+bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << getPassName() << '\n');
+
+ TM = &MF.getTarget();
+ STI = &MF.getSubtarget<X86Subtarget>();
+ TII = STI->getInstrInfo();
+ Is64Bit = TM->getTargetTriple().getArch() == Triple::x86_64;
+
+ MMI = &getAnalysis<MachineModuleInfo>();
+ Module &M = const_cast<Module &>(*MMI->getModule());
+
+ // If this function is not a thunk, check to see if we need to insert
+ // a thunk.
+ if (!MF.getName().startswith(ThunkNamePrefix)) {
+ // If we've already inserted a thunk, nothing else to do.
+ if (InsertedThunks)
+ return false;
+
+ // Only add a thunk if one of the functions has the retpoline feature
+ // enabled in its subtarget, and doesn't enable external thunks.
+ // FIXME: Conditionalize on indirect calls so we don't emit a thunk when
+ // nothing will end up calling it.
+ // FIXME: It's a little silly to look at every function just to enumerate
+ // the subtargets, but eventually we'll want to look at them for indirect
+ // calls, so maybe this is OK.
+ if (!STI->useRetpoline() || STI->useRetpolineExternalThunk())
+ return false;
+
+ // Otherwise, we need to insert the thunk.
+ // WARNING: This is not really a well behaving thing to do in a function
+ // pass. We extract the module and insert a new function (and machine
+ // function) directly into the module.
+ if (Is64Bit)
+ createThunkFunction(M, R11ThunkName);
+ else
+ for (StringRef Name :
+ {EAXThunkName, ECXThunkName, EDXThunkName, EDIThunkName})
+ createThunkFunction(M, Name);
+ InsertedThunks = true;
+ return true;
+ }
+
+ // If this *is* a thunk function, we need to populate it with the correct MI.
+ if (Is64Bit) {
+ assert(MF.getName() == "__llvm_retpoline_r11" &&
+ "Should only have an r11 thunk on 64-bit targets");
+
+ // __llvm_retpoline_r11:
+ // callq .Lr11_call_target
+ // .Lr11_capture_spec:
+ // pause
+ // lfence
+ // jmp .Lr11_capture_spec
+ // .align 16
+ // .Lr11_call_target:
+ // movq %r11, (%rsp)
+ // retq
+ populateThunk(MF, X86::R11);
+ } else {
+ // For 32-bit targets we need to emit a collection of thunks for various
+ // possible scratch registers as well as a fallback that uses EDI, which is
+ // normally callee saved.
+ // __llvm_retpoline_eax:
+ // calll .Leax_call_target
+ // .Leax_capture_spec:
+ // pause
+ // jmp .Leax_capture_spec
+ // .align 16
+ // .Leax_call_target:
+ // movl %eax, (%esp) # Clobber return addr
+ // retl
+ //
+ // __llvm_retpoline_ecx:
+ // ... # Same setup
+ // movl %ecx, (%esp)
+ // retl
+ //
+ // __llvm_retpoline_edx:
+ // ... # Same setup
+ // movl %edx, (%esp)
+ // retl
+ //
+ // __llvm_retpoline_edi:
+ // ... # Same setup
+ // movl %edi, (%esp)
+ // retl
+ if (MF.getName() == EAXThunkName)
+ populateThunk(MF, X86::EAX);
+ else if (MF.getName() == ECXThunkName)
+ populateThunk(MF, X86::ECX);
+ else if (MF.getName() == EDXThunkName)
+ populateThunk(MF, X86::EDX);
+ else if (MF.getName() == EDIThunkName)
+ populateThunk(MF, X86::EDI);
+ else
+ llvm_unreachable("Invalid thunk name on x86-32!");
+ }
+
+ return true;
+}
+
+void X86RetpolineThunks::createThunkFunction(Module &M, StringRef Name) {
+ assert(Name.startswith(ThunkNamePrefix) &&
+ "Created a thunk with an unexpected prefix!");
+
+ LLVMContext &Ctx = M.getContext();
+ auto Type = FunctionType::get(Type::getVoidTy(Ctx), false);
+ Function *F =
+ Function::Create(Type, GlobalValue::LinkOnceODRLinkage, Name, &M);
+ F->setVisibility(GlobalValue::HiddenVisibility);
+ F->setComdat(M.getOrInsertComdat(Name));
+
+ // Add Attributes so that we don't create a frame, unwind information, or
+ // inline.
+ AttrBuilder B;
+ B.addAttribute(llvm::Attribute::NoUnwind);
+ B.addAttribute(llvm::Attribute::Naked);
+ F->addAttributes(llvm::AttributeList::FunctionIndex, B);
+
+ // Populate our function a bit so that we can verify.
+ BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", F);
+ IRBuilder<> Builder(Entry);
+
+ Builder.CreateRetVoid();
+
+ // MachineFunctions/MachineBasicBlocks aren't created automatically for the
+ // IR-level constructs we already made. Create them and insert them into the
+ // module.
+ MachineFunction &MF = MMI->getOrCreateMachineFunction(*F);
+ MachineBasicBlock *EntryMBB = MF.CreateMachineBasicBlock(Entry);
+
+ // Insert EntryMBB into MF. It's not in the module until we do this.
+ MF.insert(MF.end(), EntryMBB);
+}
+
+void X86RetpolineThunks::insertRegReturnAddrClobber(MachineBasicBlock &MBB,
+ unsigned Reg) {
+ const unsigned MovOpc = Is64Bit ? X86::MOV64mr : X86::MOV32mr;
+ const unsigned SPReg = Is64Bit ? X86::RSP : X86::ESP;
+ addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(MovOpc)), SPReg, false, 0)
+ .addReg(Reg);
+}
+
+void X86RetpolineThunks::populateThunk(MachineFunction &MF,
+ Optional<unsigned> Reg) {
+ // Set MF properties. We never use vregs...
+ MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs);
+
+ MachineBasicBlock *Entry = &MF.front();
+ Entry->clear();
+
+ MachineBasicBlock *CaptureSpec = MF.CreateMachineBasicBlock(Entry->getBasicBlock());
+ MachineBasicBlock *CallTarget = MF.CreateMachineBasicBlock(Entry->getBasicBlock());
+ MF.push_back(CaptureSpec);
+ MF.push_back(CallTarget);
+
+ const unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32;
+ const unsigned RetOpc = Is64Bit ? X86::RETQ : X86::RETL;
+
+ BuildMI(Entry, DebugLoc(), TII->get(CallOpc)).addMBB(CallTarget);
+ Entry->addSuccessor(CallTarget);
+ Entry->addSuccessor(CaptureSpec);
+ CallTarget->setHasAddressTaken();
+
+ // In the capture loop for speculation, we want to stop the processor from
+ // speculating as fast as possible. On Intel processors, the PAUSE instruction
+ // will block speculation without consuming any execution resources. On AMD
+ // processors, the PAUSE instruction is (essentially) a nop, so we also use an
+ // LFENCE instruction which they have advised will stop speculation as well
+ // with minimal resource utilization. We still end the capture with a jump so
+ // that it forms an infinite loop, which guarantees that, no matter which
+ // implementation of the x86 ISA is executing, speculation down this code path
+ // can never escape.
+ BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::PAUSE));
+ BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::LFENCE));
+ BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::JMP_1)).addMBB(CaptureSpec);
+ CaptureSpec->setHasAddressTaken();
+ CaptureSpec->addSuccessor(CaptureSpec);
+
+ CallTarget->setAlignment(4);
+ insertRegReturnAddrClobber(*CallTarget, *Reg);
+ BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc));
+}
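// For context, a sketch of how callers reach the 64-bit thunk (the call-site
// lowering itself lives elsewhere in the backend): an indirect call such as
//   callq *%rax
// becomes, roughly,
//   movq  %rax, %r11
//   callq __llvm_retpoline_r11
// The thunk then overwrites its own return address with %r11, so its RET
// transfers control to the original target without an indirect branch.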
diff --git a/lib/Target/X86/X86SchedBroadwell.td b/lib/Target/X86/X86SchedBroadwell.td
index e4e0ed435103..c7713fea70fa 100755
--- a/lib/Target/X86/X86SchedBroadwell.td
+++ b/lib/Target/X86/X86SchedBroadwell.td
@@ -11,8 +11,9 @@
// scheduling and other instruction cost heuristics.
//
//===----------------------------------------------------------------------===//
+
def BroadwellModel : SchedMachineModel {
- // All x86 instructions are modeled as a single micro-op, and HW can decode 4
+ // All x86 instructions are modeled as a single micro-op, and BW can decode 4
// instructions per cycle.
let IssueWidth = 4;
let MicroOpBufferSize = 192; // Based on the reorder buffer.
@@ -22,7 +23,7 @@ def BroadwellModel : SchedMachineModel {
// Based on the LSD (loop-stream detector) queue size and benchmarking data.
let LoopMicroOpBufferSize = 50;
- // This flag is set to allow the scheduler to assign a default model to
+ // This flag is set to allow the scheduler to assign a default model to
// unrecognized opcodes.
let CompleteModel = 0;
}
@@ -66,6 +67,11 @@ def BWPortAny : ProcResGroup<[BWPort0, BWPort1, BWPort2, BWPort3, BWPort4,
let BufferSize=60;
}
+// Integer division issued on port 0.
+def BWDivider : ProcResource<1>;
+// FP division and sqrt on port 0.
+def BWFPDivider : ProcResource<1>;
+
// Loads are 5 cycles, so ReadAfterLd registers needn't be available until 5
// cycles after the memory operand.
def : ReadAdvance<ReadAfterLd, 5>;
@@ -76,45 +82,84 @@ def : ReadAdvance<ReadAfterLd, 5>;
// This multiclass defines the resource usage for variants with and without
// folded loads.
multiclass BWWriteResPair<X86FoldableSchedWrite SchedRW,
- ProcResourceKind ExePort,
- int Lat> {
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [1], int UOps = 1,
+ int LoadLat = 5> {
// Register variant is using a single cycle on ExePort.
- def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
- // Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the
- // latency.
- def : WriteRes<SchedRW.Folded, [BWPort23, ExePort]> {
- let Latency = !add(Lat, 5);
+ // Memory variant also uses a cycle on port 2/3 and adds LoadLat cycles to
+ // the latency (default = 5).
+ def : WriteRes<SchedRW.Folded, !listconcat([BWPort23], ExePorts)> {
+ let Latency = !add(Lat, LoadLat);
+ let ResourceCycles = !listconcat([1], Res);
+ let NumMicroOps = !add(UOps, 1);
}
}
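// For illustration (a sketch of how the multiclass unfolds, assuming the usual
// WriteALU/WriteALULd pairing of register and folded-load scheduling classes),
// an instantiation such as
//   defm : BWWriteResPair<WriteALU, [BWPort0156], 1>;
// is roughly equivalent to:
//   def : WriteRes<WriteALU,   [BWPort0156]>           { let Latency = 1; }
//   def : WriteRes<WriteALULd, [BWPort23, BWPort0156]> { let Latency = 6;
//                                                        let NumMicroOps = 2; }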
-// A folded store needs a cycle on port 4 for the store data, but it does not
-// need an extra port 2/3 cycle to recompute the address.
-def : WriteRes<WriteRMW, [BWPort4]>;
+// A folded store needs a cycle on port 4 for the store data, and an extra port
+// 2/3/7 cycle to recompute the address.
+def : WriteRes<WriteRMW, [BWPort237,BWPort4]>;
// Arithmetic.
-defm : BWWriteResPair<WriteALU, BWPort0156, 1>; // Simple integer ALU op.
-defm : BWWriteResPair<WriteIMul, BWPort1, 3>; // Integer multiplication.
+defm : BWWriteResPair<WriteALU, [BWPort0156], 1>; // Simple integer ALU op.
+defm : BWWriteResPair<WriteADC, [BWPort06], 1>; // Integer ALU + flags op.
+defm : BWWriteResPair<WriteIMul, [BWPort1], 3>; // Integer multiplication.
+defm : BWWriteResPair<WriteIMul64, [BWPort1], 3>; // Integer 64-bit multiplication.
+defm : BWWriteResPair<WriteDiv8, [BWPort0, BWDivider], 25, [1, 10]>;
+defm : BWWriteResPair<WriteDiv16, [BWPort0, BWDivider], 25, [1, 10]>;
+defm : BWWriteResPair<WriteDiv32, [BWPort0, BWDivider], 25, [1, 10]>;
+defm : BWWriteResPair<WriteDiv64, [BWPort0, BWDivider], 25, [1, 10]>;
+defm : BWWriteResPair<WriteIDiv8, [BWPort0, BWDivider], 25, [1, 10]>;
+defm : BWWriteResPair<WriteIDiv16, [BWPort0, BWDivider], 25, [1, 10]>;
+defm : BWWriteResPair<WriteIDiv32, [BWPort0, BWDivider], 25, [1, 10]>;
+defm : BWWriteResPair<WriteIDiv64, [BWPort0, BWDivider], 25, [1, 10]>;
+
+defm : BWWriteResPair<WriteBSWAP32,[BWPort15], 1>; // Byte swap (32-bit).
+defm : BWWriteResPair<WriteBSWAP64,[BWPort06, BWPort15], 2, [1, 1], 2>; // Byte swap (64-bit).
+
+defm : BWWriteResPair<WriteCRC32, [BWPort1], 3>;
def : WriteRes<WriteIMulH, []> { let Latency = 3; } // Integer multiplication, high part.
-def BWDivider : ProcResource<1>; // Integer division issued on port 0.
-def : WriteRes<WriteIDiv, [BWPort0, BWDivider]> { // Integer division.
- let Latency = 25;
- let ResourceCycles = [1, 10];
-}
-def : WriteRes<WriteIDivLd, [BWPort23, BWPort0, BWDivider]> {
- let Latency = 29;
- let ResourceCycles = [1, 1, 10];
-}
def : WriteRes<WriteLEA, [BWPort15]>; // LEA instructions can't fold loads.
+defm : BWWriteResPair<WriteCMOV, [BWPort06], 1>; // Conditional move.
+defm : BWWriteResPair<WriteCMOV2, [BWPort06,BWPort0156], 2, [1,1], 2>; // Conditional (CF + ZF flag) move.
+defm : X86WriteRes<WriteFCMOV, [BWPort1], 3, [1], 1>; // x87 conditional move.
+
+def : WriteRes<WriteSETCC, [BWPort06]>; // Setcc.
+def : WriteRes<WriteSETCCStore, [BWPort06,BWPort4,BWPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+def : WriteRes<WriteLAHFSAHF, [BWPort06]>;
+
+// Bit counts.
+defm : BWWriteResPair<WriteBSF, [BWPort1], 3>;
+defm : BWWriteResPair<WriteBSR, [BWPort1], 3>;
+defm : BWWriteResPair<WriteLZCNT, [BWPort1], 3>;
+defm : BWWriteResPair<WriteTZCNT, [BWPort1], 3>;
+defm : BWWriteResPair<WritePOPCNT, [BWPort1], 3>;
+
// Integer shifts and rotates.
-defm : BWWriteResPair<WriteShift, BWPort06, 1>;
+defm : BWWriteResPair<WriteShift, [BWPort06], 1>;
+
+// Double shift instructions.
+defm : BWWriteResPair<WriteShiftDouble, [BWPort06], 1>;
+
+// BMI1 BEXTR, BMI2 BZHI
+defm : BWWriteResPair<WriteBEXTR, [BWPort06,BWPort15], 2, [1,1], 2>;
+defm : BWWriteResPair<WriteBZHI, [BWPort15], 1>;
// Loads, stores, and moves, not folded with other operations.
-def : WriteRes<WriteLoad, [BWPort23]> { let Latency = 5; }
-def : WriteRes<WriteStore, [BWPort237, BWPort4]>;
-def : WriteRes<WriteMove, [BWPort0156]>;
+defm : X86WriteRes<WriteLoad, [BWPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteStore, [BWPort237, BWPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteStoreNT, [BWPort237, BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteMove, [BWPort0156], 1, [1], 1>;
// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
@@ -125,153 +170,367 @@ def : InstRW<[WriteMove], (instrs COPY)>;
// Branches don't produce values, so they have no latency, but they still
// consume resources. Indirect branches can fold loads.
-defm : BWWriteResPair<WriteJump, BWPort06, 1>;
+defm : BWWriteResPair<WriteJump, [BWPort06], 1>;
// Floating point. This covers both scalar and vector operations.
-defm : BWWriteResPair<WriteFAdd, BWPort1, 3>; // Floating point add/sub/compare.
-defm : BWWriteResPair<WriteFMul, BWPort0, 5>; // Floating point multiplication.
-defm : BWWriteResPair<WriteFDiv, BWPort0, 12>; // 10-14 cycles. // Floating point division.
-defm : BWWriteResPair<WriteFSqrt, BWPort0, 15>; // Floating point square root.
-defm : BWWriteResPair<WriteFRcp, BWPort0, 5>; // Floating point reciprocal estimate.
-defm : BWWriteResPair<WriteFRsqrt, BWPort0, 5>; // Floating point reciprocal square root estimate.
-defm : BWWriteResPair<WriteFMA, BWPort01, 5>; // Fused Multiply Add.
-defm : BWWriteResPair<WriteFShuffle, BWPort5, 1>; // Floating point vector shuffles.
-defm : BWWriteResPair<WriteFBlend, BWPort015, 1>; // Floating point vector blends.
-def : WriteRes<WriteFVarBlend, [BWPort5]> { // Fp vector variable blends.
- let Latency = 2;
- let ResourceCycles = [2];
-}
-def : WriteRes<WriteFVarBlendLd, [BWPort5, BWPort23]> {
- let Latency = 6;
- let ResourceCycles = [2, 1];
-}
+defm : X86WriteRes<WriteFLD0, [BWPort01], 1, [1], 1>;
+defm : X86WriteRes<WriteFLD1, [BWPort01], 1, [2], 2>;
+defm : X86WriteRes<WriteFLDC, [BWPort01], 1, [2], 2>;
+defm : X86WriteRes<WriteFLoad, [BWPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteFLoadX, [BWPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteFLoadY, [BWPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteFMaskedLoad, [BWPort23,BWPort5], 7, [1,2], 3>;
+defm : X86WriteRes<WriteFMaskedLoadY, [BWPort23,BWPort5], 8, [1,2], 3>;
+defm : X86WriteRes<WriteFStore, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreX, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreY, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNT, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNTX, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNTY, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStore, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteFMaskedStoreY, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteFMove, [BWPort5], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveX, [BWPort5], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveY, [BWPort5], 1, [1], 1>;
+
+defm : BWWriteResPair<WriteFAdd, [BWPort1], 3, [1], 1, 5>; // Floating point add/sub.
+defm : BWWriteResPair<WriteFAddX, [BWPort1], 3, [1], 1, 5>; // Floating point add/sub (XMM).
+defm : BWWriteResPair<WriteFAddY, [BWPort1], 3, [1], 1, 6>; // Floating point add/sub (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFAddZ>;
+defm : BWWriteResPair<WriteFAdd64, [BWPort1], 3, [1], 1, 5>; // Floating point double add/sub.
+defm : BWWriteResPair<WriteFAdd64X, [BWPort1], 3, [1], 1, 5>; // Floating point double add/sub (XMM).
+defm : BWWriteResPair<WriteFAdd64Y, [BWPort1], 3, [1], 1, 6>; // Floating point double add/sub (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
+
+defm : BWWriteResPair<WriteFCmp, [BWPort1], 3, [1], 1, 5>; // Floating point compare.
+defm : BWWriteResPair<WriteFCmpX, [BWPort1], 3, [1], 1, 5>; // Floating point compare (XMM).
+defm : BWWriteResPair<WriteFCmpY, [BWPort1], 3, [1], 1, 6>; // Floating point compare (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFCmpZ>;
+defm : BWWriteResPair<WriteFCmp64, [BWPort1], 3, [1], 1, 5>; // Floating point double compare.
+defm : BWWriteResPair<WriteFCmp64X, [BWPort1], 3, [1], 1, 5>; // Floating point double compare (XMM).
+defm : BWWriteResPair<WriteFCmp64Y, [BWPort1], 3, [1], 1, 6>; // Floating point double compare (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
+
+defm : BWWriteResPair<WriteFCom, [BWPort1], 3>; // Floating point compare to flags.
+
+defm : BWWriteResPair<WriteFMul, [BWPort01], 3, [1], 1, 5>; // Floating point multiplication.
+defm : BWWriteResPair<WriteFMulX, [BWPort01], 3, [1], 1, 5>; // Floating point multiplication (XMM).
+defm : BWWriteResPair<WriteFMulY, [BWPort01], 3, [1], 1, 6>; // Floating point multiplication (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFMulZ>;
+defm : BWWriteResPair<WriteFMul64, [BWPort01], 3, [1], 1, 5>; // Floating point double multiplication.
+defm : BWWriteResPair<WriteFMul64X, [BWPort01], 3, [1], 1, 5>; // Floating point double multiplication (XMM).
+defm : BWWriteResPair<WriteFMul64Y, [BWPort01], 3, [1], 1, 6>; // Floating point double multiplication (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFMul64Z>;
+
+//defm : BWWriteResPair<WriteFDiv, [BWPort0,BWFPDivider], 11, [1,3], 1, 5>; // Floating point division.
+defm : BWWriteResPair<WriteFDivX, [BWPort0,BWFPDivider], 11, [1,5], 1, 5>; // Floating point division (XMM).
+defm : BWWriteResPair<WriteFDivY, [BWPort0,BWPort015,BWFPDivider], 17, [2,1,10], 3, 6>; // Floating point division (YMM).
+defm : X86WriteResPairUnsupported<WriteFDivZ>;
+//defm : BWWriteResPair<WriteFDiv64, [BWPort0,BWFPDivider], 14, [1,8], 1, 5>; // Floating point division.
+defm : BWWriteResPair<WriteFDiv64X, [BWPort0,BWFPDivider], 14, [1,8], 1, 5>; // Floating point division (XMM).
+defm : BWWriteResPair<WriteFDiv64Y, [BWPort0,BWPort015,BWFPDivider], 23, [2,1,16], 3, 6>; // Floating point division (YMM).
+defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
+
+defm : X86WriteRes<WriteFSqrt, [BWPort0,BWFPDivider], 11, [1,4], 1>; // Floating point square root.
+defm : X86WriteRes<WriteFSqrtLd, [BWPort0,BWPort23,BWFPDivider], 16, [1,1,7], 2>;
+defm : BWWriteResPair<WriteFSqrtX, [BWPort0,BWFPDivider], 11, [1,7], 1, 5>; // Floating point square root (XMM).
+defm : BWWriteResPair<WriteFSqrtY, [BWPort0,BWPort015,BWFPDivider], 21, [2,1,14], 3, 6>; // Floating point square root (YMM).
+defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
+defm : X86WriteRes<WriteFSqrt64, [BWPort0,BWFPDivider], 16, [1,8], 1>; // Floating point double square root.
+defm : X86WriteRes<WriteFSqrt64Ld, [BWPort0,BWPort23,BWFPDivider], 21, [1,1,14], 2>;
+defm : BWWriteResPair<WriteFSqrt64X, [BWPort0,BWFPDivider], 16, [1,14],1, 5>; // Floating point double square root (XMM).
+defm : BWWriteResPair<WriteFSqrt64Y, [BWPort0,BWPort015,BWFPDivider], 29, [2,1,28], 3, 6>; // Floating point double square root (YMM).
+defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
+defm : BWWriteResPair<WriteFSqrt80, [BWPort0,BWFPDivider], 23, [1,9]>; // Floating point long double square root.
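+// Assumed reading of the divider entries above: the BWFPDivider value in
+// ResourceCycles holds the non-pipelined divide/sqrt unit, so e.g. [1,9]
+// means one cycle on BWPort0 plus nine cycles occupying the divider, which
+// is what limits the throughput of back-to-back div/sqrt operations.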
+
+defm : BWWriteResPair<WriteFRcp, [BWPort0], 5, [1], 1, 5>; // Floating point reciprocal estimate.
+defm : BWWriteResPair<WriteFRcpX, [BWPort0], 5, [1], 1, 5>; // Floating point reciprocal estimate (XMM).
+defm : BWWriteResPair<WriteFRcpY, [BWPort0,BWPort015], 11, [2,1], 3, 6>; // Floating point reciprocal estimate (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFRcpZ>;
+
+defm : BWWriteResPair<WriteFRsqrt, [BWPort0], 5, [1], 1, 5>; // Floating point reciprocal square root estimate.
+defm : BWWriteResPair<WriteFRsqrtX,[BWPort0], 5, [1], 1, 5>; // Floating point reciprocal square root estimate (XMM).
+defm : BWWriteResPair<WriteFRsqrtY,[BWPort0,BWPort015], 11, [2,1], 3, 6>; // Floating point reciprocal square root estimate (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
+
+defm : BWWriteResPair<WriteFMA, [BWPort01], 5, [1], 1, 5>; // Fused Multiply Add.
+defm : BWWriteResPair<WriteFMAX, [BWPort01], 5, [1], 1, 5>; // Fused Multiply Add (XMM).
+defm : BWWriteResPair<WriteFMAY, [BWPort01], 5, [1], 1, 6>; // Fused Multiply Add (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFMAZ>;
+defm : BWWriteResPair<WriteDPPD, [BWPort0,BWPort1,BWPort5], 9, [1,1,1], 3, 5>; // Floating point double dot product.
+defm : BWWriteResPair<WriteDPPS, [BWPort0,BWPort1,BWPort5], 14, [2,1,1], 4, 5>; // Floating point single dot product.
+defm : BWWriteResPair<WriteDPPSY, [BWPort0,BWPort1,BWPort5], 14, [2,1,1], 4, 6>; // Floating point single dot product (YMM).
+defm : X86WriteResPairUnsupported<WriteDPPSZ>;
+defm : BWWriteResPair<WriteFSign, [BWPort5], 1>; // Floating point fabs/fchs.
+defm : X86WriteRes<WriteFRnd, [BWPort23], 6, [1], 1>; // Floating point rounding.
+defm : X86WriteRes<WriteFRndY, [BWPort23], 6, [1], 1>; // Floating point rounding (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFRndZ>;
+defm : X86WriteRes<WriteFRndLd, [BWPort1,BWPort23], 11, [2,1], 3>;
+defm : X86WriteRes<WriteFRndYLd, [BWPort1,BWPort23], 12, [2,1], 3>;
+defm : BWWriteResPair<WriteFLogic, [BWPort5], 1, [1], 1, 5>; // Floating point and/or/xor logicals.
+defm : BWWriteResPair<WriteFLogicY, [BWPort5], 1, [1], 1, 6>; // Floating point and/or/xor logicals (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFLogicZ>;
+defm : BWWriteResPair<WriteFTest, [BWPort0], 1, [1], 1, 5>; // Floating point TEST instructions.
+defm : BWWriteResPair<WriteFTestY, [BWPort0], 1, [1], 1, 6>; // Floating point TEST instructions (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFTestZ>;
+defm : BWWriteResPair<WriteFShuffle, [BWPort5], 1, [1], 1, 5>; // Floating point vector shuffles.
+defm : BWWriteResPair<WriteFShuffleY, [BWPort5], 1, [1], 1, 6>; // Floating point vector shuffles (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
+defm : BWWriteResPair<WriteFVarShuffle, [BWPort5], 1, [1], 1, 5>; // Floating point vector variable shuffles.
+defm : BWWriteResPair<WriteFVarShuffleY, [BWPort5], 1, [1], 1, 6>; // Floating point vector variable shuffles.
+defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
+defm : BWWriteResPair<WriteFBlend, [BWPort015], 1, [1], 1, 5>; // Floating point vector blends.
+defm : BWWriteResPair<WriteFBlendY, [BWPort015], 1, [1], 1, 6>; // Floating point vector blends.
+defm : X86WriteResPairUnsupported<WriteFBlendZ>;
+defm : BWWriteResPair<WriteFVarBlend, [BWPort5], 2, [2], 2, 5>; // Fp vector variable blends.
+defm : BWWriteResPair<WriteFVarBlendY, [BWPort5], 2, [2], 2, 6>; // Fp vector variable blends.
+defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
// FMA Scheduling helper class.
// class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
// Vector integer operations.
-defm : BWWriteResPair<WriteVecALU, BWPort15, 1>; // Vector integer ALU op, no logicals.
-defm : BWWriteResPair<WriteVecShift, BWPort0, 1>; // Vector integer shifts.
-defm : BWWriteResPair<WriteVecIMul, BWPort0, 5>; // Vector integer multiply.
-defm : BWWriteResPair<WriteShuffle, BWPort5, 1>; // Vector shuffles.
-defm : BWWriteResPair<WriteBlend, BWPort15, 1>; // Vector blends.
-
-def : WriteRes<WriteVarBlend, [BWPort5]> { // Vector variable blends.
+defm : X86WriteRes<WriteVecLoad, [BWPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteVecLoadX, [BWPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteVecLoadY, [BWPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNT, [BWPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNTY, [BWPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecMaskedLoad, [BWPort23,BWPort5], 7, [1,2], 3>;
+defm : X86WriteRes<WriteVecMaskedLoadY, [BWPort23,BWPort5], 8, [1,2], 3>;
+defm : X86WriteRes<WriteVecStore, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreX, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreY, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreNT, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreNTY, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMaskedStoreY, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMove, [BWPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveX, [BWPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveY, [BWPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveToGpr, [BWPort0], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveFromGpr, [BWPort5], 1, [1], 1>;
+
+defm : X86WriteRes<WriteEMMS, [BWPort01,BWPort15,BWPort015,BWPort0156], 31, [8,1,21,1], 31>;
+
+defm : BWWriteResPair<WriteVecALU, [BWPort15], 1, [1], 1, 5>; // Vector integer ALU op, no logicals.
+defm : BWWriteResPair<WriteVecALUX, [BWPort15], 1, [1], 1, 5>; // Vector integer ALU op, no logicals.
+defm : BWWriteResPair<WriteVecALUY, [BWPort15], 1, [1], 1, 6>; // Vector integer ALU op, no logicals (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteVecALUZ>;
+defm : BWWriteResPair<WriteVecLogic, [BWPort015], 1, [1], 1, 5>; // Vector integer and/or/xor.
+defm : BWWriteResPair<WriteVecLogicX,[BWPort015], 1, [1], 1, 5>; // Vector integer and/or/xor.
+defm : BWWriteResPair<WriteVecLogicY,[BWPort015], 1, [1], 1, 6>; // Vector integer and/or/xor (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
+defm : BWWriteResPair<WriteVecTest, [BWPort0,BWPort5], 2, [1,1], 2, 5>; // Vector integer TEST instructions.
+defm : BWWriteResPair<WriteVecTestY, [BWPort0,BWPort5], 4, [1,1], 2, 6>; // Vector integer TEST instructions (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteVecTestZ>;
+defm : BWWriteResPair<WriteVecIMul, [BWPort0], 5, [1], 1, 5>; // Vector integer multiply.
+defm : BWWriteResPair<WriteVecIMulX, [BWPort0], 5, [1], 1, 5>; // Vector integer multiply.
+defm : BWWriteResPair<WriteVecIMulY, [BWPort0], 5, [1], 1, 6>; // Vector integer multiply.
+defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
+defm : BWWriteResPair<WritePMULLD, [BWPort0], 10, [2], 2, 5>; // Vector PMULLD.
+defm : BWWriteResPair<WritePMULLDY, [BWPort0], 10, [2], 2, 6>; // Vector PMULLD (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WritePMULLDZ>;
+defm : BWWriteResPair<WriteShuffle, [BWPort5], 1, [1], 1, 5>; // Vector shuffles.
+defm : BWWriteResPair<WriteShuffleX, [BWPort5], 1, [1], 1, 5>; // Vector shuffles.
+defm : BWWriteResPair<WriteShuffleY, [BWPort5], 1, [1], 1, 6>; // Vector shuffles (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteShuffleZ>;
+defm : BWWriteResPair<WriteVarShuffle, [BWPort5], 1, [1], 1, 5>; // Vector variable shuffles.
+defm : BWWriteResPair<WriteVarShuffleX,[BWPort5], 1, [1], 1, 5>; // Vector variable shuffles.
+defm : BWWriteResPair<WriteVarShuffleY,[BWPort5], 1, [1], 1, 6>; // Vector variable shuffles (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
+defm : BWWriteResPair<WriteBlend, [BWPort5], 1, [1], 1, 5>; // Vector blends.
+defm : BWWriteResPair<WriteBlendY, [BWPort5], 1, [1], 1, 6>; // Vector blends (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteBlendZ>;
+defm : BWWriteResPair<WriteVarBlend, [BWPort5], 2, [2], 2, 5>; // Vector variable blends.
+defm : BWWriteResPair<WriteVarBlendY, [BWPort5], 2, [2], 2, 6>; // Vector variable blends (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
+defm : BWWriteResPair<WriteMPSAD, [BWPort0, BWPort5], 7, [1, 2], 3, 5>; // Vector MPSAD.
+defm : BWWriteResPair<WriteMPSADY, [BWPort0, BWPort5], 7, [1, 2], 3, 6>; // Vector MPSAD.
+defm : X86WriteResPairUnsupported<WriteMPSADZ>;
+defm : BWWriteResPair<WritePSADBW, [BWPort0], 5, [1], 1, 5>; // Vector PSADBW.
+defm : BWWriteResPair<WritePSADBWX, [BWPort0], 5, [1], 1, 5>; // Vector PSADBW.
+defm : BWWriteResPair<WritePSADBWY, [BWPort0], 5, [1], 1, 6>; // Vector PSADBW (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WritePSADBWZ>;
+defm : BWWriteResPair<WritePHMINPOS, [BWPort0], 5>; // Vector PHMINPOS.
+
+// Vector integer shifts.
+defm : BWWriteResPair<WriteVecShift, [BWPort0], 1, [1], 1, 5>;
+defm : BWWriteResPair<WriteVecShiftX, [BWPort0,BWPort5], 2, [1,1], 2, 5>;
+defm : X86WriteRes<WriteVecShiftY, [BWPort0,BWPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftYLd, [BWPort0,BWPort23], 7, [1,1], 2>;
+defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
+
+defm : BWWriteResPair<WriteVecShiftImm, [BWPort0], 1, [1], 1, 5>;
+defm : BWWriteResPair<WriteVecShiftImmX, [BWPort0], 1, [1], 1, 5>; // Vector integer immediate shifts (XMM).
+defm : BWWriteResPair<WriteVecShiftImmY, [BWPort0], 1, [1], 1, 6>; // Vector integer immediate shifts (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
+defm : BWWriteResPair<WriteVarVecShift, [BWPort0, BWPort5], 3, [2,1], 3, 5>; // Variable vector shifts.
+defm : BWWriteResPair<WriteVarVecShiftY, [BWPort0, BWPort5], 3, [2,1], 3, 6>; // Variable vector shifts (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
+
+// Vector insert/extract operations.
+def : WriteRes<WriteVecInsert, [BWPort5]> {
let Latency = 2;
+ let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def : WriteRes<WriteVarBlendLd, [BWPort5, BWPort23]> {
+def : WriteRes<WriteVecInsertLd, [BWPort5,BWPort23]> {
let Latency = 6;
- let ResourceCycles = [2, 1];
+ let NumMicroOps = 2;
}
-def : WriteRes<WriteMPSAD, [BWPort0, BWPort5]> { // Vector MPSAD.
- let Latency = 6;
- let ResourceCycles = [1, 2];
+def : WriteRes<WriteVecExtract, [BWPort0,BWPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
}
-def : WriteRes<WriteMPSADLd, [BWPort23, BWPort0, BWPort5]> {
- let Latency = 6;
- let ResourceCycles = [1, 1, 2];
+def : WriteRes<WriteVecExtractSt, [BWPort4,BWPort5,BWPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
}
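+// Descriptive aside (not from the patch): WriteVecInsert above issues two
+// micro-ops, both onto BWPort5 (ResourceCycles = [2]), with its result ready
+// two cycles later. Where ResourceCycles is omitted, as in the extract
+// entries, each listed port is assumed to be busy for a single cycle.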
-// Vector bitwise operations.
-// These are often used on both floating point and integer vectors.
-defm : BWWriteResPair<WriteVecLogic, BWPort015, 1>; // Vector and/or/xor.
-
// Conversion between integer and float.
-defm : BWWriteResPair<WriteCvtF2I, BWPort1, 3>; // Float -> Integer.
-defm : BWWriteResPair<WriteCvtI2F, BWPort1, 4>; // Integer -> Float.
-defm : BWWriteResPair<WriteCvtF2F, BWPort1, 3>; // Float -> Float size conversion.
+defm : BWWriteResPair<WriteCvtSS2I, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPS2I, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPS2IY, [BWPort1], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
+defm : BWWriteResPair<WriteCvtSD2I, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPD2I, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPD2IY, [BWPort1], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
+
+defm : BWWriteResPair<WriteCvtI2SS, [BWPort1], 4>;
+defm : BWWriteResPair<WriteCvtI2PS, [BWPort1], 4>;
+defm : BWWriteResPair<WriteCvtI2PSY, [BWPort1], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
+defm : BWWriteResPair<WriteCvtI2SD, [BWPort1], 4>;
+defm : BWWriteResPair<WriteCvtI2PD, [BWPort1], 4>;
+defm : BWWriteResPair<WriteCvtI2PDY, [BWPort1], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
+
+defm : BWWriteResPair<WriteCvtSS2SD, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPS2PD, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPS2PDY, [BWPort1], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
+defm : BWWriteResPair<WriteCvtSD2SS, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPD2PS, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPD2PSY, [BWPort1], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
+
+defm : X86WriteRes<WriteCvtPH2PS, [BWPort0,BWPort5], 2, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSY, [BWPort0,BWPort5], 2, [1,1], 2>;
+defm : X86WriteResUnsupported<WriteCvtPH2PSZ>;
+defm : X86WriteRes<WriteCvtPH2PSLd, [BWPort0,BWPort23], 6, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSYLd, [BWPort0,BWPort23], 6, [1,1], 2>;
+defm : X86WriteResUnsupported<WriteCvtPH2PSZLd>;
+
+defm : X86WriteRes<WriteCvtPS2PH, [BWPort1,BWPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PHY, [BWPort1,BWPort5], 6, [1,1], 2>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
+defm : X86WriteRes<WriteCvtPS2PHSt, [BWPort1,BWPort4,BWPort237], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteCvtPS2PHYSt, [BWPort1,BWPort4,BWPort237], 7, [1,1,1], 3>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
// String instructions.
+
// Packed Compare Implicit Length Strings, Return Mask
-// String instructions.
def : WriteRes<WritePCmpIStrM, [BWPort0]> {
- let Latency = 10;
+ let Latency = 11;
+ let NumMicroOps = 3;
let ResourceCycles = [3];
}
def : WriteRes<WritePCmpIStrMLd, [BWPort0, BWPort23]> {
- let Latency = 10;
- let ResourceCycles = [3, 1];
-}
+ let Latency = 16;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
+}
+
// Packed Compare Explicit Length Strings, Return Mask
-def : WriteRes<WritePCmpEStrM, [BWPort0, BWPort16, BWPort5]> {
- let Latency = 10;
- let ResourceCycles = [3, 2, 4];
+def : WriteRes<WritePCmpEStrM, [BWPort0, BWPort5, BWPort015, BWPort0156]> {
+ let Latency = 19;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
}
-def : WriteRes<WritePCmpEStrMLd, [BWPort05, BWPort16, BWPort23]> {
- let Latency = 10;
- let ResourceCycles = [6, 2, 1];
-}
- // Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrMLd, [BWPort0, BWPort5, BWPort23, BWPort015, BWPort0156]> {
+ let Latency = 24;
+ let NumMicroOps = 10;
+ let ResourceCycles = [4,3,1,1,1];
+}
+
+// Packed Compare Implicit Length Strings, Return Index
def : WriteRes<WritePCmpIStrI, [BWPort0]> {
let Latency = 11;
+ let NumMicroOps = 3;
let ResourceCycles = [3];
}
def : WriteRes<WritePCmpIStrILd, [BWPort0, BWPort23]> {
- let Latency = 11;
- let ResourceCycles = [3, 1];
-}
+ let Latency = 16;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
+}
+
// Packed Compare Explicit Length Strings, Return Index
-def : WriteRes<WritePCmpEStrI, [BWPort05, BWPort16]> {
- let Latency = 11;
- let ResourceCycles = [6, 2];
+def : WriteRes<WritePCmpEStrI, [BWPort0, BWPort5, BWPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 8;
+ let ResourceCycles = [4,3,1];
}
-def : WriteRes<WritePCmpEStrILd, [BWPort0, BWPort16, BWPort5, BWPort23]> {
- let Latency = 11;
- let ResourceCycles = [3, 2, 2, 1];
+def : WriteRes<WritePCmpEStrILd, [BWPort0, BWPort5, BWPort23, BWPort0156]> {
+ let Latency = 23;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
}
+// MOVMSK Instructions.
+def : WriteRes<WriteFMOVMSK, [BWPort0]> { let Latency = 3; }
+def : WriteRes<WriteVecMOVMSK, [BWPort0]> { let Latency = 3; }
+def : WriteRes<WriteVecMOVMSKY, [BWPort0]> { let Latency = 3; }
+def : WriteRes<WriteMMXMOVMSK, [BWPort0]> { let Latency = 1; }
+
// AES instructions.
def : WriteRes<WriteAESDecEnc, [BWPort5]> { // Decryption, encryption.
let Latency = 7;
+ let NumMicroOps = 1;
let ResourceCycles = [1];
}
def : WriteRes<WriteAESDecEncLd, [BWPort5, BWPort23]> {
- let Latency = 7;
- let ResourceCycles = [1, 1];
+ let Latency = 12;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
}
+
def : WriteRes<WriteAESIMC, [BWPort5]> { // InvMixColumn.
let Latency = 14;
+ let NumMicroOps = 2;
let ResourceCycles = [2];
}
def : WriteRes<WriteAESIMCLd, [BWPort5, BWPort23]> {
- let Latency = 14;
- let ResourceCycles = [2, 1];
+ let Latency = 19;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
}
-def : WriteRes<WriteAESKeyGen, [BWPort0, BWPort5]> { // Key Generation.
- let Latency = 10;
- let ResourceCycles = [2, 8];
+
+def : WriteRes<WriteAESKeyGen, [BWPort0, BWPort5, BWPort015]> { // Key Generation.
+ let Latency = 29;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,7,2];
}
-def : WriteRes<WriteAESKeyGenLd, [BWPort0, BWPort5, BWPort23]> {
- let Latency = 10;
- let ResourceCycles = [2, 7, 1];
+def : WriteRes<WriteAESKeyGenLd, [BWPort0, BWPort5, BWPort23, BWPort015]> {
+ let Latency = 33;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,7,1,1];
}
// Carry-less multiplication instructions.
-def : WriteRes<WriteCLMul, [BWPort0, BWPort5]> {
- let Latency = 7;
- let ResourceCycles = [2, 1];
-}
-def : WriteRes<WriteCLMulLd, [BWPort0, BWPort5, BWPort23]> {
- let Latency = 7;
- let ResourceCycles = [2, 1, 1];
-}
+defm : BWWriteResPair<WriteCLMul, [BWPort0], 5>;
// Catch-all for expensive system instructions.
def : WriteRes<WriteSystem, [BWPort0156]> { let Latency = 100; } // def WriteSystem : SchedWrite;
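// (The 100-cycle figure here is presumably a conservative placeholder rather
// than a measured latency, so otherwise unmodelled system instructions are
// not treated as cheap.)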
// AVX2.
-defm : BWWriteResPair<WriteFShuffle256, BWPort5, 3>; // Fp 256-bit width vector shuffles.
-defm : BWWriteResPair<WriteShuffle256, BWPort5, 3>; // 256-bit width vector shuffles.
-def : WriteRes<WriteVarVecShift, [BWPort0, BWPort5]> { // Variable vector shifts.
- let Latency = 2;
- let ResourceCycles = [2, 1];
-}
-def : WriteRes<WriteVarVecShiftLd, [BWPort0, BWPort5, BWPort23]> {
- let Latency = 6;
- let ResourceCycles = [2, 1, 1];
-}
+defm : BWWriteResPair<WriteFShuffle256, [BWPort5], 3, [1], 1, 6>; // Fp 256-bit width vector shuffles.
+defm : BWWriteResPair<WriteFVarShuffle256, [BWPort5], 3, [1], 1, 6>; // Fp 256-bit width vector variable shuffles.
+defm : BWWriteResPair<WriteShuffle256, [BWPort5], 3, [1], 1, 6>; // 256-bit width vector shuffles.
+defm : BWWriteResPair<WriteVarShuffle256, [BWPort5], 3, [1], 1, 6>; // 256-bit width vector variable shuffles.
// Old microcoded instructions that nobody uses.
def : WriteRes<WriteMicrocoded, [BWPort0156]> { let Latency = 100; } // def WriteMicrocoded : SchedWrite;
@@ -279,33 +538,22 @@ def : WriteRes<WriteMicrocoded, [BWPort0156]> { let Latency = 100; } // def Writ
// Fence instructions.
def : WriteRes<WriteFence, [BWPort23, BWPort4]>;
+// Load/store MXCSR.
+def : WriteRes<WriteLDMXCSR, [BWPort0,BWPort23,BWPort0156]> { let Latency = 7; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
+def : WriteRes<WriteSTMXCSR, [BWPort4,BWPort5,BWPort237]> { let Latency = 2; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
+
// Nop, not very useful except that it provides a model for nops!
def : WriteRes<WriteNop, []>;
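// (With an empty resource list the NOP ties up no execution port; it still
// counts as a single micro-op for issue width, since NumMicroOps defaults to
// 1, which is the "model for nops" the comment above refers to.)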
////////////////////////////////////////////////////////////////////////////////
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////
-// HADD, HSUB PS/PD
-// x,x / v,v,v.
-def : WriteRes<WriteFHAdd, [BWPort1]> {
- let Latency = 3;
-}
-// x,m / v,v,m.
-def : WriteRes<WriteFHAddLd, [BWPort1, BWPort23]> {
- let Latency = 7;
- let ResourceCycles = [1, 1];
-}
-
-// PHADD|PHSUB (S) W/D.
-// v <- v,v.
-def : WriteRes<WritePHAdd, [BWPort15]>;
-
-// v <- v,m.
-def : WriteRes<WritePHAddLd, [BWPort15, BWPort23]> {
- let Latency = 5;
- let ResourceCycles = [1, 1];
-}
+defm : BWWriteResPair<WriteFHAdd, [BWPort1,BWPort5], 5, [1,2], 3, 5>;
+defm : BWWriteResPair<WriteFHAddY, [BWPort1,BWPort5], 5, [1,2], 3, 6>;
+defm : BWWriteResPair<WritePHAdd, [BWPort5,BWPort15], 3, [2,1], 3, 5>;
+defm : BWWriteResPair<WritePHAddX, [BWPort5,BWPort15], 3, [2,1], 3, 5>;
+defm : BWWriteResPair<WritePHAddY, [BWPort5,BWPort15], 3, [2,1], 3, 6>;
// Remaining instrs.
@@ -314,264 +562,23 @@ def BWWriteResGroup1 : SchedWriteRes<[BWPort0]> {
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_MOVD64from64rr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_MOVD64grr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PMOVMSKBrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSLLDri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSLLDrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSLLQri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSLLQrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSLLWri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSLLWrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRADri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRADrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRAWri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRAWrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRLDri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRLDrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRLQri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRLQrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRLWri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRLWrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MOVPDI2DIrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MOVPQIto64rr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "PSLLDri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "PSLLQri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "PSLLWri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "PSRADri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "PSRAWri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "PSRLDri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "PSRLQri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "PSRLWri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VMOVPDI2DIrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VMOVPQIto64rr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSLLDYri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSLLDri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSLLQYri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSLLQri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSLLVQYrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSLLVQrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSLLWYri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSLLWri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSRADYri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSRADri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSRAWYri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSRAWri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSRLDYri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSRLDri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSRLQYri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSRLQri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSRLVQYrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSRLVQrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSRLWYri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSRLWri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VTESTPDYrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VTESTPDrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VTESTPSYrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VTESTPSrr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "VPSLLVQ(Y?)rr",
+ "VPSRLVQ(Y?)rr")>;
def BWWriteResGroup2 : SchedWriteRes<[BWPort1]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup2], (instregex "COMP_FST0r")>;
-def: InstRW<[BWWriteResGroup2], (instregex "COM_FST0r")>;
-def: InstRW<[BWWriteResGroup2], (instregex "MMX_MASKMOVQ64")>;
-def: InstRW<[BWWriteResGroup2], (instregex "MMX_MASKMOVQ64")>;
-def: InstRW<[BWWriteResGroup2], (instregex "UCOM_FPr")>;
-def: InstRW<[BWWriteResGroup2], (instregex "UCOM_Fr")>;
-def: InstRW<[BWWriteResGroup2], (instregex "VMASKMOVDQU")>;
+def: InstRW<[BWWriteResGroup2], (instregex "COM(P?)_FST0r",
+ "UCOM_F(P?)r")>;
def BWWriteResGroup3 : SchedWriteRes<[BWPort5]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup3], (instregex "ANDNPDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "ANDNPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "ANDPDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "ANDPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "INSERTPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MMX_MOVD64rr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MMX_MOVD64to64rr")>;
def: InstRW<[BWWriteResGroup3], (instregex "MMX_MOVQ2DQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MMX_PALIGNR64irr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MMX_PSHUFBrr64")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MMX_PSHUFWri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MMX_PUNPCKHBWirr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MMX_PUNPCKHDQirr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MMX_PUNPCKHWDirr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MMX_PUNPCKLBWirr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MMX_PUNPCKLDQirr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MMX_PUNPCKLWDirr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOV64toPQIrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOVAPDrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOVAPSrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOVDDUPrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOVDI2PDIrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOVHLPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOVLHPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOVSDrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOVSHDUPrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOVSLDUPrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOVSSrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOVUPDrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOVUPSrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "ORPDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "ORPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PACKSSDWrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PACKSSWBrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PACKUSDWrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PACKUSWBrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PALIGNRrri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PBLENDWrri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PMOVSXBDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PMOVSXBQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PMOVSXBWrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PMOVSXDQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PMOVSXWDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PMOVSXWQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PMOVZXBDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PMOVZXBQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PMOVZXBWrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PMOVZXDQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PMOVZXWDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PMOVZXWQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PSHUFBrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PSHUFDri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PSHUFHWri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PSHUFLWri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PSLLDQri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PSRLDQri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKHBWrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKHDQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKHQDQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKHWDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKLBWrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKLDQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKLQDQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKLWDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "SHUFPDrri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "SHUFPSrri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "UNPCKHPDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "UNPCKHPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "UNPCKLPDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "UNPCKLPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VANDNPDYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VANDNPDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VANDNPSYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VANDNPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VANDPDYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VANDPDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VANDPSYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VANDPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VBROADCASTSSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VINSERTPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOV64toPQIrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVAPDYrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVAPDrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVAPSYrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVAPSrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVDDUPYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVDDUPrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVDI2PDIrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVHLPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVLHPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVSDrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVSHDUPYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVSHDUPrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVSLDUPYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVSLDUPrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVSSrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVUPDYrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVUPDrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVUPSYrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVUPSrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VORPDYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VORPDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VORPSYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VORPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPACKSSDWYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPACKSSDWrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPACKSSWBYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPACKSSWBrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPACKUSDWYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPACKUSDWrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPACKUSWBYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPACKUSWBrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPALIGNRYrri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPALIGNRrri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPBLENDWYrri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPBLENDWrri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPBROADCASTDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPBROADCASTQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPDYri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPDYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPDri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPSYri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPSYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPSri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPMOVSXBDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPMOVSXBQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPMOVSXBWrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPMOVSXDQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPMOVSXWDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPMOVSXWQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPMOVZXBDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPMOVZXBQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPMOVZXBWrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPMOVZXDQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPMOVZXWDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPMOVZXWQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFBYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFBrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFDYri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFDri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFHWYri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFHWri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFLWYri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFLWri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPSLLDQYri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPSLLDQri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPSRLDQYri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPSRLDQri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHBWYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHBWrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHDQYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHDQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHQDQYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHQDQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHWDYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHWDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLBWYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLBWrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLDQYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLDQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLQDQYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLQDQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLWDYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLWDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VSHUFPDYrri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VSHUFPDrri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VSHUFPSYrri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VSHUFPSrri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKHPDYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKHPDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKHPSYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKHPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKLPDYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKLPDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKLPSYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKLPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VXORPDYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VXORPDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VXORPSYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VXORPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "XORPDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "XORPSrr")>;
def BWWriteResGroup4 : SchedWriteRes<[BWPort6]> {
let Latency = 1;
@@ -585,561 +592,93 @@ def BWWriteResGroup5 : SchedWriteRes<[BWPort01]> {
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup5], (instregex "FINCSTP")>;
-def: InstRW<[BWWriteResGroup5], (instregex "FNOP")>;
+def: InstRW<[BWWriteResGroup5], (instrs FINCSTP, FNOP)>;
def BWWriteResGroup6 : SchedWriteRes<[BWPort06]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup6], (instregex "ADC(16|32|64)ri")>;
-def: InstRW<[BWWriteResGroup6], (instregex "ADC(16|32|64)rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup6], (instregex "ADC8rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup6], (instregex "ADCX(32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "ADOX(32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "BT(16|32|64)ri8")>;
-def: InstRW<[BWWriteResGroup6], (instregex "BT(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "BTC(16|32|64)ri8")>;
-def: InstRW<[BWWriteResGroup6], (instregex "BTC(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "BTR(16|32|64)ri8")>;
-def: InstRW<[BWWriteResGroup6], (instregex "BTR(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "BTS(16|32|64)ri8")>;
-def: InstRW<[BWWriteResGroup6], (instregex "BTS(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CDQ")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CMOVAE(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CMOVB(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CMOVE(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CMOVG(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CMOVGE(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CMOVL(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CMOVLE(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CMOVNE(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CMOVNO(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CMOVNP(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CMOVNS(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CMOVO(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CMOVP(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CMOVS(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CQO")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JAE_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JAE_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JA_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JA_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JBE_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JBE_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JB_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JB_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JE_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JE_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JGE_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JGE_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JG_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JG_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JLE_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JLE_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JL_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JL_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JMP_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JMP_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JNE_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JNE_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JNO_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JNO_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JNP_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JNP_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JNS_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JNS_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JO_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JO_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JP_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JP_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JS_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JS_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "RORX(32|64)ri")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SAR(16|32|64)r1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SAR(16|32|64)ri")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SAR8r1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SAR8ri")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SARX(32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SBB(16|32|64)ri")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SBB(16|32|64)rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SBB8rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SETAEr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SETBr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SETEr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SETGEr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SETGr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SETLEr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SETLr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SETNEr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SETNOr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SETNPr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SETNSr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SETOr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SETPr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SETSr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SHL(16|32|64)r1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SHL(16|32|64)ri")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SHL8r1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SHL8ri")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SHLX(32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SHR(16|32|64)r1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SHR(16|32|64)ri")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SHR8r1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SHR8ri")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SHRX(32|64)rr")>;
+def: InstRW<[BWWriteResGroup6], (instrs CDQ, CQO)>;
+def: InstRW<[BWWriteResGroup6], (instregex "BT(16|32|64)ri8",
+ "BT(16|32|64)rr",
+ "BTC(16|32|64)ri8",
+ "BTC(16|32|64)rr",
+ "BTR(16|32|64)ri8",
+ "BTR(16|32|64)rr",
+ "BTS(16|32|64)ri8",
+ "BTS(16|32|64)rr")>;
def BWWriteResGroup7 : SchedWriteRes<[BWPort15]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup7], (instregex "ANDN(32|64)rr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "BLSI(32|64)rr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "BLSMSK(32|64)rr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "BLSR(32|64)rr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "BZHI(32|64)rr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "LEA(16|32|64)(_32)?r")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PABSBrr64")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PABSDrr64")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PABSWrr64")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDBirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDDirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDQirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDSBirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDSWirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDUSBirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDUSWirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDWirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PAVGBirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PAVGWirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PCMPEQBirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PCMPEQDirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PCMPEQWirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PCMPGTBirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PCMPGTDirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PCMPGTWirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PMAXSWirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PMAXUBirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PMINSWirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PMINUBirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSIGNBrr64")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSIGNDrr64")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSIGNWrr64")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBBirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBDirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBQirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBSBirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBSWirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBUSBirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBUSWirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBWirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PABSBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PABSDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PABSWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PADDBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PADDDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PADDQrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PADDSBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PADDSWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PADDUSBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PADDUSWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PADDWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PAVGBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PAVGWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PCMPEQBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PCMPEQDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PCMPEQQrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PCMPEQWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PCMPGTBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PCMPGTDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PCMPGTWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PMAXSBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PMAXSDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PMAXSWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PMAXUBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PMAXUDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PMAXUWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PMINSBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PMINSDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PMINSWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PMINUBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PMINUDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PMINUWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PSIGNBrr128")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PSIGNDrr128")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PSIGNWrr128")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PSUBBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PSUBDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PSUBQrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PSUBSBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PSUBSWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PSUBUSBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PSUBUSWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PSUBWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPABSBYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPABSBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPABSDYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPABSDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPABSWYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPABSWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDBYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDDYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDQYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDQrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDSBYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDSBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDSWYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDSWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDUSBYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDUSBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDUSWYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDUSWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDWYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPAVGBYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPAVGBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPAVGWYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPAVGWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQBYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQDYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQQYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQQrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQWYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPCMPGTBYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPCMPGTBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPCMPGTDYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPCMPGTDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPCMPGTWYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPCMPGTWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMAXSBYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMAXSBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMAXSDYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMAXSDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMAXSWYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMAXSWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMAXUBYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMAXUBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMAXUDYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMAXUDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMAXUWYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMAXUWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMINSBYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMINSBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMINSDYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMINSDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMINSWYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMINSWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMINUBYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMINUBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMINUDYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMINUDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMINUWYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMINUWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSIGNBYrr256")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSIGNBrr128")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSIGNDYrr256")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSIGNDrr128")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSIGNWYrr256")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSIGNWrr128")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBBYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBDYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBQYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBQrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBSBYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBSBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBSWYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBSWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBUSBYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBUSBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBUSWYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBUSWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBWYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBWrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "ANDN(32|64)rr",
+ "BLSI(32|64)rr",
+ "BLSMSK(32|64)rr",
+ "BLSR(32|64)rr")>;
def BWWriteResGroup8 : SchedWriteRes<[BWPort015]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup8], (instregex "BLENDPDrri")>;
-def: InstRW<[BWWriteResGroup8], (instregex "BLENDPSrri")>;
-def: InstRW<[BWWriteResGroup8], (instregex "MMX_MOVD64from64rr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "MMX_MOVQ64rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup8], (instregex "MMX_PANDNirr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "MMX_PANDirr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "MMX_PORirr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "MMX_PXORirr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "MOVDQArr(_REV)?")>;
-def: InstRW<[BWWriteResGroup8], (instregex "MOVDQUrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup8], (instregex "MOVPQI2QIrr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "PANDNrr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "PANDrr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "PORrr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "PXORrr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VBLENDPDYrri")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VBLENDPDrri")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VBLENDPSYrri")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VBLENDPSrri")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VMOVDQAYrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VMOVDQArr(_REV)?")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VMOVDQUYrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VMOVDQUrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VMOVPQI2QIrr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VMOVZPQILo2PQIrr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VPANDNYrr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VPANDNrr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VPANDYrr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VPANDrr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VPBLENDDYrri")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VPBLENDDrri")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VPORYrr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VPORrr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VPXORYrr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VPXORrr")>;
+def: InstRW<[BWWriteResGroup8], (instregex "VPBLENDD(Y?)rri")>;
def BWWriteResGroup9 : SchedWriteRes<[BWPort0156]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup9], (instregex "ADD(16|32|64)ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "ADD(16|32|64)rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup9], (instregex "ADD8i8")>;
-def: InstRW<[BWWriteResGroup9], (instregex "ADD8ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "ADD8rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup9], (instregex "AND(16|32|64)ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "AND(16|32|64)rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup9], (instregex "AND8i8")>;
-def: InstRW<[BWWriteResGroup9], (instregex "AND8ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "AND8rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup9], (instregex "CBW")>;
-def: InstRW<[BWWriteResGroup9], (instregex "CLC")>;
-def: InstRW<[BWWriteResGroup9], (instregex "CMC")>;
-def: InstRW<[BWWriteResGroup9], (instregex "CMP(16|32|64)ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "CMP(16|32|64)rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup9], (instregex "CMP8i8")>;
-def: InstRW<[BWWriteResGroup9], (instregex "CMP8ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "CMP8rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup9], (instregex "CWDE")>;
-def: InstRW<[BWWriteResGroup9], (instregex "DEC(16|32|64)r")>;
-def: InstRW<[BWWriteResGroup9], (instregex "DEC8r")>;
-def: InstRW<[BWWriteResGroup9], (instregex "INC(16|32|64)r")>;
-def: InstRW<[BWWriteResGroup9], (instregex "INC8r")>;
-def: InstRW<[BWWriteResGroup9], (instregex "LAHF")>;
-def: InstRW<[BWWriteResGroup9], (instregex "MOV(16|32|64)rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup9], (instregex "MOV8ri(_alt)?")>;
-def: InstRW<[BWWriteResGroup9], (instregex "MOV8rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup9], (instregex "MOVSX(16|32|64)rr16")>;
-def: InstRW<[BWWriteResGroup9], (instregex "MOVSX(16|32|64)rr32")>;
-def: InstRW<[BWWriteResGroup9], (instregex "MOVSX(16|32|64)rr8")>;
-def: InstRW<[BWWriteResGroup9], (instregex "MOVZX(16|32|64)rr16")>;
-def: InstRW<[BWWriteResGroup9], (instregex "MOVZX(16|32|64)rr8")>;
-def: InstRW<[BWWriteResGroup9], (instregex "NEG(16|32|64)r")>;
-def: InstRW<[BWWriteResGroup9], (instregex "NEG8r")>;
-def: InstRW<[BWWriteResGroup9], (instregex "NOOP")>;
-def: InstRW<[BWWriteResGroup9], (instregex "NOT(16|32|64)r")>;
-def: InstRW<[BWWriteResGroup9], (instregex "NOT8r")>;
-def: InstRW<[BWWriteResGroup9], (instregex "OR(16|32|64)ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "OR(16|32|64)rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup9], (instregex "OR8i8")>;
-def: InstRW<[BWWriteResGroup9], (instregex "OR8ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "OR8rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup9], (instregex "SAHF")>;
-def: InstRW<[BWWriteResGroup9], (instregex "SGDT64m")>;
-def: InstRW<[BWWriteResGroup9], (instregex "SIDT64m")>;
-def: InstRW<[BWWriteResGroup9], (instregex "SLDT64m")>;
-def: InstRW<[BWWriteResGroup9], (instregex "SMSW16m")>;
-def: InstRW<[BWWriteResGroup9], (instregex "STC")>;
-def: InstRW<[BWWriteResGroup9], (instregex "STRm")>;
-def: InstRW<[BWWriteResGroup9], (instregex "SUB(16|32|64)ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "SUB(16|32|64)rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup9], (instregex "SUB8i8")>;
-def: InstRW<[BWWriteResGroup9], (instregex "SUB8ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "SUB8rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup9], (instregex "SYSCALL")>;
-def: InstRW<[BWWriteResGroup9], (instregex "TEST(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup9], (instregex "TEST8i8")>;
-def: InstRW<[BWWriteResGroup9], (instregex "TEST8ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "TEST8rr")>;
-def: InstRW<[BWWriteResGroup9], (instregex "XCHG(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup9], (instregex "XOR(16|32|64)ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "XOR(16|32|64)rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup9], (instregex "XOR8i8")>;
-def: InstRW<[BWWriteResGroup9], (instregex "XOR8ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "XOR8rr(_REV)?")>;
+def: InstRW<[BWWriteResGroup9], (instregex "SGDT64m",
+ "SIDT64m",
+ "SMSW16m",
+ "STRm",
+ "SYSCALL")>;
def BWWriteResGroup10 : SchedWriteRes<[BWPort4,BWPort237]> {
let Latency = 1;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup10], (instregex "FBSTPm")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MMX_MOVD64from64rm")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MMX_MOVD64mr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MMX_MOVNTQmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MMX_MOVQ64mr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOV(16|32|64)mr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOV8mi")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOV8mr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVAPDmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVAPSmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVDQAmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVDQUmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVHPDmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVHPSmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVLPDmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVLPSmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVNTDQmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVNTI_64mr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVNTImr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVNTPDmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVNTPSmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVPDI2DImr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVPQI2QImr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVPQIto64mr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVSDmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVSSmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVUPDmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVUPSmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "ST_FP32m")>;
-def: InstRW<[BWWriteResGroup10], (instregex "ST_FP64m")>;
-def: InstRW<[BWWriteResGroup10], (instregex "ST_FP80m")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VEXTRACTF128mr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VEXTRACTI128mr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVAPDYmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVAPDmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVAPSYmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVAPSmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVDQAYmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVDQAmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVDQUYmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVDQUmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVHPDmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVHPSmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVLPDmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVLPSmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVNTDQYmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVNTDQmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVNTPDYmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVNTPDmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVNTPSYmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVNTPSmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVPDI2DImr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVPQI2QImr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVPQIto64mr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVSDmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVSSmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVUPDYmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVUPDmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVUPSYmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVUPSmr")>;
-
-def BWWriteResGroup11 : SchedWriteRes<[BWPort5]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[BWWriteResGroup11], (instregex "BLENDVPDrr0")>;
-def: InstRW<[BWWriteResGroup11], (instregex "BLENDVPSrr0")>;
-def: InstRW<[BWWriteResGroup11], (instregex "MMX_PINSRWirri")>;
-def: InstRW<[BWWriteResGroup11], (instregex "PBLENDVBrr0")>;
-def: InstRW<[BWWriteResGroup11], (instregex "PINSRBrr")>;
-def: InstRW<[BWWriteResGroup11], (instregex "PINSRDrr")>;
-def: InstRW<[BWWriteResGroup11], (instregex "PINSRQrr")>;
-def: InstRW<[BWWriteResGroup11], (instregex "PINSRWrri")>;
-def: InstRW<[BWWriteResGroup11], (instregex "VBLENDVPDYrr")>;
-def: InstRW<[BWWriteResGroup11], (instregex "VBLENDVPDrr")>;
-def: InstRW<[BWWriteResGroup11], (instregex "VBLENDVPSYrr")>;
-def: InstRW<[BWWriteResGroup11], (instregex "VBLENDVPSrr")>;
-def: InstRW<[BWWriteResGroup11], (instregex "VPBLENDVBYrr")>;
-def: InstRW<[BWWriteResGroup11], (instregex "VPBLENDVBrr")>;
-def: InstRW<[BWWriteResGroup11], (instregex "VPINSRBrr")>;
-def: InstRW<[BWWriteResGroup11], (instregex "VPINSRDrr")>;
-def: InstRW<[BWWriteResGroup11], (instregex "VPINSRQrr")>;
-def: InstRW<[BWWriteResGroup11], (instregex "VPINSRWrri")>;
+def: InstRW<[BWWriteResGroup10], (instregex "FBSTPm",
+ "ST_FP(32|64|80)m")>;
def BWWriteResGroup12 : SchedWriteRes<[BWPort01]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def: InstRW<[BWWriteResGroup12], (instregex "FDECSTP")>;
+def: InstRW<[BWWriteResGroup12], (instrs FDECSTP)>;
def BWWriteResGroup13 : SchedWriteRes<[BWPort06]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def: InstRW<[BWWriteResGroup13], (instregex "ROL(16|32|64)r1")>;
-def: InstRW<[BWWriteResGroup13], (instregex "ROL(16|32|64)ri")>;
-def: InstRW<[BWWriteResGroup13], (instregex "ROL8r1")>;
-def: InstRW<[BWWriteResGroup13], (instregex "ROL8ri")>;
-def: InstRW<[BWWriteResGroup13], (instregex "ROR(16|32|64)r1")>;
-def: InstRW<[BWWriteResGroup13], (instregex "ROR(16|32|64)ri")>;
-def: InstRW<[BWWriteResGroup13], (instregex "ROR8r1")>;
-def: InstRW<[BWWriteResGroup13], (instregex "ROR8ri")>;
+def: InstRW<[BWWriteResGroup13], (instregex "ROL(8|16|32|64)r1",
+ "ROL(8|16|32|64)ri",
+ "ROR(8|16|32|64)r1",
+ "ROR(8|16|32|64)ri")>;
def BWWriteResGroup14 : SchedWriteRes<[BWPort0156]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def: InstRW<[BWWriteResGroup14], (instregex "LFENCE")>;
-def: InstRW<[BWWriteResGroup14], (instregex "MFENCE")>;
-def: InstRW<[BWWriteResGroup14], (instregex "WAIT")>;
-def: InstRW<[BWWriteResGroup14], (instregex "XGETBV")>;
+def: InstRW<[BWWriteResGroup14], (instrs LFENCE,
+ MFENCE,
+ WAIT,
+ XGETBV)>;
def BWWriteResGroup15 : SchedWriteRes<[BWPort0,BWPort5]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup15], (instregex "CVTPS2PDrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "CVTSS2SDrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "EXTRACTPSrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "MMX_PEXTRWirri")>;
-def: InstRW<[BWWriteResGroup15], (instregex "PEXTRBrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "PEXTRDrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "PEXTRQrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "PEXTRWri")>;
-def: InstRW<[BWWriteResGroup15], (instregex "PEXTRWrr_REV")>;
-def: InstRW<[BWWriteResGroup15], (instregex "PSLLDrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "PSLLQrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "PSLLWrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "PSRADrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "PSRAWrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "PSRLDrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "PSRLQrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "PSRLWrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "PTESTrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VCVTPH2PSYrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VCVTPH2PSrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VCVTPS2PDrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VCVTSS2SDrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VEXTRACTPSrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VPEXTRBrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VPEXTRDrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VPEXTRQrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VPEXTRWri")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VPEXTRWrr_REV")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VPSLLDrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VPSLLQrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VPSLLWrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VPSRADrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VPSRAWrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VPSRLDrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VPSRLQrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VPSRLWrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VPTESTrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "(V?)CVTPS2PDrr",
+ "(V?)CVTSS2SDrr")>;
def BWWriteResGroup16 : SchedWriteRes<[BWPort6,BWPort0156]> {
let Latency = 2;
@@ -1160,76 +699,27 @@ def BWWriteResGroup18 : SchedWriteRes<[BWPort237,BWPort0156]> {
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup18], (instregex "SFENCE")>;
-
-def BWWriteResGroup19 : SchedWriteRes<[BWPort06,BWPort15]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[BWWriteResGroup19], (instregex "BEXTR(32|64)rr")>;
-def: InstRW<[BWWriteResGroup19], (instregex "BSWAP(16|32|64)r")>;
+def: InstRW<[BWWriteResGroup18], (instrs SFENCE)>;
def BWWriteResGroup20 : SchedWriteRes<[BWPort06,BWPort0156]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup20], (instregex "ADC8i8")>;
-def: InstRW<[BWWriteResGroup20], (instregex "ADC8ri")>;
-def: InstRW<[BWWriteResGroup20], (instregex "CMOVA(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup20], (instregex "CMOVBE(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup20], (instregex "CWD")>;
-def: InstRW<[BWWriteResGroup20], (instregex "JRCXZ")>;
-def: InstRW<[BWWriteResGroup20], (instregex "SBB8i8")>;
-def: InstRW<[BWWriteResGroup20], (instregex "SBB8ri")>;
-def: InstRW<[BWWriteResGroup20], (instregex "SETAr")>;
-def: InstRW<[BWWriteResGroup20], (instregex "SETBEr")>;
-
-def BWWriteResGroup21 : SchedWriteRes<[BWPort4,BWPort5,BWPort237]> {
- let Latency = 2;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[BWWriteResGroup21], (instregex "EXTRACTPSmr")>;
-def: InstRW<[BWWriteResGroup21], (instregex "PEXTRBmr")>;
-def: InstRW<[BWWriteResGroup21], (instregex "PEXTRDmr")>;
-def: InstRW<[BWWriteResGroup21], (instregex "PEXTRQmr")>;
-def: InstRW<[BWWriteResGroup21], (instregex "PEXTRWmr")>;
-def: InstRW<[BWWriteResGroup21], (instregex "STMXCSR")>;
-def: InstRW<[BWWriteResGroup21], (instregex "VEXTRACTPSmr")>;
-def: InstRW<[BWWriteResGroup21], (instregex "VPEXTRBmr")>;
-def: InstRW<[BWWriteResGroup21], (instregex "VPEXTRDmr")>;
-def: InstRW<[BWWriteResGroup21], (instregex "VPEXTRQmr")>;
-def: InstRW<[BWWriteResGroup21], (instregex "VPEXTRWmr")>;
-def: InstRW<[BWWriteResGroup21], (instregex "VSTMXCSR")>;
+def: InstRW<[BWWriteResGroup20], (instrs CWD)>;
+def: InstRW<[BWWriteResGroup20], (instrs JCXZ, JECXZ, JRCXZ)>;
+def: InstRW<[BWWriteResGroup20], (instregex "ADC8i8",
+ "ADC8ri",
+ "SBB8i8",
+ "SBB8ri",
+ "SET(A|BE)r")>;
def BWWriteResGroup22 : SchedWriteRes<[BWPort4,BWPort6,BWPort237]> {
let Latency = 2;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[BWWriteResGroup22], (instregex "FNSTCW16m")>;
-
-def BWWriteResGroup23 : SchedWriteRes<[BWPort4,BWPort237,BWPort06]> {
- let Latency = 2;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[BWWriteResGroup23], (instregex "SETAEm")>;
-def: InstRW<[BWWriteResGroup23], (instregex "SETBm")>;
-def: InstRW<[BWWriteResGroup23], (instregex "SETEm")>;
-def: InstRW<[BWWriteResGroup23], (instregex "SETGEm")>;
-def: InstRW<[BWWriteResGroup23], (instregex "SETGm")>;
-def: InstRW<[BWWriteResGroup23], (instregex "SETLEm")>;
-def: InstRW<[BWWriteResGroup23], (instregex "SETLm")>;
-def: InstRW<[BWWriteResGroup23], (instregex "SETNEm")>;
-def: InstRW<[BWWriteResGroup23], (instregex "SETNOm")>;
-def: InstRW<[BWWriteResGroup23], (instregex "SETNPm")>;
-def: InstRW<[BWWriteResGroup23], (instregex "SETNSm")>;
-def: InstRW<[BWWriteResGroup23], (instregex "SETOm")>;
-def: InstRW<[BWWriteResGroup23], (instregex "SETPm")>;
-def: InstRW<[BWWriteResGroup23], (instregex "SETSm")>;
+def: InstRW<[BWWriteResGroup22], (instrs FNSTCW16m)>;
def BWWriteResGroup24 : SchedWriteRes<[BWPort4,BWPort237,BWPort15]> {
let Latency = 2;
@@ -1243,247 +733,55 @@ def BWWriteResGroup25 : SchedWriteRes<[BWPort4,BWPort237,BWPort0156]> {
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[BWWriteResGroup25], (instregex "PUSH(16|32|64)r(mr)?")>;
-def: InstRW<[BWWriteResGroup25], (instregex "PUSH64i8")>;
-def: InstRW<[BWWriteResGroup25], (instregex "STOSB")>;
-def: InstRW<[BWWriteResGroup25], (instregex "STOSL")>;
-def: InstRW<[BWWriteResGroup25], (instregex "STOSQ")>;
-def: InstRW<[BWWriteResGroup25], (instregex "STOSW")>;
-
-def BWWriteResGroup26 : SchedWriteRes<[BWPort0]> {
- let Latency = 3;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[BWWriteResGroup26], (instregex "MOVMSKPDrr")>;
-def: InstRW<[BWWriteResGroup26], (instregex "MOVMSKPSrr")>;
-def: InstRW<[BWWriteResGroup26], (instregex "PMOVMSKBrr")>;
-def: InstRW<[BWWriteResGroup26], (instregex "VMOVMSKPDYrr")>;
-def: InstRW<[BWWriteResGroup26], (instregex "VMOVMSKPDrr")>;
-def: InstRW<[BWWriteResGroup26], (instregex "VMOVMSKPSYrr")>;
-def: InstRW<[BWWriteResGroup26], (instregex "VMOVMSKPSrr")>;
-def: InstRW<[BWWriteResGroup26], (instregex "VPMOVMSKBYrr")>;
-def: InstRW<[BWWriteResGroup26], (instregex "VPMOVMSKBrr")>;
+def: InstRW<[BWWriteResGroup25], (instrs PUSH16r, PUSH32r, PUSH64r,
+ STOSB, STOSL, STOSQ, STOSW)>;
+def: InstRW<[BWWriteResGroup25], (instregex "PUSH(16|32|64)rmr",
+ "PUSH64i8")>;
def BWWriteResGroup27 : SchedWriteRes<[BWPort1]> {
let Latency = 3;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup27], (instregex "ADDPDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "ADDPSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "ADDSDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "ADDSSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "ADDSUBPDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "ADDSUBPSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "ADD_FPrST0")>;
-def: InstRW<[BWWriteResGroup27], (instregex "ADD_FST0r")>;
-def: InstRW<[BWWriteResGroup27], (instregex "ADD_FrST0")>;
-def: InstRW<[BWWriteResGroup27], (instregex "BSF(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "BSR(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "CMPPDrri")>;
-def: InstRW<[BWWriteResGroup27], (instregex "CMPPSrri")>;
-def: InstRW<[BWWriteResGroup27], (instregex "CMPSDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "CMPSSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "COMISDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "COMISSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "CVTDQ2PSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "CVTPS2DQrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "CVTTPS2DQrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "IMUL(32|64)rr(i8)?")>;
-def: InstRW<[BWWriteResGroup27], (instregex "IMUL8r")>;
-def: InstRW<[BWWriteResGroup27], (instregex "LZCNT(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "MAX(C?)PDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "MAX(C?)PSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "MAX(C?)SDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "MAX(C?)SSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "MIN(C?)PDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "MIN(C?)PSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "MIN(C?)SDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "MIN(C?)SSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "MMX_CVTPI2PSirr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "MUL8r")>;
-def: InstRW<[BWWriteResGroup27], (instregex "PDEP(32|64)rr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "PEXT(32|64)rr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "POPCNT(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "SHLD(16|32|64)rri8")>;
-def: InstRW<[BWWriteResGroup27], (instregex "SHRD(16|32|64)rri8")>;
-def: InstRW<[BWWriteResGroup27], (instregex "SUBPDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "SUBPSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "SUBR_FPrST0")>;
-def: InstRW<[BWWriteResGroup27], (instregex "SUBR_FST0r")>;
-def: InstRW<[BWWriteResGroup27], (instregex "SUBR_FrST0")>;
-def: InstRW<[BWWriteResGroup27], (instregex "SUBSDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "SUBSSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "SUB_FPrST0")>;
-def: InstRW<[BWWriteResGroup27], (instregex "SUB_FST0r")>;
-def: InstRW<[BWWriteResGroup27], (instregex "SUB_FrST0")>;
-def: InstRW<[BWWriteResGroup27], (instregex "TZCNT(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "UCOMISDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "UCOMISSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VADDPDYrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VADDPDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VADDPSYrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VADDPSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VADDSDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VADDSSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VADDSUBPDYrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VADDSUBPDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VADDSUBPSYrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VADDSUBPSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VCMPPDYrri")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VCMPPDrri")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VCMPPSYrri")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VCMPPSrri")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VCMPSDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VCMPSSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VCOMISDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VCOMISSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VCVTDQ2PSYrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VCVTDQ2PSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VCVTPS2DQYrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VCVTPS2DQrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VCVTTPS2DQYrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VCVTTPS2DQrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VMAX(C?)PDYrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VMAX(C?)PDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VMAX(C?)PSYrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VMAX(C?)PSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VMAX(C?)SDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VMAX(C?)SSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VMIN(C?)PDYrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VMIN(C?)PDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VMIN(C?)PSYrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VMIN(C?)PSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VMIN(C?)SDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VMIN(C?)SSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VSUBPDYrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VSUBPDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VSUBPSYrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VSUBPSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VSUBSDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VSUBSSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VUCOMISDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VUCOMISSrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "MMX_CVTPI2PSirr",
+ "PDEP(32|64)rr",
+ "PEXT(32|64)rr",
+ "SHLD(16|32|64)rri8",
+ "SHRD(16|32|64)rri8",
+ "(V?)CVTDQ2PS(Y?)rr")>;
def BWWriteResGroup27_16 : SchedWriteRes<[BWPort1, BWPort0156]> {
- let Latency = 3;
+ let Latency = 4;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup27_16], (instregex "IMUL16rr(i8)?")>;
+def: InstRW<[BWWriteResGroup27_16], (instrs IMUL16rri, IMUL16rri8)>;
def BWWriteResGroup28 : SchedWriteRes<[BWPort5]> {
let Latency = 3;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup28], (instregex "VBROADCASTSDYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VBROADCASTSSYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VEXTRACTF128rr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VEXTRACTI128rr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VINSERTF128rr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VINSERTI128rr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPBROADCASTBYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPBROADCASTBrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPBROADCASTDYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPBROADCASTQYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPBROADCASTWYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPBROADCASTWrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPERM2F128rr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPERM2I128rr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPERMDYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPERMPDYri")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPERMPSYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPERMQYri")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPMOVSXBDYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPMOVSXBQYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPMOVSXBWYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPMOVSXDQYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPMOVSXWDYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPMOVSXWQYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPMOVZXBDYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPMOVZXBQYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPMOVZXBWYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPMOVZXDQYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPMOVZXWDYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPMOVZXWQYrr")>;
-
-def BWWriteResGroup29 : SchedWriteRes<[BWPort01]> {
- let Latency = 3;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[BWWriteResGroup29], (instregex "MULPDrr")>;
-def: InstRW<[BWWriteResGroup29], (instregex "MULPSrr")>;
-def: InstRW<[BWWriteResGroup29], (instregex "MULSDrr")>;
-def: InstRW<[BWWriteResGroup29], (instregex "MULSSrr")>;
-def: InstRW<[BWWriteResGroup29], (instregex "VMULPDYrr")>;
-def: InstRW<[BWWriteResGroup29], (instregex "VMULPDrr")>;
-def: InstRW<[BWWriteResGroup29], (instregex "VMULPSYrr")>;
-def: InstRW<[BWWriteResGroup29], (instregex "VMULPSrr")>;
-def: InstRW<[BWWriteResGroup29], (instregex "VMULSDrr")>;
-def: InstRW<[BWWriteResGroup29], (instregex "VMULSSrr")>;
+def: InstRW<[BWWriteResGroup28], (instregex "VPBROADCASTBrr",
+ "VPBROADCASTWrr")>;
def BWWriteResGroup30 : SchedWriteRes<[BWPort0156]> {
- let Latency = 3;
+ let Latency = 2;
let NumMicroOps = 3;
let ResourceCycles = [3];
}
-def: InstRW<[BWWriteResGroup30], (instregex "XADD(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup30], (instregex "XADD8rr")>;
-def: InstRW<[BWWriteResGroup30], (instregex "XCHG8rr")>;
-
-def BWWriteResGroup31 : SchedWriteRes<[BWPort0,BWPort5]> {
- let Latency = 3;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[BWWriteResGroup31], (instregex "VPSLLVDYrr")>;
-def: InstRW<[BWWriteResGroup31], (instregex "VPSLLVDrr")>;
-def: InstRW<[BWWriteResGroup31], (instregex "VPSRAVDYrr")>;
-def: InstRW<[BWWriteResGroup31], (instregex "VPSRAVDrr")>;
-def: InstRW<[BWWriteResGroup31], (instregex "VPSRLVDYrr")>;
-def: InstRW<[BWWriteResGroup31], (instregex "VPSRLVDrr")>;
-
-def BWWriteResGroup32 : SchedWriteRes<[BWPort5,BWPort15]> {
- let Latency = 3;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[BWWriteResGroup32], (instregex "MMX_PHADDSWrr64")>;
-def: InstRW<[BWWriteResGroup32], (instregex "MMX_PHADDWrr64")>;
-def: InstRW<[BWWriteResGroup32], (instregex "MMX_PHADDrr64")>;
-def: InstRW<[BWWriteResGroup32], (instregex "MMX_PHSUBDrr64")>;
-def: InstRW<[BWWriteResGroup32], (instregex "MMX_PHSUBSWrr64")>;
-def: InstRW<[BWWriteResGroup32], (instregex "MMX_PHSUBWrr64")>;
-def: InstRW<[BWWriteResGroup32], (instregex "PHADDDrr")>;
-def: InstRW<[BWWriteResGroup32], (instregex "PHADDSWrr128")>;
-def: InstRW<[BWWriteResGroup32], (instregex "PHADDWrr")>;
-def: InstRW<[BWWriteResGroup32], (instregex "PHSUBDrr")>;
-def: InstRW<[BWWriteResGroup32], (instregex "PHSUBSWrr128")>;
-def: InstRW<[BWWriteResGroup32], (instregex "PHSUBWrr")>;
-def: InstRW<[BWWriteResGroup32], (instregex "VPHADDDYrr")>;
-def: InstRW<[BWWriteResGroup32], (instregex "VPHADDDrr")>;
-def: InstRW<[BWWriteResGroup32], (instregex "VPHADDSWrr128")>;
-def: InstRW<[BWWriteResGroup32], (instregex "VPHADDSWrr256")>;
-def: InstRW<[BWWriteResGroup32], (instregex "VPHADDWYrr")>;
-def: InstRW<[BWWriteResGroup32], (instregex "VPHADDWrr")>;
-def: InstRW<[BWWriteResGroup32], (instregex "VPHSUBDYrr")>;
-def: InstRW<[BWWriteResGroup32], (instregex "VPHSUBDrr")>;
-def: InstRW<[BWWriteResGroup32], (instregex "VPHSUBSWrr128")>;
-def: InstRW<[BWWriteResGroup32], (instregex "VPHSUBSWrr256")>;
-def: InstRW<[BWWriteResGroup32], (instregex "VPHSUBWYrr")>;
-def: InstRW<[BWWriteResGroup32], (instregex "VPHSUBWrr")>;
+def: InstRW<[BWWriteResGroup30], (instrs XADD8rr, XADD16rr, XADD32rr, XADD64rr,
+ XCHG8rr, XCHG16rr, XCHG32rr, XCHG64rr,
+ XCHG16ar, XCHG32ar, XCHG64ar)>;
def BWWriteResGroup33 : SchedWriteRes<[BWPort5,BWPort0156]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[BWWriteResGroup33], (instregex "MMX_PACKSSDWirr")>;
-def: InstRW<[BWWriteResGroup33], (instregex "MMX_PACKSSWBirr")>;
-def: InstRW<[BWWriteResGroup33], (instregex "MMX_PACKUSWBirr")>;
+def: InstRW<[BWWriteResGroup33], (instregex "MMX_PACKSSDWirr",
+ "MMX_PACKSSWBirr",
+ "MMX_PACKUSWBirr")>;
def BWWriteResGroup34 : SchedWriteRes<[BWPort6,BWPort0156]> {
let Latency = 3;
@@ -1497,30 +795,21 @@ def BWWriteResGroup35 : SchedWriteRes<[BWPort06,BWPort0156]> {
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[BWWriteResGroup35], (instregex "RCL(16|32|64)r1")>;
-def: InstRW<[BWWriteResGroup35], (instregex "RCL(16|32|64)ri")>;
-def: InstRW<[BWWriteResGroup35], (instregex "RCL8r1")>;
-def: InstRW<[BWWriteResGroup35], (instregex "RCL8ri")>;
-def: InstRW<[BWWriteResGroup35], (instregex "RCR(16|32|64)r1")>;
-def: InstRW<[BWWriteResGroup35], (instregex "RCR(16|32|64)ri")>;
-def: InstRW<[BWWriteResGroup35], (instregex "RCR8r1")>;
-def: InstRW<[BWWriteResGroup35], (instregex "RCR8ri")>;
+def: InstRW<[BWWriteResGroup35], (instregex "RCL(8|16|32|64)r1",
+ "RCL(8|16|32|64)ri",
+ "RCR(8|16|32|64)r1",
+ "RCR(8|16|32|64)ri")>;
def BWWriteResGroup36 : SchedWriteRes<[BWPort06,BWPort0156]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[BWWriteResGroup36], (instregex "ROL(16|32|64)rCL")>;
-def: InstRW<[BWWriteResGroup36], (instregex "ROL8rCL")>;
-def: InstRW<[BWWriteResGroup36], (instregex "ROR(16|32|64)rCL")>;
-def: InstRW<[BWWriteResGroup36], (instregex "ROR8rCL")>;
-def: InstRW<[BWWriteResGroup36], (instregex "SAR(16|32|64)rCL")>;
-def: InstRW<[BWWriteResGroup36], (instregex "SAR8rCL")>;
-def: InstRW<[BWWriteResGroup36], (instregex "SHL(16|32|64)rCL")>;
-def: InstRW<[BWWriteResGroup36], (instregex "SHL8rCL")>;
-def: InstRW<[BWWriteResGroup36], (instregex "SHR(16|32|64)rCL")>;
-def: InstRW<[BWWriteResGroup36], (instregex "SHR8rCL")>;
+def: InstRW<[BWWriteResGroup36], (instregex "ROL(8|16|32|64)rCL",
+ "ROR(8|16|32|64)rCL",
+ "SAR(8|16|32|64)rCL",
+ "SHL(8|16|32|64)rCL",
+ "SHR(8|16|32|64)rCL")>;
def BWWriteResGroup37 : SchedWriteRes<[BWPort4,BWPort6,BWPort237,BWPort0156]> {
let Latency = 3;
@@ -1534,31 +823,18 @@ def BWWriteResGroup38 : SchedWriteRes<[BWPort4,BWPort237,BWPort06,BWPort0156]> {
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[BWWriteResGroup38], (instregex "CALL64pcrel32")>;
-def: InstRW<[BWWriteResGroup38], (instregex "SETAm")>;
-def: InstRW<[BWWriteResGroup38], (instregex "SETBEm")>;
+def: InstRW<[BWWriteResGroup38], (instrs CALL64pcrel32)>;
+def: InstRW<[BWWriteResGroup38], (instregex "SET(A|BE)m")>;
def BWWriteResGroup39 : SchedWriteRes<[BWPort0,BWPort1]> {
let Latency = 4;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup39], (instregex "CVTSD2SI64rr")>;
-def: InstRW<[BWWriteResGroup39], (instregex "CVTSD2SIrr")>;
-def: InstRW<[BWWriteResGroup39], (instregex "CVTSS2SI64rr")>;
-def: InstRW<[BWWriteResGroup39], (instregex "CVTSS2SIrr")>;
-def: InstRW<[BWWriteResGroup39], (instregex "CVTTSD2SI64rr")>;
-def: InstRW<[BWWriteResGroup39], (instregex "CVTTSD2SIrr")>;
-def: InstRW<[BWWriteResGroup39], (instregex "CVTTSS2SI64rr")>;
-def: InstRW<[BWWriteResGroup39], (instregex "CVTTSS2SIrr")>;
-def: InstRW<[BWWriteResGroup39], (instregex "VCVTSD2SI64rr")>;
-def: InstRW<[BWWriteResGroup39], (instregex "VCVTSD2SIrr")>;
-def: InstRW<[BWWriteResGroup39], (instregex "VCVTSS2SI64rr")>;
-def: InstRW<[BWWriteResGroup39], (instregex "VCVTSS2SIrr")>;
-def: InstRW<[BWWriteResGroup39], (instregex "VCVTTSD2SI64rr")>;
-def: InstRW<[BWWriteResGroup39], (instregex "VCVTTSD2SIrr")>;
-def: InstRW<[BWWriteResGroup39], (instregex "VCVTTSS2SI64rr")>;
-def: InstRW<[BWWriteResGroup39], (instregex "VCVTTSS2SIrr")>;
+def: InstRW<[BWWriteResGroup39], (instregex "(V?)CVT(T?)SD2SI64rr",
+ "(V?)CVT(T?)SD2SIrr",
+ "(V?)CVT(T?)SS2SI64rr",
+ "(V?)CVT(T?)SS2SIrr")>;
def BWWriteResGroup40 : SchedWriteRes<[BWPort0,BWPort5]> {
let Latency = 4;
@@ -1566,241 +842,98 @@ def BWWriteResGroup40 : SchedWriteRes<[BWPort0,BWPort5]> {
let ResourceCycles = [1,1];
}
def: InstRW<[BWWriteResGroup40], (instregex "VCVTPS2PDYrr")>;
-def: InstRW<[BWWriteResGroup40], (instregex "VPSLLDYrr")>;
-def: InstRW<[BWWriteResGroup40], (instregex "VPSLLQYrr")>;
-def: InstRW<[BWWriteResGroup40], (instregex "VPSLLWYrr")>;
-def: InstRW<[BWWriteResGroup40], (instregex "VPSRADYrr")>;
-def: InstRW<[BWWriteResGroup40], (instregex "VPSRAWYrr")>;
-def: InstRW<[BWWriteResGroup40], (instregex "VPSRLDYrr")>;
-def: InstRW<[BWWriteResGroup40], (instregex "VPSRLQYrr")>;
-def: InstRW<[BWWriteResGroup40], (instregex "VPSRLWYrr")>;
-def: InstRW<[BWWriteResGroup40], (instregex "VPTESTYrr")>;
def BWWriteResGroup41 : SchedWriteRes<[BWPort0,BWPort0156]> {
let Latency = 4;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup41], (instregex "FNSTSW16r")>;
+def: InstRW<[BWWriteResGroup41], (instrs FNSTSW16r)>;
def BWWriteResGroup42 : SchedWriteRes<[BWPort1,BWPort5]> {
let Latency = 4;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup42], (instregex "CVTDQ2PDrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "CVTPD2DQrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "CVTPD2PSrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "CVTSD2SSrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "CVTSI642SDrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "CVTSI2SDrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "CVTSI2SSrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "CVTTPD2DQrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "IMUL(32|64)r")>;
-def: InstRW<[BWWriteResGroup42], (instregex "MMX_CVTPD2PIirr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "MMX_CVTPI2PDirr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "MMX_CVTPS2PIirr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "MMX_CVTTPD2PIirr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "MMX_CVTTPS2PIirr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "MUL(32|64)r")>;
-def: InstRW<[BWWriteResGroup42], (instregex "MULX64rr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "VCVTDQ2PDrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "VCVTPD2DQrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "VCVTPD2PSrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "VCVTPS2PHrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "VCVTSD2SSrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "VCVTSI642SDrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "VCVTSI2SDrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "VCVTSI2SSrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "VCVTTPD2DQrr")>;
+def: InstRW<[BWWriteResGroup42], (instrs IMUL64r, MUL64r, MULX64rr)>;
+def: InstRW<[BWWriteResGroup42], (instregex "MMX_CVTPI2PDirr",
+ "MMX_CVT(T?)PD2PIirr",
+ "MMX_CVT(T?)PS2PIirr",
+ "(V?)CVTDQ2PDrr",
+ "(V?)CVTPD2PSrr",
+ "(V?)CVTSD2SSrr",
+ "(V?)CVTSI642SDrr",
+ "(V?)CVTSI2SDrr",
+ "(V?)CVTSI2SSrr",
+ "(V?)CVT(T?)PD2DQrr")>;
def BWWriteResGroup42_16 : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> {
let Latency = 4;
let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
}
-def: InstRW<[BWWriteResGroup42_16], (instregex "IMUL16r")>;
-def: InstRW<[BWWriteResGroup42_16], (instregex "MUL16r")>;
+def: InstRW<[BWWriteResGroup42_16], (instrs IMUL16r, MUL16r)>;
def BWWriteResGroup43 : SchedWriteRes<[BWPort0,BWPort4,BWPort237]> {
let Latency = 4;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[BWWriteResGroup43], (instregex "FNSTSWm")>;
+def: InstRW<[BWWriteResGroup43], (instrs FNSTSWm)>;
def BWWriteResGroup44 : SchedWriteRes<[BWPort1,BWPort4,BWPort237]> {
let Latency = 4;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[BWWriteResGroup44], (instregex "ISTT_FP16m")>;
-def: InstRW<[BWWriteResGroup44], (instregex "ISTT_FP32m")>;
-def: InstRW<[BWWriteResGroup44], (instregex "ISTT_FP64m")>;
-def: InstRW<[BWWriteResGroup44], (instregex "IST_F16m")>;
-def: InstRW<[BWWriteResGroup44], (instregex "IST_F32m")>;
-def: InstRW<[BWWriteResGroup44], (instregex "IST_FP16m")>;
-def: InstRW<[BWWriteResGroup44], (instregex "IST_FP32m")>;
-def: InstRW<[BWWriteResGroup44], (instregex "IST_FP64m")>;
-def: InstRW<[BWWriteResGroup44], (instregex "VCVTPS2PHYmr")>;
-def: InstRW<[BWWriteResGroup44], (instregex "VCVTPS2PHmr")>;
+def: InstRW<[BWWriteResGroup44], (instregex "IST(T?)_FP(16|32|64)m",
+ "IST_F(16|32)m")>;
def BWWriteResGroup45 : SchedWriteRes<[BWPort0156]> {
let Latency = 4;
let NumMicroOps = 4;
let ResourceCycles = [4];
}
-def: InstRW<[BWWriteResGroup45], (instregex "FNCLEX")>;
+def: InstRW<[BWWriteResGroup45], (instrs FNCLEX)>;
def BWWriteResGroup46 : SchedWriteRes<[BWPort015,BWPort0156]> {
let Latency = 4;
let NumMicroOps = 4;
let ResourceCycles = [1,3];
}
-def: InstRW<[BWWriteResGroup46], (instregex "VZEROUPPER")>;
+def: InstRW<[BWWriteResGroup46], (instrs VZEROUPPER)>;
def BWWriteResGroup47 : SchedWriteRes<[BWPort0]> {
let Latency = 5;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup47], (instregex "MMX_PMADDUBSWrr64")>;
-def: InstRW<[BWWriteResGroup47], (instregex "MMX_PMADDWDirr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "MMX_PMULHRSWrr64")>;
-def: InstRW<[BWWriteResGroup47], (instregex "MMX_PMULHUWirr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "MMX_PMULHWirr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "MMX_PMULLWirr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "MMX_PMULUDQirr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "MMX_PSADBWirr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "MUL_FPrST0")>;
-def: InstRW<[BWWriteResGroup47], (instregex "MUL_FST0r")>;
-def: InstRW<[BWWriteResGroup47], (instregex "MUL_FrST0")>;
-def: InstRW<[BWWriteResGroup47], (instregex "PCLMULQDQrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "PCMPGTQrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "PHMINPOSUWrr128")>;
-def: InstRW<[BWWriteResGroup47], (instregex "PMADDUBSWrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "PMADDWDrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "PMULDQrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "PMULHRSWrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "PMULHUWrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "PMULHWrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "PMULLWrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "PMULUDQrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "PSADBWrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "RCPPSr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "RCPSSr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "RSQRTPSr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "RSQRTSSr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPCLMULQDQrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPCMPGTQYrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPCMPGTQrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPHMINPOSUWrr128")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMADDUBSWYrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMADDUBSWrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMADDWDYrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMADDWDrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMULDQYrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMULDQrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMULHRSWYrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMULHRSWrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMULHUWYrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMULHUWrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMULHWYrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMULHWrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMULLWYrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMULLWrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMULUDQYrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMULUDQrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPSADBWYrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPSADBWrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VRCPPSr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VRCPSSr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VRSQRTPSr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VRSQRTSSr")>;
-
-def BWWriteResGroup48 : SchedWriteRes<[BWPort01]> {
- let Latency = 5;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[BWWriteResGroup48],
- (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)(Y)?r",
- "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)r")>;
+def: InstRW<[BWWriteResGroup47], (instregex "(V?)PCMPGTQ(Y?)rr",
+ "MUL_(FPrST0|FST0r|FrST0)")>;
def BWWriteResGroup49 : SchedWriteRes<[BWPort23]> {
let Latency = 5;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup49], (instregex "LDDQUrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MMX_MOVD64from64rm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MMX_MOVD64rm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MMX_MOVD64to64rm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MMX_MOVQ64rm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOV(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOV64toPQIrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOV8rm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVAPDrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVAPSrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVDDUPrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVDI2PDIrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVDQArm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVDQUrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVNTDQArm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVQI2PQIrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVSDrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVSHDUPrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVSLDUPrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVSSrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVSX(16|32|64)rm16")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVSX(16|32|64)rm32")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVSX(16|32|64)rm8")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVUPDrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVUPSrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVZX(16|32|64)rm16")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVZX(16|32|64)rm8")>;
-def: InstRW<[BWWriteResGroup49], (instregex "PREFETCHNTA")>;
-def: InstRW<[BWWriteResGroup49], (instregex "PREFETCHT0")>;
-def: InstRW<[BWWriteResGroup49], (instregex "PREFETCHT1")>;
-def: InstRW<[BWWriteResGroup49], (instregex "PREFETCHT2")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VBROADCASTSSrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VLDDQUrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VMOV64toPQIrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VMOVAPDrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VMOVAPSrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VMOVDDUPrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VMOVDI2PDIrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VMOVDQArm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VMOVDQUrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VMOVNTDQArm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VMOVQI2PQIrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VMOVSDrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VMOVSHDUPrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VMOVSLDUPrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VMOVSSrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VMOVUPDrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VMOVUPSrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VPBROADCASTDrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VPBROADCASTQrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "MOVSX(16|32|64)rm16",
+ "MOVSX(16|32|64)rm32",
+ "MOVSX(16|32|64)rm8",
+ "MOVZX(16|32|64)rm16",
+ "MOVZX(16|32|64)rm8",
+ "VBROADCASTSSrm",
+ "(V?)MOVDDUPrm",
+ "(V?)MOVSHDUPrm",
+ "(V?)MOVSLDUPrm",
+ "VPBROADCASTDrm",
+ "VPBROADCASTQrm")>;
def BWWriteResGroup50 : SchedWriteRes<[BWPort1,BWPort5]> {
let Latency = 5;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[BWWriteResGroup50], (instregex "CVTSI642SSrr")>;
-def: InstRW<[BWWriteResGroup50], (instregex "HADDPDrr")>;
-def: InstRW<[BWWriteResGroup50], (instregex "HADDPSrr")>;
-def: InstRW<[BWWriteResGroup50], (instregex "HSUBPDrr")>;
-def: InstRW<[BWWriteResGroup50], (instregex "HSUBPSrr")>;
-def: InstRW<[BWWriteResGroup50], (instregex "VCVTSI642SSrr")>;
-def: InstRW<[BWWriteResGroup50], (instregex "VHADDPDYrr")>;
-def: InstRW<[BWWriteResGroup50], (instregex "VHADDPDrr")>;
-def: InstRW<[BWWriteResGroup50], (instregex "VHADDPSYrr")>;
-def: InstRW<[BWWriteResGroup50], (instregex "VHADDPSrr")>;
-def: InstRW<[BWWriteResGroup50], (instregex "VHSUBPDYrr")>;
-def: InstRW<[BWWriteResGroup50], (instregex "VHSUBPDrr")>;
-def: InstRW<[BWWriteResGroup50], (instregex "VHSUBPSYrr")>;
-def: InstRW<[BWWriteResGroup50], (instregex "VHSUBPSrr")>;
+def: InstRW<[BWWriteResGroup50], (instregex "(V?)CVTSI642SSrr")>;
def BWWriteResGroup51 : SchedWriteRes<[BWPort1,BWPort6,BWPort06]> {
let Latency = 5;
@@ -1810,482 +943,125 @@ def BWWriteResGroup51 : SchedWriteRes<[BWPort1,BWPort6,BWPort06]> {
def: InstRW<[BWWriteResGroup51], (instregex "STR(16|32|64)r")>;
def BWWriteResGroup52 : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> {
- let Latency = 5;
+ let Latency = 4;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[BWWriteResGroup52], (instregex "MULX32rr")>;
-
-def BWWriteResGroup53 : SchedWriteRes<[BWPort0,BWPort4,BWPort237,BWPort15]> {
- let Latency = 5;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,1,1];
-}
-def: InstRW<[BWWriteResGroup53], (instregex "VMASKMOVPDYmr")>;
-def: InstRW<[BWWriteResGroup53], (instregex "VMASKMOVPDmr")>;
-def: InstRW<[BWWriteResGroup53], (instregex "VMASKMOVPSYmr")>;
-def: InstRW<[BWWriteResGroup53], (instregex "VMASKMOVPSmr")>;
-def: InstRW<[BWWriteResGroup53], (instregex "VPMASKMOVDYmr")>;
-def: InstRW<[BWWriteResGroup53], (instregex "VPMASKMOVDmr")>;
-def: InstRW<[BWWriteResGroup53], (instregex "VPMASKMOVQYmr")>;
-def: InstRW<[BWWriteResGroup53], (instregex "VPMASKMOVQmr")>;
+def: InstRW<[BWWriteResGroup52], (instrs IMUL32r, MUL32r, MULX32rr)>;
def BWWriteResGroup54 : SchedWriteRes<[BWPort6,BWPort0156]> {
let Latency = 5;
let NumMicroOps = 5;
let ResourceCycles = [1,4];
}
-def: InstRW<[BWWriteResGroup54], (instregex "PAUSE")>;
+def: InstRW<[BWWriteResGroup54], (instrs PAUSE)>;
def BWWriteResGroup55 : SchedWriteRes<[BWPort06,BWPort0156]> {
let Latency = 5;
let NumMicroOps = 5;
let ResourceCycles = [1,4];
}
-def: InstRW<[BWWriteResGroup55], (instregex "XSETBV")>;
+def: InstRW<[BWWriteResGroup55], (instrs XSETBV)>;
def BWWriteResGroup56 : SchedWriteRes<[BWPort06,BWPort0156]> {
let Latency = 5;
let NumMicroOps = 5;
let ResourceCycles = [2,3];
}
-def: InstRW<[BWWriteResGroup56], (instregex "CMPXCHG(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup56], (instregex "CMPXCHG8rr")>;
+def: InstRW<[BWWriteResGroup56], (instregex "CMPXCHG(8|16|32|64)rr")>;
def BWWriteResGroup57 : SchedWriteRes<[BWPort4,BWPort237,BWPort0156]> {
let Latency = 5;
let NumMicroOps = 6;
let ResourceCycles = [1,1,4];
}
-def: InstRW<[BWWriteResGroup57], (instregex "PUSHF16")>;
-def: InstRW<[BWWriteResGroup57], (instregex "PUSHF64")>;
+def: InstRW<[BWWriteResGroup57], (instregex "PUSHF(16|64)")>;
def BWWriteResGroup58 : SchedWriteRes<[BWPort23]> {
let Latency = 6;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup58], (instregex "LD_F32m")>;
-def: InstRW<[BWWriteResGroup58], (instregex "LD_F64m")>;
-def: InstRW<[BWWriteResGroup58], (instregex "LD_F80m")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VBROADCASTF128")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VBROADCASTI128")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VBROADCASTSDYrm")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VBROADCASTSSYrm")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VLDDQUYrm")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VMOVAPDYrm")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VMOVAPSYrm")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VMOVDDUPYrm")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VMOVDQAYrm")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VMOVDQUYrm")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VMOVNTDQAYrm")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VMOVSHDUPYrm")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VMOVSLDUPYrm")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VMOVUPDYrm")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VMOVUPSYrm")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VPBROADCASTDYrm")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VPBROADCASTQYrm")>;
-def: InstRW<[BWWriteResGroup58], (instregex "ROUNDPDr")>;
-def: InstRW<[BWWriteResGroup58], (instregex "ROUNDPSr")>;
-def: InstRW<[BWWriteResGroup58], (instregex "ROUNDSDr")>;
-def: InstRW<[BWWriteResGroup58], (instregex "ROUNDSSr")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VROUNDPDr")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VROUNDPSr")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VROUNDSDr")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VROUNDSSr")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VROUNDYPDr")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VROUNDYPSr")>;
+def: InstRW<[BWWriteResGroup58], (instregex "LD_F(32|64|80)m",
+ "VBROADCASTF128",
+ "VBROADCASTI128",
+ "VBROADCASTSDYrm",
+ "VBROADCASTSSYrm",
+ "VMOVDDUPYrm",
+ "VMOVSHDUPYrm",
+ "VMOVSLDUPYrm",
+ "VPBROADCASTDYrm",
+ "VPBROADCASTQYrm")>;
def BWWriteResGroup59 : SchedWriteRes<[BWPort0,BWPort23]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup59], (instregex "CVTPS2PDrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "CVTSS2SDrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "MMX_PSLLDrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "MMX_PSLLQrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "MMX_PSLLWrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "MMX_PSRADrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "MMX_PSRAWrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "MMX_PSRLDrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "MMX_PSRLQrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "MMX_PSRLWrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "VCVTPH2PSYrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "VCVTPH2PSrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "VCVTPS2PDrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "VCVTSS2SDrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "VPSLLVQrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "VPSRLVQrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "VTESTPDrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "VTESTPSrm")>;
+def: InstRW<[BWWriteResGroup59], (instregex "(V?)CVTPS2PDrm",
+ "(V?)CVTSS2SDrm",
+ "VPSLLVQrm",
+ "VPSRLVQrm")>;
def BWWriteResGroup60 : SchedWriteRes<[BWPort1,BWPort5]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup60], (instregex "VCVTDQ2PDYrr")>;
-def: InstRW<[BWWriteResGroup60], (instregex "VCVTPD2DQYrr")>;
-def: InstRW<[BWWriteResGroup60], (instregex "VCVTPD2PSYrr")>;
-def: InstRW<[BWWriteResGroup60], (instregex "VCVTPS2PHYrr")>;
-def: InstRW<[BWWriteResGroup60], (instregex "VCVTTPD2DQYrr")>;
-
-def BWWriteResGroup61 : SchedWriteRes<[BWPort5,BWPort23]> {
- let Latency = 6;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[BWWriteResGroup61], (instregex "ANDNPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "ANDNPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "ANDPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "ANDPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "INSERTPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "MMX_PALIGNR64irm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "MMX_PINSRWirmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "MMX_PSHUFBrm64")>;
-def: InstRW<[BWWriteResGroup61], (instregex "MMX_PSHUFWmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "MMX_PUNPCKHBWirm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "MMX_PUNPCKHDQirm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "MMX_PUNPCKHWDirm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "MMX_PUNPCKLBWirm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "MMX_PUNPCKLDQirm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "MMX_PUNPCKLWDirm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "MOVHPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "MOVHPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "MOVLPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "MOVLPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "ORPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "ORPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PACKSSDWrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PACKSSWBrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PACKUSDWrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PACKUSWBrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PALIGNRrmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PBLENDWrmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PINSRBrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PINSRDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PINSRQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PINSRWrmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PMOVSXBDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PMOVSXBQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PMOVSXBWrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PMOVSXDQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PMOVSXWDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PMOVSXWQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PMOVZXBDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PMOVZXBQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PMOVZXBWrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PMOVZXDQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PMOVZXWDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PMOVZXWQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PSHUFBrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PSHUFDmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PSHUFHWmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PSHUFLWmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKHBWrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKHDQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKHQDQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKHWDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKLBWrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKLDQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKLQDQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKLWDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "SHUFPDrmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "SHUFPSrmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "UNPCKHPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "UNPCKHPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "UNPCKLPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "UNPCKLPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VANDNPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VANDNPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VANDPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VANDPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VINSERTPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VMOVHPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VMOVHPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VMOVLPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VMOVLPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VORPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VORPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPACKSSDWrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPACKSSWBrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPACKUSDWrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPACKUSWBrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPALIGNRrmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPBLENDWrmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPERMILPDmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPERMILPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPERMILPSmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPERMILPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPINSRBrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPINSRDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPINSRQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPINSRWrmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPMOVSXBDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPMOVSXBQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPMOVSXBWrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPMOVSXDQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPMOVSXWDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPMOVSXWQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPMOVZXBDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPMOVZXBQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPMOVZXBWrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPMOVZXDQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPMOVZXWDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPMOVZXWQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPSHUFBrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPSHUFDmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPSHUFHWmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPSHUFLWmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKHBWrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKHDQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKHQDQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKHWDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKLBWrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKLDQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKLQDQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKLWDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VSHUFPDrmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VSHUFPSrmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VUNPCKHPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VUNPCKHPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VUNPCKLPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VUNPCKLPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VXORPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VXORPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "XORPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "XORPSrm")>;
+def: InstRW<[BWWriteResGroup60], (instregex "VCVTDQ2PDYrr",
+ "VCVTPD2PSYrr",
+ "VCVT(T?)PD2DQYrr")>;
def BWWriteResGroup62 : SchedWriteRes<[BWPort6,BWPort23]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup62], (instregex "FARJMP64")>;
-def: InstRW<[BWWriteResGroup62], (instregex "JMP(16|32|64)m")>;
+def: InstRW<[BWWriteResGroup62], (instregex "FARJMP64",
+ "JMP(16|32|64)m")>;
def BWWriteResGroup63 : SchedWriteRes<[BWPort23,BWPort06]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup63], (instregex "ADC(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "ADC8rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "ADCX(32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "ADOX(32|64)rm")>;
def: InstRW<[BWWriteResGroup63], (instregex "BT(16|32|64)mi8")>;
-def: InstRW<[BWWriteResGroup63], (instregex "CMOVAE(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "CMOVB(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "CMOVE(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "CMOVG(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "CMOVGE(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "CMOVL(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "CMOVLE(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "CMOVNE(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "CMOVNO(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "CMOVNP(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "CMOVNS(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "CMOVO(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "CMOVP(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "CMOVS(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "RORX(32|64)mi")>;
-def: InstRW<[BWWriteResGroup63], (instregex "SARX(32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "SBB(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "SBB8rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "SHLX(32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "SHRX(32|64)rm")>;
def BWWriteResGroup64 : SchedWriteRes<[BWPort23,BWPort15]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup64], (instregex "ANDN(32|64)rm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "BLSI(32|64)rm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "BLSMSK(32|64)rm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "BLSR(32|64)rm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "BZHI(32|64)rm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PABSBrm64")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PABSDrm64")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PABSWrm64")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDBirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDDirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDQirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDSBirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDSWirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDUSBirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDUSWirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDWirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PAVGBirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PAVGWirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PCMPEQBirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PCMPEQDirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PCMPEQWirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PCMPGTBirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PCMPGTDirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PCMPGTWirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PMAXSWirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PMAXUBirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PMINSWirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PMINUBirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSIGNBrm64")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSIGNDrm64")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSIGNWrm64")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBBirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBDirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBQirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBSBirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBSWirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBUSBirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBUSWirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBWirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MOVBE(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PABSBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PABSDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PABSWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PADDBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PADDDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PADDQrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PADDSBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PADDSWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PADDUSBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PADDUSWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PADDWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PAVGBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PAVGWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PCMPEQBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PCMPEQDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PCMPEQQrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PCMPEQWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PCMPGTBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PCMPGTDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PCMPGTWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PMAXSBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PMAXSDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PMAXSWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PMAXUBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PMAXUDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PMAXUWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PMINSBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PMINSDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PMINSWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PMINUBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PMINUDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PMINUWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PSIGNBrm128")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PSIGNDrm128")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PSIGNWrm128")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PSUBBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PSUBDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PSUBQrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PSUBSBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PSUBSWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PSUBUSBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PSUBUSWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PSUBWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPABSBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPABSDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPABSWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPADDBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPADDDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPADDQrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPADDSBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPADDSWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPADDUSBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPADDUSWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPADDWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPAVGBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPAVGWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPCMPEQBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPCMPEQDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPCMPEQQrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPCMPEQWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPCMPGTBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPCMPGTDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPCMPGTWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPMAXSBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPMAXSDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPMAXSWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPMAXUBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPMAXUDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPMAXUWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPMINSBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPMINSDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPMINSWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPMINUBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPMINUDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPMINUWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPSIGNBrm128")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPSIGNDrm128")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPSIGNWrm128")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPSUBBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPSUBDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPSUBQrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPSUBSBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPSUBSWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPSUBUSBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPSUBUSWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPSUBWrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "ANDN(32|64)rm",
+ "BLSI(32|64)rm",
+ "BLSMSK(32|64)rm",
+ "BLSR(32|64)rm",
+ "MOVBE(16|32|64)rm")>;
def BWWriteResGroup65 : SchedWriteRes<[BWPort23,BWPort015]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup65], (instregex "BLENDPDrmi")>;
-def: InstRW<[BWWriteResGroup65], (instregex "BLENDPSrmi")>;
-def: InstRW<[BWWriteResGroup65], (instregex "MMX_PANDNirm")>;
-def: InstRW<[BWWriteResGroup65], (instregex "MMX_PANDirm")>;
-def: InstRW<[BWWriteResGroup65], (instregex "MMX_PORirm")>;
-def: InstRW<[BWWriteResGroup65], (instregex "MMX_PXORirm")>;
-def: InstRW<[BWWriteResGroup65], (instregex "PANDNrm")>;
-def: InstRW<[BWWriteResGroup65], (instregex "PANDrm")>;
-def: InstRW<[BWWriteResGroup65], (instregex "PORrm")>;
-def: InstRW<[BWWriteResGroup65], (instregex "PXORrm")>;
-def: InstRW<[BWWriteResGroup65], (instregex "VBLENDPDrmi")>;
-def: InstRW<[BWWriteResGroup65], (instregex "VBLENDPSrmi")>;
-def: InstRW<[BWWriteResGroup65], (instregex "VINSERTF128rm")>;
-def: InstRW<[BWWriteResGroup65], (instregex "VINSERTI128rm")>;
-def: InstRW<[BWWriteResGroup65], (instregex "VPANDNrm")>;
-def: InstRW<[BWWriteResGroup65], (instregex "VPANDrm")>;
-def: InstRW<[BWWriteResGroup65], (instregex "VPBLENDDrmi")>;
-def: InstRW<[BWWriteResGroup65], (instregex "VPORrm")>;
-def: InstRW<[BWWriteResGroup65], (instregex "VPXORrm")>;
+def: InstRW<[BWWriteResGroup65], (instregex "VINSERTF128rm",
+ "VINSERTI128rm",
+ "VPBLENDDrmi")>;
def BWWriteResGroup66 : SchedWriteRes<[BWPort23,BWPort0156]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup66], (instregex "ADD(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup66], (instregex "ADD8rm")>;
-def: InstRW<[BWWriteResGroup66], (instregex "AND(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup66], (instregex "AND8rm")>;
-def: InstRW<[BWWriteResGroup66], (instregex "CMP(16|32|64)mi")>;
-def: InstRW<[BWWriteResGroup66], (instregex "CMP(16|32|64)mr")>;
-def: InstRW<[BWWriteResGroup66], (instregex "CMP(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup66], (instregex "CMP8mi")>;
-def: InstRW<[BWWriteResGroup66], (instregex "CMP8mr")>;
-def: InstRW<[BWWriteResGroup66], (instregex "CMP8rm")>;
-def: InstRW<[BWWriteResGroup66], (instregex "OR(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup66], (instregex "OR8rm")>;
-def: InstRW<[BWWriteResGroup66], (instregex "POP(16|32|64)r(mr)?")>;
-def: InstRW<[BWWriteResGroup66], (instregex "SUB(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup66], (instregex "SUB8rm")>;
-def: InstRW<[BWWriteResGroup66], (instregex "TEST(16|32|64)mr")>;
-def: InstRW<[BWWriteResGroup66], (instregex "TEST8mi")>;
-def: InstRW<[BWWriteResGroup66], (instregex "TEST8mr")>;
-def: InstRW<[BWWriteResGroup66], (instregex "XOR(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup66], (instregex "XOR8rm")>;
+def: InstRW<[BWWriteResGroup66], (instrs POP16r, POP32r, POP64r)>;
+def: InstRW<[BWWriteResGroup66], (instregex "POP(16|32|64)rmr")>;
def BWWriteResGroup67 : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> {
let Latency = 6;
let NumMicroOps = 4;
let ResourceCycles = [1,1,2];
}
-def: InstRW<[BWWriteResGroup67], (instregex "SHLD(16|32|64)rrCL")>;
-def: InstRW<[BWWriteResGroup67], (instregex "SHRD(16|32|64)rrCL")>;
+def: InstRW<[BWWriteResGroup67], (instregex "SHLD(16|32|64)rrCL",
+ "SHRD(16|32|64)rrCL")>;
def BWWriteResGroup68 : SchedWriteRes<[BWPort1,BWPort6,BWPort06,BWPort0156]> {
let Latency = 6;
@@ -2299,665 +1075,209 @@ def BWWriteResGroup69 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort06]> {
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[BWWriteResGroup69], (instregex "BTC(16|32|64)mi8")>;
-def: InstRW<[BWWriteResGroup69], (instregex "BTR(16|32|64)mi8")>;
-def: InstRW<[BWWriteResGroup69], (instregex "BTS(16|32|64)mi8")>;
-def: InstRW<[BWWriteResGroup69], (instregex "SAR(16|32|64)m1")>;
-def: InstRW<[BWWriteResGroup69], (instregex "SAR(16|32|64)mi")>;
-def: InstRW<[BWWriteResGroup69], (instregex "SAR8m1")>;
-def: InstRW<[BWWriteResGroup69], (instregex "SAR8mi")>;
-def: InstRW<[BWWriteResGroup69], (instregex "SHL(16|32|64)m1")>;
-def: InstRW<[BWWriteResGroup69], (instregex "SHL(16|32|64)mi")>;
-def: InstRW<[BWWriteResGroup69], (instregex "SHL8m1")>;
-def: InstRW<[BWWriteResGroup69], (instregex "SHL8mi")>;
-def: InstRW<[BWWriteResGroup69], (instregex "SHR(16|32|64)m1")>;
-def: InstRW<[BWWriteResGroup69], (instregex "SHR(16|32|64)mi")>;
-def: InstRW<[BWWriteResGroup69], (instregex "SHR8m1")>;
-def: InstRW<[BWWriteResGroup69], (instregex "SHR8mi")>;
+def: InstRW<[BWWriteResGroup69], (instregex "BTC(16|32|64)mi8",
+ "BTR(16|32|64)mi8",
+ "BTS(16|32|64)mi8",
+ "SAR(8|16|32|64)m1",
+ "SAR(8|16|32|64)mi",
+ "SHL(8|16|32|64)m1",
+ "SHL(8|16|32|64)mi",
+ "SHR(8|16|32|64)m1",
+ "SHR(8|16|32|64)mi")>;
def BWWriteResGroup70 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort0156]> {
let Latency = 6;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[BWWriteResGroup70], (instregex "ADD(16|32|64)mi")>;
-def: InstRW<[BWWriteResGroup70], (instregex "ADD(16|32|64)mr")>;
-def: InstRW<[BWWriteResGroup70], (instregex "ADD8mi")>;
-def: InstRW<[BWWriteResGroup70], (instregex "ADD8mr")>;
-def: InstRW<[BWWriteResGroup70], (instregex "AND(16|32|64)mi")>;
-def: InstRW<[BWWriteResGroup70], (instregex "AND(16|32|64)mr")>;
-def: InstRW<[BWWriteResGroup70], (instregex "AND8mi")>;
-def: InstRW<[BWWriteResGroup70], (instregex "AND8mr")>;
-def: InstRW<[BWWriteResGroup70], (instregex "DEC(16|32|64)m")>;
-def: InstRW<[BWWriteResGroup70], (instregex "DEC8m")>;
-def: InstRW<[BWWriteResGroup70], (instregex "INC(16|32|64)m")>;
-def: InstRW<[BWWriteResGroup70], (instregex "INC8m")>;
-def: InstRW<[BWWriteResGroup70], (instregex "NEG(16|32|64)m")>;
-def: InstRW<[BWWriteResGroup70], (instregex "NEG8m")>;
-def: InstRW<[BWWriteResGroup70], (instregex "NOT(16|32|64)m")>;
-def: InstRW<[BWWriteResGroup70], (instregex "NOT8m")>;
-def: InstRW<[BWWriteResGroup70], (instregex "OR(16|32|64)mi")>;
-def: InstRW<[BWWriteResGroup70], (instregex "OR(16|32|64)mr")>;
-def: InstRW<[BWWriteResGroup70], (instregex "OR8mi")>;
-def: InstRW<[BWWriteResGroup70], (instregex "OR8mr")>;
-def: InstRW<[BWWriteResGroup70], (instregex "POP(16|32|64)rmm")>;
-def: InstRW<[BWWriteResGroup70], (instregex "PUSH(16|32|64)rmm")>;
-def: InstRW<[BWWriteResGroup70], (instregex "SUB(16|32|64)mi")>;
-def: InstRW<[BWWriteResGroup70], (instregex "SUB(16|32|64)mr")>;
-def: InstRW<[BWWriteResGroup70], (instregex "SUB8mi")>;
-def: InstRW<[BWWriteResGroup70], (instregex "SUB8mr")>;
-def: InstRW<[BWWriteResGroup70], (instregex "XOR(16|32|64)mi")>;
-def: InstRW<[BWWriteResGroup70], (instregex "XOR(16|32|64)mr")>;
-def: InstRW<[BWWriteResGroup70], (instregex "XOR8mi")>;
-def: InstRW<[BWWriteResGroup70], (instregex "XOR8mr")>;
+def: InstRW<[BWWriteResGroup70], (instregex "POP(16|32|64)rmm",
+ "PUSH(16|32|64)rmm")>;
def BWWriteResGroup71 : SchedWriteRes<[BWPort6,BWPort0156]> {
let Latency = 6;
let NumMicroOps = 6;
let ResourceCycles = [1,5];
}
-def: InstRW<[BWWriteResGroup71], (instregex "STD")>;
-
-def BWWriteResGroup72 : SchedWriteRes<[BWPort5]> {
- let Latency = 7;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[BWWriteResGroup72], (instregex "AESDECLASTrr")>;
-def: InstRW<[BWWriteResGroup72], (instregex "AESDECrr")>;
-def: InstRW<[BWWriteResGroup72], (instregex "AESENCLASTrr")>;
-def: InstRW<[BWWriteResGroup72], (instregex "AESENCrr")>;
-def: InstRW<[BWWriteResGroup72], (instregex "VAESDECLASTrr")>;
-def: InstRW<[BWWriteResGroup72], (instregex "VAESDECrr")>;
-def: InstRW<[BWWriteResGroup72], (instregex "VAESENCLASTrr")>;
-def: InstRW<[BWWriteResGroup72], (instregex "VAESENCrr")>;
+def: InstRW<[BWWriteResGroup71], (instrs STD)>;
def BWWriteResGroup73 : SchedWriteRes<[BWPort0,BWPort23]> {
let Latency = 7;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup73], (instregex "VPSLLDYrm")>;
-def: InstRW<[BWWriteResGroup73], (instregex "VPSLLQYrm")>;
-def: InstRW<[BWWriteResGroup73], (instregex "VPSLLVQYrm")>;
-def: InstRW<[BWWriteResGroup73], (instregex "VPSLLWYrm")>;
-def: InstRW<[BWWriteResGroup73], (instregex "VPSRADYrm")>;
-def: InstRW<[BWWriteResGroup73], (instregex "VPSRAWYrm")>;
-def: InstRW<[BWWriteResGroup73], (instregex "VPSRLDYrm")>;
-def: InstRW<[BWWriteResGroup73], (instregex "VPSRLQYrm")>;
-def: InstRW<[BWWriteResGroup73], (instregex "VPSRLVQYrm")>;
-def: InstRW<[BWWriteResGroup73], (instregex "VPSRLWYrm")>;
-def: InstRW<[BWWriteResGroup73], (instregex "VTESTPDYrm")>;
-def: InstRW<[BWWriteResGroup73], (instregex "VTESTPSYrm")>;
+def: InstRW<[BWWriteResGroup73], (instregex "VPSLLVQYrm",
+ "VPSRLVQYrm")>;
def BWWriteResGroup74 : SchedWriteRes<[BWPort1,BWPort23]> {
let Latency = 7;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup74], (instregex "FCOM32m")>;
-def: InstRW<[BWWriteResGroup74], (instregex "FCOM64m")>;
-def: InstRW<[BWWriteResGroup74], (instregex "FCOMP32m")>;
-def: InstRW<[BWWriteResGroup74], (instregex "FCOMP64m")>;
-
-def BWWriteResGroup75 : SchedWriteRes<[BWPort5,BWPort23]> {
- let Latency = 7;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[BWWriteResGroup75], (instregex "VANDNPDYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VANDNPSYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VANDPDYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VANDPSYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VORPDYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VORPSYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPACKSSDWYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPACKSSWBYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPACKUSDWYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPACKUSWBYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPALIGNRYrmi")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPBLENDWYrmi")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPERMILPDYmi")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPERMILPDYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPERMILPSYmi")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPERMILPSYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPSHUFBYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPSHUFDYmi")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPSHUFHWYmi")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPSHUFLWYmi")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPUNPCKHBWYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPUNPCKHDQYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPUNPCKHQDQYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPUNPCKHWDYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPUNPCKLBWYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPUNPCKLDQYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPUNPCKLQDQYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPUNPCKLWDYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VSHUFPDYrmi")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VSHUFPSYrmi")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VUNPCKHPDYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VUNPCKHPSYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VUNPCKLPDYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VUNPCKLPSYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VXORPDYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VXORPSYrm")>;
-
-def BWWriteResGroup76 : SchedWriteRes<[BWPort23,BWPort15]> {
- let Latency = 7;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[BWWriteResGroup76], (instregex "VPABSBYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPABSDYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPABSWYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPADDBYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPADDDYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPADDQYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPADDSBYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPADDSWYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPADDUSBYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPADDUSWYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPADDWYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPAVGBYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPAVGWYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPCMPEQBYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPCMPEQDYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPCMPEQQYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPCMPEQWYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPCMPGTBYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPCMPGTDYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPCMPGTWYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPMAXSBYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPMAXSDYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPMAXSWYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPMAXUBYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPMAXUDYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPMAXUWYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPMINSBYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPMINSDYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPMINSWYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPMINUBYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPMINUDYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPMINUWYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPSIGNBYrm256")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPSIGNDYrm256")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPSIGNWYrm256")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPSUBBYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPSUBDYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPSUBQYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPSUBSBYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPSUBSWYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPSUBUSBYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPSUBUSWYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPSUBWYrm")>;
+def: InstRW<[BWWriteResGroup74], (instregex "FCOM(P?)(32|64)m")>;
def BWWriteResGroup77 : SchedWriteRes<[BWPort23,BWPort015]> {
let Latency = 7;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup77], (instregex "VBLENDPDYrmi")>;
-def: InstRW<[BWWriteResGroup77], (instregex "VBLENDPSYrmi")>;
-def: InstRW<[BWWriteResGroup77], (instregex "VPANDNYrm")>;
-def: InstRW<[BWWriteResGroup77], (instregex "VPANDYrm")>;
def: InstRW<[BWWriteResGroup77], (instregex "VPBLENDDYrmi")>;
-def: InstRW<[BWWriteResGroup77], (instregex "VPORYrm")>;
-def: InstRW<[BWWriteResGroup77], (instregex "VPXORYrm")>;
-
-def BWWriteResGroup78 : SchedWriteRes<[BWPort0,BWPort5]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[BWWriteResGroup78], (instregex "MPSADBWrri")>;
-def: InstRW<[BWWriteResGroup78], (instregex "VMPSADBWYrri")>;
-def: InstRW<[BWWriteResGroup78], (instregex "VMPSADBWrri")>;
def BWWriteResGroup79 : SchedWriteRes<[BWPort5,BWPort23]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[BWWriteResGroup79], (instregex "BLENDVPDrm0")>;
-def: InstRW<[BWWriteResGroup79], (instregex "BLENDVPSrm0")>;
-def: InstRW<[BWWriteResGroup79], (instregex "MMX_PACKSSDWirm")>;
-def: InstRW<[BWWriteResGroup79], (instregex "MMX_PACKSSWBirm")>;
-def: InstRW<[BWWriteResGroup79], (instregex "MMX_PACKUSWBirm")>;
-def: InstRW<[BWWriteResGroup79], (instregex "PBLENDVBrm0")>;
-def: InstRW<[BWWriteResGroup79], (instregex "VBLENDVPDrm")>;
-def: InstRW<[BWWriteResGroup79], (instregex "VBLENDVPSrm")>;
-def: InstRW<[BWWriteResGroup79], (instregex "VMASKMOVPDrm")>;
-def: InstRW<[BWWriteResGroup79], (instregex "VMASKMOVPSrm")>;
-def: InstRW<[BWWriteResGroup79], (instregex "VPBLENDVBrm")>;
-def: InstRW<[BWWriteResGroup79], (instregex "VPMASKMOVDrm")>;
-def: InstRW<[BWWriteResGroup79], (instregex "VPMASKMOVQrm")>;
+def: InstRW<[BWWriteResGroup79], (instregex "MMX_PACKSSDWirm",
+ "MMX_PACKSSWBirm",
+ "MMX_PACKUSWBirm")>;
def BWWriteResGroup80 : SchedWriteRes<[BWPort23,BWPort0156]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[BWWriteResGroup80], (instregex "LEAVE64")>;
-def: InstRW<[BWWriteResGroup80], (instregex "SCASB")>;
-def: InstRW<[BWWriteResGroup80], (instregex "SCASL")>;
-def: InstRW<[BWWriteResGroup80], (instregex "SCASQ")>;
-def: InstRW<[BWWriteResGroup80], (instregex "SCASW")>;
-
-def BWWriteResGroup81 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[BWWriteResGroup81], (instregex "PSLLDrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "PSLLQrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "PSLLWrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "PSRADrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "PSRAWrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "PSRLDrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "PSRLQrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "PSRLWrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "PTESTrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "VPSLLDrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "VPSLLQrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "VPSLLWrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "VPSRADrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "VPSRAWrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "VPSRLDrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "VPSRLQrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "VPSRLWrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "VPTESTrm")>;
+def: InstRW<[BWWriteResGroup80], (instrs LEAVE, LEAVE64,
+ SCASB, SCASL, SCASQ, SCASW)>;
def BWWriteResGroup82 : SchedWriteRes<[BWPort0,BWPort01,BWPort23]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[BWWriteResGroup82], (instregex "FLDCW16m")>;
-
-def BWWriteResGroup83 : SchedWriteRes<[BWPort0,BWPort23,BWPort0156]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[BWWriteResGroup83], (instregex "LDMXCSR")>;
-def: InstRW<[BWWriteResGroup83], (instregex "VLDMXCSR")>;
+def: InstRW<[BWWriteResGroup82], (instrs FLDCW16m)>;
def BWWriteResGroup84 : SchedWriteRes<[BWPort6,BWPort23,BWPort0156]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[BWWriteResGroup84], (instregex "LRETQ")>;
-def: InstRW<[BWWriteResGroup84], (instregex "RETQ")>;
-
-def BWWriteResGroup85 : SchedWriteRes<[BWPort23,BWPort06,BWPort15]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[BWWriteResGroup85], (instregex "BEXTR(32|64)rm")>;
-
-def BWWriteResGroup86 : SchedWriteRes<[BWPort23,BWPort06,BWPort0156]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[BWWriteResGroup86], (instregex "CMOVA(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup86], (instregex "CMOVBE(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup84], (instrs LRETQ, RETQ)>;
def BWWriteResGroup87 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort06]> {
let Latency = 7;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,2];
}
-def: InstRW<[BWWriteResGroup87], (instregex "ROL(16|32|64)m1")>;
-def: InstRW<[BWWriteResGroup87], (instregex "ROL(16|32|64)mi")>;
-def: InstRW<[BWWriteResGroup87], (instregex "ROL8m1")>;
-def: InstRW<[BWWriteResGroup87], (instregex "ROL8mi")>;
-def: InstRW<[BWWriteResGroup87], (instregex "ROR(16|32|64)m1")>;
-def: InstRW<[BWWriteResGroup87], (instregex "ROR(16|32|64)mi")>;
-def: InstRW<[BWWriteResGroup87], (instregex "ROR8m1")>;
-def: InstRW<[BWWriteResGroup87], (instregex "ROR8mi")>;
+def: InstRW<[BWWriteResGroup87], (instregex "ROL(8|16|32|64)m1",
+ "ROL(8|16|32|64)mi",
+ "ROR(8|16|32|64)m1",
+ "ROR(8|16|32|64)mi")>;
def BWWriteResGroup88 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort0156]> {
let Latency = 7;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,2];
}
-def: InstRW<[BWWriteResGroup88], (instregex "XADD(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup88], (instregex "XADD8rm")>;
+def: InstRW<[BWWriteResGroup88], (instregex "XADD(8|16|32|64)rm")>;
def BWWriteResGroup89 : SchedWriteRes<[BWPort4,BWPort6,BWPort23,BWPort237,BWPort0156]> {
let Latency = 7;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,1,1];
}
-def: InstRW<[BWWriteResGroup89], (instregex "CALL(16|32|64)m")>;
-def: InstRW<[BWWriteResGroup89], (instregex "FARCALL64")>;
+def: InstRW<[BWWriteResGroup89], (instregex "CALL(16|32|64)m",
+ "FARCALL64")>;
def BWWriteResGroup90 : SchedWriteRes<[BWPort6,BWPort06,BWPort15,BWPort0156]> {
let Latency = 7;
let NumMicroOps = 7;
let ResourceCycles = [2,2,1,2];
}
-def: InstRW<[BWWriteResGroup90], (instregex "LOOP")>;
+def: InstRW<[BWWriteResGroup90], (instrs LOOP)>;
def BWWriteResGroup91 : SchedWriteRes<[BWPort1,BWPort23]> {
let Latency = 8;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup91], (instregex "ADDPDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "ADDPSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "ADDSDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "ADDSSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "ADDSUBPDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "ADDSUBPSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "BSF(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "BSR(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "CMPPDrmi")>;
-def: InstRW<[BWWriteResGroup91], (instregex "CMPPSrmi")>;
-def: InstRW<[BWWriteResGroup91], (instregex "CMPSDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "CMPSSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "COMISDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "COMISSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "CVTDQ2PSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "CVTPS2DQrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "CVTTPS2DQrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "IMUL64m")>;
-def: InstRW<[BWWriteResGroup91], (instregex "IMUL(32|64)rm(i8)?")>;
-def: InstRW<[BWWriteResGroup91], (instregex "IMUL8m")>;
-def: InstRW<[BWWriteResGroup91], (instregex "LZCNT(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "MAX(C?)PDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "MAX(C?)PSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "MAX(C?)SDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "MAX(C?)SSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "MIN(C?)PDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "MIN(C?)PSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "MIN(C?)SDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "MIN(C?)SSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "MMX_CVTPI2PSirm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "MMX_CVTPS2PIirm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "MMX_CVTTPS2PIirm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "MUL64m")>;
-def: InstRW<[BWWriteResGroup91], (instregex "MUL8m")>;
-def: InstRW<[BWWriteResGroup91], (instregex "PDEP(32|64)rm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "PEXT(32|64)rm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "POPCNT(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "SUBPDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "SUBPSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "SUBSDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "SUBSSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "TZCNT(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "UCOMISDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "UCOMISSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VADDPDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VADDPSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VADDSDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VADDSSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VADDSUBPDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VADDSUBPSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VCMPPDrmi")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VCMPPSrmi")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VCMPSDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VCMPSSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VCOMISDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VCOMISSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VCVTDQ2PSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VCVTPS2DQrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VCVTTPS2DQrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VMAX(C?)PDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VMAX(C?)PSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VMAX(C?)SDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VMAX(C?)SSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VMIN(C?)PDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VMIN(C?)PSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VMIN(C?)SDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VMIN(C?)SSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VSUBPDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VSUBPSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VSUBSDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VSUBSSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VUCOMISDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VUCOMISSrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "MMX_CVTPI2PSirm",
+ "PDEP(32|64)rm",
+ "PEXT(32|64)rm",
+ "(V?)CVTDQ2PSrm")>;
def BWWriteResGroup91_16 : SchedWriteRes<[BWPort1, BWPort0156, BWPort23]> {
let Latency = 8;
let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
+ let ResourceCycles = [1,1,1];
}
-def: InstRW<[BWWriteResGroup91_16], (instregex "IMUL16rm(i8)?")>;
+def: InstRW<[BWWriteResGroup91_16], (instrs IMUL16rmi, IMUL16rmi8)>;
-def BWWriteResGroup91_16_2 : SchedWriteRes<[BWPort1, BWPort0156, BWPort23]> {
- let Latency = 8;
+def BWWriteResGroup91_16_2 : SchedWriteRes<[BWPort1, BWPort06, BWPort0156, BWPort23]> {
+ let Latency = 9;
let NumMicroOps = 5;
+ let ResourceCycles = [1,1,2,1];
}
-def: InstRW<[BWWriteResGroup91_16_2], (instregex "IMUL16m")>;
-def: InstRW<[BWWriteResGroup91_16_2], (instregex "MUL16m")>;
-
-def BWWriteResGroup91_32 : SchedWriteRes<[BWPort1, BWPort0156, BWPort23]> {
- let Latency = 8;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[BWWriteResGroup91_32], (instregex "IMUL32m")>;
-def: InstRW<[BWWriteResGroup91_32], (instregex "MUL32m")>;
+def: InstRW<[BWWriteResGroup91_16_2], (instrs IMUL16m, MUL16m)>;
def BWWriteResGroup92 : SchedWriteRes<[BWPort5,BWPort23]> {
let Latency = 8;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup92], (instregex "VPMOVSXBDYrm")>;
-def: InstRW<[BWWriteResGroup92], (instregex "VPMOVSXBQYrm")>;
-def: InstRW<[BWWriteResGroup92], (instregex "VPMOVSXBWYrm")>;
-def: InstRW<[BWWriteResGroup92], (instregex "VPMOVSXDQYrm")>;
-def: InstRW<[BWWriteResGroup92], (instregex "VPMOVSXWDYrm")>;
-def: InstRW<[BWWriteResGroup92], (instregex "VPMOVSXWQYrm")>;
-def: InstRW<[BWWriteResGroup92], (instregex "VPMOVZXWDYrm")>;
-
-def BWWriteResGroup93 : SchedWriteRes<[BWPort01,BWPort23]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[BWWriteResGroup93], (instregex "MULPDrm")>;
-def: InstRW<[BWWriteResGroup93], (instregex "MULPSrm")>;
-def: InstRW<[BWWriteResGroup93], (instregex "MULSDrm")>;
-def: InstRW<[BWWriteResGroup93], (instregex "MULSSrm")>;
-def: InstRW<[BWWriteResGroup93], (instregex "VMULPDrm")>;
-def: InstRW<[BWWriteResGroup93], (instregex "VMULPSrm")>;
-def: InstRW<[BWWriteResGroup93], (instregex "VMULSDrm")>;
-def: InstRW<[BWWriteResGroup93], (instregex "VMULSSrm")>;
-
-def BWWriteResGroup94 : SchedWriteRes<[BWPort5,BWPort23]> {
- let Latency = 8;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[BWWriteResGroup94], (instregex "VBLENDVPDYrm")>;
-def: InstRW<[BWWriteResGroup94], (instregex "VBLENDVPSYrm")>;
-def: InstRW<[BWWriteResGroup94], (instregex "VMASKMOVPDYrm")>;
-def: InstRW<[BWWriteResGroup94], (instregex "VMASKMOVPSYrm")>;
-def: InstRW<[BWWriteResGroup94], (instregex "VPBLENDVBYrm")>;
-def: InstRW<[BWWriteResGroup94], (instregex "VPMASKMOVDYrm")>;
-def: InstRW<[BWWriteResGroup94], (instregex "VPMASKMOVQYrm")>;
-
-def BWWriteResGroup95 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> {
- let Latency = 8;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[BWWriteResGroup95], (instregex "VPSLLVDrm")>;
-def: InstRW<[BWWriteResGroup95], (instregex "VPSRAVDrm")>;
-def: InstRW<[BWWriteResGroup95], (instregex "VPSRLVDrm")>;
-
-def BWWriteResGroup96 : SchedWriteRes<[BWPort5,BWPort23,BWPort15]> {
- let Latency = 8;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[BWWriteResGroup96], (instregex "MMX_PHADDSWrm64")>;
-def: InstRW<[BWWriteResGroup96], (instregex "MMX_PHADDWrm64")>;
-def: InstRW<[BWWriteResGroup96], (instregex "MMX_PHADDrm64")>;
-def: InstRW<[BWWriteResGroup96], (instregex "MMX_PHSUBDrm64")>;
-def: InstRW<[BWWriteResGroup96], (instregex "MMX_PHSUBSWrm64")>;
-def: InstRW<[BWWriteResGroup96], (instregex "MMX_PHSUBWrm64")>;
-def: InstRW<[BWWriteResGroup96], (instregex "PHADDDrm")>;
-def: InstRW<[BWWriteResGroup96], (instregex "PHADDSWrm128")>;
-def: InstRW<[BWWriteResGroup96], (instregex "PHADDWrm")>;
-def: InstRW<[BWWriteResGroup96], (instregex "PHSUBDrm")>;
-def: InstRW<[BWWriteResGroup96], (instregex "PHSUBSWrm128")>;
-def: InstRW<[BWWriteResGroup96], (instregex "PHSUBWrm")>;
-def: InstRW<[BWWriteResGroup96], (instregex "VPHADDDrm")>;
-def: InstRW<[BWWriteResGroup96], (instregex "VPHADDSWrm128")>;
-def: InstRW<[BWWriteResGroup96], (instregex "VPHADDWrm")>;
-def: InstRW<[BWWriteResGroup96], (instregex "VPHSUBDrm")>;
-def: InstRW<[BWWriteResGroup96], (instregex "VPHSUBSWrm128")>;
-def: InstRW<[BWWriteResGroup96], (instregex "VPHSUBWrm")>;
+def: InstRW<[BWWriteResGroup92], (instregex "VPMOVSXBDYrm",
+ "VPMOVSXBQYrm",
+ "VPMOVSXBWYrm",
+ "VPMOVSXDQYrm",
+ "VPMOVSXWDYrm",
+ "VPMOVSXWQYrm",
+ "VPMOVZXWDYrm")>;
def BWWriteResGroup97 : SchedWriteRes<[BWPort23,BWPort237,BWPort06,BWPort0156]> {
let Latency = 8;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,2];
}
-def: InstRW<[BWWriteResGroup97], (instregex "RCL(16|32|64)m1")>;
-def: InstRW<[BWWriteResGroup97], (instregex "RCL(16|32|64)mi")>;
-def: InstRW<[BWWriteResGroup97], (instregex "RCL8m1")>;
-def: InstRW<[BWWriteResGroup97], (instregex "RCL8mi")>;
-def: InstRW<[BWWriteResGroup97], (instregex "RCR(16|32|64)m1")>;
-def: InstRW<[BWWriteResGroup97], (instregex "RCR(16|32|64)mi")>;
-def: InstRW<[BWWriteResGroup97], (instregex "RCR8m1")>;
-def: InstRW<[BWWriteResGroup97], (instregex "RCR8mi")>;
+def: InstRW<[BWWriteResGroup97], (instregex "RCL(8|16|32|64)m1",
+ "RCL(8|16|32|64)mi",
+ "RCR(8|16|32|64)m1",
+ "RCR(8|16|32|64)mi")>;
def BWWriteResGroup98 : SchedWriteRes<[BWPort23,BWPort237,BWPort06,BWPort0156]> {
let Latency = 8;
let NumMicroOps = 5;
let ResourceCycles = [1,1,2,1];
}
-def: InstRW<[BWWriteResGroup98], (instregex "ROR(16|32|64)mCL")>;
-def: InstRW<[BWWriteResGroup98], (instregex "ROR8mCL")>;
+def: InstRW<[BWWriteResGroup98], (instregex "ROR(8|16|32|64)mCL")>;
def BWWriteResGroup99 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort0156]> {
let Latency = 8;
let NumMicroOps = 6;
let ResourceCycles = [1,1,1,3];
}
-def: InstRW<[BWWriteResGroup99], (instregex "ADC(16|32|64)mi")>;
-def: InstRW<[BWWriteResGroup99], (instregex "ADC8mi")>;
-def: InstRW<[BWWriteResGroup99], (instregex "ADD8mi")>;
-def: InstRW<[BWWriteResGroup99], (instregex "AND8mi")>;
-def: InstRW<[BWWriteResGroup99], (instregex "OR8mi")>;
-def: InstRW<[BWWriteResGroup99], (instregex "SUB8mi")>;
-def: InstRW<[BWWriteResGroup99], (instregex "XCHG(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup99], (instregex "XCHG8rm")>;
-def: InstRW<[BWWriteResGroup99], (instregex "XOR8mi")>;
+def: InstRW<[BWWriteResGroup99], (instregex "XCHG(8|16|32|64)rm")>;
def BWWriteResGroup100 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort06,BWPort0156]> {
let Latency = 8;
let NumMicroOps = 6;
let ResourceCycles = [1,1,1,2,1];
}
-def: InstRW<[BWWriteResGroup100], (instregex "ADC(16|32|64)mr")>;
-def: InstRW<[BWWriteResGroup100], (instregex "ADC8mr")>;
-def: InstRW<[BWWriteResGroup100], (instregex "CMPXCHG(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup100], (instregex "CMPXCHG8rm")>;
-def: InstRW<[BWWriteResGroup100], (instregex "ROL(16|32|64)mCL")>;
-def: InstRW<[BWWriteResGroup100], (instregex "ROL8mCL")>;
-def: InstRW<[BWWriteResGroup100], (instregex "SAR(16|32|64)mCL")>;
-def: InstRW<[BWWriteResGroup100], (instregex "SAR8mCL")>;
-def: InstRW<[BWWriteResGroup100], (instregex "SBB(16|32|64)mi")>;
-def: InstRW<[BWWriteResGroup100], (instregex "SBB(16|32|64)mr")>;
-def: InstRW<[BWWriteResGroup100], (instregex "SBB8mi")>;
-def: InstRW<[BWWriteResGroup100], (instregex "SBB8mr")>;
-def: InstRW<[BWWriteResGroup100], (instregex "SHL(16|32|64)mCL")>;
-def: InstRW<[BWWriteResGroup100], (instregex "SHL8mCL")>;
-def: InstRW<[BWWriteResGroup100], (instregex "SHR(16|32|64)mCL")>;
-def: InstRW<[BWWriteResGroup100], (instregex "SHR8mCL")>;
+def : SchedAlias<WriteADCRMW, BWWriteResGroup100>;
+def: InstRW<[BWWriteResGroup100], (instregex "CMPXCHG(8|16|32|64)rm",
+ "ROL(8|16|32|64)mCL",
+ "SAR(8|16|32|64)mCL",
+ "SHL(8|16|32|64)mCL",
+ "SHR(8|16|32|64)mCL")>;
def BWWriteResGroup101 : SchedWriteRes<[BWPort1,BWPort23]> {
let Latency = 9;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup101], (instregex "ADD_F32m")>;
-def: InstRW<[BWWriteResGroup101], (instregex "ADD_F64m")>;
-def: InstRW<[BWWriteResGroup101], (instregex "ILD_F16m")>;
-def: InstRW<[BWWriteResGroup101], (instregex "ILD_F32m")>;
-def: InstRW<[BWWriteResGroup101], (instregex "ILD_F64m")>;
-def: InstRW<[BWWriteResGroup101], (instregex "SUBR_F32m")>;
-def: InstRW<[BWWriteResGroup101], (instregex "SUBR_F64m")>;
-def: InstRW<[BWWriteResGroup101], (instregex "SUB_F32m")>;
-def: InstRW<[BWWriteResGroup101], (instregex "SUB_F64m")>;
-def: InstRW<[BWWriteResGroup101], (instregex "VADDPDYrm")>;
-def: InstRW<[BWWriteResGroup101], (instregex "VADDPSYrm")>;
-def: InstRW<[BWWriteResGroup101], (instregex "VADDSUBPDYrm")>;
-def: InstRW<[BWWriteResGroup101], (instregex "VADDSUBPSYrm")>;
-def: InstRW<[BWWriteResGroup101], (instregex "VCMPPDYrmi")>;
-def: InstRW<[BWWriteResGroup101], (instregex "VCMPPSYrmi")>;
-def: InstRW<[BWWriteResGroup101], (instregex "VCVTDQ2PSYrm")>;
-def: InstRW<[BWWriteResGroup101], (instregex "VCVTPS2DQYrm")>;
-def: InstRW<[BWWriteResGroup101], (instregex "VCVTTPS2DQYrm")>;
-def: InstRW<[BWWriteResGroup101], (instregex "VMAX(C?)PDYrm")>;
-def: InstRW<[BWWriteResGroup101], (instregex "VMAX(C?)PSYrm")>;
-def: InstRW<[BWWriteResGroup101], (instregex "VMIN(C?)PDYrm")>;
-def: InstRW<[BWWriteResGroup101], (instregex "VMIN(C?)PSYrm")>;
-def: InstRW<[BWWriteResGroup101], (instregex "VSUBPDYrm")>;
-def: InstRW<[BWWriteResGroup101], (instregex "VSUBPSYrm")>;
-
-def BWWriteResGroup102 : SchedWriteRes<[BWPort5,BWPort23]> {
- let Latency = 9;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[BWWriteResGroup102], (instregex "VPERM2F128rm")>;
-def: InstRW<[BWWriteResGroup102], (instregex "VPERM2I128rm")>;
-def: InstRW<[BWWriteResGroup102], (instregex "VPERMDYrm")>;
-def: InstRW<[BWWriteResGroup102], (instregex "VPERMPDYmi")>;
-def: InstRW<[BWWriteResGroup102], (instregex "VPERMPSYrm")>;
-def: InstRW<[BWWriteResGroup102], (instregex "VPERMQYmi")>;
-def: InstRW<[BWWriteResGroup102], (instregex "VPMOVZXBDYrm")>;
-def: InstRW<[BWWriteResGroup102], (instregex "VPMOVZXBQYrm")>;
-def: InstRW<[BWWriteResGroup102], (instregex "VPMOVZXBWYrm")>;
-def: InstRW<[BWWriteResGroup102], (instregex "VPMOVZXDQYrm")>;
-def: InstRW<[BWWriteResGroup102], (instregex "VPMOVZXWQYrm")>;
-
-def BWWriteResGroup103 : SchedWriteRes<[BWPort01,BWPort23]> {
- let Latency = 9;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[BWWriteResGroup103], (instregex "VMULPDYrm")>;
-def: InstRW<[BWWriteResGroup103], (instregex "VMULPSYrm")>;
-
-def BWWriteResGroup104 : SchedWriteRes<[BWPort0,BWPort1,BWPort5]> {
- let Latency = 9;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[BWWriteResGroup104], (instregex "DPPDrri")>;
-def: InstRW<[BWWriteResGroup104], (instregex "VDPPDrri")>;
+def: InstRW<[BWWriteResGroup101], (instregex "(ADD|SUB|SUBR)_F(32|64)m",
+ "ILD_F(16|32|64)m",
+ "VCVTPS2DQYrm",
+ "VCVTTPS2DQYrm")>;
def BWWriteResGroup105 : SchedWriteRes<[BWPort0,BWPort1,BWPort23]> {
let Latency = 9;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[BWWriteResGroup105], (instregex "CVTSD2SI64rm")>;
-def: InstRW<[BWWriteResGroup105], (instregex "CVTSD2SIrm")>;
-def: InstRW<[BWWriteResGroup105], (instregex "CVTSS2SI64rm")>;
-def: InstRW<[BWWriteResGroup105], (instregex "CVTSS2SIrm")>;
-def: InstRW<[BWWriteResGroup105], (instregex "CVTTSD2SI64rm")>;
-def: InstRW<[BWWriteResGroup105], (instregex "CVTTSD2SIrm")>;
-def: InstRW<[BWWriteResGroup105], (instregex "CVTTSS2SIrm")>;
-def: InstRW<[BWWriteResGroup105], (instregex "VCVTSD2SI64rm")>;
-def: InstRW<[BWWriteResGroup105], (instregex "VCVTSD2SIrm")>;
-def: InstRW<[BWWriteResGroup105], (instregex "VCVTSS2SI64rm")>;
-def: InstRW<[BWWriteResGroup105], (instregex "VCVTSS2SIrm")>;
-def: InstRW<[BWWriteResGroup105], (instregex "VCVTTSD2SI64rm")>;
-def: InstRW<[BWWriteResGroup105], (instregex "VCVTTSD2SIrm")>;
-def: InstRW<[BWWriteResGroup105], (instregex "VCVTTSS2SI64rm")>;
-def: InstRW<[BWWriteResGroup105], (instregex "VCVTTSS2SIrm")>;
+def: InstRW<[BWWriteResGroup105], (instregex "(V?)CVTSS2SI(64)?rm",
+ "(V?)CVT(T?)SD2SI64rm",
+ "(V?)CVT(T?)SD2SIrm",
+ "VCVTTSS2SI64rm",
+ "(V?)CVTTSS2SIrm")>;
def BWWriteResGroup106 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> {
let Latency = 9;
@@ -2971,56 +1291,29 @@ def BWWriteResGroup107 : SchedWriteRes<[BWPort1,BWPort5,BWPort23]> {
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[BWWriteResGroup107], (instregex "CVTDQ2PDrm")>;
-def: InstRW<[BWWriteResGroup107], (instregex "CVTPD2DQrm")>;
-def: InstRW<[BWWriteResGroup107], (instregex "CVTPD2PSrm")>;
-def: InstRW<[BWWriteResGroup107], (instregex "CVTSD2SSrm")>;
-def: InstRW<[BWWriteResGroup107], (instregex "CVTTPD2DQrm")>;
-def: InstRW<[BWWriteResGroup107], (instregex "MMX_CVTPD2PIirm")>;
-def: InstRW<[BWWriteResGroup107], (instregex "MMX_CVTPI2PDirm")>;
-def: InstRW<[BWWriteResGroup107], (instregex "MMX_CVTTPD2PIirm")>;
-def: InstRW<[BWWriteResGroup107], (instregex "MULX64rm")>;
-def: InstRW<[BWWriteResGroup107], (instregex "VCVTDQ2PDrm")>;
-def: InstRW<[BWWriteResGroup107], (instregex "VCVTSD2SSrm")>;
+def: InstRW<[BWWriteResGroup107], (instrs IMUL64m, MUL64m, MULX64rm)>;
+def: InstRW<[BWWriteResGroup107], (instregex "CVTPD2PSrm",
+ "CVT(T?)PD2DQrm",
+ "MMX_CVTPI2PDirm",
+ "MMX_CVT(T?)PD2PIirm",
+ "(V?)CVTDQ2PDrm",
+ "(V?)CVTSD2SSrm")>;
def BWWriteResGroup108 : SchedWriteRes<[BWPort5,BWPort23,BWPort015]> {
let Latency = 9;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[BWWriteResGroup108], (instregex "VPBROADCASTBYrm")>;
-def: InstRW<[BWWriteResGroup108], (instregex "VPBROADCASTBrm")>;
-def: InstRW<[BWWriteResGroup108], (instregex "VPBROADCASTWYrm")>;
-def: InstRW<[BWWriteResGroup108], (instregex "VPBROADCASTWrm")>;
-
-def BWWriteResGroup109 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> {
- let Latency = 9;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[BWWriteResGroup109], (instregex "VPSLLVDYrm")>;
-def: InstRW<[BWWriteResGroup109], (instregex "VPSRAVDYrm")>;
-def: InstRW<[BWWriteResGroup109], (instregex "VPSRLVDYrm")>;
-
-def BWWriteResGroup110 : SchedWriteRes<[BWPort5,BWPort23,BWPort15]> {
- let Latency = 9;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[BWWriteResGroup110], (instregex "VPHADDDYrm")>;
-def: InstRW<[BWWriteResGroup110], (instregex "VPHADDSWrm256")>;
-def: InstRW<[BWWriteResGroup110], (instregex "VPHADDWYrm")>;
-def: InstRW<[BWWriteResGroup110], (instregex "VPHSUBDYrm")>;
-def: InstRW<[BWWriteResGroup110], (instregex "VPHSUBSWrm256")>;
-def: InstRW<[BWWriteResGroup110], (instregex "VPHSUBWYrm")>;
+def: InstRW<[BWWriteResGroup108], (instregex "VPBROADCASTB(Y?)rm",
+ "VPBROADCASTW(Y?)rm")>;
def BWWriteResGroup111 : SchedWriteRes<[BWPort1,BWPort23,BWPort237,BWPort0156]> {
let Latency = 9;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[BWWriteResGroup111], (instregex "SHLD(16|32|64)mri8")>;
-def: InstRW<[BWWriteResGroup111], (instregex "SHRD(16|32|64)mri8")>;
+def: InstRW<[BWWriteResGroup111], (instregex "SHLD(16|32|64)mri8",
+ "SHRD(16|32|64)mri8")>;
def BWWriteResGroup112 : SchedWriteRes<[BWPort23,BWPort06,BWPort0156]> {
let Latency = 9;
@@ -3034,103 +1327,22 @@ def BWWriteResGroup113 : SchedWriteRes<[BWPort1,BWPort6,BWPort23,BWPort0156]> {
let NumMicroOps = 5;
let ResourceCycles = [1,2,1,1];
}
-def: InstRW<[BWWriteResGroup113], (instregex "LAR(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup113], (instregex "LSL(16|32|64)rm")>;
-
-def BWWriteResGroup114 : SchedWriteRes<[BWPort0]> {
- let Latency = 10;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[BWWriteResGroup114], (instregex "PMULLDrr")>;
-def: InstRW<[BWWriteResGroup114], (instregex "VPMULLDYrr")>;
-def: InstRW<[BWWriteResGroup114], (instregex "VPMULLDrr")>;
+def: InstRW<[BWWriteResGroup113], (instregex "LAR(16|32|64)rm",
+ "LSL(16|32|64)rm")>;
def BWWriteResGroup115 : SchedWriteRes<[BWPort0,BWPort23]> {
let Latency = 10;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup115], (instregex "MMX_PMADDUBSWrm64")>;
-def: InstRW<[BWWriteResGroup115], (instregex "MMX_PMADDWDirm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "MMX_PMULHRSWrm64")>;
-def: InstRW<[BWWriteResGroup115], (instregex "MMX_PMULHUWirm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "MMX_PMULHWirm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "MMX_PMULLWirm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "MMX_PMULUDQirm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "MMX_PSADBWirm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "PCLMULQDQrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "PCMPGTQrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "PHMINPOSUWrm128")>;
-def: InstRW<[BWWriteResGroup115], (instregex "PMADDUBSWrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "PMADDWDrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "PMULDQrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "PMULHRSWrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "PMULHUWrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "PMULHWrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "PMULLWrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "PMULUDQrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "PSADBWrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "RCPPSm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "RCPSSm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "RSQRTPSm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "RSQRTSSm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VPCLMULQDQrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VPCMPGTQrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VPHMINPOSUWrm128")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VPMADDUBSWrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VPMADDWDrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VPMULDQrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VPMULHRSWrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VPMULHUWrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VPMULHWrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VPMULLWrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VPMULUDQrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VPSADBWrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VRCPPSm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VRCPSSm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VRSQRTPSm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VRSQRTSSm")>;
-
-def BWWriteResGroup116 : SchedWriteRes<[BWPort01,BWPort23]> {
- let Latency = 10;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[BWWriteResGroup116],
- (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)m",
- "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)m")>;
+def: InstRW<[BWWriteResGroup115], (instregex "(V?)PCMPGTQrm")>;
def BWWriteResGroup117 : SchedWriteRes<[BWPort1,BWPort23]> {
let Latency = 10;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[BWWriteResGroup117], (instregex "FICOM16m")>;
-def: InstRW<[BWWriteResGroup117], (instregex "FICOM32m")>;
-def: InstRW<[BWWriteResGroup117], (instregex "FICOMP16m")>;
-def: InstRW<[BWWriteResGroup117], (instregex "FICOMP32m")>;
-
-def BWWriteResGroup118 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> {
- let Latency = 10;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[BWWriteResGroup118], (instregex "VPTESTYrm")>;
-
-def BWWriteResGroup119 : SchedWriteRes<[BWPort1,BWPort5,BWPort23]> {
- let Latency = 10;
- let NumMicroOps = 4;
- let ResourceCycles = [1,2,1];
-}
-def: InstRW<[BWWriteResGroup119], (instregex "HADDPDrm")>;
-def: InstRW<[BWWriteResGroup119], (instregex "HADDPSrm")>;
-def: InstRW<[BWWriteResGroup119], (instregex "HSUBPDrm")>;
-def: InstRW<[BWWriteResGroup119], (instregex "HSUBPSrm")>;
-def: InstRW<[BWWriteResGroup119], (instregex "VHADDPDrm")>;
-def: InstRW<[BWWriteResGroup119], (instregex "VHADDPSrm")>;
-def: InstRW<[BWWriteResGroup119], (instregex "VHSUBPDrm")>;
-def: InstRW<[BWWriteResGroup119], (instregex "VHSUBPSrm")>;
+def: InstRW<[BWWriteResGroup117], (instregex "FICOM(P?)(16|32)m")>;
def BWWriteResGroup120 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23]> {
let Latency = 10;
@@ -3140,79 +1352,26 @@ def BWWriteResGroup120 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23]> {
def: InstRW<[BWWriteResGroup120], (instregex "CVTTSS2SI64rm")>;
def BWWriteResGroup121 : SchedWriteRes<[BWPort1,BWPort23,BWPort06,BWPort0156]> {
- let Latency = 10;
+ let Latency = 9;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[BWWriteResGroup121], (instregex "MULX32rm")>;
+def: InstRW<[BWWriteResGroup121], (instrs IMUL32m, MUL32m, MULX32rm)>;
-def BWWriteResGroup122 : SchedWriteRes<[BWPort0]> {
+def BWWriteResGroup122_1 : SchedWriteRes<[BWPort0,BWFPDivider]> {
let Latency = 11;
let NumMicroOps = 1;
- let ResourceCycles = [1];
+ let ResourceCycles = [1,3]; // Really 2.5 cycle throughput
}
-def: InstRW<[BWWriteResGroup122], (instregex "DIVPSrr")>;
-def: InstRW<[BWWriteResGroup122], (instregex "DIVSSrr")>;
-def: InstRW<[BWWriteResGroup122], (instregex "VDIVPSrr")>;
-def: InstRW<[BWWriteResGroup122], (instregex "VDIVSSrr")>;
+def : SchedAlias<WriteFDiv, BWWriteResGroup122_1>; // TODO - convert to ZnWriteResFpuPair
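As a quick check on the [1,3] resource count above, assuming BWFPDivider is a single-unit ProcResource like the HWFPDivider added for Haswell below: the modeled reciprocal throughput of an instruction is limited by its busiest resource, roughly ResourceCycles divided by the unit count of that resource, so 3 cycles on the divider give one FP divide every 3 cycles. That is the closest whole-cycle fit to the ~2.5-cycle throughput noted in the comment; the single cycle on BWPort0 does not constrain it further.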
def BWWriteResGroup123 : SchedWriteRes<[BWPort0,BWPort23]> {
let Latency = 11;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup123], (instregex "MUL_F32m")>;
-def: InstRW<[BWWriteResGroup123], (instregex "MUL_F64m")>;
-def: InstRW<[BWWriteResGroup123], (instregex "VPCMPGTQYrm")>;
-def: InstRW<[BWWriteResGroup123], (instregex "VPMADDUBSWYrm")>;
-def: InstRW<[BWWriteResGroup123], (instregex "VPMADDWDYrm")>;
-def: InstRW<[BWWriteResGroup123], (instregex "VPMULDQYrm")>;
-def: InstRW<[BWWriteResGroup123], (instregex "VPMULHRSWYrm")>;
-def: InstRW<[BWWriteResGroup123], (instregex "VPMULHUWYrm")>;
-def: InstRW<[BWWriteResGroup123], (instregex "VPMULHWYrm")>;
-def: InstRW<[BWWriteResGroup123], (instregex "VPMULLWYrm")>;
-def: InstRW<[BWWriteResGroup123], (instregex "VPMULUDQYrm")>;
-def: InstRW<[BWWriteResGroup123], (instregex "VPSADBWYrm")>;
-
-def BWWriteResGroup124 : SchedWriteRes<[BWPort01,BWPort23]> {
- let Latency = 11;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[BWWriteResGroup124],
- (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Ym")>;
-
-def BWWriteResGroup125 : SchedWriteRes<[BWPort0]> {
- let Latency = 11;
- let NumMicroOps = 3;
- let ResourceCycles = [3];
-}
-def: InstRW<[BWWriteResGroup125], (instregex "PCMPISTRIrr")>;
-def: InstRW<[BWWriteResGroup125], (instregex "PCMPISTRM128rr")>;
-def: InstRW<[BWWriteResGroup125], (instregex "VPCMPISTRIrr")>;
-def: InstRW<[BWWriteResGroup125], (instregex "VPCMPISTRM128rr")>;
-
-def BWWriteResGroup126 : SchedWriteRes<[BWPort0,BWPort015]> {
- let Latency = 11;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[BWWriteResGroup126], (instregex "VRCPPSYr")>;
-def: InstRW<[BWWriteResGroup126], (instregex "VRSQRTPSYr")>;
-
-def BWWriteResGroup127 : SchedWriteRes<[BWPort1,BWPort23]> {
- let Latency = 11;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[BWWriteResGroup127], (instregex "ROUNDPDm")>;
-def: InstRW<[BWWriteResGroup127], (instregex "ROUNDPSm")>;
-def: InstRW<[BWWriteResGroup127], (instregex "ROUNDSDm")>;
-def: InstRW<[BWWriteResGroup127], (instregex "ROUNDSSm")>;
-def: InstRW<[BWWriteResGroup127], (instregex "VROUNDPDm")>;
-def: InstRW<[BWWriteResGroup127], (instregex "VROUNDPSm")>;
-def: InstRW<[BWWriteResGroup127], (instregex "VROUNDSDm")>;
-def: InstRW<[BWWriteResGroup127], (instregex "VROUNDSSm")>;
+def: InstRW<[BWWriteResGroup123], (instregex "MUL_F(32|64)m",
+ "VPCMPGTQYrm")>;
def BWWriteResGroup128 : SchedWriteRes<[BWPort1,BWPort5,BWPort23]> {
let Latency = 11;
@@ -3221,31 +1380,21 @@ def BWWriteResGroup128 : SchedWriteRes<[BWPort1,BWPort5,BWPort23]> {
}
def: InstRW<[BWWriteResGroup128], (instregex "VCVTDQ2PDYrm")>;
-def BWWriteResGroup129 : SchedWriteRes<[BWPort1,BWPort5,BWPort23]> {
- let Latency = 11;
- let NumMicroOps = 4;
- let ResourceCycles = [1,2,1];
-}
-def: InstRW<[BWWriteResGroup129], (instregex "VHADDPDYrm")>;
-def: InstRW<[BWWriteResGroup129], (instregex "VHADDPSYrm")>;
-def: InstRW<[BWWriteResGroup129], (instregex "VHSUBPDYrm")>;
-def: InstRW<[BWWriteResGroup129], (instregex "VHSUBPSYrm")>;
-
def BWWriteResGroup130 : SchedWriteRes<[BWPort1,BWPort23,BWPort237,BWPort06,BWPort0156]> {
let Latency = 11;
let NumMicroOps = 6;
let ResourceCycles = [1,1,1,1,2];
}
-def: InstRW<[BWWriteResGroup130], (instregex "SHLD(16|32|64)mrCL")>;
-def: InstRW<[BWWriteResGroup130], (instregex "SHRD(16|32|64)mrCL")>;
+def: InstRW<[BWWriteResGroup130], (instregex "SHLD(16|32|64)mrCL",
+ "SHRD(16|32|64)mrCL")>;
def BWWriteResGroup131 : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> {
let Latency = 11;
let NumMicroOps = 7;
let ResourceCycles = [2,2,3];
}
-def: InstRW<[BWWriteResGroup131], (instregex "RCL(16|32|64)rCL")>;
-def: InstRW<[BWWriteResGroup131], (instregex "RCR(16|32|64)rCL")>;
+def: InstRW<[BWWriteResGroup131], (instregex "RCL(16|32|64)rCL",
+ "RCR(16|32|64)rCL")>;
def BWWriteResGroup132 : SchedWriteRes<[BWPort1,BWPort06,BWPort15,BWPort0156]> {
let Latency = 11;
@@ -3259,104 +1408,29 @@ def BWWriteResGroup133 : SchedWriteRes<[BWPort06,BWPort0156]> {
let NumMicroOps = 11;
let ResourceCycles = [2,9];
}
-def: InstRW<[BWWriteResGroup133], (instregex "LOOPE")>;
-def: InstRW<[BWWriteResGroup133], (instregex "LOOPNE")>;
-
-def BWWriteResGroup134 : SchedWriteRes<[BWPort5,BWPort23]> {
- let Latency = 12;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[BWWriteResGroup134], (instregex "AESDECLASTrm")>;
-def: InstRW<[BWWriteResGroup134], (instregex "AESDECrm")>;
-def: InstRW<[BWWriteResGroup134], (instregex "AESENCLASTrm")>;
-def: InstRW<[BWWriteResGroup134], (instregex "AESENCrm")>;
-def: InstRW<[BWWriteResGroup134], (instregex "VAESDECLASTrm")>;
-def: InstRW<[BWWriteResGroup134], (instregex "VAESDECrm")>;
-def: InstRW<[BWWriteResGroup134], (instregex "VAESENCLASTrm")>;
-def: InstRW<[BWWriteResGroup134], (instregex "VAESENCrm")>;
+def: InstRW<[BWWriteResGroup133], (instrs LOOPE)>;
+def: InstRW<[BWWriteResGroup133], (instrs LOOPNE)>;
def BWWriteResGroup135 : SchedWriteRes<[BWPort1,BWPort23]> {
let Latency = 12;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[BWWriteResGroup135], (instregex "ADD_FI16m")>;
-def: InstRW<[BWWriteResGroup135], (instregex "ADD_FI32m")>;
-def: InstRW<[BWWriteResGroup135], (instregex "SUBR_FI16m")>;
-def: InstRW<[BWWriteResGroup135], (instregex "SUBR_FI32m")>;
-def: InstRW<[BWWriteResGroup135], (instregex "SUB_FI16m")>;
-def: InstRW<[BWWriteResGroup135], (instregex "SUB_FI32m")>;
-def: InstRW<[BWWriteResGroup135], (instregex "VROUNDYPDm")>;
-def: InstRW<[BWWriteResGroup135], (instregex "VROUNDYPSm")>;
+def: InstRW<[BWWriteResGroup135], (instregex "(ADD|SUB|SUBR)_FI(16|32)m")>;
-def BWWriteResGroup136 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> {
- let Latency = 12;
- let NumMicroOps = 4;
- let ResourceCycles = [1,2,1];
-}
-def: InstRW<[BWWriteResGroup136], (instregex "MPSADBWrmi")>;
-def: InstRW<[BWWriteResGroup136], (instregex "VMPSADBWrmi")>;
-
-def BWWriteResGroup137 : SchedWriteRes<[BWPort0]> {
- let Latency = 13;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[BWWriteResGroup137], (instregex "SQRTPSr")>;
-def: InstRW<[BWWriteResGroup137], (instregex "SQRTSSr")>;
-
-def BWWriteResGroup138 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> {
- let Latency = 13;
- let NumMicroOps = 4;
- let ResourceCycles = [1,2,1];
-}
-def: InstRW<[BWWriteResGroup138], (instregex "VMPSADBWYrmi")>;
-
-def BWWriteResGroup139 : SchedWriteRes<[BWPort0]> {
+def BWWriteResGroup139_1 : SchedWriteRes<[BWPort0,BWFPDivider]> {
let Latency = 14;
let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[BWWriteResGroup139], (instregex "DIVPDrr")>;
-def: InstRW<[BWWriteResGroup139], (instregex "DIVSDrr")>;
-def: InstRW<[BWWriteResGroup139], (instregex "VDIVPDrr")>;
-def: InstRW<[BWWriteResGroup139], (instregex "VDIVSDrr")>;
-def: InstRW<[BWWriteResGroup139], (instregex "VSQRTPSr")>;
-def: InstRW<[BWWriteResGroup139], (instregex "VSQRTSSr")>;
-
-def BWWriteResGroup140 : SchedWriteRes<[BWPort5]> {
- let Latency = 14;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
+ let ResourceCycles = [1,4];
}
-def: InstRW<[BWWriteResGroup140], (instregex "AESIMCrr")>;
-def: InstRW<[BWWriteResGroup140], (instregex "VAESIMCrr")>;
+def : SchedAlias<WriteFDiv64, BWWriteResGroup139_1>; // TODO - convert to ZnWriteResFpuPair
def BWWriteResGroup141 : SchedWriteRes<[BWPort0,BWPort1,BWPort23]> {
let Latency = 14;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[BWWriteResGroup141], (instregex "MUL_FI16m")>;
-def: InstRW<[BWWriteResGroup141], (instregex "MUL_FI32m")>;
-
-def BWWriteResGroup142 : SchedWriteRes<[BWPort0,BWPort1,BWPort5]> {
- let Latency = 14;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[BWWriteResGroup142], (instregex "DPPSrri")>;
-def: InstRW<[BWWriteResGroup142], (instregex "VDPPSYrri")>;
-def: InstRW<[BWWriteResGroup142], (instregex "VDPPSrri")>;
-
-def BWWriteResGroup143 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23]> {
- let Latency = 14;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,1,1];
-}
-def: InstRW<[BWWriteResGroup143], (instregex "DPPDrmi")>;
-def: InstRW<[BWWriteResGroup143], (instregex "VDPPDrmi")>;
+def: InstRW<[BWWriteResGroup141], (instregex "MUL_FI(16|32)m")>;
def BWWriteResGroup144 : SchedWriteRes<[BWPort1,BWPort6,BWPort23,BWPort0156]> {
let Latency = 14;
@@ -3377,213 +1451,92 @@ def BWWriteResGroup146 : SchedWriteRes<[BWPort0,BWPort1,BWPort6,BWPort0156]> {
let NumMicroOps = 12;
let ResourceCycles = [2,1,4,5];
}
-def: InstRW<[BWWriteResGroup146], (instregex "XCH_F")>;
+def: InstRW<[BWWriteResGroup146], (instrs XCH_F)>;
def BWWriteResGroup147 : SchedWriteRes<[BWPort0]> {
let Latency = 15;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup147], (instregex "DIVR_FPrST0")>;
-def: InstRW<[BWWriteResGroup147], (instregex "DIVR_FST0r")>;
-def: InstRW<[BWWriteResGroup147], (instregex "DIVR_FrST0")>;
-
-def BWWriteResGroup148 : SchedWriteRes<[BWPort0,BWPort23]> {
- let Latency = 15;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[BWWriteResGroup148], (instregex "PMULLDrm")>;
-def: InstRW<[BWWriteResGroup148], (instregex "VPMULLDrm")>;
+def: InstRW<[BWWriteResGroup147], (instregex "DIVR_(FPrST0|FST0r|FrST0)")>;
def BWWriteResGroup149 : SchedWriteRes<[BWPort1,BWPort23,BWPort237,BWPort06,BWPort15,BWPort0156]> {
let Latency = 15;
let NumMicroOps = 10;
let ResourceCycles = [1,1,1,4,1,2];
}
-def: InstRW<[BWWriteResGroup149], (instregex "RCL(16|32|64)mCL")>;
-def: InstRW<[BWWriteResGroup149], (instregex "RCL8mCL")>;
+def: InstRW<[BWWriteResGroup149], (instregex "RCL(8|16|32|64)mCL")>;
-def BWWriteResGroup150 : SchedWriteRes<[BWPort0,BWPort23]> {
+def BWWriteResGroup150 : SchedWriteRes<[BWPort0,BWPort23,BWFPDivider]> {
let Latency = 16;
let NumMicroOps = 2;
- let ResourceCycles = [1,1];
+ let ResourceCycles = [1,1,5];
}
-def: InstRW<[BWWriteResGroup150], (instregex "DIVPSrm")>;
-def: InstRW<[BWWriteResGroup150], (instregex "DIVSSrm")>;
-def: InstRW<[BWWriteResGroup150], (instregex "VDIVPSrm")>;
-def: InstRW<[BWWriteResGroup150], (instregex "VDIVSSrm")>;
-
-def BWWriteResGroup151 : SchedWriteRes<[BWPort0,BWPort23]> {
- let Latency = 16;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[BWWriteResGroup151], (instregex "VPMULLDYrm")>;
-
-def BWWriteResGroup152 : SchedWriteRes<[BWPort0,BWPort23]> {
- let Latency = 16;
- let NumMicroOps = 4;
- let ResourceCycles = [3,1];
-}
-def: InstRW<[BWWriteResGroup152], (instregex "PCMPISTRIrm")>;
-def: InstRW<[BWWriteResGroup152], (instregex "PCMPISTRM128rm")>;
-def: InstRW<[BWWriteResGroup152], (instregex "VPCMPISTRIrm")>;
-def: InstRW<[BWWriteResGroup152], (instregex "VPCMPISTRM128rm")>;
+def : SchedAlias<WriteFDivLd, BWWriteResGroup150>; // TODO - convert to ZnWriteResFpuPair
def BWWriteResGroup153 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort06,BWPort15,BWPort0156]> {
let Latency = 16;
let NumMicroOps = 14;
let ResourceCycles = [1,1,1,4,2,5];
}
-def: InstRW<[BWWriteResGroup153], (instregex "CMPXCHG8B")>;
+def: InstRW<[BWWriteResGroup153], (instrs CMPXCHG8B)>;
def BWWriteResGroup154 : SchedWriteRes<[BWPort5]> {
let Latency = 16;
let NumMicroOps = 16;
let ResourceCycles = [16];
}
-def: InstRW<[BWWriteResGroup154], (instregex "VZEROALL")>;
-
-def BWWriteResGroup155 : SchedWriteRes<[BWPort0,BWPort015]> {
- let Latency = 17;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[BWWriteResGroup155], (instregex "VDIVPSYrr")>;
-
-def BWWriteResGroup156 : SchedWriteRes<[BWPort0,BWPort23,BWPort015]> {
- let Latency = 17;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[BWWriteResGroup156], (instregex "VRCPPSYm")>;
-def: InstRW<[BWWriteResGroup156], (instregex "VRSQRTPSYm")>;
-
-def BWWriteResGroup157 : SchedWriteRes<[BWPort0,BWPort23]> {
- let Latency = 18;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[BWWriteResGroup157], (instregex "SQRTPSm")>;
-def: InstRW<[BWWriteResGroup157], (instregex "SQRTSSm")>;
-
-def BWWriteResGroup158 : SchedWriteRes<[BWPort0,BWPort5,BWPort0156]> {
- let Latency = 18;
- let NumMicroOps = 8;
- let ResourceCycles = [4,3,1];
-}
-def: InstRW<[BWWriteResGroup158], (instregex "PCMPESTRIrr")>;
-def: InstRW<[BWWriteResGroup158], (instregex "VPCMPESTRIrr")>;
+def: InstRW<[BWWriteResGroup154], (instrs VZEROALL)>;
def BWWriteResGroup159 : SchedWriteRes<[BWPort5,BWPort6,BWPort06,BWPort0156]> {
let Latency = 18;
let NumMicroOps = 8;
let ResourceCycles = [1,1,1,5];
}
-def: InstRW<[BWWriteResGroup159], (instregex "CPUID")>;
-def: InstRW<[BWWriteResGroup159], (instregex "RDTSC")>;
+def: InstRW<[BWWriteResGroup159], (instrs CPUID)>;
+def: InstRW<[BWWriteResGroup159], (instrs RDTSC)>;
def BWWriteResGroup160 : SchedWriteRes<[BWPort1,BWPort23,BWPort237,BWPort06,BWPort15,BWPort0156]> {
let Latency = 18;
let NumMicroOps = 11;
let ResourceCycles = [2,1,1,3,1,3];
}
-def: InstRW<[BWWriteResGroup160], (instregex "RCR(16|32|64)mCL")>;
-def: InstRW<[BWWriteResGroup160], (instregex "RCR8mCL")>;
+def: InstRW<[BWWriteResGroup160], (instregex "RCR(8|16|32|64)mCL")>;
-def BWWriteResGroup161 : SchedWriteRes<[BWPort0,BWPort23]> {
+def BWWriteResGroup161 : SchedWriteRes<[BWPort0,BWPort23,BWFPDivider]> {
let Latency = 19;
let NumMicroOps = 2;
- let ResourceCycles = [1,1];
+ let ResourceCycles = [1,1,8];
}
-def: InstRW<[BWWriteResGroup161], (instregex "DIVPDrm")>;
-def: InstRW<[BWWriteResGroup161], (instregex "DIVSDrm")>;
-def: InstRW<[BWWriteResGroup161], (instregex "VDIVPDrm")>;
-def: InstRW<[BWWriteResGroup161], (instregex "VDIVSDrm")>;
-def: InstRW<[BWWriteResGroup161], (instregex "VSQRTPSm")>;
-def: InstRW<[BWWriteResGroup161], (instregex "VSQRTSSm")>;
-
-def BWWriteResGroup162 : SchedWriteRes<[BWPort5,BWPort23]> {
- let Latency = 19;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[BWWriteResGroup162], (instregex "AESIMCrm")>;
-def: InstRW<[BWWriteResGroup162], (instregex "VAESIMCrm")>;
-
-def BWWriteResGroup163 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23]> {
- let Latency = 19;
- let NumMicroOps = 5;
- let ResourceCycles = [2,1,1,1];
-}
-def: InstRW<[BWWriteResGroup163], (instregex "DPPSrmi")>;
-def: InstRW<[BWWriteResGroup163], (instregex "VDPPSrmi")>;
-
-def BWWriteResGroup164 : SchedWriteRes<[BWPort0,BWPort5,BWPort015,BWPort0156]> {
- let Latency = 19;
- let NumMicroOps = 9;
- let ResourceCycles = [4,3,1,1];
-}
-def: InstRW<[BWWriteResGroup164], (instregex "PCMPESTRM128rr")>;
-def: InstRW<[BWWriteResGroup164], (instregex "VPCMPESTRM128rr")>;
+def : SchedAlias<WriteFDiv64Ld, BWWriteResGroup161>; // TODO - convert to ZnWriteResFpuPair
def BWWriteResGroup165 : SchedWriteRes<[BWPort0]> {
let Latency = 20;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup165], (instregex "DIV_FPrST0")>;
-def: InstRW<[BWWriteResGroup165], (instregex "DIV_FST0r")>;
-def: InstRW<[BWWriteResGroup165], (instregex "DIV_FrST0")>;
-def: InstRW<[BWWriteResGroup165], (instregex "SQRTPDr")>;
-def: InstRW<[BWWriteResGroup165], (instregex "SQRTSDr")>;
-
-def BWWriteResGroup166 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23]> {
- let Latency = 20;
- let NumMicroOps = 5;
- let ResourceCycles = [2,1,1,1];
-}
-def: InstRW<[BWWriteResGroup166], (instregex "VDPPSYrmi")>;
+def: InstRW<[BWWriteResGroup165], (instregex "DIV_(FPrST0|FST0r|FrST0)")>;
def BWWriteResGroup167 : SchedWriteRes<[BWPort4,BWPort5,BWPort6,BWPort23,BWPort237,BWPort06,BWPort0156]> {
let Latency = 20;
let NumMicroOps = 8;
let ResourceCycles = [1,1,1,1,1,1,2];
}
-def: InstRW<[BWWriteResGroup167], (instregex "INSB")>;
-def: InstRW<[BWWriteResGroup167], (instregex "INSL")>;
-def: InstRW<[BWWriteResGroup167], (instregex "INSW")>;
-
-def BWWriteResGroup168 : SchedWriteRes<[BWPort0]> {
- let Latency = 21;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[BWWriteResGroup168], (instregex "VSQRTPDr")>;
-def: InstRW<[BWWriteResGroup168], (instregex "VSQRTSDr")>;
+def: InstRW<[BWWriteResGroup167], (instrs INSB, INSL, INSW)>;
def BWWriteResGroup169 : SchedWriteRes<[BWPort0,BWPort23]> {
let Latency = 21;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup169], (instregex "DIV_F32m")>;
-def: InstRW<[BWWriteResGroup169], (instregex "DIV_F64m")>;
-
-def BWWriteResGroup170 : SchedWriteRes<[BWPort0,BWPort015]> {
- let Latency = 21;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[BWWriteResGroup170], (instregex "VSQRTPSYr")>;
+def: InstRW<[BWWriteResGroup169], (instregex "DIV_F(32|64)m")>;
def BWWriteResGroup171 : SchedWriteRes<[BWPort0,BWPort4,BWPort5,BWPort23,BWPort237,BWPort06,BWPort0156]> {
let Latency = 21;
let NumMicroOps = 19;
let ResourceCycles = [2,1,4,1,1,4,6];
}
-def: InstRW<[BWWriteResGroup171], (instregex "CMPXCHG16B")>;
+def: InstRW<[BWWriteResGroup171], (instrs CMPXCHG16B)>;
def BWWriteResGroup172 : SchedWriteRes<[BWPort6,BWPort23,BWPort0156]> {
let Latency = 22;
@@ -3592,28 +1545,6 @@ def BWWriteResGroup172 : SchedWriteRes<[BWPort6,BWPort23,BWPort0156]> {
}
def: InstRW<[BWWriteResGroup172], (instregex "POPF64")>;
-def BWWriteResGroup173 : SchedWriteRes<[BWPort0,BWPort015]> {
- let Latency = 23;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[BWWriteResGroup173], (instregex "VDIVPDYrr")>;
-
-def BWWriteResGroup174 : SchedWriteRes<[BWPort0,BWPort23,BWPort015]> {
- let Latency = 23;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[BWWriteResGroup174], (instregex "VDIVPSYrm")>;
-
-def BWWriteResGroup175 : SchedWriteRes<[BWPort0,BWPort5,BWPort23,BWPort0156]> {
- let Latency = 23;
- let NumMicroOps = 9;
- let ResourceCycles = [4,3,1,1];
-}
-def: InstRW<[BWWriteResGroup175], (instregex "PCMPESTRIrm")>;
-def: InstRW<[BWWriteResGroup175], (instregex "VPCMPESTRIrm")>;
-
def BWWriteResGroup176 : SchedWriteRes<[BWPort6,BWPort23,BWPort0156]> {
let Latency = 23;
let NumMicroOps = 19;
@@ -3626,56 +1557,21 @@ def BWWriteResGroup177 : SchedWriteRes<[BWPort0,BWPort1,BWPort23]> {
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[BWWriteResGroup177], (instregex "DIV_FI16m")>;
-def: InstRW<[BWWriteResGroup177], (instregex "DIV_FI32m")>;
-
-def BWWriteResGroup178 : SchedWriteRes<[BWPort0,BWPort5,BWPort23,BWPort015,BWPort0156]> {
- let Latency = 24;
- let NumMicroOps = 10;
- let ResourceCycles = [4,3,1,1,1];
-}
-def: InstRW<[BWWriteResGroup178], (instregex "PCMPESTRM128rm")>;
-def: InstRW<[BWWriteResGroup178], (instregex "VPCMPESTRM128rm")>;
-
-def BWWriteResGroup179 : SchedWriteRes<[BWPort0,BWPort23]> {
- let Latency = 25;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[BWWriteResGroup179], (instregex "SQRTPDm")>;
-def: InstRW<[BWWriteResGroup179], (instregex "SQRTSDm")>;
+def: InstRW<[BWWriteResGroup177], (instregex "DIV_FI(16|32)m")>;
def BWWriteResGroup180 : SchedWriteRes<[BWPort0,BWPort23]> {
let Latency = 26;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup180], (instregex "DIVR_F32m")>;
-def: InstRW<[BWWriteResGroup180], (instregex "DIVR_F64m")>;
-def: InstRW<[BWWriteResGroup180], (instregex "VSQRTPDm")>;
-def: InstRW<[BWWriteResGroup180], (instregex "VSQRTSDm")>;
-
-def BWWriteResGroup181 : SchedWriteRes<[BWPort0,BWPort23,BWPort015]> {
- let Latency = 27;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[BWWriteResGroup181], (instregex "VSQRTPSYm")>;
+def: InstRW<[BWWriteResGroup180], (instregex "DIVR_F(32|64)m")>;
def BWWriteResGroup182 : SchedWriteRes<[BWPort0,BWPort1,BWPort23]> {
let Latency = 29;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[BWWriteResGroup182], (instregex "DIVR_FI16m")>;
-def: InstRW<[BWWriteResGroup182], (instregex "DIVR_FI32m")>;
-
-def BWWriteResGroup183 : SchedWriteRes<[BWPort0,BWPort23,BWPort015]> {
- let Latency = 29;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[BWWriteResGroup183], (instregex "VDIVPDYrm")>;
+def: InstRW<[BWWriteResGroup182], (instregex "DIVR_FI(16|32)m")>;
def BWWriteResGroup183_1 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> {
let Latency = 22;
@@ -3716,7 +1612,7 @@ def: InstRW<[BWWriteResGroup183_5], (instrs VGATHERDPDYrm)>;
def BWWriteResGroup183_6 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> {
let Latency = 26;
let NumMicroOps = 14;
- let ResourceCycles = [1,4,8,1];
+ let ResourceCycles = [1,4,8,1];
}
def: InstRW<[BWWriteResGroup183_6], (instrs VGATHERDPSYrm)>;
@@ -3727,128 +1623,85 @@ def BWWriteResGroup183_7 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156
}
def: InstRW<[BWWriteResGroup183_7], (instrs VGATHERQPSrm)>;
-def BWWriteResGroup184 : SchedWriteRes<[BWPort0,BWPort5,BWPort015]> {
- let Latency = 29;
- let NumMicroOps = 11;
- let ResourceCycles = [2,7,2];
-}
-def: InstRW<[BWWriteResGroup184], (instregex "AESKEYGENASSIST128rr")>;
-def: InstRW<[BWWriteResGroup184], (instregex "VAESKEYGENASSIST128rr")>;
-
def BWWriteResGroup185 : SchedWriteRes<[BWPort4,BWPort6,BWPort23,BWPort237,BWPort0156]> {
let Latency = 29;
let NumMicroOps = 27;
let ResourceCycles = [1,5,1,1,19];
}
-def: InstRW<[BWWriteResGroup185], (instregex "XSAVE64")>;
+def: InstRW<[BWWriteResGroup185], (instrs XSAVE64)>;
def BWWriteResGroup186 : SchedWriteRes<[BWPort4,BWPort6,BWPort23,BWPort237,BWPort0156]> {
let Latency = 30;
let NumMicroOps = 28;
let ResourceCycles = [1,6,1,1,19];
}
-def: InstRW<[BWWriteResGroup186], (instregex "XSAVE(OPT)?")>;
-
-def BWWriteResGroup187 : SchedWriteRes<[BWPort01,BWPort15,BWPort015,BWPort0156]> {
- let Latency = 31;
- let NumMicroOps = 31;
- let ResourceCycles = [8,1,21,1];
-}
-def: InstRW<[BWWriteResGroup187], (instregex "MMX_EMMS")>;
-
-def BWWriteResGroup188 : SchedWriteRes<[BWPort0,BWPort5,BWPort23,BWPort015]> {
- let Latency = 33;
- let NumMicroOps = 11;
- let ResourceCycles = [2,7,1,1];
-}
-def: InstRW<[BWWriteResGroup188], (instregex "AESKEYGENASSIST128rm")>;
-def: InstRW<[BWWriteResGroup188], (instregex "VAESKEYGENASSIST128rm")>;
-
-def BWWriteResGroup189 : SchedWriteRes<[BWPort0,BWPort015]> {
- let Latency = 34;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[BWWriteResGroup189], (instregex "VSQRTPDYr")>;
+def: InstRW<[BWWriteResGroup186], (instrs XSAVE)>;
+def: InstRW<[BWWriteResGroup186], (instregex "XSAVEC", "XSAVES", "XSAVEOPT")>;
def BWWriteResGroup190 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156]> {
let Latency = 34;
let NumMicroOps = 8;
let ResourceCycles = [2,2,2,1,1];
}
-def: InstRW<[BWWriteResGroup190], (instregex "DIV(16|32|64)m")>;
-def: InstRW<[BWWriteResGroup190], (instregex "DIV8m")>;
+def: InstRW<[BWWriteResGroup190], (instregex "DIV(8|16|32|64)m")>;
def BWWriteResGroup191 : SchedWriteRes<[BWPort5,BWPort6,BWPort23,BWPort06,BWPort0156]> {
let Latency = 34;
let NumMicroOps = 23;
let ResourceCycles = [1,5,3,4,10];
}
-def: InstRW<[BWWriteResGroup191], (instregex "IN(16|32)ri")>;
-def: InstRW<[BWWriteResGroup191], (instregex "IN(16|32)rr")>;
-def: InstRW<[BWWriteResGroup191], (instregex "IN8ri")>;
-def: InstRW<[BWWriteResGroup191], (instregex "IN8rr")>;
+def: InstRW<[BWWriteResGroup191], (instregex "IN(8|16|32)ri",
+ "IN(8|16|32)rr")>;
def BWWriteResGroup193 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156]> {
let Latency = 35;
let NumMicroOps = 8;
let ResourceCycles = [2,2,2,1,1];
}
-def: InstRW<[BWWriteResGroup193], (instregex "IDIV(16|32|64)m")>;
-def: InstRW<[BWWriteResGroup193], (instregex "IDIV8m")>;
+def: InstRW<[BWWriteResGroup193], (instregex "IDIV(8|16|32|64)m")>;
def BWWriteResGroup194 : SchedWriteRes<[BWPort5,BWPort6,BWPort23,BWPort237,BWPort06,BWPort0156]> {
let Latency = 35;
let NumMicroOps = 23;
let ResourceCycles = [1,5,2,1,4,10];
}
-def: InstRW<[BWWriteResGroup194], (instregex "OUT(16|32)ir")>;
-def: InstRW<[BWWriteResGroup194], (instregex "OUT(16|32)rr")>;
-def: InstRW<[BWWriteResGroup194], (instregex "OUT8ir")>;
-def: InstRW<[BWWriteResGroup194], (instregex "OUT8rr")>;
-
-def BWWriteResGroup195 : SchedWriteRes<[BWPort0,BWPort23,BWPort015]> {
- let Latency = 40;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[BWWriteResGroup195], (instregex "VSQRTPDYm")>;
+def: InstRW<[BWWriteResGroup194], (instregex "OUT(8|16|32)ir",
+ "OUT(8|16|32)rr")>;
def BWWriteResGroup196 : SchedWriteRes<[BWPort5,BWPort0156]> {
let Latency = 42;
let NumMicroOps = 22;
let ResourceCycles = [2,20];
}
-def: InstRW<[BWWriteResGroup196], (instregex "RDTSCP")>;
+def: InstRW<[BWWriteResGroup196], (instrs RDTSCP)>;
def BWWriteResGroup197 : SchedWriteRes<[BWPort0,BWPort01,BWPort23,BWPort05,BWPort06,BWPort015,BWPort0156]> {
let Latency = 60;
let NumMicroOps = 64;
let ResourceCycles = [2,2,8,1,10,2,39];
}
-def: InstRW<[BWWriteResGroup197], (instregex "FLDENVm")>;
-def: InstRW<[BWWriteResGroup197], (instregex "FLDENVm")>;
+def: InstRW<[BWWriteResGroup197], (instrs FLDENVm)>;
def BWWriteResGroup198 : SchedWriteRes<[BWPort0,BWPort6,BWPort23,BWPort05,BWPort06,BWPort15,BWPort0156]> {
let Latency = 63;
let NumMicroOps = 88;
let ResourceCycles = [4,4,31,1,2,1,45];
}
-def: InstRW<[BWWriteResGroup198], (instregex "FXRSTOR64")>;
+def: InstRW<[BWWriteResGroup198], (instrs FXRSTOR64)>;
def BWWriteResGroup199 : SchedWriteRes<[BWPort0,BWPort6,BWPort23,BWPort05,BWPort06,BWPort15,BWPort0156]> {
let Latency = 63;
let NumMicroOps = 90;
let ResourceCycles = [4,2,33,1,2,1,47];
}
-def: InstRW<[BWWriteResGroup199], (instregex "FXRSTOR")>;
+def: InstRW<[BWWriteResGroup199], (instrs FXRSTOR)>;
def BWWriteResGroup200 : SchedWriteRes<[BWPort5,BWPort01,BWPort0156]> {
let Latency = 75;
let NumMicroOps = 15;
let ResourceCycles = [6,3,6];
}
-def: InstRW<[BWWriteResGroup200], (instregex "FNINIT")>;
+def: InstRW<[BWWriteResGroup200], (instrs FNINIT)>;
def BWWriteResGroup201 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort6,BWPort01,BWPort0156]> {
let Latency = 80;
@@ -3862,8 +1715,8 @@ def BWWriteResGroup202 : SchedWriteRes<[BWPort0,BWPort1,BWPort4,BWPort5,BWPort6,
let NumMicroOps = 100;
let ResourceCycles = [9,9,11,8,1,11,21,30];
}
-def: InstRW<[BWWriteResGroup202], (instregex "FSTENVm")>;
-def: InstRW<[BWWriteResGroup202], (instregex "FSTENVm")>;
+def: InstRW<[BWWriteResGroup202], (instrs FSTENVm)>;
-} // SchedModel
+def: InstRW<[WriteZero], (instrs CLC)>;
+} // SchedModel
diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td
index 46612554b1fa..189dd4183839 100644
--- a/lib/Target/X86/X86SchedHaswell.td
+++ b/lib/Target/X86/X86SchedHaswell.td
@@ -10,6 +10,11 @@
// This file defines the machine model for Haswell to support instruction
// scheduling and other instruction cost heuristics.
//
+// Note that we define some instructions here that are not supported by Haswell,
+// but we still have to define them because KNL uses the HSW model.
+// They are currently tagged with a comment `Unsupported = 1`.
+// FIXME: Use Unsupported = 1 once KNL has its own model.
+//
//===----------------------------------------------------------------------===//
def HaswellModel : SchedMachineModel {
@@ -23,7 +28,7 @@ def HaswellModel : SchedMachineModel {
// Based on the LSD (loop-stream detector) queue size and benchmarking data.
let LoopMicroOpBufferSize = 50;
- // This flag is set to allow the scheduler to assign a default model to
+ // This flag is set to allow the scheduler to assign a default model to
// unrecognized opcodes.
let CompleteModel = 0;
}
@@ -69,6 +74,8 @@ def HWPortAny : ProcResGroup<[HWPort0, HWPort1, HWPort2, HWPort3, HWPort4,
// Integer division issued on port 0.
def HWDivider : ProcResource<1>;
+// FP division and sqrt on port 0.
+def HWFPDivider : ProcResource<1>;
// Loads are 5 cycles, so ReadAfterLd registers needn't be available until 5
// cycles after the memory operand.
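A concrete reading of that ReadAdvance, under the usual scheduling-model semantics: an operand mapped to ReadAfterLd is not read until 5 cycles into the consumer's execution, so up to 5 cycles of the producing instruction's latency are hidden. As an illustrative case (the instruction choice is mine, not part of the patch), a load-op form such as ADDPSrm reads its register source through ReadAfterLd, so a 3-cycle producer of that register adds no extra stall because the 5-cycle load already covers the wait.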
@@ -80,189 +87,451 @@ def : ReadAdvance<ReadAfterLd, 5>;
// This multiclass defines the resource usage for variants with and without
// folded loads.
multiclass HWWriteResPair<X86FoldableSchedWrite SchedRW,
- ProcResourceKind ExePort,
- int Lat> {
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [1], int UOps = 1,
+ int LoadLat = 5> {
// Register variant is using a single cycle on ExePort.
- def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
- // Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the
- // latency.
- def : WriteRes<SchedRW.Folded, [HWPort23, ExePort]> {
- let Latency = !add(Lat, 5);
+ // Memory variant also uses a cycle on port 2/3 and adds LoadLat cycles to
+ // the latency (default = 5).
+ def : WriteRes<SchedRW.Folded, !listconcat([HWPort23], ExePorts)> {
+ let Latency = !add(Lat, LoadLat);
+ let ResourceCycles = !listconcat([1], Res);
+ let NumMicroOps = !add(UOps, 1);
}
}
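To make the multiclass easier to audit, here is a minimal sketch of the two records a typical instantiation produces; the Ld-suffixed name of the folded write follows the usual X86 scheduling convention, and the expansion is written out for illustration only, not as part of the patch. Taking defm : HWWriteResPair<WriteCRC32, [HWPort1], 3>; from further down, with the default Res, UOps and LoadLat:

  def : WriteRes<WriteCRC32, [HWPort1]> {
    let Latency = 3;              // Lat
    let ResourceCycles = [1];     // Res
    let NumMicroOps = 1;          // UOps
  }
  def : WriteRes<WriteCRC32Ld, [HWPort23, HWPort1]> {
    let Latency = 8;              // Lat + LoadLat (3 + 5)
    let ResourceCycles = [1, 1];  // one cycle on the load port, then Res
    let NumMicroOps = 2;          // UOps + 1
  }

The defaults are what give every folded variant its extra micro-op, its extra cycle on HWPort23 and its +5 latency unless an instantiation overrides them.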
-// A folded store needs a cycle on port 4 for the store data, but it does not
-// need an extra port 2/3 cycle to recompute the address.
-def : WriteRes<WriteRMW, [HWPort4]>;
+// A folded store needs a cycle on port 4 for the store data, and an extra port
+// 2/3/7 cycle to recompute the address.
+def : WriteRes<WriteRMW, [HWPort237,HWPort4]>;
// Store_addr on 237.
// Store_data on 4.
-def : WriteRes<WriteStore, [HWPort237, HWPort4]>;
-def : WriteRes<WriteLoad, [HWPort23]> { let Latency = 5; }
-def : WriteRes<WriteMove, [HWPort0156]>;
-def : WriteRes<WriteZero, []>;
+defm : X86WriteRes<WriteStore, [HWPort237, HWPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteStoreNT, [HWPort237, HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteLoad, [HWPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteMove, [HWPort0156], 1, [1], 1>;
+def : WriteRes<WriteZero, []>;
+
+defm : HWWriteResPair<WriteALU, [HWPort0156], 1>;
+defm : HWWriteResPair<WriteADC, [HWPort06,HWPort0156], 2, [1,1], 2>;
+defm : HWWriteResPair<WriteIMul, [HWPort1], 3>;
+defm : HWWriteResPair<WriteIMul64, [HWPort1], 3>;
+
+defm : HWWriteResPair<WriteBSWAP32,[HWPort15], 1>;
+defm : HWWriteResPair<WriteBSWAP64,[HWPort06, HWPort15], 2, [1,1], 2>;
-defm : HWWriteResPair<WriteALU, HWPort0156, 1>;
-defm : HWWriteResPair<WriteIMul, HWPort1, 3>;
def : WriteRes<WriteIMulH, []> { let Latency = 3; }
-defm : HWWriteResPair<WriteShift, HWPort06, 1>;
-defm : HWWriteResPair<WriteJump, HWPort06, 1>;
+defm : HWWriteResPair<WriteShift, [HWPort06], 1>;
+defm : HWWriteResPair<WriteShiftDouble, [HWPort06], 1>;
+defm : HWWriteResPair<WriteJump, [HWPort06], 1>;
+defm : HWWriteResPair<WriteCRC32, [HWPort1], 3>;
+
+defm : HWWriteResPair<WriteCMOV, [HWPort06,HWPort0156], 2, [1,1], 2>; // Conditional move.
+defm : HWWriteResPair<WriteCMOV2, [HWPort06,HWPort0156], 3, [1,2], 3>; // Conditional (CF + ZF flag) move.
+defm : X86WriteRes<WriteFCMOV, [HWPort1], 3, [1], 1>; // x87 conditional move.
+def : WriteRes<WriteSETCC, [HWPort06]>; // Setcc.
+def : WriteRes<WriteSETCCStore, [HWPort06,HWPort4,HWPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+def : WriteRes<WriteLAHFSAHF, [HWPort06]>;
// This is for simple LEAs with one or two input operands.
// The complex ones can only execute on port 1, and they require two cycles on
// the port to read all inputs. We don't model that.
def : WriteRes<WriteLEA, [HWPort15]>;
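To pin down what "simple" means here (the examples are mine, not from the patch): a one- or two-input form such as lea rax, [rbx + 8] or lea rax, [rbx + rcx] is the single-cycle HWPort15 case modeled above, while a three-component address such as lea rax, [rbx + rcx*4 + 8] is the complex kind that the comment says runs only on port 1 and is not modeled here.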
-// This is quite rough, latency depends on the dividend.
-def : WriteRes<WriteIDiv, [HWPort0, HWDivider]> {
- let Latency = 25;
- let ResourceCycles = [1, 10];
-}
-def : WriteRes<WriteIDivLd, [HWPort23, HWPort0, HWDivider]> {
- let Latency = 29;
- let ResourceCycles = [1, 1, 10];
-}
+// Bit counts.
+defm : HWWriteResPair<WriteBSF, [HWPort1], 3>;
+defm : HWWriteResPair<WriteBSR, [HWPort1], 3>;
+defm : HWWriteResPair<WriteLZCNT, [HWPort1], 3>;
+defm : HWWriteResPair<WriteTZCNT, [HWPort1], 3>;
+defm : HWWriteResPair<WritePOPCNT, [HWPort1], 3>;
+
+// BMI1 BEXTR, BMI2 BZHI
+defm : HWWriteResPair<WriteBEXTR, [HWPort06,HWPort15], 2, [1,1], 2>;
+defm : HWWriteResPair<WriteBZHI, [HWPort15], 1>;
+
+defm : HWWriteResPair<WriteDiv8, [HWPort0, HWDivider], 25, [1,10], 1, 4>;
+defm : HWWriteResPair<WriteDiv16, [HWPort0, HWDivider], 25, [1,10], 1, 4>;
+defm : HWWriteResPair<WriteDiv32, [HWPort0, HWDivider], 25, [1,10], 1, 4>;
+defm : HWWriteResPair<WriteDiv64, [HWPort0, HWDivider], 25, [1,10], 1, 4>;
+defm : HWWriteResPair<WriteIDiv8, [HWPort0, HWDivider], 25, [1,10], 1, 4>;
+defm : HWWriteResPair<WriteIDiv16, [HWPort0, HWDivider], 25, [1,10], 1, 4>;
+defm : HWWriteResPair<WriteIDiv32, [HWPort0, HWDivider], 25, [1,10], 1, 4>;
+defm : HWWriteResPair<WriteIDiv64, [HWPort0, HWDivider], 25, [1,10], 1, 4>;
// Scalar and vector floating point.
-defm : HWWriteResPair<WriteFAdd, HWPort1, 3>;
-defm : HWWriteResPair<WriteFMul, HWPort0, 5>;
-defm : HWWriteResPair<WriteFDiv, HWPort0, 12>; // 10-14 cycles.
-defm : HWWriteResPair<WriteFRcp, HWPort0, 5>;
-defm : HWWriteResPair<WriteFRsqrt, HWPort0, 5>;
-defm : HWWriteResPair<WriteFSqrt, HWPort0, 15>;
-defm : HWWriteResPair<WriteCvtF2I, HWPort1, 3>;
-defm : HWWriteResPair<WriteCvtI2F, HWPort1, 4>;
-defm : HWWriteResPair<WriteCvtF2F, HWPort1, 3>;
-defm : HWWriteResPair<WriteFMA, HWPort01, 5>;
-defm : HWWriteResPair<WriteFShuffle, HWPort5, 1>;
-defm : HWWriteResPair<WriteFBlend, HWPort015, 1>;
-defm : HWWriteResPair<WriteFShuffle256, HWPort5, 3>;
-
-def : WriteRes<WriteFVarBlend, [HWPort5]> {
- let Latency = 2;
- let ResourceCycles = [2];
-}
-def : WriteRes<WriteFVarBlendLd, [HWPort5, HWPort23]> {
- let Latency = 6;
- let ResourceCycles = [2, 1];
-}
+defm : X86WriteRes<WriteFLD0, [HWPort01], 1, [1], 1>;
+defm : X86WriteRes<WriteFLD1, [HWPort01], 1, [2], 2>;
+defm : X86WriteRes<WriteFLDC, [HWPort01], 1, [2], 2>;
+defm : X86WriteRes<WriteFLoad, [HWPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteFLoadX, [HWPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteFLoadY, [HWPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteFMaskedLoad, [HWPort23,HWPort5], 8, [1,2], 3>;
+defm : X86WriteRes<WriteFMaskedLoadY, [HWPort23,HWPort5], 9, [1,2], 3>;
+defm : X86WriteRes<WriteFStore, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreX, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreY, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNT, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNTX, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNTY, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStore, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteFMaskedStoreY, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteFMove, [HWPort5], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveX, [HWPort5], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveY, [HWPort5], 1, [1], 1>;
+defm : X86WriteRes<WriteEMMS, [HWPort01,HWPort15,HWPort015,HWPort0156], 31, [8,1,21,1], 31>;
+
+defm : HWWriteResPair<WriteFAdd, [HWPort1], 3, [1], 1, 5>;
+defm : HWWriteResPair<WriteFAddX, [HWPort1], 3, [1], 1, 6>;
+defm : HWWriteResPair<WriteFAddY, [HWPort1], 3, [1], 1, 7>;
+defm : HWWriteResPair<WriteFAddZ, [HWPort1], 3, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFAdd64, [HWPort1], 3, [1], 1, 5>;
+defm : HWWriteResPair<WriteFAdd64X, [HWPort1], 3, [1], 1, 6>;
+defm : HWWriteResPair<WriteFAdd64Y, [HWPort1], 3, [1], 1, 7>;
+defm : HWWriteResPair<WriteFAdd64Z, [HWPort1], 3, [1], 1, 7>; // Unsupported = 1
+
+defm : HWWriteResPair<WriteFCmp, [HWPort1], 3, [1], 1, 5>;
+defm : HWWriteResPair<WriteFCmpX, [HWPort1], 3, [1], 1, 6>;
+defm : HWWriteResPair<WriteFCmpY, [HWPort1], 3, [1], 1, 7>;
+defm : HWWriteResPair<WriteFCmpZ, [HWPort1], 3, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFCmp64, [HWPort1], 3, [1], 1, 5>;
+defm : HWWriteResPair<WriteFCmp64X, [HWPort1], 3, [1], 1, 6>;
+defm : HWWriteResPair<WriteFCmp64Y, [HWPort1], 3, [1], 1, 7>;
+defm : HWWriteResPair<WriteFCmp64Z, [HWPort1], 3, [1], 1, 7>; // Unsupported = 1
+
+defm : HWWriteResPair<WriteFCom, [HWPort1], 3>;
+
+defm : HWWriteResPair<WriteFMul, [HWPort01], 5, [1], 1, 5>;
+defm : HWWriteResPair<WriteFMulX, [HWPort01], 5, [1], 1, 6>;
+defm : HWWriteResPair<WriteFMulY, [HWPort01], 5, [1], 1, 7>;
+defm : HWWriteResPair<WriteFMulZ, [HWPort01], 5, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFMul64, [HWPort01], 5, [1], 1, 5>;
+defm : HWWriteResPair<WriteFMul64X, [HWPort01], 5, [1], 1, 6>;
+defm : HWWriteResPair<WriteFMul64Y, [HWPort01], 5, [1], 1, 7>;
+defm : HWWriteResPair<WriteFMul64Z, [HWPort01], 5, [1], 1, 7>; // Unsupported = 1
+
+defm : HWWriteResPair<WriteFDiv, [HWPort0,HWFPDivider], 13, [1,7], 1, 5>;
+defm : HWWriteResPair<WriteFDivX, [HWPort0,HWFPDivider], 13, [1,7], 1, 6>;
+defm : HWWriteResPair<WriteFDivY, [HWPort0,HWPort15,HWFPDivider], 21, [2,1,14], 3, 7>;
+defm : HWWriteResPair<WriteFDivZ, [HWPort0,HWPort15,HWFPDivider], 21, [2,1,14], 3, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFDiv64, [HWPort0,HWFPDivider], 20, [1,14], 1, 5>;
+defm : HWWriteResPair<WriteFDiv64X, [HWPort0,HWFPDivider], 20, [1,14], 1, 6>;
+defm : HWWriteResPair<WriteFDiv64Y, [HWPort0,HWPort15,HWFPDivider], 35, [2,1,28], 3, 7>;
+defm : HWWriteResPair<WriteFDiv64Z, [HWPort0,HWPort15,HWFPDivider], 35, [2,1,28], 3, 7>; // Unsupported = 1
+
+defm : HWWriteResPair<WriteFRcp, [HWPort0], 5, [1], 1, 5>;
+defm : HWWriteResPair<WriteFRcpX, [HWPort0], 5, [1], 1, 6>;
+defm : HWWriteResPair<WriteFRcpY, [HWPort0,HWPort015], 11, [2,1], 3, 7>;
+defm : HWWriteResPair<WriteFRcpZ, [HWPort0,HWPort015], 11, [2,1], 3, 7>; // Unsupported = 1
+
+defm : HWWriteResPair<WriteFRsqrt, [HWPort0], 5, [1], 1, 5>;
+defm : HWWriteResPair<WriteFRsqrtX,[HWPort0], 5, [1], 1, 6>;
+defm : HWWriteResPair<WriteFRsqrtY,[HWPort0,HWPort015], 11, [2,1], 3, 7>;
+defm : HWWriteResPair<WriteFRsqrtZ,[HWPort0,HWPort015], 11, [2,1], 3, 7>; // Unsupported = 1
+
+defm : HWWriteResPair<WriteFSqrt, [HWPort0,HWFPDivider], 11, [1,7], 1, 5>;
+defm : HWWriteResPair<WriteFSqrtX, [HWPort0,HWFPDivider], 11, [1,7], 1, 6>;
+defm : HWWriteResPair<WriteFSqrtY, [HWPort0,HWPort15,HWFPDivider], 21, [2,1,14], 3, 7>;
+defm : HWWriteResPair<WriteFSqrtZ, [HWPort0,HWPort15,HWFPDivider], 21, [2,1,14], 3, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFSqrt64, [HWPort0,HWFPDivider], 16, [1,14], 1, 5>;
+defm : HWWriteResPair<WriteFSqrt64X, [HWPort0,HWFPDivider], 16, [1,14], 1, 6>;
+defm : HWWriteResPair<WriteFSqrt64Y, [HWPort0,HWPort15,HWFPDivider], 35, [2,1,28], 3, 7>;
+defm : HWWriteResPair<WriteFSqrt64Z, [HWPort0,HWPort15,HWFPDivider], 35, [2,1,28], 3, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFSqrt80, [HWPort0,HWFPDivider], 23, [1,17]>;
+
+defm : HWWriteResPair<WriteFMA, [HWPort01], 5, [1], 1, 5>;
+defm : HWWriteResPair<WriteFMAX, [HWPort01], 5, [1], 1, 6>;
+defm : HWWriteResPair<WriteFMAY, [HWPort01], 5, [1], 1, 7>;
+defm : HWWriteResPair<WriteFMAZ, [HWPort01], 5, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteDPPD, [HWPort0,HWPort1,HWPort5], 9, [1,1,1], 3, 6>;
+defm : HWWriteResPair<WriteDPPS, [HWPort0,HWPort1,HWPort5], 14, [2,1,1], 4, 6>;
+defm : HWWriteResPair<WriteDPPSY, [HWPort0,HWPort1,HWPort5], 14, [2,1,1], 4, 7>;
+defm : HWWriteResPair<WriteDPPSZ, [HWPort0,HWPort1,HWPort5], 14, [2,1,1], 4, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFSign, [HWPort0], 1>;
+defm : X86WriteRes<WriteFRnd, [HWPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteFRndY, [HWPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteFRndZ, [HWPort23], 6, [1], 1>; // Unsupported = 1
+defm : X86WriteRes<WriteFRndLd, [HWPort1,HWPort23], 12, [2,1], 3>;
+defm : X86WriteRes<WriteFRndYLd, [HWPort1,HWPort23], 13, [2,1], 3>;
+defm : X86WriteRes<WriteFRndZLd, [HWPort1,HWPort23], 13, [2,1], 3>; // Unsupported = 1
+defm : HWWriteResPair<WriteFLogic, [HWPort5], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteFLogicY, [HWPort5], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteFLogicZ, [HWPort5], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFTest, [HWPort0], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteFTestY, [HWPort0], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteFTestZ, [HWPort0], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFShuffle, [HWPort5], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteFShuffleY, [HWPort5], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteFShuffleZ, [HWPort5], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFVarShuffle, [HWPort5], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteFVarShuffleY, [HWPort5], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteFVarShuffleZ, [HWPort5], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFBlend, [HWPort015], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteFBlendY, [HWPort015], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteFBlendZ, [HWPort015], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFShuffle256, [HWPort5], 3, [1], 1, 7>;
+defm : HWWriteResPair<WriteFVarShuffle256, [HWPort5], 3, [1], 1, 7>;
+defm : HWWriteResPair<WriteFVarBlend, [HWPort5], 2, [2], 2, 6>;
+defm : HWWriteResPair<WriteFVarBlendY, [HWPort5], 2, [2], 2, 7>;
+defm : HWWriteResPair<WriteFVarBlendZ, [HWPort5], 2, [2], 2, 7>; // Unsupported = 1
+
+// Conversion between integer and float.
+defm : HWWriteResPair<WriteCvtSD2I, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPD2I, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPD2IY, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPD2IZ, [HWPort1], 3>; // Unsupported = 1
+defm : HWWriteResPair<WriteCvtSS2I, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPS2I, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPS2IY, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPS2IZ, [HWPort1], 3>; // Unsupported = 1
+
+defm : HWWriteResPair<WriteCvtI2SD, [HWPort1], 4>;
+defm : HWWriteResPair<WriteCvtI2PD, [HWPort1], 4>;
+defm : HWWriteResPair<WriteCvtI2PDY, [HWPort1], 4>;
+defm : HWWriteResPair<WriteCvtI2PDZ, [HWPort1], 4>; // Unsupported = 1
+defm : HWWriteResPair<WriteCvtI2SS, [HWPort1], 4>;
+defm : HWWriteResPair<WriteCvtI2PS, [HWPort1], 4>;
+defm : HWWriteResPair<WriteCvtI2PSY, [HWPort1], 4>;
+defm : HWWriteResPair<WriteCvtI2PSZ, [HWPort1], 4>; // Unsupported = 1
+
+defm : HWWriteResPair<WriteCvtSS2SD, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPS2PD, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPS2PDY, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPS2PDZ, [HWPort1], 3>; // Unsupported = 1
+defm : HWWriteResPair<WriteCvtSD2SS, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPD2PS, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPD2PSY, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPD2PSZ, [HWPort1], 3>; // Unsupported = 1
+
+defm : X86WriteRes<WriteCvtPH2PS, [HWPort0,HWPort5], 2, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSY, [HWPort0,HWPort5], 2, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSZ, [HWPort0,HWPort5], 2, [1,1], 2>; // Unsupported = 1
+defm : X86WriteRes<WriteCvtPH2PSLd, [HWPort0,HWPort23], 6, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSYLd, [HWPort0,HWPort23], 7, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSZLd, [HWPort0,HWPort23], 7, [1,1], 2>; // Unsupported = 1
+
+defm : X86WriteRes<WriteCvtPS2PH, [HWPort1,HWPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PHY, [HWPort1,HWPort5], 6, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PHZ, [HWPort1,HWPort5], 6, [1,1], 2>; // Unsupported = 1
+defm : X86WriteRes<WriteCvtPS2PHSt, [HWPort1,HWPort4,HWPort5,HWPort237], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteCvtPS2PHYSt, [HWPort1,HWPort4,HWPort5,HWPort237], 7, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteCvtPS2PHZSt, [HWPort1,HWPort4,HWPort5,HWPort237], 7, [1,1,1,1], 4>; // Unsupported = 1
// Vector integer operations.
-defm : HWWriteResPair<WriteVecShift, HWPort0, 1>;
-defm : HWWriteResPair<WriteVecLogic, HWPort015, 1>;
-defm : HWWriteResPair<WriteVecALU, HWPort15, 1>;
-defm : HWWriteResPair<WriteVecIMul, HWPort0, 5>;
-defm : HWWriteResPair<WriteShuffle, HWPort5, 1>;
-defm : HWWriteResPair<WriteBlend, HWPort15, 1>;
-defm : HWWriteResPair<WriteShuffle256, HWPort5, 3>;
-
-def : WriteRes<WriteVarBlend, [HWPort5]> {
+defm : X86WriteRes<WriteVecLoad, [HWPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteVecLoadX, [HWPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecLoadY, [HWPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNT, [HWPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNTY, [HWPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteVecMaskedLoad, [HWPort23,HWPort5], 8, [1,2], 3>;
+defm : X86WriteRes<WriteVecMaskedLoadY, [HWPort23,HWPort5], 9, [1,2], 3>;
+defm : X86WriteRes<WriteVecStore, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreX, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreY, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreNT, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreNTY, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMaskedStoreY, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMove, [HWPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveX, [HWPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveY, [HWPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveToGpr, [HWPort0], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveFromGpr, [HWPort5], 1, [1], 1>;
+
+defm : HWWriteResPair<WriteVecLogic, [HWPort015], 1, [1], 1, 5>;
+defm : HWWriteResPair<WriteVecLogicX,[HWPort015], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteVecLogicY,[HWPort015], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteVecLogicZ,[HWPort015], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteVecTest, [HWPort0,HWPort5], 2, [1,1], 2, 6>;
+defm : HWWriteResPair<WriteVecTestY, [HWPort0,HWPort5], 4, [1,1], 2, 7>;
+defm : HWWriteResPair<WriteVecTestZ, [HWPort0,HWPort5], 4, [1,1], 2, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteVecALU, [HWPort15], 1, [1], 1, 5>;
+defm : HWWriteResPair<WriteVecALUX, [HWPort15], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteVecALUY, [HWPort15], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteVecALUZ, [HWPort15], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteVecIMul, [HWPort0], 5, [1], 1, 5>;
+defm : HWWriteResPair<WriteVecIMulX, [HWPort0], 5, [1], 1, 6>;
+defm : HWWriteResPair<WriteVecIMulY, [HWPort0], 5, [1], 1, 7>;
+defm : HWWriteResPair<WriteVecIMulZ, [HWPort0], 5, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WritePMULLD, [HWPort0], 10, [2], 2, 6>;
+defm : HWWriteResPair<WritePMULLDY, [HWPort0], 10, [2], 2, 7>;
+defm : HWWriteResPair<WritePMULLDZ, [HWPort0], 10, [2], 2, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteShuffle, [HWPort5], 1, [1], 1, 5>;
+defm : HWWriteResPair<WriteShuffleX, [HWPort5], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteShuffleY, [HWPort5], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteShuffleZ, [HWPort5], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteVarShuffle, [HWPort5], 1, [1], 1, 5>;
+defm : HWWriteResPair<WriteVarShuffleX,[HWPort5], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteVarShuffleY,[HWPort5], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteVarShuffleZ,[HWPort5], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteBlend, [HWPort5], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteBlendY, [HWPort5], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteBlendZ, [HWPort5], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteShuffle256, [HWPort5], 3, [1], 1, 7>;
+defm : HWWriteResPair<WriteVarShuffle256, [HWPort5], 3, [1], 1, 7>;
+defm : HWWriteResPair<WriteVarBlend, [HWPort5], 2, [2], 2, 6>;
+defm : HWWriteResPair<WriteVarBlendY, [HWPort5], 2, [2], 2, 7>;
+defm : HWWriteResPair<WriteVarBlendZ, [HWPort5], 2, [2], 2, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteMPSAD, [HWPort0, HWPort5], 7, [1, 2], 3, 6>;
+defm : HWWriteResPair<WriteMPSADY, [HWPort0, HWPort5], 7, [1, 2], 3, 7>;
+defm : HWWriteResPair<WriteMPSADZ, [HWPort0, HWPort5], 7, [1, 2], 3, 7>; // Unsupported = 1
+defm : HWWriteResPair<WritePSADBW, [HWPort0], 5, [1], 1, 5>;
+defm : HWWriteResPair<WritePSADBWX, [HWPort0], 5, [1], 1, 6>;
+defm : HWWriteResPair<WritePSADBWY, [HWPort0], 5, [1], 1, 7>;
+defm : HWWriteResPair<WritePSADBWZ, [HWPort0], 5, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WritePHMINPOS, [HWPort0], 5, [1], 1, 6>;
+
+// Vector integer shifts.
+defm : HWWriteResPair<WriteVecShift, [HWPort0], 1, [1], 1, 5>;
+defm : HWWriteResPair<WriteVecShiftX, [HWPort0,HWPort5], 2, [1,1], 2, 6>;
+defm : X86WriteRes<WriteVecShiftY, [HWPort0,HWPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftZ, [HWPort0,HWPort5], 4, [1,1], 2>; // Unsupported = 1
+defm : X86WriteRes<WriteVecShiftYLd, [HWPort0,HWPort23], 8, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftZLd, [HWPort0,HWPort23], 8, [1,1], 2>; // Unsupported = 1
+
+defm : HWWriteResPair<WriteVecShiftImm, [HWPort0], 1, [1], 1, 5>;
+defm : HWWriteResPair<WriteVecShiftImmX, [HWPort0], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteVecShiftImmY, [HWPort0], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteVecShiftImmZ, [HWPort0], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteVarVecShift, [HWPort0, HWPort5], 3, [2,1], 3, 6>;
+defm : HWWriteResPair<WriteVarVecShiftY, [HWPort0, HWPort5], 3, [2,1], 3, 7>;
+defm : HWWriteResPair<WriteVarVecShiftZ, [HWPort0, HWPort5], 3, [2,1], 3, 7>; // Unsupported = 1
+
+// Vector insert/extract operations.
+def : WriteRes<WriteVecInsert, [HWPort5]> {
let Latency = 2;
+ let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def : WriteRes<WriteVarBlendLd, [HWPort5, HWPort23]> {
+def : WriteRes<WriteVecInsertLd, [HWPort5,HWPort23]> {
let Latency = 6;
- let ResourceCycles = [2, 1];
+ let NumMicroOps = 2;
}
+def: InstRW<[WriteVecInsertLd], (instregex "(V?)MOV(H|L)(PD|PS)rm")>;
-def : WriteRes<WriteVarVecShift, [HWPort0, HWPort5]> {
+def : WriteRes<WriteVecExtract, [HWPort0,HWPort5]> {
let Latency = 2;
- let ResourceCycles = [2, 1];
-}
-def : WriteRes<WriteVarVecShiftLd, [HWPort0, HWPort5, HWPort23]> {
- let Latency = 6;
- let ResourceCycles = [2, 1, 1];
-}
-
-def : WriteRes<WriteMPSAD, [HWPort0, HWPort5]> {
- let Latency = 6;
- let ResourceCycles = [1, 2];
+ let NumMicroOps = 2;
}
-def : WriteRes<WriteMPSADLd, [HWPort23, HWPort0, HWPort5]> {
- let Latency = 6;
- let ResourceCycles = [1, 1, 2];
+def : WriteRes<WriteVecExtractSt, [HWPort4,HWPort5,HWPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
}
// String instructions.
+
// Packed Compare Implicit Length Strings, Return Mask
def : WriteRes<WritePCmpIStrM, [HWPort0]> {
- let Latency = 10;
+ let Latency = 11;
+ let NumMicroOps = 3;
let ResourceCycles = [3];
}
def : WriteRes<WritePCmpIStrMLd, [HWPort0, HWPort23]> {
- let Latency = 10;
- let ResourceCycles = [3, 1];
+ let Latency = 17;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
}
// Packed Compare Explicit Length Strings, Return Mask
-def : WriteRes<WritePCmpEStrM, [HWPort0, HWPort16, HWPort5]> {
- let Latency = 10;
- let ResourceCycles = [3, 2, 4];
+def : WriteRes<WritePCmpEStrM, [HWPort0, HWPort5, HWPort015, HWPort0156]> {
+ let Latency = 19;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
}
-def : WriteRes<WritePCmpEStrMLd, [HWPort05, HWPort16, HWPort23]> {
- let Latency = 10;
- let ResourceCycles = [6, 2, 1];
+def : WriteRes<WritePCmpEStrMLd, [HWPort0, HWPort5, HWPort23, HWPort015, HWPort0156]> {
+ let Latency = 25;
+ let NumMicroOps = 10;
+ let ResourceCycles = [4,3,1,1,1];
}
// Packed Compare Implicit Length Strings, Return Index
def : WriteRes<WritePCmpIStrI, [HWPort0]> {
let Latency = 11;
+ let NumMicroOps = 3;
let ResourceCycles = [3];
}
def : WriteRes<WritePCmpIStrILd, [HWPort0, HWPort23]> {
- let Latency = 11;
- let ResourceCycles = [3, 1];
+ let Latency = 17;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
}
// Packed Compare Explicit Length Strings, Return Index
-def : WriteRes<WritePCmpEStrI, [HWPort05, HWPort16]> {
- let Latency = 11;
- let ResourceCycles = [6, 2];
+def : WriteRes<WritePCmpEStrI, [HWPort0, HWPort5, HWPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 8;
+ let ResourceCycles = [4,3,1];
}
-def : WriteRes<WritePCmpEStrILd, [HWPort0, HWPort16, HWPort5, HWPort23]> {
- let Latency = 11;
- let ResourceCycles = [3, 2, 2, 1];
+def : WriteRes<WritePCmpEStrILd, [HWPort0, HWPort5, HWPort23, HWPort0156]> {
+ let Latency = 24;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
}
+// MOVMSK Instructions.
+def : WriteRes<WriteFMOVMSK, [HWPort0]> { let Latency = 3; }
+def : WriteRes<WriteVecMOVMSK, [HWPort0]> { let Latency = 3; }
+def : WriteRes<WriteVecMOVMSKY, [HWPort0]> { let Latency = 3; }
+def : WriteRes<WriteMMXMOVMSK, [HWPort0]> { let Latency = 1; }
+
// AES Instructions.
def : WriteRes<WriteAESDecEnc, [HWPort5]> {
let Latency = 7;
+ let NumMicroOps = 1;
let ResourceCycles = [1];
}
def : WriteRes<WriteAESDecEncLd, [HWPort5, HWPort23]> {
- let Latency = 7;
- let ResourceCycles = [1, 1];
+ let Latency = 13;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
}
def : WriteRes<WriteAESIMC, [HWPort5]> {
let Latency = 14;
+ let NumMicroOps = 2;
let ResourceCycles = [2];
}
def : WriteRes<WriteAESIMCLd, [HWPort5, HWPort23]> {
- let Latency = 14;
- let ResourceCycles = [2, 1];
+ let Latency = 20;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
}
-def : WriteRes<WriteAESKeyGen, [HWPort0, HWPort5]> {
- let Latency = 10;
- let ResourceCycles = [2, 8];
+def : WriteRes<WriteAESKeyGen, [HWPort0,HWPort5,HWPort015]> {
+ let Latency = 29;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,7,2];
}
-def : WriteRes<WriteAESKeyGenLd, [HWPort0, HWPort5, HWPort23]> {
- let Latency = 10;
- let ResourceCycles = [2, 7, 1];
+def : WriteRes<WriteAESKeyGenLd, [HWPort0,HWPort5,HWPort23,HWPort015]> {
+ let Latency = 34;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,7,1,1];
}
// Carry-less multiplication instructions.
def : WriteRes<WriteCLMul, [HWPort0, HWPort5]> {
- let Latency = 7;
- let ResourceCycles = [2, 1];
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
}
def : WriteRes<WriteCLMulLd, [HWPort0, HWPort5, HWPort23]> {
- let Latency = 7;
- let ResourceCycles = [2, 1, 1];
+ let Latency = 17;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
}
+// Load/store MXCSR.
+def : WriteRes<WriteLDMXCSR, [HWPort0,HWPort23,HWPort0156]> { let Latency = 7; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
+def : WriteRes<WriteSTMXCSR, [HWPort4,HWPort5,HWPort237]> { let Latency = 2; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
+
def : WriteRes<WriteSystem, [HWPort0156]> { let Latency = 100; }
def : WriteRes<WriteMicrocoded, [HWPort0156]> { let Latency = 100; }
def : WriteRes<WriteFence, [HWPort23, HWPort4]>;
@@ -273,149 +542,34 @@ def : WriteRes<WriteNop, []>;
//-- Specific Scheduling Models --//
// Starting with P0.
-def WriteP0 : SchedWriteRes<[HWPort0]>;
-
-def WriteP0_P1_Lat4 : SchedWriteRes<[HWPort0, HWPort1]> {
- let Latency = 4;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
-}
-
-def WriteP0_P1_Lat4Ld : SchedWriteRes<[HWPort0, HWPort1, HWPort23]> {
- let Latency = 8;
- let NumMicroOps = 3;
- let ResourceCycles = [1, 1, 1];
-}
+def HWWriteP0 : SchedWriteRes<[HWPort0]>;
-def WriteP01 : SchedWriteRes<[HWPort01]>;
+def HWWriteP01 : SchedWriteRes<[HWPort01]>;
-def Write2P01 : SchedWriteRes<[HWPort01]> {
+def HWWrite2P01 : SchedWriteRes<[HWPort01]> {
let NumMicroOps = 2;
}
-def Write3P01 : SchedWriteRes<[HWPort01]> {
+def HWWrite3P01 : SchedWriteRes<[HWPort01]> {
let NumMicroOps = 3;
}
-def WriteP015 : SchedWriteRes<[HWPort015]>;
-
-def WriteP01_P5 : SchedWriteRes<[HWPort01, HWPort5]> {
- let NumMicroOps = 2;
-}
-def WriteP06 : SchedWriteRes<[HWPort06]>;
-
-def Write2P06 : SchedWriteRes<[HWPort06]> {
- let Latency = 1;
+def HWWriteP0156_P23 : SchedWriteRes<[HWPort0156, HWPort23]> {
let NumMicroOps = 2;
- let ResourceCycles = [2];
}
-def Write3P06_Lat2 : SchedWriteRes<[HWPort06]> {
- let Latency = 2;
- let NumMicroOps = 3;
- let ResourceCycles = [3];
-}
-
-def WriteP0156_P23 : SchedWriteRes<[HWPort0156, HWPort23]> {
- let NumMicroOps = 2;
-}
-
-def Write2P0156_P23 : SchedWriteRes<[HWPort0156, HWPort23]> {
+def HWWrite2P0156_P23 : SchedWriteRes<[HWPort0156, HWPort23]> {
let NumMicroOps = 3;
let ResourceCycles = [2, 1];
}
-def Write2P0156_Lat2 : SchedWriteRes<[HWPort0156]> {
- let Latency = 2;
- let ResourceCycles = [2];
-}
-def Write2P0156_Lat2Ld : SchedWriteRes<[HWPort0156, HWPort23]> {
- let Latency = 6;
- let ResourceCycles = [2, 1];
-}
-
-def Write5P0156 : SchedWriteRes<[HWPort0156]> {
- let NumMicroOps = 5;
- let ResourceCycles = [5];
-}
-
-def WriteP0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> {
- let Latency = 1;
- let ResourceCycles = [1, 2, 1];
-}
-
-def Write2P0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> {
- let Latency = 1;
- let ResourceCycles = [2, 2, 1];
-}
-
-def Write3P0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> {
- let Latency = 1;
- let ResourceCycles = [3, 2, 1];
-}
-
// Starting with P1.
-def WriteP1 : SchedWriteRes<[HWPort1]>;
+def HWWriteP1 : SchedWriteRes<[HWPort1]>;
-def WriteP1_P23 : SchedWriteRes<[HWPort1, HWPort23]> {
- let NumMicroOps = 2;
-}
-def WriteP1_Lat3 : SchedWriteRes<[HWPort1]> {
- let Latency = 3;
-}
-def WriteP1_Lat3Ld : SchedWriteRes<[HWPort1, HWPort23]> {
- let Latency = 7;
-}
-def Write2P1 : SchedWriteRes<[HWPort1]> {
+def HWWrite2P1 : SchedWriteRes<[HWPort1]> {
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def Write2P1_P23 : SchedWriteRes<[HWPort1, HWPort23]> {
- let NumMicroOps = 3;
- let ResourceCycles = [2, 1];
-}
-def WriteP15 : SchedWriteRes<[HWPort15]>;
-def WriteP15Ld : SchedWriteRes<[HWPort15, HWPort23]> {
- let Latency = 4;
-}
-
-def WriteP1_P5_Lat4 : SchedWriteRes<[HWPort1, HWPort5]> {
- let Latency = 4;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
-}
-
-def WriteP1_P5_Lat4Ld : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
- let Latency = 8;
- let NumMicroOps = 3;
- let ResourceCycles = [1, 1, 1];
-}
-
-def WriteP1_P5_Lat6 : SchedWriteRes<[HWPort1, HWPort5]> {
- let Latency = 6;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
-}
-
-def WriteP1_P5_Lat6Ld : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
- let Latency = 10;
- let NumMicroOps = 3;
- let ResourceCycles = [1, 1, 1];
-}
-
-// Starting with P2.
-def Write2P237_P4 : SchedWriteRes<[HWPort237, HWPort4]> {
- let Latency = 1;
- let ResourceCycles = [2, 1];
-}
-
-// Starting with P5.
-def WriteP5 : SchedWriteRes<[HWPort5]>;
-def WriteP5Ld : SchedWriteRes<[HWPort5, HWPort23]> {
- let Latency = 5;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
-}
// Notation:
// - r: register.
@@ -429,284 +583,215 @@ def WriteP5Ld : SchedWriteRes<[HWPort5, HWPort23]> {
//=== Integer Instructions ===//
//-- Move instructions --//
-// MOV.
-// r16,m.
-def : InstRW<[WriteALULd], (instregex "MOV16rm")>;
-
-// MOVSX, MOVZX.
-// r,m.
-def : InstRW<[WriteLoad], (instregex "MOV(S|Z)X32rm8")>;
-
// XLAT.
-def WriteXLAT : SchedWriteRes<[]> {
+def HWWriteXLAT : SchedWriteRes<[]> {
let Latency = 7;
let NumMicroOps = 3;
}
-def : InstRW<[WriteXLAT], (instregex "XLAT")>;
-
-// PUSH.
-// m.
-def : InstRW<[Write2P237_P4], (instregex "PUSH(16|32)rmm")>;
+def : InstRW<[HWWriteXLAT], (instrs XLAT)>;
// PUSHA.
-def WritePushA : SchedWriteRes<[]> {
+def HWWritePushA : SchedWriteRes<[]> {
let NumMicroOps = 19;
}
-def : InstRW<[WritePushA], (instregex "PUSHA(16|32)")>;
-
-// POP.
-// m.
-def : InstRW<[Write2P237_P4], (instregex "POP(16|32)rmm")>;
+def : InstRW<[HWWritePushA], (instregex "PUSHA(16|32)")>;
// POPA.
-def WritePopA : SchedWriteRes<[]> {
+def HWWritePopA : SchedWriteRes<[]> {
let NumMicroOps = 18;
}
-def : InstRW<[WritePopA], (instregex "POPA(16|32)")>;
+def : InstRW<[HWWritePopA], (instregex "POPA(16|32)")>;
//-- Arithmetic instructions --//
// DIV.
// r8.
-def WriteDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+def HWWriteDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
let Latency = 22;
let NumMicroOps = 9;
}
-def : InstRW<[WriteDiv8], (instregex "DIV8r")>;
+def : InstRW<[HWWriteDiv8], (instregex "DIV8r")>;
// IDIV.
// r8.
-def WriteIDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+def HWWriteIDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
let Latency = 23;
let NumMicroOps = 9;
}
-def : InstRW<[WriteIDiv8], (instregex "IDIV8r")>;
+def : InstRW<[HWWriteIDiv8], (instregex "IDIV8r")>;
// BT.
// m,r.
-def WriteBTmr : SchedWriteRes<[]> {
+def HWWriteBTmr : SchedWriteRes<[]> {
let NumMicroOps = 10;
}
-def : InstRW<[WriteBTmr], (instregex "BT(16|32|64)mr")>;
+def : InstRW<[HWWriteBTmr], (instregex "BT(16|32|64)mr")>;
// BTR BTS BTC.
// m,r.
-def WriteBTRSCmr : SchedWriteRes<[]> {
+def HWWriteBTRSCmr : SchedWriteRes<[]> {
let NumMicroOps = 11;
}
-def : InstRW<[WriteBTRSCmr], (instregex "BT(R|S|C)(16|32|64)mr")>;
+def : InstRW<[HWWriteBTRSCmr], (instregex "BT(R|S|C)(16|32|64)mr")>;
//-- Control transfer instructions --//
// CALL.
// i.
-def WriteRETI : SchedWriteRes<[HWPort23, HWPort6, HWPort015]> {
+def HWWriteRETI : SchedWriteRes<[HWPort23, HWPort6, HWPort015]> {
let NumMicroOps = 4;
let ResourceCycles = [1, 2, 1];
}
-def : InstRW<[WriteRETI], (instregex "RETI(L|Q|W)", "LRETI(L|Q|W)")>;
+def : InstRW<[HWWriteRETI], (instregex "RETI(L|Q|W)", "LRETI(L|Q|W)")>;
// BOUND.
// r,m.
-def WriteBOUND : SchedWriteRes<[]> {
+def HWWriteBOUND : SchedWriteRes<[]> {
let NumMicroOps = 15;
}
-def : InstRW<[WriteBOUND], (instregex "BOUNDS(16|32)rm")>;
+def : InstRW<[HWWriteBOUND], (instregex "BOUNDS(16|32)rm")>;
// INTO.
-def WriteINTO : SchedWriteRes<[]> {
+def HWWriteINTO : SchedWriteRes<[]> {
let NumMicroOps = 4;
}
-def : InstRW<[WriteINTO], (instregex "INTO")>;
+def : InstRW<[HWWriteINTO], (instrs INTO)>;
//-- String instructions --//
// LODSB/W.
-def : InstRW<[Write2P0156_P23], (instregex "LODS(B|W)")>;
+def : InstRW<[HWWrite2P0156_P23], (instregex "LODS(B|W)")>;
// LODSD/Q.
-def : InstRW<[WriteP0156_P23], (instregex "LODS(L|Q)")>;
+def : InstRW<[HWWriteP0156_P23], (instregex "LODS(L|Q)")>;
// MOVS.
-def WriteMOVS : SchedWriteRes<[HWPort23, HWPort4, HWPort0156]> {
+def HWWriteMOVS : SchedWriteRes<[HWPort23, HWPort4, HWPort0156]> {
let Latency = 4;
let NumMicroOps = 5;
let ResourceCycles = [2, 1, 2];
}
-def : InstRW<[WriteMOVS], (instregex "MOVS(B|L|Q|W)")>;
+def : InstRW<[HWWriteMOVS], (instrs MOVSB, MOVSL, MOVSQ, MOVSW)>;
// CMPS.
-def WriteCMPS : SchedWriteRes<[HWPort23, HWPort0156]> {
+def HWWriteCMPS : SchedWriteRes<[HWPort23, HWPort0156]> {
let Latency = 4;
let NumMicroOps = 5;
let ResourceCycles = [2, 3];
}
-def : InstRW<[WriteCMPS], (instregex "CMPS(B|L|Q|W)")>;
+def : InstRW<[HWWriteCMPS], (instregex "CMPS(B|L|Q|W)")>;
//-- Other --//
// RDPMC.
-def WriteRDPMC : SchedWriteRes<[]> {
+def HWWriteRDPMC : SchedWriteRes<[]> {
let NumMicroOps = 34;
}
-def : InstRW<[WriteRDPMC], (instregex "RDPMC")>;
+def : InstRW<[HWWriteRDPMC], (instrs RDPMC)>;
// RDRAND.
-def WriteRDRAND : SchedWriteRes<[HWPort23, HWPort015]> {
+def HWWriteRDRAND : SchedWriteRes<[HWPort23, HWPort015]> {
let NumMicroOps = 17;
let ResourceCycles = [1, 16];
}
-def : InstRW<[WriteRDRAND], (instregex "RDRAND(16|32|64)r")>;
+def : InstRW<[HWWriteRDRAND], (instregex "RDRAND(16|32|64)r")>;
//=== Floating Point x87 Instructions ===//
//-- Move instructions --//
// FLD.
// m80.
-def : InstRW<[WriteP01], (instregex "LD_Frr")>;
+def : InstRW<[HWWriteP01], (instregex "LD_Frr")>;
// FBLD.
// m80.
-def WriteFBLD : SchedWriteRes<[]> {
+def HWWriteFBLD : SchedWriteRes<[]> {
let Latency = 47;
let NumMicroOps = 43;
}
-def : InstRW<[WriteFBLD], (instregex "FBLDm")>;
+def : InstRW<[HWWriteFBLD], (instregex "FBLDm")>;
// FST(P).
// r.
-def : InstRW<[WriteP01], (instregex "ST_(F|FP)rr")>;
-
-// FLDZ.
-def : InstRW<[WriteP01], (instregex "LD_F0")>;
-
-// FLDPI FLDL2E etc.
-def : InstRW<[Write2P01], (instregex "FLDPI", "FLDL2(T|E)", "FLDL(G|N)2")>;
+def : InstRW<[HWWriteP01], (instregex "ST_(F|FP)rr")>;
// FFREE.
-def : InstRW<[WriteP01], (instregex "FFREE")>;
+def : InstRW<[HWWriteP01], (instregex "FFREE")>;
// FNSAVE.
-def WriteFNSAVE : SchedWriteRes<[]> {
+def HWWriteFNSAVE : SchedWriteRes<[]> {
let NumMicroOps = 147;
}
-def : InstRW<[WriteFNSAVE], (instregex "FSAVEm")>;
+def : InstRW<[HWWriteFNSAVE], (instregex "FSAVEm")>;
// FRSTOR.
-def WriteFRSTOR : SchedWriteRes<[]> {
+def HWWriteFRSTOR : SchedWriteRes<[]> {
let NumMicroOps = 90;
}
-def : InstRW<[WriteFRSTOR], (instregex "FRSTORm")>;
+def : InstRW<[HWWriteFRSTOR], (instregex "FRSTORm")>;
//-- Arithmetic instructions --//
-// FABS.
-def : InstRW<[WriteP0], (instregex "ABS_F")>;
-
-// FCHS.
-def : InstRW<[WriteP0], (instregex "CHS_F")>;
-
// FCOMPP FUCOMPP.
// r.
-def : InstRW<[Write2P01], (instregex "FCOMPP", "UCOM_FPPr")>;
+def : InstRW<[HWWrite2P01], (instrs FCOMPP, UCOM_FPPr)>;
// FCOMI(P) FUCOMI(P).
// m.
-def : InstRW<[Write3P01], (instregex "COM_FIr", "COM_FIPr", "UCOM_FIr",
- "UCOM_FIPr")>;
+def : InstRW<[HWWrite3P01], (instrs COM_FIPr, COM_FIr, UCOM_FIPr, UCOM_FIr)>;
// FTST.
-def : InstRW<[WriteP1], (instregex "TST_F")>;
+def : InstRW<[HWWriteP1], (instregex "TST_F")>;
// FXAM.
-def : InstRW<[Write2P1], (instregex "FXAM")>;
+def : InstRW<[HWWrite2P1], (instrs FXAM)>;
// FPREM.
-def WriteFPREM : SchedWriteRes<[]> {
+def HWWriteFPREM : SchedWriteRes<[]> {
let Latency = 19;
let NumMicroOps = 28;
}
-def : InstRW<[WriteFPREM], (instregex "FPREM")>;
+def : InstRW<[HWWriteFPREM], (instrs FPREM)>;
// FPREM1.
-def WriteFPREM1 : SchedWriteRes<[]> {
+def HWWriteFPREM1 : SchedWriteRes<[]> {
let Latency = 27;
let NumMicroOps = 41;
}
-def : InstRW<[WriteFPREM1], (instregex "FPREM1")>;
+def : InstRW<[HWWriteFPREM1], (instrs FPREM1)>;
// FRNDINT.
-def WriteFRNDINT : SchedWriteRes<[]> {
+def HWWriteFRNDINT : SchedWriteRes<[]> {
let Latency = 11;
let NumMicroOps = 17;
}
-def : InstRW<[WriteFRNDINT], (instregex "FRNDINT")>;
+def : InstRW<[HWWriteFRNDINT], (instrs FRNDINT)>;
//-- Math instructions --//
// FSCALE.
-def WriteFSCALE : SchedWriteRes<[]> {
+def HWWriteFSCALE : SchedWriteRes<[]> {
let Latency = 75; // 49-125
let NumMicroOps = 50; // 25-75
}
-def : InstRW<[WriteFSCALE], (instregex "FSCALE")>;
+def : InstRW<[HWWriteFSCALE], (instrs FSCALE)>;
// FXTRACT.
-def WriteFXTRACT : SchedWriteRes<[]> {
+def HWWriteFXTRACT : SchedWriteRes<[]> {
let Latency = 15;
let NumMicroOps = 17;
}
-def : InstRW<[WriteFXTRACT], (instregex "FXTRACT")>;
-
-//-- Other instructions --//
-
-// FNOP.
-def : InstRW<[WriteP01], (instregex "FNOP")>;
-
-// WAIT.
-def : InstRW<[Write2P01], (instregex "WAIT")>;
-
-// FNCLEX.
-def : InstRW<[Write5P0156], (instregex "FNCLEX")>;
-
-// FNINIT.
-def WriteFNINIT : SchedWriteRes<[]> {
- let NumMicroOps = 26;
-}
-def : InstRW<[WriteFNINIT], (instregex "FNINIT")>;
+def : InstRW<[HWWriteFXTRACT], (instrs FXTRACT)>;
////////////////////////////////////////////////////////////////////////////////
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////
-// HADD, HSUB PS/PD
-// x,x / v,v,v.
-def : WriteRes<WriteFHAdd, [HWPort1, HWPort5]> {
- let Latency = 5;
- let NumMicroOps = 3;
- let ResourceCycles = [1, 2];
-}
-
-// x,m / v,v,m.
-def : WriteRes<WriteFHAddLd, [HWPort1, HWPort5, HWPort23]> {
- let Latency = 9;
- let NumMicroOps = 4;
- let ResourceCycles = [1, 2, 1];
-}
-
-// PHADD|PHSUB (S) W/D.
-// v <- v,v.
-def : WriteRes<WritePHAdd, [HWPort1, HWPort5]> {
- let Latency = 3;
- let NumMicroOps = 3;
- let ResourceCycles = [1, 2];
-}
-// v <- v,m.
-def : WriteRes<WritePHAddLd, [HWPort1, HWPort5, HWPort23]> {
- let Latency = 6;
- let NumMicroOps = 3;
- let ResourceCycles = [1, 2, 1];
-}
+defm : HWWriteResPair<WriteFHAdd, [HWPort1, HWPort5], 5, [1,2], 3, 6>;
+defm : HWWriteResPair<WriteFHAddY, [HWPort1, HWPort5], 5, [1,2], 3, 7>;
+defm : HWWriteResPair<WritePHAdd, [HWPort5, HWPort15], 3, [2,1], 3, 5>;
+defm : HWWriteResPair<WritePHAddX, [HWPort5, HWPort15], 3, [2,1], 3, 6>;
+defm : HWWriteResPair<WritePHAddY, [HWPort5, HWPort15], 3, [2,1], 3, 7>;
//=== Floating Point XMM and YMM Instructions ===//
@@ -717,429 +802,69 @@ def HWWriteResGroup0 : SchedWriteRes<[HWPort23]> {
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup0], (instregex "LDDQUrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "MOVAPDrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "MOVAPSrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "MOVDQArm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "MOVDQUrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "MOVNTDQArm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "MOVSHDUPrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "MOVSLDUPrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "MOVUPDrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "MOVUPSrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VBROADCASTSSrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VLDDQUrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VMOVAPDrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VMOVAPSrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VMOVDQArm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VMOVDQUrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VMOVNTDQArm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VMOVSHDUPrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VMOVSLDUPrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VMOVUPDrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VMOVUPSrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VPBROADCASTDrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VPBROADCASTQrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "ROUNDPDr")>;
-def: InstRW<[HWWriteResGroup0], (instregex "ROUNDPSr")>;
-def: InstRW<[HWWriteResGroup0], (instregex "ROUNDSDr")>;
-def: InstRW<[HWWriteResGroup0], (instregex "ROUNDSSr")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VROUNDPDr")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VROUNDPSr")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VROUNDSDr")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VROUNDSSr")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VROUNDYPDr")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VROUNDYPSr")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VBROADCASTSSrm",
+ "(V?)MOVSHDUPrm",
+ "(V?)MOVSLDUPrm",
+ "VPBROADCAST(D|Q)rm")>;
def HWWriteResGroup0_1 : SchedWriteRes<[HWPort23]> {
let Latency = 7;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup0_1], (instregex "LD_F32m")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "LD_F64m")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "LD_F80m")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VBROADCASTF128")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VBROADCASTI128")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VBROADCASTSDYrm")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VBROADCASTSSYrm")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VLDDQUYrm")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVAPDYrm")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVAPSYrm")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVDDUPYrm")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVDQAYrm")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVDQUYrm")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVNTDQAYrm")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVSHDUPYrm")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVSLDUPYrm")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVUPDYrm")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVUPSYrm")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VPBROADCASTDYrm")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VPBROADCASTQYrm")>;
+def: InstRW<[HWWriteResGroup0_1], (instregex "LD_F(32|64|80)m",
+ "VBROADCASTF128",
+ "VBROADCASTI128",
+ "VBROADCASTSDYrm",
+ "VBROADCASTSSYrm",
+ "VMOVDDUPYrm",
+ "VMOVSHDUPYrm",
+ "VMOVSLDUPYrm",
+ "VPBROADCAST(D|Q)Yrm")>;
def HWWriteResGroup0_2 : SchedWriteRes<[HWPort23]> {
let Latency = 5;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup0_2], (instregex "MMX_MOVD64from64rm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MMX_MOVD64rm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MMX_MOVD64to64rm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MMX_MOVQ64rm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MOV(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MOV64toPQIrm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MOV8rm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MOVDDUPrm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MOVDI2PDIrm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MOVQI2PQIrm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MOVSDrm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MOVSSrm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MOVSX(16|32|64)rm16")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MOVSX(16|32|64)rm32")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MOVSX(16|32|64)rm8")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MOVZX(16|32|64)rm16")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MOVZX(16|32|64)rm8")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "PREFETCHNTA")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "PREFETCHT0")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "PREFETCHT1")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "PREFETCHT2")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "VMOV64toPQIrm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "VMOVDDUPrm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "VMOVDI2PDIrm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "VMOVQI2PQIrm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "VMOVSDrm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "VMOVSSrm")>;
+def: InstRW<[HWWriteResGroup0_2], (instregex "MOVSX(16|32|64)rm16",
+ "MOVSX(16|32|64)rm32",
+ "MOVSX(16|32|64)rm8",
+ "MOVZX(16|32|64)rm16",
+ "MOVZX(16|32|64)rm8",
+ "(V?)MOVDDUPrm")>;
def HWWriteResGroup1 : SchedWriteRes<[HWPort4,HWPort237]> {
let Latency = 1;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup1], (instregex "FBSTPm")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MMX_MOVD64from64rm")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MMX_MOVD64mr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MMX_MOVNTQmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MMX_MOVQ64mr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOV(16|32|64)mr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOV8mi")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOV8mr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVAPDmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVAPSmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVDQAmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVDQUmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVHPDmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVHPSmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVLPDmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVLPSmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVNTDQmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVNTI_64mr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVNTImr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVNTPDmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVNTPSmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVPDI2DImr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVPQI2QImr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVPQIto64mr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVSDmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVSSmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVUPDmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVUPSmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "ST_FP32m")>;
-def: InstRW<[HWWriteResGroup1], (instregex "ST_FP64m")>;
-def: InstRW<[HWWriteResGroup1], (instregex "ST_FP80m")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VEXTRACTF128mr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VEXTRACTI128mr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVAPDYmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVAPDmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVAPSYmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVAPSmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVDQAYmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVDQAmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVDQUYmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVDQUmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVHPDmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVHPSmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVLPDmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVLPSmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTDQYmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTDQmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTPDYmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTPDmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTPSYmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTPSmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVPDI2DImr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVPQI2QImr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVPQIto64mr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVSDmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVSSmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVUPDYmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVUPDmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVUPSYmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVUPSmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMPTRSTm")>;
+def: InstRW<[HWWriteResGroup1], (instregex "FBSTPm",
+ "ST_FP(32|64|80)m",
+ "VMPTRSTm")>;
def HWWriteResGroup2 : SchedWriteRes<[HWPort0]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_MOVD64from64rr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_MOVD64grr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PMOVMSKBrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLDri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLDrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLQri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLQrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLWri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLWrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRADri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRADrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRAWri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRAWrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLDri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLDrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLQri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLQrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLWri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLWrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MOVPDI2DIrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MOVPQIto64rr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "PSLLDri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "PSLLQri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "PSLLWri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "PSRADri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "PSRAWri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "PSRLDri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "PSRLQri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "PSRLWri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VMOVPDI2DIrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VMOVPQIto64rr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSLLDYri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSLLDri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSLLQYri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSLLQri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSLLVQYrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSLLVQrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSLLWYri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSLLWri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSRADYri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSRADri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSRAWYri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSRAWri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSRLDYri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSRLDri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSRLQYri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSRLQri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSRLVQYrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSRLVQrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSRLWYri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSRLWri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VTESTPDYrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VTESTPDrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VTESTPSYrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VTESTPSrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLVQ(Y?)rr",
+ "VPSRLVQ(Y?)rr")>;
def HWWriteResGroup3 : SchedWriteRes<[HWPort1]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup3], (instregex "COMP_FST0r")>;
-def: InstRW<[HWWriteResGroup3], (instregex "COM_FST0r")>;
-def: InstRW<[HWWriteResGroup3], (instregex "MMX_MASKMOVQ64")>;
-def: InstRW<[HWWriteResGroup3], (instregex "MMX_MASKMOVQ64")>;
-def: InstRW<[HWWriteResGroup3], (instregex "UCOM_FPr")>;
-def: InstRW<[HWWriteResGroup3], (instregex "UCOM_Fr")>;
-def: InstRW<[HWWriteResGroup3], (instregex "VMASKMOVDQU")>;
+def: InstRW<[HWWriteResGroup3], (instregex "COM(P?)_FST0r",
+ "UCOM_F(P?)r")>;
def HWWriteResGroup4 : SchedWriteRes<[HWPort5]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup4], (instregex "ANDNPDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "ANDNPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "ANDPDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "ANDPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "INSERTPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MMX_MOVD64rr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MMX_MOVD64to64rr")>;
def: InstRW<[HWWriteResGroup4], (instregex "MMX_MOVQ2DQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MMX_PALIGNR64irr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MMX_PSHUFBrr64")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MMX_PSHUFWri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKHBWirr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKHDQirr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKHWDirr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKLBWirr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKLDQirr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKLWDirr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MOV64toPQIrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MOVAPDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MOVAPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MOVDDUPrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MOVDI2PDIrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MOVHLPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MOVLHPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MOVSDrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MOVSHDUPrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MOVSLDUPrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MOVSSrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MOVUPDrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MOVUPSrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup4], (instregex "ORPDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "ORPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PACKSSDWrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PACKSSWBrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PACKUSDWrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PACKUSWBrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PALIGNRrri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PBLENDWrri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXBDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXBQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXBWrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXDQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXWDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXWQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXBDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXBQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXBWrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXDQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXWDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXWQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PSHUFBrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PSHUFDri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PSHUFHWri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PSHUFLWri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PSLLDQri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PSRLDQri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKHBWrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKHDQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKHQDQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKHWDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKLBWrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKLDQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKLQDQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKLWDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "SHUFPDrri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "SHUFPSrri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "UNPCKHPDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "UNPCKHPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "UNPCKLPDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "UNPCKLPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VANDNPDYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VANDNPDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VANDNPSYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VANDNPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VANDPDYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VANDPDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VANDPSYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VANDPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VBROADCASTSSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VINSERTPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOV64toPQIrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPDYrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPDrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPSYrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPSrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVDDUPYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVDDUPrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVDI2PDIrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVHLPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVLHPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVSDrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVSHDUPYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVSHDUPrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVSLDUPYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVSLDUPrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVSSrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPDYrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPDrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPSYrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPSrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VORPDYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VORPDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VORPSYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VORPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPACKSSDWYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPACKSSDWrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPACKSSWBYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPACKSSWBrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPACKUSDWYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPACKUSDWrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPACKUSWBYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPACKUSWBrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPALIGNRYrri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPALIGNRrri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPBLENDWYrri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPBLENDWrri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPBROADCASTDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPBROADCASTQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPDYri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPDYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPDri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPSYri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPSYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPSri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXBDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXBQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXBWrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXDQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXWDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXWQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXBDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXBQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXBWrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXDQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXWDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXWQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFBYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFBrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFDYri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFDri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFHWYri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFHWri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFLWYri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFLWri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPSLLDQYri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPSLLDQri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPSRLDQYri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPSRLDQri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHBWYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHBWrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHDQYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHDQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHQDQYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHQDQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHWDYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHWDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLBWYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLBWrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLDQYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLDQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLQDQYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLQDQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLWDYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLWDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VSHUFPDYrri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VSHUFPDrri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VSHUFPSYrri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VSHUFPSrri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKHPDYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKHPDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKHPSYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKHPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKLPDYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKLPDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKLPSYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKLPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VXORPDYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VXORPDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VXORPSYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VXORPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "XORPDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "XORPSrr")>;
def HWWriteResGroup5 : SchedWriteRes<[HWPort6]> {
let Latency = 1;
@@ -1153,661 +878,128 @@ def HWWriteResGroup6 : SchedWriteRes<[HWPort01]> {
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup6], (instregex "FINCSTP")>;
-def: InstRW<[HWWriteResGroup6], (instregex "FNOP")>;
+def: InstRW<[HWWriteResGroup6], (instrs FINCSTP, FNOP)>;
def HWWriteResGroup7 : SchedWriteRes<[HWPort06]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup7], (instregex "BT(16|32|64)ri8")>;
-def: InstRW<[HWWriteResGroup7], (instregex "BT(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "BTC(16|32|64)ri8")>;
-def: InstRW<[HWWriteResGroup7], (instregex "BTC(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "BTR(16|32|64)ri8")>;
-def: InstRW<[HWWriteResGroup7], (instregex "BTR(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "BTS(16|32|64)ri8")>;
-def: InstRW<[HWWriteResGroup7], (instregex "BTS(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "CDQ")>;
-def: InstRW<[HWWriteResGroup7], (instregex "CQO")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JAE_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JAE_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JA_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JA_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JBE_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JBE_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JB_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JB_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JE_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JE_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JGE_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JGE_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JG_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JG_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JLE_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JLE_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JL_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JL_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JMP_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JMP_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JNE_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JNE_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JNO_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JNO_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JNP_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JNP_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JNS_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JNS_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JO_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JO_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JP_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JP_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JS_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JS_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "RORX(32|64)ri")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SAR(16|32|64)r1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SAR(16|32|64)ri")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SAR8r1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SAR8ri")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SARX(32|64)rr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SETAEr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SETBr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SETEr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SETGEr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SETGr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SETLEr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SETLr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SETNEr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SETNOr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SETNPr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SETNSr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SETOr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SETPr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SETSr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SHL(16|32|64)r1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SHL(16|32|64)ri")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SHL8r1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SHL8ri")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SHLX(32|64)rr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SHR(16|32|64)r1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SHR(16|32|64)ri")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SHR8r1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SHR8ri")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SHRX(32|64)rr")>;
+def: InstRW<[HWWriteResGroup7], (instrs CDQ, CQO)>;
+def: InstRW<[HWWriteResGroup7], (instregex "BT(16|32|64)ri8",
+ "BT(16|32|64)rr",
+ "BTC(16|32|64)ri8",
+ "BTC(16|32|64)rr",
+ "BTR(16|32|64)ri8",
+ "BTR(16|32|64)rr",
+ "BTS(16|32|64)ri8",
+ "BTS(16|32|64)rr")>;
def HWWriteResGroup8 : SchedWriteRes<[HWPort15]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup8], (instregex "ANDN(32|64)rr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "BLSI(32|64)rr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "BLSMSK(32|64)rr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "BLSR(32|64)rr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "BZHI(32|64)rr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "LEA(16|32|64)(_32)?r")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PABSBrr64")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PABSDrr64")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PABSWrr64")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDBirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDDirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDQirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDSBirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDSWirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDUSBirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDUSWirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDWirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PAVGBirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PAVGWirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPEQBirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPEQDirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPEQWirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPGTBirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPGTDirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPGTWirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PMAXSWirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PMAXUBirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PMINSWirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PMINUBirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSIGNBrr64")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSIGNDrr64")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSIGNWrr64")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBBirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBDirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBQirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBSBirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBSWirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBUSBirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBUSWirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBWirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PABSBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PABSDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PABSWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PADDBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PADDDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PADDQrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PADDSBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PADDSWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PADDUSBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PADDUSWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PADDWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PAVGBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PAVGWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PCMPEQBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PCMPEQDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PCMPEQQrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PCMPEQWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PCMPGTBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PCMPGTDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PCMPGTWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PMAXSBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PMAXSDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PMAXSWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PMAXUBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PMAXUDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PMAXUWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PMINSBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PMINSDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PMINSWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PMINUBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PMINUDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PMINUWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PSIGNBrr128")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PSIGNDrr128")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PSIGNWrr128")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PSUBBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PSUBDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PSUBQrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PSUBSBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PSUBSWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PSUBUSBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PSUBUSWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PSUBWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPABSBYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPABSBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPABSDYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPABSDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPABSWYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPABSWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDBYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDDYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDQYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDQrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDSBYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDSBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDSWYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDSWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDUSBYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDUSBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDUSWYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDUSWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDWYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPAVGBYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPAVGBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPAVGWYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPAVGWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQBYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQDYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQQYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQQrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQWYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTBYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTDYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTWYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSBYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSDYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSWYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUBYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUDYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUWYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMINSBYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMINSBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMINSDYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMINSDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMINSWYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMINSWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMINUBYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMINUBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMINUDYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMINUDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMINUWYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMINUWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNBYrr256")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNBrr128")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNDYrr256")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNDrr128")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNWYrr256")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNWrr128")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBBYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBDYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBQYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBQrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBSBYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBSBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBSWYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBSWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBUSBYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBUSBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBUSWYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBUSWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBWYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "ANDN(32|64)rr",
+ "BLSI(32|64)rr",
+ "BLSMSK(32|64)rr",
+ "BLSR(32|64)rr")>;
def HWWriteResGroup9 : SchedWriteRes<[HWPort015]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup9], (instregex "BLENDPDrri")>;
-def: InstRW<[HWWriteResGroup9], (instregex "BLENDPSrri")>;
-def: InstRW<[HWWriteResGroup9], (instregex "MMX_MOVD64from64rr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "MMX_MOVQ64rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup9], (instregex "MMX_PANDNirr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "MMX_PANDirr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "MMX_PORirr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "MMX_PXORirr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "MOVDQArr(_REV)?")>;
-def: InstRW<[HWWriteResGroup9], (instregex "MOVDQUrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup9], (instregex "MOVPQI2QIrr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "PANDNrr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "PANDrr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "PORrr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "PXORrr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VBLENDPDYrri")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VBLENDPDrri")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VBLENDPSYrri")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VBLENDPSrri")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQAYrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQArr(_REV)?")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQUYrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQUrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VMOVPQI2QIrr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VMOVZPQILo2PQIrr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VPANDNYrr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VPANDNrr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VPANDYrr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VPANDrr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VPBLENDDYrri")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VPBLENDDrri")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VPORYrr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VPORrr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VPXORYrr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VPXORrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPBLENDD(Y?)rri")>;
def HWWriteResGroup10 : SchedWriteRes<[HWPort0156]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup10], (instregex "ADD(16|32|64)ri")>;
-def: InstRW<[HWWriteResGroup10], (instregex "ADD(16|32|64)rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup10], (instregex "ADD8i8")>;
-def: InstRW<[HWWriteResGroup10], (instregex "ADD8ri")>;
-def: InstRW<[HWWriteResGroup10], (instregex "ADD8rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup10], (instregex "AND(16|32|64)ri")>;
-def: InstRW<[HWWriteResGroup10], (instregex "AND(16|32|64)rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup10], (instregex "AND8i8")>;
-def: InstRW<[HWWriteResGroup10], (instregex "AND8ri")>;
-def: InstRW<[HWWriteResGroup10], (instregex "AND8rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup10], (instregex "CBW")>;
-def: InstRW<[HWWriteResGroup10], (instregex "CLC")>;
-def: InstRW<[HWWriteResGroup10], (instregex "CMC")>;
-def: InstRW<[HWWriteResGroup10], (instregex "CMP(16|32|64)ri")>;
-def: InstRW<[HWWriteResGroup10], (instregex "CMP(16|32|64)rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup10], (instregex "CMP8i8")>;
-def: InstRW<[HWWriteResGroup10], (instregex "CMP8ri")>;
-def: InstRW<[HWWriteResGroup10], (instregex "CMP8rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup10], (instregex "CWDE")>;
-def: InstRW<[HWWriteResGroup10], (instregex "DEC(16|32|64)r")>;
-def: InstRW<[HWWriteResGroup10], (instregex "DEC8r")>;
-def: InstRW<[HWWriteResGroup10], (instregex "INC(16|32|64)r")>;
-def: InstRW<[HWWriteResGroup10], (instregex "INC8r")>;
-def: InstRW<[HWWriteResGroup10], (instregex "LAHF")>;
-def: InstRW<[HWWriteResGroup10], (instregex "MOV(16|32|64)rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup10], (instregex "MOV8ri(_alt)?")>;
-def: InstRW<[HWWriteResGroup10], (instregex "MOV8rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup10], (instregex "MOVSX(16|32|64)rr16")>;
-def: InstRW<[HWWriteResGroup10], (instregex "MOVSX(16|32|64)rr32")>;
-def: InstRW<[HWWriteResGroup10], (instregex "MOVSX(16|32|64)rr8")>;
-def: InstRW<[HWWriteResGroup10], (instregex "MOVZX(16|32|64)rr16")>;
-def: InstRW<[HWWriteResGroup10], (instregex "MOVZX(16|32|64)rr8")>;
-def: InstRW<[HWWriteResGroup10], (instregex "NEG(16|32|64)r")>;
-def: InstRW<[HWWriteResGroup10], (instregex "NEG8r")>;
-def: InstRW<[HWWriteResGroup10], (instregex "NOOP")>;
-def: InstRW<[HWWriteResGroup10], (instregex "NOT(16|32|64)r")>;
-def: InstRW<[HWWriteResGroup10], (instregex "NOT8r")>;
-def: InstRW<[HWWriteResGroup10], (instregex "OR(16|32|64)ri")>;
-def: InstRW<[HWWriteResGroup10], (instregex "OR(16|32|64)rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup10], (instregex "OR8i8")>;
-def: InstRW<[HWWriteResGroup10], (instregex "OR8ri")>;
-def: InstRW<[HWWriteResGroup10], (instregex "OR8rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup10], (instregex "SAHF")>;
-def: InstRW<[HWWriteResGroup10], (instregex "SGDT64m")>;
-def: InstRW<[HWWriteResGroup10], (instregex "SIDT64m")>;
-def: InstRW<[HWWriteResGroup10], (instregex "SLDT64m")>;
-def: InstRW<[HWWriteResGroup10], (instregex "SMSW16m")>;
-def: InstRW<[HWWriteResGroup10], (instregex "STC")>;
-def: InstRW<[HWWriteResGroup10], (instregex "STRm")>;
-def: InstRW<[HWWriteResGroup10], (instregex "SUB(16|32|64)ri")>;
-def: InstRW<[HWWriteResGroup10], (instregex "SUB(16|32|64)rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup10], (instregex "SUB8i8")>;
-def: InstRW<[HWWriteResGroup10], (instregex "SUB8ri")>;
-def: InstRW<[HWWriteResGroup10], (instregex "SUB8rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup10], (instregex "SYSCALL")>;
-def: InstRW<[HWWriteResGroup10], (instregex "TEST(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup10], (instregex "TEST8i8")>;
-def: InstRW<[HWWriteResGroup10], (instregex "TEST8ri")>;
-def: InstRW<[HWWriteResGroup10], (instregex "TEST8rr")>;
-def: InstRW<[HWWriteResGroup10], (instregex "XCHG(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup10], (instregex "XOR(16|32|64)ri")>;
-def: InstRW<[HWWriteResGroup10], (instregex "XOR(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup10], (instregex "XOR8i8")>;
-def: InstRW<[HWWriteResGroup10], (instregex "XOR8ri")>;
-def: InstRW<[HWWriteResGroup10], (instregex "XOR8rr")>;
+def: InstRW<[HWWriteResGroup10], (instrs CBW, CWDE, CDQE,
+ CMC, STC)>;
+def: InstRW<[HWWriteResGroup10], (instregex "SGDT64m",
+ "SIDT64m",
+ "SMSW16m",
+ "STRm",
+ "SYSCALL")>;
def HWWriteResGroup11 : SchedWriteRes<[HWPort0,HWPort23]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup11], (instregex "CVTPS2PDrm")>;
-def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSLLDrm")>;
-def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSLLQrm")>;
-def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSLLWrm")>;
-def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRADrm")>;
-def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRAWrm")>;
-def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRLDrm")>;
-def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRLQrm")>;
-def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRLWrm")>;
-def: InstRW<[HWWriteResGroup11], (instregex "VCVTPH2PSrm")>;
-def: InstRW<[HWWriteResGroup11], (instregex "VCVTPS2PDrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "(V?)CVTPS2PDrm")>;
def HWWriteResGroup11_1 : SchedWriteRes<[HWPort0,HWPort23]> {
let Latency = 7;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup11_1], (instregex "CVTSS2SDrm")>;
-def: InstRW<[HWWriteResGroup11_1], (instregex "VCVTPH2PSYrm")>;
-def: InstRW<[HWWriteResGroup11_1], (instregex "VCVTSS2SDrm")>;
-def: InstRW<[HWWriteResGroup11_1], (instregex "VPSLLVQrm")>;
-def: InstRW<[HWWriteResGroup11_1], (instregex "VPSRLVQrm")>;
-def: InstRW<[HWWriteResGroup11_1], (instregex "VTESTPDrm")>;
-def: InstRW<[HWWriteResGroup11_1], (instregex "VTESTPSrm")>;
+def: InstRW<[HWWriteResGroup11_1], (instregex "(V?)CVTSS2SDrm",
+ "VPSLLVQrm",
+ "VPSRLVQrm")>;
def HWWriteResGroup11_2 : SchedWriteRes<[HWPort0,HWPort23]> {
let Latency = 8;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup11_2], (instregex "VPSLLDYrm")>;
-def: InstRW<[HWWriteResGroup11_2], (instregex "VPSLLQYrm")>;
-def: InstRW<[HWWriteResGroup11_2], (instregex "VPSLLVQYrm")>;
-def: InstRW<[HWWriteResGroup11_2], (instregex "VPSLLWYrm")>;
-def: InstRW<[HWWriteResGroup11_2], (instregex "VPSRADYrm")>;
-def: InstRW<[HWWriteResGroup11_2], (instregex "VPSRAWYrm")>;
-def: InstRW<[HWWriteResGroup11_2], (instregex "VPSRLDYrm")>;
-def: InstRW<[HWWriteResGroup11_2], (instregex "VPSRLQYrm")>;
-def: InstRW<[HWWriteResGroup11_2], (instregex "VPSRLVQYrm")>;
-def: InstRW<[HWWriteResGroup11_2], (instregex "VPSRLWYrm")>;
-def: InstRW<[HWWriteResGroup11_2], (instregex "VTESTPDYrm")>;
-def: InstRW<[HWWriteResGroup11_2], (instregex "VTESTPSYrm")>;
+def: InstRW<[HWWriteResGroup11_2], (instregex "VPSLLVQYrm",
+ "VPSRLVQYrm")>;
def HWWriteResGroup12 : SchedWriteRes<[HWPort1,HWPort23]> {
let Latency = 8;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup12], (instregex "ADDSDrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "ADDSSrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "BSF(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "BSR(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "CMPSDrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "CMPSSrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "COMISDrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "COMISSrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "FCOM32m")>;
-def: InstRW<[HWWriteResGroup12], (instregex "FCOM64m")>;
-def: InstRW<[HWWriteResGroup12], (instregex "FCOMP32m")>;
-def: InstRW<[HWWriteResGroup12], (instregex "FCOMP64m")>;
-def: InstRW<[HWWriteResGroup12], (instregex "IMUL(16|32|64)m")>;
-def: InstRW<[HWWriteResGroup12], (instregex "IMUL(16|32|64)rm(i8)?")>;
-def: InstRW<[HWWriteResGroup12], (instregex "IMUL8m")>;
-def: InstRW<[HWWriteResGroup12], (instregex "LZCNT(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "MAX(C?)SDrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "MAX(C?)SSrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "MIN(C?)SDrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "MIN(C?)SSrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "MMX_CVTPI2PSirm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "MMX_CVTPS2PIirm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "MMX_CVTTPS2PIirm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "MUL(16|32|64)m")>;
-def: InstRW<[HWWriteResGroup12], (instregex "MUL8m")>;
-def: InstRW<[HWWriteResGroup12], (instregex "PDEP(32|64)rm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "PEXT(32|64)rm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "POPCNT(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "SUBSDrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "SUBSSrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "TZCNT(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "UCOMISDrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "UCOMISSrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "VADDSDrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "VADDSSrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "VCMPSDrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "VCMPSSrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "VCOMISDrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "VCOMISSrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "VMAX(C?)SDrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "VMAX(C?)SSrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "VMIN(C?)SDrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "VMIN(C?)SSrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "VSUBSDrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "VSUBSSrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "VUCOMISDrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "VUCOMISSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "MMX_CVTPI2PSirm",
+ "PDEP(32|64)rm",
+ "PEXT(32|64)rm")>;
+
+def HWWriteResGroup12_1 : SchedWriteRes<[HWPort1,HWPort0156,HWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup12_1], (instrs IMUL16rmi, IMUL16rmi8)>;
+
+def HWWriteResGroup12_2 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156,HWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,2,1];
+}
+def: InstRW<[HWWriteResGroup12_2], (instrs IMUL16m, MUL16m)>;
def HWWriteResGroup13 : SchedWriteRes<[HWPort5,HWPort23]> {
- let Latency = 7;
+ let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup13], (instregex "ANDNPDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "ANDNPSrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "ANDPDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "ANDPSrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "INSERTPSrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "ORPDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "ORPSrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PACKSSDWrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PACKSSWBrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PACKUSDWrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PACKUSWBrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PALIGNRrmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PBLENDWrmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PSHUFBrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PSHUFDmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PSHUFHWmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PSHUFLWmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKHBWrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKHDQrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKHQDQrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKHWDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKLBWrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKLDQrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKLQDQrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKLWDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "SHUFPDrmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "SHUFPSrmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "UNPCKHPDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "UNPCKHPSrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "UNPCKLPDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "UNPCKLPSrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VANDNPDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VANDNPSrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VANDPDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VANDPSrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VINSERTPSrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VORPDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VORPSrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPACKSSDWrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPACKSSWBrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPACKUSDWrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPACKUSWBrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPALIGNRrmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPBLENDWrmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPDmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPSmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPSrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFBrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFDmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFHWmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFLWmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHBWrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHDQrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHQDQrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHWDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLBWrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLDQrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLQDQrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLWDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VSHUFPDrmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VSHUFPSrmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKHPDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKHPSrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKLPDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKLPSrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VXORPDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VXORPSrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "XORPDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "XORPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "(V?)PMOV(SX|ZX)BDrm",
+ "(V?)PMOV(SX|ZX)BQrm",
+ "(V?)PMOV(SX|ZX)BWrm",
+ "(V?)PMOV(SX|ZX)DQrm",
+ "(V?)PMOV(SX|ZX)WDrm",
+ "(V?)PMOV(SX|ZX)WQrm")>;
def HWWriteResGroup13_1 : SchedWriteRes<[HWPort5,HWPort23]> {
let Latency = 8;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup13_1], (instregex "VANDNPDYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VANDNPSYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VANDPDYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VANDPSYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VORPDYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VORPSYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPACKSSDWYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPACKSSWBYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPACKUSDWYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPACKUSWBYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPALIGNRYrmi")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPBLENDWYrmi")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPERMILPDYmi")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPERMILPDYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPERMILPSYmi")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPERMILPSYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPMOVSXBDYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPMOVSXBQYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPMOVSXWQYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPSHUFBYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPSHUFDYmi")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPSHUFHWYmi")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPSHUFLWYmi")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKHBWYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKHDQYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKHQDQYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKHWDYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKLBWYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKLDQYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKLQDQYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKLWDYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VSHUFPDYrmi")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VSHUFPSYrmi")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VUNPCKHPDYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VUNPCKHPSYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VUNPCKLPDYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VUNPCKLPSYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VXORPDYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VXORPSYrm")>;
-
-def HWWriteResGroup13_2 : SchedWriteRes<[HWPort5,HWPort23]> {
- let Latency = 6;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PALIGNR64irm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PINSRWirmi")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PSHUFBrm64")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PSHUFWmi")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PUNPCKHBWirm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PUNPCKHDQirm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PUNPCKHWDirm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PUNPCKLBWirm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PUNPCKLDQirm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PUNPCKLWDirm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "MOVHPDrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "MOVHPSrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "MOVLPDrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "MOVLPSrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PINSRBrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PINSRDrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PINSRQrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PINSRWrmi")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVSXBDrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVSXBQrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVSXBWrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVSXDQrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVSXWDrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVSXWQrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVZXBDrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVZXBQrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVZXBWrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVZXDQrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVZXWDrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVZXWQrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VMOVHPDrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VMOVHPSrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VMOVLPDrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VMOVLPSrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPINSRBrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPINSRDrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPINSRQrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPINSRWrmi")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVSXBDrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVSXBQrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVSXBWrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVSXDQrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVSXWDrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVSXWQrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVZXBDrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVZXBQrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVZXBWrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVZXDQrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVZXWDrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVZXWQrm")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VPMOVSXBDYrm",
+ "VPMOVSXBQYrm",
+ "VPMOVSXWQYrm")>;
def HWWriteResGroup14 : SchedWriteRes<[HWPort6,HWPort23]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup14], (instregex "FARJMP64")>;
-def: InstRW<[HWWriteResGroup14], (instregex "JMP(16|32|64)m")>;
+def: InstRW<[HWWriteResGroup14], (instregex "FARJMP64",
+ "JMP(16|32|64)m")>;
def HWWriteResGroup15 : SchedWriteRes<[HWPort23,HWPort06]> {
let Latency = 6;
@@ -1815,323 +1007,55 @@ def HWWriteResGroup15 : SchedWriteRes<[HWPort23,HWPort06]> {
let ResourceCycles = [1,1];
}
def: InstRW<[HWWriteResGroup15], (instregex "BT(16|32|64)mi8")>;
-def: InstRW<[HWWriteResGroup15], (instregex "RORX32mi")>;
-def: InstRW<[HWWriteResGroup15], (instregex "RORX64mi")>;
-def: InstRW<[HWWriteResGroup15], (instregex "SARX32rm")>;
-def: InstRW<[HWWriteResGroup15], (instregex "SARX64rm")>;
-def: InstRW<[HWWriteResGroup15], (instregex "SHLX32rm")>;
-def: InstRW<[HWWriteResGroup15], (instregex "SHLX64rm")>;
-def: InstRW<[HWWriteResGroup15], (instregex "SHRX32rm")>;
-def: InstRW<[HWWriteResGroup15], (instregex "SHRX64rm")>;
def HWWriteResGroup16 : SchedWriteRes<[HWPort23,HWPort15]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup16], (instregex "ANDN(32|64)rm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "BLSI(32|64)rm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "BLSMSK(32|64)rm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "BLSR(32|64)rm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "BZHI(32|64)rm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PABSBrm64")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PABSDrm64")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PABSWrm64")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDBirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDDirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDQirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDSBirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDSWirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDUSBirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDUSWirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDWirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PAVGBirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PAVGWirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPEQBirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPEQDirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPEQWirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPGTBirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPGTDirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPGTWirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PMAXSWirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PMAXUBirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PMINSWirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PMINUBirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSIGNBrm64")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSIGNDrm64")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSIGNWrm64")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBBirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBDirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBQirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBSBirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBSWirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBUSBirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBUSWirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBWirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MOVBE(16|32|64)rm")>;
-
-def HWWriteResGroup16_1 : SchedWriteRes<[HWPort23,HWPort15]> {
- let Latency = 7;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup16_1], (instregex "PABSBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PABSDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PABSWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PADDBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PADDDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PADDQrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PADDSBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PADDSWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PADDUSBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PADDUSWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PADDWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PAVGBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PAVGWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PCMPEQBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PCMPEQDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PCMPEQQrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PCMPEQWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PCMPGTBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PCMPGTDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PCMPGTWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PMAXSBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PMAXSDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PMAXSWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PMAXUBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PMAXUDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PMAXUWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PMINSBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PMINSDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PMINSWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PMINUBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PMINUDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PMINUWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PSIGNBrm128")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PSIGNDrm128")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PSIGNWrm128")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBQrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBSBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBSWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBUSBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBUSWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPABSBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPABSDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPABSWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDQrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDSBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDSWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDUSBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDUSWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPAVGBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPAVGWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPCMPEQBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPCMPEQDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPCMPEQQrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPCMPEQWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPCMPGTBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPCMPGTDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPCMPGTWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPMAXSBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPMAXSDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPMAXSWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPMAXUBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPMAXUDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPMAXUWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPMINSBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPMINSDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPMINSWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPMINUBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPMINUDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPMINUWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPSIGNBrm128")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPSIGNDrm128")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPSIGNWrm128")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBQrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBSBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBSWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBUSBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBUSWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBWrm")>;
-
-def HWWriteResGroup16_2 : SchedWriteRes<[HWPort23,HWPort15]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPABSBYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPABSDYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPABSWYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDBYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDDYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDQYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDSBYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDSWYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDUSBYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDUSWYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDWYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPAVGBYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPAVGWYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPCMPEQBYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPCMPEQDYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPCMPEQQYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPCMPEQWYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPCMPGTBYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPCMPGTDYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPCMPGTWYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPMAXSBYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPMAXSDYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPMAXSWYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPMAXUBYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPMAXUDYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPMAXUWYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPMINSBYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPMINSDYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPMINSWYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPMINUBYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPMINUDYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPMINUWYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPSIGNBYrm256")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPSIGNDYrm256")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPSIGNWYrm256")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBBYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBDYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBQYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBSBYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBSWYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBUSBYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBUSWYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBWYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "ANDN(32|64)rm",
+ "BLSI(32|64)rm",
+ "BLSMSK(32|64)rm",
+ "BLSR(32|64)rm",
+ "MOVBE(16|32|64)rm")>;
def HWWriteResGroup17 : SchedWriteRes<[HWPort23,HWPort015]> {
let Latency = 7;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup17], (instregex "BLENDPDrmi")>;
-def: InstRW<[HWWriteResGroup17], (instregex "BLENDPSrmi")>;
-def: InstRW<[HWWriteResGroup17], (instregex "PANDNrm")>;
-def: InstRW<[HWWriteResGroup17], (instregex "PANDrm")>;
-def: InstRW<[HWWriteResGroup17], (instregex "PORrm")>;
-def: InstRW<[HWWriteResGroup17], (instregex "PXORrm")>;
-def: InstRW<[HWWriteResGroup17], (instregex "VBLENDPDrmi")>;
-def: InstRW<[HWWriteResGroup17], (instregex "VBLENDPSrmi")>;
-def: InstRW<[HWWriteResGroup17], (instregex "VINSERTF128rm")>;
-def: InstRW<[HWWriteResGroup17], (instregex "VINSERTI128rm")>;
-def: InstRW<[HWWriteResGroup17], (instregex "VPANDNrm")>;
-def: InstRW<[HWWriteResGroup17], (instregex "VPANDrm")>;
-def: InstRW<[HWWriteResGroup17], (instregex "VPBLENDDrmi")>;
-def: InstRW<[HWWriteResGroup17], (instregex "VPORrm")>;
-def: InstRW<[HWWriteResGroup17], (instregex "VPXORrm")>;
-
-def HWWriteResGroup17_1 : SchedWriteRes<[HWPort23,HWPort015]> {
- let Latency = 6;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup17_1], (instregex "MMX_PANDNirm")>;
-def: InstRW<[HWWriteResGroup17_1], (instregex "MMX_PANDirm")>;
-def: InstRW<[HWWriteResGroup17_1], (instregex "MMX_PORirm")>;
-def: InstRW<[HWWriteResGroup17_1], (instregex "MMX_PXORirm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "VINSERTF128rm",
+ "VINSERTI128rm",
+ "VPBLENDDrmi")>;
def HWWriteResGroup17_2 : SchedWriteRes<[HWPort23,HWPort015]> {
let Latency = 8;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup17_2], (instregex "VBLENDPDYrmi")>;
-def: InstRW<[HWWriteResGroup17_2], (instregex "VBLENDPSYrmi")>;
-def: InstRW<[HWWriteResGroup17_2], (instregex "VPANDNYrm")>;
-def: InstRW<[HWWriteResGroup17_2], (instregex "VPANDYrm")>;
def: InstRW<[HWWriteResGroup17_2], (instregex "VPBLENDDYrmi")>;
-def: InstRW<[HWWriteResGroup17_2], (instregex "VPORYrm")>;
-def: InstRW<[HWWriteResGroup17_2], (instregex "VPXORYrm")>;
def HWWriteResGroup18 : SchedWriteRes<[HWPort23,HWPort0156]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup18], (instregex "ADD(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup18], (instregex "ADD8rm")>;
-def: InstRW<[HWWriteResGroup18], (instregex "AND(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup18], (instregex "AND8rm")>;
-def: InstRW<[HWWriteResGroup18], (instregex "CMP(16|32|64)mi")>;
-def: InstRW<[HWWriteResGroup18], (instregex "CMP(16|32|64)mr")>;
-def: InstRW<[HWWriteResGroup18], (instregex "CMP(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup18], (instregex "CMP8mi")>;
-def: InstRW<[HWWriteResGroup18], (instregex "CMP8mr")>;
-def: InstRW<[HWWriteResGroup18], (instregex "CMP8rm")>;
-def: InstRW<[HWWriteResGroup18], (instregex "OR(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup18], (instregex "OR8rm")>;
-def: InstRW<[HWWriteResGroup18], (instregex "POP(16|32|64)r(mr)?")>;
-def: InstRW<[HWWriteResGroup18], (instregex "SUB(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup18], (instregex "SUB8rm")>;
-def: InstRW<[HWWriteResGroup18], (instregex "TEST(16|32|64)mr")>;
-def: InstRW<[HWWriteResGroup18], (instregex "TEST8mi")>;
-def: InstRW<[HWWriteResGroup18], (instregex "TEST8mr")>;
-def: InstRW<[HWWriteResGroup18], (instregex "XOR(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup18], (instregex "XOR8rm")>;
+def: InstRW<[HWWriteResGroup18], (instrs POP16r, POP32r, POP64r)>;
+def: InstRW<[HWWriteResGroup18], (instregex "POP(16|32|64)rmr")>;
def HWWriteResGroup19 : SchedWriteRes<[HWPort237,HWPort0156]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup19], (instregex "SFENCE")>;
-
-def HWWriteResGroup20 : SchedWriteRes<[HWPort4,HWPort5,HWPort237]> {
- let Latency = 2;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[HWWriteResGroup20], (instregex "EXTRACTPSmr")>;
-def: InstRW<[HWWriteResGroup20], (instregex "PEXTRBmr")>;
-def: InstRW<[HWWriteResGroup20], (instregex "PEXTRDmr")>;
-def: InstRW<[HWWriteResGroup20], (instregex "PEXTRQmr")>;
-def: InstRW<[HWWriteResGroup20], (instregex "PEXTRWmr")>;
-def: InstRW<[HWWriteResGroup20], (instregex "STMXCSR")>;
-def: InstRW<[HWWriteResGroup20], (instregex "VEXTRACTPSmr")>;
-def: InstRW<[HWWriteResGroup20], (instregex "VPEXTRBmr")>;
-def: InstRW<[HWWriteResGroup20], (instregex "VPEXTRDmr")>;
-def: InstRW<[HWWriteResGroup20], (instregex "VPEXTRQmr")>;
-def: InstRW<[HWWriteResGroup20], (instregex "VPEXTRWmr")>;
-def: InstRW<[HWWriteResGroup20], (instregex "VSTMXCSR")>;
+def: InstRW<[HWWriteResGroup19], (instrs SFENCE)>;
def HWWriteResGroup21 : SchedWriteRes<[HWPort4,HWPort6,HWPort237]> {
let Latency = 2;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup21], (instregex "FNSTCW16m")>;
-
-def HWWriteResGroup22 : SchedWriteRes<[HWPort4,HWPort237,HWPort06]> {
- let Latency = 2;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[HWWriteResGroup22], (instregex "SETAEm")>;
-def: InstRW<[HWWriteResGroup22], (instregex "SETBm")>;
-def: InstRW<[HWWriteResGroup22], (instregex "SETEm")>;
-def: InstRW<[HWWriteResGroup22], (instregex "SETGEm")>;
-def: InstRW<[HWWriteResGroup22], (instregex "SETGm")>;
-def: InstRW<[HWWriteResGroup22], (instregex "SETLEm")>;
-def: InstRW<[HWWriteResGroup22], (instregex "SETLm")>;
-def: InstRW<[HWWriteResGroup22], (instregex "SETNEm")>;
-def: InstRW<[HWWriteResGroup22], (instregex "SETNOm")>;
-def: InstRW<[HWWriteResGroup22], (instregex "SETNPm")>;
-def: InstRW<[HWWriteResGroup22], (instregex "SETNSm")>;
-def: InstRW<[HWWriteResGroup22], (instregex "SETOm")>;
-def: InstRW<[HWWriteResGroup22], (instregex "SETPm")>;
-def: InstRW<[HWWriteResGroup22], (instregex "SETSm")>;
+def: InstRW<[HWWriteResGroup21], (instrs FNSTCW16m)>;
def HWWriteResGroup23 : SchedWriteRes<[HWPort4,HWPort237,HWPort15]> {
let Latency = 2;
@@ -2145,174 +1069,75 @@ def HWWriteResGroup23_16 : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup23_16], (instregex "MOVBE16mr")>;
+def: InstRW<[HWWriteResGroup23_16], (instrs MOVBE16mr)>;
def HWWriteResGroup24 : SchedWriteRes<[HWPort4,HWPort237,HWPort0156]> {
let Latency = 2;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup24], (instregex "PUSH(16|32|64)r(mr)?")>;
-def: InstRW<[HWWriteResGroup24], (instregex "PUSH64i8")>;
-def: InstRW<[HWWriteResGroup24], (instregex "STOSB")>;
-def: InstRW<[HWWriteResGroup24], (instregex "STOSL")>;
-def: InstRW<[HWWriteResGroup24], (instregex "STOSQ")>;
-def: InstRW<[HWWriteResGroup24], (instregex "STOSW")>;
+def: InstRW<[HWWriteResGroup24], (instrs PUSH16r, PUSH32r, PUSH64r,
+ STOSB, STOSL, STOSQ, STOSW)>;
+def: InstRW<[HWWriteResGroup24], (instregex "PUSH(16|32|64)rmr",
+ "PUSH64i8")>;
def HWWriteResGroup25 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06]> {
let Latency = 7;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[HWWriteResGroup25], (instregex "BTC(16|32|64)mi8")>;
-def: InstRW<[HWWriteResGroup25], (instregex "BTR(16|32|64)mi8")>;
-def: InstRW<[HWWriteResGroup25], (instregex "BTS(16|32|64)mi8")>;
-def: InstRW<[HWWriteResGroup25], (instregex "SAR(16|32|64)m1")>;
-def: InstRW<[HWWriteResGroup25], (instregex "SAR(16|32|64)mi")>;
-def: InstRW<[HWWriteResGroup25], (instregex "SAR8m1")>;
-def: InstRW<[HWWriteResGroup25], (instregex "SAR8mi")>;
-def: InstRW<[HWWriteResGroup25], (instregex "SHL(16|32|64)m1")>;
-def: InstRW<[HWWriteResGroup25], (instregex "SHL(16|32|64)mi")>;
-def: InstRW<[HWWriteResGroup25], (instregex "SHL8m1")>;
-def: InstRW<[HWWriteResGroup25], (instregex "SHL8mi")>;
-def: InstRW<[HWWriteResGroup25], (instregex "SHR(16|32|64)m1")>;
-def: InstRW<[HWWriteResGroup25], (instregex "SHR(16|32|64)mi")>;
-def: InstRW<[HWWriteResGroup25], (instregex "SHR8m1")>;
-def: InstRW<[HWWriteResGroup25], (instregex "SHR8mi")>;
+def: InstRW<[HWWriteResGroup25], (instregex "BTC(16|32|64)mi8",
+ "BTR(16|32|64)mi8",
+ "BTS(16|32|64)mi8",
+ "SAR(8|16|32|64)m1",
+ "SAR(8|16|32|64)mi",
+ "SHL(8|16|32|64)m1",
+ "SHL(8|16|32|64)mi",
+ "SHR(8|16|32|64)m1",
+ "SHR(8|16|32|64)mi")>;
def HWWriteResGroup26 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0156]> {
let Latency = 7;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[HWWriteResGroup26], (instregex "ADD(16|32|64)mi")>;
-def: InstRW<[HWWriteResGroup26], (instregex "ADD(16|32|64)mr")>;
-def: InstRW<[HWWriteResGroup26], (instregex "ADD8mi")>;
-def: InstRW<[HWWriteResGroup26], (instregex "ADD8mr")>;
-def: InstRW<[HWWriteResGroup26], (instregex "AND(16|32|64)mi")>;
-def: InstRW<[HWWriteResGroup26], (instregex "AND(16|32|64)mr")>;
-def: InstRW<[HWWriteResGroup26], (instregex "AND8mi")>;
-def: InstRW<[HWWriteResGroup26], (instregex "AND8mr")>;
-def: InstRW<[HWWriteResGroup26], (instregex "DEC(16|32|64)m")>;
-def: InstRW<[HWWriteResGroup26], (instregex "DEC8m")>;
-def: InstRW<[HWWriteResGroup26], (instregex "INC(16|32|64)m")>;
-def: InstRW<[HWWriteResGroup26], (instregex "INC8m")>;
-def: InstRW<[HWWriteResGroup26], (instregex "NEG(16|32|64)m")>;
-def: InstRW<[HWWriteResGroup26], (instregex "NEG8m")>;
-def: InstRW<[HWWriteResGroup26], (instregex "NOT(16|32|64)m")>;
-def: InstRW<[HWWriteResGroup26], (instregex "NOT8m")>;
-def: InstRW<[HWWriteResGroup26], (instregex "OR(16|32|64)mi")>;
-def: InstRW<[HWWriteResGroup26], (instregex "OR(16|32|64)mr")>;
-def: InstRW<[HWWriteResGroup26], (instregex "OR8mi")>;
-def: InstRW<[HWWriteResGroup26], (instregex "OR8mr")>;
-def: InstRW<[HWWriteResGroup26], (instregex "POP(16|32|64)rmm")>;
-def: InstRW<[HWWriteResGroup26], (instregex "PUSH(16|32|64)rmm")>;
-def: InstRW<[HWWriteResGroup26], (instregex "SUB(16|32|64)mi")>;
-def: InstRW<[HWWriteResGroup26], (instregex "SUB(16|32|64)mr")>;
-def: InstRW<[HWWriteResGroup26], (instregex "SUB8mi")>;
-def: InstRW<[HWWriteResGroup26], (instregex "SUB8mr")>;
-def: InstRW<[HWWriteResGroup26], (instregex "XOR(16|32|64)mi")>;
-def: InstRW<[HWWriteResGroup26], (instregex "XOR(16|32|64)mr")>;
-def: InstRW<[HWWriteResGroup26], (instregex "XOR8mi")>;
-def: InstRW<[HWWriteResGroup26], (instregex "XOR8mr")>;
-
-def HWWriteResGroup27 : SchedWriteRes<[HWPort5]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[HWWriteResGroup27], (instregex "BLENDVPDrr0")>;
-def: InstRW<[HWWriteResGroup27], (instregex "BLENDVPSrr0")>;
-def: InstRW<[HWWriteResGroup27], (instregex "MMX_PINSRWirri")>;
-def: InstRW<[HWWriteResGroup27], (instregex "PBLENDVBrr0")>;
-def: InstRW<[HWWriteResGroup27], (instregex "PINSRBrr")>;
-def: InstRW<[HWWriteResGroup27], (instregex "PINSRDrr")>;
-def: InstRW<[HWWriteResGroup27], (instregex "PINSRQrr")>;
-def: InstRW<[HWWriteResGroup27], (instregex "PINSRWrri")>;
-def: InstRW<[HWWriteResGroup27], (instregex "VBLENDVPDYrr")>;
-def: InstRW<[HWWriteResGroup27], (instregex "VBLENDVPDrr")>;
-def: InstRW<[HWWriteResGroup27], (instregex "VBLENDVPSYrr")>;
-def: InstRW<[HWWriteResGroup27], (instregex "VBLENDVPSrr")>;
-def: InstRW<[HWWriteResGroup27], (instregex "VPBLENDVBYrr")>;
-def: InstRW<[HWWriteResGroup27], (instregex "VPBLENDVBrr")>;
-def: InstRW<[HWWriteResGroup27], (instregex "VPINSRBrr")>;
-def: InstRW<[HWWriteResGroup27], (instregex "VPINSRDrr")>;
-def: InstRW<[HWWriteResGroup27], (instregex "VPINSRQrr")>;
-def: InstRW<[HWWriteResGroup27], (instregex "VPINSRWrri")>;
+def: InstRW<[HWWriteResGroup26], (instregex "POP(16|32|64)rmm",
+ "PUSH(16|32|64)rmm")>;
def HWWriteResGroup28 : SchedWriteRes<[HWPort01]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def: InstRW<[HWWriteResGroup28], (instregex "FDECSTP")>;
+def: InstRW<[HWWriteResGroup28], (instrs FDECSTP)>;
def HWWriteResGroup29 : SchedWriteRes<[HWPort06]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def: InstRW<[HWWriteResGroup29], (instregex "ROL(16|32|64)r1")>;
-def: InstRW<[HWWriteResGroup29], (instregex "ROL(16|32|64)ri")>;
-def: InstRW<[HWWriteResGroup29], (instregex "ROL8r1")>;
-def: InstRW<[HWWriteResGroup29], (instregex "ROL8ri")>;
-def: InstRW<[HWWriteResGroup29], (instregex "ROR(16|32|64)r1")>;
-def: InstRW<[HWWriteResGroup29], (instregex "ROR(16|32|64)ri")>;
-def: InstRW<[HWWriteResGroup29], (instregex "ROR8r1")>;
-def: InstRW<[HWWriteResGroup29], (instregex "ROR8ri")>;
+def: InstRW<[HWWriteResGroup29], (instregex "ROL(8|16|32|64)r1",
+ "ROL(8|16|32|64)ri",
+ "ROR(8|16|32|64)r1",
+ "ROR(8|16|32|64)ri")>;
def HWWriteResGroup30 : SchedWriteRes<[HWPort0156]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def: InstRW<[HWWriteResGroup30], (instregex "LFENCE")>;
-def: InstRW<[HWWriteResGroup30], (instregex "MFENCE")>;
-def: InstRW<[HWWriteResGroup30], (instregex "WAIT")>;
-def: InstRW<[HWWriteResGroup30], (instregex "XGETBV")>;
+def: InstRW<[HWWriteResGroup30], (instrs LFENCE,
+ MFENCE,
+ WAIT,
+ XGETBV)>;
def HWWriteResGroup31 : SchedWriteRes<[HWPort0,HWPort5]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup31], (instregex "CVTPS2PDrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "CVTSS2SDrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "EXTRACTPSrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "MMX_PEXTRWirri")>;
-def: InstRW<[HWWriteResGroup31], (instregex "PEXTRBrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "PEXTRDrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "PEXTRQrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "PEXTRWri")>;
-def: InstRW<[HWWriteResGroup31], (instregex "PEXTRWrr_REV")>;
-def: InstRW<[HWWriteResGroup31], (instregex "PSLLDrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "PSLLQrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "PSLLWrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "PSRADrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "PSRAWrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "PSRLDrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "PSRLQrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "PSRLWrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "PTESTrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VCVTPH2PSYrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VCVTPH2PSrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VCVTPS2PDrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VCVTSS2SDrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VEXTRACTPSrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VPEXTRBrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VPEXTRDrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VPEXTRQrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VPEXTRWri")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VPEXTRWrr_REV")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VPSLLDrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VPSLLQrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VPSLLWrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VPSRADrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VPSRAWrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VPSRLDrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VPSRLQrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VPSRLWrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VPTESTrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "(V?)CVTPS2PDrr",
+ "(V?)CVTSS2SDrr")>;
def HWWriteResGroup32 : SchedWriteRes<[HWPort6,HWPort0156]> {
let Latency = 2;
@@ -2328,175 +1153,44 @@ def HWWriteResGroup33 : SchedWriteRes<[HWPort01,HWPort015]> {
}
def: InstRW<[HWWriteResGroup33], (instregex "MMX_MOVDQ2Qrr")>;
-def HWWriteResGroup34 : SchedWriteRes<[HWPort06,HWPort15]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup34], (instregex "BEXTR(32|64)rr")>;
-def: InstRW<[HWWriteResGroup34], (instregex "BSWAP(16|32|64)r")>;
-
def HWWriteResGroup35 : SchedWriteRes<[HWPort06,HWPort0156]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup35], (instregex "ADC(16|32|64)ri")>;
-def: InstRW<[HWWriteResGroup35], (instregex "ADC(16|32|64)rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup35], (instregex "ADC8i8")>;
-def: InstRW<[HWWriteResGroup35], (instregex "ADC8ri")>;
-def: InstRW<[HWWriteResGroup35], (instregex "ADC8rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup35], (instregex "CMOVAE(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup35], (instregex "CMOVB(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup35], (instregex "CMOVE(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup35], (instregex "CMOVG(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup35], (instregex "CMOVGE(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup35], (instregex "CMOVL(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup35], (instregex "CMOVLE(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup35], (instregex "CMOVNE(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup35], (instregex "CMOVNO(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup35], (instregex "CMOVNP(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup35], (instregex "CMOVNS(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup35], (instregex "CMOVO(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup35], (instregex "CMOVP(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup35], (instregex "CMOVS(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup35], (instregex "CWD")>;
-def: InstRW<[HWWriteResGroup35], (instregex "JRCXZ")>;
-def: InstRW<[HWWriteResGroup35], (instregex "SBB(16|32|64)ri")>;
-def: InstRW<[HWWriteResGroup35], (instregex "SBB(16|32|64)rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup35], (instregex "SBB8i8")>;
-def: InstRW<[HWWriteResGroup35], (instregex "SBB8ri")>;
-def: InstRW<[HWWriteResGroup35], (instregex "SBB8rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup35], (instregex "SETAr")>;
-def: InstRW<[HWWriteResGroup35], (instregex "SETBEr")>;
-
-def HWWriteResGroup36 : SchedWriteRes<[HWPort5,HWPort23]> {
- let Latency = 8;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[HWWriteResGroup36], (instregex "BLENDVPDrm0")>;
-def: InstRW<[HWWriteResGroup36], (instregex "BLENDVPSrm0")>;
-def: InstRW<[HWWriteResGroup36], (instregex "PBLENDVBrm0")>;
-def: InstRW<[HWWriteResGroup36], (instregex "VBLENDVPDrm")>;
-def: InstRW<[HWWriteResGroup36], (instregex "VBLENDVPSrm")>;
-def: InstRW<[HWWriteResGroup36], (instregex "VMASKMOVPDrm")>;
-def: InstRW<[HWWriteResGroup36], (instregex "VMASKMOVPSrm")>;
-def: InstRW<[HWWriteResGroup36], (instregex "VPBLENDVBrm")>;
-def: InstRW<[HWWriteResGroup36], (instregex "VPMASKMOVDrm")>;
-def: InstRW<[HWWriteResGroup36], (instregex "VPMASKMOVQrm")>;
-
-def HWWriteResGroup36_1 : SchedWriteRes<[HWPort5,HWPort23]> {
- let Latency = 9;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[HWWriteResGroup36_1], (instregex "VBLENDVPDYrm")>;
-def: InstRW<[HWWriteResGroup36_1], (instregex "VBLENDVPSYrm")>;
-def: InstRW<[HWWriteResGroup36_1], (instregex "VMASKMOVPDYrm")>;
-def: InstRW<[HWWriteResGroup36_1], (instregex "VMASKMOVPSYrm")>;
-def: InstRW<[HWWriteResGroup36_1], (instregex "VPBLENDVBYrm")>;
-def: InstRW<[HWWriteResGroup36_1], (instregex "VPMASKMOVDYrm")>;
-def: InstRW<[HWWriteResGroup36_1], (instregex "VPMASKMOVQYrm")>;
+def: InstRW<[HWWriteResGroup35], (instrs CWD, JCXZ, JECXZ, JRCXZ)>;
+def: InstRW<[HWWriteResGroup35], (instregex "SET(A|BE)r")>;
def HWWriteResGroup36_2 : SchedWriteRes<[HWPort5,HWPort23]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[HWWriteResGroup36_2], (instregex "MMX_PACKSSDWirm")>;
-def: InstRW<[HWWriteResGroup36_2], (instregex "MMX_PACKSSWBirm")>;
-def: InstRW<[HWWriteResGroup36_2], (instregex "MMX_PACKUSWBirm")>;
+def: InstRW<[HWWriteResGroup36_2], (instregex "MMX_PACKSSDWirm",
+ "MMX_PACKSSWBirm",
+ "MMX_PACKUSWBirm")>;
def HWWriteResGroup37 : SchedWriteRes<[HWPort23,HWPort0156]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[HWWriteResGroup37], (instregex "LEAVE64")>;
-def: InstRW<[HWWriteResGroup37], (instregex "SCASB")>;
-def: InstRW<[HWWriteResGroup37], (instregex "SCASL")>;
-def: InstRW<[HWWriteResGroup37], (instregex "SCASQ")>;
-def: InstRW<[HWWriteResGroup37], (instregex "SCASW")>;
-
-def HWWriteResGroup38 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
- let Latency = 8;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[HWWriteResGroup38], (instregex "PSLLDrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "PSLLQrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "PSLLWrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "PSRADrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "PSRAWrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "PSRLDrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "PSRLQrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "PSRLWrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "PTESTrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "VPSLLDrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "VPSLLQrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "VPSLLWrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "VPSRADrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "VPSRAWrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "VPSRLDrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "VPSRLQrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "VPSRLWrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "VPTESTrm")>;
+def: InstRW<[HWWriteResGroup37], (instrs LEAVE, LEAVE64,
+ SCASB, SCASL, SCASQ, SCASW)>;
def HWWriteResGroup39 : SchedWriteRes<[HWPort0,HWPort01,HWPort23]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup39], (instregex "FLDCW16m")>;
-
-def HWWriteResGroup40 : SchedWriteRes<[HWPort0,HWPort23,HWPort0156]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[HWWriteResGroup40], (instregex "LDMXCSR")>;
-def: InstRW<[HWWriteResGroup40], (instregex "VLDMXCSR")>;
+def: InstRW<[HWWriteResGroup39], (instrs FLDCW16m)>;
def HWWriteResGroup41 : SchedWriteRes<[HWPort6,HWPort23,HWPort0156]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup41], (instregex "LRETQ")>;
-def: InstRW<[HWWriteResGroup41], (instregex "RETL")>;
-def: InstRW<[HWWriteResGroup41], (instregex "RETQ")>;
-
-def HWWriteResGroup42 : SchedWriteRes<[HWPort23,HWPort06,HWPort15]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[HWWriteResGroup42], (instregex "BEXTR(32|64)rm")>;
-
-def HWWriteResGroup43 : SchedWriteRes<[HWPort23,HWPort06,HWPort0156]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[HWWriteResGroup43], (instregex "ADC(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "ADC8rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "CMOVAE(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "CMOVB(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "CMOVE(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "CMOVG(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "CMOVGE(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "CMOVL(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "CMOVLE(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "CMOVNE(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "CMOVNO(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "CMOVNP(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "CMOVNS(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "CMOVO(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "CMOVP(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "CMOVS(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "SBB(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "SBB8rm")>;
+def: InstRW<[HWWriteResGroup41], (instrs LRETQ, RETL, RETQ)>;
def HWWriteResGroup44 : SchedWriteRes<[HWPort4,HWPort6,HWPort237,HWPort0156]> {
let Latency = 3;
@@ -2510,356 +1204,106 @@ def HWWriteResGroup45 : SchedWriteRes<[HWPort4,HWPort237,HWPort06,HWPort0156]> {
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[HWWriteResGroup45], (instregex "CALL64pcrel32")>;
-def: InstRW<[HWWriteResGroup45], (instregex "SETAm")>;
-def: InstRW<[HWWriteResGroup45], (instregex "SETBEm")>;
+def: InstRW<[HWWriteResGroup45], (instrs CALL64pcrel32)>;
+def: InstRW<[HWWriteResGroup45], (instregex "SET(A|BE)m")>;
def HWWriteResGroup46 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06]> {
let Latency = 8;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,2];
}
-def: InstRW<[HWWriteResGroup46], (instregex "ROL(16|32|64)m1")>;
-def: InstRW<[HWWriteResGroup46], (instregex "ROL(16|32|64)mi")>;
-def: InstRW<[HWWriteResGroup46], (instregex "ROL8m1")>;
-def: InstRW<[HWWriteResGroup46], (instregex "ROL8mi")>;
-def: InstRW<[HWWriteResGroup46], (instregex "ROR(16|32|64)m1")>;
-def: InstRW<[HWWriteResGroup46], (instregex "ROR(16|32|64)mi")>;
-def: InstRW<[HWWriteResGroup46], (instregex "ROR8m1")>;
-def: InstRW<[HWWriteResGroup46], (instregex "ROR8mi")>;
+def: InstRW<[HWWriteResGroup46], (instregex "ROL(8|16|32|64)m1",
+ "ROL(8|16|32|64)mi",
+ "ROR(8|16|32|64)m1",
+ "ROR(8|16|32|64)mi")>;
def HWWriteResGroup47 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0156]> {
let Latency = 8;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,2];
}
-def: InstRW<[HWWriteResGroup47], (instregex "XADD(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup47], (instregex "XADD8rm")>;
+def: InstRW<[HWWriteResGroup47], (instregex "XADD(8|16|32|64)rm")>;
def HWWriteResGroup48 : SchedWriteRes<[HWPort4,HWPort6,HWPort23,HWPort237,HWPort0156]> {
let Latency = 8;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,1,1];
}
-def: InstRW<[HWWriteResGroup48], (instregex "CALL(16|32|64)m")>;
-def: InstRW<[HWWriteResGroup48], (instregex "FARCALL64")>;
-
-def HWWriteResGroup49 : SchedWriteRes<[HWPort0]> {
- let Latency = 3;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[HWWriteResGroup49], (instregex "MOVMSKPDrr")>;
-def: InstRW<[HWWriteResGroup49], (instregex "MOVMSKPSrr")>;
-def: InstRW<[HWWriteResGroup49], (instregex "PMOVMSKBrr")>;
-def: InstRW<[HWWriteResGroup49], (instregex "VMOVMSKPDYrr")>;
-def: InstRW<[HWWriteResGroup49], (instregex "VMOVMSKPDrr")>;
-def: InstRW<[HWWriteResGroup49], (instregex "VMOVMSKPSYrr")>;
-def: InstRW<[HWWriteResGroup49], (instregex "VMOVMSKPSrr")>;
-def: InstRW<[HWWriteResGroup49], (instregex "VPMOVMSKBYrr")>;
-def: InstRW<[HWWriteResGroup49], (instregex "VPMOVMSKBrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "CALL(16|32|64)m",
+ "FARCALL64")>;
def HWWriteResGroup50 : SchedWriteRes<[HWPort1]> {
let Latency = 3;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup50], (instregex "ADDPDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "ADDPSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "ADDSDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "ADDSSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "ADDSUBPDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "ADDSUBPSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "ADD_FPrST0")>;
-def: InstRW<[HWWriteResGroup50], (instregex "ADD_FST0r")>;
-def: InstRW<[HWWriteResGroup50], (instregex "ADD_FrST0")>;
-def: InstRW<[HWWriteResGroup50], (instregex "BSF(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "BSR(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "CMPPDrri")>;
-def: InstRW<[HWWriteResGroup50], (instregex "CMPPSrri")>;
-def: InstRW<[HWWriteResGroup50], (instregex "CMPSDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "CMPSSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "COMISDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "COMISSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "CVTDQ2PSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "CVTPS2DQrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "CVTTPS2DQrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "IMUL64rr(i8)?")>;
-def: InstRW<[HWWriteResGroup50], (instregex "IMUL8r")>;
-def: InstRW<[HWWriteResGroup50], (instregex "LZCNT(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "MAX(C?)PDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "MAX(C?)PSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "MAX(C?)SDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "MAX(C?)SSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "MIN(C?)PDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "MIN(C?)PSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "MIN(C?)SDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "MIN(C?)SSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "MMX_CVTPI2PSirr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "MUL8r")>;
-def: InstRW<[HWWriteResGroup50], (instregex "PDEP(32|64)rr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "PEXT(32|64)rr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "POPCNT(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "SHLD(16|32|64)rri8")>;
-def: InstRW<[HWWriteResGroup50], (instregex "SHRD(16|32|64)rri8")>;
-def: InstRW<[HWWriteResGroup50], (instregex "SUBPDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "SUBPSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "SUBR_FPrST0")>;
-def: InstRW<[HWWriteResGroup50], (instregex "SUBR_FST0r")>;
-def: InstRW<[HWWriteResGroup50], (instregex "SUBR_FrST0")>;
-def: InstRW<[HWWriteResGroup50], (instregex "SUBSDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "SUBSSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "SUB_FPrST0")>;
-def: InstRW<[HWWriteResGroup50], (instregex "SUB_FST0r")>;
-def: InstRW<[HWWriteResGroup50], (instregex "SUB_FrST0")>;
-def: InstRW<[HWWriteResGroup50], (instregex "TZCNT(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "UCOMISDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "UCOMISSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VADDPDYrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VADDPDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VADDPSYrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VADDPSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VADDSDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VADDSSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VADDSUBPDYrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VADDSUBPDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VADDSUBPSYrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VADDSUBPSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VCMPPDYrri")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VCMPPDrri")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VCMPPSYrri")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VCMPPSrri")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VCMPSDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VCMPSSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VCOMISDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VCOMISSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VCVTDQ2PSYrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VCVTDQ2PSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VCVTPS2DQYrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VCVTPS2DQrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VCVTTPS2DQYrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VCVTTPS2DQrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VMAX(C?)PDYrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VMAX(C?)PDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VMAX(C?)PSYrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VMAX(C?)PSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VMAX(C?)SDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VMAX(C?)SSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VMIN(C?)PDYrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VMIN(C?)PDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VMIN(C?)PSYrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VMIN(C?)PSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VMIN(C?)SDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VMIN(C?)SSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VSUBPDYrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VSUBPDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VSUBPSYrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VSUBPSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VSUBSDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VSUBSSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VUCOMISDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VUCOMISSrr")>;
-
-def HWWriteResGroup50_16 : SchedWriteRes<[HWPort1, HWPort0156]> {
- let Latency = 3;
- let NumMicroOps = 4;
-}
-def: InstRW<[HWWriteResGroup50_16], (instregex "IMUL16rr(i8)?")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MMX_CVTPI2PSirr",
+ "PDEP(32|64)rr",
+ "PEXT(32|64)rr",
+ "SHLD(16|32|64)rri8",
+ "SHRD(16|32|64)rri8",
+ "(V?)CVTDQ2PS(Y?)rr")>;
-def HWWriteResGroup50_32 : SchedWriteRes<[HWPort1, HWPort0156]> {
- let Latency = 3;
- let NumMicroOps = 3;
+def HWWriteResGroup50_16i : SchedWriteRes<[HWPort1, HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup50_32], (instregex "IMUL32rr(i8)?")>;
+def: InstRW<[HWWriteResGroup50_16i], (instrs IMUL16rri, IMUL16rri8)>;
def HWWriteResGroup51 : SchedWriteRes<[HWPort5]> {
let Latency = 3;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup51], (instregex "VBROADCASTSDYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VBROADCASTSSYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VEXTRACTF128rr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VEXTRACTI128rr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VINSERTF128rr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VINSERTI128rr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTBYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTBrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTDYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTQYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTWYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTWrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPERM2F128rr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPERM2I128rr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPERMDYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPERMPDYri")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPERMPSYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPERMQYri")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXBDYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXBQYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXBWYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXDQYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXWDYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXWQYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXBDYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXBQYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXBWYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXDQYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXWDYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXWQYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCAST(B|W)rr")>;
def HWWriteResGroup52 : SchedWriteRes<[HWPort1,HWPort23]> {
let Latency = 9;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup52], (instregex "ADDPDrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "ADDPSrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "ADDSUBPDrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "ADDSUBPSrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "CMPPDrmi")>;
-def: InstRW<[HWWriteResGroup52], (instregex "CMPPSrmi")>;
-def: InstRW<[HWWriteResGroup52], (instregex "CVTDQ2PSrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "CVTPS2DQrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "CVTTPS2DQrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "MAX(C?)PDrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "MAX(C?)PSrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "MIN(C?)PDrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "MIN(C?)PSrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "SUBPDrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "SUBPSrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "VADDPDrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "VADDPSrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "VADDSUBPDrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "VADDSUBPSrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "VCMPPDrmi")>;
-def: InstRW<[HWWriteResGroup52], (instregex "VCMPPSrmi")>;
-def: InstRW<[HWWriteResGroup52], (instregex "VCVTDQ2PSrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "VCVTPS2DQrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "VCVTTPS2DQrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "VMAX(C?)PDrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "VMAX(C?)PSrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "VMIN(C?)PDrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "VMIN(C?)PSrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "VSUBPDrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "VSUBPSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "(V?)CVTPS2DQrm",
+ "(V?)CVTTPS2DQrm")>;
def HWWriteResGroup52_1 : SchedWriteRes<[HWPort1,HWPort23]> {
let Latency = 10;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup52_1], (instregex "ADD_F32m")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "ADD_F64m")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "ILD_F16m")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "ILD_F32m")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "ILD_F64m")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "SUBR_F32m")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "SUBR_F64m")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "SUB_F32m")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "SUB_F64m")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "VADDPDYrm")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "VADDPSYrm")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "VADDSUBPDYrm")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "VADDSUBPSYrm")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "VCMPPDYrmi")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "VCMPPSYrmi")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "VCVTDQ2PSYrm")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "VCVTPS2DQYrm")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "VCVTTPS2DQYrm")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "VMAX(C?)PDYrm")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "VMAX(C?)PSYrm")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "VMIN(C?)PDYrm")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "VMIN(C?)PSYrm")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "VSUBPDYrm")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "VSUBPSYrm")>;
-
-def HWWriteResGroup53 : SchedWriteRes<[HWPort5,HWPort23]> {
- let Latency = 10;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup53], (instregex "VPERM2F128rm")>;
-def: InstRW<[HWWriteResGroup53], (instregex "VPERM2I128rm")>;
-def: InstRW<[HWWriteResGroup53], (instregex "VPERMDYrm")>;
-def: InstRW<[HWWriteResGroup53], (instregex "VPERMPDYmi")>;
-def: InstRW<[HWWriteResGroup53], (instregex "VPERMPSYrm")>;
-def: InstRW<[HWWriteResGroup53], (instregex "VPERMQYmi")>;
-def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXBDYrm")>;
-def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXBQYrm")>;
-def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXBWYrm")>;
-def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXDQYrm")>;
-def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXWQYrm")>;
+def: InstRW<[HWWriteResGroup52_1], (instregex "(ADD|SUB|SUBR)_F(32|64)m",
+ "ILD_F(16|32|64)m",
+ "VCVTDQ2PSYrm",
+ "VCVTPS2DQYrm",
+ "VCVTTPS2DQYrm")>;
def HWWriteResGroup53_1 : SchedWriteRes<[HWPort5,HWPort23]> {
let Latency = 9;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup53_1], (instregex "VPMOVSXBWYrm")>;
-def: InstRW<[HWWriteResGroup53_1], (instregex "VPMOVSXDQYrm")>;
-def: InstRW<[HWWriteResGroup53_1], (instregex "VPMOVSXWDYrm")>;
-def: InstRW<[HWWriteResGroup53_1], (instregex "VPMOVZXWDYrm")>;
+def: InstRW<[HWWriteResGroup53_1], (instregex "VPMOVSXBWYrm",
+ "VPMOVSXDQYrm",
+ "VPMOVSXWDYrm",
+ "VPMOVZXWDYrm")>;
def HWWriteResGroup54 : SchedWriteRes<[HWPort0156]> {
- let Latency = 3;
+ let Latency = 2;
let NumMicroOps = 3;
let ResourceCycles = [3];
}
-def: InstRW<[HWWriteResGroup54], (instregex "XADD(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup54], (instregex "XADD8rr")>;
-def: InstRW<[HWWriteResGroup54], (instregex "XCHG8rr")>;
-
-def HWWriteResGroup55 : SchedWriteRes<[HWPort0,HWPort5]> {
- let Latency = 3;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[HWWriteResGroup55], (instregex "VPSLLVDYrr")>;
-def: InstRW<[HWWriteResGroup55], (instregex "VPSLLVDrr")>;
-def: InstRW<[HWWriteResGroup55], (instregex "VPSRAVDYrr")>;
-def: InstRW<[HWWriteResGroup55], (instregex "VPSRAVDrr")>;
-def: InstRW<[HWWriteResGroup55], (instregex "VPSRLVDYrr")>;
-def: InstRW<[HWWriteResGroup55], (instregex "VPSRLVDrr")>;
-
-def HWWriteResGroup56 : SchedWriteRes<[HWPort5,HWPort15]> {
- let Latency = 3;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHADDSWrr64")>;
-def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHADDWrr64")>;
-def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHADDrr64")>;
-def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHSUBDrr64")>;
-def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHSUBSWrr64")>;
-def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHSUBWrr64")>;
-def: InstRW<[HWWriteResGroup56], (instregex "PHADDDrr")>;
-def: InstRW<[HWWriteResGroup56], (instregex "PHADDSWrr128")>;
-def: InstRW<[HWWriteResGroup56], (instregex "PHADDWrr")>;
-def: InstRW<[HWWriteResGroup56], (instregex "PHSUBDrr")>;
-def: InstRW<[HWWriteResGroup56], (instregex "PHSUBSWrr128")>;
-def: InstRW<[HWWriteResGroup56], (instregex "PHSUBWrr")>;
-def: InstRW<[HWWriteResGroup56], (instregex "VPHADDDYrr")>;
-def: InstRW<[HWWriteResGroup56], (instregex "VPHADDDrr")>;
-def: InstRW<[HWWriteResGroup56], (instregex "VPHADDSWrr128")>;
-def: InstRW<[HWWriteResGroup56], (instregex "VPHADDSWrr256")>;
-def: InstRW<[HWWriteResGroup56], (instregex "VPHADDWYrr")>;
-def: InstRW<[HWWriteResGroup56], (instregex "VPHADDWrr")>;
-def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBDYrr")>;
-def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBDrr")>;
-def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBSWrr128")>;
-def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBSWrr256")>;
-def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBWYrr")>;
-def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBWrr")>;
+def: InstRW<[HWWriteResGroup54], (instrs XADD8rr, XADD16rr, XADD32rr, XADD64rr,
+ XCHG8rr, XCHG16rr, XCHG32rr, XCHG64rr,
+ XCHG16ar, XCHG32ar, XCHG64ar)>;
def HWWriteResGroup57 : SchedWriteRes<[HWPort5,HWPort0156]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[HWWriteResGroup57], (instregex "MMX_PACKSSDWirr")>;
-def: InstRW<[HWWriteResGroup57], (instregex "MMX_PACKSSWBirr")>;
-def: InstRW<[HWWriteResGroup57], (instregex "MMX_PACKUSWBirr")>;
+def: InstRW<[HWWriteResGroup57], (instregex "MMX_PACKSSDWirr",
+ "MMX_PACKSSWBirr",
+ "MMX_PACKUSWBirr")>;
def HWWriteResGroup58 : SchedWriteRes<[HWPort6,HWPort0156]> {
let Latency = 3;
@@ -2873,202 +1317,80 @@ def HWWriteResGroup59 : SchedWriteRes<[HWPort06,HWPort0156]> {
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[HWWriteResGroup59], (instregex "CMOVA(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup59], (instregex "CMOVBE(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup59], (instregex "RCL(16|32|64)r1")>;
-def: InstRW<[HWWriteResGroup59], (instregex "RCL(16|32|64)ri")>;
-def: InstRW<[HWWriteResGroup59], (instregex "RCL8r1")>;
-def: InstRW<[HWWriteResGroup59], (instregex "RCL8ri")>;
-def: InstRW<[HWWriteResGroup59], (instregex "RCR(16|32|64)r1")>;
-def: InstRW<[HWWriteResGroup59], (instregex "RCR(16|32|64)ri")>;
-def: InstRW<[HWWriteResGroup59], (instregex "RCR8r1")>;
-def: InstRW<[HWWriteResGroup59], (instregex "RCR8ri")>;
+def: InstRW<[HWWriteResGroup59], (instregex "RCL(8|16|32|64)r1",
+ "RCL(8|16|32|64)ri",
+ "RCR(8|16|32|64)r1",
+ "RCR(8|16|32|64)ri")>;
def HWWriteResGroup60 : SchedWriteRes<[HWPort06,HWPort0156]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[HWWriteResGroup60], (instregex "ROL(16|32|64)rCL")>;
-def: InstRW<[HWWriteResGroup60], (instregex "ROL8rCL")>;
-def: InstRW<[HWWriteResGroup60], (instregex "ROR(16|32|64)rCL")>;
-def: InstRW<[HWWriteResGroup60], (instregex "ROR8rCL")>;
-def: InstRW<[HWWriteResGroup60], (instregex "SAR(16|32|64)rCL")>;
-def: InstRW<[HWWriteResGroup60], (instregex "SAR8rCL")>;
-def: InstRW<[HWWriteResGroup60], (instregex "SHL(16|32|64)rCL")>;
-def: InstRW<[HWWriteResGroup60], (instregex "SHL8rCL")>;
-def: InstRW<[HWWriteResGroup60], (instregex "SHR(16|32|64)rCL")>;
-def: InstRW<[HWWriteResGroup60], (instregex "SHR8rCL")>;
+def: InstRW<[HWWriteResGroup60], (instregex "ROL(8|16|32|64)rCL",
+ "ROR(8|16|32|64)rCL",
+ "SAR(8|16|32|64)rCL",
+ "SHL(8|16|32|64)rCL",
+ "SHR(8|16|32|64)rCL")>;
def HWWriteResGroup61 : SchedWriteRes<[HWPort0,HWPort4,HWPort237]> {
let Latency = 4;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup61], (instregex "FNSTSWm")>;
+def: InstRW<[HWWriteResGroup61], (instrs FNSTSWm)>;
def HWWriteResGroup62 : SchedWriteRes<[HWPort1,HWPort4,HWPort237]> {
let Latency = 4;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup62], (instregex "ISTT_FP16m")>;
-def: InstRW<[HWWriteResGroup62], (instregex "ISTT_FP32m")>;
-def: InstRW<[HWWriteResGroup62], (instregex "ISTT_FP64m")>;
-def: InstRW<[HWWriteResGroup62], (instregex "IST_F16m")>;
-def: InstRW<[HWWriteResGroup62], (instregex "IST_F32m")>;
-def: InstRW<[HWWriteResGroup62], (instregex "IST_FP16m")>;
-def: InstRW<[HWWriteResGroup62], (instregex "IST_FP32m")>;
-def: InstRW<[HWWriteResGroup62], (instregex "IST_FP64m")>;
-
-def HWWriteResGroup63 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
- let Latency = 10;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[HWWriteResGroup63], (instregex "VPSLLVDYrm")>;
-def: InstRW<[HWWriteResGroup63], (instregex "VPSRAVDYrm")>;
-def: InstRW<[HWWriteResGroup63], (instregex "VPSRLVDYrm")>;
-
-def HWWriteResGroup63_1 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
- let Latency = 9;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[HWWriteResGroup63_1], (instregex "VPSLLVDrm")>;
-def: InstRW<[HWWriteResGroup63_1], (instregex "VPSRAVDrm")>;
-def: InstRW<[HWWriteResGroup63_1], (instregex "VPSRLVDrm")>;
-
-def HWWriteResGroup64 : SchedWriteRes<[HWPort5,HWPort23,HWPort15]> {
- let Latency = 8;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHADDSWrm64")>;
-def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHADDWrm64")>;
-def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHADDrm64")>;
-def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHSUBDrm64")>;
-def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHSUBSWrm64")>;
-def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHSUBWrm64")>;
-
-def HWWriteResGroup64_1 : SchedWriteRes<[HWPort5,HWPort23,HWPort15]> {
- let Latency = 10;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[HWWriteResGroup64_1], (instregex "VPHADDDYrm")>;
-def: InstRW<[HWWriteResGroup64_1], (instregex "VPHADDSWrm256")>;
-def: InstRW<[HWWriteResGroup64_1], (instregex "VPHADDWYrm")>;
-def: InstRW<[HWWriteResGroup64_1], (instregex "VPHSUBDYrm")>;
-def: InstRW<[HWWriteResGroup64_1], (instregex "VPHSUBSWrm256")>;
-def: InstRW<[HWWriteResGroup64_1], (instregex "VPHSUBWYrm")>;
-
-def HWWriteResGroup64_2 : SchedWriteRes<[HWPort5,HWPort23,HWPort15]> {
- let Latency = 9;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[HWWriteResGroup64_2], (instregex "PHADDDrm")>;
-def: InstRW<[HWWriteResGroup64_2], (instregex "PHADDSWrm128")>;
-def: InstRW<[HWWriteResGroup64_2], (instregex "PHADDWrm")>;
-def: InstRW<[HWWriteResGroup64_2], (instregex "PHSUBDrm")>;
-def: InstRW<[HWWriteResGroup64_2], (instregex "PHSUBSWrm128")>;
-def: InstRW<[HWWriteResGroup64_2], (instregex "PHSUBWrm")>;
-def: InstRW<[HWWriteResGroup64_2], (instregex "VPHADDDrm")>;
-def: InstRW<[HWWriteResGroup64_2], (instregex "VPHADDSWrm128")>;
-def: InstRW<[HWWriteResGroup64_2], (instregex "VPHADDWrm")>;
-def: InstRW<[HWWriteResGroup64_2], (instregex "VPHSUBDrm")>;
-def: InstRW<[HWWriteResGroup64_2], (instregex "VPHSUBSWrm128")>;
-def: InstRW<[HWWriteResGroup64_2], (instregex "VPHSUBWrm")>;
-
-def HWWriteResGroup65 : SchedWriteRes<[HWPort23,HWPort06,HWPort0156]> {
- let Latency = 8;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,2];
-}
-def: InstRW<[HWWriteResGroup65], (instregex "CMOVA(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup65], (instregex "CMOVBE(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup62], (instregex "IST(T?)_FP(16|32|64)m",
+ "IST_F(16|32)m")>;
def HWWriteResGroup66 : SchedWriteRes<[HWPort23,HWPort237,HWPort06,HWPort0156]> {
let Latency = 9;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,2];
}
-def: InstRW<[HWWriteResGroup66], (instregex "RCL(16|32|64)m1")>;
-def: InstRW<[HWWriteResGroup66], (instregex "RCL(16|32|64)mi")>;
-def: InstRW<[HWWriteResGroup66], (instregex "RCL8m1")>;
-def: InstRW<[HWWriteResGroup66], (instregex "RCL8mi")>;
-def: InstRW<[HWWriteResGroup66], (instregex "RCR(16|32|64)m1")>;
-def: InstRW<[HWWriteResGroup66], (instregex "RCR(16|32|64)mi")>;
-def: InstRW<[HWWriteResGroup66], (instregex "RCR8m1")>;
-def: InstRW<[HWWriteResGroup66], (instregex "RCR8mi")>;
+def: InstRW<[HWWriteResGroup66], (instregex "RCL(8|16|32|64)m1",
+ "RCL(8|16|32|64)mi",
+ "RCR(8|16|32|64)m1",
+ "RCR(8|16|32|64)mi")>;
def HWWriteResGroup67 : SchedWriteRes<[HWPort23,HWPort237,HWPort06,HWPort0156]> {
let Latency = 9;
let NumMicroOps = 5;
let ResourceCycles = [1,1,2,1];
}
-def: InstRW<[HWWriteResGroup67], (instregex "ROR(16|32|64)mCL")>;
-def: InstRW<[HWWriteResGroup67], (instregex "ROR8mCL")>;
+def: InstRW<[HWWriteResGroup67], (instregex "ROR(8|16|32|64)mCL")>;
def HWWriteResGroup68 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0156]> {
let Latency = 9;
let NumMicroOps = 6;
let ResourceCycles = [1,1,1,3];
}
-def: InstRW<[HWWriteResGroup68], (instregex "ADC(16|32|64)mi")>;
-def: InstRW<[HWWriteResGroup68], (instregex "ADC8mi")>;
-def: InstRW<[HWWriteResGroup68], (instregex "ADD8mi")>;
-def: InstRW<[HWWriteResGroup68], (instregex "AND8mi")>;
-def: InstRW<[HWWriteResGroup68], (instregex "OR8mi")>;
-def: InstRW<[HWWriteResGroup68], (instregex "SUB8mi")>;
-def: InstRW<[HWWriteResGroup68], (instregex "XCHG(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup68], (instregex "XCHG8rm")>;
-def: InstRW<[HWWriteResGroup68], (instregex "XOR8mi")>;
+def: InstRW<[HWWriteResGroup68], (instregex "XCHG(8|16|32|64)rm")>;
def HWWriteResGroup69 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06,HWPort0156]> {
let Latency = 9;
let NumMicroOps = 6;
let ResourceCycles = [1,1,1,2,1];
}
-def: InstRW<[HWWriteResGroup69], (instregex "ADC(16|32|64)mr")>;
-def: InstRW<[HWWriteResGroup69], (instregex "ADC8mr")>;
-def: InstRW<[HWWriteResGroup69], (instregex "CMPXCHG(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup69], (instregex "CMPXCHG8rm")>;
-def: InstRW<[HWWriteResGroup69], (instregex "ROL(16|32|64)mCL")>;
-def: InstRW<[HWWriteResGroup69], (instregex "ROL8mCL")>;
-def: InstRW<[HWWriteResGroup69], (instregex "SAR(16|32|64)mCL")>;
-def: InstRW<[HWWriteResGroup69], (instregex "SAR8mCL")>;
-def: InstRW<[HWWriteResGroup69], (instregex "SBB(16|32|64)mi")>;
-def: InstRW<[HWWriteResGroup69], (instregex "SBB(16|32|64)mr")>;
-def: InstRW<[HWWriteResGroup69], (instregex "SBB8mi")>;
-def: InstRW<[HWWriteResGroup69], (instregex "SBB8mr")>;
-def: InstRW<[HWWriteResGroup69], (instregex "SHL(16|32|64)mCL")>;
-def: InstRW<[HWWriteResGroup69], (instregex "SHL8mCL")>;
-def: InstRW<[HWWriteResGroup69], (instregex "SHR(16|32|64)mCL")>;
-def: InstRW<[HWWriteResGroup69], (instregex "SHR8mCL")>;
+def: InstRW<[HWWriteResGroup69], (instregex "CMPXCHG(8|16|32|64)rm",
+ "ROL(8|16|32|64)mCL",
+ "SAR(8|16|32|64)mCL",
+ "SHL(8|16|32|64)mCL",
+ "SHR(8|16|32|64)mCL")>;
+def: SchedAlias<WriteADCRMW, HWWriteResGroup69>;
def HWWriteResGroup70 : SchedWriteRes<[HWPort0,HWPort1]> {
let Latency = 4;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup70], (instregex "CVTSD2SI64rr")>;
-def: InstRW<[HWWriteResGroup70], (instregex "CVTSD2SIrr")>;
-def: InstRW<[HWWriteResGroup70], (instregex "CVTSS2SI64rr")>;
-def: InstRW<[HWWriteResGroup70], (instregex "CVTSS2SIrr")>;
-def: InstRW<[HWWriteResGroup70], (instregex "CVTTSD2SI64rr")>;
-def: InstRW<[HWWriteResGroup70], (instregex "CVTTSD2SIrr")>;
-def: InstRW<[HWWriteResGroup70], (instregex "CVTTSS2SI64rr")>;
-def: InstRW<[HWWriteResGroup70], (instregex "CVTTSS2SIrr")>;
-def: InstRW<[HWWriteResGroup70], (instregex "VCVTSD2SI64rr")>;
-def: InstRW<[HWWriteResGroup70], (instregex "VCVTSD2SIrr")>;
-def: InstRW<[HWWriteResGroup70], (instregex "VCVTSS2SI64rr")>;
-def: InstRW<[HWWriteResGroup70], (instregex "VCVTSS2SIrr")>;
-def: InstRW<[HWWriteResGroup70], (instregex "VCVTTSD2SI64rr")>;
-def: InstRW<[HWWriteResGroup70], (instregex "VCVTTSD2SIrr")>;
-def: InstRW<[HWWriteResGroup70], (instregex "VCVTTSS2SI64rr")>;
-def: InstRW<[HWWriteResGroup70], (instregex "VCVTTSS2SIrr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "(V?)CVT(T?)SD2SI(64)?rr",
+ "(V?)CVT(T?)SS2SI(64)?rr")>;
def HWWriteResGroup71 : SchedWriteRes<[HWPort0,HWPort5]> {
let Latency = 4;
@@ -3076,104 +1398,60 @@ def HWWriteResGroup71 : SchedWriteRes<[HWPort0,HWPort5]> {
let ResourceCycles = [1,1];
}
def: InstRW<[HWWriteResGroup71], (instregex "VCVTPS2PDYrr")>;
-def: InstRW<[HWWriteResGroup71], (instregex "VPSLLDYrr")>;
-def: InstRW<[HWWriteResGroup71], (instregex "VPSLLQYrr")>;
-def: InstRW<[HWWriteResGroup71], (instregex "VPSLLWYrr")>;
-def: InstRW<[HWWriteResGroup71], (instregex "VPSRADYrr")>;
-def: InstRW<[HWWriteResGroup71], (instregex "VPSRAWYrr")>;
-def: InstRW<[HWWriteResGroup71], (instregex "VPSRLDYrr")>;
-def: InstRW<[HWWriteResGroup71], (instregex "VPSRLQYrr")>;
-def: InstRW<[HWWriteResGroup71], (instregex "VPSRLWYrr")>;
-def: InstRW<[HWWriteResGroup71], (instregex "VPTESTYrr")>;
def HWWriteResGroup72 : SchedWriteRes<[HWPort0,HWPort0156]> {
let Latency = 4;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup72], (instregex "FNSTSW16r")>;
+def: InstRW<[HWWriteResGroup72], (instrs FNSTSW16r)>;
def HWWriteResGroup73 : SchedWriteRes<[HWPort1,HWPort5]> {
let Latency = 4;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup73], (instregex "CVTDQ2PDrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "CVTPD2DQrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "CVTPD2PSrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "CVTSD2SSrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "CVTSI642SDrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "CVTSI2SDrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "CVTSI2SSrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "CVTTPD2DQrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTPD2PIirr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTPI2PDirr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTPS2PIirr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTTPD2PIirr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTTPS2PIirr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "VCVTDQ2PDrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "VCVTPD2DQrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "VCVTPD2PSrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "VCVTPS2PHrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "VCVTSD2SSrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "VCVTSI642SDrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "VCVTSI2SDrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "VCVTSI2SSrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "VCVTTPD2DQrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTPI2PDirr",
+ "MMX_CVT(T?)PD2PIirr",
+ "MMX_CVT(T?)PS2PIirr",
+ "(V?)CVTDQ2PDrr",
+ "(V?)CVTPD2PSrr",
+ "(V?)CVTSD2SSrr",
+ "(V?)CVTSI(64)?2SDrr",
+ "(V?)CVTSI2SSrr",
+ "(V?)CVT(T?)PD2DQrr")>;
def HWWriteResGroup74 : SchedWriteRes<[HWPort1,HWPort6]> {
let Latency = 4;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup74], (instregex "IMUL64r")>;
-def: InstRW<[HWWriteResGroup74], (instregex "MUL64r")>;
-def: InstRW<[HWWriteResGroup74], (instregex "MULX64rr")>;
+def: InstRW<[HWWriteResGroup74], (instrs IMUL64r, MUL64r, MULX64rr)>;
-def HWWriteResGroup74_16 : SchedWriteRes<[HWPort1, HWPort0156]> {
+def HWWriteResGroup74_16 : SchedWriteRes<[HWPort1, HWPort06, HWPort0156]> {
let Latency = 4;
let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
}
-def: InstRW<[HWWriteResGroup74_16], (instregex "IMUL16r")>;
-def: InstRW<[HWWriteResGroup74_16], (instregex "MUL16r")>;
-
-def HWWriteResGroup74_32 : SchedWriteRes<[HWPort1,HWPort0156]> {
- let Latency = 4;
- let NumMicroOps = 3;
-}
-def: InstRW<[HWWriteResGroup74_32], (instregex "IMUL32r")>;
-def: InstRW<[HWWriteResGroup74_32], (instregex "MUL32r")>;
+def: InstRW<[HWWriteResGroup74_16], (instrs IMUL16r, MUL16r)>;
def HWWriteResGroup75 : SchedWriteRes<[HWPort1,HWPort23]> {
let Latency = 11;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[HWWriteResGroup75], (instregex "FICOM16m")>;
-def: InstRW<[HWWriteResGroup75], (instregex "FICOM32m")>;
-def: InstRW<[HWWriteResGroup75], (instregex "FICOMP16m")>;
-def: InstRW<[HWWriteResGroup75], (instregex "FICOMP32m")>;
+def: InstRW<[HWWriteResGroup75], (instregex "FICOM(P?)(16|32)m")>;
def HWWriteResGroup76 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> {
let Latency = 9;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup76], (instregex "CVTSD2SI64rm")>;
-def: InstRW<[HWWriteResGroup76], (instregex "CVTSD2SIrm")>;
-def: InstRW<[HWWriteResGroup76], (instregex "CVTSS2SI64rm")>;
-def: InstRW<[HWWriteResGroup76], (instregex "CVTSS2SIrm")>;
-def: InstRW<[HWWriteResGroup76], (instregex "CVTTSD2SI64rm")>;
-def: InstRW<[HWWriteResGroup76], (instregex "CVTTSD2SIrm")>;
-def: InstRW<[HWWriteResGroup76], (instregex "CVTTSS2SIrm")>;
-def: InstRW<[HWWriteResGroup76], (instregex "VCVTSD2SI64rm")>;
-def: InstRW<[HWWriteResGroup76], (instregex "VCVTSD2SIrm")>;
-def: InstRW<[HWWriteResGroup76], (instregex "VCVTSS2SI64rm")>;
-def: InstRW<[HWWriteResGroup76], (instregex "VCVTSS2SIrm")>;
-def: InstRW<[HWWriteResGroup76], (instregex "VCVTTSD2SI64rm")>;
-def: InstRW<[HWWriteResGroup76], (instregex "VCVTTSD2SIrm")>;
-def: InstRW<[HWWriteResGroup76], (instregex "VCVTTSS2SI64rm")>;
-def: InstRW<[HWWriteResGroup76], (instregex "VCVTTSS2SIrm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "(V?)CVTSD2SI(64)?rm",
+ "(V?)CVTSS2SI(64)?rm",
+ "(V?)CVTTSD2SI(64)?rm",
+ "VCVTTSS2SI64rm",
+ "(V?)CVTTSS2SIrm")>;
def HWWriteResGroup77 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
let Latency = 10;
@@ -3182,65 +1460,51 @@ def HWWriteResGroup77 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
}
def: InstRW<[HWWriteResGroup77], (instregex "VCVTPS2PDYrm")>;
-def HWWriteResGroup77_1 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
- let Latency = 11;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[HWWriteResGroup77_1], (instregex "VPTESTYrm")>;
-
def HWWriteResGroup78 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> {
let Latency = 10;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup78], (instregex "CVTDQ2PDrm")>;
-def: InstRW<[HWWriteResGroup78], (instregex "CVTPD2DQrm")>;
-def: InstRW<[HWWriteResGroup78], (instregex "CVTPD2PSrm")>;
-def: InstRW<[HWWriteResGroup78], (instregex "CVTTPD2DQrm")>;
-def: InstRW<[HWWriteResGroup78], (instregex "MMX_CVTPD2PIirm")>;
-def: InstRW<[HWWriteResGroup78], (instregex "MMX_CVTTPD2PIirm")>;
-def: InstRW<[HWWriteResGroup78], (instregex "VCVTDQ2PDrm")>;
+def: InstRW<[HWWriteResGroup78], (instregex "CVTPD2PSrm",
+ "CVT(T?)PD2DQrm",
+ "MMX_CVT(T?)PD2PIirm",
+ "(V?)CVTDQ2PDrm")>;
def HWWriteResGroup78_1 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> {
let Latency = 9;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup78_1], (instregex "CVTSD2SSrm")>;
-def: InstRW<[HWWriteResGroup78_1], (instregex "MMX_CVTPI2PDirm")>;
-def: InstRW<[HWWriteResGroup78_1], (instregex "VCVTSD2SSrm")>;
+def: InstRW<[HWWriteResGroup78_1], (instregex "MMX_CVTPI2PDirm",
+ "(V?)CVTSD2SSrm")>;
def HWWriteResGroup79 : SchedWriteRes<[HWPort1,HWPort6,HWPort23]> {
let Latency = 9;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup79], (instregex "MULX64rm")>;
+def: InstRW<[HWWriteResGroup79], (instrs IMUL64m, MUL64m, MULX64rm)>;
def HWWriteResGroup80 : SchedWriteRes<[HWPort5,HWPort23,HWPort015]> {
let Latency = 9;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup80], (instregex "VPBROADCASTBYrm")>;
-def: InstRW<[HWWriteResGroup80], (instregex "VPBROADCASTBrm")>;
-def: InstRW<[HWWriteResGroup80], (instregex "VPBROADCASTWYrm")>;
-def: InstRW<[HWWriteResGroup80], (instregex "VPBROADCASTWrm")>;
+def: InstRW<[HWWriteResGroup80], (instregex "VPBROADCAST(B|W)(Y?)rm")>;
def HWWriteResGroup81 : SchedWriteRes<[HWPort0156]> {
let Latency = 4;
let NumMicroOps = 4;
let ResourceCycles = [4];
}
-def: InstRW<[HWWriteResGroup81], (instregex "FNCLEX")>;
+def: InstRW<[HWWriteResGroup81], (instrs FNCLEX)>;
def HWWriteResGroup82 : SchedWriteRes<[HWPort015,HWPort0156]> {
let Latency = 4;
let NumMicroOps = 4;
let ResourceCycles = [1,3];
}
-def: InstRW<[HWWriteResGroup82], (instregex "VZEROUPPER")>;
+def: InstRW<[HWWriteResGroup82], (instrs VZEROUPPER)>;
def HWWriteResGroup83 : SchedWriteRes<[HWPort1,HWPort6,HWPort0156]> {
let Latency = 4;
@@ -3249,256 +1513,58 @@ def HWWriteResGroup83 : SchedWriteRes<[HWPort1,HWPort6,HWPort0156]> {
}
def: InstRW<[HWWriteResGroup83], (instregex "LAR(16|32|64)rr")>;
-def HWWriteResGroup84 : SchedWriteRes<[HWPort0,HWPort4,HWPort237,HWPort15]> {
- let Latency = 5;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,1,1];
-}
-def: InstRW<[HWWriteResGroup84], (instregex "VMASKMOVPDYmr")>;
-def: InstRW<[HWWriteResGroup84], (instregex "VMASKMOVPDmr")>;
-def: InstRW<[HWWriteResGroup84], (instregex "VMASKMOVPSYmr")>;
-def: InstRW<[HWWriteResGroup84], (instregex "VMASKMOVPSmr")>;
-def: InstRW<[HWWriteResGroup84], (instregex "VPMASKMOVDYmr")>;
-def: InstRW<[HWWriteResGroup84], (instregex "VPMASKMOVDmr")>;
-def: InstRW<[HWWriteResGroup84], (instregex "VPMASKMOVQYmr")>;
-def: InstRW<[HWWriteResGroup84], (instregex "VPMASKMOVQmr")>;
-
-def HWWriteResGroup85 : SchedWriteRes<[HWPort1,HWPort4,HWPort5,HWPort237]> {
- let Latency = 5;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,1,1];
-}
-def: InstRW<[HWWriteResGroup85], (instregex "VCVTPS2PHmr")>;
-
def HWWriteResGroup86 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort0156]> {
let Latency = 10;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[HWWriteResGroup86], (instregex "SHLD(16|32|64)mri8")>;
-def: InstRW<[HWWriteResGroup86], (instregex "SHRD(16|32|64)mri8")>;
+def: InstRW<[HWWriteResGroup86], (instregex "SHLD(16|32|64)mri8",
+ "SHRD(16|32|64)mri8")>;
def HWWriteResGroup87 : SchedWriteRes<[HWPort1,HWPort6,HWPort23,HWPort0156]> {
let Latency = 9;
let NumMicroOps = 5;
let ResourceCycles = [1,2,1,1];
}
-def: InstRW<[HWWriteResGroup87], (instregex "LAR(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup87], (instregex "LSL(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup87], (instregex "LAR(16|32|64)rm",
+ "LSL(16|32|64)rm")>;
def HWWriteResGroup88 : SchedWriteRes<[HWPort4,HWPort237,HWPort0156]> {
let Latency = 5;
let NumMicroOps = 6;
let ResourceCycles = [1,1,4];
}
-def: InstRW<[HWWriteResGroup88], (instregex "PUSHF16")>;
-def: InstRW<[HWWriteResGroup88], (instregex "PUSHF64")>;
+def: InstRW<[HWWriteResGroup88], (instregex "PUSHF(16|64)")>;
def HWWriteResGroup89 : SchedWriteRes<[HWPort0]> {
let Latency = 5;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMADDUBSWrr64")>;
-def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMADDWDirr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMULHRSWrr64")>;
-def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMULHUWirr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMULHWirr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMULLWirr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMULUDQirr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "MMX_PSADBWirr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "MUL_FPrST0")>;
-def: InstRW<[HWWriteResGroup89], (instregex "MUL_FST0r")>;
-def: InstRW<[HWWriteResGroup89], (instregex "MUL_FrST0")>;
-def: InstRW<[HWWriteResGroup89], (instregex "PCMPGTQrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "PHMINPOSUWrr128")>;
-def: InstRW<[HWWriteResGroup89], (instregex "PMADDUBSWrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "PMADDWDrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "PMULDQrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "PMULHRSWrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "PMULHUWrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "PMULHWrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "PMULLWrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "PMULUDQrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "PSADBWrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "RCPPSr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "RCPSSr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "RSQRTPSr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "RSQRTSSr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPCMPGTQYrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPCMPGTQrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPHMINPOSUWrr128")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMADDUBSWYrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMADDUBSWrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMADDWDYrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMADDWDrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMULDQYrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMULDQrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMULHRSWYrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMULHRSWrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMULHUWYrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMULHUWrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMULHWYrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMULHWrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMULLWYrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMULLWrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMULUDQYrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMULUDQrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPSADBWYrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPSADBWrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VRCPPSr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VRCPSSr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VRSQRTPSr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VRSQRTSSr")>;
-
-def HWWriteResGroup90 : SchedWriteRes<[HWPort01]> {
- let Latency = 5;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[HWWriteResGroup90], (instregex "MULPDrr")>;
-def: InstRW<[HWWriteResGroup90], (instregex "MULPSrr")>;
-def: InstRW<[HWWriteResGroup90], (instregex "MULSDrr")>;
-def: InstRW<[HWWriteResGroup90], (instregex "MULSSrr")>;
-def: InstRW<[HWWriteResGroup90], (instregex "VMULPDYrr")>;
-def: InstRW<[HWWriteResGroup90], (instregex "VMULPDrr")>;
-def: InstRW<[HWWriteResGroup90], (instregex "VMULPSYrr")>;
-def: InstRW<[HWWriteResGroup90], (instregex "VMULPSrr")>;
-def: InstRW<[HWWriteResGroup90], (instregex "VMULSDrr")>;
-def: InstRW<[HWWriteResGroup90], (instregex "VMULSSrr")>;
-def: InstRW<[HWWriteResGroup90],
- (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)(Y)?r",
- "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)r")>;
-
-def HWWriteResGroup91 : SchedWriteRes<[HWPort0,HWPort23]> {
- let Latency = 10;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMADDUBSWrm64")>;
-def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMADDWDirm")>;
-def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMULHRSWrm64")>;
-def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMULHUWirm")>;
-def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMULHWirm")>;
-def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMULLWirm")>;
-def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMULUDQirm")>;
-def: InstRW<[HWWriteResGroup91], (instregex "MMX_PSADBWirm")>;
-def: InstRW<[HWWriteResGroup91], (instregex "RCPSSm")>;
-def: InstRW<[HWWriteResGroup91], (instregex "RSQRTSSm")>;
-def: InstRW<[HWWriteResGroup91], (instregex "VRCPSSm")>;
-def: InstRW<[HWWriteResGroup91], (instregex "VRSQRTSSm")>;
-
-def HWWriteResGroup91_1 : SchedWriteRes<[HWPort0,HWPort23]> {
- let Latency = 18;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup91_1], (instregex "SQRTSSm")>;
-def: InstRW<[HWWriteResGroup91_1], (instregex "VDIVSSrm")>;
+def: InstRW<[HWWriteResGroup89], (instregex "(V?)PCMPGTQ(Y?)rr",
+ "MUL_(FPrST0|FST0r|FrST0)")>;
def HWWriteResGroup91_2 : SchedWriteRes<[HWPort0,HWPort23]> {
let Latency = 11;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup91_2], (instregex "PCMPGTQrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "PHMINPOSUWrm128")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "PMADDUBSWrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "PMADDWDrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "PMULDQrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "PMULHRSWrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "PMULHUWrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "PMULHWrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "PMULLWrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "PMULUDQrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "PSADBWrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "RCPPSm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "RSQRTPSm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "VPCMPGTQrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "VPHMINPOSUWrm128")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "VPMADDUBSWrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "VPMADDWDrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "VPMULDQrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "VPMULHRSWrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "VPMULHUWrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "VPMULHWrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "VPMULLWrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "VPMULUDQrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "VPSADBWrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "VRCPPSm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "VRSQRTPSm")>;
+def: InstRW<[HWWriteResGroup91_2], (instregex "(V?)PCMPGTQrm")>;
def HWWriteResGroup91_3 : SchedWriteRes<[HWPort0,HWPort23]> {
let Latency = 12;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup91_3], (instregex "MUL_F32m")>;
-def: InstRW<[HWWriteResGroup91_3], (instregex "MUL_F64m")>;
-def: InstRW<[HWWriteResGroup91_3], (instregex "VPCMPGTQYrm")>;
-def: InstRW<[HWWriteResGroup91_3], (instregex "VPMADDUBSWYrm")>;
-def: InstRW<[HWWriteResGroup91_3], (instregex "VPMADDWDYrm")>;
-def: InstRW<[HWWriteResGroup91_3], (instregex "VPMULDQYrm")>;
-def: InstRW<[HWWriteResGroup91_3], (instregex "VPMULHRSWYrm")>;
-def: InstRW<[HWWriteResGroup91_3], (instregex "VPMULHUWYrm")>;
-def: InstRW<[HWWriteResGroup91_3], (instregex "VPMULHWYrm")>;
-def: InstRW<[HWWriteResGroup91_3], (instregex "VPMULLWYrm")>;
-def: InstRW<[HWWriteResGroup91_3], (instregex "VPMULUDQYrm")>;
-def: InstRW<[HWWriteResGroup91_3], (instregex "VPSADBWYrm")>;
-
-def HWWriteResGroup92 : SchedWriteRes<[HWPort01,HWPort23]> {
- let Latency = 11;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup92], (instregex "MULPDrm")>;
-def: InstRW<[HWWriteResGroup92], (instregex "MULPSrm")>;
-def: InstRW<[HWWriteResGroup92], (instregex "VMULPDrm")>;
-def: InstRW<[HWWriteResGroup92], (instregex "VMULPSrm")>;
-def: InstRW<[HWWriteResGroup92],
- (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)m")>;
-
-def HWWriteResGroup92_1 : SchedWriteRes<[HWPort01,HWPort23]> {
- let Latency = 12;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup92_1], (instregex "VMULPDYrm")>;
-def: InstRW<[HWWriteResGroup92_1], (instregex "VMULPSYrm")>;
-def: InstRW<[HWWriteResGroup92_1],
- (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Ym")>;
-
-def HWWriteResGroup92_2 : SchedWriteRes<[HWPort01,HWPort23]> {
- let Latency = 10;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup92_2], (instregex "MULSDrm")>;
-def: InstRW<[HWWriteResGroup92_2], (instregex "MULSSrm")>;
-def: InstRW<[HWWriteResGroup92_2], (instregex "VMULSDrm")>;
-def: InstRW<[HWWriteResGroup92_2], (instregex "VMULSSrm")>;
-def: InstRW<[HWWriteResGroup92_2],
- (instregex "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)m")>;
+def: InstRW<[HWWriteResGroup91_3], (instregex "MUL_F(32|64)m",
+ "VPCMPGTQYrm")>;
def HWWriteResGroup93 : SchedWriteRes<[HWPort1,HWPort5]> {
let Latency = 5;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[HWWriteResGroup93], (instregex "CVTSI642SSrr")>;
-def: InstRW<[HWWriteResGroup93], (instregex "HADDPDrr")>;
-def: InstRW<[HWWriteResGroup93], (instregex "HADDPSrr")>;
-def: InstRW<[HWWriteResGroup93], (instregex "HSUBPDrr")>;
-def: InstRW<[HWWriteResGroup93], (instregex "HSUBPSrr")>;
-def: InstRW<[HWWriteResGroup93], (instregex "VCVTSI642SSrr")>;
-def: InstRW<[HWWriteResGroup93], (instregex "VHADDPDYrr")>;
-def: InstRW<[HWWriteResGroup93], (instregex "VHADDPDrr")>;
-def: InstRW<[HWWriteResGroup93], (instregex "VHADDPSYrr")>;
-def: InstRW<[HWWriteResGroup93], (instregex "VHADDPSrr")>;
-def: InstRW<[HWWriteResGroup93], (instregex "VHSUBPDYrr")>;
-def: InstRW<[HWWriteResGroup93], (instregex "VHSUBPDrr")>;
-def: InstRW<[HWWriteResGroup93], (instregex "VHSUBPSYrr")>;
-def: InstRW<[HWWriteResGroup93], (instregex "VHSUBPSrr")>;
+def: InstRW<[HWWriteResGroup93], (instregex "(V?)CVTSI642SSrr")>;
def HWWriteResGroup94 : SchedWriteRes<[HWPort1,HWPort6,HWPort06]> {
let Latency = 5;
@@ -3508,35 +1574,11 @@ def HWWriteResGroup94 : SchedWriteRes<[HWPort1,HWPort6,HWPort06]> {
def: InstRW<[HWWriteResGroup94], (instregex "STR(16|32|64)r")>;
def HWWriteResGroup95 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> {
- let Latency = 5;
+ let Latency = 4;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup95], (instregex "MULX32rr")>;
-
-def HWWriteResGroup96 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> {
- let Latency = 11;
- let NumMicroOps = 4;
- let ResourceCycles = [1,2,1];
-}
-def: InstRW<[HWWriteResGroup96], (instregex "HADDPDrm")>;
-def: InstRW<[HWWriteResGroup96], (instregex "HADDPSrm")>;
-def: InstRW<[HWWriteResGroup96], (instregex "HSUBPDrm")>;
-def: InstRW<[HWWriteResGroup96], (instregex "HSUBPSrm")>;
-def: InstRW<[HWWriteResGroup96], (instregex "VHADDPDrm")>;
-def: InstRW<[HWWriteResGroup96], (instregex "VHADDPSrm")>;
-def: InstRW<[HWWriteResGroup96], (instregex "VHSUBPDrm")>;
-def: InstRW<[HWWriteResGroup96], (instregex "VHSUBPSrm")>;
-
-def HWWriteResGroup96_1 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> {
- let Latency = 12;
- let NumMicroOps = 4;
- let ResourceCycles = [1,2,1];
-}
-def: InstRW<[HWWriteResGroup96_1], (instregex "VHADDPDYrm")>;
-def: InstRW<[HWWriteResGroup96_1], (instregex "VHADDPSYrm")>;
-def: InstRW<[HWWriteResGroup96_1], (instregex "VHSUBPDYrm")>;
-def: InstRW<[HWWriteResGroup96_1], (instregex "VHSUBPSYrm")>;
+def: InstRW<[HWWriteResGroup95], (instrs IMUL32r, MUL32r, MULX32rr)>;
def HWWriteResGroup97 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> {
let Latency = 10;
@@ -3546,72 +1588,48 @@ def HWWriteResGroup97 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> {
def: InstRW<[HWWriteResGroup97], (instregex "CVTTSS2SI64rm")>;
def HWWriteResGroup98 : SchedWriteRes<[HWPort1,HWPort23,HWPort06,HWPort0156]> {
- let Latency = 10;
+ let Latency = 9;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[HWWriteResGroup98], (instregex "MULX32rm")>;
+def: InstRW<[HWWriteResGroup98], (instrs IMUL32m, MUL32m, MULX32rm)>;
def HWWriteResGroup99 : SchedWriteRes<[HWPort6,HWPort0156]> {
let Latency = 5;
let NumMicroOps = 5;
let ResourceCycles = [1,4];
}
-def: InstRW<[HWWriteResGroup99], (instregex "PAUSE")>;
+def: InstRW<[HWWriteResGroup99], (instrs PAUSE)>;
def HWWriteResGroup100 : SchedWriteRes<[HWPort06,HWPort0156]> {
let Latency = 5;
let NumMicroOps = 5;
let ResourceCycles = [1,4];
}
-def: InstRW<[HWWriteResGroup100], (instregex "XSETBV")>;
+def: InstRW<[HWWriteResGroup100], (instrs XSETBV)>;
def HWWriteResGroup101 : SchedWriteRes<[HWPort06,HWPort0156]> {
let Latency = 5;
let NumMicroOps = 5;
let ResourceCycles = [2,3];
}
-def: InstRW<[HWWriteResGroup101], (instregex "CMPXCHG(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup101], (instregex "CMPXCHG8rr")>;
+def: InstRW<[HWWriteResGroup101], (instregex "CMPXCHG(8|16|32|64)rr")>;
def HWWriteResGroup102 : SchedWriteRes<[HWPort1,HWPort5]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup102], (instregex "VCVTDQ2PDYrr")>;
-def: InstRW<[HWWriteResGroup102], (instregex "VCVTPD2DQYrr")>;
-def: InstRW<[HWWriteResGroup102], (instregex "VCVTPD2PSYrr")>;
-def: InstRW<[HWWriteResGroup102], (instregex "VCVTPS2PHYrr")>;
-def: InstRW<[HWWriteResGroup102], (instregex "VCVTTPD2DQYrr")>;
+def: InstRW<[HWWriteResGroup102], (instregex "VCVTDQ2PDYrr",
+ "VCVTPD2PSYrr",
+ "VCVT(T?)PD2DQYrr")>;
def HWWriteResGroup103 : SchedWriteRes<[HWPort1,HWPort23]> {
let Latency = 13;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[HWWriteResGroup103], (instregex "ADD_FI16m")>;
-def: InstRW<[HWWriteResGroup103], (instregex "ADD_FI32m")>;
-def: InstRW<[HWWriteResGroup103], (instregex "SUBR_FI16m")>;
-def: InstRW<[HWWriteResGroup103], (instregex "SUBR_FI32m")>;
-def: InstRW<[HWWriteResGroup103], (instregex "SUB_FI16m")>;
-def: InstRW<[HWWriteResGroup103], (instregex "SUB_FI32m")>;
-def: InstRW<[HWWriteResGroup103], (instregex "VROUNDYPDm")>;
-def: InstRW<[HWWriteResGroup103], (instregex "VROUNDYPSm")>;
-
-def HWWriteResGroup103_1 : SchedWriteRes<[HWPort1,HWPort23]> {
- let Latency = 12;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[HWWriteResGroup103_1], (instregex "ROUNDPDm")>;
-def: InstRW<[HWWriteResGroup103_1], (instregex "ROUNDPSm")>;
-def: InstRW<[HWWriteResGroup103_1], (instregex "ROUNDSDm")>;
-def: InstRW<[HWWriteResGroup103_1], (instregex "ROUNDSSm")>;
-def: InstRW<[HWWriteResGroup103_1], (instregex "VROUNDPDm")>;
-def: InstRW<[HWWriteResGroup103_1], (instregex "VROUNDPSm")>;
-def: InstRW<[HWWriteResGroup103_1], (instregex "VROUNDSDm")>;
-def: InstRW<[HWWriteResGroup103_1], (instregex "VROUNDSSm")>;
+def: InstRW<[HWWriteResGroup103], (instregex "(ADD|SUB|SUBR)_FI(16|32)m")>;
def HWWriteResGroup104 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> {
let Latency = 12;
@@ -3625,15 +1643,8 @@ def HWWriteResGroup105 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> {
let NumMicroOps = 4;
let ResourceCycles = [1,1,2];
}
-def: InstRW<[HWWriteResGroup105], (instregex "SHLD(16|32|64)rrCL")>;
-def: InstRW<[HWWriteResGroup105], (instregex "SHRD(16|32|64)rrCL")>;
-
-def HWWriteResGroup106 : SchedWriteRes<[HWPort1,HWPort4,HWPort5,HWPort237]> {
- let Latency = 7;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,1,1];
-}
-def: InstRW<[HWWriteResGroup106], (instregex "VCVTPS2PHYmr")>;
+def: InstRW<[HWWriteResGroup105], (instregex "SHLD(16|32|64)rrCL",
+ "SHRD(16|32|64)rrCL")>;
def HWWriteResGroup107 : SchedWriteRes<[HWPort1,HWPort6,HWPort06,HWPort0156]> {
let Latency = 6;
@@ -3647,212 +1658,44 @@ def HWWriteResGroup108 : SchedWriteRes<[HWPort6,HWPort0156]> {
let NumMicroOps = 6;
let ResourceCycles = [1,5];
}
-def: InstRW<[HWWriteResGroup108], (instregex "STD")>;
+def: InstRW<[HWWriteResGroup108], (instrs STD)>;
def HWWriteResGroup109 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort06,HWPort0156]> {
let Latency = 12;
let NumMicroOps = 6;
let ResourceCycles = [1,1,1,1,2];
}
-def: InstRW<[HWWriteResGroup109], (instregex "SHLD(16|32|64)mrCL")>;
-def: InstRW<[HWWriteResGroup109], (instregex "SHRD(16|32|64)mrCL")>;
-
-def HWWriteResGroup110 : SchedWriteRes<[HWPort5]> {
- let Latency = 7;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[HWWriteResGroup110], (instregex "AESDECLASTrr")>;
-def: InstRW<[HWWriteResGroup110], (instregex "AESDECrr")>;
-def: InstRW<[HWWriteResGroup110], (instregex "AESENCLASTrr")>;
-def: InstRW<[HWWriteResGroup110], (instregex "AESENCrr")>;
-def: InstRW<[HWWriteResGroup110], (instregex "VAESDECLASTrr")>;
-def: InstRW<[HWWriteResGroup110], (instregex "VAESDECrr")>;
-def: InstRW<[HWWriteResGroup110], (instregex "VAESENCLASTrr")>;
-def: InstRW<[HWWriteResGroup110], (instregex "VAESENCrr")>;
-
-def HWWriteResGroup111 : SchedWriteRes<[HWPort5,HWPort23]> {
- let Latency = 13;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup111], (instregex "AESDECLASTrm")>;
-def: InstRW<[HWWriteResGroup111], (instregex "AESDECrm")>;
-def: InstRW<[HWWriteResGroup111], (instregex "AESENCLASTrm")>;
-def: InstRW<[HWWriteResGroup111], (instregex "AESENCrm")>;
-def: InstRW<[HWWriteResGroup111], (instregex "VAESDECLASTrm")>;
-def: InstRW<[HWWriteResGroup111], (instregex "VAESDECrm")>;
-def: InstRW<[HWWriteResGroup111], (instregex "VAESENCLASTrm")>;
-def: InstRW<[HWWriteResGroup111], (instregex "VAESENCrm")>;
-
-def HWWriteResGroup112 : SchedWriteRes<[HWPort0,HWPort5]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[HWWriteResGroup112], (instregex "MPSADBWrri")>;
-def: InstRW<[HWWriteResGroup112], (instregex "VMPSADBWYrri")>;
-def: InstRW<[HWWriteResGroup112], (instregex "VMPSADBWrri")>;
-
-def HWWriteResGroup113 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
- let Latency = 13;
- let NumMicroOps = 4;
- let ResourceCycles = [1,2,1];
-}
-def: InstRW<[HWWriteResGroup113], (instregex "MPSADBWrmi")>;
-def: InstRW<[HWWriteResGroup113], (instregex "VMPSADBWrmi")>;
-
-def HWWriteResGroup113_1 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
- let Latency = 14;
- let NumMicroOps = 4;
- let ResourceCycles = [1,2,1];
-}
-def: InstRW<[HWWriteResGroup113_1], (instregex "VMPSADBWYrmi")>;
+def: InstRW<[HWWriteResGroup109], (instregex "SHLD(16|32|64)mrCL",
+ "SHRD(16|32|64)mrCL")>;
def HWWriteResGroup114 : SchedWriteRes<[HWPort6,HWPort06,HWPort15,HWPort0156]> {
let Latency = 7;
let NumMicroOps = 7;
let ResourceCycles = [2,2,1,2];
}
-def: InstRW<[HWWriteResGroup114], (instregex "LOOP")>;
+def: InstRW<[HWWriteResGroup114], (instrs LOOP)>;
def HWWriteResGroup115 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> {
let Latency = 15;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup115], (instregex "MUL_FI16m")>;
-def: InstRW<[HWWriteResGroup115], (instregex "MUL_FI32m")>;
-
-def HWWriteResGroup116 : SchedWriteRes<[HWPort0,HWPort1,HWPort5]> {
- let Latency = 9;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[HWWriteResGroup116], (instregex "DPPDrri")>;
-def: InstRW<[HWWriteResGroup116], (instregex "VDPPDrri")>;
-
-def HWWriteResGroup117 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> {
- let Latency = 15;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,1,1];
-}
-def: InstRW<[HWWriteResGroup117], (instregex "DPPDrmi")>;
-def: InstRW<[HWWriteResGroup117], (instregex "VDPPDrmi")>;
-
-def HWWriteResGroup118 : SchedWriteRes<[HWPort0]> {
- let Latency = 10;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[HWWriteResGroup118], (instregex "PMULLDrr")>;
-def: InstRW<[HWWriteResGroup118], (instregex "VPMULLDYrr")>;
-def: InstRW<[HWWriteResGroup118], (instregex "VPMULLDrr")>;
-
-def HWWriteResGroup119 : SchedWriteRes<[HWPort0,HWPort23]> {
- let Latency = 16;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[HWWriteResGroup119], (instregex "PMULLDrm")>;
-def: InstRW<[HWWriteResGroup119], (instregex "VPMULLDrm")>;
-
-def HWWriteResGroup119_1 : SchedWriteRes<[HWPort0,HWPort23]> {
- let Latency = 17;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[HWWriteResGroup119_1], (instregex "VPMULLDYrm")>;
+def: InstRW<[HWWriteResGroup115], (instregex "MUL_FI(16|32)m")>;
def HWWriteResGroup120 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort06,HWPort15,HWPort0156]> {
let Latency = 16;
let NumMicroOps = 10;
let ResourceCycles = [1,1,1,4,1,2];
}
-def: InstRW<[HWWriteResGroup120], (instregex "RCL(16|32|64)mCL")>;
-def: InstRW<[HWWriteResGroup120], (instregex "RCL8mCL")>;
-
-def HWWriteResGroup121 : SchedWriteRes<[HWPort0]> {
- let Latency = 11;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[HWWriteResGroup121], (instregex "DIVPSrr")>;
-def: InstRW<[HWWriteResGroup121], (instregex "DIVSSrr")>;
-
-def HWWriteResGroup122 : SchedWriteRes<[HWPort0,HWPort23]> {
- let Latency = 17;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup122], (instregex "DIVPSrm")>;
-
-def HWWriteResGroup122_1 : SchedWriteRes<[HWPort0,HWPort23]> {
- let Latency = 16;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup122_1], (instregex "DIVSSrm")>;
-
-def HWWriteResGroup123 : SchedWriteRes<[HWPort0]> {
- let Latency = 11;
- let NumMicroOps = 3;
- let ResourceCycles = [3];
-}
-def: InstRW<[HWWriteResGroup123], (instregex "PCMPISTRIrr")>;
-def: InstRW<[HWWriteResGroup123], (instregex "PCMPISTRM128rr")>;
-def: InstRW<[HWWriteResGroup123], (instregex "VPCMPISTRIrr")>;
-def: InstRW<[HWWriteResGroup123], (instregex "VPCMPISTRM128rr")>;
-
-def HWWriteResGroup124 : SchedWriteRes<[HWPort0,HWPort5]> {
- let Latency = 11;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[HWWriteResGroup124], (instregex "PCLMULQDQrr")>;
-def: InstRW<[HWWriteResGroup124], (instregex "VPCLMULQDQrr")>;
-
-def HWWriteResGroup125 : SchedWriteRes<[HWPort0,HWPort015]> {
- let Latency = 11;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[HWWriteResGroup125], (instregex "VRCPPSYr")>;
-def: InstRW<[HWWriteResGroup125], (instregex "VRSQRTPSYr")>;
-
-def HWWriteResGroup126 : SchedWriteRes<[HWPort0,HWPort23]> {
- let Latency = 17;
- let NumMicroOps = 4;
- let ResourceCycles = [3,1];
-}
-def: InstRW<[HWWriteResGroup126], (instregex "PCMPISTRIrm")>;
-def: InstRW<[HWWriteResGroup126], (instregex "PCMPISTRM128rm")>;
-def: InstRW<[HWWriteResGroup126], (instregex "VPCMPISTRIrm")>;
-def: InstRW<[HWWriteResGroup126], (instregex "VPCMPISTRM128rm")>;
-
-def HWWriteResGroup127 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
- let Latency = 17;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[HWWriteResGroup127], (instregex "PCLMULQDQrm")>;
-def: InstRW<[HWWriteResGroup127], (instregex "VPCLMULQDQrm")>;
-
-def HWWriteResGroup128 : SchedWriteRes<[HWPort0,HWPort23,HWPort015]> {
- let Latency = 18;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[HWWriteResGroup128], (instregex "VRCPPSYm")>;
-def: InstRW<[HWWriteResGroup128], (instregex "VRSQRTPSYm")>;
+def: InstRW<[HWWriteResGroup120], (instregex "RCL(8|16|32|64)mCL")>;
def HWWriteResGroup129 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> {
let Latency = 11;
let NumMicroOps = 7;
let ResourceCycles = [2,2,3];
}
-def: InstRW<[HWWriteResGroup129], (instregex "RCL(16|32|64)rCL")>;
-def: InstRW<[HWWriteResGroup129], (instregex "RCR(16|32|64)rCL")>;
+def: InstRW<[HWWriteResGroup129], (instregex "RCL(16|32|64)rCL",
+ "RCR(16|32|64)rCL")>;
def HWWriteResGroup130 : SchedWriteRes<[HWPort1,HWPort06,HWPort15,HWPort0156]> {
let Latency = 11;
@@ -3866,101 +1709,21 @@ def HWWriteResGroup131 : SchedWriteRes<[HWPort06,HWPort0156]> {
let NumMicroOps = 11;
let ResourceCycles = [2,9];
}
-def: InstRW<[HWWriteResGroup131], (instregex "LOOPE")>;
-def: InstRW<[HWWriteResGroup131], (instregex "LOOPNE")>;
+def: InstRW<[HWWriteResGroup131], (instrs LOOPE, LOOPNE)>;
def HWWriteResGroup132 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06,HWPort15,HWPort0156]> {
let Latency = 17;
let NumMicroOps = 14;
let ResourceCycles = [1,1,1,4,2,5];
}
-def: InstRW<[HWWriteResGroup132], (instregex "CMPXCHG8B")>;
-
-def HWWriteResGroup133 : SchedWriteRes<[HWPort0]> {
- let Latency = 13;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[HWWriteResGroup133], (instregex "SQRTPSr")>;
-def: InstRW<[HWWriteResGroup133], (instregex "SQRTSSr")>;
-def: InstRW<[HWWriteResGroup133], (instregex "VDIVPSrr")>;
-def: InstRW<[HWWriteResGroup133], (instregex "VDIVSSrr")>;
-
-def HWWriteResGroup134 : SchedWriteRes<[HWPort0,HWPort23]> {
- let Latency = 19;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup134], (instregex "DIVSDrm")>;
-def: InstRW<[HWWriteResGroup134], (instregex "SQRTPSm")>;
-def: InstRW<[HWWriteResGroup134], (instregex "VDIVPSrm")>;
-def: InstRW<[HWWriteResGroup134], (instregex "VSQRTSSm")>;
+def: InstRW<[HWWriteResGroup132], (instrs CMPXCHG8B)>;
def HWWriteResGroup135 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort06,HWPort15,HWPort0156]> {
let Latency = 19;
let NumMicroOps = 11;
let ResourceCycles = [2,1,1,3,1,3];
}
-def: InstRW<[HWWriteResGroup135], (instregex "RCR(16|32|64)mCL")>;
-def: InstRW<[HWWriteResGroup135], (instregex "RCR8mCL")>;
-
-def HWWriteResGroup136 : SchedWriteRes<[HWPort0]> {
- let Latency = 14;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[HWWriteResGroup136], (instregex "DIVPDrr")>;
-def: InstRW<[HWWriteResGroup136], (instregex "DIVSDrr")>;
-def: InstRW<[HWWriteResGroup136], (instregex "VSQRTPSr")>;
-def: InstRW<[HWWriteResGroup136], (instregex "VSQRTSSr")>;
-
-def HWWriteResGroup137 : SchedWriteRes<[HWPort5]> {
- let Latency = 14;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[HWWriteResGroup137], (instregex "AESIMCrr")>;
-def: InstRW<[HWWriteResGroup137], (instregex "VAESIMCrr")>;
-
-def HWWriteResGroup138 : SchedWriteRes<[HWPort0,HWPort23]> {
- let Latency = 20;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup138], (instregex "DIVPDrm")>;
-def: InstRW<[HWWriteResGroup138], (instregex "VSQRTPSm")>;
-
-def HWWriteResGroup139 : SchedWriteRes<[HWPort5,HWPort23]> {
- let Latency = 20;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[HWWriteResGroup139], (instregex "AESIMCrm")>;
-def: InstRW<[HWWriteResGroup139], (instregex "VAESIMCrm")>;
-
-def HWWriteResGroup140 : SchedWriteRes<[HWPort0,HWPort1,HWPort5]> {
- let Latency = 14;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[HWWriteResGroup140], (instregex "DPPSrri")>;
-def: InstRW<[HWWriteResGroup140], (instregex "VDPPSYrri")>;
-def: InstRW<[HWWriteResGroup140], (instregex "VDPPSrri")>;
-
-def HWWriteResGroup141 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> {
- let Latency = 20;
- let NumMicroOps = 5;
- let ResourceCycles = [2,1,1,1];
-}
-def: InstRW<[HWWriteResGroup141], (instregex "DPPSrmi")>;
-def: InstRW<[HWWriteResGroup141], (instregex "VDPPSrmi")>;
-
-def HWWriteResGroup141_1 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> {
- let Latency = 21;
- let NumMicroOps = 5;
- let ResourceCycles = [2,1,1,1];
-}
-def: InstRW<[HWWriteResGroup141_1], (instregex "VDPPSYrmi")>;
+def: InstRW<[HWWriteResGroup135], (instregex "RCR(8|16|32|64)mCL")>;
def HWWriteResGroup142 : SchedWriteRes<[HWPort1,HWPort06,HWPort15,HWPort0156]> {
let Latency = 14;
@@ -3981,54 +1744,35 @@ def HWWriteResGroup144 : SchedWriteRes<[HWPort4,HWPort5,HWPort6,HWPort23,HWPort2
let NumMicroOps = 8;
let ResourceCycles = [1,1,1,1,1,1,2];
}
-def: InstRW<[HWWriteResGroup144], (instregex "INSB")>;
-def: InstRW<[HWWriteResGroup144], (instregex "INSL")>;
-def: InstRW<[HWWriteResGroup144], (instregex "INSW")>;
+def: InstRW<[HWWriteResGroup144], (instrs INSB, INSL, INSW)>;
def HWWriteResGroup145 : SchedWriteRes<[HWPort5]> {
let Latency = 16;
let NumMicroOps = 16;
let ResourceCycles = [16];
}
-def: InstRW<[HWWriteResGroup145], (instregex "VZEROALL")>;
+def: InstRW<[HWWriteResGroup145], (instrs VZEROALL)>;
def HWWriteResGroup146 : SchedWriteRes<[HWPort0,HWPort4,HWPort5,HWPort23,HWPort237,HWPort06,HWPort0156]> {
let Latency = 22;
let NumMicroOps = 19;
let ResourceCycles = [2,1,4,1,1,4,6];
}
-def: InstRW<[HWWriteResGroup146], (instregex "CMPXCHG16B")>;
+def: InstRW<[HWWriteResGroup146], (instrs CMPXCHG16B)>;
def HWWriteResGroup147 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort6,HWPort01,HWPort0156]> {
let Latency = 17;
let NumMicroOps = 15;
let ResourceCycles = [2,1,2,4,2,4];
}
-def: InstRW<[HWWriteResGroup147], (instregex "XCH_F")>;
-
-def HWWriteResGroup148 : SchedWriteRes<[HWPort0,HWPort5,HWPort0156]> {
- let Latency = 18;
- let NumMicroOps = 8;
- let ResourceCycles = [4,3,1];
-}
-def: InstRW<[HWWriteResGroup148], (instregex "PCMPESTRIrr")>;
-def: InstRW<[HWWriteResGroup148], (instregex "VPCMPESTRIrr")>;
+def: InstRW<[HWWriteResGroup147], (instrs XCH_F)>;
def HWWriteResGroup149 : SchedWriteRes<[HWPort5,HWPort6,HWPort06,HWPort0156]> {
let Latency = 18;
let NumMicroOps = 8;
let ResourceCycles = [1,1,1,5];
}
-def: InstRW<[HWWriteResGroup149], (instregex "CPUID")>;
-def: InstRW<[HWWriteResGroup149], (instregex "RDTSC")>;
-
-def HWWriteResGroup150 : SchedWriteRes<[HWPort0,HWPort5,HWPort23,HWPort0156]> {
- let Latency = 24;
- let NumMicroOps = 9;
- let ResourceCycles = [4,3,1,1];
-}
-def: InstRW<[HWWriteResGroup150], (instregex "PCMPESTRIrm")>;
-def: InstRW<[HWWriteResGroup150], (instregex "VPCMPESTRIrm")>;
+def: InstRW<[HWWriteResGroup149], (instrs CPUID, RDTSC)>;
def HWWriteResGroup151 : SchedWriteRes<[HWPort6,HWPort23,HWPort0156]> {
let Latency = 23;
@@ -4037,240 +1781,127 @@ def HWWriteResGroup151 : SchedWriteRes<[HWPort6,HWPort23,HWPort0156]> {
}
def: InstRW<[HWWriteResGroup151], (instregex "XRSTOR(64)?")>;
-def HWWriteResGroup152 : SchedWriteRes<[HWPort0,HWPort5,HWPort015,HWPort0156]> {
- let Latency = 19;
- let NumMicroOps = 9;
- let ResourceCycles = [4,3,1,1];
-}
-def: InstRW<[HWWriteResGroup152], (instregex "PCMPESTRM128rr")>;
-def: InstRW<[HWWriteResGroup152], (instregex "VPCMPESTRM128rr")>;
-
-def HWWriteResGroup153 : SchedWriteRes<[HWPort0,HWPort5,HWPort23,HWPort015,HWPort0156]> {
- let Latency = 25;
- let NumMicroOps = 10;
- let ResourceCycles = [4,3,1,1,1];
-}
-def: InstRW<[HWWriteResGroup153], (instregex "PCMPESTRM128rm")>;
-def: InstRW<[HWWriteResGroup153], (instregex "VPCMPESTRM128rm")>;
-
def HWWriteResGroup154 : SchedWriteRes<[HWPort0]> {
let Latency = 20;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup154], (instregex "DIV_FPrST0")>;
-def: InstRW<[HWWriteResGroup154], (instregex "DIV_FST0r")>;
-def: InstRW<[HWWriteResGroup154], (instregex "DIV_FrST0")>;
-def: InstRW<[HWWriteResGroup154], (instregex "SQRTPDr")>;
-def: InstRW<[HWWriteResGroup154], (instregex "SQRTSDr")>;
-def: InstRW<[HWWriteResGroup154], (instregex "VDIVPDrr")>;
-def: InstRW<[HWWriteResGroup154], (instregex "VDIVSDrr")>;
+def: InstRW<[HWWriteResGroup154], (instregex "DIV_(FPrST0|FST0r|FrST0)")>;
def HWWriteResGroup155 : SchedWriteRes<[HWPort0,HWPort23]> {
let Latency = 27;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup155], (instregex "DIVR_F32m")>;
-def: InstRW<[HWWriteResGroup155], (instregex "DIVR_F64m")>;
-def: InstRW<[HWWriteResGroup155], (instregex "VSQRTPDm")>;
-
-def HWWriteResGroup155_1 : SchedWriteRes<[HWPort0,HWPort23]> {
- let Latency = 26;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup155_1], (instregex "SQRTPDm")>;
-def: InstRW<[HWWriteResGroup155_1], (instregex "VDIVPDrm")>;
-def: InstRW<[HWWriteResGroup155_1], (instregex "VSQRTSDm")>;
-
-def HWWriteResGroup155_2 : SchedWriteRes<[HWPort0,HWPort23]> {
- let Latency = 25;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup155_2], (instregex "SQRTSDm")>;
-def: InstRW<[HWWriteResGroup155_2], (instregex "VDIVSDrm")>;
+def: InstRW<[HWWriteResGroup155], (instregex "DIVR_F(32|64)m")>;
def HWWriteResGroup156 : SchedWriteRes<[HWPort5,HWPort6,HWPort0156]> {
let Latency = 20;
let NumMicroOps = 10;
let ResourceCycles = [1,2,7];
}
-def: InstRW<[HWWriteResGroup156], (instregex "MWAITrr")>;
-
-def HWWriteResGroup157 : SchedWriteRes<[HWPort0]> {
- let Latency = 21;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[HWWriteResGroup157], (instregex "VSQRTPDr")>;
-def: InstRW<[HWWriteResGroup157], (instregex "VSQRTSDr")>;
-
-def HWWriteResGroup159 : SchedWriteRes<[HWPort0,HWPort015]> {
- let Latency = 21;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[HWWriteResGroup159], (instregex "VDIVPSYrr")>;
-def: InstRW<[HWWriteResGroup159], (instregex "VSQRTPSYr")>;
-
-def HWWriteResGroup160 : SchedWriteRes<[HWPort0,HWPort23,HWPort015]> {
- let Latency = 28;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[HWWriteResGroup160], (instregex "VDIVPSYrm")>;
-def: InstRW<[HWWriteResGroup160], (instregex "VSQRTPSYm")>;
+def: InstRW<[HWWriteResGroup156], (instrs MWAITrr)>;
def HWWriteResGroup161 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> {
let Latency = 30;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup161], (instregex "DIVR_FI16m")>;
-def: InstRW<[HWWriteResGroup161], (instregex "DIVR_FI32m")>;
+def: InstRW<[HWWriteResGroup161], (instregex "DIVR_FI(16|32)m")>;
def HWWriteResGroup162 : SchedWriteRes<[HWPort0]> {
let Latency = 24;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup162], (instregex "DIVR_FPrST0")>;
-def: InstRW<[HWWriteResGroup162], (instregex "DIVR_FST0r")>;
-def: InstRW<[HWWriteResGroup162], (instregex "DIVR_FrST0")>;
+def: InstRW<[HWWriteResGroup162], (instregex "DIVR_(FPrST0|FST0r|FrST0)")>;
def HWWriteResGroup163 : SchedWriteRes<[HWPort0,HWPort23]> {
let Latency = 31;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup163], (instregex "DIV_F32m")>;
-def: InstRW<[HWWriteResGroup163], (instregex "DIV_F64m")>;
+def: InstRW<[HWWriteResGroup163], (instregex "DIV_F(32|64)m")>;
def HWWriteResGroup164 : SchedWriteRes<[HWPort4,HWPort6,HWPort23,HWPort237,HWPort0156]> {
let Latency = 30;
let NumMicroOps = 27;
let ResourceCycles = [1,5,1,1,19];
}
-def: InstRW<[HWWriteResGroup164], (instregex "XSAVE64")>;
+def: InstRW<[HWWriteResGroup164], (instrs XSAVE64)>;
def HWWriteResGroup165 : SchedWriteRes<[HWPort4,HWPort6,HWPort23,HWPort237,HWPort0156]> {
let Latency = 31;
let NumMicroOps = 28;
let ResourceCycles = [1,6,1,1,19];
}
-def: InstRW<[HWWriteResGroup165], (instregex "XSAVE(OPT)?")>;
+def: InstRW<[HWWriteResGroup165], (instrs XSAVE)>;
+def: InstRW<[HWWriteResGroup165], (instregex "XSAVEC", "XSAVES", "XSAVEOPT")>;
def HWWriteResGroup166 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> {
let Latency = 34;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup166], (instregex "DIV_FI16m")>;
-def: InstRW<[HWWriteResGroup166], (instregex "DIV_FI32m")>;
-
-def HWWriteResGroup167 : SchedWriteRes<[HWPort0,HWPort5,HWPort23,HWPort015]> {
- let Latency = 34;
- let NumMicroOps = 11;
- let ResourceCycles = [2,7,1,1];
-}
-def: InstRW<[HWWriteResGroup167], (instregex "AESKEYGENASSIST128rm")>;
-def: InstRW<[HWWriteResGroup167], (instregex "VAESKEYGENASSIST128rm")>;
-
-def HWWriteResGroup168 : SchedWriteRes<[HWPort0,HWPort5,HWPort015]> {
- let Latency = 29;
- let NumMicroOps = 11;
- let ResourceCycles = [2,7,2];
-}
-def: InstRW<[HWWriteResGroup168], (instregex "AESKEYGENASSIST128rr")>;
-def: InstRW<[HWWriteResGroup168], (instregex "VAESKEYGENASSIST128rr")>;
+def: InstRW<[HWWriteResGroup166], (instregex "DIV_FI(16|32)m")>;
def HWWriteResGroup170 : SchedWriteRes<[HWPort5,HWPort6,HWPort23,HWPort06,HWPort0156]> {
let Latency = 35;
let NumMicroOps = 23;
let ResourceCycles = [1,5,3,4,10];
}
-def: InstRW<[HWWriteResGroup170], (instregex "IN(16|32)ri")>;
-def: InstRW<[HWWriteResGroup170], (instregex "IN(16|32)rr")>;
-def: InstRW<[HWWriteResGroup170], (instregex "IN8ri")>;
-def: InstRW<[HWWriteResGroup170], (instregex "IN8rr")>;
+def: InstRW<[HWWriteResGroup170], (instregex "IN(8|16|32)ri",
+ "IN(8|16|32)rr")>;
def HWWriteResGroup171 : SchedWriteRes<[HWPort5,HWPort6,HWPort23,HWPort237,HWPort06,HWPort0156]> {
let Latency = 36;
let NumMicroOps = 23;
let ResourceCycles = [1,5,2,1,4,10];
}
-def: InstRW<[HWWriteResGroup171], (instregex "OUT(16|32)ir")>;
-def: InstRW<[HWWriteResGroup171], (instregex "OUT(16|32)rr")>;
-def: InstRW<[HWWriteResGroup171], (instregex "OUT8ir")>;
-def: InstRW<[HWWriteResGroup171], (instregex "OUT8rr")>;
-
-def HWWriteResGroup172 : SchedWriteRes<[HWPort01,HWPort15,HWPort015,HWPort0156]> {
- let Latency = 31;
- let NumMicroOps = 31;
- let ResourceCycles = [8,1,21,1];
-}
-def: InstRW<[HWWriteResGroup172], (instregex "MMX_EMMS")>;
-
-def HWWriteResGroup173 : SchedWriteRes<[HWPort0,HWPort015]> {
- let Latency = 35;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[HWWriteResGroup173], (instregex "VDIVPDYrr")>;
-def: InstRW<[HWWriteResGroup173], (instregex "VSQRTPDYr")>;
-
-def HWWriteResGroup174 : SchedWriteRes<[HWPort0,HWPort23,HWPort015]> {
- let Latency = 42;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[HWWriteResGroup174], (instregex "VDIVPDYrm")>;
-def: InstRW<[HWWriteResGroup174], (instregex "VSQRTPDYm")>;
+def: InstRW<[HWWriteResGroup171], (instregex "OUT(8|16|32)ir",
+ "OUT(8|16|32)rr")>;
def HWWriteResGroup175 : SchedWriteRes<[HWPort1,HWPort4,HWPort5,HWPort6,HWPort23,HWPort237,HWPort15,HWPort0156]> {
let Latency = 41;
let NumMicroOps = 18;
let ResourceCycles = [1,1,2,3,1,1,1,8];
}
-def: InstRW<[HWWriteResGroup175], (instregex "VMCLEARm")>;
+def: InstRW<[HWWriteResGroup175], (instrs VMCLEARm)>;
def HWWriteResGroup176 : SchedWriteRes<[HWPort5,HWPort0156]> {
let Latency = 42;
let NumMicroOps = 22;
let ResourceCycles = [2,20];
}
-def: InstRW<[HWWriteResGroup176], (instregex "RDTSCP")>;
+def: InstRW<[HWWriteResGroup176], (instrs RDTSCP)>;
def HWWriteResGroup177 : SchedWriteRes<[HWPort0,HWPort01,HWPort23,HWPort05,HWPort06,HWPort015,HWPort0156]> {
let Latency = 61;
let NumMicroOps = 64;
let ResourceCycles = [2,2,8,1,10,2,39];
}
-def: InstRW<[HWWriteResGroup177], (instregex "FLDENVm")>;
-def: InstRW<[HWWriteResGroup177], (instregex "FLDENVm")>;
+def: InstRW<[HWWriteResGroup177], (instrs FLDENVm)>;
def HWWriteResGroup178 : SchedWriteRes<[HWPort0,HWPort6,HWPort23,HWPort05,HWPort06,HWPort15,HWPort0156]> {
let Latency = 64;
let NumMicroOps = 88;
let ResourceCycles = [4,4,31,1,2,1,45];
}
-def: InstRW<[HWWriteResGroup178], (instregex "FXRSTOR64")>;
+def: InstRW<[HWWriteResGroup178], (instrs FXRSTOR64)>;
def HWWriteResGroup179 : SchedWriteRes<[HWPort0,HWPort6,HWPort23,HWPort05,HWPort06,HWPort15,HWPort0156]> {
let Latency = 64;
let NumMicroOps = 90;
let ResourceCycles = [4,2,33,1,2,1,47];
}
-def: InstRW<[HWWriteResGroup179], (instregex "FXRSTOR")>;
+def: InstRW<[HWWriteResGroup179], (instrs FXRSTOR)>;
def HWWriteResGroup180 : SchedWriteRes<[HWPort5,HWPort01,HWPort0156]> {
let Latency = 75;
let NumMicroOps = 15;
let ResourceCycles = [6,3,6];
}
-def: InstRW<[HWWriteResGroup180], (instregex "FNINIT")>;
+def: InstRW<[HWWriteResGroup180], (instrs FNINIT)>;
def HWWriteResGroup181 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort6,HWPort01,HWPort0156]> {
let Latency = 98;
@@ -4291,8 +1922,7 @@ def HWWriteResGroup183 : SchedWriteRes<[HWPort0,HWPort1,HWPort4,HWPort5,HWPort6,
let NumMicroOps = 100;
let ResourceCycles = [9,9,11,8,1,11,21,30];
}
-def: InstRW<[HWWriteResGroup183], (instregex "FSTENVm")>;
-def: InstRW<[HWWriteResGroup183], (instregex "FSTENVm")>;
+def: InstRW<[HWWriteResGroup183], (instrs FSTENVm)>;
def HWWriteResGroup184 : SchedWriteRes<[HWPort0, HWPort5, HWPort15, HWPort015, HWPort06, HWPort23]> {
let Latency = 26;
@@ -4364,4 +1994,6 @@ def HWWriteResGroup192 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HW
def: InstRW<[HWWriteResGroup192], (instrs VGATHERQPSrm,
VGATHERDPSrm)>;
+def: InstRW<[WriteZero], (instrs CLC)>;
+
} // SchedModel
diff --git a/lib/Target/X86/X86SchedPredicates.td b/lib/Target/X86/X86SchedPredicates.td
new file mode 100644
index 000000000000..27aaeb193583
--- /dev/null
+++ b/lib/Target/X86/X86SchedPredicates.td
@@ -0,0 +1,49 @@
+//===-- X86SchedPredicates.td - X86 Scheduling Predicates --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines scheduling predicate definitions that are common to
+// all X86 subtargets.
+//
+//===----------------------------------------------------------------------===//
+
+// A predicate used to identify dependency-breaking instructions that clear the
+// content of the destination register. Note that this predicate only checks if
+// input registers are the same. This predicate doesn't make any assumptions
+// about the expected instruction opcodes, because different processors may
+// implement different zero-idioms.
+def ZeroIdiomPredicate : CheckSameRegOperand<1, 2>;
+
+// A predicate used to check if an instruction is a LEA, and if it uses all
+// three source operands: base, index, and offset.
+def IsThreeOperandsLEAPredicate: CheckAll<[
+ CheckOpcode<[LEA32r, LEA64r, LEA64_32r, LEA16r]>,
+
+ // isRegOperand(Base)
+ CheckIsRegOperand<1>,
+ CheckNot<CheckInvalidRegOperand<1>>,
+
+ // isRegOperand(Index)
+ CheckIsRegOperand<3>,
+ CheckNot<CheckInvalidRegOperand<3>>,
+
+ // hasLEAOffset(Offset)
+ CheckAny<[
+ CheckAll<[
+ CheckIsImmOperand<4>,
+ CheckNot<CheckZeroOperand<4>>
+ ]>,
+ CheckNonPortable<"MI.getOperand(4).isGlobal()">
+ ]>
+]>;
+
+// This predicate evaluates to true only if the input machine instruction is a
+// 3-operand LEA. TableGen automatically generates a new method for it in
+// X86GenInstrInfo.
+def IsThreeOperandsLEAFn :
+ TIIPredicate<"X86", "isThreeOperandsLEA", IsThreeOperandsLEAPredicate>;
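A minimal usage sketch for the predicate above: a subtarget scheduling model can wrap ZeroIdiomPredicate in an MCSchedPredicate inside a SchedWriteVariant, so that dependency-breaking idioms resolve to WriteZero while everything else keeps its normal write class. The SBWriteZeroIdiom name and the opcode list here are illustrative placeholders, not definitions taken from this change:

def SBWriteZeroIdiom : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [WriteZero]>,  // src regs match
  SchedVar<NoSchedPred,                          [WriteALU]>    // otherwise plain ALU
]>;
def : InstRW<[SBWriteZeroIdiom], (instrs XOR32rr, XOR64rr)>;    // placeholder opcodes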
diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td
index 4466d30f14c7..3b543c680ef4 100644
--- a/lib/Target/X86/X86SchedSandyBridge.td
+++ b/lib/Target/X86/X86SchedSandyBridge.td
@@ -10,6 +10,10 @@
// This file defines the machine model for Sandy Bridge to support instruction
// scheduling and other instruction cost heuristics.
//
+// Note that we define some instructions here that are not supported by SNB,
+// but we still have to define them because SNB is the default subtarget for
+// X86. These instructions are tagged with an `Unsupported = 1` comment.
+//
//===----------------------------------------------------------------------===//
def SandyBridgeModel : SchedMachineModel {
@@ -18,7 +22,7 @@ def SandyBridgeModel : SchedMachineModel {
// FIXME: Identify instructions that aren't a single fused micro-op.
let IssueWidth = 4;
let MicroOpBufferSize = 168; // Based on the reorder buffer.
- let LoadLatency = 4;
+ let LoadLatency = 5;
let MispredictPenalty = 16;
// Based on the LSD (loop-stream detector) queue size.
@@ -60,10 +64,12 @@ def SBPortAny : ProcResGroup<[SBPort0, SBPort1, SBPort23, SBPort4, SBPort5]> {
// Integer division issued on port 0.
def SBDivider : ProcResource<1>;
+// FP division and sqrt on port 0.
+def SBFPDivider : ProcResource<1>;
-// Loads are 4 cycles, so ReadAfterLd registers needn't be available until 4
+// Loads are 5 cycles, so ReadAfterLd registers needn't be available until 5
// cycles after the memory operand.
-def : ReadAdvance<ReadAfterLd, 4>;
+def : ReadAdvance<ReadAfterLd, 5>;
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
@@ -71,129 +77,362 @@ def : ReadAdvance<ReadAfterLd, 4>;
// This multiclass defines the resource usage for variants with and without
// folded loads.
multiclass SBWriteResPair<X86FoldableSchedWrite SchedRW,
- ProcResourceKind ExePort,
- int Lat> {
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [1], int UOps = 1,
+ int LoadLat = 5> {
// Register variant is using a single cycle on ExePort.
- def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
- // Memory variant also uses a cycle on port 2/3 and adds 4 cycles to the
- // latency.
- def : WriteRes<SchedRW.Folded, [SBPort23, ExePort]> {
- let Latency = !add(Lat, 4);
+ // Memory variant also uses a cycle on port 2/3 and adds LoadLat cycles to
+ // the latency (default = 5).
+ def : WriteRes<SchedRW.Folded, !listconcat([SBPort23], ExePorts)> {
+ let Latency = !add(Lat, LoadLat);
+ let ResourceCycles = !listconcat([1], Res);
+ let NumMicroOps = !add(UOps, 1);
}
}
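As a hand-expanded sketch of what this multiclass produces, an instantiation such as the `defm : SBWriteResPair<WriteALU, [SBPort015], 1>;` appearing further down is roughly equivalent to writing the pair below by hand, assuming WriteALULd is the folded-load counterpart of WriteALU (the .Folded member of the pair):

def : WriteRes<WriteALU, [SBPort015]> {
  let Latency = 1;              // register form
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : WriteRes<WriteALULd, [SBPort23, SBPort015]> {
  let Latency = 6;              // 1 + default LoadLat of 5
  let ResourceCycles = [1, 1];  // extra cycle on the load ports
  let NumMicroOps = 2;          // micro-fused load adds one uop
}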
-// A folded store needs a cycle on port 4 for the store data, but it does not
-// need an extra port 2/3 cycle to recompute the address.
-def : WriteRes<WriteRMW, [SBPort4]>;
+// A folded store needs a cycle on port 4 for the store data, and an extra port
+// 2/3 cycle to recompute the address.
+def : WriteRes<WriteRMW, [SBPort23,SBPort4]>;
+
+def : WriteRes<WriteStore, [SBPort23, SBPort4]>;
+def : WriteRes<WriteStoreNT, [SBPort23, SBPort4]>;
+def : WriteRes<WriteLoad, [SBPort23]> { let Latency = 5; }
+def : WriteRes<WriteMove, [SBPort015]>;
+def : WriteRes<WriteZero, []>;
-def : WriteRes<WriteStore, [SBPort23, SBPort4]>;
-def : WriteRes<WriteLoad, [SBPort23]> { let Latency = 4; }
-def : WriteRes<WriteMove, [SBPort015]>;
-def : WriteRes<WriteZero, []>;
+defm : SBWriteResPair<WriteALU, [SBPort015], 1>;
+defm : SBWriteResPair<WriteADC, [SBPort05,SBPort015], 2, [1,1], 2>;
+defm : SBWriteResPair<WriteIMul, [SBPort1], 3>;
+defm : SBWriteResPair<WriteIMul64, [SBPort1], 3>;
+
+defm : SBWriteResPair<WriteBSWAP32,[SBPort1], 1>;
+defm : SBWriteResPair<WriteBSWAP64,[SBPort1,SBPort05], 2, [1,1], 2>;
+
+defm : SBWriteResPair<WriteDiv8, [SBPort0, SBDivider], 25, [1, 10]>;
+defm : SBWriteResPair<WriteDiv16, [SBPort0, SBDivider], 25, [1, 10]>;
+defm : SBWriteResPair<WriteDiv32, [SBPort0, SBDivider], 25, [1, 10]>;
+defm : SBWriteResPair<WriteDiv64, [SBPort0, SBDivider], 25, [1, 10]>;
+defm : SBWriteResPair<WriteIDiv8, [SBPort0, SBDivider], 25, [1, 10]>;
+defm : SBWriteResPair<WriteIDiv16, [SBPort0, SBDivider], 25, [1, 10]>;
+defm : SBWriteResPair<WriteIDiv32, [SBPort0, SBDivider], 25, [1, 10]>;
+defm : SBWriteResPair<WriteIDiv64, [SBPort0, SBDivider], 25, [1, 10]>;
-defm : SBWriteResPair<WriteALU, SBPort015, 1>;
-defm : SBWriteResPair<WriteIMul, SBPort1, 3>;
def : WriteRes<WriteIMulH, []> { let Latency = 3; }
-defm : SBWriteResPair<WriteShift, SBPort05, 1>;
-defm : SBWriteResPair<WriteJump, SBPort5, 1>;
+
+defm : SBWriteResPair<WriteShift, [SBPort05], 1>;
+defm : SBWriteResPair<WriteShiftDouble, [SBPort05], 1>;
+defm : SBWriteResPair<WriteJump, [SBPort5], 1>;
+defm : SBWriteResPair<WriteCRC32, [SBPort1], 3, [1], 1, 5>;
+
+defm : SBWriteResPair<WriteCMOV, [SBPort05,SBPort015], 2, [1,1], 2>; // Conditional move.
+defm : SBWriteResPair<WriteCMOV2, [SBPort05,SBPort015], 3, [2,1], 3>; // Conditional (CF + ZF flag) move.
+defm : X86WriteRes<WriteFCMOV, [SBPort5,SBPort05], 3, [2,1], 3>; // x87 conditional move.
+def : WriteRes<WriteSETCC, [SBPort05]>; // Setcc.
+def : WriteRes<WriteSETCCStore, [SBPort05,SBPort4,SBPort23]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+def : WriteRes<WriteLAHFSAHF, [SBPort05]>;
// This is for simple LEAs with one or two input operands.
// The complex ones can only execute on port 1, and they require two cycles on
// the port to read all inputs. We don't model that.
-def : WriteRes<WriteLEA, [SBPort15]>;
+def : WriteRes<WriteLEA, [SBPort01]>;
-// This is quite rough, latency depends on the dividend.
-def : WriteRes<WriteIDiv, [SBPort0, SBDivider]> {
- let Latency = 25;
- let ResourceCycles = [1, 10];
-}
-def : WriteRes<WriteIDivLd, [SBPort23, SBPort0, SBDivider]> {
- let Latency = 29;
- let ResourceCycles = [1, 1, 10];
-}
+// Bit counts.
+defm : SBWriteResPair<WriteBSF, [SBPort1], 3, [1], 1, 5>;
+defm : SBWriteResPair<WriteBSR, [SBPort1], 3, [1], 1, 5>;
+defm : SBWriteResPair<WriteLZCNT, [SBPort1], 3, [1], 1, 5>;
+defm : SBWriteResPair<WriteTZCNT, [SBPort1], 3, [1], 1, 5>;
+defm : SBWriteResPair<WritePOPCNT, [SBPort1], 3, [1], 1, 6>;
+
+// BMI1 BEXTR, BMI2 BZHI
+// NOTE: These don't exist on Sandy Bridge. Ports are guesses.
+defm : SBWriteResPair<WriteBEXTR, [SBPort05,SBPort1], 2, [1,1], 2>;
+defm : SBWriteResPair<WriteBZHI, [SBPort1], 1>;
// Scalar and vector floating point.
-defm : SBWriteResPair<WriteFAdd, SBPort1, 3>;
-defm : SBWriteResPair<WriteFMul, SBPort0, 5>;
-defm : SBWriteResPair<WriteFDiv, SBPort0, 24>;
-defm : SBWriteResPair<WriteFRcp, SBPort0, 5>;
-defm : SBWriteResPair<WriteFRsqrt, SBPort0, 5>;
-defm : SBWriteResPair<WriteFSqrt, SBPort0, 14>;
-defm : SBWriteResPair<WriteCvtF2I, SBPort1, 3>;
-defm : SBWriteResPair<WriteCvtI2F, SBPort1, 4>;
-defm : SBWriteResPair<WriteCvtF2F, SBPort1, 3>;
-defm : SBWriteResPair<WriteFShuffle, SBPort5, 1>;
-defm : SBWriteResPair<WriteFBlend, SBPort05, 1>;
-def : WriteRes<WriteFVarBlend, [SBPort0, SBPort5]> {
- let Latency = 2;
- let ResourceCycles = [1, 1];
-}
-def : WriteRes<WriteFVarBlendLd, [SBPort0, SBPort5, SBPort23]> {
- let Latency = 6;
- let ResourceCycles = [1, 1, 1];
-}
+defm : X86WriteRes<WriteFLD0, [SBPort5], 1, [1], 1>;
+defm : X86WriteRes<WriteFLD1, [SBPort0,SBPort5], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFLDC, [SBPort0,SBPort1], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFLoad, [SBPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteFLoadX, [SBPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteFLoadY, [SBPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteFMaskedLoad, [SBPort23,SBPort05], 8, [1,2], 3>;
+defm : X86WriteRes<WriteFMaskedLoadY, [SBPort23,SBPort05], 9, [1,2], 3>;
+defm : X86WriteRes<WriteFStore, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteFStoreX, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteFStoreY, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteFStoreNT, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteFStoreNTX, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteFStoreNTY, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteFMaskedStore, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteFMaskedStoreY, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteFMove, [SBPort5], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveX, [SBPort5], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveY, [SBPort5], 1, [1], 1>;
+defm : X86WriteRes<WriteEMMS, [SBPort015], 31, [31], 31>;
+
+defm : SBWriteResPair<WriteFAdd, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteFAddX, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteFAddY, [SBPort1], 3, [1], 1, 7>;
+defm : SBWriteResPair<WriteFAddZ, [SBPort1], 3, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFAdd64, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteFAdd64X, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteFAdd64Y, [SBPort1], 3, [1], 1, 7>;
+defm : SBWriteResPair<WriteFAdd64Z, [SBPort1], 3, [1], 1, 7>; // Unsupported = 1
+
+defm : SBWriteResPair<WriteFCmp, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteFCmpX, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteFCmpY, [SBPort1], 3, [1], 1, 7>;
+defm : SBWriteResPair<WriteFCmpZ, [SBPort1], 3, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFCmp64, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteFCmp64X, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteFCmp64Y, [SBPort1], 3, [1], 1, 7>;
+defm : SBWriteResPair<WriteFCmp64Z, [SBPort1], 3, [1], 1, 7>; // Unsupported = 1
+
+defm : SBWriteResPair<WriteFCom, [SBPort1], 3>;
+
+defm : SBWriteResPair<WriteFMul, [SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WriteFMulX, [SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WriteFMulY, [SBPort0], 5, [1], 1, 7>;
+defm : SBWriteResPair<WriteFMulZ, [SBPort0], 5, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFMul64, [SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WriteFMul64X, [SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WriteFMul64Y, [SBPort0], 5, [1], 1, 7>;
+defm : SBWriteResPair<WriteFMul64Z, [SBPort0], 5, [1], 1, 7>; // Unsupported = 1
+
+defm : SBWriteResPair<WriteFDiv, [SBPort0,SBFPDivider], 14, [1,14], 1, 6>;
+defm : SBWriteResPair<WriteFDivX, [SBPort0,SBFPDivider], 14, [1,14], 1, 6>;
+defm : SBWriteResPair<WriteFDivY, [SBPort0,SBPort05,SBFPDivider], 29, [2,1,28], 3, 7>;
+defm : SBWriteResPair<WriteFDivZ, [SBPort0,SBPort05,SBFPDivider], 29, [2,1,28], 3, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFDiv64, [SBPort0,SBFPDivider], 22, [1,22], 1, 6>;
+defm : SBWriteResPair<WriteFDiv64X, [SBPort0,SBFPDivider], 22, [1,22], 1, 6>;
+defm : SBWriteResPair<WriteFDiv64Y, [SBPort0,SBPort05,SBFPDivider], 45, [2,1,44], 3, 7>;
+defm : SBWriteResPair<WriteFDiv64Z, [SBPort0,SBPort05,SBFPDivider], 45, [2,1,44], 3, 7>; // Unsupported = 1
+
+defm : SBWriteResPair<WriteFRcp, [SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WriteFRcpX, [SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WriteFRcpY, [SBPort0,SBPort05], 7, [2,1], 3, 7>;
+defm : SBWriteResPair<WriteFRcpZ, [SBPort0,SBPort05], 7, [2,1], 3, 7>; // Unsupported = 1
+
+defm : SBWriteResPair<WriteFRsqrt, [SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WriteFRsqrtX,[SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WriteFRsqrtY,[SBPort0,SBPort05], 7, [2,1], 3, 7>;
+defm : SBWriteResPair<WriteFRsqrtZ,[SBPort0,SBPort05], 7, [2,1], 3, 7>; // Unsupported = 1
+
+defm : SBWriteResPair<WriteFSqrt, [SBPort0,SBFPDivider], 14, [1,14], 1, 6>;
+defm : SBWriteResPair<WriteFSqrtX, [SBPort0,SBFPDivider], 14, [1,14], 1, 6>;
+defm : SBWriteResPair<WriteFSqrtY, [SBPort0,SBPort05,SBFPDivider], 29, [2,1,28], 3, 7>;
+defm : SBWriteResPair<WriteFSqrtZ, [SBPort0,SBPort05,SBFPDivider], 29, [2,1,28], 3, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFSqrt64, [SBPort0,SBFPDivider], 21, [1,21], 1, 6>;
+defm : SBWriteResPair<WriteFSqrt64X, [SBPort0,SBFPDivider], 21, [1,21], 1, 6>;
+defm : SBWriteResPair<WriteFSqrt64Y, [SBPort0,SBPort05,SBFPDivider], 45, [2,1,44], 3, 7>;
+defm : SBWriteResPair<WriteFSqrt64Z, [SBPort0,SBPort05,SBFPDivider], 45, [2,1,44], 3, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFSqrt80, [SBPort0,SBFPDivider], 24, [1,24], 1, 6>;
+
+defm : SBWriteResPair<WriteDPPD, [SBPort0,SBPort1,SBPort5], 9, [1,1,1], 3, 6>;
+defm : SBWriteResPair<WriteDPPS, [SBPort0,SBPort1,SBPort5], 12, [1,2,1], 4, 6>;
+defm : SBWriteResPair<WriteDPPSY, [SBPort0,SBPort1,SBPort5], 12, [1,2,1], 4, 7>;
+defm : SBWriteResPair<WriteDPPSZ, [SBPort0,SBPort1,SBPort5], 12, [1,2,1], 4, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFSign, [SBPort5], 1>;
+defm : SBWriteResPair<WriteFRnd, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteFRndY, [SBPort1], 3, [1], 1, 7>;
+defm : SBWriteResPair<WriteFRndZ, [SBPort1], 3, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFLogic, [SBPort5], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteFLogicY, [SBPort5], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteFLogicZ, [SBPort5], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFTest, [SBPort0], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteFTestY, [SBPort0], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteFTestZ, [SBPort0], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFShuffle, [SBPort5], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteFShuffleY,[SBPort5], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteFShuffleZ,[SBPort5], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFVarShuffle, [SBPort5], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteFVarShuffleY,[SBPort5], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteFVarShuffleZ,[SBPort5], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFBlend, [SBPort05], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteFBlendY, [SBPort05], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteFBlendZ, [SBPort05], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFVarBlend, [SBPort05], 2, [2], 2, 6>;
+defm : SBWriteResPair<WriteFVarBlendY,[SBPort05], 2, [2], 2, 7>;
+defm : SBWriteResPair<WriteFVarBlendZ,[SBPort05], 2, [2], 2, 7>; // Unsupported = 1
+
+// Conversion between integer and float.
+defm : SBWriteResPair<WriteCvtSS2I, [SBPort0,SBPort1], 5, [1,1], 2>;
+defm : SBWriteResPair<WriteCvtPS2I, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteCvtPS2IY, [SBPort1], 3, [1], 1, 7>;
+defm : SBWriteResPair<WriteCvtPS2IZ, [SBPort1], 3, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteCvtSD2I, [SBPort0,SBPort1], 5, [1,1], 2>;
+defm : SBWriteResPair<WriteCvtPD2I, [SBPort1,SBPort5], 4, [1,1], 2, 6>;
+defm : X86WriteRes<WriteCvtPD2IY, [SBPort1,SBPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPD2IZ, [SBPort1,SBPort5], 4, [1,1], 2>; // Unsupported = 1
+defm : X86WriteRes<WriteCvtPD2IYLd, [SBPort1,SBPort5,SBPort23], 11, [1,1,1], 3>;
+defm : X86WriteRes<WriteCvtPD2IZLd, [SBPort1,SBPort5,SBPort23], 11, [1,1,1], 3>; // Unsupported = 1
+
+defm : X86WriteRes<WriteCvtI2SS, [SBPort1,SBPort5], 5, [1,2], 3>;
+defm : X86WriteRes<WriteCvtI2SSLd, [SBPort1,SBPort5,SBPort23], 10, [1,1,1], 3>;
+defm : SBWriteResPair<WriteCvtI2PS, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteCvtI2PSY, [SBPort1], 3, [1], 1, 7>;
+defm : SBWriteResPair<WriteCvtI2PSZ, [SBPort1], 3, [1], 1, 7>; // Unsupported = 1
+defm : X86WriteRes<WriteCvtI2SD, [SBPort1,SBPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2PD, [SBPort1,SBPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2PDY, [SBPort1,SBPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2PDZ, [SBPort1,SBPort5], 4, [1,1], 2>; // Unsupported = 1
+defm : X86WriteRes<WriteCvtI2SDLd, [SBPort1,SBPort23], 9, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2PDLd, [SBPort1,SBPort5,SBPort23], 10, [1,1,1], 3>;
+defm : X86WriteRes<WriteCvtI2PDYLd, [SBPort1,SBPort5,SBPort23], 10, [1,1,1], 3>;
+defm : X86WriteRes<WriteCvtI2PDZLd, [SBPort1,SBPort5,SBPort23], 10, [1,1,1], 3>; // Unsupported = 1
+
+defm : SBWriteResPair<WriteCvtSS2SD, [SBPort0], 1, [1], 1, 6>;
+defm : X86WriteRes<WriteCvtPS2PD, [SBPort0,SBPort5], 2, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PDY, [SBPort0,SBPort5], 2, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PDZ, [SBPort0,SBPort5], 2, [1,1], 2>; // Unsupported = 1
+defm : X86WriteRes<WriteCvtPS2PDLd, [SBPort0,SBPort23], 7, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PDYLd, [SBPort0,SBPort23], 7, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PDZLd, [SBPort0,SBPort23], 7, [1,1], 2>; // Unsupported = 1
+defm : SBWriteResPair<WriteCvtSD2SS, [SBPort1,SBPort5], 4, [1,1], 2, 6>;
+defm : SBWriteResPair<WriteCvtPD2PS, [SBPort1,SBPort5], 4, [1,1], 2, 6>;
+defm : SBWriteResPair<WriteCvtPD2PSY, [SBPort1,SBPort5], 4, [1,1], 2, 7>;
+defm : SBWriteResPair<WriteCvtPD2PSZ, [SBPort1,SBPort5], 4, [1,1], 2, 7>; // Unsupported = 1
+
+defm : SBWriteResPair<WriteCvtPH2PS, [SBPort1], 3>;
+defm : SBWriteResPair<WriteCvtPH2PSY, [SBPort1], 3>;
+defm : SBWriteResPair<WriteCvtPH2PSZ, [SBPort1], 3>; // Unsupported = 1
+
+defm : X86WriteRes<WriteCvtPS2PH, [SBPort1], 3, [1], 1>;
+defm : X86WriteRes<WriteCvtPS2PHY, [SBPort1], 3, [1], 1>;
+defm : X86WriteRes<WriteCvtPS2PHZ, [SBPort1], 3, [1], 1>; // Unsupported = 1
+defm : X86WriteRes<WriteCvtPS2PHSt, [SBPort1, SBPort23, SBPort4], 4, [1,1,1], 1>;
+defm : X86WriteRes<WriteCvtPS2PHYSt, [SBPort1, SBPort23, SBPort4], 4, [1,1,1], 1>;
+defm : X86WriteRes<WriteCvtPS2PHZSt, [SBPort1, SBPort23, SBPort4], 4, [1,1,1], 1>; // Unsupported = 1
// Vector integer operations.
-defm : SBWriteResPair<WriteVecShift, SBPort5, 1>;
-defm : SBWriteResPair<WriteVecLogic, SBPort5, 1>;
-defm : SBWriteResPair<WriteVecALU, SBPort1, 3>;
-defm : SBWriteResPair<WriteVecIMul, SBPort0, 5>;
-defm : SBWriteResPair<WriteShuffle, SBPort5, 1>;
-defm : SBWriteResPair<WriteBlend, SBPort15, 1>;
-def : WriteRes<WriteVarBlend, [SBPort1, SBPort5]> {
+defm : X86WriteRes<WriteVecLoad, [SBPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteVecLoadX, [SBPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecLoadY, [SBPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNT, [SBPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNTY, [SBPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteVecMaskedLoad, [SBPort23,SBPort05], 8, [1,2], 3>;
+defm : X86WriteRes<WriteVecMaskedLoadY, [SBPort23,SBPort05], 9, [1,2], 3>;
+defm : X86WriteRes<WriteVecStore, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteVecStoreX, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteVecStoreY, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteVecStoreNT, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteVecStoreNTY, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteVecMaskedStore, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteVecMaskedStoreY, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteVecMove, [SBPort05], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveX, [SBPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveY, [SBPort05], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveToGpr, [SBPort0], 2, [1], 1>;
+defm : X86WriteRes<WriteVecMoveFromGpr, [SBPort5], 1, [1], 1>;
+
+defm : SBWriteResPair<WriteVecLogic, [SBPort015], 1, [1], 1, 5>;
+defm : SBWriteResPair<WriteVecLogicX,[SBPort015], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteVecLogicY,[SBPort015], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteVecLogicZ,[SBPort015], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteVecTest, [SBPort0,SBPort5], 2, [1,1], 2, 6>;
+defm : SBWriteResPair<WriteVecTestY, [SBPort0,SBPort5], 2, [1,1], 2, 7>;
+defm : SBWriteResPair<WriteVecTestZ, [SBPort0,SBPort5], 2, [1,1], 2, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteVecALU, [SBPort1], 3, [1], 1, 5>;
+defm : SBWriteResPair<WriteVecALUX, [SBPort15], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteVecALUY, [SBPort15], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteVecALUZ, [SBPort15], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteVecIMul, [SBPort0], 5, [1], 1, 5>;
+defm : SBWriteResPair<WriteVecIMulX, [SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WriteVecIMulY, [SBPort0], 5, [1], 1, 7>;
+defm : SBWriteResPair<WriteVecIMulZ, [SBPort0], 5, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WritePMULLD, [SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WritePMULLDY, [SBPort0], 5, [1], 1, 7>; // TODO this is probably wrong for 256/512-bit for the "generic" model
+defm : SBWriteResPair<WritePMULLDZ, [SBPort0], 5, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteShuffle, [SBPort5], 1, [1], 1, 5>;
+defm : SBWriteResPair<WriteShuffleX, [SBPort15], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteShuffleY, [SBPort5], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteShuffleZ, [SBPort5], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteVarShuffle, [SBPort15], 1, [1], 1, 5>;
+defm : SBWriteResPair<WriteVarShuffleX, [SBPort15], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteVarShuffleY, [SBPort15], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteVarShuffleZ, [SBPort15], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteBlend, [SBPort15], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteBlendY, [SBPort15], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteBlendZ, [SBPort15], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteVarBlend, [SBPort15], 2, [2], 2, 6>;
+defm : SBWriteResPair<WriteVarBlendY,[SBPort15], 2, [2], 2, 7>;
+defm : SBWriteResPair<WriteVarBlendZ,[SBPort15], 2, [2], 2, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteMPSAD, [SBPort0, SBPort15], 7, [1,2], 3, 6>;
+defm : SBWriteResPair<WriteMPSADY, [SBPort0, SBPort15], 7, [1,2], 3, 7>;
+defm : SBWriteResPair<WriteMPSADZ, [SBPort0, SBPort15], 7, [1,2], 3, 7>; // Unsupported = 1
+defm : SBWriteResPair<WritePSADBW, [SBPort0], 5, [1], 1, 5>;
+defm : SBWriteResPair<WritePSADBWX, [SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WritePSADBWY, [SBPort0], 5, [1], 1, 7>;
+defm : SBWriteResPair<WritePSADBWZ, [SBPort0], 5, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WritePHMINPOS, [SBPort0], 5, [1], 1, 6>;
+
+// Vector integer shifts.
+defm : SBWriteResPair<WriteVecShift, [SBPort5], 1, [1], 1, 5>;
+defm : SBWriteResPair<WriteVecShiftX, [SBPort0,SBPort15], 2, [1,1], 2, 6>;
+defm : SBWriteResPair<WriteVecShiftY, [SBPort0,SBPort15], 4, [1,1], 2, 7>;
+defm : SBWriteResPair<WriteVecShiftZ, [SBPort0,SBPort15], 4, [1,1], 2, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteVecShiftImm, [SBPort5], 1, [1], 1, 5>;
+defm : SBWriteResPair<WriteVecShiftImmX, [SBPort0], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteVecShiftImmY, [SBPort0], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteVecShiftImmZ, [SBPort0], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteVarVecShift, [SBPort0], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteVarVecShiftY, [SBPort0], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteVarVecShiftZ, [SBPort0], 1, [1], 1, 7>; // Unsupported = 1
+
+// Vector insert/extract operations.
+def : WriteRes<WriteVecInsert, [SBPort5,SBPort15]> {
let Latency = 2;
- let ResourceCycles = [1, 1];
+ let NumMicroOps = 2;
}
-def : WriteRes<WriteVarBlendLd, [SBPort1, SBPort5, SBPort23]> {
- let Latency = 6;
- let ResourceCycles = [1, 1, 1];
+def : WriteRes<WriteVecInsertLd, [SBPort23,SBPort15]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
}
-def : WriteRes<WriteMPSAD, [SBPort0,SBPort15]> {
+
+def : WriteRes<WriteVecExtract, [SBPort0,SBPort15]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def : WriteRes<WriteVecExtractSt, [SBPort4,SBPort23,SBPort15]> {
let Latency = 5;
let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def : WriteRes<WriteMPSADLd, [SBPort0,SBPort23,SBPort15]> {
- let Latency = 11;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,2];
}
////////////////////////////////////////////////////////////////////////////////
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////
-// HADD, HSUB PS/PD
-// x,x / v,v,v.
-def : WriteRes<WriteFHAdd, [SBPort1]> {
- let Latency = 3;
-}
-
-// x,m / v,v,m.
-def : WriteRes<WriteFHAddLd, [SBPort1, SBPort23]> {
- let Latency = 7;
- let ResourceCycles = [1, 1];
-}
-// PHADD|PHSUB (S) W/D.
-// v <- v,v.
-def : WriteRes<WritePHAdd, [SBPort15]>;
-
-// v <- v,m.
-def : WriteRes<WritePHAddLd, [SBPort15, SBPort23]> {
- let Latency = 5;
- let ResourceCycles = [1, 1];
-}
+defm : SBWriteResPair<WriteFHAdd, [SBPort1,SBPort5], 5, [1,2], 3, 6>;
+defm : SBWriteResPair<WriteFHAddY, [SBPort1,SBPort5], 5, [1,2], 3, 7>;
+defm : SBWriteResPair<WriteFHAddZ, [SBPort1,SBPort5], 5, [1,2], 3, 7>; // Unsupported = 1
+defm : SBWriteResPair<WritePHAdd, [SBPort15], 3, [3], 3, 5>;
+defm : SBWriteResPair<WritePHAddX, [SBPort15], 3, [3], 3, 6>;
+defm : SBWriteResPair<WritePHAddY, [SBPort15], 3, [3], 3, 7>;
+defm : SBWriteResPair<WritePHAddZ, [SBPort15], 3, [3], 3, 7>; // Unsupported = 1
+////////////////////////////////////////////////////////////////////////////////
// String instructions.
+////////////////////////////////////////////////////////////////////////////////
+
// Packed Compare Implicit Length Strings, Return Mask
-def : WriteRes<WritePCmpIStrM, [SBPort015]> {
+def : WriteRes<WritePCmpIStrM, [SBPort0]> {
let Latency = 11;
+ let NumMicroOps = 3;
let ResourceCycles = [3];
}
-def : WriteRes<WritePCmpIStrMLd, [SBPort015, SBPort23]> {
- let Latency = 11;
- let ResourceCycles = [3, 1];
+def : WriteRes<WritePCmpIStrMLd, [SBPort0, SBPort23]> {
+ let Latency = 17;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
}
// Packed Compare Explicit Length Strings, Return Mask
@@ -228,6 +467,12 @@ def : WriteRes<WritePCmpEStrILd, [SBPort015, SBPort23]> {
let ResourceCycles = [7, 1];
}
+// MOVMSK Instructions.
+def : WriteRes<WriteFMOVMSK, [SBPort0]> { let Latency = 2; }
+def : WriteRes<WriteVecMOVMSK, [SBPort0]> { let Latency = 2; }
+def : WriteRes<WriteVecMOVMSKY, [SBPort0]> { let Latency = 2; }
+def : WriteRes<WriteMMXMOVMSK, [SBPort0]> { let Latency = 1; }
+
// AES Instructions.
def : WriteRes<WriteAESDecEnc, [SBPort5,SBPort015]> {
let Latency = 7;
@@ -270,6 +515,10 @@ def : WriteRes<WriteCLMulLd, [SBPort015, SBPort23]> {
let ResourceCycles = [17, 1];
}
+// Load/store MXCSR.
+// FIXME: This is probably wrong. Only STMXCSR should require Port4.
+def : WriteRes<WriteLDMXCSR, [SBPort0,SBPort4,SBPort5,SBPort23]> { let Latency = 5; let NumMicroOps = 4; let ResourceCycles = [1,1,1,1]; }
+def : WriteRes<WriteSTMXCSR, [SBPort0,SBPort4,SBPort5,SBPort23]> { let Latency = 5; let NumMicroOps = 4; let ResourceCycles = [1,1,1,1]; }
def : WriteRes<WriteSystem, [SBPort015]> { let Latency = 100; }
def : WriteRes<WriteMicrocoded, [SBPort015]> { let Latency = 100; }
@@ -278,624 +527,107 @@ def : WriteRes<WriteNop, []>;
// AVX2/FMA is not supported on that architecture, but we should define the basic
// scheduling resources anyway.
-defm : SBWriteResPair<WriteFShuffle256, SBPort0, 1>;
-defm : SBWriteResPair<WriteShuffle256, SBPort0, 1>;
-defm : SBWriteResPair<WriteVarVecShift, SBPort0, 1>;
-defm : SBWriteResPair<WriteFMA, SBPort01, 5>;
+defm : SBWriteResPair<WriteFShuffle256, [SBPort5], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteFVarShuffle256, [SBPort5], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteShuffle256, [SBPort5], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteVarShuffle256, [SBPort5], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteFMA, [SBPort01], 5>;
+defm : SBWriteResPair<WriteFMAX, [SBPort01], 5>;
+defm : SBWriteResPair<WriteFMAY, [SBPort01], 5>;
+defm : SBWriteResPair<WriteFMAZ, [SBPort01], 5>; // Unsupported = 1
// Remaining SNB instrs.
-def SBWriteResGroup0 : SchedWriteRes<[SBPort0]> {
- let Latency = 1;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SBWriteResGroup0], (instregex "CVTSS2SDrr")>;
-def: InstRW<[SBWriteResGroup0], (instregex "PSLLDri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "PSLLQri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "PSLLWri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "PSRADri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "PSRAWri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "PSRLDri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "PSRLQri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "PSRLWri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "VCVTSS2SDrr")>;
-def: InstRW<[SBWriteResGroup0], (instregex "VPMOVMSKBrr")>;
-def: InstRW<[SBWriteResGroup0], (instregex "VPSLLDri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "VPSLLQri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "VPSLLWri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "VPSRADri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "VPSRAWri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "VPSRLDri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "VPSRLQri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "VPSRLWri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "VTESTPDYrr")>;
-def: InstRW<[SBWriteResGroup0], (instregex "VTESTPDrr")>;
-def: InstRW<[SBWriteResGroup0], (instregex "VTESTPSYrr")>;
-def: InstRW<[SBWriteResGroup0], (instregex "VTESTPSrr")>;
-
def SBWriteResGroup1 : SchedWriteRes<[SBPort1]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SBWriteResGroup1], (instregex "COMP_FST0r")>;
-def: InstRW<[SBWriteResGroup1], (instregex "COM_FST0r")>;
-def: InstRW<[SBWriteResGroup1], (instregex "UCOM_FPr")>;
-def: InstRW<[SBWriteResGroup1], (instregex "UCOM_Fr")>;
+def: InstRW<[SBWriteResGroup1], (instrs COMP_FST0r,
+ COM_FST0r,
+ UCOM_FPr,
+ UCOM_Fr)>;
def SBWriteResGroup2 : SchedWriteRes<[SBPort5]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SBWriteResGroup2], (instregex "ANDNPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "ANDNPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "ANDPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "ANDPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "FDECSTP")>;
-def: InstRW<[SBWriteResGroup2], (instregex "FFREE")>;
-def: InstRW<[SBWriteResGroup2], (instregex "FINCSTP")>;
-def: InstRW<[SBWriteResGroup2], (instregex "FNOP")>;
-def: InstRW<[SBWriteResGroup2], (instregex "INSERTPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JAE_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JAE_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JA_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JA_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JBE_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JBE_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JB_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JB_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JE_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JE_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JGE_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JGE_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JG_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JG_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JLE_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JLE_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JL_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JL_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JMP64r")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JMP_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JMP_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JNE_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JNE_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JNO_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JNO_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JNP_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JNP_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JNS_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JNS_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JO_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JO_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JP_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JP_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JS_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JS_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "LD_Frr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "LOOP")>;
-def: InstRW<[SBWriteResGroup2], (instregex "LOOPE")>;
-def: InstRW<[SBWriteResGroup2], (instregex "LOOPNE")>;
-def: InstRW<[SBWriteResGroup2], (instregex "MOV64toPQIrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "MOVAPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "MOVAPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "MOVDDUPrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "MOVDI2PDIrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "MOVHLPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "MOVLHPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "MOVSDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "MOVSHDUPrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "MOVSLDUPrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "MOVSSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "MOVUPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "MOVUPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "ORPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "ORPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "RETQ")>;
-def: InstRW<[SBWriteResGroup2], (instregex "SHUFPDrri")>;
-def: InstRW<[SBWriteResGroup2], (instregex "SHUFPSrri")>;
-def: InstRW<[SBWriteResGroup2], (instregex "ST_FPrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "ST_Frr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "UNPCKHPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "UNPCKHPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "UNPCKLPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "UNPCKLPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VANDNPDYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VANDNPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VANDNPSYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VANDNPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VANDPDYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VANDPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VANDPSYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VANDPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VEXTRACTF128rr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VINSERTF128rr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VINSERTPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOV64toPQIrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVAPDYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVAPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVAPSYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVAPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVDDUPYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVDDUPrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVDI2PDIrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVHLPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVHLPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVSDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVSHDUPYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVSHDUPrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVSLDUPYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVSLDUPrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVSSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVUPDYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVUPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVUPSYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVUPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VORPDYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VORPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VORPSYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VORPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VPERM2F128rr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPDYri")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPDYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPDri")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPSYri")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPSYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPSri")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VSHUFPDYrri")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VSHUFPDrri")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VSHUFPSYrri")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VSHUFPSrri")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKHPDYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKHPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKHPSYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKHPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKLPDYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKLPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKLPSYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKLPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VXORPDYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VXORPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VXORPSYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VXORPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "XORPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "XORPSrr")>;
-
-def SBWriteResGroup3 : SchedWriteRes<[SBPort01]> {
- let Latency = 1;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SBWriteResGroup3], (instregex "LEA(16|32|64)(_32)?r")>;
+def: InstRW<[SBWriteResGroup2], (instrs FDECSTP, FINCSTP, FFREE, FFREEP, FNOP,
+ LD_Frr, ST_Frr, ST_FPrr)>;
+def: InstRW<[SBWriteResGroup2], (instrs LOOP, LOOPE, LOOPNE)>; // FIXME: This seems wrong compared to other Intel CPUs.
+def: InstRW<[SBWriteResGroup2], (instrs RETQ)>;
def SBWriteResGroup4 : SchedWriteRes<[SBPort05]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SBWriteResGroup4], (instregex "BLENDPDrri")>;
-def: InstRW<[SBWriteResGroup4], (instregex "BLENDPSrri")>;
-def: InstRW<[SBWriteResGroup4], (instregex "BT(16|32|64)ri8")>;
-def: InstRW<[SBWriteResGroup4], (instregex "BT(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "BTC(16|32|64)ri8")>;
-def: InstRW<[SBWriteResGroup4], (instregex "BTC(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "BTR(16|32|64)ri8")>;
-def: InstRW<[SBWriteResGroup4], (instregex "BTR(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "BTS(16|32|64)ri8")>;
-def: InstRW<[SBWriteResGroup4], (instregex "BTS(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "CDQ")>;
-def: InstRW<[SBWriteResGroup4], (instregex "CQO")>;
-def: InstRW<[SBWriteResGroup4], (instregex "LAHF")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SAHF")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SAR(16|32|64)ri")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SAR8ri")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SETAEr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SETBr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SETEr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SETGEr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SETGr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SETLEr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SETLr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SETNEr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SETNOr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SETNPr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SETNSr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SETOr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SETPr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SETSr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SHL(16|32|64)ri")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SHL(16|32|64)r1")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SHL8r1")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SHL8ri")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SHR(16|32|64)ri")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SHR8ri")>;
-def: InstRW<[SBWriteResGroup4], (instregex "VBLENDPDYrri")>;
-def: InstRW<[SBWriteResGroup4], (instregex "VBLENDPDrri")>;
-def: InstRW<[SBWriteResGroup4], (instregex "VBLENDPSYrri")>;
-def: InstRW<[SBWriteResGroup4], (instregex "VBLENDPSrri")>;
-def: InstRW<[SBWriteResGroup4], (instregex "VMOVDQAYrr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "VMOVDQArr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "VMOVDQUYrr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "VMOVDQUrr")>;
+def: InstRW<[SBWriteResGroup4], (instrs CDQ, CQO)>;
+def: InstRW<[SBWriteResGroup4], (instregex "BT(16|32|64)ri8",
+ "BT(16|32|64)rr",
+ "BTC(16|32|64)ri8",
+ "BTC(16|32|64)rr",
+ "BTR(16|32|64)ri8",
+ "BTR(16|32|64)rr",
+ "BTS(16|32|64)ri8",
+ "BTS(16|32|64)rr")>;
def SBWriteResGroup5 : SchedWriteRes<[SBPort15]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SBWriteResGroup5], (instregex "MMX_PABSBrr64")>;
-def: InstRW<[SBWriteResGroup5], (instregex "MMX_PABSDrr64")>;
-def: InstRW<[SBWriteResGroup5], (instregex "MMX_PABSWrr64")>;
-def: InstRW<[SBWriteResGroup5], (instregex "MMX_PADDQirr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "MMX_PALIGNR64irr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "MMX_PSHUFBrr64")>;
-def: InstRW<[SBWriteResGroup5], (instregex "MMX_PSIGNBrr64")>;
-def: InstRW<[SBWriteResGroup5], (instregex "MMX_PSIGNDrr64")>;
-def: InstRW<[SBWriteResGroup5], (instregex "MMX_PSIGNWrr64")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PABSBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PABSDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PABSWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PACKSSDWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PACKSSWBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PACKUSDWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PACKUSWBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PADDBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PADDDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PADDQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PADDSBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PADDSWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PADDUSBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PADDUSWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PADDWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PALIGNRrri")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PAVGBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PAVGWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PBLENDWrri")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PCMPEQBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PCMPEQDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PCMPEQQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PCMPEQWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PCMPGTBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PCMPGTDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PCMPGTWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMAXSBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMAXSDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMAXSWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMAXUBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMAXUDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMAXUWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMINSBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMINSDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMINSWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMINUBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMINUDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMINUWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMOVSXBDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMOVSXBQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMOVSXBWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMOVSXDQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMOVSXWDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMOVSXWQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMOVZXBDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMOVZXBQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMOVZXBWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMOVZXDQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMOVZXWDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMOVZXWQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSHUFBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSHUFDri")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSHUFHWri")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSHUFLWri")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSIGNBrr128")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSIGNDrr128")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSIGNWrr128")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSLLDQri")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSRLDQri")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSUBBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSUBDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSUBQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSUBSBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSUBSWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSUBUSBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSUBUSWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSUBWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKHBWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKHDQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKHQDQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKHWDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKLBWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKLDQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKLQDQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKLWDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPABSBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPABSDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPABSWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPACKSSDWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPACKSSWBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPACKUSDWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPACKUSWBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPADDBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPADDDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPADDQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPADDSBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPADDSWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPADDUSBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPADDUSWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPADDWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPALIGNRrri")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPAVGBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPAVGWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPBLENDWrri")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPCMPEQBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPCMPEQDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPCMPEQQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPCMPEQWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPCMPGTBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPCMPGTDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPCMPGTWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMAXSBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMAXSDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMAXSWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMAXUBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMAXUDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMAXUWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMINSBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMINSDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMINSWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMINUBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMINUDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMINUWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMOVSXBDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMOVSXBQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMOVSXBWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMOVSXDQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMOVSXWDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMOVSXWQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMOVZXBDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMOVZXBQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMOVZXBWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMOVZXDQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMOVZXWDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMOVZXWQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSHUFBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSHUFDri")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSHUFHWri")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSHUFLWri")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSIGNBrr128")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSIGNDrr128")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSIGNWrr128")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSLLDQri")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSRLDQri")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSUBBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSUBDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSUBQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSUBSBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSUBSWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSUBUSBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSUBUSWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSUBWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKHBWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKHDQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKHQDQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKHWDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKLBWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKLDQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKLQDQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKLWDrr")>;
-
-def SBWriteResGroup6 : SchedWriteRes<[SBPort015]> {
- let Latency = 1;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SBWriteResGroup6], (instregex "ADD(16|32|64)ri")>;
-def: InstRW<[SBWriteResGroup6], (instregex "ADD(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "ADD8i8")>;
-def: InstRW<[SBWriteResGroup6], (instregex "ADD8ri")>;
-def: InstRW<[SBWriteResGroup6], (instregex "ADD8rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "AND(16|32|64)ri")>;
-def: InstRW<[SBWriteResGroup6], (instregex "AND(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "AND8i8")>;
-def: InstRW<[SBWriteResGroup6], (instregex "AND8ri")>;
-def: InstRW<[SBWriteResGroup6], (instregex "AND8rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "CBW")>;
-def: InstRW<[SBWriteResGroup6], (instregex "CMC")>;
-def: InstRW<[SBWriteResGroup6], (instregex "CMP(16|32|64)ri")>;
-def: InstRW<[SBWriteResGroup6], (instregex "CMP(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "CMP8i8")>;
-def: InstRW<[SBWriteResGroup6], (instregex "CMP8ri")>;
-def: InstRW<[SBWriteResGroup6], (instregex "CMP8rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "CWDE")>;
-def: InstRW<[SBWriteResGroup6], (instregex "DEC(16|32|64)r")>;
-def: InstRW<[SBWriteResGroup6], (instregex "DEC8r")>;
-def: InstRW<[SBWriteResGroup6], (instregex "INC(16|32|64)r")>;
-def: InstRW<[SBWriteResGroup6], (instregex "INC8r")>;
-def: InstRW<[SBWriteResGroup6], (instregex "MMX_MOVD64from64rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "MMX_MOVQ2DQrr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "MOV(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "MOV8ri")>;
-def: InstRW<[SBWriteResGroup6], (instregex "MOV8rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "MOVDQArr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "MOVDQUrr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "MOVPQI2QIrr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "MOVSX(16|32|64)rr16")>;
-def: InstRW<[SBWriteResGroup6], (instregex "MOVSX(16|32|64)rr32")>;
-def: InstRW<[SBWriteResGroup6], (instregex "MOVSX(16|32|64)rr8")>;
-def: InstRW<[SBWriteResGroup6], (instregex "MOVZX(16|32|64)rr16")>;
-def: InstRW<[SBWriteResGroup6], (instregex "MOVZX(16|32|64)rr8")>;
-def: InstRW<[SBWriteResGroup6], (instregex "NEG(16|32|64)r")>;
-def: InstRW<[SBWriteResGroup6], (instregex "NEG8r")>;
-def: InstRW<[SBWriteResGroup6], (instregex "NOT(16|32|64)r")>;
-def: InstRW<[SBWriteResGroup6], (instregex "NOT8r")>;
-def: InstRW<[SBWriteResGroup6], (instregex "OR(16|32|64)ri")>;
-def: InstRW<[SBWriteResGroup6], (instregex "OR(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "OR8i8")>;
-def: InstRW<[SBWriteResGroup6], (instregex "OR8ri")>;
-def: InstRW<[SBWriteResGroup6], (instregex "OR8rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "PANDNrr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "PANDrr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "PORrr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "PXORrr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "STC")>;
-def: InstRW<[SBWriteResGroup6], (instregex "SUB(16|32|64)ri")>;
-def: InstRW<[SBWriteResGroup6], (instregex "SUB(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "SUB8i8")>;
-def: InstRW<[SBWriteResGroup6], (instregex "SUB8ri")>;
-def: InstRW<[SBWriteResGroup6], (instregex "SUB8rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "TEST(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "TEST8i8")>;
-def: InstRW<[SBWriteResGroup6], (instregex "TEST8ri")>;
-def: InstRW<[SBWriteResGroup6], (instregex "TEST8rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "VMOVPQI2QIrr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "VMOVZPQILo2PQIrr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "VPANDNrr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "VPANDrr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "VPORrr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "VPXORrr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "XOR(16|32|64)ri")>;
-def: InstRW<[SBWriteResGroup6], (instregex "XOR(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "XOR8i8")>;
-def: InstRW<[SBWriteResGroup6], (instregex "XOR8ri")>;
-def: InstRW<[SBWriteResGroup6], (instregex "XOR8rr")>;
-
-def SBWriteResGroup7 : SchedWriteRes<[SBPort0]> {
- let Latency = 2;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SBWriteResGroup7], (instregex "MOVMSKPDrr")>;
-def: InstRW<[SBWriteResGroup7], (instregex "MOVMSKPSrr")>;
-def: InstRW<[SBWriteResGroup7], (instregex "MOVPDI2DIrr")>;
-def: InstRW<[SBWriteResGroup7], (instregex "MOVPQIto64rr")>;
-def: InstRW<[SBWriteResGroup7], (instregex "PMOVMSKBrr")>;
-def: InstRW<[SBWriteResGroup7], (instregex "VMOVMSKPDYrr")>;
-def: InstRW<[SBWriteResGroup7], (instregex "VMOVMSKPDrr")>;
-def: InstRW<[SBWriteResGroup7], (instregex "VMOVMSKPSYrr")>;
-def: InstRW<[SBWriteResGroup7], (instregex "VMOVMSKPSrr")>;
-def: InstRW<[SBWriteResGroup7], (instregex "VMOVPDI2DIrr")>;
-def: InstRW<[SBWriteResGroup7], (instregex "VMOVPQIto64rr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "MMX_PABS(B|D|W)rr",
+ "MMX_PADDQirr",
+ "MMX_PALIGNRrri",
+ "MMX_PSIGN(B|D|W)rr")>;
def SBWriteResGroup9 : SchedWriteRes<[SBPort05]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def: InstRW<[SBWriteResGroup9], (instregex "BLENDVPDrr0")>;
-def: InstRW<[SBWriteResGroup9], (instregex "BLENDVPSrr0")>;
-def: InstRW<[SBWriteResGroup9], (instregex "ROL(16|32|64)ri")>;
-def: InstRW<[SBWriteResGroup9], (instregex "ROL8ri")>;
-def: InstRW<[SBWriteResGroup9], (instregex "ROR(16|32|64)ri")>;
-def: InstRW<[SBWriteResGroup9], (instregex "ROR8ri")>;
-def: InstRW<[SBWriteResGroup9], (instregex "SETAr")>;
-def: InstRW<[SBWriteResGroup9], (instregex "SETBEr")>;
-def: InstRW<[SBWriteResGroup9], (instregex "VBLENDVPDYrr")>;
-def: InstRW<[SBWriteResGroup9], (instregex "VBLENDVPDrr")>;
-def: InstRW<[SBWriteResGroup9], (instregex "VBLENDVPSYrr")>;
-def: InstRW<[SBWriteResGroup9], (instregex "VBLENDVPSrr")>;
-
-def SBWriteResGroup10 : SchedWriteRes<[SBPort15]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[SBWriteResGroup10], (instregex "VPBLENDVBrr")>;
+def: InstRW<[SBWriteResGroup9], (instregex "ROL(8|16|32|64)r1",
+ "ROL(8|16|32|64)ri",
+ "ROR(8|16|32|64)r1",
+ "ROR(8|16|32|64)ri",
+ "SET(A|BE)r")>;
def SBWriteResGroup11 : SchedWriteRes<[SBPort015]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def: InstRW<[SBWriteResGroup11], (instregex "SCASB")>;
-def: InstRW<[SBWriteResGroup11], (instregex "SCASL")>;
-def: InstRW<[SBWriteResGroup11], (instregex "SCASQ")>;
-def: InstRW<[SBWriteResGroup11], (instregex "SCASW")>;
+def: InstRW<[SBWriteResGroup11], (instrs SCASB,
+ SCASL,
+ SCASQ,
+ SCASW)>;
def SBWriteResGroup12 : SchedWriteRes<[SBPort0,SBPort1]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup12], (instregex "COMISDrr")>;
-def: InstRW<[SBWriteResGroup12], (instregex "COMISSrr")>;
-def: InstRW<[SBWriteResGroup12], (instregex "UCOMISDrr")>;
-def: InstRW<[SBWriteResGroup12], (instregex "UCOMISSrr")>;
-def: InstRW<[SBWriteResGroup12], (instregex "VCOMISDrr")>;
-def: InstRW<[SBWriteResGroup12], (instregex "VCOMISSrr")>;
-def: InstRW<[SBWriteResGroup12], (instregex "VUCOMISDrr")>;
-def: InstRW<[SBWriteResGroup12], (instregex "VUCOMISSrr")>;
-
-def SBWriteResGroup13 : SchedWriteRes<[SBPort0,SBPort5]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup13], (instregex "CVTPS2PDrr")>;
-def: InstRW<[SBWriteResGroup13], (instregex "PTESTrr")>;
-def: InstRW<[SBWriteResGroup13], (instregex "VCVTPS2PDYrr")>;
-def: InstRW<[SBWriteResGroup13], (instregex "VCVTPS2PDrr")>;
-def: InstRW<[SBWriteResGroup13], (instregex "VPTESTYrr")>;
-def: InstRW<[SBWriteResGroup13], (instregex "VPTESTrr")>;
-
-def SBWriteResGroup14 : SchedWriteRes<[SBPort0,SBPort15]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup14], (instregex "PSLLDrr")>;
-def: InstRW<[SBWriteResGroup14], (instregex "PSLLQrr")>;
-def: InstRW<[SBWriteResGroup14], (instregex "PSLLWrr")>;
-def: InstRW<[SBWriteResGroup14], (instregex "PSRADrr")>;
-def: InstRW<[SBWriteResGroup14], (instregex "PSRAWrr")>;
-def: InstRW<[SBWriteResGroup14], (instregex "PSRLDrr")>;
-def: InstRW<[SBWriteResGroup14], (instregex "PSRLQrr")>;
-def: InstRW<[SBWriteResGroup14], (instregex "PSRLWrr")>;
-def: InstRW<[SBWriteResGroup14], (instregex "VPSLLDrr")>;
-def: InstRW<[SBWriteResGroup14], (instregex "VPSLLQrr")>;
-def: InstRW<[SBWriteResGroup14], (instregex "VPSLLWrr")>;
-def: InstRW<[SBWriteResGroup14], (instregex "VPSRADrr")>;
-def: InstRW<[SBWriteResGroup14], (instregex "VPSRAWrr")>;
-def: InstRW<[SBWriteResGroup14], (instregex "VPSRLDrr")>;
-def: InstRW<[SBWriteResGroup14], (instregex "VPSRLQrr")>;
-def: InstRW<[SBWriteResGroup14], (instregex "VPSRLWrr")>;
+def: InstRW<[SBWriteResGroup12], (instregex "(V?)COMISDrr",
+ "(V?)COMISSrr",
+ "(V?)UCOMISDrr",
+ "(V?)UCOMISSrr")>;
def SBWriteResGroup15 : SchedWriteRes<[SBPort0,SBPort015]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup15], (instregex "CWD")>;
-def: InstRW<[SBWriteResGroup15], (instregex "FNSTSW16r")>;
-
-def SBWriteResGroup16 : SchedWriteRes<[SBPort1,SBPort05]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup16], (instregex "BSWAP(16|32|64)r")>;
-
-def SBWriteResGroup17 : SchedWriteRes<[SBPort5,SBPort15]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup17], (instregex "PINSRBrr")>;
-def: InstRW<[SBWriteResGroup17], (instregex "PINSRDrr")>;
-def: InstRW<[SBWriteResGroup17], (instregex "PINSRQrr")>;
-def: InstRW<[SBWriteResGroup17], (instregex "PINSRWrri")>;
-def: InstRW<[SBWriteResGroup17], (instregex "VPINSRBrr")>;
-def: InstRW<[SBWriteResGroup17], (instregex "VPINSRDrr")>;
-def: InstRW<[SBWriteResGroup17], (instregex "VPINSRQrr")>;
-def: InstRW<[SBWriteResGroup17], (instregex "VPINSRWrri")>;
+def: InstRW<[SBWriteResGroup15], (instrs CWD,
+ FNSTSW16r)>;
def SBWriteResGroup18 : SchedWriteRes<[SBPort5,SBPort015]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup18], (instregex "JRCXZ")>;
+def: InstRW<[SBWriteResGroup18], (instrs JCXZ, JECXZ, JRCXZ)>;
def: InstRW<[SBWriteResGroup18], (instregex "MMX_MOVDQ2Qrr")>;
def SBWriteResGroup19 : SchedWriteRes<[SBPort05,SBPort015]> {
@@ -903,300 +635,84 @@ def SBWriteResGroup19 : SchedWriteRes<[SBPort05,SBPort015]> {
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup19], (instregex "ADC(16|32|64)ri")>;
-def: InstRW<[SBWriteResGroup19], (instregex "ADC(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "ADC8ri")>;
-def: InstRW<[SBWriteResGroup19], (instregex "ADC8rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "CMOVAE(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "CMOVB(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "CMOVE(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "CMOVG(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "CMOVGE(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "CMOVL(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "CMOVLE(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "CMOVNE(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "CMOVNO(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "CMOVNP(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "CMOVNS(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "CMOVO(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "CMOVP(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "CMOVS(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "SBB(16|32|64)ri")>;
-def: InstRW<[SBWriteResGroup19], (instregex "SBB(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "SBB8ri")>;
-def: InstRW<[SBWriteResGroup19], (instregex "SBB8rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "SHLD(16|32|64)rri8")>;
-def: InstRW<[SBWriteResGroup19], (instregex "SHRD(16|32|64)rri8")>;
-
-def SBWriteResGroup20 : SchedWriteRes<[SBPort0]> {
- let Latency = 3;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SBWriteResGroup20], (instregex "MMX_PMADDUBSWrr64")>;
-def: InstRW<[SBWriteResGroup20], (instregex "MMX_PMULHRSWrr64")>;
-def: InstRW<[SBWriteResGroup20], (instregex "MMX_PMULUDQirr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "PMADDUBSWrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "PMADDWDrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "PMULDQrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "PMULHRSWrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "PMULHUWrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "PMULHWrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "PMULLDrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "PMULLWrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "PMULUDQrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "PSADBWrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "VPMADDUBSWrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "VPMADDWDrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "VPMULDQrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "VPMULHRSWrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "VPMULHUWrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "VPMULHWrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "VPMULLDrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "VPMULLWrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "VPMULUDQrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "VPSADBWrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "SHLD(16|32|64)rri8",
+ "SHRD(16|32|64)rri8")>;
def SBWriteResGroup21 : SchedWriteRes<[SBPort1]> {
let Latency = 3;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SBWriteResGroup21], (instregex "ADDPDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "ADDPSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "ADDSDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "ADDSSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "ADDSUBPDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "ADDSUBPSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "ADD_FPrST0")>;
-def: InstRW<[SBWriteResGroup21], (instregex "ADD_FST0r")>;
-def: InstRW<[SBWriteResGroup21], (instregex "ADD_FrST0")>;
-def: InstRW<[SBWriteResGroup21], (instregex "BSF(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "BSR(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "CMPPDrri")>;
-def: InstRW<[SBWriteResGroup21], (instregex "CMPPSrri")>;
-def: InstRW<[SBWriteResGroup21], (instregex "CMPSDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "CMPSSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "CRC32r(16|32|64)r8")>;
-def: InstRW<[SBWriteResGroup21], (instregex "CRC32r(16|32|64)r64")>;
-def: InstRW<[SBWriteResGroup21], (instregex "CVTDQ2PSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "CVTPS2DQrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "CVTTPS2DQrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "MAX(C?)PDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "MAX(C?)PSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "MAX(C?)SDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "MAX(C?)SSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "MIN(C?)PDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "MIN(C?)PSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "MIN(C?)SDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "MIN(C?)SSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "MMX_CVTPI2PSirr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "MMX_CVTPS2PIirr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "MMX_CVTTPS2PIirr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "MUL8r")>;
-def: InstRW<[SBWriteResGroup21], (instregex "POPCNT(16|32|64)rr")>;
def: InstRW<[SBWriteResGroup21], (instregex "PUSHFS64")>;
-def: InstRW<[SBWriteResGroup21], (instregex "ROUNDPDr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "ROUNDPSr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "ROUNDSDr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "ROUNDSSr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "SUBPDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "SUBPSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "SUBR_FPrST0")>;
-def: InstRW<[SBWriteResGroup21], (instregex "SUBR_FST0r")>;
-def: InstRW<[SBWriteResGroup21], (instregex "SUBR_FrST0")>;
-def: InstRW<[SBWriteResGroup21], (instregex "SUBSDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "SUBSSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "SUB_FPrST0")>;
-def: InstRW<[SBWriteResGroup21], (instregex "SUB_FST0r")>;
-def: InstRW<[SBWriteResGroup21], (instregex "SUB_FrST0")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VADDPDYrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VADDPDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VADDPSYrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VADDPSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VADDSDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VADDSSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VADDSUBPDYrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VADDSUBPDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VADDSUBPSYrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VADDSUBPSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VCMPPDYrri")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VCMPPDrri")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VCMPPSYrri")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VCMPPSrri")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VCMPSDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VCMPSSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VCVTDQ2PSYrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VCVTDQ2PSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VCVTPS2DQYrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VCVTPS2DQrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VCVTTPS2DQYrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VCVTTPS2DQrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VMAX(C?)PDYrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VMAX(C?)PDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VMAX(C?)PSYrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VMAX(C?)PSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VMAX(C?)SDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VMAX(C?)SSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VMIN(C?)PDYrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VMIN(C?)PDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VMIN(C?)PSYrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VMIN(C?)PSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VMIN(C?)SDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VMIN(C?)SSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VROUNDPDr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VROUNDPSr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VROUNDSDr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VROUNDSSr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VROUNDYPDr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VROUNDYPSr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VSUBPDYrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VSUBPDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VSUBPSYrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VSUBPSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VSUBSDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VSUBSSrr")>;
-def SBWriteResGroup22 : SchedWriteRes<[SBPort0,SBPort5]> {
- let Latency = 3;
+def SBWriteResGroup21_16i : SchedWriteRes<[SBPort1, SBPort015]> {
+ let Latency = 4;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup22], (instregex "EXTRACTPSrr")>;
-def: InstRW<[SBWriteResGroup22], (instregex "VEXTRACTPSrr")>;
+def: InstRW<[SBWriteResGroup21_16i], (instrs IMUL16rri, IMUL16rri8)>;
-def SBWriteResGroup23 : SchedWriteRes<[SBPort0,SBPort15]> {
+def SBWriteResGroup22 : SchedWriteRes<[SBPort0,SBPort5]> {
let Latency = 3;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup23], (instregex "PEXTRBrr")>;
-def: InstRW<[SBWriteResGroup23], (instregex "PEXTRDrr")>;
-def: InstRW<[SBWriteResGroup23], (instregex "PEXTRQrr")>;
-def: InstRW<[SBWriteResGroup23], (instregex "PEXTRWri")>;
-def: InstRW<[SBWriteResGroup23], (instregex "VPEXTRBrr")>;
-def: InstRW<[SBWriteResGroup23], (instregex "VPEXTRDrr")>;
-def: InstRW<[SBWriteResGroup23], (instregex "VPEXTRQrr")>;
-def: InstRW<[SBWriteResGroup23], (instregex "VPEXTRWri")>;
+def: InstRW<[SBWriteResGroup22], (instregex "(V?)EXTRACTPSrr")>;
def SBWriteResGroup23_2 : SchedWriteRes<[SBPort05]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [3];
}
-def: InstRW<[SBWriteResGroup23_2], (instregex "ROL(16|32|64)rCL")>;
-def: InstRW<[SBWriteResGroup23_2], (instregex "ROL8rCL")>;
-def: InstRW<[SBWriteResGroup23_2], (instregex "ROR(16|32|64)rCL")>;
-def: InstRW<[SBWriteResGroup23_2], (instregex "ROR8rCL")>;
-def: InstRW<[SBWriteResGroup23_2], (instregex "SAR(16|32|64)rCL")>;
-def: InstRW<[SBWriteResGroup23_2], (instregex "SAR8rCL")>;
-def: InstRW<[SBWriteResGroup23_2], (instregex "SHL(16|32|64)rCL")>;
-def: InstRW<[SBWriteResGroup23_2], (instregex "SHL8rCL")>;
-def: InstRW<[SBWriteResGroup23_2], (instregex "SHR(16|32|64)rCL")>;
-def: InstRW<[SBWriteResGroup23_2], (instregex "SHR8rCL")>;
-
-def SBWriteResGroup24 : SchedWriteRes<[SBPort15]> {
- let Latency = 3;
- let NumMicroOps = 3;
- let ResourceCycles = [3];
-}
-def: InstRW<[SBWriteResGroup24], (instregex "MMX_PHADDSWrr64")>;
-def: InstRW<[SBWriteResGroup24], (instregex "MMX_PHADDWrr64")>;
-def: InstRW<[SBWriteResGroup24], (instregex "MMX_PHADDrr64")>;
-def: InstRW<[SBWriteResGroup24], (instregex "MMX_PHSUBDrr64")>;
-def: InstRW<[SBWriteResGroup24], (instregex "MMX_PHSUBSWrr64")>;
-def: InstRW<[SBWriteResGroup24], (instregex "MMX_PHSUBWrr64")>;
-def: InstRW<[SBWriteResGroup24], (instregex "PHADDDrr")>;
-def: InstRW<[SBWriteResGroup24], (instregex "PHADDSWrr128")>;
-def: InstRW<[SBWriteResGroup24], (instregex "PHADDWrr")>;
-def: InstRW<[SBWriteResGroup24], (instregex "PHSUBDrr")>;
-def: InstRW<[SBWriteResGroup24], (instregex "PHSUBSWrr128")>;
-def: InstRW<[SBWriteResGroup24], (instregex "PHSUBWrr")>;
-def: InstRW<[SBWriteResGroup24], (instregex "VPHADDDrr")>;
-def: InstRW<[SBWriteResGroup24], (instregex "VPHADDSWrr128")>;
-def: InstRW<[SBWriteResGroup24], (instregex "VPHADDWrr")>;
-def: InstRW<[SBWriteResGroup24], (instregex "VPHSUBDrr")>;
-def: InstRW<[SBWriteResGroup24], (instregex "VPHSUBSWrr128")>;
-def: InstRW<[SBWriteResGroup24], (instregex "VPHSUBWrr")>;
+def: InstRW<[SBWriteResGroup23_2], (instregex "ROL(8|16|32|64)rCL",
+ "ROR(8|16|32|64)rCL",
+ "SAR(8|16|32|64)rCL",
+ "SHL(8|16|32|64)rCL",
+ "SHR(8|16|32|64)rCL")>;
def SBWriteResGroup25 : SchedWriteRes<[SBPort015]> {
- let Latency = 3;
+ let Latency = 2;
let NumMicroOps = 3;
let ResourceCycles = [3];
}
-def: InstRW<[SBWriteResGroup25], (instregex "ADC8i8")>;
-def: InstRW<[SBWriteResGroup25], (instregex "LEAVE64")>;
-def: InstRW<[SBWriteResGroup25], (instregex "OUT32rr")>;
-def: InstRW<[SBWriteResGroup25], (instregex "OUT8rr")>;
-def: InstRW<[SBWriteResGroup25], (instregex "SBB8i8")>;
-def: InstRW<[SBWriteResGroup25], (instregex "XADD(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup25], (instregex "XADD8rr")>;
-
-def SBWriteResGroup25_2 : SchedWriteRes<[SBPort5,SBPort05]> {
- let Latency = 3;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVBE_F")>;
-def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVB_F")>;
-def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVE_F")>;
-def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVNBE_F")>;
-def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVNB_F")>;
-def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVNE_F")>;
-def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVNP_F")>;
-def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVP_F")>;
+def: InstRW<[SBWriteResGroup25], (instrs XADD8rr, XADD16rr, XADD32rr, XADD64rr,
+ XCHG8rr, XCHG16rr, XCHG32rr, XCHG64rr,
+ XCHG16ar, XCHG32ar, XCHG64ar)>;
-def SBWriteResGroup26 : SchedWriteRes<[SBPort05,SBPort015]> {
- let Latency = 3;
+def SBWriteResGroup25_1 : SchedWriteRes<[SBPort23,SBPort015]> {
+ let Latency = 7;
let NumMicroOps = 3;
- let ResourceCycles = [2,1];
+ let ResourceCycles = [1,2];
}
-def: InstRW<[SBWriteResGroup26], (instregex "CMOVA(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup26], (instregex "CMOVBE(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup25_1], (instrs LEAVE, LEAVE64)>;
def SBWriteResGroup26_2 : SchedWriteRes<[SBPort0,SBPort1,SBPort5]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SBWriteResGroup26_2], (instregex "COM_FIPr")>;
-def: InstRW<[SBWriteResGroup26_2], (instregex "COM_FIr")>;
-def: InstRW<[SBWriteResGroup26_2], (instregex "UCOM_FIPr")>;
-def: InstRW<[SBWriteResGroup26_2], (instregex "UCOM_FIr")>;
+def: InstRW<[SBWriteResGroup26_2], (instrs COM_FIPr, COM_FIr, UCOM_FIPr, UCOM_FIr)>;
def SBWriteResGroup27 : SchedWriteRes<[SBPort0,SBPort1]> {
let Latency = 4;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup27], (instregex "MUL(16|32|64)r")>;
+def: InstRW<[SBWriteResGroup27], (instrs IMUL64r, MUL64r)>;
-def SBWriteResGroup28 : SchedWriteRes<[SBPort1,SBPort5]> {
+def SBWriteResGroup27_1 : SchedWriteRes<[SBPort1,SBPort05,SBPort015]> {
let Latency = 4;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
}
-def: InstRW<[SBWriteResGroup28], (instregex "CVTDQ2PDrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "CVTPD2DQrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "CVTPD2PSrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "CVTSD2SSrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "CVTSI642SDrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "CVTSI2SDrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "CVTTPD2DQrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "MMX_CVTPD2PIirr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "MMX_CVTPI2PDirr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "MMX_CVTTPD2PIirr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "VCVTDQ2PDYrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "VCVTDQ2PDrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "VCVTPD2DQYrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "VCVTPD2DQrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "VCVTPD2PSYrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "VCVTPD2PSrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "VCVTSD2SSrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "VCVTSI642SDrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "VCVTSI2SDrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "VCVTTPD2DQYrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "VCVTTPD2DQrr")>;
+def: InstRW<[SBWriteResGroup27_1], (instrs IMUL32r, MUL32r)>;
+
+def SBWriteResGroup27_2 : SchedWriteRes<[SBPort1,SBPort05,SBPort015]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
+}
+def: InstRW<[SBWriteResGroup27_2], (instrs IMUL16r, MUL16r)>;
def SBWriteResGroup29 : SchedWriteRes<[SBPort1,SBPort015]> {
let Latency = 4;
@@ -1210,288 +726,97 @@ def SBWriteResGroup29_2 : SchedWriteRes<[SBPort5,SBPort015]> {
let NumMicroOps = 4;
let ResourceCycles = [1,3];
}
-def: InstRW<[SBWriteResGroup29_2], (instregex "OUT32ir")>;
-def: InstRW<[SBWriteResGroup29_2], (instregex "OUT8ir")>;
-def: InstRW<[SBWriteResGroup29_2], (instregex "PAUSE")>;
+def: InstRW<[SBWriteResGroup29_2], (instrs PAUSE)>;
def SBWriteResGroup29_3 : SchedWriteRes<[SBPort05,SBPort015]> {
let Latency = 4;
let NumMicroOps = 4;
let ResourceCycles = [3,1];
}
-def: InstRW<[SBWriteResGroup29_3], (instregex "SHLD(16|32|64)rrCL")>;
-def: InstRW<[SBWriteResGroup29_3], (instregex "SHRD(16|32|64)rrCL")>;
+def: InstRW<[SBWriteResGroup29_3], (instregex "SHLD(16|32|64)rrCL",
+ "SHRD(16|32|64)rrCL")>;
def SBWriteResGroup30 : SchedWriteRes<[SBPort0]> {
let Latency = 5;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SBWriteResGroup30], (instregex "MULPDrr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "MULPSrr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "MULSDrr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "MULSSrr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "MUL_FPrST0")>;
-def: InstRW<[SBWriteResGroup30], (instregex "MUL_FST0r")>;
-def: InstRW<[SBWriteResGroup30], (instregex "MUL_FrST0")>;
-def: InstRW<[SBWriteResGroup30], (instregex "PCMPGTQrr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "PHMINPOSUWrr128")>;
-def: InstRW<[SBWriteResGroup30], (instregex "RCPPSr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "RCPSSr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "RSQRTPSr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "RSQRTSSr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "VMULPDYrr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "VMULPDrr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "VMULPSYrr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "VMULPSrr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "VMULSDrr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "VMULSSrr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "VPCMPGTQrr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "VPHMINPOSUWrr128")>;
-def: InstRW<[SBWriteResGroup30], (instregex "VRCPPSr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "VRCPSSr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "VRSQRTPSr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "VRSQRTSSr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "(V?)PCMPGTQrr")>;
def SBWriteResGroup31 : SchedWriteRes<[SBPort23]> {
let Latency = 5;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SBWriteResGroup31], (instregex "MOV(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup31], (instregex "MOV8rm")>;
-def: InstRW<[SBWriteResGroup31], (instregex "MOVSX(16|32|64)rm16")>;
-def: InstRW<[SBWriteResGroup31], (instregex "MOVSX(16|32|64)rm32")>;
-def: InstRW<[SBWriteResGroup31], (instregex "MOVSX(16|32|64)rm8")>;
-def: InstRW<[SBWriteResGroup31], (instregex "MOVZX(16|32|64)rm16")>;
-def: InstRW<[SBWriteResGroup31], (instregex "MOVZX(16|32|64)rm8")>;
-def: InstRW<[SBWriteResGroup31], (instregex "PREFETCH")>;
-
-def SBWriteResGroup32 : SchedWriteRes<[SBPort0,SBPort1]> {
- let Latency = 5;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup32], (instregex "CVTSD2SI64rr")>;
-def: InstRW<[SBWriteResGroup32], (instregex "CVTSD2SIrr")>;
-def: InstRW<[SBWriteResGroup32], (instregex "CVTSS2SI64rr")>;
-def: InstRW<[SBWriteResGroup32], (instregex "CVTSS2SIrr")>;
-def: InstRW<[SBWriteResGroup32], (instregex "CVTTSD2SI64rr")>;
-def: InstRW<[SBWriteResGroup32], (instregex "CVTTSD2SIrr")>;
-def: InstRW<[SBWriteResGroup32], (instregex "CVTTSS2SI64rr")>;
-def: InstRW<[SBWriteResGroup32], (instregex "CVTTSS2SIrr")>;
-def: InstRW<[SBWriteResGroup32], (instregex "VCVTSD2SI64rr")>;
-def: InstRW<[SBWriteResGroup32], (instregex "VCVTSD2SIrr")>;
-def: InstRW<[SBWriteResGroup32], (instregex "VCVTSS2SI64rr")>;
-def: InstRW<[SBWriteResGroup32], (instregex "VCVTSS2SIrr")>;
-def: InstRW<[SBWriteResGroup32], (instregex "VCVTTSD2SI64rr")>;
-def: InstRW<[SBWriteResGroup32], (instregex "VCVTTSD2SIrr")>;
-def: InstRW<[SBWriteResGroup32], (instregex "VCVTTSS2SI64rr")>;
-def: InstRW<[SBWriteResGroup32], (instregex "VCVTTSS2SIrr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "MOVSX(16|32|64)rm(8|16|32)",
+ "MOVZX(16|32|64)rm(8|16)")>;
def SBWriteResGroup33 : SchedWriteRes<[SBPort4,SBPort23]> {
let Latency = 5;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup33], (instregex "MOV(16|32|64)mr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOV8mr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVAPDmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVAPSmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVDQAmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVDQUmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVHPDmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVHPSmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVLPDmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVLPSmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVNTDQmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVNTI_64mr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVNTImr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVNTPDmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVNTPSmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVPDI2DImr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVPQI2QImr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVPQIto64mr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVSDmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVSSmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVUPDmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVUPSmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "PUSH64i8")>;
-def: InstRW<[SBWriteResGroup33], (instregex "PUSH(16|32|64)r")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VEXTRACTF128mr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVAPDYmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVAPDmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVAPSYmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVAPSmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVDQAYmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVDQAmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVDQUYmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVDQUmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVHPDmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVHPSmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVLPDmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVLPSmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVNTDQYmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVNTDQmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVNTPDYmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVNTPDmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVNTPSYmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVNTPSmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVPDI2DImr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVPQI2QImr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVPQIto64mr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVSDmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVSSmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVUPDYmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVUPDmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVUPSYmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVUPSmr")>;
-
-def SBWriteResGroup34 : SchedWriteRes<[SBPort0,SBPort15]> {
- let Latency = 5;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[SBWriteResGroup34], (instregex "MPSADBWrri")>;
-def: InstRW<[SBWriteResGroup34], (instregex "VMPSADBWrri")>;
+def: InstRW<[SBWriteResGroup33], (instregex "PUSH(16r|32r|64r|64i8)")>;
def SBWriteResGroup35 : SchedWriteRes<[SBPort1,SBPort5]> {
let Latency = 5;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[SBWriteResGroup35], (instregex "CLI")>;
-def: InstRW<[SBWriteResGroup35], (instregex "CVTSI642SSrr")>;
-def: InstRW<[SBWriteResGroup35], (instregex "CVTSI2SSrr")>;
-def: InstRW<[SBWriteResGroup35], (instregex "HADDPDrr")>;
-def: InstRW<[SBWriteResGroup35], (instregex "HADDPSrr")>;
-def: InstRW<[SBWriteResGroup35], (instregex "HSUBPDrr")>;
-def: InstRW<[SBWriteResGroup35], (instregex "HSUBPSrr")>;
-def: InstRW<[SBWriteResGroup35], (instregex "VCVTSI642SSrr")>;
-def: InstRW<[SBWriteResGroup35], (instregex "VCVTSI2SSrr")>;
-def: InstRW<[SBWriteResGroup35], (instregex "VHADDPDYrr")>;
-def: InstRW<[SBWriteResGroup35], (instregex "VHADDPDrr")>;
-def: InstRW<[SBWriteResGroup35], (instregex "VHADDPSYrr")>;
-def: InstRW<[SBWriteResGroup35], (instregex "VHADDPSrr")>;
-def: InstRW<[SBWriteResGroup35], (instregex "VHSUBPDYrr")>;
-def: InstRW<[SBWriteResGroup35], (instregex "VHSUBPDrr")>;
-def: InstRW<[SBWriteResGroup35], (instregex "VHSUBPSYrr")>;
-def: InstRW<[SBWriteResGroup35], (instregex "VHSUBPSrr")>;
+def: InstRW<[SBWriteResGroup35], (instrs CLI)>;
def SBWriteResGroup35_2 : SchedWriteRes<[SBPort1,SBPort4,SBPort23]> {
let Latency = 5;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SBWriteResGroup35_2], (instregex "ISTT_FP16m")>;
-def: InstRW<[SBWriteResGroup35_2], (instregex "ISTT_FP32m")>;
-def: InstRW<[SBWriteResGroup35_2], (instregex "ISTT_FP64m")>;
-def: InstRW<[SBWriteResGroup35_2], (instregex "PUSHGS64")>;
+def: InstRW<[SBWriteResGroup35_2], (instregex "ISTT_FP(16|32|64)m",
+ "PUSHGS64")>;
def SBWriteResGroup36 : SchedWriteRes<[SBPort4,SBPort5,SBPort23]> {
let Latency = 5;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SBWriteResGroup36], (instregex "CALL64pcrel32")>;
-def: InstRW<[SBWriteResGroup36], (instregex "CALL(16|32|64)r")>;
-def: InstRW<[SBWriteResGroup36], (instregex "EXTRACTPSmr")>;
-def: InstRW<[SBWriteResGroup36], (instregex "VEXTRACTPSmr")>;
-
-def SBWriteResGroup37 : SchedWriteRes<[SBPort4,SBPort01,SBPort23]> {
- let Latency = 5;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SBWriteResGroup37], (instregex "VMASKMOVPDYmr")>;
-def: InstRW<[SBWriteResGroup37], (instregex "VMASKMOVPDmr")>;
-def: InstRW<[SBWriteResGroup37], (instregex "VMASKMOVPSYmr")>;
-def: InstRW<[SBWriteResGroup37], (instregex "VMASKMOVPSmr")>;
-
-def SBWriteResGroup38 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> {
- let Latency = 5;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SBWriteResGroup38], (instregex "SETAEm")>;
-def: InstRW<[SBWriteResGroup38], (instregex "SETBm")>;
-def: InstRW<[SBWriteResGroup38], (instregex "SETEm")>;
-def: InstRW<[SBWriteResGroup38], (instregex "SETGEm")>;
-def: InstRW<[SBWriteResGroup38], (instregex "SETGm")>;
-def: InstRW<[SBWriteResGroup38], (instregex "SETLEm")>;
-def: InstRW<[SBWriteResGroup38], (instregex "SETLm")>;
-def: InstRW<[SBWriteResGroup38], (instregex "SETNEm")>;
-def: InstRW<[SBWriteResGroup38], (instregex "SETNOm")>;
-def: InstRW<[SBWriteResGroup38], (instregex "SETNPm")>;
-def: InstRW<[SBWriteResGroup38], (instregex "SETNSm")>;
-def: InstRW<[SBWriteResGroup38], (instregex "SETOm")>;
-def: InstRW<[SBWriteResGroup38], (instregex "SETPm")>;
-def: InstRW<[SBWriteResGroup38], (instregex "SETSm")>;
-
-def SBWriteResGroup39 : SchedWriteRes<[SBPort4,SBPort23,SBPort15]> {
- let Latency = 5;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SBWriteResGroup39], (instregex "PEXTRBmr")>;
-def: InstRW<[SBWriteResGroup39], (instregex "VPEXTRBmr")>;
-def: InstRW<[SBWriteResGroup39], (instregex "VPEXTRDmr")>;
-def: InstRW<[SBWriteResGroup39], (instregex "VPEXTRWmr")>;
+def: InstRW<[SBWriteResGroup36], (instrs CALL64pcrel32)>;
+def: InstRW<[SBWriteResGroup36], (instregex "CALL(16|32|64)r",
+ "(V?)EXTRACTPSmr")>;
def SBWriteResGroup40 : SchedWriteRes<[SBPort4,SBPort23,SBPort015]> {
let Latency = 5;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SBWriteResGroup40], (instregex "MOV8mi")>;
-def: InstRW<[SBWriteResGroup40], (instregex "STOSB")>;
-def: InstRW<[SBWriteResGroup40], (instregex "STOSL")>;
-def: InstRW<[SBWriteResGroup40], (instregex "STOSQ")>;
-def: InstRW<[SBWriteResGroup40], (instregex "STOSW")>;
+def: InstRW<[SBWriteResGroup40], (instrs STOSB, STOSL, STOSQ, STOSW)>;
def SBWriteResGroup41 : SchedWriteRes<[SBPort5,SBPort015]> {
let Latency = 5;
let NumMicroOps = 4;
let ResourceCycles = [1,3];
}
-def: InstRW<[SBWriteResGroup41], (instregex "FNINIT")>;
+def: InstRW<[SBWriteResGroup41], (instrs FNINIT)>;
def SBWriteResGroup42 : SchedWriteRes<[SBPort05,SBPort015]> {
let Latency = 5;
let NumMicroOps = 4;
let ResourceCycles = [1,3];
}
-def: InstRW<[SBWriteResGroup42], (instregex "CMPXCHG(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup42], (instregex "CMPXCHG8rr")>;
+def: InstRW<[SBWriteResGroup42], (instregex "CMPXCHG(8|16|32|64)rr")>;
def SBWriteResGroup43 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> {
- let Latency = 5;
+ let Latency = 3;
let NumMicroOps = 4;
let ResourceCycles = [1,1,2];
}
-def: InstRW<[SBWriteResGroup43], (instregex "SETAm")>;
-def: InstRW<[SBWriteResGroup43], (instregex "SETBEm")>;
-
-def SBWriteResGroup44 : SchedWriteRes<[SBPort0,SBPort4,SBPort5,SBPort23]> {
- let Latency = 5;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,1,1];
-}
-def: InstRW<[SBWriteResGroup44], (instregex "LDMXCSR")>;
-def: InstRW<[SBWriteResGroup44], (instregex "STMXCSR")>;
-def: InstRW<[SBWriteResGroup44], (instregex "VLDMXCSR")>;
-def: InstRW<[SBWriteResGroup44], (instregex "VSTMXCSR")>;
+def: InstRW<[SBWriteResGroup43], (instregex "SET(A|BE)m")>;
def SBWriteResGroup45 : SchedWriteRes<[SBPort0,SBPort4,SBPort23,SBPort15]> {
let Latency = 5;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[SBWriteResGroup45], (instregex "PEXTRDmr")>;
-def: InstRW<[SBWriteResGroup45], (instregex "PEXTRQmr")>;
-def: InstRW<[SBWriteResGroup45], (instregex "VPEXTRQmr")>;
-def: InstRW<[SBWriteResGroup45], (instregex "PUSHF16")>;
-def: InstRW<[SBWriteResGroup45], (instregex "PUSHF64")>;
+def: InstRW<[SBWriteResGroup45], (instregex "(V?)PEXTR(D|Q)mr",
+ "PUSHF(16|64)")>;
def SBWriteResGroup46 : SchedWriteRes<[SBPort4,SBPort5,SBPort01,SBPort23]> {
let Latency = 5;
@@ -1512,49 +837,23 @@ def SBWriteResGroup48 : SchedWriteRes<[SBPort23]> {
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SBWriteResGroup48], (instregex "LDDQUrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MMX_MOVD64from64rm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MOV64toPQIrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MOVAPDrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MOVAPSrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MOVDDUPrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MOVDI2PDIrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MOVDQArm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MOVDQUrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MOVNTDQArm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MOVQI2PQIrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MOVSDrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MOVSHDUPrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MOVSLDUPrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MOVSSrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MOVUPDrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MOVUPSrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "POP(16|32|64)r")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VBROADCASTSSrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VLDDQUYrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VLDDQUrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VMOV64toPQIrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VMOVAPDrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VMOVAPSrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VMOVDDUPrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VMOVDI2PDIrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VMOVDQArm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VMOVDQUrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VMOVNTDQArm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VMOVQI2PQIrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VMOVSDrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VMOVSHDUPrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VMOVSLDUPrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VMOVSSrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VMOVUPDrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VMOVUPSrm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "MMX_MOVD64from64rm",
+ "POP(16|32|64)r",
+ "VBROADCASTSSrm",
+ "(V?)MOV64toPQIrm",
+ "(V?)MOVDDUPrm",
+ "(V?)MOVDI2PDIrm",
+ "(V?)MOVQI2PQIrm",
+ "(V?)MOVSDrm",
+ "(V?)MOVSHDUPrm",
+ "(V?)MOVSLDUPrm",
+ "(V?)MOVSSrm")>;
def SBWriteResGroup49 : SchedWriteRes<[SBPort5,SBPort23]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup49], (instregex "JMP(16|32|64)m")>;
def: InstRW<[SBWriteResGroup49], (instregex "MOV16sm")>;
def SBWriteResGroup50 : SchedWriteRes<[SBPort23,SBPort05]> {
@@ -1569,153 +868,42 @@ def SBWriteResGroup51 : SchedWriteRes<[SBPort23,SBPort15]> {
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup51], (instregex "MMX_PABSBrm64")>;
-def: InstRW<[SBWriteResGroup51], (instregex "MMX_PABSDrm64")>;
-def: InstRW<[SBWriteResGroup51], (instregex "MMX_PABSWrm64")>;
-def: InstRW<[SBWriteResGroup51], (instregex "MMX_PALIGNR64irm")>;
-def: InstRW<[SBWriteResGroup51], (instregex "MMX_PSHUFBrm64")>;
-def: InstRW<[SBWriteResGroup51], (instregex "MMX_PSIGNBrm64")>;
-def: InstRW<[SBWriteResGroup51], (instregex "MMX_PSIGNDrm64")>;
-def: InstRW<[SBWriteResGroup51], (instregex "MMX_PSIGNWrm64")>;
+def: InstRW<[SBWriteResGroup51], (instregex "MMX_PABS(B|D|W)rm",
+ "MMX_PALIGNRrmi",
+ "MMX_PSIGN(B|D|W)rm")>;
def SBWriteResGroup52 : SchedWriteRes<[SBPort23,SBPort015]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup52], (instregex "ADD(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup52], (instregex "ADD8rm")>;
-def: InstRW<[SBWriteResGroup52], (instregex "AND(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup52], (instregex "AND8rm")>;
-def: InstRW<[SBWriteResGroup52], (instregex "CMP(16|32|64)mi")>;
-def: InstRW<[SBWriteResGroup52], (instregex "CMP(16|32|64)mr")>;
-def: InstRW<[SBWriteResGroup52], (instregex "CMP(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup52], (instregex "CMP8mi")>;
-def: InstRW<[SBWriteResGroup52], (instregex "CMP8mr")>;
-def: InstRW<[SBWriteResGroup52], (instregex "CMP8rm")>;
-def: InstRW<[SBWriteResGroup52], (instregex "LODSL")>;
-def: InstRW<[SBWriteResGroup52], (instregex "LODSQ")>;
-def: InstRW<[SBWriteResGroup52], (instregex "OR(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup52], (instregex "OR8rm")>;
-def: InstRW<[SBWriteResGroup52], (instregex "SUB(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup52], (instregex "SUB8rm")>;
-def: InstRW<[SBWriteResGroup52], (instregex "XOR(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup52], (instregex "XOR8rm")>;
+def: InstRW<[SBWriteResGroup52], (instrs LODSL, LODSQ)>;
def SBWriteResGroup53 : SchedWriteRes<[SBPort4,SBPort23]> {
let Latency = 6;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[SBWriteResGroup53], (instregex "ST_F32m")>;
-def: InstRW<[SBWriteResGroup53], (instregex "ST_F64m")>;
-def: InstRW<[SBWriteResGroup53], (instregex "ST_FP32m")>;
-def: InstRW<[SBWriteResGroup53], (instregex "ST_FP64m")>;
-def: InstRW<[SBWriteResGroup53], (instregex "ST_FP80m")>;
+def: InstRW<[SBWriteResGroup53], (instregex "ST_F(32|64)m",
+ "ST_FP(32|64|80)m")>;
def SBWriteResGroup54 : SchedWriteRes<[SBPort23]> {
let Latency = 7;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SBWriteResGroup54], (instregex "VBROADCASTSDYrm")>;
-def: InstRW<[SBWriteResGroup54], (instregex "VBROADCASTSSYrm")>;
-def: InstRW<[SBWriteResGroup54], (instregex "VMOVAPDYrm")>;
-def: InstRW<[SBWriteResGroup54], (instregex "VMOVAPSYrm")>;
-def: InstRW<[SBWriteResGroup54], (instregex "VMOVDDUPYrm")>;
-def: InstRW<[SBWriteResGroup54], (instregex "VMOVDQAYrm")>;
-def: InstRW<[SBWriteResGroup54], (instregex "VMOVDQUYrm")>;
-def: InstRW<[SBWriteResGroup54], (instregex "VMOVSHDUPYrm")>;
-def: InstRW<[SBWriteResGroup54], (instregex "VMOVSLDUPYrm")>;
-def: InstRW<[SBWriteResGroup54], (instregex "VMOVUPDYrm")>;
-def: InstRW<[SBWriteResGroup54], (instregex "VMOVUPSYrm")>;
-
-def SBWriteResGroup55 : SchedWriteRes<[SBPort0,SBPort23]> {
- let Latency = 7;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup55], (instregex "CVTPS2PDrm")>;
-def: InstRW<[SBWriteResGroup55], (instregex "CVTSS2SDrm")>;
-def: InstRW<[SBWriteResGroup55], (instregex "VCVTPS2PDYrm")>;
-def: InstRW<[SBWriteResGroup55], (instregex "VCVTPS2PDrm")>;
-def: InstRW<[SBWriteResGroup55], (instregex "VCVTSS2SDrm")>;
-def: InstRW<[SBWriteResGroup55], (instregex "VTESTPDrm")>;
-def: InstRW<[SBWriteResGroup55], (instregex "VTESTPSrm")>;
-
-def SBWriteResGroup56 : SchedWriteRes<[SBPort5,SBPort23]> {
- let Latency = 7;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup56], (instregex "ANDNPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "ANDNPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "ANDPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "ANDPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "INSERTPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "MOVHPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "MOVHPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "MOVLPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "MOVLPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "ORPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "ORPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "SHUFPDrmi")>;
-def: InstRW<[SBWriteResGroup56], (instregex "SHUFPSrmi")>;
-def: InstRW<[SBWriteResGroup56], (instregex "UNPCKHPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "UNPCKHPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "UNPCKLPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "UNPCKLPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VANDNPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VANDNPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VANDPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VANDPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VBROADCASTF128")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VINSERTPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VMOVHPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VMOVHPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VMOVLPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VMOVLPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VORPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VORPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VPERMILPDmi")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VPERMILPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VPERMILPSmi")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VPERMILPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VSHUFPDrmi")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VSHUFPSrmi")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VUNPCKHPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VUNPCKHPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VUNPCKLPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VUNPCKLPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VXORPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VXORPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "XORPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "XORPSrm")>;
-
-def SBWriteResGroup57 : SchedWriteRes<[SBPort5,SBPort015]> {
- let Latency = 7;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup57], (instregex "AESDECLASTrr")>;
-def: InstRW<[SBWriteResGroup57], (instregex "AESDECrr")>;
-def: InstRW<[SBWriteResGroup57], (instregex "AESENCLASTrr")>;
-def: InstRW<[SBWriteResGroup57], (instregex "AESENCrr")>;
-def: InstRW<[SBWriteResGroup57], (instregex "VAESDECLASTrr")>;
-def: InstRW<[SBWriteResGroup57], (instregex "VAESDECrr")>;
-def: InstRW<[SBWriteResGroup57], (instregex "VAESENCLASTrr")>;
-def: InstRW<[SBWriteResGroup57], (instregex "VAESENCrr")>;
+def: InstRW<[SBWriteResGroup54], (instregex "VBROADCASTSDYrm",
+ "VBROADCASTSSYrm",
+ "VMOVDDUPYrm",
+ "VMOVSHDUPYrm",
+ "VMOVSLDUPYrm")>;
def SBWriteResGroup58 : SchedWriteRes<[SBPort23,SBPort05]> {
let Latency = 7;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup58], (instregex "BLENDPDrmi")>;
-def: InstRW<[SBWriteResGroup58], (instregex "BLENDPSrmi")>;
-def: InstRW<[SBWriteResGroup58], (instregex "VBLENDPDrmi")>;
-def: InstRW<[SBWriteResGroup58], (instregex "VBLENDPSrmi")>;
-def: InstRW<[SBWriteResGroup58], (instregex "VINSERTF128rm")>;
+def: InstRW<[SBWriteResGroup58], (instrs VINSERTF128rm)>;
def SBWriteResGroup59 : SchedWriteRes<[SBPort23,SBPort15]> {
let Latency = 7;
@@ -1723,1136 +911,282 @@ def SBWriteResGroup59 : SchedWriteRes<[SBPort23,SBPort15]> {
let ResourceCycles = [1,1];
}
def: InstRW<[SBWriteResGroup59], (instregex "MMX_PADDQirm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PABSBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PABSDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PABSWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PACKSSDWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PACKSSWBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PACKUSDWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PACKUSWBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PADDBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PADDDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PADDQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PADDSBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PADDSWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PADDUSBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PADDUSWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PADDWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PALIGNRrmi")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PAVGBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PAVGWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PBLENDWrmi")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PCMPEQBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PCMPEQDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PCMPEQQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PCMPEQWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PCMPGTBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PCMPGTDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PCMPGTWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PINSRBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PINSRDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PINSRQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PINSRWrmi")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMAXSBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMAXSDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMAXSWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMAXUBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMAXUDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMAXUWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMINSBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMINSDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMINSWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMINUBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMINUDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMINUWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMOVSXBDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMOVSXBQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMOVSXBWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMOVSXDQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMOVSXWDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMOVSXWQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMOVZXBDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMOVZXBQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMOVZXBWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMOVZXDQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMOVZXWDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMOVZXWQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PSHUFBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PSHUFDmi")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PSHUFHWmi")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PSHUFLWmi")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PSIGNBrm128")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PSIGNDrm128")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PSIGNWrm128")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PSUBBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PSUBDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PSUBQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PSUBSBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PSUBSWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PSUBUSBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PSUBUSWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PSUBWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKHBWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKHDQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKHQDQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKHWDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKLBWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKLDQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKLQDQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKLWDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPABSBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPABSDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPABSWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPACKSSDWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPACKSSWBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPACKUSDWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPACKUSWBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPADDBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPADDDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPADDQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPADDSBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPADDSWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPADDUSBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPADDUSWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPADDWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPALIGNRrmi")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPAVGBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPAVGWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPBLENDWrmi")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPCMPEQBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPCMPEQDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPCMPEQQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPCMPEQWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPCMPGTBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPCMPGTDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPCMPGTWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPINSRBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPINSRDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPINSRQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPINSRWrmi")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMAXSBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMAXSDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMAXSWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMAXUBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMAXUDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMAXUWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMINSBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMINSDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMINSWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMINUBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMINUDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMINUWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMOVSXBDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMOVSXBQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMOVSXBWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMOVSXDQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMOVSXWDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMOVSXWQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMOVZXBDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMOVZXBQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMOVZXBWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMOVZXDQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMOVZXWDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMOVZXWQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPSHUFBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPSHUFDmi")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPSHUFHWmi")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPSHUFLWmi")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPSIGNBrm128")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPSIGNDrm128")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPSIGNWrm128")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPSUBBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPSUBDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPSUBQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPSUBSBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPSUBSWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPSUBUSBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPSUBUSWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPSUBWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKHBWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKHDQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKHQDQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKHWDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKLBWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKLDQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKLQDQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKLWDrm")>;
-
-def SBWriteResGroup60 : SchedWriteRes<[SBPort23,SBPort015]> {
- let Latency = 7;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup60], (instregex "PANDNrm")>;
-def: InstRW<[SBWriteResGroup60], (instregex "PANDrm")>;
-def: InstRW<[SBWriteResGroup60], (instregex "PORrm")>;
-def: InstRW<[SBWriteResGroup60], (instregex "PXORrm")>;
-def: InstRW<[SBWriteResGroup60], (instregex "VPANDNrm")>;
-def: InstRW<[SBWriteResGroup60], (instregex "VPANDrm")>;
-def: InstRW<[SBWriteResGroup60], (instregex "VPORrm")>;
-def: InstRW<[SBWriteResGroup60], (instregex "VPXORrm")>;
-
-def SBWriteResGroup61 : SchedWriteRes<[SBPort0,SBPort05]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[SBWriteResGroup61], (instregex "VRCPPSYr")>;
-def: InstRW<[SBWriteResGroup61], (instregex "VRSQRTPSYr")>;
def SBWriteResGroup62 : SchedWriteRes<[SBPort5,SBPort23]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SBWriteResGroup62], (instregex "VERRm")>;
-def: InstRW<[SBWriteResGroup62], (instregex "VERWm")>;
+def: InstRW<[SBWriteResGroup62], (instregex "VER(R|W)m")>;
def SBWriteResGroup63 : SchedWriteRes<[SBPort23,SBPort015]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[SBWriteResGroup63], (instregex "LODSB")>;
-def: InstRW<[SBWriteResGroup63], (instregex "LODSW")>;
+def: InstRW<[SBWriteResGroup63], (instrs LODSB, LODSW)>;
def SBWriteResGroup64 : SchedWriteRes<[SBPort5,SBPort01,SBPort23]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SBWriteResGroup64], (instregex "FARJMP64")>;
-
-def SBWriteResGroup65 : SchedWriteRes<[SBPort23,SBPort05,SBPort015]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SBWriteResGroup65], (instregex "ADC(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "ADC8rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "CMOVAE(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "CMOVB(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "CMOVE(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "CMOVG(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "CMOVGE(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "CMOVL(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "CMOVLE(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "CMOVNE(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "CMOVNO(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "CMOVNP(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "CMOVNS(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "CMOVO(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "CMOVP(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "CMOVS(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "SBB(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "SBB8rm")>;
+def: InstRW<[SBWriteResGroup64], (instrs FARJMP64)>;
def SBWriteResGroup66 : SchedWriteRes<[SBPort0,SBPort4,SBPort23]> {
let Latency = 7;
let NumMicroOps = 4;
let ResourceCycles = [1,1,2];
}
-def: InstRW<[SBWriteResGroup66], (instregex "FNSTSWm")>;
+def: InstRW<[SBWriteResGroup66], (instrs FNSTSWm)>;
def SBWriteResGroup67 : SchedWriteRes<[SBPort1,SBPort5,SBPort015]> {
let Latency = 7;
let NumMicroOps = 4;
let ResourceCycles = [1,2,1];
}
-def: InstRW<[SBWriteResGroup67], (instregex "SLDT(16|32|64)r")>;
-def: InstRW<[SBWriteResGroup67], (instregex "STR(16|32|64)r")>;
+def: InstRW<[SBWriteResGroup67], (instregex "SLDT(16|32|64)r",
+ "STR(16|32|64)r")>;
def SBWriteResGroup68 : SchedWriteRes<[SBPort4,SBPort5,SBPort23]> {
let Latency = 7;
let NumMicroOps = 4;
let ResourceCycles = [1,1,2];
}
+def: InstRW<[SBWriteResGroup68], (instrs FNSTCW16m)>;
def: InstRW<[SBWriteResGroup68], (instregex "CALL(16|32|64)m")>;
-def: InstRW<[SBWriteResGroup68], (instregex "FNSTCW16m")>;
def SBWriteResGroup69 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> {
let Latency = 7;
let NumMicroOps = 4;
let ResourceCycles = [1,2,1];
}
-def: InstRW<[SBWriteResGroup69], (instregex "BTC(16|32|64)mi8")>;
-def: InstRW<[SBWriteResGroup69], (instregex "BTR(16|32|64)mi8")>;
-def: InstRW<[SBWriteResGroup69], (instregex "BTS(16|32|64)mi8")>;
-def: InstRW<[SBWriteResGroup69], (instregex "SAR(16|32|64)mi")>;
-def: InstRW<[SBWriteResGroup69], (instregex "SAR8mi")>;
-def: InstRW<[SBWriteResGroup69], (instregex "SHL(16|32|64)m1")>;
-def: InstRW<[SBWriteResGroup69], (instregex "SHL(16|32|64)mi")>;
-def: InstRW<[SBWriteResGroup69], (instregex "SHL8m1")>;
-def: InstRW<[SBWriteResGroup69], (instregex "SHL8mi")>;
-def: InstRW<[SBWriteResGroup69], (instregex "SHR(16|32|64)mi")>;
-def: InstRW<[SBWriteResGroup69], (instregex "SHR8mi")>;
-
-def SBWriteResGroup70 : SchedWriteRes<[SBPort4,SBPort23,SBPort015]> {
- let Latency = 7;
- let NumMicroOps = 4;
- let ResourceCycles = [1,2,1];
-}
-def: InstRW<[SBWriteResGroup70], (instregex "ADD(16|32|64)mi")>;
-def: InstRW<[SBWriteResGroup70], (instregex "ADD(16|32|64)mr")>;
-def: InstRW<[SBWriteResGroup70], (instregex "ADD8mi")>;
-def: InstRW<[SBWriteResGroup70], (instregex "ADD8mr")>;
-def: InstRW<[SBWriteResGroup70], (instregex "AND(16|32|64)mi")>;
-def: InstRW<[SBWriteResGroup70], (instregex "AND(16|32|64)mr")>;
-def: InstRW<[SBWriteResGroup70], (instregex "AND8mi")>;
-def: InstRW<[SBWriteResGroup70], (instregex "AND8mr")>;
-def: InstRW<[SBWriteResGroup70], (instregex "DEC(16|32|64)m")>;
-def: InstRW<[SBWriteResGroup70], (instregex "DEC8m")>;
-def: InstRW<[SBWriteResGroup70], (instregex "INC(16|32|64)m")>;
-def: InstRW<[SBWriteResGroup70], (instregex "INC8m")>;
-def: InstRW<[SBWriteResGroup70], (instregex "NEG(16|32|64)m")>;
-def: InstRW<[SBWriteResGroup70], (instregex "NEG8m")>;
-def: InstRW<[SBWriteResGroup70], (instregex "NOT(16|32|64)m")>;
-def: InstRW<[SBWriteResGroup70], (instregex "NOT8m")>;
-def: InstRW<[SBWriteResGroup70], (instregex "OR(16|32|64)mi")>;
-def: InstRW<[SBWriteResGroup70], (instregex "OR(16|32|64)mr")>;
-def: InstRW<[SBWriteResGroup70], (instregex "OR8mi")>;
-def: InstRW<[SBWriteResGroup70], (instregex "OR8mr")>;
-def: InstRW<[SBWriteResGroup70], (instregex "SUB(16|32|64)mi")>;
-def: InstRW<[SBWriteResGroup70], (instregex "SUB(16|32|64)mr")>;
-def: InstRW<[SBWriteResGroup70], (instregex "SUB8mi")>;
-def: InstRW<[SBWriteResGroup70], (instregex "SUB8mr")>;
-def: InstRW<[SBWriteResGroup70], (instregex "TEST(16|32|64)mr")>;
-def: InstRW<[SBWriteResGroup70], (instregex "TEST8mi")>;
-def: InstRW<[SBWriteResGroup70], (instregex "TEST8mr")>;
-def: InstRW<[SBWriteResGroup70], (instregex "XOR(16|32|64)mi")>;
-def: InstRW<[SBWriteResGroup70], (instregex "XOR(16|32|64)mr")>;
-def: InstRW<[SBWriteResGroup70], (instregex "XOR8mi")>;
-def: InstRW<[SBWriteResGroup70], (instregex "XOR8mr")>;
-
-def SBWriteResGroup71 : SchedWriteRes<[SBPort0,SBPort23]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup71], (instregex "MMX_PMADDUBSWrm64")>;
-def: InstRW<[SBWriteResGroup71], (instregex "MMX_PMULHRSWrm64")>;
-def: InstRW<[SBWriteResGroup71], (instregex "VTESTPDYrm")>;
-def: InstRW<[SBWriteResGroup71], (instregex "VTESTPSYrm")>;
-
-def SBWriteResGroup72 : SchedWriteRes<[SBPort1,SBPort23]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup72], (instregex "BSF(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup72], (instregex "BSR(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup72], (instregex "CRC32r(16|32|64)m64")>;
-def: InstRW<[SBWriteResGroup72], (instregex "CRC32r(16|32|64)m8")>;
-def: InstRW<[SBWriteResGroup72], (instregex "FCOM32m")>;
-def: InstRW<[SBWriteResGroup72], (instregex "FCOM64m")>;
-def: InstRW<[SBWriteResGroup72], (instregex "FCOMP32m")>;
-def: InstRW<[SBWriteResGroup72], (instregex "FCOMP64m")>;
-def: InstRW<[SBWriteResGroup72], (instregex "MUL8m")>;
-
-def SBWriteResGroup73 : SchedWriteRes<[SBPort5,SBPort23]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup73], (instregex "VANDNPDYrm")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VANDNPSYrm")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VANDPDYrm")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VANDPSYrm")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VORPDYrm")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VORPSYrm")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VPERM2F128rm")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VPERMILPDYmi")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VPERMILPDYrm")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VPERMILPSYmi")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VPERMILPSYrm")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VSHUFPDYrmi")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VSHUFPSYrmi")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VUNPCKHPDYrm")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VUNPCKHPSYrm")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VUNPCKLPDYrm")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VUNPCKLPSYrm")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VXORPDYrm")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VXORPSYrm")>;
-
-def SBWriteResGroup74 : SchedWriteRes<[SBPort23,SBPort05]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup74], (instregex "VBLENDPDYrmi")>;
-def: InstRW<[SBWriteResGroup74], (instregex "VBLENDPSYrmi")>;
-
-def SBWriteResGroup75 : SchedWriteRes<[SBPort23,SBPort05]> {
- let Latency = 8;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[SBWriteResGroup75], (instregex "BLENDVPDrm0")>;
-def: InstRW<[SBWriteResGroup75], (instregex "BLENDVPSrm0")>;
-def: InstRW<[SBWriteResGroup75], (instregex "VBLENDVPDrm")>;
-def: InstRW<[SBWriteResGroup75], (instregex "VBLENDVPSrm")>;
-def: InstRW<[SBWriteResGroup75], (instregex "VMASKMOVPDrm")>;
-def: InstRW<[SBWriteResGroup75], (instregex "VMASKMOVPSrm")>;
-
-def SBWriteResGroup76 : SchedWriteRes<[SBPort23,SBPort15]> {
- let Latency = 8;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[SBWriteResGroup76], (instregex "PBLENDVBrr0")>;
-def: InstRW<[SBWriteResGroup76], (instregex "VPBLENDVBrm")>;
+def: InstRW<[SBWriteResGroup69], (instregex "BTC(16|32|64)mi8",
+ "BTR(16|32|64)mi8",
+ "BTS(16|32|64)mi8",
+ "SAR(8|16|32|64)m1",
+ "SAR(8|16|32|64)mi",
+ "SHL(8|16|32|64)m1",
+ "SHL(8|16|32|64)mi",
+ "SHR(8|16|32|64)m1",
+ "SHR(8|16|32|64)mi")>;
def SBWriteResGroup77 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> {
let Latency = 8;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SBWriteResGroup77], (instregex "COMISDrm")>;
-def: InstRW<[SBWriteResGroup77], (instregex "COMISSrm")>;
-def: InstRW<[SBWriteResGroup77], (instregex "UCOMISDrm")>;
-def: InstRW<[SBWriteResGroup77], (instregex "UCOMISSrm")>;
-def: InstRW<[SBWriteResGroup77], (instregex "VCOMISDrm")>;
-def: InstRW<[SBWriteResGroup77], (instregex "VCOMISSrm")>;
-def: InstRW<[SBWriteResGroup77], (instregex "VUCOMISDrm")>;
-def: InstRW<[SBWriteResGroup77], (instregex "VUCOMISSrm")>;
-
-def SBWriteResGroup78 : SchedWriteRes<[SBPort0,SBPort5,SBPort23]> {
- let Latency = 8;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SBWriteResGroup78], (instregex "PTESTrm")>;
-def: InstRW<[SBWriteResGroup78], (instregex "VPTESTrm")>;
-
-def SBWriteResGroup79 : SchedWriteRes<[SBPort0,SBPort23,SBPort15]> {
- let Latency = 8;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SBWriteResGroup79], (instregex "PSLLDrm")>;
-def: InstRW<[SBWriteResGroup79], (instregex "PSLLQrm")>;
-def: InstRW<[SBWriteResGroup79], (instregex "PSLLWrm")>;
-def: InstRW<[SBWriteResGroup79], (instregex "PSRADrm")>;
-def: InstRW<[SBWriteResGroup79], (instregex "PSRAWrm")>;
-def: InstRW<[SBWriteResGroup79], (instregex "PSRLDrm")>;
-def: InstRW<[SBWriteResGroup79], (instregex "PSRLQrm")>;
-def: InstRW<[SBWriteResGroup79], (instregex "PSRLWrm")>;
-def: InstRW<[SBWriteResGroup79], (instregex "VPSLLDrm")>;
-def: InstRW<[SBWriteResGroup79], (instregex "VPSLLQrm")>;
-def: InstRW<[SBWriteResGroup79], (instregex "VPSLLWrm")>;
-def: InstRW<[SBWriteResGroup79], (instregex "VPSRADrm")>;
-def: InstRW<[SBWriteResGroup79], (instregex "VPSRAWrm")>;
-def: InstRW<[SBWriteResGroup79], (instregex "VPSRLDrm")>;
-def: InstRW<[SBWriteResGroup79], (instregex "VPSRLQrm")>;
-def: InstRW<[SBWriteResGroup79], (instregex "VPSRLWrm")>;
-
-def SBWriteResGroup80 : SchedWriteRes<[SBPort23,SBPort15]> {
- let Latency = 8;
- let NumMicroOps = 4;
- let ResourceCycles = [1,3];
-}
-def: InstRW<[SBWriteResGroup80], (instregex "MMX_PHADDSWrm64")>;
-def: InstRW<[SBWriteResGroup80], (instregex "MMX_PHADDWrm64")>;
-def: InstRW<[SBWriteResGroup80], (instregex "MMX_PHADDrm64")>;
-def: InstRW<[SBWriteResGroup80], (instregex "MMX_PHSUBDrm64")>;
-def: InstRW<[SBWriteResGroup80], (instregex "MMX_PHSUBSWrm64")>;
-def: InstRW<[SBWriteResGroup80], (instregex "MMX_PHSUBWrm64")>;
+def: InstRW<[SBWriteResGroup77], (instregex "(V?)(U?)COMI(SD|SS)rm")>;
def SBWriteResGroup81 : SchedWriteRes<[SBPort23,SBPort015]> {
let Latency = 8;
let NumMicroOps = 4;
let ResourceCycles = [1,3];
}
-def: InstRW<[SBWriteResGroup81], (instregex "CMPXCHG(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup81], (instregex "CMPXCHG8rm")>;
-
-def SBWriteResGroup82 : SchedWriteRes<[SBPort23,SBPort05,SBPort015]> {
- let Latency = 8;
- let NumMicroOps = 4;
- let ResourceCycles = [1,2,1];
-}
-def: InstRW<[SBWriteResGroup82], (instregex "CMOVA(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup82], (instregex "CMOVBE(16|32|64)rm")>;
+def: InstRW<[SBWriteResGroup81], (instregex "CMPXCHG(8|16|32|64)rm")>;
def SBWriteResGroup83 : SchedWriteRes<[SBPort23,SBPort015]> {
let Latency = 8;
let NumMicroOps = 5;
let ResourceCycles = [2,3];
}
-def: InstRW<[SBWriteResGroup83], (instregex "CMPSB")>;
-def: InstRW<[SBWriteResGroup83], (instregex "CMPSL")>;
-def: InstRW<[SBWriteResGroup83], (instregex "CMPSQ")>;
-def: InstRW<[SBWriteResGroup83], (instregex "CMPSW")>;
+def: InstRW<[SBWriteResGroup83], (instrs CMPSB,
+ CMPSL,
+ CMPSQ,
+ CMPSW)>;
def SBWriteResGroup84 : SchedWriteRes<[SBPort4,SBPort5,SBPort23]> {
let Latency = 8;
let NumMicroOps = 5;
let ResourceCycles = [1,2,2];
}
-def: InstRW<[SBWriteResGroup84], (instregex "FLDCW16m")>;
+def: InstRW<[SBWriteResGroup84], (instrs FLDCW16m)>;
def SBWriteResGroup85 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> {
let Latency = 8;
let NumMicroOps = 5;
let ResourceCycles = [1,2,2];
}
-def: InstRW<[SBWriteResGroup85], (instregex "ROL(16|32|64)mi")>;
-def: InstRW<[SBWriteResGroup85], (instregex "ROL8mi")>;
-def: InstRW<[SBWriteResGroup85], (instregex "ROR(16|32|64)mi")>;
-def: InstRW<[SBWriteResGroup85], (instregex "ROR8mi")>;
+def: InstRW<[SBWriteResGroup85], (instregex "ROL(8|16|32|64)m1",
+ "ROL(8|16|32|64)mi",
+ "ROR(8|16|32|64)m1",
+ "ROR(8|16|32|64)mi")>;
def SBWriteResGroup86 : SchedWriteRes<[SBPort4,SBPort23,SBPort015]> {
let Latency = 8;
let NumMicroOps = 5;
let ResourceCycles = [1,2,2];
}
-def: InstRW<[SBWriteResGroup86], (instregex "MOVSB")>;
-def: InstRW<[SBWriteResGroup86], (instregex "MOVSL")>;
-def: InstRW<[SBWriteResGroup86], (instregex "MOVSQ")>;
-def: InstRW<[SBWriteResGroup86], (instregex "MOVSW")>;
-def: InstRW<[SBWriteResGroup86], (instregex "XADD(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup86], (instregex "XADD8rm")>;
+def: InstRW<[SBWriteResGroup86], (instrs MOVSB, MOVSL, MOVSQ, MOVSW)>;
+def: InstRW<[SBWriteResGroup86], (instregex "XADD(8|16|32|64)rm")>;
def SBWriteResGroup87 : SchedWriteRes<[SBPort4,SBPort5,SBPort01,SBPort23]> {
let Latency = 8;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,2];
}
-def: InstRW<[SBWriteResGroup87], (instregex "FARCALL64")>;
+def: InstRW<[SBWriteResGroup87], (instrs FARCALL64)>;
def SBWriteResGroup88 : SchedWriteRes<[SBPort4,SBPort23,SBPort05,SBPort015]> {
let Latency = 8;
let NumMicroOps = 5;
let ResourceCycles = [1,2,1,1];
}
-def: InstRW<[SBWriteResGroup88], (instregex "SHLD(16|32|64)mri8")>;
-def: InstRW<[SBWriteResGroup88], (instregex "SHRD(16|32|64)mri8")>;
+def: InstRW<[SBWriteResGroup88], (instregex "SHLD(16|32|64)mri8",
+ "SHRD(16|32|64)mri8")>;
-def SBWriteResGroup89 : SchedWriteRes<[SBPort0,SBPort23]> {
- let Latency = 9;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup89], (instregex "MMX_PMULUDQirm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "PMADDUBSWrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "PMADDWDrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "PMULDQrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "PMULHRSWrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "PMULHUWrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "PMULHWrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "PMULLDrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "PMULLWrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "PMULUDQrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "PSADBWrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "VPMADDUBSWrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "VPMADDWDrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "VPMULDQrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "VPMULHRSWrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "VPMULHUWrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "VPMULHWrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "VPMULLDrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "VPMULLWrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "VPMULUDQrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "VPSADBWrm")>;
-
-def SBWriteResGroup90 : SchedWriteRes<[SBPort1,SBPort23]> {
- let Latency = 9;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup90], (instregex "ADDPDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "ADDPSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "ADDSDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "ADDSSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "ADDSUBPDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "ADDSUBPSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "CMPPDrmi")>;
-def: InstRW<[SBWriteResGroup90], (instregex "CMPPSrmi")>;
-def: InstRW<[SBWriteResGroup90], (instregex "CMPSDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "CMPSSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "CVTDQ2PSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "CVTPS2DQrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "CVTSI642SDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "CVTSI2SDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "CVTTPS2DQrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "MAX(C?)PDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "MAX(C?)PSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "MAX(C?)SDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "MAX(C?)SSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "MIN(C?)PDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "MIN(C?)PSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "MIN(C?)SDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "MIN(C?)SSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "MMX_CVTPI2PSirm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "MMX_CVTPS2PIirm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "MMX_CVTTPS2PIirm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "POPCNT(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "ROUNDPDm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "ROUNDPSm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "ROUNDSDm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "ROUNDSSm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "SUBPDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "SUBPSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "SUBSDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "SUBSSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VADDPDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VADDPSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VADDSDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VADDSSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VADDSUBPDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VADDSUBPSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VCMPPDrmi")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VCMPPSrmi")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VCMPSDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VCMPSSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VCVTDQ2PSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VCVTPS2DQrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VCVTSI642SDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VCVTSI2SDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VCVTTPS2DQrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VMAX(C?)PDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VMAX(C?)PSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VMAX(C?)SDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VMAX(C?)SSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VMIN(C?)PDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VMIN(C?)PSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VMIN(C?)SDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VMIN(C?)SSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VROUNDPDm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VROUNDPSm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VROUNDSDm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VROUNDSSm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VSUBPDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VSUBPSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VSUBSDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VSUBSSrm")>;
-
-def SBWriteResGroup91 : SchedWriteRes<[SBPort23,SBPort05]> {
+def SBWriteResGroup93 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> {
let Latency = 9;
let NumMicroOps = 3;
- let ResourceCycles = [1,2];
+ let ResourceCycles = [1,1,1];
}
-def: InstRW<[SBWriteResGroup91], (instregex "VBLENDVPDYrm")>;
-def: InstRW<[SBWriteResGroup91], (instregex "VBLENDVPSYrm")>;
-def: InstRW<[SBWriteResGroup91], (instregex "VMASKMOVPDYrm")>;
-def: InstRW<[SBWriteResGroup91], (instregex "VMASKMOVPSYrm")>;
+def: InstRW<[SBWriteResGroup93], (instregex "CVT(T?)SD2SI(64)?rm",
+ "CVT(T?)SS2SI(64)?rm")>;
-def SBWriteResGroup92 : SchedWriteRes<[SBPort0,SBPort1,SBPort5]> {
+def SBWriteResGroup93_1 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> {
let Latency = 9;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SBWriteResGroup92], (instregex "DPPDrri")>;
-def: InstRW<[SBWriteResGroup92], (instregex "VDPPDrri")>;
+def: InstRW<[SBWriteResGroup93_1], (instrs IMUL64m, MUL64m)>;
-def SBWriteResGroup93 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> {
+def SBWriteResGroup93_2 : SchedWriteRes<[SBPort1,SBPort23,SBPort05,SBPort015]> {
let Latency = 9;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[SBWriteResGroup93], (instregex "CVTSD2SI64rm")>;
-def: InstRW<[SBWriteResGroup93], (instregex "CVTSD2SIrm")>;
-def: InstRW<[SBWriteResGroup93], (instregex "CVTSS2SI64rm")>;
-def: InstRW<[SBWriteResGroup93], (instregex "CVTSS2SIrm")>;
-def: InstRW<[SBWriteResGroup93], (instregex "CVTTSD2SI64rm")>;
-def: InstRW<[SBWriteResGroup93], (instregex "CVTTSD2SIrm")>;
-def: InstRW<[SBWriteResGroup93], (instregex "CVTTSS2SI64rm")>;
-def: InstRW<[SBWriteResGroup93], (instregex "CVTTSS2SIrm")>;
-def: InstRW<[SBWriteResGroup93], (instregex "MUL(16|32|64)m")>;
-
-def SBWriteResGroup94 : SchedWriteRes<[SBPort0,SBPort5,SBPort23]> {
+def: InstRW<[SBWriteResGroup93_2], (instrs IMUL32m, MUL32m)>;
+
+def SBWriteResGroup93_3 : SchedWriteRes<[SBPort1,SBPort05,SBPort015,SBPort23]> {
let Latency = 9;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,2,1];
+}
+def: InstRW<[SBWriteResGroup93_3], (instrs IMUL16m, MUL16m)>;
+
+def SBWriteResGroup93_4 : SchedWriteRes<[SBPort1,SBPort015,SBPort23]> {
+ let Latency = 8;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SBWriteResGroup94], (instregex "VPTESTYrm")>;
+def: InstRW<[SBWriteResGroup93_4], (instrs IMUL16rmi, IMUL16rmi8)>;
def SBWriteResGroup95 : SchedWriteRes<[SBPort5,SBPort01,SBPort23]> {
let Latency = 9;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SBWriteResGroup95], (instregex "LD_F32m")>;
-def: InstRW<[SBWriteResGroup95], (instregex "LD_F64m")>;
-def: InstRW<[SBWriteResGroup95], (instregex "LD_F80m")>;
-
-def SBWriteResGroup96 : SchedWriteRes<[SBPort23,SBPort15]> {
- let Latency = 9;
- let NumMicroOps = 4;
- let ResourceCycles = [1,3];
-}
-def: InstRW<[SBWriteResGroup96], (instregex "PHADDDrm")>;
-def: InstRW<[SBWriteResGroup96], (instregex "PHADDSWrm128")>;
-def: InstRW<[SBWriteResGroup96], (instregex "PHADDWrm")>;
-def: InstRW<[SBWriteResGroup96], (instregex "PHSUBDrm")>;
-def: InstRW<[SBWriteResGroup96], (instregex "PHSUBSWrm128")>;
-def: InstRW<[SBWriteResGroup96], (instregex "PHSUBWrm")>;
-def: InstRW<[SBWriteResGroup96], (instregex "VPHADDDrm")>;
-def: InstRW<[SBWriteResGroup96], (instregex "VPHADDSWrm128")>;
-def: InstRW<[SBWriteResGroup96], (instregex "VPHADDWrm")>;
-def: InstRW<[SBWriteResGroup96], (instregex "VPHSUBDrm")>;
-def: InstRW<[SBWriteResGroup96], (instregex "VPHSUBSWrm128")>;
-def: InstRW<[SBWriteResGroup96], (instregex "VPHSUBWrm")>;
+def: InstRW<[SBWriteResGroup95], (instregex "LD_F(32|64|80)m")>;
def SBWriteResGroup97 : SchedWriteRes<[SBPort1,SBPort4,SBPort23]> {
let Latency = 9;
let NumMicroOps = 4;
let ResourceCycles = [1,1,2];
}
-def: InstRW<[SBWriteResGroup97], (instregex "IST_F16m")>;
-def: InstRW<[SBWriteResGroup97], (instregex "IST_F32m")>;
-def: InstRW<[SBWriteResGroup97], (instregex "IST_FP16m")>;
-def: InstRW<[SBWriteResGroup97], (instregex "IST_FP32m")>;
-def: InstRW<[SBWriteResGroup97], (instregex "IST_FP64m")>;
+def: InstRW<[SBWriteResGroup97], (instregex "IST_F(16|32)m",
+ "IST_FP(16|32|64)m")>;
def SBWriteResGroup97_2 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> {
let Latency = 9;
let NumMicroOps = 6;
let ResourceCycles = [1,2,3];
}
-def: InstRW<[SBWriteResGroup97_2], (instregex "ROL(16|32|64)mCL")>;
-def: InstRW<[SBWriteResGroup97_2], (instregex "ROL8mCL")>;
-def: InstRW<[SBWriteResGroup97_2], (instregex "ROR(16|32|64)mCL")>;
-def: InstRW<[SBWriteResGroup97_2], (instregex "ROR8mCL")>;
-def: InstRW<[SBWriteResGroup97_2], (instregex "SAR(16|32|64)mCL")>;
-def: InstRW<[SBWriteResGroup97_2], (instregex "SAR8mCL")>;
-def: InstRW<[SBWriteResGroup97_2], (instregex "SHL(16|32|64)mCL")>;
-def: InstRW<[SBWriteResGroup97_2], (instregex "SHL8mCL")>;
-def: InstRW<[SBWriteResGroup97_2], (instregex "SHR(16|32|64)mCL")>;
-def: InstRW<[SBWriteResGroup97_2], (instregex "SHR8mCL")>;
+def: InstRW<[SBWriteResGroup97_2], (instregex "ROL(8|16|32|64)mCL",
+ "ROR(8|16|32|64)mCL",
+ "SAR(8|16|32|64)mCL",
+ "SHL(8|16|32|64)mCL",
+ "SHR(8|16|32|64)mCL")>;
def SBWriteResGroup98 : SchedWriteRes<[SBPort4,SBPort23,SBPort015]> {
let Latency = 9;
let NumMicroOps = 6;
let ResourceCycles = [1,2,3];
}
-def: InstRW<[SBWriteResGroup98], (instregex "ADC(16|32|64)mi")>;
-def: InstRW<[SBWriteResGroup98], (instregex "ADC8mi")>;
-def: InstRW<[SBWriteResGroup98], (instregex "SBB(16|32|64)mi")>;
-def: InstRW<[SBWriteResGroup98], (instregex "SBB8mi")>;
+def: SchedAlias<WriteADCRMW, SBWriteResGroup98>;
def SBWriteResGroup99 : SchedWriteRes<[SBPort4,SBPort23,SBPort05,SBPort015]> {
let Latency = 9;
let NumMicroOps = 6;
let ResourceCycles = [1,2,2,1];
}
-def: InstRW<[SBWriteResGroup99], (instregex "ADC(16|32|64)mr")>;
-def: InstRW<[SBWriteResGroup99], (instregex "ADC8mr")>;
-def: InstRW<[SBWriteResGroup99], (instregex "SBB(16|32|64)mr")>;
-def: InstRW<[SBWriteResGroup99], (instregex "SBB8mr")>;
+def: InstRW<[SBWriteResGroup99, ReadAfterLd], (instrs ADC8mr, ADC16mr, ADC32mr, ADC64mr,
+ SBB8mr, SBB16mr, SBB32mr, SBB64mr)>;
def SBWriteResGroup100 : SchedWriteRes<[SBPort4,SBPort5,SBPort23,SBPort05,SBPort015]> {
let Latency = 9;
let NumMicroOps = 6;
let ResourceCycles = [1,1,2,1,1];
}
-def: InstRW<[SBWriteResGroup100], (instregex "BT(16|32|64)mr")>;
-def: InstRW<[SBWriteResGroup100], (instregex "BTC(16|32|64)mr")>;
-def: InstRW<[SBWriteResGroup100], (instregex "BTR(16|32|64)mr")>;
-def: InstRW<[SBWriteResGroup100], (instregex "BTS(16|32|64)mr")>;
+def: InstRW<[SBWriteResGroup100], (instregex "BT(16|32|64)mr",
+ "BTC(16|32|64)mr",
+ "BTR(16|32|64)mr",
+ "BTS(16|32|64)mr")>;
def SBWriteResGroup101 : SchedWriteRes<[SBPort1,SBPort23]> {
let Latency = 10;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup101], (instregex "ADD_F32m")>;
-def: InstRW<[SBWriteResGroup101], (instregex "ADD_F64m")>;
-def: InstRW<[SBWriteResGroup101], (instregex "ILD_F16m")>;
-def: InstRW<[SBWriteResGroup101], (instregex "ILD_F32m")>;
-def: InstRW<[SBWriteResGroup101], (instregex "ILD_F64m")>;
-def: InstRW<[SBWriteResGroup101], (instregex "SUBR_F32m")>;
-def: InstRW<[SBWriteResGroup101], (instregex "SUBR_F64m")>;
-def: InstRW<[SBWriteResGroup101], (instregex "SUB_F32m")>;
-def: InstRW<[SBWriteResGroup101], (instregex "SUB_F64m")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VADDPDYrm")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VADDPSYrm")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VADDSUBPDYrm")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VADDSUBPSYrm")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VCMPPDYrmi")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VCMPPSYrmi")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VCVTDQ2PSYrm")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VCVTPS2DQYrm")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VCVTTPS2DQYrm")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VMAX(C?)PDYrm")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VMAX(C?)PSYrm")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VMIN(C?)PDYrm")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VMIN(C?)PSYrm")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VROUNDYPDm")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VROUNDYPSm")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VSUBPDYrm")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VSUBPSYrm")>;
-
-def SBWriteResGroup102 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> {
- let Latency = 10;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SBWriteResGroup102], (instregex "VCVTSD2SI64rm")>;
-def: InstRW<[SBWriteResGroup102], (instregex "VCVTSD2SIrm")>;
-def: InstRW<[SBWriteResGroup102], (instregex "VCVTSS2SI64rm")>;
-def: InstRW<[SBWriteResGroup102], (instregex "VCVTSS2SIrm")>;
-def: InstRW<[SBWriteResGroup102], (instregex "VCVTTSD2SI64rm")>;
-def: InstRW<[SBWriteResGroup102], (instregex "VCVTTSD2SIrm")>;
-def: InstRW<[SBWriteResGroup102], (instregex "VCVTTSS2SI64rm")>;
-def: InstRW<[SBWriteResGroup102], (instregex "VCVTTSS2SIrm")>;
-
-def SBWriteResGroup103 : SchedWriteRes<[SBPort1,SBPort5,SBPort23]> {
- let Latency = 10;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SBWriteResGroup103], (instregex "CVTDQ2PDrm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "CVTPD2DQrm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "CVTPD2PSrm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "CVTSD2SSrm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "CVTSI642SSrm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "CVTSI2SSrm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "CVTTPD2DQrm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "MMX_CVTPD2PIirm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "MMX_CVTPI2PDirm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "MMX_CVTTPD2PIirm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "VCVTDQ2PDYrm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "VCVTDQ2PDrm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "VCVTPD2DQrm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "VCVTPD2PSrm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "VCVTSD2SSrm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "VCVTSI642SSrm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "VCVTSI2SSrm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "VCVTTPD2DQrm")>;
+def: InstRW<[SBWriteResGroup101], (instregex "(ADD|SUB|SUBR)_F(32|64)m",
+ "ILD_F(16|32|64)m")>;
def SBWriteResGroup103_2 : SchedWriteRes<[SBPort4,SBPort23,SBPort05,SBPort015]> {
let Latency = 10;
let NumMicroOps = 7;
let ResourceCycles = [1,2,3,1];
}
-def: InstRW<[SBWriteResGroup103_2], (instregex "SHLD(16|32|64)mrCL")>;
-def: InstRW<[SBWriteResGroup103_2], (instregex "SHRD(16|32|64)mrCL")>;
+def: InstRW<[SBWriteResGroup103_2], (instregex "SHLD(16|32|64)mrCL",
+ "SHRD(16|32|64)mrCL")>;
def SBWriteResGroup104 : SchedWriteRes<[SBPort0,SBPort23]> {
let Latency = 11;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup104], (instregex "MULPDrm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "MULPSrm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "MULSDrm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "MULSSrm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "PCMPGTQrm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "PHMINPOSUWrm128")>;
-def: InstRW<[SBWriteResGroup104], (instregex "RCPPSm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "RCPSSm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "RSQRTPSm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "RSQRTSSm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "VMULPDrm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "VMULPSrm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "VMULSDrm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "VMULSSrm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "VPCMPGTQrm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "VPHMINPOSUWrm128")>;
-def: InstRW<[SBWriteResGroup104], (instregex "VRCPPSm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "VRCPSSm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "VRSQRTPSm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "VRSQRTSSm")>;
-
-def SBWriteResGroup105 : SchedWriteRes<[SBPort0]> {
- let Latency = 11;
- let NumMicroOps = 3;
- let ResourceCycles = [3];
-}
-def: InstRW<[SBWriteResGroup105], (instregex "PCMPISTRIrr")>;
-def: InstRW<[SBWriteResGroup105], (instregex "PCMPISTRM128rr")>;
-def: InstRW<[SBWriteResGroup105], (instregex "VPCMPISTRIrr")>;
-def: InstRW<[SBWriteResGroup105], (instregex "VPCMPISTRM128rr")>;
+def: InstRW<[SBWriteResGroup104], (instregex "(V?)PCMPGTQrm")>;
def SBWriteResGroup106 : SchedWriteRes<[SBPort1,SBPort23]> {
let Latency = 11;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SBWriteResGroup106], (instregex "FICOM16m")>;
-def: InstRW<[SBWriteResGroup106], (instregex "FICOM32m")>;
-def: InstRW<[SBWriteResGroup106], (instregex "FICOMP16m")>;
-def: InstRW<[SBWriteResGroup106], (instregex "FICOMP32m")>;
-
-def SBWriteResGroup107 : SchedWriteRes<[SBPort1,SBPort5,SBPort23]> {
- let Latency = 11;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SBWriteResGroup107], (instregex "VCVTPD2DQYrm")>;
-def: InstRW<[SBWriteResGroup107], (instregex "VCVTPD2PSYrm")>;
-def: InstRW<[SBWriteResGroup107], (instregex "VCVTTPD2DQYrm")>;
-
-def SBWriteResGroup108 : SchedWriteRes<[SBPort0,SBPort23,SBPort15]> {
- let Latency = 11;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,2];
-}
-def: InstRW<[SBWriteResGroup108], (instregex "MPSADBWrmi")>;
-def: InstRW<[SBWriteResGroup108], (instregex "VMPSADBWrmi")>;
-
-def SBWriteResGroup109 : SchedWriteRes<[SBPort1,SBPort5,SBPort23]> {
- let Latency = 11;
- let NumMicroOps = 4;
- let ResourceCycles = [1,2,1];
-}
-def: InstRW<[SBWriteResGroup109], (instregex "HADDPDrm")>;
-def: InstRW<[SBWriteResGroup109], (instregex "HADDPSrm")>;
-def: InstRW<[SBWriteResGroup109], (instregex "HSUBPDrm")>;
-def: InstRW<[SBWriteResGroup109], (instregex "HSUBPSrm")>;
-def: InstRW<[SBWriteResGroup109], (instregex "VHADDPDrm")>;
-def: InstRW<[SBWriteResGroup109], (instregex "VHADDPSrm")>;
-def: InstRW<[SBWriteResGroup109], (instregex "VHSUBPDrm")>;
-def: InstRW<[SBWriteResGroup109], (instregex "VHSUBPSrm")>;
-
-def SBWriteResGroup110 : SchedWriteRes<[SBPort5]> {
- let Latency = 12;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[SBWriteResGroup110], (instregex "AESIMCrr")>;
-def: InstRW<[SBWriteResGroup110], (instregex "VAESIMCrr")>;
+def: InstRW<[SBWriteResGroup106], (instregex "FICOM(P?)(16|32)m")>;
def SBWriteResGroup111 : SchedWriteRes<[SBPort0,SBPort23]> {
let Latency = 12;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup111], (instregex "MUL_F32m")>;
-def: InstRW<[SBWriteResGroup111], (instregex "MUL_F64m")>;
-def: InstRW<[SBWriteResGroup111], (instregex "VMULPDYrm")>;
-def: InstRW<[SBWriteResGroup111], (instregex "VMULPSYrm")>;
-
-def SBWriteResGroup112 : SchedWriteRes<[SBPort0,SBPort1,SBPort5]> {
- let Latency = 12;
- let NumMicroOps = 4;
- let ResourceCycles = [1,2,1];
-}
-def: InstRW<[SBWriteResGroup112], (instregex "DPPSrri")>;
-def: InstRW<[SBWriteResGroup112], (instregex "VDPPSYrri")>;
-def: InstRW<[SBWriteResGroup112], (instregex "VDPPSrri")>;
-
-def SBWriteResGroup113 : SchedWriteRes<[SBPort1,SBPort5,SBPort23]> {
- let Latency = 12;
- let NumMicroOps = 4;
- let ResourceCycles = [1,2,1];
-}
-def: InstRW<[SBWriteResGroup113], (instregex "VHADDPDYrm")>;
-def: InstRW<[SBWriteResGroup113], (instregex "VHADDPSYrm")>;
-def: InstRW<[SBWriteResGroup113], (instregex "VHSUBPDYrm")>;
-def: InstRW<[SBWriteResGroup113], (instregex "VHSUBPSYrm")>;
+def: InstRW<[SBWriteResGroup111], (instregex "MUL_F(32|64)m")>;
def SBWriteResGroup114 : SchedWriteRes<[SBPort1,SBPort23]> {
let Latency = 13;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SBWriteResGroup114], (instregex "ADD_FI16m")>;
-def: InstRW<[SBWriteResGroup114], (instregex "ADD_FI32m")>;
-def: InstRW<[SBWriteResGroup114], (instregex "SUBR_FI16m")>;
-def: InstRW<[SBWriteResGroup114], (instregex "SUBR_FI32m")>;
-def: InstRW<[SBWriteResGroup114], (instregex "SUB_FI16m")>;
-def: InstRW<[SBWriteResGroup114], (instregex "SUB_FI32m")>;
-
-def SBWriteResGroup115 : SchedWriteRes<[SBPort5,SBPort23,SBPort015]> {
- let Latency = 13;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SBWriteResGroup115], (instregex "AESDECLASTrm")>;
-def: InstRW<[SBWriteResGroup115], (instregex "AESDECrm")>;
-def: InstRW<[SBWriteResGroup115], (instregex "AESENCLASTrm")>;
-def: InstRW<[SBWriteResGroup115], (instregex "AESENCrm")>;
-def: InstRW<[SBWriteResGroup115], (instregex "VAESDECLASTrm")>;
-def: InstRW<[SBWriteResGroup115], (instregex "VAESDECrm")>;
-def: InstRW<[SBWriteResGroup115], (instregex "VAESENCLASTrm")>;
-def: InstRW<[SBWriteResGroup115], (instregex "VAESENCrm")>;
-
-def SBWriteResGroup116 : SchedWriteRes<[SBPort0]> {
- let Latency = 14;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SBWriteResGroup116], (instregex "DIVPSrr")>;
-def: InstRW<[SBWriteResGroup116], (instregex "DIVSSrr")>;
-def: InstRW<[SBWriteResGroup116], (instregex "SQRTPSr")>;
-def: InstRW<[SBWriteResGroup116], (instregex "SQRTSSr")>;
-def: InstRW<[SBWriteResGroup116], (instregex "VDIVPSrr")>;
-def: InstRW<[SBWriteResGroup116], (instregex "VDIVSSrr")>;
-def: InstRW<[SBWriteResGroup116], (instregex "VSQRTPSr")>;
-
-def SBWriteResGroup117 : SchedWriteRes<[SBPort0,SBPort23]> {
- let Latency = 14;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup117], (instregex "VSQRTSSm")>;
-
-def SBWriteResGroup118 : SchedWriteRes<[SBPort0,SBPort23,SBPort05]> {
- let Latency = 14;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SBWriteResGroup118], (instregex "VRCPPSYm")>;
-def: InstRW<[SBWriteResGroup118], (instregex "VRSQRTPSYm")>;
+def: InstRW<[SBWriteResGroup114], (instregex "(ADD|SUB|SUBR)_FI(16|32)m")>;
def SBWriteResGroup119 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> {
let Latency = 15;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SBWriteResGroup119], (instregex "MUL_FI16m")>;
-def: InstRW<[SBWriteResGroup119], (instregex "MUL_FI32m")>;
-
-def SBWriteResGroup120 : SchedWriteRes<[SBPort0,SBPort1,SBPort5,SBPort23]> {
- let Latency = 15;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,1,1];
-}
-def: InstRW<[SBWriteResGroup120], (instregex "DPPDrmi")>;
-def: InstRW<[SBWriteResGroup120], (instregex "VDPPDrmi")>;
-
-def SBWriteResGroup121 : SchedWriteRes<[SBPort0,SBPort23]> {
- let Latency = 17;
- let NumMicroOps = 4;
- let ResourceCycles = [3,1];
-}
-def: InstRW<[SBWriteResGroup121], (instregex "PCMPISTRIrm")>;
-def: InstRW<[SBWriteResGroup121], (instregex "PCMPISTRM128rm")>;
-def: InstRW<[SBWriteResGroup121], (instregex "VPCMPISTRIrm")>;
-def: InstRW<[SBWriteResGroup121], (instregex "VPCMPISTRM128rm")>;
-
-def SBWriteResGroup122 : SchedWriteRes<[SBPort5,SBPort23]> {
- let Latency = 18;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[SBWriteResGroup122], (instregex "AESIMCrm")>;
-def: InstRW<[SBWriteResGroup122], (instregex "VAESIMCrm")>;
-
-def SBWriteResGroup123 : SchedWriteRes<[SBPort0,SBPort23]> {
- let Latency = 20;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup123], (instregex "DIVPSrm")>;
-def: InstRW<[SBWriteResGroup123], (instregex "DIVSSrm")>;
-def: InstRW<[SBWriteResGroup123], (instregex "SQRTPSm")>;
-def: InstRW<[SBWriteResGroup123], (instregex "SQRTSSm")>;
-def: InstRW<[SBWriteResGroup123], (instregex "VDIVPSrm")>;
-def: InstRW<[SBWriteResGroup123], (instregex "VDIVSSrm")>;
-def: InstRW<[SBWriteResGroup123], (instregex "VSQRTPSm")>;
-
-def SBWriteResGroup124 : SchedWriteRes<[SBPort0]> {
- let Latency = 21;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SBWriteResGroup124], (instregex "VSQRTSDr")>;
-
-def SBWriteResGroup125 : SchedWriteRes<[SBPort0,SBPort23]> {
- let Latency = 21;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup125], (instregex "VSQRTSDm")>;
-
-def SBWriteResGroup126 : SchedWriteRes<[SBPort0]> {
- let Latency = 22;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SBWriteResGroup126], (instregex "DIVPDrr")>;
-def: InstRW<[SBWriteResGroup126], (instregex "DIVSDrr")>;
-def: InstRW<[SBWriteResGroup126], (instregex "SQRTPDr")>;
-def: InstRW<[SBWriteResGroup126], (instregex "SQRTSDr")>;
-def: InstRW<[SBWriteResGroup126], (instregex "VDIVPDrr")>;
-def: InstRW<[SBWriteResGroup126], (instregex "VDIVSDrr")>;
-def: InstRW<[SBWriteResGroup126], (instregex "VSQRTPDr")>;
-
-def SBWriteResGroup127 : SchedWriteRes<[SBPort0]> {
- let Latency = 24;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SBWriteResGroup127], (instregex "DIVR_FPrST0")>;
-def: InstRW<[SBWriteResGroup127], (instregex "DIVR_FST0r")>;
-def: InstRW<[SBWriteResGroup127], (instregex "DIVR_FrST0")>;
-def: InstRW<[SBWriteResGroup127], (instregex "DIV_FPrST0")>;
-def: InstRW<[SBWriteResGroup127], (instregex "DIV_FST0r")>;
-def: InstRW<[SBWriteResGroup127], (instregex "DIV_FrST0")>;
-
-def SBWriteResGroup128 : SchedWriteRes<[SBPort0,SBPort23]> {
- let Latency = 28;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup128], (instregex "DIVPDrm")>;
-def: InstRW<[SBWriteResGroup128], (instregex "DIVSDrm")>;
-def: InstRW<[SBWriteResGroup128], (instregex "SQRTPDm")>;
-def: InstRW<[SBWriteResGroup128], (instregex "SQRTSDm")>;
-def: InstRW<[SBWriteResGroup128], (instregex "VDIVPDrm")>;
-def: InstRW<[SBWriteResGroup128], (instregex "VDIVSDrm")>;
-def: InstRW<[SBWriteResGroup128], (instregex "VSQRTPDm")>;
-
-def SBWriteResGroup129 : SchedWriteRes<[SBPort0,SBPort05]> {
- let Latency = 29;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[SBWriteResGroup129], (instregex "VDIVPSYrr")>;
-def: InstRW<[SBWriteResGroup129], (instregex "VSQRTPSYr")>;
+def: InstRW<[SBWriteResGroup119], (instregex "MUL_FI(16|32)m")>;
def SBWriteResGroup130 : SchedWriteRes<[SBPort0,SBPort23]> {
let Latency = 31;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup130], (instregex "DIVR_F32m")>;
-def: InstRW<[SBWriteResGroup130], (instregex "DIVR_F64m")>;
-def: InstRW<[SBWriteResGroup130], (instregex "DIV_F32m")>;
-def: InstRW<[SBWriteResGroup130], (instregex "DIV_F64m")>;
+def: InstRW<[SBWriteResGroup130], (instregex "DIV(R?)_F(32|64)m")>;
def SBWriteResGroup131 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> {
let Latency = 34;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SBWriteResGroup131], (instregex "DIVR_FI16m")>;
-def: InstRW<[SBWriteResGroup131], (instregex "DIVR_FI32m")>;
-def: InstRW<[SBWriteResGroup131], (instregex "DIV_FI16m")>;
-def: InstRW<[SBWriteResGroup131], (instregex "DIV_FI32m")>;
+def: InstRW<[SBWriteResGroup131], (instregex "DIV(R?)_FI(16|32)m")>;
-def SBWriteResGroup132 : SchedWriteRes<[SBPort0,SBPort23,SBPort05]> {
- let Latency = 36;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SBWriteResGroup132], (instregex "VDIVPSYrm")>;
-def: InstRW<[SBWriteResGroup132], (instregex "VSQRTPSYm")>;
-
-def SBWriteResGroup133 : SchedWriteRes<[SBPort0,SBPort05]> {
- let Latency = 45;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[SBWriteResGroup133], (instregex "VDIVPDYrr")>;
-def: InstRW<[SBWriteResGroup133], (instregex "VSQRTPDYr")>;
-
-def SBWriteResGroup134 : SchedWriteRes<[SBPort0,SBPort23,SBPort05]> {
- let Latency = 52;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SBWriteResGroup134], (instregex "VDIVPDYrm")>;
-def: InstRW<[SBWriteResGroup134], (instregex "VSQRTPDYm")>;
-
-def SBWriteResGroup135 : SchedWriteRes<[SBPort0]> {
- let Latency = 114;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SBWriteResGroup135], (instregex "VSQRTSSr")>;
+def: InstRW<[WriteZero], (instrs CLC)>;
} // SchedModel
diff --git a/lib/Target/X86/X86SchedSkylakeClient.td b/lib/Target/X86/X86SchedSkylakeClient.td
index 9a417b2d3e82..1417799d76be 100644
--- a/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/lib/Target/X86/X86SchedSkylakeClient.td
@@ -19,7 +19,7 @@ def SkylakeClientModel : SchedMachineModel {
let MicroOpBufferSize = 224; // Based on the reorder buffer.
let LoadLatency = 5;
let MispredictPenalty = 14;
-
+
// Based on the LSD (loop-stream detector) queue size and benchmarking data.
let LoopMicroOpBufferSize = 50;
@@ -61,6 +61,10 @@ def SKLPort015 : ProcResGroup<[SKLPort0, SKLPort1, SKLPort5]>;
def SKLPort056 : ProcResGroup<[SKLPort0, SKLPort5, SKLPort6]>;
def SKLPort0156: ProcResGroup<[SKLPort0, SKLPort1, SKLPort5, SKLPort6]>;
+def SKLDivider : ProcResource<1>; // Integer division issued on port 0.
+// FP division and sqrt on port 0.
+def SKLFPDivider : ProcResource<1>;
+
// 60 Entry Unified Scheduler
def SKLPortAny : ProcResGroup<[SKLPort0, SKLPort1, SKLPort2, SKLPort3, SKLPort4,
SKLPort5, SKLPort6, SKLPort7]> {
@@ -77,45 +81,84 @@ def : ReadAdvance<ReadAfterLd, 5>;
// This multiclass defines the resource usage for variants with and without
// folded loads.
multiclass SKLWriteResPair<X86FoldableSchedWrite SchedRW,
- ProcResourceKind ExePort,
- int Lat> {
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [1], int UOps = 1,
+ int LoadLat = 5> {
// Register variant is using a single cycle on ExePort.
- def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
- // Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the
- // latency.
- def : WriteRes<SchedRW.Folded, [SKLPort23, ExePort]> {
- let Latency = !add(Lat, 5);
+ // Memory variant also uses a cycle on port 2/3 and adds LoadLat cycles to
+ // the latency (default = 5).
+ def : WriteRes<SchedRW.Folded, !listconcat([SKLPort23], ExePorts)> {
+ let Latency = !add(Lat, LoadLat);
+ let ResourceCycles = !listconcat([1], Res);
+ let NumMicroOps = !add(UOps, 1);
}
}
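// A minimal usage sketch of the multiclass above (illustrative only; it reuses
// the WriteALU/WriteIMul SchedWrites and the SKLPort0156/SKLPort1 resources
// that the arithmetic section below instantiates with the same values):
//   defm : SKLWriteResPair<WriteALU,  [SKLPort0156], 1>; // reg form: 1cy on port 0156
//   defm : SKLWriteResPair<WriteIMul, [SKLPort1],    3>; // folded load adds SKLPort23 and LoadLat (5cy)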
-// A folded store needs a cycle on port 4 for the store data, but it does not
-// need an extra port 2/3 cycle to recompute the address.
-def : WriteRes<WriteRMW, [SKLPort4]>;
+// A folded store needs a cycle on port 4 for the store data, and an extra port
+// 2/3/7 cycle to recompute the address.
+def : WriteRes<WriteRMW, [SKLPort237,SKLPort4]>;
// Arithmetic.
-defm : SKLWriteResPair<WriteALU, SKLPort0156, 1>; // Simple integer ALU op.
-defm : SKLWriteResPair<WriteIMul, SKLPort1, 3>; // Integer multiplication.
+defm : SKLWriteResPair<WriteALU, [SKLPort0156], 1>; // Simple integer ALU op.
+defm : SKLWriteResPair<WriteADC, [SKLPort06], 1>; // Integer ALU + flags op.
+defm : SKLWriteResPair<WriteIMul, [SKLPort1], 3>; // Integer multiplication.
+defm : SKLWriteResPair<WriteIMul64, [SKLPort1], 3>; // Integer 64-bit multiplication.
+
+defm : SKLWriteResPair<WriteBSWAP32,[SKLPort15], 1>; //
+defm : SKLWriteResPair<WriteBSWAP64,[SKLPort06, SKLPort15], 2, [1,1], 2>; //
+
+defm : SKLWriteResPair<WriteDiv8, [SKLPort0, SKLDivider], 25, [1,10], 1, 4>;
+defm : SKLWriteResPair<WriteDiv16, [SKLPort0, SKLDivider], 25, [1,10], 1, 4>;
+defm : SKLWriteResPair<WriteDiv32, [SKLPort0, SKLDivider], 25, [1,10], 1, 4>;
+defm : SKLWriteResPair<WriteDiv64, [SKLPort0, SKLDivider], 25, [1,10], 1, 4>;
+defm : SKLWriteResPair<WriteIDiv8, [SKLPort0, SKLDivider], 25, [1,10], 1, 4>;
+defm : SKLWriteResPair<WriteIDiv16, [SKLPort0, SKLDivider], 25, [1,10], 1, 4>;
+defm : SKLWriteResPair<WriteIDiv32, [SKLPort0, SKLDivider], 25, [1,10], 1, 4>;
+defm : SKLWriteResPair<WriteIDiv64, [SKLPort0, SKLDivider], 25, [1,10], 1, 4>;
+
+defm : SKLWriteResPair<WriteCRC32, [SKLPort1], 3>;
+
def : WriteRes<WriteIMulH, []> { let Latency = 3; } // Integer multiplication, high part.
-def SKLDivider : ProcResource<1>; // Integer division issued on port 0.
-def : WriteRes<WriteIDiv, [SKLPort0, SKLDivider]> { // Integer division.
- let Latency = 25;
- let ResourceCycles = [1, 10];
-}
-def : WriteRes<WriteIDivLd, [SKLPort23, SKLPort0, SKLDivider]> {
- let Latency = 29;
- let ResourceCycles = [1, 1, 10];
+def : WriteRes<WriteLEA, [SKLPort15]>; // LEA instructions can't fold loads.
+
+defm : SKLWriteResPair<WriteCMOV, [SKLPort06], 1, [1], 1>; // Conditional move.
+defm : SKLWriteResPair<WriteCMOV2, [SKLPort06], 2, [2], 2>; // Conditional (CF + ZF flag) move.
+defm : X86WriteRes<WriteFCMOV, [SKLPort1], 3, [1], 1>; // x87 conditional move.
+def : WriteRes<WriteSETCC, [SKLPort06]>; // Setcc.
+def : WriteRes<WriteSETCCStore, [SKLPort06,SKLPort4,SKLPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
}
+def : WriteRes<WriteLAHFSAHF, [SKLPort06]>;
-def : WriteRes<WriteLEA, [SKLPort15]>; // LEA instructions can't fold loads.
+// Bit counts.
+defm : SKLWriteResPair<WriteBSF, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteBSR, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteLZCNT, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteTZCNT, [SKLPort1], 3>;
+defm : SKLWriteResPair<WritePOPCNT, [SKLPort1], 3>;
// Integer shifts and rotates.
-defm : SKLWriteResPair<WriteShift, SKLPort06, 1>;
+defm : SKLWriteResPair<WriteShift, [SKLPort06], 1>;
+
+// Double shift instructions.
+defm : SKLWriteResPair<WriteShiftDouble, [SKLPort06], 1>;
+
+// BMI1 BEXTR, BMI2 BZHI
+defm : SKLWriteResPair<WriteBEXTR, [SKLPort06,SKLPort15], 2, [1,1], 2>;
+defm : SKLWriteResPair<WriteBZHI, [SKLPort15], 1>;
// Loads, stores, and moves, not folded with other operations.
-def : WriteRes<WriteLoad, [SKLPort23]> { let Latency = 5; }
-def : WriteRes<WriteStore, [SKLPort237, SKLPort4]>;
-def : WriteRes<WriteMove, [SKLPort0156]>;
+defm : X86WriteRes<WriteLoad, [SKLPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteStore, [SKLPort237, SKLPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteStoreNT, [SKLPort237, SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteMove, [SKLPort0156], 1, [1], 1>;
// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
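// A sketch of the corresponding mapping for reference (the existing WriteZero
// entry: an empty resource list means the zeroing idiom occupies no
// execution-port cycles):
//   def : WriteRes<WriteZero, []>;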
@@ -123,153 +166,373 @@ def : WriteRes<WriteZero, []>;
// Branches don't produce values, so they have no latency, but they still
// consume resources. Indirect branches can fold loads.
-defm : SKLWriteResPair<WriteJump, SKLPort06, 1>;
+defm : SKLWriteResPair<WriteJump, [SKLPort06], 1>;
// Floating point. This covers both scalar and vector operations.
-defm : SKLWriteResPair<WriteFAdd, SKLPort1, 3>; // Floating point add/sub/compare.
-defm : SKLWriteResPair<WriteFMul, SKLPort0, 5>; // Floating point multiplication.
-defm : SKLWriteResPair<WriteFDiv, SKLPort0, 12>; // 10-14 cycles. // Floating point division.
-defm : SKLWriteResPair<WriteFSqrt, SKLPort0, 15>; // Floating point square root.
-defm : SKLWriteResPair<WriteFRcp, SKLPort0, 5>; // Floating point reciprocal estimate.
-defm : SKLWriteResPair<WriteFRsqrt, SKLPort0, 5>; // Floating point reciprocal square root estimate.
-defm : SKLWriteResPair<WriteFMA, SKLPort01, 4>; // Fused Multiply Add.
-defm : SKLWriteResPair<WriteFShuffle, SKLPort5, 1>; // Floating point vector shuffles.
-defm : SKLWriteResPair<WriteFBlend, SKLPort015, 1>; // Floating point vector blends.
-def : WriteRes<WriteFVarBlend, [SKLPort5]> { // Fp vector variable blends.
- let Latency = 2;
- let ResourceCycles = [2];
-}
-def : WriteRes<WriteFVarBlendLd, [SKLPort5, SKLPort23]> {
- let Latency = 6;
- let ResourceCycles = [2, 1];
-}
+defm : X86WriteRes<WriteFLD0, [SKLPort05], 1, [1], 1>;
+defm : X86WriteRes<WriteFLD1, [SKLPort05], 1, [2], 2>;
+defm : X86WriteRes<WriteFLDC, [SKLPort05], 1, [2], 2>;
+defm : X86WriteRes<WriteFLoad, [SKLPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteFLoadX, [SKLPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteFLoadY, [SKLPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteFMaskedLoad, [SKLPort23,SKLPort015], 7, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedLoadY, [SKLPort23,SKLPort015], 8, [1,1], 2>;
+defm : X86WriteRes<WriteFStore, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreX, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreY, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNT, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNTX, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNTY, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStore, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStoreY, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteFMove, [SKLPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveX, [SKLPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveY, [SKLPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteEMMS, [SKLPort05,SKLPort0156], 10, [9,1], 10>;
+
+defm : SKLWriteResPair<WriteFAdd, [SKLPort01], 4, [1], 1, 5>; // Floating point add/sub.
+defm : SKLWriteResPair<WriteFAddX, [SKLPort01], 4, [1], 1, 6>;
+defm : SKLWriteResPair<WriteFAddY, [SKLPort01], 4, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFAddZ>;
+defm : SKLWriteResPair<WriteFAdd64, [SKLPort01], 4, [1], 1, 5>; // Floating point double add/sub.
+defm : SKLWriteResPair<WriteFAdd64X, [SKLPort01], 4, [1], 1, 6>;
+defm : SKLWriteResPair<WriteFAdd64Y, [SKLPort01], 4, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
+
+defm : SKLWriteResPair<WriteFCmp, [SKLPort01], 4, [1], 1, 5>; // Floating point compare.
+defm : SKLWriteResPair<WriteFCmpX, [SKLPort01], 4, [1], 1, 6>;
+defm : SKLWriteResPair<WriteFCmpY, [SKLPort01], 4, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFCmpZ>;
+defm : SKLWriteResPair<WriteFCmp64, [SKLPort01], 4, [1], 1, 5>; // Floating point double compare.
+defm : SKLWriteResPair<WriteFCmp64X, [SKLPort01], 4, [1], 1, 6>;
+defm : SKLWriteResPair<WriteFCmp64Y, [SKLPort01], 4, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
+
+defm : SKLWriteResPair<WriteFCom, [SKLPort0], 2>; // Floating point compare to flags.
+
+defm : SKLWriteResPair<WriteFMul, [SKLPort01], 4, [1], 1, 5>; // Floating point multiplication.
+defm : SKLWriteResPair<WriteFMulX, [SKLPort01], 4, [1], 1, 6>;
+defm : SKLWriteResPair<WriteFMulY, [SKLPort01], 4, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFMulZ>;
+defm : SKLWriteResPair<WriteFMul64, [SKLPort01], 4, [1], 1, 5>; // Floating point double multiplication.
+defm : SKLWriteResPair<WriteFMul64X, [SKLPort01], 4, [1], 1, 6>;
+defm : SKLWriteResPair<WriteFMul64Y, [SKLPort01], 4, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFMul64Z>;
+
+defm : SKLWriteResPair<WriteFDiv, [SKLPort0,SKLFPDivider], 11, [1,3], 1, 5>; // Floating point division.
+//defm : SKLWriteResPair<WriteFDivX, [SKLPort0,SKLFPDivider], 11, [1,3], 1, 6>;
+defm : SKLWriteResPair<WriteFDivY, [SKLPort0,SKLFPDivider], 11, [1,5], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFDivZ>;
+//defm : SKLWriteResPair<WriteFDiv64, [SKLPort0,SKLFPDivider], 14, [1,3], 1, 5>; // Floating point double division.
+//defm : SKLWriteResPair<WriteFDiv64X, [SKLPort0,SKLFPDivider], 14, [1,3], 1, 6>;
+//defm : SKLWriteResPair<WriteFDiv64Y, [SKLPort0,SKLFPDivider], 14, [1,5], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
+
+defm : SKLWriteResPair<WriteFSqrt, [SKLPort0,SKLFPDivider], 12, [1,3], 1, 5>; // Floating point square root.
+defm : SKLWriteResPair<WriteFSqrtX, [SKLPort0,SKLFPDivider], 12, [1,3], 1, 6>;
+defm : SKLWriteResPair<WriteFSqrtY, [SKLPort0,SKLFPDivider], 12, [1,6], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
+defm : SKLWriteResPair<WriteFSqrt64, [SKLPort0,SKLFPDivider], 18, [1,6], 1, 5>; // Floating point double square root.
+defm : SKLWriteResPair<WriteFSqrt64X, [SKLPort0,SKLFPDivider], 18, [1,6], 1, 6>;
+defm : SKLWriteResPair<WriteFSqrt64Y, [SKLPort0,SKLFPDivider], 18, [1,12],1, 7>;
+defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
+defm : SKLWriteResPair<WriteFSqrt80, [SKLPort0,SKLFPDivider], 21, [1,7]>; // Floating point long double square root.
+
+defm : SKLWriteResPair<WriteFRcp, [SKLPort0], 4, [1], 1, 5>; // Floating point reciprocal estimate.
+defm : SKLWriteResPair<WriteFRcpX, [SKLPort0], 4, [1], 1, 6>;
+defm : SKLWriteResPair<WriteFRcpY, [SKLPort0], 4, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFRcpZ>;
+
+defm : SKLWriteResPair<WriteFRsqrt, [SKLPort0], 4, [1], 1, 5>; // Floating point reciprocal square root estimate.
+defm : SKLWriteResPair<WriteFRsqrtX,[SKLPort0], 4, [1], 1, 6>;
+defm : SKLWriteResPair<WriteFRsqrtY,[SKLPort0], 4, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
+
+defm : SKLWriteResPair<WriteFMA, [SKLPort01], 4, [1], 1, 5>; // Fused Multiply Add.
+defm : SKLWriteResPair<WriteFMAX, [SKLPort01], 4, [1], 1, 6>;
+defm : SKLWriteResPair<WriteFMAY, [SKLPort01], 4, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFMAZ>;
+defm : SKLWriteResPair<WriteDPPD, [SKLPort5,SKLPort01], 9, [1,2], 3, 6>; // Floating point double dot product.
+defm : SKLWriteResPair<WriteDPPS, [SKLPort5,SKLPort01], 13, [1,3], 4, 6>;
+defm : SKLWriteResPair<WriteDPPSY, [SKLPort5,SKLPort01], 13, [1,3], 4, 7>;
+defm : X86WriteResPairUnsupported<WriteDPPSZ>;
+defm : SKLWriteResPair<WriteFSign, [SKLPort0], 1>; // Floating point fabs/fchs.
+defm : SKLWriteResPair<WriteFRnd, [SKLPort01], 8, [2], 2, 6>; // Floating point rounding.
+defm : SKLWriteResPair<WriteFRndY, [SKLPort01], 8, [2], 2, 7>;
+defm : X86WriteResPairUnsupported<WriteFRndZ>;
+defm : SKLWriteResPair<WriteFLogic, [SKLPort015], 1, [1], 1, 6>; // Floating point and/or/xor logicals.
+defm : SKLWriteResPair<WriteFLogicY, [SKLPort015], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFLogicZ>;
+defm : SKLWriteResPair<WriteFTest, [SKLPort0], 2, [1], 1, 6>; // Floating point TEST instructions.
+defm : SKLWriteResPair<WriteFTestY, [SKLPort0], 2, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFTestZ>;
+defm : SKLWriteResPair<WriteFShuffle, [SKLPort5], 1, [1], 1, 6>; // Floating point vector shuffles.
+defm : SKLWriteResPair<WriteFShuffleY, [SKLPort5], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
+defm : SKLWriteResPair<WriteFVarShuffle, [SKLPort5], 1, [1], 1, 6>; // Floating point vector variable shuffles.
+defm : SKLWriteResPair<WriteFVarShuffleY, [SKLPort5], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
+defm : SKLWriteResPair<WriteFBlend, [SKLPort015], 1, [1], 1, 6>; // Floating point vector blends.
+defm : SKLWriteResPair<WriteFBlendY, [SKLPort015], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFBlendZ>;
+defm : SKLWriteResPair<WriteFVarBlend, [SKLPort015], 2, [2], 2, 6>; // Fp vector variable blends.
+defm : SKLWriteResPair<WriteFVarBlendY,[SKLPort015], 2, [2], 2, 7>;
+defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
// FMA Scheduling helper class.
// class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
// Vector integer operations.
-defm : SKLWriteResPair<WriteVecALU, SKLPort15, 1>; // Vector integer ALU op, no logicals.
-defm : SKLWriteResPair<WriteVecShift, SKLPort0, 1>; // Vector integer shifts.
-defm : SKLWriteResPair<WriteVecIMul, SKLPort0, 5>; // Vector integer multiply.
-defm : SKLWriteResPair<WriteShuffle, SKLPort5, 1>; // Vector shuffles.
-defm : SKLWriteResPair<WriteBlend, SKLPort15, 1>; // Vector blends.
-
-def : WriteRes<WriteVarBlend, [SKLPort5]> { // Vector variable blends.
+defm : X86WriteRes<WriteVecLoad, [SKLPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteVecLoadX, [SKLPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecLoadY, [SKLPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNT, [SKLPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNTY, [SKLPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteVecMaskedLoad, [SKLPort23,SKLPort015], 7, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedLoadY, [SKLPort23,SKLPort015], 8, [1,1], 2>;
+defm : X86WriteRes<WriteVecStore, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreX, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreY, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreNT, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreNTY, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStoreY, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMove, [SKLPort05], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveX, [SKLPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveY, [SKLPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveToGpr, [SKLPort0], 2, [1], 1>;
+defm : X86WriteRes<WriteVecMoveFromGpr, [SKLPort5], 1, [1], 1>;
+
+defm : SKLWriteResPair<WriteVecALU, [SKLPort05], 1, [1], 1, 5>; // Vector integer ALU op, no logicals.
+defm : SKLWriteResPair<WriteVecALUX, [SKLPort01], 1, [1], 1, 6>;
+defm : SKLWriteResPair<WriteVecALUY, [SKLPort01], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteVecALUZ>;
+defm : SKLWriteResPair<WriteVecLogic, [SKLPort05], 1, [1], 1, 5>; // Vector integer and/or/xor.
+defm : SKLWriteResPair<WriteVecLogicX,[SKLPort015], 1, [1], 1, 6>;
+defm : SKLWriteResPair<WriteVecLogicY,[SKLPort015], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
+defm : SKLWriteResPair<WriteVecTest, [SKLPort0,SKLPort5], 3, [1,1], 2, 6>; // Vector integer TEST instructions.
+defm : SKLWriteResPair<WriteVecTestY, [SKLPort0,SKLPort5], 3, [1,1], 2, 7>;
+defm : X86WriteResPairUnsupported<WriteVecTestZ>;
+defm : SKLWriteResPair<WriteVecIMul, [SKLPort0] , 4, [1], 1, 5>; // Vector integer multiply.
+defm : SKLWriteResPair<WriteVecIMulX, [SKLPort01], 4, [1], 1, 6>;
+defm : SKLWriteResPair<WriteVecIMulY, [SKLPort01], 4, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
+defm : SKLWriteResPair<WritePMULLD, [SKLPort01], 10, [2], 2, 6>; // Vector PMULLD.
+defm : SKLWriteResPair<WritePMULLDY, [SKLPort01], 10, [2], 2, 7>;
+defm : X86WriteResPairUnsupported<WritePMULLDZ>;
+defm : SKLWriteResPair<WriteShuffle, [SKLPort5], 1, [1], 1, 5>; // Vector shuffles.
+defm : SKLWriteResPair<WriteShuffleX, [SKLPort5], 1, [1], 1, 6>;
+defm : SKLWriteResPair<WriteShuffleY, [SKLPort5], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteShuffleZ>;
+defm : SKLWriteResPair<WriteVarShuffle, [SKLPort5], 1, [1], 1, 5>; // Vector variable shuffles.
+defm : SKLWriteResPair<WriteVarShuffleX, [SKLPort5], 1, [1], 1, 6>;
+defm : SKLWriteResPair<WriteVarShuffleY, [SKLPort5], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
+defm : SKLWriteResPair<WriteBlend, [SKLPort5], 1, [1], 1, 6>; // Vector blends.
+defm : SKLWriteResPair<WriteBlendY, [SKLPort5], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteBlendZ>;
+defm : SKLWriteResPair<WriteVarBlend, [SKLPort015], 2, [2], 2, 6>; // Vector variable blends.
+defm : SKLWriteResPair<WriteVarBlendY, [SKLPort015], 2, [2], 2, 6>;
+defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
+defm : SKLWriteResPair<WriteMPSAD, [SKLPort5], 4, [2], 2, 6>; // Vector MPSAD.
+defm : SKLWriteResPair<WriteMPSADY, [SKLPort5], 4, [2], 2, 7>;
+defm : X86WriteResPairUnsupported<WriteMPSADZ>;
+defm : SKLWriteResPair<WritePSADBW, [SKLPort5], 3, [1], 1, 5>; // Vector PSADBW.
+defm : SKLWriteResPair<WritePSADBWX, [SKLPort5], 3, [1], 1, 6>;
+defm : SKLWriteResPair<WritePSADBWY, [SKLPort5], 3, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WritePSADBWZ>;
+defm : SKLWriteResPair<WritePHMINPOS, [SKLPort01], 4, [1], 1, 6>; // Vector PHMINPOS.
+
+// Vector integer shifts.
+defm : SKLWriteResPair<WriteVecShift, [SKLPort0], 1, [1], 1, 5>;
+defm : X86WriteRes<WriteVecShiftX, [SKLPort5,SKLPort01], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftY, [SKLPort5,SKLPort01], 4, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftXLd, [SKLPort01,SKLPort23], 7, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftYLd, [SKLPort01,SKLPort23], 8, [1,1], 2>;
+defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
+
+defm : SKLWriteResPair<WriteVecShiftImm, [SKLPort0], 1, [1], 1, 5>; // Vector integer immediate shifts.
+defm : SKLWriteResPair<WriteVecShiftImmX, [SKLPort01], 1, [1], 1, 6>;
+defm : SKLWriteResPair<WriteVecShiftImmY, [SKLPort01], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
+defm : SKLWriteResPair<WriteVarVecShift, [SKLPort01], 1, [1], 1, 6>; // Variable vector shifts.
+defm : SKLWriteResPair<WriteVarVecShiftY, [SKLPort01], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
+
+// Vector insert/extract operations.
+def : WriteRes<WriteVecInsert, [SKLPort5]> {
let Latency = 2;
+ let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def : WriteRes<WriteVarBlendLd, [SKLPort5, SKLPort23]> {
+def : WriteRes<WriteVecInsertLd, [SKLPort5,SKLPort23]> {
let Latency = 6;
- let ResourceCycles = [2, 1];
+ let NumMicroOps = 2;
}
+def: InstRW<[WriteVecInsertLd], (instregex "(V?)MOV(H|L)(PD|PS)rm")>;
-def : WriteRes<WriteMPSAD, [SKLPort0, SKLPort5]> { // Vector MPSAD.
- let Latency = 6;
- let ResourceCycles = [1, 2];
+def : WriteRes<WriteVecExtract, [SKLPort0,SKLPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
}
-def : WriteRes<WriteMPSADLd, [SKLPort23, SKLPort0, SKLPort5]> {
- let Latency = 6;
- let ResourceCycles = [1, 1, 2];
+def : WriteRes<WriteVecExtractSt, [SKLPort4,SKLPort5,SKLPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
}
-// Vector bitwise operations.
-// These are often used on both floating point and integer vectors.
-defm : SKLWriteResPair<WriteVecLogic, SKLPort015, 1>; // Vector and/or/xor.
-
// Conversion between integer and float.
-defm : SKLWriteResPair<WriteCvtF2I, SKLPort1, 3>; // Float -> Integer.
-defm : SKLWriteResPair<WriteCvtI2F, SKLPort1, 4>; // Integer -> Float.
-defm : SKLWriteResPair<WriteCvtF2F, SKLPort1, 3>; // Float -> Float size conversion.
+defm : SKLWriteResPair<WriteCvtSS2I, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteCvtPS2I, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteCvtPS2IY, [SKLPort1], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
+defm : SKLWriteResPair<WriteCvtSD2I, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteCvtPD2I, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteCvtPD2IY, [SKLPort1], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
+
+defm : SKLWriteResPair<WriteCvtI2SS, [SKLPort1], 4>;
+defm : SKLWriteResPair<WriteCvtI2PS, [SKLPort1], 4>;
+defm : SKLWriteResPair<WriteCvtI2PSY, [SKLPort1], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
+defm : SKLWriteResPair<WriteCvtI2SD, [SKLPort1], 4>;
+defm : SKLWriteResPair<WriteCvtI2PD, [SKLPort1], 4>;
+defm : SKLWriteResPair<WriteCvtI2PDY, [SKLPort1], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
+
+defm : SKLWriteResPair<WriteCvtSS2SD, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteCvtPS2PD, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteCvtPS2PDY, [SKLPort1], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
+defm : SKLWriteResPair<WriteCvtSD2SS, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteCvtPD2PS, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteCvtPD2PSY, [SKLPort1], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
+
+defm : X86WriteRes<WriteCvtPH2PS, [SKLPort5,SKLPort015], 5, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSY, [SKLPort5,SKLPort01], 7, [1,1], 2>;
+defm : X86WriteResUnsupported<WriteCvtPH2PSZ>;
+defm : X86WriteRes<WriteCvtPH2PSLd, [SKLPort23,SKLPort01], 9, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSYLd, [SKLPort23,SKLPort01], 10, [1,1], 2>;
+defm : X86WriteResUnsupported<WriteCvtPH2PSZLd>;
+
+defm : X86WriteRes<WriteCvtPS2PH, [SKLPort5,SKLPort015], 5, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PHY, [SKLPort5,SKLPort01], 7, [1,1], 2>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
+defm : X86WriteRes<WriteCvtPS2PHSt, [SKLPort4,SKLPort5,SKLPort237,SKLPort01], 6, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteCvtPS2PHYSt, [SKLPort4,SKLPort5,SKLPort237,SKLPort01], 8, [1,1,1,1], 4>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
// Strings instructions.
+
// Packed Compare Implicit Length Strings, Return Mask
-// String instructions.
def : WriteRes<WritePCmpIStrM, [SKLPort0]> {
let Latency = 10;
+ let NumMicroOps = 3;
let ResourceCycles = [3];
}
def : WriteRes<WritePCmpIStrMLd, [SKLPort0, SKLPort23]> {
- let Latency = 10;
- let ResourceCycles = [3, 1];
-}
+ let Latency = 16;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
+}
+
// Packed Compare Explicit Length Strings, Return Mask
-def : WriteRes<WritePCmpEStrM, [SKLPort0, SKLPort16, SKLPort5]> {
- let Latency = 10;
- let ResourceCycles = [3, 2, 4];
+def : WriteRes<WritePCmpEStrM, [SKLPort0, SKLPort5, SKLPort015, SKLPort0156]> {
+ let Latency = 19;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
}
-def : WriteRes<WritePCmpEStrMLd, [SKLPort05, SKLPort16, SKLPort23]> {
- let Latency = 10;
- let ResourceCycles = [6, 2, 1];
-}
- // Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrMLd, [SKLPort0, SKLPort5,SKLPort23, SKLPort015, SKLPort0156]> {
+ let Latency = 25;
+ let NumMicroOps = 10;
+ let ResourceCycles = [4,3,1,1,1];
+}
+
+// Packed Compare Implicit Length Strings, Return Index
def : WriteRes<WritePCmpIStrI, [SKLPort0]> {
- let Latency = 11;
+ let Latency = 10;
+ let NumMicroOps = 3;
let ResourceCycles = [3];
}
def : WriteRes<WritePCmpIStrILd, [SKLPort0, SKLPort23]> {
- let Latency = 11;
- let ResourceCycles = [3, 1];
-}
+ let Latency = 16;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
+}
+
// Packed Compare Explicit Length Strings, Return Index
-def : WriteRes<WritePCmpEStrI, [SKLPort05, SKLPort16]> {
- let Latency = 11;
- let ResourceCycles = [6, 2];
+def : WriteRes<WritePCmpEStrI, [SKLPort0, SKLPort5, SKLPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 8;
+ let ResourceCycles = [4,3,1];
}
-def : WriteRes<WritePCmpEStrILd, [SKLPort0, SKLPort16, SKLPort5, SKLPort23]> {
- let Latency = 11;
- let ResourceCycles = [3, 2, 2, 1];
+def : WriteRes<WritePCmpEStrILd, [SKLPort0, SKLPort5, SKLPort23, SKLPort0156]> {
+ let Latency = 24;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
}
+// MOVMSK Instructions.
+def : WriteRes<WriteFMOVMSK, [SKLPort0]> { let Latency = 2; }
+def : WriteRes<WriteVecMOVMSK, [SKLPort0]> { let Latency = 2; }
+def : WriteRes<WriteVecMOVMSKY, [SKLPort0]> { let Latency = 2; }
+def : WriteRes<WriteMMXMOVMSK, [SKLPort0]> { let Latency = 2; }
+
// AES instructions.
-def : WriteRes<WriteAESDecEnc, [SKLPort5]> { // Decryption, encryption.
- let Latency = 7;
+def : WriteRes<WriteAESDecEnc, [SKLPort0]> { // Decryption, encryption.
+ let Latency = 4;
+ let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def : WriteRes<WriteAESDecEncLd, [SKLPort5, SKLPort23]> {
- let Latency = 7;
- let ResourceCycles = [1, 1];
+def : WriteRes<WriteAESDecEncLd, [SKLPort0, SKLPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
}
-def : WriteRes<WriteAESIMC, [SKLPort5]> { // InvMixColumn.
- let Latency = 14;
+
+def : WriteRes<WriteAESIMC, [SKLPort0]> { // InvMixColumn.
+ let Latency = 8;
+ let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def : WriteRes<WriteAESIMCLd, [SKLPort5, SKLPort23]> {
+def : WriteRes<WriteAESIMCLd, [SKLPort0, SKLPort23]> {
let Latency = 14;
- let ResourceCycles = [2, 1];
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
}
-def : WriteRes<WriteAESKeyGen, [SKLPort0, SKLPort5]> { // Key Generation.
- let Latency = 10;
- let ResourceCycles = [2, 8];
+
+def : WriteRes<WriteAESKeyGen, [SKLPort0, SKLPort5, SKLPort015]> { // Key Generation.
+ let Latency = 20;
+ let NumMicroOps = 11;
+ let ResourceCycles = [3,6,2];
}
-def : WriteRes<WriteAESKeyGenLd, [SKLPort0, SKLPort5, SKLPort23]> {
- let Latency = 10;
- let ResourceCycles = [2, 7, 1];
+def : WriteRes<WriteAESKeyGenLd, [SKLPort0, SKLPort5, SKLPort23, SKLPort015]> {
+ let Latency = 25;
+ let NumMicroOps = 11;
+ let ResourceCycles = [3,6,1,1];
}
// Carry-less multiplication instructions.
-def : WriteRes<WriteCLMul, [SKLPort0, SKLPort5]> {
- let Latency = 7;
- let ResourceCycles = [2, 1];
+def : WriteRes<WriteCLMul, [SKLPort5]> {
+ let Latency = 6;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
}
-def : WriteRes<WriteCLMulLd, [SKLPort0, SKLPort5, SKLPort23]> {
- let Latency = 7;
- let ResourceCycles = [2, 1, 1];
+def : WriteRes<WriteCLMulLd, [SKLPort5, SKLPort23]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
}
// Catch-all for expensive system instructions.
def : WriteRes<WriteSystem, [SKLPort0156]> { let Latency = 100; } // def WriteSystem : SchedWrite;
// AVX2.
-defm : SKLWriteResPair<WriteFShuffle256, SKLPort5, 3>; // Fp 256-bit width vector shuffles.
-defm : SKLWriteResPair<WriteShuffle256, SKLPort5, 3>; // 256-bit width vector shuffles.
-def : WriteRes<WriteVarVecShift, [SKLPort0, SKLPort5]> { // Variable vector shifts.
- let Latency = 2;
- let ResourceCycles = [2, 1];
-}
-def : WriteRes<WriteVarVecShiftLd, [SKLPort0, SKLPort5, SKLPort23]> {
- let Latency = 6;
- let ResourceCycles = [2, 1, 1];
-}
+defm : SKLWriteResPair<WriteFShuffle256, [SKLPort5], 3, [1], 1, 7>; // Fp 256-bit width vector shuffles.
+defm : SKLWriteResPair<WriteFVarShuffle256, [SKLPort5], 3, [1], 1, 7>; // Fp 256-bit width vector variable shuffles.
+defm : SKLWriteResPair<WriteShuffle256, [SKLPort5], 3, [1], 1, 7>; // 256-bit width vector shuffles.
+defm : SKLWriteResPair<WriteVarShuffle256, [SKLPort5], 3, [1], 1, 7>; // 256-bit width vector variable shuffles.
// Old microcoded instructions that nobody uses.
def : WriteRes<WriteMicrocoded, [SKLPort0156]> { let Latency = 100; } // def WriteMicrocoded : SchedWrite;
@@ -277,33 +540,22 @@ def : WriteRes<WriteMicrocoded, [SKLPort0156]> { let Latency = 100; } // def Wri
// Fence instructions.
def : WriteRes<WriteFence, [SKLPort23, SKLPort4]>;
+// Load/store MXCSR.
+def : WriteRes<WriteLDMXCSR, [SKLPort0,SKLPort23,SKLPort0156]> { let Latency = 7; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
+def : WriteRes<WriteSTMXCSR, [SKLPort4,SKLPort5,SKLPort237]> { let Latency = 2; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
+
// Nop, not very useful except that it provides a model for nops!
def : WriteRes<WriteNop, []>;
////////////////////////////////////////////////////////////////////////////////
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////
-// HADD, HSUB PS/PD
-// x,x / v,v,v.
-def : WriteRes<WriteFHAdd, [SKLPort1]> {
- let Latency = 3;
-}
-
-// x,m / v,v,m.
-def : WriteRes<WriteFHAddLd, [SKLPort1, SKLPort23]> {
- let Latency = 7;
- let ResourceCycles = [1, 1];
-}
-// PHADD|PHSUB (S) W/D.
-// v <- v,v.
-def : WriteRes<WritePHAdd, [SKLPort15]>;
-
-// v <- v,m.
-def : WriteRes<WritePHAddLd, [SKLPort15, SKLPort23]> {
- let Latency = 5;
- let ResourceCycles = [1, 1];
-}
+defm : SKLWriteResPair<WriteFHAdd, [SKLPort5,SKLPort01], 6, [2,1], 3, 6>;
+defm : SKLWriteResPair<WriteFHAddY, [SKLPort5,SKLPort01], 6, [2,1], 3, 7>;
+defm : SKLWriteResPair<WritePHAdd, [SKLPort5,SKLPort05], 3, [2,1], 3, 5>;
+defm : SKLWriteResPair<WritePHAddX, [SKLPort5,SKLPort015], 3, [2,1], 3, 6>;
+defm : SKLWriteResPair<WritePHAddY, [SKLPort5,SKLPort015], 3, [2,1], 3, 7>;
// Remaining instrs.
@@ -312,210 +564,23 @@ def SKLWriteResGroup1 : SchedWriteRes<[SKLPort0]> {
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PADDSBirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PADDSWirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PADDUSBirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PADDUSWirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PAVGBirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PAVGWirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PCMPEQBirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PCMPEQDirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PCMPEQWirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PCMPGTBirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PCMPGTDirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PCMPGTWirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PMAXSWirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PMAXUBirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PMINSWirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PMINUBirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSLLDri")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSLLDrr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSLLQri")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSLLQrr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSLLWri")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSLLWrr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRADri")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRADrr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRAWri")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRAWrr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRLDri")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRLDrr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRLQri")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRLQrr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRLWri")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRLWrr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSUBSBirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSUBSWirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSUBUSBirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSUBUSWirr")>;
-
-def SKLWriteResGroup2 : SchedWriteRes<[SKLPort1]> {
- let Latency = 1;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKLWriteResGroup2], (instregex "MMX_MASKMOVQ64")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PADDS(B|W)irr",
+ "MMX_PADDUS(B|W)irr",
+ "MMX_PAVG(B|W)irr",
+ "MMX_PCMPEQ(B|D|W)irr",
+ "MMX_PCMPGT(B|D|W)irr",
+ "MMX_P(MAX|MIN)SWirr",
+ "MMX_P(MAX|MIN)UBirr",
+ "MMX_PSUBS(B|W)irr",
+ "MMX_PSUBUS(B|W)irr")>;
def SKLWriteResGroup3 : SchedWriteRes<[SKLPort5]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup3], (instregex "COMP_FST0r")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "COM_FST0r")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "INSERTPSrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MMX_MOVD64rr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MMX_MOVD64to64rr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PALIGNR64irr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PSHUFBrr64")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PSHUFWri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PUNPCKHBWirr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PUNPCKHDQirr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PUNPCKHWDirr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PUNPCKLBWirr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PUNPCKLDQirr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PUNPCKLWDirr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MOV64toPQIrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MOVDDUPrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MOVDI2PDIrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MOVHLPSrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MOVLHPSrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MOVSDrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MOVSHDUPrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MOVSLDUPrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MOVUPDrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MOVUPSrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PACKSSDWrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PACKSSWBrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PACKUSDWrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PACKUSWBrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PALIGNRrri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PBLENDWrri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PMOVSXBDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PMOVSXBQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PMOVSXBWrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PMOVSXDQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PMOVSXWDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PMOVSXWQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PMOVZXBDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PMOVZXBQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PMOVZXBWrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PMOVZXDQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PMOVZXWDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PMOVZXWQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PSHUFBrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PSHUFDri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PSHUFHWri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PSHUFLWri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PSLLDQri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PSRLDQri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKHBWrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKHDQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKHQDQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKHWDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKLBWrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKLDQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKLQDQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKLWDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "SHUFPDrri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "SHUFPSrri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "UCOM_FPr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "UCOM_Fr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "UNPCKHPDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "UNPCKHPSrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "UNPCKLPDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "UNPCKLPSrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VBROADCASTSSrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VINSERTPSrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOV64toPQIrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVDDUPYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVDDUPrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVDI2PDIrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVHLPSrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVLHPSrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVSDrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVSHDUPYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVSHDUPrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVSLDUPYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVSLDUPrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVUPDYrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVUPDrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVUPSYrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVUPSrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPACKSSDWYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPACKSSDWrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPACKSSWBYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPACKSSWBrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPACKUSDWYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPACKUSDWrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPACKUSWBYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPACKUSWBrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPALIGNRYrri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPALIGNRrri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPBLENDWYrri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPBLENDWrri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPBROADCASTDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPBROADCASTQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPDYri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPDYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPDri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPSYri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPSYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPSri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPSrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVSXBDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVSXBQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVSXBWrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVSXDQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVSXWDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVSXWQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVZXBDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVZXBQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVZXBWrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVZXDQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVZXWDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVZXWQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFBYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFBrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFDYri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFDri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFHWYri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFHWri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFLWYri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFLWri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPSLLDQYri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPSLLDQri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPSRLDQYri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPSRLDQri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHBWYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHBWrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHDQYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHDQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHQDQYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHQDQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHWDYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHWDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLBWYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLBWrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLDQYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLDQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLQDQYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLQDQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLWDYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLWDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VSHUFPDYrri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VSHUFPDrri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VSHUFPSYrri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VSHUFPSrri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKHPDYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKHPDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKHPSYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKHPSrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKLPDYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKLPDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKLPSYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKLPSrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "COM(P?)_FST0r",
+ "UCOM_F(P?)r")>;
def SKLWriteResGroup4 : SchedWriteRes<[SKLPort6]> {
let Latency = 1;
@@ -524,557 +589,68 @@ def SKLWriteResGroup4 : SchedWriteRes<[SKLPort6]> {
}
def: InstRW<[SKLWriteResGroup4], (instregex "JMP(16|32|64)r")>;
-def SKLWriteResGroup5 : SchedWriteRes<[SKLPort01]> {
- let Latency = 1;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKLWriteResGroup5], (instregex "PABSBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PABSDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PABSWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PADDSBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PADDSWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PADDUSBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PADDUSWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PAVGBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PAVGWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PCMPEQBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PCMPEQDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PCMPEQQrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PCMPEQWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PCMPGTBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PCMPGTDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PCMPGTWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PMAXSBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PMAXSDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PMAXSWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PMAXUBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PMAXUDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PMAXUWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PMINSBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PMINSDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PMINSWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PMINUBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PMINUDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PMINUWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PSIGNBrr128")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PSIGNDrr128")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PSIGNWrr128")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PSLLDri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PSLLQri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PSLLWri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PSRADri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PSRAWri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PSRLDri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PSRLQri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PSRLWri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PSUBSBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PSUBSWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PSUBUSBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PSUBUSWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPABSBYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPABSBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPABSDYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPABSDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPABSWYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPABSWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPADDSBYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPADDSBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPADDSWYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPADDSWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPADDUSBYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPADDUSBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPADDUSWYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPADDUSWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPAVGBYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPAVGBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPAVGWYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPAVGWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQBYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQDYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQQYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQQrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQWYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPGTBYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPGTBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPGTDYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPGTDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPGTWYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPGTWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXSBYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXSBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXSDYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXSDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXSWYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXSWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXUBYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXUBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXUDYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXUDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXUWYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXUWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMINSBYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMINSBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMINSDYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMINSDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMINSWYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMINSWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMINUBYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMINUBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMINUDYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMINUDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMINUWYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMINUWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSIGNBYrr256")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSIGNBrr128")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSIGNDYrr256")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSIGNDrr128")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSIGNWYrr256")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSIGNWrr128")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLDYri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLDri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLQYri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLQri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLVDYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLVDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLVQYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLVQrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLWYri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLWri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRADYri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRADri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRAVDYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRAVDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRAWYri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRAWri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLDYri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLDri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLQYri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLQri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLVDYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLVDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLVQYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLVQrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLWYri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLWri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSUBSBYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSUBSBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSUBSWYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSUBSWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSUBUSBYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSUBUSBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSUBUSWYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSUBUSWrr")>;
-
def SKLWriteResGroup6 : SchedWriteRes<[SKLPort05]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup6], (instregex "FINCSTP")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "FNOP")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_MOVQ64rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PABSBrr64")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PABSDrr64")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PABSWrr64")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PADDBirr")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PADDDirr")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PADDQirr")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PADDWirr")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PANDNirr")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PANDirr")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PORirr")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PSIGNBrr64")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PSIGNDrr64")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PSIGNWrr64")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PSUBBirr")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PSUBDirr")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PSUBQirr")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PSUBWirr")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PXORirr")>;
+def: InstRW<[SKLWriteResGroup6], (instrs FINCSTP, FNOP)>;
def SKLWriteResGroup7 : SchedWriteRes<[SKLPort06]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup7], (instregex "ADC(16|32|64)ri")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "ADC(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "ADC8rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "ADCX(32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "ADOX(32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "BT(16|32|64)ri8")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "BT(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "BTC(16|32|64)ri8")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "BTC(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "BTR(16|32|64)ri8")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "BTR(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "BTS(16|32|64)ri8")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "BTS(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CDQ")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CLAC")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CMOVAE(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CMOVB(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CMOVE(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CMOVG(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CMOVGE(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CMOVL(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CMOVLE(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CMOVNE(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CMOVNO(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CMOVNP(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CMOVNS(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CMOVO(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CMOVP(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CMOVS(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CQO")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JAE_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JAE_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JA_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JA_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JBE_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JBE_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JB_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JB_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JE_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JE_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JGE_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JGE_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JG_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JG_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JLE_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JLE_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JL_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JL_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JMP_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JMP_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JNE_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JNE_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JNO_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JNO_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JNP_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JNP_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JNS_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JNS_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JO_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JO_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JP_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JP_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JS_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JS_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "RORX(32|64)ri")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SAR(16|32|64)r1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SAR(16|32|64)ri")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SAR8r1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SAR8ri")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SARX(32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SBB(16|32|64)ri")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SBB(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SBB8rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SETAEr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SETBr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SETEr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SETGEr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SETGr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SETLEr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SETLr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SETNEr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SETNOr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SETNPr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SETNSr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SETOr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SETPr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SETSr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SHL(16|32|64)r1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SHL(16|32|64)ri")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SHL8r1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SHL8ri")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SHLX(32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SHR(16|32|64)r1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SHR(16|32|64)ri")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SHR8r1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SHR8ri")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SHRX(32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "STAC")>;
+def: InstRW<[SKLWriteResGroup7], (instrs CDQ, CQO, CLAC, STAC)>;
+def: InstRW<[SKLWriteResGroup7], (instregex "BT(16|32|64)ri8",
+ "BT(16|32|64)rr",
+ "BTC(16|32|64)ri8",
+ "BTC(16|32|64)rr",
+ "BTR(16|32|64)ri8",
+ "BTR(16|32|64)rr",
+ "BTS(16|32|64)ri8",
+ "BTS(16|32|64)rr")>;
def SKLWriteResGroup8 : SchedWriteRes<[SKLPort15]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup8], (instregex "ANDN(32|64)rr")>;
-def: InstRW<[SKLWriteResGroup8], (instregex "BLSI(32|64)rr")>;
-def: InstRW<[SKLWriteResGroup8], (instregex "BLSMSK(32|64)rr")>;
-def: InstRW<[SKLWriteResGroup8], (instregex "BLSR(32|64)rr")>;
-def: InstRW<[SKLWriteResGroup8], (instregex "BZHI(32|64)rr")>;
-def: InstRW<[SKLWriteResGroup8], (instregex "LEA(16|32|64)(_32)?r")>;
+def: InstRW<[SKLWriteResGroup8], (instregex "ANDN(32|64)rr",
+ "BLSI(32|64)rr",
+ "BLSMSK(32|64)rr",
+ "BLSR(32|64)rr")>;
def SKLWriteResGroup9 : SchedWriteRes<[SKLPort015]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup9], (instregex "ANDNPDrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "ANDNPSrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "ANDPDrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "ANDPSrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "BLENDPDrri")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "BLENDPSrri")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "MMX_MOVD64from64rr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "MOVAPDrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "MOVAPSrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "MOVDQArr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "MOVDQUrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "MOVPQI2QIrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "MOVSSrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "ORPDrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "ORPSrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "PADDBrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "PADDDrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "PADDQrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "PADDWrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "PANDNrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "PANDrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "PORrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "PSUBBrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "PSUBDrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "PSUBQrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "PSUBWrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "PXORrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VANDNPDYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VANDNPDrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VANDNPSYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VANDNPSrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VANDPDYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VANDPDrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VANDPSYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VANDPSrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VBLENDPDYrri")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VBLENDPDrri")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VBLENDPSYrri")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VBLENDPSrri")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VMOVAPDYrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VMOVAPDrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VMOVAPSYrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VMOVAPSrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VMOVDQAYrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VMOVDQArr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VMOVDQUYrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VMOVDQUrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VMOVPQI2QIrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VMOVSSrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VMOVZPQILo2PQIrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VORPDYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VORPDrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VORPSYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VORPSrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPADDBYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPADDBrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPADDDYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPADDDrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPADDQYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPADDQrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPADDWYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPADDWrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPANDNYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPANDNrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPANDYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPANDrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPBLENDDYrri")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPBLENDDrri")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPORYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPORrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBBYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBBrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBDYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBDrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBQYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBQrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBWYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBWrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPXORYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPXORrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VXORPDYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VXORPDrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VXORPSYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VXORPSrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "XORPDrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "XORPSrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "(V?)PADD(B|D|Q|W)(Y?)rr",
+ "VPBLENDD(Y?)rri",
+ "(V?)PSUB(B|D|Q|W)(Y?)rr")>;
def SKLWriteResGroup10 : SchedWriteRes<[SKLPort0156]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup10], (instregex "ADD(16|32|64)ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "ADD(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "ADD8i8")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "ADD8ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "ADD8rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "AND(16|32|64)ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "AND(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "AND8i8")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "AND8ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "AND8rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "CBW")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "CLC")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "CMC")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "CMP(16|32|64)ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "CMP(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "CMP8i8")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "CMP8ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "CMP8rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "CWDE")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "DEC(16|32|64)r")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "DEC8r")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "INC(16|32|64)r")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "INC8r")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "LAHF")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "MOV(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "MOV8ri(_alt)?")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "MOV8rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "MOVSX(16|32|64)rr16")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "MOVSX(16|32|64)rr32")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "MOVSX(16|32|64)rr8")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "MOVZX(16|32|64)rr16")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "MOVZX(16|32|64)rr8")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "NEG(16|32|64)r")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "NEG8r")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "NOOP")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "NOT(16|32|64)r")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "NOT8r")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "OR(16|32|64)ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "OR(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "OR8i8")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "OR8ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "OR8rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "SAHF")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "SGDT64m")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "SIDT64m")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "SLDT64m")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "SMSW16m")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "STC")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "STRm")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "SUB(16|32|64)ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "SUB(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "SUB8i8")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "SUB8ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "SUB8rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "SYSCALL")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "TEST(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "TEST8i8")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "TEST8ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "TEST8rr")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "XCHG(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "XOR(16|32|64)ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "XOR(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "XOR8i8")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "XOR8ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "XOR8rr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup10], (instrs CBW, CWDE, CDQE,
+ CMC, STC)>;
+def: InstRW<[SKLWriteResGroup10], (instregex "SGDT64m",
+ "SIDT64m",
+ "SMSW16m",
+ "STRm",
+ "SYSCALL")>;
def SKLWriteResGroup11 : SchedWriteRes<[SKLPort4,SKLPort237]> {
let Latency = 1;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup11], (instregex "FBSTPm")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MMX_MOVD64from64rm")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MMX_MOVD64mr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MMX_MOVNTQmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MMX_MOVQ64mr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOV(16|32|64)mr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOV8mi")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOV8mr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVAPDmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVAPSmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVDQAmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVDQUmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVHPDmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVHPSmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVLPDmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVLPSmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVNTDQmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVNTI_64mr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVNTImr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVNTPDmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVNTPSmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVPDI2DImr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVPQI2QImr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVPQIto64mr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVSDmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVSSmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVUPDmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVUPSmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "ST_FP32m")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "ST_FP64m")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "ST_FP80m")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VEXTRACTF128mr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VEXTRACTI128mr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVAPDYmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVAPDmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVAPSYmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVAPSmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVDQAYmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVDQAmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVDQUYmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVDQUmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVHPDmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVHPSmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVLPDmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVLPSmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVNTDQYmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVNTDQmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVNTPDYmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVNTPDmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVNTPSYmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVNTPSmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVPDI2DImr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVPQI2QImr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVPQIto64mr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVSDmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVSSmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVUPDYmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVUPDmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVUPSYmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVUPSmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMPTRSTm")>;
-
-def SKLWriteResGroup12 : SchedWriteRes<[SKLPort0]> {
- let Latency = 2;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKLWriteResGroup12], (instregex "COMISDrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "COMISSrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "MMX_MOVD64from64rr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "MMX_MOVD64grr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PMOVMSKBrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "MOVMSKPDrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "MOVMSKPSrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "MOVPDI2DIrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "MOVPQIto64rr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "PMOVMSKBrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "UCOMISDrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "UCOMISSrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VCOMISDrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VCOMISSrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VMOVMSKPDYrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VMOVMSKPDrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VMOVMSKPSYrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VMOVMSKPSrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VMOVPDI2DIrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VMOVPQIto64rr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VPMOVMSKBYrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VPMOVMSKBrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VTESTPDYrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VTESTPDrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VTESTPSYrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VTESTPSrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VUCOMISDrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VUCOMISSrr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "FBSTPm",
+ "ST_FP(32|64|80)m",
+ "VMPTRSTm")>;
def SKLWriteResGroup13 : SchedWriteRes<[SKLPort5]> {
let Latency = 2;
@@ -1082,22 +658,13 @@ def SKLWriteResGroup13 : SchedWriteRes<[SKLPort5]> {
let ResourceCycles = [2];
}
def: InstRW<[SKLWriteResGroup13], (instregex "MMX_MOVQ2DQrr")>;
-def: InstRW<[SKLWriteResGroup13], (instregex "MMX_PINSRWirri")>;
-def: InstRW<[SKLWriteResGroup13], (instregex "PINSRBrr")>;
-def: InstRW<[SKLWriteResGroup13], (instregex "PINSRDrr")>;
-def: InstRW<[SKLWriteResGroup13], (instregex "PINSRQrr")>;
-def: InstRW<[SKLWriteResGroup13], (instregex "PINSRWrri")>;
-def: InstRW<[SKLWriteResGroup13], (instregex "VPINSRBrr")>;
-def: InstRW<[SKLWriteResGroup13], (instregex "VPINSRDrr")>;
-def: InstRW<[SKLWriteResGroup13], (instregex "VPINSRQrr")>;
-def: InstRW<[SKLWriteResGroup13], (instregex "VPINSRWrri")>;
def SKLWriteResGroup14 : SchedWriteRes<[SKLPort05]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def: InstRW<[SKLWriteResGroup14], (instregex "FDECSTP")>;
+def: InstRW<[SKLWriteResGroup14], (instrs FDECSTP)>;
def: InstRW<[SKLWriteResGroup14], (instregex "MMX_MOVDQ2Qrr")>;
def SKLWriteResGroup15 : SchedWriteRes<[SKLPort06]> {
@@ -1105,80 +672,20 @@ def SKLWriteResGroup15 : SchedWriteRes<[SKLPort06]> {
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def: InstRW<[SKLWriteResGroup15], (instregex "CMOVA(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup15], (instregex "CMOVBE(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup15], (instregex "ROL(16|32|64)r1")>;
-def: InstRW<[SKLWriteResGroup15], (instregex "ROL(16|32|64)ri")>;
-def: InstRW<[SKLWriteResGroup15], (instregex "ROL8r1")>;
-def: InstRW<[SKLWriteResGroup15], (instregex "ROL8ri")>;
-def: InstRW<[SKLWriteResGroup15], (instregex "ROR(16|32|64)r1")>;
-def: InstRW<[SKLWriteResGroup15], (instregex "ROR(16|32|64)ri")>;
-def: InstRW<[SKLWriteResGroup15], (instregex "ROR8r1")>;
-def: InstRW<[SKLWriteResGroup15], (instregex "ROR8ri")>;
-def: InstRW<[SKLWriteResGroup15], (instregex "SETAr")>;
-def: InstRW<[SKLWriteResGroup15], (instregex "SETBEr")>;
-
-def SKLWriteResGroup16 : SchedWriteRes<[SKLPort015]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[SKLWriteResGroup16], (instregex "BLENDVPDrr0")>;
-def: InstRW<[SKLWriteResGroup16], (instregex "BLENDVPSrr0")>;
-def: InstRW<[SKLWriteResGroup16], (instregex "PBLENDVBrr0")>;
-def: InstRW<[SKLWriteResGroup16], (instregex "VBLENDVPDYrr")>;
-def: InstRW<[SKLWriteResGroup16], (instregex "VBLENDVPDrr")>;
-def: InstRW<[SKLWriteResGroup16], (instregex "VBLENDVPSYrr")>;
-def: InstRW<[SKLWriteResGroup16], (instregex "VBLENDVPSrr")>;
-def: InstRW<[SKLWriteResGroup16], (instregex "VPBLENDVBYrr")>;
-def: InstRW<[SKLWriteResGroup16], (instregex "VPBLENDVBrr")>;
+def: InstRW<[SKLWriteResGroup15], (instregex "ROL(8|16|32|64)r1",
+ "ROL(8|16|32|64)ri",
+ "ROR(8|16|32|64)r1",
+ "ROR(8|16|32|64)ri",
+ "SET(A|BE)r")>;
def SKLWriteResGroup17 : SchedWriteRes<[SKLPort0156]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def: InstRW<[SKLWriteResGroup17], (instregex "LFENCE")>;
-def: InstRW<[SKLWriteResGroup17], (instregex "WAIT")>;
-def: InstRW<[SKLWriteResGroup17], (instregex "XGETBV")>;
-
-def SKLWriteResGroup18 : SchedWriteRes<[SKLPort0,SKLPort237]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup18], (instregex "MMX_MASKMOVQ64")>;
-def: InstRW<[SKLWriteResGroup18], (instregex "VMASKMOVDQU")>;
-def: InstRW<[SKLWriteResGroup18], (instregex "VMASKMOVPDYmr")>;
-def: InstRW<[SKLWriteResGroup18], (instregex "VMASKMOVPDmr")>;
-def: InstRW<[SKLWriteResGroup18], (instregex "VMASKMOVPSYmr")>;
-def: InstRW<[SKLWriteResGroup18], (instregex "VMASKMOVPSmr")>;
-def: InstRW<[SKLWriteResGroup18], (instregex "VPMASKMOVDYmr")>;
-def: InstRW<[SKLWriteResGroup18], (instregex "VPMASKMOVDmr")>;
-def: InstRW<[SKLWriteResGroup18], (instregex "VPMASKMOVQYmr")>;
-def: InstRW<[SKLWriteResGroup18], (instregex "VPMASKMOVQmr")>;
-
-def SKLWriteResGroup19 : SchedWriteRes<[SKLPort5,SKLPort01]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup19], (instregex "PSLLDrr")>;
-def: InstRW<[SKLWriteResGroup19], (instregex "PSLLQrr")>;
-def: InstRW<[SKLWriteResGroup19], (instregex "PSLLWrr")>;
-def: InstRW<[SKLWriteResGroup19], (instregex "PSRADrr")>;
-def: InstRW<[SKLWriteResGroup19], (instregex "PSRAWrr")>;
-def: InstRW<[SKLWriteResGroup19], (instregex "PSRLDrr")>;
-def: InstRW<[SKLWriteResGroup19], (instregex "PSRLQrr")>;
-def: InstRW<[SKLWriteResGroup19], (instregex "PSRLWrr")>;
-def: InstRW<[SKLWriteResGroup19], (instregex "VPSLLDrr")>;
-def: InstRW<[SKLWriteResGroup19], (instregex "VPSLLQrr")>;
-def: InstRW<[SKLWriteResGroup19], (instregex "VPSLLWrr")>;
-def: InstRW<[SKLWriteResGroup19], (instregex "VPSRADrr")>;
-def: InstRW<[SKLWriteResGroup19], (instregex "VPSRAWrr")>;
-def: InstRW<[SKLWriteResGroup19], (instregex "VPSRLDrr")>;
-def: InstRW<[SKLWriteResGroup19], (instregex "VPSRLQrr")>;
-def: InstRW<[SKLWriteResGroup19], (instregex "VPSRLWrr")>;
+def: InstRW<[SKLWriteResGroup17], (instrs LFENCE,
+ WAIT,
+ XGETBV)>;
def SKLWriteResGroup20 : SchedWriteRes<[SKLPort6,SKLPort0156]> {
let Latency = 2;
@@ -1192,72 +699,26 @@ def SKLWriteResGroup21 : SchedWriteRes<[SKLPort237,SKLPort0156]> {
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup21], (instregex "SFENCE")>;
-
-def SKLWriteResGroup22 : SchedWriteRes<[SKLPort06,SKLPort15]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup22], (instregex "BEXTR(32|64)rr")>;
-def: InstRW<[SKLWriteResGroup22], (instregex "BSWAP(16|32|64)r")>;
+def: InstRW<[SKLWriteResGroup21], (instrs SFENCE)>;
def SKLWriteResGroup23 : SchedWriteRes<[SKLPort06,SKLPort0156]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup23], (instregex "ADC8i8")>;
-def: InstRW<[SKLWriteResGroup23], (instregex "ADC8ri")>;
-def: InstRW<[SKLWriteResGroup23], (instregex "CWD")>;
-def: InstRW<[SKLWriteResGroup23], (instregex "JRCXZ")>;
-def: InstRW<[SKLWriteResGroup23], (instregex "SBB8i8")>;
-def: InstRW<[SKLWriteResGroup23], (instregex "SBB8ri")>;
-
-def SKLWriteResGroup24 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort237]> {
- let Latency = 2;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SKLWriteResGroup24], (instregex "EXTRACTPSmr")>;
-def: InstRW<[SKLWriteResGroup24], (instregex "PEXTRBmr")>;
-def: InstRW<[SKLWriteResGroup24], (instregex "PEXTRDmr")>;
-def: InstRW<[SKLWriteResGroup24], (instregex "PEXTRQmr")>;
-def: InstRW<[SKLWriteResGroup24], (instregex "PEXTRWmr")>;
-def: InstRW<[SKLWriteResGroup24], (instregex "STMXCSR")>;
-def: InstRW<[SKLWriteResGroup24], (instregex "VEXTRACTPSmr")>;
-def: InstRW<[SKLWriteResGroup24], (instregex "VPEXTRBmr")>;
-def: InstRW<[SKLWriteResGroup24], (instregex "VPEXTRDmr")>;
-def: InstRW<[SKLWriteResGroup24], (instregex "VPEXTRQmr")>;
-def: InstRW<[SKLWriteResGroup24], (instregex "VPEXTRWmr")>;
-def: InstRW<[SKLWriteResGroup24], (instregex "VSTMXCSR")>;
+def: InstRW<[SKLWriteResGroup23], (instrs CWD)>;
+def: InstRW<[SKLWriteResGroup23], (instrs JCXZ, JECXZ, JRCXZ)>;
+def: InstRW<[SKLWriteResGroup23], (instregex "ADC8i8",
+ "ADC8ri",
+ "SBB8i8",
+ "SBB8ri")>;
def SKLWriteResGroup25 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort237]> {
let Latency = 2;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup25], (instregex "FNSTCW16m")>;
-
-def SKLWriteResGroup26 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort06]> {
- let Latency = 2;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SKLWriteResGroup26], (instregex "SETAEm")>;
-def: InstRW<[SKLWriteResGroup26], (instregex "SETBm")>;
-def: InstRW<[SKLWriteResGroup26], (instregex "SETEm")>;
-def: InstRW<[SKLWriteResGroup26], (instregex "SETGEm")>;
-def: InstRW<[SKLWriteResGroup26], (instregex "SETGm")>;
-def: InstRW<[SKLWriteResGroup26], (instregex "SETLEm")>;
-def: InstRW<[SKLWriteResGroup26], (instregex "SETLm")>;
-def: InstRW<[SKLWriteResGroup26], (instregex "SETNEm")>;
-def: InstRW<[SKLWriteResGroup26], (instregex "SETNOm")>;
-def: InstRW<[SKLWriteResGroup26], (instregex "SETNPm")>;
-def: InstRW<[SKLWriteResGroup26], (instregex "SETNSm")>;
-def: InstRW<[SKLWriteResGroup26], (instregex "SETOm")>;
-def: InstRW<[SKLWriteResGroup26], (instregex "SETPm")>;
-def: InstRW<[SKLWriteResGroup26], (instregex "SETSm")>;
+def: InstRW<[SKLWriteResGroup25], (instrs FNSTCW16m)>;
def SKLWriteResGroup27 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort15]> {
let Latency = 2;
@@ -1271,206 +732,88 @@ def SKLWriteResGroup28 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort0156]> {
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup28], (instregex "PUSH(16|32|64)r(mr)?")>;
-def: InstRW<[SKLWriteResGroup28], (instregex "PUSH64i8")>;
-def: InstRW<[SKLWriteResGroup28], (instregex "STOSB")>;
-def: InstRW<[SKLWriteResGroup28], (instregex "STOSL")>;
-def: InstRW<[SKLWriteResGroup28], (instregex "STOSQ")>;
-def: InstRW<[SKLWriteResGroup28], (instregex "STOSW")>;
+def: InstRW<[SKLWriteResGroup28], (instrs PUSH16r, PUSH32r, PUSH64r,
+ STOSB, STOSL, STOSQ, STOSW)>;
+def: InstRW<[SKLWriteResGroup28], (instregex "PUSH(16|32|64)rmr",
+ "PUSH64i8")>;
def SKLWriteResGroup29 : SchedWriteRes<[SKLPort1]> {
let Latency = 3;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup29], (instregex "BSF(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup29], (instregex "BSR(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup29], (instregex "IMUL64rr(i8)?")>;
-def: InstRW<[SKLWriteResGroup29], (instregex "IMUL8r")>;
-def: InstRW<[SKLWriteResGroup29], (instregex "LZCNT(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup29], (instregex "MUL8r")>;
-def: InstRW<[SKLWriteResGroup29], (instregex "PDEP(32|64)rr")>;
-def: InstRW<[SKLWriteResGroup29], (instregex "PEXT(32|64)rr")>;
-def: InstRW<[SKLWriteResGroup29], (instregex "POPCNT(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup29], (instregex "SHLD(16|32|64)rri8")>;
-def: InstRW<[SKLWriteResGroup29], (instregex "SHRD(16|32|64)rri8")>;
-def: InstRW<[SKLWriteResGroup29], (instregex "TZCNT(16|32|64)rr")>;
-
-def SKLWriteResGroup29_16 : SchedWriteRes<[SKLPort1, SKLPort0156]> {
- let Latency = 3;
+def: InstRW<[SKLWriteResGroup29], (instregex "PDEP(32|64)rr",
+ "PEXT(32|64)rr",
+ "SHLD(16|32|64)rri8",
+ "SHRD(16|32|64)rri8")>;
+
+def SKLWriteResGroup29_16i : SchedWriteRes<[SKLPort1, SKLPort0156]> {
+ let Latency = 4;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup29_16], (instregex "IMUL16rr(i8)?")>;
-
-def SKLWriteResGroup29_32 : SchedWriteRes<[SKLPort1]> {
- let Latency = 3;
- let NumMicroOps = 1;
-}
-def: InstRW<[SKLWriteResGroup29_32], (instregex "IMUL32rr(i8)?")>;
+def: InstRW<[SKLWriteResGroup29_16i], (instrs IMUL16rri, IMUL16rri8)>;
def SKLWriteResGroup30 : SchedWriteRes<[SKLPort5]> {
let Latency = 3;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup30], (instregex "ADD_FPrST0")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "ADD_FST0r")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "ADD_FrST0")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "MMX_PSADBWirr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "PCMPGTQrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "PSADBWrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "SUBR_FPrST0")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "SUBR_FST0r")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "SUBR_FrST0")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "SUB_FPrST0")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "SUB_FST0r")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "SUB_FrST0")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VBROADCASTSDYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VBROADCASTSSYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VEXTRACTF128rr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VEXTRACTI128rr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VINSERTF128rr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VINSERTI128rr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPBROADCASTBYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPBROADCASTBrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPBROADCASTDYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPBROADCASTQYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPBROADCASTWYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPBROADCASTWrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPCMPGTQYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPCMPGTQrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPERM2F128rr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPERM2I128rr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPERMDYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPERMPDYri")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPERMPSYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPERMQYri")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVSXBDYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVSXBQYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVSXBWYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVSXDQYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVSXWDYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVSXWQYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVZXBDYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVZXBQYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVZXBWYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVZXDQYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVZXWDYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVZXWQYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPSADBWYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPSADBWrr")>;
-
-def SKLWriteResGroup31 : SchedWriteRes<[SKLPort0,SKLPort5]> {
- let Latency = 3;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup31], (instregex "EXTRACTPSrr")>;
-def: InstRW<[SKLWriteResGroup31], (instregex "MMX_PEXTRWirri")>;
-def: InstRW<[SKLWriteResGroup31], (instregex "PEXTRBrr")>;
-def: InstRW<[SKLWriteResGroup31], (instregex "PEXTRDrr")>;
-def: InstRW<[SKLWriteResGroup31], (instregex "PEXTRQrr")>;
-def: InstRW<[SKLWriteResGroup31], (instregex "PEXTRWri")>;
-def: InstRW<[SKLWriteResGroup31], (instregex "PEXTRWrr_REV")>;
-def: InstRW<[SKLWriteResGroup31], (instregex "PTESTrr")>;
-def: InstRW<[SKLWriteResGroup31], (instregex "VEXTRACTPSrr")>;
-def: InstRW<[SKLWriteResGroup31], (instregex "VPEXTRBrr")>;
-def: InstRW<[SKLWriteResGroup31], (instregex "VPEXTRDrr")>;
-def: InstRW<[SKLWriteResGroup31], (instregex "VPEXTRQrr")>;
-def: InstRW<[SKLWriteResGroup31], (instregex "VPEXTRWri")>;
-def: InstRW<[SKLWriteResGroup31], (instregex "VPEXTRWrr_REV")>;
-def: InstRW<[SKLWriteResGroup31], (instregex "VPTESTYrr")>;
-def: InstRW<[SKLWriteResGroup31], (instregex "VPTESTrr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "(ADD|SUB|SUBR)_(FPrST0|FST0r|FrST0)",
+ "VPBROADCASTBrr",
+ "VPBROADCASTWrr",
+ "(V?)PCMPGTQ(Y?)rr")>;
def SKLWriteResGroup32 : SchedWriteRes<[SKLPort0,SKLPort0156]> {
let Latency = 3;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup32], (instregex "FNSTSW16r")>;
+def: InstRW<[SKLWriteResGroup32], (instrs FNSTSW16r)>;
def SKLWriteResGroup33 : SchedWriteRes<[SKLPort06]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [3];
}
-def: InstRW<[SKLWriteResGroup33], (instregex "ROL(16|32|64)rCL")>;
-def: InstRW<[SKLWriteResGroup33], (instregex "ROL8rCL")>;
-def: InstRW<[SKLWriteResGroup33], (instregex "ROR(16|32|64)rCL")>;
-def: InstRW<[SKLWriteResGroup33], (instregex "ROR8rCL")>;
-def: InstRW<[SKLWriteResGroup33], (instregex "SAR(16|32|64)rCL")>;
-def: InstRW<[SKLWriteResGroup33], (instregex "SAR8rCL")>;
-def: InstRW<[SKLWriteResGroup33], (instregex "SHL(16|32|64)rCL")>;
-def: InstRW<[SKLWriteResGroup33], (instregex "SHL8rCL")>;
-def: InstRW<[SKLWriteResGroup33], (instregex "SHR(16|32|64)rCL")>;
-def: InstRW<[SKLWriteResGroup33], (instregex "SHR8rCL")>;
+def: InstRW<[SKLWriteResGroup33], (instregex "ROL(8|16|32|64)rCL",
+ "ROR(8|16|32|64)rCL",
+ "SAR(8|16|32|64)rCL",
+ "SHL(8|16|32|64)rCL",
+ "SHR(8|16|32|64)rCL")>;
def SKLWriteResGroup34 : SchedWriteRes<[SKLPort0156]> {
- let Latency = 3;
+ let Latency = 2;
let NumMicroOps = 3;
let ResourceCycles = [3];
}
-def: InstRW<[SKLWriteResGroup34], (instregex "XADD(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup34], (instregex "XADD8rr")>;
-def: InstRW<[SKLWriteResGroup34], (instregex "XCHG8rr")>;
+def: InstRW<[SKLWriteResGroup34], (instrs XADD8rr, XADD16rr, XADD32rr, XADD64rr,
+ XCHG8rr, XCHG16rr, XCHG32rr, XCHG64rr,
+ XCHG16ar, XCHG32ar, XCHG64ar)>;
def SKLWriteResGroup35 : SchedWriteRes<[SKLPort0,SKLPort5]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[SKLWriteResGroup35], (instregex "MMX_PHADDSWrr64")>;
-def: InstRW<[SKLWriteResGroup35], (instregex "MMX_PHSUBSWrr64")>;
+def: InstRW<[SKLWriteResGroup35], (instregex "MMX_PH(ADD|SUB)SWrr")>;
def SKLWriteResGroup36 : SchedWriteRes<[SKLPort5,SKLPort01]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKLWriteResGroup36], (instregex "PHADDSWrr128")>;
-def: InstRW<[SKLWriteResGroup36], (instregex "PHSUBSWrr128")>;
-def: InstRW<[SKLWriteResGroup36], (instregex "VPHADDSWrr128")>;
-def: InstRW<[SKLWriteResGroup36], (instregex "VPHADDSWrr256")>;
-def: InstRW<[SKLWriteResGroup36], (instregex "VPHSUBSWrr128")>;
-def: InstRW<[SKLWriteResGroup36], (instregex "VPHSUBSWrr256")>;
-
-def SKLWriteResGroup37 : SchedWriteRes<[SKLPort5,SKLPort05]> {
- let Latency = 3;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[SKLWriteResGroup37], (instregex "MMX_PHADDWrr64")>;
-def: InstRW<[SKLWriteResGroup37], (instregex "MMX_PHADDrr64")>;
-def: InstRW<[SKLWriteResGroup37], (instregex "MMX_PHSUBDrr64")>;
-def: InstRW<[SKLWriteResGroup37], (instregex "MMX_PHSUBWrr64")>;
-
-def SKLWriteResGroup38 : SchedWriteRes<[SKLPort5,SKLPort015]> {
- let Latency = 3;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[SKLWriteResGroup38], (instregex "PHADDDrr")>;
-def: InstRW<[SKLWriteResGroup38], (instregex "PHADDWrr")>;
-def: InstRW<[SKLWriteResGroup38], (instregex "PHSUBDrr")>;
-def: InstRW<[SKLWriteResGroup38], (instregex "PHSUBWrr")>;
-def: InstRW<[SKLWriteResGroup38], (instregex "VPHADDDYrr")>;
-def: InstRW<[SKLWriteResGroup38], (instregex "VPHADDDrr")>;
-def: InstRW<[SKLWriteResGroup38], (instregex "VPHADDWYrr")>;
-def: InstRW<[SKLWriteResGroup38], (instregex "VPHADDWrr")>;
-def: InstRW<[SKLWriteResGroup38], (instregex "VPHSUBDYrr")>;
-def: InstRW<[SKLWriteResGroup38], (instregex "VPHSUBDrr")>;
-def: InstRW<[SKLWriteResGroup38], (instregex "VPHSUBWYrr")>;
-def: InstRW<[SKLWriteResGroup38], (instregex "VPHSUBWrr")>;
+def: InstRW<[SKLWriteResGroup36], (instregex "(V?)PHADDSW(Y?)rr",
+ "(V?)PHSUBSW(Y?)rr")>;
def SKLWriteResGroup39 : SchedWriteRes<[SKLPort5,SKLPort0156]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKLWriteResGroup39], (instregex "MMX_PACKSSDWirr")>;
-def: InstRW<[SKLWriteResGroup39], (instregex "MMX_PACKSSWBirr")>;
-def: InstRW<[SKLWriteResGroup39], (instregex "MMX_PACKUSWBirr")>;
+def: InstRW<[SKLWriteResGroup39], (instregex "MMX_PACKSSDWirr",
+ "MMX_PACKSSWBirr",
+ "MMX_PACKUSWBirr")>;
def SKLWriteResGroup40 : SchedWriteRes<[SKLPort6,SKLPort0156]> {
let Latency = 3;
@@ -1484,36 +827,31 @@ def SKLWriteResGroup41 : SchedWriteRes<[SKLPort237,SKLPort0156]> {
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[SKLWriteResGroup41], (instregex "MFENCE")>;
+def: InstRW<[SKLWriteResGroup41], (instrs MFENCE)>;
def SKLWriteResGroup42 : SchedWriteRes<[SKLPort06,SKLPort0156]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[SKLWriteResGroup42], (instregex "RCL(16|32|64)r1")>;
-def: InstRW<[SKLWriteResGroup42], (instregex "RCL(16|32|64)ri")>;
-def: InstRW<[SKLWriteResGroup42], (instregex "RCL8r1")>;
-def: InstRW<[SKLWriteResGroup42], (instregex "RCL8ri")>;
-def: InstRW<[SKLWriteResGroup42], (instregex "RCR(16|32|64)r1")>;
-def: InstRW<[SKLWriteResGroup42], (instregex "RCR(16|32|64)ri")>;
-def: InstRW<[SKLWriteResGroup42], (instregex "RCR8r1")>;
-def: InstRW<[SKLWriteResGroup42], (instregex "RCR8ri")>;
+def: InstRW<[SKLWriteResGroup42], (instregex "RCL(8|16|32|64)r1",
+ "RCL(8|16|32|64)ri",
+ "RCR(8|16|32|64)r1",
+ "RCR(8|16|32|64)ri")>;
def SKLWriteResGroup43 : SchedWriteRes<[SKLPort0,SKLPort4,SKLPort237]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup43], (instregex "FNSTSWm")>;
+def: InstRW<[SKLWriteResGroup43], (instrs FNSTSWm)>;
def SKLWriteResGroup44 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort06]> {
let Latency = 3;
let NumMicroOps = 4;
let ResourceCycles = [1,1,2];
}
-def: InstRW<[SKLWriteResGroup44], (instregex "SETAm")>;
-def: InstRW<[SKLWriteResGroup44], (instregex "SETBEm")>;
+def: InstRW<[SKLWriteResGroup44], (instregex "SET(A|BE)m")>;
def SKLWriteResGroup45 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort237,SKLPort0156]> {
let Latency = 3;
@@ -1527,232 +865,65 @@ def SKLWriteResGroup46 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort06,SKLPort015
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[SKLWriteResGroup46], (instregex "CALL64pcrel32")>;
+def: InstRW<[SKLWriteResGroup46], (instrs CALL64pcrel32)>;
def SKLWriteResGroup47 : SchedWriteRes<[SKLPort0]> {
let Latency = 4;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup47], (instregex "AESDECLASTrr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "AESDECrr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "AESENCLASTrr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "AESENCrr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "MMX_PMADDUBSWrr64")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "MMX_PMADDWDirr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "MMX_PMULHRSWrr64")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "MMX_PMULHUWirr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "MMX_PMULHWirr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "MMX_PMULLWirr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "MMX_PMULUDQirr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "MUL_FPrST0")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "MUL_FST0r")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "MUL_FrST0")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "RCPPSr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "RCPSSr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "RSQRTPSr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "RSQRTSSr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "VAESDECLASTrr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "VAESDECrr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "VAESENCLASTrr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "VAESENCrr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "VRCPPSYr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "VRCPPSr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "VRCPSSr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "VRSQRTPSYr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "VRSQRTPSr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "VRSQRTSSr")>;
+def: InstRW<[SKLWriteResGroup47], (instregex "MUL_(FPrST0|FST0r|FrST0)")>;
def SKLWriteResGroup48 : SchedWriteRes<[SKLPort01]> {
let Latency = 4;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup48], (instregex "ADDPDrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "ADDPSrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "ADDSDrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "ADDSSrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "ADDSUBPDrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "ADDSUBPSrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "MULPDrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "MULPSrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "MULSDrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "MULSSrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "SUBPDrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "SUBPSrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "SUBSDrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "SUBSSrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VADDPDYrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VADDPDrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VADDPSYrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VADDPSrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VADDSDrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VADDSSrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VADDSUBPDYrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VADDSUBPDrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VADDSUBPSYrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VADDSUBPSrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VMULPDYrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VMULPDrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VMULPSYrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VMULPSrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VMULSDrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VMULSSrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VSUBPDYrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VSUBPDrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VSUBPSYrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VSUBPSrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VSUBSDrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VSUBSSrr")>;
-def: InstRW<[SKLWriteResGroup48],
- (instregex
- "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)(Y)?r",
- "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)r")>;
-
-def SKLWriteResGroup49 : SchedWriteRes<[SKLPort015]> {
- let Latency = 4;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKLWriteResGroup49], (instregex "CMPPDrri")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "CMPPSrri")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "CMPSDrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "CMPSSrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "CVTDQ2PSrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "CVTPS2DQrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "CVTTPS2DQrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "MAX(C?)PDrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "MAX(C?)PSrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "MAX(C?)SDrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "MAX(C?)SSrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "MIN(C?)PDrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "MIN(C?)PSrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "MIN(C?)SDrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "MIN(C?)SSrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "PHMINPOSUWrr128")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "PMADDUBSWrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "PMADDWDrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "PMULDQrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "PMULHRSWrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "PMULHUWrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "PMULHWrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "PMULLWrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "PMULUDQrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VCMPPDYrri")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VCMPPDrri")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VCMPPSYrri")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VCMPPSrri")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VCMPSDrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VCMPSSrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VCVTDQ2PSYrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VCVTDQ2PSrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VCVTPS2DQYrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VCVTPS2DQrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VCVTTPS2DQYrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VCVTTPS2DQrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VMAX(C?)PDYrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VMAX(C?)PDrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VMAX(C?)PSYrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VMAX(C?)PSrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VMAX(C?)SDrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VMAX(C?)SSrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VMIN(C?)PDYrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VMIN(C?)PDrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VMIN(C?)PSYrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VMIN(C?)PSrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VMIN(C?)SDrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VMIN(C?)SSrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPHMINPOSUWrr128")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMADDUBSWYrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMADDUBSWrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMADDWDYrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMADDWDrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMULDQYrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMULDQrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMULHRSWYrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMULHRSWrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMULHUWYrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMULHUWrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMULHWYrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMULHWrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMULLWYrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMULLWrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMULUDQYrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMULUDQrr")>;
-
-def SKLWriteResGroup50 : SchedWriteRes<[SKLPort5]> {
- let Latency = 4;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[SKLWriteResGroup50], (instregex "MPSADBWrri")>;
-def: InstRW<[SKLWriteResGroup50], (instregex "VMPSADBWYrri")>;
-def: InstRW<[SKLWriteResGroup50], (instregex "VMPSADBWrri")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "(V?)CVTDQ2PS(Y?)rr",
+ "(V?)CVT(T?)PS2DQ(Y?)rr")>;
def SKLWriteResGroup51 : SchedWriteRes<[SKLPort1,SKLPort5]> {
let Latency = 4;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup51], (instregex "IMUL64r")>;
-def: InstRW<[SKLWriteResGroup51], (instregex "MUL64r")>;
-def: InstRW<[SKLWriteResGroup51], (instregex "MULX64rr")>;
+def: InstRW<[SKLWriteResGroup51], (instrs IMUL64r, MUL64r, MULX64rr)>;
def SKLWriteResGroup51_16 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> {
let Latency = 4;
let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
}
-def: InstRW<[SKLWriteResGroup51_16], (instregex "IMUL16r")>;
-def: InstRW<[SKLWriteResGroup51_16], (instregex "MUL16r")>;
-
-def SKLWriteResGroup52 : SchedWriteRes<[SKLPort5,SKLPort01]> {
- let Latency = 4;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup52], (instregex "VPSLLDYrr")>;
-def: InstRW<[SKLWriteResGroup52], (instregex "VPSLLQYrr")>;
-def: InstRW<[SKLWriteResGroup52], (instregex "VPSLLWYrr")>;
-def: InstRW<[SKLWriteResGroup52], (instregex "VPSRADYrr")>;
-def: InstRW<[SKLWriteResGroup52], (instregex "VPSRAWYrr")>;
-def: InstRW<[SKLWriteResGroup52], (instregex "VPSRLDYrr")>;
-def: InstRW<[SKLWriteResGroup52], (instregex "VPSRLQYrr")>;
-def: InstRW<[SKLWriteResGroup52], (instregex "VPSRLWYrr")>;
+def: InstRW<[SKLWriteResGroup51_16], (instrs IMUL16r, MUL16r)>;
def SKLWriteResGroup53 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort237]> {
let Latency = 4;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup53], (instregex "ISTT_FP16m")>;
-def: InstRW<[SKLWriteResGroup53], (instregex "ISTT_FP32m")>;
-def: InstRW<[SKLWriteResGroup53], (instregex "ISTT_FP64m")>;
-def: InstRW<[SKLWriteResGroup53], (instregex "IST_F16m")>;
-def: InstRW<[SKLWriteResGroup53], (instregex "IST_F32m")>;
-def: InstRW<[SKLWriteResGroup53], (instregex "IST_FP16m")>;
-def: InstRW<[SKLWriteResGroup53], (instregex "IST_FP32m")>;
-def: InstRW<[SKLWriteResGroup53], (instregex "IST_FP64m")>;
+def: InstRW<[SKLWriteResGroup53], (instregex "IST(T?)_FP(16|32|64)m",
+ "IST_F(16|32)m")>;
def SKLWriteResGroup54 : SchedWriteRes<[SKLPort0156]> {
let Latency = 4;
let NumMicroOps = 4;
let ResourceCycles = [4];
}
-def: InstRW<[SKLWriteResGroup54], (instregex "FNCLEX")>;
+def: InstRW<[SKLWriteResGroup54], (instrs FNCLEX)>;
def SKLWriteResGroup55 : SchedWriteRes<[SKLPort6,SKLPort0156]> {
let Latency = 4;
let NumMicroOps = 4;
let ResourceCycles = [1,3];
}
-def: InstRW<[SKLWriteResGroup55], (instregex "PAUSE")>;
+def: InstRW<[SKLWriteResGroup55], (instrs PAUSE)>;
def SKLWriteResGroup56 : SchedWriteRes<[SKLPort015,SKLPort0156]> {
let Latency = 4;
let NumMicroOps = 4;
let ResourceCycles = [1,3];
}
-def: InstRW<[SKLWriteResGroup56], (instregex "VZEROUPPER")>;
+def: InstRW<[SKLWriteResGroup56], (instrs VZEROUPPER)>;
def SKLWriteResGroup57 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort0156]> {
let Latency = 4;
@@ -1766,72 +937,36 @@ def SKLWriteResGroup58 : SchedWriteRes<[SKLPort23]> {
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup58], (instregex "MMX_MOVD64from64rm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MMX_MOVD64rm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MMX_MOVD64to64rm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MMX_MOVQ64rm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MOV(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MOV64toPQIrm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MOV8rm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MOVDDUPrm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MOVDI2PDIrm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MOVQI2PQIrm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MOVSDrm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MOVSSrm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MOVSX(16|32|64)rm16")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MOVSX(16|32|64)rm32")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MOVSX(16|32|64)rm8")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MOVZX(16|32|64)rm16")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MOVZX(16|32|64)rm8")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "PREFETCHNTA")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "PREFETCHT0")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "PREFETCHT1")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "PREFETCHT2")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "VMOV64toPQIrm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "VMOVDDUPrm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "VMOVDI2PDIrm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "VMOVQI2PQIrm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "VMOVSDrm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "VMOVSSrm")>;
+def: InstRW<[SKLWriteResGroup58], (instregex "MOVSX(16|32|64)rm16",
+ "MOVSX(16|32|64)rm32",
+ "MOVSX(16|32|64)rm8",
+ "MOVZX(16|32|64)rm16",
+ "MOVZX(16|32|64)rm8",
+ "(V?)MOVDDUPrm")>; // TODO: Should this be SKLWriteResGroup67?
def SKLWriteResGroup59 : SchedWriteRes<[SKLPort0,SKLPort5]> {
let Latency = 5;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup59], (instregex "CVTDQ2PDrr")>;
-def: InstRW<[SKLWriteResGroup59], (instregex "MMX_CVTPI2PDirr")>;
-def: InstRW<[SKLWriteResGroup59], (instregex "VCVTDQ2PDrr")>;
+def: InstRW<[SKLWriteResGroup59], (instregex "MMX_CVTPI2PDirr",
+ "(V?)CVTDQ2PDrr")>;
def SKLWriteResGroup60 : SchedWriteRes<[SKLPort5,SKLPort015]> {
let Latency = 5;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup60], (instregex "CVTPD2DQrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "CVTPD2PSrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "CVTPS2PDrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "CVTSD2SSrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "CVTSI642SDrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "CVTSI2SDrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "CVTSI2SSrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "CVTSS2SDrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "CVTTPD2DQrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "MMX_CVTPD2PIirr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "MMX_CVTPS2PIirr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "MMX_CVTTPD2PIirr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "MMX_CVTTPS2PIirr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "VCVTPD2DQrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "VCVTPD2PSrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "VCVTPH2PSrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "VCVTPS2PDrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "VCVTPS2PHrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "VCVTSD2SSrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "VCVTSI642SDrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "VCVTSI2SDrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "VCVTSI2SSrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "VCVTSS2SDrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "VCVTTPD2DQrr")>;
+def: InstRW<[SKLWriteResGroup60], (instregex "MMX_CVT(T?)PD2PIirr",
+ "MMX_CVT(T?)PS2PIirr",
+ "(V?)CVT(T?)PD2DQrr",
+ "(V?)CVTPD2PSrr",
+ "(V?)CVTPS2PDrr",
+ "(V?)CVTSD2SSrr",
+ "(V?)CVTSI642SDrr",
+ "(V?)CVTSI2SDrr",
+ "(V?)CVTSI2SSrr",
+ "(V?)CVTSS2SDrr")>;
def SKLWriteResGroup61 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort06]> {
let Latency = 5;
@@ -1841,73 +976,43 @@ def SKLWriteResGroup61 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort06]> {
def: InstRW<[SKLWriteResGroup61], (instregex "STR(16|32|64)r")>;
def SKLWriteResGroup62 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> {
- let Latency = 5;
+ let Latency = 4;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup62], (instregex "IMUL32r")>;
-def: InstRW<[SKLWriteResGroup62], (instregex "MUL32r")>;
-def: InstRW<[SKLWriteResGroup62], (instregex "MULX32rr")>;
+def: InstRW<[SKLWriteResGroup62], (instrs IMUL32r, MUL32r, MULX32rr)>;
def SKLWriteResGroup63 : SchedWriteRes<[SKLPort06,SKLPort0156]> {
let Latency = 5;
let NumMicroOps = 5;
let ResourceCycles = [1,4];
}
-def: InstRW<[SKLWriteResGroup63], (instregex "XSETBV")>;
+def: InstRW<[SKLWriteResGroup63], (instrs XSETBV)>;
def SKLWriteResGroup64 : SchedWriteRes<[SKLPort06,SKLPort0156]> {
let Latency = 5;
let NumMicroOps = 5;
let ResourceCycles = [2,3];
}
-def: InstRW<[SKLWriteResGroup64], (instregex "CMPXCHG(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup64], (instregex "CMPXCHG8rr")>;
+def: InstRW<[SKLWriteResGroup64], (instregex "CMPXCHG(8|16|32|64)rr")>;
def SKLWriteResGroup65 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort0156]> {
let Latency = 5;
let NumMicroOps = 6;
let ResourceCycles = [1,1,4];
}
-def: InstRW<[SKLWriteResGroup65], (instregex "PUSHF16")>;
-def: InstRW<[SKLWriteResGroup65], (instregex "PUSHF64")>;
-
-def SKLWriteResGroup66 : SchedWriteRes<[SKLPort5]> {
- let Latency = 6;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKLWriteResGroup66], (instregex "PCLMULQDQrr")>;
-def: InstRW<[SKLWriteResGroup66], (instregex "VPCLMULQDQrr")>;
+def: InstRW<[SKLWriteResGroup65], (instregex "PUSHF(16|64)")>;
def SKLWriteResGroup67 : SchedWriteRes<[SKLPort23]> {
let Latency = 6;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup67], (instregex "LDDQUrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "MOVAPDrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "MOVAPSrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "MOVDQArm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "MOVDQUrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "MOVNTDQArm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "MOVSHDUPrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "MOVSLDUPrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "MOVUPDrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "MOVUPSrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "VBROADCASTSSrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "VLDDQUrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "VMOVAPDrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "VMOVAPSrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "VMOVDQArm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "VMOVDQUrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "VMOVNTDQArm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "VMOVSHDUPrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "VMOVSLDUPrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "VMOVUPDrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "VMOVUPSrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "VPBROADCASTDrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "VPBROADCASTQrm")>;
+def: InstRW<[SKLWriteResGroup67], (instregex "VBROADCASTSSrm",
+ "(V?)MOVSHDUPrm",
+ "(V?)MOVSLDUPrm",
+ "VPBROADCASTDrm",
+ "VPBROADCASTQrm")>;
def SKLWriteResGroup68 : SchedWriteRes<[SKLPort0]> {
let Latency = 6;
@@ -1921,247 +1026,83 @@ def SKLWriteResGroup69 : SchedWriteRes<[SKLPort0,SKLPort23]> {
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PADDSBirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PADDSWirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PADDUSBirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PADDUSWirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PAVGBirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PAVGWirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PCMPEQBirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PCMPEQDirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PCMPEQWirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PCMPGTBirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PCMPGTDirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PCMPGTWirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PMAXSWirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PMAXUBirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PMINSWirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PMINUBirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSLLDrm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSLLQrm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSLLWrm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSRADrm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSRAWrm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSRLDrm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSRLQrm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSRLWrm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSUBSBirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSUBSWirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSUBUSBirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSUBUSWirm")>;
-
-def SKLWriteResGroup70 : SchedWriteRes<[SKLPort0,SKLPort015]> {
- let Latency = 6;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup70], (instregex "CVTSD2SI64rr")>;
-def: InstRW<[SKLWriteResGroup70], (instregex "CVTSD2SIrr")>;
-def: InstRW<[SKLWriteResGroup70], (instregex "CVTSS2SI64rr")>;
-def: InstRW<[SKLWriteResGroup70], (instregex "CVTSS2SIrr")>;
-def: InstRW<[SKLWriteResGroup70], (instregex "CVTTSD2SI64rr")>;
-def: InstRW<[SKLWriteResGroup70], (instregex "CVTTSD2SIrr")>;
-def: InstRW<[SKLWriteResGroup70], (instregex "VCVTSD2SI64rr")>;
-def: InstRW<[SKLWriteResGroup70], (instregex "VCVTSD2SIrr")>;
-def: InstRW<[SKLWriteResGroup70], (instregex "VCVTSS2SI64rr")>;
-def: InstRW<[SKLWriteResGroup70], (instregex "VCVTSS2SIrr")>;
-def: InstRW<[SKLWriteResGroup70], (instregex "VCVTTSD2SI64rr")>;
-def: InstRW<[SKLWriteResGroup70], (instregex "VCVTTSD2SIrr")>;
-
-def SKLWriteResGroup71 : SchedWriteRes<[SKLPort5,SKLPort23]> {
+def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PADDSBirm",
+ "MMX_PADDSWirm",
+ "MMX_PADDUSBirm",
+ "MMX_PADDUSWirm",
+ "MMX_PAVGBirm",
+ "MMX_PAVGWirm",
+ "MMX_PCMPEQBirm",
+ "MMX_PCMPEQDirm",
+ "MMX_PCMPEQWirm",
+ "MMX_PCMPGTBirm",
+ "MMX_PCMPGTDirm",
+ "MMX_PCMPGTWirm",
+ "MMX_PMAXSWirm",
+ "MMX_PMAXUBirm",
+ "MMX_PMINSWirm",
+ "MMX_PMINUBirm",
+ "MMX_PSUBSBirm",
+ "MMX_PSUBSWirm",
+ "MMX_PSUBUSBirm",
+ "MMX_PSUBUSWirm")>;
+
+def SKLWriteResGroup70 : SchedWriteRes<[SKLPort0,SKLPort01]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PALIGNR64irm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PINSRWirmi")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PSHUFBrm64")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PSHUFWmi")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PUNPCKHBWirm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PUNPCKHDQirm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PUNPCKHWDirm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PUNPCKLBWirm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PUNPCKLDQirm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PUNPCKLWDirm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "MOVHPDrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "MOVHPSrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "MOVLPDrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "MOVLPSrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PINSRBrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PINSRDrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PINSRQrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PINSRWrmi")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PMOVSXBDrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PMOVSXBQrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PMOVSXBWrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PMOVSXDQrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PMOVSXWDrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PMOVSXWQrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PMOVZXBDrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PMOVZXBQrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PMOVZXBWrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PMOVZXDQrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PMOVZXWDrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PMOVZXWQrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VMOVHPDrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VMOVHPSrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VMOVLPDrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VMOVLPSrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPINSRBrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPINSRDrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPINSRQrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPINSRWrmi")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVSXBDrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVSXBQrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVSXBWrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVSXDQrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVSXWDrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVSXWQrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVZXBDrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVZXBQrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVZXBWrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVZXDQrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVZXWDrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVZXWQrm")>;
+def: InstRW<[SKLWriteResGroup70], (instregex "(V?)CVTSS2SI(64)?rr",
+ "(V?)CVT(T?)SD2SI(64)?rr")>;
def SKLWriteResGroup72 : SchedWriteRes<[SKLPort6,SKLPort23]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup72], (instregex "FARJMP64")>;
-def: InstRW<[SKLWriteResGroup72], (instregex "JMP(16|32|64)m")>;
-
-def SKLWriteResGroup73 : SchedWriteRes<[SKLPort23,SKLPort05]> {
- let Latency = 6;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PABSBrm64")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PABSDrm64")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PABSWrm64")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PADDBirm")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PADDDirm")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PADDQirm")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PADDWirm")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PANDNirm")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PANDirm")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PORirm")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PSIGNBrm64")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PSIGNDrm64")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PSIGNWrm64")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PSUBBirm")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PSUBDirm")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PSUBQirm")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PSUBWirm")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PXORirm")>;
+def: InstRW<[SKLWriteResGroup72], (instregex "FARJMP64",
+ "JMP(16|32|64)m")>;
def SKLWriteResGroup74 : SchedWriteRes<[SKLPort23,SKLPort06]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup74], (instregex "ADC(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "ADC8rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "ADCX(32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "ADOX(32|64)rm")>;
def: InstRW<[SKLWriteResGroup74], (instregex "BT(16|32|64)mi8")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "CMOVAE(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "CMOVB(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "CMOVE(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "CMOVG(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "CMOVGE(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "CMOVL(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "CMOVLE(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "CMOVNE(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "CMOVNO(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "CMOVNP(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "CMOVNS(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "CMOVO(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "CMOVP(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "CMOVS(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "RORX32mi")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "RORX64mi")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "SARX32rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "SARX64rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "SBB(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "SBB8rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "SHLX32rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "SHLX64rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "SHRX32rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "SHRX64rm")>;
def SKLWriteResGroup75 : SchedWriteRes<[SKLPort23,SKLPort15]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup75], (instregex "ANDN(32|64)rm")>;
-def: InstRW<[SKLWriteResGroup75], (instregex "BLSI(32|64)rm")>;
-def: InstRW<[SKLWriteResGroup75], (instregex "BLSMSK(32|64)rm")>;
-def: InstRW<[SKLWriteResGroup75], (instregex "BLSR(32|64)rm")>;
-def: InstRW<[SKLWriteResGroup75], (instregex "BZHI(32|64)rm")>;
-def: InstRW<[SKLWriteResGroup75], (instregex "MOVBE(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup75], (instregex "ANDN(32|64)rm",
+ "BLSI(32|64)rm",
+ "BLSMSK(32|64)rm",
+ "BLSR(32|64)rm",
+ "MOVBE(16|32|64)rm")>;
def SKLWriteResGroup76 : SchedWriteRes<[SKLPort23,SKLPort0156]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup76], (instregex "ADD(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "ADD8rm")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "AND(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "AND8rm")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "CMP(16|32|64)mi")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "CMP(16|32|64)mr")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "CMP(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "CMP8mi")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "CMP8mr")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "CMP8rm")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "OR(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "OR8rm")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "POP(16|32|64)r(mr)?")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "SUB(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "SUB8rm")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "TEST(16|32|64)mr")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "TEST8mi")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "TEST8mr")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "XOR(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "XOR8rm")>;
-
-def SKLWriteResGroup77 : SchedWriteRes<[SKLPort5,SKLPort01]> {
- let Latency = 6;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[SKLWriteResGroup77], (instregex "HADDPDrr")>;
-def: InstRW<[SKLWriteResGroup77], (instregex "HADDPSrr")>;
-def: InstRW<[SKLWriteResGroup77], (instregex "HSUBPDrr")>;
-def: InstRW<[SKLWriteResGroup77], (instregex "HSUBPSrr")>;
-def: InstRW<[SKLWriteResGroup77], (instregex "VHADDPDYrr")>;
-def: InstRW<[SKLWriteResGroup77], (instregex "VHADDPDrr")>;
-def: InstRW<[SKLWriteResGroup77], (instregex "VHADDPSYrr")>;
-def: InstRW<[SKLWriteResGroup77], (instregex "VHADDPSrr")>;
-def: InstRW<[SKLWriteResGroup77], (instregex "VHSUBPDYrr")>;
-def: InstRW<[SKLWriteResGroup77], (instregex "VHSUBPDrr")>;
-def: InstRW<[SKLWriteResGroup77], (instregex "VHSUBPSYrr")>;
-def: InstRW<[SKLWriteResGroup77], (instregex "VHSUBPSrr")>;
-
-def SKLWriteResGroup78 : SchedWriteRes<[SKLPort5,SKLPort015]> {
+def: InstRW<[SKLWriteResGroup76], (instrs POP16r, POP32r, POP64r)>;
+def: InstRW<[SKLWriteResGroup76], (instregex "POP(16|32|64)rmr")>;
+
+def SKLWriteResGroup78 : SchedWriteRes<[SKLPort5,SKLPort01]> {
let Latency = 6;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKLWriteResGroup78], (instregex "CVTSI642SSrr")>;
-def: InstRW<[SKLWriteResGroup78], (instregex "VCVTSI642SSrr")>;
+def: InstRW<[SKLWriteResGroup78], (instregex "(V?)CVTSI642SSrr")>;
def SKLWriteResGroup79 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> {
let Latency = 6;
let NumMicroOps = 4;
let ResourceCycles = [1,2,1];
}
-def: InstRW<[SKLWriteResGroup79], (instregex "SHLD(16|32|64)rrCL")>;
-def: InstRW<[SKLWriteResGroup79], (instregex "SHRD(16|32|64)rrCL")>;
+def: InstRW<[SKLWriteResGroup79], (instregex "SHLD(16|32|64)rrCL",
+ "SHRD(16|32|64)rrCL")>;
def SKLWriteResGroup80 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort06,SKLPort0156]> {
let Latency = 6;
@@ -2170,102 +1111,51 @@ def SKLWriteResGroup80 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort06,SKLPort0156]
}
def: InstRW<[SKLWriteResGroup80], (instregex "SLDT(16|32|64)r")>;
-def SKLWriteResGroup81 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort237,SKLPort015]> {
- let Latency = 6;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,1,1];
-}
-def: InstRW<[SKLWriteResGroup81], (instregex "VCVTPS2PHmr")>;
-
def SKLWriteResGroup82 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06]> {
let Latency = 6;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[SKLWriteResGroup82], (instregex "BTC(16|32|64)mi8")>;
-def: InstRW<[SKLWriteResGroup82], (instregex "BTR(16|32|64)mi8")>;
-def: InstRW<[SKLWriteResGroup82], (instregex "BTS(16|32|64)mi8")>;
-def: InstRW<[SKLWriteResGroup82], (instregex "SAR(16|32|64)m1")>;
-def: InstRW<[SKLWriteResGroup82], (instregex "SAR(16|32|64)mi")>;
-def: InstRW<[SKLWriteResGroup82], (instregex "SAR8m1")>;
-def: InstRW<[SKLWriteResGroup82], (instregex "SAR8mi")>;
-def: InstRW<[SKLWriteResGroup82], (instregex "SHL(16|32|64)m1")>;
-def: InstRW<[SKLWriteResGroup82], (instregex "SHL(16|32|64)mi")>;
-def: InstRW<[SKLWriteResGroup82], (instregex "SHL8m1")>;
-def: InstRW<[SKLWriteResGroup82], (instregex "SHL8mi")>;
-def: InstRW<[SKLWriteResGroup82], (instregex "SHR(16|32|64)m1")>;
-def: InstRW<[SKLWriteResGroup82], (instregex "SHR(16|32|64)mi")>;
-def: InstRW<[SKLWriteResGroup82], (instregex "SHR8m1")>;
-def: InstRW<[SKLWriteResGroup82], (instregex "SHR8mi")>;
+def: InstRW<[SKLWriteResGroup82], (instregex "BTC(16|32|64)mi8",
+ "BTR(16|32|64)mi8",
+ "BTS(16|32|64)mi8",
+ "SAR(8|16|32|64)m1",
+ "SAR(8|16|32|64)mi",
+ "SHL(8|16|32|64)m1",
+ "SHL(8|16|32|64)mi",
+ "SHR(8|16|32|64)m1",
+ "SHR(8|16|32|64)mi")>;
def SKLWriteResGroup83 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort0156]> {
let Latency = 6;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[SKLWriteResGroup83], (instregex "ADD(16|32|64)mi")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "ADD(16|32|64)mr")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "ADD8mi")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "ADD8mr")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "AND(16|32|64)mi")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "AND(16|32|64)mr")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "AND8mi")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "AND8mr")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "DEC(16|32|64)m")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "DEC8m")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "INC(16|32|64)m")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "INC8m")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "NEG(16|32|64)m")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "NEG8m")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "NOT(16|32|64)m")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "NOT8m")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "OR(16|32|64)mi")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "OR(16|32|64)mr")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "OR8mi")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "OR8mr")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "POP(16|32|64)rmm")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "PUSH(16|32|64)rmm")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "SUB(16|32|64)mi")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "SUB(16|32|64)mr")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "SUB8mi")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "SUB8mr")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "XOR(16|32|64)mi")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "XOR(16|32|64)mr")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "XOR8mi")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "XOR8mr")>;
+def: InstRW<[SKLWriteResGroup83], (instregex "POP(16|32|64)rmm",
+ "PUSH(16|32|64)rmm")>;
def SKLWriteResGroup84 : SchedWriteRes<[SKLPort6,SKLPort0156]> {
let Latency = 6;
let NumMicroOps = 6;
let ResourceCycles = [1,5];
}
-def: InstRW<[SKLWriteResGroup84], (instregex "STD")>;
+def: InstRW<[SKLWriteResGroup84], (instrs STD)>;
def SKLWriteResGroup85 : SchedWriteRes<[SKLPort23]> {
let Latency = 7;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup85], (instregex "LD_F32m")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "LD_F64m")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "LD_F80m")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VBROADCASTF128")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VBROADCASTI128")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VBROADCASTSDYrm")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VBROADCASTSSYrm")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VLDDQUYrm")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VMOVAPDYrm")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VMOVAPSYrm")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VMOVDDUPYrm")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VMOVDQAYrm")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VMOVDQUYrm")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VMOVNTDQAYrm")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VMOVSHDUPYrm")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VMOVSLDUPYrm")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VMOVUPDYrm")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VMOVUPSYrm")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VPBROADCASTDYrm")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VPBROADCASTQYrm")>;
+def: InstRW<[SKLWriteResGroup85], (instregex "LD_F(32|64|80)m",
+ "VBROADCASTF128",
+ "VBROADCASTI128",
+ "VBROADCASTSDYrm",
+ "VBROADCASTSSYrm",
+ "VMOVDDUPYrm",
+ "VMOVSHDUPYrm",
+ "VMOVSLDUPYrm",
+ "VPBROADCASTDYrm",
+ "VPBROADCASTQYrm")>;
def SKLWriteResGroup86 : SchedWriteRes<[SKLPort0,SKLPort5]> {
let Latency = 7;
@@ -2274,654 +1164,192 @@ def SKLWriteResGroup86 : SchedWriteRes<[SKLPort0,SKLPort5]> {
}
def: InstRW<[SKLWriteResGroup86], (instregex "VCVTDQ2PDYrr")>;
-def SKLWriteResGroup87 : SchedWriteRes<[SKLPort0,SKLPort23]> {
- let Latency = 7;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup87], (instregex "COMISDrm")>;
-def: InstRW<[SKLWriteResGroup87], (instregex "COMISSrm")>;
-def: InstRW<[SKLWriteResGroup87], (instregex "UCOMISDrm")>;
-def: InstRW<[SKLWriteResGroup87], (instregex "UCOMISSrm")>;
-def: InstRW<[SKLWriteResGroup87], (instregex "VCOMISDrm")>;
-def: InstRW<[SKLWriteResGroup87], (instregex "VCOMISSrm")>;
-def: InstRW<[SKLWriteResGroup87], (instregex "VUCOMISDrm")>;
-def: InstRW<[SKLWriteResGroup87], (instregex "VUCOMISSrm")>;
-
def SKLWriteResGroup88 : SchedWriteRes<[SKLPort5,SKLPort23]> {
- let Latency = 7;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup88], (instregex "INSERTPSrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PACKSSDWrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PACKSSWBrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PACKUSDWrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PACKUSWBrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PALIGNRrmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PBLENDWrmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PSHUFBrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PSHUFDmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PSHUFHWmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PSHUFLWmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKHBWrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKHDQrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKHQDQrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKHWDrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKLBWrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKLDQrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKLQDQrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKLWDrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "SHUFPDrmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "SHUFPSrmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "UNPCKHPDrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "UNPCKHPSrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "UNPCKLPDrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "UNPCKLPSrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VINSERTPSrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPACKSSDWrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPACKSSWBrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPACKUSDWrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPACKUSWBrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPALIGNRrmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPBLENDWrmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPBROADCASTBrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPBROADCASTWrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPERMILPDmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPERMILPDrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPERMILPSmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPERMILPSrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPSHUFBrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPSHUFDmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPSHUFHWmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPSHUFLWmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKHBWrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKHDQrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKHQDQrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKHWDrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKLBWrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKLDQrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKLQDQrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKLWDrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VSHUFPDrmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VSHUFPSrmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VUNPCKHPDrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VUNPCKHPSrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VUNPCKLPDrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VUNPCKLPSrm")>;
-
-def SKLWriteResGroup89 : SchedWriteRes<[SKLPort5,SKLPort015]> {
- let Latency = 7;
+ let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup89], (instregex "VCVTPD2DQYrr")>;
-def: InstRW<[SKLWriteResGroup89], (instregex "VCVTPD2PSYrr")>;
-def: InstRW<[SKLWriteResGroup89], (instregex "VCVTPH2PSYrr")>;
-def: InstRW<[SKLWriteResGroup89], (instregex "VCVTPS2PDYrr")>;
-def: InstRW<[SKLWriteResGroup89], (instregex "VCVTPS2PHYrr")>;
-def: InstRW<[SKLWriteResGroup89], (instregex "VCVTTPD2DQYrr")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "(V?)PMOV(SX|ZX)BDrm",
+ "(V?)PMOV(SX|ZX)BQrm",
+ "(V?)PMOV(SX|ZX)BWrm",
+ "(V?)PMOV(SX|ZX)DQrm",
+ "(V?)PMOV(SX|ZX)WDrm",
+ "(V?)PMOV(SX|ZX)WQrm")>;
-def SKLWriteResGroup90 : SchedWriteRes<[SKLPort01,SKLPort23]> {
+def SKLWriteResGroup89 : SchedWriteRes<[SKLPort5,SKLPort01]> {
let Latency = 7;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup90], (instregex "PABSBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PABSDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PABSWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PADDSBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PADDSWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PADDUSBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PADDUSWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PAVGBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PAVGWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PCMPEQBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PCMPEQDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PCMPEQQrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PCMPEQWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PCMPGTBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PCMPGTDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PCMPGTWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PMAXSBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PMAXSDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PMAXSWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PMAXUBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PMAXUDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PMAXUWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PMINSBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PMINSDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PMINSWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PMINUBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PMINUDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PMINUWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PSIGNBrm128")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PSIGNDrm128")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PSIGNWrm128")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PSLLDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PSLLQrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PSLLWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PSRADrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PSRAWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PSRLDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PSRLQrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PSRLWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PSUBSBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PSUBSWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PSUBUSBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PSUBUSWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPABSBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPABSDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPABSWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPADDSBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPADDSWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPADDUSBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPADDUSWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPAVGBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPAVGWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPCMPEQBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPCMPEQDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPCMPEQQrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPCMPEQWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPCMPGTBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPCMPGTDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPCMPGTWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPMAXSBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPMAXSDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPMAXSWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPMAXUBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPMAXUDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPMAXUWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPMINSBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPMINSDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPMINSWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPMINUBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPMINUDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPMINUWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSIGNBrm128")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSIGNDrm128")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSIGNWrm128")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSLLDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSLLQrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSLLVDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSLLVQrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSLLWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSRADrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSRAVDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSRAWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSRLDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSRLQrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSRLVDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSRLVQrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSRLWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSUBSBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSUBSWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSUBUSBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSUBUSWrm")>;
+def: InstRW<[SKLWriteResGroup89], (instregex "VCVTPD2PSYrr",
+ "VCVTPS2PDYrr",
+ "VCVT(T?)PD2DQYrr")>;
def SKLWriteResGroup91 : SchedWriteRes<[SKLPort23,SKLPort015]> {
let Latency = 7;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup91], (instregex "ANDNPDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "ANDNPSrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "ANDPDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "ANDPSrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "BLENDPDrmi")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "BLENDPSrmi")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "ORPDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "ORPSrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "PADDBrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "PADDDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "PADDQrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "PADDWrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "PANDNrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "PANDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "PORrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "PSUBBrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "PSUBDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "PSUBQrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "PSUBWrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "PXORrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VANDNPDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VANDNPSrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VANDPDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VANDPSrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VBLENDPDrmi")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VBLENDPSrmi")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VINSERTF128rm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VINSERTI128rm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VMASKMOVPDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VMASKMOVPSrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VORPDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VORPSrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VPADDBrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VPADDDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VPADDQrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VPADDWrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VPANDNrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VPANDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VPBLENDDrmi")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VPMASKMOVDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VPMASKMOVQrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VPORrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VPSUBBrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VPSUBDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VPSUBQrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VPSUBWrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VPXORrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VXORPDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VXORPSrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "XORPDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "XORPSrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "(V?)INSERTF128rm",
+ "(V?)INSERTI128rm",
+ "(V?)PADD(B|D|Q|W)rm",
+ "(V?)PBLENDDrmi",
+ "(V?)PSUB(B|D|Q|W)rm")>;
def SKLWriteResGroup92 : SchedWriteRes<[SKLPort5,SKLPort23]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKLWriteResGroup92], (instregex "MMX_PACKSSDWirm")>;
-def: InstRW<[SKLWriteResGroup92], (instregex "MMX_PACKSSWBirm")>;
-def: InstRW<[SKLWriteResGroup92], (instregex "MMX_PACKUSWBirm")>;
-
-def SKLWriteResGroup93 : SchedWriteRes<[SKLPort23,SKLPort06]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[SKLWriteResGroup93], (instregex "CMOVA(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup93], (instregex "CMOVBE(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup92], (instregex "MMX_PACKSSDWirm",
+ "MMX_PACKSSWBirm",
+ "MMX_PACKUSWBirm")>;
def SKLWriteResGroup94 : SchedWriteRes<[SKLPort23,SKLPort0156]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[SKLWriteResGroup94], (instregex "LEAVE64")>;
-def: InstRW<[SKLWriteResGroup94], (instregex "SCASB")>;
-def: InstRW<[SKLWriteResGroup94], (instregex "SCASL")>;
-def: InstRW<[SKLWriteResGroup94], (instregex "SCASQ")>;
-def: InstRW<[SKLWriteResGroup94], (instregex "SCASW")>;
+def: InstRW<[SKLWriteResGroup94], (instrs LEAVE, LEAVE64,
+ SCASB, SCASL, SCASQ, SCASW)>;
-def SKLWriteResGroup95 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort015]> {
+def SKLWriteResGroup95 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort01]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup95], (instregex "CVTTSS2SI64rr")>;
-def: InstRW<[SKLWriteResGroup95], (instregex "CVTTSS2SIrr")>;
-def: InstRW<[SKLWriteResGroup95], (instregex "VCVTTSS2SI64rr")>;
-def: InstRW<[SKLWriteResGroup95], (instregex "VCVTTSS2SIrr")>;
+def: InstRW<[SKLWriteResGroup95], (instregex "(V?)CVTTSS2SI(64)?rr")>;
def SKLWriteResGroup96 : SchedWriteRes<[SKLPort0,SKLPort23,SKLPort05]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup96], (instregex "FLDCW16m")>;
-
-def SKLWriteResGroup97 : SchedWriteRes<[SKLPort0,SKLPort23,SKLPort0156]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SKLWriteResGroup97], (instregex "LDMXCSR")>;
-def: InstRW<[SKLWriteResGroup97], (instregex "VLDMXCSR")>;
+def: InstRW<[SKLWriteResGroup96], (instrs FLDCW16m)>;
def SKLWriteResGroup98 : SchedWriteRes<[SKLPort6,SKLPort23,SKLPort0156]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup98], (instregex "LRETQ")>;
-def: InstRW<[SKLWriteResGroup98], (instregex "RETQ")>;
-
-def SKLWriteResGroup99 : SchedWriteRes<[SKLPort23,SKLPort06,SKLPort15]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SKLWriteResGroup99], (instregex "BEXTR(32|64)rm")>;
+def: InstRW<[SKLWriteResGroup98], (instrs LRETQ, RETQ)>;
def SKLWriteResGroup100 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06]> {
let Latency = 7;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,2];
}
-def: InstRW<[SKLWriteResGroup100], (instregex "ROL(16|32|64)m1")>;
-def: InstRW<[SKLWriteResGroup100], (instregex "ROL(16|32|64)mi")>;
-def: InstRW<[SKLWriteResGroup100], (instregex "ROL8m1")>;
-def: InstRW<[SKLWriteResGroup100], (instregex "ROL8mi")>;
-def: InstRW<[SKLWriteResGroup100], (instregex "ROR(16|32|64)m1")>;
-def: InstRW<[SKLWriteResGroup100], (instregex "ROR(16|32|64)mi")>;
-def: InstRW<[SKLWriteResGroup100], (instregex "ROR8m1")>;
-def: InstRW<[SKLWriteResGroup100], (instregex "ROR8mi")>;
+def: InstRW<[SKLWriteResGroup100], (instregex "ROL(8|16|32|64)m1",
+ "ROL(8|16|32|64)mi",
+ "ROR(8|16|32|64)m1",
+ "ROR(8|16|32|64)mi")>;
def SKLWriteResGroup101 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort0156]> {
let Latency = 7;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,2];
}
-def: InstRW<[SKLWriteResGroup101], (instregex "XADD(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup101], (instregex "XADD8rm")>;
+def: InstRW<[SKLWriteResGroup101], (instregex "XADD(8|16|32|64)rm")>;
def SKLWriteResGroup102 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort0156]> {
let Latency = 7;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,1,1];
}
-def: InstRW<[SKLWriteResGroup102], (instregex "CALL(16|32|64)m")>;
-def: InstRW<[SKLWriteResGroup102], (instregex "FARCALL64")>;
+def: InstRW<[SKLWriteResGroup102], (instregex "CALL(16|32|64)m",
+ "FARCALL64")>;
def SKLWriteResGroup103 : SchedWriteRes<[SKLPort6,SKLPort06,SKLPort15,SKLPort0156]> {
let Latency = 7;
let NumMicroOps = 7;
let ResourceCycles = [1,3,1,2];
}
-def: InstRW<[SKLWriteResGroup103], (instregex "LOOP")>;
-
-def SKLWriteResGroup104 : SchedWriteRes<[SKLPort0]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[SKLWriteResGroup104], (instregex "AESIMCrr")>;
-def: InstRW<[SKLWriteResGroup104], (instregex "VAESIMCrr")>;
-
-def SKLWriteResGroup105 : SchedWriteRes<[SKLPort015]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[SKLWriteResGroup105], (instregex "PMULLDrr")>;
-def: InstRW<[SKLWriteResGroup105], (instregex "ROUNDPDr")>;
-def: InstRW<[SKLWriteResGroup105], (instregex "ROUNDPSr")>;
-def: InstRW<[SKLWriteResGroup105], (instregex "ROUNDSDr")>;
-def: InstRW<[SKLWriteResGroup105], (instregex "ROUNDSSr")>;
-def: InstRW<[SKLWriteResGroup105], (instregex "VPMULLDYrr")>;
-def: InstRW<[SKLWriteResGroup105], (instregex "VPMULLDrr")>;
-def: InstRW<[SKLWriteResGroup105], (instregex "VROUNDPDr")>;
-def: InstRW<[SKLWriteResGroup105], (instregex "VROUNDPSr")>;
-def: InstRW<[SKLWriteResGroup105], (instregex "VROUNDSDr")>;
-def: InstRW<[SKLWriteResGroup105], (instregex "VROUNDSSr")>;
-def: InstRW<[SKLWriteResGroup105], (instregex "VROUNDYPDr")>;
-def: InstRW<[SKLWriteResGroup105], (instregex "VROUNDYPSr")>;
-
-def SKLWriteResGroup106 : SchedWriteRes<[SKLPort0,SKLPort23]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup106], (instregex "VTESTPDrm")>;
-def: InstRW<[SKLWriteResGroup106], (instregex "VTESTPSrm")>;
+def: InstRW<[SKLWriteResGroup103], (instrs LOOP)>;
def SKLWriteResGroup107 : SchedWriteRes<[SKLPort1,SKLPort23]> {
let Latency = 8;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup107], (instregex "BSF(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup107], (instregex "BSR(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup107], (instregex "IMUL64m")>;
-def: InstRW<[SKLWriteResGroup107], (instregex "IMUL(32|64)rm(i8)?")>;
-def: InstRW<[SKLWriteResGroup107], (instregex "IMUL8m")>;
-def: InstRW<[SKLWriteResGroup107], (instregex "LZCNT(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup107], (instregex "MUL(16|32|64)m")>;
-def: InstRW<[SKLWriteResGroup107], (instregex "MUL8m")>;
-def: InstRW<[SKLWriteResGroup107], (instregex "PDEP(32|64)rm")>;
-def: InstRW<[SKLWriteResGroup107], (instregex "PEXT(32|64)rm")>;
-def: InstRW<[SKLWriteResGroup107], (instregex "POPCNT(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup107], (instregex "TZCNT(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup107], (instregex "PDEP(32|64)rm",
+ "PEXT(32|64)rm")>;
def SKLWriteResGroup107_16 : SchedWriteRes<[SKLPort1, SKLPort0156, SKLPort23]> {
- let Latency = 3;
+ let Latency = 8;
let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
+ let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup107_16], (instregex "IMUL16rm(i8)?")>;
+def: InstRW<[SKLWriteResGroup107_16], (instrs IMUL16rmi, IMUL16rmi8)>;

-def SKLWriteResGroup107_16_2 : SchedWriteRes<[SKLPort1, SKLPort0156, SKLPort23]> {
- let Latency = 3;
+def SKLWriteResGroup107_16_2 : SchedWriteRes<[SKLPort1, SKLPort06, SKLPort0156, SKLPort23]> {
+ let Latency = 9;
let NumMicroOps = 5;
+ let ResourceCycles = [1,1,2,1];
}
-def: InstRW<[SKLWriteResGroup107_16_2], (instregex "IMUL16m")>;
-def: InstRW<[SKLWriteResGroup107_16_2], (instregex "MUL16m")>;
-
-def SKLWriteResGroup107_32 : SchedWriteRes<[SKLPort1, SKLPort0156, SKLPort23]> {
- let Latency = 3;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SKLWriteResGroup107_32], (instregex "IMUL32m")>;
-def: InstRW<[SKLWriteResGroup107_32], (instregex "MUL32m")>;
+def: InstRW<[SKLWriteResGroup107_16_2], (instrs IMUL16m, MUL16m)>;
def SKLWriteResGroup108 : SchedWriteRes<[SKLPort5,SKLPort23]> {
let Latency = 8;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup108], (instregex "FCOM32m")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "FCOM64m")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "FCOMP32m")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "FCOMP64m")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "MMX_PSADBWirm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPACKSSDWYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPACKSSWBYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPACKUSDWYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPACKUSWBYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPALIGNRYrmi")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPBLENDWYrmi")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPBROADCASTBYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPBROADCASTWYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPERMILPDYmi")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPERMILPDYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPERMILPSYmi")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPERMILPSYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPMOVSXBDYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPMOVSXBQYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPMOVSXWQYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPSHUFBYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPSHUFDYmi")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPSHUFHWYmi")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPSHUFLWYmi")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKHBWYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKHDQYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKHQDQYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKHWDYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKLBWYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKLDQYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKLQDQYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKLWDYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VSHUFPDYrmi")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VSHUFPSYrmi")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VUNPCKHPDYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VUNPCKHPSYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VUNPCKLPDYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VUNPCKLPSYrm")>;
-
-def SKLWriteResGroup109 : SchedWriteRes<[SKLPort01,SKLPort23]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup109], (instregex "VPABSBYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPABSDYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPABSWYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPADDSBYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPADDSWYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPADDUSBYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPADDUSWYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPAVGBYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPAVGWYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPCMPEQBYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPCMPEQDYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPCMPEQQYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPCMPEQWYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPCMPGTBYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPCMPGTDYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPCMPGTWYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPMAXSBYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPMAXSDYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPMAXSWYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPMAXUBYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPMAXUDYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPMAXUWYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPMINSBYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPMINSDYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPMINSWYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPMINUBYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPMINUDYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPMINUWYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSIGNBYrm256")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSIGNDYrm256")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSIGNWYrm256")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSLLDYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSLLQYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSLLVDYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSLLVQYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSLLWYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSRADYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSRAVDYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSRAWYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSRLDYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSRLQYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSRLVDYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSRLVQYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSRLWYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSUBSBYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSUBSWYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSUBUSBYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSUBUSWYrm")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "FCOM(P?)(32|64)m",
+ "VPBROADCASTBYrm",
+ "VPBROADCASTWYrm",
+ "VPMOVSXBDYrm",
+ "VPMOVSXBQYrm",
+ "VPMOVSXWQYrm")>;
def SKLWriteResGroup110 : SchedWriteRes<[SKLPort23,SKLPort015]> {
let Latency = 8;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup110], (instregex "VANDNPDYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VANDNPSYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VANDPDYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VANDPSYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VBLENDPDYrmi")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VBLENDPSYrmi")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VMASKMOVPDYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VMASKMOVPSYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VORPDYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VORPSYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPADDBYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPADDDYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPADDQYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPADDWYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPANDNYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPANDYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPBLENDDYrmi")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPMASKMOVDYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPMASKMOVQYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPORYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPSUBBYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPSUBDYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPSUBQYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPSUBWYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPXORYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VXORPDYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VXORPSYrm")>;
-
-def SKLWriteResGroup111 : SchedWriteRes<[SKLPort23,SKLPort015]> {
- let Latency = 8;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[SKLWriteResGroup111], (instregex "BLENDVPDrm0")>;
-def: InstRW<[SKLWriteResGroup111], (instregex "BLENDVPSrm0")>;
-def: InstRW<[SKLWriteResGroup111], (instregex "PBLENDVBrm0")>;
-def: InstRW<[SKLWriteResGroup111], (instregex "VBLENDVPDrm")>;
-def: InstRW<[SKLWriteResGroup111], (instregex "VBLENDVPSrm")>;
-def: InstRW<[SKLWriteResGroup111], (instregex "VPBLENDVBYrm")>;
-def: InstRW<[SKLWriteResGroup111], (instregex "VPBLENDVBrm")>;
+def: InstRW<[SKLWriteResGroup110], (instregex "VPADD(B|D|Q|W)Yrm",
+ "VPBLENDDYrmi",
+ "VPSUB(B|D|Q|W)Yrm")>;
def SKLWriteResGroup112 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
let Latency = 8;
let NumMicroOps = 4;
let ResourceCycles = [1,2,1];
}
-def: InstRW<[SKLWriteResGroup112], (instregex "MMX_PHADDSWrm64")>;
-def: InstRW<[SKLWriteResGroup112], (instregex "MMX_PHSUBSWrm64")>;
-
-def SKLWriteResGroup113 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort05]> {
- let Latency = 8;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SKLWriteResGroup113], (instregex "MMX_PHADDWrm64")>;
-def: InstRW<[SKLWriteResGroup113], (instregex "MMX_PHADDrm64")>;
-def: InstRW<[SKLWriteResGroup113], (instregex "MMX_PHSUBDrm64")>;
-def: InstRW<[SKLWriteResGroup113], (instregex "MMX_PHSUBWrm64")>;
-
-def SKLWriteResGroup114 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort237,SKLPort015]> {
- let Latency = 8;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,1,1];
-}
-def: InstRW<[SKLWriteResGroup114], (instregex "VCVTPS2PHYmr")>;
+def: InstRW<[SKLWriteResGroup112], (instregex "MMX_PH(ADD|SUB)SWrm")>;
def SKLWriteResGroup115 : SchedWriteRes<[SKLPort23,SKLPort237,SKLPort06]> {
let Latency = 8;
let NumMicroOps = 5;
let ResourceCycles = [1,1,3];
}
-def: InstRW<[SKLWriteResGroup115], (instregex "ROR(16|32|64)mCL")>;
-def: InstRW<[SKLWriteResGroup115], (instregex "ROR8mCL")>;
+def: InstRW<[SKLWriteResGroup115], (instregex "ROR(8|16|32|64)mCL")>;
def SKLWriteResGroup116 : SchedWriteRes<[SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> {
let Latency = 8;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,2];
}
-def: InstRW<[SKLWriteResGroup116], (instregex "RCL(16|32|64)m1")>;
-def: InstRW<[SKLWriteResGroup116], (instregex "RCL(16|32|64)mi")>;
-def: InstRW<[SKLWriteResGroup116], (instregex "RCL8m1")>;
-def: InstRW<[SKLWriteResGroup116], (instregex "RCL8mi")>;
-def: InstRW<[SKLWriteResGroup116], (instregex "RCR(16|32|64)m1")>;
-def: InstRW<[SKLWriteResGroup116], (instregex "RCR(16|32|64)mi")>;
-def: InstRW<[SKLWriteResGroup116], (instregex "RCR8m1")>;
-def: InstRW<[SKLWriteResGroup116], (instregex "RCR8mi")>;
+def: InstRW<[SKLWriteResGroup116], (instregex "RCL(8|16|32|64)m1",
+ "RCL(8|16|32|64)mi",
+ "RCR(8|16|32|64)m1",
+ "RCR(8|16|32|64)mi")>;
def SKLWriteResGroup117 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06]> {
let Latency = 8;
let NumMicroOps = 6;
let ResourceCycles = [1,1,1,3];
}
-def: InstRW<[SKLWriteResGroup117], (instregex "ROL(16|32|64)mCL")>;
-def: InstRW<[SKLWriteResGroup117], (instregex "ROL8mCL")>;
-def: InstRW<[SKLWriteResGroup117], (instregex "SAR(16|32|64)mCL")>;
-def: InstRW<[SKLWriteResGroup117], (instregex "SAR8mCL")>;
-def: InstRW<[SKLWriteResGroup117], (instregex "SHL(16|32|64)mCL")>;
-def: InstRW<[SKLWriteResGroup117], (instregex "SHL8mCL")>;
-def: InstRW<[SKLWriteResGroup117], (instregex "SHR(16|32|64)mCL")>;
-def: InstRW<[SKLWriteResGroup117], (instregex "SHR8mCL")>;
-
-def SKLWriteResGroup118 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort0156]> {
- let Latency = 8;
- let NumMicroOps = 6;
- let ResourceCycles = [1,1,1,3];
-}
-def: InstRW<[SKLWriteResGroup118], (instregex "ADC(16|32|64)mi")>;
-def: InstRW<[SKLWriteResGroup118], (instregex "ADC8mi")>;
+def: InstRW<[SKLWriteResGroup117], (instregex "ROL(8|16|32|64)mCL",
+ "SAR(8|16|32|64)mCL",
+ "SHL(8|16|32|64)mCL",
+ "SHR(8|16|32|64)mCL")>;
def SKLWriteResGroup119 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> {
let Latency = 8;
let NumMicroOps = 6;
let ResourceCycles = [1,1,1,2,1];
}
-def: InstRW<[SKLWriteResGroup119], (instregex "ADC(16|32|64)mr")>;
-def: InstRW<[SKLWriteResGroup119], (instregex "ADC8mr")>;
-def: InstRW<[SKLWriteResGroup119], (instregex "CMPXCHG(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup119], (instregex "CMPXCHG8rm")>;
-def: InstRW<[SKLWriteResGroup119], (instregex "SBB(16|32|64)mi")>;
-def: InstRW<[SKLWriteResGroup119], (instregex "SBB(16|32|64)mr")>;
-def: InstRW<[SKLWriteResGroup119], (instregex "SBB8mi")>;
-def: InstRW<[SKLWriteResGroup119], (instregex "SBB8mr")>;
+def: SchedAlias<WriteADCRMW, SKLWriteResGroup119>;
+def: InstRW<[SKLWriteResGroup119], (instregex "CMPXCHG(8|16|32|64)rm")>;
def SKLWriteResGroup120 : SchedWriteRes<[SKLPort0,SKLPort23]> {
let Latency = 9;
@@ -2929,280 +1357,75 @@ def SKLWriteResGroup120 : SchedWriteRes<[SKLPort0,SKLPort23]> {
let ResourceCycles = [1,1];
}
def: InstRW<[SKLWriteResGroup120], (instregex "MMX_CVTPI2PSirm")>;
-def: InstRW<[SKLWriteResGroup120], (instregex "MMX_PMADDUBSWrm64")>;
-def: InstRW<[SKLWriteResGroup120], (instregex "MMX_PMADDWDirm")>;
-def: InstRW<[SKLWriteResGroup120], (instregex "MMX_PMULHRSWrm64")>;
-def: InstRW<[SKLWriteResGroup120], (instregex "MMX_PMULHUWirm")>;
-def: InstRW<[SKLWriteResGroup120], (instregex "MMX_PMULHWirm")>;
-def: InstRW<[SKLWriteResGroup120], (instregex "MMX_PMULLWirm")>;
-def: InstRW<[SKLWriteResGroup120], (instregex "MMX_PMULUDQirm")>;
-def: InstRW<[SKLWriteResGroup120], (instregex "RCPSSm")>;
-def: InstRW<[SKLWriteResGroup120], (instregex "RSQRTSSm")>;
-def: InstRW<[SKLWriteResGroup120], (instregex "VRCPSSm")>;
-def: InstRW<[SKLWriteResGroup120], (instregex "VRSQRTSSm")>;
-def: InstRW<[SKLWriteResGroup120], (instregex "VTESTPDYrm")>;
-def: InstRW<[SKLWriteResGroup120], (instregex "VTESTPSYrm")>;
def SKLWriteResGroup121 : SchedWriteRes<[SKLPort5,SKLPort23]> {
let Latency = 9;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup121], (instregex "PCMPGTQrm")>;
-def: InstRW<[SKLWriteResGroup121], (instregex "PSADBWrm")>;
-def: InstRW<[SKLWriteResGroup121], (instregex "VPCMPGTQrm")>;
-def: InstRW<[SKLWriteResGroup121], (instregex "VPMOVSXBWYrm")>;
-def: InstRW<[SKLWriteResGroup121], (instregex "VPMOVSXDQYrm")>;
-def: InstRW<[SKLWriteResGroup121], (instregex "VPMOVSXWDYrm")>;
-def: InstRW<[SKLWriteResGroup121], (instregex "VPMOVZXWDYrm")>;
-def: InstRW<[SKLWriteResGroup121], (instregex "VPSADBWrm")>;
+def: InstRW<[SKLWriteResGroup121], (instregex "(V?)PCMPGTQrm",
+ "VPMOVSXBWYrm",
+ "VPMOVSXDQYrm",
+ "VPMOVSXWDYrm",
+                                              "VPMOVZXWDYrm")>;

-def SKLWriteResGroup122 : SchedWriteRes<[SKLPort01,SKLPort23]> {
- let Latency = 9;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup122], (instregex "ADDSDrm")>;
-def: InstRW<[SKLWriteResGroup122], (instregex "ADDSSrm")>;
-def: InstRW<[SKLWriteResGroup122], (instregex "MULSDrm")>;
-def: InstRW<[SKLWriteResGroup122], (instregex "MULSSrm")>;
-def: InstRW<[SKLWriteResGroup122], (instregex "SUBSDrm")>;
-def: InstRW<[SKLWriteResGroup122], (instregex "SUBSSrm")>;
-def: InstRW<[SKLWriteResGroup122], (instregex "VADDSDrm")>;
-def: InstRW<[SKLWriteResGroup122], (instregex "VADDSSrm")>;
-def: InstRW<[SKLWriteResGroup122],
- (instregex "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)m")>;
-def: InstRW<[SKLWriteResGroup122], (instregex "VMULSDrm")>;
-def: InstRW<[SKLWriteResGroup122], (instregex "VMULSSrm")>;
-def: InstRW<[SKLWriteResGroup122], (instregex "VSUBSDrm")>;
-def: InstRW<[SKLWriteResGroup122], (instregex "VSUBSSrm")>;
-
-def SKLWriteResGroup123 : SchedWriteRes<[SKLPort23,SKLPort015]> {
+def SKLWriteResGroup123 : SchedWriteRes<[SKLPort23,SKLPort01]> {
let Latency = 9;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup123], (instregex "CMPSDrm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "CMPSSrm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "CVTPS2PDrm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "MAX(C?)SDrm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "MAX(C?)SSrm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "MIN(C?)SDrm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "MIN(C?)SSrm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "MMX_CVTPS2PIirm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "MMX_CVTTPS2PIirm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "VCMPSDrm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "VCMPSSrm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "VCVTPH2PSrm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "VCVTPS2PDrm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "VMAX(C?)SDrm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "VMAX(C?)SSrm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "VMIN(C?)SDrm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "VMIN(C?)SSrm")>;
-
-def SKLWriteResGroup124 : SchedWriteRes<[SKLPort5,SKLPort015]> {
- let Latency = 9;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[SKLWriteResGroup124], (instregex "DPPDrri")>;
-def: InstRW<[SKLWriteResGroup124], (instregex "VDPPDrri")>;
-
-def SKLWriteResGroup125 : SchedWriteRes<[SKLPort23,SKLPort015]> {
- let Latency = 9;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[SKLWriteResGroup125], (instregex "VBLENDVPDYrm")>;
-def: InstRW<[SKLWriteResGroup125], (instregex "VBLENDVPSYrm")>;
-
-def SKLWriteResGroup126 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
- let Latency = 9;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SKLWriteResGroup126], (instregex "PTESTrm")>;
-def: InstRW<[SKLWriteResGroup126], (instregex "VPTESTrm")>;
+def: InstRW<[SKLWriteResGroup123], (instregex "MMX_CVT(T?)PS2PIirm",
+ "(V?)CVTPS2PDrm")>;
def SKLWriteResGroup127 : SchedWriteRes<[SKLPort1,SKLPort5,SKLPort23]> {
let Latency = 9;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup127], (instregex "MULX64rm")>;
+def: InstRW<[SKLWriteResGroup127], (instrs IMUL64m, MUL64m, MULX64rm)>;
def SKLWriteResGroup128 : SchedWriteRes<[SKLPort5,SKLPort01,SKLPort23]> {
let Latency = 9;
let NumMicroOps = 4;
let ResourceCycles = [2,1,1];
}
-def: InstRW<[SKLWriteResGroup128], (instregex "PHADDSWrm128")>;
-def: InstRW<[SKLWriteResGroup128], (instregex "PHSUBSWrm128")>;
-def: InstRW<[SKLWriteResGroup128], (instregex "VPHADDSWrm128")>;
-def: InstRW<[SKLWriteResGroup128], (instregex "VPHSUBSWrm128")>;
-
-def SKLWriteResGroup129 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> {
- let Latency = 9;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SKLWriteResGroup129], (instregex "PHADDDrm")>;
-def: InstRW<[SKLWriteResGroup129], (instregex "PHADDWrm")>;
-def: InstRW<[SKLWriteResGroup129], (instregex "PHSUBDrm")>;
-def: InstRW<[SKLWriteResGroup129], (instregex "PHSUBWrm")>;
-def: InstRW<[SKLWriteResGroup129], (instregex "VPHADDDrm")>;
-def: InstRW<[SKLWriteResGroup129], (instregex "VPHADDWrm")>;
-def: InstRW<[SKLWriteResGroup129], (instregex "VPHSUBDrm")>;
-def: InstRW<[SKLWriteResGroup129], (instregex "VPHSUBWrm")>;
+def: InstRW<[SKLWriteResGroup128], (instregex "(V?)PHADDSWrm",
+ "(V?)PHSUBSWrm")>;
def SKLWriteResGroup130 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort237,SKLPort0156]> {
let Latency = 9;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[SKLWriteResGroup130], (instregex "SHLD(16|32|64)mri8")>;
-def: InstRW<[SKLWriteResGroup130], (instregex "SHRD(16|32|64)mri8")>;
+def: InstRW<[SKLWriteResGroup130], (instregex "SHLD(16|32|64)mri8",
+ "SHRD(16|32|64)mri8")>;
def SKLWriteResGroup131 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort23,SKLPort0156]> {
let Latency = 9;
let NumMicroOps = 5;
let ResourceCycles = [1,2,1,1];
}
-def: InstRW<[SKLWriteResGroup131], (instregex "LAR(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup131], (instregex "LSL(16|32|64)rm")>;
-
-def SKLWriteResGroup132 : SchedWriteRes<[SKLPort0,SKLPort23]> {
- let Latency = 10;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup132], (instregex "AESDECLASTrm")>;
-def: InstRW<[SKLWriteResGroup132], (instregex "AESDECrm")>;
-def: InstRW<[SKLWriteResGroup132], (instregex "AESENCLASTrm")>;
-def: InstRW<[SKLWriteResGroup132], (instregex "AESENCrm")>;
-def: InstRW<[SKLWriteResGroup132], (instregex "RCPPSm")>;
-def: InstRW<[SKLWriteResGroup132], (instregex "RSQRTPSm")>;
-def: InstRW<[SKLWriteResGroup132], (instregex "VAESDECLASTrm")>;
-def: InstRW<[SKLWriteResGroup132], (instregex "VAESDECrm")>;
-def: InstRW<[SKLWriteResGroup132], (instregex "VAESENCLASTrm")>;
-def: InstRW<[SKLWriteResGroup132], (instregex "VAESENCrm")>;
-def: InstRW<[SKLWriteResGroup132], (instregex "VRCPPSm")>;
-def: InstRW<[SKLWriteResGroup132], (instregex "VRSQRTPSm")>;
+def: InstRW<[SKLWriteResGroup131], (instregex "LAR(16|32|64)rm",
+ "LSL(16|32|64)rm")>;
def SKLWriteResGroup133 : SchedWriteRes<[SKLPort5,SKLPort23]> {
let Latency = 10;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup133], (instregex "ADD_F32m")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "ADD_F64m")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "ILD_F16m")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "ILD_F32m")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "ILD_F64m")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "SUBR_F32m")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "SUBR_F64m")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "SUB_F32m")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "SUB_F64m")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "VPCMPGTQYrm")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "VPERM2F128rm")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "VPERM2I128rm")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "VPERMDYrm")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "VPERMPDYmi")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "VPERMPSYrm")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "VPERMQYmi")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "VPMOVZXBDYrm")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "VPMOVZXBQYrm")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "VPMOVZXBWYrm")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "VPMOVZXDQYrm")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "VPMOVZXWQYrm")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "VPSADBWYrm")>;
+def: InstRW<[SKLWriteResGroup133], (instregex "(ADD|SUB|SUBR)_F(32|64)m",
+ "ILD_F(16|32|64)m",
+ "VPCMPGTQYrm")>;
def SKLWriteResGroup134 : SchedWriteRes<[SKLPort01,SKLPort23]> {
let Latency = 10;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup134], (instregex "ADDPDrm")>;
-def: InstRW<[SKLWriteResGroup134], (instregex "ADDPSrm")>;
-def: InstRW<[SKLWriteResGroup134], (instregex "ADDSUBPDrm")>;
-def: InstRW<[SKLWriteResGroup134], (instregex "ADDSUBPSrm")>;
-def: InstRW<[SKLWriteResGroup134], (instregex "MULPDrm")>;
-def: InstRW<[SKLWriteResGroup134], (instregex "MULPSrm")>;
-def: InstRW<[SKLWriteResGroup134], (instregex "SUBPDrm")>;
-def: InstRW<[SKLWriteResGroup134], (instregex "SUBPSrm")>;
-def: InstRW<[SKLWriteResGroup134], (instregex "VADDPDrm")>;
-def: InstRW<[SKLWriteResGroup134], (instregex "VADDPSrm")>;
-def: InstRW<[SKLWriteResGroup134], (instregex "VADDSUBPDrm")>;
-def: InstRW<[SKLWriteResGroup134], (instregex "VADDSUBPSrm")>;
-def: InstRW<[SKLWriteResGroup134],
- (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)m")>;
-def: InstRW<[SKLWriteResGroup134], (instregex "VMULPDrm")>;
-def: InstRW<[SKLWriteResGroup134], (instregex "VMULPSrm")>;
-def: InstRW<[SKLWriteResGroup134], (instregex "VSUBPDrm")>;
-def: InstRW<[SKLWriteResGroup134], (instregex "VSUBPSrm")>;
-
-def SKLWriteResGroup135 : SchedWriteRes<[SKLPort23,SKLPort015]> {
- let Latency = 10;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup135], (instregex "CMPPDrmi")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "CMPPSrmi")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "CVTDQ2PSrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "CVTPS2DQrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "CVTSS2SDrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "CVTTPS2DQrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "MAX(C?)PDrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "MAX(C?)PSrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "MIN(C?)PDrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "MIN(C?)PSrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "PHMINPOSUWrm128")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "PMADDUBSWrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "PMADDWDrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "PMULDQrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "PMULHRSWrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "PMULHUWrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "PMULHWrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "PMULLWrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "PMULUDQrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VCMPPDrmi")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VCMPPSrmi")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VCVTDQ2PSrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VCVTPH2PSYrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VCVTPS2DQrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VCVTSS2SDrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VCVTTPS2DQrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VMAX(C?)PDrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VMAX(C?)PSrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VMIN(C?)PDrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VMIN(C?)PSrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VPHMINPOSUWrm128")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VPMADDUBSWrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VPMADDWDrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VPMULDQrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VPMULHRSWrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VPMULHUWrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VPMULHWrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VPMULLWrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VPMULUDQrm")>;
-
-def SKLWriteResGroup136 : SchedWriteRes<[SKLPort0]> {
- let Latency = 10;
- let NumMicroOps = 3;
- let ResourceCycles = [3];
-}
-def: InstRW<[SKLWriteResGroup136], (instregex "PCMPISTRIrr")>;
-def: InstRW<[SKLWriteResGroup136], (instregex "PCMPISTRM128rr")>;
-def: InstRW<[SKLWriteResGroup136], (instregex "VPCMPISTRIrr")>;
-def: InstRW<[SKLWriteResGroup136], (instregex "VPCMPISTRM128rr")>;
-
-def SKLWriteResGroup137 : SchedWriteRes<[SKLPort5,SKLPort23]> {
- let Latency = 10;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[SKLWriteResGroup137], (instregex "MPSADBWrmi")>;
-def: InstRW<[SKLWriteResGroup137], (instregex "VMPSADBWrmi")>;
+def: InstRW<[SKLWriteResGroup134], (instregex "(V?)CVTDQ2PSrm",
+ "(V?)CVTPS2DQrm",
+ "(V?)CVTSS2SDrm",
+ "(V?)CVTTPS2DQrm")>;
def SKLWriteResGroup138 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
let Latency = 10;
@@ -3210,188 +1433,107 @@ def SKLWriteResGroup138 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
let ResourceCycles = [1,1,1];
}
def: InstRW<[SKLWriteResGroup138], (instregex "MMX_CVTPI2PDirm")>;
-def: InstRW<[SKLWriteResGroup138], (instregex "VPTESTYrm")>;
-def SKLWriteResGroup139 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> {
+def SKLWriteResGroup139 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort01]> {
let Latency = 10;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup139], (instregex "CVTSD2SSrm")>;
-def: InstRW<[SKLWriteResGroup139], (instregex "VCVTSD2SSrm")>;
+def: InstRW<[SKLWriteResGroup139], (instregex "(V?)CVTSD2SSrm")>;
def SKLWriteResGroup140 : SchedWriteRes<[SKLPort5,SKLPort01,SKLPort23]> {
let Latency = 10;
let NumMicroOps = 4;
let ResourceCycles = [2,1,1];
}
-def: InstRW<[SKLWriteResGroup140], (instregex "VPHADDSWrm256")>;
-def: InstRW<[SKLWriteResGroup140], (instregex "VPHSUBSWrm256")>;
-
-def SKLWriteResGroup141 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> {
- let Latency = 10;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SKLWriteResGroup141], (instregex "VPHADDDYrm")>;
-def: InstRW<[SKLWriteResGroup141], (instregex "VPHADDWYrm")>;
-def: InstRW<[SKLWriteResGroup141], (instregex "VPHSUBDYrm")>;
-def: InstRW<[SKLWriteResGroup141], (instregex "VPHSUBWYrm")>;
+def: InstRW<[SKLWriteResGroup140], (instregex "VPHADDSWYrm",
+ "VPHSUBSWYrm")>;
def SKLWriteResGroup142 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort06,SKLPort0156]> {
- let Latency = 10;
+ let Latency = 9;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[SKLWriteResGroup142], (instregex "MULX32rm")>;
+def: InstRW<[SKLWriteResGroup142], (instrs IMUL32m, MUL32m, MULX32rm)>;
def SKLWriteResGroup143 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> {
let Latency = 10;
let NumMicroOps = 8;
let ResourceCycles = [1,1,1,1,1,3];
}
-def: InstRW<[SKLWriteResGroup143], (instregex "ADD8mi")>;
-def: InstRW<[SKLWriteResGroup143], (instregex "AND8mi")>;
-def: InstRW<[SKLWriteResGroup143], (instregex "OR8mi")>;
-def: InstRW<[SKLWriteResGroup143], (instregex "SUB8mi")>;
-def: InstRW<[SKLWriteResGroup143], (instregex "XCHG(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup143], (instregex "XCHG8rm")>;
-def: InstRW<[SKLWriteResGroup143], (instregex "XOR8mi")>;
-
-def SKLWriteResGroup144 : SchedWriteRes<[SKLPort05,SKLPort0156]> {
- let Latency = 10;
- let NumMicroOps = 10;
- let ResourceCycles = [9,1];
-}
-def: InstRW<[SKLWriteResGroup144], (instregex "MMX_EMMS")>;
+def: InstRW<[SKLWriteResGroup143], (instregex "XCHG(8|16|32|64)rm")>;

-def SKLWriteResGroup145 : SchedWriteRes<[SKLPort0]> {
+def SKLWriteResGroup145 : SchedWriteRes<[SKLPort0,SKLFPDivider]> {
let Latency = 11;
let NumMicroOps = 1;
- let ResourceCycles = [1];
+ let ResourceCycles = [1,3];
}
-def: InstRW<[SKLWriteResGroup145], (instregex "DIVPSrr")>;
-def: InstRW<[SKLWriteResGroup145], (instregex "DIVSSrr")>;
-def: InstRW<[SKLWriteResGroup145], (instregex "VDIVPSYrr")>;
-def: InstRW<[SKLWriteResGroup145], (instregex "VDIVPSrr")>;
-def: InstRW<[SKLWriteResGroup145], (instregex "VDIVSSrr")>;
+def : SchedAlias<WriteFDivX, SKLWriteResGroup145>; // TODO - convert to ZnWriteResFpuPair
def SKLWriteResGroup146 : SchedWriteRes<[SKLPort0,SKLPort23]> {
let Latency = 11;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup146], (instregex "MUL_F32m")>;
-def: InstRW<[SKLWriteResGroup146], (instregex "MUL_F64m")>;
-def: InstRW<[SKLWriteResGroup146], (instregex "VRCPPSYm")>;
-def: InstRW<[SKLWriteResGroup146], (instregex "VRSQRTPSYm")>;
+def: InstRW<[SKLWriteResGroup146], (instregex "MUL_F(32|64)m")>;
def SKLWriteResGroup147 : SchedWriteRes<[SKLPort01,SKLPort23]> {
let Latency = 11;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup147], (instregex "VADDPDYrm")>;
-def: InstRW<[SKLWriteResGroup147], (instregex "VADDPSYrm")>;
-def: InstRW<[SKLWriteResGroup147], (instregex "VADDSUBPDYrm")>;
-def: InstRW<[SKLWriteResGroup147], (instregex "VADDSUBPSYrm")>;
-def: InstRW<[SKLWriteResGroup147],
- (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Ym")>;
-def: InstRW<[SKLWriteResGroup147], (instregex "VMULPDYrm")>;
-def: InstRW<[SKLWriteResGroup147], (instregex "VMULPSYrm")>;
-def: InstRW<[SKLWriteResGroup147], (instregex "VSUBPDYrm")>;
-def: InstRW<[SKLWriteResGroup147], (instregex "VSUBPSYrm")>;
-
-def SKLWriteResGroup148 : SchedWriteRes<[SKLPort23,SKLPort015]> {
- let Latency = 11;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup148], (instregex "VCMPPDYrmi")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VCMPPSYrmi")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VCVTDQ2PSYrm")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VCVTPS2DQYrm")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VCVTPS2PDYrm")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VCVTTPS2DQYrm")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VMAX(C?)PDYrm")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VMAX(C?)PSYrm")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VMIN(C?)PDYrm")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VMIN(C?)PSYrm")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VPMADDUBSWYrm")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VPMADDWDYrm")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VPMULDQYrm")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VPMULHRSWYrm")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VPMULHUWYrm")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VPMULHWYrm")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VPMULLWYrm")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VPMULUDQYrm")>;
+def: InstRW<[SKLWriteResGroup147], (instregex "VCVTDQ2PSYrm",
+ "VCVTPS2PDYrm",
+ "VCVT(T?)PS2DQYrm")>;
def SKLWriteResGroup149 : SchedWriteRes<[SKLPort5,SKLPort23]> {
let Latency = 11;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKLWriteResGroup149], (instregex "FICOM16m")>;
-def: InstRW<[SKLWriteResGroup149], (instregex "FICOM32m")>;
-def: InstRW<[SKLWriteResGroup149], (instregex "FICOMP16m")>;
-def: InstRW<[SKLWriteResGroup149], (instregex "FICOMP32m")>;
-def: InstRW<[SKLWriteResGroup149], (instregex "VMPSADBWYrmi")>;
+def: InstRW<[SKLWriteResGroup149], (instregex "FICOM(P?)(16|32)m")>;
def SKLWriteResGroup150 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
let Latency = 11;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup150], (instregex "CVTDQ2PDrm")>;
-def: InstRW<[SKLWriteResGroup150], (instregex "VCVTDQ2PDrm")>;
+def: InstRW<[SKLWriteResGroup150], (instregex "(V?)CVTDQ2PDrm")>;

-def SKLWriteResGroup151 : SchedWriteRes<[SKLPort0,SKLPort23,SKLPort015]> {
+def SKLWriteResGroup151 : SchedWriteRes<[SKLPort0,SKLPort23,SKLPort01]> {
let Latency = 11;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup151], (instregex "CVTSD2SI64rm")>;
-def: InstRW<[SKLWriteResGroup151], (instregex "CVTSD2SIrm")>;
-def: InstRW<[SKLWriteResGroup151], (instregex "CVTSS2SI64rm")>;
-def: InstRW<[SKLWriteResGroup151], (instregex "CVTSS2SIrm")>;
-def: InstRW<[SKLWriteResGroup151], (instregex "CVTTSD2SI64rm")>;
-def: InstRW<[SKLWriteResGroup151], (instregex "CVTTSD2SIrm")>;
-def: InstRW<[SKLWriteResGroup151], (instregex "CVTTSS2SIrm")>;
-def: InstRW<[SKLWriteResGroup151], (instregex "VCVTSD2SI64rm")>;
-def: InstRW<[SKLWriteResGroup151], (instregex "VCVTSD2SIrm")>;
-def: InstRW<[SKLWriteResGroup151], (instregex "VCVTSS2SI64rm")>;
-def: InstRW<[SKLWriteResGroup151], (instregex "VCVTSS2SIrm")>;
-def: InstRW<[SKLWriteResGroup151], (instregex "VCVTTSD2SI64rm")>;
-def: InstRW<[SKLWriteResGroup151], (instregex "VCVTTSD2SIrm")>;
-def: InstRW<[SKLWriteResGroup151], (instregex "VCVTTSS2SI64rm")>;
-def: InstRW<[SKLWriteResGroup151], (instregex "VCVTTSS2SIrm")>;
-
-def SKLWriteResGroup152 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> {
+def: InstRW<[SKLWriteResGroup151], (instregex "(V?)CVTSS2SI64rm",
+ "(V?)CVT(T?)SD2SI(64)?rm",
+ "VCVTTSS2SI64rm",
+ "(V?)CVT(T?)SS2SIrm")>;
+
+def SKLWriteResGroup152 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort01]> {
let Latency = 11;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup152], (instregex "CVTPD2DQrm")>;
-def: InstRW<[SKLWriteResGroup152], (instregex "CVTPD2PSrm")>;
-def: InstRW<[SKLWriteResGroup152], (instregex "CVTTPD2DQrm")>;
-def: InstRW<[SKLWriteResGroup152], (instregex "MMX_CVTPD2PIirm")>;
-def: InstRW<[SKLWriteResGroup152], (instregex "MMX_CVTTPD2PIirm")>;
+def: InstRW<[SKLWriteResGroup152], (instregex "CVTPD2PSrm",
+ "CVT(T?)PD2DQrm",
+ "MMX_CVT(T?)PD2PIirm")>;
def SKLWriteResGroup153 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> {
let Latency = 11;
let NumMicroOps = 6;
let ResourceCycles = [1,1,1,2,1];
}
-def: InstRW<[SKLWriteResGroup153], (instregex "SHLD(16|32|64)mrCL")>;
-def: InstRW<[SKLWriteResGroup153], (instregex "SHRD(16|32|64)mrCL")>;
+def: InstRW<[SKLWriteResGroup153], (instregex "SHLD(16|32|64)mrCL",
+ "SHRD(16|32|64)mrCL")>;
def SKLWriteResGroup154 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> {
let Latency = 11;
let NumMicroOps = 7;
let ResourceCycles = [2,3,2];
}
-def: InstRW<[SKLWriteResGroup154], (instregex "RCL(16|32|64)rCL")>;
-def: InstRW<[SKLWriteResGroup154], (instregex "RCR(16|32|64)rCL")>;
+def: InstRW<[SKLWriteResGroup154], (instregex "RCL(16|32|64)rCL",
+ "RCR(16|32|64)rCL")>;
def SKLWriteResGroup155 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort15,SKLPort0156]> {
let Latency = 11;
@@ -3405,66 +1547,21 @@ def SKLWriteResGroup156 : SchedWriteRes<[SKLPort06,SKLPort0156]> {
let NumMicroOps = 11;
let ResourceCycles = [2,9];
}
-def: InstRW<[SKLWriteResGroup156], (instregex "LOOPE")>;
-def: InstRW<[SKLWriteResGroup156], (instregex "LOOPNE")>;
-
-def SKLWriteResGroup157 : SchedWriteRes<[SKLPort0]> {
- let Latency = 12;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKLWriteResGroup157], (instregex "VSQRTPSYr")>;
-def: InstRW<[SKLWriteResGroup157], (instregex "VSQRTPSr")>;
-def: InstRW<[SKLWriteResGroup157], (instregex "VSQRTSSr")>;
-
-def SKLWriteResGroup158 : SchedWriteRes<[SKLPort5,SKLPort23]> {
- let Latency = 12;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup158], (instregex "PCLMULQDQrm")>;
-def: InstRW<[SKLWriteResGroup158], (instregex "VPCLMULQDQrm")>;
+def: InstRW<[SKLWriteResGroup156], (instrs LOOPE, LOOPNE)>;

-def SKLWriteResGroup159 : SchedWriteRes<[SKLPort5,SKLPort01,SKLPort23]> {
- let Latency = 12;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SKLWriteResGroup159], (instregex "HADDPDrm")>;
-def: InstRW<[SKLWriteResGroup159], (instregex "HADDPSrm")>;
-def: InstRW<[SKLWriteResGroup159], (instregex "HSUBPDrm")>;
-def: InstRW<[SKLWriteResGroup159], (instregex "HSUBPSrm")>;
-def: InstRW<[SKLWriteResGroup159], (instregex "VHADDPDrm")>;
-def: InstRW<[SKLWriteResGroup159], (instregex "VHADDPSrm")>;
-def: InstRW<[SKLWriteResGroup159], (instregex "VHSUBPDrm")>;
-def: InstRW<[SKLWriteResGroup159], (instregex "VHSUBPSrm")>;
-
-def SKLWriteResGroup160 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23,SKLPort015]> {
+def SKLWriteResGroup160 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23,SKLPort01]> {
let Latency = 12;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
def: InstRW<[SKLWriteResGroup160], (instregex "CVTTSS2SI64rm")>;
-def SKLWriteResGroup161 : SchedWriteRes<[SKLPort0]> {
- let Latency = 13;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKLWriteResGroup161], (instregex "SQRTPSr")>;
-def: InstRW<[SKLWriteResGroup161], (instregex "SQRTSSr")>;
-
def SKLWriteResGroup162 : SchedWriteRes<[SKLPort5,SKLPort23]> {
let Latency = 13;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKLWriteResGroup162], (instregex "ADD_FI16m")>;
-def: InstRW<[SKLWriteResGroup162], (instregex "ADD_FI32m")>;
-def: InstRW<[SKLWriteResGroup162], (instregex "SUBR_FI16m")>;
-def: InstRW<[SKLWriteResGroup162], (instregex "SUBR_FI32m")>;
-def: InstRW<[SKLWriteResGroup162], (instregex "SUB_FI16m")>;
-def: InstRW<[SKLWriteResGroup162], (instregex "SUB_FI32m")>;
+def: InstRW<[SKLWriteResGroup162], (instregex "(ADD|SUB|SUBR)_FI(16|32)m")>;
def SKLWriteResGroup163 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
let Latency = 13;
@@ -3473,67 +1570,27 @@ def SKLWriteResGroup163 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
}
def: InstRW<[SKLWriteResGroup163], (instregex "VCVTDQ2PDYrm")>;
-def SKLWriteResGroup164 : SchedWriteRes<[SKLPort5,SKLPort015]> {
- let Latency = 13;
- let NumMicroOps = 4;
- let ResourceCycles = [1,3];
-}
-def: InstRW<[SKLWriteResGroup164], (instregex "DPPSrri")>;
-def: InstRW<[SKLWriteResGroup164], (instregex "VDPPSYrri")>;
-def: InstRW<[SKLWriteResGroup164], (instregex "VDPPSrri")>;
-
-def SKLWriteResGroup165 : SchedWriteRes<[SKLPort5,SKLPort01,SKLPort23]> {
- let Latency = 13;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SKLWriteResGroup165], (instregex "VHADDPDYrm")>;
-def: InstRW<[SKLWriteResGroup165], (instregex "VHADDPSYrm")>;
-def: InstRW<[SKLWriteResGroup165], (instregex "VHSUBPDYrm")>;
-def: InstRW<[SKLWriteResGroup165], (instregex "VHSUBPSYrm")>;
-
-def SKLWriteResGroup166 : SchedWriteRes<[SKLPort0]> {
+def SKLWriteResGroup166 : SchedWriteRes<[SKLPort0,SKLFPDivider]> {
let Latency = 14;
let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKLWriteResGroup166], (instregex "DIVPDrr")>;
-def: InstRW<[SKLWriteResGroup166], (instregex "DIVSDrr")>;
-def: InstRW<[SKLWriteResGroup166], (instregex "VDIVPDYrr")>;
-def: InstRW<[SKLWriteResGroup166], (instregex "VDIVPDrr")>;
-def: InstRW<[SKLWriteResGroup166], (instregex "VDIVSDrr")>;
-
-def SKLWriteResGroup167 : SchedWriteRes<[SKLPort0,SKLPort23]> {
- let Latency = 14;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
+ let ResourceCycles = [1,3];
}
-def: InstRW<[SKLWriteResGroup167], (instregex "AESIMCrm")>;
-def: InstRW<[SKLWriteResGroup167], (instregex "VAESIMCrm")>;
+def : SchedAlias<WriteFDiv64, SKLWriteResGroup166>; // TODO - convert to ZnWriteResFpuPair
+def : SchedAlias<WriteFDiv64X, SKLWriteResGroup166>; // TODO - convert to ZnWriteResFpuPair

-def SKLWriteResGroup168 : SchedWriteRes<[SKLPort23,SKLPort015]> {
+def SKLWriteResGroup166_1 : SchedWriteRes<[SKLPort0,SKLFPDivider]> {
let Latency = 14;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
+ let NumMicroOps = 1;
+ let ResourceCycles = [1,5];
}
-def: InstRW<[SKLWriteResGroup168], (instregex "PMULLDrm")>;
-def: InstRW<[SKLWriteResGroup168], (instregex "ROUNDPDm")>;
-def: InstRW<[SKLWriteResGroup168], (instregex "ROUNDPSm")>;
-def: InstRW<[SKLWriteResGroup168], (instregex "ROUNDSDm")>;
-def: InstRW<[SKLWriteResGroup168], (instregex "ROUNDSSm")>;
-def: InstRW<[SKLWriteResGroup168], (instregex "VPMULLDrm")>;
-def: InstRW<[SKLWriteResGroup168], (instregex "VROUNDPDm")>;
-def: InstRW<[SKLWriteResGroup168], (instregex "VROUNDPSm")>;
-def: InstRW<[SKLWriteResGroup168], (instregex "VROUNDSDm")>;
-def: InstRW<[SKLWriteResGroup168], (instregex "VROUNDSSm")>;
+def : SchedAlias<WriteFDiv64Y, SKLWriteResGroup166_1>; // TODO - convert to ZnWriteResFpuPair
def SKLWriteResGroup169 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
let Latency = 14;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup169], (instregex "MUL_FI16m")>;
-def: InstRW<[SKLWriteResGroup169], (instregex "MUL_FI32m")>;
+def: InstRW<[SKLWriteResGroup169], (instregex "MUL_FI(16|32)m")>;
def SKLWriteResGroup170 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort15,SKLPort0156]> {
let Latency = 14;
@@ -3547,215 +1604,105 @@ def SKLWriteResGroup171 : SchedWriteRes<[SKLPort0]> {
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup171], (instregex "DIVR_FPrST0")>;
-def: InstRW<[SKLWriteResGroup171], (instregex "DIVR_FST0r")>;
-def: InstRW<[SKLWriteResGroup171], (instregex "DIVR_FrST0")>;
-
-def SKLWriteResGroup172 : SchedWriteRes<[SKLPort23,SKLPort015]> {
- let Latency = 15;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[SKLWriteResGroup172], (instregex "VPMULLDYrm")>;
-def: InstRW<[SKLWriteResGroup172], (instregex "VROUNDYPDm")>;
-def: InstRW<[SKLWriteResGroup172], (instregex "VROUNDYPSm")>;
-
-def SKLWriteResGroup173 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> {
- let Latency = 15;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,2];
-}
-def: InstRW<[SKLWriteResGroup173], (instregex "DPPDrmi")>;
-def: InstRW<[SKLWriteResGroup173], (instregex "VDPPDrmi")>;
+def: InstRW<[SKLWriteResGroup171], (instregex "DIVR_(FPrST0|FST0r|FrST0)")>;
def SKLWriteResGroup174 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort237,SKLPort06,SKLPort15,SKLPort0156]> {
let Latency = 15;
let NumMicroOps = 10;
let ResourceCycles = [1,1,1,5,1,1];
}
-def: InstRW<[SKLWriteResGroup174], (instregex "RCL(16|32|64)mCL")>;
-def: InstRW<[SKLWriteResGroup174], (instregex "RCL8mCL")>;
-
-def SKLWriteResGroup175 : SchedWriteRes<[SKLPort0,SKLPort23]> {
- let Latency = 16;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup175], (instregex "DIVSSrm")>;
-def: InstRW<[SKLWriteResGroup175], (instregex "VDIVSSrm")>;
-
-def SKLWriteResGroup176 : SchedWriteRes<[SKLPort0,SKLPort23]> {
- let Latency = 16;
- let NumMicroOps = 4;
- let ResourceCycles = [3,1];
-}
-def: InstRW<[SKLWriteResGroup176], (instregex "PCMPISTRIrm")>;
-def: InstRW<[SKLWriteResGroup176], (instregex "PCMPISTRM128rm")>;
-def: InstRW<[SKLWriteResGroup176], (instregex "VPCMPISTRIrm")>;
-def: InstRW<[SKLWriteResGroup176], (instregex "VPCMPISTRM128rm")>;
+def: InstRW<[SKLWriteResGroup174], (instregex "RCL(8|16|32|64)mCL")>;
def SKLWriteResGroup177 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06,SKLPort15,SKLPort0156]> {
let Latency = 16;
let NumMicroOps = 14;
let ResourceCycles = [1,1,1,4,2,5];
}
-def: InstRW<[SKLWriteResGroup177], (instregex "CMPXCHG8B")>;
+def: InstRW<[SKLWriteResGroup177], (instrs CMPXCHG8B)>;
def SKLWriteResGroup178 : SchedWriteRes<[SKLPort0156]> {
let Latency = 16;
let NumMicroOps = 16;
let ResourceCycles = [16];
}
-def: InstRW<[SKLWriteResGroup178], (instregex "VZEROALL")>;
+def: InstRW<[SKLWriteResGroup178], (instrs VZEROALL)>;

-def SKLWriteResGroup179 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+def SKLWriteResGroup179 : SchedWriteRes<[SKLPort0,SKLPort23,SKLFPDivider]> {
let Latency = 17;
let NumMicroOps = 2;
- let ResourceCycles = [1,1];
+ let ResourceCycles = [1,1,5];
}
-def: InstRW<[SKLWriteResGroup179], (instregex "DIVPSrm")>;
-def: InstRW<[SKLWriteResGroup179], (instregex "VDIVPSrm")>;
-def: InstRW<[SKLWriteResGroup179], (instregex "VSQRTSSm")>;
+def : SchedAlias<WriteFDivXLd, SKLWriteResGroup179>; // TODO - convert to ZnWriteResFpuPair
def SKLWriteResGroup180 : SchedWriteRes<[SKLPort0,SKLPort1,SKLPort5,SKLPort6,SKLPort05,SKLPort0156]> {
let Latency = 17;
let NumMicroOps = 15;
let ResourceCycles = [2,1,2,4,2,4];
}
-def: InstRW<[SKLWriteResGroup180], (instregex "XCH_F")>;
-
-def SKLWriteResGroup181 : SchedWriteRes<[SKLPort0]> {
- let Latency = 18;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKLWriteResGroup181], (instregex "VSQRTPDYr")>;
-def: InstRW<[SKLWriteResGroup181], (instregex "VSQRTPDr")>;
-def: InstRW<[SKLWriteResGroup181], (instregex "VSQRTSDr")>;
-
-def SKLWriteResGroup182 : SchedWriteRes<[SKLPort0,SKLPort23]> {
- let Latency = 18;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup182], (instregex "SQRTSSm")>;
-def: InstRW<[SKLWriteResGroup182], (instregex "VDIVPSYrm")>;
-def: InstRW<[SKLWriteResGroup182], (instregex "VSQRTPSm")>;
-
-def SKLWriteResGroup183 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort0156]> {
- let Latency = 18;
- let NumMicroOps = 8;
- let ResourceCycles = [4,3,1];
-}
-def: InstRW<[SKLWriteResGroup183], (instregex "PCMPESTRIrr")>;
-def: InstRW<[SKLWriteResGroup183], (instregex "VPCMPESTRIrr")>;
+def: InstRW<[SKLWriteResGroup180], (instrs XCH_F)>;
def SKLWriteResGroup184 : SchedWriteRes<[SKLPort5,SKLPort6,SKLPort06,SKLPort0156]> {
let Latency = 18;
let NumMicroOps = 8;
let ResourceCycles = [1,1,1,5];
}
-def: InstRW<[SKLWriteResGroup184], (instregex "CPUID")>;
-def: InstRW<[SKLWriteResGroup184], (instregex "RDTSC")>;
+def: InstRW<[SKLWriteResGroup184], (instrs CPUID, RDTSC)>;
def SKLWriteResGroup185 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort237,SKLPort06,SKLPort15,SKLPort0156]> {
let Latency = 18;
let NumMicroOps = 11;
let ResourceCycles = [2,1,1,4,1,2];
}
-def: InstRW<[SKLWriteResGroup185], (instregex "RCR(16|32|64)mCL")>;
-def: InstRW<[SKLWriteResGroup185], (instregex "RCR8mCL")>;
+def: InstRW<[SKLWriteResGroup185], (instregex "RCR(8|16|32|64)mCL")>;

-def SKLWriteResGroup186 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+def SKLWriteResGroup186 : SchedWriteRes<[SKLPort0,SKLPort23,SKLFPDivider]> {
let Latency = 19;
let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup186], (instregex "DIVSDrm")>;
-def: InstRW<[SKLWriteResGroup186], (instregex "SQRTPSm")>;
-def: InstRW<[SKLWriteResGroup186], (instregex "VDIVSDrm")>;
-def: InstRW<[SKLWriteResGroup186], (instregex "VSQRTPSYm")>;
-
-def SKLWriteResGroup187 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> {
- let Latency = 19;
- let NumMicroOps = 5;
- let ResourceCycles = [1,1,3];
-}
-def: InstRW<[SKLWriteResGroup187], (instregex "DPPSrmi")>;
-def: InstRW<[SKLWriteResGroup187], (instregex "VDPPSrmi")>;
-
-def SKLWriteResGroup188 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort015,SKLPort0156]> {
- let Latency = 19;
- let NumMicroOps = 9;
- let ResourceCycles = [4,3,1,1];
+ let ResourceCycles = [1,1,4];
}
-def: InstRW<[SKLWriteResGroup188], (instregex "PCMPESTRM128rr")>;
-def: InstRW<[SKLWriteResGroup188], (instregex "VPCMPESTRM128rr")>;
+def : SchedAlias<WriteFDiv64Ld, SKLWriteResGroup186>; // TODO - convert to ZnWriteResFpuPair
def SKLWriteResGroup189 : SchedWriteRes<[SKLPort0]> {
let Latency = 20;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup189], (instregex "DIV_FPrST0")>;
-def: InstRW<[SKLWriteResGroup189], (instregex "DIV_FST0r")>;
-def: InstRW<[SKLWriteResGroup189], (instregex "DIV_FrST0")>;
-def: InstRW<[SKLWriteResGroup189], (instregex "SQRTPDr")>;
-def: InstRW<[SKLWriteResGroup189], (instregex "SQRTSDr")>;
+def: InstRW<[SKLWriteResGroup189], (instregex "DIV_(FPrST0|FST0r|FrST0)")>;

-def SKLWriteResGroup190 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+def SKLWriteResGroup190 : SchedWriteRes<[SKLPort0,SKLPort23,SKLFPDivider]> {
let Latency = 20;
let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup190], (instregex "DIVPDrm")>;
-def: InstRW<[SKLWriteResGroup190], (instregex "VDIVPDrm")>;
-
-def SKLWriteResGroup191 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> {
- let Latency = 20;
- let NumMicroOps = 5;
- let ResourceCycles = [1,1,3];
+ let ResourceCycles = [1,1,4];
}
-def: InstRW<[SKLWriteResGroup191], (instregex "VDPPSYrmi")>;
+def : SchedAlias<WriteFDiv64XLd, SKLWriteResGroup190>; // TODO - convert to ZnWriteResFpuPair
def SKLWriteResGroup192 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort6,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> {
let Latency = 20;
let NumMicroOps = 8;
let ResourceCycles = [1,1,1,1,1,1,2];
}
-def: InstRW<[SKLWriteResGroup192], (instregex "INSB")>;
-def: InstRW<[SKLWriteResGroup192], (instregex "INSL")>;
-def: InstRW<[SKLWriteResGroup192], (instregex "INSW")>;
+def: InstRW<[SKLWriteResGroup192], (instrs INSB, INSL, INSW)>;
def SKLWriteResGroup193 : SchedWriteRes<[SKLPort5,SKLPort6,SKLPort0156]> {
let Latency = 20;
let NumMicroOps = 10;
let ResourceCycles = [1,2,7];
}
-def: InstRW<[SKLWriteResGroup193], (instregex "MWAITrr")>;
+def: InstRW<[SKLWriteResGroup193], (instrs MWAITrr)>;

-def SKLWriteResGroup194 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort015]> {
- let Latency = 20;
- let NumMicroOps = 11;
- let ResourceCycles = [3,6,2];
-}
-def: InstRW<[SKLWriteResGroup194], (instregex "AESKEYGENASSIST128rr")>;
-def: InstRW<[SKLWriteResGroup194], (instregex "VAESKEYGENASSIST128rr")>;
-
-def SKLWriteResGroup195 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+def SKLWriteResGroup195 : SchedWriteRes<[SKLPort0,SKLPort23,SKLFPDivider]> {
let Latency = 21;
let NumMicroOps = 2;
- let ResourceCycles = [1,1];
+ let ResourceCycles = [1,1,8];
}
-def: InstRW<[SKLWriteResGroup195], (instregex "VDIVPDYrm")>;
+def : SchedAlias<WriteFDiv64YLd, SKLWriteResGroup195>; // TODO - convert to ZnWriteResFpuPair
def SKLWriteResGroup196 : SchedWriteRes<[SKLPort0,SKLPort23]> {
let Latency = 22;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup196], (instregex "DIV_F32m")>;
-def: InstRW<[SKLWriteResGroup196], (instregex "DIV_F64m")>;
+def: InstRW<[SKLWriteResGroup196], (instregex "DIV_F(32|64)m")>;
def SKLWriteResGroup196_1 : SchedWriteRes<[SKLPort0, SKLPort23, SKLPort5, SKLPort015]> {
let Latency = 22;
@@ -3785,117 +1732,56 @@ def: InstRW<[SKLWriteResGroup196_2], (instrs VGATHERDPSYrm,
VPGATHERQQYrm,
VGATHERDPDYrm)>;
-def SKLWriteResGroup197 : SchedWriteRes<[SKLPort0,SKLPort23]> {
- let Latency = 23;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup197], (instregex "VSQRTSDm")>;
-
def SKLWriteResGroup198 : SchedWriteRes<[SKLPort0,SKLPort4,SKLPort5,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> {
let Latency = 23;
let NumMicroOps = 19;
let ResourceCycles = [2,1,4,1,1,4,6];
}
-def: InstRW<[SKLWriteResGroup198], (instregex "CMPXCHG16B")>;
-
-def SKLWriteResGroup199 : SchedWriteRes<[SKLPort0,SKLPort23]> {
- let Latency = 24;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup199], (instregex "VSQRTPDm")>;
-
-def SKLWriteResGroup200 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23,SKLPort0156]> {
- let Latency = 24;
- let NumMicroOps = 9;
- let ResourceCycles = [4,3,1,1];
-}
-def: InstRW<[SKLWriteResGroup200], (instregex "PCMPESTRIrm")>;
-def: InstRW<[SKLWriteResGroup200], (instregex "VPCMPESTRIrm")>;
-
-def SKLWriteResGroup201 : SchedWriteRes<[SKLPort0,SKLPort23]> {
- let Latency = 25;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup201], (instregex "SQRTSDm")>;
-def: InstRW<[SKLWriteResGroup201], (instregex "VSQRTPDYm")>;
+def: InstRW<[SKLWriteResGroup198], (instrs CMPXCHG16B)>;
def SKLWriteResGroup202 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
let Latency = 25;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup202], (instregex "DIV_FI16m")>;
-def: InstRW<[SKLWriteResGroup202], (instregex "DIV_FI32m")>;
-
-def SKLWriteResGroup203 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23,SKLPort015,SKLPort0156]> {
- let Latency = 25;
- let NumMicroOps = 10;
- let ResourceCycles = [4,3,1,1,1];
-}
-def: InstRW<[SKLWriteResGroup203], (instregex "PCMPESTRM128rm")>;
-def: InstRW<[SKLWriteResGroup203], (instregex "VPCMPESTRM128rm")>;
-
-def SKLWriteResGroup204 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23,SKLPort015]> {
- let Latency = 25;
- let NumMicroOps = 11;
- let ResourceCycles = [3,6,1,1];
-}
-def: InstRW<[SKLWriteResGroup204], (instregex "AESKEYGENASSIST128rm")>;
-def: InstRW<[SKLWriteResGroup204], (instregex "VAESKEYGENASSIST128rm")>;
-
-def SKLWriteResGroup205 : SchedWriteRes<[SKLPort0,SKLPort23]> {
- let Latency = 26;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup205], (instregex "SQRTPDm")>;
+def: InstRW<[SKLWriteResGroup202], (instregex "DIV_FI(16|32)m")>;
def SKLWriteResGroup206 : SchedWriteRes<[SKLPort0,SKLPort23]> {
let Latency = 27;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup206], (instregex "DIVR_F32m")>;
-def: InstRW<[SKLWriteResGroup206], (instregex "DIVR_F64m")>;
+def: InstRW<[SKLWriteResGroup206], (instregex "DIVR_F(32|64)m")>;
def SKLWriteResGroup207 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23,SKLPort0156]> {
let Latency = 28;
let NumMicroOps = 8;
let ResourceCycles = [2,4,1,1];
}
-def: InstRW<[SKLWriteResGroup207], (instregex "IDIV(16|32|64)m")>;
-def: InstRW<[SKLWriteResGroup207], (instregex "IDIV8m")>;
+def: InstRW<[SKLWriteResGroup207], (instregex "IDIV(8|16|32|64)m")>;
def SKLWriteResGroup208 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
let Latency = 30;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup208], (instregex "DIVR_FI16m")>;
-def: InstRW<[SKLWriteResGroup208], (instregex "DIVR_FI32m")>;
+def: InstRW<[SKLWriteResGroup208], (instregex "DIVR_FI(16|32)m")>;
def SKLWriteResGroup209 : SchedWriteRes<[SKLPort5,SKLPort6,SKLPort23,SKLPort06,SKLPort0156]> {
let Latency = 35;
let NumMicroOps = 23;
let ResourceCycles = [1,5,3,4,10];
}
-def: InstRW<[SKLWriteResGroup209], (instregex "IN(16|32)ri")>;
-def: InstRW<[SKLWriteResGroup209], (instregex "IN(16|32)rr")>;
-def: InstRW<[SKLWriteResGroup209], (instregex "IN8ri")>;
-def: InstRW<[SKLWriteResGroup209], (instregex "IN8rr")>;
+def: InstRW<[SKLWriteResGroup209], (instregex "IN(8|16|32)ri",
+ "IN(8|16|32)rr")>;
def SKLWriteResGroup210 : SchedWriteRes<[SKLPort5,SKLPort6,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> {
let Latency = 35;
let NumMicroOps = 23;
let ResourceCycles = [1,5,2,1,4,10];
}
-def: InstRW<[SKLWriteResGroup210], (instregex "OUT(16|32)ir")>;
-def: InstRW<[SKLWriteResGroup210], (instregex "OUT(16|32)rr")>;
-def: InstRW<[SKLWriteResGroup210], (instregex "OUT8ir")>;
-def: InstRW<[SKLWriteResGroup210], (instregex "OUT8rr")>;
+def: InstRW<[SKLWriteResGroup210], (instregex "OUT(8|16|32)ir",
+ "OUT(8|16|32)rr")>;
def SKLWriteResGroup211 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort23,SKLPort0156]> {
let Latency = 37;
@@ -3909,28 +1795,29 @@ def SKLWriteResGroup212 : SchedWriteRes<[SKLPort1,SKLPort4,SKLPort5,SKLPort6,SKL
let NumMicroOps = 18;
let ResourceCycles = [1,1,2,3,1,1,1,8];
}
-def: InstRW<[SKLWriteResGroup212], (instregex "VMCLEARm")>;
+def: InstRW<[SKLWriteResGroup212], (instrs VMCLEARm)>;
def SKLWriteResGroup213 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort0156]> {
let Latency = 41;
let NumMicroOps = 39;
let ResourceCycles = [1,10,1,1,26];
}
-def: InstRW<[SKLWriteResGroup213], (instregex "XSAVE64")>;
+def: InstRW<[SKLWriteResGroup213], (instrs XSAVE64)>;
def SKLWriteResGroup214 : SchedWriteRes<[SKLPort5,SKLPort0156]> {
let Latency = 42;
let NumMicroOps = 22;
let ResourceCycles = [2,20];
}
-def: InstRW<[SKLWriteResGroup214], (instregex "RDTSCP")>;
+def: InstRW<[SKLWriteResGroup214], (instrs RDTSCP)>;
def SKLWriteResGroup215 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort0156]> {
let Latency = 42;
let NumMicroOps = 40;
let ResourceCycles = [1,11,1,1,26];
}
-def: InstRW<[SKLWriteResGroup215], (instregex "^XSAVE$", "XSAVEC", "XSAVES")>;
+def: InstRW<[SKLWriteResGroup215], (instrs XSAVE)>;
+def: InstRW<[SKLWriteResGroup215], (instregex "XSAVEC", "XSAVES")>;
def SKLWriteResGroup216 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort0156]> {
let Latency = 46;
@@ -3944,29 +1831,28 @@ def SKLWriteResGroup217 : SchedWriteRes<[SKLPort0,SKLPort23,SKLPort05,SKLPort06,
let NumMicroOps = 64;
let ResourceCycles = [2,8,5,10,39];
}
-def: InstRW<[SKLWriteResGroup217], (instregex "FLDENVm")>;
-def: InstRW<[SKLWriteResGroup217], (instregex "FLDENVm")>;
+def: InstRW<[SKLWriteResGroup217], (instrs FLDENVm)>;
def SKLWriteResGroup218 : SchedWriteRes<[SKLPort0,SKLPort6,SKLPort23,SKLPort05,SKLPort06,SKLPort15,SKLPort0156]> {
let Latency = 63;
let NumMicroOps = 88;
let ResourceCycles = [4,4,31,1,2,1,45];
}
-def: InstRW<[SKLWriteResGroup218], (instregex "FXRSTOR64")>;
+def: InstRW<[SKLWriteResGroup218], (instrs FXRSTOR64)>;
def SKLWriteResGroup219 : SchedWriteRes<[SKLPort0,SKLPort6,SKLPort23,SKLPort05,SKLPort06,SKLPort15,SKLPort0156]> {
let Latency = 63;
let NumMicroOps = 90;
let ResourceCycles = [4,2,33,1,2,1,47];
}
-def: InstRW<[SKLWriteResGroup219], (instregex "FXRSTOR")>;
+def: InstRW<[SKLWriteResGroup219], (instrs FXRSTOR)>;
def SKLWriteResGroup220 : SchedWriteRes<[SKLPort5,SKLPort05,SKLPort0156]> {
let Latency = 75;
let NumMicroOps = 15;
let ResourceCycles = [6,3,6];
}
-def: InstRW<[SKLWriteResGroup220], (instregex "FNINIT")>;
+def: InstRW<[SKLWriteResGroup220], (instrs FNINIT)>;
def SKLWriteResGroup221 : SchedWriteRes<[SKLPort0,SKLPort1,SKLPort5,SKLPort6,SKLPort05,SKLPort0156]> {
let Latency = 76;
@@ -3987,7 +1873,8 @@ def SKLWriteResGroup223 : SchedWriteRes<[SKLPort0,SKLPort1,SKLPort4,SKLPort5,SKL
let NumMicroOps = 100;
let ResourceCycles = [9,1,11,16,1,11,21,30];
}
-def: InstRW<[SKLWriteResGroup223], (instregex "FSTENVm")>;
-def: InstRW<[SKLWriteResGroup223], (instregex "FSTENVm")>;
+def: InstRW<[SKLWriteResGroup223], (instrs FSTENVm)>;
+
+def: InstRW<[WriteZero], (instrs CLC)>;
} // SchedModel
diff --git a/lib/Target/X86/X86SchedSkylakeServer.td b/lib/Target/X86/X86SchedSkylakeServer.td
index 439a2ffa36a4..7095ec081bd9 100755
--- a/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/lib/Target/X86/X86SchedSkylakeServer.td
@@ -19,7 +19,7 @@ def SkylakeServerModel : SchedMachineModel {
let MicroOpBufferSize = 224; // Based on the reorder buffer.
let LoadLatency = 5;
let MispredictPenalty = 14;
-
+
// Based on the LSD (loop-stream detector) queue size and benchmarking data.
let LoopMicroOpBufferSize = 50;
@@ -61,6 +61,10 @@ def SKXPort015 : ProcResGroup<[SKXPort0, SKXPort1, SKXPort5]>;
def SKXPort056 : ProcResGroup<[SKXPort0, SKXPort5, SKXPort6]>;
def SKXPort0156: ProcResGroup<[SKXPort0, SKXPort1, SKXPort5, SKXPort6]>;
+def SKXDivider : ProcResource<1>; // Integer division issued on port 0.
+// FP division and sqrt on port 0.
+def SKXFPDivider : ProcResource<1>;
+
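Because the divide/sqrt hardware is not fully pipelined, these units are modeled as ordinary ProcResource definitions that a scheduling class can hold for several cycles. Taking the numbers from the WriteFDiv entry further down in this file as an illustration (a sketch of its register variant only, not a definition added by the patch), ResourceCycles = [1,3] over [SKXPort0, SKXFPDivider] means the micro-op issues on port 0 for one cycle but keeps the FP divider busy for three, so independent divides can start at most once every three cycles:

// Sketch of the register variant behind:
//   defm : SKXWriteResPair<WriteFDiv, [SKXPort0,SKXFPDivider], 11, [1,3], 1, 5>;
def : WriteRes<WriteFDiv, [SKXPort0, SKXFPDivider]> {
  let Latency = 11;            // result available after roughly 11 cycles
  let ResourceCycles = [1, 3]; // 1 cycle on port 0, divider held for 3 cycles
  let NumMicroOps = 1;
}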
// 60 Entry Unified Scheduler
def SKXPortAny : ProcResGroup<[SKXPort0, SKXPort1, SKXPort2, SKXPort3, SKXPort4,
SKXPort5, SKXPort6, SKXPort7]> {
@@ -77,45 +81,84 @@ def : ReadAdvance<ReadAfterLd, 5>;
// This multiclass defines the resource usage for variants with and without
// folded loads.
multiclass SKXWriteResPair<X86FoldableSchedWrite SchedRW,
- ProcResourceKind ExePort,
- int Lat> {
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [1], int UOps = 1,
+ int LoadLat = 5> {
// Register variant is using a single cycle on ExePort.
- def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
- // Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the
- // latency.
- def : WriteRes<SchedRW.Folded, [SKXPort23, ExePort]> {
- let Latency = !add(Lat, 5);
+ // Memory variant also uses a cycle on port 2/3 and adds LoadLat cycles to
+ // the latency (default = 5).
+ def : WriteRes<SchedRW.Folded, !listconcat([SKXPort23], ExePorts)> {
+ let Latency = !add(Lat, LoadLat);
+ let ResourceCycles = !listconcat([1], Res);
+ let NumMicroOps = !add(UOps, 1);
}
}
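To make the pattern concrete, the following hand-expanded sketch shows roughly what one of the defm lines below produces. The folded-load record is reached through SchedRW.Folded, and the real record names are generated by TableGen, so treat this as an illustration rather than as text belonging to the patch:

// Sketch of the expansion of: defm : SKXWriteResPair<WriteALU, [SKXPort0156], 1>;
def : WriteRes<WriteALU, [SKXPort0156]> {                    // register form
  let Latency = 1;
  let ResourceCycles = [1];                                  // default Res = [1]
  let NumMicroOps = 1;                                       // default UOps = 1
}
def : WriteRes<WriteALU.Folded, [SKXPort23, SKXPort0156]> {  // folded-load form
  let Latency = 6;                                           // Lat + default LoadLat of 5
  let ResourceCycles = [1, 1];                               // load port plus the ALU port
  let NumMicroOps = 2;                                       // UOps + 1
}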
-// A folded store needs a cycle on port 4 for the store data, but it does not
-// need an extra port 2/3 cycle to recompute the address.
-def : WriteRes<WriteRMW, [SKXPort4]>;
+// A folded store needs a cycle on port 4 for the store data, and an extra port
+// 2/3/7 cycle to recompute the address.
+def : WriteRes<WriteRMW, [SKXPort237,SKXPort4]>;
// Arithmetic.
-defm : SKXWriteResPair<WriteALU, SKXPort0156, 1>; // Simple integer ALU op.
-defm : SKXWriteResPair<WriteIMul, SKXPort1, 3>; // Integer multiplication.
-def : WriteRes<WriteIMulH, []> { let Latency = 3; } // Integer multiplication, high part.
-def SKXDivider : ProcResource<1>; // Integer division issued on port 0.
-def : WriteRes<WriteIDiv, [SKXPort0, SKXDivider]> { // Integer division.
- let Latency = 25;
- let ResourceCycles = [1, 10];
-}
-def : WriteRes<WriteIDivLd, [SKXPort23, SKXPort0, SKXDivider]> {
- let Latency = 29;
- let ResourceCycles = [1, 1, 10];
-}
+defm : SKXWriteResPair<WriteALU, [SKXPort0156], 1>; // Simple integer ALU op.
+defm : SKXWriteResPair<WriteADC, [SKXPort06], 1>; // Integer ALU + flags op.
+defm : SKXWriteResPair<WriteIMul, [SKXPort1], 3>; // Integer multiplication.
+defm : SKXWriteResPair<WriteIMul64, [SKXPort1], 3>; // Integer 64-bit multiplication.
+defm : SKXWriteResPair<WriteBSWAP32,[SKXPort15], 1>; //
+defm : SKXWriteResPair<WriteBSWAP64,[SKXPort06, SKXPort15], 2, [1,1], 2>; //
+
+defm : SKXWriteResPair<WriteDiv8, [SKXPort0, SKXDivider], 25, [1,10], 1, 4>;
+defm : SKXWriteResPair<WriteDiv16, [SKXPort0, SKXDivider], 25, [1,10], 1, 4>;
+defm : SKXWriteResPair<WriteDiv32, [SKXPort0, SKXDivider], 25, [1,10], 1, 4>;
+defm : SKXWriteResPair<WriteDiv64, [SKXPort0, SKXDivider], 25, [1,10], 1, 4>;
+defm : SKXWriteResPair<WriteIDiv8, [SKXPort0, SKXDivider], 25, [1,10], 1, 4>;
+defm : SKXWriteResPair<WriteIDiv16, [SKXPort0, SKXDivider], 25, [1,10], 1, 4>;
+defm : SKXWriteResPair<WriteIDiv32, [SKXPort0, SKXDivider], 25, [1,10], 1, 4>;
+defm : SKXWriteResPair<WriteIDiv64, [SKXPort0, SKXDivider], 25, [1,10], 1, 4>;
+
+defm : SKXWriteResPair<WriteCRC32, [SKXPort1], 3>;
+
+def : WriteRes<WriteIMulH, []> { let Latency = 3; } // Integer multiplication, high part.
def : WriteRes<WriteLEA, [SKXPort15]>; // LEA instructions can't fold loads.
+defm : SKXWriteResPair<WriteCMOV, [SKXPort06], 1, [1], 1>; // Conditional move.
+defm : SKXWriteResPair<WriteCMOV2, [SKXPort06], 2, [2], 2>; // Conditional (CF + ZF flag) move.
+defm : X86WriteRes<WriteFCMOV, [SKXPort1], 3, [1], 1>; // x87 conditional move.
+def : WriteRes<WriteSETCC, [SKXPort06]>; // Setcc.
+def : WriteRes<WriteSETCCStore, [SKXPort06,SKXPort4,SKXPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+def : WriteRes<WriteLAHFSAHF, [SKXPort06]>;
+
// Integer shifts and rotates.
-defm : SKXWriteResPair<WriteShift, SKXPort06, 1>;
+defm : SKXWriteResPair<WriteShift, [SKXPort06], 1>;
+
+// Double shift instructions.
+defm : SKXWriteResPair<WriteShiftDouble, [SKXPort06], 1>;
+
+// Bit counts.
+defm : SKXWriteResPair<WriteBSF, [SKXPort1], 3>;
+defm : SKXWriteResPair<WriteBSR, [SKXPort1], 3>;
+defm : SKXWriteResPair<WriteLZCNT, [SKXPort1], 3>;
+defm : SKXWriteResPair<WriteTZCNT, [SKXPort1], 3>;
+defm : SKXWriteResPair<WritePOPCNT, [SKXPort1], 3>;
+
+// BMI1 BEXTR, BMI2 BZHI
+defm : SKXWriteResPair<WriteBEXTR, [SKXPort06,SKXPort15], 2, [1,1], 2>;
+defm : SKXWriteResPair<WriteBZHI, [SKXPort15], 1>;
// Loads, stores, and moves, not folded with other operations.
-def : WriteRes<WriteLoad, [SKXPort23]> { let Latency = 5; }
-def : WriteRes<WriteStore, [SKXPort237, SKXPort4]>;
-def : WriteRes<WriteMove, [SKXPort0156]>;
+defm : X86WriteRes<WriteLoad, [SKXPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteStore, [SKXPort237, SKXPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteStoreNT, [SKXPort237, SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteMove, [SKXPort0156], 1, [1], 1>;
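The X86WriteRes helper used in the entries above is defined in X86Schedule.td and is not part of this patch; assuming it forwards its arguments the same way the register half of SKXWriteResPair does, the WriteStore entry is roughly equivalent to the sketch below:

// Rough equivalent of: defm : X86WriteRes<WriteStore, [SKXPort237, SKXPort4], 1, [1,1], 1>;
// (illustrative only; the real helper lives in X86Schedule.td)
def : WriteRes<WriteStore, [SKXPort237, SKXPort4]> {
  let Latency = 1;
  let ResourceCycles = [1, 1];
  let NumMicroOps = 1;
}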
// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
@@ -123,153 +166,374 @@ def : WriteRes<WriteZero, []>;
// Branches don't produce values, so they have no latency, but they still
// consume resources. Indirect branches can fold loads.
-defm : SKXWriteResPair<WriteJump, SKXPort06, 1>;
+defm : SKXWriteResPair<WriteJump, [SKXPort06], 1>;
// Floating point. This covers both scalar and vector operations.
-defm : SKXWriteResPair<WriteFAdd, SKXPort1, 3>; // Floating point add/sub/compare.
-defm : SKXWriteResPair<WriteFMul, SKXPort0, 5>; // Floating point multiplication.
-defm : SKXWriteResPair<WriteFDiv, SKXPort0, 12>; // 10-14 cycles. // Floating point division.
-defm : SKXWriteResPair<WriteFSqrt, SKXPort0, 15>; // Floating point square root.
-defm : SKXWriteResPair<WriteFRcp, SKXPort0, 5>; // Floating point reciprocal estimate.
-defm : SKXWriteResPair<WriteFRsqrt, SKXPort0, 5>; // Floating point reciprocal square root estimate.
-defm : SKXWriteResPair<WriteFMA, SKXPort015, 4>; // Fused Multiply Add.
-defm : SKXWriteResPair<WriteFShuffle, SKXPort5, 1>; // Floating point vector shuffles.
-defm : SKXWriteResPair<WriteFBlend, SKXPort015, 1>; // Floating point vector blends.
-def : WriteRes<WriteFVarBlend, [SKXPort5]> { // Fp vector variable blends.
- let Latency = 2;
- let ResourceCycles = [2];
-}
-def : WriteRes<WriteFVarBlendLd, [SKXPort5, SKXPort23]> {
- let Latency = 6;
- let ResourceCycles = [2, 1];
-}
+defm : X86WriteRes<WriteFLD0, [SKXPort05], 1, [1], 1>;
+defm : X86WriteRes<WriteFLD1, [SKXPort05], 1, [2], 2>;
+defm : X86WriteRes<WriteFLDC, [SKXPort05], 1, [2], 2>;
+defm : X86WriteRes<WriteFLoad, [SKXPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteFLoadX, [SKXPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteFLoadY, [SKXPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteFMaskedLoad, [SKXPort23,SKXPort015], 7, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedLoadY, [SKXPort23,SKXPort015], 8, [1,1], 2>;
+defm : X86WriteRes<WriteFStore, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreX, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreY, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNT, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNTX, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNTY, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStore, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStoreY, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteFMove, [SKXPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveX, [SKXPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveY, [SKXPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteEMMS, [SKXPort05,SKXPort0156], 10, [9,1], 10>;
+
+defm : SKXWriteResPair<WriteFAdd, [SKXPort01], 4, [1], 1, 5>; // Floating point add/sub.
+defm : SKXWriteResPair<WriteFAddX, [SKXPort01], 4, [1], 1, 6>;
+defm : SKXWriteResPair<WriteFAddY, [SKXPort01], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFAddZ, [SKXPort05], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFAdd64, [SKXPort01], 4, [1], 1, 5>; // Floating point double add/sub.
+defm : SKXWriteResPair<WriteFAdd64X, [SKXPort01], 4, [1], 1, 6>;
+defm : SKXWriteResPair<WriteFAdd64Y, [SKXPort01], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFAdd64Z, [SKXPort05], 4, [1], 1, 7>;
+
+defm : SKXWriteResPair<WriteFCmp, [SKXPort01], 4, [1], 1, 5>; // Floating point compare.
+defm : SKXWriteResPair<WriteFCmpX, [SKXPort01], 4, [1], 1, 6>;
+defm : SKXWriteResPair<WriteFCmpY, [SKXPort01], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFCmpZ, [SKXPort05], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFCmp64, [SKXPort01], 4, [1], 1, 5>; // Floating point double compare.
+defm : SKXWriteResPair<WriteFCmp64X, [SKXPort01], 4, [1], 1, 6>;
+defm : SKXWriteResPair<WriteFCmp64Y, [SKXPort01], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFCmp64Z, [SKXPort05], 4, [1], 1, 7>;
+
+defm : SKXWriteResPair<WriteFCom, [SKXPort0], 2>; // Floating point compare to flags.
+
+defm : SKXWriteResPair<WriteFMul, [SKXPort01], 4, [1], 1, 5>; // Floating point multiplication.
+defm : SKXWriteResPair<WriteFMulX, [SKXPort01], 4, [1], 1, 6>;
+defm : SKXWriteResPair<WriteFMulY, [SKXPort01], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFMulZ, [SKXPort05], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFMul64, [SKXPort01], 4, [1], 1, 5>; // Floating point double multiplication.
+defm : SKXWriteResPair<WriteFMul64X, [SKXPort01], 4, [1], 1, 6>;
+defm : SKXWriteResPair<WriteFMul64Y, [SKXPort01], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFMul64Z, [SKXPort05], 4, [1], 1, 7>;
+
+defm : SKXWriteResPair<WriteFDiv, [SKXPort0,SKXFPDivider], 11, [1,3], 1, 5>; // 10-14 cycles. // Floating point division.
+//defm : SKXWriteResPair<WriteFDivX, [SKXPort0,SKXFPDivider], 11, [1,3], 1, 6>; // 10-14 cycles.
+defm : SKXWriteResPair<WriteFDivY, [SKXPort0,SKXFPDivider], 11, [1,5], 1, 7>; // 10-14 cycles.
+defm : SKXWriteResPair<WriteFDivZ, [SKXPort0,SKXPort5,SKXFPDivider], 18, [2,1,10], 3, 7>; // 10-14 cycles.
+//defm : SKXWriteResPair<WriteFDiv64, [SKXPort0,SKXFPDivider], 14, [1,3], 1, 5>; // 10-14 cycles. // Floating point division.
+//defm : SKXWriteResPair<WriteFDiv64X, [SKXPort0,SKXFPDivider], 14, [1,3], 1, 6>; // 10-14 cycles.
+//defm : SKXWriteResPair<WriteFDiv64Y, [SKXPort0,SKXFPDivider], 14, [1,5], 1, 7>; // 10-14 cycles.
+defm : SKXWriteResPair<WriteFDiv64Z, [SKXPort0,SKXPort5,SKXFPDivider], 23, [2,1,16], 3, 7>; // 10-14 cycles.
+
+defm : SKXWriteResPair<WriteFSqrt, [SKXPort0,SKXFPDivider], 12, [1,3], 1, 5>; // Floating point square root.
+defm : SKXWriteResPair<WriteFSqrtX, [SKXPort0,SKXFPDivider], 12, [1,3], 1, 6>;
+defm : SKXWriteResPair<WriteFSqrtY, [SKXPort0,SKXFPDivider], 12, [1,6], 1, 7>;
+defm : SKXWriteResPair<WriteFSqrtZ, [SKXPort0,SKXPort5,SKXFPDivider], 20, [2,1,12], 3, 7>;
+defm : SKXWriteResPair<WriteFSqrt64, [SKXPort0,SKXFPDivider], 18, [1,6], 1, 5>; // Floating point double square root.
+defm : SKXWriteResPair<WriteFSqrt64X, [SKXPort0,SKXFPDivider], 18, [1,6], 1, 6>;
+defm : SKXWriteResPair<WriteFSqrt64Y, [SKXPort0,SKXFPDivider], 18, [1,12],1, 7>;
+defm : SKXWriteResPair<WriteFSqrt64Z, [SKXPort0,SKXPort5,SKXFPDivider], 32, [2,1,24], 3, 7>;
+defm : SKXWriteResPair<WriteFSqrt80, [SKXPort0,SKXFPDivider], 21, [1,7]>; // Floating point long double square root.
+
+defm : SKXWriteResPair<WriteFRcp, [SKXPort0], 4, [1], 1, 5>; // Floating point reciprocal estimate.
+defm : SKXWriteResPair<WriteFRcpX, [SKXPort0], 4, [1], 1, 6>;
+defm : SKXWriteResPair<WriteFRcpY, [SKXPort0], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFRcpZ, [SKXPort0,SKXPort5], 4, [2,1], 3, 7>;
+
+defm : SKXWriteResPair<WriteFRsqrt, [SKXPort0], 4, [1], 1, 5>; // Floating point reciprocal square root estimate.
+defm : SKXWriteResPair<WriteFRsqrtX,[SKXPort0], 4, [1], 1, 6>;
+defm : SKXWriteResPair<WriteFRsqrtY,[SKXPort0], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFRsqrtZ,[SKXPort0,SKXPort5], 9, [2,1], 3, 7>;
+
+defm : SKXWriteResPair<WriteFMA, [SKXPort01], 4, [1], 1, 5>; // Fused Multiply Add.
+defm : SKXWriteResPair<WriteFMAX, [SKXPort01], 4, [1], 1, 6>;
+defm : SKXWriteResPair<WriteFMAY, [SKXPort01], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFMAZ, [SKXPort05], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteDPPD, [SKXPort5,SKXPort015], 9, [1,2], 3, 6>; // Floating point double dot product.
+defm : SKXWriteResPair<WriteDPPS, [SKXPort5,SKXPort015], 13, [1,3], 4, 6>;
+defm : SKXWriteResPair<WriteDPPSY,[SKXPort5,SKXPort015], 13, [1,3], 4, 7>;
+defm : SKXWriteResPair<WriteDPPSZ,[SKXPort5,SKXPort015], 13, [1,3], 4, 7>;
+defm : SKXWriteResPair<WriteFSign, [SKXPort0], 1>; // Floating point fabs/fchs.
+defm : SKXWriteResPair<WriteFRnd, [SKXPort01], 8, [2], 2, 6>; // Floating point rounding.
+defm : SKXWriteResPair<WriteFRndY, [SKXPort01], 8, [2], 2, 7>;
+defm : SKXWriteResPair<WriteFRndZ, [SKXPort05], 8, [2], 2, 7>;
+defm : SKXWriteResPair<WriteFLogic, [SKXPort015], 1, [1], 1, 6>; // Floating point and/or/xor logicals.
+defm : SKXWriteResPair<WriteFLogicY, [SKXPort015], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFLogicZ, [SKXPort05], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFTest, [SKXPort0], 2, [1], 1, 6>; // Floating point TEST instructions.
+defm : SKXWriteResPair<WriteFTestY, [SKXPort0], 2, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFTestZ, [SKXPort0], 2, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFShuffle, [SKXPort5], 1, [1], 1, 6>; // Floating point vector shuffles.
+defm : SKXWriteResPair<WriteFShuffleY, [SKXPort5], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFShuffleZ, [SKXPort5], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFVarShuffle, [SKXPort5], 1, [1], 1, 6>; // Floating point vector variable shuffles.
+defm : SKXWriteResPair<WriteFVarShuffleY, [SKXPort5], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFVarShuffleZ, [SKXPort5], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFBlend, [SKXPort015], 1, [1], 1, 6>; // Floating point vector blends.
+defm : SKXWriteResPair<WriteFBlendY,[SKXPort015], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFBlendZ,[SKXPort015], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFVarBlend, [SKXPort015], 2, [2], 2, 6>; // Fp vector variable blends.
+defm : SKXWriteResPair<WriteFVarBlendY,[SKXPort015], 2, [2], 2, 7>;
+defm : SKXWriteResPair<WriteFVarBlendZ,[SKXPort015], 2, [2], 2, 7>;
// FMA Scheduling helper class.
// class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
// Vector integer operations.
-defm : SKXWriteResPair<WriteVecALU, SKXPort15, 1>; // Vector integer ALU op, no logicals.
-defm : SKXWriteResPair<WriteVecShift, SKXPort0, 1>; // Vector integer shifts.
-defm : SKXWriteResPair<WriteVecIMul, SKXPort0, 5>; // Vector integer multiply.
-defm : SKXWriteResPair<WriteShuffle, SKXPort5, 1>; // Vector shuffles.
-defm : SKXWriteResPair<WriteBlend, SKXPort15, 1>; // Vector blends.
-
-def : WriteRes<WriteVarBlend, [SKXPort5]> { // Vector variable blends.
+defm : X86WriteRes<WriteVecLoad, [SKXPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteVecLoadX, [SKXPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecLoadY, [SKXPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNT, [SKXPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNTY, [SKXPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteVecMaskedLoad, [SKXPort23,SKXPort015], 7, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedLoadY, [SKXPort23,SKXPort015], 8, [1,1], 2>;
+defm : X86WriteRes<WriteVecStore, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreX, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreY, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreNT, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreNTY, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStoreY, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMove, [SKXPort05], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveX, [SKXPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveY, [SKXPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveToGpr, [SKXPort0], 2, [1], 1>;
+defm : X86WriteRes<WriteVecMoveFromGpr, [SKXPort5], 1, [1], 1>;
+
+defm : SKXWriteResPair<WriteVecALU, [SKXPort05], 1, [1], 1, 5>; // Vector integer ALU op, no logicals.
+defm : SKXWriteResPair<WriteVecALUX, [SKXPort01], 1, [1], 1, 6>;
+defm : SKXWriteResPair<WriteVecALUY, [SKXPort01], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVecALUZ, [SKXPort0], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVecLogic, [SKXPort05], 1, [1], 1, 5>; // Vector integer and/or/xor.
+defm : SKXWriteResPair<WriteVecLogicX,[SKXPort015], 1, [1], 1, 6>;
+defm : SKXWriteResPair<WriteVecLogicY,[SKXPort015], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVecLogicZ,[SKXPort05], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVecTest, [SKXPort0,SKXPort5], 3, [1,1], 2, 6>; // Vector integer TEST instructions.
+defm : SKXWriteResPair<WriteVecTestY, [SKXPort0,SKXPort5], 3, [1,1], 2, 7>;
+defm : SKXWriteResPair<WriteVecTestZ, [SKXPort0,SKXPort5], 3, [1,1], 2, 7>;
+defm : SKXWriteResPair<WriteVecIMul, [SKXPort0], 4, [1], 1, 5>; // Vector integer multiply.
+defm : SKXWriteResPair<WriteVecIMulX, [SKXPort01], 4, [1], 1, 6>;
+defm : SKXWriteResPair<WriteVecIMulY, [SKXPort01], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVecIMulZ, [SKXPort05], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WritePMULLD, [SKXPort01], 10, [2], 2, 6>; // Vector PMULLD.
+defm : SKXWriteResPair<WritePMULLDY, [SKXPort01], 10, [2], 2, 7>;
+defm : SKXWriteResPair<WritePMULLDZ, [SKXPort05], 10, [2], 2, 7>;
+defm : SKXWriteResPair<WriteShuffle, [SKXPort5], 1, [1], 1, 5>; // Vector shuffles.
+defm : SKXWriteResPair<WriteShuffleX, [SKXPort5], 1, [1], 1, 6>;
+defm : SKXWriteResPair<WriteShuffleY, [SKXPort5], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteShuffleZ, [SKXPort5], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVarShuffle, [SKXPort5], 1, [1], 1, 5>; // Vector variable shuffles.
+defm : SKXWriteResPair<WriteVarShuffleX, [SKXPort5], 1, [1], 1, 6>;
+defm : SKXWriteResPair<WriteVarShuffleY, [SKXPort5], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVarShuffleZ, [SKXPort5], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteBlend, [SKXPort5], 1, [1], 1, 6>; // Vector blends.
+defm : SKXWriteResPair<WriteBlendY,[SKXPort5], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteBlendZ,[SKXPort5], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVarBlend, [SKXPort015], 2, [2], 2, 6>; // Vector variable blends.
+defm : SKXWriteResPair<WriteVarBlendY,[SKXPort015], 2, [2], 2, 6>;
+defm : SKXWriteResPair<WriteVarBlendZ,[SKXPort05], 2, [1], 1, 6>;
+defm : SKXWriteResPair<WriteMPSAD, [SKXPort5], 4, [2], 2, 6>; // Vector MPSAD.
+defm : SKXWriteResPair<WriteMPSADY, [SKXPort5], 4, [2], 2, 7>;
+defm : SKXWriteResPair<WriteMPSADZ, [SKXPort5], 4, [2], 2, 7>;
+defm : SKXWriteResPair<WritePSADBW, [SKXPort5], 3, [1], 1, 5>; // Vector PSADBW.
+defm : SKXWriteResPair<WritePSADBWX, [SKXPort5], 3, [1], 1, 6>;
+defm : SKXWriteResPair<WritePSADBWY, [SKXPort5], 3, [1], 1, 7>;
+defm : SKXWriteResPair<WritePSADBWZ, [SKXPort5], 3, [1], 1, 7>;
+defm : SKXWriteResPair<WritePHMINPOS, [SKXPort0], 4, [1], 1, 6>; // Vector PHMINPOS.
+
+// Vector integer shifts.
+defm : SKXWriteResPair<WriteVecShift, [SKXPort0], 1, [1], 1, 5>;
+defm : X86WriteRes<WriteVecShiftX, [SKXPort5,SKXPort01], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftY, [SKXPort5,SKXPort01], 4, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftZ, [SKXPort5,SKXPort0], 4, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftXLd, [SKXPort01,SKXPort23], 7, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftYLd, [SKXPort01,SKXPort23], 8, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftZLd, [SKXPort0,SKXPort23], 8, [1,1], 2>;
+
+defm : SKXWriteResPair<WriteVecShiftImm, [SKXPort0], 1, [1], 1, 5>;
+defm : SKXWriteResPair<WriteVecShiftImmX, [SKXPort01], 1, [1], 1, 6>; // Vector integer immediate shifts.
+defm : SKXWriteResPair<WriteVecShiftImmY, [SKXPort01], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVecShiftImmZ, [SKXPort0], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVarVecShift, [SKXPort01], 1, [1], 1, 6>; // Variable vector shifts.
+defm : SKXWriteResPair<WriteVarVecShiftY, [SKXPort01], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVarVecShiftZ, [SKXPort0], 1, [1], 1, 7>;
+
+// Vector insert/extract operations.
+def : WriteRes<WriteVecInsert, [SKXPort5]> {
let Latency = 2;
+ let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def : WriteRes<WriteVarBlendLd, [SKXPort5, SKXPort23]> {
+def : WriteRes<WriteVecInsertLd, [SKXPort5,SKXPort23]> {
let Latency = 6;
- let ResourceCycles = [2, 1];
+ let NumMicroOps = 2;
}
+def: InstRW<[WriteVecInsertLd], (instregex "(V?)MOV(H|L)(PD|PS)rm")>;
-def : WriteRes<WriteMPSAD, [SKXPort0, SKXPort5]> { // Vector MPSAD.
- let Latency = 6;
- let ResourceCycles = [1, 2];
+def : WriteRes<WriteVecExtract, [SKXPort0,SKXPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
}
-def : WriteRes<WriteMPSADLd, [SKXPort23, SKXPort0, SKXPort5]> {
- let Latency = 6;
- let ResourceCycles = [1, 1, 2];
+def : WriteRes<WriteVecExtractSt, [SKXPort4,SKXPort5,SKXPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
}
-// Vector bitwise operations.
-// These are often used on both floating point and integer vectors.
-defm : SKXWriteResPair<WriteVecLogic, SKXPort015, 1>; // Vector and/or/xor.
-
// Conversion between integer and float.
-defm : SKXWriteResPair<WriteCvtF2I, SKXPort1, 3>; // Float -> Integer.
-defm : SKXWriteResPair<WriteCvtI2F, SKXPort1, 4>; // Integer -> Float.
-defm : SKXWriteResPair<WriteCvtF2F, SKXPort1, 3>; // Float -> Float size conversion.
+defm : SKXWriteResPair<WriteCvtSS2I, [SKXPort01], 6, [2], 2>; // Needs more work: DD vs DQ.
+defm : SKXWriteResPair<WriteCvtPS2I, [SKXPort01], 3>;
+defm : SKXWriteResPair<WriteCvtPS2IY, [SKXPort01], 3>;
+defm : SKXWriteResPair<WriteCvtPS2IZ, [SKXPort05], 3>;
+defm : SKXWriteResPair<WriteCvtSD2I, [SKXPort01], 6, [2], 2>;
+defm : SKXWriteResPair<WriteCvtPD2I, [SKXPort01], 3>;
+defm : SKXWriteResPair<WriteCvtPD2IY, [SKXPort01], 3>;
+defm : SKXWriteResPair<WriteCvtPD2IZ, [SKXPort05], 3>;
+
+defm : SKXWriteResPair<WriteCvtI2SS, [SKXPort1], 4>;
+defm : SKXWriteResPair<WriteCvtI2PS, [SKXPort01], 4>;
+defm : SKXWriteResPair<WriteCvtI2PSY, [SKXPort01], 4>;
+defm : SKXWriteResPair<WriteCvtI2PSZ, [SKXPort05], 4>; // Needs more work: DD vs DQ.
+defm : SKXWriteResPair<WriteCvtI2SD, [SKXPort1], 4>;
+defm : SKXWriteResPair<WriteCvtI2PD, [SKXPort01], 4>;
+defm : SKXWriteResPair<WriteCvtI2PDY, [SKXPort01], 4>;
+defm : SKXWriteResPair<WriteCvtI2PDZ, [SKXPort05], 4>;
+
+defm : SKXWriteResPair<WriteCvtSS2SD, [SKXPort1], 3>;
+defm : SKXWriteResPair<WriteCvtPS2PD, [SKXPort1], 3>;
+defm : SKXWriteResPair<WriteCvtPS2PDY, [SKXPort5,SKXPort01], 3, [1,1], 2>;
+defm : SKXWriteResPair<WriteCvtPS2PDZ, [SKXPort05], 3, [2], 2>;
+defm : SKXWriteResPair<WriteCvtSD2SS, [SKXPort1], 3>;
+defm : SKXWriteResPair<WriteCvtPD2PS, [SKXPort1], 3>;
+defm : SKXWriteResPair<WriteCvtPD2PSY, [SKXPort5,SKXPort01], 3, [1,1], 2>;
+defm : SKXWriteResPair<WriteCvtPD2PSZ, [SKXPort05], 3, [2], 2>;
+
+defm : X86WriteRes<WriteCvtPH2PS, [SKXPort5,SKXPort01], 5, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSY, [SKXPort5,SKXPort01], 7, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSZ, [SKXPort5,SKXPort0], 7, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSLd, [SKXPort23,SKXPort01], 9, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSYLd, [SKXPort23,SKXPort01], 10, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSZLd, [SKXPort23,SKXPort05], 10, [1,1], 2>;
+
+defm : X86WriteRes<WriteCvtPS2PH, [SKXPort5,SKXPort01], 5, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PHY, [SKXPort5,SKXPort01], 7, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PHZ, [SKXPort5,SKXPort05], 7, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PHSt, [SKXPort4,SKXPort5,SKXPort237,SKXPort01], 6, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteCvtPS2PHYSt, [SKXPort4,SKXPort5,SKXPort237,SKXPort01], 8, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteCvtPS2PHZSt, [SKXPort4,SKXPort5,SKXPort237,SKXPort05], 8, [1,1,1,1], 4>;
// Strings instructions.
+
// Packed Compare Implicit Length Strings, Return Mask
-// String instructions.
def : WriteRes<WritePCmpIStrM, [SKXPort0]> {
let Latency = 10;
+ let NumMicroOps = 3;
let ResourceCycles = [3];
}
def : WriteRes<WritePCmpIStrMLd, [SKXPort0, SKXPort23]> {
- let Latency = 10;
- let ResourceCycles = [3, 1];
-}
+ let Latency = 16;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
+}
+
// Packed Compare Explicit Length Strings, Return Mask
-def : WriteRes<WritePCmpEStrM, [SKXPort0, SKXPort16, SKXPort5]> {
- let Latency = 10;
- let ResourceCycles = [3, 2, 4];
+def : WriteRes<WritePCmpEStrM, [SKXPort0, SKXPort5, SKXPort015, SKXPort0156]> {
+ let Latency = 19;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
}
-def : WriteRes<WritePCmpEStrMLd, [SKXPort05, SKXPort16, SKXPort23]> {
- let Latency = 10;
- let ResourceCycles = [6, 2, 1];
-}
- // Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrMLd, [SKXPort0, SKXPort5, SKXPort23, SKXPort015, SKXPort0156]> {
+ let Latency = 25;
+ let NumMicroOps = 10;
+ let ResourceCycles = [4,3,1,1,1];
+}
+
+// Packed Compare Implicit Length Strings, Return Index
def : WriteRes<WritePCmpIStrI, [SKXPort0]> {
- let Latency = 11;
+ let Latency = 10;
+ let NumMicroOps = 3;
let ResourceCycles = [3];
}
def : WriteRes<WritePCmpIStrILd, [SKXPort0, SKXPort23]> {
- let Latency = 11;
- let ResourceCycles = [3, 1];
-}
+ let Latency = 16;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
+}
+
// Packed Compare Explicit Length Strings, Return Index
-def : WriteRes<WritePCmpEStrI, [SKXPort05, SKXPort16]> {
- let Latency = 11;
- let ResourceCycles = [6, 2];
+def : WriteRes<WritePCmpEStrI, [SKXPort0,SKXPort5,SKXPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 8;
+ let ResourceCycles = [4,3,1];
}
-def : WriteRes<WritePCmpEStrILd, [SKXPort0, SKXPort16, SKXPort5, SKXPort23]> {
- let Latency = 11;
- let ResourceCycles = [3, 2, 2, 1];
+def : WriteRes<WritePCmpEStrILd, [SKXPort0, SKXPort5, SKXPort23, SKXPort0156]> {
+ let Latency = 24;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
}
+// MOVMSK Instructions.
+def : WriteRes<WriteFMOVMSK, [SKXPort0]> { let Latency = 2; }
+def : WriteRes<WriteVecMOVMSK, [SKXPort0]> { let Latency = 2; }
+def : WriteRes<WriteVecMOVMSKY, [SKXPort0]> { let Latency = 2; }
+def : WriteRes<WriteMMXMOVMSK, [SKXPort0]> { let Latency = 2; }
+
// AES instructions.
-def : WriteRes<WriteAESDecEnc, [SKXPort5]> { // Decryption, encryption.
- let Latency = 7;
+def : WriteRes<WriteAESDecEnc, [SKXPort0]> { // Decryption, encryption.
+ let Latency = 4;
+ let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def : WriteRes<WriteAESDecEncLd, [SKXPort5, SKXPort23]> {
- let Latency = 7;
- let ResourceCycles = [1, 1];
+def : WriteRes<WriteAESDecEncLd, [SKXPort0, SKXPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
}
-def : WriteRes<WriteAESIMC, [SKXPort5]> { // InvMixColumn.
- let Latency = 14;
+
+def : WriteRes<WriteAESIMC, [SKXPort0]> { // InvMixColumn.
+ let Latency = 8;
+ let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def : WriteRes<WriteAESIMCLd, [SKXPort5, SKXPort23]> {
+def : WriteRes<WriteAESIMCLd, [SKXPort0, SKXPort23]> {
let Latency = 14;
- let ResourceCycles = [2, 1];
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
}
-def : WriteRes<WriteAESKeyGen, [SKXPort0, SKXPort5]> { // Key Generation.
- let Latency = 10;
- let ResourceCycles = [2, 8];
+
+def : WriteRes<WriteAESKeyGen, [SKXPort0,SKXPort5,SKXPort015]> { // Key Generation.
+ let Latency = 20;
+ let NumMicroOps = 11;
+ let ResourceCycles = [3,6,2];
}
-def : WriteRes<WriteAESKeyGenLd, [SKXPort0, SKXPort5, SKXPort23]> {
- let Latency = 10;
- let ResourceCycles = [2, 7, 1];
+def : WriteRes<WriteAESKeyGenLd, [SKXPort0,SKXPort5,SKXPort23,SKXPort015]> {
+ let Latency = 25;
+ let NumMicroOps = 11;
+ let ResourceCycles = [3,6,1,1];
}
// Carry-less multiplication instructions.
-def : WriteRes<WriteCLMul, [SKXPort0, SKXPort5]> {
- let Latency = 7;
- let ResourceCycles = [2, 1];
+def : WriteRes<WriteCLMul, [SKXPort5]> {
+ let Latency = 6;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
}
-def : WriteRes<WriteCLMulLd, [SKXPort0, SKXPort5, SKXPort23]> {
- let Latency = 7;
- let ResourceCycles = [2, 1, 1];
+def : WriteRes<WriteCLMulLd, [SKXPort5, SKXPort23]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
}
// Catch-all for expensive system instructions.
def : WriteRes<WriteSystem, [SKXPort0156]> { let Latency = 100; } // def WriteSystem : SchedWrite;
// AVX2.
-defm : SKXWriteResPair<WriteFShuffle256, SKXPort5, 3>; // Fp 256-bit width vector shuffles.
-defm : SKXWriteResPair<WriteShuffle256, SKXPort5, 3>; // 256-bit width vector shuffles.
-def : WriteRes<WriteVarVecShift, [SKXPort0, SKXPort5]> { // Variable vector shifts.
- let Latency = 2;
- let ResourceCycles = [2, 1];
-}
-def : WriteRes<WriteVarVecShiftLd, [SKXPort0, SKXPort5, SKXPort23]> {
- let Latency = 6;
- let ResourceCycles = [2, 1, 1];
-}
+defm : SKXWriteResPair<WriteFShuffle256, [SKXPort5], 3, [1], 1, 7>; // Fp 256-bit width vector shuffles.
+defm : SKXWriteResPair<WriteFVarShuffle256, [SKXPort5], 3, [1], 1, 7>; // Fp 256-bit width vector variable shuffles.
+defm : SKXWriteResPair<WriteShuffle256, [SKXPort5], 3, [1], 1, 7>; // 256-bit width vector shuffles.
+defm : SKXWriteResPair<WriteVarShuffle256, [SKXPort5], 3, [1], 1, 7>; // 256-bit width vector variable shuffles.
// Old microcoded instructions that nobody uses.
def : WriteRes<WriteMicrocoded, [SKXPort0156]> { let Latency = 100; } // def WriteMicrocoded : SchedWrite;
@@ -277,33 +541,22 @@ def : WriteRes<WriteMicrocoded, [SKXPort0156]> { let Latency = 100; } // def Wri
// Fence instructions.
def : WriteRes<WriteFence, [SKXPort23, SKXPort4]>;
+// Load/store MXCSR.
+def : WriteRes<WriteLDMXCSR, [SKXPort0,SKXPort23,SKXPort0156]> { let Latency = 7; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
+def : WriteRes<WriteSTMXCSR, [SKXPort4,SKXPort5,SKXPort237]> { let Latency = 2; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
+
// Nop, not very useful except it provides a model for nops!
def : WriteRes<WriteNop, []>;
////////////////////////////////////////////////////////////////////////////////
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////
-// HADD, HSUB PS/PD
-// x,x / v,v,v.
-def : WriteRes<WriteFHAdd, [SKXPort1]> {
- let Latency = 3;
-}
-
-// x,m / v,v,m.
-def : WriteRes<WriteFHAddLd, [SKXPort1, SKXPort23]> {
- let Latency = 7;
- let ResourceCycles = [1, 1];
-}
-// PHADD|PHSUB (S) W/D.
-// v <- v,v.
-def : WriteRes<WritePHAdd, [SKXPort15]>;
-
-// v <- v,m.
-def : WriteRes<WritePHAddLd, [SKXPort15, SKXPort23]> {
- let Latency = 5;
- let ResourceCycles = [1, 1];
-}
+defm : SKXWriteResPair<WriteFHAdd, [SKXPort5,SKXPort015], 6, [2,1], 3, 6>;
+defm : SKXWriteResPair<WriteFHAddY, [SKXPort5,SKXPort015], 6, [2,1], 3, 7>;
+defm : SKXWriteResPair<WritePHAdd, [SKXPort5,SKXPort05], 3, [2,1], 3, 5>;
+defm : SKXWriteResPair<WritePHAddX, [SKXPort5,SKXPort015], 3, [2,1], 3, 6>;
+defm : SKXWriteResPair<WritePHAddY, [SKXPort5,SKXPort015], 3, [2,1], 3, 7>;
// Remaining instrs.
@@ -312,358 +565,35 @@ def SKXWriteResGroup1 : SchedWriteRes<[SKXPort0]> {
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup1], (instregex "KANDBrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KANDDrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KANDNBrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KANDNDrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KANDNQrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KANDNWrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KANDQrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KANDWrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KMOVBkk")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KMOVDkk")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KMOVQkk")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KMOVWkk")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KNOTBrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KNOTDrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KNOTQrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KNOTWrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KORBrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KORDrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KORQrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KORWrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KXNORBrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KXNORDrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KXNORQrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KXNORWrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KXORBrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KXORDrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KXORQrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KXORWrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PADDSBirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PADDSWirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PADDUSBirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PADDUSWirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PAVGBirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PAVGWirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PCMPEQBirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PCMPEQDirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PCMPEQWirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PCMPGTBirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PCMPGTDirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PCMPGTWirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PMAXSWirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PMAXUBirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PMINSWirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PMINUBirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSLLDri")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSLLDrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSLLQri")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSLLQrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSLLWri")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSLLWrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRADri")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRADrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRAWri")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRAWrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRLDri")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRLDrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRLQri")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRLQrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRLWri")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRLWrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSUBSBirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSUBSWirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSUBUSBirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSUBUSWirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVB2MZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVB2MZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVB2MZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVD2MZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVD2MZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVD2MZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVQ2MZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVQ2MZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVQ2MZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVW2MZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVW2MZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVW2MZrr(b?)(k?)(z?)")>;
-
-def SKXWriteResGroup2 : SchedWriteRes<[SKXPort1]> {
- let Latency = 1;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKXWriteResGroup2], (instregex "MMX_MASKMOVQ64")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "KAND(B|D|Q|W)rr",
+ "KANDN(B|D|Q|W)rr",
+ "KMOV(B|D|Q|W)kk",
+ "KNOT(B|D|Q|W)rr",
+ "KOR(B|D|Q|W)rr",
+ "KXNOR(B|D|Q|W)rr",
+ "KXOR(B|D|Q|W)rr",
+ "MMX_PADDS(B|W)irr",
+ "MMX_PADDUS(B|W)irr",
+ "MMX_PAVG(B|W)irr",
+ "MMX_PCMPEQ(B|D|W)irr",
+ "MMX_PCMPGT(B|D|W)irr",
+ "MMX_P(MAX|MIN)SWirr",
+ "MMX_P(MAX|MIN)UBirr",
+ "MMX_PSUBS(B|W)irr",
+ "MMX_PSUBUS(B|W)irr",
+ "VPMOVB2M(Z|Z128|Z256)rr",
+ "VPMOVD2M(Z|Z128|Z256)rr",
+ "VPMOVQ2M(Z|Z128|Z256)rr",
+ "VPMOVW2M(Z|Z128|Z256)rr")>;
def SKXWriteResGroup3 : SchedWriteRes<[SKXPort5]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup3], (instregex "COMP_FST0r")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "COM_FST0r")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "INSERTPSrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "KMOVBkr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "KMOVDkr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "KMOVQkr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "KMOVWkr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MMX_MOVD64rr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MMX_MOVD64to64rr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PALIGNR64irr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PSHUFBrr64")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PSHUFWri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PUNPCKHBWirr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PUNPCKHDQirr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PUNPCKHWDirr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PUNPCKLBWirr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PUNPCKLDQirr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PUNPCKLWDirr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MOV64toPQIrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MOVDDUPrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MOVDI2PDIrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MOVHLPSrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MOVLHPSrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MOVSDrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MOVSHDUPrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MOVSLDUPrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MOVUPDrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MOVUPSrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PACKSSDWrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PACKSSWBrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PACKUSDWrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PACKUSWBrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PALIGNRrri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PBLENDWrri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PMOVSXBDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PMOVSXBQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PMOVSXBWrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PMOVSXDQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PMOVSXWDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PMOVSXWQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PMOVZXBDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PMOVZXBQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PMOVZXBWrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PMOVZXDQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PMOVZXWDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PMOVZXWQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PSHUFBrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PSHUFDri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PSHUFHWri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PSHUFLWri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PSLLDQri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PSRLDQri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKHBWrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKHDQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKHQDQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKHWDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKLBWrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKLDQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKLQDQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKLWDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "SHUFPDrri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "SHUFPSrri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "UCOM_FPr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "UCOM_Fr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "UNPCKHPDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "UNPCKHPSrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "UNPCKLPDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "UNPCKLPSrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VBROADCASTI32X2Z128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VBROADCASTSSrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VINSERTPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VINSERTPSrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOV64toPQIZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOV64toPQIrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVDDUPYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVDDUPZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVDDUPZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVDDUPZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVDDUPrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVDI2PDIZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVDI2PDIrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVHLPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVHLPSrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVLHPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVLHPSrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSDrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSHDUPYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSHDUPZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSHDUPZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSHDUPZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSHDUPrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSLDUPYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSLDUPZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSLDUPZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSLDUPZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSLDUPrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSSZrr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVUPDYrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVUPDrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVUPSYrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVUPSrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSDWYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSDWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSDWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSDWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSDWrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSWBYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSWBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSWBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSWBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSWBrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSDWYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSDWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSDWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSDWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSDWrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSWBYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSWBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSWBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSWBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSWBrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPALIGNRYrri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPALIGNRZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPALIGNRZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPALIGNRZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPALIGNRrri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPBLENDWYrri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPBLENDWrri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPBROADCASTDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPBROADCASTQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDYri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDZ128r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSYri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSZ128r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVSXBDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVSXBQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVSXBWrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVSXDQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVSXWDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVSXWQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVZXBDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVZXBQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVZXBWrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVZXDQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVZXWDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVZXWQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFBYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFBrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFDYri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFDZ128r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFDZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFDZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFDri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFHWYri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFHWZ128r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFHWZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFHWZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFHWri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFLWYri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFLWZ128r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFLWZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFLWZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFLWri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSLLDQYri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSLLDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSLLDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSLLDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSLLDQri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSRLDQYri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSRLDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSRLDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSRLDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSRLDQri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHBWYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHBWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHBWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHBWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHBWrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHDQYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHDQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHQDQYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHQDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHQDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHQDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHQDQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHWDYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHWDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHWDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHWDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHWDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLBWYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLBWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLBWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLBWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLBWrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLDQYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLDQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLQDQYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLQDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLQDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLQDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLQDQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLWDYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLWDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLWDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLWDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLWDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPDYrri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPDZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPDZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPDZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPDrri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPSYrri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPSZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPSZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPSZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPSrri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPDYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPSYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPSrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPDYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPSYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPSrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "COM(P?)_FST0r",
+ "KMOV(B|D|Q|W)kr",
+ "UCOM_F(P?)r")>;
def SKXWriteResGroup4 : SchedWriteRes<[SKXPort6]> {
let Latency = 1;
@@ -672,907 +602,79 @@ def SKXWriteResGroup4 : SchedWriteRes<[SKXPort6]> {
}
def: InstRW<[SKXWriteResGroup4], (instregex "JMP(16|32|64)r")>;
-def SKXWriteResGroup5 : SchedWriteRes<[SKXPort01]> {
- let Latency = 1;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKXWriteResGroup5], (instregex "PABSBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PABSDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PABSWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PADDSBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PADDSWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PADDUSBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PADDUSWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PAVGBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PAVGWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PCMPEQBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PCMPEQDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PCMPEQQrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PCMPEQWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PCMPGTBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PCMPGTDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PCMPGTWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PMAXSBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PMAX(C?)SDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PMAXSWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PMAXUBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PMAXUDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PMAXUWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PMINSBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PMIN(C?)SDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PMINSWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PMINUBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PMINUDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PMINUWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PSIGNBrr128")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PSIGNDrr128")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PSIGNWrr128")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PSLLDri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PSLLQri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PSLLWri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PSRADri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PSRAWri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PSRLDri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PSRLQri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PSRLWri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PSUBSBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PSUBSWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PSUBUSBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PSUBUSWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSBYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSDYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSWYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSBYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSWYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSBYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSWYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGBYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGWYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQBYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQDYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQQYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQQrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQWYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPGTBYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPGTBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPGTDYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPGTDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPGTWYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPGTWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSBYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAX(C?)SDYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAX(C?)SDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAX(C?)SDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAX(C?)SDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAX(C?)SDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSWYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUBYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUDYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUWYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSBYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMIN(C?)SDYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMIN(C?)SDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMIN(C?)SDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMIN(C?)SDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMIN(C?)SDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSWYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUBYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUDYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUWYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPROLDZ128r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPROLDZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPROLDZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPROLQZ128r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPROLQZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPROLQZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPROLVDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPROLVDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPROLVDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPROLVQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPROLVQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPROLVQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPRORDZ128r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPRORDZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPRORDZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPRORQZ128r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPRORQZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPRORQZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPRORVDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPRORVDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPRORVDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPRORVQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPRORVQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPRORVQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSIGNBYrr256")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSIGNBrr128")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSIGNDYrr256")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSIGNDrr128")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSIGNWYrr256")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSIGNWrr128")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLDYri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLDZ128r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLDZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLDZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLDri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLQYri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLQZ128r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLQZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLQZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLQri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVDYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVQYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVQrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLWYri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLWZ128ri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLWZ256ri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLWZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLWri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRADYri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRADZ128r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRADZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRADZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRADri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAQZ128r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAQZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAQZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVDYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAWYri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAWZ128ri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAWZ256ri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAWZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAWri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLDYri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLDZ128r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLDZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLDZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLDri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLQYri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLQZ128r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLQZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLQZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLQri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVDYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVQYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVQrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLWYri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLWZ128ri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLWZ256ri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLWZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLWri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSBYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSWYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSBYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSWYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSWrr")>;
-
def SKXWriteResGroup6 : SchedWriteRes<[SKXPort05]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup6], (instregex "FINCSTP")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "FNOP")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_MOVQ64rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PABSBrr64")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PABSDrr64")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PABSWrr64")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PADDBirr")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PADDDirr")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PADDQirr")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PADDWirr")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PANDNirr")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PANDirr")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PORirr")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PSIGNBrr64")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PSIGNDrr64")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PSIGNWrr64")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PSUBBirr")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PSUBDirr")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PSUBQirr")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PSUBWirr")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PXORirr")>;
+def: InstRW<[SKXWriteResGroup6], (instrs FINCSTP, FNOP)>;
def SKXWriteResGroup7 : SchedWriteRes<[SKXPort06]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup7], (instregex "ADC(16|32|64)ri")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "ADC(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "ADC8rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "ADCX(32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "ADOX(32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "BT(16|32|64)ri8")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "BT(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "BTC(16|32|64)ri8")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "BTC(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "BTR(16|32|64)ri8")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "BTR(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "BTS(16|32|64)ri8")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "BTS(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CDQ")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CLAC")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CMOVAE(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CMOVB(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CMOVE(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CMOVG(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CMOVGE(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CMOVL(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CMOVLE(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CMOVNE(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CMOVNO(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CMOVNP(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CMOVNS(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CMOVO(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CMOVP(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CMOVS(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CQO")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JAE_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JAE_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JA_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JA_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JBE_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JBE_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JB_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JB_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JE_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JE_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JGE_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JGE_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JG_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JG_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JLE_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JLE_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JL_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JL_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JMP_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JMP_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JNE_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JNE_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JNO_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JNO_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JNP_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JNP_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JNS_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JNS_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JO_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JO_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JP_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JP_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JS_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JS_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "RORX(32|64)ri")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SAR(16|32|64)r1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SAR(16|32|64)ri")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SAR8r1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SAR8ri")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SARX(32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SBB(16|32|64)ri")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SBB(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SBB8rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SETAEr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SETBr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SETEr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SETGEr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SETGr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SETLEr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SETLr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SETNEr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SETNOr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SETNPr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SETNSr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SETOr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SETPr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SETSr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SHL(16|32|64)r1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SHL(16|32|64)ri")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SHL8r1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SHL8ri")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SHLX(32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SHR(16|32|64)r1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SHR(16|32|64)ri")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SHR8r1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SHR8ri")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SHRX(32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "STAC")>;
+def: InstRW<[SKXWriteResGroup7], (instrs CDQ, CQO, CLAC, STAC)>;
+def: InstRW<[SKXWriteResGroup7], (instregex "BT(16|32|64)ri8",
+ "BT(16|32|64)rr",
+ "BTC(16|32|64)ri8",
+ "BTC(16|32|64)rr",
+ "BTR(16|32|64)ri8",
+ "BTR(16|32|64)rr",
+ "BTS(16|32|64)ri8",
+ "BTS(16|32|64)rr")>;
def SKXWriteResGroup8 : SchedWriteRes<[SKXPort15]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup8], (instregex "ANDN(32|64)rr")>;
-def: InstRW<[SKXWriteResGroup8], (instregex "BLSI(32|64)rr")>;
-def: InstRW<[SKXWriteResGroup8], (instregex "BLSMSK(32|64)rr")>;
-def: InstRW<[SKXWriteResGroup8], (instregex "BLSR(32|64)rr")>;
-def: InstRW<[SKXWriteResGroup8], (instregex "BZHI(32|64)rr")>;
-def: InstRW<[SKXWriteResGroup8], (instregex "LEA(16|32|64)(_32)?r")>;
+def: InstRW<[SKXWriteResGroup8], (instregex "ANDN(32|64)rr",
+ "BLSI(32|64)rr",
+ "BLSMSK(32|64)rr",
+ "BLSR(32|64)rr")>;
def SKXWriteResGroup9 : SchedWriteRes<[SKXPort015]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup9], (instregex "ANDNPDrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "ANDNPSrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "ANDPDrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "ANDPSrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "BLENDPDrri")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "BLENDPSrri")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "MMX_MOVD64from64rr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "MOVAPDrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "MOVAPSrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "MOVDQArr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "MOVDQUrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "MOVPQI2QIrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "MOVSSrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "ORPDrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "ORPSrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "PADDBrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "PADDDrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "PADDQrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "PADDWrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "PANDNrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "PANDrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "PORrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "PSUBBrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "PSUBDrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "PSUBQrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "PSUBWrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "PXORrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPDYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPDrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPSYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPSrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDPDYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDPDrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDPSYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDPSrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDPDYrri")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDPDrri")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDPSYrri")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDPSrri")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPDYrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPDZ128rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPDZ256rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPDZrr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPDrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPSYrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPSrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA32Z128rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA32Z256rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA32Zrr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA64Z128rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA64Z256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA64Zrr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQAYrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQArr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU16Z128rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU16Z256rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU16Zrr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU32Z128rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU32Z256rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU32Zrr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU64Z128rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU64Z256rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU64Zrr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU8Z128rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU8Z256rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU8Zrr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQUYrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQUrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVPQI(2Q|Lo2PQ)IZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVPQI2QIrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVSSrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPDZ128rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPDZ256rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPDZrr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPSZ128rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPSZ256rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPSZrr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVZPQILo2PQIrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VORPDYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VORPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VORPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VORPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VORPDrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VORPSYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VORPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VORPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VORPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VORPSrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDBYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDBrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDDYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDDrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDQYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDQrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDWYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDWrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDDYrri")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDDrri")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPORDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPORDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPORDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPORQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPORQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPORQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPORYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPORrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBBYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBBrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBDYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBDrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBQYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBQrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBWYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBWrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPTERNLOGDZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPTERNLOGDZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPTERNLOGDZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPTERNLOGQZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPTERNLOGQZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPTERNLOGQZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPXORDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPXORDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPXORDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPXORQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPXORQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPXORQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPXORYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPXORrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VXORPDYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VXORPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VXORPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VXORPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VXORPDrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VXORPSYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VXORPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VXORPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VXORPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VXORPSrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "XORPDrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "XORPSrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPD(Z128|Z256)rr",
+ "VBLENDMPS(Z128|Z256)rr",
+ "VPADD(B|D|Q|W)(Y|Z|Z128|Z256)rr",
+ "(V?)PADD(B|D|Q|W)rr",
+ "VPBLENDD(Y?)rri",
+ "VPBLENDMB(Z128|Z256)rr",
+ "VPBLENDMD(Z128|Z256)rr",
+ "VPBLENDMQ(Z128|Z256)rr",
+ "VPBLENDMW(Z128|Z256)rr",
+ "VPSUB(B|D|Q|W)(Y|Z|Z128|Z256)rr",
+ "(V?)PSUB(B|D|Q|W)rr",
+ "VPTERNLOGD(Z|Z128|Z256)rri",
+ "VPTERNLOGQ(Z|Z128|Z256)rri")>;
def SKXWriteResGroup10 : SchedWriteRes<[SKXPort0156]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup10], (instregex "ADD(16|32|64)ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "ADD(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "ADD8i8")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "ADD8ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "ADD8rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "AND(16|32|64)ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "AND(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "AND8i8")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "AND8ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "AND8rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "CBW")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "CLC")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "CMC")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "CMP(16|32|64)ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "CMP(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "CMP8i8")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "CMP8ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "CMP8rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "CWDE")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "DEC(16|32|64)r")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "DEC8r")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "INC(16|32|64)r")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "INC8r")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "LAHF")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "MOV(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "MOV8ri(_alt)?")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "MOV8rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "MOVSX(16|32|64)rr16")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "MOVSX(16|32|64)rr32")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "MOVSX(16|32|64)rr8")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "MOVZX(16|32|64)rr16")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "MOVZX(16|32|64)rr8")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "NEG(16|32|64)r")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "NEG8r")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "NOOP")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "NOT(16|32|64)r")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "NOT8r")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "OR(16|32|64)ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "OR(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "OR8i8")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "OR8ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "OR8rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "SAHF")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "SGDT64m")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "SIDT64m")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "SLDT64m")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "SMSW16m")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "STC")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "STRm")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "SUB(16|32|64)ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "SUB(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "SUB8i8")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "SUB8ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "SUB8rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "SYSCALL")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "TEST(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "TEST8i8")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "TEST8ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "TEST8rr")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "XCHG(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "XOR(16|32|64)ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "XOR(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "XOR8i8")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "XOR8ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "XOR8rr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup10], (instrs CBW, CWDE, CDQE,
+ CMC, STC)>;
+def: InstRW<[SKXWriteResGroup10], (instregex "SGDT64m",
+ "SIDT64m",
+ "SMSW16m",
+ "STRm",
+ "SYSCALL")>;
def SKXWriteResGroup11 : SchedWriteRes<[SKXPort4,SKXPort237]> {
let Latency = 1;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup11], (instregex "FBSTPm")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "KMOVBmk")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "KMOVDmk")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "KMOVQmk")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "KMOVWmk")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MMX_MOVD64from64rm")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MMX_MOVD64mr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MMX_MOVNTQmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MMX_MOVQ64mr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOV(16|32|64)mr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOV8mi")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOV8mr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVAPDmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVAPSmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVDQAmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVDQUmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVHPDmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVHPSmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVLPDmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVLPSmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVNTDQmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVNTI_64mr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVNTImr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVNTPDmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVNTPSmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVPDI2DImr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVPQI2QImr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVPQIto64mr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVSDmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVSSmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVUPDmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVUPSmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "ST_FP32m")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "ST_FP64m")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "ST_FP80m")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTF128mr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTF32x4Z256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTF32x4Zmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTF32x8Zmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTF64x2Z256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTF64x2Zmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTF64x4Zmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTI128mr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTI32x4Z256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTI32x4Zmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTI32x8Zmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTI64x2Z256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTI64x2Zmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTI64x4Zmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPDYmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPDZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPDZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPDZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPDmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPSYmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPSZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPSZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPSZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPSmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQA32Z128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQA32Z256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQA32Zmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQA64Z128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQA64Z256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQA64Zmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQAYmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQAmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU16Z128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU16Z256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU16Zmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU32Z128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU32Z256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU32Zmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU64Z128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU64Z256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU64Zmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU8Z128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU8Z256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQUYmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQUmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVHPDZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVHPDmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVHPSZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVHPSmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVLPDZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVLPDmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVLPSZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVLPSmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTDQYmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTDQZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTDQZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTDQZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTDQmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPDYmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPDZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPDZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPDZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPDmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPSYmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPSZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPSZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPSZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPSmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVPDI2DIZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVPDI2DImr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVPQI(2QI|to64)Zmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVPQI2QImr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVPQIto64mr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVSDZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVSDmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVSSZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVSSmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPDYmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPDZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPDZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPDZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPDmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPSYmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPSZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPSZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPSZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPSmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMPTRSTm")>;
-
-def SKXWriteResGroup12 : SchedWriteRes<[SKXPort0]> {
- let Latency = 2;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKXWriteResGroup12], (instregex "COMISDrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "COMISSrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "MMX_MOVD64from64rr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "MMX_MOVD64grr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "MMX_PMOVMSKBrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "MOVMSKPDrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "MOVMSKPSrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "MOVPDI2DIrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "MOVPQIto64rr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "PMOVMSKBrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "UCOMISDrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "UCOMISSrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VCOMISDZrr(b?)")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VCOMISDrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VCOMISSZrr(b?)")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VCOMISSrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VMOVMSKPDYrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VMOVMSKPDrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VMOVMSKPSYrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VMOVMSKPSrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VMOVPDI2DIZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VMOVPDI2DIrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VMOVPQIto64Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VMOVPQIto64rr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VPMOVMSKBYrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VPMOVMSKBrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VTESTPDYrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VTESTPDrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VTESTPSYrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VTESTPSrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VUCOMISDZrr(b?)")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VUCOMISDrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VUCOMISSZrr(b?)")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VUCOMISSrr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "FBSTPm",
+ "KMOV(B|D|Q|W)mk",
+ "ST_FP(32|64|80)m",
+ "VMPTRSTm")>;

def SKXWriteResGroup13 : SchedWriteRes<[SKXPort5]> {
let Latency = 2;
@@ -1580,26 +682,13 @@ def SKXWriteResGroup13 : SchedWriteRes<[SKXPort5]> {
let ResourceCycles = [2];
}
def: InstRW<[SKXWriteResGroup13], (instregex "MMX_MOVQ2DQrr")>;
-def: InstRW<[SKXWriteResGroup13], (instregex "MMX_PINSRWirri")>;
-def: InstRW<[SKXWriteResGroup13], (instregex "PINSRBrr")>;
-def: InstRW<[SKXWriteResGroup13], (instregex "PINSRDrr")>;
-def: InstRW<[SKXWriteResGroup13], (instregex "PINSRQrr")>;
-def: InstRW<[SKXWriteResGroup13], (instregex "PINSRWrri")>;
-def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRBrr")>;
-def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRDrr")>;
-def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRQrr")>;
-def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRWrri")>;

def SKXWriteResGroup14 : SchedWriteRes<[SKXPort05]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def: InstRW<[SKXWriteResGroup14], (instregex "FDECSTP")>;
+def: InstRW<[SKXWriteResGroup14], (instrs FDECSTP)>;
def: InstRW<[SKXWriteResGroup14], (instregex "MMX_MOVDQ2Qrr")>;

def SKXWriteResGroup15 : SchedWriteRes<[SKXPort06]> {
@@ -1607,88 +696,20 @@ def SKXWriteResGroup15 : SchedWriteRes<[SKXPort06]> {
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def: InstRW<[SKXWriteResGroup15], (instregex "CMOVA(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup15], (instregex "CMOVBE(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup15], (instregex "ROL(16|32|64)r1")>;
-def: InstRW<[SKXWriteResGroup15], (instregex "ROL(16|32|64)ri")>;
-def: InstRW<[SKXWriteResGroup15], (instregex "ROL8r1")>;
-def: InstRW<[SKXWriteResGroup15], (instregex "ROL8ri")>;
-def: InstRW<[SKXWriteResGroup15], (instregex "ROR(16|32|64)r1")>;
-def: InstRW<[SKXWriteResGroup15], (instregex "ROR(16|32|64)ri")>;
-def: InstRW<[SKXWriteResGroup15], (instregex "ROR8r1")>;
-def: InstRW<[SKXWriteResGroup15], (instregex "ROR8ri")>;
-def: InstRW<[SKXWriteResGroup15], (instregex "SETAr")>;
-def: InstRW<[SKXWriteResGroup15], (instregex "SETBEr")>;
-
-def SKXWriteResGroup16 : SchedWriteRes<[SKXPort015]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[SKXWriteResGroup16], (instregex "BLENDVPDrr0")>;
-def: InstRW<[SKXWriteResGroup16], (instregex "BLENDVPSrr0")>;
-def: InstRW<[SKXWriteResGroup16], (instregex "PBLENDVBrr0")>;
-def: InstRW<[SKXWriteResGroup16], (instregex "VBLENDVPDYrr")>;
-def: InstRW<[SKXWriteResGroup16], (instregex "VBLENDVPDrr")>;
-def: InstRW<[SKXWriteResGroup16], (instregex "VBLENDVPSYrr")>;
-def: InstRW<[SKXWriteResGroup16], (instregex "VBLENDVPSrr")>;
-def: InstRW<[SKXWriteResGroup16], (instregex "VPBLENDVBYrr")>;
-def: InstRW<[SKXWriteResGroup16], (instregex "VPBLENDVBrr")>;
+def: InstRW<[SKXWriteResGroup15], (instregex "ROL(8|16|32|64)r1",
+ "ROL(8|16|32|64)ri",
+ "ROR(8|16|32|64)r1",
+ "ROR(8|16|32|64)ri",
+ "SET(A|BE)r")>;

def SKXWriteResGroup17 : SchedWriteRes<[SKXPort0156]> {
let Latency = 2;
let NumMicroOps = 2;
  let ResourceCycles = [2];
}
}
-def: InstRW<[SKXWriteResGroup17], (instregex "LFENCE")>;
-def: InstRW<[SKXWriteResGroup17], (instregex "WAIT")>;
-def: InstRW<[SKXWriteResGroup17], (instregex "XGETBV")>;
-
-def SKXWriteResGroup18 : SchedWriteRes<[SKXPort0,SKXPort237]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup18], (instregex "MMX_MASKMOVQ64")>;
-def: InstRW<[SKXWriteResGroup18], (instregex "VMASKMOVDQU")>;
-def: InstRW<[SKXWriteResGroup18], (instregex "VMASKMOVPDYmr")>;
-def: InstRW<[SKXWriteResGroup18], (instregex "VMASKMOVPDmr")>;
-def: InstRW<[SKXWriteResGroup18], (instregex "VMASKMOVPSYmr")>;
-def: InstRW<[SKXWriteResGroup18], (instregex "VMASKMOVPSmr")>;
-def: InstRW<[SKXWriteResGroup18], (instregex "VPMASKMOVDYmr")>;
-def: InstRW<[SKXWriteResGroup18], (instregex "VPMASKMOVDmr")>;
-def: InstRW<[SKXWriteResGroup18], (instregex "VPMASKMOVQYmr")>;
-def: InstRW<[SKXWriteResGroup18], (instregex "VPMASKMOVQmr")>;
-
-def SKXWriteResGroup19 : SchedWriteRes<[SKXPort5,SKXPort01]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup19], (instregex "PSLLDrr")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "PSLLQrr")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "PSLLWrr")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "PSRADrr")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "PSRAWrr")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "PSRLDrr")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "PSRLQrr")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "PSRLWrr")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSLLDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSLLDrr")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSLLQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSLLQrr")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSLLWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSLLWrr")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSRADZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSRADrr")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSRAQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSRAWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSRAWrr")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSRLDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSRLDrr")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSRLQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSRLQrr")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSRLWrr")>;
+def: InstRW<[SKXWriteResGroup17], (instrs LFENCE,
+ WAIT,
+ XGETBV)>;

def SKXWriteResGroup20 : SchedWriteRes<[SKXPort6,SKXPort0156]> {
let Latency = 2;
@@ -1702,77 +723,26 @@ def SKXWriteResGroup21 : SchedWriteRes<[SKXPort237,SKXPort0156]> {
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup21], (instregex "SFENCE")>;
-
-def SKXWriteResGroup22 : SchedWriteRes<[SKXPort06,SKXPort15]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup22], (instregex "BEXTR(32|64)rr")>;
-def: InstRW<[SKXWriteResGroup22], (instregex "BSWAP(16|32|64)r")>;
+def: InstRW<[SKXWriteResGroup21], (instrs SFENCE)>;

def SKXWriteResGroup23 : SchedWriteRes<[SKXPort06,SKXPort0156]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup23], (instregex "ADC8i8")>;
-def: InstRW<[SKXWriteResGroup23], (instregex "ADC8ri")>;
-def: InstRW<[SKXWriteResGroup23], (instregex "CWD")>;
-def: InstRW<[SKXWriteResGroup23], (instregex "JRCXZ")>;
-def: InstRW<[SKXWriteResGroup23], (instregex "SBB8i8")>;
-def: InstRW<[SKXWriteResGroup23], (instregex "SBB8ri")>;
-
-def SKXWriteResGroup24 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237]> {
- let Latency = 2;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SKXWriteResGroup24], (instregex "EXTRACTPSmr")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "PEXTRBmr")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "PEXTRDmr")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "PEXTRQmr")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "PEXTRWmr")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "STMXCSR")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "VEXTRACTPSZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "VEXTRACTPSmr")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRBZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRBmr")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRDZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRDmr")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRQZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRQmr")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRWZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRWmr")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "VSTMXCSR")>;
+def: InstRW<[SKXWriteResGroup23], (instrs CWD)>;
+def: InstRW<[SKXWriteResGroup23], (instrs JCXZ, JECXZ, JRCXZ)>;
+def: InstRW<[SKXWriteResGroup23], (instregex "ADC8i8",
+ "ADC8ri",
+ "SBB8i8",
+ "SBB8ri")>;

def SKXWriteResGroup25 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort237]> {
let Latency = 2;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup25], (instregex "FNSTCW16m")>;
-
-def SKXWriteResGroup26 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort06]> {
- let Latency = 2;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SKXWriteResGroup26], (instregex "SETAEm")>;
-def: InstRW<[SKXWriteResGroup26], (instregex "SETBm")>;
-def: InstRW<[SKXWriteResGroup26], (instregex "SETEm")>;
-def: InstRW<[SKXWriteResGroup26], (instregex "SETGEm")>;
-def: InstRW<[SKXWriteResGroup26], (instregex "SETGm")>;
-def: InstRW<[SKXWriteResGroup26], (instregex "SETLEm")>;
-def: InstRW<[SKXWriteResGroup26], (instregex "SETLm")>;
-def: InstRW<[SKXWriteResGroup26], (instregex "SETNEm")>;
-def: InstRW<[SKXWriteResGroup26], (instregex "SETNOm")>;
-def: InstRW<[SKXWriteResGroup26], (instregex "SETNPm")>;
-def: InstRW<[SKXWriteResGroup26], (instregex "SETNSm")>;
-def: InstRW<[SKXWriteResGroup26], (instregex "SETOm")>;
-def: InstRW<[SKXWriteResGroup26], (instregex "SETPm")>;
-def: InstRW<[SKXWriteResGroup26], (instregex "SETSm")>;
+def: InstRW<[SKXWriteResGroup25], (instrs FNSTCW16m)>;

def SKXWriteResGroup27 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort15]> {
let Latency = 2;
@@ -1786,497 +756,131 @@ def SKXWriteResGroup28 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort0156]> {
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup28], (instregex "PUSH(16|32|64)r(mr)?")>;
-def: InstRW<[SKXWriteResGroup28], (instregex "PUSH64i8")>;
-def: InstRW<[SKXWriteResGroup28], (instregex "STOSB")>;
-def: InstRW<[SKXWriteResGroup28], (instregex "STOSL")>;
-def: InstRW<[SKXWriteResGroup28], (instregex "STOSQ")>;
-def: InstRW<[SKXWriteResGroup28], (instregex "STOSW")>;
+def: InstRW<[SKXWriteResGroup28], (instrs PUSH16r, PUSH32r, PUSH64r,
+ STOSB, STOSL, STOSQ, STOSW)>;
+def: InstRW<[SKXWriteResGroup28], (instregex "PUSH(16|32|64)rmr",
+ "PUSH64i8")>;

def SKXWriteResGroup29 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort15]> {
let Latency = 2;
let NumMicroOps = 5;
let ResourceCycles = [2,2,1];
}
-def: InstRW<[SKXWriteResGroup29], (instregex "VMOVDQU8Zmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup29], (instregex "VMOVDQU8Zmr(b?)")>;

def SKXWriteResGroup30 : SchedWriteRes<[SKXPort0]> {
let Latency = 3;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup30], (instregex "KADDBrr")>;
-def: InstRW<[SKXWriteResGroup30], (instregex "KADDDrr")>;
-def: InstRW<[SKXWriteResGroup30], (instregex "KADDQrr")>;
-def: InstRW<[SKXWriteResGroup30], (instregex "KADDWrr")>;
-def: InstRW<[SKXWriteResGroup30], (instregex "KMOVBrk")>;
-def: InstRW<[SKXWriteResGroup30], (instregex "KMOVDrk")>;
-def: InstRW<[SKXWriteResGroup30], (instregex "KMOVQrk")>;
-def: InstRW<[SKXWriteResGroup30], (instregex "KMOVWrk")>;
-def: InstRW<[SKXWriteResGroup30], (instregex "KORTESTBrr")>;
-def: InstRW<[SKXWriteResGroup30], (instregex "KORTESTDrr")>;
-def: InstRW<[SKXWriteResGroup30], (instregex "KORTESTQrr")>;
-def: InstRW<[SKXWriteResGroup30], (instregex "KORTESTWrr")>;
-def: InstRW<[SKXWriteResGroup30], (instregex "KTESTBrr")>;
-def: InstRW<[SKXWriteResGroup30], (instregex "KTESTDrr")>;
-def: InstRW<[SKXWriteResGroup30], (instregex "KTESTQrr")>;
-def: InstRW<[SKXWriteResGroup30], (instregex "KTESTWrr")>;
+def: InstRW<[SKXWriteResGroup30], (instregex "KMOV(B|D|Q|W)rk",
+ "KORTEST(B|D|Q|W)rr",
+ "KTEST(B|D|Q|W)rr")>;

def SKXWriteResGroup31 : SchedWriteRes<[SKXPort1]> {
let Latency = 3;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup31], (instregex "BSF(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup31], (instregex "BSR(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup31], (instregex "IMUL64rr(i8)?")>;
-def: InstRW<[SKXWriteResGroup31], (instregex "IMUL8r")>;
-def: InstRW<[SKXWriteResGroup31], (instregex "LZCNT(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup31], (instregex "MUL8r")>;
-def: InstRW<[SKXWriteResGroup31], (instregex "PDEP(32|64)rr")>;
-def: InstRW<[SKXWriteResGroup31], (instregex "PEXT(32|64)rr")>;
-def: InstRW<[SKXWriteResGroup31], (instregex "POPCNT(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup31], (instregex "SHLD(16|32|64)rri8")>;
-def: InstRW<[SKXWriteResGroup31], (instregex "SHRD(16|32|64)rri8")>;
-def: InstRW<[SKXWriteResGroup31], (instregex "TZCNT(16|32|64)rr")>;
-
-def SKXWriteResGroup31_16 : SchedWriteRes<[SKXPort1, SKXPort0156]> {
- let Latency = 3;
+def: InstRW<[SKXWriteResGroup31], (instregex "PDEP(32|64)rr",
+ "PEXT(32|64)rr",
+ "SHLD(16|32|64)rri8",
+ "SHRD(16|32|64)rri8")>;
+
+def SKXWriteResGroup31_16i : SchedWriteRes<[SKXPort1, SKXPort0156]> {
+ let Latency = 4;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup31_16], (instregex "IMUL16rr(i8)?")>;
+def: InstRW<[SKXWriteResGroup31_16i], (instrs IMUL16rri, IMUL16rri8)>;

-def SKXWriteResGroup31_32 : SchedWriteRes<[SKXPort1]> {
- let Latency = 3;
- let NumMicroOps = 1;
-}
-def: InstRW<[SKXWriteResGroup31_32], (instregex "IMUL32rr(i8)?")>;

def SKXWriteResGroup32 : SchedWriteRes<[SKXPort5]> {
let Latency = 3;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup32], (instregex "ADD_FPrST0")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "ADD_FST0r")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "ADD_FrST0")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTLBri")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTLDri")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTLQri")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTLWri")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTRBri")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTRDri")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTRQri")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTRWri")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "KUNPCKBWrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "KUNPCKDQrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "KUNPCKWDrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "MMX_PSADBWirr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "PCMPGTQrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "PSADBWrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "SUBR_FPrST0")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "SUBR_FST0r")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "SUBR_FrST0")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "SUB_FPrST0")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "SUB_FST0r")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "SUB_FrST0")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VALIGNDZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VALIGNDZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VALIGNDZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VALIGNQZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VALIGNQZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VALIGNQZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTF32X2Z256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTF32X2Zr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTI32X2Z256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTI32X2Zr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTSDYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTSDZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTSDZr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTSSYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTSSZ128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTSSZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTSSZr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VCMPPDZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VCMPPDZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VCMPPDZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VCMPPSZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VCMPPSZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VCMPPSZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VCMPSDZrr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VCMPSSZrr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VDBPSADBWZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VDBPSADBWZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VDBPSADBWZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTF128rr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTF32x4Z256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTF32x4Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTF32x8Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTF64x2Z256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTF64x2Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTF64x4Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTI128rr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTI32x4Z256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTI32x4Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTI32x8Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTI64x2Z256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTI64x2Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTI64x4Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSSDrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSSSrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTF128rr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTF32x4Z256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTF32x4Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTF32x8Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTF64x2Z256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTF64x2Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTF64x4Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTI128rr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTI32x4Z256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTI32x4Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTI32x8Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTI64x2Z256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTI64x2Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTI64x4Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTBYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTBZ128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTBZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTBZr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTBrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTDYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTDZ128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTDZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTDZr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTDrZ128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTDrZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTDrZr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTQYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTQZ128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTQZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTQZr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTQrZ128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTQrZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTQrZr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTWYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTWZ128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTWZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTWZr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTWrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPBZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPBZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPBZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPDZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPDZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPDZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTQYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTQrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPQZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPQZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPQZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUBZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUBZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUBZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUDZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUDZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUDZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUQZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUQZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUQZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUWZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUWZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUWZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPWZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPWZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPWZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERM2F128rr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERM2I128rr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMDYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2D128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2D256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2Drr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2PD128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2PD256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2PDrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2PS128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2PS256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2PSrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2Q128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2Q256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2Qrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPDYri")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPDZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPDZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPSYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMQYri")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMQZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMQZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2D128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2D256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2Drr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2PD128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2PD256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2PDrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2PS128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2PS256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2PSrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2Q128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2Q256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2Qrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMAXSQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMAXSQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMAXSQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMAXUQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMAXUQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMAXUQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMINSQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMINSQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMINSQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMINUQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMINUQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMINUQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVQDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVQDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVQDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBDYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBQYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBWYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXDQYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWDYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWQYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBDYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBQYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBWYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXDQYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWDYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWQYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPSADBWYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPSADBWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPSADBWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPSADBWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPSADBWrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VSHUFF32X4Z256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VSHUFF32X4Zrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VSHUFF64X2Z256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VSHUFF64X2Zrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VSHUFI32X4Z256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VSHUFI32X4Zrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VSHUFI64X2Z256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VSHUFI64X2Zrri(b?)(k?)(z?)")>;
-
-def SKXWriteResGroup33 : SchedWriteRes<[SKXPort0,SKXPort5]> {
- let Latency = 3;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup33], (instregex "EXTRACTPSrr")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "MMX_PEXTRWirri")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "PEXTRBrr")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "PEXTRDrr")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "PEXTRQrr")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "PEXTRWri")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "PEXTRWrr_REV")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "PTESTrr")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "VEXTRACTPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "VEXTRACTPSrr")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRBrr")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRDrr")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRQrr")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRWZrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRWri")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRWrr_REV")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "VPTESTYrr")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "VPTESTrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "(ADD|SUB|SUBR)_(FPrST0|FST0r|FrST0)",
+ "KADD(B|D|Q|W)rr",
+ "KSHIFTL(B|D|Q|W)ri",
+ "KSHIFTR(B|D|Q|W)ri",
+ "KUNPCKBWrr",
+ "KUNPCKDQrr",
+ "KUNPCKWDrr",
+ "VALIGND(Z|Z128|Z256)rri",
+ "VALIGNQ(Z|Z128|Z256)rri",
+ "VCMPPD(Z|Z128|Z256)rri",
+ "VCMPPS(Z|Z128|Z256)rri",
+ "VCMPSDZrr",
+ "VCMPSSZrr",
+ "VDBPSADBWZrri", // TODO: 512-bit ops require ports 0/1 to be joined.
+ "VFPCLASSPD(Z|Z128|Z256)rr",
+ "VFPCLASSPS(Z|Z128|Z256)rr",
+ "VFPCLASSSDZrr",
+ "VFPCLASSSSZrr",
+ "VPBROADCASTBrr",
+ "VPBROADCASTWrr",
+ "VPCMPB(Z|Z128|Z256)rri",
+ "VPCMPD(Z|Z128|Z256)rri",
+ "VPCMPEQ(B|D|Q|W)(Z|Z128|Z256)rr",
+ "VPCMPGT(B|D|Q|W)(Z|Z128|Z256)rr",
+ "(V?)PCMPGTQ(Y?)rr",
+ "VPCMPQ(Z|Z128|Z256)rri",
+ "VPCMPU(B|D|Q|W)(Z|Z128|Z256)rri",
+ "VPCMPW(Z|Z128|Z256)rri",
+ "VP(MAX|MIN)(S|U)Q(Z|Z128|Z256)rr",
+ "VPSADBWZrr", // TODO: 512-bit ops require ports 0/1 to be joined.
+ "VPTEST(N?)M(B|D|Q|W)(Z|Z128|Z256)rr")>;

def SKXWriteResGroup34 : SchedWriteRes<[SKXPort0,SKXPort0156]> {
let Latency = 3;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup34], (instregex "FNSTSW16r")>;
+def: InstRW<[SKXWriteResGroup34], (instrs FNSTSW16r)>;

def SKXWriteResGroup35 : SchedWriteRes<[SKXPort06]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [3];
}
-def: InstRW<[SKXWriteResGroup35], (instregex "ROL(16|32|64)rCL")>;
-def: InstRW<[SKXWriteResGroup35], (instregex "ROL8rCL")>;
-def: InstRW<[SKXWriteResGroup35], (instregex "ROR(16|32|64)rCL")>;
-def: InstRW<[SKXWriteResGroup35], (instregex "ROR8rCL")>;
-def: InstRW<[SKXWriteResGroup35], (instregex "SAR(16|32|64)rCL")>;
-def: InstRW<[SKXWriteResGroup35], (instregex "SAR8rCL")>;
-def: InstRW<[SKXWriteResGroup35], (instregex "SHL(16|32|64)rCL")>;
-def: InstRW<[SKXWriteResGroup35], (instregex "SHL8rCL")>;
-def: InstRW<[SKXWriteResGroup35], (instregex "SHR(16|32|64)rCL")>;
-def: InstRW<[SKXWriteResGroup35], (instregex "SHR8rCL")>;
+def: InstRW<[SKXWriteResGroup35], (instregex "ROL(8|16|32|64)rCL",
+ "ROR(8|16|32|64)rCL",
+ "SAR(8|16|32|64)rCL",
+ "SHL(8|16|32|64)rCL",
+ "SHR(8|16|32|64)rCL")>;

def SKXWriteResGroup36 : SchedWriteRes<[SKXPort0156]> {
- let Latency = 3;
+ let Latency = 2;
let NumMicroOps = 3;
let ResourceCycles = [3];
}
-def: InstRW<[SKXWriteResGroup36], (instregex "XADD(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup36], (instregex "XADD8rr")>;
-def: InstRW<[SKXWriteResGroup36], (instregex "XCHG8rr")>;
+def: InstRW<[SKXWriteResGroup36], (instrs XADD8rr, XADD16rr, XADD32rr, XADD64rr,
+ XCHG8rr, XCHG16rr, XCHG32rr, XCHG64rr,
+ XCHG16ar, XCHG32ar, XCHG64ar)>;

def SKXWriteResGroup37 : SchedWriteRes<[SKXPort0,SKXPort5]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[SKXWriteResGroup37], (instregex "MMX_PHADDSWrr64")>;
-def: InstRW<[SKXWriteResGroup37], (instregex "MMX_PHSUBSWrr64")>;
+def: InstRW<[SKXWriteResGroup37], (instregex "MMX_PH(ADD|SUB)SWrr")>;

def SKXWriteResGroup38 : SchedWriteRes<[SKXPort5,SKXPort01]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKXWriteResGroup38], (instregex "PHADDSWrr128")>;
-def: InstRW<[SKXWriteResGroup38], (instregex "PHSUBSWrr128")>;
-def: InstRW<[SKXWriteResGroup38], (instregex "VPHADDSWrr128")>;
-def: InstRW<[SKXWriteResGroup38], (instregex "VPHADDSWrr256")>;
-def: InstRW<[SKXWriteResGroup38], (instregex "VPHSUBSWrr128")>;
-def: InstRW<[SKXWriteResGroup38], (instregex "VPHSUBSWrr256")>;
-
-def SKXWriteResGroup39 : SchedWriteRes<[SKXPort5,SKXPort05]> {
- let Latency = 3;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[SKXWriteResGroup39], (instregex "MMX_PHADDWrr64")>;
-def: InstRW<[SKXWriteResGroup39], (instregex "MMX_PHADDrr64")>;
-def: InstRW<[SKXWriteResGroup39], (instregex "MMX_PHSUBDrr64")>;
-def: InstRW<[SKXWriteResGroup39], (instregex "MMX_PHSUBWrr64")>;
-
-def SKXWriteResGroup40 : SchedWriteRes<[SKXPort5,SKXPort015]> {
- let Latency = 3;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[SKXWriteResGroup40], (instregex "PHADDDrr")>;
-def: InstRW<[SKXWriteResGroup40], (instregex "PHADDWrr")>;
-def: InstRW<[SKXWriteResGroup40], (instregex "PHSUBDrr")>;
-def: InstRW<[SKXWriteResGroup40], (instregex "PHSUBWrr")>;
-def: InstRW<[SKXWriteResGroup40], (instregex "VPHADDDYrr")>;
-def: InstRW<[SKXWriteResGroup40], (instregex "VPHADDDrr")>;
-def: InstRW<[SKXWriteResGroup40], (instregex "VPHADDWYrr")>;
-def: InstRW<[SKXWriteResGroup40], (instregex "VPHADDWrr")>;
-def: InstRW<[SKXWriteResGroup40], (instregex "VPHSUBDYrr")>;
-def: InstRW<[SKXWriteResGroup40], (instregex "VPHSUBDrr")>;
-def: InstRW<[SKXWriteResGroup40], (instregex "VPHSUBWYrr")>;
-def: InstRW<[SKXWriteResGroup40], (instregex "VPHSUBWrr")>;
+def: InstRW<[SKXWriteResGroup38], (instregex "(V?)PH(ADD|SUB)SW(Y?)rr")>;

def SKXWriteResGroup41 : SchedWriteRes<[SKXPort5,SKXPort0156]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKXWriteResGroup41], (instregex "MMX_PACKSSDWirr")>;
-def: InstRW<[SKXWriteResGroup41], (instregex "MMX_PACKSSWBirr")>;
-def: InstRW<[SKXWriteResGroup41], (instregex "MMX_PACKUSWBirr")>;
+def: InstRW<[SKXWriteResGroup41], (instregex "MMX_PACKSSDWirr",
+ "MMX_PACKSSWBirr",
+ "MMX_PACKUSWBirr")>;

def SKXWriteResGroup42 : SchedWriteRes<[SKXPort6,SKXPort0156]> {
let Latency = 3;
@@ -2290,36 +894,31 @@ def SKXWriteResGroup43 : SchedWriteRes<[SKXPort237,SKXPort0156]> {
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[SKXWriteResGroup43], (instregex "MFENCE")>;
+def: InstRW<[SKXWriteResGroup43], (instrs MFENCE)>;

def SKXWriteResGroup44 : SchedWriteRes<[SKXPort06,SKXPort0156]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[SKXWriteResGroup44], (instregex "RCL(16|32|64)r1")>;
-def: InstRW<[SKXWriteResGroup44], (instregex "RCL(16|32|64)ri")>;
-def: InstRW<[SKXWriteResGroup44], (instregex "RCL8r1")>;
-def: InstRW<[SKXWriteResGroup44], (instregex "RCL8ri")>;
-def: InstRW<[SKXWriteResGroup44], (instregex "RCR(16|32|64)r1")>;
-def: InstRW<[SKXWriteResGroup44], (instregex "RCR(16|32|64)ri")>;
-def: InstRW<[SKXWriteResGroup44], (instregex "RCR8r1")>;
-def: InstRW<[SKXWriteResGroup44], (instregex "RCR8ri")>;
+def: InstRW<[SKXWriteResGroup44], (instregex "RCL(8|16|32|64)r1",
+ "RCL(8|16|32|64)ri",
+ "RCR(8|16|32|64)r1",
+ "RCR(8|16|32|64)ri")>;

def SKXWriteResGroup45 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup45], (instregex "FNSTSWm")>;
+def: InstRW<[SKXWriteResGroup45], (instrs FNSTSWm)>;

def SKXWriteResGroup46 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort06]> {
let Latency = 3;
let NumMicroOps = 4;
let ResourceCycles = [1,1,2];
}
-def: InstRW<[SKXWriteResGroup46], (instregex "SETAm")>;
-def: InstRW<[SKXWriteResGroup46], (instregex "SETBEm")>;
+def: InstRW<[SKXWriteResGroup46], (instregex "SET(A|BE)m")>;

def SKXWriteResGroup47 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort237,SKXPort0156]> {
let Latency = 3;
@@ -2333,474 +932,116 @@ def SKXWriteResGroup48 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort06,SKXPort015
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[SKXWriteResGroup48], (instregex "CALL64pcrel32")>;
+def: InstRW<[SKXWriteResGroup48], (instrs CALL64pcrel32)>;

def SKXWriteResGroup49 : SchedWriteRes<[SKXPort0]> {
let Latency = 4;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup49], (instregex "AESDECLASTrr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "AESDECrr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "AESENCLASTrr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "AESENCrr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "MMX_PMADDUBSWrr64")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "MMX_PMADDWDirr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "MMX_PMULHRSWrr64")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "MMX_PMULHUWirr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "MMX_PMULHWirr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "MMX_PMULLWirr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "MMX_PMULUDQirr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "MUL_FPrST0")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "MUL_FST0r")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "MUL_FrST0")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "RCPPSr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "RCPSSr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "RSQRTPSr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "RSQRTSSr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VAESDECLASTrr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VAESDECrr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VAESENCLASTrr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VAESENCrr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRCP14PDZ128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRCP14PDZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRCP14PSZ128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRCP14PSZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRCP14SDrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRCP14SSrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRCPPSYr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRCPPSr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRCPSSr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRT14PDZ128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRT14PDZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRT14PSZ128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRT14PSZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRT14SDrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRT14SSrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRTPSYr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRTPSr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRTSSr")>;
-
-def SKXWriteResGroup50 : SchedWriteRes<[SKXPort015]> {
+def: InstRW<[SKXWriteResGroup49], (instregex "MUL_(FPrST0|FST0r|FrST0)")>;
+
+def SKXWriteResGroup50 : SchedWriteRes<[SKXPort01]> {
+ let Latency = 4;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTDQ2PS(Y|Z128|Z256)rr",
+ "(V?)CVTDQ2PSrr",
+ "VCVTPD2QQ(Z128|Z256)rr",
+ "VCVTPD2UQQ(Z128|Z256)rr",
+ "VCVTPS2DQ(Y|Z128|Z256)rr",
+ "(V?)CVTPS2DQrr",
+ "VCVTPS2UDQ(Z128|Z256)rr",
+ "VCVTQQ2PD(Z128|Z256)rr",
+ "VCVTTPD2QQ(Z128|Z256)rr",
+ "VCVTTPD2UQQ(Z128|Z256)rr",
+ "VCVTTPS2DQ(Z128|Z256)rr",
+ "(V?)CVTTPS2DQrr",
+ "VCVTTPS2UDQ(Z128|Z256)rr",
+ "VCVTUDQ2PS(Z128|Z256)rr",
+ "VCVTUQQ2PD(Z128|Z256)rr")>;
+
+def SKXWriteResGroup50z : SchedWriteRes<[SKXPort05]> {
let Latency = 4;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup50], (instregex "ADDPDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "ADDPSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "ADDSDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "ADDSSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "ADDSUBPDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "ADDSUBPSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "CMPPDrri")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "CMPPSrri")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "CMPSDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "CMPSSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "CVTDQ2PSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "CVTPS2DQrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "CVTTPS2DQrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "MAX(C?)PDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "MAX(C?)PSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "MAX(C?)SDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "MAX(C?)SSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "MIN(C?)PDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "MIN(C?)PSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "MIN(C?)SDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "MIN(C?)SSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "MULPDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "MULPSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "MULSDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "MULSSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "PHMINPOSUWrr128")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "PMADDUBSWrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "PMADDWDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "PMULDQrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "PMULHRSWrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "PMULHUWrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "PMULHWrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "PMULLWrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "PMULUDQrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "SUBPDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "SUBPSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "SUBSDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "SUBSSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDPDYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDPDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDPSYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDPSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDSDZrr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDSDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDSSZrr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDSSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDSUBPDYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDSUBPDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDSUBPSYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDSUBPSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCMPPDYrri")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCMPPDrri")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCMPPSYrri")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCMPPSrri")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCMPSDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCMPSSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTDQ2PSYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTDQ2PSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTDQ2PSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTDQ2PSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTDQ2PSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPD2QQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPD2QQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPD2QQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPD2UQQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPD2UQQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPD2UQQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2DQYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2DQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2DQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2DQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2DQrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2UDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2UDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2UDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTQQ2PDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTQQ2PDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTQQ2PDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPD2QQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPD2QQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPD2QQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPD2UQQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPD2UQQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPD2UQQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2DQYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2DQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2DQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2DQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2DQrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2UDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2UDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2UDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTUDQ2PSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTUDQ2PSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTUDQ2PSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTUQQ2PDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTUQQ2PDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTUQQ2PDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VFIXUPIMMPDZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VFIXUPIMMPDZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VFIXUPIMMPDZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VFIXUPIMMPSZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VFIXUPIMMPSZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VFIXUPIMMPSZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VFIXUPIMMSDrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VFIXUPIMMSSrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50],
- (instregex
- "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Yr",
- "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Z128r(b?)(k?)(z?)",
- "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Z256r(b?)(k?)(z?)",
- "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Zr(b?)(k?)(z?)",
- "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)r",
- "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)Zr(b?)(_Int)?(k?)(z?)",
- "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)r")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPPDZ128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPPDZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPPDr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPPSZ128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPPSZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPPSr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPSDr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPSSr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTPDZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTPDZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTPDZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTPSZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTPSZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTPSZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTSDZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTSSZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PDYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PSYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)SDZrr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)SDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)SSZrr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)SSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PDYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PSYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)SDZrr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)SDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)SSZrr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)SSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMULPDYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMULPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMULPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMULPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMULPDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMULPSYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMULPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMULPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMULPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMULPSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMULSDZrr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMULSDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMULSSZrr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMULSSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPHMINPOSUWrr128")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPLZCNTDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPLZCNTDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPLZCNTDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPLZCNTQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPLZCNTQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPLZCNTQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDUBSWYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDUBSWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDUBSWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDUBSWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDUBSWrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDWDYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDWDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDWDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDWDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDWDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULDQYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULDQrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHRSWYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHRSWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHRSWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHRSWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHRSWrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHUWYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHUWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHUWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHUWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHUWrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHWYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHWrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULLWYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULLWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULLWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULLWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULLWrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULUDQYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULUDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULUDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULUDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULUDQrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VRANGEPDZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VRANGEPDZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VRANGEPDZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VRANGEPSZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VRANGEPSZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VRANGEPSZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VRANGESDZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VRANGESSZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCEPDZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCEPDZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCEPDZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCEPSZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCEPSZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCEPSZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCESDZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCESSZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFSDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFSSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPDYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPSYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSUBSDZrr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSUBSDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSUBSSZrr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSUBSSrr")>;
+def: InstRW<[SKXWriteResGroup50z], (instrs VCVTDQ2PSZrr,
+ VCVTPD2QQZrr,
+ VCVTPD2UQQZrr,
+ VCVTPS2DQZrr,
+ VCVTPS2UDQZrr,
+ VCVTQQ2PDZrr,
+ VCVTTPD2QQZrr,
+ VCVTTPD2UQQZrr,
+ VCVTTPS2DQZrr,
+ VCVTTPS2UDQZrr,
+ VCVTUDQ2PSZrr,
+ VCVTUQQ2PDZrr)>;
def SKXWriteResGroup51 : SchedWriteRes<[SKXPort5]> {
let Latency = 4;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def: InstRW<[SKXWriteResGroup51], (instregex "MPSADBWrri")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VEXPANDPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VEXPANDPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VEXPANDPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VEXPANDPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VEXPANDPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VEXPANDPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VMPSADBWYrri")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VMPSADBWrri")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPEXPANDDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPEXPANDDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPEXPANDDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPEXPANDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPEXPANDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPEXPANDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVDBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVDBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVDBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVDWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVDWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVDWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVQBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVQBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVQBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVQWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVQWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVQWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSDBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSDBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSDBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSDWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSDWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSDWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSWBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSWBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSWBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSDBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSDBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSDBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSDWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSDWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSDWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSWBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSWBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSWBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVWBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVWBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVWBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VEXPANDPD(Z|Z128|Z256)rr",
+ "VEXPANDPS(Z|Z128|Z256)rr",
+ "VPEXPANDD(Z|Z128|Z256)rr",
+ "VPEXPANDQ(Z|Z128|Z256)rr",
+ "VPMOVDB(Z|Z128|Z256)rr",
+ "VPMOVDW(Z|Z128|Z256)rr",
+ "VPMOVQB(Z|Z128|Z256)rr",
+ "VPMOVQW(Z|Z128|Z256)rr",
+ "VPMOVSDB(Z|Z128|Z256)rr",
+ "VPMOVSDW(Z|Z128|Z256)rr",
+ "VPMOVSQB(Z|Z128|Z256)rr",
+ "VPMOVSQD(Z|Z128|Z256)rr",
+ "VPMOVSQW(Z|Z128|Z256)rr",
+ "VPMOVSWB(Z|Z128|Z256)rr",
+ "VPMOVUSDB(Z|Z128|Z256)rr",
+ "VPMOVUSDW(Z|Z128|Z256)rr",
+ "VPMOVUSQB(Z|Z128|Z256)rr",
+ "VPMOVUSQD(Z|Z128|Z256)rr",
+ "VPMOVUSWB(Z|Z128|Z256)rr",
+ "VPMOVWB(Z|Z128|Z256)rr")>;
def SKXWriteResGroup52 : SchedWriteRes<[SKXPort1,SKXPort5]> {
let Latency = 4;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup52], (instregex "IMUL(32|64)r")>;
-def: InstRW<[SKXWriteResGroup52], (instregex "MUL(32|64)r")>;
-def: InstRW<[SKXWriteResGroup52], (instregex "MULX64rr")>;
+def: InstRW<[SKXWriteResGroup52], (instrs IMUL64r, MUL64r, MULX64rr)>;
def SKXWriteResGroup52_16 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> {
let Latency = 4;
let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
}
-def: InstRW<[SKXWriteResGroup52_16], (instregex "IMUL16r")>;
-def: InstRW<[SKXWriteResGroup52_16], (instregex "MUL16r")>;
-
-def SKXWriteResGroup53 : SchedWriteRes<[SKXPort5,SKXPort01]> {
- let Latency = 4;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLDYrr")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLQYrr")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLWYrr")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRADYrr")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRADZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRADZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRAQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRAQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRAWYrr")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRAWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRAWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLDYrr")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLQYrr")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLWYrr")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup52_16], (instrs IMUL16r, MUL16r)>;
def SKXWriteResGroup54 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237]> {
let Latency = 4;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup54], (instregex "ISTT_FP16m")>;
-def: InstRW<[SKXWriteResGroup54], (instregex "ISTT_FP32m")>;
-def: InstRW<[SKXWriteResGroup54], (instregex "ISTT_FP64m")>;
-def: InstRW<[SKXWriteResGroup54], (instregex "IST_F16m")>;
-def: InstRW<[SKXWriteResGroup54], (instregex "IST_F32m")>;
-def: InstRW<[SKXWriteResGroup54], (instregex "IST_FP16m")>;
-def: InstRW<[SKXWriteResGroup54], (instregex "IST_FP32m")>;
-def: InstRW<[SKXWriteResGroup54], (instregex "IST_FP64m")>;
-def: InstRW<[SKXWriteResGroup54], (instregex "VPMOVQDZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup54], (instregex "VPMOVQDZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup54], (instregex "VPMOVQDZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup54], (instregex "IST(T?)_FP(16|32|64)m",
+ "IST_F(16|32)m",
+ "VPMOVQD(Z|Z128|Z256)mr(b?)")>;
def SKXWriteResGroup55 : SchedWriteRes<[SKXPort0156]> {
let Latency = 4;
let NumMicroOps = 4;
let ResourceCycles = [4];
}
-def: InstRW<[SKXWriteResGroup55], (instregex "FNCLEX")>;
+def: InstRW<[SKXWriteResGroup55], (instrs FNCLEX)>;
def SKXWriteResGroup56 : SchedWriteRes<[SKXPort015,SKXPort0156]> {
let Latency = 4;
let NumMicroOps = 4;
let ResourceCycles = [1,3];
}
-def: InstRW<[SKXWriteResGroup56], (instregex "VZEROUPPER")>;
+def: InstRW<[SKXWriteResGroup56], (instrs VZEROUPPER)>;
def SKXWriteResGroup57 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort0156]> {
let Latency = 4;
@@ -2814,109 +1055,53 @@ def SKXWriteResGroup58 : SchedWriteRes<[SKXPort23]> {
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup58], (instregex "MMX_MOVD64from64rm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MMX_MOVD64rm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MMX_MOVD64to64rm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MMX_MOVQ64rm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MOV(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MOV64toPQIrm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MOV8rm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MOVDDUPrm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MOVDI2PDIrm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MOVQI2PQIrm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MOVSDrm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MOVSSrm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MOVSX(16|32|64)rm16")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MOVSX(16|32|64)rm32")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MOVSX(16|32|64)rm8")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MOVZX(16|32|64)rm16")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MOVZX(16|32|64)rm8")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "PREFETCHNTA")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "PREFETCHT0")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "PREFETCHT1")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "PREFETCHT2")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "VMOV64toPQIrm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "VMOVDDUPrm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "VMOVDI2PDIrm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "VMOVQI2PQIrm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "VMOVSDrm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "VMOVSSrm")>;
-
-def SKXWriteResGroup59 : SchedWriteRes<[SKXPort015]> {
- let Latency = 5;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[SKXWriteResGroup59], (instregex "VCVTSD2SSZrr(b?)(_Int)?(k?)(z?)")>;
-
-def SKXWriteResGroup60 : SchedWriteRes<[SKXPort0,SKXPort5]> {
- let Latency = 5;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup60], (instregex "CVTDQ2PDrr")>;
-def: InstRW<[SKXWriteResGroup60], (instregex "MMX_CVTPI2PDirr")>;
-def: InstRW<[SKXWriteResGroup60], (instregex "VCVTDQ2PDrr")>;
+def: InstRW<[SKXWriteResGroup58], (instregex "MOVSX(16|32|64)rm16",
+ "MOVSX(16|32|64)rm32",
+ "MOVSX(16|32|64)rm8",
+ "MOVZX(16|32|64)rm16",
+ "MOVZX(16|32|64)rm8",
+ "(V?)MOVDDUPrm")>; // TODO: Should this be SKXWriteResGroup71?
def SKXWriteResGroup61 : SchedWriteRes<[SKXPort5,SKXPort015]> {
let Latency = 5;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup61], (instregex "CVTPD2DQrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "CVTPD2PSrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "CVTPS2PDrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "CVTSD2SSrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "CVTSI642SDrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "CVTSI2SDrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "CVTSI2SSrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "CVTSS2SDrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "CVTTPD2DQrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "MMX_CVTPD2PIirr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "MMX_CVTPS2PIirr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "MMX_CVTTPD2PIirr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "MMX_CVTTPS2PIirr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTDQ2PDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPD2DQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPD2DQrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPD2PSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPD2PSrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPD2UDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPH2PSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPH2PSrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPS2PDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPS2PDrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPS2PHZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPS2PHrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPS2QQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPS2UQQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTQQ2PSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSD2SSrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSI642SDrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSI2SDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSI2SDrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSI2SSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSI2SSrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSI642SDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSS2SDZrr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSS2SDrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTTPD2DQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTTPD2DQrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTTPD2UDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTTPS2QQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTTPS2UQQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTUDQ2PDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTUQQ2PSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTUSI2SDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTUSI2SSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTUSI642SDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "MMX_CVT(T?)PD2PIirr",
+ "MMX_CVT(T?)PS2PIirr",
+ "VCVTDQ2PDZ128rr",
+ "VCVTPD2DQZ128rr",
+ "(V?)CVT(T?)PD2DQrr",
+ "VCVTPD2PSZ128rr",
+ "(V?)CVTPD2PSrr",
+ "VCVTPD2UDQZ128rr",
+ "VCVTPS2PDZ128rr",
+ "(V?)CVTPS2PDrr",
+ "VCVTPS2QQZ128rr",
+ "VCVTPS2UQQZ128rr",
+ "VCVTQQ2PSZ128rr",
+ "(V?)CVTSD2SS(Z?)rr",
+ "(V?)CVTSI(64)?2SDrr",
+ "VCVTSI2SSZrr",
+ "(V?)CVTSI2SSrr",
+ "VCVTSI(64)?2SDZrr",
+ "VCVTSS2SDZrr",
+ "(V?)CVTSS2SDrr",
+ "VCVTTPD2DQZ128rr",
+ "VCVTTPD2UDQZ128rr",
+ "VCVTTPS2QQZ128rr",
+ "VCVTTPS2UQQZ128rr",
+ "VCVTUDQ2PDZ128rr",
+ "VCVTUQQ2PSZ128rr",
+ "VCVTUSI2SSZrr",
+ "VCVTUSI(64)?2SDZrr")>;
def SKXWriteResGroup62 : SchedWriteRes<[SKXPort5,SKXPort015]> {
let Latency = 5;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKXWriteResGroup62], (instregex "VPCONFLICTQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup62], (instregex "VPCONFLICTQZ128rr")>;
def SKXWriteResGroup63 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort06]> {
let Latency = 5;
@@ -2926,426 +1111,172 @@ def SKXWriteResGroup63 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort06]> {
def: InstRW<[SKXWriteResGroup63], (instregex "STR(16|32|64)r")>;
def SKXWriteResGroup64 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> {
- let Latency = 5;
+ let Latency = 4;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup64], (instregex "MULX32rr")>;
+def: InstRW<[SKXWriteResGroup64], (instrs IMUL32r, MUL32r, MULX32rr)>;
def SKXWriteResGroup65 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort015]> {
let Latency = 5;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup65], (instregex "VCVTPS2PHZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup65], (instregex "VCVTPS2PHZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup65], (instregex "VCVTPS2PHZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup65], (instregex "VCVTPS2PHZ128mr(b?)",
+ "VCVTPS2PHZ256mr(b?)",
+ "VCVTPS2PHZmr(b?)")>;
def SKXWriteResGroup66 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237]> {
let Latency = 5;
let NumMicroOps = 4;
let ResourceCycles = [1,2,1];
}
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVDBZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVDBZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVDBZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVDWZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVDWZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVDWZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVQBZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVQBZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVQBZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVQWZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVQWZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVQWZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSDBZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSDBZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSDBZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSDWZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSDWZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSDWZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQBZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQBZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQBZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQDZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQDZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQDZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQWZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQWZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQWZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSWBZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSWBZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSWBZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSDBZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSDBZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSDBZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSDWZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSDWZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSDWZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQBZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQBZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQBZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQDZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQDZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQDZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQWZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQWZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQWZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSWBZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSWBZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSWBZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVWBZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVWBZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVWBZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVDB(Z|Z128|Z256)mr(b?)",
+ "VPMOVDW(Z|Z128|Z256)mr(b?)",
+ "VPMOVQB(Z|Z128|Z256)mr(b?)",
+ "VPMOVQW(Z|Z128|Z256)mr(b?)",
+ "VPMOVSDB(Z|Z128|Z256)mr(b?)",
+ "VPMOVSDW(Z|Z128|Z256)mr(b?)",
+ "VPMOVSQB(Z|Z128|Z256)mr(b?)",
+ "VPMOVSQD(Z|Z128|Z256)mr(b?)",
+ "VPMOVSQW(Z|Z128|Z256)mr(b?)",
+ "VPMOVSWB(Z|Z128|Z256)mr(b?)",
+ "VPMOVUSDB(Z|Z128|Z256)mr(b?)",
+ "VPMOVUSDW(Z|Z128|Z256)mr(b?)",
+ "VPMOVUSQB(Z|Z128|Z256)mr(b?)",
+ "VPMOVUSQD(Z|Z128|Z256)mr(b?)",
+ "VPMOVUSQW(Z|Z128|Z256)mr(b?)",
+ "VPMOVUSWB(Z|Z128|Z256)mr(b?)",
+ "VPMOVWB(Z|Z128|Z256)mr(b?)")>;
def SKXWriteResGroup67 : SchedWriteRes<[SKXPort06,SKXPort0156]> {
let Latency = 5;
let NumMicroOps = 5;
let ResourceCycles = [1,4];
}
-def: InstRW<[SKXWriteResGroup67], (instregex "XSETBV")>;
+def: InstRW<[SKXWriteResGroup67], (instrs XSETBV)>;
def SKXWriteResGroup68 : SchedWriteRes<[SKXPort06,SKXPort0156]> {
let Latency = 5;
let NumMicroOps = 5;
let ResourceCycles = [2,3];
}
-def: InstRW<[SKXWriteResGroup68], (instregex "CMPXCHG(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup68], (instregex "CMPXCHG8rr")>;
+def: InstRW<[SKXWriteResGroup68], (instregex "CMPXCHG(8|16|32|64)rr")>;
def SKXWriteResGroup69 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort0156]> {
let Latency = 5;
let NumMicroOps = 6;
let ResourceCycles = [1,1,4];
}
-def: InstRW<[SKXWriteResGroup69], (instregex "PUSHF16")>;
-def: InstRW<[SKXWriteResGroup69], (instregex "PUSHF64")>;
-
-def SKXWriteResGroup70 : SchedWriteRes<[SKXPort5]> {
- let Latency = 6;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKXWriteResGroup70], (instregex "PCLMULQDQrr")>;
-def: InstRW<[SKXWriteResGroup70], (instregex "VPCLMULQDQrr")>;
+def: InstRW<[SKXWriteResGroup69], (instregex "PUSHF(16|64)")>;
def SKXWriteResGroup71 : SchedWriteRes<[SKXPort23]> {
let Latency = 6;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup71], (instregex "LDDQUrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "MOVAPDrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "MOVAPSrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "MOVDQArm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "MOVDQUrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "MOVNTDQArm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "MOVSHDUPrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "MOVSLDUPrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "MOVUPDrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "MOVUPSrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "VBROADCASTSSrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "VLDDQUrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "VMOVAPDrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "VMOVAPSrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "VMOVDQArm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "VMOVDQUrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "VMOVNTDQArm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "VMOVSHDUPrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "VMOVSLDUPrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "VMOVUPDrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "VMOVUPSrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "VPBROADCASTDrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "VPBROADCASTQrm")>;
-
-def SKXWriteResGroup72 : SchedWriteRes<[SKXPort0]> {
+def: InstRW<[SKXWriteResGroup71], (instregex "VBROADCASTSSrm",
+ "(V?)MOVSHDUPrm",
+ "(V?)MOVSLDUPrm",
+ "VPBROADCASTDrm",
+ "VPBROADCASTQrm")>;
+
+def SKXWriteResGroup72 : SchedWriteRes<[SKXPort5]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def: InstRW<[SKXWriteResGroup72], (instregex "MMX_CVTPI2PSirr")>;
-def: InstRW<[SKXWriteResGroup72], (instregex "VCOMPRESSPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup72], (instregex "VCOMPRESSPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup72], (instregex "VCOMPRESSPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup72], (instregex "VCOMPRESSPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup72], (instregex "VCOMPRESSPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup72], (instregex "VCOMPRESSPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup72], (instregex "VPCOMPRESSDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup72], (instregex "VPCOMPRESSDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup72], (instregex "VPCOMPRESSDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup72], (instregex "VPCOMPRESSQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup72], (instregex "VPCOMPRESSQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup72], (instregex "VPCOMPRESSQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup72], (instregex "VPERMWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup72], (instregex "VPERMWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup72], (instregex "VPERMWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup72], (instregex "MMX_CVTPI2PSirr",
+ "VCOMPRESSPD(Z|Z128|Z256)rr",
+ "VCOMPRESSPS(Z|Z128|Z256)rr",
+ "VPCOMPRESSD(Z|Z128|Z256)rr",
+ "VPCOMPRESSQ(Z|Z128|Z256)rr",
+ "VPERMW(Z|Z128|Z256)rr")>;
def SKXWriteResGroup73 : SchedWriteRes<[SKXPort0,SKXPort23]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PADDSBirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PADDSWirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PADDUSBirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PADDUSWirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PAVGBirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PAVGWirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PCMPEQBirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PCMPEQDirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PCMPEQWirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PCMPGTBirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PCMPGTDirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PCMPGTWirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PMAXSWirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PMAXUBirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PMINSWirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PMINUBirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSLLDrm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSLLQrm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSLLWrm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSRADrm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSRAWrm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSRLDrm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSRLQrm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSRLWrm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSUBSBirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSUBSWirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSUBUSBirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSUBUSWirm")>;
-
-def SKXWriteResGroup74 : SchedWriteRes<[SKXPort0,SKXPort015]> {
- let Latency = 6;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup74], (instregex "CVTSD2SI64rr")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "CVTSD2SIrr")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "CVTSS2SI64rr")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "CVTSS2SIrr")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "CVTTSD2SI64rr")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "CVTTSD2SIrr")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSD2SI64Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSD2SI64rr")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSD2SIZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSD2SIrr")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSD2USI64Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSD2USIZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSS2SI64Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSS2SI64rr")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSS2SIZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSS2SIrr")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSS2USIZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTTSD2SI64Zrr(b?)")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTTSD2SI64rr")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTTSD2SIZrr(b?)")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTTSD2SIrr")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTTSD2USI64Zrr(b?)")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTTSD2USIZrr(b?)")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTTSS2USIZrr(b?)")>;
-
-def SKXWriteResGroup75 : SchedWriteRes<[SKXPort5,SKXPort23]> {
- let Latency = 6;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PALIGNR64irm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PINSRWirmi")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PSHUFBrm64")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PSHUFWmi")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PUNPCKHBWirm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PUNPCKHDQirm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PUNPCKHWDirm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PUNPCKLBWirm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PUNPCKLDQirm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PUNPCKLWDirm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "MOVHPDrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "MOVHPSrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "MOVLPDrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "MOVLPSrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PINSRBrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PINSRDrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PINSRQrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PINSRWrmi")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PMOVSXBDrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PMOVSXBQrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PMOVSXBWrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PMOVSXDQrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PMOVSXWDrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PMOVSXWQrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PMOVZXBDrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PMOVZXBQrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PMOVZXBWrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PMOVZXDQrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PMOVZXWDrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PMOVZXWQrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VMOVHPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VMOVHPDrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VMOVHPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VMOVHPSrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VMOVLPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VMOVLPDrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VMOVLPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VMOVLPSrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPINSRBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPINSRBrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPINSRDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPINSRDrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPINSRQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPINSRQrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPINSRWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPINSRWrmi")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVSXBDrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVSXBQrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVSXBWrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVSXDQrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVSXWDrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVSXWQrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVZXBDrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVZXBQrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVZXBWrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVZXDQrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVZXWDrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVZXWQrm")>;
+def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PADDSBirm",
+ "MMX_PADDSWirm",
+ "MMX_PADDUSBirm",
+ "MMX_PADDUSWirm",
+ "MMX_PAVGBirm",
+ "MMX_PAVGWirm",
+ "MMX_PCMPEQBirm",
+ "MMX_PCMPEQDirm",
+ "MMX_PCMPEQWirm",
+ "MMX_PCMPGTBirm",
+ "MMX_PCMPGTDirm",
+ "MMX_PCMPGTWirm",
+ "MMX_PMAXSWirm",
+ "MMX_PMAXUBirm",
+ "MMX_PMINSWirm",
+ "MMX_PMINUBirm",
+ "MMX_PSUBSBirm",
+ "MMX_PSUBSWirm",
+ "MMX_PSUBUSBirm",
+ "MMX_PSUBUSWirm")>;

def SKXWriteResGroup76 : SchedWriteRes<[SKXPort6,SKXPort23]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup76], (instregex "FARJMP64")>;
-def: InstRW<[SKXWriteResGroup76], (instregex "JMP(16|32|64)m")>;
-
-def SKXWriteResGroup77 : SchedWriteRes<[SKXPort23,SKXPort05]> {
- let Latency = 6;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PABSBrm64")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PABSDrm64")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PABSWrm64")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PADDBirm")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PADDDirm")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PADDQirm")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PADDWirm")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PANDNirm")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PANDirm")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PORirm")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PSIGNBrm64")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PSIGNDrm64")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PSIGNWrm64")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PSUBBirm")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PSUBDirm")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PSUBQirm")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PSUBWirm")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PXORirm")>;
+def: InstRW<[SKXWriteResGroup76], (instregex "FARJMP64",
+ "JMP(16|32|64)m")>;

def SKXWriteResGroup78 : SchedWriteRes<[SKXPort23,SKXPort06]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup78], (instregex "ADC(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "ADC8rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "ADCX(32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "ADOX(32|64)rm")>;
def: InstRW<[SKXWriteResGroup78], (instregex "BT(16|32|64)mi8")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "CMOVAE(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "CMOVB(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "CMOVE(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "CMOVG(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "CMOVGE(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "CMOVL(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "CMOVLE(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "CMOVNE(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "CMOVNO(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "CMOVNP(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "CMOVNS(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "CMOVO(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "CMOVP(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "CMOVS(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "RORX(32|64)mi")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "SARX(32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "SBB(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "SBB8rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "SHLX(32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "SHRX(32|64)rm")>;

def SKXWriteResGroup79 : SchedWriteRes<[SKXPort23,SKXPort15]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup79], (instregex "ANDN(32|64)rm")>;
-def: InstRW<[SKXWriteResGroup79], (instregex "BLSI(32|64)rm")>;
-def: InstRW<[SKXWriteResGroup79], (instregex "BLSMSK(32|64)rm")>;
-def: InstRW<[SKXWriteResGroup79], (instregex "BLSR(32|64)rm")>;
-def: InstRW<[SKXWriteResGroup79], (instregex "BZHI(32|64)rm")>;
-def: InstRW<[SKXWriteResGroup79], (instregex "MOVBE(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup79], (instregex "ANDN(32|64)rm",
+ "BLSI(32|64)rm",
+ "BLSMSK(32|64)rm",
+ "BLSR(32|64)rm",
+ "MOVBE(16|32|64)rm")>;

def SKXWriteResGroup80 : SchedWriteRes<[SKXPort23,SKXPort015]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup80], (instregex "VMOV(64to|QI2)PQIZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup80], (instregex "VMOVDI2PDIZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup80], (instregex "VMOV(64to|QI2)PQIZrm(b?)",
+ "VMOVDI2PDIZrm(b?)")>;

def SKXWriteResGroup81 : SchedWriteRes<[SKXPort23,SKXPort0156]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup81], (instregex "ADD(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "ADD8rm")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "AND(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "AND8rm")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "CMP(16|32|64)mi")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "CMP(16|32|64)mr")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "CMP(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "CMP8mi")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "CMP8mr")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "CMP8rm")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "OR(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "OR8rm")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "POP(16|32|64)r(mr)?")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "SUB(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "SUB8rm")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "TEST(16|32|64)mr")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "TEST8mi")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "TEST8mr")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "XOR(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "XOR8rm")>;
+def: InstRW<[SKXWriteResGroup81], (instrs POP16r, POP32r, POP64r)>;
+def: InstRW<[SKXWriteResGroup81], (instregex "POP(16|32|64)rmr")>;

def SKXWriteResGroup82 : SchedWriteRes<[SKXPort5,SKXPort015]> {
let Latency = 6;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKXWriteResGroup82], (instregex "CVTSI642SSrr")>;
-def: InstRW<[SKXWriteResGroup82], (instregex "HADDPDrr")>;
-def: InstRW<[SKXWriteResGroup82], (instregex "HADDPSrr")>;
-def: InstRW<[SKXWriteResGroup82], (instregex "HSUBPDrr")>;
-def: InstRW<[SKXWriteResGroup82], (instregex "HSUBPSrr")>;
-def: InstRW<[SKXWriteResGroup82], (instregex "VCVTSI642SSrr")>;
-def: InstRW<[SKXWriteResGroup82], (instregex "VCVTSI642SSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup82], (instregex "VCVTUSI642SSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup82], (instregex "VHADDPDYrr")>;
-def: InstRW<[SKXWriteResGroup82], (instregex "VHADDPDrr")>;
-def: InstRW<[SKXWriteResGroup82], (instregex "VHADDPSYrr")>;
-def: InstRW<[SKXWriteResGroup82], (instregex "VHADDPSrr")>;
-def: InstRW<[SKXWriteResGroup82], (instregex "VHSUBPDYrr")>;
-def: InstRW<[SKXWriteResGroup82], (instregex "VHSUBPDrr")>;
-def: InstRW<[SKXWriteResGroup82], (instregex "VHSUBPSYrr")>;
-def: InstRW<[SKXWriteResGroup82], (instregex "VHSUBPSrr")>;
+def: InstRW<[SKXWriteResGroup82], (instregex "(V?)CVTSI642SSrr",
+ "VCVTSI642SSZrr",
+ "VCVTUSI642SSZrr")>;

def SKXWriteResGroup83 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> {
let Latency = 6;
let NumMicroOps = 4;
let ResourceCycles = [1,2,1];
}
-def: InstRW<[SKXWriteResGroup83], (instregex "SHLD(16|32|64)rrCL")>;
-def: InstRW<[SKXWriteResGroup83], (instregex "SHRD(16|32|64)rrCL")>;
+def: InstRW<[SKXWriteResGroup83], (instregex "SHLD(16|32|64)rrCL",
+ "SHRD(16|32|64)rrCL")>;

def SKXWriteResGroup84 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort06,SKXPort0156]> {
let Latency = 6;
@@ -3354,675 +1285,249 @@ def SKXWriteResGroup84 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort06,SKXPort0156]
}
def: InstRW<[SKXWriteResGroup84], (instregex "SLDT(16|32|64)r")>;

-def SKXWriteResGroup85 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237,SKXPort015]> {
- let Latency = 6;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,1,1];
-}
-def: InstRW<[SKXWriteResGroup85], (instregex "VCVTPS2PHmr")>;
-
def SKXWriteResGroup86 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06]> {
let Latency = 6;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[SKXWriteResGroup86], (instregex "BTC(16|32|64)mi8")>;
-def: InstRW<[SKXWriteResGroup86], (instregex "BTR(16|32|64)mi8")>;
-def: InstRW<[SKXWriteResGroup86], (instregex "BTS(16|32|64)mi8")>;
-def: InstRW<[SKXWriteResGroup86], (instregex "SAR(16|32|64)m1")>;
-def: InstRW<[SKXWriteResGroup86], (instregex "SAR(16|32|64)mi")>;
-def: InstRW<[SKXWriteResGroup86], (instregex "SAR8m1")>;
-def: InstRW<[SKXWriteResGroup86], (instregex "SAR8mi")>;
-def: InstRW<[SKXWriteResGroup86], (instregex "SHL(16|32|64)m1")>;
-def: InstRW<[SKXWriteResGroup86], (instregex "SHL(16|32|64)mi")>;
-def: InstRW<[SKXWriteResGroup86], (instregex "SHL8m1")>;
-def: InstRW<[SKXWriteResGroup86], (instregex "SHL8mi")>;
-def: InstRW<[SKXWriteResGroup86], (instregex "SHR(16|32|64)m1")>;
-def: InstRW<[SKXWriteResGroup86], (instregex "SHR(16|32|64)mi")>;
-def: InstRW<[SKXWriteResGroup86], (instregex "SHR8m1")>;
-def: InstRW<[SKXWriteResGroup86], (instregex "SHR8mi")>;
+def: InstRW<[SKXWriteResGroup86], (instregex "BTC(16|32|64)mi8",
+ "BTR(16|32|64)mi8",
+ "BTS(16|32|64)mi8",
+ "SAR(8|16|32|64)m1",
+ "SAR(8|16|32|64)mi",
+ "SHL(8|16|32|64)m1",
+ "SHL(8|16|32|64)mi",
+ "SHR(8|16|32|64)m1",
+ "SHR(8|16|32|64)mi")>;

def SKXWriteResGroup87 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort0156]> {
let Latency = 6;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[SKXWriteResGroup87], (instregex "ADD(16|32|64)mi")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "ADD(16|32|64)mr")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "ADD8mi")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "ADD8mr")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "AND(16|32|64)mi")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "AND(16|32|64)mr")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "AND8mi")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "AND8mr")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "DEC(16|32|64)m")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "DEC8m")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "INC(16|32|64)m")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "INC8m")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "NEG(16|32|64)m")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "NEG8m")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "NOT(16|32|64)m")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "NOT8m")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "OR(16|32|64)mi")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "OR(16|32|64)mr")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "OR8mi")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "OR8mr")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "POP(16|32|64)rmm")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "PUSH(16|32|64)rmm")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "SUB(16|32|64)mi")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "SUB(16|32|64)mr")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "SUB8mi")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "SUB8mr")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "XOR(16|32|64)mi")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "XOR(16|32|64)mr")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "XOR8mi")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "XOR8mr")>;
+def: InstRW<[SKXWriteResGroup87], (instregex "POP(16|32|64)rmm",
+ "PUSH(16|32|64)rmm")>;

def SKXWriteResGroup88 : SchedWriteRes<[SKXPort6,SKXPort0156]> {
let Latency = 6;
let NumMicroOps = 6;
let ResourceCycles = [1,5];
}
-def: InstRW<[SKXWriteResGroup88], (instregex "STD")>;
+def: InstRW<[SKXWriteResGroup88], (instrs STD)>;

def SKXWriteResGroup89 : SchedWriteRes<[SKXPort23]> {
let Latency = 7;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup89], (instregex "LD_F32m")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "LD_F64m")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "LD_F80m")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VBROADCASTF128")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VBROADCASTI128")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VBROADCASTSDYrm")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VBROADCASTSSYrm")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VLDDQUYrm")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VMOVAPDYrm")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VMOVAPSYrm")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VMOVDDUPYrm")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VMOVDQAYrm")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VMOVDQUYrm")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VMOVNTDQAYrm")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VMOVNTDQAZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VMOVSHDUPYrm")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VMOVSLDUPYrm")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VMOVUPDYrm")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VMOVUPSYrm")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VPBROADCASTDYrm")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VPBROADCASTQYrm")>;
-
-def SKXWriteResGroup90 : SchedWriteRes<[SKXPort0,SKXPort5]> {
+def: InstRW<[SKXWriteResGroup89], (instregex "LD_F(32|64|80)m",
+ "VBROADCASTF128",
+ "VBROADCASTI128",
+ "VBROADCASTSDYrm",
+ "VBROADCASTSSYrm",
+ "VMOVDDUPYrm",
+ "VMOVSHDUPYrm",
+ "VMOVSLDUPYrm",
+ "VPBROADCASTDYrm",
+ "VPBROADCASTQYrm")>;
+
+def SKXWriteResGroup90 : SchedWriteRes<[SKXPort01,SKXPort5]> {
let Latency = 7;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
def: InstRW<[SKXWriteResGroup90], (instregex "VCVTDQ2PDYrr")>;

-def SKXWriteResGroup91 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+def SKXWriteResGroup92 : SchedWriteRes<[SKXPort5,SKXPort23]> {
let Latency = 7;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup91], (instregex "COMISDrm")>;
-def: InstRW<[SKXWriteResGroup91], (instregex "COMISSrm")>;
-def: InstRW<[SKXWriteResGroup91], (instregex "UCOMISDrm")>;
-def: InstRW<[SKXWriteResGroup91], (instregex "UCOMISSrm")>;
-def: InstRW<[SKXWriteResGroup91], (instregex "VCOMISDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup91], (instregex "VCOMISDrm")>;
-def: InstRW<[SKXWriteResGroup91], (instregex "VCOMISSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup91], (instregex "VCOMISSrm")>;
-def: InstRW<[SKXWriteResGroup91], (instregex "VUCOMISDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup91], (instregex "VUCOMISDrm")>;
-def: InstRW<[SKXWriteResGroup91], (instregex "VUCOMISSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup91], (instregex "VUCOMISSrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VMOVSDZrm(b?)",
+ "VMOVSSZrm(b?)")>;

-def SKXWriteResGroup92 : SchedWriteRes<[SKXPort5,SKXPort23]> {
- let Latency = 7;
+def SKXWriteResGroup92a : SchedWriteRes<[SKXPort5,SKXPort23]> {
+ let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup92], (instregex "INSERTPSrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PACKSSDWrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PACKSSWBrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PACKUSDWrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PACKUSWBrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PALIGNRrmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PBLENDWrmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PSHUFBrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PSHUFDmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PSHUFHWmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PSHUFLWmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKHBWrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKHDQrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKHQDQrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKHWDrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKLBWrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKLDQrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKLQDQrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKLWDrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "SHUFPDrmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "SHUFPSrmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "UNPCKHPDrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "UNPCKHPSrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "UNPCKLPDrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "UNPCKLPSrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VINSERTPSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VINSERTPSrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VMOVSDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VMOVSSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPACKSSDWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPACKSSDWrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPACKSSWBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPACKSSWBrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPACKUSDWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPACKUSDWrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPACKUSWBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPACKUSWBrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPALIGNRZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPALIGNRrmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPBLENDWrmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPBROADCASTBZ128m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPBROADCASTBrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPBROADCASTWZ128m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPBROADCASTWrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPERMILPDZ128m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPERMILPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPERMILPDmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPERMILPDrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPERMILPSZ128m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPERMILPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPERMILPSmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPERMILPSrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFBrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFDZ128m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFDmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFHWZ128mi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFHWmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFLWZ128mi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFLWmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPSLLDQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPSRLDQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHBWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHBWrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHDQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHDQrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHQDQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHQDQrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHWDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHWDrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLBWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLBWrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLDQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLDQrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLQDQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLQDQrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLWDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLWDrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VSHUFPDZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VSHUFPDrmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VSHUFPSZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VSHUFPSrmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKHPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKHPDrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKHPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKHPSrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKLPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKLPDrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKLPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKLPSrm")>;
+def: InstRW<[SKXWriteResGroup92a], (instregex "(V?)PMOV(SX|ZX)BDrm",
+ "(V?)PMOV(SX|ZX)BQrm",
+ "(V?)PMOV(SX|ZX)BWrm",
+ "(V?)PMOV(SX|ZX)DQrm",
+ "(V?)PMOV(SX|ZX)WDrm",
+ "(V?)PMOV(SX|ZX)WQrm")>;

def SKXWriteResGroup93 : SchedWriteRes<[SKXPort5,SKXPort015]> {
let Latency = 7;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTDQ2PDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTDQ2PDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2DQYrr")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2DQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2DQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2PSYrr")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2PSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2PSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2UDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2UDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPH2PSYrr")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPH2PSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPH2PSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2PDYrr")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2PDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2PDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2PHYrr")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2PHZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2PHZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2QQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2QQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2UQQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2UQQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTQQ2PSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTQQ2PSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPD2DQYrr")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPD2DQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPD2DQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPD2UDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPD2UDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPS2QQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPS2QQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPS2UQQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPS2UQQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTUDQ2PDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTUDQ2PDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTUQQ2PSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTUQQ2PSZrr(b?)(k?)(z?)")>;
-
-def SKXWriteResGroup94 : SchedWriteRes<[SKXPort01,SKXPort23]> {
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTDQ2PDZ256rr",
+ "VCVTPD2DQ(Y|Z256)rr",
+ "VCVTPD2PS(Y|Z256)rr",
+ "VCVTPD2UDQZ256rr",
+ "VCVTPS2PD(Y|Z256)rr",
+ "VCVTPS2QQZ256rr",
+ "VCVTPS2UQQZ256rr",
+ "VCVTQQ2PSZ256rr",
+ "VCVTTPD2DQ(Y|Z256)rr",
+ "VCVTTPD2UDQZ256rr",
+ "VCVTTPS2QQZ256rr",
+ "VCVTTPS2UQQZ256rr",
+ "VCVTUDQ2PDZ256rr",
+ "VCVTUQQ2PSZ256rr")>;
+
+def SKXWriteResGroup93z : SchedWriteRes<[SKXPort5,SKXPort05]> {
let Latency = 7;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup94], (instregex "PABSBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PABSDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PABSWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PADDSBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PADDSWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PADDUSBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PADDUSWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PAVGBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PAVGWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PCMPEQBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PCMPEQDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PCMPEQQrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PCMPEQWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PCMPGTBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PCMPGTDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PCMPGTWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PMAXSBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PMAX(C?)SDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PMAXSWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PMAXUBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PMAXUDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PMAXUWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PMINSBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PMIN(C?)SDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PMINSWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PMINUBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PMINUDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PMINUWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PSIGNBrm128")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PSIGNDrm128")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PSIGNWrm128")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PSLLDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PSLLQrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PSLLWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PSRADrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PSRAWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PSRLDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PSRLQrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PSRLWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PSUBSBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PSUBSWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PSUBUSBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PSUBUSWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPABSBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPABSBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPABSDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPABSDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPABSQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPABSWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPABSWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPADDSBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPADDSBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPADDSWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPADDSWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPADDUSBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPADDUSBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPADDUSWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPADDUSWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPAVGBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPAVGBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPAVGWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPAVGWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPCMPEQBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPCMPEQDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPCMPEQQrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPCMPEQWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPCMPGTBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPCMPGTDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPCMPGTWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXSBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXSBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMAX(C?)SDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMAX(C?)SDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXSWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXSWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXUBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXUBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXUDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXUDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXUWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXUWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMINSBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMINSBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMIN(C?)SDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMIN(C?)SDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMINSWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMINSWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMINUBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMINUBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMINUDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMINUDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMINUWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMINUWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPROLDZ128m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPROLQZ128m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPROLVDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPROLVQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPRORDZ128m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPRORQZ128m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPRORVDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPRORVQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSIGNBrm128")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSIGNDrm128")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSIGNWrm128")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLDZ128m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLQZ128m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLQrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLVDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLVDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLVQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLVQrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLVWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLWZ128mi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRADZ128m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRADZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRADrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAQZ128m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAVDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAVDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAVQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAVWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAWZ128mi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLDZ128m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLQZ128m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLQrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLVDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLVDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLVQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLVQrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLVWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLWZ128mi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBSBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBSBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBSWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBSWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBUSBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBUSBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBUSWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBUSWrm")>;
+def: InstRW<[SKXWriteResGroup93z], (instrs VCVTDQ2PDZrr,
+ VCVTPD2DQZrr,
+ VCVTPD2PSZrr,
+ VCVTPD2UDQZrr,
+ VCVTPS2PDZrr,
+ VCVTPS2QQZrr,
+ VCVTPS2UQQZrr,
+ VCVTQQ2PSZrr,
+ VCVTTPD2DQZrr,
+ VCVTTPD2UDQZrr,
+ VCVTTPS2QQZrr,
+ VCVTTPS2UQQZrr,
+ VCVTUDQ2PDZrr,
+ VCVTUQQ2PSZrr)>;

def SKXWriteResGroup95 : SchedWriteRes<[SKXPort23,SKXPort015]> {
let Latency = 7;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup95], (instregex "ANDNPDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "ANDNPSrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "ANDPDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "ANDPSrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "BLENDPDrmi")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "BLENDPSrmi")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "ORPDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "ORPSrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "PADDBrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "PADDDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "PADDQrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "PADDWrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "PANDNrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "PANDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "PORrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "PSUBBrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "PSUBDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "PSUBQrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "PSUBWrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "PXORrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VANDNPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VANDNPDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VANDNPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VANDNPSrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VANDPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VANDPDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VANDPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VANDPSrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VBLENDMPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VBLENDMPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VBLENDPDrmi")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VBLENDPSrmi")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VBROADCASTI32X2Z128m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VBROADCASTSSZ128m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VINSERTF128rm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VINSERTI128rm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMASKMOVPDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMASKMOVPSrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMOVAPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMOVAPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMOVDDUPZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMOVDQA32Z128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMOVDQA64Z128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMOVDQU16Z128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMOVDQU32Z128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMOVDQU64Z128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMOVDQU8Z128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMOVNTDQAZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMOVSHDUPZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMOVSLDUPZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMOVUPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMOVUPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VORPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VORPDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VORPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VORPSrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPADDBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPADDBrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPADDDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPADDDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPADDQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPADDQrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPADDWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPADDWrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPANDDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPANDNDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPANDNQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPANDNrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPANDQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPANDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPBLENDDrmi")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPBLENDMBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPBLENDMDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPBLENDMQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPBLENDMWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPBROADCASTDZ128m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPBROADCASTQZ128m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPMASKMOVDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPMASKMOVQrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPORDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPORQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPORrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBBrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBQrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBWrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPTERNLOGDZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPTERNLOGQZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPXORDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPXORQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPXORrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VXORPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VXORPDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VXORPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VXORPSrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "XORPDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "XORPSrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VBLENDMPDZ128rm(b?)",
+ "VBLENDMPSZ128rm(b?)",
+ "VBROADCASTI32X2Z128m(b?)",
+ "VBROADCASTSSZ128m(b?)",
+ "VINSERTF128rm",
+ "VINSERTI128rm",
+ "VMOVAPDZ128rm(b?)",
+ "VMOVAPSZ128rm(b?)",
+ "VMOVDDUPZ128rm(b?)",
+ "VMOVDQA32Z128rm(b?)",
+ "VMOVDQA64Z128rm(b?)",
+ "VMOVDQU16Z128rm(b?)",
+ "VMOVDQU32Z128rm(b?)",
+ "VMOVDQU64Z128rm(b?)",
+ "VMOVDQU8Z128rm(b?)",
+ "VMOVNTDQAZ128rm(b?)",
+ "VMOVSHDUPZ128rm(b?)",
+ "VMOVSLDUPZ128rm(b?)",
+ "VMOVUPDZ128rm(b?)",
+ "VMOVUPSZ128rm(b?)",
+ "VPADD(B|D|Q|W)Z128rm(b?)",
+ "(V?)PADD(B|D|Q|W)rm",
+ "VPBLENDDrmi",
+ "VPBLENDM(B|D|Q|W)Z128rm(b?)",
+ "VPBROADCASTDZ128m(b?)",
+ "VPBROADCASTQZ128m(b?)",
+ "VPSUB(B|D|Q|W)Z128rm(b?)",
+ "(V?)PSUB(B|D|Q|W)rm",
+ "VPTERNLOGDZ128rm(b?)i",
+ "VPTERNLOGQZ128rm(b?)i")>;

def SKXWriteResGroup96 : SchedWriteRes<[SKXPort5,SKXPort23]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKXWriteResGroup96], (instregex "MMX_PACKSSDWirm")>;
-def: InstRW<[SKXWriteResGroup96], (instregex "MMX_PACKSSWBirm")>;
-def: InstRW<[SKXWriteResGroup96], (instregex "MMX_PACKUSWBirm")>;
+def: InstRW<[SKXWriteResGroup96], (instregex "MMX_PACKSSDWirm",
+ "MMX_PACKSSWBirm",
+ "MMX_PACKUSWBirm")>;

def SKXWriteResGroup97 : SchedWriteRes<[SKXPort5,SKXPort015]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKXWriteResGroup97], (instregex "VPERMI2W128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup97], (instregex "VPERMI2W256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup97], (instregex "VPERMI2Wrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup97], (instregex "VPERMT2W128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup97], (instregex "VPERMT2W256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup97], (instregex "VPERMT2Wrr(b?)(k?)(z?)")>;
-
-def SKXWriteResGroup98 : SchedWriteRes<[SKXPort23,SKXPort06]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[SKXWriteResGroup98], (instregex "CMOVA(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup98], (instregex "CMOVBE(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup97], (instregex "VPERMI2W128rr",
+ "VPERMI2W256rr",
+ "VPERMI2Wrr",
+ "VPERMT2W128rr",
+ "VPERMT2W256rr",
+ "VPERMT2Wrr")>;

def SKXWriteResGroup99 : SchedWriteRes<[SKXPort23,SKXPort0156]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[SKXWriteResGroup99], (instregex "LEAVE64")>;
-def: InstRW<[SKXWriteResGroup99], (instregex "SCASB")>;
-def: InstRW<[SKXWriteResGroup99], (instregex "SCASL")>;
-def: InstRW<[SKXWriteResGroup99], (instregex "SCASQ")>;
-def: InstRW<[SKXWriteResGroup99], (instregex "SCASW")>;
+def: InstRW<[SKXWriteResGroup99], (instrs LEAVE, LEAVE64,
+ SCASB, SCASL, SCASQ, SCASW)>;

def SKXWriteResGroup100 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort015]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup100], (instregex "CVTTSS2SI64rr")>;
-def: InstRW<[SKXWriteResGroup100], (instregex "CVTTSS2SIrr")>;
-def: InstRW<[SKXWriteResGroup100], (instregex "VCVTSS2USI64Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup100], (instregex "VCVTTSS2SI64Zrr(b?)")>;
-def: InstRW<[SKXWriteResGroup100], (instregex "VCVTTSS2SI64rr")>;
-def: InstRW<[SKXWriteResGroup100], (instregex "VCVTTSS2SIZrr(b?)")>;
-def: InstRW<[SKXWriteResGroup100], (instregex "VCVTTSS2SIrr")>;
-def: InstRW<[SKXWriteResGroup100], (instregex "VCVTTSS2USI64Zrr(b?)")>;
+def: InstRW<[SKXWriteResGroup100], (instregex "VCVTSS2USI64Zrr",
+ "(V?)CVTSS2SI64(Z?)rr",
+ "(V?)CVTTSS2SI64(Z?)rr",
+ "VCVTTSS2USI64Zrr")>;

def SKXWriteResGroup101 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort05]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup101], (instregex "FLDCW16m")>;
-
-def SKXWriteResGroup102 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort0156]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SKXWriteResGroup102], (instregex "LDMXCSR")>;
-def: InstRW<[SKXWriteResGroup102], (instregex "VLDMXCSR")>;
+def: InstRW<[SKXWriteResGroup101], (instrs FLDCW16m)>;

def SKXWriteResGroup103 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort0156]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup103], (instregex "KMOVBkm")>;
-def: InstRW<[SKXWriteResGroup103], (instregex "KMOVDkm")>;
-def: InstRW<[SKXWriteResGroup103], (instregex "KMOVQkm")>;
-def: InstRW<[SKXWriteResGroup103], (instregex "KMOVWkm")>;
+def: InstRW<[SKXWriteResGroup103], (instregex "KMOV(B|D|Q|W)km")>;

def SKXWriteResGroup104 : SchedWriteRes<[SKXPort6,SKXPort23,SKXPort0156]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup104], (instregex "LRETQ")>;
-def: InstRW<[SKXWriteResGroup104], (instregex "RETQ")>;
-
-def SKXWriteResGroup105 : SchedWriteRes<[SKXPort23,SKXPort06,SKXPort15]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SKXWriteResGroup105], (instregex "BEXTR(32|64)rm")>;
+def: InstRW<[SKXWriteResGroup104], (instrs LRETQ, RETQ)>;

def SKXWriteResGroup106 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237]> {
let Latency = 7;
let NumMicroOps = 4;
let ResourceCycles = [1,2,1];
}
-def: InstRW<[SKXWriteResGroup106], (instregex "VCOMPRESSPDZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup106], (instregex "VCOMPRESSPDZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup106], (instregex "VCOMPRESSPDZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup106], (instregex "VCOMPRESSPSZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup106], (instregex "VCOMPRESSPSZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup106], (instregex "VCOMPRESSPSZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup106], (instregex "VPCOMPRESSDZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup106], (instregex "VPCOMPRESSDZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup106], (instregex "VPCOMPRESSDZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup106], (instregex "VPCOMPRESSQZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup106], (instregex "VPCOMPRESSQZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup106], (instregex "VPCOMPRESSQZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup106], (instregex "VCOMPRESSPD(Z|Z128|Z256)mr(b?)",
+ "VCOMPRESSPS(Z|Z128|Z256)mr(b?)",
+ "VPCOMPRESSD(Z|Z128|Z256)mr(b?)",
+ "VPCOMPRESSQ(Z|Z128|Z256)mr(b?)")>;

def SKXWriteResGroup107 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06]> {
let Latency = 7;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,2];
}
-def: InstRW<[SKXWriteResGroup107], (instregex "ROL(16|32|64)m1")>;
-def: InstRW<[SKXWriteResGroup107], (instregex "ROL(16|32|64)mi")>;
-def: InstRW<[SKXWriteResGroup107], (instregex "ROL8m1")>;
-def: InstRW<[SKXWriteResGroup107], (instregex "ROL8mi")>;
-def: InstRW<[SKXWriteResGroup107], (instregex "ROR(16|32|64)m1")>;
-def: InstRW<[SKXWriteResGroup107], (instregex "ROR(16|32|64)mi")>;
-def: InstRW<[SKXWriteResGroup107], (instregex "ROR8m1")>;
-def: InstRW<[SKXWriteResGroup107], (instregex "ROR8mi")>;
+def: InstRW<[SKXWriteResGroup107], (instregex "ROL(8|16|32|64)m1",
+ "ROL(8|16|32|64)mi",
+ "ROR(8|16|32|64)m1",
+ "ROR(8|16|32|64)mi")>;

def SKXWriteResGroup108 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort0156]> {
let Latency = 7;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,2];
}
-def: InstRW<[SKXWriteResGroup108], (instregex "XADD(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup108], (instregex "XADD8rm")>;
+def: InstRW<[SKXWriteResGroup108], (instregex "XADD(8|16|32|64)rm")>;

def SKXWriteResGroup109 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort23,SKXPort237,SKXPort0156]> {
let Latency = 7;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,1,1];
}
-def: InstRW<[SKXWriteResGroup109], (instregex "CALL(16|32|64)m")>;
-def: InstRW<[SKXWriteResGroup109], (instregex "FARCALL64")>;
+def: InstRW<[SKXWriteResGroup109], (instregex "CALL(16|32|64)m",
+ "FARCALL64")>;

def SKXWriteResGroup110 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237,SKXPort0156]> {
let Latency = 7;
@@ -4039,7 +1544,7 @@ def SKXWriteResGroup111 : SchedWriteRes<[SKXPort6,SKXPort06,SKXPort15,SKXPort015
let NumMicroOps = 7;
let ResourceCycles = [1,3,1,2];
}
-def: InstRW<[SKXWriteResGroup111], (instregex "LOOP")>;
+def: InstRW<[SKXWriteResGroup111], (instrs LOOP)>;

def SKXWriteResGroup112 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237,SKXPort0156]> {
let Latency = 7;
@@ -4068,629 +1573,142 @@ def SKXWriteResGroup114 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort237,S
}
def: InstRW<[SKXWriteResGroup114], (instrs VSCATTERDPSZmr)>;
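As the mappings above illustrate, InstRW accepts either an explicit list of instruction records via instrs or one or more regular expressions over opcode names via instregex, and a single instregex call may carry several patterns. A small sketch reusing the hypothetical SKXWriteExample class from the earlier example; LEAVE64 and RETQ are real opcodes borrowed from the mappings above, while the regex patterns are placeholders:

// Illustrative sketch only; SKXWriteExample, EXAMPLEOP and OTHEROP are hypothetical.
def: InstRW<[SKXWriteExample], (instrs LEAVE64, RETQ)>;
def: InstRW<[SKXWriteExample], (instregex "EXAMPLEOP(8|16|32|64)rm",
                                          "OTHEROP(32|64)rr")>;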

-def SKXWriteResGroup115 : SchedWriteRes<[SKXPort0]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[SKXWriteResGroup115], (instregex "AESIMCrr")>;
-def: InstRW<[SKXWriteResGroup115], (instregex "VAESIMCrr")>;
-
-def SKXWriteResGroup116 : SchedWriteRes<[SKXPort015]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[SKXWriteResGroup116], (instregex "PMULLDrr")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "ROUNDPDr")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "ROUNDPSr")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "ROUNDSDr")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "ROUNDSSr")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VPMULLDYrr")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VPMULLDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VPMULLDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VPMULLDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VPMULLDrr")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALEPDZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALEPDZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALEPDZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALEPSZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALEPSZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALEPSZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALESDr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALESSr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VROUNDPDr")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VROUNDPSr")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VROUNDSDr")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VROUNDSSr")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VROUNDYPDr")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VROUNDYPSr")>;
-
-def SKXWriteResGroup117 : SchedWriteRes<[SKXPort0,SKXPort23]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup117], (instregex "VTESTPDrm")>;
-def: InstRW<[SKXWriteResGroup117], (instregex "VTESTPSrm")>;
-
def SKXWriteResGroup118 : SchedWriteRes<[SKXPort1,SKXPort23]> {
let Latency = 8;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup118], (instregex "BSF(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup118], (instregex "BSR(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup118], (instregex "IMUL64m")>;
-def: InstRW<[SKXWriteResGroup118], (instregex "IMUL(32|64)rm(i8)?")>;
-def: InstRW<[SKXWriteResGroup118], (instregex "IMUL8m")>;
-def: InstRW<[SKXWriteResGroup118], (instregex "LZCNT(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup118], (instregex "MUL(16|32|64)m")>;
-def: InstRW<[SKXWriteResGroup118], (instregex "MUL8m")>;
-def: InstRW<[SKXWriteResGroup118], (instregex "PDEP(32|64)rm")>;
-def: InstRW<[SKXWriteResGroup118], (instregex "PEXT(32|64)rm")>;
-def: InstRW<[SKXWriteResGroup118], (instregex "POPCNT(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup118], (instregex "TZCNT(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup118], (instregex "PDEP(32|64)rm",
+ "PEXT(32|64)rm")>;

def SKXWriteResGroup118_16_1 : SchedWriteRes<[SKXPort1, SKXPort0156, SKXPort23]> {
let Latency = 8;
let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
+ let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup118_16_1], (instregex "IMUL16rm(i8)?")>;
+def: InstRW<[SKXWriteResGroup118_16_1], (instrs IMUL16rm, IMUL16rmi, IMUL16rmi8)>;

-def SKXWriteResGroup118_16_2 : SchedWriteRes<[SKXPort1, SKXPort0156, SKXPort23]> {
- let Latency = 8;
+def SKXWriteResGroup118_16_2 : SchedWriteRes<[SKXPort1, SKXPort06, SKXPort0156, SKXPort23]> {
+ let Latency = 9;
let NumMicroOps = 5;
+ let ResourceCycles = [1,1,2,1];
}
-def: InstRW<[SKXWriteResGroup118_16_2], (instregex "IMUL16m")>;
-def: InstRW<[SKXWriteResGroup118_16_2], (instregex "MUL16m")>;
-
-def SKXWriteResGroup118_32 : SchedWriteRes<[SKXPort1, SKXPort0156, SKXPort23]> {
- let Latency = 8;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SKXWriteResGroup118_32], (instregex "IMUL32m")>;
-def: InstRW<[SKXWriteResGroup118_32], (instregex "MUL32m")>;
+def: InstRW<[SKXWriteResGroup118_16_2], (instrs IMUL16m, MUL16m)>;

def SKXWriteResGroup119 : SchedWriteRes<[SKXPort5,SKXPort23]> {
let Latency = 8;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup119], (instregex "FCOM32m")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "FCOM64m")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "FCOMP32m")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "FCOMP64m")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "MMX_PSADBWirm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VFPCLASSSDrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPACKSSDWYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPACKSSDWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPACKSSDWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPACKSSWBYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPACKSSWBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPACKSSWBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPACKUSDWYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPACKUSDWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPACKUSDWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPACKUSWBYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPACKUSWBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPACKUSWBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPALIGNRYrmi")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPALIGNRZ256rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPALIGNRZrmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPBLENDWYrmi")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPBROADCASTBYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPBROADCASTBZ256m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPBROADCASTBZm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPBROADCASTWYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPBROADCASTWZ256m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPBROADCASTWZm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPDYmi")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPDYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPDZ256m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPDZm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPSYmi")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPSYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPSZ256m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPSZm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPMOVSXBDYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPMOVSXBQYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPMOVSXWQYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFBYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFDYmi")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFDZ256m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFDZm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFHWYmi")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFHWZ256mi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFHWZmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFLWYmi")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFLWZ256mi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFLWZmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSLLDQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSLLDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSRLDQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSRLDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHBWYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHBWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHBWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHDQYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHDQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHQDQYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHQDQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHQDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHWDYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHWDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHWDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLBWYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLBWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLBWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLDQYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLDQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLQDQYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLQDQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLQDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLWDYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLWDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLWDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VSHUFPDYrmi")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VSHUFPDZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VSHUFPDZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VSHUFPSYrmi")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VSHUFPSZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VSHUFPSZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKHPDYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKHPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKHPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKHPSYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKHPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKHPSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKLPDYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKLPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKLPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKLPSYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKLPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKLPSZrm(b?)(k?)(z?)")>;
-
-def SKXWriteResGroup120 : SchedWriteRes<[SKXPort01,SKXPort23]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup120], (instregex "VPABSBYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPABSBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPABSBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPABSDYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPABSDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPABSDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPABSQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPABSQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPABSWYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPABSWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPABSWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPADDSBYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPADDSBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPADDSBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPADDSWYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPADDSWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPADDSWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPADDUSBYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPADDUSBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPADDUSBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPADDUSWYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPADDUSWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPADDUSWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPAVGBYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPAVGBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPAVGBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPAVGWYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPAVGWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPAVGWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPCMPEQBYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPCMPEQDYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPCMPEQQYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPCMPEQWYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPCMPGTBYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPCMPGTDYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPCMPGTWYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXSBYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXSBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXSBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAX(C?)SDYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAX(C?)SDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAX(C?)SDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXSWYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXSWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXSWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUBYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUDYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUWYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMINSBYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMINSBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMINSBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMIN(C?)SDYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMIN(C?)SDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMIN(C?)SDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMINSWYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMINSWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMINSWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUBYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUDYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUWYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPROLDZ256m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPROLDZm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPROLQZ256m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPROLQZm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPROLVDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPROLVDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPROLVQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPROLVQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPRORDZ256m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPRORDZm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPRORQZ256m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPRORQZm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPRORVDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPRORVDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPRORVQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPRORVQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSIGNBYrm256")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSIGNDYrm256")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSIGNWYrm256")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLDYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLDZ256m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLDZm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLQYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLQZ256m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLQZm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVDYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVQYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLWYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLWZ256mi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLWZmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRADYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRADZ256m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRADZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRADZm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRADZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAQZ256m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAQZm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAVDYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAVDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAVDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAVQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAVQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAVWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAVWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAWYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAWZ256mi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAWZmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLDYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLDZ256m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLDZm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLQYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLQZ256m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLQZm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVDYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVQYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLWYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLWZ256mi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLWZmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBSBYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBSBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBSBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBSWYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBSWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBSWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBUSBYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBUSBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBUSBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBUSWYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBUSWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBUSWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "FCOM(P?)(32|64)m",
+ "VFPCLASSSDZrm(b?)",
+ "VPBROADCASTBYrm",
+ "VPBROADCASTB(Z|Z256)m(b?)",
+ "VPBROADCASTWYrm",
+ "VPBROADCASTW(Z|Z256)m(b?)",
+ "VPMOVSXBDYrm",
+ "VPMOVSXBQYrm",
+ "VPMOVSXWQYrm")>;
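
The consolidated patterns above use regex alternation to fold several of the removed one-pattern lines into single entries: for example "FCOM(P?)(32|64)m" covers the four former entries FCOM32m, FCOM64m, FCOMP32m and FCOMP64m, and "VPBROADCASTB(Z|Z256)m(b?)" merges the separate Z and Z256 patterns. The trailing "(k?)(z?)" seen in the removed lines is dropped throughout, consistent with the patterns only having to match a leading portion of the opcode name, so the masked variants still land in the same group.
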
def SKXWriteResGroup121 : SchedWriteRes<[SKXPort23,SKXPort015]> {
let Latency = 8;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup121], (instregex "VANDNPDYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VANDNPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VANDNPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VANDNPSYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VANDNPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VANDNPSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VANDPDYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VANDPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VANDPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VANDPSYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VANDPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VANDPSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBLENDMPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBLENDMPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBLENDMPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBLENDMPSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBLENDPDYrmi")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBLENDPSYrmi")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF32X2Z256m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF32X2Zm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF32X4Z256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF32X4rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF32X8rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF64X2Z128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF64X2rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF64X4rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI32X2Z256m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI32X2Zm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI32X4Z256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI32X4rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI32X8rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI64X2Z128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI64X2rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI64X4rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTSDZ256m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTSDZm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTSSZ256m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTSSZm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTF32x4Z256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTF32x4Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTF32x8Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTF64x2Z256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTF64x2Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTF64x4Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTI32x4Z256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTI32x4Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTI32x8Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTI64x2Z256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTI64x2Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTI64x4Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMASKMOVPDYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMASKMOVPSYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVAPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVAPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVAPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVAPSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDDUPZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDDUPZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQA32Z256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQA32Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQA64Z256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQA64Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQU16Z256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQU16Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQU32Z256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQU32Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQU64Z256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQU64Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQU8Z256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQU8Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVNTDQAZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVSHDUPZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVSHDUPZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVSLDUPZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVSLDUPZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVUPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVUPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVUPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVUPSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VORPDYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VORPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VORPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VORPSYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VORPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VORPSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPADDBYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPADDBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPADDBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPADDDYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPADDDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPADDDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPADDQYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPADDQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPADDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPADDWYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPADDWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPADDWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPANDDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPANDDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPANDNDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPANDNDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPANDNQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPANDNQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPANDNYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPANDQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPANDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPANDYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDDYrmi")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDMBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDMBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDMDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDMDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDMQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDMQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDMWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDMWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPBROADCASTDZ256m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPBROADCASTDZm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPBROADCASTQZ256m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPBROADCASTQZm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPMASKMOVDYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPMASKMOVQYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPORDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPORDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPORQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPORQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPORYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBBYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBDYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBQYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBWYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPTERNLOGDZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPTERNLOGDZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPTERNLOGQZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPTERNLOGQZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPXORDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPXORDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPXORQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPXORQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPXORYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VXORPDYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VXORPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VXORPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VXORPSYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VXORPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VXORPSZrm(b?)(k?)(z?)")>;
-
-def SKXWriteResGroup122 : SchedWriteRes<[SKXPort23,SKXPort015]> {
- let Latency = 8;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[SKXWriteResGroup122], (instregex "BLENDVPDrm0")>;
-def: InstRW<[SKXWriteResGroup122], (instregex "BLENDVPSrm0")>;
-def: InstRW<[SKXWriteResGroup122], (instregex "PBLENDVBrm0")>;
-def: InstRW<[SKXWriteResGroup122], (instregex "VBLENDVPDrm")>;
-def: InstRW<[SKXWriteResGroup122], (instregex "VBLENDVPSrm")>;
-def: InstRW<[SKXWriteResGroup122], (instregex "VPBLENDVBYrm")>;
-def: InstRW<[SKXWriteResGroup122], (instregex "VPBLENDVBrm")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VBLENDMPD(Z|Z256)rm(b?)",
+ "VBLENDMPS(Z|Z256)rm(b?)",
+ "VBROADCASTF32X2Z256m(b?)",
+ "VBROADCASTF32X2Zm(b?)",
+ "VBROADCASTF32X4Z256rm(b?)",
+ "VBROADCASTF32X4rm(b?)",
+ "VBROADCASTF32X8rm(b?)",
+ "VBROADCASTF64X2Z128rm(b?)",
+ "VBROADCASTF64X2rm(b?)",
+ "VBROADCASTF64X4rm(b?)",
+ "VBROADCASTI32X2Z256m(b?)",
+ "VBROADCASTI32X2Zm(b?)",
+ "VBROADCASTI32X4Z256rm(b?)",
+ "VBROADCASTI32X4rm(b?)",
+ "VBROADCASTI32X8rm(b?)",
+ "VBROADCASTI64X2Z128rm(b?)",
+ "VBROADCASTI64X2rm(b?)",
+ "VBROADCASTI64X4rm(b?)",
+ "VBROADCASTSD(Z|Z256)m(b?)",
+ "VBROADCASTSS(Z|Z256)m(b?)",
+ "VINSERTF32x4(Z|Z256)rm(b?)",
+ "VINSERTF32x8Zrm(b?)",
+ "VINSERTF64x2(Z|Z256)rm(b?)",
+ "VINSERTF64x4Zrm(b?)",
+ "VINSERTI32x4(Z|Z256)rm(b?)",
+ "VINSERTI32x8Zrm(b?)",
+ "VINSERTI64x2(Z|Z256)rm(b?)",
+ "VINSERTI64x4Zrm(b?)",
+ "VMOVAPD(Z|Z256)rm(b?)",
+ "VMOVAPS(Z|Z256)rm(b?)",
+ "VMOVDDUP(Z|Z256)rm(b?)",
+ "VMOVDQA32(Z|Z256)rm(b?)",
+ "VMOVDQA64(Z|Z256)rm(b?)",
+ "VMOVDQU16(Z|Z256)rm(b?)",
+ "VMOVDQU32(Z|Z256)rm(b?)",
+ "VMOVDQU64(Z|Z256)rm(b?)",
+ "VMOVDQU8(Z|Z256)rm(b?)",
+ "VMOVNTDQAZ256rm(b?)",
+ "VMOVSHDUP(Z|Z256)rm(b?)",
+ "VMOVSLDUP(Z|Z256)rm(b?)",
+ "VMOVUPD(Z|Z256)rm(b?)",
+ "VMOVUPS(Z|Z256)rm(b?)",
+ "VPADD(B|D|Q|W)Yrm",
+ "VPADD(B|D|Q|W)(Z|Z256)rm(b?)",
+ "VPBLENDDYrmi",
+ "VPBLENDM(B|D|Q|W)(Z|Z256)rm(b?)",
+ "VPBROADCASTD(Z|Z256)m(b?)",
+ "VPBROADCASTQ(Z|Z256)m(b?)",
+ "VPSUB(B|D|Q|W)Yrm",
+ "VPSUB(B|D|Q|W)(Z|Z256)rm(b?)",
+ "VPTERNLOGD(Z|Z256)rm(b?)i",
+ "VPTERNLOGQ(Z|Z256)rm(b?)i")>;
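
Each quoted string inside a single (instregex ...) operand is an independent pattern, so a long block like the one above is just a compact spelling of many one-pattern defs. A minimal sketch of that equivalence, reusing two patterns from the list purely for illustration (not an addition to the model):

// Compact form: one InstRW carrying several patterns.
def: InstRW<[SKXWriteResGroup121], (instregex "VMOVAPD(Z|Z256)rm(b?)",
                                              "VMOVAPS(Z|Z256)rm(b?)")>;
// Equivalent long form, one pattern per def:
// def: InstRW<[SKXWriteResGroup121], (instregex "VMOVAPD(Z|Z256)rm(b?)")>;
// def: InstRW<[SKXWriteResGroup121], (instregex "VMOVAPS(Z|Z256)rm(b?)")>;
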
def SKXWriteResGroup123 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
let Latency = 8;
let NumMicroOps = 4;
let ResourceCycles = [1,2,1];
}
-def: InstRW<[SKXWriteResGroup123], (instregex "MMX_PHADDSWrm64")>;
-def: InstRW<[SKXWriteResGroup123], (instregex "MMX_PHSUBSWrm64")>;
-
-def SKXWriteResGroup124 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort05]> {
- let Latency = 8;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SKXWriteResGroup124], (instregex "MMX_PHADDWrm64")>;
-def: InstRW<[SKXWriteResGroup124], (instregex "MMX_PHADDrm64")>;
-def: InstRW<[SKXWriteResGroup124], (instregex "MMX_PHSUBDrm64")>;
-def: InstRW<[SKXWriteResGroup124], (instregex "MMX_PHSUBWrm64")>;
-
-def SKXWriteResGroup125 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237,SKXPort015]> {
- let Latency = 8;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,1,1];
-}
-def: InstRW<[SKXWriteResGroup125], (instregex "VCVTPS2PHYmr")>;
+def: InstRW<[SKXWriteResGroup123], (instregex "MMX_PH(ADD|SUB)SWrm")>;

def SKXWriteResGroup126 : SchedWriteRes<[SKXPort23,SKXPort237,SKXPort06]> {
let Latency = 8;
let NumMicroOps = 5;
let ResourceCycles = [1,1,3];
}
-def: InstRW<[SKXWriteResGroup126], (instregex "ROR(16|32|64)mCL")>;
-def: InstRW<[SKXWriteResGroup126], (instregex "ROR8mCL")>;
+def: InstRW<[SKXWriteResGroup126], (instregex "ROR(8|16|32|64)mCL")>;

def SKXWriteResGroup127 : SchedWriteRes<[SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> {
let Latency = 8;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,2];
}
-def: InstRW<[SKXWriteResGroup127], (instregex "RCL(16|32|64)m1")>;
-def: InstRW<[SKXWriteResGroup127], (instregex "RCL(16|32|64)mi")>;
-def: InstRW<[SKXWriteResGroup127], (instregex "RCL8m1")>;
-def: InstRW<[SKXWriteResGroup127], (instregex "RCL8mi")>;
-def: InstRW<[SKXWriteResGroup127], (instregex "RCR(16|32|64)m1")>;
-def: InstRW<[SKXWriteResGroup127], (instregex "RCR(16|32|64)mi")>;
-def: InstRW<[SKXWriteResGroup127], (instregex "RCR8m1")>;
-def: InstRW<[SKXWriteResGroup127], (instregex "RCR8mi")>;
+def: InstRW<[SKXWriteResGroup127], (instregex "RCL(8|16|32|64)m1",
+ "RCL(8|16|32|64)mi",
+ "RCR(8|16|32|64)m1",
+ "RCR(8|16|32|64)mi")>;

def SKXWriteResGroup128 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06]> {
let Latency = 8;
let NumMicroOps = 6;
let ResourceCycles = [1,1,1,3];
}
-def: InstRW<[SKXWriteResGroup128], (instregex "ROL(16|32|64)mCL")>;
-def: InstRW<[SKXWriteResGroup128], (instregex "ROL8mCL")>;
-def: InstRW<[SKXWriteResGroup128], (instregex "SAR(16|32|64)mCL")>;
-def: InstRW<[SKXWriteResGroup128], (instregex "SAR8mCL")>;
-def: InstRW<[SKXWriteResGroup128], (instregex "SHL(16|32|64)mCL")>;
-def: InstRW<[SKXWriteResGroup128], (instregex "SHL8mCL")>;
-def: InstRW<[SKXWriteResGroup128], (instregex "SHR(16|32|64)mCL")>;
-def: InstRW<[SKXWriteResGroup128], (instregex "SHR8mCL")>;
-
-def SKXWriteResGroup129 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort0156]> {
- let Latency = 8;
- let NumMicroOps = 6;
- let ResourceCycles = [1,1,1,3];
-}
-def: InstRW<[SKXWriteResGroup129], (instregex "ADC(16|32|64)mi")>;
-def: InstRW<[SKXWriteResGroup129], (instregex "ADC8mi")>;
+def: InstRW<[SKXWriteResGroup128], (instregex "ROL(8|16|32|64)mCL",
+ "SAR(8|16|32|64)mCL",
+ "SHL(8|16|32|64)mCL",
+ "SHR(8|16|32|64)mCL")>;

def SKXWriteResGroup130 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> {
let Latency = 8;
let NumMicroOps = 6;
let ResourceCycles = [1,1,1,2,1];
}
-def: InstRW<[SKXWriteResGroup130], (instregex "ADC(16|32|64)mr")>;
-def: InstRW<[SKXWriteResGroup130], (instregex "ADC8mr")>;
-def: InstRW<[SKXWriteResGroup130], (instregex "CMPXCHG(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup130], (instregex "CMPXCHG8rm")>;
-def: InstRW<[SKXWriteResGroup130], (instregex "SBB(16|32|64)mi")>;
-def: InstRW<[SKXWriteResGroup130], (instregex "SBB(16|32|64)mr")>;
-def: InstRW<[SKXWriteResGroup130], (instregex "SBB8mi")>;
-def: InstRW<[SKXWriteResGroup130], (instregex "SBB8mr")>;
+def: SchedAlias<WriteADCRMW, SKXWriteResGroup130>;
+def: InstRW<[SKXWriteResGroup130], (instregex "CMPXCHG(8|16|32|64)rm")>;
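
Two mechanisms meet in the pair of lines above: SchedAlias reroutes an existing scheduling class to this group (taking over from the per-instruction ADC/SBB memory patterns that were listed here before), while InstRW overrides the mapping for specific opcodes by name. An annotated restatement, for illustration only:

// Every instruction whose default class is WriteADCRMW now resolves to
// SKXWriteResGroup130 (latency 8, six micro-ops across the listed ports):
def: SchedAlias<WriteADCRMW, SKXWriteResGroup130>;
// The CMPXCHG memory forms keep an explicit per-opcode override:
def: InstRW<[SKXWriteResGroup130], (instregex "CMPXCHG(8|16|32|64)rm")>;
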
def SKXWriteResGroup131 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort237,SKXPort0156]> {
let Latency = 8;
@@ -4731,886 +1749,305 @@ def SKXWriteResGroup135 : SchedWriteRes<[SKXPort0,SKXPort23]> {
let ResourceCycles = [1,1];
}
def: InstRW<[SKXWriteResGroup135], (instregex "MMX_CVTPI2PSirm")>;
-def: InstRW<[SKXWriteResGroup135], (instregex "MMX_PMADDUBSWrm64")>;
-def: InstRW<[SKXWriteResGroup135], (instregex "MMX_PMADDWDirm")>;
-def: InstRW<[SKXWriteResGroup135], (instregex "MMX_PMULHRSWrm64")>;
-def: InstRW<[SKXWriteResGroup135], (instregex "MMX_PMULHUWirm")>;
-def: InstRW<[SKXWriteResGroup135], (instregex "MMX_PMULHWirm")>;
-def: InstRW<[SKXWriteResGroup135], (instregex "MMX_PMULLWirm")>;
-def: InstRW<[SKXWriteResGroup135], (instregex "MMX_PMULUDQirm")>;
-def: InstRW<[SKXWriteResGroup135], (instregex "RCPSSm")>;
-def: InstRW<[SKXWriteResGroup135], (instregex "RSQRTSSm")>;
-def: InstRW<[SKXWriteResGroup135], (instregex "VRCPSSm")>;
-def: InstRW<[SKXWriteResGroup135], (instregex "VRSQRTSSm")>;
-def: InstRW<[SKXWriteResGroup135], (instregex "VTESTPDYrm")>;
-def: InstRW<[SKXWriteResGroup135], (instregex "VTESTPSYrm")>;
def SKXWriteResGroup136 : SchedWriteRes<[SKXPort5,SKXPort23]> {
let Latency = 9;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup136], (instregex "PCMPGTQrm")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "PSADBWrm")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VALIGNDZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VALIGNQZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VCMPPDZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VCMPPSZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VCMPSDZrm(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VCMPSSZrm(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VDBPSADBWZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VFPCLASSSSrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPBZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPDZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPEQBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPEQDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPEQQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPEQWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPGTBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPGTDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPGTQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPGTQrm")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPGTWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPQZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPUBZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPUDZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPUQZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPUWZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPWZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPERMI2D128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPERMI2PD128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPERMI2PS128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPERMI2Q128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPERMT2D128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPERMT2PD128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPERMT2PS128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPERMT2Q128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMAXSQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMAXUQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMINSQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMINUQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXBDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXBQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXBWYrm")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXBWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXDQYrm")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXDQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXWDYrm")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXWDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXWQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVZXBDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVZXBQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVZXBWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVZXDQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVZXWDYrm")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVZXWDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVZXWQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPSADBWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPSADBWrm")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPTESTMBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPTESTMDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPTESTMQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPTESTMWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPTESTNMBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPTESTNMDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPTESTNMQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPTESTNMWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VALIGNDZ128rm(b?)i",
+ "VALIGNQZ128rm(b?)i",
+ "VCMPPDZ128rm(b?)i",
+ "VCMPPSZ128rm(b?)i",
+ "VCMPSDZrm",
+ "VCMPSSZrm",
+ "VFPCLASSSSZrm(b?)",
+ "VPCMPBZ128rmi(b?)",
+ "VPCMPDZ128rmi(b?)",
+ "VPCMPEQ(B|D|Q|W)Z128rm(b?)",
+ "VPCMPGT(B|D|Q|W)Z128rm(b?)",
+ "(V?)PCMPGTQrm",
+ "VPCMPQZ128rmi(b?)",
+ "VPCMPU(B|D|Q|W)Z128rmi(b?)",
+ "VPCMPWZ128rmi(b?)",
+ "VPERMI2D128rm(b?)",
+ "VPERMI2PD128rm(b?)",
+ "VPERMI2PS128rm(b?)",
+ "VPERMI2Q128rm(b?)",
+ "VPERMT2D128rm(b?)",
+ "VPERMT2PD128rm(b?)",
+ "VPERMT2PS128rm(b?)",
+ "VPERMT2Q128rm(b?)",
+ "VPMAXSQZ128rm(b?)",
+ "VPMAXUQZ128rm(b?)",
+ "VPMINSQZ128rm(b?)",
+ "VPMINUQZ128rm(b?)",
+ "VPMOVSXBDZ128rm(b?)",
+ "VPMOVSXBQZ128rm(b?)",
+ "VPMOVSXBWYrm",
+ "VPMOVSXBWZ128rm(b?)",
+ "VPMOVSXDQYrm",
+ "VPMOVSXDQZ128rm(b?)",
+ "VPMOVSXWDYrm",
+ "VPMOVSXWDZ128rm(b?)",
+ "VPMOVSXWQZ128rm(b?)",
+ "VPMOVZXBDZ128rm(b?)",
+ "VPMOVZXBQZ128rm(b?)",
+ "VPMOVZXBWZ128rm(b?)",
+ "VPMOVZXDQZ128rm(b?)",
+ "VPMOVZXWDYrm",
+ "VPMOVZXWDZ128rm(b?)",
+ "VPMOVZXWQZ128rm(b?)",
+ "VPTESTMBZ128rm(b?)",
+ "VPTESTMDZ128rm(b?)",
+ "VPTESTMQZ128rm(b?)",
+ "VPTESTMWZ128rm(b?)",
+ "VPTESTNMBZ128rm(b?)",
+ "VPTESTNMDZ128rm(b?)",
+ "VPTESTNMQZ128rm(b?)",
+ "VPTESTNMWZ128rm(b?)")>;

def SKXWriteResGroup137 : SchedWriteRes<[SKXPort23,SKXPort015]> {
let Latency = 9;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup137], (instregex "ADDSDrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "ADDSSrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "CMPSDrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "CMPSSrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "CVTPS2PDrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "MAX(C?)SDrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "MAX(C?)SSrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "MIN(C?)SDrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "MIN(C?)SSrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "MMX_CVTPS2PIirm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "MMX_CVTTPS2PIirm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "MULSDrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "MULSSrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "SUBSDrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "SUBSSrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "VADDSDrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "VADDSSrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "VCMPSDrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "VCMPSSrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "VCVTPH2PSrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "VCVTPS2PDrm")>;
-def: InstRW<[SKXWriteResGroup137],
- (instregex "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)m")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "VMAX(C?)SDrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "VMAX(C?)SSrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "VMIN(C?)SDrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "VMIN(C?)SSrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "VMULSDrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "VMULSSrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "VSUBSDrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "VSUBSSrm")>;
-
-def SKXWriteResGroup138 : SchedWriteRes<[SKXPort0,SKXPort015]> {
- let Latency = 9;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[SKXWriteResGroup138], (instregex "VRCP14PDZr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup138], (instregex "VRCP14PSZr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup138], (instregex "VRSQRT14PDZr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup138], (instregex "VRSQRT14PSZr(b?)(k?)(z?)")>;
-
-def SKXWriteResGroup139 : SchedWriteRes<[SKXPort5,SKXPort015]> {
- let Latency = 9;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[SKXWriteResGroup139], (instregex "DPPDrri")>;
-def: InstRW<[SKXWriteResGroup139], (instregex "VDPPDrri")>;
-
-def SKXWriteResGroup140 : SchedWriteRes<[SKXPort23,SKXPort015]> {
- let Latency = 9;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[SKXWriteResGroup140], (instregex "VBLENDVPDYrm")>;
-def: InstRW<[SKXWriteResGroup140], (instregex "VBLENDVPSYrm")>;
-
-def SKXWriteResGroup141 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
- let Latency = 9;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SKXWriteResGroup141], (instregex "PTESTrm")>;
-def: InstRW<[SKXWriteResGroup141], (instregex "VPTESTrm")>;
+def: InstRW<[SKXWriteResGroup137], (instregex "MMX_CVT(T?)PS2PIirm",
+ "(V?)CVTPS2PDrm")>;

def SKXWriteResGroup142 : SchedWriteRes<[SKXPort1,SKXPort5,SKXPort23]> {
let Latency = 9;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup142], (instregex "MULX64rm")>;
+def: InstRW<[SKXWriteResGroup142], (instrs IMUL64m, MUL64m, MULX64rm)>;

def SKXWriteResGroup143 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23]> {
let Latency = 9;
let NumMicroOps = 4;
let ResourceCycles = [2,1,1];
}
-def: InstRW<[SKXWriteResGroup143], (instregex "PHADDSWrm128")>;
-def: InstRW<[SKXWriteResGroup143], (instregex "PHSUBSWrm128")>;
-def: InstRW<[SKXWriteResGroup143], (instregex "VPHADDSWrm128")>;
-def: InstRW<[SKXWriteResGroup143], (instregex "VPHSUBSWrm128")>;
-
-def SKXWriteResGroup144 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
- let Latency = 9;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SKXWriteResGroup144], (instregex "PHADDDrm")>;
-def: InstRW<[SKXWriteResGroup144], (instregex "PHADDWrm")>;
-def: InstRW<[SKXWriteResGroup144], (instregex "PHSUBDrm")>;
-def: InstRW<[SKXWriteResGroup144], (instregex "PHSUBWrm")>;
-def: InstRW<[SKXWriteResGroup144], (instregex "VPHADDDrm")>;
-def: InstRW<[SKXWriteResGroup144], (instregex "VPHADDWrm")>;
-def: InstRW<[SKXWriteResGroup144], (instregex "VPHSUBDrm")>;
-def: InstRW<[SKXWriteResGroup144], (instregex "VPHSUBWrm")>;
+def: InstRW<[SKXWriteResGroup143], (instregex "(V?)PHADDSWrm",
+ "(V?)PHSUBSWrm")>;

def SKXWriteResGroup145 : SchedWriteRes<[SKXPort1,SKXPort23,SKXPort237,SKXPort0156]> {
let Latency = 9;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[SKXWriteResGroup145], (instregex "SHLD(16|32|64)mri8")>;
-def: InstRW<[SKXWriteResGroup145], (instregex "SHRD(16|32|64)mri8")>;
+def: InstRW<[SKXWriteResGroup145], (instregex "SHLD(16|32|64)mri8",
+ "SHRD(16|32|64)mri8")>;

def SKXWriteResGroup146 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort23,SKXPort0156]> {
let Latency = 9;
let NumMicroOps = 5;
let ResourceCycles = [1,2,1,1];
}
-def: InstRW<[SKXWriteResGroup146], (instregex "LAR(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup146], (instregex "LSL(16|32|64)rm")>;
-
-def SKXWriteResGroup147 : SchedWriteRes<[SKXPort0,SKXPort23]> {
- let Latency = 10;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup147], (instregex "AESDECLASTrm")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "AESDECrm")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "AESENCLASTrm")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "AESENCrm")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "RCPPSm")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "RSQRTPSm")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "VAESDECLASTrm")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "VAESDECrm")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "VAESENCLASTrm")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "VAESENCrm")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "VRCP14PDZ128m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "VRCP14PSZ128m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "VRCP14SDrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "VRCP14SSrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "VRCPPSm")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "VRSQRT14PDZ128m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "VRSQRT14PSZ128m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "VRSQRT14SDrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "VRSQRT14SSrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "VRSQRTPSm")>;
+def: InstRW<[SKXWriteResGroup146], (instregex "LAR(16|32|64)rm",
+ "LSL(16|32|64)rm")>;

def SKXWriteResGroup148 : SchedWriteRes<[SKXPort5,SKXPort23]> {
let Latency = 10;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup148], (instregex "ADD_F32m")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "ADD_F64m")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "ILD_F16m")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "ILD_F32m")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "ILD_F64m")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "SUBR_F32m")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "SUBR_F64m")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "SUB_F32m")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "SUB_F64m")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VALIGNDZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VALIGNDZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VALIGNQZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VALIGNQZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VCMPPDZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VCMPPDZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VCMPPSZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VCMPPSZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VDBPSADBWZ256rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VDBPSADBWZrmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPBZ256rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPBZrmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPDZ256rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPDZrmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTQYrm")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPQZ256rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPQZrmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPUBZ256rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPUBZrmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPUDZ256rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPUDZrmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPUQZ256rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPUQZrmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPUWZ256rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPUWZrmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPWZ256rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPWZrmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERM2F128rm")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERM2I128rm")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMDYrm")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2D256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2Drm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2PD256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2PDrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2PS256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2PSrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2Q256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2Qrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPDYmi")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPDZ256m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPDZm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPSYrm")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMQYmi")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMQZ256m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMQZm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2D256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2Drm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2PD256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2PDrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2PS256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2PSrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2Q256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2Qrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMAXSQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMAXSQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMAXUQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMAXUQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMINSQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMINSQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMINUQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMINUQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXBDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXBDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXBQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXBQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXBWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXBWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXDQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXWDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXWDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXWQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXWQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBDYrm")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBQYrm")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBWYrm")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXDQYrm")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXDQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXWDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXWDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXWQYrm")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXWQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXWQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPSADBWYrm")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPSADBWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPSADBWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VSHUFF32X4Z256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VSHUFF32X4Zrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VSHUFF64X2Z256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VSHUFF64X2Zrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VSHUFI32X4Z256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VSHUFI32X4Zrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VSHUFI64X2Z256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VSHUFI64X2Zrm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "(ADD|SUB|SUBR)_F(32|64)m",
+ "ILD_F(16|32|64)m",
+ "VALIGND(Z|Z256)rm(b?)i",
+ "VALIGNQ(Z|Z256)rm(b?)i",
+ "VCMPPD(Z|Z256)rm(b?)i",
+ "VCMPPS(Z|Z256)rm(b?)i",
+ "VPCMPB(Z|Z256)rmi(b?)",
+ "VPCMPD(Z|Z256)rmi(b?)",
+ "VPCMPEQB(Z|Z256)rm(b?)",
+ "VPCMPEQD(Z|Z256)rm(b?)",
+ "VPCMPEQQ(Z|Z256)rm(b?)",
+ "VPCMPEQW(Z|Z256)rm(b?)",
+ "VPCMPGTB(Z|Z256)rm(b?)",
+ "VPCMPGTD(Z|Z256)rm(b?)",
+ "VPCMPGTQYrm",
+ "VPCMPGTQ(Z|Z256)rm(b?)",
+ "VPCMPGTW(Z|Z256)rm(b?)",
+ "VPCMPQ(Z|Z256)rmi(b?)",
+ "VPCMPU(B|D|Q|W)Z256rmi(b?)",
+ "VPCMPU(B|D|Q|W)Zrmi(b?)",
+ "VPCMPW(Z|Z256)rmi(b?)",
+ "VPMAXSQ(Z|Z256)rm(b?)",
+ "VPMAXUQ(Z|Z256)rm(b?)",
+ "VPMINSQ(Z|Z256)rm(b?)",
+ "VPMINUQ(Z|Z256)rm(b?)",
+ "VPTESTM(B|D|Q|W)Z256rm(b?)",
+ "VPTESTM(B|D|Q|W)Zrm(b?)",
+ "VPTESTNM(B|D|Q|W)Z256rm(b?)",
+ "VPTESTNM(B|D|Q|W)Zrm(b?)")>;
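// The single-pattern instregex lines above are folded into one InstRW call:
// alternation such as "(Z|Z256)" covers the 512-bit and EVEX 256-bit name
// variants in one regex, and "(B|D|Q|W)" covers the element widths. The
// trailing (k?)(z?) masking tokens are dropped, apparently because instregex
// patterns are not anchored at the end, so the masked/zero-masked forms still
// match the shorter prefixes.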
def SKXWriteResGroup149 : SchedWriteRes<[SKXPort23,SKXPort015]> {
let Latency = 10;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup149], (instregex "ADDPDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "ADDPSrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "ADDSUBPDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "ADDSUBPSrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "CMPPDrmi")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "CMPPSrmi")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "CVTDQ2PSrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "CVTPS2DQrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "CVTSS2SDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "CVTTPS2DQrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "MAX(C?)PDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "MAX(C?)PSrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "MIN(C?)PDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "MIN(C?)PSrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "MULPDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "MULPSrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "PHMINPOSUWrm128")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "PMADDUBSWrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "PMADDWDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "PMULDQrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "PMULHRSWrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "PMULHUWrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "PMULHWrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "PMULLWrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "PMULUDQrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "SUBPDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "SUBPSrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VADDPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VADDPDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VADDPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VADDPSrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VADDSDZrm(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VADDSSZrm(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VADDSUBPDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VADDSUBPSrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCMPPDrmi")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCMPPSrmi")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTDQ2PDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTDQ2PSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTDQ2PSrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPD2QQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPD2UQQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPH2PSYrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPH2PSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPS2DQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPS2DQrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPS2PDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPS2QQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPS2UDQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPS2UQQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTQQ2PDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTQQ2PSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTSS2SDZrm(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTSS2SDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTTPD2QQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTTPD2UQQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTTPS2DQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTTPS2DQrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTTPS2QQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTTPS2UDQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTTPS2UQQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTUDQ2PDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTUDQ2PSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTUQQ2PDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTUQQ2PSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VFIXUPIMMPDZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VFIXUPIMMPSZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VFIXUPIMMSDrmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VFIXUPIMMSSrmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149],
- (instregex
- "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Z128m(b?)(k?)(z?)",
- "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)m",
- "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)Zm(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VGETEXPPDZ128m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VGETEXPPSZ128m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VGETEXPSDm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VGETEXPSSm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VGETMANTPDZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VGETMANTPSZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VGETMANTSDZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VGETMANTSSZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMAX(C?)PDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMAX(C?)PDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMAX(C?)PSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMAX(C?)PSrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMAX(C?)SDZrm(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMAX(C?)SSZrm(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMIN(C?)PDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMIN(C?)PDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMIN(C?)PSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMIN(C?)PSrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMIN(C?)SDZrm(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMIN(C?)SSZrm(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMULPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMULPDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMULPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMULPSrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMULSDZrm(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMULSSZrm(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPHMINPOSUWrm128")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPLZCNTDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPLZCNTQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMADDUBSWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMADDUBSWrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMADDWDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMADDWDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMULDQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMULDQrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMULHRSWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMULHRSWrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMULHUWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMULHUWrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMULHWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMULHWrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMULLWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMULLWrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMULUDQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMULUDQrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VRANGEPDZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VRANGEPSZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VRANGESDZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VRANGESSZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VREDUCEPDZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VREDUCEPSZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VREDUCESDZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VREDUCESSZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VSCALEFPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VSCALEFPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VSCALEFSDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VSCALEFSSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VSUBPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VSUBPDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VSUBPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VSUBPSrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VSUBSDZrm(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VSUBSSZrm(_Int)?(k?)(z?)")>;
-
-def SKXWriteResGroup150 : SchedWriteRes<[SKXPort0]> {
- let Latency = 10;
- let NumMicroOps = 3;
- let ResourceCycles = [3];
-}
-def: InstRW<[SKXWriteResGroup150], (instregex "PCMPISTRIrr")>;
-def: InstRW<[SKXWriteResGroup150], (instregex "PCMPISTRM128rr")>;
-def: InstRW<[SKXWriteResGroup150], (instregex "VPCMPISTRIrr")>;
-def: InstRW<[SKXWriteResGroup150], (instregex "VPCMPISTRM128rr")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCVTDQ2PDZ128rm(b?)",
+ "VCVTDQ2PSZ128rm(b?)",
+ "(V?)CVTDQ2PSrm",
+ "VCVTPD2QQZ128rm(b?)",
+ "VCVTPD2UQQZ128rm(b?)",
+ "VCVTPH2PSZ128rm(b?)",
+ "VCVTPS2DQZ128rm(b?)",
+ "(V?)CVTPS2DQrm",
+ "VCVTPS2PDZ128rm(b?)",
+ "VCVTPS2QQZ128rm(b?)",
+ "VCVTPS2UDQZ128rm(b?)",
+ "VCVTPS2UQQZ128rm(b?)",
+ "VCVTQQ2PDZ128rm(b?)",
+ "VCVTQQ2PSZ128rm(b?)",
+ "VCVTSS2SDZrm",
+ "(V?)CVTSS2SDrm",
+ "VCVTTPD2QQZ128rm(b?)",
+ "VCVTTPD2UQQZ128rm(b?)",
+ "VCVTTPS2DQZ128rm(b?)",
+ "(V?)CVTTPS2DQrm",
+ "VCVTTPS2QQZ128rm(b?)",
+ "VCVTTPS2UDQZ128rm(b?)",
+ "VCVTTPS2UQQZ128rm(b?)",
+ "VCVTUDQ2PDZ128rm(b?)",
+ "VCVTUDQ2PSZ128rm(b?)",
+ "VCVTUQQ2PDZ128rm(b?)",
+ "VCVTUQQ2PSZ128rm(b?)")>;
def SKXWriteResGroup151 : SchedWriteRes<[SKXPort5,SKXPort23]> {
let Latency = 10;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKXWriteResGroup151], (instregex "MPSADBWrmi")>;
-def: InstRW<[SKXWriteResGroup151], (instregex "VEXPANDPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup151], (instregex "VEXPANDPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup151], (instregex "VMPSADBWrmi")>;
-def: InstRW<[SKXWriteResGroup151], (instregex "VPEXPANDDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup151], (instregex "VPEXPANDQZ128rm(b?)(k?)(z?)")>;
-
-def SKXWriteResGroup152 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
- let Latency = 10;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SKXWriteResGroup152], (instregex "MMX_CVTPI2PDirm")>;
-def: InstRW<[SKXWriteResGroup152], (instregex "VPTESTYrm")>;
+def: InstRW<[SKXWriteResGroup151], (instregex "VEXPANDPDZ128rm(b?)",
+ "VEXPANDPSZ128rm(b?)",
+ "VPEXPANDDZ128rm(b?)",
+ "VPEXPANDQZ128rm(b?)")>;
def SKXWriteResGroup153 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
let Latency = 10;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup153], (instregex "CVTSD2SSrm")>;
-def: InstRW<[SKXWriteResGroup153], (instregex "VCVTSD2SSrm")>;
+def: InstRW<[SKXWriteResGroup153], (instregex "(V?)CVTSD2SSrm")>;
def SKXWriteResGroup154 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23]> {
let Latency = 10;
let NumMicroOps = 4;
let ResourceCycles = [2,1,1];
}
-def: InstRW<[SKXWriteResGroup154], (instregex "VPHADDSWrm256")>;
-def: InstRW<[SKXWriteResGroup154], (instregex "VPHSUBSWrm256")>;
-
-def SKXWriteResGroup155 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
- let Latency = 10;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SKXWriteResGroup155], (instregex "VPHADDDYrm")>;
-def: InstRW<[SKXWriteResGroup155], (instregex "VPHADDWYrm")>;
-def: InstRW<[SKXWriteResGroup155], (instregex "VPHSUBDYrm")>;
-def: InstRW<[SKXWriteResGroup155], (instregex "VPHSUBWYrm")>;
+def: InstRW<[SKXWriteResGroup154], (instregex "VPHADDSWYrm",
+ "VPHSUBSWYrm")>;
def SKXWriteResGroup156 : SchedWriteRes<[SKXPort1,SKXPort23,SKXPort06,SKXPort0156]> {
- let Latency = 10;
+ let Latency = 9;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[SKXWriteResGroup156], (instregex "MULX32rm")>;
+def: InstRW<[SKXWriteResGroup156], (instrs IMUL32m, MUL32m, MULX32rm)>;
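// 'instrs' names the instruction records directly (here IMUL32m, MUL32m and
// MULX32rm) instead of matching them with 'instregex', which avoids the regex
// search over all opcode names; the group's latency is also adjusted from 10
// to 9 cycles in the same change.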
def SKXWriteResGroup157 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> {
let Latency = 10;
let NumMicroOps = 8;
let ResourceCycles = [1,1,1,1,1,3];
}
-def: InstRW<[SKXWriteResGroup157], (instregex "ADD8mi")>;
-def: InstRW<[SKXWriteResGroup157], (instregex "AND8mi")>;
-def: InstRW<[SKXWriteResGroup157], (instregex "OR8mi")>;
-def: InstRW<[SKXWriteResGroup157], (instregex "SUB8mi")>;
-def: InstRW<[SKXWriteResGroup157], (instregex "XCHG(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup157], (instregex "XCHG8rm")>;
-def: InstRW<[SKXWriteResGroup157], (instregex "XOR8mi")>;
+def: InstRW<[SKXWriteResGroup157], (instregex "XCHG(8|16|32|64)rm")>;
-def SKXWriteResGroup158 : SchedWriteRes<[SKXPort05,SKXPort0156]> {
- let Latency = 10;
- let NumMicroOps = 10;
- let ResourceCycles = [9,1];
-}
-def: InstRW<[SKXWriteResGroup158], (instregex "MMX_EMMS")>;
-
-def SKXWriteResGroup159 : SchedWriteRes<[SKXPort0]> {
+def SKXWriteResGroup159 : SchedWriteRes<[SKXPort0,SKXFPDivider]> {
let Latency = 11;
let NumMicroOps = 1;
- let ResourceCycles = [1];
+ let ResourceCycles = [1,3];
}
-def: InstRW<[SKXWriteResGroup159], (instregex "DIVPSrr")>;
-def: InstRW<[SKXWriteResGroup159], (instregex "DIVSSrr")>;
-def: InstRW<[SKXWriteResGroup159], (instregex "VDIVPSYrr")>;
-def: InstRW<[SKXWriteResGroup159], (instregex "VDIVPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup159], (instregex "VDIVPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup159], (instregex "VDIVPSrr")>;
-def: InstRW<[SKXWriteResGroup159], (instregex "VDIVSSZrr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup159], (instregex "VDIVSSrr")>;
+def : SchedAlias<WriteFDivX, SKXWriteResGroup159>; // TODO - convert to ZnWriteResFpuPair
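// SchedAlias redirects the generic WriteFDivX class to this group, so
// instructions whose definitions carry Sched<[WriteFDivX]> (the 128-bit
// packed single-precision divides among the forms removed above) inherit the
// 11-cycle latency and the 3-cycle SKXFPDivider occupancy without explicit
// InstRW overrides; the scalar and wider forms are presumably picked up by the
// matching WriteFDiv/WriteFDivY aliases elsewhere in the file.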
def SKXWriteResGroup160 : SchedWriteRes<[SKXPort0,SKXPort23]> {
let Latency = 11;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup160], (instregex "MUL_F32m")>;
-def: InstRW<[SKXWriteResGroup160], (instregex "MUL_F64m")>;
-def: InstRW<[SKXWriteResGroup160], (instregex "VRCP14PDZ256m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup160], (instregex "VRCP14PSZ256m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup160], (instregex "VRCPPSYm")>;
-def: InstRW<[SKXWriteResGroup160], (instregex "VRSQRT14PDZ256m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup160], (instregex "VRSQRT14PSZ256m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup160], (instregex "VRSQRTPSYm")>;
+def: InstRW<[SKXWriteResGroup160], (instregex "MUL_F(32|64)m")>;
def SKXWriteResGroup161 : SchedWriteRes<[SKXPort23,SKXPort015]> {
let Latency = 11;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup161], (instregex "VADDPDYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VADDPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VADDPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VADDPSYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VADDPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VADDPSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VADDSUBPDYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VADDSUBPSYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCMPPDYrmi")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCMPPSYrmi")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTDQ2PDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTDQ2PDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTDQ2PSYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTDQ2PSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTDQ2PSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPD2QQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPD2QQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPD2UQQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPD2UQQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPH2PSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPH2PSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2DQYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2DQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2DQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2PDYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2PDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2PDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2QQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2UDQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2UDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2UQQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTQQ2PDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTQQ2PDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTQQ2PSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTQQ2PSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPD2QQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPD2QQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPD2UQQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPD2UQQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPS2DQYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPS2DQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPS2DQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPS2QQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPS2UDQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPS2UDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPS2UQQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUDQ2PDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUDQ2PDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUDQ2PSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUDQ2PSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUQQ2PDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUQQ2PDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUQQ2PSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUQQ2PSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VFIXUPIMMPDZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VFIXUPIMMPDZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VFIXUPIMMPSZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VFIXUPIMMPSZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161],
- (instregex
- "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Ym",
- "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Z256m(b?)(k?)(z?)",
- "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Zm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VGETEXPPDZ256m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VGETEXPPDm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VGETEXPPSZ256m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VGETEXPPSm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VGETMANTPDZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VGETMANTPDZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VGETMANTPSZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VGETMANTPSZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMAX(C?)PDYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMAX(C?)PDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMAX(C?)PDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMAX(C?)PSYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMAX(C?)PSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMAX(C?)PSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMIN(C?)PDYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMIN(C?)PDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMIN(C?)PDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMIN(C?)PSYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMIN(C?)PSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMIN(C?)PSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMULPDYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMULPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMULPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMULPSYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMULPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMULPSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPLZCNTDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPLZCNTDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPLZCNTQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPLZCNTQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMADDUBSWYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMADDUBSWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMADDUBSWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMADDWDYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMADDWDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMADDWDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULDQYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULDQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHRSWYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHRSWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHRSWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHUWYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHUWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHUWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHWYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULLWYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULLWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULLWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULUDQYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULUDQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULUDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VRANGEPDZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VRANGEPDZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VRANGEPSZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VRANGEPSZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VREDUCEPDZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VREDUCEPDZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VREDUCEPSZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VREDUCEPSZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VSCALEFPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VSCALEFPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VSCALEFPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VSCALEFPSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VSUBPDYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VSUBPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VSUBPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VSUBPSYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VSUBPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VSUBPSZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTDQ2PD(Z|Z256)rm(b?)",
+ "VCVTDQ2PSYrm",
+ "VCVTDQ2PS(Z|Z256)rm(b?)",
+ "VCVTPH2PS(Z|Z256)rm(b?)",
+ "VCVTPS2PDYrm",
+ "VCVTPS2PD(Z|Z256)rm(b?)",
+ "VCVTQQ2PD(Z|Z256)rm(b?)",
+ "VCVTQQ2PSZ256rm(b?)",
+ "VCVT(T?)PD2QQ(Z|Z256)rm(b?)",
+ "VCVT(T?)PD2UQQ(Z|Z256)rm(b?)",
+ "VCVT(T?)PS2DQYrm",
+ "VCVT(T?)PS2DQ(Z|Z256)rm(b?)",
+ "VCVT(T?)PS2QQZ256rm(b?)",
+ "VCVT(T?)PS2UDQ(Z|Z256)rm(b?)",
+ "VCVT(T?)PS2UQQZ256rm(b?)",
+ "VCVTUDQ2PD(Z|Z256)rm(b?)",
+ "VCVTUDQ2PS(Z|Z256)rm(b?)",
+ "VCVTUQQ2PD(Z|Z256)rm(b?)",
+ "VCVTUQQ2PSZ256rm(b?)")>;
def SKXWriteResGroup162 : SchedWriteRes<[SKXPort5,SKXPort23]> {
let Latency = 11;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKXWriteResGroup162], (instregex "FICOM16m")>;
-def: InstRW<[SKXWriteResGroup162], (instregex "FICOM32m")>;
-def: InstRW<[SKXWriteResGroup162], (instregex "FICOMP16m")>;
-def: InstRW<[SKXWriteResGroup162], (instregex "FICOMP32m")>;
-def: InstRW<[SKXWriteResGroup162], (instregex "VEXPANDPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup162], (instregex "VEXPANDPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup162], (instregex "VEXPANDPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup162], (instregex "VEXPANDPSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup162], (instregex "VMPSADBWYrmi")>;
-def: InstRW<[SKXWriteResGroup162], (instregex "VPEXPANDDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup162], (instregex "VPEXPANDDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup162], (instregex "VPEXPANDQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup162], (instregex "VPEXPANDQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup162], (instregex "FICOM(P?)(16|32)m",
+ "VEXPANDPD(Z|Z256)rm(b?)",
+ "VEXPANDPS(Z|Z256)rm(b?)",
+ "VPEXPANDD(Z|Z256)rm(b?)",
+ "VPEXPANDQ(Z|Z256)rm(b?)")>;
def SKXWriteResGroup163 : SchedWriteRes<[SKXPort23,SKXPort015]> {
let Latency = 11;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[SKXWriteResGroup163], (instregex "VCVTSD2SSZrm(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup163], (instregex "VCVTSD2SSZrm")>;
def SKXWriteResGroup164 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
let Latency = 11;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup164], (instregex "CVTDQ2PDrm")>;
-def: InstRW<[SKXWriteResGroup164], (instregex "VCVTDQ2PDrm")>;
-
-def SKXWriteResGroup165 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015]> {
- let Latency = 11;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SKXWriteResGroup165], (instregex "CVTSD2SI64rm")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "CVTSD2SIrm")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "CVTSS2SI64rm")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "CVTSS2SIrm")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "CVTTSD2SI64rm")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "CVTTSD2SIrm")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "CVTTSS2SIrm")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSD2SI64Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSD2SI64rm")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSD2SIZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSD2SIrm")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSD2USI64Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSS2SI64Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSS2SI64rm")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSS2SIZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSS2SIrm")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSS2USIZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSD2SI64Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSD2SI64rm")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSD2SIZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSD2SIrm")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSD2USI64Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSS2SI64Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSS2SI64rm")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSS2SIZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSS2SIrm")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSS2USIZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup164], (instregex "(V?)CVTDQ2PDrm")>;
def SKXWriteResGroup166 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
let Latency = 11;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup166], (instregex "CVTPD2DQrm")>;
-def: InstRW<[SKXWriteResGroup166], (instregex "CVTPD2PSrm")>;
-def: InstRW<[SKXWriteResGroup166], (instregex "CVTTPD2DQrm")>;
-def: InstRW<[SKXWriteResGroup166], (instregex "MMX_CVTPD2PIirm")>;
-def: InstRW<[SKXWriteResGroup166], (instregex "MMX_CVTTPD2PIirm")>;
+def: InstRW<[SKXWriteResGroup166], (instregex "CVTPD2PSrm",
+ "CVT(T?)PD2DQrm",
+ "MMX_CVT(T?)PD2PIirm")>;
def SKXWriteResGroup167 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
let Latency = 11;
let NumMicroOps = 4;
let ResourceCycles = [2,1,1];
}
-def: InstRW<[SKXWriteResGroup167], (instregex "VPCONFLICTQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup167], (instregex "VPCONFLICTQZ128rm(b?)")>;
def SKXWriteResGroup168 : SchedWriteRes<[SKXPort1,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> {
let Latency = 11;
let NumMicroOps = 6;
let ResourceCycles = [1,1,1,2,1];
}
-def: InstRW<[SKXWriteResGroup168], (instregex "SHLD(16|32|64)mrCL")>;
-def: InstRW<[SKXWriteResGroup168], (instregex "SHRD(16|32|64)mrCL")>;
+def: InstRW<[SKXWriteResGroup168], (instregex "SHLD(16|32|64)mrCL",
+ "SHRD(16|32|64)mrCL")>;
def SKXWriteResGroup169 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> {
let Latency = 11;
let NumMicroOps = 7;
let ResourceCycles = [2,3,2];
}
-def: InstRW<[SKXWriteResGroup169], (instregex "RCL(16|32|64)rCL")>;
-def: InstRW<[SKXWriteResGroup169], (instregex "RCR(16|32|64)rCL")>;
+def: InstRW<[SKXWriteResGroup169], (instregex "RCL(16|32|64)rCL",
+ "RCR(16|32|64)rCL")>;
def SKXWriteResGroup170 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort15,SKXPort0156]> {
let Latency = 11;
@@ -5624,80 +2061,44 @@ def SKXWriteResGroup171 : SchedWriteRes<[SKXPort06,SKXPort0156]> {
let NumMicroOps = 11;
let ResourceCycles = [2,9];
}
-def: InstRW<[SKXWriteResGroup171], (instregex "LOOPE")>;
-def: InstRW<[SKXWriteResGroup171], (instregex "LOOPNE")>;
-
-def SKXWriteResGroup172 : SchedWriteRes<[SKXPort0]> {
- let Latency = 12;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKXWriteResGroup172], (instregex "SQRTPSr")>;
-def: InstRW<[SKXWriteResGroup172], (instregex "SQRTSSr")>;
-def: InstRW<[SKXWriteResGroup172], (instregex "VSQRTPSYr")>;
-def: InstRW<[SKXWriteResGroup172], (instregex "VSQRTPSZ128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup172], (instregex "VSQRTPSZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup172], (instregex "VSQRTPSr")>;
-def: InstRW<[SKXWriteResGroup172], (instregex "VSQRTSSZr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup172], (instregex "VSQRTSSr")>;
+def: InstRW<[SKXWriteResGroup171], (instrs LOOPE, LOOPNE)>;
-def SKXWriteResGroup173 : SchedWriteRes<[SKXPort5,SKXPort23]> {
+def SKXWriteResGroup174 : SchedWriteRes<[SKXPort01]> {
let Latency = 12;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
}
-def: InstRW<[SKXWriteResGroup173], (instregex "PCLMULQDQrm")>;
-def: InstRW<[SKXWriteResGroup173], (instregex "VPCLMULQDQrm")>;
+def: InstRW<[SKXWriteResGroup174], (instregex "VPMULLQ(Z128|Z256)rr")>;
-def SKXWriteResGroup174 : SchedWriteRes<[SKXPort015]> {
+def SKXWriteResGroup174z : SchedWriteRes<[SKXPort05]> {
let Latency = 12;
let NumMicroOps = 3;
let ResourceCycles = [3];
}
-def: InstRW<[SKXWriteResGroup174], (instregex "VPMULLQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup174], (instregex "VPMULLQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup174], (instregex "VPMULLQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup174z], (instregex "VPMULLQZrr")>;
def SKXWriteResGroup175 : SchedWriteRes<[SKXPort5,SKXPort23]> {
let Latency = 12;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKXWriteResGroup175], (instregex "VPERMWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup175], (instregex "VPERMWZ128rm(b?)")>;
def SKXWriteResGroup176 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015]> {
let Latency = 12;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup176], (instregex "VCVTSD2USIZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup176], (instregex "VCVTSS2USI64Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup176], (instregex "VCVTTSD2USIZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup176], (instregex "VCVTTSS2USI64Zrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup176], (instregex "VCVT(T?)SD2USIZrm(b?)",
+ "VCVT(T?)SS2USI64Zrm(b?)")>;
def SKXWriteResGroup177 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
let Latency = 12;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup177], (instregex "VCVTPS2QQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup177], (instregex "VCVTPS2UQQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup177], (instregex "VCVTTPS2QQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup177], (instregex "VCVTTPS2UQQZrm(b?)(k?)(z?)")>;
-
-def SKXWriteResGroup178 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
- let Latency = 12;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SKXWriteResGroup178], (instregex "HADDPDrm")>;
-def: InstRW<[SKXWriteResGroup178], (instregex "HADDPSrm")>;
-def: InstRW<[SKXWriteResGroup178], (instregex "HSUBPDrm")>;
-def: InstRW<[SKXWriteResGroup178], (instregex "HSUBPSrm")>;
-def: InstRW<[SKXWriteResGroup178], (instregex "VHADDPDrm")>;
-def: InstRW<[SKXWriteResGroup178], (instregex "VHADDPSrm")>;
-def: InstRW<[SKXWriteResGroup178], (instregex "VHSUBPDrm")>;
-def: InstRW<[SKXWriteResGroup178], (instregex "VHSUBPSrm")>;
+def: InstRW<[SKXWriteResGroup177], (instregex "VCVT(T?)PS2QQZrm(b?)",
+ "VCVT(T?)PS2UQQZrm(b?)")>;
def SKXWriteResGroup179 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23,SKXPort015]> {
let Latency = 12;
@@ -5711,14 +2112,9 @@ def SKXWriteResGroup180 : SchedWriteRes<[SKXPort5,SKXPort23]> {
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKXWriteResGroup180], (instregex "ADD_FI16m")>;
-def: InstRW<[SKXWriteResGroup180], (instregex "ADD_FI32m")>;
-def: InstRW<[SKXWriteResGroup180], (instregex "SUBR_FI16m")>;
-def: InstRW<[SKXWriteResGroup180], (instregex "SUBR_FI32m")>;
-def: InstRW<[SKXWriteResGroup180], (instregex "SUB_FI16m")>;
-def: InstRW<[SKXWriteResGroup180], (instregex "SUB_FI32m")>;
-def: InstRW<[SKXWriteResGroup180], (instregex "VPERMWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup180], (instregex "VPERMWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup180], (instregex "(ADD|SUB|SUBR)_FI(16|32)m",
+ "VPERMWZ256rm(b?)",
+ "VPERMWZrm(b?)")>;
def SKXWriteResGroup181 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
let Latency = 13;
@@ -5727,100 +2123,58 @@ def SKXWriteResGroup181 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
}
def: InstRW<[SKXWriteResGroup181], (instregex "VCVTDQ2PDYrm")>;
-def SKXWriteResGroup182 : SchedWriteRes<[SKXPort5,SKXPort015]> {
- let Latency = 13;
- let NumMicroOps = 4;
- let ResourceCycles = [1,3];
-}
-def: InstRW<[SKXWriteResGroup182], (instregex "DPPSrri")>;
-def: InstRW<[SKXWriteResGroup182], (instregex "VDPPSYrri")>;
-def: InstRW<[SKXWriteResGroup182], (instregex "VDPPSrri")>;
-
def SKXWriteResGroup183 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
let Latency = 13;
let NumMicroOps = 4;
let ResourceCycles = [2,1,1];
}
-def: InstRW<[SKXWriteResGroup183], (instregex "VHADDPDYrm")>;
-def: InstRW<[SKXWriteResGroup183], (instregex "VHADDPSYrm")>;
-def: InstRW<[SKXWriteResGroup183], (instregex "VHSUBPDYrm")>;
-def: InstRW<[SKXWriteResGroup183], (instregex "VHSUBPSYrm")>;
-def: InstRW<[SKXWriteResGroup183], (instregex "VPERMI2W128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup183], (instregex "VPERMT2W128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup183], (instregex "VPERMI2W128rm(b?)",
+ "VPERMT2W128rm(b?)")>;
-def SKXWriteResGroup184 : SchedWriteRes<[SKXPort0]> {
+def SKXWriteResGroup184 : SchedWriteRes<[SKXPort0,SKXFPDivider]> {
let Latency = 14;
let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKXWriteResGroup184], (instregex "DIVPDrr")>;
-def: InstRW<[SKXWriteResGroup184], (instregex "DIVSDrr")>;
-def: InstRW<[SKXWriteResGroup184], (instregex "VDIVPDYrr")>;
-def: InstRW<[SKXWriteResGroup184], (instregex "VDIVPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup184], (instregex "VDIVPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup184], (instregex "VDIVPDrr")>;
-def: InstRW<[SKXWriteResGroup184], (instregex "VDIVSDZrr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup184], (instregex "VDIVSDrr")>;
-
-def SKXWriteResGroup185 : SchedWriteRes<[SKXPort0,SKXPort23]> {
- let Latency = 14;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
+ let ResourceCycles = [1,3];
}
-def: InstRW<[SKXWriteResGroup185], (instregex "AESIMCrm")>;
-def: InstRW<[SKXWriteResGroup185], (instregex "VAESIMCrm")>;
+def : SchedAlias<WriteFDiv64, SKXWriteResGroup184>; // TODO - convert to ZnWriteResFpuPair
+def : SchedAlias<WriteFDiv64X, SKXWriteResGroup184>; // TODO - convert to ZnWriteResFpuPair
-def SKXWriteResGroup186 : SchedWriteRes<[SKXPort23,SKXPort015]> {
+def SKXWriteResGroup184_1 : SchedWriteRes<[SKXPort0,SKXFPDivider]> {
let Latency = 14;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
+ let NumMicroOps = 1;
+ let ResourceCycles = [1,5];
}
-def: InstRW<[SKXWriteResGroup186], (instregex "PMULLDrm")>;
-def: InstRW<[SKXWriteResGroup186], (instregex "ROUNDPDm")>;
-def: InstRW<[SKXWriteResGroup186], (instregex "ROUNDPSm")>;
-def: InstRW<[SKXWriteResGroup186], (instregex "ROUNDSDm")>;
-def: InstRW<[SKXWriteResGroup186], (instregex "ROUNDSSm")>;
-def: InstRW<[SKXWriteResGroup186], (instregex "VPMULLDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup186], (instregex "VPMULLDrm")>;
-def: InstRW<[SKXWriteResGroup186], (instregex "VRNDSCALEPDZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup186], (instregex "VRNDSCALEPSZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup186], (instregex "VRNDSCALESDm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup186], (instregex "VRNDSCALESSm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup186], (instregex "VROUNDPDm")>;
-def: InstRW<[SKXWriteResGroup186], (instregex "VROUNDPSm")>;
-def: InstRW<[SKXWriteResGroup186], (instregex "VROUNDSDm")>;
-def: InstRW<[SKXWriteResGroup186], (instregex "VROUNDSSm")>;
+def : SchedAlias<WriteFDiv64Y, SKXWriteResGroup184_1>; // TODO - convert to ZnWriteResFpuPair
def SKXWriteResGroup187 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
let Latency = 14;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup187], (instregex "MUL_FI16m")>;
-def: InstRW<[SKXWriteResGroup187], (instregex "MUL_FI32m")>;
+def: InstRW<[SKXWriteResGroup187], (instregex "MUL_FI(16|32)m")>;
def SKXWriteResGroup188 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
let Latency = 14;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup188], (instregex "VCVTPD2DQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup188], (instregex "VCVTPD2PSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup188], (instregex "VCVTPD2UDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup188], (instregex "VCVTQQ2PSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup188], (instregex "VCVTTPD2DQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup188], (instregex "VCVTTPD2UDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup188], (instregex "VCVTUQQ2PSZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup188], (instregex "VCVTPD2DQZrm(b?)",
+ "VCVTPD2PSZrm(b?)",
+ "VCVTPD2UDQZrm(b?)",
+ "VCVTQQ2PSZrm(b?)",
+ "VCVTTPD2DQZrm(b?)",
+ "VCVTTPD2UDQZrm(b?)",
+ "VCVTUQQ2PSZrm(b?)")>;
def SKXWriteResGroup189 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
let Latency = 14;
let NumMicroOps = 4;
let ResourceCycles = [2,1,1];
}
-def: InstRW<[SKXWriteResGroup189], (instregex "VPERMI2W256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup189], (instregex "VPERMI2Wrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup189], (instregex "VPERMT2W256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup189], (instregex "VPERMT2Wrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup189], (instregex "VPERMI2W256rm(b?)",
+ "VPERMI2Wrm(b?)",
+ "VPERMT2W256rm(b?)",
+ "VPERMT2Wrm(b?)")>;
def SKXWriteResGroup190 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort15,SKXPort0156]> {
let Latency = 14;
@@ -5834,206 +2188,85 @@ def SKXWriteResGroup191 : SchedWriteRes<[SKXPort0]> {
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup191], (instregex "DIVR_FPrST0")>;
-def: InstRW<[SKXWriteResGroup191], (instregex "DIVR_FST0r")>;
-def: InstRW<[SKXWriteResGroup191], (instregex "DIVR_FrST0")>;
-
-def SKXWriteResGroup192 : SchedWriteRes<[SKXPort23,SKXPort015]> {
- let Latency = 15;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[SKXWriteResGroup192], (instregex "VPMULLDYrm")>;
-def: InstRW<[SKXWriteResGroup192], (instregex "VPMULLDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup192], (instregex "VPMULLDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup192], (instregex "VRNDSCALEPDZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup192], (instregex "VRNDSCALEPDZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup192], (instregex "VRNDSCALEPSZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup192], (instregex "VRNDSCALEPSZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup192], (instregex "VROUNDYPDm")>;
-def: InstRW<[SKXWriteResGroup192], (instregex "VROUNDYPSm")>;
-
-def SKXWriteResGroup193 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
- let Latency = 15;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,2];
-}
-def: InstRW<[SKXWriteResGroup193], (instregex "DPPDrmi")>;
-def: InstRW<[SKXWriteResGroup193], (instregex "VDPPDrmi")>;
+def: InstRW<[SKXWriteResGroup191], (instregex "DIVR_(FPrST0|FST0r|FrST0)")>;
def SKXWriteResGroup194 : SchedWriteRes<[SKXPort1,SKXPort5,SKXPort01,SKXPort23,SKXPort015]> {
let Latency = 15;
let NumMicroOps = 8;
let ResourceCycles = [1,2,2,1,2];
}
-def: InstRW<[SKXWriteResGroup194], (instregex "VPCONFLICTDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup194], (instregex "VPCONFLICTDZ128rm(b?)")>;
def SKXWriteResGroup195 : SchedWriteRes<[SKXPort1,SKXPort23,SKXPort237,SKXPort06,SKXPort15,SKXPort0156]> {
let Latency = 15;
let NumMicroOps = 10;
let ResourceCycles = [1,1,1,5,1,1];
}
-def: InstRW<[SKXWriteResGroup195], (instregex "RCL(16|32|64)mCL")>;
-def: InstRW<[SKXWriteResGroup195], (instregex "RCL8mCL")>;
-
-def SKXWriteResGroup196 : SchedWriteRes<[SKXPort0,SKXPort23]> {
- let Latency = 16;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup196], (instregex "DIVSSrm")>;
-def: InstRW<[SKXWriteResGroup196], (instregex "VDIVSSrm")>;
-
-def SKXWriteResGroup197 : SchedWriteRes<[SKXPort0,SKXPort23]> {
- let Latency = 16;
- let NumMicroOps = 4;
- let ResourceCycles = [3,1];
-}
-def: InstRW<[SKXWriteResGroup197], (instregex "PCMPISTRIrm")>;
-def: InstRW<[SKXWriteResGroup197], (instregex "PCMPISTRM128rm")>;
-def: InstRW<[SKXWriteResGroup197], (instregex "VPCMPISTRIrm")>;
-def: InstRW<[SKXWriteResGroup197], (instregex "VPCMPISTRM128rm")>;
-
-def SKXWriteResGroup198 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015]> {
- let Latency = 16;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SKXWriteResGroup198], (instregex "VRCP14PDZm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup198], (instregex "VRCP14PSZm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup198], (instregex "VRSQRT14PDZm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup198], (instregex "VRSQRT14PSZm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup195], (instregex "RCL(8|16|32|64)mCL")>;
def SKXWriteResGroup199 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06,SKXPort15,SKXPort0156]> {
let Latency = 16;
let NumMicroOps = 14;
let ResourceCycles = [1,1,1,4,2,5];
}
-def: InstRW<[SKXWriteResGroup199], (instregex "CMPXCHG8B")>;
+def: InstRW<[SKXWriteResGroup199], (instrs CMPXCHG8B)>;
def SKXWriteResGroup200 : SchedWriteRes<[SKXPort0156]> {
let Latency = 16;
let NumMicroOps = 16;
let ResourceCycles = [16];
}
-def: InstRW<[SKXWriteResGroup200], (instregex "VZEROALL")>;
+def: InstRW<[SKXWriteResGroup200], (instrs VZEROALL)>;
-def SKXWriteResGroup201 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+def SKXWriteResGroup201 : SchedWriteRes<[SKXPort0,SKXPort23,SKXFPDivider]> {
let Latency = 17;
let NumMicroOps = 2;
- let ResourceCycles = [1,1];
+ let ResourceCycles = [1,1,5];
}
-def: InstRW<[SKXWriteResGroup201], (instregex "DIVPSrm")>;
-def: InstRW<[SKXWriteResGroup201], (instregex "SQRTSSm")>;
-def: InstRW<[SKXWriteResGroup201], (instregex "VDIVPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup201], (instregex "VDIVPSrm")>;
-def: InstRW<[SKXWriteResGroup201], (instregex "VDIVSSZrm(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup201], (instregex "VSQRTSSm")>;
+def : SchedAlias<WriteFDivXLd, SKXWriteResGroup201>; // TODO - convert to ZnWriteResFpuPair
def SKXWriteResGroup202 : SchedWriteRes<[SKXPort0,SKXPort1,SKXPort5,SKXPort6,SKXPort05,SKXPort0156]> {
let Latency = 17;
let NumMicroOps = 15;
let ResourceCycles = [2,1,2,4,2,4];
}
-def: InstRW<[SKXWriteResGroup202], (instregex "XCH_F")>;
-
-def SKXWriteResGroup203 : SchedWriteRes<[SKXPort0]> {
- let Latency = 18;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKXWriteResGroup203], (instregex "SQRTPDr")>;
-def: InstRW<[SKXWriteResGroup203], (instregex "SQRTSDr")>;
-def: InstRW<[SKXWriteResGroup203], (instregex "VSQRTPDYr")>;
-def: InstRW<[SKXWriteResGroup203], (instregex "VSQRTPDZ128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup203], (instregex "VSQRTPDZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup203], (instregex "VSQRTPDr")>;
-def: InstRW<[SKXWriteResGroup203], (instregex "VSQRTSDZr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup203], (instregex "VSQRTSDr")>;
-
-def SKXWriteResGroup204 : SchedWriteRes<[SKXPort0,SKXPort23]> {
- let Latency = 18;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup204], (instregex "SQRTPSm")>;
-def: InstRW<[SKXWriteResGroup204], (instregex "VDIVPSYrm")>;
-def: InstRW<[SKXWriteResGroup204], (instregex "VDIVPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup204], (instregex "VSQRTPSZ128m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup204], (instregex "VSQRTPSm")>;
-def: InstRW<[SKXWriteResGroup204], (instregex "VSQRTSSZm(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup202], (instrs XCH_F)>;
def SKXWriteResGroup205 : SchedWriteRes<[SKXPort23,SKXPort015]> {
let Latency = 18;
let NumMicroOps = 4;
let ResourceCycles = [1,3];
}
-def: InstRW<[SKXWriteResGroup205], (instregex "VPMULLQZ128rm(b?)(k?)(z?)")>;
-
-def SKXWriteResGroup206 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort0156]> {
- let Latency = 18;
- let NumMicroOps = 8;
- let ResourceCycles = [4,3,1];
-}
-def: InstRW<[SKXWriteResGroup206], (instregex "PCMPESTRIrr")>;
-def: InstRW<[SKXWriteResGroup206], (instregex "VPCMPESTRIrr")>;
+def: InstRW<[SKXWriteResGroup205], (instregex "VPMULLQZ128rm(b?)")>;
def SKXWriteResGroup207 : SchedWriteRes<[SKXPort5,SKXPort6,SKXPort06,SKXPort0156]> {
let Latency = 18;
let NumMicroOps = 8;
let ResourceCycles = [1,1,1,5];
}
-def: InstRW<[SKXWriteResGroup207], (instregex "CPUID")>;
-def: InstRW<[SKXWriteResGroup207], (instregex "RDTSC")>;
+def: InstRW<[SKXWriteResGroup207], (instrs CPUID, RDTSC)>;
def SKXWriteResGroup208 : SchedWriteRes<[SKXPort1,SKXPort23,SKXPort237,SKXPort06,SKXPort15,SKXPort0156]> {
let Latency = 18;
let NumMicroOps = 11;
let ResourceCycles = [2,1,1,4,1,2];
}
-def: InstRW<[SKXWriteResGroup208], (instregex "RCR(16|32|64)mCL")>;
-def: InstRW<[SKXWriteResGroup208], (instregex "RCR8mCL")>;
+def: InstRW<[SKXWriteResGroup208], (instregex "RCR(8|16|32|64)mCL")>;
-def SKXWriteResGroup209 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+def SKXWriteResGroup209 : SchedWriteRes<[SKXPort0,SKXPort23,SKXFPDivider]> {
let Latency = 19;
let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup209], (instregex "DIVSDrm")>;
-def: InstRW<[SKXWriteResGroup209], (instregex "VDIVSDrm")>;
-def: InstRW<[SKXWriteResGroup209], (instregex "VSQRTPSYm")>;
-def: InstRW<[SKXWriteResGroup209], (instregex "VSQRTPSZ256m(b?)(k?)(z?)")>;
-
-def SKXWriteResGroup210 : SchedWriteRes<[SKXPort0,SKXPort015]> {
- let Latency = 19;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
+ let ResourceCycles = [1,1,4];
}
-def: InstRW<[SKXWriteResGroup210], (instregex "VSQRTPSZr(b?)(k?)(z?)")>;
+def : SchedAlias<WriteFDiv64Ld, SKXWriteResGroup209>; // TODO - convert to ZnWriteResFpuPair
def SKXWriteResGroup211 : SchedWriteRes<[SKXPort23,SKXPort015]> {
let Latency = 19;
let NumMicroOps = 4;
let ResourceCycles = [1,3];
}
-def: InstRW<[SKXWriteResGroup211], (instregex "VPMULLQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup211], (instregex "VPMULLQZrm(b?)(k?)(z?)")>;
-
-def SKXWriteResGroup212 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
- let Latency = 19;
- let NumMicroOps = 5;
- let ResourceCycles = [1,1,3];
-}
-def: InstRW<[SKXWriteResGroup212], (instregex "DPPSrmi")>;
-def: InstRW<[SKXWriteResGroup212], (instregex "VDPPSrmi")>;
-
-def SKXWriteResGroup213 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort015,SKXPort0156]> {
- let Latency = 19;
- let NumMicroOps = 9;
- let ResourceCycles = [4,3,1,1];
-}
-def: InstRW<[SKXWriteResGroup213], (instregex "PCMPESTRM128rr")>;
-def: InstRW<[SKXWriteResGroup213], (instregex "VPCMPESTRM128rr")>;
+def: InstRW<[SKXWriteResGroup211], (instregex "VPMULLQZ256rm(b?)",
+ "VPMULLQZrm(b?)")>;
def SKXWriteResGroup214 : SchedWriteRes<[]> {
let Latency = 20;
@@ -6048,26 +2281,14 @@ def SKXWriteResGroup215 : SchedWriteRes<[SKXPort0]> {
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup215], (instregex "DIV_FPrST0")>;
-def: InstRW<[SKXWriteResGroup215], (instregex "DIV_FST0r")>;
-def: InstRW<[SKXWriteResGroup215], (instregex "DIV_FrST0")>;
+def: InstRW<[SKXWriteResGroup215], (instregex "DIV_(FPrST0|FST0r|FrST0)")>;
-def SKXWriteResGroup216 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+def SKXWriteResGroup216 : SchedWriteRes<[SKXPort0,SKXPort23,SKXFPDivider]> {
let Latency = 20;
let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup216], (instregex "DIVPDrm")>;
-def: InstRW<[SKXWriteResGroup216], (instregex "VDIVPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup216], (instregex "VDIVPDrm")>;
-def: InstRW<[SKXWriteResGroup216], (instregex "VDIVSDZrm(_Int)?(k?)(z?)")>;
-
-def SKXWriteResGroup217 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
- let Latency = 20;
- let NumMicroOps = 5;
- let ResourceCycles = [1,1,3];
+ let ResourceCycles = [1,1,4];
}
-def: InstRW<[SKXWriteResGroup217], (instregex "VDPPSYrmi")>;
+def : SchedAlias<WriteFDiv64XLd, SKXWriteResGroup216>; // TODO - convert to ZnWriteResFpuPair
def SKXWriteResGroup218 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
let Latency = 20;
@@ -6084,40 +2305,28 @@ def SKXWriteResGroup219 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort6,SKXPort23,SK
let NumMicroOps = 8;
let ResourceCycles = [1,1,1,1,1,1,2];
}
-def: InstRW<[SKXWriteResGroup219], (instregex "INSB")>;
-def: InstRW<[SKXWriteResGroup219], (instregex "INSL")>;
-def: InstRW<[SKXWriteResGroup219], (instregex "INSW")>;
+def: InstRW<[SKXWriteResGroup219], (instrs INSB, INSL, INSW)>;
def SKXWriteResGroup220 : SchedWriteRes<[SKXPort5,SKXPort6,SKXPort0156]> {
let Latency = 20;
let NumMicroOps = 10;
let ResourceCycles = [1,2,7];
}
-def: InstRW<[SKXWriteResGroup220], (instregex "MWAITrr")>;
+def: InstRW<[SKXWriteResGroup220], (instrs MWAITrr)>;
-def SKXWriteResGroup221 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort015]> {
- let Latency = 20;
- let NumMicroOps = 11;
- let ResourceCycles = [3,6,2];
-}
-def: InstRW<[SKXWriteResGroup221], (instregex "AESKEYGENASSIST128rr")>;
-def: InstRW<[SKXWriteResGroup221], (instregex "VAESKEYGENASSIST128rr")>;
-
-def SKXWriteResGroup222 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+def SKXWriteResGroup222 : SchedWriteRes<[SKXPort0,SKXPort23,SKXFPDivider]> {
let Latency = 21;
let NumMicroOps = 2;
- let ResourceCycles = [1,1];
+ let ResourceCycles = [1,1,8];
}
-def: InstRW<[SKXWriteResGroup222], (instregex "VDIVPDYrm")>;
-def: InstRW<[SKXWriteResGroup222], (instregex "VDIVPDZ256rm(b?)(k?)(z?)")>;
+def : SchedAlias<WriteFDiv64YLd, SKXWriteResGroup222>; // TODO - convert to ZnWriteResFpuPair
def SKXWriteResGroup223 : SchedWriteRes<[SKXPort0,SKXPort23]> {
let Latency = 22;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup223], (instregex "DIV_F32m")>;
-def: InstRW<[SKXWriteResGroup223], (instregex "DIV_F64m")>;
+def: InstRW<[SKXWriteResGroup223], (instregex "DIV_F(32|64)m")>;
def SKXWriteResGroup224 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
let Latency = 22;
@@ -6176,72 +2385,22 @@ def SKXWriteResGroup225 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort015]> {
let NumMicroOps = 14;
let ResourceCycles = [5,5,4];
}
-def: InstRW<[SKXWriteResGroup225], (instregex "VPCONFLICTDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup225], (instregex "VPCONFLICTQZ256rr(b?)(k?)(z?)")>;
-
-def SKXWriteResGroup226 : SchedWriteRes<[SKXPort0,SKXPort23]> {
- let Latency = 23;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup226], (instregex "SQRTSDm")>;
-def: InstRW<[SKXWriteResGroup226], (instregex "VSQRTSDm")>;
-
-def SKXWriteResGroup227 : SchedWriteRes<[SKXPort0,SKXPort015]> {
- let Latency = 23;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[SKXWriteResGroup227], (instregex "VDIVPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup227], (instregex "VDIVPSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup225], (instregex "VPCONFLICTDZ128rr",
+ "VPCONFLICTQZ256rr")>;
def SKXWriteResGroup228 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> {
let Latency = 23;
let NumMicroOps = 19;
let ResourceCycles = [2,1,4,1,1,4,6];
}
-def: InstRW<[SKXWriteResGroup228], (instregex "CMPXCHG16B")>;
-
-def SKXWriteResGroup229 : SchedWriteRes<[SKXPort0,SKXPort23]> {
- let Latency = 24;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup229], (instregex "SQRTPDm")>;
-def: InstRW<[SKXWriteResGroup229], (instregex "VSQRTPDZ128m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup229], (instregex "VSQRTPDm")>;
-def: InstRW<[SKXWriteResGroup229], (instregex "VSQRTSDZm(_Int)?(k?)(z?)")>;
-
-def SKXWriteResGroup230 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015]> {
- let Latency = 24;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SKXWriteResGroup230], (instregex "VDIVPSZrm(b?)(k?)(z?)")>;
-
-def SKXWriteResGroup231 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23,SKXPort0156]> {
- let Latency = 24;
- let NumMicroOps = 9;
- let ResourceCycles = [4,3,1,1];
-}
-def: InstRW<[SKXWriteResGroup231], (instregex "PCMPESTRIrm")>;
-def: InstRW<[SKXWriteResGroup231], (instregex "VPCMPESTRIrm")>;
-
-def SKXWriteResGroup232 : SchedWriteRes<[SKXPort0,SKXPort23]> {
- let Latency = 25;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup232], (instregex "VSQRTPDYm")>;
-def: InstRW<[SKXWriteResGroup232], (instregex "VSQRTPDZ256m(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup228], (instrs CMPXCHG16B)>;
def SKXWriteResGroup233 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
let Latency = 25;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup233], (instregex "DIV_FI16m")>;
-def: InstRW<[SKXWriteResGroup233], (instregex "DIV_FI32m")>;
+def: InstRW<[SKXWriteResGroup233], (instregex "DIV_FI(16|32)m")>;
def SKXWriteResGroup234 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
let Latency = 25;
@@ -6254,29 +2413,6 @@ def: InstRW<[SKXWriteResGroup234], (instrs VGATHERDPDZ256rm,
VPGATHERQDZrm,
VPGATHERQQZ256rm)>;
-def SKXWriteResGroup235 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23,SKXPort015,SKXPort0156]> {
- let Latency = 25;
- let NumMicroOps = 10;
- let ResourceCycles = [4,3,1,1,1];
-}
-def: InstRW<[SKXWriteResGroup235], (instregex "PCMPESTRM128rm")>;
-def: InstRW<[SKXWriteResGroup235], (instregex "VPCMPESTRM128rm")>;
-
-def SKXWriteResGroup236 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23,SKXPort015]> {
- let Latency = 25;
- let NumMicroOps = 11;
- let ResourceCycles = [3,6,1,1];
-}
-def: InstRW<[SKXWriteResGroup236], (instregex "AESKEYGENASSIST128rm")>;
-def: InstRW<[SKXWriteResGroup236], (instregex "VAESKEYGENASSIST128rm")>;
-
-def SKXWriteResGroup237 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015]> {
- let Latency = 26;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SKXWriteResGroup237], (instregex "VSQRTPSZm(b?)(k?)(z?)")>;
-
def SKXWriteResGroup238 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
let Latency = 26;
let NumMicroOps = 5;
@@ -6292,8 +2428,7 @@ def SKXWriteResGroup239 : SchedWriteRes<[SKXPort0,SKXPort23]> {
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup239], (instregex "DIVR_F32m")>;
-def: InstRW<[SKXWriteResGroup239], (instregex "DIVR_F64m")>;
+def: InstRW<[SKXWriteResGroup239], (instregex "DIVR_F(32|64)m")>;
def SKXWriteResGroup240 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
let Latency = 27;
@@ -6308,30 +2443,21 @@ def SKXWriteResGroup241 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23,SKXPort0156
let NumMicroOps = 8;
let ResourceCycles = [2,4,1,1];
}
-def: InstRW<[SKXWriteResGroup241], (instregex "IDIV(16|32|64)m")>;
-def: InstRW<[SKXWriteResGroup241], (instregex "IDIV8m")>;
+def: InstRW<[SKXWriteResGroup241], (instregex "IDIV(8|16|32|64)m")>;
def SKXWriteResGroup242 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23,SKXPort015]> {
let Latency = 29;
let NumMicroOps = 15;
let ResourceCycles = [5,5,1,4];
}
-def: InstRW<[SKXWriteResGroup242], (instregex "VPCONFLICTQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup242], (instregex "VPCONFLICTQZ256rm(b?)")>;
def SKXWriteResGroup243 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
let Latency = 30;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup243], (instregex "DIVR_FI16m")>;
-def: InstRW<[SKXWriteResGroup243], (instregex "DIVR_FI32m")>;
-
-def SKXWriteResGroup244 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015]> {
- let Latency = 30;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SKXWriteResGroup244], (instregex "VDIVPDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup243], (instregex "DIVR_FI(16|32)m")>;
def SKXWriteResGroup245 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
let Latency = 30;
@@ -6341,40 +2467,29 @@ def SKXWriteResGroup245 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort01
def: InstRW<[SKXWriteResGroup245], (instrs VGATHERDPSZrm,
VPGATHERDDZrm)>;
-def SKXWriteResGroup246 : SchedWriteRes<[SKXPort0,SKXPort015]> {
- let Latency = 31;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[SKXWriteResGroup246], (instregex "VSQRTPDZr(b?)(k?)(z?)")>;
-
def SKXWriteResGroup247 : SchedWriteRes<[SKXPort5,SKXPort6,SKXPort23,SKXPort06,SKXPort0156]> {
let Latency = 35;
let NumMicroOps = 23;
let ResourceCycles = [1,5,3,4,10];
}
-def: InstRW<[SKXWriteResGroup247], (instregex "IN(16|32)ri")>;
-def: InstRW<[SKXWriteResGroup247], (instregex "IN(16|32)rr")>;
-def: InstRW<[SKXWriteResGroup247], (instregex "IN8ri")>;
-def: InstRW<[SKXWriteResGroup247], (instregex "IN8rr")>;
+def: InstRW<[SKXWriteResGroup247], (instregex "IN(8|16|32)ri",
+ "IN(8|16|32)rr")>;
def SKXWriteResGroup248 : SchedWriteRes<[SKXPort5,SKXPort6,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> {
let Latency = 35;
let NumMicroOps = 23;
let ResourceCycles = [1,5,2,1,4,10];
}
-def: InstRW<[SKXWriteResGroup248], (instregex "OUT(16|32)ir")>;
-def: InstRW<[SKXWriteResGroup248], (instregex "OUT(16|32)rr")>;
-def: InstRW<[SKXWriteResGroup248], (instregex "OUT8ir")>;
-def: InstRW<[SKXWriteResGroup248], (instregex "OUT8rr")>;
+def: InstRW<[SKXWriteResGroup248], (instregex "OUT(8|16|32)ir",
+ "OUT(8|16|32)rr")>;
def SKXWriteResGroup249 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort015]> {
let Latency = 37;
let NumMicroOps = 21;
let ResourceCycles = [9,7,5];
}
-def: InstRW<[SKXWriteResGroup249], (instregex "VPCONFLICTDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup249], (instregex "VPCONFLICTQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup249], (instregex "VPCONFLICTDZ256rr",
+ "VPCONFLICTQZrr")>;
def SKXWriteResGroup250 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort23,SKXPort0156]> {
let Latency = 37;
@@ -6383,91 +2498,84 @@ def SKXWriteResGroup250 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort23,SKXPort0156
}
def: InstRW<[SKXWriteResGroup250], (instregex "XRSTOR(64)?")>;
-def SKXWriteResGroup251 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015]> {
- let Latency = 38;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SKXWriteResGroup251], (instregex "VSQRTPDZm(b?)(k?)(z?)")>;
-
def SKXWriteResGroup252 : SchedWriteRes<[SKXPort1,SKXPort4,SKXPort5,SKXPort6,SKXPort23,SKXPort237,SKXPort15,SKXPort0156]> {
let Latency = 40;
let NumMicroOps = 18;
let ResourceCycles = [1,1,2,3,1,1,1,8];
}
-def: InstRW<[SKXWriteResGroup252], (instregex "VMCLEARm")>;
+def: InstRW<[SKXWriteResGroup252], (instrs VMCLEARm)>;
def SKXWriteResGroup253 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort23,SKXPort237,SKXPort0156]> {
let Latency = 41;
let NumMicroOps = 39;
let ResourceCycles = [1,10,1,1,26];
}
-def: InstRW<[SKXWriteResGroup253], (instregex "XSAVE64")>;
+def: InstRW<[SKXWriteResGroup253], (instrs XSAVE64)>;
def SKXWriteResGroup254 : SchedWriteRes<[SKXPort5,SKXPort0156]> {
let Latency = 42;
let NumMicroOps = 22;
let ResourceCycles = [2,20];
}
-def: InstRW<[SKXWriteResGroup254], (instregex "RDTSCP")>;
+def: InstRW<[SKXWriteResGroup254], (instrs RDTSCP)>;
def SKXWriteResGroup255 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort23,SKXPort237,SKXPort0156]> {
let Latency = 42;
let NumMicroOps = 40;
let ResourceCycles = [1,11,1,1,26];
}
-def: InstRW<[SKXWriteResGroup255], (instregex "XSAVE")>;
+def: InstRW<[SKXWriteResGroup255], (instrs XSAVE)>;
+def: InstRW<[SKXWriteResGroup255], (instregex "XSAVEC", "XSAVES", "XSAVEOPT")>;
def SKXWriteResGroup256 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23,SKXPort015]> {
let Latency = 44;
let NumMicroOps = 22;
let ResourceCycles = [9,7,1,5];
}
-def: InstRW<[SKXWriteResGroup256], (instregex "VPCONFLICTDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup256], (instregex "VPCONFLICTQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup256], (instregex "VPCONFLICTDZ256rm(b?)",
+ "VPCONFLICTQZrm(b?)")>;
def SKXWriteResGroup258 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort05,SKXPort06,SKXPort0156]> {
let Latency = 62;
let NumMicroOps = 64;
let ResourceCycles = [2,8,5,10,39];
}
-def: InstRW<[SKXWriteResGroup258], (instregex "FLDENVm")>;
-def: InstRW<[SKXWriteResGroup258], (instregex "FLDENVm")>;
+def: InstRW<[SKXWriteResGroup258], (instrs FLDENVm)>;
def SKXWriteResGroup259 : SchedWriteRes<[SKXPort0,SKXPort6,SKXPort23,SKXPort05,SKXPort06,SKXPort15,SKXPort0156]> {
let Latency = 63;
let NumMicroOps = 88;
let ResourceCycles = [4,4,31,1,2,1,45];
}
-def: InstRW<[SKXWriteResGroup259], (instregex "FXRSTOR64")>;
+def: InstRW<[SKXWriteResGroup259], (instrs FXRSTOR64)>;
def SKXWriteResGroup260 : SchedWriteRes<[SKXPort0,SKXPort6,SKXPort23,SKXPort05,SKXPort06,SKXPort15,SKXPort0156]> {
let Latency = 63;
let NumMicroOps = 90;
let ResourceCycles = [4,2,33,1,2,1,47];
}
-def: InstRW<[SKXWriteResGroup260], (instregex "FXRSTOR")>;
+def: InstRW<[SKXWriteResGroup260], (instrs FXRSTOR)>;
def SKXWriteResGroup261 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort015]> {
let Latency = 67;
let NumMicroOps = 35;
let ResourceCycles = [17,11,7];
}
-def: InstRW<[SKXWriteResGroup261], (instregex "VPCONFLICTDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup261], (instregex "VPCONFLICTDZrr")>;
def SKXWriteResGroup262 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23,SKXPort015]> {
let Latency = 74;
let NumMicroOps = 36;
let ResourceCycles = [17,11,1,7];
}
-def: InstRW<[SKXWriteResGroup262], (instregex "VPCONFLICTDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup262], (instregex "VPCONFLICTDZrm(b?)")>;
def SKXWriteResGroup263 : SchedWriteRes<[SKXPort5,SKXPort05,SKXPort0156]> {
let Latency = 75;
let NumMicroOps = 15;
let ResourceCycles = [6,3,6];
}
-def: InstRW<[SKXWriteResGroup263], (instregex "FNINIT")>;
+def: InstRW<[SKXWriteResGroup263], (instrs FNINIT)>;
def SKXWriteResGroup264 : SchedWriteRes<[SKXPort0,SKXPort1,SKXPort5,SKXPort6,SKXPort05,SKXPort0156]> {
let Latency = 76;
@@ -6488,13 +2596,15 @@ def SKXWriteResGroup266 : SchedWriteRes<[SKXPort0,SKXPort1,SKXPort4,SKXPort5,SKX
let NumMicroOps = 100;
let ResourceCycles = [9,1,11,16,1,11,21,30];
}
-def: InstRW<[SKXWriteResGroup266], (instregex "FSTENVm")>;
-def: InstRW<[SKXWriteResGroup266], (instregex "FSTENVm")>;
+def: InstRW<[SKXWriteResGroup266], (instrs FSTENVm)>;
def SKXWriteResGroup267 : SchedWriteRes<[SKXPort6,SKXPort0156]> {
let Latency = 140;
let NumMicroOps = 4;
let ResourceCycles = [1,3];
}
-def: InstRW<[SKXWriteResGroup267], (instregex "PAUSE")>;
+def: InstRW<[SKXWriteResGroup267], (instrs PAUSE)>;
+
+def: InstRW<[WriteZero], (instrs CLC)>;
+
} // SchedModel
diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td
index 2e21a97541b2..d0167753ccd4 100644
--- a/lib/Target/X86/X86Schedule.td
+++ b/lib/Target/X86/X86Schedule.td
@@ -7,9 +7,8 @@
//
//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
// InstrSchedModel annotations for out-of-order CPUs.
-//
-// These annotations are independent of the itinerary classes defined below.
// Instructions with folded loads need to read the memory operand immediately,
// but other register operands don't have to be read until the load is ready.
@@ -20,6 +19,17 @@ def ReadAfterLd : SchedRead;
// load + WriteRMW.
def WriteRMW : SchedWrite;
+// Helper to set SchedWrite ExePorts/Latency/ResourceCycles/NumMicroOps.
+multiclass X86WriteRes<SchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res, int UOps> {
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
+}
+
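// Illustrative only (values and port names are hypothetical; the ports shown
// follow the SkylakeServer naming): X86WriteRes lets a CPU model bind a
// SchedWrite to its execution ports, latency, resource cycles and uop count
// in a single line, e.g.
//   defm : X86WriteRes<WriteStore, [SKXPort237, SKXPort4], 1, [1,1], 2>;
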
// Most instructions can fold loads, so almost every SchedWrite comes in two
// variants: With and without a folded load.
// An X86FoldableSchedWrite holds a reference to the corresponding SchedWrite
@@ -39,20 +49,108 @@ multiclass X86SchedWritePair {
}
}
+// Helpers to mark SchedWrites as unsupported.
+multiclass X86WriteResUnsupported<SchedWrite SchedRW> {
+ let Unsupported = 1 in {
+ def : WriteRes<SchedRW, []>;
+ }
+}
+multiclass X86WriteResPairUnsupported<X86FoldableSchedWrite SchedRW> {
+ let Unsupported = 1 in {
+ def : WriteRes<SchedRW, []>;
+ def : WriteRes<SchedRW.Folded, []>;
+ }
+}
+
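// Illustrative only: a model for a target without AVX-512 could mark the ZMM
// write classes defined further down as unsupported, e.g.
//   defm : X86WriteResPairUnsupported<WriteFAddZ>;
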
+// Multiclass that wraps X86FoldableSchedWrite for each vector width.
+class X86SchedWriteWidths<X86FoldableSchedWrite sScl,
+ X86FoldableSchedWrite s128,
+ X86FoldableSchedWrite s256,
+ X86FoldableSchedWrite s512> {
+ X86FoldableSchedWrite Scl = sScl; // Scalar float/double operations.
+ X86FoldableSchedWrite MMX = sScl; // MMX operations.
+ X86FoldableSchedWrite XMM = s128; // XMM operations.
+ X86FoldableSchedWrite YMM = s256; // YMM operations.
+ X86FoldableSchedWrite ZMM = s512; // ZMM operations.
+}
+
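// Illustrative only (hypothetical wrapper name): one X86SchedWriteWidths
// instance is expected per operation family, grouping its scalar, XMM, YMM
// and ZMM writes, e.g.
//   def SchedWriteFAdd
//       : X86SchedWriteWidths<WriteFAdd, WriteFAddX, WriteFAddY, WriteFAddZ>;
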
+// Multiclass that wraps X86SchedWriteWidths for each fp vector type.
+class X86SchedWriteSizes<X86SchedWriteWidths sPS,
+ X86SchedWriteWidths sPD> {
+ X86SchedWriteWidths PS = sPS;
+ X86SchedWriteWidths PD = sPD;
+}
+
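// Illustrative only (hypothetical names): X86SchedWriteSizes pairs a
// single-precision widths wrapper with its double-precision counterpart, e.g.
//   def SchedWriteFAddSizes
//       : X86SchedWriteSizes<SchedWriteFAdd, SchedWriteFAdd64>;
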
+// Multiclass that wraps move/load/store triple for a vector width.
+class X86SchedWriteMoveLS<SchedWrite MoveRR,
+ SchedWrite LoadRM,
+ SchedWrite StoreMR> {
+ SchedWrite RR = MoveRR;
+ SchedWrite RM = LoadRM;
+ SchedWrite MR = StoreMR;
+}
+
+// Multiclass that wraps X86SchedWriteMoveLS for each vector width.
+class X86SchedWriteMoveLSWidths<X86SchedWriteMoveLS sScl,
+ X86SchedWriteMoveLS s128,
+ X86SchedWriteMoveLS s256,
+ X86SchedWriteMoveLS s512> {
+ X86SchedWriteMoveLS Scl = sScl; // Scalar float/double operations.
+ X86SchedWriteMoveLS MMX = sScl; // MMX operations.
+ X86SchedWriteMoveLS XMM = s128; // XMM operations.
+ X86SchedWriteMoveLS YMM = s256; // YMM operations.
+ X86SchedWriteMoveLS ZMM = s512; // ZMM operations.
+}
+
+// Loads, stores, and moves, not folded with other operations.
+def WriteLoad : SchedWrite;
+def WriteStore : SchedWrite;
+def WriteStoreNT : SchedWrite;
+def WriteMove : SchedWrite;
+
// Arithmetic.
-defm WriteALU : X86SchedWritePair; // Simple integer ALU op.
-defm WriteIMul : X86SchedWritePair; // Integer multiplication.
-def WriteIMulH : SchedWrite; // Integer multiplication, high part.
-defm WriteIDiv : X86SchedWritePair; // Integer division.
-def WriteLEA : SchedWrite; // LEA instructions can't fold loads.
+defm WriteALU : X86SchedWritePair; // Simple integer ALU op.
+defm WriteADC : X86SchedWritePair; // Integer ALU + flags op.
+def WriteALURMW : WriteSequence<[WriteALULd, WriteStore]>;
+def WriteADCRMW : WriteSequence<[WriteADCLd, WriteStore]>;
+defm WriteIMul : X86SchedWritePair; // Integer multiplication.
+defm WriteIMul64 : X86SchedWritePair; // Integer 64-bit multiplication.
+def WriteIMulH : SchedWrite; // Integer multiplication, high part.
+def WriteLEA : SchedWrite; // LEA instructions can't fold loads.
+
+defm WriteBSWAP32: X86SchedWritePair; // Byte Order (Endianness) Swap
+defm WriteBSWAP64: X86SchedWritePair; // Byte Order (Endianness) Swap
+
+// Integer division.
+defm WriteDiv8 : X86SchedWritePair;
+defm WriteDiv16 : X86SchedWritePair;
+defm WriteDiv32 : X86SchedWritePair;
+defm WriteDiv64 : X86SchedWritePair;
+defm WriteIDiv8 : X86SchedWritePair;
+defm WriteIDiv16 : X86SchedWritePair;
+defm WriteIDiv32 : X86SchedWritePair;
+defm WriteIDiv64 : X86SchedWritePair;
+
+defm WriteBSF : X86SchedWritePair; // Bit scan forward.
+defm WriteBSR : X86SchedWritePair; // Bit scan reverse.
+defm WritePOPCNT : X86SchedWritePair; // Bit population count.
+defm WriteLZCNT : X86SchedWritePair; // Leading zero count.
+defm WriteTZCNT : X86SchedWritePair; // Trailing zero count.
+defm WriteCMOV : X86SchedWritePair; // Conditional move.
+defm WriteCMOV2 : X86SchedWritePair; // Conditional (CF + ZF flag) move.
+def WriteFCMOV : SchedWrite; // X87 conditional move.
+def WriteSETCC : SchedWrite; // Set register based on condition code.
+def WriteSETCCStore : SchedWrite;
+def WriteLAHFSAHF : SchedWrite; // Load/Store flags in AH.
// Integer shifts and rotates.
defm WriteShift : X86SchedWritePair;
+// Double shift instructions.
+defm WriteShiftDouble : X86SchedWritePair;
-// Loads, stores, and moves, not folded with other operations.
-def WriteLoad : SchedWrite;
-def WriteStore : SchedWrite;
-def WriteMove : SchedWrite;
+// BMI1 BEXTR, BMI2 BZHI
+defm WriteBEXTR : X86SchedWritePair;
+defm WriteBZHI : X86SchedWritePair;
// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
@@ -63,41 +161,244 @@ def WriteZero : SchedWrite;
defm WriteJump : X86SchedWritePair;
// Floating point. This covers both scalar and vector operations.
-defm WriteFAdd : X86SchedWritePair; // Floating point add/sub/compare.
-defm WriteFMul : X86SchedWritePair; // Floating point multiplication.
-defm WriteFDiv : X86SchedWritePair; // Floating point division.
+def WriteFLD0 : SchedWrite;
+def WriteFLD1 : SchedWrite;
+def WriteFLDC : SchedWrite;
+def WriteFLoad : SchedWrite;
+def WriteFLoadX : SchedWrite;
+def WriteFLoadY : SchedWrite;
+def WriteFMaskedLoad : SchedWrite;
+def WriteFMaskedLoadY : SchedWrite;
+def WriteFStore : SchedWrite;
+def WriteFStoreX : SchedWrite;
+def WriteFStoreY : SchedWrite;
+def WriteFStoreNT : SchedWrite;
+def WriteFStoreNTX : SchedWrite;
+def WriteFStoreNTY : SchedWrite;
+def WriteFMaskedStore : SchedWrite;
+def WriteFMaskedStoreY : SchedWrite;
+def WriteFMove : SchedWrite;
+def WriteFMoveX : SchedWrite;
+def WriteFMoveY : SchedWrite;
+
+defm WriteFAdd : X86SchedWritePair; // Floating point add/sub.
+defm WriteFAddX : X86SchedWritePair; // Floating point add/sub (XMM).
+defm WriteFAddY : X86SchedWritePair; // Floating point add/sub (YMM).
+defm WriteFAddZ : X86SchedWritePair; // Floating point add/sub (ZMM).
+defm WriteFAdd64 : X86SchedWritePair; // Floating point double add/sub.
+defm WriteFAdd64X : X86SchedWritePair; // Floating point double add/sub (XMM).
+defm WriteFAdd64Y : X86SchedWritePair; // Floating point double add/sub (YMM).
+defm WriteFAdd64Z : X86SchedWritePair; // Floating point double add/sub (ZMM).
+defm WriteFCmp : X86SchedWritePair; // Floating point compare.
+defm WriteFCmpX : X86SchedWritePair; // Floating point compare (XMM).
+defm WriteFCmpY : X86SchedWritePair; // Floating point compare (YMM).
+defm WriteFCmpZ : X86SchedWritePair; // Floating point compare (ZMM).
+defm WriteFCmp64 : X86SchedWritePair; // Floating point double compare.
+defm WriteFCmp64X : X86SchedWritePair; // Floating point double compare (XMM).
+defm WriteFCmp64Y : X86SchedWritePair; // Floating point double compare (YMM).
+defm WriteFCmp64Z : X86SchedWritePair; // Floating point double compare (ZMM).
+defm WriteFCom : X86SchedWritePair; // Floating point compare to flags.
+defm WriteFMul : X86SchedWritePair; // Floating point multiplication.
+defm WriteFMulX : X86SchedWritePair; // Floating point multiplication (XMM).
+defm WriteFMulY : X86SchedWritePair; // Floating point multiplication (YMM).
+defm WriteFMulZ : X86SchedWritePair; // Floating point multiplication (ZMM).
+defm WriteFMul64 : X86SchedWritePair; // Floating point double multiplication.
+defm WriteFMul64X : X86SchedWritePair; // Floating point double multiplication (XMM).
+defm WriteFMul64Y : X86SchedWritePair; // Floating point double multiplication (YMM).
+defm WriteFMul64Z : X86SchedWritePair; // Floating point double multiplication (ZMM).
+defm WriteFDiv : X86SchedWritePair; // Floating point division.
+defm WriteFDivX : X86SchedWritePair; // Floating point division (XMM).
+defm WriteFDivY : X86SchedWritePair; // Floating point division (YMM).
+defm WriteFDivZ : X86SchedWritePair; // Floating point division (ZMM).
+defm WriteFDiv64 : X86SchedWritePair; // Floating point double division.
+defm WriteFDiv64X : X86SchedWritePair; // Floating point double division (XMM).
+defm WriteFDiv64Y : X86SchedWritePair; // Floating point double division (YMM).
+defm WriteFDiv64Z : X86SchedWritePair; // Floating point double division (ZMM).
defm WriteFSqrt : X86SchedWritePair; // Floating point square root.
+defm WriteFSqrtX : X86SchedWritePair; // Floating point square root (XMM).
+defm WriteFSqrtY : X86SchedWritePair; // Floating point square root (YMM).
+defm WriteFSqrtZ : X86SchedWritePair; // Floating point square root (ZMM).
+defm WriteFSqrt64 : X86SchedWritePair; // Floating point double square root.
+defm WriteFSqrt64X : X86SchedWritePair; // Floating point double square root (XMM).
+defm WriteFSqrt64Y : X86SchedWritePair; // Floating point double square root (YMM).
+defm WriteFSqrt64Z : X86SchedWritePair; // Floating point double square root (ZMM).
+defm WriteFSqrt80 : X86SchedWritePair; // Floating point long double square root.
defm WriteFRcp : X86SchedWritePair; // Floating point reciprocal estimate.
+defm WriteFRcpX : X86SchedWritePair; // Floating point reciprocal estimate (XMM).
+defm WriteFRcpY : X86SchedWritePair; // Floating point reciprocal estimate (YMM).
+defm WriteFRcpZ : X86SchedWritePair; // Floating point reciprocal estimate (ZMM).
defm WriteFRsqrt : X86SchedWritePair; // Floating point reciprocal square root estimate.
+defm WriteFRsqrtX: X86SchedWritePair; // Floating point reciprocal square root estimate (XMM).
+defm WriteFRsqrtY: X86SchedWritePair; // Floating point reciprocal square root estimate (YMM).
+defm WriteFRsqrtZ: X86SchedWritePair; // Floating point reciprocal square root estimate (ZMM).
defm WriteFMA : X86SchedWritePair; // Fused Multiply Add.
+defm WriteFMAX : X86SchedWritePair; // Fused Multiply Add (XMM).
+defm WriteFMAY : X86SchedWritePair; // Fused Multiply Add (YMM).
+defm WriteFMAZ : X86SchedWritePair; // Fused Multiply Add (ZMM).
+defm WriteDPPD : X86SchedWritePair; // Floating point double dot product.
+defm WriteDPPS : X86SchedWritePair; // Floating point single dot product.
+defm WriteDPPSY : X86SchedWritePair; // Floating point single dot product (YMM).
+defm WriteDPPSZ : X86SchedWritePair; // Floating point single dot product (ZMM).
+defm WriteFSign : X86SchedWritePair; // Floating point fabs/fchs.
+defm WriteFRnd : X86SchedWritePair; // Floating point rounding.
+defm WriteFRndY : X86SchedWritePair; // Floating point rounding (YMM).
+defm WriteFRndZ : X86SchedWritePair; // Floating point rounding (ZMM).
+defm WriteFLogic : X86SchedWritePair; // Floating point and/or/xor logicals.
+defm WriteFLogicY : X86SchedWritePair; // Floating point and/or/xor logicals (YMM).
+defm WriteFLogicZ : X86SchedWritePair; // Floating point and/or/xor logicals (ZMM).
+defm WriteFTest : X86SchedWritePair; // Floating point TEST instructions.
+defm WriteFTestY : X86SchedWritePair; // Floating point TEST instructions (YMM).
+defm WriteFTestZ : X86SchedWritePair; // Floating point TEST instructions (ZMM).
defm WriteFShuffle : X86SchedWritePair; // Floating point vector shuffles.
+defm WriteFShuffleY : X86SchedWritePair; // Floating point vector shuffles (YMM).
+defm WriteFShuffleZ : X86SchedWritePair; // Floating point vector shuffles (ZMM).
+defm WriteFVarShuffle : X86SchedWritePair; // Floating point vector variable shuffles.
+defm WriteFVarShuffleY : X86SchedWritePair; // Floating point vector variable shuffles (YMM).
+defm WriteFVarShuffleZ : X86SchedWritePair; // Floating point vector variable shuffles (ZMM).
defm WriteFBlend : X86SchedWritePair; // Floating point vector blends.
+defm WriteFBlendY : X86SchedWritePair; // Floating point vector blends (YMM).
+defm WriteFBlendZ : X86SchedWritePair; // Floating point vector blends (ZMM).
defm WriteFVarBlend : X86SchedWritePair; // Fp vector variable blends.
+defm WriteFVarBlendY : X86SchedWritePair; // Fp vector variable blends (YMM).
+defm WriteFVarBlendZ : X86SchedWritePair; // Fp vector variable blends (ZMM).
// FMA Scheduling helper class.
class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
// Horizontal Add/Sub (float and integer)
defm WriteFHAdd : X86SchedWritePair;
-defm WritePHAdd : X86SchedWritePair;
+defm WriteFHAddY : X86SchedWritePair;
+defm WriteFHAddZ : X86SchedWritePair;
+defm WritePHAdd : X86SchedWritePair;
+defm WritePHAddX : X86SchedWritePair;
+defm WritePHAddY : X86SchedWritePair;
+defm WritePHAddZ : X86SchedWritePair;
// Vector integer operations.
-defm WriteVecALU : X86SchedWritePair; // Vector integer ALU op, no logicals.
-defm WriteVecShift : X86SchedWritePair; // Vector integer shifts.
-defm WriteVecIMul : X86SchedWritePair; // Vector integer multiply.
+def WriteVecLoad : SchedWrite;
+def WriteVecLoadX : SchedWrite;
+def WriteVecLoadY : SchedWrite;
+def WriteVecLoadNT : SchedWrite;
+def WriteVecLoadNTY : SchedWrite;
+def WriteVecMaskedLoad : SchedWrite;
+def WriteVecMaskedLoadY : SchedWrite;
+def WriteVecStore : SchedWrite;
+def WriteVecStoreX : SchedWrite;
+def WriteVecStoreY : SchedWrite;
+def WriteVecStoreNT : SchedWrite;
+def WriteVecStoreNTY : SchedWrite;
+def WriteVecMaskedStore : SchedWrite;
+def WriteVecMaskedStoreY : SchedWrite;
+def WriteVecMove : SchedWrite;
+def WriteVecMoveX : SchedWrite;
+def WriteVecMoveY : SchedWrite;
+def WriteVecMoveToGpr : SchedWrite;
+def WriteVecMoveFromGpr : SchedWrite;
+
+defm WriteVecALU : X86SchedWritePair; // Vector integer ALU op, no logicals.
+defm WriteVecALUX : X86SchedWritePair; // Vector integer ALU op, no logicals (XMM).
+defm WriteVecALUY : X86SchedWritePair; // Vector integer ALU op, no logicals (YMM).
+defm WriteVecALUZ : X86SchedWritePair; // Vector integer ALU op, no logicals (ZMM).
+defm WriteVecLogic : X86SchedWritePair; // Vector integer and/or/xor logicals.
+defm WriteVecLogicX : X86SchedWritePair; // Vector integer and/or/xor logicals (XMM).
+defm WriteVecLogicY : X86SchedWritePair; // Vector integer and/or/xor logicals (YMM).
+defm WriteVecLogicZ : X86SchedWritePair; // Vector integer and/or/xor logicals (ZMM).
+defm WriteVecTest : X86SchedWritePair; // Vector integer TEST instructions.
+defm WriteVecTestY : X86SchedWritePair; // Vector integer TEST instructions (YMM).
+defm WriteVecTestZ : X86SchedWritePair; // Vector integer TEST instructions (ZMM).
+defm WriteVecShift : X86SchedWritePair; // Vector integer shifts (default).
+defm WriteVecShiftX : X86SchedWritePair; // Vector integer shifts (XMM).
+defm WriteVecShiftY : X86SchedWritePair; // Vector integer shifts (YMM).
+defm WriteVecShiftZ : X86SchedWritePair; // Vector integer shifts (ZMM).
+defm WriteVecShiftImm : X86SchedWritePair; // Vector integer immediate shifts (default).
+defm WriteVecShiftImmX: X86SchedWritePair; // Vector integer immediate shifts (XMM).
+defm WriteVecShiftImmY: X86SchedWritePair; // Vector integer immediate shifts (YMM).
+defm WriteVecShiftImmZ: X86SchedWritePair; // Vector integer immediate shifts (ZMM).
+defm WriteVecIMul : X86SchedWritePair; // Vector integer multiply (default).
+defm WriteVecIMulX : X86SchedWritePair; // Vector integer multiply (XMM).
+defm WriteVecIMulY : X86SchedWritePair; // Vector integer multiply (YMM).
+defm WriteVecIMulZ : X86SchedWritePair; // Vector integer multiply (ZMM).
+defm WritePMULLD : X86SchedWritePair; // Vector PMULLD.
+defm WritePMULLDY : X86SchedWritePair; // Vector PMULLD (YMM).
+defm WritePMULLDZ : X86SchedWritePair; // Vector PMULLD (ZMM).
defm WriteShuffle : X86SchedWritePair; // Vector shuffles.
+defm WriteShuffleX : X86SchedWritePair; // Vector shuffles (XMM).
+defm WriteShuffleY : X86SchedWritePair; // Vector shuffles (YMM).
+defm WriteShuffleZ : X86SchedWritePair; // Vector shuffles (ZMM).
+defm WriteVarShuffle : X86SchedWritePair; // Vector variable shuffles.
+defm WriteVarShuffleX : X86SchedWritePair; // Vector variable shuffles (XMM).
+defm WriteVarShuffleY : X86SchedWritePair; // Vector variable shuffles (YMM).
+defm WriteVarShuffleZ : X86SchedWritePair; // Vector variable shuffles (ZMM).
defm WriteBlend : X86SchedWritePair; // Vector blends.
+defm WriteBlendY : X86SchedWritePair; // Vector blends (YMM).
+defm WriteBlendZ : X86SchedWritePair; // Vector blends (ZMM).
defm WriteVarBlend : X86SchedWritePair; // Vector variable blends.
-defm WriteMPSAD : X86SchedWritePair; // Vector MPSAD.
-
-// Vector bitwise operations.
-// These are often used on both floating point and integer vectors.
-defm WriteVecLogic : X86SchedWritePair; // Vector and/or/xor.
+defm WriteVarBlendY : X86SchedWritePair; // Vector variable blends (YMM).
+defm WriteVarBlendZ : X86SchedWritePair; // Vector variable blends (ZMM).
+defm WritePSADBW : X86SchedWritePair; // Vector PSADBW.
+defm WritePSADBWX : X86SchedWritePair; // Vector PSADBW (XMM).
+defm WritePSADBWY : X86SchedWritePair; // Vector PSADBW (YMM).
+defm WritePSADBWZ : X86SchedWritePair; // Vector PSADBW (ZMM).
+defm WriteMPSAD : X86SchedWritePair; // Vector MPSAD.
+defm WriteMPSADY : X86SchedWritePair; // Vector MPSAD (YMM).
+defm WriteMPSADZ : X86SchedWritePair; // Vector MPSAD (ZMM).
+defm WritePHMINPOS : X86SchedWritePair; // Vector PHMINPOS.
+
+// Vector insert/extract operations.
+defm WriteVecInsert : X86SchedWritePair; // Insert gpr to vector element.
+def WriteVecExtract : SchedWrite; // Extract vector element to gpr.
+def WriteVecExtractSt : SchedWrite; // Extract vector element and store.
+
+// MOVMSK operations.
+def WriteFMOVMSK : SchedWrite;
+def WriteVecMOVMSK : SchedWrite;
+def WriteVecMOVMSKY : SchedWrite;
+def WriteMMXMOVMSK : SchedWrite;
// Conversion between integer and float.
-defm WriteCvtF2I : X86SchedWritePair; // Float -> Integer.
-defm WriteCvtI2F : X86SchedWritePair; // Integer -> Float.
-defm WriteCvtF2F : X86SchedWritePair; // Float -> Float size conversion.
+defm WriteCvtSD2I : X86SchedWritePair; // Double -> Integer.
+defm WriteCvtPD2I : X86SchedWritePair; // Double -> Integer (XMM).
+defm WriteCvtPD2IY : X86SchedWritePair; // Double -> Integer (YMM).
+defm WriteCvtPD2IZ : X86SchedWritePair; // Double -> Integer (ZMM).
+
+defm WriteCvtSS2I : X86SchedWritePair; // Float -> Integer.
+defm WriteCvtPS2I : X86SchedWritePair; // Float -> Integer (XMM).
+defm WriteCvtPS2IY : X86SchedWritePair; // Float -> Integer (YMM).
+defm WriteCvtPS2IZ : X86SchedWritePair; // Float -> Integer (ZMM).
+
+defm WriteCvtI2SD : X86SchedWritePair; // Integer -> Double.
+defm WriteCvtI2PD : X86SchedWritePair; // Integer -> Double (XMM).
+defm WriteCvtI2PDY : X86SchedWritePair; // Integer -> Double (YMM).
+defm WriteCvtI2PDZ : X86SchedWritePair; // Integer -> Double (ZMM).
+
+defm WriteCvtI2SS : X86SchedWritePair; // Integer -> Float.
+defm WriteCvtI2PS : X86SchedWritePair; // Integer -> Float (XMM).
+defm WriteCvtI2PSY : X86SchedWritePair; // Integer -> Float (YMM).
+defm WriteCvtI2PSZ : X86SchedWritePair; // Integer -> Float (ZMM).
+
+defm WriteCvtSS2SD : X86SchedWritePair; // Float -> Double size conversion.
+defm WriteCvtPS2PD : X86SchedWritePair; // Float -> Double size conversion (XMM).
+defm WriteCvtPS2PDY : X86SchedWritePair; // Float -> Double size conversion (YMM).
+defm WriteCvtPS2PDZ : X86SchedWritePair; // Float -> Double size conversion (ZMM).
+
+defm WriteCvtSD2SS : X86SchedWritePair; // Double -> Float size conversion.
+defm WriteCvtPD2PS : X86SchedWritePair; // Double -> Float size conversion (XMM).
+defm WriteCvtPD2PSY : X86SchedWritePair; // Double -> Float size conversion (YMM).
+defm WriteCvtPD2PSZ : X86SchedWritePair; // Double -> Float size conversion (ZMM).
+
+defm WriteCvtPH2PS : X86SchedWritePair; // Half -> Float size conversion.
+defm WriteCvtPH2PSY : X86SchedWritePair; // Half -> Float size conversion (YMM).
+defm WriteCvtPH2PSZ : X86SchedWritePair; // Half -> Float size conversion (ZMM).
+
+def WriteCvtPS2PH : SchedWrite; // Float -> Half size conversion.
+def WriteCvtPS2PHY : SchedWrite; // Float -> Half size conversion (YMM).
+def WriteCvtPS2PHZ : SchedWrite; // Float -> Half size conversion (ZMM).
+def WriteCvtPS2PHSt : SchedWrite; // Float -> Half + store size conversion.
+def WriteCvtPS2PHYSt : SchedWrite; // Float -> Half + store size conversion (YMM).
+def WriteCvtPS2PHZSt : SchedWrite; // Float -> Half + store size conversion (ZMM).
+
+// CRC32 instruction.
+defm WriteCRC32 : X86SchedWritePair;
// Strings instructions.
// Packed Compare Implicit Length Strings, Return Mask
@@ -117,13 +418,24 @@ defm WriteAESKeyGen : X86SchedWritePair; // Key Generation.
// Carry-less multiplication instructions.
defm WriteCLMul : X86SchedWritePair;
+// EMMS/FEMMS
+def WriteEMMS : SchedWrite;
+
+// Load/store MXCSR
+def WriteLDMXCSR : SchedWrite;
+def WriteSTMXCSR : SchedWrite;
+
// Catch-all for expensive system instructions.
def WriteSystem : SchedWrite;
// AVX2.
defm WriteFShuffle256 : X86SchedWritePair; // Fp 256-bit width vector shuffles.
+defm WriteFVarShuffle256 : X86SchedWritePair; // Fp 256-bit width variable shuffles.
defm WriteShuffle256 : X86SchedWritePair; // 256-bit width vector shuffles.
-defm WriteVarVecShift : X86SchedWritePair; // Variable vector shifts.
+defm WriteVarShuffle256 : X86SchedWritePair; // 256-bit width vector variable shuffles.
+defm WriteVarVecShift : X86SchedWritePair; // Variable vector shifts.
+defm WriteVarVecShiftY : X86SchedWritePair; // Variable vector shifts (YMM).
+defm WriteVarVecShiftZ : X86SchedWritePair; // Variable vector shifts (ZMM).
// Old microcoded instructions that nobody uses.
def WriteMicrocoded : SchedWrite;
@@ -134,531 +446,182 @@ def WriteFence : SchedWrite;
// Nop, not very useful except that it provides a model for nops!
def WriteNop : SchedWrite;
-//===----------------------------------------------------------------------===//
-// Instruction Itinerary classes used for X86
-def IIC_ALU_MEM : InstrItinClass;
-def IIC_ALU_NONMEM : InstrItinClass;
-def IIC_LEA : InstrItinClass;
-def IIC_LEA_16 : InstrItinClass;
-def IIC_MUL8 : InstrItinClass;
-def IIC_MUL16_MEM : InstrItinClass;
-def IIC_MUL16_REG : InstrItinClass;
-def IIC_MUL32_MEM : InstrItinClass;
-def IIC_MUL32_REG : InstrItinClass;
-def IIC_MUL64 : InstrItinClass;
-// imul by al, ax, eax, tax
-def IIC_IMUL8 : InstrItinClass;
-def IIC_IMUL16_MEM : InstrItinClass;
-def IIC_IMUL16_REG : InstrItinClass;
-def IIC_IMUL32_MEM : InstrItinClass;
-def IIC_IMUL32_REG : InstrItinClass;
-def IIC_IMUL64 : InstrItinClass;
-// imul reg by reg|mem
-def IIC_IMUL16_RM : InstrItinClass;
-def IIC_IMUL16_RR : InstrItinClass;
-def IIC_IMUL32_RM : InstrItinClass;
-def IIC_IMUL32_RR : InstrItinClass;
-def IIC_IMUL64_RM : InstrItinClass;
-def IIC_IMUL64_RR : InstrItinClass;
-// imul reg = reg/mem * imm
-def IIC_IMUL16_RMI : InstrItinClass;
-def IIC_IMUL16_RRI : InstrItinClass;
-def IIC_IMUL32_RMI : InstrItinClass;
-def IIC_IMUL32_RRI : InstrItinClass;
-def IIC_IMUL64_RMI : InstrItinClass;
-def IIC_IMUL64_RRI : InstrItinClass;
-// div
-def IIC_DIV8_MEM : InstrItinClass;
-def IIC_DIV8_REG : InstrItinClass;
-def IIC_DIV16 : InstrItinClass;
-def IIC_DIV32 : InstrItinClass;
-def IIC_DIV64 : InstrItinClass;
-// idiv
-def IIC_IDIV8 : InstrItinClass;
-def IIC_IDIV16 : InstrItinClass;
-def IIC_IDIV32 : InstrItinClass;
-def IIC_IDIV64 : InstrItinClass;
-// neg/not/inc/dec
-def IIC_UNARY_REG : InstrItinClass;
-def IIC_UNARY_MEM : InstrItinClass;
-// add/sub/and/or/xor/sbc/cmp/test
-def IIC_BIN_MEM : InstrItinClass;
-def IIC_BIN_NONMEM : InstrItinClass;
-// adc/sbc
-def IIC_BIN_CARRY_MEM : InstrItinClass;
-def IIC_BIN_CARRY_NONMEM : InstrItinClass;
-// shift/rotate
-def IIC_SR : InstrItinClass;
-// shift double
-def IIC_SHD16_REG_IM : InstrItinClass;
-def IIC_SHD16_REG_CL : InstrItinClass;
-def IIC_SHD16_MEM_IM : InstrItinClass;
-def IIC_SHD16_MEM_CL : InstrItinClass;
-def IIC_SHD32_REG_IM : InstrItinClass;
-def IIC_SHD32_REG_CL : InstrItinClass;
-def IIC_SHD32_MEM_IM : InstrItinClass;
-def IIC_SHD32_MEM_CL : InstrItinClass;
-def IIC_SHD64_REG_IM : InstrItinClass;
-def IIC_SHD64_REG_CL : InstrItinClass;
-def IIC_SHD64_MEM_IM : InstrItinClass;
-def IIC_SHD64_MEM_CL : InstrItinClass;
-// cmov
-def IIC_CMOV16_RM : InstrItinClass;
-def IIC_CMOV16_RR : InstrItinClass;
-def IIC_CMOV32_RM : InstrItinClass;
-def IIC_CMOV32_RR : InstrItinClass;
-def IIC_CMOV64_RM : InstrItinClass;
-def IIC_CMOV64_RR : InstrItinClass;
-// set
-def IIC_SET_R : InstrItinClass;
-def IIC_SET_M : InstrItinClass;
-// jmp/jcc/jcxz
-def IIC_Jcc : InstrItinClass;
-def IIC_JCXZ : InstrItinClass;
-def IIC_JMP_REL : InstrItinClass;
-def IIC_JMP_REG : InstrItinClass;
-def IIC_JMP_MEM : InstrItinClass;
-def IIC_JMP_FAR_MEM : InstrItinClass;
-def IIC_JMP_FAR_PTR : InstrItinClass;
-// loop
-def IIC_LOOP : InstrItinClass;
-def IIC_LOOPE : InstrItinClass;
-def IIC_LOOPNE : InstrItinClass;
-// call
-def IIC_CALL_RI : InstrItinClass;
-def IIC_CALL_MEM : InstrItinClass;
-def IIC_CALL_FAR_MEM : InstrItinClass;
-def IIC_CALL_FAR_PTR : InstrItinClass;
-// ret
-def IIC_RET : InstrItinClass;
-def IIC_RET_IMM : InstrItinClass;
-//sign extension movs
-def IIC_MOVSX : InstrItinClass;
-def IIC_MOVSX_R16_R8 : InstrItinClass;
-def IIC_MOVSX_R16_M8 : InstrItinClass;
-def IIC_MOVSX_R16_R16 : InstrItinClass;
-def IIC_MOVSX_R32_R32 : InstrItinClass;
-//zero extension movs
-def IIC_MOVZX : InstrItinClass;
-def IIC_MOVZX_R16_R8 : InstrItinClass;
-def IIC_MOVZX_R16_M8 : InstrItinClass;
-
-def IIC_REP_MOVS : InstrItinClass;
-def IIC_REP_STOS : InstrItinClass;
-
-// SSE scalar/parallel binary operations
-def IIC_SSE_ALU_F32S_RR : InstrItinClass;
-def IIC_SSE_ALU_F32S_RM : InstrItinClass;
-def IIC_SSE_ALU_F64S_RR : InstrItinClass;
-def IIC_SSE_ALU_F64S_RM : InstrItinClass;
-def IIC_SSE_MUL_F32S_RR : InstrItinClass;
-def IIC_SSE_MUL_F32S_RM : InstrItinClass;
-def IIC_SSE_MUL_F64S_RR : InstrItinClass;
-def IIC_SSE_MUL_F64S_RM : InstrItinClass;
-def IIC_SSE_DIV_F32S_RR : InstrItinClass;
-def IIC_SSE_DIV_F32S_RM : InstrItinClass;
-def IIC_SSE_DIV_F64S_RR : InstrItinClass;
-def IIC_SSE_DIV_F64S_RM : InstrItinClass;
-def IIC_SSE_ALU_F32P_RR : InstrItinClass;
-def IIC_SSE_ALU_F32P_RM : InstrItinClass;
-def IIC_SSE_ALU_F64P_RR : InstrItinClass;
-def IIC_SSE_ALU_F64P_RM : InstrItinClass;
-def IIC_SSE_MUL_F32P_RR : InstrItinClass;
-def IIC_SSE_MUL_F32P_RM : InstrItinClass;
-def IIC_SSE_MUL_F64P_RR : InstrItinClass;
-def IIC_SSE_MUL_F64P_RM : InstrItinClass;
-def IIC_SSE_DIV_F32P_RR : InstrItinClass;
-def IIC_SSE_DIV_F32P_RM : InstrItinClass;
-def IIC_SSE_DIV_F64P_RR : InstrItinClass;
-def IIC_SSE_DIV_F64P_RM : InstrItinClass;
-
-def IIC_SSE_COMIS_RR : InstrItinClass;
-def IIC_SSE_COMIS_RM : InstrItinClass;
-
-def IIC_SSE_HADDSUB_RR : InstrItinClass;
-def IIC_SSE_HADDSUB_RM : InstrItinClass;
-
-def IIC_SSE_BIT_P_RR : InstrItinClass;
-def IIC_SSE_BIT_P_RM : InstrItinClass;
-
-def IIC_SSE_INTALU_P_RR : InstrItinClass;
-def IIC_SSE_INTALU_P_RM : InstrItinClass;
-def IIC_SSE_INTALUQ_P_RR : InstrItinClass;
-def IIC_SSE_INTALUQ_P_RM : InstrItinClass;
-
-def IIC_SSE_INTMUL_P_RR : InstrItinClass;
-def IIC_SSE_INTMUL_P_RM : InstrItinClass;
-
-def IIC_SSE_INTSH_P_RR : InstrItinClass;
-def IIC_SSE_INTSH_P_RM : InstrItinClass;
-def IIC_SSE_INTSH_P_RI : InstrItinClass;
-
-def IIC_SSE_INTSHDQ_P_RI : InstrItinClass;
-
-def IIC_SSE_SHUFP : InstrItinClass;
-def IIC_SSE_PSHUF_RI : InstrItinClass;
-def IIC_SSE_PSHUF_MI : InstrItinClass;
-
-def IIC_SSE_PACK : InstrItinClass;
-def IIC_SSE_UNPCK : InstrItinClass;
-
-def IIC_SSE_MOVMSK : InstrItinClass;
-def IIC_SSE_MASKMOV : InstrItinClass;
-
-def IIC_SSE_PEXTRW : InstrItinClass;
-def IIC_SSE_PINSRW : InstrItinClass;
-
-def IIC_SSE_PABS_RR : InstrItinClass;
-def IIC_SSE_PABS_RM : InstrItinClass;
-
-def IIC_SSE_SQRTPS_RR : InstrItinClass;
-def IIC_SSE_SQRTPS_RM : InstrItinClass;
-def IIC_SSE_SQRTSS_RR : InstrItinClass;
-def IIC_SSE_SQRTSS_RM : InstrItinClass;
-def IIC_SSE_SQRTPD_RR : InstrItinClass;
-def IIC_SSE_SQRTPD_RM : InstrItinClass;
-def IIC_SSE_SQRTSD_RR : InstrItinClass;
-def IIC_SSE_SQRTSD_RM : InstrItinClass;
-
-def IIC_SSE_RSQRTPS_RR : InstrItinClass;
-def IIC_SSE_RSQRTPS_RM : InstrItinClass;
-def IIC_SSE_RSQRTSS_RR : InstrItinClass;
-def IIC_SSE_RSQRTSS_RM : InstrItinClass;
-
-def IIC_SSE_RCPP_RR : InstrItinClass;
-def IIC_SSE_RCPP_RM : InstrItinClass;
-def IIC_SSE_RCPS_RR : InstrItinClass;
-def IIC_SSE_RCPS_RM : InstrItinClass;
-
-def IIC_SSE_MOV_S_RR : InstrItinClass;
-def IIC_SSE_MOV_S_RM : InstrItinClass;
-def IIC_SSE_MOV_S_MR : InstrItinClass;
-
-def IIC_SSE_MOVA_P_RR : InstrItinClass;
-def IIC_SSE_MOVA_P_RM : InstrItinClass;
-def IIC_SSE_MOVA_P_MR : InstrItinClass;
-
-def IIC_SSE_MOVU_P_RR : InstrItinClass;
-def IIC_SSE_MOVU_P_RM : InstrItinClass;
-def IIC_SSE_MOVU_P_MR : InstrItinClass;
-
-def IIC_SSE_MOVDQ : InstrItinClass;
-def IIC_SSE_MOVD_ToGP : InstrItinClass;
-def IIC_SSE_MOVQ_RR : InstrItinClass;
-
-def IIC_SSE_MOV_LH : InstrItinClass;
-
-def IIC_SSE_LDDQU : InstrItinClass;
-
-def IIC_SSE_MOVNT : InstrItinClass;
-
-def IIC_SSE_PHADDSUBD_RR : InstrItinClass;
-def IIC_SSE_PHADDSUBD_RM : InstrItinClass;
-def IIC_SSE_PHADDSUBSW_RR : InstrItinClass;
-def IIC_SSE_PHADDSUBSW_RM : InstrItinClass;
-def IIC_SSE_PHADDSUBW_RR : InstrItinClass;
-def IIC_SSE_PHADDSUBW_RM : InstrItinClass;
-def IIC_SSE_PSHUFB_RR : InstrItinClass;
-def IIC_SSE_PSHUFB_RM : InstrItinClass;
-def IIC_SSE_PSIGN_RR : InstrItinClass;
-def IIC_SSE_PSIGN_RM : InstrItinClass;
-
-def IIC_SSE_PMADD : InstrItinClass;
-def IIC_SSE_PMULHRSW : InstrItinClass;
-def IIC_SSE_PALIGNRR : InstrItinClass;
-def IIC_SSE_PALIGNRM : InstrItinClass;
-def IIC_SSE_MWAIT : InstrItinClass;
-def IIC_SSE_MONITOR : InstrItinClass;
-def IIC_SSE_MWAITX : InstrItinClass;
-def IIC_SSE_MONITORX : InstrItinClass;
-def IIC_SSE_CLZERO : InstrItinClass;
-
-def IIC_SSE_PREFETCH : InstrItinClass;
-def IIC_SSE_PAUSE : InstrItinClass;
-def IIC_SSE_LFENCE : InstrItinClass;
-def IIC_SSE_MFENCE : InstrItinClass;
-def IIC_SSE_SFENCE : InstrItinClass;
-def IIC_SSE_LDMXCSR : InstrItinClass;
-def IIC_SSE_STMXCSR : InstrItinClass;
-
-def IIC_SSE_CVT_PD_RR : InstrItinClass;
-def IIC_SSE_CVT_PD_RM : InstrItinClass;
-def IIC_SSE_CVT_PS_RR : InstrItinClass;
-def IIC_SSE_CVT_PS_RM : InstrItinClass;
-def IIC_SSE_CVT_Scalar_RR : InstrItinClass;
-def IIC_SSE_CVT_Scalar_RM : InstrItinClass;
-def IIC_SSE_CVT_SS2SI32_RM : InstrItinClass;
-def IIC_SSE_CVT_SS2SI32_RR : InstrItinClass;
-def IIC_SSE_CVT_SS2SI64_RM : InstrItinClass;
-def IIC_SSE_CVT_SS2SI64_RR : InstrItinClass;
-def IIC_SSE_CVT_SD2SI_RM : InstrItinClass;
-def IIC_SSE_CVT_SD2SI_RR : InstrItinClass;
-
-def IIC_AVX_ZERO : InstrItinClass;
-
-// MMX
-def IIC_MMX_MOV_MM_RM : InstrItinClass;
-def IIC_MMX_MOV_REG_MM : InstrItinClass;
-def IIC_MMX_MOVQ_RM : InstrItinClass;
-def IIC_MMX_MOVQ_RR : InstrItinClass;
-
-def IIC_MMX_ALU_RM : InstrItinClass;
-def IIC_MMX_ALU_RR : InstrItinClass;
-def IIC_MMX_ALUQ_RM : InstrItinClass;
-def IIC_MMX_ALUQ_RR : InstrItinClass;
-def IIC_MMX_PHADDSUBW_RM : InstrItinClass;
-def IIC_MMX_PHADDSUBW_RR : InstrItinClass;
-def IIC_MMX_PHADDSUBD_RM : InstrItinClass;
-def IIC_MMX_PHADDSUBD_RR : InstrItinClass;
-def IIC_MMX_PMUL : InstrItinClass;
-def IIC_MMX_MISC_FUNC_MEM : InstrItinClass;
-def IIC_MMX_MISC_FUNC_REG : InstrItinClass;
-def IIC_MMX_PSADBW : InstrItinClass;
-def IIC_MMX_SHIFT_RI : InstrItinClass;
-def IIC_MMX_SHIFT_RM : InstrItinClass;
-def IIC_MMX_SHIFT_RR : InstrItinClass;
-def IIC_MMX_UNPCK_H_RM : InstrItinClass;
-def IIC_MMX_UNPCK_H_RR : InstrItinClass;
-def IIC_MMX_UNPCK_L : InstrItinClass;
-def IIC_MMX_PCK_RM : InstrItinClass;
-def IIC_MMX_PCK_RR : InstrItinClass;
-def IIC_MMX_PSHUF : InstrItinClass;
-def IIC_MMX_PEXTR : InstrItinClass;
-def IIC_MMX_PINSRW : InstrItinClass;
-def IIC_MMX_MASKMOV : InstrItinClass;
-def IIC_MMX_MOVMSK : InstrItinClass;
-def IIC_MMX_CVT_PD_RR : InstrItinClass;
-def IIC_MMX_CVT_PD_RM : InstrItinClass;
-def IIC_MMX_CVT_PS_RR : InstrItinClass;
-def IIC_MMX_CVT_PS_RM : InstrItinClass;
-
-def IIC_3DNOW_FALU_RM : InstrItinClass;
-def IIC_3DNOW_FALU_RR : InstrItinClass;
-def IIC_3DNOW_FCVT_F2I_RM : InstrItinClass;
-def IIC_3DNOW_FCVT_F2I_RR : InstrItinClass;
-def IIC_3DNOW_FCVT_I2F_RM : InstrItinClass;
-def IIC_3DNOW_FCVT_I2F_RR : InstrItinClass;
-def IIC_3DNOW_MISC_FUNC_REG : InstrItinClass;
-def IIC_3DNOW_MISC_FUNC_MEM : InstrItinClass;
-
-def IIC_CMPX_LOCK : InstrItinClass;
-def IIC_CMPX_LOCK_8 : InstrItinClass;
-def IIC_CMPX_LOCK_8B : InstrItinClass;
-def IIC_CMPX_LOCK_16B : InstrItinClass;
-
-def IIC_XADD_LOCK_MEM : InstrItinClass;
-def IIC_XADD_LOCK_MEM8 : InstrItinClass;
-
-def IIC_FCMOV : InstrItinClass;
-def IIC_FILD : InstrItinClass;
-def IIC_FLD : InstrItinClass;
-def IIC_FLD80 : InstrItinClass;
-def IIC_FST : InstrItinClass;
-def IIC_FST80 : InstrItinClass;
-def IIC_FIST : InstrItinClass;
-def IIC_FLDZ : InstrItinClass;
-def IIC_FUCOM : InstrItinClass;
-def IIC_FUCOMI : InstrItinClass;
-def IIC_FCOMI : InstrItinClass;
-def IIC_FNSTSW : InstrItinClass;
-def IIC_FNSTCW : InstrItinClass;
-def IIC_FLDCW : InstrItinClass;
-def IIC_FNINIT : InstrItinClass;
-def IIC_FFREE : InstrItinClass;
-def IIC_FNCLEX : InstrItinClass;
-def IIC_WAIT : InstrItinClass;
-def IIC_FXAM : InstrItinClass;
-def IIC_FNOP : InstrItinClass;
-def IIC_FLDL : InstrItinClass;
-def IIC_F2XM1 : InstrItinClass;
-def IIC_FYL2X : InstrItinClass;
-def IIC_FPTAN : InstrItinClass;
-def IIC_FPATAN : InstrItinClass;
-def IIC_FXTRACT : InstrItinClass;
-def IIC_FPREM1 : InstrItinClass;
-def IIC_FPSTP : InstrItinClass;
-def IIC_FPREM : InstrItinClass;
-def IIC_FSIGN : InstrItinClass;
-def IIC_FSQRT : InstrItinClass;
-def IIC_FYL2XP1 : InstrItinClass;
-def IIC_FSINCOS : InstrItinClass;
-def IIC_FRNDINT : InstrItinClass;
-def IIC_FSCALE : InstrItinClass;
-def IIC_FCOMPP : InstrItinClass;
-def IIC_FXSAVE : InstrItinClass;
-def IIC_FXRSTOR : InstrItinClass;
-
-def IIC_FXCH : InstrItinClass;
-
-// System instructions
-def IIC_CPUID : InstrItinClass;
-def IIC_INT : InstrItinClass;
-def IIC_INT3 : InstrItinClass;
-def IIC_INVD : InstrItinClass;
-def IIC_INVLPG : InstrItinClass;
-def IIC_INVPCID : InstrItinClass;
-def IIC_IRET : InstrItinClass;
-def IIC_HLT : InstrItinClass;
-def IIC_LXS : InstrItinClass;
-def IIC_LTR : InstrItinClass;
-def IIC_MPX : InstrItinClass;
-def IIC_PKU : InstrItinClass;
-def IIC_PTWRITE : InstrItinClass;
-def IIC_RDPID : InstrItinClass;
-def IIC_RDRAND : InstrItinClass;
-def IIC_RDSEED : InstrItinClass;
-def IIC_RDTSC : InstrItinClass;
-def IIC_RDTSCP : InstrItinClass;
-def IIC_RSM : InstrItinClass;
-def IIC_SIDT : InstrItinClass;
-def IIC_SGDT : InstrItinClass;
-def IIC_SLDT : InstrItinClass;
-def IIC_SMAP : InstrItinClass;
-def IIC_SMX : InstrItinClass;
-def IIC_STR : InstrItinClass;
-def IIC_SKINIT : InstrItinClass;
-def IIC_SVM : InstrItinClass;
-def IIC_VMX : InstrItinClass;
-def IIC_CLGI : InstrItinClass;
-def IIC_STGI : InstrItinClass;
-def IIC_SWAPGS : InstrItinClass;
-def IIC_SYSCALL : InstrItinClass;
-def IIC_SYS_ENTER_EXIT : InstrItinClass;
-def IIC_IN_RR : InstrItinClass;
-def IIC_IN_RI : InstrItinClass;
-def IIC_OUT_RR : InstrItinClass;
-def IIC_OUT_IR : InstrItinClass;
-def IIC_INS : InstrItinClass;
-def IIC_LWP : InstrItinClass;
-def IIC_MOV_REG_DR : InstrItinClass;
-def IIC_MOV_DR_REG : InstrItinClass;
-def IIC_MOV_REG_CR : InstrItinClass;
-def IIC_MOV_CR_REG : InstrItinClass;
-def IIC_MOV_REG_SR : InstrItinClass;
-def IIC_MOV_MEM_SR : InstrItinClass;
-def IIC_MOV_SR_REG : InstrItinClass;
-def IIC_MOV_SR_MEM : InstrItinClass;
-def IIC_LAR_RM : InstrItinClass;
-def IIC_LAR_RR : InstrItinClass;
-def IIC_LSL_RM : InstrItinClass;
-def IIC_LSL_RR : InstrItinClass;
-def IIC_LGDT : InstrItinClass;
-def IIC_LIDT : InstrItinClass;
-def IIC_LLDT_REG : InstrItinClass;
-def IIC_LLDT_MEM : InstrItinClass;
-def IIC_PUSH_CS : InstrItinClass;
-def IIC_PUSH_SR : InstrItinClass;
-def IIC_POP_SR : InstrItinClass;
-def IIC_POP_SR_SS : InstrItinClass;
-def IIC_SEGMENT_BASE_R : InstrItinClass;
-def IIC_SEGMENT_BASE_W : InstrItinClass;
-def IIC_VERR : InstrItinClass;
-def IIC_VERW_REG : InstrItinClass;
-def IIC_VERW_MEM : InstrItinClass;
-def IIC_WRMSR : InstrItinClass;
-def IIC_RDMSR : InstrItinClass;
-def IIC_RDPMC : InstrItinClass;
-def IIC_SMSW : InstrItinClass;
-def IIC_LMSW_REG : InstrItinClass;
-def IIC_LMSW_MEM : InstrItinClass;
-def IIC_ENTER : InstrItinClass;
-def IIC_LEAVE : InstrItinClass;
-def IIC_POP_MEM : InstrItinClass;
-def IIC_POP_REG16 : InstrItinClass;
-def IIC_POP_REG : InstrItinClass;
-def IIC_POP_F : InstrItinClass;
-def IIC_POP_FD : InstrItinClass;
-def IIC_POP_A : InstrItinClass;
-def IIC_PUSH_IMM : InstrItinClass;
-def IIC_PUSH_MEM : InstrItinClass;
-def IIC_PUSH_REG : InstrItinClass;
-def IIC_PUSH_F : InstrItinClass;
-def IIC_PUSH_A : InstrItinClass;
-def IIC_BSWAP : InstrItinClass;
-def IIC_BIT_SCAN_MEM : InstrItinClass;
-def IIC_BIT_SCAN_REG : InstrItinClass;
-def IIC_LZCNT_RR : InstrItinClass;
-def IIC_LZCNT_RM : InstrItinClass;
-def IIC_TZCNT_RR : InstrItinClass;
-def IIC_TZCNT_RM : InstrItinClass;
-def IIC_MOVS : InstrItinClass;
-def IIC_STOS : InstrItinClass;
-def IIC_SCAS : InstrItinClass;
-def IIC_CMPS : InstrItinClass;
-def IIC_MOV : InstrItinClass;
-def IIC_MOV_MEM : InstrItinClass;
-def IIC_AHF : InstrItinClass;
-def IIC_BT_MI : InstrItinClass;
-def IIC_BT_MR : InstrItinClass;
-def IIC_BT_RI : InstrItinClass;
-def IIC_BT_RR : InstrItinClass;
-def IIC_BTX_MI : InstrItinClass;
-def IIC_BTX_MR : InstrItinClass;
-def IIC_BTX_RI : InstrItinClass;
-def IIC_BTX_RR : InstrItinClass;
-def IIC_XCHG_REG : InstrItinClass;
-def IIC_XCHG_MEM : InstrItinClass;
-def IIC_XADD_REG : InstrItinClass;
-def IIC_XADD_MEM : InstrItinClass;
-def IIC_CMPXCHG_MEM : InstrItinClass;
-def IIC_CMPXCHG_REG : InstrItinClass;
-def IIC_CMPXCHG_MEM8 : InstrItinClass;
-def IIC_CMPXCHG_REG8 : InstrItinClass;
-def IIC_CMPXCHG_8B : InstrItinClass;
-def IIC_CMPXCHG_16B : InstrItinClass;
-def IIC_LODS : InstrItinClass;
-def IIC_OUTS : InstrItinClass;
-def IIC_CLC : InstrItinClass;
-def IIC_CLD : InstrItinClass;
-def IIC_CLI : InstrItinClass;
-def IIC_CMC : InstrItinClass;
-def IIC_CLTS : InstrItinClass;
-def IIC_STC : InstrItinClass;
-def IIC_STI : InstrItinClass;
-def IIC_STD : InstrItinClass;
-def IIC_XLAT : InstrItinClass;
-def IIC_AAA : InstrItinClass;
-def IIC_AAD : InstrItinClass;
-def IIC_AAM : InstrItinClass;
-def IIC_AAS : InstrItinClass;
-def IIC_DAA : InstrItinClass;
-def IIC_DAS : InstrItinClass;
-def IIC_BOUND : InstrItinClass;
-def IIC_ARPL_REG : InstrItinClass;
-def IIC_ARPL_MEM : InstrItinClass;
-def IIC_MOVBE : InstrItinClass;
-def IIC_AES : InstrItinClass;
-def IIC_BLEND_MEM : InstrItinClass;
-def IIC_BLEND_NOMEM : InstrItinClass;
-def IIC_CBW : InstrItinClass;
-def IIC_CRC32_REG : InstrItinClass;
-def IIC_CRC32_MEM : InstrItinClass;
-def IIC_SSE_DPPD_RR : InstrItinClass;
-def IIC_SSE_DPPD_RM : InstrItinClass;
-def IIC_SSE_DPPS_RR : InstrItinClass;
-def IIC_SSE_DPPS_RM : InstrItinClass;
-def IIC_MMX_EMMS : InstrItinClass;
-def IIC_SSE_EXTRACTPS_RR : InstrItinClass;
-def IIC_SSE_EXTRACTPS_RM : InstrItinClass;
-def IIC_SSE_INSERTPS_RR : InstrItinClass;
-def IIC_SSE_INSERTPS_RM : InstrItinClass;
-def IIC_SSE_MPSADBW_RR : InstrItinClass;
-def IIC_SSE_MPSADBW_RM : InstrItinClass;
-def IIC_SSE_PMULLD_RR : InstrItinClass;
-def IIC_SSE_PMULLD_RM : InstrItinClass;
-def IIC_SSE_ROUNDPS_REG : InstrItinClass;
-def IIC_SSE_ROUNDPS_MEM : InstrItinClass;
-def IIC_SSE_ROUNDPD_REG : InstrItinClass;
-def IIC_SSE_ROUNDPD_MEM : InstrItinClass;
-def IIC_SSE_POPCNT_RR : InstrItinClass;
-def IIC_SSE_POPCNT_RM : InstrItinClass;
-def IIC_SSE_PCLMULQDQ_RR : InstrItinClass;
-def IIC_SSE_PCLMULQDQ_RM : InstrItinClass;
-
-def IIC_NOP : InstrItinClass;
+// Move/Load/Store wrappers.
+def WriteFMoveLS
+ : X86SchedWriteMoveLS<WriteFMove, WriteFLoad, WriteFStore>;
+def WriteFMoveLSX
+ : X86SchedWriteMoveLS<WriteFMoveX, WriteFLoadX, WriteFStoreX>;
+def WriteFMoveLSY
+ : X86SchedWriteMoveLS<WriteFMoveY, WriteFLoadY, WriteFStoreY>;
+def SchedWriteFMoveLS
+ : X86SchedWriteMoveLSWidths<WriteFMoveLS, WriteFMoveLSX,
+ WriteFMoveLSY, WriteFMoveLSY>;
+
+def WriteFMoveLSNT
+ : X86SchedWriteMoveLS<WriteFMove, WriteFLoad, WriteFStoreNT>;
+def WriteFMoveLSNTX
+ : X86SchedWriteMoveLS<WriteFMove, WriteFLoad, WriteFStoreNTX>;
+def WriteFMoveLSNTY
+ : X86SchedWriteMoveLS<WriteFMoveY, WriteFLoadY, WriteFStoreNTY>;
+def SchedWriteFMoveLSNT
+ : X86SchedWriteMoveLSWidths<WriteFMoveLSNT, WriteFMoveLSNTX,
+ WriteFMoveLSNTY, WriteFMoveLSNTY>;
+
+def WriteVecMoveLS
+ : X86SchedWriteMoveLS<WriteVecMove, WriteVecLoad, WriteVecStore>;
+def WriteVecMoveLSX
+ : X86SchedWriteMoveLS<WriteVecMoveX, WriteVecLoadX, WriteVecStoreX>;
+def WriteVecMoveLSY
+ : X86SchedWriteMoveLS<WriteVecMoveY, WriteVecLoadY, WriteVecStoreY>;
+def SchedWriteVecMoveLS
+ : X86SchedWriteMoveLSWidths<WriteVecMoveLS, WriteVecMoveLSX,
+ WriteVecMoveLSY, WriteVecMoveLSY>;
+
+def WriteVecMoveLSNT
+ : X86SchedWriteMoveLS<WriteVecMove, WriteVecLoadNT, WriteVecStoreNT>;
+def WriteVecMoveLSNTX
+ : X86SchedWriteMoveLS<WriteVecMoveX, WriteVecLoadNT, WriteVecStoreNT>;
+def WriteVecMoveLSNTY
+ : X86SchedWriteMoveLS<WriteVecMoveY, WriteVecLoadNTY, WriteVecStoreNTY>;
+def SchedWriteVecMoveLSNT
+ : X86SchedWriteMoveLSWidths<WriteVecMoveLSNT, WriteVecMoveLSNTX,
+ WriteVecMoveLSNTY, WriteVecMoveLSNTY>;
+
+// Vector width wrappers.
+def SchedWriteFAdd
+ : X86SchedWriteWidths<WriteFAdd, WriteFAddX, WriteFAddY, WriteFAddZ>;
+def SchedWriteFAdd64
+ : X86SchedWriteWidths<WriteFAdd64, WriteFAdd64X, WriteFAdd64Y, WriteFAdd64Z>;
+def SchedWriteFHAdd
+ : X86SchedWriteWidths<WriteFHAdd, WriteFHAdd, WriteFHAddY, WriteFHAddZ>;
+def SchedWriteFCmp
+ : X86SchedWriteWidths<WriteFCmp, WriteFCmpX, WriteFCmpY, WriteFCmpZ>;
+def SchedWriteFCmp64
+ : X86SchedWriteWidths<WriteFCmp64, WriteFCmp64X, WriteFCmp64Y, WriteFCmp64Z>;
+def SchedWriteFMul
+ : X86SchedWriteWidths<WriteFMul, WriteFMulX, WriteFMulY, WriteFMulZ>;
+def SchedWriteFMul64
+ : X86SchedWriteWidths<WriteFMul64, WriteFMul64X, WriteFMul64Y, WriteFMul64Z>;
+def SchedWriteFMA
+ : X86SchedWriteWidths<WriteFMA, WriteFMAX, WriteFMAY, WriteFMAZ>;
+def SchedWriteDPPD
+ : X86SchedWriteWidths<WriteDPPD, WriteDPPD, WriteDPPD, WriteDPPD>;
+def SchedWriteDPPS
+ : X86SchedWriteWidths<WriteDPPS, WriteDPPS, WriteDPPSY, WriteDPPSZ>;
+def SchedWriteFDiv
+ : X86SchedWriteWidths<WriteFDiv, WriteFDivX, WriteFDivY, WriteFDivZ>;
+def SchedWriteFDiv64
+ : X86SchedWriteWidths<WriteFDiv64, WriteFDiv64X, WriteFDiv64Y, WriteFDiv64Z>;
+def SchedWriteFSqrt
+ : X86SchedWriteWidths<WriteFSqrt, WriteFSqrtX,
+ WriteFSqrtY, WriteFSqrtZ>;
+def SchedWriteFSqrt64
+ : X86SchedWriteWidths<WriteFSqrt64, WriteFSqrt64X,
+ WriteFSqrt64Y, WriteFSqrt64Z>;
+def SchedWriteFRcp
+ : X86SchedWriteWidths<WriteFRcp, WriteFRcpX, WriteFRcpY, WriteFRcpZ>;
+def SchedWriteFRsqrt
+ : X86SchedWriteWidths<WriteFRsqrt, WriteFRsqrtX, WriteFRsqrtY, WriteFRsqrtZ>;
+def SchedWriteFRnd
+ : X86SchedWriteWidths<WriteFRnd, WriteFRnd, WriteFRndY, WriteFRndZ>;
+def SchedWriteFLogic
+ : X86SchedWriteWidths<WriteFLogic, WriteFLogic, WriteFLogicY, WriteFLogicZ>;
+def SchedWriteFTest
+ : X86SchedWriteWidths<WriteFTest, WriteFTest, WriteFTestY, WriteFTestZ>;
+
+def SchedWriteFShuffle
+ : X86SchedWriteWidths<WriteFShuffle, WriteFShuffle,
+ WriteFShuffleY, WriteFShuffleZ>;
+def SchedWriteFVarShuffle
+ : X86SchedWriteWidths<WriteFVarShuffle, WriteFVarShuffle,
+ WriteFVarShuffleY, WriteFVarShuffleZ>;
+def SchedWriteFBlend
+ : X86SchedWriteWidths<WriteFBlend, WriteFBlend, WriteFBlendY, WriteFBlendZ>;
+def SchedWriteFVarBlend
+ : X86SchedWriteWidths<WriteFVarBlend, WriteFVarBlend,
+ WriteFVarBlendY, WriteFVarBlendZ>;
+
+def SchedWriteCvtDQ2PD
+ : X86SchedWriteWidths<WriteCvtI2SD, WriteCvtI2PD,
+ WriteCvtI2PDY, WriteCvtI2PDZ>;
+def SchedWriteCvtDQ2PS
+ : X86SchedWriteWidths<WriteCvtI2SS, WriteCvtI2PS,
+ WriteCvtI2PSY, WriteCvtI2PSZ>;
+def SchedWriteCvtPD2DQ
+ : X86SchedWriteWidths<WriteCvtSD2I, WriteCvtPD2I,
+ WriteCvtPD2IY, WriteCvtPD2IZ>;
+def SchedWriteCvtPS2DQ
+ : X86SchedWriteWidths<WriteCvtSS2I, WriteCvtPS2I,
+ WriteCvtPS2IY, WriteCvtPS2IZ>;
+def SchedWriteCvtPS2PD
+ : X86SchedWriteWidths<WriteCvtSS2SD, WriteCvtPS2PD,
+ WriteCvtPS2PDY, WriteCvtPS2PDZ>;
+def SchedWriteCvtPD2PS
+ : X86SchedWriteWidths<WriteCvtSD2SS, WriteCvtPD2PS,
+ WriteCvtPD2PSY, WriteCvtPD2PSZ>;
+
+def SchedWriteVecALU
+ : X86SchedWriteWidths<WriteVecALU, WriteVecALUX, WriteVecALUY, WriteVecALUZ>;
+def SchedWritePHAdd
+ : X86SchedWriteWidths<WritePHAdd, WritePHAddX, WritePHAddY, WritePHAddZ>;
+def SchedWriteVecLogic
+ : X86SchedWriteWidths<WriteVecLogic, WriteVecLogicX,
+ WriteVecLogicY, WriteVecLogicZ>;
+def SchedWriteVecTest
+ : X86SchedWriteWidths<WriteVecTest, WriteVecTest,
+ WriteVecTestY, WriteVecTestZ>;
+def SchedWriteVecShift
+ : X86SchedWriteWidths<WriteVecShift, WriteVecShiftX,
+ WriteVecShiftY, WriteVecShiftZ>;
+def SchedWriteVecShiftImm
+ : X86SchedWriteWidths<WriteVecShiftImm, WriteVecShiftImmX,
+ WriteVecShiftImmY, WriteVecShiftImmZ>;
+def SchedWriteVarVecShift
+ : X86SchedWriteWidths<WriteVarVecShift, WriteVarVecShift,
+ WriteVarVecShiftY, WriteVarVecShiftZ>;
+def SchedWriteVecIMul
+ : X86SchedWriteWidths<WriteVecIMul, WriteVecIMulX,
+ WriteVecIMulY, WriteVecIMulZ>;
+def SchedWritePMULLD
+ : X86SchedWriteWidths<WritePMULLD, WritePMULLD,
+ WritePMULLDY, WritePMULLDZ>;
+def SchedWriteMPSAD
+ : X86SchedWriteWidths<WriteMPSAD, WriteMPSAD,
+ WriteMPSADY, WriteMPSADZ>;
+def SchedWritePSADBW
+ : X86SchedWriteWidths<WritePSADBW, WritePSADBWX,
+ WritePSADBWY, WritePSADBWZ>;
+
+def SchedWriteShuffle
+ : X86SchedWriteWidths<WriteShuffle, WriteShuffleX,
+ WriteShuffleY, WriteShuffleZ>;
+def SchedWriteVarShuffle
+ : X86SchedWriteWidths<WriteVarShuffle, WriteVarShuffleX,
+ WriteVarShuffleY, WriteVarShuffleZ>;
+def SchedWriteBlend
+ : X86SchedWriteWidths<WriteBlend, WriteBlend, WriteBlendY, WriteBlendZ>;
+def SchedWriteVarBlend
+ : X86SchedWriteWidths<WriteVarBlend, WriteVarBlend,
+ WriteVarBlendY, WriteVarBlendZ>;
+
+// Vector size wrappers.
+def SchedWriteFAddSizes
+ : X86SchedWriteSizes<SchedWriteFAdd, SchedWriteFAdd64>;
+def SchedWriteFCmpSizes
+ : X86SchedWriteSizes<SchedWriteFCmp, SchedWriteFCmp64>;
+def SchedWriteFMulSizes
+ : X86SchedWriteSizes<SchedWriteFMul, SchedWriteFMul64>;
+def SchedWriteFDivSizes
+ : X86SchedWriteSizes<SchedWriteFDiv, SchedWriteFDiv64>;
+def SchedWriteFSqrtSizes
+ : X86SchedWriteSizes<SchedWriteFSqrt, SchedWriteFSqrt64>;
+def SchedWriteFLogicSizes
+ : X86SchedWriteSizes<SchedWriteFLogic, SchedWriteFLogic>;
+def SchedWriteFShuffleSizes
+ : X86SchedWriteSizes<SchedWriteFShuffle, SchedWriteFShuffle>;
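For context, a short sketch of how these wrappers are consumed by the instruction definitions (the .PS/.PD and .Scl/.XMM/.YMM/.ZMM field names are assumed from the X86SchedWriteSizes and X86SchedWriteWidths classes defined earlier in X86Schedule.td, outside this hunk):

    // A size wrapper is indexed first by element size, then by vector width:
    //   SchedWriteFAddSizes.PS.XMM  resolves to  WriteFAddX   (packed single, 128-bit)
    //   SchedWriteFAddSizes.PD.YMM  resolves to  WriteFAdd64Y (packed double, 256-bit)
    //   SchedWriteFAddSizes.PD.Scl  resolves to  WriteFAdd64  (scalar double)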
//===----------------------------------------------------------------------===//
-// Processor instruction itineraries.
+// Generic Processor Scheduler Models.
// IssueWidth is analogous to the number of decode units. Core and its
// descendants, including Nehalem and SandyBridge, have 4 decoders.
@@ -675,7 +638,7 @@ def IIC_NOP : InstrItinClass;
// latencies. Since these latencies are not used for pipeline hazards,
// they do not need to be exact.
//
-// The GenericX86Model contains no instruction itineraries
+// The GenericX86Model contains no instruction schedules
// and disables PostRAScheduler.
class GenericX86Model : SchedMachineModel {
let IssueWidth = 4;
@@ -692,4 +655,3 @@ def GenericModel : GenericX86Model;
def GenericPostRAModel : GenericX86Model {
let PostRAScheduler = 1;
}
-
diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td
index e052ad98104c..d1e902e6c43f 100644
--- a/lib/Target/X86/X86ScheduleAtom.td
+++ b/lib/Target/X86/X86ScheduleAtom.td
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This file defines the itinerary class data for the Intel Atom
+// This file defines the schedule class data for the Intel Atom
// in-order (Saltwell-32nm/Bonnell-45nm) processors.
//
//===----------------------------------------------------------------------===//
@@ -15,542 +15,907 @@
//
// Scheduling information derived from the "Intel 64 and IA32 Architectures
// Optimization Reference Manual", Chapter 13, Section 4.
-// Functional Units
-// Port 0
-def Port0 : FuncUnit; // ALU: ALU0, shift/rotate, load/store
- // SIMD/FP: SIMD ALU, Shuffle,SIMD/FP multiply, divide
-def Port1 : FuncUnit; // ALU: ALU1, bit processing, jump, and LEA
- // SIMD/FP: SIMD ALU, FP Adder
-
-def AtomItineraries : ProcessorItineraries<
- [ Port0, Port1 ],
- [], [
- // P0 only
- // InstrItinData<class, [InstrStage<N, [P0]>] >,
- // P0 or P1
- // InstrItinData<class, [InstrStage<N, [P0, P1]>] >,
- // P0 and P1
- // InstrItinData<class, [InstrStage<N, [P0], 0>, InstrStage<N, [P1]>] >,
- //
- // Default is 1 cycle, port0 or port1
- InstrItinData<IIC_ALU_MEM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_ALU_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_LEA, [InstrStage<1, [Port1]>] >,
- InstrItinData<IIC_LEA_16, [InstrStage<2, [Port0, Port1]>] >,
- // mul
- InstrItinData<IIC_MUL8, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_MUL16_MEM, [InstrStage<8, [Port0, Port1]>] >,
- InstrItinData<IIC_MUL16_REG, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_MUL32_MEM, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_MUL32_REG, [InstrStage<6, [Port0, Port1]>] >,
- InstrItinData<IIC_MUL64, [InstrStage<12, [Port0, Port1]>] >,
- // imul by al, ax, eax, rax
- InstrItinData<IIC_IMUL8, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_IMUL16_MEM, [InstrStage<8, [Port0, Port1]>] >,
- InstrItinData<IIC_IMUL16_REG, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_IMUL32_MEM, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_IMUL32_REG, [InstrStage<6, [Port0, Port1]>] >,
- InstrItinData<IIC_IMUL64, [InstrStage<12, [Port0, Port1]>] >,
- // imul reg by reg|mem
- InstrItinData<IIC_IMUL16_RM, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_IMUL16_RR, [InstrStage<6, [Port0, Port1]>] >,
- InstrItinData<IIC_IMUL32_RM, [InstrStage<5, [Port0]>] >,
- InstrItinData<IIC_IMUL32_RR, [InstrStage<5, [Port0]>] >,
- InstrItinData<IIC_IMUL64_RM, [InstrStage<12, [Port0, Port1]>] >,
- InstrItinData<IIC_IMUL64_RR, [InstrStage<12, [Port0, Port1]>] >,
- // imul reg = reg/mem * imm
- InstrItinData<IIC_IMUL16_RRI, [InstrStage<6, [Port0, Port1]>] >,
- InstrItinData<IIC_IMUL32_RRI, [InstrStage<5, [Port0]>] >,
- InstrItinData<IIC_IMUL64_RRI, [InstrStage<14, [Port0, Port1]>] >,
- InstrItinData<IIC_IMUL16_RMI, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_IMUL32_RMI, [InstrStage<5, [Port0]>] >,
- InstrItinData<IIC_IMUL64_RMI, [InstrStage<14, [Port0, Port1]>] >,
- // idiv
- InstrItinData<IIC_IDIV8, [InstrStage<62, [Port0, Port1]>] >,
- InstrItinData<IIC_IDIV16, [InstrStage<62, [Port0, Port1]>] >,
- InstrItinData<IIC_IDIV32, [InstrStage<62, [Port0, Port1]>] >,
- InstrItinData<IIC_IDIV64, [InstrStage<130, [Port0, Port1]>] >,
- // div
- InstrItinData<IIC_DIV8_REG, [InstrStage<50, [Port0, Port1]>] >,
- InstrItinData<IIC_DIV8_MEM, [InstrStage<68, [Port0, Port1]>] >,
- InstrItinData<IIC_DIV16, [InstrStage<50, [Port0, Port1]>] >,
- InstrItinData<IIC_DIV32, [InstrStage<50, [Port0, Port1]>] >,
- InstrItinData<IIC_DIV64, [InstrStage<130, [Port0, Port1]>] >,
- // neg/not/inc/dec
- InstrItinData<IIC_UNARY_REG, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_UNARY_MEM, [InstrStage<1, [Port0]>] >,
- // add/sub/and/or/xor/cmp/test
- InstrItinData<IIC_BIN_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_BIN_MEM, [InstrStage<1, [Port0]>] >,
- // adc/sbc
- InstrItinData<IIC_BIN_CARRY_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_BIN_CARRY_MEM, [InstrStage<1, [Port0]>] >,
- // shift/rotate
- InstrItinData<IIC_SR, [InstrStage<1, [Port0]>] >,
- // shift double
- InstrItinData<IIC_SHD16_REG_IM, [InstrStage<6, [Port0, Port1]>] >,
- InstrItinData<IIC_SHD16_REG_CL, [InstrStage<6, [Port0, Port1]>] >,
- InstrItinData<IIC_SHD16_MEM_IM, [InstrStage<6, [Port0, Port1]>] >,
- InstrItinData<IIC_SHD16_MEM_CL, [InstrStage<6, [Port0, Port1]>] >,
- InstrItinData<IIC_SHD32_REG_IM, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_SHD32_REG_CL, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_SHD32_MEM_IM, [InstrStage<4, [Port0, Port1]>] >,
- InstrItinData<IIC_SHD32_MEM_CL, [InstrStage<4, [Port0, Port1]>] >,
- InstrItinData<IIC_SHD64_REG_IM, [InstrStage<9, [Port0, Port1]>] >,
- InstrItinData<IIC_SHD64_REG_CL, [InstrStage<8, [Port0, Port1]>] >,
- InstrItinData<IIC_SHD64_MEM_IM, [InstrStage<9, [Port0, Port1]>] >,
- InstrItinData<IIC_SHD64_MEM_CL, [InstrStage<9, [Port0, Port1]>] >,
- // cmov
- InstrItinData<IIC_CMOV16_RM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_CMOV16_RR, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_CMOV32_RM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_CMOV32_RR, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_CMOV64_RM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_CMOV64_RR, [InstrStage<1, [Port0, Port1]>] >,
- // set
- InstrItinData<IIC_SET_M, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_SET_R, [InstrStage<1, [Port0, Port1]>] >,
- // jcc
- InstrItinData<IIC_Jcc, [InstrStage<1, [Port1]>] >,
- // jcxz/jecxz/jrcxz
- InstrItinData<IIC_JCXZ, [InstrStage<4, [Port0, Port1]>] >,
- // jmp rel
- InstrItinData<IIC_JMP_REL, [InstrStage<1, [Port1]>] >,
- // jmp indirect
- InstrItinData<IIC_JMP_REG, [InstrStage<1, [Port1]>] >,
- InstrItinData<IIC_JMP_MEM, [InstrStage<2, [Port0, Port1]>] >,
- // jmp far
- InstrItinData<IIC_JMP_FAR_MEM, [InstrStage<32, [Port0, Port1]>] >,
- InstrItinData<IIC_JMP_FAR_PTR, [InstrStage<31, [Port0, Port1]>] >,
- // loop/loope/loopne
- InstrItinData<IIC_LOOP, [InstrStage<18, [Port0, Port1]>] >,
- InstrItinData<IIC_LOOPE, [InstrStage<8, [Port0, Port1]>] >,
- InstrItinData<IIC_LOOPNE, [InstrStage<17, [Port0, Port1]>] >,
- // call - all but reg/imm
- InstrItinData<IIC_CALL_RI, [InstrStage<1, [Port0], 0>,
- InstrStage<1, [Port1]>] >,
- InstrItinData<IIC_CALL_MEM, [InstrStage<15, [Port0, Port1]>] >,
- InstrItinData<IIC_CALL_FAR_MEM, [InstrStage<40, [Port0, Port1]>] >,
- InstrItinData<IIC_CALL_FAR_PTR, [InstrStage<39, [Port0, Port1]>] >,
- //ret
- InstrItinData<IIC_RET, [InstrStage<79, [Port0, Port1]>] >,
- InstrItinData<IIC_RET_IMM, [InstrStage<1, [Port0], 0>, InstrStage<1, [Port1]>] >,
- //sign extension movs
- InstrItinData<IIC_MOVSX,[InstrStage<1, [Port0] >] >,
- InstrItinData<IIC_MOVSX_R16_R8, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_MOVSX_R16_M8, [InstrStage<3, [Port0, Port1]>] >,
- InstrItinData<IIC_MOVSX_R16_R16, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_MOVSX_R32_R32, [InstrStage<1, [Port0, Port1]>] >,
- //zero extension movs
- InstrItinData<IIC_MOVZX,[InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_MOVZX_R16_R8, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_MOVZX_R16_M8, [InstrStage<3, [Port0, Port1]>] >,
-
- InstrItinData<IIC_REP_MOVS, [InstrStage<75, [Port0, Port1]>] >,
- InstrItinData<IIC_REP_STOS, [InstrStage<74, [Port0, Port1]>] >,
-
- // SSE binary operations
- // arithmetic fp scalar
- InstrItinData<IIC_SSE_ALU_F32S_RR, [InstrStage<5, [Port1]>] >,
- InstrItinData<IIC_SSE_ALU_F32S_RM, [InstrStage<5, [Port0], 0>,
- InstrStage<5, [Port1]>] >,
- InstrItinData<IIC_SSE_ALU_F64S_RR, [InstrStage<5, [Port1]>] >,
- InstrItinData<IIC_SSE_ALU_F64S_RM, [InstrStage<5, [Port0], 0>,
- InstrStage<5, [Port1]>] >,
- InstrItinData<IIC_SSE_MUL_F32S_RR, [InstrStage<4, [Port0]>] >,
- InstrItinData<IIC_SSE_MUL_F32S_RM, [InstrStage<4, [Port0]>] >,
- InstrItinData<IIC_SSE_MUL_F64S_RR, [InstrStage<5, [Port0]>] >,
- InstrItinData<IIC_SSE_MUL_F64S_RM, [InstrStage<5, [Port0]>] >,
- InstrItinData<IIC_SSE_DIV_F32S_RR, [InstrStage<34, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_DIV_F32S_RM, [InstrStage<34, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_DIV_F64S_RR, [InstrStage<62, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_DIV_F64S_RM, [InstrStage<62, [Port0, Port1]>] >,
-
- InstrItinData<IIC_SSE_COMIS_RR, [InstrStage<9, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_COMIS_RM, [InstrStage<10, [Port0, Port1]>] >,
-
- InstrItinData<IIC_SSE_HADDSUB_RR, [InstrStage<8, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_HADDSUB_RM, [InstrStage<9, [Port0, Port1]>] >,
-
- // arithmetic fp parallel
- InstrItinData<IIC_SSE_ALU_F32P_RR, [InstrStage<5, [Port1]>] >,
- InstrItinData<IIC_SSE_ALU_F32P_RM, [InstrStage<5, [Port0], 0>,
- InstrStage<5, [Port1]>] >,
- InstrItinData<IIC_SSE_ALU_F64P_RR, [InstrStage<6, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_ALU_F64P_RM, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_MUL_F32P_RR, [InstrStage<5, [Port0]>] >,
- InstrItinData<IIC_SSE_MUL_F32P_RM, [InstrStage<5, [Port0]>] >,
- InstrItinData<IIC_SSE_MUL_F64P_RR, [InstrStage<9, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_MUL_F64P_RM, [InstrStage<10, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_DIV_F32P_RR, [InstrStage<70, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_DIV_F32P_RM, [InstrStage<70, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_DIV_F64P_RR, [InstrStage<125, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_DIV_F64P_RM, [InstrStage<125, [Port0, Port1]>] >,
-
- // bitwise parallel
- InstrItinData<IIC_SSE_BIT_P_RR, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_BIT_P_RM, [InstrStage<1, [Port0]>] >,
-
- // arithmetic int parallel
- InstrItinData<IIC_SSE_INTALU_P_RR, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_INTALU_P_RM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_SSE_INTALUQ_P_RR, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_INTALUQ_P_RM, [InstrStage<3, [Port0, Port1]>] >,
-
- // multiply int parallel
- InstrItinData<IIC_SSE_INTMUL_P_RR, [InstrStage<5, [Port0]>] >,
- InstrItinData<IIC_SSE_INTMUL_P_RM, [InstrStage<5, [Port0]>] >,
-
- // shift parallel
- InstrItinData<IIC_SSE_INTSH_P_RR, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_INTSH_P_RM, [InstrStage<3, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_INTSH_P_RI, [InstrStage<1, [Port0, Port1]>] >,
-
- InstrItinData<IIC_SSE_INTSHDQ_P_RI, [InstrStage<1, [Port0, Port1]>] >,
-
- InstrItinData<IIC_SSE_SHUFP, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_SSE_PSHUF_RI, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_SSE_PSHUF_MI, [InstrStage<1, [Port0]>] >,
-
- InstrItinData<IIC_SSE_PACK, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_SSE_UNPCK, [InstrStage<1, [Port0]>] >,
-
- InstrItinData<IIC_SSE_SQRTPS_RR, [InstrStage<70, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_SQRTPS_RM, [InstrStage<70, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_SQRTSS_RR, [InstrStage<34, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_SQRTSS_RM, [InstrStage<34, [Port0, Port1]>] >,
-
- InstrItinData<IIC_SSE_SQRTPD_RR, [InstrStage<125, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_SQRTPD_RM, [InstrStage<125, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_SQRTSD_RR, [InstrStage<62, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<62, [Port0, Port1]>] >,
-
- InstrItinData<IIC_SSE_RSQRTPS_RR, [InstrStage<9, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_RSQRTPS_RM, [InstrStage<10, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_RSQRTSS_RR, [InstrStage<4, [Port0]>] >,
- InstrItinData<IIC_SSE_RSQRTSS_RM, [InstrStage<4, [Port0]>] >,
-
- InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<10, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [Port0]>] >,
- InstrItinData<IIC_SSE_RCPS_RM, [InstrStage<4, [Port0]>] >,
-
- InstrItinData<IIC_SSE_MOVMSK, [InstrStage<3, [Port0]>] >,
- InstrItinData<IIC_SSE_MASKMOV, [InstrStage<2, [Port0, Port1]>] >,
-
- InstrItinData<IIC_SSE_PEXTRW, [InstrStage<4, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_PINSRW, [InstrStage<1, [Port0]>] >,
-
- InstrItinData<IIC_SSE_PABS_RR, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_PABS_RM, [InstrStage<1, [Port0]>] >,
-
- InstrItinData<IIC_SSE_MOV_S_RR, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_MOV_S_RM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_SSE_MOV_S_MR, [InstrStage<1, [Port0]>] >,
-
- InstrItinData<IIC_SSE_MOVA_P_RR, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_MOVA_P_RM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_SSE_MOVA_P_MR, [InstrStage<1, [Port0]>] >,
-
- InstrItinData<IIC_SSE_MOVU_P_RR, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_MOVU_P_RM, [InstrStage<3, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_MOVU_P_MR, [InstrStage<2, [Port0, Port1]>] >,
-
- InstrItinData<IIC_SSE_MOV_LH, [InstrStage<1, [Port0]>] >,
-
- InstrItinData<IIC_SSE_LDDQU, [InstrStage<3, [Port0, Port1]>] >,
-
- InstrItinData<IIC_SSE_MOVDQ, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_SSE_MOVD_ToGP, [InstrStage<3, [Port0]>] >,
- InstrItinData<IIC_SSE_MOVQ_RR, [InstrStage<1, [Port0, Port1]>] >,
-
- InstrItinData<IIC_SSE_MOVNT, [InstrStage<1, [Port0]>] >,
-
- InstrItinData<IIC_SSE_PREFETCH, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_SSE_PAUSE, [InstrStage<17, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_LFENCE, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_MFENCE, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_SSE_SFENCE, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_SSE_LDMXCSR, [InstrStage<5, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_STMXCSR, [InstrStage<15, [Port0, Port1]>] >,
-
- InstrItinData<IIC_SSE_PHADDSUBD_RR, [InstrStage<3, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_PHADDSUBD_RM, [InstrStage<4, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_PHADDSUBSW_RR, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_PHADDSUBSW_RM, [InstrStage<8, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_PHADDSUBW_RR, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_PHADDSUBW_RM, [InstrStage<8, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_PSHUFB_RR, [InstrStage<4, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_PSHUFB_RM, [InstrStage<5, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_PSIGN_RR, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_PSIGN_RM, [InstrStage<1, [Port0]>] >,
-
- InstrItinData<IIC_SSE_PMADD, [InstrStage<5, [Port0]>] >,
- InstrItinData<IIC_SSE_PMULHRSW, [InstrStage<5, [Port0]>] >,
- InstrItinData<IIC_SSE_PALIGNRR, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_SSE_PALIGNRM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_SSE_MWAIT, [InstrStage<46, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_MONITOR, [InstrStage<45, [Port0, Port1]>] >,
-
- // conversions
- // to/from PD ...
- InstrItinData<IIC_SSE_CVT_PD_RR, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_CVT_PD_RM, [InstrStage<8, [Port0, Port1]>] >,
- // to/from PS except to/from PD and PS2PI
- InstrItinData<IIC_SSE_CVT_PS_RR, [InstrStage<6, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_CVT_PS_RM, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_CVT_Scalar_RR, [InstrStage<6, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_CVT_Scalar_RM, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_CVT_SS2SI32_RR, [InstrStage<8, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_CVT_SS2SI32_RM, [InstrStage<9, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_CVT_SS2SI64_RR, [InstrStage<9, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_CVT_SS2SI64_RM, [InstrStage<10, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_CVT_SD2SI_RR, [InstrStage<8, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_CVT_SD2SI_RM, [InstrStage<9, [Port0, Port1]>] >,
-
- // MMX MOVs
- InstrItinData<IIC_MMX_MOV_MM_RM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_MMX_MOV_REG_MM, [InstrStage<3, [Port0]>] >,
- InstrItinData<IIC_MMX_MOVQ_RM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_MMX_MOVQ_RR, [InstrStage<1, [Port0, Port1]>] >,
- // other MMX
- InstrItinData<IIC_MMX_ALU_RM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_MMX_ALU_RR, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_ALUQ_RM, [InstrStage<3, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_ALUQ_RR, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_PHADDSUBW_RM, [InstrStage<6, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_PHADDSUBW_RR, [InstrStage<5, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_PHADDSUBD_RM, [InstrStage<4, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_PHADDSUBD_RR, [InstrStage<3, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_PMUL, [InstrStage<4, [Port0]>] >,
- InstrItinData<IIC_MMX_MISC_FUNC_MEM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_MMX_MISC_FUNC_REG, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_PSADBW, [InstrStage<4, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_SHIFT_RI, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_SHIFT_RM, [InstrStage<3, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_SHIFT_RR, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_UNPCK_H_RM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_MMX_UNPCK_H_RR, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_UNPCK_L, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_MMX_PCK_RM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_MMX_PCK_RR, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_PSHUF, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_MMX_PEXTR, [InstrStage<4, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_PINSRW, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_MMX_MASKMOV, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_MMX_MOVMSK, [InstrStage<3, [Port0]>] >,
- // conversions
- // from/to PD
- InstrItinData<IIC_MMX_CVT_PD_RR, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_CVT_PD_RM, [InstrStage<8, [Port0, Port1]>] >,
- // from/to PI
- InstrItinData<IIC_MMX_CVT_PS_RR, [InstrStage<5, [Port1]>] >,
- InstrItinData<IIC_MMX_CVT_PS_RM, [InstrStage<5, [Port0], 0>,
- InstrStage<5, [Port1]>]>,
-
- InstrItinData<IIC_CMPX_LOCK, [InstrStage<14, [Port0, Port1]>] >,
- InstrItinData<IIC_CMPX_LOCK_8, [InstrStage<6, [Port0, Port1]>] >,
- InstrItinData<IIC_CMPX_LOCK_8B, [InstrStage<18, [Port0, Port1]>] >,
- InstrItinData<IIC_CMPX_LOCK_16B, [InstrStage<22, [Port0, Port1]>] >,
-
- InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<3, [Port0, Port1]>] >,
-
- InstrItinData<IIC_FILD, [InstrStage<5, [Port0], 0>, InstrStage<5, [Port1]>] >,
- InstrItinData<IIC_FLD, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_FLD80, [InstrStage<4, [Port0, Port1]>] >,
-
- InstrItinData<IIC_FST, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_FST80, [InstrStage<5, [Port0, Port1]>] >,
- InstrItinData<IIC_FIST, [InstrStage<6, [Port0, Port1]>] >,
-
- InstrItinData<IIC_FCMOV, [InstrStage<9, [Port0, Port1]>] >,
- InstrItinData<IIC_FLDZ, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_FUCOM, [InstrStage<1, [Port1]>] >,
- InstrItinData<IIC_FUCOMI, [InstrStage<9, [Port0, Port1]>] >,
- InstrItinData<IIC_FCOMI, [InstrStage<9, [Port0, Port1]>] >,
- InstrItinData<IIC_FNSTSW, [InstrStage<10, [Port0, Port1]>] >,
- InstrItinData<IIC_FNSTCW, [InstrStage<8, [Port0, Port1]>] >,
- InstrItinData<IIC_FLDCW, [InstrStage<5, [Port0, Port1]>] >,
- InstrItinData<IIC_FNINIT, [InstrStage<63, [Port0, Port1]>] >,
- InstrItinData<IIC_FFREE, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_FNCLEX, [InstrStage<25, [Port0, Port1]>] >,
- InstrItinData<IIC_WAIT, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_FXAM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_FNOP, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_FLDL, [InstrStage<10, [Port0, Port1]>] >,
- InstrItinData<IIC_F2XM1, [InstrStage<99, [Port0, Port1]>] >,
- InstrItinData<IIC_FYL2X, [InstrStage<146, [Port0, Port1]>] >,
- InstrItinData<IIC_FPTAN, [InstrStage<168, [Port0, Port1]>] >,
- InstrItinData<IIC_FPATAN, [InstrStage<183, [Port0, Port1]>] >,
- InstrItinData<IIC_FXTRACT, [InstrStage<25, [Port0, Port1]>] >,
- InstrItinData<IIC_FPREM1, [InstrStage<71, [Port0, Port1]>] >,
- InstrItinData<IIC_FPSTP, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_FPREM, [InstrStage<55, [Port0, Port1]>] >,
- InstrItinData<IIC_FYL2XP1, [InstrStage<147, [Port0, Port1]>] >,
- InstrItinData<IIC_FSINCOS, [InstrStage<174, [Port0, Port1]>] >,
- InstrItinData<IIC_FRNDINT, [InstrStage<46, [Port0, Port1]>] >,
- InstrItinData<IIC_FSCALE, [InstrStage<77, [Port0, Port1]>] >,
- InstrItinData<IIC_FCOMPP, [InstrStage<1, [Port1]>] >,
- InstrItinData<IIC_FXSAVE, [InstrStage<140, [Port0, Port1]>] >,
- InstrItinData<IIC_FXRSTOR, [InstrStage<141, [Port0, Port1]>] >,
- InstrItinData<IIC_FXCH, [InstrStage<1, [Port0], 0>, InstrStage<1, [Port1]>] >,
- InstrItinData<IIC_FSIGN, [InstrStage<1, [Port1]>] >,
- InstrItinData<IIC_FSQRT, [InstrStage<71, [Port0, Port1]>] >,
-
- // System instructions
- InstrItinData<IIC_CPUID, [InstrStage<121, [Port0, Port1]>] >,
- InstrItinData<IIC_INT, [InstrStage<127, [Port0, Port1]>] >,
- InstrItinData<IIC_INT3, [InstrStage<130, [Port0, Port1]>] >,
- InstrItinData<IIC_INVD, [InstrStage<1003, [Port0, Port1]>] >,
- InstrItinData<IIC_INVLPG, [InstrStage<71, [Port0, Port1]>] >,
- InstrItinData<IIC_IRET, [InstrStage<109, [Port0, Port1]>] >,
- InstrItinData<IIC_HLT, [InstrStage<121, [Port0, Port1]>] >,
- InstrItinData<IIC_LXS, [InstrStage<10, [Port0, Port1]>] >,
- InstrItinData<IIC_LTR, [InstrStage<83, [Port0, Port1]>] >,
- InstrItinData<IIC_RDTSC, [InstrStage<30, [Port0, Port1]>] >,
- InstrItinData<IIC_RDTSCP, [InstrStage<30, [Port0, Port1]>] >,
- InstrItinData<IIC_RSM, [InstrStage<741, [Port0, Port1]>] >,
- InstrItinData<IIC_SIDT, [InstrStage<4, [Port0, Port1]>] >,
- InstrItinData<IIC_SGDT, [InstrStage<4, [Port0, Port1]>] >,
- InstrItinData<IIC_SLDT, [InstrStage<3, [Port0, Port1]>] >,
- InstrItinData<IIC_STR, [InstrStage<3, [Port0, Port1]>] >,
- InstrItinData<IIC_SWAPGS, [InstrStage<22, [Port0, Port1]>] >,
- InstrItinData<IIC_SYSCALL, [InstrStage<96, [Port0, Port1]>] >,
- InstrItinData<IIC_SYS_ENTER_EXIT, [InstrStage<88, [Port0, Port1]>] >,
-
- InstrItinData<IIC_IN_RR, [InstrStage<94, [Port0, Port1]>] >,
- InstrItinData<IIC_IN_RI, [InstrStage<92, [Port0, Port1]>] >,
- InstrItinData<IIC_OUT_RR, [InstrStage<68, [Port0, Port1]>] >,
- InstrItinData<IIC_OUT_IR, [InstrStage<72, [Port0, Port1]>] >,
- InstrItinData<IIC_INS, [InstrStage<59, [Port0, Port1]>] >,
-
- InstrItinData<IIC_MOV_REG_DR, [InstrStage<88, [Port0, Port1]>] >,
- InstrItinData<IIC_MOV_DR_REG, [InstrStage<123, [Port0, Port1]>] >,
- // worst case for mov REG_CRx
- InstrItinData<IIC_MOV_REG_CR, [InstrStage<12, [Port0, Port1]>] >,
- InstrItinData<IIC_MOV_CR_REG, [InstrStage<136, [Port0, Port1]>] >,
-
- InstrItinData<IIC_MOV_REG_SR, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_MOV_MEM_SR, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_MOV_SR_REG, [InstrStage<21, [Port0, Port1]>] >,
- InstrItinData<IIC_MOV_SR_MEM, [InstrStage<26, [Port0, Port1]>] >,
- // LAR
- InstrItinData<IIC_LAR_RM, [InstrStage<50, [Port0, Port1]>] >,
- InstrItinData<IIC_LAR_RR, [InstrStage<54, [Port0, Port1]>] >,
- // LSL
- InstrItinData<IIC_LSL_RM, [InstrStage<46, [Port0, Port1]>] >,
- InstrItinData<IIC_LSL_RR, [InstrStage<49, [Port0, Port1]>] >,
-
- InstrItinData<IIC_LGDT, [InstrStage<44, [Port0, Port1]>] >,
- InstrItinData<IIC_LIDT, [InstrStage<44, [Port0, Port1]>] >,
- InstrItinData<IIC_LLDT_REG, [InstrStage<60, [Port0, Port1]>] >,
- InstrItinData<IIC_LLDT_MEM, [InstrStage<64, [Port0, Port1]>] >,
- // push control register, segment registers
- InstrItinData<IIC_PUSH_CS, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_PUSH_SR, [InstrStage<2, [Port0, Port1]>] >,
- // pop control register, segment registers
- InstrItinData<IIC_POP_SR, [InstrStage<29, [Port0, Port1]>] >,
- InstrItinData<IIC_POP_SR_SS, [InstrStage<48, [Port0, Port1]>] >,
- // VERR, VERW
- InstrItinData<IIC_VERR, [InstrStage<41, [Port0, Port1]>] >,
- InstrItinData<IIC_VERW_REG, [InstrStage<51, [Port0, Port1]>] >,
- InstrItinData<IIC_VERW_MEM, [InstrStage<50, [Port0, Port1]>] >,
- // WRMSR, RDMSR
- InstrItinData<IIC_WRMSR, [InstrStage<202, [Port0, Port1]>] >,
- InstrItinData<IIC_RDMSR, [InstrStage<78, [Port0, Port1]>] >,
- InstrItinData<IIC_RDPMC, [InstrStage<46, [Port0, Port1]>] >,
- // SMSW, LMSW
- InstrItinData<IIC_SMSW, [InstrStage<9, [Port0, Port1]>] >,
- InstrItinData<IIC_LMSW_REG, [InstrStage<69, [Port0, Port1]>] >,
- InstrItinData<IIC_LMSW_MEM, [InstrStage<67, [Port0, Port1]>] >,
-
- InstrItinData<IIC_ENTER, [InstrStage<32, [Port0, Port1]>] >,
- InstrItinData<IIC_LEAVE, [InstrStage<2, [Port0, Port1]>] >,
-
- InstrItinData<IIC_POP_MEM, [InstrStage<3, [Port0, Port1]>] >,
- InstrItinData<IIC_POP_REG16, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_POP_REG, [InstrStage<1, [Port0], 0>,
- InstrStage<1, [Port1]>] >,
- InstrItinData<IIC_POP_F, [InstrStage<32, [Port0, Port1]>] >,
- InstrItinData<IIC_POP_FD, [InstrStage<26, [Port0, Port1]>] >,
- InstrItinData<IIC_POP_A, [InstrStage<9, [Port0, Port1]>] >,
-
- InstrItinData<IIC_PUSH_IMM, [InstrStage<1, [Port0], 0>,
- InstrStage<1, [Port1]>] >,
- InstrItinData<IIC_PUSH_MEM, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_PUSH_REG, [InstrStage<1, [Port0], 0>,
- InstrStage<1, [Port1]>] >,
- InstrItinData<IIC_PUSH_F, [InstrStage<9, [Port0, Port1]>] >,
- InstrItinData<IIC_PUSH_A, [InstrStage<8, [Port0, Port1]>] >,
-
- InstrItinData<IIC_BSWAP, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_BIT_SCAN_MEM, [InstrStage<16, [Port0, Port1]>] >,
- InstrItinData<IIC_BIT_SCAN_REG, [InstrStage<16, [Port0, Port1]>] >,
- InstrItinData<IIC_MOVS, [InstrStage<3, [Port0, Port1]>] >,
- InstrItinData<IIC_STOS, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_SCAS, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_CMPS, [InstrStage<3, [Port0, Port1]>] >,
- InstrItinData<IIC_MOV, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_MOV_MEM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_AHF, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_BT_MI, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_BT_MR, [InstrStage<9, [Port0, Port1]>] >,
- InstrItinData<IIC_BT_RI, [InstrStage<1, [Port1]>] >,
- InstrItinData<IIC_BT_RR, [InstrStage<1, [Port1]>] >,
- InstrItinData<IIC_BTX_MI, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_BTX_MR, [InstrStage<11, [Port0, Port1]>] >,
- InstrItinData<IIC_BTX_RI, [InstrStage<1, [Port1]>] >,
- InstrItinData<IIC_BTX_RR, [InstrStage<1, [Port1]>] >,
- InstrItinData<IIC_XCHG_REG, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_XCHG_MEM, [InstrStage<3, [Port0, Port1]>] >,
- InstrItinData<IIC_XADD_REG, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_XADD_MEM, [InstrStage<3, [Port0, Port1]>] >,
- InstrItinData<IIC_CMPXCHG_MEM, [InstrStage<14, [Port0, Port1]>] >,
- InstrItinData<IIC_CMPXCHG_REG, [InstrStage<15, [Port0, Port1]>] >,
- InstrItinData<IIC_CMPXCHG_MEM8, [InstrStage<6, [Port0, Port1]>] >,
- InstrItinData<IIC_CMPXCHG_REG8, [InstrStage<9, [Port0, Port1]>] >,
- InstrItinData<IIC_CMPXCHG_8B, [InstrStage<18, [Port0, Port1]>] >,
- InstrItinData<IIC_CMPXCHG_16B, [InstrStage<22, [Port0, Port1]>] >,
- InstrItinData<IIC_LODS, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_OUTS, [InstrStage<74, [Port0, Port1]>] >,
- InstrItinData<IIC_CLC, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_CLD, [InstrStage<3, [Port0, Port1]>] >,
- InstrItinData<IIC_CLI, [InstrStage<14, [Port0, Port1]>] >,
- InstrItinData<IIC_CMC, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_CLTS, [InstrStage<33, [Port0, Port1]>] >,
- InstrItinData<IIC_STC, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_STI, [InstrStage<17, [Port0, Port1]>] >,
- InstrItinData<IIC_STD, [InstrStage<21, [Port0, Port1]>] >,
- InstrItinData<IIC_XLAT, [InstrStage<6, [Port0, Port1]>] >,
- InstrItinData<IIC_AAA, [InstrStage<13, [Port0, Port1]>] >,
- InstrItinData<IIC_AAD, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_AAM, [InstrStage<21, [Port0, Port1]>] >,
- InstrItinData<IIC_AAS, [InstrStage<13, [Port0, Port1]>] >,
- InstrItinData<IIC_DAA, [InstrStage<18, [Port0, Port1]>] >,
- InstrItinData<IIC_DAS, [InstrStage<20, [Port0, Port1]>] >,
- InstrItinData<IIC_BOUND, [InstrStage<11, [Port0, Port1]>] >,
- InstrItinData<IIC_ARPL_REG, [InstrStage<24, [Port0, Port1]>] >,
- InstrItinData<IIC_ARPL_MEM, [InstrStage<23, [Port0, Port1]>] >,
- InstrItinData<IIC_MOVBE, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_CBW, [InstrStage<4, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_EMMS, [InstrStage<5, [Port0, Port1]>] >,
-
- InstrItinData<IIC_NOP, [InstrStage<1, [Port0, Port1]>] >
- ]>;
// Atom machine model.
def AtomModel : SchedMachineModel {
let IssueWidth = 2; // Allows 2 instructions per scheduling group.
let MicroOpBufferSize = 0; // In-order execution, always hide latency.
- let LoadLatency = 3; // Expected cycles, may be overriden by OperandCycles.
- let HighLatency = 30;// Expected, may be overriden by OperandCycles.
+  let LoadLatency = 3; // Expected cycles, may be overridden.
+  let HighLatency = 30; // Expected, may be overridden.
// On the Atom, the throughput for taken branches is 2 cycles. For small
// simple loops, expand by a small factor to hide the backedge cost.
let LoopMicroOpBufferSize = 10;
let PostRAScheduler = 1;
let CompleteModel = 0;
+}
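For reference, a scheduling model like this one is attached to a CPU elsewhere (in X86.td) through the generic ProcessorModel class from Target.td. A minimal sketch, with a CPU name and an abbreviated feature list that are assumptions rather than text from this patch:

    def : ProcessorModel<"bonnell", AtomModel,
                         [FeatureSSSE3, FeatureCMPXCHG16B, FeatureMOVBE]>;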
+
+let SchedModel = AtomModel in {
+
+// Functional Units
+def AtomPort0 : ProcResource<1>; // ALU: ALU0, shift/rotate, load/store
+                                 // SIMD/FP: SIMD ALU, Shuffle, SIMD/FP multiply, divide
+def AtomPort1 : ProcResource<1>; // ALU: ALU1, bit processing, jump, and LEA
+ // SIMD/FP: SIMD ALU, FP Adder
+
+def AtomPort01 : ProcResGroup<[AtomPort0, AtomPort1]>;
+
+// Loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 3>;
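The practical effect, sketched with an illustrative instruction (the cycle breakdown below is an informal reading of the ReadAdvance, not taken from the patch):

    // For a load-op such as ADD32rm, whose register source reads ReadAfterLd,
    // the operand only has to be ready 3 cycles after the micro-op issues, so
    // the register read overlaps the load:
    //   cycles 0-2 : address generation and load
    //   cycle  3   : ALU operation consumes the register operand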
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when dispatched by the schedulers.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass AtomWriteResPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> RRPorts,
+ list<ProcResourceKind> RMPorts,
+ int RRLat = 1, int RMLat = 1,
+ list<int> RRRes = [1],
+ list<int> RMRes = [1]> {
+  // Register variant uses the RRPorts resources for RRLat cycles.
+ def : WriteRes<SchedRW, RRPorts> {
+ let Latency = RRLat;
+ let ResourceCycles = RRRes;
+ }
+
+  // Memory (folded-load) variant uses the RMPorts resources for RMLat
+  // cycles.
+ def : WriteRes<SchedRW.Folded, RMPorts> {
+ let Latency = RMLat;
+ let ResourceCycles = RMRes;
+ }
+}
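As a worked example of what one instantiation expands to (a sketch; WriteALULd is the folded-load write named by WriteALU.Folded in the X86SchedWritePair definitions of X86Schedule.td):

    // defm : AtomWriteResPair<WriteALU, [AtomPort01], [AtomPort0]>;
    // with the default latency/resource arguments is equivalent to:
    //   def : WriteRes<WriteALU,   [AtomPort01]> { let Latency = 1; let ResourceCycles = [1]; }
    //   def : WriteRes<WriteALULd, [AtomPort0]>  { let Latency = 1; let ResourceCycles = [1]; }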
+
+// A folded store needs a cycle on Port0 for the store data.
+def : WriteRes<WriteRMW, [AtomPort0]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Arithmetic.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : AtomWriteResPair<WriteALU, [AtomPort01], [AtomPort0]>;
+defm : AtomWriteResPair<WriteADC, [AtomPort01], [AtomPort0]>;
+defm : AtomWriteResPair<WriteIMul, [AtomPort01], [AtomPort01], 7, 7, [7], [7]>;
+defm : AtomWriteResPair<WriteIMul64, [AtomPort01], [AtomPort01], 12, 12, [12], [12]>;
+
+defm : AtomWriteResPair<WriteBSWAP32, [AtomPort0], [AtomPort0]>;
+defm : AtomWriteResPair<WriteBSWAP64, [AtomPort0], [AtomPort0]>;
+
+defm : AtomWriteResPair<WriteDiv8, [AtomPort01], [AtomPort01], 50, 68, [50], [68]>;
+defm : AtomWriteResPair<WriteDiv16, [AtomPort01], [AtomPort01], 50, 50, [50], [50]>;
+defm : AtomWriteResPair<WriteDiv32, [AtomPort01], [AtomPort01], 50, 50, [50], [50]>;
+defm : AtomWriteResPair<WriteDiv64, [AtomPort01], [AtomPort01],130,130,[130],[130]>;
+defm : AtomWriteResPair<WriteIDiv8, [AtomPort01], [AtomPort01], 62, 62, [62], [62]>;
+defm : AtomWriteResPair<WriteIDiv16, [AtomPort01], [AtomPort01], 62, 62, [62], [62]>;
+defm : AtomWriteResPair<WriteIDiv32, [AtomPort01], [AtomPort01], 62, 62, [62], [62]>;
+defm : AtomWriteResPair<WriteIDiv64, [AtomPort01], [AtomPort01],130,130,[130],[130]>;
+
+defm : X86WriteResPairUnsupported<WriteCRC32>;
+
+defm : AtomWriteResPair<WriteCMOV, [AtomPort01], [AtomPort0]>;
+defm : AtomWriteResPair<WriteCMOV2, [AtomPort01], [AtomPort0]>;
+defm : X86WriteRes<WriteFCMOV, [AtomPort01], 9, [9], 1>; // x87 conditional move.
+
+def : WriteRes<WriteSETCC, [AtomPort01]>;
+def : WriteRes<WriteSETCCStore, [AtomPort01]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteLAHFSAHF, [AtomPort01]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+
+defm : X86WriteResUnsupported<WriteIMulH>;
+
+// This is for simple LEAs with one or two input operands.
+def : WriteRes<WriteLEA, [AtomPort1]>;
+
+def AtomWriteIMul16Ld : SchedWriteRes<[AtomPort01]> {
+ let Latency = 8;
+ let ResourceCycles = [8];
+}
+def : InstRW<[AtomWriteIMul16Ld], (instrs MUL16m, IMUL16m)>;
+
+def AtomWriteIMul32 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 6;
+ let ResourceCycles = [6];
+}
+def : InstRW<[AtomWriteIMul32], (instrs MUL32r, IMUL32r)>;
+
+def AtomWriteIMul64I : SchedWriteRes<[AtomPort01]> {
+ let Latency = 14;
+ let ResourceCycles = [14];
+}
+def : InstRW<[AtomWriteIMul64I], (instrs IMUL64rri8, IMUL64rri32,
+ IMUL64rmi8, IMUL64rmi32)>;
+
+// Bit counts.
+defm : AtomWriteResPair<WriteBSF, [AtomPort01], [AtomPort01], 16, 16, [16], [16]>;
+defm : AtomWriteResPair<WriteBSR, [AtomPort01], [AtomPort01], 16, 16, [16], [16]>;
+defm : X86WriteResPairUnsupported<WritePOPCNT>;
+defm : X86WriteResPairUnsupported<WriteLZCNT>;
+defm : X86WriteResPairUnsupported<WriteTZCNT>;
+
+// BMI1 BEXTR, BMI2 BZHI
+defm : X86WriteResPairUnsupported<WriteBEXTR>;
+defm : X86WriteResPairUnsupported<WriteBZHI>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Integer shifts and rotates.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : AtomWriteResPair<WriteShift, [AtomPort0], [AtomPort0]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Double shift instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : AtomWriteResPair<WriteShiftDouble, [AtomPort0], [AtomPort0]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Loads, stores, and moves, not folded with other operations.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteLoad, [AtomPort0]>;
+def : WriteRes<WriteStore, [AtomPort0]>;
+def : WriteRes<WriteStoreNT, [AtomPort0]>;
+def : WriteRes<WriteMove, [AtomPort01]>;
+
+// Treat misc copies as a move.
+def : InstRW<[WriteMove], (instrs COPY)>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteZero, []>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : AtomWriteResPair<WriteJump, [AtomPort1], [AtomPort1]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Special case scheduling classes.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteSystem, [AtomPort01]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [AtomPort01]> { let Latency = 100; }
+def : WriteRes<WriteFence, [AtomPort0]>;
+
+// Nops don't have dependencies, so there's no actual latency, but we set this
+// to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
+def : WriteRes<WriteNop, [AtomPort01]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Floating point. This covers both scalar and vector operations.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : X86WriteRes<WriteFLD0, [AtomPort01], 1, [1], 1>;
+defm : X86WriteRes<WriteFLD1, [AtomPort01], 6, [6], 1>;
+def : WriteRes<WriteFLoad, [AtomPort0]>;
+def : WriteRes<WriteFLoadX, [AtomPort0]>;
+defm : X86WriteResUnsupported<WriteFLoadY>;
+defm : X86WriteResUnsupported<WriteFMaskedLoad>;
+defm : X86WriteResUnsupported<WriteFMaskedLoadY>;
+
+def : WriteRes<WriteFStore, [AtomPort0]>;
+def : WriteRes<WriteFStoreX, [AtomPort0]>;
+defm : X86WriteResUnsupported<WriteFStoreY>;
+def : WriteRes<WriteFStoreNT, [AtomPort0]>;
+def : WriteRes<WriteFStoreNTX, [AtomPort0]>;
+defm : X86WriteResUnsupported<WriteFStoreNTY>;
+defm : X86WriteResUnsupported<WriteFMaskedStore>;
+defm : X86WriteResUnsupported<WriteFMaskedStoreY>;
+
+def : WriteRes<WriteFMove, [AtomPort01]>;
+def : WriteRes<WriteFMoveX, [AtomPort01]>;
+defm : X86WriteResUnsupported<WriteFMoveY>;
+
+defm : X86WriteRes<WriteEMMS, [AtomPort01], 5, [5], 1>;
+
+defm : AtomWriteResPair<WriteFAdd, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : AtomWriteResPair<WriteFAddX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : X86WriteResPairUnsupported<WriteFAddY>;
+defm : X86WriteResPairUnsupported<WriteFAddZ>;
+defm : AtomWriteResPair<WriteFAdd64, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : AtomWriteResPair<WriteFAdd64X, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : X86WriteResPairUnsupported<WriteFAdd64Y>;
+defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
+defm : AtomWriteResPair<WriteFCmp, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : AtomWriteResPair<WriteFCmpX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : X86WriteResPairUnsupported<WriteFCmpY>;
+defm : X86WriteResPairUnsupported<WriteFCmpZ>;
+defm : AtomWriteResPair<WriteFCmp64, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : AtomWriteResPair<WriteFCmp64X, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : X86WriteResPairUnsupported<WriteFCmp64Y>;
+defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
+defm : AtomWriteResPair<WriteFCom, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : AtomWriteResPair<WriteFMul, [AtomPort0], [AtomPort0], 4, 4, [4], [4]>;
+defm : AtomWriteResPair<WriteFMulX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : X86WriteResPairUnsupported<WriteFMulY>;
+defm : X86WriteResPairUnsupported<WriteFMulZ>;
+defm : AtomWriteResPair<WriteFMul64, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : AtomWriteResPair<WriteFMul64X, [AtomPort01], [AtomPort01], 9, 10, [9], [10]>;
+defm : X86WriteResPairUnsupported<WriteFMul64Y>;
+defm : X86WriteResPairUnsupported<WriteFMul64Z>;
+defm : AtomWriteResPair<WriteFRcp, [AtomPort0], [AtomPort0], 4, 4, [4], [4]>;
+defm : AtomWriteResPair<WriteFRcpX, [AtomPort01], [AtomPort01], 9, 10, [9], [10]>;
+defm : X86WriteResPairUnsupported<WriteFRcpY>;
+defm : X86WriteResPairUnsupported<WriteFRcpZ>;
+defm : AtomWriteResPair<WriteFRsqrt, [AtomPort0], [AtomPort0], 4, 4, [4], [4]>;
+defm : AtomWriteResPair<WriteFRsqrtX, [AtomPort01], [AtomPort01], 9, 10, [9], [10]>;
+defm : X86WriteResPairUnsupported<WriteFRsqrtY>;
+defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
+defm : AtomWriteResPair<WriteFDiv, [AtomPort01], [AtomPort01], 34, 34, [34], [34]>;
+defm : AtomWriteResPair<WriteFDivX, [AtomPort01], [AtomPort01], 70, 70, [70], [70]>;
+defm : X86WriteResPairUnsupported<WriteFDivY>;
+defm : X86WriteResPairUnsupported<WriteFDivZ>;
+defm : AtomWriteResPair<WriteFDiv64, [AtomPort01], [AtomPort01], 62, 62, [62], [62]>;
+defm : AtomWriteResPair<WriteFDiv64X, [AtomPort01], [AtomPort01],125,125,[125],[125]>;
+defm : X86WriteResPairUnsupported<WriteFDiv64Y>;
+defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
+defm : AtomWriteResPair<WriteFSqrt, [AtomPort01], [AtomPort01], 34, 34, [34], [34]>;
+defm : AtomWriteResPair<WriteFSqrtX, [AtomPort01], [AtomPort01], 70, 70, [70], [70]>;
+defm : X86WriteResPairUnsupported<WriteFSqrtY>;
+defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
+defm : AtomWriteResPair<WriteFSqrt64, [AtomPort01], [AtomPort01], 62, 62, [62], [62]>;
+defm : AtomWriteResPair<WriteFSqrt64X, [AtomPort01], [AtomPort01],125,125,[125],[125]>;
+defm : X86WriteResPairUnsupported<WriteFSqrt64Y>;
+defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
+defm : AtomWriteResPair<WriteFSqrt80, [AtomPort01], [AtomPort01], 71, 71, [71], [71]>;
+defm : AtomWriteResPair<WriteFSign, [AtomPort1], [AtomPort1]>;
+defm : AtomWriteResPair<WriteFRnd, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : X86WriteResPairUnsupported<WriteFRndY>;
+defm : X86WriteResPairUnsupported<WriteFRndZ>;
+defm : AtomWriteResPair<WriteFLogic, [AtomPort01], [AtomPort0]>;
+defm : X86WriteResPairUnsupported<WriteFLogicY>;
+defm : X86WriteResPairUnsupported<WriteFLogicZ>;
+defm : AtomWriteResPair<WriteFTest, [AtomPort01], [AtomPort0]>;
+defm : X86WriteResPairUnsupported<WriteFTestY>;
+defm : X86WriteResPairUnsupported<WriteFTestZ>;
+defm : AtomWriteResPair<WriteFShuffle, [AtomPort0], [AtomPort0]>;
+defm : X86WriteResPairUnsupported<WriteFShuffleY>;
+defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffle>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffleY>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
+defm : X86WriteResPairUnsupported<WriteFMA>;
+defm : X86WriteResPairUnsupported<WriteFMAX>;
+defm : X86WriteResPairUnsupported<WriteFMAY>;
+defm : X86WriteResPairUnsupported<WriteFMAZ>;
+defm : X86WriteResPairUnsupported<WriteDPPD>;
+defm : X86WriteResPairUnsupported<WriteDPPS>;
+defm : X86WriteResPairUnsupported<WriteDPPSY>;
+defm : X86WriteResPairUnsupported<WriteDPPSZ>;
+defm : X86WriteResPairUnsupported<WriteFBlend>;
+defm : X86WriteResPairUnsupported<WriteFBlendY>;
+defm : X86WriteResPairUnsupported<WriteFBlendZ>;
+defm : X86WriteResPairUnsupported<WriteFVarBlend>;
+defm : X86WriteResPairUnsupported<WriteFVarBlendY>;
+defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
+defm : X86WriteResPairUnsupported<WriteFShuffle256>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Conversions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : AtomWriteResPair<WriteCvtSS2I, [AtomPort01], [AtomPort01], 8, 9, [8], [9]>;
+defm : AtomWriteResPair<WriteCvtPS2I, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2IY>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
+defm : AtomWriteResPair<WriteCvtSD2I, [AtomPort01], [AtomPort01], 8, 9, [8], [9]>;
+defm : AtomWriteResPair<WriteCvtPD2I, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2IY>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
+
+defm : AtomWriteResPair<WriteCvtI2SS, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : AtomWriteResPair<WriteCvtI2PS, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PSY>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
+defm : AtomWriteResPair<WriteCvtI2SD, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : AtomWriteResPair<WriteCvtI2PD, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PDY>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
+
+defm : AtomWriteResPair<WriteCvtSS2SD, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : AtomWriteResPair<WriteCvtPS2PD, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2PDY>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
+defm : AtomWriteResPair<WriteCvtSD2SS, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : AtomWriteResPair<WriteCvtPD2PS, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2PSY>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
+
+defm : X86WriteResPairUnsupported<WriteCvtPH2PS>;
+defm : X86WriteResPairUnsupported<WriteCvtPH2PSY>;
+defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;
+defm : X86WriteResUnsupported<WriteCvtPS2PH>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHSt>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHY>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHYSt>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Vector integer operations.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteVecLoad, [AtomPort0]>;
+def : WriteRes<WriteVecLoadX, [AtomPort0]>;
+defm : X86WriteResUnsupported<WriteVecLoadY>;
+def : WriteRes<WriteVecLoadNT, [AtomPort0]>;
+defm : X86WriteResUnsupported<WriteVecLoadNTY>;
+defm : X86WriteResUnsupported<WriteVecMaskedLoad>;
+defm : X86WriteResUnsupported<WriteVecMaskedLoadY>;
+
+def : WriteRes<WriteVecStore, [AtomPort0]>;
+def : WriteRes<WriteVecStoreX, [AtomPort0]>;
+defm : X86WriteResUnsupported<WriteVecStoreY>;
+def : WriteRes<WriteVecStoreNT, [AtomPort0]>;
+defm : X86WriteResUnsupported<WriteVecStoreNTY>;
+def : WriteRes<WriteVecMaskedStore, [AtomPort0]>;
+defm : X86WriteResUnsupported<WriteVecMaskedStoreY>;
+
+def : WriteRes<WriteVecMove, [AtomPort0]>;
+def : WriteRes<WriteVecMoveX, [AtomPort01]>;
+defm : X86WriteResUnsupported<WriteVecMoveY>;
+defm : X86WriteRes<WriteVecMoveToGpr, [AtomPort0], 3, [3], 1>;
+defm : X86WriteRes<WriteVecMoveFromGpr, [AtomPort0], 1, [1], 1>;
+
+defm : AtomWriteResPair<WriteVecALU, [AtomPort01], [AtomPort0], 1, 1>;
+defm : AtomWriteResPair<WriteVecALUX, [AtomPort01], [AtomPort0], 1, 1>;
+defm : X86WriteResPairUnsupported<WriteVecALUY>;
+defm : X86WriteResPairUnsupported<WriteVecALUZ>;
+defm : AtomWriteResPair<WriteVecLogic, [AtomPort01], [AtomPort0], 1, 1>;
+defm : AtomWriteResPair<WriteVecLogicX, [AtomPort01], [AtomPort0], 1, 1>;
+defm : X86WriteResPairUnsupported<WriteVecLogicY>;
+defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
+defm : AtomWriteResPair<WriteVecTest, [AtomPort01], [AtomPort0], 1, 1>;
+defm : X86WriteResPairUnsupported<WriteVecTestY>;
+defm : X86WriteResPairUnsupported<WriteVecTestZ>;
+defm : AtomWriteResPair<WriteVecShift, [AtomPort01], [AtomPort01], 2, 3, [2], [3]>;
+defm : AtomWriteResPair<WriteVecShiftX, [AtomPort01], [AtomPort01], 2, 3, [2], [3]>;
+defm : X86WriteResPairUnsupported<WriteVecShiftY>;
+defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
+defm : AtomWriteResPair<WriteVecShiftImm, [AtomPort01], [AtomPort01], 1, 1, [1], [1]>;
+defm : AtomWriteResPair<WriteVecShiftImmX, [AtomPort01], [AtomPort01], 1, 1, [1], [1]>;
+defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
+defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
+defm : AtomWriteResPair<WriteVecIMul, [AtomPort0], [AtomPort0], 4, 4, [4], [4]>;
+defm : AtomWriteResPair<WriteVecIMulX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : X86WriteResPairUnsupported<WriteVecIMulY>;
+defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
+defm : X86WriteResPairUnsupported<WritePMULLD>;
+defm : X86WriteResPairUnsupported<WritePMULLDY>;
+defm : X86WriteResPairUnsupported<WritePMULLDZ>;
+defm : X86WriteResPairUnsupported<WritePHMINPOS>;
+defm : X86WriteResPairUnsupported<WriteMPSAD>;
+defm : X86WriteResPairUnsupported<WriteMPSADY>;
+defm : X86WriteResPairUnsupported<WriteMPSADZ>;
+defm : AtomWriteResPair<WritePSADBW, [AtomPort01], [AtomPort01], 4, 4, [4], [4]>;
+defm : AtomWriteResPair<WritePSADBWX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : X86WriteResPairUnsupported<WritePSADBWY>;
+defm : X86WriteResPairUnsupported<WritePSADBWZ>;
+defm : AtomWriteResPair<WriteShuffle, [AtomPort0], [AtomPort0], 1, 1>;
+defm : AtomWriteResPair<WriteShuffleX, [AtomPort0], [AtomPort0], 1, 1>;
+defm : X86WriteResPairUnsupported<WriteShuffleY>;
+defm : X86WriteResPairUnsupported<WriteShuffleZ>;
+defm : AtomWriteResPair<WriteVarShuffle, [AtomPort0], [AtomPort0], 1, 1>;
+defm : AtomWriteResPair<WriteVarShuffleX, [AtomPort01], [AtomPort01], 4, 5, [4], [5]>;
+defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
+defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
+defm : X86WriteResPairUnsupported<WriteBlend>;
+defm : X86WriteResPairUnsupported<WriteBlendY>;
+defm : X86WriteResPairUnsupported<WriteBlendZ>;
+defm : X86WriteResPairUnsupported<WriteVarBlend>;
+defm : X86WriteResPairUnsupported<WriteVarBlendY>;
+defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
+defm : X86WriteResPairUnsupported<WriteShuffle256>;
+defm : X86WriteResPairUnsupported<WriteVarShuffle256>;
+defm : X86WriteResPairUnsupported<WriteVarVecShift>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Vector insert/extract operations.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : AtomWriteResPair<WriteVecInsert, [AtomPort0], [AtomPort0], 1, 1>;
+def : WriteRes<WriteVecExtract, [AtomPort0]>;
+def : WriteRes<WriteVecExtractSt, [AtomPort0]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// SSE42 String instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : X86WriteResPairUnsupported<WritePCmpIStrI>;
+defm : X86WriteResPairUnsupported<WritePCmpIStrM>;
+defm : X86WriteResPairUnsupported<WritePCmpEStrI>;
+defm : X86WriteResPairUnsupported<WritePCmpEStrM>;
+
+////////////////////////////////////////////////////////////////////////////////
+// MOVMSK Instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteFMOVMSK, [AtomPort0]> { let Latency = 3; let ResourceCycles = [3]; }
+def : WriteRes<WriteVecMOVMSK, [AtomPort0]> { let Latency = 3; let ResourceCycles = [3]; }
+defm : X86WriteResUnsupported<WriteVecMOVMSKY>;
+def : WriteRes<WriteMMXMOVMSK, [AtomPort0]> { let Latency = 3; let ResourceCycles = [3]; }
+
+////////////////////////////////////////////////////////////////////////////////
+// AES instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : X86WriteResPairUnsupported<WriteAESIMC>;
+defm : X86WriteResPairUnsupported<WriteAESKeyGen>;
+defm : X86WriteResPairUnsupported<WriteAESDecEnc>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : AtomWriteResPair<WriteFHAdd, [AtomPort01], [AtomPort01], 8, 9, [8], [9]>;
+defm : AtomWriteResPair<WriteFHAddY, [AtomPort01], [AtomPort01], 8, 9, [8], [9]>;
+defm : AtomWriteResPair<WritePHAdd, [AtomPort01], [AtomPort01], 3, 4, [3], [4]>;
+defm : AtomWriteResPair<WritePHAddX, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>;
+defm : AtomWriteResPair<WritePHAddY, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Carry-less multiplication instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : X86WriteResPairUnsupported<WriteCLMul>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Load/store MXCSR.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteLDMXCSR, [AtomPort01]> { let Latency = 5; let ResourceCycles = [5]; }
+def : WriteRes<WriteSTMXCSR, [AtomPort01]> { let Latency = 15; let ResourceCycles = [15]; }
+
+////////////////////////////////////////////////////////////////////////////////
+// Special Cases.
+////////////////////////////////////////////////////////////////////////////////
+
+// Port0
+def AtomWrite0_1 : SchedWriteRes<[AtomPort0]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+}
+def : InstRW<[AtomWrite0_1], (instrs FXAM, LD_Frr,
+ MOVSX64rr32)>;
+def : SchedAlias<WriteALURMW, AtomWrite0_1>;
+def : SchedAlias<WriteADCRMW, AtomWrite0_1>;
+def : InstRW<[AtomWrite0_1], (instregex "(RCL|RCR|ROL|ROR|SAR|SHL|SHR)(8|16|32|64)m",
+ "MOV(S|Z)X(32|64)rr(8|8_NOREX|16)")>;
+
+def AtomWrite0_5 : SchedWriteRes<[AtomPort0]> {
+ let Latency = 5;
+ let ResourceCycles = [5];
+}
+def : InstRW<[AtomWrite0_5], (instregex "IMUL32(rm|rr)")>;
+
+// Port1
+def AtomWrite1_1 : SchedWriteRes<[AtomPort1]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+}
+def : InstRW<[AtomWrite1_1], (instrs FCOMPP)>;
+def : InstRW<[AtomWrite1_1], (instregex "UCOM_F(P|PP)?r",
+ "BT(C|R|S)?(16|32|64)(rr|ri8)")>;
+
+def AtomWrite1_5 : SchedWriteRes<[AtomPort1]> {
+ let Latency = 5;
+ let ResourceCycles = [5];
+}
+def : InstRW<[AtomWrite1_5], (instrs MMX_CVTPI2PSirr, MMX_CVTPI2PSirm,
+ MMX_CVTPS2PIirr, MMX_CVTTPS2PIirr)>;
+
+// Port0 and Port1
+def AtomWrite0_1_1 : SchedWriteRes<[AtomPort0, AtomPort1]> {
+ let Latency = 1;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[AtomWrite0_1_1], (instrs POP32r, POP64r,
+ POP16rmr, POP32rmr, POP64rmr,
+ PUSH16r, PUSH32r, PUSH64r,
+ PUSHi16, PUSHi32,
+ PUSH16rmr, PUSH32rmr, PUSH64rmr,
+ PUSH16i8, PUSH32i8, PUSH64i8, PUSH64i32,
+ XCH_F)>;
+def : InstRW<[AtomWrite0_1_1], (instregex "RETI(L|Q|W)$",
+ "IRET(16|32|64)?")>;
+
+def AtomWrite0_1_5 : SchedWriteRes<[AtomPort0, AtomPort1]> {
+ let Latency = 5;
+ let ResourceCycles = [5, 5];
+}
+def : InstRW<[AtomWrite0_1_5], (instrs MMX_CVTPS2PIirm, MMX_CVTTPS2PIirm)>;
+def : InstRW<[AtomWrite0_1_5], (instregex "ILD_F(16|32|64)")>;
- let Itineraries = AtomItineraries;
+// Port0 or Port1
+def AtomWrite01_1 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
}
+def : InstRW<[AtomWrite01_1], (instrs FDECSTP, FFREE, FFREEP, FINCSTP, WAIT,
+ LFENCE,
+ STOSB, STOSL, STOSQ, STOSW,
+ MOVSSrr, MOVSSrr_REV,
+ PSLLDQri, PSRLDQri)>;
+def : InstRW<[AtomWrite01_1], (instregex "MMX_PACK(SSDW|SSWB|USWB)irr",
+ "MMX_PUNPCKH(BW|DQ|WD)irr")>;
+
+def AtomWrite01_2 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+def : InstRW<[AtomWrite01_2], (instrs LEAVE, LEAVE64, POP16r,
+ PUSH16rmm, PUSH32rmm, PUSH64rmm,
+ LODSB, LODSL, LODSQ, LODSW,
+ SCASB, SCASL, SCASQ, SCASW,
+ SHLD32rrCL, SHRD32rrCL,
+ SHLD32rri8, SHRD32rri8)>;
+def : InstRW<[AtomWrite01_2], (instregex "BT(C|R|S)(16|32|64)mi8",
+ "PUSH(CS|DS|ES|FS|GS|SS)(16|32|64)",
+ "XADD(8|16|32|64)rr",
+ "XCHG(8|16|32|64)(ar|rr)",
+ "(ST|ISTT)_F(P)?(16|32|64)?(m|rr)",
+ "MMX_P(ADD|SUB)Qirr",
+ "MOV(S|Z)X16rr8",
+ "MOV(UPS|UPD|DQU)mr",
+ "MASKMOVDQU(64)?",
+ "P(ADD|SUB)Qrr")>;
+
+def AtomWrite01_3 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 3;
+ let ResourceCycles = [3];
+}
+def : InstRW<[AtomWrite01_3], (instrs CLD, LDDQUrm,
+ CMPSB, CMPSL, CMPSQ, CMPSW,
+ MOVSB, MOVSL, MOVSQ, MOVSW,
+ POP16rmm, POP32rmm, POP64rmm)>;
+def : InstRW<[AtomWrite01_3], (instregex "XADD(8|16|32|64)rm",
+ "XCHG(8|16|32|64)rm",
+ "PH(ADD|SUB)Drr",
+ "MOV(S|Z)X16rm8",
+ "MMX_P(ADD|SUB)Qirm",
+ "MOV(UPS|UPD|DQU)rm",
+ "P(ADD|SUB)Qrm")>;
+
+def AtomWrite01_4 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 4;
+ let ResourceCycles = [4];
+}
+def : InstRW<[AtomWrite01_4], (instrs CBW, CWD, CWDE, CDQ, CDQE, CQO,
+ JCXZ, JECXZ, JRCXZ,
+ SHLD32mrCL, SHRD32mrCL,
+ SHLD32mri8, SHRD32mri8,
+ LD_F80m)>;
+def : InstRW<[AtomWrite01_4], (instregex "PH(ADD|SUB)Drm",
+ "(MMX_)?PEXTRWrr(_REV)?")>;
+
+def AtomWrite01_5 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 5;
+ let ResourceCycles = [5];
+}
+def : InstRW<[AtomWrite01_5], (instrs FLDCW16m, ST_FP80m)>;
+def : InstRW<[AtomWrite01_5], (instregex "MMX_PH(ADD|SUB)S?Wrr")>;
+
+def AtomWrite01_6 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 6;
+ let ResourceCycles = [6];
+}
+def : InstRW<[AtomWrite01_6], (instrs CMPXCHG8rm, INTO, XLAT,
+ SHLD16rrCL, SHRD16rrCL,
+ SHLD16rri8, SHRD16rri8,
+ SHLD16mrCL, SHRD16mrCL,
+ SHLD16mri8, SHRD16mri8)>;
+def : InstRW<[AtomWrite01_6], (instregex "IMUL16rr",
+ "IST_F(P)?(16|32|64)?m",
+ "MMX_PH(ADD|SUB)S?Wrm")>;
+
+def AtomWrite01_7 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 7;
+ let ResourceCycles = [7];
+}
+def : InstRW<[AtomWrite01_7], (instrs AAD8i8)>;
+
+def AtomWrite01_8 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 8;
+ let ResourceCycles = [8];
+}
+def : InstRW<[AtomWrite01_8], (instrs LOOPE,
+ PUSHA16, PUSHA32,
+ SHLD64rrCL, SHRD64rrCL,
+ FNSTCW16m)>;
+
+def AtomWrite01_9 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 9;
+ let ResourceCycles = [9];
+}
+def : InstRW<[AtomWrite01_9], (instrs BT16mr, BT32mr, BT64mr,
+ POPA16, POPA32,
+ PUSHF16, PUSHF32, PUSHF64,
+ SHLD64mrCL, SHRD64mrCL,
+ SHLD64mri8, SHRD64mri8,
+ SHLD64rri8, SHRD64rri8,
+ CMPXCHG8rr)>;
+def : InstRW<[AtomWrite01_9], (instregex "(U)?COM_FI", "TST_F",
+ "(U)?COMIS(D|S)rr",
+ "CVT(T)?SS2SI64rr(_Int)?")>;
+
+def AtomWrite01_10 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 10;
+ let ResourceCycles = [10];
+}
+def : SchedAlias<WriteFLDC, AtomWrite01_10>;
+def : InstRW<[AtomWrite01_10], (instregex "(U)?COMIS(D|S)rm",
+ "CVT(T)?SS2SI64rm(_Int)?")>;
+
+def AtomWrite01_11 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 11;
+ let ResourceCycles = [11];
+}
+def : InstRW<[AtomWrite01_11], (instrs BOUNDS16rm, BOUNDS32rm)>;
+def : InstRW<[AtomWrite01_11], (instregex "BT(C|R|S)(16|32|64)mr")>;
+
+def AtomWrite01_13 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 13;
+ let ResourceCycles = [13];
+}
+def : InstRW<[AtomWrite01_13], (instrs AAA, AAS)>;
+
+def AtomWrite01_14 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 14;
+ let ResourceCycles = [14];
+}
+def : InstRW<[AtomWrite01_14], (instrs CMPXCHG16rm, CMPXCHG32rm, CMPXCHG64rm)>;
+
+def AtomWrite01_15 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 15;
+ let ResourceCycles = [15];
+}
+def : InstRW<[AtomWrite01_15], (instrs CMPXCHG16rr, CMPXCHG32rr, CMPXCHG64rr)>;
+
+def AtomWrite01_17 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 17;
+ let ResourceCycles = [17];
+}
+def : InstRW<[AtomWrite01_17], (instrs LOOPNE, PAUSE)>;
+
+def AtomWrite01_18 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 18;
+ let ResourceCycles = [18];
+}
+def : InstRW<[AtomWrite01_18], (instrs CMPXCHG8B, DAA, LOOP)>;
+
+def AtomWrite01_20 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 20;
+ let ResourceCycles = [20];
+}
+def : InstRW<[AtomWrite01_20], (instrs DAS)>;
+
+def AtomWrite01_21 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 21;
+ let ResourceCycles = [21];
+}
+def : InstRW<[AtomWrite01_21], (instrs AAM8i8, STD)>;
+
+def AtomWrite01_22 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 22;
+ let ResourceCycles = [22];
+}
+def : InstRW<[AtomWrite01_22], (instrs CMPXCHG16B)>;
+
+def AtomWrite01_23 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 23;
+ let ResourceCycles = [23];
+}
+def : InstRW<[AtomWrite01_23], (instrs ARPL16mr, ARPL16rr)>;
+
+def AtomWrite01_25 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 25;
+ let ResourceCycles = [25];
+}
+def : InstRW<[AtomWrite01_25], (instrs FNCLEX, FXTRACT)>;
+
+def AtomWrite01_26 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 26;
+ let ResourceCycles = [26];
+}
+def : InstRW<[AtomWrite01_26], (instrs POPF32, POPF64)>;
+
+def AtomWrite01_29 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 29;
+ let ResourceCycles = [29];
+}
+def : InstRW<[AtomWrite01_29], (instregex "POP(DS|ES|FS|GS)(16|32|64)")>;
+
+def AtomWrite01_30 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 30;
+ let ResourceCycles = [30];
+}
+def : InstRW<[AtomWrite01_30], (instrs RDTSC, RDTSCP)>;
+
+def AtomWrite01_32 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 32;
+ let ResourceCycles = [32];
+}
+def : InstRW<[AtomWrite01_32], (instrs ENTER, POPF16)>;
+
+def AtomWrite01_45 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 45;
+ let ResourceCycles = [45];
+}
+def : InstRW<[AtomWrite01_45], (instrs MONITORrrr)>;
+
+def AtomWrite01_46 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 46;
+ let ResourceCycles = [46];
+}
+def : InstRW<[AtomWrite01_46], (instrs FRNDINT, MWAITrr, RDPMC)>;
+
+def AtomWrite01_48 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 48;
+ let ResourceCycles = [48];
+}
+def : InstRW<[AtomWrite01_48], (instrs POPSS16, POPSS32)>;
+
+def AtomWrite01_55 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 55;
+ let ResourceCycles = [55];
+}
+def : InstRW<[AtomWrite01_55], (instrs FPREM)>;
+
+def AtomWrite01_59 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 59;
+ let ResourceCycles = [59];
+}
+def : InstRW<[AtomWrite01_59], (instrs INSB, INSL, INSW)>;
+
+def AtomWrite01_63 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 63;
+ let ResourceCycles = [63];
+}
+def : InstRW<[AtomWrite01_63], (instrs FNINIT)>;
+
+def AtomWrite01_68 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 68;
+ let ResourceCycles = [68];
+}
+def : InstRW<[AtomWrite01_68], (instrs OUT8rr, OUT16rr, OUT32rr)>;
+
+def AtomWrite01_71 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 71;
+ let ResourceCycles = [71];
+}
+def : InstRW<[AtomWrite01_71], (instrs FPREM1,
+ INVLPG, INVLPGA32, INVLPGA64)>;
+
+def AtomWrite01_72 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 72;
+ let ResourceCycles = [72];
+}
+def : InstRW<[AtomWrite01_72], (instrs OUT8ir, OUT16ir, OUT32ir)>;
+
+def AtomWrite01_74 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 74;
+ let ResourceCycles = [74];
+}
+def : InstRW<[AtomWrite01_74], (instrs OUTSB, OUTSL, OUTSW)>;
+
+def AtomWrite01_77 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 77;
+ let ResourceCycles = [77];
+}
+def : InstRW<[AtomWrite01_77], (instrs FSCALE)>;
+
+def AtomWrite01_78 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 78;
+ let ResourceCycles = [78];
+}
+def : InstRW<[AtomWrite01_78], (instrs RDMSR)>;
+
+def AtomWrite01_79 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 79;
+ let ResourceCycles = [79];
+}
+def : InstRW<[AtomWrite01_79], (instregex "RET(L|Q|W)?$",
+ "LRETI?(L|Q|W)")>;
+
+def AtomWrite01_92 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 92;
+ let ResourceCycles = [92];
+}
+def : InstRW<[AtomWrite01_92], (instrs IN8ri, IN16ri, IN32ri)>;
+
+def AtomWrite01_94 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 94;
+ let ResourceCycles = [94];
+}
+def : InstRW<[AtomWrite01_94], (instrs IN8rr, IN16rr, IN32rr)>;
+
+def AtomWrite01_99 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 99;
+ let ResourceCycles = [99];
+}
+def : InstRW<[AtomWrite01_99], (instrs F2XM1)>;
+
+def AtomWrite01_121 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 121;
+ let ResourceCycles = [121];
+}
+def : InstRW<[AtomWrite01_121], (instrs CPUID)>;
+
+def AtomWrite01_127 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 127;
+ let ResourceCycles = [127];
+}
+def : InstRW<[AtomWrite01_127], (instrs INT)>;
+
+def AtomWrite01_130 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 130;
+ let ResourceCycles = [130];
+}
+def : InstRW<[AtomWrite01_130], (instrs INT3)>;
+
+def AtomWrite01_140 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 140;
+ let ResourceCycles = [140];
+}
+def : InstRW<[AtomWrite01_140], (instrs FXSAVE, FXSAVE64)>;
+
+def AtomWrite01_141 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 141;
+ let ResourceCycles = [141];
+}
+def : InstRW<[AtomWrite01_141], (instrs FXRSTOR, FXRSTOR64)>;
+
+def AtomWrite01_146 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 146;
+ let ResourceCycles = [146];
+}
+def : InstRW<[AtomWrite01_146], (instrs FYL2X)>;
+
+def AtomWrite01_147 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 147;
+ let ResourceCycles = [147];
+}
+def : InstRW<[AtomWrite01_147], (instrs FYL2XP1)>;
+
+def AtomWrite01_168 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 168;
+ let ResourceCycles = [168];
+}
+def : InstRW<[AtomWrite01_168], (instrs FPTAN)>;
+
+def AtomWrite01_174 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 174;
+ let ResourceCycles = [174];
+}
+def : InstRW<[AtomWrite01_174], (instrs FSINCOS)>;
+def : InstRW<[AtomWrite01_174], (instregex "(COS|SIN)_F")>;
+
+def AtomWrite01_183 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 183;
+ let ResourceCycles = [183];
+}
+def : InstRW<[AtomWrite01_183], (instrs FPATAN)>;
+
+def AtomWrite01_202 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 202;
+ let ResourceCycles = [202];
+}
+def : InstRW<[AtomWrite01_202], (instrs WRMSR)>;
+
+} // SchedModel
diff --git a/lib/Target/X86/X86ScheduleBtVer2.td b/lib/Target/X86/X86ScheduleBtVer2.td
index 6ea81a25e41c..d78c343ebd5c 100644
--- a/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/lib/Target/X86/X86ScheduleBtVer2.td
@@ -38,8 +38,27 @@ def JSAGU : ProcResource<1>; // Integer Pipe3: SAGU (also handles 3-operand LEA)
def JFPU0 : ProcResource<1>; // Vector/FPU Pipe0: VALU0/VIMUL/FPA
def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM
-// Any pipe - FIXME we need this until we can discriminate between int/fpu load/store/moves properly
-def JAny : ProcResGroup<[JALU0, JALU1, JLAGU, JSAGU, JFPU0, JFPU1]>;
+// The Integer PRF for Jaguar is 64 entries, and it holds the architectural and
+// speculative version of the 64-bit integer registers.
+// Reference: www.realworldtech.com/jaguar/4/
+//
+// The processor always keeps the different parts of an integer register
+// together. An instruction that writes to a part of a register will therefore
+// have a false dependence on any previous write to the same register or any
+// part of it.
+// Reference: Section 21.10 "AMD Bobcat and Jaguar pipeline: Partial register
+// access" - Agner Fog's "microarchitecture.pdf".
+def JIntegerPRF : RegisterFile<64, [GR64, CCR]>;
+
+// The Jaguar FP Retire Queue renames SIMD and FP uOps onto a pool of 72 SSE
+// registers. Operations on 256-bit data types are cracked into two COPs.
+// Reference: www.realworldtech.com/jaguar/4/
+def JFpuPRF: RegisterFile<72, [VR64, VR128, VR256], [1, 1, 2]>;
+
+// The retire control unit (RCU) can track up to 64 macro-ops in-flight. It can
+// retire up to two macro-ops per cycle.
+// Reference: "Software Optimization Guide for AMD Family 16h Processors"
+def JRCU : RetireControlUnit<64, 2>;
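
As a quick sanity check on the three definitions above, the sketch below (plain Python, illustrative only and not part of the patch) restates the numbers from the comments: a 72-entry FP physical register file where 256-bit writes cost two entries, and a retire control unit that tracks 64 macro-ops and retires two per cycle. It prints the in-flight ceiling per register class and how long the RCU needs to drain it at full retire rate.

    # Back-of-the-envelope model of the Jaguar FP PRF / RCU limits described
    # in the comments above. All constants are taken from this patch; the
    # "drain time" figure is just in_flight / retire_rate, for illustration.
    FP_PRF_ENTRIES   = 72                                    # JFpuPRF size
    PRF_COST         = {"VR64": 1, "VR128": 1, "VR256": 2}   # cost list [1, 1, 2]
    RCU_ENTRIES      = 64                                    # JRCU capacity
    RETIRE_PER_CYCLE = 2                                     # JRCU retire width

    def max_in_flight(reg_class):
        """Renamed ops of one class that can be in flight before the PRF
        or the RCU (whichever is smaller) runs out of entries."""
        return min(FP_PRF_ENTRIES // PRF_COST[reg_class], RCU_ENTRIES)

    for rc in ("VR128", "VR256"):
        n = max_in_flight(rc)
        print(rc, "in-flight limit:", n,
              "drain time:", n / RETIRE_PER_CYCLE, "cycles")
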
// Integer Pipe Scheduler
def JALU01 : ProcResGroup<[JALU0, JALU1]> {
@@ -56,6 +75,7 @@ def JFPU01 : ProcResGroup<[JFPU0, JFPU1]> {
let BufferSize=18;
}
+// Functional units
def JDiv : ProcResource<1>; // integer division
def JMul : ProcResource<1>; // integer multiplication
def JVALU0 : ProcResource<1>; // vector integer
@@ -65,6 +85,10 @@ def JSTC : ProcResource<1>; // vector store/convert
def JFPM : ProcResource<1>; // FP multiplication
def JFPA : ProcResource<1>; // FP addition
+// Functional unit groups
+def JFPX : ProcResGroup<[JFPA, JFPM]>;
+def JVALU : ProcResGroup<[JVALU0, JVALU1]>;
+
// Integer loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
// cycles after the memory operand.
def : ReadAdvance<ReadAfterLd, 3>;
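
For readers not familiar with ReadAdvance: the definition above lets any operand tagged ReadAfterLd be read 3 cycles late, so the scheduler only charges max(producer latency - 3, 0) cycles on that def-to-use edge. A minimal Python sketch of that arithmetic follows (illustrative only; the 1- and 5-cycle producer latencies in the examples are the WriteALU and WriteLoad values defined elsewhere in this file, not new data).

    READ_AFTER_LD_ADVANCE = 3    # from ReadAdvance<ReadAfterLd, 3> above

    def edge_latency(producer_latency, advance=READ_AFTER_LD_ADVANCE):
        """Cycles charged between a def and a ReadAfterLd use of it."""
        return max(producer_latency - advance, 0)

    print(edge_latency(1))   # 1-cycle ALU producer  -> 0 cycles visible
    print(edge_latency(5))   # 5-cycle load producer -> 2 cycles visible
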
@@ -75,28 +99,59 @@ def : ReadAdvance<ReadAfterLd, 3>;
// This multiclass defines the resource usage for variants with and without
// folded loads.
multiclass JWriteResIntPair<X86FoldableSchedWrite SchedRW,
- ProcResourceKind ExePort,
- int Lat> {
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [], int UOps = 1> {
// Register variant is using a single cycle on ExePort.
- def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
// Memory variant also uses a cycle on JLAGU and adds 3 cycles to the
// latency.
- def : WriteRes<SchedRW.Folded, [JLAGU, ExePort]> {
- let Latency = !add(Lat, 3);
+ def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
+ let Latency = !add(Lat, 3);
+ let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
+ let NumMicroOps = UOps;
}
}
multiclass JWriteResFpuPair<X86FoldableSchedWrite SchedRW,
- ProcResourceKind ExePort,
- int Lat> {
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [], int UOps = 1> {
// Register variant is using a single cycle on ExePort.
- def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
// Memory variant also uses a cycle on JLAGU and adds 5 cycles to the
// latency.
- def : WriteRes<SchedRW.Folded, [JLAGU, ExePort]> {
- let Latency = !add(Lat, 5);
+ def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
+ let Latency = !add(Lat, 5);
+ let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
+ let NumMicroOps = UOps;
+ }
+}
+
+multiclass JWriteResYMMPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [2], int UOps = 2> {
+  // Register variant uses the execution ports for Res cycles (2 per port by default for YMM ops).
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
+
+ // Memory variant also uses 2 cycles on JLAGU and adds 5 cycles to the
+ // latency.
+ def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
+ let Latency = !add(Lat, 5);
+ let ResourceCycles = !listconcat([2], Res);
+ let NumMicroOps = UOps;
}
}
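
To make the expansion of these multiclasses concrete, here is a small Python sketch (not TableGen, and simplified: it ignores the !empty(Res) case and the YMM helper's 2-cycle JLAGU usage) of the two write classes each instantiation produces. The WriteFMul64 numbers used as the example ([JFPU1, JFPM], latency 4, ResourceCycles [1, 2]) are the Jaguar values given further down in this file.

    def jwrite_pair(ports, lat, res=None, uops=1, load_penalty=5):
        """Mirror of JWriteResFpuPair: a register variant plus a folded-load
        variant that adds one JLAGU cycle and load_penalty extra latency
        (use load_penalty=3 for the integer-pipe version)."""
        res = res if res is not None else [1] * len(ports)
        reg    = {"ports": ports,             "latency": lat,
                  "res_cycles": res,          "uops": uops}
        folded = {"ports": ["JLAGU"] + ports, "latency": lat + load_penalty,
                  "res_cycles": [1] + res,    "uops": uops}
        return reg, folded

    reg, folded = jwrite_pair(["JFPU1", "JFPM"], 4, [1, 2])
    print(reg)      # 4-cycle register form
    print(folded)   # 9-cycle folded-load form
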
@@ -107,66 +162,94 @@ def : WriteRes<WriteRMW, [JSAGU]>;
// Arithmetic.
////////////////////////////////////////////////////////////////////////////////
-defm : JWriteResIntPair<WriteALU, JALU01, 1>;
-defm : JWriteResIntPair<WriteIMul, JALU1, 3>;
-
-def : WriteRes<WriteIMulH, [JALU1]> {
- let Latency = 6;
- let ResourceCycles = [4];
-}
-
-// FIXME 8/16 bit divisions
-def : WriteRes<WriteIDiv, [JALU1, JDiv]> {
- let Latency = 25;
- let ResourceCycles = [1, 25];
-}
-def : WriteRes<WriteIDivLd, [JALU1, JLAGU, JDiv]> {
- let Latency = 41;
- let ResourceCycles = [1, 1, 25];
-}
+defm : JWriteResIntPair<WriteALU, [JALU01], 1>;
+defm : JWriteResIntPair<WriteADC, [JALU01], 1, [2]>;
+defm : JWriteResIntPair<WriteIMul, [JALU1, JMul], 3, [1, 1], 2>; // i8/i16/i32 multiplication
+defm : JWriteResIntPair<WriteIMul64, [JALU1, JMul], 6, [1, 4], 2>; // i64 multiplication
+defm : X86WriteRes<WriteIMulH, [JALU1], 6, [4], 1>;
+
+defm : JWriteResIntPair<WriteBSWAP32,[JALU01], 1>;
+defm : JWriteResIntPair<WriteBSWAP64,[JALU01], 1>;
+
+defm : JWriteResIntPair<WriteDiv8, [JALU1, JDiv], 12, [1, 12], 1>;
+defm : JWriteResIntPair<WriteDiv16, [JALU1, JDiv], 17, [1, 17], 2>;
+defm : JWriteResIntPair<WriteDiv32, [JALU1, JDiv], 25, [1, 25], 2>;
+defm : JWriteResIntPair<WriteDiv64, [JALU1, JDiv], 41, [1, 41], 2>;
+defm : JWriteResIntPair<WriteIDiv8, [JALU1, JDiv], 12, [1, 12], 1>;
+defm : JWriteResIntPair<WriteIDiv16, [JALU1, JDiv], 17, [1, 17], 2>;
+defm : JWriteResIntPair<WriteIDiv32, [JALU1, JDiv], 25, [1, 25], 2>;
+defm : JWriteResIntPair<WriteIDiv64, [JALU1, JDiv], 41, [1, 41], 2>;
+
+defm : JWriteResIntPair<WriteCRC32, [JALU01], 3, [4], 3>;
+
+defm : JWriteResIntPair<WriteCMOV, [JALU01], 1>; // Conditional move.
+defm : JWriteResIntPair<WriteCMOV2, [JALU01], 1>; // Conditional (CF + ZF flag) move.
+defm : X86WriteRes<WriteFCMOV, [JFPU0, JFPA], 3, [1,1], 1>; // x87 conditional move.
+def : WriteRes<WriteSETCC, [JALU01]>; // Setcc.
+def : WriteRes<WriteSETCCStore, [JALU01,JSAGU]>;
+def : WriteRes<WriteLAHFSAHF, [JALU01]>;
// This is for simple LEAs with one or two input operands.
-// FIXME: SAGU 3-operand LEA
def : WriteRes<WriteLEA, [JALU01]>;
+// Bit counts.
+defm : JWriteResIntPair<WriteBSF, [JALU01], 5, [4], 8>;
+defm : JWriteResIntPair<WriteBSR, [JALU01], 5, [4], 8>;
+defm : JWriteResIntPair<WritePOPCNT, [JALU01], 1>;
+defm : JWriteResIntPair<WriteLZCNT, [JALU01], 1>;
+defm : JWriteResIntPair<WriteTZCNT, [JALU01], 2, [2]>;
+
+// BMI1 BEXTR, BMI2 BZHI
+defm : JWriteResIntPair<WriteBEXTR, [JALU01], 1>;
+defm : X86WriteResPairUnsupported<WriteBZHI>;
+
////////////////////////////////////////////////////////////////////////////////
// Integer shifts and rotates.
////////////////////////////////////////////////////////////////////////////////
-defm : JWriteResIntPair<WriteShift, JALU01, 1>;
+defm : JWriteResIntPair<WriteShift, [JALU01], 1>;
+
+defm : JWriteResIntPair<WriteShiftDouble, [JALU01], 1>;
-def WriteSHLDrri : SchedWriteRes<[JALU01]> {
+def JWriteSHLDrri : SchedWriteRes<[JALU01]> {
let Latency = 3;
let ResourceCycles = [6];
let NumMicroOps = 6;
}
-def: InstRW<[WriteSHLDrri], (instregex "SHLD(16|32|64)rri8")>;
-def: InstRW<[WriteSHLDrri], (instregex "SHRD(16|32|64)rri8")>;
+def: InstRW<[JWriteSHLDrri], (instrs SHLD16rri8, SHLD32rri8, SHLD64rri8,
+ SHRD16rri8, SHRD32rri8, SHRD64rri8)>;
-def WriteSHLDrrCL : SchedWriteRes<[JALU01]> {
+def JWriteSHLDrrCL : SchedWriteRes<[JALU01]> {
let Latency = 4;
let ResourceCycles = [8];
let NumMicroOps = 7;
}
-def: InstRW<[WriteSHLDrrCL], (instregex "SHLD(16|32|64)rrCL")>;
-def: InstRW<[WriteSHLDrrCL], (instregex "SHRD(16|32|64)rrCL")>;
+def: InstRW<[JWriteSHLDrrCL], (instrs SHLD16rrCL, SHLD32rrCL, SHLD64rrCL,
+ SHRD16rrCL, SHRD32rrCL, SHRD64rrCL)>;
-def WriteSHLDm : SchedWriteRes<[JLAGU, JALU01]> {
+def JWriteSHLDm : SchedWriteRes<[JLAGU, JALU01]> {
let Latency = 9;
let ResourceCycles = [1, 22];
let NumMicroOps = 8;
}
-def: InstRW<[WriteSHLDm], (instregex "SHLD(16|32|64)mr(i8|CL)")>;
-def: InstRW<[WriteSHLDm], (instregex "SHRD(16|32|64)mr(i8|CL)")>;
+def: InstRW<[JWriteSHLDm],(instrs SHLD16mri8, SHLD32mri8, SHLD64mri8,
+ SHLD16mrCL, SHLD32mrCL, SHLD64mrCL,
+ SHRD16mri8, SHRD32mri8, SHRD64mri8,
+ SHRD16mrCL, SHRD32mrCL, SHRD64mrCL)>;
////////////////////////////////////////////////////////////////////////////////
// Loads, stores, and moves, not folded with other operations.
-// FIXME: Split x86 and SSE load/store/moves
////////////////////////////////////////////////////////////////////////////////
-def : WriteRes<WriteLoad, [JLAGU]> { let Latency = 5; }
-def : WriteRes<WriteStore, [JSAGU]>;
-def : WriteRes<WriteMove, [JALU01]>;
+def : WriteRes<WriteLoad, [JLAGU]> { let Latency = 5; }
+def : WriteRes<WriteStore, [JSAGU]>;
+def : WriteRes<WriteStoreNT, [JSAGU]>;
+def : WriteRes<WriteMove, [JALU01]>;
+
+// Load/store MXCSR.
+// FIXME: These are copied and pasted from WriteLoad/Store.
+def : WriteRes<WriteLDMXCSR, [JLAGU]> { let Latency = 5; }
+def : WriteRes<WriteSTMXCSR, [JSAGU]>;
// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;
@@ -183,572 +266,438 @@ def : WriteRes<WriteZero, []>;
// consume resources. Indirect branches can fold loads.
////////////////////////////////////////////////////////////////////////////////
-defm : JWriteResIntPair<WriteJump, JALU01, 1>;
+defm : JWriteResIntPair<WriteJump, [JALU01], 1>;
////////////////////////////////////////////////////////////////////////////////
-// Floating point. This covers both scalar and vector operations.
-// FIXME: should we bother splitting JFPU pipe + unit stages for fast instructions?
-// FIXME: Double precision latencies
-// FIXME: SS vs PS latencies
-// FIXME: ymm latencies
-////////////////////////////////////////////////////////////////////////////////
-
-defm : JWriteResFpuPair<WriteFAdd, JFPU0, 3>;
-defm : JWriteResFpuPair<WriteFMul, JFPU1, 2>;
-defm : JWriteResFpuPair<WriteFMA, JFPU1, 2>; // NOTE: Doesn't exist on Jaguar.
-defm : JWriteResFpuPair<WriteFRcp, JFPU1, 2>;
-defm : JWriteResFpuPair<WriteFRsqrt, JFPU1, 2>;
-defm : JWriteResFpuPair<WriteFShuffle, JFPU01, 1>;
-defm : JWriteResFpuPair<WriteFBlend, JFPU01, 1>;
-defm : JWriteResFpuPair<WriteFShuffle256, JFPU01, 1>;
-
-def : WriteRes<WriteFSqrt, [JFPU1, JLAGU, JFPM]> {
- let Latency = 21;
- let ResourceCycles = [1, 1, 21];
-}
-def : WriteRes<WriteFSqrtLd, [JFPU1, JLAGU, JFPM]> {
- let Latency = 26;
- let ResourceCycles = [1, 1, 21];
-}
-
-def : WriteRes<WriteFDiv, [JFPU1, JLAGU, JFPM]> {
- let Latency = 19;
- let ResourceCycles = [1, 1, 19];
-}
-def : WriteRes<WriteFDivLd, [JFPU1, JLAGU, JFPM]> {
- let Latency = 24;
- let ResourceCycles = [1, 1, 19];
-}
-
-// FIXME: integer pipes
-defm : JWriteResFpuPair<WriteCvtF2I, JFPU1, 3>; // Float -> Integer.
-defm : JWriteResFpuPair<WriteCvtI2F, JFPU1, 3>; // Integer -> Float.
-defm : JWriteResFpuPair<WriteCvtF2F, JFPU1, 3>; // Float -> Float size conversion.
-
-def : WriteRes<WriteFVarBlend, [JFPU01]> {
- let Latency = 2;
- let ResourceCycles = [4];
- let NumMicroOps = 3;
-}
-def : WriteRes<WriteFVarBlendLd, [JLAGU, JFPU01]> {
- let Latency = 7;
- let ResourceCycles = [1, 4];
- let NumMicroOps = 3;
-}
-
-// Vector integer operations.
-defm : JWriteResFpuPair<WriteVecALU, JFPU01, 1>;
-defm : JWriteResFpuPair<WriteVecShift, JFPU01, 1>;
-defm : JWriteResFpuPair<WriteVecIMul, JFPU0, 2>;
-defm : JWriteResFpuPair<WriteShuffle, JFPU01, 1>;
-defm : JWriteResFpuPair<WriteBlend, JFPU01, 1>;
-defm : JWriteResFpuPair<WriteVecLogic, JFPU01, 1>;
-defm : JWriteResFpuPair<WriteShuffle256, JFPU01, 1>;
-
-def : WriteRes<WriteVarBlend, [JFPU01]> {
- let Latency = 2;
- let ResourceCycles = [4];
- let NumMicroOps = 3;
-}
-def : WriteRes<WriteVarBlendLd, [JLAGU, JFPU01]> {
- let Latency = 7;
- let ResourceCycles = [1, 4];
- let NumMicroOps = 3;
-}
+// Special case scheduling classes.
+////////////////////////////////////////////////////////////////////////////////
-// FIXME: why do we need to define AVX2 resource on CPU that doesn't have AVX2?
-def : WriteRes<WriteVarVecShift, [JFPU01]> {}
-def : WriteRes<WriteVarVecShiftLd, [JLAGU, JFPU01]> {
- let Latency = 6;
- let ResourceCycles = [1, 2];
-}
+def : WriteRes<WriteSystem, [JALU01]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [JALU01]> { let Latency = 100; }
+def : WriteRes<WriteFence, [JSAGU]>;
-def : WriteRes<WriteMPSAD, [JFPU0]> {
- let Latency = 3;
- let ResourceCycles = [2];
-}
-def : WriteRes<WriteMPSADLd, [JLAGU, JFPU0]> {
- let Latency = 8;
- let ResourceCycles = [1, 2];
-}
+// Nops don't have dependencies, so there's no actual latency, but we set this
+// to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
+def : WriteRes<WriteNop, [JALU01]> { let Latency = 1; }
////////////////////////////////////////////////////////////////////////////////
-// String instructions.
-// Packed Compare Implicit Length Strings, Return Mask
-// FIXME: approximate latencies + pipe dependencies
+// Floating point. This covers both scalar and vector operations.
////////////////////////////////////////////////////////////////////////////////
-def : WriteRes<WritePCmpIStrM, [JFPU1,JFPU0]> {
- let Latency = 8;
- let ResourceCycles = [2, 2];
- let NumMicroOps = 3;
-}
-def : WriteRes<WritePCmpIStrMLd, [JLAGU, JFPU1, JFPU0]> {
- let Latency = 13;
- let ResourceCycles = [1, 2, 2];
- let NumMicroOps = 3;
-}
-
-// Packed Compare Explicit Length Strings, Return Mask
-def : WriteRes<WritePCmpEStrM, [JFPU1, JLAGU, JFPU01,JFPU1, JFPU0]> {
- let Latency = 14;
- let ResourceCycles = [5, 5, 5, 5, 5];
- let NumMicroOps = 9;
-}
-def : WriteRes<WritePCmpEStrMLd, [JLAGU, JFPU1, JLAGU, JFPU01,JFPU1, JFPU0]> {
- let Latency = 19;
- let ResourceCycles = [1, 5, 5, 5, 5, 5];
- let NumMicroOps = 9;
-}
-
-// Packed Compare Implicit Length Strings, Return Index
-def : WriteRes<WritePCmpIStrI, [JFPU1, JFPU0]> {
- let Latency = 7;
- let ResourceCycles = [2, 2];
-}
-def : WriteRes<WritePCmpIStrILd, [JLAGU, JFPU1, JFPU0]> {
- let Latency = 12;
- let ResourceCycles = [1, 2, 2];
-}
-
-// Packed Compare Explicit Length Strings, Return Index
-def : WriteRes<WritePCmpEStrI, [JFPU1, JLAGU, JFPU01,JFPU1, JFPU0]> {
- let Latency = 14;
- let ResourceCycles = [5, 5, 5, 5, 5];
- let NumMicroOps = 9;
-}
-def : WriteRes<WritePCmpEStrILd, [JLAGU, JFPU1, JLAGU, JFPU01,JFPU1, JFPU0]> {
- let Latency = 19;
- let ResourceCycles = [1, 5, 5, 5, 5, 5];
- let NumMicroOps = 9;
-}
+defm : X86WriteRes<WriteFLD0, [JFPU1, JSTC], 3, [1,1], 1>;
+defm : X86WriteRes<WriteFLD1, [JFPU1, JSTC], 3, [1,1], 1>;
+defm : X86WriteRes<WriteFLDC, [JFPU1, JSTC], 3, [1,1], 1>;
+defm : X86WriteRes<WriteFLoad, [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteFLoadX, [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteFLoadY, [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteFMaskedLoad, [JLAGU, JFPU01, JFPX], 6, [1, 1, 2], 1>;
+defm : X86WriteRes<WriteFMaskedLoadY, [JLAGU, JFPU01, JFPX], 6, [2, 2, 4], 2>;
+
+defm : X86WriteRes<WriteFStore, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteFStoreX, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteFStoreY, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteFStoreNT, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteFStoreNTX, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteFStoreNTY, [JSAGU, JFPU1, JSTC], 3, [2, 2, 2], 1>;
+defm : X86WriteRes<WriteFMaskedStore, [JSAGU, JFPU01, JFPX], 6, [1, 1, 4], 1>;
+defm : X86WriteRes<WriteFMaskedStoreY, [JSAGU, JFPU01, JFPX], 6, [2, 2, 4], 2>;
+
+defm : X86WriteRes<WriteFMove, [JFPU01, JFPX], 1, [1, 1], 1>;
+defm : X86WriteRes<WriteFMoveX, [JFPU01, JFPX], 1, [1, 1], 1>;
+defm : X86WriteRes<WriteFMoveY, [JFPU01, JFPX], 1, [2, 2], 2>;
+
+defm : X86WriteRes<WriteEMMS, [JFPU01, JFPX], 2, [1, 1], 1>;
+
+defm : JWriteResFpuPair<WriteFAdd, [JFPU0, JFPA], 3>;
+defm : JWriteResFpuPair<WriteFAddX, [JFPU0, JFPA], 3>;
+defm : JWriteResYMMPair<WriteFAddY, [JFPU0, JFPA], 3, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteFAddZ>;
+defm : JWriteResFpuPair<WriteFAdd64, [JFPU0, JFPA], 3>;
+defm : JWriteResFpuPair<WriteFAdd64X, [JFPU0, JFPA], 3>;
+defm : JWriteResYMMPair<WriteFAdd64Y, [JFPU0, JFPA], 3, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
+defm : JWriteResFpuPair<WriteFCmp, [JFPU0, JFPA], 2>;
+defm : JWriteResFpuPair<WriteFCmpX, [JFPU0, JFPA], 2>;
+defm : JWriteResYMMPair<WriteFCmpY, [JFPU0, JFPA], 2, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteFCmpZ>;
+defm : JWriteResFpuPair<WriteFCmp64, [JFPU0, JFPA], 2>;
+defm : JWriteResFpuPair<WriteFCmp64X, [JFPU0, JFPA], 2>;
+defm : JWriteResYMMPair<WriteFCmp64Y, [JFPU0, JFPA], 2, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
+defm : JWriteResFpuPair<WriteFCom, [JFPU0, JFPA, JALU0], 3>;
+defm : JWriteResFpuPair<WriteFMul, [JFPU1, JFPM], 2>;
+defm : JWriteResFpuPair<WriteFMulX, [JFPU1, JFPM], 2>;
+defm : JWriteResYMMPair<WriteFMulY, [JFPU1, JFPM], 2, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteFMulZ>;
+defm : JWriteResFpuPair<WriteFMul64, [JFPU1, JFPM], 4, [1,2]>;
+defm : JWriteResFpuPair<WriteFMul64X, [JFPU1, JFPM], 4, [1,2]>;
+defm : JWriteResYMMPair<WriteFMul64Y, [JFPU1, JFPM], 4, [2,4], 2>;
+defm : X86WriteResPairUnsupported<WriteFMul64Z>;
+defm : X86WriteResPairUnsupported<WriteFMA>;
+defm : X86WriteResPairUnsupported<WriteFMAX>;
+defm : X86WriteResPairUnsupported<WriteFMAY>;
+defm : X86WriteResPairUnsupported<WriteFMAZ>;
+defm : JWriteResFpuPair<WriteDPPD, [JFPU1, JFPM, JFPA], 9, [1, 3, 3], 3>;
+defm : JWriteResFpuPair<WriteDPPS, [JFPU1, JFPM, JFPA], 11, [1, 3, 3], 5>;
+defm : JWriteResYMMPair<WriteDPPSY, [JFPU1, JFPM, JFPA], 12, [2, 6, 6], 10>;
+defm : X86WriteResPairUnsupported<WriteDPPSZ>;
+defm : JWriteResFpuPair<WriteFRcp, [JFPU1, JFPM], 2>;
+defm : JWriteResFpuPair<WriteFRcpX, [JFPU1, JFPM], 2>;
+defm : JWriteResYMMPair<WriteFRcpY, [JFPU1, JFPM], 2, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteFRcpZ>;
+defm : JWriteResFpuPair<WriteFRsqrt, [JFPU1, JFPM], 2>;
+defm : JWriteResFpuPair<WriteFRsqrtX, [JFPU1, JFPM], 2>;
+defm : JWriteResYMMPair<WriteFRsqrtY, [JFPU1, JFPM], 2, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
+defm : JWriteResFpuPair<WriteFDiv, [JFPU1, JFPM], 19, [1, 19]>;
+defm : JWriteResFpuPair<WriteFDivX, [JFPU1, JFPM], 19, [1, 19]>;
+defm : JWriteResYMMPair<WriteFDivY, [JFPU1, JFPM], 38, [2, 38], 2>;
+defm : X86WriteResPairUnsupported<WriteFDivZ>;
+defm : JWriteResFpuPair<WriteFDiv64, [JFPU1, JFPM], 19, [1, 19]>;
+defm : JWriteResFpuPair<WriteFDiv64X, [JFPU1, JFPM], 19, [1, 19]>;
+defm : JWriteResYMMPair<WriteFDiv64Y, [JFPU1, JFPM], 38, [2, 38], 2>;
+defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
+defm : JWriteResFpuPair<WriteFSqrt, [JFPU1, JFPM], 21, [1, 21]>;
+defm : JWriteResFpuPair<WriteFSqrtX, [JFPU1, JFPM], 21, [1, 21]>;
+defm : JWriteResYMMPair<WriteFSqrtY, [JFPU1, JFPM], 42, [2, 42], 2>;
+defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
+defm : JWriteResFpuPair<WriteFSqrt64, [JFPU1, JFPM], 27, [1, 27]>;
+defm : JWriteResFpuPair<WriteFSqrt64X, [JFPU1, JFPM], 27, [1, 27]>;
+defm : JWriteResYMMPair<WriteFSqrt64Y, [JFPU1, JFPM], 54, [2, 54], 2>;
+defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
+defm : JWriteResFpuPair<WriteFSqrt80, [JFPU1, JFPM], 35, [1, 35]>;
+defm : JWriteResFpuPair<WriteFSign, [JFPU1, JFPM], 2>;
+defm : JWriteResFpuPair<WriteFRnd, [JFPU1, JSTC], 3>;
+defm : JWriteResYMMPair<WriteFRndY, [JFPU1, JSTC], 3, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteFRndZ>;
+defm : JWriteResFpuPair<WriteFLogic, [JFPU01, JFPX], 1>;
+defm : JWriteResYMMPair<WriteFLogicY, [JFPU01, JFPX], 1, [2, 2], 2>;
+defm : X86WriteResPairUnsupported<WriteFLogicZ>;
+defm : JWriteResFpuPair<WriteFTest, [JFPU0, JFPA, JALU0], 3>;
+defm : JWriteResYMMPair<WriteFTestY , [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
+defm : X86WriteResPairUnsupported<WriteFTestZ>;
+defm : JWriteResFpuPair<WriteFShuffle, [JFPU01, JFPX], 1>;
+defm : JWriteResYMMPair<WriteFShuffleY, [JFPU01, JFPX], 1, [2, 2], 2>;
+defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
+defm : JWriteResFpuPair<WriteFVarShuffle, [JFPU01, JFPX], 2, [1, 4], 3>;
+defm : JWriteResYMMPair<WriteFVarShuffleY,[JFPU01, JFPX], 3, [2, 6], 6>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
+defm : JWriteResFpuPair<WriteFBlend, [JFPU01, JFPX], 1>;
+defm : JWriteResYMMPair<WriteFBlendY, [JFPU01, JFPX], 1, [2, 2], 2>;
+defm : X86WriteResPairUnsupported<WriteFBlendZ>;
+defm : JWriteResFpuPair<WriteFVarBlend, [JFPU01, JFPX], 2, [1, 4], 3>;
+defm : JWriteResYMMPair<WriteFVarBlendY, [JFPU01, JFPX], 3, [2, 6], 6>;
+defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
+defm : JWriteResFpuPair<WriteFShuffle256, [JFPU01, JFPX], 1>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;
////////////////////////////////////////////////////////////////////////////////
-// AES Instructions.
+// Conversions.
////////////////////////////////////////////////////////////////////////////////
-def : WriteRes<WriteAESDecEnc, [JFPU01, JVIMUL]> {
- let Latency = 3;
- let ResourceCycles = [1, 1];
-}
-def : WriteRes<WriteAESDecEncLd, [JFPU01, JLAGU, JVIMUL]> {
- let Latency = 8;
- let ResourceCycles = [1, 1, 1];
-}
-
-def : WriteRes<WriteAESIMC, [JVIMUL]> {
- let Latency = 2;
- let ResourceCycles = [1];
-}
-def : WriteRes<WriteAESIMCLd, [JLAGU, JVIMUL]> {
- let Latency = 7;
- let ResourceCycles = [1, 1];
-}
-
-def : WriteRes<WriteAESKeyGen, [JVIMUL]> {
- let Latency = 2;
- let ResourceCycles = [1];
-}
-def : WriteRes<WriteAESKeyGenLd, [JLAGU, JVIMUL]> {
- let Latency = 7;
- let ResourceCycles = [1, 1];
-}
+defm : JWriteResFpuPair<WriteCvtSS2I, [JFPU1, JSTC, JFPA, JALU0], 7, [1,1,1,1], 2>;
+defm : JWriteResFpuPair<WriteCvtPS2I, [JFPU1, JSTC], 3, [1,1], 1>;
+defm : JWriteResYMMPair<WriteCvtPS2IY, [JFPU1, JSTC], 3, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
+defm : JWriteResFpuPair<WriteCvtSD2I, [JFPU1, JSTC, JFPA, JALU0], 7, [1,1,1,1], 2>;
+defm : JWriteResFpuPair<WriteCvtPD2I, [JFPU1, JSTC], 3, [1,1], 1>;
+defm : JWriteResYMMPair<WriteCvtPD2IY, [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
+
+// FIXME: f+3 ST, LD+STC latency
+defm : JWriteResFpuPair<WriteCvtI2SS, [JFPU1, JSTC], 9, [1,1], 2>;
+defm : JWriteResFpuPair<WriteCvtI2PS, [JFPU1, JSTC], 3, [1,1], 1>;
+defm : JWriteResYMMPair<WriteCvtI2PSY, [JFPU1, JSTC], 3, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
+defm : JWriteResFpuPair<WriteCvtI2SD, [JFPU1, JSTC], 9, [1,1], 2>;
+defm : JWriteResFpuPair<WriteCvtI2PD, [JFPU1, JSTC], 3, [1,1], 1>;
+defm : JWriteResYMMPair<WriteCvtI2PDY, [JFPU1, JSTC], 3, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
+
+defm : JWriteResFpuPair<WriteCvtSS2SD, [JFPU1, JSTC], 7, [1,2], 2>;
+defm : JWriteResFpuPair<WriteCvtPS2PD, [JFPU1, JSTC], 2, [1,1], 1>;
+defm : JWriteResYMMPair<WriteCvtPS2PDY, [JFPU1, JSTC], 2, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
+
+defm : JWriteResFpuPair<WriteCvtSD2SS, [JFPU1, JSTC], 7, [1,2], 2>;
+defm : JWriteResFpuPair<WriteCvtPD2PS, [JFPU1, JSTC], 3, [1,1], 1>;
+defm : JWriteResYMMPair<WriteCvtPD2PSY, [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
+
+defm : JWriteResFpuPair<WriteCvtPH2PS, [JFPU1, JSTC], 3, [1,1], 1>;
+defm : JWriteResYMMPair<WriteCvtPH2PSY, [JFPU1, JSTC], 3, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;
+
+defm : X86WriteRes<WriteCvtPS2PH, [JFPU1, JSTC], 3, [1,1], 1>;
+defm : X86WriteRes<WriteCvtPS2PHY, [JFPU1, JSTC, JFPX], 6, [2,2,2], 3>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
+defm : X86WriteRes<WriteCvtPS2PHSt, [JFPU1, JSTC, JSAGU], 4, [1,1,1], 1>;
+defm : X86WriteRes<WriteCvtPS2PHYSt, [JFPU1, JSTC, JFPX, JSAGU], 7, [2,2,2,1], 3>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
////////////////////////////////////////////////////////////////////////////////
-// Horizontal add/sub instructions.
+// Vector integer operations.
////////////////////////////////////////////////////////////////////////////////
-def : WriteRes<WriteFHAdd, [JFPU0]> {
- let Latency = 3;
-}
-
-def : WriteRes<WriteFHAddLd, [JLAGU, JFPU0]> {
- let Latency = 8;
-}
-
-def : WriteRes<WritePHAdd, [JFPU01]> {
- let ResourceCycles = [1];
-}
-def : WriteRes<WritePHAddLd, [JLAGU, JFPU01 ]> {
- let Latency = 6;
- let ResourceCycles = [1, 1];
-}
-
-def WriteFHAddY: SchedWriteRes<[JFPU0]> {
- let Latency = 3;
- let ResourceCycles = [2];
-}
-def : InstRW<[WriteFHAddY], (instregex "VH(ADD|SUB)P(S|D)Yrr")>;
-
-def WriteFHAddYLd: SchedWriteRes<[JLAGU, JFPU0]> {
- let Latency = 8;
- let ResourceCycles = [1, 2];
-}
-def : InstRW<[WriteFHAddYLd], (instregex "VH(ADD|SUB)P(S|D)Yrm")>;
+defm : X86WriteRes<WriteVecLoad, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteVecLoadX, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteVecLoadY, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteVecLoadNT, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteVecLoadNTY, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteVecMaskedLoad, [JLAGU, JFPU01, JVALU], 6, [1, 1, 2], 1>;
+defm : X86WriteRes<WriteVecMaskedLoadY, [JLAGU, JFPU01, JVALU], 6, [2, 2, 4], 2>;
+
+defm : X86WriteRes<WriteVecStore, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteVecStoreX, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteVecStoreY, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteVecStoreNT, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteVecStoreNTY, [JSAGU, JFPU1, JSTC], 2, [2, 2, 2], 1>;
+defm : X86WriteRes<WriteVecMaskedStore, [JSAGU, JFPU01, JVALU], 6, [1, 1, 4], 1>;
+defm : X86WriteRes<WriteVecMaskedStoreY, [JSAGU, JFPU01, JVALU], 6, [2, 2, 4], 2>;
+
+defm : X86WriteRes<WriteVecMove, [JFPU01, JVALU], 1, [1, 1], 1>;
+defm : X86WriteRes<WriteVecMoveX, [JFPU01, JVALU], 1, [1, 1], 1>;
+defm : X86WriteRes<WriteVecMoveY, [JFPU01, JVALU], 1, [2, 2], 2>;
+defm : X86WriteRes<WriteVecMoveToGpr, [JFPU0, JFPA, JALU0], 4, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteVecMoveFromGpr, [JFPU01, JFPX], 8, [1, 1], 2>;
+
+defm : JWriteResFpuPair<WriteVecALU, [JFPU01, JVALU], 1>;
+defm : JWriteResFpuPair<WriteVecALUX, [JFPU01, JVALU], 1>;
+defm : X86WriteResPairUnsupported<WriteVecALUY>;
+defm : X86WriteResPairUnsupported<WriteVecALUZ>;
+defm : JWriteResFpuPair<WriteVecShift, [JFPU01, JVALU], 1>;
+defm : JWriteResFpuPair<WriteVecShiftX, [JFPU01, JVALU], 1>;
+defm : X86WriteResPairUnsupported<WriteVecShiftY>;
+defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
+defm : JWriteResFpuPair<WriteVecShiftImm, [JFPU01, JVALU], 1>;
+defm : JWriteResFpuPair<WriteVecShiftImmX,[JFPU01, JVALU], 1>;
+defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
+defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
+defm : X86WriteResPairUnsupported<WriteVarVecShift>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
+defm : JWriteResFpuPair<WriteVecIMul, [JFPU0, JVIMUL], 2>;
+defm : JWriteResFpuPair<WriteVecIMulX, [JFPU0, JVIMUL], 2>;
+defm : X86WriteResPairUnsupported<WriteVecIMulY>;
+defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
+defm : JWriteResFpuPair<WritePMULLD, [JFPU0, JFPU01, JVIMUL, JVALU], 4, [2, 1, 2, 1], 3>;
+defm : X86WriteResPairUnsupported<WritePMULLDY>;
+defm : X86WriteResPairUnsupported<WritePMULLDZ>;
+defm : JWriteResFpuPair<WriteMPSAD, [JFPU0, JVIMUL], 3, [1, 2]>;
+defm : X86WriteResPairUnsupported<WriteMPSADY>;
+defm : X86WriteResPairUnsupported<WriteMPSADZ>;
+defm : JWriteResFpuPair<WritePSADBW, [JFPU01, JVALU], 2>;
+defm : JWriteResFpuPair<WritePSADBWX, [JFPU01, JVALU], 2>;
+defm : X86WriteResPairUnsupported<WritePSADBWY>;
+defm : X86WriteResPairUnsupported<WritePSADBWZ>;
+defm : JWriteResFpuPair<WritePHMINPOS, [JFPU0, JVALU], 2>;
+defm : JWriteResFpuPair<WriteShuffle, [JFPU01, JVALU], 1>;
+defm : JWriteResFpuPair<WriteShuffleX, [JFPU01, JVALU], 1>;
+defm : X86WriteResPairUnsupported<WriteShuffleY>;
+defm : X86WriteResPairUnsupported<WriteShuffleZ>;
+defm : JWriteResFpuPair<WriteVarShuffle, [JFPU01, JVALU], 2, [1, 4], 3>;
+defm : JWriteResFpuPair<WriteVarShuffleX, [JFPU01, JVALU], 2, [1, 4], 3>;
+defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
+defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
+defm : JWriteResFpuPair<WriteBlend, [JFPU01, JVALU], 1>;
+defm : X86WriteResPairUnsupported<WriteBlendY>;
+defm : X86WriteResPairUnsupported<WriteBlendZ>;
+defm : JWriteResFpuPair<WriteVarBlend, [JFPU01, JVALU], 2, [1, 4], 3>;
+defm : X86WriteResPairUnsupported<WriteVarBlendY>;
+defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
+defm : JWriteResFpuPair<WriteVecLogic, [JFPU01, JVALU], 1>;
+defm : JWriteResFpuPair<WriteVecLogicX, [JFPU01, JVALU], 1>;
+defm : X86WriteResPairUnsupported<WriteVecLogicY>;
+defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
+defm : JWriteResFpuPair<WriteVecTest, [JFPU0, JFPA, JALU0], 3>;
+defm : JWriteResYMMPair<WriteVecTestY, [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
+defm : X86WriteResPairUnsupported<WriteVecTestZ>;
+defm : X86WriteResPairUnsupported<WriteShuffle256>;
+defm : X86WriteResPairUnsupported<WriteVarShuffle256>;
////////////////////////////////////////////////////////////////////////////////
-// Carry-less multiplication instructions.
+// Vector insert/extract operations.
////////////////////////////////////////////////////////////////////////////////
-def : WriteRes<WriteCLMul, [JVIMUL]> {
- let Latency = 2;
- let ResourceCycles = [1];
-}
-def : WriteRes<WriteCLMulLd, [JLAGU, JVIMUL]> {
- let Latency = 7;
- let ResourceCycles = [1, 1];
-}
-
-// FIXME: pipe for system/microcode?
-def : WriteRes<WriteSystem, [JAny]> { let Latency = 100; }
-def : WriteRes<WriteMicrocoded, [JAny]> { let Latency = 100; }
-def : WriteRes<WriteFence, [JSAGU]>;
-def : WriteRes<WriteNop, []>;
+defm : X86WriteRes<WriteVecInsert, [JFPU01, JVALU], 7, [1,1], 2>;
+defm : X86WriteRes<WriteVecInsertLd, [JFPU01, JVALU, JLAGU], 4, [1,1,1], 1>;
+defm : X86WriteRes<WriteVecExtract, [JFPU0, JFPA, JALU0], 3, [1,1,1], 1>;
+defm : X86WriteRes<WriteVecExtractSt, [JFPU1, JSTC, JSAGU], 3, [1,1,1], 1>;
////////////////////////////////////////////////////////////////////////////////
-// SSE4.1 instructions.
+// SSE42 String instructions.
////////////////////////////////////////////////////////////////////////////////
-def WriteDPPS: SchedWriteRes<[JFPU0, JFPU1]> {
- let Latency = 11;
- let ResourceCycles = [3,3];
- let NumMicroOps = 5;
-}
-def : InstRW<[WriteDPPS], (instregex "(V)?DPPSrri")>;
-
-def WriteDPPSLd: SchedWriteRes<[JLAGU, JFPU0, JFPU1]> {
- let Latency = 16;
- let ResourceCycles = [1,3,3];
- let NumMicroOps = 6;
-}
-def : InstRW<[WriteDPPSLd], (instregex "(V)?DPPSrmi")>;
-
-def WriteDPPD: SchedWriteRes<[JFPU0, JFPU1]> {
- let Latency = 9;
- let ResourceCycles = [3,3];
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteDPPD], (instregex "(V)?DPPDrri")>;
-
-def WriteDPPDLd: SchedWriteRes<[JLAGU, JFPU0, JFPU1]> {
- let Latency = 14;
- let ResourceCycles = [1,3,3];
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteDPPDLd], (instregex "(V)?DPPDrmi")>;
+defm : JWriteResFpuPair<WritePCmpIStrI, [JFPU1, JVALU1, JFPA, JALU0], 7, [1, 2, 1, 1], 3>;
+defm : JWriteResFpuPair<WritePCmpIStrM, [JFPU1, JVALU1, JFPA, JALU0], 8, [1, 2, 1, 1], 3>;
+defm : JWriteResFpuPair<WritePCmpEStrI, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0], 14, [1, 2, 2, 6, 4, 1, 1], 9>;
+defm : JWriteResFpuPair<WritePCmpEStrM, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0], 14, [1, 2, 2, 6, 4, 1, 1], 9>;
////////////////////////////////////////////////////////////////////////////////
-// SSE4A instructions.
+// MOVMSK Instructions.
////////////////////////////////////////////////////////////////////////////////
-def WriteEXTRQ: SchedWriteRes<[JFPU01]> {
- let Latency = 1;
- let ResourceCycles = [1];
-}
-def : InstRW<[WriteEXTRQ], (instregex "EXTRQ")>;
-
-def WriteINSERTQ: SchedWriteRes<[JFPU01]> {
- let Latency = 2;
- let ResourceCycles = [4];
-}
-def : InstRW<[WriteINSERTQ], (instregex "INSERTQ")>;
+def : WriteRes<WriteFMOVMSK, [JFPU0, JFPA, JALU0]> { let Latency = 3; }
+def : WriteRes<WriteVecMOVMSK, [JFPU0, JFPA, JALU0]> { let Latency = 3; }
+defm : X86WriteResUnsupported<WriteVecMOVMSKY>;
+def : WriteRes<WriteMMXMOVMSK, [JFPU0, JFPA, JALU0]> { let Latency = 3; }
////////////////////////////////////////////////////////////////////////////////
-// F16C instructions.
+// AES Instructions.
////////////////////////////////////////////////////////////////////////////////
-def WriteCVT3: SchedWriteRes<[JFPU1]> {
- let Latency = 3;
-}
-def : InstRW<[WriteCVT3], (instregex "VCVTPS2PHrr")>;
-def : InstRW<[WriteCVT3], (instregex "VCVTPH2PSrr")>;
-
-def WriteCVT3St: SchedWriteRes<[JFPU1, JSAGU]> {
- let Latency = 3;
- let ResourceCycles = [1, 1];
-}
-def : InstRW<[WriteCVT3St], (instregex "VCVTPS2PHmr")>;
-
-def WriteCVT3Ld: SchedWriteRes<[JLAGU, JFPU1]> {
- let Latency = 8;
- let ResourceCycles = [1, 1];
-}
-def : InstRW<[WriteCVT3Ld], (instregex "VCVTPH2PSrm")>;
-
-def WriteCVTPS2PHY: SchedWriteRes<[JFPU1, JFPU01]> {
- let Latency = 6;
- let ResourceCycles = [2,2];
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteCVTPS2PHY], (instregex "VCVTPS2PHYrr")>;
-
-def WriteCVTPS2PHYSt: SchedWriteRes<[JFPU1, JFPU01, JSAGU]> {
- let Latency = 11;
- let ResourceCycles = [2,2,1];
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteCVTPS2PHYSt], (instregex "VCVTPS2PHYmr")>;
-
-def WriteCVTPH2PSY: SchedWriteRes<[JFPU1]> {
- let Latency = 3;
- let ResourceCycles = [2];
- let NumMicroOps = 2;
-}
-def : InstRW<[WriteCVTPH2PSY], (instregex "VCVTPH2PSYrr")>;
-
-def WriteCVTPH2PSYLd: SchedWriteRes<[JLAGU, JFPU1]> {
- let Latency = 8;
- let ResourceCycles = [1,2];
- let NumMicroOps = 2;
-}
-def : InstRW<[WriteCVTPH2PSYLd], (instregex "VCVTPH2PSYrm")>;
+defm : JWriteResFpuPair<WriteAESIMC, [JFPU0, JVIMUL], 2>;
+defm : JWriteResFpuPair<WriteAESKeyGen, [JFPU0, JVIMUL], 2>;
+defm : JWriteResFpuPair<WriteAESDecEnc, [JFPU0, JVIMUL], 3, [1, 1], 2>;
////////////////////////////////////////////////////////////////////////////////
-// AVX instructions.
+// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////
-def WriteVDPPSY: SchedWriteRes<[JFPU1, JFPU0]> {
- let Latency = 12;
- let ResourceCycles = [6, 6];
- let NumMicroOps = 10;
-}
-def : InstRW<[WriteVDPPSY], (instregex "VDPPSYrr")>;
-
-def WriteVDPPSYLd: SchedWriteRes<[JLAGU, JFPU1, JFPU0]> {
- let Latency = 17;
- let ResourceCycles = [1, 6, 6];
- let NumMicroOps = 11;
-}
-def : InstRW<[WriteVDPPSYLd, ReadAfterLd], (instregex "VDPPSYrm")>;
-
-def WriteFAddY: SchedWriteRes<[JFPU0]> {
- let Latency = 3;
- let ResourceCycles = [2];
-}
-def : InstRW<[WriteFAddY], (instregex "VADD(SUB)?P(S|D)Yrr", "VSUBP(S|D)Yrr")>;
-
-def WriteFAddYLd: SchedWriteRes<[JLAGU, JFPU0]> {
- let Latency = 8;
- let ResourceCycles = [1, 2];
-}
-def : InstRW<[WriteFAddYLd, ReadAfterLd], (instregex "VADD(SUB)?P(S|D)Yrm", "VSUBP(S|D)Yrm")>;
+defm : JWriteResFpuPair<WriteFHAdd, [JFPU0, JFPA], 3>;
+defm : JWriteResYMMPair<WriteFHAddY, [JFPU0, JFPA], 3, [2,2], 2>;
+defm : JWriteResFpuPair<WritePHAdd, [JFPU01, JVALU], 1>;
+defm : JWriteResFpuPair<WritePHAddX, [JFPU01, JVALU], 1>;
+defm : X86WriteResPairUnsupported<WritePHAddY>;
-def WriteFDivY: SchedWriteRes<[JFPU1]> {
- let Latency = 38;
- let ResourceCycles = [38];
-}
-def : InstRW<[WriteFDivY], (instregex "VDIVP(D|S)Yrr")>;
-
-def WriteFDivYLd: SchedWriteRes<[JLAGU, JFPU1]> {
- let Latency = 43;
- let ResourceCycles = [1, 38];
-}
-def : InstRW<[WriteFDivYLd, ReadAfterLd], (instregex "VDIVP(S|D)Yrm")>;
+////////////////////////////////////////////////////////////////////////////////
+// Carry-less multiplication instructions.
+////////////////////////////////////////////////////////////////////////////////
-def WriteVMULYPD: SchedWriteRes<[JFPU1]> {
- let Latency = 4;
- let ResourceCycles = [4];
-}
-def : InstRW<[WriteVMULYPD], (instregex "VMULPDYrr")>;
+defm : JWriteResFpuPair<WriteCLMul, [JFPU0, JVIMUL], 2>;
-def WriteVMULYPDLd: SchedWriteRes<[JLAGU, JFPU1]> {
- let Latency = 9;
- let ResourceCycles = [1, 4];
-}
-def : InstRW<[WriteVMULYPDLd, ReadAfterLd], (instregex "VMULPDYrm")>;
+////////////////////////////////////////////////////////////////////////////////
+// SSE4A instructions.
+////////////////////////////////////////////////////////////////////////////////
-def WriteVMULYPS: SchedWriteRes<[JFPU1]> {
+def JWriteINSERTQ: SchedWriteRes<[JFPU01, JVALU]> {
let Latency = 2;
- let ResourceCycles = [2];
-}
-def : InstRW<[WriteVMULYPS], (instregex "VMULPSYrr", "VRCPPSYr", "VRSQRTPSYr")>;
-
-def WriteVMULYPSLd: SchedWriteRes<[JLAGU, JFPU1]> {
- let Latency = 7;
- let ResourceCycles = [1, 2];
-}
-def : InstRW<[WriteVMULYPSLd, ReadAfterLd], (instregex "VMULPSYrm", "VRCPPSYm", "VRSQRTPSYm")>;
-
-def WriteVCVTY: SchedWriteRes<[JSTC]> {
- let Latency = 3;
- let ResourceCycles = [2];
-}
-def : InstRW<[WriteVCVTY], (instregex "VCVTDQ2P(S|D)Yrr")>;
-def : InstRW<[WriteVCVTY], (instregex "VROUNDYP(S|D)r")>;
-def : InstRW<[WriteVCVTY], (instregex "VCVTPS2DQYrr")>;
-def : InstRW<[WriteVCVTY], (instregex "VCVTTPS2DQYrr")>;
-
-def WriteVCVTYLd: SchedWriteRes<[JLAGU, JSTC]> {
- let Latency = 8;
- let ResourceCycles = [1, 2];
+ let ResourceCycles = [1, 4];
}
-def : InstRW<[WriteVCVTYLd, ReadAfterLd], (instregex "VCVTDQ2P(S|D)Yrm")>;
-def : InstRW<[WriteVCVTYLd, ReadAfterLd], (instregex "VROUNDYP(S|D)m")>;
-def : InstRW<[WriteVCVTYLd, ReadAfterLd], (instregex "VCVTPS2DQYrm")>;
-def : InstRW<[WriteVCVTYLd, ReadAfterLd], (instregex "VCVTTPS2DQYrm")>;
+def : InstRW<[JWriteINSERTQ], (instrs INSERTQ, INSERTQI)>;
-def WriteVMONTPSt: SchedWriteRes<[JSTC, JLAGU]> {
- let Latency = 3;
- let ResourceCycles = [2,1];
-}
-def : InstRW<[WriteVMONTPSt], (instregex "VMOVNTP(S|D)Ymr")>;
-def : InstRW<[WriteVMONTPSt], (instregex "VMOVNTDQYmr")>;
+////////////////////////////////////////////////////////////////////////////////
+// AVX instructions.
+////////////////////////////////////////////////////////////////////////////////
-def WriteVCVTPDY: SchedWriteRes<[JSTC, JFPU01]> {
+def JWriteVBROADCASTYLd: SchedWriteRes<[JLAGU, JFPU01, JFPX]> {
let Latency = 6;
- let ResourceCycles = [2, 4];
-}
-def : InstRW<[WriteVCVTPDY], (instregex "VCVTPD2(DQ|PS)Yrr")>;
-def : InstRW<[WriteVCVTPDY], (instregex "VCVTTPD2DQYrr")>;
-
-def WriteVCVTPDYLd: SchedWriteRes<[JLAGU, JSTC, JFPU01]> {
- let Latency = 11;
let ResourceCycles = [1, 2, 4];
+ let NumMicroOps = 2;
}
-def : InstRW<[WriteVCVTPDYLd, ReadAfterLd], (instregex "VCVTPD2(DQ|PS)Yrm")>;
-def : InstRW<[WriteVCVTPDYLd, ReadAfterLd], (instregex "VCVTTPD2DQYrm")>;
-
-def WriteVBlendVPY: SchedWriteRes<[JFPU01]> {
- let Latency = 3;
- let ResourceCycles = [6];
-}
-def : InstRW<[WriteVBlendVPY], (instregex "VBLENDVP(S|D)Yrr", "VPERMILP(D|S)Yrr")>;
+def : InstRW<[JWriteVBROADCASTYLd, ReadAfterLd], (instrs VBROADCASTSDYrm,
+ VBROADCASTSSYrm)>;
-def WriteVBlendVPYLd: SchedWriteRes<[JLAGU, JFPU01]> {
- let Latency = 8;
- let ResourceCycles = [1, 6];
+def JWriteJVZEROALL: SchedWriteRes<[]> {
+ let Latency = 90;
+ let NumMicroOps = 73;
}
-def : InstRW<[WriteVBlendVPYLd, ReadAfterLd], (instregex "VBLENDVP(S|D)Yrm")>;
+def : InstRW<[JWriteJVZEROALL], (instrs VZEROALL)>;
-def WriteVBROADCASTYLd: SchedWriteRes<[JLAGU, JFPU01]> {
- let Latency = 6;
- let ResourceCycles = [1, 4];
+def JWriteJVZEROUPPER: SchedWriteRes<[]> {
+ let Latency = 46;
+ let NumMicroOps = 37;
}
-def : InstRW<[WriteVBROADCASTYLd, ReadAfterLd], (instregex "VBROADCASTS(S|D)Yrm")>;
-
-def WriteFPAY22: SchedWriteRes<[JFPU0]> {
+def : InstRW<[JWriteJVZEROUPPER], (instrs VZEROUPPER)>;
+
+///////////////////////////////////////////////////////////////////////////////
+// SchedWriteVariant definitions.
+///////////////////////////////////////////////////////////////////////////////
+
+def JWriteZeroLatency : SchedWriteRes<[]> {
+ let Latency = 0;
+}
+
+// Certain instructions that use the same register for both source
+// operands do not have a real dependency on the previous contents of the
+// register, and thus do not have to wait before completing. They can be
+// optimized out at the register renaming stage.
+// Reference: Section 10.8 of the "Software Optimization Guide for AMD Family
+// 15h Processors".
+// Reference: Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
+// Section 21.8 [Dependency-breaking instructions].
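+// For example, (XOR32rr %eax, %eax) and (SUB32rr %eax, %eax) always produce
+// zero regardless of the previous value of %eax, so they carry no true
+// dependency and can complete with zero latency once renamed.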
+
+def JWriteZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
+ SchedVar<MCSchedPredicate<TruePred>, [WriteALU]>
+]>;
+def : InstRW<[JWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
+ XOR32rr, XOR64rr)>;
+
+def JWriteFZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
+ SchedVar<MCSchedPredicate<TruePred>, [WriteFLogic]>
+]>;
+def : InstRW<[JWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr, VXORPDrr,
+ ANDNPSrr, VANDNPSrr,
+ ANDNPDrr, VANDNPDrr)>;
+
+def JWriteVZeroIdiomLogic : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
+ SchedVar<MCSchedPredicate<TruePred>, [WriteVecLogic]>
+]>;
+def : InstRW<[JWriteVZeroIdiomLogic], (instrs MMX_PXORirr, MMX_PANDNirr)>;
+
+def JWriteVZeroIdiomLogicX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
+ SchedVar<MCSchedPredicate<TruePred>, [WriteVecLogicX]>
+]>;
+def : InstRW<[JWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr,
+ PANDNrr, VPANDNrr)>;
+
+def JWriteVZeroIdiomALU : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
+ SchedVar<MCSchedPredicate<TruePred>, [WriteVecALU]>
+]>;
+def : InstRW<[JWriteVZeroIdiomALU], (instrs MMX_PSUBBirr, MMX_PSUBDirr,
+ MMX_PSUBQirr, MMX_PSUBWirr,
+ MMX_PCMPGTBirr, MMX_PCMPGTDirr,
+ MMX_PCMPGTWirr)>;
+
+def JWriteVZeroIdiomALUX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
+ SchedVar<MCSchedPredicate<TruePred>, [WriteVecALUX]>
+]>;
+def : InstRW<[JWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr,
+ PSUBDrr, VPSUBDrr,
+ PSUBQrr, VPSUBQrr,
+ PSUBWrr, VPSUBWrr,
+ PCMPGTBrr, VPCMPGTBrr,
+ PCMPGTDrr, VPCMPGTDrr,
+ PCMPGTQrr, VPCMPGTQrr,
+ PCMPGTWrr, VPCMPGTWrr)>;
+
+// This write is used for slow LEA instructions.
+def JWrite3OpsLEA : SchedWriteRes<[JALU1, JSAGU]> {
let Latency = 2;
- let ResourceCycles = [2];
-}
-def : InstRW<[WriteFPAY22], (instregex "VCMPP(S|D)Yrri", "VM(AX|IN)P(D|S)Yrr")>;
-
-def WriteFPAY22Ld: SchedWriteRes<[JLAGU, JFPU0]> {
- let Latency = 7;
- let ResourceCycles = [1, 2];
-}
-def : InstRW<[WriteFPAY22Ld, ReadAfterLd], (instregex "VCMPP(S|D)Yrmi", "VM(AX|IN)P(D|S)Yrm")>;
-
-def WriteVHAddSubY: SchedWriteRes<[JFPU0]> {
- let Latency = 3;
- let ResourceCycles = [2];
-}
-def : InstRW<[WriteVHAddSubY], (instregex "VH(ADD|SUB)P(D|S)Yrr")>;
-
-def WriteVHAddSubYLd: SchedWriteRes<[JLAGU, JFPU0]> {
- let Latency = 8;
- let ResourceCycles = [1, 2];
-}
-def : InstRW<[WriteVHAddSubYLd], (instregex "VH(ADD|SUB)P(D|S)Yrm")>;
-
-def WriteVMaskMovLd: SchedWriteRes<[JLAGU,JFPU01]> {
- let Latency = 6;
- let ResourceCycles = [1, 2];
-}
-def : InstRW<[WriteVMaskMovLd], (instregex "VMASKMOVP(D|S)rm")>;
-
-def WriteVMaskMovYLd: SchedWriteRes<[JLAGU,JFPU01]> {
- let Latency = 6;
- let ResourceCycles = [1, 4];
-}
-def : InstRW<[WriteVMaskMovYLd], (instregex "VMASKMOVP(D|S)Yrm")>;
-
-def WriteVMaskMovSt: SchedWriteRes<[JFPU01,JSAGU]> {
- let Latency = 6;
- let ResourceCycles = [4, 1];
-}
-def : InstRW<[WriteVMaskMovSt], (instregex "VMASKMOVP(D|S)mr")>;
-
-def WriteVMaskMovYSt: SchedWriteRes<[JFPU01,JSAGU]> {
- let Latency = 6;
- let ResourceCycles = [4, 1];
-}
-def : InstRW<[WriteVMaskMovYSt], (instregex "VMASKMOVP(D|S)Ymr")>;
-
-// TODO: In fact we have latency '2+i'. The +i represents an additional 1 cycle transfer
-// operation which moves the floating point result to the integer unit. During this
-// additional cycle the floating point unit execution resources are not occupied
-// and ALU0 in the integer unit is occupied instead.
-def WriteVMOVMSK: SchedWriteRes<[JFPU0]> {
- let Latency = 3;
-}
-def : InstRW<[WriteVMOVMSK], (instregex "VMOVMSKP(D|S)(Y)?rr")>;
-
-// TODO: In fact we have latency '3+i'. The +i represents an additional 1 cycle transfer
-// operation which moves the floating point result to the integer unit. During this
-// additional cycle the floating point unit execution resources are not occupied
-// and ALU0 in the integer unit is occupied instead.
-def WriteVTESTY: SchedWriteRes<[JFPU01, JFPU0]> {
- let Latency = 4;
- let ResourceCycles = [2, 2];
- let NumMicroOps = 3;
}
-def : InstRW<[WriteVTESTY], (instregex "VTESTP(S|D)Yrr")>;
-def : InstRW<[WriteVTESTY], (instregex "VPTESTYrr")>;
-def WriteVTESTYLd: SchedWriteRes<[JLAGU, JFPU01, JFPU0]> {
- let Latency = 9;
- let ResourceCycles = [1, 2, 2];
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteVTESTYLd], (instregex "VTESTP(S|D)Yrm")>;
-def : InstRW<[WriteVTESTYLd], (instregex "VPTESTYrm")>;
-
-def WriteVTEST: SchedWriteRes<[JFPU0]> {
+// On Jaguar, a slow LEA is either a three-operand LEA (base, index, offset),
+// or an LEA with a `Scale` value other than 1.
+def JSlowLEAPredicate : MCSchedPredicate<
+ CheckAny<[
+ // A 3-operand LEA (base, index, offset).
+ IsThreeOperandsLEAFn,
+ // An LEA with a "Scale" other than 1.
+ CheckAll<[
+ CheckIsImmOperand<2>,
+ CheckNot<CheckImmOperand<2, 1>>
+ ]>
+ ]>
+>;
+
+def JWriteLEA : SchedWriteVariant<[
+ SchedVar<JSlowLEAPredicate, [JWrite3OpsLEA]>,
+ SchedVar<MCSchedPredicate<TruePred>, [WriteLEA]>
+]>;
+
+def : InstRW<[JWriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
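+// For example, "lea eax, [rbx + 4*rcx + 8]" uses base, index and displacement
+// (and a scale of 4), so it matches JSlowLEAPredicate and is scheduled as
+// JWrite3OpsLEA, while a simple "lea eax, [rbx + 8]" falls through to the
+// default WriteLEA.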
+
+def JSlowLEA16r : SchedWriteRes<[JALU01]> {
let Latency = 3;
+ let ResourceCycles = [4];
}
-def : InstRW<[WriteVTEST], (instregex "VTESTP(S|D)rr")>;
-def : InstRW<[WriteVTEST], (instregex "VPTESTrr")>;
-
-def WriteVTESTLd: SchedWriteRes<[JLAGU, JFPU0]> {
- let Latency = 8;
-}
-def : InstRW<[WriteVTESTLd], (instregex "VTESTP(S|D)rm")>;
-def : InstRW<[WriteVTESTLd], (instregex "VPTESTrm")>;
-
-def WriteVSQRTYPD: SchedWriteRes<[JFPU1]> {
- let Latency = 54;
- let ResourceCycles = [54];
-}
-def : InstRW<[WriteVSQRTYPD], (instregex "VSQRTPDYr")>;
-
-def WriteVSQRTYPDLd: SchedWriteRes<[JLAGU, JFPU1]> {
- let Latency = 59;
- let ResourceCycles = [1, 54];
-}
-def : InstRW<[WriteVSQRTYPDLd], (instregex "VSQRTPDYm")>;
-
-def WriteVSQRTYPS: SchedWriteRes<[JFPU1]> {
- let Latency = 42;
- let ResourceCycles = [42];
-}
-def : InstRW<[WriteVSQRTYPS], (instregex "VSQRTPSYr")>;
-
-def WriteVSQRTYPSLd: SchedWriteRes<[JLAGU, JFPU1]> {
- let Latency = 47;
- let ResourceCycles = [1, 42];
-}
-def : InstRW<[WriteVSQRTYPSLd], (instregex "VSQRTPSYm")>;
-def WriteJVZEROALL: SchedWriteRes<[]> {
- let Latency = 90;
- let NumMicroOps = 73;
-}
-def : InstRW<[WriteJVZEROALL], (instregex "VZEROALL")>;
+def : InstRW<[JSlowLEA16r], (instrs LEA16r)>;
-def WriteJVZEROUPPER: SchedWriteRes<[]> {
- let Latency = 46;
- let NumMicroOps = 37;
-}
-def : InstRW<[WriteJVZEROUPPER], (instregex "VZEROUPPER")>;
} // SchedModel
-
diff --git a/lib/Target/X86/X86ScheduleSLM.td b/lib/Target/X86/X86ScheduleSLM.td
index 35ec7488db72..c938a4a8939e 100644
--- a/lib/Target/X86/X86ScheduleSLM.td
+++ b/lib/Target/X86/X86ScheduleSLM.td
@@ -32,19 +32,19 @@ def SLMModel : SchedMachineModel {
let SchedModel = SLMModel in {
// Silvermont has 5 reservation stations for micro-ops
-def IEC_RSV0 : ProcResource<1>;
-def IEC_RSV1 : ProcResource<1>;
-def FPC_RSV0 : ProcResource<1> { let BufferSize = 1; }
-def FPC_RSV1 : ProcResource<1> { let BufferSize = 1; }
-def MEC_RSV : ProcResource<1>;
+def SLM_IEC_RSV0 : ProcResource<1>;
+def SLM_IEC_RSV1 : ProcResource<1>;
+def SLM_FPC_RSV0 : ProcResource<1> { let BufferSize = 1; }
+def SLM_FPC_RSV1 : ProcResource<1> { let BufferSize = 1; }
+def SLM_MEC_RSV : ProcResource<1>;
// Many micro-ops are capable of issuing on multiple ports.
-def IEC_RSV01 : ProcResGroup<[IEC_RSV0, IEC_RSV1]>;
-def FPC_RSV01 : ProcResGroup<[FPC_RSV0, FPC_RSV1]>;
+def SLM_IEC_RSV01 : ProcResGroup<[SLM_IEC_RSV0, SLM_IEC_RSV1]>;
+def SLM_FPC_RSV01 : ProcResGroup<[SLM_FPC_RSV0, SLM_FPC_RSV1]>;
-def SMDivider : ProcResource<1>;
-def SMFPMultiplier : ProcResource<1>;
-def SMFPDivider : ProcResource<1>;
+def SLMDivider : ProcResource<1>;
+def SLMFPMultiplier : ProcResource<1>;
+def SLMFPDivider : ProcResource<1>;
// Loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
// cycles after the memory operand.
@@ -55,209 +55,426 @@ def : ReadAdvance<ReadAfterLd, 3>;
// as two micro-ops when queued in the reservation station.
// This multiclass defines the resource usage for variants with and without
// folded loads.
-multiclass SMWriteResPair<X86FoldableSchedWrite SchedRW,
- ProcResourceKind ExePort,
- int Lat> {
+multiclass SLMWriteResPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [1], int UOps = 1,
+ int LoadLat = 3> {
// Register variant is using a single cycle on ExePort.
- def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
- // Memory variant also uses a cycle on MEC_RSV and adds 3 cycles to the
- // latency.
- def : WriteRes<SchedRW.Folded, [MEC_RSV, ExePort]> {
- let Latency = !add(Lat, 3);
+ // Memory variant also uses a cycle on MEC_RSV and adds LoadLat cycles to
+ // the latency (default = 3).
+ def : WriteRes<SchedRW.Folded, !listconcat([SLM_MEC_RSV], ExePorts)> {
+ let Latency = !add(Lat, LoadLat);
+ let ResourceCycles = !listconcat([1], Res);
+ let NumMicroOps = UOps;
}
}
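// For example, SLMWriteResPair<WriteIMul, [SLM_IEC_RSV1], 3> (defined below)
// expands to a 3-cycle WriteIMul on SLM_IEC_RSV1 plus a WriteIMulLd variant
// that also takes a cycle on SLM_MEC_RSV and has a latency of 3 + 3 = 6.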
// A folded store needs a cycle on MEC_RSV for the store data, but it does not
// need an extra port cycle to recompute the address.
-def : WriteRes<WriteRMW, [MEC_RSV]>;
+def : WriteRes<WriteRMW, [SLM_MEC_RSV]>;
+
+def : WriteRes<WriteStore, [SLM_IEC_RSV01, SLM_MEC_RSV]>;
+def : WriteRes<WriteStoreNT, [SLM_IEC_RSV01, SLM_MEC_RSV]>;
+def : WriteRes<WriteLoad, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteMove, [SLM_IEC_RSV01]>;
+def : WriteRes<WriteZero, []>;
-def : WriteRes<WriteStore, [IEC_RSV01, MEC_RSV]>;
-def : WriteRes<WriteLoad, [MEC_RSV]> { let Latency = 3; }
-def : WriteRes<WriteMove, [IEC_RSV01]>;
-def : WriteRes<WriteZero, []>;
+// Load/store MXCSR.
+// FIXME: These are probably wrong. They are copy-pasted from WriteStore/Load.
+def : WriteRes<WriteSTMXCSR, [SLM_IEC_RSV01, SLM_MEC_RSV]>;
+def : WriteRes<WriteLDMXCSR, [SLM_MEC_RSV]> { let Latency = 3; }
// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;
-defm : SMWriteResPair<WriteALU, IEC_RSV01, 1>;
-defm : SMWriteResPair<WriteIMul, IEC_RSV1, 3>;
-defm : SMWriteResPair<WriteShift, IEC_RSV0, 1>;
-defm : SMWriteResPair<WriteJump, IEC_RSV1, 1>;
+defm : SLMWriteResPair<WriteALU, [SLM_IEC_RSV01], 1>;
+defm : SLMWriteResPair<WriteADC, [SLM_IEC_RSV01], 1>;
+defm : SLMWriteResPair<WriteIMul, [SLM_IEC_RSV1], 3>;
+defm : SLMWriteResPair<WriteIMul64, [SLM_IEC_RSV1], 3>;
+
+defm : SLMWriteResPair<WriteBSWAP32,[SLM_IEC_RSV01], 1>;
+defm : SLMWriteResPair<WriteBSWAP64,[SLM_IEC_RSV01], 1>;
+
+defm : SLMWriteResPair<WriteShift, [SLM_IEC_RSV0], 1>;
+defm : SLMWriteResPair<WriteShiftDouble, [SLM_IEC_RSV0], 1>;
+defm : SLMWriteResPair<WriteJump, [SLM_IEC_RSV1], 1>;
+defm : SLMWriteResPair<WriteCRC32, [SLM_IEC_RSV1], 3>;
+
+defm : SLMWriteResPair<WriteCMOV, [SLM_IEC_RSV01], 2, [2]>;
+defm : SLMWriteResPair<WriteCMOV2, [SLM_IEC_RSV01], 2, [2]>;
+defm : X86WriteRes<WriteFCMOV, [SLM_FPC_RSV1], 3, [1], 1>; // x87 conditional move.
+def : WriteRes<WriteSETCC, [SLM_IEC_RSV01]>;
+def : WriteRes<WriteSETCCStore, [SLM_IEC_RSV01, SLM_MEC_RSV]> {
+ // FIXME: Latency and NumMicroOps?
+ let ResourceCycles = [2,1];
+}
+def : WriteRes<WriteLAHFSAHF, [SLM_IEC_RSV01]>;
// This is for simple LEAs with one or two input operands.
// The complex ones can only execute on port 1, and they require two cycles on
// the port to read all inputs. We don't model that.
-def : WriteRes<WriteLEA, [IEC_RSV1]>;
-
-// This is quite rough, latency depends on the dividend.
-def : WriteRes<WriteIDiv, [IEC_RSV01, SMDivider]> {
- let Latency = 25;
- let ResourceCycles = [1, 25];
-}
-def : WriteRes<WriteIDivLd, [MEC_RSV, IEC_RSV01, SMDivider]> {
- let Latency = 29;
- let ResourceCycles = [1, 1, 25];
-}
+def : WriteRes<WriteLEA, [SLM_IEC_RSV1]>;
+
+// Bit counts.
+defm : SLMWriteResPair<WriteBSF, [SLM_IEC_RSV01], 10, [20], 10>;
+defm : SLMWriteResPair<WriteBSR, [SLM_IEC_RSV01], 10, [20], 10>;
+defm : SLMWriteResPair<WriteLZCNT, [SLM_IEC_RSV0], 3>;
+defm : SLMWriteResPair<WriteTZCNT, [SLM_IEC_RSV0], 3>;
+defm : SLMWriteResPair<WritePOPCNT, [SLM_IEC_RSV0], 3>;
+
+// BMI1 BEXTR, BMI2 BZHI
+defm : X86WriteResPairUnsupported<WriteBEXTR>;
+defm : X86WriteResPairUnsupported<WriteBZHI>;
+
+defm : SLMWriteResPair<WriteDiv8, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
+defm : SLMWriteResPair<WriteDiv16, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
+defm : SLMWriteResPair<WriteDiv32, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
+defm : SLMWriteResPair<WriteDiv64, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
+defm : SLMWriteResPair<WriteIDiv8, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
+defm : SLMWriteResPair<WriteIDiv16, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
+defm : SLMWriteResPair<WriteIDiv32, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
+defm : SLMWriteResPair<WriteIDiv64, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
// Scalar and vector floating point.
-defm : SMWriteResPair<WriteFAdd, FPC_RSV1, 3>;
-defm : SMWriteResPair<WriteFRcp, FPC_RSV0, 5>;
-defm : SMWriteResPair<WriteFRsqrt, FPC_RSV0, 5>;
-defm : SMWriteResPair<WriteFSqrt, FPC_RSV0, 15>;
-defm : SMWriteResPair<WriteCvtF2I, FPC_RSV01, 4>;
-defm : SMWriteResPair<WriteCvtI2F, FPC_RSV01, 4>;
-defm : SMWriteResPair<WriteCvtF2F, FPC_RSV01, 4>;
-defm : SMWriteResPair<WriteFShuffle, FPC_RSV0, 1>;
-defm : SMWriteResPair<WriteFBlend, FPC_RSV0, 1>;
-
-// This is quite rough, latency depends on precision
-def : WriteRes<WriteFMul, [FPC_RSV0, SMFPMultiplier]> {
- let Latency = 5;
- let ResourceCycles = [1, 2];
-}
-def : WriteRes<WriteFMulLd, [MEC_RSV, FPC_RSV0, SMFPMultiplier]> {
- let Latency = 8;
- let ResourceCycles = [1, 1, 2];
-}
-
-def : WriteRes<WriteFDiv, [FPC_RSV0, SMFPDivider]> {
- let Latency = 34;
- let ResourceCycles = [1, 34];
-}
-def : WriteRes<WriteFDivLd, [MEC_RSV, FPC_RSV0, SMFPDivider]> {
- let Latency = 37;
- let ResourceCycles = [1, 1, 34];
-}
+defm : X86WriteRes<WriteFLD0, [SLM_FPC_RSV01], 1, [1], 1>;
+defm : X86WriteRes<WriteFLD1, [SLM_FPC_RSV01], 1, [1], 1>;
+defm : X86WriteRes<WriteFLDC, [SLM_FPC_RSV01], 1, [2], 2>;
+def : WriteRes<WriteFLoad, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteFLoadX, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteFLoadY, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteFMaskedLoad, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteFMaskedLoadY, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteFStore, [SLM_MEC_RSV]>;
+def : WriteRes<WriteFStoreX, [SLM_MEC_RSV]>;
+def : WriteRes<WriteFStoreY, [SLM_MEC_RSV]>;
+def : WriteRes<WriteFStoreNT, [SLM_MEC_RSV]>;
+def : WriteRes<WriteFStoreNTX, [SLM_MEC_RSV]>;
+def : WriteRes<WriteFStoreNTY, [SLM_MEC_RSV]>;
+def : WriteRes<WriteFMaskedStore, [SLM_MEC_RSV]>;
+def : WriteRes<WriteFMaskedStoreY, [SLM_MEC_RSV]>;
+def : WriteRes<WriteFMove, [SLM_FPC_RSV01]>;
+def : WriteRes<WriteFMoveX, [SLM_FPC_RSV01]>;
+def : WriteRes<WriteFMoveY, [SLM_FPC_RSV01]>;
+defm : X86WriteRes<WriteEMMS, [SLM_FPC_RSV01], 10, [10], 9>;
+
+defm : SLMWriteResPair<WriteFAdd, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFAddX, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFAddY, [SLM_FPC_RSV1], 3>;
+defm : X86WriteResPairUnsupported<WriteFAddZ>;
+defm : SLMWriteResPair<WriteFAdd64, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFAdd64X, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFAdd64Y, [SLM_FPC_RSV1], 3>;
+defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
+defm : SLMWriteResPair<WriteFCmp, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFCmpX, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFCmpY, [SLM_FPC_RSV1], 3>;
+defm : X86WriteResPairUnsupported<WriteFCmpZ>;
+defm : SLMWriteResPair<WriteFCmp64, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFCmp64X, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFCmp64Y, [SLM_FPC_RSV1], 3>;
+defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
+defm : SLMWriteResPair<WriteFCom, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFMul, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>;
+defm : SLMWriteResPair<WriteFMulX, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>;
+defm : SLMWriteResPair<WriteFMulY, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>;
+defm : X86WriteResPairUnsupported<WriteFMulZ>;
+defm : SLMWriteResPair<WriteFMul64, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>;
+defm : SLMWriteResPair<WriteFMul64X, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>;
+defm : SLMWriteResPair<WriteFMul64Y, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>;
+defm : X86WriteResPairUnsupported<WriteFMul64Z>;
+defm : SLMWriteResPair<WriteFDiv, [SLM_FPC_RSV0, SLMFPDivider], 19, [1,17]>;
+defm : SLMWriteResPair<WriteFDivX, [SLM_FPC_RSV0, SLMFPDivider], 39, [1,39]>;
+defm : SLMWriteResPair<WriteFDivY, [SLM_FPC_RSV0, SLMFPDivider], 39, [1,39]>;
+defm : X86WriteResPairUnsupported<WriteFDivZ>;
+defm : SLMWriteResPair<WriteFDiv64, [SLM_FPC_RSV0, SLMFPDivider], 34, [1,32]>;
+defm : SLMWriteResPair<WriteFDiv64X, [SLM_FPC_RSV0, SLMFPDivider], 69, [1,69]>;
+defm : SLMWriteResPair<WriteFDiv64Y, [SLM_FPC_RSV0, SLMFPDivider], 69, [1,69]>;
+defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
+defm : SLMWriteResPair<WriteFRcp, [SLM_FPC_RSV0], 5>;
+defm : SLMWriteResPair<WriteFRcpX, [SLM_FPC_RSV0], 5>;
+defm : SLMWriteResPair<WriteFRcpY, [SLM_FPC_RSV0], 5>;
+defm : X86WriteResPairUnsupported<WriteFRcpZ>;
+defm : SLMWriteResPair<WriteFRsqrt, [SLM_FPC_RSV0], 5>;
+defm : SLMWriteResPair<WriteFRsqrtX, [SLM_FPC_RSV0], 5>;
+defm : SLMWriteResPair<WriteFRsqrtY, [SLM_FPC_RSV0], 5>;
+defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
+defm : SLMWriteResPair<WriteFSqrt, [SLM_FPC_RSV0,SLMFPDivider], 20, [1,20], 1, 3>;
+defm : SLMWriteResPair<WriteFSqrtX, [SLM_FPC_RSV0,SLMFPDivider], 41, [1,40], 1, 3>;
+defm : SLMWriteResPair<WriteFSqrtY, [SLM_FPC_RSV0,SLMFPDivider], 41, [1,40], 1, 3>;
+defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
+defm : SLMWriteResPair<WriteFSqrt64, [SLM_FPC_RSV0,SLMFPDivider], 35, [1,35], 1, 3>;
+defm : SLMWriteResPair<WriteFSqrt64X, [SLM_FPC_RSV0,SLMFPDivider], 71, [1,70], 1, 3>;
+defm : SLMWriteResPair<WriteFSqrt64Y, [SLM_FPC_RSV0,SLMFPDivider], 71, [1,70], 1, 3>;
+defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
+defm : SLMWriteResPair<WriteFSqrt80, [SLM_FPC_RSV0,SLMFPDivider], 40, [1,40]>;
+defm : SLMWriteResPair<WriteDPPD, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteDPPS, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteDPPSY, [SLM_FPC_RSV1], 3>;
+defm : X86WriteResPairUnsupported<WriteDPPSZ>;
+defm : SLMWriteResPair<WriteFSign, [SLM_FPC_RSV01], 1>;
+defm : SLMWriteResPair<WriteFRnd, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFRndY, [SLM_FPC_RSV1], 3>;
+defm : X86WriteResPairUnsupported<WriteFRndZ>;
+defm : SLMWriteResPair<WriteFLogic, [SLM_FPC_RSV01], 1>;
+defm : SLMWriteResPair<WriteFLogicY, [SLM_FPC_RSV01], 1>;
+defm : X86WriteResPairUnsupported<WriteFLogicZ>;
+defm : SLMWriteResPair<WriteFTest, [SLM_FPC_RSV01], 1>;
+defm : SLMWriteResPair<WriteFTestY, [SLM_FPC_RSV01], 1>;
+defm : X86WriteResPairUnsupported<WriteFTestZ>;
+defm : SLMWriteResPair<WriteFShuffle, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteFShuffleY, [SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
+defm : SLMWriteResPair<WriteFVarShuffle, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteFVarShuffleY,[SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
+defm : SLMWriteResPair<WriteFBlend, [SLM_FPC_RSV0], 1>;
+
+// Conversion between integer and float.
+defm : SLMWriteResPair<WriteCvtSS2I, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtPS2I, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtPS2IY, [SLM_FPC_RSV01], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
+defm : SLMWriteResPair<WriteCvtSD2I, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtPD2I, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtPD2IY, [SLM_FPC_RSV01], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
+
+defm : SLMWriteResPair<WriteCvtI2SS, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtI2PS, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtI2PSY, [SLM_FPC_RSV01], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
+defm : SLMWriteResPair<WriteCvtI2SD, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtI2PD, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtI2PDY, [SLM_FPC_RSV01], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
+
+defm : SLMWriteResPair<WriteCvtSS2SD, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtPS2PD, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtPS2PDY, [SLM_FPC_RSV01], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
+defm : SLMWriteResPair<WriteCvtSD2SS, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtPD2PS, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtPD2PSY, [SLM_FPC_RSV01], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
// Vector integer operations.
-defm : SMWriteResPair<WriteVecShift, FPC_RSV0, 1>;
-defm : SMWriteResPair<WriteVecLogic, FPC_RSV01, 1>;
-defm : SMWriteResPair<WriteVecALU, FPC_RSV01, 1>;
-defm : SMWriteResPair<WriteVecIMul, FPC_RSV0, 4>;
-defm : SMWriteResPair<WriteShuffle, FPC_RSV0, 1>;
-defm : SMWriteResPair<WriteBlend, FPC_RSV0, 1>;
-defm : SMWriteResPair<WriteMPSAD, FPC_RSV0, 7>;
+def : WriteRes<WriteVecLoad, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteVecLoadX, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteVecLoadY, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteVecLoadNT, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteVecLoadNTY, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteVecMaskedLoad, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteVecMaskedLoadY, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteVecStore, [SLM_MEC_RSV]>;
+def : WriteRes<WriteVecStoreX, [SLM_MEC_RSV]>;
+def : WriteRes<WriteVecStoreY, [SLM_MEC_RSV]>;
+def : WriteRes<WriteVecStoreNT, [SLM_MEC_RSV]>;
+def : WriteRes<WriteVecStoreNTY, [SLM_MEC_RSV]>;
+def : WriteRes<WriteVecMaskedStore, [SLM_MEC_RSV]>;
+def : WriteRes<WriteVecMaskedStoreY, [SLM_MEC_RSV]>;
+def : WriteRes<WriteVecMove, [SLM_FPC_RSV01]>;
+def : WriteRes<WriteVecMoveX, [SLM_FPC_RSV01]>;
+def : WriteRes<WriteVecMoveY, [SLM_FPC_RSV01]>;
+def : WriteRes<WriteVecMoveToGpr, [SLM_IEC_RSV01]>;
+def : WriteRes<WriteVecMoveFromGpr, [SLM_IEC_RSV01]>;
+
+defm : SLMWriteResPair<WriteVecShift, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteVecShiftX, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteVecShiftY, [SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
+defm : SLMWriteResPair<WriteVecShiftImm, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteVecShiftImmX,[SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteVecShiftImmY,[SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
+defm : SLMWriteResPair<WriteVecLogic, [SLM_FPC_RSV01], 1>;
+defm : SLMWriteResPair<WriteVecLogicX,[SLM_FPC_RSV01], 1>;
+defm : SLMWriteResPair<WriteVecLogicY,[SLM_FPC_RSV01], 1>;
+defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
+defm : SLMWriteResPair<WriteVecTest, [SLM_FPC_RSV01], 1>;
+defm : SLMWriteResPair<WriteVecTestY, [SLM_FPC_RSV01], 1>;
+defm : X86WriteResPairUnsupported<WriteVecTestZ>;
+defm : SLMWriteResPair<WriteVecALU, [SLM_FPC_RSV01], 1>;
+defm : SLMWriteResPair<WriteVecALUX, [SLM_FPC_RSV01], 1>;
+defm : SLMWriteResPair<WriteVecALUY, [SLM_FPC_RSV01], 1>;
+defm : X86WriteResPairUnsupported<WriteVecALUZ>;
+defm : SLMWriteResPair<WriteVecIMul, [SLM_FPC_RSV0], 4>;
+defm : SLMWriteResPair<WriteVecIMulX, [SLM_FPC_RSV0], 4>;
+defm : SLMWriteResPair<WriteVecIMulY, [SLM_FPC_RSV0], 4>;
+defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
+// FIXME: The below is closer to correct, but caused some perf regressions.
+//defm : SLMWriteResPair<WritePMULLD, [SLM_FPC_RSV0], 11, [11], 7>;
+defm : SLMWriteResPair<WritePMULLD, [SLM_FPC_RSV0], 4>;
+defm : SLMWriteResPair<WritePMULLDY, [SLM_FPC_RSV0], 4>;
+defm : X86WriteResPairUnsupported<WritePMULLDZ>;
+defm : SLMWriteResPair<WriteShuffle, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteShuffleY, [SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteShuffleZ>;
+defm : SLMWriteResPair<WriteShuffleX, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteVarShuffle, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteVarShuffleX, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteVarShuffleY, [SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
+defm : SLMWriteResPair<WriteBlend, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteBlendY, [SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteBlendZ>;
+defm : SLMWriteResPair<WriteMPSAD, [SLM_FPC_RSV0], 7>;
+defm : SLMWriteResPair<WriteMPSADY, [SLM_FPC_RSV0], 7>;
+defm : X86WriteResPairUnsupported<WriteMPSADZ>;
+defm : SLMWriteResPair<WritePSADBW, [SLM_FPC_RSV0], 4>;
+defm : SLMWriteResPair<WritePSADBWX, [SLM_FPC_RSV0], 4>;
+defm : SLMWriteResPair<WritePSADBWY, [SLM_FPC_RSV0], 4>;
+defm : X86WriteResPairUnsupported<WritePSADBWZ>;
+defm : SLMWriteResPair<WritePHMINPOS, [SLM_FPC_RSV0], 4>;
+
+// Vector insert/extract operations.
+defm : SLMWriteResPair<WriteVecInsert, [SLM_FPC_RSV0], 1>;
+
+def : WriteRes<WriteVecExtract, [SLM_FPC_RSV0]>;
+def : WriteRes<WriteVecExtractSt, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 2];
+}
////////////////////////////////////////////////////////////////////////////////
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////
-// HADD, HSUB PS/PD
-
-def : WriteRes<WriteFHAdd, [FPC_RSV01]> {
- let Latency = 3;
- let ResourceCycles = [2];
-}
-
-def : WriteRes<WriteFHAddLd, [FPC_RSV01, MEC_RSV]> {
- let Latency = 6;
- let ResourceCycles = [2, 1];
-}
-
-// PHADD|PHSUB (S) W/D.
-def : WriteRes<WritePHAdd, [FPC_RSV01]> {
- let Latency = 1;
- let ResourceCycles = [1];
-}
-
-def : WriteRes<WritePHAddLd, [FPC_RSV01, MEC_RSV]> {
- let Latency = 4;
- let ResourceCycles = [1, 1];
-}
+defm : SLMWriteResPair<WriteFHAdd, [SLM_FPC_RSV01], 3, [2]>;
+defm : SLMWriteResPair<WriteFHAddY, [SLM_FPC_RSV01], 3, [2]>;
+defm : X86WriteResPairUnsupported<WriteFHAddZ>;
+defm : SLMWriteResPair<WritePHAdd, [SLM_FPC_RSV01], 1>;
+defm : SLMWriteResPair<WritePHAddX, [SLM_FPC_RSV01], 1>;
+defm : SLMWriteResPair<WritePHAddY, [SLM_FPC_RSV01], 1>;
+defm : X86WriteResPairUnsupported<WritePHAddZ>;
// String instructions.
// Packed Compare Implicit Length Strings, Return Mask
-def : WriteRes<WritePCmpIStrM, [FPC_RSV0]> {
+def : WriteRes<WritePCmpIStrM, [SLM_FPC_RSV0]> {
let Latency = 13;
let ResourceCycles = [13];
}
-def : WriteRes<WritePCmpIStrMLd, [FPC_RSV0, MEC_RSV]> {
+def : WriteRes<WritePCmpIStrMLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
let Latency = 13;
let ResourceCycles = [13, 1];
}
// Packed Compare Explicit Length Strings, Return Mask
-def : WriteRes<WritePCmpEStrM, [FPC_RSV0]> {
+def : WriteRes<WritePCmpEStrM, [SLM_FPC_RSV0]> {
let Latency = 17;
let ResourceCycles = [17];
}
-def : WriteRes<WritePCmpEStrMLd, [FPC_RSV0, MEC_RSV]> {
+def : WriteRes<WritePCmpEStrMLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
let Latency = 17;
let ResourceCycles = [17, 1];
}
// Packed Compare Implicit Length Strings, Return Index
-def : WriteRes<WritePCmpIStrI, [FPC_RSV0]> {
+def : WriteRes<WritePCmpIStrI, [SLM_FPC_RSV0]> {
let Latency = 17;
let ResourceCycles = [17];
}
-def : WriteRes<WritePCmpIStrILd, [FPC_RSV0, MEC_RSV]> {
+def : WriteRes<WritePCmpIStrILd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
let Latency = 17;
let ResourceCycles = [17, 1];
}
// Packed Compare Explicit Length Strings, Return Index
-def : WriteRes<WritePCmpEStrI, [FPC_RSV0]> {
+def : WriteRes<WritePCmpEStrI, [SLM_FPC_RSV0]> {
let Latency = 21;
let ResourceCycles = [21];
}
-def : WriteRes<WritePCmpEStrILd, [FPC_RSV0, MEC_RSV]> {
+def : WriteRes<WritePCmpEStrILd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
let Latency = 21;
let ResourceCycles = [21, 1];
}
+// MOVMSK Instructions.
+def : WriteRes<WriteFMOVMSK, [SLM_FPC_RSV1]> { let Latency = 4; }
+def : WriteRes<WriteVecMOVMSK, [SLM_FPC_RSV1]> { let Latency = 4; }
+def : WriteRes<WriteVecMOVMSKY, [SLM_FPC_RSV1]> { let Latency = 4; }
+def : WriteRes<WriteMMXMOVMSK, [SLM_FPC_RSV1]> { let Latency = 4; }
+
// AES Instructions.
-def : WriteRes<WriteAESDecEnc, [FPC_RSV0]> {
+def : WriteRes<WriteAESDecEnc, [SLM_FPC_RSV0]> {
let Latency = 8;
let ResourceCycles = [5];
}
-def : WriteRes<WriteAESDecEncLd, [FPC_RSV0, MEC_RSV]> {
+def : WriteRes<WriteAESDecEncLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
let Latency = 8;
let ResourceCycles = [5, 1];
}
-def : WriteRes<WriteAESIMC, [FPC_RSV0]> {
+def : WriteRes<WriteAESIMC, [SLM_FPC_RSV0]> {
let Latency = 8;
let ResourceCycles = [5];
}
-def : WriteRes<WriteAESIMCLd, [FPC_RSV0, MEC_RSV]> {
+def : WriteRes<WriteAESIMCLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
let Latency = 8;
let ResourceCycles = [5, 1];
}
-def : WriteRes<WriteAESKeyGen, [FPC_RSV0]> {
+def : WriteRes<WriteAESKeyGen, [SLM_FPC_RSV0]> {
let Latency = 8;
let ResourceCycles = [5];
}
-def : WriteRes<WriteAESKeyGenLd, [FPC_RSV0, MEC_RSV]> {
+def : WriteRes<WriteAESKeyGenLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
let Latency = 8;
let ResourceCycles = [5, 1];
}
// Carry-less multiplication instructions.
-def : WriteRes<WriteCLMul, [FPC_RSV0]> {
+def : WriteRes<WriteCLMul, [SLM_FPC_RSV0]> {
let Latency = 10;
let ResourceCycles = [10];
}
-def : WriteRes<WriteCLMulLd, [FPC_RSV0, MEC_RSV]> {
+def : WriteRes<WriteCLMulLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
let Latency = 10;
let ResourceCycles = [10, 1];
}
-
-def : WriteRes<WriteSystem, [FPC_RSV0]> { let Latency = 100; }
-def : WriteRes<WriteMicrocoded, [FPC_RSV0]> { let Latency = 100; }
-def : WriteRes<WriteFence, [MEC_RSV]>;
+def : WriteRes<WriteSystem, [SLM_FPC_RSV0]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [SLM_FPC_RSV0]> { let Latency = 100; }
+def : WriteRes<WriteFence, [SLM_MEC_RSV]>;
def : WriteRes<WriteNop, []>;
// AVX/FMA is not supported on that architecture, but we should define the basic
// scheduling resources anyway.
-def : WriteRes<WriteIMulH, [FPC_RSV0]>;
-defm : SMWriteResPair<WriteVarBlend, FPC_RSV0, 1>;
-defm : SMWriteResPair<WriteFVarBlend, FPC_RSV0, 1>;
-defm : SMWriteResPair<WriteFShuffle256, FPC_RSV0, 1>;
-defm : SMWriteResPair<WriteShuffle256, FPC_RSV0, 1>;
-defm : SMWriteResPair<WriteVarVecShift, FPC_RSV0, 1>;
-defm : SMWriteResPair<WriteFMA, FPC_RSV0, 1>;
+def : WriteRes<WriteIMulH, [SLM_FPC_RSV0]>;
+defm : X86WriteResPairUnsupported<WriteFBlendY>;
+defm : X86WriteResPairUnsupported<WriteFBlendZ>;
+defm : SLMWriteResPair<WriteVarBlend, [SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteVarBlendY>;
+defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
+defm : SLMWriteResPair<WriteFVarBlend, [SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteFVarBlendY>;
+defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
+defm : X86WriteResPairUnsupported<WriteFShuffle256>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;
+defm : X86WriteResPairUnsupported<WriteShuffle256>;
+defm : X86WriteResPairUnsupported<WriteVarShuffle256>;
+defm : SLMWriteResPair<WriteVarVecShift, [SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
+defm : X86WriteResPairUnsupported<WriteFMA>;
+defm : X86WriteResPairUnsupported<WriteFMAX>;
+defm : X86WriteResPairUnsupported<WriteFMAY>;
+defm : X86WriteResPairUnsupported<WriteFMAZ>;
+
+defm : X86WriteResPairUnsupported<WriteCvtPH2PS>;
+defm : X86WriteResPairUnsupported<WriteCvtPH2PSY>;
+defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;
+defm : X86WriteResUnsupported<WriteCvtPS2PH>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHY>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHSt>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHYSt>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
+
} // SchedModel
diff --git a/lib/Target/X86/X86ScheduleZnver1.td b/lib/Target/X86/X86ScheduleZnver1.td
index a4e5327213c2..d28d58580752 100644
--- a/lib/Target/X86/X86ScheduleZnver1.td
+++ b/lib/Target/X86/X86ScheduleZnver1.td
@@ -55,7 +55,6 @@ def ZnFPU2 : ProcResource<1>;
def ZnFPU3 : ProcResource<1>;
// FPU grouping
-def ZnFPU : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU2, ZnFPU3]>;
def ZnFPU013 : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU3]>;
def ZnFPU01 : ProcResGroup<[ZnFPU0, ZnFPU1]>;
def ZnFPU12 : ProcResGroup<[ZnFPU1, ZnFPU2]>;
@@ -91,6 +90,32 @@ def ZnDivider : ProcResource<1>;
// The 4-cycle load-to-use latency is captured here.
def : ReadAdvance<ReadAfterLd, 4>;
+// The Integer PRF for Zen has 168 entries, and it holds the architectural and
+// speculative versions of the 64-bit integer registers.
+// Reference: "Software Optimization Guide for AMD Family 17h Processors"
+def ZnIntegerPRF : RegisterFile<168, [GR64, CCR]>;
+
+// 36-entry (9x4) floating-point scheduler.
+def ZnFPU : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU2, ZnFPU3]> {
+  let BufferSize = 36;
+}
+
+// The Zen FP Retire Queue renames SIMD and FP uOps onto a pool of 160 128-bit
+// registers. Operations on 256-bit data types are cracked into two COPs.
+// Reference: "Software Optimization Guide for AMD Family 17h Processors"
+def ZnFpuPRF: RegisterFile<160, [VR64, VR128, VR256], [1, 1, 2]>;
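+// The per-class costs [1, 1, 2] reflect this: a VR256 value occupies two of
+// the 128-bit physical registers, while VR64 and VR128 values occupy one each.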
+
+// The unit can track up to 192 macro ops in-flight.
+// The retire unit handles in-order commit of up to 8 macro ops per cycle.
+// Reference: "Software Optimization Guide for AMD Family 17h Processors"
+// Note that the retire unit is shared between integer and FP ops.
+// In SMT mode it is 96 entries per thread, but we do not use that conservative
+// value here because there is currently no way to fully model SMT mode,
+// so there is no point in trying.
+def ZnRCU : RetireControlUnit<192, 8>;
+
+// FIXME: there are 72 read buffers and 44 write buffers.
+
// (a folded load is an instruction that loads and does some operation)
// Ex: ADDPD xmm, [mem] -> This instruction has two micro-ops
// Instructions with folded loads are usually micro-fused, so they only appear
@@ -99,30 +124,43 @@ def : ReadAdvance<ReadAfterLd, 4>;
// b. addpd
// This multiclass is for folded loads for integer units.
multiclass ZnWriteResPair<X86FoldableSchedWrite SchedRW,
- ProcResourceKind ExePort,
- int Lat> {
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [], int UOps = 1,
+ int LoadLat = 4, int LoadUOps = 1> {
// Register variant takes 1-cycle on Execution Port.
- def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
// Memory variant also uses a cycle on ZnAGU
- // adds 4 cycles to the latency.
- def : WriteRes<SchedRW.Folded, [ZnAGU, ExePort]> {
- let NumMicroOps = 2;
- let Latency = !add(Lat, 4);
+ // adds LoadLat cycles to the latency (default = 4).
+ def : WriteRes<SchedRW.Folded, !listconcat([ZnAGU], ExePorts)> {
+ let Latency = !add(Lat, LoadLat);
+ let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
+ let NumMicroOps = !add(UOps, LoadUOps);
}
}
// This multiclass is for folded loads for floating point units.
multiclass ZnWriteResFpuPair<X86FoldableSchedWrite SchedRW,
- ProcResourceKind ExePort,
- int Lat> {
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [], int UOps = 1,
+ int LoadLat = 7, int LoadUOps = 0> {
// Register variant takes 1-cycle on Execution Port.
- def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
// Memory variant also uses a cycle on ZnAGU
- // adds 7 cycles to the latency.
- def : WriteRes<SchedRW.Folded, [ZnAGU, ExePort]> {
- let Latency = !add(Lat, 7);
+ // adds LoadLat cycles to the latency (default = 7).
+ def : WriteRes<SchedRW.Folded, !listconcat([ZnAGU], ExePorts)> {
+ let Latency = !add(Lat, LoadLat);
+ let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
+ let NumMicroOps = !add(UOps, LoadUOps);
}
}
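// For example, ZnWriteResFpuPair<WriteFAdd, [ZnFPU0], 3> (defined below)
// expands to a 3-cycle WriteFAdd on ZnFPU0 plus a WriteFAddLd variant that
// also takes a cycle on ZnAGU and has a latency of 3 + 7 = 10.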
@@ -130,103 +168,310 @@ multiclass ZnWriteResFpuPair<X86FoldableSchedWrite SchedRW,
// operation in codegen
def : WriteRes<WriteRMW, [ZnAGU]>;
-def : WriteRes<WriteStore, [ZnAGU]>;
-def : WriteRes<WriteMove, [ZnALU]>;
-def : WriteRes<WriteLoad, [ZnAGU]> { let Latency = 8; }
+def : WriteRes<WriteStore, [ZnAGU]>;
+def : WriteRes<WriteStoreNT, [ZnAGU]>;
+def : WriteRes<WriteMove, [ZnALU]>;
+def : WriteRes<WriteLoad, [ZnAGU]> { let Latency = 8; }
def : WriteRes<WriteZero, []>;
def : WriteRes<WriteLEA, [ZnALU]>;
-defm : ZnWriteResPair<WriteALU, ZnALU, 1>;
-defm : ZnWriteResPair<WriteShift, ZnALU, 1>;
-defm : ZnWriteResPair<WriteJump, ZnALU, 1>;
+defm : ZnWriteResPair<WriteALU, [ZnALU], 1>;
+defm : ZnWriteResPair<WriteADC, [ZnALU], 1>;
+defm : ZnWriteResPair<WriteIMul, [ZnALU1, ZnMultiplier], 4>;
+defm : ZnWriteResPair<WriteIMul64, [ZnALU1, ZnMultiplier], 4, [1,1], 2>;
+
+defm : ZnWriteResPair<WriteBSWAP32,[ZnALU], 1, [4]>;
+defm : ZnWriteResPair<WriteBSWAP64,[ZnALU], 1, [4]>;
+
+defm : ZnWriteResPair<WriteShift, [ZnALU], 1>;
+defm : ZnWriteResPair<WriteShiftDouble, [ZnALU], 1>;
+defm : ZnWriteResPair<WriteJump, [ZnALU], 1>;
+defm : ZnWriteResFpuPair<WriteCRC32, [ZnFPU0], 3>;
+
+defm : ZnWriteResPair<WriteCMOV, [ZnALU], 1>;
+defm : ZnWriteResPair<WriteCMOV2, [ZnALU], 1>;
+def : WriteRes<WriteSETCC, [ZnALU]>;
+def : WriteRes<WriteSETCCStore, [ZnALU, ZnAGU]>;
+defm : X86WriteRes<WriteLAHFSAHF, [ZnALU], 2, [1], 2>;
+
+// Bit counts.
+defm : ZnWriteResPair<WriteBSF, [ZnALU], 3>;
+defm : ZnWriteResPair<WriteBSR, [ZnALU], 3>;
+defm : ZnWriteResPair<WriteLZCNT, [ZnALU], 2>;
+defm : ZnWriteResPair<WriteTZCNT, [ZnALU], 2>;
+defm : ZnWriteResPair<WritePOPCNT, [ZnALU], 1>;
// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;
-// IDIV
-def : WriteRes<WriteIDiv, [ZnALU2, ZnDivider]> {
- let Latency = 41;
- let ResourceCycles = [1, 41];
-}
+// BMI1 BEXTR, BMI2 BZHI
+defm : ZnWriteResPair<WriteBEXTR, [ZnALU], 1>;
+defm : ZnWriteResPair<WriteBZHI, [ZnALU], 1>;
-def : WriteRes<WriteIDivLd, [ZnALU2, ZnAGU, ZnDivider]> {
- let Latency = 45;
- let ResourceCycles = [1, 4, 41];
-}
-
-// IMUL
+// IDIV
+defm : ZnWriteResPair<WriteDiv8, [ZnALU2, ZnDivider], 15, [1,15], 1>;
+defm : ZnWriteResPair<WriteDiv16, [ZnALU2, ZnDivider], 17, [1,17], 2>;
+defm : ZnWriteResPair<WriteDiv32, [ZnALU2, ZnDivider], 25, [1,25], 2>;
+defm : ZnWriteResPair<WriteDiv64, [ZnALU2, ZnDivider], 41, [1,41], 2>;
+defm : ZnWriteResPair<WriteIDiv8, [ZnALU2, ZnDivider], 15, [1,15], 1>;
+defm : ZnWriteResPair<WriteIDiv16, [ZnALU2, ZnDivider], 17, [1,17], 2>;
+defm : ZnWriteResPair<WriteIDiv32, [ZnALU2, ZnDivider], 25, [1,25], 2>;
+defm : ZnWriteResPair<WriteIDiv64, [ZnALU2, ZnDivider], 41, [1,41], 2>;
+
+// IMULH
def : WriteRes<WriteIMulH, [ZnALU1, ZnMultiplier]>{
let Latency = 4;
}
-def : WriteRes<WriteIMul, [ZnALU1, ZnMultiplier]> {
- let Latency = 4;
-}
-
-def : WriteRes<WriteIMulLd,[ZnALU1, ZnMultiplier]> {
- let Latency = 8;
-}
// Floating point operations
-defm : ZnWriteResFpuPair<WriteFHAdd, ZnFPU0, 3>;
-defm : ZnWriteResFpuPair<WriteFAdd, ZnFPU0, 3>;
-defm : ZnWriteResFpuPair<WriteFBlend, ZnFPU01, 1>;
-defm : ZnWriteResFpuPair<WriteFVarBlend, ZnFPU01, 1>;
-defm : ZnWriteResFpuPair<WriteVarBlend, ZnFPU0, 1>;
-defm : ZnWriteResFpuPair<WriteCvtI2F, ZnFPU3, 5>;
-defm : ZnWriteResFpuPair<WriteCvtF2F, ZnFPU3, 5>;
-defm : ZnWriteResFpuPair<WriteCvtF2I, ZnFPU3, 5>;
-defm : ZnWriteResFpuPair<WriteFDiv, ZnFPU3, 15>;
-defm : ZnWriteResFpuPair<WriteFShuffle, ZnFPU12, 1>;
-defm : ZnWriteResFpuPair<WriteFMul, ZnFPU0, 5>;
-defm : ZnWriteResFpuPair<WriteFMA, ZnFPU03, 5>;
-defm : ZnWriteResFpuPair<WriteFRcp, ZnFPU01, 5>;
-defm : ZnWriteResFpuPair<WriteFRsqrt, ZnFPU01, 5>;
-defm : ZnWriteResFpuPair<WriteFSqrt, ZnFPU3, 20>;
+defm : X86WriteRes<WriteFLoad, [ZnAGU], 8, [1], 1>;
+defm : X86WriteRes<WriteFLoadX, [ZnAGU], 8, [1], 1>;
+defm : X86WriteRes<WriteFLoadY, [ZnAGU], 8, [1], 1>;
+defm : X86WriteRes<WriteFMaskedLoad, [ZnAGU,ZnFPU01], 8, [1,1], 1>;
+defm : X86WriteRes<WriteFMaskedLoadY, [ZnAGU,ZnFPU01], 8, [1,2], 2>;
+defm : X86WriteRes<WriteFStore, [ZnAGU], 1, [1], 1>;
+defm : X86WriteRes<WriteFStoreX, [ZnAGU], 1, [1], 1>;
+defm : X86WriteRes<WriteFStoreY, [ZnAGU], 1, [1], 1>;
+defm : X86WriteRes<WriteFStoreNT, [ZnAGU,ZnFPU2], 8, [1,1], 1>;
+defm : X86WriteRes<WriteFStoreNTX, [ZnAGU], 1, [1], 1>;
+defm : X86WriteRes<WriteFStoreNTY, [ZnAGU], 1, [1], 1>;
+defm : X86WriteRes<WriteFMaskedStore, [ZnAGU,ZnFPU01], 4, [1,1], 1>;
+defm : X86WriteRes<WriteFMaskedStoreY, [ZnAGU,ZnFPU01], 5, [1,2], 2>;
+defm : X86WriteRes<WriteFMove, [ZnFPU], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveX, [ZnFPU], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveY, [ZnFPU], 1, [1], 1>;
+
+defm : ZnWriteResFpuPair<WriteFAdd, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFAddX, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFAddY, [ZnFPU0], 3>;
+defm : X86WriteResPairUnsupported<WriteFAddZ>;
+defm : ZnWriteResFpuPair<WriteFAdd64, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFAdd64X, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFAdd64Y, [ZnFPU0], 3>;
+defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
+defm : ZnWriteResFpuPair<WriteFCmp, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFCmpX, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFCmpY, [ZnFPU0], 3>;
+defm : X86WriteResPairUnsupported<WriteFCmpZ>;
+defm : ZnWriteResFpuPair<WriteFCmp64, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFCmp64X, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFCmp64Y, [ZnFPU0], 3>;
+defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
+defm : ZnWriteResFpuPair<WriteFCom, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFBlend, [ZnFPU01], 1>;
+defm : ZnWriteResFpuPair<WriteFBlendY, [ZnFPU01], 1>;
+defm : X86WriteResPairUnsupported<WriteFBlendZ>;
+defm : ZnWriteResFpuPair<WriteFVarBlend, [ZnFPU01], 1>;
+defm : ZnWriteResFpuPair<WriteFVarBlendY,[ZnFPU01], 1>;
+defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
+defm : ZnWriteResFpuPair<WriteVarBlend, [ZnFPU0], 1>;
+defm : ZnWriteResFpuPair<WriteVarBlendY, [ZnFPU0], 1>;
+defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
+defm : ZnWriteResFpuPair<WriteCvtSS2I, [ZnFPU3], 5>;
+defm : ZnWriteResFpuPair<WriteCvtPS2I, [ZnFPU3], 5>;
+defm : ZnWriteResFpuPair<WriteCvtPS2IY, [ZnFPU3], 5>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
+defm : ZnWriteResFpuPair<WriteCvtSD2I, [ZnFPU3], 5>;
+defm : ZnWriteResFpuPair<WriteCvtPD2I, [ZnFPU3], 5>;
+defm : ZnWriteResFpuPair<WriteCvtPD2IY, [ZnFPU3], 5>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
+defm : ZnWriteResFpuPair<WriteCvtI2SS, [ZnFPU3], 5>;
+defm : ZnWriteResFpuPair<WriteCvtI2PS, [ZnFPU3], 5>;
+defm : ZnWriteResFpuPair<WriteCvtI2PSY, [ZnFPU3], 5>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
+defm : ZnWriteResFpuPair<WriteCvtI2SD, [ZnFPU3], 5>;
+defm : ZnWriteResFpuPair<WriteCvtI2PD, [ZnFPU3], 5>;
+defm : ZnWriteResFpuPair<WriteCvtI2PDY, [ZnFPU3], 5>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
+defm : ZnWriteResFpuPair<WriteFDiv, [ZnFPU3], 15>;
+defm : ZnWriteResFpuPair<WriteFDivX, [ZnFPU3], 15>;
+//defm : ZnWriteResFpuPair<WriteFDivY, [ZnFPU3], 15>;
+defm : X86WriteResPairUnsupported<WriteFDivZ>;
+defm : ZnWriteResFpuPair<WriteFDiv64, [ZnFPU3], 15>;
+defm : ZnWriteResFpuPair<WriteFDiv64X, [ZnFPU3], 15>;
+//defm : ZnWriteResFpuPair<WriteFDiv64Y, [ZnFPU3], 15>;
+defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
+defm : ZnWriteResFpuPair<WriteFSign, [ZnFPU3], 2>;
+defm : ZnWriteResFpuPair<WriteFRnd, [ZnFPU3], 4, [1], 1, 7, 1>; // FIXME: Should folds require 1 extra uop?
+defm : ZnWriteResFpuPair<WriteFRndY, [ZnFPU3], 4, [1], 1, 7, 1>; // FIXME: Should folds require 1 extra uop?
+defm : X86WriteResPairUnsupported<WriteFRndZ>;
+defm : ZnWriteResFpuPair<WriteFLogic, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteFLogicY, [ZnFPU], 1>;
+defm : X86WriteResPairUnsupported<WriteFLogicZ>;
+defm : ZnWriteResFpuPair<WriteFTest, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteFTestY, [ZnFPU], 1>;
+defm : X86WriteResPairUnsupported<WriteFTestZ>;
+defm : ZnWriteResFpuPair<WriteFShuffle, [ZnFPU12], 1>;
+defm : ZnWriteResFpuPair<WriteFShuffleY, [ZnFPU12], 1>;
+defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
+defm : ZnWriteResFpuPair<WriteFVarShuffle, [ZnFPU12], 1>;
+defm : ZnWriteResFpuPair<WriteFVarShuffleY,[ZnFPU12], 1>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
+defm : ZnWriteResFpuPair<WriteFMul, [ZnFPU01], 3, [1], 1, 7, 1>;
+defm : ZnWriteResFpuPair<WriteFMulX, [ZnFPU01], 3, [1], 1, 7, 1>;
+defm : ZnWriteResFpuPair<WriteFMulY, [ZnFPU01], 4, [1], 1, 7, 1>;
+defm : X86WriteResPairUnsupported<WriteFMulZ>;
+defm : ZnWriteResFpuPair<WriteFMul64, [ZnFPU01], 3, [1], 1, 7, 1>;
+defm : ZnWriteResFpuPair<WriteFMul64X, [ZnFPU01], 3, [1], 1, 7, 1>;
+defm : ZnWriteResFpuPair<WriteFMul64Y, [ZnFPU01], 4, [1], 1, 7, 1>;
+defm : X86WriteResPairUnsupported<WriteFMul64Z>;
+defm : ZnWriteResFpuPair<WriteFMA, [ZnFPU03], 5>;
+defm : ZnWriteResFpuPair<WriteFMAX, [ZnFPU03], 5>;
+defm : ZnWriteResFpuPair<WriteFMAY, [ZnFPU03], 5>;
+defm : X86WriteResPairUnsupported<WriteFMAZ>;
+defm : ZnWriteResFpuPair<WriteFRcp, [ZnFPU01], 5>;
+defm : ZnWriteResFpuPair<WriteFRcpX, [ZnFPU01], 5>;
+defm : ZnWriteResFpuPair<WriteFRcpY, [ZnFPU01], 5, [1], 1, 7, 2>;
+defm : X86WriteResPairUnsupported<WriteFRcpZ>;
+//defm : ZnWriteResFpuPair<WriteFRsqrt, [ZnFPU02], 5>;
+defm : ZnWriteResFpuPair<WriteFRsqrtX, [ZnFPU01], 5, [1], 1, 7, 1>;
+//defm : ZnWriteResFpuPair<WriteFRsqrtY, [ZnFPU01], 5, [2], 2>;
+defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
+defm : ZnWriteResFpuPair<WriteFSqrt, [ZnFPU3], 20, [20]>;
+defm : ZnWriteResFpuPair<WriteFSqrtX, [ZnFPU3], 20, [20]>;
+defm : ZnWriteResFpuPair<WriteFSqrtY, [ZnFPU3], 28, [28], 1, 7, 1>;
+defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
+defm : ZnWriteResFpuPair<WriteFSqrt64, [ZnFPU3], 20, [20]>;
+defm : ZnWriteResFpuPair<WriteFSqrt64X, [ZnFPU3], 20, [20]>;
+defm : ZnWriteResFpuPair<WriteFSqrt64Y, [ZnFPU3], 40, [40], 1, 7, 1>;
+defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
+defm : ZnWriteResFpuPair<WriteFSqrt80, [ZnFPU3], 20, [20]>;
// Vector integer operations which uses FPU units
-defm : ZnWriteResFpuPair<WriteVecShift, ZnFPU, 1>;
-defm : ZnWriteResFpuPair<WriteVecLogic, ZnFPU, 1>;
-defm : ZnWriteResFpuPair<WritePHAdd, ZnFPU, 1>;
-defm : ZnWriteResFpuPair<WriteVecALU, ZnFPU, 1>;
-defm : ZnWriteResFpuPair<WriteVecIMul, ZnFPU0, 4>;
-defm : ZnWriteResFpuPair<WriteShuffle, ZnFPU, 1>;
-defm : ZnWriteResFpuPair<WriteBlend, ZnFPU01, 1>;
-defm : ZnWriteResFpuPair<WriteShuffle256, ZnFPU, 2>;
+defm : X86WriteRes<WriteVecLoad, [ZnAGU], 8, [1], 1>;
+defm : X86WriteRes<WriteVecLoadX, [ZnAGU], 8, [1], 1>;
+defm : X86WriteRes<WriteVecLoadY, [ZnAGU], 8, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNT, [ZnAGU], 8, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNTY, [ZnAGU], 8, [1], 1>;
+defm : X86WriteRes<WriteVecMaskedLoad, [ZnAGU,ZnFPU01], 8, [1,2], 2>;
+defm : X86WriteRes<WriteVecMaskedLoadY, [ZnAGU,ZnFPU01], 9, [1,3], 2>;
+defm : X86WriteRes<WriteVecStore, [ZnAGU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecStoreX, [ZnAGU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecStoreY, [ZnAGU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecStoreNT, [ZnAGU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecStoreNTY, [ZnAGU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMaskedStore, [ZnAGU,ZnFPU01], 4, [1,1], 1>;
+defm : X86WriteRes<WriteVecMaskedStoreY, [ZnAGU,ZnFPU01], 5, [1,2], 2>;
+defm : X86WriteRes<WriteVecMove, [ZnFPU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveX, [ZnFPU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveY, [ZnFPU], 2, [1], 2>;
+defm : X86WriteRes<WriteVecMoveToGpr, [ZnFPU2], 2, [1], 1>;
+defm : X86WriteRes<WriteVecMoveFromGpr, [ZnFPU2], 3, [1], 1>;
+defm : X86WriteRes<WriteEMMS, [ZnFPU], 2, [1], 1>;
+
+defm : ZnWriteResFpuPair<WriteVecShift, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVecShiftX, [ZnFPU2], 1>;
+defm : ZnWriteResFpuPair<WriteVecShiftY, [ZnFPU2], 2>;
+defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
+defm : ZnWriteResFpuPair<WriteVecShiftImm, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVecShiftImmX, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVecShiftImmY, [ZnFPU], 1>;
+defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
+defm : ZnWriteResFpuPair<WriteVecLogic, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVecLogicX, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVecLogicY, [ZnFPU], 1>;
+defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
+defm : ZnWriteResFpuPair<WriteVecTest, [ZnFPU12], 1, [2], 1, 7, 1>;
+defm : ZnWriteResFpuPair<WriteVecTestY, [ZnFPU12], 1, [2], 1, 7, 1>;
+defm : X86WriteResPairUnsupported<WriteVecTestZ>;
+defm : ZnWriteResFpuPair<WriteVecALU, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVecALUX, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVecALUY, [ZnFPU], 1>;
+defm : X86WriteResPairUnsupported<WriteVecALUZ>;
+defm : ZnWriteResFpuPair<WriteVecIMul, [ZnFPU0], 4>;
+defm : ZnWriteResFpuPair<WriteVecIMulX, [ZnFPU0], 4>;
+defm : ZnWriteResFpuPair<WriteVecIMulY, [ZnFPU0], 4>;
+defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
+defm : ZnWriteResFpuPair<WritePMULLD, [ZnFPU0], 4, [1], 1, 7, 1>; // FIXME
+defm : ZnWriteResFpuPair<WritePMULLDY, [ZnFPU0], 5, [2], 1, 7, 1>; // FIXME
+defm : X86WriteResPairUnsupported<WritePMULLDZ>;
+defm : ZnWriteResFpuPair<WriteShuffle, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteShuffleX, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteShuffleY, [ZnFPU], 1>;
+defm : X86WriteResPairUnsupported<WriteShuffleZ>;
+defm : ZnWriteResFpuPair<WriteVarShuffle, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVarShuffleX,[ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVarShuffleY,[ZnFPU], 1>;
+defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
+defm : ZnWriteResFpuPair<WriteBlend, [ZnFPU01], 1>;
+defm : ZnWriteResFpuPair<WriteBlendY, [ZnFPU01], 1>;
+defm : X86WriteResPairUnsupported<WriteBlendZ>;
+defm : ZnWriteResFpuPair<WriteShuffle256, [ZnFPU], 2>;
+defm : ZnWriteResFpuPair<WriteVarShuffle256, [ZnFPU], 2>;
+defm : ZnWriteResFpuPair<WritePSADBW, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WritePSADBWX, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WritePSADBWY, [ZnFPU0], 3>;
+defm : X86WriteResPairUnsupported<WritePSADBWZ>;
+defm : ZnWriteResFpuPair<WritePHMINPOS, [ZnFPU0], 4>;
// Vector Shift Operations
-defm : ZnWriteResFpuPair<WriteVarVecShift, ZnFPU12, 1>;
+defm : ZnWriteResFpuPair<WriteVarVecShift, [ZnFPU12], 1>;
+defm : ZnWriteResFpuPair<WriteVarVecShiftY, [ZnFPU12], 1>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
+
+// Vector insert/extract operations.
+defm : ZnWriteResFpuPair<WriteVecInsert, [ZnFPU], 1>;
+
+def : WriteRes<WriteVecExtract, [ZnFPU12, ZnFPU2]> {
+ let Latency = 2;
+ let ResourceCycles = [1, 2];
+}
+def : WriteRes<WriteVecExtractSt, [ZnAGU, ZnFPU12, ZnFPU2]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 2, 3];
+}
+
+// MOVMSK Instructions.
+def : WriteRes<WriteFMOVMSK, [ZnFPU2]>;
+def : WriteRes<WriteMMXMOVMSK, [ZnFPU2]>;
+def : WriteRes<WriteVecMOVMSK, [ZnFPU2]>;
+
+def : WriteRes<WriteVecMOVMSKY, [ZnFPU2]> {
+ let NumMicroOps = 2;
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
// AES Instructions.
-defm : ZnWriteResFpuPair<WriteAESDecEnc, ZnFPU01, 4>;
-defm : ZnWriteResFpuPair<WriteAESIMC, ZnFPU01, 4>;
-defm : ZnWriteResFpuPair<WriteAESKeyGen, ZnFPU01, 4>;
+defm : ZnWriteResFpuPair<WriteAESDecEnc, [ZnFPU01], 4>;
+defm : ZnWriteResFpuPair<WriteAESIMC, [ZnFPU01], 4>;
+defm : ZnWriteResFpuPair<WriteAESKeyGen, [ZnFPU01], 4>;
def : WriteRes<WriteFence, [ZnAGU]>;
def : WriteRes<WriteNop, []>;
// Following instructions with latency=100 are microcoded.
// We set long latency so as to block the entire pipeline.
-defm : ZnWriteResFpuPair<WriteFShuffle256, ZnFPU, 100>;
-
-//Microcoded Instructions
-let Latency = 100 in {
- def : WriteRes<WriteMicrocoded, []>;
- def : WriteRes<WriteSystem, []>;
- def : WriteRes<WriteMPSAD, []>;
- def : WriteRes<WriteMPSADLd, []>;
- def : WriteRes<WriteCLMul, []>;
- def : WriteRes<WriteCLMulLd, []>;
- def : WriteRes<WritePCmpIStrM, []>;
- def : WriteRes<WritePCmpIStrMLd, []>;
- def : WriteRes<WritePCmpEStrI, []>;
- def : WriteRes<WritePCmpEStrILd, []>;
- def : WriteRes<WritePCmpEStrM, []>;
- def : WriteRes<WritePCmpEStrMLd, []>;
- def : WriteRes<WritePCmpIStrI, []>;
- def : WriteRes<WritePCmpIStrILd, []>;
- }
-
-//=== Regex based itineraries ===//
+defm : ZnWriteResFpuPair<WriteFShuffle256, [ZnFPU], 100>;
+defm : ZnWriteResFpuPair<WriteFVarShuffle256, [ZnFPU], 100>;
+
+// Microcoded Instructions
+def ZnWriteMicrocoded : SchedWriteRes<[]> {
+ let Latency = 100;
+}
+
+def : SchedAlias<WriteMicrocoded, ZnWriteMicrocoded>;
+def : SchedAlias<WriteFCMOV, ZnWriteMicrocoded>;
+def : SchedAlias<WriteSystem, ZnWriteMicrocoded>;
+def : SchedAlias<WriteMPSAD, ZnWriteMicrocoded>;
+def : SchedAlias<WriteMPSADY, ZnWriteMicrocoded>;
+def : SchedAlias<WriteMPSADLd, ZnWriteMicrocoded>;
+def : SchedAlias<WriteMPSADYLd, ZnWriteMicrocoded>;
+def : SchedAlias<WriteCLMul, ZnWriteMicrocoded>;
+def : SchedAlias<WriteCLMulLd, ZnWriteMicrocoded>;
+def : SchedAlias<WritePCmpIStrM, ZnWriteMicrocoded>;
+def : SchedAlias<WritePCmpIStrMLd, ZnWriteMicrocoded>;
+def : SchedAlias<WritePCmpEStrI, ZnWriteMicrocoded>;
+def : SchedAlias<WritePCmpEStrILd, ZnWriteMicrocoded>;
+def : SchedAlias<WritePCmpEStrM, ZnWriteMicrocoded>;
+def : SchedAlias<WritePCmpEStrMLd, ZnWriteMicrocoded>;
+def : SchedAlias<WritePCmpIStrI, ZnWriteMicrocoded>;
+def : SchedAlias<WritePCmpIStrILd, ZnWriteMicrocoded>;
+def : SchedAlias<WriteLDMXCSR, ZnWriteMicrocoded>;
+def : SchedAlias<WriteSTMXCSR, ZnWriteMicrocoded>;
+
+//=== Regex based InstRW ===//
// Notation:
// - r: register.
// - m: memory.
@@ -247,14 +492,6 @@ def : InstRW<[WriteALULd, ReadAfterLd], (instregex "MOV16rm")>;
// r,m.
def : InstRW<[WriteLoad], (instregex "MOV(S|Z)X32rm(8|16)")>;
-// CMOVcc.
-// r,r.
-def : InstRW<[WriteALU],
- (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rr")>;
-// r,m.
-def : InstRW<[WriteALULd, ReadAfterLd],
- (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rm")>;
-
// XCHG.
// r,r.
def ZnWriteXCHG : SchedWriteRes<[ZnALU]> {
@@ -271,7 +508,7 @@ def ZnWriteXCHGrm : SchedWriteRes<[ZnAGU, ZnALU]> {
}
def : InstRW<[ZnWriteXCHGrm, ReadAfterLd], (instregex "XCHG(8|16|32|64)rm")>;
-def : InstRW<[WriteMicrocoded], (instregex "XLAT")>;
+def : InstRW<[WriteMicrocoded], (instrs XLAT)>;
// POP16.
// r.
@@ -302,20 +539,7 @@ def ZnWritePushA : SchedWriteRes<[ZnAGU]> {
def : InstRW<[ZnWritePushA], (instregex "PUSHA(16|32)")>;
//LAHF
-def : InstRW<[WriteMicrocoded], (instregex "LAHF")>;
-
-// SAHF.
-def ZnWriteSAHF : SchedWriteRes<[ZnALU]> {
- let Latency = 2;
- let NumMicroOps = 2;
-}
-def : InstRW<[ZnWriteSAHF], (instregex "SAHF")>;
-
-// BSWAP.
-def ZnWriteBSwap : SchedWriteRes<[ZnALU]> {
- let ResourceCycles = [4];
-}
-def : InstRW<[ZnWriteBSwap], (instregex "BSWAP")>;
+def : InstRW<[WriteMicrocoded], (instrs LAHF)>;
// MOVBE.
// r,m.
@@ -336,16 +560,6 @@ def : InstRW<[WriteALULd], (instregex "(ADD|SUB)(8|16|32|64)m(r|i)",
"(ADD|SUB)64mi32")>;
// ADC SBB.
-// r,r/i.
-def : InstRW<[WriteALU], (instregex "(ADC|SBB)(8|16|32|64)r(r|i)",
- "(ADC|SBB)(16|32|64)ri8",
- "(ADC|SBB)64ri32",
- "(ADC|SBB)(8|16|32|64)rr_REV")>;
-
-// r,m.
-def : InstRW<[WriteALULd, ReadAfterLd],
- (instregex "(ADC|SBB)(8|16|32|64)rm")>;
-
// m,r/i.
def : InstRW<[WriteALULd],
(instregex "(ADC|SBB)(8|16|32|64)m(r|i)",
@@ -355,59 +569,52 @@ def : InstRW<[WriteALULd],
// INC DEC NOT NEG.
// m.
def : InstRW<[WriteALULd],
- (instregex "(INC|DEC|NOT|NEG)(8|16|32|64)m",
- "(INC|DEC)64(16|32)m")>;
+ (instregex "(INC|DEC|NOT|NEG)(8|16|32|64)m")>;
// MUL IMUL.
// r16.
def ZnWriteMul16 : SchedWriteRes<[ZnALU1, ZnMultiplier]> {
let Latency = 3;
}
-def : InstRW<[ZnWriteMul16], (instregex "IMUL16r", "MUL16r")>;
+def : InstRW<[ZnWriteMul16], (instrs IMUL16r, MUL16r)>;
+def : InstRW<[ZnWriteMul16], (instrs IMUL16rr, IMUL16rri, IMUL16rri8)>; // TODO: is this right?
+def : InstRW<[ZnWriteMul16], (instrs IMUL16rm, IMUL16rmi, IMUL16rmi8)>; // TODO: this is definitely wrong but matches what the instregex did.
// m16.
def ZnWriteMul16Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> {
let Latency = 8;
}
-def : InstRW<[ZnWriteMul16Ld, ReadAfterLd], (instregex "IMUL16m", "MUL16m")>;
+def : InstRW<[ZnWriteMul16Ld, ReadAfterLd], (instrs IMUL16m, MUL16m)>;
// r32.
def ZnWriteMul32 : SchedWriteRes<[ZnALU1, ZnMultiplier]> {
let Latency = 3;
}
-def : InstRW<[ZnWriteMul32], (instregex "IMUL32r", "MUL32r")>;
+def : InstRW<[ZnWriteMul32], (instrs IMUL32r, MUL32r)>;
+def : InstRW<[ZnWriteMul32], (instrs IMUL32rr, IMUL32rri, IMUL32rri8)>; // TODO: is this right?
+def : InstRW<[ZnWriteMul32], (instrs IMUL32rm, IMUL32rmi, IMUL32rmi8)>; // TODO: this is definitely wrong but matches what the instregex did.
// m32.
def ZnWriteMul32Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> {
let Latency = 8;
}
-def : InstRW<[ZnWriteMul32Ld, ReadAfterLd], (instregex "IMUL32m", "MUL32m")>;
+def : InstRW<[ZnWriteMul32Ld, ReadAfterLd], (instrs IMUL32m, MUL32m)>;
// r64.
def ZnWriteMul64 : SchedWriteRes<[ZnALU1, ZnMultiplier]> {
let Latency = 4;
let NumMicroOps = 2;
}
-def : InstRW<[ZnWriteMul64], (instregex "IMUL64r", "MUL64r")>;
+def : InstRW<[ZnWriteMul64], (instrs IMUL64r, MUL64r)>;
+def : InstRW<[ZnWriteMul64], (instrs IMUL64rr, IMUL64rri8, IMUL64rri32)>; // TODO: is this right?
+def : InstRW<[ZnWriteMul64], (instrs IMUL64rm, IMUL64rmi32, IMUL64rmi8)>; // TODO: this is definitely wrong but matches what the instregex did.
// m64.
def ZnWriteMul64Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> {
let Latency = 9;
let NumMicroOps = 2;
}
-def : InstRW<[ZnWriteMul64Ld, ReadAfterLd], (instregex "IMUL64m", "MUL64m")>;
-
-// r16,r16.
-def ZnWriteMul16rri : SchedWriteRes<[ZnALU1, ZnMultiplier]> {
- let Latency = 3;
-}
-def : InstRW<[ZnWriteMul16rri], (instregex "IMUL16rri", "IMUL16rri8")>;
-
-// r16,m16.
-def ZnWriteMul16rmi : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> {
- let Latency = 8;
-}
-def : InstRW<[ZnWriteMul16rmi, ReadAfterLd], (instregex "IMUL16rmi", "IMUL16rmi8")>;
+def : InstRW<[ZnWriteMul64Ld, ReadAfterLd], (instrs IMUL64m, MUL64m)>;
// MULX.
// r32,r32,r32.
@@ -415,72 +622,43 @@ def ZnWriteMulX32 : SchedWriteRes<[ZnALU1, ZnMultiplier]> {
let Latency = 3;
let ResourceCycles = [1, 2];
}
-def : InstRW<[ZnWriteMulX32], (instregex "MULX32rr")>;
+def : InstRW<[ZnWriteMulX32], (instrs MULX32rr)>;
// r32,r32,m32.
def ZnWriteMulX32Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> {
let Latency = 8;
let ResourceCycles = [1, 2, 2];
}
-def : InstRW<[ZnWriteMulX32Ld, ReadAfterLd], (instregex "MULX32rm")>;
+def : InstRW<[ZnWriteMulX32Ld, ReadAfterLd], (instrs MULX32rm)>;
// r64,r64,r64.
def ZnWriteMulX64 : SchedWriteRes<[ZnALU1]> {
let Latency = 3;
}
-def : InstRW<[ZnWriteMulX64], (instregex "MULX64rr")>;
+def : InstRW<[ZnWriteMulX64], (instrs MULX64rr)>;
// r64,r64,m64.
def ZnWriteMulX64Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> {
let Latency = 8;
}
-def : InstRW<[ZnWriteMulX64Ld, ReadAfterLd], (instregex "MULX64rm")>;
-
-// DIV, IDIV.
-// r8.
-def ZnWriteDiv8 : SchedWriteRes<[ZnALU2, ZnDivider]> {
- let Latency = 15;
-}
-def : InstRW<[ZnWriteDiv8], (instregex "DIV8r", "IDIV8r")>;
-
-// r16.
-def ZnWriteDiv16 : SchedWriteRes<[ZnALU2, ZnDivider]> {
- let Latency = 17;
- let NumMicroOps = 2;
-}
-def : InstRW<[ZnWriteDiv16], (instregex "DIV16r", "IDIV16r")>;
-
-// r32.
-def ZnWriteDiv32 : SchedWriteRes<[ZnALU2, ZnDivider]> {
- let Latency = 25;
- let NumMicroOps = 2;
-}
-def : InstRW<[ZnWriteDiv32], (instregex "DIV32r", "IDIV32r")>;
-
-// r64.
-def ZnWriteDiv64 : SchedWriteRes<[ZnALU2, ZnDivider]> {
- let Latency = 41;
- let NumMicroOps = 2;
-}
-def : InstRW<[ZnWriteDiv64], (instregex "DIV64r", "IDIV64r")>;
+def : InstRW<[ZnWriteMulX64Ld, ReadAfterLd], (instrs MULX64rm)>;
//-- Control transfer instructions --//
// J(E|R)CXZ.
def ZnWriteJCXZ : SchedWriteRes<[ZnALU03]>;
-def : InstRW<[ZnWriteJCXZ], (instregex "JCXZ", "JECXZ_(32|64)", "JRCXZ")>;
+def : InstRW<[ZnWriteJCXZ], (instrs JCXZ, JECXZ, JRCXZ)>;
// INTO
-def : InstRW<[WriteMicrocoded], (instregex "INTO")>;
+def : InstRW<[WriteMicrocoded], (instrs INTO)>;
// LOOP.
def ZnWriteLOOP : SchedWriteRes<[ZnALU03]>;
-def : InstRW<[ZnWriteLOOP], (instregex "LOOP")>;
+def : InstRW<[ZnWriteLOOP], (instrs LOOP)>;
// LOOP(N)E, LOOP(N)Z
def ZnWriteLOOPE : SchedWriteRes<[ZnALU03]>;
-def : InstRW<[ZnWriteLOOPE], (instregex "LOOPE", "LOOPNE",
- "LOOPZ", "LOOPNZ")>;
+def : InstRW<[ZnWriteLOOPE], (instrs LOOPE, LOOPNE)>;
// CALL.
// r.
@@ -494,7 +672,7 @@ def ZnWriteRET : SchedWriteRes<[ZnALU03]> {
let NumMicroOps = 2;
}
def : InstRW<[ZnWriteRET], (instregex "RET(L|Q|W)", "LRET(L|Q|W)",
- "IRET(D|Q)", "RETF")>;
+ "IRET(16|32|64)")>;
//-- Logic instructions --//
@@ -504,12 +682,6 @@ def : InstRW<[WriteALULd],
(instregex "(AND|OR|XOR)(8|16|32|64)m(r|i)",
"(AND|OR|XOR)(8|16|32|64)mi8", "(AND|OR|XOR)64mi32")>;
-// ANDN.
-// r,r.
-def : InstRW<[WriteALU], (instregex "ANDN(32|64)rr")>;
-// r,m.
-def : InstRW<[WriteALULd, ReadAfterLd], (instregex "ANDN(32|64)rm")>;
-
// Define ALU latency variants
def ZnWriteALULat2 : SchedWriteRes<[ZnALU]> {
let Latency = 2;
@@ -518,24 +690,8 @@ def ZnWriteALULat2Ld : SchedWriteRes<[ZnAGU, ZnALU]> {
let Latency = 6;
}
-def ZnWriteALULat3 : SchedWriteRes<[ZnALU]> {
- let Latency = 3;
-}
-def ZnWriteALULat3Ld : SchedWriteRes<[ZnAGU, ZnALU]> {
- let Latency = 7;
-}
-
-// BSF BSR.
-// r,r.
-def : InstRW<[ZnWriteALULat3], (instregex "BS(R|F)(16|32|64)rr")>;
-// r,m.
-def : InstRW<[ZnWriteALULat3Ld, ReadAfterLd], (instregex "BS(R|F)(16|32|64)rm")>;
-
// BT.
-// r,r/i.
-def : InstRW<[WriteShift], (instregex "BT(16|32|64)r(r|i8)")>;
-
-def : InstRW<[WriteShiftLd], (instregex "BT(16|32|64)mr")>;
+// m,i.
def : InstRW<[WriteShiftLd], (instregex "BT(16|32|64)mi8")>;
// BTR BTS BTC.
@@ -546,7 +702,6 @@ def ZnWriteBTRSC : SchedWriteRes<[ZnALU]> {
}
def : InstRW<[ZnWriteBTRSC], (instregex "BT(R|S|C)(16|32|64)r(r|i8)")>;
-
// m,r,i.
def ZnWriteBTRSCm : SchedWriteRes<[ZnAGU, ZnALU]> {
let Latency = 6;
@@ -559,79 +714,35 @@ def : InstRW<[ZnWriteBTRSCm], (instregex "BT(R|S|C)(16|32|64)m(r|i8)")>;
// r,r.
def : InstRW<[ZnWriteALULat2], (instregex "BLS(I|MSK|R)(32|64)rr")>;
// r,m.
-def : InstRW<[ZnWriteALULat2Ld, ReadAfterLd], (instregex "BLS(I|MSK|R)(32|64)rm")>;
-
-// BEXTR.
-// r,r,r.
-def : InstRW<[WriteALU], (instregex "BEXTR(32|64)rr")>;
-// r,m,r.
-def : InstRW<[WriteALULd, ReadAfterLd], (instregex "BEXTR(32|64)rm")>;
-
-// BZHI.
-// r,r,r.
-def : InstRW<[WriteALU], (instregex "BZHI(32|64)rr")>;
-// r,m,r.
-def : InstRW<[WriteALULd, ReadAfterLd], (instregex "BZHI(32|64)rm")>;
+def : InstRW<[ZnWriteALULat2Ld], (instregex "BLS(I|MSK|R)(32|64)rm")>;
// CLD STD.
-def : InstRW<[WriteALU], (instregex "STD", "CLD")>;
+def : InstRW<[WriteALU], (instrs STD, CLD)>;
// PDEP PEXT.
// r,r,r.
def : InstRW<[WriteMicrocoded], (instregex "PDEP(32|64)rr", "PEXT(32|64)rr")>;
-// r,m,r.
+// r,r,m.
def : InstRW<[WriteMicrocoded], (instregex "PDEP(32|64)rm", "PEXT(32|64)rm")>;
-// ROR ROL.
-def : InstRW<[WriteShift], (instregex "RO(R|L)(8|16|32|64)r1")>;
-
// RCR RCL.
-// r,1.
-def : InstRW<[WriteShift], (instregex "RC(R|L)(8|16|32|64)r1")>;
-
-// m,1.
-def : InstRW<[WriteMicrocoded], (instregex "RC(R|L)(8|16|32|64)m1")>;
-
-// i.
-def : InstRW<[WriteShift], (instregex "RC(R|L)(8|16|32|64)r(i|CL)")>;
-
// m,i.
-def : InstRW<[WriteMicrocoded], (instregex "RC(R|L)(8|16|32|64)m(i|CL)")>;
+def : InstRW<[WriteMicrocoded], (instregex "RC(R|L)(8|16|32|64)m(1|i|CL)")>;
// SHR SHL SAR.
// m,i.
def : InstRW<[WriteShiftLd], (instregex "S(A|H)(R|L)(8|16|32|64)m(i|1)")>;
// SHRD SHLD.
-// r,r
-def : InstRW<[WriteShift], (instregex "SH(R|L)D(16|32|64)rri8")>;
-
// m,r
def : InstRW<[WriteShiftLd], (instregex "SH(R|L)D(16|32|64)mri8")>;
// r,r,cl.
-def : InstRW<[WriteMicrocoded], (instregex "SHLD(16|32|64)rrCL")>;
-
-// r,r,cl.
-def : InstRW<[WriteMicrocoded], (instregex "SHRD(16|32|64)rrCL")>;
+def : InstRW<[WriteMicrocoded], (instregex "SH(R|L)D(16|32|64)rrCL")>;
// m,r,cl.
def : InstRW<[WriteMicrocoded], (instregex "SH(R|L)D(16|32|64)mrCL")>;
-// SETcc.
-// r.
-def : InstRW<[WriteShift],
- (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)r")>;
-// m.
-def : InstRW<[WriteShift],
- (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)m")>;
-
-// LZCNT TZCNT.
-// r,r.
-def : InstRW<[ZnWriteALULat2], (instregex "(LZCNT|TZCNT)(16|32|64)rr")>;
-// r,m.
-def : InstRW<[ZnWriteALULat2Ld, ReadAfterLd], (instregex "(LZCNT|TZCNT)(16|32|64)rm")>;
-
//-- Misc instructions --//
// CMPXCHG.
def ZnWriteCMPXCHG : SchedWriteRes<[ZnAGU, ZnALU]> {
@@ -644,9 +755,9 @@ def : InstRW<[ZnWriteCMPXCHG], (instregex "CMPXCHG(8|16|32|64)rm")>;
def ZnWriteCMPXCHG8B : SchedWriteRes<[ZnAGU, ZnALU]> {
let NumMicroOps = 18;
}
-def : InstRW<[ZnWriteCMPXCHG8B], (instregex "CMPXCHG8B")>;
+def : InstRW<[ZnWriteCMPXCHG8B], (instrs CMPXCHG8B)>;
-def : InstRW<[WriteMicrocoded], (instregex "CMPXCHG16B")>;
+def : InstRW<[WriteMicrocoded], (instrs CMPXCHG16B)>;
// LEAVE
def ZnWriteLEAVE : SchedWriteRes<[ZnALU, ZnAGU]> {
@@ -656,13 +767,13 @@ def ZnWriteLEAVE : SchedWriteRes<[ZnALU, ZnAGU]> {
def : InstRW<[ZnWriteLEAVE], (instregex "LEAVE")>;
// PAUSE.
-def : InstRW<[WriteMicrocoded], (instregex "PAUSE")>;
+def : InstRW<[WriteMicrocoded], (instrs PAUSE)>;
// RDTSC.
def : InstRW<[WriteMicrocoded], (instregex "RDTSC")>;
// RDPMC.
-def : InstRW<[WriteMicrocoded], (instregex "RDPMC")>;
+def : InstRW<[WriteMicrocoded], (instrs RDPMC)>;
// RDRAND.
def : InstRW<[WriteMicrocoded], (instregex "RDRAND(16|32|64)r")>;
@@ -732,7 +843,7 @@ def : InstRW<[WriteMicrocoded], (instregex "FBSTPm")>;
def ZnWriteFXCH : SchedWriteRes<[ZnFPU]>;
// FXCHG.
-def : InstRW<[ZnWriteFXCH], (instregex "XCH_F")>;
+def : InstRW<[ZnWriteFXCH], (instrs XCH_F)>;
// FILD.
def ZnWriteFILD : SchedWriteRes<[ZnAGU, ZnFPU3]> {
@@ -756,31 +867,29 @@ def ZnWriteFPU3 : SchedWriteRes<[ZnAGU, ZnFPU3]> {
}
// FLDZ.
-def : InstRW<[ZnWriteFPU13], (instregex "LD_F0")>;
+def : SchedAlias<WriteFLD0, ZnWriteFPU13>;
// FLD1.
-def : InstRW<[ZnWriteFPU3], (instregex "LD_F1")>;
+def : SchedAlias<WriteFLD1, ZnWriteFPU3>;
// FLDPI FLDL2E etc.
-def : InstRW<[ZnWriteFPU3], (instregex "FLDPI", "FLDL2(T|E)" "FLDL(G|N)2")>;
-
-def : InstRW<[WriteMicrocoded], (instregex "CMOV(B|BE|E|P|NB|NBE|NE|NP)_F")>;
+def : SchedAlias<WriteFLDC, ZnWriteFPU3>;
// FNSTSW.
// AX.
-def : InstRW<[WriteMicrocoded], (instregex "FNSTSW16r")>;
+def : InstRW<[WriteMicrocoded], (instrs FNSTSW16r)>;
// m16.
-def : InstRW<[WriteMicrocoded], (instregex "FNSTSWm")>;
+def : InstRW<[WriteMicrocoded], (instrs FNSTSWm)>;
// FLDCW.
-def : InstRW<[WriteMicrocoded], (instregex "FLDCW16m")>;
+def : InstRW<[WriteMicrocoded], (instrs FLDCW16m)>;
// FNSTCW.
-def : InstRW<[WriteMicrocoded], (instregex "FNSTCW16m")>;
+def : InstRW<[WriteMicrocoded], (instrs FNSTCW16m)>;
// FINCSTP FDECSTP.
-def : InstRW<[ZnWriteFPU3], (instregex "FINCSTP", "FDECSTP")>;
+def : InstRW<[ZnWriteFPU3], (instrs FINCSTP, FDECSTP)>;
// FFREE.
def : InstRW<[ZnWriteFPU3], (instregex "FFREE")>;
@@ -793,14 +902,6 @@ def : InstRW<[WriteMicrocoded], (instregex "FRSTORm")>;
//-- Arithmetic instructions --//
-def ZnWriteFPU3Lat2 : SchedWriteRes<[ZnFPU3]> {
- let Latency = 2;
-}
-
-def ZnWriteFPU3Lat2Ld : SchedWriteRes<[ZnAGU, ZnFPU3]> {
- let Latency = 9;
-}
-
def ZnWriteFPU3Lat1 : SchedWriteRes<[ZnFPU3]> ;
def ZnWriteFPU0Lat1 : SchedWriteRes<[ZnFPU0]> ;
@@ -809,22 +910,18 @@ def ZnWriteFPU0Lat1Ld : SchedWriteRes<[ZnAGU, ZnFPU0]> {
let Latency = 8;
}
-// FABS.
-def : InstRW<[ZnWriteFPU3Lat2], (instregex "ABS_F")>;
-
// FCHS.
def : InstRW<[ZnWriteFPU3Lat1], (instregex "CHS_F")>;
// FCOM(P) FUCOM(P).
// r.
-def : InstRW<[ZnWriteFPU0Lat1], (instregex "COM_FST0r", "COMP_FST0r", "UCOM_Fr",
- "UCOM_FPr")>;
+def : InstRW<[ZnWriteFPU0Lat1], (instregex "COM(P?)_FST0r", "UCOM_F(P?)r")>;
// m.
-def : InstRW<[ZnWriteFPU0Lat1Ld], (instregex "FCOM(32|64)m", "FCOMP(32|64)m")>;
+def : InstRW<[ZnWriteFPU0Lat1Ld], (instregex "FCOM(P?)(32|64)m")>;
// FCOMPP FUCOMPP.
// r.
-def : InstRW<[ZnWriteFPU0Lat1], (instregex "FCOMPP", "UCOM_FPPr")>;
+def : InstRW<[ZnWriteFPU0Lat1], (instrs FCOMPP, UCOM_FPPr)>;
def ZnWriteFPU02 : SchedWriteRes<[ZnAGU, ZnFPU02]>
{
@@ -833,8 +930,7 @@ def ZnWriteFPU02 : SchedWriteRes<[ZnAGU, ZnFPU02]>
// FCOMI(P) FUCOMI(P).
// m.
-def : InstRW<[ZnWriteFPU02], (instregex "COM_FIr", "COM_FIPr", "UCOM_FIr",
- "UCOM_FIPr")>;
+def : InstRW<[ZnWriteFPU02], (instrs COM_FIPr, COM_FIr, UCOM_FIPr, UCOM_FIr)>;
def ZnWriteFPU03 : SchedWriteRes<[ZnAGU, ZnFPU03]>
{
@@ -844,92 +940,42 @@ def ZnWriteFPU03 : SchedWriteRes<[ZnAGU, ZnFPU03]>
}
// FICOM(P).
-def : InstRW<[ZnWriteFPU03], (instregex "FICOM(16|32)m", "FICOMP(16|32)m")>;
+def : InstRW<[ZnWriteFPU03], (instregex "FICOM(P?)(16|32)m")>;
// FTST.
def : InstRW<[ZnWriteFPU0Lat1], (instregex "TST_F")>;
// FXAM.
-def : InstRW<[ZnWriteFPU3Lat1], (instregex "FXAM")>;
+def : InstRW<[ZnWriteFPU3Lat1], (instrs FXAM)>;
// FPREM.
-def : InstRW<[WriteMicrocoded], (instregex "FPREM")>;
+def : InstRW<[WriteMicrocoded], (instrs FPREM)>;
// FPREM1.
-def : InstRW<[WriteMicrocoded], (instregex "FPREM1")>;
+def : InstRW<[WriteMicrocoded], (instrs FPREM1)>;
// FRNDINT.
-def : InstRW<[WriteMicrocoded], (instregex "FRNDINT")>;
+def : InstRW<[WriteMicrocoded], (instrs FRNDINT)>;
// FSCALE.
-def : InstRW<[WriteMicrocoded], (instregex "FSCALE")>;
+def : InstRW<[WriteMicrocoded], (instrs FSCALE)>;
// FXTRACT.
-def : InstRW<[WriteMicrocoded], (instregex "FXTRACT")>;
+def : InstRW<[WriteMicrocoded], (instrs FXTRACT)>;
// FNOP.
-def : InstRW<[ZnWriteFPU0Lat1], (instregex "FNOP")>;
+def : InstRW<[ZnWriteFPU0Lat1], (instrs FNOP)>;
// WAIT.
-def : InstRW<[ZnWriteFPU0Lat1], (instregex "WAIT")>;
+def : InstRW<[ZnWriteFPU0Lat1], (instrs WAIT)>;
// FNCLEX.
-def : InstRW<[WriteMicrocoded], (instregex "FNCLEX")>;
+def : InstRW<[WriteMicrocoded], (instrs FNCLEX)>;
// FNINIT.
-def : InstRW<[WriteMicrocoded], (instregex "FNINIT")>;
+def : InstRW<[WriteMicrocoded], (instrs FNINIT)>;
//=== Integer MMX and XMM Instructions ===//
-//-- Move instructions --//
-
-// Moves from GPR to FPR incurs a penalty
-def ZnWriteFPU2 : SchedWriteRes<[ZnFPU2]> {
- let Latency = 3;
-}
-
-// Move to ALU doesn't incur penalty
-def ZnWriteToALU2 : SchedWriteRes<[ZnFPU2]> {
- let Latency = 2;
-}
-
-def ZnWriteFPU : SchedWriteRes<[ZnFPU]>;
-def ZnWriteFPUY : SchedWriteRes<[ZnFPU]> {
- let NumMicroOps = 2;
- let Latency=2;
-}
-
-// MOVD.
-// r32/64 <- (x)mm.
-def : InstRW<[ZnWriteToALU2], (instregex "MMX_MOVD64grr", "MMX_MOVD64from64rr",
- "VMOVPDI2DIrr", "MOVPDI2DIrr")>;
-
-// (x)mm <- r32/64.
-def : InstRW<[ZnWriteFPU2], (instregex "MMX_MOVD64rr", "MMX_MOVD64to64rr",
- "VMOVDI2PDIrr", "MOVDI2PDIrr")>;
-
-// MOVQ.
-// r64 <- (x)mm.
-def : InstRW<[ZnWriteToALU2], (instregex "VMOVPQIto64rr")>;
-
-// (x)mm <- r64.
-def : InstRW<[ZnWriteFPU2], (instregex "VMOV64toPQIrr", "VMOVZQI2PQIrr")>;
-
-// (x)mm <- (x)mm.
-def : InstRW<[ZnWriteFPU], (instregex "MMX_MOVQ64rr")>;
-
-// (V)MOVDQA/U.
-// x <- x.
-def : InstRW<[ZnWriteFPU], (instregex "MOVDQ(A|U)rr", "VMOVDQ(A|U)rr",
- "MOVDQ(A|U)rr_REV", "VMOVDQ(A|U)rr_REV")>;
-
-// y <- y.
-def : InstRW<[ZnWriteFPUY], (instregex "VMOVDQ(A|U)Yrr", "VMOVDQ(A|U)Yrr_REV")>;
-
-// MOVDQ2Q.
-def : InstRW<[ZnWriteFPU], (instregex "MMX_MOVDQ2Qrr")>;
-
-// MOVQ2DQ.
-def : InstRW<[ZnWriteFPU], (instregex "MMX_MOVQ2DQrr")>;
// PACKSSWB/DW.
// mm <- mm.
@@ -938,15 +984,22 @@ def ZnWriteFPU12Y : SchedWriteRes<[ZnFPU12]> {
let NumMicroOps = 2;
}
def ZnWriteFPU12m : SchedWriteRes<[ZnAGU, ZnFPU12]> ;
+def ZnWriteFPU12Ym : SchedWriteRes<[ZnAGU, ZnFPU12]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
-def : InstRW<[ZnWriteFPU12], (instregex "MMX_PACKSSDWirr",
- "MMX_PACKSSWBirr", "MMX_PACKUSWBirr")>;
-def : InstRW<[ZnWriteFPU12m], (instregex "MMX_PACKSSDWirm",
- "MMX_PACKSSWBirm", "MMX_PACKUSWBirm")>;
+def : InstRW<[ZnWriteFPU12], (instrs MMX_PACKSSDWirr,
+ MMX_PACKSSWBirr,
+ MMX_PACKUSWBirr)>;
+def : InstRW<[ZnWriteFPU12m], (instrs MMX_PACKSSDWirm,
+ MMX_PACKSSWBirm,
+ MMX_PACKUSWBirm)>;
-// VPMOVSX/ZX BW BD BQ DW DQ.
+// VPMOVSX/ZX BW BD BQ WD WQ DQ.
// y <- x.
-def : InstRW<[ZnWriteFPU12Y], (instregex "VPMOV(SX|ZX)(BW|BQ|DW|DQ)Yrr")>;
+def : InstRW<[ZnWriteFPU12Y], (instregex "VPMOV(SX|ZX)(BW|BD|BQ|WD|WQ|DQ)Yrr")>;
+def : InstRW<[ZnWriteFPU12Ym], (instregex "VPMOV(SX|ZX)(BW|BD|BQ|WD|WQ|DQ)Yrm")>;
def ZnWriteFPU013 : SchedWriteRes<[ZnFPU013]> ;
def ZnWriteFPU013Y : SchedWriteRes<[ZnFPU013]> {
@@ -969,12 +1022,12 @@ def ZnWriteFPU013LdY : SchedWriteRes<[ZnAGU, ZnFPU013]> {
// x,x,i / v,v,v,i
def : InstRW<[ZnWriteFPU013], (instregex "(V?)PBLENDWrri")>;
// ymm
-def : InstRW<[ZnWriteFPU013Y], (instregex "(V?)PBLENDWYrri")>;
+def : InstRW<[ZnWriteFPU013Y], (instrs VPBLENDWYrri)>;
// x,m,i / v,v,m,i
def : InstRW<[ZnWriteFPU013Ld], (instregex "(V?)PBLENDWrmi")>;
// y,m,i
-def : InstRW<[ZnWriteFPU013LdY], (instregex "(V?)PBLENDWYrmi")>;
+def : InstRW<[ZnWriteFPU013LdY], (instrs VPBLENDWYrmi)>;
def ZnWriteFPU01 : SchedWriteRes<[ZnFPU01]> ;
def ZnWriteFPU01Y : SchedWriteRes<[ZnFPU01]> {
@@ -983,9 +1036,9 @@ def ZnWriteFPU01Y : SchedWriteRes<[ZnFPU01]> {
// VPBLENDD.
// v,v,v,i.
-def : InstRW<[ZnWriteFPU01], (instregex "VPBLENDDrri")>;
+def : InstRW<[ZnWriteFPU01], (instrs VPBLENDDrri)>;
// ymm
-def : InstRW<[ZnWriteFPU01Y], (instregex "VPBLENDDYrri")>;
+def : InstRW<[ZnWriteFPU01Y], (instrs VPBLENDDYrri)>;
// v,v,m,i
def ZnWriteFPU01Op2 : SchedWriteRes<[ZnAGU, ZnFPU01]> {
@@ -998,8 +1051,8 @@ def ZnWriteFPU01Op2Y : SchedWriteRes<[ZnAGU, ZnFPU01]> {
let Latency = 9;
let ResourceCycles = [1, 3];
}
-def : InstRW<[ZnWriteFPU01Op2], (instregex "VPBLENDDrmi")>;
-def : InstRW<[ZnWriteFPU01Op2Y], (instregex "VPBLENDDYrmi")>;
+def : InstRW<[ZnWriteFPU01Op2], (instrs VPBLENDDrmi)>;
+def : InstRW<[ZnWriteFPU01Op2Y], (instrs VPBLENDDYrmi)>;
// MASKMOVQ.
def : InstRW<[WriteMicrocoded], (instregex "MMX_MASKMOVQ(64)?")>;
@@ -1007,42 +1060,13 @@ def : InstRW<[WriteMicrocoded], (instregex "MMX_MASKMOVQ(64)?")>;
// MASKMOVDQU.
def : InstRW<[WriteMicrocoded], (instregex "(V?)MASKMOVDQU(64)?")>;
-// VPMASKMOVQ.
+// VPMASKMOVD.
// ymm
-def : InstRW<[ZnWriteFPU01Op2],(instregex "VPMASKMOVQrm")>;
-def : InstRW<[ZnWriteFPU01Op2Y],(instregex "VPMASKMOVQYrm")>;
-
def : InstRW<[WriteMicrocoded],
(instregex "VPMASKMOVD(Y?)rm")>;
// m, v,v.
def : InstRW<[WriteMicrocoded], (instregex "VPMASKMOV(D|Q)(Y?)mr")>;
-// PMOVMSKB.
-def ZnWritePMOVMSKB : SchedWriteRes<[ZnFPU2]> {
- let NumMicroOps = 2;
-}
-def ZnWritePMOVMSKBY : SchedWriteRes<[ZnFPU2]> {
- let Latency = 2;
-}
-def : InstRW<[ZnWritePMOVMSKB], (instregex "(V|MMX_)?PMOVMSKBrr")>;
-def : InstRW<[ZnWritePMOVMSKBY], (instregex "(V|MMX_)?PMOVMSKBYrr")>;
-
-// PEXTR B/W/D/Q.
-// r32,x,i.
-def ZnWritePEXTRr : SchedWriteRes<[ZnFPU12, ZnFPU2]> {
- let Latency = 2;
- let ResourceCycles = [1, 2];
-}
-def : InstRW<[ZnWritePEXTRr], (instregex "PEXTR(B|W|D|Q)rr", "MMX_PEXTRWirri")>;
-
-def ZnWritePEXTRm : SchedWriteRes<[ZnAGU, ZnFPU12, ZnFPU2]> {
- let Latency = 5;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 2, 3];
-}
-// m8,x,i.
-def : InstRW<[ZnWritePEXTRm], (instregex "PEXTR(B|W|D|Q)mr")>;
-
// VPBROADCAST B/W.
// x, m8/16.
def ZnWriteVPBROADCAST128Ld : SchedWriteRes<[ZnAGU, ZnFPU12]> {
@@ -1069,13 +1093,12 @@ def : InstRW<[WriteMicrocoded], (instregex "VPGATHER(Q|D)(Q|D)(Y?)rm")>;
// HADD, HSUB PS/PD
// PHADD|PHSUB (S) W/D.
-def : InstRW<[WriteMicrocoded], (instregex "MMX_PHADD(W?)r(r|m)64",
- "MMX_PHADDSWr(r|m)64",
- "MMX_PHSUB(W|D)r(r|m)64",
- "MMX_PHSUBSWrr64",
- "(V?)PH(ADD|SUB)(W|D)(Y?)r(r|m)",
- "(V?)PH(ADD|SUB)SWr(r|m)(256)?")>;
-
+def : SchedAlias<WritePHAdd, ZnWriteMicrocoded>;
+def : SchedAlias<WritePHAddLd, ZnWriteMicrocoded>;
+def : SchedAlias<WritePHAddX, ZnWriteMicrocoded>;
+def : SchedAlias<WritePHAddXLd, ZnWriteMicrocoded>;
+def : SchedAlias<WritePHAddY, ZnWriteMicrocoded>;
+def : SchedAlias<WritePHAddYLd, ZnWriteMicrocoded>;
// PCMPGTQ.
def ZnWritePCMPGTQr : SchedWriteRes<[ZnFPU03]>;
@@ -1092,69 +1115,16 @@ def ZnWritePCMPGTQYm : SchedWriteRes<[ZnAGU, ZnFPU03]> {
let ResourceCycles = [1,2];
}
def : InstRW<[ZnWritePCMPGTQm], (instregex "(V?)PCMPGTQrm")>;
-def : InstRW<[ZnWritePCMPGTQYm], (instregex "(V?)PCMPGTQYrm")>;
-
-// PMULLD.
-// x,x.
-def ZnWritePMULLDr : SchedWriteRes<[ZnFPU0]> {
- let Latency = 4;
-}
-// ymm.
-def ZnWritePMULLDYr : SchedWriteRes<[ZnFPU0]> {
- let Latency = 5;
- let ResourceCycles = [2];
-}
-def : InstRW<[ZnWritePMULLDr], (instregex "(V?)PMULLDrr")>;
-def : InstRW<[ZnWritePMULLDYr], (instregex "(V?)PMULLDYrr")>;
-
-// x,m.
-def ZnWritePMULLDm : SchedWriteRes<[ZnAGU, ZnFPU0]> {
- let Latency = 11;
- let NumMicroOps = 2;
-}
-// y,m.
-def ZnWritePMULLDYm : SchedWriteRes<[ZnAGU, ZnFPU0]> {
- let Latency = 12;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 2];
-}
-def : InstRW<[ZnWritePMULLDm], (instregex "(V?)PMULLDrm")>;
-def : InstRW<[ZnWritePMULLDYm], (instregex "(V?)PMULLDYrm")>;
+def : InstRW<[ZnWritePCMPGTQYm], (instrs VPCMPGTQYrm)>;
//-- Logic instructions --//
-// PTEST.
-// v,v.
-def ZnWritePTESTr : SchedWriteRes<[ZnFPU12]> {
- let ResourceCycles = [2];
-}
-def : InstRW<[ZnWritePTESTr], (instregex "(V?)PTEST(Y?)rr")>;
-
-// v,m.
-def ZnWritePTESTm : SchedWriteRes<[ZnAGU, ZnFPU12]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 2];
-}
-def : InstRW<[ZnWritePTESTm], (instregex "(V?)PTEST(Y?)rm")>;
-
// PSLL,PSRL,PSRA W/D/Q.
// x,x / v,v,x.
def ZnWritePShift : SchedWriteRes<[ZnFPU2]> ;
def ZnWritePShiftY : SchedWriteRes<[ZnFPU2]> {
let Latency = 2;
}
-def ZnWritePShiftLd : SchedWriteRes<[ZnAGU,ZnFPU2]> {
- let Latency = 8;
-}
-def ZnWritePShiftYLd : SchedWriteRes<[ZnAGU, ZnFPU2]> {
- let Latency = 9;
-}
-def : InstRW<[ZnWritePShift], (instregex "(V?)PS(LL|RL|RA)(W|D|Q)rr")>;
-def : InstRW<[ZnWritePShiftY], (instregex "(V?)PS(LL|RL|RA)(W|D|Q)Yrr")>;
-
-def : InstRW<[ZnWritePShiftLd], (instregex "(V?)PS(LL|RL|RA)(W|D|Q)rm")>;
-def : InstRW<[ZnWritePShiftYLd], (instregex "(V?)PS(LL|RL|RA)(W|D|Q)Yrm")>;
// PSLL,PSRL DQ.
def : InstRW<[ZnWritePShift], (instregex "(V?)PS(R|L)LDQri")>;
@@ -1163,33 +1133,16 @@ def : InstRW<[ZnWritePShiftY], (instregex "(V?)PS(R|L)LDQYri")>;
//=== Floating Point XMM and YMM Instructions ===//
//-- Move instructions --//
-// MOVMSKP S/D.
-// r32 <- x,y.
-def ZnWriteMOVMSKPr : SchedWriteRes<[ZnFPU2]> ;
-def : InstRW<[ZnWriteMOVMSKPr], (instregex "(V?)MOVMSKP(S|D)(Y?)rr")>;
-
// VPERM2F128.
-def : InstRW<[WriteMicrocoded], (instregex "VPERM2F128rr")>;
-def : InstRW<[WriteMicrocoded], (instregex "VPERM2F128rm")>;
-
-// BLENDVP S/D.
-def ZnWriteFPU01Lat3 : SchedWriteRes<[ZnFPU013]> {
- let Latency = 3;
-}
-def ZnWriteFPU01Lat3Ld : SchedWriteRes<[ZnAGU, ZnFPU013]> {
- let Latency = 11;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 2];
-}
-def : InstRW<[ZnWriteFPU01Lat3], (instregex "BLENDVP(S|D)rr0")>;
-def : InstRW<[ZnWriteFPU01Lat3Ld, ReadAfterLd], (instregex "BLENDVP(S|D)rm0")>;
+def : InstRW<[WriteMicrocoded], (instrs VPERM2F128rr)>;
+def : InstRW<[WriteMicrocoded], (instrs VPERM2F128rm)>;
def ZnWriteBROADCAST : SchedWriteRes<[ZnAGU, ZnFPU13]> {
let NumMicroOps = 2;
let Latency = 8;
}
// VBROADCASTF128.
-def : InstRW<[ZnWriteBROADCAST], (instregex "VBROADCASTF128")>;
+def : InstRW<[ZnWriteBROADCAST], (instrs VBROADCASTF128)>;
// EXTRACTPS.
// r32,x,i.
@@ -1210,10 +1163,10 @@ def : InstRW<[ZnWriteEXTRACTPSm], (instregex "(V?)EXTRACTPSmr")>;
// VEXTRACTF128.
// x,y,i.
-def : InstRW<[ZnWriteFPU013], (instregex "VEXTRACTF128rr")>;
+def : InstRW<[ZnWriteFPU013], (instrs VEXTRACTF128rr)>;
// m128,y,i.
-def : InstRW<[ZnWriteFPU013m], (instregex "VEXTRACTF128mr")>;
+def : InstRW<[ZnWriteFPU013m], (instrs VEXTRACTF128mr)>;
def ZnWriteVINSERT128r: SchedWriteRes<[ZnFPU013]> {
let Latency = 2;
@@ -1226,69 +1179,27 @@ def ZnWriteVINSERT128Ld: SchedWriteRes<[ZnAGU,ZnFPU013]> {
}
// VINSERTF128.
// y,y,x,i.
-def : InstRW<[ZnWriteVINSERT128r], (instregex "VINSERTF128rr")>;
-def : InstRW<[ZnWriteVINSERT128Ld], (instregex "VINSERTF128rm")>;
-
-// VMASKMOVP S/D.
-// x,x,m.
-def ZnWriteVMASKMOVPLd : SchedWriteRes<[ZnAGU, ZnFPU01]> {
- let Latency = 8;
-}
-// y,y,m.
-def ZnWriteVMASKMOVPLdY : SchedWriteRes<[ZnAGU, ZnFPU01]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 2];
-}
-def ZnWriteVMASKMOVPm : SchedWriteRes<[ZnAGU, ZnFPU01]> {
- let Latency = 4;
-}
-def : InstRW<[ZnWriteVMASKMOVPLd], (instregex "VMASKMOVP(S|D)rm")>;
-def : InstRW<[ZnWriteVMASKMOVPLdY], (instregex "VMASKMOVP(S|D)Yrm")>;
-def : InstRW<[ZnWriteVMASKMOVPm], (instregex "VMASKMOVP(S|D)mr")>;
-
-// m256,y,y.
-def ZnWriteVMASKMOVPYmr : SchedWriteRes<[ZnAGU,ZnFPU01]> {
- let Latency = 5;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 2];
-}
-def : InstRW<[ZnWriteVMASKMOVPYmr], (instregex "VMASKMOVP(S|D)Ymr")>;
-
-// VGATHERDPS.
-// x.
-def : InstRW<[WriteMicrocoded], (instregex "VGATHERDPSrm")>;
-// y.
-def : InstRW<[WriteMicrocoded], (instregex "VGATHERDPSYrm")>;
-
-// VGATHERQPS.
-// x.
-def : InstRW<[WriteMicrocoded], (instregex "VGATHERQPSrm")>;
+def : InstRW<[ZnWriteVINSERT128r], (instrs VINSERTF128rr)>;
+def : InstRW<[ZnWriteVINSERT128Ld], (instrs VINSERTF128rm)>;
-// y.
-def : InstRW<[WriteMicrocoded], (instregex "VGATHERQPSYrm")>;
-
-// VGATHERDPD.
-// x.
-def : InstRW<[WriteMicrocoded], (instregex "VGATHERDPDrm")>;
-
-// y.
-def : InstRW<[WriteMicrocoded], (instregex "VGATHERDPDYrm")>;
-
-// VGATHERQPD.
-// x.
-def : InstRW<[WriteMicrocoded], (instregex "VGATHERQPDrm")>;
-
-// y.
-def : InstRW<[WriteMicrocoded], (instregex "VGATHERQPDYrm")>;
+// VGATHER.
+def : InstRW<[WriteMicrocoded], (instregex "VGATHER(Q|D)(PD|PS)(Y?)rm")>;
//-- Conversion instructions --//
def ZnWriteCVTPD2PSr: SchedWriteRes<[ZnFPU3]> {
let Latency = 4;
}
+def ZnWriteCVTPD2PSYr: SchedWriteRes<[ZnFPU3]> {
+ let Latency = 5;
+}
+
// CVTPD2PS.
// x,x.
-def : InstRW<[ZnWriteCVTPD2PSr], (instregex "(V?)CVTPD2PSrr")>;
+def : SchedAlias<WriteCvtPD2PS, ZnWriteCVTPD2PSr>;
+// y,y.
+def : SchedAlias<WriteCvtPD2PSY, ZnWriteCVTPD2PSYr>;
+// z,z.
+defm : X86WriteResUnsupported<WriteCvtPD2PSZ>;
def ZnWriteCVTPD2PSLd: SchedWriteRes<[ZnAGU,ZnFPU03]> {
let Latency = 11;
@@ -1296,34 +1207,30 @@ def ZnWriteCVTPD2PSLd: SchedWriteRes<[ZnAGU,ZnFPU03]> {
let ResourceCycles = [1,2];
}
// x,m128.
-def : InstRW<[ZnWriteCVTPD2PSLd], (instregex "(V?)CVTPD2PS(X?)rm")>;
-
-// x,y.
-def ZnWriteCVTPD2PSYr : SchedWriteRes<[ZnFPU3]> {
- let Latency = 5;
-}
-def : InstRW<[ZnWriteCVTPD2PSYr], (instregex "(V?)CVTPD2PSYrr")>;
+def : SchedAlias<WriteCvtPD2PSLd, ZnWriteCVTPD2PSLd>;
// x,m256.
def ZnWriteCVTPD2PSYLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
let Latency = 11;
}
-def : InstRW<[ZnWriteCVTPD2PSYLd], (instregex "(V?)CVTPD2PSYrm")>;
+def : SchedAlias<WriteCvtPD2PSYLd, ZnWriteCVTPD2PSYLd>;
+// z,m512
+defm : X86WriteResUnsupported<WriteCvtPD2PSZLd>;
// CVTSD2SS.
// x,x.
// Same as WriteCVTPD2PSr
-def : InstRW<[ZnWriteCVTPD2PSr], (instregex "(Int_)?(V)?CVTSD2SSrr")>;
+def : SchedAlias<WriteCvtSD2SS, ZnWriteCVTPD2PSr>;
// x,m64.
-def : InstRW<[ZnWriteCVTPD2PSLd], (instregex "(Int_)?(V)?CVTSD2SSrm")>;
+def : SchedAlias<WriteCvtSD2SSLd, ZnWriteCVTPD2PSLd>;
// CVTPS2PD.
// x,x.
def ZnWriteCVTPS2PDr : SchedWriteRes<[ZnFPU3]> {
let Latency = 3;
}
-def : InstRW<[ZnWriteCVTPS2PDr], (instregex "(V?)CVTPS2PDrr")>;
+def : SchedAlias<WriteCvtPS2PD, ZnWriteCVTPS2PDr>;
// x,m64.
// y,m128.
@@ -1331,20 +1238,23 @@ def ZnWriteCVTPS2PDLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
let Latency = 10;
let NumMicroOps = 2;
}
-def : InstRW<[ZnWriteCVTPS2PDLd], (instregex "(V?)CVTPS2PD(Y?)rm")>;
+def : SchedAlias<WriteCvtPS2PDLd, ZnWriteCVTPS2PDLd>;
+def : SchedAlias<WriteCvtPS2PDYLd, ZnWriteCVTPS2PDLd>;
+defm : X86WriteResUnsupported<WriteCvtPS2PDZLd>;
// y,x.
def ZnWriteVCVTPS2PDY : SchedWriteRes<[ZnFPU3]> {
let Latency = 3;
}
-def : InstRW<[ZnWriteVCVTPS2PDY], (instregex "VCVTPS2PDYrr")>;
+def : SchedAlias<WriteCvtPS2PDY, ZnWriteVCVTPS2PDY>;
+defm : X86WriteResUnsupported<WriteCvtPS2PDZ>;
// CVTSS2SD.
// x,x.
def ZnWriteCVTSS2SDr : SchedWriteRes<[ZnFPU3]> {
let Latency = 4;
}
-def : InstRW<[ZnWriteCVTSS2SDr], (instregex "(Int_)?(V?)CVTSS2SDrr")>;
+def : SchedAlias<WriteCvtSS2SD, ZnWriteCVTSS2SDr>;
// x,m32.
def ZnWriteCVTSS2SDLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
@@ -1352,7 +1262,7 @@ def ZnWriteCVTSS2SDLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
let NumMicroOps = 2;
let ResourceCycles = [1, 2];
}
-def : InstRW<[ZnWriteCVTSS2SDLd], (instregex "(Int_)?(V?)CVTSS2SDrm")>;
+def : SchedAlias<WriteCvtSS2SDLd, ZnWriteCVTSS2SDLd>;
def ZnWriteCVTDQ2PDr: SchedWriteRes<[ZnFPU12,ZnFPU3]> {
let Latency = 5;
@@ -1363,7 +1273,7 @@ def : InstRW<[ZnWriteCVTDQ2PDr], (instregex "(V)?CVTDQ2PDrr")>;
// Same as xmm
// y,x.
-def : InstRW<[ZnWriteCVTDQ2PDr], (instregex "VCVTDQ2PDYrr")>;
+def : InstRW<[ZnWriteCVTDQ2PDr], (instrs VCVTDQ2PDYrr)>;
def ZnWriteCVTPD2DQr: SchedWriteRes<[ZnFPU12, ZnFPU3]> {
let Latency = 5;
@@ -1383,7 +1293,6 @@ def : InstRW<[ZnWriteCVTPD2DQLd], (instregex "(V?)CVT(T?)PD2DQrm")>;
def : InstRW<[ZnWriteCVTPD2DQr], (instregex "VCVT(T?)PD2DQYrr")>;
// x,m256.
def : InstRW<[ZnWriteCVTPD2DQLd], (instregex "VCVT(T?)PD2DQYrm")>;
-def : InstRW<[ZnWriteCVTPD2DQLd], (instregex "VCVT(T?)PD2DQ(64)?rm")>;
def ZnWriteCVTPS2PIr: SchedWriteRes<[ZnFPU3]> {
let Latency = 4;
@@ -1394,7 +1303,7 @@ def : InstRW<[ZnWriteCVTPS2PIr], (instregex "MMX_CVT(T?)PS2PIirr")>;
// CVTPI2PD.
// x,mm.
-def : InstRW<[ZnWriteCVTPS2PDr], (instregex "MMX_CVT(T?)PI2PDirr")>;
+def : InstRW<[ZnWriteCVTPS2PDr], (instrs MMX_CVTPI2PDirr)>;
// CVT(T)PD2PI.
// mm,x.
@@ -1403,24 +1312,21 @@ def : InstRW<[ZnWriteCVTPS2PIr], (instregex "MMX_CVT(T?)PD2PIirr")>;
def ZnWriteCVSTSI2SSr: SchedWriteRes<[ZnFPU3]> {
let Latency = 5;
}
-// CVSTSI2SS.
-// x,r32.
-def : InstRW<[ZnWriteCVSTSI2SSr], (instregex "(Int_)?(V?)CVT(T?)SI2SS(64)?rr")>;
// same as CVTPD2DQr
// CVT(T)SS2SI.
// r32,x.
-def : InstRW<[ZnWriteCVTPD2DQr], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rr")>;
+def : InstRW<[ZnWriteCVTPD2DQr], (instregex "(V?)CVT(T?)SS2SI(64)?rr")>;
// same as CVTPD2DQm
// r32,m32.
-def : InstRW<[ZnWriteCVTPD2DQLd], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rm")>;
+def : InstRW<[ZnWriteCVTPD2DQLd], (instregex "(V?)CVT(T?)SS2SI(64)?rm")>;
def ZnWriteCVSTSI2SDr: SchedWriteRes<[ZnFPU013, ZnFPU3]> {
let Latency = 5;
}
// CVTSI2SD.
// x,r32/64.
-def : InstRW<[ZnWriteCVSTSI2SDr], (instregex "(Int_)?(V?)CVTSI2SS(64)?rr")>;
+def : InstRW<[ZnWriteCVSTSI2SDr], (instregex "(V?)CVTSI(64)?2SDrr")>;
def ZnWriteCVSTSI2SIr: SchedWriteRes<[ZnFPU3, ZnFPU2]> {
@@ -1431,34 +1337,29 @@ def ZnWriteCVSTSI2SILd: SchedWriteRes<[ZnAGU, ZnFPU3, ZnFPU2]> {
}
// CVTSD2SI.
// r32/64
-def : InstRW<[ZnWriteCVSTSI2SIr], (instregex "(Int_)?CVT(T?)SD2SI(64)?rr")>;
+def : InstRW<[ZnWriteCVSTSI2SIr], (instregex "(V?)CVT(T?)SD2SI(64)?rr")>;
// r32,m32.
-def : InstRW<[ZnWriteCVSTSI2SILd], (instregex "(Int_)?CVT(T?)SD2SI(64)?rm")>;
-
-
-def ZnWriteVCVSTSI2SIr: SchedWriteRes<[ZnFPU3]> {
- let Latency = 5;
-}
-def ZnWriteVCVSTSI2SILd: SchedWriteRes<[ZnFPU3, ZnAGU]> {
- let Latency = 12;
-}
-// VCVTSD2SI.
-// r32/64
-def : InstRW<[ZnWriteCVSTSI2SIr], (instregex "(Int_)?VCVT(T?)SD2SI(64)?rr")>;
-// r32,m32.
-def : InstRW<[ZnWriteCVSTSI2SILd], (instregex "(Int_)?VCVT(T?)SD2SI(64)?rm")>;
+def : InstRW<[ZnWriteCVSTSI2SILd], (instregex "(V?)CVT(T?)SD2SI(64)?rm")>;
// VCVTPS2PH.
// x,v,i.
-def : InstRW<[WriteMicrocoded], (instregex "VCVTPS2PH(Y?)rr")>;
+def : SchedAlias<WriteCvtPS2PH, ZnWriteMicrocoded>;
+def : SchedAlias<WriteCvtPS2PHY, ZnWriteMicrocoded>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
// m,v,i.
-def : InstRW<[WriteMicrocoded], (instregex "VCVTPS2PH(Y?)mr")>;
+def : SchedAlias<WriteCvtPS2PHSt, ZnWriteMicrocoded>;
+def : SchedAlias<WriteCvtPS2PHYSt, ZnWriteMicrocoded>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
// VCVTPH2PS.
// v,x.
-def : InstRW<[WriteMicrocoded], (instregex "VCVTPH2PS(Y?)rr")>;
+def : SchedAlias<WriteCvtPH2PS, ZnWriteMicrocoded>;
+def : SchedAlias<WriteCvtPH2PSY, ZnWriteMicrocoded>;
+defm : X86WriteResUnsupported<WriteCvtPH2PSZ>;
// v,m.
-def : InstRW<[WriteMicrocoded], (instregex "VCVTPH2PS(Y?)rm")>;
+def : SchedAlias<WriteCvtPH2PSLd, ZnWriteMicrocoded>;
+def : SchedAlias<WriteCvtPH2PSYLd, ZnWriteMicrocoded>;
+defm : X86WriteResUnsupported<WriteCvtPH2PSZLd>;
//-- SSE4A instructions --//
// EXTRQ
@@ -1473,12 +1374,6 @@ def ZnWriteINSERTQ: SchedWriteRes<[ZnFPU03,ZnFPU1]> {
}
def : InstRW<[ZnWriteINSERTQ], (instregex "INSERTQ")>;
-// MOVNTSS/MOVNTSD
-def ZnWriteMOVNT: SchedWriteRes<[ZnAGU,ZnFPU2]> {
- let Latency = 8;
-}
-def : InstRW<[ZnWriteMOVNT], (instregex "MOVNTS(S|D)")>;
-
//-- SHA instructions --//
// SHA256MSG2
def : InstRW<[WriteMicrocoded], (instregex "SHA256MSG2(Y?)r(r|m)")>;
@@ -1544,41 +1439,19 @@ def : InstRW<[ZnWriteSHA256RNDS2Ld], (instregex "SHA256RNDS2rm")>;
//-- Arithmetic instructions --//
// HADD, HSUB PS/PD
-def : InstRW<[WriteMicrocoded], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)r(r|m)")>;
-
-// MULL SS/SD PS/PD.
-// x,x / v,v,v.
-def ZnWriteMULr : SchedWriteRes<[ZnFPU01]> {
- let Latency = 3;
-}
-// ymm.
-def ZnWriteMULYr : SchedWriteRes<[ZnFPU01]> {
- let Latency = 4;
-}
-def : InstRW<[ZnWriteMULr], (instregex "(V?)MUL(P|S)(S|D)rr")>;
-def : InstRW<[ZnWriteMULYr], (instregex "(V?)MUL(P|S)(S|D)Yrr")>;
-
-// x,m / v,v,m.
-def ZnWriteMULLd : SchedWriteRes<[ZnAGU, ZnFPU01]> {
- let Latency = 10;
- let NumMicroOps = 2;
-}
-def : InstRW<[ZnWriteMULLd], (instregex "(V?)MUL(P|S)(S|D)rm")>;
-
-// ymm
-def ZnWriteMULYLd : SchedWriteRes<[ZnAGU, ZnFPU01]> {
- let Latency = 11;
- let NumMicroOps = 2;
-}
-def : InstRW<[ZnWriteMULYLd], (instregex "(V?)MUL(P|S)(S|D)Yrm")>;
+def : SchedAlias<WriteFHAdd, ZnWriteMicrocoded>;
+def : SchedAlias<WriteFHAddLd, ZnWriteMicrocoded>;
+def : SchedAlias<WriteFHAddY, ZnWriteMicrocoded>;
+def : SchedAlias<WriteFHAddYLd, ZnWriteMicrocoded>;
// VDIVPS.
+// TODO - convert to ZnWriteResFpuPair
// y,y,y.
def ZnWriteVDIVPSYr : SchedWriteRes<[ZnFPU3]> {
let Latency = 12;
let ResourceCycles = [12];
}
-def : InstRW<[ZnWriteVDIVPSYr], (instregex "VDIVPSYrr")>;
+def : SchedAlias<WriteFDivY, ZnWriteVDIVPSYr>;
// y,y,m256.
def ZnWriteVDIVPSYLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
@@ -1586,15 +1459,16 @@ def ZnWriteVDIVPSYLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
let NumMicroOps = 2;
let ResourceCycles = [1, 19];
}
-def : InstRW<[ZnWriteVDIVPSYLd], (instregex "VDIVPSYrm")>;
+def : SchedAlias<WriteFDivYLd, ZnWriteVDIVPSYLd>;
// VDIVPD.
+// TODO - convert to ZnWriteResFpuPair
// y,y,y.
def ZnWriteVDIVPDY : SchedWriteRes<[ZnFPU3]> {
let Latency = 15;
let ResourceCycles = [15];
}
-def : InstRW<[ZnWriteVDIVPDY], (instregex "VDIVPDYrr")>;
+def : SchedAlias<WriteFDiv64Y, ZnWriteVDIVPDY>;
// y,y,m256.
def ZnWriteVDIVPDYLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
@@ -1602,173 +1476,63 @@ def ZnWriteVDIVPDYLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
let NumMicroOps = 2;
let ResourceCycles = [1,22];
}
-def : InstRW<[ZnWriteVDIVPDYLd], (instregex "VDIVPDYrm")>;
-
-// VRCPPS.
-// y,y.
-def ZnWriteVRCPPSr : SchedWriteRes<[ZnFPU01]> {
- let Latency = 5;
-}
-def : InstRW<[ZnWriteVRCPPSr], (instregex "VRCPPSYr(_Int)?")>;
-
-// y,m256.
-def ZnWriteVRCPPSLd : SchedWriteRes<[ZnAGU, ZnFPU01]> {
- let Latency = 12;
- let NumMicroOps = 3;
-}
-def : InstRW<[ZnWriteVRCPPSLd], (instregex "VRCPPSYm(_Int)?")>;
-
-// ROUND SS/SD PS/PD.
-// v,v,i.
-def ZnWriteROUNDr : SchedWriteRes<[ZnFPU3]> {
- let Latency = 4;
-}
-def : InstRW<[ZnWriteROUNDr], (instregex "(V?)ROUND(Y?)(S|P)(S|D)r(_Int)?")>;
-
-// VFMADD.
-// v,v,v.
-def ZnWriteFMADDr : SchedWriteRes<[ZnFPU03]> {
- let Latency = 5;
-}
-def : InstRW<[ZnWriteFMADDr],
- (instregex
- "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(213|132|231)(Y)?r",
- "VF(N?)M(ADD|SUB)(132|231|213)S(S|D)r",
- "VF(N?)M(ADD|SUB)S(S|D)4rr(_REV|_Int)?",
- "VF(N?)M(ADD|SUB)P(S|D)4rr(Y)?(_REV)?")>;
-
-// v,v,m.
-def ZnWriteFMADDm : SchedWriteRes<[ZnAGU, ZnFPU03]> {
- let Latency = 12;
- let NumMicroOps = 2;
-}
-def : InstRW<[ZnWriteFMADDm],
- (instregex
- "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)(213|132|231)P(S|D)(Y)?m",
- "VF(N?)M(ADD|SUB)(132|231|213)S(S|D)m",
- "VF(N?)M(ADD|SUB)S(S|D)4(rm|mr)(_Int)?",
- "VF(N?)M(ADD|SUB)P(S|D)4(rm|mr)(Y)?")>;
-
-// v,m,i.
-def ZnWriteROUNDm : SchedWriteRes<[ZnAGU, ZnFPU3]> {
- let Latency = 11;
- let NumMicroOps = 2;
-}
-def : InstRW<[ZnWriteROUNDm], (instregex "(V?)ROUND(Y?)(S|P)(S|D)m(_Int)?")>;
+def : SchedAlias<WriteFDiv64YLd, ZnWriteVDIVPDYLd>;
// DPPS.
// x,x,i / v,v,v,i.
-def : InstRW<[WriteMicrocoded], (instregex "(V?)DPPS(Y?)rri")>;
+def : SchedAlias<WriteDPPS, ZnWriteMicrocoded>;
+def : SchedAlias<WriteDPPSY, ZnWriteMicrocoded>;
// x,m,i / v,v,m,i.
-def : InstRW<[WriteMicrocoded], (instregex "(V?)DPPS(Y?)rmi")>;
+def : SchedAlias<WriteDPPSLd, ZnWriteMicrocoded>;
+def : SchedAlias<WriteDPPSYLd, ZnWriteMicrocoded>;
// DPPD.
// x,x,i.
-def : InstRW<[WriteMicrocoded], (instregex "(V?)DPPDrri")>;
+def : SchedAlias<WriteDPPD, ZnWriteMicrocoded>;
// x,m,i.
-def : InstRW<[WriteMicrocoded], (instregex "(V?)DPPDrmi")>;
-
-// VSQRTPS.
-// y,y.
-def ZnWriteVSQRTPSYr : SchedWriteRes<[ZnFPU3]> {
- let Latency = 28;
- let ResourceCycles = [28];
-}
-def : InstRW<[ZnWriteVSQRTPSYr], (instregex "VSQRTPSYr")>;
-
-// y,m256.
-def ZnWriteVSQRTPSYLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
- let Latency = 35;
- let ResourceCycles = [1,35];
- let NumMicroOps = 2;
-}
-def : InstRW<[ZnWriteVSQRTPSYLd], (instregex "VSQRTPSYm")>;
-
-// VSQRTPD.
-// y,y.
-def ZnWriteVSQRTPDYr : SchedWriteRes<[ZnFPU3]> {
- let Latency = 40;
- let ResourceCycles = [40];
-}
-def : InstRW<[ZnWriteVSQRTPDYr], (instregex "VSQRTPDYr")>;
-
-// y,m256.
-def ZnWriteVSQRTPDYLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
- let Latency = 47;
- let NumMicroOps = 2;
- let ResourceCycles = [1,47];
-}
-def : InstRW<[ZnWriteVSQRTPDYLd], (instregex "VSQRTPDYm")>;
+def : SchedAlias<WriteDPPDLd, ZnWriteMicrocoded>;
// RSQRTSS
+// TODO - convert to ZnWriteResFpuPair
// x,x.
def ZnWriteRSQRTSSr : SchedWriteRes<[ZnFPU02]> {
let Latency = 5;
}
-def : InstRW<[ZnWriteRSQRTSSr], (instregex "(V?)RSQRTSS(Y?)r(_Int)?")>;
+def : SchedAlias<WriteFRsqrt, ZnWriteRSQRTSSr>;
-// RSQRTPS
-// x,x.
-def ZnWriteRSQRTPSr : SchedWriteRes<[ZnFPU01]> {
- let Latency = 5;
-}
-def : InstRW<[ZnWriteRSQRTPSr], (instregex "(V?)RSQRTPS(Y?)r(_Int)?")>;
-
-// RSQRTSSm
// x,m128.
def ZnWriteRSQRTSSLd: SchedWriteRes<[ZnAGU, ZnFPU02]> {
let Latency = 12;
let NumMicroOps = 2;
- let ResourceCycles = [1,2];
-}
-def : InstRW<[ZnWriteRSQRTSSLd], (instregex "(V?)RSQRTSSm(_Int)?")>;
-
-// RSQRTPSm
-def ZnWriteRSQRTPSLd : SchedWriteRes<[ZnAGU, ZnFPU01]> {
- let Latency = 12;
- let NumMicroOps = 2;
+ let ResourceCycles = [1,2]; // FIXME: Is this right?
}
-def : InstRW<[ZnWriteRSQRTPSLd], (instregex "(V?)RSQRTPSm(_Int)?")>;
+def : SchedAlias<WriteFRsqrtLd, ZnWriteRSQRTSSLd>;
-// RSQRTPS 256.
+// RSQRTPS
+// TODO - convert to ZnWriteResFpuPair
// y,y.
def ZnWriteRSQRTPSYr : SchedWriteRes<[ZnFPU01]> {
let Latency = 5;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def : InstRW<[ZnWriteRSQRTPSYr], (instregex "VRSQRTPSYr(_Int)?")>;
+def : SchedAlias<WriteFRsqrtY, ZnWriteRSQRTPSYr>;
// y,m256.
def ZnWriteRSQRTPSYLd : SchedWriteRes<[ZnAGU, ZnFPU01]> {
let Latency = 12;
let NumMicroOps = 2;
}
-def : InstRW<[ZnWriteRSQRTPSYLd], (instregex "VRSQRTPSYm(_Int)?")>;
-
-//-- Logic instructions --//
-
-// AND, ANDN, OR, XOR PS/PD.
-// x,x / v,v,v.
-def : InstRW<[WriteVecLogic], (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rr")>;
-// x,m / v,v,m.
-def : InstRW<[WriteVecLogicLd],
- (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rm")>;
+def : SchedAlias<WriteFRsqrtYLd, ZnWriteRSQRTPSYLd>;
//-- Other instructions --//
// VZEROUPPER.
-def : InstRW<[WriteMicrocoded], (instregex "VZEROUPPER")>;
+def : InstRW<[WriteMicrocoded], (instrs VZEROUPPER)>;
// VZEROALL.
-def : InstRW<[WriteMicrocoded], (instregex "VZEROALL")>;
-
-// LDMXCSR.
-def : InstRW<[WriteMicrocoded], (instregex "(V)?LDMXCSR")>;
-
-// STMXCSR.
-def : InstRW<[WriteMicrocoded], (instregex "(V)?STMXCSR")>;
+def : InstRW<[WriteMicrocoded], (instrs VZEROALL)>;
} // SchedModel
diff --git a/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/lib/Target/X86/X86SpeculativeLoadHardening.cpp
new file mode 100644
index 000000000000..078fe1598f13
--- /dev/null
+++ b/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -0,0 +1,2247 @@
+//====- X86SpeculativeLoadHardening.cpp - A Spectre v1 mitigation ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// Provide a pass which mitigates speculative execution attacks which operate
+/// by speculating incorrectly past some predicate (a type check, bounds check,
+/// or other condition) to reach a load with invalid inputs and leak the data
+/// accessed by that load using a side channel out of the speculative domain.
+///
+/// For details on the attacks, see the first variant in both the Project Zero
+/// writeup and the Spectre paper:
+/// https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
+/// https://spectreattack.com/spectre.pdf
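+///
+/// As a rough illustration (not taken from the references above), the kind of
+/// variant-1 gadget this pass is concerned with is a mispredicted bounds check
+/// followed by dependent loads, e.g.:
+///
+/// \code
+///   if (idx < array1_size)                   // branch may be mispredicted
+///     tmp &= array2[array1[idx] * 4096];     // leaks array1[idx] via the cache
+/// \endcode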
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSSAUpdater.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+
+#define PASS_KEY "x86-speculative-load-hardening"
+#define DEBUG_TYPE PASS_KEY
+
+STATISTIC(NumCondBranchesTraced, "Number of conditional branches traced");
+STATISTIC(NumBranchesUntraced, "Number of branches unable to trace");
+STATISTIC(NumAddrRegsHardened,
+ "Number of address mode used registers hardaned");
+STATISTIC(NumPostLoadRegsHardened,
+ "Number of post-load register values hardened");
+STATISTIC(NumCallsOrJumpsHardened,
+ "Number of calls or jumps requiring extra hardening");
+STATISTIC(NumInstsInserted, "Number of instructions inserted");
+STATISTIC(NumLFENCEsInserted, "Number of lfence instructions inserted");
+
+static cl::opt<bool> HardenEdgesWithLFENCE(
+ PASS_KEY "-lfence",
+ cl::desc(
+ "Use LFENCE along each conditional edge to harden against speculative "
+ "loads rather than conditional movs and poisoned pointers."),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool> EnablePostLoadHardening(
+ PASS_KEY "-post-load",
+ cl::desc("Harden the value loaded *after* it is loaded by "
+ "flushing the loaded bits to 1. This is hard to do "
+ "in general but can be done easily for GPRs."),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool> FenceCallAndRet(
+ PASS_KEY "-fence-call-and-ret",
+ cl::desc("Use a full speculation fence to harden both call and ret edges "
+ "rather than a lighter weight mitigation."),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool> HardenInterprocedurally(
+ PASS_KEY "-ip",
+ cl::desc("Harden interprocedurally by passing our state in and out of "
+ "functions in the high bits of the stack pointer."),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+ HardenLoads(PASS_KEY "-loads",
+ cl::desc("Sanitize loads from memory. When disable, no "
+ "significant security is provided."),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool> HardenIndirectCallsAndJumps(
+ PASS_KEY "-indirect",
+ cl::desc("Harden indirect calls and jumps against using speculatively "
+ "stored attacker controlled addresses. This is designed to "
+ "mitigate Spectre v1.2 style attacks."),
+ cl::init(true), cl::Hidden);
+
+namespace llvm {
+
+void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
+
+} // end namespace llvm
+
+namespace {
+
+class X86SpeculativeLoadHardeningPass : public MachineFunctionPass {
+public:
+ X86SpeculativeLoadHardeningPass() : MachineFunctionPass(ID) {
+ initializeX86SpeculativeLoadHardeningPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override {
+ return "X86 speculative load hardening";
+ }
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ /// Pass identification, replacement for typeid.
+ static char ID;
+
+private:
+ /// The information about a block's conditional terminators needed to trace
+ /// our predicate state through the exiting edges.
+ struct BlockCondInfo {
+ MachineBasicBlock *MBB;
+
+ // We mostly have one conditional branch, and in extremely rare cases have
+ // two. Three and more are so rare as to be unimportant for compile time.
+ SmallVector<MachineInstr *, 2> CondBrs;
+
+ MachineInstr *UncondBr;
+ };
+
+ /// Manages the predicate state traced through the program.
+ struct PredState {
+ unsigned InitialReg;
+ unsigned PoisonReg;
+
+ const TargetRegisterClass *RC;
+ MachineSSAUpdater SSA;
+
+ PredState(MachineFunction &MF, const TargetRegisterClass *RC)
+ : RC(RC), SSA(MF) {}
+ };
+
+ const X86Subtarget *Subtarget;
+ MachineRegisterInfo *MRI;
+ const X86InstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+
+ Optional<PredState> PS;
+
+ void hardenEdgesWithLFENCE(MachineFunction &MF);
+
+ SmallVector<BlockCondInfo, 16> collectBlockCondInfo(MachineFunction &MF);
+
+ SmallVector<MachineInstr *, 16>
+ tracePredStateThroughCFG(MachineFunction &MF, ArrayRef<BlockCondInfo> Infos);
+
+ void unfoldCallAndJumpLoads(MachineFunction &MF);
+
+ void tracePredStateThroughBlocksAndHarden(MachineFunction &MF);
+
+ unsigned saveEFLAGS(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertPt, DebugLoc Loc);
+ void restoreEFLAGS(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
+ unsigned OFReg);
+
+ void mergePredStateIntoSP(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
+ unsigned PredStateReg);
+ unsigned extractPredStateFromSP(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertPt,
+ DebugLoc Loc);
+
+ void
+ hardenLoadAddr(MachineInstr &MI, MachineOperand &BaseMO,
+ MachineOperand &IndexMO,
+ SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg);
+ MachineInstr *
+ sinkPostLoadHardenedInst(MachineInstr &MI,
+ SmallPtrSetImpl<MachineInstr *> &HardenedInstrs);
+ bool canHardenRegister(unsigned Reg);
+ unsigned hardenValueInRegister(unsigned Reg, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertPt,
+ DebugLoc Loc);
+ unsigned hardenPostLoad(MachineInstr &MI);
+ void hardenReturnInstr(MachineInstr &MI);
+ void tracePredStateThroughCall(MachineInstr &MI);
+ void hardenIndirectCallOrJumpInstr(
+ MachineInstr &MI,
+ SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg);
+};
+
+} // end anonymous namespace
+
+char X86SpeculativeLoadHardeningPass::ID = 0;
+
+void X86SpeculativeLoadHardeningPass::getAnalysisUsage(
+ AnalysisUsage &AU) const {
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+static MachineBasicBlock &splitEdge(MachineBasicBlock &MBB,
+ MachineBasicBlock &Succ, int SuccCount,
+ MachineInstr *Br, MachineInstr *&UncondBr,
+ const X86InstrInfo &TII) {
+ assert(!Succ.isEHPad() && "Shouldn't get edges to EH pads!");
+
+ MachineFunction &MF = *MBB.getParent();
+
+ MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock();
+
+ // We have to insert the new block immediately after the current one as we
+ // don't know what layout-successor relationships the successor has and we
+ // may not be able to (and generally don't want to) try to fix those up.
+ MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB);
+
+ // Update the branch instruction if necessary.
+ if (Br) {
+ assert(Br->getOperand(0).getMBB() == &Succ &&
+ "Didn't start with the right target!");
+ Br->getOperand(0).setMBB(&NewMBB);
+
+ // If this successor was reached through a branch rather than fallthrough,
+ // we might have *broken* fallthrough and so need to inject a new
+ // unconditional branch.
+ if (!UncondBr) {
+ MachineBasicBlock &OldLayoutSucc =
+ *std::next(MachineFunction::iterator(&NewMBB));
+ assert(MBB.isSuccessor(&OldLayoutSucc) &&
+ "Without an unconditional branch, the old layout successor should "
+ "be an actual successor!");
+ auto BrBuilder =
+ BuildMI(&MBB, DebugLoc(), TII.get(X86::JMP_1)).addMBB(&OldLayoutSucc);
+ // Update the unconditional branch now that we've added one.
+ UncondBr = &*BrBuilder;
+ }
+
+ // Insert unconditional "jump Succ" instruction in the new block if
+ // necessary.
+ if (!NewMBB.isLayoutSuccessor(&Succ)) {
+ SmallVector<MachineOperand, 4> Cond;
+ TII.insertBranch(NewMBB, &Succ, nullptr, Cond, Br->getDebugLoc());
+ }
+ } else {
+ assert(!UncondBr &&
+ "Cannot have a branchless successor and an unconditional branch!");
+ assert(NewMBB.isLayoutSuccessor(&Succ) &&
+ "A non-branch successor must have been a layout successor before "
+ "and now is a layout successor of the new block.");
+ }
+
+ // If this is the only edge to the successor, we can just replace it in the
+ // CFG. Otherwise we need to add a new entry in the CFG for the new
+ // successor.
+ if (SuccCount == 1) {
+ MBB.replaceSuccessor(&Succ, &NewMBB);
+ } else {
+ MBB.splitSuccessor(&Succ, &NewMBB);
+ }
+
+ // Hook up the edge from the new basic block to the old successor in the CFG.
+ NewMBB.addSuccessor(&Succ);
+
+ // Fix PHI nodes in Succ so they refer to NewMBB instead of MBB.
+ for (MachineInstr &MI : Succ) {
+ if (!MI.isPHI())
+ break;
+ for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
+ OpIdx += 2) {
+ MachineOperand &OpV = MI.getOperand(OpIdx);
+ MachineOperand &OpMBB = MI.getOperand(OpIdx + 1);
+ assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!");
+ if (OpMBB.getMBB() != &MBB)
+ continue;
+
+ // If this is the last edge to the successor, just replace MBB in the PHI.
+ if (SuccCount == 1) {
+ OpMBB.setMBB(&NewMBB);
+ break;
+ }
+
+ // Otherwise, append a new pair of operands for the new incoming edge.
+ MI.addOperand(MF, OpV);
+ MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB));
+ break;
+ }
+ }
+
+ // Inherit live-ins from the successor
+ for (auto &LI : Succ.liveins())
+ NewMBB.addLiveIn(LI);
+
+ LLVM_DEBUG(dbgs() << " Split edge from '" << MBB.getName() << "' to '"
+ << Succ.getName() << "'.\n");
+ return NewMBB;
+}
+
+/// Remove duplicate PHI operands to leave the PHI in a canonical and
+/// predictable form.
+///
+/// FIXME: It's really frustrating that we have to do this, but SSA-form in MIR
+/// isn't what you might expect. We may have multiple entries in PHI nodes for
+/// a single predecessor. This makes CFG-updating extremely complex, so here we
+/// simplify all PHI nodes to a model even simpler than the IR's model: exactly
+/// one entry per predecessor, regardless of how many edges there are.
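+///
+/// As an illustrative (hypothetical) example, a MIR PHI of the form
+///   %v = PHI %a, %bb.0, %b, %bb.0, %c, %bb.1
+/// is rewritten to keep exactly one value/block pair per predecessor:
+///   %v = PHI %a, %bb.0, %c, %bb.1
+/// (the first entry for a predecessor is kept; later duplicates are dropped).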
+static void canonicalizePHIOperands(MachineFunction &MF) {
+ SmallPtrSet<MachineBasicBlock *, 4> Preds;
+ SmallVector<int, 4> DupIndices;
+ for (auto &MBB : MF)
+ for (auto &MI : MBB) {
+ if (!MI.isPHI())
+ break;
+
+ // First we scan the operands of the PHI looking for duplicate entries for
+ // a particular predecessor. We retain the operand index of each duplicate
+ // entry found.
+ for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
+ OpIdx += 2)
+ if (!Preds.insert(MI.getOperand(OpIdx + 1).getMBB()).second)
+ DupIndices.push_back(OpIdx);
+
+ // Now walk the duplicate indices, removing both the block and value. Note
+ // that these are stored as a vector, making this element-wise removal
+ // potentially quadratic.
+ //
+ // FIXME: It is really frustrating that we have to use a quadratic
+ // removal algorithm here. There should be a better way, but the use-def
+ // updates required make that impossible using the public API.
+ //
+ // Note that we have to process these backwards so that we don't
+ // invalidate other indices with each removal.
+ while (!DupIndices.empty()) {
+ int OpIdx = DupIndices.pop_back_val();
+ // Remove both the block and value operand, again in reverse order to
+ // preserve indices.
+ MI.RemoveOperand(OpIdx + 1);
+ MI.RemoveOperand(OpIdx);
+ }
+
+ Preds.clear();
+ }
+}
+
+/// Helper to scan a function for loads vulnerable to misspeculation that we
+/// want to harden.
+///
+/// We use this to avoid making changes to functions where there is nothing we
+/// need to do to harden against misspeculation.
+static bool hasVulnerableLoad(MachineFunction &MF) {
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ // Loads within this basic block after an LFENCE are not at risk of
+ // speculatively executing with invalid predicates from prior control
+ // flow. So break out of this block but continue scanning the function.
+ if (MI.getOpcode() == X86::LFENCE)
+ break;
+
+ // Looking for loads only.
+ if (!MI.mayLoad())
+ continue;
+
+ // An MFENCE is modeled as a load but isn't vulnerable to misspeculation.
+ if (MI.getOpcode() == X86::MFENCE)
+ continue;
+
+ // We found a load.
+ return true;
+ }
+ }
+
+ // No loads found.
+ return false;
+}
+
+bool X86SpeculativeLoadHardeningPass::runOnMachineFunction(
+ MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
+ << " **********\n");
+
+ Subtarget = &MF.getSubtarget<X86Subtarget>();
+ MRI = &MF.getRegInfo();
+ TII = Subtarget->getInstrInfo();
+ TRI = Subtarget->getRegisterInfo();
+
+ // FIXME: Support for 32-bit.
+ PS.emplace(MF, &X86::GR64_NOSPRegClass);
+
+ if (MF.begin() == MF.end())
+ // Nothing to do for a degenerate empty function...
+ return false;
+
+ // We support an alternative hardening technique based on a debug flag.
+ if (HardenEdgesWithLFENCE) {
+ hardenEdgesWithLFENCE(MF);
+ return true;
+ }
+
+ // Create a dummy debug loc to use for all the generated code here.
+ DebugLoc Loc;
+
+ MachineBasicBlock &Entry = *MF.begin();
+ auto EntryInsertPt = Entry.SkipPHIsLabelsAndDebug(Entry.begin());
+
+ // Do a quick scan to see if we have any checkable loads.
+ bool HasVulnerableLoad = hasVulnerableLoad(MF);
+
+ // See if we have any conditional branching blocks that we will need to trace
+ // predicate state through.
+ SmallVector<BlockCondInfo, 16> Infos = collectBlockCondInfo(MF);
+
+ // If we have no interesting conditions or loads, nothing to do here.
+ if (!HasVulnerableLoad && Infos.empty())
+ return true;
+
+ // The poison value is required to be an all-ones value for many aspects of
+ // this mitigation.
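+ // (With an all-ones state value, OR-ing it into an address or a loaded value
+ // flushes every bit to 1 while misspeculating, which is what the post-load
+ // hardening option above describes as "flushing the loaded bits to 1".)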
+ const int PoisonVal = -1;
+ PS->PoisonReg = MRI->createVirtualRegister(PS->RC);
+ BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV64ri32), PS->PoisonReg)
+ .addImm(PoisonVal);
+ ++NumInstsInserted;
+
+ // If we have loads being hardened and we've asked for call and ret edges to
+ // get a full fence-based mitigation, inject that fence.
+ if (HasVulnerableLoad && FenceCallAndRet) {
+ // We need to insert an LFENCE at the start of the function to suspend any
+ // incoming misspeculation from the caller. This helps in two ways: the caller
+ // may not have been protected as this code has been, and this code then does
+ // not need to take any specific action to protect across calls.
+ // FIXME: We could skip this for functions which unconditionally return
+ // a constant.
+ BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::LFENCE));
+ ++NumInstsInserted;
+ ++NumLFENCEsInserted;
+ }
+
+ // If we guarded the entry with an LFENCE and have no conditionals to protect
+ // in blocks, then we're done.
+ if (FenceCallAndRet && Infos.empty())
+ // We may have changed the function's code at this point to insert fences.
+ return true;
+
+ // For every basic block in the function which can b
+ if (HardenInterprocedurally && !FenceCallAndRet) {
+ // Set up the predicate state by extracting it from the incoming stack
+ // pointer so we pick up any misspeculation in our caller.
+ PS->InitialReg = extractPredStateFromSP(Entry, EntryInsertPt, Loc);
+ } else {
+ // Otherwise, just build the predicate state itself by zeroing a register
+ // as we don't need any initial state.
+ PS->InitialReg = MRI->createVirtualRegister(PS->RC);
+ unsigned PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass);
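+ // A 32-bit zeroing move implicitly clears the upper 32 bits of the full
+ // 64-bit register, so the SUBREG_TO_REG below yields an all-zero 64-bit
+ // predicate state without a separate 64-bit zeroing idiom.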
+ auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0),
+ PredStateSubReg);
+ ++NumInstsInserted;
+ MachineOperand *ZeroEFLAGSDefOp =
+ ZeroI->findRegisterDefOperand(X86::EFLAGS);
+ assert(ZeroEFLAGSDefOp && ZeroEFLAGSDefOp->isImplicit() &&
+ "Must have an implicit def of EFLAGS!");
+ ZeroEFLAGSDefOp->setIsDead(true);
+ BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::SUBREG_TO_REG),
+ PS->InitialReg)
+ .addImm(0)
+ .addReg(PredStateSubReg)
+ .addImm(X86::sub_32bit);
+ }
+
+ // We're going to need to trace predicate state throughout the function's
+ // CFG. Prepare for this by setting up our initial state of PHIs with unique
+ // predecessor entries and all the initial predicate state.
+ canonicalizePHIOperands(MF);
+
+ // Track the updated values in an SSA updater to rewrite into SSA form at the
+ // end.
+ PS->SSA.Initialize(PS->InitialReg);
+ PS->SSA.AddAvailableValue(&Entry, PS->InitialReg);
+
+ // Trace through the CFG.
+ auto CMovs = tracePredStateThroughCFG(MF, Infos);
+
+ // We may also enter basic blocks in this function via exception handling
+ // control flow. Here, if we are hardening interprocedurally, we need to
+ // re-capture the predicate state from the throwing code. In the Itanium ABI,
+ // the throw will always look like a call to __cxa_throw and will have the
+ // predicate state in the stack pointer, so extract fresh predicate state from
+ // the stack pointer and make it available in SSA.
+ // FIXME: Handle non-itanium ABI EH models.
+ if (HardenInterprocedurally) {
+ for (MachineBasicBlock &MBB : MF) {
+ assert(!MBB.isEHScopeEntry() && "Only Itanium ABI EH supported!");
+ assert(!MBB.isEHFuncletEntry() && "Only Itanium ABI EH supported!");
+ assert(!MBB.isCleanupFuncletEntry() && "Only Itanium ABI EH supported!");
+ if (!MBB.isEHPad())
+ continue;
+ PS->SSA.AddAvailableValue(
+ &MBB,
+ extractPredStateFromSP(MBB, MBB.SkipPHIsAndLabels(MBB.begin()), Loc));
+ }
+ }
+
+ // If we are going to harden calls and jumps we need to unfold their memory
+ // operands.
+ if (HardenIndirectCallsAndJumps)
+ unfoldCallAndJumpLoads(MF);
+
+ // Now that we have the predicate state available at the start of each block
+ // in the CFG, trace it through each block, hardening vulnerable instructions
+ // as we go.
+ tracePredStateThroughBlocksAndHarden(MF);
+
+ // Now rewrite all the uses of the pred state using the SSA updater to insert
+ // PHIs connecting the state between blocks along the CFG edges.
+ for (MachineInstr *CMovI : CMovs)
+ for (MachineOperand &Op : CMovI->operands()) {
+ if (!Op.isReg() || Op.getReg() != PS->InitialReg)
+ continue;
+
+ PS->SSA.RewriteUse(Op);
+ }
+
+ LLVM_DEBUG(dbgs() << "Final speculative load hardened function:\n"; MF.dump();
+ dbgs() << "\n"; MF.verify(this));
+ return true;
+}
+
+/// Implements the naive hardening approach of putting an LFENCE after every
+/// potentially mis-predicted control flow construct.
+///
+/// We include this as an alternative mostly for the purpose of comparison. The
+ /// performance impact of this approach is expected to be extremely severe and
+ /// impractical for any real-world use.
+void X86SpeculativeLoadHardeningPass::hardenEdgesWithLFENCE(
+ MachineFunction &MF) {
+ // First, we scan the function looking for blocks that are reached along edges
+ // that we might want to harden.
+ SmallSetVector<MachineBasicBlock *, 8> Blocks;
+ for (MachineBasicBlock &MBB : MF) {
+ // If there are no or only one successor, nothing to do here.
+ if (MBB.succ_size() <= 1)
+ continue;
+
+ // Skip blocks unless their terminators start with a branch. Other
+ // terminators don't seem interesting for guarding against misspeculation.
+ auto TermIt = MBB.getFirstTerminator();
+ if (TermIt == MBB.end() || !TermIt->isBranch())
+ continue;
+
+ // Add all the non-EH-pad successors to the blocks we want to harden. We
+ // skip EH pads because there isn't really a condition of interest on
+ // entering.
+ for (MachineBasicBlock *SuccMBB : MBB.successors())
+ if (!SuccMBB->isEHPad())
+ Blocks.insert(SuccMBB);
+ }
+
+ for (MachineBasicBlock *MBB : Blocks) {
+ auto InsertPt = MBB->SkipPHIsAndLabels(MBB->begin());
+ BuildMI(*MBB, InsertPt, DebugLoc(), TII->get(X86::LFENCE));
+ ++NumInstsInserted;
+ ++NumLFENCEsInserted;
+ }
+}
+
+SmallVector<X86SpeculativeLoadHardeningPass::BlockCondInfo, 16>
+X86SpeculativeLoadHardeningPass::collectBlockCondInfo(MachineFunction &MF) {
+ SmallVector<BlockCondInfo, 16> Infos;
+
+ // Walk the function and build up a summary for each block's conditions that
+ // we need to trace through.
+ for (MachineBasicBlock &MBB : MF) {
+ // If there are no or only one successor, nothing to do here.
+ if (MBB.succ_size() <= 1)
+ continue;
+
+ // We want to reliably handle any conditional branch terminators in the
+ // MBB, so we manually analyze the branch. We can handle all of the
+ // permutations here, including ones that analyzeBranch cannot.
+ //
+ // The approach is to walk backwards across the terminators, resetting at
+ // any unconditional non-indirect branch, and track all conditional edges
+ // to basic blocks as well as the fallthrough or unconditional successor
+ // edge. For each conditional edge, we track the target and the opposite
+ // condition code in order to inject a "no-op" cmov into that successor
+ // that will harden the predicate. For the fallthrough/unconditional
+ // edge, we inject a separate cmov for each conditional branch with
+ // matching condition codes. This effectively implements an "and" of the
+ // condition flags, even if there isn't a single condition flag that would
+ // directly implement that. We don't bother trying to optimize either of
+ // these cases because if such an optimization is possible, LLVM should
+ // have optimized the conditional *branches* in that way already to reduce
+ // instruction count. This late, we simply assume the minimal number of
+ // branch instructions is being emitted and use that to guide our cmov
+ // insertion.
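+ //
+ // As an illustrative example, a block ending in `jne L1; jmp L2` yields
+ // CondBrs = {jne L1} and UncondBr = `jmp L2`. Later, the edge to L1 gets a
+ // CMOVE (poison the state if ZF is set, meaning the taken edge was
+ // misspeculated) and the edge to L2 gets a CMOVNE (poison if ZF is clear,
+ // meaning the fallthrough was misspeculated).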
+
+ BlockCondInfo Info = {&MBB, {}, nullptr};
+
+ // Now walk backwards through the terminators and build up successors they
+ // reach and the conditions.
+ for (MachineInstr &MI : llvm::reverse(MBB)) {
+ // Once we've handled all the terminators, we're done.
+ if (!MI.isTerminator())
+ break;
+
+ // If we see a non-branch terminator, we can't handle anything so bail.
+ if (!MI.isBranch()) {
+ Info.CondBrs.clear();
+ break;
+ }
+
+ // If we see an unconditional branch, reset our state, clear any
+ // fallthrough, and set this as the "else" successor.
+ if (MI.getOpcode() == X86::JMP_1) {
+ Info.CondBrs.clear();
+ Info.UncondBr = &MI;
+ continue;
+ }
+
+ // If we get an invalid condition, we have an indirect branch or some
+ // other unanalyzable "fallthrough" case. We model this as a nullptr for
+ // the destination so we can still guard any conditional successors.
+ // Consider code sequences like:
+ // ```
+ // jCC L1
+ // jmpq *%rax
+ // ```
+ // We still want to harden the edge to `L1`.
+ if (X86::getCondFromBranchOpc(MI.getOpcode()) == X86::COND_INVALID) {
+ Info.CondBrs.clear();
+ Info.UncondBr = &MI;
+ continue;
+ }
+
+ // We have a vanilla conditional branch, add it to our list.
+ Info.CondBrs.push_back(&MI);
+ }
+ if (Info.CondBrs.empty()) {
+ ++NumBranchesUntraced;
+ LLVM_DEBUG(dbgs() << "WARNING: unable to secure successors of block:\n";
+ MBB.dump());
+ continue;
+ }
+
+ Infos.push_back(Info);
+ }
+
+ return Infos;
+}
+
+/// Trace the predicate state through the CFG, instrumenting each conditional
+/// branch such that misspeculation through an edge will poison the predicate
+/// state.
+///
+/// Returns the list of inserted CMov instructions so that they can have their
+/// uses of the predicate state rewritten into proper SSA form once it is
+/// complete.
+SmallVector<MachineInstr *, 16>
+X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG(
+ MachineFunction &MF, ArrayRef<BlockCondInfo> Infos) {
+ // Collect the inserted cmov instructions so we can rewrite their uses of the
+ // predicate state into SSA form.
+ SmallVector<MachineInstr *, 16> CMovs;
+
+ // Now walk all of the basic blocks looking for ones that end in conditional
+ // jumps where we need to update this register along each edge.
+ for (const BlockCondInfo &Info : Infos) {
+ MachineBasicBlock &MBB = *Info.MBB;
+ const SmallVectorImpl<MachineInstr *> &CondBrs = Info.CondBrs;
+ MachineInstr *UncondBr = Info.UncondBr;
+
+ LLVM_DEBUG(dbgs() << "Tracing predicate through block: " << MBB.getName()
+ << "\n");
+ ++NumCondBranchesTraced;
+
+ // Compute the non-conditional successor as either the target of any
+ // unconditional branch or the layout successor.
+ MachineBasicBlock *UncondSucc =
+ UncondBr ? (UncondBr->getOpcode() == X86::JMP_1
+ ? UncondBr->getOperand(0).getMBB()
+ : nullptr)
+ : &*std::next(MachineFunction::iterator(&MBB));
+
+ // Count how many edges there are to any given successor.
+ SmallDenseMap<MachineBasicBlock *, int> SuccCounts;
+ if (UncondSucc)
+ ++SuccCounts[UncondSucc];
+ for (auto *CondBr : CondBrs)
+ ++SuccCounts[CondBr->getOperand(0).getMBB()];
+
+ // A lambda to insert cmov instructions into a block checking all of the
+ // condition codes in a sequence.
+ auto BuildCheckingBlockForSuccAndConds =
+ [&](MachineBasicBlock &MBB, MachineBasicBlock &Succ, int SuccCount,
+ MachineInstr *Br, MachineInstr *&UncondBr,
+ ArrayRef<X86::CondCode> Conds) {
+ // First, we split the edge to insert the checking block into a safe
+ // location.
+ auto &CheckingMBB =
+ (SuccCount == 1 && Succ.pred_size() == 1)
+ ? Succ
+ : splitEdge(MBB, Succ, SuccCount, Br, UncondBr, *TII);
+
+ bool LiveEFLAGS = Succ.isLiveIn(X86::EFLAGS);
+ if (!LiveEFLAGS)
+ CheckingMBB.addLiveIn(X86::EFLAGS);
+
+ // Now insert the cmovs to implement the checks.
+ auto InsertPt = CheckingMBB.begin();
+ assert((InsertPt == CheckingMBB.end() || !InsertPt->isPHI()) &&
+ "Should never have a PHI in the initial checking block as it "
+ "always has a single predecessor!");
+
+ // We will chain the cmovs together, but need to start with the
+ // incoming pred state.
+ unsigned CurStateReg = PS->InitialReg;
+
+ for (X86::CondCode Cond : Conds) {
+ int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
+ auto CMovOp = X86::getCMovFromCond(Cond, PredStateSizeInBytes);
+
+ unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
+ // Note that we intentionally use an empty debug location so that
+ // this picks up the preceding location.
+ auto CMovI = BuildMI(CheckingMBB, InsertPt, DebugLoc(),
+ TII->get(CMovOp), UpdatedStateReg)
+ .addReg(CurStateReg)
+ .addReg(PS->PoisonReg);
+ // If this is the last cmov and the EFLAGS weren't originally
+ // live-in, mark them as killed.
+ if (!LiveEFLAGS && Cond == Conds.back())
+ CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
+
+ ++NumInstsInserted;
+ LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump();
+ dbgs() << "\n");
+
+ // The first one of the cmovs will be using the top level
+ // `PredStateReg` and need to get rewritten into SSA form.
+ if (CurStateReg == PS->InitialReg)
+ CMovs.push_back(&*CMovI);
+
+ // The next cmov should start from this one's def.
+ CurStateReg = UpdatedStateReg;
+ }
+
+ // And put the last one into the available values for SSA form of our
+ // predicate state.
+ PS->SSA.AddAvailableValue(&CheckingMBB, CurStateReg);
+ };
+
+ std::vector<X86::CondCode> UncondCodeSeq;
+ for (auto *CondBr : CondBrs) {
+ MachineBasicBlock &Succ = *CondBr->getOperand(0).getMBB();
+ int &SuccCount = SuccCounts[&Succ];
+
+ X86::CondCode Cond = X86::getCondFromBranchOpc(CondBr->getOpcode());
+ X86::CondCode InvCond = X86::GetOppositeBranchCondition(Cond);
+ UncondCodeSeq.push_back(Cond);
+
+ BuildCheckingBlockForSuccAndConds(MBB, Succ, SuccCount, CondBr, UncondBr,
+ {InvCond});
+
+ // Decrement the successor count now that we've split one of the edges.
+ // We need to keep the count of edges to the successor accurate in order
+ // to know above when to *replace* the successor in the CFG vs. just
+ // adding the new successor.
+ --SuccCount;
+ }
+
+ // Since we may have split edges and changed the number of successors,
+ // normalize the probabilities. This avoids doing it each time we split an
+ // edge.
+ MBB.normalizeSuccProbs();
+
+ // Finally, we need to insert cmovs into the "fallthrough" edge. Here, we
+ // need to intersect the other condition codes. We can do this by just
+ // doing a cmov for each one.
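+ // For example (illustrative), if the block ends in `jl L1; jg L2`, the
+ // fallthrough is only correctly reached when neither L nor G holds, so a
+ // CMOVL followed by a CMOVG on that edge leaves the state unpoisoned only
+ // in exactly that case.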
+ if (!UncondSucc)
+ // If we have no fallthrough to protect (perhaps it is an indirect jump?)
+ // just skip this and continue.
+ continue;
+
+ assert(SuccCounts[UncondSucc] == 1 &&
+ "We should never have more than one edge to the unconditional "
+ "successor at this point because every other edge must have been "
+ "split above!");
+
+ // Sort and unique the codes to minimize them.
+ llvm::sort(UncondCodeSeq.begin(), UncondCodeSeq.end());
+ UncondCodeSeq.erase(std::unique(UncondCodeSeq.begin(), UncondCodeSeq.end()),
+ UncondCodeSeq.end());
+
+ // Build a checking version of the successor.
+ BuildCheckingBlockForSuccAndConds(MBB, *UncondSucc, /*SuccCount*/ 1,
+ UncondBr, UncondBr, UncondCodeSeq);
+ }
+
+ return CMovs;
+}
+
+/// Compute the register class for the unfolded load.
+///
+/// FIXME: This should probably live in X86InstrInfo, potentially by adding
+/// a way to unfold into a newly created vreg rather than requiring a register
+/// input.
+static const TargetRegisterClass *
+getRegClassForUnfoldedLoad(MachineFunction &MF, const X86InstrInfo &TII,
+ unsigned Opcode) {
+ unsigned Index;
+ unsigned UnfoldedOpc = TII.getOpcodeAfterMemoryUnfold(
+ Opcode, /*UnfoldLoad*/ true, /*UnfoldStore*/ false, &Index);
+ const MCInstrDesc &MCID = TII.get(UnfoldedOpc);
+ return TII.getRegClass(MCID, Index, &TII.getRegisterInfo(), MF);
+}
+
+void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads(
+ MachineFunction &MF) {
+ for (MachineBasicBlock &MBB : MF)
+ for (auto MII = MBB.instr_begin(), MIE = MBB.instr_end(); MII != MIE;) {
+ // Grab a reference and increment the iterator so we can remove this
+ // instruction if needed without disturbing the iteration.
+ MachineInstr &MI = *MII++;
+
+ // Must either be a call or a branch.
+ if (!MI.isCall() && !MI.isBranch())
+ continue;
+ // We only care about loading variants of these instructions.
+ if (!MI.mayLoad())
+ continue;
+
+ switch (MI.getOpcode()) {
+ default: {
+ LLVM_DEBUG(
+ dbgs() << "ERROR: Found an unexpected loading branch or call "
+ "instruction:\n";
+ MI.dump(); dbgs() << "\n");
+ report_fatal_error("Unexpected loading branch or call!");
+ }
+
+ case X86::FARCALL16m:
+ case X86::FARCALL32m:
+ case X86::FARCALL64:
+ case X86::FARJMP16m:
+ case X86::FARJMP32m:
+ case X86::FARJMP64:
+ // We cannot mitigate far jumps or calls, but we also don't expect them
+ // to be vulnerable to Spectre v1.2 style attacks.
+ continue;
+
+ case X86::CALL16m:
+ case X86::CALL16m_NT:
+ case X86::CALL32m:
+ case X86::CALL32m_NT:
+ case X86::CALL64m:
+ case X86::CALL64m_NT:
+ case X86::JMP16m:
+ case X86::JMP16m_NT:
+ case X86::JMP32m:
+ case X86::JMP32m_NT:
+ case X86::JMP64m:
+ case X86::JMP64m_NT:
+ case X86::TAILJMPm64:
+ case X86::TAILJMPm64_REX:
+ case X86::TAILJMPm:
+ case X86::TCRETURNmi64:
+ case X86::TCRETURNmi: {
+ // Use the generic unfold logic now that we know we're dealing with
+ // expected instructions.
+ // FIXME: We don't have test coverage for all of these!
+ auto *UnfoldedRC = getRegClassForUnfoldedLoad(MF, *TII, MI.getOpcode());
+ if (!UnfoldedRC) {
+ LLVM_DEBUG(dbgs()
+ << "ERROR: Unable to unfold load from instruction:\n";
+ MI.dump(); dbgs() << "\n");
+ report_fatal_error("Unable to unfold load!");
+ }
+ unsigned Reg = MRI->createVirtualRegister(UnfoldedRC);
+ SmallVector<MachineInstr *, 2> NewMIs;
+ // If we were able to compute an unfolded reg class, any failure here
+ // is just a programming error so just assert.
+ bool Unfolded =
+ TII->unfoldMemoryOperand(MF, MI, Reg, /*UnfoldLoad*/ true,
+ /*UnfoldStore*/ false, NewMIs);
+ (void)Unfolded;
+ assert(Unfolded &&
+ "Computed unfolded register class but failed to unfold");
+ // Now stitch the new instructions into place and erase the old one.
+ for (auto *NewMI : NewMIs)
+ MBB.insert(MI.getIterator(), NewMI);
+ MI.eraseFromParent();
+ LLVM_DEBUG({
+ dbgs() << "Unfolded load successfully into:\n";
+ for (auto *NewMI : NewMIs) {
+ NewMI->dump();
+ dbgs() << "\n";
+ }
+ });
+ continue;
+ }
+ }
+ llvm_unreachable("Escaped switch with default!");
+ }
+}
+
+/// Returns true if the instruction has no behavior (specified or otherwise)
+ /// that is based on the value of any of its register operands.
+///
+/// A classical example of something that is inherently not data invariant is an
+/// indirect jump -- the destination is loaded into icache based on the bits set
+/// in the jump destination register.
+///
+/// FIXME: This should become part of our instruction tables.
+static bool isDataInvariant(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default:
+ // By default, assume that the instruction is not data invariant.
+ return false;
+
+ // Some target-independent operations that trivially lower to data-invariant
+ // instructions.
+ case TargetOpcode::COPY:
+ case TargetOpcode::INSERT_SUBREG:
+ case TargetOpcode::SUBREG_TO_REG:
+ return true;
+
+ // On x86 it is believed that imul is constant time w.r.t. its register inputs.
+ // However, these set flags and are perhaps the most surprisingly constant
+ // time operations so we call them out here separately.
+ case X86::IMUL16rr:
+ case X86::IMUL16rri8:
+ case X86::IMUL16rri:
+ case X86::IMUL32rr:
+ case X86::IMUL32rri8:
+ case X86::IMUL32rri:
+ case X86::IMUL64rr:
+ case X86::IMUL64rri32:
+ case X86::IMUL64rri8:
+
+ // Bit scanning and counting instructions that are somewhat surprisingly
+ // constant time as they scan across bits and do other fairly complex
+ // operations like popcnt, but are believed to be constant time on x86.
+ // However, these set flags.
+ case X86::BSF16rr:
+ case X86::BSF32rr:
+ case X86::BSF64rr:
+ case X86::BSR16rr:
+ case X86::BSR32rr:
+ case X86::BSR64rr:
+ case X86::LZCNT16rr:
+ case X86::LZCNT32rr:
+ case X86::LZCNT64rr:
+ case X86::POPCNT16rr:
+ case X86::POPCNT32rr:
+ case X86::POPCNT64rr:
+ case X86::TZCNT16rr:
+ case X86::TZCNT32rr:
+ case X86::TZCNT64rr:
+
+ // Bit manipulation instructions are effectively combinations of basic
+ // arithmetic ops, and should still execute in constant time. These also
+ // set flags.
+ case X86::BLCFILL32rr:
+ case X86::BLCFILL64rr:
+ case X86::BLCI32rr:
+ case X86::BLCI64rr:
+ case X86::BLCIC32rr:
+ case X86::BLCIC64rr:
+ case X86::BLCMSK32rr:
+ case X86::BLCMSK64rr:
+ case X86::BLCS32rr:
+ case X86::BLCS64rr:
+ case X86::BLSFILL32rr:
+ case X86::BLSFILL64rr:
+ case X86::BLSI32rr:
+ case X86::BLSI64rr:
+ case X86::BLSIC32rr:
+ case X86::BLSIC64rr:
+ case X86::BLSMSK32rr:
+ case X86::BLSMSK64rr:
+ case X86::BLSR32rr:
+ case X86::BLSR64rr:
+ case X86::TZMSK32rr:
+ case X86::TZMSK64rr:
+
+ // Bit extracting and clearing instructions should execute in constant time,
+ // and set flags.
+ case X86::BEXTR32rr:
+ case X86::BEXTR64rr:
+ case X86::BEXTRI32ri:
+ case X86::BEXTRI64ri:
+ case X86::BZHI32rr:
+ case X86::BZHI64rr:
+
+ // Shift and rotate.
+ case X86::ROL8r1: case X86::ROL16r1: case X86::ROL32r1: case X86::ROL64r1:
+ case X86::ROL8rCL: case X86::ROL16rCL: case X86::ROL32rCL: case X86::ROL64rCL:
+ case X86::ROL8ri: case X86::ROL16ri: case X86::ROL32ri: case X86::ROL64ri:
+ case X86::ROR8r1: case X86::ROR16r1: case X86::ROR32r1: case X86::ROR64r1:
+ case X86::ROR8rCL: case X86::ROR16rCL: case X86::ROR32rCL: case X86::ROR64rCL:
+ case X86::ROR8ri: case X86::ROR16ri: case X86::ROR32ri: case X86::ROR64ri:
+ case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1: case X86::SAR64r1:
+ case X86::SAR8rCL: case X86::SAR16rCL: case X86::SAR32rCL: case X86::SAR64rCL:
+ case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri: case X86::SAR64ri:
+ case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1: case X86::SHL64r1:
+ case X86::SHL8rCL: case X86::SHL16rCL: case X86::SHL32rCL: case X86::SHL64rCL:
+ case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri: case X86::SHL64ri:
+ case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1: case X86::SHR64r1:
+ case X86::SHR8rCL: case X86::SHR16rCL: case X86::SHR32rCL: case X86::SHR64rCL:
+ case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri: case X86::SHR64ri:
+ case X86::SHLD16rrCL: case X86::SHLD32rrCL: case X86::SHLD64rrCL:
+ case X86::SHLD16rri8: case X86::SHLD32rri8: case X86::SHLD64rri8:
+ case X86::SHRD16rrCL: case X86::SHRD32rrCL: case X86::SHRD64rrCL:
+ case X86::SHRD16rri8: case X86::SHRD32rri8: case X86::SHRD64rri8:
+
+ // Basic arithmetic is constant time on the input but does set flags.
+ case X86::ADC8rr: case X86::ADC8ri:
+ case X86::ADC16rr: case X86::ADC16ri: case X86::ADC16ri8:
+ case X86::ADC32rr: case X86::ADC32ri: case X86::ADC32ri8:
+ case X86::ADC64rr: case X86::ADC64ri8: case X86::ADC64ri32:
+ case X86::ADD8rr: case X86::ADD8ri:
+ case X86::ADD16rr: case X86::ADD16ri: case X86::ADD16ri8:
+ case X86::ADD32rr: case X86::ADD32ri: case X86::ADD32ri8:
+ case X86::ADD64rr: case X86::ADD64ri8: case X86::ADD64ri32:
+ case X86::AND8rr: case X86::AND8ri:
+ case X86::AND16rr: case X86::AND16ri: case X86::AND16ri8:
+ case X86::AND32rr: case X86::AND32ri: case X86::AND32ri8:
+ case X86::AND64rr: case X86::AND64ri8: case X86::AND64ri32:
+ case X86::OR8rr: case X86::OR8ri:
+ case X86::OR16rr: case X86::OR16ri: case X86::OR16ri8:
+ case X86::OR32rr: case X86::OR32ri: case X86::OR32ri8:
+ case X86::OR64rr: case X86::OR64ri8: case X86::OR64ri32:
+ case X86::SBB8rr: case X86::SBB8ri:
+ case X86::SBB16rr: case X86::SBB16ri: case X86::SBB16ri8:
+ case X86::SBB32rr: case X86::SBB32ri: case X86::SBB32ri8:
+ case X86::SBB64rr: case X86::SBB64ri8: case X86::SBB64ri32:
+ case X86::SUB8rr: case X86::SUB8ri:
+ case X86::SUB16rr: case X86::SUB16ri: case X86::SUB16ri8:
+ case X86::SUB32rr: case X86::SUB32ri: case X86::SUB32ri8:
+ case X86::SUB64rr: case X86::SUB64ri8: case X86::SUB64ri32:
+ case X86::XOR8rr: case X86::XOR8ri:
+ case X86::XOR16rr: case X86::XOR16ri: case X86::XOR16ri8:
+ case X86::XOR32rr: case X86::XOR32ri: case X86::XOR32ri8:
+ case X86::XOR64rr: case X86::XOR64ri8: case X86::XOR64ri32:
+ // Arithmetic with just 32-bit and 64-bit variants and no immediates.
+ case X86::ADCX32rr: case X86::ADCX64rr:
+ case X86::ADOX32rr: case X86::ADOX64rr:
+ case X86::ANDN32rr: case X86::ANDN64rr:
+ // Unary arithmetic operations.
+ case X86::DEC8r: case X86::DEC16r: case X86::DEC32r: case X86::DEC64r:
+ case X86::INC8r: case X86::INC16r: case X86::INC32r: case X86::INC64r:
+ case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r:
+ // Check whether the EFLAGS implicit-def is dead. We assume that this will
+ // always find the implicit-def because this code should only be reached
+ // for instructions that do in fact implicitly def this.
+ if (!MI.findRegisterDefOperand(X86::EFLAGS)->isDead()) {
+ // If we would clobber EFLAGS that are used, just bail for now.
+ LLVM_DEBUG(dbgs() << " Unable to harden post-load due to EFLAGS: ";
+ MI.dump(); dbgs() << "\n");
+ return false;
+ }
+
+ // Otherwise, fallthrough to handle these the same as instructions that
+ // don't set EFLAGS.
+ LLVM_FALLTHROUGH;
+
+ // Unlike other arithmetic, NOT doesn't set EFLAGS.
+ case X86::NOT8r: case X86::NOT16r: case X86::NOT32r: case X86::NOT64r:
+
+ // Various move instructions used to zero or sign extend things. Note that we
+ // intentionally don't support the _NOREX variants as we can't handle that
+ // register constraint anyways.
+ case X86::MOVSX16rr8:
+ case X86::MOVSX32rr8: case X86::MOVSX32rr16:
+ case X86::MOVSX64rr8: case X86::MOVSX64rr16: case X86::MOVSX64rr32:
+ case X86::MOVZX16rr8:
+ case X86::MOVZX32rr8: case X86::MOVZX32rr16:
+ case X86::MOVZX64rr8: case X86::MOVZX64rr16:
+ case X86::MOV32rr:
+
+ // Arithmetic instructions that are both constant time and don't set flags.
+ case X86::RORX32ri:
+ case X86::RORX64ri:
+ case X86::SARX32rr:
+ case X86::SARX64rr:
+ case X86::SHLX32rr:
+ case X86::SHLX64rr:
+ case X86::SHRX32rr:
+ case X86::SHRX64rr:
+
+ // LEA doesn't actually access memory, and its arithmetic is constant time.
+ case X86::LEA16r:
+ case X86::LEA32r:
+ case X86::LEA64_32r:
+ case X86::LEA64r:
+ return true;
+ }
+}
+
+/// Returns true if the instruction has no behavior (specified or otherwise)
+/// that is based on the value loaded from memory or the value of any
+/// non-address register operands.
+///
+ /// For example, an instruction is not data invariant if its latency depends on
+ /// the particular bits set in any of the registers *or* any of the bits loaded
+ /// from memory.
+///
+/// A classical example of something that is inherently not data invariant is an
+/// indirect jump -- the destination is loaded into icache based on the bits set
+/// in the jump destination register.
+///
+/// FIXME: This should become part of our instruction tables.
+static bool isDataInvariantLoad(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default:
+ // By default, assume that the load will immediately leak.
+ return false;
+
+ // On x86 it is believed that imul is constant time w.r.t. the loaded data.
+ // However, they set flags and are perhaps the most surprisingly constant
+ // time operations so we call them out here separately.
+ case X86::IMUL16rm:
+ case X86::IMUL16rmi8:
+ case X86::IMUL16rmi:
+ case X86::IMUL32rm:
+ case X86::IMUL32rmi8:
+ case X86::IMUL32rmi:
+ case X86::IMUL64rm:
+ case X86::IMUL64rmi32:
+ case X86::IMUL64rmi8:
+
+ // Bit scanning and counting instructions that are somewhat surprisingly
+ // constant time as they scan across bits and do other fairly complex
+ // operations like popcnt, but are believed to be constant time on x86.
+ // However, these set flags.
+ case X86::BSF16rm:
+ case X86::BSF32rm:
+ case X86::BSF64rm:
+ case X86::BSR16rm:
+ case X86::BSR32rm:
+ case X86::BSR64rm:
+ case X86::LZCNT16rm:
+ case X86::LZCNT32rm:
+ case X86::LZCNT64rm:
+ case X86::POPCNT16rm:
+ case X86::POPCNT32rm:
+ case X86::POPCNT64rm:
+ case X86::TZCNT16rm:
+ case X86::TZCNT32rm:
+ case X86::TZCNT64rm:
+
+ // Bit manipulation instructions are effectively combinations of basic
+ // arithmetic ops, and should still execute in constant time. These also
+ // set flags.
+ case X86::BLCFILL32rm:
+ case X86::BLCFILL64rm:
+ case X86::BLCI32rm:
+ case X86::BLCI64rm:
+ case X86::BLCIC32rm:
+ case X86::BLCIC64rm:
+ case X86::BLCMSK32rm:
+ case X86::BLCMSK64rm:
+ case X86::BLCS32rm:
+ case X86::BLCS64rm:
+ case X86::BLSFILL32rm:
+ case X86::BLSFILL64rm:
+ case X86::BLSI32rm:
+ case X86::BLSI64rm:
+ case X86::BLSIC32rm:
+ case X86::BLSIC64rm:
+ case X86::BLSMSK32rm:
+ case X86::BLSMSK64rm:
+ case X86::BLSR32rm:
+ case X86::BLSR64rm:
+ case X86::TZMSK32rm:
+ case X86::TZMSK64rm:
+
+ // Bit extracting and clearing instructions should execute in constant time,
+ // and set flags.
+ case X86::BEXTR32rm:
+ case X86::BEXTR64rm:
+ case X86::BEXTRI32mi:
+ case X86::BEXTRI64mi:
+ case X86::BZHI32rm:
+ case X86::BZHI64rm:
+
+ // Basic arithmetic is constant time on the input but does set flags.
+ case X86::ADC8rm:
+ case X86::ADC16rm:
+ case X86::ADC32rm:
+ case X86::ADC64rm:
+ case X86::ADCX32rm:
+ case X86::ADCX64rm:
+ case X86::ADD8rm:
+ case X86::ADD16rm:
+ case X86::ADD32rm:
+ case X86::ADD64rm:
+ case X86::ADOX32rm:
+ case X86::ADOX64rm:
+ case X86::AND8rm:
+ case X86::AND16rm:
+ case X86::AND32rm:
+ case X86::AND64rm:
+ case X86::ANDN32rm:
+ case X86::ANDN64rm:
+ case X86::OR8rm:
+ case X86::OR16rm:
+ case X86::OR32rm:
+ case X86::OR64rm:
+ case X86::SBB8rm:
+ case X86::SBB16rm:
+ case X86::SBB32rm:
+ case X86::SBB64rm:
+ case X86::SUB8rm:
+ case X86::SUB16rm:
+ case X86::SUB32rm:
+ case X86::SUB64rm:
+ case X86::XOR8rm:
+ case X86::XOR16rm:
+ case X86::XOR32rm:
+ case X86::XOR64rm:
+ // Check whether the EFLAGS implicit-def is dead. We assume that this will
+ // always find the implicit-def because this code should only be reached
+ // for instructions that do in fact implicitly def this.
+ if (!MI.findRegisterDefOperand(X86::EFLAGS)->isDead()) {
+ // If we would clobber EFLAGS that are used, just bail for now.
+ LLVM_DEBUG(dbgs() << " Unable to harden post-load due to EFLAGS: ";
+ MI.dump(); dbgs() << "\n");
+ return false;
+ }
+
+ // Otherwise, fallthrough to handle these the same as instructions that
+ // don't set EFLAGS.
+ LLVM_FALLTHROUGH;
+
+ // Integer multiply w/o affecting flags is still believed to be constant
+ // time on x86. Called out separately as this is among the most surprising
+ // instructions to exhibit that behavior.
+ case X86::MULX32rm:
+ case X86::MULX64rm:
+
+ // Arithmetic instructions that are both constant time and don't set flags.
+ case X86::RORX32mi:
+ case X86::RORX64mi:
+ case X86::SARX32rm:
+ case X86::SARX64rm:
+ case X86::SHLX32rm:
+ case X86::SHLX64rm:
+ case X86::SHRX32rm:
+ case X86::SHRX64rm:
+
+ // Conversions are believed to be constant time and don't set flags.
+ case X86::CVTTSD2SI64rm: case X86::VCVTTSD2SI64rm: case X86::VCVTTSD2SI64Zrm:
+ case X86::CVTTSD2SIrm: case X86::VCVTTSD2SIrm: case X86::VCVTTSD2SIZrm:
+ case X86::CVTTSS2SI64rm: case X86::VCVTTSS2SI64rm: case X86::VCVTTSS2SI64Zrm:
+ case X86::CVTTSS2SIrm: case X86::VCVTTSS2SIrm: case X86::VCVTTSS2SIZrm:
+ case X86::CVTSI2SDrm: case X86::VCVTSI2SDrm: case X86::VCVTSI2SDZrm:
+ case X86::CVTSI2SSrm: case X86::VCVTSI2SSrm: case X86::VCVTSI2SSZrm:
+ case X86::CVTSI642SDrm: case X86::VCVTSI642SDrm: case X86::VCVTSI642SDZrm:
+ case X86::CVTSI642SSrm: case X86::VCVTSI642SSrm: case X86::VCVTSI642SSZrm:
+ case X86::CVTSS2SDrm: case X86::VCVTSS2SDrm: case X86::VCVTSS2SDZrm:
+ case X86::CVTSD2SSrm: case X86::VCVTSD2SSrm: case X86::VCVTSD2SSZrm:
+ // AVX512 added unsigned integer conversions.
+ case X86::VCVTTSD2USI64Zrm:
+ case X86::VCVTTSD2USIZrm:
+ case X86::VCVTTSS2USI64Zrm:
+ case X86::VCVTTSS2USIZrm:
+ case X86::VCVTUSI2SDZrm:
+ case X86::VCVTUSI642SDZrm:
+ case X86::VCVTUSI2SSZrm:
+ case X86::VCVTUSI642SSZrm:
+
+ // Loads to register don't set flags.
+ case X86::MOV8rm:
+ case X86::MOV8rm_NOREX:
+ case X86::MOV16rm:
+ case X86::MOV32rm:
+ case X86::MOV64rm:
+ case X86::MOVSX16rm8:
+ case X86::MOVSX32rm16:
+ case X86::MOVSX32rm8:
+ case X86::MOVSX32rm8_NOREX:
+ case X86::MOVSX64rm16:
+ case X86::MOVSX64rm32:
+ case X86::MOVSX64rm8:
+ case X86::MOVZX16rm8:
+ case X86::MOVZX32rm16:
+ case X86::MOVZX32rm8:
+ case X86::MOVZX32rm8_NOREX:
+ case X86::MOVZX64rm16:
+ case X86::MOVZX64rm8:
+ return true;
+ }
+}
+
+static bool isEFLAGSLive(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const TargetRegisterInfo &TRI) {
+ // Check if EFLAGS are alive by seeing if there is a def of them or they
+ // live-in, and then seeing if that def is in turn used.
+ for (MachineInstr &MI : llvm::reverse(llvm::make_range(MBB.begin(), I))) {
+ if (MachineOperand *DefOp = MI.findRegisterDefOperand(X86::EFLAGS)) {
+ // If the def is dead, then EFLAGS is not live.
+ if (DefOp->isDead())
+ return false;
+
+ // Otherwise we've def'ed it, and it is live.
+ return true;
+ }
+ // While at this instruction, also check if we use and kill EFLAGS
+ // which means it isn't live.
+ if (MI.killsRegister(X86::EFLAGS, &TRI))
+ return false;
+ }
+
+ // If we didn't find anything conclusive (neither definitely alive nor
+ // definitely dead), return whether it lives into the block.
+ return MBB.isLiveIn(X86::EFLAGS);
+}
+
+/// Trace the predicate state through each of the blocks in the function,
+/// hardening everything necessary along the way.
+///
+/// We call this routine once the initial predicate state has been established
+/// for each basic block in the function in the SSA updater. This routine traces
+/// it through the instructions within each basic block, and for non-returning
+/// blocks informs the SSA updater about the final state that lives out of the
+/// block. Along the way, it hardens any vulnerable instruction using the
+/// currently valid predicate state. We have to do these two things together
+/// because the SSA updater only works across blocks. Within a block, we track
+/// the current predicate state directly and update it as it changes.
+///
+/// This operates in two passes over each block. First, we analyze the loads in
+/// the block to determine which strategy will be used to harden them: hardening
+/// the address or hardening the loaded value when loaded into a register
+/// amenable to hardening. We have to process these first because the two
+/// strategies may interact -- later hardening may change what strategy we wish
+/// to use. We also will analyze data dependencies between loads and avoid
+/// hardening those loads that are data dependent on a load with a hardened
+/// address. We also skip hardening loads already behind an LFENCE as that is
+/// sufficient to harden them against misspeculation.
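+ /// Illustratively, a plain 64-bit `mov (%reg), %rax` is a data-invariant load
+ /// into a hardenable GPR and so can have the predicate state OR-ed into %rax
+ /// after the load (value hardening), whereas a load whose result cannot be
+ /// safely masked instead has the state OR-ed into its address register before
+ /// the load executes (address hardening).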
+///
+/// Second, we actively trace the predicate state through the block, applying
+/// the hardening steps we determined necessary in the first pass as we go.
+///
+/// These two passes are applied to each basic block. We operate one block at a
+/// time to simplify reasoning about reachability and sequencing.
+void X86SpeculativeLoadHardeningPass::tracePredStateThroughBlocksAndHarden(
+ MachineFunction &MF) {
+ SmallPtrSet<MachineInstr *, 16> HardenPostLoad;
+ SmallPtrSet<MachineInstr *, 16> HardenLoadAddr;
+
+ SmallSet<unsigned, 16> HardenedAddrRegs;
+
+ SmallDenseMap<unsigned, unsigned, 32> AddrRegToHardenedReg;
+
+ // Track the set of load-dependent registers through the basic block. Because
+ // the values of these registers have an existing data dependency on a loaded
+ // value which we would have checked, we can omit any checks on them.
+ SparseBitVector<> LoadDepRegs;
+
+ for (MachineBasicBlock &MBB : MF) {
+ // The first pass over the block: collect all the loads which can have their
+ // loaded value hardened and all the loads that instead need their address
+ // hardened. During this walk we propagate load dependence for address
+ // hardened loads and also look for LFENCE to stop hardening wherever
+ // possible. When deciding whether to harden the loaded value or not,
+ // we check to see if any registers used in the address will have been
+ // hardened at this point and if so, harden any remaining address registers
+ // as that often successfully re-uses hardened addresses and minimizes
+ // instructions.
+ //
+ // FIXME: We should consider an aggressive mode where we keep as many loads
+ // value-hardened as possible even when some address register hardening would
+ // be free (due to reuse).
+ //
+ // Note that we only need this pass if we are actually hardening loads.
+ if (HardenLoads)
+ for (MachineInstr &MI : MBB) {
+ // We naively assume that all def'ed registers of an instruction have
+ // a data dependency on all of their operands.
+ // FIXME: Do a more careful analysis of x86 to build a conservative
+ // model here.
+ if (llvm::any_of(MI.uses(), [&](MachineOperand &Op) {
+ return Op.isReg() && LoadDepRegs.test(Op.getReg());
+ }))
+ for (MachineOperand &Def : MI.defs())
+ if (Def.isReg())
+ LoadDepRegs.set(Def.getReg());
+
+ // Both Intel and AMD are guiding that they will change the semantics of
+ // LFENCE to be a speculation barrier, so if we see an LFENCE, there is
+ // no more need to guard things in this block.
+ if (MI.getOpcode() == X86::LFENCE)
+ break;
+
+ // If this instruction cannot load, nothing to do.
+ if (!MI.mayLoad())
+ continue;
+
+ // Some instructions which "load" are trivially safe or unimportant.
+ if (MI.getOpcode() == X86::MFENCE)
+ continue;
+
+ // Extract the memory operand information about this instruction.
+ // FIXME: This doesn't handle loading pseudo instructions which we often
+ // could handle with similarly generic logic. We probably need to add an
+ // MI-layer routine similar to the MC-layer one we use here which maps
+ // pseudos much like this maps real instructions.
+ const MCInstrDesc &Desc = MI.getDesc();
+ int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
+ if (MemRefBeginIdx < 0) {
+ LLVM_DEBUG(dbgs()
+ << "WARNING: unable to harden loading instruction: ";
+ MI.dump());
+ continue;
+ }
+
+ MemRefBeginIdx += X86II::getOperandBias(Desc);
+
+ MachineOperand &BaseMO =
+ MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
+ MachineOperand &IndexMO =
+ MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
+
+ // If we have at least one (non-frame-index, non-RIP) register operand,
+ // and neither operand is load-dependent, we need to check the load.
+ unsigned BaseReg = 0, IndexReg = 0;
+ if (!BaseMO.isFI() && BaseMO.getReg() != X86::RIP &&
+ BaseMO.getReg() != X86::NoRegister)
+ BaseReg = BaseMO.getReg();
+ if (IndexMO.getReg() != X86::NoRegister)
+ IndexReg = IndexMO.getReg();
+
+ if (!BaseReg && !IndexReg)
+ // No register operands!
+ continue;
+
+ // If any register operand is dependent, this load is dependent and we
+ // needn't check it.
+ // FIXME: Is this true in the case where we are hardening loads after
+ // they complete? Unclear, need to investigate.
+ if ((BaseReg && LoadDepRegs.test(BaseReg)) ||
+ (IndexReg && LoadDepRegs.test(IndexReg)))
+ continue;
+
+ // If post-load hardening is enabled, this load is compatible with
+ // post-load hardening, and we aren't already going to harden one of the
+ // address registers, queue it up to be hardened post-load. Notably,
+ // even once hardened this won't introduce a useful dependency that
+ // could prune out subsequent loads.
+ if (EnablePostLoadHardening && isDataInvariantLoad(MI) &&
+ MI.getDesc().getNumDefs() == 1 && MI.getOperand(0).isReg() &&
+ canHardenRegister(MI.getOperand(0).getReg()) &&
+ !HardenedAddrRegs.count(BaseReg) &&
+ !HardenedAddrRegs.count(IndexReg)) {
+ HardenPostLoad.insert(&MI);
+ HardenedAddrRegs.insert(MI.getOperand(0).getReg());
+ continue;
+ }
+
+ // Record this instruction for address hardening and record its register
+ // operands as being address-hardened.
+ HardenLoadAddr.insert(&MI);
+ if (BaseReg)
+ HardenedAddrRegs.insert(BaseReg);
+ if (IndexReg)
+ HardenedAddrRegs.insert(IndexReg);
+
+ for (MachineOperand &Def : MI.defs())
+ if (Def.isReg())
+ LoadDepRegs.set(Def.getReg());
+ }
+
+ // Now re-walk the instructions in the basic block, and apply whichever
+ // hardening strategy we have elected. Note that we do this in a second
+ // pass specifically so that we have the complete set of instructions for
+ // which we will do post-load hardening and can defer it in certain
+ // circumstances.
+ //
+ // FIXME: This could probably be made even more effective by doing it
+ // across the entire function. Rather than just walking the flat list
+ // backwards here, we could walk the function in PO and each block bottom
+ // up, allowing us in some cases to sink hardening across basic blocks. As
+ // long as the in-block predicate state is used at the eventual hardening
+ // site, this remains safe.
+ for (MachineInstr &MI : MBB) {
+ if (HardenLoads) {
+ // We cannot both require hardening the def of a load and its address.
+ assert(!(HardenLoadAddr.count(&MI) && HardenPostLoad.count(&MI)) &&
+ "Requested to harden both the address and def of a load!");
+
+ // Check if this is a load whose address needs to be hardened.
+ if (HardenLoadAddr.erase(&MI)) {
+ const MCInstrDesc &Desc = MI.getDesc();
+ int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
+ assert(MemRefBeginIdx >= 0 && "Cannot have an invalid index here!");
+
+ MemRefBeginIdx += X86II::getOperandBias(Desc);
+
+ MachineOperand &BaseMO =
+ MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
+ MachineOperand &IndexMO =
+ MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
+ hardenLoadAddr(MI, BaseMO, IndexMO, AddrRegToHardenedReg);
+ continue;
+ }
+
+ // Test if this instruction is one of our post load instructions (and
+ // remove it from the set if so).
+ if (HardenPostLoad.erase(&MI)) {
+ assert(!MI.isCall() && "Must not try to post-load harden a call!");
+
+ // If this is a data-invariant load, we want to try and sink any
+ // hardening as far as possible.
+ if (isDataInvariantLoad(MI)) {
+ // Sink the instruction we'll need to harden as far as we can down
+ // the graph.
+ MachineInstr *SunkMI = sinkPostLoadHardenedInst(MI, HardenPostLoad);
+
+ // If we managed to sink this instruction, update everything so we
+ // harden that instruction when we reach it in the instruction
+ // sequence.
+ if (SunkMI != &MI) {
+ // If in sinking there was no instruction needing to be hardened,
+ // we're done.
+ if (!SunkMI)
+ continue;
+
+ // Otherwise, add this to the set of defs we harden.
+ HardenPostLoad.insert(SunkMI);
+ continue;
+ }
+ }
+
+ unsigned HardenedReg = hardenPostLoad(MI);
+
+ // Mark the resulting hardened register as such so we don't re-harden.
+ AddrRegToHardenedReg[HardenedReg] = HardenedReg;
+
+ continue;
+ }
+
+ // Check for an indirect call or branch that may need its input hardened
+ // even if we couldn't find the specific load used, or were able to
+ // avoid hardening it for some reason. Note that here we cannot break
+ // out afterward as we may still need to handle any call aspect of this
+ // instruction.
+ if ((MI.isCall() || MI.isBranch()) && HardenIndirectCallsAndJumps)
+ hardenIndirectCallOrJumpInstr(MI, AddrRegToHardenedReg);
+ }
+
+ // After we finish hardening loads we handle interprocedural hardening if
+ // enabled and relevant for this instruction.
+ if (!HardenInterprocedurally)
+ continue;
+ if (!MI.isCall() && !MI.isReturn())
+ continue;
+
+ // If this is a direct return (IE, not a tail call) just directly harden
+ // it.
+ if (MI.isReturn() && !MI.isCall()) {
+ hardenReturnInstr(MI);
+ continue;
+ }
+
+ // Otherwise we have a call. We need to handle transferring the predicate
+ // state into a call and recovering it after the call returns unless this
+ // is a tail call.
+ assert(MI.isCall() && "Should only reach here for calls!");
+ tracePredStateThroughCall(MI);
+ }
+
+ HardenPostLoad.clear();
+ HardenLoadAddr.clear();
+ HardenedAddrRegs.clear();
+ AddrRegToHardenedReg.clear();
+
+ // Currently, we only track data-dependent loads within a basic block.
+ // FIXME: We should see if this is necessary or if we could be more
+ // aggressive here without opening up attack avenues.
+ LoadDepRegs.clear();
+ }
+}
+
+/// Save EFLAGS into the returned GPR. This can in turn be restored with
+/// `restoreEFLAGS`.
+///
+/// Note that LLVM can only lower very simple patterns of saved and restored
+/// EFLAGS registers. The restore should always be within the same basic block
+/// as the save so that no PHI nodes are inserted.
+unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
+ DebugLoc Loc) {
+ // FIXME: Hard coding this to a 32-bit register class seems weird, but matches
+ // what instruction selection does.
+ unsigned Reg = MRI->createVirtualRegister(&X86::GR32RegClass);
+ // We directly copy the FLAGS register and rely on later lowering to clean
+ // this up into the appropriate setCC instructions.
+ BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), Reg).addReg(X86::EFLAGS);
+ ++NumInstsInserted;
+ return Reg;
+}
+
+/// Restore EFLAGS from the provided GPR. This should be produced by
+/// `saveEFLAGS`.
+///
+/// This must be done within the same basic block as the save in order to
+/// reliably lower.
+void X86SpeculativeLoadHardeningPass::restoreEFLAGS(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
+ unsigned Reg) {
+ BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), X86::EFLAGS).addReg(Reg);
+ ++NumInstsInserted;
+}
+
+/// Takes the current predicate state (in a register) and merges it into the
+/// stack pointer. The state is essentially a single bit, but we merge this in
+/// a way that won't form non-canonical pointers and also will be preserved
+/// across normal stack adjustments.
+void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
+ unsigned PredStateReg) {
+ unsigned TmpReg = MRI->createVirtualRegister(PS->RC);
+ // FIXME: This hard codes a shift distance based on the number of bits needed
+ // to stay canonical on 64-bit. We should compute this somehow and support
+ // 32-bit as part of that.
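+ // Illustratively, with the all-ones poison state, shifting left by 47
+ // produces 0xFFFF800000000000, so the OR below sets the top 17 bits of RSP
+ // (including the sign bit that extractPredStateFromSP recovers); with a zero
+ // state, RSP is left unchanged.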
+ auto ShiftI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHL64ri), TmpReg)
+ .addReg(PredStateReg, RegState::Kill)
+ .addImm(47);
+ ShiftI->addRegisterDead(X86::EFLAGS, TRI);
+ ++NumInstsInserted;
+ auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), X86::RSP)
+ .addReg(X86::RSP)
+ .addReg(TmpReg, RegState::Kill);
+ OrI->addRegisterDead(X86::EFLAGS, TRI);
+ ++NumInstsInserted;
+}
+
+/// Extracts the predicate state stored in the high bits of the stack pointer.
+unsigned X86SpeculativeLoadHardeningPass::extractPredStateFromSP(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
+ DebugLoc Loc) {
+ unsigned PredStateReg = MRI->createVirtualRegister(PS->RC);
+ unsigned TmpReg = MRI->createVirtualRegister(PS->RC);
+
+ // We know that the stack pointer will have any preserved predicate state in
+ // its high bit. We just want to smear this across the other bits. Turns out,
+ // this is exactly what an arithmetic right shift does.
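+ // Illustratively, shifting right arithmetically by 63 yields all ones if
+ // bit 63 of RSP was set (a poisoned state) and all zeros otherwise,
+ // reconstituting the canonical all-ones / all-zeros predicate value.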
+ BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), TmpReg)
+ .addReg(X86::RSP);
+ auto ShiftI =
+ BuildMI(MBB, InsertPt, Loc, TII->get(X86::SAR64ri), PredStateReg)
+ .addReg(TmpReg, RegState::Kill)
+ .addImm(TRI->getRegSizeInBits(*PS->RC) - 1);
+ ShiftI->addRegisterDead(X86::EFLAGS, TRI);
+ ++NumInstsInserted;
+
+ return PredStateReg;
+}
+
+void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
+ MachineInstr &MI, MachineOperand &BaseMO, MachineOperand &IndexMO,
+ SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc Loc = MI.getDebugLoc();
+
+ // Check if EFLAGS are alive by seeing if there is a def of them or they
+ // live-in, and then seeing if that def is in turn used.
+ bool EFLAGSLive = isEFLAGSLive(MBB, MI.getIterator(), *TRI);
+
+ SmallVector<MachineOperand *, 2> HardenOpRegs;
+
+ if (BaseMO.isFI()) {
+ // A frame index is never a dynamically controllable load, so only
+ // harden it if we're covering fixed address loads as well.
+ LLVM_DEBUG(
+ dbgs() << " Skipping hardening base of explicit stack frame load: ";
+ MI.dump(); dbgs() << "\n");
+ } else if (BaseMO.getReg() == X86::RIP ||
+ BaseMO.getReg() == X86::NoRegister) {
+ // For both RIP-relative addressed loads or absolute loads, we cannot
+ // meaningfully harden them because the address being loaded has no
+ // dynamic component.
+ //
+ // FIXME: When using a segment base (like TLS does) we end up with the
+ // dynamic address being the base plus -1 because we can't mutate the
+ // segment register here. This allows the signed 32-bit offset to point at
+ // valid segment-relative addresses and load them successfully.
+ LLVM_DEBUG(
+ dbgs() << " Cannot harden base of "
+ << (BaseMO.getReg() == X86::RIP ? "RIP-relative" : "no-base")
+ << " address in a load!");
+ } else {
+ assert(BaseMO.isReg() &&
+ "Only allowed to have a frame index or register base.");
+ HardenOpRegs.push_back(&BaseMO);
+ }
+
+ if (IndexMO.getReg() != X86::NoRegister &&
+ (HardenOpRegs.empty() ||
+ HardenOpRegs.front()->getReg() != IndexMO.getReg()))
+ HardenOpRegs.push_back(&IndexMO);
+
+ assert((HardenOpRegs.size() == 1 || HardenOpRegs.size() == 2) &&
+ "Should have exactly one or two registers to harden!");
+ assert((HardenOpRegs.size() == 1 ||
+ HardenOpRegs[0]->getReg() != HardenOpRegs[1]->getReg()) &&
+ "Should not have two of the same registers!");
+
+ // Remove any registers that have already been checked.
+ llvm::erase_if(HardenOpRegs, [&](MachineOperand *Op) {
+ // See if this operand's register has already been checked.
+ auto It = AddrRegToHardenedReg.find(Op->getReg());
+ if (It == AddrRegToHardenedReg.end())
+ // Not checked, so retain this one.
+ return false;
+
+ // Otherwise, we can directly update this operand and remove it.
+ Op->setReg(It->second);
+ return true;
+ });
+ // If there are none left, we're done.
+ if (HardenOpRegs.empty())
+ return;
+
+ // Compute the current predicate state.
+ unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
+
+ auto InsertPt = MI.getIterator();
+
+ // If EFLAGS are live and we don't have access to instructions that avoid
+ // clobbering EFLAGS we need to save and restore them. This in turn makes
+ // the EFLAGS no longer live.
+ unsigned FlagsReg = 0;
+ if (EFLAGSLive && !Subtarget->hasBMI2()) {
+ EFLAGSLive = false;
+ FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
+ }
+
+ for (MachineOperand *Op : HardenOpRegs) {
+ unsigned OpReg = Op->getReg();
+ auto *OpRC = MRI->getRegClass(OpReg);
+ unsigned TmpReg = MRI->createVirtualRegister(OpRC);
+
+ // If this is a vector register, we'll need somewhat custom logic to handle
+ // hardening it.
+ if (!Subtarget->hasVLX() && (OpRC->hasSuperClassEq(&X86::VR128RegClass) ||
+ OpRC->hasSuperClassEq(&X86::VR256RegClass))) {
+ assert(Subtarget->hasAVX2() && "AVX2-specific register classes!");
+ bool Is128Bit = OpRC->hasSuperClassEq(&X86::VR128RegClass);
+
+ // Move our state into a vector register.
+ // FIXME: We could skip this at the cost of longer encodings with AVX-512
+ // but that doesn't seem likely to be worth it.
+ unsigned VStateReg = MRI->createVirtualRegister(&X86::VR128RegClass);
+ auto MovI =
+ BuildMI(MBB, InsertPt, Loc, TII->get(X86::VMOV64toPQIrr), VStateReg)
+ .addReg(StateReg);
+ (void)MovI;
+ ++NumInstsInserted;
+ LLVM_DEBUG(dbgs() << " Inserting mov: "; MovI->dump(); dbgs() << "\n");
+
+ // Broadcast it across the vector register.
+ unsigned VBStateReg = MRI->createVirtualRegister(OpRC);
+ auto BroadcastI = BuildMI(MBB, InsertPt, Loc,
+ TII->get(Is128Bit ? X86::VPBROADCASTQrr
+ : X86::VPBROADCASTQYrr),
+ VBStateReg)
+ .addReg(VStateReg);
+ (void)BroadcastI;
+ ++NumInstsInserted;
+ LLVM_DEBUG(dbgs() << " Inserting broadcast: "; BroadcastI->dump();
+ dbgs() << "\n");
+
+ // Merge our potential poison state into the value with a vector or.
+ auto OrI =
+ BuildMI(MBB, InsertPt, Loc,
+ TII->get(Is128Bit ? X86::VPORrr : X86::VPORYrr), TmpReg)
+ .addReg(VBStateReg)
+ .addReg(OpReg);
+ (void)OrI;
+ ++NumInstsInserted;
+ LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
+ } else if (OpRC->hasSuperClassEq(&X86::VR128XRegClass) ||
+ OpRC->hasSuperClassEq(&X86::VR256XRegClass) ||
+ OpRC->hasSuperClassEq(&X86::VR512RegClass)) {
+ assert(Subtarget->hasAVX512() && "AVX512-specific register classes!");
+ bool Is128Bit = OpRC->hasSuperClassEq(&X86::VR128XRegClass);
+ bool Is256Bit = OpRC->hasSuperClassEq(&X86::VR256XRegClass);
+ if (Is128Bit || Is256Bit)
+ assert(Subtarget->hasVLX() && "AVX512VL-specific register classes!");
+
+ // Broadcast our state into a vector register.
+ unsigned VStateReg = MRI->createVirtualRegister(OpRC);
+ unsigned BroadcastOp =
+ Is128Bit ? X86::VPBROADCASTQrZ128r
+ : Is256Bit ? X86::VPBROADCASTQrZ256r : X86::VPBROADCASTQrZr;
+ auto BroadcastI =
+ BuildMI(MBB, InsertPt, Loc, TII->get(BroadcastOp), VStateReg)
+ .addReg(StateReg);
+ (void)BroadcastI;
+ ++NumInstsInserted;
+ LLVM_DEBUG(dbgs() << " Inserting broadcast: "; BroadcastI->dump();
+ dbgs() << "\n");
+
+ // Merge our potential poison state into the value with a vector or.
+ unsigned OrOp = Is128Bit ? X86::VPORQZ128rr
+ : Is256Bit ? X86::VPORQZ256rr : X86::VPORQZrr;
+ auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOp), TmpReg)
+ .addReg(VStateReg)
+ .addReg(OpReg);
+ (void)OrI;
+ ++NumInstsInserted;
+ LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
+ } else {
+ // FIXME: Need to support GR32 here for 32-bit code.
+ assert(OpRC->hasSuperClassEq(&X86::GR64RegClass) &&
+ "Not a supported register class for address hardening!");
+
+ if (!EFLAGSLive) {
+ // Merge our potential poison state into the value with an or.
+ auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), TmpReg)
+ .addReg(StateReg)
+ .addReg(OpReg);
+ OrI->addRegisterDead(X86::EFLAGS, TRI);
+ ++NumInstsInserted;
+ LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
+ } else {
+ // We need to avoid touching EFLAGS so shift out all but the least
+ // significant bit using the instruction that doesn't update flags.
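+ // Illustratively, SHRX masks its count to 6 bits, so a zero state shifts by
+ // zero and leaves the address untouched, while an all-ones state shifts by
+ // 63 and destroys every meaningful address bit without touching EFLAGS.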
+ auto ShiftI =
+ BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHRX64rr), TmpReg)
+ .addReg(OpReg)
+ .addReg(StateReg);
+ (void)ShiftI;
+ ++NumInstsInserted;
+ LLVM_DEBUG(dbgs() << " Inserting shrx: "; ShiftI->dump();
+ dbgs() << "\n");
+ }
+ }
+
+ // Record this register as checked and update the operand.
+ assert(!AddrRegToHardenedReg.count(Op->getReg()) &&
+ "Should not have checked this register yet!");
+ AddrRegToHardenedReg[Op->getReg()] = TmpReg;
+ Op->setReg(TmpReg);
+ ++NumAddrRegsHardened;
+ }
+
+ // And restore the flags if needed.
+ if (FlagsReg)
+ restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);
+}
+
+MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
+ MachineInstr &InitialMI, SmallPtrSetImpl<MachineInstr *> &HardenedInstrs) {
+ assert(isDataInvariantLoad(InitialMI) &&
+ "Cannot get here with a non-invariant load!");
+
+ // See if we can sink hardening the loaded value.
+ auto SinkCheckToSingleUse =
+ [&](MachineInstr &MI) -> Optional<MachineInstr *> {
+ unsigned DefReg = MI.getOperand(0).getReg();
+
+ // We need to find a single use to which we can sink the check. We can
+ // primarily do this because many uses may already end up checked on their
+ // own.
+ MachineInstr *SingleUseMI = nullptr;
+ for (MachineInstr &UseMI : MRI->use_instructions(DefReg)) {
+ // If we're already going to harden this use, it is data invariant and
+ // within our block.
+ if (HardenedInstrs.count(&UseMI)) {
+ if (!isDataInvariantLoad(UseMI)) {
+ // If we've already decided to harden a non-load, we must have sunk
+ // some other post-load hardened instruction to it and it must itself
+ // be data-invariant.
+ assert(isDataInvariant(UseMI) &&
+ "Data variant instruction being hardened!");
+ continue;
+ }
+
+ // Otherwise, this is a load and the load component can't be data
+ // invariant so check how this register is being used.
+ const MCInstrDesc &Desc = UseMI.getDesc();
+ int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
+ assert(MemRefBeginIdx >= 0 &&
+ "Should always have mem references here!");
+ MemRefBeginIdx += X86II::getOperandBias(Desc);
+
+ MachineOperand &BaseMO =
+ UseMI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
+ MachineOperand &IndexMO =
+ UseMI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
+ if ((BaseMO.isReg() && BaseMO.getReg() == DefReg) ||
+ (IndexMO.isReg() && IndexMO.getReg() == DefReg))
+ // The load uses the register as part of its address making it not
+ // invariant.
+ return {};
+
+ continue;
+ }
+
+ if (SingleUseMI)
+ // We already have a single use, this would make two. Bail.
+ return {};
+
+ // If this single use isn't data invariant, isn't in this block, or has
+ // interfering EFLAGS, we can't sink the hardening to it.
+ if (!isDataInvariant(UseMI) || UseMI.getParent() != MI.getParent())
+ return {};
+
+ // If this instruction defines multiple registers bail as we won't harden
+ // all of them.
+ if (UseMI.getDesc().getNumDefs() > 1)
+ return {};
+
+ // If this register isn't a virtual register, we can't sanely walk its uses;
+ // just bail. Also check that its register class is one of the ones we
+ // can harden.
+ unsigned UseDefReg = UseMI.getOperand(0).getReg();
+ if (!TRI->isVirtualRegister(UseDefReg) ||
+ !canHardenRegister(UseDefReg))
+ return {};
+
+ SingleUseMI = &UseMI;
+ }
+
+ // If SingleUseMI is still null, there is no use that needs its own
+ // checking. Otherwise, it is the single use that needs checking.
+ return {SingleUseMI};
+ };
+
+ MachineInstr *MI = &InitialMI;
+ while (Optional<MachineInstr *> SingleUse = SinkCheckToSingleUse(*MI)) {
+ // Update which MI we're checking now.
+ MI = *SingleUse;
+ if (!MI)
+ break;
+ }
+
+ return MI;
+}
+
+bool X86SpeculativeLoadHardeningPass::canHardenRegister(unsigned Reg) {
+ auto *RC = MRI->getRegClass(Reg);
+ int RegBytes = TRI->getRegSizeInBits(*RC) / 8;
+ if (RegBytes > 8)
+ // We don't support post-load hardening of vectors.
+ return false;
+
+ // If this register class is explicitly constrained to a class that doesn't
+ // require REX prefix, we may not be able to satisfy that constraint when
+ // emitting the hardening instructions, so bail out here.
+ // FIXME: This seems like a pretty lame hack. The way this comes up is when we
+  // end up with both a NOREX and a REX-only register as operands to the hardening
+ // instructions. It would be better to fix that code to handle this situation
+ // rather than hack around it in this way.
+ const TargetRegisterClass *NOREXRegClasses[] = {
+ &X86::GR8_NOREXRegClass, &X86::GR16_NOREXRegClass,
+ &X86::GR32_NOREXRegClass, &X86::GR64_NOREXRegClass};
+ if (RC == NOREXRegClasses[Log2_32(RegBytes)])
+ return false;
+
+ const TargetRegisterClass *GPRRegClasses[] = {
+ &X86::GR8RegClass, &X86::GR16RegClass, &X86::GR32RegClass,
+ &X86::GR64RegClass};
+ return RC->hasSuperClassEq(GPRRegClasses[Log2_32(RegBytes)]);
+}
+
+/// Harden a value in a register.
+///
+/// This is the low-level logic to fully harden a value sitting in a register
+/// against leaking during speculative execution.
+///
+/// Unlike hardening an address that is used by a load, this routine is required
+/// to hide *all* incoming bits in the register.
+///
+/// `Reg` must be a virtual register. Currently, it is required to be a GPR no
+/// larger than the predicate state register. FIXME: We should support vector
+/// registers here by broadcasting the predicate state.
+///
+/// The new, hardened virtual register is returned. It will have the same
+/// register class as `Reg`.
+unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister(
+ unsigned Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
+ DebugLoc Loc) {
+ assert(canHardenRegister(Reg) && "Cannot harden this register!");
+ assert(TRI->isVirtualRegister(Reg) && "Cannot harden a physical register!");
+
+ auto *RC = MRI->getRegClass(Reg);
+ int Bytes = TRI->getRegSizeInBits(*RC) / 8;
+
+ unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
+
+ // FIXME: Need to teach this about 32-bit mode.
+ if (Bytes != 8) {
+ unsigned SubRegImms[] = {X86::sub_8bit, X86::sub_16bit, X86::sub_32bit};
+ unsigned SubRegImm = SubRegImms[Log2_32(Bytes)];
+ unsigned NarrowStateReg = MRI->createVirtualRegister(RC);
+ BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), NarrowStateReg)
+ .addReg(StateReg, 0, SubRegImm);
+ StateReg = NarrowStateReg;
+ }
+
+ unsigned FlagsReg = 0;
+ if (isEFLAGSLive(MBB, InsertPt, *TRI))
+ FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
+
+ unsigned NewReg = MRI->createVirtualRegister(RC);
+ unsigned OrOpCodes[] = {X86::OR8rr, X86::OR16rr, X86::OR32rr, X86::OR64rr};
+ unsigned OrOpCode = OrOpCodes[Log2_32(Bytes)];
+ auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOpCode), NewReg)
+ .addReg(StateReg)
+ .addReg(Reg);
+ OrI->addRegisterDead(X86::EFLAGS, TRI);
+ ++NumInstsInserted;
+ LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
+
+ if (FlagsReg)
+ restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);
+
+ return NewReg;
+}
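+// As a rough sketch of what the code above emits: for a 64-bit virtual
+// register it produces
+//   %hardened = OR64rr %state, %reg
+// and for narrower registers it first copies the matching subregister of the
+// state (e.g. sub_32bit) and uses the corresponding OR8rr/OR16rr/OR32rr, so
+// that under misspeculation (state == all-ones) every bit of %reg is forced
+// to one.
+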
+
+/// Harden a load by hardening the loaded value in the defined register.
+///
+/// We can harden a non-leaking load into a register without touching the
+/// address by just hiding all of the loaded bits during misspeculation. We use
+/// an `or` instruction to do this because we set up our poison value as all
+/// ones. The goal is just for the loaded bits not to be exposed to speculative
+/// execution, and coercing them all to one is sufficient.
+///
+/// Returns the newly hardened register.
+unsigned X86SpeculativeLoadHardeningPass::hardenPostLoad(MachineInstr &MI) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc Loc = MI.getDebugLoc();
+
+ auto &DefOp = MI.getOperand(0);
+ unsigned OldDefReg = DefOp.getReg();
+ auto *DefRC = MRI->getRegClass(OldDefReg);
+
+ // Because we want to completely replace the uses of this def'ed value with
+ // the hardened value, create a dedicated new register that will only be used
+ // to communicate the unhardened value to the hardening.
+ unsigned UnhardenedReg = MRI->createVirtualRegister(DefRC);
+ DefOp.setReg(UnhardenedReg);
+
+ // Now harden this register's value, getting a hardened reg that is safe to
+ // use. Note that we insert the instructions to compute this *after* the
+ // defining instruction, not before it.
+ unsigned HardenedReg = hardenValueInRegister(
+ UnhardenedReg, MBB, std::next(MI.getIterator()), Loc);
+
+ // Finally, replace the old register (which now only has the uses of the
+ // original def) with the hardened register.
+ MRI->replaceRegWith(/*FromReg*/ OldDefReg, /*ToReg*/ HardenedReg);
+
+ ++NumPostLoadRegsHardened;
+ return HardenedReg;
+}
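+// Illustrative before/after for the rewiring above (hypothetical MIR):
+//   before:  %def = MOV32rm ...              ; uses of %def follow
+//   after:   %unhardened = MOV32rm ...
+//            %hardened = OR32rr %state, %unhardened
+//            ; all former uses of %def now read %hardened
+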
+
+/// Harden a return instruction.
+///
+/// Returns implicitly perform a load which we need to harden. Without hardening
+/// this load, an attacker may speculatively write over the return address to
+/// steer speculation of the return to an attacker-controlled address. This is
+/// called Spectre v1.1 or Bounds Check Bypass Store (BCBS) and is described in
+/// this paper:
+/// https://people.csail.mit.edu/vlk/spectre11.pdf
+///
+/// We can harden this by introducing an LFENCE that will delay any load of the
+/// return address until prior instructions have retired (and thus are not being
+/// speculated), or we can harden the address used by the implicit load: the
+/// stack pointer.
+///
+/// If we are not using an LFENCE, hardening the stack pointer has an additional
+/// benefit: it allows us to pass the predicate state accumulated in this
+/// function back to the caller. In the absence of a BCBS attack on the return,
+/// the caller will typically be resumed and speculatively executed due to the
+/// Return Stack Buffer (RSB) prediction which is very accurate and has a high
+/// priority. It is possible that some code from the caller will be executed
+/// speculatively even during a BCBS-attacked return until the steering takes
+/// effect. Whenever this happens, the caller can recover the (poisoned)
+/// predicate state from the stack pointer and continue to harden loads.
+void X86SpeculativeLoadHardeningPass::hardenReturnInstr(MachineInstr &MI) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc Loc = MI.getDebugLoc();
+ auto InsertPt = MI.getIterator();
+
+ if (FenceCallAndRet) {
+ // Simply forcibly block speculation of loads out of the function by using
+ // an LFENCE. This is potentially a heavy-weight mitigation strategy, but
+ // should be secure, is simple from an ABI perspective, and the cost can be
+ // minimized through inlining.
+ //
+ // FIXME: We should investigate ways to establish a strong data-dependency
+ // on the return. However, poisoning the stack pointer is unlikely to work
+ // because the return is *predicted* rather than relying on the load of the
+ // return address to actually resolve.
+ BuildMI(MBB, InsertPt, Loc, TII->get(X86::LFENCE));
+ ++NumInstsInserted;
+ ++NumLFENCEsInserted;
+ return;
+ }
+
+ // Take our predicate state, shift it to the high 17 bits (so that we keep
+ // pointers canonical) and merge it into RSP. This will allow the caller to
+ // extract it when we return (speculatively).
+ mergePredStateIntoSP(MBB, InsertPt, Loc, PS->SSA.GetValueAtEndOfBlock(&MBB));
+}
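+// Conceptually, the non-LFENCE path above behaves like
+//   RSP |= (PredState << 47)
+// i.e. the predicate state is parked in the high 17 bits of the stack
+// pointer, where the (speculatively executing) caller can recover it.
+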
+
+/// Trace the predicate state through a call.
+///
+/// There are several layers of this needed to handle the full complexity of
+/// calls.
+///
+/// First, we need to send the predicate state into the called function. We do
+/// this by merging it into the high bits of the stack pointer.
+///
+/// For tail calls, this is all we need to do.
+///
+/// For calls where we might return to control flow, we further need to extract
+/// the predicate state built up within that function from the high bits of the
+/// stack pointer, and make that the newly available predicate state.
+void X86SpeculativeLoadHardeningPass::tracePredStateThroughCall(
+ MachineInstr &MI) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ auto InsertPt = MI.getIterator();
+ DebugLoc Loc = MI.getDebugLoc();
+
+ // First, we transfer the predicate state into the called function by merging
+ // it into the stack pointer. This will kill the current def of the state.
+ unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
+ mergePredStateIntoSP(MBB, InsertPt, Loc, StateReg);
+
+ // If this call is also a return, it is a tail call and we don't need anything
+ // else to handle it so just continue.
+ // FIXME: We should also handle noreturn calls.
+ if (MI.isReturn())
+ return;
+
+ // We need to step past the call and recover the predicate state from SP after
+ // the return, and make this new state available.
+ ++InsertPt;
+ unsigned NewStateReg = extractPredStateFromSP(MBB, InsertPt, Loc);
+ PS->SSA.AddAvailableValue(&MBB, NewStateReg);
+}
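+// The resulting shape around a non-tail call is roughly:
+//   mergePredStateIntoSP(...)    ; state hidden in the high bits of RSP
+//   CALL64pcrel32 @callee        ; callee extracts, updates and re-merges it
+//   extractPredStateFromSP(...)  ; recover the (possibly poisoned) state
+// with the extracted value registered as the new available predicate state.
+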
+
+/// An attacker may speculatively store over a value that is then speculatively
+/// loaded and used as the target of an indirect call or jump instruction. This
+/// is called Spectre v1.2 or Bounds Check Bypass Store (BCBS) and is described
+/// in this paper:
+/// https://people.csail.mit.edu/vlk/spectre11.pdf
+///
+/// When this happens, the speculative execution of the call or jump will end up
+/// being steered to this attacker-controlled address. While most such loads
+/// will be adequately hardened already, we want to ensure that they are
+/// definitively treated as needing post-load hardening. While address hardening
+/// is sufficient to prevent secret data from leaking to the attacker, it may
+/// not be sufficient to prevent an attacker from steering speculative
+/// execution. We forcibly unfolded all relevant loads above and so will always
+/// have an opportunity to post-load harden here; we just need to scan for cases
+/// not already flagged and add them.
+void X86SpeculativeLoadHardeningPass::hardenIndirectCallOrJumpInstr(
+ MachineInstr &MI,
+ SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg) {
+ switch (MI.getOpcode()) {
+ case X86::FARCALL16m:
+ case X86::FARCALL32m:
+ case X86::FARCALL64:
+ case X86::FARJMP16m:
+ case X86::FARJMP32m:
+ case X86::FARJMP64:
+ // We don't need to harden either far calls or far jumps as they are
+ // safe from Spectre.
+ return;
+
+ default:
+ break;
+ }
+
+ // We should never see a loading instruction at this point, as those should
+ // have been unfolded.
+ assert(!MI.mayLoad() && "Found a lingering loading instruction!");
+
+ // If the first operand isn't a register, this is a branch or call
+ // instruction with an immediate operand which doesn't need to be hardened.
+ if (!MI.getOperand(0).isReg())
+ return;
+
+ // For all of these, the target register is the first operand of the
+ // instruction.
+ auto &TargetOp = MI.getOperand(0);
+ unsigned OldTargetReg = TargetOp.getReg();
+
+ // Try to lookup a hardened version of this register. We retain a reference
+ // here as we want to update the map to track any newly computed hardened
+ // register.
+ unsigned &HardenedTargetReg = AddrRegToHardenedReg[OldTargetReg];
+
+ // If we don't have a hardened register yet, compute one. Otherwise, just use
+ // the already hardened register.
+ //
+ // FIXME: It is a little suspect that we use partially hardened registers that
+ // only feed addresses. The complexity of partial hardening with SHRX
+ // continues to pile up. Should definitively measure its value and consider
+ // eliminating it.
+ if (!HardenedTargetReg)
+ HardenedTargetReg = hardenValueInRegister(
+ OldTargetReg, *MI.getParent(), MI.getIterator(), MI.getDebugLoc());
+
+ // Set the target operand to the hardened register.
+ TargetOp.setReg(HardenedTargetReg);
+
+ ++NumCallsOrJumpsHardened;
+}
+
+INITIALIZE_PASS_BEGIN(X86SpeculativeLoadHardeningPass, DEBUG_TYPE,
+ "X86 speculative load hardener", false, false)
+INITIALIZE_PASS_END(X86SpeculativeLoadHardeningPass, DEBUG_TYPE,
+ "X86 speculative load hardener", false, false)
+
+FunctionPass *llvm::createX86SpeculativeLoadHardeningPass() {
+ return new X86SpeculativeLoadHardeningPass();
+}
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index ad023623142f..7e84323dda4c 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -68,14 +68,36 @@ X86Subtarget::classifyGlobalReference(const GlobalValue *GV) const {
unsigned char
X86Subtarget::classifyLocalReference(const GlobalValue *GV) const {
- // 64 bits can use %rip addressing for anything local.
- if (is64Bit())
+ // If we're not PIC, it's not very interesting.
+ if (!isPositionIndependent())
return X86II::MO_NO_FLAG;
- // If this is for a position dependent executable, the static linker can
- // figure it out.
- if (!isPositionIndependent())
+ if (is64Bit()) {
+ // 64-bit ELF PIC local references may use GOTOFF relocations.
+ if (isTargetELF()) {
+ switch (TM.getCodeModel()) {
+ // 64-bit small code model is simple: All rip-relative.
+ case CodeModel::Small:
+ case CodeModel::Kernel:
+ return X86II::MO_NO_FLAG;
+
+ // The large PIC code model uses GOTOFF.
+ case CodeModel::Large:
+ return X86II::MO_GOTOFF;
+
+ // Medium is a hybrid: RIP-rel for code, GOTOFF for DSO local data.
+ case CodeModel::Medium:
+ if (isa<Function>(GV))
+ return X86II::MO_NO_FLAG; // All code is RIP-relative
+ return X86II::MO_GOTOFF; // Local symbols use GOTOFF.
+ }
+ llvm_unreachable("invalid code model");
+ }
+
+ // Otherwise, this is either a RIP-relative reference or a 64-bit movabsq,
+ // both of which use MO_NO_FLAG.
return X86II::MO_NO_FLAG;
+ }
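+  // As a rough assembly-level picture of the ELF cases above: small/kernel
+  // local references stay RIP-relative (e.g. `lea sym(%rip), %reg`), while the
+  // large (and, for data, medium) PIC model materializes `sym@GOTOFF` relative
+  // to a GOT base register instead of relying on RIP-relative addressing.
+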
// The COFF dynamic linker just patches the executable sections.
if (isTargetCOFF())
@@ -97,8 +119,8 @@ X86Subtarget::classifyLocalReference(const GlobalValue *GV) const {
unsigned char X86Subtarget::classifyGlobalReference(const GlobalValue *GV,
const Module &M) const {
- // Large model never uses stubs.
- if (TM.getCodeModel() == CodeModel::Large)
+ // The static large model never uses stubs.
+ if (TM.getCodeModel() == CodeModel::Large && !isPositionIndependent())
return X86II::MO_NO_FLAG;
// Absolute symbols can be referenced directly.
@@ -120,8 +142,14 @@ unsigned char X86Subtarget::classifyGlobalReference(const GlobalValue *GV,
if (isTargetCOFF())
return X86II::MO_DLLIMPORT;
- if (is64Bit())
+ if (is64Bit()) {
+ // ELF supports a large, truly PIC code model with non-PC relative GOT
+ // references. Other object file formats do not. Use the no-flag, 64-bit
+ // reference for them.
+ if (TM.getCodeModel() == CodeModel::Large)
+ return isTargetELF() ? X86II::MO_GOT : X86II::MO_NO_FLAG;
return X86II::MO_GOTPCREL;
+ }
if (isTargetDarwin()) {
if (!isPositionIndependent())
@@ -157,8 +185,11 @@ X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV,
// In Regcall calling convention those registers are used for passing
// parameters. Thus we need to prevent lazy binding in Regcall.
return X86II::MO_GOTPCREL;
- if (F && F->hasFnAttribute(Attribute::NonLazyBind) && is64Bit())
- return X86II::MO_GOTPCREL;
+ // If PLT must be avoided then the call should be via GOTPCREL.
+ if (((F && F->hasFnAttribute(Attribute::NonLazyBind)) ||
+ (!F && M.getRtLibUseGOT())) &&
+ is64Bit())
+ return X86II::MO_GOTPCREL;
return X86II::MO_PLT;
}
@@ -216,8 +247,6 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
// micro-architectures respectively.
if (hasSSE42() || hasSSE4A())
IsUAMem16Slow = false;
-
- InstrItins = getInstrItineraryForCPU(CPUName);
// It's important to keep the MCSubtargetInfo feature bits in sync with
// target data structure which is shared with MC code emitter, etc.
@@ -230,9 +259,9 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
else
llvm_unreachable("Not 16-bit, 32-bit or 64-bit mode!");
- DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel
- << ", 3DNowLevel " << X863DNowLevel
- << ", 64bit " << HasX86_64 << "\n");
+ LLVM_DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel
+ << ", 3DNowLevel " << X863DNowLevel << ", 64bit "
+ << HasX86_64 << "\n");
assert((!In64BitMode || HasX86_64) &&
"64-bit code requested on a subtarget that doesn't support it!");
@@ -254,112 +283,30 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
GatherOverhead = 2;
if (hasAVX512())
ScatterOverhead = 2;
-}
-void X86Subtarget::initializeEnvironment() {
- X86SSELevel = NoSSE;
- X863DNowLevel = NoThreeDNow;
- HasX87 = false;
- HasCMov = false;
- HasX86_64 = false;
- HasPOPCNT = false;
- HasSSE4A = false;
- HasAES = false;
- HasVAES = false;
- HasFXSR = false;
- HasXSAVE = false;
- HasXSAVEOPT = false;
- HasXSAVEC = false;
- HasXSAVES = false;
- HasPCLMUL = false;
- HasVPCLMULQDQ = false;
- HasGFNI = false;
- HasFMA = false;
- HasFMA4 = false;
- HasXOP = false;
- HasTBM = false;
- HasLWP = false;
- HasMOVBE = false;
- HasRDRAND = false;
- HasF16C = false;
- HasFSGSBase = false;
- HasLZCNT = false;
- HasBMI = false;
- HasBMI2 = false;
- HasVBMI = false;
- HasVBMI2 = false;
- HasIFMA = false;
- HasRTM = false;
- HasERI = false;
- HasCDI = false;
- HasPFI = false;
- HasDQI = false;
- HasVPOPCNTDQ = false;
- HasBWI = false;
- HasVLX = false;
- HasADX = false;
- HasPKU = false;
- HasVNNI = false;
- HasBITALG = false;
- HasSHA = false;
- HasPREFETCHWT1 = false;
- HasPRFCHW = false;
- HasRDSEED = false;
- HasLAHFSAHF = false;
- HasMWAITX = false;
- HasCLZERO = false;
- HasMPX = false;
- HasSHSTK = false;
- HasIBT = false;
- HasSGX = false;
- HasCLFLUSHOPT = false;
- HasCLWB = false;
- IsPMULLDSlow = false;
- IsSHLDSlow = false;
- IsUAMem16Slow = false;
- IsUAMem32Slow = false;
- HasSSEUnalignedMem = false;
- HasCmpxchg16b = false;
- UseLeaForSP = false;
- HasFastVariableShuffle = false;
- HasFastPartialYMMorZMMWrite = false;
- HasFastGather = false;
- HasFastScalarFSQRT = false;
- HasFastVectorFSQRT = false;
- HasFastLZCNT = false;
- HasFastSHLDRotate = false;
- HasMacroFusion = false;
- HasERMSB = false;
- HasSlowDivide32 = false;
- HasSlowDivide64 = false;
- PadShortFunctions = false;
- SlowTwoMemOps = false;
- LEAUsesAG = false;
- SlowLEA = false;
- Slow3OpsLEA = false;
- SlowIncDec = false;
- stackAlignment = 4;
- // FIXME: this is a known good value for Yonah. How about others?
- MaxInlineSizeThreshold = 128;
- UseSoftFloat = false;
- X86ProcFamily = Others;
- GatherOverhead = 1024;
- ScatterOverhead = 1024;
+ // Consume the vector width attribute or apply any target specific limit.
+ if (PreferVectorWidthOverride)
+ PreferVectorWidth = PreferVectorWidthOverride;
+ else if (Prefer256Bit)
+ PreferVectorWidth = 256;
}
X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU,
StringRef FS) {
- initializeEnvironment();
initSubtargetFeatures(CPU, FS);
return *this;
}
X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
const X86TargetMachine &TM,
- unsigned StackAlignOverride)
- : X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others),
+ unsigned StackAlignOverride,
+ unsigned PreferVectorWidthOverride,
+ unsigned RequiredVectorWidth)
+ : X86GenSubtargetInfo(TT, CPU, FS),
PICStyle(PICStyles::None), TM(TM), TargetTriple(TT),
StackAlignOverride(StackAlignOverride),
+ PreferVectorWidthOverride(PreferVectorWidthOverride),
+ RequiredVectorWidth(RequiredVectorWidth),
In64BitMode(TargetTriple.getArch() == Triple::x86_64),
In32BitMode(TargetTriple.getArch() == Triple::x86 &&
TargetTriple.getEnvironment() != Triple::CODE16),
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index c9435890fc1f..fedb13f89e19 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -26,8 +26,8 @@
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/CallingConv.h"
-#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Target/TargetMachine.h"
+#include <climits>
#include <memory>
#define GET_SUBTARGETINFO_HEADER
@@ -57,13 +57,16 @@ public:
IntelAtom,
IntelSLM,
IntelGLM,
+ IntelGLP,
+ IntelTRM,
IntelHaswell,
IntelBroadwell,
IntelSkylake,
IntelKNL,
IntelSKX,
IntelCannonlake,
- IntelIcelake,
+ IntelIcelakeClient,
+ IntelIcelakeServer,
};
protected:
@@ -76,7 +79,7 @@ protected:
};
/// X86 processor family: Intel Atom, and others
- X86ProcFamilyEnum X86ProcFamily;
+ X86ProcFamilyEnum X86ProcFamily = Others;
/// Which PIC style to use
PICStyles::Style PICStyle;
@@ -84,280 +87,330 @@ protected:
const TargetMachine &TM;
/// SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or none supported.
- X86SSEEnum X86SSELevel;
+ X86SSEEnum X86SSELevel = NoSSE;
/// MMX, 3DNow, 3DNow Athlon, or none supported.
- X863DNowEnum X863DNowLevel;
+ X863DNowEnum X863DNowLevel = NoThreeDNow;
/// True if the processor supports X87 instructions.
- bool HasX87;
+ bool HasX87 = false;
+
+ /// True if this processor has NOPL instruction
+ /// (generally pentium pro+).
+ bool HasNOPL = false;
/// True if this processor has conditional move instructions
/// (generally pentium pro+).
- bool HasCMov;
+ bool HasCMov = false;
/// True if the processor supports X86-64 instructions.
- bool HasX86_64;
+ bool HasX86_64 = false;
/// True if the processor supports POPCNT.
- bool HasPOPCNT;
+ bool HasPOPCNT = false;
/// True if the processor supports SSE4A instructions.
- bool HasSSE4A;
+ bool HasSSE4A = false;
/// Target has AES instructions
- bool HasAES;
- bool HasVAES;
+ bool HasAES = false;
+ bool HasVAES = false;
/// Target has FXSAVE/FXRESTOR instructions
- bool HasFXSR;
+ bool HasFXSR = false;
/// Target has XSAVE instructions
- bool HasXSAVE;
+ bool HasXSAVE = false;
/// Target has XSAVEOPT instructions
- bool HasXSAVEOPT;
+ bool HasXSAVEOPT = false;
/// Target has XSAVEC instructions
- bool HasXSAVEC;
+ bool HasXSAVEC = false;
/// Target has XSAVES instructions
- bool HasXSAVES;
+ bool HasXSAVES = false;
/// Target has carry-less multiplication
- bool HasPCLMUL;
- bool HasVPCLMULQDQ;
+ bool HasPCLMUL = false;
+ bool HasVPCLMULQDQ = false;
/// Target has Galois Field Arithmetic instructions
- bool HasGFNI;
+ bool HasGFNI = false;
/// Target has 3-operand fused multiply-add
- bool HasFMA;
+ bool HasFMA = false;
/// Target has 4-operand fused multiply-add
- bool HasFMA4;
+ bool HasFMA4 = false;
/// Target has XOP instructions
- bool HasXOP;
+ bool HasXOP = false;
/// Target has TBM instructions.
- bool HasTBM;
+ bool HasTBM = false;
/// Target has LWP instructions
- bool HasLWP;
+ bool HasLWP = false;
/// True if the processor has the MOVBE instruction.
- bool HasMOVBE;
+ bool HasMOVBE = false;
/// True if the processor has the RDRAND instruction.
- bool HasRDRAND;
+ bool HasRDRAND = false;
/// Processor has 16-bit floating point conversion instructions.
- bool HasF16C;
+ bool HasF16C = false;
  /// Processor has FS/GS base instructions.
- bool HasFSGSBase;
+ bool HasFSGSBase = false;
/// Processor has LZCNT instruction.
- bool HasLZCNT;
+ bool HasLZCNT = false;
/// Processor has BMI1 instructions.
- bool HasBMI;
+ bool HasBMI = false;
/// Processor has BMI2 instructions.
- bool HasBMI2;
+ bool HasBMI2 = false;
/// Processor has VBMI instructions.
- bool HasVBMI;
+ bool HasVBMI = false;
/// Processor has VBMI2 instructions.
- bool HasVBMI2;
+ bool HasVBMI2 = false;
/// Processor has Integer Fused Multiply Add
- bool HasIFMA;
+ bool HasIFMA = false;
/// Processor has RTM instructions.
- bool HasRTM;
+ bool HasRTM = false;
/// Processor has ADX instructions.
- bool HasADX;
+ bool HasADX = false;
/// Processor has SHA instructions.
- bool HasSHA;
+ bool HasSHA = false;
/// Processor has PRFCHW instructions.
- bool HasPRFCHW;
+ bool HasPRFCHW = false;
/// Processor has RDSEED instructions.
- bool HasRDSEED;
+ bool HasRDSEED = false;
/// Processor has LAHF/SAHF instructions.
- bool HasLAHFSAHF;
+ bool HasLAHFSAHF = false;
/// Processor has MONITORX/MWAITX instructions.
- bool HasMWAITX;
+ bool HasMWAITX = false;
/// Processor has Cache Line Zero instruction
- bool HasCLZERO;
+ bool HasCLZERO = false;
+
+ /// Processor has Cache Line Demote instruction
+ bool HasCLDEMOTE = false;
+
+ /// Processor has MOVDIRI instruction (direct store integer).
+ bool HasMOVDIRI = false;
+
+ /// Processor has MOVDIR64B instruction (direct store 64 bytes).
+ bool HasMOVDIR64B = false;
+
+ /// Processor has ptwrite instruction.
+ bool HasPTWRITE = false;
/// Processor has Prefetch with intent to Write instruction
- bool HasPREFETCHWT1;
+ bool HasPREFETCHWT1 = false;
/// True if SHLD instructions are slow.
- bool IsSHLDSlow;
+ bool IsSHLDSlow = false;
/// True if the PMULLD instruction is slow compared to PMULLW/PMULHW and
// PMULUDQ.
- bool IsPMULLDSlow;
+ bool IsPMULLDSlow = false;
/// True if unaligned memory accesses of 16-bytes are slow.
- bool IsUAMem16Slow;
+ bool IsUAMem16Slow = false;
/// True if unaligned memory accesses of 32-bytes are slow.
- bool IsUAMem32Slow;
+ bool IsUAMem32Slow = false;
/// True if SSE operations can have unaligned memory operands.
/// This may require setting a configuration bit in the processor.
- bool HasSSEUnalignedMem;
+ bool HasSSEUnalignedMem = false;
/// True if this processor has the CMPXCHG16B instruction;
/// this is true for most x86-64 chips, but not the first AMD chips.
- bool HasCmpxchg16b;
+ bool HasCmpxchg16b = false;
/// True if the LEA instruction should be used for adjusting
/// the stack pointer. This is an optimization for Intel Atom processors.
- bool UseLeaForSP;
+ bool UseLeaForSP = false;
+
+ /// True if POPCNT instruction has a false dependency on the destination register.
+ bool HasPOPCNTFalseDeps = false;
+
+ /// True if LZCNT/TZCNT instructions have a false dependency on the destination register.
+ bool HasLZCNTFalseDeps = false;
  /// True if it's preferable to combine to a single shuffle using a variable
/// mask over multiple fixed shuffles.
- bool HasFastVariableShuffle;
+ bool HasFastVariableShuffle = false;
/// True if there is no performance penalty to writing only the lower parts
/// of a YMM or ZMM register without clearing the upper part.
- bool HasFastPartialYMMorZMMWrite;
+ bool HasFastPartialYMMorZMMWrite = false;
+
+ /// True if there is no performance penalty for writing NOPs with up to
+ /// 11 bytes.
+ bool HasFast11ByteNOP = false;
+
+ /// True if there is no performance penalty for writing NOPs with up to
+ /// 15 bytes.
+ bool HasFast15ByteNOP = false;
/// True if gather is reasonably fast. This is true for Skylake client and
/// all AVX-512 CPUs.
- bool HasFastGather;
+ bool HasFastGather = false;
/// True if hardware SQRTSS instruction is at least as fast (latency) as
/// RSQRTSS followed by a Newton-Raphson iteration.
- bool HasFastScalarFSQRT;
+ bool HasFastScalarFSQRT = false;
/// True if hardware SQRTPS/VSQRTPS instructions are at least as fast
/// (throughput) as RSQRTPS/VRSQRTPS followed by a Newton-Raphson iteration.
- bool HasFastVectorFSQRT;
+ bool HasFastVectorFSQRT = false;
/// True if 8-bit divisions are significantly faster than
/// 32-bit divisions and should be used when possible.
- bool HasSlowDivide32;
+ bool HasSlowDivide32 = false;
/// True if 32-bit divides are significantly faster than
/// 64-bit divisions and should be used when possible.
- bool HasSlowDivide64;
+ bool HasSlowDivide64 = false;
/// True if LZCNT instruction is fast.
- bool HasFastLZCNT;
+ bool HasFastLZCNT = false;
/// True if SHLD based rotate is fast.
- bool HasFastSHLDRotate;
+ bool HasFastSHLDRotate = false;
/// True if the processor supports macrofusion.
- bool HasMacroFusion;
+ bool HasMacroFusion = false;
/// True if the processor has enhanced REP MOVSB/STOSB.
- bool HasERMSB;
+ bool HasERMSB = false;
/// True if the short functions should be padded to prevent
/// a stall when returning too early.
- bool PadShortFunctions;
+ bool PadShortFunctions = false;
/// True if two memory operand instructions should use a temporary register
/// instead.
- bool SlowTwoMemOps;
+ bool SlowTwoMemOps = false;
/// True if the LEA instruction inputs have to be ready at address generation
/// (AG) time.
- bool LEAUsesAG;
+ bool LEAUsesAG = false;
/// True if the LEA instruction with certain arguments is slow
- bool SlowLEA;
+ bool SlowLEA = false;
/// True if the LEA instruction has all three source operands: base, index,
/// and offset or if the LEA instruction uses base and index registers where
  /// the base is EBP, RBP, or R13.
- bool Slow3OpsLEA;
+ bool Slow3OpsLEA = false;
/// True if INC and DEC instructions are slow when writing to flags
- bool SlowIncDec;
+ bool SlowIncDec = false;
/// Processor has AVX-512 PreFetch Instructions
- bool HasPFI;
+ bool HasPFI = false;
/// Processor has AVX-512 Exponential and Reciprocal Instructions
- bool HasERI;
+ bool HasERI = false;
/// Processor has AVX-512 Conflict Detection Instructions
- bool HasCDI;
+ bool HasCDI = false;
/// Processor has AVX-512 population count Instructions
- bool HasVPOPCNTDQ;
+ bool HasVPOPCNTDQ = false;
/// Processor has AVX-512 Doubleword and Quadword instructions
- bool HasDQI;
+ bool HasDQI = false;
/// Processor has AVX-512 Byte and Word instructions
- bool HasBWI;
+ bool HasBWI = false;
  /// Processor has AVX-512 Vector Length eXtensions
- bool HasVLX;
+ bool HasVLX = false;
  /// Processor has PKU extensions
- bool HasPKU;
+ bool HasPKU = false;
/// Processor has AVX-512 Vector Neural Network Instructions
- bool HasVNNI;
+ bool HasVNNI = false;
/// Processor has AVX-512 Bit Algorithms instructions
- bool HasBITALG;
+ bool HasBITALG = false;
/// Processor supports MPX - Memory Protection Extensions
- bool HasMPX;
+ bool HasMPX = false;
/// Processor supports CET SHSTK - Control-Flow Enforcement Technology
/// using Shadow Stack
- bool HasSHSTK;
+ bool HasSHSTK = false;
- /// Processor supports CET IBT - Control-Flow Enforcement Technology
- /// using Indirect Branch Tracking
- bool HasIBT;
+ /// Processor supports Invalidate Process-Context Identifier
+ bool HasINVPCID = false;
/// Processor has Software Guard Extensions
- bool HasSGX;
+ bool HasSGX = false;
/// Processor supports Flush Cache Line instruction
- bool HasCLFLUSHOPT;
+ bool HasCLFLUSHOPT = false;
/// Processor supports Cache Line Write Back instruction
- bool HasCLWB;
+ bool HasCLWB = false;
+
+ /// Processor supports Write Back No Invalidate instruction
+ bool HasWBNOINVD = false;
+
+ /// Processor support RDPID instruction
+ bool HasRDPID = false;
+
+ /// Processor supports WaitPKG instructions
+ bool HasWAITPKG = false;
+
+ /// Processor supports PCONFIG instruction
+ bool HasPCONFIG = false;
+
+ /// Use a retpoline thunk rather than indirect calls to block speculative
+ /// execution.
+ bool UseRetpoline = false;
+
+ /// When using a retpoline thunk, call an externally provided thunk rather
+ /// than emitting one inside the compiler.
+ bool UseRetpolineExternalThunk = false;
/// Use software floating point for code generation.
- bool UseSoftFloat;
+ bool UseSoftFloat = false;
  /// The minimum alignment of the stack frame known to hold on
/// entry to the function and which must be maintained by every function.
- unsigned stackAlignment;
+ unsigned stackAlignment = 4;
/// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops.
///
- unsigned MaxInlineSizeThreshold;
+ // FIXME: this is a known good value for Yonah. How about others?
+ unsigned MaxInlineSizeThreshold = 128;
+
+ /// Indicates target prefers 256 bit instructions.
+ bool Prefer256Bit = false;
/// What processor and OS we're targeting.
Triple TargetTriple;
- /// Instruction itineraries for scheduling
- InstrItineraryData InstrItins;
-
/// GlobalISel related APIs.
std::unique_ptr<CallLowering> CallLoweringInfo;
std::unique_ptr<LegalizerInfo> Legalizer;
@@ -368,6 +421,16 @@ private:
/// Override the stack alignment.
unsigned StackAlignOverride;
+ /// Preferred vector width from function attribute.
+ unsigned PreferVectorWidthOverride;
+
+ /// Resolved preferred vector width from function attribute and subtarget
+ /// features.
+ unsigned PreferVectorWidth = UINT32_MAX;
+
+ /// Required vector width from function attribute.
+ unsigned RequiredVectorWidth;
+
/// True if compiling for 64-bit, false for 16-bit or 32-bit.
bool In64BitMode;
@@ -378,8 +441,8 @@ private:
bool In16BitMode;
  /// Contains the overhead of gather/scatter instructions
- int GatherOverhead;
- int ScatterOverhead;
+ int GatherOverhead = 1024;
+ int ScatterOverhead = 1024;
X86SelectionDAGInfo TSInfo;
// Ordering here is important. X86InstrInfo initializes X86RegisterInfo which
@@ -393,7 +456,9 @@ public:
/// of the specified triple.
///
X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
- const X86TargetMachine &TM, unsigned StackAlignOverride);
+ const X86TargetMachine &TM, unsigned StackAlignOverride,
+ unsigned PreferVectorWidthOverride,
+ unsigned RequiredVectorWidth);
const X86TargetLowering *getTargetLowering() const override {
return &TLInfo;
@@ -436,7 +501,6 @@ private:
/// Initialize the full set of dependencies so we can use an initializer
/// list for X86Subtarget.
X86Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
- void initializeEnvironment();
void initSubtargetFeatures(StringRef CPU, StringRef FS);
public:
@@ -469,6 +533,7 @@ public:
void setPICStyle(PICStyles::Style Style) { PICStyle = Style; }
bool hasX87() const { return HasX87; }
+ bool hasNOPL() const { return HasNOPL; }
bool hasCMov() const { return HasCMov; }
bool hasSSE1() const { return X86SSELevel >= SSE1; }
bool hasSSE2() const { return X86SSELevel >= SSE2; }
@@ -479,7 +544,6 @@ public:
bool hasAVX() const { return X86SSELevel >= AVX; }
bool hasAVX2() const { return X86SSELevel >= AVX2; }
bool hasAVX512() const { return X86SSELevel >= AVX512F; }
- bool hasFp256() const { return hasAVX(); }
bool hasInt256() const { return hasAVX2(); }
bool hasSSE4A() const { return HasSSE4A; }
bool hasMMX() const { return X863DNowLevel >= MMX; }
@@ -529,6 +593,10 @@ public:
bool hasLAHFSAHF() const { return HasLAHFSAHF; }
bool hasMWAITX() const { return HasMWAITX; }
bool hasCLZERO() const { return HasCLZERO; }
+ bool hasCLDEMOTE() const { return HasCLDEMOTE; }
+ bool hasMOVDIRI() const { return HasMOVDIRI; }
+ bool hasMOVDIR64B() const { return HasMOVDIR64B; }
+ bool hasPTWRITE() const { return HasPTWRITE; }
bool isSHLDSlow() const { return IsSHLDSlow; }
bool isPMULLDSlow() const { return IsPMULLDSlow; }
bool isUnalignedMem16Slow() const { return IsUAMem16Slow; }
@@ -538,6 +606,8 @@ public:
bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
bool hasCmpxchg16b() const { return HasCmpxchg16b; }
bool useLeaForSP() const { return UseLeaForSP; }
+ bool hasPOPCNTFalseDeps() const { return HasPOPCNTFalseDeps; }
+ bool hasLZCNTFalseDeps() const { return HasLZCNTFalseDeps; }
bool hasFastVariableShuffle() const {
return HasFastVariableShuffle;
}
@@ -571,9 +641,40 @@ public:
bool hasBITALG() const { return HasBITALG; }
bool hasMPX() const { return HasMPX; }
bool hasSHSTK() const { return HasSHSTK; }
- bool hasIBT() const { return HasIBT; }
bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; }
bool hasCLWB() const { return HasCLWB; }
+ bool hasWBNOINVD() const { return HasWBNOINVD; }
+ bool hasRDPID() const { return HasRDPID; }
+ bool hasWAITPKG() const { return HasWAITPKG; }
+ bool hasPCONFIG() const { return HasPCONFIG; }
+ bool hasSGX() const { return HasSGX; }
+ bool hasINVPCID() const { return HasINVPCID; }
+ bool useRetpoline() const { return UseRetpoline; }
+ bool useRetpolineExternalThunk() const { return UseRetpolineExternalThunk; }
+
+ unsigned getPreferVectorWidth() const { return PreferVectorWidth; }
+ unsigned getRequiredVectorWidth() const { return RequiredVectorWidth; }
+
+ // Helper functions to determine when we should allow widening to 512-bit
+ // during codegen.
+ // TODO: Currently we're always allowing widening on CPUs without VLX,
+ // because for many cases we don't have a better option.
+ bool canExtendTo512DQ() const {
+ return hasAVX512() && (!hasVLX() || getPreferVectorWidth() >= 512);
+ }
+ bool canExtendTo512BW() const {
+ return hasBWI() && canExtendTo512DQ();
+ }
+
+ // If there are no 512-bit vectors and we prefer not to use 512-bit registers,
+ // disable them in the legalizer.
+ bool useAVX512Regs() const {
+ return hasAVX512() && (canExtendTo512DQ() || RequiredVectorWidth > 256);
+ }
+
+ bool useBWIRegs() const {
+ return hasBWI() && useAVX512Regs();
+ }
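+
+  // For example, on an AVX-512 subtarget with VLX, "prefer-vector-width"="256"
+  // makes canExtendTo512DQ() return false, so useAVX512Regs() only returns
+  // true if the function's required vector width exceeds 256 bits.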
bool isXRaySupported() const override { return is64Bit(); }
@@ -582,6 +683,11 @@ public:
/// TODO: to be removed later and replaced with suitable properties
bool isAtom() const { return X86ProcFamily == IntelAtom; }
bool isSLM() const { return X86ProcFamily == IntelSLM; }
+ bool isGLM() const {
+ return X86ProcFamily == IntelGLM ||
+ X86ProcFamily == IntelGLP ||
+ X86ProcFamily == IntelTRM;
+ }
bool useSoftFloat() const { return UseSoftFloat; }
/// Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
@@ -696,6 +802,10 @@ public:
/// Return true if the subtarget allows calls to immediate address.
bool isLegalToCallImmediateAddr() const;
+ /// If we are using retpolines, we need to expand indirectbr to avoid it
+ /// lowering to an actual indirect jump.
+ bool enableIndirectBrExpand() const override { return useRetpoline(); }
+
/// Enable the MachineScheduler pass for all X86 subtargets.
bool enableMachineScheduler() const override { return true; }
@@ -704,11 +814,6 @@ public:
bool enableEarlyIfConversion() const override;
- /// Return the instruction itineraries based on the subtarget selection.
- const InstrItineraryData *getInstrItineraryData() const override {
- return &InstrItins;
- }
-
AntiDepBreakMode getAntiDepBreakMode() const override {
return TargetSubtargetInfo::ANTIDEP_CRITICAL;
}
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index e95e6ecae091..374bf3daaf9b 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -26,7 +26,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/CodeGen/ExecutionDepsFix.h"
+#include "llvm/CodeGen/ExecutionDomainFix.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
@@ -34,7 +34,6 @@
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/DataLayout.h"
@@ -44,6 +43,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
#include <memory>
#include <string>
@@ -54,14 +54,21 @@ static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
cl::desc("Enable the machine combiner pass"),
cl::init(true), cl::Hidden);
+static cl::opt<bool> EnableSpeculativeLoadHardening(
+ "x86-speculative-load-hardening",
+ cl::desc("Enable speculative load hardening"), cl::init(false), cl::Hidden);
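+// Illustrative usage: since the option above defaults to off, the hardening
+// pass is only added when e.g. `llc -x86-speculative-load-hardening` (or the
+// equivalent -mllvm flag from a frontend) is passed.
+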
+
namespace llvm {
void initializeWinEHStatePassPass(PassRegistry &);
void initializeFixupLEAPassPass(PassRegistry &);
+void initializeShadowCallStackPass(PassRegistry &);
void initializeX86CallFrameOptimizationPass(PassRegistry &);
void initializeX86CmovConverterPassPass(PassRegistry &);
-void initializeX86ExecutionDepsFixPass(PassRegistry &);
+void initializeX86ExecutionDomainFixPass(PassRegistry &);
void initializeX86DomainReassignmentPass(PassRegistry &);
+void initializeX86AvoidSFBPassPass(PassRegistry &);
+void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);
} // end namespace llvm
@@ -76,10 +83,13 @@ extern "C" void LLVMInitializeX86Target() {
initializeFixupBWInstPassPass(PR);
initializeEvexToVexInstPassPass(PR);
initializeFixupLEAPassPass(PR);
+ initializeShadowCallStackPass(PR);
initializeX86CallFrameOptimizationPass(PR);
initializeX86CmovConverterPassPass(PR);
- initializeX86ExecutionDepsFixPass(PR);
+ initializeX86ExecutionDomainFixPass(PR);
initializeX86DomainReassignmentPass(PR);
+ initializeX86AvoidSFBPassPass(PR);
+ initializeX86FlagsCopyLoweringPassPass(PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -99,8 +109,6 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
return llvm::make_unique<X86FuchsiaTargetObjectFile>();
if (TT.isOSBinFormatELF())
return llvm::make_unique<X86ELFTargetObjectFile>();
- if (TT.isKnownWindowsMSVCEnvironment() || TT.isWindowsCoreCLREnvironment())
- return llvm::make_unique<X86WindowsTargetObjectFile>();
if (TT.isOSBinFormatCOFF())
return llvm::make_unique<TargetLoweringObjectFileCOFF>();
llvm_unreachable("unknown subtarget type");
@@ -152,9 +160,15 @@ static std::string computeDataLayout(const Triple &TT) {
}
static Reloc::Model getEffectiveRelocModel(const Triple &TT,
+ bool JIT,
Optional<Reloc::Model> RM) {
bool is64Bit = TT.getArch() == Triple::x86_64;
if (!RM.hasValue()) {
+ // JIT codegen should use static relocations by default, since it's
+ // typically executed in process and not relocatable.
+ if (JIT)
+ return Reloc::Static;
+
// Darwin defaults to PIC in 64 bit mode and dynamic-no-pic in 32 bit mode.
// Win64 requires rip-rel addressing, thus we force it to PIC. Otherwise we
// use static relocation model by default.
@@ -206,7 +220,7 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT,
CodeGenOpt::Level OL, bool JIT)
: LLVMTargetMachine(
T, computeDataLayout(TT), TT, CPU, FS, Options,
- getEffectiveRelocModel(TT, RM),
+ getEffectiveRelocModel(TT, JIT, RM),
getEffectiveCodeModel(CM, JIT, TT.getArch() == Triple::x86_64), OL),
TLOF(createTLOF(getTargetTriple())) {
// Windows stack unwinder gets confused when execution flow "falls through"
@@ -218,8 +232,15 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT,
// The check here for 64-bit windows is a bit icky, but as we're unlikely
// to ever want to mix 32 and 64-bit windows code in a single module
// this should be fine.
- if ((TT.isOSWindows() && TT.getArch() == Triple::x86_64) || TT.isPS4())
+ if ((TT.isOSWindows() && TT.getArch() == Triple::x86_64) || TT.isPS4() ||
+ TT.isOSBinFormatMachO()) {
this->Options.TrapUnreachable = true;
+ this->Options.NoTrapAfterNoreturn = TT.isOSBinFormatMachO();
+ }
+
+ // Outlining is available for x86-64.
+ if (TT.getArch() == Triple::x86_64)
+ setMachineOutliner(true);
initAsmInfo();
}
@@ -255,7 +276,38 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
if (SoftFloat)
Key += FS.empty() ? "+soft-float" : ",+soft-float";
- FS = Key.substr(CPU.size());
+ // Keep track of the key width after all features are added so we can extract
+ // the feature string out later.
+ unsigned CPUFSWidth = Key.size();
+
+ // Extract prefer-vector-width attribute.
+ unsigned PreferVectorWidthOverride = 0;
+ if (F.hasFnAttribute("prefer-vector-width")) {
+ StringRef Val = F.getFnAttribute("prefer-vector-width").getValueAsString();
+ unsigned Width;
+ if (!Val.getAsInteger(0, Width)) {
+ Key += ",prefer-vector-width=";
+ Key += Val;
+ PreferVectorWidthOverride = Width;
+ }
+ }
+
+ // Extract required-vector-width attribute.
+ unsigned RequiredVectorWidth = UINT32_MAX;
+ if (F.hasFnAttribute("required-vector-width")) {
+ StringRef Val = F.getFnAttribute("required-vector-width").getValueAsString();
+ unsigned Width;
+ if (!Val.getAsInteger(0, Width)) {
+ Key += ",required-vector-width=";
+ Key += Val;
+ RequiredVectorWidth = Width;
+ }
+ }
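+  // For illustration, a function carrying the IR attributes
+  //   "prefer-vector-width"="256" "required-vector-width"="512"
+  // reaches this point with PreferVectorWidthOverride = 256 and
+  // RequiredVectorWidth = 512, and both values become part of the subtarget key.
+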
+
+ // Extracted here so that we make sure there is backing for the StringRef. If
+  // we assigned earlier, it's possible the SmallString reallocated, leaving a
+ // dangling StringRef.
+ FS = Key.slice(CPU.size(), CPUFSWidth);
auto &I = SubtargetMap[Key];
if (!I) {
@@ -264,7 +316,9 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
// function that reside in TargetOptions.
resetTargetOptions(F);
I = llvm::make_unique<X86Subtarget>(TargetTriple, CPU, FS, *this,
- Options.StackAlignmentOverride);
+ Options.StackAlignmentOverride,
+ PreferVectorWidthOverride,
+ RequiredVectorWidth);
}
return I.get();
}
@@ -321,23 +375,27 @@ public:
void addPreRegAlloc() override;
void addPostRegAlloc() override;
void addPreEmitPass() override;
+ void addPreEmitPass2() override;
void addPreSched2() override;
};
-class X86ExecutionDepsFix : public ExecutionDepsFix {
+class X86ExecutionDomainFix : public ExecutionDomainFix {
public:
static char ID;
- X86ExecutionDepsFix() : ExecutionDepsFix(ID, X86::VR128XRegClass) {}
+ X86ExecutionDomainFix() : ExecutionDomainFix(ID, X86::VR128XRegClass) {}
StringRef getPassName() const override {
return "X86 Execution Dependency Fix";
}
};
-char X86ExecutionDepsFix::ID;
+char X86ExecutionDomainFix::ID;
} // end anonymous namespace
-INITIALIZE_PASS(X86ExecutionDepsFix, "x86-execution-deps-fix",
- "X86 Execution Dependency Fix", false, false)
+INITIALIZE_PASS_BEGIN(X86ExecutionDomainFix, "x86-execution-domain-fix",
+ "X86 Execution Domain Fix", false, false)
+INITIALIZE_PASS_DEPENDENCY(ReachingDefAnalysis)
+INITIALIZE_PASS_END(X86ExecutionDomainFix, "x86-execution-domain-fix",
+ "X86 Execution Domain Fix", false, false)
TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
return new X86PassConfig(*this, PM);
@@ -350,6 +408,11 @@ void X86PassConfig::addIRPasses() {
if (TM->getOptLevel() != CodeGenOpt::None)
addPass(createInterleavedAccessPass());
+
+ // Add passes that handle indirect branch removal and insertion of a retpoline
+ // thunk. These will be a no-op unless a function subtarget has the retpoline
+ // feature enabled.
+ addPass(createIndirectBrExpandPass());
}
bool X86PassConfig::addInstSelector() {
@@ -407,8 +470,13 @@ void X86PassConfig::addPreRegAlloc() {
addPass(createX86FixupSetCC());
addPass(createX86OptimizeLEAs());
addPass(createX86CallFrameOptimization());
+ addPass(createX86AvoidStoreForwardingBlocks());
}
+ if (EnableSpeculativeLoadHardening)
+ addPass(createX86SpeculativeLoadHardeningPass());
+
+ addPass(createX86FlagsCopyLoweringPass());
addPass(createX86WinAllocaExpander());
}
void X86PassConfig::addMachineSSAOptimization() {
@@ -423,8 +491,13 @@ void X86PassConfig::addPostRegAlloc() {
void X86PassConfig::addPreSched2() { addPass(createX86ExpandPseudoPass()); }
void X86PassConfig::addPreEmitPass() {
- if (getOptLevel() != CodeGenOpt::None)
- addPass(new X86ExecutionDepsFix());
+ if (getOptLevel() != CodeGenOpt::None) {
+ addPass(new X86ExecutionDomainFix());
+ addPass(createBreakFalseDeps());
+ }
+
+ addPass(createShadowCallStackPass());
+ addPass(createX86IndirectBranchTrackingPass());
if (UseVZeroUpper)
addPass(createX86IssueVZeroUpperPass());
@@ -436,3 +509,13 @@ void X86PassConfig::addPreEmitPass() {
addPass(createX86EvexToVexInsts());
}
}
+
+void X86PassConfig::addPreEmitPass2() {
+ addPass(createX86RetpolineThunksPass());
+  // Verify basic block incoming and outgoing CFA offset and register values and
+ // correct CFA calculation rule where needed by inserting appropriate CFI
+ // instructions.
+ const Triple &TT = TM->getTargetTriple();
+ if (!TT.isOSDarwin() && !TT.isOSWindows())
+ addPass(createCFIInstrInserter());
+}
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index fb35a6b2ec1a..505c4fa07b77 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -91,100 +91,3 @@ void X86SolarisTargetObjectFile::Initialize(MCContext &Ctx,
TargetLoweringObjectFileELF::Initialize(Ctx, TM);
InitializeELF(TM.Options.UseInitArray);
}
-
-const MCExpr *X86WindowsTargetObjectFile::lowerRelativeReference(
- const GlobalValue *LHS, const GlobalValue *RHS,
- const TargetMachine &TM) const {
- // Our symbols should exist in address space zero, cowardly no-op if
- // otherwise.
- if (LHS->getType()->getPointerAddressSpace() != 0 ||
- RHS->getType()->getPointerAddressSpace() != 0)
- return nullptr;
-
- // Both ptrtoint instructions must wrap global objects:
- // - Only global variables are eligible for image relative relocations.
- // - The subtrahend refers to the special symbol __ImageBase, a GlobalVariable.
- // We expect __ImageBase to be a global variable without a section, externally
- // defined.
- //
- // It should look something like this: @__ImageBase = external constant i8
- if (!isa<GlobalObject>(LHS) || !isa<GlobalVariable>(RHS) ||
- LHS->isThreadLocal() || RHS->isThreadLocal() ||
- RHS->getName() != "__ImageBase" || !RHS->hasExternalLinkage() ||
- cast<GlobalVariable>(RHS)->hasInitializer() || RHS->hasSection())
- return nullptr;
-
- return MCSymbolRefExpr::create(TM.getSymbol(LHS),
- MCSymbolRefExpr::VK_COFF_IMGREL32,
- getContext());
-}
-
-static std::string APIntToHexString(const APInt &AI) {
- unsigned Width = (AI.getBitWidth() / 8) * 2;
- std::string HexString = utohexstr(AI.getLimitedValue(), /*LowerCase=*/true);
- unsigned Size = HexString.size();
- assert(Width >= Size && "hex string is too large!");
- HexString.insert(HexString.begin(), Width - Size, '0');
-
- return HexString;
-}
-
-static std::string scalarConstantToHexString(const Constant *C) {
- Type *Ty = C->getType();
- if (isa<UndefValue>(C)) {
- return APIntToHexString(APInt::getNullValue(Ty->getPrimitiveSizeInBits()));
- } else if (const auto *CFP = dyn_cast<ConstantFP>(C)) {
- return APIntToHexString(CFP->getValueAPF().bitcastToAPInt());
- } else if (const auto *CI = dyn_cast<ConstantInt>(C)) {
- return APIntToHexString(CI->getValue());
- } else {
- unsigned NumElements;
- if (isa<VectorType>(Ty))
- NumElements = Ty->getVectorNumElements();
- else
- NumElements = Ty->getArrayNumElements();
- std::string HexString;
- for (int I = NumElements - 1, E = -1; I != E; --I)
- HexString += scalarConstantToHexString(C->getAggregateElement(I));
- return HexString;
- }
-}
-
-MCSection *X86WindowsTargetObjectFile::getSectionForConstant(
- const DataLayout &DL, SectionKind Kind, const Constant *C,
- unsigned &Align) const {
- if (Kind.isMergeableConst() && C) {
- const unsigned Characteristics = COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
- COFF::IMAGE_SCN_MEM_READ |
- COFF::IMAGE_SCN_LNK_COMDAT;
- std::string COMDATSymName;
- if (Kind.isMergeableConst4()) {
- if (Align <= 4) {
- COMDATSymName = "__real@" + scalarConstantToHexString(C);
- Align = 4;
- }
- } else if (Kind.isMergeableConst8()) {
- if (Align <= 8) {
- COMDATSymName = "__real@" + scalarConstantToHexString(C);
- Align = 8;
- }
- } else if (Kind.isMergeableConst16()) {
- if (Align <= 16) {
- COMDATSymName = "__xmm@" + scalarConstantToHexString(C);
- Align = 16;
- }
- } else if (Kind.isMergeableConst32()) {
- if (Align <= 32) {
- COMDATSymName = "__ymm@" + scalarConstantToHexString(C);
- Align = 32;
- }
- }
-
- if (!COMDATSymName.empty())
- return getContext().getCOFFSection(".rdata", Characteristics, Kind,
- COMDATSymName,
- COFF::IMAGE_COMDAT_SELECT_ANY);
- }
-
- return TargetLoweringObjectFile::getSectionForConstant(DL, Kind, C, Align);
-}
diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h
index 76e9cd5db2a0..d045094edb1e 100644
--- a/lib/Target/X86/X86TargetObjectFile.h
+++ b/lib/Target/X86/X86TargetObjectFile.h
@@ -10,8 +10,8 @@
#ifndef LLVM_LIB_TARGET_X86_X86TARGETOBJECTFILE_H
#define LLVM_LIB_TARGET_X86_X86TARGETOBJECTFILE_H
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
namespace llvm {
@@ -37,7 +37,7 @@ namespace llvm {
MCStreamer &Streamer) const override;
};
- /// \brief This implemenatation is used for X86 ELF targets that don't
+  /// This implementation is used for X86 ELF targets that don't
/// have a further specialization.
class X86ELFTargetObjectFile : public TargetLoweringObjectFileELF {
public:
@@ -45,7 +45,7 @@ namespace llvm {
PLTRelativeVariantKind = MCSymbolRefExpr::VK_PLT;
}
- /// \brief Describe a TLS variable address within debug info.
+ /// Describe a TLS variable address within debug info.
const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override;
};
@@ -55,7 +55,7 @@ namespace llvm {
void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
};
- /// \brief This implementation is used for Fuchsia on x86-64.
+ /// This implementation is used for Fuchsia on x86-64.
class X86FuchsiaTargetObjectFile : public X86ELFTargetObjectFile {
void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
};
@@ -66,24 +66,11 @@ namespace llvm {
void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
};
- /// \brief This implementation is used for Solaris on x86/x86-64.
+ /// This implementation is used for Solaris on x86/x86-64.
class X86SolarisTargetObjectFile : public X86ELFTargetObjectFile {
void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
};
- /// \brief This implementation is used for Windows targets on x86 and x86-64.
- class X86WindowsTargetObjectFile : public TargetLoweringObjectFileCOFF {
- const MCExpr *
- lowerRelativeReference(const GlobalValue *LHS, const GlobalValue *RHS,
- const TargetMachine &TM) const override;
-
- /// \brief Given a mergeable constant with the specified size and relocation
- /// information, return a section that it should be placed in.
- MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
- const Constant *C,
- unsigned &Align) const override;
- };
-
} // end namespace llvm
#endif
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 223eed3048db..bae2ef80c365 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -130,12 +130,13 @@ unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
}
unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
+ unsigned PreferVectorWidth = ST->getPreferVectorWidth();
if (Vector) {
- if (ST->hasAVX512())
+ if (ST->hasAVX512() && PreferVectorWidth >= 512)
return 512;
- if (ST->hasAVX())
+ if (ST->hasAVX() && PreferVectorWidth >= 256)
return 256;
- if (ST->hasSSE1())
+ if (ST->hasSSE1() && PreferVectorWidth >= 128)
return 128;
return 0;
}
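+  // For example, with AVX-512 available but getPreferVectorWidth() == 256
+  // (from "prefer-vector-width"="256"), the vector register width is reported
+  // as 256 rather than 512, steering the vectorizers away from 512-bit types.
+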
@@ -180,28 +181,40 @@ int X86TTIImpl::getArithmeticInstrCost(
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
+ static const CostTblEntry GLMCostTable[] = {
+ { ISD::FDIV, MVT::f32, 18 }, // divss
+ { ISD::FDIV, MVT::v4f32, 35 }, // divps
+ { ISD::FDIV, MVT::f64, 33 }, // divsd
+ { ISD::FDIV, MVT::v2f64, 65 }, // divpd
+ };
+
+ if (ST->isGLM())
+ if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
+
static const CostTblEntry SLMCostTable[] = {
- { ISD::MUL, MVT::v4i32, 11 }, // pmulld
- { ISD::MUL, MVT::v8i16, 2 }, // pmullw
- { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
- { ISD::FMUL, MVT::f64, 2 }, // mulsd
- { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
- { ISD::FMUL, MVT::v4f32, 2 }, // mulps
- { ISD::FDIV, MVT::f32, 17 }, // divss
- { ISD::FDIV, MVT::v4f32, 39 }, // divps
- { ISD::FDIV, MVT::f64, 32 }, // divsd
- { ISD::FDIV, MVT::v2f64, 69 }, // divpd
- { ISD::FADD, MVT::v2f64, 2 }, // addpd
- { ISD::FSUB, MVT::v2f64, 2 }, // subpd
+ { ISD::MUL, MVT::v4i32, 11 }, // pmulld
+ { ISD::MUL, MVT::v8i16, 2 }, // pmullw
+ { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
+ { ISD::FMUL, MVT::f64, 2 }, // mulsd
+ { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
+ { ISD::FMUL, MVT::v4f32, 2 }, // mulps
+ { ISD::FDIV, MVT::f32, 17 }, // divss
+ { ISD::FDIV, MVT::v4f32, 39 }, // divps
+ { ISD::FDIV, MVT::f64, 32 }, // divsd
+ { ISD::FDIV, MVT::v2f64, 69 }, // divpd
+ { ISD::FADD, MVT::v2f64, 2 }, // addpd
+ { ISD::FSUB, MVT::v2f64, 2 }, // subpd
// v2i64/v4i64 mul is custom lowered as a series of long:
// multiplies(3), shifts(3) and adds(2)
// slm muldq version throughput is 2 and addq throughput 4
- // thus: 3X2 (muldq throughput) + 3X1 (shift throuput) +
+ // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
// 3X4 (addq throughput) = 17
- { ISD::MUL, MVT::v2i64, 17 },
+ { ISD::MUL, MVT::v2i64, 17 },
// slm addq\subq throughput is 4
- { ISD::ADD, MVT::v2i64, 4 },
- { ISD::SUB, MVT::v2i64, 4 },
+ { ISD::ADD, MVT::v2i64, 4 },
+ { ISD::SUB, MVT::v2i64, 4 },
};
if (ST->isSLM()) {
@@ -224,30 +237,53 @@ int X86TTIImpl::getArithmeticInstrCost(
if (!signedMode && OpMinSize <= 16)
return LT.first * 5; // pmullw/pmulhw/pshuf
}
+
if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
LT.second)) {
return LT.first * Entry->Cost;
}
}
- if (ISD == ISD::SDIV &&
- Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
+ ISD == ISD::UREM) &&
+ (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
- // On X86, vector signed division by constants power-of-two are
- // normally expanded to the sequence SRA + SRL + ADD + SRA.
- // The OperandValue properties many not be same as that of previous
- // operation;conservatively assume OP_None.
- int Cost = 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info,
- Op2Info, TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
- Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
- Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
+ if (ISD == ISD::SDIV || ISD == ISD::SREM) {
+ // On X86, vector signed division by constants power-of-two are
+ // normally expanded to the sequence SRA + SRL + ADD + SRA.
+ // The OperandValue properties may not be the same as those of the
+ // previous operation; conservatively assume OP_None.
+ int Cost =
+ 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+
+ if (ISD == ISD::SREM) {
+ // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
+ Cost += getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info);
+ Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Op1Info, Op2Info);
+ }
+
+ return Cost;
+ }
- return Cost;
+ // Vector unsigned division/remainder will be simplified to shifts/masks.
+ if (ISD == ISD::UDIV)
+ return getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+
+ if (ISD == ISD::UREM)
+ return getArithmeticInstrCost(Instruction::And, Ty, Op1Info, Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
}
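The block that ends here generalizes the old SDIV-by-power-of-two special case to SREM, UDIV and UREM with constant divisors: the signed cost is still built from the shift/add expansion, SREM additionally pays for a multiply and a subtract (X % C == X - (X/C)*C), and the unsigned forms collapse to a single shift or mask. A standalone check of the identities being costed (illustrative values, not LLVM code):

#include <cassert>
#include <cstdint>

int main() {
  // Signed remainder is recovered from the division: X % C == X - (X / C) * C.
  int32_t X = -1234, C = 16;
  assert(X % C == X - (X / C) * C);

  // Unsigned division/remainder by a power of two is a shift or a mask.
  uint32_t U = 1234, P = 16, Log2P = 4;
  assert(U / P == U >> Log2P);
  assert(U % P == (U & (P - 1)));
  return 0;
}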
static const CostTblEntry AVX512BWUniformConstCostTable[] = {
@@ -256,7 +292,9 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
{ ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
+ { ISD::SREM, MVT::v32i16, 8 }, // vpmulhw+mul+sub sequence
{ ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
+ { ISD::UREM, MVT::v32i16, 8 }, // vpmulhuw+mul+sub sequence
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -272,7 +310,9 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRA, MVT::v8i64, 1 },
{ ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
+ { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
{ ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
+ { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -290,9 +330,13 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
{ ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
+ { ISD::SREM, MVT::v16i16, 8 }, // vpmulhw+mul+sub sequence
{ ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
+ { ISD::UREM, MVT::v16i16, 8 }, // vpmulhuw+mul+sub sequence
{ ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
+ { ISD::SREM, MVT::v8i32, 19 }, // vpmuldq+mul+sub sequence
{ ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
+ { ISD::UREM, MVT::v8i32, 19 }, // vpmuludq+mul+sub sequence
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -312,13 +356,21 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
{ ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
+ { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
{ ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
+ { ISD::SREM, MVT::v8i16, 8 }, // pmulhw+mul+sub sequence
{ ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
+ { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
{ ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
+ { ISD::UREM, MVT::v8i16, 8 }, // pmulhuw+mul+sub sequence
{ ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split.
+ { ISD::SREM, MVT::v8i32, 48+2 }, // 2*pmuludq+mul+sub sequence + split.
{ ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
+ { ISD::SREM, MVT::v4i32, 24 }, // pmuludq+mul+sub sequence
{ ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split.
+ { ISD::UREM, MVT::v8i32, 40+2 }, // 2*pmuludq+mul+sub sequence + split.
{ ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
+ { ISD::UREM, MVT::v4i32, 20 }, // pmuludq+mul+sub sequence
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -326,8 +378,12 @@ int X86TTIImpl::getArithmeticInstrCost(
// pmuldq sequence.
if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
return LT.first * 32;
+ if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
+ return LT.first * 38;
if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
return LT.first * 15;
+ if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
+ return LT.first * 20;
// XOP has faster vXi8 shifts.
if ((ISD != ISD::SHL && ISD != ISD::SRL && ISD != ISD::SRA) ||
@@ -405,12 +461,6 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence.
-
- // Vectorizing division is a bad idea. See the SSE2 table for more comments.
- { ISD::SDIV, MVT::v64i8, 64*20 },
- { ISD::SDIV, MVT::v32i16, 32*20 },
- { ISD::UDIV, MVT::v64i8, 64*20 },
- { ISD::UDIV, MVT::v32i16, 32*20 }
};
// Look for AVX512BW lowering tricks for custom cases.
@@ -432,14 +482,18 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
- { ISD::MUL, MVT::v16i32, 1 }, // pmulld
+ { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
+ { ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org)
+ { ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org)
{ ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add
- // Vectorizing division is a bad idea. See the SSE2 table for more comments.
- { ISD::SDIV, MVT::v16i32, 16*20 },
- { ISD::SDIV, MVT::v8i64, 8*20 },
- { ISD::UDIV, MVT::v16i32, 16*20 },
- { ISD::UDIV, MVT::v8i64, 8*20 }
+ { ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
+ { ISD::FSUB, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
+ { ISD::FMUL, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
+
+ { ISD::FADD, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
+ { ISD::FSUB, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
+ { ISD::FMUL, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
};
if (ST->hasAVX512())
@@ -468,7 +522,9 @@ int X86TTIImpl::getArithmeticInstrCost(
Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
// On AVX2, a packed v16i16 shift left by a constant build_vector
// is lowered into a vector multiply (vpmullw).
- return LT.first;
+ return getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
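The changed return above stops charging a flat LT.first for an AVX2 v16i16 shift left by a constant build_vector and instead charges the cost of the multiply it is actually lowered to (vpmullw). The per-lane identity that makes the two equivalent, as a standalone check (not LLVM code):

#include <cassert>
#include <cstdint>

int main() {
  // For each 16-bit lane, x << c == x * (1 << c), so a constant-vector shift
  // can be implemented as (and costed like) one vpmullw.
  uint16_t Lane[4] = {3, 100, 65535, 7};
  uint16_t Amt[4] = {1, 4, 0, 15};
  for (int I = 0; I < 4; ++I)
    assert(uint16_t(Lane[I] << Amt[I]) ==
           uint16_t(Lane[I] * uint16_t(1u << Amt[I])));
  return 0;
}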
@@ -571,9 +627,16 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i16, 1 }, // pmullw
- { ISD::MUL, MVT::v8i32, 1 }, // pmulld
+ { ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org)
{ ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add
+ { ISD::FADD, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
+ { ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
+ { ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
+ { ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
+ { ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
+ { ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
+
{ ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
{ ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
{ ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
@@ -617,16 +680,6 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/
{ ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
{ ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
-
- // Vectorizing division is a bad idea. See the SSE2 table for more comments.
- { ISD::SDIV, MVT::v32i8, 32*20 },
- { ISD::SDIV, MVT::v16i16, 16*20 },
- { ISD::SDIV, MVT::v8i32, 8*20 },
- { ISD::SDIV, MVT::v4i64, 4*20 },
- { ISD::UDIV, MVT::v32i8, 32*20 },
- { ISD::UDIV, MVT::v16i16, 16*20 },
- { ISD::UDIV, MVT::v8i32, 8*20 },
- { ISD::UDIV, MVT::v4i64, 4*20 },
};
if (ST->hasAVX())
@@ -634,6 +687,21 @@ int X86TTIImpl::getArithmeticInstrCost(
return LT.first * Entry->Cost;
static const CostTblEntry SSE42CostTable[] = {
+ { ISD::FADD, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FADD, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FADD, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FADD, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
+
+ { ISD::FSUB, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FSUB, MVT::f32 , 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FSUB, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FSUB, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
+
+ { ISD::FMUL, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FMUL, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FMUL, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FMUL, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
+
{ ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
{ ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
{ ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
@@ -666,7 +734,7 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
{ ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split.
- { ISD::MUL, MVT::v4i32, 1 } // pmulld
+ { ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org)
};
if (ST->hasSSE41())
@@ -703,21 +771,6 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
{ ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
{ ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
-
- // It is not a good idea to vectorize division. We have to scalarize it and
- // in the process we will often end up having to spilling regular
- // registers. The overhead of division is going to dominate most kernels
- // anyways so try hard to prevent vectorization of division - it is
- // generally a bad idea. Assume somewhat arbitrarily that we have to be able
- // to hide "20 cycles" for each lane.
- { ISD::SDIV, MVT::v16i8, 16*20 },
- { ISD::SDIV, MVT::v8i16, 8*20 },
- { ISD::SDIV, MVT::v4i32, 4*20 },
- { ISD::SDIV, MVT::v2i64, 2*20 },
- { ISD::UDIV, MVT::v16i8, 16*20 },
- { ISD::UDIV, MVT::v8i16, 8*20 },
- { ISD::UDIV, MVT::v4i32, 4*20 },
- { ISD::UDIV, MVT::v2i64, 2*20 },
};
if (ST->hasSSE2())
@@ -733,6 +786,20 @@ int X86TTIImpl::getArithmeticInstrCost(
if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
return LT.first * Entry->Cost;
+ // It is not a good idea to vectorize division. We have to scalarize it and
+ // in the process we will often end up having to spill regular
+ // registers. The overhead of division is going to dominate most kernels
+ // anyway, so try hard to prevent vectorization of division - it is
+ // generally a bad idea. Assume somewhat arbitrarily that we have to be able
+ // to hide "20 cycles" for each lane.
+ if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
+ ISD == ISD::UDIV || ISD == ISD::UREM)) {
+ int ScalarCost = getArithmeticInstrCost(
+ Opcode, Ty->getScalarType(), Op1Info, Op2Info,
+ TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
+ return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
+ }
+
// Fallback to the default implementation.
return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
}
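The new fallback at the end of getArithmeticInstrCost replaces the per-type "N*20" table entries deleted in the hunks above: any vector integer division or remainder that no earlier table handled is now costed as 20 cycles per lane, per legalized register, on top of the scalar cost. A standalone sketch of the formula (not the LLVM API) and one worked case:

int scalarizedDivRemCost(int NumLegalizedParts, int LanesPerPart,
                         int ScalarCost) {
  // Mirrors: 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost
  return 20 * NumLegalizedParts * LanesPerPart * ScalarCost;
}

// e.g. a <4 x i32> udiv on plain SSE2 legalizes to one v4i32 part; with a
// scalar cost of 1 (what the recursive query typically returns for a legal
// scalar type) this charges 20 * 1 * 4 * 1 = 80, matching the removed
// { ISD::UDIV, MVT::v4i32, 4*20 } entry.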
@@ -754,7 +821,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
// type remains the same.
if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
MVT LegalVT = LT.second;
- if (LegalVT.getVectorElementType().getSizeInBits() ==
+ if (LegalVT.isVector() &&
+ LegalVT.getVectorElementType().getSizeInBits() ==
Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {
@@ -886,8 +954,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ TTI::SK_Reverse, MVT::v16i16, 2 }, // vperm2i128 + pshufb
{ TTI::SK_Reverse, MVT::v32i8, 2 }, // vperm2i128 + pshufb
- { TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw
- { TTI::SK_Alternate, MVT::v32i8, 1 }, // vpblendvb
+ { TTI::SK_Select, MVT::v16i16, 1 }, // vpblendvb
+ { TTI::SK_Select, MVT::v32i8, 1 }, // vpblendvb
{ TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd
{ TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps
@@ -951,15 +1019,15 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ TTI::SK_Reverse, MVT::v32i8, 4 }, // vextractf128 + 2*pshufb
// + vinsertf128
- { TTI::SK_Alternate, MVT::v4i64, 1 }, // vblendpd
- { TTI::SK_Alternate, MVT::v4f64, 1 }, // vblendpd
- { TTI::SK_Alternate, MVT::v8i32, 1 }, // vblendps
- { TTI::SK_Alternate, MVT::v8f32, 1 }, // vblendps
- { TTI::SK_Alternate, MVT::v16i16, 3 }, // vpand + vpandn + vpor
- { TTI::SK_Alternate, MVT::v32i8, 3 }, // vpand + vpandn + vpor
+ { TTI::SK_Select, MVT::v4i64, 1 }, // vblendpd
+ { TTI::SK_Select, MVT::v4f64, 1 }, // vblendpd
+ { TTI::SK_Select, MVT::v8i32, 1 }, // vblendps
+ { TTI::SK_Select, MVT::v8f32, 1 }, // vblendps
+ { TTI::SK_Select, MVT::v16i16, 3 }, // vpand + vpandn + vpor
+ { TTI::SK_Select, MVT::v32i8, 3 }, // vpand + vpandn + vpor
- { TTI::SK_PermuteSingleSrc, MVT::v4f64, 3 }, // 2*vperm2f128 + vshufpd
- { TTI::SK_PermuteSingleSrc, MVT::v4i64, 3 }, // 2*vperm2f128 + vshufpd
+ { TTI::SK_PermuteSingleSrc, MVT::v4f64, 2 }, // vperm2f128 + vshufpd
+ { TTI::SK_PermuteSingleSrc, MVT::v4i64, 2 }, // vperm2f128 + vshufpd
{ TTI::SK_PermuteSingleSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps
{ TTI::SK_PermuteSingleSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps
{ TTI::SK_PermuteSingleSrc, MVT::v16i16, 8 }, // vextractf128 + 4*pshufb
@@ -967,9 +1035,9 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ TTI::SK_PermuteSingleSrc, MVT::v32i8, 8 }, // vextractf128 + 4*pshufb
// + 2*por + vinsertf128
- { TTI::SK_PermuteTwoSrc, MVT::v4f64, 4 }, // 2*vperm2f128 + 2*vshufpd
+ { TTI::SK_PermuteTwoSrc, MVT::v4f64, 3 }, // 2*vperm2f128 + vshufpd
+ { TTI::SK_PermuteTwoSrc, MVT::v4i64, 3 }, // 2*vperm2f128 + vshufpd
{ TTI::SK_PermuteTwoSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps
- { TTI::SK_PermuteTwoSrc, MVT::v4i64, 4 }, // 2*vperm2f128 + 2*vshufpd
{ TTI::SK_PermuteTwoSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps
{ TTI::SK_PermuteTwoSrc, MVT::v16i16, 15 }, // 2*vextractf128 + 8*pshufb
// + 4*por + vinsertf128
@@ -982,12 +1050,12 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
return LT.first * Entry->Cost;
static const CostTblEntry SSE41ShuffleTbl[] = {
- { TTI::SK_Alternate, MVT::v2i64, 1 }, // pblendw
- { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd
- { TTI::SK_Alternate, MVT::v4i32, 1 }, // pblendw
- { TTI::SK_Alternate, MVT::v4f32, 1 }, // blendps
- { TTI::SK_Alternate, MVT::v8i16, 1 }, // pblendw
- { TTI::SK_Alternate, MVT::v16i8, 1 } // pblendvb
+ { TTI::SK_Select, MVT::v2i64, 1 }, // pblendw
+ { TTI::SK_Select, MVT::v2f64, 1 }, // movsd
+ { TTI::SK_Select, MVT::v4i32, 1 }, // pblendw
+ { TTI::SK_Select, MVT::v4f32, 1 }, // blendps
+ { TTI::SK_Select, MVT::v8i16, 1 }, // pblendw
+ { TTI::SK_Select, MVT::v16i8, 1 } // pblendvb
};
if (ST->hasSSE41())
@@ -1001,8 +1069,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb
{ TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb
- { TTI::SK_Alternate, MVT::v8i16, 3 }, // 2*pshufb + por
- { TTI::SK_Alternate, MVT::v16i8, 3 }, // 2*pshufb + por
+ { TTI::SK_Select, MVT::v8i16, 3 }, // 2*pshufb + por
+ { TTI::SK_Select, MVT::v16i8, 3 }, // 2*pshufb + por
{ TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb
{ TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb
@@ -1029,11 +1097,11 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ TTI::SK_Reverse, MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw
// + 2*pshufd + 2*unpck + packus
- { TTI::SK_Alternate, MVT::v2i64, 1 }, // movsd
- { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd
- { TTI::SK_Alternate, MVT::v4i32, 2 }, // 2*shufps
- { TTI::SK_Alternate, MVT::v8i16, 3 }, // pand + pandn + por
- { TTI::SK_Alternate, MVT::v16i8, 3 }, // pand + pandn + por
+ { TTI::SK_Select, MVT::v2i64, 1 }, // movsd
+ { TTI::SK_Select, MVT::v2f64, 1 }, // movsd
+ { TTI::SK_Select, MVT::v4i32, 2 }, // 2*shufps
+ { TTI::SK_Select, MVT::v8i16, 3 }, // pand + pandn + por
+ { TTI::SK_Select, MVT::v16i8, 3 }, // pand + pandn + por
{ TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // shufpd
{ TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd
@@ -1057,7 +1125,7 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
static const CostTblEntry SSE1ShuffleTbl[] = {
{ TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
{ TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
- { TTI::SK_Alternate, MVT::v4f32, 2 }, // 2*shufps
+ { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
{ TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
{ TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
};
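The shuffle tables above rename TTI::SK_Alternate to TTI::SK_Select. A select shuffle keeps every element in its own lane and only chooses which of the two sources supplies it, which is why single blend instructions (blendps, pblendw, vpblendvb) cover it. A standalone illustration of the mask property (not LLVM code; undef lanes are ignored here for brevity):

#include <cassert>

bool isSelectMask(const int *Mask, int NumLanes) {
  // Lane I may come from source 0 (index I) or source 1 (index I + NumLanes),
  // but never from a different lane.
  for (int I = 0; I < NumLanes; ++I)
    if (Mask[I] != I && Mask[I] != I + NumLanes)
      return false;
  return true;
}

int main() {
  int Select[4] = {0, 5, 2, 7};  // lanes 0,2 from src0; lanes 1,3 from src1
  int Reverse[4] = {3, 2, 1, 0}; // moves lanes, so it is not a select
  assert(isSelectMask(Select, 4) && !isSelectMask(Reverse, 4));
  return 0;
}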
@@ -1487,6 +1555,15 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
{ ISD::SETCC, MVT::v16f32, 1 },
};
+ static const CostTblEntry AVX512BWCostTbl[] = {
+ { ISD::SETCC, MVT::v32i16, 1 },
+ { ISD::SETCC, MVT::v64i8, 1 },
+ };
+
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
if (ST->hasAVX512())
if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
return LT.first * Entry->Cost;
@@ -1631,6 +1708,18 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
};
+ static const CostTblEntry GLMCostTbl[] = {
+ { ISD::FSQRT, MVT::f32, 19 }, // sqrtss
+ { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
+ { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd
+ { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
+ };
+ static const CostTblEntry SLMCostTbl[] = {
+ { ISD::FSQRT, MVT::f32, 20 }, // sqrtss
+ { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
+ { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd
+ { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
+ };
static const CostTblEntry SSE42CostTbl[] = {
{ ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
@@ -1721,6 +1810,14 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
MVT MTy = LT.second;
// Attempt to lookup cost.
+ if (ST->isGLM())
+ if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->isSLM())
+ if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
if (ST->hasCDI())
if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
return LT.first * Entry->Cost;
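The new GLM and SLM square-root tables are consulted before any of the feature-based tables below, so a CPU-specific number wins whenever both would match. A simplified model of that precedence using the f32 sqrtss costs quoted above (own types, not the LLVM CostTableLookup API):

struct SqrtCostModel {
  bool IsGLM = false, IsSLM = false;
  int f32SqrtCost() const {
    if (IsGLM) return 19;  // GLM table entry above
    if (IsSLM) return 20;  // SLM table entry above
    return 18;             // e.g. the SSE42 (Nehalem) entry on other CPUs
  }
};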
@@ -1886,8 +1983,8 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
LT.second.getVectorNumElements() == NumElem)
// Promotion requires expand/truncate for data and a shuffle for mask.
- Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, nullptr) +
- getShuffleCost(TTI::SK_Alternate, MaskTy, 0, nullptr);
+ Cost += getShuffleCost(TTI::SK_Select, SrcVTy, 0, nullptr) +
+ getShuffleCost(TTI::SK_Select, MaskTy, 0, nullptr);
else if (LT.second.getVectorNumElements() > NumElem) {
VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
@@ -2145,7 +2242,7 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned);
}
-/// \brief Calculate the cost of materializing a 64-bit value. This helper
+/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
int X86TTIImpl::getIntImmCost(int64_t Val) {
@@ -2480,6 +2577,10 @@ bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}
+bool X86TTIImpl::canMacroFuseCmp() {
+ return ST->hasMacroFusion();
+}
+
bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
// The backend can't handle a single element vector.
if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1)
@@ -2522,7 +2623,7 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
// TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only
// enable gather with a -march.
return (DataWidth == 32 || DataWidth == 64) &&
- (ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()));
+ (ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()));
}
bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h
index 6f01a6fd11df..3df899038820 100644
--- a/lib/Target/X86/X86TargetTransformInfo.h
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -120,6 +120,7 @@ public:
Type *Ty);
bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
TargetTransformInfo::LSRCost &C2);
+ bool canMacroFuseCmp();
bool isLegalMaskedLoad(Type *DataType);
bool isLegalMaskedStore(Type *DataType);
bool isLegalMaskedGather(Type *DataType);
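The header declares the new canMacroFuseCmp() hook added in the .cpp hunk above, where X86 simply forwards ST->hasMacroFusion(). Target-independent code can then ask whether a compare and the conditional branch it feeds will macro-fuse before making decisions that would separate them. A purely illustrative consumer (own types; not the actual LLVM pass code):

struct TTIView {
  bool FusesCmpAndBranch = false;
  bool canMacroFuseCmp() const { return FusesCmpAndBranch; }
};

// Keep the compare adjacent to its branch only when the target says the pair
// will fuse into one macro-op.
bool shouldKeepCmpAdjacentToBranch(const TTIView &TTI) {
  return TTI.canMacroFuseCmp();
}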
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index 224262830b12..f882b760927c 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -264,8 +264,8 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
}
}
- DEBUG(dbgs() << "MBB #" << MBB.getNumber() << " exit state: "
- << getBlockExitStateName(CurState) << '\n');
+ LLVM_DEBUG(dbgs() << "MBB #" << MBB.getNumber() << " exit state: "
+ << getBlockExitStateName(CurState) << '\n');
if (CurState == EXITS_DIRTY)
for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
@@ -341,8 +341,8 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
// successors need to be added to the worklist (if they haven't been
// already).
if (BBState.ExitState == PASS_THROUGH) {
- DEBUG(dbgs() << "MBB #" << MBB.getNumber()
- << " was Pass-through, is now Dirty-out.\n");
+ LLVM_DEBUG(dbgs() << "MBB #" << MBB.getNumber()
+ << " was Pass-through, is now Dirty-out.\n");
for (MachineBasicBlock *Succ : MBB.successors())
addDirtySuccessor(*Succ);
}
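The X86VZeroUpper hunks are part of the tree-wide switch from the old DEBUG macro to LLVM_DEBUG (the short name was too easy to collide with other projects' DEBUG defines). The idiom itself is unchanged; a minimal sketch, assuming an LLVM source tree and an assertions-enabled build:

#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#define DEBUG_TYPE "x86-vzeroupper"
using namespace llvm;

// The statement inside LLVM_DEBUG(...) is compiled away when assertions are
// off and otherwise only runs under -debug or -debug-only=x86-vzeroupper.
void reportExitState(int BlockNumber, const char *StateName) {
  LLVM_DEBUG(dbgs() << "MBB #" << BlockNumber << " exit state: " << StateName
                    << '\n');
}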
diff --git a/lib/Target/X86/X86WinAllocaExpander.cpp b/lib/Target/X86/X86WinAllocaExpander.cpp
index 1046696587d9..d298aaa97ecd 100644
--- a/lib/Target/X86/X86WinAllocaExpander.cpp
+++ b/lib/Target/X86/X86WinAllocaExpander.cpp
@@ -62,6 +62,7 @@ private:
unsigned StackPtr;
unsigned SlotSize;
int64_t StackProbeSize;
+ bool NoStackArgProbe;
StringRef getPassName() const override { return "X86 WinAlloca Expander"; }
static char ID;
@@ -240,13 +241,21 @@ void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) {
}
break;
case Probe:
- // The probe lowering expects the amount in RAX/EAX.
- BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegA)
- .addReg(MI->getOperand(0).getReg());
-
- // Do the probe.
- STI->getFrameLowering()->emitStackProbe(*MBB->getParent(), *MBB, MI, DL,
- /*InPrologue=*/false);
+ if (!NoStackArgProbe) {
+ // The probe lowering expects the amount in RAX/EAX.
+ BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegA)
+ .addReg(MI->getOperand(0).getReg());
+
+ // Do the probe.
+ STI->getFrameLowering()->emitStackProbe(*MBB->getParent(), *MBB, MI, DL,
+ /*InPrologue=*/false);
+ } else {
+ // Sub
+ // No probe requested: subtract the allocation amount from the stack pointer.
+ StackPtr)
+ .addReg(StackPtr)
+ .addReg(MI->getOperand(0).getReg());
+ }
break;
}
@@ -285,6 +294,9 @@ bool X86WinAllocaExpander::runOnMachineFunction(MachineFunction &MF) {
.getValueAsString()
.getAsInteger(0, StackProbeSize);
}
+ NoStackArgProbe = MF.getFunction().hasFnAttribute("no-stack-arg-probe");
+ if (NoStackArgProbe)
+ StackProbeSize = INT64_MAX;
LoweringMap Lowerings;
computeLowerings(MF, Lowerings);
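The pass now honors a "no-stack-arg-probe" function attribute: when present, StackProbeSize is raised to INT64_MAX so no allocation amount is ever classified as needing a probe, and the Probe lowering itself degrades to a plain subtraction from the stack pointer instead of the stack-probe call emitted by emitStackProbe. A simplified model of the threshold effect (not the actual pass logic):

#include <cstdint>
#include <limits>

bool needsStackProbe(int64_t Amount, int64_t StackProbeSize,
                     bool NoStackArgProbe) {
  if (NoStackArgProbe)
    StackProbeSize = std::numeric_limits<int64_t>::max();
  // With the attribute set, this can never be true for any representable
  // allocation amount.
  return Amount > StackProbeSize;
}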
diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp
index 6d6dedc60736..dde9c734f492 100644
--- a/lib/Target/X86/X86WinEHState.cpp
+++ b/lib/Target/X86/X86WinEHState.cpp
@@ -695,10 +695,10 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) {
Worklist.push_back(BB);
continue;
}
- DEBUG(dbgs() << "X86WinEHState: " << BB->getName()
- << " InitialState=" << InitialState << '\n');
- DEBUG(dbgs() << "X86WinEHState: " << BB->getName()
- << " FinalState=" << FinalState << '\n');
+ LLVM_DEBUG(dbgs() << "X86WinEHState: " << BB->getName()
+ << " InitialState=" << InitialState << '\n');
+ LLVM_DEBUG(dbgs() << "X86WinEHState: " << BB->getName()
+ << " FinalState=" << FinalState << '\n');
InitialStates.insert({BB, InitialState});
FinalStates.insert({BB, FinalState});
}
@@ -743,8 +743,8 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) {
continue;
int PrevState = getPredState(FinalStates, F, ParentBaseState, BB);
- DEBUG(dbgs() << "X86WinEHState: " << BB->getName()
- << " PrevState=" << PrevState << '\n');
+ LLVM_DEBUG(dbgs() << "X86WinEHState: " << BB->getName()
+ << " PrevState=" << PrevState << '\n');
for (Instruction &I : *BB) {
CallSite CS(&I);
diff --git a/lib/Target/XCore/CMakeLists.txt b/lib/Target/XCore/CMakeLists.txt
index 0a609ef76f44..f7dbcbacb861 100644
--- a/lib/Target/XCore/CMakeLists.txt
+++ b/lib/Target/XCore/CMakeLists.txt
@@ -1,12 +1,13 @@
set(LLVM_TARGET_DEFINITIONS XCore.td)
-tablegen(LLVM XCoreGenRegisterInfo.inc -gen-register-info)
-tablegen(LLVM XCoreGenInstrInfo.inc -gen-instr-info)
-tablegen(LLVM XCoreGenDisassemblerTables.inc -gen-disassembler)
tablegen(LLVM XCoreGenAsmWriter.inc -gen-asm-writer)
-tablegen(LLVM XCoreGenDAGISel.inc -gen-dag-isel)
tablegen(LLVM XCoreGenCallingConv.inc -gen-callingconv)
+tablegen(LLVM XCoreGenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM XCoreGenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM XCoreGenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM XCoreGenRegisterInfo.inc -gen-register-info)
tablegen(LLVM XCoreGenSubtargetInfo.inc -gen-subtarget)
+
add_public_tablegen_target(XCoreCommonTableGen)
add_llvm_target(XCoreCodeGen
@@ -28,5 +29,5 @@ add_llvm_target(XCoreCodeGen
add_subdirectory(Disassembler)
add_subdirectory(InstPrinter)
-add_subdirectory(TargetInfo)
add_subdirectory(MCTargetDesc)
+add_subdirectory(TargetInfo)
diff --git a/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
index 059b75ef482a..faf66e5944ab 100644
--- a/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
+++ b/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file is part of the XCore Disassembler.
+/// This file is part of the XCore Disassembler.
///
//===----------------------------------------------------------------------===//
@@ -29,7 +29,7 @@ typedef MCDisassembler::DecodeStatus DecodeStatus;
namespace {
-/// \brief A disassembler class for XCore.
+/// A disassembler class for XCore.
class XCoreDisassembler : public MCDisassembler {
public:
XCoreDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) :
diff --git a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
index 8a7efe2e39c6..a0b480026469 100644
--- a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
+++ b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains the declaration of the XCoreInstPrinter class,
+/// This file contains the declaration of the XCoreInstPrinter class,
/// which is used to print XCore MCInst to a .s file.
///
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/XCore/XCoreAsmPrinter.cpp b/lib/Target/XCore/XCoreAsmPrinter.cpp
index 0da90df6eb16..8f7c8a82380a 100644
--- a/lib/Target/XCore/XCoreAsmPrinter.cpp
+++ b/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -27,7 +27,6 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
@@ -42,6 +41,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include <algorithm>
#include <cctype>
using namespace llvm;
diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp
index 62b2c8eee152..b87c149a36dc 100644
--- a/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -151,7 +151,7 @@ static void GetSpillList(SmallVectorImpl<StackSlotInfo> &SpillList,
Offset,
FramePtr));
}
- std::sort(SpillList.begin(), SpillList.end(), CompareSSIOffset);
+ llvm::sort(SpillList.begin(), SpillList.end(), CompareSSIOffset);
}
/// Creates an ordered list of EH info register 'spills'.
@@ -170,7 +170,7 @@ static void GetEHSpillList(SmallVectorImpl<StackSlotInfo> &SpillList,
SpillList.push_back(
StackSlotInfo(EHSlot[0], MFI.getObjectOffset(EHSlot[1]),
TL->getExceptionSelectorRegister(PersonalityFn)));
- std::sort(SpillList.begin(), SpillList.end(), CompareSSIOffset);
+ llvm::sort(SpillList.begin(), SpillList.end(), CompareSSIOffset);
}
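Both GetSpillList and GetEHSpillList now use llvm::sort (from llvm/ADT/STLExtras.h) instead of std::sort; for the output order it is a drop-in replacement, but in EXPENSIVE_CHECKS builds it shuffles the range before sorting so that results which silently depend on the relative order of "equal" elements surface as non-determinism. A minimal usage sketch, assuming an LLVM tree:

#include "llvm/ADT/STLExtras.h"
#include <vector>

void sortByOffset(std::vector<int> &Offsets) {
  llvm::sort(Offsets.begin(), Offsets.end(),
             [](int A, int B) { return A < B; });
}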
static MachineMemOperand *getFrameIndexMMO(MachineBasicBlock &MBB,
@@ -427,7 +427,7 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
bool emitFrameMoves = XCoreRegisterInfo::needsFrameMoves(*MF);
DebugLoc DL;
- if (MI != MBB.end() && !MI->isDebugValue())
+ if (MI != MBB.end() && !MI->isDebugInstr())
DL = MI->getDebugLoc();
for (std::vector<CalleeSavedInfo>::const_iterator it = CSI.begin();
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index 0ac5ecfa7e8c..99e76144cba3 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -91,10 +91,6 @@ XCoreTargetLowering::XCoreTargetLowering(const TargetMachine &TM,
// XCore does not have the NodeTypes below.
setOperationAction(ISD::BR_CC, MVT::i32, Expand);
setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
- setOperationAction(ISD::ADDC, MVT::i32, Expand);
- setOperationAction(ISD::ADDE, MVT::i32, Expand);
- setOperationAction(ISD::SUBC, MVT::i32, Expand);
- setOperationAction(ISD::SUBE, MVT::i32, Expand);
// 64bit
setOperationAction(ISD::ADD, MVT::i64, Custom);
diff --git a/lib/Target/XCore/XCoreInstrInfo.cpp b/lib/Target/XCore/XCoreInstrInfo.cpp
index c885332b07ad..d5e276788f71 100644
--- a/lib/Target/XCore/XCoreInstrInfo.cpp
+++ b/lib/Target/XCore/XCoreInstrInfo.cpp
@@ -364,7 +364,7 @@ void XCoreInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
const TargetRegisterInfo *TRI) const
{
DebugLoc DL;
- if (I != MBB.end() && !I->isDebugValue())
+ if (I != MBB.end() && !I->isDebugInstr())
DL = I->getDebugLoc();
MachineFunction *MF = MBB.getParent();
const MachineFrameInfo &MFI = MF->getFrameInfo();
@@ -386,7 +386,7 @@ void XCoreInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
const TargetRegisterInfo *TRI) const
{
DebugLoc DL;
- if (I != MBB.end() && !I->isDebugValue())
+ if (I != MBB.end() && !I->isDebugInstr())
DL = I->getDebugLoc();
MachineFunction *MF = MBB.getParent();
const MachineFrameInfo &MFI = MF->getFrameInfo();
@@ -429,7 +429,7 @@ MachineBasicBlock::iterator XCoreInstrInfo::loadImmediate(
MachineBasicBlock::iterator MI,
unsigned Reg, uint64_t Value) const {
DebugLoc dl;
- if (MI != MBB.end() && !MI->isDebugValue())
+ if (MI != MBB.end() && !MI->isDebugInstr())
dl = MI->getDebugLoc();
if (isImmMskBitp(Value)) {
int N = Log2_32(Value) + 1;
diff --git a/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/lib/Target/XCore/XCoreLowerThreadLocal.cpp
index 87532d11ede8..1c93ba8fa14c 100644
--- a/lib/Target/XCore/XCoreLowerThreadLocal.cpp
+++ b/lib/Target/XCore/XCoreLowerThreadLocal.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains a pass that lowers thread local variables on the
+/// This file contains a pass that lowers thread local variables on the
/// XCore.
///
//===----------------------------------------------------------------------===//
@@ -129,7 +129,7 @@ createReplacementInstr(ConstantExpr *CE, Instruction *Instr) {
static bool replaceConstantExprOp(ConstantExpr *CE, Pass *P) {
do {
SmallVector<WeakTrackingVH, 8> WUsers(CE->user_begin(), CE->user_end());
- std::sort(WUsers.begin(), WUsers.end());
+ llvm::sort(WUsers.begin(), WUsers.end());
WUsers.erase(std::unique(WUsers.begin(), WUsers.end()), WUsers.end());
while (!WUsers.empty())
if (WeakTrackingVH WU = WUsers.pop_back_val()) {
diff --git a/lib/Target/XCore/XCoreMCInstLower.cpp b/lib/Target/XCore/XCoreMCInstLower.cpp
index 7763ccc8f4af..21270192b234 100644
--- a/lib/Target/XCore/XCoreMCInstLower.cpp
+++ b/lib/Target/XCore/XCoreMCInstLower.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains code to lower XCore MachineInstrs to their
+/// This file contains code to lower XCore MachineInstrs to their
/// corresponding MCInst records.
///
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/XCore/XCoreMCInstLower.h b/lib/Target/XCore/XCoreMCInstLower.h
index 8fb1593cc6e6..abcb80fcf766 100644
--- a/lib/Target/XCore/XCoreMCInstLower.h
+++ b/lib/Target/XCore/XCoreMCInstLower.h
@@ -21,7 +21,7 @@ namespace llvm {
class Mangler;
class AsmPrinter;
-/// \brief This class is used to lower an MachineInstr into an MCInst.
+/// This class is used to lower an MachineInstr into an MCInst.
class LLVM_LIBRARY_VISIBILITY XCoreMCInstLower {
typedef MachineOperand::MachineOperandType MachineOperandType;
MCContext *Ctx;
diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp
index 70376d40a37f..1915aaedc35d 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.cpp
+++ b/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -274,14 +274,13 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int StackSize = MF.getFrameInfo().getStackSize();
#ifndef NDEBUG
- DEBUG(errs() << "\nFunction : "
- << MF.getName() << "\n");
- DEBUG(errs() << "<--------->\n");
- DEBUG(MI.print(errs()));
- DEBUG(errs() << "FrameIndex : " << FrameIndex << "\n");
- DEBUG(errs() << "FrameOffset : " << Offset << "\n");
- DEBUG(errs() << "StackSize : " << StackSize << "\n");
- #endif
+ LLVM_DEBUG(errs() << "\nFunction : " << MF.getName() << "\n");
+ LLVM_DEBUG(errs() << "<--------->\n");
+ LLVM_DEBUG(MI.print(errs()));
+ LLVM_DEBUG(errs() << "FrameIndex : " << FrameIndex << "\n");
+ LLVM_DEBUG(errs() << "FrameOffset : " << Offset << "\n");
+ LLVM_DEBUG(errs() << "StackSize : " << StackSize << "\n");
+#endif
Offset += StackSize;
@@ -299,7 +298,8 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0);
assert(Offset%4 == 0 && "Misaligned stack offset");
- DEBUG(errs() << "Offset : " << Offset << "\n" << "<--------->\n");
+ LLVM_DEBUG(errs() << "Offset : " << Offset << "\n"
+ << "<--------->\n");
Offset/=4;
unsigned Reg = MI.getOperand(0).getReg();
diff --git a/lib/Target/XCore/XCoreRegisterInfo.h b/lib/Target/XCore/XCoreRegisterInfo.h
index 0e337d65a0f6..c31f5d5a7c44 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.h
+++ b/lib/Target/XCore/XCoreRegisterInfo.h
@@ -33,6 +33,8 @@ public:
BitVector getReservedRegs(const MachineFunction &MF) const override;
+ bool enableMultipleCopyHints() const override { return true; }
+
bool requiresRegisterScavenging(const MachineFunction &MF) const override;
bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;